From bc9f4edf47d2cbed3b1ba7a61d1497dded91ed22 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Wed, 11 Jun 2025 16:44:09 +0100
Subject: [PATCH 0001/1322] [LTO] Fix used before intialised warning (#143705)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For whatever reason I can't reproduce this locally but I can on Compiler
Explorer (https://godbolt.org/z/nfv4b83q6) and on our flang gcc bot
(https://lab.llvm.org/buildbot/#/builders/130/builds/13683/steps/5/logs/stdio).

In file included from ../llvm-project/llvm/include/llvm/LTO/LTO.h:33,
from
../llvm-project/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:29:
../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h: In
constructor ‘llvm::FunctionImporter::ImportListsTy::ImportListsTy()’:
../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h:275:33:
warning: member ‘llvm::FunctionImporter::ImportListsTy::ImportIDs’ is
used uninitialized [-Wuninitialized]
  275 |     ImportListsTy() : EmptyList(ImportIDs) {}
      |                                 ^~~~~~~~~
../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h: In
constructor
‘llvm::FunctionImporter::ImportListsTy::ImportListsTy(size_t)’:

../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h:276:44:
warning: member ‘llvm::FunctionImporter::ImportListsTy::ImportIDs’ is
used uninitialized [-Wuninitialized]
276 | ImportListsTy(size_t Size) : EmptyList(ImportIDs), ListsImpl(Size)
{}
      |                                            ^~~~~~~~~

ImportIDs was being used during construction of EmptyList, before
ImportIDs itself had been constructed.
---
 llvm/include/llvm/Transforms/IPO/FunctionImport.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index 65228bb65ba8..e6ae9ee831d5 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -272,8 +272,9 @@ public:
   // A map from destination modules to lists of imports.
   class ImportListsTy {
   public:
-    ImportListsTy() : EmptyList(ImportIDs) {}
-    ImportListsTy(size_t Size) : EmptyList(ImportIDs), ListsImpl(Size) {}
+    ImportListsTy() : ImportIDs(), EmptyList(ImportIDs) {}
+    ImportListsTy(size_t Size)
+        : ImportIDs(), EmptyList(ImportIDs), ListsImpl(Size) {}
 
     ImportMapTy &operator[](StringRef DestMod) {
       return ListsImpl.try_emplace(DestMod, ImportIDs).first->second;
@@ -293,9 +294,9 @@ public:
     const_iterator end() const { return ListsImpl.end(); }
 
   private:
+    ImportIDTable ImportIDs;
     ImportMapTy EmptyList;
     DenseMap<StringRef, ImportMapTy> ListsImpl;
-    ImportIDTable ImportIDs;
   };
 
   /// The set contains an entry for every global value that the module exports.

From 91be47dccfa3480c152916838404d49107fde45c Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 08:53:54 -0700
Subject: [PATCH 0002/1322] [flang] Fix warnings

This patch fixes:

  flang/lib/Lower/OpenMP/OpenMP.cpp:3904:9: error: unused variable
  'action0' [-Werror,-Wunused-variable]

  flang/lib/Lower/OpenMP/OpenMP.cpp:3905:9: error: unused variable
  'action1' [-Werror,-Wunused-variable]
---
 flang/lib/Lower/OpenMP/OpenMP.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 3f3b85696db3..c13fa471978d 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -3911,6 +3911,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
       // Capturing operation.
       assert(action0 != analysis.None && action1 != analysis.None &&
              "Expexcing two actions");
+      (void)action0;
+      (void)action1;
       captureOp =
           builder.create<mlir::omp::AtomicCaptureOp>(loc, hint, memOrder);
       // Set the non-atomic insertion point to before the atomic.capture.

From 2ab83e9f68f0c7b1a7199455d7ce05430d93fa44 Mon Sep 17 00:00:00 2001
From: Tony Varghese <tonypalampalliyil@gmail.com>
Date: Wed, 11 Jun 2025 21:28:26 +0530
Subject: [PATCH 0003/1322] [NFC][PowerPC] Rename xxevalPattern to adhere to
 naming convention. (#143675)

Rename class `xxevalPattern` to adhere to naming convention listed in
the coding guideline and used for all other classes in the td file.
---
 llvm/lib/Target/PowerPC/PPCInstrP10.td | 62 +++++++++++++-------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index a7f758745efe..d295f35fb1dd 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -2159,7 +2159,7 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {
                                (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
 }
 
-class xxevalPattern <dag pattern, bits<8> imm> :
+class XXEvalPattern <dag pattern, bits<8> imm> :
   Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
 
 let Predicates = [PrefixInstrs, HasP10Vector] in {
@@ -2192,83 +2192,83 @@ let Predicates = [PrefixInstrs, HasP10Vector] in {
     // Anonymous patterns for XXEVAL
     // AND
     // and(A, B, C)
-    def : xxevalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>;
+    def : XXEvalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>;
     // and(A, xor(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>;
+    def : XXEvalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>;
     // and(A, or(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>;
+    def : XXEvalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>;
     // and(A, nor(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>;
+    def : XXEvalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>;
     // and(A, eqv(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>;
+    def : XXEvalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>;
     // and(A, nand(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>;
+    def : XXEvalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>;
 
     // NAND
     // nand(A, B, C)
-    def : xxevalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))),
+    def : XXEvalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))),
                          !sub(255, 1)>;
     // nand(A, xor(B, C))
-    def : xxevalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))),
+    def : XXEvalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))),
                          !sub(255, 6)>;
     // nand(A, or(B, C))
-    def : xxevalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))),
+    def : XXEvalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))),
                          !sub(255, 7)>;
     // nand(A, nor(B, C))
-    def : xxevalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)),
+    def : XXEvalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)),
                          !sub(255, 8)>;
     // nand(A, eqv(B, C))
-    def : xxevalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)),
+    def : XXEvalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)),
                          !sub(255, 9)>;
     // nand(A, nand(B, C))
-    def : xxevalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
+    def : XXEvalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
                          !sub(255, 14)>;
 
     // EQV
     // (eqv A, B, C)
-    def : xxevalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)),
+    def : XXEvalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)),
                             (vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)))),
                          150>;
     // (eqv A, (and B, C))
-    def : xxevalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>;
+    def : XXEvalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>;
     // (eqv A, (or B, C))
-    def : xxevalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>;
+    def : XXEvalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>;
 
     // NOR
     // (nor A, B, C)
-    def : xxevalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>;
+    def : XXEvalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>;
     // (nor A, (and B, C))
-    def : xxevalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>;
+    def : XXEvalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>;
     // (nor A, (eqv B, C))
-    def : xxevalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>;
+    def : XXEvalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>;
     // (nor A, (nand B, C))
-    def : xxevalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>;
+    def : XXEvalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>;
     // (nor A, (nor B, C))
-    def : xxevalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>;
+    def : XXEvalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>;
     // (nor A, (xor B, C))
-    def : xxevalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>;
+    def : XXEvalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>;
 
     // OR
     // (or A, B, C)
-    def : xxevalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>;
+    def : XXEvalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>;
     // (or A, (and B, C))
-    def : xxevalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>;
+    def : XXEvalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>;
     // (or A, (eqv B, C))
-    def : xxevalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>;
+    def : XXEvalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>;
     // (or A, (nand B, C))
-    def : xxevalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>;
+    def : XXEvalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>;
     // (or A, (nor B, C))
-    def : xxevalPattern<(or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>;
+    def : XXEvalPattern<(or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>;
     // (or A, (xor B, C))
-    def : xxevalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>;
+    def : XXEvalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>;
 
     // XOR
     // (xor A, B, C)
-    def : xxevalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>;
+    def : XXEvalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>;
     // (xor A, (and B, C))
-    def : xxevalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>;
+    def : XXEvalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>;
     // (xor A, (or B, C))
-    def : xxevalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>;
+    def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>;
 
     // Anonymous patterns to select prefixed VSX loads and stores.
     // Load / Store f128

From 38fb0117ab10c4541e58697a4b56de2a646cf3f4 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245@hotmail.com>
Date: Wed, 11 Jun 2025 12:13:36 -0400
Subject: [PATCH 0004/1322] [libc++] Make forward_list constexpr as part of
 P3372R3 (#129435)

Fixes #128658
---
 libcxx/docs/FeatureTestMacroTable.rst         |   2 +
 libcxx/include/__memory/allocation_guard.h    |  20 +-
 libcxx/include/__memory/pointer_traits.h      |  16 +-
 libcxx/include/forward_list                   | 469 ++++++++++--------
 libcxx/include/version                        |   2 +
 .../forwardlist/compare.three_way.pass.cpp    |   7 +-
 .../sequences/forwardlist/empty.pass.cpp      |  13 +-
 .../forwardlist.access/front.pass.cpp         |  16 +-
 .../forwardlist.cons/alloc.compile.fail.cpp   |  13 +-
 .../forwardlist.cons/alloc.pass.cpp           |  13 +-
 .../forwardlist.cons/assign_copy.pass.cpp     |  13 +-
 .../forwardlist.cons/assign_init.pass.cpp     |  13 +-
 .../forwardlist.cons/assign_move.pass.cpp     |  13 +-
 .../forwardlist.cons/assign_op_init.pass.cpp  |  13 +-
 .../forwardlist.cons/assign_range.pass.cpp    |  13 +-
 .../assign_size_value.pass.cpp                |  13 +-
 .../forwardlist.cons/copy.pass.cpp            |  13 +-
 .../forwardlist.cons/copy_alloc.pass.cpp      |  13 +-
 .../forwardlist.cons/default.pass.cpp         |  13 +-
 .../forwardlist.cons/from_range.pass.cpp      |  19 +-
 .../forwardlist.cons/init.pass.cpp            |  13 +-
 .../forwardlist.cons/init_alloc.pass.cpp      |  13 +-
 .../forwardlist.cons/move.pass.cpp            |  13 +-
 .../forwardlist.cons/move_alloc.pass.cpp      |  13 +-
 .../forwardlist.cons/range.pass.cpp           |  13 +-
 .../forwardlist.cons/range_alloc.pass.cpp     |  13 +-
 .../forwardlist.cons/size.pass.cpp            |   4 +-
 .../forwardlist.cons/size_value.pass.cpp      |  13 +-
 .../size_value_alloc.pass.cpp                 |  13 +-
 .../forwardlist.erasure/erase.pass.cpp        |  18 +-
 .../forwardlist.erasure/erase_if.pass.cpp     |  18 +-
 .../forwardlist.iter/before_begin.pass.cpp    |  17 +-
 .../forwardlist.iter/iterators.pass.cpp       |  27 +-
 .../assign_range.pass.cpp                     |  19 +-
 .../forwardlist.modifiers/clear.pass.cpp      |  13 +-
 .../emplace_after.pass.cpp                    |  13 +-
 .../emplace_front.pass.cpp                    |  13 +-
 .../erase_after_many.pass.cpp                 |  13 +-
 .../erase_after_one.pass.cpp                  |  13 +-
 .../insert_after_const.pass.cpp               |  13 +-
 .../insert_after_init.pass.cpp                |  13 +-
 .../insert_after_range.pass.cpp               |  13 +-
 .../insert_after_rv.pass.cpp                  |  13 +-
 .../insert_after_size_value.pass.cpp          |  13 +-
 .../insert_range_after.pass.cpp               |  23 +-
 .../forwardlist.modifiers/pop_front.pass.cpp  |  13 +-
 .../prepend_range.pass.cpp                    |  19 +-
 .../push_front_const.pass.cpp                 |  13 +-
 .../push_front_exception_safety.pass.cpp      |   2 +-
 .../push_front_rv.pass.cpp                    |  13 +-
 .../resize_size.pass.cpp                      |  17 +-
 .../resize_size_value.pass.cpp                |  15 +-
 .../forwardlist.ops/merge_lvalue.pass.cpp     |  17 +-
 .../merge_lvalue_pred.pass.cpp                |  17 +-
 .../forwardlist.ops/merge_rvalue.pass.cpp     |  17 +-
 .../merge_rvalue_pred.pass.cpp                |  17 +-
 .../forwardlist.ops/remove.pass.cpp           |  27 +-
 .../forwardlist.ops/remove_if.pass.cpp        |  25 +-
 .../forwardlist.ops/reverse.pass.cpp          |  19 +-
 .../splice_after_flist.pass.cpp               |  23 +-
 .../forwardlist.ops/splice_after_one.pass.cpp |  25 +-
 .../splice_after_range.pass.cpp               |  27 +-
 .../forwardlist.ops/unique.pass.cpp           |  15 +-
 .../forwardlist.ops/unique_pred.pass.cpp      |  25 +-
 .../forwardlist.spec/equal.pass.cpp           |  17 +-
 .../forwardlist.spec/member_swap.pass.cpp     |  13 +-
 .../forwardlist.spec/non_member_swap.pass.cpp |  13 +-
 .../forwardlist.spec/relational.pass.cpp      |  21 +-
 .../swap_noexcept.compile.pass.cpp            |   4 +-
 .../forwardlist/get_allocator.pass.cpp        |  13 +-
 .../sequences/forwardlist/incomplete.pass.cpp |  17 +-
 .../sequences/forwardlist/max_size.pass.cpp   |  13 +-
 .../forward_list.version.compile.pass.cpp     |  27 +
 .../version.version.compile.pass.cpp          |  27 +
 libcxx/test/support/counting_predicates.h     |  58 +--
 .../generate_feature_test_macro_components.py |   5 +
 76 files changed, 1184 insertions(+), 457 deletions(-)
 mode change 100755 => 100644 libcxx/utils/generate_feature_test_macro_components.py

diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index a89d4038785c..3e6fd643f620 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -420,6 +420,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_algorithms``                         ``202306L``
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_constexpr_forward_list``                       ``202502L``
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_new``                                ``202406L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_queue``                              ``202502L``
diff --git a/libcxx/include/__memory/allocation_guard.h b/libcxx/include/__memory/allocation_guard.h
index 66edcd92ed61..016e1a3a429b 100644
--- a/libcxx/include/__memory/allocation_guard.h
+++ b/libcxx/include/__memory/allocation_guard.h
@@ -49,24 +49,26 @@ struct __allocation_guard {
   using _Size _LIBCPP_NODEBUG    = typename allocator_traits<_Alloc>::size_type;
 
   template <class _AllocT> // we perform the allocator conversion inside the constructor
-  _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n)
       : __alloc_(std::move(__alloc)),
         __n_(__n),
         __ptr_(allocator_traits<_Alloc>::allocate(__alloc_, __n_)) // initialization order is important
   {}
 
-  _LIBCPP_HIDE_FROM_ABI ~__allocation_guard() _NOEXCEPT { __destroy(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__allocation_guard() _NOEXCEPT { __destroy(); }
 
-  _LIBCPP_HIDE_FROM_ABI __allocation_guard(const __allocation_guard&) = delete;
-  _LIBCPP_HIDE_FROM_ABI __allocation_guard(__allocation_guard&& __other) _NOEXCEPT
+  __allocation_guard(const __allocation_guard&)                    = delete;
+  __allocation_guard& operator=(const __allocation_guard& __other) = delete;
+
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __allocation_guard(__allocation_guard&& __other) _NOEXCEPT
       : __alloc_(std::move(__other.__alloc_)),
         __n_(__other.__n_),
         __ptr_(__other.__ptr_) {
     __other.__ptr_ = nullptr;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __allocation_guard& operator=(const __allocation_guard& __other) = delete;
-  _LIBCPP_HIDE_FROM_ABI __allocation_guard& operator=(__allocation_guard&& __other) _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __allocation_guard&
+  operator=(__allocation_guard&& __other) _NOEXCEPT {
     if (std::addressof(__other) != this) {
       __destroy();
 
@@ -79,17 +81,17 @@ struct __allocation_guard {
     return *this;
   }
 
-  _LIBCPP_HIDE_FROM_ABI _Pointer
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Pointer
   __release_ptr() _NOEXCEPT { // not called __release() because it's a keyword in objective-c++
     _Pointer __tmp = __ptr_;
     __ptr_         = nullptr;
     return __tmp;
   }
 
-  _LIBCPP_HIDE_FROM_ABI _Pointer __get() const _NOEXCEPT { return __ptr_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Pointer __get() const _NOEXCEPT { return __ptr_; }
 
 private:
-  _LIBCPP_HIDE_FROM_ABI void __destroy() _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __destroy() _NOEXCEPT {
     if (__ptr_ != nullptr) {
       allocator_traits<_Alloc>::deallocate(__alloc_, __ptr_, __n_);
     }
diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h
index 4ba50898fb37..879b387b9ad1 100644
--- a/libcxx/include/__memory/pointer_traits.h
+++ b/libcxx/include/__memory/pointer_traits.h
@@ -245,8 +245,8 @@ inline _LIBCPP_HIDE_FROM_ABI constexpr auto to_address(_Tp* __p) noexcept {
 }
 
 template <class _Pointer>
-inline _LIBCPP_HIDE_FROM_ABI constexpr auto
-to_address(const _Pointer& __p) noexcept -> decltype(std::__to_address(__p)) {
+inline _LIBCPP_HIDE_FROM_ABI constexpr auto to_address(const _Pointer& __p) noexcept
+    -> decltype(std::__to_address(__p)) {
   return std::__to_address(__p);
 }
 #endif
@@ -302,6 +302,18 @@ concept __resettable_smart_pointer_with_args = requires(_Smart __s, _Pointer __p
 
 #endif
 
+// This function ensures safe conversions between fancy pointers at compile-time, where we avoid casts from/to
+// `__void_pointer` by obtaining the underlying raw pointer from the fancy pointer using `std::to_address`,
+// then dereferencing it to retrieve the pointed-to object, and finally constructing the target fancy pointer
+// to that object using the `std::pointer_traits<>::pinter_to` function.
+template <class _PtrTo, class _PtrFrom>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _PtrTo __static_fancy_pointer_cast(const _PtrFrom& __p) {
+  using __ptr_traits   = pointer_traits<_PtrTo>;
+  using __element_type = typename __ptr_traits::element_type;
+  return __p ? __ptr_traits::pointer_to(*static_cast<__element_type*>(std::addressof(*__p)))
+             : static_cast<_PtrTo>(nullptr);
+}
+
 _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index 5046de27a9da..e9b2c860b89c 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -295,8 +295,8 @@ struct __forward_node_traits {
                 "the _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB macro to silence this diagnostic.");
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI static __begin_node_pointer __as_iter_node(__node_pointer __p) {
-    return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__p));
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI static __begin_node_pointer __as_iter_node(__node_pointer __p) {
+    return std::__static_fancy_pointer_cast<__begin_node_pointer>(__p);
   }
 };
 
@@ -307,11 +307,11 @@ struct __forward_begin_node {
 
   pointer __next_;
 
-  _LIBCPP_HIDE_FROM_ABI __forward_begin_node() : __next_(nullptr) {}
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_begin_node(pointer __n) : __next_(__n) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_begin_node() : __next_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_begin_node(pointer __n) : __next_(__n) {}
 
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __next_as_begin() const {
-    return static_cast<__begin_node_pointer>(__next_);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __next_as_begin() const {
+    return std::__static_fancy_pointer_cast<__begin_node_pointer>(__next_);
   }
 };
 
@@ -335,7 +335,7 @@ private:
   };
 
 public:
-  _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
 #  else
 
 private:
@@ -345,8 +345,8 @@ public:
   _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return *std::__launder(reinterpret_cast<_Tp*>(&__buffer_)); }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_node(_NodePtr __next) : _Base(__next) {}
-  _LIBCPP_HIDE_FROM_ABI ~__forward_list_node() {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_node(_NodePtr __next) : _Base(__next) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__forward_list_node() {}
 };
 
 template <class _Tp, class _Alloc = allocator<_Tp> >
@@ -357,24 +357,26 @@ class __forward_list_const_iterator;
 template <class _NodePtr>
 class __forward_list_iterator {
   typedef __forward_node_traits<_NodePtr> __traits;
+  typedef typename __traits::__node_type __node_type;
+  typedef typename __traits::__begin_node __begin_node_type;
   typedef typename __traits::__node_pointer __node_pointer;
   typedef typename __traits::__begin_node_pointer __begin_node_pointer;
   typedef typename __traits::__void_pointer __void_pointer;
 
   __begin_node_pointer __ptr_;
 
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const {
-    return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__ptr_));
-  }
-  _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
-    return static_cast<__node_pointer>(static_cast<__void_pointer>(__ptr_));
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { return __ptr_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
+    return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_);
   }
 
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(nullptr_t) _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(nullptr_t) _NOEXCEPT
+      : __ptr_(nullptr) {}
 
+  _LIBCPP_CONSTEXPR_SINCE_CXX26
   _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__begin_node_pointer __p) _NOEXCEPT : __ptr_(__p) {}
 
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__node_pointer __p) _NOEXCEPT
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__node_pointer __p) _NOEXCEPT
       : __ptr_(__traits::__as_iter_node(__p)) {}
 
   template <class, class>
@@ -389,27 +391,31 @@ public:
   typedef typename pointer_traits<__node_pointer>::difference_type difference_type;
   typedef __rebind_pointer_t<__node_pointer, value_type> pointer;
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __get_unsafe_node_pointer()->__get_value(); }
-  _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    return __get_unsafe_node_pointer()->__get_value();
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
     return pointer_traits<pointer>::pointer_to(__get_unsafe_node_pointer()->__get_value());
   }
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_iterator& operator++() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator& operator++() {
     __ptr_ = __traits::__as_iter_node(__ptr_->__next_);
     return *this;
   }
-  _LIBCPP_HIDE_FROM_ABI __forward_list_iterator operator++(int) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator operator++(int) {
     __forward_list_iterator __t(*this);
     ++(*this);
     return __t;
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI bool operator==(const __forward_list_iterator& __x, const __forward_list_iterator& __y) {
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+  operator==(const __forward_list_iterator& __x, const __forward_list_iterator& __y) {
     return __x.__ptr_ == __y.__ptr_;
   }
-  friend _LIBCPP_HIDE_FROM_ABI bool operator!=(const __forward_list_iterator& __x, const __forward_list_iterator& __y) {
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+  operator!=(const __forward_list_iterator& __x, const __forward_list_iterator& __y) {
     return !(__x == __y);
   }
 };
@@ -421,23 +427,25 @@ class __forward_list_const_iterator {
 
   typedef __forward_node_traits<_NodePtr> __traits;
   typedef typename __traits::__node_type __node_type;
+  typedef typename __traits::__begin_node __begin_node_type;
   typedef typename __traits::__node_pointer __node_pointer;
   typedef typename __traits::__begin_node_pointer __begin_node_pointer;
   typedef typename __traits::__void_pointer __void_pointer;
 
   __begin_node_pointer __ptr_;
 
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const {
-    return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__ptr_));
-  }
-  _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
-    return static_cast<__node_pointer>(static_cast<__void_pointer>(__ptr_));
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { return __ptr_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
+    return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_);
   }
 
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(nullptr_t) _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(nullptr_t) _NOEXCEPT
+      : __ptr_(nullptr) {}
 
+  _LIBCPP_CONSTEXPR_SINCE_CXX26
   _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(__begin_node_pointer __p) _NOEXCEPT : __ptr_(__p) {}
 
+  _LIBCPP_CONSTEXPR_SINCE_CXX26
   _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(__node_pointer __p) _NOEXCEPT
       : __ptr_(__traits::__as_iter_node(__p)) {}
 
@@ -451,30 +459,32 @@ public:
   typedef typename pointer_traits<__node_pointer>::difference_type difference_type;
   typedef __rebind_pointer_t<__node_pointer, const value_type> pointer;
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {}
-  _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator(__forward_list_iterator<__node_pointer> __p) _NOEXCEPT
-      : __ptr_(__p.__ptr_) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  __forward_list_const_iterator(__forward_list_iterator<__node_pointer> __p) _NOEXCEPT : __ptr_(__p.__ptr_) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __get_unsafe_node_pointer()->__get_value(); }
-  _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    return __get_unsafe_node_pointer()->__get_value();
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
     return pointer_traits<pointer>::pointer_to(__get_unsafe_node_pointer()->__get_value());
   }
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator& operator++() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator& operator++() {
     __ptr_ = __traits::__as_iter_node(__ptr_->__next_);
     return *this;
   }
-  _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator operator++(int) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator operator++(int) {
     __forward_list_const_iterator __t(*this);
     ++(*this);
     return __t;
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI bool
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
   operator==(const __forward_list_const_iterator& __x, const __forward_list_const_iterator& __y) {
     return __x.__ptr_ == __y.__ptr_;
   }
-  friend _LIBCPP_HIDE_FROM_ABI bool
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
   operator!=(const __forward_list_const_iterator& __x, const __forward_list_const_iterator& __y) {
     return !(__x == __y);
   }
@@ -498,48 +508,53 @@ protected:
 
   _LIBCPP_COMPRESSED_PAIR(__begin_node, __before_begin_, __node_allocator, __alloc_);
 
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() _NOEXCEPT {
     return pointer_traits<__begin_node_pointer>::pointer_to(__before_begin_);
   }
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() const _NOEXCEPT {
-    return pointer_traits<__begin_node_pointer>::pointer_to(const_cast<__begin_node&>(__before_begin_));
+
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() const _NOEXCEPT {
+    return pointer_traits<__begin_node_pointer>::pointer_to(
+        *const_cast<__begin_node*>(std::addressof(__before_begin_)));
   }
 
   typedef __forward_list_iterator<__node_pointer> iterator;
   typedef __forward_list_const_iterator<__node_pointer> const_iterator;
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_base() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_base()
+      _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value)
       : __before_begin_(__begin_node()) {}
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const allocator_type& __a)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const allocator_type& __a)
       : __before_begin_(__begin_node()), __alloc_(__node_allocator(__a)) {}
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const __node_allocator& __a)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const __node_allocator& __a)
       : __before_begin_(__begin_node()), __alloc_(__a) {}
 
 public:
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
   __forward_list_base(__forward_list_base&& __x) noexcept(is_nothrow_move_constructible<__node_allocator>::value);
-  _LIBCPP_HIDE_FROM_ABI __forward_list_base(__forward_list_base&& __x, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  __forward_list_base(__forward_list_base&& __x, const allocator_type& __a);
 #  endif // _LIBCPP_CXX03_LANG
 
   __forward_list_base(const __forward_list_base&)            = delete;
   __forward_list_base& operator=(const __forward_list_base&) = delete;
 
-  _LIBCPP_HIDE_FROM_ABI ~__forward_list_base();
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__forward_list_base();
 
 protected:
-  _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x) {
     __copy_assign_alloc(__x, integral_constant<bool, __node_traits::propagate_on_container_copy_assignment::value>());
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x)
       _NOEXCEPT_(!__node_traits::propagate_on_container_move_assignment::value ||
                  is_nothrow_move_assignable<__node_allocator>::value) {
     __move_assign_alloc(__x, integral_constant<bool, __node_traits::propagate_on_container_move_assignment::value>());
   }
 
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI __node_pointer __create_node(__node_pointer __next, _Args&&... __args) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer
+  __create_node(__node_pointer __next, _Args&&... __args) {
     __allocation_guard<__node_allocator> __guard(__alloc_, 1);
     // Begin the lifetime of the node itself. Note that this doesn't begin the lifetime of the value
     // held inside the node, since we need to use the allocator's construct() method for that.
@@ -554,7 +569,7 @@ protected:
     return __guard.__release_ptr();
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
     // For the same reason as above, we use the allocator's destroy() method for the value_type,
     // but not for the node itself.
     __node_traits::destroy(__alloc_, std::addressof(__node->__get_value()));
@@ -563,7 +578,7 @@ protected:
   }
 
 public:
-  _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x)
 #  if _LIBCPP_STD_VER >= 14
       _NOEXCEPT;
 #  else
@@ -571,18 +586,21 @@ public:
 #  endif
 
 protected:
-  _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT;
 
 private:
-  _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base&, false_type) {}
-  _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x, true_type) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base&, false_type) {
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  __copy_assign_alloc(const __forward_list_base& __x, true_type) {
     if (__alloc_ != __x.__alloc_)
       clear();
     __alloc_ = __x.__alloc_;
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base&, false_type) _NOEXCEPT {}
-  _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  __move_assign_alloc(__forward_list_base&, false_type) _NOEXCEPT {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type)
       _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value) {
     __alloc_ = std::move(__x.__alloc_);
   }
@@ -591,14 +609,15 @@ private:
 #  ifndef _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base&& __x) noexcept(
-    is_nothrow_move_constructible<__node_allocator>::value)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(
+    __forward_list_base&& __x) noexcept(is_nothrow_move_constructible<__node_allocator>::value)
     : __before_begin_(std::move(__x.__before_begin_)), __alloc_(std::move(__x.__alloc_)) {
   __x.__before_begin()->__next_ = nullptr;
 }
 
 template <class _Tp, class _Alloc>
-inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base&& __x, const allocator_type& __a)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(
+    __forward_list_base&& __x, const allocator_type& __a)
     : __before_begin_(__begin_node()), __alloc_(__node_allocator(__a)) {
   if (__alloc_ == __x.__alloc_) {
     __before_begin()->__next_     = __x.__before_begin()->__next_;
@@ -609,12 +628,12 @@ inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base
 #  endif // _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-__forward_list_base<_Tp, _Alloc>::~__forward_list_base() {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 __forward_list_base<_Tp, _Alloc>::~__forward_list_base() {
   clear();
 }
 
 template <class _Tp, class _Alloc>
-inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x)
 #  if _LIBCPP_STD_VER >= 14
     _NOEXCEPT
 #  else
@@ -627,7 +646,7 @@ inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x)
 }
 
 template <class _Tp, class _Alloc>
-void __forward_list_base<_Tp, _Alloc>::clear() _NOEXCEPT {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void __forward_list_base<_Tp, _Alloc>::clear() _NOEXCEPT {
   for (__node_pointer __p = __before_begin()->__next_; __p != nullptr;) {
     __node_pointer __next = __p->__next_;
     __delete_node(__p);
@@ -672,105 +691,123 @@ public:
   typedef void __remove_return_type;
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI forward_list() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {
-  } // = default;
-  _LIBCPP_HIDE_FROM_ABI explicit forward_list(const allocator_type& __a);
-  _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list()
+      _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {} // = default;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n);
 #  if _LIBCPP_STD_VER >= 14
-  _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n, const allocator_type& __a);
 #  endif
-  _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v);
 
   template <__enable_if_t<__is_allocator<_Alloc>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v, const allocator_type& __a) : __base(__a) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(size_type __n, const value_type& __v, const allocator_type& __a)
+      : __base(__a) {
     insert_after(cbefore_begin(), __n, __v);
   }
 
   template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l);
 
   template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI forward_list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type())
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type())
       : __base(__a) {
     prepend_range(std::forward<_Range>(__range));
   }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x);
-  _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x, const __type_identity_t<allocator_type>& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(const forward_list& __x, const __type_identity_t<allocator_type>& __a);
 
-  _LIBCPP_HIDE_FROM_ABI forward_list& operator=(const forward_list& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(const forward_list& __x);
 
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible<__base>::value)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible<__base>::value)
       : __base(std::move(__x)) {}
-  _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x, const __type_identity_t<allocator_type>& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(forward_list&& __x, const __type_identity_t<allocator_type>& __a);
 
-  _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list<value_type> __il);
-  _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list<value_type> __il, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list<value_type> __il);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(initializer_list<value_type> __il, const allocator_type& __a);
 
-  _LIBCPP_HIDE_FROM_ABI forward_list& operator=(forward_list&& __x) noexcept(
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(forward_list&& __x) noexcept(
       (__node_traits::propagate_on_container_move_assignment::value &&
        is_nothrow_move_assignable<allocator_type>::value) ||
       allocator_traits<allocator_type>::is_always_equal::value);
 
-  _LIBCPP_HIDE_FROM_ABI forward_list& operator=(initializer_list<value_type> __il);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(initializer_list<value_type> __il);
 
-  _LIBCPP_HIDE_FROM_ABI void assign(initializer_list<value_type> __il);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(initializer_list<value_type> __il);
 #  endif // _LIBCPP_CXX03_LANG
 
   // ~forward_list() = default;
 
   template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> = 0>
-  void _LIBCPP_HIDE_FROM_ABI assign(_InputIterator __f, _InputIterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 void _LIBCPP_HIDE_FROM_ABI assign(_InputIterator __f, _InputIterator __l);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) {
     __assign_with_sentinel(ranges::begin(__range), ranges::end(__range));
   }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v);
 
-  _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return allocator_type(this->__alloc_); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT {
+    return allocator_type(this->__alloc_);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__base::__before_begin()->__next_); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT {
+    return iterator(__base::__before_begin()->__next_);
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT {
     return const_iterator(__base::__before_begin()->__next_);
   }
-  _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(nullptr); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT {
+    return const_iterator(nullptr);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT {
     return const_iterator(__base::__before_begin()->__next_);
   }
-  _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return const_iterator(nullptr); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT {
+    return const_iterator(nullptr);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { return iterator(__base::__before_begin()); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT {
+    return iterator(__base::__before_begin());
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT {
     return const_iterator(__base::__before_begin());
   }
-  _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT {
     return const_iterator(__base::__before_begin());
   }
 
-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT {
+  [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT {
     return __base::__before_begin()->__next_ == nullptr;
   }
-  _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT {
     return std::min<size_type>(__node_traits::max_size(this->__alloc_), numeric_limits<difference_type>::max());
   }
 
-  _LIBCPP_HIDE_FROM_ABI reference front() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() {
     _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list");
     return __base::__before_begin()->__next_->__get_value();
   }
-  _LIBCPP_HIDE_FROM_ABI const_reference front() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const {
     _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list");
     return __base::__before_begin()->__next_->__get_value();
   }
@@ -778,54 +815,59 @@ public:
 #  ifndef _LIBCPP_CXX03_LANG
 #    if _LIBCPP_STD_VER >= 17
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI reference emplace_front(_Args&&... __args);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference emplace_front(_Args&&... __args);
 #    else
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI void emplace_front(_Args&&... __args);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void emplace_front(_Args&&... __args);
 #    endif
-  _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v);
 #  endif // _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) {
     insert_range_after(cbefore_begin(), std::forward<_Range>(__range));
   }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI void pop_front();
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void pop_front();
 
 #  ifndef _LIBCPP_CXX03_LANG
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI iterator emplace_after(const_iterator __p, _Args&&... __args);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator emplace_after(const_iterator __p, _Args&&... __args);
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, value_type&& __v);
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, initializer_list<value_type> __il) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, value_type&& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert_after(const_iterator __p, initializer_list<value_type> __il) {
     return insert_after(__p, __il.begin(), __il.end());
   }
 #  endif // _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, const value_type& __v);
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, size_type __n, const value_type& __v) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert_after(const_iterator __p, size_type __n, const value_type& __v) {
     return __insert_after(__p, __n, __v);
   }
   template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI iterator insert_range_after(const_iterator __position, _Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert_range_after(const_iterator __position, _Range&& __range) {
     return __insert_after_with_sentinel(__position, ranges::begin(__range), ranges::end(__range));
   }
 #  endif
 
   template <class _InputIterator, class _Sentinel>
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  __insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l);
 
-  _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __p);
-  _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __f, const_iterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __p);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __f, const_iterator __l);
 
-  _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x)
 #  if _LIBCPP_STD_VER >= 14
       _NOEXCEPT
 #  else
@@ -835,58 +877,63 @@ public:
     __base::swap(__x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI void resize(size_type __n);
-  _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v);
-  _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); }
 
-  _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x);
-  _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x, const_iterator __i);
-  _LIBCPP_HIDE_FROM_ABI void
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  splice_after(const_iterator __p, forward_list&& __x, const_iterator __i);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
   splice_after(const_iterator __p, forward_list&& __x, const_iterator __f, const_iterator __l);
-  _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x);
-  _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x, const_iterator __i);
-  _LIBCPP_HIDE_FROM_ABI void
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  splice_after(const_iterator __p, forward_list& __x, const_iterator __i);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
   splice_after(const_iterator __p, forward_list& __x, const_iterator __f, const_iterator __l);
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __v);
   template <class _Predicate>
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Predicate __pred);
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Predicate __pred);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); }
   template <class _BinaryPredicate>
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPredicate __binary_pred);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPredicate __binary_pred);
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x) { merge(__x, __less<>()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x) { merge(__x, __less<>()); }
   template <class _Compare>
-  _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x, _Compare __comp) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x, _Compare __comp) {
     merge(__x, std::move(__comp));
   }
 #  endif // _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x) { merge(__x, __less<>()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x) { merge(__x, __less<>()); }
   template <class _Compare>
-  _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x, _Compare __comp);
-  _LIBCPP_HIDE_FROM_ABI void sort() { sort(__less<>()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x, _Compare __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort() { sort(__less<>()); }
   template <class _Compare>
-  _LIBCPP_HIDE_FROM_ABI void sort(_Compare __comp);
-  _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort(_Compare __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT;
 
 private:
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, true_type)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, true_type)
       _NOEXCEPT_(is_nothrow_move_assignable<allocator_type>::value);
-  _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, false_type);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, false_type);
 #  endif // _LIBCPP_CXX03_LANG
 
   template <class _Iter, class _Sent>
-  _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iter __f, _Sent __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iter __f, _Sent __l);
 
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_after(const_iterator __p, size_type __n, _Args&&... __args);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  __insert_after(const_iterator __p, size_type __n, _Args&&... __args);
 
   template <class _Compare>
-  static _LIBCPP_HIDE_FROM_ABI __node_pointer __merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 static _LIBCPP_HIDE_FROM_ABI __node_pointer
+  __merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp);
 
   // TODO: Make this _LIBCPP_HIDE_FROM_ABI
   template <class _Compare>
-  static _LIBCPP_HIDDEN __node_pointer __sort(__node_pointer __f, difference_type __sz, _Compare& __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 static _LIBCPP_HIDDEN __node_pointer
+  __sort(__node_pointer __f, difference_type __sz, _Compare& __comp);
 };
 
 #  if _LIBCPP_STD_VER >= 17
@@ -911,10 +958,10 @@ forward_list(from_range_t, _Range&&, _Alloc = _Alloc()) -> forward_list<ranges::
 #  endif
 
 template <class _Tp, class _Alloc>
-inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : __base(__a) {}
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : __base(__a) {}
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(size_type __n) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n) {
   if (__n > 0) {
     for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) {
       __p->__next_ = this->__create_node(/* next = */ nullptr);
@@ -924,7 +971,8 @@ forward_list<_Tp, _Alloc>::forward_list(size_type __n) {
 
 #  if _LIBCPP_STD_VER >= 14
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc) : __base(__base_alloc) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc)
+    : __base(__base_alloc) {
   if (__n > 0) {
     for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) {
       __p->__next_ = this->__create_node(/* next = */ nullptr);
@@ -934,37 +982,39 @@ forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __b
 #  endif
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(size_type __n, const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n, const value_type& __v) {
   insert_after(cbefore_begin(), __n, __v);
 }
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> >
-forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l) {
   insert_after(cbefore_begin(), __f, __l);
 }
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> >
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a)
     : __base(__a) {
   insert_after(cbefore_begin(), __f, __l);
 }
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x)
     : __base(__node_traits::select_on_container_copy_construction(__x.__alloc_)) {
   insert_after(cbefore_begin(), __x.begin(), __x.end());
 }
 
 template <class _Tp, class _Alloc>
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x, const __type_identity_t<allocator_type>& __a)
     : __base(__a) {
   insert_after(cbefore_begin(), __x.begin(), __x.end());
 }
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_list& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_list& __x) {
   if (this != std::addressof(__x)) {
     __base::__copy_assign_alloc(__x);
     assign(__x.begin(), __x.end());
@@ -974,6 +1024,7 @@ forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_li
 
 #  ifndef _LIBCPP_CXX03_LANG
 template <class _Tp, class _Alloc>
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 forward_list<_Tp, _Alloc>::forward_list(forward_list&& __x, const __type_identity_t<allocator_type>& __a)
     : __base(std::move(__x), __a) {
   if (this->__alloc_ != __x.__alloc_) {
@@ -983,17 +1034,19 @@ forward_list<_Tp, _Alloc>::forward_list(forward_list&& __x, const __type_identit
 }
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(initializer_list<value_type> __il) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(initializer_list<value_type> __il) {
   insert_after(cbefore_begin(), __il.begin(), __il.end());
 }
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(initializer_list<value_type> __il, const allocator_type& __a) : __base(__a) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26
+forward_list<_Tp, _Alloc>::forward_list(initializer_list<value_type> __il, const allocator_type& __a)
+    : __base(__a) {
   insert_after(cbefore_begin(), __il.begin(), __il.end());
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type)
     _NOEXCEPT_(is_nothrow_move_assignable<allocator_type>::value) {
   clear();
   __base::__move_assign_alloc(__x);
@@ -1002,7 +1055,7 @@ void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type)
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) {
   if (this->__alloc_ == __x.__alloc_)
     __move_assign(__x, true_type());
   else {
@@ -1012,7 +1065,8 @@ void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) {
 }
 
 template <class _Tp, class _Alloc>
-inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) noexcept(
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>&
+forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) noexcept(
     (__node_traits::propagate_on_container_move_assignment::value &&
      is_nothrow_move_assignable<allocator_type>::value) ||
     allocator_traits<allocator_type>::is_always_equal::value) {
@@ -1021,7 +1075,8 @@ inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_l
 }
 
 template <class _Tp, class _Alloc>
-inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(initializer_list<value_type> __il) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>&
+forward_list<_Tp, _Alloc>::operator=(initializer_list<value_type> __il) {
   assign(__il.begin(), __il.end());
   return *this;
 }
@@ -1030,13 +1085,14 @@ inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(initializ
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> >
-void forward_list<_Tp, _Alloc>::assign(_InputIterator __f, _InputIterator __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::assign(_InputIterator __f, _InputIterator __l) {
   __assign_with_sentinel(__f, __l);
 }
 
 template <class _Tp, class _Alloc>
 template <class _Iter, class _Sent>
-_LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::__assign_with_sentinel(_Iter __f, _Sent __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+forward_list<_Tp, _Alloc>::__assign_with_sentinel(_Iter __f, _Sent __l) {
   iterator __i = before_begin();
   iterator __j = std::next(__i);
   iterator __e = end();
@@ -1049,7 +1105,7 @@ _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::__assign_with_sentinel(_It
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) {
   iterator __i = before_begin();
   iterator __j = std::next(__i);
   iterator __e = end();
@@ -1064,18 +1120,19 @@ void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) {
 #  ifndef _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-inline void forward_list<_Tp, _Alloc>::assign(initializer_list<value_type> __il) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void forward_list<_Tp, _Alloc>::assign(initializer_list<value_type> __il) {
   assign(__il.begin(), __il.end());
 }
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 #    if _LIBCPP_STD_VER >= 17
-typename forward_list<_Tp, _Alloc>::reference
+    typename forward_list<_Tp, _Alloc>::reference
 #    else
-void
+    void
 #    endif
-forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) {
+    forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) {
   __base::__before_begin()->__next_ =
       this->__create_node(/* next = */ __base::__before_begin()->__next_, std::forward<_Args>(__args)...);
 #    if _LIBCPP_STD_VER >= 17
@@ -1084,7 +1141,7 @@ forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) {
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) {
   __base::__before_begin()->__next_ =
       this->__create_node(/* next = */ __base::__before_begin()->__next_, std::move(__v));
 }
@@ -1092,12 +1149,12 @@ void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) {
 #  endif // _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) {
   __base::__before_begin()->__next_ = this->__create_node(/* next = */ __base::__before_begin()->__next_, __v);
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::pop_front() {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::pop_front() {
   _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::pop_front called on an empty list");
   __node_pointer __p                = __base::__before_begin()->__next_;
   __base::__before_begin()->__next_ = __p->__next_;
@@ -1108,7 +1165,7 @@ void forward_list<_Tp, _Alloc>::pop_front() {
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... __args) {
   __begin_node_pointer const __r = __p.__get_begin();
   __r->__next_                   = this->__create_node(/* next = */ __r->__next_, std::forward<_Args>(__args)...);
@@ -1116,7 +1173,7 @@ forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... __args)
 }
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) {
   __begin_node_pointer const __r = __p.__get_begin();
   __r->__next_                   = this->__create_node(/* next = */ __r->__next_, std::move(__v));
@@ -1126,7 +1183,7 @@ forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) {
 #  endif // _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, const value_type& __v) {
   __begin_node_pointer const __r = __p.__get_begin();
   __r->__next_                   = this->__create_node(/* next = */ __r->__next_, __v);
@@ -1135,7 +1192,7 @@ forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, const value_type& __
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Args&&... __args) {
   __begin_node_pointer __r = __p.__get_begin();
   if (__n > 0) {
@@ -1159,21 +1216,21 @@ forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Ar
 #  endif // _LIBCPP_HAS_EXCEPTIONS
     __last->__next_ = __r->__next_;
     __r->__next_    = __first;
-    __r             = static_cast<__begin_node_pointer>(__last);
+    __r             = __forward_node_traits<__node_pointer>::__as_iter_node(__last);
   }
   return iterator(__r);
 }
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> >
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l) {
   return __insert_after_with_sentinel(__p, std::move(__f), std::move(__l));
 }
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, class _Sentinel>
-_LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l) {
   __begin_node_pointer __r = __p.__get_begin();
 
@@ -1200,14 +1257,15 @@ forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _Inp
 
     __last->__next_ = __r->__next_;
     __r->__next_    = __first;
-    __r             = static_cast<__begin_node_pointer>(__last);
+    __r             = __forward_node_traits<__node_pointer>::__as_iter_node(__last);
   }
 
   return iterator(__r);
 }
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
+forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) {
   __begin_node_pointer __p = __f.__get_begin();
   __node_pointer __n       = __p->__next_;
   __p->__next_             = __n->__next_;
@@ -1216,7 +1274,7 @@ typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::erase_af
 }
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::erase_after(const_iterator __f, const_iterator __l) {
   __node_pointer __e = __l.__get_unsafe_node_pointer();
   if (__f != __l) {
@@ -1236,7 +1294,7 @@ forward_list<_Tp, _Alloc>::erase_after(const_iterator __f, const_iterator __l) {
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::resize(size_type __n) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::resize(size_type __n) {
   size_type __sz = 0;
   iterator __p   = before_begin();
   iterator __i   = begin();
@@ -1250,7 +1308,7 @@ void forward_list<_Tp, _Alloc>::resize(size_type __n) {
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) {
   size_type __sz = 0;
   iterator __p   = before_begin();
   iterator __i   = begin();
@@ -1264,7 +1322,7 @@ void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) {
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& __x) {
   if (!__x.empty()) {
     if (__p.__get_begin()->__next_ != nullptr) {
       const_iterator __lm1 = __x.before_begin();
@@ -1278,7 +1336,8 @@ void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& _
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /*__other*/, const_iterator __i) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void
+forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /*__other*/, const_iterator __i) {
   const_iterator __lm1 = std::next(__i);
   if (__p != __i && __p != __lm1) {
     __i.__get_begin()->__next_   = __lm1.__get_begin()->__next_;
@@ -1288,7 +1347,7 @@ void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::splice_after(
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::splice_after(
     const_iterator __p, forward_list& /*__other*/, const_iterator __f, const_iterator __l) {
   if (__f != __l && __p != __f) {
     const_iterator __lm1 = __f;
@@ -1303,24 +1362,26 @@ void forward_list<_Tp, _Alloc>::splice_after(
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void
+forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x) {
   splice_after(__p, __x);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void
 forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x, const_iterator __i) {
   splice_after(__p, __x, __i);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(
     const_iterator __p, forward_list&& __x, const_iterator __f, const_iterator __l) {
   splice_after(__p, __x, __f, __l);
 }
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::remove(const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type
+forward_list<_Tp, _Alloc>::remove(const value_type& __v) {
   forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0;
   const iterator __e                                            = end();
@@ -1343,7 +1404,8 @@ typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Allo
 
 template <class _Tp, class _Alloc>
 template <class _Predicate>
-typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type
+forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred) {
   forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0;
   const iterator __e                                            = end();
@@ -1366,7 +1428,7 @@ typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Allo
 
 template <class _Tp, class _Alloc>
 template <class _BinaryPredicate>
-typename forward_list<_Tp, _Alloc>::__remove_return_type
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type
 forward_list<_Tp, _Alloc>::unique(_BinaryPredicate __binary_pred) {
   forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0;
@@ -1384,7 +1446,7 @@ forward_list<_Tp, _Alloc>::unique(_BinaryPredicate __binary_pred) {
 
 template <class _Tp, class _Alloc>
 template <class _Compare>
-void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) {
   if (this != std::addressof(__x)) {
     __base::__before_begin()->__next_ =
         __merge(__base::__before_begin()->__next_, __x.__before_begin()->__next_, __comp);
@@ -1394,7 +1456,7 @@ void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) {
 
 template <class _Tp, class _Alloc>
 template <class _Compare>
-typename forward_list<_Tp, _Alloc>::__node_pointer
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__node_pointer
 forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp) {
   if (__f1 == nullptr)
     return __f2;
@@ -1431,13 +1493,13 @@ forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2, _Co
 
 template <class _Tp, class _Alloc>
 template <class _Compare>
-inline void forward_list<_Tp, _Alloc>::sort(_Compare __comp) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void forward_list<_Tp, _Alloc>::sort(_Compare __comp) {
   __base::__before_begin()->__next_ = __sort(__base::__before_begin()->__next_, std::distance(begin(), end()), __comp);
 }
 
 template <class _Tp, class _Alloc>
 template <class _Compare>
-typename forward_list<_Tp, _Alloc>::__node_pointer
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__node_pointer
 forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Compare& __comp) {
   switch (__sz) {
   case 0:
@@ -1461,7 +1523,7 @@ forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Co
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT {
   __node_pointer __p = __base::__before_begin()->__next_;
   if (__p != nullptr) {
     __node_pointer __f = __p->__next_;
@@ -1477,7 +1539,8 @@ void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT {
 }
 
 template <class _Tp, class _Alloc>
-_LIBCPP_HIDE_FROM_ABI bool operator==(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+operator==(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   typedef forward_list<_Tp, _Alloc> _Cp;
   typedef typename _Cp::const_iterator _Ip;
   _Ip __ix = __x.begin();
@@ -1493,31 +1556,31 @@ _LIBCPP_HIDE_FROM_ABI bool operator==(const forward_list<_Tp, _Alloc>& __x, cons
 #  if _LIBCPP_STD_VER <= 17
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator!=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return !(__x == __y);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator<(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return std::lexicographical_compare(__x.begin(), __x.end(), __y.begin(), __y.end());
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator>(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return __y < __x;
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator>=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return !(__x < __y);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return !(__y < __x);
 }
@@ -1525,7 +1588,7 @@ operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>
 #  else // #if _LIBCPP_STD_VER <= 17
 
 template <class _Tp, class _Allocator>
-_LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp>
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp>
 operator<=>(const forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _Allocator>& __y) {
   return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
 }
@@ -1533,20 +1596,20 @@ operator<=>(const forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _A
 #  endif // #if _LIBCPP_STD_VER <= 17
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI void swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y)
-    _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void
+swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y) _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) {
   __x.swap(__y);
 }
 
 #  if _LIBCPP_STD_VER >= 20
 template <class _Tp, class _Allocator, class _Predicate>
-inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type
 erase_if(forward_list<_Tp, _Allocator>& __c, _Predicate __pred) {
   return __c.remove_if(__pred);
 }
 
 template <class _Tp, class _Allocator, class _Up>
-inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type
 erase(forward_list<_Tp, _Allocator>& __c, const _Up& __v) {
   return std::erase_if(__c, [&](const auto& __elem) -> bool { return __elem == __v; });
 }
diff --git a/libcxx/include/version b/libcxx/include/version
index 65fae111dc8e..87c4ede9a7e5 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -68,6 +68,7 @@ __cpp_lib_constexpr_charconv                            202207L <charconv>
 __cpp_lib_constexpr_cmath                               202202L <cmath> <cstdlib>
 __cpp_lib_constexpr_complex                             201711L <complex>
 __cpp_lib_constexpr_dynamic_alloc                       201907L <memory>
+__cpp_lib_constexpr_forward_list                        202502L <forward_list>
 __cpp_lib_constexpr_functional                          201907L <functional>
 __cpp_lib_constexpr_iterator                            201811L <iterator>
 __cpp_lib_constexpr_memory                              202202L <memory>
@@ -543,6 +544,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_bitset                               202306L
 # undef  __cpp_lib_constexpr_algorithms
 # define __cpp_lib_constexpr_algorithms                 202306L
+# define __cpp_lib_constexpr_forward_list               202502L
 # if !defined(_LIBCPP_ABI_VCRUNTIME)
 #   define __cpp_lib_constexpr_new                      202406L
 # endif
diff --git a/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp
index 52adfc4d8598..a9ef855e9a73 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp
@@ -11,7 +11,7 @@
 
 // template<class T, class Allocator>
 //   synth-three-way-result<T> operator<=>(const forward_list<T, Allocator>& x,
-//                                         const forward_list<T, Allocator>& y);
+//                                         const forward_list<T, Allocator>& y); // constexpr since C++26
 
 #include <cassert>
 #include <forward_list>
@@ -20,6 +20,9 @@
 
 int main(int, char**) {
   assert(test_sequence_container_spaceship<std::forward_list>());
-  // `std::forward_list` is not constexpr, so no `static_assert` test here.
+#if TEST_STD_VER >= 26
+  static_assert(test_sequence_container_spaceship<std::forward_list>());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp
index dbc0631d1193..4482d26f308a 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp
@@ -10,7 +10,7 @@
 
 // class forward_list
 
-// bool empty() const noexcept;
+// bool empty() const noexcept; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef std::forward_list<int> C;
     C c;
@@ -42,5 +42,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp
index 757db7d957f5..50b549f17d56 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp
@@ -8,17 +8,18 @@
 
 // <forward_list>
 
-// reference       front();
-// const_reference front() const;
+// reference       front();       // constexpr since C++26
+// const_reference front() const; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
 #include <iterator>
 
+#include "test_allocator.h"
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -58,5 +59,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp
index 31893a1b9599..4645560048cf 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// explicit forward_list(const allocator_type& a);
+// explicit forward_list(const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_allocator.h"
 #include "../../../NotConstructible.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef test_allocator<NotConstructible> A;
     typedef A::value_type T;
@@ -26,5 +26,14 @@ int main(int, char**) {
     assert(c.empty());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp
index bfb330fdaf9f..ffc6d37f2816 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// explicit forward_list(const allocator_type& a);
+// explicit forward_list(const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "../../../NotConstructible.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef test_allocator<NotConstructible> A;
     typedef A::value_type T;
@@ -46,5 +46,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp
index 27d450c63dca..b99af4ccb79e 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list& operator=(const forward_list& x);
+// forward_list& operator=(const forward_list& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<int> A;
@@ -143,5 +143,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp
index 1cdcca82d335..ea2802b323a9 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// void assign(initializer_list<value_type> il);
+// void assign(initializer_list<value_type> il); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -65,5 +65,14 @@ int main(int, char**) {
     assert(n == 4);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp
index 998a7e11ef34..9c88db6166ba 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list& operator=(forward_list&& x);
+// forward_list& operator=(forward_list&& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -21,7 +21,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef test_allocator<T> A;
@@ -194,5 +194,14 @@ int main(int, char**) {
     assert(c0.empty());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp
index a22d6c4985bc..d21898dc4663 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list& operator=(initializer_list<value_type> il);
+// forward_list& operator=(initializer_list<value_type> il); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -65,5 +65,14 @@ int main(int, char**) {
     assert(n == 4);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp
index 9a3532874079..1601b4b47acd 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class InputIterator>
-//     void assign(InputIterator first, InputIterator last);
+//     void assign(InputIterator first, InputIterator last); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_iterators.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -75,5 +75,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp
index b0fbfa3249e5..75626b47c527 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void assign(size_type n, const value_type& v);
+// void assign(size_type n, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -65,5 +65,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp
index 22d5054b9ae1..12d701bff4b6 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list(const forward_list& x);
+// forward_list(const forward_list& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<int> A;
@@ -64,5 +64,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp
index a61233e4b5d2..fc3ff485b066 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list(const forward_list& x, const allocator_type& a);
+// forward_list(const forward_list& x, const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<int> A;
@@ -64,5 +64,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp
index b493a89b7800..e0ea8bf66cb3 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list();
+// forward_list(); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -38,5 +38,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp
index 312f6dbad355..d1e1734e86f9 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp
@@ -9,14 +9,14 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   forward_list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23
+//   forward_list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23; constexpr since C++26
 
 #include <forward_list>
 
 #include "../../from_range_sequence_containers.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   for_all_iterators_and_allocators<int>([]<class Iter, class Sent, class Alloc>() {
     test_sequence_container<std::forward_list, int, Iter, Sent, Alloc>([](const auto&) {
       // No additional validation to do.
@@ -26,8 +26,19 @@ int main(int, char**) {
 
   static_assert(test_constraints<std::forward_list, int, double>());
 
-  test_exception_safety_throwing_copy<std::forward_list>();
-  test_exception_safety_throwing_allocator<std::forward_list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_exception_safety_throwing_copy<std::forward_list>();
+    test_exception_safety_throwing_allocator<std::forward_list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp
index b42242b0a83d..b7acf60aa70c 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list(initializer_list<value_type> il);
+// forward_list(initializer_list<value_type> il); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -38,5 +38,14 @@ int main(int, char**) {
     assert(n == 10);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp
index 0b29cbfa9254..33d569c921a9 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list(initializer_list<value_type> il, const allocator_type& a);
+// forward_list(initializer_list<value_type> il, const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<T> A;
@@ -43,5 +43,14 @@ int main(int, char**) {
     assert(c.get_allocator() == A());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp
index 762e252ca76f..20575479f735 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list(forward_list&& x);
+// forward_list(forward_list&& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -21,7 +21,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef test_allocator<T> A;
@@ -68,5 +68,14 @@ int main(int, char**) {
     assert(c.get_allocator() == A());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp
index a9bc2cb12f28..219505bf4fd1 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list(forward_list&& x, const allocator_type& a);
+// forward_list(forward_list&& x, const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -21,7 +21,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef test_allocator<T> A;
@@ -68,5 +68,14 @@ int main(int, char**) {
     assert(c.get_allocator() == A());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp
index ebd0e6a5bd1e..61393eb28938 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class InputIterator>
-//     forward_list(InputIterator first, InputIterator last);
+//     forward_list(InputIterator first, InputIterator last); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_iterators.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -45,5 +45,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp
index 4a28041ad2cb..c0637420e328 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp
@@ -10,7 +10,7 @@
 
 // template <class InputIterator>
 //     forward_list(InputIterator first, InputIterator last,
-//                  const allocator_type& a);
+//                  const allocator_type& a);                 // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -21,7 +21,7 @@
 #include "test_iterators.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<T> A;
@@ -51,5 +51,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp
index 81b128d2149e..206854560c19 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp
@@ -8,8 +8,8 @@
 
 // <forward_list>
 
-// explicit forward_list(size_type n);
-// explicit forward_list(size_type n, const Alloc& a);
+// explicit forward_list(size_type n);                 // constexpr since C++26
+// explicit forward_list(size_type n, const Alloc& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp
index 663422d1c3c3..85d11e3f40a2 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list(size_type n, const value_type& v);
+// forward_list(size_type n, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -42,5 +42,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp
index af7f7471d4c9..abcdf62452b8 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list(size_type n, const value_type& v, const allocator_type& a);
+// forward_list(size_type n, const value_type& v, const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef test_allocator<int> A;
     typedef A::value_type T;
@@ -47,5 +47,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp
index 1044d779220e..86d7769fe16e 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp
@@ -11,7 +11,7 @@
 
 // template <class T, class Allocator, class U>
 //   typename forward_list<T, Allocator>::size_type
-//   erase(forward_list<T, Allocator>& c, const U& value);
+//   erase(forward_list<T, Allocator>& c, const U& value); // constexpr since C++26
 
 #include <forward_list>
 #include <optional>
@@ -21,14 +21,14 @@
 #include "min_allocator.h"
 
 template <class S, class U>
-void test0(S s, U val, S expected, std::size_t expected_erased_count) {
+TEST_CONSTEXPR_CXX26 void test0(S s, U val, S expected, std::size_t expected_erased_count) {
   ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase(s, val)));
   assert(expected_erased_count == std::erase(s, val));
   assert(s == expected);
 }
 
 template <class S>
-void test() {
+TEST_CONSTEXPR_CXX26 void test() {
   test0(S(), 1, S(), 0);
 
   test0(S({1}), 1, S(), 1);
@@ -62,13 +62,21 @@ void test() {
   test0(S({1, 2, 1}), opt(3), S({1, 2, 1}), 0);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   test<std::forward_list<int>>();
   test<std::forward_list<int, min_allocator<int>>>();
   test<std::forward_list<int, test_allocator<int>>>();
-
   test<std::forward_list<long>>();
   test<std::forward_list<double>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp
index c4f45a1069a2..c665f9cccbf0 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp
@@ -11,7 +11,7 @@
 
 // template <class T, class Allocator, class Predicate>
 //   typename forward_list<T, Allocator>::size_type
-//   erase_if(forward_list<T, Allocator>& c, Predicate pred);
+//   erase_if(forward_list<T, Allocator>& c, Predicate pred); // constexpr since C++26
 
 #include <forward_list>
 
@@ -20,14 +20,14 @@
 #include "min_allocator.h"
 
 template <class S, class Pred>
-void test0(S s, Pred p, S expected, std::size_t expected_erased_count) {
+TEST_CONSTEXPR_CXX26 void test0(S s, Pred p, S expected, std::size_t expected_erased_count) {
   ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase_if(s, p)));
   assert(expected_erased_count == std::erase_if(s, p));
   assert(s == expected);
 }
 
 template <typename S>
-void test() {
+TEST_CONSTEXPR_CXX26 void test() {
   auto is1   = [](auto v) { return v == 1; };
   auto is2   = [](auto v) { return v == 2; };
   auto is3   = [](auto v) { return v == 3; };
@@ -64,13 +64,21 @@ void test() {
   test0(S({1, 2, 3}), False, S({1, 2, 3}), 0);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   test<std::forward_list<int>>();
   test<std::forward_list<int, min_allocator<int>>>();
   test<std::forward_list<int, test_allocator<int>>>();
-
   test<std::forward_list<long>>();
   test<std::forward_list<double>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp
index d66d2cd87951..52b5d87860aa 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp
@@ -8,9 +8,9 @@
 
 // <forward_list>
 
-// iterator       before_begin();
-// const_iterator before_begin() const;
-// const_iterator cbefore_begin() const;
+// iterator       before_begin();        // constexpr since C++26
+// const_iterator before_begin() const;  // constexpr since C++26
+// const_iterator cbefore_begin() const; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -101,5 +101,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp
index 135689b2321c..560c47b17958 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp
@@ -8,12 +8,12 @@
 
 // <forward_list>
 
-// iterator       begin();
-// iterator       end();
-// const_iterator begin()  const;
-// const_iterator end()    const;
-// const_iterator cbegin() const;
-// const_iterator cend()   const;
+// iterator       begin();        // constexpr since C++26
+// iterator       end();          // constexpr since C++26
+// const_iterator begin()  const; // constexpr since C++26
+// const_iterator end()    const; // constexpr since C++26
+// const_iterator cbegin() const; // constexpr since C++26
+// const_iterator cend()   const; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -22,7 +22,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -69,6 +69,8 @@ int main(int, char**) {
     typedef std::forward_list<T> C;
     C::iterator i;
     C::const_iterator j;
+    (void)i;
+    (void)j;
   }
 #if TEST_STD_VER >= 11
   {
@@ -117,6 +119,8 @@ int main(int, char**) {
     typedef std::forward_list<T, min_allocator<T>> C;
     C::iterator i;
     C::const_iterator j;
+    (void)i;
+    (void)j;
   }
 #endif
 #if TEST_STD_VER > 11
@@ -142,5 +146,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp
index a27cc757025b..9a3adec1d975 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   constexpr void prepend_range(R&& rg); // C++23
+//   constexpr void prepend_range(R&& rg); // C++23; constexpr since C++26
 
 #include <forward_list>
 
@@ -21,7 +21,7 @@
 //   {empty/one-element/full} container);
 // - prepending move-only elements;
 // - an exception is thrown when copying the elements or when allocating new elements.
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_assign_range<std::forward_list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -31,8 +31,19 @@ int main(int, char**) {
   });
   test_sequence_prepend_range_move_only<std::forward_list>();
 
-  test_prepend_range_exception_safety_throwing_copy<std::forward_list>();
-  test_prepend_range_exception_safety_throwing_allocator<std::forward_list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_prepend_range_exception_safety_throwing_copy<std::forward_list>();
+    test_prepend_range_exception_safety_throwing_allocator<std::forward_list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp
index 9f6d34b701df..2e1768cf8bad 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void clear() noexcept;
+// void clear() noexcept; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "../../../NotConstructible.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef NotConstructible T;
     typedef std::forward_list<T> C;
@@ -64,5 +64,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp
index f77d47ee7c74..6433607af9b3 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp
@@ -11,7 +11,7 @@
 // <forward_list>
 
 // template <class... Args>
-//     iterator emplace_after(const_iterator p, Args&&... args);
+//     iterator emplace_after(const_iterator p, Args&&... args); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -20,7 +20,7 @@
 #include "../../../Emplaceable.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef Emplaceable T;
     typedef std::forward_list<T> C;
@@ -84,5 +84,14 @@ int main(int, char**) {
     assert(std::distance(c.begin(), c.end()) == 4);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp
index cd3bb20c52ae..46ae27b43622 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// template <class... Args> reference emplace_front(Args&&... args);
+// template <class... Args> reference emplace_front(Args&&... args); // constexpr since C++26
 // return type is 'reference' in C++17; 'void' before
 
 #include <forward_list>
@@ -21,7 +21,7 @@
 #include "../../../Emplaceable.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef Emplaceable T;
     typedef std::forward_list<T> C;
@@ -67,5 +67,14 @@ int main(int, char**) {
     assert(std::distance(c.begin(), c.end()) == 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp
index e85951798526..73cb03c2cb7d 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// iterator erase_after(const_iterator first, const_iterator last);
+// iterator erase_after(const_iterator first, const_iterator last); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -153,5 +153,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp
index 892228e76def..12997f1dad3b 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// iterator erase_after(const_iterator p);
+// iterator erase_after(const_iterator p); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -95,5 +95,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp
index 8443158413e7..d93789dd6bb5 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// iterator insert_after(const_iterator p, const value_type& v);
+// iterator insert_after(const_iterator p, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -84,5 +84,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp
index de924a10c18f..54be47f4264f 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// iterator insert_after(const_iterator p, initializer_list<value_type> il);
+// iterator insert_after(const_iterator p, initializer_list<value_type> il); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -70,5 +70,14 @@ int main(int, char**) {
     assert(*std::next(c.begin(), 4) == 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp
index af810d0f6961..f89fbd7619da 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp
@@ -10,7 +10,7 @@
 
 // template <class InputIterator>
 //     iterator insert_after(const_iterator p,
-//                           InputIterator first, InputIterator last);
+//                           InputIterator first, InputIterator last); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_iterators.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -77,5 +77,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp
index acd4bc73f724..01b76f5cd64f 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// iterator insert_after(const_iterator p, value_type&& v);
+// iterator insert_after(const_iterator p, value_type&& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef std::forward_list<T> C;
@@ -85,5 +85,14 @@ int main(int, char**) {
     assert(std::distance(c.begin(), c.end()) == 4);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp
index 2506f04311e0..f4f0521ad237 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// iterator insert_after(const_iterator p, size_type n, const value_type& v);
+// iterator insert_after(const_iterator p, size_type n, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -70,5 +70,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp
index 25f4c43f3848..71a291430b43 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp
@@ -8,8 +8,10 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
+
 // template<container-compatible-range<T> R>
-//   constexpr iterator insert_range_after(const_iterator position, R&& rg); // C++23
+//   constexpr iterator insert_range_after(const_iterator position, R&& rg); // C++23; constexpr since C++26
 
 #include <forward_list>
 
@@ -321,7 +323,7 @@ constexpr void test_sequence_insert_range_after() {
   }
 }
 
-void test_sequence_insert_range_after_move_only() {
+TEST_CONSTEXPR_CXX26 void test_sequence_insert_range_after_move_only() {
   MoveOnly input[5];
   std::ranges::subrange in(std::move_iterator{input}, std::move_iterator{input + 5});
 
@@ -366,7 +368,7 @@ void test_insert_range_after_exception_safety_throwing_allocator() {
 #endif
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_insert_range_after<std::forward_list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -374,8 +376,19 @@ int main(int, char**) {
   });
   test_sequence_insert_range_after_move_only();
 
-  test_insert_range_after_exception_safety_throwing_copy();
-  test_insert_range_after_exception_safety_throwing_allocator<int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_insert_range_after_exception_safety_throwing_copy();
+    test_insert_range_after_exception_safety_throwing_allocator<int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp
index 98c7a2634117..9fcade7ff6bb 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void pop_front();
+// void pop_front(); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -71,5 +71,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp
index 418aa72052ba..c4b9cd9bdfc4 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   constexpr void prepend_range(R&& rg); // C++23
+//   constexpr void prepend_range(R&& rg); // C++23; constexpr since C++26
 
 #include <forward_list>
 
@@ -21,7 +21,7 @@
 //   {empty/one-element/full} container);
 // - prepending move-only elements;
 // - an exception is thrown when copying the elements or when allocating new elements.
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_prepend_range<std::forward_list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -31,8 +31,19 @@ int main(int, char**) {
   });
   test_sequence_prepend_range_move_only<std::forward_list>();
 
-  test_prepend_range_exception_safety_throwing_copy<std::forward_list>();
-  test_prepend_range_exception_safety_throwing_allocator<std::forward_list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_prepend_range_exception_safety_throwing_copy<std::forward_list>();
+    test_prepend_range_exception_safety_throwing_allocator<std::forward_list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp
index f99c40fa0c1a..61c5dcac0545 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void push_front(const value_type& v);
+// void push_front(const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -44,5 +44,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp
index 467037465eed..cd24d6ff6af0 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: no-exceptions
 // <forward_list>
 
-// void push_front(const value_type& x);
+// void push_front(const value_type& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp
index d3156c5fdd38..b30ff7a0189e 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// void push_front(value_type&& v);
+// void push_front(value_type&& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef std::forward_list<T> C;
@@ -45,5 +45,14 @@ int main(int, char**) {
     assert(std::distance(c.begin(), c.end()) == 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp
index 2dacf458d7d9..f80886113bf2 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void resize(size_type n);
+// void resize(size_type n); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,8 +18,8 @@
 #include "DefaultOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
-  {
+TEST_CONSTEXPR_CXX26 bool test() {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     typedef DefaultOnly T;
     typedef std::forward_list<T> C;
     C c;
@@ -65,7 +65,7 @@ int main(int, char**) {
     assert(*std::next(c.begin(), 5) == 0);
   }
 #if TEST_STD_VER >= 11
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     typedef DefaultOnly T;
     typedef std::forward_list<T, min_allocator<T>> C;
     C c;
@@ -112,5 +112,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp
index a6af763e6937..4ec859b36336 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void resize(size_type n, const value_type& v);
+// void resize(size_type n, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -22,7 +22,7 @@
 #  include "container_test_types.h"
 #endif
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -84,7 +84,7 @@ int main(int, char**) {
     assert(*std::next(c.begin(), 4) == 10);
     assert(*std::next(c.begin(), 5) == 10);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // Test that the allocator's construct method is being used to
     // construct the new elements and that it's called exactly N times.
     typedef std::forward_list<int, ContainerTestAllocator<int, int>> Container;
@@ -99,5 +99,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp
index 9a162789569d..d8e80c56bf39 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void merge(forward_list& x);
+// void merge(forward_list& x); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -30,11 +30,11 @@ struct value {
   int a;
   int b;
 
-  friend bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; }
-  friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
+  friend TEST_CONSTEXPR bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; }
+  friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   { // Basic merge operation.
     typedef int T;
     typedef std::forward_list<T> C;
@@ -116,5 +116,14 @@ int main(int, char**) {
     assert(c == std::forward_list<int>(std::begin(a), std::end(a)));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp
index 4e1814044808..0adadb2dd092 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// template <class Compare> void merge(forward_list& x, Compare comp);
+// template <class Compare> void merge(forward_list& x, Compare comp); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -30,11 +30,11 @@ struct value {
   int a;
   int b;
 
-  friend bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; }
-  friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
+  friend TEST_CONSTEXPR bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; }
+  friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   { // Basic merge operation.
     typedef int T;
     typedef std::forward_list<T> C;
@@ -117,5 +117,14 @@ int main(int, char**) {
     assert(c == std::forward_list<int>(std::begin(a), std::end(a)));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp
index acfa014fe254..906748ec2702 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// void merge(forward_list&& x);
+// void merge(forward_list&& x); // constexpr since C++26
 
 #include <forward_list>
 #include <functional>
@@ -29,11 +29,11 @@ struct value {
   int a;
   int b;
 
-  friend bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; }
-  friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
+  friend TEST_CONSTEXPR bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; }
+  friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   { // Basic merge operation.
     typedef int T;
     typedef std::forward_list<T> C;
@@ -109,5 +109,14 @@ int main(int, char**) {
     assert(c == std::forward_list<int>(std::begin(a), std::end(a)));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp
index 41b56ce7a288..2ced0b1596e4 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// template <class Compare> void merge(forward_list&& x, Compare comp);
+// template <class Compare> void merge(forward_list&& x, Compare comp); // constexpr since C++26
 
 #include <forward_list>
 #include <functional>
@@ -29,11 +29,11 @@ struct value {
   int a;
   int b;
 
-  friend bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; }
-  friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
+  friend TEST_CONSTEXPR bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; }
+  friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   { // Basic merge operation.
     typedef int T;
     typedef std::forward_list<T> C;
@@ -110,5 +110,14 @@ int main(int, char**) {
     assert(c == std::forward_list<int>(std::begin(a), std::end(a)));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp
index ec3bf845dcc5..b17708ba60ee 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // void remove(const value_type& v);      // C++17 and before
-// size_type remove(const value_type& v); // C++20 and after
+// size_type remove(const value_type& v); // C++20 and after; // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -19,7 +19,7 @@
 #include "min_allocator.h"
 
 template <class L>
-void do_remove(L& l, const typename L::value_type& value, typename L::size_type expected) {
+TEST_CONSTEXPR_CXX26 void do_remove(L& l, const typename L::value_type& value, typename L::size_type expected) {
   typename L::size_type old_size = std::distance(l.begin(), l.end());
 #if TEST_STD_VER > 17
   ASSERT_SAME_TYPE(decltype(l.remove(value)), typename L::size_type);
@@ -32,22 +32,22 @@ void do_remove(L& l, const typename L::value_type& value, typename L::size_type
 }
 
 struct S {
-  S(int i) : i_(new int(i)) {}
-  S(const S& rhs) : i_(new int(*rhs.i_)) {}
-  S& operator=(const S& rhs) {
+  TEST_CONSTEXPR_CXX20 S(int i) : i_(new int(i)) {}
+  TEST_CONSTEXPR_CXX20 S(const S& rhs) : i_(new int(*rhs.i_)) {}
+  TEST_CONSTEXPR_CXX20 S& operator=(const S& rhs) {
     *i_ = *rhs.i_;
     return *this;
   }
-  ~S() {
+  TEST_CONSTEXPR_CXX20 ~S() {
     delete i_;
     i_ = NULL;
   }
-  bool operator==(const S& rhs) const { return *i_ == *rhs.i_; }
-  int get() const { return *i_; }
+  TEST_CONSTEXPR bool operator==(const S& rhs) const { return *i_ == *rhs.i_; }
+  TEST_CONSTEXPR int get() const { return *i_; }
   int* i_;
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -171,5 +171,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp
index c6325baea259..f26205d03f64 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class Predicate> void      remove_if(Predicate pred); // C++17 and before
-// template <class Predicate> size_type remove_if(Predicate pred); // C++20 and after
+// template <class Predicate> size_type remove_if(Predicate pred); // C++20 and after; constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -22,7 +22,7 @@
 #include "counting_predicates.h"
 
 template <class L, class Predicate>
-void do_remove_if(L& l, Predicate pred, typename L::size_type expected) {
+TEST_CONSTEXPR_CXX26 void do_remove_if(L& l, Predicate pred, typename L::size_type expected) {
   typename L::size_type old_size = std::distance(l.begin(), l.end());
 #if TEST_STD_VER > 17
   ASSERT_SAME_TYPE(decltype(l.remove_if(pred)), typename L::size_type);
@@ -34,18 +34,18 @@ void do_remove_if(L& l, Predicate pred, typename L::size_type expected) {
   assert(old_size - std::distance(l.begin(), l.end()) == expected);
 }
 
-bool g(int i) { return i < 3; }
+TEST_CONSTEXPR bool g(int i) { return i < 3; }
 
 struct PredLWG526 {
-  PredLWG526(int i) : i_(i) {}
-  ~PredLWG526() { i_ = -32767; }
-  bool operator()(const PredLWG526& p) const { return p.i_ == i_; }
+  TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {}
+  TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; }
+  TEST_CONSTEXPR bool operator()(const PredLWG526& p) const { return p.i_ == i_; }
 
-  bool operator==(int i) const { return i == i_; }
+  TEST_CONSTEXPR bool operator==(int i) const { return i == i_; }
   int i_;
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef unary_counting_predicate<bool (*)(T), T> Predicate;
@@ -187,5 +187,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp
index 0d0656897f34..38f0e74f6632 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void reverse();
+// void reverse(); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -19,7 +19,7 @@
 #include "min_allocator.h"
 
 template <class C>
-void test(int N) {
+TEST_CONSTEXPR_CXX26 void test1(int N) {
   C c;
   for (int i = 0; i < N; ++i)
     c.push_front(i);
@@ -30,12 +30,21 @@ void test(int N) {
     assert(*j == i);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   for (int i = 0; i < 10; ++i)
-    test<std::forward_list<int> >(i);
+    test1<std::forward_list<int> >(i);
 #if TEST_STD_VER >= 11
   for (int i = 0; i < 10; ++i)
-    test<std::forward_list<int, min_allocator<int>> >(i);
+    test1<std::forward_list<int, min_allocator<int>> >(i);
+#endif
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
 #endif
 
   return 0;
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp
index 4c91d7397adf..f8787d70784d 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void splice_after(const_iterator p, forward_list&& x);
+// void splice_after(const_iterator p, forward_list&& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,13 +19,13 @@
 #include "min_allocator.h"
 
 typedef int T;
-const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
-const T t2[]                 = {10, 11, 12, 13, 14, 15};
-const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
-const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
+TEST_CONSTEXPR const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
+TEST_CONSTEXPR const T t2[]                 = {10, 11, 12, 13, 14, 15};
+TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
+TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
 
 template <class C>
-void testd(const C& c, int p, int l) {
+TEST_CONSTEXPR_CXX26 void testd(const C& c, int p, int l) {
   typename C::const_iterator i = c.begin();
   int n1                       = 0;
   for (; n1 < p; ++n1, ++i)
@@ -37,7 +37,7 @@ void testd(const C& c, int p, int l) {
   assert(std::distance(c.begin(), c.end()) == size_t1 + l);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     // splicing different containers
     typedef std::forward_list<T> C;
@@ -67,5 +67,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp
index bb8bdea63254..7202b0e15362 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void splice_after(const_iterator p, forward_list&& x, const_iterator i);
+// void splice_after(const_iterator p, forward_list&& x, const_iterator i); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,13 +19,13 @@
 #include "min_allocator.h"
 
 typedef int T;
-const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
-const T t2[]                 = {10, 11, 12};
-const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
-const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
+TEST_CONSTEXPR const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
+TEST_CONSTEXPR const T t2[]                 = {10, 11, 12};
+TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
+TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
 
 template <class C>
-void testd(const C& c, int p, int f) {
+TEST_CONSTEXPR_CXX26 void testd(const C& c, int p, int f) {
   typename C::const_iterator i = c.begin();
   int n1                       = 0;
   for (; n1 < p; ++n1, ++i)
@@ -38,7 +38,7 @@ void testd(const C& c, int p, int f) {
 }
 
 template <class C>
-void tests(const C& c, int p, int f) {
+TEST_CONSTEXPR_CXX26 void tests(const C& c, int p, int f) {
   typename C::const_iterator i = c.begin();
   int n                        = 0;
   if (p == f || p == f + 1) {
@@ -67,7 +67,7 @@ void tests(const C& c, int p, int f) {
   assert(std::distance(c.begin(), c.end()) == size_t1);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     // splicing different containers
     typedef std::forward_list<T> C;
@@ -117,5 +117,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp
index 99b3ed1c7836..18da6f12b28d 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp
@@ -8,8 +8,10 @@
 
 // <forward_list>
 
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=3000000
+
 // void splice_after(const_iterator p, forward_list&& x,
-//                   const_iterator first, const_iterator last);
+//                   const_iterator first, const_iterator last); // constexpr since C++26
 
 #include <stddef.h>
 #include <forward_list>
@@ -20,13 +22,13 @@
 #include "min_allocator.h"
 
 typedef std::ptrdiff_t T;
-const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
-const T t2[]                 = {10, 11, 12, 13, 14, 15};
-const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
-const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
+TEST_CONSTEXPR const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
+TEST_CONSTEXPR const T t2[]                 = {10, 11, 12, 13, 14, 15};
+TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
+TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
 
 template <class C>
-void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
+TEST_CONSTEXPR_CXX26 void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
   typename C::const_iterator i = c.begin();
   std::ptrdiff_t n1            = 0;
   for (; n1 < p; ++n1, ++i)
@@ -39,7 +41,7 @@ void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
 }
 
 template <class C>
-void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
+TEST_CONSTEXPR_CXX26 void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
   typename C::const_iterator i = c.begin();
   std::ptrdiff_t n             = 0;
   std::ptrdiff_t d             = l > f + 1 ? l - 1 - f : 0;
@@ -69,7 +71,7 @@ void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
   assert(std::distance(c.begin(), c.end()) == size_t1);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     // splicing different containers
     typedef std::forward_list<T> C;
@@ -157,5 +159,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp
index ebd1a79cdb4b..28efff3849e6 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // void unique();      // C++17 and before
-// size_type unique(); // C++20 and after
+// size_type unique(); // C++20 and after; constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -19,7 +19,7 @@
 #include "min_allocator.h"
 
 template <class L>
-void do_unique(L& l, typename L::size_type expected) {
+TEST_CONSTEXPR_CXX26 void do_unique(L& l, typename L::size_type expected) {
   typename L::size_type old_size = std::distance(l.begin(), l.end());
 #if TEST_STD_VER > 17
   ASSERT_SAME_TYPE(decltype(l.unique()), typename L::size_type);
@@ -31,7 +31,7 @@ void do_unique(L& l, typename L::size_type expected) {
   assert(old_size - std::distance(l.begin(), l.end()) == expected);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -131,5 +131,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp
index 408cbf6ae9c2..f07142dffe9d 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class BinaryPredicate> void unique(BinaryPredicate binary_pred);      // C++17 and before
-// template <class BinaryPredicate> size_type unique(BinaryPredicate binary_pred); // C++20 and after
+// template <class BinaryPredicate> size_type unique(BinaryPredicate binary_pred); // C++20 and after; constexpr since C++26
 
 #include <cassert>
 #include <forward_list>
@@ -20,7 +20,7 @@
 #include "min_allocator.h"
 
 template <class L, class Predicate>
-void do_unique(L& l, Predicate pred, typename L::size_type expected) {
+TEST_CONSTEXPR_CXX26 void do_unique(L& l, Predicate pred, typename L::size_type expected) {
   typename L::size_type old_size = std::distance(l.begin(), l.end());
 #if TEST_STD_VER > 17
   ASSERT_SAME_TYPE(decltype(l.unique(pred)), typename L::size_type);
@@ -33,17 +33,17 @@ void do_unique(L& l, Predicate pred, typename L::size_type expected) {
 }
 
 struct PredLWG526 {
-  PredLWG526(int i) : i_(i) {}
-  ~PredLWG526() { i_ = -32767; }
-  bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; }
+  TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {}
+  TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; }
+  TEST_CONSTEXPR bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; }
 
-  bool operator==(int i) const { return i == i_; }
+  TEST_CONSTEXPR bool operator==(int i) const { return i == i_; }
   int i_;
 };
 
-bool g(int x, int y) { return x == y; }
+TEST_CONSTEXPR bool g(int x, int y) { return x == y; }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -157,5 +157,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp
index ef6b72ee360a..cb57b094a077 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp
@@ -10,11 +10,11 @@
 
 // template <class T, class Allocator>
 //     bool operator==(const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 //
 // template <class T, class Allocator>
 //     bool operator!=(const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -25,7 +25,7 @@
 #include "min_allocator.h"
 
 template <class C>
-void test(int N, int M) {
+TEST_CONSTEXPR_CXX26 void test(int N, int M) {
   C c1;
   for (int i = 0; i < N; ++i)
     c1.push_front(i);
@@ -44,7 +44,7 @@ void test(int N, int M) {
   }
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   for (int i = 0; i < 10; ++i)
     for (int j = 0; j < 10; ++j)
       test<std::forward_list<int> >(i, j);
@@ -54,5 +54,14 @@ int main(int, char**) {
       test<std::forward_list<int, min_allocator<int>> >(i, j);
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp
index e50f9e6e9e47..f4f7c6d1f7e5 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void swap(forward_list& x);
+// void swap(forward_list& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<T> A;
@@ -257,5 +257,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp
index cae6950436de..ce2547978154 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class T, class Allocator>
-//     void swap(forward_list<T, Allocator>& x, forward_list<T, Allocator>& y);
+//     void swap(forward_list<T, Allocator>& x, forward_list<T, Allocator>& y); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<T> A;
@@ -258,5 +258,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp
index d16acadaeb89..7bf80ca026e8 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp
@@ -10,19 +10,19 @@
 
 // template <class T, class Allocator>
 //     bool operator< (const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 //
 // template <class T, class Allocator>
 //     bool operator> (const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 //
 // template <class T, class Allocator>
 //     bool operator>=(const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 //
 // template <class T, class Allocator>
 //     bool operator<=(const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -33,7 +33,7 @@
 #include "min_allocator.h"
 
 template <class C>
-void test(int N, int M) {
+TEST_CONSTEXPR_CXX26 void test(int N, int M) {
   C c1;
   for (int i = 0; i < N; ++i)
     c1.push_front(i);
@@ -50,7 +50,7 @@ void test(int N, int M) {
     assert(c1 > c2);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   for (int i = 0; i < 10; ++i)
     for (int j = 0; j < 10; ++j)
       test<std::forward_list<int> >(i, j);
@@ -60,5 +60,14 @@ int main(int, char**) {
       test<std::forward_list<int, min_allocator<int>> >(i, j);
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp
index b50e67589471..02b7b471a1ae 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp
@@ -12,10 +12,10 @@
 
 // void swap(forward_list& c)
 //     noexcept(!allocator_type::propagate_on_container_swap::value ||
-//              __is_nothrow_swappable<allocator_type>::value);
+//              __is_nothrow_swappable<allocator_type>::value);          // constexpr since C++26
 //
 //  In C++17, the standard says that swap shall have:
-//     noexcept(is_always_equal<allocator_type>::value);
+//     noexcept(is_always_equal<allocator_type>::value);                 // constexpr since C++26
 
 // This tests a conforming extension
 
diff --git a/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp
index f37f5c2f513b..624eeb17799c 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp
@@ -10,7 +10,7 @@
 
 // class forward_list
 
-// allocator_type get_allocator() const
+// allocator_type get_allocator() const // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::allocator<int> alloc;
     const std::forward_list<int> fl(alloc);
@@ -30,5 +30,14 @@ int main(int, char**) {
     assert(fl.get_allocator() == alloc);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp
index b7be03f1062d..16c6f0b90f96 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp
@@ -8,9 +8,9 @@
 
 // <forward_list>
 
-// forward_list()
-// forward_list::iterator()
-// forward_list::const_iterator()
+// forward_list()                 // constexpr since C++26
+// forward_list::iterator()       // constexpr since C++26
+// forward_list::const_iterator() // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -33,7 +33,7 @@ struct B {
 };
 #endif
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     A a;
     assert(a.d.empty());
@@ -49,5 +49,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp
index 5ba0d61f104e..aab53351f00e 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// size_type max_size() const;
+// size_type max_size() const; // constexpr since C++26
 
 #include <cassert>
 #include <forward_list>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef limited_allocator<int, 10> A;
     typedef std::forward_list<int, A> C;
@@ -42,5 +42,14 @@ int main(int, char**) {
     assert(c.max_size() <= alloc_max_size(c.get_allocator()));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp
index 31b3e900aabc..05f903dccafe 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp
@@ -24,6 +24,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -54,6 +58,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -87,6 +95,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++17"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -126,6 +138,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -171,6 +187,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++23"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should be defined in c++23"
 #  endif
@@ -219,6 +239,13 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++26"
 #  endif
 
+#  ifndef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_forward_list != 202502L
+#    error "__cpp_lib_constexpr_forward_list should have the value 202502L in c++26"
+#  endif
+
 #  ifndef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should be defined in c++26"
 #  endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index b1cc4afd3069..a13edacd1e46 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -196,6 +196,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should not be defined before c++20"
 #  endif
@@ -1084,6 +1088,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should not be defined before c++20"
 #  endif
@@ -2074,6 +2082,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should not be defined before c++20"
 #  endif
@@ -3304,6 +3316,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should be defined in c++20"
 #  endif
@@ -4756,6 +4772,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++23"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should be defined in c++23"
 #  endif
@@ -6427,6 +6447,13 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++26"
 #  endif
 
+#  ifndef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_forward_list != 202502L
+#    error "__cpp_lib_constexpr_forward_list should have the value 202502L in c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should be defined in c++26"
 #  endif
diff --git a/libcxx/test/support/counting_predicates.h b/libcxx/test/support/counting_predicates.h
index 6f34ce76302a..8fb2db1af70d 100644
--- a/libcxx/test/support/counting_predicates.h
+++ b/libcxx/test/support/counting_predicates.h
@@ -16,42 +16,44 @@
 template <typename Predicate, typename Arg>
 struct unary_counting_predicate {
 public:
-    typedef Arg argument_type;
-    typedef bool result_type;
+  typedef Arg argument_type;
+  typedef bool result_type;
 
-    unary_counting_predicate(Predicate p) : p_(p), count_(0) {}
-    unary_counting_predicate(const unary_counting_predicate&) = default;
-    unary_counting_predicate& operator=(const unary_counting_predicate&) = default;
-    ~unary_counting_predicate() {}
+  TEST_CONSTEXPR_CXX20 unary_counting_predicate(Predicate p) : p_(p), count_(0) {}
+  unary_counting_predicate(const unary_counting_predicate&)            = default;
+  unary_counting_predicate& operator=(const unary_counting_predicate&) = default;
+  TEST_CONSTEXPR_CXX20 ~unary_counting_predicate() {}
 
-    bool operator () (const Arg &a) const { ++count_; return p_(a); }
-    std::size_t count() const { return count_; }
-    void reset() { count_ = 0; }
+  TEST_CONSTEXPR_CXX14 bool operator()(const Arg& a) const {
+    ++count_;
+    return p_(a);
+  }
+  TEST_CONSTEXPR std::size_t count() const { return count_; }
+  TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; }
 
 private:
-    Predicate p_;
-    mutable std::size_t count_;
+  Predicate p_;
+  mutable std::size_t count_;
 };
 
-
-template <typename Predicate, typename Arg1, typename Arg2=Arg1>
+template <typename Predicate, typename Arg1, typename Arg2 = Arg1>
 struct binary_counting_predicate {
 public:
-    typedef Arg1 first_argument_type;
-    typedef Arg2 second_argument_type;
-    typedef bool result_type;
+  typedef Arg1 first_argument_type;
+  typedef Arg2 second_argument_type;
+  typedef bool result_type;
 
-    TEST_CONSTEXPR binary_counting_predicate(Predicate p) : p_(p), count_(0) {}
-    TEST_CONSTEXPR_CXX14 bool operator()(const Arg1& a1, const Arg2& a2) const {
-      ++count_;
-      return p_(a1, a2);
-    }
-    TEST_CONSTEXPR std::size_t count() const { return count_; }
-    TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; }
+  TEST_CONSTEXPR binary_counting_predicate(Predicate p) : p_(p), count_(0) {}
+  TEST_CONSTEXPR_CXX14 bool operator()(const Arg1& a1, const Arg2& a2) const {
+    ++count_;
+    return p_(a1, a2);
+  }
+  TEST_CONSTEXPR std::size_t count() const { return count_; }
+  TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; }
 
-  private:
-    Predicate p_;
-    mutable std::size_t count_;
+private:
+  Predicate p_;
+  mutable std::size_t count_;
 };
 
 #if TEST_STD_VER > 14
@@ -66,13 +68,13 @@ public:
   constexpr counting_predicate(Predicate pred, int& count) : pred_(std::move(pred)), count_(&count) {}
 
   template <class... Args>
-  constexpr decltype(auto) operator()(Args&& ...args) {
+  constexpr decltype(auto) operator()(Args&&... args) {
     ++(*count_);
     return pred_(std::forward<Args>(args)...);
   }
 
   template <class... Args>
-  constexpr decltype(auto) operator()(Args&& ...args) const {
+  constexpr decltype(auto) operator()(Args&&... args) const {
     ++(*count_);
     return pred_(std::forward<Args>(args)...);
   }
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
old mode 100755
new mode 100644
index 82f0d09db5c3..b59c7fdaf0a3
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -357,6 +357,11 @@ feature_test_macros = [
             "values": {"c++20": 201907},
             "headers": ["memory"],
         },
+        {
+            "name": "__cpp_lib_constexpr_forward_list",
+            "values": {"c++26": 202502},
+            "headers": ["forward_list"],
+        },
         {
             "name": "__cpp_lib_constexpr_functional",
             "values": {"c++20": 201907},

From 5188bea9afac859fa6523e07d98748527c295aaf Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Wed, 11 Jun 2025 09:18:55 -0700
Subject: [PATCH 0005/1322] [llvm] annotate interfaces in llvm/TargetParser for
 DLL export (#143616)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/TargetParser`
library. These annotations currently have no meaningful impact on the
LLVM build; however, they are a prerequisite to support an LLVM Windows
DLL (shared library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

Most of these changes were generated automatically using the [Interface
Definition Scanner (IDS)](https://github.com/compnerd/ids) tool,
followed formatting with `git clang-format`.

Additionally, I manually removed the redundant declaration of
`getCanonicalArchName` from
llvm/include/llvm/TargetParser/ARMTargetParser.h because IDS only
auto-annotates the first declaration it encounters, and the second
un-annotated declaration results in an MSVC warning.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 .../llvm/TargetParser/AArch64TargetParser.h   |  60 +++++----
 .../llvm/TargetParser/ARMTargetParser.h       |  71 +++++-----
 .../llvm/TargetParser/ARMTargetParserCommon.h |  13 +-
 .../llvm/TargetParser/CSKYTargetParser.h      |  30 +++--
 llvm/include/llvm/TargetParser/Host.h         |  25 ++--
 .../llvm/TargetParser/LoongArchTargetParser.h |  13 +-
 .../llvm/TargetParser/PPCTargetParser.h       |  15 ++-
 llvm/include/llvm/TargetParser/RISCVISAInfo.h |  42 +++---
 .../llvm/TargetParser/RISCVTargetParser.h     |  42 +++---
 .../llvm/TargetParser/SubtargetFeature.h      |  17 +--
 llvm/include/llvm/TargetParser/TargetParser.h |  29 +++--
 llvm/include/llvm/TargetParser/Triple.h       | 121 +++++++++---------
 .../llvm/TargetParser/X86TargetParser.h       |  33 ++---
 13 files changed, 271 insertions(+), 240 deletions(-)

diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 0338770593bc..59e8117ccb73 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/VersionTuple.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
@@ -79,7 +80,7 @@ struct FMVInfo {
       : Name(Name), FeatureBit(FeatureBit), PriorityBit(PriorityBit), ID(ID) {};
 };
 
-const std::vector<FMVInfo> &getFMVInfo();
+LLVM_ABI const std::vector<FMVInfo> &getFMVInfo();
 
 // Represents a dependency between two architecture extensions. Later is the
 // feature which was added to the architecture after Earlier, and expands the
@@ -146,7 +147,7 @@ struct ArchInfo {
   StringRef getSubArch() const { return ArchFeature.substr(1); }
 
   // Search for ArchInfo by SubArch name
-  static std::optional<ArchInfo> findBySubArch(StringRef SubArch);
+  LLVM_ABI static std::optional<ArchInfo> findBySubArch(StringRef SubArch);
 };
 
 #define EMIT_ARCHITECTURES
@@ -182,34 +183,36 @@ struct ExtensionSet {
   // Enable the given architecture extension, and any other extensions it
   // depends on. Does not change the base architecture, or follow dependencies
   // between features which are only related by required arcitecture versions.
-  void enable(ArchExtKind E);
+  LLVM_ABI void enable(ArchExtKind E);
 
   // Disable the given architecture extension, and any other extensions which
   // depend on it. Does not change the base architecture, or follow
   // dependencies between features which are only related by required
   // arcitecture versions.
-  void disable(ArchExtKind E);
+  LLVM_ABI void disable(ArchExtKind E);
 
   // Add default extensions for the given CPU. Records the base architecture,
   // to later resolve dependencies which depend on it.
-  void addCPUDefaults(const CpuInfo &CPU);
+  LLVM_ABI void addCPUDefaults(const CpuInfo &CPU);
 
   // Add default extensions for the given architecture version. Records the
   // base architecture, to later resolve dependencies which depend on it.
-  void addArchDefaults(const ArchInfo &Arch);
+  LLVM_ABI void addArchDefaults(const ArchInfo &Arch);
 
   // Add or remove a feature based on a modifier string. The string must be of
   // the form "<name>" to enable a feature or "no<name>" to disable it. This
   // will also enable or disable any features as required by the dependencies
   // between them.
-  bool parseModifier(StringRef Modifier, const bool AllowNoDashForm = false);
+  LLVM_ABI bool parseModifier(StringRef Modifier,
+                              const bool AllowNoDashForm = false);
 
   // Constructs a new ExtensionSet by toggling the corresponding bits for every
   // feature in the \p Features list without expanding their dependencies. Used
   // for reconstructing an ExtensionSet from the output of toLLVMFeatures().
   // Features that are not recognized are pushed back to \p NonExtensions.
-  void reconstructFromParsedFeatures(const std::vector<std::string> &Features,
-                                     std::vector<std::string> &NonExtensions);
+  LLVM_ABI void
+  reconstructFromParsedFeatures(const std::vector<std::string> &Features,
+                                std::vector<std::string> &NonExtensions);
 
   // Convert the set of enabled extension to an LLVM feature list, appending
   // them to Features.
@@ -227,7 +230,7 @@ struct ExtensionSet {
     }
   }
 
-  void dump() const;
+  LLVM_ABI void dump() const;
 };
 
 // Name alias.
@@ -239,52 +242,53 @@ struct Alias {
 #define EMIT_CPU_ALIAS
 #include "llvm/TargetParser/AArch64TargetParserDef.inc"
 
-const ExtensionInfo &getExtensionByID(ArchExtKind(ExtID));
+LLVM_ABI const ExtensionInfo &getExtensionByID(ArchExtKind(ExtID));
 
-bool getExtensionFeatures(
-    const AArch64::ExtensionBitset &Extensions,
-    std::vector<StringRef> &Features);
+LLVM_ABI bool getExtensionFeatures(const AArch64::ExtensionBitset &Extensions,
+                                   std::vector<StringRef> &Features);
 
-StringRef getArchExtFeature(StringRef ArchExt);
-StringRef resolveCPUAlias(StringRef CPU);
+LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt);
+LLVM_ABI StringRef resolveCPUAlias(StringRef CPU);
 
 // Information by Name
-const ArchInfo *getArchForCpu(StringRef CPU);
+LLVM_ABI const ArchInfo *getArchForCpu(StringRef CPU);
 
 // Parser
-const ArchInfo *parseArch(StringRef Arch);
+LLVM_ABI const ArchInfo *parseArch(StringRef Arch);
 
 // Return the extension which has the given -target-feature name.
-std::optional<ExtensionInfo> targetFeatureToExtension(StringRef TargetFeature);
+LLVM_ABI std::optional<ExtensionInfo>
+targetFeatureToExtension(StringRef TargetFeature);
 
 // Parse a name as defined by the Extension class in tablegen.
-std::optional<ExtensionInfo> parseArchExtension(StringRef Extension);
+LLVM_ABI std::optional<ExtensionInfo> parseArchExtension(StringRef Extension);
 
 // Parse a name as defined by the FMVInfo class in tablegen.
-std::optional<FMVInfo> parseFMVExtension(StringRef Extension);
+LLVM_ABI std::optional<FMVInfo> parseFMVExtension(StringRef Extension);
 
 // Given the name of a CPU or alias, return the correponding CpuInfo.
-std::optional<CpuInfo> parseCpu(StringRef Name);
+LLVM_ABI std::optional<CpuInfo> parseCpu(StringRef Name);
 // Used by target parser tests
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
 
-bool isX18ReservedByDefault(const Triple &TT);
+LLVM_ABI bool isX18ReservedByDefault(const Triple &TT);
 
 // For a given set of feature names, which can be either target-features, or
 // fmv-features metadata, expand their dependencies and then return a bitmask
 // corresponding to the entries of AArch64::FeatPriorities.
-uint64_t getFMVPriority(ArrayRef<StringRef> Features);
+LLVM_ABI uint64_t getFMVPriority(ArrayRef<StringRef> Features);
 
 // For a given set of FMV feature names, expand their dependencies and then
 // return a bitmask corresponding to the entries of AArch64::CPUFeatures.
 // The values in CPUFeatures are not bitmasks themselves, they are sequential
 // (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether
 // a certain FMV feature is available on the host.
-uint64_t getCpuSupportsMask(ArrayRef<StringRef> Features);
+LLVM_ABI uint64_t getCpuSupportsMask(ArrayRef<StringRef> Features);
 
-void PrintSupportedExtensions();
+LLVM_ABI void PrintSupportedExtensions();
 
-void printEnabledExtensions(const std::set<StringRef> &EnabledFeatureNames);
+LLVM_ABI void
+printEnabledExtensions(const std::set<StringRef> &EnabledFeatureNames);
 
 } // namespace AArch64
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.h b/llvm/include/llvm/TargetParser/ARMTargetParser.h
index b2403f42f1b7..798c578ced93 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParser.h
+++ b/llvm/include/llvm/TargetParser/ARMTargetParser.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/ARMTargetParserCommon.h"
 #include <vector>
 
@@ -223,53 +224,55 @@ inline ArchKind &operator--(ArchKind &Kind) {
 }
 
 // Information by ID
-StringRef getFPUName(FPUKind FPUKind);
-FPUVersion getFPUVersion(FPUKind FPUKind);
-NeonSupportLevel getFPUNeonSupportLevel(FPUKind FPUKind);
-FPURestriction getFPURestriction(FPUKind FPUKind);
+LLVM_ABI StringRef getFPUName(FPUKind FPUKind);
+LLVM_ABI FPUVersion getFPUVersion(FPUKind FPUKind);
+LLVM_ABI NeonSupportLevel getFPUNeonSupportLevel(FPUKind FPUKind);
+LLVM_ABI FPURestriction getFPURestriction(FPUKind FPUKind);
 
-bool getFPUFeatures(FPUKind FPUKind, std::vector<StringRef> &Features);
-bool getHWDivFeatures(uint64_t HWDivKind, std::vector<StringRef> &Features);
-bool getExtensionFeatures(uint64_t Extensions,
-                          std::vector<StringRef> &Features);
+LLVM_ABI bool getFPUFeatures(FPUKind FPUKind, std::vector<StringRef> &Features);
+LLVM_ABI bool getHWDivFeatures(uint64_t HWDivKind,
+                               std::vector<StringRef> &Features);
+LLVM_ABI bool getExtensionFeatures(uint64_t Extensions,
+                                   std::vector<StringRef> &Features);
 
-StringRef getArchName(ArchKind AK);
-unsigned getArchAttr(ArchKind AK);
-StringRef getCPUAttr(ArchKind AK);
-StringRef getSubArch(ArchKind AK);
-StringRef getArchExtName(uint64_t ArchExtKind);
-StringRef getArchExtFeature(StringRef ArchExt);
-bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, StringRef ArchExt,
-                           std::vector<StringRef> &Features,
-                           FPUKind &ArgFPUKind);
-ArchKind convertV9toV8(ArchKind AK);
+LLVM_ABI StringRef getArchName(ArchKind AK);
+LLVM_ABI unsigned getArchAttr(ArchKind AK);
+LLVM_ABI StringRef getCPUAttr(ArchKind AK);
+LLVM_ABI StringRef getSubArch(ArchKind AK);
+LLVM_ABI StringRef getArchExtName(uint64_t ArchExtKind);
+LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt);
+LLVM_ABI bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK,
+                                    StringRef ArchExt,
+                                    std::vector<StringRef> &Features,
+                                    FPUKind &ArgFPUKind);
+LLVM_ABI ArchKind convertV9toV8(ArchKind AK);
 
 // Information by Name
-FPUKind getDefaultFPU(StringRef CPU, ArchKind AK);
-uint64_t getDefaultExtensions(StringRef CPU, ArchKind AK);
-StringRef getDefaultCPU(StringRef Arch);
-StringRef getCanonicalArchName(StringRef Arch);
-StringRef getFPUSynonym(StringRef FPU);
+LLVM_ABI FPUKind getDefaultFPU(StringRef CPU, ArchKind AK);
+LLVM_ABI uint64_t getDefaultExtensions(StringRef CPU, ArchKind AK);
+LLVM_ABI StringRef getDefaultCPU(StringRef Arch);
+LLVM_ABI StringRef getFPUSynonym(StringRef FPU);
 
 // Parser
-uint64_t parseHWDiv(StringRef HWDiv);
-FPUKind parseFPU(StringRef FPU);
-ArchKind parseArch(StringRef Arch);
-uint64_t parseArchExt(StringRef ArchExt);
-ArchKind parseCPUArch(StringRef CPU);
-ProfileKind parseArchProfile(StringRef Arch);
-unsigned parseArchVersion(StringRef Arch);
+LLVM_ABI uint64_t parseHWDiv(StringRef HWDiv);
+LLVM_ABI FPUKind parseFPU(StringRef FPU);
+LLVM_ABI ArchKind parseArch(StringRef Arch);
+LLVM_ABI uint64_t parseArchExt(StringRef ArchExt);
+LLVM_ABI ArchKind parseCPUArch(StringRef CPU);
+LLVM_ABI ProfileKind parseArchProfile(StringRef Arch);
+LLVM_ABI unsigned parseArchVersion(StringRef Arch);
 
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
-StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);
 
 /// Get the (LLVM) name of the minimum ARM CPU for the arch we are targeting.
 ///
 /// \param Arch the architecture name (e.g., "armv7s"). If it is an empty
 /// string then the triple's arch name is used.
-StringRef getARMCPUForArch(const llvm::Triple &Triple, StringRef MArch = {});
+LLVM_ABI StringRef getARMCPUForArch(const llvm::Triple &Triple,
+                                    StringRef MArch = {});
 
-void PrintSupportedExtensions(StringMap<StringRef> DescMap);
+LLVM_ABI void PrintSupportedExtensions(StringMap<StringRef> DescMap);
 
 } // namespace ARM
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
index f6115718e9f5..7c8030dd5576 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
+++ b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
@@ -14,6 +14,7 @@
 #define LLVM_TARGETPARSER_ARMTARGETPARSERCOMMON_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm {
 namespace ARM {
@@ -23,19 +24,19 @@ enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 };
 enum class EndianKind { INVALID = 0, LITTLE, BIG };
 
 /// Converts e.g. "armv8" -> "armv8-a"
-StringRef getArchSynonym(StringRef Arch);
+LLVM_ABI StringRef getArchSynonym(StringRef Arch);
 
 /// MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but
 /// (iwmmxt|xscale)(eb)? is also permitted. If the former, return
 /// "v.+", if the latter, return unmodified string, minus 'eb'.
 /// If invalid, return empty string.
-StringRef getCanonicalArchName(StringRef Arch);
+LLVM_ABI StringRef getCanonicalArchName(StringRef Arch);
 
 // ARM, Thumb, AArch64
-ISAKind parseArchISA(StringRef Arch);
+LLVM_ABI ISAKind parseArchISA(StringRef Arch);
 
 // Little/Big endian
-EndianKind parseArchEndian(StringRef Arch);
+LLVM_ABI EndianKind parseArchEndian(StringRef Arch);
 
 struct ParsedBranchProtection {
   StringRef Scope;
@@ -45,8 +46,8 @@ struct ParsedBranchProtection {
   bool GuardedControlStack;
 };
 
-bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
-                           StringRef &Err, bool EnablePAuthLR = false);
+LLVM_ABI bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
+                                    StringRef &Err, bool EnablePAuthLR = false);
 
 } // namespace ARM
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/CSKYTargetParser.h b/llvm/include/llvm/TargetParser/CSKYTargetParser.h
index 4c4ec06f758a..8eab03ca0149 100644
--- a/llvm/include/llvm/TargetParser/CSKYTargetParser.h
+++ b/llvm/include/llvm/TargetParser/CSKYTargetParser.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_TARGETPARSER_CSKYTARGETPARSER_H
 #define LLVM_TARGETPARSER_CSKYTARGETPARSER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
 #include <vector>
 
@@ -176,25 +177,26 @@ const ArchNames<CSKY::ArchKind> ARCHNames[] = {
 #include "llvm/TargetParser/CSKYTargetParser.def"
 };
 
-StringRef getArchName(ArchKind AK);
-StringRef getDefaultCPU(StringRef Arch);
-StringRef getArchExtName(uint64_t ArchExtKind);
-StringRef getArchExtFeature(StringRef ArchExt);
-uint64_t getDefaultExtensions(StringRef CPU);
-bool getExtensionFeatures(uint64_t Extensions,
-                          std::vector<StringRef> &Features);
+LLVM_ABI StringRef getArchName(ArchKind AK);
+LLVM_ABI StringRef getDefaultCPU(StringRef Arch);
+LLVM_ABI StringRef getArchExtName(uint64_t ArchExtKind);
+LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt);
+LLVM_ABI uint64_t getDefaultExtensions(StringRef CPU);
+LLVM_ABI bool getExtensionFeatures(uint64_t Extensions,
+                                   std::vector<StringRef> &Features);
 
 // Information by ID
-StringRef getFPUName(unsigned FPUKind);
-FPUVersion getFPUVersion(unsigned FPUKind);
+LLVM_ABI StringRef getFPUName(unsigned FPUKind);
+LLVM_ABI FPUVersion getFPUVersion(unsigned FPUKind);
 
-bool getFPUFeatures(CSKYFPUKind Kind, std::vector<StringRef> &Features);
+LLVM_ABI bool getFPUFeatures(CSKYFPUKind Kind,
+                             std::vector<StringRef> &Features);
 
 // Parser
-ArchKind parseArch(StringRef Arch);
-ArchKind parseCPUArch(StringRef CPU);
-uint64_t parseArchExt(StringRef ArchExt);
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI ArchKind parseArch(StringRef Arch);
+LLVM_ABI ArchKind parseCPUArch(StringRef CPU);
+LLVM_ABI uint64_t parseArchExt(StringRef ArchExt);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
 
 } // namespace CSKY
 
diff --git a/llvm/include/llvm/TargetParser/Host.h b/llvm/include/llvm/TargetParser/Host.h
index 443f4f583b55..be3d41e022ad 100644
--- a/llvm/include/llvm/TargetParser/Host.h
+++ b/llvm/include/llvm/TargetParser/Host.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TARGETPARSER_HOST_H
 #define LLVM_TARGETPARSER_HOST_H
 
+#include "llvm/Support/Compiler.h"
 #include <string>
 
 namespace llvm {
@@ -30,18 +31,18 @@ namespace sys {
 ///   CPU_TYPE-VENDOR-OPERATING_SYSTEM
 /// or
 ///   CPU_TYPE-VENDOR-KERNEL-OPERATING_SYSTEM
-std::string getDefaultTargetTriple();
+LLVM_ABI std::string getDefaultTargetTriple();
 
 /// getProcessTriple() - Return an appropriate target triple for generating
 /// code to be loaded into the current process, e.g. when using the JIT.
-std::string getProcessTriple();
+LLVM_ABI std::string getProcessTriple();
 
 /// getHostCPUName - Get the LLVM name for the host CPU. The particular format
 /// of the name is target dependent, and suitable for passing as -mcpu to the
 /// target which matches the host.
 ///
 /// \return - The host CPU name, or empty if the CPU could not be determined.
-StringRef getHostCPUName();
+LLVM_ABI StringRef getHostCPUName();
 
 /// getHostCPUFeatures - Get the LLVM names for the host CPU features.
 /// The particular format of the names are target dependent, and suitable for
@@ -52,20 +53,20 @@ StringRef getHostCPUName();
 /// which features may appear in this map, except that they are all valid LLVM
 /// feature names. The map can be empty, for example if feature detection
 /// fails.
-const StringMap<bool, MallocAllocator> getHostCPUFeatures();
+LLVM_ABI const StringMap<bool, MallocAllocator> getHostCPUFeatures();
 
 /// This is a function compatible with cl::AddExtraVersionPrinter, which adds
 /// info about the current target triple and detected CPU.
-void printDefaultTargetAndDetectedCPU(raw_ostream &OS);
+LLVM_ABI void printDefaultTargetAndDetectedCPU(raw_ostream &OS);
 
 namespace detail {
 /// Helper functions to extract HostCPUName from /proc/cpuinfo on linux.
-StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForBPF();
+LLVM_ABI StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForBPF();
 
 /// Helper functions to extract CPU details from CPUID on x86.
 namespace x86 {
@@ -78,7 +79,7 @@ enum class VendorSignatures {
 /// Returns the host CPU's vendor.
 /// MaxLeaf: if a non-nullptr pointer is specified, the EAX value will be
 /// assigned to its pointee.
-VendorSignatures getVendorSignature(unsigned *MaxLeaf = nullptr);
+LLVM_ABI VendorSignatures getVendorSignature(unsigned *MaxLeaf = nullptr);
 } // namespace x86
 } // namespace detail
 } // namespace sys
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
index a28e4e9eff81..1357d7474459 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_TARGETPARSER_LOONGARCHTARGETPARSER_H
 #define LLVM_TARGETPARSER_LOONGARCHTARGETPARSER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
 #include <vector>
 
@@ -84,12 +85,12 @@ struct ArchInfo {
   uint32_t Features;
 };
 
-bool isValidArchName(StringRef Arch);
-bool isValidFeatureName(StringRef Feature);
-bool getArchFeatures(StringRef Arch, std::vector<StringRef> &Features);
-bool isValidCPUName(StringRef TuneCPU);
-void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
-StringRef getDefaultArch(bool Is64Bit);
+LLVM_ABI bool isValidArchName(StringRef Arch);
+LLVM_ABI bool isValidFeatureName(StringRef Feature);
+LLVM_ABI bool getArchFeatures(StringRef Arch, std::vector<StringRef> &Features);
+LLVM_ABI bool isValidCPUName(StringRef TuneCPU);
+LLVM_ABI void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI StringRef getDefaultArch(bool Is64Bit);
 
 } // namespace LoongArch
 
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h
index 5f9fe543aff0..59d9f867005a 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.h
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h
@@ -15,25 +15,28 @@
 #define LLVM_TARGETPARSER_PPCTARGETPARSER_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
 
 namespace llvm {
 namespace PPC {
-bool isValidCPU(StringRef CPU);
-void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
-void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI bool isValidCPU(StringRef CPU);
+LLVM_ABI void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values);
 
 // Get target CPU name.
 // If CPUName is empty or generic, return the default CPU name.
 // If CPUName is not empty or generic, return the normalized CPU name.
-StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName = "");
+LLVM_ABI StringRef getNormalizedPPCTargetCPU(const Triple &T,
+                                             StringRef CPUName = "");
 
 // Get the tune CPU name.
-StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName = "");
+LLVM_ABI StringRef getNormalizedPPCTuneCPU(const Triple &T,
+                                           StringRef CPUName = "");
 
 // For PPC, there are some cpu names for same CPU, like pwr10 and power10,
 // normalize them.
-StringRef normalizeCPUName(StringRef CPUName);
+LLVM_ABI StringRef normalizeCPUName(StringRef CPUName);
 } // namespace PPC
 } // namespace llvm
 
diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
index 5b2b6f29fd3d..0c308cadba79 100644
--- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h
+++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/RISCVISAUtils.h"
 
@@ -31,27 +32,27 @@ public:
   /// extensions with unrecognised versions will be silently dropped, except
   /// for the special case of the base 'i' and 'e' extensions, where the
   /// default version will be used (as ignoring the base is not possible).
-  static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+  LLVM_ABI static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
   parseArchString(StringRef Arch, bool EnableExperimentalExtension,
                   bool ExperimentalExtensionVersionCheck = true);
 
   /// Parse RISC-V ISA info from an arch string that is already in normalized
   /// form (as defined in the psABI). Unlike parseArchString, this function
   /// will not error for unrecognized extension names or extension versions.
-  static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+  LLVM_ABI static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
   parseNormalizedArchString(StringRef Arch);
 
   /// Parse RISC-V ISA info from feature vector.
-  static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+  LLVM_ABI static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
   parseFeatures(unsigned XLen, const std::vector<std::string> &Features);
 
-  static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+  LLVM_ABI static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
   createFromExtMap(unsigned XLen,
                    const RISCVISAUtils::OrderedExtensionMap &Exts);
 
   /// Convert RISC-V ISA info to a feature vector.
-  std::vector<std::string> toFeatures(bool AddAllExtensions = false,
-                                      bool IgnoreUnknown = true) const;
+  LLVM_ABI std::vector<std::string> toFeatures(bool AddAllExtensions = false,
+                                               bool IgnoreUnknown = true) const;
 
   const RISCVISAUtils::OrderedExtensionMap &getExtensions() const {
     return Exts;
@@ -64,25 +65,26 @@ public:
   unsigned getMaxELen() const { return MaxELen; }
   unsigned getMaxELenFp() const { return MaxELenFp; }
 
-  bool hasExtension(StringRef Ext) const;
-  std::string toString() const;
-  StringRef computeDefaultABI() const;
+  LLVM_ABI bool hasExtension(StringRef Ext) const;
+  LLVM_ABI std::string toString() const;
+  LLVM_ABI StringRef computeDefaultABI() const;
 
-  static bool isSupportedExtensionFeature(StringRef Ext);
-  static bool isSupportedExtension(StringRef Ext);
-  static bool isSupportedExtensionWithVersion(StringRef Ext);
-  static bool isSupportedExtension(StringRef Ext, unsigned MajorVersion,
-                                   unsigned MinorVersion);
-  static std::string getTargetFeatureForExtension(StringRef Ext);
+  LLVM_ABI static bool isSupportedExtensionFeature(StringRef Ext);
+  LLVM_ABI static bool isSupportedExtension(StringRef Ext);
+  LLVM_ABI static bool isSupportedExtensionWithVersion(StringRef Ext);
+  LLVM_ABI static bool isSupportedExtension(StringRef Ext,
+                                            unsigned MajorVersion,
+                                            unsigned MinorVersion);
+  LLVM_ABI static std::string getTargetFeatureForExtension(StringRef Ext);
 
-  static void printSupportedExtensions(StringMap<StringRef> &DescMap);
-  static void printEnabledExtensions(bool IsRV64,
-                                     std::set<StringRef> &EnabledFeatureNames,
-                                     StringMap<StringRef> &DescMap);
+  LLVM_ABI static void printSupportedExtensions(StringMap<StringRef> &DescMap);
+  LLVM_ABI static void
+  printEnabledExtensions(bool IsRV64, std::set<StringRef> &EnabledFeatureNames,
+                         StringMap<StringRef> &DescMap);
 
   /// Return the group id and bit position of __riscv_feature_bits.  Returns
   /// <-1, -1> if not supported.
-  static std::pair<int, int> getRISCVFeaturesBitsInfo(StringRef Ext);
+  LLVM_ABI static std::pair<int, int> getRISCVFeaturesBitsInfo(StringRef Ext);
 
   // The maximum value of the group ID obtained from getRISCVFeaturesBitsInfo.
   static constexpr unsigned FeatureBitSize = 2;
diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index a529479b546d..41fdab6012aa 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -15,6 +15,7 @@
 #define LLVM_TARGETPARSER_RISCVTARGETPARSER_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -43,18 +44,20 @@ struct CPUInfo {
 static constexpr unsigned RVVBitsPerBlock = 64;
 static constexpr unsigned RVVBytesPerBlock = RVVBitsPerBlock / 8;
 
-void getFeaturesForCPU(StringRef CPU,
-                       SmallVectorImpl<std::string> &EnabledFeatures,
-                       bool NeedPlus = false);
-bool parseCPU(StringRef CPU, bool IsRV64);
-bool parseTuneCPU(StringRef CPU, bool IsRV64);
-StringRef getMArchFromMcpu(StringRef CPU);
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values, bool IsRV64);
-void fillValidTuneCPUArchList(SmallVectorImpl<StringRef> &Values, bool IsRV64);
-bool hasFastScalarUnalignedAccess(StringRef CPU);
-bool hasFastVectorUnalignedAccess(StringRef CPU);
-bool hasValidCPUModel(StringRef CPU);
-CPUModel getCPUModel(StringRef CPU);
+LLVM_ABI void getFeaturesForCPU(StringRef CPU,
+                                SmallVectorImpl<std::string> &EnabledFeatures,
+                                bool NeedPlus = false);
+LLVM_ABI bool parseCPU(StringRef CPU, bool IsRV64);
+LLVM_ABI bool parseTuneCPU(StringRef CPU, bool IsRV64);
+LLVM_ABI StringRef getMArchFromMcpu(StringRef CPU);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values,
+                                   bool IsRV64);
+LLVM_ABI void fillValidTuneCPUArchList(SmallVectorImpl<StringRef> &Values,
+                                       bool IsRV64);
+LLVM_ABI bool hasFastScalarUnalignedAccess(StringRef CPU);
+LLVM_ABI bool hasFastVectorUnalignedAccess(StringRef CPU);
+LLVM_ABI bool hasValidCPUModel(StringRef CPU);
+LLVM_ABI CPUModel getCPUModel(StringRef CPU);
 
 } // namespace RISCV
 
@@ -86,10 +89,10 @@ inline static bool isValidLMUL(unsigned LMUL, bool Fractional) {
   return isPowerOf2_32(LMUL) && LMUL <= 8 && (!Fractional || LMUL != 1);
 }
 
-unsigned encodeVTYPE(VLMUL VLMUL, unsigned SEW, bool TailAgnostic,
-                     bool MaskAgnostic);
+LLVM_ABI unsigned encodeVTYPE(VLMUL VLMUL, unsigned SEW, bool TailAgnostic,
+                              bool MaskAgnostic);
 
-unsigned encodeXSfmmVType(unsigned SEW, unsigned Widen, bool AltFmt);
+LLVM_ABI unsigned encodeXSfmmVType(unsigned SEW, unsigned Widen, bool AltFmt);
 
 inline static VLMUL getVLMUL(unsigned VType) {
   unsigned VLMul = VType & 0x7;
@@ -97,7 +100,7 @@ inline static VLMUL getVLMUL(unsigned VType) {
 }
 
 // Decode VLMUL into 1,2,4,8 and fractional indicator.
-std::pair<unsigned, bool> decodeVLMUL(VLMUL VLMul);
+LLVM_ABI std::pair<unsigned, bool> decodeVLMUL(VLMUL VLMul);
 
 inline static VLMUL encodeLMUL(unsigned LMUL, bool Fractional) {
   assert(isValidLMUL(LMUL, Fractional) && "Unsupported LMUL");
@@ -148,11 +151,12 @@ inline static bool isMaskAgnostic(unsigned VType) { return VType & 0x80; }
 
 inline static bool isAltFmt(unsigned VType) { return VType & 0x100; }
 
-void printVType(unsigned VType, raw_ostream &OS);
+LLVM_ABI void printVType(unsigned VType, raw_ostream &OS);
 
-unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul);
+LLVM_ABI unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul);
 
-std::optional<VLMUL> getSameRatioLMUL(unsigned SEW, VLMUL VLMUL, unsigned EEW);
+LLVM_ABI std::optional<VLMUL> getSameRatioLMUL(unsigned SEW, VLMUL VLMUL,
+                                               unsigned EEW);
 } // namespace RISCVVType
 
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/SubtargetFeature.h b/llvm/include/llvm/TargetParser/SubtargetFeature.h
index 2e1f00dad2df..6f1723dec5d0 100644
--- a/llvm/include/llvm/TargetParser/SubtargetFeature.h
+++ b/llvm/include/llvm/TargetParser/SubtargetFeature.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include <array>
 #include <initializer_list>
@@ -175,27 +176,27 @@ class SubtargetFeatures {
   std::vector<std::string> Features;    ///< Subtarget features as a vector
 
 public:
-  explicit SubtargetFeatures(StringRef Initial = "");
+  LLVM_ABI explicit SubtargetFeatures(StringRef Initial = "");
 
   /// Returns features as a string.
-  std::string getString() const;
+  LLVM_ABI std::string getString() const;
 
   /// Adds Features.
-  void AddFeature(StringRef String, bool Enable = true);
+  LLVM_ABI void AddFeature(StringRef String, bool Enable = true);
 
-  void addFeaturesVector(const ArrayRef<std::string> OtherFeatures);
+  LLVM_ABI void addFeaturesVector(const ArrayRef<std::string> OtherFeatures);
 
   /// Returns the vector of individual subtarget features.
   const std::vector<std::string> &getFeatures() const { return Features; }
 
   /// Prints feature string.
-  void print(raw_ostream &OS) const;
+  LLVM_ABI void print(raw_ostream &OS) const;
 
   // Dumps feature info.
-  void dump() const;
+  LLVM_ABI void dump() const;
 
   /// Adds the default features for the specified target triple.
-  void getDefaultSubtargetFeatures(const Triple& Triple);
+  LLVM_ABI void getDefaultSubtargetFeatures(const Triple &Triple);
 
   /// Determine if a feature has a flag; '+' or '-'
   static bool hasFlag(StringRef Feature) {
@@ -221,7 +222,7 @@ public:
   }
 
   /// Splits a string of comma separated items in to a vector of strings.
-  static void Split(std::vector<std::string> &V, StringRef S);
+  LLVM_ABI static void Split(std::vector<std::string> &V, StringRef S);
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index f776b41f3d7c..176205e17ae0 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm {
 
@@ -164,27 +165,27 @@ enum FeatureError : uint32_t {
   UNSUPPORTED_TARGET_FEATURE
 };
 
-StringRef getArchFamilyNameAMDGCN(GPUKind AK);
+LLVM_ABI StringRef getArchFamilyNameAMDGCN(GPUKind AK);
 
-StringRef getArchNameAMDGCN(GPUKind AK);
-StringRef getArchNameR600(GPUKind AK);
-StringRef getCanonicalArchName(const Triple &T, StringRef Arch);
-GPUKind parseArchAMDGCN(StringRef CPU);
-GPUKind parseArchR600(StringRef CPU);
-unsigned getArchAttrAMDGCN(GPUKind AK);
-unsigned getArchAttrR600(GPUKind AK);
+LLVM_ABI StringRef getArchNameAMDGCN(GPUKind AK);
+LLVM_ABI StringRef getArchNameR600(GPUKind AK);
+LLVM_ABI StringRef getCanonicalArchName(const Triple &T, StringRef Arch);
+LLVM_ABI GPUKind parseArchAMDGCN(StringRef CPU);
+LLVM_ABI GPUKind parseArchR600(StringRef CPU);
+LLVM_ABI unsigned getArchAttrAMDGCN(GPUKind AK);
+LLVM_ABI unsigned getArchAttrR600(GPUKind AK);
 
-void fillValidArchListAMDGCN(SmallVectorImpl<StringRef> &Values);
-void fillValidArchListR600(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI void fillValidArchListAMDGCN(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI void fillValidArchListR600(SmallVectorImpl<StringRef> &Values);
 
-IsaVersion getIsaVersion(StringRef GPU);
+LLVM_ABI IsaVersion getIsaVersion(StringRef GPU);
 
 /// Fills Features map with default values for given target GPU
-void fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
-                          StringMap<bool> &Features);
+LLVM_ABI void fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
+                                   StringMap<bool> &Features);
 
 /// Inserts wave size feature for given GPU into features map
-std::pair<FeatureError, StringRef>
+LLVM_ABI std::pair<FeatureError, StringRef>
 insertWaveSizeFeature(StringRef GPU, const Triple &T,
                       StringMap<bool> &Features);
 
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index b56e6e18805e..b6f15ef13191 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -10,6 +10,7 @@
 #define LLVM_TARGETPARSER_TRIPLE_H
 
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/VersionTuple.h"
 
 // Some system headers or GCC predefined macros conflict with identifiers in
@@ -348,10 +349,11 @@ public:
   /// triple fields unknown.
   Triple() = default;
 
-  explicit Triple(const Twine &Str);
-  Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr);
-  Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr,
-         const Twine &EnvironmentStr);
+  LLVM_ABI explicit Triple(const Twine &Str);
+  LLVM_ABI Triple(const Twine &ArchStr, const Twine &VendorStr,
+                  const Twine &OSStr);
+  LLVM_ABI Triple(const Twine &ArchStr, const Twine &VendorStr,
+                  const Twine &OSStr, const Twine &EnvironmentStr);
 
   bool operator==(const Triple &Other) const {
     return Arch == Other.Arch && SubArch == Other.SubArch &&
@@ -381,8 +383,8 @@ public:
   /// reasonably be done).  In particular, it handles the common case in which
   /// otherwise valid components are in the wrong order. \p Form is used to
   /// specify the output canonical form.
-  static std::string normalize(StringRef Str,
-                               CanonicalForm Form = CanonicalForm::ANY);
+  LLVM_ABI static std::string
+  normalize(StringRef Str, CanonicalForm Form = CanonicalForm::ANY);
 
   /// Return the normalized form of this triple's string.
   std::string normalize(CanonicalForm Form = CanonicalForm::ANY) const {
@@ -417,7 +419,7 @@ public:
   /// triple, if present.
   ///
   /// For example, "fooos1.2.3" would return (1, 2, 3).
-  VersionTuple getEnvironmentVersion() const;
+  LLVM_ABI VersionTuple getEnvironmentVersion() const;
 
   /// Get the object format for this triple.
   ObjectFormatType getObjectFormat() const { return ObjectFormat; }
@@ -426,7 +428,7 @@ public:
   /// present.
   ///
   /// For example, "fooos1.2.3" would return (1, 2, 3).
-  VersionTuple getOSVersion() const;
+  LLVM_ABI VersionTuple getOSVersion() const;
 
   /// Return just the major version number, this is specialized because it is a
   /// common query.
@@ -436,26 +438,26 @@ public:
   /// "darwin" versions to the corresponding OS X versions.  This may also be
   /// called with IOS triples but the OS X version number is just set to a
   /// constant 10.4.0 in that case.  Returns true if successful.
-  bool getMacOSXVersion(VersionTuple &Version) const;
+  LLVM_ABI bool getMacOSXVersion(VersionTuple &Version) const;
 
   /// Parse the version number as with getOSVersion.  This should only be called
   /// with IOS or generic triples.
-  VersionTuple getiOSVersion() const;
+  LLVM_ABI VersionTuple getiOSVersion() const;
 
   /// Parse the version number as with getOSVersion.  This should only be called
   /// with WatchOS or generic triples.
-  VersionTuple getWatchOSVersion() const;
+  LLVM_ABI VersionTuple getWatchOSVersion() const;
 
   /// Parse the version number as with getOSVersion.
-  VersionTuple getDriverKitVersion() const;
+  LLVM_ABI VersionTuple getDriverKitVersion() const;
 
   /// Parse the Vulkan version number from the OSVersion and SPIR-V version
   /// (SubArch).  This should only be called with Vulkan SPIR-V triples.
-  VersionTuple getVulkanVersion() const;
+  LLVM_ABI VersionTuple getVulkanVersion() const;
 
   /// Parse the DXIL version number from the OSVersion and DXIL version
   /// (SubArch).  This should only be called with DXIL triples.
-  VersionTuple getDXILVersion() const;
+  LLVM_ABI VersionTuple getDXILVersion() const;
 
   /// @}
   /// @name Direct Component Access
@@ -469,34 +471,34 @@ public:
   bool empty() const { return Data.empty(); }
 
   /// Get the architecture (first) component of the triple.
-  StringRef getArchName() const;
+  LLVM_ABI StringRef getArchName() const;
 
   /// Get the vendor (second) component of the triple.
-  StringRef getVendorName() const;
+  LLVM_ABI StringRef getVendorName() const;
 
   /// Get the operating system (third) component of the triple.
-  StringRef getOSName() const;
+  LLVM_ABI StringRef getOSName() const;
 
   /// Get the optional environment (fourth) component of the triple, or "" if
   /// empty.
-  StringRef getEnvironmentName() const;
+  LLVM_ABI StringRef getEnvironmentName() const;
 
   /// Get the operating system and optional environment components as a single
   /// string (separated by a '-' if the environment component is present).
-  StringRef getOSAndEnvironmentName() const;
+  LLVM_ABI StringRef getOSAndEnvironmentName() const;
 
   /// Get the version component of the environment component as a single
   /// string (the version after the environment).
   ///
   /// For example, "fooos1.2.3" would return "1.2.3".
-  StringRef getEnvironmentVersionString() const;
+  LLVM_ABI StringRef getEnvironmentVersionString() const;
 
   /// @}
   /// @name Convenience Predicates
   /// @{
 
   /// Returns the pointer width of this architecture.
-  static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch);
+  LLVM_ABI static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch);
 
   /// Returns the pointer width of this architecture.
   unsigned getArchPointerBitWidth() const {
@@ -504,7 +506,7 @@ public:
   }
 
   /// Returns the trampoline size in bytes for this configuration.
-  unsigned getTrampolineSize() const;
+  LLVM_ABI unsigned getTrampolineSize() const;
 
   /// Test whether the architecture is 64-bit
   ///
@@ -513,17 +515,17 @@ public:
   /// 16-bit. The inner details of pointer width for particular architectures
   /// is not summed up in the triple, and so only a coarse grained predicate
   /// system is provided.
-  bool isArch64Bit() const;
+  LLVM_ABI bool isArch64Bit() const;
 
   /// Test whether the architecture is 32-bit
   ///
   /// Note that this tests for 32-bit pointer width, and nothing else.
-  bool isArch32Bit() const;
+  LLVM_ABI bool isArch32Bit() const;
 
   /// Test whether the architecture is 16-bit
   ///
   /// Note that this tests for 16-bit pointer width, and nothing else.
-  bool isArch16Bit() const;
+  LLVM_ABI bool isArch16Bit() const;
 
   /// Helper function for doing comparisons against version numbers included in
   /// the target triple.
@@ -544,8 +546,8 @@ public:
 
   /// Comparison function for checking OS X version compatibility, which handles
   /// supporting skewed version numbering schemes used by the "darwin" triples.
-  bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0,
-                         unsigned Micro = 0) const;
+  LLVM_ABI bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0,
+                                  unsigned Micro = 0) const;
 
   /// Is this a Mac OS X triple. For legacy reasons, we support both "darwin"
   /// and "osx" as OS X triples.
@@ -1171,38 +1173,38 @@ public:
   /// @{
 
   /// Set the architecture (first) component of the triple to a known type.
-  void setArch(ArchType Kind, SubArchType SubArch = NoSubArch);
+  LLVM_ABI void setArch(ArchType Kind, SubArchType SubArch = NoSubArch);
 
   /// Set the vendor (second) component of the triple to a known type.
-  void setVendor(VendorType Kind);
+  LLVM_ABI void setVendor(VendorType Kind);
 
   /// Set the operating system (third) component of the triple to a known type.
-  void setOS(OSType Kind);
+  LLVM_ABI void setOS(OSType Kind);
 
   /// Set the environment (fourth) component of the triple to a known type.
-  void setEnvironment(EnvironmentType Kind);
+  LLVM_ABI void setEnvironment(EnvironmentType Kind);
 
   /// Set the object file format.
-  void setObjectFormat(ObjectFormatType Kind);
+  LLVM_ABI void setObjectFormat(ObjectFormatType Kind);
 
   /// Set all components to the new triple \p Str.
-  void setTriple(const Twine &Str);
+  LLVM_ABI void setTriple(const Twine &Str);
 
   /// Set the architecture (first) component of the triple by name.
-  void setArchName(StringRef Str);
+  LLVM_ABI void setArchName(StringRef Str);
 
   /// Set the vendor (second) component of the triple by name.
-  void setVendorName(StringRef Str);
+  LLVM_ABI void setVendorName(StringRef Str);
 
   /// Set the operating system (third) component of the triple by name.
-  void setOSName(StringRef Str);
+  LLVM_ABI void setOSName(StringRef Str);
 
   /// Set the optional environment (fourth) component of the triple by name.
-  void setEnvironmentName(StringRef Str);
+  LLVM_ABI void setEnvironmentName(StringRef Str);
 
   /// Set the operating system and optional environment components with a single
   /// string.
-  void setOSAndEnvironmentName(StringRef Str);
+  LLVM_ABI void setOSAndEnvironmentName(StringRef Str);
 
   /// @}
   /// @name Helpers to build variants of a particular triple.
@@ -1214,7 +1216,7 @@ public:
   ///
   /// \returns A new triple with a 32-bit architecture or an unknown
   ///          architecture if no such variant can be found.
-  llvm::Triple get32BitArchVariant() const;
+  LLVM_ABI llvm::Triple get32BitArchVariant() const;
 
   /// Form a triple with a 64-bit variant of the current architecture.
   ///
@@ -1222,7 +1224,7 @@ public:
   ///
   /// \returns A new triple with a 64-bit architecture or an unknown
   ///          architecture if no such variant can be found.
-  llvm::Triple get64BitArchVariant() const;
+  LLVM_ABI llvm::Triple get64BitArchVariant() const;
 
   /// Form a triple with a big endian variant of the current architecture.
   ///
@@ -1230,7 +1232,7 @@ public:
   ///
   /// \returns A new triple with a big endian architecture or an unknown
   ///          architecture if no such variant can be found.
-  llvm::Triple getBigEndianArchVariant() const;
+  LLVM_ABI llvm::Triple getBigEndianArchVariant() const;
 
   /// Form a triple with a little endian variant of the current architecture.
   ///
@@ -1238,73 +1240,76 @@ public:
   ///
   /// \returns A new triple with a little endian architecture or an unknown
   ///          architecture if no such variant can be found.
-  llvm::Triple getLittleEndianArchVariant() const;
+  LLVM_ABI llvm::Triple getLittleEndianArchVariant() const;
 
   /// Tests whether the target triple is little endian.
   ///
   /// \returns true if the triple is little endian, false otherwise.
-  bool isLittleEndian() const;
+  LLVM_ABI bool isLittleEndian() const;
 
   /// Test whether target triples are compatible.
-  bool isCompatibleWith(const Triple &Other) const;
+  LLVM_ABI bool isCompatibleWith(const Triple &Other) const;
 
   /// Test whether the target triple is for a GPU.
   bool isGPU() const { return isSPIRV() || isNVPTX() || isAMDGPU(); }
 
   /// Merge target triples.
-  std::string merge(const Triple &Other) const;
+  LLVM_ABI std::string merge(const Triple &Other) const;
 
   /// Some platforms have different minimum supported OS versions that
   /// varies by the architecture specified in the triple. This function
   /// returns the minimum supported OS version for this triple if one an exists,
   /// or an invalid version tuple if this triple doesn't have one.
-  VersionTuple getMinimumSupportedOSVersion() const;
+  LLVM_ABI VersionTuple getMinimumSupportedOSVersion() const;
 
   /// @}
   /// @name Static helpers for IDs.
   /// @{
 
   /// Get the canonical name for the \p Kind architecture.
-  static StringRef getArchTypeName(ArchType Kind);
+  LLVM_ABI static StringRef getArchTypeName(ArchType Kind);
 
   /// Get the architecture name based on \p Kind and \p SubArch.
-  static StringRef getArchName(ArchType Kind, SubArchType SubArch = NoSubArch);
+  LLVM_ABI static StringRef getArchName(ArchType Kind,
+                                        SubArchType SubArch = NoSubArch);
 
   /// Get the "prefix" canonical name for the \p Kind architecture. This is the
   /// prefix used by the architecture specific builtins, and is suitable for
   /// passing to \see Intrinsic::getIntrinsicForClangBuiltin().
   ///
   /// \return - The architecture prefix, or 0 if none is defined.
-  static StringRef getArchTypePrefix(ArchType Kind);
+  LLVM_ABI static StringRef getArchTypePrefix(ArchType Kind);
 
   /// Get the canonical name for the \p Kind vendor.
-  static StringRef getVendorTypeName(VendorType Kind);
+  LLVM_ABI static StringRef getVendorTypeName(VendorType Kind);
 
   /// Get the canonical name for the \p Kind operating system.
-  static StringRef getOSTypeName(OSType Kind);
+  LLVM_ABI static StringRef getOSTypeName(OSType Kind);
 
   /// Get the canonical name for the \p Kind environment.
-  static StringRef getEnvironmentTypeName(EnvironmentType Kind);
+  LLVM_ABI static StringRef getEnvironmentTypeName(EnvironmentType Kind);
 
   /// Get the name for the \p Object format.
-  static StringRef getObjectFormatTypeName(ObjectFormatType ObjectFormat);
+  LLVM_ABI static StringRef
+  getObjectFormatTypeName(ObjectFormatType ObjectFormat);
 
   /// @}
   /// @name Static helpers for converting alternate architecture names.
   /// @{
 
   /// The canonical type for the given LLVM architecture name (e.g., "x86").
-  static ArchType getArchTypeForLLVMName(StringRef Str);
+  LLVM_ABI static ArchType getArchTypeForLLVMName(StringRef Str);
 
   /// @}
 
   /// Returns a canonicalized OS version number for the specified OS.
-  static VersionTuple getCanonicalVersionForOS(OSType OSKind,
-                                               const VersionTuple &Version,
-                                               bool IsInValidRange);
+  LLVM_ABI static VersionTuple
+  getCanonicalVersionForOS(OSType OSKind, const VersionTuple &Version,
+                           bool IsInValidRange);
 
   /// Returns whether an OS version is invalid and would not map to an Apple OS.
-  static bool isValidVersionForOS(OSType OSKind, const VersionTuple &Version);
+  LLVM_ABI static bool isValidVersionForOS(OSType OSKind,
+                                           const VersionTuple &Version);
 };
 
 } // End llvm namespace
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.h b/llvm/include/llvm/TargetParser/X86TargetParser.h
index 8447aca7bb92..f6aeaada346e 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.h
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.h
@@ -15,6 +15,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Compiler.h"
 #include <array>
 
 namespace llvm {
@@ -153,34 +154,36 @@ enum CPUKind {
 
 /// Parse \p CPU string into a CPUKind. Will only accept 64-bit capable CPUs if
 /// \p Only64Bit is true.
-CPUKind parseArchX86(StringRef CPU, bool Only64Bit = false);
-CPUKind parseTuneCPU(StringRef CPU, bool Only64Bit = false);
+LLVM_ABI CPUKind parseArchX86(StringRef CPU, bool Only64Bit = false);
+LLVM_ABI CPUKind parseTuneCPU(StringRef CPU, bool Only64Bit = false);
 
 /// Provide a list of valid CPU names. If \p Only64Bit is true, the list will
 /// only contain 64-bit capable CPUs.
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values,
-                          bool Only64Bit = false);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values,
+                                   bool Only64Bit = false);
 /// Provide a list of valid -mtune names.
-void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values,
-                          bool Only64Bit = false);
+LLVM_ABI void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values,
+                                   bool Only64Bit = false);
 
 /// Get the key feature prioritizing target multiversioning.
-ProcessorFeatures getKeyFeature(CPUKind Kind);
+LLVM_ABI ProcessorFeatures getKeyFeature(CPUKind Kind);
 
 /// Fill in the features that \p CPU supports into \p Features.
 /// "+" will be append in front of each feature if NeedPlus is true.
-void getFeaturesForCPU(StringRef CPU, SmallVectorImpl<StringRef> &Features,
-                       bool NeedPlus = false);
+LLVM_ABI void getFeaturesForCPU(StringRef CPU,
+                                SmallVectorImpl<StringRef> &Features,
+                                bool NeedPlus = false);
 
 /// Set or clear entries in \p Features that are implied to be enabled/disabled
 /// by the provided \p Feature.
-void updateImpliedFeatures(StringRef Feature, bool Enabled,
-                           StringMap<bool> &Features);
+LLVM_ABI void updateImpliedFeatures(StringRef Feature, bool Enabled,
+                                    StringMap<bool> &Features);
 
-char getCPUDispatchMangling(StringRef Name);
-bool validateCPUSpecificCPUDispatch(StringRef Name);
-std::array<uint32_t, 4> getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs);
-unsigned getFeaturePriority(ProcessorFeatures Feat);
+LLVM_ABI char getCPUDispatchMangling(StringRef Name);
+LLVM_ABI bool validateCPUSpecificCPUDispatch(StringRef Name);
+LLVM_ABI std::array<uint32_t, 4>
+getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs);
+LLVM_ABI unsigned getFeaturePriority(ProcessorFeatures Feat);
 
 } // namespace X86
 } // namespace llvm

From 8f8ed23c6247e9c1dd2df4494930813b353c52c4 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Wed, 11 Jun 2025 09:19:13 -0700
Subject: [PATCH 0006/1322] [llvm] annotate interfaces in llvm/SandboxIR for
 DLL export (#142863)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/SandboxIR` library.
These annotations currently have no meaningful impact on the LLVM build;
however, they are a prerequisite to support an LLVM Windows DLL (shared
library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

The bulk of these changes were generated automatically using the
[Interface Definition Scanner (IDS)](https://github.com/compnerd/ids)
tool, followed formatting with `git clang-format`.

The following manual adjustments were also applied after running IDS on
Linux:
- Remove explicit `GlobalWithNodeAPI::LLVMGVToGV::operator()` template
function instantiations that were previously added for the dylib build.
Instead, directly annotate the `LLVMGVToGV::operator()` method with
`LLVM_ABI`. This is done so the DLL build works with both MSVC and
clang-cl.
- Explicitly `#include "llvm/SandboxIR/Value.h"` in `Tracker.h` so that
the symbol is available for exported templates in this file. These
templates get fully instantiated on DLL export, so they require the full
definition of `Value`.
- Add extern template instantiation declarations for `GlobalWithNodeAPI`
template types in `Constants.h` and annotate them with
`LLVM_TEMPLATE_ABI`.
- Add `LLVM_EXPORT_TEMPLATE` to `GlobalWithNodeAPI` template
instantiations in `Constants.cpp`.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/SandboxIR/BasicBlock.h  |  21 +-
 llvm/include/llvm/SandboxIR/Constant.h    | 207 ++++----
 llvm/include/llvm/SandboxIR/Context.h     | 135 +++---
 llvm/include/llvm/SandboxIR/Function.h    |   5 +-
 llvm/include/llvm/SandboxIR/Instruction.h | 545 +++++++++++-----------
 llvm/include/llvm/SandboxIR/Module.h      |  10 +-
 llvm/include/llvm/SandboxIR/PassManager.h |   6 +-
 llvm/include/llvm/SandboxIR/Region.h      |  19 +-
 llvm/include/llvm/SandboxIR/Tracker.h     |  34 +-
 llvm/include/llvm/SandboxIR/Type.h        |  53 ++-
 llvm/include/llvm/SandboxIR/Use.h         |   9 +-
 llvm/include/llvm/SandboxIR/User.h        |  13 +-
 llvm/include/llvm/SandboxIR/Value.h       |  20 +-
 llvm/lib/SandboxIR/Constant.cpp           |  37 +-
 14 files changed, 564 insertions(+), 550 deletions(-)

diff --git a/llvm/include/llvm/SandboxIR/BasicBlock.h b/llvm/include/llvm/SandboxIR/BasicBlock.h
index 93e79e2a421f..25bbb6c058fa 100644
--- a/llvm/include/llvm/SandboxIR/BasicBlock.h
+++ b/llvm/include/llvm/SandboxIR/BasicBlock.h
@@ -11,6 +11,7 @@
 
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/SandboxIR/Value.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -32,20 +33,20 @@ private:
   llvm::BasicBlock *BB;
   llvm::BasicBlock::iterator It;
   Context *Ctx;
-  pointer getInstr(llvm::BasicBlock::iterator It) const;
+  LLVM_ABI pointer getInstr(llvm::BasicBlock::iterator It) const;
 
 public:
   BBIterator() : BB(nullptr), Ctx(nullptr) {}
   BBIterator(llvm::BasicBlock *BB, llvm::BasicBlock::iterator It, Context *Ctx)
       : BB(BB), It(It), Ctx(Ctx) {}
   reference operator*() const { return *getInstr(It); }
-  BBIterator &operator++();
+  LLVM_ABI BBIterator &operator++();
   BBIterator operator++(int) {
     auto Copy = *this;
     ++*this;
     return Copy;
   }
-  BBIterator &operator--();
+  LLVM_ABI BBIterator &operator--();
   BBIterator operator--(int) {
     auto Copy = *this;
     --*this;
@@ -60,14 +61,14 @@ public:
   /// the instruction is not found in the IR-to-SandboxIR tables.
   pointer get() const { return getInstr(It); }
   /// \Returns the parent BB.
-  BasicBlock *getNodeParent() const;
+  LLVM_ABI BasicBlock *getNodeParent() const;
 };
 
 /// Contains a list of sandboxir::Instruction's.
 class BasicBlock : public Value {
   /// Builds a graph that contains all values in \p BB in their original form
   /// i.e., no vectorization is taking place here.
-  void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB);
+  LLVM_ABI void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB);
   friend class Context;     // For `buildBasicBlockFromIR`
   friend class Instruction; // For LLVM Val.
 
@@ -82,9 +83,9 @@ public:
   static bool classof(const Value *From) {
     return From->getSubclassID() == Value::ClassID::Block;
   }
-  Function *getParent() const;
+  LLVM_ABI Function *getParent() const;
   using iterator = BBIterator;
-  iterator begin() const;
+  LLVM_ABI iterator begin() const;
   iterator end() const {
     auto *BB = cast<llvm::BasicBlock>(Val);
     return iterator(BB, BB->end(), &Ctx);
@@ -96,10 +97,10 @@ public:
     return std::make_reverse_iterator(begin());
   }
   Context &getContext() const { return Ctx; }
-  Instruction *getTerminator() const;
+  LLVM_ABI Instruction *getTerminator() const;
   bool empty() const { return begin() == end(); }
-  Instruction &front() const;
-  Instruction &back() const;
+  LLVM_ABI Instruction &front() const;
+  LLVM_ABI Instruction &back() const;
 
 #ifndef NDEBUG
   void verify() const final;
diff --git a/llvm/include/llvm/SandboxIR/Constant.h b/llvm/include/llvm/SandboxIR/Constant.h
index e7b18a442d33..6f682a7059d1 100644
--- a/llvm/include/llvm/SandboxIR/Constant.h
+++ b/llvm/include/llvm/SandboxIR/Constant.h
@@ -76,16 +76,16 @@ class ConstantInt : public Constant {
   }
 
 public:
-  static ConstantInt *getTrue(Context &Ctx);
-  static ConstantInt *getFalse(Context &Ctx);
-  static ConstantInt *getBool(Context &Ctx, bool V);
-  static Constant *getTrue(Type *Ty);
-  static Constant *getFalse(Type *Ty);
-  static Constant *getBool(Type *Ty, bool V);
+  LLVM_ABI static ConstantInt *getTrue(Context &Ctx);
+  LLVM_ABI static ConstantInt *getFalse(Context &Ctx);
+  LLVM_ABI static ConstantInt *getBool(Context &Ctx, bool V);
+  LLVM_ABI static Constant *getTrue(Type *Ty);
+  LLVM_ABI static Constant *getFalse(Type *Ty);
+  LLVM_ABI static Constant *getBool(Type *Ty, bool V);
 
   /// If Ty is a vector type, return a Constant with a splat of the given
   /// value. Otherwise return a ConstantInt for the given value.
-  static ConstantInt *get(Type *Ty, uint64_t V, bool IsSigned = false);
+  LLVM_ABI static ConstantInt *get(Type *Ty, uint64_t V, bool IsSigned = false);
 
   /// Return a ConstantInt with the specified integer value for the specified
   /// type. If the type is wider than 64 bits, the value will be zero-extended
@@ -93,27 +93,29 @@ public:
   /// be interpreted as a 64-bit signed integer and sign-extended to fit
   /// the type.
   /// Get a ConstantInt for a specific value.
-  static ConstantInt *get(IntegerType *Ty, uint64_t V, bool IsSigned = false);
+  LLVM_ABI static ConstantInt *get(IntegerType *Ty, uint64_t V,
+                                   bool IsSigned = false);
 
   /// Return a ConstantInt with the specified value for the specified type. The
   /// value V will be canonicalized to a an unsigned APInt. Accessing it with
   /// either getSExtValue() or getZExtValue() will yield a correctly sized and
   /// signed value for the type Ty.
   /// Get a ConstantInt for a specific signed value.
-  static ConstantInt *getSigned(IntegerType *Ty, int64_t V);
-  static Constant *getSigned(Type *Ty, int64_t V);
+  LLVM_ABI static ConstantInt *getSigned(IntegerType *Ty, int64_t V);
+  LLVM_ABI static Constant *getSigned(Type *Ty, int64_t V);
 
   /// Return a ConstantInt with the specified value and an implied Type. The
   /// type is the integer type that corresponds to the bit width of the value.
-  static ConstantInt *get(Context &Ctx, const APInt &V);
+  LLVM_ABI static ConstantInt *get(Context &Ctx, const APInt &V);
 
   /// Return a ConstantInt constructed from the string strStart with the given
   /// radix.
-  static ConstantInt *get(IntegerType *Ty, StringRef Str, uint8_t Radix);
+  LLVM_ABI static ConstantInt *get(IntegerType *Ty, StringRef Str,
+                                   uint8_t Radix);
 
   /// If Ty is a vector type, return a Constant with a splat of the given
   /// value. Otherwise return a ConstantInt for the given value.
-  static Constant *get(Type *Ty, const APInt &V);
+  LLVM_ABI static Constant *get(Type *Ty, const APInt &V);
 
   /// Return the constant as an APInt value reference. This allows clients to
   /// obtain a full-precision copy of the value.
@@ -166,7 +168,7 @@ public:
 
   /// Variant of the getType() method to always return an IntegerType, which
   /// reduces the amount of casting needed in parts of the compiler.
-  IntegerType *getIntegerType() const;
+  LLVM_ABI IntegerType *getIntegerType() const;
 
   /// This static method returns true if the type Ty is big enough to
   /// represent the value V. This can be used to avoid having the get method
@@ -177,8 +179,8 @@ public:
   /// to the appropriate unsigned type before calling the method.
   /// @returns true if V is a valid value for type Ty
   /// Determine if the value is in range for the given type.
-  static bool isValueValidForType(Type *Ty, uint64_t V);
-  static bool isValueValidForType(Type *Ty, int64_t V);
+  LLVM_ABI static bool isValueValidForType(Type *Ty, uint64_t V);
+  LLVM_ABI static bool isValueValidForType(Type *Ty, int64_t V);
 
   bool isNegative() const { return cast<llvm::ConstantInt>(Val)->isNegative(); }
 
@@ -264,29 +266,29 @@ public:
   /// for the specified value in the specified type. This should only be used
   /// for simple constant values like 2.0/1.0 etc, that are known-valid both as
   /// host double and as the target format.
-  static Constant *get(Type *Ty, double V);
+  LLVM_ABI static Constant *get(Type *Ty, double V);
 
   /// If Ty is a vector type, return a Constant with a splat of the given
   /// value. Otherwise return a ConstantFP for the given value.
-  static Constant *get(Type *Ty, const APFloat &V);
+  LLVM_ABI static Constant *get(Type *Ty, const APFloat &V);
 
-  static Constant *get(Type *Ty, StringRef Str);
+  LLVM_ABI static Constant *get(Type *Ty, StringRef Str);
 
-  static ConstantFP *get(const APFloat &V, Context &Ctx);
+  LLVM_ABI static ConstantFP *get(const APFloat &V, Context &Ctx);
 
-  static Constant *getNaN(Type *Ty, bool Negative = false,
-                          uint64_t Payload = 0);
-  static Constant *getQNaN(Type *Ty, bool Negative = false,
-                           APInt *Payload = nullptr);
-  static Constant *getSNaN(Type *Ty, bool Negative = false,
-                           APInt *Payload = nullptr);
-  static Constant *getZero(Type *Ty, bool Negative = false);
+  LLVM_ABI static Constant *getNaN(Type *Ty, bool Negative = false,
+                                   uint64_t Payload = 0);
+  LLVM_ABI static Constant *getQNaN(Type *Ty, bool Negative = false,
+                                    APInt *Payload = nullptr);
+  LLVM_ABI static Constant *getSNaN(Type *Ty, bool Negative = false,
+                                    APInt *Payload = nullptr);
+  LLVM_ABI static Constant *getZero(Type *Ty, bool Negative = false);
 
-  static Constant *getNegativeZero(Type *Ty);
-  static Constant *getInfinity(Type *Ty, bool Negative = false);
+  LLVM_ABI static Constant *getNegativeZero(Type *Ty);
+  LLVM_ABI static Constant *getInfinity(Type *Ty, bool Negative = false);
 
   /// Return true if Ty is big enough to represent V.
-  static bool isValueValidForType(Type *Ty, const APFloat &V);
+  LLVM_ABI static bool isValueValidForType(Type *Ty, const APFloat &V);
 
   inline const APFloat &getValueAPF() const {
     return cast<llvm::ConstantFP>(Val)->getValueAPF();
@@ -362,8 +364,8 @@ class ConstantArray final : public ConstantAggregate {
   friend class Context; // For constructor.
 
 public:
-  static Constant *get(ArrayType *T, ArrayRef<Constant *> V);
-  ArrayType *getType() const;
+  LLVM_ABI static Constant *get(ArrayType *T, ArrayRef<Constant *> V);
+  LLVM_ABI ArrayType *getType() const;
 
   // TODO: Missing functions: getType(), getTypeForElements(), getAnon(), get().
 
@@ -379,7 +381,7 @@ class ConstantStruct final : public ConstantAggregate {
   friend class Context; // For constructor.
 
 public:
-  static Constant *get(StructType *T, ArrayRef<Constant *> V);
+  LLVM_ABI static Constant *get(StructType *T, ArrayRef<Constant *> V);
 
   template <typename... Csts>
   static std::enable_if_t<are_base_of<Constant, Csts...>::value, Constant *>
@@ -396,8 +398,8 @@ public:
     return get(getTypeForElements(Ctx, V, Packed), V);
   }
   /// This version of the method allows an empty list.
-  static StructType *getTypeForElements(Context &Ctx, ArrayRef<Constant *> V,
-                                        bool Packed = false);
+  LLVM_ABI static StructType *
+  getTypeForElements(Context &Ctx, ArrayRef<Constant *> V, bool Packed = false);
   /// Return an anonymous struct type to use for a constant with the specified
   /// set of elements. The list must not be empty.
   static StructType *getTypeForElements(ArrayRef<Constant *> V,
@@ -424,10 +426,10 @@ class ConstantVector final : public ConstantAggregate {
   friend class Context; // For constructor.
 
 public:
-  static Constant *get(ArrayRef<Constant *> V);
+  LLVM_ABI static Constant *get(ArrayRef<Constant *> V);
   /// Return a ConstantVector with the specified constant in each element.
   /// Note that this might not return an instance of ConstantVector
-  static Constant *getSplat(ElementCount EC, Constant *Elt);
+  LLVM_ABI static Constant *getSplat(ElementCount EC, Constant *Elt);
   /// Specialize the getType() method to always return a FixedVectorType,
   /// which reduces the amount of casting needed in parts of the compiler.
   inline FixedVectorType *getType() const {
@@ -436,7 +438,7 @@ public:
   /// If all elements of the vector constant have the same value, return that
   /// value. Otherwise, return nullptr. Ignore poison elements by setting
   /// AllowPoison to true.
-  Constant *getSplatValue(bool AllowPoison = false) const;
+  LLVM_ABI Constant *getSplatValue(bool AllowPoison = false) const;
 
   /// For isa/dyn_cast.
   static bool classof(const Value *From) {
@@ -451,18 +453,18 @@ class ConstantAggregateZero final : public Constant {
   friend class Context; // For constructor.
 
 public:
-  static ConstantAggregateZero *get(Type *Ty);
+  LLVM_ABI static ConstantAggregateZero *get(Type *Ty);
   /// If this CAZ has array or vector type, return a zero with the right element
   /// type.
-  Constant *getSequentialElement() const;
+  LLVM_ABI Constant *getSequentialElement() const;
   /// If this CAZ has struct type, return a zero with the right element type for
   /// the specified element.
-  Constant *getStructElement(unsigned Elt) const;
+  LLVM_ABI Constant *getStructElement(unsigned Elt) const;
   /// Return a zero of the right value for the specified GEP index if we can,
   /// otherwise return null (e.g. if C is a ConstantExpr).
-  Constant *getElementValue(Constant *C) const;
+  LLVM_ABI Constant *getElementValue(Constant *C) const;
   /// Return a zero of the right value for the specified GEP index.
-  Constant *getElementValue(unsigned Idx) const;
+  LLVM_ABI Constant *getElementValue(unsigned Idx) const;
   /// Return the number of elements in the array, vector, or struct.
   ElementCount getElementCount() const {
     return cast<llvm::ConstantAggregateZero>(Val)->getElementCount();
@@ -769,9 +771,9 @@ class ConstantPointerNull final : public Constant {
   friend class Context; // For constructor.
 
 public:
-  static ConstantPointerNull *get(PointerType *Ty);
+  LLVM_ABI static ConstantPointerNull *get(PointerType *Ty);
 
-  PointerType *getType() const;
+  LLVM_ABI PointerType *getType() const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -802,22 +804,22 @@ protected:
 
 public:
   /// Static factory methods - Return an 'undef' object of the specified type.
-  static UndefValue *get(Type *T);
+  LLVM_ABI static UndefValue *get(Type *T);
 
   /// If this Undef has array or vector type, return a undef with the right
   /// element type.
-  UndefValue *getSequentialElement() const;
+  LLVM_ABI UndefValue *getSequentialElement() const;
 
   /// If this undef has struct type, return a undef with the right element type
   /// for the specified element.
-  UndefValue *getStructElement(unsigned Elt) const;
+  LLVM_ABI UndefValue *getStructElement(unsigned Elt) const;
 
   /// Return an undef of the right value for the specified GEP index if we can,
   /// otherwise return null (e.g. if C is a ConstantExpr).
-  UndefValue *getElementValue(Constant *C) const;
+  LLVM_ABI UndefValue *getElementValue(Constant *C) const;
 
   /// Return an undef of the right value for the specified GEP index.
-  UndefValue *getElementValue(unsigned Idx) const;
+  LLVM_ABI UndefValue *getElementValue(unsigned Idx) const;
 
   /// Return the number of elements in the array, vector, or struct.
   unsigned getNumElements() const {
@@ -850,22 +852,22 @@ class PoisonValue final : public UndefValue {
 
 public:
   /// Static factory methods - Return an 'poison' object of the specified type.
-  static PoisonValue *get(Type *T);
+  LLVM_ABI static PoisonValue *get(Type *T);
 
   /// If this poison has array or vector type, return a poison with the right
   /// element type.
-  PoisonValue *getSequentialElement() const;
+  LLVM_ABI PoisonValue *getSequentialElement() const;
 
   /// If this poison has struct type, return a poison with the right element
   /// type for the specified element.
-  PoisonValue *getStructElement(unsigned Elt) const;
+  LLVM_ABI PoisonValue *getStructElement(unsigned Elt) const;
 
   /// Return an poison of the right value for the specified GEP index if we can,
   /// otherwise return null (e.g. if C is a ConstantExpr).
-  PoisonValue *getElementValue(Constant *C) const;
+  LLVM_ABI PoisonValue *getElementValue(Constant *C) const;
 
   /// Return an poison of the right value for the specified GEP index.
-  PoisonValue *getElementValue(unsigned Idx) const;
+  LLVM_ABI PoisonValue *getElementValue(unsigned Idx) const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -924,7 +926,7 @@ public:
   UnnamedAddr getUnnamedAddr() const {
     return cast<llvm::GlobalValue>(Val)->getUnnamedAddr();
   }
-  void setUnnamedAddr(UnnamedAddr V);
+  LLVM_ABI void setUnnamedAddr(UnnamedAddr V);
 
   static UnnamedAddr getMinUnnamedAddr(UnnamedAddr A, UnnamedAddr B) {
     return llvm::GlobalValue::getMinUnnamedAddr(A, B);
@@ -946,7 +948,7 @@ public:
   bool hasProtectedVisibility() const {
     return cast<llvm::GlobalValue>(Val)->hasProtectedVisibility();
   }
-  void setVisibility(VisibilityTypes V);
+  LLVM_ABI void setVisibility(VisibilityTypes V);
 
   // TODO: Add missing functions.
 };
@@ -996,7 +998,7 @@ public:
   ///
   /// Setting the section to the empty string tells LLVM to choose an
   /// appropriate default object file section.
-  void setSection(StringRef S);
+  LLVM_ABI void setSection(StringRef S);
 
   bool hasComdat() const { return cast<llvm::GlobalObject>(Val)->hasComdat(); }
 
@@ -1031,7 +1033,7 @@ class GlobalWithNodeAPI : public ParentT {
   struct LLVMGVToGV {
     Context &Ctx;
     LLVMGVToGV(Context &Ctx) : Ctx(Ctx) {}
-    GlobalT &operator()(LLVMGlobalT &LLVMGV) const;
+    LLVM_ABI GlobalT &operator()(LLVMGlobalT &LLVMGV) const;
   };
 
 public:
@@ -1060,24 +1062,15 @@ public:
   }
 };
 
-// These are needed for SandboxIRTest when building with LLVM_BUILD_LLVM_DYLIB
-extern template LLVM_TEMPLATE_ABI GlobalIFunc &
-GlobalWithNodeAPI<GlobalIFunc, llvm::GlobalIFunc, GlobalObject,
-                  llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalIFunc
-                                                                  &LLVMGV)
-    const;
-extern template LLVM_TEMPLATE_ABI Function &
-GlobalWithNodeAPI<Function, llvm::Function, GlobalObject, llvm::GlobalObject>::
-    LLVMGVToGV::operator()(llvm::Function &LLVMGV) const;
-
-extern template LLVM_TEMPLATE_ABI GlobalVariable &GlobalWithNodeAPI<
-    GlobalVariable, llvm::GlobalVariable, GlobalObject,
-    llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalVariable &LLVMGV)
-    const;
-extern template LLVM_TEMPLATE_ABI GlobalAlias &
-GlobalWithNodeAPI<GlobalAlias, llvm::GlobalAlias, GlobalValue,
-                  llvm::GlobalValue>::LLVMGVToGV::operator()(llvm::GlobalAlias
-                                                                 &LLVMGV) const;
+// Explicit instantiations.
+extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI<
+    GlobalIFunc, llvm::GlobalIFunc, GlobalObject, llvm::GlobalObject>;
+extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI<
+    Function, llvm::Function, GlobalObject, llvm::GlobalObject>;
+extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI<
+    GlobalVariable, llvm::GlobalVariable, GlobalObject, llvm::GlobalObject>;
+extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI<
+    GlobalAlias, llvm::GlobalAlias, GlobalValue, llvm::GlobalValue>;
 
 class GlobalIFunc final
     : public GlobalWithNodeAPI<GlobalIFunc, llvm::GlobalIFunc, GlobalObject,
@@ -1097,13 +1090,13 @@ public:
   // TODO: Missing functions: copyAttributesFrom(), removeFromParent(),
   // eraseFromParent()
 
-  void setResolver(Constant *Resolver);
+  LLVM_ABI void setResolver(Constant *Resolver);
 
-  Constant *getResolver() const;
+  LLVM_ABI Constant *getResolver() const;
 
   // Return the resolver function after peeling off potential ConstantExpr
   // indirection.
-  Function *getResolverFunction();
+  LLVM_ABI Function *getResolverFunction();
   const Function *getResolverFunction() const {
     return const_cast<GlobalIFunc *>(this)->getResolverFunction();
   }
@@ -1136,7 +1129,7 @@ class GlobalVariable final
   struct LLVMGVToGV {
     Context &Ctx;
     LLVMGVToGV(Context &Ctx) : Ctx(Ctx) {}
-    GlobalVariable &operator()(llvm::GlobalVariable &LLVMGV) const;
+    LLVM_ABI GlobalVariable &operator()(llvm::GlobalVariable &LLVMGV) const;
   };
 
 public:
@@ -1181,11 +1174,11 @@ public:
   /// illegal to call this method if the global is external, because we cannot
   /// tell what the value is initialized to!
   ///
-  Constant *getInitializer() const;
+  LLVM_ABI Constant *getInitializer() const;
   /// setInitializer - Sets the initializer for this global variable, removing
   /// any existing initializer if InitVal==NULL. The initializer must have the
   /// type getValueType().
-  void setInitializer(Constant *InitVal);
+  LLVM_ABI void setInitializer(Constant *InitVal);
 
   // TODO: Add missing replaceInitializer(). Requires special tracker
 
@@ -1196,12 +1189,12 @@ public:
   bool isConstant() const {
     return cast<llvm::GlobalVariable>(Val)->isConstant();
   }
-  void setConstant(bool V);
+  LLVM_ABI void setConstant(bool V);
 
   bool isExternallyInitialized() const {
     return cast<llvm::GlobalVariable>(Val)->isExternallyInitialized();
   }
-  void setExternallyInitialized(bool Val);
+  LLVM_ABI void setExternallyInitialized(bool Val);
 
   // TODO: Missing copyAttributesFrom()
 
@@ -1278,7 +1271,7 @@ public:
   /// Sets the alignment attribute of the GlobalVariable.
   /// This method will be deprecated as the alignment property should always be
   /// defined.
-  void setAlignment(MaybeAlign Align);
+  LLVM_ABI void setAlignment(MaybeAlign Align);
 
   // TODO: Missing setCodeModel(). Requires custom tracker.
 
@@ -1311,10 +1304,10 @@ public:
   // TODO: Missing copyAttributresFrom().
   // TODO: Missing removeFromParent(), eraseFromParent().
 
-  void setAliasee(Constant *Aliasee);
-  Constant *getAliasee() const;
+  LLVM_ABI void setAliasee(Constant *Aliasee);
+  LLVM_ABI Constant *getAliasee() const;
 
-  const GlobalObject *getAliaseeObject() const;
+  LLVM_ABI const GlobalObject *getAliaseeObject() const;
   GlobalObject *getAliaseeObject() {
     return const_cast<GlobalObject *>(
         static_cast<const GlobalAlias *>(this)->getAliaseeObject());
@@ -1336,12 +1329,12 @@ class NoCFIValue final : public Constant {
 
 public:
   /// Return a NoCFIValue for the specified function.
-  static NoCFIValue *get(GlobalValue *GV);
+  LLVM_ABI static NoCFIValue *get(GlobalValue *GV);
 
-  GlobalValue *getGlobalValue() const;
+  LLVM_ABI GlobalValue *getGlobalValue() const;
 
   /// NoCFIValue is always a pointer.
-  PointerType *getType() const;
+  LLVM_ABI PointerType *getType() const;
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
     return From->getSubclassID() == ClassID::NoCFIValue;
@@ -1369,21 +1362,21 @@ class ConstantPtrAuth final : public Constant {
 
 public:
   /// Return a pointer signed with the specified parameters.
-  static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key,
-                              ConstantInt *Disc, Constant *AddrDisc);
+  LLVM_ABI static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key,
+                                       ConstantInt *Disc, Constant *AddrDisc);
   /// The pointer that is signed in this ptrauth signed pointer.
-  Constant *getPointer() const;
+  LLVM_ABI Constant *getPointer() const;
 
   /// The Key ID, an i32 constant.
-  ConstantInt *getKey() const;
+  LLVM_ABI ConstantInt *getKey() const;
 
   /// The integer discriminator, an i64 constant, or 0.
-  ConstantInt *getDiscriminator() const;
+  LLVM_ABI ConstantInt *getDiscriminator() const;
 
   /// The address discriminator if any, or the null constant.
   /// If present, this must be a value equivalent to the storage location of
   /// the only global-initializer user of the ptrauth signed pointer.
-  Constant *getAddrDiscriminator() const;
+  LLVM_ABI Constant *getAddrDiscriminator() const;
 
   /// Whether there is any non-null address discriminator.
   bool hasAddressDiscriminator() const {
@@ -1410,7 +1403,7 @@ public:
 
   /// Produce a new ptrauth expression signing the given value using
   /// the same schema as is stored in one.
-  ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const;
+  LLVM_ABI ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -1438,19 +1431,19 @@ class BlockAddress final : public Constant {
 
 public:
   /// Return a BlockAddress for the specified function and basic block.
-  static BlockAddress *get(Function *F, BasicBlock *BB);
+  LLVM_ABI static BlockAddress *get(Function *F, BasicBlock *BB);
 
   /// Return a BlockAddress for the specified basic block.  The basic
   /// block must be embedded into a function.
-  static BlockAddress *get(BasicBlock *BB);
+  LLVM_ABI static BlockAddress *get(BasicBlock *BB);
 
   /// Lookup an existing \c BlockAddress constant for the given BasicBlock.
   ///
   /// \returns 0 if \c !BB->hasAddressTaken(), otherwise the \c BlockAddress.
-  static BlockAddress *lookup(const BasicBlock *BB);
+  LLVM_ABI static BlockAddress *lookup(const BasicBlock *BB);
 
-  Function *getFunction() const;
-  BasicBlock *getBasicBlock() const;
+  LLVM_ABI Function *getFunction() const;
+  LLVM_ABI BasicBlock *getBasicBlock() const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -1465,9 +1458,9 @@ class DSOLocalEquivalent final : public Constant {
 
 public:
   /// Return a DSOLocalEquivalent for the specified global value.
-  static DSOLocalEquivalent *get(GlobalValue *GV);
+  LLVM_ABI static DSOLocalEquivalent *get(GlobalValue *GV);
 
-  GlobalValue *getGlobalValue() const;
+  LLVM_ABI GlobalValue *getGlobalValue() const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -1498,7 +1491,7 @@ class ConstantTokenNone final : public Constant {
 
 public:
   /// Return the ConstantTokenNone.
-  static ConstantTokenNone *get(Context &Ctx);
+  LLVM_ABI static ConstantTokenNone *get(Context &Ctx);
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h
index a8a21b0db855..7d8b2c86e94a 100644
--- a/llvm/include/llvm/SandboxIR/Context.h
+++ b/llvm/include/llvm/SandboxIR/Context.h
@@ -15,6 +15,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/SandboxIR/Tracker.h"
 #include "llvm/SandboxIR/Type.h"
+#include "llvm/Support/Compiler.h"
 
 #include <cstdint>
 
@@ -112,32 +113,33 @@ protected:
   CallbackID::ValTy NextCallbackID = 1;
 
   /// Remove \p V from the maps and returns the unique_ptr.
-  std::unique_ptr<Value> detachLLVMValue(llvm::Value *V);
+  LLVM_ABI std::unique_ptr<Value> detachLLVMValue(llvm::Value *V);
   /// Remove \p SBV from all SandboxIR maps and stop owning it. This effectively
   /// detaches \p V from the underlying IR.
-  std::unique_ptr<Value> detach(Value *V);
+  LLVM_ABI std::unique_ptr<Value> detach(Value *V);
   friend class Instruction; // For detach().
   /// Take ownership of VPtr and store it in `LLVMValueToValueMap`.
-  Value *registerValue(std::unique_ptr<Value> &&VPtr);
+  LLVM_ABI Value *registerValue(std::unique_ptr<Value> &&VPtr);
   friend class EraseFromParent; // For registerValue().
   /// This is the actual function that creates sandboxir values for \p V,
   /// and among others handles all instruction types.
-  Value *getOrCreateValueInternal(llvm::Value *V, llvm::User *U = nullptr);
+  LLVM_ABI Value *getOrCreateValueInternal(llvm::Value *V,
+                                           llvm::User *U = nullptr);
   /// Get or create a sandboxir::Argument for an existing LLVM IR \p LLVMArg.
-  Argument *getOrCreateArgument(llvm::Argument *LLVMArg);
+  LLVM_ABI Argument *getOrCreateArgument(llvm::Argument *LLVMArg);
   /// Get or create a sandboxir::Value for an existing LLVM IR \p LLVMV.
   Value *getOrCreateValue(llvm::Value *LLVMV) {
     return getOrCreateValueInternal(LLVMV, 0);
   }
   /// Get or create a sandboxir::Constant from an existing LLVM IR \p LLVMC.
-  Constant *getOrCreateConstant(llvm::Constant *LLVMC);
+  LLVM_ABI Constant *getOrCreateConstant(llvm::Constant *LLVMC);
   friend class ConstantDataSequential; // For getOrCreateConstant().
   friend class Utils; // For getMemoryBase
 
-  void runEraseInstrCallbacks(Instruction *I);
-  void runCreateInstrCallbacks(Instruction *I);
-  void runMoveInstrCallbacks(Instruction *I, const BBIterator &Where);
-  void runSetUseCallbacks(const Use &U, Value *NewSrc);
+  LLVM_ABI void runEraseInstrCallbacks(Instruction *I);
+  LLVM_ABI void runCreateInstrCallbacks(Instruction *I);
+  LLVM_ABI void runMoveInstrCallbacks(Instruction *I, const BBIterator &Where);
+  LLVM_ABI void runSetUseCallbacks(const Use &U, Value *NewSrc);
 
   friend class User;  // For runSetUseCallbacks().
   friend class Value; // For runSetUseCallbacks().
@@ -148,90 +150,97 @@ protected:
 
   /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. This will
   /// also create all contents of the block.
-  BasicBlock *createBasicBlock(llvm::BasicBlock *BB);
+  LLVM_ABI BasicBlock *createBasicBlock(llvm::BasicBlock *BB);
   friend class BasicBlock; // For getOrCreateValue().
 
   IRBuilder<ConstantFolder> LLVMIRBuilder;
   auto &getLLVMIRBuilder() { return LLVMIRBuilder; }
 
-  VAArgInst *createVAArgInst(llvm::VAArgInst *SI);
+  LLVM_ABI VAArgInst *createVAArgInst(llvm::VAArgInst *SI);
   friend VAArgInst; // For createVAArgInst()
-  FreezeInst *createFreezeInst(llvm::FreezeInst *SI);
+  LLVM_ABI FreezeInst *createFreezeInst(llvm::FreezeInst *SI);
   friend FreezeInst; // For createFreezeInst()
-  FenceInst *createFenceInst(llvm::FenceInst *SI);
+  LLVM_ABI FenceInst *createFenceInst(llvm::FenceInst *SI);
   friend FenceInst; // For createFenceInst()
-  SelectInst *createSelectInst(llvm::SelectInst *SI);
+  LLVM_ABI SelectInst *createSelectInst(llvm::SelectInst *SI);
   friend SelectInst; // For createSelectInst()
-  InsertElementInst *createInsertElementInst(llvm::InsertElementInst *IEI);
+  LLVM_ABI InsertElementInst *
+  createInsertElementInst(llvm::InsertElementInst *IEI);
   friend InsertElementInst; // For createInsertElementInst()
-  ExtractElementInst *createExtractElementInst(llvm::ExtractElementInst *EEI);
+  LLVM_ABI ExtractElementInst *
+  createExtractElementInst(llvm::ExtractElementInst *EEI);
   friend ExtractElementInst; // For createExtractElementInst()
-  ShuffleVectorInst *createShuffleVectorInst(llvm::ShuffleVectorInst *SVI);
+  LLVM_ABI ShuffleVectorInst *
+  createShuffleVectorInst(llvm::ShuffleVectorInst *SVI);
   friend ShuffleVectorInst; // For createShuffleVectorInst()
-  ExtractValueInst *createExtractValueInst(llvm::ExtractValueInst *IVI);
+  LLVM_ABI ExtractValueInst *
+  createExtractValueInst(llvm::ExtractValueInst *IVI);
   friend ExtractValueInst; // For createExtractValueInst()
-  InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI);
+  LLVM_ABI InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI);
   friend InsertValueInst; // For createInsertValueInst()
-  BranchInst *createBranchInst(llvm::BranchInst *I);
+  LLVM_ABI BranchInst *createBranchInst(llvm::BranchInst *I);
   friend BranchInst; // For createBranchInst()
-  LoadInst *createLoadInst(llvm::LoadInst *LI);
+  LLVM_ABI LoadInst *createLoadInst(llvm::LoadInst *LI);
   friend LoadInst; // For createLoadInst()
-  StoreInst *createStoreInst(llvm::StoreInst *SI);
+  LLVM_ABI StoreInst *createStoreInst(llvm::StoreInst *SI);
   friend StoreInst; // For createStoreInst()
-  ReturnInst *createReturnInst(llvm::ReturnInst *I);
+  LLVM_ABI ReturnInst *createReturnInst(llvm::ReturnInst *I);
   friend ReturnInst; // For createReturnInst()
-  CallInst *createCallInst(llvm::CallInst *I);
+  LLVM_ABI CallInst *createCallInst(llvm::CallInst *I);
   friend CallInst; // For createCallInst()
-  InvokeInst *createInvokeInst(llvm::InvokeInst *I);
+  LLVM_ABI InvokeInst *createInvokeInst(llvm::InvokeInst *I);
   friend InvokeInst; // For createInvokeInst()
-  CallBrInst *createCallBrInst(llvm::CallBrInst *I);
+  LLVM_ABI CallBrInst *createCallBrInst(llvm::CallBrInst *I);
   friend CallBrInst; // For createCallBrInst()
-  LandingPadInst *createLandingPadInst(llvm::LandingPadInst *I);
+  LLVM_ABI LandingPadInst *createLandingPadInst(llvm::LandingPadInst *I);
   friend LandingPadInst; // For createLandingPadInst()
-  CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I);
+  LLVM_ABI CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I);
   friend CatchPadInst; // For createCatchPadInst()
-  CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I);
+  LLVM_ABI CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I);
   friend CleanupPadInst; // For createCleanupPadInst()
-  CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I);
+  LLVM_ABI CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I);
   friend CatchReturnInst; // For createCatchReturnInst()
-  CleanupReturnInst *createCleanupReturnInst(llvm::CleanupReturnInst *I);
+  LLVM_ABI CleanupReturnInst *
+  createCleanupReturnInst(llvm::CleanupReturnInst *I);
   friend CleanupReturnInst; // For createCleanupReturnInst()
-  GetElementPtrInst *createGetElementPtrInst(llvm::GetElementPtrInst *I);
+  LLVM_ABI GetElementPtrInst *
+  createGetElementPtrInst(llvm::GetElementPtrInst *I);
   friend GetElementPtrInst; // For createGetElementPtrInst()
-  CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I);
+  LLVM_ABI CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I);
   friend CatchSwitchInst; // For createCatchSwitchInst()
-  ResumeInst *createResumeInst(llvm::ResumeInst *I);
+  LLVM_ABI ResumeInst *createResumeInst(llvm::ResumeInst *I);
   friend ResumeInst; // For createResumeInst()
-  SwitchInst *createSwitchInst(llvm::SwitchInst *I);
+  LLVM_ABI SwitchInst *createSwitchInst(llvm::SwitchInst *I);
   friend SwitchInst; // For createSwitchInst()
-  UnaryOperator *createUnaryOperator(llvm::UnaryOperator *I);
+  LLVM_ABI UnaryOperator *createUnaryOperator(llvm::UnaryOperator *I);
   friend UnaryOperator; // For createUnaryOperator()
-  BinaryOperator *createBinaryOperator(llvm::BinaryOperator *I);
+  LLVM_ABI BinaryOperator *createBinaryOperator(llvm::BinaryOperator *I);
   friend BinaryOperator; // For createBinaryOperator()
-  AtomicRMWInst *createAtomicRMWInst(llvm::AtomicRMWInst *I);
+  LLVM_ABI AtomicRMWInst *createAtomicRMWInst(llvm::AtomicRMWInst *I);
   friend AtomicRMWInst; // For createAtomicRMWInst()
-  AtomicCmpXchgInst *createAtomicCmpXchgInst(llvm::AtomicCmpXchgInst *I);
+  LLVM_ABI AtomicCmpXchgInst *
+  createAtomicCmpXchgInst(llvm::AtomicCmpXchgInst *I);
   friend AtomicCmpXchgInst; // For createAtomicCmpXchgInst()
-  AllocaInst *createAllocaInst(llvm::AllocaInst *I);
+  LLVM_ABI AllocaInst *createAllocaInst(llvm::AllocaInst *I);
   friend AllocaInst; // For createAllocaInst()
-  CastInst *createCastInst(llvm::CastInst *I);
+  LLVM_ABI CastInst *createCastInst(llvm::CastInst *I);
   friend CastInst; // For createCastInst()
-  PHINode *createPHINode(llvm::PHINode *I);
+  LLVM_ABI PHINode *createPHINode(llvm::PHINode *I);
   friend PHINode; // For createPHINode()
-  UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI);
+  LLVM_ABI UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI);
   friend UnreachableInst; // For createUnreachableInst()
-  CmpInst *createCmpInst(llvm::CmpInst *I);
+  LLVM_ABI CmpInst *createCmpInst(llvm::CmpInst *I);
   friend CmpInst; // For createCmpInst()
-  ICmpInst *createICmpInst(llvm::ICmpInst *I);
+  LLVM_ABI ICmpInst *createICmpInst(llvm::ICmpInst *I);
   friend ICmpInst; // For createICmpInst()
-  FCmpInst *createFCmpInst(llvm::FCmpInst *I);
+  LLVM_ABI FCmpInst *createFCmpInst(llvm::FCmpInst *I);
   friend FCmpInst; // For createFCmpInst()
 
 public:
-  Context(LLVMContext &LLVMCtx);
-  ~Context();
+  LLVM_ABI Context(LLVMContext &LLVMCtx);
+  LLVM_ABI ~Context();
   /// Clears function-level state.
-  void clear();
+  LLVM_ABI void clear();
 
   Tracker &getTracker() { return IRTracker; }
   /// Convenience function for `getTracker().save()`
@@ -241,14 +250,14 @@ public:
   /// Convenience function for `getTracker().accept()`
   void accept() { IRTracker.accept(); }
 
-  sandboxir::Value *getValue(llvm::Value *V) const;
+  LLVM_ABI sandboxir::Value *getValue(llvm::Value *V) const;
   const sandboxir::Value *getValue(const llvm::Value *V) const {
     return getValue(const_cast<llvm::Value *>(V));
   }
 
-  Module *getModule(llvm::Module *LLVMM) const;
+  LLVM_ABI Module *getModule(llvm::Module *LLVMM) const;
 
-  Module *getOrCreateModule(llvm::Module *LLVMM);
+  LLVM_ABI Module *getOrCreateModule(llvm::Module *LLVMM);
 
   Type *getType(llvm::Type *LLVMTy) {
     if (LLVMTy == nullptr)
@@ -265,10 +274,10 @@ public:
   /// This is the main API function for creating Sandbox IR.
   /// Note: this will not fully populate its parent module. The only globals
   /// that will be available are those used within the function.
-  Function *createFunction(llvm::Function *F);
+  LLVM_ABI Function *createFunction(llvm::Function *F);
 
   /// Create a sandboxir::Module corresponding to \p LLVMM.
-  Module *createModule(llvm::Module *LLVMM);
+  LLVM_ABI Module *createModule(llvm::Module *LLVMM);
 
   /// \Returns the number of values registered with Context.
   size_t getNumValues() const { return LLVMValueToValueMap.size(); }
@@ -277,26 +286,26 @@ public:
   /// to be removed from its parent. Note that this will also be called when
   /// reverting the creation of an instruction.
   /// \Returns a callback ID for later deregistration.
-  CallbackID registerEraseInstrCallback(EraseInstrCallback CB);
-  void unregisterEraseInstrCallback(CallbackID ID);
+  LLVM_ABI CallbackID registerEraseInstrCallback(EraseInstrCallback CB);
+  LLVM_ABI void unregisterEraseInstrCallback(CallbackID ID);
 
   /// Register a callback that gets called right after a SandboxIR instruction
   /// is created. Note that this will also be called when reverting the removal
   /// of an instruction.
   /// \Returns a callback ID for later deregistration.
-  CallbackID registerCreateInstrCallback(CreateInstrCallback CB);
-  void unregisterCreateInstrCallback(CallbackID ID);
+  LLVM_ABI CallbackID registerCreateInstrCallback(CreateInstrCallback CB);
+  LLVM_ABI void unregisterCreateInstrCallback(CallbackID ID);
 
   /// Register a callback that gets called when a SandboxIR instruction is about
   /// to be moved. Note that this will also be called when reverting a move.
   /// \Returns a callback ID for later deregistration.
-  CallbackID registerMoveInstrCallback(MoveInstrCallback CB);
-  void unregisterMoveInstrCallback(CallbackID ID);
+  LLVM_ABI CallbackID registerMoveInstrCallback(MoveInstrCallback CB);
+  LLVM_ABI void unregisterMoveInstrCallback(CallbackID ID);
 
   /// Register a callback that gets called when a Use gets set.
   /// \Returns a callback ID for later deregistration.
-  CallbackID registerSetUseCallback(SetUseCallback CB);
-  void unregisterSetUseCallback(CallbackID ID);
+  LLVM_ABI CallbackID registerSetUseCallback(SetUseCallback CB);
+  LLVM_ABI void unregisterSetUseCallback(CallbackID ID);
 };
 
 } // namespace sandboxir
diff --git a/llvm/include/llvm/SandboxIR/Function.h b/llvm/include/llvm/SandboxIR/Function.h
index 2c4b53ef6c1e..28c69112b2b7 100644
--- a/llvm/include/llvm/SandboxIR/Function.h
+++ b/llvm/include/llvm/SandboxIR/Function.h
@@ -11,6 +11,7 @@
 
 #include "llvm/IR/Function.h"
 #include "llvm/SandboxIR/Constant.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -56,7 +57,7 @@ public:
     LLVMBBToBB BBGetter(Ctx);
     return iterator(cast<llvm::Function>(Val)->end(), BBGetter);
   }
-  FunctionType *getFunctionType() const;
+  LLVM_ABI FunctionType *getFunctionType() const;
 
   /// Returns the alignment of the given function.
   MaybeAlign getAlign() const { return cast<llvm::Function>(Val)->getAlign(); }
@@ -66,7 +67,7 @@ public:
   /// Sets the alignment attribute of the Function.
   /// This method will be deprecated as the alignment property should always be
   /// defined.
-  void setAlignment(MaybeAlign Align);
+  LLVM_ABI void setAlignment(MaybeAlign Align);
 
 #ifndef NDEBUG
   void verify() const final {
diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h
index ce5a2cbec85b..4e3ff19d4778 100644
--- a/llvm/include/llvm/SandboxIR/Instruction.h
+++ b/llvm/include/llvm/SandboxIR/Instruction.h
@@ -16,6 +16,7 @@
 #include "llvm/SandboxIR/BasicBlock.h"
 #include "llvm/SandboxIR/Constant.h"
 #include "llvm/SandboxIR/User.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -57,7 +58,7 @@ protected:
 
   /// A SandboxIR Instruction may map to multiple LLVM IR Instruction. This
   /// returns its topmost LLVM IR instruction.
-  llvm::Instruction *getTopmostLLVMInstruction() const;
+  LLVM_ABI llvm::Instruction *getTopmostLLVMInstruction() const;
   friend class VAArgInst;          // For getTopmostLLVMInstruction().
   friend class FreezeInst;         // For getTopmostLLVMInstruction().
   friend class FenceInst;          // For getTopmostLLVMInstruction().
@@ -113,17 +114,17 @@ protected:
   }
 
 public:
-  static const char *getOpcodeName(Opcode Opc);
+  LLVM_ABI static const char *getOpcodeName(Opcode Opc);
   /// This is used by BasicBlock::iterator.
   virtual unsigned getNumOfIRInstrs() const = 0;
   /// \Returns a BasicBlock::iterator for this Instruction.
-  BBIterator getIterator() const;
+  LLVM_ABI BBIterator getIterator() const;
   /// \Returns the next sandboxir::Instruction in the block, or nullptr if at
   /// the end of the block.
-  Instruction *getNextNode() const;
+  LLVM_ABI Instruction *getNextNode() const;
   /// \Returns the previous sandboxir::Instruction in the block, or nullptr if
   /// at the beginning of the block.
-  Instruction *getPrevNode() const;
+  LLVM_ABI Instruction *getPrevNode() const;
   /// \Returns this Instruction's opcode. Note that SandboxIR has its own opcode
   /// state to allow for new SandboxIR-specific instructions.
   Opcode getOpcode() const { return Opc; }
@@ -188,17 +189,17 @@ public:
   // TODO: More missing functions
 
   /// Detach this from its parent BasicBlock without deleting it.
-  void removeFromParent();
+  LLVM_ABI void removeFromParent();
   /// Detach this Value from its parent and delete it.
-  void eraseFromParent();
+  LLVM_ABI void eraseFromParent();
   /// Insert this detached instruction before \p BeforeI.
-  void insertBefore(Instruction *BeforeI);
+  LLVM_ABI void insertBefore(Instruction *BeforeI);
   /// Insert this detached instruction after \p AfterI.
-  void insertAfter(Instruction *AfterI);
+  LLVM_ABI void insertAfter(Instruction *AfterI);
   /// Insert this detached instruction into \p BB at \p WhereIt.
-  void insertInto(BasicBlock *BB, const BBIterator &WhereIt);
+  LLVM_ABI void insertInto(BasicBlock *BB, const BBIterator &WhereIt);
   /// Move this instruction to \p WhereIt.
-  void moveBefore(BasicBlock &BB, const BBIterator &WhereIt);
+  LLVM_ABI void moveBefore(BasicBlock &BB, const BBIterator &WhereIt);
   /// Move this instruction before \p Before.
   void moveBefore(Instruction *Before) {
     moveBefore(*Before->getParent(), Before->getIterator());
@@ -217,9 +218,9 @@ public:
   }
   /// \Returns the BasicBlock containing this Instruction, or null if it is
   /// detached.
-  BasicBlock *getParent() const;
+  LLVM_ABI BasicBlock *getParent() const;
   /// For isa/dyn_cast.
-  static bool classof(const sandboxir::Value *From);
+  LLVM_ABI static bool classof(const sandboxir::Value *From);
 
   /// Determine whether the no signed wrap flag is set.
   bool hasNoUnsignedWrap() const {
@@ -227,20 +228,20 @@ public:
   }
   /// Set or clear the nuw flag on this instruction, which must be an operator
   /// which supports this flag. See LangRef.html for the meaning of this flag.
-  void setHasNoUnsignedWrap(bool B = true);
+  LLVM_ABI void setHasNoUnsignedWrap(bool B = true);
   /// Determine whether the no signed wrap flag is set.
   bool hasNoSignedWrap() const {
     return cast<llvm::Instruction>(Val)->hasNoSignedWrap();
   }
   /// Set or clear the nsw flag on this instruction, which must be an operator
   /// which supports this flag. See LangRef.html for the meaning of this flag.
-  void setHasNoSignedWrap(bool B = true);
+  LLVM_ABI void setHasNoSignedWrap(bool B = true);
   /// Determine whether all fast-math-flags are set.
   bool isFast() const { return cast<llvm::Instruction>(Val)->isFast(); }
   /// Set or clear all fast-math-flags on this instruction, which must be an
   /// operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setFast(bool B);
+  LLVM_ABI void setFast(bool B);
   /// Determine whether the allow-reassociation flag is set.
   bool hasAllowReassoc() const {
     return cast<llvm::Instruction>(Val)->hasAllowReassoc();
@@ -248,24 +249,24 @@ public:
   /// Set or clear the reassociation flag on this instruction, which must be
   /// an operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasAllowReassoc(bool B);
+  LLVM_ABI void setHasAllowReassoc(bool B);
   /// Determine whether the exact flag is set.
   bool isExact() const { return cast<llvm::Instruction>(Val)->isExact(); }
   /// Set or clear the exact flag on this instruction, which must be an operator
   /// which supports this flag. See LangRef.html for the meaning of this flag.
-  void setIsExact(bool B = true);
+  LLVM_ABI void setIsExact(bool B = true);
   /// Determine whether the no-NaNs flag is set.
   bool hasNoNaNs() const { return cast<llvm::Instruction>(Val)->hasNoNaNs(); }
   /// Set or clear the no-nans flag on this instruction, which must be an
   /// operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasNoNaNs(bool B);
+  LLVM_ABI void setHasNoNaNs(bool B);
   /// Determine whether the no-infs flag is set.
   bool hasNoInfs() const { return cast<llvm::Instruction>(Val)->hasNoInfs(); }
   /// Set or clear the no-infs flag on this instruction, which must be an
   /// operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasNoInfs(bool B);
+  LLVM_ABI void setHasNoInfs(bool B);
   /// Determine whether the no-signed-zeros flag is set.
   bool hasNoSignedZeros() const {
     return cast<llvm::Instruction>(Val)->hasNoSignedZeros();
@@ -273,7 +274,7 @@ public:
   /// Set or clear the no-signed-zeros flag on this instruction, which must be
   /// an operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasNoSignedZeros(bool B);
+  LLVM_ABI void setHasNoSignedZeros(bool B);
   /// Determine whether the allow-reciprocal flag is set.
   bool hasAllowReciprocal() const {
     return cast<llvm::Instruction>(Val)->hasAllowReciprocal();
@@ -281,7 +282,7 @@ public:
   /// Set or clear the allow-reciprocal flag on this instruction, which must be
   /// an operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasAllowReciprocal(bool B);
+  LLVM_ABI void setHasAllowReciprocal(bool B);
   /// Determine whether the allow-contract flag is set.
   bool hasAllowContract() const {
     return cast<llvm::Instruction>(Val)->hasAllowContract();
@@ -289,7 +290,7 @@ public:
   /// Set or clear the allow-contract flag on this instruction, which must be
   /// an operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasAllowContract(bool B);
+  LLVM_ABI void setHasAllowContract(bool B);
   /// Determine whether the approximate-math-functions flag is set.
   bool hasApproxFunc() const {
     return cast<llvm::Instruction>(Val)->hasApproxFunc();
@@ -297,7 +298,7 @@ public:
   /// Set or clear the approximate-math-functions flag on this instruction,
   /// which must be an operator which supports this flag. See LangRef.html for
   /// the meaning of this flag.
-  void setHasApproxFunc(bool B);
+  LLVM_ABI void setHasApproxFunc(bool B);
   /// Convenience function for getting all the fast-math flags, which must be an
   /// operator which supports these flags. See LangRef.html for the meaning of
   /// these flags.
@@ -307,11 +308,11 @@ public:
   /// Convenience function for setting multiple fast-math flags on this
   /// instruction, which must be an operator which supports these flags. See
   /// LangRef.html for the meaning of these flags.
-  void setFastMathFlags(FastMathFlags FMF);
+  LLVM_ABI void setFastMathFlags(FastMathFlags FMF);
   /// Convenience function for transferring all fast-math flag values to this
   /// instruction, which must be an operator which supports these flags. See
   /// LangRef.html for the meaning of these flags.
-  void copyFastMathFlags(FastMathFlags FMF);
+  LLVM_ABI void copyFastMathFlags(FastMathFlags FMF);
 
   bool isAssociative() const {
     return cast<llvm::Instruction>(Val)->isAssociative();
@@ -352,7 +353,7 @@ public:
 
   bool isVolatile() const { return cast<llvm::Instruction>(Val)->isVolatile(); }
 
-  Type *getAccessType() const;
+  LLVM_ABI Type *getAccessType() const;
 
   bool mayThrow(bool IncludePhaseOneUnwind = false) const {
     return cast<llvm::Instruction>(Val)->mayThrow(IncludePhaseOneUnwind);
@@ -414,22 +415,22 @@ class FenceInst : public SingleLLVMInstructionImpl<llvm::FenceInst> {
   friend Context; // For constructor;
 
 public:
-  static FenceInst *create(AtomicOrdering Ordering, InsertPosition Pos,
-                           Context &Ctx,
-                           SyncScope::ID SSID = SyncScope::System);
+  LLVM_ABI static FenceInst *create(AtomicOrdering Ordering, InsertPosition Pos,
+                                    Context &Ctx,
+                                    SyncScope::ID SSID = SyncScope::System);
   /// Returns the ordering constraint of this fence instruction.
   AtomicOrdering getOrdering() const {
     return cast<llvm::FenceInst>(Val)->getOrdering();
   }
   /// Sets the ordering constraint of this fence instruction.  May only be
   /// Acquire, Release, AcquireRelease, or SequentiallyConsistent.
-  void setOrdering(AtomicOrdering Ordering);
+  LLVM_ABI void setOrdering(AtomicOrdering Ordering);
   /// Returns the synchronization scope ID of this fence instruction.
   SyncScope::ID getSyncScopeID() const {
     return cast<llvm::FenceInst>(Val)->getSyncScopeID();
   }
   /// Sets the synchronization scope ID of this fence instruction.
-  void setSyncScopeID(SyncScope::ID SSID);
+  LLVM_ABI void setSyncScopeID(SyncScope::ID SSID);
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Fence;
   }
@@ -443,9 +444,9 @@ class SelectInst : public SingleLLVMInstructionImpl<llvm::SelectInst> {
   friend Context; // for SelectInst()
 
 public:
-  static Value *create(Value *Cond, Value *True, Value *False,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Cond, Value *True, Value *False,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
 
   const Value *getCondition() const { return getOperand(0); }
   const Value *getTrueValue() const { return getOperand(1); }
@@ -457,7 +458,7 @@ public:
   void setCondition(Value *New) { setOperand(0, New); }
   void setTrueValue(Value *New) { setOperand(1, New); }
   void setFalseValue(Value *New) { setOperand(2, New); }
-  void swapValues();
+  LLVM_ABI void swapValues();
 
   /// Return a string if the specified operands are invalid for a select
   /// operation, otherwise return null.
@@ -468,7 +469,7 @@ public:
   }
 
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
+  LLVM_ABI static bool classof(const Value *From);
 };
 
 class InsertElementInst final
@@ -480,9 +481,9 @@ class InsertElementInst final
   friend class Context; // For accessing the constructor in create*()
 
 public:
-  static Value *create(Value *Vec, Value *NewElt, Value *Idx,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Vec, Value *NewElt, Value *Idx,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::InsertElement;
   }
@@ -503,8 +504,8 @@ class ExtractElementInst final
                         // create*()
 
 public:
-  static Value *create(Value *Vec, Value *Idx, InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Vec, Value *Idx, InsertPosition Pos,
+                                Context &Ctx, const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::ExtractElement;
   }
@@ -516,7 +517,7 @@ public:
   Value *getIndexOperand() { return getOperand(1); }
   const Value *getVectorOperand() const { return getOperand(0); }
   const Value *getIndexOperand() const { return getOperand(1); }
-  VectorType *getVectorOperandType() const;
+  LLVM_ABI VectorType *getVectorOperandType() const;
 };
 
 class ShuffleVectorInst final
@@ -528,18 +529,19 @@ class ShuffleVectorInst final
   friend class Context; // For accessing the constructor in create*()
 
 public:
-  static Value *create(Value *V1, Value *V2, Value *Mask, InsertPosition Pos,
-                       Context &Ctx, const Twine &Name = "");
-  static Value *create(Value *V1, Value *V2, ArrayRef<int> Mask,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *V1, Value *V2, Value *Mask,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *V1, Value *V2, ArrayRef<int> Mask,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::ShuffleVector;
   }
 
   /// Swap the operands and adjust the mask to preserve the semantics of the
   /// instruction.
-  void commute();
+  LLVM_ABI void commute();
 
   /// Return true if a shufflevector instruction can be formed with the
   /// specified operands.
@@ -554,7 +556,7 @@ public:
   }
 
   /// Overload to return most specific vector type.
-  VectorType *getType() const;
+  LLVM_ABI VectorType *getType() const;
 
   /// Return the shuffle mask value of this instruction for the given element
   /// index. Return PoisonMaskElem if the element is undef.
@@ -577,12 +579,12 @@ public:
   }
 
   /// Return the mask for this instruction, for use in bitcode.
-  Constant *getShuffleMaskForBitcode() const;
+  LLVM_ABI Constant *getShuffleMaskForBitcode() const;
 
-  static Constant *convertShuffleMaskForBitcode(ArrayRef<int> Mask,
-                                                Type *ResultTy);
+  LLVM_ABI static Constant *convertShuffleMaskForBitcode(ArrayRef<int> Mask,
+                                                         Type *ResultTy);
 
-  void setShuffleMask(ArrayRef<int> Mask);
+  LLVM_ABI void setShuffleMask(ArrayRef<int> Mask);
 
   ArrayRef<int> getShuffleMask() const {
     return cast<llvm::ShuffleVectorInst>(Val)->getShuffleMask();
@@ -965,9 +967,9 @@ class InsertValueInst
   friend Context; // for InsertValueInst()
 
 public:
-  static Value *create(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::InsertValue;
@@ -1024,36 +1026,37 @@ class BranchInst : public SingleLLVMInstructionImpl<llvm::BranchInst> {
   friend Context; // for BranchInst()
 
 public:
-  static BranchInst *create(BasicBlock *IfTrue, InsertPosition Pos,
-                            Context &Ctx);
-  static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse,
-                            Value *Cond, InsertPosition Pos, Context &Ctx);
+  LLVM_ABI static BranchInst *create(BasicBlock *IfTrue, InsertPosition Pos,
+                                     Context &Ctx);
+  LLVM_ABI static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                                     Value *Cond, InsertPosition Pos,
+                                     Context &Ctx);
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
+  LLVM_ABI static bool classof(const Value *From);
   bool isUnconditional() const {
     return cast<llvm::BranchInst>(Val)->isUnconditional();
   }
   bool isConditional() const {
     return cast<llvm::BranchInst>(Val)->isConditional();
   }
-  Value *getCondition() const;
+  LLVM_ABI Value *getCondition() const;
   void setCondition(Value *V) { setOperand(0, V); }
   unsigned getNumSuccessors() const { return 1 + isConditional(); }
-  BasicBlock *getSuccessor(unsigned SuccIdx) const;
-  void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
+  LLVM_ABI BasicBlock *getSuccessor(unsigned SuccIdx) const;
+  LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
   void swapSuccessors() { swapOperandsInternal(1, 2); }
 
 private:
   struct LLVMBBToSBBB {
     Context &Ctx;
     LLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {}
-    BasicBlock *operator()(llvm::BasicBlock *BB) const;
+    LLVM_ABI BasicBlock *operator()(llvm::BasicBlock *BB) const;
   };
 
   struct ConstLLVMBBToSBBB {
     Context &Ctx;
     ConstLLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {}
-    const BasicBlock *operator()(const llvm::BasicBlock *BB) const;
+    LLVM_ABI const BasicBlock *operator()(const llvm::BasicBlock *BB) const;
   };
 
 public:
@@ -1109,8 +1112,9 @@ class ExtractValueInst : public UnaryInstruction {
   friend Context; // for ExtractValueInst()
 
 public:
-  static Value *create(Value *Agg, ArrayRef<unsigned> Idxs, InsertPosition Pos,
-                       Context &Ctx, const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Agg, ArrayRef<unsigned> Idxs,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::ExtractValue;
@@ -1120,7 +1124,7 @@ public:
   /// with an extractvalue instruction with the specified parameters.
   ///
   /// Null is returned if the indices are invalid for the specified type.
-  static Type *getIndexedType(Type *Agg, ArrayRef<unsigned> Idxs);
+  LLVM_ABI static Type *getIndexedType(Type *Agg, ArrayRef<unsigned> Idxs);
 
   using idx_iterator = llvm::ExtractValueInst::idx_iterator;
 
@@ -1163,9 +1167,9 @@ class VAArgInst : public UnaryInstruction {
   friend Context; // For constructor;
 
 public:
-  static VAArgInst *create(Value *List, Type *Ty, InsertPosition Pos,
-                           Context &Ctx, const Twine &Name = "");
-  Value *getPointerOperand();
+  LLVM_ABI static VAArgInst *create(Value *List, Type *Ty, InsertPosition Pos,
+                                    Context &Ctx, const Twine &Name = "");
+  LLVM_ABI Value *getPointerOperand();
   const Value *getPointerOperand() const {
     return const_cast<VAArgInst *>(this)->getPointerOperand();
   }
@@ -1183,8 +1187,8 @@ class FreezeInst : public UnaryInstruction {
   friend Context; // For constructor;
 
 public:
-  static FreezeInst *create(Value *V, InsertPosition Pos, Context &Ctx,
-                            const Twine &Name = "");
+  LLVM_ABI static FreezeInst *create(Value *V, InsertPosition Pos, Context &Ctx,
+                                     const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Freeze;
   }
@@ -1200,11 +1204,11 @@ public:
   /// Return true if this is a load from a volatile memory location.
   bool isVolatile() const { return cast<llvm::LoadInst>(Val)->isVolatile(); }
   /// Specify whether this is a volatile load or not.
-  void setVolatile(bool V);
+  LLVM_ABI void setVolatile(bool V);
 
-  static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
-                          InsertPosition Pos, bool IsVolatile, Context &Ctx,
-                          const Twine &Name = "");
+  LLVM_ABI static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                                   InsertPosition Pos, bool IsVolatile,
+                                   Context &Ctx, const Twine &Name = "");
   static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
                           InsertPosition Pos, Context &Ctx,
                           const Twine &Name = "") {
@@ -1212,8 +1216,8 @@ public:
   }
 
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
-  Value *getPointerOperand() const;
+  LLVM_ABI static bool classof(const Value *From);
+  LLVM_ABI Value *getPointerOperand() const;
   Align getAlign() const { return cast<llvm::LoadInst>(Val)->getAlign(); }
   bool isUnordered() const { return cast<llvm::LoadInst>(Val)->isUnordered(); }
   bool isSimple() const { return cast<llvm::LoadInst>(Val)->isSimple(); }
@@ -1229,19 +1233,20 @@ public:
   /// Return true if this is a store from a volatile memory location.
   bool isVolatile() const { return cast<llvm::StoreInst>(Val)->isVolatile(); }
   /// Specify whether this is a volatile store or not.
-  void setVolatile(bool V);
+  LLVM_ABI void setVolatile(bool V);
 
-  static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
-                           InsertPosition Pos, bool IsVolatile, Context &Ctx);
+  LLVM_ABI static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
+                                    InsertPosition Pos, bool IsVolatile,
+                                    Context &Ctx);
   static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
                            InsertPosition Pos, Context &Ctx) {
     return create(V, Ptr, Align, Pos, /*IsVolatile=*/false, Ctx);
   }
 
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
-  Value *getValueOperand() const;
-  Value *getPointerOperand() const;
+  LLVM_ABI static bool classof(const Value *From);
+  LLVM_ABI Value *getValueOperand() const;
+  LLVM_ABI Value *getPointerOperand() const;
   Align getAlign() const { return cast<llvm::StoreInst>(Val)->getAlign(); }
   bool isSimple() const { return cast<llvm::StoreInst>(Val)->isSimple(); }
   bool isUnordered() const { return cast<llvm::StoreInst>(Val)->isUnordered(); }
@@ -1260,8 +1265,8 @@ class UnreachableInst final : public Instruction {
   }
 
 public:
-  static UnreachableInst *create(InsertPosition Pos, Context &Ctx);
-  static bool classof(const Value *From);
+  LLVM_ABI static UnreachableInst *create(InsertPosition Pos, Context &Ctx);
+  LLVM_ABI static bool classof(const Value *From);
   unsigned getNumSuccessors() const { return 0; }
   unsigned getUseOperandNo(const Use &Use) const final {
     llvm_unreachable("UnreachableInst has no operands!");
@@ -1280,12 +1285,13 @@ class ReturnInst final : public SingleLLVMInstructionImpl<llvm::ReturnInst> {
                                   Context &Ctx);
 
 public:
-  static ReturnInst *create(Value *RetVal, InsertPosition Pos, Context &Ctx);
+  LLVM_ABI static ReturnInst *create(Value *RetVal, InsertPosition Pos,
+                                     Context &Ctx);
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Ret;
   }
   /// \Returns null if there is no return value.
-  Value *getReturnValue() const;
+  LLVM_ABI Value *getReturnValue() const;
 };
 
 class CallBase : public SingleLLVMInstructionImpl<llvm::CallBase> {
@@ -1303,7 +1309,7 @@ public:
            Opc == Instruction::ClassID::CallBr;
   }
 
-  FunctionType *getFunctionType() const;
+  LLVM_ABI FunctionType *getFunctionType() const;
 
   op_iterator data_operands_begin() { return op_begin(); }
   const_op_iterator data_operands_begin() const {
@@ -1390,17 +1396,17 @@ public:
   }
   bool hasArgument(const Value *V) const { return is_contained(args(), V); }
 
-  Value *getCalledOperand() const;
-  Use getCalledOperandUse() const;
+  LLVM_ABI Value *getCalledOperand() const;
+  LLVM_ABI Use getCalledOperandUse() const;
 
-  Function *getCalledFunction() const;
+  LLVM_ABI Function *getCalledFunction() const;
   bool isIndirectCall() const {
     return cast<llvm::CallBase>(Val)->isIndirectCall();
   }
   bool isCallee(Use U) const {
     return cast<llvm::CallBase>(Val)->isCallee(U.LLVMUse);
   }
-  Function *getCaller();
+  LLVM_ABI Function *getCaller();
   const Function *getCaller() const {
     return const_cast<CallBase *>(this)->getCaller();
   }
@@ -1412,7 +1418,7 @@ public:
     return cast<llvm::CallBase>(Val)->getIntrinsicID();
   }
   void setCalledOperand(Value *V) { getCalledOperandUse().set(V); }
-  void setCalledFunction(Function *F);
+  LLVM_ABI void setCalledFunction(Function *F);
   CallingConv::ID getCallingConv() const {
     return cast<llvm::CallBase>(Val)->getCallingConv();
   }
@@ -1428,9 +1434,9 @@ class CallInst : public CallBase {
   friend class IntrinsicInst; // For constructor
 
 public:
-  static CallInst *create(FunctionType *FTy, Value *Func,
-                          ArrayRef<Value *> Args, InsertPosition Pos,
-                          Context &Ctx, const Twine &NameStr = "");
+  LLVM_ABI static CallInst *create(FunctionType *FTy, Value *Func,
+                                   ArrayRef<Value *> Args, InsertPosition Pos,
+                                   Context &Ctx, const Twine &NameStr = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Call;
@@ -1446,20 +1452,21 @@ class InvokeInst final : public CallBase {
                         // create*()
 
 public:
-  static InvokeInst *create(FunctionType *FTy, Value *Func,
-                            BasicBlock *IfNormal, BasicBlock *IfException,
-                            ArrayRef<Value *> Args, InsertPosition Pos,
-                            Context &Ctx, const Twine &NameStr = "");
+  LLVM_ABI static InvokeInst *create(FunctionType *FTy, Value *Func,
+                                     BasicBlock *IfNormal,
+                                     BasicBlock *IfException,
+                                     ArrayRef<Value *> Args, InsertPosition Pos,
+                                     Context &Ctx, const Twine &NameStr = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Invoke;
   }
-  BasicBlock *getNormalDest() const;
-  BasicBlock *getUnwindDest() const;
-  void setNormalDest(BasicBlock *BB);
-  void setUnwindDest(BasicBlock *BB);
-  LandingPadInst *getLandingPadInst() const;
-  BasicBlock *getSuccessor(unsigned SuccIdx) const;
+  LLVM_ABI BasicBlock *getNormalDest() const;
+  LLVM_ABI BasicBlock *getUnwindDest() const;
+  LLVM_ABI void setNormalDest(BasicBlock *BB);
+  LLVM_ABI void setUnwindDest(BasicBlock *BB);
+  LLVM_ABI LandingPadInst *getLandingPadInst() const;
+  LLVM_ABI BasicBlock *getSuccessor(unsigned SuccIdx) const;
   void setSuccessor(unsigned SuccIdx, BasicBlock *NewSucc) {
     assert(SuccIdx < 2 && "Successor # out of range for invoke!");
     if (SuccIdx == 0)
@@ -1481,25 +1488,25 @@ class CallBrInst final : public CallBase {
                         // create*()
 
 public:
-  static CallBrInst *create(FunctionType *FTy, Value *Func,
-                            BasicBlock *DefaultDest,
-                            ArrayRef<BasicBlock *> IndirectDests,
-                            ArrayRef<Value *> Args, InsertPosition Pos,
-                            Context &Ctx, const Twine &NameStr = "");
+  LLVM_ABI static CallBrInst *create(FunctionType *FTy, Value *Func,
+                                     BasicBlock *DefaultDest,
+                                     ArrayRef<BasicBlock *> IndirectDests,
+                                     ArrayRef<Value *> Args, InsertPosition Pos,
+                                     Context &Ctx, const Twine &NameStr = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CallBr;
   }
   unsigned getNumIndirectDests() const {
     return cast<llvm::CallBrInst>(Val)->getNumIndirectDests();
   }
-  Value *getIndirectDestLabel(unsigned Idx) const;
-  Value *getIndirectDestLabelUse(unsigned Idx) const;
-  BasicBlock *getDefaultDest() const;
-  BasicBlock *getIndirectDest(unsigned Idx) const;
-  SmallVector<BasicBlock *, 16> getIndirectDests() const;
-  void setDefaultDest(BasicBlock *BB);
-  void setIndirectDest(unsigned Idx, BasicBlock *BB);
-  BasicBlock *getSuccessor(unsigned Idx) const;
+  LLVM_ABI Value *getIndirectDestLabel(unsigned Idx) const;
+  LLVM_ABI Value *getIndirectDestLabelUse(unsigned Idx) const;
+  LLVM_ABI BasicBlock *getDefaultDest() const;
+  LLVM_ABI BasicBlock *getIndirectDest(unsigned Idx) const;
+  LLVM_ABI SmallVector<BasicBlock *, 16> getIndirectDests() const;
+  LLVM_ABI void setDefaultDest(BasicBlock *BB);
+  LLVM_ABI void setIndirectDest(unsigned Idx, BasicBlock *BB);
+  LLVM_ABI BasicBlock *getSuccessor(unsigned Idx) const;
   unsigned getNumSuccessors() const {
     return cast<llvm::CallBrInst>(Val)->getNumSuccessors();
   }
@@ -1512,9 +1519,10 @@ class LandingPadInst : public SingleLLVMInstructionImpl<llvm::LandingPadInst> {
   friend class Context; // For constructor.
 
 public:
-  static LandingPadInst *create(Type *RetTy, unsigned NumReservedClauses,
-                                InsertPosition Pos, Context &Ctx,
-                                const Twine &Name = "");
+  LLVM_ABI static LandingPadInst *create(Type *RetTy,
+                                         unsigned NumReservedClauses,
+                                         InsertPosition Pos, Context &Ctx,
+                                         const Twine &Name = "");
   /// Return 'true' if this landingpad instruction is a
   /// cleanup. I.e., it should be run when unwinding even if its landing pad
   /// doesn't catch the exception.
@@ -1522,14 +1530,14 @@ public:
     return cast<llvm::LandingPadInst>(Val)->isCleanup();
   }
   /// Indicate that this landingpad instruction is a cleanup.
-  void setCleanup(bool V);
+  LLVM_ABI void setCleanup(bool V);
 
   // TODO: We are not implementing addClause() because we have no way to revert
   // it for now.
 
   /// Get the value of the clause at index Idx. Use isCatch/isFilter to
   /// determine what type of clause this is.
-  Constant *getClause(unsigned Idx) const;
+  LLVM_ABI Constant *getClause(unsigned Idx) const;
 
   /// Return 'true' if the clause and index Idx is a catch clause.
   bool isCatch(unsigned Idx) const {
@@ -1565,12 +1573,12 @@ public:
   ///
   /// Note: This returns the associated CatchSwitchInst if this FuncletPadInst
   /// is a CatchPadInst.
-  Value *getParentPad() const;
-  void setParentPad(Value *ParentPad);
+  LLVM_ABI Value *getParentPad() const;
+  LLVM_ABI void setParentPad(Value *ParentPad);
   /// Return the Idx-th funcletpad argument.
-  Value *getArgOperand(unsigned Idx) const;
+  LLVM_ABI Value *getArgOperand(unsigned Idx) const;
   /// Set the Idx-th funcletpad argument.
-  void setArgOperand(unsigned Idx, Value *V);
+  LLVM_ABI void setArgOperand(unsigned Idx, Value *V);
 
   // TODO: Implement missing functions: arg_operands().
   static bool classof(const Value *From) {
@@ -1585,13 +1593,13 @@ class CatchPadInst : public FuncletPadInst {
   friend class Context; // For constructor.
 
 public:
-  CatchSwitchInst *getCatchSwitch() const;
+  LLVM_ABI CatchSwitchInst *getCatchSwitch() const;
   // TODO: We have not implemented setCatchSwitch() because we can't revert it
   // for now, as there is no CatchPadInst member function that can undo it.
 
-  static CatchPadInst *create(Value *ParentPad, ArrayRef<Value *> Args,
-                              InsertPosition Pos, Context &Ctx,
-                              const Twine &Name = "");
+  LLVM_ABI static CatchPadInst *create(Value *ParentPad, ArrayRef<Value *> Args,
+                                       InsertPosition Pos, Context &Ctx,
+                                       const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CatchPad;
   }
@@ -1603,9 +1611,10 @@ class CleanupPadInst : public FuncletPadInst {
   friend class Context; // For constructor.
 
 public:
-  static CleanupPadInst *create(Value *ParentPad, ArrayRef<Value *> Args,
-                                InsertPosition Pos, Context &Ctx,
-                                const Twine &Name = "");
+  LLVM_ABI static CleanupPadInst *create(Value *ParentPad,
+                                         ArrayRef<Value *> Args,
+                                         InsertPosition Pos, Context &Ctx,
+                                         const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CleanupPad;
   }
@@ -1619,16 +1628,17 @@ class CatchReturnInst
   friend class Context; // For constructor.
 
 public:
-  static CatchReturnInst *create(CatchPadInst *CatchPad, BasicBlock *BB,
-                                 InsertPosition Pos, Context &Ctx);
-  CatchPadInst *getCatchPad() const;
-  void setCatchPad(CatchPadInst *CatchPad);
-  BasicBlock *getSuccessor() const;
-  void setSuccessor(BasicBlock *NewSucc);
+  LLVM_ABI static CatchReturnInst *create(CatchPadInst *CatchPad,
+                                          BasicBlock *BB, InsertPosition Pos,
+                                          Context &Ctx);
+  LLVM_ABI CatchPadInst *getCatchPad() const;
+  LLVM_ABI void setCatchPad(CatchPadInst *CatchPad);
+  LLVM_ABI BasicBlock *getSuccessor() const;
+  LLVM_ABI void setSuccessor(BasicBlock *NewSucc);
   unsigned getNumSuccessors() {
     return cast<llvm::CatchReturnInst>(Val)->getNumSuccessors();
   }
-  Value *getCatchSwitchParentPad() const;
+  LLVM_ABI Value *getCatchSwitchParentPad() const;
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CatchRet;
   }
@@ -1642,22 +1652,22 @@ class CleanupReturnInst
   friend class Context; // For constructor.
 
 public:
-  static CleanupReturnInst *create(CleanupPadInst *CleanupPad,
-                                   BasicBlock *UnwindBB, InsertPosition Pos,
-                                   Context &Ctx);
+  LLVM_ABI static CleanupReturnInst *create(CleanupPadInst *CleanupPad,
+                                            BasicBlock *UnwindBB,
+                                            InsertPosition Pos, Context &Ctx);
   bool hasUnwindDest() const {
     return cast<llvm::CleanupReturnInst>(Val)->hasUnwindDest();
   }
   bool unwindsToCaller() const {
     return cast<llvm::CleanupReturnInst>(Val)->unwindsToCaller();
   }
-  CleanupPadInst *getCleanupPad() const;
-  void setCleanupPad(CleanupPadInst *CleanupPad);
+  LLVM_ABI CleanupPadInst *getCleanupPad() const;
+  LLVM_ABI void setCleanupPad(CleanupPadInst *CleanupPad);
   unsigned getNumSuccessors() const {
     return cast<llvm::CleanupReturnInst>(Val)->getNumSuccessors();
   }
-  BasicBlock *getUnwindDest() const;
-  void setUnwindDest(BasicBlock *NewDest);
+  LLVM_ABI BasicBlock *getUnwindDest() const;
+  LLVM_ABI void setUnwindDest(BasicBlock *NewDest);
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CleanupRet;
@@ -1677,16 +1687,16 @@ class GetElementPtrInst final
                         // create*()
 
 public:
-  static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &NameStr = "");
+  LLVM_ABI static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &NameStr = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::GetElementPtr;
   }
 
-  Type *getSourceElementType() const;
-  Type *getResultElementType() const;
+  LLVM_ABI Type *getSourceElementType() const;
+  LLVM_ABI Type *getResultElementType() const;
   unsigned getAddressSpace() const {
     return cast<llvm::GetElementPtrInst>(Val)->getAddressSpace();
   }
@@ -1706,11 +1716,11 @@ public:
     return const_cast<GetElementPtrInst *>(this)->indices();
   }
 
-  Value *getPointerOperand() const;
+  LLVM_ABI Value *getPointerOperand() const;
   static unsigned getPointerOperandIndex() {
     return llvm::GetElementPtrInst::getPointerOperandIndex();
   }
-  Type *getPointerOperandType() const;
+  LLVM_ABI Type *getPointerOperandType() const;
   unsigned getPointerAddressSpace() const {
     return cast<llvm::GetElementPtrInst>(Val)->getPointerAddressSpace();
   }
@@ -1750,12 +1760,12 @@ class CatchSwitchInst
   friend class Context; // For accessing the constructor in create*()
 
 public:
-  static CatchSwitchInst *create(Value *ParentPad, BasicBlock *UnwindBB,
-                                 unsigned NumHandlers, InsertPosition Pos,
-                                 Context &Ctx, const Twine &Name = "");
+  LLVM_ABI static CatchSwitchInst *
+  create(Value *ParentPad, BasicBlock *UnwindBB, unsigned NumHandlers,
+         InsertPosition Pos, Context &Ctx, const Twine &Name = "");
 
-  Value *getParentPad() const;
-  void setParentPad(Value *ParentPad);
+  LLVM_ABI Value *getParentPad() const;
+  LLVM_ABI void setParentPad(Value *ParentPad);
 
   bool hasUnwindDest() const {
     return cast<llvm::CatchSwitchInst>(Val)->hasUnwindDest();
@@ -1763,8 +1773,8 @@ public:
   bool unwindsToCaller() const {
     return cast<llvm::CatchSwitchInst>(Val)->unwindsToCaller();
   }
-  BasicBlock *getUnwindDest() const;
-  void setUnwindDest(BasicBlock *UnwindDest);
+  LLVM_ABI BasicBlock *getUnwindDest() const;
+  LLVM_ABI void setUnwindDest(BasicBlock *UnwindDest);
 
   unsigned getNumHandlers() const {
     return cast<llvm::CatchSwitchInst>(Val)->getNumHandlers();
@@ -1810,7 +1820,7 @@ public:
     return make_range(handler_begin(), handler_end());
   }
 
-  void addHandler(BasicBlock *Dest);
+  LLVM_ABI void addHandler(BasicBlock *Dest);
 
   // TODO: removeHandler() cannot be reverted because there is no equivalent
   // addHandler() with a handler_iterator to specify the position. So we can't
@@ -1839,8 +1849,9 @@ class ResumeInst : public SingleLLVMInstructionImpl<llvm::ResumeInst> {
   friend class Context; // For accessing the constructor in create*()
 
 public:
-  static ResumeInst *create(Value *Exn, InsertPosition Pos, Context &Ctx);
-  Value *getValue() const;
+  LLVM_ABI static ResumeInst *create(Value *Exn, InsertPosition Pos,
+                                     Context &Ctx);
+  LLVM_ABI Value *getValue() const;
   unsigned getNumSuccessors() const {
     return cast<llvm::ResumeInst>(Val)->getNumSuccessors();
   }
@@ -1858,17 +1869,17 @@ public:
   static constexpr const unsigned DefaultPseudoIndex =
       llvm::SwitchInst::DefaultPseudoIndex;
 
-  static SwitchInst *create(Value *V, BasicBlock *Dest, unsigned NumCases,
-                            InsertPosition Pos, Context &Ctx,
-                            const Twine &Name = "");
+  LLVM_ABI static SwitchInst *create(Value *V, BasicBlock *Dest,
+                                     unsigned NumCases, InsertPosition Pos,
+                                     Context &Ctx, const Twine &Name = "");
 
-  Value *getCondition() const;
-  void setCondition(Value *V);
-  BasicBlock *getDefaultDest() const;
+  LLVM_ABI Value *getCondition() const;
+  LLVM_ABI void setCondition(Value *V);
+  LLVM_ABI BasicBlock *getDefaultDest() const;
   bool defaultDestUnreachable() const {
     return cast<llvm::SwitchInst>(Val)->defaultDestUnreachable();
   }
-  void setDefaultDest(BasicBlock *DefaultCase);
+  LLVM_ABI void setDefaultDest(BasicBlock *DefaultCase);
   unsigned getNumCases() const {
     return cast<llvm::SwitchInst>(Val)->getNumCases();
   }
@@ -1913,9 +1924,9 @@ public:
       return I;
     return case_default();
   }
-  ConstantInt *findCaseDest(BasicBlock *BB);
+  LLVM_ABI ConstantInt *findCaseDest(BasicBlock *BB);
 
-  void addCase(ConstantInt *OnVal, BasicBlock *Dest);
+  LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest);
   /// This method removes the specified case and its successor from the switch
   /// instruction. Note that this operation may reorder the remaining cases at
   /// index idx and above.
@@ -1923,13 +1934,13 @@ public:
   /// This action invalidates iterators for all cases following the one removed,
   /// including the case_end() iterator. It returns an iterator for the next
   /// case.
-  CaseIt removeCase(CaseIt It);
+  LLVM_ABI CaseIt removeCase(CaseIt It);
 
   unsigned getNumSuccessors() const {
     return cast<llvm::SwitchInst>(Val)->getNumSuccessors();
   }
-  BasicBlock *getSuccessor(unsigned Idx) const;
-  void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
+  LLVM_ABI BasicBlock *getSuccessor(unsigned Idx) const;
+  LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Switch;
   }
@@ -1950,11 +1961,13 @@ class UnaryOperator : public UnaryInstruction {
                          Ctx) {}
   friend Context; // for constructor.
 public:
-  static Value *create(Instruction::Opcode Op, Value *OpV, InsertPosition Pos,
-                       Context &Ctx, const Twine &Name = "");
-  static Value *createWithCopiedFlags(Instruction::Opcode Op, Value *OpV,
-                                      Value *CopyFrom, InsertPosition Pos,
-                                      Context &Ctx, const Twine &Name = "");
+  LLVM_ABI static Value *create(Instruction::Opcode Op, Value *OpV,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
+  LLVM_ABI static Value *createWithCopiedFlags(Instruction::Opcode Op,
+                                               Value *OpV, Value *CopyFrom,
+                                               InsertPosition Pos, Context &Ctx,
+                                               const Twine &Name = "");
   /// For isa/dyn_cast.
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::UnOp;
@@ -2013,14 +2026,15 @@ protected:
   friend class Context; // For constructor.
 
 public:
-  static Value *create(Instruction::Opcode Op, Value *LHS, Value *RHS,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Instruction::Opcode Op, Value *LHS, Value *RHS,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
 
-  static Value *createWithCopiedFlags(Instruction::Opcode Op, Value *LHS,
-                                      Value *RHS, Value *CopyFrom,
-                                      InsertPosition Pos, Context &Ctx,
-                                      const Twine &Name = "");
+  LLVM_ABI static Value *createWithCopiedFlags(Instruction::Opcode Op,
+                                               Value *LHS, Value *RHS,
+                                               Value *CopyFrom,
+                                               InsertPosition Pos, Context &Ctx,
+                                               const Twine &Name = "");
   /// For isa/dyn_cast.
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::BinaryOperator;
@@ -2033,7 +2047,7 @@ public:
 /// can also be treated as an add.
 class PossiblyDisjointInst : public BinaryOperator {
 public:
-  void setIsDisjoint(bool B);
+  LLVM_ABI void setIsDisjoint(bool B);
   bool isDisjoint() const {
     return cast<llvm::PossiblyDisjointInst>(Val)->isDisjoint();
   }
@@ -2066,24 +2080,24 @@ public:
     cast<llvm::AtomicRMWInst>(Val)->setOperation(Op);
   }
   Align getAlign() const { return cast<llvm::AtomicRMWInst>(Val)->getAlign(); }
-  void setAlignment(Align Align);
+  LLVM_ABI void setAlignment(Align Align);
   bool isVolatile() const {
     return cast<llvm::AtomicRMWInst>(Val)->isVolatile();
   }
-  void setVolatile(bool V);
+  LLVM_ABI void setVolatile(bool V);
   AtomicOrdering getOrdering() const {
     return cast<llvm::AtomicRMWInst>(Val)->getOrdering();
   }
-  void setOrdering(AtomicOrdering Ordering);
+  LLVM_ABI void setOrdering(AtomicOrdering Ordering);
   SyncScope::ID getSyncScopeID() const {
     return cast<llvm::AtomicRMWInst>(Val)->getSyncScopeID();
   }
-  void setSyncScopeID(SyncScope::ID SSID);
-  Value *getPointerOperand();
+  LLVM_ABI void setSyncScopeID(SyncScope::ID SSID);
+  LLVM_ABI Value *getPointerOperand();
   const Value *getPointerOperand() const {
     return const_cast<AtomicRMWInst *>(this)->getPointerOperand();
   }
-  Value *getValOperand();
+  LLVM_ABI Value *getValOperand();
   const Value *getValOperand() const {
     return const_cast<AtomicRMWInst *>(this)->getValOperand();
   }
@@ -2097,11 +2111,10 @@ public:
     return From->getSubclassID() == ClassID::AtomicRMW;
   }
 
-  static AtomicRMWInst *create(BinOp Op, Value *Ptr, Value *Val,
-                               MaybeAlign Align, AtomicOrdering Ordering,
-                               InsertPosition Pos, Context &Ctx,
-                               SyncScope::ID SSID = SyncScope::System,
-                               const Twine &Name = "");
+  LLVM_ABI static AtomicRMWInst *
+  create(BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align,
+         AtomicOrdering Ordering, InsertPosition Pos, Context &Ctx,
+         SyncScope::ID SSID = SyncScope::System, const Twine &Name = "");
 };
 
 class AtomicCmpXchgInst
@@ -2119,17 +2132,17 @@ public:
     return cast<llvm::AtomicCmpXchgInst>(Val)->getAlign();
   }
 
-  void setAlignment(Align Align);
+  LLVM_ABI void setAlignment(Align Align);
   /// Return true if this is a cmpxchg from a volatile memory
   /// location.
   bool isVolatile() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->isVolatile();
   }
   /// Specify whether this is a volatile cmpxchg.
-  void setVolatile(bool V);
+  LLVM_ABI void setVolatile(bool V);
   /// Return true if this cmpxchg may spuriously fail.
   bool isWeak() const { return cast<llvm::AtomicCmpXchgInst>(Val)->isWeak(); }
-  void setWeak(bool IsWeak);
+  LLVM_ABI void setWeak(bool IsWeak);
   static bool isValidSuccessOrdering(AtomicOrdering Ordering) {
     return llvm::AtomicCmpXchgInst::isValidSuccessOrdering(Ordering);
   }
@@ -2139,30 +2152,30 @@ public:
   AtomicOrdering getSuccessOrdering() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->getSuccessOrdering();
   }
-  void setSuccessOrdering(AtomicOrdering Ordering);
+  LLVM_ABI void setSuccessOrdering(AtomicOrdering Ordering);
 
   AtomicOrdering getFailureOrdering() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->getFailureOrdering();
   }
-  void setFailureOrdering(AtomicOrdering Ordering);
+  LLVM_ABI void setFailureOrdering(AtomicOrdering Ordering);
   AtomicOrdering getMergedOrdering() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->getMergedOrdering();
   }
   SyncScope::ID getSyncScopeID() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->getSyncScopeID();
   }
-  void setSyncScopeID(SyncScope::ID SSID);
-  Value *getPointerOperand();
+  LLVM_ABI void setSyncScopeID(SyncScope::ID SSID);
+  LLVM_ABI Value *getPointerOperand();
   const Value *getPointerOperand() const {
     return const_cast<AtomicCmpXchgInst *>(this)->getPointerOperand();
   }
 
-  Value *getCompareOperand();
+  LLVM_ABI Value *getCompareOperand();
   const Value *getCompareOperand() const {
     return const_cast<AtomicCmpXchgInst *>(this)->getCompareOperand();
   }
 
-  Value *getNewValOperand();
+  LLVM_ABI Value *getNewValOperand();
   const Value *getNewValOperand() const {
     return const_cast<AtomicCmpXchgInst *>(this)->getNewValOperand();
   }
@@ -2172,7 +2185,7 @@ public:
     return cast<llvm::AtomicCmpXchgInst>(Val)->getPointerAddressSpace();
   }
 
-  static AtomicCmpXchgInst *
+  LLVM_ABI static AtomicCmpXchgInst *
   create(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align,
          AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering,
          InsertPosition Pos, Context &Ctx,
@@ -2190,9 +2203,10 @@ class AllocaInst final : public UnaryInstruction {
   friend class Context; // For constructor.
 
 public:
-  static AllocaInst *create(Type *Ty, unsigned AddrSpace, InsertPosition Pos,
-                            Context &Ctx, Value *ArraySize = nullptr,
-                            const Twine &Name = "");
+  LLVM_ABI static AllocaInst *create(Type *Ty, unsigned AddrSpace,
+                                     InsertPosition Pos, Context &Ctx,
+                                     Value *ArraySize = nullptr,
+                                     const Twine &Name = "");
 
   /// Return true if there is an allocation size parameter to the allocation
   /// instruction that is not 1.
@@ -2201,12 +2215,12 @@ public:
   }
   /// Get the number of elements allocated. For a simple allocation of a single
   /// element, this will return a constant 1 value.
-  Value *getArraySize();
+  LLVM_ABI Value *getArraySize();
   const Value *getArraySize() const {
     return const_cast<AllocaInst *>(this)->getArraySize();
   }
   /// Overload to return most specific pointer type.
-  PointerType *getType() const;
+  LLVM_ABI PointerType *getType() const;
   /// Return the address space for the allocation.
   unsigned getAddressSpace() const {
     return cast<llvm::AllocaInst>(Val)->getAddressSpace();
@@ -2222,14 +2236,14 @@ public:
     return cast<llvm::AllocaInst>(Val)->getAllocationSizeInBits(DL);
   }
   /// Return the type that is being allocated by the instruction.
-  Type *getAllocatedType() const;
+  LLVM_ABI Type *getAllocatedType() const;
   /// for use only in special circumstances that need to generically
   /// transform a whole instruction (eg: IR linking and vectorization).
-  void setAllocatedType(Type *Ty);
+  LLVM_ABI void setAllocatedType(Type *Ty);
   /// Return the alignment of the memory that is being allocated by the
   /// instruction.
   Align getAlign() const { return cast<llvm::AllocaInst>(Val)->getAlign(); }
-  void setAlignment(Align Align);
+  LLVM_ABI void setAlignment(Align Align);
   /// Return true if this alloca is in the entry block of the function and is a
   /// constant size. If so, the code generator will fold it into the
   /// prolog/epilog code, so it is basically free.
@@ -2242,7 +2256,7 @@ public:
     return cast<llvm::AllocaInst>(Val)->isUsedWithInAlloca();
   }
   /// Specify whether this alloca is used to represent the arguments to a call.
-  void setUsedWithInAlloca(bool V);
+  LLVM_ABI void setUsedWithInAlloca(bool V);
 
   static bool classof(const Value *From) {
     if (auto *I = dyn_cast<Instruction>(From))
@@ -2293,13 +2307,13 @@ class CastInst : public UnaryInstruction {
   friend Context; // for SBCastInstruction()
 
 public:
-  static Value *create(Type *DestTy, Opcode Op, Value *Operand,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Type *DestTy, Opcode Op, Value *Operand,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
-  Type *getSrcTy() const;
-  Type *getDestTy() const;
+  LLVM_ABI static bool classof(const Value *From);
+  LLVM_ABI Type *getSrcTy() const;
+  LLVM_ABI Type *getDestTy() const;
 };
 
 /// Instruction that can have a nneg flag (zext/uitofp).
@@ -2308,7 +2322,7 @@ public:
   bool hasNonNeg() const {
     return cast<llvm::PossiblyNonNegInst>(Val)->hasNonNeg();
   }
-  void setNonNeg(bool B);
+  LLVM_ABI void setNonNeg(bool B);
   /// For isa/dyn_cast.
   static bool classof(const Value *From) {
     if (auto *I = dyn_cast<Instruction>(From)) {
@@ -2383,15 +2397,15 @@ class PHINode final : public SingleLLVMInstructionImpl<llvm::PHINode> {
   struct LLVMBBToBB {
     Context &Ctx;
     LLVMBBToBB(Context &Ctx) : Ctx(Ctx) {}
-    BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const;
+    LLVM_ABI BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const;
   };
 
 public:
-  static PHINode *create(Type *Ty, unsigned NumReservedValues,
-                         InsertPosition Pos, Context &Ctx,
-                         const Twine &Name = "");
+  LLVM_ABI static PHINode *create(Type *Ty, unsigned NumReservedValues,
+                                  InsertPosition Pos, Context &Ctx,
+                                  const Twine &Name = "");
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
+  LLVM_ABI static bool classof(const Value *From);
 
   using const_block_iterator =
       mapped_iterator<llvm::PHINode::const_block_iterator, LLVMBBToBB>;
@@ -2417,35 +2431,36 @@ public:
   unsigned getNumIncomingValues() const {
     return cast<llvm::PHINode>(Val)->getNumIncomingValues();
   }
-  Value *getIncomingValue(unsigned Idx) const;
-  void setIncomingValue(unsigned Idx, Value *V);
+  LLVM_ABI Value *getIncomingValue(unsigned Idx) const;
+  LLVM_ABI void setIncomingValue(unsigned Idx, Value *V);
   static unsigned getOperandNumForIncomingValue(unsigned Idx) {
     return llvm::PHINode::getOperandNumForIncomingValue(Idx);
   }
   static unsigned getIncomingValueNumForOperand(unsigned Idx) {
     return llvm::PHINode::getIncomingValueNumForOperand(Idx);
   }
-  BasicBlock *getIncomingBlock(unsigned Idx) const;
-  BasicBlock *getIncomingBlock(const Use &U) const;
+  LLVM_ABI BasicBlock *getIncomingBlock(unsigned Idx) const;
+  LLVM_ABI BasicBlock *getIncomingBlock(const Use &U) const;
 
-  void setIncomingBlock(unsigned Idx, BasicBlock *BB);
+  LLVM_ABI void setIncomingBlock(unsigned Idx, BasicBlock *BB);
 
-  void addIncoming(Value *V, BasicBlock *BB);
+  LLVM_ABI void addIncoming(Value *V, BasicBlock *BB);
 
-  Value *removeIncomingValue(unsigned Idx);
-  Value *removeIncomingValue(BasicBlock *BB);
+  LLVM_ABI Value *removeIncomingValue(unsigned Idx);
+  LLVM_ABI Value *removeIncomingValue(BasicBlock *BB);
 
-  int getBasicBlockIndex(const BasicBlock *BB) const;
-  Value *getIncomingValueForBlock(const BasicBlock *BB) const;
+  LLVM_ABI int getBasicBlockIndex(const BasicBlock *BB) const;
+  LLVM_ABI Value *getIncomingValueForBlock(const BasicBlock *BB) const;
 
-  Value *hasConstantValue() const;
+  LLVM_ABI Value *hasConstantValue() const;
 
   bool hasConstantOrUndefValue() const {
     return cast<llvm::PHINode>(Val)->hasConstantOrUndefValue();
   }
   bool isComplete() const { return cast<llvm::PHINode>(Val)->isComplete(); }
-  void replaceIncomingBlockWith(const BasicBlock *Old, BasicBlock *New);
-  void removeIncomingValueIf(function_ref<bool(unsigned)> Predicate);
+  LLVM_ABI void replaceIncomingBlockWith(const BasicBlock *Old,
+                                         BasicBlock *New);
+  LLVM_ABI void removeIncomingValueIf(function_ref<bool(unsigned)> Predicate);
   // TODO: Implement
   // void copyIncomingBlocks(iterator_range<const_block_iterator> BBRange,
   //                         uint32_t ToIdx = 0)
@@ -2471,21 +2486,23 @@ protected:
   CmpInst(llvm::CmpInst *CI, Context &Ctx, ClassID Id, Opcode Opc)
       : SingleLLVMInstructionImpl(Id, Opc, CI, Ctx) {}
   friend Context; // for CmpInst()
-  static Value *createCommon(Value *Cond, Value *True, Value *False,
-                             const Twine &Name, IRBuilder<> &Builder,
-                             Context &Ctx);
+  LLVM_ABI static Value *createCommon(Value *Cond, Value *True, Value *False,
+                                      const Twine &Name, IRBuilder<> &Builder,
+                                      Context &Ctx);
 
 public:
   using Predicate = llvm::CmpInst::Predicate;
 
-  static Value *create(Predicate Pred, Value *S1, Value *S2, InsertPosition Pos,
-                       Context &Ctx, const Twine &Name = "");
-  static Value *createWithCopiedFlags(Predicate Pred, Value *S1, Value *S2,
-                                      const Instruction *FlagsSource,
-                                      InsertPosition Pos, Context &Ctx,
-                                      const Twine &Name = "");
-  void setPredicate(Predicate P);
-  void swapOperands();
+  LLVM_ABI static Value *create(Predicate Pred, Value *S1, Value *S2,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
+  LLVM_ABI static Value *createWithCopiedFlags(Predicate Pred, Value *S1,
+                                               Value *S2,
+                                               const Instruction *FlagsSource,
+                                               InsertPosition Pos, Context &Ctx,
+                                               const Twine &Name = "");
+  LLVM_ABI void setPredicate(Predicate P);
+  LLVM_ABI void swapOperands();
 
   WRAP_MEMBER(getPredicate);
   WRAP_BOTH(isFPPredicate);
@@ -2517,7 +2534,7 @@ public:
   }
 
   /// Create a result type for fcmp/icmp
-  static Type *makeCmpResultType(Type *OpndType);
+  LLVM_ABI static Type *makeCmpResultType(Type *OpndType);
 
 #ifndef NDEBUG
   void dumpOS(raw_ostream &OS) const override;
@@ -2533,7 +2550,7 @@ class ICmpInst : public CmpInst {
   using LLVMValType = llvm::ICmpInst;
 
 public:
-  void swapOperands();
+  LLVM_ABI void swapOperands();
 
   WRAP_BOTH(getSignedPredicate);
   WRAP_BOTH(getUnsignedPredicate);
@@ -2570,7 +2587,7 @@ class FCmpInst : public CmpInst {
   using LLVMValType = llvm::FCmpInst;
 
 public:
-  void swapOperands();
+  LLVM_ABI void swapOperands();
 
   WRAP_BOTH(isEquality);
   WRAP_MEMBER(isCommutative);
diff --git a/llvm/include/llvm/SandboxIR/Module.h b/llvm/include/llvm/SandboxIR/Module.h
index 429bb04539bc..275960392211 100644
--- a/llvm/include/llvm/SandboxIR/Module.h
+++ b/llvm/include/llvm/SandboxIR/Module.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Support/Compiler.h"
 #include <string>
 
 namespace llvm {
@@ -38,7 +39,7 @@ class Module {
 public:
   Context &getContext() const { return Ctx; }
 
-  Function *getFunction(StringRef Name) const;
+  LLVM_ABI Function *getFunction(StringRef Name) const;
 
   const DataLayout &getDataLayout() const { return LLVMM.getDataLayout(); }
 
@@ -50,7 +51,8 @@ public:
   /// does not exist, return null. If AllowInternal is set to true, this
   /// function will return types that have InternalLinkage. By default, these
   /// types are not returned.
-  GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal) const;
+  LLVM_ABI GlobalVariable *getGlobalVariable(StringRef Name,
+                                             bool AllowInternal) const;
   GlobalVariable *getGlobalVariable(StringRef Name) const {
     return getGlobalVariable(Name, /*AllowInternal=*/false);
   }
@@ -66,12 +68,12 @@ public:
   /// Return the global alias in the module with the specified name, of
   /// arbitrary type. This method returns null if a global with the specified
   /// name is not found.
-  GlobalAlias *getNamedAlias(StringRef Name) const;
+  LLVM_ABI GlobalAlias *getNamedAlias(StringRef Name) const;
 
   /// Return the global ifunc in the module with the specified name, of
   /// arbitrary type. This method returns null if a global with the specified
   /// name is not found.
-  GlobalIFunc *getNamedIFunc(StringRef Name) const;
+  LLVM_ABI GlobalIFunc *getNamedIFunc(StringRef Name) const;
 
   // TODO: Missing removeGlobalVariable() eraseGlobalVariable(),
   // insertGlobalVariable()
diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h
index 55a0301f4756..6fccaf04b270 100644
--- a/llvm/include/llvm/SandboxIR/PassManager.h
+++ b/llvm/include/llvm/SandboxIR/PassManager.h
@@ -18,6 +18,7 @@
 #ifndef LLVM_SANDBOXIR_PASSMANAGER_H
 #define LLVM_SANDBOXIR_PASSMANAGER_H
 
+#include "llvm/Support/Compiler.h"
 #include <memory>
 
 #include "llvm/ADT/DenseMap.h"
@@ -201,7 +202,7 @@ public:
   }
 };
 
-class FunctionPassManager final
+class LLVM_ABI FunctionPassManager final
     : public PassManager<FunctionPass, FunctionPass> {
 public:
   FunctionPassManager(StringRef Name) : PassManager(Name) {}
@@ -211,7 +212,8 @@ public:
   bool runOnFunction(Function &F, const Analyses &A) final;
 };
 
-class RegionPassManager final : public PassManager<RegionPass, RegionPass> {
+class LLVM_ABI RegionPassManager final
+    : public PassManager<RegionPass, RegionPass> {
 public:
   RegionPassManager(StringRef Name) : PassManager(Name) {}
   RegionPassManager(StringRef Name, StringRef Pipeline,
diff --git a/llvm/include/llvm/SandboxIR/Region.h b/llvm/include/llvm/SandboxIR/Region.h
index f86199ab6c22..d70f21277fb1 100644
--- a/llvm/include/llvm/SandboxIR/Region.h
+++ b/llvm/include/llvm/SandboxIR/Region.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_SANDBOXIR_REGION_H
 #define LLVM_SANDBOXIR_REGION_H
 
+#include "llvm/Support/Compiler.h"
 #include <memory>
 
 #include "llvm/ADT/SetVector.h"
@@ -30,7 +31,7 @@ class ScoreBoard {
   /// The cost of all instructions that got removed and replaced by new ones.
   InstructionCost BeforeCost = 0;
   /// Helper for both add() and remove(). \Returns the TTI cost of \p I.
-  InstructionCost getCost(Instruction *I) const;
+  LLVM_ABI InstructionCost getCost(Instruction *I) const;
   /// No need to allow copies.
   ScoreBoard(const ScoreBoard &) = delete;
   const ScoreBoard &operator=(const ScoreBoard &) = delete;
@@ -40,7 +41,7 @@ public:
   /// Mark \p I as a newly added instruction to the region.
   void add(Instruction *I) { AfterCost += getCost(I); }
   /// Mark \p I as a deleted instruction from the region.
-  void remove(Instruction *I);
+  LLVM_ABI void remove(Instruction *I);
   /// \Returns the cost of the newly added instructions.
   InstructionCost getAfterCost() const { return AfterCost; }
   /// \Returns the cost of the Removed instructions.
@@ -122,12 +123,12 @@ class Region {
   /// add an instruction to the auxiliary vector it does get tagged as being a
   /// member of the region (for ownership reasons), but its cost does not get
   /// counted because the instruction hasn't been added in the "normal" way.
-  void addImpl(Instruction *I, bool IgnoreCost);
+  LLVM_ABI void addImpl(Instruction *I, bool IgnoreCost);
   /// Adds I to the set. This is the main API for adding an instruction to the
   /// region.
   void add(Instruction *I) { addImpl(I, /*IgnoreCost=*/false); }
   /// Removes I from the set.
-  void remove(Instruction *I);
+  LLVM_ABI void remove(Instruction *I);
   friend class Context; // The callbacks need to call add() and remove().
   friend class RegionInternalsAttorney; // For unit tests.
   friend class RegionsFromBBs;          // For add().
@@ -141,8 +142,8 @@ class Region {
   void removeFromAux(Instruction *I);
 
 public:
-  Region(Context &Ctx, TargetTransformInfo &TTI);
-  ~Region();
+  LLVM_ABI Region(Context &Ctx, TargetTransformInfo &TTI);
+  LLVM_ABI ~Region();
 
   Context &getContext() const { return Ctx; }
   /// Returns true if I is in the Region.
@@ -150,18 +151,18 @@ public:
   /// Returns true if the Region has no instructions.
   bool empty() const { return Insts.empty(); }
   /// Set the auxiliary vector.
-  void setAux(ArrayRef<Instruction *> Aux);
+  LLVM_ABI void setAux(ArrayRef<Instruction *> Aux);
   /// \Returns the auxiliary vector.
   const SmallVector<Instruction *> &getAux() const { return Aux; }
   /// Clears all auxiliary data.
-  void clearAux();
+  LLVM_ABI void clearAux();
 
   using iterator = decltype(Insts.begin());
   iterator begin() { return Insts.begin(); }
   iterator end() { return Insts.end(); }
   iterator_range<iterator> insts() { return make_range(begin(), end()); }
 
-  static SmallVector<std::unique_ptr<Region>>
+  LLVM_ABI static SmallVector<std::unique_ptr<Region>>
   createRegionsFromMD(Function &F, TargetTransformInfo &TTI);
   /// \Returns the ScoreBoard data structure that keeps track of instr costs.
   const ScoreBoard &getScoreboard() const { return Scoreboard; }
diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h
index f7b469965eae..9a2c9dd51648 100644
--- a/llvm/include/llvm/SandboxIR/Tracker.h
+++ b/llvm/include/llvm/SandboxIR/Tracker.h
@@ -46,6 +46,8 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/SandboxIR/Use.h"
+#include "llvm/SandboxIR/Value.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include <memory>
 
@@ -149,7 +151,7 @@ public:
 #endif
 };
 
-class PHIRemoveIncoming : public IRChangeBase {
+class LLVM_ABI PHIRemoveIncoming : public IRChangeBase {
   PHINode *PHI;
   unsigned RemovedIdx;
   Value *RemovedV;
@@ -165,7 +167,7 @@ public:
 #endif
 };
 
-class PHIAddIncoming : public IRChangeBase {
+class LLVM_ABI PHIAddIncoming : public IRChangeBase {
   PHINode *PHI;
   unsigned Idx;
 
@@ -179,7 +181,7 @@ public:
 #endif
 };
 
-class CmpSwapOperands : public IRChangeBase {
+class LLVM_ABI CmpSwapOperands : public IRChangeBase {
   CmpInst *Cmp;
 
 public:
@@ -210,7 +212,7 @@ public:
 #endif
 };
 
-class EraseFromParent : public IRChangeBase {
+class LLVM_ABI EraseFromParent : public IRChangeBase {
   /// Contains all the data we need to restore an "erased" (i.e., detached)
   /// instruction: the instruction itself and its operands in order.
   struct InstrAndOperands {
@@ -242,7 +244,7 @@ public:
 #endif
 };
 
-class RemoveFromParent : public IRChangeBase {
+class LLVM_ABI RemoveFromParent : public IRChangeBase {
   /// The instruction that is about to get removed.
   Instruction *RemovedI = nullptr;
   /// This is either the next instr, or the parent BB if at the end of the BB.
@@ -327,7 +329,7 @@ public:
 #endif
 };
 
-class CatchSwitchAddHandler : public IRChangeBase {
+class LLVM_ABI CatchSwitchAddHandler : public IRChangeBase {
   CatchSwitchInst *CSI;
   unsigned HandlerIdx;
 
@@ -344,7 +346,7 @@ public:
 #endif // NDEBUG
 };
 
-class SwitchAddCase : public IRChangeBase {
+class LLVM_ABI SwitchAddCase : public IRChangeBase {
   SwitchInst *Switch;
   ConstantInt *Val;
 
@@ -359,7 +361,7 @@ public:
 #endif // NDEBUG
 };
 
-class SwitchRemoveCase : public IRChangeBase {
+class LLVM_ABI SwitchRemoveCase : public IRChangeBase {
   SwitchInst *Switch;
   struct Case {
     ConstantInt *Val;
@@ -378,7 +380,7 @@ public:
 #endif // NDEBUG
 };
 
-class MoveInstr : public IRChangeBase {
+class LLVM_ABI MoveInstr : public IRChangeBase {
   /// The instruction that moved.
   Instruction *MovedI;
   /// This is either the next instruction in the block, or the parent BB if at
@@ -395,7 +397,7 @@ public:
 #endif // NDEBUG
 };
 
-class InsertIntoBB final : public IRChangeBase {
+class LLVM_ABI InsertIntoBB final : public IRChangeBase {
   Instruction *InsertedI = nullptr;
 
 public:
@@ -408,7 +410,7 @@ public:
 #endif // NDEBUG
 };
 
-class CreateAndInsertInst final : public IRChangeBase {
+class LLVM_ABI CreateAndInsertInst final : public IRChangeBase {
   Instruction *NewI = nullptr;
 
 public:
@@ -421,7 +423,7 @@ public:
 #endif
 };
 
-class ShuffleVectorSetMask final : public IRChangeBase {
+class LLVM_ABI ShuffleVectorSetMask final : public IRChangeBase {
   ShuffleVectorInst *SVI;
   SmallVector<int, 8> PrevMask;
 
@@ -472,7 +474,7 @@ public:
   {
   }
 
-  ~Tracker();
+  LLVM_ABI ~Tracker();
   Context &getContext() const { return Ctx; }
   /// \Returns true if there are no changes tracked.
   bool empty() const { return Changes.empty(); }
@@ -506,11 +508,11 @@ public:
   /// \Returns the current state of the tracker.
   TrackerState getState() const { return State; }
   /// Turns on IR tracking.
-  void save();
+  LLVM_ABI void save();
   /// Stops tracking and accept changes.
-  void accept();
+  LLVM_ABI void accept();
   /// Stops tracking and reverts to saved state.
-  void revert();
+  LLVM_ABI void revert();
 
 #ifndef NDEBUG
   void dump(raw_ostream &OS) const;
diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h
index f90ae096443b..d9c5e6c098da 100644
--- a/llvm/include/llvm/SandboxIR/Type.h
+++ b/llvm/include/llvm/SandboxIR/Type.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -264,18 +265,18 @@ public:
 
   /// If this is a vector type, return the element type, otherwise return
   /// 'this'.
-  Type *getScalarType() const;
+  LLVM_ABI Type *getScalarType() const;
 
   // TODO: ADD MISSING
 
-  static Type *getInt64Ty(Context &Ctx);
-  static Type *getInt32Ty(Context &Ctx);
-  static Type *getInt16Ty(Context &Ctx);
-  static Type *getInt8Ty(Context &Ctx);
-  static Type *getInt1Ty(Context &Ctx);
-  static Type *getDoubleTy(Context &Ctx);
-  static Type *getFloatTy(Context &Ctx);
-  static Type *getHalfTy(Context &Ctx);
+  LLVM_ABI static Type *getInt64Ty(Context &Ctx);
+  LLVM_ABI static Type *getInt32Ty(Context &Ctx);
+  LLVM_ABI static Type *getInt16Ty(Context &Ctx);
+  LLVM_ABI static Type *getInt8Ty(Context &Ctx);
+  LLVM_ABI static Type *getInt1Ty(Context &Ctx);
+  LLVM_ABI static Type *getDoubleTy(Context &Ctx);
+  LLVM_ABI static Type *getFloatTy(Context &Ctx);
+  LLVM_ABI static Type *getHalfTy(Context &Ctx);
   // TODO: missing get*
 
   /// Get the address space of this pointer or pointer vector type.
@@ -293,7 +294,7 @@ class PointerType : public Type {
 public:
   // TODO: add missing functions
 
-  static PointerType *get(Context &Ctx, unsigned AddressSpace);
+  LLVM_ABI static PointerType *get(Context &Ctx, unsigned AddressSpace);
 
   static bool classof(const Type *From) {
     return isa<llvm::PointerType>(From->LLVMTy);
@@ -302,7 +303,7 @@ public:
 
 class ArrayType : public Type {
 public:
-  static ArrayType *get(Type *ElementType, uint64_t NumElements);
+  LLVM_ABI static ArrayType *get(Type *ElementType, uint64_t NumElements);
   // TODO: add missing functions
   static bool classof(const Type *From) {
     return isa<llvm::ArrayType>(From->LLVMTy);
@@ -312,8 +313,8 @@ public:
 class StructType : public Type {
 public:
   /// This static method is the primary way to create a literal StructType.
-  static StructType *get(Context &Ctx, ArrayRef<Type *> Elements,
-                         bool IsPacked = false);
+  LLVM_ABI static StructType *get(Context &Ctx, ArrayRef<Type *> Elements,
+                                  bool IsPacked = false);
 
   bool isPacked() const { return cast<llvm::StructType>(LLVMTy)->isPacked(); }
 
@@ -325,13 +326,13 @@ public:
 
 class VectorType : public Type {
 public:
-  static VectorType *get(Type *ElementType, ElementCount EC);
+  LLVM_ABI static VectorType *get(Type *ElementType, ElementCount EC);
   static VectorType *get(Type *ElementType, unsigned NumElements,
                          bool Scalable) {
     return VectorType::get(ElementType,
                            ElementCount::get(NumElements, Scalable));
   }
-  Type *getElementType() const;
+  LLVM_ABI Type *getElementType() const;
 
   static VectorType *get(Type *ElementType, const VectorType *Other) {
     return VectorType::get(ElementType, Other->getElementCount());
@@ -340,13 +341,14 @@ public:
   inline ElementCount getElementCount() const {
     return cast<llvm::VectorType>(LLVMTy)->getElementCount();
   }
-  static VectorType *getInteger(VectorType *VTy);
-  static VectorType *getExtendedElementVectorType(VectorType *VTy);
-  static VectorType *getTruncatedElementVectorType(VectorType *VTy);
-  static VectorType *getSubdividedVectorType(VectorType *VTy, int NumSubdivs);
-  static VectorType *getHalfElementsVectorType(VectorType *VTy);
-  static VectorType *getDoubleElementsVectorType(VectorType *VTy);
-  static bool isValidElementType(Type *ElemTy);
+  LLVM_ABI static VectorType *getInteger(VectorType *VTy);
+  LLVM_ABI static VectorType *getExtendedElementVectorType(VectorType *VTy);
+  LLVM_ABI static VectorType *getTruncatedElementVectorType(VectorType *VTy);
+  LLVM_ABI static VectorType *getSubdividedVectorType(VectorType *VTy,
+                                                      int NumSubdivs);
+  LLVM_ABI static VectorType *getHalfElementsVectorType(VectorType *VTy);
+  LLVM_ABI static VectorType *getDoubleElementsVectorType(VectorType *VTy);
+  LLVM_ABI static bool isValidElementType(Type *ElemTy);
 
   static bool classof(const Type *From) {
     return isa<llvm::VectorType>(From->LLVMTy);
@@ -355,7 +357,7 @@ public:
 
 class FixedVectorType : public VectorType {
 public:
-  static FixedVectorType *get(Type *ElementType, unsigned NumElts);
+  LLVM_ABI static FixedVectorType *get(Type *ElementType, unsigned NumElts);
 
   static FixedVectorType *get(Type *ElementType, const FixedVectorType *FVTy) {
     return get(ElementType, FVTy->getNumElements());
@@ -399,7 +401,8 @@ public:
 
 class ScalableVectorType : public VectorType {
 public:
-  static ScalableVectorType *get(Type *ElementType, unsigned MinNumElts);
+  LLVM_ABI static ScalableVectorType *get(Type *ElementType,
+                                          unsigned MinNumElts);
 
   static ScalableVectorType *get(Type *ElementType,
                                  const ScalableVectorType *SVTy) {
@@ -462,7 +465,7 @@ public:
 /// Integer representation type
 class IntegerType : public Type {
 public:
-  static IntegerType *get(Context &C, unsigned NumBits);
+  LLVM_ABI static IntegerType *get(Context &C, unsigned NumBits);
   // TODO: add missing functions
   static bool classof(const Type *From) {
     return isa<llvm::IntegerType>(From->LLVMTy);
diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h
index c4a774aa3a89..5c02c4f2b349 100644
--- a/llvm/include/llvm/SandboxIR/Use.h
+++ b/llvm/include/llvm/SandboxIR/Use.h
@@ -14,6 +14,7 @@
 #define LLVM_SANDBOXIR_USE_H
 
 #include "llvm/IR/Use.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace llvm::sandboxir {
@@ -49,11 +50,11 @@ class Use {
 
 public:
   operator Value *() const { return get(); }
-  Value *get() const;
-  void set(Value *V);
+  LLVM_ABI Value *get() const;
+  LLVM_ABI void set(Value *V);
   class User *getUser() const { return Usr; }
-  unsigned getOperandNo() const;
-  void swap(Use &OtherUse);
+  LLVM_ABI unsigned getOperandNo() const;
+  LLVM_ABI void swap(Use &OtherUse);
   Context *getContext() const { return Ctx; }
   bool operator==(const Use &Other) const {
     assert(Ctx == Other.Ctx && "Contexts differ!");
diff --git a/llvm/include/llvm/SandboxIR/User.h b/llvm/include/llvm/SandboxIR/User.h
index 80e672de3490..c552e2e3378b 100644
--- a/llvm/include/llvm/SandboxIR/User.h
+++ b/llvm/include/llvm/SandboxIR/User.h
@@ -13,6 +13,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/SandboxIR/Use.h"
 #include "llvm/SandboxIR/Value.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -36,8 +37,8 @@ public:
   using iterator_category = std::input_iterator_tag;
 
   OperandUseIterator() = default;
-  value_type operator*() const;
-  OperandUseIterator &operator++();
+  LLVM_ABI value_type operator*() const;
+  LLVM_ABI OperandUseIterator &operator++();
   OperandUseIterator operator++(int) {
     auto Copy = *this;
     this->operator++();
@@ -49,13 +50,13 @@ public:
   bool operator!=(const OperandUseIterator &Other) const {
     return !(*this == Other);
   }
-  OperandUseIterator operator+(unsigned Num) const;
-  OperandUseIterator operator-(unsigned Num) const;
-  int operator-(const OperandUseIterator &Other) const;
+  LLVM_ABI OperandUseIterator operator+(unsigned Num) const;
+  LLVM_ABI OperandUseIterator operator-(unsigned Num) const;
+  LLVM_ABI int operator-(const OperandUseIterator &Other) const;
 };
 
 /// A sandboxir::User has operands.
-class User : public Value {
+class LLVM_ABI User : public Value {
 protected:
   User(ClassID ID, llvm::Value *V, Context &Ctx) : Value(ID, V, Ctx) {}
 
diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h
index dbd0208b4f3f..dd0bc76db3e3 100644
--- a/llvm/include/llvm/SandboxIR/Value.h
+++ b/llvm/include/llvm/SandboxIR/Value.h
@@ -12,6 +12,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Value.h"
 #include "llvm/SandboxIR/Use.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -50,7 +51,7 @@ public:
 
   UserUseIterator() = default;
   value_type operator*() const { return Use; }
-  UserUseIterator &operator++();
+  LLVM_ABI UserUseIterator &operator++();
   bool operator==(const UserUseIterator &Other) const {
     return Use == Other.Use;
   }
@@ -179,7 +180,7 @@ protected:
   void clearValue() { Val = nullptr; }
   template <typename ItTy, typename SBTy> friend class LLVMOpUserItToSBTy;
 
-  Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx);
+  LLVM_ABI Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx);
   /// Disable copies.
   Value(const Value &) = delete;
   Value &operator=(const Value &) = delete;
@@ -191,7 +192,7 @@ public:
   using use_iterator = UserUseIterator;
   using const_use_iterator = UserUseIterator;
 
-  use_iterator use_begin();
+  LLVM_ABI use_iterator use_begin();
   const_use_iterator use_begin() const {
     return const_cast<Value *>(this)->use_begin();
   }
@@ -215,7 +216,7 @@ public:
   using user_iterator = mapped_iterator<sandboxir::UserUseIterator, UseToUser>;
   using const_user_iterator = user_iterator;
 
-  user_iterator user_begin();
+  LLVM_ABI user_iterator user_begin();
   user_iterator user_end() {
     return user_iterator(Use(nullptr, nullptr, Ctx), UseToUser());
   }
@@ -234,7 +235,7 @@ public:
   }
   /// \Returns the number of user edges (not necessarily to unique users).
   /// WARNING: This is a linear-time operation.
-  unsigned getNumUses() const;
+  LLVM_ABI unsigned getNumUses() const;
   /// Return true if this value has N uses or more.
   /// This is logically equivalent to getNumUses() >= N.
   /// WARNING: This can be expensive, as it is linear to the number of users.
@@ -256,13 +257,14 @@ public:
     return Cnt == Num;
   }
 
-  Type *getType() const;
+  LLVM_ABI Type *getType() const;
 
   Context &getContext() const { return Ctx; }
 
-  void replaceUsesWithIf(Value *OtherV,
-                         llvm::function_ref<bool(const Use &)> ShouldReplace);
-  void replaceAllUsesWith(Value *Other);
+  LLVM_ABI void
+  replaceUsesWithIf(Value *OtherV,
+                    llvm::function_ref<bool(const Use &)> ShouldReplace);
+  LLVM_ABI void replaceAllUsesWith(Value *Other);
 
   /// \Returns the LLVM IR name of the bottom-most LLVM value.
   StringRef getName() const { return Val->getName(); }
diff --git a/llvm/lib/SandboxIR/Constant.cpp b/llvm/lib/SandboxIR/Constant.cpp
index 82cf0876d580..9de88ef2cf0a 100644
--- a/llvm/lib/SandboxIR/Constant.cpp
+++ b/llvm/lib/SandboxIR/Constant.cpp
@@ -305,35 +305,14 @@ GlobalT &GlobalWithNodeAPI<GlobalT, LLVMGlobalT, ParentT, LLVMParentT>::
 }
 
 // Explicit instantiations.
-template class GlobalWithNodeAPI<GlobalIFunc, llvm::GlobalIFunc, GlobalObject,
-                                 llvm::GlobalObject>;
-template class GlobalWithNodeAPI<Function, llvm::Function, GlobalObject,
-                                 llvm::GlobalObject>;
-template class GlobalWithNodeAPI<GlobalVariable, llvm::GlobalVariable,
-                                 GlobalObject, llvm::GlobalObject>;
-template class GlobalWithNodeAPI<GlobalAlias, llvm::GlobalAlias, GlobalValue,
-                                 llvm::GlobalValue>;
-
-#if defined(_MSC_VER) && !defined(__clang__)
-// These are needed for SandboxIRTest when building with LLVM_BUILD_LLVM_DYLIB
-template LLVM_EXPORT_TEMPLATE GlobalIFunc &
-GlobalWithNodeAPI<GlobalIFunc, llvm::GlobalIFunc, GlobalObject,
-                  llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalIFunc
-                                                                  &LLVMGV)
-    const;
-template LLVM_EXPORT_TEMPLATE Function &
-GlobalWithNodeAPI<Function, llvm::Function, GlobalObject, llvm::GlobalObject>::
-    LLVMGVToGV::operator()(llvm::Function &LLVMGV) const;
-
-template LLVM_EXPORT_TEMPLATE GlobalVariable &GlobalWithNodeAPI<
-    GlobalVariable, llvm::GlobalVariable, GlobalObject,
-    llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalVariable &LLVMGV)
-    const;
-template LLVM_EXPORT_TEMPLATE GlobalAlias &
-GlobalWithNodeAPI<GlobalAlias, llvm::GlobalAlias, GlobalValue,
-                  llvm::GlobalValue>::LLVMGVToGV::operator()(llvm::GlobalAlias
-                                                                 &LLVMGV) const;
-#endif
+template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI<
+    GlobalIFunc, llvm::GlobalIFunc, GlobalObject, llvm::GlobalObject>;
+template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI<
+    Function, llvm::Function, GlobalObject, llvm::GlobalObject>;
+template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI<
+    GlobalVariable, llvm::GlobalVariable, GlobalObject, llvm::GlobalObject>;
+template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI<
+    GlobalAlias, llvm::GlobalAlias, GlobalValue, llvm::GlobalValue>;
 
 void GlobalIFunc::setResolver(Constant *Resolver) {
   Ctx.getTracker()

From 2652d1b2fd65950a66f37ed6d5ed9c4ffabacbee Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Wed, 11 Jun 2025 09:19:47 -0700
Subject: [PATCH 0007/1322] [llvm] annotate interfaces in llvm/TextAPI for DLL
 export (#143447)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/TextAPI` library.
These annotations currently have no meaningful impact on the LLVM build;
however, they are a prerequisite to support an LLVM Windows DLL (shared
library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

These changes were generated automatically using the [Interface
Definition Scanner (IDS)](https://github.com/compnerd/ids) tool,
followed formatting with `git clang-format`.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/TextAPI/Architecture.h    | 17 ++++---
 llvm/include/llvm/TextAPI/ArchitectureSet.h | 13 +++---
 llvm/include/llvm/TextAPI/DylibReader.h     | 10 ++--
 llvm/include/llvm/TextAPI/InterfaceFile.h   | 34 +++++++-------
 llvm/include/llvm/TextAPI/PackedVersion.h   |  9 ++--
 llvm/include/llvm/TextAPI/Platform.h        | 17 +++----
 llvm/include/llvm/TextAPI/Record.h          | 18 +++----
 llvm/include/llvm/TextAPI/RecordVisitor.h   |  5 +-
 llvm/include/llvm/TextAPI/RecordsSlice.h    | 52 ++++++++++++---------
 llvm/include/llvm/TextAPI/Symbol.h          |  8 ++--
 llvm/include/llvm/TextAPI/SymbolSet.h       | 11 +++--
 llvm/include/llvm/TextAPI/Target.h          | 15 +++---
 llvm/include/llvm/TextAPI/TextAPIError.h    |  3 +-
 llvm/include/llvm/TextAPI/TextAPIReader.h   |  5 +-
 llvm/include/llvm/TextAPI/TextAPIWriter.h   |  8 ++--
 llvm/include/llvm/TextAPI/Utils.h           | 21 +++++----
 16 files changed, 138 insertions(+), 108 deletions(-)

diff --git a/llvm/include/llvm/TextAPI/Architecture.h b/llvm/include/llvm/TextAPI/Architecture.h
index 978359995074..7a7f5416fe7c 100644
--- a/llvm/include/llvm/TextAPI/Architecture.h
+++ b/llvm/include/llvm/TextAPI/Architecture.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TEXTAPI_ARCHITECTURE_H
 #define LLVM_TEXTAPI_ARCHITECTURE_H
 
+#include "llvm/Support/Compiler.h"
 #include <cstdint>
 #include <utility>
 
@@ -32,24 +33,26 @@ enum Architecture : uint8_t {
 };
 
 /// Convert a CPU Type and Subtype pair to an architecture slice.
-Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType);
+LLVM_ABI Architecture getArchitectureFromCpuType(uint32_t CPUType,
+                                                 uint32_t CPUSubType);
 
 /// Convert a name to an architecture slice.
-Architecture getArchitectureFromName(StringRef Name);
+LLVM_ABI Architecture getArchitectureFromName(StringRef Name);
 
 /// Convert an architecture slice to a string.
-StringRef getArchitectureName(Architecture Arch);
+LLVM_ABI StringRef getArchitectureName(Architecture Arch);
 
 /// Convert an architecture slice to a CPU Type and Subtype pair.
-std::pair<uint32_t, uint32_t> getCPUTypeFromArchitecture(Architecture Arch);
+LLVM_ABI std::pair<uint32_t, uint32_t>
+getCPUTypeFromArchitecture(Architecture Arch);
 
 /// Convert a target to an architecture slice.
-Architecture mapToArchitecture(const llvm::Triple &Target);
+LLVM_ABI Architecture mapToArchitecture(const llvm::Triple &Target);
 
 /// Check if architecture is 64 bit.
-bool is64Bit(Architecture);
+LLVM_ABI bool is64Bit(Architecture);
 
-raw_ostream &operator<<(raw_ostream &OS, Architecture Arch);
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, Architecture Arch);
 
 } // end namespace MachO.
 } // end namespace llvm.
diff --git a/llvm/include/llvm/TextAPI/ArchitectureSet.h b/llvm/include/llvm/TextAPI/ArchitectureSet.h
index 2cce9dbf0d80..a7d3394c9982 100644
--- a/llvm/include/llvm/TextAPI/ArchitectureSet.h
+++ b/llvm/include/llvm/TextAPI/ArchitectureSet.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TEXTAPI_ARCHITECTURESET_H
 #define LLVM_TEXTAPI_ARCHITECTURESET_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/Architecture.h"
 #include <cstddef>
 #include <iterator>
@@ -38,7 +39,7 @@ public:
   constexpr ArchitectureSet() = default;
   constexpr ArchitectureSet(ArchSetType Raw) : ArchSet(Raw) {}
   ArchitectureSet(Architecture Arch) : ArchitectureSet() { set(Arch); }
-  ArchitectureSet(const std::vector<Architecture> &Archs);
+  LLVM_ABI ArchitectureSet(const std::vector<Architecture> &Archs);
 
   static ArchitectureSet All() { return ArchitectureSet(EndIndexVal); }
 
@@ -61,7 +62,7 @@ public:
     return (ArchSet & Archs.ArchSet) == Archs.ArchSet;
   }
 
-  size_t count() const;
+  LLVM_ABI size_t count() const;
 
   bool empty() const { return ArchSet == 0; }
 
@@ -158,9 +159,9 @@ public:
   const_iterator begin() const { return {&ArchSet}; }
   const_iterator end() const { return {&ArchSet, EndIndexVal}; }
 
-  operator std::string() const;
-  operator std::vector<Architecture>() const;
-  void print(raw_ostream &OS) const;
+  LLVM_ABI operator std::string() const;
+  LLVM_ABI operator std::vector<Architecture>() const;
+  LLVM_ABI void print(raw_ostream &OS) const;
 };
 
 inline ArchitectureSet operator|(const Architecture &lhs,
@@ -168,7 +169,7 @@ inline ArchitectureSet operator|(const Architecture &lhs,
   return ArchitectureSet(lhs) | ArchitectureSet(rhs);
 }
 
-raw_ostream &operator<<(raw_ostream &OS, ArchitectureSet Set);
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, ArchitectureSet Set);
 
 } // end namespace MachO.
 } // end namespace llvm.
diff --git a/llvm/include/llvm/TextAPI/DylibReader.h b/llvm/include/llvm/TextAPI/DylibReader.h
index 6861d3cb1591..f3a806d78df7 100644
--- a/llvm/include/llvm/TextAPI/DylibReader.h
+++ b/llvm/include/llvm/TextAPI/DylibReader.h
@@ -14,6 +14,7 @@
 #define LLVM_TEXTAPI_DYLIBREADER_H
 
 #include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
@@ -37,20 +38,21 @@ struct ParseOption {
 /// \param Buffer Data that points to dylib.
 /// \param Options Determines which attributes to extract.
 /// \return List of record slices.
-Expected<Records> readFile(MemoryBufferRef Buffer, const ParseOption &Opt);
+LLVM_ABI Expected<Records> readFile(MemoryBufferRef Buffer,
+                                    const ParseOption &Opt);
 
 /// Get TAPI file representation of binary dylib.
 ///
 /// \param Buffer Data that points to dylib.
-Expected<std::unique_ptr<InterfaceFile>> get(MemoryBufferRef Buffer);
+LLVM_ABI Expected<std::unique_ptr<InterfaceFile>> get(MemoryBufferRef Buffer);
 
 using SymbolToSourceLocMap = llvm::StringMap<RecordLoc>;
 /// Get the source location for each symbol from dylib.
 ///
 /// \param DSYM Path to DSYM file.
 /// \param T Requested target slice for dylib.
-SymbolToSourceLocMap accumulateSourceLocFromDSYM(const StringRef DSYM,
-                                                 const Target &T);
+LLVM_ABI SymbolToSourceLocMap accumulateSourceLocFromDSYM(const StringRef DSYM,
+                                                          const Target &T);
 
 } // namespace llvm::MachO::DylibReader
 
diff --git a/llvm/include/llvm/TextAPI/InterfaceFile.h b/llvm/include/llvm/TextAPI/InterfaceFile.h
index 23c27cb0f474..747c8d0a208c 100644
--- a/llvm/include/llvm/TextAPI/InterfaceFile.h
+++ b/llvm/include/llvm/TextAPI/InterfaceFile.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
 #include "llvm/TextAPI/FileTypes.h"
 #include "llvm/TextAPI/PackedVersion.h"
@@ -60,7 +61,7 @@ public:
 
   StringRef getInstallName() const { return InstallName; };
 
-  void addTarget(const Target &Target);
+  LLVM_ABI void addTarget(const Target &Target);
   template <typename RangeT> void addTargets(RangeT &&Targets) {
     for (const auto &Target : Targets)
       addTarget(Target(Target));
@@ -146,7 +147,7 @@ public:
   /// Set and add target.
   ///
   /// \param Target the target to add into.
-  void addTarget(const Target &Target);
+  LLVM_ABI void addTarget(const Target &Target);
 
   /// Determine if target triple slice exists in file.
   ///
@@ -174,7 +175,7 @@ public:
                             std::function<bool(const Target &)>>;
   using const_filtered_target_range =
       llvm::iterator_range<const_filtered_target_iterator>;
-  const_filtered_target_range targets(ArchitectureSet Archs) const;
+  LLVM_ABI const_filtered_target_range targets(ArchitectureSet Archs) const;
 
   /// Set the install name of the library.
   void setInstallName(StringRef InstallName_) {
@@ -241,7 +242,7 @@ public:
   /// Set the parent umbrella frameworks.
   /// \param Target_ The target applicable to Parent
   /// \param Parent  The name of Parent
-  void addParentUmbrella(const Target &Target_, StringRef Parent);
+  LLVM_ABI void addParentUmbrella(const Target &Target_, StringRef Parent);
 
   /// Get the list of Parent Umbrella frameworks.
   ///
@@ -261,7 +262,7 @@ public:
   /// \param InstallName The name of the client that is allowed to link this
   /// library.
   /// \param Target The target triple for which this applies.
-  void addAllowableClient(StringRef InstallName, const Target &Target);
+  LLVM_ABI void addAllowableClient(StringRef InstallName, const Target &Target);
 
   /// Get the list of allowable clients.
   ///
@@ -274,7 +275,8 @@ public:
   ///
   /// \param InstallName The name of the library to re-export.
   /// \param Target The target triple for which this applies.
-  void addReexportedLibrary(StringRef InstallName, const Target &Target);
+  LLVM_ABI void addReexportedLibrary(StringRef InstallName,
+                                     const Target &Target);
 
   /// Get the list of re-exported libraries.
   ///
@@ -286,7 +288,7 @@ public:
   /// Add a library for inlining to top level library.
   ///
   ///\param Document The library to inline with top level library.
-  void addDocument(std::shared_ptr<InterfaceFile> &&Document);
+  LLVM_ABI void addDocument(std::shared_ptr<InterfaceFile> &&Document);
 
   /// Returns the pointer to parent document if exists or nullptr otherwise.
   InterfaceFile *getParent() const { return Parent; }
@@ -301,7 +303,7 @@ public:
   /// Set the runpath search paths.
   /// \param RPath The name of runpath.
   /// \param InputTarget The target applicable to runpath search path.
-  void addRPath(StringRef RPath, const Target &InputTarget);
+  LLVM_ABI void addRPath(StringRef RPath, const Target &InputTarget);
 
   /// Get the list of runpath search paths.
   ///
@@ -373,14 +375,14 @@ public:
   ///
   /// \param Arch architecture to extract from.
   /// \return New InterfaceFile with extracted architecture slice.
-  llvm::Expected<std::unique_ptr<InterfaceFile>>
+  LLVM_ABI llvm::Expected<std::unique_ptr<InterfaceFile>>
   extract(Architecture Arch) const;
 
   /// Remove architecture slice from Interface.
   ///
   /// \param Arch architecture to remove.
   /// \return New Interface File with removed architecture slice.
-  llvm::Expected<std::unique_ptr<InterfaceFile>>
+  LLVM_ABI llvm::Expected<std::unique_ptr<InterfaceFile>>
   remove(Architecture Arch) const;
 
   /// Merge Interfaces for the same library. The following library attributes
@@ -390,29 +392,29 @@ public:
   ///
   /// \param O The Interface to merge.
   /// \return New Interface File that was merged.
-  llvm::Expected<std::unique_ptr<InterfaceFile>>
+  LLVM_ABI llvm::Expected<std::unique_ptr<InterfaceFile>>
   merge(const InterfaceFile *O) const;
 
   /// Inline reexported library into Interface.
   ///
   /// \param Library Interface of reexported library.
   /// \param Overwrite Whether to overwrite preexisting inlined library.
-  void inlineLibrary(std::shared_ptr<InterfaceFile> Library,
-                     bool Overwrite = false);
+  LLVM_ABI void inlineLibrary(std::shared_ptr<InterfaceFile> Library,
+                              bool Overwrite = false);
 
   /// Set InterfaceFile properties from pre-gathered binary attributes,
   /// if they are not set already.
   ///
   /// \param BA Attributes typically represented in load commands.
   /// \param Targ MachO Target slice to add attributes to.
-  void setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA,
-                          const Target &Targ);
+  LLVM_ABI void setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA,
+                                   const Target &Targ);
 
   /// The equality is determined by attributes that impact linking
   /// compatibilities. Path, & FileKind are irrelevant since these by
   /// itself should not impact linking.
   /// This is an expensive operation.
-  bool operator==(const InterfaceFile &O) const;
+  LLVM_ABI bool operator==(const InterfaceFile &O) const;
 
   bool operator!=(const InterfaceFile &O) const { return !(*this == O); }
 
diff --git a/llvm/include/llvm/TextAPI/PackedVersion.h b/llvm/include/llvm/TextAPI/PackedVersion.h
index e680d40c7104..cabe365e6d97 100644
--- a/llvm/include/llvm/TextAPI/PackedVersion.h
+++ b/llvm/include/llvm/TextAPI/PackedVersion.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TEXTAPI_PACKEDVERSION_H
 #define LLVM_TEXTAPI_PACKEDVERSION_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/VersionTuple.h"
 #include <cstdint>
 #include <string>
@@ -53,8 +54,8 @@ public:
   /// Retrieve the subminor version number, if provided.
   unsigned getSubminor() const { return Version & 0xff; }
 
-  bool parse32(StringRef Str);
-  std::pair<bool, bool> parse64(StringRef Str);
+  LLVM_ABI bool parse32(StringRef Str);
+  LLVM_ABI std::pair<bool, bool> parse64(StringRef Str);
 
   bool operator<(const PackedVersion &O) const { return Version < O.Version; }
 
@@ -64,9 +65,9 @@ public:
 
   uint32_t rawValue() const { return Version; }
 
-  operator std::string() const;
+  LLVM_ABI operator std::string() const;
 
-  void print(raw_ostream &OS) const;
+  LLVM_ABI void print(raw_ostream &OS) const;
 };
 
 inline raw_ostream &operator<<(raw_ostream &OS, const PackedVersion &Version) {
diff --git a/llvm/include/llvm/TextAPI/Platform.h b/llvm/include/llvm/TextAPI/Platform.h
index d828d9ac49f6..8ea187acc02f 100644
--- a/llvm/include/llvm/TextAPI/Platform.h
+++ b/llvm/include/llvm/TextAPI/Platform.h
@@ -14,6 +14,7 @@
 
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/VersionTuple.h"
 
 namespace llvm {
@@ -22,14 +23,14 @@ namespace MachO {
 using PlatformSet = SmallSet<PlatformType, 3>;
 using PlatformVersionSet = SmallSet<std::pair<PlatformType, VersionTuple>, 3>;
 
-PlatformType mapToPlatformType(PlatformType Platform, bool WantSim);
-PlatformType mapToPlatformType(const Triple &Target);
-PlatformSet mapToPlatformSet(ArrayRef<Triple> Targets);
-StringRef getPlatformName(PlatformType Platform);
-PlatformType getPlatformFromName(StringRef Name);
-std::string getOSAndEnvironmentName(PlatformType Platform,
-                                    std::string Version = "");
-VersionTuple mapToSupportedOSVersion(const Triple &Triple);
+LLVM_ABI PlatformType mapToPlatformType(PlatformType Platform, bool WantSim);
+LLVM_ABI PlatformType mapToPlatformType(const Triple &Target);
+LLVM_ABI PlatformSet mapToPlatformSet(ArrayRef<Triple> Targets);
+LLVM_ABI StringRef getPlatformName(PlatformType Platform);
+LLVM_ABI PlatformType getPlatformFromName(StringRef Name);
+LLVM_ABI std::string getOSAndEnvironmentName(PlatformType Platform,
+                                             std::string Version = "");
+LLVM_ABI VersionTuple mapToSupportedOSVersion(const Triple &Triple);
 
 } // end namespace MachO.
 } // end namespace llvm.
diff --git a/llvm/include/llvm/TextAPI/Record.h b/llvm/include/llvm/TextAPI/Record.h
index 7d721988ec3d..6e470d97325f 100644
--- a/llvm/include/llvm/TextAPI/Record.h
+++ b/llvm/include/llvm/TextAPI/Record.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/Symbol.h"
 #include <string>
 
@@ -104,7 +105,7 @@ public:
   SymbolFlags getFlags() const { return Flags; }
 
 private:
-  SymbolFlags mergeFlags(SymbolFlags Flags, RecordLinkage Linkage);
+  LLVM_ABI SymbolFlags mergeFlags(SymbolFlags Flags, RecordLinkage Linkage);
 
 protected:
   StringRef Name;
@@ -164,9 +165,9 @@ public:
   ObjCContainerRecord(StringRef Name, RecordLinkage Linkage)
       : Record({Name, Linkage, SymbolFlags::Data}) {}
 
-  ObjCIVarRecord *addObjCIVar(StringRef IVar, RecordLinkage Linkage);
-  ObjCIVarRecord *findObjCIVar(StringRef IVar) const;
-  std::vector<ObjCIVarRecord *> getObjCIVars() const;
+  LLVM_ABI ObjCIVarRecord *addObjCIVar(StringRef IVar, RecordLinkage Linkage);
+  LLVM_ABI ObjCIVarRecord *findObjCIVar(StringRef IVar) const;
+  LLVM_ABI std::vector<ObjCIVarRecord *> getObjCIVars() const;
   RecordLinkage getLinkage() const { return Linkage; }
 
 private:
@@ -207,11 +208,12 @@ public:
     return getLinkageForSymbol(CurrType) >= RecordLinkage::Rexported;
   }
 
-  RecordLinkage getLinkageForSymbol(ObjCIFSymbolKind CurrType) const;
-  void updateLinkageForSymbols(ObjCIFSymbolKind SymType, RecordLinkage Link);
+  LLVM_ABI RecordLinkage getLinkageForSymbol(ObjCIFSymbolKind CurrType) const;
+  LLVM_ABI void updateLinkageForSymbols(ObjCIFSymbolKind SymType,
+                                        RecordLinkage Link);
 
-  bool addObjCCategory(ObjCCategoryRecord *Record);
-  std::vector<ObjCCategoryRecord *> getObjCCategories() const;
+  LLVM_ABI bool addObjCCategory(ObjCCategoryRecord *Record);
+  LLVM_ABI std::vector<ObjCCategoryRecord *> getObjCCategories() const;
 
 private:
   /// Linkage level for each symbol represented in ObjCInterfaceRecord.
diff --git a/llvm/include/llvm/TextAPI/RecordVisitor.h b/llvm/include/llvm/TextAPI/RecordVisitor.h
index 34e43f5b0027..65bc96df244d 100644
--- a/llvm/include/llvm/TextAPI/RecordVisitor.h
+++ b/llvm/include/llvm/TextAPI/RecordVisitor.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TEXTAPI_RECORDVISITOR_H
 #define LLVM_TEXTAPI_RECORDVISITOR_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/Record.h"
 #include "llvm/TextAPI/SymbolSet.h"
 
@@ -20,7 +21,7 @@ namespace llvm {
 namespace MachO {
 
 /// Base class for any usage of traversing over collected Records.
-class RecordVisitor {
+class LLVM_ABI RecordVisitor {
 public:
   virtual ~RecordVisitor();
 
@@ -32,7 +33,7 @@ public:
 /// Specialized RecordVisitor for collecting exported symbols
 /// and undefined symbols if RecordSlice being visited represents a
 /// flat-namespaced library.
-class SymbolConverter : public RecordVisitor {
+class LLVM_ABI SymbolConverter : public RecordVisitor {
 public:
   SymbolConverter(SymbolSet *Symbols, const Target &T,
                   const bool RecordUndefs = false)
diff --git a/llvm/include/llvm/TextAPI/RecordsSlice.h b/llvm/include/llvm/TextAPI/RecordsSlice.h
index f934cf7607f1..6ecb79a115ae 100644
--- a/llvm/include/llvm/TextAPI/RecordsSlice.h
+++ b/llvm/include/llvm/TextAPI/RecordsSlice.h
@@ -15,6 +15,7 @@
 #define LLVM_TEXTAPI_RECORDSLICE_H
 
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/FileTypes.h"
 #include "llvm/TextAPI/PackedVersion.h"
 #include "llvm/TextAPI/Record.h"
@@ -43,9 +44,10 @@ public:
   /// symbol.
   /// \param Linkage The linkage of symbol.
   /// \return The non-owning pointer to added record in slice.
-  Record *addRecord(StringRef Name, SymbolFlags Flags,
-                    GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown,
-                    RecordLinkage Linkage = RecordLinkage::Unknown);
+  LLVM_ABI Record *
+  addRecord(StringRef Name, SymbolFlags Flags,
+            GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown,
+            RecordLinkage Linkage = RecordLinkage::Unknown);
 
   /// Add non-ObjC global record.
   ///
@@ -56,10 +58,10 @@ public:
   /// \param Inlined Whether declaration is inlined, only applicable to
   /// functions.
   /// \return The non-owning pointer to added record in slice.
-  GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage,
-                          GlobalRecord::Kind GV,
-                          SymbolFlags Flags = SymbolFlags::None,
-                          bool Inlined = false);
+  LLVM_ABI GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage,
+                                   GlobalRecord::Kind GV,
+                                   SymbolFlags Flags = SymbolFlags::None,
+                                   bool Inlined = false);
 
   /// Add ObjC Class record.
   ///
@@ -67,8 +69,9 @@ public:
   /// \param Linkage The linkage of symbol.
   /// \param SymType The symbols this class represents.
   /// \return The non-owning pointer to added record in slice.
-  ObjCInterfaceRecord *addObjCInterface(StringRef Name, RecordLinkage Linkage,
-                                        ObjCIFSymbolKind SymType);
+  LLVM_ABI ObjCInterfaceRecord *addObjCInterface(StringRef Name,
+                                                 RecordLinkage Linkage,
+                                                 ObjCIFSymbolKind SymType);
 
   /// Add ObjC IVar record.
   ///
@@ -76,8 +79,8 @@ public:
   /// \param Name The name of ivar, not symbol.
   /// \param Linkage The linkage of symbol.
   /// \return The non-owning pointer to added record in slice.
-  ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container, StringRef Name,
-                              RecordLinkage Linkage);
+  LLVM_ABI ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container,
+                                       StringRef Name, RecordLinkage Linkage);
 
   /// Add ObjC Category record.
   ///
@@ -85,22 +88,22 @@ public:
   /// category, not symbol.
   /// \param Category The name of category.
   /// \return The non-owning pointer to added record in slice.
-  ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend,
-                                      StringRef Category);
+  LLVM_ABI ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend,
+                                               StringRef Category);
 
   /// Find ObjC Class.
   ///
   /// \param Name name of class, not full symbol name.
   /// \return The non-owning pointer to record in slice.
-  ObjCInterfaceRecord *findObjCInterface(StringRef Name) const;
+  LLVM_ABI ObjCInterfaceRecord *findObjCInterface(StringRef Name) const;
 
   /// Find ObjC Category.
   ///
   /// \param ClassToExtend The name of class, not full symbol name.
   /// \param Category The name of category.
   /// \return The non-owning pointer to record in slice.
-  ObjCCategoryRecord *findObjCCategory(StringRef ClassToExtend,
-                                       StringRef Category) const;
+  LLVM_ABI ObjCCategoryRecord *findObjCCategory(StringRef ClassToExtend,
+                                                StringRef Category) const;
 
   /// Find ObjC Container. This is commonly used for assigning for looking up
   /// instance variables that are assigned to either a category or class.
@@ -110,21 +113,23 @@ public:
   /// \param Name Either the name of ivar or name of container.
   /// \return The non-owning pointer to record in
   /// slice.
-  ObjCContainerRecord *findContainer(bool IsIVar, StringRef Name) const;
+  LLVM_ABI ObjCContainerRecord *findContainer(bool IsIVar,
+                                              StringRef Name) const;
 
   /// Find ObjC instance variable.
   ///
   /// \param IsScopedName This is used to determine how to parse the name.
   /// \param Name Either the full name of the symbol or just the ivar.
   /// \return The non-owning pointer to record in slice.
-  ObjCIVarRecord *findObjCIVar(bool IsScopedName, StringRef Name) const;
+  LLVM_ABI ObjCIVarRecord *findObjCIVar(bool IsScopedName,
+                                        StringRef Name) const;
 
   /// Find non-objc global.
   ///
   /// \param Name The name of symbol.
   /// \param GV The Kind of global to find.
   /// \return The non-owning pointer to record in slice.
-  GlobalRecord *
+  LLVM_ABI GlobalRecord *
   findGlobal(StringRef Name,
              GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown) const;
 
@@ -138,7 +143,7 @@ public:
   }
 
   // Visit all records known to RecordsSlice.
-  void visit(RecordVisitor &V) const;
+  LLVM_ABI void visit(RecordVisitor &V) const;
 
   struct BinaryAttrs {
     std::vector<StringRef> AllowableClients;
@@ -158,11 +163,11 @@ public:
   };
 
   /// Return reference to BinaryAttrs.
-  BinaryAttrs &getBinaryAttrs();
+  LLVM_ABI BinaryAttrs &getBinaryAttrs();
 
   /// Store any strings owned by RecordSlice into allocator and return back
   /// reference to that.
-  StringRef copyString(StringRef String);
+  LLVM_ABI StringRef copyString(StringRef String);
 
 private:
   const llvm::Triple TargetTriple;
@@ -196,7 +201,8 @@ private:
 
 using Records = llvm::SmallVector<std::shared_ptr<RecordsSlice>, 4>;
 class InterfaceFile;
-std::unique_ptr<InterfaceFile> convertToInterfaceFile(const Records &Slices);
+LLVM_ABI std::unique_ptr<InterfaceFile>
+convertToInterfaceFile(const Records &Slices);
 
 } // namespace MachO
 } // namespace llvm
diff --git a/llvm/include/llvm/TextAPI/Symbol.h b/llvm/include/llvm/TextAPI/Symbol.h
index 5a5eb0eb4832..92ff0746f799 100644
--- a/llvm/include/llvm/TextAPI/Symbol.h
+++ b/llvm/include/llvm/TextAPI/Symbol.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
 #include "llvm/TextAPI/Target.h"
@@ -152,14 +153,15 @@ public:
                             std::function<bool(const Target &)>>;
   using const_filtered_target_range =
       llvm::iterator_range<const_filtered_target_iterator>;
-  const_filtered_target_range targets(ArchitectureSet architectures) const;
+  LLVM_ABI const_filtered_target_range
+  targets(ArchitectureSet architectures) const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void dump(raw_ostream &OS) const;
   void dump() const { dump(llvm::errs()); }
 #endif
 
-  bool operator==(const Symbol &O) const;
+  LLVM_ABI bool operator==(const Symbol &O) const;
 
   bool operator!=(const Symbol &O) const { return !(*this == O); }
 
@@ -189,7 +191,7 @@ struct SimpleSymbol {
 /// Get symbol classification by parsing the name of a symbol.
 ///
 /// \param SymName The name of symbol.
-SimpleSymbol parseSymbol(StringRef SymName);
+LLVM_ABI SimpleSymbol parseSymbol(StringRef SymName);
 
 } // end namespace MachO.
 } // end namespace llvm.
diff --git a/llvm/include/llvm/TextAPI/SymbolSet.h b/llvm/include/llvm/TextAPI/SymbolSet.h
index 6ccabb907720..cd3066317f3a 100644
--- a/llvm/include/llvm/TextAPI/SymbolSet.h
+++ b/llvm/include/llvm/TextAPI/SymbolSet.h
@@ -15,6 +15,7 @@
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/Architecture.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
 #include "llvm/TextAPI/Symbol.h"
@@ -87,12 +88,12 @@ private:
   using SymbolsMapType = llvm::DenseMap<SymbolsMapKey, Symbol *>;
   SymbolsMapType Symbols;
 
-  Symbol *addGlobalImpl(EncodeKind, StringRef Name, SymbolFlags Flags);
+  LLVM_ABI Symbol *addGlobalImpl(EncodeKind, StringRef Name, SymbolFlags Flags);
 
 public:
   SymbolSet() = default;
-  Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags,
-                    const Target &Targ);
+  LLVM_ABI Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags,
+                             const Target &Targ);
   size_t size() const { return Symbols.size(); }
 
   template <typename RangeT, typename ElT = std::remove_reference_t<
@@ -107,7 +108,7 @@ public:
     return Global;
   }
 
-  const Symbol *
+  LLVM_ABI const Symbol *
   findSymbol(EncodeKind Kind, StringRef Name,
              ObjCIFSymbolKind ObjCIF = ObjCIFSymbolKind::None) const;
 
@@ -169,7 +170,7 @@ public:
         fn);
   }
 
-  bool operator==(const SymbolSet &O) const;
+  LLVM_ABI bool operator==(const SymbolSet &O) const;
 
   bool operator!=(const SymbolSet &O) const { return !(Symbols == O.Symbols); }
 
diff --git a/llvm/include/llvm/TextAPI/Target.h b/llvm/include/llvm/TextAPI/Target.h
index edcc0708d147..5cc507fe21f7 100644
--- a/llvm/include/llvm/TextAPI/Target.h
+++ b/llvm/include/llvm/TextAPI/Target.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_TEXTAPI_TARGET_H
 #define LLVM_TEXTAPI_TARGET_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/VersionTuple.h"
 #include "llvm/TargetParser/Triple.h"
@@ -35,9 +36,9 @@ public:
       : Arch(mapToArchitecture(Triple)), Platform(mapToPlatformType(Triple)),
         MinDeployment(mapToSupportedOSVersion(Triple)) {}
 
-  static llvm::Expected<Target> create(StringRef Target);
+  LLVM_ABI static llvm::Expected<Target> create(StringRef Target);
 
-  operator std::string() const;
+  LLVM_ABI operator std::string() const;
 
   Architecture Arch;
   PlatformType Platform;
@@ -66,13 +67,13 @@ inline bool operator!=(const Target &LHS, const Architecture &RHS) {
   return LHS.Arch != RHS;
 }
 
-PlatformVersionSet mapToPlatformVersionSet(ArrayRef<Target> Targets);
-PlatformSet mapToPlatformSet(ArrayRef<Target> Targets);
-ArchitectureSet mapToArchitectureSet(ArrayRef<Target> Targets);
+LLVM_ABI PlatformVersionSet mapToPlatformVersionSet(ArrayRef<Target> Targets);
+LLVM_ABI PlatformSet mapToPlatformSet(ArrayRef<Target> Targets);
+LLVM_ABI ArchitectureSet mapToArchitectureSet(ArrayRef<Target> Targets);
 
-std::string getTargetTripleName(const Target &Targ);
+LLVM_ABI std::string getTargetTripleName(const Target &Targ);
 
-raw_ostream &operator<<(raw_ostream &OS, const Target &Target);
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const Target &Target);
 
 } // namespace MachO
 } // namespace llvm
diff --git a/llvm/include/llvm/TextAPI/TextAPIError.h b/llvm/include/llvm/TextAPI/TextAPIError.h
index f0578654697b..7b2182edd621 100644
--- a/llvm/include/llvm/TextAPI/TextAPIError.h
+++ b/llvm/include/llvm/TextAPI/TextAPIError.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_TEXTAPI_TEXTAPIERROR_H
 #define LLVM_TEXTAPI_TEXTAPIERROR_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm::MachO {
@@ -25,7 +26,7 @@ enum class TextAPIErrorCode {
   UnsupportedTarget
 };
 
-class TextAPIError : public llvm::ErrorInfo<TextAPIError> {
+class LLVM_ABI TextAPIError : public llvm::ErrorInfo<TextAPIError> {
 public:
   static char ID;
   TextAPIErrorCode EC;
diff --git a/llvm/include/llvm/TextAPI/TextAPIReader.h b/llvm/include/llvm/TextAPI/TextAPIReader.h
index 32af0e3601f1..603b24b47283 100644
--- a/llvm/include/llvm/TextAPI/TextAPIReader.h
+++ b/llvm/include/llvm/TextAPI/TextAPIReader.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_TEXTAPI_TEXTAPIREADER_H
 #define LLVM_TEXTAPI_TEXTAPIREADER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -29,13 +30,13 @@ public:
   ///
   /// \param InputBuffer Buffer holding contents of TAPI text file.
   /// \return The file format version of TAPI text file.
-  static Expected<FileType> canRead(MemoryBufferRef InputBuffer);
+  LLVM_ABI static Expected<FileType> canRead(MemoryBufferRef InputBuffer);
 
   /// Parse and get an InterfaceFile that represents the full
   /// library.
   ///
   /// \param InputBuffer Buffer holding contents of TAPI text file.
-  static Expected<std::unique_ptr<InterfaceFile>>
+  LLVM_ABI static Expected<std::unique_ptr<InterfaceFile>>
   get(MemoryBufferRef InputBuffer);
 
   TextAPIReader() = delete;
diff --git a/llvm/include/llvm/TextAPI/TextAPIWriter.h b/llvm/include/llvm/TextAPI/TextAPIWriter.h
index 7fd32c6fe2a9..5f06c372fe85 100644
--- a/llvm/include/llvm/TextAPI/TextAPIWriter.h
+++ b/llvm/include/llvm/TextAPI/TextAPIWriter.h
@@ -10,6 +10,7 @@
 #define LLVM_TEXTAPI_TEXTAPIWRITER_H
 
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/InterfaceFile.h"
 
 namespace llvm {
@@ -30,9 +31,10 @@ public:
   /// \param FileKind File format to write text file as. If not specified, it
   /// will read from File.
   /// \param Compact Whether to limit whitespace in text file.
-  static Error writeToStream(raw_ostream &OS, const InterfaceFile &File,
-                             const FileType FileKind = FileType::Invalid,
-                             bool Compact = false);
+  LLVM_ABI static Error
+  writeToStream(raw_ostream &OS, const InterfaceFile &File,
+                const FileType FileKind = FileType::Invalid,
+                bool Compact = false);
 
   /// Get TAPI FileType from the input string.
   ///
diff --git a/llvm/include/llvm/TextAPI/Utils.h b/llvm/include/llvm/TextAPI/Utils.h
index 00dfd63e14f9..27db717f5a63 100644
--- a/llvm/include/llvm/TextAPI/Utils.h
+++ b/llvm/include/llvm/TextAPI/Utils.h
@@ -14,6 +14,7 @@
 #define LLVM_TEXTAPI_UTILS_H
 
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -51,34 +52,35 @@ struct SymLink {
 ///
 /// \param Path Location of file.
 /// \param Extension File extension to update with.
-void replace_extension(SmallVectorImpl<char> &Path, const Twine &Extension);
+LLVM_ABI void replace_extension(SmallVectorImpl<char> &Path,
+                                const Twine &Extension);
 
 /// Determine whether to skip over symlink due to either too many symlink levels
 /// or is cyclic.
 ///
 /// \param Path Location to symlink.
 /// \param Result Holds whether to skip over Path.
-std::error_code shouldSkipSymLink(const Twine &Path, bool &Result);
+LLVM_ABI std::error_code shouldSkipSymLink(const Twine &Path, bool &Result);
 
 /// Turn absolute symlink into relative.
 ///
 /// \param From The symlink.
 /// \param To What the symlink points to.
 /// \param RelativePath Path location to update what the symlink points to.
-std::error_code make_relative(StringRef From, StringRef To,
-                              SmallVectorImpl<char> &RelativePath);
+LLVM_ABI std::error_code make_relative(StringRef From, StringRef To,
+                                       SmallVectorImpl<char> &RelativePath);
 
 /// Determine if library is private by parsing file path.
 /// It does not touch the file system.
 ///
 /// \param Path File path for library.
 /// \param IsSymLink Whether path points to a symlink.
-bool isPrivateLibrary(StringRef Path, bool IsSymLink = false);
+LLVM_ABI bool isPrivateLibrary(StringRef Path, bool IsSymLink = false);
 
 /// Create a regex rule from provided glob string.
 /// \param Glob String that represents glob input.
 /// \return The equivalent regex rule.
-llvm::Expected<llvm::Regex> createRegexFromGlob(llvm::StringRef Glob);
+LLVM_ABI llvm::Expected<llvm::Regex> createRegexFromGlob(llvm::StringRef Glob);
 
 using AliasEntry = std::pair<std::string, EncodeKind>;
 using AliasMap = std::map<AliasEntry, AliasEntry>;
@@ -87,14 +89,15 @@ using AliasMap = std::map<AliasEntry, AliasEntry>;
 ///
 /// \param Buffer Data contents of file for the alias list.
 /// \return Lookup table of alias to their base symbol.
-Expected<AliasMap> parseAliasList(std::unique_ptr<llvm::MemoryBuffer> &Buffer);
+LLVM_ABI Expected<AliasMap>
+parseAliasList(std::unique_ptr<llvm::MemoryBuffer> &Buffer);
 
 /// Pickup active paths for a given platform.
 ///
 /// \param Paths File or search paths to pick up.
 /// \param Platform Platform to collect paths for.
-PathSeq getPathsForPlatform(const PathToPlatformSeq &Paths,
-                            PlatformType Platform);
+LLVM_ABI PathSeq getPathsForPlatform(const PathToPlatformSeq &Paths,
+                                     PlatformType Platform);
 
 } // namespace llvm::MachO
 #endif // LLVM_TEXTAPI_UTILS_H

From 78765bb856bd6cdc3b1db48e80f74b8de5181f3f Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Wed, 11 Jun 2025 17:23:04 +0100
Subject: [PATCH 0008/1322] [TableGen] Simplify computeUberWeights. NFC.
 (#143716)

Using RegUnitIterator made the code more complicated than having two
nested loops over each register and each register's regunits.
---
 .../TableGen/Common/CodeGenRegisters.cpp      | 29 ++++++++-----------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index 5ec9b35379fa..4d24eb3de1ed 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -1849,26 +1849,21 @@ static void computeUberWeights(MutableArrayRef<UberRegSet> UberSets,
   // Skip the first unallocatable set.
   for (UberRegSet &S : UberSets.drop_front()) {
     // Initialize all unit weights in this set, and remember the max units/reg.
-    const CodeGenRegister *Reg = nullptr;
-    unsigned MaxWeight = 0, Weight = 0;
-    for (RegUnitIterator UnitI(S.Regs); UnitI.isValid(); ++UnitI) {
-      if (Reg != UnitI.getReg()) {
-        if (Weight > MaxWeight)
-          MaxWeight = Weight;
-        Reg = UnitI.getReg();
-        Weight = 0;
-      }
-      if (!RegBank.getRegUnit(*UnitI).Artificial) {
-        unsigned UWeight = RegBank.getRegUnit(*UnitI).Weight;
-        if (!UWeight) {
-          UWeight = 1;
-          RegBank.increaseRegUnitWeight(*UnitI, UWeight);
+    unsigned MaxWeight = 0;
+    for (const CodeGenRegister *R : S.Regs) {
+      unsigned Weight = 0;
+      for (unsigned U : R->getRegUnits()) {
+        if (!RegBank.getRegUnit(U).Artificial) {
+          unsigned UWeight = RegBank.getRegUnit(U).Weight;
+          if (!UWeight) {
+            UWeight = 1;
+            RegBank.increaseRegUnitWeight(U, UWeight);
+          }
+          Weight += UWeight;
         }
-        Weight += UWeight;
       }
+      MaxWeight = std::max(MaxWeight, Weight);
     }
-    if (Weight > MaxWeight)
-      MaxWeight = Weight;
     if (S.Weight != MaxWeight) {
       LLVM_DEBUG({
         dbgs() << "UberSet " << &S - UberSets.begin() << " Weight "

From 8e4f0d8614dcd48cfe2d885a021e2927c1bc8616 Mon Sep 17 00:00:00 2001
From: Morris Hafner <mmha@users.noreply.github.com>
Date: Wed, 11 Jun 2025 18:24:46 +0200
Subject: [PATCH 0009/1322] [CIR] Upstream minimal builtin function call
 support (#142981)

This patch adds all bits required to implement builtin function calls to
ClangIR. It doesn't actually implement any of the builtins except those
that fold to a constant ahead of CodeGen
(`__builtin_is_constant_evaluated()` being one example).
---
 clang/include/clang/CIR/MissingFeatures.h |  3 +-
 clang/lib/CIR/CodeGen/CIRGenBuilder.cpp   | 28 ++++++++
 clang/lib/CIR/CodeGen/CIRGenBuilder.h     | 11 ++++
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp   | 55 ++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenCall.h        | 30 ++++++++-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp      | 53 +++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenFunction.h    |  5 ++
 clang/lib/CIR/CodeGen/CMakeLists.txt      |  1 +
 clang/test/CIR/CodeGen/builtin_call.cpp   | 78 +++++++++++++++++++++++
 9 files changed, 255 insertions(+), 9 deletions(-)
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
 create mode 100644 clang/test/CIR/CodeGen/builtin_call.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index f89d386378e5..87908e2ec08a 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -83,7 +83,6 @@ struct MissingFeatures {
   static bool opFuncSetComdat() { return false; }
 
   // CallOp handling
-  static bool opCallBuiltinFunc() { return false; }
   static bool opCallPseudoDtor() { return false; }
   static bool opCallAggregateArgs() { return false; }
   static bool opCallPaddingArgs() { return false; }
@@ -225,6 +224,8 @@ struct MissingFeatures {
   static bool isMemcpyEquivalentSpecialMember() { return false; }
   static bool isTrivialCtorOrDtor() { return false; }
   static bool implicitConstructorArgs() { return false; }
+  static bool intrinsics() { return false; }
+  static bool attributeNoBuiltin() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
index 4c8c6ed289c3..9cec17bcb2fd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
@@ -39,6 +39,34 @@ mlir::Value CIRGenBuilderTy::getArrayElement(mlir::Location arrayLocBegin,
   return create<cir::PtrStrideOp>(arrayLocEnd, flatPtrTy, basePtr, idx);
 }
 
+cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc,
+                                             llvm::APSInt intVal) {
+  bool isSigned = intVal.isSigned();
+  unsigned width = intVal.getBitWidth();
+  cir::IntType t = isSigned ? getSIntNTy(width) : getUIntNTy(width);
+  return getConstInt(loc, t,
+                     isSigned ? intVal.getSExtValue() : intVal.getZExtValue());
+}
+
+cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc,
+                                             llvm::APInt intVal) {
+  return getConstInt(loc, llvm::APSInt(intVal));
+}
+
+cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc, mlir::Type t,
+                                             uint64_t c) {
+  assert(mlir::isa<cir::IntType>(t) && "expected cir::IntType");
+  return create<cir::ConstantOp>(loc, cir::IntAttr::get(t, c));
+}
+
+cir::ConstantOp
+clang::CIRGen::CIRGenBuilderTy::getConstFP(mlir::Location loc, mlir::Type t,
+                                           llvm::APFloat fpVal) {
+  assert(mlir::isa<cir::CIRFPTypeInterface>(t) &&
+         "expected floating point type");
+  return create<cir::ConstantOp>(loc, getAttr<cir::FPAttr>(t, fpVal));
+}
+
 // This can't be defined in Address.h because that file is included by
 // CIRGenBuilder.h
 Address Address::withElementType(CIRGenBuilderTy &builder,
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 03077ee062a6..fb1a290c18fa 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -11,10 +11,12 @@
 
 #include "Address.h"
 #include "CIRGenTypeCache.h"
+#include "clang/CIR/Interfaces/CIRFPTypeInterface.h"
 #include "clang/CIR/MissingFeatures.h"
 
 #include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h"
 #include "clang/CIR/MissingFeatures.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/STLExtras.h"
 
 namespace clang::CIRGen {
@@ -229,6 +231,15 @@ public:
   cir::IntType getUInt32Ty() { return typeCache.UInt32Ty; }
   cir::IntType getUInt64Ty() { return typeCache.UInt64Ty; }
 
+  cir::ConstantOp getConstInt(mlir::Location loc, llvm::APSInt intVal);
+
+  cir::ConstantOp getConstInt(mlir::Location loc, llvm::APInt intVal);
+
+  cir::ConstantOp getConstInt(mlir::Location loc, mlir::Type t, uint64_t c);
+
+  cir::ConstantOp getConstFP(mlir::Location loc, mlir::Type t,
+                             llvm::APFloat fpVal);
+
   bool isInt8Ty(mlir::Type i) {
     return i == typeCache.UInt8Ty || i == typeCache.SInt8Ty;
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
new file mode 100644
index 000000000000..c59ac78210f8
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This contains code to emit Builtin calls as CIR or a function call to be
+// later resolved.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenCall.h"
+#include "CIRGenFunction.h"
+#include "CIRGenModule.h"
+#include "CIRGenValue.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Support/LLVM.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/GlobalDecl.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
+                                       const CallExpr *e,
+                                       ReturnValueSlot returnValue) {
+  // See if we can constant fold this builtin.  If so, don't emit it at all.
+  // TODO: Extend this handling to all builtin calls that we can constant-fold.
+  Expr::EvalResult result;
+  if (e->isPRValue() && e->EvaluateAsRValue(result, cgm.getASTContext()) &&
+      !result.hasSideEffects()) {
+    if (result.Val.isInt()) {
+      return RValue::get(builder.getConstInt(getLoc(e->getSourceRange()),
+                                             result.Val.getInt()));
+    }
+    if (result.Val.isFloat()) {
+      // Note: we are using result type of CallExpr to determine the type of
+      // the constant. Classic codegen uses the result value to determine the
+      // type. We feel it should be Ok to use expression type because it is
+      // hard to imagine a builtin function evaluates to a value that
+      // over/underflows its own defined type.
+      mlir::Type type = convertType(e->getType());
+      return RValue::get(builder.getConstFP(getLoc(e->getExprLoc()), type,
+                                            result.Val.getFloat()));
+    }
+  }
+
+  mlir::Location loc = getLoc(e->getExprLoc());
+  cgm.errorNYI(loc, "non constant foldable builtin calls");
+  return getUndefRValue(e->getType());
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.h b/clang/lib/CIR/CodeGen/CIRGenCall.h
index 605625705a75..15c9080448c8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.h
@@ -44,16 +44,25 @@ public:
 class CIRGenCallee {
   enum class SpecialKind : uintptr_t {
     Invalid,
+    Builtin,
 
-    Last = Invalid,
+    Last = Builtin,
+  };
+
+  struct BuiltinInfoStorage {
+    const clang::FunctionDecl *decl;
+    unsigned id;
   };
 
   SpecialKind kindOrFunctionPtr;
 
   union {
     CIRGenCalleeInfo abstractInfo;
+    BuiltinInfoStorage builtinInfo;
   };
 
+  explicit CIRGenCallee(SpecialKind kind) : kindOrFunctionPtr(kind) {}
+
 public:
   CIRGenCallee() : kindOrFunctionPtr(SpecialKind::Invalid) {}
 
@@ -69,6 +78,25 @@ public:
     return CIRGenCallee(abstractInfo, funcPtr);
   }
 
+  bool isBuiltin() const { return kindOrFunctionPtr == SpecialKind::Builtin; }
+
+  const clang::FunctionDecl *getBuiltinDecl() const {
+    assert(isBuiltin());
+    return builtinInfo.decl;
+  }
+  unsigned getBuiltinID() const {
+    assert(isBuiltin());
+    return builtinInfo.id;
+  }
+
+  static CIRGenCallee forBuiltin(unsigned builtinID,
+                                 const clang::FunctionDecl *builtinDecl) {
+    CIRGenCallee result(SpecialKind::Builtin);
+    result.builtinInfo.decl = builtinDecl;
+    result.builtinInfo.id = builtinID;
+    return result;
+  }
+
   bool isOrdinary() const {
     return uintptr_t(kindOrFunctionPtr) > uintptr_t(SpecialKind::Last);
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index f2c2de7a4f59..f1f86509c9a9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1029,8 +1029,48 @@ static cir::FuncOp emitFunctionDeclPointer(CIRGenModule &cgm, GlobalDecl gd) {
   return cgm.getAddrOfFunction(gd);
 }
 
-static CIRGenCallee emitDirectCallee(CIRGenModule &cgm, GlobalDecl gd) {
-  assert(!cir::MissingFeatures::opCallBuiltinFunc());
+// Detect the unusual situation where an inline version is shadowed by a
+// non-inline version. In that case we should pick the external one
+// everywhere. That's GCC behavior too.
+static bool onlyHasInlineBuiltinDeclaration(const FunctionDecl *fd) {
+  for (const FunctionDecl *pd = fd; pd; pd = pd->getPreviousDecl())
+    if (!pd->isInlineBuiltinDeclaration())
+      return false;
+  return true;
+}
+
+CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) {
+  const auto *fd = cast<FunctionDecl>(gd.getDecl());
+
+  if (unsigned builtinID = fd->getBuiltinID()) {
+    if (fd->getAttr<AsmLabelAttr>()) {
+      cgm.errorNYI("AsmLabelAttr");
+    }
+
+    StringRef ident = fd->getName();
+    std::string fdInlineName = (ident + ".inline").str();
+
+    bool isPredefinedLibFunction =
+        cgm.getASTContext().BuiltinInfo.isPredefinedLibFunction(builtinID);
+    bool hasAttributeNoBuiltin = false;
+    assert(!cir::MissingFeatures::attributeNoBuiltin());
+
+    // When directing calling an inline builtin, call it through it's mangled
+    // name to make it clear it's not the actual builtin.
+    auto fn = cast<cir::FuncOp>(curFn);
+    if (fn.getName() != fdInlineName && onlyHasInlineBuiltinDeclaration(fd)) {
+      cgm.errorNYI("Inline only builtin function calls");
+    }
+
+    // Replaceable builtins provide their own implementation of a builtin. If we
+    // are in an inline builtin implementation, avoid trivial infinite
+    // recursion. Honor __attribute__((no_builtin("foo"))) or
+    // __attribute__((no_builtin)) on the current function unless foo is
+    // not a predefined library function which means we must generate the
+    // builtin no matter what.
+    else if (!isPredefinedLibFunction || !hasAttributeNoBuiltin)
+      return CIRGenCallee::forBuiltin(builtinID, fd);
+  }
 
   cir::FuncOp callee = emitFunctionDeclPointer(cgm, gd);
 
@@ -1106,7 +1146,7 @@ CIRGenCallee CIRGenFunction::emitCallee(const clang::Expr *e) {
   } else if (const auto *declRef = dyn_cast<DeclRefExpr>(e)) {
     // Resolve direct calls.
     const auto *funcDecl = cast<FunctionDecl>(declRef->getDecl());
-    return emitDirectCallee(cgm, funcDecl);
+    return emitDirectCallee(funcDecl);
   } else if (isa<MemberExpr>(e)) {
     cgm.errorNYI(e->getSourceRange(),
                  "emitCallee: call to member function is NYI");
@@ -1162,10 +1202,9 @@ RValue CIRGenFunction::emitCallExpr(const clang::CallExpr *e,
 
   CIRGenCallee callee = emitCallee(e->getCallee());
 
-  if (e->getBuiltinCallee()) {
-    cgm.errorNYI(e->getSourceRange(), "call to builtin functions");
-  }
-  assert(!cir::MissingFeatures::opCallBuiltinFunc());
+  if (callee.isBuiltin())
+    return emitBuiltinExpr(callee.getBuiltinDecl(), callee.getBuiltinID(), e,
+                           returnValue);
 
   if (isa<CXXPseudoDestructorExpr>(e->getCallee())) {
     cgm.errorNYI(e->getSourceRange(), "call to pseudo destructor");
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 7db7f6928fd8..b08dd540e628 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -665,6 +665,8 @@ private:
   void emitAndUpdateRetAlloca(clang::QualType type, mlir::Location loc,
                               clang::CharUnits alignment);
 
+  CIRGenCallee emitDirectCallee(const GlobalDecl &gd);
+
 public:
   Address emitAddrOfFieldStorage(Address base, const FieldDecl *field,
                                  llvm::StringRef fieldName,
@@ -711,6 +713,9 @@ public:
 
   mlir::LogicalResult emitBreakStmt(const clang::BreakStmt &s);
 
+  RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID,
+                         const clang::CallExpr *e, ReturnValueSlot returnValue);
+
   RValue emitCall(const CIRGenFunctionInfo &funcInfo,
                   const CIRGenCallee &callee, ReturnValueSlot returnValue,
                   const CallArgList &args, cir::CIRCallOpInterface *callOp,
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 8bfcd2773d07..beaa9afb31f9 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -13,6 +13,7 @@ add_clang_library(clangCIR
   CIRGenClass.cpp
   CIRGenCXXABI.cpp
   CIRGenCXXExpr.cpp
+  CIRGenBuiltin.cpp
   CIRGenDecl.cpp
   CIRGenDeclOpenACC.cpp
   CIRGenExpr.cpp
diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp
new file mode 100644
index 000000000000..2706ea7f8f85
--- /dev/null
+++ b/clang/test/CIR/CodeGen/builtin_call.cpp
@@ -0,0 +1,78 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+constexpr extern int cx_var = __builtin_is_constant_evaluated();
+
+// CIR: cir.global {{.*}} @cx_var = #cir.int<1> : !s32i
+// LLVM: @cx_var = {{.*}} i32 1
+// OGCG: @cx_var = {{.*}} i32 1
+
+constexpr extern float cx_var_single = __builtin_huge_valf();
+
+// CIR: cir.global {{.*}} @cx_var_single = #cir.fp<0x7F800000> : !cir.float
+// LLVM: @cx_var_single = {{.*}} float 0x7FF0000000000000
+// OGCG: @cx_var_single = {{.*}} float 0x7FF0000000000000
+
+constexpr extern long double cx_var_ld = __builtin_huge_vall();
+
+// CIR: cir.global {{.*}} @cx_var_ld = #cir.fp<0x7FFF8000000000000000> : !cir.long_double<!cir.f80>
+// LLVM: @cx_var_ld = {{.*}} x86_fp80 0xK7FFF8000000000000000
+// OGCG: @cx_var_ld = {{.*}} x86_fp80 0xK7FFF8000000000000000
+
+int is_constant_evaluated() {
+  return __builtin_is_constant_evaluated();
+}
+
+// CIR: cir.func @_Z21is_constant_evaluatedv() -> !s32i
+// CIR: %[[ZERO:.+]] = cir.const #cir.int<0>
+
+// LLVM: define {{.*}}i32 @_Z21is_constant_evaluatedv()
+// LLVM: %[[MEM:.+]] = alloca i32
+// LLVM: store i32 0, ptr %[[MEM]]
+// LLVM: %[[RETVAL:.+]] = load i32, ptr %[[MEM]]
+// LLVM: ret i32 %[[RETVAL]]
+// LLVM: }
+
+// OGCG: define {{.*}}i32 @_Z21is_constant_evaluatedv()
+// OGCG: ret i32 0
+// OGCG: }
+
+long double constant_fp_builtin_ld() {
+  return __builtin_fabsl(-0.1L);
+}
+
+// CIR: cir.func @_Z22constant_fp_builtin_ldv() -> !cir.long_double<!cir.f80>
+// CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.long_double<!cir.f80>
+
+// LLVM: define {{.*}}x86_fp80 @_Z22constant_fp_builtin_ldv()
+// LLVM: %[[MEM:.+]] = alloca x86_fp80
+// LLVM: store x86_fp80 0xK3FFBCCCCCCCCCCCCCCCD, ptr %[[MEM]]
+// LLVM: %[[RETVAL:.+]] = load x86_fp80, ptr %[[MEM]]
+// LLVM: ret x86_fp80 %[[RETVAL]]
+// LLVM: }
+
+// OGCG: define {{.*}}x86_fp80 @_Z22constant_fp_builtin_ldv()
+// OGCG: ret x86_fp80 0xK3FFBCCCCCCCCCCCCCCCD
+// OGCG: }
+
+float constant_fp_builtin_single() {
+  return __builtin_fabsf(-0.1f);
+}
+
+// CIR: cir.func @_Z26constant_fp_builtin_singlev() -> !cir.float
+// CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.float
+
+// LLVM: define {{.*}}float @_Z26constant_fp_builtin_singlev()
+// LLVM: %[[MEM:.+]] = alloca float
+// LLVM: store float 0x3FB99999A0000000, ptr %[[MEM]]
+// LLVM: %[[RETVAL:.+]] = load float, ptr %[[MEM]]
+// LLVM: ret float %[[RETVAL]]
+// LLVM: }
+
+// OGCG: define {{.*}}float @_Z26constant_fp_builtin_singlev()
+// OGCG: ret float 0x3FB99999A0000000
+// OGCG: }

From ec8d68b59f82423e5a6bf452e33ee8c5f64b0edc Mon Sep 17 00:00:00 2001
From: vabridgers <58314289+vabridgers@users.noreply.github.com>
Date: Wed, 11 Jun 2025 11:25:24 -0500
Subject: [PATCH 0010/1322] [clang][analyzer] Correct SMT Layer for _BitInt
 cases refutations (#143310)

Since _BitInt was added later, ASTContext did not comprehend getting a
type by bitwidth that's not a power of 2, and the SMT layer also did not
comprehend this. This led to unexpected crashes using Z3 refutation
during randomized testing. The assertion and redacted and summarized
crash stack is shown here.

clang:
../../clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h:103:
static llvm::SMTExprRef
clang::ento::SMTConv::fromBinOp(llvm::SMTSolverRef &,
const llvm::SMTExprRef &, const BinaryOperator::Opcode, const
llvm::SMTExprRef &, bool):
Assertion `*Solver->getSort(LHS) == *Solver->getSort(RHS) && "AST's must
have the same sort!"' failed.
 ...
<address>
clang::ento::SMTConv::fromBinOp(std::shared_ptr<llvm::SMTSolver>&,
llvm::SMTExpr const* const&, clang::BinaryOperatorKind, llvm::SMTExpr
const* const&,
     bool) SMTConstraintManager.cpp
     clang::ASTContext&, llvm::SMTExpr const* const&, clang::QualType,
clang::BinaryOperatorKind, llvm::SMTExpr const* const&, clang::QualType,
     clang::QualType*) SMTConstraintManager.cpp
clang::ASTContext&, clang::ento::SymExpr const*, llvm::APSInt const&,
     llvm::APSInt const&, bool) SMTConstraintManager.cpp
clang::ento::ExplodedNode const*, clang::ento::PathSensitiveBugReport&)

---------

Co-authored-by: Vince Bridgers <vince.a.bridgers@ericsson.com>
---
 .../Core/PathSensitive/SMTConv.h              | 28 ++++++++++++++-----
 clang/test/Analysis/bitint-z3.c               | 22 +++++++++++++++
 2 files changed, 43 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/Analysis/bitint-z3.c

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h
index 580b49a38dc7..70a7953918ac 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h
@@ -18,6 +18,8 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h"
 #include "llvm/Support/SMTAPI.h"
 
+#include <algorithm>
+
 namespace clang {
 namespace ento {
 
@@ -570,23 +572,35 @@ public:
   // TODO: Refactor to put elsewhere
   static inline QualType getAPSIntType(ASTContext &Ctx,
                                        const llvm::APSInt &Int) {
-    return Ctx.getIntTypeForBitwidth(Int.getBitWidth(), Int.isSigned());
+    const QualType Ty =
+        Ctx.getIntTypeForBitwidth(Int.getBitWidth(), Int.isSigned());
+    if (!Ty.isNull())
+      return Ty;
+    // If Ty is Null, could be because the original type was a _BitInt.
+    // Get the size of the _BitInt type (expressed in bits) and round it up to
+    // the next power of 2 that is at least the bit size of 'char' (usually 8).
+    unsigned CharTypeSize = Ctx.getTypeSize(Ctx.CharTy);
+    unsigned Pow2DestWidth =
+        std::max(llvm::bit_ceil(Int.getBitWidth()), CharTypeSize);
+    return Ctx.getIntTypeForBitwidth(Pow2DestWidth, Int.isSigned());
   }
 
   // Get the QualTy for the input APSInt, and fix it if it has a bitwidth of 1.
   static inline std::pair<llvm::APSInt, QualType>
   fixAPSInt(ASTContext &Ctx, const llvm::APSInt &Int) {
     llvm::APSInt NewInt;
+    unsigned APSIntBitwidth = Int.getBitWidth();
+    QualType Ty = getAPSIntType(Ctx, Int);
 
     // FIXME: This should be a cast from a 1-bit integer type to a boolean type,
     // but the former is not available in Clang. Instead, extend the APSInt
     // directly.
-    if (Int.getBitWidth() == 1 && getAPSIntType(Ctx, Int).isNull()) {
-      NewInt = Int.extend(Ctx.getTypeSize(Ctx.BoolTy));
-    } else
-      NewInt = Int;
-
-    return std::make_pair(NewInt, getAPSIntType(Ctx, NewInt));
+    if (APSIntBitwidth == 1 && Ty.isNull())
+      return {Int.extend(Ctx.getTypeSize(Ctx.BoolTy)),
+              getAPSIntType(Ctx, NewInt)};
+    if (llvm::isPowerOf2_32(APSIntBitwidth) || Ty.isNull())
+      return {Int, Ty};
+    return {Int.extend(Ctx.getTypeSize(Ty)), Ty};
   }
 
   // Perform implicit type conversion on binary symbolic expressions.
diff --git a/clang/test/Analysis/bitint-z3.c b/clang/test/Analysis/bitint-z3.c
new file mode 100644
index 000000000000..4cb97f9de829
--- /dev/null
+++ b/clang/test/Analysis/bitint-z3.c
@@ -0,0 +1,22 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -w \
+// RUN:   -analyzer-config crosscheck-with-z3=true -verify %s
+// REQUIRES: z3
+
+// Previously these tests were crashing because the SMTConv layer did not
+// comprehend the _BitInt types.
+
+void clang_analyzer_warnIfReached();
+
+void c(int b, _BitInt(35) a) {
+  int d = 0;
+  if (a)
+    b = d;
+  clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void f(int *d, _BitInt(3) e) {
+  int g;
+  d = &g;
+  e ?: 0;
+  clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}

From fe7bf4b90b1a835418bddd2b2aa63b4977a9f6d2 Mon Sep 17 00:00:00 2001
From: Rolf Morel <854835+rolfmorel@users.noreply.github.com>
Date: Wed, 11 Jun 2025 17:33:55 +0100
Subject: [PATCH 0011/1322] [MLIR][Transform] apply_registered_pass op's
 options as a dict (#143159)

Improve ApplyRegisteredPassOp's support for taking options by taking
them as a dict (vs a list of string-valued key-value pairs).

Values of options are provided as either static attributes or as params
(which pass in attributes at interpreter runtime). In either case, the
keys and value attributes are converted to strings and a single
options-string, in the format used on the commandline, is constructed to
pass to the `addToPipeline`-pass API.
---
 .../mlir/Dialect/Transform/IR/CMakeLists.txt  |   4 +
 .../Dialect/Transform/IR/TransformAttrs.h     |   3 +
 .../Dialect/Transform/IR/TransformAttrs.td    |  19 ++
 .../Dialect/Transform/IR/TransformDialect.td  |   1 +
 .../mlir/Dialect/Transform/IR/TransformOps.td |  23 +-
 .../Dialect/Transform/IR/TransformDialect.cpp |   9 +
 .../lib/Dialect/Transform/IR/TransformOps.cpp | 219 +++++++++++-------
 .../mlir/dialects/transform/__init__.py       |  82 ++++++-
 .../Transform/test-pass-application.mlir      | 169 ++++++++++++--
 mlir/test/python/dialects/transform.py        |  52 +++++
 10 files changed, 467 insertions(+), 114 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt
index df5af7ae710d..9acab9228f10 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt
@@ -20,6 +20,10 @@ mlir_tablegen(TransformDialectEnums.h.inc -gen-enum-decls)
 mlir_tablegen(TransformDialectEnums.cpp.inc -gen-enum-defs)
 add_public_tablegen_target(MLIRTransformDialectEnumIncGen)
 add_dependencies(mlir-headers MLIRTransformDialectEnumIncGen)
+mlir_tablegen(TransformAttrs.h.inc -gen-attrdef-decls)
+mlir_tablegen(TransformAttrs.cpp.inc -gen-attrdef-defs)
+add_public_tablegen_target(MLIRTransformDialectAttributesIncGen)
+add_dependencies(mlir-headers MLIRTransformDialectAttributesIncGen)
 
 add_mlir_dialect(TransformOps transform)
 add_mlir_doc(TransformOps TransformOps Dialects/ -gen-op-doc -dialect=transform)
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h
index 3cb935003b4c..379af932ca48 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h
@@ -17,4 +17,7 @@
 
 #include "mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc"
 
+#define GET_ATTRDEF_CLASSES
+#include "mlir/Dialect/Transform/IR/TransformAttrs.h.inc"
+
 #endif // MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS_H
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td
index ebad2994880e..e67a9444c24a 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td
@@ -10,6 +10,14 @@
 #define MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS
 
 include "mlir/IR/EnumAttr.td"
+include "mlir/Dialect/Transform/IR/TransformDialect.td"
+
+class Transform_Attr<string name, string attrMnemonic,
+                     list<Trait> traits = [],
+                     string baseCppClass = "::mlir::Attribute">
+    : AttrDef<Transform_Dialect, name, traits, baseCppClass> {
+  let mnemonic = attrMnemonic;
+}
 
 def PropagateFailuresCase : I32EnumAttrCase<"Propagate", 1, "propagate">;
 def SuppressFailuresCase : I32EnumAttrCase<"Suppress", 2, "suppress">;
@@ -33,4 +41,15 @@ def MatchCmpIPredicateAttr : I32EnumAttr<
   let cppNamespace = "::mlir::transform";
 }
 
+def ParamOperandAttr : Transform_Attr<"ParamOperand", "param_operand"> {
+  let description = [{
+    Used to refer to a specific param-operand (via its index) from within an
+    attribute on a transform operation.
+  }];
+  let parameters = (ins
+    "IntegerAttr":$index
+  );
+  let assemblyFormat = "`<` `index` `=` $index `>`";
+}
+
 #endif  // MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
index d03049e186f9..c7ea5ade72ac 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
@@ -19,6 +19,7 @@ def Transform_Dialect : Dialect {
   let cppNamespace = "::mlir::transform";
 
   let hasOperationAttrVerify = 1;
+  let useDefaultAttributePrinterParser = 1;
   let extraClassDeclaration = [{
     /// Symbol name for the default entry point "named sequence".
     constexpr const static ::llvm::StringLiteral
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index e864a65f8cea..f75ba27e58e7 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -405,10 +405,23 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass",
   let description = [{
     This transform applies the specified pass or pass pipeline to the targeted
     ops. The name of the pass/pipeline is specified as a string attribute, as
-    set during pass/pipeline registration. Optionally, pass options may be
-    specified as (space-separated) string attributes with the option to pass
-    these attributes via params. The pass options syntax is identical to the one
-    used with "mlir-opt".
+    set during pass/pipeline registration.
+
+    Optionally, pass options may be specified via a DictionaryAttr. This
+    dictionary is converted to a string -- formatted `key=value ...` -- which
+    is expected to be in the exact format used by the pass on the commandline.
+    Values are either attributes or (SSA-values of) Transform Dialect params.
+    For example:
+
+    ```mlir
+    transform.apply_registered_pass "canonicalize"
+        with options = { "top-down" = false,
+                         "max-iterations" = %max_iter,
+                         "test-convergence" = true,
+                         "max-num-rewrites" =  %max_rewrites }
+        to %module
+    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    ```
 
     This op first looks for a pass pipeline with the specified name. If no such
     pipeline exists, it looks for a pass with the specified name. If no such
@@ -422,7 +435,7 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass",
   }];
 
   let arguments = (ins StrAttr:$pass_name,
-                       DefaultValuedAttr<ArrayAttr, "{}">:$options,
+                       DefaultValuedAttr<DictionaryAttr, "{}">:$options,
                        Variadic<TransformParamTypeInterface>:$dynamic_options,
                        TransformHandleTypeInterface:$target);
   let results = (outs TransformHandleTypeInterface:$result);
diff --git a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp
index 497ceb19f1a2..4a95fe7459e8 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp
@@ -8,17 +8,22 @@
 
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Analysis/CallGraph.h"
+#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
 #include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/Dialect/Transform/IR/TransformTypes.h"
 #include "mlir/Dialect/Transform/IR/Utils.h"
 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
 
 #include "mlir/Dialect/Transform/IR/TransformDialect.cpp.inc"
 
+#define GET_ATTRDEF_CLASSES
+#include "mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc"
+
 #ifndef NDEBUG
 void transform::detail::checkImplementsTransformOpInterface(
     StringRef name, MLIRContext *context) {
@@ -66,6 +71,10 @@ void transform::TransformDialect::initialize() {
 #include "mlir/Dialect/Transform/IR/TransformOps.cpp.inc"
       >();
   initializeTypes();
+  addAttributes<
+#define GET_ATTRDEF_LIST
+#include "mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc"
+      >();
   initializeLibraryModule();
 }
 
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index a0f9518e3d12..582d082153be 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -54,10 +54,11 @@
 using namespace mlir;
 
 static ParseResult parseApplyRegisteredPassOptions(
-    OpAsmParser &parser, ArrayAttr &options,
+    OpAsmParser &parser, DictionaryAttr &options,
     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &dynamicOptions);
 static void printApplyRegisteredPassOptions(OpAsmPrinter &printer,
-                                            Operation *op, ArrayAttr options,
+                                            Operation *op,
+                                            DictionaryAttr options,
                                             ValueRange dynamicOptions);
 static ParseResult parseSequenceOpOperands(
     OpAsmParser &parser, std::optional<OpAsmParser::UnresolvedOperand> &root,
@@ -784,41 +785,50 @@ DiagnosedSilenceableFailure
 transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter,
                                         transform::TransformResults &results,
                                         transform::TransformState &state) {
-  // Obtain a single options-string from options passed statically as
-  // string attributes as well as "dynamically" through params.
-  std::string options;
-  OperandRange dynamicOptions = getDynamicOptions();
-  size_t dynamicOptionsIdx = 0;
-  for (auto [idx, optionAttr] : llvm::enumerate(getOptions())) {
-    if (idx > 0)
-      options += " "; // Interleave options seperator.
+  // Obtain a single options-string to pass to the pass(-pipeline) from options
+  // passed in as a dictionary of keys mapping to values which are either
+  // attributes or param-operands pointing to attributes.
 
-    if (auto strAttr = dyn_cast<StringAttr>(optionAttr)) {
-      options += strAttr.getValue();
-    } else if (isa<UnitAttr>(optionAttr)) {
-      assert(dynamicOptionsIdx < dynamicOptions.size() &&
+  std::string options;
+  llvm::raw_string_ostream optionsStream(options); // For "printing" attrs.
+
+  OperandRange dynamicOptions = getDynamicOptions();
+  for (auto [idx, namedAttribute] : llvm::enumerate(getOptions())) {
+    if (idx > 0)
+      optionsStream << " "; // Interleave options separator.
+    optionsStream << namedAttribute.getName().str(); // Append the key.
+    optionsStream << "="; // And the key-value separator.
+
+    Attribute valueAttrToAppend;
+    if (auto paramOperandIndex =
+            dyn_cast<transform::ParamOperandAttr>(namedAttribute.getValue())) {
+      // The corresponding value attribute is passed in via a param.
+      // Obtain the param-operand via its specified index.
+      size_t dynamicOptionIdx = paramOperandIndex.getIndex().getInt();
+      assert(dynamicOptionIdx < dynamicOptions.size() &&
              "number of dynamic option markers (UnitAttr) in options ArrayAttr "
              "should be the same as the number of options passed as params");
       ArrayRef<Attribute> dynamicOption =
-          state.getParams(dynamicOptions[dynamicOptionsIdx++]);
+          state.getParams(dynamicOptions[dynamicOptionIdx]);
       if (dynamicOption.size() != 1)
-        return emitSilenceableError() << "options passed as a param must have "
-                                         "a single value associated, param "
-                                      << dynamicOptionsIdx - 1 << " associates "
-                                      << dynamicOption.size();
-
-      if (auto dynamicOptionStr = dyn_cast<StringAttr>(dynamicOption[0])) {
-        options += dynamicOptionStr.getValue();
-      } else {
         return emitSilenceableError()
-               << "options passed as a param must be a string, got "
-               << dynamicOption[0];
-      }
+               << "options passed as a param must have "
+                  "a single value associated, param "
+               << dynamicOptionIdx << " associates " << dynamicOption.size();
+      valueAttrToAppend = dynamicOption[0];
     } else {
-      llvm_unreachable(
-          "expected options element to be either StringAttr or UnitAttr");
+      // Value is a static attribute.
+      valueAttrToAppend = namedAttribute.getValue();
+    }
+
+    // Append string representation of value attribute.
+    if (auto strAttr = dyn_cast<StringAttr>(valueAttrToAppend)) {
+      optionsStream << strAttr.getValue().str();
+    } else {
+      valueAttrToAppend.print(optionsStream, /*elideType=*/true);
     }
   }
+  optionsStream.flush();
 
   // Get pass or pass pipeline from registry.
   const PassRegistryEntry *info = PassPipelineInfo::lookup(getPassName());
@@ -864,84 +874,121 @@ transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter,
 }
 
 static ParseResult parseApplyRegisteredPassOptions(
-    OpAsmParser &parser, ArrayAttr &options,
+    OpAsmParser &parser, DictionaryAttr &options,
     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &dynamicOptions) {
-  auto dynamicOptionMarker = UnitAttr::get(parser.getContext());
-  SmallVector<Attribute> optionsArray;
+  // Construct the options DictionaryAttr per a `{ key = value, ... }` syntax.
+  SmallVector<NamedAttribute> keyValuePairs;
 
-  auto parseOperandOrString = [&]() -> OptionalParseResult {
-    OpAsmParser::UnresolvedOperand operand;
-    OptionalParseResult parsedOperand = parser.parseOptionalOperand(operand);
-    if (parsedOperand.has_value()) {
-      if (failed(parsedOperand.value()))
-        return failure();
+  size_t dynamicOptionsIdx = 0;
+  auto parseKeyValuePair = [&]() -> ParseResult {
+    // Parse items of the form `key = value` where `key` is a bare identifier or
+    // a string and `value` is either an attribute or an operand.
 
+    std::string key;
+    Attribute valueAttr;
+    if (parser.parseOptionalKeywordOrString(&key))
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected key to either be an identifier or a string";
+    if (key.empty())
+      return failure();
+
+    if (parser.parseEqual())
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected '=' after key in key-value pair";
+
+    // Parse the value, which can be either an attribute or an operand.
+    OptionalParseResult parsedValueAttr =
+        parser.parseOptionalAttribute(valueAttr);
+    if (!parsedValueAttr.has_value()) {
+      OpAsmParser::UnresolvedOperand operand;
+      ParseResult parsedOperand = parser.parseOperand(operand);
+      if (failed(parsedOperand))
+        return parser.emitError(parser.getCurrentLocation())
+               << "expected a valid attribute or operand as value associated "
+               << "to key '" << key << "'";
+      // To make use of the operand, we need to store it in the options dict.
+      // As SSA-values cannot occur in attributes, what we do instead is store
+      // an attribute in its place that contains the index of the param-operand,
+      // so that an attr-value associated to the param can be resolved later on.
       dynamicOptions.push_back(operand);
-      optionsArray.push_back(
-          dynamicOptionMarker); // Placeholder for knowing where to
-                                // inject the dynamic option-as-param.
-      return success();
+      auto wrappedIndex = IntegerAttr::get(
+          IntegerType::get(parser.getContext(), 64), dynamicOptionsIdx++);
+      valueAttr =
+          transform::ParamOperandAttr::get(parser.getContext(), wrappedIndex);
+    } else if (failed(parsedValueAttr.value())) {
+      return failure(); // NB: Attempted parse should have output error message.
+    } else if (isa<transform::ParamOperandAttr>(valueAttr)) {
+      return parser.emitError(parser.getCurrentLocation())
+             << "the param_operand attribute is a marker reserved for "
+             << "indicating a value will be passed via params and is only used "
+             << "in the generic print format";
     }
 
-    StringAttr stringAttr;
-    OptionalParseResult parsedStringAttr =
-        parser.parseOptionalAttribute(stringAttr);
-    if (parsedStringAttr.has_value()) {
-      if (failed(parsedStringAttr.value()))
-        return failure();
-      optionsArray.push_back(stringAttr);
-      return success();
-    }
-
-    return std::nullopt;
+    keyValuePairs.push_back(NamedAttribute(key, valueAttr));
+    return success();
   };
 
-  OptionalParseResult parsedOptionsElement = parseOperandOrString();
-  while (parsedOptionsElement.has_value()) {
-    if (failed(parsedOptionsElement.value()))
-      return failure();
-    parsedOptionsElement = parseOperandOrString();
-  }
+  if (parser.parseCommaSeparatedList(AsmParser::Delimiter::Braces,
+                                     parseKeyValuePair,
+                                     " in options dictionary"))
+    return failure(); // NB: Attempted parse should have output error message.
 
-  if (optionsArray.empty()) {
+  if (DictionaryAttr::findDuplicate(
+          keyValuePairs, /*isSorted=*/false) // Also sorts the keyValuePairs.
+          .has_value())
     return parser.emitError(parser.getCurrentLocation())
-           << "expected at least one option (either a string or a param)";
-  }
-  options = parser.getBuilder().getArrayAttr(optionsArray);
+           << "duplicate keys found in options dictionary";
+
+  options = DictionaryAttr::getWithSorted(parser.getContext(), keyValuePairs);
+
   return success();
 }
 
 static void printApplyRegisteredPassOptions(OpAsmPrinter &printer,
-                                            Operation *op, ArrayAttr options,
+                                            Operation *op,
+                                            DictionaryAttr options,
                                             ValueRange dynamicOptions) {
-  size_t currentDynamicOptionIdx = 0;
-  for (auto [idx, optionAttr] : llvm::enumerate(options)) {
-    if (idx > 0)
-      printer << " "; // Interleave options separator.
+  if (options.empty())
+    return;
 
-    if (isa<UnitAttr>(optionAttr))
-      printer.printOperand(dynamicOptions[currentDynamicOptionIdx++]);
-    else if (auto strAttr = dyn_cast<StringAttr>(optionAttr))
-      printer.printAttribute(strAttr);
-    else
-      llvm_unreachable("each option should be either a StringAttr or UnitAttr");
-  }
+  printer << "{";
+  llvm::interleaveComma(options, printer, [&](NamedAttribute namedAttribute) {
+    printer << namedAttribute.getName() << " = ";
+    Attribute value = namedAttribute.getValue();
+    if (auto indexAttr = dyn_cast<transform::ParamOperandAttr>(value)) {
+      // Resolve index of param-operand to its actual SSA-value and print that.
+      printer.printOperand(dynamicOptions[indexAttr.getIndex().getInt()]);
+    } else {
+      printer.printAttribute(value);
+    }
+  });
+  printer << "}";
 }
 
 LogicalResult transform::ApplyRegisteredPassOp::verify() {
-  size_t numUnitsInOptions = 0;
-  for (Attribute optionsElement : getOptions()) {
-    if (isa<UnitAttr>(optionsElement))
-      numUnitsInOptions++;
-    else if (!isa<StringAttr>(optionsElement))
-      return emitOpError() << "expected each option to be either a StringAttr "
-                           << "or a UnitAttr, got " << optionsElement;
-  }
+  // Check that there is a one-to-one correspondence between param operands
+  // and references to dynamic options in the options dictionary.
 
-  if (getDynamicOptions().size() != numUnitsInOptions)
-    return emitOpError()
-           << "expected the same number of options passed as params as "
-           << "UnitAttr elements in options ArrayAttr";
+  auto dynamicOptions = SmallVector<Value>(getDynamicOptions());
+  for (NamedAttribute namedAttr : getOptions())
+    if (auto paramOperand =
+            dyn_cast<transform::ParamOperandAttr>(namedAttr.getValue())) {
+      size_t dynamicOptionIdx = paramOperand.getIndex().getInt();
+      if (dynamicOptionIdx < 0 || dynamicOptionIdx >= dynamicOptions.size())
+        return emitOpError()
+               << "dynamic option index " << dynamicOptionIdx
+               << " is out of bounds for the number of dynamic options: "
+               << dynamicOptions.size();
+      if (dynamicOptions[dynamicOptionIdx] == nullptr)
+        return emitOpError() << "dynamic option index " << dynamicOptionIdx
+                             << " is already used in options";
+      dynamicOptions[dynamicOptionIdx] = nullptr; // Mark this option as used.
+    }
+
+  for (Value dynamicOption : dynamicOptions)
+    if (dynamicOption)
+      return emitOpError() << "a param operand does not have a corresponding "
+                           << "param_operand attr in the options dict";
 
   return success();
 }
diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py
index 5b158ec6b65f..10a04b0cc14e 100644
--- a/mlir/python/mlir/dialects/transform/__init__.py
+++ b/mlir/python/mlir/dialects/transform/__init__.py
@@ -18,7 +18,12 @@ try:
 except ImportError as e:
     raise RuntimeError("Error loading imports from extension module") from e
 
-from typing import Optional, Sequence, Union, NewType
+from typing import Dict, Optional, Sequence, Union, NewType
+
+
+@register_attribute_builder("ParamOperandAttr")
+def _paramOperandAttr(x: int, context) -> Attribute:
+    return Attribute.parse(f"#transform.param_operand<index={x}>", context=context)
 
 
 @_ods_cext.register_operation(_Dialect, replace=True)
@@ -214,6 +219,81 @@ class YieldOp(YieldOp):
         super().__init__(_get_op_results_or_values(operands), loc=loc, ip=ip)
 
 
+@_ods_cext.register_operation(_Dialect, replace=True)
+class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
+    def __init__(
+        self,
+        result: Type,
+        pass_name: Union[str, StringAttr],
+        target: Union[Operation, Value, OpView],
+        *,
+        options: Optional[
+            Dict[
+                Union[str, StringAttr],
+                Union[Attribute, Value, Operation, OpView],
+            ]
+        ] = None,
+        loc=None,
+        ip=None,
+    ):
+        options_dict = {}
+        dynamic_options = []
+
+        ParamOperandAttr = AttrBuilder.get("ParamOperandAttr")
+        context = (loc and loc.context) or Context.current
+
+        cur_param_operand_idx = 0
+        for key, value in options.items() if options is not None else {}:
+            if isinstance(key, StringAttr):
+                key = key.value
+
+            if isinstance(value, (Value, Operation, OpView)):
+                dynamic_options.append(_get_op_result_or_value(value))
+                options_dict[key] = ParamOperandAttr(cur_param_operand_idx, context)
+                cur_param_operand_idx += 1
+            elif isinstance(value, Attribute):
+                options_dict[key] = value
+            elif isinstance(value, str):
+                options_dict[key] = StringAttr.get(value)
+            else:
+                raise TypeError(f"Unsupported option type: {type(value)}")
+        if len(options_dict) > 0:
+            print(options_dict, cur_param_operand_idx)
+        super().__init__(
+            result,
+            pass_name,
+            dynamic_options,
+            target=_get_op_result_or_value(target),
+            options=DictAttr.get(options_dict),
+            loc=loc,
+            ip=ip,
+        )
+
+
+def apply_registered_pass(
+    result: Type,
+    pass_name: Union[str, StringAttr],
+    target: Union[Operation, Value, OpView],
+    *,
+    options: Optional[
+        Dict[
+            Union[str, StringAttr],
+            Union[Attribute, Value, Operation, OpView],
+        ]
+    ] = None,
+    loc=None,
+    ip=None,
+) -> Value:
+    return ApplyRegisteredPassOp(
+        result=result,
+        pass_name=pass_name,
+        target=target,
+        options=options,
+        loc=loc,
+        ip=ip,
+    ).result
+
+
 AnyOpTypeT = NewType("AnyOpType", AnyOpType)
 
 
diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir
index 463fd98afa65..6e6d4eb7e249 100644
--- a/mlir/test/Dialect/Transform/test-pass-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pass-application.mlir
@@ -80,7 +80,7 @@ module attributes {transform.with_named_sequence} {
     // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}}
     // expected-error @below {{<Pass-Options-Parser>: no such option invalid-option}}
     transform.apply_registered_pass "canonicalize"
-        with options = "invalid-option=1" to %1
+        with options = { "invalid-option" = 1 } to %1
         : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -97,7 +97,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     transform.apply_registered_pass "canonicalize"
-        with options = "top-down=false" to %1
+        with options = { "top-down" = false } to %1
         : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -115,7 +115,7 @@ module attributes {transform.with_named_sequence} {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     //transform.apply_registered_pass "canonicalize" with options = "top-down=false,max-iterations=10" to %1 : (!transform.any_op) -> !transform.any_op
     transform.apply_registered_pass "canonicalize"
-        with options = "top-down=false test-convergence=true" to %1
+        with options = { "top-down" = false, "test-convergence" =true } to %1
         : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -132,7 +132,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     transform.apply_registered_pass "canonicalize"
-        with options = "top-down=false" "max-iterations=0" to %1
+        with options = { "top-down" = false, "max-iterations" = 0 } to %1
         : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -148,10 +148,15 @@ func.func @valid_dynamic_pass_options() {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param
-    %max_rewrites = transform.param.constant "max-num-rewrites=1" -> !transform.any_param
-    %2 = transform.apply_registered_pass "canonicalize"
-        with options = "top-down=false" %max_iter "test-convergence=true" %max_rewrites to %1
+    %max_iter = transform.param.constant 10 -> !transform.any_param
+    %max_rewrites = transform.param.constant 1 -> !transform.any_param
+    %2 = transform.apply_registered_pass
+        "canonicalize"
+        with options = { "top-down" = false,
+                         "max-iterations" = %max_iter,
+                         "test-convergence" = true,
+                         "max-num-rewrites" =  %max_rewrites }
+        to %1
         : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -159,7 +164,7 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @invalid_dynamic_options_as_array() {
+func.func @invalid_options_as_str() {
   return
 }
 
@@ -167,34 +172,80 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param
-    // expected-error @+2 {{expected at least one option (either a string or a param)}}
+    // expected-error @+2 {{expected '{' in options dictionary}}
     %2 = transform.apply_registered_pass "canonicalize"
-        with options = ["top-down=false" %max_iter] to %1
-        : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+        with options = "top-down=false" to %1 : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
 
 // -----
 
-func.func @invalid_options_as_pairs() {
+func.func @invalid_options_as_pairs_without_braces() {
   return
 }
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    // expected-error @+2 {{expected 'to'}}
+    // expected-error @+2 {{expected '{' in options dictionary}}
     %2 = transform.apply_registered_pass "canonicalize"
-        with options = "top-down=" false to %1
-        : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+        with options = "top-down"=false to %1 : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
 
 // -----
 
-func.func @invalid_pass_option_param() {
+func.func @invalid_options_due_to_reserved_attr() {
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @+2 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}}
+    %2 = transform.apply_registered_pass "canonicalize"
+        with options = { "top-down" = #transform.param_operand<index=0> } to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @invalid_options_due_duplicated_key() {
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @+2 {{duplicate keys found in options dictionary}}
+    %2 = transform.apply_registered_pass "canonicalize"
+        with options = {"top-down"=false,"top-down"=true} to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @invalid_options_due_invalid_key() {
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @+2 {{expected key to either be an identifier or a string}}
+    %2 = transform.apply_registered_pass "canonicalize"
+        with options = { @label = 0 } to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @invalid_pass_option_bare_param() {
   return
 }
 
@@ -202,7 +253,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %pass_options = transform.param.constant 42 -> !transform.any_param
-    // expected-error @below {{options passed as a param must be a string, got 42}}
+    // expected-error @+2 {{expected '{' in options dictionary}}
     transform.apply_registered_pass "canonicalize"
         with options = %pass_options to %1
         : (!transform.any_param, !transform.any_op) -> !transform.any_op
@@ -219,12 +270,12 @@ func.func @too_many_pass_option_params() {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %x = transform.param.constant "x" -> !transform.any_param
-    %y = transform.param.constant "y" -> !transform.any_param
-    %pass_options = transform.merge_handles %x, %y : !transform.any_param
+    %x = transform.param.constant true -> !transform.any_param
+    %y = transform.param.constant false -> !transform.any_param
+    %topdown_options = transform.merge_handles %x, %y : !transform.any_param
     // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}}
     transform.apply_registered_pass "canonicalize"
-        with options = %pass_options to %1
+        with options = { "top-down" = %topdown_options } to %1
         : (!transform.any_param, !transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -248,3 +299,77 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+/////////////////////////////////////////////////////////////////////
+// Check that the following cases are caugh in the generic format. //
+/////////////////////////////////////////////////////////////////////
+
+// Invalid due to param_operand occurences in options dict not being
+// one-to-one with the dynamic options provided as params:
+//   param_operand_index out of bounds w.r.t. the number of options provided via params.
+
+"builtin.module"() ({
+  "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
+  ^bb0(%arg0: !transform.any_op):
+    %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op
+    %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
+    // expected-error @below {{dynamic option index 1 is out of bounds for the number of dynamic options: 1}}
+    %2 = "transform.apply_registered_pass"(%1, %0) <{
+      options = {"max-iterations" = #transform.param_operand<index=1 : i64>,
+                 "test-convergence" = true,
+                 "top-down" = false},
+      pass_name = "canonicalize"}>
+    : (!transform.any_param, !transform.any_op) -> !transform.any_op
+    "transform.yield"() : () -> ()
+  }) : () -> ()
+}) {transform.with_named_sequence} : () -> ()
+
+// -----
+
+// Invalid due to param_operand occurences in options dict not being
+// one-to-one with the dynamic options provided as params:
+//   the first option-param is referred to twice and the second one not at all.
+// (In the pretty-printed format, if you want to refer to a param SSA-value twice, it counts as two param arguments.)
+
+"builtin.module"() ({
+  "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
+  ^bb0(%arg0: !transform.any_op):
+    %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op
+    %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
+    %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param
+    // expected-error @below {{dynamic option index 0 is already used in options}}
+    %3 = "transform.apply_registered_pass"(%1, %2, %0) <{
+      options = {"max-iterations" = #transform.param_operand<index=0 : i64>,
+                 "max-num-rewrites" = #transform.param_operand<index=0 : i64>,
+                 "test-convergence" = true,
+                 "top-down" = false},
+      pass_name = "canonicalize"}>
+    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    "transform.yield"() : () -> ()
+  }) : () -> ()
+}) {transform.with_named_sequence} : () -> ()
+
+// -----
+
+// Invalid due to param_operand occurences in options dict not being
+// one-to-one with the dynamic options provided as params:
+//   two option-params are provide though only the first one is referred to from the options-dict.
+
+"builtin.module"() ({
+  "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
+  ^bb0(%arg0: !transform.any_op):
+    %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op
+    %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
+    %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param
+    // expected-error @below {{a param operand does not have a corresponding param_operand attr in the options dict}}
+    %3 = "transform.apply_registered_pass"(%1, %2, %0) <{
+      options = {"max-iterations" = #transform.param_operand<index=0 : i64>,
+                 "test-convergence" = true,
+                 "top-down" = false},
+      pass_name = "canonicalize"}>
+    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    "transform.yield"() : () -> ()
+  }) : () -> ()
+}) {transform.with_named_sequence} : () -> ()
diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py
index 6ed4818fc9d2..48bc9bad37a1 100644
--- a/mlir/test/python/dialects/transform.py
+++ b/mlir/test/python/dialects/transform.py
@@ -254,3 +254,55 @@ def testReplicateOp(module: Module):
     # CHECK: %[[FIRST:.+]] = pdl_match
     # CHECK: %[[SECOND:.+]] = pdl_match
     # CHECK: %{{.*}} = replicate num(%[[FIRST]]) %[[SECOND]]
+
+
+@run
+def testApplyRegisteredPassOp(module: Module):
+    sequence = transform.SequenceOp(
+        transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get()
+    )
+    with InsertionPoint(sequence.body):
+        mod = transform.ApplyRegisteredPassOp(
+            transform.AnyOpType.get(), "canonicalize", sequence.bodyTarget
+        )
+        mod = transform.ApplyRegisteredPassOp(
+            transform.AnyOpType.get(),
+            "canonicalize",
+            mod.result,
+            options={"top-down": BoolAttr.get(False)},
+        )
+        max_iter = transform.param_constant(
+            transform.AnyParamType.get(),
+            IntegerAttr.get(IntegerType.get_signless(64), 10),
+        )
+        max_rewrites = transform.param_constant(
+            transform.AnyParamType.get(),
+            IntegerAttr.get(IntegerType.get_signless(64), 1),
+        )
+        transform.apply_registered_pass(
+            transform.AnyOpType.get(),
+            "canonicalize",
+            mod,
+            options={
+                "top-down": BoolAttr.get(False),
+                "max-iterations": max_iter,
+                "test-convergence": BoolAttr.get(True),
+                "max-rewrites": max_rewrites,
+            },
+        )
+        transform.YieldOp()
+    # CHECK-LABEL: TEST: testApplyRegisteredPassOp
+    # CHECK: transform.sequence
+    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op
+    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
+    # CHECK-SAME:    with options = {"top-down" = false}
+    # CHECK-SAME:    to {{.*}} : (!transform.any_op) -> !transform.any_op
+    # CHECK:   %[[MAX_ITER:.+]] = transform.param.constant
+    # CHECK:   %[[MAX_REWRITE:.+]] = transform.param.constant
+    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
+    # NB: MLIR has sorted the dict lexicographically by key:
+    # CHECK-SAME:    with options = {"max-iterations" = %[[MAX_ITER]],
+    # CHECK-SAME:                    "max-rewrites" =  %[[MAX_REWRITE]],
+    # CHECK-SAME:                    "test-convergence" = true,
+    # CHECK-SAME:                    "top-down" = false}
+    # CHECK-SAME:    to %{{.*}} : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op

From 459475020aeff15d0f886ab99c59d66b744d3e17 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Wed, 11 Jun 2025 16:35:55 +0100
Subject: [PATCH 0012/1322] Reapply 76197ea6f91f after removing an assertion

Specifically this is the assertion in BasicBlock.cpp. Now that we're not
examining or setting that flag consistently (because it'll be deleted in
about an hour) there's no need to keep this assertion.

Original commit title:

[DebugInfo][RemoveDIs] Remove some debug intrinsic-only codepaths (#143451)
---
 llvm/lib/CodeGen/SelectionDAG/FastISel.cpp |  3 -
 llvm/lib/IR/AutoUpgrade.cpp                | 25 ++----
 llvm/lib/IR/BasicBlock.cpp                 |  1 -
 llvm/lib/IR/DIBuilder.cpp                  | 97 +++++-----------------
 llvm/lib/IR/DebugInfo.cpp                  | 19 +----
 llvm/lib/Transforms/Utils/LoopUtils.cpp    | 36 +++-----
 llvm/unittests/IR/IRBuilderTest.cpp        | 10 ---
 7 files changed, 40 insertions(+), 151 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 59cd0dc8dd34..e8a3df3366b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1671,9 +1671,6 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc,
                               const DebugLoc &DbgLoc) {
   const BasicBlock *BB = FuncInfo.MBB->getBasicBlock();
   bool BlockHasMultipleInstrs = &BB->front() != &BB->back();
-  // Handle legacy case of debug intrinsics
-  if (BlockHasMultipleInstrs && !BB->getModule()->IsNewDbgInfoFormat)
-    BlockHasMultipleInstrs = BB->sizeWithoutDebug() > 1;
   if (BlockHasMultipleInstrs && FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
     // For more accurate line information if this is the only non-debug
     // instruction in the block then emit it, otherwise we have the
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index cb90af36f3d9..a0886776ff93 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -4490,7 +4490,6 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
   Builder.SetInsertPoint(CI->getParent(), CI->getIterator());
 
   if (!NewFn) {
-    bool FallthroughToDefaultUpgrade = false;
     // Get the Function's name.
     StringRef Name = F->getName();
 
@@ -4518,29 +4517,15 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     } else if (IsAMDGCN) {
       Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder);
     } else if (IsDbg) {
-      // We might have decided we don't want the new format after all between
-      // first requesting the upgrade and now; skip the conversion if that is
-      // the case, and check here to see if the intrinsic needs to be upgraded
-      // normally.
-      if (!CI->getModule()->IsNewDbgInfoFormat) {
-        bool NeedsUpgrade =
-            upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false);
-        if (!NeedsUpgrade)
-          return;
-        FallthroughToDefaultUpgrade = true;
-      } else {
-        upgradeDbgIntrinsicToDbgRecord(Name, CI);
-      }
+      upgradeDbgIntrinsicToDbgRecord(Name, CI);
     } else {
       llvm_unreachable("Unknown function for CallBase upgrade.");
     }
 
-    if (!FallthroughToDefaultUpgrade) {
-      if (Rep)
-        CI->replaceAllUsesWith(Rep);
-      CI->eraseFromParent();
-      return;
-    }
+    if (Rep)
+      CI->replaceAllUsesWith(Rep);
+    CI->eraseFromParent();
+    return;
   }
 
   const auto &DefaultCase = [&]() -> void {
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index f716e9970b84..62a75313bb17 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -60,7 +60,6 @@ void BasicBlock::convertToNewDbgValues() {
   // instruction.
   SmallVector<DbgRecord *, 4> DbgVarRecs;
   for (Instruction &I : make_early_inc_range(InstList)) {
-    assert(!I.DebugMarker && "DebugMarker already set on old-format instrs?");
     if (DbgVariableIntrinsic *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) {
       // Convert this dbg.value to a DbgVariableRecord.
       DbgVariableRecord *Value = new DbgVariableRecord(DVI);
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 5e5ff22132e9..1484c549dd58 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -1047,36 +1047,13 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
       LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID));
   assert(Link && "Linked instruction must have DIAssign metadata attached");
 
-  if (M.IsNewDbgInfoFormat) {
-    DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
-        Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
-    // Insert after LinkedInstr.
-    BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
-    NextIt.setHeadBit(true);
-    insertDbgVariableRecord(DVR, NextIt);
-    return DVR;
-  }
-
-  LLVMContext &Ctx = LinkedInstr->getContext();
-  Module *M = LinkedInstr->getModule();
-  if (!AssignFn)
-    AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign);
-
-  std::array<Value *, 6> Args = {
-      MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)),
-      MetadataAsValue::get(Ctx, SrcVar),
-      MetadataAsValue::get(Ctx, ValExpr),
-      MetadataAsValue::get(Ctx, Link),
-      MetadataAsValue::get(Ctx, ValueAsMetadata::get(Addr)),
-      MetadataAsValue::get(Ctx, AddrExpr),
-  };
-
-  IRBuilder<> B(Ctx);
-  B.SetCurrentDebugLocation(DL);
-
-  auto *DVI = cast<DbgAssignIntrinsic>(B.CreateCall(AssignFn, Args));
-  DVI->insertAfter(LinkedInstr->getIterator());
-  return DVI;
+  DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
+      Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
+  // Insert after LinkedInstr.
+  BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
+  NextIt.setHeadBit(true);
+  insertDbgVariableRecord(DVR, NextIt);
+  return DVR;
 }
 
 /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics.
@@ -1101,18 +1078,10 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val,
                                               DIExpression *Expr,
                                               const DILocation *DL,
                                               InsertPosition InsertPt) {
-  if (M.IsNewDbgInfoFormat) {
-    DbgVariableRecord *DVR =
-        DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
-    insertDbgVariableRecord(DVR, InsertPt);
-    return DVR;
-  }
-
-  if (!ValueFn)
-    ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value);
-  auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt);
-  cast<CallInst>(DVI)->setTailCall();
-  return DVI;
+  DbgVariableRecord *DVR =
+      DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
+  insertDbgVariableRecord(DVR, InsertPt);
+  return DVR;
 }
 
 DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
@@ -1124,25 +1093,10 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
              VarInfo->getScope()->getSubprogram() &&
          "Expected matching subprograms");
 
-  if (M.IsNewDbgInfoFormat) {
-    DbgVariableRecord *DVR =
-        DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
-    insertDbgVariableRecord(DVR, InsertPt);
-    return DVR;
-  }
-
-  if (!DeclareFn)
-    DeclareFn = getDeclareIntrin(M);
-
-  trackIfUnresolved(VarInfo);
-  trackIfUnresolved(Expr);
-  Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage),
-                   MetadataAsValue::get(VMContext, VarInfo),
-                   MetadataAsValue::get(VMContext, Expr)};
-
-  IRBuilder<> B(DL->getContext());
-  initIRBuilder(B, DL, InsertPt);
-  return B.CreateCall(DeclareFn, Args);
+  DbgVariableRecord *DVR =
+      DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
+  insertDbgVariableRecord(DVR, InsertPt);
+  return DVR;
 }
 
 void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR,
@@ -1191,23 +1145,12 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
          "Expected matching subprograms");
 
   trackIfUnresolved(LabelInfo);
-  if (M.IsNewDbgInfoFormat) {
-    DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL);
-    if (InsertPt.isValid()) {
-      auto *BB = InsertPt.getBasicBlock();
-      BB->insertDbgRecordBefore(DLR, InsertPt);
-    }
-    return DLR;
+  DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL);
+  if (InsertPt.isValid()) {
+    auto *BB = InsertPt.getBasicBlock();
+    BB->insertDbgRecordBefore(DLR, InsertPt);
   }
-
-  if (!LabelFn)
-    LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label);
-
-  Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)};
-
-  IRBuilder<> B(DL->getContext());
-  initIRBuilder(B, DL, InsertPt);
-  return B.CreateCall(LabelFn, Args);
+  return DLR;
 }
 
 void DIBuilder::replaceVTableHolder(DICompositeType *&T, DIType *VTableHolder) {
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 7db9891fdbd7..2a84e7bae0f1 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -2123,22 +2123,11 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
     Expr = *R;
   }
   DIExpression *AddrExpr = DIExpression::get(StoreLikeInst.getContext(), {});
-  if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) {
-    auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
-        &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
-    (void)Assign;
-    LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
-    return;
-  }
-  auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest,
-                                    AddrExpr, VarRec.DL);
+  auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
+      &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
   (void)Assign;
-  LLVM_DEBUG(if (!Assign.isNull()) {
-    if (const auto *Record = dyn_cast<DbgRecord *>(Assign))
-      errs() << " > INSERT: " << *Record << "\n";
-    else
-      errs() << " > INSERT: " << *cast<Instruction *>(Assign) << "\n";
-  });
+  LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
+  return;
 }
 
 #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h).
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 0681ebc111cb..ff69fa9f70c4 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -606,7 +606,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
 
   // Use a map to unique and a vector to guarantee deterministic ordering.
   llvm::SmallDenseSet<DebugVariable, 4> DeadDebugSet;
-  llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
   llvm::SmallVector<DbgVariableRecord *, 4> DeadDbgVariableRecords;
 
   if (ExitBlock) {
@@ -633,29 +632,19 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
           U.set(Poison);
         }
 
-        // RemoveDIs: do the same as below for DbgVariableRecords.
-        if (Block->IsNewDbgInfoFormat) {
-          for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
-                   filterDbgVars(I.getDbgRecordRange()))) {
-            DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
-                              DVR.getDebugLoc().get());
-            if (!DeadDebugSet.insert(Key).second)
-              continue;
-            // Unlinks the DVR from it's container, for later insertion.
-            DVR.removeFromParent();
-            DeadDbgVariableRecords.push_back(&DVR);
-          }
-        }
-
-        // For one of each variable encountered, preserve a debug intrinsic (set
+        // For one of each variable encountered, preserve a debug record (set
         // to Poison) and transfer it to the loop exit. This terminates any
         // variable locations that were set during the loop.
-        auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I);
-        if (!DVI)
-          continue;
-        if (!DeadDebugSet.insert(DebugVariable(DVI)).second)
-          continue;
-        DeadDebugInst.push_back(DVI);
+        for (DbgVariableRecord &DVR :
+             llvm::make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) {
+          DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
+                            DVR.getDebugLoc().get());
+          if (!DeadDebugSet.insert(Key).second)
+            continue;
+          // Unlinks the DVR from it's container, for later insertion.
+          DVR.removeFromParent();
+          DeadDbgVariableRecords.push_back(&DVR);
+        }
       }
 
     // After the loop has been deleted all the values defined and modified
@@ -671,9 +660,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
            "There should be a non-PHI instruction in exit block, else these "
            "instructions will have no parent.");
 
-    for (auto *DVI : DeadDebugInst)
-      DVI->moveBefore(*ExitBlock, InsertDbgValueBefore);
-
     // Due to the "head" bit in BasicBlock::iterator, we're going to insert
     // each DbgVariableRecord right at the start of the block, wheras dbg.values
     // would be repeatedly inserted before the first instruction. To replicate
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 3a7ba924792e..aadae5287c38 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -1003,18 +1003,8 @@ TEST_F(IRBuilderTest, DIBuilder) {
     EXPECT_TRUE(verifyModule(*M));
   };
 
-  // Test in new-debug mode.
-  EXPECT_TRUE(M->IsNewDbgInfoFormat);
   RunTest();
-
-  // Test in old-debug mode.
-  // Reset the test then call convertFromNewDbgValues to flip the flag
-  // on the test's Module, Function and BasicBlock.
   TearDown();
-  SetUp();
-  M->convertFromNewDbgValues();
-  EXPECT_FALSE(M->IsNewDbgInfoFormat);
-  RunTest();
 }
 
 TEST_F(IRBuilderTest, createArtificialSubprogram) {

From f1575de4c5de9268f92eea1641af755a477e4ee4 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Wed, 11 Jun 2025 11:37:12 -0500
Subject: [PATCH 0013/1322] [libc][NFC] Remove template from GPU allocator
 reference counter

Summary:
We don't need this to be generic, precommit for
https://github.com/llvm/llvm-project/pull/143607
---
 libc/src/__support/GPU/allocator.cpp | 32 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 135ced3df704..ecc0de1cb6ec 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -283,7 +283,7 @@ struct Slab {
 
 /// A wait-free guard around a pointer resource to be created dynamically if
 /// space is available and freed once there are no more users.
-template <typename T> struct GuardPtr {
+struct GuardPtr {
 private:
   struct RefCounter {
     // Indicates that the object is in its deallocation phase and thus invalid.
@@ -339,22 +339,22 @@ private:
     cpp::Atomic<uint64_t> counter{0};
   };
 
-  cpp::Atomic<T *> ptr{nullptr};
+  cpp::Atomic<Slab *> ptr{nullptr};
   RefCounter ref{};
 
   // Should be called be a single lane for each different pointer.
   template <typename... Args>
-  T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
-    T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
+  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
-        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
-                                    cpp::MemoryOrder::RELAXED,
-                                    cpp::MemoryOrder::RELAXED)) {
+        ptr.compare_exchange_strong(
+            expected, reinterpret_cast<Slab *>(SENTINEL),
+            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
-      void *raw = impl::rpc_allocate(sizeof(T));
+      void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
-      T *mem = new (raw) T(cpp::forward<Args>(args)...);
+      Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);
 
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(mem, cpp::MemoryOrder::RELAXED);
@@ -364,7 +364,7 @@ private:
       return mem;
     }
 
-    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
+    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
       return nullptr;
 
     if (!ref.acquire(n, count))
@@ -379,10 +379,10 @@ public:
   // The uniform mask represents which lanes share the same pointer. For each
   // uniform value we elect a leader to handle it on behalf of the other lanes.
   template <typename... Args>
-  T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
-              Args &&...args) {
+  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+                 Args &&...args) {
     count = 0;
-    T *result = nullptr;
+    Slab *result = nullptr;
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
       result = try_lock_impl(cpp::popcount(uniform), count,
                              cpp::forward<Args>(args)...);
@@ -403,8 +403,8 @@ public:
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
         ref.release(cpp::popcount(mask))) {
-      T *p = ptr.load(cpp::MemoryOrder::RELAXED);
-      p->~T();
+      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
+      p->~Slab();
       impl::rpc_free(p);
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
@@ -417,7 +417,7 @@ public:
 };
 
 // The global array used to search for a valid slab to allocate from.
-static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
+static GuardPtr slots[ARRAY_SIZE] = {};
 
 // Tries to find a slab in the table that can support the given chunk size.
 static Slab *find_slab(uint32_t chunk_size) {

From aa8a1fa6f515f45db55365b9c1f8453ded24ed32 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Wed, 11 Jun 2025 18:42:10 +0200
Subject: [PATCH 0014/1322] [DLCov][NFC] Annotate intentionally-blank DebugLocs
 in existing code (#136192)

Following the work in PR #107279, this patch applies the annotative
DebugLocs, which indicate that a particular instruction is intentionally
missing a location for a given reason, to existing sites in the compiler
where their conditions apply. This is NFC in ordinary LLVM builds (each
function `DebugLoc::getFoo()` is inlined as `DebugLoc()`), but marks the
instruction in coverage-tracking builds so that it will be ignored by
Debugify, allowing only real errors to be reported. From a developer
standpoint, it also communicates the intentionality and reason for a
missing DebugLoc.

Some notes for reviewers:

- The difference between `I->dropLocation()` and
`I->setDebugLoc(DebugLoc::getDropped())` is that the former _may_ decide
to keep some debug info alive, while the latter will always be empty; in
this patch, I always used the latter (even if the former could
technically be correct), because the former could result in some
(barely) different output, and I'd prefer to keep this patch purely NFC.
- I've generally documented the uses of `DebugLoc::getUnknown()`, with
the exception of the vectorizers - in summary, they are a huge cause of
dropped source locations, and I don't have the time or the domain
knowledge currently to solve that, so I've plastered it all over them as
a form of "fixme".
---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 10 ++++--
 llvm/lib/Transforms/IPO/IROutliner.cpp        |  4 +--
 .../Transforms/InstCombine/InstCombinePHI.cpp |  9 ++++-
 .../Scalar/CorrelatedValuePropagation.cpp     |  3 +-
 llvm/lib/Transforms/Scalar/IndVarSimplify.cpp |  3 ++
 llvm/lib/Transforms/Scalar/JumpThreading.cpp  |  4 ++-
 llvm/lib/Transforms/Scalar/LICM.cpp           |  4 ++-
 .../Transforms/Scalar/LoopLoadElimination.cpp |  3 +-
 .../Transforms/Scalar/SimpleLoopUnswitch.cpp  |  3 ++
 .../Scalar/TailRecursionElimination.cpp       |  4 ++-
 llvm/lib/Transforms/Utils/InlineFunction.cpp  |  9 +++++
 llvm/lib/Transforms/Utils/Local.cpp           |  3 +-
 llvm/lib/Transforms/Utils/SCCPSolver.cpp      |  4 ++-
 llvm/lib/Transforms/Utils/SSAUpdater.cpp      |  5 +++
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 10 +++---
 .../Vectorize/LoopVectorizationPlanner.h      | 34 ++++++++++++-------
 .../Transforms/Vectorize/LoopVectorize.cpp    |  8 +++--
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 12 +++++--
 llvm/lib/Transforms/Vectorize/VPlan.h         |  6 ++--
 19 files changed, 101 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index b3fe0ab8b5cb..7db058638650 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1494,8 +1494,14 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
     // FIXME: Pass Global's alignment when globals have alignment
     AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(),
                                         nullptr, GV->getName(), FirstI);
-    if (!isa<UndefValue>(GV->getInitializer()))
-      new StoreInst(GV->getInitializer(), Alloca, FirstI);
+    Alloca->setDebugLoc(DebugLoc::getCompilerGenerated());
+    if (!isa<UndefValue>(GV->getInitializer())) {
+      auto *SI = new StoreInst(GV->getInitializer(), Alloca, FirstI);
+      // FIXME: We're localizing a global and creating a store instruction for
+      // the initial value of that global. Could we logically use the global
+      // variable's (if one exists) line for this?
+      SI->setDebugLoc(DebugLoc::getCompilerGenerated());
+    }
 
     GV->replaceAllUsesWith(Alloca);
     GV->eraseFromParent();
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index ff66a518be75..cb18b55ae218 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -730,7 +730,7 @@ static void moveFunctionData(Function &Old, Function &New,
       // other outlined instructions.
       if (!isa<CallInst>(&Val)) {
         // Remove the debug information for outlined functions.
-        Val.setDebugLoc(DebugLoc());
+        Val.setDebugLoc(DebugLoc::getDropped());
 
         // Loop info metadata may contain line locations. Update them to have no
         // value in the new subprogram since the outlined code could be from
@@ -1864,7 +1864,7 @@ replaceArgumentUses(OutlinableRegion &Region,
       Value *ValueOperand = SI->getValueOperand();
 
       StoreInst *NewI = cast<StoreInst>(I->clone());
-      NewI->setDebugLoc(DebugLoc());
+      NewI->setDebugLoc(DebugLoc::getDropped());
       BasicBlock *OutputBB = VBBIt->second;
       NewI->insertInto(OutputBB, OutputBB->end());
       LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index a842a5edcb8a..6477141ab095 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -870,7 +870,14 @@ Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) {
     NewPhi->addIncoming(NewIncoming[I], Phi.getIncomingBlock(I));
 
   InsertNewInstBefore(NewPhi, Phi.getIterator());
-  return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
+  auto *CI = CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
+
+  // We use a dropped location here because the new ZExt is necessarily a merge
+  // of ZExtInsts and at least one constant from incoming branches; the presence
+  // of the constant means we have no viable DebugLoc from that branch, and
+  // therefore we must use a dropped location.
+  CI->setDebugLoc(DebugLoc::getDropped());
+  return CI;
 }
 
 /// If all operands to a PHI node are the same "unary" operator and they all are
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index b95a851c99b4..4627f537dc16 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -432,7 +432,8 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
       BasicBlock *NewUnreachableBB =
           BasicBlock::Create(BB->getContext(), "default.unreachable",
                              BB->getParent(), DefaultDest);
-      new UnreachableInst(BB->getContext(), NewUnreachableBB);
+      auto *UI = new UnreachableInst(BB->getContext(), NewUnreachableBB);
+      UI->setDebugLoc(DebugLoc::getTemporary());
 
       DefaultDest->removePredecessor(BB);
       SI->setDefaultDest(NewUnreachableBB);
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 95d52b9b4e18..334c911191cb 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1506,6 +1506,9 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) {
       auto *NewRHS = CastInst::Create(
           Instruction::Trunc, RHS, LHSOp->getType(), "",
           L->getLoopPreheader()->getTerminator()->getIterator());
+      // NewRHS is an operation that has been hoisted out of the loop, and
+      // therefore should have a dropped location.
+      NewRHS->setDebugLoc(DebugLoc::getDropped());
       ICmp->setOperand(Swapped ? 1 : 0, LHSOp);
       ICmp->setOperand(Swapped ? 0 : 1, NewRHS);
       // Samesign flag cannot be preserved after narrowing the compare.
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 9449b4cb35b9..37b85bf9de81 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -3001,8 +3001,10 @@ bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) {
       continue;
     // Expand the select.
     Value *Cond = SI->getCondition();
-    if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI))
+    if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) {
       Cond = new FreezeInst(Cond, "cond.fr", SI->getIterator());
+      cast<FreezeInst>(Cond)->setDebugLoc(DebugLoc::getTemporary());
+    }
     MDNode *BranchWeights = getBranchWeightMDNode(*SI);
     Instruction *Term =
         SplitBlockAndInsertIfThen(Cond, SI, false, BranchWeights);
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9773ef778b69..3024ccb330b1 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2248,7 +2248,7 @@ bool llvm::promoteLoopAccessesToScalars(
     if (SawUnorderedAtomic)
       PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
     PreheaderLoad->setAlignment(Alignment);
-    PreheaderLoad->setDebugLoc(DebugLoc());
+    PreheaderLoad->setDebugLoc(DebugLoc::getDropped());
     if (AATags && LoadIsGuaranteedToExecute)
       PreheaderLoad->setAAMetadata(AATags);
 
@@ -2808,6 +2808,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
     auto *NewBO =
         BinaryOperator::Create(Ins->getOpcode(), LHS, RHS,
                                Ins->getName() + ".reass", Ins->getIterator());
+    NewBO->setDebugLoc(DebugLoc::getDropped());
     NewBO->copyIRFlags(Ins);
     if (VariantOp == Ins)
       VariantOp = NewBO;
@@ -2864,6 +2865,7 @@ static bool hoistBOAssociation(Instruction &I, Loop &L,
 
   auto *NewBO = BinaryOperator::Create(
       Opcode, LV, Inv, BO->getName() + ".reass", BO->getIterator());
+  NewBO->setDebugLoc(DebugLoc::getDropped());
 
   if (Opcode == Instruction::FAdd || Opcode == Instruction::FMul) {
     // Intersect FMF flags for FADD and FMUL.
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 39e8d702a692..6bdf76f789a4 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -442,7 +442,7 @@ public:
     assert(PH && "Preheader should exist!");
     Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
                                           PH->getTerminator());
-    Value *Initial =
+    Instruction *Initial =
         new LoadInst(Cand.Load->getType(), InitialPtr, "load_initial",
                      /* isVolatile */ false, Cand.Load->getAlign(),
                      PH->getTerminator()->getIterator());
@@ -450,6 +450,7 @@ public:
     // into the loop's preheader. A debug location inside the loop will cause
     // a misleading stepping when debugging. The test update-debugloc-store
     // -forwarded.ll checks this.
+    Initial->setDebugLoc(DebugLoc::getDropped());
 
     PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded");
     PHI->insertBefore(L->getHeader()->begin());
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 0bf90036b8b8..9b40fc03da6b 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -274,6 +274,7 @@ static void buildPartialUnswitchConditionalBranch(
     BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze,
     const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) {
   IRBuilder<> IRB(&BB);
+  IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
 
   SmallVector<Value *> FrozenInvariants;
   for (Value *Inv : Invariants) {
@@ -330,6 +331,7 @@ static void buildPartialInvariantUnswitchConditionalBranch(
   }
 
   IRBuilder<> IRB(&BB);
+  IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
   Value *Cond = VMap[ToDuplicate[0]];
   IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
                    Direction ? &NormalSucc : &UnswitchedSucc);
@@ -2369,6 +2371,7 @@ static void unswitchNontrivialInvariants(
         // BI (`dyn_cast<BranchInst>(TI)`) is an in-loop instruction hoisted
         // out of the loop.
         Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI->getIterator());
+        cast<Instruction>(Cond)->setDebugLoc(DebugLoc::getDropped());
       }
       BI->setCondition(Cond);
       DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 7dd6c60370ed..c71c5a70a12f 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -515,7 +515,8 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
   BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
   NewEntry->takeName(HeaderBB);
   HeaderBB->setName("tailrecurse");
-  BranchInst::Create(HeaderBB, NewEntry);
+  auto *BI = BranchInst::Create(HeaderBB, NewEntry);
+  BI->setDebugLoc(DebugLoc::getCompilerGenerated());
   // If the new branch preserves the debug location of CI, it could result in
   // misleading stepping, if CI is located in a conditional branch.
   // So, here we don't give any debug location to the new branch.
@@ -801,6 +802,7 @@ void TailRecursionEliminator::cleanupAndFinalize() {
         SelectInst *SI =
             SelectInst::Create(RetKnownPN, RetPN, RI->getOperand(0),
                                "current.ret.tr", RI->getIterator());
+        SI->setDebugLoc(DebugLoc::getCompilerGenerated());
         RetSelects.push_back(SI);
         RI->setOperand(0, SI);
       }
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 7a9605bf5f8d..f47c467d1514 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1775,6 +1775,7 @@ static Value *HandleByValArgument(Type *ByValType, Value *Arg,
   AllocaInst *NewAlloca =
       new AllocaInst(ByValType, Arg->getType()->getPointerAddressSpace(),
                      nullptr, Alignment, Arg->getName());
+  NewAlloca->setDebugLoc(DebugLoc::getCompilerGenerated());
   NewAlloca->insertBefore(Caller->begin()->begin());
   IFI.StaticAllocas.push_back(NewAlloca);
 
@@ -3258,6 +3259,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
 
     // Add an unconditional branch to make this look like the CallInst case...
     CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), CB.getIterator());
+    // We intend to replace this DebugLoc with another later.
+    CreatedBranchToNormalDest->setDebugLoc(DebugLoc::getTemporary());
 
     // Split the basic block.  This guarantees that no PHI nodes will have to be
     // updated due to new incoming edges, and make the invoke case more
@@ -3359,6 +3362,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     Returns[0]->eraseFromParent();
     ReturnBB->eraseFromParent();
   } else if (!CB.use_empty()) {
+    // In this case there are no returns to use, so there is no clear source
+    // location for the "return".
+    // FIXME: It may be correct to use the scope end line of the function here,
+    // since this likely means we are falling out of the function.
+    if (CreatedBranchToNormalDest)
+      CreatedBranchToNormalDest->setDebugLoc(DebugLoc::getUnknown());
     // No returns, but something is using the return value of the call.  Just
     // nuke the result.
     CB.replaceAllUsesWith(PoisonValue::get(CB.getType()));
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 2630a1a7a6af..a3252a69874d 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3127,7 +3127,8 @@ static bool markAliveBlocks(Function &F,
           BasicBlock *UnreachableNormalDest = BasicBlock::Create(
               Ctx, OrigNormalDest->getName() + ".unreachable",
               II->getFunction(), OrigNormalDest);
-          new UnreachableInst(Ctx, UnreachableNormalDest);
+          auto *UI = new UnreachableInst(Ctx, UnreachableNormalDest);
+          UI->setDebugLoc(DebugLoc::getTemporary());
           II->setNormalDest(UnreachableNormalDest);
           if (DTU)
             DTU->applyUpdates(
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 1a2e42235627..f4b378b82dae 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -348,7 +348,9 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU,
         NewUnreachableBB =
             BasicBlock::Create(DefaultDest->getContext(), "default.unreachable",
                                DefaultDest->getParent(), DefaultDest);
-        new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB);
+        auto *UI =
+            new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB);
+        UI->setDebugLoc(DebugLoc::getTemporary());
       }
 
       DefaultDest->removePredecessor(BB);
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 48d9528f0c3d..5db7fc956c49 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -318,6 +318,11 @@ public:
                                SSAUpdater *Updater) {
     PHINode *PHI =
         PHINode::Create(Updater->ProtoType, NumPreds, Updater->ProtoName);
+    // FIXME: Ordinarily we don't care about or try to assign DebugLocs to PHI
+    // nodes, but loop optimizations may try to use a PHI node as a DebugLoc
+    // source (e.g. if this is an induction variable), and it's not clear what
+    // location we could attach here, so mark this unknown for now.
+    PHI->setDebugLoc(DebugLoc::getUnknown());
     PHI->insertBefore(BB->begin());
     return PHI;
   }
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index e221022bb836..975ce3bef517 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1137,7 +1137,7 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
         // branch, drop it. When we fold the bonus instructions we want to make
         // sure we reset their debug locations in order to avoid stepping on
         // dead code caused by folding dead branches.
-        NewBonusInst->setDebugLoc(DebugLoc());
+        NewBonusInst->setDebugLoc(DebugLoc::getDropped());
       } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) {
         mapAtomInstance(DL, VMap);
       }
@@ -2821,7 +2821,8 @@ static void mergeCompatibleInvokesImpl(ArrayRef<InvokeInst *> Invokes,
       // so just form a new block with unreachable terminator.
       BasicBlock *MergedNormalDest = BasicBlock::Create(
           Ctx, II0BB->getName() + ".cont", Func, InsertBeforeBlock);
-      new UnreachableInst(Ctx, MergedNormalDest);
+      auto *UI = new UnreachableInst(Ctx, MergedNormalDest);
+      UI->setDebugLoc(DebugLoc::getTemporary());
       MergedInvoke->setNormalDest(MergedNormalDest);
     }
 
@@ -3389,7 +3390,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
     if (!SpeculatedStoreValue || &I != SpeculatedStore) {
       // Don't update the DILocation of dbg.assign intrinsics.
       if (!isa<DbgAssignIntrinsic>(&I))
-        I.setDebugLoc(DebugLoc());
+        I.setDebugLoc(DebugLoc::getDropped());
     }
     I.dropUBImplyingAttrsAndMetadata();
 
@@ -5707,7 +5708,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch,
   BasicBlock *NewDefaultBlock = BasicBlock::Create(
       BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
       OrigDefaultBlock);
-  new UnreachableInst(Switch->getContext(), NewDefaultBlock);
+  auto *UI = new UnreachableInst(Switch->getContext(), NewDefaultBlock);
+  UI->setDebugLoc(DebugLoc::getTemporary());
   Switch->setDefaultDest(&*NewDefaultBlock);
   if (DTU) {
     SmallVector<DominatorTree::UpdateType, 2> Updates;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index b81d582f07e8..70f541d64b30 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -153,7 +153,7 @@ public:
   VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
                               Instruction *Inst = nullptr,
                               const Twine &Name = "") {
-    DebugLoc DL;
+    DebugLoc DL = DebugLoc::getUnknown();
     if (Inst)
       DL = Inst->getDebugLoc();
     VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name);
@@ -165,7 +165,8 @@ public:
     return createInstruction(Opcode, Operands, DL, Name);
   }
   VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                              const VPIRFlags &Flags, DebugLoc DL = {},
+                              const VPIRFlags &Flags,
+                              DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(Opcode, Operands, Flags, DL, Name));
@@ -174,7 +175,8 @@ public:
   VPInstruction *createNaryOp(unsigned Opcode,
                               std::initializer_list<VPValue *> Operands,
                               Type *ResultTy, const VPIRFlags &Flags = {},
-                              DebugLoc DL = {}, const Twine &Name = "") {
+                              DebugLoc DL = DebugLoc::getUnknown(),
+                              const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name));
   }
@@ -182,22 +184,25 @@ public:
   VPInstruction *createOverflowingOp(unsigned Opcode,
                                      std::initializer_list<VPValue *> Operands,
                                      VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
-                                     DebugLoc DL = {}, const Twine &Name = "") {
+                                     DebugLoc DL = DebugLoc::getUnknown(),
+                                     const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
   }
 
-  VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
+  VPValue *createNot(VPValue *Operand, DebugLoc DL = DebugLoc::getUnknown(),
                      const Twine &Name = "") {
     return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
   }
 
-  VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
+  VPValue *createAnd(VPValue *LHS, VPValue *RHS,
+                     DebugLoc DL = DebugLoc::getUnknown(),
                      const Twine &Name = "") {
     return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name);
   }
 
-  VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
+  VPValue *createOr(VPValue *LHS, VPValue *RHS,
+                    DebugLoc DL = DebugLoc::getUnknown(),
                     const Twine &Name = "") {
 
     return tryInsertInstruction(new VPInstruction(
@@ -205,14 +210,16 @@ public:
         VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name));
   }
 
-  VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
+  VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS,
+                            DebugLoc DL = DebugLoc::getUnknown(),
                             const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name));
   }
 
   VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
-                        DebugLoc DL = {}, const Twine &Name = "",
+                        DebugLoc DL = DebugLoc::getUnknown(),
+                        const Twine &Name = "",
                         std::optional<FastMathFlags> FMFs = std::nullopt) {
     auto *Select =
         FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
@@ -226,20 +233,23 @@ public:
   /// and \p B.
   /// TODO: add createFCmp when needed.
   VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
-                      DebugLoc DL = {}, const Twine &Name = "") {
+                      DebugLoc DL = DebugLoc::getUnknown(),
+                      const Twine &Name = "") {
     assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
            Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
     return tryInsertInstruction(
         new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name));
   }
 
-  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
+  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
+                              DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
                           GEPNoWrapFlags::none(), DL, Name));
   }
-  VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
+  VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset,
+                                DebugLoc DL = DebugLoc::getUnknown(),
                                 const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 427c1460fcfc..2a237f42e404 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -772,7 +772,7 @@ protected:
 /// Look for a meaningful debug location on the instruction or its operands.
 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
   if (!I)
-    return DebugLoc();
+    return DebugLoc::getUnknown();
 
   DebugLoc Empty;
   if (I->getDebugLoc() != Empty)
@@ -1881,13 +1881,15 @@ public:
     if (SCEVCheckBlock) {
       SCEVCheckBlock->getTerminator()->moveBefore(
           Preheader->getTerminator()->getIterator());
-      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
+      auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
+      UI->setDebugLoc(DebugLoc::getTemporary());
       Preheader->getTerminator()->eraseFromParent();
     }
     if (MemCheckBlock) {
       MemCheckBlock->getTerminator()->moveBefore(
           Preheader->getTerminator()->getIterator());
-      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
+      auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
+      UI->setDebugLoc(DebugLoc::getTemporary());
       Preheader->getTerminator()->eraseFromParent();
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ec40124c57a6..c3ca22dce0cc 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17434,6 +17434,12 @@ static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
   return llvm::propagateMetadata(Inst, Insts);
 }
 
+static DebugLoc getDebugLocFromPHI(PHINode &PN) {
+  if (DebugLoc DL = PN.getDebugLoc())
+    return DL;
+  return DebugLoc::getUnknown();
+}
+
 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilderBase::InsertPointGuard Guard(Builder);
 
@@ -17599,14 +17605,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       auto *PH = cast<PHINode>(VL0);
       Builder.SetInsertPoint(PH->getParent(),
                              PH->getParent()->getFirstNonPHIIt());
-      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+      Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
       Value *V = NewPhi;
 
       // Adjust insertion point once all PHI's have been generated.
       Builder.SetInsertPoint(PH->getParent(),
                              PH->getParent()->getFirstInsertionPt());
-      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+      Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
 
       V = FinalShuffle(V, E);
 
@@ -17638,7 +17644,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         }
 
         Builder.SetInsertPoint(IBB->getTerminator());
-        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+        Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
         Value *Vec = vectorizeOperand(E, I);
         if (VecTy != Vec->getType()) {
           assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bbcbfee4e471..acc861b99197 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1816,9 +1816,9 @@ public:
 class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors {
 protected:
   VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr,
-                    VPValue *Start, DebugLoc DL = {})
-      : VPSingleDefRecipe(VPDefID, ArrayRef<VPValue *>({Start}), UnderlyingInstr, DL) {
-  }
+                    VPValue *Start, DebugLoc DL = DebugLoc::getUnknown())
+      : VPSingleDefRecipe(VPDefID, ArrayRef<VPValue *>({Start}),
+                          UnderlyingInstr, DL) {}
 
   const VPRecipeBase *getAsRecipe() const override { return this; }
 

From 117e78fe5012087c1ee535b91936bf4d8e3c7785 Mon Sep 17 00:00:00 2001
From: William <113542065+saturn691@users.noreply.github.com>
Date: Wed, 11 Jun 2025 17:51:34 +0100
Subject: [PATCH 0015/1322] [libc] Add NULL macro definitions to header files
 (#142764)

By the C standard, <locale.h>, <stddef.h> <stdio.h>, <stdlib.h>,
<string.h>, <time.h>, and <wchar.h> require NULL to be defined.
---
 libc/include/CMakeLists.txt | 5 +++++
 libc/include/locale.yaml    | 3 +++
 libc/include/stdio.yaml     | 2 ++
 libc/include/stdlib.yaml    | 4 +++-
 libc/include/string.h.def   | 2 --
 libc/include/string.yaml    | 4 +++-
 libc/include/time.yaml      | 4 +++-
 libc/include/wchar.yaml     | 4 +++-
 8 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 7209e10c68b8..55268d19529c 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -255,6 +255,7 @@ add_header_macro(
   time.h
   DEPENDS
     .llvm_libc_common_h
+    .llvm-libc-macros.null_macro
     .llvm-libc-macros.time_macros
     .llvm-libc-types.clock_t
     .llvm-libc-types.time_t
@@ -329,6 +330,7 @@ add_header_macro(
   stdio.h
   DEPENDS
     .llvm-libc-macros.file_seek_macros
+    .llvm-libc-macros.null_macro
     .llvm-libc-macros.stdio_macros
     .llvm-libc-types.FILE
     .llvm-libc-types.cookie_io_functions_t
@@ -343,6 +345,7 @@ add_header_macro(
   ../libc/include/stdlib.yaml
   stdlib.h
   DEPENDS
+    .llvm-libc-macros.null_macro
     .llvm-libc-macros.stdlib_macros
     .llvm-libc-types.__atexithandler_t
     .llvm-libc-types.__qsortcompare_t
@@ -709,6 +712,7 @@ add_header_macro(
   wchar.h
   DEPENDS
     .llvm_libc_common_h
+    .llvm-libc-macros.null_macro
     .llvm-libc-macros.wchar_macros
     .llvm-libc-types.mbstate_t
     .llvm-libc-types.size_t
@@ -723,6 +727,7 @@ add_header_macro(
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.locale_macros
+    .llvm-libc-macros.null_macro
     .llvm-libc-types.locale_t
     .llvm-libc-types.struct_lconv
 )
diff --git a/libc/include/locale.yaml b/libc/include/locale.yaml
index 6c71b70e59f0..4566984ad83a 100644
--- a/libc/include/locale.yaml
+++ b/libc/include/locale.yaml
@@ -1,5 +1,8 @@
 header: locale.h
 header_template: locale.h.def
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: locale_t
   - type_name: struct_lconv
diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml
index 2619984cca26..3d5164fa10ff 100644
--- a/libc/include/stdio.yaml
+++ b/libc/include/stdio.yaml
@@ -1,6 +1,8 @@
 header: stdio.h
 header_template: stdio.h.def
 macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
   - macro_name: stdout
     macro_value: stdout
   - macro_name: stdin
diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml
index f7155ba27a16..3b2ff13c684b 100644
--- a/libc/include/stdlib.yaml
+++ b/libc/include/stdlib.yaml
@@ -4,7 +4,9 @@ standards:
   - stdc
 merge_yaml_files:
   - stdlib-malloc.yaml
-macros: []
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: __atexithandler_t
   - type_name: __qsortcompare_t
diff --git a/libc/include/string.h.def b/libc/include/string.h.def
index 1bd2687db2be..339d005e43a4 100644
--- a/libc/include/string.h.def
+++ b/libc/include/string.h.def
@@ -11,8 +11,6 @@
 
 #include "__llvm-libc-common.h"
 
-#include "llvm-libc-macros/null-macro.h"
-
 %%public_api()
 
 #endif // LLVM_LIBC_STRING_H
diff --git a/libc/include/string.yaml b/libc/include/string.yaml
index 9f72b8db6c1e..736deceb453d 100644
--- a/libc/include/string.yaml
+++ b/libc/include/string.yaml
@@ -1,6 +1,8 @@
 header: string.h
 header_template: string.h.def
-macros: []
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: locale_t
   - type_name: size_t
diff --git a/libc/include/time.yaml b/libc/include/time.yaml
index 7bb25dbe85ac..3b9d77c0aaae 100644
--- a/libc/include/time.yaml
+++ b/libc/include/time.yaml
@@ -1,6 +1,8 @@
 header: time.h
 header_template: time.h.def
-macros: []
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: struct_timeval
   - type_name: clockid_t
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 877be48b6a10..57f4f6660827 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -1,6 +1,8 @@
 header: wchar.h
 header_template: wchar.h.def
-macros: []
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: size_t
   - type_name: wint_t

From 469922f7c40a1733fba98e29fa2bd09a9565ddd6 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Wed, 11 Jun 2025 16:57:23 +0000
Subject: [PATCH 0016/1322] [X86] Don't emit ENDBR for asm goto branch targets
 (#143439)

Similarly to #141562, which disabled BTI generation for ARM asm goto
branch targets, drop unnecessary ENDBRs from IsInlineAsmBrIndirectTarget
machine basic blocks.
---
 .../Target/X86/X86IndirectBranchTracking.cpp  |  2 +-
 llvm/test/CodeGen/X86/callbr-asm-endbr.ll     | 34 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/X86/callbr-asm-endbr.ll

diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index 7740a174af4f..52be14228e55 100644
--- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -147,7 +147,7 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
   for (auto &MBB : MF) {
     // Find all basic blocks that their address was taken (for example
     // in the case of indirect jump) and add ENDBR instruction.
-    if (MBB.hasAddressTaken())
+    if (MBB.isMachineBlockAddressTaken() || MBB.isIRBlockAddressTaken())
       Changed |= addENDBR(MBB, MBB.begin());
 
     for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
diff --git a/llvm/test/CodeGen/X86/callbr-asm-endbr.ll b/llvm/test/CodeGen/X86/callbr-asm-endbr.ll
new file mode 100644
index 000000000000..133de89d5f3a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/callbr-asm-endbr.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define i32 @test1(i32 %a) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    endbr64
+; CHECK-NEXT:    addl $4, %edi
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    jmp .LBB0_2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:  # %bb.1: # %normal
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_2: # Inline asm indirect target
+; CHECK-NEXT:    # %fail
+; CHECK-NEXT:    # Label of block must be emitted
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    retq
+entry:
+  %0 = add i32 %a, 4
+  callbr void asm "xorl $0, $0; jmp ${1:l}", "r,!i,~{dirflag},~{fpsr},~{flags}"(i32 %0) to label %normal [label %fail]
+
+normal:
+  ret i32 0
+
+fail:
+  ret i32 1
+}
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 8, !"cf-protection-branch", i32 1}

From 145b1b0f103e61cfc8a47ed37080e955630a1390 Mon Sep 17 00:00:00 2001
From: Felipe de Azevedo Piovezan <fpiovezan@apple.com>
Date: Wed, 11 Jun 2025 09:57:42 -0700
Subject: [PATCH 0017/1322] [lldb][nfc] Factor out code checking if Variable is
 in scope (#143572)

This is useful for checking whether a variable is in scope inside a
specific block.
---
 lldb/include/lldb/Symbol/Variable.h |  3 ++
 lldb/source/Symbol/Variable.cpp     | 46 +++++++++++++++--------------
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/lldb/include/lldb/Symbol/Variable.h b/lldb/include/lldb/Symbol/Variable.h
index c437624d1ea6..5b9c709c8b86 100644
--- a/lldb/include/lldb/Symbol/Variable.h
+++ b/lldb/include/lldb/Symbol/Variable.h
@@ -89,6 +89,9 @@ public:
 
   bool IsInScope(StackFrame *frame);
 
+  /// Returns true if this variable is in scope at `addr` inside `block`.
+  bool IsInScope(const Block &block, const Address &addr);
+
   bool LocationIsValidForFrame(StackFrame *frame);
 
   bool LocationIsValidForAddress(const Address &address);
diff --git a/lldb/source/Symbol/Variable.cpp b/lldb/source/Symbol/Variable.cpp
index 8244725aba54..af32e0e958e5 100644
--- a/lldb/source/Symbol/Variable.cpp
+++ b/lldb/source/Symbol/Variable.cpp
@@ -290,28 +290,9 @@ bool Variable::IsInScope(StackFrame *frame) {
       // this variable was defined in is currently
       Block *deepest_frame_block =
           frame->GetSymbolContext(eSymbolContextBlock).block;
-      if (deepest_frame_block) {
-        SymbolContext variable_sc;
-        CalculateSymbolContext(&variable_sc);
-
-        // Check for static or global variable defined at the compile unit
-        // level that wasn't defined in a block
-        if (variable_sc.block == nullptr)
-          return true;
-
-        // Check if the variable is valid in the current block
-        if (variable_sc.block != deepest_frame_block &&
-            !variable_sc.block->Contains(deepest_frame_block))
-          return false;
-
-        // If no scope range is specified then it means that the scope is the
-        // same as the scope of the enclosing lexical block.
-        if (m_scope_range.IsEmpty())
-          return true;
-
-        addr_t file_address = frame->GetFrameCodeAddress().GetFileAddress();
-        return m_scope_range.FindEntryThatContains(file_address) != nullptr;
-      }
+      Address frame_addr = frame->GetFrameCodeAddress();
+      if (deepest_frame_block)
+        return IsInScope(*deepest_frame_block, frame_addr);
     }
     break;
 
@@ -321,6 +302,27 @@ bool Variable::IsInScope(StackFrame *frame) {
   return false;
 }
 
+bool Variable::IsInScope(const Block &block, const Address &addr) {
+  SymbolContext variable_sc;
+  CalculateSymbolContext(&variable_sc);
+
+  // Check for static or global variable defined at the compile unit
+  // level that wasn't defined in a block
+  if (variable_sc.block == nullptr)
+    return true;
+
+  // Check if the variable is valid in the current block
+  if (variable_sc.block != &block && !variable_sc.block->Contains(&block))
+    return false;
+
+  // If no scope range is specified then it means that the scope is the
+  // same as the scope of the enclosing lexical block.
+  if (m_scope_range.IsEmpty())
+    return true;
+
+  return m_scope_range.FindEntryThatContains(addr.GetFileAddress()) != nullptr;
+}
+
 Status Variable::GetValuesForVariableExpressionPath(
     llvm::StringRef variable_expr_path, ExecutionContextScope *scope,
     GetVariableCallback callback, void *baton, VariableList &variable_list,

From 370e54d03a5bb11f3f283ad5ab479501c74069c7 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Wed, 11 Jun 2025 19:02:36 +0200
Subject: [PATCH 0018/1322] [CIR] Upstream splat op for VectorType (#139827)

This change adds support for splat op for VectorType

Issue https://github.com/llvm/llvm-project/issues/136487
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  | 32 ++++++++++
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp    |  8 +++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 51 +++++++++++++++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   | 10 +++
 clang/test/CIR/CodeGen/vector-ext.cpp         | 64 +++++++++++++++++++
 clang/test/CIR/CodeGen/vector.cpp             | 63 ++++++++++++++++++
 clang/test/CIR/IR/vector.cir                  | 33 ++++++++++
 7 files changed, 261 insertions(+)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 565c0676773e..634f0dd554c7 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2277,6 +2277,38 @@ def VecTernaryOp : CIR_Op<"vec.ternary",
   let hasFolder = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// VecSplatOp
+//===----------------------------------------------------------------------===//
+
+def VecSplatOp : CIR_Op<"vec.splat", [Pure,
+  TypesMatchWith<"type of 'value' matches element type of 'result'", "result",
+                 "value", "cast<VectorType>($_self).getElementType()">]> {
+
+  let summary = "Convert a scalar into a vector";
+  let description = [{
+    The `cir.vec.splat` operation creates a vector value from a scalar value.
+    All elements of the vector have the same value, that of the given scalar.
+
+    It's a separate operation from `cir.vec.create` because more
+    efficient LLVM IR can be generated for it, and because some optimization and
+    analysis passes can benefit from knowing that all elements of the vector
+    have the same value.
+
+    ```mlir
+    %value = cir.const #cir.int<3> : !s32i
+    %value_vec = cir.vec.splat %value : !s32i, !cir.vector<4 x !s32i>
+    ```
+  }];
+
+  let arguments = (ins CIR_VectorElementType:$value);
+  let results = (outs CIR_VectorType:$result);
+
+  let assemblyFormat = [{
+    $value `:` type($value) `,` qualified(type($result)) attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // BaseClassAddrOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 481eb492d187..30d231e2c61d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -1780,6 +1780,14 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) {
                               cgf.convertType(destTy));
   }
 
+  case CK_VectorSplat: {
+    // Create a vector object and fill all elements with the same scalar value.
+    assert(destTy->isVectorType() && "CK_VectorSplat to non-vector type");
+    return builder.create<cir::VecSplatOp>(
+        cgf.getLoc(subExpr->getSourceRange()), cgf.convertType(destTy),
+        Visit(subExpr));
+  }
+
   default:
     cgf.getCIRGenModule().errorNYI(subExpr->getSourceRange(),
                                    "CastExpr: ", ce->getCastKindName());
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 4fdf8f9ec269..1642d10d427b 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1803,6 +1803,7 @@ void ConvertCIRToLLVMPass::runOnOperation() {
                CIRToLLVMVecExtractOpLowering,
                CIRToLLVMVecInsertOpLowering,
                CIRToLLVMVecCmpOpLowering,
+               CIRToLLVMVecSplatOpLowering,
                CIRToLLVMVecShuffleOpLowering,
                CIRToLLVMVecShuffleDynamicOpLowering,
                CIRToLLVMVecTernaryOpLowering
@@ -1956,6 +1957,56 @@ mlir::LogicalResult CIRToLLVMVecCmpOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMVecSplatOpLowering::matchAndRewrite(
+    cir::VecSplatOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  // Vector splat can be implemented with an `insertelement` and a
+  // `shufflevector`, which is better than an `insertelement` for each
+  // element in the vector. Start with an undef vector. Insert the value into
+  // the first element. Then use a `shufflevector` with a mask of all 0 to
+  // fill out the entire vector with that value.
+  cir::VectorType vecTy = op.getType();
+  mlir::Type llvmTy = typeConverter->convertType(vecTy);
+  mlir::Location loc = op.getLoc();
+  mlir::Value poison = rewriter.create<mlir::LLVM::PoisonOp>(loc, llvmTy);
+
+  mlir::Value elementValue = adaptor.getValue();
+  if (mlir::isa<mlir::LLVM::PoisonOp>(elementValue.getDefiningOp())) {
+    // If the splat value is poison, then we can just use poison value
+    // for the entire vector.
+    rewriter.replaceOp(op, poison);
+    return mlir::success();
+  }
+
+  if (auto constValue =
+          dyn_cast<mlir::LLVM::ConstantOp>(elementValue.getDefiningOp())) {
+    if (auto intAttr = dyn_cast<mlir::IntegerAttr>(constValue.getValue())) {
+      mlir::DenseIntElementsAttr denseVec = mlir::DenseIntElementsAttr::get(
+          mlir::cast<mlir::ShapedType>(llvmTy), intAttr.getValue());
+      rewriter.replaceOpWithNewOp<mlir::LLVM::ConstantOp>(
+          op, denseVec.getType(), denseVec);
+      return mlir::success();
+    }
+
+    if (auto fpAttr = dyn_cast<mlir::FloatAttr>(constValue.getValue())) {
+      mlir::DenseFPElementsAttr denseVec = mlir::DenseFPElementsAttr::get(
+          mlir::cast<mlir::ShapedType>(llvmTy), fpAttr.getValue());
+      rewriter.replaceOpWithNewOp<mlir::LLVM::ConstantOp>(
+          op, denseVec.getType(), denseVec);
+      return mlir::success();
+    }
+  }
+
+  mlir::Value indexValue =
+      rewriter.create<mlir::LLVM::ConstantOp>(loc, rewriter.getI64Type(), 0);
+  mlir::Value oneElement = rewriter.create<mlir::LLVM::InsertElementOp>(
+      loc, poison, elementValue, indexValue);
+  SmallVector<int32_t> zeroValues(vecTy.getSize(), 0);
+  rewriter.replaceOpWithNewOp<mlir::LLVM::ShuffleVectorOp>(op, oneElement,
+                                                           poison, zeroValues);
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMVecShuffleOpLowering::matchAndRewrite(
     cir::VecShuffleOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index 22d8a1e7c22e..2eda568c84bd 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -367,6 +367,16 @@ public:
                   mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMVecSplatOpLowering
+    : public mlir::OpConversionPattern<cir::VecSplatOp> {
+public:
+  using mlir::OpConversionPattern<cir::VecSplatOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::VecSplatOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 class CIRToLLVMVecShuffleOpLowering
     : public mlir::OpConversionPattern<cir::VecShuffleOp> {
 public:
diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp
index e1814f216f6b..965c44c9461a 100644
--- a/clang/test/CIR/CodeGen/vector-ext.cpp
+++ b/clang/test/CIR/CodeGen/vector-ext.cpp
@@ -990,6 +990,7 @@ void foo14() {
 // OGCG: %[[TMP_B:.*]] = load <4 x float>, ptr %[[VEC_B]], align 16
 // OGCG: %[[GE:.*]] = fcmp oge <4 x float> %[[TMP_A]], %[[TMP_B]]
 // OGCG: %[[RES:.*]] = sext <4 x i1> %[[GE]] to <4 x i32>
+// OGCG: store <4 x i32> %[[RES]], ptr {{.*}}, align 16
 
 void foo15() {
   vi4 a;
@@ -1092,6 +1093,69 @@ void foo17() {
 // OGCG: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16
 // OGCG: %[[RES:.*]]= fptoui <2 x double> %[[TMP]] to <2 x i16>
 
+void foo18() {
+  vi4 a = {1, 2, 3, 4};
+  vi4 shl = a << 3;
+
+  uvi4 b = {1u, 2u, 3u, 4u};
+  uvi4 shr = b >> 3u;
+}
+
+// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
+// CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
+// CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
+// CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
+// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
+// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
+// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
+// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i
+// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i>
+// CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i
+// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i
+// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i
+// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i
+// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
+// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
+// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i
+// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !u32i, !cir.vector<4 x !u32i>
+// CIR: %[[SHR:.*]] = cir.shift(right, %[[TMP_B]] : !cir.vector<4 x !u32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !u32i>) -> !cir.vector<4 x !u32i>
+// CIR: cir.store{{.*}} %[[SHR]], %[[SHR_RES]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
+
+// LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[SHL_RES:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[SHR_RES:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_A]], align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// LLVM: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3)
+// LLVM: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16
+// LLVM: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_B]], align 16
+// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// LLVM: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3)
+// LLVM: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16
+
+// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[SHL_RES:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[SHR_RES:.*]] = alloca <4 x i32>, align 16
+// OGCG: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_A]], align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// OGCG: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3)
+// OGCG: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16
+// OGCG: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_B]], align 16
+// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// OGCG: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3)
+// OGCG: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16
+
 void foo19() {
   vi4 a;
   vi4 b;
diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp
index 4f116faa7a1a..23e91724dc0f 100644
--- a/clang/test/CIR/CodeGen/vector.cpp
+++ b/clang/test/CIR/CodeGen/vector.cpp
@@ -1071,6 +1071,69 @@ void foo17() {
 // OGCG: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16
 // OGCG: %[[RES:.*]]= fptoui <2 x double> %[[TMP]] to <2 x i16>
 
+void foo18() {
+  vi4 a = {1, 2, 3, 4};
+  vi4 shl = a << 3;
+
+  uvi4 b = {1u, 2u, 3u, 4u};
+  uvi4 shr = b >> 3u;
+}
+
+// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
+// CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
+// CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
+// CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
+// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
+// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
+// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
+// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i
+// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i>
+// CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i
+// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i
+// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i
+// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i
+// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
+// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
+// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i
+// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !u32i, !cir.vector<4 x !u32i>
+// CIR: %[[SHR:.*]] = cir.shift(right, %[[TMP_B]] : !cir.vector<4 x !u32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !u32i>) -> !cir.vector<4 x !u32i>
+// CIR: cir.store{{.*}} %[[SHR]], %[[SHR_RES]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
+
+// LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[SHL_RES:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[SHR_RES:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_A]], align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// LLVM: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3)
+// LLVM: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16
+// LLVM: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_B]], align 16
+// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// LLVM: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3)
+// LLVM: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16
+
+// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[SHL_RES:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[SHR_RES:.*]] = alloca <4 x i32>, align 16
+// OGCG: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_A]], align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// OGCG: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3)
+// OGCG: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16
+// OGCG: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_B]], align 16
+// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// OGCG: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3)
+// OGCG: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16
+
 void foo19() {
   vi4 a;
   vi4 b;
diff --git a/clang/test/CIR/IR/vector.cir b/clang/test/CIR/IR/vector.cir
index a455acf92ab6..f23f5de9692d 100644
--- a/clang/test/CIR/IR/vector.cir
+++ b/clang/test/CIR/IR/vector.cir
@@ -187,4 +187,37 @@ cir.func @vector_shuffle_dynamic_test() {
 // CHECK:    cir.return
 // CHECK: }
 
+cir.func @vector_splat_test() {
+    %0 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
+    %1 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
+    %2 = cir.const #cir.int<1> : !s32i
+    %3 = cir.const #cir.int<2> : !s32i
+    %4 = cir.const #cir.int<3> : !s32i
+    %5 = cir.const #cir.int<4> : !s32i
+    %6 = cir.vec.create(%2, %3, %4, %5 : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+    cir.store %6, %0 : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+    %7 = cir.load %0 : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+    %8 = cir.const #cir.int<3> : !s32i
+    %9 = cir.vec.splat %8 : !s32i, !cir.vector<4 x !s32i>
+    %10 = cir.shift(left, %7 : !cir.vector<4 x !s32i>, %9 : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+    cir.store %10, %1 : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+    cir.return
+}
+
+// CHECK: cir.func @vector_splat_test() {
+// CHECK-NEXT: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
+// CHECK-NEXT: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
+// CHECK-NEXT: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
+// CHECK-NEXT: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
+// CHECK-NEXT: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
+// CHECK-NEXT: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
+// CHECK-NEXT: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CHECK-NEXT: cir.store %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CHECK-NEXT: %[[TMP:.*]] = cir.load %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CHECK-NEXT: %[[SPLAT_VAL:.*]] = cir.const #cir.int<3> : !s32i
+// CHECK-NEXT: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SPLAT_VAL]] : !s32i, !cir.vector<4 x !s32i>
+// CHECK-NEXT: %[[SHL:.*]] = cir.shift(left, %[[TMP]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+// CHECK-NEXT: cir.store %[[SHL]], %[[SHL_RES:.*]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CHECK-NEXT: cir.return
+
 }

From 621a7d0f66f3da27e687dd7dd832450334ee81da Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Wed, 11 Jun 2025 19:02:47 +0200
Subject: [PATCH 0019/1322] [flang] silence bogus error with BIND(C) variable
 in hermetic module (#143737)

The global name semantic check was firing in a bogus way when BIND(C)
variables are in hermetic module.

Do not raise the error if one of the symbol with the conflicting global
name is an "hermetic variant" of the other.
---
 flang/lib/Semantics/check-declarations.cpp | 10 +++++++++
 flang/test/Semantics/modfile76.F90         | 24 ++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 flang/test/Semantics/modfile76.F90

diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index 46a5b970fdf0..f9d64485f140 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -2958,6 +2958,14 @@ static std::optional<std::string> DefinesGlobalName(const Symbol &symbol) {
   return std::nullopt;
 }
 
+static bool IsSameSymbolFromHermeticModule(
+    const Symbol &symbol, const Symbol &other) {
+  return symbol.name() == other.name() && symbol.owner().IsModule() &&
+      other.owner().IsModule() && symbol.owner() != other.owner() &&
+      symbol.owner().GetName() &&
+      symbol.owner().GetName() == other.owner().GetName();
+}
+
 // 19.2 p2
 void CheckHelper::CheckGlobalName(const Symbol &symbol) {
   if (auto global{DefinesGlobalName(symbol)}) {
@@ -2975,6 +2983,8 @@ void CheckHelper::CheckGlobalName(const Symbol &symbol) {
           (!IsExternalProcedureDefinition(symbol) ||
               !IsExternalProcedureDefinition(other))) {
         // both are procedures/BLOCK DATA, not both definitions
+      } else if (IsSameSymbolFromHermeticModule(symbol, other)) {
+        // Both symbols are the same thing.
       } else if (symbol.has<ModuleDetails>()) {
         Warn(common::LanguageFeature::BenignNameClash, symbol.name(),
             "Module '%s' conflicts with a global name"_port_en_US,
diff --git a/flang/test/Semantics/modfile76.F90 b/flang/test/Semantics/modfile76.F90
new file mode 100644
index 000000000000..50ee9a088e11
--- /dev/null
+++ b/flang/test/Semantics/modfile76.F90
@@ -0,0 +1,24 @@
+!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s
+!RUN: %flang_fc1 -fsyntax-only %s
+
+! Tests that a BIND(C) variable in a module A captured in a hermetic module
+! file USE'd in a module B is not creating bogus complaints about BIND(C) name
+! conflict when both module A and B are later accessed.
+
+#if STEP == 1
+module modfile75a
+  integer, bind(c) :: x
+end
+
+module modfile75b
+  use modfile75a ! capture hermetically
+end
+
+#else
+subroutine test
+  use modfile75a
+  use modfile75b
+  implicit none
+  print *, x
+end subroutine
+#endif

From 7414d88b5f8af1bdf8da6bf2493b485ba5d079f2 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Wed, 11 Jun 2025 18:13:56 +0100
Subject: [PATCH 0020/1322] Squelch an unused-function warning

After removing some debug-intrinsic creation code, this function is now
unused (and un-necessary)
---
 llvm/lib/IR/DIBuilder.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 1484c549dd58..c56dd7a1d382 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -1069,10 +1069,6 @@ static Value *getDbgIntrinsicValueImpl(LLVMContext &VMContext, Value *V) {
   return MetadataAsValue::get(VMContext, ValueAsMetadata::get(V));
 }
 
-static Function *getDeclareIntrin(Module &M) {
-  return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_declare);
-}
-
 DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val,
                                               DILocalVariable *VarInfo,
                                               DIExpression *Expr,

From 3e24dadee0d7ecc5f95fe0760afb7abdeb9a2dc5 Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Wed, 11 Jun 2025 10:24:19 -0700
Subject: [PATCH 0021/1322] [Clang][Tooling][NFC] Use move to avoid copies of
 large objects (#143603)

Static analysis flagged these cases in which can use std::move and avoid
copies of large objects.
---
 clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
index 44a270d5f7b3..b1495163ccc2 100644
--- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
+++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
@@ -657,7 +657,7 @@ void ModuleDepCollectorPP::moduleImport(SourceLocation ImportLoc,
     P1689ModuleInfo RequiredModule;
     RequiredModule.ModuleName = Path[0].getIdentifierInfo()->getName().str();
     RequiredModule.Type = P1689ModuleInfo::ModuleType::NamedCXXModule;
-    MDC.RequiredStdCXXModules.push_back(RequiredModule);
+    MDC.RequiredStdCXXModules.push_back(std::move(RequiredModule));
     return;
   }
 
@@ -920,7 +920,7 @@ void ModuleDepCollectorPP::addAllSubmoduleDeps(
 
 void ModuleDepCollectorPP::addOneModuleDep(const Module *M, const ModuleID ID,
                                            ModuleDeps &MD) {
-  MD.ClangModuleDeps.push_back(ID);
+  MD.ClangModuleDeps.push_back(std::move(ID));
   if (MD.IsInStableDirectories)
     MD.IsInStableDirectories = MDC.ModularDeps[M]->IsInStableDirectories;
 }

From 66f533e7e34d6f6d0e293a67dd54be9e4c240ddd Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 10:39:02 -0700
Subject: [PATCH 0022/1322] [IR] Fix warnings (#143752)

This patch fixes:

  llvm/lib/IR/DIBuilder.cpp:1072:18: error: unused function
  'getDeclareIntrin' [-Werror,-Wunused-function]

  llvm/include/llvm/IR/DIBuilder.h:51:15: error: private field
  'DeclareFn' is not used [-Werror,-Wunused-private-field]

  llvm/include/llvm/IR/DIBuilder.h:52:15: error: private field
  'ValueFn' is not used [-Werror,-Wunused-private-field]

  llvm/include/llvm/IR/DIBuilder.h:53:15: error: private field
  'LabelFn' is not used [-Werror,-Wunused-private-field]

  llvm/include/llvm/IR/DIBuilder.h:54:15: error: private field
  'AssignFn' is not used [-Werror,-Wunused-private-field]
---
 llvm/include/llvm/IR/DIBuilder.h | 6 +-----
 llvm/lib/IR/DIBuilder.cpp        | 3 +--
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index ebfe41dd59af..43fca571ee6d 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -47,11 +47,7 @@ namespace llvm {
     Module &M;
     LLVMContext &VMContext;
 
-    DICompileUnit *CUNode;   ///< The one compile unit created by this DIBuiler.
-    Function *DeclareFn;     ///< llvm.dbg.declare
-    Function *ValueFn;       ///< llvm.dbg.value
-    Function *LabelFn;       ///< llvm.dbg.label
-    Function *AssignFn;      ///< llvm.dbg.assign
+    DICompileUnit *CUNode; ///< The one compile unit created by this DIBuiler.
 
     SmallVector<TrackingMDNodeRef, 4> AllEnumTypes;
     /// Track the RetainTypes, since they can be updated later on.
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index c56dd7a1d382..fd8c2d7bb5cc 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -25,8 +25,7 @@ using namespace llvm;
 using namespace llvm::dwarf;
 
 DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU)
-    : M(m), VMContext(M.getContext()), CUNode(CU), DeclareFn(nullptr),
-      ValueFn(nullptr), LabelFn(nullptr), AssignFn(nullptr),
+    : M(m), VMContext(M.getContext()), CUNode(CU),
       AllowUnresolvedNodes(AllowUnresolvedNodes) {
   if (CUNode) {
     if (const auto &ETs = CUNode->getEnumTypes())

From c2f0af514beb7618660cf8d145fa9e49fb78869c Mon Sep 17 00:00:00 2001
From: Alexander Richardson <alexrichardson@google.com>
Date: Wed, 11 Jun 2025 10:47:17 -0700
Subject: [PATCH 0023/1322] [GISelValueTracking] Add test case for G_PTRTOINT

While we can only reason about the index/address, the G_PTRTOINT
operations returns all representation bits, so we can't assume the
remaining ones are all zeroes. This behaviour was clarified as part of
the discussion in https://discourse.llvm.org/t/clarifiying-the-semantics-of-ptrtoint/83987/54.
The LangRef semantics of ptrtoint being a full representation bitcast
were documented in https://github.com/llvm/llvm-project/pull/139349.

Prior to 77c8d214131e951e3d3a07b45a7436f54988d6f3 we were incorrectly
assuming known zeroes beyond the index size even if the input was
completely unknown. This commit adds a test case for G_PTRTOINT which
was omitted from that change.

See https://github.com/llvm/llvm-project/issues/139598

Reviewed By: arsenm

Pull Request: https://github.com/llvm/llvm-project/pull/139608
---
 .../AMDGPU/GlobalISel/knownbits-ptrtoint.mir  | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir
new file mode 100644
index 000000000000..4073568fd421
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir
@@ -0,0 +1,110 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -passes="print<gisel-value-tracking>" %s -filetype=null 2>&1 | FileCheck %s
+## Check that we don't incorrectly assume known zeroes for and extend of a truncated ptrtoint
+## Test case for https://github.com/llvm/llvm-project/issues/139598
+---
+## We should see 128 unknown bits.
+name:            PtrToInt
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToInt
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s128) = G_PTRTOINT %4(p8)
+...
+---
+## We should see 128 high zeroes followed by 128 unknown bits for extending ptrtoint.
+name:            PtrToIntExt
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToIntExt
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:128
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s256) = G_PTRTOINT %4(p8)
+...
+---
+## We should see 48 unknown bits for truncating ptrtoint.
+name:            PtrToIntTrunc
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToIntTrunc
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????? SignBits:1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s48) = G_PTRTOINT %4(p8)
+...
+---
+## This is the test for issue 139598: Truncating and then extending the
+## G_PTRTOINT result was filling all bits above the index bitwidth with known
+## zeroes even though the incoming value is completely unknown and G_PTRTOINT.
+## is lowered to a bitwise copy.
+## We should see all zero high bits with 48 unknown bits.
+name:            PtrToIntTruncExplicitExt
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToIntTruncExplicitExt
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %6:_ KnownBits:???????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %7:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????? SignBits:208
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s128) = G_PTRTOINT %4(p8)
+    %6:_(s48) = G_TRUNC %5(s128)
+    %7:_(s256) = G_ZEXT %6(s48)
+...
+---
+## Same test again but this time have the G_PTRTOINT do the truncation.
+## We should see all zero high bits with 48 unknown bits.
+name:            PtrToIntTruncImplicitExt
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToIntTruncImplicitExt
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %6:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????? SignBits:208
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s48) = G_PTRTOINT %4(p8)
+    %6:_(s256) = G_ZEXT %5(s48)
+...

From bbe59e19b60b0efa8cc200fb3260fe572e188b26 Mon Sep 17 00:00:00 2001
From: Kewen12 <Kewen.Meng@amd.com>
Date: Wed, 11 Jun 2025 11:12:54 -0700
Subject: [PATCH 0024/1322] [OpenMP][Offload] Update the Logic for Configuring
 Auto Zero-Copy (#143638)

Summary:

Currently the Auto Zero-Copy is enabled by checking every initialized
device to ensure that no dGPU is attached to an APU. However, an APU is
designed to comprise a homogeneous set of GPUs, therefore, it should be
sufficient to check any device for configuring Auto Zero-Copy. In this
PR, it checks the first initialized device in the list.

The changes in this PR are to clearly reflect the design and logic of
enabling the feature for further improving the readibility.
---
 offload/libomptarget/PluginManager.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp
index 93589960a426..c4d99dfa9f10 100644
--- a/offload/libomptarget/PluginManager.cpp
+++ b/offload/libomptarget/PluginManager.cpp
@@ -286,16 +286,16 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
   }
   PM->RTLsMtx.unlock();
 
-  bool UseAutoZeroCopy = Plugins.size() > 0;
+  bool UseAutoZeroCopy = false;
 
   auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor();
-  for (const auto &Device : *ExclusiveDevicesAccessor)
-    UseAutoZeroCopy &= Device->useAutoZeroCopy();
+  // APUs are homogeneous set of GPUs. Check the first device for
+  // configuring Auto Zero-Copy.
+  if (ExclusiveDevicesAccessor->size() > 0) {
+    auto &Device = *(*ExclusiveDevicesAccessor)[0];
+    UseAutoZeroCopy = Device.useAutoZeroCopy();
+  }
 
-  // Auto Zero-Copy can only be currently triggered when the system is an
-  // homogeneous APU architecture without attached discrete GPUs.
-  // If all devices suggest to use it, change requirement flags to trigger
-  // zero-copy behavior when mapping memory.
   if (UseAutoZeroCopy)
     addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
 

From fad1972d74aead159a5e91b068cbf736e83836b5 Mon Sep 17 00:00:00 2001
From: VISHAKH PRAKASH <vishakh.prakash@multicorewareinc.com>
Date: Wed, 11 Jun 2025 23:43:01 +0530
Subject: [PATCH 0025/1322] [SPIRV] FIX print the symbolic operand for opcode
 for the operation OpSpecConstantOp (#135756)

Current implementation outputs opcode is an immediate but spirv-tools
requires that the name of the operation without "Op" is needed for the
instruction OpSpecConstantOp
that is if the opcode is OpBitcast the instruction must be
`%1 = OpSpecConstantOp %6 Bitcast %17`
instead of
`%1 = OpBitcast %6 124 %17`

[refer this commit for more
info](https://github.com/KhronosGroup/SPIRV-Tools/commit/0f166be68d4b6624a10d6bf312679505d391ec22)

---------

Co-authored-by: Dmitry Sidorov <dmitry.sidorov@intel.com>
Co-authored-by: Ebin-McW <ebin.jose@multicorewareinc.com>
---
 .../SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp      |  3 +-
 .../Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h |  5 ++
 llvm/lib/Target/SPIRV/SPIRVInstrInfo.td       |  2 +-
 .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 90 +++++++++++++++++++
 llvm/test/CodeGen/SPIRV/const-nested-vecs.ll  |  4 +-
 .../fun-ptr-addrcast.ll                       |  2 +-
 .../opencl/basic/progvar_prog_scope_init.ll   |  2 +-
 .../CodeGen/SPIRV/opt-gepoperator-of-gvar.ll  |  2 +-
 .../pointers/PtrCast-in-OpSpecConstantOp.ll   | 12 +--
 .../CodeGen/SPIRV/pointers/global-ptrtoint.ll |  4 +-
 .../pointers/irtrans-added-int-const-32-64.ll |  2 +-
 11 files changed, 112 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
index 342456757409..0ed97f5b41c5 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
@@ -68,7 +68,8 @@ getSymbolicOperandMnemonic(SPIRV::OperandCategory::OperandCategory Category,
       Category != SPIRV::OperandCategory::FunctionControlOperand &&
       Category != SPIRV::OperandCategory::MemorySemanticsOperand &&
       Category != SPIRV::OperandCategory::MemoryOperandOperand &&
-      Category != SPIRV::OperandCategory::KernelProfilingInfoOperand)
+      Category != SPIRV::OperandCategory::KernelProfilingInfoOperand &&
+      Category != SPIRV::OperandCategory::SpecConstantOpOperandsOperand)
     return "UNKNOWN";
   // Value that encodes many enum values (one bit per enum value).
   std::string Name;
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
index 083c7f8460bf..b8c467fef8e8 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
@@ -222,6 +222,11 @@ namespace CooperativeMatrixOperands {
 #include "SPIRVGenTables.inc"
 } // namespace CooperativeMatrixOperands
 
+namespace SpecConstantOpOperands {
+#define GET_SpecConstantOpOperands_DECL
+#include "SPIRVGenTables.inc"
+} // namespace SpecConstantOpOperands
+
 struct ExtendedBuiltin {
   StringRef Name;
   InstructionSet::InstructionSet Set;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 338f6809a3e4..049ba0275f22 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -245,7 +245,7 @@ def OpSpecConstantComposite: Op<51, (outs ID:$res), (ins TYPE:$type, variable_op
                   "$res = OpSpecConstantComposite $type">;
 def OpSpecConstantCompositeContinuedINTEL: Op<6092, (outs), (ins variable_ops),
                   "OpSpecConstantCompositeContinuedINTEL">;
-def OpSpecConstantOp: Op<52, (outs ID:$res), (ins TYPE:$t, i32imm:$c, ID:$o, variable_ops),
+def OpSpecConstantOp: Op<52, (outs ID:$res), (ins TYPE:$t, SpecConstantOpOperands:$c, ID:$o, variable_ops),
                   "$res = OpSpecConstantOp $t $c $o">;
 
 // 3.42.8 Memory Instructions
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index ca8a9a9997a8..f1aae42ea2be 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -172,6 +172,7 @@ def KernelProfilingInfoOperand : OperandCategory;
 def OpcodeOperand : OperandCategory;
 def CooperativeMatrixLayoutOperand : OperandCategory;
 def CooperativeMatrixOperandsOperand : OperandCategory;
+def SpecConstantOpOperandsOperand : OperandCategory;
 def MatrixMultiplyAccumulateOperandsOperand : OperandCategory;
 
 //===----------------------------------------------------------------------===//
@@ -1755,6 +1756,95 @@ defm MatrixAAndBBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x40,
 defm MatrixCBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x80, [SPV_INTEL_joint_matrix], [CooperativeMatrixBFloat16ComponentTypeINTEL]>;
 defm MatrixResultBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x100, [SPV_INTEL_joint_matrix], [CooperativeMatrixBFloat16ComponentTypeINTEL]>;
 
+//===----------------------------------------------------------------------===//
+// Multiclass used to define SpecConstant Operands enum values and at the
+// same time SymbolicOperand.
+//===----------------------------------------------------------------------===//
+
+def SpecConstantOpOperands : GenericEnum, Operand<i32> {
+  let FilterClass = "SpecConstantOpOperands";
+  let NameField = "Name";
+  let ValueField = "Value";
+  let PrintMethod = !strconcat("printSymbolicOperand<OperandCategory::", FilterClass, "Operand>");
+}
+
+class SpecConstantOpOperands<string name, bits<32> value> {
+  string Name = name;
+  bits<32> Value = value;
+}
+
+multiclass SpecConstantOpOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
+  def : SpecConstantOpOperands<NAME, value>;
+  defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
+}
+
+// Conversion
+defm SConvert :  SpecConstantOpOperandsOperand<114, [], []>;
+defm FConvert :  SpecConstantOpOperandsOperand<115, [], []>;
+defm ConvertFToS :  SpecConstantOpOperandsOperand<110, [], [Kernel]>;
+defm ConvertSToF :  SpecConstantOpOperandsOperand<111, [], [Kernel]>;
+defm ConvertFToU :  SpecConstantOpOperandsOperand<109, [], [Kernel]>;
+defm ConvertUToF :  SpecConstantOpOperandsOperand<112, [], [Kernel]>;
+defm UConvert :  SpecConstantOpOperandsOperand<113, [], [Kernel]>;
+defm ConvertPtrToU :  SpecConstantOpOperandsOperand<117, [], [Kernel]>;
+defm ConvertUToPtr :  SpecConstantOpOperandsOperand<120, [], [Kernel]>;
+defm GenericCastToPtr :  SpecConstantOpOperandsOperand<122, [], [Kernel]>;
+defm PtrCastToGeneric :  SpecConstantOpOperandsOperand<121, [], [Kernel]>;
+defm Bitcast :  SpecConstantOpOperandsOperand<124, [], []>;
+defm QuantizeToF16 :  SpecConstantOpOperandsOperand<116, [], [Shader]>;
+// Arithmetic 
+defm SNegate :  SpecConstantOpOperandsOperand<126, [], []>;
+defm Not :  SpecConstantOpOperandsOperand<200, [], []>;
+defm IAdd :  SpecConstantOpOperandsOperand<128, [], []>;
+defm ISub :  SpecConstantOpOperandsOperand<130, [], []>;
+defm IMul :  SpecConstantOpOperandsOperand<132, [], []>;
+defm UDiv :  SpecConstantOpOperandsOperand<134, [], []>;
+defm SDiv :  SpecConstantOpOperandsOperand<135, [], []>;
+defm UMod :  SpecConstantOpOperandsOperand<137, [], []>;
+defm SRem :  SpecConstantOpOperandsOperand<138, [], []>;
+defm SMod :  SpecConstantOpOperandsOperand<139, [], []>;
+defm ShiftRightLogical :  SpecConstantOpOperandsOperand<194, [], []>;
+defm ShiftRightArithmetic :  SpecConstantOpOperandsOperand<195, [], []>;
+defm ShiftLeftLogical :  SpecConstantOpOperandsOperand<196, [], []>;
+defm BitwiseOr :  SpecConstantOpOperandsOperand<197, [], []>;
+defm BitwiseAnd :  SpecConstantOpOperandsOperand<199, [], []>;
+defm BitwiseXor :  SpecConstantOpOperandsOperand<198, [], []>;
+defm FNegate :  SpecConstantOpOperandsOperand<127, [], [Kernel]>;
+defm FAdd :  SpecConstantOpOperandsOperand<129, [], [Kernel]>;
+defm FSub :  SpecConstantOpOperandsOperand<131, [], [Kernel]>;
+defm FMul :  SpecConstantOpOperandsOperand<133, [], [Kernel]>;
+defm FDiv :  SpecConstantOpOperandsOperand<136, [], [Kernel]>;
+defm FRem :  SpecConstantOpOperandsOperand<140, [], [Kernel]>;
+defm FMod :  SpecConstantOpOperandsOperand<141, [], [Kernel]>;
+// Composite;
+defm VectorShuffle :  SpecConstantOpOperandsOperand<79, [], []>;
+defm CompositeExtract :  SpecConstantOpOperandsOperand<81, [], []>;
+defm CompositeInsert :  SpecConstantOpOperandsOperand<82, [], []>;
+// Logical;
+defm LogicalOr :  SpecConstantOpOperandsOperand<166, [], []>;
+defm LogicalAnd :  SpecConstantOpOperandsOperand<167, [], []>;
+defm LogicalNot :  SpecConstantOpOperandsOperand<168, [], []>;
+defm LogicalEqual :  SpecConstantOpOperandsOperand<164, [], []>;
+defm LogicalNotEqual :  SpecConstantOpOperandsOperand<165, [], []>;
+defm Select :  SpecConstantOpOperandsOperand<169, [], []>;
+// Comparison;
+defm IEqual :  SpecConstantOpOperandsOperand<170, [], []>;
+defm INotEqual :  SpecConstantOpOperandsOperand<171, [], []>;
+defm ULessThan :  SpecConstantOpOperandsOperand<176, [], []>;
+defm SLessThan :  SpecConstantOpOperandsOperand<177, [], []>;
+defm UGreaterThan :  SpecConstantOpOperandsOperand<172, [], []>;
+defm SGreaterThan :  SpecConstantOpOperandsOperand<173, [], []>;
+defm ULessThanEqual :  SpecConstantOpOperandsOperand<178, [], []>;
+defm SLessThanEqual :  SpecConstantOpOperandsOperand<179, [], []>;
+defm UGreaterThanEqual :  SpecConstantOpOperandsOperand<174, [], []>;
+defm SGreaterThanEqual :  SpecConstantOpOperandsOperand<175, [], []>;
+// Memory
+defm AccessChain :  SpecConstantOpOperandsOperand<65, [], [Kernel]>;
+defm InBoundsAccessChain :  SpecConstantOpOperandsOperand<66, [], [Kernel]>;
+defm PtrAccessChain :  SpecConstantOpOperandsOperand<67, [], [Kernel]>;
+defm InBoundsPtrAccessChain :  SpecConstantOpOperandsOperand<70, [], [Kernel]>;
+defm CooperativeMatrixLengthKHR : SpecConstantOpOperandsOperand<4460, [], []>;
+
 //===----------------------------------------------------------------------===//
 // Multiclass used to define Matrix Multiply Accumulate Operands enum values and at the same time
 // SymbolicOperand entries with string mnemonics and capabilities.
diff --git a/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll b/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll
index 9234106e5fcd..266b46e65f31 100644
--- a/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll
+++ b/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll
@@ -25,8 +25,8 @@
 ; CHECK-SPIRV-DAG: %[[#IntZero:]] = OpConstantNull %[[#IntTy]]
 ; CHECK-SPIRV-DAG: %[[#LongZero:]] = OpConstantNull %[[#LongTy]]
 ; CHECK-SPIRV64-DAG: %[[#ConstLong2:]] = OpConstant %[[#LongTy]] 2
-; CHECK-SPIRV64-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] 70 %[[#VarV2Char:]] %[[#IntZero]] %[[#ConstLong2]]
-; CHECK-SPIRV32-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] 70 %[[#VarV2Char:]] %[[#IntZero]] %[[#Const2]]
+; CHECK-SPIRV64-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] InBoundsPtrAccessChain %[[#VarV2Char:]] %[[#IntZero]] %[[#ConstLong2]]
+; CHECK-SPIRV32-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] InBoundsPtrAccessChain %[[#VarV2Char:]] %[[#IntZero]] %[[#Const2]]
 ; CHECK-SPIRV-DAG: %[[#PtrPtrCharTy:]] = OpTypePointer CrossWorkgroup %[[#PtrCharTy]]
 ; CHECK-SPIRV-DAG: %[[#AVar]] = OpVariable %[[#PtrArr2V2CharTy]] CrossWorkgroup %[[#Arr2V2Char]]
 ; CHECK-SPIRV-DAG: %[[#PVar]] = OpVariable %[[#PtrPtrCharTy]] CrossWorkgroup %[[#PvarInit]]
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
index 8edecc1329d0..e5736b88b63a 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] 121 %[[#]]
+; CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] PtrCastToGeneric %[[#]]
 ; CHECK-COUNT-3: OpPtrCastToGeneric
 
 @G1 = addrspace(1) constant { [3 x ptr addrspace(4)] } { [3 x ptr addrspace(4)] [ptr addrspace(4) null, ptr addrspace(4) addrspacecast (ptr @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr @bar to ptr addrspace(4))] }
diff --git a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll
index 9d759a1cf47d..fbc83c7a1e04 100644
--- a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll
+++ b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll
@@ -10,7 +10,7 @@
 ; CHECK-DAG: %[[#pt2:]] = OpTypePointer CrossWorkgroup %[[#arr2]]
 ; CHECK-DAG: %[[#pt3:]] = OpTypePointer CrossWorkgroup %[[#pt1]]
 ; CHECK-DAG: %[[#a_var]] = OpVariable %[[#pt2]] CrossWorkgroup
-; CHECK-DAG: %[[#const:]] = OpSpecConstantOp %[[#pt1]] 70 %[[#a_var]]
+; CHECK-DAG: %[[#const:]] = OpSpecConstantOp %[[#pt1]] InBoundsPtrAccessChain %[[#a_var]]
 ; CHECK-DAG: %[[#p_var]] = OpVariable %[[#pt3]] CrossWorkgroup %[[#const]]
 @var = addrspace(1) global i8 0, align 1
 @g_var = addrspace(1) global i8 1, align 1
diff --git a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll
index 5f9229f5a5bd..447dfa701b65 100644
--- a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll
+++ b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll
@@ -14,7 +14,7 @@
 ; CHECK-DAG: %[[#PtrStruct:]] = OpTypePointer CrossWorkgroup %[[#Struct]]
 ; CHECK-DAG: %[[#Var:]] = OpVariable %[[#PtrStruct]] CrossWorkgroup %[[#VarInit]]
 ; CHECK-DAG: %[[#Bytes:]] = OpVariable %[[#PtrChar]] CrossWorkgroup %[[#]]
-; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] 70 %[[#Bytes]] %[[#C648]]
+; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] InBoundsPtrAccessChain %[[#Bytes]] %[[#C648]]
 
 ; CHECK: OpFunction
 ; CHECK: %[[#]] = OpFunctionParameter %[[#]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll
index 55d638f80cc5..ca7ca06fbdc8 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll
@@ -23,20 +23,20 @@
 ; CHECK-DAG: %[[WPtr:.*]] = OpTypePointer Workgroup %[[Int]]
 
 ; CHECK-DAG: %[[F]] = OpVariable %[[CWPtr]] CrossWorkgroup %[[#]]
-; CHECK-DAG: %[[GenF:.*]] = OpSpecConstantOp %[[GenPtrChar]] 121 %[[F]]
+; CHECK-DAG: %[[GenF:.*]] = OpSpecConstantOp %[[GenPtrChar]] PtrCastToGeneric %[[F]]
 ; CHECK-DAG: %[[B]] = OpVariable %[[CWPtr]] CrossWorkgroup %[[#]]
-; CHECK-DAG: %[[GenB:.*]] = OpSpecConstantOp %[[GenPtrChar]] 121 %[[B]]
+; CHECK-DAG: %[[GenB:.*]] = OpSpecConstantOp %[[GenPtrChar]] PtrCastToGeneric %[[B]]
 ; CHECK-DAG: %[[GenFB:.*]] = OpConstantComposite %[[Arr2]] %[[GenF]] %[[GenB]]
 ; CHECK-DAG: %[[GenBF:.*]] = OpConstantComposite %[[Arr2]] %[[GenB]] %[[GenF]]
 ; CHECK-DAG: %[[CG1:.*]] = OpConstantComposite %[[Struct2]] %[[GenFB]]
 ; CHECK-DAG: %[[CG2:.*]] = OpConstantComposite %[[Struct2]] %[[GenBF]]
 
 ; CHECK-DAG: %[[X]] = OpVariable %[[WPtr]] Workgroup %[[#]]
-; CHECK-DAG: %[[GenX:.*]] = OpSpecConstantOp %[[GenPtr]] 121 %[[X]]
-; CHECK-DAG: %[[CWX:.*]] = OpSpecConstantOp %[[CWPtrChar]] 122 %[[GenX]]
+; CHECK-DAG: %[[GenX:.*]] = OpSpecConstantOp %[[GenPtr]] PtrCastToGeneric %[[X]]
+; CHECK-DAG: %[[CWX:.*]] = OpSpecConstantOp %[[CWPtrChar]] GenericCastToPtr %[[GenX]]
 ; CHECK-DAG: %[[Y]] = OpVariable %[[WPtr]] Workgroup %[[#]]
-; CHECK-DAG: %[[GenY:.*]] = OpSpecConstantOp %[[GenPtr]] 121 %[[Y]]
-; CHECK-DAG: %[[CWY:.*]] = OpSpecConstantOp %[[CWPtrChar]] 122 %[[GenY]]
+; CHECK-DAG: %[[GenY:.*]] = OpSpecConstantOp %[[GenPtr]] PtrCastToGeneric %[[Y]]
+; CHECK-DAG: %[[CWY:.*]] = OpSpecConstantOp %[[CWPtrChar]] GenericCastToPtr %[[GenY]]
 ; CHECK-DAG: %[[CWXY:.*]] = OpConstantComposite %[[Arr1]] %[[CWX]] %[[CWY]]
 ; CHECK-DAG: %[[CWYX:.*]] = OpConstantComposite %[[Arr1]] %[[CWY]] %[[CWX]]
 ; CHECK-DAG: %[[CG3:.*]] = OpConstantComposite %[[Struct1]] %[[CWXY]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll
index 16c20f9067e6..0fd2f622dc84 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll
@@ -11,9 +11,9 @@
 ; CHECK-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyI64]] %[[TyI64]]
 ; CHECK-DAG: %[[Const128:.*]] = OpConstant %[[TyI64]] 128
 ; CHECK-DAG: %[[GlobalValue]] = OpVariable
-; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] 117 %[[GlobalValue]]
+; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] ConvertPtrToU %[[GlobalValue]]
 ; TODO: The following bitcast line looks unneeded and we may expect it to be removed in future
-; CHECK-DAG: %[[UseGlobalValue:.*]] = OpSpecConstantOp %[[TyI64]] 124 %[[PtrToInt]]
+; CHECK-DAG: %[[UseGlobalValue:.*]] = OpSpecConstantOp %[[TyI64]] Bitcast %[[PtrToInt]]
 ; CHECK-DAG: %[[ConstComposite:.*]] = OpConstantComposite %[[TyStruct]] %[[Const128]] %[[UseGlobalValue]]
 ; CHECK-DAG: %[[TyPtrStruct:.*]] = OpTypePointer CrossWorkgroup %[[TyStruct]]
 ; CHECK: OpVariable %[[TyPtrStruct]] CrossWorkgroup %[[ConstComposite]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll b/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll
index c2738229aa4d..f5abcd38d040 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll
@@ -12,7 +12,7 @@
 ; CHECK-SPIRV64-DAG: %[[#IntTy:]] = OpTypeInt 64 0
 ; CHECK-SPIRV32-DAG: %[[#IntTy:]] = OpTypeInt 32 0
 ; CHECK-SPIRV-DAG: %[[#Const2:]] = OpConstant %[[#IntTy]] 2
-; CHECK-SPIRV-DAG: %[[#]] = OpSpecConstantOp %[[#]] 70 %[[#]] %[[#]] %[[#Const2]]
+; CHECK-SPIRV-DAG: %[[#]] = OpSpecConstantOp %[[#]] InBoundsPtrAccessChain %[[#]] %[[#]] %[[#Const2]]
 ; CHECK-SPIRV: OpFunction
 
 @a_var = addrspace(1) global [2 x i8] [i8 1, i8 1]

From 42c82fcc29c1c8e19b2265495a5d8f59fb5ea764 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 20:19:26 +0200
Subject: [PATCH 0026/1322] [libc++] Upgrade to GCC 15 (#138293)

---
 .github/workflows/libcxx-build-and-test.yaml      |  8 ++++----
 libcxx/docs/index.rst                             |  2 +-
 libcxx/src/experimental/time_zone.cpp             |  9 +++++++++
 .../alg.contains/ranges.contains.pass.cpp         |  4 ++--
 .../equality_comparable.compile.pass.cpp          |  6 ++++++
 .../equality_comparable_with.compile.pass.cpp     | 15 +++++++++++++++
 .../totally_ordered.compile.pass.cpp              |  3 +++
 .../totally_ordered_with.compile.pass.cpp         | 10 ++++++++++
 .../new.delete.array/new.size.except.pass.cpp     |  3 +++
 .../new.delete/new.delete.array/new.size.pass.cpp |  3 +++
 .../new.size_align.except.pass.cpp                |  3 +++
 .../new.delete.array/new.size_align.pass.cpp      |  3 +++
 .../new.delete.single/new.size.except.pass.cpp    |  3 +++
 .../new.delete.single/new.size.pass.cpp           |  3 +++
 .../new.size_align.except.pass.cpp                |  3 +++
 .../new.delete.single/new.size_align.pass.cpp     |  3 +++
 .../rand.dist.samp.discrete/ctor_func.pass.cpp    |  3 +++
 .../param_ctor_func.pass.cpp                      |  3 +++
 .../range.lazy.split/general.pass.cpp             | 12 ++++++++++++
 .../expected.expected/monadic/transform.pass.cpp  |  4 ++--
 .../monadic/transform_error.pass.cpp              |  4 ++--
 .../monadic/transform_error.pass.cpp              |  4 ++--
 .../formatter.char_array.pass.cpp                 |  2 +-
 .../meta/meta.rel/is_virtual_base_of.pass.cpp     |  7 +++++++
 ...le.pass.cpp => dependent_return_type.pass.cpp} |  4 ++++
 .../meta.unary.prop/is_implicit_lifetime.pass.cpp |  2 +-
 .../make_optional_explicit.pass.cpp               |  3 +++
 ...ke_optional_explicit_initializer_list.pass.cpp |  3 +++
 .../tuple.tuple/tuple.cnstr/PR31384.pass.cpp      |  2 +-
 .../catch_member_function_pointer_02.pass.cpp     |  2 +-
 30 files changed, 119 insertions(+), 17 deletions(-)
 rename libcxx/test/std/utilities/meta/meta.unary/{dependent_return_type.compile.pass.cpp => dependent_return_type.pass.cpp} (94%)

diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 80f2432b78de..f0bdf6c0b589 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -52,8 +52,8 @@ jobs:
         cxx: [ 'clang++-21' ]
         include:
           - config: 'generic-gcc'
-            cc: 'gcc-14'
-            cxx: 'g++-14'
+            cc: 'gcc-15'
+            cxx: 'g++-15'
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: ${{ matrix.config }}.${{ matrix.cxx }}
@@ -92,8 +92,8 @@ jobs:
         cxx: [ 'clang++-21' ]
         include:
           - config: 'generic-gcc-cxx11'
-            cc: 'gcc-14'
-            cxx: 'g++-14'
+            cc: 'gcc-15'
+            cxx: 'g++-15'
           - config: 'generic-cxx26'
             cc: 'clang-20'
             cxx: 'clang++-20'
diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst
index 9c957e9d20cb..ae9cc87c797f 100644
--- a/libcxx/docs/index.rst
+++ b/libcxx/docs/index.rst
@@ -135,7 +135,7 @@ Compiler     Versions            Restrictions               Support policy
 Clang        19, 20, 21-git                                 latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version
 AppleClang   15                                             latest stable release per `Xcode's release page <https://developer.apple.com/documentation/xcode-release-notes>`_
 Open XL      17.1.3 (AIX)                                   latest stable release per `Open XL's documentation page <https://www.ibm.com/docs/en/openxl-c-and-cpp-aix>`_
-GCC          14                  In C++11 or later only     latest stable release per `GCC's release page <https://gcc.gnu.org/releases.html>`_
+GCC          15                  In C++11 or later only     latest stable release per `GCC's release page <https://gcc.gnu.org/releases.html>`_
 ============ =================== ========================== =====================
 
 Libc++ also supports common platforms and architectures:
diff --git a/libcxx/src/experimental/time_zone.cpp b/libcxx/src/experimental/time_zone.cpp
index 289164ab1203..a735800b6031 100644
--- a/libcxx/src/experimental/time_zone.cpp
+++ b/libcxx/src/experimental/time_zone.cpp
@@ -29,6 +29,15 @@
 // These quirks often use a 12h interval; this is the scan interval of zdump,
 // which implies there are no sys_info objects with a duration of less than 12h.
 
+// Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120502
+
+#include <__config>
+
+// TODO(LLVM 23): When upgrading to GCC 16 this can be removed
+#ifdef _LIBCPP_COMPILER_GCC
+#  pragma GCC optimize("-O0")
+#endif
+
 #include <algorithm>
 #include <cctype>
 #include <chrono>
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp
index 08d8e119a4d2..1e89cd272e64 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp
@@ -195,7 +195,7 @@ constexpr bool test() {
       std::string a[] = {str1, str1, str, str1, str1};
       auto whole =
           std::ranges::subrange(forward_iterator(std::move_iterator(a)), forward_iterator(std::move_iterator(a + 5)));
-      bool ret = std::ranges::contains(whole.begin(), whole.end(), "hello world", [&](const std::string i) {
+      bool ret = std::ranges::contains(whole.begin(), whole.end(), +"hello world", [&](const std::string i) {
         ++projection_count;
         return i;
       });
@@ -207,7 +207,7 @@ constexpr bool test() {
       std::string a[] = {str1, str1, str, str1, str1};
       auto whole =
           std::ranges::subrange(forward_iterator(std::move_iterator(a)), forward_iterator(std::move_iterator(a + 5)));
-      bool ret = std::ranges::contains(whole, "hello world", [&](const std::string i) {
+      bool ret = std::ranges::contains(whole, +"hello world", [&](const std::string i) {
         ++projection_count;
         return i;
       });
diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp
index ca0f40eb77d4..0531c0e096a1 100644
--- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp
@@ -26,6 +26,7 @@
 #include <vector>
 
 #include "compare_types.h"
+#include "test_macros.h"
 
 namespace fundamentals {
 static_assert(std::equality_comparable<int>);
@@ -43,7 +44,12 @@ static_assert(std::equality_comparable<unsigned char&&>);
 static_assert(std::equality_comparable<unsigned short const&&>);
 static_assert(std::equality_comparable<unsigned int volatile&&>);
 static_assert(std::equality_comparable<unsigned long const volatile&&>);
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(std::equality_comparable<int[5]>);
+#else
+static_assert(!std::equality_comparable<int[5]>);
+#endif
 static_assert(std::equality_comparable<int (*)(int)>);
 static_assert(std::equality_comparable<int (&)(int)>);
 static_assert(std::equality_comparable<int (*)(int) noexcept>);
diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp
index 0afbe582ba89..2f8d7862c0f4 100644
--- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp
@@ -107,7 +107,12 @@ static_assert(!check_equality_comparable_with < int,
               int (S::*)() const volatile&& noexcept > ());
 
 static_assert(check_equality_comparable_with<int*, int*>());
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_equality_comparable_with<int*, int[5]>());
+#else
+static_assert(!check_equality_comparable_with<int*, int[5]>());
+#endif
 static_assert(!check_equality_comparable_with<int*, int (*)()>());
 static_assert(!check_equality_comparable_with<int*, int (&)()>());
 static_assert(!check_equality_comparable_with<int*, int (S::*)()>());
@@ -148,7 +153,12 @@ static_assert(
 static_assert(!check_equality_comparable_with < int*,
               int (S::*)() const volatile&& noexcept > ());
 
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_equality_comparable_with<int[5], int[5]>());
+#else
+static_assert(!check_equality_comparable_with<int[5], int[5]>());
+#endif
 static_assert(!check_equality_comparable_with<int[5], int (*)()>());
 static_assert(!check_equality_comparable_with<int[5], int (&)()>());
 static_assert(!check_equality_comparable_with<int[5], int (S::*)()>());
@@ -942,7 +952,12 @@ static_assert(
 
 static_assert(!check_equality_comparable_with<std::nullptr_t, int>());
 static_assert(check_equality_comparable_with<std::nullptr_t, int*>());
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_equality_comparable_with<std::nullptr_t, int[5]>());
+#else
+static_assert(!check_equality_comparable_with<std::nullptr_t, int[5]>());
+#endif
 static_assert(check_equality_comparable_with<std::nullptr_t, int (*)()>());
 static_assert(check_equality_comparable_with<std::nullptr_t, int (&)()>());
 static_assert(check_equality_comparable_with<std::nullptr_t, int (S::*)()>());
diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp
index 6f8324eaf764..5959f70cf396 100644
--- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp
@@ -55,7 +55,10 @@ static_assert(models_totally_ordered<unsigned char&&>());
 static_assert(models_totally_ordered<unsigned short const&&>());
 static_assert(models_totally_ordered<unsigned int volatile&&>());
 static_assert(models_totally_ordered<unsigned long const volatile&&>());
+// Array comparisons are ill-formed in C++26
+#if TEST_STD_VER <= 23
 static_assert(models_totally_ordered<int[5]>());
+#endif
 static_assert(models_totally_ordered<int (*)(int)>());
 static_assert(models_totally_ordered<int (&)(int)>());
 static_assert(models_totally_ordered<int (*)(int) noexcept>());
diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp
index dffc33265aeb..398ef445baf9 100644
--- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp
@@ -89,7 +89,12 @@ static_assert(!check_totally_ordered_with<int, int (S::*)() const volatile&&>())
 static_assert(!check_totally_ordered_with < int, int (S::*)() const volatile&& noexcept > ());
 
 static_assert(check_totally_ordered_with<int*, int*>());
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_totally_ordered_with<int*, int[5]>());
+#else
+static_assert(!check_totally_ordered_with<int*, int[5]>());
+#endif
 static_assert(!check_totally_ordered_with<int*, int (*)()>());
 static_assert(!check_totally_ordered_with<int*, int (&)()>());
 static_assert(!check_totally_ordered_with<int*, int (S::*)()>());
@@ -117,7 +122,12 @@ static_assert(!check_totally_ordered_with < int*, int (S::*)() volatile&& noexce
 static_assert(!check_totally_ordered_with<int*, int (S::*)() const volatile&&>());
 static_assert(!check_totally_ordered_with < int*, int (S::*)() const volatile&& noexcept > ());
 
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_totally_ordered_with<int[5], int[5]>());
+#else
+static_assert(!check_totally_ordered_with<int[5], int[5]>());
+#endif
 static_assert(!check_totally_ordered_with<int[5], int (*)()>());
 static_assert(!check_totally_ordered_with<int[5], int (&)()>());
 static_assert(!check_totally_ordered_with<int[5], int (S::*)()>());
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp
index 6a2b098c1b57..9ee32b841783 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp
@@ -9,6 +9,9 @@
 // UNSUPPORTED: no-exceptions
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <new>
 #include <cassert>
 #include <limits>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp
index 437d06430773..4fdcc3b535a8 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp
@@ -11,6 +11,9 @@
 // asan and msan will not call the new handler.
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <new>
 #include <cstddef>
 #include <cassert>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp
index 4e34ebcb46c7..4dfaf7a30d7a 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp
@@ -9,6 +9,9 @@
 // UNSUPPORTED: no-exceptions
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 // Libc++ when built for z/OS doesn't contain the aligned allocation functions,
 // nor does the dynamic library shipped with z/OS.
 // XFAIL: target={{.+}}-zos{{.*}}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp
index c9b59ecaff39..a1b8466340a2 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp
@@ -13,6 +13,9 @@
 // asan and msan will not call the new handler.
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 // Libc++ when built for z/OS doesn't contain the aligned allocation functions,
 // nor does the dynamic library shipped with z/OS.
 // XFAIL: target={{.+}}-zos{{.*}}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp
index 6a515555e6db..346e881d016b 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp
@@ -9,6 +9,9 @@
 // UNSUPPORTED: no-exceptions
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <new>
 #include <cassert>
 #include <limits>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp
index 729ef3ec46b0..0013dd3d0cbc 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp
@@ -11,6 +11,9 @@
 // asan and msan will not call the new handler.
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <new>
 #include <cstddef>
 #include <cassert>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp
index 7694314c87bf..fbeb880c83d8 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp
@@ -9,6 +9,9 @@
 // UNSUPPORTED: no-exceptions
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 // Libc++ when built for z/OS doesn't contain the aligned allocation functions,
 // nor does the dynamic library shipped with z/OS.
 // XFAIL: target={{.+}}-zos{{.*}}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp
index 5d321f08282b..59ecbe205513 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp
@@ -13,6 +13,9 @@
 // asan and msan will not call the new handler.
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 // Libc++ when built for z/OS doesn't contain the aligned allocation functions,
 // nor does the dynamic library shipped with z/OS.
 // XFAIL: target={{.+}}-zos{{.*}}
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp
index c3a88af92d36..c05a9434175a 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp
@@ -15,6 +15,9 @@
 //     discrete_distribution(size_t nw, double xmin, double xmax,
 //                           UnaryOperation fw);
 
+// There is a bogus diagnostic about a too large allocation
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <random>
 
 #include <cassert>
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp
index 7ef936b7fc35..206bf5a0eb8a 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp
@@ -15,6 +15,9 @@
 //     param_type(size_t nw, double xmin, double xmax,
 //                           UnaryOperation fw);
 
+// There is a bogus diagnostic about a too large allocation
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <random>
 
 #include <cassert>
diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp
index f4e87bb47399..521c0b1610bc 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp
@@ -312,7 +312,10 @@ constexpr bool main_test() {
   // Leading separator.
   {
     std::array expected = {""sv, "abc"sv, "def"sv};
+// FIXME: Why does GCC complain here?
+#ifndef TEST_COMPILER_GCC
     test_one(" abc def"sv, short_sep, expected);
+#endif
     test_one("12abc12def"sv, long_sep, expected);
   }
 
@@ -326,7 +329,10 @@ constexpr bool main_test() {
   // Input consisting of a single separator.
   {
     std::array expected = {""sv, ""sv};
+// FIXME: Why does GCC complain here?
+#ifndef TEST_COMPILER_GCC
     test_one(" "sv, short_sep, expected);
+#endif
     test_one("12"sv, long_sep, expected);
   }
 
@@ -354,7 +360,10 @@ constexpr bool main_test() {
   // Separators after every character.
   {
     std::array expected = {""sv, "a"sv, "b"sv, "c"sv, ""sv};
+// FIXME: Why does GCC complain here?
+#ifndef TEST_COMPILER_GCC
     test_one(" a b c "sv, short_sep, expected);
+#endif
     test_one("12a12b12c12"sv, long_sep, expected);
   }
 
@@ -383,7 +392,10 @@ constexpr bool main_test() {
   // Terminating null as a separator.
   {
     std::array expected = {"abc"sv, "def"sv};
+// FIXME: Why does GCC complain here?
+#ifndef TEST_COMPILER_GCC
     test_one("abc\0def"sv, '\0', expected);
+#endif
     test_one("abc\0\0def"sv, "\0\0"sv, expected);
   }
 
diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp
index cbd54d623c0f..97c1e4a40f35 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp
@@ -9,8 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`,
-// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333
-// XFAIL: gcc-14
+// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995
+// XFAIL: gcc-14, gcc-15
 
 // <expected>
 
diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp
index a19e17b01f6a..9570b2faac69 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp
@@ -9,8 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`,
-// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333.
-// XFAIL: gcc-14
+// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995.
+// XFAIL: gcc-14, gcc-15
 
 // <expected>
 
diff --git a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp
index f6d3011d1ea9..2ec15b51d11e 100644
--- a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp
@@ -9,8 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`,
-// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333
-// XFAIL: gcc-14
+// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995
+// XFAIL: gcc-14, gcc-15
 
 // <expected>
 
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
index bc056db9e254..8c4f3000ec1e 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
@@ -8,7 +8,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // TODO FMT __builtin_memcpy isn't constexpr in GCC
-// UNSUPPORTED: gcc-14
+// UNSUPPORTED: gcc-14, gcc-15
 
 // <format>
 
diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp
index f443d2030961..47c95c64a085 100644
--- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp
@@ -18,6 +18,8 @@
 #include <type_traits>
 #include <cassert>
 
+#include "test_macros.h"
+
 template <bool expected, class Base, class Derived>
 void test() {
   // Test the type of the variables
@@ -98,8 +100,13 @@ int main(int, char**) {
 
   // Test with virtual inheritance
   {
+#ifdef TEST_COMPILER_GCC // FIXME: Is this a GCC or Clang bug? Or is the standards wording ambiguous?
+    test<true, Base, Derived3Virtual>();
+    test<true, Derived, Derived3Virtual>();
+#else
     test<false, Base, Derived3Virtual>();
     test<false, Derived, Derived3Virtual>();
+#endif
     test<true, Derived2b, Derived3Virtual>();
     test<true, Derived2a, Derived3Virtual>();
     test<true, Base, DerivedPrivate>();
diff --git a/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp
similarity index 94%
rename from libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp
rename to libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp
index 935a6e3db001..37d66831c7ce 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp
@@ -168,3 +168,7 @@ void instantiate() {
   void_t<int>();
 #endif
 }
+
+// This is not a .compile.pass.cpp because we want to ensure that GCC doesn't complain about incorrect builtins usage,
+// which only happens during CodeGen.
+int main(int, char**) { return 0; }
diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp
index 681ad13a07df..afd76e65060e 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // These compilers don't support __builtin_is_implicit_lifetime yet.
-// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16, apple-clang-17
+// UNSUPPORTED: clang-18, clang-19, gcc-14, gcc-15, apple-clang-15, apple-clang-16, apple-clang-17
 
 // <type_traits>
 
diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
index e7931e07e31d..23f131d2fc49 100644
--- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
@@ -12,6 +12,9 @@
 // template <class T, class... Args>
 //   constexpr optional<T> make_optional(Args&&... args);
 
+// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577
+// XFAIL: gcc-15
+
 #include <optional>
 #include <string>
 #include <memory>
diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp
index 80371d633371..5ddb229ad926 100644
--- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp
@@ -12,6 +12,9 @@
 // template <class T, class U, class... Args>
 //   constexpr optional<T> make_optional(initializer_list<U> il, Args&&... args);
 
+// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577
+// XFAIL: gcc-15
+
 #include <cassert>
 #include <memory>
 #include <optional>
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp
index e6812e9a3a30..ae5984c15530 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03
 
 // FIXME: Why does this start to fail with GCC 14?
-// XFAIL: !(c++11 || c++14) && gcc-14
+// XFAIL: !(c++11 || c++14) && (gcc-14 || gcc-15)
 
 // See https://llvm.org/PR31384.
 
diff --git a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp
index 5d702031ce35..ec400713620c 100644
--- a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp
+++ b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp
@@ -12,7 +12,7 @@
 
 // GCC supports noexcept function types but this test still fails.
 // This is likely a bug in their implementation. Investigation needed.
-// XFAIL: gcc-14
+// XFAIL: gcc-14, gcc-15
 
 #include <cassert>
 

From 806333063ff9a09ca001dcd77d4d5d6f0b9ecd74 Mon Sep 17 00:00:00 2001
From: Jesse Huang <jesse.huang@sifive.com>
Date: Thu, 12 Jun 2025 02:24:10 +0800
Subject: [PATCH 0027/1322] [RISCV] Guard the alternative static chain register
 use on ILP32E/LP64E (#142715)

Asserts the use of t3(x28) as the static chain register when branch control flow protection is enabled with ILP32E/LP64E, because such register is not present within the ABI.
---
 llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 24 ++++++++++++++--------
 llvm/test/CodeGen/RISCV/nest-register.ll   |  3 +++
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
index e0d1fb2facc8..cb6117eb0917 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
@@ -333,15 +333,23 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
   unsigned XLen = Subtarget.getXLen();
   MVT XLenVT = Subtarget.getXLenVT();
 
-  // Static chain parameter must not be passed in normal argument registers,
-  // so we assign t2/t3 for it as done in GCC's __builtin_call_with_static_chain
-  bool HasCFBranch =
-      Subtarget.hasStdExtZicfilp() &&
-      MF.getFunction().getParent()->getModuleFlag("cf-protection-branch");
-  // Normal: t2, Branch control flow protection: t3
-  const auto StaticChainReg = HasCFBranch ? RISCV::X28 : RISCV::X7;
-
   if (ArgFlags.isNest()) {
+    // Static chain parameter must not be passed in normal argument registers,
+    // so we assign t2/t3 for it as done in GCC's
+    // __builtin_call_with_static_chain
+    bool HasCFBranch =
+        Subtarget.hasStdExtZicfilp() &&
+        MF.getFunction().getParent()->getModuleFlag("cf-protection-branch");
+
+    // Normal: t2, Branch control flow protection: t3
+    const auto StaticChainReg = HasCFBranch ? RISCV::X28 : RISCV::X7;
+
+    RISCVABI::ABI ABI = Subtarget.getTargetABI();
+    if (HasCFBranch &&
+        (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E))
+      reportFatalUsageError(
+          "Nested functions with control flow protection are not "
+          "usable with ILP32E or LP64E ABI.");
     if (MCRegister Reg = State.AllocateReg(StaticChainReg)) {
       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
       return false;
diff --git a/llvm/test/CodeGen/RISCV/nest-register.ll b/llvm/test/CodeGen/RISCV/nest-register.ll
index 9f8e4e1a2d8d..6e892e05c429 100644
--- a/llvm/test/CodeGen/RISCV/nest-register.ll
+++ b/llvm/test/CodeGen/RISCV/nest-register.ll
@@ -5,6 +5,8 @@
 ; RUN:   | FileCheck -check-prefix=RV64I %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfilp -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV64I-ZICFILP %s
+; RUN: not llc -mtriple=riscv64 -target-abi=lp64e -mattr=+experimental-zicfilp \
+; RUN:   -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=LP64E-ZICFILP %s
 
 ; Tests that the 'nest' parameter attribute causes the relevant parameter to be
 ; passed in the right register.
@@ -63,6 +65,7 @@ define ptr @nest_caller(ptr %arg) nounwind {
   ret ptr %result
 }
 
+; LP64E-ZICFILP: LLVM ERROR: Nested functions with control flow protection are not usable with ILP32E or LP64E ABI.
 !llvm.module.flags = !{!0}
 
 !0 = !{i32 8, !"cf-protection-branch", i32 1}

From 7a0c9f607a26b77a7e584fd6734f03b7ee40ca95 Mon Sep 17 00:00:00 2001
From: Tony Varghese <tonypalampalliyil@gmail.com>
Date: Wed, 11 Jun 2025 23:56:15 +0530
Subject: [PATCH 0028/1322] [NFC][PowerPC] Pre-commit test case for
 exploitation of xxeval for the pattern ternary(A,X,or(B,C)) (#143693)

Pre-commit test case for exploitation of `xxeval` for ternary operations
of the pattern `ternary(A,X,or(B,C))`.
Exploitation of `xxeval` to be added later.

Co-authored-by: Tony Varghese <tony.varghese@ibm.com>
---
 .../CodeGen/PowerPC/xxeval-vselect-x-or.ll    | 268 ++++++++++++++++++
 1 file changed, 268 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll

diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll
new file mode 100644
index 000000000000..1ad7e95e3682
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll
@@ -0,0 +1,268 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test file to verify the emission of Vector selection instructions when ternary operators are used.
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; Function to test ternary(A, and(B, C), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_and_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_and_BC_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxland vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %and = and <4 x i32> %B, %C
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %and, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, and(B, C), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_and_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_and_BC_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxland vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %and = and <2 x i64> %B, %C
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %and, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+; Function to test ternary(A, B, or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_B_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_B_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlor vs0, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs0, v3, v2
+; CHECK-NEXT:    blr
+entry:
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %B, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, B, or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_B_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_B_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlor vs0, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs0, v3, v2
+; CHECK-NEXT:    blr
+entry:
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %B, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+
+; Function to test ternary(A, C, or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_C_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_C_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlor vs0, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs0, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %C, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, C, or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_C_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_C_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlor vs0, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs0, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %C, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+
+; Function to test ternary(A, eqv(B,C), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_eqv_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_eqv_BC_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxleqv vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %xor = xor <4 x i32> %B, %C
+  %eqv = xor <4 x i32> %xor, <i32 -1, i32 -1, i32 -1, i32 -1>  ; Vector eqv operation
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %eqv, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, eqv(B,C), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_eqv_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_eqv_BC_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxleqv vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %xor = xor <2 x i64> %B, %C
+  %eqv = xor <2 x i64> %xor, <i64 -1, i64 -1>  ; Vector eqv operation
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %eqv, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+; Function to test ternary(A, not(C), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_not_C_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_not_C_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlnor vs0, v4, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %not = xor <4 x i32> %C, <i32 -1, i32 -1, i32 -1, i32 -1>  ; Vector not operation
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, not(C), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_not_C_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_not_C_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlnor vs0, v4, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %not = xor <2 x i64> %C, <i64 -1, i64 -1>  ; Vector not operation
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+; Function to test ternary(A, not(B), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_not_B_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_not_B_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlnor vs0, v3, v3
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %not = xor <4 x i32> %B, <i32 -1, i32 -1, i32 -1, i32 -1>  ; Vector not operation
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, not(B), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_not_B_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_not_B_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlnor vs0, v3, v3
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %not = xor <2 x i64> %B, <i64 -1, i64 -1>  ; Vector not operation
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+; Function to test ternary(A, nand(B,C), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_nand_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_nand_BC_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlnand vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %and = and <4 x i32> %B, %C
+  %nand = xor <4 x i32> %and, <i32 -1, i32 -1, i32 -1, i32 -1>  ; Vector nand operation
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %nand, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, nand(B,C), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_nand_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_nand_BC_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlnand vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %and = and <2 x i64> %B, %C
+  %nand = xor <2 x i64> %and, <i64 -1, i64 -1>  ; Vector nand operation
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %nand, <2 x i64> %or
+  ret <2 x i64> %res
+}

From 8d7da9a2a40302af25ee70841a4b549f4ed5ee8a Mon Sep 17 00:00:00 2001
From: Yifei Xu <yifei.xu@utexas.edu>
Date: Wed, 11 Jun 2025 13:33:23 -0500
Subject: [PATCH 0029/1322] Update BUILD.bazel

Add missing dependency after https://github.com/llvm/llvm-project/pull/142916.
---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index c1d63de04b8f..f6a7cd7dea85 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6882,6 +6882,7 @@ cc_library(
         ":SPIRVDialect",
         ":Support",
         "//llvm:config",
+        "//llvm:Support",
     ],
 )
 

From 773d357b9882fe0e30ffddee5ac1fbe2254fac05 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 20:39:45 +0200
Subject: [PATCH 0030/1322] [libc++] Simplify the implementation of
 __next_prime a bit (#143512)

---
 libcxx/src/hash.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/libcxx/src/hash.cpp b/libcxx/src/hash.cpp
index 41c4eb480a5f..50d8cf9f9f53 100644
--- a/libcxx/src/hash.cpp
+++ b/libcxx/src/hash.cpp
@@ -9,7 +9,6 @@
 #include <__hash_table>
 #include <algorithm>
 #include <stdexcept>
-#include <type_traits>
 
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wtautological-constant-out-of-range-compare")
 
@@ -52,16 +51,15 @@ const unsigned indices[] = {
 // are fewer potential primes to search, and fewer potential primes to divide
 // against.
 
-template <size_t _Sz = sizeof(size_t)>
-inline _LIBCPP_HIDE_FROM_ABI typename enable_if<_Sz == 4, void>::type __check_for_overflow(size_t N) {
-  if (N > 0xFFFFFFFB)
-    std::__throw_overflow_error("__next_prime overflow");
-}
-
-template <size_t _Sz = sizeof(size_t)>
-inline _LIBCPP_HIDE_FROM_ABI typename enable_if<_Sz == 8, void>::type __check_for_overflow(size_t N) {
-  if (N > 0xFFFFFFFFFFFFFFC5ull)
-    std::__throw_overflow_error("__next_prime overflow");
+inline void __check_for_overflow(size_t N) {
+  if constexpr (sizeof(size_t) == 4) {
+    if (N > 0xFFFFFFFB)
+      std::__throw_overflow_error("__next_prime overflow");
+  } else {
+    static_assert(sizeof(size_t) == 8);
+    if (N > 0xFFFFFFFFFFFFFFC5ull)
+      std::__throw_overflow_error("__next_prime overflow");
+  }
 }
 
 size_t __next_prime(size_t n) {

From 8dc63ca59003a4b72217221c1c801237614c9d7d Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Wed, 11 Jun 2025 11:47:09 -0700
Subject: [PATCH 0031/1322] Make
 clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c write output file
 to temp dir

---
 clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c
index 5d65fdafaa25..d761e12e8392 100644
--- a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c
+++ b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c
@@ -57,7 +57,7 @@
 // RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_VIA_XCLANG
 
 // However, sve2 is actually enabled in clang but disabled for MC.
-// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s \
+// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s -o %t \
 // RUN:     -Xclang -target-feature -Xclang -sve \
 // RUN:     -Xclang -verify -Xclang -verify-ignore-unexpected=note
 

From 0c62571d9f02f7d5c1a649b5b20fdf5b0f6bb41c Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 20:57:07 +0200
Subject: [PATCH 0032/1322] [libc++] Remove static_assert from hash.cpp that
 fires unconditionall

---
 libcxx/src/hash.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libcxx/src/hash.cpp b/libcxx/src/hash.cpp
index 50d8cf9f9f53..e1e6d2b4c2bd 100644
--- a/libcxx/src/hash.cpp
+++ b/libcxx/src/hash.cpp
@@ -56,7 +56,6 @@ inline void __check_for_overflow(size_t N) {
     if (N > 0xFFFFFFFB)
       std::__throw_overflow_error("__next_prime overflow");
   } else {
-    static_assert(sizeof(size_t) == 8);
     if (N > 0xFFFFFFFFFFFFFFC5ull)
       std::__throw_overflow_error("__next_prime overflow");
   }

From 02b6849cf1feb425885bf6f5ee505d5cd4a824d7 Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba@intel.com>
Date: Wed, 11 Jun 2025 12:03:55 -0700
Subject: [PATCH 0033/1322] [Clang][OpenMP] Fix mapping of arrays of structs
 with members with mappers (#142511)

This builds upon #101101 from @jyu2-git, which used compiler-generated
mappers when mapping an array-section of structs with members that have
user-defined default mappers.

Now we do the same when mapping arrays of structs.
---
 clang/docs/ReleaseNotes.rst                   |   3 +
 clang/lib/Sema/SemaOpenMP.cpp                 |  38 ++-
 ...of_structs_with_nested_mapper_ast_dump.cpp |  34 ++
 ..._of_structs_with_nested_mapper_codegen.cpp | 323 ++++++++++++++++++
 ...f_structs_with_nested_mapper_ast_dump.cpp} |   0
 ...of_structs_with_nested_mapper_codegen.cpp} |   0
 ...re_mapper_nested_default_mappers_array.cpp |   6 +-
 7 files changed, 388 insertions(+), 16 deletions(-)
 create mode 100644 clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp
 create mode 100644 clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp
 rename clang/test/OpenMP/{target_map_nest_defalut_mapper_ast_dump.cpp => target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp} (100%)
 rename clang/test/OpenMP/{target_map_nest_defalut_mapper_codegen.cpp => target_map_array_section_of_structs_with_nested_mapper_codegen.cpp} (100%)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b5e6cf088a4b..8043ab48f0b4 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1102,6 +1102,9 @@ OpenMP Support
 - An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have
   an argument larger than what can fit within a 64-bit integer.
 - Added support for private variable reduction.
+- Fixed mapping of arrays of structs containing nested structs with user defined
+  mappers, by using compiler-generated default mappers for the outer structs for
+  such maps.
 
 Improvements
 ^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index a3395ac157d9..2cbe79c5c07c 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -22057,20 +22057,34 @@ static void checkMappableExpressionList(
         Type.getCanonicalType(), UnresolvedMapper);
     if (ER.isInvalid())
       continue;
-    if (!ER.get() && isa<ArraySectionExpr>(VE)) {
-      // Create implicit mapper as needed.
-      QualType BaseType = VE->getType().getCanonicalType();
-      if (BaseType->isSpecificBuiltinType(BuiltinType::ArraySection)) {
-        const auto *OASE = cast<ArraySectionExpr>(VE->IgnoreParenImpCasts());
-        QualType BType = ArraySectionExpr::getBaseOriginalType(OASE->getBase());
-        QualType ElemType;
-        if (const auto *ATy = BType->getAsArrayTypeUnsafe())
-          ElemType = ATy->getElementType();
-        else
-          ElemType = BType->getPointeeType();
+
+    // If no user-defined mapper is found, we need to create an implicit one for
+    // arrays/array-sections on structs that have members that have
+    // user-defined mappers. This is needed to ensure that the mapper for the
+    // member is invoked when mapping each element of the array/array-section.
+    if (!ER.get()) {
+      QualType BaseType;
+
+      if (isa<ArraySectionExpr>(VE)) {
+        BaseType = VE->getType().getCanonicalType();
+        if (BaseType->isSpecificBuiltinType(BuiltinType::ArraySection)) {
+          const auto *OASE = cast<ArraySectionExpr>(VE->IgnoreParenImpCasts());
+          QualType BType =
+              ArraySectionExpr::getBaseOriginalType(OASE->getBase());
+          QualType ElemType;
+          if (const auto *ATy = BType->getAsArrayTypeUnsafe())
+            ElemType = ATy->getElementType();
+          else
+            ElemType = BType->getPointeeType();
+          BaseType = ElemType.getCanonicalType();
+        }
+      } else if (VE->getType()->isArrayType()) {
+        const ArrayType *AT = VE->getType()->getAsArrayTypeUnsafe();
+        const QualType ElemType = AT->getElementType();
         BaseType = ElemType.getCanonicalType();
       }
-      if (BaseType->getAsRecordDecl() &&
+
+      if (!BaseType.isNull() && BaseType->getAsRecordDecl() &&
           isImplicitMapperNeeded(SemaRef, DSAS, BaseType, VE)) {
         ER = buildImplicitMapper(SemaRef, BaseType, DSAS);
       }
diff --git a/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp
new file mode 100644
index 000000000000..a5847709d3e7
--- /dev/null
+++ b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp
@@ -0,0 +1,34 @@
+//RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -ast-dump  %s | FileCheck %s --check-prefix=DUM
+
+typedef struct {
+  int a;
+} C;
+#pragma omp declare mapper(C s) map(to : s.a)
+
+typedef struct {
+  int e;
+  C f;
+  int h;
+} D;
+
+void foo() {
+  D sa[10];
+  sa[1].e = 111;
+  sa[1].f.a = 222;
+
+#pragma omp target map(tofrom : sa)
+  {
+    sa[0].e = 333;
+    sa[1].f.a = 444;
+  }
+}
+
+// DUM: -OMPDeclareMapperDecl{{.*}}<<invalid sloc>> <invalid sloc>
+// DUM-NEXT:  |-OMPMapClause {{.*}}<<invalid sloc>> <implicit>
+// DUM-NEXT:  | |-MemberExpr {{.*}}<line:9:3> 'int' lvalue .e
+// DUM-NEXT:  | | `-DeclRefExpr {{.*}}<<invalid sloc>> 'D' lvalue Var {{.*}} '_s' 'D'
+// DUM-NEXT:  | |-MemberExpr {{.*}}<line:10:3> 'C' lvalue .f {{.*}}
+// DUM-NEXT:  | | `-DeclRefExpr {{.*}}<<invalid sloc>> 'D' lvalue Var {{.*}} '_s' 'D'
+// DUM-NEXT:  | `-MemberExpr {{.*}}<line:11:3> 'int' lvalue .h {{.*}}
+// DUM-NEXT:  |   `-DeclRefExpr {{.*}}<<invalid sloc>> 'D' lvalue Var {{.*}} '_s' 'D'
+// DUM-NEXT:  `-VarDecl {{.*}} <line:12:1> col:1 implicit used _s 'D'
diff --git a/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp
new file mode 100644
index 000000000000..5df1e958ad55
--- /dev/null
+++ b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp
@@ -0,0 +1,323 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex "\.offload_.*"
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+typedef struct {
+  int a;
+} C;
+#pragma omp declare mapper(C s) map(to : s.a)
+
+typedef struct {
+  int e;
+  C f;
+  int h;
+} D;
+
+void foo() {
+  D sa[10];
+  sa[1].e = 111;
+  sa[1].f.a = 222;
+
+#pragma omp target map(tofrom : sa)
+  {
+    sa[1].e = 333;
+    sa[1].f.a = 444;
+  }
+}
+#endif
+//.
+// CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 120]
+// CHECK: @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35]
+//.
+// CHECK-LABEL: define {{[^@]+}}@_Z3foov
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SA:%.*]] = alloca [10 x %struct.D], align 4
+// CHECK-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 1
+// CHECK-NEXT:    [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D:%.*]], ptr [[ARRAYIDX]], i32 0, i32 0
+// CHECK-NEXT:    store i32 111, ptr [[E]], align 4
+// CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 1
+// CHECK-NEXT:    [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1
+// CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0
+// CHECK-NEXT:    store i32 222, ptr [[A]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT:    store ptr [[SA]], ptr [[TMP0]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT:    store ptr [[SA]], ptr [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK-NEXT:    store ptr @.omp_mapper._ZTS1D.default, ptr [[TMP2]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK-NEXT:    store i32 3, ptr [[TMP5]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK-NEXT:    store i32 1, ptr [[TMP6]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK-NEXT:    store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK-NEXT:    store ptr @.offload_sizes, ptr [[TMP9]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK-NEXT:    store ptr @.offload_maptypes, ptr [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK-NEXT:    store ptr null, ptr [[TMP11]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK-NEXT:    store ptr [[DOTOFFLOAD_MAPPERS]], ptr [[TMP12]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK-NEXT:    store i64 0, ptr [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK-NEXT:    store i64 0, ptr [[TMP14]], align 8
+// CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4
+// CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK-NEXT:    store i32 0, ptr [[TMP17]], align 4
+// CHECK-NEXT:    [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26.region_id, ptr [[KERNEL_ARGS]])
+// CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK-NEXT:    br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK:       omp_offload.failed:
+// CHECK-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26(ptr [[SA]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    br label [[OMP_OFFLOAD_CONT]]
+// CHECK:       omp_offload.cont:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(120) [[SA:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[SA]], ptr [[SA_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SA_ADDR]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[TMP0]], i64 0, i64 1
+// CHECK-NEXT:    [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D:%.*]], ptr [[ARRAYIDX]], i32 0, i32 0
+// CHECK-NEXT:    store i32 333, ptr [[E]], align 4
+// CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[TMP0]], i64 0, i64 1
+// CHECK-NEXT:    [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1
+// CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0
+// CHECK-NEXT:    store i32 444, ptr [[A]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1D.default
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP6:%.*]] = udiv exact i64 [[TMP3]], 12
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr [[STRUCT_D:%.*]], ptr [[TMP2]], i64 [[TMP6]]
+// CHECK-NEXT:    [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = and i64 [[TMP4]], 8
+// CHECK-NEXT:    [[TMP9:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP10:%.*]] = and i64 [[TMP4]], 16
+// CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
+// CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP9]], [[TMP11]]
+// CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP12]]
+// CHECK-NEXT:    [[DOTOMP_ARRAY__INIT__DELETE:%.*]] = icmp eq i64 [[TMP8]], 0
+// CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[TMP13]], [[DOTOMP_ARRAY__INIT__DELETE]]
+// CHECK-NEXT:    br i1 [[TMP14]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]]
+// CHECK:       .omp.array..init:
+// CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP6]], 12
+// CHECK-NEXT:    [[TMP16:%.*]] = and i64 [[TMP4]], -4
+// CHECK-NEXT:    [[TMP17:%.*]] = or i64 [[TMP16]], 512
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP15]], i64 [[TMP17]], ptr [[TMP5]])
+// CHECK-NEXT:    br label [[OMP_ARRAYMAP_HEAD]]
+// CHECK:       omp.arraymap.head:
+// CHECK-NEXT:    [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]]
+// CHECK:       omp.arraymap.body:
+// CHECK-NEXT:    [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END20:%.*]] ]
+// CHECK-NEXT:    [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0
+// CHECK-NEXT:    [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 1
+// CHECK-NEXT:    [[H:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[H]], i32 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP18]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[E]] to i64
+// CHECK-NEXT:    [[TMP21:%.*]] = sub i64 [[TMP19]], [[TMP20]]
+// CHECK-NEXT:    [[TMP22:%.*]] = sdiv exact i64 [[TMP21]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+// CHECK-NEXT:    [[TMP23:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]])
+// CHECK-NEXT:    [[TMP24:%.*]] = shl i64 [[TMP23]], 48
+// CHECK-NEXT:    [[TMP25:%.*]] = add nuw i64 0, [[TMP24]]
+// CHECK-NEXT:    [[TMP26:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0
+// CHECK-NEXT:    br i1 [[TMP27]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]]
+// CHECK:       omp.type.alloc:
+// CHECK-NEXT:    [[TMP28:%.*]] = and i64 [[TMP25]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END:%.*]]
+// CHECK:       omp.type.alloc.else:
+// CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[TMP26]], 1
+// CHECK-NEXT:    br i1 [[TMP29]], label [[OMP_TYPE_TO:%.*]], label [[OMP_TYPE_TO_ELSE:%.*]]
+// CHECK:       omp.type.to:
+// CHECK-NEXT:    [[TMP30:%.*]] = and i64 [[TMP25]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.to.else:
+// CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[TMP26]], 2
+// CHECK-NEXT:    br i1 [[TMP31]], label [[OMP_TYPE_FROM:%.*]], label [[OMP_TYPE_END]]
+// CHECK:       omp.type.from:
+// CHECK-NEXT:    [[TMP32:%.*]] = and i64 [[TMP25]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.end:
+// CHECK-NEXT:    [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP28]], [[OMP_TYPE_ALLOC]] ], [ [[TMP30]], [[OMP_TYPE_TO]] ], [ [[TMP32]], [[OMP_TYPE_FROM]] ], [ [[TMP25]], [[OMP_TYPE_TO_ELSE]] ]
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 [[TMP22]], i64 [[OMP_MAPTYPE]], ptr null)
+// CHECK-NEXT:    [[TMP33:%.*]] = add nuw i64 281474976711171, [[TMP24]]
+// CHECK-NEXT:    [[TMP34:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP34]], 0
+// CHECK-NEXT:    br i1 [[TMP35]], label [[OMP_TYPE_ALLOC1:%.*]], label [[OMP_TYPE_ALLOC_ELSE2:%.*]]
+// CHECK:       omp.type.alloc1:
+// CHECK-NEXT:    [[TMP36:%.*]] = and i64 [[TMP33]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END6:%.*]]
+// CHECK:       omp.type.alloc.else2:
+// CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[TMP34]], 1
+// CHECK-NEXT:    br i1 [[TMP37]], label [[OMP_TYPE_TO3:%.*]], label [[OMP_TYPE_TO_ELSE4:%.*]]
+// CHECK:       omp.type.to3:
+// CHECK-NEXT:    [[TMP38:%.*]] = and i64 [[TMP33]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END6]]
+// CHECK:       omp.type.to.else4:
+// CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i64 [[TMP34]], 2
+// CHECK-NEXT:    br i1 [[TMP39]], label [[OMP_TYPE_FROM5:%.*]], label [[OMP_TYPE_END6]]
+// CHECK:       omp.type.from5:
+// CHECK-NEXT:    [[TMP40:%.*]] = and i64 [[TMP33]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END6]]
+// CHECK:       omp.type.end6:
+// CHECK-NEXT:    [[OMP_MAPTYPE7:%.*]] = phi i64 [ [[TMP36]], [[OMP_TYPE_ALLOC1]] ], [ [[TMP38]], [[OMP_TYPE_TO3]] ], [ [[TMP40]], [[OMP_TYPE_FROM5]] ], [ [[TMP33]], [[OMP_TYPE_TO_ELSE4]] ]
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 4, i64 [[OMP_MAPTYPE7]], ptr null)
+// CHECK-NEXT:    [[TMP41:%.*]] = add nuw i64 281474976711171, [[TMP24]]
+// CHECK-NEXT:    [[TMP42:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP43:%.*]] = icmp eq i64 [[TMP42]], 0
+// CHECK-NEXT:    br i1 [[TMP43]], label [[OMP_TYPE_ALLOC8:%.*]], label [[OMP_TYPE_ALLOC_ELSE9:%.*]]
+// CHECK:       omp.type.alloc8:
+// CHECK-NEXT:    [[TMP44:%.*]] = and i64 [[TMP41]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END13:%.*]]
+// CHECK:       omp.type.alloc.else9:
+// CHECK-NEXT:    [[TMP45:%.*]] = icmp eq i64 [[TMP42]], 1
+// CHECK-NEXT:    br i1 [[TMP45]], label [[OMP_TYPE_TO10:%.*]], label [[OMP_TYPE_TO_ELSE11:%.*]]
+// CHECK:       omp.type.to10:
+// CHECK-NEXT:    [[TMP46:%.*]] = and i64 [[TMP41]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END13]]
+// CHECK:       omp.type.to.else11:
+// CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[TMP42]], 2
+// CHECK-NEXT:    br i1 [[TMP47]], label [[OMP_TYPE_FROM12:%.*]], label [[OMP_TYPE_END13]]
+// CHECK:       omp.type.from12:
+// CHECK-NEXT:    [[TMP48:%.*]] = and i64 [[TMP41]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END13]]
+// CHECK:       omp.type.end13:
+// CHECK-NEXT:    [[OMP_MAPTYPE14:%.*]] = phi i64 [ [[TMP44]], [[OMP_TYPE_ALLOC8]] ], [ [[TMP46]], [[OMP_TYPE_TO10]] ], [ [[TMP48]], [[OMP_TYPE_FROM12]] ], [ [[TMP41]], [[OMP_TYPE_TO_ELSE11]] ]
+// CHECK-NEXT:    call void @.omp_mapper._ZTS1C.default(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[F]], i64 4, i64 [[OMP_MAPTYPE14]], ptr null) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP49:%.*]] = add nuw i64 281474976711171, [[TMP24]]
+// CHECK-NEXT:    [[TMP50:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[TMP50]], 0
+// CHECK-NEXT:    br i1 [[TMP51]], label [[OMP_TYPE_ALLOC15:%.*]], label [[OMP_TYPE_ALLOC_ELSE16:%.*]]
+// CHECK:       omp.type.alloc15:
+// CHECK-NEXT:    [[TMP52:%.*]] = and i64 [[TMP49]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END20]]
+// CHECK:       omp.type.alloc.else16:
+// CHECK-NEXT:    [[TMP53:%.*]] = icmp eq i64 [[TMP50]], 1
+// CHECK-NEXT:    br i1 [[TMP53]], label [[OMP_TYPE_TO17:%.*]], label [[OMP_TYPE_TO_ELSE18:%.*]]
+// CHECK:       omp.type.to17:
+// CHECK-NEXT:    [[TMP54:%.*]] = and i64 [[TMP49]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END20]]
+// CHECK:       omp.type.to.else18:
+// CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[TMP50]], 2
+// CHECK-NEXT:    br i1 [[TMP55]], label [[OMP_TYPE_FROM19:%.*]], label [[OMP_TYPE_END20]]
+// CHECK:       omp.type.from19:
+// CHECK-NEXT:    [[TMP56:%.*]] = and i64 [[TMP49]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END20]]
+// CHECK:       omp.type.end20:
+// CHECK-NEXT:    [[OMP_MAPTYPE21:%.*]] = phi i64 [ [[TMP52]], [[OMP_TYPE_ALLOC15]] ], [ [[TMP54]], [[OMP_TYPE_TO17]] ], [ [[TMP56]], [[OMP_TYPE_FROM19]] ], [ [[TMP49]], [[OMP_TYPE_TO_ELSE18]] ]
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[H]], i64 4, i64 [[OMP_MAPTYPE21]], ptr null)
+// CHECK-NEXT:    [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1
+// CHECK-NEXT:    [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]]
+// CHECK:       omp.arraymap.exit:
+// CHECK-NEXT:    [[OMP_ARRAYINIT_ISARRAY22:%.*]] = icmp sgt i64 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP57:%.*]] = and i64 [[TMP4]], 8
+// CHECK-NEXT:    [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP57]], 0
+// CHECK-NEXT:    [[TMP58:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY22]], [[DOTOMP_ARRAY__DEL__DELETE]]
+// CHECK-NEXT:    br i1 [[TMP58]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]]
+// CHECK:       .omp.array..del:
+// CHECK-NEXT:    [[TMP59:%.*]] = mul nuw i64 [[TMP6]], 12
+// CHECK-NEXT:    [[TMP60:%.*]] = and i64 [[TMP4]], -4
+// CHECK-NEXT:    [[TMP61:%.*]] = or i64 [[TMP60]], 512
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP59]], i64 [[TMP61]], ptr [[TMP5]])
+// CHECK-NEXT:    br label [[OMP_DONE]]
+// CHECK:       omp.done:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1C.default
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP6:%.*]] = udiv exact i64 [[TMP3]], 4
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[TMP2]], i64 [[TMP6]]
+// CHECK-NEXT:    [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = and i64 [[TMP4]], 8
+// CHECK-NEXT:    [[TMP9:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP10:%.*]] = and i64 [[TMP4]], 16
+// CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
+// CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP9]], [[TMP11]]
+// CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP12]]
+// CHECK-NEXT:    [[DOTOMP_ARRAY__INIT__DELETE:%.*]] = icmp eq i64 [[TMP8]], 0
+// CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[TMP13]], [[DOTOMP_ARRAY__INIT__DELETE]]
+// CHECK-NEXT:    br i1 [[TMP14]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]]
+// CHECK:       .omp.array..init:
+// CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:    [[TMP16:%.*]] = and i64 [[TMP4]], -4
+// CHECK-NEXT:    [[TMP17:%.*]] = or i64 [[TMP16]], 512
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP15]], i64 [[TMP17]], ptr [[TMP5]])
+// CHECK-NEXT:    br label [[OMP_ARRAYMAP_HEAD]]
+// CHECK:       omp.arraymap.head:
+// CHECK-NEXT:    [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]]
+// CHECK:       omp.arraymap.body:
+// CHECK-NEXT:    [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END:%.*]] ]
+// CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP18:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]])
+// CHECK-NEXT:    [[TMP19:%.*]] = shl i64 [[TMP18]], 48
+// CHECK-NEXT:    [[TMP20:%.*]] = add nuw i64 1, [[TMP19]]
+// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 0
+// CHECK-NEXT:    br i1 [[TMP22]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]]
+// CHECK:       omp.type.alloc:
+// CHECK-NEXT:    [[TMP23:%.*]] = and i64 [[TMP20]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.alloc.else:
+// CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[TMP21]], 1
+// CHECK-NEXT:    br i1 [[TMP24]], label [[OMP_TYPE_TO:%.*]], label [[OMP_TYPE_TO_ELSE:%.*]]
+// CHECK:       omp.type.to:
+// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP20]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.to.else:
+// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP21]], 2
+// CHECK-NEXT:    br i1 [[TMP26]], label [[OMP_TYPE_FROM:%.*]], label [[OMP_TYPE_END]]
+// CHECK:       omp.type.from:
+// CHECK-NEXT:    [[TMP27:%.*]] = and i64 [[TMP20]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.end:
+// CHECK-NEXT:    [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP23]], [[OMP_TYPE_ALLOC]] ], [ [[TMP25]], [[OMP_TYPE_TO]] ], [ [[TMP27]], [[OMP_TYPE_FROM]] ], [ [[TMP20]], [[OMP_TYPE_TO_ELSE]] ]
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[A]], i64 4, i64 [[OMP_MAPTYPE]], ptr null)
+// CHECK-NEXT:    [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1
+// CHECK-NEXT:    [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]]
+// CHECK:       omp.arraymap.exit:
+// CHECK-NEXT:    [[OMP_ARRAYINIT_ISARRAY1:%.*]] = icmp sgt i64 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP28:%.*]] = and i64 [[TMP4]], 8
+// CHECK-NEXT:    [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP28]], 0
+// CHECK-NEXT:    [[TMP29:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY1]], [[DOTOMP_ARRAY__DEL__DELETE]]
+// CHECK-NEXT:    br i1 [[TMP29]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]]
+// CHECK:       .omp.array..del:
+// CHECK-NEXT:    [[TMP30:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:    [[TMP31:%.*]] = and i64 [[TMP4]], -4
+// CHECK-NEXT:    [[TMP32:%.*]] = or i64 [[TMP31]], 512
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP30]], i64 [[TMP32]], ptr [[TMP5]])
+// CHECK-NEXT:    br label [[OMP_DONE]]
+// CHECK:       omp.done:
+// CHECK-NEXT:    ret void
+//
diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_ast_dump.cpp b/clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp
similarity index 100%
rename from clang/test/OpenMP/target_map_nest_defalut_mapper_ast_dump.cpp
rename to clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp
diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp b/clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_codegen.cpp
similarity index 100%
rename from clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp
rename to clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_codegen.cpp
diff --git a/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp b/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp
index d545e98ef6c3..93695d1b388f 100644
--- a/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp
+++ b/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp
@@ -4,8 +4,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-x86_64-unknown-linux-gnu
 // RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
 
-// UNSUPPORTED: clang
-
 #include <cstdio>
 #include <cstdlib>
 
@@ -50,7 +48,7 @@ int main() {
   sa[1].h = N;
 
   printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1],
-         sa[1].f.b == &x[0] ? 1 : 0);
+         sa[1].f.b == &y[0] ? 1 : 0);
   // CHECK: 111 222 777 20.00000 1
 
   __intptr_t p = reinterpret_cast<__intptr_t>(&y[0]);
@@ -65,6 +63,6 @@ int main() {
     sa[1].f.b[1] = 40;
   }
   printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1],
-         sa[1].f.b == &x[0] ? 1 : 0);
+         sa[1].f.b == &y[0] ? 1 : 0);
   // CHECK: 333 222 777 40.00000 1
 }

From 574f77a1ee34461bc1f4a0823da6c960ff1c9655 Mon Sep 17 00:00:00 2001
From: Erich Keane <ekeane@nvidia.com>
Date: Wed, 11 Jun 2025 12:04:26 -0700
Subject: [PATCH 0034/1322] [OpenACC][CIR] Add parallelism determ. to all
 acc.loops (#143751)

PR #143720 adds a requirement to the ACC dialect that every acc.loop
must have a seq, independent, or auto attribute for the 'default'
device_type. The standard has rules for how this can be intuited:

orphan/parallel/parallel loop: independent
kernels/kernels loop: auto
serial/serial loop: seq, unless there is a gang/worker/vector, at which
point it should be 'auto'.

This patch implements all of this rule as a 'cleanup' step on the IR
generation for combined/loop operations. Note that the test impact is
much less since I inadvertently have my 'operation' terminating curley
matching the end curley from 'attribute' instead of the front of the
line, so I've added sufficient tests to ensure I captured the above.
---
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |  12 +++
 clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp   |   2 +
 .../lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp |  33 ++++++
 clang/test/CIR/CodeGenOpenACC/combined.cpp    |  69 ++++++++++--
 clang/test/CIR/CodeGenOpenACC/loop.cpp        | 101 ++++++++++++++++--
 .../mlir/Dialect/OpenACC/OpenACCOps.td        |   8 ++
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp       |  24 +++++
 7 files changed, 232 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index b08dd540e628..682d59d63faa 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -34,6 +34,12 @@ namespace {
 class ScalarExprEmitter;
 } // namespace
 
+namespace mlir {
+namespace acc {
+class LoopOp;
+} // namespace acc
+} // namespace mlir
+
 namespace clang::CIRGen {
 
 class CIRGenFunction : public CIRGenTypeCache {
@@ -1082,6 +1088,12 @@ private:
                           OpenACCDirectiveKind dirKind, SourceLocation dirLoc,
                           ArrayRef<const OpenACCClause *> clauses);
 
+  // The OpenACC LoopOp requires that we have auto, seq, or independent on all
+  // LoopOp operations for the 'none' device type case. This function checks if
+  // the LoopOp has one, else it updates it to have one.
+  void updateLoopOpParallelism(mlir::acc::LoopOp &op, bool isOrphan,
+                               OpenACCDirectiveKind dk);
+
 public:
   mlir::LogicalResult
   emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s);
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index 2aab9cecf93d..1feefa55eb27 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -102,6 +102,8 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct(
 
     emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses);
 
+    updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind);
+
     builder.create<TermOp>(end);
   }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp
index 24cd1d399de6..71f3ccb8e040 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp
@@ -22,6 +22,36 @@ using namespace clang::CIRGen;
 using namespace cir;
 using namespace mlir::acc;
 
+void CIRGenFunction::updateLoopOpParallelism(mlir::acc::LoopOp &op,
+                                             bool isOrphan,
+                                             OpenACCDirectiveKind dk) {
+  // Check that at least one of auto, independent, or seq is present
+  // for the device-independent default clauses.
+  if (op.hasParallelismFlag(mlir::acc::DeviceType::None))
+    return;
+
+  switch (dk) {
+  default:
+    llvm_unreachable("Invalid parent directive kind");
+  case OpenACCDirectiveKind::Invalid:
+  case OpenACCDirectiveKind::Parallel:
+  case OpenACCDirectiveKind::ParallelLoop:
+    op.addIndependent(builder.getContext(), {});
+    return;
+  case OpenACCDirectiveKind::Kernels:
+  case OpenACCDirectiveKind::KernelsLoop:
+    op.addAuto(builder.getContext(), {});
+    return;
+  case OpenACCDirectiveKind::Serial:
+  case OpenACCDirectiveKind::SerialLoop:
+    if (op.hasDefaultGangWorkerVector())
+      op.addAuto(builder.getContext(), {});
+    else
+      op.addSeq(builder.getContext(), {});
+    return;
+  };
+}
+
 mlir::LogicalResult
 CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
   mlir::Location start = getLoc(s.getSourceRange().getBegin());
@@ -90,6 +120,9 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
   emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(),
                      s.clauses());
 
+  updateLoopOpParallelism(op, s.isOrphanedLoopConstruct(),
+                          s.getParentComputeConstructKind());
+
   mlir::LogicalResult stmtRes = mlir::success();
   // Emit body.
   {
diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp
index 1f3c9f1a8d3f..5b83a9cb9189 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp
@@ -74,7 +74,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.serial combined(loop) {
   // CHECK: acc.loop combined(serial) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 #pragma acc kernels loop seq device_type(nvidia, radeon)
@@ -99,7 +99,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.serial combined(loop) {
   // CHECK: acc.loop combined(serial) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 #pragma acc kernels loop auto device_type(nvidia, radeon)
@@ -124,7 +124,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.serial combined(loop) {
   // CHECK: acc.loop combined(serial) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 #pragma acc kernels loop independent device_type(nvidia, radeon)
@@ -143,7 +143,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.parallel combined(loop) {
   // CHECK: acc.loop combined(parallel) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
+  // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 
@@ -154,7 +154,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.serial combined(loop) {
   // CHECK: acc.loop combined(serial) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]}
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 
@@ -165,7 +165,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.kernels combined(loop) {
   // CHECK: acc.loop combined(kernels) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>], collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
   // CHECK: acc.terminator
   // CHECK-NEXT: } loc
   #pragma acc parallel loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
@@ -175,7 +175,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.parallel combined(loop) {
   // CHECK: acc.loop combined(parallel) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 
@@ -1184,4 +1184,59 @@ extern "C" void acc_combined_data_clauses(int *arg1, int *arg2) {
   // CHECK-NEXT: } loc
   // CHECK-NEXT: acc.detach accPtr(%[[ATTACH2]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg2"}
   // CHECK-NEXT: acc.detach accPtr(%[[ATTACH1]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg1"}
+
+  // Checking the automatic-addition of parallelism clauses.
+#pragma acc parallel loop
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.parallel combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels loop
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.kernels combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc serial loop
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.serial combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial loop worker
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.serial combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(serial) worker {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial loop vector
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.serial combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(serial) vector {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial loop gang
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.serial combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(serial) gang {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
 }
diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp
index db94e2819b30..c0bf11e35395 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp
@@ -41,12 +41,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
 #pragma acc loop device_type(radeon) seq
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {seq = [#acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<radeon>]} loc
 #pragma acc loop seq device_type(nvidia, radeon)
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
@@ -67,12 +67,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
 #pragma acc loop device_type(radeon) independent
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>, #acc.device_type<none>]} loc
 #pragma acc loop independent device_type(nvidia, radeon)
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
@@ -93,12 +93,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
 #pragma acc loop device_type(radeon) auto
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
 #pragma acc loop auto device_type(nvidia, radeon)
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
@@ -116,7 +116,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
       for(unsigned K = 0; K < N; ++K);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
+  // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
 
   #pragma acc loop collapse(1) device_type(radeon) collapse (2)
   for(unsigned I = 0; I < N; ++I)
@@ -124,7 +124,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
       for(unsigned K = 0; K < N; ++K);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]}
 
   #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse (2)
   for(unsigned I = 0; I < N; ++I)
@@ -132,14 +132,14 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
       for(unsigned K = 0; K < N; ++K);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>], independent = [#acc.device_type<none>]}
   #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
   for(unsigned I = 0; I < N; ++I)
     for(unsigned J = 0; J < N; ++J)
       for(unsigned K = 0; K < N; ++K);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}
 
   #pragma acc loop tile(1, 2, 3)
   for(unsigned I = 0; I < N; ++I)
@@ -392,4 +392,85 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
   }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+  // Checking the automatic-addition of parallelism clauses.
+#pragma acc loop
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
+
+#pragma acc parallel
+  {
+    // CHECK-NEXT: acc.parallel {
+#pragma acc loop
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels
+  {
+    // CHECK-NEXT: acc.kernels {
+#pragma acc loop
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc serial
+  {
+    // CHECK-NEXT: acc.serial {
+#pragma acc loop
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial
+  {
+    // CHECK-NEXT: acc.serial {
+#pragma acc loop worker
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop worker {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial
+  {
+    // CHECK-NEXT: acc.serial {
+#pragma acc loop vector
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop vector {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial
+  {
+    // CHECK-NEXT: acc.serial {
+#pragma acc loop gang
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop gang {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
 }
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 083a18d80704..34312655115a 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2246,6 +2246,14 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     // device_types. This is for the case where there is no expression specified
     // in a 'gang'.
     void addEmptyGang(MLIRContext *, llvm::ArrayRef<DeviceType>);
+
+    // Return whether this LoopOp has an auto, seq, or independent for the
+    // specified device-type.
+    bool hasParallelismFlag(DeviceType);
+
+    // Return whether this LoopOp has a gang, worker, or vector applying to the
+    // 'default'/None device-type.
+    bool hasDefaultGangWorkerVector();
   }];
 
   let hasCustomAssemblyFormat = 1;
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index c72ec47be9f0..21e6b9d85f1a 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2839,6 +2839,30 @@ void acc::LoopOp::addEmptyGang(
                                                  effectiveDeviceTypes));
 }
 
+bool acc::LoopOp::hasParallelismFlag(DeviceType dt) {
+  auto hasDevice = [=](DeviceTypeAttr attr) -> bool {
+    return attr.getValue() == dt;
+  };
+  auto testFromArr = [=](ArrayAttr arr) -> bool {
+    return llvm::any_of(arr.getAsRange<DeviceTypeAttr>(), hasDevice);
+  };
+
+  if (ArrayAttr arr = getSeqAttr(); arr && testFromArr(arr))
+    return true;
+  if (ArrayAttr arr = getIndependentAttr(); arr && testFromArr(arr))
+    return true;
+  if (ArrayAttr arr = getAuto_Attr(); arr && testFromArr(arr))
+    return true;
+
+  return false;
+}
+
+bool acc::LoopOp::hasDefaultGangWorkerVector() {
+  return hasVector() || getVectorValue() || hasWorker() || getWorkerValue() ||
+         hasGang() || getGangValue(GangArgType::Num) ||
+         getGangValue(GangArgType::Dim) || getGangValue(GangArgType::Static);
+}
+
 void acc::LoopOp::addGangOperands(
     MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes,
     llvm::ArrayRef<GangArgType> argTypes, mlir::ValueRange values) {

From d5f68cb145059fc6d2944e1d17ef561e183ade83 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Wed, 11 Jun 2025 12:09:44 -0700
Subject: [PATCH 0035/1322] [bazel] Port
 fe7bf4b90b1a835418bddd2b2aa63b4977a9f6d2

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index f6a7cd7dea85..7bcb1d4ca883 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6881,8 +6881,8 @@ cc_library(
     deps = [
         ":SPIRVDialect",
         ":Support",
-        "//llvm:config",
         "//llvm:Support",
+        "//llvm:config",
     ],
 )
 
@@ -11249,7 +11249,7 @@ td_library(
 )
 
 gentbl_cc_library(
-    name = "TransformDialectEnumsIncGen",
+    name = "TransformAttrsIncGen",
     tbl_outs = {
         "include/mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc": [
             "-gen-enum-decls",
@@ -11257,6 +11257,12 @@ gentbl_cc_library(
         "include/mlir/Dialect/Transform/IR/TransformDialectEnums.cpp.inc": [
             "-gen-enum-defs",
         ],
+        "include/mlir/Dialect/Transform/IR/TransformAttrs.h.inc": [
+            "-gen-attrdef-decls",
+        ],
+        "include/mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc": [
+            "-gen-attrdef-defs",
+        ],
     },
     tblgen = ":mlir-tblgen",
     td_file = "include/mlir/Dialect/Transform/IR/TransformAttrs.td",
@@ -11382,7 +11388,7 @@ cc_library(
         ":Rewrite",
         ":SideEffectInterfaces",
         ":Support",
-        ":TransformDialectEnumsIncGen",
+        ":TransformAttrsIncGen",
         ":TransformDialectIncGen",
         ":TransformDialectInterfaces",
         ":TransformDialectUtils",

From 5dafe9dca867b90f20dcd71c620ad823aee4262b Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 11 Jun 2025 12:23:17 -0700
Subject: [PATCH 0036/1322] [libc] Reduce direct use of errno in src/stdlib and
 src/__support tests. (#143767)

* Get rid of libc_errno assignments in str_to_* __support tests, since
those API have been migrated to return error in a struct instead.
* Migrate tests for atof and to strto* functions from <stdlib.h> and for
strdup from <string.h> to use ErrnoCheckingTest harness.
---
 libc/test/src/__support/CMakeLists.txt        |  2 -
 .../test/src/__support/str_to_double_test.cpp |  1 -
 libc/test/src/__support/str_to_float_test.cpp |  1 -
 libc/test/src/__support/str_to_fp_test.h      |  2 -
 .../src/__support/str_to_integer_test.cpp     |  1 -
 libc/test/src/stdlib/CMakeLists.txt           |  5 ++
 libc/test/src/stdlib/StrtolTest.h             | 60 +------------------
 libc/test/src/stdlib/atof_test.cpp            |  9 ++-
 libc/test/src/stdlib/strtod_test.cpp          |  5 +-
 libc/test/src/stdlib/strtof_test.cpp          |  5 +-
 libc/test/src/stdlib/strtold_test.cpp         |  5 +-
 libc/test/src/string/CMakeLists.txt           |  1 +
 libc/test/src/string/strdup_test.cpp          | 13 ++--
 13 files changed, 24 insertions(+), 86 deletions(-)

diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index c1736c8fe59e..4fb0dae86e5c 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -127,7 +127,6 @@ add_libc_test(
     libc.src.__support.integer_literals
     libc.src.__support.str_to_float
     libc.src.__support.uint128
-    libc.src.errno.errno
 )
 
 
@@ -140,7 +139,6 @@ add_libc_test(
   DEPENDS
     libc.src.__support.integer_literals
     libc.src.__support.str_to_integer
-    libc.src.errno.errno
 )
 
 add_libc_test(
diff --git a/libc/test/src/__support/str_to_double_test.cpp b/libc/test/src/__support/str_to_double_test.cpp
index ccfa44f12d8e..dc503aa16f08 100644
--- a/libc/test/src/__support/str_to_double_test.cpp
+++ b/libc/test/src/__support/str_to_double_test.cpp
@@ -99,7 +99,6 @@ TEST(LlvmLibcStrToDblTest, SimpleDecimalConversionExtraTypes) {
   uint64_t double_output_mantissa = 0;
   uint32_t output_exp2 = 0;
 
-  LIBC_NAMESPACE::libc_errno = 0;
   auto double_result =
       internal::simple_decimal_conversion<double>("123456789012345678900");
 
diff --git a/libc/test/src/__support/str_to_float_test.cpp b/libc/test/src/__support/str_to_float_test.cpp
index 66f7db742eb4..03ae80fc2ee3 100644
--- a/libc/test/src/__support/str_to_float_test.cpp
+++ b/libc/test/src/__support/str_to_float_test.cpp
@@ -55,7 +55,6 @@ TEST(LlvmLibcStrToFltTest, SimpleDecimalConversionExtraTypes) {
   uint32_t float_output_mantissa = 0;
   uint32_t output_exp2 = 0;
 
-  LIBC_NAMESPACE::libc_errno = 0;
   auto float_result =
       internal::simple_decimal_conversion<float>("123456789012345678900");
   float_output_mantissa = float_result.num.mantissa;
diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h
index c7bc57b845fe..d349192f107c 100644
--- a/libc/test/src/__support/str_to_fp_test.h
+++ b/libc/test/src/__support/str_to_fp_test.h
@@ -10,7 +10,6 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
 #include "src/__support/uint128.h"
-#include "src/errno/libc_errno.h"
 
 #include "test/UnitTest/Test.h"
 
@@ -67,7 +66,6 @@ template <typename T> struct LlvmLibcStrToFloatTest : public testing::Test {
                                       const int expectedErrno = 0) {
     StorageType actual_output_mantissa = 0;
     uint32_t actual_output_exp2 = 0;
-    LIBC_NAMESPACE::libc_errno = 0;
 
     auto result = internal::simple_decimal_conversion<T>(numStart);
 
diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp
index 34b645b4b38c..1ec882b212b8 100644
--- a/libc/test/src/__support/str_to_integer_test.cpp
+++ b/libc/test/src/__support/str_to_integer_test.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 302971a078c1..45fd49b6d352 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -9,6 +9,7 @@ add_libc_test(
   DEPENDS
     libc.src.errno.errno
     libc.src.stdlib.atof
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_header_library(
@@ -64,6 +65,7 @@ add_fp_unittest(
     libc.src.errno.errno
     libc.src.stdlib.strtod
     libc.src.__support.FPUtil.fenv_impl
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_fp_unittest(
@@ -76,6 +78,7 @@ add_fp_unittest(
     libc.src.errno.errno
     libc.src.stdlib.strtof
     libc.src.__support.FPUtil.fenv_impl
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_header_library(
@@ -86,6 +89,7 @@ add_header_library(
     libc.src.__support.CPP.limits
     libc.src.__support.CPP.type_traits
     libc.src.errno.errno
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -133,6 +137,7 @@ add_libc_test(
     libc.src.errno.errno
     libc.src.__support.uint128
     libc.src.stdlib.strtold
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index ed302f14d03e..03f0a6539c78 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -10,7 +10,7 @@
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/properties/architectures.h"
-#include "src/errno/libc_errno.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include <stddef.h>
@@ -18,7 +18,7 @@
 using LIBC_NAMESPACE::cpp::is_signed_v;
 
 template <typename ReturnT>
-struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
+struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
   using FunctionT = ReturnT (*)(const char *, char **, int);
 
   static constexpr ReturnT T_MAX =
@@ -28,7 +28,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
 
   void InvalidBase(FunctionT func) {
     const char *ten = "10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(ten, nullptr, -1), ReturnT(0));
     ASSERT_ERRNO_EQ(EINVAL);
   }
@@ -38,23 +37,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
 
     // TODO: Look into collapsing these repeated segments.
     const char *ten = "10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(ten, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - ten, ptrdiff_t(2));
 
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(ten, nullptr, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
 
     const char *hundred = "100";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(hundred, &str_end, 10), ReturnT(100));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - hundred, ptrdiff_t(3));
 
     const char *big_number = "1234567890";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(big_number, &str_end, 10), ReturnT(1234567890));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - big_number, ptrdiff_t(10));
@@ -62,7 +57,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // This number is larger than 2^32, meaning that if long is only 32 bits
     // wide, strtol will return LONG_MAX.
     const char *bigger_number = "12345678900";
-    LIBC_NAMESPACE::libc_errno = 0;
     if constexpr (sizeof(ReturnT) < 8) {
       ASSERT_EQ(func(bigger_number, &str_end, 10), T_MAX);
       ASSERT_ERRNO_EQ(ERANGE);
@@ -73,14 +67,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     EXPECT_EQ(str_end - bigger_number, ptrdiff_t(11));
 
     const char *too_big_number = "123456789012345678901";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(too_big_number, &str_end, 10), T_MAX);
     ASSERT_ERRNO_EQ(ERANGE);
     EXPECT_EQ(str_end - too_big_number, ptrdiff_t(21));
 
     const char *long_number_range_test =
         "10000000000000000000000000000000000000000000000000";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(long_number_range_test, &str_end, 10), T_MAX);
     ASSERT_ERRNO_EQ(ERANGE);
     EXPECT_EQ(str_end - long_number_range_test, ptrdiff_t(50));
@@ -88,19 +80,16 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // For most negative numbers, the unsigned functions treat it the same as
     // casting a negative variable to an unsigned type.
     const char *negative = "-100";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(negative, &str_end, 10), ReturnT(-100));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - negative, ptrdiff_t(4));
 
     const char *big_negative_number = "-1234567890";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(big_negative_number, &str_end, 10), ReturnT(-1234567890));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - big_negative_number, ptrdiff_t(11));
 
     const char *too_big_negative_number = "-123456789012345678901";
-    LIBC_NAMESPACE::libc_errno = 0;
     // If the number is signed, it should return the smallest negative number
     // for the current type, but if it's unsigned it should max out and return
     // the largest positive number for the current type. From the standard:
@@ -118,73 +107,61 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     char *str_end = nullptr;
 
     const char *spaces_before = "     10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(spaces_before, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - spaces_before, ptrdiff_t(7));
 
     const char *spaces_after = "10      ";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(spaces_after, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - spaces_after, ptrdiff_t(2));
 
     const char *word_before = "word10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(word_before, &str_end, 10), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - word_before, ptrdiff_t(0));
 
     const char *word_after = "10word";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(word_after, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - word_after, ptrdiff_t(2));
 
     const char *two_numbers = "10 999";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(two_numbers, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - two_numbers, ptrdiff_t(2));
 
     const char *two_signs = "--10 999";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(two_signs, &str_end, 10), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - two_signs, ptrdiff_t(0));
 
     const char *sign_before = "+2=4";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(sign_before, &str_end, 10), ReturnT(2));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - sign_before, ptrdiff_t(2));
 
     const char *sign_after = "2+2=4";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(sign_after, &str_end, 10), ReturnT(2));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - sign_after, ptrdiff_t(1));
 
     const char *tab_before = "\t10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(tab_before, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - tab_before, ptrdiff_t(3));
 
     const char *all_together = "\t  -12345and+67890";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(all_together, &str_end, 10), ReturnT(-12345));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - all_together, ptrdiff_t(9));
 
     const char *just_spaces = "  ";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_spaces, &str_end, 10), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_spaces, ptrdiff_t(0));
 
     const char *just_space_and_sign = " +";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_space_and_sign, &str_end, 10), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_space_and_sign, ptrdiff_t(0));
@@ -203,12 +180,10 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
         small_string[0] = static_cast<char>(
             LIBC_NAMESPACE::internal::int_to_b36_char(first_digit));
         if (first_digit < base) {
-          LIBC_NAMESPACE::libc_errno = 0;
           ASSERT_EQ(func(small_string, nullptr, base),
                     static_cast<ReturnT>(first_digit));
           ASSERT_ERRNO_SUCCESS();
         } else {
-          LIBC_NAMESPACE::libc_errno = 0;
           ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0));
           ASSERT_ERRNO_SUCCESS();
         }
@@ -223,18 +198,15 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
           small_string[1] = static_cast<char>(
               LIBC_NAMESPACE::internal::int_to_b36_char(second_digit));
           if (first_digit < base && second_digit < base) {
-            LIBC_NAMESPACE::libc_errno = 0;
             ASSERT_EQ(
                 func(small_string, nullptr, base),
                 static_cast<ReturnT>(second_digit + (first_digit * base)));
             ASSERT_ERRNO_SUCCESS();
           } else if (first_digit < base) {
-            LIBC_NAMESPACE::libc_errno = 0;
             ASSERT_EQ(func(small_string, nullptr, base),
                       static_cast<ReturnT>(first_digit));
             ASSERT_ERRNO_SUCCESS();
           } else {
-            LIBC_NAMESPACE::libc_errno = 0;
             ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0));
             ASSERT_ERRNO_SUCCESS();
           }
@@ -255,14 +227,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
 
             if (first_digit < base && second_digit < base &&
                 third_digit < base) {
-              LIBC_NAMESPACE::libc_errno = 0;
               ASSERT_EQ(func(small_string, nullptr, base),
                         static_cast<ReturnT>(third_digit +
                                              (second_digit * base) +
                                              (first_digit * base * base)));
               ASSERT_ERRNO_SUCCESS();
             } else if (first_digit < base && second_digit < base) {
-              LIBC_NAMESPACE::libc_errno = 0;
               ASSERT_EQ(
                   func(small_string, nullptr, base),
                   static_cast<ReturnT>(second_digit + (first_digit * base)));
@@ -272,23 +242,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
               // The number is treated as a one digit hexadecimal.
               if (base == 16 && first_digit == 0 && second_digit == 33) {
                 if (third_digit < base) {
-                  LIBC_NAMESPACE::libc_errno = 0;
                   ASSERT_EQ(func(small_string, nullptr, base),
                             static_cast<ReturnT>(third_digit));
                   ASSERT_ERRNO_SUCCESS();
                 } else {
-                  LIBC_NAMESPACE::libc_errno = 0;
                   ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0));
                   ASSERT_ERRNO_SUCCESS();
                 }
               } else {
-                LIBC_NAMESPACE::libc_errno = 0;
                 ASSERT_EQ(func(small_string, nullptr, base),
                           static_cast<ReturnT>(first_digit));
                 ASSERT_ERRNO_SUCCESS();
               }
             } else {
-              LIBC_NAMESPACE::libc_errno = 0;
               ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0));
               ASSERT_ERRNO_SUCCESS();
             }
@@ -302,19 +268,16 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     char *str_end = nullptr;
 
     const char *no_prefix = "123abc";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(no_prefix, &str_end, 16), ReturnT(0x123abc));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - no_prefix, ptrdiff_t(6));
 
     const char *yes_prefix = "0x456def";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(yes_prefix, &str_end, 16), ReturnT(0x456def));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - yes_prefix, ptrdiff_t(8));
 
     const char *letter_after_prefix = "0xabc123";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(letter_after_prefix, &str_end, 16), ReturnT(0xabc123));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - letter_after_prefix, ptrdiff_t(8));
@@ -325,7 +288,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // Max size for unsigned 32 bit numbers
 
     const char *max_32_bit_value = "0xFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(max_32_bit_value, &str_end, 0),
               ((is_signed_v<ReturnT> && sizeof(ReturnT) == 4)
                    ? T_MAX
@@ -334,7 +296,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     EXPECT_EQ(str_end - max_32_bit_value, ptrdiff_t(10));
 
     const char *negative_max_32_bit_value = "-0xFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(negative_max_32_bit_value, &str_end, 0),
               ((is_signed_v<ReturnT> && sizeof(ReturnT) == 4)
                    ? T_MIN
@@ -345,13 +306,11 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // Max size for signed 32 bit numbers
 
     const char *max_31_bit_value = "0x7FFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(max_31_bit_value, &str_end, 0), ReturnT(0x7FFFFFFF));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - max_31_bit_value, ptrdiff_t(10));
 
     const char *negative_max_31_bit_value = "-0x7FFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(negative_max_31_bit_value, &str_end, 0),
               -ReturnT(0x7FFFFFFF));
     ASSERT_ERRNO_SUCCESS();
@@ -360,7 +319,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // Max size for unsigned 64 bit numbers
 
     const char *max_64_bit_value = "0xFFFFFFFFFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(max_64_bit_value, &str_end, 0),
               (is_signed_v<ReturnT> || sizeof(ReturnT) < 8
                    ? T_MAX
@@ -371,7 +329,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // See the end of CleanBase10Decode for an explanation of how this large
     // negative number can end up as T_MAX.
     const char *negative_max_64_bit_value = "-0xFFFFFFFFFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(
         func(negative_max_64_bit_value, &str_end, 0),
         (is_signed_v<ReturnT>
@@ -383,14 +340,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // Max size for signed 64 bit numbers
 
     const char *max_63_bit_value = "0x7FFFFFFFFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(max_63_bit_value, &str_end, 0),
               (sizeof(ReturnT) < 8 ? T_MAX : ReturnT(0x7FFFFFFFFFFFFFFF)));
     ASSERT_ERRNO_EQ(sizeof(ReturnT) < 8 ? ERANGE : 0);
     EXPECT_EQ(str_end - max_63_bit_value, ptrdiff_t(18));
 
     const char *negative_max_63_bit_value = "-0x7FFFFFFFFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(negative_max_63_bit_value, &str_end, 0),
               (sizeof(ReturnT) >= 8 ? -ReturnT(0x7FFFFFFFFFFFFFFF)
                                     : (is_signed_v<ReturnT> ? T_MIN : T_MAX)));
@@ -402,23 +357,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     char *str_end = nullptr;
 
     const char *just_prefix = "0x";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_prefix, &str_end, 16), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_prefix, ptrdiff_t(1));
 
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_prefix, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_prefix, ptrdiff_t(1));
 
     const char *prefix_with_x_after = "0xx";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(prefix_with_x_after, &str_end, 16), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - prefix_with_x_after, ptrdiff_t(1));
 
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(prefix_with_x_after, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - prefix_with_x_after, ptrdiff_t(1));
@@ -428,43 +379,36 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     char *str_end = nullptr;
 
     const char *base_ten = "12345";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(base_ten, &str_end, 0), ReturnT(12345));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - base_ten, ptrdiff_t(5));
 
     const char *base_sixteen_no_prefix = "123abc";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(base_sixteen_no_prefix, &str_end, 0), ReturnT(123));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - base_sixteen_no_prefix, ptrdiff_t(3));
 
     const char *base_sixteen_with_prefix = "0x456def";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(base_sixteen_with_prefix, &str_end, 0), ReturnT(0x456def));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - base_sixteen_with_prefix, ptrdiff_t(8));
 
     const char *base_eight_with_prefix = "012345";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(base_eight_with_prefix, &str_end, 0), ReturnT(012345));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - base_eight_with_prefix, ptrdiff_t(6));
 
     const char *just_zero = "0";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_zero, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_zero, ptrdiff_t(1));
 
     const char *just_zero_x = "0x";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_zero_x, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_zero_x, ptrdiff_t(1));
 
     const char *just_zero_eight = "08";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_zero_eight, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_zero_eight, ptrdiff_t(1));
diff --git a/libc/test/src/stdlib/atof_test.cpp b/libc/test/src/stdlib/atof_test.cpp
index 1e4259b792d7..92b904ecad94 100644
--- a/libc/test/src/stdlib/atof_test.cpp
+++ b/libc/test/src/stdlib/atof_test.cpp
@@ -7,29 +7,28 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
 #include "src/stdlib/atof.h"
 
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <stddef.h>
 
+using LlvmLibcAToFTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
 // This is just a simple test to make sure that this function works at all. It's
 // functionally identical to strtod so the bulk of the testing is there.
-TEST(LlvmLibcAToFTest, SimpleTest) {
+TEST_F(LlvmLibcAToFTest, SimpleTest) {
   LIBC_NAMESPACE::fputil::FPBits<double> expected_fp =
       LIBC_NAMESPACE::fputil::FPBits<double>(uint64_t(0x405ec00000000000));
 
-  LIBC_NAMESPACE::libc_errno = 0;
   EXPECT_THAT(LIBC_NAMESPACE::atof("123"),
               Succeeds<double>(expected_fp.get_val()));
 }
 
-TEST(LlvmLibcAToFTest, FailedParsingTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+TEST_F(LlvmLibcAToFTest, FailedParsingTest) {
   // atof does not flag errors.
   EXPECT_THAT(LIBC_NAMESPACE::atof("???"), Succeeds<double>(0.0));
 }
diff --git a/libc/test/src/stdlib/strtod_test.cpp b/libc/test/src/stdlib/strtod_test.cpp
index 92d14640e653..db3c1d73bd22 100644
--- a/libc/test/src/stdlib/strtod_test.cpp
+++ b/libc/test/src/stdlib/strtod_test.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
 #include "src/stdlib/strtod.h"
 
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/RoundingModeUtils.h"
 #include "test/UnitTest/Test.h"
@@ -22,7 +22,7 @@ using LIBC_NAMESPACE::fputil::testing::RoundingMode;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
-class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::Test,
+class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest,
                            ForceRoundingModeTest<RoundingMode::Nearest> {
 public:
   void run_test(const char *inputString, const ptrdiff_t expectedStrLen,
@@ -46,7 +46,6 @@ public:
     LIBC_NAMESPACE::fputil::FPBits<double> expected_fp =
         LIBC_NAMESPACE::fputil::FPBits<double>(expectedRawData);
 
-    LIBC_NAMESPACE::libc_errno = 0;
     double result = LIBC_NAMESPACE::strtod(inputString, &str_end);
     if (expectedErrno == 0)
       EXPECT_THAT(result, Succeeds<double>(expected_fp.get_val()));
diff --git a/libc/test/src/stdlib/strtof_test.cpp b/libc/test/src/stdlib/strtof_test.cpp
index 6a716c956291..6df1ddda93bf 100644
--- a/libc/test/src/stdlib/strtof_test.cpp
+++ b/libc/test/src/stdlib/strtof_test.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
 #include "src/stdlib/strtof.h"
 
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/RoundingModeUtils.h"
 #include "test/UnitTest/Test.h"
@@ -19,7 +19,7 @@
 using LIBC_NAMESPACE::fputil::testing::ForceRoundingModeTest;
 using LIBC_NAMESPACE::fputil::testing::RoundingMode;
 
-class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::Test,
+class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest,
                            ForceRoundingModeTest<RoundingMode::Nearest> {
 public:
   void run_test(const char *inputString, const ptrdiff_t expectedStrLen,
@@ -43,7 +43,6 @@ public:
     LIBC_NAMESPACE::fputil::FPBits<float> expected_fp =
         LIBC_NAMESPACE::fputil::FPBits<float>(expectedRawData);
 
-    LIBC_NAMESPACE::libc_errno = 0;
     float result = LIBC_NAMESPACE::strtof(inputString, &str_end);
 
     EXPECT_EQ(str_end - inputString, expectedStrLen);
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index b209c85b88e3..eb4056dc7ba6 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -8,9 +8,9 @@
 
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/uint128.h"
-#include "src/errno/libc_errno.h"
 #include "src/stdlib/strtold.h"
 
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include <stddef.h>
@@ -25,7 +25,7 @@
 #error "Unknown long double type"
 #endif
 
-class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::Test {
+class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
 #if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
   void run_test(const char *inputString, const ptrdiff_t expectedStrLen,
@@ -80,7 +80,6 @@ public:
         FPBits(static_cast<FPBits::StorageType>(expectedRawData));
     const int expected_errno = expectedErrno;
 
-    LIBC_NAMESPACE::libc_errno = 0;
     long double result = LIBC_NAMESPACE::strtold(inputString, &str_end);
 
     LIBC_NAMESPACE::fputil::FPBits<long double> actual_fp =
diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt
index a675373938e9..ced60750a45c 100644
--- a/libc/test/src/string/CMakeLists.txt
+++ b/libc/test/src/string/CMakeLists.txt
@@ -168,6 +168,7 @@ add_libc_test(
   DEPENDS
     libc.src.string.strdup
     libc.src.errno.errno
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 # FIXME: This is failing on the bot for some reason, disable for now.
diff --git a/libc/test/src/string/strdup_test.cpp b/libc/test/src/string/strdup_test.cpp
index 20b85c37637d..4b18fc7f1bde 100644
--- a/libc/test/src/string/strdup_test.cpp
+++ b/libc/test/src/string/strdup_test.cpp
@@ -6,14 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
 #include "src/string/strdup.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcStrDupTest, EmptyString) {
+using LlvmLibcStrDupTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcStrDupTest, EmptyString) {
   const char *empty = "";
 
-  LIBC_NAMESPACE::libc_errno = 0;
   char *result = LIBC_NAMESPACE::strdup(empty);
   ASSERT_ERRNO_SUCCESS();
 
@@ -23,10 +24,9 @@ TEST(LlvmLibcStrDupTest, EmptyString) {
   ::free(result);
 }
 
-TEST(LlvmLibcStrDupTest, AnyString) {
+TEST_F(LlvmLibcStrDupTest, AnyString) {
   const char *abc = "abc";
 
-  LIBC_NAMESPACE::libc_errno = 0;
   char *result = LIBC_NAMESPACE::strdup(abc);
   ASSERT_ERRNO_SUCCESS();
 
@@ -36,8 +36,7 @@ TEST(LlvmLibcStrDupTest, AnyString) {
   ::free(result);
 }
 
-TEST(LlvmLibcStrDupTest, NullPtr) {
-  LIBC_NAMESPACE::libc_errno = 0;
+TEST_F(LlvmLibcStrDupTest, NullPtr) {
   char *result = LIBC_NAMESPACE::strdup(nullptr);
   ASSERT_ERRNO_SUCCESS();
 

From 22fd11fe66a0d64f5ef359e21ae67a7d40936eaf Mon Sep 17 00:00:00 2001
From: Abhina Sree <Abhina.Sreeskantharajan@ibm.com>
Date: Wed, 11 Jun 2025 15:26:49 -0400
Subject: [PATCH 0037/1322] [SystemZ][z/OS] Refactor AutoConvert.h to remove
 large MVS guard (#143174)

This AutoConvert.h header frequently gets mislabeled as an unused
include because it is guarded by MVS internally and every usage is also
guarded. This refactors the change to remove this guard and instead make
these functions a noop on other non-z/OS platforms.
---
 llvm/include/llvm/Support/AutoConvert.h | 46 +++++++++++++++++++++++--
 llvm/lib/Support/AutoConvert.cpp        | 21 -----------
 llvm/lib/Support/InitLLVM.cpp           | 30 ++++++++++------
 llvm/lib/Support/MemoryBuffer.cpp       | 10 +++---
 llvm/lib/Support/raw_ostream.cpp        | 19 +++++-----
 5 files changed, 78 insertions(+), 48 deletions(-)

diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h
index 352493e9be25..56ad91425bcc 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -16,6 +16,7 @@
 
 #ifdef __MVS__
 #include <_Ccsid.h>
+#endif
 #ifdef __cplusplus
 #include "llvm/Support/ErrorOr.h"
 #include <system_error>
@@ -28,9 +29,11 @@
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
+
 int enablezOSAutoConversion(int FD);
 int disablezOSAutoConversion(int FD);
 int restorezOSStdHandleAutoConversion(int FD);
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
@@ -38,6 +41,46 @@ int restorezOSStdHandleAutoConversion(int FD);
 #ifdef __cplusplus
 namespace llvm {
 
+inline std::error_code disableAutoConversion(int FD) {
+#ifdef __MVS__
+  if (::disablezOSAutoConversion(FD) == -1)
+    return errnoAsErrorCode();
+#endif
+  return std::error_code();
+}
+
+inline std::error_code enableAutoConversion(int FD) {
+#ifdef __MVS__
+  if (::enablezOSAutoConversion(FD) == -1)
+    return errnoAsErrorCode();
+#endif
+  return std::error_code();
+}
+
+inline std::error_code restoreStdHandleAutoConversion(int FD) {
+#ifdef __MVS__
+  if (::restorezOSStdHandleAutoConversion(FD) == -1)
+    return errnoAsErrorCode();
+#endif
+  return std::error_code();
+}
+
+inline std::error_code setFileTag(int FD, int CCSID, bool Text) {
+#ifdef __MVS__
+  return setzOSFileTag(FD, CCSID, Text);
+#endif
+  return std::error_code();
+}
+
+inline ErrorOr<bool> needConversion(const char *FileName, const int FD = -1) {
+#ifdef __MVS__
+  return needzOSConversion(FileName, FD);
+#endif
+  return false;
+}
+
+#ifdef __MVS__
+
 /** \brief Disable the z/OS enhanced ASCII auto-conversion for the file
  * descriptor.
  */
@@ -63,9 +106,8 @@ ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
  */
 ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
 
+#endif /* __MVS__*/
 } /* namespace llvm */
 #endif /* __cplusplus */
 
-#endif /* __MVS__ */
-
 #endif /* LLVM_SUPPORT_AUTOCONVERT_H */
diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp
index f7918548df1d..c69e9a8f97c0 100644
--- a/llvm/lib/Support/AutoConvert.cpp
+++ b/llvm/lib/Support/AutoConvert.cpp
@@ -83,27 +83,6 @@ int enablezOSAutoConversion(int FD) {
   return fcntl(FD, F_CONTROL_CVT, &Query);
 }
 
-std::error_code llvm::disablezOSAutoConversion(int FD) {
-  if (::disablezOSAutoConversion(FD) == -1)
-    return errnoAsErrorCode();
-
-  return std::error_code();
-}
-
-std::error_code llvm::enablezOSAutoConversion(int FD) {
-  if (::enablezOSAutoConversion(FD) == -1)
-    return errnoAsErrorCode();
-
-  return std::error_code();
-}
-
-std::error_code llvm::restorezOSStdHandleAutoConversion(int FD) {
-  if (::restorezOSStdHandleAutoConversion(FD) == -1)
-    return errnoAsErrorCode();
-
-  return std::error_code();
-}
-
 std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) {
   assert((!Text || (CCSID != FT_UNTAGGED && CCSID != FT_BINARY)) &&
          "FT_UNTAGGED and FT_BINARY are not allowed for text files");
diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp
index 50f7a43cc34a..b8fbfd21c4f2 100644
--- a/llvm/lib/Support/InitLLVM.cpp
+++ b/llvm/lib/Support/InitLLVM.cpp
@@ -18,18 +18,28 @@
 #include "llvm/Support/Windows/WindowsSupport.h"
 #endif
 
-#ifdef __MVS__
+#if defined(HAVE_UNISTD_H)
 #include <unistd.h>
+#else
+#ifndef STDIN_FILENO
+#define STDIN_FILENO 0
+#endif
+#ifndef STDOUT_FILENO
+#define STDOUT_FILENO 1
+#endif
+#ifndef STDERR_FILENO
+#define STDERR_FILENO 2
+#endif
+#endif
 
 void CleanupStdHandles(void *Cookie) {
   llvm::raw_ostream *Outs = &llvm::outs(), *Errs = &llvm::errs();
   Outs->flush();
   Errs->flush();
-  llvm::restorezOSStdHandleAutoConversion(STDIN_FILENO);
-  llvm::restorezOSStdHandleAutoConversion(STDOUT_FILENO);
-  llvm::restorezOSStdHandleAutoConversion(STDERR_FILENO);
+  llvm::restoreStdHandleAutoConversion(STDIN_FILENO);
+  llvm::restoreStdHandleAutoConversion(STDOUT_FILENO);
+  llvm::restoreStdHandleAutoConversion(STDERR_FILENO);
 }
-#endif
 
 using namespace llvm;
 using namespace llvm::sys;
@@ -41,10 +51,10 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv,
   assert(!Initialized && "InitLLVM was already initialized!");
   Initialized = true;
 #endif
-#ifdef __MVS__
+
   // Bring stdin/stdout/stderr into a known state.
   sys::AddSignalHandler(CleanupStdHandles, nullptr);
-#endif
+
   if (InstallPipeSignalExitHandler)
     // The pipe signal handler must be installed before any other handlers are
     // registered. This is because the Unix \ref RegisterHandlers function does
@@ -68,8 +78,8 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv,
 
   // If turning on conversion for stderr fails then the error message
   // may be garbled. There is no solution to this problem.
-  ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDERR_FILENO)));
-  ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDOUT_FILENO)));
+  ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDERR_FILENO)));
+  ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDOUT_FILENO)));
 #endif
 
 #ifdef _WIN32
@@ -97,8 +107,6 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv,
 }
 
 InitLLVM::~InitLLVM() {
-#ifdef __MVS__
   CleanupStdHandles(nullptr);
-#endif
   llvm_shutdown();
 }
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index e2044bcc4e4f..601f11f6d23c 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/Alignment.h"
+#include "llvm/Support/AutoConvert.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -34,9 +35,6 @@
 #include <io.h>
 #endif
 
-#ifdef __MVS__
-#include "llvm/Support/AutoConvert.h"
-#endif
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -508,15 +506,15 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
   }
 
 #ifdef __MVS__
-  ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD);
-  if (std::error_code EC = NeedConversion.getError())
+  ErrorOr<bool> NeedsConversion = needConversion(Filename.str().c_str(), FD);
+  if (std::error_code EC = NeedsConversion.getError())
     return EC;
   // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we
   // cannot trust the file size and we create the memory buffer by copying
   // off the stream.
   // Note: This only works with the assumption of reading a full file (i.e,
   // Offset == 0 and MapSize == FileSize). Reading a file slice does not work.
-  if (Offset == 0 && MapSize == FileSize && *NeedConversion)
+  if (*NeedsConversion && Offset == 0 && MapSize == FileSize)
     return getMemoryBufferForStream(FD, Filename);
 #endif
 
diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp
index 16631a63d192..07b99896543b 100644
--- a/llvm/lib/Support/raw_ostream.cpp
+++ b/llvm/lib/Support/raw_ostream.cpp
@@ -894,21 +894,24 @@ void raw_fd_ostream::anchor() {}
 raw_fd_ostream &llvm::outs() {
   // Set buffer settings to model stdout behavior.
   std::error_code EC;
-#ifdef __MVS__
-  EC = enablezOSAutoConversion(STDOUT_FILENO);
-  assert(!EC);
-#endif
+
+  // On z/OS we need to enable auto conversion
+  static std::error_code EC1 = enableAutoConversion(STDOUT_FILENO);
+  assert(!EC1);
+  (void)EC1;
+
   static raw_fd_ostream S("-", EC, sys::fs::OF_None);
   assert(!EC);
   return S;
 }
 
 raw_fd_ostream &llvm::errs() {
-  // Set standard error to be unbuffered.
-#ifdef __MVS__
-  std::error_code EC = enablezOSAutoConversion(STDERR_FILENO);
+  // On z/OS we need to enable auto conversion
+  static std::error_code EC = enableAutoConversion(STDERR_FILENO);
   assert(!EC);
-#endif
+  (void)EC;
+
+  // Set standard error to be unbuffered.
   static raw_fd_ostream S(STDERR_FILENO, false, true);
   return S;
 }

From 34a1b8ce2518d7868c080519a05892cd3b197192 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <razvan.lupusoru@gmail.com>
Date: Wed, 11 Jun 2025 12:37:08 -0700
Subject: [PATCH 0038/1322] [acc] acc.loop verifier now requires parallelism
 determination flag (#143720)

The OpenACC specification for `acc loop` describe that a loop's
parallelism determination mode is either auto, independent, or seq. The
rules are as follows.
- As per OpenACC 3.3 standard section 2.9.6 independent clause: A loop
construct with no auto or seq clause is treated as if it has the
independent clause when it is an orphaned loop construct or its parent
compute construct is a parallel construct.
- As per OpenACC 3.3 standard section 2.9.7 auto clause: When the parent
compute construct is a kernels construct, a loop construct with no
independent or seq clause is treated as if it has the auto clause.
- Additionally, loops marked with gang, worker, or vector are not
guaranteed to be parallel. Specifically noted in 2.9.7 auto clause: If
not, or if it is unable to make a determination, it must treat the auto
clause as if it is a seq clause, and it must ignore any gang, worker, or
vector clauses on the loop construct.

The verifier for `acc.loop` was updated to enforce this marking because
the context in which a loop appears is not trivially determined once IR
transformations begin. For example, orphaned loops are implicitly
`independent`, but after inlining into an `acc.kernels` region they
would be implicitly considered `auto`. Thus now the verifier requires
that a frontend specifically generates acc dialect with this marking
since it knows the context.
---
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp      | 35 +++++++++--
 mlir/test/Dialect/OpenACC/canonicalize.mlir  |  4 +-
 mlir/test/Dialect/OpenACC/invalid.mlir       | 28 ++++-----
 mlir/test/Dialect/OpenACC/legalize-data.mlir | 16 ++---
 mlir/test/Dialect/OpenACC/ops.mlir           | 66 ++++++++++----------
 5 files changed, 86 insertions(+), 63 deletions(-)

diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 21e6b9d85f1a..0dfead98b7e7 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2461,10 +2461,34 @@ LogicalResult acc::LoopOp::verify() {
   if (hasDuplicateDeviceTypes(getAuto_(), deviceTypes) ||
       hasDuplicateDeviceTypes(getIndependent(), deviceTypes) ||
       hasDuplicateDeviceTypes(getSeq(), deviceTypes)) {
-    return emitError() << "only one of \"" << acc::LoopOp::getAutoAttrStrName()
-                       << "\", " << getIndependentAttrName() << ", "
-                       << getSeqAttrName()
-                       << " can be present at the same time";
+    return emitError() << "only one of auto, independent, seq can be present "
+                          "at the same time";
+  }
+
+  // Check that at least one of auto, independent, or seq is present
+  // for the device-independent default clauses.
+  auto hasDeviceNone = [](mlir::acc::DeviceTypeAttr attr) -> bool {
+    return attr.getValue() == mlir::acc::DeviceType::None;
+  };
+  bool hasDefaultSeq =
+      getSeqAttr()
+          ? llvm::any_of(getSeqAttr().getAsRange<mlir::acc::DeviceTypeAttr>(),
+                         hasDeviceNone)
+          : false;
+  bool hasDefaultIndependent =
+      getIndependentAttr()
+          ? llvm::any_of(
+                getIndependentAttr().getAsRange<mlir::acc::DeviceTypeAttr>(),
+                hasDeviceNone)
+          : false;
+  bool hasDefaultAuto =
+      getAuto_Attr()
+          ? llvm::any_of(getAuto_Attr().getAsRange<mlir::acc::DeviceTypeAttr>(),
+                         hasDeviceNone)
+          : false;
+  if (!hasDefaultSeq && !hasDefaultIndependent && !hasDefaultAuto) {
+    return emitError()
+           << "at least one of auto, independent, seq must be present";
   }
 
   // Gang, worker and vector are incompatible with seq.
@@ -2482,8 +2506,7 @@ LogicalResult acc::LoopOp::verify() {
                        deviceTypeAttr.getValue()) ||
           getGangValue(mlir::acc::GangArgType::Static,
                        deviceTypeAttr.getValue()))
-        return emitError()
-               << "gang, worker or vector cannot appear with the seq attr";
+        return emitError() << "gang, worker or vector cannot appear with seq";
     }
   }
 
diff --git a/mlir/test/Dialect/OpenACC/canonicalize.mlir b/mlir/test/Dialect/OpenACC/canonicalize.mlir
index e43a27f6b9e8..fdc8e6b5cae6 100644
--- a/mlir/test/Dialect/OpenACC/canonicalize.mlir
+++ b/mlir/test/Dialect/OpenACC/canonicalize.mlir
@@ -116,10 +116,10 @@ func.func @testhostdataop(%a: memref<f32>, %ifCond: i1) -> () {
   acc.host_data dataOperands(%0 : memref<f32>) if(%false) {
     acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
       acc.yield
-    } attributes { inclusiveUpperbound = array<i1: true> }
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
     acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
       acc.yield
-    } attributes { inclusiveUpperbound = array<i1: true> }
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
     acc.terminator
   }
   return
diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir
index aadf18927321..8f6e961a0616 100644
--- a/mlir/test/Dialect/OpenACC/invalid.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid.mlir
@@ -2,7 +2,7 @@
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -12,7 +12,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -22,7 +22,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -32,7 +32,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -42,7 +42,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -52,7 +52,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -62,7 +62,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -72,7 +72,7 @@ acc.loop {
 
 // expected-error@+1 {{expected non-empty body.}}
 acc.loop {
-}
+} attributes {independent = [#acc.device_type<none>]}
 
 // -----
 
@@ -99,7 +99,7 @@ acc.loop {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{only one of "auto", "independent", "seq" can be present at the same time}}
+// expected-error@+1 {{only one of auto, independent, seq can be present at the same time}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   acc.yield
 } attributes {auto_ = [#acc.device_type<none>], seq = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
@@ -168,7 +168,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32){
 // expected-error@+1 {{'acc.init' op cannot be nested in a compute operation}}
   acc.init
   acc.yield
-} attributes {inclusiveUpperbound = array<i1: true>}
+} attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
 // -----
 
@@ -186,7 +186,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 // expected-error@+1 {{'acc.shutdown' op cannot be nested in a compute operation}}
   acc.shutdown
   acc.yield
-} attributes {inclusiveUpperbound = array<i1: true>}
+} attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
 // -----
 
@@ -198,7 +198,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
     acc.shutdown
   }) : () -> ()
   acc.yield
-} attributes {inclusiveUpperbound = array<i1: true>}
+} attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
 // -----
 
@@ -797,7 +797,7 @@ func.func @acc_loop_container() {
         scf.yield
     }
     acc.yield
-  } attributes { collapse = [2], collapseDeviceType = [#acc.device_type<none>] }
+  } attributes { collapse = [2], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
   return
 }
 
@@ -816,6 +816,6 @@ func.func @acc_loop_container() {
       scf.yield
     }
     acc.yield
-  } attributes { collapse = [3], collapseDeviceType = [#acc.device_type<none>] }
+  } attributes { collapse = [3], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
   return
 }
diff --git a/mlir/test/Dialect/OpenACC/legalize-data.mlir b/mlir/test/Dialect/OpenACC/legalize-data.mlir
index 28ef6761a6ef..40604dcc736d 100644
--- a/mlir/test/Dialect/OpenACC/legalize-data.mlir
+++ b/mlir/test/Dialect/OpenACC/legalize-data.mlir
@@ -96,7 +96,7 @@ func.func @test(%a: memref<10xf32>) {
     acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
       %ci = memref.load %a[%i] : memref<10xf32>
       acc.yield
-    }
+    } attributes {independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -109,7 +109,7 @@ func.func @test(%a: memref<10xf32>) {
 // CHECK:   acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index) {
 // DEVICE:    %{{.*}} = memref.load %[[CREATE:.*]][%[[I]]] : memref<10xf32>
 // CHECK:     acc.yield
-// CHECK:   }
+// CHECK:   } attributes {independent = [#acc.device_type<none>]}
 // CHECK:   acc.yield
 // CHECK: }
 
@@ -134,7 +134,7 @@ func.func @test(%a: memref<10xf32>) {
     acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
       %ci = memref.load %a[%i] : memref<10xf32>
       acc.yield
-    }
+    } attributes {independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -147,7 +147,7 @@ func.func @test(%a: memref<10xf32>) {
 // CHECK:   acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index) {
 // DEVICE:    %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32>
 // CHECK:     acc.yield
-// CHECK:   }
+// CHECK:   } attributes {independent = [#acc.device_type<none>]}
 // CHECK:   acc.yield
 // CHECK: }
 
@@ -172,7 +172,7 @@ func.func @test(%a: memref<10xf32>) {
     acc.loop private(@privatization_memref_10_f32 -> %p1 : memref<10xf32>) control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
       %ci = memref.load %a[%i] : memref<10xf32>
       acc.yield
-    }
+    } attributes {independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -185,7 +185,7 @@ func.func @test(%a: memref<10xf32>) {
 // CHECK:   acc.loop private(@privatization_memref_10_f32 -> %[[PRIVATE]] : memref<10xf32>) control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index) {
 // DEVICE:    %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32>
 // CHECK:     acc.yield
-// CHECK:   }
+// CHECK:   } attributes {independent = [#acc.device_type<none>]}
 // CHECK:   acc.yield
 // CHECK: }
 
@@ -210,7 +210,7 @@ func.func @test(%a: memref<10xf32>) {
     acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
       %ci = memref.load %a[%i] : memref<10xf32>
       acc.yield
-    }
+    } attributes {seq = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -223,7 +223,7 @@ func.func @test(%a: memref<10xf32>) {
 // CHECK:   acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index) {
 // DEVICE:    %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32>
 // CHECK:     acc.yield
-// CHECK:   }
+// CHECK:   } attributes {seq = [#acc.device_type<none>]}
 // CHECK:   acc.yield
 // CHECK: }
 
diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir
index 550f295f074a..97278f869534 100644
--- a/mlir/test/Dialect/OpenACC/ops.mlir
+++ b/mlir/test/Dialect/OpenACC/ops.mlir
@@ -19,7 +19,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x
       %co = arith.addf %cij, %p : f32
       memref.store %co, %C[%arg3, %arg4] : memref<10x10xf32>
       acc.yield
-    } attributes { collapse = [3], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>}
+    } attributes { collapse = [3], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>, independent = [#acc.device_type<none>]}
     acc.yield
   }
 
@@ -40,7 +40,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x
 //  CHECK-NEXT:       %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
 //  CHECK-NEXT:       memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
 //  CHECK-NEXT:       acc.yield
-//  CHECK-NEXT:     } attributes {collapse = [3], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>}
+//  CHECK-NEXT:     } attributes {collapse = [3], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>, independent = [#acc.device_type<none>]}
 //  CHECK-NEXT:     acc.yield
 //  CHECK-NEXT:   }
 //  CHECK-NEXT:   return %{{.*}} : memref<10x10xf32>
@@ -129,7 +129,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x
           %tmp = arith.addf %axy, %bxy : f32
           memref.store %tmp, %c[%y] : memref<10xf32>
           acc.yield
-        } attributes {inclusiveUpperbound = array<i1: true>}
+        } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
         acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
           // for i = 0 to 10 step 1
@@ -139,9 +139,9 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x
           %z = arith.addf %ci, %dx : f32
           memref.store %z, %d[%x] : memref<10xf32>
           acc.yield
-        } attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<nvidia>]}
+        } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>]}
         acc.yield
-      } attributes {inclusiveUpperbound = array<i1: true>}
+      } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
       acc.yield
     }
     acc.terminator
@@ -166,16 +166,16 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x
 // CHECK-NEXT:           %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
 // CHECK-NEXT:           memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
 // CHECK-NEXT:           acc.yield
-// CHECK-NEXT:         } attributes {inclusiveUpperbound = array<i1: true>}
+// CHECK-NEXT:         } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 // CHECK-NEXT:         acc.loop control(%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) {
 // CHECK-NEXT:           %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32>
 // CHECK-NEXT:           %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32>
 // CHECK-NEXT:           %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
 // CHECK-NEXT:           memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
 // CHECK-NEXT:           acc.yield
-// CHECK-NEXT:         } attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<nvidia>]}
+// CHECK-NEXT:         } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>]}
 // CHECK-NEXT:         acc.yield
-// CHECK-NEXT:       } attributes {inclusiveUpperbound = array<i1: true>}
+// CHECK-NEXT:       } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 // CHECK-NEXT:       acc.yield
 // CHECK-NEXT:     }
 // CHECK-NEXT:     acc.terminator
@@ -196,72 +196,72 @@ func.func @testloopop(%a : memref<10xf32>) -> () {
   acc.loop gang vector worker control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({num=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({static=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop worker(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop worker(%i32Value: i32) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop worker(%idxValue: index) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop vector(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop vector(%i32Value: i32) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop vector(%idxValue: index) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({num=%i64Value: i64}) worker vector control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({num=%i64Value: i64, static=%i64Value: i64}) worker(%i64Value: i64) vector(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({num=%i32Value: i32, static=%idxValue: index}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop tile({%i64Value : i64, %i64Value : i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop tile({%i32Value : i32, %i32Value : i32}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({static=%i64Value: i64, num=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({dim=%i64Value : i64, static=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   %b = acc.cache varPtr(%a : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32>
   acc.loop cache(%b : memref<10xf32>) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   return
 }
 
@@ -271,7 +271,7 @@ func.func @testloopop(%a : memref<10xf32>) -> () {
 // CHECK:      acc.loop
 // CHECK-NEXT:   "test.openacc_dummy_op"() : () -> ()
 // CHECK-NEXT:   acc.yield
-// CHECK-NEXT: attributes {inclusiveUpperbound = array<i1: true>}
+// CHECK-NEXT: attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 // CHECK:      acc.loop gang({num=[[I64VALUE]] : i64})
 // CHECK-NEXT:   "test.openacc_dummy_op"() : () -> ()
 // CHECK-NEXT:   acc.yield
@@ -343,7 +343,7 @@ func.func @acc_loop_multiple_block() {
       cf.br ^bb1(%22 : index)
     ^bb3:
       acc.yield
-    } attributes {inclusiveUpperbound = array<i1: true>}
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -1477,7 +1477,7 @@ func.func @acc_reduc_test(%a : i64) -> () {
   acc.parallel reduction(@reduction_add_i64 -> %a : i64) {
     acc.loop reduction(@reduction_add_i64 -> %a : i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
       acc.yield
-    } attributes { inclusiveUpperbound = array<i1: true> }
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -1869,21 +1869,21 @@ func.func @acc_combined() {
   acc.parallel combined(loop) {
     acc.loop combined(parallel) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
       acc.yield
-    }
+    } attributes {independent = [#acc.device_type<none>]}
     acc.terminator
   }
 
   acc.kernels combined(loop) {
     acc.loop combined(kernels) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
       acc.yield
-    }
+    } attributes {auto_ = [#acc.device_type<none>]}
     acc.terminator
   }
 
   acc.serial combined(loop) {
     acc.loop combined(serial) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
       acc.yield
-    }
+    } attributes {seq = [#acc.device_type<none>]}
     acc.terminator
   }
 
@@ -1949,7 +1949,7 @@ func.func @acc_loop_container() {
       scf.yield
     }
     acc.yield
-  }
+  } attributes {independent = [#acc.device_type<none>]}
   return
 }
 
@@ -1971,7 +1971,7 @@ func.func @acc_loop_container() {
       scf.yield
     }
     acc.yield
-  } attributes { collapse = [2], collapseDeviceType = [#acc.device_type<none>] }
+  } attributes { collapse = [2], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
   return
 }
 

From 02161c635fd70e0214bd8b8320a80992c50ec325 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Wed, 11 Jun 2025 12:44:51 -0700
Subject: [PATCH 0039/1322] [NVPTX] Misc table-gen cleanup (NFC) (#142877)

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |  194 +-
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      | 2488 ++++-------------
 .../Target/NVPTX/NVPTXReplaceImageHandles.cpp |  840 +++---
 3 files changed, 1056 insertions(+), 2466 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b646d39194c7..9ca4e8d20650 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -160,7 +160,6 @@ def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
 def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
 
 def True : Predicate<"true">;
-def False : Predicate<"false">;
 
 class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
 class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
@@ -257,6 +256,11 @@ def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
 //                         "prmt.b32${mode}">;
 //         ---> "prmt.b32${mode} \t$d, $a, $b, $c;"
 //
+//   * BasicFlagsNVPTXInst<(outs Int64Regs:$state),
+//                         (ins ADDR:$addr),
+//                         "mbarrier.arrive.b64">;
+//         ---> "mbarrier.arrive.b64 \t$state, [$addr];"
+//
 class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmstr,
                           list<dag> pattern = []>
   : NVPTXInst<
@@ -274,7 +278,11 @@ class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmst
             !if(!or(!empty(ins_dag), !empty(outs_dag)), "", ", "),
             !interleave(
               !foreach(i, !range(!size(ins_dag)),
-                "$" # !getdagname(ins_dag, i)),
+                 !if(!eq(!cast<string>(!getdagarg<DAGOperand>(ins_dag, i)), "ADDR"),
+                    "[$" # !getdagname(ins_dag, i) # "]",
+                    "$" # !getdagname(ins_dag, i)
+                 )
+                ),
               ", "))),
         ";"),
       pattern>;
@@ -956,31 +964,17 @@ def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
 def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
 
 // Matchers for signed, unsigned mul.wide ISD nodes.
-def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
-          (MULWIDES32 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)),
-          (MULWIDES32Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
-          (MULWIDEU32 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)),
-          (MULWIDEU32Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
+let Predicates = [doMulWide] in {
+  def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>;
+  def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>;
+  def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>;
+  def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)), (MULWIDEU32Imm $a, imm:$b)>;
 
-def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
-          (MULWIDES64 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)),
-          (MULWIDES64Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
-          (MULWIDEU64 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)),
-          (MULWIDEU64Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
+  def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), (MULWIDES64 $a, $b)>;
+  def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)), (MULWIDES64Imm $a, imm:$b)>;
+  def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), (MULWIDEU64 $a, $b)>;
+  def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>;
+}
 
 // Predicates used for converting some patterns to mul.wide.
 def SInt32Const : PatLeaf<(imm), [{
@@ -1106,18 +1100,12 @@ defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>;
 defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>;
 }
 
-def INEG16 :
-  BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
-            "neg.s16",
-            [(set i16:$dst, (ineg i16:$src))]>;
-def INEG32 :
-  BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
-            "neg.s32",
-            [(set i32:$dst, (ineg i32:$src))]>;
-def INEG64 :
-  BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
-            "neg.s64",
-            [(set i64:$dst, (ineg i64:$src))]>;
+foreach t = [I16RT, I32RT, I64RT] in {
+  def NEG_S # t.Size :
+    BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src),
+              "neg.s" # t.Size,
+              [(set t.Ty:$dst, (ineg t.Ty:$src))]>;
+}
 
 //-----------------------------------
 // Floating Point Arithmetic
@@ -1538,7 +1526,7 @@ def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;
 
 def SDTPRMT :
   SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
-                       SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>;
+                       SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
 def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
 
 multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
@@ -1961,7 +1949,7 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
   // f16 -> pred
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
             (SETP_f16rr $a, $b, ModeFTZ)>,
-        Requires<[useFP16Math,doF32FTZ]>;
+        Requires<[useFP16Math, doF32FTZ]>;
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
             (SETP_f16rr $a, $b, Mode)>,
         Requires<[useFP16Math]>;
@@ -1969,7 +1957,7 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
   // bf16 -> pred
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
             (SETP_bf16rr $a, $b, ModeFTZ)>,
-        Requires<[hasBF16Math,doF32FTZ]>;
+        Requires<[hasBF16Math, doF32FTZ]>;
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
             (SETP_bf16rr $a, $b, Mode)>,
         Requires<[hasBF16Math]>;
@@ -2497,24 +2485,20 @@ def : Pat<(f16 (uint_to_fp i32:$a)), (CVT_f16_u32 $a, CvtRN)>;
 def : Pat<(f16 (uint_to_fp i64:$a)), (CVT_f16_u64 $a, CvtRN)>;
 
 // sint -> bf16
-def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>, 
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
+let Predicates = [hasPTX<78>, hasSM<90>] in {
+  def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>;
+  def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>;
+  def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>;
+  def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>;
+}
 
 // uint -> bf16
-def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
+let Predicates = [hasPTX<78>, hasSM<90>] in {
+  def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>;
+  def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>;
+  def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>;
+  def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>;
+}
 
 // sint -> f32
 def : Pat<(f32 (sint_to_fp  i1:$a)), (CVT_f32_s32 (SELP_b32ii -1, 0, $a), CvtRN)>;
@@ -2565,27 +2549,25 @@ def : Pat<(i16 (fp_to_uint bf16:$a)), (CVT_u16_bf16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint bf16:$a)), (CVT_u32_bf16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint bf16:$a)), (CVT_u64_bf16 $a, CvtRZI)>;
 // f32 -> sint
-def : Pat<(i1 (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
+let Predicates = [doF32FTZ] in {
+  def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>;
+}
+def : Pat<(i1  (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI)>;
 
 // f32 -> uint
+let Predicates = [doF32FTZ] in {
+  def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>;
+}
 def : Pat<(i1  (fp_to_uint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>, 
-      Requires<[doF32FTZ]>;
 def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI)>;
 
 // f64 -> sint
@@ -2707,28 +2689,24 @@ let hasSideEffects = false in {
 
   // PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the
   // unused high/low part.
-  def I32toI16H_Sink  : NVPTXInst<(outs Int16Regs:$high),
-                             (ins Int32Regs:$s),
-                             "mov.b32 \t{{_, $high}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
-  def I32toI16L_Sink  : NVPTXInst<(outs Int16Regs:$low),
-                             (ins Int32Regs:$s),
-                             "mov.b32 \t{{$low, _}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
-  def I64toI32H_Sink  : NVPTXInst<(outs Int32Regs:$high),
-                             (ins Int64Regs:$s),
-                             "mov.b64 \t{{_, $high}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
-  def I64toI32L_Sink  : NVPTXInst<(outs Int32Regs:$low),
-                             (ins Int64Regs:$s),
-                             "mov.b64 \t{{$low, _}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
+  let Predicates = [hasPTX<71>] in {
+    def I32toI16H_Sink  : NVPTXInst<(outs Int16Regs:$high), (ins Int32Regs:$s),
+                              "mov.b32 \t{{_, $high}}, $s;", []>;
+    def I32toI16L_Sink  : NVPTXInst<(outs Int16Regs:$low), (ins Int32Regs:$s),
+                              "mov.b32 \t{{$low, _}}, $s;", []>;
+    def I64toI32H_Sink  : NVPTXInst<(outs Int32Regs:$high), (ins Int64Regs:$s),
+                              "mov.b64 \t{{_, $high}}, $s;", []>;
+    def I64toI32L_Sink  : NVPTXInst<(outs Int32Regs:$low), (ins Int64Regs:$s),
+                              "mov.b64 \t{{$low, _}}, $s;", []>;
+  }
 }
 
-def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
+let Predicates = [hasPTX<71>] in {
+  def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>;
+  def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>;
+  def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>;
+  def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>;
+}
 
 // Fall back to the old way if we don't have PTX 7.1.
 def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H $s)>;
@@ -3061,29 +3039,19 @@ def stacksave :
   SDNode<"NVPTXISD::STACKSAVE", SDTIntLeaf,
          [SDNPHasChain, SDNPSideEffect]>;
 
-def STACKRESTORE_32 :
-  BasicNVPTXInst<(outs), (ins Int32Regs:$ptr),
-            "stackrestore.u32",
-            [(stackrestore i32:$ptr)]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
+let Predicates = [hasPTX<73>, hasSM<52>] in {
+  foreach t = [I32RT, I64RT] in {
+    def STACKRESTORE_ # t.Size :
+      BasicNVPTXInst<(outs), (ins t.RC:$ptr),
+                "stackrestore.u" # t.Size,
+              [(stackrestore t.Ty:$ptr)]>;
 
-def STACKSAVE_32 :
-  BasicNVPTXInst<(outs Int32Regs:$dst), (ins),
-            "stacksave.u32",
-            [(set i32:$dst, (i32 stacksave))]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
-def STACKRESTORE_64 :
-  BasicNVPTXInst<(outs), (ins Int64Regs:$ptr),
-            "stackrestore.u64",
-            [(stackrestore i64:$ptr)]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
-def STACKSAVE_64 :
-  BasicNVPTXInst<(outs Int64Regs:$dst), (ins),
-            "stacksave.u64",
-            [(set i64:$dst, (i64 stacksave))]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
+    def STACKSAVE_ # t.Size :
+      BasicNVPTXInst<(outs t.RC:$dst), (ins),
+                "stacksave.u" # t.Size,
+              [(set t.Ty:$dst, (t.Ty stacksave))]>;
+  }
+}
 
 include "NVPTXIntrinsics.td"
 
@@ -3124,7 +3092,7 @@ def : Pat <
 ////////////////////////////////////////////////////////////////////////////////
 
 class NVPTXFenceInst<string scope, string sem, Predicate ptx>:
-    NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>,
+    BasicNVPTXInst<(outs), (ins), "fence."#sem#"."#scope>,
     Requires<[ptx, hasSM<70>]>;
 
 foreach scope = ["sys", "gpu", "cluster", "cta"] in {
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f918160001ba..83d7defe6d9a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -52,7 +52,7 @@ class PTX {
 def ptx : PTX;
 
 // Generates list of n sequential register names.
-// E.g. RegNames<3,"r">.ret -> ["r0", "r1", "r2" ]
+// E.g. RegNames<3, "r">.ret -> ["r0", "r1", "r2" ]
 class RegSeq<int n, string prefix> {
   list<string> ret = !if(n, !listconcat(RegSeq<!sub(n, 1), prefix>.ret,
                                         [prefix # !sub(n, 1)]),
@@ -137,7 +137,7 @@ defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive
 
 class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
                           list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
-        NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>,
+        BasicNVPTXInst<(outs), (ins), "barrier.cluster."# variant, [(Intr)]>,
         Requires<Preds>;
 
 def barrier_cluster_arrive:
@@ -400,13 +400,9 @@ def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_SYS :
 //-----------------------------------
 
 multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
-  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
-            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
-            [(Intrin i32:$addr)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
-            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
-            [(Intrin i64:$addr)]>,
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+            "cp.async.mbarrier.arrive" # NoInc # AddrSpace # ".b64",
+            [(Intrin addr:$addr)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
 
@@ -420,30 +416,19 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
   CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
 
 multiclass CP_ASYNC_SHARED_GLOBAL_I<string cc, string cpsize, Intrinsic Intrin, Intrinsic IntrinS> {
-  def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
-            !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
-            [(Intrin i32:$dst, i32:$src)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
-            !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
-            [(Intrin i64:$dst, i64:$src)]>,
+  def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src),
+            "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ";",
+            [(Intrin addr:$dst, addr:$src)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
+
   // Variant with src_size parameter
-  def _32s : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i32:$dst, i32:$src, i32:$src_size)]>,
+  def _s : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$src_size),
+             "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;",
+             [(IntrinS addr:$dst, addr:$src, i32:$src_size)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
-  def _32si: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i32:$dst, i32:$src, imm:$src_size)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64s : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i64:$dst, i64:$src, i32:$src_size)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64si: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i64:$dst, i64:$src, imm:$src_size)]>,
+  def _si: NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, i32imm:$src_size),
+             "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;",
+             [(IntrinS addr:$dst, addr:$src, imm:$src_size)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
 
@@ -513,14 +498,14 @@ class CpAsyncBulkStr<bit mc, bit ch, bit mask = 0> {
 }
 
 multiclass CP_ASYNC_BULK_S2G_INTR<bit has_ch> {
-  def NAME : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
+  def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
       !if(has_ch,
           CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;",
           CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;"),
       [(int_nvvm_cp_async_bulk_shared_cta_to_global addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
       Requires<[hasPTX<80>, hasSM<90>]>;
 
-  def NAME # _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask),
+  def _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask),
       !if(has_ch,
           CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;",
           CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;"),
@@ -533,7 +518,7 @@ defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<has_ch = 1>;
 multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
   defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster;
 
-  def NAME : NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
       (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
            Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
       !if(has_ch,
@@ -542,7 +527,7 @@ multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
       [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>,
       Requires<[hasPTX<80>, hasSM<90>]>;
 
-  def NAME # _MC : NVPTXInst<(outs),
+  def _MC : NVPTXInst<(outs),
       (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
            Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
       !if(has_ch,
@@ -561,7 +546,7 @@ def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs),
   Requires<[hasPTX<80>, hasSM<90>]>;
 
 multiclass CP_ASYNC_BULK_PREFETCH_INTR<bit has_ch> {
-  def NAME : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
+  def "" : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
       !if(has_ch,
           "cp.async.bulk.prefetch.L2.global.L2::cache_hint" # " [$src], $size, $ch;",
           "cp.async.bulk.prefetch.L2.global" # " [$src], $size;"),
@@ -609,19 +594,19 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode>
   defvar asm_str = !if(!eq(mode, "im2col"),
     !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
 
-  def NAME: NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
             !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag),
             !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _MC: NVPTXInst<(outs),
+  def _MC : NVPTXInst<(outs),
                   !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)),
                   !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _CH: NVPTXInst<(outs),
+  def _CH : NVPTXInst<(outs),
                   !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
                   !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _MC_CH: NVPTXInst<(outs),
+  def _MC_CH : NVPTXInst<(outs),
                      !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)),
                      !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;"), []>,
                      Requires<[hasPTX<80>, hasSM<90>]>;
@@ -661,11 +646,11 @@ multiclass CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, bit shared32, string mode> {
   defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";
   defvar rc = !if(shared32, Int32Regs, Int64Regs);
 
-  def NAME: NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
             !con((ins rc:$src, Int64Regs:$tmap), dims_dag),
             !strconcat(S2G_STRINGS<dim, mode, 0>.inst_name, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _CH: NVPTXInst<(outs),
+  def _CH : NVPTXInst<(outs),
                   !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch)),
                   !strconcat(S2G_STRINGS<dim, mode, 1>.inst_name, asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
@@ -685,11 +670,11 @@ multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, bit shared32, string mode>
   defvar prefix = "cp.reduce.async.bulk.tensor" # "." # dim # "d" # ".global.shared::cta";
   defvar suffix = "." # mode # ".bulk_group";
 
-  def NAME: NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
             !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)),
             !strconcat(prefix, "${red_op}", suffix, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _CH: NVPTXInst<(outs),
+  def _CH : NVPTXInst<(outs),
                   !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch, TMAReductionFlags:$red_op)),
                   !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
@@ -735,11 +720,11 @@ multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
   defvar asm_str = !if(!eq(mode, "im2col"),
     !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
 
-  def NAME: NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
             !con((ins Int64Regs:$tmap), dims_dag, im2col_dag),
             !strconcat(PREFETCH_STRINGS<dim, mode, 0>.inst_name, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _CH: NVPTXInst<(outs),
+  def _CH : NVPTXInst<(outs),
                   !con((ins Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
                   !strconcat(PREFETCH_STRINGS<dim, mode, 1>.inst_name, asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
@@ -755,10 +740,10 @@ foreach dim = [1, 2, 3, 4, 5] in {
 //Prefetch and Prefetchu 
 
 class PREFETCH_INTRS<string InstName> :
-          NVPTXInst<(outs), (ins Int64Regs:$addr),
-          InstName # " [$addr];",
+          BasicNVPTXInst<(outs), (ins ADDR:$addr),
+          InstName,
           [(!cast<Intrinsic>(!strconcat("int_nvvm_",
-          !subst(".", "_", InstName))) i64:$addr)]>,
+          !subst(".", "_", InstName))) addr:$addr)]>,
           Requires<[hasPTX<80>, hasSM<90>]>;
    
 
@@ -769,36 +754,39 @@ def PREFETCH_LOCAL_L1  : PREFETCH_INTRS<"prefetch.local.L1">;
 def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">;
 def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">;
 
-def PREFETCH_GLOBAL_L2_EVICT_NORMAL : NVPTXInst<(outs), (ins Int64Regs:$addr),
-                                      "prefetch.global.L2::evict_normal" # " [$addr];",
-                                      [(!cast<Intrinsic>("int_nvvm_prefetch_global_L2_evict_normal") i64:$addr)]>,
+def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+                                      "prefetch.global.L2::evict_normal",
+                                      [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>,
                                       Requires<[hasPTX<80>, hasSM<90>]>;
 
-def PREFETCH_GLOBAL_L2_EVICT_LAST   : NVPTXInst<(outs), (ins Int64Regs:$addr),
-                                      "prefetch.global.L2::evict_last" # " [$addr];",
-                                      [(!cast<Intrinsic>("int_nvvm_prefetch_global_L2_evict_last") i64:$addr)]>,
+def PREFETCH_GLOBAL_L2_EVICT_LAST   : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+                                      "prefetch.global.L2::evict_last",
+                                      [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>,
                                       Requires<[hasPTX<80>, hasSM<90>]>;
 
 
 def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
 
 //Applypriority intrinsics
-class APPLYPRIORITY_L2_INTRS<string addr> :
-          NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
-          StrJoin<".", ["applypriority", addr , "L2::evict_normal"]>.ret # " [$addr], $size;",
-          [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_applypriority", addr , "L2_evict_normal"]>.ret)
-          i64:$addr, i64:$size)]>,
+class APPLYPRIORITY_L2_INTRS<string addrspace> :
+          BasicNVPTXInst<(outs), (ins ADDR:$addr, Int64Regs:$size),
+          StrJoin<".", ["applypriority", addrspace , "L2::evict_normal"]>.ret,
+          [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_applypriority", addrspace , "L2_evict_normal"]>.ret)
+          addr:$addr, i64:$size)]>,
           Requires<[hasPTX<74>, hasSM<80>]>;
 
 def APPLYPRIORITY_L2_EVICT_NORMAL        : APPLYPRIORITY_L2_INTRS<"">;
 def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">;
 
 //Discard Intrinsics
-class DISCARD_L2_INTRS<string Addr> :
-          NVPTXInst<(outs), (ins Int64Regs:$addr),
-          StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], 128;",
-          [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret)
-          i64:$addr, (i64 128))]>,
+
+def discard_size_imm : TImmLeaf<i64, [{ return Imm == 128; }]>;
+
+class DISCARD_L2_INTRS<string addrspace> :
+          BasicNVPTXInst<(outs), (ins ADDR:$addr, i64imm:$size),
+          StrJoin<".", ["discard", addrspace , "L2"]>.ret,
+          [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", addrspace , "L2"]>.ret)
+          addr:$addr, discard_size_imm:$size)]>,
           Requires<[hasPTX<74>, hasSM<80>]>;
 
 def DISCARD_L2        : DISCARD_L2_INTRS<"">;
@@ -809,8 +797,8 @@ def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">;
 //-----------------------------------
 
 multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs), (ins ADDR:$addr, Int32Regs:$count),
-           "mbarrier.init" # AddrSpace # ".b64 [$addr], $count;",
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr, Int32Regs:$count),
+           "mbarrier.init" # AddrSpace # ".b64",
     [(Intrin addr:$addr, i32:$count)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -820,8 +808,8 @@ defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
                                           int_nvvm_mbarrier_init_shared>;
 
 multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs), (ins ADDR:$addr),
-           "mbarrier.inval" # AddrSpace # ".b64 [$addr];",
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+           "mbarrier.inval" # AddrSpace # ".b64",
     [(Intrin addr:$addr)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -831,8 +819,8 @@ defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
                                             int_nvvm_mbarrier_inval_shared>;
 
 multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
-           "mbarrier.arrive" # AddrSpace # ".b64 $state, [$addr];",
+  def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
+           "mbarrier.arrive" # AddrSpace # ".b64",
     [(set i64:$state, (Intrin addr:$addr))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -842,9 +830,9 @@ defm MBARRIER_ARRIVE_SHARED :
   MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>;
 
 multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int64Regs:$state),
+  def "" : BasicNVPTXInst<(outs Int64Regs:$state),
            (ins ADDR:$addr, Int32Regs:$count),
-           "mbarrier.arrive.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;",
+           "mbarrier.arrive.noComplete" # AddrSpace # ".b64",
     [(set i64:$state, (Intrin addr:$addr, i32:$count))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -855,8 +843,8 @@ defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
   MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>;
 
 multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
-           "mbarrier.arrive_drop" # AddrSpace # ".b64 $state, [$addr];",
+  def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
+           "mbarrier.arrive_drop" # AddrSpace # ".b64",
            [(set i64:$state, (Intrin addr:$addr))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -867,9 +855,9 @@ defm MBARRIER_ARRIVE_DROP_SHARED :
   MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>;
 
 multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int64Regs:$state),
+  def "" : BasicNVPTXInst<(outs Int64Regs:$state),
            (ins ADDR:$addr, Int32Regs:$count),
-           "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;",
+           "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64",
            [(set i64:$state, (Intrin addr:$addr, i32:$count))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -881,8 +869,8 @@ defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
                        int_nvvm_mbarrier_arrive_drop_noComplete_shared>;
 
 multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int1Regs:$res), (ins ADDR:$addr, Int64Regs:$state),
-           "mbarrier.test_wait" # AddrSpace # ".b64 $res, [$addr], $state;",
+  def "" : BasicNVPTXInst<(outs Int1Regs:$res), (ins ADDR:$addr, Int64Regs:$state),
+           "mbarrier.test_wait" # AddrSpace # ".b64",
            [(set i1:$res, (Intrin addr:$addr, i64:$state))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -1790,93 +1778,74 @@ def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b),
 def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b),
           (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>;
 
-def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn Int32Regs:$a),
+def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a),
           (CVT_e4m3x2_f16x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu Int32Regs:$a),
+def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a),
           (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>;
-def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn Int32Regs:$a),
+def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a),
           (CVT_e5m2x2_f16x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu Int32Regs:$a),
+def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a),
           (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>;
 
-def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn Int16Regs:$a),
+def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a),
           (CVT_f16x2_e4m3x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu Int16Regs:$a),
+def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a),
           (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>;
-def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a),
+def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a),
           (CVT_f16x2_e5m2x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a),
+def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a),
           (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
 
-def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b),
-          (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b),
-          (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b),
-          (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b),
-          (CVT_e3m2x2_f32_sf $a, $b, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
+let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in {
+  def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b),
+            (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>;
+  def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b),
+            (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>;
+  def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b),
+            (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>;
+  def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b),
+            (CVT_e3m2x2_f32_sf $a, $b, CvtRN_RELU)>;
 
-def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a),
-          (CVT_f16x2_e2m3x2 $a, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a),
-          (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a),
-          (CVT_f16x2_e3m2x2 $a, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a),
-          (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-      
-def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b),
-          (CVT_e2m1x2_f32_sf $a, $b, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b),
-          (CVT_e2m1x2_f32_sf $a, $b, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-      
-def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn Int16Regs:$a),
-          (CVT_f16x2_e2m1x2 $a, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu Int16Regs:$a),
-          (CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
+  def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a),
+            (CVT_f16x2_e2m3x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a),
+            (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>;
+  def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a),
+            (CVT_f16x2_e3m2x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a),
+            (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>;
 
-def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b),
-          (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b),
-          (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b),
-          (CVT_ue8m0x2_f32 $a, $b, CvtRP)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b),
-          (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-          
-def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz Int32Regs:$a),
-          (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite Int32Regs:$a),
-          (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp Int32Regs:$a),
-          (CVT_ue8m0x2_bf16x2 $a, CvtRP)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite Int32Regs:$a),
-          (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-          
-def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a),
-          (CVT_bf16x2_ue8m0x2 $a)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
+  def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b),
+            (CVT_e2m1x2_f32_sf $a, $b, CvtRN)>;
+  def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b),
+            (CVT_e2m1x2_f32_sf $a, $b, CvtRN_RELU)>;
+
+  def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn i16:$a),
+            (CVT_f16x2_e2m1x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu i16:$a),
+            (CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>;
+
+  def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b),
+            (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>;
+  def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b),
+            (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>;
+  def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b),
+            (CVT_ue8m0x2_f32 $a, $b, CvtRP)>;
+  def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b),
+            (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>;
+
+  def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz v2bf16:$a),
+            (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>;
+  def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite v2bf16:$a),
+            (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>;
+  def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp v2bf16:$a),
+            (CVT_ue8m0x2_bf16x2 $a, CvtRP)>;
+  def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite v2bf16:$a),
+            (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>;
+
+  def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a),
+            (CVT_bf16x2_ue8m0x2 $a)>;
+}
 
 //
 // FNS
@@ -1920,14 +1889,14 @@ class ATOMIC_GENERIC_CHK <dag frag>
 
 multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
                       SDPatternOperator op, list<Predicate> preds> {
-  defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;";
+  defvar asm_str = "atom" # sem_str # as_str # "." # op_str;
   let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
-    def r : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b),
+    def r : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b))]>,
     Requires<preds>;
     if t.SupportsImm then
-      def i : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b),
+      def i : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b),
         asm_str,
         [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b)))]>,
       Requires<preds>;
@@ -1937,27 +1906,27 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
 // has 3 operands
 multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
                       SDPatternOperator op, list<Predicate> preds> {
-  defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+  defvar asm_str = "atom" # sem_str # as_str # "." # op_str;
   let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
-    def rr : NVPTXInst<(outs t.RC:$dst),
+    def rr : BasicNVPTXInst<(outs t.RC:$dst),
       (ins ADDR:$addr, t.RC:$b, t.RC:$c),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>,
     Requires<preds>;
 
-    def ir : NVPTXInst<(outs t.RC:$dst),
+    def ir : BasicNVPTXInst<(outs t.RC:$dst),
       (ins ADDR:$addr, t.Imm:$b, t.RC:$c),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>,
     Requires<preds>;
 
-    def ri : NVPTXInst<(outs t.RC:$dst),
+    def ri : BasicNVPTXInst<(outs t.RC:$dst),
       (ins ADDR:$addr, t.RC:$b, t.Imm:$c),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>,
     Requires<preds>;
 
-    def ii : NVPTXInst<(outs t.RC:$dst),
+    def ii : BasicNVPTXInst<(outs t.RC:$dst),
       (ins ADDR:$addr, t.Imm:$b, t.Imm:$c),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>,
@@ -2100,7 +2069,7 @@ multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr,
     // For now we only need variants for generic space pointers.
     foreach space = ["gen"] in {
       defm _#scope#space : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, scope, space,
-                         t, !listconcat(Preds,[hasAtomScope])>;
+                         t, !listconcat(Preds, [hasAtomScope])>;
     }
   }
 }
@@ -4454,1956 +4423,616 @@ defm SULD_3D_V4I32_ZERO : SULD_3D_V4<"suld.b.3d.v4.b32.zero", Int32Regs>;
 //-----------------------------------
 
 let IsSurfTexQuery = true in {
-def TXQ_CHANNEL_ORDER_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.channel_order.b32 \t$d, [$a];",
-              []>;
-def TXQ_CHANNEL_ORDER_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.channel_order.b32 \t$d, [$a];",
-              []>;
-def TXQ_CHANNEL_DATA_TYPE_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.channel_data_type.b32 \t$d, [$a];",
-              []>;
-def TXQ_CHANNEL_DATA_TYPE_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.channel_data_type.b32 \t$d, [$a];",
-              []>;
-def TXQ_WIDTH_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.width.b32 \t$d, [$a];",
-              []>;
-def TXQ_WIDTH_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.width.b32 \t$d, [$a];",
-              []>;
-def TXQ_HEIGHT_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.height.b32 \t$d, [$a];",
-              []>;
-def TXQ_HEIGHT_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.height.b32 \t$d, [$a];",
-              []>;
-def TXQ_DEPTH_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.depth.b32 \t$d, [$a];",
-              []>;
-def TXQ_DEPTH_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.depth.b32 \t$d, [$a];",
-              []>;
-def TXQ_ARRAY_SIZE_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.array_size.b32 \t$d, [$a];",
-              []>;
-def TXQ_ARRAY_SIZE_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.array_size.b32 \t$d, [$a];",
-              []>;
-def TXQ_NUM_SAMPLES_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.num_samples.b32 \t$d, [$a];",
-              []>;
-def TXQ_NUM_SAMPLES_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.num_samples.b32 \t$d, [$a];",
-              []>;
-def TXQ_NUM_MIPMAP_LEVELS_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.num_mipmap_levels.b32 \t$d, [$a];",
-              []>;
-def TXQ_NUM_MIPMAP_LEVELS_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.num_mipmap_levels.b32 \t$d, [$a];",
-              []>;
+  foreach query = ["channel_order", "channel_data_type", "width", "height", 
+                   "depth", "array_size", "num_samples", "num_mipmap_levels"] in {
+    def TXQ_ # !toupper(query) # _R
+      : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                  "txq." # query # ".b32 \t$d, [$a];",
+                  [(set i32:$d, (!cast<Intrinsic>("int_nvvm_txq_" # query) i64:$a))]>;
+    def TXQ_ # !toupper(query) # _I
+      : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+                  "txq." # query # ".b32 \t$d, [$a];",
+                  []>;
+  }
 }
 
-def : Pat<(int_nvvm_txq_channel_order i64:$a),
-          (TXQ_CHANNEL_ORDER_R $a)>;
-def : Pat<(int_nvvm_txq_channel_data_type i64:$a),
-          (TXQ_CHANNEL_DATA_TYPE_R $a)>;
-def : Pat<(int_nvvm_txq_width i64:$a),
-          (TXQ_WIDTH_R $a)>;
-def : Pat<(int_nvvm_txq_height i64:$a),
-          (TXQ_HEIGHT_R $a)>;
-def : Pat<(int_nvvm_txq_depth i64:$a),
-          (TXQ_DEPTH_R $a)>;
-def : Pat<(int_nvvm_txq_array_size i64:$a),
-          (TXQ_ARRAY_SIZE_R $a)>;
-def : Pat<(int_nvvm_txq_num_samples i64:$a),
-          (TXQ_NUM_SAMPLES_R $a)>;
-def : Pat<(int_nvvm_txq_num_mipmap_levels i64:$a),
-          (TXQ_NUM_MIPMAP_LEVELS_R $a)>;
-
-
 //-----------------------------------
 // Surface Query Intrinsics
 //-----------------------------------
 
 let IsSurfTexQuery = true in {
-def SUQ_CHANNEL_ORDER_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.channel_order.b32 \t$d, [$a];",
-              []>;
-def SUQ_CHANNEL_ORDER_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.channel_order.b32 \t$d, [$a];",
-              []>;
-def SUQ_CHANNEL_DATA_TYPE_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.channel_data_type.b32 \t$d, [$a];",
-              []>;
-def SUQ_CHANNEL_DATA_TYPE_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.channel_data_type.b32 \t$d, [$a];",
-              []>;
-def SUQ_WIDTH_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.width.b32 \t$d, [$a];",
-              []>;
-def SUQ_WIDTH_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.width.b32 \t$d, [$a];",
-              []>;
-def SUQ_HEIGHT_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.height.b32 \t$d, [$a];",
-              []>;
-def SUQ_HEIGHT_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.height.b32 \t$d, [$a];",
-              []>;
-def SUQ_DEPTH_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.depth.b32 \t$d, [$a];",
-              []>;
-def SUQ_DEPTH_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.depth.b32 \t$d, [$a];",
-              []>;
-def SUQ_ARRAY_SIZE_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.array_size.b32 \t$d, [$a];",
-              []>;
-def SUQ_ARRAY_SIZE_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.array_size.b32 \t$d, [$a];",
-              []>;
+  foreach query = ["channel_order", "channel_data_type", "width", "height", "depth", "array_size"] in {
+    def SUQ_ # !toupper(query) # _R
+      : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                  "suq." # query # ".b32 \t$d, [$a];",
+                  [(set i32:$d, (!cast<Intrinsic>("int_nvvm_suq_" # query) i64:$a))]>;
+    def SUQ_ # !toupper(query) # _I
+      : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+                  "suq." # query # ".b32 \t$d, [$a];",
+                  []>;
+  }
 }
 
-def : Pat<(int_nvvm_suq_channel_order i64:$a),
-          (SUQ_CHANNEL_ORDER_R $a)>;
-def : Pat<(int_nvvm_suq_channel_data_type i64:$a),
-          (SUQ_CHANNEL_DATA_TYPE_R $a)>;
-def : Pat<(int_nvvm_suq_width i64:$a),
-          (SUQ_WIDTH_R $a)>;
-def : Pat<(int_nvvm_suq_height i64:$a),
-          (SUQ_HEIGHT_R $a)>;
-def : Pat<(int_nvvm_suq_depth i64:$a),
-          (SUQ_DEPTH_R $a)>;
-def : Pat<(int_nvvm_suq_array_size i64:$a),
-          (SUQ_ARRAY_SIZE_R $a)>;
-
-
 //===- Handle Query -------------------------------------------------------===//
 
 // TODO: These intrinsics are not yet finalized, pending PTX ISA design work
 def ISTYPEP_SAMPLER
-  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
-              "istypep.samplerref \t$d, $a;",
+  : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.samplerref",
               [(set i1:$d, (int_nvvm_istypep_sampler i64:$a))]>;
 def ISTYPEP_SURFACE
-  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
-              "istypep.surfref \t$d, $a;",
+  : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.surfref",
               [(set i1:$d, (int_nvvm_istypep_surface i64:$a))]>;
 def ISTYPEP_TEXTURE
-  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
-              "istypep.texref \t$d, $a;",
+  : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.texref",
               [(set i1:$d, (int_nvvm_istypep_texture i64:$a))]>;
 
 //===- Surface Stores -----------------------------------------------------===//
 
 let IsSust = true in {
 
-class SUST_1D_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, intype:$r)),
-                inst # " \t[$s, \\{$x\\}], \\{$r\\};",
-                []>;
+                inst # " \t[$s, \\{$x\\}], \\{$r\\};", pat>;
 multiclass SUST_1D<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+
+  def _R : SUST_1D_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, intype:$r)]>;
+  def _I : SUST_1D_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_B8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>;
-defm SUST_B_1D_B16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>;
-defm SUST_B_1D_B32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>;
-defm SUST_B_1D_B64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>;
+defm SUST_B_1D_I8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>;
+defm SUST_B_1D_I16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>;
+defm SUST_B_1D_I32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>;
+defm SUST_B_1D_I64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>;
 
-defm SUST_B_1D_B8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>;
-defm SUST_B_1D_B16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>;
-defm SUST_B_1D_B32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>;
-defm SUST_B_1D_B64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>;
+defm SUST_B_1D_I8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>;
+defm SUST_B_1D_I16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>;
+defm SUST_B_1D_I32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>;
+defm SUST_B_1D_I64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>;
 
-defm SUST_B_1D_B8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>;
-defm SUST_B_1D_B16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>;
-defm SUST_B_1D_B32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>;
-defm SUST_B_1D_B64_ZERO : SUST_1D<"sust.b.1d.b64.zero", Int64Regs>;
+defm SUST_B_1D_I8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>;
+defm SUST_B_1D_I16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>;
+defm SUST_B_1D_I32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>;
+defm SUST_B_1D_I64_ZERO : SUST_1D<"sust.b.1d.b64.zero", Int64Regs>;
 
-defm SUST_P_1D_B8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>;
-defm SUST_P_1D_B16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>;
-defm SUST_P_1D_B32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>;
+defm SUST_P_1D_I8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>;
+defm SUST_P_1D_I16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>;
+defm SUST_P_1D_I32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>;
 
-class SUST_1D_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$x\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, intype:$r, intype:$g)]>;
+  def _I : SUST_1D_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_V2B8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_1D_V2B16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_1D_V2B32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_1D_V2B64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>;
+// int_nvvm_sust_b_1d_v2i8_clamp
 
-defm SUST_B_1D_V2B8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>;
-defm SUST_B_1D_V2B16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>;
-defm SUST_B_1D_V2B32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>;
-defm SUST_B_1D_V2B64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>;
+defm SUST_B_1D_V2I8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_1D_V2I16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_1D_V2I32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_1D_V2I64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_1D_V2B8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>;
-defm SUST_B_1D_V2B16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>;
-defm SUST_B_1D_V2B32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>;
-defm SUST_B_1D_V2B64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>;
+defm SUST_B_1D_V2I8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>;
+defm SUST_B_1D_V2I16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>;
+defm SUST_B_1D_V2I32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>;
+defm SUST_B_1D_V2I64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>;
 
-defm SUST_P_1D_V2B8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>;
-defm SUST_P_1D_V2B16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>;
-defm SUST_P_1D_V2B32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>;
+defm SUST_B_1D_V2I8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>;
+defm SUST_B_1D_V2I16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>;
+defm SUST_B_1D_V2I32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>;
+defm SUST_B_1D_V2I64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>;
 
-class SUST_1D_V4_base<string inst, NVPTXRegClass intype, dag surf>
+defm SUST_P_1D_V2I8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>;
+defm SUST_P_1D_V2I16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>;
+defm SUST_P_1D_V2I32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>;
+
+class SUST_1D_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g,
                                 intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, intype:$r, intype:$g,
+                intype:$b, intype:$a)]>;
+  def _I : SUST_1D_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_V4B8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_1D_V4B16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_1D_V4B32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>;
+defm SUST_B_1D_V4I8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_1D_V4I16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_1D_V4I32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>;
 
-defm SUST_B_1D_V4B8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>;
-defm SUST_B_1D_V4B16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>;
-defm SUST_B_1D_V4B32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>;
+defm SUST_B_1D_V4I8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>;
+defm SUST_B_1D_V4I16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>;
+defm SUST_B_1D_V4I32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>;
 
-defm SUST_B_1D_V4B8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>;
-defm SUST_B_1D_V4B16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>;
-defm SUST_B_1D_V4B32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>;
+defm SUST_B_1D_V4I8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>;
+defm SUST_B_1D_V4I16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>;
+defm SUST_B_1D_V4I32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>;
 
-defm SUST_P_1D_V4B8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>;
-defm SUST_P_1D_V4B16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>;
-defm SUST_P_1D_V4B32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>;
+defm SUST_P_1D_V4I8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>;
+defm SUST_P_1D_V4I16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>;
+defm SUST_P_1D_V4I32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>;
 
-class SUST_1D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r)),
                 inst # " \t[$s, \\{$idx, $x\\}], \\{$r\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_ARRAY<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_ARRAY_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_ARRAY_base<inst, intype, (ins i64imm:$s)>;
+    defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_ARRAY_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, intype:$r)]>;
+  def _I : SUST_1D_ARRAY_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_ARRAY_B8_CLAMP
+defm SUST_B_1D_ARRAY_I8_CLAMP
   : SUST_1D_ARRAY<"sust.b.a1d.b8.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_B16_CLAMP
+defm SUST_B_1D_ARRAY_I16_CLAMP
   : SUST_1D_ARRAY<"sust.b.a1d.b16.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_B32_CLAMP
+defm SUST_B_1D_ARRAY_I32_CLAMP
   : SUST_1D_ARRAY<"sust.b.a1d.b32.clamp", Int32Regs>;
-defm SUST_B_1D_ARRAY_B64_CLAMP
+defm SUST_B_1D_ARRAY_I64_CLAMP
   : SUST_1D_ARRAY<"sust.b.a1d.b64.clamp", Int64Regs>;
 
-defm SUST_B_1D_ARRAY_B8_TRAP
+defm SUST_B_1D_ARRAY_I8_TRAP
   : SUST_1D_ARRAY<"sust.b.a1d.b8.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_B16_TRAP
+defm SUST_B_1D_ARRAY_I16_TRAP
   : SUST_1D_ARRAY<"sust.b.a1d.b16.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_B32_TRAP
+defm SUST_B_1D_ARRAY_I32_TRAP
   : SUST_1D_ARRAY<"sust.b.a1d.b32.trap", Int32Regs>;
-defm SUST_B_1D_ARRAY_B64_TRAP
+defm SUST_B_1D_ARRAY_I64_TRAP
   : SUST_1D_ARRAY<"sust.b.a1d.b64.trap", Int64Regs>;
 
-defm SUST_B_1D_ARRAY_B8_ZERO
+defm SUST_B_1D_ARRAY_I8_ZERO
   : SUST_1D_ARRAY<"sust.b.a1d.b8.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_B16_ZERO
+defm SUST_B_1D_ARRAY_I16_ZERO
   : SUST_1D_ARRAY<"sust.b.a1d.b16.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_B32_ZERO
+defm SUST_B_1D_ARRAY_I32_ZERO
   : SUST_1D_ARRAY<"sust.b.a1d.b32.zero", Int32Regs>;
-defm SUST_B_1D_ARRAY_B64_ZERO
+defm SUST_B_1D_ARRAY_I64_ZERO
   : SUST_1D_ARRAY<"sust.b.a1d.b64.zero", Int64Regs>;
 
-defm SUST_P_1D_ARRAY_B8_TRAP
+defm SUST_P_1D_ARRAY_I8_TRAP
   : SUST_1D_ARRAY<"sust.p.a1d.b8.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_B16_TRAP
+defm SUST_P_1D_ARRAY_I16_TRAP
   : SUST_1D_ARRAY<"sust.p.a1d.b16.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_B32_TRAP
+defm SUST_P_1D_ARRAY_I32_TRAP
   : SUST_1D_ARRAY<"sust.p.a1d.b32.trap", Int32Regs>;
 
-class SUST_1D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x,
                                 intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_ARRAY_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_ARRAY_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x,
+                intype:$r, intype:$g)]>;
+  def _I : SUST_1D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_ARRAY_V2B8_CLAMP
+defm SUST_B_1D_ARRAY_V2I8_CLAMP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B16_CLAMP
+defm SUST_B_1D_ARRAY_V2I16_CLAMP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B32_CLAMP
+defm SUST_B_1D_ARRAY_V2I32_CLAMP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_1D_ARRAY_V2B64_CLAMP
+defm SUST_B_1D_ARRAY_V2I64_CLAMP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_1D_ARRAY_V2B8_TRAP
+defm SUST_B_1D_ARRAY_V2I8_TRAP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B16_TRAP
+defm SUST_B_1D_ARRAY_V2I16_TRAP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B32_TRAP
+defm SUST_B_1D_ARRAY_V2I32_TRAP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.trap", Int32Regs>;
-defm SUST_B_1D_ARRAY_V2B64_TRAP
+defm SUST_B_1D_ARRAY_V2I64_TRAP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.trap", Int64Regs>;
 
-defm SUST_B_1D_ARRAY_V2B8_ZERO
+defm SUST_B_1D_ARRAY_V2I8_ZERO
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B16_ZERO
+defm SUST_B_1D_ARRAY_V2I16_ZERO
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B32_ZERO
+defm SUST_B_1D_ARRAY_V2I32_ZERO
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.zero", Int32Regs>;
-defm SUST_B_1D_ARRAY_V2B64_ZERO
+defm SUST_B_1D_ARRAY_V2I64_ZERO
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.zero", Int64Regs>;
 
-defm SUST_P_1D_ARRAY_V2B8_TRAP
+defm SUST_P_1D_ARRAY_V2I8_TRAP
   : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b8.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_V2B16_TRAP
+defm SUST_P_1D_ARRAY_V2I16_TRAP
   : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b16.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_V2B32_TRAP
+defm SUST_P_1D_ARRAY_V2I32_TRAP
   : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b32.trap", Int32Regs>;
 
-class SUST_1D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x,
                                 intype:$r, intype:$g, intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_ARRAY_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_ARRAY_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x,
+                intype:$r, intype:$g, intype:$b, intype:$a)]>;
+  def _I : SUST_1D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_ARRAY_V4B8_CLAMP
+defm SUST_B_1D_ARRAY_V4I8_CLAMP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B16_CLAMP
+defm SUST_B_1D_ARRAY_V4I16_CLAMP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B32_CLAMP
+defm SUST_B_1D_ARRAY_V4I32_CLAMP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.clamp", Int32Regs>;
 
-defm SUST_B_1D_ARRAY_V4B8_TRAP
+defm SUST_B_1D_ARRAY_V4I8_TRAP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B16_TRAP
+defm SUST_B_1D_ARRAY_V4I16_TRAP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B32_TRAP
+defm SUST_B_1D_ARRAY_V4I32_TRAP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.trap", Int32Regs>;
 
-defm SUST_B_1D_ARRAY_V4B8_ZERO
+defm SUST_B_1D_ARRAY_V4I8_ZERO
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B16_ZERO
+defm SUST_B_1D_ARRAY_V4I16_ZERO
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B32_ZERO
+defm SUST_B_1D_ARRAY_V4I32_ZERO
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.zero", Int32Regs>;
 
-defm SUST_P_1D_ARRAY_V4B8_TRAP
+defm SUST_P_1D_ARRAY_V4I8_TRAP
   : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_V4B16_TRAP
+defm SUST_P_1D_ARRAY_V4I16_TRAP
   : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_V4B32_TRAP
+defm SUST_P_1D_ARRAY_V4I32_TRAP
   : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", Int32Regs>;
 
-class SUST_2D_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r)),
                 inst # " \t[$s, \\{$x, $y\\}], \\{$r\\};",
-                []>;
+                pat>;
 multiclass SUST_2D<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, intype:$r)]>;
+  def _I : SUST_2D_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_B8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>;
-defm SUST_B_2D_B16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>;
-defm SUST_B_2D_B32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>;
-defm SUST_B_2D_B64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>;
+defm SUST_B_2D_I8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>;
+defm SUST_B_2D_I16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>;
+defm SUST_B_2D_I32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>;
+defm SUST_B_2D_I64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>;
 
-defm SUST_B_2D_B8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>;
-defm SUST_B_2D_B16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>;
-defm SUST_B_2D_B32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>;
-defm SUST_B_2D_B64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>;
+defm SUST_B_2D_I8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>;
+defm SUST_B_2D_I16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>;
+defm SUST_B_2D_I32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>;
+defm SUST_B_2D_I64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>;
 
-defm SUST_B_2D_B8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>;
-defm SUST_B_2D_B16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>;
-defm SUST_B_2D_B32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>;
-defm SUST_B_2D_B64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>;
+defm SUST_B_2D_I8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>;
+defm SUST_B_2D_I16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>;
+defm SUST_B_2D_I32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>;
+defm SUST_B_2D_I64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>;
 
-defm SUST_P_2D_B8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>;
-defm SUST_P_2D_B16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>;
-defm SUST_P_2D_B32_TRAP : SUST_2D<"sust.p.2d.b32.trap", Int32Regs>;
+defm SUST_P_2D_I8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>;
+defm SUST_P_2D_I16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>;
+defm SUST_P_2D_I32_TRAP : SUST_2D<"sust.p.2d.b32.trap", Int32Regs>;
 
-class SUST_2D_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y,
                                 intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                intype:$r, intype:$g)]>;
+  def _I : SUST_2D_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_V2B8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_2D_V2B16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_2D_V2B32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_2D_V2B64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>;
+defm SUST_B_2D_V2I8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_2D_V2I16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_2D_V2I32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_2D_V2I64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_2D_V2B8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>;
-defm SUST_B_2D_V2B16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>;
-defm SUST_B_2D_V2B32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>;
-defm SUST_B_2D_V2B64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>;
+defm SUST_B_2D_V2I8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>;
+defm SUST_B_2D_V2I16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>;
+defm SUST_B_2D_V2I32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>;
+defm SUST_B_2D_V2I64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>;
 
-defm SUST_B_2D_V2B8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>;
-defm SUST_B_2D_V2B16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", Int16Regs>;
-defm SUST_B_2D_V2B32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", Int32Regs>;
-defm SUST_B_2D_V2B64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>;
+defm SUST_B_2D_V2I8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>;
+defm SUST_B_2D_V2I16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", Int16Regs>;
+defm SUST_B_2D_V2I32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", Int32Regs>;
+defm SUST_B_2D_V2I64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>;
 
-defm SUST_P_2D_V2B8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>;
-defm SUST_P_2D_V2B16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>;
-defm SUST_P_2D_V2B32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>;
+defm SUST_P_2D_V2I8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>;
+defm SUST_P_2D_V2I16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>;
+defm SUST_P_2D_V2I32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>;
 
-class SUST_2D_V4_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y,
                                 intype:$r, intype:$g, intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                intype:$r, intype:$g, intype:$b, intype:$a)]>;
+  def _I : SUST_2D_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_V4B8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_2D_V4B16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_2D_V4B32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>;
+defm SUST_B_2D_V4I8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_2D_V4I16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_2D_V4I32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>;
 
-defm SUST_B_2D_V4B8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>;
-defm SUST_B_2D_V4B16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>;
-defm SUST_B_2D_V4B32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>;
+defm SUST_B_2D_V4I8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>;
+defm SUST_B_2D_V4I16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>;
+defm SUST_B_2D_V4I32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>;
 
-defm SUST_B_2D_V4B8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>;
-defm SUST_B_2D_V4B16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>;
-defm SUST_B_2D_V4B32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>;
+defm SUST_B_2D_V4I8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>;
+defm SUST_B_2D_V4I16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>;
+defm SUST_B_2D_V4I32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>;
 
-defm SUST_P_2D_V4B8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>;
-defm SUST_P_2D_V4B16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>;
-defm SUST_P_2D_V4B32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>;
+defm SUST_P_2D_V4I8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>;
+defm SUST_P_2D_V4I16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>;
+defm SUST_P_2D_V4I32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>;
 
-class SUST_2D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
                                 intype:$r)),
                 inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_ARRAY<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_ARRAY_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_ARRAY_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_ARRAY_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                intype:$r)]>;
+  def _I : SUST_2D_ARRAY_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_ARRAY_B8_CLAMP
+defm SUST_B_2D_ARRAY_I8_CLAMP
   : SUST_2D_ARRAY<"sust.b.a2d.b8.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_B16_CLAMP
+defm SUST_B_2D_ARRAY_I16_CLAMP
   : SUST_2D_ARRAY<"sust.b.a2d.b16.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_B32_CLAMP
+defm SUST_B_2D_ARRAY_I32_CLAMP
   : SUST_2D_ARRAY<"sust.b.a2d.b32.clamp", Int32Regs>;
-defm SUST_B_2D_ARRAY_B64_CLAMP
+defm SUST_B_2D_ARRAY_I64_CLAMP
   : SUST_2D_ARRAY<"sust.b.a2d.b64.clamp", Int64Regs>;
 
-defm SUST_B_2D_ARRAY_B8_TRAP
+defm SUST_B_2D_ARRAY_I8_TRAP
   : SUST_2D_ARRAY<"sust.b.a2d.b8.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_B16_TRAP
+defm SUST_B_2D_ARRAY_I16_TRAP
   : SUST_2D_ARRAY<"sust.b.a2d.b16.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_B32_TRAP
+defm SUST_B_2D_ARRAY_I32_TRAP
   : SUST_2D_ARRAY<"sust.b.a2d.b32.trap", Int32Regs>;
-defm SUST_B_2D_ARRAY_B64_TRAP
+defm SUST_B_2D_ARRAY_I64_TRAP
   : SUST_2D_ARRAY<"sust.b.a2d.b64.trap", Int64Regs>;
 
-defm SUST_B_2D_ARRAY_B8_ZERO
+defm SUST_B_2D_ARRAY_I8_ZERO
   : SUST_2D_ARRAY<"sust.b.a2d.b8.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_B16_ZERO
+defm SUST_B_2D_ARRAY_I16_ZERO
   : SUST_2D_ARRAY<"sust.b.a2d.b16.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_B32_ZERO
+defm SUST_B_2D_ARRAY_I32_ZERO
   : SUST_2D_ARRAY<"sust.b.a2d.b32.zero", Int32Regs>;
-defm SUST_B_2D_ARRAY_B64_ZERO
+defm SUST_B_2D_ARRAY_I64_ZERO
   : SUST_2D_ARRAY<"sust.b.a2d.b64.zero", Int64Regs>;
 
-defm SUST_P_2D_ARRAY_B8_TRAP
+defm SUST_P_2D_ARRAY_I8_TRAP
   : SUST_2D_ARRAY<"sust.p.a2d.b8.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_B16_TRAP
+defm SUST_P_2D_ARRAY_I16_TRAP
   : SUST_2D_ARRAY<"sust.p.a2d.b16.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_B32_TRAP
+defm SUST_P_2D_ARRAY_I32_TRAP
   : SUST_2D_ARRAY<"sust.p.a2d.b32.trap", Int32Regs>;
 
-class SUST_2D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
                                 intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_ARRAY_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_ARRAY_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                intype:$r, intype:$g)]>;
+  def _I : SUST_2D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_ARRAY_V2B8_CLAMP
+defm SUST_B_2D_ARRAY_V2I8_CLAMP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B16_CLAMP
+defm SUST_B_2D_ARRAY_V2I16_CLAMP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B32_CLAMP
+defm SUST_B_2D_ARRAY_V2I32_CLAMP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_2D_ARRAY_V2B64_CLAMP
+defm SUST_B_2D_ARRAY_V2I64_CLAMP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_2D_ARRAY_V2B8_TRAP
+defm SUST_B_2D_ARRAY_V2I8_TRAP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B16_TRAP
+defm SUST_B_2D_ARRAY_V2I16_TRAP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B32_TRAP
+defm SUST_B_2D_ARRAY_V2I32_TRAP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.trap", Int32Regs>;
-defm SUST_B_2D_ARRAY_V2B64_TRAP
+defm SUST_B_2D_ARRAY_V2I64_TRAP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.trap", Int64Regs>;
 
-defm SUST_B_2D_ARRAY_V2B8_ZERO
+defm SUST_B_2D_ARRAY_V2I8_ZERO
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B16_ZERO
+defm SUST_B_2D_ARRAY_V2I16_ZERO
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B32_ZERO
+defm SUST_B_2D_ARRAY_V2I32_ZERO
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.zero", Int32Regs>;
-defm SUST_B_2D_ARRAY_V2B64_ZERO
+defm SUST_B_2D_ARRAY_V2I64_ZERO
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.zero", Int64Regs>;
 
-defm SUST_P_2D_ARRAY_V2B8_TRAP
+defm SUST_P_2D_ARRAY_V2I8_TRAP
   : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b8.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_V2B16_TRAP
+defm SUST_P_2D_ARRAY_V2I16_TRAP
   : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b16.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_V2B32_TRAP
+defm SUST_P_2D_ARRAY_V2I32_TRAP
   : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b32.trap", Int32Regs>;
 
-class SUST_2D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
                                 intype:$r, intype:$g, intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_ARRAY_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_ARRAY_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                intype:$r, intype:$g, intype:$b, intype:$a)]>;
+  def _I : SUST_2D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_ARRAY_V4B8_CLAMP
+defm SUST_B_2D_ARRAY_V4I8_CLAMP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B16_CLAMP
+defm SUST_B_2D_ARRAY_V4I16_CLAMP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B32_CLAMP
+defm SUST_B_2D_ARRAY_V4I32_CLAMP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.clamp", Int32Regs>;
 
-defm SUST_B_2D_ARRAY_V4B8_TRAP
+defm SUST_B_2D_ARRAY_V4I8_TRAP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B16_TRAP
+defm SUST_B_2D_ARRAY_V4I16_TRAP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B32_TRAP
+defm SUST_B_2D_ARRAY_V4I32_TRAP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.trap", Int32Regs>;
 
-defm SUST_B_2D_ARRAY_V4B8_ZERO
+defm SUST_B_2D_ARRAY_V4I8_ZERO
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B16_ZERO
+defm SUST_B_2D_ARRAY_V4I16_ZERO
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B32_ZERO
+defm SUST_B_2D_ARRAY_V4I32_ZERO
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.zero", Int32Regs>;
 
-defm SUST_P_2D_ARRAY_V4B8_TRAP
+defm SUST_P_2D_ARRAY_V4I8_TRAP
   : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b8.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_V4B16_TRAP
+defm SUST_P_2D_ARRAY_V4I16_TRAP
   : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b16.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_V4B32_TRAP
+defm SUST_P_2D_ARRAY_V4I32_TRAP
   : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b32.trap", Int32Regs>;
 
-class SUST_3D_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_3D_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
                                 intype:$r)),
                 inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
-                []>;
+                pat>;
 multiclass SUST_3D<string inst, NVPTXRegClass intype> {
-  def _R : SUST_3D_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_3D_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_3D_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                intype:$r)]>;
+  def _I : SUST_3D_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_3D_B8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", Int16Regs>;
-defm SUST_B_3D_B16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>;
-defm SUST_B_3D_B32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>;
-defm SUST_B_3D_B64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>;
+defm SUST_B_3D_I8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", Int16Regs>;
+defm SUST_B_3D_I16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>;
+defm SUST_B_3D_I32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>;
+defm SUST_B_3D_I64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>;
 
-defm SUST_B_3D_B8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>;
-defm SUST_B_3D_B16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>;
-defm SUST_B_3D_B32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>;
-defm SUST_B_3D_B64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>;
+defm SUST_B_3D_I8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>;
+defm SUST_B_3D_I16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>;
+defm SUST_B_3D_I32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>;
+defm SUST_B_3D_I64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>;
 
-defm SUST_B_3D_B8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>;
-defm SUST_B_3D_B16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>;
-defm SUST_B_3D_B32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>;
-defm SUST_B_3D_B64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>;
+defm SUST_B_3D_I8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>;
+defm SUST_B_3D_I16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>;
+defm SUST_B_3D_I32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>;
+defm SUST_B_3D_I64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>;
 
-defm SUST_P_3D_B8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>;
-defm SUST_P_3D_B16_TRAP : SUST_3D<"sust.p.3d.b16.trap", Int16Regs>;
-defm SUST_P_3D_B32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>;
+defm SUST_P_3D_I8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>;
+defm SUST_P_3D_I16_TRAP : SUST_3D<"sust.p.3d.b16.trap", Int16Regs>;
+defm SUST_P_3D_I32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>;
 
-class SUST_3D_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_3D_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
                                 intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_3D_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_3D_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_3D_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_3D_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                intype:$r, intype:$g)]>;
+  def _I : SUST_3D_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_3D_V2B8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_3D_V2B16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_3D_V2B32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_3D_V2B64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>;
+defm SUST_B_3D_V2I8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_3D_V2I16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_3D_V2I32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_3D_V2I64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_3D_V2B8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>;
-defm SUST_B_3D_V2B16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>;
-defm SUST_B_3D_V2B32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>;
-defm SUST_B_3D_V2B64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>;
+defm SUST_B_3D_V2I8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>;
+defm SUST_B_3D_V2I16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>;
+defm SUST_B_3D_V2I32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>;
+defm SUST_B_3D_V2I64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>;
 
-defm SUST_B_3D_V2B8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>;
-defm SUST_B_3D_V2B16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>;
-defm SUST_B_3D_V2B32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>;
-defm SUST_B_3D_V2B64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", Int64Regs>;
+defm SUST_B_3D_V2I8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>;
+defm SUST_B_3D_V2I16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>;
+defm SUST_B_3D_V2I32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>;
+defm SUST_B_3D_V2I64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", Int64Regs>;
 
-defm SUST_P_3D_V2B8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>;
-defm SUST_P_3D_V2B16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>;
-defm SUST_P_3D_V2B32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>;
+defm SUST_P_3D_V2I8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>;
+defm SUST_P_3D_V2I16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>;
+defm SUST_P_3D_V2I32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>;
 
-class SUST_3D_V4_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_3D_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
                                 intype:$r, intype:$g, intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_3D_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_3D_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_3D_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_3D_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                intype:$r, intype:$g, intype:$b, intype:$a)]>;
+  def _I : SUST_3D_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_3D_V4B8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_3D_V4B16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_3D_V4B32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>;
+defm SUST_B_3D_V4I8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_3D_V4I16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_3D_V4I32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>;
 
-defm SUST_B_3D_V4B8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>;
-defm SUST_B_3D_V4B16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>;
-defm SUST_B_3D_V4B32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>;
+defm SUST_B_3D_V4I8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>;
+defm SUST_B_3D_V4I16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>;
+defm SUST_B_3D_V4I32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>;
 
-defm SUST_B_3D_V4B8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>;
-defm SUST_B_3D_V4B16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>;
-defm SUST_B_3D_V4B32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>;
+defm SUST_B_3D_V4I8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>;
+defm SUST_B_3D_V4I16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>;
+defm SUST_B_3D_V4I32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>;
 
-defm SUST_P_3D_V4B8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>;
-defm SUST_P_3D_V4B16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>;
-defm SUST_P_3D_V4B32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>;
+defm SUST_P_3D_V4I8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>;
+defm SUST_P_3D_V4I16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>;
+defm SUST_P_3D_V4I32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>;
 
 }
 
-// Surface store instruction patterns
-// I'm not sure why we can't just include these in the instruction definitions,
-// but TableGen complains of type errors :(
-
-// .clamp variant
-def : Pat<(int_nvvm_sust_b_1d_i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_ARRAY_B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_ARRAY_B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_2D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_2D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_ARRAY_B32_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_ARRAY_B64_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
-           Int32Regs:$g),
-          (SUST_B_2D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
-           Int64Regs:$g),
-          (SUST_B_2D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_3d_i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r),
-          (SUST_B_3D_B32_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r),
-          (SUST_B_3D_B64_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_3D_V2B32_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_3D_V2B64_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_3D_V4B32_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-// .trap variant
-def : Pat<(int_nvvm_sust_b_1d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_1d_array_i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i64_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_ARRAY_B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i16_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i32_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i64_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_2D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_array_i8_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i16_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i32_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_ARRAY_B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i64_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_ARRAY_B64_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
-           Int32Regs:$g),
-          (SUST_B_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
-           Int64Regs:$g),
-          (SUST_B_2D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_3d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r),
-          (SUST_B_3D_B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r),
-          (SUST_B_3D_B64_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_3D_V2B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_3D_V2B64_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_3D_V4B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-// .zero variant
-def : Pat<(int_nvvm_sust_b_1d_i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i8_zero
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i16_zero
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i32_zero
-           Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_1d_array_i8_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i16_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i32_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_ARRAY_B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i64_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_ARRAY_B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i8_zero
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i16_zero
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i32_zero
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_2D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i64_zero
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_2D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_array_i8_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i16_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i32_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_ARRAY_B32_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i64_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_ARRAY_B64_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
-           Int32Regs:$g),
-          (SUST_B_2D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
-           Int64Regs:$g),
-          (SUST_B_2D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_3d_i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r),
-          (SUST_B_3D_B32_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r),
-          (SUST_B_3D_B64_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_3D_V2B32_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_3D_V2B64_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_3D_V4B32_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-
-def : Pat<(int_nvvm_sust_p_1d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_P_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_P_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
-          (SUST_P_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v2i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v2i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v2i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_P_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_p_1d_array_i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_P_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_P_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
-          (SUST_P_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_P_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_p_2d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_P_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_P_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_P_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v2i8_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v2i16_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v2i32_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
-          (SUST_P_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_p_2d_array_i8_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_P_2D_ARRAY_B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_i16_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_P_2D_ARRAY_B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_i32_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_P_2D_ARRAY_B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
-           Int32Regs:$g),
-          (SUST_P_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_p_3d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_P_3D_B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_3d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_P_3D_B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_3d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r),
-          (SUST_P_3D_B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v2i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_3D_V2B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v2i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_3D_V2B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v2i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g),
-          (SUST_P_3D_V2B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_3D_V4B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_3D_V4B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_3D_V4B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
 
 //-----------------------------------
 // Read Special Registers
@@ -6411,13 +5040,13 @@ def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
 
 class PTX_READ_SREG_R64<string regname, Intrinsic intop, list<Predicate> Preds=[]>
   : NVPTXInst<(outs Int64Regs:$d), (ins),
-              !strconcat("mov.u64 \t$d, %", regname, ";"),
+              "mov.u64 \t$d, %" # regname # ";",
               [(set i64:$d, (intop))]>,
     Requires<Preds>;
 
 class PTX_READ_SREG_R32<string regname, Intrinsic intop, list<Predicate> Preds=[]>
   : NVPTXInst<(outs Int32Regs:$d), (ins),
-              !strconcat("mov.u32 \t$d, %", regname, ";"),
+              "mov.u32 \t$d, %" # regname # ";",
               [(set i32:$d, (intop))]>,
     Requires<Preds>;
 
@@ -6547,7 +5176,7 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
          !or(!eq(ptx_elt_type, "f16"),
              !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<60>],
 
-    !and(!eq(geom,"m8n8k4"),
+    !and(!eq(geom, "m8n8k4"),
          !eq(ptx_elt_type, "f64")) : [hasSM<80>, hasPTX<70>],
 
     // fp16 -> fp16/fp32 @ m8n32k16/m32n8k16
@@ -6557,46 +5186,46 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
              !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<61>],
 
     // u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16
-    !and(!or(!eq(geom,"m16n16k16"),
-             !eq(geom,"m8n32k16"),
-             !eq(geom,"m32n8k16")),
+    !and(!or(!eq(geom, "m16n16k16"),
+             !eq(geom, "m8n32k16"),
+             !eq(geom, "m32n8k16")),
          !or(!eq(ptx_elt_type, "u8"),
              !eq(ptx_elt_type, "s8"),
              !eq(ptx_elt_type, "s32"))) : [hasSM<72>, hasPTX<63>],
 
-    !and(!or(!eq(geom,"m16n16k16"),
-             !eq(geom,"m8n32k16"),
-             !eq(geom,"m32n8k16")),
+    !and(!or(!eq(geom, "m16n16k16"),
+             !eq(geom, "m8n32k16"),
+             !eq(geom, "m32n8k16")),
          !eq(ptx_elt_type, "bf16")) : [hasSM<80>, hasPTX<70>],
 
-    !and(!eq(geom,"m16n16k8"),
+    !and(!eq(geom, "m16n16k8"),
          !eq(ptx_elt_type, "tf32")) : [hasSM<80>, hasPTX<70>],
 
-    !and(!eq(geom,"m16n16k8"),
+    !and(!eq(geom, "m16n16k8"),
          !eq(ptx_elt_type, "f32")) : [hasSM<80>, hasPTX<70>],
 
     // b1 -> s32 @ m8n8k128(b1)
-    !and(!ne(op,"mma"),
-         !eq(geom,"m8n8k128")) : [hasSM<75>, hasPTX<63>],
+    !and(!ne(op, "mma"),
+         !eq(geom, "m8n8k128")) : [hasSM<75>, hasPTX<63>],
 
     // u4/s4 -> s32 @ m8n8k32 (u4/s4)
-    !and(!ne(op,"mma"),
-         !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<63>],
+    !and(!ne(op, "mma"),
+         !eq(geom, "m8n8k32")) : [hasSM<75>, hasPTX<63>],
 
-    !or(!eq(geom,"m16n8k8"),
-        !eq(geom,"m8n8k16")) : [hasSM<75>, hasPTX<65>],
+    !or(!eq(geom, "m16n8k8"),
+        !eq(geom, "m8n8k16")) : [hasSM<75>, hasPTX<65>],
 
-    !and(!ne(ptx_elt_type,"f64"),
+    !and(!ne(ptx_elt_type, "f64"),
          !eq(geom, "m8n8k4")) : [hasSM<70>, hasPTX<64>],
 
     // mma m8n8k32 requires higher PTX version
-    !and(!eq(op,"mma"),
-         !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<65>],
+    !and(!eq(op, "mma"),
+         !eq(geom, "m8n8k32")) : [hasSM<75>, hasPTX<65>],
 
-    !and(!eq(ptx_elt_type,"f64"),
+    !and(!eq(ptx_elt_type, "f64"),
          !eq(geom, "m8n8k4")) : [hasSM<80>, hasPTX<70>],
 
-    !and(!eq(op,"mma"),
+    !and(!eq(op, "mma"),
          !or(!eq(geom, "m16n8k16"),
              !eq(geom, "m16n8k4"),
              !eq(geom, "m16n8k32"),
@@ -6605,28 +5234,28 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
              !eq(geom, "m16n8k128"),
              !eq(geom, "m16n8k256"))) : [hasSM<80>, hasPTX<70>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b16"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b16"),
          !eq(geom, "m8n8")) : [hasSM<75>, hasPTX<65>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8"),
          !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8x16.b6x16_p32"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8x16.b6x16_p32"),
          !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8x16.b4x16_p64"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8x16.b4x16_p64"),
          !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8x16.b6x16_p32"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8x16.b6x16_p32"),
          !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8x16.b4x16_p64"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8x16.b4x16_p64"),
          !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]);
 
   // template DAGs for instruction inputs/output.
@@ -6655,7 +5284,7 @@ class WMMA_INSTR<string _Intr, list<dag> _Args>
   : NVPTXInst<(outs), (ins), "?", []> {
   Intrinsic Intr = !cast<Intrinsic>(_Intr);
   // Concatenate all arguments into a single dag.
-  dag Args = !foldl((ins), _Args, a, b, !con(a,b));
+  dag Args = !foldl((ins), _Args, a, b, !con(a, b));
   // Pre-build the pattern to match (intrinsic arg0, arg1, ...).
   dag IntrinsicPattern = BuildPatternI<!cast<Intrinsic>(Intr), Args>.ret;
 }
@@ -6761,7 +5390,7 @@ class MMA_OP_PREDICATES<WMMA_REGINFO FragA, string b1op> {
   WMMA_REGINFO Frag = FragA;
   list<Predicate> ret = !listconcat(
     FragA.Predicates,
-    !if(!eq(b1op, ".and.popc"), [hasSM<80>,hasPTX<71>],[])
+    !if(!eq(b1op, ".and.popc"), [hasSM<80>, hasPTX<71>], [])
   );
 }
 // WMMA.MMA
@@ -7008,25 +5637,22 @@ def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>;
 // Tcgen05 intrinsics
 let isConvergent = true in {
 
-multiclass TCGEN05_ALLOC_INTR<NVPTXRegClass rc, string AS, string num, Intrinsic Intr> {
-  def NAME : NVPTXInst<(outs),
-             (ins rc:$dst, Int32Regs:$ncols),
-             !strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"),
-             [(Intr rc:$dst, Int32Regs:$ncols)]>,
+multiclass TCGEN05_ALLOC_INTR<string AS, string num, Intrinsic Intr> {
+  def "" : BasicNVPTXInst<(outs),
+             (ins ADDR:$dst, Int32Regs:$ncols),
+             "tcgen05.alloc.cta_group::" # num # ".sync.aligned" # AS # ".b32",
+             [(Intr addr:$dst, Int32Regs:$ncols)]>,
              Requires<[hasTcgen05Instructions]>;
 }
 
-defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, "", "1", int_nvvm_tcgen05_alloc_cg1>;
-defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, "", "2", int_nvvm_tcgen05_alloc_cg2>;
+defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<"", "1", int_nvvm_tcgen05_alloc_cg1>;
+defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<"", "2", int_nvvm_tcgen05_alloc_cg2>;
 
-defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
-defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
-
-defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
-defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
+defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
+defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
 
 multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> {
-  def NAME : BasicNVPTXInst<(outs),
+  def "" : BasicNVPTXInst<(outs),
              (ins Int32Regs:$tmem_addr, Int32Regs:$ncols),
              "tcgen05.dealloc.cta_group::" # num # ".sync.aligned.b32",
              [(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>,
@@ -7036,7 +5662,7 @@ defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1
 defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>;
 
 multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> {
-  def NAME : BasicNVPTXInst<(outs), (ins),
+  def "" : BasicNVPTXInst<(outs), (ins),
              "tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned",
              [(Intr)]>,
              Requires<[hasTcgen05Instructions]>;
@@ -7052,36 +5678,33 @@ def tcgen05_wait_st: BasicNVPTXInst<(outs), (ins), "tcgen05.wait::st.sync.aligne
   [(int_nvvm_tcgen05_wait_st)]>,
   Requires<[hasTcgen05Instructions]>;
 
-multiclass TCGEN05_COMMIT_INTR<NVPTXRegClass rc, string AS, string num> {
-  defvar prefix = "tcgen05.commit.cta_group::" # num;
-  defvar suffix = ".mbarrier::arrive::one.shared::cluster";
+multiclass TCGEN05_COMMIT_INTR<string AS, string num> {
+  defvar prefix = "tcgen05.commit.cta_group::" # num #".mbarrier::arrive::one.shared::cluster";
 
   defvar intr_suffix = !if(!eq(AS, "shared"), "_shared", "") # "_cg" # num;
   defvar Intr = !cast<Intrinsic>("int_nvvm_tcgen05_commit" # intr_suffix);
   defvar IntrMC = !cast<Intrinsic>("int_nvvm_tcgen05_commit_mc" # intr_suffix);
 
-  def NAME : NVPTXInst<(outs), (ins rc:$mbar),
-             !strconcat(prefix, suffix, ".b64 [$mbar];"),
-             [(Intr rc:$mbar)]>,
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$mbar),
+             prefix # ".b64",
+             [(Intr addr:$mbar)]>,
              Requires<[hasTcgen05Instructions]>;
-  def NAME # _MC : NVPTXInst<(outs), (ins rc:$mbar, Int16Regs:$mc),
-                   !strconcat(prefix, suffix, ".multicast::cluster.b64 [$mbar], $mc;"),
-                   [(IntrMC rc:$mbar, Int16Regs:$mc)]>,
+  def _MC : BasicNVPTXInst<(outs), (ins ADDR:$mbar, Int16Regs:$mc),
+                   prefix # ".multicast::cluster.b64",
+                   [(IntrMC addr:$mbar, Int16Regs:$mc)]>,
                    Requires<[hasTcgen05Instructions]>;
 }
 
-defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR<Int64Regs, "", "1">;
-defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<Int64Regs, "", "2">;
-defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<Int64Regs, "shared", "1">;
-defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<Int64Regs, "shared", "2">;
-defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR<Int32Regs, "shared", "1">;
-defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR<Int32Regs, "shared", "2">;
+defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR<"", "1">;
+defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">;
+defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">;
+defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">;
 
 multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> {
-  def NAME : NVPTXInst<(outs),
-             (ins Int32Regs:$tmem_addr),
-             !strconcat("tcgen05.shift.cta_group::", num, ".down [$tmem_addr];"),
-             [(Intr Int32Regs:$tmem_addr)]>,
+  def "" : BasicNVPTXInst<(outs),
+             (ins ADDR:$tmem_addr),
+             "tcgen05.shift.cta_group::" # num # ".down",
+             [(Intr addr:$tmem_addr)]>,
              Requires<[hasTcgen05Instructions]>;
 }
 defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>;
@@ -7099,15 +5722,15 @@ multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> {
   defvar IntrCG1 = !cast<Intrinsic>(intr_prefix # "_cg1");
   defvar IntrCG2 = !cast<Intrinsic>(intr_prefix # "_cg2");
 
-  def NAME # _cg1 : NVPTXInst<(outs),
-                    (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc),
-                    "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;",
-                    [(IntrCG1 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>,
+  def _cg1 : BasicNVPTXInst<(outs),
+                    (ins ADDR:$tmem_addr, Int64Regs:$sdesc),
+                    "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm,
+                    [(IntrCG1 addr:$tmem_addr, Int64Regs:$sdesc)]>,
                     Requires<[hasTcgen05Instructions]>;
-  def NAME # _cg2 : NVPTXInst<(outs),
-                    (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc),
-                    "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;",
-                    [(IntrCG2 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>,
+  def _cg2 : BasicNVPTXInst<(outs),
+                    (ins ADDR:$tmem_addr, Int64Regs:$sdesc),
+                    "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm,
+                    [(IntrCG2 addr:$tmem_addr, Int64Regs:$sdesc)]>,
                     Requires<[hasTcgen05Instructions]>;
 }
 
@@ -7222,17 +5845,18 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
 } // isConvergent
 
 // Bulk store instructions
-                            
+def st_bulk_imm : TImmLeaf<i64, [{ return Imm == 0; }]>;
+
 def INT_NVVM_ST_BULK_GENERIC :
-  NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size),
-            "st.bulk [$dest_addr], $size, 0;",
-            [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, (i64 0))]>,
+  BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size, i64imm:$value),
+            "st.bulk",
+            [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>,
             Requires<[hasSM<100>, hasPTX<86>]>;
 
 def INT_NVVM_ST_BULK_SHARED_CTA:
-  NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size),
-            "st.bulk.shared::cta [$dest_addr], $size, 0;",
-            [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, (i64 0))]>,
+  BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size, i64imm:$value),
+            "st.bulk.shared::cta",
+            [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>,
             Requires<[hasSM<100>, hasPTX<86>]>;
 
 //
@@ -7240,17 +5864,15 @@ def INT_NVVM_ST_BULK_SHARED_CTA:
 //
 
 def CLUSTERLAUNCHCONTRL_TRY_CANCEL:
-      NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar),
-                "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 " #
-                "[$addr], [$mbar];",
+      BasicNVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar),
+                "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128",
                 [(int_nvvm_clusterlaunchcontrol_try_cancel_async_shared addr:$addr, addr:$mbar)]>,
       Requires<[hasSM<100>, hasPTX<86>]>;
 
 def CLUSTERLAUNCHCONTRL_TRY_CANCEL_MULTICAST:
-      NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar),
+      BasicNVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar),
                 "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes" #
-                ".multicast::cluster::all.b128 " #
-                "[$addr], [$mbar];",
+                ".multicast::cluster::all.b128",
                 [(int_nvvm_clusterlaunchcontrol_try_cancel_async_multicast_shared addr:$addr, addr:$mbar)]>,
       Requires<[hasSM<100>, hasArchAccelFeatures, hasPTX<86>]>;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index 9b5fe473521a..320c0fb6950a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -408,426 +408,426 @@ static unsigned suldRegisterToIndexOpcode(unsigned RegOC) {
 
 static unsigned sustRegisterToIndexOpcode(unsigned RegOC) {
   switch (RegOC) {
-  case NVPTX::SUST_B_1D_B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_B64_CLAMP_R:
-    return NVPTX::SUST_B_1D_B64_CLAMP_I;
-  case NVPTX::SUST_B_1D_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_1D_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_1D_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_B64_CLAMP_R:
-    return NVPTX::SUST_B_2D_B64_CLAMP_I;
-  case NVPTX::SUST_B_2D_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_2D_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_2D_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_3D_B8_CLAMP_R:
-    return NVPTX::SUST_B_3D_B8_CLAMP_I;
-  case NVPTX::SUST_B_3D_B16_CLAMP_R:
-    return NVPTX::SUST_B_3D_B16_CLAMP_I;
-  case NVPTX::SUST_B_3D_B32_CLAMP_R:
-    return NVPTX::SUST_B_3D_B32_CLAMP_I;
-  case NVPTX::SUST_B_3D_B64_CLAMP_R:
-    return NVPTX::SUST_B_3D_B64_CLAMP_I;
-  case NVPTX::SUST_B_3D_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_3D_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_3D_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_3D_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_3D_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_3D_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_3D_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_3D_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_3D_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_3D_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_3D_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_3D_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_3D_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_3D_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_B8_TRAP_R:
-    return NVPTX::SUST_B_1D_B8_TRAP_I;
-  case NVPTX::SUST_B_1D_B16_TRAP_R:
-    return NVPTX::SUST_B_1D_B16_TRAP_I;
-  case NVPTX::SUST_B_1D_B32_TRAP_R:
-    return NVPTX::SUST_B_1D_B32_TRAP_I;
-  case NVPTX::SUST_B_1D_B64_TRAP_R:
-    return NVPTX::SUST_B_1D_B64_TRAP_I;
-  case NVPTX::SUST_B_1D_V2B8_TRAP_R:
-    return NVPTX::SUST_B_1D_V2B8_TRAP_I;
-  case NVPTX::SUST_B_1D_V2B16_TRAP_R:
-    return NVPTX::SUST_B_1D_V2B16_TRAP_I;
-  case NVPTX::SUST_B_1D_V2B32_TRAP_R:
-    return NVPTX::SUST_B_1D_V2B32_TRAP_I;
-  case NVPTX::SUST_B_1D_V2B64_TRAP_R:
-    return NVPTX::SUST_B_1D_V2B64_TRAP_I;
-  case NVPTX::SUST_B_1D_V4B8_TRAP_R:
-    return NVPTX::SUST_B_1D_V4B8_TRAP_I;
-  case NVPTX::SUST_B_1D_V4B16_TRAP_R:
-    return NVPTX::SUST_B_1D_V4B16_TRAP_I;
-  case NVPTX::SUST_B_1D_V4B32_TRAP_R:
-    return NVPTX::SUST_B_1D_V4B32_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B8_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B8_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B16_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B16_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B32_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B32_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B64_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B64_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_I;
-  case NVPTX::SUST_B_2D_B8_TRAP_R:
-    return NVPTX::SUST_B_2D_B8_TRAP_I;
-  case NVPTX::SUST_B_2D_B16_TRAP_R:
-    return NVPTX::SUST_B_2D_B16_TRAP_I;
-  case NVPTX::SUST_B_2D_B32_TRAP_R:
-    return NVPTX::SUST_B_2D_B32_TRAP_I;
-  case NVPTX::SUST_B_2D_B64_TRAP_R:
-    return NVPTX::SUST_B_2D_B64_TRAP_I;
-  case NVPTX::SUST_B_2D_V2B8_TRAP_R:
-    return NVPTX::SUST_B_2D_V2B8_TRAP_I;
-  case NVPTX::SUST_B_2D_V2B16_TRAP_R:
-    return NVPTX::SUST_B_2D_V2B16_TRAP_I;
-  case NVPTX::SUST_B_2D_V2B32_TRAP_R:
-    return NVPTX::SUST_B_2D_V2B32_TRAP_I;
-  case NVPTX::SUST_B_2D_V2B64_TRAP_R:
-    return NVPTX::SUST_B_2D_V2B64_TRAP_I;
-  case NVPTX::SUST_B_2D_V4B8_TRAP_R:
-    return NVPTX::SUST_B_2D_V4B8_TRAP_I;
-  case NVPTX::SUST_B_2D_V4B16_TRAP_R:
-    return NVPTX::SUST_B_2D_V4B16_TRAP_I;
-  case NVPTX::SUST_B_2D_V4B32_TRAP_R:
-    return NVPTX::SUST_B_2D_V4B32_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B8_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B8_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B16_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B16_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B32_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B32_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B64_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B64_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_I;
-  case NVPTX::SUST_B_3D_B8_TRAP_R:
-    return NVPTX::SUST_B_3D_B8_TRAP_I;
-  case NVPTX::SUST_B_3D_B16_TRAP_R:
-    return NVPTX::SUST_B_3D_B16_TRAP_I;
-  case NVPTX::SUST_B_3D_B32_TRAP_R:
-    return NVPTX::SUST_B_3D_B32_TRAP_I;
-  case NVPTX::SUST_B_3D_B64_TRAP_R:
-    return NVPTX::SUST_B_3D_B64_TRAP_I;
-  case NVPTX::SUST_B_3D_V2B8_TRAP_R:
-    return NVPTX::SUST_B_3D_V2B8_TRAP_I;
-  case NVPTX::SUST_B_3D_V2B16_TRAP_R:
-    return NVPTX::SUST_B_3D_V2B16_TRAP_I;
-  case NVPTX::SUST_B_3D_V2B32_TRAP_R:
-    return NVPTX::SUST_B_3D_V2B32_TRAP_I;
-  case NVPTX::SUST_B_3D_V2B64_TRAP_R:
-    return NVPTX::SUST_B_3D_V2B64_TRAP_I;
-  case NVPTX::SUST_B_3D_V4B8_TRAP_R:
-    return NVPTX::SUST_B_3D_V4B8_TRAP_I;
-  case NVPTX::SUST_B_3D_V4B16_TRAP_R:
-    return NVPTX::SUST_B_3D_V4B16_TRAP_I;
-  case NVPTX::SUST_B_3D_V4B32_TRAP_R:
-    return NVPTX::SUST_B_3D_V4B32_TRAP_I;
-  case NVPTX::SUST_B_1D_B8_ZERO_R:
-    return NVPTX::SUST_B_1D_B8_ZERO_I;
-  case NVPTX::SUST_B_1D_B16_ZERO_R:
-    return NVPTX::SUST_B_1D_B16_ZERO_I;
-  case NVPTX::SUST_B_1D_B32_ZERO_R:
-    return NVPTX::SUST_B_1D_B32_ZERO_I;
-  case NVPTX::SUST_B_1D_B64_ZERO_R:
-    return NVPTX::SUST_B_1D_B64_ZERO_I;
-  case NVPTX::SUST_B_1D_V2B8_ZERO_R:
-    return NVPTX::SUST_B_1D_V2B8_ZERO_I;
-  case NVPTX::SUST_B_1D_V2B16_ZERO_R:
-    return NVPTX::SUST_B_1D_V2B16_ZERO_I;
-  case NVPTX::SUST_B_1D_V2B32_ZERO_R:
-    return NVPTX::SUST_B_1D_V2B32_ZERO_I;
-  case NVPTX::SUST_B_1D_V2B64_ZERO_R:
-    return NVPTX::SUST_B_1D_V2B64_ZERO_I;
-  case NVPTX::SUST_B_1D_V4B8_ZERO_R:
-    return NVPTX::SUST_B_1D_V4B8_ZERO_I;
-  case NVPTX::SUST_B_1D_V4B16_ZERO_R:
-    return NVPTX::SUST_B_1D_V4B16_ZERO_I;
-  case NVPTX::SUST_B_1D_V4B32_ZERO_R:
-    return NVPTX::SUST_B_1D_V4B32_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_B8_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_B8_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_B16_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_B16_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_B32_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_B32_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_B64_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_B64_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_I;
-  case NVPTX::SUST_B_2D_B8_ZERO_R:
-    return NVPTX::SUST_B_2D_B8_ZERO_I;
-  case NVPTX::SUST_B_2D_B16_ZERO_R:
-    return NVPTX::SUST_B_2D_B16_ZERO_I;
-  case NVPTX::SUST_B_2D_B32_ZERO_R:
-    return NVPTX::SUST_B_2D_B32_ZERO_I;
-  case NVPTX::SUST_B_2D_B64_ZERO_R:
-    return NVPTX::SUST_B_2D_B64_ZERO_I;
-  case NVPTX::SUST_B_2D_V2B8_ZERO_R:
-    return NVPTX::SUST_B_2D_V2B8_ZERO_I;
-  case NVPTX::SUST_B_2D_V2B16_ZERO_R:
-    return NVPTX::SUST_B_2D_V2B16_ZERO_I;
-  case NVPTX::SUST_B_2D_V2B32_ZERO_R:
-    return NVPTX::SUST_B_2D_V2B32_ZERO_I;
-  case NVPTX::SUST_B_2D_V2B64_ZERO_R:
-    return NVPTX::SUST_B_2D_V2B64_ZERO_I;
-  case NVPTX::SUST_B_2D_V4B8_ZERO_R:
-    return NVPTX::SUST_B_2D_V4B8_ZERO_I;
-  case NVPTX::SUST_B_2D_V4B16_ZERO_R:
-    return NVPTX::SUST_B_2D_V4B16_ZERO_I;
-  case NVPTX::SUST_B_2D_V4B32_ZERO_R:
-    return NVPTX::SUST_B_2D_V4B32_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_B8_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_B8_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_B16_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_B16_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_B32_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_B32_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_B64_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_B64_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_I;
-  case NVPTX::SUST_B_3D_B8_ZERO_R:
-    return NVPTX::SUST_B_3D_B8_ZERO_I;
-  case NVPTX::SUST_B_3D_B16_ZERO_R:
-    return NVPTX::SUST_B_3D_B16_ZERO_I;
-  case NVPTX::SUST_B_3D_B32_ZERO_R:
-    return NVPTX::SUST_B_3D_B32_ZERO_I;
-  case NVPTX::SUST_B_3D_B64_ZERO_R:
-    return NVPTX::SUST_B_3D_B64_ZERO_I;
-  case NVPTX::SUST_B_3D_V2B8_ZERO_R:
-    return NVPTX::SUST_B_3D_V2B8_ZERO_I;
-  case NVPTX::SUST_B_3D_V2B16_ZERO_R:
-    return NVPTX::SUST_B_3D_V2B16_ZERO_I;
-  case NVPTX::SUST_B_3D_V2B32_ZERO_R:
-    return NVPTX::SUST_B_3D_V2B32_ZERO_I;
-  case NVPTX::SUST_B_3D_V2B64_ZERO_R:
-    return NVPTX::SUST_B_3D_V2B64_ZERO_I;
-  case NVPTX::SUST_B_3D_V4B8_ZERO_R:
-    return NVPTX::SUST_B_3D_V4B8_ZERO_I;
-  case NVPTX::SUST_B_3D_V4B16_ZERO_R:
-    return NVPTX::SUST_B_3D_V4B16_ZERO_I;
-  case NVPTX::SUST_B_3D_V4B32_ZERO_R:
-    return NVPTX::SUST_B_3D_V4B32_ZERO_I;
-  case NVPTX::SUST_P_1D_B8_TRAP_R:
-    return NVPTX::SUST_P_1D_B8_TRAP_I;
-  case NVPTX::SUST_P_1D_B16_TRAP_R:
-    return NVPTX::SUST_P_1D_B16_TRAP_I;
-  case NVPTX::SUST_P_1D_B32_TRAP_R:
-    return NVPTX::SUST_P_1D_B32_TRAP_I;
-  case NVPTX::SUST_P_1D_V2B8_TRAP_R:
-    return NVPTX::SUST_P_1D_V2B8_TRAP_I;
-  case NVPTX::SUST_P_1D_V2B16_TRAP_R:
-    return NVPTX::SUST_P_1D_V2B16_TRAP_I;
-  case NVPTX::SUST_P_1D_V2B32_TRAP_R:
-    return NVPTX::SUST_P_1D_V2B32_TRAP_I;
-  case NVPTX::SUST_P_1D_V4B8_TRAP_R:
-    return NVPTX::SUST_P_1D_V4B8_TRAP_I;
-  case NVPTX::SUST_P_1D_V4B16_TRAP_R:
-    return NVPTX::SUST_P_1D_V4B16_TRAP_I;
-  case NVPTX::SUST_P_1D_V4B32_TRAP_R:
-    return NVPTX::SUST_P_1D_V4B32_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_B8_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_B8_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_B16_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_B16_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_B32_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_B32_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_I;
-  case NVPTX::SUST_P_2D_B8_TRAP_R:
-    return NVPTX::SUST_P_2D_B8_TRAP_I;
-  case NVPTX::SUST_P_2D_B16_TRAP_R:
-    return NVPTX::SUST_P_2D_B16_TRAP_I;
-  case NVPTX::SUST_P_2D_B32_TRAP_R:
-    return NVPTX::SUST_P_2D_B32_TRAP_I;
-  case NVPTX::SUST_P_2D_V2B8_TRAP_R:
-    return NVPTX::SUST_P_2D_V2B8_TRAP_I;
-  case NVPTX::SUST_P_2D_V2B16_TRAP_R:
-    return NVPTX::SUST_P_2D_V2B16_TRAP_I;
-  case NVPTX::SUST_P_2D_V2B32_TRAP_R:
-    return NVPTX::SUST_P_2D_V2B32_TRAP_I;
-  case NVPTX::SUST_P_2D_V4B8_TRAP_R:
-    return NVPTX::SUST_P_2D_V4B8_TRAP_I;
-  case NVPTX::SUST_P_2D_V4B16_TRAP_R:
-    return NVPTX::SUST_P_2D_V4B16_TRAP_I;
-  case NVPTX::SUST_P_2D_V4B32_TRAP_R:
-    return NVPTX::SUST_P_2D_V4B32_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_B8_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_B8_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_B16_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_B16_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_B32_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_B32_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_I;
-  case NVPTX::SUST_P_3D_B8_TRAP_R:
-    return NVPTX::SUST_P_3D_B8_TRAP_I;
-  case NVPTX::SUST_P_3D_B16_TRAP_R:
-    return NVPTX::SUST_P_3D_B16_TRAP_I;
-  case NVPTX::SUST_P_3D_B32_TRAP_R:
-    return NVPTX::SUST_P_3D_B32_TRAP_I;
-  case NVPTX::SUST_P_3D_V2B8_TRAP_R:
-    return NVPTX::SUST_P_3D_V2B8_TRAP_I;
-  case NVPTX::SUST_P_3D_V2B16_TRAP_R:
-    return NVPTX::SUST_P_3D_V2B16_TRAP_I;
-  case NVPTX::SUST_P_3D_V2B32_TRAP_R:
-    return NVPTX::SUST_P_3D_V2B32_TRAP_I;
-  case NVPTX::SUST_P_3D_V4B8_TRAP_R:
-    return NVPTX::SUST_P_3D_V4B8_TRAP_I;
-  case NVPTX::SUST_P_3D_V4B16_TRAP_R:
-    return NVPTX::SUST_P_3D_V4B16_TRAP_I;
-  case NVPTX::SUST_P_3D_V4B32_TRAP_R:
-    return NVPTX::SUST_P_3D_V4B32_TRAP_I;
+  case NVPTX::SUST_B_1D_I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_I64_CLAMP_R:
+    return NVPTX::SUST_B_1D_I64_CLAMP_I;
+  case NVPTX::SUST_B_1D_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_1D_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_1D_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I64_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I64_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_I64_CLAMP_R:
+    return NVPTX::SUST_B_2D_I64_CLAMP_I;
+  case NVPTX::SUST_B_2D_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_2D_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_2D_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I64_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I64_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_3D_I8_CLAMP_R:
+    return NVPTX::SUST_B_3D_I8_CLAMP_I;
+  case NVPTX::SUST_B_3D_I16_CLAMP_R:
+    return NVPTX::SUST_B_3D_I16_CLAMP_I;
+  case NVPTX::SUST_B_3D_I32_CLAMP_R:
+    return NVPTX::SUST_B_3D_I32_CLAMP_I;
+  case NVPTX::SUST_B_3D_I64_CLAMP_R:
+    return NVPTX::SUST_B_3D_I64_CLAMP_I;
+  case NVPTX::SUST_B_3D_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_3D_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_3D_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_3D_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_3D_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_3D_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_3D_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_3D_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_3D_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_3D_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_3D_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_3D_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_3D_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_3D_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_I8_TRAP_R:
+    return NVPTX::SUST_B_1D_I8_TRAP_I;
+  case NVPTX::SUST_B_1D_I16_TRAP_R:
+    return NVPTX::SUST_B_1D_I16_TRAP_I;
+  case NVPTX::SUST_B_1D_I32_TRAP_R:
+    return NVPTX::SUST_B_1D_I32_TRAP_I;
+  case NVPTX::SUST_B_1D_I64_TRAP_R:
+    return NVPTX::SUST_B_1D_I64_TRAP_I;
+  case NVPTX::SUST_B_1D_V2I8_TRAP_R:
+    return NVPTX::SUST_B_1D_V2I8_TRAP_I;
+  case NVPTX::SUST_B_1D_V2I16_TRAP_R:
+    return NVPTX::SUST_B_1D_V2I16_TRAP_I;
+  case NVPTX::SUST_B_1D_V2I32_TRAP_R:
+    return NVPTX::SUST_B_1D_V2I32_TRAP_I;
+  case NVPTX::SUST_B_1D_V2I64_TRAP_R:
+    return NVPTX::SUST_B_1D_V2I64_TRAP_I;
+  case NVPTX::SUST_B_1D_V4I8_TRAP_R:
+    return NVPTX::SUST_B_1D_V4I8_TRAP_I;
+  case NVPTX::SUST_B_1D_V4I16_TRAP_R:
+    return NVPTX::SUST_B_1D_V4I16_TRAP_I;
+  case NVPTX::SUST_B_1D_V4I32_TRAP_R:
+    return NVPTX::SUST_B_1D_V4I32_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I8_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I8_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I16_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I16_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I32_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I32_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I64_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I64_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I8_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I8_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I16_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I16_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I32_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I32_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I64_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I64_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I8_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I8_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I16_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I16_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I32_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I32_TRAP_I;
+  case NVPTX::SUST_B_2D_I8_TRAP_R:
+    return NVPTX::SUST_B_2D_I8_TRAP_I;
+  case NVPTX::SUST_B_2D_I16_TRAP_R:
+    return NVPTX::SUST_B_2D_I16_TRAP_I;
+  case NVPTX::SUST_B_2D_I32_TRAP_R:
+    return NVPTX::SUST_B_2D_I32_TRAP_I;
+  case NVPTX::SUST_B_2D_I64_TRAP_R:
+    return NVPTX::SUST_B_2D_I64_TRAP_I;
+  case NVPTX::SUST_B_2D_V2I8_TRAP_R:
+    return NVPTX::SUST_B_2D_V2I8_TRAP_I;
+  case NVPTX::SUST_B_2D_V2I16_TRAP_R:
+    return NVPTX::SUST_B_2D_V2I16_TRAP_I;
+  case NVPTX::SUST_B_2D_V2I32_TRAP_R:
+    return NVPTX::SUST_B_2D_V2I32_TRAP_I;
+  case NVPTX::SUST_B_2D_V2I64_TRAP_R:
+    return NVPTX::SUST_B_2D_V2I64_TRAP_I;
+  case NVPTX::SUST_B_2D_V4I8_TRAP_R:
+    return NVPTX::SUST_B_2D_V4I8_TRAP_I;
+  case NVPTX::SUST_B_2D_V4I16_TRAP_R:
+    return NVPTX::SUST_B_2D_V4I16_TRAP_I;
+  case NVPTX::SUST_B_2D_V4I32_TRAP_R:
+    return NVPTX::SUST_B_2D_V4I32_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I8_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I8_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I16_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I16_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I32_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I32_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I64_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I64_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I8_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I8_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I16_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I16_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I32_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I32_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I64_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I64_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I8_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I8_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I16_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I16_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I32_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I32_TRAP_I;
+  case NVPTX::SUST_B_3D_I8_TRAP_R:
+    return NVPTX::SUST_B_3D_I8_TRAP_I;
+  case NVPTX::SUST_B_3D_I16_TRAP_R:
+    return NVPTX::SUST_B_3D_I16_TRAP_I;
+  case NVPTX::SUST_B_3D_I32_TRAP_R:
+    return NVPTX::SUST_B_3D_I32_TRAP_I;
+  case NVPTX::SUST_B_3D_I64_TRAP_R:
+    return NVPTX::SUST_B_3D_I64_TRAP_I;
+  case NVPTX::SUST_B_3D_V2I8_TRAP_R:
+    return NVPTX::SUST_B_3D_V2I8_TRAP_I;
+  case NVPTX::SUST_B_3D_V2I16_TRAP_R:
+    return NVPTX::SUST_B_3D_V2I16_TRAP_I;
+  case NVPTX::SUST_B_3D_V2I32_TRAP_R:
+    return NVPTX::SUST_B_3D_V2I32_TRAP_I;
+  case NVPTX::SUST_B_3D_V2I64_TRAP_R:
+    return NVPTX::SUST_B_3D_V2I64_TRAP_I;
+  case NVPTX::SUST_B_3D_V4I8_TRAP_R:
+    return NVPTX::SUST_B_3D_V4I8_TRAP_I;
+  case NVPTX::SUST_B_3D_V4I16_TRAP_R:
+    return NVPTX::SUST_B_3D_V4I16_TRAP_I;
+  case NVPTX::SUST_B_3D_V4I32_TRAP_R:
+    return NVPTX::SUST_B_3D_V4I32_TRAP_I;
+  case NVPTX::SUST_B_1D_I8_ZERO_R:
+    return NVPTX::SUST_B_1D_I8_ZERO_I;
+  case NVPTX::SUST_B_1D_I16_ZERO_R:
+    return NVPTX::SUST_B_1D_I16_ZERO_I;
+  case NVPTX::SUST_B_1D_I32_ZERO_R:
+    return NVPTX::SUST_B_1D_I32_ZERO_I;
+  case NVPTX::SUST_B_1D_I64_ZERO_R:
+    return NVPTX::SUST_B_1D_I64_ZERO_I;
+  case NVPTX::SUST_B_1D_V2I8_ZERO_R:
+    return NVPTX::SUST_B_1D_V2I8_ZERO_I;
+  case NVPTX::SUST_B_1D_V2I16_ZERO_R:
+    return NVPTX::SUST_B_1D_V2I16_ZERO_I;
+  case NVPTX::SUST_B_1D_V2I32_ZERO_R:
+    return NVPTX::SUST_B_1D_V2I32_ZERO_I;
+  case NVPTX::SUST_B_1D_V2I64_ZERO_R:
+    return NVPTX::SUST_B_1D_V2I64_ZERO_I;
+  case NVPTX::SUST_B_1D_V4I8_ZERO_R:
+    return NVPTX::SUST_B_1D_V4I8_ZERO_I;
+  case NVPTX::SUST_B_1D_V4I16_ZERO_R:
+    return NVPTX::SUST_B_1D_V4I16_ZERO_I;
+  case NVPTX::SUST_B_1D_V4I32_ZERO_R:
+    return NVPTX::SUST_B_1D_V4I32_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_I8_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_I8_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_I16_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_I16_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_I32_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_I32_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_I64_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_I64_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I8_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I8_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I16_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I16_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I32_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I32_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I64_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I64_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I8_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I8_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I16_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I16_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I32_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I32_ZERO_I;
+  case NVPTX::SUST_B_2D_I8_ZERO_R:
+    return NVPTX::SUST_B_2D_I8_ZERO_I;
+  case NVPTX::SUST_B_2D_I16_ZERO_R:
+    return NVPTX::SUST_B_2D_I16_ZERO_I;
+  case NVPTX::SUST_B_2D_I32_ZERO_R:
+    return NVPTX::SUST_B_2D_I32_ZERO_I;
+  case NVPTX::SUST_B_2D_I64_ZERO_R:
+    return NVPTX::SUST_B_2D_I64_ZERO_I;
+  case NVPTX::SUST_B_2D_V2I8_ZERO_R:
+    return NVPTX::SUST_B_2D_V2I8_ZERO_I;
+  case NVPTX::SUST_B_2D_V2I16_ZERO_R:
+    return NVPTX::SUST_B_2D_V2I16_ZERO_I;
+  case NVPTX::SUST_B_2D_V2I32_ZERO_R:
+    return NVPTX::SUST_B_2D_V2I32_ZERO_I;
+  case NVPTX::SUST_B_2D_V2I64_ZERO_R:
+    return NVPTX::SUST_B_2D_V2I64_ZERO_I;
+  case NVPTX::SUST_B_2D_V4I8_ZERO_R:
+    return NVPTX::SUST_B_2D_V4I8_ZERO_I;
+  case NVPTX::SUST_B_2D_V4I16_ZERO_R:
+    return NVPTX::SUST_B_2D_V4I16_ZERO_I;
+  case NVPTX::SUST_B_2D_V4I32_ZERO_R:
+    return NVPTX::SUST_B_2D_V4I32_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_I8_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_I8_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_I16_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_I16_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_I32_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_I32_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_I64_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_I64_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I8_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I8_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I16_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I16_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I32_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I32_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I64_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I64_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I8_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I8_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I16_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I16_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I32_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I32_ZERO_I;
+  case NVPTX::SUST_B_3D_I8_ZERO_R:
+    return NVPTX::SUST_B_3D_I8_ZERO_I;
+  case NVPTX::SUST_B_3D_I16_ZERO_R:
+    return NVPTX::SUST_B_3D_I16_ZERO_I;
+  case NVPTX::SUST_B_3D_I32_ZERO_R:
+    return NVPTX::SUST_B_3D_I32_ZERO_I;
+  case NVPTX::SUST_B_3D_I64_ZERO_R:
+    return NVPTX::SUST_B_3D_I64_ZERO_I;
+  case NVPTX::SUST_B_3D_V2I8_ZERO_R:
+    return NVPTX::SUST_B_3D_V2I8_ZERO_I;
+  case NVPTX::SUST_B_3D_V2I16_ZERO_R:
+    return NVPTX::SUST_B_3D_V2I16_ZERO_I;
+  case NVPTX::SUST_B_3D_V2I32_ZERO_R:
+    return NVPTX::SUST_B_3D_V2I32_ZERO_I;
+  case NVPTX::SUST_B_3D_V2I64_ZERO_R:
+    return NVPTX::SUST_B_3D_V2I64_ZERO_I;
+  case NVPTX::SUST_B_3D_V4I8_ZERO_R:
+    return NVPTX::SUST_B_3D_V4I8_ZERO_I;
+  case NVPTX::SUST_B_3D_V4I16_ZERO_R:
+    return NVPTX::SUST_B_3D_V4I16_ZERO_I;
+  case NVPTX::SUST_B_3D_V4I32_ZERO_R:
+    return NVPTX::SUST_B_3D_V4I32_ZERO_I;
+  case NVPTX::SUST_P_1D_I8_TRAP_R:
+    return NVPTX::SUST_P_1D_I8_TRAP_I;
+  case NVPTX::SUST_P_1D_I16_TRAP_R:
+    return NVPTX::SUST_P_1D_I16_TRAP_I;
+  case NVPTX::SUST_P_1D_I32_TRAP_R:
+    return NVPTX::SUST_P_1D_I32_TRAP_I;
+  case NVPTX::SUST_P_1D_V2I8_TRAP_R:
+    return NVPTX::SUST_P_1D_V2I8_TRAP_I;
+  case NVPTX::SUST_P_1D_V2I16_TRAP_R:
+    return NVPTX::SUST_P_1D_V2I16_TRAP_I;
+  case NVPTX::SUST_P_1D_V2I32_TRAP_R:
+    return NVPTX::SUST_P_1D_V2I32_TRAP_I;
+  case NVPTX::SUST_P_1D_V4I8_TRAP_R:
+    return NVPTX::SUST_P_1D_V4I8_TRAP_I;
+  case NVPTX::SUST_P_1D_V4I16_TRAP_R:
+    return NVPTX::SUST_P_1D_V4I16_TRAP_I;
+  case NVPTX::SUST_P_1D_V4I32_TRAP_R:
+    return NVPTX::SUST_P_1D_V4I32_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_I8_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_I8_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_I16_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_I16_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_I32_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_I32_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V2I8_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V2I8_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V2I16_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V2I16_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V2I32_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V2I32_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V4I8_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V4I8_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V4I16_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V4I16_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V4I32_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V4I32_TRAP_I;
+  case NVPTX::SUST_P_2D_I8_TRAP_R:
+    return NVPTX::SUST_P_2D_I8_TRAP_I;
+  case NVPTX::SUST_P_2D_I16_TRAP_R:
+    return NVPTX::SUST_P_2D_I16_TRAP_I;
+  case NVPTX::SUST_P_2D_I32_TRAP_R:
+    return NVPTX::SUST_P_2D_I32_TRAP_I;
+  case NVPTX::SUST_P_2D_V2I8_TRAP_R:
+    return NVPTX::SUST_P_2D_V2I8_TRAP_I;
+  case NVPTX::SUST_P_2D_V2I16_TRAP_R:
+    return NVPTX::SUST_P_2D_V2I16_TRAP_I;
+  case NVPTX::SUST_P_2D_V2I32_TRAP_R:
+    return NVPTX::SUST_P_2D_V2I32_TRAP_I;
+  case NVPTX::SUST_P_2D_V4I8_TRAP_R:
+    return NVPTX::SUST_P_2D_V4I8_TRAP_I;
+  case NVPTX::SUST_P_2D_V4I16_TRAP_R:
+    return NVPTX::SUST_P_2D_V4I16_TRAP_I;
+  case NVPTX::SUST_P_2D_V4I32_TRAP_R:
+    return NVPTX::SUST_P_2D_V4I32_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_I8_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_I8_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_I16_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_I16_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_I32_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_I32_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V2I8_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V2I8_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V2I16_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V2I16_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V2I32_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V2I32_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V4I8_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V4I8_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V4I16_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V4I16_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V4I32_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V4I32_TRAP_I;
+  case NVPTX::SUST_P_3D_I8_TRAP_R:
+    return NVPTX::SUST_P_3D_I8_TRAP_I;
+  case NVPTX::SUST_P_3D_I16_TRAP_R:
+    return NVPTX::SUST_P_3D_I16_TRAP_I;
+  case NVPTX::SUST_P_3D_I32_TRAP_R:
+    return NVPTX::SUST_P_3D_I32_TRAP_I;
+  case NVPTX::SUST_P_3D_V2I8_TRAP_R:
+    return NVPTX::SUST_P_3D_V2I8_TRAP_I;
+  case NVPTX::SUST_P_3D_V2I16_TRAP_R:
+    return NVPTX::SUST_P_3D_V2I16_TRAP_I;
+  case NVPTX::SUST_P_3D_V2I32_TRAP_R:
+    return NVPTX::SUST_P_3D_V2I32_TRAP_I;
+  case NVPTX::SUST_P_3D_V4I8_TRAP_R:
+    return NVPTX::SUST_P_3D_V4I8_TRAP_I;
+  case NVPTX::SUST_P_3D_V4I16_TRAP_R:
+    return NVPTX::SUST_P_3D_V4I16_TRAP_I;
+  case NVPTX::SUST_P_3D_V4I32_TRAP_R:
+    return NVPTX::SUST_P_3D_V4I32_TRAP_I;
   default:
     llvm_unreachable("Unhandled SUST opcode");
   }

From ace356bc9777e6a5b5aa0ba2335d2546ac6f330e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 11 Jun 2025 20:45:32 +0100
Subject: [PATCH 0040/1322] [VPlan] Always verify VPCanonicalIVPHIRecipe
 placement (NFC).

Loop regions are dissolved since dcef154b5caf6556e69bb1, remove the
check for VerifyLate and corresponding TODO.
---
 llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 45010d002158..fba4a68f4a27 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -429,8 +429,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
     return false;
   }
 
-  // TODO: Remove once loop regions are dissolved before execution.
-  if (!VerifyLate && !isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) {
+  if (!isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) {
     errs() << "VPlan vector loop header does not start with a "
               "VPCanonicalIVPHIRecipe\n";
     return false;

From ebc90d50b88a7c46634ea21e40ddb25c679ac874 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:03 -0700
Subject: [PATCH 0041/1322] [SandboxVectorizer] Use llvm::find (NFC) (#143724)

llvm::find allows us to pass a range.
---
 .../llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
index d4cb34647cf5..6d2144b14bb0 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
@@ -68,7 +68,7 @@ public:
   /// the seeds in a bundle. This allows constant time evaluation
   /// and "removal" from the list.
   void setUsed(Instruction *I) {
-    auto It = std::find(begin(), end(), I);
+    auto It = llvm::find(*this, I);
     assert(It != end() && "Instruction not in the bundle!");
     auto Idx = It - begin();
     setUsed(Idx, 1, /*VerifyUnused=*/false);

From e266d6a5da6871c89747416c70a4a39181b594fb Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:11 -0700
Subject: [PATCH 0042/1322] [Format] Use llvm::min_element (NFC) (#143725)

llvm::min_elements allows us to pass a range.
---
 clang/lib/Format/MacroCallReconstructor.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Format/MacroCallReconstructor.cpp b/clang/lib/Format/MacroCallReconstructor.cpp
index 116bbad320e1..895d9f93dfce 100644
--- a/clang/lib/Format/MacroCallReconstructor.cpp
+++ b/clang/lib/Format/MacroCallReconstructor.cpp
@@ -528,10 +528,10 @@ MacroCallReconstructor::createUnwrappedLine(const ReconstructedLine &Line,
       // 1. One level below the current line's level.
       // 2. At the correct level relative to each other.
       unsigned MinChildLevel =
-          std::min_element(N->Children.begin(), N->Children.end(),
-                           [](const auto &E1, const auto &E2) {
-                             return E1->Level < E2->Level;
-                           })
+          llvm::min_element(N->Children,
+                            [](const auto &E1, const auto &E2) {
+                              return E1->Level < E2->Level;
+                            })
               ->get()
               ->Level;
       for (const auto &Child : N->Children) {

From c1d21f44340901f6a23ae7eb7c5379f5ad197b27 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:19 -0700
Subject: [PATCH 0043/1322] [lld] Use std::tie to implement comparison
 operators (NFC) (#143726)

std::tie facilitates lexicographical comparisons through std::tuple's
built-in operator< and operator>.
---
 lld/ELF/SyntheticSections.cpp   | 7 ++-----
 lld/MachO/UnwindInfoSection.cpp | 8 +++-----
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 785a56cdb349..0a9c7a081eb8 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -1939,11 +1939,8 @@ bool AndroidPackedRelocationSection<ELFT>::updateAllocSize(Ctx &ctx) {
   // For Rela, we also want to sort by r_addend when r_info is the same. This
   // enables us to group by r_addend as well.
   llvm::sort(nonRelatives, [](const Elf_Rela &a, const Elf_Rela &b) {
-    if (a.r_info != b.r_info)
-      return a.r_info < b.r_info;
-    if (a.r_addend != b.r_addend)
-      return a.r_addend < b.r_addend;
-    return a.r_offset < b.r_offset;
+    return std::tie(a.r_info, a.r_addend, a.r_offset) <
+           std::tie(b.r_info, b.r_addend, b.r_offset);
   });
 
   // Group relocations with the same r_info. Note that each group emits a group
diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp
index 624464e41d77..6e9f6c2aba74 100644
--- a/lld/MachO/UnwindInfoSection.cpp
+++ b/lld/MachO/UnwindInfoSection.cpp
@@ -535,11 +535,9 @@ void UnwindInfoSectionImpl::finalize() {
   llvm::sort(commonEncodings,
              [](const std::pair<compact_unwind_encoding_t, size_t> &a,
                 const std::pair<compact_unwind_encoding_t, size_t> &b) {
-               if (a.second == b.second)
-                 // When frequencies match, secondarily sort on encoding
-                 // to maintain parity with validate-unwind-info.py
-                 return a.first > b.first;
-               return a.second > b.second;
+               // When frequencies match, secondarily sort on encoding
+               // to maintain parity with validate-unwind-info.py
+               return std::tie(a.second, a.first) > std::tie(b.second, b.first);
              });
 
   // Truncate the vector to 127 elements.

From 8da1ac98efa0d315824a92d8b563299eccc3e0f1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:27 -0700
Subject: [PATCH 0044/1322] [llvm] Use std::tie to implement operator< (NFC)
 (#143728)

std::tie facilitates lexicographical comparisons through std::tuple's
built-in operator<.
---
 .../ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h    | 12 +++---------
 llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp    |  8 ++------
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 24b03a058981..89b20978c40e 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -202,15 +202,9 @@ public:
            IsStubThumb == Other.IsStubThumb;
   }
   inline bool operator<(const RelocationValueRef &Other) const {
-    if (SectionID != Other.SectionID)
-      return SectionID < Other.SectionID;
-    if (Offset != Other.Offset)
-      return Offset < Other.Offset;
-    if (Addend != Other.Addend)
-      return Addend < Other.Addend;
-    if (IsStubThumb != Other.IsStubThumb)
-      return IsStubThumb < Other.IsStubThumb;
-    return SymbolName < Other.SymbolName;
+    return std::tie(SectionID, Offset, Addend, IsStubThumb, SymbolName) <
+           std::tie(Other.SectionID, Other.Offset, Other.Addend,
+                    Other.IsStubThumb, Other.SymbolName);
   }
 };
 
diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index f38e7b879e5f..5dde47ab3de5 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -253,7 +253,7 @@ namespace {
       bool operator!=(Register R) const { return !operator==(R); }
       bool operator<(Register R) const {
         // For std::map.
-        return Reg < R.Reg || (Reg == R.Reg && Sub < R.Sub);
+        return std::tie(Reg, Sub) < std::tie(R.Reg, R.Sub);
       }
       llvm::Register Reg;
       unsigned Sub = 0;
@@ -298,11 +298,7 @@ namespace {
         return !operator==(Ex);
       }
       bool operator<(const ExtExpr &Ex) const {
-        if (Rs != Ex.Rs)
-          return Rs < Ex.Rs;
-        if (S != Ex.S)
-          return S < Ex.S;
-        return !Neg && Ex.Neg;
+        return std::tie(Rs, S, Neg) < std::tie(Ex.Rs, Ex.S, Ex.Neg);
       }
     };
 

From 43c35e858ccae05d69151ccf9712a725aae37b52 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:35 -0700
Subject: [PATCH 0045/1322] [mlir] Simplify calls to *Map::{insert,try_emplace}
 (NFC) (#143729)

This patch simplifies code by removing the values from
insert/try_emplace.  Note that default values inserted by try_emplace
are immediately overrideen in all these cases.
---
 mlir/lib/IR/AsmPrinter.cpp             | 3 +--
 mlir/lib/IR/SymbolTable.cpp            | 2 +-
 mlir/lib/Transforms/Utils/CFGToSCF.cpp | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index fc1806900c0a..c7cc6a02ad20 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -1146,8 +1146,7 @@ template <typename T, typename... PrintArgs>
 std::pair<size_t, size_t> AliasInitializer::visitImpl(
     T value, llvm::MapVector<const void *, InProgressAliasInfo> &aliases,
     bool canBeDeferred, PrintArgs &&...printArgs) {
-  auto [it, inserted] =
-      aliases.insert({value.getAsOpaquePointer(), InProgressAliasInfo()});
+  auto [it, inserted] = aliases.try_emplace(value.getAsOpaquePointer());
   size_t aliasIndex = std::distance(aliases.begin(), it);
   if (!inserted) {
     // Make sure that the alias isn't deferred if we don't permit it.
diff --git a/mlir/lib/IR/SymbolTable.cpp b/mlir/lib/IR/SymbolTable.cpp
index 075a0ba15d7c..aaa4d5617eb4 100644
--- a/mlir/lib/IR/SymbolTable.cpp
+++ b/mlir/lib/IR/SymbolTable.cpp
@@ -1100,7 +1100,7 @@ void SymbolUserMap::replaceAllUsesWith(Operation *symbol,
   if (newSymbol != symbol) {
     // Transfer over the users to the new symbol.  The reference to the old one
     // is fetched again as the iterator is invalidated during the insertion.
-    auto newIt = symbolToUsers.try_emplace(newSymbol, SetVector<Operation *>{});
+    auto newIt = symbolToUsers.try_emplace(newSymbol);
     auto oldIt = symbolToUsers.find(symbol);
     assert(oldIt != symbolToUsers.end() && "missing old users list");
     if (newIt.second)
diff --git a/mlir/lib/Transforms/Utils/CFGToSCF.cpp b/mlir/lib/Transforms/Utils/CFGToSCF.cpp
index de380fc325f5..7c1781044d2a 100644
--- a/mlir/lib/Transforms/Utils/CFGToSCF.cpp
+++ b/mlir/lib/Transforms/Utils/CFGToSCF.cpp
@@ -709,7 +709,7 @@ transformToReduceLoop(Block *loopHeader, Block *exitBlock,
     llvm::SmallDenseMap<Block *, bool> dominanceCache;
     // Returns true if `loopBlock` dominates `block`.
     auto loopBlockDominates = [&](Block *block) {
-      auto [iter, inserted] = dominanceCache.insert({block, false});
+      auto [iter, inserted] = dominanceCache.try_emplace(block);
       if (!inserted)
         return iter->second;
       iter->second = dominanceInfo.dominates(loopBlock, block);

From ad2a2b8eed2f3ed1e050833ea8a8d88b0878c6a7 Mon Sep 17 00:00:00 2001
From: Paul Kirth <paulkirth@google.com>
Date: Wed, 11 Jun 2025 13:05:21 -0700
Subject: [PATCH 0046/1322] [llvm] Add a tool to check mustache compliance
 against the public spec (#142813)

This is a cli tool to that tests the conformance of LLVM's mustache
implementation against the public Mustache spec, hosted at
https://github.com/mustache/spec. This is a revised version of the
patches in #111487.

Co-authored-by: Peter Chou <peter.chou@mail.utoronto.ca>
---
 llvm/CMakeLists.txt                           |   1 +
 llvm/docs/CommandGuide/index.rst              |   1 +
 .../CommandGuide/llvm-test-mustache-spec.rst  |  37 +++
 .../llvm-test-mustache-spec/CMakeLists.txt    |   5 +
 .../llvm-test-mustache-spec.cpp               | 268 ++++++++++++++++++
 5 files changed, 312 insertions(+)
 create mode 100644 llvm/docs/CommandGuide/llvm-test-mustache-spec.rst
 create mode 100644 llvm/utils/llvm-test-mustache-spec/CMakeLists.txt
 create mode 100644 llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 206f009b45f5..cfb67472aa71 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1313,6 +1313,7 @@ if( LLVM_INCLUDE_UTILS )
   add_subdirectory(utils/yaml-bench)
   add_subdirectory(utils/split-file)
   add_subdirectory(utils/mlgo-utils)
+  add_subdirectory(utils/llvm-test-mustache-spec)
   if( LLVM_INCLUDE_TESTS )
     set(LLVM_SUBPROJECT_TITLE "Third-Party/Google Test")
     add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst
index 643951eca2a2..88fc1fd326b7 100644
--- a/llvm/docs/CommandGuide/index.rst
+++ b/llvm/docs/CommandGuide/index.rst
@@ -87,6 +87,7 @@ Developer Tools
    llvm-exegesis
    llvm-ifs
    llvm-locstats
+   llvm-test-mustache-spec
    llvm-pdbutil
    llvm-profgen
    llvm-tli-checker
diff --git a/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst b/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst
new file mode 100644
index 000000000000..8cd5a349e7e4
--- /dev/null
+++ b/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst
@@ -0,0 +1,37 @@
+llvm-test-mustache-spec - LLVM tool to test Mustache library compliance
+=======================================================================
+
+.. program:: llvm-test-mustache-spec
+
+SYNOPSIS
+--------
+
+:program:`llvm-test-mustache-spec` [*inputs...*]
+
+Description
+-----------
+
+``llvm-test-mustache-spec`` tests the mustache spec conformance of the LLVM
+mustache library. The spec can be found here: https://github.com/mustache/spec
+
+To test against the spec, simply download the spec and pass the test JSON files
+to the driver. Each spec file should have a list of tests for compliance with
+the spec. These are loaded as test cases, and rendered with our Mustache
+implementation, which is then compared against the expected output from the
+spec.
+
+The current implementation only supports non-optional parts of the spec, so
+we do not expect any of the dynamic-names, inheritance, or lambda tests to
+pass. Additionally, Triple Mustache is not supported. Unsupported tests are
+marked as XFail and are removed from the XFail list as they are fixed.
+
+The tool prints the number of test failures and successes in each of the test
+files to standard output.
+
+EXAMPLE
+-------
+
+.. code-block:: console
+
+   $ llvm-test-mustache-spec path/to/specs/\*.json
+
diff --git a/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt b/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt
new file mode 100644
index 000000000000..dc1aa73371ff
--- /dev/null
+++ b/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_llvm_utility(llvm-test-mustache-spec
+  llvm-test-mustache-spec.cpp
+)
+
+target_link_libraries(llvm-test-mustache-spec PRIVATE LLVMSupport)
diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
new file mode 100644
index 000000000000..28ed1b876672
--- /dev/null
+++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
@@ -0,0 +1,268 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple drivers to test the mustache spec found at:
+// https://github.com/mustache/spec
+//
+// It is used to verify that the current implementation conforms to the spec.
+// Simply download the spec and pass the test JSON files to the driver. Each
+// spec file should have a list of tests for compliance with the spec. These
+// are loaded as test cases, and rendered with our Mustache implementation,
+// which is then compared against the expected output from the spec.
+//
+// The current implementation only supports non-optional parts of the spec, so
+// we do not expect any of the dynamic-names, inheritance, or lambda tests to
+// pass. Additionally, Triple Mustache is not supported. Unsupported tests are
+// marked as XFail and are removed from the XFail list as they are fixed.
+//
+// Usage:
+//  llvm-test-mustache-spec path/to/test/file.json path/to/test/file2.json ...
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Mustache.h"
+#include "llvm/Support/Path.h"
+#include <string>
+
+using namespace llvm;
+using namespace llvm::json;
+using namespace llvm::mustache;
+
+#define DEBUG_TYPE "llvm-test-mustache-spec"
+
+static cl::OptionCategory Cat("llvm-test-mustache-spec Options");
+
+static cl::list<std::string>
+    InputFiles(cl::Positional, cl::desc("<input files>"), cl::OneOrMore);
+
+static cl::opt<bool> ReportErrors("report-errors",
+                                  cl::desc("Report errors in spec tests"),
+                                  cl::cat(Cat));
+
+static ExitOnError ExitOnErr;
+
+static int NumXFail = 0;
+static int NumSuccess = 0;
+
+static const StringMap<StringSet<>> XFailTestNames = {{
+    {"delimiters.json",
+     {
+         "Pair Behavior",
+         "Special Characters",
+         "Sections",
+         "Inverted Sections",
+         "Partial Inheritence",
+         "Post-Partial Behavior",
+         "Standalone Tag",
+         "Indented Standalone Tag",
+         "Standalone Line Endings",
+         "Standalone Without Previous Line",
+         "Standalone Without Newline",
+     }},
+    {"~dynamic-names.json",
+     {
+         "Basic Behavior - Partial",
+         "Basic Behavior - Name Resolution",
+         "Context",
+         "Dotted Names",
+         "Dotted Names - Failed Lookup",
+         "Dotted names - Context Stacking",
+         "Dotted names - Context Stacking Under Repetition",
+         "Dotted names - Context Stacking Failed Lookup",
+         "Recursion",
+         "Surrounding Whitespace",
+         "Inline Indentation",
+         "Standalone Line Endings",
+         "Standalone Without Previous Line",
+         "Standalone Without Newline",
+         "Standalone Indentation",
+         "Padding Whitespace",
+     }},
+    {"~inheritance.json",
+     {
+         "Default",
+         "Variable",
+         "Triple Mustache",
+         "Sections",
+         "Negative Sections",
+         "Mustache Injection",
+         "Inherit",
+         "Overridden content",
+         "Data does not override block default",
+         "Two overridden parents",
+         "Override parent with newlines",
+         "Inherit indentation",
+         "Only one override",
+         "Parent template",
+         "Recursion",
+         "Multi-level inheritance, no sub child",
+         "Text inside parent",
+         "Text inside parent",
+         "Block scope",
+         "Standalone parent",
+         "Standalone block",
+         "Block reindentation",
+         "Intrinsic indentation",
+         "Nested block reindentation",
+
+     }},
+    {"~lambdas.json",
+     {
+         "Interpolation",
+         "Interpolation - Expansion",
+         "Interpolation - Alternate Delimiters",
+         "Interpolation - Multiple Calls",
+         "Escaping",
+         "Section",
+         "Section - Expansion",
+         "Section - Alternate Delimiters",
+         "Section - Multiple Calls",
+
+     }},
+    {"interpolation.json",
+     {
+         "Triple Mustache",
+         "Triple Mustache Integer Interpolation",
+         "Triple Mustache Decimal Interpolation",
+         "Triple Mustache Null Interpolation",
+         "Triple Mustache Context Miss Interpolation",
+         "Dotted Names - Triple Mustache Interpolation",
+         "Implicit Iterators - Triple Mustache",
+         "Triple Mustache - Surrounding Whitespace",
+         "Triple Mustache - Standalone",
+         "Triple Mustache With Padding",
+     }},
+    {"partials.json", {"Standalone Indentation"}},
+    {"sections.json", {"Implicit Iterator - Triple mustache"}},
+}};
+
+struct TestData {
+  static Expected<TestData> createTestData(json::Object *TestCase,
+                                           StringRef InputFile) {
+    // If any of the needed elements are missing, we cannot continue.
+    // NOTE: partials are optional in the test schema.
+    if (!TestCase || !TestCase->getString("template") ||
+        !TestCase->getString("expected") || !TestCase->getString("name") ||
+        !TestCase->get("data"))
+      return createStringError(
+          llvm::inconvertibleErrorCode(),
+          "invalid JSON schema in test file: " + InputFile + "\n");
+
+    return TestData{TestCase->getString("template").value(),
+                    TestCase->getString("expected").value(),
+                    TestCase->getString("name").value(), TestCase->get("data"),
+                    TestCase->get("partials")};
+  }
+
+  TestData() = default;
+
+  StringRef TemplateStr;
+  StringRef ExpectedStr;
+  StringRef Name;
+  Value *Data;
+  Value *Partials;
+};
+
+static void reportTestFailure(const TestData &TD, StringRef ActualStr,
+                              bool IsXFail) {
+  LLVM_DEBUG(dbgs() << "Template: " << TD.TemplateStr << "\n");
+  if (TD.Partials) {
+    LLVM_DEBUG(dbgs() << "Partial: ");
+    LLVM_DEBUG(TD.Partials->print(dbgs()));
+    LLVM_DEBUG(dbgs() << "\n");
+  }
+  LLVM_DEBUG(dbgs() << "JSON Data: ");
+  LLVM_DEBUG(TD.Data->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n");
+  outs() << formatv("Test {}: {}\n", (IsXFail ? "XFailed" : "Failed"), TD.Name);
+  if (ReportErrors) {
+    outs() << "  Expected: \'" << TD.ExpectedStr << "\'\n"
+           << "  Actual: \'" << ActualStr << "\'\n"
+           << " ====================\n";
+  }
+}
+
+static void registerPartials(Value *Partials, Template &T) {
+  if (!Partials)
+    return;
+  for (const auto &[Partial, Str] : *Partials->getAsObject())
+    T.registerPartial(Partial.str(), Str.getAsString()->str());
+}
+
+static json::Value readJsonFromFile(StringRef &InputFile) {
+  std::unique_ptr<MemoryBuffer> Buffer =
+      ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(InputFile)));
+  return ExitOnErr(parse(Buffer->getBuffer()));
+}
+
+static bool isTestXFail(StringRef FileName, StringRef TestName) {
+  auto P = llvm::sys::path::filename(FileName);
+  auto It = XFailTestNames.find(P);
+  return It != XFailTestNames.end() && It->second.contains(TestName);
+}
+
+static bool evaluateTest(StringRef &InputFile, TestData &TestData,
+                         std::string &ActualStr) {
+  bool IsXFail = isTestXFail(InputFile, TestData.Name);
+  bool Matches = TestData.ExpectedStr == ActualStr;
+  if ((Matches && IsXFail) || (!Matches && !IsXFail)) {
+    reportTestFailure(TestData, ActualStr, IsXFail);
+    return false;
+  }
+  IsXFail ? NumXFail++ : NumSuccess++;
+  return true;
+}
+
+static void runTest(StringRef InputFile) {
+  NumXFail = 0;
+  NumSuccess = 0;
+  outs() << "Running Tests: " << InputFile << "\n";
+  json::Value Json = readJsonFromFile(InputFile);
+
+  json::Object *Obj = Json.getAsObject();
+  Array *TestArray = Obj->getArray("tests");
+  // Even though we parsed the JSON, it can have a bad format, so check it.
+  if (!TestArray)
+    ExitOnErr(createStringError(
+        llvm::inconvertibleErrorCode(),
+        "invalid JSON schema in test file: " + InputFile + "\n"));
+
+  const size_t Total = TestArray->size();
+
+  for (Value V : *TestArray) {
+    auto TestData =
+        ExitOnErr(TestData::createTestData(V.getAsObject(), InputFile));
+    Template T(TestData.TemplateStr);
+    registerPartials(TestData.Partials, T);
+
+    std::string ActualStr;
+    raw_string_ostream OS(ActualStr);
+    T.render(*TestData.Data, OS);
+    evaluateTest(InputFile, TestData, ActualStr);
+  }
+
+  const int NumFailed = Total - NumSuccess - NumXFail;
+  outs() << formatv("===Results===\n"
+                    " Suceeded: {}\n"
+                    " Expectedly Failed: {}\n"
+                    " Failed: {}\n"
+                    " Total: {}\n",
+                    NumSuccess, NumXFail, NumFailed, Total);
+}
+
+int main(int argc, char **argv) {
+  ExitOnErr.setBanner(std::string(argv[0]) + " error: ");
+  cl::ParseCommandLineOptions(argc, argv);
+  for (const auto &FileName : InputFiles)
+    runTest(FileName);
+  return 0;
+}

From e7e491f6ee2baee4e2ab2947e1c64bc54e3ebbec Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 11 Jun 2025 13:06:22 -0700
Subject: [PATCH 0047/1322] [SelectionDAG] Add ISD::VSELECT to
 SelectionDAG::canCreateUndefOrPoison. (#143760)

---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  1 +
 .../RISCV/rvv/combine-reduce-add-to-vcpop.ll  | 73 +++++++++----------
 .../CodeGen/RISCV/rvv/vector-interleave.ll    | 16 ++--
 .../test/CodeGen/X86/avx10_2_512bf16-arith.ll |  2 +-
 llvm/test/CodeGen/X86/avx10_2bf16-arith.ll    |  4 +-
 5 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4fc026ca562b..45a37622a531 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5553,6 +5553,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::BUILD_VECTOR:
   case ISD::BUILD_PAIR:
   case ISD::SPLAT_VECTOR:
+  case ISD::VSELECT:
     return false;
 
   case ISD::SELECT_CC:
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
index 88894f887cc2..5dc532273b77 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
@@ -313,12 +313,12 @@ define i32 @test_nxv128i1(<vscale x 128 x i1> %x) {
 ; CHECK-NEXT:    vslidedown.vx v0, v6, a0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v6, v7, a1
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v5, v6, a0
-; CHECK-NEXT:    vslidedown.vx v4, v7, a0
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
+; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v4
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v7, a0
+; CHECK-NEXT:    vslidedown.vx v5, v6, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v5
 ; CHECK-NEXT:    vadd.vi v16, v16, 1, v0.t
@@ -364,9 +364,9 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    vmv1r.v v7, v9
 ; CHECK-NEXT:    vmv1r.v v5, v8
 ; CHECK-NEXT:    vmv1r.v v4, v0
-; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a2, a0
@@ -376,7 +376,7 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v5
-; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
@@ -388,9 +388,8 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    vslidedown.vx v3, v4, a0
 ; CHECK-NEXT:    vslidedown.vx v2, v5, a0
 ; CHECK-NEXT:    vmv.v.v v0, v3
-; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    mv a3, a2
@@ -398,42 +397,43 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    add a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v2
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v3, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v2, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vim v24, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v4, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v5, a1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vim v24, v24, 1, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v6, a1
 ; CHECK-NEXT:    vslidedown.vx v5, v7, a1
-; CHECK-NEXT:    vslidedown.vx v4, v6, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v4
-; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vadd.vi v24, v24, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v5
-; CHECK-NEXT:    vadd.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v24
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
@@ -443,7 +443,7 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    vslidedown.vx v0, v4, a1
 ; CHECK-NEXT:    vslidedown.vx v3, v5, a1
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vadd.vi v24, v24, 1, v0.t
+; CHECK-NEXT:    vadd.vi v16, v16, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v3
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -451,7 +451,7 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v24
+; CHECK-NEXT:    vadd.vv v8, v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
@@ -492,16 +492,16 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vadd.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vadd.vv v24, v24, v8
+; CHECK-NEXT:    vadd.vv v0, v24, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vadd.vv v8, v8, v0
-; CHECK-NEXT:    vadd.vv v16, v24, v16
+; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vadd.vv v8, v8, v24
+; CHECK-NEXT:    vadd.vv v16, v0, v16
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vmv.s.x v16, zero
 ; CHECK-NEXT:    vredsum.vs v8, v8, v16
@@ -537,18 +537,17 @@ entry:
 define i16 @test_narrow_nxv64i1(<vscale x 64 x i1> %x) {
 ; CHECK-LABEL: test_narrow_nxv64i1:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v0, a0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vadd.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vredsum.vs v8, v16, v8
+; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vmv.s.x v16, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v16
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 77723609a60c..e297e88c71f1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -260,18 +260,18 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
 ; ZIP-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; ZIP-NEXT:    vmv1r.v v9, v0
 ; ZIP-NEXT:    vmv1r.v v0, v8
-; ZIP-NEXT:    vmv.v.i v16, 0
-; ZIP-NEXT:    vmerge.vim v24, v16, 1, v0
+; ZIP-NEXT:    vmv.v.i v24, 0
+; ZIP-NEXT:    vmerge.vim v16, v24, 1, v0
 ; ZIP-NEXT:    vmv1r.v v0, v9
-; ZIP-NEXT:    vmerge.vim v8, v16, 1, v0
+; ZIP-NEXT:    vmerge.vim v8, v24, 1, v0
 ; ZIP-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZIP-NEXT:    ri.vzip2b.vv v4, v8, v24
-; ZIP-NEXT:    ri.vzip2b.vv v20, v12, v28
-; ZIP-NEXT:    ri.vzip2a.vv v0, v8, v24
-; ZIP-NEXT:    ri.vzip2a.vv v16, v12, v28
+; ZIP-NEXT:    ri.vzip2b.vv v4, v8, v16
+; ZIP-NEXT:    ri.vzip2b.vv v28, v12, v20
+; ZIP-NEXT:    ri.vzip2a.vv v0, v8, v16
+; ZIP-NEXT:    ri.vzip2a.vv v24, v12, v20
 ; ZIP-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; ZIP-NEXT:    vmsne.vi v9, v0, 0
-; ZIP-NEXT:    vmsne.vi v8, v16, 0
+; ZIP-NEXT:    vmsne.vi v8, v24, 0
 ; ZIP-NEXT:    vmv1r.v v0, v9
 ; ZIP-NEXT:    ret
   %res = call <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index 1e2cf4956bd0..c22a394e6c4e 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 42831a453cb1..435f67a0f1e4 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]

From 5623b7f2d56ecba84de5d62444feed2dea2b7e25 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 11 Jun 2025 21:08:35 +0100
Subject: [PATCH 0048/1322] [LV] Use GeneratedRTChecks to check if safety
 checks were added (NFC).

Directly check via GeneratedRTChecks if any checks have been added,
instead of needing to go through ILV. This simplifies the code and
enables further refactoring in follow-up patches.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2a237f42e404..d23611183639 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -505,9 +505,6 @@ public:
   /// Fix the vectorized code, taking care of header phi's, and more.
   void fixVectorizedLoop(VPTransformState &State);
 
-  // Return true if any runtime check is added.
-  bool areSafetyChecksAdded() { return AddedSafetyChecks; }
-
   /// Fix the non-induction PHIs in \p Plan.
   void fixNonInductionPHIs(VPTransformState &State);
 
@@ -620,9 +617,6 @@ protected:
   /// The profitablity analysis.
   LoopVectorizationCostModel *Cost;
 
-  // Record whether runtime checks are added.
-  bool AddedSafetyChecks = false;
-
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
@@ -1777,6 +1771,9 @@ class GeneratedRTChecks {
   /// they have been used.
   Value *MemRuntimeCheckCond = nullptr;
 
+  /// True if any checks have been added.
+  bool AddedAnyChecks = false;
+
   DominatorTree *DT;
   LoopInfo *LI;
   TargetTransformInfo *TTI;
@@ -2038,9 +2035,9 @@ public:
     if (AddBranchWeights)
       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
-
     // Mark the check as used, to prevent it from being removed during cleanup.
     SCEVCheckCond = nullptr;
+    AddedAnyChecks = true;
     return SCEVCheckBlock;
   }
 
@@ -2070,8 +2067,12 @@ public:
 
     // Mark the check as used, to prevent it from being removed during cleanup.
     MemRuntimeCheckCond = nullptr;
+    AddedAnyChecks = true;
     return MemCheckBlock;
   }
+
+  /// Return true if any runtime checks have been added
+  bool hasChecks() const { return AddedAnyChecks; }
 };
 } // namespace
 
@@ -2459,7 +2460,6 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
   assert((!Cost->OptForSize ||
           Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
          "Cannot SCEV check stride or overflow when optimizing for size");
-  AddedSafetyChecks = true;
 
   introduceCheckBlockInVPlan(SCEVCheckBlock);
   return SCEVCheckBlock;
@@ -2494,9 +2494,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
     });
   }
 
-
-  AddedSafetyChecks = true;
-
   introduceCheckBlockInVPlan(MemCheckBlock);
   return MemCheckBlock;
 }
@@ -10287,7 +10284,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         }
         ++LoopsEpilogueVectorized;
 
-        if (!MainILV.areSafetyChecksAdded())
+        if (!Checks.hasChecks())
           DisableRuntimeUnroll = true;
       } else {
         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
@@ -10299,7 +10296,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         // Add metadata to disable runtime unrolling a scalar loop when there
         // are no runtime checks about strides and memory. A scalar loop that is
         // rarely used is not worth unrolling.
-        if (!LB.areSafetyChecksAdded())
+        if (!Checks.hasChecks())
           DisableRuntimeUnroll = true;
       }
       // Report the vectorization decision.

From c70658e32debfc3b2c0f6c2b2228ac48e976fd51 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Wed, 11 Jun 2025 13:09:05 -0700
Subject: [PATCH 0049/1322] [bazel] port
 5dafe9dca867b90f20dcd71c620ad823aee4262b

---
 .../llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel    | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index 40f672d8099f..610978059d7e 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -107,6 +107,7 @@ libc_test(
     deps = [
         "//libc:__support_fputil_fp_bits",
         "//libc:atof",
+        "//libc/test/UnitTest:errno_test_helpers",
     ],
 )
 
@@ -206,6 +207,7 @@ libc_test_library(
         "//libc:__support_macros_properties_architectures",
         "//libc:errno",
         "//libc/test/UnitTest:LibcUnitTest",
+        "//libc/test/UnitTest:errno_test_helpers",
     ],
 )
 
@@ -251,6 +253,7 @@ libc_test(
     deps = [
         "//libc:__support_fputil_fp_bits",
         "//libc:strtof",
+        "//libc/test/UnitTest:errno_test_helpers",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
 )
@@ -261,6 +264,7 @@ libc_test(
     deps = [
         "//libc:__support_fputil_fp_bits",
         "//libc:strtod",
+        "//libc/test/UnitTest:errno_test_helpers",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
 )
@@ -272,5 +276,6 @@ libc_test(
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_uint128",
         "//libc:strtold",
+        "//libc/test/UnitTest:errno_test_helpers",
     ],
 )

From 52583b3ed7dd39788360361fc1e21039c8eb5479 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Wed, 11 Jun 2025 20:11:31 +0000
Subject: [PATCH 0050/1322] [libc] Character converter skeleton class (#143619)

Made CharacterConverter class skeleton
---
 libc/hdr/types/char32_t.h                     | 22 ++++++
 libc/hdr/types/char8_t.h                      | 22 ++++++
 libc/hdr/uchar_overlay.h                      | 69 +++++++++++++++++++
 libc/src/__support/wchar/CMakeLists.txt       | 26 +++++++
 .../__support/wchar/character_converter.cpp   | 32 +++++++++
 .../src/__support/wchar/character_converter.h | 39 +++++++++++
 libc/src/__support/wchar/mbstate.h            | 27 ++++++++
 libc/src/__support/wchar/utf_ret.h            | 21 ++++++
 8 files changed, 258 insertions(+)
 create mode 100644 libc/hdr/types/char32_t.h
 create mode 100644 libc/hdr/types/char8_t.h
 create mode 100644 libc/hdr/uchar_overlay.h
 create mode 100644 libc/src/__support/wchar/CMakeLists.txt
 create mode 100644 libc/src/__support/wchar/character_converter.cpp
 create mode 100644 libc/src/__support/wchar/character_converter.h
 create mode 100644 libc/src/__support/wchar/mbstate.h
 create mode 100644 libc/src/__support/wchar/utf_ret.h

diff --git a/libc/hdr/types/char32_t.h b/libc/hdr/types/char32_t.h
new file mode 100644
index 000000000000..94fe5747d341
--- /dev/null
+++ b/libc/hdr/types/char32_t.h
@@ -0,0 +1,22 @@
+//===-- Definition of char32_t.h ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_CHAR32_T_H
+#define LLVM_LIBC_HDR_TYPES_CHAR32_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/char32_t.h"
+
+#else // overlay mode
+
+#include "hdr/uchar_overlay.h"
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_CHAR32_T_H
diff --git a/libc/hdr/types/char8_t.h b/libc/hdr/types/char8_t.h
new file mode 100644
index 000000000000..31de764658f9
--- /dev/null
+++ b/libc/hdr/types/char8_t.h
@@ -0,0 +1,22 @@
+//===-- Definition of char8_t.h -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_CHAR8_T_H
+#define LLVM_LIBC_HDR_TYPES_CHAR8_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/char8_t.h"
+
+#else // overlay mode
+
+#include "hdr/uchar_overlay.h"
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_CHAR8_T_H
diff --git a/libc/hdr/uchar_overlay.h b/libc/hdr/uchar_overlay.h
new file mode 100644
index 000000000000..44ed3d48c6c1
--- /dev/null
+++ b/libc/hdr/uchar_overlay.h
@@ -0,0 +1,69 @@
+//===-- Including uchar.h in overlay mode ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_UCHAR_OVERLAY_H
+#define LLVM_LIBC_HDR_UCHAR_OVERLAY_H
+
+#ifdef LIBC_FULL_BUILD
+#error "This header should only be included in overlay mode"
+#endif
+
+// Overlay mode
+
+// glibc <uchar.h> header might provide extern inline definitions for few
+// functions, causing external alias errors.  They are guarded by
+// `__USE_EXTERN_INLINES` macro.  We temporarily disable `__USE_EXTERN_INLINES`
+// macro by defining `__NO_INLINE__` before including <uchar.h>.
+// And the same with `__USE_FORTIFY_LEVEL`, which will be temporarily disabled
+// with `_FORTIFY_SOURCE`.
+
+#ifdef _FORTIFY_SOURCE
+#define LIBC_OLD_FORTIFY_SOURCE _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+
+#ifndef __NO_INLINE__
+#define __NO_INLINE__ 1
+#define LIBC_SET_NO_INLINE
+#endif
+
+#ifdef __USE_EXTERN_INLINES
+#define LIBC_OLD_USE_EXTERN_INLINES
+#undef __USE_EXTERN_INLINES
+#endif
+
+#ifdef __USE_FORTIFY_LEVEL
+#define LIBC_OLD_USE_FORTIFY_LEVEL __USE_FORTIFY_LEVEL
+#undef __USE_FORTIFY_LEVEL
+#define __USE_FORTIFY_LEVEL 0
+#endif
+
+#include <uchar.h>
+
+#ifdef LIBC_OLD_FORTIFY_SOURCE
+#define _FORTIFY_SOURCE LIBC_OLD_FORTIFY_SOURCE
+#undef LIBC_OLD_FORTIFY_SOURCE
+#endif
+
+#ifdef LIBC_SET_NO_INLINE
+#undef __NO_INLINE__
+#undef LIBC_SET_NO_INLINE
+#endif
+
+#ifdef LIBC_OLD_USE_FORTIFY_LEVEL
+#undef __USE_FORTIFY_LEVEL
+#define __USE_FORTIFY_LEVEL LIBC_OLD_USE_FORTIFY_LEVEL
+#undef LIBC_OLD_USE_FORTIFY_LEVEL
+#endif
+
+#ifdef LIBC_OLD_USE_EXTERN_INLINES
+#define __USE_EXTERN_INLINES
+#undef LIBC_OLD_USE_EXTERN_INLINES
+#endif
+
+#endif // LLVM_LIBC_HDR_UCHAR_OVERLAY_H
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 000000000000..5cca58400ff4
--- /dev/null
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,26 @@
+add_header_library(
+  mbstate
+  HDRS
+    mbstate.h
+  DEPENDS
+    libc.hdr.types.char32_t    
+)
+
+add_object_library(
+  character_converter
+  HDRS
+    character_converter.h
+  SRCS 
+    character_converter.cpp
+  DEPENDS
+    libc.hdr.types.char8_t
+    libc.hdr.types.char32_t
+    .mbstate
+    .utf_ret
+)
+
+add_header_library(
+  utf_ret
+  HDRS
+    utf_ret.h
+)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
new file mode 100644
index 000000000000..0afc2a6f59e6
--- /dev/null
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -0,0 +1,32 @@
+//===-- Implementation of a class for conversion --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+
+#include "character_converter.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; }
+
+bool CharacterConverter::isComplete() {}
+
+int CharacterConverter::push(char8_t utf8_byte) {}
+
+int CharacterConverter::push(char32_t utf32) {}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+
+utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
new file mode 100644
index 000000000000..a6bac4380537
--- /dev/null
+++ b/libc/src/__support/wchar/character_converter.h
@@ -0,0 +1,39 @@
+//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H
+#define LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H
+
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+class CharacterConverter {
+private:
+  mbstate_t *state;
+
+public:
+  CharacterConverter(mbstate_t *mbstate);
+
+  bool isComplete();
+
+  int push(char8_t utf8_byte);
+  int push(char32_t utf32);
+
+  utf_ret<char8_t> pop_utf8();
+  utf_ret<char32_t> pop_utf32();
+};
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
new file mode 100644
index 000000000000..72ec72756000
--- /dev/null
+++ b/libc/src/__support/wchar/mbstate.h
@@ -0,0 +1,27 @@
+//===-- Definition of mbstate-----------------------------------*-- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
+#define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
+
+#include "hdr/types/char32_t.h"
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+struct mbstate {
+  char32_t partial;
+  uint8_t bits_processed;
+  uint8_t total_bytes;
+};
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
new file mode 100644
index 000000000000..b8a8f6f09414
--- /dev/null
+++ b/libc/src/__support/wchar/utf_ret.h
@@ -0,0 +1,21 @@
+//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
+#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
+
+namespace LIBC_NAMESPACE_DECL {
+
+template <typename T> struct utf_ret {
+  T out;
+  int error;
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H

From a2d2941830d9c141d7f43da1ff58e7b7235a9f7d Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Wed, 11 Jun 2025 13:12:37 -0700
Subject: [PATCH 0051/1322] [lldb][RPC] Upstream LLDB to RPC converstion Python
 script (#138028)

As part of upstreaming LLDB RPC, this commit adds a python script that
is used by LLDB RPC to modify the public lldb header files for use with
RPC.

https://discourse.llvm.org/t/rfc-upstreaming-lldb-rpc/85804
---
 .../convert-lldb-header-to-rpc-header.py      | 108 ++++++++++++++++++
 .../TestConvertScript/CheckLLDBDefines.test   |  22 ++++
 .../CheckLLDBEnumerations.test                |  17 +++
 .../TestConvertScript/CheckLLDBTypes.test     |  24 ++++
 .../TestConvertScript/CheckSBDefines.test     |  22 ++++
 .../TestConvertScript/Inputs/SBDefines.h      |  22 ++++
 .../TestConvertScript/Inputs/lldb-defines.h   |  23 ++++
 .../Inputs/lldb-enumerations.h                |  17 +++
 .../TestConvertScript/Inputs/lldb-types.h     |  23 ++++
 9 files changed, 278 insertions(+)
 create mode 100755 lldb/scripts/convert-lldb-header-to-rpc-header.py
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h

diff --git a/lldb/scripts/convert-lldb-header-to-rpc-header.py b/lldb/scripts/convert-lldb-header-to-rpc-header.py
new file mode 100755
index 000000000000..d7734280076f
--- /dev/null
+++ b/lldb/scripts/convert-lldb-header-to-rpc-header.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+Usage: convert-lldb-header-to-rpc-header.py <path/to/input-header.h> <path/to/output-header.h>
+
+This scripts takes common LLDB headers (such as lldb-defines.h) and replaces references to LLDB
+with those for RPC. This happens for:
+- namespace definitions
+- namespace usage
+- version string macros
+- ifdef/ifndef lines
+"""
+
+import argparse
+import os
+import re
+
+
+INCLUDES_TO_REMOVE_REGEX = re.compile(
+    r'#include "lldb/lldb-forward.h"|#include "lldb/lldb-versioning.h"'
+)
+LLDB_GUARD_REGEX = re.compile(r"(?P<guard_type>#.+)LLDB_LLDB_\s*", re.M)
+LLDB_API_GUARD_REGEX = re.compile(r"(?P<guard_type>#.+)LLDB_API_\s*", re.M)
+LLDB_VERSION_REGEX = re.compile(r"#define LLDB_VERSION", re.M)
+LLDB_REVISION_REGEX = re.compile(r"#define LLDB_REVISION", re.M)
+LLDB_VERSION_STRING_REGEX = re.compile(r"#define LLDB_VERSION_STRING", re.M)
+LLDB_LOCAL_INCLUDE_REGEX = re.compile(r'#include "lldb/lldb-\s*', re.M)
+LLDB_NAMESPACE_DEFINITION_REGEX = re.compile(
+    r"(?P<comment_marker>//\s*){,1}namespace lldb\s{1}", re.M
+)
+LLDB_NAMESPACE_REGEX = re.compile(r"\s*.+lldb::\s*", re.M)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input")
+    parser.add_argument("output")
+    args = parser.parse_args()
+    input_path = str(args.input)
+    output_path = str(args.output)
+    with open(input_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+
+    with open(output_path, "w") as output_file:
+        # NOTE: We do not use lldb-forward.h or lldb-versioning.h in RPC, so remove
+        # all includes that are found for these files.
+        file_buffer = re.sub(INCLUDES_TO_REMOVE_REGEX, r"", file_buffer)
+
+        # For lldb-rpc-defines.h, replace the ifndef LLDB_LLDB_ portion with LLDB_RPC_ as we're not
+        # using LLDB private definitions in RPC.
+        lldb_guard_matches = LLDB_GUARD_REGEX.finditer(file_buffer)
+        for match in lldb_guard_matches:
+            file_buffer = re.sub(
+                match.group(),
+                r"{0}LLDB_RPC_".format(match.group("guard_type")),
+                file_buffer,
+            )
+
+        # Similarly to lldb-rpc-defines.h, replace the ifndef for LLDB_API in SBDefines.h to LLDB_RPC_API_ for the same reason.
+        lldb_api_guard_matches = LLDB_API_GUARD_REGEX.finditer(file_buffer)
+        for match in lldb_api_guard_matches:
+            file_buffer = re.sub(
+                match.group(),
+                r"{0}LLDB_RPC_API_".format(match.group("guard_type")),
+                file_buffer,
+            )
+
+        # Replace the references for the macros that define the versioning strings in
+        # lldb-rpc-defines.h.
+        # NOTE: Here we assume that the versioning info has already been uncommented and
+        # populated from the original lldb-defines.h.
+        file_buffer = re.sub(
+            LLDB_VERSION_REGEX, r"#define LLDB_RPC_VERSION", file_buffer
+        )
+        file_buffer = re.sub(
+            LLDB_REVISION_REGEX, r"#define LLDB_RPC_REVISION", file_buffer
+        )
+        file_buffer = re.sub(
+            LLDB_VERSION_STRING_REGEX, r"#define LLDB_RPC_VERSION_STRING", file_buffer
+        )
+
+        # For local #includes
+        file_buffer = re.sub(
+            LLDB_LOCAL_INCLUDE_REGEX, r'#include "lldb-rpc-', file_buffer
+        )
+
+        # Rename the lldb namespace definition to lldb-rpc.
+        lldb_rpc_namespace_definition_matches = (
+            LLDB_NAMESPACE_DEFINITION_REGEX.finditer(file_buffer)
+        )
+        for match in lldb_rpc_namespace_definition_matches:
+            comment_marker = (
+                match.group("comment_marker") if match.group("comment_marker") else ""
+            )
+            file_buffer = re.sub(
+                match.group(),
+                r"{0}namespace lldb_rpc ".format(comment_marker),
+                file_buffer,
+            )
+
+        # Rename the lldb namespace definition to lldb-rpc.
+        file_buffer = re.sub(LLDB_NAMESPACE_REGEX, r"lldb_rpc::", file_buffer)
+
+        output_file.write(file_buffer)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test
new file mode 100644
index 000000000000..0d89d627cfed
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test
@@ -0,0 +1,22 @@
+RUN: mkdir -p %t/Outputs
+
+# Run the convert script on lldb-defines.h.
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-defines.h %t/Outputs/lldb-rpc-defines.h
+
+# Check the output
+RUN: cat %t/Outputs/lldb-rpc-defines.h | FileCheck %s
+
+# The include guards must change from LLDB_LLDB_DEFINES_H to LLDB_RPC_DEFINES_H.
+CHECK: #ifndef LLDB_RPC_DEFINES_H
+CHECK: #define LLDB_RPC_DEFINES_H
+
+# Includes of other lldb headers must begin with "lldb-rpc-".
+CHECK: #include "lldb-rpc-types.h"
+
+# The version info must be changed from LLDB_VERSION to LLDB_RPC_VERSION
+CHECK: #define LLDB_RPC_VERSION 21
+CHECK: #define LLDB_RPC_REVISION 12
+CHECK: #define LLDB_RPC_VERSION_STRING "21.0.12"
+
+# The comment that closes the include guard should match the guard.
+CHECK: #endif // LLDB_RPC_DEFINES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test
new file mode 100644
index 000000000000..0fb3c6f73dd0
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test
@@ -0,0 +1,17 @@
+RUN: mkdir -p %t/Outputs
+
+# Run the convert script on lldb-enumerations.h.
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-enumerations.h %t/Outputs/lldb-rpc-enumerations.h
+
+# Check the output
+RUN: cat %t/Outputs/lldb-rpc-enumerations.h | FileCheck %s
+
+# The include guards must change from LLDB_LLDB_ENUMERATIONS_H to LLDB_RPC_ENUMERATIONS_H.
+CHECK: #ifndef LLDB_RPC_ENUMERATIONS_H
+CHECK: #define LLDB_RPC_ENUMERATIONS_H
+
+# Change the namespace to lldb_rpc. Also, the comment that closes the namespace should match the namespace.
+CHECK: namespace lldb_rpc {} // namespace lldb_rpc
+
+# The comment that closes the include guard should match the guard.
+CHECK: #endif // LLDB_RPC_ENUMERATIONS_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test
new file mode 100644
index 000000000000..86f2d290209e
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test
@@ -0,0 +1,24 @@
+RUN: mkdir -p %t/Outputs
+
+# Run the convert script on lldb-types.h.
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-types.h %t/Outputs/lldb-rpc-types.h
+
+# Check the output
+RUN: cat %t/Outputs/lldb-rpc-types.h | FileCheck %s
+
+# The include guards must change from LLDB_LLDB_TYPES_H to LLDB_RPC_TYPES_H.
+CHECK: #ifndef LLDB_RPC_TYPES_H
+CHECK: #define LLDB_RPC_TYPES_H
+
+# Includes of other lldb headers must begin with "lldb-rpc-".
+# Also, the includes for lldb-forward.h should be removed.
+CHECK: #include "lldb-rpc-enumerations.h"
+
+# Change the namespace to lldb_rpc.
+CHECK: namespace lldb_rpc
+
+# The comment that closes the namespace should match the namespace.
+CHECK: // namespace lldb_rpc
+
+# The comment that closes the include guard should match the guard.
+CHECK: #endif // LLDB_RPC_TYPES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test
new file mode 100644
index 000000000000..72444aaf069a
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test
@@ -0,0 +1,22 @@
+RUN: mkdir -p %t/Outputs
+
+# Run the convert script on SBDefines.h.
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/SBDefines.h %t/Outputs/SBDefines.h
+
+# Check the output
+RUN: cat %t/Outputs/SBDefines.h | FileCheck %s
+
+# The include guards must change from LLDB_LLDB_API_SBDEFINES_H to LLDB_RPC_API_SBDEFINES_H.
+CHECK: #ifndef LLDB_RPC_API_SBDEFINES_H
+CHECK: #define LLDB_RPC_API_SBDEFINES_H
+
+# Includes of other lldb headers must begin with "lldb-rpc-".
+# Also, the includes for lldb-forward.h and lldb-versioning.h should be removed.
+CHECK: #include "lldb-rpc-defines.h"
+CHECK-NOT: #include "lldb-rpc-forward.h"
+CHECK: #include "lldb-rpc-enumerations.h"
+CHECK: #include "lldb-rpc-types.h"
+CHECK-NOT: #include "lldb-rpc-versioning.h"
+
+# The comment that closes the include guard should match the guard.
+CHECK: #endif // LLDB_RPC_API_SBDEFINES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h
new file mode 100644
index 000000000000..50476c402ba7
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h
@@ -0,0 +1,22 @@
+// This is a truncated version of SBDefines.h used to test that the script
+// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in
+// the original file to RPC references.
+
+// The include guard should change from LLDB_LLDB to LLDB_RPC.
+// LLDB_API_SBDEFINES_H -> LLDB_RPC_SBDEFINES_H
+#ifndef LLDB_API_SBDEFINES_H
+#define LLDB_API_SBDEFINES_H
+
+// Includes of public main LLDB headers should change to their RPC equivalents:
+// "lldb/lldb-defines.h" -> "lldb-rpc-defines.h"
+// Also, the includes for lldb-forward.h and lldb-versioning.h should be removed.
+#include "lldb/lldb-defines.h"
+#include "lldb/lldb-enumerations.h"
+#include "lldb/lldb-forward.h"
+#include "lldb/lldb-types.h"
+#include "lldb/lldb-versioning.h"
+
+// The comment that closes the include guard must change in the same way
+// the original guard did.
+// #endif // LLDB_API_SBDEFINES_H -> #endif // LLDB_RPC_API_SBDEFINES_H
+#endif // LLDB_API_SBDEFINES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h
new file mode 100644
index 000000000000..32064430b3d0
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h
@@ -0,0 +1,23 @@
+// This is a truncated version of lldb-defines.h used to test that the script
+// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in
+// the original file to RPC references.
+
+// The include guard should change from LLDB_LLDB to LLDB_RPC.
+// LLDB_LLDB_DEFINES_H -> LLDB_RPC_DEFINES_H
+#ifndef LLDB_LLDB_DEFINES_H
+#define LLDB_LLDB_DEFINES_H
+
+// Includes of public main LLDB headers should change to their RPC equivalents:
+// "lldb/lldb-types.h" -> "lldb-rpc-types.h"
+#include "lldb/lldb-types.h"
+
+// The LLDB version must change from LLDB to LLDB_RPC
+// LLDB_VERSION -> LLDB_RPC_VERSION
+#define LLDB_VERSION 21
+#define LLDB_REVISION 12
+#define LLDB_VERSION_STRING "21.0.12"
+
+// The comment that closes the include guard must change in the same way
+// the original guard did.
+// #endif // LLDB_LLDB_DEFINES_H -> #endif // LLDB_RPC_DEFINES_H
+#endif // LLDB_LLDB_DEFINES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h
new file mode 100644
index 000000000000..42c4bb277fc4
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h
@@ -0,0 +1,17 @@
+// This is a truncated version of lldb-enumerations.h used to test that the script
+// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in
+// the original file to RPC references.
+
+// The include guard should change from LLDB_LLDB to LLDB_RPC.
+// LLDB_LLDB_ENUMERATIONS_H -> LLDB_RPC_ENUMERATIONS_H
+#ifndef LLDB_LLDB_ENUMERATIONS_H
+#define LLDB_LLDB_ENUMERATIONS_H
+
+// The namespace definition should change to the lldb_rpc namespace, so should the comment that closes it:
+// namespace lldb -> namespace lldb_rpc
+namespace lldb {} // namespace lldb
+
+// The comment that closes the include guard must change in the same way
+// the original guard did:
+// #endif // LLDB_LLDB_ENUMERATIONS_H -> #endif // LLDB_RPC_ENUMERATIONS_H
+#endif // LLDB_LLDB_ENUMERATIONS_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h
new file mode 100644
index 000000000000..5a49920405ec
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h
@@ -0,0 +1,23 @@
+// This is a truncated version of lldb-types.h used to test that the script
+// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in
+// the original file to RPC references.
+
+// The include guard should change from LLDB_LLDB to LLDB_RPC.
+// LLDB_LLDB_TYPES_H -> LLDB_RPC_TYPES_H
+#ifndef LLDB_LLDB_TYPES_H
+#define LLDB_LLDB_TYPES_H
+
+// Includes of public main LLDB headers should change to their RPC equivalents:
+// "lldb/lldb-defines.h" -> "lldb-rpc-defines.h":
+// Also, the includes for lldb-forward.h should be removed.
+#include "lldb/lldb-enumerations.h"
+#include "lldb/lldb-forward.h"
+
+// The namespace definition should change to the lldb_rpc namespace, so should the comment that closes it:
+// namespace lldb -> namespace lldb_rpc
+namespace lldb {} // namespace lldb
+
+// The comment that closes the include guard must change in the same way
+// the original guard did:
+// #endif // LLDB_LLDB_TYPES_H -> #endif // LLDB_RPC_TYPES_H
+#endif // LLDB_LLDB_TYPES_H

From b42aef5e6f32a3ac6c259cb4cacf58239400b5aa Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Wed, 11 Jun 2025 13:12:59 -0700
Subject: [PATCH 0052/1322] [flang] Don't duplicate hermetic module file
 dependencies (#143605)

When emitting the modules on which a module depends under the
-fhermetic-module-files options, eliminate duplicates by name rather
than by symbol addresses. This way, when a dependent module is in the
symbol table more than once due to the use of a nested hermetic module,
it doesn't get emitted multiple times to the new module file.
---
 flang/lib/Semantics/mod-file.cpp   | 18 +++++++++------
 flang/test/Semantics/modfile77.F90 | 37 ++++++++++++++++++++++++++++++
 flang/test/Semantics/modfile78.F90 | 33 ++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 7 deletions(-)
 create mode 100644 flang/test/Semantics/modfile77.F90
 create mode 100644 flang/test/Semantics/modfile78.F90

diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index a72641866aa1..9f9e9f584045 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -143,18 +143,22 @@ void ModFileWriter::Write(const Symbol &symbol) {
   std::string path{context_.moduleDirectory() + '/' +
       ModFileName(symbol.name(), ancestorName, context_.moduleFileSuffix())};
 
-  UnorderedSymbolSet hermeticModules;
-  hermeticModules.insert(symbol);
+  std::set<std::string> hermeticModuleNames;
+  hermeticModuleNames.insert(symbol.name().ToString());
   UnorderedSymbolSet additionalModules;
   PutSymbols(DEREF(symbol.scope()),
       hermeticModuleFileOutput_ ? &additionalModules : nullptr);
   auto asStr{GetAsString(symbol)};
   while (!additionalModules.empty()) {
-    for (auto ref : UnorderedSymbolSet{std::move(additionalModules)}) {
-      if (hermeticModules.insert(*ref).second &&
-          !ref->owner().IsIntrinsicModules()) {
-        PutSymbols(DEREF(ref->scope()), &additionalModules);
-        asStr += GetAsString(*ref);
+    UnorderedSymbolSet nextPass{std::move(additionalModules)};
+    additionalModules.clear();
+    for (const Symbol &modSym : nextPass) {
+      if (!modSym.owner().IsIntrinsicModules() &&
+          hermeticModuleNames.find(modSym.name().ToString()) ==
+              hermeticModuleNames.end()) {
+        hermeticModuleNames.insert(modSym.name().ToString());
+        PutSymbols(DEREF(modSym.scope()), &additionalModules);
+        asStr += GetAsString(modSym);
       }
     }
   }
diff --git a/flang/test/Semantics/modfile77.F90 b/flang/test/Semantics/modfile77.F90
new file mode 100644
index 000000000000..a82904ebbcc2
--- /dev/null
+++ b/flang/test/Semantics/modfile77.F90
@@ -0,0 +1,37 @@
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile77c.mod | FileCheck %s
+
+#if WHICH == 1
+module modfile77a
+  interface gen
+    procedure proc
+  end interface
+ contains
+  subroutine proc
+    print *, 'ok'
+  end
+end
+#elif WHICH == 2
+module modfile77b
+  use modfile77a
+end
+#else
+module modfile77c
+  use modfile77a
+  use modfile77b
+end
+#endif
+
+!CHECK: module modfile77c
+!CHECK: use modfile77a,only:proc
+!CHECK: use modfile77a,only:gen
+!CHECK: interface gen
+!CHECK: end interface
+!CHECK: end
+!CHECK: module modfile77a
+!CHECK: interface gen
+!CHECK: procedure::proc
+!CHECK: end interface
+!CHECK: contains
+!CHECK: subroutine proc()
+!CHECK: end
+!CHECK: end
diff --git a/flang/test/Semantics/modfile78.F90 b/flang/test/Semantics/modfile78.F90
new file mode 100644
index 000000000000..cb3eccd9a410
--- /dev/null
+++ b/flang/test/Semantics/modfile78.F90
@@ -0,0 +1,33 @@
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile78c.mod | FileCheck %s
+
+#if WHICH == 1
+module modfile78a
+  integer :: global_variable = 0
+end
+#elif WHICH == 2
+module modfile78b
+  use modfile78a
+ contains
+  subroutine test
+  end
+end
+#else
+module modfile78c
+  use modfile78a
+  use modfile78b
+end
+#endif
+
+!CHECK: module modfile78c
+!CHECK: use modfile78a,only:global_variable
+!CHECK: use modfile78b,only:test
+!CHECK: end
+!CHECK: module modfile78a
+!CHECK: integer(4)::global_variable
+!CHECK: end
+!CHECK: module modfile78b
+!CHECK: use modfile78a,only:global_variable
+!CHECK: contains
+!CHECK: subroutine test()
+!CHECK: end
+!CHECK: end

From e389a0e7bb3d7aabbd10b9ba8f432f292de65649 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Wed, 11 Jun 2025 20:17:35 +0000
Subject: [PATCH 0053/1322] [libc] Switched calls to inline_memcpy to
 __builtin_memcpy for wide char utilities (#143011)

Switched calls to inline_memcpy to __builtin_memcpy for wide char
utilities
Removed unnecessary wctype_utils dependencies from the cmake file
---
 libc/src/wchar/CMakeLists.txt | 9 ---------
 libc/src/wchar/wcscpy.cpp     | 3 +--
 libc/src/wchar/wcsncpy.cpp    | 2 --
 libc/src/wchar/wmemcpy.cpp    | 3 +--
 libc/src/wchar/wmempcpy.cpp   | 3 +--
 5 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 759f708c2247..4b8802ede5f5 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -43,7 +43,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.types.wchar_t
-    libc.src.__support.wctype_utils
 )
 
 add_entrypoint_object(
@@ -54,7 +53,6 @@ add_entrypoint_object(
     wcschr.h
   DEPENDS
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
 )
 
 add_entrypoint_object(
@@ -75,7 +73,6 @@ add_entrypoint_object(
     wcspbrk.h
   DEPENDS
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
     libc.src.__support.macros.null_check
 )
 
@@ -109,7 +106,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.wchar_macros
     libc.hdr.types.size_t
-    libc.src.__support.wctype_utils
 )
 
 add_entrypoint_object(
@@ -121,7 +117,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
     libc.src.__support.macros.null_check
 )
 
@@ -134,7 +129,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
 )
 
 add_entrypoint_object(
@@ -205,8 +199,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
-    libc.src.string.memory_utils.inline_memcpy
 )
 
 add_entrypoint_object(
@@ -218,6 +210,5 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.wchar_macros
-    libc.src.string.memory_utils.inline_memcpy
     libc.src.string.string_utils
 )
diff --git a/libc/src/wchar/wcscpy.cpp b/libc/src/wchar/wcscpy.cpp
index dc46b972c59f..01ba994cecbb 100644
--- a/libc/src/wchar/wcscpy.cpp
+++ b/libc/src/wchar/wcscpy.cpp
@@ -12,7 +12,6 @@
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/string/memory_utils/inline_memcpy.h"
 #include "src/string/string_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -20,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(wchar_t *, wcscpy,
                    (wchar_t *__restrict s1, const wchar_t *__restrict s2)) {
   size_t size = internal::string_length(s2) + 1;
-  inline_memcpy(s1, s2, size * sizeof(wchar_t));
+  __builtin_memcpy(s1, s2, size * sizeof(wchar_t));
   return s1;
 }
 
diff --git a/libc/src/wchar/wcsncpy.cpp b/libc/src/wchar/wcsncpy.cpp
index e7ae9a4a0da7..7ad6730cd776 100644
--- a/libc/src/wchar/wcsncpy.cpp
+++ b/libc/src/wchar/wcsncpy.cpp
@@ -12,8 +12,6 @@
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/string/memory_utils/inline_memcpy.h"
-#include "src/string/string_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/wchar/wmemcpy.cpp b/libc/src/wchar/wmemcpy.cpp
index 56708d6cee49..bf92309b2094 100644
--- a/libc/src/wchar/wmemcpy.cpp
+++ b/libc/src/wchar/wmemcpy.cpp
@@ -12,14 +12,13 @@
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(wchar_t *, wmemcpy,
                    (wchar_t *__restrict s1, const wchar_t *__restrict s2,
                     size_t n)) {
-  inline_memcpy(s1, s2, n * sizeof(wchar_t));
+  __builtin_memcpy(s1, s2, n * sizeof(wchar_t));
   return s1;
 }
 
diff --git a/libc/src/wchar/wmempcpy.cpp b/libc/src/wchar/wmempcpy.cpp
index d8b89c0a88d0..21e16210a757 100644
--- a/libc/src/wchar/wmempcpy.cpp
+++ b/libc/src/wchar/wmempcpy.cpp
@@ -11,14 +11,13 @@
 #include "hdr/types/size_t.h"
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(wchar_t *, wmempcpy,
                    (wchar_t *__restrict to, const wchar_t *__restrict from,
                     size_t size)) {
-  inline_memcpy(to, from, size * sizeof(wchar_t));
+  __builtin_memcpy(to, from, size * sizeof(wchar_t));
   return reinterpret_cast<wchar_t *>(to) + size;
 }
 

From fb761aa38b0bc01ab911f5dbbfb474b70aaafbb4 Mon Sep 17 00:00:00 2001
From: Rolf Morel <rolf.morel@intel.com>
Date: Wed, 11 Jun 2025 21:19:52 +0100
Subject: [PATCH 0054/1322] [MLIR][Transform] apply_registered_op fixes: arg
 order & python options auto-conversion (#143779)

---
 .../mlir/Dialect/Transform/IR/TransformOps.td |  6 +++---
 .../mlir/dialects/transform/__init__.py       | 18 +++++++++++-------
 .../Transform/test-pass-application.mlir      | 19 +++++++++----------
 mlir/test/python/dialects/transform.py        | 10 +++++-----
 4 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index f75ba27e58e7..0aa750e62543 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -434,10 +434,10 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass",
     of targeted ops.
   }];
 
-  let arguments = (ins StrAttr:$pass_name,
+  let arguments = (ins TransformHandleTypeInterface:$target,
+                       StrAttr:$pass_name,
                        DefaultValuedAttr<DictionaryAttr, "{}">:$options,
-                       Variadic<TransformParamTypeInterface>:$dynamic_options,
-                       TransformHandleTypeInterface:$target);
+                       Variadic<TransformParamTypeInterface>:$dynamic_options);
   let results = (outs TransformHandleTypeInterface:$result);
   let assemblyFormat = [{
     $pass_name (`with` `options` `=`
diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py
index 10a04b0cc14e..bfe96b1b3e5d 100644
--- a/mlir/python/mlir/dialects/transform/__init__.py
+++ b/mlir/python/mlir/dialects/transform/__init__.py
@@ -224,13 +224,13 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
     def __init__(
         self,
         result: Type,
-        pass_name: Union[str, StringAttr],
         target: Union[Operation, Value, OpView],
+        pass_name: Union[str, StringAttr],
         *,
         options: Optional[
             Dict[
                 Union[str, StringAttr],
-                Union[Attribute, Value, Operation, OpView],
+                Union[Attribute, Value, Operation, OpView, str, int, bool],
             ]
         ] = None,
         loc=None,
@@ -253,17 +253,21 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
                 cur_param_operand_idx += 1
             elif isinstance(value, Attribute):
                 options_dict[key] = value
+            # The following cases auto-convert Python values to attributes.
+            elif isinstance(value, bool):
+                options_dict[key] = BoolAttr.get(value)
+            elif isinstance(value, int):
+                default_int_type = IntegerType.get_signless(64, context)
+                options_dict[key] = IntegerAttr.get(default_int_type, value)
             elif isinstance(value, str):
                 options_dict[key] = StringAttr.get(value)
             else:
                 raise TypeError(f"Unsupported option type: {type(value)}")
-        if len(options_dict) > 0:
-            print(options_dict, cur_param_operand_idx)
         super().__init__(
             result,
+            _get_op_result_or_value(target),
             pass_name,
             dynamic_options,
-            target=_get_op_result_or_value(target),
             options=DictAttr.get(options_dict),
             loc=loc,
             ip=ip,
@@ -272,13 +276,13 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
 
 def apply_registered_pass(
     result: Type,
-    pass_name: Union[str, StringAttr],
     target: Union[Operation, Value, OpView],
+    pass_name: Union[str, StringAttr],
     *,
     options: Optional[
         Dict[
             Union[str, StringAttr],
-            Union[Attribute, Value, Operation, OpView],
+            Union[Attribute, Value, Operation, OpView, str, int, bool],
         ]
     ] = None,
     loc=None,
diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir
index 6e6d4eb7e249..1d1be9eda349 100644
--- a/mlir/test/Dialect/Transform/test-pass-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pass-application.mlir
@@ -157,7 +157,7 @@ module attributes {transform.with_named_sequence} {
                          "test-convergence" = true,
                          "max-num-rewrites" =  %max_rewrites }
         to %1
-        : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+        : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
     transform.yield
   }
 }
@@ -171,7 +171,6 @@ func.func @invalid_options_as_str() {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param
     // expected-error @+2 {{expected '{' in options dictionary}}
     %2 = transform.apply_registered_pass "canonicalize"
         with options = "top-down=false" to %1 : (!transform.any_op) -> !transform.any_op
@@ -256,7 +255,7 @@ module attributes {transform.with_named_sequence} {
     // expected-error @+2 {{expected '{' in options dictionary}}
     transform.apply_registered_pass "canonicalize"
         with options = %pass_options to %1
-        : (!transform.any_param, !transform.any_op) -> !transform.any_op
+        : (!transform.any_op, !transform.any_param) -> !transform.any_op
     transform.yield
   }
 }
@@ -276,7 +275,7 @@ module attributes {transform.with_named_sequence} {
     // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}}
     transform.apply_registered_pass "canonicalize"
         with options = { "top-down" = %topdown_options } to %1
-        : (!transform.any_param, !transform.any_op) -> !transform.any_op
+        : (!transform.any_op, !transform.any_param) -> !transform.any_op
     transform.yield
   }
 }
@@ -316,12 +315,12 @@ module attributes {transform.with_named_sequence} {
     %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op
     %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
     // expected-error @below {{dynamic option index 1 is out of bounds for the number of dynamic options: 1}}
-    %2 = "transform.apply_registered_pass"(%1, %0) <{
+    %2 = "transform.apply_registered_pass"(%0, %1) <{
       options = {"max-iterations" = #transform.param_operand<index=1 : i64>,
                  "test-convergence" = true,
                  "top-down" = false},
       pass_name = "canonicalize"}>
-    : (!transform.any_param, !transform.any_op) -> !transform.any_op
+    : (!transform.any_op, !transform.any_param) -> !transform.any_op
     "transform.yield"() : () -> ()
   }) : () -> ()
 }) {transform.with_named_sequence} : () -> ()
@@ -340,13 +339,13 @@ module attributes {transform.with_named_sequence} {
     %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
     %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param
     // expected-error @below {{dynamic option index 0 is already used in options}}
-    %3 = "transform.apply_registered_pass"(%1, %2, %0) <{
+    %3 = "transform.apply_registered_pass"(%0, %1, %2) <{
       options = {"max-iterations" = #transform.param_operand<index=0 : i64>,
                  "max-num-rewrites" = #transform.param_operand<index=0 : i64>,
                  "test-convergence" = true,
                  "top-down" = false},
       pass_name = "canonicalize"}>
-    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
     "transform.yield"() : () -> ()
   }) : () -> ()
 }) {transform.with_named_sequence} : () -> ()
@@ -364,12 +363,12 @@ module attributes {transform.with_named_sequence} {
     %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
     %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param
     // expected-error @below {{a param operand does not have a corresponding param_operand attr in the options dict}}
-    %3 = "transform.apply_registered_pass"(%1, %2, %0) <{
+    %3 = "transform.apply_registered_pass"(%0, %1, %2) <{
       options = {"max-iterations" = #transform.param_operand<index=0 : i64>,
                  "test-convergence" = true,
                  "top-down" = false},
       pass_name = "canonicalize"}>
-    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
     "transform.yield"() : () -> ()
   }) : () -> ()
 }) {transform.with_named_sequence} : () -> ()
diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py
index 48bc9bad37a1..eeb95605d7a9 100644
--- a/mlir/test/python/dialects/transform.py
+++ b/mlir/test/python/dialects/transform.py
@@ -263,12 +263,12 @@ def testApplyRegisteredPassOp(module: Module):
     )
     with InsertionPoint(sequence.body):
         mod = transform.ApplyRegisteredPassOp(
-            transform.AnyOpType.get(), "canonicalize", sequence.bodyTarget
+            transform.AnyOpType.get(), sequence.bodyTarget, "canonicalize"
         )
         mod = transform.ApplyRegisteredPassOp(
             transform.AnyOpType.get(),
-            "canonicalize",
             mod.result,
+            "canonicalize",
             options={"top-down": BoolAttr.get(False)},
         )
         max_iter = transform.param_constant(
@@ -281,12 +281,12 @@ def testApplyRegisteredPassOp(module: Module):
         )
         transform.apply_registered_pass(
             transform.AnyOpType.get(),
-            "canonicalize",
             mod,
+            "canonicalize",
             options={
                 "top-down": BoolAttr.get(False),
                 "max-iterations": max_iter,
-                "test-convergence": BoolAttr.get(True),
+                "test-convergence": True,
                 "max-rewrites": max_rewrites,
             },
         )
@@ -305,4 +305,4 @@ def testApplyRegisteredPassOp(module: Module):
     # CHECK-SAME:                    "max-rewrites" =  %[[MAX_REWRITE]],
     # CHECK-SAME:                    "test-convergence" = true,
     # CHECK-SAME:                    "top-down" = false}
-    # CHECK-SAME:    to %{{.*}} : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    # CHECK-SAME:    to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op

From d87eea35fac5a34a841c637db8908128409a184e Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Wed, 11 Jun 2025 16:25:27 -0400
Subject: [PATCH 0055/1322] [libc] Move libc_errno.h to libc/src/__support and
 make LIBC_ERRNO_MODE_SYSTEM to be header-only. (#143187)

This is the first step in preparation for:
https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450
---
 .../modules/LLVMLibCCompileOptionRules.cmake  |   4 +
 libc/config/config.json                       |   2 +-
 libc/docs/dev/code_style.rst                  |   4 +-
 libc/shared/fp_bits.h                         |   1 +
 libc/shared/libc_common.h                     |  26 +++++
 libc/shared/rpc_server.h                      |   1 +
 libc/shared/str_to_float.h                    |   1 +
 libc/shared/str_to_integer.h                  |   1 +
 libc/src/__support/CMakeLists.txt             |   9 ++
 libc/src/__support/FPUtil/FEnvImpl.h          |   2 +-
 libc/src/__support/File/dir.cpp               |   2 +-
 libc/src/__support/File/file.cpp              |   2 +-
 libc/src/__support/File/linux/file.cpp        |   2 +-
 libc/src/__support/File/linux/lseekImpl.h     |   2 +-
 libc/src/__support/HashTable/randomness.h     |   2 +-
 libc/src/__support/OSUtil/linux/fcntl.cpp     |   2 +-
 libc/src/__support/OSUtil/linux/vdso.cpp      |   2 +-
 .../tables/linux_extension_errors.h           |   2 +-
 libc/src/__support/libc_errno.h               | 108 ++++++++++++++++++
 libc/src/__support/threads/linux/thread.cpp   |   2 +-
 libc/src/dirent/closedir.cpp                  |   2 +-
 libc/src/dirent/opendir.cpp                   |   2 +-
 libc/src/dirent/readdir.cpp                   |   2 +-
 libc/src/errno/CMakeLists.txt                 |  20 +---
 libc/src/errno/libc_errno.cpp                 |  47 +-------
 libc/src/errno/libc_errno.h                   |  47 --------
 libc/src/fcntl/linux/creat.cpp                |   2 +-
 libc/src/fcntl/linux/open.cpp                 |   2 +-
 libc/src/fcntl/linux/openat.cpp               |   2 +-
 libc/src/inttypes/strtoimax.cpp               |   2 +-
 libc/src/inttypes/strtoumax.cpp               |   2 +-
 libc/src/math/generic/exp10m1f.cpp            |   2 +-
 libc/src/math/generic/exp2m1f.cpp             |   2 +-
 libc/src/math/generic/nan.cpp                 |   2 +-
 libc/src/math/generic/nanf.cpp                |   2 +-
 libc/src/math/generic/nanf128.cpp             |   2 +-
 libc/src/math/generic/nanf16.cpp              |   2 +-
 libc/src/math/generic/nanl.cpp                |   2 +-
 libc/src/poll/linux/poll.cpp                  |   2 +-
 libc/src/pthread/pthread_atfork.cpp           |   2 +-
 .../pthread/pthread_attr_setdetachstate.cpp   |   2 +-
 .../src/pthread/pthread_attr_setguardsize.cpp |   2 +-
 libc/src/pthread/pthread_attr_setstack.cpp    |   2 +-
 .../src/pthread/pthread_attr_setstacksize.cpp |   2 +-
 .../src/pthread/pthread_condattr_setclock.cpp |   2 +-
 .../pthread/pthread_condattr_setpshared.cpp   |   2 +-
 libc/src/pthread/pthread_create.cpp           |   2 +-
 libc/src/pthread/pthread_key_create.cpp       |   2 +-
 libc/src/pthread/pthread_key_delete.cpp       |   2 +-
 .../pthread/pthread_mutexattr_setpshared.cpp  |   2 +-
 .../pthread/pthread_mutexattr_setrobust.cpp   |   2 +-
 .../src/pthread/pthread_mutexattr_settype.cpp |   2 +-
 .../pthread/pthread_rwlock_timedrdlock.cpp    |   2 +-
 libc/src/pthread/pthread_rwlock_trywrlock.cpp |   2 +-
 libc/src/pthread/pthread_rwlock_unlock.cpp    |   2 +-
 .../pthread/pthread_rwlockattr_setkind_np.cpp |   2 +-
 .../pthread/pthread_rwlockattr_setpshared.cpp |   2 +-
 libc/src/pthread/pthread_setspecific.cpp      |   2 +-
 .../sched/linux/sched_get_priority_max.cpp    |   2 +-
 .../sched/linux/sched_get_priority_min.cpp    |   2 +-
 libc/src/sched/linux/sched_getaffinity.cpp    |   2 +-
 libc/src/sched/linux/sched_getparam.cpp       |   2 +-
 libc/src/sched/linux/sched_getscheduler.cpp   |   2 +-
 .../src/sched/linux/sched_rr_get_interval.cpp |   2 +-
 libc/src/sched/linux/sched_setaffinity.cpp    |   2 +-
 libc/src/sched/linux/sched_setparam.cpp       |   2 +-
 libc/src/sched/linux/sched_setscheduler.cpp   |   2 +-
 libc/src/sched/linux/sched_yield.cpp          |   2 +-
 libc/src/search/hcreate.cpp                   |   2 +-
 libc/src/search/hcreate_r.cpp                 |   2 +-
 libc/src/search/hdestroy_r.cpp                |   2 +-
 libc/src/search/hsearch.cpp                   |   2 +-
 libc/src/search/hsearch_r.cpp                 |   2 +-
 libc/src/signal/linux/kill.cpp                |   2 +-
 libc/src/signal/linux/sigaction.cpp           |   2 +-
 libc/src/signal/linux/sigaddset.cpp           |   2 +-
 libc/src/signal/linux/sigaltstack.cpp         |   2 +-
 libc/src/signal/linux/sigdelset.cpp           |   2 +-
 libc/src/signal/linux/sigemptyset.cpp         |   2 +-
 libc/src/signal/linux/sigfillset.cpp          |   2 +-
 libc/src/signal/linux/sigprocmask.cpp         |   2 +-
 .../posix_spawn_file_actions_addclose.cpp     |   2 +-
 .../posix_spawn_file_actions_adddup2.cpp      |   2 +-
 .../posix_spawn_file_actions_addopen.cpp      |   2 +-
 .../posix_spawn_file_actions_destroy.cpp      |   2 +-
 libc/src/stdio/fopencookie.cpp                |   2 +-
 libc/src/stdio/generic/fclose.cpp             |   2 +-
 libc/src/stdio/generic/fflush.cpp             |   2 +-
 libc/src/stdio/generic/fgetc.cpp              |   2 +-
 libc/src/stdio/generic/fgetc_unlocked.cpp     |   2 +-
 libc/src/stdio/generic/fgets.cpp              |   2 +-
 libc/src/stdio/generic/fopen.cpp              |   2 +-
 libc/src/stdio/generic/fputc.cpp              |   2 +-
 libc/src/stdio/generic/fputs.cpp              |   2 +-
 libc/src/stdio/generic/fread.cpp              |   2 +-
 libc/src/stdio/generic/fread_unlocked.cpp     |   2 +-
 libc/src/stdio/generic/fseek.cpp              |   2 +-
 libc/src/stdio/generic/fseeko.cpp             |   2 +-
 libc/src/stdio/generic/ftell.cpp              |   2 +-
 libc/src/stdio/generic/ftello.cpp             |   2 +-
 libc/src/stdio/generic/fwrite.cpp             |   2 +-
 libc/src/stdio/generic/fwrite_unlocked.cpp    |   2 +-
 libc/src/stdio/generic/getc.cpp               |   2 +-
 libc/src/stdio/generic/getc_unlocked.cpp      |   2 +-
 libc/src/stdio/generic/getchar.cpp            |   2 +-
 libc/src/stdio/generic/getchar_unlocked.cpp   |   2 +-
 libc/src/stdio/generic/putc.cpp               |   2 +-
 libc/src/stdio/generic/putchar.cpp            |   2 +-
 libc/src/stdio/generic/puts.cpp               |   2 +-
 libc/src/stdio/gpu/fprintf.cpp                |   2 +-
 libc/src/stdio/gpu/printf.cpp                 |   2 +-
 libc/src/stdio/linux/fdopen.cpp               |   2 +-
 libc/src/stdio/linux/remove.cpp               |   2 +-
 libc/src/stdio/linux/rename.cpp               |   2 +-
 libc/src/stdio/printf_core/parser.h           |   2 +-
 libc/src/stdio/setbuf.cpp                     |   2 +-
 libc/src/stdio/setvbuf.cpp                    |   2 +-
 libc/src/stdlib/atof.cpp                      |   2 +-
 libc/src/stdlib/atoi.cpp                      |   2 +-
 libc/src/stdlib/atol.cpp                      |   2 +-
 libc/src/stdlib/atoll.cpp                     |   2 +-
 libc/src/stdlib/strtod.cpp                    |   2 +-
 libc/src/stdlib/strtod_l.cpp                  |   2 +-
 libc/src/stdlib/strtof.cpp                    |   2 +-
 libc/src/stdlib/strtof_l.cpp                  |   2 +-
 libc/src/stdlib/strtol.cpp                    |   2 +-
 libc/src/stdlib/strtol_l.cpp                  |   2 +-
 libc/src/stdlib/strtold.cpp                   |   2 +-
 libc/src/stdlib/strtold_l.cpp                 |   2 +-
 libc/src/stdlib/strtoll.cpp                   |   2 +-
 libc/src/stdlib/strtoll_l.cpp                 |   2 +-
 libc/src/stdlib/strtoul.cpp                   |   2 +-
 libc/src/stdlib/strtoul_l.cpp                 |   2 +-
 libc/src/stdlib/strtoull.cpp                  |   2 +-
 libc/src/stdlib/strtoull_l.cpp                |   2 +-
 libc/src/string/strdup.cpp                    |   2 +-
 libc/src/sys/auxv/linux/getauxval.cpp         |   2 +-
 libc/src/sys/epoll/linux/epoll_create.cpp     |   2 +-
 libc/src/sys/epoll/linux/epoll_create1.cpp    |   2 +-
 libc/src/sys/epoll/linux/epoll_ctl.cpp        |   2 +-
 libc/src/sys/epoll/linux/epoll_pwait.cpp      |   2 +-
 libc/src/sys/epoll/linux/epoll_pwait2.cpp     |   2 +-
 libc/src/sys/epoll/linux/epoll_wait.cpp       |   2 +-
 libc/src/sys/mman/linux/madvise.cpp           |   2 +-
 libc/src/sys/mman/linux/mincore.cpp           |   2 +-
 libc/src/sys/mman/linux/mlock.cpp             |   2 +-
 libc/src/sys/mman/linux/mlock2.cpp            |   2 +-
 libc/src/sys/mman/linux/mlockall.cpp          |   2 +-
 libc/src/sys/mman/linux/mmap.cpp              |   2 +-
 libc/src/sys/mman/linux/mprotect.cpp          |   2 +-
 libc/src/sys/mman/linux/mremap.cpp            |   2 +-
 libc/src/sys/mman/linux/msync.cpp             |   2 +-
 libc/src/sys/mman/linux/munlock.cpp           |   2 +-
 libc/src/sys/mman/linux/munlockall.cpp        |   2 +-
 libc/src/sys/mman/linux/munmap.cpp            |   4 +-
 libc/src/sys/mman/linux/remap_file_pages.cpp  |   2 +-
 libc/src/sys/mman/linux/shm_common.h          |   2 +-
 libc/src/sys/prctl/linux/prctl.cpp            |   2 +-
 libc/src/sys/random/linux/getrandom.cpp       |   2 +-
 libc/src/sys/resource/linux/getrlimit.cpp     |   2 +-
 libc/src/sys/resource/linux/setrlimit.cpp     |   2 +-
 libc/src/sys/select/linux/select.cpp          |   2 +-
 libc/src/sys/sendfile/linux/sendfile.cpp      |   2 +-
 libc/src/sys/socket/linux/bind.cpp            |   2 +-
 libc/src/sys/socket/linux/recv.cpp            |   2 +-
 libc/src/sys/socket/linux/recvfrom.cpp        |   2 +-
 libc/src/sys/socket/linux/recvmsg.cpp         |   2 +-
 libc/src/sys/socket/linux/send.cpp            |   2 +-
 libc/src/sys/socket/linux/sendmsg.cpp         |   2 +-
 libc/src/sys/socket/linux/sendto.cpp          |   2 +-
 libc/src/sys/socket/linux/socket.cpp          |   2 +-
 libc/src/sys/socket/linux/socketpair.cpp      |   2 +-
 libc/src/sys/stat/linux/chmod.cpp             |   2 +-
 libc/src/sys/stat/linux/fchmod.cpp            |   2 +-
 libc/src/sys/stat/linux/fchmodat.cpp          |   2 +-
 libc/src/sys/stat/linux/fstat.cpp             |   2 +-
 libc/src/sys/stat/linux/lstat.cpp             |   2 +-
 libc/src/sys/stat/linux/mkdir.cpp             |   2 +-
 libc/src/sys/stat/linux/mkdirat.cpp           |   2 +-
 libc/src/sys/stat/linux/stat.cpp              |   2 +-
 libc/src/sys/statvfs/linux/statfs_utils.h     |   2 +-
 libc/src/sys/time/linux/getitimer.cpp         |   2 +-
 libc/src/sys/time/linux/setitimer.cpp         |   2 +-
 libc/src/sys/time/linux/utimes.cpp            |   2 +-
 libc/src/sys/uio/linux/readv.cpp              |   2 +-
 libc/src/sys/uio/linux/writev.cpp             |   2 +-
 libc/src/sys/utsname/linux/uname.cpp          |   2 +-
 libc/src/sys/wait/wait4Impl.h                 |   2 +-
 libc/src/termios/linux/cfsetispeed.cpp        |   2 +-
 libc/src/termios/linux/cfsetospeed.cpp        |   2 +-
 libc/src/termios/linux/tcdrain.cpp            |   2 +-
 libc/src/termios/linux/tcflow.cpp             |   2 +-
 libc/src/termios/linux/tcflush.cpp            |   2 +-
 libc/src/termios/linux/tcgetattr.cpp          |   2 +-
 libc/src/termios/linux/tcgetsid.cpp           |   2 +-
 libc/src/termios/linux/tcsendbreak.cpp        |   2 +-
 libc/src/termios/linux/tcsetattr.cpp          |   2 +-
 libc/src/threads/thrd_create.cpp              |   2 +-
 libc/src/time/linux/clock.cpp                 |   2 +-
 libc/src/time/linux/clock_gettime.cpp         |   2 +-
 libc/src/time/linux/gettimeofday.cpp          |   2 +-
 libc/src/time/linux/nanosleep.cpp             |   2 +-
 libc/src/time/linux/timespec_get.cpp          |   2 +-
 libc/src/time/time.cpp                        |   2 +-
 libc/src/time/time_utils.h                    |   2 +-
 libc/src/time/windows/clock_getres.cpp        |   2 +-
 libc/src/unistd/linux/access.cpp              |   2 +-
 libc/src/unistd/linux/chdir.cpp               |   2 +-
 libc/src/unistd/linux/close.cpp               |   2 +-
 libc/src/unistd/linux/dup.cpp                 |   2 +-
 libc/src/unistd/linux/dup2.cpp                |   2 +-
 libc/src/unistd/linux/dup3.cpp                |   2 +-
 libc/src/unistd/linux/execv.cpp               |   2 +-
 libc/src/unistd/linux/execve.cpp              |   2 +-
 libc/src/unistd/linux/fchdir.cpp              |   2 +-
 libc/src/unistd/linux/fork.cpp                |   2 +-
 libc/src/unistd/linux/fsync.cpp               |   2 +-
 libc/src/unistd/linux/ftruncate.cpp           |   2 +-
 libc/src/unistd/linux/getcwd.cpp              |   2 +-
 libc/src/unistd/linux/getentropy.cpp          |   2 +-
 libc/src/unistd/linux/getsid.cpp              |   2 +-
 libc/src/unistd/linux/isatty.cpp              |   2 +-
 libc/src/unistd/linux/link.cpp                |   2 +-
 libc/src/unistd/linux/linkat.cpp              |   2 +-
 libc/src/unistd/linux/lseek.cpp               |   2 +-
 libc/src/unistd/linux/pathconf.cpp            |   2 +-
 libc/src/unistd/linux/pathconf_utils.cpp      |   2 +-
 libc/src/unistd/linux/pipe.cpp                |   4 +-
 libc/src/unistd/linux/pipe2.cpp               |   2 +-
 libc/src/unistd/linux/pread.cpp               |   6 +-
 libc/src/unistd/linux/pwrite.cpp              |   2 +-
 libc/src/unistd/linux/read.cpp                |   4 +-
 libc/src/unistd/linux/readlink.cpp            |   2 +-
 libc/src/unistd/linux/readlinkat.cpp          |   2 +-
 libc/src/unistd/linux/rmdir.cpp               |   2 +-
 libc/src/unistd/linux/symlink.cpp             |   2 +-
 libc/src/unistd/linux/symlinkat.cpp           |   2 +-
 libc/src/unistd/linux/syscall.cpp             |   2 +-
 libc/src/unistd/linux/sysconf.cpp             |   2 +-
 libc/src/unistd/linux/truncate.cpp            |   2 +-
 libc/src/unistd/linux/unlink.cpp              |   2 +-
 libc/src/unistd/linux/unlinkat.cpp            |   2 +-
 libc/src/unistd/linux/write.cpp               |   2 +-
 libc/src/unistd/windows/getentropy.cpp        |   2 +-
 libc/test/IntegrationTest/test.h              |   9 +-
 libc/test/UnitTest/ErrnoCheckingTest.h        |   4 +-
 libc/test/UnitTest/ErrnoSetterMatcher.h       |   6 +-
 libc/test/UnitTest/FPMatcher.h                |   8 +-
 libc/test/UnitTest/Test.h                     |  11 +-
 .../src/pthread/pthread_create_test.cpp       |   4 +-
 .../src/pthread/pthread_join_test.cpp         |   4 +-
 .../src/pthread/pthread_name_test.cpp         |   2 +-
 .../integration/src/unistd/getcwd_test.cpp    |   6 +-
 .../integration/startup/linux/tls_test.cpp    |   2 +-
 libc/test/src/__support/str_to_fp_test.h      |   1 +
 .../src/__support/str_to_integer_test.cpp     |   1 +
 libc/test/src/dirent/dirent_test.cpp          |  10 +-
 libc/test/src/errno/errno_test.cpp            |   4 +-
 libc/test/src/fcntl/creat_test.cpp            |   2 +-
 libc/test/src/fcntl/fcntl_test.cpp            |   4 +-
 libc/test/src/fcntl/openat_test.cpp           |   2 +-
 libc/test/src/math/RoundToIntegerTest.h       |   2 +-
 libc/test/src/math/acosf_test.cpp             |   4 +-
 libc/test/src/math/acoshf16_test.cpp          |   2 +-
 libc/test/src/math/acoshf_test.cpp            |   4 +-
 libc/test/src/math/asin_test.cpp              |   2 +-
 libc/test/src/math/asinf_test.cpp             |   4 +-
 libc/test/src/math/asinhf_test.cpp            |   4 +-
 libc/test/src/math/atan2f_test.cpp            |   2 +-
 libc/test/src/math/atan_test.cpp              |   2 +-
 libc/test/src/math/atanf_test.cpp             |   4 +-
 libc/test/src/math/atanhf_test.cpp            |   4 +-
 libc/test/src/math/cosf_test.cpp              |   4 +-
 libc/test/src/math/coshf_test.cpp             |   6 +-
 libc/test/src/math/cospif_test.cpp            |   4 +-
 libc/test/src/math/exp10_test.cpp             |   4 +-
 libc/test/src/math/exp10f_test.cpp            |  15 ++-
 libc/test/src/math/exp10m1f_test.cpp          |   8 +-
 libc/test/src/math/exp2_test.cpp              |   4 +-
 libc/test/src/math/exp2f_test.cpp             |  15 ++-
 libc/test/src/math/exp2m1f_test.cpp           |   9 +-
 libc/test/src/math/exp_test.cpp               |   4 +-
 libc/test/src/math/expf_test.cpp              |  15 ++-
 libc/test/src/math/expm1_test.cpp             |   4 +-
 libc/test/src/math/expm1f_test.cpp            |  15 ++-
 libc/test/src/math/log10_test.cpp             |   4 +-
 libc/test/src/math/log1p_test.cpp             |   4 +-
 libc/test/src/math/log1pf_test.cpp            |   4 +-
 libc/test/src/math/log2_test.cpp              |   4 +-
 libc/test/src/math/log2f_test.cpp             |   7 +-
 libc/test/src/math/log_test.cpp               |   4 +-
 libc/test/src/math/powf_test.cpp              |   2 +-
 libc/test/src/math/sin_test.cpp               |   2 +-
 libc/test/src/math/sincosf_test.cpp           |   4 +-
 libc/test/src/math/sinf_test.cpp              |   4 +-
 libc/test/src/math/sinhf_test.cpp             |   6 +-
 libc/test/src/math/sinpif_test.cpp            |   4 +-
 libc/test/src/math/smoke/FModTest.h           |   2 +-
 libc/test/src/math/smoke/RoundToIntegerTest.h |   2 +-
 libc/test/src/math/smoke/acos_test.cpp        |   4 +-
 libc/test/src/math/smoke/acosf16_test.cpp     |   4 +-
 libc/test/src/math/smoke/acosf_test.cpp       |   4 +-
 libc/test/src/math/smoke/acoshf16_test.cpp    |   4 +-
 libc/test/src/math/smoke/acoshf_test.cpp      |   4 +-
 libc/test/src/math/smoke/acospif16_test.cpp   |   4 +-
 libc/test/src/math/smoke/asinf16_test.cpp     |   4 +-
 libc/test/src/math/smoke/asinf_test.cpp       |   4 +-
 libc/test/src/math/smoke/asinhf16_test.cpp    |   4 +-
 libc/test/src/math/smoke/asinhf_test.cpp      |   4 +-
 libc/test/src/math/smoke/atan2f_test.cpp      |   4 +-
 libc/test/src/math/smoke/atanf16_test.cpp     |   4 +-
 libc/test/src/math/smoke/atanf_test.cpp       |   4 +-
 libc/test/src/math/smoke/atanhf16_test.cpp    |   4 +-
 libc/test/src/math/smoke/atanhf_test.cpp      |   4 +-
 libc/test/src/math/smoke/cosf16_test.cpp      |   4 +-
 libc/test/src/math/smoke/cosf_test.cpp        |   4 +-
 libc/test/src/math/smoke/coshf16_test.cpp     |   6 +-
 libc/test/src/math/smoke/coshf_test.cpp       |   6 +-
 libc/test/src/math/smoke/cospif16_test.cpp    |   4 +-
 libc/test/src/math/smoke/cospif_test.cpp      |   4 +-
 libc/test/src/math/smoke/exp10_test.cpp       |   2 +-
 libc/test/src/math/smoke/exp10f16_test.cpp    |   8 +-
 libc/test/src/math/smoke/exp10f_test.cpp      |   6 +-
 libc/test/src/math/smoke/exp10m1f16_test.cpp  |   8 +-
 libc/test/src/math/smoke/exp10m1f_test.cpp    |   8 +-
 libc/test/src/math/smoke/exp2_test.cpp        |   2 +-
 libc/test/src/math/smoke/exp2f16_test.cpp     |   8 +-
 libc/test/src/math/smoke/exp2f_test.cpp       |   6 +-
 libc/test/src/math/smoke/exp2m1f16_test.cpp   |   8 +-
 libc/test/src/math/smoke/exp2m1f_test.cpp     |   8 +-
 libc/test/src/math/smoke/exp_test.cpp         |   2 +-
 libc/test/src/math/smoke/expf16_test.cpp      |   8 +-
 libc/test/src/math/smoke/expf_test.cpp        |   6 +-
 libc/test/src/math/smoke/expm1_test.cpp       |   2 +-
 libc/test/src/math/smoke/expm1f16_test.cpp    |   8 +-
 libc/test/src/math/smoke/expm1f_test.cpp      |   6 +-
 libc/test/src/math/smoke/log10_test.cpp       |   2 +-
 libc/test/src/math/smoke/log10f16_test.cpp    |   4 +-
 libc/test/src/math/smoke/log1p_test.cpp       |   2 +-
 libc/test/src/math/smoke/log1pf_test.cpp      |   2 +-
 libc/test/src/math/smoke/log2_test.cpp        |   2 +-
 libc/test/src/math/smoke/log2f16_test.cpp     |   4 +-
 libc/test/src/math/smoke/log2f_test.cpp       |   2 +-
 libc/test/src/math/smoke/log_test.cpp         |   2 +-
 libc/test/src/math/smoke/logf16_test.cpp      |   4 +-
 libc/test/src/math/smoke/sincosf_test.cpp     |   4 +-
 libc/test/src/math/smoke/sinf16_test.cpp      |   4 +-
 libc/test/src/math/smoke/sinf_test.cpp        |   4 +-
 libc/test/src/math/smoke/sinhf16_test.cpp     |   6 +-
 libc/test/src/math/smoke/sinhf_test.cpp       |   6 +-
 libc/test/src/math/smoke/sinpif16_test.cpp    |   4 +-
 libc/test/src/math/smoke/sinpif_test.cpp      |   4 +-
 libc/test/src/math/smoke/tanf16_test.cpp      |   4 +-
 libc/test/src/math/smoke/tanf_test.cpp        |   4 +-
 libc/test/src/math/smoke/tanhf16_test.cpp     |   6 +-
 libc/test/src/math/smoke/tanhf_test.cpp       |   4 +-
 libc/test/src/math/smoke/tanpif16_test.cpp    |   4 +-
 libc/test/src/math/tanf_test.cpp              |   4 +-
 libc/test/src/math/tanhf_test.cpp             |   4 +-
 libc/test/src/poll/poll_test.cpp              |   6 +-
 libc/test/src/sched/affinity_test.cpp         |  10 +-
 libc/test/src/sched/cpu_count_test.cpp        |   4 +-
 libc/test/src/sched/get_priority_test.cpp     |   4 +-
 .../src/sched/param_and_scheduler_test.cpp    |  49 ++++----
 .../src/sched/sched_rr_get_interval_test.cpp  |  10 +-
 libc/test/src/sched/yield_test.cpp            |   4 +-
 libc/test/src/signal/sigaltstack_test.cpp     |   4 +-
 libc/test/src/signal/signal_test.cpp          |   4 +-
 libc/test/src/signal/sigprocmask_test.cpp     |   4 +-
 .../spawn/posix_spawn_file_actions_test.cpp   |   2 +-
 libc/test/src/stdio/fdopen_test.cpp           |  10 +-
 libc/test/src/stdio/fgetc_test.cpp            |   4 +-
 libc/test/src/stdio/fgetc_unlocked_test.cpp   |   4 +-
 libc/test/src/stdio/fgets_test.cpp            |   4 +-
 libc/test/src/stdio/fileop_test.cpp           |  24 ++--
 libc/test/src/stdio/fopencookie_test.cpp      |  10 +-
 libc/test/src/stdio/remove_test.cpp           |   6 +-
 libc/test/src/stdio/rename_test.cpp           |   4 +-
 libc/test/src/stdio/setvbuf_test.cpp          |   4 +-
 libc/test/src/stdio/sprintf_test.cpp          |  76 ++++++------
 libc/test/src/stdio/unlocked_fileop_test.cpp  |   6 +-
 libc/test/src/stdlib/StrtolTest.h             |   1 +
 libc/test/src/stdlib/strtoint32_test.cpp      |   6 +-
 libc/test/src/stdlib/strtoint64_test.cpp      |   6 +-
 libc/test/src/stdlib/strtold_test.cpp         |   1 +
 libc/test/src/sys/mman/linux/mlock_test.cpp   |  17 ++-
 .../src/sys/statvfs/linux/fstatvfs_test.cpp   |   4 +-
 .../src/sys/statvfs/linux/statvfs_test.cpp    |   4 +-
 libc/test/src/sys/time/setitimer_test.cpp     |   2 +-
 libc/test/src/termios/termios_test.cpp        |  12 +-
 libc/test/src/time/asctime_r_test.cpp         |   2 +-
 libc/test/src/time/asctime_test.cpp           |   2 +-
 libc/test/src/time/ctime_r_test.cpp           |   2 +-
 libc/test/src/time/ctime_test.cpp             |   2 +-
 libc/test/src/time/gmtime_test.cpp            |   4 +-
 libc/test/src/time/nanosleep_test.cpp         |   4 +-
 .../llvm-project-overlay/libc/BUILD.bazel     |   3 +-
 397 files changed, 829 insertions(+), 783 deletions(-)
 create mode 100644 libc/shared/libc_common.h
 create mode 100644 libc/src/__support/libc_errno.h
 delete mode 100644 libc/src/errno/libc_errno.h

diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 0facb0b9be0c..a98e7276bef8 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -106,6 +106,10 @@ function(_get_compile_options_from_config output_var)
     list(APPEND config_options "-DLIBC_MATH=${LIBC_CONF_MATH_OPTIMIZATIONS}")
   endif()
 
+  if(LIBC_CONF_ERRNO_MODE)
+    set(APPEND config_options "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}")
+  endif()
+
   set(${output_var} ${config_options} PARENT_SCOPE)
 endfunction(_get_compile_options_from_config)
 
diff --git a/libc/config/config.json b/libc/config/config.json
index bfe956855cb5..d53b2936edb0 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -2,7 +2,7 @@
   "errno": {
     "LIBC_CONF_ERRNO_MODE": {
       "value": "LIBC_ERRNO_MODE_DEFAULT",
-      "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM."
+      "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE."
     }
   },
   "printf": {
diff --git a/libc/docs/dev/code_style.rst b/libc/docs/dev/code_style.rst
index 0bd3a69ae3ff..86247966552f 100644
--- a/libc/docs/dev/code_style.rst
+++ b/libc/docs/dev/code_style.rst
@@ -101,7 +101,7 @@ test infrastructure itself can be affected. To avoid perturbing the unit test
 infrastructure around the setting of ``errno``, the following rules are to be
 followed:
 
-#. A special macro named ``libc_errno`` defined in ``src/errno/libc_errno.h``
+#. A special macro named ``libc_errno`` defined in ``src/__support/libc_errno.h``
    should be used when setting ``errno`` from libc runtime code. For example,
    code to set ``errno`` to ``EINVAL`` should be:
 
@@ -117,7 +117,7 @@ followed:
    `ErrorOr <https://github.com/llvm/llvm-project/blob/main/libc/src/__support/error_or.h>`_
    to return error values.
 
-#. The header file ``src/errno/libc_errno.h`` is shipped as part of the target
+#. The header file ``src/__support/libc_errno.h`` is shipped as part of the target
    corresponding to the ``errno`` entrypoint ``libc.src.errno.errno``. We do
    not in general allow dependencies between entrypoints. However, the ``errno``
    entrypoint is the only exceptional entrypoint on which other entrypoints
diff --git a/libc/shared/fp_bits.h b/libc/shared/fp_bits.h
index 2898c508b777..e6bb1e17b80c 100644
--- a/libc/shared/fp_bits.h
+++ b/libc/shared/fp_bits.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SHARED_FP_BITS_H
 #define LLVM_LIBC_SHARED_FP_BITS_H
 
+#include "libc_common.h"
 #include "src/__support/FPUtil/FPBits.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/shared/libc_common.h b/libc/shared/libc_common.h
new file mode 100644
index 000000000000..c4560bbb0276
--- /dev/null
+++ b/libc/shared/libc_common.h
@@ -0,0 +1,26 @@
+//===-- Common defines for sharing LLVM libc with LLVM projects -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_LIBC_COMMON_H
+#define LLVM_LIBC_SHARED_LIBC_COMMON_H
+
+// Use system errno.
+#ifdef LIBC_ERRNO_MODE
+#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+#error                                                                         \
+    "LIBC_ERRNO_MODE was set to something different from LIBC_ERRNO_MODE_SYSTEM_INLINE."
+#endif // LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+#else
+#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM_INLINE
+#endif // LIBC_ERRNO_MODE
+
+#ifndef LIBC_NAMESPACE
+#define LIBC_NAMESPACE __llvm_libc
+#endif // LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SHARED_LIBC_COMMON_H
diff --git a/libc/shared/rpc_server.h b/libc/shared/rpc_server.h
index 5509094b944a..46e35f13f0ea 100644
--- a/libc/shared/rpc_server.h
+++ b/libc/shared/rpc_server.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SHARED_RPC_SERVER_H
 #define LLVM_LIBC_SHARED_RPC_SERVER_H
 
+#include "libc_common.h"
 #include "src/__support/RPC/rpc_server.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/shared/str_to_float.h b/libc/shared/str_to_float.h
index b133a28e26ef..dcc6027d6c77 100644
--- a/libc/shared/str_to_float.h
+++ b/libc/shared/str_to_float.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SHARED_STR_TO_FLOAT_H
 #define LLVM_LIBC_SHARED_STR_TO_FLOAT_H
 
+#include "libc_common.h"
 #include "src/__support/str_to_float.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/shared/str_to_integer.h b/libc/shared/str_to_integer.h
index 15bee698d5a6..6ed38c932662 100644
--- a/libc/shared/str_to_integer.h
+++ b/libc/shared/str_to_integer.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SHARED_STR_TO_INTEGER_H
 #define LLVM_LIBC_SHARED_STR_TO_INTEGER_H
 
+#include "libc_common.h"
 #include "src/__support/str_to_integer.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index f92499fdbf45..327ff5e0c6a3 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -1,6 +1,15 @@
 add_subdirectory(CPP)
 add_subdirectory(macros)
 
+add_header_library(
+  libc_errno
+  HDRS
+    libc_errno.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.src.__support.macros.config
+)
+
 add_header_library(
   block
   HDRS
diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index 4c8f34a435bd..50a101f833c5 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -12,10 +12,10 @@
 #include "hdr/fenv_macros.h"
 #include "hdr/math_macros.h"
 #include "hdr/types/fenv_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
-#include "src/errno/libc_errno.h"
 
 #if defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP)
 #if defined(__APPLE__)
diff --git a/libc/src/__support/File/dir.cpp b/libc/src/__support/File/dir.cpp
index 21b0106f7010..aea8862c15f7 100644
--- a/libc/src/__support/File/dir.cpp
+++ b/libc/src/__support/File/dir.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/CPP/mutex.h" // lock_guard
 #include "src/__support/CPP/new.h"
 #include "src/__support/error_or.h"
+#include "src/__support/libc_errno.h" // For error macros
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h" // For error macros
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp
index 528542cccf32..303852dbbb71 100644
--- a/libc/src/__support/File/file.cpp
+++ b/libc/src/__support/File/file.cpp
@@ -13,8 +13,8 @@
 #include "hdr/types/off_t.h"
 #include "src/__support/CPP/new.h"
 #include "src/__support/CPP/span.h"
+#include "src/__support/libc_errno.h" // For error macros
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h" // For error macros
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/__support/File/linux/file.cpp b/libc/src/__support/File/linux/file.cpp
index 824c1f200e8c..761e352f74ea 100644
--- a/libc/src/__support/File/linux/file.cpp
+++ b/libc/src/__support/File/linux/file.cpp
@@ -15,8 +15,8 @@
 #include "src/__support/File/linux/lseekImpl.h"
 #include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/libc_errno.h"     // For error macros
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h" // For error macros
 
 #include "hdr/fcntl_macros.h" // For mode_t and other flags to the open syscall
 #include <sys/stat.h>    // For S_IS*, S_IF*, and S_IR* flags.
diff --git a/libc/src/__support/File/linux/lseekImpl.h b/libc/src/__support/File/linux/lseekImpl.h
index a034913d9f6e..300e5c5dd55b 100644
--- a/libc/src/__support/File/linux/lseekImpl.h
+++ b/libc/src/__support/File/linux/lseekImpl.h
@@ -13,8 +13,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <stdint.h>      // For uint64_t.
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/__support/HashTable/randomness.h b/libc/src/__support/HashTable/randomness.h
index 244dd41be3ee..6b58a4125f78 100644
--- a/libc/src/__support/HashTable/randomness.h
+++ b/libc/src/__support/HashTable/randomness.h
@@ -14,7 +14,7 @@
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #if defined(LIBC_HASHTABLE_USE_GETRANDOM)
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sys/random/getrandom.h"
 #endif
 
diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp
index 4742b2a00220..99e16ad58c91 100644
--- a/libc/src/__support/OSUtil/linux/fcntl.cpp
+++ b/libc/src/__support/OSUtil/linux/fcntl.cpp
@@ -15,8 +15,8 @@
 #include "hdr/types/struct_flock64.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/__support/OSUtil/linux/vdso.cpp b/libc/src/__support/OSUtil/linux/vdso.cpp
index 8c9bd3e1bcc7..e4e53c3c2a0f 100644
--- a/libc/src/__support/OSUtil/linux/vdso.cpp
+++ b/libc/src/__support/OSUtil/linux/vdso.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/threads/callonce.h"
 #include "src/__support/threads/linux/futex_word.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/auxv/getauxval.h"
 #include <linux/auxvec.h>
 
diff --git a/libc/src/__support/StringUtil/tables/linux_extension_errors.h b/libc/src/__support/StringUtil/tables/linux_extension_errors.h
index 425590f6e91c..de637d60bea9 100644
--- a/libc/src/__support/StringUtil/tables/linux_extension_errors.h
+++ b/libc/src/__support/StringUtil/tables/linux_extension_errors.h
@@ -10,8 +10,8 @@
 #define LLVM_LIBC_SRC___SUPPORT_STRINGUTIL_TABLES_LINUX_EXTENSION_ERRORS_H
 
 #include "src/__support/StringUtil/message_mapper.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/__support/libc_errno.h b/libc/src/__support/libc_errno.h
new file mode 100644
index 000000000000..ab5f6a9c4b9d
--- /dev/null
+++ b/libc/src/__support/libc_errno.h
@@ -0,0 +1,108 @@
+//===-- Implementation header for libc_errno --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H
+#define LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H
+
+// This header is to be consumed by internal implementations, in which all of
+// them should refer to `libc_errno` instead of using `errno` directly from
+// <errno.h> header.
+
+// Unit and hermetic tests should:
+// - #include "src/__support/libc_errno.h"
+// - NOT #include <errno.h>
+// - Only use `libc_errno` in the code
+// - Depend on libc.src.errno.errno
+
+// Integration tests should:
+// - NOT #include "src/__support/libc_errno.h"
+// - #include <errno.h>
+// - Use regular `errno` in the code
+// - Still depend on libc.src.errno.errno
+
+// libc uses a fallback default value, either system or thread local.
+#define LIBC_ERRNO_MODE_DEFAULT 0
+// libc never stores a value; `errno` macro uses get link-time failure.
+#define LIBC_ERRNO_MODE_UNDEFINED 1
+// libc maintains per-thread state (requires C++ `thread_local` support).
+#define LIBC_ERRNO_MODE_THREAD_LOCAL 2
+// libc maintains shared state used by all threads, contrary to standard C
+// semantics unless always single-threaded; nothing prevents data races.
+#define LIBC_ERRNO_MODE_SHARED 3
+// libc doesn't maintain any internal state, instead the embedder must define
+// `int *__llvm_libc_errno(void);` C function.
+#define LIBC_ERRNO_MODE_EXTERNAL 4
+// libc uses system `<errno.h>` `errno` macro directly in the overlay mode; in
+// fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`.
+// In this mode, the public C++ symbol `LIBC_NAMESPACE::libc_errno ` is still
+// exported and get redirected to the system `errno` inside its implementation.
+
+// TODO: Investigate deprecating LIBC_ERRNO_MODE_SYSTEM in favor of
+//       LIBC_ERRNO_MODE_SYSTEM_INLINE.
+//       https://github.com/llvm/llvm-project/issues/143454
+#define LIBC_ERRNO_MODE_SYSTEM 5
+// In this mode, the libc_errno is simply a macro resolved to `errno` from the
+// system header <errno.h>.  There is no need to link against the
+// `libc.src.errno.errno` object.
+#define LIBC_ERRNO_MODE_SYSTEM_INLINE 6
+
+#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT
+#undef LIBC_ERRNO_MODE
+#if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING)
+#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL
+#else
+#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM
+#endif
+#endif // LIBC_ERRNO_MODE
+
+#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT &&                              \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED &&                            \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL &&                         \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED &&                               \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL &&                             \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM &&                               \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+#error LIBC_ERRNO_MODE must be one of the following values: \
+LIBC_ERRNO_MODE_DEFAULT, \
+LIBC_ERRNO_MODE_UNDEFINED, \
+LIBC_ERRNO_MODE_THREAD_LOCAL, \
+LIBC_ERRNO_MODE_SHARED, \
+LIBC_ERRNO_MODE_EXTERNAL, \
+LIBC_ERRNO_MODE_SYSTEM, \
+LIBC_ERRNO_MODE_SYSTEM_INLINE.
+#endif
+
+#if LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_SYSTEM_INLINE
+
+#include <errno.h>
+
+#define libc_errno errno
+
+#else // !LIBC_ERRNO_MODE_SYSTEM_INLINE
+
+#include "hdr/errno_macros.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+extern "C" int *__llvm_libc_errno() noexcept;
+
+struct Errno {
+  void operator=(int);
+  operator int();
+};
+
+extern Errno libc_errno;
+
+} // namespace LIBC_NAMESPACE_DECL
+
+using LIBC_NAMESPACE::libc_errno;
+
+#endif // LIBC_ERRNO_MODE_SYSTEM_INLINE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H
diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp
index c531d74c5335..baad26aed685 100644
--- a/libc/src/__support/threads/linux/thread.cpp
+++ b/libc/src/__support/threads/linux/thread.cpp
@@ -14,9 +14,9 @@
 #include "src/__support/OSUtil/syscall.h" // For syscall functions.
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
+#include "src/__support/libc_errno.h" // For error macros
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/linux/futex_utils.h" // For FutexWordType
-#include "src/errno/libc_errno.h"                    // For error macros
 
 #ifdef LIBC_TARGET_ARCH_IS_AARCH64
 #include <arm_acle.h>
diff --git a/libc/src/dirent/closedir.cpp b/libc/src/dirent/closedir.cpp
index 1249ef94cf41..2f8f6f0c044d 100644
--- a/libc/src/dirent/closedir.cpp
+++ b/libc/src/dirent/closedir.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/File/dir.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <dirent.h>
 
diff --git a/libc/src/dirent/opendir.cpp b/libc/src/dirent/opendir.cpp
index fee14ef0f558..bf47d0edac18 100644
--- a/libc/src/dirent/opendir.cpp
+++ b/libc/src/dirent/opendir.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/File/dir.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <dirent.h>
 
diff --git a/libc/src/dirent/readdir.cpp b/libc/src/dirent/readdir.cpp
index ad460b5e80b8..f95f7c1ae864 100644
--- a/libc/src/dirent/readdir.cpp
+++ b/libc/src/dirent/readdir.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/File/dir.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <dirent.h>
 
diff --git a/libc/src/errno/CMakeLists.txt b/libc/src/errno/CMakeLists.txt
index 1d78a5eedff9..2852044e9416 100644
--- a/libc/src/errno/CMakeLists.txt
+++ b/libc/src/errno/CMakeLists.txt
@@ -1,28 +1,16 @@
 # If we are in full build mode, we will provide the errno definition ourselves,
 # and if we are in overlay mode, we will just re-use the system's errno.
-# We are passing LIBC_FULL_BUILD flag in full build mode so that the
-# implementation of libc_errno will know if we are in full build mode or not.
-
-# TODO: Move LIBC_FULL_BUILD flag to _get_common_compile_options.
-set(full_build_flag "")
-if(LLVM_LIBC_FULL_BUILD)
-  set(full_build_flag "-DLIBC_FULL_BUILD")
-endif()
-
-if(LIBC_CONF_ERRNO_MODE)
-  set(errno_config_copts "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}")
-endif()
 
 add_entrypoint_object(
   errno
   SRCS
     libc_errno.cpp
   HDRS
-    libc_errno.h     # Include this
-  COMPILE_OPTIONS
-    ${full_build_flag}
-    ${errno_config_copts}
+    ../__support/libc_errno.h
   DEPENDS
     libc.hdr.errno_macros
     libc.src.__support.common
+    libc.src.__support.libc_errno
+    libc.src.__support.macros.attributes
+    libc.src.__support.macros.config
 )
diff --git a/libc/src/errno/libc_errno.cpp b/libc/src/errno/libc_errno.cpp
index d1600d1b050e..8ff1eec1b103 100644
--- a/libc/src/errno/libc_errno.cpp
+++ b/libc/src/errno/libc_errno.cpp
@@ -6,51 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "libc_errno.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 
-// libc uses a fallback default value, either system or thread local.
-#define LIBC_ERRNO_MODE_DEFAULT 0
-// libc never stores a value; `errno` macro uses get link-time failure.
-#define LIBC_ERRNO_MODE_UNDEFINED 1
-// libc maintains per-thread state (requires C++ `thread_local` support).
-#define LIBC_ERRNO_MODE_THREAD_LOCAL 2
-// libc maintains shared state used by all threads, contrary to standard C
-// semantics unless always single-threaded; nothing prevents data races.
-#define LIBC_ERRNO_MODE_SHARED 3
-// libc doesn't maintain any internal state, instead the embedder must define
-// `int *__llvm_libc_errno(void);` C function.
-#define LIBC_ERRNO_MODE_EXTERNAL 4
-// libc uses system `<errno.h>` `errno` macro directly in the overlay mode; in
-// fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`.
-#define LIBC_ERRNO_MODE_SYSTEM 5
-
-#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT
-#undef LIBC_ERRNO_MODE
-#if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING)
-#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL
-#else
-#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM
-#endif
-#endif // LIBC_ERRNO_MODE
-
-#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT &&                              \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED &&                            \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL &&                         \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED &&                               \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL &&                             \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM
-#error LIBC_ERRNO_MODE must be one of the following values: \
-LIBC_ERRNO_MODE_DEFAULT, \
-LIBC_ERRNO_MODE_UNDEFINED, \
-LIBC_ERRNO_MODE_THREAD_LOCAL, \
-LIBC_ERRNO_MODE_SHARED, \
-LIBC_ERRNO_MODE_EXTERNAL, \
-LIBC_ERRNO_MODE_SYSTEM
-#endif
-
 namespace LIBC_NAMESPACE_DECL {
 
+#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+
 #if LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_UNDEFINED
 
 void Errno::operator=(int) {}
@@ -93,4 +56,6 @@ Errno::operator int() { return errno; }
 // Define the global `libc_errno` instance.
 Errno libc_errno;
 
+#endif // LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/errno/libc_errno.h b/libc/src/errno/libc_errno.h
deleted file mode 100644
index 44ee2714843b..000000000000
--- a/libc/src/errno/libc_errno.h
+++ /dev/null
@@ -1,47 +0,0 @@
-//===-- Implementation header for libc_errno --------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H
-#define LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H
-
-#include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
-
-#include "hdr/errno_macros.h"
-
-// This header is to be consumed by internal implementations, in which all of
-// them should refer to `libc_errno` instead of using `errno` directly from
-// <errno.h> header.
-
-// Unit and hermetic tests should:
-// - #include "src/errno/libc_errno.h"
-// - NOT #include <errno.h>
-// - Only use `libc_errno` in the code
-// - Depend on libc.src.errno.errno
-
-// Integration tests should:
-// - NOT #include "src/errno/libc_errno.h"
-// - #include <errno.h>
-// - Use regular `errno` in the code
-// - Still depend on libc.src.errno.errno
-
-namespace LIBC_NAMESPACE_DECL {
-
-extern "C" int *__llvm_libc_errno() noexcept;
-
-struct Errno {
-  void operator=(int);
-  operator int();
-};
-
-extern Errno libc_errno;
-
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H
diff --git a/libc/src/fcntl/linux/creat.cpp b/libc/src/fcntl/linux/creat.cpp
index 23abae243aed..71412a8e68c5 100644
--- a/libc/src/fcntl/linux/creat.cpp
+++ b/libc/src/fcntl/linux/creat.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/fcntl/linux/open.cpp b/libc/src/fcntl/linux/open.cpp
index 8b699ecdd204..a21a03788dea 100644
--- a/libc/src/fcntl/linux/open.cpp
+++ b/libc/src/fcntl/linux/open.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include "hdr/types/mode_t.h"
diff --git a/libc/src/fcntl/linux/openat.cpp b/libc/src/fcntl/linux/openat.cpp
index 6063d9c00ad6..b47ad1fb3bb0 100644
--- a/libc/src/fcntl/linux/openat.cpp
+++ b/libc/src/fcntl/linux/openat.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/types/mode_t.h"
 #include <stdarg.h>
diff --git a/libc/src/inttypes/strtoimax.cpp b/libc/src/inttypes/strtoimax.cpp
index 85f197c75d90..6e55a4b56aac 100644
--- a/libc/src/inttypes/strtoimax.cpp
+++ b/libc/src/inttypes/strtoimax.cpp
@@ -8,9 +8,9 @@
 
 #include "src/inttypes/strtoimax.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/inttypes/strtoumax.cpp b/libc/src/inttypes/strtoumax.cpp
index 2e9cbc9acba7..ce5a0a782d97 100644
--- a/libc/src/inttypes/strtoumax.cpp
+++ b/libc/src/inttypes/strtoumax.cpp
@@ -8,9 +8,9 @@
 
 #include "src/inttypes/strtoumax.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/exp10m1f.cpp b/libc/src/math/generic/exp10m1f.cpp
index e973b2921c2e..27729104e038 100644
--- a/libc/src/math/generic/exp10m1f.cpp
+++ b/libc/src/math/generic/exp10m1f.cpp
@@ -14,9 +14,9 @@
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
-#include "src/errno/libc_errno.h"
 
 #include "explogxf.h"
 
diff --git a/libc/src/math/generic/exp2m1f.cpp b/libc/src/math/generic/exp2m1f.cpp
index 4913a5e4277e..127c6eaa494d 100644
--- a/libc/src/math/generic/exp2m1f.cpp
+++ b/libc/src/math/generic/exp2m1f.cpp
@@ -14,10 +14,10 @@
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/macros/properties/cpu_features.h"
-#include "src/errno/libc_errno.h"
 
 #include "explogxf.h"
 
diff --git a/libc/src/math/generic/nan.cpp b/libc/src/math/generic/nan.cpp
index f92cd3ff5eb5..829a2ea435ac 100644
--- a/libc/src/math/generic/nan.cpp
+++ b/libc/src/math/generic/nan.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nan.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/nanf.cpp b/libc/src/math/generic/nanf.cpp
index 7287182406ac..1cb66160e736 100644
--- a/libc/src/math/generic/nanf.cpp
+++ b/libc/src/math/generic/nanf.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nanf.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/nanf128.cpp b/libc/src/math/generic/nanf128.cpp
index 3d8581afa037..4155c5333a9c 100644
--- a/libc/src/math/generic/nanf128.cpp
+++ b/libc/src/math/generic/nanf128.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nanf128.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/nanf16.cpp b/libc/src/math/generic/nanf16.cpp
index 27d9d165f4a8..7b166400601b 100644
--- a/libc/src/math/generic/nanf16.cpp
+++ b/libc/src/math/generic/nanf16.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nanf16.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/nanl.cpp b/libc/src/math/generic/nanl.cpp
index 4f698cb3c88d..58d638c4b531 100644
--- a/libc/src/math/generic/nanl.cpp
+++ b/libc/src/math/generic/nanl.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nanl.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/poll/linux/poll.cpp b/libc/src/poll/linux/poll.cpp
index f82fcbcc6577..4cac75b9687c 100644
--- a/libc/src/poll/linux/poll.cpp
+++ b/libc/src/poll/linux/poll.cpp
@@ -13,8 +13,8 @@
 #include "hdr/types/struct_timespec.h"
 #include "src/__support/OSUtil/syscall.h" // syscall_impl
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // SYS_poll, SYS_ppoll
 
diff --git a/libc/src/pthread/pthread_atfork.cpp b/libc/src/pthread/pthread_atfork.cpp
index b2c67c78e5d9..4cad16a02de7 100644
--- a/libc/src/pthread/pthread_atfork.cpp
+++ b/libc/src/pthread/pthread_atfork.cpp
@@ -9,9 +9,9 @@
 #include "pthread_atfork.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/fork_callbacks.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // For pthread_* type definitions.
 
diff --git a/libc/src/pthread/pthread_attr_setdetachstate.cpp b/libc/src/pthread/pthread_attr_setdetachstate.cpp
index 872f694e01f3..c482d25610c2 100644
--- a/libc/src/pthread/pthread_attr_setdetachstate.cpp
+++ b/libc/src/pthread/pthread_attr_setdetachstate.cpp
@@ -9,8 +9,8 @@
 #include "pthread_attr_setdetachstate.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_attr_setguardsize.cpp b/libc/src/pthread/pthread_attr_setguardsize.cpp
index fa4375e915ab..c996210a61d8 100644
--- a/libc/src/pthread/pthread_attr_setguardsize.cpp
+++ b/libc/src/pthread/pthread_attr_setguardsize.cpp
@@ -9,8 +9,8 @@
 #include "pthread_attr_setguardsize.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <linux/param.h> // For EXEC_PAGESIZE.
 #include <pthread.h>
diff --git a/libc/src/pthread/pthread_attr_setstack.cpp b/libc/src/pthread/pthread_attr_setstack.cpp
index 1154055a63a7..767f959b1400 100644
--- a/libc/src/pthread/pthread_attr_setstack.cpp
+++ b/libc/src/pthread/pthread_attr_setstack.cpp
@@ -10,9 +10,9 @@
 #include "pthread_attr_setstacksize.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h" // For STACK_ALIGNMENT
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 #include <stdint.h>
diff --git a/libc/src/pthread/pthread_attr_setstacksize.cpp b/libc/src/pthread/pthread_attr_setstacksize.cpp
index 0a5d1af661ab..38c77ca761d6 100644
--- a/libc/src/pthread/pthread_attr_setstacksize.cpp
+++ b/libc/src/pthread/pthread_attr_setstacksize.cpp
@@ -9,8 +9,8 @@
 #include "pthread_attr_setstacksize.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_condattr_setclock.cpp b/libc/src/pthread/pthread_condattr_setclock.cpp
index 5e825d5ecea6..2f63d5e9d194 100644
--- a/libc/src/pthread/pthread_condattr_setclock.cpp
+++ b/libc/src/pthread/pthread_condattr_setclock.cpp
@@ -9,8 +9,8 @@
 #include "pthread_condattr_setclock.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/time_macros.h" // CLOCK_MONOTONIC, CLOCK_REALTIME
 #include <pthread.h>         // pthread_condattr_t
diff --git a/libc/src/pthread/pthread_condattr_setpshared.cpp b/libc/src/pthread/pthread_condattr_setpshared.cpp
index 433b2dc1d2d9..9c117499a559 100644
--- a/libc/src/pthread/pthread_condattr_setpshared.cpp
+++ b/libc/src/pthread/pthread_condattr_setpshared.cpp
@@ -9,8 +9,8 @@
 #include "pthread_condattr_setpshared.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // pthread_condattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE
 
diff --git a/libc/src/pthread/pthread_create.cpp b/libc/src/pthread/pthread_create.cpp
index e1b1f3b325d1..45be2807fa83 100644
--- a/libc/src/pthread/pthread_create.cpp
+++ b/libc/src/pthread/pthread_create.cpp
@@ -16,10 +16,10 @@
 #include "pthread_attr_getstack.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // For pthread_* type definitions.
 
diff --git a/libc/src/pthread/pthread_key_create.cpp b/libc/src/pthread/pthread_key_create.cpp
index 383762f273e7..7253de14cc0d 100644
--- a/libc/src/pthread/pthread_key_create.cpp
+++ b/libc/src/pthread/pthread_key_create.cpp
@@ -9,9 +9,9 @@
 #include "pthread_key_create.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_key_delete.cpp b/libc/src/pthread/pthread_key_delete.cpp
index b54db821ab05..2b14d874fe31 100644
--- a/libc/src/pthread/pthread_key_delete.cpp
+++ b/libc/src/pthread/pthread_key_delete.cpp
@@ -9,9 +9,9 @@
 #include "pthread_key_delete.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_mutexattr_setpshared.cpp b/libc/src/pthread/pthread_mutexattr_setpshared.cpp
index deeae15be230..a87a08259c4b 100644
--- a/libc/src/pthread/pthread_mutexattr_setpshared.cpp
+++ b/libc/src/pthread/pthread_mutexattr_setpshared.cpp
@@ -10,8 +10,8 @@
 #include "pthread_mutexattr.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_mutexattr_setrobust.cpp b/libc/src/pthread/pthread_mutexattr_setrobust.cpp
index 9fd46f4c928d..fd7a8d7ce1d1 100644
--- a/libc/src/pthread/pthread_mutexattr_setrobust.cpp
+++ b/libc/src/pthread/pthread_mutexattr_setrobust.cpp
@@ -10,8 +10,8 @@
 #include "pthread_mutexattr.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_mutexattr_settype.cpp b/libc/src/pthread/pthread_mutexattr_settype.cpp
index c7e78271f9c3..5a65f031045d 100644
--- a/libc/src/pthread/pthread_mutexattr_settype.cpp
+++ b/libc/src/pthread/pthread_mutexattr_settype.cpp
@@ -10,8 +10,8 @@
 #include "pthread_mutexattr.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_rwlock_timedrdlock.cpp b/libc/src/pthread/pthread_rwlock_timedrdlock.cpp
index 112ff5c9cdad..fcddfed22490 100644
--- a/libc/src/pthread/pthread_rwlock_timedrdlock.cpp
+++ b/libc/src/pthread/pthread_rwlock_timedrdlock.cpp
@@ -9,11 +9,11 @@
 #include "src/pthread/pthread_rwlock_timedrdlock.h"
 #include "src/__support/common.h"
 #include "src/__support/libc_assert.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/threads/linux/rwlock.h"
 #include "src/__support/time/linux/abs_timeout.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_rwlock_trywrlock.cpp b/libc/src/pthread/pthread_rwlock_trywrlock.cpp
index a63dc893e716..660c15a87b36 100644
--- a/libc/src/pthread/pthread_rwlock_trywrlock.cpp
+++ b/libc/src/pthread/pthread_rwlock_trywrlock.cpp
@@ -9,9 +9,9 @@
 #include "src/pthread/pthread_rwlock_trywrlock.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/linux/rwlock.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_rwlock_unlock.cpp b/libc/src/pthread/pthread_rwlock_unlock.cpp
index e61290179bd6..5496bea929c5 100644
--- a/libc/src/pthread/pthread_rwlock_unlock.cpp
+++ b/libc/src/pthread/pthread_rwlock_unlock.cpp
@@ -9,9 +9,9 @@
 #include "src/pthread/pthread_rwlock_unlock.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/linux/rwlock.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp b/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp
index 80d34a35c717..e6800311b858 100644
--- a/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp
+++ b/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp
@@ -9,8 +9,8 @@
 #include "pthread_rwlockattr_setkind_np.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // pthread_rwlockattr_t
 
diff --git a/libc/src/pthread/pthread_rwlockattr_setpshared.cpp b/libc/src/pthread/pthread_rwlockattr_setpshared.cpp
index 5a7191aefd3d..4fbd095ac2b4 100644
--- a/libc/src/pthread/pthread_rwlockattr_setpshared.cpp
+++ b/libc/src/pthread/pthread_rwlockattr_setpshared.cpp
@@ -9,8 +9,8 @@
 #include "pthread_rwlockattr_setpshared.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // pthread_rwlockattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE
 
diff --git a/libc/src/pthread/pthread_setspecific.cpp b/libc/src/pthread/pthread_setspecific.cpp
index 70c29c167084..b147a66d2fad 100644
--- a/libc/src/pthread/pthread_setspecific.cpp
+++ b/libc/src/pthread/pthread_setspecific.cpp
@@ -9,9 +9,9 @@
 #include "pthread_setspecific.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/sched/linux/sched_get_priority_max.cpp b/libc/src/sched/linux/sched_get_priority_max.cpp
index 77a82c77405f..fb30b1e319e7 100644
--- a/libc/src/sched/linux/sched_get_priority_max.cpp
+++ b/libc/src/sched/linux/sched_get_priority_max.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_get_priority_min.cpp b/libc/src/sched/linux/sched_get_priority_min.cpp
index fca66a15edb5..54f67e915fc1 100644
--- a/libc/src/sched/linux/sched_get_priority_min.cpp
+++ b/libc/src/sched/linux/sched_get_priority_min.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_getaffinity.cpp b/libc/src/sched/linux/sched_getaffinity.cpp
index 7b1fd8c5aa2a..e005819e2a97 100644
--- a/libc/src/sched/linux/sched_getaffinity.cpp
+++ b/libc/src/sched/linux/sched_getaffinity.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sched.h>
 #include <stdint.h>
diff --git a/libc/src/sched/linux/sched_getparam.cpp b/libc/src/sched/linux/sched_getparam.cpp
index 75756a65f0ed..b0576c3ac65b 100644
--- a/libc/src/sched/linux/sched_getparam.cpp
+++ b/libc/src/sched/linux/sched_getparam.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_getscheduler.cpp b/libc/src/sched/linux/sched_getscheduler.cpp
index 545cda8e7484..d8e02967a633 100644
--- a/libc/src/sched/linux/sched_getscheduler.cpp
+++ b/libc/src/sched/linux/sched_getscheduler.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_rr_get_interval.cpp b/libc/src/sched/linux/sched_rr_get_interval.cpp
index 1f0ef69dfc89..5668d596bce1 100644
--- a/libc/src/sched/linux/sched_rr_get_interval.cpp
+++ b/libc/src/sched/linux/sched_rr_get_interval.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_setaffinity.cpp b/libc/src/sched/linux/sched_setaffinity.cpp
index cad48c26bf93..93e930dcf2e3 100644
--- a/libc/src/sched/linux/sched_setaffinity.cpp
+++ b/libc/src/sched/linux/sched_setaffinity.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sched.h>
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sched/linux/sched_setparam.cpp b/libc/src/sched/linux/sched_setparam.cpp
index e78e78a707e0..7875d9e2f19b 100644
--- a/libc/src/sched/linux/sched_setparam.cpp
+++ b/libc/src/sched/linux/sched_setparam.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_setscheduler.cpp b/libc/src/sched/linux/sched_setscheduler.cpp
index b6b6f667b3f9..232e5a59b185 100644
--- a/libc/src/sched/linux/sched_setscheduler.cpp
+++ b/libc/src/sched/linux/sched_setscheduler.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_yield.cpp b/libc/src/sched/linux/sched_yield.cpp
index 3de9d0ba3571..c1e9168f34d0 100644
--- a/libc/src/sched/linux/sched_yield.cpp
+++ b/libc/src/sched/linux/sched_yield.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/search/hcreate.cpp b/libc/src/search/hcreate.cpp
index ac816a902e22..68bdb29e51df 100644
--- a/libc/src/search/hcreate.cpp
+++ b/libc/src/search/hcreate.cpp
@@ -9,8 +9,8 @@
 #include "src/search/hcreate.h"
 #include "src/__support/HashTable/randomness.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/search/hsearch/global.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/search/hcreate_r.cpp b/libc/src/search/hcreate_r.cpp
index 17acd808c19a..c89be803b4e1 100644
--- a/libc/src/search/hcreate_r.cpp
+++ b/libc/src/search/hcreate_r.cpp
@@ -9,8 +9,8 @@
 #include "src/search/hcreate_r.h"
 #include "src/__support/HashTable/randomness.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, hcreate_r,
diff --git a/libc/src/search/hdestroy_r.cpp b/libc/src/search/hdestroy_r.cpp
index 7eff5bb6fff9..ba5476098be2 100644
--- a/libc/src/search/hdestroy_r.cpp
+++ b/libc/src/search/hdestroy_r.cpp
@@ -8,8 +8,8 @@
 
 #include "src/search/hdestroy_r.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(void, hdestroy_r, (struct hsearch_data * htab)) {
diff --git a/libc/src/search/hsearch.cpp b/libc/src/search/hsearch.cpp
index c18b5d3d7f54..034333d17057 100644
--- a/libc/src/search/hsearch.cpp
+++ b/libc/src/search/hsearch.cpp
@@ -9,8 +9,8 @@
 #include "src/search/hsearch.h"
 #include "src/__support/HashTable/randomness.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/search/hsearch/global.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/search/hsearch_r.cpp b/libc/src/search/hsearch_r.cpp
index f93e608a190b..323001e1b103 100644
--- a/libc/src/search/hsearch_r.cpp
+++ b/libc/src/search/hsearch_r.cpp
@@ -8,8 +8,8 @@
 
 #include "src/search/hsearch_r.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, hsearch_r,
diff --git a/libc/src/signal/linux/kill.cpp b/libc/src/signal/linux/kill.cpp
index ed117858f51e..0f5e88757acb 100644
--- a/libc/src/signal/linux/kill.cpp
+++ b/libc/src/signal/linux/kill.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 #include <signal.h>
diff --git a/libc/src/signal/linux/sigaction.cpp b/libc/src/signal/linux/sigaction.cpp
index 65ec36741683..43a3e195474e 100644
--- a/libc/src/signal/linux/sigaction.cpp
+++ b/libc/src/signal/linux/sigaction.cpp
@@ -10,8 +10,8 @@
 
 #include "hdr/types/sigset_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/signal/linux/sigaddset.cpp b/libc/src/signal/linux/sigaddset.cpp
index 628883e13b88..2091e8b51453 100644
--- a/libc/src/signal/linux/sigaddset.cpp
+++ b/libc/src/signal/linux/sigaddset.cpp
@@ -10,8 +10,8 @@
 
 #include "hdr/types/sigset_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/signal/linux/sigaltstack.cpp b/libc/src/signal/linux/sigaltstack.cpp
index c19394cd1791..990b841c6d90 100644
--- a/libc/src/signal/linux/sigaltstack.cpp
+++ b/libc/src/signal/linux/sigaltstack.cpp
@@ -8,8 +8,8 @@
 
 #include "src/signal/sigaltstack.h"
 #include "hdr/types/stack_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 #include "src/__support/common.h"
diff --git a/libc/src/signal/linux/sigdelset.cpp b/libc/src/signal/linux/sigdelset.cpp
index 2e964051ebde..6fce0d7a6e14 100644
--- a/libc/src/signal/linux/sigdelset.cpp
+++ b/libc/src/signal/linux/sigdelset.cpp
@@ -10,8 +10,8 @@
 
 #include "hdr/types/sigset_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/signal/linux/sigemptyset.cpp b/libc/src/signal/linux/sigemptyset.cpp
index d347477695e6..034a9e2cbe15 100644
--- a/libc/src/signal/linux/sigemptyset.cpp
+++ b/libc/src/signal/linux/sigemptyset.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/signal/sigemptyset.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 #include "src/__support/common.h"
diff --git a/libc/src/signal/linux/sigfillset.cpp b/libc/src/signal/linux/sigfillset.cpp
index 3e9897a03bb7..f0b499093b31 100644
--- a/libc/src/signal/linux/sigfillset.cpp
+++ b/libc/src/signal/linux/sigfillset.cpp
@@ -10,8 +10,8 @@
 
 #include "hdr/types/sigset_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/signal/linux/sigprocmask.cpp b/libc/src/signal/linux/sigprocmask.cpp
index 8838379ae5d3..af3c424c5f34 100644
--- a/libc/src/signal/linux/sigprocmask.cpp
+++ b/libc/src/signal/linux/sigprocmask.cpp
@@ -11,8 +11,8 @@
 #include "hdr/types/sigset_t.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/spawn/posix_spawn_file_actions_addclose.cpp b/libc/src/spawn/posix_spawn_file_actions_addclose.cpp
index bb8504f655c4..9a575bd59163 100644
--- a/libc/src/spawn/posix_spawn_file_actions_addclose.cpp
+++ b/libc/src/spawn/posix_spawn_file_actions_addclose.cpp
@@ -11,8 +11,8 @@
 #include "file_actions.h"
 #include "src/__support/CPP/new.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <spawn.h>
 
diff --git a/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp b/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp
index 710063d52e74..1ad45ed942bb 100644
--- a/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp
+++ b/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp
@@ -11,8 +11,8 @@
 #include "file_actions.h"
 #include "src/__support/CPP/new.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <spawn.h>
 
diff --git a/libc/src/spawn/posix_spawn_file_actions_addopen.cpp b/libc/src/spawn/posix_spawn_file_actions_addopen.cpp
index 028d6e895f3c..9977fc2d0a21 100644
--- a/libc/src/spawn/posix_spawn_file_actions_addopen.cpp
+++ b/libc/src/spawn/posix_spawn_file_actions_addopen.cpp
@@ -11,8 +11,8 @@
 #include "file_actions.h"
 #include "src/__support/CPP/new.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <spawn.h>
 
diff --git a/libc/src/spawn/posix_spawn_file_actions_destroy.cpp b/libc/src/spawn/posix_spawn_file_actions_destroy.cpp
index 168118da249d..affd338005cf 100644
--- a/libc/src/spawn/posix_spawn_file_actions_destroy.cpp
+++ b/libc/src/spawn/posix_spawn_file_actions_destroy.cpp
@@ -12,8 +12,8 @@
 
 #include "src/__support/CPP/new.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <spawn.h>
 
diff --git a/libc/src/stdio/fopencookie.cpp b/libc/src/stdio/fopencookie.cpp
index 9f5694e8e058..da8a132a4db6 100644
--- a/libc/src/stdio/fopencookie.cpp
+++ b/libc/src/stdio/fopencookie.cpp
@@ -14,8 +14,8 @@
 #include "src/__support/CPP/new.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fclose.cpp b/libc/src/stdio/generic/fclose.cpp
index 388407a58d41..902b4cf97237 100644
--- a/libc/src/stdio/generic/fclose.cpp
+++ b/libc/src/stdio/generic/fclose.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fflush.cpp b/libc/src/stdio/generic/fflush.cpp
index 5bdf71ad3594..d0271d9154c8 100644
--- a/libc/src/stdio/generic/fflush.cpp
+++ b/libc/src/stdio/generic/fflush.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fgetc.cpp b/libc/src/stdio/generic/fgetc.cpp
index aa6660ca180c..e65ce2fda49b 100644
--- a/libc/src/stdio/generic/fgetc.cpp
+++ b/libc/src/stdio/generic/fgetc.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fgetc_unlocked.cpp b/libc/src/stdio/generic/fgetc_unlocked.cpp
index 34a27f1d1c42..5c07d4feb513 100644
--- a/libc/src/stdio/generic/fgetc_unlocked.cpp
+++ b/libc/src/stdio/generic/fgetc_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fgets.cpp b/libc/src/stdio/generic/fgets.cpp
index de6474087a14..e0ad9b6e2f56 100644
--- a/libc/src/stdio/generic/fgets.cpp
+++ b/libc/src/stdio/generic/fgets.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fopen.cpp b/libc/src/stdio/generic/fopen.cpp
index d6e418bacf37..57c85c2e54e1 100644
--- a/libc/src/stdio/generic/fopen.cpp
+++ b/libc/src/stdio/generic/fopen.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fputc.cpp b/libc/src/stdio/generic/fputc.cpp
index 54a38aeb2f1e..6639f0687c87 100644
--- a/libc/src/stdio/generic/fputc.cpp
+++ b/libc/src/stdio/generic/fputc.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fputs.cpp b/libc/src/stdio/generic/fputs.cpp
index 8aef7683b3ce..621b40f63c91 100644
--- a/libc/src/stdio/generic/fputs.cpp
+++ b/libc/src/stdio/generic/fputs.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fread.cpp b/libc/src/stdio/generic/fread.cpp
index 3a04094ea8b4..1b576ec34688 100644
--- a/libc/src/stdio/generic/fread.cpp
+++ b/libc/src/stdio/generic/fread.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fread_unlocked.cpp b/libc/src/stdio/generic/fread_unlocked.cpp
index 151f43c6bbeb..257f1a212add 100644
--- a/libc/src/stdio/generic/fread_unlocked.cpp
+++ b/libc/src/stdio/generic/fread_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fseek.cpp b/libc/src/stdio/generic/fseek.cpp
index 21820da18542..99191e7c4194 100644
--- a/libc/src/stdio/generic/fseek.cpp
+++ b/libc/src/stdio/generic/fseek.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/fseek.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fseeko.cpp b/libc/src/stdio/generic/fseeko.cpp
index 7456b4a21907..afcfc71c7c09 100644
--- a/libc/src/stdio/generic/fseeko.cpp
+++ b/libc/src/stdio/generic/fseeko.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/fseeko.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/ftell.cpp b/libc/src/stdio/generic/ftell.cpp
index ec15ca4e96ca..b55a806007af 100644
--- a/libc/src/stdio/generic/ftell.cpp
+++ b/libc/src/stdio/generic/ftell.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/ftell.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/ftello.cpp b/libc/src/stdio/generic/ftello.cpp
index e3d0726ec484..91031cb7fad7 100644
--- a/libc/src/stdio/generic/ftello.cpp
+++ b/libc/src/stdio/generic/ftello.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/ftello.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fwrite.cpp b/libc/src/stdio/generic/fwrite.cpp
index 66eb9a3c7185..b44ecb283811 100644
--- a/libc/src/stdio/generic/fwrite.cpp
+++ b/libc/src/stdio/generic/fwrite.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fwrite_unlocked.cpp b/libc/src/stdio/generic/fwrite_unlocked.cpp
index a0d9014cd68d..2f9ec26f2f80 100644
--- a/libc/src/stdio/generic/fwrite_unlocked.cpp
+++ b/libc/src/stdio/generic/fwrite_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/getc.cpp b/libc/src/stdio/generic/getc.cpp
index e988468898c5..0ac010ebc599 100644
--- a/libc/src/stdio/generic/getc.cpp
+++ b/libc/src/stdio/generic/getc.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/getc_unlocked.cpp b/libc/src/stdio/generic/getc_unlocked.cpp
index 92d5092623ac..eee23a18d05d 100644
--- a/libc/src/stdio/generic/getc_unlocked.cpp
+++ b/libc/src/stdio/generic/getc_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/getchar.cpp b/libc/src/stdio/generic/getchar.cpp
index 371fc70eb214..87d24a2b1f09 100644
--- a/libc/src/stdio/generic/getchar.cpp
+++ b/libc/src/stdio/generic/getchar.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/getchar_unlocked.cpp b/libc/src/stdio/generic/getchar_unlocked.cpp
index b898f5cb2596..f321969483e3 100644
--- a/libc/src/stdio/generic/getchar_unlocked.cpp
+++ b/libc/src/stdio/generic/getchar_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/putc.cpp b/libc/src/stdio/generic/putc.cpp
index b5f008fdce44..83bc3d4131e7 100644
--- a/libc/src/stdio/generic/putc.cpp
+++ b/libc/src/stdio/generic/putc.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/putchar.cpp b/libc/src/stdio/generic/putchar.cpp
index e86df23d6716..2b3509e5e414 100644
--- a/libc/src/stdio/generic/putchar.cpp
+++ b/libc/src/stdio/generic/putchar.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/puts.cpp b/libc/src/stdio/generic/puts.cpp
index 7dbe2c79f920..4267dd546c4d 100644
--- a/libc/src/stdio/generic/puts.cpp
+++ b/libc/src/stdio/generic/puts.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/gpu/fprintf.cpp b/libc/src/stdio/gpu/fprintf.cpp
index 5b8f01d7d534..9877817d9209 100644
--- a/libc/src/stdio/gpu/fprintf.cpp
+++ b/libc/src/stdio/gpu/fprintf.cpp
@@ -12,7 +12,7 @@
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/arg_list.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/stdio/gpu/vfprintf_utils.h"
 
 #include <stdarg.h>
diff --git a/libc/src/stdio/gpu/printf.cpp b/libc/src/stdio/gpu/printf.cpp
index 53fe69d5e2eb..8a9174d7397a 100644
--- a/libc/src/stdio/gpu/printf.cpp
+++ b/libc/src/stdio/gpu/printf.cpp
@@ -11,7 +11,7 @@
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/arg_list.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/stdio/gpu/vfprintf_utils.h"
 
 #include <stdarg.h>
diff --git a/libc/src/stdio/linux/fdopen.cpp b/libc/src/stdio/linux/fdopen.cpp
index 7d72fdc88e9f..5623f06b7cff 100644
--- a/libc/src/stdio/linux/fdopen.cpp
+++ b/libc/src/stdio/linux/fdopen.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/fdopen.h"
 
 #include "src/__support/File/linux/file.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/linux/remove.cpp b/libc/src/stdio/linux/remove.cpp
index dbb4491d0e6c..ac755db0bc78 100644
--- a/libc/src/stdio/linux/remove.cpp
+++ b/libc/src/stdio/linux/remove.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h" // For AT_* macros.
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/linux/rename.cpp b/libc/src/stdio/linux/rename.cpp
index fbcb29be48f4..426c8698e557 100644
--- a/libc/src/stdio/linux/rename.cpp
+++ b/libc/src/stdio/linux/rename.cpp
@@ -10,8 +10,8 @@
 #include "hdr/fcntl_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h
index 89556f1a9e5f..cef9b1ae58fa 100644
--- a/libc/src/stdio/printf_core/parser.h
+++ b/libc/src/stdio/printf_core/parser.h
@@ -25,7 +25,7 @@
 #include "src/__support/fixed_point/fx_rep.h"
 #endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
 #ifndef LIBC_COPT_PRINTF_DISABLE_STRERROR
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #endif // LIBC_COPT_PRINTF_DISABLE_STRERROR
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/setbuf.cpp b/libc/src/stdio/setbuf.cpp
index f3db97de5837..fcc6df12ddb0 100644
--- a/libc/src/stdio/setbuf.cpp
+++ b/libc/src/stdio/setbuf.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/setbuf.h"
 #include "hdr/stdio_macros.h"
 #include "src/__support/File/file.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/setvbuf.cpp b/libc/src/stdio/setvbuf.cpp
index 0a6b8cacb59c..9fc6cb040233 100644
--- a/libc/src/stdio/setvbuf.cpp
+++ b/libc/src/stdio/setvbuf.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdlib/atof.cpp b/libc/src/stdlib/atof.cpp
index 18a65c67705d..d0d8d211dea8 100644
--- a/libc/src/stdlib/atof.cpp
+++ b/libc/src/stdlib/atof.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/atof.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/atoi.cpp b/libc/src/stdlib/atoi.cpp
index 9e46b53b1aa0..420bbc8143d5 100644
--- a/libc/src/stdlib/atoi.cpp
+++ b/libc/src/stdlib/atoi.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/atoi.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/atol.cpp b/libc/src/stdlib/atol.cpp
index 7f3414a4afdd..e1110ffa449b 100644
--- a/libc/src/stdlib/atol.cpp
+++ b/libc/src/stdlib/atol.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/atol.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/atoll.cpp b/libc/src/stdlib/atoll.cpp
index 4f1a02ad8315..063e817f9b79 100644
--- a/libc/src/stdlib/atoll.cpp
+++ b/libc/src/stdlib/atoll.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/atoll.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtod.cpp b/libc/src/stdlib/strtod.cpp
index 2c6819163aa4..deb2390c7fcd 100644
--- a/libc/src/stdlib/strtod.cpp
+++ b/libc/src/stdlib/strtod.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtod.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtod_l.cpp b/libc/src/stdlib/strtod_l.cpp
index 247314398315..ad333b32d240 100644
--- a/libc/src/stdlib/strtod_l.cpp
+++ b/libc/src/stdlib/strtod_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtod_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtof.cpp b/libc/src/stdlib/strtof.cpp
index 351bf64ad4f7..fc52dc85ffc5 100644
--- a/libc/src/stdlib/strtof.cpp
+++ b/libc/src/stdlib/strtof.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtof.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtof_l.cpp b/libc/src/stdlib/strtof_l.cpp
index d54efa66e084..c6e03ff51fa2 100644
--- a/libc/src/stdlib/strtof_l.cpp
+++ b/libc/src/stdlib/strtof_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtof_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtol.cpp b/libc/src/stdlib/strtol.cpp
index 77f8712d7c13..42db36b2052b 100644
--- a/libc/src/stdlib/strtol.cpp
+++ b/libc/src/stdlib/strtol.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtol.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtol_l.cpp b/libc/src/stdlib/strtol_l.cpp
index f94aff1a0d7b..497a4403eff4 100644
--- a/libc/src/stdlib/strtol_l.cpp
+++ b/libc/src/stdlib/strtol_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtol_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtold.cpp b/libc/src/stdlib/strtold.cpp
index 88d29c9f3627..44046c2c6f61 100644
--- a/libc/src/stdlib/strtold.cpp
+++ b/libc/src/stdlib/strtold.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtold.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtold_l.cpp b/libc/src/stdlib/strtold_l.cpp
index d0c57f50246b..c3af30a1b9ec 100644
--- a/libc/src/stdlib/strtold_l.cpp
+++ b/libc/src/stdlib/strtold_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtold_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoll.cpp b/libc/src/stdlib/strtoll.cpp
index 8d1b3efdcf87..c1dca13112e0 100644
--- a/libc/src/stdlib/strtoll.cpp
+++ b/libc/src/stdlib/strtoll.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoll.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoll_l.cpp b/libc/src/stdlib/strtoll_l.cpp
index e82971d59c48..6f30d7794c5c 100644
--- a/libc/src/stdlib/strtoll_l.cpp
+++ b/libc/src/stdlib/strtoll_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoll_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoul.cpp b/libc/src/stdlib/strtoul.cpp
index 1d832318c448..d26ca5e5a10a 100644
--- a/libc/src/stdlib/strtoul.cpp
+++ b/libc/src/stdlib/strtoul.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoul.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoul_l.cpp b/libc/src/stdlib/strtoul_l.cpp
index 74fce00a0ac3..9a875ddee902 100644
--- a/libc/src/stdlib/strtoul_l.cpp
+++ b/libc/src/stdlib/strtoul_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoul_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoull.cpp b/libc/src/stdlib/strtoull.cpp
index dba22611cfb0..8f929f577311 100644
--- a/libc/src/stdlib/strtoull.cpp
+++ b/libc/src/stdlib/strtoull.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoull.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoull_l.cpp b/libc/src/stdlib/strtoull_l.cpp
index 2ea8a43a40ef..9eb056b0e59b 100644
--- a/libc/src/stdlib/strtoull_l.cpp
+++ b/libc/src/stdlib/strtoull_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoull_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/string/strdup.cpp b/libc/src/string/strdup.cpp
index 4cf4173a27bf..dab0ab4288c9 100644
--- a/libc/src/string/strdup.cpp
+++ b/libc/src/string/strdup.cpp
@@ -8,8 +8,8 @@
 
 #include "src/string/strdup.h"
 #include "hdr/stdlib_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/string/allocating_string_utils.h"
 #include "src/string/memory_utils/inline_memcpy.h"
 
diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp
index 236fd25698f6..f3ae7c5c4e07 100644
--- a/libc/src/sys/auxv/linux/getauxval.cpp
+++ b/libc/src/sys/auxv/linux/getauxval.cpp
@@ -9,8 +9,8 @@
 #include "src/sys/auxv/getauxval.h"
 #include "config/app.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <linux/auxvec.h>
 
 // for guarded initialization
diff --git a/libc/src/sys/epoll/linux/epoll_create.cpp b/libc/src/sys/epoll/linux/epoll_create.cpp
index 7196ac7410c3..2e44e883ddf0 100644
--- a/libc/src/sys/epoll/linux/epoll_create.cpp
+++ b/libc/src/sys/epoll/linux/epoll_create.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/epoll/linux/epoll_create1.cpp b/libc/src/sys/epoll/linux/epoll_create1.cpp
index efff282e2714..3c60090fb7b4 100644
--- a/libc/src/sys/epoll/linux/epoll_create1.cpp
+++ b/libc/src/sys/epoll/linux/epoll_create1.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/epoll/linux/epoll_ctl.cpp b/libc/src/sys/epoll/linux/epoll_ctl.cpp
index 5f7dbb77b1e5..079bd60403b0 100644
--- a/libc/src/sys/epoll/linux/epoll_ctl.cpp
+++ b/libc/src/sys/epoll/linux/epoll_ctl.cpp
@@ -11,8 +11,8 @@
 #include "hdr/types/struct_epoll_event.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp
index d7836549928c..24fd1dbdc467 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp
@@ -13,9 +13,9 @@
 #include "hdr/types/struct_epoll_event.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
index 14b419399fe9..219984528efd 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
@@ -14,9 +14,9 @@
 #include "hdr/types/struct_timespec.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp
index 1a63be5e260f..7fae7b55992f 100644
--- a/libc/src/sys/epoll/linux/epoll_wait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_wait.cpp
@@ -13,9 +13,9 @@
 #include "hdr/types/struct_epoll_event.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/mman/linux/madvise.cpp b/libc/src/sys/mman/linux/madvise.cpp
index 332d6c2db4ac..1bb284f62b89 100644
--- a/libc/src/sys/mman/linux/madvise.cpp
+++ b/libc/src/sys/mman/linux/madvise.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mincore.cpp b/libc/src/sys/mman/linux/mincore.cpp
index b5436fda3853..d583f1ef85f3 100644
--- a/libc/src/sys/mman/linux/mincore.cpp
+++ b/libc/src/sys/mman/linux/mincore.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mlock.cpp b/libc/src/sys/mman/linux/mlock.cpp
index be7eb28e29c4..8582eb7c0063 100644
--- a/libc/src/sys/mman/linux/mlock.cpp
+++ b/libc/src/sys/mman/linux/mlock.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mlock2.cpp b/libc/src/sys/mman/linux/mlock2.cpp
index 7bc557f9bf58..955cfe128de7 100644
--- a/libc/src/sys/mman/linux/mlock2.cpp
+++ b/libc/src/sys/mman/linux/mlock2.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mlockall.cpp b/libc/src/sys/mman/linux/mlockall.cpp
index eae3a9ea0a18..c3502fbb3af3 100644
--- a/libc/src/sys/mman/linux/mlockall.cpp
+++ b/libc/src/sys/mman/linux/mlockall.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mmap.cpp b/libc/src/sys/mman/linux/mmap.cpp
index ee9a0a32e8f5..33f9fe8ff370 100644
--- a/libc/src/sys/mman/linux/mmap.cpp
+++ b/libc/src/sys/mman/linux/mmap.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <linux/param.h> // For EXEC_PAGESIZE.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/mman/linux/mprotect.cpp b/libc/src/sys/mman/linux/mprotect.cpp
index e2351028e2c7..6b14915b60c9 100644
--- a/libc/src/sys/mman/linux/mprotect.cpp
+++ b/libc/src/sys/mman/linux/mprotect.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mremap.cpp b/libc/src/sys/mman/linux/mremap.cpp
index 38bcfce833d3..6cdda9435bb6 100644
--- a/libc/src/sys/mman/linux/mremap.cpp
+++ b/libc/src/sys/mman/linux/mremap.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <linux/param.h> // For EXEC_PAGESIZE.
 #include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sys/mman/linux/msync.cpp b/libc/src/sys/mman/linux/msync.cpp
index e2b4f81d616a..650678bcb36e 100644
--- a/libc/src/sys/mman/linux/msync.cpp
+++ b/libc/src/sys/mman/linux/msync.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/munlock.cpp b/libc/src/sys/mman/linux/munlock.cpp
index 93c25f844c6e..9638949f5fcb 100644
--- a/libc/src/sys/mman/linux/munlock.cpp
+++ b/libc/src/sys/mman/linux/munlock.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/munlockall.cpp b/libc/src/sys/mman/linux/munlockall.cpp
index f5911cb01bc2..f47eaece178e 100644
--- a/libc/src/sys/mman/linux/munlockall.cpp
+++ b/libc/src/sys/mman/linux/munlockall.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/munmap.cpp b/libc/src/sys/mman/linux/munmap.cpp
index 9c01b15ac8dc..61b1f1549dd1 100644
--- a/libc/src/sys/mman/linux/munmap.cpp
+++ b/libc/src/sys/mman/linux/munmap.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
-#include <sys/syscall.h>          // For syscall numbers.
+#include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/mman/linux/remap_file_pages.cpp b/libc/src/sys/mman/linux/remap_file_pages.cpp
index f616e1915ecc..58ae4017f628 100644
--- a/libc/src/sys/mman/linux/remap_file_pages.cpp
+++ b/libc/src/sys/mman/linux/remap_file_pages.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/shm_common.h b/libc/src/sys/mman/linux/shm_common.h
index ce75c2b5b699..69911012ff7e 100644
--- a/libc/src/sys/mman/linux/shm_common.h
+++ b/libc/src/sys/mman/linux/shm_common.h
@@ -9,8 +9,8 @@
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/string/memory_utils/inline_memcpy.h"
 
 // TODO: Get PATH_MAX via https://github.com/llvm/llvm-project/issues/85121
diff --git a/libc/src/sys/prctl/linux/prctl.cpp b/libc/src/sys/prctl/linux/prctl.cpp
index 5d4e9046b877..c726b0a53959 100644
--- a/libc/src/sys/prctl/linux/prctl.cpp
+++ b/libc/src/sys/prctl/linux/prctl.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/random/linux/getrandom.cpp b/libc/src/sys/random/linux/getrandom.cpp
index 9a8869a2d6d3..0b8471ed8b37 100644
--- a/libc/src/sys/random/linux/getrandom.cpp
+++ b/libc/src/sys/random/linux/getrandom.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/resource/linux/getrlimit.cpp b/libc/src/sys/resource/linux/getrlimit.cpp
index 30c2e91b036d..d27213419494 100644
--- a/libc/src/sys/resource/linux/getrlimit.cpp
+++ b/libc/src/sys/resource/linux/getrlimit.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/resource.h> // For struct rlimit
 #include <sys/syscall.h>  // For syscall numbers.
 
diff --git a/libc/src/sys/resource/linux/setrlimit.cpp b/libc/src/sys/resource/linux/setrlimit.cpp
index 85f07900aaef..300bad75baa6 100644
--- a/libc/src/sys/resource/linux/setrlimit.cpp
+++ b/libc/src/sys/resource/linux/setrlimit.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/resource.h> // For struct rlimit
 #include <sys/syscall.h>  // For syscall numbers.
 
diff --git a/libc/src/sys/select/linux/select.cpp b/libc/src/sys/select/linux/select.cpp
index 9ccb1e95f275..6c434eb58459 100644
--- a/libc/src/sys/select/linux/select.cpp
+++ b/libc/src/sys/select/linux/select.cpp
@@ -13,8 +13,8 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <stddef.h>      // For size_t
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sys/sendfile/linux/sendfile.cpp b/libc/src/sys/sendfile/linux/sendfile.cpp
index 9d4174cb8c91..ec892323def5 100644
--- a/libc/src/sys/sendfile/linux/sendfile.cpp
+++ b/libc/src/sys/sendfile/linux/sendfile.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/sendfile.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/socket/linux/bind.cpp b/libc/src/sys/socket/linux/bind.cpp
index 72a3307a91dd..83a3d06f5380 100644
--- a/libc/src/sys/socket/linux/bind.cpp
+++ b/libc/src/sys/socket/linux/bind.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <linux/net.h>   // For SYS_SOCKET socketcall number.
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sys/socket/linux/recv.cpp b/libc/src/sys/socket/linux/recv.cpp
index 5e9f2d3233fc..baf4de1b5eb5 100644
--- a/libc/src/sys/socket/linux/recv.cpp
+++ b/libc/src/sys/socket/linux/recv.cpp
@@ -16,8 +16,8 @@
 #include "hdr/types/struct_sockaddr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/recvfrom.cpp b/libc/src/sys/socket/linux/recvfrom.cpp
index 574e65f64a54..3d8397b478cc 100644
--- a/libc/src/sys/socket/linux/recvfrom.cpp
+++ b/libc/src/sys/socket/linux/recvfrom.cpp
@@ -16,8 +16,8 @@
 #include "hdr/types/struct_sockaddr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/recvmsg.cpp b/libc/src/sys/socket/linux/recvmsg.cpp
index e42b6346f330..bc6d072dbf9a 100644
--- a/libc/src/sys/socket/linux/recvmsg.cpp
+++ b/libc/src/sys/socket/linux/recvmsg.cpp
@@ -15,8 +15,8 @@
 #include "hdr/types/struct_msghdr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/send.cpp b/libc/src/sys/socket/linux/send.cpp
index cb3b4d5a9ece..43b01e7e6e0f 100644
--- a/libc/src/sys/socket/linux/send.cpp
+++ b/libc/src/sys/socket/linux/send.cpp
@@ -16,7 +16,7 @@
 #include "hdr/types/struct_sockaddr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/sendmsg.cpp b/libc/src/sys/socket/linux/sendmsg.cpp
index b4d9c9deda02..b04783ebfe7e 100644
--- a/libc/src/sys/socket/linux/sendmsg.cpp
+++ b/libc/src/sys/socket/linux/sendmsg.cpp
@@ -15,7 +15,7 @@
 #include "hdr/types/struct_msghdr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/sendto.cpp b/libc/src/sys/socket/linux/sendto.cpp
index 2fada192b086..9dda127f872d 100644
--- a/libc/src/sys/socket/linux/sendto.cpp
+++ b/libc/src/sys/socket/linux/sendto.cpp
@@ -16,7 +16,7 @@
 #include "hdr/types/struct_sockaddr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/socket.cpp b/libc/src/sys/socket/linux/socket.cpp
index 3e6df4d487a5..69eb6cfa01ce 100644
--- a/libc/src/sys/socket/linux/socket.cpp
+++ b/libc/src/sys/socket/linux/socket.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <linux/net.h>   // For SYS_SOCKET socketcall number.
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sys/socket/linux/socketpair.cpp b/libc/src/sys/socket/linux/socketpair.cpp
index 60612ac04d61..7ea8ca46cee5 100644
--- a/libc/src/sys/socket/linux/socketpair.cpp
+++ b/libc/src/sys/socket/linux/socketpair.cpp
@@ -10,9 +10,9 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 #include <linux/net.h>   // For SYS_SOCKET socketcall number.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/chmod.cpp b/libc/src/sys/stat/linux/chmod.cpp
index 1b787e47e7c6..2bd0788ec1df 100644
--- a/libc/src/sys/stat/linux/chmod.cpp
+++ b/libc/src/sys/stat/linux/chmod.cpp
@@ -13,8 +13,8 @@
 
 #include "hdr/fcntl_macros.h"
 #include "hdr/types/mode_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/fchmod.cpp b/libc/src/sys/stat/linux/fchmod.cpp
index 0d6fd359169a..3dadfdd1d943 100644
--- a/libc/src/sys/stat/linux/fchmod.cpp
+++ b/libc/src/sys/stat/linux/fchmod.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/types/mode_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/fchmodat.cpp b/libc/src/sys/stat/linux/fchmodat.cpp
index e76db4d160fb..add2192a558a 100644
--- a/libc/src/sys/stat/linux/fchmodat.cpp
+++ b/libc/src/sys/stat/linux/fchmodat.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/fstat.cpp b/libc/src/sys/stat/linux/fstat.cpp
index 35cf8f08f782..dea002c5e12a 100644
--- a/libc/src/sys/stat/linux/fstat.cpp
+++ b/libc/src/sys/stat/linux/fstat.cpp
@@ -8,8 +8,8 @@
 
 #include "src/sys/stat/fstat.h"
 #include "kernel_statx.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/common.h"
 
diff --git a/libc/src/sys/stat/linux/lstat.cpp b/libc/src/sys/stat/linux/lstat.cpp
index 354c5b6e029a..5601dd5d78a9 100644
--- a/libc/src/sys/stat/linux/lstat.cpp
+++ b/libc/src/sys/stat/linux/lstat.cpp
@@ -8,8 +8,8 @@
 
 #include "src/sys/stat/lstat.h"
 #include "kernel_statx.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
diff --git a/libc/src/sys/stat/linux/mkdir.cpp b/libc/src/sys/stat/linux/mkdir.cpp
index b319b5c8393d..0829ff4f9432 100644
--- a/libc/src/sys/stat/linux/mkdir.cpp
+++ b/libc/src/sys/stat/linux/mkdir.cpp
@@ -13,8 +13,8 @@
 
 #include "hdr/fcntl_macros.h"
 #include "hdr/types/mode_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/mkdirat.cpp b/libc/src/sys/stat/linux/mkdirat.cpp
index 097fc158010d..8f4194dc3275 100644
--- a/libc/src/sys/stat/linux/mkdirat.cpp
+++ b/libc/src/sys/stat/linux/mkdirat.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/stat.cpp b/libc/src/sys/stat/linux/stat.cpp
index de9cdb197d68..5553eaf00be2 100644
--- a/libc/src/sys/stat/linux/stat.cpp
+++ b/libc/src/sys/stat/linux/stat.cpp
@@ -8,8 +8,8 @@
 
 #include "src/sys/stat/stat.h"
 #include "kernel_statx.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/common.h"
 
diff --git a/libc/src/sys/statvfs/linux/statfs_utils.h b/libc/src/sys/statvfs/linux/statfs_utils.h
index 1e5be5153101..8ee4de288ef6 100644
--- a/libc/src/sys/statvfs/linux/statfs_utils.h
+++ b/libc/src/sys/statvfs/linux/statfs_utils.h
@@ -12,9 +12,9 @@
 #include "include/llvm-libc-types/struct_statvfs.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/OSUtil/syscall.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <asm/statfs.h>
 #include <sys/syscall.h>
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/time/linux/getitimer.cpp b/libc/src/sys/time/linux/getitimer.cpp
index fec06aa4086e..b87406679694 100644
--- a/libc/src/sys/time/linux/getitimer.cpp
+++ b/libc/src/sys/time/linux/getitimer.cpp
@@ -10,7 +10,7 @@
 #include "hdr/types/struct_itimerval.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/time/linux/setitimer.cpp b/libc/src/sys/time/linux/setitimer.cpp
index def04a474011..1de0d4329776 100644
--- a/libc/src/sys/time/linux/setitimer.cpp
+++ b/libc/src/sys/time/linux/setitimer.cpp
@@ -9,7 +9,7 @@
 #include "hdr/types/struct_itimerval.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/time/linux/utimes.cpp b/libc/src/sys/time/linux/utimes.cpp
index 76b69937a5f4..ed37b42aedf6 100644
--- a/libc/src/sys/time/linux/utimes.cpp
+++ b/libc/src/sys/time/linux/utimes.cpp
@@ -15,7 +15,7 @@
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #include <sys/syscall.h>
 
diff --git a/libc/src/sys/uio/linux/readv.cpp b/libc/src/sys/uio/linux/readv.cpp
index f1393a9749be..c9d8d87ddc72 100644
--- a/libc/src/sys/uio/linux/readv.cpp
+++ b/libc/src/sys/uio/linux/readv.cpp
@@ -10,7 +10,7 @@
 #include "hdr/types/struct_iovec.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/uio/linux/writev.cpp b/libc/src/sys/uio/linux/writev.cpp
index 8992bed95c98..b0b9e1520792 100644
--- a/libc/src/sys/uio/linux/writev.cpp
+++ b/libc/src/sys/uio/linux/writev.cpp
@@ -10,7 +10,7 @@
 #include "hdr/types/struct_iovec.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/utsname/linux/uname.cpp b/libc/src/sys/utsname/linux/uname.cpp
index 7bb227e801e3..b47ba964faf0 100644
--- a/libc/src/sys/utsname/linux/uname.cpp
+++ b/libc/src/sys/utsname/linux/uname.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 #include <sys/utsname.h>
 
diff --git a/libc/src/sys/wait/wait4Impl.h b/libc/src/sys/wait/wait4Impl.h
index f2bdeb02f866..77ed3ad22f14 100644
--- a/libc/src/sys/wait/wait4Impl.h
+++ b/libc/src/sys/wait/wait4Impl.h
@@ -12,8 +12,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <signal.h>
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/termios/linux/cfsetispeed.cpp b/libc/src/termios/linux/cfsetispeed.cpp
index 9656b714a8ed..47b19974d21b 100644
--- a/libc/src/termios/linux/cfsetispeed.cpp
+++ b/libc/src/termios/linux/cfsetispeed.cpp
@@ -9,8 +9,8 @@
 #include "src/termios/cfsetispeed.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <termios.h>
 
diff --git a/libc/src/termios/linux/cfsetospeed.cpp b/libc/src/termios/linux/cfsetospeed.cpp
index 6130d266dbff..d2f138257a47 100644
--- a/libc/src/termios/linux/cfsetospeed.cpp
+++ b/libc/src/termios/linux/cfsetospeed.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/termios/cfsetospeed.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/common.h"
 
diff --git a/libc/src/termios/linux/tcdrain.cpp b/libc/src/termios/linux/tcdrain.cpp
index 116e3f0e0cbc..570b15c24fe7 100644
--- a/libc/src/termios/linux/tcdrain.cpp
+++ b/libc/src/termios/linux/tcdrain.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcflow.cpp b/libc/src/termios/linux/tcflow.cpp
index d229230b5d13..714ef6aa7129 100644
--- a/libc/src/termios/linux/tcflow.cpp
+++ b/libc/src/termios/linux/tcflow.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcflush.cpp b/libc/src/termios/linux/tcflush.cpp
index 028a5414b196..4c7b9fadc446 100644
--- a/libc/src/termios/linux/tcflush.cpp
+++ b/libc/src/termios/linux/tcflush.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcgetattr.cpp b/libc/src/termios/linux/tcgetattr.cpp
index 63c096ff88eb..2e768269c874 100644
--- a/libc/src/termios/linux/tcgetattr.cpp
+++ b/libc/src/termios/linux/tcgetattr.cpp
@@ -11,8 +11,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcgetsid.cpp b/libc/src/termios/linux/tcgetsid.cpp
index c283d0e4fda9..7487816cf274 100644
--- a/libc/src/termios/linux/tcgetsid.cpp
+++ b/libc/src/termios/linux/tcgetsid.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcsendbreak.cpp b/libc/src/termios/linux/tcsendbreak.cpp
index 30bc91cf3de0..1d546c1d5953 100644
--- a/libc/src/termios/linux/tcsendbreak.cpp
+++ b/libc/src/termios/linux/tcsendbreak.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcsetattr.cpp b/libc/src/termios/linux/tcsetattr.cpp
index 8aa1e5c57b34..8a2c7290217b 100644
--- a/libc/src/termios/linux/tcsetattr.cpp
+++ b/libc/src/termios/linux/tcsetattr.cpp
@@ -11,8 +11,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/threads/thrd_create.cpp b/libc/src/threads/thrd_create.cpp
index 4680944c2eee..67e22e72fd0e 100644
--- a/libc/src/threads/thrd_create.cpp
+++ b/libc/src/threads/thrd_create.cpp
@@ -8,9 +8,9 @@
 
 #include "src/threads/thrd_create.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <threads.h> // For thrd_* type definitions.
 
diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp
index ee4fa82b4f89..c38697cd0668 100644
--- a/libc/src/time/linux/clock.cpp
+++ b/libc/src/time/linux/clock.cpp
@@ -10,10 +10,10 @@
 #include "hdr/time_macros.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
 #include "src/__support/time/units.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp
index 743c644d65d0..b3fcd2b22f9d 100644
--- a/libc/src/time/linux/clock_gettime.cpp
+++ b/libc/src/time/linux/clock_gettime.cpp
@@ -8,9 +8,9 @@
 
 #include "src/time/clock_gettime.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/linux/gettimeofday.cpp b/libc/src/time/linux/gettimeofday.cpp
index e8ddf482fc98..237b05903c70 100644
--- a/libc/src/time/linux/gettimeofday.cpp
+++ b/libc/src/time/linux/gettimeofday.cpp
@@ -10,10 +10,10 @@
 #include "hdr/time_macros.h"
 #include "hdr/types/suseconds_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
 #include "src/__support/time/units.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp
index 7a856376ffb2..6b9704126a0a 100644
--- a/libc/src/time/linux/nanosleep.cpp
+++ b/libc/src/time/linux/nanosleep.cpp
@@ -10,8 +10,8 @@
 #include "hdr/time_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For syscall functions.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <stdint.h>      // For int64_t.
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/time/linux/timespec_get.cpp b/libc/src/time/linux/timespec_get.cpp
index cf5174523aa4..a4d437233273 100644
--- a/libc/src/time/linux/timespec_get.cpp
+++ b/libc/src/time/linux/timespec_get.cpp
@@ -9,9 +9,9 @@
 #include "src/time/timespec_get.h"
 #include "hdr/time_macros.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/time.cpp b/libc/src/time/time.cpp
index 860909af7488..2a81f0182c31 100644
--- a/libc/src/time/time.cpp
+++ b/libc/src/time/time.cpp
@@ -10,9 +10,9 @@
 
 #include "hdr/time_macros.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 // avoid inconsitent clang-format behavior
diff --git a/libc/src/time/time_utils.h b/libc/src/time/time_utils.h
index bbbb1c08a475..0541c24ece82 100644
--- a/libc/src/time/time_utils.h
+++ b/libc/src/time/time_utils.h
@@ -15,8 +15,8 @@
 #include "src/__support/CPP/optional.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "time_constants.h"
 
 #include <stdint.h>
diff --git a/libc/src/time/windows/clock_getres.cpp b/libc/src/time/windows/clock_getres.cpp
index b8c0c82aa641..969bb66be2d2 100644
--- a/libc/src/time/windows/clock_getres.cpp
+++ b/libc/src/time/windows/clock_getres.cpp
@@ -13,10 +13,10 @@
 
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/time/units.h"
 #include "src/__support/time/windows/performance_counter.h"
-#include "src/errno/libc_errno.h"
 #include "src/time/clock_getres.h"
 
 #define WIN32_LEAN_AND_MEAN
diff --git a/libc/src/unistd/linux/access.cpp b/libc/src/unistd/linux/access.cpp
index 2f7ebbcdf9e8..55cd6adca779 100644
--- a/libc/src/unistd/linux/access.cpp
+++ b/libc/src/unistd/linux/access.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/chdir.cpp b/libc/src/unistd/linux/chdir.cpp
index a30d1dc883be..04ba509b49a5 100644
--- a/libc/src/unistd/linux/chdir.cpp
+++ b/libc/src/unistd/linux/chdir.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/close.cpp b/libc/src/unistd/linux/close.cpp
index 58d42a9673fb..b5842f2b64d2 100644
--- a/libc/src/unistd/linux/close.cpp
+++ b/libc/src/unistd/linux/close.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/dup.cpp b/libc/src/unistd/linux/dup.cpp
index c1710a37f611..81d30c6cdbc4 100644
--- a/libc/src/unistd/linux/dup.cpp
+++ b/libc/src/unistd/linux/dup.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp
index 7ffc151a053c..0a0e86573b34 100644
--- a/libc/src/unistd/linux/dup2.cpp
+++ b/libc/src/unistd/linux/dup2.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/dup3.cpp b/libc/src/unistd/linux/dup3.cpp
index c096ba73c96b..770fb73515b2 100644
--- a/libc/src/unistd/linux/dup3.cpp
+++ b/libc/src/unistd/linux/dup3.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/execv.cpp b/libc/src/unistd/linux/execv.cpp
index a3f2525ed7ca..d4f2bd9a5165 100644
--- a/libc/src/unistd/linux/execv.cpp
+++ b/libc/src/unistd/linux/execv.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/execve.cpp b/libc/src/unistd/linux/execve.cpp
index 37162c412178..2214b6df493b 100644
--- a/libc/src/unistd/linux/execve.cpp
+++ b/libc/src/unistd/linux/execve.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/fchdir.cpp b/libc/src/unistd/linux/fchdir.cpp
index 8196dc63ab1e..f7a7422363e6 100644
--- a/libc/src/unistd/linux/fchdir.cpp
+++ b/libc/src/unistd/linux/fchdir.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/fork.cpp b/libc/src/unistd/linux/fork.cpp
index 8aa0477a15d5..75a76fdea50b 100644
--- a/libc/src/unistd/linux/fork.cpp
+++ b/libc/src/unistd/linux/fork.cpp
@@ -15,7 +15,7 @@
 #include "src/__support/threads/identifier.h"
 #include "src/__support/threads/thread.h" // For thread self object
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <signal.h>      // For SIGCHLD
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/fsync.cpp b/libc/src/unistd/linux/fsync.cpp
index ae3895bab15f..fe08aed61e25 100644
--- a/libc/src/unistd/linux/fsync.cpp
+++ b/libc/src/unistd/linux/fsync.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/ftruncate.cpp b/libc/src/unistd/linux/ftruncate.cpp
index ccbb0634664a..f6aa6f8b48cc 100644
--- a/libc/src/unistd/linux/ftruncate.cpp
+++ b/libc/src/unistd/linux/ftruncate.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/unistd_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stdint.h>      // For uint64_t.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/getcwd.cpp b/libc/src/unistd/linux/getcwd.cpp
index 1bb11a7c8e7b..c0e475dd3e8f 100644
--- a/libc/src/unistd/linux/getcwd.cpp
+++ b/libc/src/unistd/linux/getcwd.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/macros/config.h"
 #include "src/string/allocating_string_utils.h" // For strdup.
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <linux/limits.h> // This is safe to include without any name pollution.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/getentropy.cpp b/libc/src/unistd/linux/getentropy.cpp
index 168a1197734e..65bcbf27601d 100644
--- a/libc/src/unistd/linux/getentropy.cpp
+++ b/libc/src/unistd/linux/getentropy.cpp
@@ -10,7 +10,7 @@
 #include "hdr/errno_macros.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/getsid.cpp b/libc/src/unistd/linux/getsid.cpp
index 5977c5bf10e9..025b8d1691ac 100644
--- a/libc/src/unistd/linux/getsid.cpp
+++ b/libc/src/unistd/linux/getsid.cpp
@@ -11,8 +11,8 @@
 #include "hdr/types/pid_t.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/isatty.cpp b/libc/src/unistd/linux/isatty.cpp
index e6ea22a714c7..a4d17912b57b 100644
--- a/libc/src/unistd/linux/isatty.cpp
+++ b/libc/src/unistd/linux/isatty.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/ioctl.h>   // For ioctl numbers.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/link.cpp b/libc/src/unistd/linux/link.cpp
index 477806a70df7..205cf8a84a5c 100644
--- a/libc/src/unistd/linux/link.cpp
+++ b/libc/src/unistd/linux/link.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/unistd/linux/linkat.cpp b/libc/src/unistd/linux/linkat.cpp
index 40f68cc90c48..ea5bc48cbedc 100644
--- a/libc/src/unistd/linux/linkat.cpp
+++ b/libc/src/unistd/linux/linkat.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/lseek.cpp b/libc/src/unistd/linux/lseek.cpp
index 0e957498da74..26a08269fd8d 100644
--- a/libc/src/unistd/linux/lseek.cpp
+++ b/libc/src/unistd/linux/lseek.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/unistd/lseek.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/File/linux/lseekImpl.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
diff --git a/libc/src/unistd/linux/pathconf.cpp b/libc/src/unistd/linux/pathconf.cpp
index ca1c10bb9f7f..7dde857c1cfd 100644
--- a/libc/src/unistd/linux/pathconf.cpp
+++ b/libc/src/unistd/linux/pathconf.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/unistd/pathconf.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/statvfs/linux/statfs_utils.h"
 #include "src/unistd/linux/pathconf_utils.h"
 
diff --git a/libc/src/unistd/linux/pathconf_utils.cpp b/libc/src/unistd/linux/pathconf_utils.cpp
index 035e628dff25..9a62e31fd188 100644
--- a/libc/src/unistd/linux/pathconf_utils.cpp
+++ b/libc/src/unistd/linux/pathconf_utils.cpp
@@ -14,8 +14,8 @@
 #include "hdr/unistd_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/statvfs/linux/statfs_utils.h"
 
 // other linux specific includes
diff --git a/libc/src/unistd/linux/pipe.cpp b/libc/src/unistd/linux/pipe.cpp
index dfcd5bfdaf53..b9943c833805 100644
--- a/libc/src/unistd/linux/pipe.cpp
+++ b/libc/src/unistd/linux/pipe.cpp
@@ -10,10 +10,10 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON
-#include "src/errno/libc_errno.h"
-#include <sys/syscall.h> // For syscall numbers.
+#include <sys/syscall.h>                    // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/unistd/linux/pipe2.cpp b/libc/src/unistd/linux/pipe2.cpp
index ebe7e0114ae9..d30f3b37a1ad 100644
--- a/libc/src/unistd/linux/pipe2.cpp
+++ b/libc/src/unistd/linux/pipe2.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/pread.cpp b/libc/src/unistd/linux/pread.cpp
index 3e27857f9a2b..2f86e397feef 100644
--- a/libc/src/unistd/linux/pread.cpp
+++ b/libc/src/unistd/linux/pread.cpp
@@ -10,11 +10,11 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON
-#include "src/errno/libc_errno.h"
-#include <stdint.h>      // For uint64_t.
-#include <sys/syscall.h> // For syscall numbers.
+#include <stdint.h>                         // For uint64_t.
+#include <sys/syscall.h>                    // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/unistd/linux/pwrite.cpp b/libc/src/unistd/linux/pwrite.cpp
index 1b81b2a05949..f4cf8e16d766 100644
--- a/libc/src/unistd/linux/pwrite.cpp
+++ b/libc/src/unistd/linux/pwrite.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stdint.h>      // For uint64_t.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/read.cpp b/libc/src/unistd/linux/read.cpp
index 4419900f2330..55676f3f7010 100644
--- a/libc/src/unistd/linux/read.cpp
+++ b/libc/src/unistd/linux/read.cpp
@@ -10,10 +10,10 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON
-#include "src/errno/libc_errno.h"
-#include <sys/syscall.h> // For syscall numbers.
+#include <sys/syscall.h>                    // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/unistd/linux/readlink.cpp b/libc/src/unistd/linux/readlink.cpp
index 2055e6b3400f..b297a41ca37b 100644
--- a/libc/src/unistd/linux/readlink.cpp
+++ b/libc/src/unistd/linux/readlink.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/readlinkat.cpp b/libc/src/unistd/linux/readlinkat.cpp
index e5e4d0d39bc9..cd0dcb8e0ff0 100644
--- a/libc/src/unistd/linux/readlinkat.cpp
+++ b/libc/src/unistd/linux/readlinkat.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/rmdir.cpp b/libc/src/unistd/linux/rmdir.cpp
index 075af12af64c..eca6e954ef89 100644
--- a/libc/src/unistd/linux/rmdir.cpp
+++ b/libc/src/unistd/linux/rmdir.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/unistd/linux/symlink.cpp b/libc/src/unistd/linux/symlink.cpp
index 9e1b2886ea0f..3f43de19d2f4 100644
--- a/libc/src/unistd/linux/symlink.cpp
+++ b/libc/src/unistd/linux/symlink.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/unistd/linux/symlinkat.cpp b/libc/src/unistd/linux/symlinkat.cpp
index bcf2d0f8cc05..8cee172f39df 100644
--- a/libc/src/unistd/linux/symlinkat.cpp
+++ b/libc/src/unistd/linux/symlinkat.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/syscall.cpp b/libc/src/unistd/linux/syscall.cpp
index 5394bff46adf..0f7b3da88d62 100644
--- a/libc/src/unistd/linux/syscall.cpp
+++ b/libc/src/unistd/linux/syscall.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stdarg.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/sysconf.cpp b/libc/src/unistd/linux/sysconf.cpp
index f785ff321c7d..03f224b15027 100644
--- a/libc/src/unistd/linux/sysconf.cpp
+++ b/libc/src/unistd/linux/sysconf.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/unistd_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/auxv/getauxval.h"
 #include <sys/auxv.h>
 
diff --git a/libc/src/unistd/linux/truncate.cpp b/libc/src/unistd/linux/truncate.cpp
index 8236edb480d1..6103d4b51350 100644
--- a/libc/src/unistd/linux/truncate.cpp
+++ b/libc/src/unistd/linux/truncate.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/unistd_macros.h"
 #include <stdint.h>      // For uint64_t.
diff --git a/libc/src/unistd/linux/unlink.cpp b/libc/src/unistd/linux/unlink.cpp
index 72d8e2398e3d..5fde2600937b 100644
--- a/libc/src/unistd/linux/unlink.cpp
+++ b/libc/src/unistd/linux/unlink.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/unlinkat.cpp b/libc/src/unistd/linux/unlinkat.cpp
index 4ed20f542f17..b2012c52b885 100644
--- a/libc/src/unistd/linux/unlinkat.cpp
+++ b/libc/src/unistd/linux/unlinkat.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/unistd/linux/write.cpp b/libc/src/unistd/linux/write.cpp
index 99d5ab7e480b..eecb74429182 100644
--- a/libc/src/unistd/linux/write.cpp
+++ b/libc/src/unistd/linux/write.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/windows/getentropy.cpp b/libc/src/unistd/windows/getentropy.cpp
index bfaec723ac63..e25a7a8fed40 100644
--- a/libc/src/unistd/windows/getentropy.cpp
+++ b/libc/src/unistd/windows/getentropy.cpp
@@ -9,7 +9,7 @@
 #include "src/unistd/getentropy.h"
 #include "hdr/errno_macros.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #define WIN32_LEAN_AND_MEAN
 #include <Windows.h>
diff --git a/libc/test/IntegrationTest/test.h b/libc/test/IntegrationTest/test.h
index 5be66d9edff0..24c007d2e12e 100644
--- a/libc/test/IntegrationTest/test.h
+++ b/libc/test/IntegrationTest/test.h
@@ -68,12 +68,9 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Errno checks.
 
-#define ASSERT_ERRNO_EQ(VAL)                                                   \
-  ASSERT_EQ(VAL, static_cast<int>(LIBC_NAMESPACE::libc_errno))
-#define ASSERT_ERRNO_SUCCESS()                                                 \
-  ASSERT_EQ(0, static_cast<int>(LIBC_NAMESPACE::libc_errno))
-#define ASSERT_ERRNO_FAILURE()                                                 \
-  ASSERT_NE(0, static_cast<int>(LIBC_NAMESPACE::libc_errno))
+#define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast<int>(libc_errno))
+#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast<int>(libc_errno))
+#define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast<int>(libc_errno))
 
 // Integration tests are compiled with -ffreestanding which stops treating
 // the main function as a non-overloadable special function. Hence, we use a
diff --git a/libc/test/UnitTest/ErrnoCheckingTest.h b/libc/test/UnitTest/ErrnoCheckingTest.h
index 3d3b72f80544..4b7ff452f409 100644
--- a/libc/test/UnitTest/ErrnoCheckingTest.h
+++ b/libc/test/UnitTest/ErrnoCheckingTest.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_TEST_UNITTEST_ERRNOCHECKINGTEST_H
 #define LLVM_LIBC_TEST_UNITTEST_ERRNOCHECKINGTEST_H
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "test/UnitTest/Test.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -25,7 +25,7 @@ class ErrnoCheckingTest : public Test {
 public:
   void SetUp() override {
     Test::SetUp();
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
   }
 
   void TearDown() override {
diff --git a/libc/test/UnitTest/ErrnoSetterMatcher.h b/libc/test/UnitTest/ErrnoSetterMatcher.h
index c6eadd25858e..212b7a8f83e7 100644
--- a/libc/test/UnitTest/ErrnoSetterMatcher.h
+++ b/libc/test/UnitTest/ErrnoSetterMatcher.h
@@ -12,9 +12,9 @@
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/fpbits_str.h"
 #include "src/__support/StringUtil/error_to_string.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
-#include "src/errno/libc_errno.h"
 #include "test/UnitTest/Test.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -114,8 +114,8 @@ public:
 
   bool match(T got) {
     actual_return = got;
-    actual_errno = LIBC_NAMESPACE::libc_errno;
-    LIBC_NAMESPACE::libc_errno = 0;
+    actual_errno = libc_errno;
+    libc_errno = 0;
     if constexpr (ignore_errno())
       return return_cmp.compare(actual_return);
     else
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 21b8a45b0726..da15cf2907f7 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -279,8 +279,8 @@ private:
 #define EXPECT_MATH_ERRNO(expected)                                            \
   do {                                                                         \
     if (math_errhandling & MATH_ERRNO) {                                       \
-      int actual = LIBC_NAMESPACE::libc_errno;                                 \
-      LIBC_NAMESPACE::libc_errno = 0;                                          \
+      int actual = libc_errno;                                                 \
+      libc_errno = 0;                                                          \
       EXPECT_EQ(actual, expected);                                             \
     }                                                                          \
   } while (0)
@@ -288,8 +288,8 @@ private:
 #define ASSERT_MATH_ERRNO(expected)                                            \
   do {                                                                         \
     if (math_errhandling & MATH_ERRNO) {                                       \
-      int actual = LIBC_NAMESPACE::libc_errno;                                 \
-      LIBC_NAMESPACE::libc_errno = 0;                                          \
+      int actual = libc_errno;                                                 \
+      libc_errno = 0;                                                          \
       ASSERT_EQ(actual, expected);                                             \
     }                                                                          \
   } while (0)
diff --git a/libc/test/UnitTest/Test.h b/libc/test/UnitTest/Test.h
index 95d48f40914e..a5a2a3c7cf58 100644
--- a/libc/test/UnitTest/Test.h
+++ b/libc/test/UnitTest/Test.h
@@ -42,15 +42,14 @@
 
 #define ASSERT_ERRNO_EQ(VAL)                                                   \
   do {                                                                         \
-    ASSERT_EQ(VAL, static_cast<int>(LIBC_NAMESPACE::libc_errno));              \
-    LIBC_NAMESPACE::libc_errno = 0;                                            \
+    ASSERT_EQ(VAL, static_cast<int>(libc_errno));                              \
+    libc_errno = 0;                                                            \
   } while (0)
-#define ASSERT_ERRNO_SUCCESS()                                                 \
-  ASSERT_EQ(0, static_cast<int>(LIBC_NAMESPACE::libc_errno))
+#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast<int>(libc_errno))
 #define ASSERT_ERRNO_FAILURE()                                                 \
   do {                                                                         \
-    ASSERT_NE(0, static_cast<int>(LIBC_NAMESPACE::libc_errno));                \
-    LIBC_NAMESPACE::libc_errno = 0;                                            \
+    ASSERT_NE(0, static_cast<int>(libc_errno));                                \
+    libc_errno = 0;                                                            \
   } while (0)
 
 #endif // LLVM_LIBC_TEST_UNITTEST_TEST_H
diff --git a/libc/test/integration/src/pthread/pthread_create_test.cpp b/libc/test/integration/src/pthread/pthread_create_test.cpp
index 29da4d5c3c8d..aecbad6514aa 100644
--- a/libc/test/integration/src/pthread/pthread_create_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_create_test.cpp
@@ -29,7 +29,7 @@
 #include "src/__support/CPP/new.h"
 #include "src/__support/threads/thread.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #include "test/IntegrationTest/test.h"
 
@@ -332,7 +332,7 @@ static void run_failure_tests() {
 }
 
 TEST_MAIN() {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   run_success_tests();
   run_failure_tests();
   return 0;
diff --git a/libc/test/integration/src/pthread/pthread_join_test.cpp b/libc/test/integration/src/pthread/pthread_join_test.cpp
index 994fa57a6b33..5d0bcd8e2365 100644
--- a/libc/test/integration/src/pthread/pthread_join_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_join_test.cpp
@@ -9,7 +9,7 @@
 #include "src/pthread/pthread_create.h"
 #include "src/pthread/pthread_join.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #include "test/IntegrationTest/test.h"
 #include <pthread.h>
@@ -25,7 +25,7 @@ static void nullJoinTest() {
 }
 
 TEST_MAIN() {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   nullJoinTest();
   return 0;
 }
diff --git a/libc/test/integration/src/pthread/pthread_name_test.cpp b/libc/test/integration/src/pthread/pthread_name_test.cpp
index 37ceceee880d..35dd3b165e0e 100644
--- a/libc/test/integration/src/pthread/pthread_name_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_name_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/CPP/string_view.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/pthread/pthread_create.h"
 #include "src/pthread/pthread_getname_np.h"
 #include "src/pthread/pthread_join.h"
diff --git a/libc/test/integration/src/unistd/getcwd_test.cpp b/libc/test/integration/src/unistd/getcwd_test.cpp
index 551768187bf0..1b321b01e931 100644
--- a/libc/test/integration/src/unistd/getcwd_test.cpp
+++ b/libc/test/integration/src/unistd/getcwd_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/CPP/string_view.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/stdlib/getenv.h"
 #include "src/unistd/getcwd.h"
 
@@ -31,12 +31,12 @@ TEST_MAIN(int argc, char **argv, char **envp) {
   cwd = LIBC_NAMESPACE::getcwd(buffer, 0);
   ASSERT_TRUE(cwd == nullptr);
   ASSERT_ERRNO_EQ(EINVAL);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   // Insufficient size
   cwd = LIBC_NAMESPACE::getcwd(buffer, 2);
   ASSERT_TRUE(cwd == nullptr);
-  int err = LIBC_NAMESPACE::libc_errno;
+  int err = libc_errno;
   ASSERT_EQ(err, ERANGE);
 
   return 0;
diff --git a/libc/test/integration/startup/linux/tls_test.cpp b/libc/test/integration/startup/linux/tls_test.cpp
index ef9fd9fcb7ff..de3bd06c39cf 100644
--- a/libc/test/integration/startup/linux/tls_test.cpp
+++ b/libc/test/integration/startup/linux/tls_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sys/mman/mmap.h"
 #include "test/IntegrationTest/test.h"
 
diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h
index d349192f107c..9b4844d410db 100644
--- a/libc/test/src/__support/str_to_fp_test.h
+++ b/libc/test/src/__support/str_to_fp_test.h
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
 #include "src/__support/uint128.h"
diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp
index 1ec882b212b8..40cb76a8bd6a 100644
--- a/libc/test/src/__support/str_to_integer_test.cpp
+++ b/libc/test/src/__support/str_to_integer_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/str_to_integer.h"
 #include <stddef.h>
 
diff --git a/libc/test/src/dirent/dirent_test.cpp b/libc/test/src/dirent/dirent_test.cpp
index 41f522a6a75f..3f0095ca5ebe 100644
--- a/libc/test/src/dirent/dirent_test.cpp
+++ b/libc/test/src/dirent/dirent_test.cpp
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/libc_errno.h"
 #include "src/dirent/closedir.h"
 #include "src/dirent/dirfd.h"
 #include "src/dirent/opendir.h"
 #include "src/dirent/readdir.h"
-#include "src/errno/libc_errno.h"
 
 #include "test/UnitTest/Test.h"
 
@@ -55,17 +55,17 @@ TEST(LlvmLibcDirentTest, SimpleOpenAndRead) {
 }
 
 TEST(LlvmLibcDirentTest, OpenNonExistentDir) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ::DIR *dir = LIBC_NAMESPACE::opendir("___xyz123__.non_existent__");
   ASSERT_TRUE(dir == nullptr);
   ASSERT_ERRNO_EQ(ENOENT);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 }
 
 TEST(LlvmLibcDirentTest, OpenFile) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ::DIR *dir = LIBC_NAMESPACE::opendir("testdata/file1.txt");
   ASSERT_TRUE(dir == nullptr);
   ASSERT_ERRNO_EQ(ENOTDIR);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 }
diff --git a/libc/test/src/errno/errno_test.cpp b/libc/test/src/errno/errno_test.cpp
index b0db22a85f3b..de82b0077f17 100644
--- a/libc/test/src/errno/errno_test.cpp
+++ b/libc/test/src/errno/errno_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcErrnoTest, Basic) {
   int test_val = 123;
-  LIBC_NAMESPACE::libc_errno = test_val;
+  libc_errno = test_val;
   ASSERT_ERRNO_EQ(test_val);
 }
diff --git a/libc/test/src/fcntl/creat_test.cpp b/libc/test/src/fcntl/creat_test.cpp
index 4c9d2cbc33f4..d60c98493470 100644
--- a/libc/test/src/fcntl/creat_test.cpp
+++ b/libc/test/src/fcntl/creat_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/creat.h"
 #include "src/fcntl/open.h"
 #include "src/unistd/close.h"
diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp
index 1a21afe51085..082c42481777 100644
--- a/libc/test/src/fcntl/fcntl_test.cpp
+++ b/libc/test/src/fcntl/fcntl_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/fcntl_macros.h"
 #include "hdr/stdio_macros.h"
 #include "hdr/types/struct_flock.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/fcntl.h"
 #include "src/fcntl/open.h"
 #include "src/unistd/close.h"
@@ -166,7 +166,7 @@ TEST(LlvmLibcFcntlTest, UseAfterClose) {
 }
 
 TEST(LlvmLibcFcntlTest, SetGetOwnerTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   pid_t pid = LIBC_NAMESPACE::getpid();
   ASSERT_GT(pid, -1);
diff --git a/libc/test/src/fcntl/openat_test.cpp b/libc/test/src/fcntl/openat_test.cpp
index 213b074799c8..1997476f16a6 100644
--- a/libc/test/src/fcntl/openat_test.cpp
+++ b/libc/test/src/fcntl/openat_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/fcntl/openat.h"
 #include "src/unistd/close.h"
diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h
index 77b465a3a0e6..6af9cfea0e0a 100644
--- a/libc/test/src/math/RoundToIntegerTest.h
+++ b/libc/test/src/math/RoundToIntegerTest.h
@@ -55,7 +55,7 @@ private:
 
   void test_one_input(RoundToIntegerFunc func, FloatType input,
                       IntType expected, bool expectError) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
 
     ASSERT_EQ(func(input), expected);
diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp
index 2e4c8eb2ab96..aa0128fee999 100644
--- a/libc/test/src/math/acosf_test.cpp
+++ b/libc/test/src/math/acosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAcosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acosf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/acoshf16_test.cpp b/libc/test/src/math/acoshf16_test.cpp
index 7348018396bd..2eb95215e4e8 100644
--- a/libc/test/src/math/acoshf16_test.cpp
+++ b/libc/test/src/math/acoshf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acoshf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp
index 18ed5a11d50a..3d3b827411a4 100644
--- a/libc/test/src/math/acoshf_test.cpp
+++ b/libc/test/src/math/acoshf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acoshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcAcoshfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acoshf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/asin_test.cpp b/libc/test/src/math/asin_test.cpp
index 385e341318ae..03ae963e9f92 100644
--- a/libc/test/src/math/asin_test.cpp
+++ b/libc/test/src/math/asin_test.cpp
@@ -38,7 +38,7 @@ TEST_F(LlvmLibcAsinTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::asin(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp
index 5197810d8bd5..1eaa6b8a5135 100644
--- a/libc/test/src/math/asinf_test.cpp
+++ b/libc/test/src/math/asinf_test.cpp
@@ -9,7 +9,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -22,7 +22,7 @@ using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcAsinfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp
index ac125c3520c4..8c78f939cabf 100644
--- a/libc/test/src/math/asinhf_test.cpp
+++ b/libc/test/src/math/asinhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcAsinhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinhf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/atan2f_test.cpp b/libc/test/src/math/atan2f_test.cpp
index 331f4281af83..50ab38208089 100644
--- a/libc/test/src/math/atan2f_test.cpp
+++ b/libc/test/src/math/atan2f_test.cpp
@@ -81,7 +81,7 @@ TEST_F(LlvmLibcAtan2fTest, InFloatRange) {
         if (FPBits(w).is_nan() || FPBits(w).is_inf())
           continue;
 
-        LIBC_NAMESPACE::libc_errno = 0;
+        libc_errno = 0;
         float result = LIBC_NAMESPACE::atan2f(x, y);
         ++total_count;
         if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/atan_test.cpp b/libc/test/src/math/atan_test.cpp
index 7f52578b9efe..7fa0dffd607e 100644
--- a/libc/test/src/math/atan_test.cpp
+++ b/libc/test/src/math/atan_test.cpp
@@ -39,7 +39,7 @@ TEST_F(LlvmLibcAtanTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::atan(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp
index 575ec89bd493..a4bdf1867c39 100644
--- a/libc/test/src/math/atanf_test.cpp
+++ b/libc/test/src/math/atanf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -23,7 +23,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 // TODO: This test needs to have its checks for exceptions, errno
 // tightened
 TEST_F(LlvmLibcAtanfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanf(aNaN));
   // TODO: Uncomment these checks later, RoundingMode affects running
diff --git a/libc/test/src/math/atanhf_test.cpp b/libc/test/src/math/atanhf_test.cpp
index 8b9db1dfdd97..32272ef482ab 100644
--- a/libc/test/src/math/atanhf_test.cpp
+++ b/libc/test/src/math/atanhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -25,7 +25,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 // tightened https://github.com/llvm/llvm-project/issues/88819.
 TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) {
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(aNaN));
   // TODO: Uncomment these checks later, RoundingMode affects running
diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp
index 2143c36f3d30..90dc8ff6a0ea 100644
--- a/libc/test/src/math/cosf_test.cpp
+++ b/libc/test/src/math/cosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -23,7 +23,7 @@ using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcCosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cosf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp
index 0d1c322b8e62..bdaba50f1f14 100644
--- a/libc/test/src/math/coshf_test.cpp
+++ b/libc/test/src/math/coshf_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/coshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -22,7 +22,7 @@ using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcCoshfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::coshf(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -41,7 +41,7 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcCoshfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp
index 37ec2516f6a3..cb88bfcade0d 100644
--- a/libc/test/src/math/cospif_test.cpp
+++ b/libc/test/src/math/cospif_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cospif.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/src/math/sdcomp26094.h"
@@ -19,7 +19,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcCospifTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp
index 6fb1d2d9d925..6126e5f211ff 100644
--- a/libc/test/src/math/exp10_test.cpp
+++ b/libc/test/src/math/exp10_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -105,7 +105,7 @@ TEST_F(LlvmLibcExp10Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::exp10(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp
index 001b37809d93..89915961c9b9 100644
--- a/libc/test/src/math/exp10f_test.cpp
+++ b/libc/test/src/math/exp10f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcExp10fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp10f(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -55,7 +55,7 @@ TEST_F(LlvmLibcExp10fTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExp10fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       0.0f, LIBC_NAMESPACE::exp10f(FPBits(0xff7fffffU).get_val()),
       FE_UNDERFLOW);
@@ -97,7 +97,7 @@ TEST_F(LlvmLibcExp10fTest, TrickyInputs) {
       0x41200000, // x = 10.0f
   };
   for (int i = 0; i < N; ++i) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float x = FPBits(INPUTS[i]).get_val();
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x,
                                    LIBC_NAMESPACE::exp10f(x), 0.5);
@@ -113,15 +113,14 @@ TEST_F(LlvmLibcExp10fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::exp10f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x,
                                    LIBC_NAMESPACE::exp10f(x), 0.5);
diff --git a/libc/test/src/math/exp10m1f_test.cpp b/libc/test/src/math/exp10m1f_test.cpp
index aee273384f1a..01802bd68f7e 100644
--- a/libc/test/src/math/exp10m1f_test.cpp
+++ b/libc/test/src/math/exp10m1f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10m1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -69,7 +69,7 @@ TEST_F(LlvmLibcExp10m1fTest, TrickyInputs) {
   };
 
   for (float x : INPUTS) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
                                    LIBC_NAMESPACE::exp10m1f(x), 0.5);
   }
@@ -82,14 +82,14 @@ TEST_F(LlvmLibcExp10m1fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_inf_or_nan())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::exp10m1f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_inf_or_nan() || LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_inf_or_nan() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
                                    LIBC_NAMESPACE::exp10m1f(x), 0.5);
diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp
index adfceceeef4b..4cd95dd5486e 100644
--- a/libc/test/src/math/exp2_test.cpp
+++ b/libc/test/src/math/exp2_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -80,7 +80,7 @@ TEST_F(LlvmLibcExp2Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::exp2(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp
index 0c4c82153439..aeecb3e74b07 100644
--- a/libc/test/src/math/exp2f_test.cpp
+++ b/libc/test/src/math/exp2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcExp2fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp2f(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -71,7 +71,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) {
       0xc3150000U, /*-0x1.2ap+7f*/
   };
   for (int i = 0; i < N; ++i) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float x = FPBits(INPUTS[i]).get_val();
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
                                    LIBC_NAMESPACE::exp2f(x), 0.5);
@@ -80,7 +80,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) {
 }
 
 TEST_F(LlvmLibcExp2fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       0.0f, LIBC_NAMESPACE::exp2f(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -108,15 +108,14 @@ TEST_F(LlvmLibcExp2fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::exp2f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
                                    LIBC_NAMESPACE::exp2f(x), 0.5);
diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp
index 793cf0cc2cbb..0c87657abc08 100644
--- a/libc/test/src/math/exp2m1f_test.cpp
+++ b/libc/test/src/math/exp2m1f_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2m1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -38,7 +38,7 @@ TEST_F(LlvmLibcExp2m1fTest, TrickyInputs) {
   };
 
   for (float x : INPUTS) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x,
                                    LIBC_NAMESPACE::exp2m1f(x), 0.5);
   }
@@ -51,15 +51,14 @@ TEST_F(LlvmLibcExp2m1fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::exp2m1f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x,
                                    LIBC_NAMESPACE::exp2m1f(x), 0.5);
diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp
index 0ab3a4e54346..83addaeb943d 100644
--- a/libc/test/src/math/exp_test.cpp
+++ b/libc/test/src/math/exp_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -78,7 +78,7 @@ TEST_F(LlvmLibcExpTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::exp(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp
index 26a0bca4ce25..3c10812ff5bc 100644
--- a/libc/test/src/math/expf_test.cpp
+++ b/libc/test/src/math/expf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcExpfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::expf(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -55,7 +55,7 @@ TEST_F(LlvmLibcExpfTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExpfTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       0.0f, LIBC_NAMESPACE::expf(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -76,7 +76,7 @@ TEST_F(LlvmLibcExpfTest, Underflow) {
 TEST_F(LlvmLibcExpfTest, Borderline) {
   float x;
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   x = FPBits(0x42affff8U).get_val();
   ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
                                  LIBC_NAMESPACE::expf(x), 0.5);
@@ -110,15 +110,14 @@ TEST_F(LlvmLibcExpfTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::expf(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
                                    LIBC_NAMESPACE::expf(x), 0.5);
diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp
index 9720773d9f96..0cf07e2e4973 100644
--- a/libc/test/src/math/expm1_test.cpp
+++ b/libc/test/src/math/expm1_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -64,7 +64,7 @@ TEST_F(LlvmLibcExpm1Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::expm1(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp
index 274fe3bb7afb..cf3fe9c26ae1 100644
--- a/libc/test/src/math/expm1f_test.cpp
+++ b/libc/test/src/math/expm1f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::expm1f(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpm1fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -55,7 +55,7 @@ TEST_F(LlvmLibcExpm1fTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExpm1fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(-1.0f, LIBC_NAMESPACE::expm1f(FPBits(0xff7fffffU).get_val()));
 
   float x = FPBits(0xc2cffff8U).get_val();
@@ -70,7 +70,7 @@ TEST_F(LlvmLibcExpm1fTest, Underflow) {
 TEST_F(LlvmLibcExpm1fTest, Borderline) {
   float x;
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   x = FPBits(0x42affff8U).get_val();
   ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
                                  LIBC_NAMESPACE::expm1f(x), 0.5);
@@ -119,15 +119,14 @@ TEST_F(LlvmLibcExpm1fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::expm1f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
                                    LIBC_NAMESPACE::expm1f(x), 0.5);
diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp
index 01aa1f82ae5d..e9529d87c388 100644
--- a/libc/test/src/math/log10_test.cpp
+++ b/libc/test/src/math/log10_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -101,7 +101,7 @@ TEST_F(LlvmLibcLog10Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::log10(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp
index 107e965a0d3a..e5747b7e5ec0 100644
--- a/libc/test/src/math/log1p_test.cpp
+++ b/libc/test/src/math/log1p_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log1p.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -102,7 +102,7 @@ TEST_F(LlvmLibcLog1pTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::log1p(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp
index bb181dc5e43b..ffe2dd2c33dd 100644
--- a/libc/test/src/math/log1pf_test.cpp
+++ b/libc/test/src/math/log1pf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log1pf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -75,7 +75,7 @@ TEST_F(LlvmLibcLog1pfTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x,
                                    LIBC_NAMESPACE::log1pf(x), 0.5);
   }
diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp
index 8a07991a6888..fc440c09b42b 100644
--- a/libc/test/src/math/log2_test.cpp
+++ b/libc/test/src/math/log2_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -100,7 +100,7 @@ TEST_F(LlvmLibcLog2Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::log2(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp
index 83691fb75300..92226c763f45 100644
--- a/libc/test/src/math/log2f_test.cpp
+++ b/libc/test/src/math/log2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -52,14 +52,13 @@ TEST_F(LlvmLibcLog2fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::log2f(x);
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x,
                                    LIBC_NAMESPACE::log2f(x), 0.5);
diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp
index 969a469b2e1c..54afaa33d135 100644
--- a/libc/test/src/math/log_test.cpp
+++ b/libc/test/src/math/log_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -99,7 +99,7 @@ TEST_F(LlvmLibcLogTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::log(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp
index 448dcc0035e9..4d189d813e58 100644
--- a/libc/test/src/math/powf_test.cpp
+++ b/libc/test/src/math/powf_test.cpp
@@ -78,7 +78,7 @@ TEST_F(LlvmLibcPowfTest, InFloatRange) {
         if (FPBits(w).is_nan() || FPBits(w).is_inf())
           continue;
 
-        LIBC_NAMESPACE::libc_errno = 0;
+        libc_errno = 0;
         float result = LIBC_NAMESPACE::powf(x, y);
         ++cc;
         if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp
index d4c6bd416a40..4d5d9ddf464b 100644
--- a/libc/test/src/math/sin_test.cpp
+++ b/libc/test/src/math/sin_test.cpp
@@ -71,7 +71,7 @@ TEST_F(LlvmLibcSinTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::sin(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp
index 2823110331f3..ad2155f329cd 100644
--- a/libc/test/src/math/sincosf_test.cpp
+++ b/libc/test/src/math/sincosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sincosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   float sin, cos;
 
   LIBC_NAMESPACE::sincosf(aNaN, &sin, &cos);
diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp
index 8fd3ed1577ce..e0357e6157fd 100644
--- a/libc/test/src/math/sinf_test.cpp
+++ b/libc/test/src/math/sinf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcSinfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp
index 6867c7aec57d..74f906ebaa98 100644
--- a/libc/test/src/math/sinhf_test.cpp
+++ b/libc/test/src/math/sinhf_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -22,7 +22,7 @@ using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcSinhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinhf(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -65,7 +65,7 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) {
 }
 
 TEST_F(LlvmLibcSinhfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp
index d00fd77d288c..986c676761f0 100644
--- a/libc/test/src/math/sinpif_test.cpp
+++ b/libc/test/src/math/sinpif_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinpif.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/src/math/sdcomp26094.h"
@@ -21,7 +21,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcSinpifTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h
index 8fbcc2a27654..04cbc659ece5 100644
--- a/libc/test/src/math/smoke/FModTest.h
+++ b/libc/test/src/math/smoke/FModTest.h
@@ -10,7 +10,7 @@
 #define LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
 
 #include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "test/UnitTest/FEnvSafeTest.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h
index 6ae97ce35a0d..745ccbc748ec 100644
--- a/libc/test/src/math/smoke/RoundToIntegerTest.h
+++ b/libc/test/src/math/smoke/RoundToIntegerTest.h
@@ -40,7 +40,7 @@ private:
 
   void test_one_input(RoundToIntegerFunc func, F input, I expected,
                       bool expectError) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
 
     ASSERT_EQ(func(input), expected);
diff --git a/libc/test/src/math/smoke/acos_test.cpp b/libc/test/src/math/smoke/acos_test.cpp
index 3a59bce26407..fe2caefb52ab 100644
--- a/libc/test/src/math/smoke/acos_test.cpp
+++ b/libc/test/src/math/smoke/acos_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/fenv_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acos.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ TEST_F(LlvmLibcAcosTest, SpecialNumbers) {
   EXPECT_FP_EQ(0x1.921fb54442d18p0, LIBC_NAMESPACE::acos(zero));
   EXPECT_FP_EQ(0x1.921fb54442d18p0, LIBC_NAMESPACE::acos(neg_zero));
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acos(inf),
                                            FE_INVALID);
   EXPECT_MATH_ERRNO(EDOM);
diff --git a/libc/test/src/math/smoke/acosf16_test.cpp b/libc/test/src/math/smoke/acosf16_test.cpp
index c4274b824509..7103dc33fec3 100644
--- a/libc/test/src/math/smoke/acosf16_test.cpp
+++ b/libc/test/src/math/smoke/acosf16_test.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acosf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcAcosf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAcosf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acosf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/acosf_test.cpp b/libc/test/src/math/smoke/acosf_test.cpp
index 74f68e00011a..257c6a3d1d22 100644
--- a/libc/test/src/math/smoke/acosf_test.cpp
+++ b/libc/test/src/math/smoke/acosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAcosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acosf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/acoshf16_test.cpp b/libc/test/src/math/smoke/acoshf16_test.cpp
index 7681c2a4e7fb..6b9c995cf992 100644
--- a/libc/test/src/math/smoke/acoshf16_test.cpp
+++ b/libc/test/src/math/smoke/acoshf16_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acoshf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcAcoshf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAcoshf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acoshf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/acoshf_test.cpp b/libc/test/src/math/smoke/acoshf_test.cpp
index c5ba88055ac5..b6abfab99929 100644
--- a/libc/test/src/math/smoke/acoshf_test.cpp
+++ b/libc/test/src/math/smoke/acoshf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acoshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAcoshfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acoshf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/acospif16_test.cpp b/libc/test/src/math/smoke/acospif16_test.cpp
index 66b94706eab9..4b2f6de3f7e3 100644
--- a/libc/test/src/math/smoke/acospif16_test.cpp
+++ b/libc/test/src/math/smoke/acospif16_test.cpp
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acospif16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
 using LlvmLibcAcospif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 TEST_F(LlvmLibcAcospif16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acospif16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/asinf16_test.cpp b/libc/test/src/math/smoke/asinf16_test.cpp
index 9f675b08319c..b03f0a420a49 100644
--- a/libc/test/src/math/smoke/asinf16_test.cpp
+++ b/libc/test/src/math/smoke/asinf16_test.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcAsinf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAsinf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/asinf_test.cpp b/libc/test/src/math/smoke/asinf_test.cpp
index d817d2b36619..2615a8ddd16b 100644
--- a/libc/test/src/math/smoke/asinf_test.cpp
+++ b/libc/test/src/math/smoke/asinf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAsinfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/asinhf16_test.cpp b/libc/test/src/math/smoke/asinhf16_test.cpp
index dcaab217331c..7f612ce3c467 100644
--- a/libc/test/src/math/smoke/asinhf16_test.cpp
+++ b/libc/test/src/math/smoke/asinhf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinhf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcAsinhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAsinhf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinhf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/asinhf_test.cpp b/libc/test/src/math/smoke/asinhf_test.cpp
index 4a8743c50075..d812a2dffe8a 100644
--- a/libc/test/src/math/smoke/asinhf_test.cpp
+++ b/libc/test/src/math/smoke/asinhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAsinhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinhf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/atan2f_test.cpp b/libc/test/src/math/smoke/atan2f_test.cpp
index 1fbcfbe96b2d..7f8cfb9830d2 100644
--- a/libc/test/src/math/smoke/atan2f_test.cpp
+++ b/libc/test/src/math/smoke/atan2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atan2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcAtan2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAtan2fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2f(sNaN, sNaN),
                               FE_INVALID);
diff --git a/libc/test/src/math/smoke/atanf16_test.cpp b/libc/test/src/math/smoke/atanf16_test.cpp
index af50287d9b22..ba1e3b2fc8be 100644
--- a/libc/test/src/math/smoke/atanf16_test.cpp
+++ b/libc/test/src/math/smoke/atanf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcAtanf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAtanf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::atanf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp
index 7d09a28beaa3..b56b9d0162b9 100644
--- a/libc/test/src/math/smoke/atanf_test.cpp
+++ b/libc/test/src/math/smoke/atanf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAtanfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/atanhf16_test.cpp b/libc/test/src/math/smoke/atanhf16_test.cpp
index 81df6da8cee2..c2a520f7638f 100644
--- a/libc/test/src/math/smoke/atanhf16_test.cpp
+++ b/libc/test/src/math/smoke/atanhf16_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanhf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcAtanhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAtanhf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf16(sNaN),
                                            FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp
index 73a5b81b0240..038cb30d89a4 100644
--- a/libc/test/src/math/smoke/atanhf_test.cpp
+++ b/libc/test/src/math/smoke/atanhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -20,7 +20,7 @@ using LIBC_NAMESPACE::Sign;
 using LlvmLibcAtanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanhf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
   // TODO: Strengthen errno,exception checks and remove these assert macros
diff --git a/libc/test/src/math/smoke/cosf16_test.cpp b/libc/test/src/math/smoke/cosf16_test.cpp
index 2638551fb1d1..4362a5a3a4bd 100644
--- a/libc/test/src/math/smoke/cosf16_test.cpp
+++ b/libc/test/src/math/smoke/cosf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cosf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcCosf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcCosf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/cosf_test.cpp b/libc/test/src/math/smoke/cosf_test.cpp
index 99773583dcb1..470a876c63a7 100644
--- a/libc/test/src/math/smoke/cosf_test.cpp
+++ b/libc/test/src/math/smoke/cosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcCosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/coshf16_test.cpp b/libc/test/src/math/smoke/coshf16_test.cpp
index 08d05ecce86b..7bf62afa24c4 100644
--- a/libc/test/src/math/smoke/coshf16_test.cpp
+++ b/libc/test/src/math/smoke/coshf16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/coshf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcCoshf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::coshf16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcCoshf16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::coshf16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp
index 1611ea1b9292..ee8f0199df3b 100644
--- a/libc/test/src/math/smoke/coshf_test.cpp
+++ b/libc/test/src/math/smoke/coshf_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/coshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -19,7 +19,7 @@
 using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcCoshfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::coshf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -41,7 +41,7 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcCoshfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/cospif16_test.cpp b/libc/test/src/math/smoke/cospif16_test.cpp
index edd8ed97b30f..fcde0cc79e35 100644
--- a/libc/test/src/math/smoke/cospif16_test.cpp
+++ b/libc/test/src/math/smoke/cospif16_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cospif16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcCospif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcCospif16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/cospif_test.cpp b/libc/test/src/math/smoke/cospif_test.cpp
index 20153897dc45..3d48909cca93 100644
--- a/libc/test/src/math/smoke/cospif_test.cpp
+++ b/libc/test/src/math/smoke/cospif_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cospif.h"
 #include "test/UnitTest/FPMatcher.h"
 
@@ -15,7 +15,7 @@
 using LlvmLibcCospifTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcCospifTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/exp10_test.cpp b/libc/test/src/math/smoke/exp10_test.cpp
index baf8a7681097..50d3de0c7fe7 100644
--- a/libc/test/src/math/smoke/exp10_test.cpp
+++ b/libc/test/src/math/smoke/exp10_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/exp10f16_test.cpp b/libc/test/src/math/smoke/exp10f16_test.cpp
index 1c4ef2aa08a7..bda40348f883 100644
--- a/libc/test/src/math/smoke/exp10f16_test.cpp
+++ b/libc/test/src/math/smoke/exp10f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcExp10f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExp10f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10f16(max_normal),
                               FE_OVERFLOW);
@@ -53,7 +53,7 @@ TEST_F(LlvmLibcExp10f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExp10f16Test, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::exp10f16(neg_max_normal),
                               FE_UNDERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp
index bf39e2cc12d0..fcd334bb9e36 100644
--- a/libc/test/src/math/smoke/exp10f_test.cpp
+++ b/libc/test/src/math/smoke/exp10f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExp10fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -44,7 +44,7 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/exp10m1f16_test.cpp b/libc/test/src/math/smoke/exp10m1f16_test.cpp
index dfa7fa477d3d..ed2d5a48b316 100644
--- a/libc/test/src/math/smoke/exp10m1f16_test.cpp
+++ b/libc/test/src/math/smoke/exp10m1f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10m1f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10m1f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10m1f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
@@ -67,7 +67,7 @@ TEST_F(LlvmLibcExp10m1f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExp10m1f16Test, ResultNearNegOne) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
                               LIBC_NAMESPACE::exp10m1f16(neg_max_normal),
diff --git a/libc/test/src/math/smoke/exp10m1f_test.cpp b/libc/test/src/math/smoke/exp10m1f_test.cpp
index 2c2cfdbb08a3..19369a897aaa 100644
--- a/libc/test/src/math/smoke/exp10m1f_test.cpp
+++ b/libc/test/src/math/smoke/exp10m1f_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10m1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcExp10m1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExp10m1fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10m1f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -34,7 +34,7 @@ TEST_F(LlvmLibcExp10m1fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10m1fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f(0x1.fffffep+127f),
                               FE_OVERFLOW);
@@ -50,7 +50,7 @@ TEST_F(LlvmLibcExp10m1fTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExp10m1fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp10m1f(-max_normal),
                               FE_UNDERFLOW);
diff --git a/libc/test/src/math/smoke/exp2_test.cpp b/libc/test/src/math/smoke/exp2_test.cpp
index 9ab9129416da..aebf80835072 100644
--- a/libc/test/src/math/smoke/exp2_test.cpp
+++ b/libc/test/src/math/smoke/exp2_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/exp2f16_test.cpp b/libc/test/src/math/smoke/exp2f16_test.cpp
index f69b33a3cf37..1eb7343dcd22 100644
--- a/libc/test/src/math/smoke/exp2f16_test.cpp
+++ b/libc/test/src/math/smoke/exp2f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcExp2f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExp2f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp2f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2f16(max_normal),
                               FE_OVERFLOW);
@@ -53,7 +53,7 @@ TEST_F(LlvmLibcExp2f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExp2f16Test, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::exp2f16(neg_max_normal),
                               FE_UNDERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp
index a928389cc41b..c5243273d9ed 100644
--- a/libc/test/src/math/smoke/exp2f_test.cpp
+++ b/libc/test/src/math/smoke/exp2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExp2fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -45,7 +45,7 @@ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/exp2m1f16_test.cpp b/libc/test/src/math/smoke/exp2m1f16_test.cpp
index f423196a7036..635b7a6e187d 100644
--- a/libc/test/src/math/smoke/exp2m1f16_test.cpp
+++ b/libc/test/src/math/smoke/exp2m1f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2m1f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcExp2m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExp2m1f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2m1f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -39,7 +39,7 @@ TEST_F(LlvmLibcExp2m1f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2m1f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
@@ -65,7 +65,7 @@ TEST_F(LlvmLibcExp2m1f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExp2m1f16Test, ResultNearNegOne) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(-1.0, LIBC_NAMESPACE::exp2m1f16(neg_max_normal),
                               FE_INEXACT);
diff --git a/libc/test/src/math/smoke/exp2m1f_test.cpp b/libc/test/src/math/smoke/exp2m1f_test.cpp
index 99bdf0035df0..63852e11655a 100644
--- a/libc/test/src/math/smoke/exp2m1f_test.cpp
+++ b/libc/test/src/math/smoke/exp2m1f_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2m1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@ using LIBC_NAMESPACE::fputil::testing::ForceRoundingMode;
 using LIBC_NAMESPACE::fputil::testing::RoundingMode;
 
 TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2m1f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -34,7 +34,7 @@ TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2m1fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(0x1.fffffep+127),
                               FE_OVERFLOW);
@@ -50,7 +50,7 @@ TEST_F(LlvmLibcExp2m1fTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExp2m1fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-0x1.fffffep+127),
                               FE_UNDERFLOW);
diff --git a/libc/test/src/math/smoke/exp_test.cpp b/libc/test/src/math/smoke/exp_test.cpp
index f86243092f1f..c3b2ae70e1d9 100644
--- a/libc/test/src/math/smoke/exp_test.cpp
+++ b/libc/test/src/math/smoke/exp_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/expf16_test.cpp b/libc/test/src/math/smoke/expf16_test.cpp
index ab745a3cf6f5..863f694ffc41 100644
--- a/libc/test/src/math/smoke/expf16_test.cpp
+++ b/libc/test/src/math/smoke/expf16_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -17,7 +17,7 @@
 using LlvmLibcExpf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExpf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expf16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -41,7 +41,7 @@ TEST_F(LlvmLibcExpf16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpf16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expf16(max_normal),
                               FE_OVERFLOW);
@@ -54,7 +54,7 @@ TEST_F(LlvmLibcExpf16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExpf16Test, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::expf16(neg_max_normal),
                               FE_UNDERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp
index eee830499927..d34151735afa 100644
--- a/libc/test/src/math/smoke/expf_test.cpp
+++ b/libc/test/src/math/smoke/expf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExpfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/expm1_test.cpp b/libc/test/src/math/smoke/expm1_test.cpp
index bc71c53abc7a..c842fe3c45fe 100644
--- a/libc/test/src/math/smoke/expm1_test.cpp
+++ b/libc/test/src/math/smoke/expm1_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/expm1f16_test.cpp b/libc/test/src/math/smoke/expm1f16_test.cpp
index f297c5dfc3c7..4d19a9bac5eb 100644
--- a/libc/test/src/math/smoke/expm1f16_test.cpp
+++ b/libc/test/src/math/smoke/expm1f16_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -17,7 +17,7 @@
 using LlvmLibcExpm1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expm1f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpm1f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expm1f16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
@@ -67,7 +67,7 @@ TEST_F(LlvmLibcExpm1f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExpm1f16Test, ResultNearNegOne) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
                               LIBC_NAMESPACE::expm1f16(neg_max_normal),
diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp
index dfb474d70fb6..214bfe8abd4d 100644
--- a/libc/test/src/math/smoke/expm1f_test.cpp
+++ b/libc/test/src/math/smoke/expm1f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expm1f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpm1fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/log10_test.cpp b/libc/test/src/math/smoke/log10_test.cpp
index ff73850c5210..49cfda85111a 100644
--- a/libc/test/src/math/smoke/log10_test.cpp
+++ b/libc/test/src/math/smoke/log10_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log10f16_test.cpp b/libc/test/src/math/smoke/log10f16_test.cpp
index 471e19893332..53f5ac46aa60 100644
--- a/libc/test/src/math/smoke/log10f16_test.cpp
+++ b/libc/test/src/math/smoke/log10f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log10f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcLog10f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcLog10f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log10f16(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/log1p_test.cpp b/libc/test/src/math/smoke/log1p_test.cpp
index 631c24b8abcf..61c56cd2c6dd 100644
--- a/libc/test/src/math/smoke/log1p_test.cpp
+++ b/libc/test/src/math/smoke/log1p_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log1p.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log1pf_test.cpp b/libc/test/src/math/smoke/log1pf_test.cpp
index bd828ad58c4c..dc3489fddf99 100644
--- a/libc/test/src/math/smoke/log1pf_test.cpp
+++ b/libc/test/src/math/smoke/log1pf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log1pf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log2_test.cpp b/libc/test/src/math/smoke/log2_test.cpp
index 9993d442967c..0534d00b1f40 100644
--- a/libc/test/src/math/smoke/log2_test.cpp
+++ b/libc/test/src/math/smoke/log2_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log2f16_test.cpp b/libc/test/src/math/smoke/log2f16_test.cpp
index 6d98482aa449..fd20652d2f00 100644
--- a/libc/test/src/math/smoke/log2f16_test.cpp
+++ b/libc/test/src/math/smoke/log2f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcLog2f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcLog2f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log2f16(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/log2f_test.cpp b/libc/test/src/math/smoke/log2f_test.cpp
index 8648b75b88b8..53d54ac36763 100644
--- a/libc/test/src/math/smoke/log2f_test.cpp
+++ b/libc/test/src/math/smoke/log2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log_test.cpp b/libc/test/src/math/smoke/log_test.cpp
index d31eb0c1db73..09e9ab0a9a4d 100644
--- a/libc/test/src/math/smoke/log_test.cpp
+++ b/libc/test/src/math/smoke/log_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/logf16_test.cpp b/libc/test/src/math/smoke/logf16_test.cpp
index c7232aa1c1e3..2784f3d5fa54 100644
--- a/libc/test/src/math/smoke/logf16_test.cpp
+++ b/libc/test/src/math/smoke/logf16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/logf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcLogf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcLogf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::logf16(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/sincosf_test.cpp b/libc/test/src/math/smoke/sincosf_test.cpp
index 5f66868f12a1..8ba0d04347bb 100644
--- a/libc/test/src/math/smoke/sincosf_test.cpp
+++ b/libc/test/src/math/smoke/sincosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sincosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcSinCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   float sin, cos;
 
   LIBC_NAMESPACE::sincosf(sNaN, &sin, &cos);
diff --git a/libc/test/src/math/smoke/sinf16_test.cpp b/libc/test/src/math/smoke/sinf16_test.cpp
index a0e7a7ba321f..6b168ac040db 100644
--- a/libc/test/src/math/smoke/sinf16_test.cpp
+++ b/libc/test/src/math/smoke/sinf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcSinf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcSinf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/sinf_test.cpp b/libc/test/src/math/smoke/sinf_test.cpp
index de504b4f5335..8173969fb256 100644
--- a/libc/test/src/math/smoke/sinf_test.cpp
+++ b/libc/test/src/math/smoke/sinf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcSinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcSinfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/sinhf16_test.cpp b/libc/test/src/math/smoke/sinhf16_test.cpp
index 4f21d33ba78e..d52739a9adb3 100644
--- a/libc/test/src/math/smoke/sinhf16_test.cpp
+++ b/libc/test/src/math/smoke/sinhf16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinhf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcSinhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sinhf16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -38,7 +38,7 @@ TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcSinhf16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::sinhf16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp
index e22cfc7ea14d..ea6a4474a780 100644
--- a/libc/test/src/math/smoke/sinhf_test.cpp
+++ b/libc/test/src/math/smoke/sinhf_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -19,7 +19,7 @@
 using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcSinhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinhf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -52,7 +52,7 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) {
 }
 
 TEST_F(LlvmLibcSinhfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/sinpif16_test.cpp b/libc/test/src/math/smoke/sinpif16_test.cpp
index b2db6fb9f862..9edf2cc663d4 100644
--- a/libc/test/src/math/smoke/sinpif16_test.cpp
+++ b/libc/test/src/math/smoke/sinpif16_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinpif16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcSinpif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcSinpif16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/sinpif_test.cpp b/libc/test/src/math/smoke/sinpif_test.cpp
index 1ba5c1d2b720..b840f3980eda 100644
--- a/libc/test/src/math/smoke/sinpif_test.cpp
+++ b/libc/test/src/math/smoke/sinpif_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinpif.h"
 #include "test/UnitTest/FPMatcher.h"
 
@@ -15,7 +15,7 @@
 using LlvmLibcSinpifTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcSinpifTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/tanf16_test.cpp b/libc/test/src/math/smoke/tanf16_test.cpp
index f65b9fced72c..95d200cf5591 100644
--- a/libc/test/src/math/smoke/tanf16_test.cpp
+++ b/libc/test/src/math/smoke/tanf16_test.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcTanf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/tanf_test.cpp b/libc/test/src/math/smoke/tanf_test.cpp
index 178e9065f430..12deca5cf941 100644
--- a/libc/test/src/math/smoke/tanf_test.cpp
+++ b/libc/test/src/math/smoke/tanf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcTanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcTanfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/tanhf16_test.cpp b/libc/test/src/math/smoke/tanhf16_test.cpp
index fa6328e9ef0a..eb90f02a8d7c 100644
--- a/libc/test/src/math/smoke/tanhf16_test.cpp
+++ b/libc/test/src/math/smoke/tanhf16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanhf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcTanhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcTanhf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::tanhf16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcTanhf16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcTanhf16Test, ResultNearBounds) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast<float16>(1.0),
                               LIBC_NAMESPACE::tanhf16(max_normal), FE_INEXACT);
diff --git a/libc/test/src/math/smoke/tanhf_test.cpp b/libc/test/src/math/smoke/tanhf_test.cpp
index c09761ef531f..b12a331b3190 100644
--- a/libc/test/src/math/smoke/tanhf_test.cpp
+++ b/libc/test/src/math/smoke/tanhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcTanhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanhf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/tanpif16_test.cpp b/libc/test/src/math/smoke/tanpif16_test.cpp
index 74797d1649b1..ea896d7bb3e5 100644
--- a/libc/test/src/math/smoke/tanpif16_test.cpp
+++ b/libc/test/src/math/smoke/tanpif16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanpif16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcTanpif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcTanpif16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanpif16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp
index 9061cf6fb30b..ecc70194b649 100644
--- a/libc/test/src/math/tanf_test.cpp
+++ b/libc/test/src/math/tanf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcTanfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp
index 389abe4d8589..966ce649e2b3 100644
--- a/libc/test/src/math/tanhf_test.cpp
+++ b/libc/test/src/math/tanhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcTanhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanhf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/poll/poll_test.cpp b/libc/test/src/poll/poll_test.cpp
index 30f5e41c61ec..97b7b0271817 100644
--- a/libc/test/src/poll/poll_test.cpp
+++ b/libc/test/src/poll/poll_test.cpp
@@ -7,18 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/limits_macros.h" // UINT_MAX
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/poll/poll.h"
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcPollTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   int ret = LIBC_NAMESPACE::poll(nullptr, 0, 0);
   ASSERT_ERRNO_SUCCESS();
   ASSERT_EQ(0, ret);
 }
 TEST(LlvmLibcPollTest, SmokeFailureTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   int ret = LIBC_NAMESPACE::poll(nullptr, UINT_MAX, 0);
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_EQ(-1, ret);
diff --git a/libc/test/src/sched/affinity_test.cpp b/libc/test/src/sched/affinity_test.cpp
index b5085203e5ce..b77f22f8e60d 100644
--- a/libc/test/src/sched/affinity_test.cpp
+++ b/libc/test/src/sched/affinity_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/OSUtil/syscall.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_getaffinity.h"
 #include "src/sched/sched_setaffinity.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
@@ -17,7 +17,7 @@
 
 TEST(LlvmLibcSchedAffinityTest, SmokeTest) {
   cpu_set_t mask;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   pid_t tid = LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_gettid);
   ASSERT_GT(tid, pid_t(0));
@@ -32,15 +32,15 @@ TEST(LlvmLibcSchedAffinityTest, BadMask) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   pid_t tid = LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_gettid);
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(
       LIBC_NAMESPACE::sched_getaffinity(tid, sizeof(cpu_set_t), nullptr),
       Fails(EFAULT));
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(
       LIBC_NAMESPACE::sched_setaffinity(tid, sizeof(cpu_set_t), nullptr),
       Fails(EFAULT));
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 }
diff --git a/libc/test/src/sched/cpu_count_test.cpp b/libc/test/src/sched/cpu_count_test.cpp
index 5250368a2616..919f1475e1d4 100644
--- a/libc/test/src/sched/cpu_count_test.cpp
+++ b/libc/test/src/sched/cpu_count_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/OSUtil/syscall.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_getaffinity.h"
 #include "src/sched/sched_getcpucount.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
@@ -17,7 +17,7 @@
 
 TEST(LlvmLibcSchedCpuCountTest, SmokeTest) {
   cpu_set_t mask;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   pid_t tid = LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_gettid);
   ASSERT_GT(tid, pid_t(0));
diff --git a/libc/test/src/sched/get_priority_test.cpp b/libc/test/src/sched/get_priority_test.cpp
index 59205c51e4a1..bb41dc0be201 100644
--- a/libc/test/src/sched/get_priority_test.cpp
+++ b/libc/test/src/sched/get_priority_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_get_priority_max.h"
 #include "src/sched/sched_get_priority_min.h"
 #include "test/UnitTest/Test.h"
@@ -58,7 +58,7 @@ TEST(LlvmLibcSchedGetPriorityTest, HandleBadPolicyTest) {
 }
 
 TEST(LlvmLibcSchedGetPriorityTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   // We Test:
   // SCHED_OTHER, SCHED_FIFO, SCHED_RR
diff --git a/libc/test/src/sched/param_and_scheduler_test.cpp b/libc/test/src/sched/param_and_scheduler_test.cpp
index 747c7e3409e4..4f2b6e412a4b 100644
--- a/libc/test/src/sched/param_and_scheduler_test.cpp
+++ b/libc/test/src/sched/param_and_scheduler_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_get_priority_max.h"
 #include "src/sched/sched_get_priority_min.h"
 #include "src/sched/sched_getparam.h"
@@ -37,7 +37,7 @@
 class SchedTest : public LIBC_NAMESPACE::testing::Test {
 public:
   void testSched(int policy, bool is_mandatory) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     int init_policy = LIBC_NAMESPACE::sched_getscheduler(0);
     ASSERT_GE(init_policy, 0);
@@ -55,30 +55,29 @@ public:
     // Negative pid
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(-1, policy, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     ASSERT_EQ(LIBC_NAMESPACE::sched_getscheduler(-1), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     // Invalid Policy
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy | 128, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     // Out of bounds priority
     param.sched_priority = min_priority - 1;
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     param.sched_priority = max_priority + 1;
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, &param), -1);
     // A bit hard to test as depending on user privileges we can run into
     // different issues.
-    ASSERT_TRUE(LIBC_NAMESPACE::libc_errno == EINVAL ||
-                LIBC_NAMESPACE::libc_errno == EPERM);
-    LIBC_NAMESPACE::libc_errno = 0;
+    ASSERT_TRUE(libc_errno == EINVAL || libc_errno == EPERM);
+    libc_errno = 0;
 
     param.sched_priority = min_priority;
     // Success/unsupported policy/missing permissions.
@@ -87,10 +86,9 @@ public:
     ASSERT_TRUE(setscheduler_result == 0 || setscheduler_result == -1);
     ASSERT_TRUE(
         setscheduler_result != -1
-            ? (LIBC_NAMESPACE::libc_errno == 0)
-            : ((!is_mandatory && LIBC_NAMESPACE::libc_errno == EINVAL) ||
-               LIBC_NAMESPACE::libc_errno == EPERM));
-    LIBC_NAMESPACE::libc_errno = 0;
+            ? (libc_errno == 0)
+            : ((!is_mandatory && libc_errno == EINVAL) || libc_errno == EPERM));
+    libc_errno = 0;
 
     ASSERT_EQ(LIBC_NAMESPACE::sched_getscheduler(0),
               setscheduler_result != -1 ? policy : init_policy);
@@ -100,12 +98,12 @@ public:
     param.sched_priority = -1;
     ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     param.sched_priority = max_priority + 1;
     ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     for (int priority = min_priority; priority <= max_priority; ++priority) {
       ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, &param), 0);
@@ -117,21 +115,20 @@ public:
       // Negative pid
       ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(-1, &param), -1);
       ASSERT_ERRNO_EQ(EINVAL);
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
 
       ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(-1, &param), -1);
       ASSERT_ERRNO_EQ(EINVAL);
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
 
       // Success/unsupported policy/missing permissions
       int setparam_result = LIBC_NAMESPACE::sched_setparam(0, &param);
       ASSERT_TRUE(setparam_result == 0 || setparam_result == -1);
       ASSERT_TRUE(setparam_result != -1
-                      ? (LIBC_NAMESPACE::libc_errno == 0)
-                      : ((setscheduler_result == -1 &&
-                          LIBC_NAMESPACE::libc_errno == EINVAL) ||
-                         LIBC_NAMESPACE::libc_errno == EPERM));
-      LIBC_NAMESPACE::libc_errno = 0;
+                      ? (libc_errno == 0)
+                      : ((setscheduler_result == -1 && libc_errno == EINVAL) ||
+                         libc_errno == EPERM));
+      libc_errno = 0;
 
       ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, &param), 0);
       ASSERT_ERRNO_SUCCESS();
@@ -143,7 +140,7 @@ public:
     // Null test
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, nullptr), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
   }
 };
 
@@ -161,13 +158,13 @@ LIST_SCHED_TESTS(SCHED_BATCH, true)
 LIST_SCHED_TESTS(SCHED_IDLE, true)
 
 TEST(LlvmLibcSchedParamAndSchedulerTest, NullParamTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, nullptr), -1);
   ASSERT_ERRNO_EQ(EINVAL);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, nullptr), -1);
   ASSERT_ERRNO_EQ(EINVAL);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 }
diff --git a/libc/test/src/sched/sched_rr_get_interval_test.cpp b/libc/test/src/sched/sched_rr_get_interval_test.cpp
index c22a2c76d743..a0fe5edbe014 100644
--- a/libc/test/src/sched/sched_rr_get_interval_test.cpp
+++ b/libc/test/src/sched/sched_rr_get_interval_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_get_priority_min.h"
 #include "src/sched/sched_getscheduler.h"
 #include "src/sched/sched_rr_get_interval.h"
@@ -17,7 +17,7 @@
 #include <sched.h>
 
 TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   auto SetSched = [&](int policy) {
     int min_priority = LIBC_NAMESPACE::sched_get_priority_min(policy);
     ASSERT_GE(min_priority, 0);
@@ -58,19 +58,19 @@ TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) {
     // Null timespec
     ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, nullptr), -1);
     ASSERT_ERRNO_EQ(EFAULT);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     // Negative pid
     ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(-1, &ts), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
   }
 
   // Negative tests don't have SCHED_RR set
   SetSched(SCHED_OTHER);
   ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, &ts), 0);
   ASSERT_ERRNO_SUCCESS();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   // TODO: Missing unkown pid -> ESRCH. This is read only so safe to try a few
   //       unlikely values.
diff --git a/libc/test/src/sched/yield_test.cpp b/libc/test/src/sched/yield_test.cpp
index f1627a71fa9a..4d13d50e25eb 100644
--- a/libc/test/src/sched/yield_test.cpp
+++ b/libc/test/src/sched/yield_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_yield.h"
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcSchedYieldTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   // sched_yield() always succeeds, just do a basic test that errno/ret are
   // properly 0.
   ASSERT_EQ(LIBC_NAMESPACE::sched_yield(), 0);
diff --git a/libc/test/src/signal/sigaltstack_test.cpp b/libc/test/src/signal/sigaltstack_test.cpp
index cc392da8f473..ce4dfddae248 100644
--- a/libc/test/src/signal/sigaltstack_test.cpp
+++ b/libc/test/src/signal/sigaltstack_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/signal_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 #include "src/signal/raise.h"
 #include "src/signal/sigaction.h"
@@ -46,7 +46,7 @@ static void handler(int) {
 
 TEST(LlvmLibcSignalTest, SigaltstackRunOnAltStack) {
   struct sigaction action;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGUSR1, nullptr, &action),
               Succeeds(0));
   action.sa_handler = handler;
diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp
index bac9c3b8b68b..62b86bf44029 100644
--- a/libc/test/src/signal/signal_test.cpp
+++ b/libc/test/src/signal/signal_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/signal/raise.h"
 #include "src/signal/signal.h"
 
@@ -17,7 +17,7 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
 TEST(LlvmLibcSignal, Invalid) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   auto *valid = +[](int) {};
   EXPECT_THAT((void *)LIBC_NAMESPACE::signal(0, valid),
               Fails(EINVAL, (void *)SIG_ERR));
diff --git a/libc/test/src/signal/sigprocmask_test.cpp b/libc/test/src/signal/sigprocmask_test.cpp
index 12403f68b593..891eac0f5bf7 100644
--- a/libc/test/src/signal/sigprocmask_test.cpp
+++ b/libc/test/src/signal/sigprocmask_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/signal/raise.h"
 #include "src/signal/sigaddset.h"
 #include "src/signal/sigemptyset.h"
@@ -33,7 +33,7 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
 // This tests for invalid input.
 TEST_F(LlvmLibcSignalTest, SigprocmaskInvalid) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   sigset_t valid;
   // 17 and -4 are out of the range for sigprocmask's how paramater.
diff --git a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
index c1edf56bdbd8..01ccb8218ee2 100644
--- a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
+++ b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/spawn/file_actions.h"
 #include "src/spawn/posix_spawn_file_actions_addclose.h"
 #include "src/spawn/posix_spawn_file_actions_adddup2.h"
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index ef36cff2ffbd..104fc478b100 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,7 +9,7 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
@@ -22,7 +22,7 @@
 
 TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -53,7 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
 }
 
 TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -65,7 +65,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
 }
 
 TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -83,7 +83,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 2cc8436bd66f..56bde5f0099a 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -17,7 +17,7 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
@@ -33,7 +33,7 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 46cf12c2c253..90429ecf4e82 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -20,7 +20,7 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
@@ -36,7 +36,7 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index a8a2c62f07b5..abed3d405293 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -14,7 +14,7 @@
 #include "src/stdio/fwrite.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
@@ -35,7 +35,7 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index a0368d701a67..e624181c795b 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -21,7 +21,7 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
@@ -41,7 +41,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,7 +72,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -80,15 +80,15 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -103,10 +103,10 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -121,15 +121,15 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
-  // LIBC_NAMESPACE::libc_errno = 0;
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // LIBC_NAMESPACE::libc_errno = 0;
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // LIBC_NAMESPACE::libc_errno = 0;
+  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
@@ -165,7 +165,7 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index 61ce2a207fa1..03e1ac286b64 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -20,7 +20,7 @@
 
 #include "hdr/stdio_macros.h"
 #include "hdr/types/size_t.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
@@ -67,7 +67,7 @@ int seek_ss(void *cookie, off64_t *offset, int whence) {
   } else if (whence == SEEK_END) {
     new_offset = *offset + ss->endpos;
   } else {
-    LIBC_NAMESPACE::libc_errno = EINVAL;
+    libc_errno = EINVAL;
     return -1;
   }
   if (new_offset < 0 || size_t(new_offset) > ss->bufsize)
@@ -115,7 +115,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -149,7 +149,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -178,7 +178,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 72875600903a..84984e26398c 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -14,13 +14,13 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
 TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -39,7 +39,7 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
 TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index a5dd734c6361..ac494a4ecaf8 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,7 +8,7 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
@@ -19,7 +19,7 @@
 TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index a1e1fee25db3..5872943c1bb4 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -14,7 +14,7 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
@@ -102,6 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp
index f6af6ad3e364..f1b545ba546f 100644
--- a/libc/test/src/stdio/sprintf_test.cpp
+++ b/libc/test/src/stdio/sprintf_test.cpp
@@ -10,7 +10,7 @@
 #include "src/stdio/sprintf.h"
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "test/UnitTest/RoundingModeUtils.h"
 #include "test/UnitTest/Test.h"
 #include <inttypes.h>
@@ -3228,46 +3228,46 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) {
   char buff[1000];
   int written;
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%m");
   ASSERT_STREQ_LEN(written, buff, "Success");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%m");
   ASSERT_STREQ_LEN(written, buff, "Numerical result out of range");
 
   // Check that it correctly consumes no arguments.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%m %d", 1);
   ASSERT_STREQ_LEN(written, buff, "Success 1");
 
   // Width Tests
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%10m");
   ASSERT_STREQ_LEN(written, buff, "   Success");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%10m");
   ASSERT_STREQ_LEN(written, buff, "Numerical result out of range");
 
   // Precision Tests
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%.10m");
   ASSERT_STREQ_LEN(written, buff, "Success");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%.10m");
   ASSERT_STREQ_LEN(written, buff, "Numerical ");
 
   // Flag Tests (Only '-' since the others only affect ints)
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%-10m");
   ASSERT_STREQ_LEN(written, buff, "Success   ");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%-10m");
   ASSERT_STREQ_LEN(written, buff, "Numerical result out of range");
 
@@ -3275,93 +3275,93 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) {
   // Since alt mode here is effectively a completely separate conversion, it
   // gets separate tests.
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%#m");
   ASSERT_STREQ_LEN(written, buff, "0");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
   // Alt Mode Width
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%#10m");
   ASSERT_STREQ_LEN(written, buff, "         0");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#10m");
   ASSERT_STREQ_LEN(written, buff, "    ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#10m");
   ASSERT_STREQ_LEN(written, buff, "     -9999");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#3m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#3m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
   // Alt Mode Precision
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.10m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.10m");
   ASSERT_STREQ_LEN(written, buff, "-0000009999");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.3m");
   ASSERT_STREQ_LEN(written, buff, "ERA");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.3m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
   // We don't test precision (or int flags) on errno = 0 because it behaves
   // weirdly, see the docs for more information.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.1m");
   ASSERT_STREQ_LEN(written, buff, "0");
 
   // Alt Mode Flags
 
   // '-' flag
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-10m");
   ASSERT_STREQ_LEN(written, buff, "0         ");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-10m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE    ");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-10m");
   ASSERT_STREQ_LEN(written, buff, "-9999     ");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-3m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-3m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
   // '+' flag
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#+m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#+m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
@@ -3370,38 +3370,38 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) {
   // come up, but I've avoided it for the other %m tests for ease of
   // refactoring if necessary. Here it needs to be positive to test that the
   // flags that only affect positive signed integers are properly passed along.
-  LIBC_NAMESPACE::libc_errno = 9999;
+  libc_errno = 9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#+m");
   ASSERT_STREQ_LEN(written, buff, "+9999");
 
   // ' ' flag
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%# m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%# m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
-  LIBC_NAMESPACE::libc_errno = 9999;
+  libc_errno = 9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%# m");
   ASSERT_STREQ_LEN(written, buff, " 9999");
 
   // '0' flag
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#010m");
   ASSERT_STREQ_LEN(written, buff, "    ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#010m");
   ASSERT_STREQ_LEN(written, buff, "-000009999");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#03m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#03m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index 67f1b0ff513b..5d482b70064b 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -17,7 +17,7 @@
 #include "src/stdio/fwrite_unlocked.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
@@ -36,7 +36,7 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,7 +57,7 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 03f0a6539c78..3eeccc5727e7 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,6 +9,7 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtoint32_test.cpp b/libc/test/src/stdlib/strtoint32_test.cpp
index 17df432fc8e6..e6da692714d2 100644
--- a/libc/test/src/stdlib/strtoint32_test.cpp
+++ b/libc/test/src/stdlib/strtoint32_test.cpp
@@ -8,9 +8,9 @@
 
 #include <stdint.h>
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 #include "StrtolTest.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ int32_t strtoint32(const char *__restrict str, char **__restrict str_end,
                    int base) {
   auto result = internal::strtointeger<int32_t>(str, base);
   if (result.has_error())
-    LIBC_NAMESPACE::libc_errno = result.error;
+    libc_errno = result.error;
 
   if (str_end != nullptr)
     *str_end = const_cast<char *>(str + result.parsed_len);
@@ -33,7 +33,7 @@ uint32_t strtouint32(const char *__restrict str, char **__restrict str_end,
                      int base) {
   auto result = internal::strtointeger<uint32_t>(str, base);
   if (result.has_error())
-    LIBC_NAMESPACE::libc_errno = result.error;
+    libc_errno = result.error;
 
   if (str_end != nullptr)
     *str_end = const_cast<char *>(str + result.parsed_len);
diff --git a/libc/test/src/stdlib/strtoint64_test.cpp b/libc/test/src/stdlib/strtoint64_test.cpp
index b5fe69dfaa70..2c5d948f5fae 100644
--- a/libc/test/src/stdlib/strtoint64_test.cpp
+++ b/libc/test/src/stdlib/strtoint64_test.cpp
@@ -8,9 +8,9 @@
 
 #include <stdint.h>
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 #include "StrtolTest.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ int64_t strtoint64(const char *__restrict str, char **__restrict str_end,
                    int base) {
   auto result = internal::strtointeger<int64_t>(str, base);
   if (result.has_error())
-    LIBC_NAMESPACE::libc_errno = result.error;
+    libc_errno = result.error;
 
   if (str_end != nullptr)
     *str_end = const_cast<char *>(str + result.parsed_len);
@@ -33,7 +33,7 @@ uint64_t strtouint64(const char *__restrict str, char **__restrict str_end,
                      int base) {
   auto result = internal::strtointeger<uint64_t>(str, base);
   if (result.has_error())
-    LIBC_NAMESPACE::libc_errno = result.error;
+    libc_errno = result.error;
 
   if (str_end != nullptr)
     *str_end = const_cast<char *>(str + result.parsed_len);
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index eb4056dc7ba6..c2f2b9c9a11c 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 
diff --git a/libc/test/src/sys/mman/linux/mlock_test.cpp b/libc/test/src/sys/mman/linux/mlock_test.cpp
index 88abacad554e..6b81411ca604 100644
--- a/libc/test/src/sys/mman/linux/mlock_test.cpp
+++ b/libc/test/src/sys/mman/linux/mlock_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sys/mman/madvise.h"
 #include "src/sys/mman/mincore.h"
 #include "src/sys/mman/mlock.h"
@@ -149,9 +149,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) {
         Succeeds());
     auto retval = LIBC_NAMESPACE::mlockall(MCL_CURRENT);
     if (retval == -1) {
-      EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM ||
-                  LIBC_NAMESPACE::libc_errno == EPERM);
-      LIBC_NAMESPACE::libc_errno = 0;
+      EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM);
+      libc_errno = 0;
       return;
     }
     unsigned char vec;
@@ -163,9 +162,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) {
   {
     auto retval = LIBC_NAMESPACE::mlockall(MCL_FUTURE);
     if (retval == -1) {
-      EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM ||
-                  LIBC_NAMESPACE::libc_errno == EPERM);
-      LIBC_NAMESPACE::libc_errno = 0;
+      EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM);
+      libc_errno = 0;
       return;
     }
     PageHolder holder;
@@ -180,9 +178,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) {
   {
     auto retval = LIBC_NAMESPACE::mlockall(MCL_FUTURE | MCL_ONFAULT);
     if (retval == -1) {
-      EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM ||
-                  LIBC_NAMESPACE::libc_errno == EPERM);
-      LIBC_NAMESPACE::libc_errno = 0;
+      EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM);
+      libc_errno = 0;
       return;
     }
     PageHolder holder;
diff --git a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
index 455a82678e18..ba0ee4f09109 100644
--- a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
+++ b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/sys/stat/mkdirat.h"
 #include "src/sys/statvfs/fstatvfs.h"
@@ -41,7 +41,7 @@ TEST_F(LlvmLibcSysFStatvfsTest, FStatvfsInvalidPath) {
 
   // Always delete the folder so that we start in a consistent state.
   LIBC_NAMESPACE::rmdir(TEST_DIR);
-  LIBC_NAMESPACE::libc_errno = 0; // Reset errno
+  libc_errno = 0; // Reset errno
 
   ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU),
               Succeeds(0));
diff --git a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
index f356bb3d277b..327dec07a1b7 100644
--- a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
+++ b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/stat/mkdirat.h"
 #include "src/sys/statvfs/statvfs.h"
 #include "src/unistd/rmdir.h"
@@ -37,7 +37,7 @@ TEST_F(LlvmLibcSysStatvfsTest, StatvfsInvalidPath) {
 
   // Always delete the folder so that we start in a consistent state.
   LIBC_NAMESPACE::rmdir(TEST_DIR);
-  LIBC_NAMESPACE::libc_errno = 0; // Reset errno
+  libc_errno = 0; // Reset errno
 
   ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU),
               Succeeds(0));
diff --git a/libc/test/src/sys/time/setitimer_test.cpp b/libc/test/src/sys/time/setitimer_test.cpp
index 16d33fdf1e4f..115f9e662ed4 100644
--- a/libc/test/src/sys/time/setitimer_test.cpp
+++ b/libc/test/src/sys/time/setitimer_test.cpp
@@ -24,7 +24,7 @@ static bool timer_fired(false);
 extern "C" void handle_sigalrm(int) { timer_fired = true; }
 
 TEST_F(LlvmLibcSysTimeSetitimerTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   struct sigaction sa;
   sa.sa_handler = handle_sigalrm;
   LIBC_NAMESPACE::sigemptyset(&sa.sa_mask);
diff --git a/libc/test/src/termios/termios_test.cpp b/libc/test/src/termios/termios_test.cpp
index f8fc09a8bbf0..5ec169a886b1 100644
--- a/libc/test/src/termios/termios_test.cpp
+++ b/libc/test/src/termios/termios_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/termios/cfgetispeed.h"
 #include "src/termios/cfgetospeed.h"
@@ -30,21 +30,21 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
 TEST(LlvmLibcTermiosTest, SpeedSmokeTest) {
   struct termios t;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, B50), Succeeds(0));
   ASSERT_EQ(LIBC_NAMESPACE::cfgetispeed(&t), speed_t(B50));
   ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, B75), Succeeds(0));
   ASSERT_EQ(LIBC_NAMESPACE::cfgetospeed(&t), speed_t(B75));
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, ~CBAUD), Fails(EINVAL));
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, ~CBAUD), Fails(EINVAL));
 }
 
 TEST(LlvmLibcTermiosTest, GetAttrSmokeTest) {
   struct termios t;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY);
   if (fd < 0)
     return; // When /dev/tty is not available, no point continuing.
@@ -54,7 +54,7 @@ TEST(LlvmLibcTermiosTest, GetAttrSmokeTest) {
 }
 
 TEST(LlvmLibcTermiosTest, TcGetSidSmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY);
   if (fd < 0)
     return; // When /dev/tty is not available, no point continuing.
diff --git a/libc/test/src/time/asctime_r_test.cpp b/libc/test/src/time/asctime_r_test.cpp
index b595cfe02486..d840248b7df4 100644
--- a/libc/test/src/time/asctime_r_test.cpp
+++ b/libc/test/src/time/asctime_r_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/asctime_r.h"
 #include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/time/asctime_test.cpp b/libc/test/src/time/asctime_test.cpp
index 169a7463a303..cad25fffc65a 100644
--- a/libc/test/src/time/asctime_test.cpp
+++ b/libc/test/src/time/asctime_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/asctime.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
diff --git a/libc/test/src/time/ctime_r_test.cpp b/libc/test/src/time/ctime_r_test.cpp
index 27011b7e0fbd..fe43877aa499 100644
--- a/libc/test/src/time/ctime_r_test.cpp
+++ b/libc/test/src/time/ctime_r_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/ctime_r.h"
 #include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/time/ctime_test.cpp b/libc/test/src/time/ctime_test.cpp
index 6f1168f0b668..5ff69f6619b4 100644
--- a/libc/test/src/time/ctime_test.cpp
+++ b/libc/test/src/time/ctime_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/ctime.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
diff --git a/libc/test/src/time/gmtime_test.cpp b/libc/test/src/time/gmtime_test.cpp
index 6af5a18d3699..41236665d2ea 100644
--- a/libc/test/src/time/gmtime_test.cpp
+++ b/libc/test/src/time/gmtime_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/types/struct_tm.h"
 #include "src/__support/CPP/limits.h" // INT_MAX, INT_MIN
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/gmtime.h"
 #include "src/time/time_constants.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
@@ -30,7 +30,7 @@ TEST(LlvmLibcGmTime, OutOfRange) {
   EXPECT_TRUE(tm_data == nullptr);
   ASSERT_ERRNO_EQ(EOVERFLOW);
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   seconds =
       INT_MIN *
           static_cast<int64_t>(
diff --git a/libc/test/src/time/nanosleep_test.cpp b/libc/test/src/time/nanosleep_test.cpp
index d4f98e29bd98..e0200ff3aaa2 100644
--- a/libc/test/src/time/nanosleep_test.cpp
+++ b/libc/test/src/time/nanosleep_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/types/struct_timespec.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/nanosleep.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -17,7 +17,7 @@ namespace cpp = LIBC_NAMESPACE::cpp;
 TEST(LlvmLibcNanosleep, SmokeTest) {
   // TODO: When we have the code to read clocks, test that time has passed.
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   struct timespec tim = {1, 500};
   struct timespec tim2 = {0, 0};
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index b86d2f27e516..123d9ccc8310 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1601,6 +1601,7 @@ libc_support_library(
 libc_header_library(
     name = "libcxx_shared_headers",
     hdrs = [
+        "shared/libc_common.h",
         "shared/fp_bits.h",
         "shared/str_to_float.h",
         "shared/str_to_integer.h",
@@ -1618,7 +1619,7 @@ libc_header_library(
 libc_support_library(
     name = "errno",
     srcs = ["src/errno/libc_errno.cpp"],
-    hdrs = ["src/errno/libc_errno.h"],
+    hdrs = ["src/__support/libc_errno.h"],
     deps = [
         ":__support_common",
         ":__support_cpp_atomic",

From 79108da325daec08f5b50169a9c35e03ea0645a3 Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Wed, 11 Jun 2025 20:28:55 +0000
Subject: [PATCH 0056/1322] [libc][obvious] Changed incorrect type (#143780)

After changing mbstate_t to mbstate we forgot to change the
character_converter files to reflect it.

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 libc/src/__support/wchar/character_converter.cpp | 2 +-
 libc/src/__support/wchar/character_converter.h   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 0afc2a6f59e6..3cdb8ca83b7f 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -16,7 +16,7 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; }
+CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
 bool CharacterConverter::isComplete() {}
 
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index a6bac4380537..d0602d2defe2 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -19,10 +19,10 @@ namespace internal {
 
 class CharacterConverter {
 private:
-  mbstate_t *state;
+  mbstate *state;
 
 public:
-  CharacterConverter(mbstate_t *mbstate);
+  CharacterConverter(mbstate *mbstate);
 
   bool isComplete();
 

From c0c0f60ca14422dfbfe27fddd8d47faa596165d8 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 11 Jun 2025 22:09:55 +0100
Subject: [PATCH 0057/1322] [GlobalOpt] Bail out on non-ConstExprs in
 isSimpleEnoughtToCommit. (#143400)

Bail out for non ConstantExpr constants in
isSimpleEnoughValueToCommitHelper to prevent crash for non-ConstantExpr
constants

PR: https://github.com/llvm/llvm-project/pull/143400
---
 llvm/lib/Transforms/Utils/Evaluator.cpp       |  4 +-
 .../global-constructor-complex-constants.ll   | 64 +++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll

diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index 2af447aadce2..d1db2ee29f3a 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -77,7 +77,9 @@ isSimpleEnoughValueToCommitHelper(Constant *C,
   // We don't know exactly what relocations are allowed in constant expressions,
   // so we allow &global+constantoffset, which is safe and uniformly supported
   // across targets.
-  ConstantExpr *CE = cast<ConstantExpr>(C);
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+  if (!CE)
+    return false;
   switch (CE->getOpcode()) {
   case Instruction::BitCast:
     // Bitcast is fine if the casted value is fine.
diff --git a/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll b/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll
new file mode 100644
index 000000000000..6d9bdc41a004
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -p globalopt -S %s | FileCheck %s
+
+@llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @ctor, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_nocfi, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_dso_local_equivalent, ptr null }]
+
+@foo = internal global ptr null
+
+declare void @user(ptr)
+
+;.
+; CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @ctor, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_nocfi, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_dso_local_equivalent, ptr null }]
+; CHECK: @foo = internal global ptr null
+;.
+define void @ctor() {
+; CHECK-LABEL: define void @ctor() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr ptrauth (ptr @foo, i32 0), ptr [[DST]], align 8
+; CHECK-NEXT:    call void @user(ptr [[DST]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = alloca ptr, align 8
+  store ptr ptrauth (ptr @foo, i32 0), ptr %dst, align 8
+  call void @user(ptr %dst)
+  ret void
+}
+
+define void @ctor_nocfi() {
+; CHECK-LABEL: define void @ctor_nocfi() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr no_cfi @foo, ptr [[DST]], align 8
+; CHECK-NEXT:    call void @user(ptr [[DST]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = alloca ptr, align 8
+  store ptr no_cfi @foo, ptr %dst, align 8
+  call void @user(ptr %dst)
+  ret void
+}
+
+define void @fn() {
+; CHECK-LABEL: define void @fn() {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @ctor_dso_local_equivalent() {
+; CHECK-LABEL: define void @ctor_dso_local_equivalent() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr dso_local_equivalent @fn, ptr [[DST]], align 8
+; CHECK-NEXT:    call void @user(ptr [[DST]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = alloca ptr, align 8
+  store ptr dso_local_equivalent @fn, ptr %dst, align 8
+  call void @user(ptr %dst)
+  ret void
+}

From f39f53e569f92987683626d910e9dbcbd59ff410 Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Wed, 11 Jun 2025 14:11:19 -0700
Subject: [PATCH 0058/1322] [Clang][NFC] Move HeadingAndSpellings to avoid
 copying (#143611)

Static analysis flagged that we could move HeadingAndSpellings and avoid
a copy of a large object.
---
 clang/utils/TableGen/ClangAttrEmitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 21d76c12a3cc..42627f02cf35 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -5405,7 +5405,7 @@ void EmitClangAttrDocs(const RecordKeeper &Records, raw_ostream &OS) {
       // Handle Undocumented category separately - no content merging
       if (Cat == "Undocumented" && UndocumentedCategory) {
         UndocumentedDocs.push_back(
-            DocumentationData(Doc, Attr, HeadingAndSpellings));
+            DocumentationData(Doc, Attr, std::move(HeadingAndSpellings)));
         continue;
       }
 

From d7e7f22626f214766f3592341dd1737fd232c6a5 Mon Sep 17 00:00:00 2001
From: "Oleksandr T." <oleksandr.tarasiuk@outlook.com>
Date: Thu, 12 Jun 2025 00:19:25 +0300
Subject: [PATCH 0059/1322] [Clang] fix missing source location for errors in
 macro-expanded (#143460)

Fixes #143216

---

This patch fixes diagnostic locations for tokens from macro expansions.
---
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/include/clang/Parse/Parser.h            |  4 +---
 clang/lib/Parse/ParseExprCXX.cpp              |  4 ++--
 clang/lib/Parse/ParseStmt.cpp                 |  7 ++++--
 clang/lib/Parse/Parser.cpp                    |  5 +++++
 .../test/Parser/macro-expansion-recovery.cpp  | 22 +++++++++++++++++++
 clang/test/Parser/switch-recovery.cpp         | 13 +++++++++++
 7 files changed, 49 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/Parser/macro-expansion-recovery.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 8043ab48f0b4..b42d5f8425af 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -694,6 +694,7 @@ Bug Fixes in This Version
 - Constant evaluation now correctly runs the destructor of a variable declared in
   the second clause of a C-style ``for`` loop. (#GH139818)
 - Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. (#GH143168)
+- Fixed incorrect token location when emitting diagnostics for tokens expanded from macros. (#GH143216)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 0b2fab4a45c9..d99de77a5291 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -290,9 +290,7 @@ public:
     return ConsumeToken();
   }
 
-  SourceLocation getEndOfPreviousToken() {
-    return PP.getLocForEndOfToken(PrevTokLocation);
-  }
+  SourceLocation getEndOfPreviousToken() const;
 
   /// GetLookAheadToken - This peeks ahead N tokens and returns that token
   /// without consuming any tokens.  LookAhead(0) returns 'Tok', LookAhead(1)
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index d95260829e4a..55ad7f256fa8 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -421,8 +421,8 @@ bool Parser::ParseOptionalCXXScopeSpecifier(
       // like we never saw it.
       Token Identifier = Tok; // Stash away the identifier.
       ConsumeToken();         // Eat the identifier, current token is now '::'.
-      Diag(PP.getLocForEndOfToken(ConsumeToken()), diag::err_expected)
-          << tok::identifier;
+      ConsumeToken();
+      Diag(getEndOfPreviousToken(), diag::err_expected) << tok::identifier;
       UnconsumeToken(Identifier); // Stick the identifier back.
       Next = NextToken();         // Point Next at the '{' token.
     }
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index c788723023c8..c00759893b0c 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -832,10 +832,13 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx,
           << "'case'" << tok::colon
           << FixItHint::CreateReplacement(ColonLoc, ":");
     } else {
-      SourceLocation ExpectedLoc = PP.getLocForEndOfToken(PrevTokLocation);
+      SourceLocation ExpectedLoc = getEndOfPreviousToken();
+
       Diag(ExpectedLoc, diag::err_expected_after)
           << "'case'" << tok::colon
-          << FixItHint::CreateInsertion(ExpectedLoc, ":");
+          << FixItHint::CreateInsertion(ExpectedLoc,
+                                        tok::getTokenName(tok::colon));
+
       ColonLoc = ExpectedLoc;
     }
 
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index db65c05cc114..788ed79e0c1f 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -1873,6 +1873,11 @@ Parser::TryAnnotateName(CorrectionCandidateCallback *CCC,
   return AnnotatedNameKind::Unresolved;
 }
 
+SourceLocation Parser::getEndOfPreviousToken() const {
+  SourceLocation TokenEndLoc = PP.getLocForEndOfToken(PrevTokLocation);
+  return TokenEndLoc.isValid() ? TokenEndLoc : Tok.getLocation();
+}
+
 bool Parser::TryKeywordIdentFallback(bool DisableKeyword) {
   assert(Tok.isNot(tok::identifier));
   Diag(Tok, diag::ext_keyword_as_ident)
diff --git a/clang/test/Parser/macro-expansion-recovery.cpp b/clang/test/Parser/macro-expansion-recovery.cpp
new file mode 100644
index 000000000000..6826cc04e4df
--- /dev/null
+++ b/clang/test/Parser/macro-expansion-recovery.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+namespace GH143216 {
+#define A x y
+enum { A }; // expected-error {{missing ',' between enumerators}}
+
+#define B x y
+void f() {
+    int a[2];
+    auto [B] = a; // expected-error {{expected ','}}
+}
+
+#define C <int!
+template <class T> class D;
+D C; // expected-error {{expected unqualified-id}} \
+     // expected-error {{expected '>'}} \
+     // expected-note {{to match this '<'}}
+
+#define E F::{
+class F { E }}; // expected-error {{expected identifier}} \
+                // expected-error {{expected member name or ';' after declaration specifiers}}
+}
diff --git a/clang/test/Parser/switch-recovery.cpp b/clang/test/Parser/switch-recovery.cpp
index baf703cd03ae..7b3909e3b0d3 100644
--- a/clang/test/Parser/switch-recovery.cpp
+++ b/clang/test/Parser/switch-recovery.cpp
@@ -229,3 +229,16 @@ void fn1() {
     }
 } // expected-error{{expected statement}}
 }
+
+namespace GH143216 {
+#define FOO 1 case 3:
+
+int f(int x) {
+  switch (x) {
+  case FOO // expected-error {{expected ':' after 'case'}}
+    return 0;
+  default:
+    return 1;
+  }
+}
+}

From 625bfb7179ad1acab2aba1023095826628275a60 Mon Sep 17 00:00:00 2001
From: Jiachen Yuan <jiacheny@nvidia.com>
Date: Wed, 11 Jun 2025 14:23:41 -0700
Subject: [PATCH 0060/1322] Workaround MSVC Linker Issue when Cross-Compiling
 for ARM64EC (#143659)

This MR presents a temporary workaround for the issue described at
https://github.com/llvm/llvm-project/issues/143575. While an [upstream
MSVC
bug](https://developercommunity.visualstudio.com/t/MSVC-Linker-Issue-When-Cross-Compiling-L/10920141)
is reported, it makes sense to apply a workaround in LLVM code to
quickly unblock anyone affected.
---
 llvm/include/llvm/IR/Mangler.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/IR/Mangler.h b/llvm/include/llvm/IR/Mangler.h
index e3dfe1eac618..232101a8926b 100644
--- a/llvm/include/llvm/IR/Mangler.h
+++ b/llvm/include/llvm/IR/Mangler.h
@@ -26,7 +26,16 @@ class Triple;
 class Twine;
 class raw_ostream;
 
-constexpr std::string_view HybridPatchableTargetSuffix = "$hp_target";
+// TODO: The weird assignment of HybridPatchableTargetSuffix below is a
+// temporary workaround for a linker failure that is only hit when compiling
+// llvm for arm64ec on windows. The description and context of the issue is at
+// https://github.com/llvm/llvm-project/issues/143575.
+// An upstream MSVC bug is filed at
+// https://developercommunity.visualstudio.com/t/MSVC-Linker-Issue-When-Cross-
+// Compiling-L/10920141.
+constexpr char HybridPatchableTargetSuffixArr[] = "$hp_target";
+constexpr std::string_view HybridPatchableTargetSuffix =
+    HybridPatchableTargetSuffixArr;
 
 class Mangler {
   /// We need to give global values the same name every time they are mangled.

From 7838fc0cd3fbe578d9554fdcd3198c2ba3616bcc Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail@gmail.com>
Date: Wed, 11 Jun 2025 23:24:33 +0200
Subject: [PATCH 0061/1322] [Clang] [NFC] Move diagnostics emitting code from
 `DiagnosticIDs` into `DiagnosticsEngine` (#143517)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It makes more sense for this functionality to be all in one place rather
than split up across two files—at least it caused me a bit of a headache
to try and find all places where we were actually forwarding the
diagnostic to the `DiagnosticConsumer`. Moreover, moving these functions
into `DiagnosticsEngine` simplifies the code quite a bit since we access
members of `DiagnosticsEngine` more frequently than those of
`DiagnosticIDs`. There was also a duplicated code snippet that I’ve
moved out into a new function.
---
 clang/include/clang/Basic/Diagnostic.h    | 23 +++---
 clang/include/clang/Basic/DiagnosticIDs.h | 12 ---
 clang/lib/Basic/Diagnostic.cpp            | 98 ++++++++++++++++++++---
 clang/lib/Basic/DiagnosticIDs.cpp         | 97 ----------------------
 4 files changed, 102 insertions(+), 128 deletions(-)

diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h
index e9c54c3c487c..efee8302e750 100644
--- a/clang/include/clang/Basic/Diagnostic.h
+++ b/clang/include/clang/Basic/Diagnostic.h
@@ -18,6 +18,7 @@
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/Specifiers.h"
+#include "clang/Basic/UnsignedOrNone.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FunctionExtras.h"
@@ -49,6 +50,7 @@ class FileSystem;
 namespace clang {
 
 class DeclContext;
+class Diagnostic;
 class DiagnosticBuilder;
 class DiagnosticConsumer;
 class IdentifierInfo;
@@ -228,6 +230,8 @@ public:
 class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> {
 public:
   /// The level of the diagnostic, after it has been through mapping.
+  // FIXME: Make this an alias for DiagnosticIDs::Level as soon as
+  // we can use 'using enum'.
   enum Level {
     Ignored = DiagnosticIDs::Ignored,
     Note = DiagnosticIDs::Note,
@@ -532,7 +536,7 @@ private:
   ///
   /// This is used to emit continuation diagnostics with the same level as the
   /// diagnostic that they follow.
-  DiagnosticIDs::Level LastDiagLevel;
+  Level LastDiagLevel;
 
   /// Number of warnings reported
   unsigned NumWarnings;
@@ -777,18 +781,16 @@ public:
   /// the middle of another diagnostic.
   ///
   /// This can be used by clients who suppress diagnostics themselves.
-  void setLastDiagnosticIgnored(bool Ignored) {
-    if (LastDiagLevel == DiagnosticIDs::Fatal)
+  void setLastDiagnosticIgnored(bool IsIgnored) {
+    if (LastDiagLevel == Fatal)
       FatalErrorOccurred = true;
-    LastDiagLevel = Ignored ? DiagnosticIDs::Ignored : DiagnosticIDs::Warning;
+    LastDiagLevel = IsIgnored ? Ignored : Warning;
   }
 
   /// Determine whether the previous diagnostic was ignored. This can
   /// be used by clients that want to determine whether notes attached to a
   /// diagnostic will be suppressed.
-  bool isLastDiagnosticIgnored() const {
-    return LastDiagLevel == DiagnosticIDs::Ignored;
-  }
+  bool isLastDiagnosticIgnored() const { return LastDiagLevel == Ignored; }
 
   /// Controls whether otherwise-unmapped extension diagnostics are
   /// mapped onto ignore/warning/error.
@@ -1024,9 +1026,10 @@ private:
   /// Used to report a diagnostic that is finally fully formed.
   ///
   /// \returns true if the diagnostic was emitted, false if it was suppressed.
-  bool ProcessDiag(const DiagnosticBuilder &DiagBuilder) {
-    return Diags->ProcessDiag(*this, DiagBuilder);
-  }
+  bool ProcessDiag(const DiagnosticBuilder &DiagBuilder);
+
+  /// Forward a diagnostic to the DiagnosticConsumer.
+  void Report(Level DiagLevel, const Diagnostic &Info);
 
   /// @name Diagnostic Emission
   /// @{
diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h
index 80d52a0d0111..2b095f0fd674 100644
--- a/clang/include/clang/Basic/DiagnosticIDs.h
+++ b/clang/include/clang/Basic/DiagnosticIDs.h
@@ -483,18 +483,6 @@ private:
 
   Class getDiagClass(unsigned DiagID) const;
 
-  /// Used to report a diagnostic that is finally fully formed.
-  ///
-  /// \returns \c true if the diagnostic was emitted, \c false if it was
-  /// suppressed.
-  bool ProcessDiag(DiagnosticsEngine &Diag,
-                   const DiagnosticBuilder &DiagBuilder) const;
-
-  /// Used to emit a diagnostic that is finally fully formed,
-  /// ignoring suppression.
-  void EmitDiag(DiagnosticsEngine &Diag, const DiagnosticBuilder &DiagBuilder,
-                Level DiagLevel) const;
-
   /// Whether the diagnostic may leave the AST in a state where some
   /// invariants can break.
   bool isUnrecoverable(unsigned DiagID) const;
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 694224071347..95d86cb153b4 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -130,7 +130,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) {
   TrapNumErrorsOccurred = 0;
   TrapNumUnrecoverableErrorsOccurred = 0;
 
-  LastDiagLevel = DiagnosticIDs::Ignored;
+  LastDiagLevel = Ignored;
 
   if (!soft) {
     // Clear state related to #pragma diagnostic.
@@ -658,13 +658,95 @@ void DiagnosticsEngine::Report(const StoredDiagnostic &storedDiag) {
   Level DiagLevel = storedDiag.getLevel();
   Diagnostic Info(this, storedDiag.getLocation(), storedDiag.getID(),
                   DiagStorage, storedDiag.getMessage());
+  Report(DiagLevel, Info);
+}
+
+void DiagnosticsEngine::Report(Level DiagLevel, const Diagnostic &Info) {
+  assert(DiagLevel != Ignored && "Cannot emit ignored diagnostics!");
   Client->HandleDiagnostic(DiagLevel, Info);
   if (Client->IncludeInDiagnosticCounts()) {
-    if (DiagLevel == DiagnosticsEngine::Warning)
+    if (DiagLevel == Warning)
       ++NumWarnings;
   }
 }
 
+/// ProcessDiag - This is the method used to report a diagnostic that is
+/// finally fully formed.
+bool DiagnosticsEngine::ProcessDiag(const DiagnosticBuilder &DiagBuilder) {
+  Diagnostic Info(this, DiagBuilder);
+
+  assert(getClient() && "DiagnosticClient not set!");
+
+  // Figure out the diagnostic level of this message.
+  unsigned DiagID = Info.getID();
+  Level DiagLevel = getDiagnosticLevel(DiagID, Info.getLocation());
+
+  // Update counts for DiagnosticErrorTrap even if a fatal error occurred
+  // or diagnostics are suppressed.
+  if (DiagLevel >= Error) {
+    ++TrapNumErrorsOccurred;
+    if (Diags->isUnrecoverable(DiagID))
+      ++TrapNumUnrecoverableErrorsOccurred;
+  }
+
+  if (SuppressAllDiagnostics)
+    return false;
+
+  if (DiagLevel != Note) {
+    // Record that a fatal error occurred only when we see a second
+    // non-note diagnostic. This allows notes to be attached to the
+    // fatal error, but suppresses any diagnostics that follow those
+    // notes.
+    if (LastDiagLevel == Fatal)
+      FatalErrorOccurred = true;
+
+    LastDiagLevel = DiagLevel;
+  }
+
+  // If a fatal error has already been emitted, silence all subsequent
+  // diagnostics.
+  if (FatalErrorOccurred) {
+    if (DiagLevel >= Error && Client->IncludeInDiagnosticCounts())
+      ++NumErrors;
+
+    return false;
+  }
+
+  // If the client doesn't care about this message, don't issue it.  If this is
+  // a note and the last real diagnostic was ignored, ignore it too.
+  if (DiagLevel == Ignored || (DiagLevel == Note && LastDiagLevel == Ignored))
+    return false;
+
+  if (DiagLevel >= Error) {
+    if (Diags->isUnrecoverable(DiagID))
+      UnrecoverableErrorOccurred = true;
+
+    // Warnings which have been upgraded to errors do not prevent compilation.
+    if (Diags->isDefaultMappingAsError(DiagID))
+      UncompilableErrorOccurred = true;
+
+    ErrorOccurred = true;
+    if (Client->IncludeInDiagnosticCounts())
+      ++NumErrors;
+
+    // If we've emitted a lot of errors, emit a fatal error instead of it to
+    // stop a flood of bogus errors.
+    if (ErrorLimit && NumErrors > ErrorLimit && DiagLevel == Error) {
+      Report(diag::fatal_too_many_errors);
+      return false;
+    }
+  }
+
+  // Make sure we set FatalErrorOccurred to ensure that the notes from the
+  // diagnostic that caused `fatal_too_many_errors` won't be emitted.
+  if (Info.getID() == diag::fatal_too_many_errors)
+    FatalErrorOccurred = true;
+
+  // Finally, report it.
+  Report(DiagLevel, Info);
+  return true;
+}
+
 bool DiagnosticsEngine::EmitDiagnostic(const DiagnosticBuilder &DB,
                                        bool Force) {
   assert(getClient() && "DiagnosticClient not set!");
@@ -674,14 +756,12 @@ bool DiagnosticsEngine::EmitDiagnostic(const DiagnosticBuilder &DB,
     Diagnostic Info(this, DB);
 
     // Figure out the diagnostic level of this message.
-    DiagnosticIDs::Level DiagLevel =
-        Diags->getDiagnosticLevel(Info.getID(), Info.getLocation(), *this);
+    Level DiagLevel = getDiagnosticLevel(Info.getID(), Info.getLocation());
 
-    Emitted = (DiagLevel != DiagnosticIDs::Ignored);
-    if (Emitted) {
-      // Emit the diagnostic regardless of suppression level.
-      Diags->EmitDiag(*this, DB, DiagLevel);
-    }
+    // Emit the diagnostic regardless of suppression level.
+    Emitted = DiagLevel != Ignored;
+    if (Emitted)
+      Report(DiagLevel, Info);
   } else {
     // Process the diagnostic, sending the accumulated information to the
     // DiagnosticConsumer.
diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp
index 3e90b2d80477..dcf0c6cb5428 100644
--- a/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/clang/lib/Basic/DiagnosticIDs.cpp
@@ -823,103 +823,6 @@ unsigned DiagnosticIDs::getCXXCompatDiagId(const LangOptions &LangOpts,
   return StdVer >= D.StdVer ? D.DiagId : D.PreDiagId;
 }
 
-/// ProcessDiag - This is the method used to report a diagnostic that is
-/// finally fully formed.
-bool DiagnosticIDs::ProcessDiag(DiagnosticsEngine &Diag,
-                                const DiagnosticBuilder &DiagBuilder) const {
-  Diagnostic Info(&Diag, DiagBuilder);
-
-  assert(Diag.getClient() && "DiagnosticClient not set!");
-
-  // Figure out the diagnostic level of this message.
-  unsigned DiagID = Info.getID();
-  DiagnosticIDs::Level DiagLevel
-    = getDiagnosticLevel(DiagID, Info.getLocation(), Diag);
-
-  // Update counts for DiagnosticErrorTrap even if a fatal error occurred
-  // or diagnostics are suppressed.
-  if (DiagLevel >= DiagnosticIDs::Error) {
-    ++Diag.TrapNumErrorsOccurred;
-    if (isUnrecoverable(DiagID))
-      ++Diag.TrapNumUnrecoverableErrorsOccurred;
-  }
-
-  if (Diag.SuppressAllDiagnostics)
-    return false;
-
-  if (DiagLevel != DiagnosticIDs::Note) {
-    // Record that a fatal error occurred only when we see a second
-    // non-note diagnostic. This allows notes to be attached to the
-    // fatal error, but suppresses any diagnostics that follow those
-    // notes.
-    if (Diag.LastDiagLevel == DiagnosticIDs::Fatal)
-      Diag.FatalErrorOccurred = true;
-
-    Diag.LastDiagLevel = DiagLevel;
-  }
-
-  // If a fatal error has already been emitted, silence all subsequent
-  // diagnostics.
-  if (Diag.FatalErrorOccurred) {
-    if (DiagLevel >= DiagnosticIDs::Error &&
-        Diag.Client->IncludeInDiagnosticCounts()) {
-      ++Diag.NumErrors;
-    }
-
-    return false;
-  }
-
-  // If the client doesn't care about this message, don't issue it.  If this is
-  // a note and the last real diagnostic was ignored, ignore it too.
-  if (DiagLevel == DiagnosticIDs::Ignored ||
-      (DiagLevel == DiagnosticIDs::Note &&
-       Diag.LastDiagLevel == DiagnosticIDs::Ignored))
-    return false;
-
-  if (DiagLevel >= DiagnosticIDs::Error) {
-    if (isUnrecoverable(DiagID))
-      Diag.UnrecoverableErrorOccurred = true;
-
-    // Warnings which have been upgraded to errors do not prevent compilation.
-    if (isDefaultMappingAsError(DiagID))
-      Diag.UncompilableErrorOccurred = true;
-
-    Diag.ErrorOccurred = true;
-    if (Diag.Client->IncludeInDiagnosticCounts()) {
-      ++Diag.NumErrors;
-    }
-
-    // If we've emitted a lot of errors, emit a fatal error instead of it to
-    // stop a flood of bogus errors.
-    if (Diag.ErrorLimit && Diag.NumErrors > Diag.ErrorLimit &&
-        DiagLevel == DiagnosticIDs::Error) {
-      Diag.Report(diag::fatal_too_many_errors);
-      return false;
-    }
-  }
-
-  // Make sure we set FatalErrorOccurred to ensure that the notes from the
-  // diagnostic that caused `fatal_too_many_errors` won't be emitted.
-  if (Info.getID() == diag::fatal_too_many_errors)
-    Diag.FatalErrorOccurred = true;
-  // Finally, report it.
-  EmitDiag(Diag, DiagBuilder, DiagLevel);
-  return true;
-}
-
-void DiagnosticIDs::EmitDiag(DiagnosticsEngine &Diag,
-                             const DiagnosticBuilder &DiagBuilder,
-                             Level DiagLevel) const {
-  Diagnostic Info(&Diag, DiagBuilder);
-  assert(DiagLevel != DiagnosticIDs::Ignored && "Cannot emit ignored diagnostics!");
-
-  Diag.Client->HandleDiagnostic((DiagnosticsEngine::Level)DiagLevel, Info);
-  if (Diag.Client->IncludeInDiagnosticCounts()) {
-    if (DiagLevel == DiagnosticIDs::Warning)
-      ++Diag.NumWarnings;
-  }
-}
-
 bool DiagnosticIDs::isUnrecoverable(unsigned DiagID) const {
   // Only errors may be unrecoverable.
   if (getDiagClass(DiagID) < CLASS_ERROR)

From 6f2ba4712f17d7c82228a5b705570571e13a3832 Mon Sep 17 00:00:00 2001
From: Ian Wood <ianwood2024@u.northwestern.edu>
Date: Wed, 11 Jun 2025 14:34:02 -0700
Subject: [PATCH 0062/1322] [mlir] Fix ComposeExpandOfCollapseOp for dynamic
 case (#142663)

Changes `findCollapsingReassociation` to return nullopt in all cases
where source shape has `>=2` dynamic dims. `expand(collapse)` can
reshape to in any valid output shape but a collapse can only collapse
contiguous dimensions. When there are `>=2` dynamic dimensions it is
impossible to determine if it can be simplified to a collapse or if it
is preforming a more advanced reassociation.


This problem was uncovered by
https://github.com/llvm/llvm-project/pull/137963

---------

Signed-off-by: Ian Wood <ianwood2024@u.northwestern.edu>
---
 mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h |  9 ++++++---
 mlir/test/Dialect/Tensor/canonicalize.mlir        | 14 ++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
index af575e10acc8..61c2a50e514c 100644
--- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
@@ -387,11 +387,14 @@ private:
       auto resultSubShape =
           resultShape.slice(resultIndices.front(), resultIndices.size());
 
+      if (llvm::count_if(srcSubShape, ShapedType::isDynamic) >= 2 &&
+          llvm::count_if(resultSubShape, ShapedType::isDynamic) >= 2)
+        return std::nullopt;
+
       if (srcSubShape.size() == resultSubShape.size()) {
-        if (srcSubShape != resultSubShape ||
-            llvm::count_if(srcSubShape, ShapedType::isDynamic) >= 2) {
+        if (srcSubShape != resultSubShape)
           return std::nullopt;
-        }
+
         for (auto index : llvm::seq<int64_t>(0, srcSubShape.size())) {
           composedReassociation.emplace_back(1, srcIndices.front() + index);
         }
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 65c5b3e8602e..67b03b0a3485 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1272,6 +1272,20 @@ func.func @compose_expand_of_collapse_dynamic(%arg0 : tensor<4x?x10x64x2xf16>, %
 
 // -----
 
+func.func @no_compose_collapse_of_expand_dynamic(%arg0 : tensor<?x8x128x?xf16>, %arg1: index) -> tensor<?x128x?xf16> {
+  %collapse = tensor.collapse_shape %arg0 [[0, 1, 2, 3]] : tensor<?x8x128x?xf16> into tensor<?xf16>
+  %expanded_19 = tensor.expand_shape %collapse [[0, 1, 2]] output_shape [%arg1, 8, %arg1] : tensor<?xf16> into tensor<?x128x?xf16>
+  return %expanded_19 : tensor<?x128x?xf16>
+}
+// CHECK-LABEL: func @no_compose_collapse_of_expand_dynamic
+//  CHECK-SAME:   %[[ARG0:.+]]: tensor
+//  CHECK-SAME:   %[[ARG1:.+]]: index
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape %[[ARG0]]
+//       CHECK:   %[[EXPAND:.+]] = tensor.expand_shape %[[COLLAPSE]]
+//       CHECK:   return %[[EXPAND]]
+
+// -----
+
 // CHECK-LABEL: func @zero_rank_reshape_multi
 func.func @zero_rank_reshape_multi(%arg0: tensor<f32>) -> tensor<f32> {
   // CHECK: return %arg0

From 9c9a4a284e95ea5e27617af7235e3ab049bae680 Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag@gmail.com>
Date: Wed, 11 Jun 2025 14:54:30 -0700
Subject: [PATCH 0063/1322] [LOH] Don't emit AdrpAddStr when register could be
 clobbered (#142849)

https://github.com/llvm/llvm-project/commit/b783aa89795635cbe7b25b4143b562931fcec9f6
added a check to ensure an `AdrpAddLdr` LOH isn't created when there is
an instruction between the `add` and `ldr`


https://github.com/llvm/llvm-project/blob/50c5704dc000cc0af41a511aa44db03233edf0af/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp#L419-L431

We need a similar check for `AdrpAddStr`. Although this technically
isn't implemented in LLD, it could be in the future.


https://github.com/llvm/llvm-project/blob/50c5704dc000cc0af41a511aa44db03233edf0af/lld/MachO/Arch/ARM64.cpp#L699-L702
---
 llvm/lib/Target/AArch64/AArch64CollectLOH.cpp | 37 +++++++++++-------
 .../AArch64/loh-adrp-add-ldr-clobber.mir      | 39 +++++++++++++------
 2 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index 53e8e438c5e5..064716216d1c 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -247,6 +247,17 @@ static bool supportLoadFromLiteral(const MachineInstr &MI) {
   }
 }
 
+/// Returns \p true if there are no non-debug instructions between \p First and
+/// \p Second
+static bool areInstructionsConsecutive(const MachineInstr *First,
+                                       const MachineInstr *Second) {
+  auto It = First->getIterator();
+  auto EndIt = First->getParent()->instr_end();
+  if (It == EndIt)
+    return false;
+  return next_nodbg(It, EndIt) == Second->getIterator();
+}
+
 /// Number of GPR registers tracked by mapRegToGPRIndex()
 static const unsigned N_GPR_REGS = 31;
 /// Map register number to index from 0-30.
@@ -415,7 +426,7 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
         ++NumADRPToLDR;
       }
       break;
-    case MCLOH_AdrpAddLdr: {
+    case MCLOH_AdrpAddLdr:
       // There is a possibility that the linker may try to rewrite:
       // adrp x0, @sym@PAGE
       // add x1, x0, @sym@PAGEOFF
@@ -432,28 +443,24 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
       // FIXME: Implement proper liveness tracking for all registers. For now,
       // don't emit the LOH if there are any instructions between the add and
       // the ldr.
-      MachineInstr *AddMI = const_cast<MachineInstr *>(Info.MI1);
-      const MachineInstr *LdrMI = Info.MI0;
-      auto AddIt = MachineBasicBlock::iterator(AddMI);
-      auto EndIt = AddMI->getParent()->end();
-      if (AddMI->getIterator() == EndIt || LdrMI != &*next_nodbg(AddIt, EndIt))
+      if (!areInstructionsConsecutive(Info.MI1, Info.MI0))
         break;
-
       LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n"
                         << '\t' << MI << '\t' << *Info.MI1 << '\t'
                         << *Info.MI0);
       AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0});
       ++NumADDToLDR;
       break;
-    }
     case MCLOH_AdrpAddStr:
-      if (Info.MI1 != nullptr) {
-        LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n"
-                          << '\t' << MI << '\t' << *Info.MI1 << '\t'
-                          << *Info.MI0);
-        AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0});
-        ++NumADDToSTR;
-      }
+      if (!Info.MI1)
+        break;
+      if (!areInstructionsConsecutive(Info.MI1, Info.MI0))
+        break;
+      LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n"
+                        << '\t' << MI << '\t' << *Info.MI1 << '\t'
+                        << *Info.MI0);
+      AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0});
+      ++NumADDToSTR;
       break;
     case MCLOH_AdrpLdrGotLdr:
       LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n"
diff --git a/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir b/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir
index ce2d8f02f4cc..a1d8bf375a19 100644
--- a/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir
+++ b/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir
@@ -1,16 +1,34 @@
-# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s
+# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s --implicit-check-not=MCLOH_
 # REQUIRES: asserts
+
+# Check that we don't emit LOHs when there is a clobbering def of x8.
 --- |
   @sym2 = local_unnamed_addr global [10000000 x i32] zeroinitializer, align 8
   @sym = local_unnamed_addr global i32 zeroinitializer, align 8
 
-  define i32 @main() {
-    ret i32 0
-  }
-
+  define i32 @adrp_add_ldr() { ret i32 0 }
+  define i32 @adrp_add_str() { ret i32 0 }
 ...
+
 ---
-name:            main
+name:            adrp_add_ldr
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x21', virtual-reg: '' }
+body:             |
+  bb.0:
+    liveins: $x21
+    renamable $x8 = ADRP target-flags(aarch64-page) @sym
+    renamable $x9 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @sym, 0
+    renamable $x8 = ADDXri killed renamable $x21, 1, 0
+    $x9 = LDRXui $x9, 0
+
+    RET undef $lr
+...
+
+---
+name:            adrp_add_str
 alignment:       4
 tracksRegLiveness: true
 liveins:
@@ -19,13 +37,10 @@ liveins:
 body:             |
   bb.0:
     liveins: $x21, $x22
-    ; Check we don't emit an loh here because there's a clobbering def of x8 before the ldr.
-    ; CHECK-LABEL: main
-    ; CHECK-NOT: MCLOH_AdrpAddLdr
     renamable $x8 = ADRP target-flags(aarch64-page) @sym
     renamable $x9 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @sym, 0
-    renamable $x8 = ADDXri killed renamable $x22, 1, 0
-    $x9 = LDRXui $x9, 0
-    RET undef $lr
+    renamable $x8 = ADDXri killed renamable $x21, 1, 0
+    STRXui $x22, $x9, 0
 
+    RET undef $lr
 ...

From 74172add65aa14e77e98b048db0074c3f273057f Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland@gmail.com>
Date: Wed, 11 Jun 2025 18:18:22 -0400
Subject: [PATCH 0064/1322] [mlir][generate-test-checks] Do not emit the
 autogenerated note if it exists (#143750)

Prior to this PR, the script removed the already existing autogenerated
note if we came across a line that was equal to the note. But the
default note is multiple lines, so there would never be a match.
Instead, check to see if the current line is a substring of the
autogenerated note.

Co-authored-by: Michael Maitland <michaelmaitland@meta.com>
---
 mlir/utils/generate-test-checks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py
index 11fb4e40072e..f77c9688d931 100755
--- a/mlir/utils/generate-test-checks.py
+++ b/mlir/utils/generate-test-checks.py
@@ -208,7 +208,7 @@ def process_source_lines(source_lines, note, args):
     source_segments = [[]]
     for line in source_lines:
         # Remove previous note.
-        if line == note:
+        if line in note:
             continue
         # Remove previous CHECK lines.
         if line.find(args.check_prefix) != -1:

From 0e457315f55889878ccbc3e35d4beb04e277733f Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland@gmail.com>
Date: Wed, 11 Jun 2025 18:19:15 -0400
Subject: [PATCH 0065/1322] [mlir][generate-test-checks] Emit attributes with
 rest of CHECK lines (#143759)

Prior to this patch, generating test checks in place put the ATTR
definitions at the very top of the file, above the RUN lines and
autogenerated note. All CHECK lines should below the RUN lines and
autogenerated note.

This change ensures that the attribute definitions are emitted with the
rest of the CHECK lines.

---------

Co-authored-by: Michael Maitland <michaelmaitland@meta.com>
---
 mlir/utils/generate-test-checks.py | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py
index f77c9688d931..14a790e6d0e6 100755
--- a/mlir/utils/generate-test-checks.py
+++ b/mlir/utils/generate-test-checks.py
@@ -220,12 +220,19 @@ def process_source_lines(source_lines, note, args):
         source_segments[-1].append(line + "\n")
     return source_segments
 
-def process_attribute_definition(line, attribute_namer, output):
+
+def process_attribute_definition(line, attribute_namer):
     m = ATTR_DEF_RE.match(line)
     if m:
         attribute_name = attribute_namer.generate_name(m.group(1))
-        line = '// CHECK: #[[' + attribute_name + ':.+]] =' + line[len(m.group(0)):] + '\n'
-        output.write(line)
+        return (
+            "// CHECK: #[["
+            + attribute_name
+            + ":.+]] ="
+            + line[len(m.group(0)) :]
+            + "\n"
+        )
+    return None
 
 def process_attribute_references(line, attribute_namer):
 
@@ -340,6 +347,9 @@ def main():
     variable_namer = VariableNamer(args.variable_names)
     attribute_namer = AttributeNamer(args.attribute_names)
 
+    # Store attribute definitions to emit at appropriate scope
+    pending_attr_defs = []
+
     # Process lines
     for input_line in input_lines:
         if not input_line:
@@ -350,8 +360,9 @@ def main():
         if input_line.startswith("// -----"):
             continue
 
-        # Check if this is an attribute definition and process it
-        process_attribute_definition(input_line, attribute_namer, output)
+        if ATTR_DEF_RE.match(input_line):
+            pending_attr_defs.append(input_line)
+            continue
 
         # Lines with blocks begin with a ^. These lines have a trailing comment
         # that needs to be stripped.
@@ -407,6 +418,13 @@ def main():
             output_line += process_line(ssa_split[1:], variable_namer)
 
         else:
+            # Emit any pending attribute definitions at the start of this scope
+            for attr in pending_attr_defs:
+                attr_line = process_attribute_definition(attr, attribute_namer)
+                if attr_line:
+                    output_segments[-1].append(attr_line)
+            pending_attr_defs.clear()
+
             # Output the first line chunk that does not contain an SSA name for the
             # label.
             output_line = "// " + args.check_prefix + "-LABEL: " + ssa_split[0] + "\n"

From ee35e342945d6825c9b2b004fd135cf16c84ea0e Mon Sep 17 00:00:00 2001
From: Nikolay Panchenko <nicholas.panchenko@gmail.com>
Date: Wed, 11 Jun 2025 19:00:29 -0400
Subject: [PATCH 0066/1322] [ConstantFolding] Add folding for [de]interleave2,
 insert and extract (#141301)

The change adds folding for 4 vector intrinsics: `interleave2`,
`deinterleave2`, `vector_extract` and `vector_insert`. For the last 2
intrinsics the change does not use `ShuffleVector` fold mechanism as
it's much simpler to construct result vector explicitly.
---
 llvm/lib/Analysis/ConstantFolding.cpp         | 97 +++++++++++++++++++
 .../InstSimplify/ConstProp/vector-calls.ll    | 68 +++++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 1ef0badd2375..139a0b81e299 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1635,6 +1635,10 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::vector_reduce_smax:
   case Intrinsic::vector_reduce_umin:
   case Intrinsic::vector_reduce_umax:
+  case Intrinsic::vector_extract:
+  case Intrinsic::vector_insert:
+  case Intrinsic::vector_interleave2:
+  case Intrinsic::vector_deinterleave2:
   // Target intrinsics
   case Intrinsic::amdgcn_perm:
   case Intrinsic::amdgcn_wave_reduce_umin:
@@ -3758,6 +3762,72 @@ static Constant *ConstantFoldFixedVectorCall(
     }
     return nullptr;
   }
+  case Intrinsic::vector_extract: {
+    auto *Idx = dyn_cast<ConstantInt>(Operands[1]);
+    Constant *Vec = Operands[0];
+    if (!Idx || !isa<FixedVectorType>(Vec->getType()))
+      return nullptr;
+
+    unsigned NumElements = FVTy->getNumElements();
+    unsigned VecNumElements =
+        cast<FixedVectorType>(Vec->getType())->getNumElements();
+    unsigned StartingIndex = Idx->getZExtValue();
+
+    // Extracting entire vector is nop
+    if (NumElements == VecNumElements && StartingIndex == 0)
+      return Vec;
+
+    for (unsigned I = StartingIndex, E = StartingIndex + NumElements; I < E;
+         ++I) {
+      Constant *Elt = Vec->getAggregateElement(I);
+      if (!Elt)
+        return nullptr;
+      Result[I - StartingIndex] = Elt;
+    }
+
+    return ConstantVector::get(Result);
+  }
+  case Intrinsic::vector_insert: {
+    Constant *Vec = Operands[0];
+    Constant *SubVec = Operands[1];
+    auto *Idx = dyn_cast<ConstantInt>(Operands[2]);
+    if (!Idx || !isa<FixedVectorType>(Vec->getType()))
+      return nullptr;
+
+    unsigned SubVecNumElements =
+        cast<FixedVectorType>(SubVec->getType())->getNumElements();
+    unsigned VecNumElements =
+        cast<FixedVectorType>(Vec->getType())->getNumElements();
+    unsigned IdxN = Idx->getZExtValue();
+    // Replacing entire vector with a subvec is nop
+    if (SubVecNumElements == VecNumElements && IdxN == 0)
+      return SubVec;
+
+    for (unsigned I = 0; I < VecNumElements; ++I) {
+      Constant *Elt;
+      if (I < IdxN + SubVecNumElements)
+        Elt = SubVec->getAggregateElement(I - IdxN);
+      else
+        Elt = Vec->getAggregateElement(I);
+      if (!Elt)
+        return nullptr;
+      Result[I] = Elt;
+    }
+    return ConstantVector::get(Result);
+  }
+  case Intrinsic::vector_interleave2: {
+    unsigned NumElements =
+        cast<FixedVectorType>(Operands[0]->getType())->getNumElements();
+    for (unsigned I = 0; I < NumElements; ++I) {
+      Constant *Elt0 = Operands[0]->getAggregateElement(I);
+      Constant *Elt1 = Operands[1]->getAggregateElement(I);
+      if (!Elt0 || !Elt1)
+        return nullptr;
+      Result[2 * I] = Elt0;
+      Result[2 * I + 1] = Elt1;
+    }
+    return ConstantVector::get(Result);
+  }
   default:
     break;
   }
@@ -3919,6 +3989,33 @@ ConstantFoldStructCall(StringRef Name, Intrinsic::ID IntrinsicID,
       return nullptr;
     return ConstantStruct::get(StTy, SinResult, CosResult);
   }
+  case Intrinsic::vector_deinterleave2: {
+    auto *Vec = dyn_cast<Constant>(Operands[0]);
+    if (!Vec)
+      return nullptr;
+
+    auto *VecTy = cast<VectorType>(Vec->getType());
+    unsigned NumElements = VecTy->getElementCount().getKnownMinValue() / 2;
+    if (isa<ConstantAggregateZero>(Vec)) {
+      auto *HalfVecTy = VectorType::getHalfElementsVectorType(VecTy);
+      return ConstantStruct::get(StTy, ConstantAggregateZero::get(HalfVecTy),
+                                 ConstantAggregateZero::get(HalfVecTy));
+    }
+    if (isa<FixedVectorType>(Vec->getType())) {
+      SmallVector<Constant *, 4> Res0(NumElements), Res1(NumElements);
+      for (unsigned I = 0; I < NumElements; ++I) {
+        Constant *Elt0 = Vec->getAggregateElement(2 * I);
+        Constant *Elt1 = Vec->getAggregateElement(2 * I + 1);
+        if (!Elt0 || !Elt1)
+          return nullptr;
+        Res0[I] = Elt0;
+        Res1[I] = Elt1;
+      }
+      return ConstantStruct::get(StTy, ConstantVector::get(Res0),
+                                 ConstantVector::get(Res1));
+    }
+    return nullptr;
+  }
   default:
     // TODO: Constant folding of vector intrinsics that fall through here does
     // not work (e.g. overflow intrinsics)
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
new file mode 100644
index 000000000000..9dbe3d4e50ee
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instsimplify,verify -S | FileCheck %s
+
+define <3 x i32> @fold_vector_extract() {
+; CHECK-LABEL: define <3 x i32> @fold_vector_extract() {
+; CHECK-NEXT:    ret <3 x i32> <i32 3, i32 4, i32 5>
+;
+  %1 = call <3 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i64 3)
+  ret <3 x i32> %1
+}
+
+@a = external global i16, align 1
+
+define <3 x i32> @fold_vector_extract_constexpr() {
+; CHECK-LABEL: define <3 x i32> @fold_vector_extract_constexpr() {
+; CHECK-NEXT:    ret <3 x i32> <i32 ptrtoint (ptr @a to i32), i32 1, i32 2>
+;
+  %1 = call <3 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> <i32 ptrtoint (ptr @a to i32), i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i64 0)
+  ret <3 x i32> %1
+}
+
+define <8 x i32> @fold_vector_extract_nop() {
+; CHECK-LABEL: define <8 x i32> @fold_vector_extract_nop() {
+; CHECK-NEXT:    ret <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;
+  %1 = call <8 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @fold_vector_insert() {
+; CHECK-LABEL: define <8 x i32> @fold_vector_insert() {
+; CHECK-NEXT:    ret <8 x i32> <i32 9, i32 10, i32 11, i32 12, i32 5, i32 6, i32 7, i32 8>
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <4 x i32> <i32 9, i32 10, i32 11, i32 12>, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @fold_vector_insert_nop() {
+; CHECK-LABEL: define <8 x i32> @fold_vector_insert_nop() {
+; CHECK-NEXT:    ret <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @fold_vector_interleave2() {
+; CHECK-LABEL: define <8 x i32> @fold_vector_interleave2() {
+; CHECK-NEXT:    ret <8 x i32> <i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 4, i32 8>
+;
+  %1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>)
+  ret <8 x i32> %1
+}
+
+define {<4 x i32>, <4 x i32>} @fold_vector_deinterleave2() {
+; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @fold_vector_deinterleave2() {
+; CHECK-NEXT:    ret { <4 x i32>, <4 x i32> } { <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> <i32 5, i32 6, i32 7, i32 8> }
+;
+  %1 = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<8 x i32> <i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 4, i32 8>)
+  ret {<4 x i32>, <4 x i32>} %1
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @fold_scalable_vector_deinterleave2() {
+; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @fold_scalable_vector_deinterleave2() {
+; CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } zeroinitializer
+;
+  %1 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<vscale x 8 x i32> zeroinitializer)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %1
+}

From dc4335a2bf75c7b9928a72a7f15df0276120d7ed Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Wed, 11 Jun 2025 18:22:05 -0500
Subject: [PATCH 0067/1322] [libc] Perform bitfield zero initialization
 wave-parallel (#143607)

Summary:
We need to set the bitfield memory to zero because the system does not
guarantee zeroed out memory. Even if fresh pages are zero, the system
allows re-use so we would need a `kfd` level API to skip this step.

Because we can't this patch updates the logic to perform the zero
initialization wave-parallel. This reduces the amount of time it takes
to allocate a fresh by up to a tenth.

This has the unfortunate side effect that the control flow is more
convoluted and we waste some extra registers, but it's worth it to
reduce the slab allocation latency.
---
 libc/src/__support/GPU/allocator.cpp | 46 +++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index ecc0de1cb6ec..66ab155e5c29 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -129,6 +129,14 @@ static inline constexpr T round_up(const T x) {
   return (x + N) & ~(N - 1);
 }
 
+// Perform a lane parallel memset on a uint32_t pointer.
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t workers = cpp::popcount(uniform);
+  for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
+    s[i] = c;
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
@@ -157,10 +165,15 @@ struct Slab {
     Header *header = reinterpret_cast<Header *>(memory);
     header->chunk_size = chunk_size;
     header->global_index = global_index;
+  }
 
-    // This memset is expensive and likely not necessary for the current 'kfd'
-    // driver. Until zeroed pages are exposed by the API we must be careful.
-    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  // Set the necessary bitfield bytes to zero in parallel using many lanes. This
+  // must be called before the bitfield can be accessed safely, memory is not
+  // guaranteed to be zero initialized in the current implementation.
+  void initialize(uint64_t uniform) {
+    uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
+                    sizeof(uint32_t);
+    impl::uniform_memset(get_bitfield(), 0, size, uniform);
   }
 
   // Get the number of chunks that can theoretically fit inside this slab.
@@ -354,14 +367,7 @@ private:
       void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
-      Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);
-
-      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-      ptr.store(mem, cpp::MemoryOrder::RELAXED);
-      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-      if (!ref.acquire(n, count))
-        ref.reset(n, count);
-      return mem;
+      return new (raw) Slab(cpp::forward<Args>(args)...);
     }
 
     if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
@@ -374,6 +380,16 @@ private:
     return ptr.load(cpp::MemoryOrder::RELAXED);
   }
 
+  // Finalize the associated memory and signal that it is ready to use by
+  // resetting the counter.
+  void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    ptr.store(mem, cpp::MemoryOrder::RELAXED);
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    if (!ref.acquire(n, count))
+      ref.reset(n, count);
+  }
+
 public:
   // Attempt to lock access to the pointer, potentially creating it if empty.
   // The uniform mask represents which lanes share the same pointer. For each
@@ -392,6 +408,14 @@ public:
     if (!result)
       return nullptr;
 
+    // We defer storing the newly allocated slab until now so that we can use
+    // multiple lanes to initialize it and release it for use.
+    if (count == cpp::numeric_limits<uint64_t>::max()) {
+      result->initialize(uniform);
+      if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+        finalize(result, cpp::popcount(uniform), count);
+    }
+
     if (count != cpp::numeric_limits<uint64_t>::max())
       count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
 

From 1ecd108cb7ceda2b11281b5d173e2827feb60c55 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 11 Jun 2025 16:22:17 -0700
Subject: [PATCH 0068/1322] [libc] Migrate stdio tests to ErrnoCheckingTest.
 (#143802)

Reduce the direct use of libc_errno in stdio unit tests by adopting
ErrnoCheckingTest where appropriate.

Also removes the libc_errno.h inclusions from stdlib.h tests that were
accidentally added in d87eea35fac5a34a841c637db8908128409a184e
---
 libc/test/src/stdio/CMakeLists.txt           | 10 ++++++++++
 libc/test/src/stdio/fdopen_test.cpp          | 14 ++++++--------
 libc/test/src/stdio/fgetc_test.cpp           |  5 ++---
 libc/test/src/stdio/fgetc_unlocked_test.cpp  |  5 ++---
 libc/test/src/stdio/fgets_test.cpp           |  6 +++---
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++---------------
 libc/test/src/stdio/fopencookie_test.cpp     | 15 +++++++--------
 libc/test/src/stdio/remove_test.cpp          | 10 +++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 +++++----
 libc/test/src/stdio/setvbuf_test.cpp         |  8 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 +++----
 libc/test/src/stdlib/StrtolTest.h            |  1 -
 libc/test/src/stdlib/strtold_test.cpp        |  1 -
 13 files changed, 52 insertions(+), 59 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 01904a30504e..3627006ec28f 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,6 +20,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -68,6 +69,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -88,6 +90,7 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -109,6 +112,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -426,6 +430,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -440,6 +445,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -456,6 +462,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -476,6 +483,7 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -498,6 +506,7 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -515,6 +524,7 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index 104fc478b100..b53184c30be3 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,20 +9,21 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -52,8 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,8 +64,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -83,7 +82,6 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
-  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 56bde5f0099a..7c652f666a8f 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,12 +14,12 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -33,7 +33,6 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 90429ecf4e82..f4471dd82df1 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,12 +17,12 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -36,7 +36,6 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index abed3d405293..c00a9256af52 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,11 +12,12 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 
-TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -35,7 +36,6 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e624181c795b..e097785832d5 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,17 +17,18 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST(LlvmLibcFILETest, SimpleFileOperations) {
+TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -41,7 +42,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,7 +72,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -80,15 +79,12 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -103,10 +99,8 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -121,21 +115,18 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST(LlvmLibcFILETest, FFlush) {
+TEST_F(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -156,7 +147,7 @@ TEST(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -165,7 +156,6 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
-  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index 03e1ac286b64..bcf5e674141a 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,6 +15,7 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -22,6 +23,7 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
+using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -88,7 +90,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -115,7 +117,6 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -124,7 +125,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -149,7 +150,6 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,7 +178,6 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -192,7 +191,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -223,7 +222,7 @@ TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 84984e26398c..296bff1f5dc1 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,16 +11,17 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -36,10 +37,9 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index ac494a4ecaf8..135fb98c07fb 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,18 +8,19 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
+using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -40,7 +41,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRenameTest, RenameNonExistent) {
+TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 5872943c1bb4..4144bc1bef44 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -14,9 +14,10 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
+using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -52,7 +53,7 @@ TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -102,6 +103,5 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
-  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index 5d482b70064b..e99b382d1211 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,11 +15,12 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 
-TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -36,7 +37,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,7 +57,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 3eeccc5727e7..03f0a6539c78 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,7 +9,6 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index c2f2b9c9a11c..eb4056dc7ba6 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 3c7af175e51c3ab08ac3c442146c2b822f38c01e Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Wed, 11 Jun 2025 16:52:21 -0700
Subject: [PATCH 0069/1322] [libc] Fix stdio tests after #143802 (#143810)

In #143802 the stdio test cleanup missed a few places where errno was
being set to a failing value, and one where the framework needed to
included.
---
 libc/docs/configure.rst                     | 2 +-
 libc/test/src/stdio/fgetc_test.cpp          | 1 +
 libc/test/src/stdio/fgetc_unlocked_test.cpp | 1 +
 libc/test/src/stdio/fgets_test.cpp          | 1 +
 libc/test/src/stdio/setvbuf_test.cpp        | 1 +
 5 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 8d53390ae19b..109412225634 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -29,7 +29,7 @@ to learn about the defaults for your platform and target.
     - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack.
     - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
 * **"errno" options**
-    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
+    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE.
 * **"general" options**
     - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
 * **"math" options**
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 7c652f666a8f..1faa49112fb6 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -33,6 +33,7 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    ASSERT_ERRNO_FAILURE();
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index f4471dd82df1..7b2efe642fb5 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -36,6 +36,7 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    ASSERT_ERRNO_FAILURE();
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index c00a9256af52..2d7c68d49081 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -36,6 +36,7 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  ASSERT_ERRNO_FAILURE();
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 4144bc1bef44..a0936ba79ef7 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,6 +11,7 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"

From 6c72084a578a7a1e4dc1013a1a4a30b72ad5c6ab Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Wed, 11 Jun 2025 16:56:37 -0700
Subject: [PATCH 0070/1322] [bazel] port
 1ecd108cb7ceda2b11281b5d173e2827feb60c55

---
 utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel
index 484d3e5e0a24..505b73fd7711 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel
@@ -122,6 +122,7 @@ libc_test(
         "//libc:mkdirat",
         "//libc:open",
         "//libc:remove",
+        "//libc/test/UnitTest:errno_test_helpers",
     ],
 )
 

From bc7ea63e9c885fbe71dec29581a206bc0543d22a Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Wed, 11 Jun 2025 20:04:27 -0400
Subject: [PATCH 0071/1322] [MemCpyOpt] handle memcpy from memset for
 non-constant sizes (#143727)

Allows forwarding memset to memcpy for mismatching unknown sizes if
overread has undef contents. In that case we can refine the undef bytes
to the memset value.

Refs #140954 which laid some of the groundwork for this.
---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 38 +++++++++----------
 .../MemCpyOpt/variable-sized-memset-memcpy.ll |  6 +--
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 960001bf880c..1c4ec6aa08b4 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1440,7 +1440,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
   int64_t MOffset = 0;
   const DataLayout &DL = MemCpy->getModule()->getDataLayout();
   // We can only transforms memcpy's where the dest of one is the source of the
-  // other, or the memory transfer has a known offset from the memset.
+  // other, or they have a known offset.
   if (MemCpy->getSource() != MemSet->getDest()) {
     std::optional<int64_t> Offset =
         MemCpy->getSource()->getPointerOffsetFrom(MemSet->getDest(), DL);
@@ -1451,28 +1451,28 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
 
   if (MOffset != 0 || MemSetSize != CopySize) {
     // Make sure the memcpy doesn't read any more than what the memset wrote,
-    // other than undef. Don't worry about sizes larger than i64. A known memset
-    // size is required.
+    // other than undef. Don't worry about sizes larger than i64.
     auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
-    if (!CMemSetSize)
-      return false;
-
-    // A known memcpy size is also required.
     auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
-    if (!CCopySize)
-      return false;
-    if (CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) {
+    if (!CMemSetSize || !CCopySize ||
+        CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) {
       if (!overreadUndefContents(MSSA, MemCpy, MemSet, BAA))
         return false;
-      // Clip the memcpy to the bounds of the memset
-      if (MOffset == 0)
-        CopySize = MemSetSize;
-      else
-        CopySize =
-            ConstantInt::get(CopySize->getType(),
-                             CMemSetSize->getZExtValue() <= (uint64_t)MOffset
-                                 ? 0
-                                 : CMemSetSize->getZExtValue() - MOffset);
+
+      if (CMemSetSize && CCopySize) {
+        // If both have constant sizes and offsets, clip the memcpy to the
+        // bounds of the memset if applicable.
+        assert(CCopySize->getZExtValue() + MOffset >
+               CMemSetSize->getZExtValue());
+        if (MOffset == 0)
+          CopySize = MemSetSize;
+        else
+          CopySize =
+              ConstantInt::get(CopySize->getType(),
+                               CMemSetSize->getZExtValue() <= (uint64_t)MOffset
+                                   ? 0
+                                   : CMemSetSize->getZExtValue() - MOffset);
+      }
     }
   }
 
diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
index d5b1ab9b2f29..4b44f8b44f74 100644
--- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
@@ -19,12 +19,12 @@ define void @test(ptr %src, i8 %c, i64 %size) {
 }
 
 ; Differing sizes, but would be UB if size1 < size2 since the memcpy would reference outside of the first alloca
-define void @negative_test(ptr %src, i8 %c, i64 %size1, i64 %size2) {
-; CHECK-LABEL: @negative_test(
+define void @dynsize_test(ptr %src, i8 %c, i64 %size1, i64 %size2) {
+; CHECK-LABEL: @dynsize_test(
 ; CHECK-NEXT:    [[DST1:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
 ; CHECK-NEXT:    [[DST2:%.*]] = alloca i8, i64 [[SIZE2:%.*]], align 1
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[DST1]], i8 [[C:%.*]], i64 [[SIZE1]], i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[DST1]], i64 [[SIZE2]], i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[DST2]], i8 [[C]], i64 [[SIZE2]], i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dst1 = alloca i8, i64 %size1

From d7c6cad744bc7ed28535dc6f75629902eda559ea Mon Sep 17 00:00:00 2001
From: Jake Egan <Jake.egan@ibm.com>
Date: Wed, 11 Jun 2025 20:22:15 -0400
Subject: [PATCH 0072/1322] [sanitizer_common] Implement interception on AIX
 (#138606)

Adjust AIX interceptor support in sanitizer_common.

Issue: https://github.com/llvm/llvm-project/issues/138916
---
 .../sanitizer_common_interceptors.inc         | 43 ++++++++-----
 .../sanitizer_common_interceptors_ioctl.inc   |  2 +
 ...izer_common_interceptors_memintrinsics.inc |  8 ++-
 .../sanitizer_platform_interceptors.h         | 61 +++++++++++--------
 .../sanitizer_redefine_builtins.h             |  2 +-
 5 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 9272e2ab6cbd..2d6cf7fc3282 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -481,7 +481,8 @@ INTERCEPTOR(char*, textdomain, const char *domainname) {
 #endif
 
 #if SANITIZER_INTERCEPT_STRCMP || SANITIZER_INTERCEPT_MEMCMP
-static inline int CharCmpX(unsigned char c1, unsigned char c2) {
+[[maybe_unused]] static inline int CharCmpX(unsigned char c1,
+                                            unsigned char c2) {
   return (c1 == c2) ? 0 : (c1 < c2) ? -1 : 1;
 }
 #endif
@@ -1350,7 +1351,8 @@ INTERCEPTOR(unsigned long, time, unsigned long *t) {
 #if SANITIZER_INTERCEPT_LOCALTIME_AND_FRIENDS
 static void unpoison_tm(void *ctx, __sanitizer_tm *tm) {
   COMMON_INTERCEPTOR_WRITE_RANGE(ctx, tm, sizeof(*tm));
-#if !SANITIZER_SOLARIS
+// AIX tm struct does not have tm_zone field.
+#  if !SANITIZER_SOLARIS && !SANITIZER_AIX
   if (tm->tm_zone) {
     // Can not use COMMON_INTERCEPTOR_WRITE_RANGE here, because tm->tm_zone
     // can point to shared memory and tsan would report a data race.
@@ -1735,10 +1737,12 @@ INTERCEPTOR(int, __vsprintf_chk, char *str, int flag, SIZE_T size_to,
 VSPRINTF_INTERCEPTOR_IMPL(vsprintf, str, format, ap)
 #endif
 
+#  if SANITIZER_INTERCEPT_VASPRINTF
 INTERCEPTOR(int, vasprintf, char **strp, const char *format, va_list ap)
 VASPRINTF_INTERCEPTOR_IMPL(vasprintf, strp, format, ap)
+#  endif
 
-#if SANITIZER_INTERCEPT_ISOC99_PRINTF
+#  if SANITIZER_INTERCEPT_ISOC99_PRINTF
 INTERCEPTOR(int, __isoc99_vprintf, const char *format, va_list ap)
 VPRINTF_INTERCEPTOR_IMPL(__isoc99_vprintf, format, ap)
 
@@ -1787,10 +1791,12 @@ INTERCEPTOR(int, __snprintf_chk, char *str, SIZE_T size, int flag,
 FORMAT_INTERCEPTOR_IMPL(__snprintf_chk, vsnprintf, str, size, format)
 #endif
 
+#  if SANITIZER_INTERCEPT_ASPRINTF
 INTERCEPTOR(int, asprintf, char **strp, const char *format, ...)
 FORMAT_INTERCEPTOR_IMPL(asprintf, vasprintf, strp, format)
+#  endif
 
-#if SANITIZER_INTERCEPT_ISOC99_PRINTF
+#  if SANITIZER_INTERCEPT_ISOC99_PRINTF
 INTERCEPTOR(int, __isoc99_printf, const char *format, ...)
 FORMAT_INTERCEPTOR_IMPL(__isoc99_printf, __isoc99_vprintf, format)
 
@@ -1811,17 +1817,24 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_snprintf, __isoc99_vsnprintf, str, size,
 #endif  // SANITIZER_INTERCEPT_PRINTF
 
 #if SANITIZER_INTERCEPT_PRINTF
-#define INIT_PRINTF                     \
-  COMMON_INTERCEPT_FUNCTION_LDBL(printf);    \
-  COMMON_INTERCEPT_FUNCTION_LDBL(sprintf);   \
-  COMMON_INTERCEPT_FUNCTION_LDBL(snprintf);  \
-  COMMON_INTERCEPT_FUNCTION_LDBL(asprintf);  \
-  COMMON_INTERCEPT_FUNCTION_LDBL(fprintf);   \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vprintf);   \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vsprintf);  \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vsnprintf); \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vasprintf); \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vfprintf);
+#  define INIT_PRINTF_COMMON                   \
+    COMMON_INTERCEPT_FUNCTION_LDBL(printf);    \
+    COMMON_INTERCEPT_FUNCTION_LDBL(sprintf);   \
+    COMMON_INTERCEPT_FUNCTION_LDBL(snprintf);  \
+    COMMON_INTERCEPT_FUNCTION_LDBL(fprintf);   \
+    COMMON_INTERCEPT_FUNCTION_LDBL(vprintf);   \
+    COMMON_INTERCEPT_FUNCTION_LDBL(vsprintf);  \
+    COMMON_INTERCEPT_FUNCTION_LDBL(vsnprintf); \
+    COMMON_INTERCEPT_FUNCTION_LDBL(vfprintf);
+#  if !SANITIZER_AIX
+// AIX does not have [v]asprintf.
+#    define INIT_PRINTF_EXTRA                   \
+      COMMON_INTERCEPT_FUNCTION_LDBL(asprintf); \
+      COMMON_INTERCEPT_FUNCTION_LDBL(vasprintf);
+#  else
+#    define INIT_PRINTF_EXTRA
+#  endif
+#  define INIT_PRINTF INIT_PRINTF_COMMON INIT_PRINTF_EXTRA
 #else
 #define INIT_PRINTF
 #endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
index bc8f02826c61..08c2be47f535 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
@@ -79,7 +79,9 @@ static void ioctl_table_fill() {
   _(TIOCMSET, READ, sizeof(int));
   _(TIOCNXCL, NONE, 0);
   _(TIOCOUTQ, WRITE, sizeof(int));
+#  if !SANITIZER_AIX
   _(TIOCSCTTY, NONE, 0);
+#  endif
   _(TIOCSPGRP, READ, pid_t_sz);
   _(TIOCSWINSZ, READ, struct_winsize_sz);
 
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc
index 1565a494140f..0b6731c89950 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc
@@ -33,11 +33,13 @@
 
 // Platform-specific options.
 #if SANITIZER_APPLE
-#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
+#  define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
 #elif SANITIZER_WINDOWS64
-#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
+#  define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
+#elif SANITIZER_AIX
+#  define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
 #else
-#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1
+#  define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1
 #endif  // SANITIZER_APPLE
 
 #ifndef COMMON_INTERCEPTOR_MEMSET_IMPL
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
index 4bc55d7801db..ccc808b60ca7 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -141,6 +141,12 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SI_SOLARIS 0
 #endif
 
+#if SANITIZER_AIX
+#  define SI_NOT_AIX 0
+#else
+#  define SI_NOT_AIX 1
+#endif
+
 #if SANITIZER_SOLARIS32
 #define SI_SOLARIS32 1
 #else
@@ -161,20 +167,20 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 
 #define SANITIZER_INTERCEPT_STRLEN SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_STRNLEN (SI_NOT_MAC && SI_NOT_FUCHSIA)
-#define SANITIZER_INTERCEPT_STRCMP SI_NOT_FUCHSIA
+#define SANITIZER_INTERCEPT_STRCMP (SI_NOT_FUCHSIA && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_STRSTR SI_NOT_FUCHSIA
-#define SANITIZER_INTERCEPT_STRCASESTR SI_POSIX
+#define SANITIZER_INTERCEPT_STRCASESTR (SI_POSIX && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_STRTOK SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_STRCHR SI_NOT_FUCHSIA
-#define SANITIZER_INTERCEPT_STRCHRNUL SI_POSIX_NOT_MAC
+#define SANITIZER_INTERCEPT_STRCHRNUL (SI_POSIX_NOT_MAC && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_STRRCHR SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_STRSPN SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_STRPBRK SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_TEXTDOMAIN SI_LINUX_NOT_ANDROID || SI_SOLARIS
 #define SANITIZER_INTERCEPT_STRCASECMP SI_POSIX
 #define SANITIZER_INTERCEPT_MEMSET 1
-#define SANITIZER_INTERCEPT_MEMMOVE 1
-#define SANITIZER_INTERCEPT_MEMCPY 1
+#define SANITIZER_INTERCEPT_MEMMOVE SI_NOT_AIX
+#define SANITIZER_INTERCEPT_MEMCPY SI_NOT_AIX
 #define SANITIZER_INTERCEPT_MEMCMP SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_BCMP \
   SANITIZER_INTERCEPT_MEMCMP &&  \
@@ -233,9 +239,11 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_ISOC99_SCANF SI_GLIBC
 
 #ifndef SANITIZER_INTERCEPT_PRINTF
-#define SANITIZER_INTERCEPT_PRINTF SI_POSIX
-#define SANITIZER_INTERCEPT_PRINTF_L (SI_FREEBSD || SI_NETBSD)
-#define SANITIZER_INTERCEPT_ISOC99_PRINTF SI_GLIBC
+#  define SANITIZER_INTERCEPT_ASPRINTF SI_NOT_AIX
+#  define SANITIZER_INTERCEPT_VASPRINTF SI_NOT_AIX
+#  define SANITIZER_INTERCEPT_PRINTF SI_POSIX
+#  define SANITIZER_INTERCEPT_PRINTF_L (SI_FREEBSD || SI_NETBSD)
+#  define SANITIZER_INTERCEPT_ISOC99_PRINTF SI_GLIBC
 #endif
 
 #define SANITIZER_INTERCEPT_SETPROCTITLE (SI_FREEBSD || SI_NETBSD)
@@ -243,8 +251,9 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT___PRINTF_CHK \
   (SANITIZER_INTERCEPT_PRINTF && SI_GLIBC)
 
-#define SANITIZER_INTERCEPT_FREXP SI_NOT_FUCHSIA
-#define SANITIZER_INTERCEPT_FREXPF SI_POSIX
+// AIX libc does not export FREXP and FREXPF.
+#define SANITIZER_INTERCEPT_FREXP (SI_NOT_FUCHSIA && SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_FREXPF (SI_POSIX && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_FREXPL SI_POSIX
 
 #define SANITIZER_INTERCEPT_GETPWNAM_AND_FRIENDS SI_POSIX
@@ -294,7 +303,7 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_ACCEPT4 \
   (SI_LINUX_NOT_ANDROID || SI_NETBSD || SI_FREEBSD)
 #define SANITIZER_INTERCEPT_PACCEPT SI_NETBSD
-#define SANITIZER_INTERCEPT_MODF SI_POSIX
+#define SANITIZER_INTERCEPT_MODF (SI_POSIX && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_RECVMSG SI_POSIX
 #define SANITIZER_INTERCEPT_SENDMSG SI_POSIX
 #define SANITIZER_INTERCEPT_RECVMMSG SI_LINUX
@@ -329,8 +338,9 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT___WCSXFRM_L SI_LINUX
 #define SANITIZER_INTERCEPT_WCSNRTOMBS \
   (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
-#define SANITIZER_INTERCEPT_WCRTOMB \
-  (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
+#define SANITIZER_INTERCEPT_WCRTOMB                                           \
+  (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS || \
+   !SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_WCTOMB \
   (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
 #define SANITIZER_INTERCEPT_TCGETATTR SI_LINUX_NOT_ANDROID || SI_SOLARIS
@@ -370,7 +380,8 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_GETMNTENT_R SI_LINUX_NOT_ANDROID
 #define SANITIZER_INTERCEPT_STATFS \
   (SI_FREEBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
-#define SANITIZER_INTERCEPT_STATFS64 SI_GLIBC && SANITIZER_HAS_STATFS64
+#define SANITIZER_INTERCEPT_STATFS64 \
+  ((SI_GLIBC || !SI_NOT_AIX) && SANITIZER_HAS_STATFS64)
 #define SANITIZER_INTERCEPT_STATVFS \
   (SI_FREEBSD || SI_NETBSD || SI_LINUX_NOT_ANDROID)
 #define SANITIZER_INTERCEPT_STATVFS64 SI_GLIBC
@@ -419,10 +430,10 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_TTYNAME_R SI_POSIX
 #define SANITIZER_INTERCEPT_TEMPNAM SI_POSIX
 #define SANITIZER_INTERCEPT_SINCOS SI_LINUX || SI_SOLARIS
-#define SANITIZER_INTERCEPT_REMQUO SI_POSIX
-#define SANITIZER_INTERCEPT_REMQUOL (SI_POSIX && !SI_NETBSD)
-#define SANITIZER_INTERCEPT_LGAMMA SI_POSIX
-#define SANITIZER_INTERCEPT_LGAMMAL (SI_POSIX && !SI_NETBSD)
+#define SANITIZER_INTERCEPT_REMQUO (SI_POSIX && SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_REMQUOL (SI_POSIX && !SI_NETBSD && SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_LGAMMA (SI_POSIX && SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_LGAMMAL (SI_POSIX && !SI_NETBSD && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_LGAMMA_R (SI_FREEBSD || SI_LINUX || SI_SOLARIS)
 #define SANITIZER_INTERCEPT_LGAMMAL_R SI_LINUX_NOT_ANDROID || SI_SOLARIS
 #define SANITIZER_INTERCEPT_DRAND48_R SI_GLIBC
@@ -505,11 +516,13 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_EVENTFD_READ_WRITE (SI_LINUX || SI_FREEBSD)
 
 #define SI_STAT_LINUX (SI_LINUX && __GLIBC_PREREQ(2, 33))
-#define SANITIZER_INTERCEPT_STAT                                        \
-  (SI_FREEBSD || SI_MAC || SI_ANDROID || SI_NETBSD || SI_SOLARIS ||     \
-   SI_STAT_LINUX)
-#define SANITIZER_INTERCEPT_STAT64 SI_STAT_LINUX && SANITIZER_HAS_STAT64
-#define SANITIZER_INTERCEPT_LSTAT (SI_NETBSD || SI_FREEBSD || SI_STAT_LINUX)
+#define SANITIZER_INTERCEPT_STAT                                    \
+  (SI_FREEBSD || SI_MAC || SI_ANDROID || SI_NETBSD || SI_SOLARIS || \
+   SI_STAT_LINUX || !SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_STAT64 \
+  ((SI_STAT_LINUX || !SI_NOT_AIX) && SANITIZER_HAS_STAT64)
+#define SANITIZER_INTERCEPT_LSTAT \
+  (SI_NETBSD || SI_FREEBSD || SI_STAT_LINUX || !SI_NOT_AIX)
 #define SANITIZER_INTERCEPT___XSTAT \
   ((!SANITIZER_INTERCEPT_STAT && SI_POSIX) || SI_STAT_LINUX)
 #define SANITIZER_INTERCEPT___XSTAT64 SI_GLIBC
@@ -578,7 +591,7 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_PROTOENT_R SI_GLIBC
 #define SANITIZER_INTERCEPT_NETENT (SI_LINUX || SI_NETBSD || SI_FREEBSD)
 #define SANITIZER_INTERCEPT_SETVBUF \
-  (SI_NETBSD || SI_FREEBSD || SI_LINUX || SI_MAC)
+  (SI_NETBSD || SI_FREEBSD || SI_LINUX || SI_MAC || !SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_GETMNTINFO (SI_NETBSD || SI_FREEBSD || SI_MAC)
 #define SANITIZER_INTERCEPT_MI_VECTOR_HASH SI_NETBSD
 #define SANITIZER_INTERCEPT_GETVFSSTAT SI_NETBSD
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h b/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h
index 41e0613d6fc1..bda0f0468769 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h
@@ -15,7 +15,7 @@
 #    define SANITIZER_REDEFINE_BUILTINS_H
 
 // The asm hack only works with GCC and Clang.
-#    if !defined(_WIN32)
+#    if !defined(_WIN32) && !defined(_AIX)
 
 asm(R"(
     .set memcpy, __sanitizer_internal_memcpy

From 7a3bcf9f7179e6904d405de36360714da07c31ba Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Wed, 11 Jun 2025 21:50:35 +0800
Subject: [PATCH 0073/1322] [RISCV] Add missing predicate for PseudoTHVdotVMAQA
 family instructions

---
 llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index 2fccbcaf2cf3..89441444a994 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -660,10 +660,12 @@ def : Pat<(i32 (sub GPR:$rd, (mul (sexti16 (i32 GPR:$rs1)),
           (TH_MULSH GPR:$rd, GPR:$rs1, GPR:$rs2)>;
 } // Predicates = [HasVendorXTHeadMac, IsRV32]
 
+let Predicates = [HasVendorXTHeadVdot] in {
 defm PseudoTHVdotVMAQA      : VPseudoVMAQA_VV_VX;
 defm PseudoTHVdotVMAQAU     : VPseudoVMAQA_VV_VX;
 defm PseudoTHVdotVMAQASU    : VPseudoVMAQA_VV_VX;
 defm PseudoTHVdotVMAQAUS    : VPseudoVMAQA_VX;
+}
 
 let Predicates = [HasVendorXTHeadVdot] in {
 defm : VPatTernaryVMAQA_VV_VX<"int_riscv_th_vmaqa",  "PseudoTHVdotVMAQA",

From 7034014d08249a1e159a668a71e96a0b78636a39 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes@amd.com>
Date: Wed, 11 Jun 2025 18:07:00 -0700
Subject: [PATCH 0074/1322] [InstCombine] Combine or-disjoint (and->mul),
 (and->mul) to and->mul (#136013)

The canonical pattern for bitmasked mul is currently

```
%val = and %x, %bitMask // where %bitMask is some constant
%cmp = icmp eq %val, 0
%sel = select %cmp, 0, %C // where %C is some constant = C' * %bitMask
```

In certain cases, where we are combining multiple of these bitmasked
muls with common factors, we are able to optimize into and->mul (see
https://github.com/llvm/llvm-project/pull/135274 )

This optimization lends itself to further optimizations. This PR
addresses one of such optimizations.

In cases where we have

`or-disjoint ( mul(and (X, C1), D) , mul (and (X, C2), D))`

we can combine into

`mul( and (X, (C1 + C2)), D) `

provided C1 and C2 are disjoint.

Generalized proof: https://alive2.llvm.org/ce/z/MQYMui
---
 .../InstCombine/InstCombineAndOrXor.cpp       | 124 ++++++++++++------
 .../test/Transforms/InstCombine/or-bitmask.ll | 116 ++++++++++++++--
 2 files changed, 187 insertions(+), 53 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index c6c231f81c4a..dce695a03600 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3592,6 +3592,73 @@ static Value *foldOrOfInversions(BinaryOperator &I,
   return nullptr;
 }
 
+// A decomposition of ((X & Mask) * Factor). The NUW / NSW bools
+// track these properities for preservation. Note that we can decompose
+// equivalent select form of this expression (e.g. (!(X & Mask) ? 0 : Mask *
+// Factor))
+struct DecomposedBitMaskMul {
+  Value *X;
+  APInt Factor;
+  APInt Mask;
+  bool NUW;
+  bool NSW;
+};
+
+static std::optional<DecomposedBitMaskMul> matchBitmaskMul(Value *V) {
+  Instruction *Op = dyn_cast<Instruction>(V);
+  if (!Op)
+    return std::nullopt;
+
+  // Decompose (A & N) * C) into BitMaskMul
+  Value *Original = nullptr;
+  const APInt *Mask = nullptr;
+  const APInt *MulConst = nullptr;
+  if (match(Op, m_Mul(m_And(m_Value(Original), m_APInt(Mask)),
+                      m_APInt(MulConst)))) {
+    if (MulConst->isZero() || Mask->isZero())
+      return std::nullopt;
+
+    return std::optional<DecomposedBitMaskMul>(
+        {Original, *MulConst, *Mask,
+         cast<BinaryOperator>(Op)->hasNoUnsignedWrap(),
+         cast<BinaryOperator>(Op)->hasNoSignedWrap()});
+  }
+
+  Value *Cond = nullptr;
+  const APInt *EqZero = nullptr, *NeZero = nullptr;
+
+  // Decompose ((A & N) ? 0 : N * C) into BitMaskMul
+  if (match(Op, m_Select(m_Value(Cond), m_APInt(EqZero), m_APInt(NeZero)))) {
+    auto ICmpDecompose =
+        decomposeBitTest(Cond, /*LookThruTrunc=*/true,
+                         /*AllowNonZeroC=*/false, /*DecomposeBitMask=*/true);
+    if (!ICmpDecompose.has_value())
+      return std::nullopt;
+
+    assert(ICmpInst::isEquality(ICmpDecompose->Pred) &&
+           ICmpDecompose->C.isZero());
+
+    if (ICmpDecompose->Pred == ICmpInst::ICMP_NE)
+      std::swap(EqZero, NeZero);
+
+    if (!EqZero->isZero() || NeZero->isZero())
+      return std::nullopt;
+
+    if (!ICmpDecompose->Mask.isPowerOf2() || ICmpDecompose->Mask.isZero() ||
+        NeZero->getBitWidth() != ICmpDecompose->Mask.getBitWidth())
+      return std::nullopt;
+
+    if (!NeZero->urem(ICmpDecompose->Mask).isZero())
+      return std::nullopt;
+
+    return std::optional<DecomposedBitMaskMul>(
+        {ICmpDecompose->X, NeZero->udiv(ICmpDecompose->Mask),
+         ICmpDecompose->Mask, /*NUW=*/false, /*NSW=*/false});
+  }
+
+  return std::nullopt;
+}
+
 // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
 // here. We should standardize that construct where it is needed or choose some
 // other way to ensure that commutated variants of patterns are not missed.
@@ -3674,49 +3741,26 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
                                    /*NSW=*/true, /*NUW=*/true))
       return R;
 
-    Value *Cond0 = nullptr, *Cond1 = nullptr;
-    const APInt *Op0Eq = nullptr, *Op0Ne = nullptr;
-    const APInt *Op1Eq = nullptr, *Op1Ne = nullptr;
+    // (A & N) * C + (A & M) * C -> (A & (N + M)) & C
+    // This also accepts the equivalent select form of (A & N) * C
+    // expressions i.e. !(A & N) ? 0 : N * C)
+    auto Decomp1 = matchBitmaskMul(I.getOperand(1));
+    if (Decomp1) {
+      auto Decomp0 = matchBitmaskMul(I.getOperand(0));
+      if (Decomp0 && Decomp0->X == Decomp1->X &&
+          (Decomp0->Mask & Decomp1->Mask).isZero() &&
+          Decomp0->Factor == Decomp1->Factor) {
 
-    //  (!(A & N) ? 0 : N * C) + (!(A & M) ? 0 : M * C) -> A & (N + M) * C
-    if (match(I.getOperand(0),
-              m_Select(m_Value(Cond0), m_APInt(Op0Eq), m_APInt(Op0Ne))) &&
-        match(I.getOperand(1),
-              m_Select(m_Value(Cond1), m_APInt(Op1Eq), m_APInt(Op1Ne)))) {
+        Value *NewAnd = Builder.CreateAnd(
+            Decomp0->X, ConstantInt::get(Decomp0->X->getType(),
+                                         (Decomp0->Mask + Decomp1->Mask)));
 
-      auto LHSDecompose =
-          decomposeBitTest(Cond0, /*LookThruTrunc=*/true,
-                           /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
-      auto RHSDecompose =
-          decomposeBitTest(Cond1, /*LookThruTrunc=*/true,
-                           /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
+        auto *Combined = BinaryOperator::CreateMul(
+            NewAnd, ConstantInt::get(NewAnd->getType(), Decomp1->Factor));
 
-      if (LHSDecompose && RHSDecompose && LHSDecompose->X == RHSDecompose->X &&
-          RHSDecompose->Mask.isPowerOf2() && LHSDecompose->Mask.isPowerOf2() &&
-          LHSDecompose->Mask != RHSDecompose->Mask &&
-          LHSDecompose->Mask.getBitWidth() == Op0Ne->getBitWidth() &&
-          RHSDecompose->Mask.getBitWidth() == Op1Ne->getBitWidth()) {
-        assert(Op0Ne->getBitWidth() == Op1Ne->getBitWidth());
-        assert(ICmpInst::isEquality(LHSDecompose->Pred));
-        if (LHSDecompose->Pred == ICmpInst::ICMP_NE)
-          std::swap(Op0Eq, Op0Ne);
-        if (RHSDecompose->Pred == ICmpInst::ICMP_NE)
-          std::swap(Op1Eq, Op1Ne);
-
-        if (!Op0Ne->isZero() && !Op1Ne->isZero() && Op0Eq->isZero() &&
-            Op1Eq->isZero() && Op0Ne->urem(LHSDecompose->Mask).isZero() &&
-            Op1Ne->urem(RHSDecompose->Mask).isZero() &&
-            Op0Ne->udiv(LHSDecompose->Mask) ==
-                Op1Ne->udiv(RHSDecompose->Mask)) {
-          auto NewAnd = Builder.CreateAnd(
-              LHSDecompose->X,
-              ConstantInt::get(LHSDecompose->X->getType(),
-                               (LHSDecompose->Mask + RHSDecompose->Mask)));
-
-          return BinaryOperator::CreateMul(
-              NewAnd, ConstantInt::get(NewAnd->getType(),
-                                       Op0Ne->udiv(LHSDecompose->Mask)));
-        }
+        Combined->setHasNoUnsignedWrap(Decomp0->NUW && Decomp1->NUW);
+        Combined->setHasNoSignedWrap(Decomp0->NSW && Decomp1->NSW);
+        return Combined;
       }
     }
   }
diff --git a/llvm/test/Transforms/InstCombine/or-bitmask.ll b/llvm/test/Transforms/InstCombine/or-bitmask.ll
index 3b482dc1794d..3c992dfea569 100644
--- a/llvm/test/Transforms/InstCombine/or-bitmask.ll
+++ b/llvm/test/Transforms/InstCombine/or-bitmask.ll
@@ -36,13 +36,9 @@ define i32 @add_select_cmp_and2(i32 %in) {
 
 define i32 @add_select_cmp_and3(i32 %in) {
 ; CHECK-LABEL: @add_select_cmp_and3(
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
-; CHECK-NEXT:    [[TEMP:%.*]] = mul nuw nsw i32 [[TMP1]], 72
-; CHECK-NEXT:    [[BITOP2:%.*]] = and i32 [[IN]], 4
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[BITOP2]], 0
-; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[CMP2]], i32 0, i32 288
-; CHECK-NEXT:    [[OUT:%.*]] = or disjoint i32 [[TEMP]], [[SEL2]]
-; CHECK-NEXT:    ret i32 [[OUT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 7
+; CHECK-NEXT:    [[TEMP1:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    ret i32 [[TEMP1]]
 ;
   %bitop0 = and i32 %in, 1
   %cmp0 = icmp eq i32 %bitop0, 0
@@ -60,12 +56,9 @@ define i32 @add_select_cmp_and3(i32 %in) {
 
 define i32 @add_select_cmp_and4(i32 %in) {
 ; CHECK-LABEL: @add_select_cmp_and4(
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
-; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[IN]], 12
-; CHECK-NEXT:    [[TEMP3:%.*]] = mul nuw nsw i32 [[TMP2]], 72
-; CHECK-NEXT:    [[OUT1:%.*]] = or disjoint i32 [[OUT]], [[TEMP3]]
-; CHECK-NEXT:    ret i32 [[OUT1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[IN:%.*]], 15
+; CHECK-NEXT:    [[TEMP2:%.*]] = mul nuw nsw i32 [[TMP2]], 72
+; CHECK-NEXT:    ret i32 [[TEMP2]]
 ;
   %bitop0 = and i32 %in, 1
   %cmp0 = icmp eq i32 %bitop0, 0
@@ -361,6 +354,103 @@ define i64 @mask_select_types_1(i64 %in) {
   ret i64 %out
 }
 
+define i32 @add_select_cmp_mixed1(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_mixed1(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
+; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %mask = and i32 %in, 1
+  %sel0 = mul i32 %mask, 72
+  %bitop1 = and i32 %in, 2
+  %cmp1 = icmp eq i32 %bitop1, 0
+  %sel1 = select i1 %cmp1, i32 0, i32 144
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @add_select_cmp_mixed2(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_mixed2(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
+; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %bitop0 = and i32 %in, 1
+  %cmp0 = icmp eq i32 %bitop0, 0
+  %mask = and i32 %in, 2
+  %sel0 = select i1 %cmp0, i32 0, i32 72
+  %sel1 = mul i32 %mask, 72
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @add_select_cmp_and_mul(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_and_mul(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
+; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %mask0 = and i32 %in, 1
+  %sel0 = mul i32 %mask0, 72
+  %mask1 = and i32 %in, 2
+  %sel1 = mul i32 %mask1, 72
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @add_select_cmp_mixed2_mismatch(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_mixed2_mismatch(
+; CHECK-NEXT:    [[BITOP0:%.*]] = and i32 [[IN:%.*]], 1
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[BITOP0]], 0
+; CHECK-NEXT:    [[MASK:%.*]] = and i32 [[IN]], 2
+; CHECK-NEXT:    [[SEL0:%.*]] = select i1 [[CMP0]], i32 0, i32 73
+; CHECK-NEXT:    [[SEL1:%.*]] = mul nuw nsw i32 [[MASK]], 72
+; CHECK-NEXT:    [[OUT:%.*]] = or disjoint i32 [[SEL0]], [[SEL1]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %bitop0 = and i32 %in, 1
+  %cmp0 = icmp eq i32 %bitop0, 0
+  %mask = and i32 %in, 2
+  %sel0 = select i1 %cmp0, i32 0, i32 73
+  %sel1 = mul i32 %mask, 72
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @add_select_cmp_and_mul_mismatch(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_and_mul_mismatch(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[IN:%.*]] to i1
+; CHECK-NEXT:    [[SEL0:%.*]] = select i1 [[TMP1]], i32 73, i32 0
+; CHECK-NEXT:    [[MASK1:%.*]] = and i32 [[IN]], 2
+; CHECK-NEXT:    [[SEL1:%.*]] = mul nuw nsw i32 [[MASK1]], 72
+; CHECK-NEXT:    [[OUT:%.*]] = or disjoint i32 [[SEL0]], [[SEL1]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %mask0 = and i32 %in, 1
+  %sel0 = mul i32 %mask0, 73
+  %mask1 = and i32 %in, 2
+  %sel1 = mul i32 %mask1, 72
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @and_mul_non_disjoint(i32 %in) {
+; CHECK-LABEL: @and_mul_non_disjoint(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 2
+; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    [[MASK1:%.*]] = and i32 [[IN]], 4
+; CHECK-NEXT:    [[SEL1:%.*]] = mul nuw nsw i32 [[MASK1]], 72
+; CHECK-NEXT:    [[OUT1:%.*]] = or i32 [[OUT]], [[SEL1]]
+; CHECK-NEXT:    ret i32 [[OUT1]]
+;
+  %mask0 = and i32 %in, 2
+  %sel0 = mul i32 %mask0, 72
+  %mask1 = and i32 %in, 4
+  %sel1 = mul i32 %mask1, 72
+  %out = or i32 %sel0, %sel1
+  ret i32 %out
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CONSTSPLAT: {{.*}}
 ; CONSTVEC: {{.*}}

From c4316180418ce8de4b4c9812c7fac791d55b6102 Mon Sep 17 00:00:00 2001
From: Shunsuke Watanabe <watanabe.shu-06@fujitsu.com>
Date: Thu, 12 Jun 2025 10:19:26 +0900
Subject: [PATCH 0075/1322] [Clang][Driver] Override complex number calculation
 method by -fno-fast-math (#132680)

This patch fixes a bug where -fno-fast-math doesn't revert the complex
number calculation method to the default. The priority of overriding
options related to complex number calculations differs slightly from
GCC, as discussed in:


https://discourse.llvm.org/t/the-priority-of-fno-fast-math-regarding-complex-number-calculations/84679
---
 clang/lib/Driver/ToolChains/Clang.cpp |  22 +++++-
 clang/test/Driver/range.c             | 100 +++++++++++++++++++++++---
 2 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a74fa81f3cf5..1d11be1d82be 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -2831,8 +2831,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   StringRef Float16ExcessPrecision = "";
   StringRef BFloat16ExcessPrecision = "";
   LangOptions::ComplexRangeKind Range = LangOptions::ComplexRangeKind::CX_None;
-  std::string ComplexRangeStr = "";
-  std::string GccRangeComplexOption = "";
+  std::string ComplexRangeStr;
+  std::string GccRangeComplexOption;
+  std::string LastComplexRangeOption;
 
   auto setComplexRange = [&](LangOptions::ComplexRangeKind NewRange) {
     // Warn if user expects to perform full implementation of complex
@@ -2916,6 +2917,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
           EmitComplexRangeDiag(D, GccRangeComplexOption, "-fcx-limited-range");
       }
       GccRangeComplexOption = "-fcx-limited-range";
+      LastComplexRangeOption = A->getSpelling();
       Range = LangOptions::ComplexRangeKind::CX_Basic;
       break;
     case options::OPT_fno_cx_limited_range:
@@ -2929,6 +2931,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
                                "-fno-cx-limited-range");
       }
       GccRangeComplexOption = "-fno-cx-limited-range";
+      LastComplexRangeOption = A->getSpelling();
       Range = LangOptions::ComplexRangeKind::CX_Full;
       break;
     case options::OPT_fcx_fortran_rules:
@@ -2938,6 +2941,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       else
         EmitComplexRangeDiag(D, GccRangeComplexOption, "-fcx-fortran-rules");
       GccRangeComplexOption = "-fcx-fortran-rules";
+      LastComplexRangeOption = A->getSpelling();
       Range = LangOptions::ComplexRangeKind::CX_Improved;
       break;
     case options::OPT_fno_cx_fortran_rules:
@@ -2950,6 +2954,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
                                "-fno-cx-fortran-rules");
       }
       GccRangeComplexOption = "-fno-cx-fortran-rules";
+      LastComplexRangeOption = A->getSpelling();
       Range = LangOptions::ComplexRangeKind::CX_Full;
       break;
     case options::OPT_fcomplex_arithmetic_EQ: {
@@ -2984,6 +2989,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
                                  ComplexArithmeticStr(RangeVal));
         }
       }
+      LastComplexRangeOption =
+          Args.MakeArgString(A->getSpelling() + A->getValue());
       Range = RangeVal;
       break;
     }
@@ -3037,6 +3044,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       } else
         D.Diag(diag::err_drv_unsupported_option_argument)
             << A->getSpelling() << Val;
+      LastComplexRangeOption = A->getSpelling();
       break;
     }
 
@@ -3222,6 +3230,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       [[fallthrough]];
     case options::OPT_ffast_math:
       applyFastMath(true);
+      LastComplexRangeOption = A->getSpelling();
       if (A->getOption().getID() == options::OPT_Ofast)
         LastFpContractOverrideOption = "-Ofast";
       else
@@ -3239,6 +3248,15 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       ApproxFunc = false;
       SignedZeros = true;
       restoreFPContractState();
+      // If the last specified option related to complex range is not
+      // -ffast-math or -ffp-model=, emit warning.
+      if (LastComplexRangeOption != "-ffast-math" &&
+          LastComplexRangeOption != "-ffp-model=" &&
+          Range != LangOptions::ComplexRangeKind::CX_Full)
+        EmitComplexRangeDiag(D, LastComplexRangeOption, "-fno-fast-math");
+      Range = LangOptions::ComplexRangeKind::CX_None;
+      LastComplexRangeOption = "";
+      GccRangeComplexOption = "";
       LastFpContractOverrideOption = "";
       break;
     } // End switch (A->getOption().getID())
diff --git a/clang/test/Driver/range.c b/clang/test/Driver/range.c
index da5748d7c723..30140f3c208e 100644
--- a/clang/test/Driver/range.c
+++ b/clang/test/Driver/range.c
@@ -177,14 +177,83 @@
 // RUN: %clang -### -target x86_64 -ffast-math -fcomplex-arithmetic=basic -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=BASIC %s
 
-// BASIC: -complex-range=basic
-// FULL: -complex-range=full
-// PRMTD: -complex-range=promoted
-// BASIC-NOT: -complex-range=improved
-// CHECK-NOT: -complex-range=basic
-// IMPRVD: -complex-range=improved
-// IMPRVD-NOT: -complex-range=basic
-// CHECK-NOT: -complex-range=improved
+// RUN: %clang -### --target=x86_64 -fcx-limited-range -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN21 %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-cx-limited-range -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### --target=x86_64 -fcx-fortran-rules -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN22 %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-cx-fortran-rules -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffast-math -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=basic -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN23 %s
+
+// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=promoted -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN24 %s
+
+// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=improved -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN25 %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fcomplex-arithmetic=full -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffp-model=aggressive -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffp-model=fast -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffp-model=precise -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffp-model=strict -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcx-limited-range \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fno-cx-limited-range \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcx-fortran-rules \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=IMPRVD %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fno-cx-fortran-rules \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=basic \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=promoted \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=PRMTD %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=improved \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=IMPRVD %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=full \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=aggressive \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=fast \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=PRMTD %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=precise \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=strict \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
 
 // WARN1: warning: overriding '-fcx-limited-range' option with '-fcx-fortran-rules' [-Woverriding-option]
 // WARN2: warning: overriding '-fno-cx-limited-range' option with '-fcx-fortran-rules' [-Woverriding-option]
@@ -196,5 +265,20 @@
 // WARN14: overriding '-complex-range=promoted' option with '-fcx-limited-range' [-Woverriding-option]
 // WARN17: warning: overriding '-fcomplex-arithmetic=full' option with '-fcomplex-arithmetic=basic' [-Woverriding-option]
 // WARN20: warning: overriding '-fcx-fortran-rules' option with '-fcx-limited-range' [-Woverriding-option]
+// WARN21: warning: overriding '-fcx-limited-range' option with '-fno-fast-math' [-Woverriding-option]
+// WARN22: warning: overriding '-fcx-fortran-rules' option with '-fno-fast-math' [-Woverriding-option]
+// WARN23: warning: overriding '-fcomplex-arithmetic=basic' option with '-fno-fast-math' [-Woverriding-option]
+// WARN24: warning: overriding '-fcomplex-arithmetic=promoted' option with '-fno-fast-math' [-Woverriding-option]
+// WARN25: warning: overriding '-fcomplex-arithmetic=improved' option with '-fno-fast-math' [-Woverriding-option]
+
+// BASIC: -complex-range=basic
+// FULL: -complex-range=full
+// PRMTD: -complex-range=promoted
+// BASIC-NOT: -complex-range=improved
+// CHECK-NOT: -complex-range=basic
+// IMPRVD: -complex-range=improved
+// IMPRVD-NOT: -complex-range=basic
+// CHECK-NOT: -complex-range=improved
+// RANGE-NOT: -complex-range=
 
 // ERR: error: unsupported argument 'foo' to option '-fcomplex-arithmetic='

From 52360d195b85608c677d781272534dfa61e9a1c3 Mon Sep 17 00:00:00 2001
From: Longsheng Mou <longshengmou@gmail.com>
Date: Thu, 12 Jun 2025 09:27:27 +0800
Subject: [PATCH 0076/1322] [NFC] Use `llvm::includes` instead of
 `std::includes` (#143542)

This PR follows up #143297.
---
 clang-tools-extra/clangd/refactor/Rename.cpp              | 2 +-
 llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 6 ++----
 llvm/tools/sancov/sancov.cpp                              | 3 +--
 llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp              | 4 ++--
 llvm/unittests/ADT/DeltaAlgorithmTest.cpp                 | 4 ++--
 llvm/utils/TableGen/AsmMatcherEmitter.cpp                 | 3 +--
 llvm/utils/TableGen/Common/CodeGenRegisters.cpp           | 7 ++-----
 7 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp
index d9b73b83e902..c56375b1a98d 100644
--- a/clang-tools-extra/clangd/refactor/Rename.cpp
+++ b/clang-tools-extra/clangd/refactor/Rename.cpp
@@ -1308,7 +1308,7 @@ getMappedRanges(ArrayRef<Range> Indexed, ArrayRef<SymbolRange> Lexed) {
     return std::nullopt;
   }
   // Fast check for the special subset case.
-  if (std::includes(Indexed.begin(), Indexed.end(), Lexed.begin(), Lexed.end()))
+  if (llvm::includes(Indexed, Lexed))
     return Lexed.vec();
 
   std::vector<size_t> Best;
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index d94a2fbb23d2..61fef1387d82 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1975,12 +1975,10 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2,
   auto V1Elems = ShadowElements.find(V1);
   auto V2Elems = ShadowElements.find(V2);
   if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) {
-    if (std::includes(V1Elems->second.begin(), V1Elems->second.end(),
-                      V2Elems->second.begin(), V2Elems->second.end())) {
+    if (llvm::includes(V1Elems->second, V2Elems->second)) {
       return collapseToPrimitiveShadow(V1, Pos);
     }
-    if (std::includes(V2Elems->second.begin(), V2Elems->second.end(),
-                      V1Elems->second.begin(), V1Elems->second.end())) {
+    if (llvm::includes(V2Elems->second, V1Elems->second)) {
       return collapseToPrimitiveShadow(V2, Pos);
     }
   } else if (V1Elems != ShadowElements.end()) {
diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp
index 2cc84b47de6b..aebb5effd0be 100644
--- a/llvm/tools/sancov/sancov.cpp
+++ b/llvm/tools/sancov/sancov.cpp
@@ -889,8 +889,7 @@ symbolize(const RawCoverage &Data, const std::string ObjectFile) {
   }
 
   std::set<uint64_t> AllAddrs = findCoveragePointAddrs(ObjectFile);
-  if (!std::includes(AllAddrs.begin(), AllAddrs.end(), Data.Addrs->begin(),
-                     Data.Addrs->end())) {
+  if (!llvm::includes(AllAddrs, *Data.Addrs)) {
     fail("Coverage points in binary and .sancov file do not match.");
   }
   Coverage->Points = getCoveragePoints(ObjectFile, AllAddrs, *Data.Addrs);
diff --git a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp
index 66a67d96d153..f54394789939 100644
--- a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp
+++ b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/DAGDeltaAlgorithm.h"
+#include "llvm/ADT/STLExtras.h"
 #include "gtest/gtest.h"
 #include <algorithm>
 #include <cstdarg>
@@ -23,8 +24,7 @@ class FixedDAGDeltaAlgorithm : public DAGDeltaAlgorithm {
 protected:
   bool ExecuteOneTest(const changeset_ty &Changes) override {
     ++NumTests;
-    return std::includes(Changes.begin(), Changes.end(),
-                         FailingSet.begin(), FailingSet.end());
+    return llvm::includes(Changes, FailingSet);
   }
 
 public:
diff --git a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp
index 5e284129180a..24e18f42eb33 100644
--- a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp
+++ b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/DeltaAlgorithm.h"
+#include "llvm/ADT/STLExtras.h"
 #include "gtest/gtest.h"
 #include <algorithm>
 #include <cstdarg>
@@ -38,8 +39,7 @@ class FixedDeltaAlgorithm final : public DeltaAlgorithm {
 protected:
   bool ExecuteOneTest(const changeset_ty &Changes) override {
     ++NumTests;
-    return std::includes(Changes.begin(), Changes.end(),
-                         FailingSet.begin(), FailingSet.end());
+    return llvm::includes(Changes, FailingSet);
   }
 
 public:
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 9792eb41ea5d..32098e96ce72 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -1330,8 +1330,7 @@ void AsmMatcherInfo::buildRegisterClasses(
   for (const RegisterSet &RS : RegisterSets) {
     ClassInfo *CI = RegisterSetClasses[RS];
     for (const RegisterSet &RS2 : RegisterSets)
-      if (RS != RS2 && std::includes(RS2.begin(), RS2.end(), RS.begin(),
-                                     RS.end(), LessRecordByID()))
+      if (RS != RS2 && llvm::includes(RS2, RS, LessRecordByID()))
         CI->SuperClasses.push_back(RegisterSetClasses[RS2]);
   }
 
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index 4d24eb3de1ed..f52c21e97f9c 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -933,9 +933,7 @@ bool CodeGenRegisterClass::Key::operator<(
 static bool testSubClass(const CodeGenRegisterClass *A,
                          const CodeGenRegisterClass *B) {
   return A->RSI.isSubClassOf(B->RSI) &&
-         std::includes(A->getMembers().begin(), A->getMembers().end(),
-                       B->getMembers().begin(), B->getMembers().end(),
-                       deref<std::less<>>());
+         llvm::includes(A->getMembers(), B->getMembers(), deref<std::less<>>());
 }
 
 /// Sorting predicate for register classes.  This provides a topological
@@ -1990,8 +1988,7 @@ findRegUnitSet(const std::vector<RegUnitSet> &UniqueSets,
 // Return true if the RUSubSet is a subset of RUSuperSet.
 static bool isRegUnitSubSet(const std::vector<unsigned> &RUSubSet,
                             const std::vector<unsigned> &RUSuperSet) {
-  return std::includes(RUSuperSet.begin(), RUSuperSet.end(), RUSubSet.begin(),
-                       RUSubSet.end());
+  return llvm::includes(RUSuperSet, RUSubSet);
 }
 
 /// Iteratively prune unit sets. Prune subsets that are close to the superset,

From 082251bba4effea7f60191c6cbddacb3705c07db Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Wed, 11 Jun 2025 21:49:01 -0400
Subject: [PATCH 0077/1322] [AArch64] fix trampoline implementation: use X15
 (#126743)

AAPCS64 reserves any of X9-X15 for a compiler to choose to use for this
purpose, and says not to use X16 or X18 like GCC (and the previous
implementation) chose to use. The X18 register may need to get used by
the kernel in some circumstances, as specified by the platform ABI, so
it is generally an unwise choice. Simply choosing a different register
fixes the problem of this being broken on any platform that actually
follows the platform ABI (which is all of them except EABI, if I am
reading this linux kernel bug correctly
https://lkml2.uits.iu.edu/hypermail/linux/kernel/2001.2/01502.html). As
a side benefit, also generate slightly better code and avoids needing
the compiler-rt to be present. I did that by following the XCore
implementation instead of PPC (although in hindsight, following the
RISCV might have been slightly more readable). That X18 is wrong to use
for this purpose has been known for many years (e.g.
https://www.mail-archive.com/gcc@gcc.gnu.org/msg76934.html) and also
known that fixing this to use one of the correct registers is not an ABI
break, since this only appears inside of a translation unit. Some of the
other temporary registers (e.g. X9) are already reserved inside llvm for
internal use as a generic temporary register in the prologue before
saving registers, while X15 was already used in rare cases as a scratch
register in the prologue as well, so I felt that seemed the most logical
choice to choose here.
---
 compiler-rt/lib/builtins/README.txt           |   5 -
 compiler-rt/lib/builtins/trampoline_setup.c   |  42 ---
 .../builtins/Unit/trampoline_setup_test.c     |   2 +-
 .../lib/Optimizer/CodeGen/BoxedProcedure.cpp  |   8 +-
 flang/test/Fir/boxproc.fir                    |   4 +-
 .../AArch64/AArch64CallingConvention.td       |  25 +-
 .../Target/AArch64/AArch64FrameLowering.cpp   |  85 ++++--
 .../Target/AArch64/AArch64ISelLowering.cpp    |  95 ++++---
 llvm/lib/TargetParser/Triple.cpp              |   2 -
 llvm/test/CodeGen/AArch64/nest-register.ll    |  16 +-
 .../AArch64/statepoint-call-lowering.ll       |   2 +-
 llvm/test/CodeGen/AArch64/trampoline.ll       | 257 +++++++++++++++++-
 llvm/test/CodeGen/AArch64/win64cc-x18.ll      |  27 +-
 .../CodeGen/AArch64/zero-call-used-regs.ll    |  16 +-
 14 files changed, 420 insertions(+), 166 deletions(-)

diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt
index 19f26c92a0f9..2d213d95f333 100644
--- a/compiler-rt/lib/builtins/README.txt
+++ b/compiler-rt/lib/builtins/README.txt
@@ -272,11 +272,6 @@ switch32
 switch8
 switchu8
 
-// This function generates a custom trampoline function with the specific
-// realFunc and localsPtr values.
-void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
-                        const void* realFunc, void* localsPtr);
-
 // There is no C interface to the *_vfp_d8_d15_regs functions.  There are
 // called in the prolog and epilog of Thumb1 functions.  When the C++ ABI use
 // SJLJ for exceptions, each function with a catch clause or destructors needs
diff --git a/compiler-rt/lib/builtins/trampoline_setup.c b/compiler-rt/lib/builtins/trampoline_setup.c
index 830e25e4c030..844eb2794414 100644
--- a/compiler-rt/lib/builtins/trampoline_setup.c
+++ b/compiler-rt/lib/builtins/trampoline_setup.c
@@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
   __clear_cache(trampOnStack, &trampOnStack[10]);
 }
 #endif // __powerpc__ && !defined(__powerpc64__)
-
-// The AArch64 compiler generates calls to __trampoline_setup() when creating
-// trampoline functions on the stack for use with nested functions.
-// This function creates a custom 36-byte trampoline function on the stack
-// which loads x18 with a pointer to the outer function's locals
-// and then jumps to the target nested function.
-// Note: x18 is a reserved platform register on Windows and macOS.
-
-#if defined(__aarch64__) && defined(__ELF__)
-COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
-                                        int trampSizeAllocated,
-                                        const void *realFunc, void *localsPtr) {
-  // This should never happen, but if compiler did not allocate
-  // enough space on stack for the trampoline, abort.
-  if (trampSizeAllocated < 36)
-    compilerrt_abort();
-
-  // create trampoline
-  // Load realFunc into x17. mov/movk 16 bits at a time.
-  trampOnStack[0] =
-      0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
-  trampOnStack[1] =
-      0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
-  trampOnStack[2] =
-      0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
-  trampOnStack[3] =
-      0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
-  // Load localsPtr into x18
-  trampOnStack[4] =
-      0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
-  trampOnStack[5] =
-      0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
-  trampOnStack[6] =
-      0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
-  trampOnStack[7] =
-      0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
-  trampOnStack[8] = 0xd61f0220; // br x17
-
-  // Clear instruction cache.
-  __clear_cache(trampOnStack, &trampOnStack[9]);
-}
-#endif // defined(__aarch64__) && !defined(__APPLE__) && !defined(_WIN64)
diff --git a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
index d51d35acaa02..da115fe76427 100644
--- a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
+++ b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
@@ -7,7 +7,7 @@
 
 /*
  * Tests nested functions
- * The ppc and aarch64 compilers generates a call to __trampoline_setup
+ * The ppc compiler generates a call to __trampoline_setup
  * The i386 and x86_64 compilers generate a call to ___enable_execute_stack
  */
 
diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 82b11ad7db32..69bdb48146a5 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -274,12 +274,12 @@ public:
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);
-            // For AArch64, PPC32 and PPC64, the thunk is populated by a call to
+            // For PPC32 and PPC64, the thunk is populated by a call to
             // __trampoline_setup, which is defined in
             // compiler-rt/lib/builtins/trampoline_setup.c and requires the
-            // thunk size greater than 32 bytes.  For RISCV and x86_64, the
-            // thunk setup doesn't go through __trampoline_setup and fits in 32
-            // bytes.
+            // thunk size greater than 32 bytes.  For AArch64, RISCV and x86_64,
+            // the thunk setup doesn't go through __trampoline_setup and fits in
+            // 32 bytes.
             fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
             mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir
index 5d82522055ad..97d9b38ed6f4 100644
--- a/flang/test/Fir/boxproc.fir
+++ b/flang/test/Fir/boxproc.fir
@@ -3,7 +3,7 @@
 // RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy()
-// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
 // CHECK-X86:     %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
 // CHECK-PPC:     %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
@@ -63,7 +63,7 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
 }
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy_char()
-// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
 // CHECK-X86:     %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
 // CHECK-PPC:     %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 920cc6727314..1b5a713bffdc 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -28,6 +28,12 @@ class CCIfSubtarget<string F, CCAction A>
 //===----------------------------------------------------------------------===//
 
 defvar AArch64_Common = [
+  // The 'nest' parameter, if any, is passed in X15.
+  // The previous register used here (X18) is also defined to be unavailable
+  // for this purpose, while all of X9-X15 were defined to be free for LLVM to
+  // use for this, so use X15 (which LLVM often already clobbers anyways).
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
@@ -117,13 +123,7 @@ defvar AArch64_Common = [
 ];
 
 let Entry = 1 in
-def CC_AArch64_AAPCS : CallingConv<!listconcat(
-  // The 'nest' parameter, if any, is passed in X18.
-  // Darwin and Windows use X18 as the platform register and hence 'nest' isn't
-  // currently supported there.
-  [CCIfNest<CCAssignToReg<[X18]>>],
-  AArch64_Common
-)>;
+def CC_AArch64_AAPCS : CallingConv<AArch64_Common>;
 
 let Entry = 1 in
 def RetCC_AArch64_AAPCS : CallingConv<[
@@ -177,6 +177,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
 // a stack layout compatible with the x64 calling convention.
 let Entry = 1 in
 def CC_AArch64_Arm64EC_VarArg : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   // Convert small floating-point values to integer.
   CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -353,6 +355,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[
 //     + Stack slots are sized as needed rather than being at least 64-bit.
 let Entry = 1 in
 def CC_AArch64_DarwinPCS : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -427,6 +431,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
 
 let Entry = 1 in
 def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -450,6 +456,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
 // same as the normal Darwin VarArgs handling.
 let Entry = 1 in
 def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
 
@@ -494,6 +502,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
 
 let Entry = 1 in
 def CC_AArch64_GHC : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
 
   // Handle all vector types as either f64 or v2f64.
@@ -522,6 +532,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
 
   // We can pass arguments in all general registers, except:
   // - X8, used for sret
+  // - X15 (on Windows), used as a temporary register in the prologue when allocating call frames
   // - X16/X17, used by the linker as IP0/IP1
   // - X18, the platform register
   // - X19, the base pointer
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3335ee04bb0e..2650c621e19f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -331,7 +331,9 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF,
 static bool produceCompactUnwindFrame(MachineFunction &MF);
 static bool needsWinCFI(const MachineFunction &MF);
 static StackOffset getSVEStackSize(const MachineFunction &MF);
-static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB);
+static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+                                                 bool HasCall = false);
+static bool requiresSaveVG(const MachineFunction &MF);
 
 /// Returns true if a homogeneous prolog or epilog code can be emitted
 /// for the size optimization. If possible, a frame helper call is injected.
@@ -1006,6 +1008,16 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
   }
 }
 
+static bool windowsRequiresStackProbe(const MachineFunction &MF,
+                                      uint64_t StackSizeInBytes) {
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
+  // TODO: When implementing stack protectors, take that into account
+  // for the probe threshold.
+  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
+         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
+}
+
 static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
                                    const MachineBasicBlock &MBB) {
   const MachineFunction *MF = MBB.getParent();
@@ -1027,7 +1039,8 @@ static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
 // but we would then have to make sure that we were in fact saving at least one
 // callee-save register in the prologue, which is additional complexity that
 // doesn't seem worth the benefit.
-static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+                                                 bool HasCall) {
   MachineFunction *MF = MBB->getParent();
 
   // If MBB is an entry block, use X9 as the scratch register
@@ -1041,6 +1054,11 @@ static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
   const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
   LivePhysRegs LiveRegs(TRI);
   getLiveRegsForEntryMBB(LiveRegs, *MBB);
+  if (HasCall) {
+    LiveRegs.addReg(AArch64::X16);
+    LiveRegs.addReg(AArch64::X17);
+    LiveRegs.addReg(AArch64::X18);
+  }
 
   // Prefer X9 since it was historically used for the prologue scratch reg.
   const MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -1081,23 +1099,18 @@ bool AArch64FrameLowering::canUseAsPrologue(
       MBB.isLiveIn(AArch64::NZCV))
     return false;
 
-  // Don't need a scratch register if we're not going to re-align the stack or
-  // emit stack probes.
-  if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF))
-    return true;
-  // Otherwise, we can use any block as long as it has a scratch register
-  // available.
-  return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
-}
+  if (RegInfo->hasStackRealignment(*MF) || TLI->hasInlineStackProbe(*MF))
+    if (findScratchNonCalleeSaveRegister(TmpMBB) == AArch64::NoRegister)
+      return false;
 
-static bool windowsRequiresStackProbe(MachineFunction &MF,
-                                      uint64_t StackSizeInBytes) {
-  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
-  // TODO: When implementing stack protectors, take that into account
-  // for the probe threshold.
-  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
-         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
+  // May need a scratch register (for return value) if require making a special
+  // call
+  if (requiresSaveVG(*MF) ||
+      windowsRequiresStackProbe(*MF, std::numeric_limits<uint64_t>::max()))
+    if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister)
+      return false;
+
+  return true;
 }
 
 static bool needsWinCFI(const MachineFunction &MF) {
@@ -1378,8 +1391,8 @@ bool requiresGetVGCall(MachineFunction &MF) {
          !MF.getSubtarget<AArch64Subtarget>().hasSVE();
 }
 
-static bool requiresSaveVG(MachineFunction &MF) {
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+static bool requiresSaveVG(const MachineFunction &MF) {
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   // For Darwin platforms we don't save VG for non-SVE functions, even if SME
   // is enabled with streaming mode changes.
   if (!AFI->hasStreamingModeChanges())
@@ -2049,6 +2062,29 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     if (AFI->getSVECalleeSavedStackSize())
       report_fatal_error(
           "SVE callee saves not yet supported with stack probing");
+
+    // Find an available register to spill the value of X15 to, if X15 is being
+    // used already for nest.
+    unsigned X15Scratch = AArch64::NoRegister;
+    const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+    if (llvm::any_of(MBB.liveins(),
+                     [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
+                       return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
+                           AArch64::X15, LiveIn.PhysReg);
+                     })) {
+      X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true);
+      assert(X15Scratch != AArch64::NoRegister &&
+             (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
+#ifndef NDEBUG
+      LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
+#endif
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::X15, RegState::Undef)
+          .addReg(AArch64::X15, RegState::Implicit)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
     uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
     if (NeedsWinCFI) {
       HasWinCFI = true;
@@ -2171,6 +2207,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       // we've set a frame pointer and already finished the SEH prologue.
       assert(!NeedsWinCFI);
     }
+    if (X15Scratch != AArch64::NoRegister) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
+          .addReg(AArch64::XZR)
+          .addReg(X15Scratch, RegState::Undef)
+          .addReg(X15Scratch, RegState::Implicit)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
   }
 
   StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
@@ -3355,7 +3398,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     unsigned X0Scratch = AArch64::NoRegister;
     if (Reg1 == AArch64::VG) {
       // Find an available register to store value of VG to.
-      Reg1 = findScratchNonCalleeSaveRegister(&MBB);
+      Reg1 = findScratchNonCalleeSaveRegister(&MBB, true);
       assert(Reg1 != AArch64::NoRegister);
       SMEAttrs Attrs = AFI->getSMEFnAttrs();
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 766599d567ef..ad5b90984188 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7126,59 +7126,80 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
 
 SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                       SelectionDAG &DAG) const {
-  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
-  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
-    report_fatal_error(
-        "ADJUST_TRAMPOLINE operation is only supported on Linux.");
-
   return Op.getOperand(0);
 }
 
 SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                     SelectionDAG &DAG) const {
-
-  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
-  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
-    report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
-
   SDValue Chain = Op.getOperand(0);
-  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
   SDValue FPtr = Op.getOperand(2); // nested function
   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  // ldr NestReg, .+16
+  // ldr x17, .+20
+  // br x17
+  // .word 0
+  // .nest: .qword nest
+  // .fptr: .qword fptr
+  SDValue OutChains[5];
+
+  const Function *Func =
+      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+  CallingConv::ID CC = Func->getCallingConv();
+  unsigned NestReg;
+
+  switch (CC) {
+  default:
+    NestReg = 0x0f; // X15
+  case CallingConv::ARM64EC_Thunk_Native:
+  case CallingConv::ARM64EC_Thunk_X64:
+    // Must be kept in sync with AArch64CallingConv.td
+    NestReg = 0x04; // X4
+    break;
+  }
+
+  const char FptrReg = 0x11; // X17
+
+  SDValue Addr = Trmp;
+
   SDLoc dl(Op);
+  OutChains[0] = DAG.getStore(
+      Chain, dl, DAG.getConstant(0x58000080u | NestReg, dl, MVT::i32), Addr,
+      MachinePointerInfo(TrmpAddr));
 
-  EVT PtrVT = getPointerTy(DAG.getDataLayout());
-  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(4, dl, MVT::i64));
+  OutChains[1] = DAG.getStore(
+      Chain, dl, DAG.getConstant(0x580000b0u | FptrReg, dl, MVT::i32), Addr,
+      MachinePointerInfo(TrmpAddr, 4));
 
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(8, dl, MVT::i64));
+  OutChains[2] =
+      DAG.getStore(Chain, dl, DAG.getConstant(0xd61f0220u, dl, MVT::i32), Addr,
+                   MachinePointerInfo(TrmpAddr, 8));
 
-  Entry.Ty = IntPtrTy;
-  Entry.Node = Trmp;
-  Args.push_back(Entry);
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(16, dl, MVT::i64));
+  OutChains[3] =
+      DAG.getStore(Chain, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
 
-  if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
-    MachineFunction &MF = DAG.getMachineFunction();
-    MachineFrameInfo &MFI = MF.getFrameInfo();
-    Entry.Node =
-        DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
-  } else
-    Entry.Node = DAG.getConstant(36, dl, MVT::i64);
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(24, dl, MVT::i64));
+  OutChains[4] =
+      DAG.getStore(Chain, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
 
-  Args.push_back(Entry);
-  Entry.Node = FPtr;
-  Args.push_back(Entry);
-  Entry.Node = Nest;
-  Args.push_back(Entry);
+  SDValue StoreToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
 
-  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
-  TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
-      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
+  SDValue EndOfTrmp = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                                  DAG.getConstant(12, dl, MVT::i64));
 
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
-  return CallResult.second;
+  // Call clear cache on the trampoline instructions.
+  return DAG.getNode(ISD::CLEAR_CACHE, dl, MVT::Other, StoreToken, Trmp,
+                     EndOfTrmp);
 }
 
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index bd291e191821..5718ae385bac 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1754,8 +1754,6 @@ unsigned Triple::getTrampolineSize() const {
     if (isOSLinux())
       return 48;
     break;
-  case Triple::aarch64:
-    return 36;
   }
   return 32;
 }
diff --git a/llvm/test/CodeGen/AArch64/nest-register.ll b/llvm/test/CodeGen/AArch64/nest-register.ll
index 1e1c1b044bab..2e94dfba1fa5 100644
--- a/llvm/test/CodeGen/AArch64/nest-register.ll
+++ b/llvm/test/CodeGen/AArch64/nest-register.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -disable-post-ra -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 ; Tests that the 'nest' parameter attribute causes the relevant parameter to be
@@ -5,18 +6,21 @@
 
 define ptr @nest_receiver(ptr nest %arg) nounwind {
 ; CHECK-LABEL: nest_receiver:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov x0, x18
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, x15
+; CHECK-NEXT:    ret
 
   ret ptr %arg
 }
 
 define ptr @nest_caller(ptr %arg) nounwind {
 ; CHECK-LABEL: nest_caller:
-; CHECK: mov x18, x0
-; CHECK-NEXT: bl nest_receiver
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x15, x0
+; CHECK-NEXT:    bl nest_receiver
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
 
   %result = call ptr @nest_receiver(ptr nest %arg)
   ret ptr %result
diff --git a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
index 9619895c450c..32c3eaeb9c87 100644
--- a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
@@ -207,7 +207,7 @@ define void @test_attributes(ptr byval(%struct2) %s) gc "statepoint-example" {
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    ldr x8, [sp, #64]
 ; CHECK-NEXT:    ldr q0, [sp, #48]
-; CHECK-NEXT:    mov x18, xzr
+; CHECK-NEXT:    mov x15, xzr
 ; CHECK-NEXT:    mov w0, #42 // =0x2a
 ; CHECK-NEXT:    mov w1, #17 // =0x11
 ; CHECK-NEXT:    str x8, [sp, #16]
diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
index 30ac2aa283b3..d9016b02a0f8 100644
--- a/llvm/test/CodeGen/AArch64/trampoline.ll
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -1,32 +1,265 @@
-; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK-LINUX
+; RUN: llc -mtriple=aarch64-none-eabi < %s | FileCheck %s --check-prefixes=CHECK-LINUX
+; RUN: llc -mtriple=aarch64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK-PC
+; RUN: llc -mtriple=aarch64-apple-darwin < %s | FileCheck %s --check-prefixes=CHECK-APPLE
 
 @trampg = internal global [36 x i8] zeroinitializer, align 8
 
 declare void @llvm.init.trampoline(ptr, ptr, ptr);
 declare ptr @llvm.adjust.trampoline(ptr);
 
-define i64 @f(ptr nest %c, i64 %x, i64 %y) {
-  %sum = add i64 %x, %y
-  ret i64 %sum
+define ptr @f(ptr nest %x, i64 %y) {
+; CHECK-LINUX-LABEL: f:
+; CHECK-LINUX:       // %bb.0:
+; CHECK-LINUX-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-LINUX-NEXT:    sub sp, sp, #237, lsl #12 // =970752
+; CHECK-LINUX-NEXT:    sub sp, sp, #3264
+; CHECK-LINUX-NEXT:    .cfi_def_cfa_offset 974032
+; CHECK-LINUX-NEXT:    .cfi_offset w29, -16
+; CHECK-LINUX-NEXT:    add x0, x15, x0
+; CHECK-LINUX-NEXT:    add sp, sp, #237, lsl #12 // =970752
+; CHECK-LINUX-NEXT:    add sp, sp, #3264
+; CHECK-LINUX-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-LINUX-NEXT:    ret
+;
+; CHECK-PC-LABEL: f:
+; CHECK-PC:       .seh_proc f
+; CHECK-PC-NEXT:  // %bb.0:
+; CHECK-PC-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-PC-NEXT:    .seh_save_fplr_x 16
+; CHECK-PC-NEXT:    mov x9, x15
+; CHECK-PC-NEXT:    mov x15, #60876 // =0xedcc
+; CHECK-PC-NEXT:    .seh_nop
+; CHECK-PC-NEXT:    bl __chkstk
+; CHECK-PC-NEXT:    .seh_nop
+; CHECK-PC-NEXT:    sub sp, sp, x15, lsl #4
+; CHECK-PC-NEXT:    .seh_stackalloc 974016
+; CHECK-PC-NEXT:    mov x15, x9
+; CHECK-PC-NEXT:    .seh_endprologue
+; CHECK-PC-NEXT:    add x0, x15, x0
+; CHECK-PC-NEXT:    .seh_startepilogue
+; CHECK-PC-NEXT:    add sp, sp, #237, lsl #12 // =970752
+; CHECK-PC-NEXT:    .seh_stackalloc 970752
+; CHECK-PC-NEXT:    add sp, sp, #3264
+; CHECK-PC-NEXT:    .seh_stackalloc 3264
+; CHECK-PC-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-PC-NEXT:    .seh_save_fplr_x 16
+; CHECK-PC-NEXT:    .seh_endepilogue
+; CHECK-PC-NEXT:    ret
+; CHECK-PC-NEXT:    .seh_endfunclet
+; CHECK-PC-NEXT:    .seh_endproc
+;
+; CHECK-APPLE-LABEL: f:
+; CHECK-APPLE:       ; %bb.0:
+; CHECK-APPLE-NEXT:    stp x28, x27, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-APPLE-NEXT:    sub sp, sp, #237, lsl #12 ; =970752
+; CHECK-APPLE-NEXT:    sub sp, sp, #3264
+; CHECK-APPLE-NEXT:    .cfi_def_cfa_offset 974032
+; CHECK-APPLE-NEXT:    .cfi_offset w27, -8
+; CHECK-APPLE-NEXT:    .cfi_offset w28, -16
+; CHECK-APPLE-NEXT:    add x0, x15, x0
+; CHECK-APPLE-NEXT:    add sp, sp, #237, lsl #12 ; =970752
+; CHECK-APPLE-NEXT:    add sp, sp, #3264
+; CHECK-APPLE-NEXT:    ldp x28, x27, [sp], #16 ; 16-byte Folded Reload
+; CHECK-APPLE-NEXT:    ret
+  %chkstack = alloca [u0xedcba x i8]
+  %sum = getelementptr i8, ptr %x, i64 %y
+  ret ptr %sum
 }
 
 define i64 @func1() {
+; CHECK-LINUX-LABEL: func1:
+; CHECK-LINUX:       // %bb.0:
+; CHECK-LINUX-NEXT:    sub sp, sp, #64
+; CHECK-LINUX-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-LINUX-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-LINUX-NEXT:    .cfi_offset w30, -16
+; CHECK-LINUX-NEXT:    adrp x8, :got:f
+; CHECK-LINUX-NEXT:    mov w9, #544 // =0x220
+; CHECK-LINUX-NEXT:    add x0, sp, #8
+; CHECK-LINUX-NEXT:    ldr x8, [x8, :got_lo12:f]
+; CHECK-LINUX-NEXT:    movk w9, #54815, lsl #16
+; CHECK-LINUX-NEXT:    str w9, [sp, #16]
+; CHECK-LINUX-NEXT:    add x9, sp, #56
+; CHECK-LINUX-NEXT:    stp x9, x8, [sp, #24]
+; CHECK-LINUX-NEXT:    mov x8, #132 // =0x84
+; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #16
+; CHECK-LINUX-NEXT:    movk x8, #177, lsl #32
+; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #48
+; CHECK-LINUX-NEXT:    str x8, [sp, #8]
+; CHECK-LINUX-NEXT:    add x8, sp, #8
+; CHECK-LINUX-NEXT:    add x1, x8, #12
+; CHECK-LINUX-NEXT:    bl __clear_cache
+; CHECK-LINUX-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-LINUX-NEXT:    mov x0, xzr
+; CHECK-LINUX-NEXT:    add sp, sp, #64
+; CHECK-LINUX-NEXT:    ret
+;
+; CHECK-PC-LABEL: func1:
+; CHECK-PC:       .seh_proc func1
+; CHECK-PC-NEXT:  // %bb.0:
+; CHECK-PC-NEXT:    sub sp, sp, #64
+; CHECK-PC-NEXT:    .seh_stackalloc 64
+; CHECK-PC-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-PC-NEXT:    .seh_save_reg x30, 48
+; CHECK-PC-NEXT:    .seh_endprologue
+; CHECK-PC-NEXT:    adrp x8, f
+; CHECK-PC-NEXT:    add x8, x8, :lo12:f
+; CHECK-PC-NEXT:    add x9, sp, #56
+; CHECK-PC-NEXT:    stp x9, x8, [sp, #24]
+; CHECK-PC-NEXT:    mov w8, #544 // =0x220
+; CHECK-PC-NEXT:    add x0, sp, #8
+; CHECK-PC-NEXT:    movk w8, #54815, lsl #16
+; CHECK-PC-NEXT:    str w8, [sp, #16]
+; CHECK-PC-NEXT:    mov x8, #132 // =0x84
+; CHECK-PC-NEXT:    movk x8, #22528, lsl #16
+; CHECK-PC-NEXT:    movk x8, #177, lsl #32
+; CHECK-PC-NEXT:    movk x8, #22528, lsl #48
+; CHECK-PC-NEXT:    str x8, [sp, #8]
+; CHECK-PC-NEXT:    add x8, sp, #8
+; CHECK-PC-NEXT:    add x1, x8, #12
+; CHECK-PC-NEXT:    bl __clear_cache
+; CHECK-PC-NEXT:    mov x0, xzr
+; CHECK-PC-NEXT:    .seh_startepilogue
+; CHECK-PC-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-PC-NEXT:    .seh_save_reg x30, 48
+; CHECK-PC-NEXT:    add sp, sp, #64
+; CHECK-PC-NEXT:    .seh_stackalloc 64
+; CHECK-PC-NEXT:    .seh_endepilogue
+; CHECK-PC-NEXT:    ret
+; CHECK-PC-NEXT:    .seh_endfunclet
+; CHECK-PC-NEXT:    .seh_endproc
+;
+; CHECK-APPLE-LABEL: func1:
+; CHECK-APPLE:       ; %bb.0:
+; CHECK-APPLE-NEXT:    sub sp, sp, #64
+; CHECK-APPLE-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-APPLE-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-APPLE-NEXT:    .cfi_offset w30, -8
+; CHECK-APPLE-NEXT:    .cfi_offset w29, -16
+; CHECK-APPLE-NEXT:  Lloh0:
+; CHECK-APPLE-NEXT:    adrp x8, _f@PAGE
+; CHECK-APPLE-NEXT:  Lloh1:
+; CHECK-APPLE-NEXT:    add x8, x8, _f@PAGEOFF
+; CHECK-APPLE-NEXT:    add x9, sp, #40
+; CHECK-APPLE-NEXT:    stp x9, x8, [sp, #16]
+; CHECK-APPLE-NEXT:    mov w8, #544 ; =0x220
+; CHECK-APPLE-NEXT:    mov x0, sp
+; CHECK-APPLE-NEXT:    movk w8, #54815, lsl #16
+; CHECK-APPLE-NEXT:    str w8, [sp, #8]
+; CHECK-APPLE-NEXT:    mov x8, #132 ; =0x84
+; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #16
+; CHECK-APPLE-NEXT:    movk x8, #177, lsl #32
+; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #48
+; CHECK-APPLE-NEXT:    str x8, [sp]
+; CHECK-APPLE-NEXT:    mov x8, sp
+; CHECK-APPLE-NEXT:    add x1, x8, #12
+; CHECK-APPLE-NEXT:    bl ___clear_cache
+; CHECK-APPLE-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
+; CHECK-APPLE-NEXT:    mov x0, xzr
+; CHECK-APPLE-NEXT:    add sp, sp, #64
+; CHECK-APPLE-NEXT:    ret
+; CHECK-APPLE-NEXT:    .loh AdrpAdd Lloh0, Lloh1
   %val = alloca i64
-  %nval = bitcast ptr %val to ptr
   %tramp = alloca [36 x i8], align 8
-  ; CHECK:	mov	w1, #36
-  ; CHECK:	bl	__trampoline_setup
-  call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %nval)
+  call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %val)
   %fp = call ptr @llvm.adjust.trampoline(ptr %tramp)
   ret i64 0
 }
 
 define i64 @func2() {
+; CHECK-LINUX-LABEL: func2:
+; CHECK-LINUX:       // %bb.0:
+; CHECK-LINUX-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-LINUX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-LINUX-NEXT:    .cfi_offset w30, -16
+; CHECK-LINUX-NEXT:    adrp x8, :got:f
+; CHECK-LINUX-NEXT:    mov w9, #544 // =0x220
+; CHECK-LINUX-NEXT:    adrp x0, trampg
+; CHECK-LINUX-NEXT:    add x0, x0, :lo12:trampg
+; CHECK-LINUX-NEXT:    ldr x8, [x8, :got_lo12:f]
+; CHECK-LINUX-NEXT:    movk w9, #54815, lsl #16
+; CHECK-LINUX-NEXT:    str w9, [x0, #8]
+; CHECK-LINUX-NEXT:    add x9, sp, #8
+; CHECK-LINUX-NEXT:    add x1, x0, #12
+; CHECK-LINUX-NEXT:    stp x9, x8, [x0, #16]
+; CHECK-LINUX-NEXT:    mov x8, #132 // =0x84
+; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #16
+; CHECK-LINUX-NEXT:    movk x8, #177, lsl #32
+; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #48
+; CHECK-LINUX-NEXT:    str x8, [x0]
+; CHECK-LINUX-NEXT:    bl __clear_cache
+; CHECK-LINUX-NEXT:    mov x0, xzr
+; CHECK-LINUX-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-LINUX-NEXT:    ret
+;
+; CHECK-PC-LABEL: func2:
+; CHECK-PC:       .seh_proc func2
+; CHECK-PC-NEXT:  // %bb.0:
+; CHECK-PC-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-PC-NEXT:    .seh_save_reg_x x30, 16
+; CHECK-PC-NEXT:    .seh_endprologue
+; CHECK-PC-NEXT:    adrp x0, trampg
+; CHECK-PC-NEXT:    add x0, x0, :lo12:trampg
+; CHECK-PC-NEXT:    adrp x8, f
+; CHECK-PC-NEXT:    add x8, x8, :lo12:f
+; CHECK-PC-NEXT:    add x9, sp, #8
+; CHECK-PC-NEXT:    add x1, x0, #12
+; CHECK-PC-NEXT:    stp x9, x8, [x0, #16]
+; CHECK-PC-NEXT:    mov w8, #544 // =0x220
+; CHECK-PC-NEXT:    movk w8, #54815, lsl #16
+; CHECK-PC-NEXT:    str w8, [x0, #8]
+; CHECK-PC-NEXT:    mov x8, #132 // =0x84
+; CHECK-PC-NEXT:    movk x8, #22528, lsl #16
+; CHECK-PC-NEXT:    movk x8, #177, lsl #32
+; CHECK-PC-NEXT:    movk x8, #22528, lsl #48
+; CHECK-PC-NEXT:    str x8, [x0]
+; CHECK-PC-NEXT:    bl __clear_cache
+; CHECK-PC-NEXT:    mov x0, xzr
+; CHECK-PC-NEXT:    .seh_startepilogue
+; CHECK-PC-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-PC-NEXT:    .seh_save_reg_x x30, 16
+; CHECK-PC-NEXT:    .seh_endepilogue
+; CHECK-PC-NEXT:    ret
+; CHECK-PC-NEXT:    .seh_endfunclet
+; CHECK-PC-NEXT:    .seh_endproc
+;
+; CHECK-APPLE-LABEL: func2:
+; CHECK-APPLE:       ; %bb.0:
+; CHECK-APPLE-NEXT:    sub sp, sp, #32
+; CHECK-APPLE-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-APPLE-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-APPLE-NEXT:    .cfi_offset w30, -8
+; CHECK-APPLE-NEXT:    .cfi_offset w29, -16
+; CHECK-APPLE-NEXT:  Lloh2:
+; CHECK-APPLE-NEXT:    adrp x0, _trampg@PAGE
+; CHECK-APPLE-NEXT:  Lloh3:
+; CHECK-APPLE-NEXT:    add x0, x0, _trampg@PAGEOFF
+; CHECK-APPLE-NEXT:  Lloh4:
+; CHECK-APPLE-NEXT:    adrp x8, _f@PAGE
+; CHECK-APPLE-NEXT:  Lloh5:
+; CHECK-APPLE-NEXT:    add x8, x8, _f@PAGEOFF
+; CHECK-APPLE-NEXT:    add x9, sp, #8
+; CHECK-APPLE-NEXT:    add x1, x0, #12
+; CHECK-APPLE-NEXT:    stp x9, x8, [x0, #16]
+; CHECK-APPLE-NEXT:    mov w8, #544 ; =0x220
+; CHECK-APPLE-NEXT:    movk w8, #54815, lsl #16
+; CHECK-APPLE-NEXT:    str w8, [x0, #8]
+; CHECK-APPLE-NEXT:    mov x8, #132 ; =0x84
+; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #16
+; CHECK-APPLE-NEXT:    movk x8, #177, lsl #32
+; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #48
+; CHECK-APPLE-NEXT:    str x8, [x0]
+; CHECK-APPLE-NEXT:    bl ___clear_cache
+; CHECK-APPLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-APPLE-NEXT:    mov x0, xzr
+; CHECK-APPLE-NEXT:    add sp, sp, #32
+; CHECK-APPLE-NEXT:    ret
+; CHECK-APPLE-NEXT:    .loh AdrpAdd Lloh4, Lloh5
+; CHECK-APPLE-NEXT:    .loh AdrpAdd Lloh2, Lloh3
   %val = alloca i64
-  %nval = bitcast ptr %val to ptr
-  ; CHECK:	mov	w1, #36
-  ; CHECK:	bl	__trampoline_setup
-  call void @llvm.init.trampoline(ptr @trampg, ptr @f, ptr %nval)
+  call void @llvm.init.trampoline(ptr @trampg, ptr @f, ptr %val)
   %fp = call ptr @llvm.adjust.trampoline(ptr @trampg)
   ret i64 0
 }
diff --git a/llvm/test/CodeGen/AArch64/win64cc-x18.ll b/llvm/test/CodeGen/AArch64/win64cc-x18.ll
index b3e78cc9bbb8..4b45c300e9c1 100644
--- a/llvm/test/CodeGen/AArch64/win64cc-x18.ll
+++ b/llvm/test/CodeGen/AArch64/win64cc-x18.ll
@@ -1,35 +1,26 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;; Testing that nest uses x15 on all calling conventions (except Arm64EC)
 
-;; Testing that x18 is not clobbered when passing pointers with the nest
-;; attribute on windows
-
-; RUN: llc < %s -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=CHECK,CHECK-NO-X18
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-X18
+; RUN: llc < %s -mtriple=aarch64-pc-windows-msvc | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-apple-darwin- | FileCheck %s
 
 define dso_local i64 @other(ptr nest %p) #0 {
 ; CHECK-LABEL: other:
-; CHECK-X18: ldr x0, [x18]
-; CHECK-NO-X18: ldr x0, [x0]
+; CHECK:    ldr x0, [x15]
+; CHECK:    ret
   %r = load i64, ptr %p
-; CHECK: ret
   ret i64 %r
 }
 
 define dso_local void @func() #0 {
 ; CHECK-LABEL: func:
-
-
+; CHECK:    add x15, sp, #8
+; CHECK:    bl {{_?other}}
+; CHECK:    ret
 entry:
   %p = alloca i64
-; CHECK: mov w8, #1
-; CHECK: stp x30, x8, [sp, #-16]
-; CHECK-X18: add x18, sp, #8
   store i64 1, ptr %p
-; CHECK-NO-X18: add x0, sp, #8
-; CHECK: bl other
   call void @other(ptr nest %p)
-; CHECK: ldr x30, [sp], #16
-; CHECK: ret
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
index 4799ea3bcd19..986666e015e9 100644
--- a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
+++ b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
@@ -93,7 +93,7 @@ define dso_local i32 @all_gpr_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c
 ; CHECK-NEXT:    mov x5, #0 // =0x0
 ; CHECK-NEXT:    mov x6, #0 // =0x0
 ; CHECK-NEXT:    mov x7, #0 // =0x0
-; CHECK-NEXT:    mov x18, #0 // =0x0
+; CHECK-NEXT:    mov x15, #0 // =0x0
 ; CHECK-NEXT:    orr w0, w8, w2
 ; CHECK-NEXT:    mov x2, #0 // =0x0
 ; CHECK-NEXT:    mov x8, #0 // =0x0
@@ -146,7 +146,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo
 ; DEFAULT-NEXT:    mov x5, #0 // =0x0
 ; DEFAULT-NEXT:    mov x6, #0 // =0x0
 ; DEFAULT-NEXT:    mov x7, #0 // =0x0
-; DEFAULT-NEXT:    mov x18, #0 // =0x0
+; DEFAULT-NEXT:    mov x15, #0 // =0x0
 ; DEFAULT-NEXT:    movi v0.2d, #0000000000000000
 ; DEFAULT-NEXT:    orr w0, w8, w2
 ; DEFAULT-NEXT:    mov x2, #0 // =0x0
@@ -169,7 +169,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo
 ; SVE-OR-SME-NEXT:    mov x5, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov x6, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov x7, #0 // =0x0
-; SVE-OR-SME-NEXT:    mov x18, #0 // =0x0
+; SVE-OR-SME-NEXT:    mov x15, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov z0.d, #0 // =0x0
 ; SVE-OR-SME-NEXT:    orr w0, w8, w2
 ; SVE-OR-SME-NEXT:    mov x2, #0 // =0x0
@@ -196,7 +196,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo
 ; STREAMING-COMPAT-NEXT:    mov x5, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    mov x6, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    mov x7, #0 // =0x0
-; STREAMING-COMPAT-NEXT:    mov x18, #0 // =0x0
+; STREAMING-COMPAT-NEXT:    mov x15, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    fmov d0, xzr
 ; STREAMING-COMPAT-NEXT:    orr w0, w8, w2
 ; STREAMING-COMPAT-NEXT:    mov x2, #0 // =0x0
@@ -492,7 +492,7 @@ define dso_local double @all_gpr_arg_float(double noundef %a, float noundef %b)
 ; CHECK-NEXT:    mov x6, #0 // =0x0
 ; CHECK-NEXT:    mov x7, #0 // =0x0
 ; CHECK-NEXT:    mov x8, #0 // =0x0
-; CHECK-NEXT:    mov x18, #0 // =0x0
+; CHECK-NEXT:    mov x15, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -547,7 +547,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca
 ; DEFAULT-NEXT:    mov x6, #0 // =0x0
 ; DEFAULT-NEXT:    mov x7, #0 // =0x0
 ; DEFAULT-NEXT:    mov x8, #0 // =0x0
-; DEFAULT-NEXT:    mov x18, #0 // =0x0
+; DEFAULT-NEXT:    mov x15, #0 // =0x0
 ; DEFAULT-NEXT:    movi v1.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v2.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v3.2d, #0000000000000000
@@ -570,7 +570,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca
 ; SVE-OR-SME-NEXT:    mov x6, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov x7, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov x8, #0 // =0x0
-; SVE-OR-SME-NEXT:    mov x18, #0 // =0x0
+; SVE-OR-SME-NEXT:    mov x15, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov z1.d, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov z2.d, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov z3.d, #0 // =0x0
@@ -597,7 +597,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca
 ; STREAMING-COMPAT-NEXT:    mov x6, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    mov x7, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    mov x8, #0 // =0x0
-; STREAMING-COMPAT-NEXT:    mov x18, #0 // =0x0
+; STREAMING-COMPAT-NEXT:    mov x15, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    fmov d1, xzr
 ; STREAMING-COMPAT-NEXT:    fmov d2, xzr
 ; STREAMING-COMPAT-NEXT:    fmov d3, xzr

From bb3b8306dc226c4dc4dfde36444b43476eea66ee Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 12 Jun 2025 10:48:32 +0800
Subject: [PATCH 0078/1322] [NFC] [C++20] [Modules] Add a test module local
 declaration lookup

From
https://github.com/llvm/llvm-project/issues/143734, but it looks good on
trunk. Add it as tests are always good.
---
 .../Modules/module-local-declarations.cppm    | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 clang/test/Modules/module-local-declarations.cppm

diff --git a/clang/test/Modules/module-local-declarations.cppm b/clang/test/Modules/module-local-declarations.cppm
new file mode 100644
index 000000000000..4fbcf09e4d79
--- /dev/null
+++ b/clang/test/Modules/module-local-declarations.cppm
@@ -0,0 +1,30 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/Base.cppm -emit-module-interface -o %t/Base.pcm
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm -fprebuilt-module-path=%t
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fsyntax-only -verify -fprebuilt-module-path=%t
+
+//--- Base.cppm
+export module Base;
+export template <class T>
+class Base {};
+
+//--- A.cppm
+export module A;
+import Base;
+struct S {};
+
+export Base<S> a;
+
+//--- B.cppm
+// expected-no-diagnostics
+export module B;
+
+import A;
+import Base;
+
+struct S {};
+
+export Base<S> b;

From de51b2dd3c6fc995e7db56fc50b4c8dceddc0aab Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 11 Jun 2025 19:51:05 -0700
Subject: [PATCH 0079/1322] [lldb] Move Transport class into lldb_private (NFC)
 (#143806)

Move lldb-dap's Transport class into lldb_private so the code can be
shared between the "JSON with header" protocol used by DAP and the JSON
RPC protocol used by MCP (see [1]).

[1]: https://discourse.llvm.org/t/rfc-adding-mcp-support-to-lldb/86798
---
 lldb/include/lldb/Host/JSONTransport.h    | 126 +++++++++++++++++++
 lldb/source/Host/CMakeLists.txt           |   3 +-
 lldb/source/Host/common/JSONTransport.cpp | 147 ++++++++++++++++++++++
 lldb/tools/lldb-dap/DAP.cpp               |   7 +-
 lldb/tools/lldb-dap/Transport.cpp         | 145 +--------------------
 lldb/tools/lldb-dap/Transport.h           |  65 ++--------
 lldb/unittests/DAP/DAPTest.cpp            |   7 +-
 lldb/unittests/DAP/TestBase.cpp           |   3 +-
 lldb/unittests/DAP/TransportTest.cpp      |  16 ++-
 9 files changed, 308 insertions(+), 211 deletions(-)
 create mode 100644 lldb/include/lldb/Host/JSONTransport.h
 create mode 100644 lldb/source/Host/common/JSONTransport.cpp

diff --git a/lldb/include/lldb/Host/JSONTransport.h b/lldb/include/lldb/Host/JSONTransport.h
new file mode 100644
index 000000000000..4db5e417ea85
--- /dev/null
+++ b/lldb/include/lldb/Host/JSONTransport.h
@@ -0,0 +1,126 @@
+//===-- JSONTransport.h ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Transport layer for encoding and decoding JSON protocol messages.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_HOST_JSONTRANSPORT_H
+#define LLDB_HOST_JSONTRANSPORT_H
+
+#include "lldb/lldb-forward.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/JSON.h"
+#include <chrono>
+#include <system_error>
+
+namespace lldb_private {
+
+class TransportEOFError : public llvm::ErrorInfo<TransportEOFError> {
+public:
+  static char ID;
+
+  TransportEOFError() = default;
+
+  void log(llvm::raw_ostream &OS) const override {
+    OS << "transport end of file reached";
+  }
+  std::error_code convertToErrorCode() const override {
+    return llvm::inconvertibleErrorCode();
+  }
+};
+
+class TransportTimeoutError : public llvm::ErrorInfo<TransportTimeoutError> {
+public:
+  static char ID;
+
+  TransportTimeoutError() = default;
+
+  void log(llvm::raw_ostream &OS) const override {
+    OS << "transport operation timed out";
+  }
+  std::error_code convertToErrorCode() const override {
+    return std::make_error_code(std::errc::timed_out);
+  }
+};
+
+class TransportClosedError : public llvm::ErrorInfo<TransportClosedError> {
+public:
+  static char ID;
+
+  TransportClosedError() = default;
+
+  void log(llvm::raw_ostream &OS) const override {
+    OS << "transport is closed";
+  }
+  std::error_code convertToErrorCode() const override {
+    return llvm::inconvertibleErrorCode();
+  }
+};
+
+/// A transport class that uses JSON for communication.
+class JSONTransport {
+public:
+  JSONTransport(lldb::IOObjectSP input, lldb::IOObjectSP output);
+  virtual ~JSONTransport() = default;
+
+  /// Transport is not copyable.
+  /// @{
+  JSONTransport(const JSONTransport &rhs) = delete;
+  void operator=(const JSONTransport &rhs) = delete;
+  /// @}
+
+  /// Writes a message to the output stream.
+  template <typename T> llvm::Error Write(const T &t) {
+    const std::string message = llvm::formatv("{0}", toJSON(t)).str();
+    return WriteImpl(message);
+  }
+
+  /// Reads the next message from the input stream.
+  template <typename T>
+  llvm::Expected<T> Read(const std::chrono::microseconds &timeout) {
+    llvm::Expected<std::string> message = ReadImpl(timeout);
+    if (!message)
+      return message.takeError();
+    return llvm::json::parse<T>(/*JSON=*/*message);
+  }
+
+protected:
+  virtual void Log(llvm::StringRef message);
+
+  virtual llvm::Error WriteImpl(const std::string &message) = 0;
+  virtual llvm::Expected<std::string>
+  ReadImpl(const std::chrono::microseconds &timeout) = 0;
+
+  lldb::IOObjectSP m_input;
+  lldb::IOObjectSP m_output;
+};
+
+/// A transport class for JSON with a HTTP header.
+class HTTPDelimitedJSONTransport : public JSONTransport {
+public:
+  HTTPDelimitedJSONTransport(lldb::IOObjectSP input, lldb::IOObjectSP output)
+      : JSONTransport(input, output) {}
+  virtual ~HTTPDelimitedJSONTransport() = default;
+
+protected:
+  virtual llvm::Error WriteImpl(const std::string &message) override;
+  virtual llvm::Expected<std::string>
+  ReadImpl(const std::chrono::microseconds &timeout) override;
+
+  // FIXME: Support any header.
+  static constexpr llvm::StringLiteral kHeaderContentLength =
+      "Content-Length: ";
+  static constexpr llvm::StringLiteral kHeaderSeparator = "\r\n\r\n";
+};
+
+} // namespace lldb_private
+
+#endif
diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt
index 5b713133afea..b15d72e61b6e 100644
--- a/lldb/source/Host/CMakeLists.txt
+++ b/lldb/source/Host/CMakeLists.txt
@@ -27,8 +27,9 @@ add_host_subdirectory(common
   common/HostNativeThreadBase.cpp
   common/HostProcess.cpp
   common/HostThread.cpp
-  common/LockFileBase.cpp
+  common/JSONTransport.cpp
   common/LZMA.cpp
+  common/LockFileBase.cpp
   common/MainLoopBase.cpp
   common/MemoryMonitor.cpp
   common/MonitoringProcessLauncher.cpp
diff --git a/lldb/source/Host/common/JSONTransport.cpp b/lldb/source/Host/common/JSONTransport.cpp
new file mode 100644
index 000000000000..103c76d25daf
--- /dev/null
+++ b/lldb/source/Host/common/JSONTransport.cpp
@@ -0,0 +1,147 @@
+//===-- JSONTransport.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/JSONTransport.h"
+#include "lldb/Utility/IOObject.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+#include "lldb/Utility/SelectHelper.h"
+#include "lldb/Utility/Status.h"
+#include "lldb/lldb-forward.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+using namespace lldb;
+using namespace lldb_private;
+
+/// ReadFull attempts to read the specified number of bytes. If EOF is
+/// encountered, an empty string is returned.
+static Expected<std::string>
+ReadFull(IOObject &descriptor, size_t length,
+         std::optional<std::chrono::microseconds> timeout = std::nullopt) {
+  if (!descriptor.IsValid())
+    return llvm::make_error<TransportClosedError>();
+
+  bool timeout_supported = true;
+  // FIXME: SelectHelper does not work with NativeFile on Win32.
+#if _WIN32
+  timeout_supported = descriptor.GetFdType() == IOObject::eFDTypeSocket;
+#endif
+
+  if (timeout && timeout_supported) {
+    SelectHelper sh;
+    sh.SetTimeout(*timeout);
+    sh.FDSetRead(descriptor.GetWaitableHandle());
+    Status status = sh.Select();
+    if (status.Fail()) {
+      // Convert timeouts into a specific error.
+      if (status.GetType() == lldb::eErrorTypePOSIX &&
+          status.GetError() == ETIMEDOUT)
+        return make_error<TransportTimeoutError>();
+      return status.takeError();
+    }
+  }
+
+  std::string data;
+  data.resize(length);
+  Status status = descriptor.Read(data.data(), length);
+  if (status.Fail())
+    return status.takeError();
+
+  // Read returns '' on EOF.
+  if (length == 0)
+    return make_error<TransportEOFError>();
+
+  // Return the actual number of bytes read.
+  return data.substr(0, length);
+}
+
+static Expected<std::string>
+ReadUntil(IOObject &descriptor, StringRef delimiter,
+          std::optional<std::chrono::microseconds> timeout = std::nullopt) {
+  std::string buffer;
+  buffer.reserve(delimiter.size() + 1);
+  while (!llvm::StringRef(buffer).ends_with(delimiter)) {
+    Expected<std::string> next =
+        ReadFull(descriptor, buffer.empty() ? delimiter.size() : 1, timeout);
+    if (auto Err = next.takeError())
+      return std::move(Err);
+    buffer += *next;
+  }
+  return buffer.substr(0, buffer.size() - delimiter.size());
+}
+
+JSONTransport::JSONTransport(IOObjectSP input, IOObjectSP output)
+    : m_input(std::move(input)), m_output(std::move(output)) {}
+
+void JSONTransport::Log(llvm::StringRef message) {
+  LLDB_LOG(GetLog(LLDBLog::Host), "{0}", message);
+}
+
+Expected<std::string>
+HTTPDelimitedJSONTransport::ReadImpl(const std::chrono::microseconds &timeout) {
+  if (!m_input || !m_input->IsValid())
+    return createStringError("transport output is closed");
+
+  IOObject *input = m_input.get();
+  Expected<std::string> message_header =
+      ReadFull(*input, kHeaderContentLength.size(), timeout);
+  if (!message_header)
+    return message_header.takeError();
+  if (*message_header != kHeaderContentLength)
+    return createStringError(formatv("expected '{0}' and got '{1}'",
+                                     kHeaderContentLength, *message_header)
+                                 .str());
+
+  Expected<std::string> raw_length = ReadUntil(*input, kHeaderSeparator);
+  if (!raw_length)
+    return handleErrors(raw_length.takeError(),
+                        [&](const TransportEOFError &E) -> llvm::Error {
+                          return createStringError(
+                              "unexpected EOF while reading header separator");
+                        });
+
+  size_t length;
+  if (!to_integer(*raw_length, length))
+    return createStringError(
+        formatv("invalid content length {0}", *raw_length).str());
+
+  Expected<std::string> raw_json = ReadFull(*input, length);
+  if (!raw_json)
+    return handleErrors(
+        raw_json.takeError(), [&](const TransportEOFError &E) -> llvm::Error {
+          return createStringError("unexpected EOF while reading JSON");
+        });
+
+  Log(llvm::formatv("--> {0}", *raw_json).str());
+
+  return raw_json;
+}
+
+Error HTTPDelimitedJSONTransport::WriteImpl(const std::string &message) {
+  if (!m_output || !m_output->IsValid())
+    return llvm::make_error<TransportClosedError>();
+
+  Log(llvm::formatv("<-- {0}", message).str());
+
+  std::string Output;
+  raw_string_ostream OS(Output);
+  OS << kHeaderContentLength << message.length() << kHeaderSeparator << message;
+  size_t num_bytes = Output.size();
+  return m_output->Write(Output.data(), num_bytes).takeError();
+}
+
+char TransportEOFError::ID;
+char TransportTimeoutError::ID;
+char TransportClosedError::ID;
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index b034c967594b..9fe8227cd2d6 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -70,6 +70,7 @@
 
 using namespace lldb_dap;
 using namespace lldb_dap::protocol;
+using namespace lldb_private;
 
 namespace {
 #ifdef _WIN32
@@ -893,14 +894,14 @@ llvm::Error DAP::Loop() {
 
         while (!disconnecting) {
           llvm::Expected<Message> next =
-              transport.Read(std::chrono::seconds(1));
-          if (next.errorIsA<EndOfFileError>()) {
+              transport.Read<protocol::Message>(std::chrono::seconds(1));
+          if (next.errorIsA<TransportEOFError>()) {
             consumeError(next.takeError());
             break;
           }
 
           // If the read timed out, continue to check if we should disconnect.
-          if (next.errorIsA<TimeoutError>()) {
+          if (next.errorIsA<TransportTimeoutError>()) {
             consumeError(next.takeError());
             continue;
           }
diff --git a/lldb/tools/lldb-dap/Transport.cpp b/lldb/tools/lldb-dap/Transport.cpp
index 4e322e9ff135..d602920da34e 100644
--- a/lldb/tools/lldb-dap/Transport.cpp
+++ b/lldb/tools/lldb-dap/Transport.cpp
@@ -8,152 +8,19 @@
 
 #include "Transport.h"
 #include "DAPLog.h"
-#include "Protocol/ProtocolBase.h"
-#include "lldb/Utility/IOObject.h"
-#include "lldb/Utility/SelectHelper.h"
-#include "lldb/Utility/Status.h"
 #include "lldb/lldb-forward.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/raw_ostream.h"
-#include <optional>
-#include <string>
-#include <utility>
 
 using namespace llvm;
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_dap;
-using namespace lldb_dap::protocol;
 
-/// ReadFull attempts to read the specified number of bytes. If EOF is
-/// encountered, an empty string is returned.
-static Expected<std::string>
-ReadFull(IOObject &descriptor, size_t length,
-         std::optional<std::chrono::microseconds> timeout = std::nullopt) {
-  if (!descriptor.IsValid())
-    return createStringError("transport output is closed");
+Transport::Transport(llvm::StringRef client_name, lldb_dap::Log *log,
+                     lldb::IOObjectSP input, lldb::IOObjectSP output)
+    : HTTPDelimitedJSONTransport(input, output), m_client_name(client_name),
+      m_log(log) {}
 
-  bool timeout_supported = true;
-  // FIXME: SelectHelper does not work with NativeFile on Win32.
-#if _WIN32
-  timeout_supported = descriptor.GetFdType() == IOObject::eFDTypeSocket;
-#endif
-
-  if (timeout && timeout_supported) {
-    SelectHelper sh;
-    sh.SetTimeout(*timeout);
-    sh.FDSetRead(descriptor.GetWaitableHandle());
-    Status status = sh.Select();
-    if (status.Fail()) {
-      // Convert timeouts into a specific error.
-      if (status.GetType() == lldb::eErrorTypePOSIX &&
-          status.GetError() == ETIMEDOUT)
-        return make_error<TimeoutError>();
-      return status.takeError();
-    }
-  }
-
-  std::string data;
-  data.resize(length);
-  Status status = descriptor.Read(data.data(), length);
-  if (status.Fail())
-    return status.takeError();
-
-  // Read returns '' on EOF.
-  if (length == 0)
-    return make_error<EndOfFileError>();
-
-  // Return the actual number of bytes read.
-  return data.substr(0, length);
+void Transport::Log(llvm::StringRef message) {
+  DAP_LOG(m_log, "({0}) {1}", m_client_name, message);
 }
-
-static Expected<std::string>
-ReadUntil(IOObject &descriptor, StringRef delimiter,
-          std::optional<std::chrono::microseconds> timeout = std::nullopt) {
-  std::string buffer;
-  buffer.reserve(delimiter.size() + 1);
-  while (!llvm::StringRef(buffer).ends_with(delimiter)) {
-    Expected<std::string> next =
-        ReadFull(descriptor, buffer.empty() ? delimiter.size() : 1, timeout);
-    if (auto Err = next.takeError())
-      return std::move(Err);
-    buffer += *next;
-  }
-  return buffer.substr(0, buffer.size() - delimiter.size());
-}
-
-/// DAP message format
-/// ```
-/// Content-Length: (?<length>\d+)\r\n\r\n(?<content>.{\k<length>})
-/// ```
-static constexpr StringLiteral kHeaderContentLength = "Content-Length: ";
-static constexpr StringLiteral kHeaderSeparator = "\r\n\r\n";
-
-namespace lldb_dap {
-
-char EndOfFileError::ID;
-char TimeoutError::ID;
-
-Transport::Transport(StringRef client_name, Log *log, IOObjectSP input,
-                     IOObjectSP output)
-    : m_client_name(client_name), m_log(log), m_input(std::move(input)),
-      m_output(std::move(output)) {}
-
-Expected<Message> Transport::Read(const std::chrono::microseconds &timeout) {
-  if (!m_input || !m_input->IsValid())
-    return createStringError("transport output is closed");
-
-  IOObject *input = m_input.get();
-  Expected<std::string> message_header =
-      ReadFull(*input, kHeaderContentLength.size(), timeout);
-  if (!message_header)
-    return message_header.takeError();
-  if (*message_header != kHeaderContentLength)
-    return createStringError(formatv("expected '{0}' and got '{1}'",
-                                     kHeaderContentLength, *message_header)
-                                 .str());
-
-  Expected<std::string> raw_length = ReadUntil(*input, kHeaderSeparator);
-  if (!raw_length)
-    return handleErrors(raw_length.takeError(),
-                        [&](const EndOfFileError &E) -> llvm::Error {
-                          return createStringError(
-                              "unexpected EOF while reading header separator");
-                        });
-
-  size_t length;
-  if (!to_integer(*raw_length, length))
-    return createStringError(
-        formatv("invalid content length {0}", *raw_length).str());
-
-  Expected<std::string> raw_json = ReadFull(*input, length);
-  if (!raw_json)
-    return handleErrors(
-        raw_json.takeError(), [&](const EndOfFileError &E) -> llvm::Error {
-          return createStringError("unexpected EOF while reading JSON");
-        });
-
-  DAP_LOG(m_log, "--> ({0}) {1}", m_client_name, *raw_json);
-
-  return json::parse<Message>(/*JSON=*/*raw_json,
-                              /*RootName=*/"protocol_message");
-}
-
-Error Transport::Write(const Message &message) {
-  if (!m_output || !m_output->IsValid())
-    return createStringError("transport output is closed");
-
-  std::string json = formatv("{0}", toJSON(message)).str();
-
-  DAP_LOG(m_log, "<-- ({0}) {1}", m_client_name, json);
-
-  std::string Output;
-  raw_string_ostream OS(Output);
-  OS << kHeaderContentLength << json.length() << kHeaderSeparator << json;
-  size_t num_bytes = Output.size();
-  return m_output->Write(Output.data(), num_bytes).takeError();
-}
-
-} // end namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/Transport.h b/lldb/tools/lldb-dap/Transport.h
index 4e347eaa5131..51f62e718a0d 100644
--- a/lldb/tools/lldb-dap/Transport.h
+++ b/lldb/tools/lldb-dap/Transport.h
@@ -15,70 +15,21 @@
 #define LLDB_TOOLS_LLDB_DAP_TRANSPORT_H
 
 #include "DAPForward.h"
-#include "Protocol/ProtocolBase.h"
+#include "lldb/Host/JSONTransport.h"
 #include "lldb/lldb-forward.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
-#include <chrono>
-#include <system_error>
 
 namespace lldb_dap {
 
-class EndOfFileError : public llvm::ErrorInfo<EndOfFileError> {
-public:
-  static char ID;
-
-  EndOfFileError() = default;
-
-  void log(llvm::raw_ostream &OS) const override {
-    OS << "end of file reached";
-  }
-  std::error_code convertToErrorCode() const override {
-    return llvm::inconvertibleErrorCode();
-  }
-};
-
-class TimeoutError : public llvm::ErrorInfo<TimeoutError> {
-public:
-  static char ID;
-
-  TimeoutError() = default;
-
-  void log(llvm::raw_ostream &OS) const override {
-    OS << "operation timed out";
-  }
-  std::error_code convertToErrorCode() const override {
-    return std::make_error_code(std::errc::timed_out);
-  }
-};
-
 /// A transport class that performs the Debug Adapter Protocol communication
 /// with the client.
-class Transport {
+class Transport : public lldb_private::HTTPDelimitedJSONTransport {
 public:
-  Transport(llvm::StringRef client_name, Log *log, lldb::IOObjectSP input,
-            lldb::IOObjectSP output);
-  ~Transport() = default;
+  Transport(llvm::StringRef client_name, lldb_dap::Log *log,
+            lldb::IOObjectSP input, lldb::IOObjectSP output);
+  virtual ~Transport() = default;
 
-  /// Transport is not copyable.
-  /// @{
-  Transport(const Transport &rhs) = delete;
-  void operator=(const Transport &rhs) = delete;
-  /// @}
-
-  /// Writes a Debug Adater Protocol message to the output stream.
-  llvm::Error Write(const protocol::Message &M);
-
-  /// Reads the next Debug Adater Protocol message from the input stream.
-  ///
-  /// \param timeout[in]
-  ///     A timeout to wait for reading the initial header. Once a message
-  ///     header is recieved, this will block until the full message is
-  ///     read.
-  ///
-  /// \returns Returns the next protocol message.
-  llvm::Expected<protocol::Message>
-  Read(const std::chrono::microseconds &timeout);
+  virtual void Log(llvm::StringRef message) override;
 
   /// Returns the name of this transport client, for example `stdin/stdout` or
   /// `client_1`.
@@ -86,9 +37,7 @@ public:
 
 private:
   llvm::StringRef m_client_name;
-  Log *m_log;
-  lldb::IOObjectSP m_input;
-  lldb::IOObjectSP m_output;
+  lldb_dap::Log *m_log;
 };
 
 } // namespace lldb_dap
diff --git a/lldb/unittests/DAP/DAPTest.cpp b/lldb/unittests/DAP/DAPTest.cpp
index 5fb6bf7e564a..40ffaf87c9c4 100644
--- a/lldb/unittests/DAP/DAPTest.cpp
+++ b/lldb/unittests/DAP/DAPTest.cpp
@@ -32,7 +32,8 @@ TEST_F(DAPTest, SendProtocolMessages) {
       /*transport=*/*to_dap,
   };
   dap.Send(Event{/*event=*/"my-event", /*body=*/std::nullopt});
-  ASSERT_THAT_EXPECTED(from_dap->Read(std::chrono::milliseconds(1)),
-                       HasValue(testing::VariantWith<Event>(testing::FieldsAre(
-                           /*event=*/"my-event", /*body=*/std::nullopt))));
+  ASSERT_THAT_EXPECTED(
+      from_dap->Read<protocol::Message>(std::chrono::milliseconds(1)),
+      HasValue(testing::VariantWith<Event>(testing::FieldsAre(
+          /*event=*/"my-event", /*body=*/std::nullopt))));
 }
diff --git a/lldb/unittests/DAP/TestBase.cpp b/lldb/unittests/DAP/TestBase.cpp
index 388d1b901507..4063b3425031 100644
--- a/lldb/unittests/DAP/TestBase.cpp
+++ b/lldb/unittests/DAP/TestBase.cpp
@@ -122,7 +122,8 @@ std::vector<Message> DAPTestBase::DrainOutput() {
   std::vector<Message> msgs;
   output.CloseWriteFileDescriptor();
   while (true) {
-    Expected<Message> next = from_dap->Read(std::chrono::milliseconds(1));
+    Expected<Message> next =
+        from_dap->Read<protocol::Message>(std::chrono::milliseconds(1));
     if (!next) {
       consumeError(next.takeError());
       break;
diff --git a/lldb/unittests/DAP/TransportTest.cpp b/lldb/unittests/DAP/TransportTest.cpp
index e6dab42e3094..aaf257993af2 100644
--- a/lldb/unittests/DAP/TransportTest.cpp
+++ b/lldb/unittests/DAP/TransportTest.cpp
@@ -26,6 +26,8 @@ using namespace lldb_dap::protocol;
 using lldb_private::File;
 using lldb_private::NativeFile;
 using lldb_private::Pipe;
+using lldb_private::TransportEOFError;
+using lldb_private::TransportTimeoutError;
 
 class TransportTest : public PipeBase {
 protected:
@@ -50,7 +52,7 @@ TEST_F(TransportTest, MalformedRequests) {
       input.Write(malformed_header.data(), malformed_header.size()),
       Succeeded());
   ASSERT_THAT_EXPECTED(
-      transport->Read(std::chrono::milliseconds(1)),
+      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
       FailedWithMessage(
           "expected 'Content-Length: ' and got 'COnTent-LenGth: '"));
 }
@@ -63,20 +65,22 @@ TEST_F(TransportTest, Read) {
   ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()),
                        Succeeded());
   ASSERT_THAT_EXPECTED(
-      transport->Read(std::chrono::milliseconds(1)),
+      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
       HasValue(testing::VariantWith<Request>(testing::FieldsAre(
           /*seq=*/1, /*command=*/"abc", /*arguments=*/std::nullopt))));
 }
 
 TEST_F(TransportTest, ReadWithTimeout) {
-  ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)),
-                       Failed<TimeoutError>());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
+      Failed<TransportTimeoutError>());
 }
 
 TEST_F(TransportTest, ReadWithEOF) {
   input.CloseWriteFileDescriptor();
-  ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)),
-                       Failed<EndOfFileError>());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
+      Failed<TransportEOFError>());
 }
 
 TEST_F(TransportTest, Write) {

From faa49d6662b4c14438cc8e63a3751c22f28d2481 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Thu, 12 Jun 2025 02:53:03 +0000
Subject: [PATCH 0080/1322] [gn build] Port de51b2dd3c6f

---
 llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
index ca1acf9ba8aa..b00442d8e1eb 100644
--- a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
@@ -27,6 +27,7 @@ static_library("Host") {
     "common/HostNativeThreadBase.cpp",
     "common/HostProcess.cpp",
     "common/HostThread.cpp",
+    "common/JSONTransport.cpp",
     "common/LZMA.cpp",
     "common/LockFileBase.cpp",
     "common/MainLoopBase.cpp",

From d8118ed6db28a3caaf3fa4a4f8d0d51d33b09c30 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 11 Jun 2025 20:00:45 -0700
Subject: [PATCH 0081/1322] [ELF,test] Improve weak-undef-rw.s

---
 lld/test/ELF/weak-undef-rw.s | 54 +++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s
index bbc37ba49304..902cad87aba9 100644
--- a/lld/test/ELF/weak-undef-rw.s
+++ b/lld/test/ELF/weak-undef-rw.s
@@ -3,12 +3,17 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64 c.s -o c.o
-# RUN: ld.lld a.o -o nopie --export-dynamic
-# RUN: llvm-readelf -r --hex-dump=.data nopie | FileCheck %s --check-prefix=STATIC
-# RUN: ld.lld a.o -o out.pie -pie
-# RUN: llvm-readelf -r --hex-dump=.data out.pie | FileCheck %s --check-prefix=STATIC
-# RUN: ld.lld a.o -o out.so -shared
-# RUN: llvm-readobj -r out.so | FileCheck %s --check-prefix=PIC
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o s.o
+# RUN: ld.lld -shared s.o -o s.so
+
+# RUN: ld.lld a.o -o a --export-dynamic
+# RUN: llvm-readelf -r --hex-dump=.data a | FileCheck %s --check-prefix=STATIC
+# RUN: ld.lld a.o s.so -o as
+# RUN: llvm-readelf -r --hex-dump=.data as | FileCheck %s --check-prefix=STATIC
+# RUN: ld.lld a.o -o a.pie -pie
+# RUN: llvm-readelf -r --hex-dump=.data a.pie | FileCheck %s --check-prefix=STATIC
+# RUN: ld.lld a.o -o a.so -shared
+# RUN: llvm-readelf -r a.so | FileCheck %s --check-prefix=DYN
 
 ## gABI leaves the behavior of weak undefined references implementation defined.
 ## We choose to resolve them statically for static linking and produce dynamic relocations
@@ -19,35 +24,44 @@
 
 # STATIC:      no relocations
 # STATIC:      Hex dump of section '.data':
-# STATIC-NEXT: {{.*}} 00000000 00000000 .
+# STATIC-NEXT: {{.*}} 00000000 00000000 03000000 00000000 .
 # STATIC-EMPTY:
 
-# PIC:      .rela.dyn {
-# PIC-NEXT:   R_X86_64_64 foobar 0x0
-# PIC-NEXT: }
+# DYN:        Relocation section '.rela.dyn' {{.*}} contains 2
+# DYN:        R_X86_64_64 0000000000000000 foobar + 0{{$}}
 
-# RUN: ld.lld a.o b.o -o out1 -z undefs
-# RUN: llvm-readelf -r -x .data out1 | FileCheck %s --check-prefix=STATIC1
-# RUN: ld.lld a.o b.o -o out1.pie -pie -z undefs
-# RUN: llvm-readelf -r -x .data out1.pie | FileCheck %s --check-prefix=STATIC1
+# RUN: ld.lld a.o b.o -o ab -z undefs
+# RUN: llvm-readelf -r -x .data ab | FileCheck %s --check-prefix=STATIC1
+# RUN: ld.lld a.o b.o s.so -o abs -z undefs
+# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=DYN1
+# RUN: ld.lld a.o b.o -o abs.pie -pie -z undefs
+# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=STATIC1
 
 # STATIC1:      no relocations
 # STATIC1:      Hex dump of section '.data':
-# STATIC1-NEXT: {{.*}} 00000000 00000000 00000000 00000000 .
+# STATIC1-NEXT: {{.*}} 00000000 00000000 03000000 00000000 .
+# STATIC1-NEXT: {{.*}} 05000000 00000000                   .
 # STATIC1-EMPTY:
 
+# DYN1:        Relocation section '.rela.dyn' {{.*}} contains 1
+# DYN1:        Hex dump of section '.data':
+# DYN1-NEXT:   {{.*}} 00000000 00000000 03000000 00000000 .
+# DYN1-NEXT:   {{.*}} 00000000 00000000                   .
+# DYN1-EMPTY:
+
 # RUN: ld.lld a.o b.o c.o -pie -z undefs 2>&1 | count 0
 
 #--- a.s
-        .global _start
+.global _start
 _start:
-        .data
-        .weak foobar
-        .quad foobar
+.data
+.weak foobar
+.quad foobar
+.quad foobar+3
 
 #--- b.s
 .data
-.quad undef
+.quad undef+5
 
 #--- c.s
 call undef

From b46f34452e9dec50eee6ddbe07875f05e421a81c Mon Sep 17 00:00:00 2001
From: Khem Raj <raj.khem@gmail.com>
Date: Wed, 11 Jun 2025 20:22:08 -0700
Subject: [PATCH 0082/1322] libunwind: Do not use 
 __attribute__((target("gcs"))) with non-clang compilers (#138077)

This attribute is unsupported in GCC, so far it worked because before
GCC15 did not define this macros in _CHKFEAT_GCS in arm_acle.h [1]

With gcc15 compiler libunwind's check for this macros is succeeding and
it ends up enabling 'gcs' by using function attribute, this works with
clang but not with gcc.

We can see this in rust compiler bootstrap for aarch64/musl when system
uses gcc15, it ends up with these errors

Building libunwind.a for aarch64-poky-linux-musl
```
cargo:warning=/mnt/b/yoe/master/sources/poky/build/tmp/work/cortexa57-poky-linux-musl/rust/1.85.1/rustc-1.85.1-src/src/llvm-project/libunwind/src/UnwindLevel1.c:191:1: error: arch extension 'gcs' should be prefixed by '+' cargo:warning=  191 | unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) {
cargo:warning=      | ^~~~~~~~~~~~~
cargo:warning=/mnt/b/yoe/master/sources/poky/build/tmp/work/cortexa57-poky-linux-musl/rust/1.85.1/rustc-1.85.1-src/src/llvm-project/libunwind/src/UnwindLevel1.c:337:22: error: arch extension 'gcs' should be prefixed by '+'
cargo:warning=  337 |                      _Unwind_Stop_Fn stop, void *stop_parameter) {
cargo:warning=      |                      ^~~~~~~~~~~~~~~
```

[1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=5a6af707f0af

Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
 libunwind/src/UnwindLevel1.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libunwind/src/UnwindLevel1.c b/libunwind/src/UnwindLevel1.c
index a258a832a9c3..f3b451ad9b73 100644
--- a/libunwind/src/UnwindLevel1.c
+++ b/libunwind/src/UnwindLevel1.c
@@ -188,10 +188,11 @@ extern int __unw_step_stage2(unw_cursor_t *);
 
 #if defined(_LIBUNWIND_USE_GCS)
 // Enable the GCS target feature to permit gcspop instructions to be used.
-__attribute__((target("gcs")))
+__attribute__((target("+gcs")))
 #endif
 static _Unwind_Reason_Code
-unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) {
+unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor,
+              _Unwind_Exception *exception_object) {
   __unw_init_local(cursor, uc);
 
   _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_obj=%p)",
@@ -332,12 +333,12 @@ unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *except
 
 #if defined(_LIBUNWIND_USE_GCS)
 // Enable the GCS target feature to permit gcspop instructions to be used.
-__attribute__((target("gcs")))
+__attribute__((target("+gcs")))
 #endif
 static _Unwind_Reason_Code
 unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor,
-                     _Unwind_Exception *exception_object,
-                     _Unwind_Stop_Fn stop, void *stop_parameter) {
+                     _Unwind_Exception *exception_object, _Unwind_Stop_Fn stop,
+                     void *stop_parameter) {
   __unw_init_local(cursor, uc);
 
   // uc is initialized by __unw_getcontext in the parent frame. The first stack
@@ -443,7 +444,6 @@ unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor,
   return _URC_FATAL_PHASE2_ERROR;
 }
 
-
 /// Called by __cxa_throw.  Only returns if there is a fatal error.
 _LIBUNWIND_EXPORT _Unwind_Reason_Code
 _Unwind_RaiseException(_Unwind_Exception *exception_object) {

From a71210e5abdbae80363cb5956a24a2004f625ca6 Mon Sep 17 00:00:00 2001
From: Kewen12 <Kewen.Meng@amd.com>
Date: Wed, 11 Jun 2025 20:24:56 -0700
Subject: [PATCH 0083/1322] Revert "[libc] Fix stdio tests after #143802"
 (#143824)

Reverts llvm/llvm-project#143810

This PR breaks our buildbot:
https://lab.llvm.org/buildbot/#/builders/10/builds/7159 revert to
unblock downstream merge.
---
 libc/docs/configure.rst                     | 2 +-
 libc/test/src/stdio/fgetc_test.cpp          | 1 -
 libc/test/src/stdio/fgetc_unlocked_test.cpp | 1 -
 libc/test/src/stdio/fgets_test.cpp          | 1 -
 libc/test/src/stdio/setvbuf_test.cpp        | 1 -
 5 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 109412225634..8d53390ae19b 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -29,7 +29,7 @@ to learn about the defaults for your platform and target.
     - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack.
     - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
 * **"errno" options**
-    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE.
+    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
 * **"general" options**
     - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
 * **"math" options**
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 1faa49112fb6..7c652f666a8f 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -33,7 +33,6 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    ASSERT_ERRNO_FAILURE();
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 7b2efe642fb5..f4471dd82df1 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -36,7 +36,6 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    ASSERT_ERRNO_FAILURE();
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index 2d7c68d49081..c00a9256af52 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -36,7 +36,6 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  ASSERT_ERRNO_FAILURE();
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index a0936ba79ef7..4144bc1bef44 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,7 +11,6 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"

From 968d8eaa44c500259fe8d56ad77ec1c71cad35e2 Mon Sep 17 00:00:00 2001
From: Yang Zaizhou <91008302+Mxfg-incense@users.noreply.github.com>
Date: Thu, 12 Jun 2025 11:28:57 +0800
Subject: [PATCH 0084/1322] [OpenMP][Flang]Fix omp_get_cancellation return type
 from integer to logical (#142990)

---
 openmp/runtime/src/include/omp_lib.F90.var | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/runtime/src/include/omp_lib.F90.var b/openmp/runtime/src/include/omp_lib.F90.var
index 3463b698291e..20639f60b5d9 100644
--- a/openmp/runtime/src/include/omp_lib.F90.var
+++ b/openmp/runtime/src/include/omp_lib.F90.var
@@ -399,7 +399,7 @@
 
           function omp_get_cancellation() bind(c)
             use omp_lib_kinds
-            integer (kind=omp_integer_kind) omp_get_cancellation
+            logical (kind=omp_logical_kind) omp_get_cancellation
           end function omp_get_cancellation
 
           function omp_is_initial_device() bind(c)

From 2fcaa00d1e2317a90c9071b735eb0e758b5dd58b Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 11 Jun 2025 20:37:15 -0700
Subject: [PATCH 0085/1322] [ELF] -z undefs: handle relocations referencing
 undefined non-weak like undefined weak

* Merge the special case into isStaticLinkTimeConstant
* Generalize isUndefWeak to isUndefined. undefined non-weak is an error
  case. We choose to be general, which also brings us in line with GNU ld.
---
 lld/ELF/Relocations.cpp      | 25 ++++++++++---------------
 lld/test/ELF/weak-undef-rw.s | 12 +++++++-----
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 1af01e7247dc..6c4209a2b81e 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -990,10 +990,17 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type,
   // only the low bits are used.
   if (e == R_GOT || e == R_PLT)
     return ctx.target->usesOnlyLowPageBits(type) || !ctx.arg.isPic;
-
   // R_AARCH64_AUTH_ABS64 requires a dynamic relocation.
-  if (sym.isPreemptible || e == RE_AARCH64_AUTH)
+  if (e == RE_AARCH64_AUTH)
     return false;
+
+  // The behavior of an undefined weak reference is implementation defined.
+  // (We treat undefined non-weak the same as undefined weak.) For static
+  // -no-pie linking, dynamic relocations are generally avoided (except
+  // IRELATIVE). Emitting dynamic relocations for -shared aligns with its -z
+  // undefs default. Dynamic -no-pie linking and -pie allow flexibility.
+  if (sym.isPreemptible)
+    return sym.isUndefined() && !ctx.arg.isPic;
   if (!ctx.arg.isPic)
     return true;
 
@@ -1113,19 +1120,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset,
   // If the relocation is known to be a link-time constant, we know no dynamic
   // relocation will be created, pass the control to relocateAlloc() or
   // relocateNonAlloc() to resolve it.
-  //
-  // The behavior of an undefined weak reference is implementation defined. For
-  // non-link-time constants, we resolve relocations statically (let
-  // relocate{,Non}Alloc() resolve them) for -no-pie and try producing dynamic
-  // relocations for -pie and -shared.
-  //
-  // The general expectation of -no-pie static linking is that there is no
-  // dynamic relocation (except IRELATIVE). Emitting dynamic relocations for
-  // -shared matches the spirit of its -z undefs default. -pie has freedom on
-  // choices, and we choose dynamic relocations to be consistent with the
-  // handling of GOT-generating relocations.
-  if (isStaticLinkTimeConstant(expr, type, sym, offset) ||
-      (!ctx.arg.isPic && sym.isUndefWeak())) {
+  if (isStaticLinkTimeConstant(expr, type, sym, offset)) {
     sec->addReloc({expr, type, offset, addend, &sym});
     return;
   }
diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s
index 902cad87aba9..497228a3cf90 100644
--- a/lld/test/ELF/weak-undef-rw.s
+++ b/lld/test/ELF/weak-undef-rw.s
@@ -33,9 +33,11 @@
 # RUN: ld.lld a.o b.o -o ab -z undefs
 # RUN: llvm-readelf -r -x .data ab | FileCheck %s --check-prefix=STATIC1
 # RUN: ld.lld a.o b.o s.so -o abs -z undefs
-# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=DYN1
-# RUN: ld.lld a.o b.o -o abs.pie -pie -z undefs
-# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=STATIC1
+# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=STATIC1
+# RUN: ld.lld a.o b.o -o ab.pie -pie -z undefs
+# RUN: llvm-readelf -r -x .data ab.pie | FileCheck %s --check-prefix=STATIC1
+# RUN: ld.lld a.o b.o s.so -o abs.pie -pie -z undefs
+# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=DYN1
 
 # STATIC1:      no relocations
 # STATIC1:      Hex dump of section '.data':
@@ -43,9 +45,9 @@
 # STATIC1-NEXT: {{.*}} 05000000 00000000                   .
 # STATIC1-EMPTY:
 
-# DYN1:        Relocation section '.rela.dyn' {{.*}} contains 1
+# DYN1:        Relocation section '.rela.dyn' {{.*}} contains 3
 # DYN1:        Hex dump of section '.data':
-# DYN1-NEXT:   {{.*}} 00000000 00000000 03000000 00000000 .
+# DYN1-NEXT:   {{.*}} 00000000 00000000 00000000 00000000 .
 # DYN1-NEXT:   {{.*}} 00000000 00000000                   .
 # DYN1-EMPTY:
 

From 5f231db76482bbdd3e658d8e9797cbd46837d4e1 Mon Sep 17 00:00:00 2001
From: Brandon Wu <songwu0813@gmail.com>
Date: Thu, 12 Jun 2025 11:41:52 +0800
Subject: [PATCH 0086/1322] [RISCV] Use StringRef for RequiredExtensions in
 RVVIntrinsicDef (#143503)

This prevents many duplicated copies of required extensions string.
---
 clang/lib/Sema/SemaRISCV.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp
index 9f70be746eb3..9eab0c2a0df6 100644
--- a/clang/lib/Sema/SemaRISCV.cpp
+++ b/clang/lib/Sema/SemaRISCV.cpp
@@ -47,7 +47,7 @@ struct RVVIntrinsicDef {
   std::string BuiltinName;
 
   /// Mapping to RequiredFeatures in riscv_vector.td
-  std::string RequiredExtensions;
+  StringRef RequiredExtensions;
 
   /// Function signature, first element is return type.
   RVVTypes Signature;

From f09050fdc85074869f0b34f0d9e061a74ef549ee Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 12 Jun 2025 11:35:44 +0800
Subject: [PATCH 0087/1322] [C++20] [Modules] Fix module local lookup
 ambiguousity

Close https://github.com/llvm/llvm-project/issues/61360
Close https://github.com/llvm/llvm-project/issues/129525
Close https://github.com/llvm/llvm-project/issues/143734

We shouldn't identify different module local decls in different modules
as the same entity.
---
 clang/include/clang/AST/ASTContext.h          |  6 ++--
 clang/include/clang/AST/DeclBase.h            |  4 +++
 clang/lib/AST/ASTContext.cpp                  |  8 ++++-
 clang/lib/AST/DeclBase.cpp                    |  6 ++++
 .../Modules/module-local-declarations-02.cppm | 31 +++++++++++++++++++
 clang/test/Modules/pr61360.cppm               | 25 +++++++++++++++
 6 files changed, 76 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Modules/module-local-declarations-02.cppm
 create mode 100644 clang/test/Modules/pr61360.cppm

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 8d24d393eab0..3abb49312255 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -488,8 +488,8 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// if possible.
   ///
   /// Not serialized intentionally.
-  llvm::StringMap<const Module *> PrimaryModuleNameMap;
-  llvm::DenseMap<const Module *, const Module *> SameModuleLookupSet;
+  mutable llvm::StringMap<const Module *> PrimaryModuleNameMap;
+  mutable llvm::DenseMap<const Module *, const Module *> SameModuleLookupSet;
 
   static constexpr unsigned ConstantArrayTypesLog2InitSize = 8;
   static constexpr unsigned GeneralTypesLog2InitSize = 9;
@@ -1151,7 +1151,7 @@ public:
   ///
   /// FIXME: The signature may be confusing since `clang::Module` means to
   /// a module fragment or a module unit but not a C++20 module.
-  bool isInSameModule(const Module *M1, const Module *M2);
+  bool isInSameModule(const Module *M1, const Module *M2) const;
 
   TranslationUnitDecl *getTranslationUnitDecl() const {
     return TUDecl->getMostRecentDecl();
diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 375e9e259250..dd67ebc9873f 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -646,6 +646,10 @@ public:
     return getModuleOwnershipKind() == ModuleOwnershipKind::ModulePrivate;
   }
 
+  /// Whether this declaration was a local declaration to a C++20
+  /// named module.
+  bool isModuleLocal() const;
+
   /// Whether this declaration was exported in a lexical context.
   /// e.g.:
   ///
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index b51f7622288d..4d44f23c0f50 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1175,7 +1175,7 @@ void ASTContext::setCurrentNamedModule(Module *M) {
   CurrentCXXNamedModule = M;
 }
 
-bool ASTContext::isInSameModule(const Module *M1, const Module *M2) {
+bool ASTContext::isInSameModule(const Module *M1, const Module *M2) const {
   if (!M1 != !M2)
     return false;
 
@@ -7429,6 +7429,12 @@ bool ASTContext::isSameEntity(const NamedDecl *X, const NamedDecl *Y) const {
                           cast<Decl>(Y->getDeclContext()->getRedeclContext())))
     return false;
 
+  // If either X or Y are local to the owning module, they are only possible to
+  // be the same entity if they are in the same module.
+  if (X->isModuleLocal() || Y->isModuleLocal())
+    if (!isInSameModule(X->getOwningModule(), Y->getOwningModule()))
+      return false;
+
   // Two typedefs refer to the same entity if they have the same underlying
   // type.
   if (const auto *TypedefX = dyn_cast<TypedefNameDecl>(X))
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index a1bb62bcb68f..48c60aa4e449 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -1132,6 +1132,12 @@ bool Decl::isInExportDeclContext() const {
   return isa_and_nonnull<ExportDecl>(DC);
 }
 
+bool Decl::isModuleLocal() const {
+  auto *M = getOwningModule();
+  return M && M->isNamedModule() &&
+         getModuleOwnershipKind() == ModuleOwnershipKind::ReachableWhenImported;
+}
+
 bool Decl::isInAnotherModuleUnit() const {
   auto *M = getOwningModule();
 
diff --git a/clang/test/Modules/module-local-declarations-02.cppm b/clang/test/Modules/module-local-declarations-02.cppm
new file mode 100644
index 000000000000..0670c4295abc
--- /dev/null
+++ b/clang/test/Modules/module-local-declarations-02.cppm
@@ -0,0 +1,31 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fprebuilt-module-path=%t -emit-llvm -o %t/B.ll
+
+//--- A.cppm
+export module A;
+
+export template<typename>
+struct holder {
+};
+
+struct foo {};
+
+export struct a {
+	holder<foo> m;
+};
+
+//--- B.cppm
+// expected-no-diagnostics
+export module B;
+
+import A;
+
+struct foo {};
+
+struct b {
+	holder<foo> m;
+};
\ No newline at end of file
diff --git a/clang/test/Modules/pr61360.cppm b/clang/test/Modules/pr61360.cppm
new file mode 100644
index 000000000000..a16f65d4be2f
--- /dev/null
+++ b/clang/test/Modules/pr61360.cppm
@@ -0,0 +1,25 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fprebuilt-module-path=%t -emit-llvm -o %t/B.ll
+
+//--- A.cppm
+export module A;
+export template<typename>
+struct holder {
+};
+
+struct a {
+	holder<struct foo> m;
+};
+
+//--- B.cppm
+// expected-no-diagnostics
+export module B;
+import A;
+
+struct b {
+	holder<struct foo> m;
+};

From 282e471018d234f78b0990100834532389877519 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Thu, 12 Jun 2025 05:58:55 +0200
Subject: [PATCH 0088/1322] [flang] Erase `fir.local` ops before lowering `fir`
 to `llvm` (#143687)

`fir.local` ops are not supposed to have any uses at this point (i.e.
during lowering to LLVM). In case of serialization, the
`fir.do_concurrent` users are expected to have been lowered to
`fir.do_loop` nests. In case of parallelization, the `fir.do_concurrent`
users are expected to have been lowered to the target parallel model
(e.g. OpenMP).

This hopefully resolved a build issue introduced by
https://github.com/llvm/llvm-project/pull/142567 (see for example:
https://lab.llvm.org/buildbot/#/builders/199/builds/4009).
---
 flang/lib/Optimizer/CodeGen/CodeGen.cpp | 42 +++++++++++++++++++------
 flang/test/Fir/local.fir                | 10 ++++++
 2 files changed, 43 insertions(+), 9 deletions(-)
 create mode 100644 flang/test/Fir/local.fir

diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 82d960a6fc61..a3de3ae9d116 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3294,6 +3294,30 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
   }
 };
 
+struct LocalitySpecifierOpConversion
+    : public fir::FIROpConversion<fir::LocalitySpecifierOp> {
+  using FIROpConversion::FIROpConversion;
+  llvm::LogicalResult
+  matchAndRewrite(fir::LocalitySpecifierOp localizer, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+#ifdef EXPENSIVE_CHECKS
+    auto uses = mlir::SymbolTable::getSymbolUses(
+        localizer, localizer->getParentOfType<mlir::ModuleOp>());
+
+    // `fir.local` ops are not supposed to have any uses at this point (i.e.
+    // during lowering to LLVM). In case of serialization, the
+    // `fir.do_concurrent` users are expected to have been lowered to
+    // `fir.do_loop` nests. In case of parallelization, the `fir.do_concurrent`
+    // users are expected to have been lowered to the target parallel model
+    // (e.g. OpenMP).
+    assert(uses && uses->empty());
+#endif
+
+    rewriter.eraseOp(localizer);
+    return mlir::success();
+  }
+};
+
 /// Lower `fir.no_reassoc` to LLVM IR dialect.
 /// TODO: how do we want to enforce this in LLVM-IR? Can we manipulate the fast
 /// math flags?
@@ -4249,15 +4273,15 @@ void fir::populateFIRToLLVMConversionPatterns(
       FieldIndexOpConversion, FirEndOpConversion, FreeMemOpConversion,
       GlobalLenOpConversion, GlobalOpConversion, InsertOnRangeOpConversion,
       IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion,
-      MulcOpConversion, NegcOpConversion, NoReassocOpConversion,
-      SelectCaseOpConversion, SelectOpConversion, SelectRankOpConversion,
-      SelectTypeOpConversion, ShapeOpConversion, ShapeShiftOpConversion,
-      ShiftOpConversion, SliceOpConversion, StoreOpConversion,
-      StringLitOpConversion, SubcOpConversion, TypeDescOpConversion,
-      TypeInfoOpConversion, UnboxCharOpConversion, UnboxProcOpConversion,
-      UndefOpConversion, UnreachableOpConversion, XArrayCoorOpConversion,
-      XEmboxOpConversion, XReboxOpConversion, ZeroOpConversion>(converter,
-                                                                options);
+      LocalitySpecifierOpConversion, MulcOpConversion, NegcOpConversion,
+      NoReassocOpConversion, SelectCaseOpConversion, SelectOpConversion,
+      SelectRankOpConversion, SelectTypeOpConversion, ShapeOpConversion,
+      ShapeShiftOpConversion, ShiftOpConversion, SliceOpConversion,
+      StoreOpConversion, StringLitOpConversion, SubcOpConversion,
+      TypeDescOpConversion, TypeInfoOpConversion, UnboxCharOpConversion,
+      UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion,
+      XArrayCoorOpConversion, XEmboxOpConversion, XReboxOpConversion,
+      ZeroOpConversion>(converter, options);
 
   // Patterns that are populated without a type converter do not trigger
   // target materializations for the operands of the root op.
diff --git a/flang/test/Fir/local.fir b/flang/test/Fir/local.fir
new file mode 100644
index 000000000000..006f5ca94467
--- /dev/null
+++ b/flang/test/Fir/local.fir
@@ -0,0 +1,10 @@
+// RUN: fir-opt --fir-to-llvm-ir %s | FileCheck %s
+
+// Tests that `fir.local` ops are dropped from the module before LLVM lowering.
+
+fir.local {type = local} @local_privatizer : i32
+func.func @foo() {
+  return
+}
+
+// CHECK-NOT: fir.local

From c3be4524a56ba01bc1f868fc37e329f24ec5041c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 11 Jun 2025 21:23:06 -0700
Subject: [PATCH 0089/1322] [ELF,test] Improve weak-undef-got-plt.s

---
 lld/test/ELF/weak-undef-got-pie.s | 22 --------------------
 lld/test/ELF/weak-undef-got-plt.s | 34 +++++++++++++++++++++++++++++++
 lld/test/ELF/weak-undef.s         | 31 ----------------------------
 3 files changed, 34 insertions(+), 53 deletions(-)
 delete mode 100644 lld/test/ELF/weak-undef-got-pie.s
 create mode 100644 lld/test/ELF/weak-undef-got-plt.s
 delete mode 100644 lld/test/ELF/weak-undef.s

diff --git a/lld/test/ELF/weak-undef-got-pie.s b/lld/test/ELF/weak-undef-got-pie.s
deleted file mode 100644
index 2301400f4e0b..000000000000
--- a/lld/test/ELF/weak-undef-got-pie.s
+++ /dev/null
@@ -1,22 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/dummy-shared.s -o %t1.o
-# RUN: ld.lld %t1.o -shared -o %t1.so
-# RUN: llvm-mc -filetype=obj -x86-relax-relocations=false -triple=x86_64 %s -o %t.o
-
-# RUN: ld.lld -pie %t.o %t1.so -o %t
-# RUN: llvm-readobj -r %t | FileCheck --check-prefix=RELOCS %s
-# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=DISASM %s
-
-# RELOCS:      Relocations [
-# RELOCS-NEXT:   Section ({{.*}}) .rela.dyn {
-# RELOCS-NEXT:     R_X86_64_GLOB_DAT foo 0x0
-# RELOCS-NEXT:   }
-# RELOCS-NEXT: ]
-
-.weak foo
-
-.globl _start
-_start:
-# DISASM: <_start>:
-# DISASM-NEXT: movq {{.*}}(%rip), %rax
-mov foo@gotpcrel(%rip), %rax
diff --git a/lld/test/ELF/weak-undef-got-plt.s b/lld/test/ELF/weak-undef-got-plt.s
new file mode 100644
index 000000000000..0ee3da2cd3b4
--- /dev/null
+++ b/lld/test/ELF/weak-undef-got-plt.s
@@ -0,0 +1,34 @@
+# REQUIRES: x86
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 -x86-relax-relocations=false a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o s.o
+# RUN: ld.lld -shared s.o -o s.so
+
+# RUN: ld.lld a.o -o a
+# RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC
+# RUN: ld.lld a.o s.so -o as
+# RUN: llvm-objdump -dR as | FileCheck %s
+
+# RUN: ld.lld -pie a.o s.so -o as.pie
+# RUN: llvm-objdump -dR as.pie | FileCheck %s
+
+# RUN: ld.lld -shared a.o -o a.so
+# RUN: llvm-objdump -dR a.so | FileCheck %s
+
+# NORELOC:    no relocation
+
+# CHECK:      TYPE                     VALUE
+# CHECK-NEXT: R_X86_64_GLOB_DAT        foo{{$}}
+# CHECK-NEXT: R_X86_64_JUMP_SLOT       foo{{$}}
+# CHECK-EMPTY:
+# CHECK:      <_start>:
+# CHECK-NEXT:   movq {{.*}}(%rip), %rax
+# CHECK-NEXT:   callq {{.*}} <foo@plt>
+
+#--- a.s
+.weak foo
+
+.globl _start
+_start:
+mov foo@gotpcrel(%rip), %rax
+call foo
diff --git a/lld/test/ELF/weak-undef.s b/lld/test/ELF/weak-undef.s
deleted file mode 100644
index 21488023a79e..000000000000
--- a/lld/test/ELF/weak-undef.s
+++ /dev/null
@@ -1,31 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
-# RUN: ld.lld %t.o -o %t --export-dynamic
-# RUN: llvm-readelf -r --dyn-syms --hex-dump=.data %t | \
-# RUN:   FileCheck %s --check-prefixes=NORELOC,COMMON
-
-# NORELOC: There are no relocations in this file.
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/dummy-shared.s -o %t1.o
-# RUN: ld.lld %t1.o -shared -o %t1.so
-# RUN: ld.lld %t.o -o %t %t1.so -pie
-# RUN: llvm-readelf -r --dyn-syms --hex-dump=.data %t | \
-# RUN:   FileCheck %s --check-prefixes=RELOC,COMMON
-
-# RELOC:      Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries:
-# RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend
-# RELOC-NEXT: {{.*}} 0000000100000001 R_X86_64_64 0000000000000000 foo + 0
-
-# NORELOC-NOT: Symbol table '.dynsym'
-# RELOC:       Symbol table '.dynsym' contains 2 entries:
-# RELOC-NEXT:  Num: Value Size Type Bind Vis Ndx Name
-# RELOC-NEXT:  0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
-# RELOC-NEXT:  1: 0000000000000000 0 NOTYPE WEAK DEFAULT UND foo
-# COMMON:      Hex dump of section '.data':
-# COMMON-NEXT: {{.*}} 00000000 00000000 
-# COMMON-EMPTY:
-
-.weak foo
-
-.data
-  .dc.a foo

From a93e55e57ed00a55f822c64e3520c7c732b58480 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 11 Jun 2025 21:33:46 -0700
Subject: [PATCH 0090/1322] Revert "[libc] Migrate stdio tests to
 ErrnoCheckingTest." (#143829)

Reverts llvm/llvm-project#143802. Follow-up fix
3c7af175e51c3ab08ac3c442146c2b822f38c01e wasn't robust enough and itself
got reverted.
---
 libc/test/src/stdio/CMakeLists.txt           | 10 ----------
 libc/test/src/stdio/fdopen_test.cpp          | 14 ++++++++------
 libc/test/src/stdio/fgetc_test.cpp           |  5 +++--
 libc/test/src/stdio/fgetc_unlocked_test.cpp  |  5 +++--
 libc/test/src/stdio/fgets_test.cpp           |  6 +++---
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++++++++++++-----
 libc/test/src/stdio/fopencookie_test.cpp     | 15 ++++++++-------
 libc/test/src/stdio/remove_test.cpp          | 10 +++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 ++++-----
 libc/test/src/stdio/setvbuf_test.cpp         |  8 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 ++++---
 libc/test/src/stdlib/StrtolTest.h            |  1 +
 libc/test/src/stdlib/strtold_test.cpp        |  1 +
 13 files changed, 59 insertions(+), 52 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 3627006ec28f..01904a30504e 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,7 +20,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -69,7 +68,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -90,7 +88,6 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -112,7 +109,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -430,7 +426,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -445,7 +440,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -462,7 +456,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -483,7 +476,6 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -506,7 +498,6 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -524,7 +515,6 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index b53184c30be3..104fc478b100 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,21 +9,20 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -53,7 +52,8 @@ TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
+TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,7 +64,8 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
+TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -82,6 +83,7 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
+  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 7c652f666a8f..56bde5f0099a 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,12 +14,12 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -33,6 +33,7 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index f4471dd82df1..90429ecf4e82 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,12 +17,12 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -36,6 +36,7 @@ public:
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index c00a9256af52..abed3d405293 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,12 +12,11 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+#include "src/__support/libc_errno.h"
 
-TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -36,6 +35,7 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e097785832d5..e624181c795b 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,18 +17,17 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
+TEST(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -42,6 +41,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,6 +72,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -79,12 +80,15 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -99,8 +103,10 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -115,18 +121,21 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
+  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST_F(LlvmLibcFILETest, FFlush) {
+TEST(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -147,7 +156,7 @@ TEST_F(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -156,6 +165,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
+  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index bcf5e674141a..03e1ac286b64 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,7 +15,6 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -23,7 +22,6 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
-using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -90,7 +88,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -117,6 +115,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -125,7 +124,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -150,6 +149,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,6 +178,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -191,7 +192,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
+TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -222,7 +223,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
+TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 296bff1f5dc1..84984e26398c 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,17 +11,16 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
+#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -37,9 +36,10 @@ TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index 135fb98c07fb..ac494a4ecaf8 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,19 +8,18 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
+TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -41,7 +40,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
+TEST(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 4144bc1bef44..5872943c1bb4 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -14,10 +14,9 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
+TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -53,7 +52,7 @@ TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -103,5 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
+  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index e99b382d1211..5d482b70064b 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,12 +15,11 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+#include "src/__support/libc_errno.h"
 
-TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -37,6 +36,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,6 +57,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 03f0a6539c78..3eeccc5727e7 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,6 +9,7 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index eb4056dc7ba6..c2f2b9c9a11c 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 99638537cd19b84252685a3dd56535a4d54d690e Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 21:56:48 -0700
Subject: [PATCH 0091/1322] [AArch64] Fix a warning

This patch fixes:

  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp:7157:3: error:
  unannotated fall-through between switch labels
  [-Werror,-Wimplicit-fallthrough]
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ad5b90984188..af5dfd6c9b8f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7154,6 +7154,7 @@ SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   switch (CC) {
   default:
     NestReg = 0x0f; // X15
+    LLVM_FALLTHROUGH;
   case CallingConv::ARM64EC_Thunk_Native:
   case CallingConv::ARM64EC_Thunk_X64:
     // Must be kept in sync with AArch64CallingConv.td

From 02550da932913bd7c3987c68abc9060c9e5bde2c Mon Sep 17 00:00:00 2001
From: Fazlay Rabbi <106703039+mdfazlay@users.noreply.github.com>
Date: Wed, 11 Jun 2025 22:06:11 -0700
Subject: [PATCH 0092/1322] [OpenMP 60] Initial parsing/sema for
 `need_device_addr` modifier on `adjust_args` clause (#143442)

Adds initial parsing and semantic analysis for `need_device_addr`
modifier on `adjust_args` clause.
---
 clang/include/clang/Basic/Attr.td             |  1 +
 .../clang/Basic/DiagnosticParseKinds.td       |  6 ++--
 clang/include/clang/Basic/OpenMPKinds.def     |  1 +
 clang/include/clang/Sema/SemaOpenMP.h         |  1 +
 clang/lib/AST/AttrImpl.cpp                    |  6 ++++
 clang/lib/Parse/ParseOpenMP.cpp               | 28 +++++++++++++------
 clang/lib/Sema/SemaOpenMP.cpp                 |  5 ++++
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  | 11 ++++++--
 .../declare_variant_clauses_ast_print.cpp     | 26 ++++++++++-------
 .../declare_variant_clauses_messages.cpp      | 24 +++++++++++-----
 10 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 9e84462eaa66..f113cd2ba2fb 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4630,6 +4630,7 @@ def OMPDeclareVariant : InheritableAttr {
     OMPTraitInfoArgument<"TraitInfos">,
     VariadicExprArgument<"AdjustArgsNothing">,
     VariadicExprArgument<"AdjustArgsNeedDevicePtr">,
+    VariadicExprArgument<"AdjustArgsNeedDeviceAddr">,
     VariadicOMPInteropInfoArgument<"AppendArgs">,
   ];
   let AdditionalMembers = [{
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 3aa36ad59d0b..6c30da376daf 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1581,8 +1581,10 @@ def err_omp_unexpected_append_op : Error<
   "unexpected operation specified in 'append_args' clause, expected 'interop'">;
 def err_omp_unexpected_execution_modifier : Error<
   "unexpected 'execution' modifier in non-executable context">;
-def err_omp_unknown_adjust_args_op : Error<
-  "incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'">;
+def err_omp_unknown_adjust_args_op
+    : Error<
+          "incorrect 'adjust_args' type, expected 'need_device_ptr'%select{|, "
+          "'need_device_addr',}0 or 'nothing'">;
 def err_omp_declare_variant_wrong_clause : Error<
   "expected %select{'match'|'match', 'adjust_args', or 'append_args'}0 clause "
   "on 'omp declare variant' directive">;
diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
index b0de65df7e39..2b1dc1e0121b 100644
--- a/clang/include/clang/Basic/OpenMPKinds.def
+++ b/clang/include/clang/Basic/OpenMPKinds.def
@@ -214,6 +214,7 @@ OPENMP_ORIGINAL_SHARING_MODIFIER(default)
 // Adjust-op kinds for the 'adjust_args' clause.
 OPENMP_ADJUST_ARGS_KIND(nothing)
 OPENMP_ADJUST_ARGS_KIND(need_device_ptr)
+OPENMP_ADJUST_ARGS_KIND(need_device_addr)
 
 // Binding kinds for the 'bind' clause.
 OPENMP_BIND_KIND(teams)
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 6498390fe96f..be6bec206878 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -849,6 +849,7 @@ public:
       FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI,
       ArrayRef<Expr *> AdjustArgsNothing,
       ArrayRef<Expr *> AdjustArgsNeedDevicePtr,
+      ArrayRef<Expr *> AdjustArgsNeedDeviceAddr,
       ArrayRef<OMPInteropInfo> AppendArgs, SourceLocation AdjustArgsLoc,
       SourceLocation AppendArgsLoc, SourceRange SR);
 
diff --git a/clang/lib/AST/AttrImpl.cpp b/clang/lib/AST/AttrImpl.cpp
index fefb8f55a9ee..5875a925d3fb 100644
--- a/clang/lib/AST/AttrImpl.cpp
+++ b/clang/lib/AST/AttrImpl.cpp
@@ -224,6 +224,12 @@ void OMPDeclareVariantAttr::printPrettyPragma(
     PrintExprs(adjustArgsNeedDevicePtr_begin(), adjustArgsNeedDevicePtr_end());
     OS << ")";
   }
+  if (adjustArgsNeedDeviceAddr_size()) {
+    OS << " adjust_args(need_device_addr:";
+    PrintExprs(adjustArgsNeedDeviceAddr_begin(),
+               adjustArgsNeedDeviceAddr_end());
+    OS << ")";
+  }
 
   auto PrintInteropInfo = [&OS](OMPInteropInfo *Begin, OMPInteropInfo *End) {
     for (OMPInteropInfo *I = Begin; I != End; ++I) {
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index e41e5ba8596b..b69c3abe0b32 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -1483,6 +1483,7 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr,
   OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo();
   SmallVector<Expr *, 6> AdjustNothing;
   SmallVector<Expr *, 6> AdjustNeedDevicePtr;
+  SmallVector<Expr *, 6> AdjustNeedDeviceAddr;
   SmallVector<OMPInteropInfo, 3> AppendArgs;
   SourceLocation AdjustArgsLoc, AppendArgsLoc;
 
@@ -1515,11 +1516,21 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr,
         SmallVector<Expr *> Vars;
         IsError = ParseOpenMPVarList(OMPD_declare_variant, OMPC_adjust_args,
                                      Vars, Data);
-        if (!IsError)
-          llvm::append_range(Data.ExtraModifier == OMPC_ADJUST_ARGS_nothing
-                                 ? AdjustNothing
-                                 : AdjustNeedDevicePtr,
-                             Vars);
+        if (!IsError) {
+          switch (Data.ExtraModifier) {
+          case OMPC_ADJUST_ARGS_nothing:
+            llvm::append_range(AdjustNothing, Vars);
+            break;
+          case OMPC_ADJUST_ARGS_need_device_ptr:
+            llvm::append_range(AdjustNeedDevicePtr, Vars);
+            break;
+          case OMPC_ADJUST_ARGS_need_device_addr:
+            llvm::append_range(AdjustNeedDeviceAddr, Vars);
+            break;
+          default:
+            llvm_unreachable("Unexpected 'adjust_args' clause modifier.");
+          }
+        }
         break;
       }
       case OMPC_append_args:
@@ -1559,8 +1570,8 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr,
   if (DeclVarData && !TI.Sets.empty())
     Actions.OpenMP().ActOnOpenMPDeclareVariantDirective(
         DeclVarData->first, DeclVarData->second, TI, AdjustNothing,
-        AdjustNeedDevicePtr, AppendArgs, AdjustArgsLoc, AppendArgsLoc,
-        SourceRange(Loc, Tok.getLocation()));
+        AdjustNeedDevicePtr, AdjustNeedDeviceAddr, AppendArgs, AdjustArgsLoc,
+        AppendArgsLoc, SourceRange(Loc, Tok.getLocation()));
 
   // Skip the last annot_pragma_openmp_end.
   (void)ConsumeAnnotationToken();
@@ -4818,7 +4829,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
         getLangOpts());
     Data.ExtraModifierLoc = Tok.getLocation();
     if (Data.ExtraModifier == OMPC_ADJUST_ARGS_unknown) {
-      Diag(Tok, diag::err_omp_unknown_adjust_args_op);
+      Diag(Tok, diag::err_omp_unknown_adjust_args_op)
+          << (getLangOpts().OpenMP >= 60 ? 1 : 0);
       SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch);
     } else {
       ConsumeToken();
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 2cbe79c5c07c..d928b7ae2b4c 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -7122,6 +7122,7 @@ void SemaOpenMP::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
       getASTContext(), VariantFuncRef, DVScope.TI,
       /*NothingArgs=*/nullptr, /*NothingArgsSize=*/0,
       /*NeedDevicePtrArgs=*/nullptr, /*NeedDevicePtrArgsSize=*/0,
+      /*NeedDeviceAddrArgs=*/nullptr, /*NeedDeviceAddrArgsSize=*/0,
       /*AppendArgs=*/nullptr, /*AppendArgsSize=*/0);
   for (FunctionDecl *BaseFD : Bases)
     BaseFD->addAttr(OMPDeclareVariantA);
@@ -7553,6 +7554,7 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective(
     FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI,
     ArrayRef<Expr *> AdjustArgsNothing,
     ArrayRef<Expr *> AdjustArgsNeedDevicePtr,
+    ArrayRef<Expr *> AdjustArgsNeedDeviceAddr,
     ArrayRef<OMPInteropInfo> AppendArgs, SourceLocation AdjustArgsLoc,
     SourceLocation AppendArgsLoc, SourceRange SR) {
 
@@ -7564,6 +7566,7 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective(
   SmallVector<Expr *, 8> AllAdjustArgs;
   llvm::append_range(AllAdjustArgs, AdjustArgsNothing);
   llvm::append_range(AllAdjustArgs, AdjustArgsNeedDevicePtr);
+  llvm::append_range(AllAdjustArgs, AdjustArgsNeedDeviceAddr);
 
   if (!AllAdjustArgs.empty() || !AppendArgs.empty()) {
     VariantMatchInfo VMI;
@@ -7614,6 +7617,8 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective(
       const_cast<Expr **>(AdjustArgsNothing.data()), AdjustArgsNothing.size(),
       const_cast<Expr **>(AdjustArgsNeedDevicePtr.data()),
       AdjustArgsNeedDevicePtr.size(),
+      const_cast<Expr **>(AdjustArgsNeedDeviceAddr.data()),
+      AdjustArgsNeedDeviceAddr.size(),
       const_cast<OMPInteropInfo *>(AppendArgs.data()), AppendArgs.size(), SR);
   FD->addAttr(NewAttr);
 }
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 57271415f838..a25bfd1c48de 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -527,6 +527,7 @@ static void instantiateOMPDeclareVariantAttr(
 
   SmallVector<Expr *, 8> NothingExprs;
   SmallVector<Expr *, 8> NeedDevicePtrExprs;
+  SmallVector<Expr *, 8> NeedDeviceAddrExprs;
   SmallVector<OMPInteropInfo, 4> AppendArgs;
 
   for (Expr *E : Attr.adjustArgsNothing()) {
@@ -541,14 +542,20 @@ static void instantiateOMPDeclareVariantAttr(
       continue;
     NeedDevicePtrExprs.push_back(ER.get());
   }
+  for (Expr *E : Attr.adjustArgsNeedDeviceAddr()) {
+    ExprResult ER = Subst(E);
+    if (ER.isInvalid())
+      continue;
+    NeedDeviceAddrExprs.push_back(ER.get());
+  }
   for (OMPInteropInfo &II : Attr.appendArgs()) {
     // When prefer_type is implemented for append_args handle them here too.
     AppendArgs.emplace_back(II.IsTarget, II.IsTargetSync);
   }
 
   S.OpenMP().ActOnOpenMPDeclareVariantDirective(
-      FD, E, TI, NothingExprs, NeedDevicePtrExprs, AppendArgs, SourceLocation(),
-      SourceLocation(), Attr.getRange());
+      FD, E, TI, NothingExprs, NeedDevicePtrExprs, NeedDeviceAddrExprs,
+      AppendArgs, SourceLocation(), SourceLocation(), Attr.getRange());
 }
 
 static void instantiateDependentAMDGPUFlatWorkGroupSizeAttr(
diff --git a/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp b/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp
index 172dd1670421..c14e19cc8b7e 100644
--- a/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp
+++ b/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp
@@ -54,9 +54,9 @@ void foo_v3(float *AAA, float *BBB, int *I) {return;}
 //DUMP: DeclRefExpr{{.*}}Function{{.*}}foo_v1
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA'
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB'
-//PRINT: #pragma omp declare variant(foo_v3) match(construct={dispatch}, device={arch(x86, x86_64)}) adjust_args(nothing:I) adjust_args(need_device_ptr:BBB)
+//PRINT: #pragma omp declare variant(foo_v3) match(construct={dispatch}, device={arch(x86, x86_64)}) adjust_args(nothing:I) adjust_args(need_device_ptr:BBB) adjust_args(need_device_addr:AAA)
 
-//PRINT: #pragma omp declare variant(foo_v2) match(construct={dispatch}, device={arch(ppc)}) adjust_args(need_device_ptr:AAA)
+//PRINT: #pragma omp declare variant(foo_v2) match(construct={dispatch}, device={arch(ppc)}) adjust_args(need_device_ptr:AAA) adjust_args(need_device_addr:BBB)
 
 //PRINT: omp declare variant(foo_v1) match(construct={dispatch}, device={arch(arm)}) adjust_args(need_device_ptr:AAA,BBB)
 
@@ -66,42 +66,48 @@ void foo_v3(float *AAA, float *BBB, int *I) {return;}
 
 #pragma omp declare variant(foo_v2)                        \
    match(construct={dispatch}, device={arch(ppc)}),        \
-   adjust_args(need_device_ptr:AAA)
+   adjust_args(need_device_ptr:AAA)                        \
+   adjust_args(need_device_addr:BBB)
 
 #pragma omp declare variant(foo_v3)                        \
    adjust_args(need_device_ptr:BBB) adjust_args(nothing:I) \
+   adjust_args(need_device_addr:AAA)                      \
    match(construct={dispatch}, device={arch(x86,x86_64)})
 
 void foo(float *AAA, float *BBB, int *I) {return;}
 
-void Foo_Var(float *AAA, float *BBB) {return;}
+void Foo_Var(float *AAA, float *BBB, float *CCC) {return;}
 
 #pragma omp declare variant(Foo_Var) \
    match(construct={dispatch}, device={arch(x86_64)}) \
-   adjust_args(need_device_ptr:AAA) adjust_args(nothing:BBB)
+   adjust_args(need_device_ptr:AAA) adjust_args(nothing:BBB) \
+   adjust_args(need_device_addr:CCC)
 template<typename T>
-void Foo(T *AAA, T *BBB) {return;}
+void Foo(T *AAA, T *BBB, T *CCC) {return;}
 
-//PRINT: #pragma omp declare variant(Foo_Var) match(construct={dispatch}, device={arch(x86_64)}) adjust_args(nothing:BBB) adjust_args(need_device_ptr:AAA)
-//DUMP: FunctionDecl{{.*}} Foo 'void (T *, T *)'
+//PRINT: #pragma omp declare variant(Foo_Var) match(construct={dispatch}, device={arch(x86_64)}) adjust_args(nothing:BBB) adjust_args(need_device_ptr:AAA) adjust_args(need_device_addr:CCC)
+//DUMP: FunctionDecl{{.*}} Foo 'void (T *, T *, T *)'
 //DUMP: OMPDeclareVariantAttr{{.*}}device={arch(x86_64)}
 //DUMP: DeclRefExpr{{.*}}Function{{.*}}Foo_Var
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB'
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA'
+//DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'CCC'
 //
-//DUMP: FunctionDecl{{.*}} Foo 'void (float *, float *)'
+//DUMP: FunctionDecl{{.*}} Foo 'void (float *, float *, float *)'
 //DUMP: OMPDeclareVariantAttr{{.*}}device={arch(x86_64)}
 //DUMP: DeclRefExpr{{.*}}Function{{.*}}Foo_Var
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB'
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA'
+//DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'CCC'
 
 void func()
 {
   float *A;
   float *B;
+  float *C;
 
   //#pragma omp dispatch
-  Foo(A, B);
+  Foo(A, B, C);
 }
 
 typedef void *omp_interop_t;
diff --git a/clang/test/OpenMP/declare_variant_clauses_messages.cpp b/clang/test/OpenMP/declare_variant_clauses_messages.cpp
index 284e49bbd21b..aadded7699ea 100644
--- a/clang/test/OpenMP/declare_variant_clauses_messages.cpp
+++ b/clang/test/OpenMP/declare_variant_clauses_messages.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 -o - %s
-// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 \
+// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 \
 // RUN:  -DNO_INTEROP_T_DEF -o - %s
-// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 -o - %s
-// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -Wno-strict-prototypes -DC -x c -o - %s
+// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -Wno-strict-prototypes -DC -x c -o - %s
 // RUN: %clang_cc1 -verify -triple x86_64-pc-windows-msvc -fms-compatibility \
-// RUN:  -fopenmp -Wno-strict-prototypes -DC -DWIN -x c -o - %s
+// RUN:  -fopenmp -fopenmp-version=60 -Wno-strict-prototypes -DC -DWIN -x c -o - %s
 
 #ifdef NO_INTEROP_T_DEF
 void foo_v1(float *, void *);
@@ -114,6 +114,16 @@ void vararg_bar2(const char *fmt) { return; }
    match(construct={dispatch}, device={arch(ppc)}),          \
    adjust_args(need_device_ptr:AAA) adjust_args(nothing:AAA)
 
+// expected-error@+3 {{'adjust_arg' argument 'AAA' used in multiple clauses}}
+#pragma omp declare variant(foo_v1)                          \
+   match(construct={dispatch}, device={arch(arm)})           \
+   adjust_args(need_device_ptr:AAA,BBB) adjust_args(need_device_addr:AAA)
+
+// expected-error@+3 {{'adjust_arg' argument 'AAA' used in multiple clauses}}
+#pragma omp declare variant(foo_v1)                          \
+   match(construct={dispatch}, device={arch(ppc)}),          \
+   adjust_args(need_device_addr:AAA) adjust_args(nothing:AAA)
+
 // expected-error@+2 {{use of undeclared identifier 'J'}}
 #pragma omp declare variant(foo_v1)                          \
    adjust_args(nothing:J)                                    \
@@ -186,12 +196,12 @@ void vararg_bar2(const char *fmt) { return; }
 // expected-error@+1 {{variant in '#pragma omp declare variant' with type 'void (float *, float *, int *, omp_interop_t)' (aka 'void (float *, float *, int *, void *)') is incompatible with type 'void (float *, float *, int *)'}}
 #pragma omp declare variant(foo_v4) match(construct={dispatch})
 
-// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}}
+// expected-error@+3 {{incorrect 'adjust_args' type, expected 'need_device_ptr', 'need_device_addr', or 'nothing'}}
 #pragma omp declare variant(foo_v1)                        \
    match(construct={dispatch}, device={arch(arm)})         \
    adjust_args(badaaop:AAA,BBB)
 
-// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}}
+// expected-error@+3 {{incorrect 'adjust_args' type, expected 'need_device_ptr', 'need_device_addr', or 'nothing'}}
 #pragma omp declare variant(foo_v1)                        \
    match(construct={dispatch}, device={arch(arm)})         \
    adjust_args(badaaop AAA,BBB)

From 28bda778437fea17a25b561f1b3b84545612b565 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 11 Jun 2025 22:19:31 -0700
Subject: [PATCH 0093/1322] Introduce MCAsmInfo::UsesSetToEquateSymbol and
 prefer = to .set

Introduce MCAsmInfo::UsesSetToEquateSymbol to control the preferred
syntax for symbol equating. We now favor the more readable and common
`symbol = expression` syntax over `.set`. This aligns with pre- https://reviews.llvm.org/D44256 behavior.

On Apple platforms, this resolves a clang -S vs -c behavior difference (resolves #104623).

For targets whose = support is unconfirmed, UsesSetToEquateSymbol is set to false.
This also minimizes test updates.

Pull Request: https://github.com/llvm/llvm-project/pull/142289
---
 clang/test/CodeGen/alias.c                    |  6 +--
 llvm/include/llvm/MC/MCAsmInfo.h              |  4 ++
 llvm/lib/MC/MCAsmStreamer.cpp                 |  6 ++-
 .../AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp   |  1 +
 .../Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp |  1 +
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp     |  2 +
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp |  1 +
 llvm/test/CodeGen/AArch64/arm64ec-alias.ll    | 14 +++---
 .../AArch64/arm64ec-hybrid-patchable.ll       | 18 +++----
 llvm/test/CodeGen/AArch64/arm64ec-symbols.ll  |  6 +--
 llvm/test/CodeGen/AArch64/arm64ec-varargs.ll  | 16 +++---
 llvm/test/CodeGen/AArch64/ehcontguard.ll      |  2 +-
 llvm/test/CodeGen/AArch64/global-merge-1.ll   |  8 +--
 llvm/test/CodeGen/AArch64/global-merge-2.ll   | 12 ++---
 llvm/test/CodeGen/AArch64/global-merge-3.ll   | 10 ++--
 .../AArch64/global-merge-hidden-minsize.ll    |  4 +-
 llvm/test/CodeGen/AArch64/ifunc-asm.ll        |  2 +-
 llvm/test/CodeGen/AArch64/seh-finally.ll      |  8 +--
 .../CodeGen/AArch64/stackguard-internal.ll    |  2 +-
 llvm/test/CodeGen/ARM/alias_store.ll          |  2 +-
 llvm/test/CodeGen/ARM/aliases.ll              | 14 +++---
 .../CodeGen/ARM/global-merge-dllexport.ll     |  4 +-
 .../CodeGen/ARM/global-merge-external-2.ll    | 12 ++---
 .../test/CodeGen/ARM/global-merge-external.ll | 12 ++---
 llvm/test/CodeGen/AVR/global-aliases.ll       | 28 +++++------
 llvm/test/CodeGen/Mips/hf16call32_body.ll     | 24 ++++-----
 llvm/test/CodeGen/Mips/mips16ex.ll            |  2 +-
 .../PowerPC/asm-printer-topological-order.ll  |  6 +--
 llvm/test/CodeGen/PowerPC/data-align.ll       | 10 ++--
 llvm/test/CodeGen/WebAssembly/aliases.ll      | 22 ++++----
 llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll |  2 +-
 llvm/test/CodeGen/WinCFGuard/cfguard.ll       |  2 +-
 .../CodeGen/X86/2007-09-06-ExtWeakAliasee.ll  |  2 +-
 llvm/test/CodeGen/X86/2009-08-12-badswitch.ll | 50 +++++++++----------
 .../CodeGen/X86/2010-05-26-DotDebugLoc.ll     |  8 +--
 llvm/test/CodeGen/X86/alias-gep.ll            |  8 +--
 llvm/test/CodeGen/X86/aliases.ll              |  8 +--
 .../CodeGen/X86/catchret-empty-fallthrough.ll |  2 +-
 llvm/test/CodeGen/X86/coff-alias-type.ll      |  2 +-
 llvm/test/CodeGen/X86/coff-comdat.ll          |  2 +-
 llvm/test/CodeGen/X86/coff-feat00.ll          |  2 +-
 llvm/test/CodeGen/X86/dllexport-x86_64.ll     | 10 ++--
 llvm/test/CodeGen/X86/dllexport.ll            |  8 +--
 llvm/test/CodeGen/X86/ehcontguard.ll          |  2 +-
 .../CodeGen/X86/fastcall-correct-mangling.ll  |  4 +-
 llvm/test/CodeGen/X86/ifunc-asm.ll            |  2 +-
 .../test/CodeGen/X86/lea-opt-memop-check-1.ll |  6 +--
 llvm/test/CodeGen/X86/linux-preemption.ll     | 16 +++---
 llvm/test/CodeGen/X86/localescape.ll          | 16 +++---
 llvm/test/CodeGen/X86/pr22019.ll              |  8 +--
 llvm/test/CodeGen/X86/seh-catch-all-win32.ll  |  4 +-
 llvm/test/CodeGen/X86/seh-catchpad.ll         |  2 +-
 llvm/test/CodeGen/X86/seh-finally.ll          |  2 +-
 llvm/test/CodeGen/X86/seh-no-invokes.ll       |  2 +-
 llvm/test/CodeGen/X86/seh-stack-realign.ll    |  4 +-
 llvm/test/CodeGen/X86/tailcall-cgp-dup.ll     | 12 ++---
 .../X86/windows-seh-EHa-TryInFinally.ll       |  2 +-
 llvm/test/CodeGen/XCore/globals.ll            |  2 +-
 llvm/test/CodeGen/XCore/linkage.ll            |  4 +-
 llvm/test/DebugInfo/X86/dbg-value-range.ll    |  4 +-
 .../X86/stmt-list-multiple-compile-units.ll   |  4 +-
 llvm/test/MC/AArch64/basic-a64-instructions.s |  2 +-
 llvm/test/MC/AsmParser/assignment.s           | 12 ++---
 llvm/test/MC/AsmParser/directive_include.s    |  2 +-
 llvm/test/MC/AsmParser/directive_set.s        |  6 +--
 llvm/test/MC/AsmParser/include.ll             |  4 +-
 llvm/test/MC/AsmParser/labels.s               |  6 +--
 llvm/test/MC/AsmParser/macro-arg-darwin.s     |  4 +-
 llvm/test/MC/AsmParser/motorola_integers.s    | 16 +++---
 llvm/test/MC/Mips/cpsetup.s                   |  2 +-
 70 files changed, 263 insertions(+), 252 deletions(-)

diff --git a/clang/test/CodeGen/alias.c b/clang/test/CodeGen/alias.c
index bc4167adf53f..9403c55beae0 100644
--- a/clang/test/CodeGen/alias.c
+++ b/clang/test/CodeGen/alias.c
@@ -29,20 +29,20 @@ const int wacom_usb_ids[] = {1, 1, 2, 3, 5, 8, 13, 0};
 extern const int __mod_usb_device_table __attribute__ ((alias("wacom_usb_ids")));
 // CHECKBASIC-DAG: @__mod_usb_device_table ={{.*}} alias i32, ptr @wacom_usb_ids
 // CHECKASM-DAG: .globl __mod_usb_device_table
-// CHECKASM-DAG: .set __mod_usb_device_table, wacom_usb_ids
+// CHECKASM-DAG: __mod_usb_device_table = wacom_usb_ids
 // CHECKASM-NOT: .size __mod_usb_device_table
 
 extern int g1;
 extern int g1 __attribute((alias("g0")));
 // CHECKBASIC-DAG: @g1 ={{.*}} alias i32, ptr @g0
 // CHECKASM-DAG: .globl g1
-// CHECKASM-DAG: .set g1, g0
+// CHECKASM-DAG: g1 = g0
 // CHECKASM-NOT: .size g1
 
 extern __thread int __libc_errno __attribute__ ((alias ("TL_WITH_ALIAS")));
 // CHECKBASIC-DAG: @__libc_errno ={{.*}} thread_local alias i32, ptr @TL_WITH_ALIAS
 // CHECKASM-DAG: .globl __libc_errno
-// CHECKASM-DAG: .set __libc_errno, TL_WITH_ALIAS
+// CHECKASM-DAG: __libc_errno = TL_WITH_ALIAS
 // CHECKASM-NOT: .size __libc_errno
 
 void f0(void) { }
diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 4eb50344d638..e98cd17a9df5 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -141,6 +141,9 @@ protected:
   /// This is appended to emitted labels.  Defaults to ":"
   const char *LabelSuffix;
 
+  /// Use .set instead of = to equate a symbol to an expression.
+  bool UsesSetToEquateSymbol = false;
+
   // Print the EH begin symbol with an assignment. Defaults to false.
   bool UseAssignmentForEHBegin = false;
 
@@ -525,6 +528,7 @@ public:
   bool shouldAllowAdditionalComments() const { return AllowAdditionalComments; }
   const char *getLabelSuffix() const { return LabelSuffix; }
 
+  bool usesSetToEquateSymbol() const { return UsesSetToEquateSymbol; }
   bool useAssignmentForEHBegin() const { return UseAssignmentForEHBegin; }
   bool needsLocalForSize() const { return NeedsLocalForSize; }
   StringRef getPrivateGlobalPrefix() const { return PrivateGlobalPrefix; }
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index da0d99e70d9e..4380f74318e7 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -695,9 +695,11 @@ void MCAsmStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
     if (E->inlineAssignedExpr())
       EmitSet = false;
   if (EmitSet) {
-    OS << ".set ";
+    bool UseSet = MAI->usesSetToEquateSymbol();
+    if (UseSet)
+      OS << ".set ";
     Symbol->print(OS, MAI);
-    OS << ", ";
+    OS << (UseSet ? ", " : " = ");
     Value->print(OS, MAI);
 
     EmitEOL();
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 6f1d89e500ed..fcf134aa8658 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -42,6 +42,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
   CommentString = ";";
   InlineAsmStart = ";#ASMSTART";
   InlineAsmEnd = ";#ASMEND";
+  UsesSetToEquateSymbol = true;
 
   //===--- Data Emission Directives -------------------------------------===//
   UsesELFSectionDirectiveForBSS = true;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 7675b05f106a..ba8faaeb74a0 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -38,6 +38,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
   LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
   InlineAsmStart = "# InlineAsm Start";
   InlineAsmEnd = "# InlineAsm End";
+  UsesSetToEquateSymbol = true;
   ZeroDirective = "\t.space\t";
   AscizDirective = "\t.string\t";
 
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 160ee07fad5c..b5be23c5a96a 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -155,5 +155,7 @@ PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
   // Support $ as PC in inline asm
   DollarIsPC = true;
 
+  UsesSetToEquateSymbol = true;
+
   initializeVariantKinds(variantKindDescs);
 }
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 27272cdbbd23..e9d387399bf3 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -49,6 +49,7 @@ SystemZMCAsmInfoGOFF::SystemZMCAsmInfoGOFF(const Triple &TT) {
   CalleeSaveStackSlotSize = 8;
   CodePointerSize = 8;
   CommentString = "*";
+  UsesSetToEquateSymbol = true;
   ExceptionsType = ExceptionHandling::ZOS;
   IsHLASM = true;
   IsLittleEndian = false;
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-alias.ll b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll
index 03cc87313694..18023a95a5d2 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-alias.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll
@@ -13,30 +13,30 @@ define dso_local void @patchable_func() hybrid_patchable {
 @patchable_alias = alias void (), ptr @patchable_func
 
 ; CHECK:              .weak_anti_dep  func_alias
-; CHECK-NEXT: .set func_alias, "#func_alias"
+; CHECK-NEXT: func_alias = "#func_alias"
 ; CHECK-NEXT:         .weak_anti_dep  func_alias2
-; CHECK-NEXT: .set func_alias2, "#func_alias2"
+; CHECK-NEXT: func_alias2 = "#func_alias2"
 ; CHECK-NEXT:         .weak_anti_dep  func
-; CHECK-NEXT: .set func, "#func"
+; CHECK-NEXT: func = "#func"
 ; CHECK:              .weak_anti_dep  patchable_alias
-; CHECK-NEXT: .set patchable_alias, "#patchable_alias"
+; CHECK-NEXT: patchable_alias = "#patchable_alias"
 
 ; CHECK:              .globl  "#func_alias"
 ; CHECK-NEXT:         .def    "#func_alias";
 ; CHECK-NEXT:         .scl    2;
 ; CHECK-NEXT:         .type   32;
 ; CHECK-NEXT:         .endef
-; CHECK-NEXT: .set "#func_alias", "#func"
+; CHECK-NEXT: "#func_alias" = "#func"
 ; CHECK-NEXT:         .globl  "#func_alias2"
 ; CHECK-NEXT:         .def    "#func_alias2";
 ; CHECK-NEXT:         .scl    2;
 ; CHECK-NEXT:         .type   32;
 ; CHECK-NEXT:         .endef
-; CHECK-NEXT: .set "#func_alias2", "#func_alias"
+; CHECK-NEXT: "#func_alias2" = "#func_alias"
 
 ; CHECK:              .globl  "#patchable_alias"
 ; CHECK-NEXT:         .def    "#patchable_alias";
 ; CHECK-NEXT:         .scl    2;
 ; CHECK-NEXT:         .type   32;
 ; CHECK-NEXT:         .endef
-; CHECK-NEXT: .set "#patchable_alias", "#patchable_func"
+; CHECK-NEXT: "#patchable_alias" = "#patchable_func"
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
index f964484c0c2d..7c77832a9d9a 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
@@ -76,7 +76,7 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .p2align        2
 ; CHECK-NEXT:  "#caller":                              // @"#caller"
 ; CHECK-NEXT:      .weak_anti_dep  caller
-; CHECK-NEXT:  .set caller, "#caller"{{$}}
+; CHECK-NEXT:  caller = "#caller"{{$}}
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:      str     x30, [sp, #-16]!                // 8-byte Folded Spill
 ; CHECK-NEXT:      bl      "#func"
@@ -253,13 +253,13 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .weak  func
-; CHECK-NEXT:  .set func, "EXP+#func"{{$}}
+; CHECK-NEXT:  func = "EXP+#func"{{$}}
 ; CHECK-NEXT:      .weak  "#func"
 ; CHECK-NEXT:      .def    "#func";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
-; CHECK-NEXT:  .set "#func", "#func$hybpatch_thunk"{{$}}
+; CHECK-NEXT:  "#func" = "#func$hybpatch_thunk"{{$}}
 ; CHECK-NEXT:      .def    "EXP+#has_varargs";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -269,13 +269,13 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .weak   has_varargs
-; CHECK-NEXT:  .set has_varargs, "EXP+#has_varargs"
+; CHECK-NEXT:  has_varargs = "EXP+#has_varargs"
 ; CHECK-NEXT:      .weak   "#has_varargs"
 ; CHECK-NEXT:      .def    "#has_varargs";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
-; CHECK-NEXT:  .set "#has_varargs", "#has_varargs$hybpatch_thunk"
+; CHECK-NEXT:  "#has_varargs" = "#has_varargs$hybpatch_thunk"
 ; CHECK-NEXT:      .def    "EXP+#has_sret";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -285,13 +285,13 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .weak   has_sret
-; CHECK-NEXT:  .set has_sret, "EXP+#has_sret"
+; CHECK-NEXT:  has_sret = "EXP+#has_sret"
 ; CHECK-NEXT:      .weak   "#has_sret"
 ; CHECK-NEXT:      .def    "#has_sret";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
-; CHECK-NEXT:  .set "#has_sret", "#has_sret$hybpatch_thunk"
+; CHECK-NEXT:  "#has_sret" = "#has_sret$hybpatch_thunk"
 ; CHECK-NEXT:      .def    "EXP+#exp";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -301,13 +301,13 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .weak   exp
-; CHECK-NEXT:  .set exp, "EXP+#exp"
+; CHECK-NEXT:  exp = "EXP+#exp"
 ; CHECK-NEXT:      .weak   "#exp"
 ; CHECK-NEXT:      .def    "#exp";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
-; CHECK-NEXT:  .set "#exp", "#exp$hybpatch_thunk"
+; CHECK-NEXT:  "#exp" = "#exp$hybpatch_thunk"
 
 ; SYM:      [53](sec 15)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #func$hybpatch_thunk
 ; SYM:      [58](sec 16)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #has_varargs$hybpatch_thunk
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll b/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll
index b79dd7d61dd6..b44f39ad7b73 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll
@@ -10,12 +10,12 @@ define void @caller() nounwind {
 }
 
 ; CHECK:      .weak_anti_dep  caller
-; CHECK-NEXT: .set caller, "#caller"{{$}}
+; CHECK-NEXT: caller = "#caller"{{$}}
 
 ; CHECK:      .weak_anti_dep  func
-; CHECK-NEXT: .set func, "#func"{{$}}
+; CHECK-NEXT: func = "#func"{{$}}
 ; CHECK-NEXT: .weak_anti_dep  "#func"
-; CHECK-NEXT: .set "#func", "#func$exit_thunk"{{$}}
+; CHECK-NEXT: "#func" = "#func$exit_thunk"{{$}}
 
 ; SYM:       [ 8](sec  4)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #caller
 ; SYM:       [21](sec  7)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #func$exit_thunk
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
index 5fab5738078d..389969bebaea 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
@@ -45,9 +45,9 @@ define void @varargs_caller() nounwind {
 ; CHECK-NEXT:    stp x9, x8, [sp]
 ; CHECK-NEXT:    str xzr, [sp, #16]
 ; CHECK-NEXT:    .weak_anti_dep varargs_callee
-; CHECK-NEXT:  .set varargs_callee, "#varargs_callee"
+; CHECK-NEXT:  varargs_callee = "#varargs_callee"
 ; CHECK-NEXT:    .weak_anti_dep "#varargs_callee"
-; CHECK-NEXT:  .set "#varargs_callee", varargs_callee
+; CHECK-NEXT:  "#varargs_callee" = varargs_callee
 ; CHECK-NEXT:    bl "#varargs_callee"
 ; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
@@ -86,9 +86,9 @@ define void @varargs_many_argscalleer() nounwind {
 ; CHECK-NEXT:    stp x9, x8, [sp]
 ; CHECK-NEXT:    stp q0, q0, [sp, #16]
 ; CHECK-NEXT:    .weak_anti_dep varargs_many_argscallee
-; CHECK-NEXT:  .set varargs_many_argscallee, "#varargs_many_argscallee"
+; CHECK-NEXT:  varargs_many_argscallee = "#varargs_many_argscallee"
 ; CHECK-NEXT:    .weak_anti_dep "#varargs_many_argscallee"
-; CHECK-NEXT:  .set "#varargs_many_argscallee", varargs_many_argscallee
+; CHECK-NEXT:  "#varargs_many_argscallee" = varargs_many_argscallee
 ; CHECK-NEXT:    bl "#varargs_many_argscallee"
 ; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #64
@@ -116,9 +116,9 @@ define void @varargs_caller_tail() nounwind {
 ; CHECK-NEXT:    stp x9, x8, [sp]
 ; CHECK-NEXT:    str xzr, [sp, #16]
 ; CHECK-NEXT:    .weak_anti_dep varargs_callee
-; CHECK-NEXT:  .set varargs_callee, "#varargs_callee"
+; CHECK-NEXT:  varargs_callee = "#varargs_callee"
 ; CHECK-NEXT:    .weak_anti_dep "#varargs_callee"
-; CHECK-NEXT:  .set "#varargs_callee", varargs_callee
+; CHECK-NEXT:  "#varargs_callee" = varargs_callee
 ; CHECK-NEXT:    bl "#varargs_callee"
 ; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    add x4, sp, #48
@@ -129,9 +129,9 @@ define void @varargs_caller_tail() nounwind {
 ; CHECK-NEXT:    mov x5, xzr
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    .weak_anti_dep varargs_callee
-; CHECK-NEXT:  .set varargs_callee, "#varargs_callee"
+; CHECK-NEXT:  varargs_callee = "#varargs_callee"
 ; CHECK-NEXT:    .weak_anti_dep "#varargs_callee"
-; CHECK-NEXT:  .set "#varargs_callee", varargs_callee
+; CHECK-NEXT:  "#varargs_callee" = varargs_callee
 ; CHECK-NEXT:    b "#varargs_callee"
   call void (double, ...) @varargs_callee(double 1.0, i32 2, double 3.0, i32 4, double 5.0, <2 x double> <double 0.0, double 0.0>)
   tail call void (double, ...) @varargs_callee(double 1.0, i32 4, i32 3, i32 2)
diff --git a/llvm/test/CodeGen/AArch64/ehcontguard.ll b/llvm/test/CodeGen/AArch64/ehcontguard.ll
index eecff391d0f8..cb603a482d22 100644
--- a/llvm/test/CodeGen/AArch64/ehcontguard.ll
+++ b/llvm/test/CodeGen/AArch64/ehcontguard.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=aarch64-windows | FileCheck %s
 ; EHCont Guard is currently only available on Windows
 
-; CHECK: .set "@feat.00", 16384
+; CHECK: "@feat.00" = 16384
 
 ; CHECK: .section .gehcont$y
 
diff --git a/llvm/test/CodeGen/AArch64/global-merge-1.ll b/llvm/test/CodeGen/AArch64/global-merge-1.ll
index cc17e344c211..626310fc4ec2 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-1.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-1.ll
@@ -23,9 +23,9 @@ define void @f1(i32 %a1, i32 %a2) {
 ;CHECK:	.type	.L_MergedGlobals,@object  // @_MergedGlobals
 ;CHECK:	.local	.L_MergedGlobals
 ;CHECK:	.comm	.L_MergedGlobals,8,4
-;CHECK: .set m, .L_MergedGlobals
-;CHECK: .set n, .L_MergedGlobals+4
+;CHECK: m = .L_MergedGlobals
+;CHECK: n = .L_MergedGlobals+4
 
 ;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,2 ; @_MergedGlobals
-;CHECK-APPLE-IOS-NOT: .set _m, l__MergedGlobals
-;CHECK-APPLE-IOS-NOT: .set _n, l__MergedGlobals+4
+;CHECK-APPLE-IOS-NOT: _m = l__MergedGlobals
+;CHECK-APPLE-IOS-NOT: _n = l__MergedGlobals+4
diff --git a/llvm/test/CodeGen/AArch64/global-merge-2.ll b/llvm/test/CodeGen/AArch64/global-merge-2.ll
index 85d814c3177b..1b5333b907d2 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-2.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-2.ll
@@ -32,21 +32,21 @@ define dso_local void @g1(i32 %a1, i32 %a2) {
 ;CHECK:	.comm	.L_MergedGlobals,12,4
 
 ;CHECK:	.globl	x
-;CHECK: .set x, .L_MergedGlobals
+;CHECK: x = .L_MergedGlobals
 ;CHECK: .size x, 4
 ;CHECK:	.globl	y
-;CHECK: .set y, .L_MergedGlobals+4
+;CHECK: y = .L_MergedGlobals+4
 ;CHECK: .size y, 4
 ;CHECK:	.globl	z
-;CHECK: .set z, .L_MergedGlobals+8
+;CHECK: z = .L_MergedGlobals+8
 ;CHECK: .size z, 4
 
 ;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,12,2
 
 ;CHECK-APPLE-IOS: .globl	_x
-;CHECK-APPLE-IOS: .set {{.*}}, __MergedGlobals_x
+;CHECK-APPLE-IOS: {{.*}} = __MergedGlobals_x
 ;CHECK-APPLE-IOS: .globl	_y
-;CHECK-APPLE-IOS: .set _y, __MergedGlobals_x+4
+;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4
 ;CHECK-APPLE-IOS: .globl	_z
-;CHECK-APPLE-IOS: .set _z, __MergedGlobals_x+8
+;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8
 ;CHECK-APPLE-IOS: .subsections_via_symbols
diff --git a/llvm/test/CodeGen/AArch64/global-merge-3.ll b/llvm/test/CodeGen/AArch64/global-merge-3.ll
index b3f58887139f..2a0ae1227455 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-3.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-3.ll
@@ -40,14 +40,14 @@ define dso_local void @f1(i32 %a1, i32 %a2, i32 %a3) {
 
 ;CHECK-APPLE-IOS: .globl  __MergedGlobals_x
 ;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,800,2
-;CHECK-APPLE-IOS: .set _x, __MergedGlobals_x
-;CHECK-APPLE-IOS: .set _y, __MergedGlobals_x+400
+;CHECK-APPLE-IOS: _x = __MergedGlobals_x
+;CHECK-APPLE-IOS: _y = __MergedGlobals_x+400
 
 ;CHECK: .type   .L_MergedGlobals,@object // @_MergedGlobals
 ;CHECK: .local  .L_MergedGlobals
 ;CHECK: .comm   .L_MergedGlobals,800,4
 ;CHECK: globl  x
-;CHECK: .set x, .L_MergedGlobals
+;CHECK: x = .L_MergedGlobals
 ;CHECK: globl  y
-;CHECK: .set y, .L_MergedGlobals+400
-;CHECK-NOT: .set z, .L_MergedGlobals
+;CHECK: y = .L_MergedGlobals+400
+;CHECK-NOT: z = .L_MergedGlobals
diff --git a/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll b/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll
index 9c694fc4d289..5292aa91fc38 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll
@@ -16,10 +16,10 @@ attributes #0 = { minsize optsize }
 
 ; CHECK: .globl x
 ; CHECK: .hidden x
-; CHECK: .set x, .L_MergedGlobals
+; CHECK: x = .L_MergedGlobals
 ; CHECK: .size x, 4
 
 ; CHECK: .globl y
 ; CHECK: .hidden y
-; CHECK: .set y, .L_MergedGlobals+4
+; CHECK: y = .L_MergedGlobals+4
 ; CHECK: .size y, 4
diff --git a/llvm/test/CodeGen/AArch64/ifunc-asm.ll b/llvm/test/CodeGen/AArch64/ifunc-asm.ll
index 57fc2f0c9d7f..7aad6cce09cf 100644
--- a/llvm/test/CodeGen/AArch64/ifunc-asm.ll
+++ b/llvm/test/CodeGen/AArch64/ifunc-asm.ll
@@ -16,7 +16,7 @@ entry:
 @global_ifunc = ifunc i32 (i32), ptr @the_resolver
 ; ELF:             .globl global_ifunc
 ; ELF-NEXT:        .type global_ifunc,@gnu_indirect_function
-; ELF-NEXT:        .set global_ifunc, the_resolver
+; ELF-NEXT:        global_ifunc = the_resolver
 
 ; MACHO:           .section __DATA,__data
 ; MACHO-NEXT:      .p2align 3, 0x0
diff --git a/llvm/test/CodeGen/AArch64/seh-finally.ll b/llvm/test/CodeGen/AArch64/seh-finally.ll
index 04a30800d929..fd6b3fd0bc1f 100644
--- a/llvm/test/CodeGen/AArch64/seh-finally.ll
+++ b/llvm/test/CodeGen/AArch64/seh-finally.ll
@@ -38,7 +38,7 @@ entry:
 ; CHECK: add     x29, sp, #16
 ; CHECK: mov     x0, #-2
 ; CHECK: stur    x0, [x29, #16]
-; CHECK: .set .Lsimple_seh$frame_escape_0, -8
+; CHECK: .Lsimple_seh$frame_escape_0 = -8
 ; CHECK: ldur    w0, [x29, #-8]
 ; CHECK: bl      foo
 
@@ -89,7 +89,7 @@ entry:
 ; CHECK: mov     x19, sp
 ; CHECK: mov     x0, #-2
 ; CHECK: stur    x0, [x29, #24]
-; CHECK: .set .Lstack_realign$frame_escape_0, 0
+; CHECK: .Lstack_realign$frame_escape_0 = 0
 ; CHECK: ldr     w0, [x19]
 ; CHECK: bl      foo
 
@@ -137,7 +137,7 @@ entry:
 ; CHECK: add     x29, sp, #32
 ; CHECK: mov     x1, #-2
 ; CHECK: stur    x1, [x29, #16]
-; CHECK: .set .Lvla_present$frame_escape_0, -4
+; CHECK: .Lvla_present$frame_escape_0 = -4
 ; CHECK: stur    w0, [x29, #-4]
 ; CHECK: ldur    w8, [x29, #-4]
 ; CHECK: mov     x9, sp
@@ -204,7 +204,7 @@ entry:
 ; CHECK: mov     x19, sp
 ; CHECK: mov     x1, #-2
 ; CHECK: stur    x1, [x29, #24]
-; CHECK: .set .Lvla_and_realign$frame_escape_0, 32
+; CHECK: .Lvla_and_realign$frame_escape_0 = 32
 ; CHECK: str     w0, [x29, #36]
 ; CHECK: ldr     w8, [x29, #36]
 ; CHECK: mov     x9, sp
diff --git a/llvm/test/CodeGen/AArch64/stackguard-internal.ll b/llvm/test/CodeGen/AArch64/stackguard-internal.ll
index a70c8874edba..7b32e8c0caab 100644
--- a/llvm/test/CodeGen/AArch64/stackguard-internal.ll
+++ b/llvm/test/CodeGen/AArch64/stackguard-internal.ll
@@ -6,7 +6,7 @@ target triple = "aarch64-linux-gnu"
 ; is an alias.  (The alias is created by GlobalMerge.)
 ; CHECK: adrp {{.*}}, __stack_chk_guard
 ; CHECK: ldr {{.*}}, [{{.*}}, :lo12:__stack_chk_guard]
-; CHECK: .set __stack_chk_guard, .L_MergedGlobals+4
+; CHECK: __stack_chk_guard = .L_MergedGlobals+4
 
 @__stack_chk_guard = internal global [8 x i32] zeroinitializer, align 4
 @x = internal global i32 0, align 4
diff --git a/llvm/test/CodeGen/ARM/alias_store.ll b/llvm/test/CodeGen/ARM/alias_store.ll
index c6612334eaf1..60aa58d37499 100644
--- a/llvm/test/CodeGen/ARM/alias_store.ll
+++ b/llvm/test/CodeGen/ARM/alias_store.ll
@@ -13,4 +13,4 @@ entry:
 ; CHECK: ldr r{{.*}}, [[L:.*]]
 ; CHECK: [[L]]:
 ; CHECK-NEXT: .long XA
-; CHECK: .set XA, X+1
+; CHECK: XA = X+1
diff --git a/llvm/test/CodeGen/ARM/aliases.ll b/llvm/test/CodeGen/ARM/aliases.ll
index 6075ad813e99..8d9f938155d1 100644
--- a/llvm/test/CodeGen/ARM/aliases.ll
+++ b/llvm/test/CodeGen/ARM/aliases.ll
@@ -6,30 +6,30 @@
 ; CHECK: .size .Lstructvar, 8
 
 ; CHECK: .globl	foo1
-; CHECK: .set foo1, bar
+; CHECK: foo1 = bar
 ; CHECK-NOT: .size foo1
 
 ; CHECK: .globl	foo2
-; CHECK: .set foo2, bar
+; CHECK: foo2 = bar
 ; CHECK-NOT: .size foo2
 
 ; CHECK: .weak	bar_f
-; CHECK: .set bar_f, foo_f
+; CHECK: bar_f = foo_f
 ; CHECK-NOT: .size bar_f
 
-; CHECK: .set bar_i, bar
+; CHECK: bar_i = bar
 ; CHECK-NOT: .size bar_i
 
 ; CHECK: .globl	A
-; CHECK: .set A, bar
+; CHECK: A = bar
 ; CHECK-NOT: .size A
 
 ; CHECK: .globl elem0
-; CHECK: .set elem0, .Lstructvar
+; CHECK: elem0 = .Lstructvar
 ; CHECK: .size elem0, 4
 
 ; CHECK: .globl elem1
-; CHECK: .set elem1, .Lstructvar+4
+; CHECK: elem1 = .Lstructvar+4
 ; CHECK: .size elem1, 4
 
 @bar = global i32 42
diff --git a/llvm/test/CodeGen/ARM/global-merge-dllexport.ll b/llvm/test/CodeGen/ARM/global-merge-dllexport.ll
index 89e8a859b939..f5961d7f79e3 100644
--- a/llvm/test/CodeGen/ARM/global-merge-dllexport.ll
+++ b/llvm/test/CodeGen/ARM/global-merge-dllexport.ll
@@ -16,6 +16,6 @@ define void @f1(i32 %a1, i32 %a2) {
 ; CHECK: .section .drectve,"yni"
 ; CHECK: .ascii " /EXPORT:y,DATA"
 ; CHECK: .globl x
-; CHECK: .set x, .L_MergedGlobals
+; CHECK: x = .L_MergedGlobals
 ; CHECK: .globl y
-; CHECK: .set y, .L_MergedGlobals+4
+; CHECK: y = .L_MergedGlobals+4
diff --git a/llvm/test/CodeGen/ARM/global-merge-external-2.ll b/llvm/test/CodeGen/ARM/global-merge-external-2.ll
index 602533e045e0..c9e92d98e484 100644
--- a/llvm/test/CodeGen/ARM/global-merge-external-2.ll
+++ b/llvm/test/CodeGen/ARM/global-merge-external-2.ll
@@ -50,16 +50,16 @@ define dso_local void @g1(i32 %a1, i32 %a2) {
 ;CHECK-WIN32:   .lcomm  .L_MergedGlobals,8,4
 
 ;CHECK-MERGE:   .globl  x
-;CHECK-MERGE: .set x, .L_MergedGlobals
+;CHECK-MERGE: x = .L_MergedGlobals
 ;CHECK-MERGE: .size x, 4
 ;CHECK-MERGE:   .globl  y
-;CHECK-MERGE: .set y, .L_MergedGlobals+4
+;CHECK-MERGE: y = .L_MergedGlobals+4
 ;CHECK-MERGE: .size y, 4
-;CHECK-MERGE-NOT: .set z, .L_MergedGlobals+8
+;CHECK-MERGE-NOT: z = .L_MergedGlobals+8
 
 
 ;CHECK-WIN32:   .globl  x
-;CHECK-WIN32: .set x, .L_MergedGlobals
+;CHECK-WIN32: x = .L_MergedGlobals
 ;CHECK-WIN32:   .globl  y
-;CHECK-WIN32: .set y, .L_MergedGlobals+4
-;CHECK-WIN32-NOT: .set z, .L_MergedGlobals+8
+;CHECK-WIN32: y = .L_MergedGlobals+4
+;CHECK-WIN32-NOT: z = .L_MergedGlobals+8
diff --git a/llvm/test/CodeGen/ARM/global-merge-external.ll b/llvm/test/CodeGen/ARM/global-merge-external.ll
index 364659b36bb9..4fe1914aae35 100644
--- a/llvm/test/CodeGen/ARM/global-merge-external.ll
+++ b/llvm/test/CodeGen/ARM/global-merge-external.ll
@@ -45,18 +45,18 @@ define dso_local void @g1(i32 %a1, i32 %a2) {
 ;CHECK-WIN32:	.lcomm	.L_MergedGlobals,12,4
 
 ;CHECK-MERGE:	.globl	x
-;CHECK-MERGE: .set x, .L_MergedGlobals
+;CHECK-MERGE: x = .L_MergedGlobals
 ;CHECK-MERGE: .size x, 4
 ;CHECK-MERGE:	.globl	y
-;CHECK-MERGE: .set y, .L_MergedGlobals+4
+;CHECK-MERGE: y = .L_MergedGlobals+4
 ;CHECK-MERGE: .size y, 4
 ;CHECK-MERGE:	.globl	z
-;CHECK-MERGE: .set z, .L_MergedGlobals+8
+;CHECK-MERGE: z = .L_MergedGlobals+8
 ;CHECK-MERGE: .size z, 4
 
 ;CHECK-WIN32:	.globl	x
-;CHECK-WIN32: .set x, .L_MergedGlobals
+;CHECK-WIN32: x = .L_MergedGlobals
 ;CHECK-WIN32:	.globl	y
-;CHECK-WIN32: .set y, .L_MergedGlobals+4
+;CHECK-WIN32: y = .L_MergedGlobals+4
 ;CHECK-WIN32:	.globl	z
-;CHECK-WIN32: .set z, .L_MergedGlobals+8
+;CHECK-WIN32: z = .L_MergedGlobals+8
diff --git a/llvm/test/CodeGen/AVR/global-aliases.ll b/llvm/test/CodeGen/AVR/global-aliases.ll
index 91bcedc7e0db..b948003e8b88 100644
--- a/llvm/test/CodeGen/AVR/global-aliases.ll
+++ b/llvm/test/CodeGen/AVR/global-aliases.ll
@@ -1,18 +1,18 @@
 ; RUN: llc < %s -mtriple=avr -mcpu=atxmega384c3 | FileCheck %s --check-prefixes=MEGA
 ; RUN: llc < %s -mtriple=avr -mcpu=attiny40 | FileCheck %s --check-prefixes=TINY
 
-; MEGA: .set __tmp_reg__, 0
-; MEGA: .set __zero_reg__, 1
-; MEGA: .set __SREG__, 63
-; MEGA: .set __SP_H__, 62
-; MEGA: .set __SP_L__, 61
-; MEGA: .set __EIND__, 60
-; MEGA: .set __RAMPZ__, 59
+; MEGA: __tmp_reg__ = 0
+; MEGA: __zero_reg__ = 1
+; MEGA: __SREG__ = 63
+; MEGA: __SP_H__ = 62
+; MEGA: __SP_L__ = 61
+; MEGA: __EIND__ = 60
+; MEGA: __RAMPZ__ = 59
 
-; TINY:     .set __tmp_reg__, 16
-; TINY:     .set __zero_reg__, 17
-; TINY:     .set __SREG__, 63
-; TINY-NOT: .set __SP_H__, 62
-; TINY:     .set __SP_L__, 61
-; TINY-NOT: .set __EIND__, 60
-; TINY-NOT: .set __RAMPZ__, 59
+; TINY:     __tmp_reg__ = 16
+; TINY:     __zero_reg__ = 17
+; TINY:     __SREG__ = 63
+; TINY-NOT: __SP_H__ = 62
+; TINY:     __SP_L__ = 61
+; TINY-NOT: __EIND__ = 60
+; TINY-NOT: __RAMPZ__ = 59
diff --git a/llvm/test/CodeGen/Mips/hf16call32_body.ll b/llvm/test/CodeGen/Mips/hf16call32_body.ll
index ea83f776bd40..3bcb6f6bc015 100644
--- a/llvm/test/CodeGen/Mips/hf16call32_body.ll
+++ b/llvm/test/CodeGen/Mips/hf16call32_body.ll
@@ -24,7 +24,7 @@ entry:
 ; stel: addiu $25, $25, %lo(v_sf)
 ; stel: mfc1 $4, $f12
 ; stel: jr $25
-; stel: .set $__fn_local_v_sf, v_sf
+; stel: $__fn_local_v_sf = v_sf
 ; stel: .end __fn_stub_v_sf
 
 declare i32 @printf(ptr, ...) #1
@@ -46,7 +46,7 @@ entry:
 ; stel: mfc1 $4, $f12
 ; stel: mfc1 $5, $f13
 ; stel: jr $25
-; stel: .set $__fn_local_v_df, v_df
+; stel: $__fn_local_v_df = v_df
 ; stel: .end __fn_stub_v_df
 
 ; Function Attrs: nounwind
@@ -70,7 +70,7 @@ entry:
 ; stel: mfc1 $4, $f12
 ; stel: mfc1 $5, $f14
 ; stel: jr $25
-; stel: .set $__fn_local_v_sf_sf, v_sf_sf
+; stel: $__fn_local_v_sf_sf = v_sf_sf
 ; stel: .end __fn_stub_v_sf_sf
 
 ; Function Attrs: nounwind
@@ -95,7 +95,7 @@ entry:
 ; stel: mfc1 $6, $f14
 ; stel: mfc1 $7, $f15
 ; stel: jr $25
-; stel: .set $__fn_local_v_sf_df, v_sf_df
+; stel: $__fn_local_v_sf_df = v_sf_df
 ; stel: .end __fn_stub_v_sf_df
 
 ; Function Attrs: nounwind
@@ -120,7 +120,7 @@ entry:
 ; stel: mfc1 $5, $f13
 ; stel: mfc1 $6, $f14
 ; stel: jr $25
-; stel: .set $__fn_local_v_df_sf, v_df_sf
+; stel: $__fn_local_v_df_sf = v_df_sf
 ; stel: .end __fn_stub_v_df_sf
 
 ; Function Attrs: nounwind
@@ -146,7 +146,7 @@ entry:
 ; stel: mfc1 $6, $f14
 ; stel: mfc1 $7, $f15
 ; stel: jr $25
-; stel: .set $__fn_local_v_df_df, v_df_df
+; stel: $__fn_local_v_df_df = v_df_df
 ; stel: .end __fn_stub_v_df_df
 
 ; Function Attrs: nounwind
@@ -174,7 +174,7 @@ entry:
 ; stel: addiu $25, $25, %lo(sf_sf)
 ; stel: mfc1 $4, $f12
 ; stel: jr $25
-; stel: .set $__fn_local_sf_sf, sf_sf
+; stel: $__fn_local_sf_sf = sf_sf
 ; stel: .end __fn_stub_sf_sf
 
 
@@ -196,7 +196,7 @@ entry:
 ; stel: mfc1 $4, $f12
 ; stel: mfc1 $5, $f13
 ; stel: jr $25
-; stel: .set $__fn_local_sf_df, sf_df
+; stel: $__fn_local_sf_df = sf_df
 ; stel: .end __fn_stub_sf_df
 
 ; Function Attrs: nounwind
@@ -221,7 +221,7 @@ entry:
 ; stel: mfc1 $4, $f12
 ; stel: mfc1 $5, $f14
 ; stel: jr $25
-; stel: .set $__fn_local_sf_sf_sf, sf_sf_sf
+; stel: $__fn_local_sf_sf_sf = sf_sf_sf
 ; stel: .end __fn_stub_sf_sf_sf
 
 ; Function Attrs: nounwind
@@ -247,7 +247,7 @@ entry:
 ; stel: mfc1 $6, $f14
 ; stel: mfc1 $7, $f15
 ; stel: jr $25
-; stel: .set $__fn_local_sf_sf_df, sf_sf_df
+; stel: $__fn_local_sf_sf_df = sf_sf_df
 ; stel: .end __fn_stub_sf_sf_df
 
 ; Function Attrs: nounwind
@@ -273,7 +273,7 @@ entry:
 ; stel: mfc1 $5, $f13
 ; stel: mfc1 $6, $f14
 ; stel: jr $25
-; stel: .set $__fn_local_sf_df_sf, sf_df_sf
+; stel: $__fn_local_sf_df_sf = sf_df_sf
 ; stel: .end __fn_stub_sf_df_sf
 
 ; Function Attrs: nounwind
@@ -300,7 +300,7 @@ entry:
 ; stel: mfc1 $6, $f14
 ; stel: mfc1 $7, $f15
 ; stel: jr $25
-; stel: .set $__fn_local_sf_df_df, sf_df_df
+; stel: $__fn_local_sf_df_df = sf_df_df
 ; stel: .end __fn_stub_sf_df_df
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16ex.ll b/llvm/test/CodeGen/Mips/mips16ex.ll
index fb9a44e76751..f4d1125718a9 100644
--- a/llvm/test/CodeGen/Mips/mips16ex.ll
+++ b/llvm/test/CodeGen/Mips/mips16ex.ll
@@ -2,7 +2,7 @@
 
 ;16: main:
 ;16-NEXT: [[TMP:.*]]:
-;16-NEXT: .set $func_begin0, [[TMP]]
+;16-NEXT: $func_begin0 = [[TMP]]
 ;16-NEXT: .cfi_startproc
 ;16-NEXT: .cfi_personality
 @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1
diff --git a/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll b/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll
index 6299b4e393d9..3218c77f08c8 100644
--- a/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll
+++ b/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll
@@ -10,6 +10,6 @@ entry:
 }
 
 ; CHECK-LABEL: TestD:
-; CHECK: .set TestC, TestD
-; CHECK-DAG: .set TestB, TestC
-; CHECK-DAG: .set TestA, TestC
+; CHECK: TestC = TestD
+; CHECK-DAG: TestB = TestC
+; CHECK-DAG: TestA = TestC
diff --git a/llvm/test/CodeGen/PowerPC/data-align.ll b/llvm/test/CodeGen/PowerPC/data-align.ll
index bfedec139369..42dee13d152a 100644
--- a/llvm/test/CodeGen/PowerPC/data-align.ll
+++ b/llvm/test/CodeGen/PowerPC/data-align.ll
@@ -2,23 +2,23 @@
 ; RUN: llc < %s -mtriple=powerpc64-unknown-linux | FileCheck %s
 ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux | FileCheck %s
 
-; CHECK:      .set .Li8,
+; CHECK:      .Li8 =
 ; CHECK-NEXT:  .size	.Li8, 1
 @i8 = private constant i8 42
 
-; CHECK:      .set .Li16,
+; CHECK:      .Li16 =
 ; CHECK-NEXT: .size	.Li16, 2
 @i16 = private constant i16 42
 
-; CHECK:      .set .Li32,
+; CHECK:      .Li32 =
 ; CHECK-NEXT: .size	.Li32, 4
 @i32 = private constant i32 42
 
-; CHECK:      .set .Li64,
+; CHECK:      .Li64 =
 ; CHECK-NEXT: .size	.Li64, 8
 @i64 = private constant i64 42
 
-; CHECK:        .set .Li128,
+; CHECK:        .Li128 =
 ; CHECK-NEXT:	.size	.Li128, 16
 @i128 = private constant i128 42
 
diff --git a/llvm/test/CodeGen/WebAssembly/aliases.ll b/llvm/test/CodeGen/WebAssembly/aliases.ll
index 91b57b90df1d..87b292f53c62 100644
--- a/llvm/test/CodeGen/WebAssembly/aliases.ll
+++ b/llvm/test/CodeGen/WebAssembly/aliases.ll
@@ -4,11 +4,11 @@
 @bar = global i32 42
 
 ; CHECK-DAG: .globl	foo1
-; CHECK-DAG: .set foo1, bar
+; CHECK-DAG: foo1 = bar
 @foo1 = alias i32, ptr @bar
 
 ; CHECK-DAG: .globl	foo2
-; CHECK-DAG: .set foo2, bar
+; CHECK-DAG: foo2 = bar
 @foo2 = alias i32, ptr @bar
 
 %FunTy = type i32()
@@ -19,14 +19,14 @@ define i32 @foo_f() {
 
 ; CHECK-DAG: .weak	bar_f
 ; CHECK-DAG: .type	bar_f,@function
-; CHECK-DAG: .set bar_f, foo_f
+; CHECK-DAG: bar_f = foo_f
 @bar_f = weak alias %FunTy, ptr @foo_f
 
 ; CHECK-DAG: .weak	bar_l
-; CHECK-DAG: .set bar_l, bar
+; CHECK-DAG: bar_l = bar
 @bar_l = linkonce_odr alias i32, ptr @bar
 
-; CHECK-DAG: .set bar_i, bar
+; CHECK-DAG: bar_i = bar
 @bar_i = internal alias i32, ptr @bar
 
 ; CHECK-DAG: .globl	A
@@ -34,24 +34,24 @@ define i32 @foo_f() {
 
 ; CHECK-DAG: .globl	bar_h
 ; CHECK-DAG: .hidden	bar_h
-; CHECK-DAG: .set bar_h, bar
+; CHECK-DAG: bar_h = bar
 @bar_h = hidden alias i32, ptr @bar
 
 ; CHECK-DAG: .globl	bar_p
 ; CHECK-DAG: .protected	bar_p
-; CHECK-DAG: .set bar_p, bar
+; CHECK-DAG: bar_p = bar
 @bar_p = protected alias i32, ptr @bar
 
-; CHECK-DAG: .set test2, bar+4
+; CHECK-DAG: test2 = bar+4
 @test2 = alias i32, getelementptr(i32, ptr @bar, i32 1)
 
-; CHECK-DAG: .set test3, 42
+; CHECK-DAG: test3 = 42
 @test3 = alias i32, inttoptr(i32 42 to ptr)
 
-; CHECK-DAG: .set test4, bar
+; CHECK-DAG: test4 = bar
 @test4 = alias i32, inttoptr(i64 ptrtoint (ptr @bar to i64) to ptr)
 
-; CHECK-DAG: .set test5, test2-bar
+; CHECK-DAG: test5 = test2-bar
 @test5 = alias i32, inttoptr(i32 sub (i32 ptrtoint (ptr @test2 to i32),
                                  i32 ptrtoint (ptr @bar to i32)) to ptr)
 
diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
index 7a5baa09f95e..10985de88bf2 100644
--- a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
+++ b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
@@ -35,7 +35,7 @@
 ; }
 ;-------------------------------------------------------------------------------
 
-; CHECK: .set @feat.00, 2048
+; CHECK: @feat.00 = 2048
 
 ; CHECK: .section .gfids$y
 ; CHECK: .symidx _ZNK7Derived4calcEv
diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard.ll b/llvm/test/CodeGen/WinCFGuard/cfguard.ll
index 2ec2e573f716..a77d5490ef87 100644
--- a/llvm/test/CodeGen/WinCFGuard/cfguard.ll
+++ b/llvm/test/CodeGen/WinCFGuard/cfguard.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s
 ; Control Flow Guard is currently only available on Windows
 
-; CHECK: .set @feat.00, 2048
+; CHECK: @feat.00 = 2048
 
 ; CHECK: .section .gfids$y
 ; CHECK: .symidx "?address_taken@@YAXXZ"
diff --git a/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
index d59953fb4e37..cc80f87fda31 100644
--- a/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
+++ b/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
@@ -10,4 +10,4 @@ define weak i32 @pthread_once(ptr, ptr) {
 ; CHECK: pthread_once:
 
 ; CHECK: .weak   __gthrw_pthread_once
-; CHECK: .set __gthrw_pthread_once, pthread_once
+; CHECK: __gthrw_pthread_once = pthread_once
diff --git a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
index 7050889d7102..527684f5a27d 100644
--- a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
+++ b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
@@ -125,31 +125,31 @@ define internal fastcc i32 @foo(i64 %bar) nounwind ssp {
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:    .data_region jt32
-; CHECK-NEXT:  .set L0_0_set_3, LBB0_3-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_4, LBB0_4-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_5, LBB0_5-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_6, LBB0_6-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_7, LBB0_7-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_8, LBB0_8-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_9, LBB0_9-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_10, LBB0_10-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_11, LBB0_11-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_12, LBB0_12-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_13, LBB0_13-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_14, LBB0_14-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_15, LBB0_15-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_16, LBB0_16-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_17, LBB0_17-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_18, LBB0_18-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_19, LBB0_19-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_20, LBB0_20-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_21, LBB0_21-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_22, LBB0_22-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_23, LBB0_23-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_24, LBB0_24-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_25, LBB0_25-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_26, LBB0_26-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_27, LBB0_27-LJTI0_0
+; CHECK-NEXT:  L0_0_set_3 = LBB0_3-LJTI0_0
+; CHECK-NEXT:  L0_0_set_4 = LBB0_4-LJTI0_0
+; CHECK-NEXT:  L0_0_set_5 = LBB0_5-LJTI0_0
+; CHECK-NEXT:  L0_0_set_6 = LBB0_6-LJTI0_0
+; CHECK-NEXT:  L0_0_set_7 = LBB0_7-LJTI0_0
+; CHECK-NEXT:  L0_0_set_8 = LBB0_8-LJTI0_0
+; CHECK-NEXT:  L0_0_set_9 = LBB0_9-LJTI0_0
+; CHECK-NEXT:  L0_0_set_10 = LBB0_10-LJTI0_0
+; CHECK-NEXT:  L0_0_set_11 = LBB0_11-LJTI0_0
+; CHECK-NEXT:  L0_0_set_12 = LBB0_12-LJTI0_0
+; CHECK-NEXT:  L0_0_set_13 = LBB0_13-LJTI0_0
+; CHECK-NEXT:  L0_0_set_14 = LBB0_14-LJTI0_0
+; CHECK-NEXT:  L0_0_set_15 = LBB0_15-LJTI0_0
+; CHECK-NEXT:  L0_0_set_16 = LBB0_16-LJTI0_0
+; CHECK-NEXT:  L0_0_set_17 = LBB0_17-LJTI0_0
+; CHECK-NEXT:  L0_0_set_18 = LBB0_18-LJTI0_0
+; CHECK-NEXT:  L0_0_set_19 = LBB0_19-LJTI0_0
+; CHECK-NEXT:  L0_0_set_20 = LBB0_20-LJTI0_0
+; CHECK-NEXT:  L0_0_set_21 = LBB0_21-LJTI0_0
+; CHECK-NEXT:  L0_0_set_22 = LBB0_22-LJTI0_0
+; CHECK-NEXT:  L0_0_set_23 = LBB0_23-LJTI0_0
+; CHECK-NEXT:  L0_0_set_24 = LBB0_24-LJTI0_0
+; CHECK-NEXT:  L0_0_set_25 = LBB0_25-LJTI0_0
+; CHECK-NEXT:  L0_0_set_26 = LBB0_26-LJTI0_0
+; CHECK-NEXT:  L0_0_set_27 = LBB0_27-LJTI0_0
 ; CHECK-NEXT:  LJTI0_0:
 ; CHECK-NEXT:    .long L0_0_set_3
 ; CHECK-NEXT:    .long L0_0_set_3
diff --git a/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index cf20cfaced5d..17df3e10fd3d 100644
--- a/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -64,15 +64,15 @@ attributes #1 = { nounwind readnone }
 ; CHECK-NEXT: [[CLOBBER:Ltmp[0-9]*]]
 
 ; CHECK: Ldebug_loc0:
-; CHECK-NEXT: .set [[SET1:.*]], Lfunc_begin0-Lfunc_begin0
+; CHECK-NEXT: [[SET1:.*]] = Lfunc_begin0-Lfunc_begin0
 ; CHECK-NEXT: .quad   [[SET1]]
-; CHECK-NEXT: .set [[SET2:.*]], [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: [[SET2:.*]] = [[LABEL]]-Lfunc_begin0
 ; CHECK-NEXT: .quad   [[SET2]]
 ; CHECK-NEXT: .short  1     ## Loc expr size
 ; CHECK-NEXT: .byte   85
-; CHECK-NEXT: .set [[SET3:.*]], [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: [[SET3:.*]] = [[LABEL]]-Lfunc_begin0
 ; CHECK-NEXT: .quad   [[SET3]]
-; CHECK-NEXT: .set [[SET4:.*]], [[CLOBBER]]-Lfunc_begin0
+; CHECK-NEXT: [[SET4:.*]] = [[CLOBBER]]-Lfunc_begin0
 ; CHECK-NEXT: .quad   [[SET4]]
 ; CHECK-NEXT: .short  1     ## Loc expr size
 ; CHECK-NEXT: .byte   83
diff --git a/llvm/test/CodeGen/X86/alias-gep.ll b/llvm/test/CodeGen/X86/alias-gep.ll
index 904a611f61d1..65d2ced6df5b 100644
--- a/llvm/test/CodeGen/X86/alias-gep.ll
+++ b/llvm/test/CodeGen/X86/alias-gep.ll
@@ -3,17 +3,17 @@
 
 ;MACHO: .globl _offsetSym0
 ;MACHO-NOT: .alt_entry
-;MACHO: .set _offsetSym0, _s
+;MACHO: _offsetSym0 = _s
 ;MACHO: .globl _offsetSym1
 ;MACHO: .alt_entry _offsetSym1
-;MACHO: .set _offsetSym1, _s+8
+;MACHO: _offsetSym1 = _s+8
 
 ;ELF: .globl offsetSym0
 ;ELF-NOT: .alt_entry
-;ELF: .set offsetSym0, s
+;ELF: offsetSym0 = s
 ;ELF: .globl offsetSym1
 ;ELF-NOT: .alt_entry
-;ELF: .set offsetSym1, s+8
+;ELF: offsetSym1 = s+8
 
 %struct.S1 = type { i32, i32, i32 }
 
diff --git a/llvm/test/CodeGen/X86/aliases.ll b/llvm/test/CodeGen/X86/aliases.ll
index 03ea2579d0f8..d36798820fe8 100644
--- a/llvm/test/CodeGen/X86/aliases.ll
+++ b/llvm/test/CodeGen/X86/aliases.ll
@@ -48,16 +48,16 @@ define i32 @foo_f() {
 ; CHECK-DAG: .protected	bar_p
 @bar_p = protected alias i32, ptr @bar
 
-; CHECK-DAG: .set test2, bar+4
+; CHECK-DAG: test2 = bar+4
 @test2 = alias i32, getelementptr(i32, ptr @bar, i32 1)
 
-; CHECK-DAG: .set test3, 42
+; CHECK-DAG: test3 = 42
 @test3 = alias i32, inttoptr(i32 42 to ptr)
 
-; CHECK-DAG: .set test4, bar
+; CHECK-DAG: test4 = bar
 @test4 = alias i32, inttoptr(i64 ptrtoint (ptr @bar to i64) to ptr)
 
-; CHECK-DAG: .set test5, test2-bar
+; CHECK-DAG: test5 = test2-bar
 @test5 = alias i32, inttoptr(i32 sub (i32 ptrtoint (ptr @test2 to i32),
                                  i32 ptrtoint (ptr @bar to i32)) to ptr)
 
diff --git a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
index 437d9698ee6b..ab9fa2287ffa 100644
--- a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
+++ b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
@@ -44,7 +44,7 @@ return:                                           ; preds = %catch, %entry
 ; CHECK: .LBB0_[[catch:[0-9]+]]:
 
 ; CHECK: .seh_handlerdata
-; CHECK-NEXT: .set .Lfoo$parent_frame_offset, 32
+; CHECK-NEXT: .Lfoo$parent_frame_offset = 32
 ; CHECK-NEXT: .long   (.Llsda_end0-.Llsda_begin0)/16
 ; CHECK-NEXT: .Llsda_begin0:
 ; CHECK-NEXT: .long   .Ltmp0@IMGREL
diff --git a/llvm/test/CodeGen/X86/coff-alias-type.ll b/llvm/test/CodeGen/X86/coff-alias-type.ll
index a242cd2d77d7..6cc0638b2d4a 100644
--- a/llvm/test/CodeGen/X86/coff-alias-type.ll
+++ b/llvm/test/CodeGen/X86/coff-alias-type.ll
@@ -22,4 +22,4 @@ entry:
 ; CHECK-NEXT: .scl     2
 ; CHECK-NEXT: .type    32
 ; CHECK-NEXT: .endef
-; CHECK-NEXT: .set     _ZN8MyStructC1Ev, _ZN8MyStructC2Ev
+; CHECK-NEXT: _ZN8MyStructC1Ev = _ZN8MyStructC2Ev
diff --git a/llvm/test/CodeGen/X86/coff-comdat.ll b/llvm/test/CodeGen/X86/coff-comdat.ll
index 99b3c0a687af..084a5a71125e 100644
--- a/llvm/test/CodeGen/X86/coff-comdat.ll
+++ b/llvm/test/CodeGen/X86/coff-comdat.ll
@@ -89,4 +89,4 @@ $vftable = comdat largest
 ; CHECK: .globl  _f6
 ; CHECK: .section        .rdata,"dr",largest,_vftable
 ; CHECK: .globl  _vftable
-; CHECK: .set _vftable, L_some_name+4
+; CHECK: _vftable = L_some_name+4
diff --git a/llvm/test/CodeGen/X86/coff-feat00.ll b/llvm/test/CodeGen/X86/coff-feat00.ll
index 21dd04ed34c7..1dcd4276399a 100644
--- a/llvm/test/CodeGen/X86/coff-feat00.ll
+++ b/llvm/test/CodeGen/X86/coff-feat00.ll
@@ -4,4 +4,4 @@ define i32 @foo() {
   ret i32 0
 }
 
-; CHECK: .set @feat.00, 1
+; CHECK: @feat.00 = 1
diff --git a/llvm/test/CodeGen/X86/dllexport-x86_64.ll b/llvm/test/CodeGen/X86/dllexport-x86_64.ll
index 76add98314f5..b640e630e47e 100644
--- a/llvm/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/llvm/test/CodeGen/X86/dllexport-x86_64.ll
@@ -105,23 +105,23 @@ define weak_odr dllexport void @weak1() {
 ; MINGW: .ascii " -export:blob_alias"
 
 ; CHECK: .globl alias
-; CHECK: .set alias, notExported
+; CHECK: alias = notExported
 @alias = dllexport alias void(), ptr @notExported
 
 ; CHECK: .globl aliasNotExported
-; CHECK: .set aliasNotExported, f1
+; CHECK: aliasNotExported = f1
 @aliasNotExported = alias void(), ptr @f1
 
 ; CHECK: .globl alias2
-; CHECK: .set alias2, f1
+; CHECK: alias2 = f1
 @alias2 = dllexport alias void(), ptr @f1
 
 ; CHECK: .globl alias3
-; CHECK: .set alias3, notExported
+; CHECK: alias3 = notExported
 @alias3 = dllexport alias void(), ptr @notExported
 
 ; CHECK: .weak weak_alias
-; CHECK: .set weak_alias, f1
+; CHECK: weak_alias = f1
 @weak_alias = weak_odr dllexport alias void(), ptr @f1
 
 @blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
diff --git a/llvm/test/CodeGen/X86/dllexport.ll b/llvm/test/CodeGen/X86/dllexport.ll
index 09cc03e7729d..53ecb8e7a1b4 100644
--- a/llvm/test/CodeGen/X86/dllexport.ll
+++ b/llvm/test/CodeGen/X86/dllexport.ll
@@ -135,17 +135,17 @@ define weak_odr dllexport void @weak1() {
 ; CHECK-GCC: .ascii " -export:weak_alias"
 
 ; CHECK: .globl _alias
-; CHECK: .set _alias, _notExported
+; CHECK: _alias = _notExported
 @alias = dllexport alias void(), ptr @notExported
 
 ; CHECK: .globl _alias2
-; CHECK: .set _alias2, _f1
+; CHECK: _alias2 = _f1
 @alias2 = dllexport alias void(), ptr @f1
 
 ; CHECK: .globl _alias3
-; CHECK: .set _alias3, _notExported
+; CHECK: _alias3 = _notExported
 @alias3 = dllexport alias void(), ptr @notExported
 
 ; CHECK: .weak _weak_alias
-; CHECK: .set _weak_alias, _f1
+; CHECK: _weak_alias = _f1
 @weak_alias = weak_odr dllexport alias void(), ptr @f1
diff --git a/llvm/test/CodeGen/X86/ehcontguard.ll b/llvm/test/CodeGen/X86/ehcontguard.ll
index 740621bc5d02..e868209babce 100644
--- a/llvm/test/CodeGen/X86/ehcontguard.ll
+++ b/llvm/test/CodeGen/X86/ehcontguard.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s
 ; EHCont Guard is currently only available on Windows
 
-; CHECK: .set @feat.00, 16384
+; CHECK: @feat.00 = 16384
 
 ; CHECK: .section .gehcont$y
 
diff --git a/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll b/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll
index 53b4bc8f1df2..4840308a5d49 100644
--- a/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll
+++ b/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll
@@ -33,5 +33,5 @@ define private x86_fastcallcc void @dontCrash() {
 }
 
 @alias = alias void(i64, i8, i8, i16), ptr @func
-; CHECK32-LABEL: {{^}}.set @alias@20, @func@20
-; CHECK64-LABEL: {{^}}.set alias, func
+; CHECK32-LABEL: {{^}}@alias@20 = @func@20
+; CHECK64-LABEL: {{^}}alias = func
diff --git a/llvm/test/CodeGen/X86/ifunc-asm.ll b/llvm/test/CodeGen/X86/ifunc-asm.ll
index a4c47da7f4c6..bc8e7e3d7d05 100644
--- a/llvm/test/CodeGen/X86/ifunc-asm.ll
+++ b/llvm/test/CodeGen/X86/ifunc-asm.ll
@@ -15,7 +15,7 @@ entry:
 @foo_ifunc = ifunc i32 (i32), ptr @foo_resolver
 ; ELF:             .globl foo_ifunc
 ; ELF-NEXT:        .type foo_ifunc,@gnu_indirect_function
-; ELF-NEXT:        .set foo_ifunc, foo_resolver
+; ELF-NEXT:        foo_ifunc = foo_resolver
 
 ; MACHO:           .section __DATA,__data
 ; MACHO-NEXT:      .p2align 3, 0x0
diff --git a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
index b8f0661225f8..5199b1519ebe 100644
--- a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
@@ -47,9 +47,9 @@ entry:
   call fastcc void @"\01?fin$0@0@test2@@"(ptr %tmp0)
   ret void
 ; CHECK-LABEL: test2:
-; CHECK:	.set Ltest2$frame_escape_0, 8
-; CHECK:	.set Ltest2$frame_escape_1, 4
-; CHECK:	.set Ltest2$frame_escape_2, 0
+; CHECK:	Ltest2$frame_escape_0 = 8
+; CHECK:	Ltest2$frame_escape_1 = 4
+; CHECK:	Ltest2$frame_escape_2 = 0
 ; CHECK:	calll "?fin$0@0@test2@@"
 }
 
diff --git a/llvm/test/CodeGen/X86/linux-preemption.ll b/llvm/test/CodeGen/X86/linux-preemption.ll
index 8e60b4787975..dc06a34e1c69 100644
--- a/llvm/test/CodeGen/X86/linux-preemption.ll
+++ b/llvm/test/CodeGen/X86/linux-preemption.ll
@@ -285,18 +285,18 @@ define dso_local ptr @comdat_any_local() comdat {
 ; CHECK-NEXT: .Lstrong_local_global$local:
 
 ; COMMON:      .globl strong_default_alias
-; COMMON-NEXT: .set strong_default_alias, aliasee
+; COMMON-NEXT: strong_default_alias = aliasee
 ; COMMON-NEXT: .globl strong_hidden_alias
 ; COMMON-NEXT: .hidden strong_hidden_alias
-; COMMON-NEXT: .set strong_hidden_alias, aliasee
+; COMMON-NEXT: strong_hidden_alias = aliasee
 ; COMMON-NEXT: .weak weak_default_alias
-; COMMON-NEXT: .set weak_default_alias, aliasee
+; COMMON-NEXT: weak_default_alias = aliasee
 ; COMMON-NEXT: .globl strong_local_alias
-; COMMON-NEXT: .set strong_local_alias, aliasee
-; CHECK-NEXT:  .set .Lstrong_local_alias$local, aliasee
+; COMMON-NEXT: strong_local_alias = aliasee
+; CHECK-NEXT:  .Lstrong_local_alias$local = aliasee
 ; COMMON-NEXT: .weak weak_local_alias
-; COMMON-NEXT: .set weak_local_alias, aliasee
+; COMMON-NEXT: weak_local_alias = aliasee
 ; COMMON-NEXT: .globl strong_preemptable_alias
-; COMMON-NEXT: .set strong_preemptable_alias, aliasee
+; COMMON-NEXT: strong_preemptable_alias = aliasee
 ; COMMON-NEXT: .weak weak_preemptable_alias
-; COMMON-NEXT: .set weak_preemptable_alias, aliasee
+; COMMON-NEXT: weak_preemptable_alias = aliasee
diff --git a/llvm/test/CodeGen/X86/localescape.ll b/llvm/test/CodeGen/X86/localescape.ll
index aee7613273f7..57369be489af 100644
--- a/llvm/test/CodeGen/X86/localescape.ll
+++ b/llvm/test/CodeGen/X86/localescape.ll
@@ -76,8 +76,8 @@ define void @alloc_func(i32 %n) {
 ; X64: .seh_stackalloc 16
 ; X64: leaq    16(%rsp), %rbp
 ; X64: .seh_setframe %rbp, 16
-; X64: .set .Lalloc_func$frame_escape_0, -4
-; X64: .set .Lalloc_func$frame_escape_1, -12
+; X64: .Lalloc_func$frame_escape_0 = -4
+; X64: .Lalloc_func$frame_escape_1 = -12
 ; X64: movl $42, -4(%rbp)
 ; X64: movl $13, -12(%rbp)
 ; X64: movq 	%rbp, %rcx
@@ -88,8 +88,8 @@ define void @alloc_func(i32 %n) {
 ; X86: pushl   %ebp
 ; X86: movl    %esp, %ebp
 ; X86: subl    $12, %esp
-; X86: .set Lalloc_func$frame_escape_0, -4
-; X86: .set Lalloc_func$frame_escape_1, -12
+; X86: Lalloc_func$frame_escape_0 = -4
+; X86: Lalloc_func$frame_escape_1 = -12
 ; X86: movl    $42, -4(%ebp)
 ; X86: movl    $13, -12(%ebp)
 ; X86: pushl   %ebp
@@ -118,8 +118,8 @@ define void @alloc_func_no_frameaddr() {
 ; X64: subq    $40, %rsp
 ; X64: .seh_stackalloc 40
 ; X64: .seh_endprologue
-; X64: .set .Lalloc_func_no_frameaddr$frame_escape_0, 36
-; X64: .set .Lalloc_func_no_frameaddr$frame_escape_1, 32
+; X64: .Lalloc_func_no_frameaddr$frame_escape_0 = 36
+; X64: .Lalloc_func_no_frameaddr$frame_escape_1 = 32
 ; X64: movl $42, 36(%rsp)
 ; X64: movl $13, 32(%rsp)
 ; X64: xorl %ecx, %ecx
@@ -131,8 +131,8 @@ define void @alloc_func_no_frameaddr() {
 
 ; X86-LABEL: alloc_func_no_frameaddr:
 ; X86: subl    $8, %esp
-; X86: .set Lalloc_func_no_frameaddr$frame_escape_0, 4
-; X86: .set Lalloc_func_no_frameaddr$frame_escape_1, 0
+; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 4
+; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 0
 ; X86: movl $42, 4(%esp)
 ; X86: movl $13, (%esp)
 ; X86: pushl $0
diff --git a/llvm/test/CodeGen/X86/pr22019.ll b/llvm/test/CodeGen/X86/pr22019.ll
index 4e78bae20442..262ee5fad737 100644
--- a/llvm/test/CodeGen/X86/pr22019.ll
+++ b/llvm/test/CodeGen/X86/pr22019.ll
@@ -5,9 +5,9 @@ target triple = "x86_64-unknown-linux-gnu"
 module asm "pselect = __pselect"
 module asm "var = __var"
 module asm "alias = __alias"
-; CHECK: .set pselect, __pselect
-; CHECK: .set var, __var
-; CHECK: .set alias, __alias
+; CHECK: pselect = __pselect
+; CHECK: var = __var
+; CHECK: alias = __alias
 
 ; CHECK: pselect:
 ; CHECK: retq
@@ -19,5 +19,5 @@ define void @pselect() {
 ; CHECK: .long 0
 @var = global i32 0
 
-; CHECK: .set alias, var
+; CHECK: alias = var
 @alias = alias i32, ptr @var
diff --git a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll
index 3acf999fc423..bd51ca76c59d 100644
--- a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll
+++ b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll
@@ -58,7 +58,7 @@ entry:
 ; CHECK: pushl %edi
 ; CHECK: pushl %esi
 
-; CHECK: .set Lmain$frame_escape_0, [[code_offs:[-0-9]+]]
+; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
 ; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%ebp)
 ; CHECK: movl $L__ehtable$main,
 ;       EH state 0
@@ -78,7 +78,7 @@ entry:
 ; CHECK: calll _printf
 
 ; CHECK: .section .xdata,"dr"
-; CHECK: .set Lmain$parent_frame_offset, [[reg_offs]]
+; CHECK: Lmain$parent_frame_offset = [[reg_offs]]
 ; CHECK: .p2align 2
 ; CHECK: L__ehtable$main
 ; CHECK-NEXT: .long -1
diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll
index 7558c4389be5..d958580e5925 100644
--- a/llvm/test/CodeGen/X86/seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/seh-catchpad.ll
@@ -119,7 +119,7 @@ __except.ret:                                     ; preds = %catch.dispatch.7
 ; CHECK:         jmp     .LBB1_[[epilogue]]
 
 ; CHECK:         .seh_handlerdata
-; CHECK-NEXT:         .set .Lmain$parent_frame_offset, 32
+; CHECK-NEXT:         .Lmain$parent_frame_offset = 32
 ; CHECK-NEXT:         .long   (.Llsda_end0-.Llsda_begin0)/16
 ; CHECK-NEXT: .Llsda_begin0:
 ; CHECK-NEXT:         .long   .Ltmp0@IMGREL
diff --git a/llvm/test/CodeGen/X86/seh-finally.ll b/llvm/test/CodeGen/X86/seh-finally.ll
index 28e5cf68dd27..41823dfb38f0 100644
--- a/llvm/test/CodeGen/X86/seh-finally.ll
+++ b/llvm/test/CodeGen/X86/seh-finally.ll
@@ -26,7 +26,7 @@ lpad:                                             ; preds = %entry
 ; X64: retq
 
 ; X64: .seh_handlerdata
-; X64-NEXT: .set .Lmain$parent_frame_offset, 32
+; X64-NEXT: .Lmain$parent_frame_offset = 32
 ; X64-NEXT: .long   (.Llsda_end0-.Llsda_begin0)/16 # Number of call sites
 ; X64-NEXT: .Llsda_begin0:
 ; X64-NEXT: .long   .Ltmp0@IMGREL # LabelStart
diff --git a/llvm/test/CodeGen/X86/seh-no-invokes.ll b/llvm/test/CodeGen/X86/seh-no-invokes.ll
index 99b81f0eb1bb..63e91d33d400 100644
--- a/llvm/test/CodeGen/X86/seh-no-invokes.ll
+++ b/llvm/test/CodeGen/X86/seh-no-invokes.ll
@@ -15,7 +15,7 @@
 ; label. This was PR30431.
 
 ; CHECK-LABEL: _f:                                     # @f
-; CHECK: .set Lf$parent_frame_offset, 0
+; CHECK: Lf$parent_frame_offset = 0
 ; CHECK: retl
 
 ; CHECK-LABEL: "?filt$0@0@f@@":                        # @"\01?filt$0@0@f@@"
diff --git a/llvm/test/CodeGen/X86/seh-stack-realign.ll b/llvm/test/CodeGen/X86/seh-stack-realign.ll
index 2869bff82231..ae687343cc50 100644
--- a/llvm/test/CodeGen/X86/seh-stack-realign.ll
+++ b/llvm/test/CodeGen/X86/seh-stack-realign.ll
@@ -51,7 +51,7 @@ entry:
 ; Check that we can get the exception code from eax to the printf.
 
 ; CHECK-LABEL: _main:
-; CHECK: .set Lmain$frame_escape_0, [[code_offs:[-0-9]+]]
+; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
 ; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%esi)
 ; CHECK: movl $L__ehtable$main,
 ;       EH state 0
@@ -71,7 +71,7 @@ entry:
 ; CHECK: calll _printf
 
 ; CHECK: .section .xdata,"dr"
-; CHECK: .set Lmain$parent_frame_offset, [[reg_offs]]
+; CHECK: Lmain$parent_frame_offset = [[reg_offs]]
 ; CHECK: L__ehtable$main
 ; CHECK-NEXT: .long -1
 ; CHECK-NEXT: .long _filt$main
diff --git a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
index d8fcf6d86fa4..ecbbaf3ab362 100644
--- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
+++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
@@ -34,12 +34,12 @@ define i32 @foo(i32 %x) nounwind ssp {
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:    .data_region jt32
-; CHECK-NEXT:  .set L0_0_set_2, LBB0_2-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_3, LBB0_3-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_4, LBB0_4-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_5, LBB0_5-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_6, LBB0_6-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_7, LBB0_7-LJTI0_0
+; CHECK-NEXT:  L0_0_set_2 = LBB0_2-LJTI0_0
+; CHECK-NEXT:  L0_0_set_3 = LBB0_3-LJTI0_0
+; CHECK-NEXT:  L0_0_set_4 = LBB0_4-LJTI0_0
+; CHECK-NEXT:  L0_0_set_5 = LBB0_5-LJTI0_0
+; CHECK-NEXT:  L0_0_set_6 = LBB0_6-LJTI0_0
+; CHECK-NEXT:  L0_0_set_7 = LBB0_7-LJTI0_0
 ; CHECK-NEXT:  LJTI0_0:
 ; CHECK-NEXT:    .long L0_0_set_2
 ; CHECK-NEXT:    .long L0_0_set_3
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
index 16322cbe9980..9e44299083d4 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: "?fin$0@0@main@@"
 ; CHECK:      .seh_handlerdata
-; CHECK:      .set ".L?fin$0@0@main@@$parent_frame_offset", 48
+; CHECK:      ".L?fin$0@0@main@@$parent_frame_offset" = 48
 ; CHECK-NEXT:        .long   (.Llsda_end1-.Llsda_begin1)/16
 ; CHECK-NEXT: .Llsda_begin1:
 ; CHECK-NEXT:        .long   .Ltmp
diff --git a/llvm/test/CodeGen/XCore/globals.ll b/llvm/test/CodeGen/XCore/globals.ll
index 134bbb3444b5..186cfda97104 100644
--- a/llvm/test/CodeGen/XCore/globals.ll
+++ b/llvm/test/CodeGen/XCore/globals.ll
@@ -127,4 +127,4 @@ entry:
 
 @array = global [10 x i16] zeroinitializer, align 2
 ; CHECK: .globl  array.globound
-; CHECK: .set array.globound, 10
+; CHECK: array.globound = 10
diff --git a/llvm/test/CodeGen/XCore/linkage.ll b/llvm/test/CodeGen/XCore/linkage.ll
index 93edf01cf8a9..5bfb83d964df 100644
--- a/llvm/test/CodeGen/XCore/linkage.ll
+++ b/llvm/test/CodeGen/XCore/linkage.ll
@@ -19,14 +19,14 @@ define protected void @test_protected() {
 }
 
 ; CHECK: .globl array.globound
-; CHECK: .set array.globound, 2
+; CHECK: array.globound = 2
 ; CHECK: .weak array.globound
 ; CHECK: .globl array
 ; CHECK: .weak array
 @array = weak global [2 x i32] zeroinitializer
 
 ; CHECK: .globl ac.globound
-; CHECK: .set ac.globound, 2
+; CHECK: ac.globound = 2
 ; CHECK: .weak ac.globound
 ; CHECK: .globl ac
 ; CHECK: .weak ac
diff --git a/llvm/test/DebugInfo/X86/dbg-value-range.ll b/llvm/test/DebugInfo/X86/dbg-value-range.ll
index 0d49b5eeefd1..a6ede2814aba 100644
--- a/llvm/test/DebugInfo/X86/dbg-value-range.ll
+++ b/llvm/test/DebugInfo/X86/dbg-value-range.ll
@@ -49,9 +49,9 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone
 ;CHECK-NEXT: [[CLOBBER:Ltmp[0-9]*]]
 
 ;CHECK:Ldebug_loc0:
-;CHECK-NEXT: .set Lset{{.*}},
+;CHECK-NEXT: Lset{{.*}} =
 ;CHECK-NEXT:	.quad
-;CHECK-NEXT: .set [[CLOBBER_OFF:Lset.*]], [[CLOBBER]]-{{.*}}
+;CHECK-NEXT: [[CLOBBER_OFF:Lset.*]] = [[CLOBBER]]-{{.*}}
 ;CHECK-NEXT:	.quad	[[CLOBBER_OFF]]
 ;CHECK-NEXT:  .short 1 ## Loc expr size
 ;CHECK-NEXT:	.byte	85 ## DW_OP_reg
diff --git a/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
index 446f31f9a912..8d4d065641fc 100644
--- a/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
+++ b/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
@@ -64,11 +64,11 @@
 ; PR15408
 ; ASM: Lcu_begin0:
 ; ASM-NOT: Lcu_begin
-; ASM: .set Lset[[LT:[0-9]+]], Lline_table_start0-Lsection_line ## DW_AT_stmt_list
+; ASM: Lset[[LT:[0-9]+]] = Lline_table_start0-Lsection_line ## DW_AT_stmt_list
 ; ASM-NEXT: .long   Lset[[LT]]
 ; ASM: Lcu_begin1:
 ; ASM-NOT: Lcu_begin
-; ASM: .set Lset[[LT:[0-9]+]], Lline_table_start0-Lsection_line ## DW_AT_stmt_list
+; ASM: Lset[[LT:[0-9]+]] = Lline_table_start0-Lsection_line ## DW_AT_stmt_list
 ; ASM-NEXT: .long   Lset[[LT]]
 define i32 @test(i32 %a) nounwind uwtable ssp !dbg !5 {
 entry:
diff --git a/llvm/test/MC/AArch64/basic-a64-instructions.s b/llvm/test/MC/AArch64/basic-a64-instructions.s
index 14ac11f581a5..b2ec5b6ac367 100644
--- a/llvm/test/MC/AArch64/basic-a64-instructions.s
+++ b/llvm/test/MC/AArch64/basic-a64-instructions.s
@@ -3349,7 +3349,7 @@ _func:
 
 	.equ equvalue, 0x0001
         movk x1, equvalue, lsl 16
-// CHECK: .set equvalue, 1
+// CHECK: equvalue = 1
 // CHECK-NEXT: movk x1, #1, lsl #16 // encoding: [0x21,0x00,0xa0,0xf2]
 
         movz x2, #:abs_g0:sym
diff --git a/llvm/test/MC/AsmParser/assignment.s b/llvm/test/MC/AsmParser/assignment.s
index 6f84a1c338da..8c8984c12ac3 100644
--- a/llvm/test/MC/AsmParser/assignment.s
+++ b/llvm/test/MC/AsmParser/assignment.s
@@ -1,22 +1,22 @@
 # RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
 
 # CHECK: TEST0:
-# CHECK: .set a, 0
+# CHECK: a = 0
 TEST0:
         a = 0
 
 # CHECK: TEST1:
-# CHECK: .set b, 0
+# CHECK: b = 0
 TEST1:
-        .set b, 0
+        b = 0
 
 # CHECK: .globl	_f1
-# CHECK: .set _f1, 0
+# CHECK: _f1 = 0
         .globl _f1
         _f1 = 0
 
 # CHECK: .globl	_f2
-# CHECK: .set _f2, 0
+# CHECK: _f2 = 0
         .globl _f2
-        .set _f2, 0
+        _f2 = 0
 
diff --git a/llvm/test/MC/AsmParser/directive_include.s b/llvm/test/MC/AsmParser/directive_include.s
index 8d2ef2753b23..f53bc671fc64 100644
--- a/llvm/test/MC/AsmParser/directive_include.s
+++ b/llvm/test/MC/AsmParser/directive_include.s
@@ -2,7 +2,7 @@
 
 # CHECK: TESTA:
 # CHECK: TEST0:
-# CHECK: .set a, 0
+# CHECK: a = 0
 # CHECK: TESTB:
 TESTA:  
 	.include       "directive\137set.s"   # "\137" is underscore "_"
diff --git a/llvm/test/MC/AsmParser/directive_set.s b/llvm/test/MC/AsmParser/directive_set.s
index 65dd33d1d54f..4b93de01b309 100644
--- a/llvm/test/MC/AsmParser/directive_set.s
+++ b/llvm/test/MC/AsmParser/directive_set.s
@@ -1,13 +1,13 @@
 # RUN: llvm-mc -triple i386-unknown-elf %s | FileCheck %s
 
 # CHECK: TEST0:
-# CHECK: .set a, 0
+# CHECK: a = 0
 # CHECK-NOT: .no_dead_strip a
 TEST0:  
-        .set a, 0
+        a = 0
         
 # CHECK: TEST1:
-# CHECK: .set a, 0
+# CHECK: a = 0
 # CHECK-NOT: .no_dead_strip a
 TEST1:  
         .equ a, 0
diff --git a/llvm/test/MC/AsmParser/include.ll b/llvm/test/MC/AsmParser/include.ll
index 3321f0a6a287..22c9eaf7a36e 100644
--- a/llvm/test/MC/AsmParser/include.ll
+++ b/llvm/test/MC/AsmParser/include.ll
@@ -10,5 +10,5 @@ entry:
   ret void
 }
 
-; CHECK: .set MODULE, 1
-; CHECK: .set FUNCTION, 1
+; CHECK: MODULE = 1
+; CHECK: FUNCTION = 1
diff --git a/llvm/test/MC/AsmParser/labels.s b/llvm/test/MC/AsmParser/labels.s
index 599ce72c44ee..6a9870b655f2 100644
--- a/llvm/test/MC/AsmParser/labels.s
+++ b/llvm/test/MC/AsmParser/labels.s
@@ -18,12 +18,12 @@ foo:
 // CHECK: addl $24, a$b+10(%eax)
         addl $24, ("a$b" + 10)(%eax)
 
-// CHECK: .set b$c, 10
+// CHECK: b$c = 10
 "b$c" = 10
 // CHECK: addl $10, %eax
         addl $"b$c", %eax
 
-// CHECK: .set "a 0", 11
+// CHECK: "a 0" = 11
         .set "a 0", 11
 
 // CHECK: .long 11
@@ -49,7 +49,7 @@ foo:
 // CHECX: .lsym "a 8",1
 //        .lsym "a 8", 1
 
-// CHECK: .set "a 9", a-b
+// CHECK: "a 9" = a-b
         .set "a 9", a - b
 
 // CHECK: .long "a 9"
diff --git a/llvm/test/MC/AsmParser/macro-arg-darwin.s b/llvm/test/MC/AsmParser/macro-arg-darwin.s
index 8671107539ce..88c63dd488be 100644
--- a/llvm/test/MC/AsmParser/macro-arg-darwin.s
+++ b/llvm/test/MC/AsmParser/macro-arg-darwin.s
@@ -38,7 +38,7 @@ bar
     .endif
 .endm
 .macro bottom
-    .set fred, $0
+    fred = $0
 .endm
 
 .text
@@ -49,7 +49,7 @@ top bar, 42
 // CHECK: _foo:
 // CHECK-NOT: fred
 // CHECK: _bar
-// CHECK-NEXT: .set fred, 42
+// CHECK-NEXT: fred = 42
 
 
 .macro foo
diff --git a/llvm/test/MC/AsmParser/motorola_integers.s b/llvm/test/MC/AsmParser/motorola_integers.s
index c75d9a5e0cb1..1ec2e02e97f0 100644
--- a/llvm/test/MC/AsmParser/motorola_integers.s
+++ b/llvm/test/MC/AsmParser/motorola_integers.s
@@ -1,10 +1,10 @@
 # RUN: llvm-mc -triple i386-unknown-unknown -motorola-integers %s | FileCheck %s
 
-# CHECK: .set a, 2882400009
-.set a, $aBcDeF09
-# CHECK: .set b, 256
-.set b, $0100
-# CHECK: .set c, 10
-.set c, %01010
-# CHECK: .set d, 1
-.set d, %1
+# CHECK: a = 2882400009
+a = $aBcDeF09
+# CHECK: b = 256
+b = $0100
+# CHECK: c = 10
+c = %01010
+# CHECK: d = 1
+d = %1
diff --git a/llvm/test/MC/Mips/cpsetup.s b/llvm/test/MC/Mips/cpsetup.s
index 4a027c6e796a..f948d650da94 100644
--- a/llvm/test/MC/Mips/cpsetup.s
+++ b/llvm/test/MC/Mips/cpsetup.s
@@ -196,7 +196,7 @@ IMM_8 = 8
 
 # ALL-LABEL: <t1b>:
 # ASM-LABEL: t1b:
-# ASM-NEXT: .set IMM_8, 8
+# ASM-NEXT: IMM_8 = 8
 
 # O32-NOT: __cerror
 

From 95bbaca6c1dcabb03bd67aabe3aaa4730a11200d Mon Sep 17 00:00:00 2001
From: Rajveer Singh Bharadwaj <rajveer.developer@icloud.com>
Date: Thu, 12 Jun 2025 10:54:01 +0530
Subject: [PATCH 0094/1322] [AArch64] Extend usage of `XAR` instruction for
 fixed-length operations (#139460)

---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 102 +++++--
 llvm/test/CodeGen/AArch64/xar.ll              | 250 +++++++++++++++++-
 2 files changed, 324 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 11cb91fbe02d..009d69b2b943 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4606,7 +4606,33 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
     return false;
   }
 
-  if (!Subtarget->hasSHA3())
+  // We have Neon SHA3 XAR operation for v2i64 but for types
+  // v4i32, v8i16, v16i8 we can use SVE operations when SVE2-SHA3
+  // is available.
+  EVT SVT;
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::v4i32:
+  case MVT::v2i32:
+    SVT = MVT::nxv4i32;
+    break;
+  case MVT::v8i16:
+  case MVT::v4i16:
+    SVT = MVT::nxv8i16;
+    break;
+  case MVT::v16i8:
+  case MVT::v8i8:
+    SVT = MVT::nxv16i8;
+    break;
+  case MVT::v2i64:
+  case MVT::v1i64:
+    SVT = Subtarget->hasSHA3() ? MVT::v2i64 : MVT::nxv2i64;
+    break;
+  default:
+    return false;
+  }
+
+  if ((!SVT.isScalableVector() && !Subtarget->hasSHA3()) ||
+      (SVT.isScalableVector() && !Subtarget->hasSVE2()))
     return false;
 
   if (N0->getOpcode() != AArch64ISD::VSHL ||
@@ -4632,7 +4658,8 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
   SDValue Imm = CurDAG->getTargetConstant(
       ShAmt, DL, N0.getOperand(1).getValueType(), false);
 
-  if (ShAmt + HsAmt != 64)
+  unsigned VTSizeInBits = VT.getScalarSizeInBits();
+  if (ShAmt + HsAmt != VTSizeInBits)
     return false;
 
   if (!IsXOROperand) {
@@ -4640,33 +4667,76 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
     SDNode *MOV =
         CurDAG->getMachineNode(AArch64::MOVIv2d_ns, DL, MVT::v2i64, Zero);
     SDValue MOVIV = SDValue(MOV, 0);
+
     R1 = N1->getOperand(0);
     R2 = MOVIV;
   }
 
-  // If the input is a v1i64, widen to a v2i64 to use XAR.
-  assert((VT == MVT::v1i64 || VT == MVT::v2i64) && "Unexpected XAR type!");
-  if (VT == MVT::v1i64) {
-    EVT SVT = MVT::v2i64;
+  if (SVT != VT) {
     SDValue Undef =
-        SDValue(CurDAG->getMachineNode(AArch64::IMPLICIT_DEF, DL, SVT), 0);
-    SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+        SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, SVT), 0);
+
+    if (SVT.isScalableVector() && VT.is64BitVector()) {
+      EVT QVT = VT.getDoubleNumVectorElementsVT(*CurDAG->getContext());
+
+      SDValue UndefQ = SDValue(
+          CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, QVT), 0);
+      SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+
+      R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, QVT,
+                                          UndefQ, R1, DSub),
+                   0);
+      if (R2.getValueType() == VT)
+        R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, QVT,
+                                            UndefQ, R2, DSub),
+                     0);
+    }
+
+    SDValue SubReg = CurDAG->getTargetConstant(
+        (SVT.isScalableVector() ? AArch64::zsub : AArch64::dsub), DL, MVT::i32);
+
     R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, Undef,
-                                        R1, DSub),
+                                        R1, SubReg),
                  0);
-    if (R2.getValueType() == MVT::v1i64)
+
+    if (SVT.isScalableVector() || R2.getValueType() != SVT)
       R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT,
-                                          Undef, R2, DSub),
+                                          Undef, R2, SubReg),
                    0);
   }
 
   SDValue Ops[] = {R1, R2, Imm};
-  SDNode *XAR = CurDAG->getMachineNode(AArch64::XAR, DL, MVT::v2i64, Ops);
+  SDNode *XAR = nullptr;
 
-  if (VT == MVT::v1i64) {
-    SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
-    XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT,
-                                 SDValue(XAR, 0), DSub);
+  if (SVT.isScalableVector()) {
+    if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::Int>(
+            SVT, {AArch64::XAR_ZZZI_B, AArch64::XAR_ZZZI_H, AArch64::XAR_ZZZI_S,
+                  AArch64::XAR_ZZZI_D}))
+      XAR = CurDAG->getMachineNode(Opc, DL, SVT, Ops);
+  } else {
+    XAR = CurDAG->getMachineNode(AArch64::XAR, DL, SVT, Ops);
+  }
+
+  assert(XAR && "Unexpected NULL value for XAR instruction in DAG");
+
+  if (SVT != VT) {
+    if (VT.is64BitVector() && SVT.isScalableVector()) {
+      EVT QVT = VT.getDoubleNumVectorElementsVT(*CurDAG->getContext());
+
+      SDValue ZSub = CurDAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+      SDNode *Q = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, QVT,
+                                         SDValue(XAR, 0), ZSub);
+
+      SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+      XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT,
+                                   SDValue(Q, 0), DSub);
+    } else {
+      SDValue SubReg = CurDAG->getTargetConstant(
+          (SVT.isScalableVector() ? AArch64::zsub : AArch64::dsub), DL,
+          MVT::i32);
+      XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT,
+                                   SDValue(XAR, 0), SubReg);
+    }
   }
   ReplaceNode(N, XAR);
   return true;
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index d682f4f4a1bf..652617b58eaf 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
 ; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
+
+/* 128-bit vectors */
 
 define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) {
 ; SHA3-LABEL: xar:
@@ -14,6 +17,14 @@ define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) {
 ; NOSHA3-NEXT:    shl v0.2d, v1.2d, #10
 ; NOSHA3-NEXT:    usra v0.2d, v1.2d, #54
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #54
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
     %a = xor <2 x i64> %x, %y
     %b = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> <i64 10, i64 10>)
     ret <2 x i64> %b
@@ -34,24 +45,40 @@ define <1 x i64> @xar_v1i64(<1 x i64> %a, <1 x i64> %b) {
 ; NOSHA3-NEXT:    shl d0, d1, #1
 ; NOSHA3-NEXT:    usra d0, d1, #63
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_v1i64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #63
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
   %v.val = xor <1 x i64> %a, %b
   %fshl = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %v.val, <1 x i64> %v.val, <1 x i64> splat (i64 1))
   ret <1 x i64> %fshl
 }
 
-define <2 x i64> @xar_instead_of_or1(<2 x i64> %r) {
-; SHA3-LABEL: xar_instead_of_or1:
+define <2 x i64> @xar_instead_of_or_v2i64(<2 x i64> %r) {
+; SHA3-LABEL: xar_instead_of_or_v2i64:
 ; SHA3:       // %bb.0: // %entry
 ; SHA3-NEXT:    movi v1.2d, #0000000000000000
 ; SHA3-NEXT:    xar v0.2d, v0.2d, v1.2d, #39
 ; SHA3-NEXT:    ret
 ;
-; NOSHA3-LABEL: xar_instead_of_or1:
+; NOSHA3-LABEL: xar_instead_of_or_v2i64:
 ; NOSHA3:       // %bb.0: // %entry
 ; NOSHA3-NEXT:    shl v1.2d, v0.2d, #25
 ; NOSHA3-NEXT:    usra v1.2d, v0.2d, #39
 ; NOSHA3-NEXT:    mov v0.16b, v1.16b
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v2i64:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #39
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
 entry:
   %or = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 25))
   ret <2 x i64> %or
@@ -72,67 +99,266 @@ define <1 x i64> @xar_instead_of_or_v1i64(<1 x i64> %v.val) {
 ; NOSHA3-NEXT:    usra d1, d0, #63
 ; NOSHA3-NEXT:    fmov d0, d1
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v1i64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #63
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
   %fshl = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %v.val, <1 x i64> %v.val, <1 x i64> splat (i64 1))
   ret <1 x i64> %fshl
 }
 
-define <4 x i32> @xar_instead_of_or2(<4 x i32> %r) {
-; SHA3-LABEL: xar_instead_of_or2:
+define <4 x i32> @xar_instead_of_or_v4i32(<4 x i32> %r) {
+; SHA3-LABEL: xar_instead_of_or_v4i32:
 ; SHA3:       // %bb.0: // %entry
 ; SHA3-NEXT:    shl v1.4s, v0.4s, #25
 ; SHA3-NEXT:    usra v1.4s, v0.4s, #7
 ; SHA3-NEXT:    mov v0.16b, v1.16b
 ; SHA3-NEXT:    ret
 ;
-; NOSHA3-LABEL: xar_instead_of_or2:
+; NOSHA3-LABEL: xar_instead_of_or_v4i32:
 ; NOSHA3:       // %bb.0: // %entry
 ; NOSHA3-NEXT:    shl v1.4s, v0.4s, #25
 ; NOSHA3-NEXT:    usra v1.4s, v0.4s, #7
 ; NOSHA3-NEXT:    mov v0.16b, v1.16b
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v4i32:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #7
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
 entry:
   %or = call <4 x i32> @llvm.fshl.v2i32(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 25))
   ret <4 x i32> %or
 }
 
-define <8 x i16> @xar_instead_of_or3(<8 x i16> %r) {
-; SHA3-LABEL: xar_instead_of_or3:
+define <8 x i16> @xar_instead_of_or_v8i16(<8 x i16> %r) {
+; SHA3-LABEL: xar_instead_of_or_v8i16:
 ; SHA3:       // %bb.0: // %entry
 ; SHA3-NEXT:    shl v1.8h, v0.8h, #9
 ; SHA3-NEXT:    usra v1.8h, v0.8h, #7
 ; SHA3-NEXT:    mov v0.16b, v1.16b
 ; SHA3-NEXT:    ret
 ;
-; NOSHA3-LABEL: xar_instead_of_or3:
+; NOSHA3-LABEL: xar_instead_of_or_v8i16:
 ; NOSHA3:       // %bb.0: // %entry
 ; NOSHA3-NEXT:    shl v1.8h, v0.8h, #9
 ; NOSHA3-NEXT:    usra v1.8h, v0.8h, #7
 ; NOSHA3-NEXT:    mov v0.16b, v1.16b
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v8i16:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #7
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
 entry:
   %or = call <8 x i16> @llvm.fshl.v2i16(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 25))
   ret <8 x i16> %or
 }
 
-define <16 x i8> @xar_instead_of_or4(<16 x i8> %r) {
-; SHA3-LABEL: xar_instead_of_or4:
+define <16 x i8> @xar_instead_of_or_v16i8(<16 x i8> %r) {
+; SHA3-LABEL: xar_instead_of_or_v16i8:
 ; SHA3:       // %bb.0: // %entry
 ; SHA3-NEXT:    add v1.16b, v0.16b, v0.16b
 ; SHA3-NEXT:    usra v1.16b, v0.16b, #7
 ; SHA3-NEXT:    mov v0.16b, v1.16b
 ; SHA3-NEXT:    ret
 ;
-; NOSHA3-LABEL: xar_instead_of_or4:
+; NOSHA3-LABEL: xar_instead_of_or_v16i8:
 ; NOSHA3:       // %bb.0: // %entry
 ; NOSHA3-NEXT:    add v1.16b, v0.16b, v0.16b
 ; NOSHA3-NEXT:    usra v1.16b, v0.16b, #7
 ; NOSHA3-NEXT:    mov v0.16b, v1.16b
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v16i8:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #7
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
 entry:
   %or = call <16 x i8> @llvm.fshl.v2i8(<16 x i8> %r, <16 x i8> %r, <16 x i8> splat (i8 25))
   ret <16 x i8> %or
 }
 
+/* 64 bit vectors */
+
+define <2 x i32> @xar_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; SHA3-LABEL: xar_v2i32:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; SHA3-NEXT:    shl v0.2s, v1.2s, #25
+; SHA3-NEXT:    usra v0.2s, v1.2s, #7
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_v2i32:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; NOSHA3-NEXT:    shl v0.2s, v1.2s, #25
+; NOSHA3-NEXT:    usra v0.2s, v1.2s, #7
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_v2i32:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %a = xor <2 x i32> %x, %y
+  %b = call <2 x i32> @llvm.fshl(<2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 25, i32 25>)
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @xar_instead_of_or_v2i32(<2 x i32> %r) {
+; SHA3-LABEL: xar_instead_of_or_v2i32:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    shl v1.2s, v0.2s, #25
+; SHA3-NEXT:    usra v1.2s, v0.2s, #7
+; SHA3-NEXT:    fmov d0, d1
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_instead_of_or_v2i32:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    shl v1.2s, v0.2s, #25
+; NOSHA3-NEXT:    usra v1.2s, v0.2s, #7
+; NOSHA3-NEXT:    fmov d0, d1
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v2i32:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %or = call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 25))
+  ret <2 x i32> %or
+}
+
+define <4 x i16> @xar_v4i16(<4 x i16> %x, <4 x i16> %y) {
+; SHA3-LABEL: xar_v4i16:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; SHA3-NEXT:    shl v0.4h, v1.4h, #9
+; SHA3-NEXT:    usra v0.4h, v1.4h, #7
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_v4i16:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; NOSHA3-NEXT:    shl v0.4h, v1.4h, #9
+; NOSHA3-NEXT:    usra v0.4h, v1.4h, #7
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_v4i16:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %a = xor <4 x i16> %x, %y
+  %b = call <4 x i16> @llvm.fshl(<4 x i16> %a, <4 x i16> %a, <4 x i16> splat (i16 25))
+  ret <4 x i16> %b
+}
+
+define <4 x i16> @xar_instead_of_or_v4i16(<4 x i16> %r) {
+; SHA3-LABEL: xar_instead_of_or_v4i16:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    shl v1.4h, v0.4h, #9
+; SHA3-NEXT:    usra v1.4h, v0.4h, #7
+; SHA3-NEXT:    fmov d0, d1
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_instead_of_or_v4i16:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    shl v1.4h, v0.4h, #9
+; NOSHA3-NEXT:    usra v1.4h, v0.4h, #7
+; NOSHA3-NEXT:    fmov d0, d1
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v4i16:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %or = call <4 x i16> @llvm.fshl(<4 x i16> %r, <4 x i16> %r, <4 x i16> splat (i16 25))
+  ret <4 x i16> %or
+}
+
+define <8 x i8> @xar_v8i8(<8 x i8> %x, <8 x i8> %y) {
+; SHA3-LABEL: xar_v8i8:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; SHA3-NEXT:    add v0.8b, v1.8b, v1.8b
+; SHA3-NEXT:    usra v0.8b, v1.8b, #7
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_v8i8:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; NOSHA3-NEXT:    add v0.8b, v1.8b, v1.8b
+; NOSHA3-NEXT:    usra v0.8b, v1.8b, #7
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_v8i8:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %a = xor <8 x i8> %x, %y
+  %b = call <8 x i8> @llvm.fshl(<8 x i8> %a, <8 x i8> %a, <8 x i8> splat (i8 25))
+  ret <8 x i8> %b
+}
+
+define <8 x i8> @xar_instead_of_or_v8i8(<8 x i8> %r) {
+; SHA3-LABEL: xar_instead_of_or_v8i8:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    add v1.8b, v0.8b, v0.8b
+; SHA3-NEXT:    usra v1.8b, v0.8b, #7
+; SHA3-NEXT:    fmov d0, d1
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_instead_of_or_v8i8:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    add v1.8b, v0.8b, v0.8b
+; NOSHA3-NEXT:    usra v1.8b, v0.8b, #7
+; NOSHA3-NEXT:    fmov d0, d1
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v8i8:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %or = call <8 x i8> @llvm.fshl(<8 x i8> %r, <8 x i8> %r, <8 x i8> splat (i8 25))
+  ret <8 x i8> %or
+}
+
 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)

From 2efff47363f18966cd37461323b5db5418183534 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston@google.com>
Date: Wed, 11 Jun 2025 22:43:06 -0700
Subject: [PATCH 0095/1322] [NFCI][msan] Show that shadow for partially
 undefined constant vectors is computed as fully initialized (#143823)

This happens because `getShadow(Value *V)` has a special case for fully undefined/poisoned values, but partially undefined values fall-through and are given a clean shadow. This leads to false negatives (no false positives).

Note: MSan correctly handles InsertElementInst, but the shadow of the initial constant vector may still be wrong and be propagated.

Showing that the same approximation happens for other composite types is left as an exercise for the reader.
---
 .../Instrumentation/MemorySanitizer.cpp       |  4 +
 .../MemorySanitizer/partial-poison.ll         | 78 +++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index c2315d5de704..d3c6a7151ec3 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2085,6 +2085,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       assert(ShadowPtr && "Could not find shadow for an argument");
       return ShadowPtr;
     }
+
+    // TODO: Partially undefined vectors are handled by the fall-through case
+    //       below (see partial-poison.ll); this causes false negatives.
+
     // For everything else the shadow is zero.
     return getCleanShadow(V);
   }
diff --git a/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll
new file mode 100644
index 000000000000..5164441c17e1
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes='msan' 2>&1 | FileCheck %s
+;
+; Test case to show that MSan computes shadows for partially poisoned vectors
+; as fully initialized, resulting in false negatives.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <2 x i64> @left_poison(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @left_poison(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> <i64 poison, i64 42>
+;
+  ret <2 x i64> <i64 poison, i64 42>
+}
+
+define <2 x i64> @right_poison(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @right_poison(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> <i64 42, i64 poison>
+;
+  ret <2 x i64> <i64 42, i64 poison>
+}
+
+define <2 x i64> @full_poison(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @full_poison(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> splat (i64 -1), ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> poison
+;
+  ret <2 x i64> <i64 poison, i64 poison>
+}
+
+define <2 x i64> @no_poison_or_undef(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @no_poison_or_undef(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> splat (i64 42)
+;
+  ret <2 x i64> <i64 42, i64 42>
+}
+
+define <2 x i64> @left_undef(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @left_undef(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> <i64 undef, i64 42>
+;
+  ret <2 x i64> <i64 undef, i64 42>
+}
+
+define <2 x i64> @right_undef(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @right_undef(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> <i64 42, i64 undef>
+;
+  ret <2 x i64> <i64 42, i64 undef>
+}
+
+define <2 x i64> @full_undef(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @full_undef(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> splat (i64 -1), ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  ret <2 x i64> <i64 undef, i64 undef>
+}

From bec85f3b187f57713e01191381c88134e122bd35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 12 Jun 2025 08:58:26 +0300
Subject: [PATCH 0096/1322] [LLD] [COFF] [test] Readd lto-late-arm.ll (#143494)

This testcase was removed in 4cafd28b7dd92080103d11cccc78d9a2f01e1242,
as a082f665f85b1002ab22af263eeafceca5288657 had made it no longer
trigger the error that it was supposed to do. (Because the latter of
those two commits makes the symbol "__rt_sdiv" be included among the
potential libcalls listed by lto::LTO::getRuntimeLibcallSymbols().)

Readd the test as a positive test, making sure that such libcalls can
get linked.

We do have preexisting test coverage for LTO libcalls overall in
libcall-archive.ll, but readd this test to cover specifically the ARM
division helper functions as well.
---
 lld/test/COFF/lto-late-arm.ll | 38 +++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 lld/test/COFF/lto-late-arm.ll

diff --git a/lld/test/COFF/lto-late-arm.ll b/lld/test/COFF/lto-late-arm.ll
new file mode 100644
index 000000000000..1070fc52a013
--- /dev/null
+++ b/lld/test/COFF/lto-late-arm.ll
@@ -0,0 +1,38 @@
+; REQUIRES: arm
+
+;; A bitcode file can generate undefined references to symbols that weren't
+;; listed as undefined on the bitcode file itself, when lowering produces
+;; calls to e.g. builtin helper functions. Ideally all those functions are
+;; listed by lto::LTO::getRuntimeLibcallSymbols(), then we successfully
+;; can link cases when the helper functions are provided as bitcode too.
+;; (In practice, compiler-rt builtins are always compiled with -fno-lto, so
+;; this shouldn't really happen anyway.)
+
+; RUN: rm -rf %t.dir
+; RUN: split-file %s %t.dir
+; RUN: llvm-as %t.dir/main.ll -o %t.main.obj
+; RUN: llvm-as %t.dir/sdiv.ll -o %t.sdiv.obj
+; RUN: llvm-ar rcs %t.sdiv.lib %t.sdiv.obj
+
+; RUN: lld-link /entry:entry %t.main.obj %t.sdiv.lib /out:%t.exe /subsystem:console
+
+;--- main.ll
+target datalayout = "e-m:w-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7-w64-windows-gnu"
+
+@num = dso_local global i32 100
+
+define dso_local arm_aapcs_vfpcc i32 @entry(i32 %param) {
+entry:
+  %0 = load i32, ptr @num
+  %div = sdiv i32 %0, %param
+  ret i32 %div
+}
+;--- sdiv.ll
+target datalayout = "e-m:w-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7-w64-windows-gnu"
+
+define dso_local arm_aapcs_vfpcc void @__rt_sdiv() {
+entry:
+  ret void
+}

From 9d491bc602c2d9730cb42fe25f0753471a3af389 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 12 Jun 2025 07:03:09 +0100
Subject: [PATCH 0097/1322] [AArch64][GlobalISel] Enable
 extract_vec_elt_combines postlegalization.

---
 llvm/lib/Target/AArch64/AArch64Combine.td     |  2 +-
 .../AArch64/vec-combine-compare-to-bitmask.ll | 51 +++++++------------
 2 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 571e2692cbff..ca09598464d1 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -361,7 +361,7 @@ def AArch64PostLegalizerCombiner
                         ptr_add_immed_chain, overlapping_and,
                         split_store_zero_128, undef_combines,
                         select_to_minmax, or_to_bsp, combine_concat_vector,
-                        commute_constant_to_rhs,
+                        commute_constant_to_rhs, extract_vec_elt_combines,
                         push_freeze_to_prevent_poison_from_propagating,
                         combine_mul_cmlt, combine_use_vector_truncate, extmultomull]> {
 }
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 77483ebb2235..d6d323530946 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -596,23 +596,15 @@ define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
 ; CHECK-GI-NEXT:    mov.b v1[3], w8
 ; CHECK-GI-NEXT:    cmeq.8b v0, v0, v1
 ; CHECK-GI-NEXT:    mvn.8b v0, v0
-; CHECK-GI-NEXT:    umov.b w8, v0[0]
-; CHECK-GI-NEXT:    umov.b w9, v0[1]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    umov.b w8, v0[2]
-; CHECK-GI-NEXT:    mov.s v1[1], w9
-; CHECK-GI-NEXT:    umov.b w9, v0[3]
-; CHECK-GI-NEXT:    mov.s v1[2], w8
-; CHECK-GI-NEXT:    mov.s v1[3], w9
-; CHECK-GI-NEXT:    mov.s w8, v1[1]
-; CHECK-GI-NEXT:    mov.s w9, v1[2]
-; CHECK-GI-NEXT:    fmov w11, s1
-; CHECK-GI-NEXT:    mov.s w10, v1[3]
+; CHECK-GI-NEXT:    umov.b w8, v0[1]
+; CHECK-GI-NEXT:    umov.b w9, v0[0]
+; CHECK-GI-NEXT:    umov.b w10, v0[2]
+; CHECK-GI-NEXT:    umov.b w11, v0[3]
 ; CHECK-GI-NEXT:    and w8, w8, #0x1
-; CHECK-GI-NEXT:    bfi w11, w8, #1, #31
-; CHECK-GI-NEXT:    and w8, w9, #0x1
-; CHECK-GI-NEXT:    and w9, w10, #0x1
-; CHECK-GI-NEXT:    orr w8, w11, w8, lsl #2
+; CHECK-GI-NEXT:    bfi w9, w8, #1, #31
+; CHECK-GI-NEXT:    and w8, w10, #0x1
+; CHECK-GI-NEXT:    orr w8, w9, w8, lsl #2
+; CHECK-GI-NEXT:    and w9, w11, #0x1
 ; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #3
 ; CHECK-GI-NEXT:    strb w8, [sp, #15]
 ; CHECK-GI-NEXT:    and w0, w8, #0xff
@@ -871,28 +863,19 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
 ; CHECK-GI-NEXT:    cmtst.4s v1, v1, v1
 ; CHECK-GI-NEXT:    mov.s w8, v1[1]
 ; CHECK-GI-NEXT:    mov.s w9, v1[2]
+; CHECK-GI-NEXT:    fmov w11, s1
 ; CHECK-GI-NEXT:    mov.s w10, v1[3]
-; CHECK-GI-NEXT:    mov.h v1[1], w8
-; CHECK-GI-NEXT:    mov.s w8, v0[1]
-; CHECK-GI-NEXT:    mov.h v1[2], w9
-; CHECK-GI-NEXT:    mov.h v1[3], w10
-; CHECK-GI-NEXT:    mov.h v1[4], v0[0]
-; CHECK-GI-NEXT:    mov.h v1[5], w8
-; CHECK-GI-NEXT:    umov.h w8, v1[1]
-; CHECK-GI-NEXT:    umov.h w9, v1[0]
-; CHECK-GI-NEXT:    umov.h w10, v1[2]
-; CHECK-GI-NEXT:    umov.h w11, v1[3]
 ; CHECK-GI-NEXT:    and w8, w8, #0x1
-; CHECK-GI-NEXT:    bfi w9, w8, #1, #31
-; CHECK-GI-NEXT:    and w8, w10, #0x1
-; CHECK-GI-NEXT:    umov.h w10, v1[4]
-; CHECK-GI-NEXT:    orr w8, w9, w8, lsl #2
-; CHECK-GI-NEXT:    and w9, w11, #0x1
-; CHECK-GI-NEXT:    umov.h w11, v1[5]
-; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #3
+; CHECK-GI-NEXT:    bfi w11, w8, #1, #31
+; CHECK-GI-NEXT:    and w8, w9, #0x1
 ; CHECK-GI-NEXT:    and w9, w10, #0x1
+; CHECK-GI-NEXT:    mov.s w10, v0[1]
+; CHECK-GI-NEXT:    orr w8, w11, w8, lsl #2
+; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #3
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    and w9, w9, #0x1
 ; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #4
-; CHECK-GI-NEXT:    and w9, w11, #0x1
+; CHECK-GI-NEXT:    and w9, w10, #0x1
 ; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #5
 ; CHECK-GI-NEXT:    and w8, w8, #0x3f
 ; CHECK-GI-NEXT:    strb w8, [sp, #15]

From 3f0cf742ac4eb3437450f8f263081ea951248851 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 12 Jun 2025 14:40:38 +0800
Subject: [PATCH 0098/1322] [C++20] [Modules] [Reduced BMI] Don't write
 specializations with local args

Close https://github.com/llvm/llvm-project/issues/119947

As discussed in the above thread, we shouldn't write specializations
with local args in reduced BMI. Since users can't find such
specializations any way.
---
 clang/lib/Serialization/ASTWriterDecl.cpp | 45 +++++++++++++++++++
 clang/test/Modules/pr119947.cppm          | 54 +++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 clang/test/Modules/pr119947.cppm

diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 8f82324a2753..052cb5a253bf 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -221,6 +221,48 @@ namespace clang {
         Record.AddDeclRef(F.second);
     }
 
+    template <typename T> bool shouldSkipWritingSpecializations(T *Spec) {
+      // Now we will only avoid writing specializations if we're generating
+      // reduced BMI.
+      if (!GeneratingReducedBMI)
+        return false;
+
+      assert((isa<FunctionDecl, ClassTemplateSpecializationDecl,
+                  VarTemplateSpecializationDecl>(Spec)));
+
+      ArrayRef<TemplateArgument> Args;
+      if (auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(Spec))
+        Args = CTSD->getTemplateArgs().asArray();
+      else if (auto *VTSD = dyn_cast<VarTemplateSpecializationDecl>(Spec))
+        Args = VTSD->getTemplateArgs().asArray();
+      else
+        Args = cast<FunctionDecl>(Spec)
+                   ->getTemplateSpecializationArgs()
+                   ->asArray();
+
+      // If there is any template argument is TULocal, we can avoid writing the
+      // specialization since the consumers of reduced BMI won't get the
+      // specialization anyway.
+      for (const TemplateArgument &TA : Args) {
+        switch (TA.getKind()) {
+        case TemplateArgument::Type: {
+          Linkage L = TA.getAsType()->getLinkage();
+          if (!isExternallyVisible(L))
+            return true;
+          break;
+        }
+        case TemplateArgument::Declaration:
+          if (!TA.getAsDecl()->isExternallyVisible())
+            return true;
+          break;
+        default:
+          break;
+        }
+      }
+
+      return false;
+    }
+
     /// Add to the record the first template specialization from each module
     /// file that provides a declaration of D. We store the DeclId and an
     /// ODRHash of the template arguments of D which should provide enough
@@ -235,6 +277,9 @@ namespace clang {
       CollectFirstDeclFromEachModule(D, /*IncludeLocal*/ true, Firsts);
 
       for (const auto &F : Firsts) {
+        if (shouldSkipWritingSpecializations(F.second))
+          continue;
+
         if (isa<ClassTemplatePartialSpecializationDecl,
                 VarTemplatePartialSpecializationDecl>(F.second))
           PartialSpecsInMap.push_back(F.second);
diff --git a/clang/test/Modules/pr119947.cppm b/clang/test/Modules/pr119947.cppm
new file mode 100644
index 000000000000..40de2cad3c0d
--- /dev/null
+++ b/clang/test/Modules/pr119947.cppm
@@ -0,0 +1,54 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -emit-llvm -o -
+
+
+//--- a.cppm
+export module a;
+
+struct a_inner {
+	~a_inner() {
+	}
+	void f(auto) {
+	}
+};
+
+export template<typename T>
+struct a {
+	a() {
+		struct local {};
+		inner.f(local());
+	}
+private:
+	a_inner inner;
+};
+
+
+namespace {
+
+struct s {
+};
+
+} // namespace
+
+void f() {
+	a<s> x;
+}
+
+//--- use.cpp
+import a;
+
+namespace {
+
+struct s {
+};
+
+} // namespace
+
+void g() {
+	a<s> x;
+}
+

From 6157028fea93ff14af18b173dd01eb431cfb6aef Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 09:19:50 +0200
Subject: [PATCH 0099/1322] [BasicAA][ValueTracking] Increase depth for
 underlying object search (#143714)

This depth limits a linear search (rather than the usual potentially
exponential one) and is not particularly important for compile-time in
practice.

The change in #137297 is going to increase the length of GEP chains, so
I'd like to increase this limit a bit to reduce the chance of
regressions (https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2419
showed a 13% increase in SearchLimitReached). There is no particular
significance to the new value of 10.

Compile-time is neutral.
---
 llvm/include/llvm/Analysis/ValueTracking.h    |  2 +-
 .../BasicAA/gep-decomposition-limit.ll        | 38 +++++++++++--------
 .../underlying-objects-2.ll                   |  5 ++-
 .../inline-noalias-unidentify-object.ll       | 22 +++++++----
 4 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 32ab9733d13c..e215c90b5a72 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -47,7 +47,7 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6;
 
 /// The max limit of the search depth in DecomposeGEPExpression() and
 /// getUnderlyingObject().
-constexpr unsigned MaxLookupSearchDepth = 6;
+constexpr unsigned MaxLookupSearchDepth = 10;
 
 /// Determine which bits of V are known to be either zero or one and return
 /// them in the KnownZero/KnownOne bit sets.
diff --git a/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll b/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll
index 23a96ebca848..a256ececbe56 100644
--- a/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll
+++ b/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll
@@ -2,22 +2,22 @@
 
 ; CHECK-LABEL: Function: test
 ;; Before limit:
-; CHECK-DAG: MustAlias: i8* %gep.add5, i8* %gep.inc5
-; CHECK-DAG: NoAlias: i8* %gep.inc3, i8* %gep.inc5
-; CHECK-DAG: NoAlias: i8* %gep.inc4, i8* %gep.inc5
+; CHECK-DAG: MustAlias: i8* %gep.add9, i8* %gep.inc9
+; CHECK-DAG: NoAlias: i8* %gep.inc7, i8* %gep.inc9
+; CHECK-DAG: NoAlias: i8* %gep.inc8, i8* %gep.inc9
 ;; At limit:
-; CHECK-DAG: MustAlias: i8* %gep.add6, i8* %gep.inc6
-; CHECK-DAG: NoAlias: i8* %gep.inc4, i8* %gep.inc6
-; CHECK-DAG: NoAlias: i8* %gep.inc5, i8* %gep.inc6
+; CHECK-DAG: MustAlias: i8* %gep.add10, i8* %gep.inc10
+; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc8
+; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc9
 ;; After limit:
-; CHECK-DAG: MayAlias: i8* %gep.add7, i8* %gep.inc7
-; CHECK-DAG: MayAlias: i8* %gep.inc5, i8* %gep.inc7
-; CHECK-DAG: NoAlias: i8* %gep.inc6, i8* %gep.inc7
+; CHECK-DAG: MayAlias: i8* %gep.add11, i8* %gep.inc11
+; CHECK-DAG: MayAlias: i8* %gep.inc11, i8* %gep.inc9
+; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc11
 
 define void @test(ptr %base) {
-  %gep.add5 = getelementptr i8, ptr %base, i64 5
-  %gep.add6 = getelementptr i8, ptr %base, i64 6
-  %gep.add7 = getelementptr i8, ptr %base, i64 7
+  %gep.add9 = getelementptr i8, ptr %base, i64 9
+  %gep.add10 = getelementptr i8, ptr %base, i64 10
+  %gep.add11 = getelementptr i8, ptr %base, i64 11
 
   %gep.inc1 = getelementptr i8, ptr %base, i64 1
   %gep.inc2 = getelementptr i8, ptr %gep.inc1, i64 1
@@ -26,15 +26,23 @@ define void @test(ptr %base) {
   %gep.inc5 = getelementptr i8, ptr %gep.inc4, i64 1
   %gep.inc6 = getelementptr i8, ptr %gep.inc5, i64 1
   %gep.inc7 = getelementptr i8, ptr %gep.inc6, i64 1
+  %gep.inc8 = getelementptr i8, ptr %gep.inc7, i64 1
+  %gep.inc9 = getelementptr i8, ptr %gep.inc8, i64 1
+  %gep.inc10 = getelementptr i8, ptr %gep.inc9, i64 1
+  %gep.inc11 = getelementptr i8, ptr %gep.inc10, i64 1
 
-  load i8, ptr %gep.add5
-  load i8, ptr %gep.add6
-  load i8, ptr %gep.add7
+  load i8, ptr %gep.add9
+  load i8, ptr %gep.add10
+  load i8, ptr %gep.add11
   load i8, ptr %gep.inc3
   load i8, ptr %gep.inc4
   load i8, ptr %gep.inc5
   load i8, ptr %gep.inc6
   load i8, ptr %gep.inc7
+  load i8, ptr %gep.inc8
+  load i8, ptr %gep.inc9
+  load i8, ptr %gep.inc10
+  load i8, ptr %gep.inc11
 
   ret void
 }
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll
index abfdff79dc11..1d3512128678 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll
@@ -127,9 +127,12 @@ for_j.body:
   %gepB7 = getelementptr inbounds i8, ptr %gepB6, i64 0
   %gepB8 = getelementptr inbounds i8, ptr %gepB7, i64 0
   %gepB9 = getelementptr inbounds i8, ptr %gepB8, i64 0
+  %gepB10 = getelementptr inbounds i8, ptr %gepB9, i64 0
+  %gepB11 = getelementptr inbounds i8, ptr %gepB10, i64 0
+  %gepB12 = getelementptr inbounds i8, ptr %gepB11, i64 0
 
   %loadPrev = load i8, ptr %gepPrev, align 1
-  %loadB = load i8, ptr %gepB9, align 1
+  %loadB = load i8, ptr %gepB12, align 1
 
   %mul = mul i8 %loadPrev, %loadB
 
diff --git a/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll b/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll
index 54e9ee0918ae..b7ba1b32238a 100644
--- a/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll
+++ b/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll
@@ -3,15 +3,18 @@
 define i32 @caller(ptr %p) {
 ; CHECK-LABEL: define i32 @caller(ptr %p) {
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
-; CHECK-NEXT:    [[P_8_I:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8
-; CHECK-NEXT:    [[V_I:%.*]] = load i32, ptr [[P_8_I]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[P_1_I:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT:    [[P_11_I:%.*]] = getelementptr i8, ptr %p, i64 11
+; CHECK-NEXT:    [[V_I:%.*]] = load i32, ptr [[P_11_I]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[P_1_I:%.*]] = getelementptr i8, ptr %p, i64 1
 ; CHECK-NEXT:    [[P_2_I:%.*]] = getelementptr i8, ptr [[P_1_I]], i64 1
 ; CHECK-NEXT:    [[P_3_I:%.*]] = getelementptr i8, ptr [[P_2_I]], i64 1
 ; CHECK-NEXT:    [[P_4_I:%.*]] = getelementptr i8, ptr [[P_3_I]], i64 1
 ; CHECK-NEXT:    [[P_5_I:%.*]] = getelementptr i8, ptr [[P_4_I]], i64 1
 ; CHECK-NEXT:    [[P_6_I:%.*]] = getelementptr i8, ptr [[P_5_I]], i64 1
-; CHECK-NEXT:    [[P_7_I:%.*]] = getelementptr i8, ptr [[P_6_I]], i64 1
+; CHECK-NEXT:    [[P_7_I1:%.*]] = getelementptr i8, ptr [[P_6_I]], i64 1
+; CHECK-NEXT:    [[P_8_I:%.*]] = getelementptr i8, ptr [[P_7_I1]], i64 1
+; CHECK-NEXT:    [[P_9_I:%.*]] = getelementptr i8, ptr [[P_8_I]], i64 1
+; CHECK-NEXT:    [[P_7_I:%.*]] = getelementptr i8, ptr [[P_9_I]], i64 1
 ; CHECK-NEXT:    [[P_8_ALIAS_I:%.*]] = getelementptr i8, ptr [[P_7_I]], i64 1
 ; CHECK-NEXT:    store i32 42, ptr [[P_8_ALIAS_I]], align 4
 ; CHECK-NEXT:    ret i32 [[V_I]]
@@ -21,8 +24,8 @@ define i32 @caller(ptr %p) {
 }
 
 define internal i32 @callee(ptr noalias %p) {
-  %p.8 = getelementptr i8, ptr %p, i64 8
-  %v = load i32, ptr %p.8
+  %p.11 = getelementptr i8, ptr %p, i64 11
+  %v = load i32, ptr %p.11
   %p.1 = getelementptr i8, ptr %p, i64 1
   %p.2 = getelementptr i8, ptr %p.1, i64 1
   %p.3 = getelementptr i8, ptr %p.2, i64 1
@@ -30,7 +33,10 @@ define internal i32 @callee(ptr noalias %p) {
   %p.5 = getelementptr i8, ptr %p.4, i64 1
   %p.6 = getelementptr i8, ptr %p.5, i64 1
   %p.7 = getelementptr i8, ptr %p.6, i64 1
-  %p.8.alias = getelementptr i8, ptr %p.7, i64 1
-  store i32 42, ptr %p.8.alias
+  %p.8 = getelementptr i8, ptr %p.7, i64 1
+  %p.9 = getelementptr i8, ptr %p.8, i64 1
+  %p.10 = getelementptr i8, ptr %p.9, i64 1
+  %p.11.alias = getelementptr i8, ptr %p.10, i64 1
+  store i32 42, ptr %p.11.alias
   ret i32 %v
 }

From 77062244ed56be61aecda28d6fede3432545f741 Mon Sep 17 00:00:00 2001
From: Mikael Holmen <mikael.holmen@ericsson.com>
Date: Thu, 12 Jun 2025 09:29:40 +0200
Subject: [PATCH 0100/1322] Fix two instances of -Wparentheses warnings [NFC]

Add parentheses around the assert conditions.

Without this gcc warned like
 ../lib/Target/AMDGPU/GCNSchedStrategy.cpp:2250: warning: suggest parentheses around '&&' within '||' [-Wparentheses]
  2250 |          NewMI != RegionBounds.second && "cannot remove at region end");
and
 ../../clang/lib/Sema/SemaOverload.cpp:11326:39: warning: suggest parentheses around '&&' within '||' [-Wparentheses]
 11326 |          DeferredCandidatesCount == 0 &&
       |          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~
 11327 |              "Unexpected deferred template candidates");
       |              ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
---
 clang/lib/Sema/SemaOverload.cpp             | 6 +++---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index cf455f4588de..89e86f49a3ca 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -11322,9 +11322,9 @@ OverloadingResult OverloadCandidateSet::BestViableFunction(Sema &S,
                                                            SourceLocation Loc,
                                                            iterator &Best) {
 
-  assert(shouldDeferTemplateArgumentDeduction(S.getLangOpts()) ||
-         DeferredCandidatesCount == 0 &&
-             "Unexpected deferred template candidates");
+  assert((shouldDeferTemplateArgumentDeduction(S.getLangOpts()) ||
+          DeferredCandidatesCount == 0) &&
+         "Unexpected deferred template candidates");
 
   bool TwoPhaseResolution =
       DeferredCandidatesCount != 0 && !ResolutionByPerfectCandidateIsDisabled;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 0f80462050cd..7165cf89ca45 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2246,8 +2246,8 @@ void PreRARematStage::finalizeGCNSchedStage() {
 void GCNScheduleDAGMILive::updateRegionBoundaries(
     RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
     MachineInstr *NewMI) {
-  assert(!NewMI ||
-         NewMI != RegionBounds.second && "cannot remove at region end");
+  assert((!NewMI || NewMI != RegionBounds.second) &&
+         "cannot remove at region end");
 
   if (RegionBounds.first == RegionBounds.second) {
     assert(NewMI && "cannot remove from an empty region");

From 2d35b568ef949717e35df664d4d9352eddbffbfd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 09:27:24 +0100
Subject: [PATCH 0101/1322] [X86] bsf.ll - add icmp_ne coverage to bsf
 passthrough tests

---
 llvm/test/CodeGen/X86/bsf.ll | 56 ++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
index 58929115baf5..312f94c04123 100644
--- a/llvm/test/CodeGen/X86/bsf.ll
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -38,13 +38,13 @@ define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb %al, %al
-; X86-NEXT:    je .LBB1_1
+; X86-NEXT:    jne .LBB1_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB1_1:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -56,8 +56,8 @@ define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind {
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 true)
-  %2 = icmp eq i8 %x, 0
-  %3 = select i1 %2, i8 %y, i8 %1
+  %2 = icmp ne i8 %x, 0
+  %3 = select i1 %2, i8 %1, i8 %y
   ret i8 %3
 }
 
@@ -66,14 +66,14 @@ define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testw %ax, %ax
-; X86-NEXT:    je .LBB2_1
+; X86-NEXT:    jne .LBB2_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    orl $65536, %eax # imm = 0x10000
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB2_1:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl $65536, %eax # imm = 0x10000
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
@@ -87,8 +87,8 @@ define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind {
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
-  %2 = icmp eq i16 %x, 0
-  %3 = select i1 %2, i16 %y, i16 %1
+  %2 = icmp ne i16 %x, 0
+  %3 = select i1 %2, i16 %1, i16 %y
   ret i16 %3
 }
 
@@ -157,12 +157,12 @@ define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB5_1
+; X86-NEXT:    jne .LBB5_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB5_1:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cmov_bsf32_undef:
@@ -171,8 +171,8 @@ define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind {
 ; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
-  %2 = icmp eq i32 %x, 0
-  %3 = select i1 %2, i32 %y, i32 %1
+  %2 = icmp ne i32 %x, 0
+  %3 = select i1 %2, i32 %1, i32 %y
   ret i32 %3
 }
 
@@ -199,7 +199,7 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl $64, %eax
 ; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    jne .LBB6_7
-; X86-NEXT:  .LBB6_6:
+; X86-NEXT:  .LBB6_6: # %cond.end
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:  .LBB6_7: # %cond.end
@@ -218,8 +218,8 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind {
 ; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
   %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
-  %2 = icmp eq i64 %x, 0
-  %3 = select i1 %2, i64 %y, i64 %1
+  %2 = icmp ne i64 %x, 0
+  %3 = select i1 %2, i64 %1, i64 %y
   ret i64 %3
 }
 
@@ -375,10 +375,10 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    orl %ebx, %ebp
 ; X86-NEXT:    orl %edi, %ebp
 ; X86-NEXT:    je .LBB9_11
-; X86-NEXT:  # %bb.1: # %select.false.sink
+; X86-NEXT:  # %bb.1: # %select.true.sink
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB9_2
-; X86-NEXT:  # %bb.3: # %select.false.sink
+; X86-NEXT:  # %bb.3: # %select.true.sink
 ; X86-NEXT:    rep bsfl %ecx, %edi
 ; X86-NEXT:    addl $32, %edi
 ; X86-NEXT:    testl %ebx, %ebx
@@ -402,20 +402,20 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    rep bsfl %edx, %edi
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    jne .LBB9_5
-; X86-NEXT:  .LBB9_6: # %select.false.sink
+; X86-NEXT:  .LBB9_6: # %select.true.sink
 ; X86-NEXT:    rep bsfl %esi, %esi
 ; X86-NEXT:    addl $32, %esi
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    jne .LBB9_9
-; X86-NEXT:  .LBB9_8: # %select.false.sink
+; X86-NEXT:  .LBB9_8: # %select.true.sink
 ; X86-NEXT:    addl $64, %esi
 ; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:  .LBB9_9: # %select.false.sink
+; X86-NEXT:  .LBB9_9: # %select.true.sink
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
 ; X86-NEXT:    movl $0, 4(%eax)
-; X86-NEXT:  .LBB9_10: # %select.false.sink
+; X86-NEXT:  .LBB9_10: # %select.true.sink
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -427,7 +427,7 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    orq %rsi, %rax
 ; X64-NEXT:    je .LBB9_2
-; X64-NEXT:  # %bb.1: # %select.false.sink
+; X64-NEXT:  # %bb.1: # %select.true.sink
 ; X64-NEXT:    rep bsfq %rdi, %rcx
 ; X64-NEXT:    rep bsfq %rsi, %rax
 ; X64-NEXT:    addq $64, %rax
@@ -440,8 +440,8 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X64-NEXT:    movq %rcx, %rdx
 ; X64-NEXT:    retq
   %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 true)
-  %2 = icmp eq i128 %x, 0
-  %3 = select i1 %2, i128 %y, i128 %1
+  %2 = icmp ne i128 %x, 0
+  %3 = select i1 %2, i128 %1, i128 %y
   ret i128 %3
 }
 

From 6e5a1423b752c66273bfcff35aaa8083075788a8 Mon Sep 17 00:00:00 2001
From: Ian Wood <ianwood2024@u.northwestern.edu>
Date: Thu, 12 Jun 2025 01:28:27 -0700
Subject: [PATCH 0102/1322] [mlir] Reapply "Loosen restrictions on folding
 dynamic reshapes" (#142827)

The original PR https://github.com/llvm/llvm-project/pull/137963 had a
nvidia bot failure. This appears to be a flaky test because rerunning
the build was successful.

This change needs commit 6f2ba47 to fix incorrect usage of
`getReassociationIndicesForCollapse`.

Reverts llvm/llvm-project#142639

Co-authored-by: Artem Gindinson <gindinson@roofline.ai>
---
 mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp    | 374 +++++++++++++++---
 .../Dialect/Linalg/simplify-pack-unpack.mlir  |   4 +-
 mlir/test/Dialect/Tensor/canonicalize.mlir    |  39 +-
 mlir/unittests/Dialect/Utils/CMakeLists.txt   |   1 +
 .../Dialect/Utils/ReshapeOpsUtilsTest.cpp     | 203 ++++++++++
 5 files changed, 561 insertions(+), 60 deletions(-)
 create mode 100644 mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp

diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
index 1a04d702e055..3b1fdb69e8ef 100644
--- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
@@ -10,6 +10,10 @@
 
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/LogicalResult.h"
 
 #include <numeric>
 #include <optional>
@@ -28,67 +32,329 @@ mlir::getReassociationIndicesForReshape(ShapedType sourceType,
   return std::nullopt;
 }
 
+namespace {
+/// A simple struct to represent ReassociationIndices as an inclusive interval.
+/// It's designed to be feasibly minimal, so the call sites should manage the
+/// validity of the range manually.
+struct ReassociationIndexRange {
+  /// FIXME: Signed type is used for consistency with ReassociationIndices.
+  /// We should consider refactoring all reassociation utilities to use unsigned
+  /// types.
+  int64_t leftIdx = 0, rightIdx = 0;
+
+  /// Util for manual checks of the range's validity
+  LogicalResult verify() const {
+    return leftIdx >= 0 && (leftIdx <= rightIdx) ? success() : failure();
+  }
+
+  /// Checks range's containment within another range. Treats the edges
+  /// non-exclusively.
+  bool isInRange(const ReassociationIndexRange &outerRange) const {
+    return leftIdx >= outerRange.leftIdx && rightIdx <= outerRange.rightIdx;
+  }
+
+  unsigned size() const {
+    assert(succeeded(verify()));
+    return rightIdx - leftIdx + 1;
+  }
+  bool containsSingleIndex() const { return size() == 1; }
+
+  /// Collects indices that do not overlap between this and another range.
+  ReassociationIndices
+  getNonOverlappingIndicesWith(ReassociationIndexRange &rhs) const {
+    if (rightIdx < rhs.leftIdx) {
+      // The intervals do not overlap - concatenate the indices from both.
+      auto jointFullIndices = getFullIndices();
+      jointFullIndices.append(rhs.getFullIndices());
+      return jointFullIndices;
+    }
+    ReassociationIndices result;
+    // Handle the chunk left of the overlapping range.
+    int64_t leftStart = std::min(leftIdx, rhs.leftIdx);
+    int64_t leftEnd = std::max(leftIdx, rhs.leftIdx);
+    llvm::append_range(result, llvm::seq(leftStart, leftEnd));
+    // Handle the chunk right of the overlapping range. Symmetrically, we should
+    // skip the edge of the overlap AND include the rightmost index.
+    int64_t rightStart = std::min(rightIdx, rhs.rightIdx) + 1;
+    int64_t rightEnd = std::max(rightIdx, rhs.rightIdx);
+    if (rightStart < rightEnd)
+      llvm::append_range(result, llvm::seq_inclusive(rightStart, rightEnd));
+    return result;
+  }
+
+  /// Converts the range into ReassociationIndices.
+  ReassociationIndices getFullIndices() const {
+    ReassociationIndices result;
+    for (int64_t idx = leftIdx; idx <= rightIdx; ++idx) {
+      result.push_back(idx);
+    }
+    return result;
+  }
+};
+} // namespace
+
+/// Starting from `sourceStartIdx`, searches `sourceShape` for the first
+/// sequence that can be collapsed into a dynamic dimension (at least one must
+/// be present in the source).
+/// By default, lazily returns once the first dynamic dimension has been found.
+/// Setting `matchGreedily` as `true` will also mark all subsequent
+/// source dimensions for collapsing into the target.
+static FailureOr<ReassociationIndexRange>
+findReassociationRangeForDynamicDim(ArrayRef<int64_t> sourceShape,
+                                    int64_t sourceStartIdx,
+                                    bool matchGreedily = false) {
+  const unsigned numSourceDims = sourceShape.size();
+  ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1};
+  std::optional<ReassociationIndexRange> resultRange = std::nullopt;
+
+  ReassociationIndexRange iterationRange{sourceStartIdx, sourceStartIdx};
+  for (; iterationRange.isInRange(sourceShapeAsRange);
+       iterationRange.rightIdx++) {
+    int64_t sourceSize = sourceShape[iterationRange.rightIdx];
+    if (sourceSize == ShapedType::kDynamic) {
+      resultRange = iterationRange;
+      break;
+    }
+  }
+  if (!resultRange)
+    return failure();
+  if (matchGreedily)
+    resultRange->rightIdx = sourceShapeAsRange.rightIdx;
+  return *resultRange;
+}
+
+/// Starting from `sourceStartIdx`, searches `sourceShape` for the first
+/// sequence of static dimensions such that their product matches `targetSize`.
+/// By default, lazily returns once the product matches the target size. Setting
+/// `matchGreedily` as `true` will append all neighboring unit dimensions
+/// (dimensions of 1) to the match.
+static FailureOr<ReassociationIndexRange>
+findReassociationRangeForSize(ArrayRef<int64_t> sourceShape,
+                              int64_t sourceStartIdx, int64_t targetSize,
+                              bool matchGreedily = false) {
+  const unsigned numSourceDims = sourceShape.size();
+  ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1};
+  std::optional<ReassociationIndexRange> resultRange = std::nullopt;
+
+  ReassociationIndexRange iterationRange{sourceStartIdx, sourceStartIdx};
+  int64_t prodOfCollapsedDims = 1;
+  while (iterationRange.isInRange(sourceShapeAsRange)) {
+    int64_t sourceSize = sourceShape[iterationRange.rightIdx];
+    if (sourceSize == ShapedType::kDynamic) {
+      // Reassociation for a static dim cannot include a dynamic dim. Reset
+      // induction variables to essentially restart the loop from the next
+      // source dimension.
+      prodOfCollapsedDims = 1;
+      iterationRange = {iterationRange.rightIdx + 1,
+                        iterationRange.rightIdx + 1};
+      continue;
+    }
+    prodOfCollapsedDims *= sourceSize;
+    // If the target size has been exceeded without matching, we need to shift
+    // the range start right. From the start of the range, roll back the
+    // multiplication until the target size exceeds the product again.
+    while (prodOfCollapsedDims > targetSize &&
+           !iterationRange.containsSingleIndex()) {
+      int64_t frontSourceSize = sourceShape[iterationRange.leftIdx];
+      prodOfCollapsedDims /= frontSourceSize;
+      // Shrink the range rightwards
+      iterationRange.leftIdx++;
+    }
+    // We could've reached the target size with the current dimension,
+    // also as a result of the above shift to right.
+    if (prodOfCollapsedDims == targetSize) {
+      resultRange = iterationRange;
+      break;
+    }
+    // Increment the iteration range
+    iterationRange.rightIdx++;
+  }
+  if (!resultRange)
+    return failure();
+  if (matchGreedily) {
+    // We now want to collect all unit dimensions directly after the target
+    // product match. Advance the iterator to avoid OOB when the product match
+    // happens at the last element.
+    iterationRange.rightIdx++;
+    while (iterationRange.isInRange(sourceShapeAsRange) &&
+           sourceShape[iterationRange.rightIdx] == 1) {
+      resultRange = iterationRange;
+      iterationRange.rightIdx++;
+    }
+  }
+  return *resultRange;
+}
+
+/// Attempts to find a valid collapsing reassociation of `sourceShape` into
+/// `targetShape` through a simple traversal. If successful, an array of source
+/// index ranges is returned, correspondingly to each dimension in the target
+/// shape. The resulting indices shall fully cover the `sourceShape` without
+/// overlaps.
+///
+/// The algorithm is essentially a lazy one, searching for non-greedy matches -
+/// it will only yield a greedy match for the last target dimension.
+/// FIXME: The algorithm can only backtrack when it needs to append an offset
+/// for a static target dimension to the preceding dynamic one (this retains the
+/// linear complexity). As feasible, consider adding further backtracking
+/// routines to enable more reassociations, e.g.:
+/// - ?x2x?x2 into ?x2
+static FailureOr<SmallVector<ReassociationIndexRange>>
+findReassociationRangesForCollapse(ArrayRef<int64_t> sourceShape,
+                                   ArrayRef<int64_t> targetShape) {
+  unsigned numSourceDims = sourceShape.size(),
+           numTargetDims = targetShape.size();
+  assert(numSourceDims > numTargetDims);
+  ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1};
+
+  SmallVector<ReassociationIndexRange> reassocRanges;
+  reassocRanges.reserve(numTargetDims);
+  // We'll iterate in strides of 2 to enable pseudo-backtracking for simple
+  // cases, e.g.:
+  // - ?x2x3x5 into ?x15
+  std::optional<int64_t> prevTargetSize = std::nullopt;
+  for (unsigned targetDimIdx = 0, sourceDimIdx = 0;
+       targetDimIdx < numTargetDims; ++targetDimIdx) {
+    int64_t targetSize = targetShape[targetDimIdx];
+    // Simply check if there are any subsequent target dimensions left - if not,
+    // the match must be made greedily.
+    bool shouldMatchGreedily = targetDimIdx == numTargetDims - 1;
+    FailureOr<ReassociationIndexRange> sourceRange;
+    if (targetSize == ShapedType::kDynamic) {
+      sourceRange = findReassociationRangeForDynamicDim(
+          sourceShape, sourceDimIdx, shouldMatchGreedily);
+    } else {
+      sourceRange = findReassociationRangeForSize(
+          sourceShape, sourceDimIdx, targetSize, shouldMatchGreedily);
+    }
+
+    // Run sanity checks on the returned index range.
+    if (failed(sourceRange) || failed(sourceRange->verify()) ||
+        !sourceRange->isInRange(sourceShapeAsRange))
+      return failure();
+    if (sourceRange->leftIdx > sourceDimIdx) {
+      // If some source dimensions had to be skipped in order to find a match,
+      // they must be collapsed into the directly preceding dynamic dimension.
+      if (!prevTargetSize || prevTargetSize != ShapedType::kDynamic)
+        return failure();
+      reassocRanges.back().rightIdx = sourceRange->leftIdx - 1;
+    }
+
+    // Store the gathered information as required for the next iteration.
+    prevTargetSize = targetSize;
+    sourceDimIdx = sourceRange->rightIdx + 1;
+    reassocRanges.push_back(*sourceRange);
+  }
+  // Fail if the source shape wasn't a full match for the target shape. We only
+  // need to check the last recorded index - any other gaps should have been
+  // mended by the main loop.
+  if (reassocRanges.back().rightIdx < sourceShapeAsRange.rightIdx)
+    return failure();
+  return reassocRanges;
+}
+
+/// A variant of `findReassociationRangesForCollapse(...)` that can also scan
+/// the shapes right-to-left.
+static FailureOr<SmallVector<ReassociationIndexRange>>
+findReassociationRangesForCollapse(ArrayRef<int64_t> sourceShape,
+                                   ArrayRef<int64_t> targetShape,
+                                   bool iterateRightToLeft) {
+  if (!iterateRightToLeft)
+    return findReassociationRangesForCollapse(sourceShape, targetShape);
+  // NB: To iterate right-to-left, we currently reverse the shapes and then
+  // reverse the result back. The reversed shapes must not be temporary, as
+  // we're passing through an ArrayRef.
+  // FIXME: It would be preferable to avoid the expensive copies. At the moment,
+  // this approach is chosen for readability of the main implementation.
+  std::vector<int64_t> sourceToReverse = sourceShape.vec(),
+                       targetToReverse = targetShape.vec();
+  std::reverse(sourceToReverse.begin(), sourceToReverse.end());
+  std::reverse(targetToReverse.begin(), targetToReverse.end());
+  auto invertedRanges =
+      findReassociationRangesForCollapse(sourceToReverse, targetToReverse);
+  if (failed(invertedRanges))
+    return failure();
+  SmallVector<ReassociationIndexRange> &rangesToInvert = *invertedRanges;
+  unsigned numSourceDims = sourceShape.size();
+  // We have received the ranges for inverted shapes. Now we have to invert
+  // the ranges back to correspond with the original source shape.
+  for (auto &range : rangesToInvert) {
+    int64_t invLeftIdx = range.leftIdx, invRightIdx = range.rightIdx;
+    range.leftIdx = numSourceDims - 1 - invRightIdx;
+    range.rightIdx = numSourceDims - 1 - invLeftIdx;
+  }
+  // Also invert the ordering of the ranges to correspond with the original
+  // target shape.
+  std::reverse(rangesToInvert.begin(), rangesToInvert.end());
+  return rangesToInvert;
+}
+
 std::optional<SmallVector<ReassociationIndices>>
 mlir::getReassociationIndicesForCollapse(ArrayRef<int64_t> sourceShape,
                                          ArrayRef<int64_t> targetShape) {
-  if (sourceShape.size() <= targetShape.size())
+  unsigned numSourceDims = sourceShape.size(),
+           numTargetDims = targetShape.size();
+  // We're supposed to search for a collapsing reassociation. If the sizes
+  // match, there's no actual collapsing taking place - it's either a no-op or a
+  // `tensor.reshape`-style reassociation (that would be beyond the scope of
+  // this utility).
+  if (numSourceDims <= numTargetDims)
     return std::nullopt;
-  unsigned sourceDim = 0;
-  SmallVector<ReassociationIndices> reassociationMap;
-  reassociationMap.reserve(targetShape.size());
-
-  ReassociationIndices currIndices;
-  int64_t prodOfCollapsedDims = 1;
-  while (sourceDim < sourceShape.size()) {
-    unsigned targetDim = reassociationMap.size();
-    // If we have mapped all the target dimensions stop and handle the remaining
-    // tail of size-1 dimensions explicitly.
-    if (targetDim == targetShape.size())
-      break;
-
-    int64_t currTargetShape = targetShape[targetDim];
-    while (sourceDim < (sourceShape.size() - 1) &&
-           sourceShape[sourceDim] != ShapedType::kDynamic &&
-           prodOfCollapsedDims * sourceShape[sourceDim] < currTargetShape) {
-      prodOfCollapsedDims *= sourceShape[sourceDim];
-      currIndices.push_back(sourceDim++);
+  // Early handling for scalar target types.
+  if (numTargetDims == 0) {
+    ReassociationIndices allSourceIndices;
+    allSourceIndices.reserve(numSourceDims);
+    for (unsigned sourceDimIdx = 0; sourceDimIdx < numSourceDims;
+         ++sourceDimIdx) {
+      int64_t sourceSize = sourceShape[sourceDimIdx];
+      // All source dimensions must be unit or dynamic.
+      if (sourceSize != 1 && sourceSize != ShapedType::kDynamic)
+        return std::nullopt;
+      allSourceIndices.push_back(sourceDimIdx);
     }
-
-    // If the current expanded dimension is dynamic, then the collapsed
-    // dimensions should also be dynamic and product of all previous unprocessed
-    // dimensions of the expanded shape should be 1.
-    if (sourceShape[sourceDim] == ShapedType::kDynamic &&
-        (currTargetShape != ShapedType::kDynamic || prodOfCollapsedDims != 1))
-      return std::nullopt;
-
-    // If the collapsed dim is dynamic, the current expanded dim should also
-    // be dynamic.
-    if (currTargetShape == ShapedType::kDynamic &&
-        sourceShape[sourceDim] != ShapedType::kDynamic)
-      return std::nullopt;
-
-    // For static shapes, if the product of dimensions of the expanded shape
-    // should match the collapsed dimension shape.
-    if (prodOfCollapsedDims * sourceShape[sourceDim] != currTargetShape)
-      return std::nullopt;
-
-    currIndices.push_back(sourceDim++);
-    reassociationMap.emplace_back(ReassociationIndices{});
-    std::swap(reassociationMap.back(), currIndices);
-    prodOfCollapsedDims = 1;
+    return SmallVector<ReassociationIndices>{allSourceIndices};
   }
-  // All the dimensions in the target must have been processed.
-  if (reassociationMap.size() != targetShape.size())
+
+  // Collect source ranges by iterating over the target shape left-to-right.
+  FailureOr<SmallVector<ReassociationIndexRange>> maybeForwardRanges =
+      findReassociationRangesForCollapse(sourceShape, targetShape);
+  if (failed(maybeForwardRanges))
     return std::nullopt;
-  // Process any remaining entries in the source shape. They all need to be
-  // 1 or dynamic.
-  for (; sourceDim < sourceShape.size(); sourceDim++) {
-    if (sourceShape[sourceDim] != ShapedType::kDynamic &&
-        sourceShape[sourceDim] != 1)
-      return std::nullopt;
-    // The map is empty when the target type is a scalar.
-    if (!reassociationMap.empty())
-      reassociationMap.back().push_back(sourceDim);
+  auto &ranges = *maybeForwardRanges;
+  // Now do the same in reverse. We need to get another valid reassociation
+  // through some other strategy, and then compare the results in order to
+  // disambiguate mixed subshapes, such as:
+  // ?x?x? into ?x?, ?x2x? into ?x?, ?x2x3x6x? into ?x6x?
+  // This leads us to lose some of the reassociation opportunities that can only
+  // be found by iterating in a certain direction, e.g. 2x2x? into 2x? - without
+  // backtracking, the algorithm will fail right-to-left. However, this is the
+  // best way to preserve correctness.
+  FailureOr<SmallVector<ReassociationIndexRange>> maybeReverseRanges =
+      findReassociationRangesForCollapse(sourceShape, targetShape,
+                                         /*iterateRightToLeft=*/true);
+  if (failed(maybeReverseRanges))
+    return std::nullopt;
+  auto &reverseRanges = *maybeReverseRanges;
+
+  if (ranges.size() != numTargetDims || reverseRanges.size() != numTargetDims)
+    return std::nullopt;
+  // Now we can check for ambiguity of each target dimension's reassociation. If
+  // successful, we put the full indices into our result map for the target
+  // shape.
+  SmallVector<ReassociationIndices> reassociationMap(numTargetDims);
+  for (unsigned targetDimIdx = 0; targetDimIdx < numTargetDims;
+       ++targetDimIdx) {
+    ReassociationIndexRange &range = ranges[targetDimIdx];
+    ReassociationIndexRange &reverseRange = reverseRanges[targetDimIdx];
+    // Get non-overlapping indices between the ranges
+    ReassociationIndices nonMatchingIndices =
+        range.getNonOverlappingIndicesWith(reverseRange);
+    // Unit dimensions can be collapsed wherever - this is the only ambiguity
+    // that we allow.
+    for (int64_t sourceDimIdx : nonMatchingIndices) {
+      if (sourceShape[sourceDimIdx] != 1)
+        return std::nullopt;
+    }
+    reassociationMap[targetDimIdx] = range.getFullIndices();
   }
   return reassociationMap;
 }
diff --git a/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir
index 51350e5bc849..6979770154ba 100644
--- a/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir
+++ b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir
@@ -158,8 +158,8 @@ func.func @unpack_to_partial_slice(%arg0: tensor<8x32xf32>) -> tensor<255xf32> {
 // -----
 
 // CHECK-LABEL: func.func @unpack_dynamic
-// CHECK-NOT:     tensor.collapse
-// CHECK:         linalg.unpack
+// CHECK:     tensor.collapse
+// CHECK-NOT:         linalg.unpack
 func.func @unpack_dynamic(%arg0: tensor<?x32xf32>) -> tensor<?xf32> {
   %c32 = arith.constant 32 : index
   %c0 = arith.constant 0 : index
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 67b03b0a3485..3251c5a4a2bf 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1101,7 +1101,7 @@ func.func @fold_expand_of_collapse(%arg0 : tensor<3x4x4xf32>) -> tensor<3x4x4xf3
 
 // -----
 
-func.func @fold_expand_of_collapse_dynamic(%arg0 : tensor<?x4x?xf32>, %arg1: index, %arg2: index)
+func.func @fold_expand_of_collapse_mixed_subshape(%arg0 : tensor<?x4x?xf32>, %arg1: index, %arg2: index)
     -> tensor<?x4x?xf32> {
   %0 = tensor.collapse_shape %arg0 [[0, 1], [2]]
       : tensor<?x4x?xf32> into tensor<?x?xf32>
@@ -1109,12 +1109,28 @@ func.func @fold_expand_of_collapse_dynamic(%arg0 : tensor<?x4x?xf32>, %arg1: ind
       : tensor<?x?xf32> into tensor<?x4x?xf32>
   return %1 : tensor<?x4x?xf32>
 }
-// CHECK-LABEL: @fold_expand_of_collapse_dynamic
+// CHECK-LABEL: @fold_expand_of_collapse_mixed_subshape
 //   CHECK-NOT:   tensor.{{.*}}_shape
 
 // -----
 
-func.func @no_fold_expand_of_collapse_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1: index, %arg2: index, %arg3: index)
+func.func @fold_expand_of_collapse_mixed_target_subshape(%arg0 : tensor<?x4x?x2xf32>, %arg1: index, %arg2: index)
+    -> tensor<?x4x?xf32> {
+  %0 = tensor.collapse_shape %arg0 [[0, 1], [2, 3]]
+      : tensor<?x4x?x2xf32> into tensor<?x?xf32>
+  %1 = tensor.expand_shape %0 [[0, 1], [2]] output_shape [%arg1, 4, %arg2]
+      : tensor<?x?xf32> into tensor<?x4x?xf32>
+  return %1 : tensor<?x4x?xf32>
+}
+// CHECK-LABEL: @fold_expand_of_collapse_mixed_target_subshape
+//   CHECK-NOT:   tensor.expand_shape
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape %arg0 {{\[}}[0], [1], [2, 3]]
+//  CHECK-SAME:     : tensor<?x4x?x2xf32> into tensor<?x4x?xf32>
+//  CHECK-NEXT:   return %[[COLLAPSE]]
+
+// -----
+
+func.func @no_fold_expand_of_collapse_fully_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1: index, %arg2: index, %arg3: index)
     -> tensor<?x?x?xf32> {
   %0 = tensor.collapse_shape %arg0 [[0, 1], [2]]
       : tensor<?x?x?xf32> into tensor<?x?xf32>
@@ -1122,7 +1138,22 @@ func.func @no_fold_expand_of_collapse_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1:
       : tensor<?x?xf32> into tensor<?x?x?xf32>
   return %1 : tensor<?x?x?xf32>
 }
-// CHECK-LABEL: @no_fold_expand_of_collapse_dynamic
+// CHECK-LABEL: @no_fold_expand_of_collapse_fully_dynamic
+//       CHECK:   tensor.collapse_shape
+//       CHECK:   %[[EXPAND:.+]] = tensor.expand_shape
+//       CHECK:   return %[[EXPAND]]
+
+// -----
+
+func.func @no_fold_expand_of_collapse_adjacent_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1: index, %arg2: index)
+    -> tensor<?x?xf32> {
+  %0 = tensor.collapse_shape %arg0 [[0, 1, 2]]
+      : tensor<?x?x?xf32> into tensor<?xf32>
+  %1 = tensor.expand_shape %0 [[0, 1]] output_shape [%arg1, %arg2]
+      : tensor<?xf32> into tensor<?x?xf32>
+  return %1 : tensor<?x?xf32>
+}
+// CHECK-LABEL: @no_fold_expand_of_collapse_adjacent_dynamic
 //       CHECK:   tensor.collapse_shape
 //       CHECK:   %[[EXPAND:.+]] = tensor.expand_shape
 //       CHECK:   return %[[EXPAND]]
diff --git a/mlir/unittests/Dialect/Utils/CMakeLists.txt b/mlir/unittests/Dialect/Utils/CMakeLists.txt
index 61b9cdcb3b8f..e921c8bcfb4e 100644
--- a/mlir/unittests/Dialect/Utils/CMakeLists.txt
+++ b/mlir/unittests/Dialect/Utils/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_unittest(MLIRDialectUtilsTests
   StructuredOpsUtilsTest.cpp
+  ReshapeOpsUtilsTest.cpp
   IndexingUtilsTest.cpp
 )
 mlir_target_link_libraries(MLIRDialectUtilsTests
diff --git a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
new file mode 100644
index 000000000000..db1a87a4de2d
--- /dev/null
+++ b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
@@ -0,0 +1,203 @@
+//===- ReshapeOpsUtilsTest.cpp - ReshapeOpsUtils unit tests ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Utils/ReshapeOpsUtils.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "llvm/ADT/STLExtras.h"
+#include "gtest/gtest.h"
+#include <optional>
+
+using namespace mlir;
+
+/// Helper to make constructing
+/// `std::optional<SmallVector<ReassociationIndices>>` more readable.
+static std::optional<SmallVector<ReassociationIndices>>
+makeOptionalIndices(std::initializer_list<ReassociationIndices> list) {
+  return std::optional<SmallVector<ReassociationIndices>>(list);
+}
+
+TEST(ReassociationIndicesForCollapse, ScalarTest) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({1}, {}),
+            makeOptionalIndices({{0}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, 1}, {}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic}, {}),
+            makeOptionalIndices({{0}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic,
+                                                ShapedType::kDynamic, 1,
+                                                ShapedType::kDynamic},
+                                               {}),
+            makeOptionalIndices({{0, 1, 2, 3, 4}}));
+}
+
+TEST(ReassociationIndicesForCollapse, ScalarTestFailure) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({}, {}), std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({}, {1}), std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({2}, {}), std::nullopt);
+  EXPECT_EQ(
+      getReassociationIndicesForCollapse({1, 2, ShapedType::kDynamic, 1}, {}),
+      std::nullopt);
+}
+
+TEST(ReassociationIndicesForCollapse, StaticTest) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {200}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {10, 600}),
+            makeOptionalIndices({{0}, {1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {200, 30}),
+            makeOptionalIndices({{0, 1}, {2}}));
+}
+
+TEST(ReassociationIndicesForCollapse, StaticTestFailure) {
+  // No-op reassociation
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {10, 20}),
+            std::nullopt);
+  // Invalid static reassociations
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {10}), std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {200, 300}),
+            std::nullopt);
+  // Non-collapsing (expanding) reassociation
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {1, 10, 20, 30}),
+            std::nullopt);
+}
+
+TEST(ReassociationIndicesForCollapse, StaticTestUnitDims) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 1}, {10}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, 20, 30}, {600}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, 1, 1}, {1}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, 1, 1, 1}, {1, 1, 1}),
+            makeOptionalIndices({{0}, {1}, {2, 3}}));
+}
+
+TEST(ReassociationIndicesForCollapse, DynamicTest) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 1},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 1, 1},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {1, ShapedType::kDynamic, 1, ShapedType::kDynamic, 1},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}, {2, 3, 4}}));
+  EXPECT_EQ(
+      getReassociationIndicesForCollapse(
+          {ShapedType::kDynamic, ShapedType::kDynamic}, {ShapedType::kDynamic}),
+      makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {1, ShapedType::kDynamic, ShapedType::kDynamic},
+                {1, ShapedType::kDynamic}),
+            makeOptionalIndices({{0}, {1, 2}}));
+
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {1, ShapedType::kDynamic, ShapedType::kDynamic},
+                {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, ShapedType::kDynamic},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 1, 2, ShapedType::kDynamic, 10},
+                {ShapedType::kDynamic, 10}),
+            makeOptionalIndices({{0, 1, 2, 3}, {4}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10, 20},
+                                               {ShapedType::kDynamic, 20}),
+            makeOptionalIndices({{0, 1}, {2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, ShapedType::kDynamic, 20},
+                                               {ShapedType::kDynamic, 20}),
+            makeOptionalIndices({{0, 1}, {2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 3, 2, 5, 2}, {ShapedType::kDynamic, 20}),
+            makeOptionalIndices({{0, 1}, {2, 3, 4}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {10, ShapedType::kDynamic, 20, ShapedType::kDynamic, 1},
+                {ShapedType::kDynamic, 20, ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}, {2}, {3, 4}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, 1},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, ShapedType::kDynamic, 1},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            makeOptionalIndices({{0}, {1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {1, ShapedType::kDynamic, ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}, {2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 1, ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            makeOptionalIndices({{0}, {1, 2}}));
+}
+
+TEST(ReassociationIndicesForCollapse, DynamicTestFailure) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10, 20},
+                                               {ShapedType::kDynamic, 10}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 10, ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {20, ShapedType::kDynamic, 10, ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 5, 3, 2, 2}, {ShapedType::kDynamic, 20}),
+            std::nullopt);
+  EXPECT_EQ(
+      getReassociationIndicesForCollapse(
+          {ShapedType::kDynamic, ShapedType::kDynamic, ShapedType::kDynamic},
+          {ShapedType::kDynamic, ShapedType::kDynamic}),
+      std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, ShapedType::kDynamic, 10, 1,
+                 ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 10, 10, 10, ShapedType::kDynamic},
+                {ShapedType::kDynamic, 10, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 10, 10, 10, ShapedType::kDynamic},
+                {ShapedType::kDynamic, 2, 2, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 3, 4, 3, ShapedType::kDynamic},
+                {ShapedType::kDynamic, 12, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 8, 4, 2, 16, ShapedType::kDynamic},
+                {ShapedType::kDynamic, 32, ShapedType::kDynamic}),
+            std::nullopt);
+
+  //===----------------------------------------------------------------------===//
+  // TODO: Reassociation for the following examples can be computed, but isn't
+  // supported by `getReassociationIndicesForCollapse`.
+  //===----------------------------------------------------------------------===//
+
+  // TODO: Fails because there's no backtracking when some source dimensions
+  // remain unmatched at either edge.
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 10, ShapedType::kDynamic, 10},
+                {ShapedType::kDynamic, 10}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, 2, 2},
+                                               {1, ShapedType::kDynamic, 2}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({2, 2, ShapedType::kDynamic, 1},
+                                               {2, ShapedType::kDynamic}),
+            std::nullopt);
+}

From edaac11df3f82268e8ca34bf34b3e9d115b7d475 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 09:29:41 +0100
Subject: [PATCH 0103/1322] [X86] combineSelect - attempt to combine with
 shuffles (#143753)

Before legalization we will convert to a vector_shuffle node - but afterward we can try to combine the select into an existing target shuffle chain
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   16 +-
 .../CodeGen/X86/combine-mask-with-shuffle.ll  |   32 +-
 llvm/test/CodeGen/X86/pr132844.ll             |   11 +-
 .../vector-interleaved-load-i8-stride-7.ll    | 1166 ++++---
 .../vector-interleaved-store-i16-stride-8.ll  | 2864 ++++++++---------
 .../vector-interleaved-store-i8-stride-5.ll   |   30 +-
 .../vector-interleaved-store-i8-stride-6.ll   | 2026 ++++++------
 .../vector-interleaved-store-i8-stride-7.ll   |  231 +-
 .../vector-interleaved-store-i8-stride-8.ll   | 1096 +++----
 .../X86/vector-shuffle-combining-avx512f.ll   |   40 +-
 10 files changed, 3610 insertions(+), 3902 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 96714adf78e4..b0553aa4b819 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47785,13 +47785,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                                                            DL, DAG, Subtarget))
       return V;
 
-  // Convert vselects with constant condition into shuffles.
-  if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
-      (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
+  if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
     SmallVector<int, 64> Mask;
     if (createShuffleMaskFromVSELECT(Mask, Cond,
-                                     N->getOpcode() == X86ISD::BLENDV))
-      return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
+                                     N->getOpcode() == X86ISD::BLENDV)) {
+      // Convert vselects with constant condition into shuffles.
+      if (DCI.isBeforeLegalizeOps())
+        return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
+
+      // Attempt to combine as shuffle.
+      SDValue Op(N, 0);
+      if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+        return Res;
+    }
   }
 
   // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
diff --git a/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll
index 268ac3dd31b8..7564e65a428b 100644
--- a/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll
@@ -67,11 +67,9 @@ define <16 x i32> @combine_mask_with_abs(<16 x i32> %v0) {
 define <16 x i32> @combine_mask_with_umin(<16 x i32> %v0) {
 ; CHECK-LABEL: combine_mask_with_umin:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpminud %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vpminud %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    movw $-3856, %ax # imm = 0xF0F0
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpopcntd %zmm0, %zmm1 {%k1}
@@ -88,11 +86,9 @@ define <16 x i32> @combine_mask_with_umin(<16 x i32> %v0) {
 define <16 x i32> @combine_mask_with_umax(<16 x i32> %v0) {
 ; CHECK-LABEL: combine_mask_with_umax:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpmaxud %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vpmaxud %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    movw $-3856, %ax # imm = 0xF0F0
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpopcntd %zmm0, %zmm1 {%k1}
@@ -109,11 +105,9 @@ define <16 x i32> @combine_mask_with_umax(<16 x i32> %v0) {
 define <16 x i32> @combine_mask_with_smin(<16 x i32> %v0) {
 ; CHECK-LABEL: combine_mask_with_smin:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpminsd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vpminsd %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    movw $-3856, %ax # imm = 0xF0F0
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpopcntd %zmm0, %zmm1 {%k1}
@@ -130,11 +124,9 @@ define <16 x i32> @combine_mask_with_smin(<16 x i32> %v0) {
 define <16 x i32> @combine_mask_with_smax(<16 x i32> %v0) {
 ; CHECK-LABEL: combine_mask_with_smax:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vpmaxsd %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    movw $-3856, %ax # imm = 0xF0F0
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpopcntd %zmm0, %zmm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/pr132844.ll b/llvm/test/CodeGen/X86/pr132844.ll
index ded100b2accc..dc9f006d93d1 100644
--- a/llvm/test/CodeGen/X86/pr132844.ll
+++ b/llvm/test/CodeGen/X86/pr132844.ll
@@ -4,12 +4,11 @@
 define  { ptr, i8 } @PR132844(<4 x ptr> %0, <4 x ptr> %1) {
 ; CHECK-LABEL: PR132844:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vinserti64x2 $1, 16, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    vmovdqu %ymm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vinsertf128 $1, 16, %ymm2, %ymm2
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; CHECK-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index c132c5ea2ef4..82481269022b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -13723,364 +13723,361 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm3
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm16
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm13
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm25
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm17
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm12
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm12
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm16
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm18
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm8
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm6
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm7
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512BW-FCP-NEXT:    movw $-28382, %ax # imm = 0x9122
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm1 {%k1}
 ; AVX512BW-FCP-NEXT:    kmovq %k1, %k2
 ; AVX512BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX512BW-FCP-NEXT:    movw $8772, %ax # imm = 0x2244
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
-; AVX512BW-FCP-NEXT:    kmovq %k1, %k3
-; AVX512BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm5, %ymm5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm9 {%k6}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,2,4,6]
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512BW-FCP-NEXT:    vpermd %ymm14, %ymm10, %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 240(%rdi), %xmm19
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm20
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm9, %zmm9
 ; AVX512BW-FCP-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm1 {%k5}
-; AVX512BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm5
-; AVX512BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm1 {%k5}
+; AVX512BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm10
+; AVX512BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm9
 ; AVX512BW-FCP-NEXT:    movw $9288, %ax # imm = 0x2448
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm8 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm8, %xmm21
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm11 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm11, %xmm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movw $3968, %ax # imm = 0xF80
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k7
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm21 {%k7}
-; AVX512BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
-; AVX512BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm21 {%k7}
+; AVX512BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm11
+; AVX512BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm7
 ; AVX512BW-FCP-NEXT:    movw $4644, %ax # imm = 0x1224
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k4
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm18 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm22
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm15 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm22
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm22
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm0, %ymm22
 ; AVX512BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm23
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm15 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm23
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movl $511, %r10d # imm = 0x1FF
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm7 {%k1}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
-; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm18, %ymm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm15 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [0,0,0,0,1,3,4,6]
+; AVX512BW-FCP-NEXT:    vpermd %ymm14, %ymm23, %ymm23
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm18, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm14, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm7 {%k5}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm13, %zmm8 {%k5}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm13 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
 ; AVX512BW-FCP-NEXT:    movl $261632, %r10d # imm = 0x3FE00
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm12 {%k2}
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm12, %xmm12
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
-; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm15, %ymm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpermd %ymm14, %ymm15, %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm12, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm13 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm14, %xmm13, %xmm23
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm23 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm13 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT:    vmovdqa64 208(%rdi), %xmm17
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm24 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm25 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm24, %xmm25, %xmm24
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm24
 ; AVX512BW-FCP-NEXT:    movl $-134217728, %r10d # imm = 0xF8000000
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
 ; AVX512BW-FCP-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k2}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm24, %ymm13 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm13 {%k1}
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm19, %xmm19
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm19, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm23, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm19
 ; AVX512BW-FCP-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm19 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm20
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm19, %xmm19
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm18, %ymm19 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm18 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10]
+; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm19 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm18 {%k6}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm19
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm18 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm17 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm18 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm17, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm17 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,6,13],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[u,u]
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[4,11],zero,zero,xmm17[0,7,14,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm17 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm16 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm17 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm0, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm16, %zmm16
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm17, %zmm17
 ; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm15 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
-; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm7 {%k2}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm15 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
-; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k2}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm15 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k2}
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm15
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm16
-; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm18 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm19
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm16 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,0,7,14],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm18 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm17 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm19
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm19 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movl $8176, %eax # imm = 0x1FF0
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k1}
-; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm16 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm16 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k3}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k6}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm9 {%k4}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm21, %xmm11
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm10 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm19 {%k1}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm16, %zmm20
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm16, %zmm17
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm2, %ymm16 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[4,11],zero,zero,xmm16[0,7,14],zero,zero,xmm16[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm17 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,2,9],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,xmm17[0,7,14],zero,zero,xmm17[3,10,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm22, %xmm17
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm21 {%k7}
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [1,2,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm17
+; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm22, %ymm22
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm22, %zmm21, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm16 {%k5}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm21 {%k6}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm21, %xmm22
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,xmm22[3,10],zero,zero,zero,xmm22[6,13]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u,5,12],zero,zero,xmm21[1,8,15],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm21, %xmm21
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm18 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm18
+; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm21 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm21[u,u,u,u,u,u,u,6,13],zero,zero,xmm21[2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm21, %xmm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[4,11],zero,zero,xmm21[0,7,14]
+; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm21, %xmm21
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm19 {%k2}
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm16 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm2, %ymm18 {%k6}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm20 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm20
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm20, %xmm11
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
-; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm21, %xmm20
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm20, %xmm20
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm20, %ymm11 {%k3}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm9, %xmm20
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm9, %ymm0, %ymm20
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
-; AVX512BW-FCP-NEXT:    vporq %xmm9, %xmm17, %xmm9
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm20 {%k3}
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm17
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm18
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm9
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm2, %xmm19, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,2,4,6,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm19
-; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm2 {%k5}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm9
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm10, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm11, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm9 {%k5}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm10, %xmm3
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,5,6,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm10, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm20, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm3 {%k5}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm8, %ymm6 {%k2}
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm8, %zmm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm8
-; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT:    movl $4186112, %edi # imm = 0x3FE000
-; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm22, %xmm21
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm20 {%k7}
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [1,3,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm21, %ymm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm21, %zmm20, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm18 {%k5}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm18 {%k1}
+; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm7 {%k1}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm11, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm9 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT:    movl $4186112, %eax # imm = 0x3FE000
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm0
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm4, %zmm5
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm2 {%k3}
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm3
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
+; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k7}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,5,6,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k5}
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm3
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm3, %zmm4
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rdi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rdi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -14453,362 +14450,359 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm24
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
 ; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    movw $-28382, %ax # imm = 0x9122
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm1 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %k1, %k2
 ; AVX512DQ-BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX512DQ-BW-FCP-NEXT:    movw $8772, %ax # imm = 0x2244
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    kmovq %k1, %k3
-; AVX512DQ-BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm5, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm9 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,2,4,6]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm10, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 240(%rdi), %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm9, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm1 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm1 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm9
 ; AVX512DQ-BW-FCP-NEXT:    movw $9288, %ax # imm = 0x2448
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm8 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm8, %xmm21
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm11 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm11, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movw $3968, %ax # imm = 0xF80
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm21 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm21 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    movw $4644, %ax # imm = 0x1224
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k4
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm18 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm22
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm16 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm22
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm22
 ; AVX512DQ-BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm23
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm16 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm23
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movl $511, %r10d # imm = 0x1FF
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm7 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm18, %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm18, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm14, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm7 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm16[u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm16, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm14, %xmm16, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm13, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm13, %zmm8 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm13 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    movl $261632, %r10d # imm = 0x3FE00
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[1,8,15,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm12, %xmm12
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm15, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm12, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm13 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm13 {%k4}
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 208(%rdi), %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[5,12]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm25 = xmm16[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm25, %xmm23
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm0, %ymm23
 ; AVX512DQ-BW-FCP-NEXT:    movl $-134217728, %r10d # imm = 0xF8000000
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm23, %ymm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm19, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm19, %zmm13, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm13 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm14 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[2,9],zero,zero,zero,xmm19[5,12,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm18, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm18 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm14 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm7 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm14 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm15 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm17 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm14 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm15 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm17 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,4,11],zero,zero,xmm17[0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm14 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm17, %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm16 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,0,7,14],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm18, %ymm17 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm18 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm19 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movl $8176, %eax # imm = 0x1FF0
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm16 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm9 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm21, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm19 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm14 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm17 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm2, %ymm14 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm18 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm16[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm22, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm21 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [1,2,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm18, %ymm22, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm22, %zmm21, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm14 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm21 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm21[u,u,u,u,u,u,u,6,13],zero,zero,xmm21[2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm21, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[4,11],zero,zero,xmm21[0,7,14]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm21, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm19 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm17, %zmm0, %zmm14 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm2, %ymm17 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm20 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm20
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm20, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm21, %xmm20
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm20, %xmm20
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm20, %ymm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm9, %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm9, %ymm0, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm9, %xmm17, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm20 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm2, %xmm19, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,2,4,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm19
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm2 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm10, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm11, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm9 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm10, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,5,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm10, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm20, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm3 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm8, %ymm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm8, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    movl $4186112, %edi # imm = 0x3FE000
-; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm16[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm22, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm20 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [1,3,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm18, %ymm21, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm21, %zmm20, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm17 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm19, %zmm0, %zmm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm11, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm9 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    movl $4186112, %eax # imm = 0x3FE000
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm15, %zmm0, %zmm9 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm4, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm0, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm2 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm16[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,5,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k5}
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rdi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rdi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <448 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index 9c9dca82f60c..f626dfe5daf0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -4093,139 +4093,125 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512-NEXT:    vpermd %zmm1, %zmm26, %zmm30
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512-NEXT:    movw $-30584, %r11w # imm = 0x8888
+; AVX512-NEXT:    vmovdqa (%r10), %xmm0
+; AVX512-NEXT:    vmovdqa (%rax), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm20
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512-NEXT:    vmovdqa (%r9), %xmm0
+; AVX512-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm22
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm23
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm27
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512-NEXT:    vpermt2d %zmm10, %zmm17, %zmm9
+; AVX512-NEXT:    movb $-86, %r11b
 ; AVX512-NEXT:    kmovw %r11d, %k1
-; AVX512-NEXT:    vpermd %zmm0, %zmm27, %zmm30 {%k1}
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm10
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm11
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512-NEXT:    vpermd %zmm1, %zmm28, %zmm3
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512-NEXT:    movw $8738, %r11w # imm = 0x2222
-; AVX512-NEXT:    kmovw %r11d, %k2
-; AVX512-NEXT:    vpermd %zmm0, %zmm29, %zmm3 {%k2}
-; AVX512-NEXT:    vmovdqa 32(%r10), %ymm15
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11]
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm12
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512-NEXT:    vpermd %zmm13, %zmm19, %zmm31
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512-NEXT:    vpermd %zmm6, %zmm18, %zmm31 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512-NEXT:    vpermd %zmm6, %zmm20, %zmm14
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm6
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11]
-; AVX512-NEXT:    vpermd %zmm4, %zmm21, %zmm14 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15]
-; AVX512-NEXT:    vmovdqa 32(%r10), %xmm2
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15]
-; AVX512-NEXT:    vmovdqa 32(%rax), %xmm7
-; AVX512-NEXT:    vpermd %zmm12, %zmm19, %zmm17
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm12
-; AVX512-NEXT:    vpermd %zmm4, %zmm18, %zmm17 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm15
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15]
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512-NEXT:    vpermd %zmm0, %zmm20, %zmm16
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512-NEXT:    vmovdqa 32(%r10), %ymm5
+; AVX512-NEXT:    vmovdqa 32(%rax), %ymm10
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11]
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm13
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512-NEXT:    vpermt2d %zmm11, %zmm18, %zmm0
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-NEXT:    vpermt2d %zmm14, %zmm19, %zmm11
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm5
+; AVX512-NEXT:    vmovdqa 32(%r10), %xmm6
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15]
+; AVX512-NEXT:    vmovdqa 32(%rax), %xmm10
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm19, %zmm13
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm12
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm14
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512-NEXT:    vpermd %zmm4, %zmm21, %zmm16 {%k2}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX512-NEXT:    vpermd %zmm4, %zmm26, %zmm23
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; AVX512-NEXT:    vpermd %zmm4, %zmm27, %zmm23 {%k1}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT:    vpermd %zmm4, %zmm28, %zmm22
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512-NEXT:    vpermd %zmm6, %zmm29, %zmm22 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7]
-; AVX512-NEXT:    vpermd %zmm6, %zmm26, %zmm25
-; AVX512-NEXT:    vpermd %zmm2, %zmm27, %zmm25 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-NEXT:    vpermd %zmm0, %zmm28, %zmm24
-; AVX512-NEXT:    vpermd %zmm2, %zmm29, %zmm24 {%k2}
-; AVX512-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm15
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm15 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm5
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512-NEXT:    vmovdqa (%r10), %ymm8
+; AVX512-NEXT:    vmovdqa (%rax), %ymm7
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX512-NEXT:    vmovdqa (%r9), %ymm3
 ; AVX512-NEXT:    vmovdqa (%r8), %ymm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11]
-; AVX512-NEXT:    vpermd %zmm6, %zmm19, %zmm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-NEXT:    vpermd %zmm7, %zmm18, %zmm6 {%k1}
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15]
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512-NEXT:    vpermd %zmm2, %zmm19, %zmm2
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512-NEXT:    vpermd %zmm0, %zmm18, %zmm2 {%k1}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11]
-; AVX512-NEXT:    vpermd %zmm0, %zmm20, %zmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11]
-; AVX512-NEXT:    vpermd %zmm13, %zmm21, %zmm0 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15]
-; AVX512-NEXT:    vpermd %zmm4, %zmm20, %zmm4
-; AVX512-NEXT:    vpermd %zmm1, %zmm21, %zmm4 {%k2}
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512-NEXT:    vpermd %zmm5, %zmm26, %zmm5
-; AVX512-NEXT:    vpermd %zmm1, %zmm27, %zmm5 {%k1}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512-NEXT:    vpermd %zmm7, %zmm28, %zmm7
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm7 {%k2}
-; AVX512-NEXT:    movb $-86, %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm3 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm31, %zmm14 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm6
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11]
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-NEXT:    vpermt2d %zmm14, %zmm19, %zmm10
+; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm10 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-NEXT:    vpermt2d %zmm6, %zmm18, %zmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm3
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm24, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm22, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm16, 448(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm10, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm15, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm9, 64(%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -4234,139 +4220,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm26, %zmm25
-; AVX512-FCP-NEXT:    movw $-30584, %r11w # imm = 0x8888
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm27
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm10, %zmm19
+; AVX512-FCP-NEXT:    movb $-86, %r11b
 ; AVX512-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm25 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm8
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm6, %zmm29
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512-FCP-NEXT:    movw $8738, %r11w # imm = 0x2222
-; AVX512-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm29 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm19 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm12
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm16, %zmm27
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm17, %zmm27 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm15
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm18, %zmm30
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm19, %zmm30 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %xmm11
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm16, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm28 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm14
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %xmm6
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm4
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm13
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm18, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm12 {%k1}
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm11
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm18, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm31 {%k2}
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm21
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm21 {%k1}
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm20
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm24, %zmm20 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm26, %zmm23
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm26, %zmm23 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm24, %zmm22
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm22 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm16, %zmm11
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm11 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm13
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermd %zmm13, %zmm18, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm14
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm19, %zmm13 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm16, %zmm1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm17, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm18, %zmm2
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm2 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm26, %zmm3
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm24, %zmm4
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm4 {%k2}
-; AVX512-FCP-NEXT:    movb $-86, %al
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm29 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm30 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm31 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm13 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm17, %zmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm20, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 320(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 256(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 448(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -4374,139 +4344,125 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm26, %zmm30
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512DQ-NEXT:    movw $-30584, %r11w # imm = 0x8888
+; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm20
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm22
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm23
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm27
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm17, %zmm9
+; AVX512DQ-NEXT:    movb $-86, %r11b
 ; AVX512DQ-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm27, %zmm30 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm10
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm11
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm28, %zmm3
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-NEXT:    movw $8738, %r11w # imm = 0x2222
-; AVX512DQ-NEXT:    kmovw %r11d, %k2
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm29, %zmm3 {%k2}
-; AVX512DQ-NEXT:    vmovdqa 32(%r10), %ymm15
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11]
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm12
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512DQ-NEXT:    vpermd %zmm13, %zmm19, %zmm31
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm18, %zmm31 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm20, %zmm14
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm6
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm21, %zmm14 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%r10), %xmm2
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm7
-; AVX512DQ-NEXT:    vpermd %zmm12, %zmm19, %zmm17
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm12
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm18, %zmm17 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm15
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm20, %zmm16
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-NEXT:    vmovdqa 32(%r10), %ymm5
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm10
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm13
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm18, %zmm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11]
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm19, %zmm11
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm18, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 32(%r10), %xmm6
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15]
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm10
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm19, %zmm13
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm12
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm14
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm21, %zmm16 {%k2}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm26, %zmm23
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm27, %zmm23 {%k1}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm28, %zmm22
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm29, %zmm22 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm26, %zmm25
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm27, %zmm25 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm28, %zmm24
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm29, %zmm24 {%k2}
-; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm15
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm15 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm8
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm7
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm3
 ; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm19, %zmm6
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-NEXT:    vpermd %zmm7, %zmm18, %zmm6 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm19, %zmm2
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm18, %zmm2 {%k1}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm20, %zmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11]
-; AVX512DQ-NEXT:    vpermd %zmm13, %zmm21, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm20, %zmm4
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm21, %zmm4 {%k2}
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512DQ-NEXT:    vpermd %zmm5, %zmm26, %zmm5
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm27, %zmm5 {%k1}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512DQ-NEXT:    vpermd %zmm7, %zmm28, %zmm7
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm7 {%k2}
-; AVX512DQ-NEXT:    movb $-86, %al
-; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm31, %zmm14 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm6
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm19, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm10 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm18, %zmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 448(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 64(%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -4515,139 +4471,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm26, %zmm25
-; AVX512DQ-FCP-NEXT:    movw $-30584, %r11w # imm = 0x8888
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm27
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm10, %zmm19
+; AVX512DQ-FCP-NEXT:    movb $-86, %r11b
 ; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm25 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm6, %zmm29
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-FCP-NEXT:    movw $8738, %r11w # imm = 0x2222
-; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm29 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm19 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm16, %zmm27
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm17, %zmm27 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm18, %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm19, %zmm30 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm16, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm28 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm14
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %xmm6
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm13
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm18, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm18, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm31 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm21
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm21 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm24, %zmm20 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm26, %zmm23
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm26, %zmm23 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm24, %zmm22
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm22 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm7
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm16, %zmm11
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm11 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm13, %zmm18, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm14
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm19, %zmm13 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm16, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm17, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm18, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm2 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm26, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm24, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm4 {%k2}
-; AVX512DQ-FCP-NEXT:    movb $-86, %al
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm29 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm30 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm13 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm17, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm20, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -7777,1095 +7717,959 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride8_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $504, %rsp # imm = 0x1F8
+; AVX512-NEXT:    subq $392, %rsp # imm = 0x188
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    vmovdqa (%r10), %xmm0
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%r10), %xmm4
-; AVX512-NEXT:    vmovdqa (%rax), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rax), %xmm3
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa (%r9), %xmm0
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%r8), %xmm7
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512-NEXT:    vpermd %zmm2, %zmm30, %zmm0
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512-NEXT:    movw $-30584, %r11w # imm = 0x8888
-; AVX512-NEXT:    kmovw %r11d, %k2
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    movw $8738, %r11w # imm = 0x2222
+; AVX512-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm22
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm26
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
+; AVX512-NEXT:    movb $-86, %r11b
 ; AVX512-NEXT:    kmovw %r11d, %k1
-; AVX512-NEXT:    vmovdqa 96(%r10), %ymm2
-; AVX512-NEXT:    vmovdqa 96(%rax), %ymm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512-NEXT:    vmovdqa 96(%r9), %ymm8
-; AVX512-NEXT:    vmovdqa 96(%r8), %ymm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512-NEXT:    vpermd %zmm10, %zmm19, %zmm0
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm18, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm10
-; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm11
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%r10), %ymm0
+; AVX512-NEXT:    vmovdqa 96(%rax), %ymm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-NEXT:    vmovdqa 96(%r9), %ymm3
+; AVX512-NEXT:    vmovdqa 96(%r8), %ymm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm5
+; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
 ; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm12
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm13
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm16, %zmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512-NEXT:    vpermd %zmm14, %zmm17, %zmm0 {%k1}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
-; AVX512-NEXT:    vpermd %zmm5, %zmm19, %zmm0
-; AVX512-NEXT:    vpermd %zmm2, %zmm18, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512-NEXT:    vpermd %zmm5, %zmm16, %zmm31
-; AVX512-NEXT:    vpermd %zmm2, %zmm17, %zmm31 {%k1}
-; AVX512-NEXT:    vmovdqa 96(%r10), %xmm2
-; AVX512-NEXT:    vmovdqa 96(%rax), %xmm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa 96(%r9), %xmm10
-; AVX512-NEXT:    vmovdqa 96(%r8), %xmm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512-NEXT:    vpermd %zmm12, %zmm30, %zmm0
-; AVX512-NEXT:    vpermd %zmm9, %zmm29, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512-NEXT:    vpermd %zmm8, %zmm30, %zmm0
-; AVX512-NEXT:    vpermd %zmm2, %zmm29, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%r10), %ymm2
-; AVX512-NEXT:    vmovdqa 64(%rax), %ymm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
-; AVX512-NEXT:    vmovdqa 64(%r9), %ymm10
-; AVX512-NEXT:    vmovdqa 64(%r8), %ymm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512-NEXT:    vpermd %zmm12, %zmm19, %zmm0
-; AVX512-NEXT:    vpermd %zmm8, %zmm18, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm12
-; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512-NEXT:    vpermt2d %zmm10, %zmm19, %zmm7
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm19, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%r10), %xmm0
+; AVX512-NEXT:    vmovdqa 96(%rax), %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512-NEXT:    vmovdqa 96(%r8), %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm2
+; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm13
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX512-NEXT:    vpermt2d %zmm10, %zmm17, %zmm7
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%r10), %ymm6
+; AVX512-NEXT:    vmovdqa 64(%rax), %ymm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
+; AVX512-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm10
+; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
 ; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm1
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512-NEXT:    vpermd %zmm8, %zmm16, %zmm26
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
-; AVX512-NEXT:    vpermd %zmm8, %zmm17, %zmm26 {%k1}
-; AVX512-NEXT:    vmovdqa 64(%r9), %xmm8
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15]
-; AVX512-NEXT:    vmovdqa 64(%r8), %xmm9
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15]
-; AVX512-NEXT:    vpermd %zmm10, %zmm19, %zmm5
-; AVX512-NEXT:    vpermd %zmm2, %zmm18, %zmm5 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-NEXT:    vpermt2d %zmm14, %zmm19, %zmm7
+; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm7 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-NEXT:    vpermt2d %zmm5, %zmm18, %zmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512-NEXT:    vpermd %zmm0, %zmm16, %zmm24
-; AVX512-NEXT:    vpermd %zmm2, %zmm17, %zmm24 {%k1}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vpermd %zmm1, %zmm30, %zmm1
-; AVX512-NEXT:    vpermd %zmm0, %zmm29, %zmm1 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512-NEXT:    vmovdqa 32(%r10), %ymm0
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm4
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
-; AVX512-NEXT:    vpermd %zmm9, %zmm19, %zmm28
-; AVX512-NEXT:    vpermd %zmm3, %zmm18, %zmm28 {%k2}
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm9
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
-; AVX512-NEXT:    vpermd %zmm3, %zmm16, %zmm23
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11]
-; AVX512-NEXT:    vpermd %zmm6, %zmm17, %zmm23 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15]
-; AVX512-NEXT:    vpermd %zmm1, %zmm19, %zmm25
-; AVX512-NEXT:    vpermd %zmm0, %zmm18, %zmm25 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15]
-; AVX512-NEXT:    vpermd %zmm0, %zmm16, %zmm21
-; AVX512-NEXT:    vpermd %zmm3, %zmm17, %zmm21 {%k1}
-; AVX512-NEXT:    vmovdqa (%r10), %ymm3
-; AVX512-NEXT:    vmovdqa (%r9), %ymm4
-; AVX512-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
-; AVX512-NEXT:    vpermd %zmm8, %zmm19, %zmm27
-; AVX512-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11]
-; AVX512-NEXT:    vpermd %zmm9, %zmm18, %zmm27 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15]
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512-NEXT:    vpermd %zmm4, %zmm19, %zmm20
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512-NEXT:    vpermd %zmm3, %zmm18, %zmm20 {%k2}
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512-NEXT:    vpermd %zmm8, %zmm16, %zmm18
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
-; AVX512-NEXT:    vpermd %zmm9, %zmm17, %zmm18 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512-NEXT:    vmovdqa 32(%r10), %xmm4
-; AVX512-NEXT:    vpermd %zmm3, %zmm16, %zmm16
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%r10), %xmm7
+; AVX512-NEXT:    vmovdqa 64(%rax), %xmm2
+; AVX512-NEXT:    vmovdqa 64(%r9), %xmm8
+; AVX512-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512-NEXT:    vpermt2d %zmm4, %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm4
+; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm9
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm15
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512-NEXT:    vmovdqa 32(%r10), %ymm8
+; AVX512-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11]
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm6
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-NEXT:    vpermt2d %zmm9, %zmm19, %zmm14
+; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-NEXT:    vpermt2d %zmm4, %zmm18, %zmm5
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm19, %zmm13
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512-NEXT:    vmovdqa 32(%r10), %xmm10
 ; AVX512-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512-NEXT:    vpermd %zmm6, %zmm17, %zmm16 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512-NEXT:    vpermd %zmm10, %zmm30, %zmm19
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm8
-; AVX512-NEXT:    vpermd %zmm2, %zmm29, %zmm19 {%k2}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX512-NEXT:    vpermd %zmm2, %zmm30, %zmm10
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT:    vpermd %zmm9, %zmm29, %zmm10 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512-NEXT:    vpermd %zmm4, %zmm30, %zmm17
-; AVX512-NEXT:    vpermd %zmm3, %zmm29, %zmm17 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm22
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512-NEXT:    vpermd %zmm6, %zmm30, %zmm8
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512-NEXT:    vpermd %zmm4, %zmm29, %zmm8 {%k2}
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512-NEXT:    vpermd %zmm6, %zmm29, %zmm7
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512-NEXT:    vpermd %zmm3, %zmm30, %zmm7 {%k1}
-; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm3
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT:    vpermd %zmm6, %zmm29, %zmm6
-; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm1
-; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT:    vpermd %zmm14, %zmm30, %zmm6 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm5
-; AVX512-NEXT:    vpermd %zmm0, %zmm30, %zmm5 {%k1}
-; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm1
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpermd %zmm3, %zmm29, %zmm4
-; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm14
-; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512-NEXT:    vpermd %zmm15, %zmm30, %zmm4 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm3
-; AVX512-NEXT:    vpermd %zmm0, %zmm30, %zmm3 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpermd %zmm14, %zmm29, %zmm14
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX512-NEXT:    vpermd %zmm11, %zmm30, %zmm14 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm1
-; AVX512-NEXT:    vpermd %zmm0, %zmm30, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512-NEXT:    vpermd %zmm2, %zmm29, %zmm2
-; AVX512-NEXT:    vpermd %zmm0, %zmm30, %zmm2 {%k1}
-; AVX512-NEXT:    movb $-86, %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm31 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm6 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm26 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm24 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm4 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm23 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm21 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm14 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm18 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm16 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm16, %zmm6
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm12
+; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm16, %zmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm4
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512-NEXT:    vmovdqa (%r10), %ymm10
+; AVX512-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11]
+; AVX512-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm7
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-NEXT:    vpermt2d %zmm11, %zmm19, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm18, %zmm5
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm19, %zmm1
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm16, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm18, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm14, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm21, 448(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm23, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm3, 576(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 512(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm24, 704(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm26, 640(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 832(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm31, 960(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 896(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512-NEXT:    addq $504, %rsp # imm = 0x1F8
+; AVX512-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm15, 576(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512-NEXT:    addq $392, %rsp # imm = 0x188
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride8_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX512-FCP-NEXT:    subq $328, %rsp # imm = 0x148
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm1
-; AVX512-FCP-NEXT:    movw $-30584, %r11w # imm = 0x8888
-; AVX512-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm1 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm2, %zmm17
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512-FCP-NEXT:    movw $8738, %r11w # imm = 0x2222
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm26
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm20
+; AVX512-FCP-NEXT:    movb $-86, %r11b
 ; AVX512-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm17 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 96(%r10), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %ymm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm13
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm15
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm23, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm22, %zmm0 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm20, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermd %zmm9, %zmm21, %zmm10 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm5
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm22, %zmm5 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm13
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm13 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 64(%r10), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 96(%r10), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %ymm1
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm13
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 96(%r10), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %xmm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm15
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm18, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%r10), %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %ymm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
 ; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm3
 ; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm5
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm22, %zmm5 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm12
 ; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11]
-; AVX512-FCP-NEXT:    vpermd %zmm8, %zmm20, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm8
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermd %zmm12, %zmm21, %zmm16 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm19
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm19 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm18
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm18 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm25
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm22, %zmm25 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermd %zmm8, %zmm20, %zmm24
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm9
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512-FCP-NEXT:    vpermd %zmm12, %zmm21, %zmm24 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm27
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm27 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm26
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm26 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm23, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm22, %zmm28 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm23 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm20, %zmm22
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm21, %zmm22 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm19, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm17, %zmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm20, %zmm20
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm0
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm21, %zmm20 {%k1}
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm3, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa 96(%r10), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %xmm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm14, %zmm21 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm29
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm29 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm30
-; AVX512-FCP-NEXT:    vmovdqa 64(%r10), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %xmm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm14, %zmm30 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm31
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm31 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm14, %zmm6 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm1
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm14, %zmm1 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm14, %zmm9
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm9 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm5, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm5
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpermd %zmm12, %zmm7, %zmm8 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm7, %zmm2
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm7, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm12
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm14
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm7, %zmm3
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpermd %zmm12, %zmm7, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm14
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512-FCP-NEXT:    vpermd %zmm15, %zmm7, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm4
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm11
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm11 {%k1}
-; AVX512-FCP-NEXT:    movb $-86, %al
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm8 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm26 {%k1}
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%r10), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm17, %zmm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm19, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %xmm10
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm12
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm22 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm20 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm10
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11]
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm17, %zmm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm19, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, 448(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 576(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 512(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 704(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 640(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 832(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 768(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 960(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 896(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
-; AVX512-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, 576(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
+; AVX512-FCP-NEXT:    addq $328, %rsp # imm = 0x148
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride8_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $504, %rsp # imm = 0x1F8
+; AVX512DQ-NEXT:    subq $392, %rsp # imm = 0x188
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r10), %xmm4
-; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rax), %xmm3
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm7
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm30, %zmm0
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512DQ-NEXT:    movw $-30584, %r11w # imm = 0x8888
-; AVX512DQ-NEXT:    kmovw %r11d, %k2
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    movw $8738, %r11w # imm = 0x2222
+; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm22
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
+; AVX512DQ-NEXT:    movb $-86, %r11b
 ; AVX512DQ-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-NEXT:    vmovdqa 96(%r10), %ymm2
-; AVX512DQ-NEXT:    vmovdqa 96(%rax), %ymm5
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm8
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512DQ-NEXT:    vpermd %zmm10, %zmm19, %zmm0
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm18, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm10
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm11
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%r10), %ymm0
+; AVX512DQ-NEXT:    vmovdqa 96(%rax), %ymm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm12
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm13
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm16, %zmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpermd %zmm14, %zmm17, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
-; AVX512DQ-NEXT:    vpermd %zmm5, %zmm19, %zmm0
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm18, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512DQ-NEXT:    vpermd %zmm5, %zmm16, %zmm31
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm17, %zmm31 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 96(%r10), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 96(%rax), %xmm8
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm10
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %xmm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512DQ-NEXT:    vpermd %zmm12, %zmm30, %zmm0
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm29, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm30, %zmm0
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm29, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r10), %ymm2
-; AVX512DQ-NEXT:    vmovdqa 64(%rax), %ymm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm10
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512DQ-NEXT:    vpermd %zmm12, %zmm19, %zmm0
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm18, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm19, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm19, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%r10), %xmm0
+; AVX512DQ-NEXT:    vmovdqa 96(%rax), %xmm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %xmm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm13
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm17, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r10), %ymm6
+; AVX512DQ-NEXT:    vmovdqa 64(%rax), %ymm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm10
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
 ; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm16, %zmm26
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm17, %zmm26 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm9
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15]
-; AVX512DQ-NEXT:    vpermd %zmm10, %zmm19, %zmm5
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm18, %zmm5 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm19, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm18, %zmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm16, %zmm24
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm17, %zmm24 {%k1}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm30, %zmm1
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm29, %zmm1 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512DQ-NEXT:    vmovdqa 32(%r10), %ymm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm4
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm8
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm19, %zmm28
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm18, %zmm28 {%k2}
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm9
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm16, %zmm23
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm17, %zmm23 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm19, %zmm25
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm18, %zmm25 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm16, %zmm21
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm17, %zmm21 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm3
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm4
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm19, %zmm27
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11]
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm18, %zmm27 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15]
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm19, %zmm20
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm18, %zmm20 {%k2}
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm16, %zmm18
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm8
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm17, %zmm18 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%r10), %xmm4
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm16, %zmm16
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r10), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 64(%rax), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm8
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm4
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm9
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm15
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512DQ-NEXT:    vmovdqa 32(%r10), %ymm8
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm6
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm19, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm18, %zmm5
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm19, %zmm13
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512DQ-NEXT:    vmovdqa 32(%r10), %xmm10
 ; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm17, %zmm16 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512DQ-NEXT:    vpermd %zmm10, %zmm30, %zmm19
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm8
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm29, %zmm19 {%k2}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm30, %zmm10
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm29, %zmm10 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm30, %zmm17
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm29, %zmm17 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm22
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm30, %zmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm29, %zmm8 {%k2}
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm29, %zmm7
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm30, %zmm7 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm29, %zmm6
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT:    vpermd %zmm14, %zmm30, %zmm6 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm5
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm30, %zmm5 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm29, %zmm4
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm14
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-NEXT:    vpermd %zmm15, %zmm30, %zmm4 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm3
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm30, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpermd %zmm14, %zmm29, %zmm14
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX512DQ-NEXT:    vpermd %zmm11, %zmm30, %zmm14 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm1
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm30, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm29, %zmm2
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm30, %zmm2 {%k1}
-; AVX512DQ-NEXT:    movb $-86, %al
-; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm31 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm6 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm26 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm24 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm4 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm23 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm21 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm14 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm18 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm16 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm16, %zmm6
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm16, %zmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm10
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11]
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm18, %zmm7
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm19, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm18, %zmm5
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm19, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 448(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 576(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 512(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 704(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 640(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 832(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 960(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 896(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512DQ-NEXT:    addq $504, %rsp # imm = 0x1F8
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 576(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512DQ-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-NEXT:    addq $392, %rsp # imm = 0x188
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride8_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX512DQ-FCP-NEXT:    subq $328, %rsp # imm = 0x148
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT:    movw $-30584, %r11w # imm = 0x8888
-; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm1 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm2, %zmm17
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-FCP-NEXT:    movw $8738, %r11w # imm = 0x2222
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm20
+; AVX512DQ-FCP-NEXT:    movb $-86, %r11b
 ; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm17 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r10), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %ymm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm15
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm22, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm20, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm9, %zmm21, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm22, %zmm5 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm13
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm13 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r10), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r10), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r10), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm15
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm18, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r10), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm3
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm22, %zmm5 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm8, %zmm20, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm8
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm12, %zmm21, %zmm16 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm19
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm19 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm18
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm18 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm25
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm22, %zmm25 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm8, %zmm20, %zmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm9
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm12, %zmm21, %zmm24 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm27
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm27 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm26
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm26 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm23, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm22, %zmm28 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm23 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm20, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm21, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm19, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm17, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm20, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm21, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm3, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r10), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm14, %zmm21 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm29
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm29 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r10), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm14, %zmm30 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm31
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm31 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm14, %zmm6 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm14, %zmm1 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm14, %zmm9
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm9 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm5, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm12, %zmm7, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm7, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm7, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm14
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm7, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm12, %zmm7, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm7, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm11
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm11 {%k1}
-; AVX512DQ-FCP-NEXT:    movb $-86, %al
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm26 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r10), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm17, %zmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm19, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm17, %zmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm19, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 576(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 512(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 704(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 640(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 832(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 768(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 960(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 896(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 576(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
+; AVX512DQ-FCP-NEXT:    addq $328, %rsp # imm = 0x148
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 86efcf9c5761..ad9db98711a6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -1190,8 +1190,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm4, %ymm5, %ymm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
@@ -1233,8 +1232,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm4, %ymm5, %ymm4
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
@@ -1461,20 +1459,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
 ; AVX512BW-NEXT:    vpor %ymm4, %ymm3, %ymm3
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[3,19],zero,zero,zero,ymm4[28,20],zero,zero,zero,ymm4[29,21],zero,zero,zero,ymm4[30,22]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
 ; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
 ; AVX512BW-NEXT:    vpor %ymm4, %ymm5, %ymm4
-; AVX512BW-NEXT:    movl $831283992, %eax # imm = 0x318C6318
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm4 {%k1}
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22]
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero
-; AVX512BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512BW-NEXT:    vporq %zmm3, %zmm4, %zmm3
 ; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
 ; AVX512BW-NEXT:    vpermd %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
@@ -1531,20 +1527,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpor %ymm4, %ymm3, %ymm3
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[3,19],zero,zero,zero,ymm4[28,20],zero,zero,zero,ymm4[29,21],zero,zero,zero,ymm4[30,22]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
 ; AVX512DQ-BW-NEXT:    vpor %ymm4, %ymm5, %ymm4
-; AVX512DQ-BW-NEXT:    movl $831283992, %eax # imm = 0x318C6318
-; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm4 {%k1}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22]
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512DQ-BW-NEXT:    vporq %zmm3, %zmm4, %zmm3
 ; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
 ; AVX512DQ-BW-NEXT:    vpermd %zmm2, %zmm4, %zmm4
 ; AVX512DQ-BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 6d499e17bfbc..03f5b90002d3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -2996,94 +2996,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm5
-; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512BW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512BW-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512BW-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-NEXT:    vpermw %ymm7, %ymm8, %ymm7
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm7
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512BW-NEXT:    vpshufb %ymm8, %ymm6, %ymm4
-; AVX512BW-NEXT:    vpshufb %ymm8, %ymm5, %ymm9
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-NEXT:    vpermw %ymm9, %ymm10, %ymm9
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512BW-NEXT:    movl $1227114788, %r10d # imm = 0x49244924
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm3, %ymm7
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm2, %ymm6
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
+; AVX512BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm9
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512BW-NEXT:    movw $18724, %r10w # imm = 0x4924
 ; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm7, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm9
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm0, %ymm10
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
-; AVX512BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512BW-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
-; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm9
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm9, %xmm10
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm10
+; AVX512BW-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k1}
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
+; AVX512BW-NEXT:    vpshufb %ymm9, %ymm1, %ymm6
+; AVX512BW-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-NEXT:    movw $9362, %r10w # imm = 0x2492
+; AVX512BW-NEXT:    kmovd %r10d, %k2
+; AVX512BW-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k2}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
+; AVX512BW-NEXT:    vpermi2w %ymm6, %ymm10, %ymm11
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm6
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512BW-NEXT:    vpermi2w %zmm11, %zmm6, %zmm10
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm8
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-NEXT:    vpermi2w %zmm8, %zmm13, %zmm14
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm10, %xmm13
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm12, %xmm8
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512BW-NEXT:    vprold $16, %xmm13, %xmm13
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm13, %zmm8
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
-; AVX512BW-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm8, %xmm13
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
+; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm11, %xmm13
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm14
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-NEXT:    vpermi2w %zmm7, %zmm13, %zmm16
-; AVX512BW-NEXT:    movl $613566756, %ecx # imm = 0x24924924
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm6, %zmm7
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512BW-NEXT:    vmovdqu16 %ymm10, %ymm7 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm10, %xmm13
+; AVX512BW-NEXT:    vmovdqa (%r8), %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512BW-NEXT:    vmovdqu16 %ymm9, %ymm7 {%k2}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
+; AVX512BW-NEXT:    vpermi2w %ymm9, %ymm13, %ymm16
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512BW-NEXT:    vpermi2w %ymm9, %ymm16, %ymm13
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0]
+; AVX512BW-NEXT:    vpermi2w %ymm9, %ymm8, %ymm11
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512BW-NEXT:    vpermi2w %ymm8, %ymm11, %ymm9
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512BW-NEXT:    vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13]
+; AVX512BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44]
-; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm2, %zmm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -3092,93 +3086,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm5
-; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm7, %ymm8, %ymm7
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm7
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm4
-; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm9
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm9, %ymm10, %ymm9
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT:    movl $1227114788, %r10d # imm = 0x49244924
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm7
+; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm6
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
+; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm9
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm4 {%k1}
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512BW-FCP-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
-; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm4 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm6
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-FCP-NEXT:    movw $9362, %r10w # imm = 0x2492
+; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k2}
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm6, %ymm10, %ymm11
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm6
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm11, %zmm6, %zmm10
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm12
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
 ; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm10, %zmm14, %zmm15
-; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm10
-; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm8
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm8, %zmm10, %zmm14
-; AVX512BW-FCP-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm15, %zmm14 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm13
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm7 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm7, %zmm10, %zmm16
-; AVX512BW-FCP-NEXT:    movl $613566756, %ecx # imm = 0x24924924
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm6, %zmm7
+; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm9, %ymm7 {%k2}
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm9, %ymm13, %ymm16
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm9, %ymm16, %ymm13
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm9, %ymm8, %ymm11
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm8, %ymm11, %ymm9
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31]
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -3187,94 +3176,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm5
-; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512DQ-BW-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm7, %ymm8, %ymm7
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm7
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm8, %ymm6, %ymm4
-; AVX512DQ-BW-NEXT:    vpshufb %ymm8, %ymm5, %ymm9
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512DQ-BW-NEXT:    movl $1227114788, %r10d # imm = 0x49244924
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm3, %ymm7
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm2, %ymm6
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
+; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm9
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-BW-NEXT:    movw $18724, %r10w # imm = 0x4924
 ; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm7, %zmm4 {%k1}
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm9
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-BW-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
-; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm4 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm9, %xmm10
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm10
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k1}
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm1, %ymm6
+; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-BW-NEXT:    movw $9362, %r10w # imm = 0x2492
+; AVX512DQ-BW-NEXT:    kmovd %r10d, %k2
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k2}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm6, %ymm10, %ymm11
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm6
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm11, %zmm6, %zmm10
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm6
+; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm8
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm8, %zmm13, %zmm14
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm10, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm12, %xmm8
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512DQ-BW-NEXT:    vprold $16, %xmm13, %xmm13
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm13, %zmm8
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
-; AVX512DQ-BW-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm8, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm11, %xmm13
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm14
+; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm7, %zmm13, %zmm16
-; AVX512DQ-BW-NEXT:    movl $613566756, %ecx # imm = 0x24924924
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm6, %zmm7
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm10, %ymm7 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm10, %xmm13
+; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm9, %ymm7 {%k2}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm9, %ymm13, %ymm16
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm9, %ymm16, %ymm13
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm9, %ymm8, %ymm11
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm8, %ymm11, %ymm9
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31]
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm2, %zmm3
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -3283,93 +3266,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm7, %ymm8, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    movl $1227114788, %r10d # imm = 0x49244924
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
-; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    movw $9362, %r10w # imm = 0x2492
+; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm6, %ymm10, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm11, %zmm6, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm10, %zmm14, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm8, %zmm10, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm15, %zmm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm7, %zmm10, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    movl $613566756, %ecx # imm = 0x24924924
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm6, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm9, %ymm7 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm9, %ymm13, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm9, %ymm16, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm9, %ymm8, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm8, %ymm11, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31]
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64
@@ -6368,726 +6346,770 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-LABEL: store_i8_stride6_vf64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm6
-; AVX512BW-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm2, %ymm3
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-NEXT:    vpermw %ymm3, %ymm8, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm9
-; AVX512BW-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdx), %ymm4
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm4, %ymm10
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512BW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
-; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm0 {%k1}
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[4,5,6,7,4,5,6,7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
-; AVX512BW-NEXT:    vpshufb %zmm14, %zmm5, %zmm5
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm15
+; AVX512BW-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm0
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm6, %xmm3
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm7, %zmm3
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %xmm16
+; AVX512BW-NEXT:    vmovdqa 32(%rcx), %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm10, %xmm8
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %xmm17
+; AVX512BW-NEXT:    vmovdqa 32(%rdx), %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm9
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX512BW-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
 ; AVX512BW-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512BW-NEXT:    kmovd %r10d, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa 32(%r8), %xmm9
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512BW-NEXT:    vmovdqa %ymm3, %ymm12
+; AVX512BW-NEXT:    vpermt2w %ymm8, %ymm13, %ymm12
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm8, %zmm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm12, %zmm3
+; AVX512BW-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
+; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm20, %zmm14
+; AVX512BW-NEXT:    movabsq $585610922974906400, %r10 # imm = 0x820820820820820
+; AVX512BW-NEXT:    kmovq %r10, %k2
+; AVX512BW-NEXT:    vmovdqu8 %zmm14, %zmm3 {%k2}
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm15, %xmm14
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm18, %xmm5
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
+; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm7, %zmm14
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm16, %xmm5
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm17, %xmm7
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512BW-NEXT:    vprold $16, %xmm7, %xmm7
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5]
+; AVX512BW-NEXT:    vmovdqu16 %zmm5, %zmm14 {%k1}
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX512BW-NEXT:    vpermi2w %ymm7, %ymm14, %ymm13
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm8, %zmm14
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm7
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm20, %zmm13
+; AVX512BW-NEXT:    vmovdqu8 %zmm13, %zmm7 {%k2}
+; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm13
+; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm14
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
+; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm21, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %ymm16
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm17
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
+; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm18, %zmm15
+; AVX512BW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
 ; AVX512BW-NEXT:    kmovd %r10d, %k2
-; AVX512BW-NEXT:    vmovdqu16 %zmm5, %zmm0 {%k2}
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7,4,5,6,7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7]
-; AVX512BW-NEXT:    movabsq $-9076969306111049208, %r10 # imm = 0x8208208208208208
-; AVX512BW-NEXT:    kmovq %r10, %k3
-; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k3}
-; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm9, %ymm5
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm10, %ymm6
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31]
-; AVX512BW-NEXT:    vpermw %ymm6, %ymm8, %ymm6
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %ymm17
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm17, %ymm5
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %ymm19
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm19, %ymm7
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm19[8],ymm17[8],ymm19[9],ymm17[9],ymm19[10],ymm17[10],ymm19[11],ymm17[11],ymm19[12],ymm17[12],ymm19[13],ymm17[13],ymm19[14],ymm17[14],ymm19[15],ymm17[15],ymm19[24],ymm17[24],ymm19[25],ymm17[25],ymm19[26],ymm17[26],ymm19[27],ymm17[27],ymm19[28],ymm17[28],ymm19[29],ymm17[29],ymm19[30],ymm17[30],ymm19[31],ymm17[31]
-; AVX512BW-NEXT:    vpermw %ymm7, %ymm11, %ymm7
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
-; AVX512BW-NEXT:    vmovdqu16 %zmm6, %zmm5 {%k1}
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpshufb %zmm14, %zmm13, %zmm6
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7]
-; AVX512BW-NEXT:    vmovdqu16 %zmm6, %zmm5 {%k2}
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm6 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7]
-; AVX512BW-NEXT:    vmovdqu8 %zmm6, %zmm5 {%k3}
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %xmm21
-; AVX512BW-NEXT:    vmovdqa 32(%rsi), %xmm7
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-NEXT:    vpshufb %xmm20, %xmm7, %xmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %xmm22
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm8
-; AVX512BW-NEXT:    vpshufb %xmm20, %xmm8, %xmm14
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm25, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %xmm23
-; AVX512BW-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm15, %xmm14
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %xmm24
-; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %xmm18
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm18, %xmm16
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
-; AVX512BW-NEXT:    vprold $16, %xmm16, %xmm16
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm16, %zmm14
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5]
-; AVX512BW-NEXT:    vmovdqu16 %zmm14, %zmm6 {%k2}
-; AVX512BW-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm26 = xmm14[2,1,2,3]
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm26 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
-; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm27, %zmm16
-; AVX512BW-NEXT:    vmovdqu16 %zmm16, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm16
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm28 = xmm16[2,1,2,3]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT:    vpermt2w %zmm28, %zmm27, %zmm26
-; AVX512BW-NEXT:    movabsq $585610922974906400, %rcx # imm = 0x820820820820820
-; AVX512BW-NEXT:    kmovq %rcx, %k3
-; AVX512BW-NEXT:    vmovdqu8 %zmm26, %zmm6 {%k3}
-; AVX512BW-NEXT:    vpshufb %xmm20, %xmm21, %xmm26
-; AVX512BW-NEXT:    vpshufb %xmm20, %xmm22, %xmm20
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm26 = xmm20[8],xmm26[8],xmm20[9],xmm26[9],xmm20[10],xmm26[10],xmm20[11],xmm26[11],xmm20[12],xmm26[12],xmm20[13],xmm26[13],xmm20[14],xmm26[14],xmm20[15],xmm26[15]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
-; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm25, %zmm20
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm23, %xmm25
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm24, %xmm12
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm25[0],xmm12[1],xmm25[1],xmm12[2],xmm25[2],xmm12[3],xmm25[3],xmm12[4],xmm25[4],xmm12[5],xmm25[5],xmm12[6],xmm25[6],xmm12[7],xmm25[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7]
-; AVX512BW-NEXT:    vprold $16, %xmm25, %xmm25
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm25, %zmm12
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5]
-; AVX512BW-NEXT:    vmovdqu16 %zmm12, %zmm20 {%k2}
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm12 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm13[2,1,2,3]
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm25 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero
-; AVX512BW-NEXT:    vpermt2w %zmm25, %zmm27, %zmm12
-; AVX512BW-NEXT:    vmovdqu16 %zmm12, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm11[2,1,2,3]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm25[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT:    vpermt2w %zmm25, %zmm27, %zmm12
-; AVX512BW-NEXT:    vmovdqu8 %zmm12, %zmm20 {%k3}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm12 = ymm19[0],ymm17[0],ymm19[1],ymm17[1],ymm19[2],ymm17[2],ymm19[3],ymm17[3],ymm19[4],ymm17[4],ymm19[5],ymm17[5],ymm19[6],ymm17[6],ymm19[7],ymm17[7],ymm19[16],ymm17[16],ymm19[17],ymm17[17],ymm19[18],ymm17[18],ymm19[19],ymm17[19],ymm19[20],ymm17[20],ymm19[21],ymm17[21],ymm19[22],ymm17[22],ymm19[23],ymm17[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm17
+; AVX512BW-NEXT:    vmovdqu16 %zmm20, %zmm15 {%k2}
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm15, %ymm19
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512BW-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpshufb %ymm20, %ymm5, %ymm22
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3]
+; AVX512BW-NEXT:    movw $18724, %r10w # imm = 0x4924
+; AVX512BW-NEXT:    kmovd %r10d, %k3
+; AVX512BW-NEXT:    vmovdqu16 %ymm22, %ymm19 {%k3}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512BW-NEXT:    vpermt2w %ymm22, %ymm23, %ymm15
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm15, %zmm15
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512BW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpshufb %ymm19, %ymm8, %ymm22
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
+; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm25, %zmm24
+; AVX512BW-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512BW-NEXT:    kmovq %r10, %k4
+; AVX512BW-NEXT:    vmovdqu8 %zmm24, %zmm15 {%k4}
+; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %ymm22
+; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %ymm24
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
+; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm21, %zmm10
+; AVX512BW-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %ymm21
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm18, %zmm4
+; AVX512BW-NEXT:    vmovdqu16 %zmm10, %zmm4 {%k2}
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm4, %ymm6
+; AVX512BW-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512BW-NEXT:    vpshufb %ymm20, %ymm10, %ymm10
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k3}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpermt2w %ymm9, %ymm23, %ymm4
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512BW-NEXT:    vmovdqa 32(%r9), %ymm6
+; AVX512BW-NEXT:    vpshufb %ymm19, %ymm6, %ymm6
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpermt2w %zmm6, %zmm25, %zmm9
+; AVX512BW-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k4}
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm11, %ymm9
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm21, %ymm10
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm12, %zmm10
-; AVX512BW-NEXT:    vmovdqu16 %zmm17, %zmm10 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm9, %ymm13, %ymm17
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm21, %zmm13
-; AVX512BW-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512BW-NEXT:    kmovd %ecx, %k2
-; AVX512BW-NEXT:    vmovdqu16 %zmm13, %zmm10 {%k2}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm13, %ymm11, %ymm17
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm21, %zmm11
-; AVX512BW-NEXT:    movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512BW-NEXT:    vpshufb %ymm0, %ymm22, %ymm10
+; AVX512BW-NEXT:    vpshufb %ymm0, %ymm24, %ymm12
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512BW-NEXT:    vpermw %ymm12, %ymm18, %ymm12
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm10 {%k2}
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
+; AVX512BW-NEXT:    vpshufb %zmm9, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm10 {%k1}
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
 ; AVX512BW-NEXT:    kmovq %rcx, %k3
-; AVX512BW-NEXT:    vmovdqu8 %zmm11, %zmm10 {%k3}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
-; AVX512BW-NEXT:    vpermt2w %zmm3, %zmm19, %zmm4
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm10 {%k3}
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm16, %ymm1
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm17, %ymm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm2
-; AVX512BW-NEXT:    vmovdqu16 %zmm4, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512BW-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm21, %zmm3
-; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm2 {%k2}
-; AVX512BW-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512BW-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm21, %zmm3
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k3}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
+; AVX512BW-NEXT:    vpermw %ymm2, %ymm11, %ymm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufb %ymm0, %ymm13, %ymm2
+; AVX512BW-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31]
+; AVX512BW-NEXT:    vpermw %ymm2, %ymm18, %ymm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512BW-NEXT:    vpshufb %zmm9, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: store_i8_stride6_vf64:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm2
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm3, %ymm0, %ymm3
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm4
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm5
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm6
-; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %xmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm16
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm24
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm26
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm19
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm16, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm20, %xmm13
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm18, %zmm13
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm15, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm19, %xmm14
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm14, %zmm25, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm28
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm28, %xmm17
-; AVX512BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm14 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm30 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm17, %zmm30, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm29
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm21, %xmm17
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm22, %xmm31
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm17[0],xmm31[1],xmm17[1],xmm31[2],xmm17[2],xmm31[3],xmm17[3],xmm31[4],xmm17[4],xmm31[5],xmm17[5],xmm31[6],xmm17[6],xmm31[7],xmm17[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm18, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm31 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm24, %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm26, %xmm23
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm23, %zmm25, %zmm18
-; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm29, %xmm25
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm29[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm25, %zmm30, %zmm23
-; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm8, %xmm27
-; AVX512BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm25 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm27, %zmm30, %zmm25
-; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm9, %xmm31
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm30, %zmm27
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %ymm30
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm31 = ymm30[0],ymm7[0],ymm30[1],ymm7[1],ymm30[2],ymm7[2],ymm30[3],ymm7[3],ymm30[4],ymm7[4],ymm30[5],ymm7[5],ymm30[6],ymm7[6],ymm30[7],ymm7[7],ymm30[16],ymm7[16],ymm30[17],ymm7[17],ymm30[18],ymm7[18],ymm30[19],ymm7[19],ymm30[20],ymm7[20],ymm30[21],ymm7[21],ymm30[22],ymm7[22],ymm30[23],ymm7[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm0, %zmm22
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm31 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm24, %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm26
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm20[8],xmm16[8],xmm20[9],xmm16[9],xmm20[10],xmm16[10],xmm20[11],xmm16[11],xmm20[12],xmm16[12],xmm20[13],xmm16[13],xmm20[14],xmm16[14],xmm20[15],xmm16[15]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm4[0],ymm26[1],ymm4[1],ymm26[2],ymm4[2],ymm26[3],ymm4[3],ymm26[4],ymm4[4],ymm26[5],ymm4[5],ymm26[6],ymm4[6],ymm26[7],ymm4[7],ymm26[16],ymm4[16],ymm26[17],ymm4[17],ymm26[18],ymm4[18],ymm26[19],ymm4[19],ymm26[20],ymm4[20],ymm26[21],ymm4[21],ymm26[22],ymm4[22],ymm26[23],ymm4[23]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm20, %zmm0, %zmm16
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm31
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm0, %zmm20
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15]
+; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm15
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm16
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm4
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm17
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm10, %xmm11
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm13, %zmm12
+; AVX512BW-FCP-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm12, %ymm4
+; AVX512BW-FCP-NEXT:    vpermt2w %ymm3, %ymm14, %ymm4
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm11, %xmm3
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm19, %zmm12
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm12, %xmm4
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm4, %zmm23, %zmm22
+; AVX512BW-FCP-NEXT:    movabsq $585610922974906400, %r10 # imm = 0x820820820820820
+; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm3 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm4
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm16, %xmm22
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm4, %zmm5, %zmm22
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm17, %xmm4
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm18, %xmm5
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm4, %zmm13, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm5 {%k1}
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm7, %ymm5, %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm4, %xmm7
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm7, %zmm19, %zmm5
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm14, %zmm5
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm7, %xmm13
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm23, %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm5 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm13
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm14
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm19, %zmm21, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm15
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm16
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm19, %zmm18, %zmm17
+; AVX512BW-FCP-NEXT:    movl $613566756, %r10d # imm = 0x24924924
+; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm17 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm17, %ymm19
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512BW-FCP-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm4, %ymm22
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3]
+; AVX512BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
+; AVX512BW-FCP-NEXT:    kmovd %r10d, %k3
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm22, %ymm19 {%k3}
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512BW-FCP-NEXT:    vpermt2w %ymm22, %ymm23, %ymm17
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm17, %zmm17
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
 ; AVX512BW-FCP-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm24, %zmm10
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm11
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm0, %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm1, %zmm0, %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm1, %zmm0, %zmm19
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm26, %ymm1
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm22
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm22, %zmm25, %zmm24
+; AVX512BW-FCP-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512BW-FCP-NEXT:    kmovq %r10, %k4
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm24, %zmm17 {%k4}
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm24
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm26, %zmm21, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm21
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm26, %zmm18, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm6, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm10, %ymm10
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k3}
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-FCP-NEXT:    vpermt2w %ymm10, %ymm23, %ymm9
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm9, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm25, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm6 {%k4}
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm21, %ymm11
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512BW-FCP-NEXT:    vpermw %ymm8, %ymm11, %ymm8
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm22, %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm24, %ymm12
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512BW-FCP-NEXT:    vpermw %ymm12, %ymm18, %ymm12
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm10 {%k2}
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm8, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm10 {%k1}
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-FCP-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
+; AVX512BW-FCP-NEXT:    kmovq %rcx, %k3
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm10 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm16, %ymm2
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31]
+; AVX512BW-FCP-NEXT:    vpermw %ymm2, %ymm11, %ymm2
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm26[8],ymm4[8],ymm26[9],ymm4[9],ymm26[10],ymm4[10],ymm26[11],ymm4[11],ymm26[12],ymm4[12],ymm26[13],ymm4[13],ymm26[14],ymm4[14],ymm26[15],ymm4[15],ymm26[24],ymm4[24],ymm26[25],ymm4[25],ymm26[26],ymm4[26],ymm26[27],ymm4[27],ymm26[28],ymm4[28],ymm26[29],ymm4[29],ymm26[30],ymm4[30],ymm26[31],ymm4[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm24 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm1, %ymm24, %ymm1
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512BW-FCP-NEXT:    movl $613566756, %eax # imm = 0x24924924
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm3, %zmm4 {%k1}
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512BW-FCP-NEXT:    movl $-1840700270, %eax # imm = 0x92492492
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm4 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512BW-FCP-NEXT:    movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k3}
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm3, %ymm5, %ymm3
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm7, %ymm3
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm30, %ymm2
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm30[8],ymm7[8],ymm30[9],ymm7[9],ymm30[10],ymm7[10],ymm30[11],ymm7[11],ymm30[12],ymm7[12],ymm30[13],ymm7[13],ymm30[14],ymm7[14],ymm30[15],ymm7[15],ymm30[24],ymm7[24],ymm30[25],ymm7[25],ymm30[26],ymm7[26],ymm30[27],ymm7[27],ymm30[28],ymm7[28],ymm30[29],ymm7[29],ymm30[30],ymm7[30],ymm30[31],ymm7[31]
-; AVX512BW-FCP-NEXT:    vpermw %ymm3, %ymm24, %ymm3
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %zmm1, %zmm8, %zmm0
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k2}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k3}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm21 {%k1}
-; AVX512BW-FCP-NEXT:    movl $1227133513, %eax # imm = 0x49249249
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm21 {%k3}
-; AVX512BW-FCP-NEXT:    movabsq $2342443691899625602, %rax # imm = 0x2082082082082082
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k4
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm21 {%k4}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm12 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
-; AVX512BW-FCP-NEXT:    movabsq $585610922974906400, %rax # imm = 0x820820820820820
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm23, %zmm12 {%k5}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm18 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm18 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm27, %zmm18 {%k5}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm10 {%k3}
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm10 {%k4}
-; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 256(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, 192(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31]
+; AVX512BW-FCP-NEXT:    vpermw %ymm2, %ymm18, %ymm2
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb %zmm8, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: store_i8_stride6_vf64:
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm2, %ymm3
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm3, %ymm8, %ymm3
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm9
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %ymm4
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm4, %ymm10
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
-; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
-; AVX512DQ-BW-NEXT:    vpshufb %zmm14, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm15
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm6, %xmm3
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm0, %zmm7, %zmm3
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %xmm16
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm10, %xmm8
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %xmm17
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm9
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX512DQ-BW-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
 ; AVX512DQ-BW-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %xmm9
+; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, %ymm12
+; AVX512DQ-BW-NEXT:    vpermt2w %ymm8, %ymm13, %ymm12
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm14, %zmm8, %zmm3
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm12, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm19, %zmm20, %zmm14
+; AVX512DQ-BW-NEXT:    movabsq $585610922974906400, %r10 # imm = 0x820820820820820
+; AVX512DQ-BW-NEXT:    kmovq %r10, %k2
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm14, %zmm3 {%k2}
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm15, %xmm14
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm18, %xmm5
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm5, %zmm7, %zmm14
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm16, %xmm5
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm17, %xmm7
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512DQ-BW-NEXT:    vprold $16, %xmm7, %xmm7
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm5, %zmm14 {%k1}
+; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm7, %ymm14, %ymm13
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm7, %zmm8, %zmm14
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm7
+; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm14, %zmm20, %zmm13
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm13, %zmm7 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm13
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm14
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm19, %zmm21, %zmm20
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %ymm16
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %ymm17
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm19, %zmm18, %zmm15
+; AVX512DQ-BW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
 ; AVX512DQ-BW-NEXT:    kmovd %r10d, %k2
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm5, %zmm0 {%k2}
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-NEXT:    movabsq $-9076969306111049208, %r10 # imm = 0x8208208208208208
-; AVX512DQ-BW-NEXT:    kmovq %r10, %k3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k3}
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm9, %ymm5
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm10, %ymm6
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31]
-; AVX512DQ-BW-NEXT:    vpermw %ymm6, %ymm8, %ymm6
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %ymm17
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm17, %ymm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %ymm19
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm19, %ymm7
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm19[8],ymm17[8],ymm19[9],ymm17[9],ymm19[10],ymm17[10],ymm19[11],ymm17[11],ymm19[12],ymm17[12],ymm19[13],ymm17[13],ymm19[14],ymm17[14],ymm19[15],ymm17[15],ymm19[24],ymm17[24],ymm19[25],ymm17[25],ymm19[26],ymm17[26],ymm19[27],ymm17[27],ymm19[28],ymm17[28],ymm19[29],ymm17[29],ymm19[30],ymm17[30],ymm19[31],ymm17[31]
-; AVX512DQ-BW-NEXT:    vpermw %ymm7, %ymm11, %ymm7
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm6, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpshufb %zmm14, %zmm13, %zmm6
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm6, %zmm5 {%k2}
-; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm6 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm6, %zmm5 {%k3}
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %xmm21
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %xmm7
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm7, %xmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %xmm22
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm8
-; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm8, %xmm14
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm14, %zmm25, %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %xmm23
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm15, %xmm14
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %xmm24
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %xmm18
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm18, %xmm16
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
-; AVX512DQ-BW-NEXT:    vprold $16, %xmm16, %xmm16
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm16, %zmm14
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm14, %zmm6 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm26 = xmm14[2,1,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm26 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm27, %zmm16
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm16, %zmm6 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm16
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm28 = xmm16[2,1,2,3]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm28, %zmm27, %zmm26
-; AVX512DQ-BW-NEXT:    movabsq $585610922974906400, %rcx # imm = 0x820820820820820
-; AVX512DQ-BW-NEXT:    kmovq %rcx, %k3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm26, %zmm6 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm21, %xmm26
-; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm22, %xmm20
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm26 = xmm20[8],xmm26[8],xmm20[9],xmm26[9],xmm20[10],xmm26[10],xmm20[11],xmm26[11],xmm20[12],xmm26[12],xmm20[13],xmm26[13],xmm20[14],xmm26[14],xmm20[15],xmm26[15]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm25, %zmm20
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm23, %xmm25
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm24, %xmm12
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm25[0],xmm12[1],xmm25[1],xmm12[2],xmm25[2],xmm12[3],xmm25[3],xmm12[4],xmm25[4],xmm12[5],xmm25[5],xmm12[6],xmm25[6],xmm12[7],xmm25[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7]
-; AVX512DQ-BW-NEXT:    vprold $16, %xmm25, %xmm25
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm25, %zmm12
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm12, %zmm20 {%k2}
-; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm12 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm13[2,1,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm25 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm25, %zmm27, %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm12, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm11[2,1,2,3]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm25[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm25, %zmm27, %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm12, %zmm20 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm12 = ymm19[0],ymm17[0],ymm19[1],ymm17[1],ymm19[2],ymm17[2],ymm19[3],ymm17[3],ymm19[4],ymm17[4],ymm19[5],ymm17[5],ymm19[6],ymm17[6],ymm19[7],ymm17[7],ymm19[16],ymm17[16],ymm19[17],ymm17[17],ymm19[18],ymm17[18],ymm19[19],ymm17[19],ymm19[20],ymm17[20],ymm19[21],ymm17[21],ymm19[22],ymm17[22],ymm19[23],ymm17[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm17
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm20, %zmm15 {%k2}
+; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm15, %ymm19
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512DQ-BW-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm20, %ymm5, %ymm22
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3]
+; AVX512DQ-BW-NEXT:    movw $18724, %r10w # imm = 0x4924
+; AVX512DQ-BW-NEXT:    kmovd %r10d, %k3
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm22, %ymm19 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512DQ-BW-NEXT:    vpermt2w %ymm22, %ymm23, %ymm15
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm15, %zmm15
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512DQ-BW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm19, %ymm8, %ymm22
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm22, %zmm25, %zmm24
+; AVX512DQ-BW-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512DQ-BW-NEXT:    kmovq %r10, %k4
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm24, %zmm15 {%k4}
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %ymm22
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %ymm24
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm21, %zmm10
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %ymm21
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm18, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm10, %zmm4 {%k2}
+; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm4, %ymm6
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512DQ-BW-NEXT:    vpshufb %ymm20, %ymm10, %ymm10
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-NEXT:    vpermt2w %ymm9, %ymm23, %ymm4
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r9), %ymm6
+; AVX512DQ-BW-NEXT:    vpshufb %ymm19, %ymm6, %ymm6
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm6, %zmm25, %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k4}
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm11, %ymm9
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm21, %ymm10
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm12, %zmm10
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm17, %zmm10 {%k1}
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512DQ-BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm13, %ymm17
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm17, %zmm21, %zmm13
-; AVX512DQ-BW-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k2
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm13, %zmm10 {%k2}
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512DQ-BW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm11, %ymm17
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm17, %zmm21, %zmm11
-; AVX512DQ-BW-NEXT:    movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512DQ-BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm22, %ymm10
+; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm24, %ymm12
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512DQ-BW-NEXT:    vpermw %ymm12, %ymm18, %ymm12
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm10 {%k2}
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
+; AVX512DQ-BW-NEXT:    vpshufb %zmm9, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm10 {%k1}
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
 ; AVX512DQ-BW-NEXT:    kmovq %rcx, %k3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm11, %zmm10 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm3, %zmm19, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm10 {%k3}
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm16, %ymm1
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm17, %ymm2
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm4, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm21, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm2 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm21, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k3}
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, 64(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
+; AVX512DQ-BW-NEXT:    vpermw %ymm2, %ymm11, %ymm2
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm13, %ymm2
+; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31]
+; AVX512DQ-BW-NEXT:    vpermw %ymm2, %ymm18, %ymm2
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512DQ-BW-NEXT:    vpshufb %zmm9, %zmm5, %zmm1
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 256(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride6_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm3, %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %xmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm24
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm26
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm16, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm20, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm18, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm15, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm19, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm14, %zmm25, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm28
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm28, %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm14 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm30 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm17, %zmm30, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm29
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm21, %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm22, %xmm31
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm17[0],xmm31[1],xmm17[1],xmm31[2],xmm17[2],xmm31[3],xmm17[3],xmm31[4],xmm17[4],xmm31[5],xmm17[5],xmm31[6],xmm17[6],xmm31[7],xmm17[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm18, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm31 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm24, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm26, %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm23, %zmm25, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm29, %xmm25
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm29[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm25, %zmm30, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm8, %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm25 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm27, %zmm30, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm9, %xmm31
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm30, %zmm27
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %ymm30
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm31 = ymm30[0],ymm7[0],ymm30[1],ymm7[1],ymm30[2],ymm7[2],ymm30[3],ymm7[3],ymm30[4],ymm7[4],ymm30[5],ymm7[5],ymm30[6],ymm7[6],ymm30[7],ymm7[7],ymm30[16],ymm7[16],ymm30[17],ymm7[17],ymm30[18],ymm7[18],ymm30[19],ymm7[19],ymm30[20],ymm7[20],ymm30[21],ymm7[21],ymm30[22],ymm7[22],ymm30[23],ymm7[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm0, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm31 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm24, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm26
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm20[8],xmm16[8],xmm20[9],xmm16[9],xmm20[10],xmm16[10],xmm20[11],xmm16[11],xmm20[12],xmm16[12],xmm20[13],xmm16[13],xmm20[14],xmm16[14],xmm20[15],xmm16[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm4[0],ymm26[1],ymm4[1],ymm26[2],ymm4[2],ymm26[3],ymm4[3],ymm26[4],ymm4[4],ymm26[5],ymm4[5],ymm26[6],ymm4[6],ymm26[7],ymm4[7],ymm26[16],ymm4[16],ymm26[17],ymm4[17],ymm26[18],ymm4[18],ymm26[19],ymm4[19],ymm26[20],ymm4[20],ymm26[21],ymm4[21],ymm26[22],ymm4[22],ymm26[23],ymm4[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm20, %zmm0, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm31
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm0, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15]
+; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm10, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm13, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm12, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %ymm3, %ymm14, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm11, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm19, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm12, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm4, %zmm23, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    movabsq $585610922974906400, %r10 # imm = 0x820820820820820
+; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm3 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm16, %xmm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm4, %zmm5, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm17, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm18, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm4, %zmm13, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm5 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm7, %ymm5, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm4, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm7, %zmm19, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm14, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm7, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm23, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm19, %zmm21, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm19, %zmm18, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    movl $613566756, %r10d # imm = 0x24924924
+; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm17 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm17, %ymm19
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm4, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
+; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm22, %ymm19 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %ymm22, %ymm23, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm17, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm24, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm0, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm1, %zmm0, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm1, %zmm0, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm26, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm22, %zmm25, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm24, %zmm17 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm24
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm26, %zmm21, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm26, %zmm18, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm6, %zmm9 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm10, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %ymm10, %ymm23, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm9, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm25, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm6 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm21, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm8, %ymm11, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm22, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm24, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm12, %ymm18, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm10 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm8, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-FCP-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm10 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm16, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm2, %ymm11, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm26[8],ymm4[8],ymm26[9],ymm4[9],ymm26[10],ymm4[10],ymm26[11],ymm4[11],ymm26[12],ymm4[12],ymm26[13],ymm4[13],ymm26[14],ymm4[14],ymm26[15],ymm4[15],ymm26[24],ymm4[24],ymm26[25],ymm4[25],ymm26[26],ymm4[26],ymm26[27],ymm4[27],ymm26[28],ymm4[28],ymm26[29],ymm4[29],ymm26[30],ymm4[30],ymm26[31],ymm4[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm24 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm1, %ymm24, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    movl $613566756, %eax # imm = 0x24924924
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm3, %zmm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    movl $-1840700270, %eax # imm = 0x92492492
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm4 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm3, %ymm5, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm7, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm30, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm30[8],ymm7[8],ymm30[9],ymm7[9],ymm30[10],ymm7[10],ymm30[11],ymm7[11],ymm30[12],ymm7[12],ymm30[13],ymm7[13],ymm30[14],ymm7[14],ymm30[15],ymm7[15],ymm30[24],ymm7[24],ymm30[25],ymm7[25],ymm30[26],ymm7[26],ymm30[27],ymm7[27],ymm30[28],ymm7[28],ymm30[29],ymm7[29],ymm30[30],ymm7[30],ymm30[31],ymm7[31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm3, %ymm24, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm1, %zmm8, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm21 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    movl $1227133513, %eax # imm = 0x49249249
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm21 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    movabsq $2342443691899625602, %rax # imm = 0x2082082082082082
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm21 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    movabsq $585610922974906400, %rax # imm = 0x820820820820820
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm23, %zmm12 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm18 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm18 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm27, %zmm18 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm10 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm10 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 256(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm2, %ymm18, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm8, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index f4055a953bad..25e489eef9d1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -925,16 +925,14 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm5, %ymm6, %ymm5
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
 ; AVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, 48(%rax)
@@ -967,16 +965,14 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm5, %ymm6, %ymm5
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
+; AVX2-FP-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
 ; AVX2-FP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm0
 ; AVX2-FP-NEXT:    vmovq %xmm0, 48(%rax)
@@ -1205,24 +1201,21 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
-; AVX512BW-NEXT:    kmovq %rcx, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, 32(%rax)
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, 48(%rax)
-; AVX512BW-NEXT:    vmovdqa %ymm1, (%rax)
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm0[0,2,1,3,4,6,5,7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zmm2[18],zero,zero,zero,zero,zero,zero,zmm2[19],zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46,54],zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm2
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
+; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT:    vmovq %xmm1, 48(%rax)
+; AVX512BW-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -1283,24 +1276,21 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vporq %zmm1, %zmm2, %zmm1
-; AVX512DQ-BW-NEXT:    movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
-; AVX512DQ-BW-NEXT:    kmovq %rcx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-NEXT:    vextracti32x4 $2, %zmm1, 32(%rax)
-; AVX512DQ-BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm0
-; AVX512DQ-BW-NEXT:    vmovq %xmm0, 48(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%rax)
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm0[0,2,1,3,4,6,5,7]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zmm2[18],zero,zero,zero,zero,zero,zero,zmm2[19],zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46,54],zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm2
+; AVX512DQ-BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
+; AVX512DQ-BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512DQ-BW-NEXT:    vmovq %xmm1, 48(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -1824,8 +1814,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
-; AVX2-NEXT:    vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpor %ymm5, %ymm7, %ymm5
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero
@@ -1903,8 +1892,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm7, %ymm5, %ymm5
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
+; AVX2-FP-NEXT:    vpor %ymm4, %ymm5, %ymm5
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero
@@ -2323,19 +2311,17 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpor %ymm5, %ymm6, %ymm5
 ; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[2,10],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
 ; AVX512BW-NEXT:    vpor %ymm7, %ymm6, %ymm6
-; AVX512BW-NEXT:    movl $202911840, %ecx # imm = 0xC183060
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vmovdqu8 %ymm5, %ymm6 {%k1}
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28]
-; AVX512BW-NEXT:    vpor %ymm5, %ymm7, %ymm5
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
+; AVX512BW-NEXT:    vporq %zmm5, %zmm6, %zmm5
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm6
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
@@ -2445,12 +2431,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15]
-; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512BW-FCP-NEXT:    movw $-7741, %cx # imm = 0xE1C3
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512BW-FCP-NEXT:    vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, 64(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, 96(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, 96(%rax)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
@@ -2470,19 +2453,17 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpor %ymm5, %ymm6, %ymm5
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[2,10],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
 ; AVX512DQ-BW-NEXT:    vpor %ymm7, %ymm6, %ymm6
-; AVX512DQ-BW-NEXT:    movl $202911840, %ecx # imm = 0xC183060
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm5, %ymm6 {%k1}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28]
-; AVX512DQ-BW-NEXT:    vpor %ymm5, %ymm7, %ymm5
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
+; AVX512DQ-BW-NEXT:    vporq %zmm5, %zmm6, %zmm5
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm6
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
@@ -2592,12 +2573,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    movw $-7741, %cx # imm = 0xE1C3
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, 96(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, 96(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
@@ -3598,24 +3576,24 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX2-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX2-NEXT:    vmovdqa (%rcx), %ymm5
-; AVX2-NEXT:    vmovdqa (%r8), %ymm7
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%r9), %ymm2
-; AVX2-NEXT:    vmovdqa (%rax), %ymm1
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX2-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX2-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX2-NEXT:    vmovdqa (%r8), %ymm5
+; AVX2-NEXT:    vmovdqa (%r9), %ymm6
+; AVX2-NEXT:    vmovdqa (%rax), %ymm4
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0]
 ; AVX2-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0]
 ; AVX2-NEXT:    # ymm10 = mem[0,1,0,1]
@@ -3623,13 +3601,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
 ; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,ymm5[27,28,29,30],zero,ymm5[28],zero,ymm5[26,27,30,31],zero,ymm5[29]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero
 ; AVX2-NEXT:    vpor %ymm8, %ymm9, %ymm8
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u]
 ; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
 ; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
@@ -3698,68 +3676,67 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm10
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0]
 ; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero
 ; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
 ; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero
 ; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero
 ; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27]
 ; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
 ; AVX2-NEXT:    vpblendvb %ymm11, %ymm7, %ymm8, %ymm7
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18],zero
-; AVX2-NEXT:    vpor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpor %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero
-; AVX2-NEXT:    vpor %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[1,2,3,0,1,14],zero,ymm5[0,1,0,1,14,15],zero,ymm5[15,16,17,18,19,16],zero,ymm5[30,31,16,17,16,17],zero,ymm5[31,30,31]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero
+; AVX2-NEXT:    vpor %ymm5, %ymm6, %ymm5
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
+; AVX2-NEXT:    vpblendvb %ymm6, %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18],zero
+; AVX2-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpor %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa %ymm2, 96(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm1, 160(%rax)
@@ -3905,22 +3882,21 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
 ; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
+; AVX2-FP-NEXT:    vpor %ymm6, %ymm8, %ymm6
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
+; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero
 ; AVX2-FP-NEXT:    vpor %ymm3, %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
-; AVX2-FP-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
-; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FP-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
+; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm0, %ymm6, %ymm0
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
 ; AVX2-FP-NEXT:    vmovdqa %ymm12, 128(%rax)
@@ -4067,22 +4043,21 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
+; AVX2-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm6
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
+; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero
 ; AVX2-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero
 ; AVX2-FCP-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
-; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
-; AVX2-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
-; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
+; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm0, %ymm6, %ymm0
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm10, 128(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index a9da7abaa945..3acc94d6e1fc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -2071,9 +2071,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6]
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63]
 ; AVX512BW-NEXT:    vpshufb %zmm10, %zmm9, %zmm9
-; AVX512BW-NEXT:    movw $-21846, %cx # imm = 0xAAAA
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vpord %zmm6, %zmm9, %zmm4 {%k1}
+; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | zmm4 | zmm6
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
 ; AVX512BW-NEXT:    vpshufb %zmm5, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
@@ -2083,9 +2081,9 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm3[1,3,1,3,5,7,5,7]
 ; AVX512BW-NEXT:    vpshufb %zmm10, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpord %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -2117,23 +2115,21 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpermq %zmm3, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63]
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm10, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT:    movw $-21846, %cx # imm = 0xAAAA
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vpord %zmm7, %zmm4, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3]
-; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermq %zmm0, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | zmm5 | zmm7
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
+; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermq %zmm0, %zmm5, %zmm0
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm6, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpermq %zmm1, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vpermq %zmm1, %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT:    vpermq %zmm2, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vpermq %zmm2, %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm9, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vpermq %zmm3, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT:    vpermq %zmm3, %zmm5, %zmm2
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm10, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT:    vpord %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -2167,9 +2163,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63]
 ; AVX512DQ-BW-NEXT:    vpshufb %zmm10, %zmm9, %zmm9
-; AVX512DQ-BW-NEXT:    movw $-21846, %cx # imm = 0xAAAA
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vpord %zmm6, %zmm9, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | zmm4 | zmm6
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
 ; AVX512DQ-BW-NEXT:    vpshufb %zmm5, %zmm0, %zmm0
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
@@ -2179,9 +2173,9 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm3[1,3,1,3,5,7,5,7]
 ; AVX512DQ-BW-NEXT:    vpshufb %zmm10, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT:    vpord %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512DQ-BW-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -2213,23 +2207,21 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm3, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm10, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    movw $-21846, %cx # imm = 0xAAAA
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpord %zmm7, %zmm4, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3]
-; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm0, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | zmm5 | zmm7
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
+; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm0, %zmm5, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm6, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm1, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm1, %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm2, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm2, %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm9, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm3, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm3, %zmm5, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm10, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpord %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
@@ -8050,128 +8042,107 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %xmm16
-; AVX512BW-NEXT:    vmovdqa 48(%rcx), %xmm14
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %xmm18
-; AVX512BW-NEXT:    vmovdqa64 48(%rdx), %xmm17
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512BW-NEXT:    vmovdqa64 48(%rsi), %xmm19
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm21
-; AVX512BW-NEXT:    vmovdqa64 48(%rdi), %xmm22
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512BW-NEXT:    movl $572662306, %r11d # imm = 0x22222222
-; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vpermw %zmm4, %zmm6, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%r10), %xmm4
-; AVX512BW-NEXT:    vmovdqa64 48(%r10), %xmm23
-; AVX512BW-NEXT:    vmovdqa (%rax), %xmm7
-; AVX512BW-NEXT:    vmovdqa64 48(%rax), %xmm24
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; AVX512BW-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512BW-NEXT:    vmovdqa64 48(%r9), %xmm25
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm10
-; AVX512BW-NEXT:    vmovdqa64 48(%r8), %xmm26
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
-; AVX512BW-NEXT:    vpermw %zmm11, %zmm12, %zmm11
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
-; AVX512BW-NEXT:    movl $-2004318072, %r11d # imm = 0x88888888
-; AVX512BW-NEXT:    kmovd %r11d, %k2
-; AVX512BW-NEXT:    vpermw %zmm9, %zmm13, %zmm11 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3],xmm17[4],xmm14[4],xmm17[5],xmm14[5],xmm17[6],xmm14[6],xmm17[7],xmm14[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm15, %zmm6, %zmm9 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512BW-NEXT:    vpermw %zmm15, %zmm12, %zmm15
-; AVX512BW-NEXT:    vpermw %zmm27, %zmm13, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 32(%r10), %xmm27
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
-; AVX512BW-NEXT:    vmovdqa64 32(%rax), %xmm28
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15]
-; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm29
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm17, %zmm6, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm30
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512BW-NEXT:    vpermw %zmm19, %zmm12, %zmm19
-; AVX512BW-NEXT:    vpermw %zmm17, %zmm13, %zmm19 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm22, %zmm6, %zmm17 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
-; AVX512BW-NEXT:    vpermw %zmm22, %zmm12, %zmm22
-; AVX512BW-NEXT:    vpermw %zmm23, %zmm13, %zmm22 {%k2}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%rcx), %xmm23
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%rdx), %xmm21
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm18, %zmm6, %zmm16 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 16(%rsi), %xmm24
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%rdi), %xmm25
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
-; AVX512BW-NEXT:    vpermw %zmm18, %zmm12, %zmm18
-; AVX512BW-NEXT:    vpermw %zmm20, %zmm13, %zmm18 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm21[0],xmm23[0],xmm21[1],xmm23[1],xmm21[2],xmm23[2],xmm21[3],xmm23[3],xmm21[4],xmm23[4],xmm21[5],xmm23[5],xmm21[6],xmm23[6],xmm21[7],xmm23[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm26, %zmm6, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm21[8],xmm23[8],xmm21[9],xmm23[9],xmm21[10],xmm23[10],xmm21[11],xmm23[11],xmm21[12],xmm23[12],xmm21[13],xmm23[13],xmm21[14],xmm23[14],xmm21[15],xmm23[15]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%r10), %xmm24
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero,xmm23[2],zero,zero,zero,xmm23[3],zero,zero,zero,xmm23[4],zero,zero,zero,xmm23[5],zero,zero,zero,xmm23[6],zero,zero,zero,xmm23[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm21, %zmm6, %zmm23 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 16(%rax), %xmm21
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512BW-NEXT:    vmovdqa 16(%r9), %xmm2
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512BW-NEXT:    vmovdqa 16(%r8), %xmm5
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm6, %zmm3 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm21[0],xmm24[0],xmm21[1],xmm24[1],xmm21[2],xmm24[2],xmm21[3],xmm24[3],xmm21[4],xmm24[4],xmm21[5],xmm24[5],xmm21[6],xmm24[6],xmm21[7],xmm24[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; AVX512BW-NEXT:    vpermw %zmm6, %zmm12, %zmm6
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm13, %zmm6 {%k2}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm21[8],xmm24[8],xmm21[9],xmm24[9],xmm21[10],xmm24[10],xmm21[11],xmm24[11],xmm21[12],xmm24[12],xmm21[13],xmm24[13],xmm21[14],xmm24[14],xmm21[15],xmm24[15]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
-; AVX512BW-NEXT:    vpermw %zmm2, %zmm12, %zmm2
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm13, %zmm2 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512BW-NEXT:    vpermw %zmm4, %zmm12, %zmm4
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm13, %zmm4 {%k2}
+; AVX512BW-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512BW-NEXT:    vmovdqa64 32(%r10), %xmm16
+; AVX512BW-NEXT:    vmovdqa 48(%r10), %xmm14
+; AVX512BW-NEXT:    vmovdqa (%rax), %xmm3
+; AVX512BW-NEXT:    vmovdqa64 32(%rax), %xmm17
+; AVX512BW-NEXT:    vmovdqa 48(%rax), %xmm15
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512BW-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm19
+; AVX512BW-NEXT:    vmovdqa64 48(%r9), %xmm18
+; AVX512BW-NEXT:    vmovdqa (%r8), %xmm6
+; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm21
+; AVX512BW-NEXT:    vmovdqa64 48(%r8), %xmm20
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39]
+; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm7, %zmm0
+; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm5
+; AVX512BW-NEXT:    vmovdqa64 48(%rcx), %xmm22
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm8
+; AVX512BW-NEXT:    vmovdqa64 48(%rdx), %xmm23
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512BW-NEXT:    vmovdqa64 48(%rsi), %xmm24
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512BW-NEXT:    vmovdqa64 48(%rdi), %xmm25
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0]
+; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm12, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7]
+; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm7, %zmm13
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7]
+; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm12, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %xmm26
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
+; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %xmm27
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15]
+; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm28
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm7, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm29
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15]
+; AVX512BW-NEXT:    vpermt2w %zmm18, %zmm12, %zmm14
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
+; AVX512BW-NEXT:    vpermt2w %zmm18, %zmm7, %zmm20
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
+; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm12, %zmm18
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%r10), %xmm22
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%rax), %xmm19
+; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm7, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 16(%r9), %xmm21
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%r8), %xmm24
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
+; AVX512BW-NEXT:    vpermt2w %zmm23, %zmm12, %zmm17
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
+; AVX512BW-NEXT:    vpermt2w %zmm23, %zmm7, %zmm25
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%rcx), %xmm22
+; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm7, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 16(%rdx), %xmm19
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512BW-NEXT:    vmovdqa 16(%rsi), %xmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm7, %zmm4
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm7
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
+; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm5
 ; AVX512BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm19, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm22, %zmm17 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm18, %zmm16 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm6, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm23 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm13, %zmm11 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm16, %zmm17 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm7 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm21, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm4, %zmm5 {%k1}
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 256(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -8179,172 +8150,173 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rsi), %xmm17
+; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rsi), %xmm16
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rdi), %xmm18
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm20
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rdi), %xmm17
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
-; AVX512BW-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm5 = [1284,1798]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm6
+; AVX512BW-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm5 = [2312,2826,3340,3854]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm6 = [1284,1798]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm4
 ; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rcx), %xmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm21
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rcx), %xmm18
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm23
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rdx), %xmm24
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512BW-FCP-NEXT:    movl $572662306, %r11d # imm = 0x22222222
-; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT:    vpermw %zmm6, %zmm8, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa (%r10), %xmm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r10), %xmm25
-; AVX512BW-FCP-NEXT:    vmovdqa (%rax), %xmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rdx), %xmm23
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,32,2,3,4,33,6,7,8,42,10,11,12,43,14,15,16,36,18,19,20,37,22,23,24,46,26,27,28,47,30,31]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm4, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa (%r10), %xmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r10), %xmm24
+; AVX512BW-FCP-NEXT:    vmovdqa (%rax), %xmm8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rax), %xmm26
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm10, %ymm13
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %xmm10
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r9), %xmm27
-; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm11
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r8), %xmm28
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm12, %zmm13, %zmm12
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
-; AVX512BW-FCP-NEXT:    movl $-2004318072, %r11d # imm = 0x88888888
-; AVX512BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm14, %zmm12 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm15, %ymm16
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm16, %ymm16
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm15, %xmm29
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm29, %ymm15, %ymm15
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm8, %zmm15 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm13, %zmm16
-; AVX512BW-FCP-NEXT:    vpermw %zmm29, %zmm14, %zmm16 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm17, %xmm18
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm29, %ymm18
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r10), %xmm29
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r9), %xmm28
+; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r8), %xmm30
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,1,0,32,4,5,1,33,2,1,2,42,4,5,3,43,0,1,4,36,4,5,5,37,0,1,6,46,6,5,7,47]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm14, %zmm11
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm13, %ymm15
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm25
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm13, %ymm13
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm23[0],xmm18[0],xmm23[1],xmm18[1],xmm23[2],xmm18[2],xmm23[3],xmm18[3],xmm23[4],xmm18[4],xmm23[5],xmm18[5],xmm23[6],xmm18[6],xmm23[7],xmm18[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm15, %zmm9, %zmm13
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm15, %ymm25
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm25, %zmm14, %zmm15
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm16, %xmm17
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm25, %ymm17
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r10), %xmm25
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm16, %ymm16
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm16, %ymm16
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm17, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rax), %xmm29
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm27
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm17, %ymm17
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm17, %ymm17
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm18, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rax), %xmm30
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15]
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm31
-; AVX512BW-FCP-NEXT:    vpermw %zmm18, %zmm8, %zmm17 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm18, %zmm13, %zmm18
-; AVX512BW-FCP-NEXT:    vpermw %zmm19, %zmm14, %zmm18 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm19, %ymm19, %ymm24
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm24, %ymm24
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm19, %xmm25
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm19, %ymm19
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm24, %zmm19, %zmm19
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm24, %zmm8, %zmm19 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm24, %zmm13, %zmm24
-; AVX512BW-FCP-NEXT:    vpermw %zmm25, %zmm14, %zmm24 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm20, %xmm21
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm25, %ymm21
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm17, %zmm9, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm31
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm17, %ymm18
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm18, %zmm14, %zmm17
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm18, %ymm23
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm23, %ymm23
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm18, %xmm24
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm18, %ymm18
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm18, %zmm18
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm23, %ymm23
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm23, %zmm9, %zmm18
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm29[0],xmm25[0],xmm29[1],xmm25[1],xmm29[2],xmm25[2],xmm29[3],xmm25[3],xmm29[4],xmm25[4],xmm29[5],xmm25[5],xmm29[6],xmm25[6],xmm29[7],xmm25[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm23, %ymm24
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm14, %zmm23
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm19, %xmm20
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm24, %ymm20
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm19, %ymm19, %ymm19
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm19, %ymm19
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm20, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm24
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
+; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm22
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm20, %ymm20
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm20, %ymm20
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm21, %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm25
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
-; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm23
-; AVX512BW-FCP-NEXT:    vpermw %zmm21, %zmm8, %zmm20 {%k1}
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm20, %zmm9, %zmm19
 ; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rcx), %xmm26
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm13, %zmm21
-; AVX512BW-FCP-NEXT:    vpermw %zmm22, %zmm14, %zmm21 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm22
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm27, %ymm22
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm25[8],xmm29[9],xmm25[9],xmm29[10],xmm25[10],xmm29[11],xmm25[11],xmm29[12],xmm25[12],xmm29[13],xmm25[13],xmm29[14],xmm25[14],xmm29[15],xmm25[15]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm20, %ymm21
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm21, %zmm14, %zmm20
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm22[0],xmm24[0],xmm22[1],xmm24[1],xmm22[2],xmm24[2],xmm22[3],xmm24[3],xmm22[4],xmm24[4],xmm22[5],xmm24[5],xmm22[6],xmm24[6],xmm22[7],xmm24[7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm21, %xmm25
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm27, %ymm25
 ; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rdx), %xmm27
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm22, %zmm22
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm8, %zmm22 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm23
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm25, %ymm23
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm23, %zmm0
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm23, %zmm8, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm2, %ymm2, %ymm23
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm23, %ymm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 16(%r10), %xmm23
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rax), %xmm5
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%r9), %xmm4
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm21, %ymm21
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm21, %ymm21
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm25, %zmm21
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm25, %ymm25
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm25, %zmm9, %zmm21
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm24[8],xmm22[9],xmm24[9],xmm22[10],xmm24[10],xmm22[11],xmm24[11],xmm22[12],xmm24[12],xmm22[13],xmm24[13],xmm22[14],xmm24[14],xmm22[15],xmm24[15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm22, %xmm24
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm25, %ymm24
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm22, %ymm22
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm22, %ymm22
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm24, %zmm22
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm24, %ymm24
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm9, %zmm22
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 16(%r10), %xmm5
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa 16(%rax), %xmm6
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa 16(%r9), %xmm2
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa 16(%r8), %xmm7
-; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm8, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm8, %zmm13, %zmm8
-; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm8 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm4, %zmm13, %zmm4
-; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm4 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm13, %zmm5
-; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm5 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm9
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm2
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm4
 ; AVX512BW-FCP-NEXT:    movw $-21846, %ax # imm = 0xAAAA
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm15 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm24, %zmm19 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm20 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm22 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm4, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm16 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm21 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm22 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm4, %zmm1 {%k1}
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 256(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 448(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, 384(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 192(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 320(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 256(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 448(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -8352,128 +8324,107 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %xmm16
-; AVX512DQ-BW-NEXT:    vmovdqa 48(%rcx), %xmm14
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %xmm18
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdx), %xmm17
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rsi), %xmm19
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm21
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdi), %xmm22
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-BW-NEXT:    movl $572662306, %r11d # imm = 0x22222222
-; AVX512DQ-BW-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-NEXT:    vpermw %zmm4, %zmm6, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa (%r10), %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r10), %xmm23
-; AVX512DQ-BW-NEXT:    vmovdqa (%rax), %xmm7
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rax), %xmm24
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r9), %xmm25
-; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm10
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r8), %xmm26
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm11, %zmm12, %zmm11
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
-; AVX512DQ-BW-NEXT:    movl $-2004318072, %r11d # imm = 0x88888888
-; AVX512DQ-BW-NEXT:    kmovd %r11d, %k2
-; AVX512DQ-BW-NEXT:    vpermw %zmm9, %zmm13, %zmm11 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3],xmm17[4],xmm14[4],xmm17[5],xmm14[5],xmm17[6],xmm14[6],xmm17[7],xmm14[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm15, %zmm6, %zmm9 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm15, %zmm12, %zmm15
-; AVX512DQ-BW-NEXT:    vpermw %zmm27, %zmm13, %zmm15 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r10), %xmm27
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rax), %xmm28
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm29
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm17, %zmm6, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm30
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm19, %zmm12, %zmm19
-; AVX512DQ-BW-NEXT:    vpermw %zmm17, %zmm13, %zmm19 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm22, %zmm6, %zmm17 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm22, %zmm12, %zmm22
-; AVX512DQ-BW-NEXT:    vpermw %zmm23, %zmm13, %zmm22 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rcx), %xmm23
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdx), %xmm21
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm18, %zmm6, %zmm16 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rsi), %xmm24
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdi), %xmm25
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm18, %zmm12, %zmm18
-; AVX512DQ-BW-NEXT:    vpermw %zmm20, %zmm13, %zmm18 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm21[0],xmm23[0],xmm21[1],xmm23[1],xmm21[2],xmm23[2],xmm21[3],xmm23[3],xmm21[4],xmm23[4],xmm21[5],xmm23[5],xmm21[6],xmm23[6],xmm21[7],xmm23[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm26, %zmm6, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm21[8],xmm23[8],xmm21[9],xmm23[9],xmm21[10],xmm23[10],xmm21[11],xmm23[11],xmm21[12],xmm23[12],xmm21[13],xmm23[13],xmm21[14],xmm23[14],xmm21[15],xmm23[15]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r10), %xmm24
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero,xmm23[2],zero,zero,zero,xmm23[3],zero,zero,zero,xmm23[4],zero,zero,zero,xmm23[5],zero,zero,zero,xmm23[6],zero,zero,zero,xmm23[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm21, %zmm6, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rax), %xmm21
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%r9), %xmm2
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%r8), %xmm5
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm6, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm21[0],xmm24[0],xmm21[1],xmm24[1],xmm21[2],xmm24[2],xmm21[3],xmm24[3],xmm21[4],xmm24[4],xmm21[5],xmm24[5],xmm21[6],xmm24[6],xmm21[7],xmm24[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm6, %zmm12, %zmm6
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm13, %zmm6 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm21[8],xmm24[8],xmm21[9],xmm24[9],xmm21[10],xmm24[10],xmm21[11],xmm24[11],xmm21[12],xmm24[12],xmm21[13],xmm24[13],xmm21[14],xmm24[14],xmm21[15],xmm24[15]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm2, %zmm12, %zmm2
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm13, %zmm2 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm4, %zmm12, %zmm4
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm13, %zmm4 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r10), %xmm16
+; AVX512DQ-BW-NEXT:    vmovdqa 48(%r10), %xmm14
+; AVX512DQ-BW-NEXT:    vmovdqa (%rax), %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rax), %xmm17
+; AVX512DQ-BW-NEXT:    vmovdqa 48(%rax), %xmm15
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm19
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r9), %xmm18
+; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm6
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm21
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r8), %xmm20
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm2, %zmm7, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rcx), %xmm22
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm8
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdx), %xmm23
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
+; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rsi), %xmm24
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdi), %xmm25
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm12, %zmm2
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm7, %zmm13
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm12, %zmm11
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %xmm26
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %xmm27
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm28
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm14, %zmm7, %zmm15
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm29
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm18, %zmm12, %zmm14
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm18, %zmm7, %zmm20
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm22, %zmm12, %zmm18
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r10), %xmm22
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rax), %xmm19
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm17, %zmm7, %zmm16
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r9), %xmm21
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r8), %xmm24
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm23, %zmm12, %zmm17
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm23, %zmm7, %zmm25
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rcx), %xmm22
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm19, %zmm7, %zmm21
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdx), %xmm19
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%rsi), %xmm3
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm7, %zmm4
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm7
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm3
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm5
 ; AVX512DQ-BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm19, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm22, %zmm17 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm18, %zmm16 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm6, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm2, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm4, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm13, %zmm11 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm18 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm16, %zmm17 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm7 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm21, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm4, %zmm5 {%k1}
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, 192(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 128(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 256(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 128(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 320(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 256(%rax)
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 448(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 384(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -8481,172 +8432,173 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rsi), %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rsi), %xmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rdi), %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rdi), %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm5 = [1284,1798]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm5 = [2312,2826,3340,3854]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm6 = [1284,1798]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rcx), %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rcx), %xmm18
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rdx), %xmm24
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-BW-FCP-NEXT:    movl $572662306, %r11d # imm = 0x22222222
-; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm6, %zmm8, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r10), %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r10), %xmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rax), %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rdx), %xmm23
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,32,2,3,4,33,6,7,8,42,10,11,12,43,14,15,16,36,18,19,20,37,22,23,24,46,26,27,28,47,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm4, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r10), %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r10), %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rax), %xmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rax), %xmm26
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm10, %ymm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r9), %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r8), %xmm28
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm12, %zmm13, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    movl $-2004318072, %r11d # imm = 0x88888888
-; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm14, %zmm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm15, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm16, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm15, %xmm29
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm29, %ymm15, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm8, %zmm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm13, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm29, %zmm14, %zmm16 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm17, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm29, %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r10), %xmm29
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r9), %xmm28
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r8), %xmm30
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,1,0,32,4,5,1,33,2,1,2,42,4,5,3,43,0,1,4,36,4,5,5,37,0,1,6,46,6,5,7,47]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm14, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm13, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm15, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm25
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm13, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm13, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm23[0],xmm18[0],xmm23[1],xmm18[1],xmm23[2],xmm18[2],xmm23[3],xmm18[3],xmm23[4],xmm18[4],xmm23[5],xmm18[5],xmm23[6],xmm18[6],xmm23[7],xmm18[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm15, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm15, %zmm9, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm15, %ymm25
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm25, %zmm14, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm25, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r10), %xmm25
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm16, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm16, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm17, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rax), %xmm29
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm27
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm17, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm17, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm18, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rax), %xmm30
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm31
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm18, %zmm8, %zmm17 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm18, %zmm13, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm19, %zmm14, %zmm18 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm19, %ymm19, %ymm24
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm24, %ymm24
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm19, %xmm25
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm19, %ymm19
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm24, %zmm19, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm24, %zmm8, %zmm19 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm24, %zmm13, %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm25, %zmm14, %zmm24 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm20, %xmm21
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm25, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm17, %zmm9, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm31
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm17, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm18, %zmm14, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm18, %ymm23
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm23, %ymm23
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm18, %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm18, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm18, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm23, %ymm23
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm23, %zmm9, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm29[0],xmm25[0],xmm29[1],xmm25[1],xmm29[2],xmm25[2],xmm29[3],xmm25[3],xmm29[4],xmm25[4],xmm29[5],xmm25[5],xmm29[6],xmm25[6],xmm29[7],xmm25[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm23, %ymm24
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm14, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm19, %xmm20
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm24, %ymm20
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm19, %ymm19, %ymm19
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm19, %ymm19
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm20, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm22
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm20, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm20, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm21, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm25
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm21, %zmm8, %zmm20 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm20, %zmm9, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rcx), %xmm26
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm13, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm22, %zmm14, %zmm21 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm27, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm25[8],xmm29[9],xmm25[9],xmm29[10],xmm25[10],xmm29[11],xmm25[11],xmm29[12],xmm25[12],xmm29[13],xmm25[13],xmm29[14],xmm25[14],xmm29[15],xmm25[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm20, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm21, %zmm14, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm22[0],xmm24[0],xmm22[1],xmm24[1],xmm22[2],xmm24[2],xmm22[3],xmm24[3],xmm22[4],xmm24[4],xmm22[5],xmm24[5],xmm22[6],xmm24[6],xmm22[7],xmm24[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm21, %xmm25
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm27, %ymm25
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rdx), %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm22, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm8, %zmm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm25, %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm23, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm23, %zmm8, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm2, %ymm2, %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm23, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%r10), %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rax), %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%r9), %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm21, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm21, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm25, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm25, %ymm25
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm25, %zmm9, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm24[8],xmm22[9],xmm24[9],xmm22[10],xmm24[10],xmm22[11],xmm24[11],xmm22[12],xmm24[12],xmm22[13],xmm24[13],xmm22[14],xmm24[14],xmm22[15],xmm24[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm22, %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm25, %ymm24
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm22, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm22, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm24, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm24, %ymm24
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm9, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%r10), %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rax), %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%r9), %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%r8), %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm8, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm8, %zmm13, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm8 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm4, %zmm13, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm4 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm13, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    movw $-21846, %ax # imm = 0xAAAA
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm24, %zmm19 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm20 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm4, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm21 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm22 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm4, %zmm1 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 256(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 448(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, 384(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 192(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 256(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 448(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index 68967c2ce653..c33776daf18f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -964,41 +964,11 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
 }
 
 define <16 x i32> @blend_of_permutes_v16i32(<8 x i64> %a0, <8x i64> %a1) {
-; X86-AVX512F-LABEL: blend_of_permutes_v16i32:
-; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; X86-AVX512F-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; X86-AVX512F-NEXT:    movw $-25958, %ax # imm = 0x9A9A
-; X86-AVX512F-NEXT:    kmovw %eax, %k1
-; X86-AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; X86-AVX512F-NEXT:    retl
-;
-; X86-AVX512BW-LABEL: blend_of_permutes_v16i32:
-; X86-AVX512BW:       # %bb.0:
-; X86-AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; X86-AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; X86-AVX512BW-NEXT:    movw $-25958, %ax # imm = 0x9A9A
-; X86-AVX512BW-NEXT:    kmovd %eax, %k1
-; X86-AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; X86-AVX512BW-NEXT:    retl
-;
-; X64-AVX512F-LABEL: blend_of_permutes_v16i32:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; X64-AVX512F-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; X64-AVX512F-NEXT:    movw $-25958, %ax # imm = 0x9A9A
-; X64-AVX512F-NEXT:    kmovw %eax, %k1
-; X64-AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: blend_of_permutes_v16i32:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; X64-AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; X64-AVX512BW-NEXT:    movw $-25958, %ax # imm = 0x9A9A
-; X64-AVX512BW-NEXT:    kmovd %eax, %k1
-; X64-AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; X64-AVX512BW-NEXT:    retq
+; CHECK-LABEL: blend_of_permutes_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [4,21,6,23,16,1,2,19,12,29,14,31,24,9,10,27]
+; CHECK-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %s0 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   %s1 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   %x0 = bitcast <8 x i64> %s0 to <16 x i32>

From 4079ed3c9e72d64746c5d3f05fc585d844c1e8a7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 12 Jun 2025 17:35:55 +0900
Subject: [PATCH 0104/1322] ARM: Move setting of more runtime libcalls to
 RuntimeLibcallInfo (#143826)

These are the easy cases that do not really depend on the subtarget,
other than for the deceptive predicates on the subtarget class. Most
of the rest of the cases here also do not, but this is obscured by
going through helper predicates added onto the subtarget which hide
dependence on TargetOptions.
---
 llvm/lib/IR/RuntimeLibcalls.cpp         | 28 +++++++++++++++++++++++
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 30 -------------------------
 2 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 31013310a746..331b319511ae 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -79,6 +79,34 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
       }
     }
   }
+
+  if (TT.isOSWindows()) {
+    static const struct {
+      const RTLIB::Libcall Op;
+      const char *const Name;
+      const CallingConv::ID CC;
+    } LibraryCalls[] = {
+        {RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP},
+    };
+
+    for (const auto &LC : LibraryCalls) {
+      Info.setLibcallName(LC.Op, LC.Name);
+      Info.setLibcallCallingConv(LC.Op, LC.CC);
+    }
+  }
+
+  // Use divmod compiler-rt calls for iOS 5.0 and later.
+  if (TT.isOSBinFormatMachO() && (!TT.isiOS() || !TT.isOSVersionLT(5, 0))) {
+    Info.setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+    Info.setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  }
 }
 
 static void setMSP430Libcalls(RuntimeLibcallsInfo &Info, const Triple &TT) {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 8455eef9bad3..d2e910a248f2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -708,36 +708,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     }
   }
 
-  if (Subtarget->isTargetWindows()) {
-    static const struct {
-      const RTLIB::Libcall Op;
-      const char * const Name;
-      const CallingConv::ID CC;
-    } LibraryCalls[] = {
-      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
-    };
-
-    for (const auto &LC : LibraryCalls) {
-      setLibcallName(LC.Op, LC.Name);
-      setLibcallCallingConv(LC.Op, LC.CC);
-    }
-  }
-
-  // Use divmod compiler-rt calls for iOS 5.0 and later.
-  if (Subtarget->isTargetMachO() &&
-      !(Subtarget->isTargetIOS() &&
-        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
-    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
-    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
-  }
-
   // The half <-> float conversion functions are always soft-float on
   // non-watchos platforms, but are needed for some targets which use a
   // hard-float calling convention by default.

From 5434b85d2c7a83d9cebae06dad2f9d630e9a3927 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 12 Jun 2025 17:38:52 +0900
Subject: [PATCH 0105/1322] ARM: Remove fake entries for divrem libcalls
 (#143832)

This was defining aliases of the i32 divrem functions for the i8
and i16 cases. This is unnecessary and was unused. The divrem
candidate cases wouldn't have formed with illegal types in the
first place, so codegen wouldn't even query these.
---
 llvm/lib/IR/RuntimeLibcalls.cpp | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 331b319511ae..d84c56f0af5c 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -41,13 +41,8 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
         const char *const Name;
         const CallingConv::ID CC;
       } LibraryCalls[] = {
-          {RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS},
-          {RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS},
           {RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS},
           {RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS},
-
-          {RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS},
-          {RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS},
           {RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS},
           {RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS},
       };
@@ -62,13 +57,8 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
         const char *const Name;
         const CallingConv::ID CC;
       } LibraryCalls[] = {
-          {RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS},
-          {RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS},
           {RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS},
           {RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS},
-
-          {RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS},
-          {RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS},
           {RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS},
           {RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS},
       };

From ce621041c2f162c50d630810491c2feee8eb6c64 Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp@bytedance.com>
Date: Thu, 12 Jun 2025 16:39:57 +0800
Subject: [PATCH 0106/1322] [RISCV] Get host CPU name via hwprobe (#142745)

We can get the `mvendorid/marchid/mimpid` via hwprobe and then we
can compare these IDs with those defined in processors to find the
CPU name.

With this change, `-mcpu/-mtune=native` can set the proper name.
---
 .../llvm/TargetParser/RISCVTargetParser.h     |  8 +++++
 llvm/lib/TargetParser/Host.cpp                | 30 +++++++++++++++----
 llvm/lib/TargetParser/RISCVTargetParser.cpp   | 15 +++++++---
 3 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index 41fdab6012aa..19a8af0cb956 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -29,6 +29,13 @@ struct CPUModel {
   uint32_t MVendorID;
   uint64_t MArchID;
   uint64_t MImpID;
+
+  bool isValid() const { return MVendorID != 0 && MArchID != 0 && MImpID != 0; }
+
+  bool operator==(const CPUModel &Other) const {
+    return MVendorID == Other.MVendorID && MArchID == Other.MArchID &&
+           MImpID == Other.MImpID;
+  }
 };
 
 struct CPUInfo {
@@ -58,6 +65,7 @@ LLVM_ABI bool hasFastScalarUnalignedAccess(StringRef CPU);
 LLVM_ABI bool hasFastVectorUnalignedAccess(StringRef CPU);
 LLVM_ABI bool hasValidCPUModel(StringRef CPU);
 LLVM_ABI CPUModel getCPUModel(StringRef CPU);
+LLVM_ABI StringRef getCPUNameFromCPUModel(const CPUModel &Model);
 
 } // namespace RISCV
 
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 14acef116708..5957e1befe2d 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/RISCVTargetParser.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/TargetParser/X86TargetParser.h"
 #include <string.h>
@@ -1672,8 +1673,32 @@ StringRef sys::getHostCPUName() {
   return "generic";
 }
 #elif defined(__riscv)
+#if defined(__linux__)
+// struct riscv_hwprobe
+struct RISCVHwProbe {
+  int64_t Key;
+  uint64_t Value;
+};
+#endif
+
 StringRef sys::getHostCPUName() {
 #if defined(__linux__)
+  // Try the hwprobe way first.
+  RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_MVENDORID=*/0, 0},
+                       {/*RISCV_HWPROBE_KEY_MARCHID=*/1, 0},
+                       {/*RISCV_HWPROBE_KEY_MIMPID=*/2, 0}};
+  int Ret = syscall(/*__NR_riscv_hwprobe=*/258, /*pairs=*/Query,
+                    /*pair_count=*/std::size(Query), /*cpu_count=*/0,
+                    /*cpus=*/0, /*flags=*/0);
+  if (Ret == 0) {
+    RISCV::CPUModel Model{static_cast<uint32_t>(Query[0].Value), Query[1].Value,
+                          Query[2].Value};
+    StringRef Name = RISCV::getCPUNameFromCPUModel(Model);
+    if (!Name.empty())
+      return Name;
+  }
+
+  // Then try the cpuinfo way.
   std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
   StringRef Content = P ? P->getBuffer() : "";
   StringRef Name = detail::getHostCPUNameForRISCV(Content);
@@ -2148,11 +2173,6 @@ const StringMap<bool> sys::getHostCPUFeatures() {
   return Features;
 }
 #elif defined(__linux__) && defined(__riscv)
-// struct riscv_hwprobe
-struct RISCVHwProbe {
-  int64_t Key;
-  uint64_t Value;
-};
 const StringMap<bool> sys::getHostCPUFeatures() {
   RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_BASE_BEHAVIOR=*/3, 0},
                        {/*RISCV_HWPROBE_KEY_IMA_EXT_0=*/4, 0},
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index 2e5e8f4e50c9..9957ec0c28d8 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -57,10 +57,7 @@ bool hasFastVectorUnalignedAccess(StringRef CPU) {
   return Info && Info->FastVectorUnalignedAccess;
 }
 
-bool hasValidCPUModel(StringRef CPU) {
-  const CPUModel Model = getCPUModel(CPU);
-  return Model.MVendorID != 0 && Model.MArchID != 0 && Model.MImpID != 0;
-}
+bool hasValidCPUModel(StringRef CPU) { return getCPUModel(CPU).isValid(); }
 
 CPUModel getCPUModel(StringRef CPU) {
   const CPUInfo *Info = getCPUInfoByName(CPU);
@@ -69,6 +66,16 @@ CPUModel getCPUModel(StringRef CPU) {
   return Info->Model;
 }
 
+StringRef getCPUNameFromCPUModel(const CPUModel &Model) {
+  if (!Model.isValid())
+    return "";
+
+  for (auto &C : RISCVCPUInfo)
+    if (C.Model == Model)
+      return C.Name;
+  return "";
+}
+
 bool parseCPU(StringRef CPU, bool IsRV64) {
   const CPUInfo *Info = getCPUInfoByName(CPU);
 

From 4551e5035565606eb04253a35f31d51685657436 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= <kadircet@google.com>
Date: Thu, 12 Jun 2025 10:49:23 +0200
Subject: [PATCH 0107/1322] [clang] Reset FileID based diag state mappings
 (#143695)

When sharing same compiler instance for multiple compilations, we reset
source manager's file id tables in between runs. Diagnostics engine
keeps a cache based on these file ids, that became dangling references
across compilations.

This patch makes sure we reset those whenever sourcemanager is trashing
its FileIDs.
---
 clang/include/clang/Basic/Diagnostic.h        | 13 +++--
 clang/lib/Basic/Diagnostic.cpp                |  4 +-
 clang/lib/Basic/SourceManager.cpp             |  3 ++
 .../Frontend/CompilerInstanceTest.cpp         | 51 +++++++++++++++++++
 4 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h
index efee8302e750..7ae4ef7df138 100644
--- a/clang/include/clang/Basic/Diagnostic.h
+++ b/clang/include/clang/Basic/Diagnostic.h
@@ -424,10 +424,13 @@ private:
     bool empty() const { return Files.empty(); }
 
     /// Clear out this map.
-    void clear() {
+    void clear(bool Soft) {
+      // Just clear the cache when in soft mode.
       Files.clear();
-      FirstDiagState = CurDiagState = nullptr;
-      CurDiagStateLoc = SourceLocation();
+      if (!Soft) {
+        FirstDiagState = CurDiagState = nullptr;
+        CurDiagStateLoc = SourceLocation();
+      }
     }
 
     /// Produce a debugging dump of the diagnostic state.
@@ -920,6 +923,10 @@ public:
   /// Reset the state of the diagnostic object to its initial configuration.
   /// \param[in] soft - if true, doesn't reset the diagnostic mappings and state
   void Reset(bool soft = false);
+  /// We keep a cache of FileIDs for diagnostics mapped by pragmas. These might
+  /// get invalidated when diagnostics engine is shared across different
+  /// compilations. Provide users with a way to reset that.
+  void ResetPragmas();
 
   //===--------------------------------------------------------------------===//
   // DiagnosticsEngine classification and reporting interfaces.
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 95d86cb153b4..a30bfa28eca7 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -119,6 +119,8 @@ bool DiagnosticsEngine::popMappings(SourceLocation Loc) {
   return true;
 }
 
+void DiagnosticsEngine::ResetPragmas() { DiagStatesByLoc.clear(/*Soft=*/true); }
+
 void DiagnosticsEngine::Reset(bool soft /*=false*/) {
   ErrorOccurred = false;
   UncompilableErrorOccurred = false;
@@ -135,7 +137,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) {
   if (!soft) {
     // Clear state related to #pragma diagnostic.
     DiagStates.clear();
-    DiagStatesByLoc.clear();
+    DiagStatesByLoc.clear(false);
     DiagStateOnPushStack.clear();
 
     // Create a DiagState and DiagStatePoint representing diagnostic changes
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 09e5c6547fb5..053e82683a4a 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -344,6 +344,9 @@ void SourceManager::clearIDTables() {
   NextLocalOffset = 0;
   CurrentLoadedOffset = MaxLoadedOffset;
   createExpansionLoc(SourceLocation(), SourceLocation(), SourceLocation(), 1);
+  // Diagnostics engine keeps some references to fileids, mostly for dealing
+  // with diagnostic pragmas, make sure they're reset as well.
+  Diag.ResetPragmas();
 }
 
 bool SourceManager::isMainFile(const FileEntry &SourceFile) {
diff --git a/clang/unittests/Frontend/CompilerInstanceTest.cpp b/clang/unittests/Frontend/CompilerInstanceTest.cpp
index a7b258d5e537..459a3864887e 100644
--- a/clang/unittests/Frontend/CompilerInstanceTest.cpp
+++ b/clang/unittests/Frontend/CompilerInstanceTest.cpp
@@ -9,9 +9,12 @@
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Frontend/CompilerInvocation.h"
+#include "clang/Frontend/FrontendActions.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "gtest/gtest.h"
@@ -97,4 +100,52 @@ TEST(CompilerInstance, AllowDiagnosticLogWithUnownedDiagnosticConsumer) {
   ASSERT_EQ(DiagnosticOutput, "error: expected no crash\n");
 }
 
+TEST(CompilerInstance, MultipleInputsCleansFileIDs) {
+  auto VFS = makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
+  VFS->addFile("a.cc", /*ModificationTime=*/{},
+               MemoryBuffer::getMemBuffer(R"cpp(
+      #include "a.h"
+      )cpp"));
+  // Paddings of `void foo();` in the sources below are "important". We're
+  // testing against source locations from previous compilations colliding.
+  // Hence the `unused` variable in `b.h` needs to be within `#pragma clang
+  // diagnostic` block from `a.h`.
+  VFS->addFile("a.h", /*ModificationTime=*/{}, MemoryBuffer::getMemBuffer(R"cpp(
+      #include "b.h"
+      #pragma clang diagnostic push
+      #pragma clang diagnostic warning "-Wunused"
+      void foo();
+      #pragma clang diagnostic pop
+      )cpp"));
+  VFS->addFile("b.h", /*ModificationTime=*/{}, MemoryBuffer::getMemBuffer(R"cpp(
+      void foo(); void foo(); void foo(); void foo();
+      inline void foo() { int unused = 2; }
+      )cpp"));
+
+  DiagnosticOptions DiagOpts;
+  IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
+      CompilerInstance::createDiagnostics(*VFS, DiagOpts);
+
+  CreateInvocationOptions CIOpts;
+  CIOpts.Diags = Diags;
+
+  const char *Args[] = {"clang", "-xc++", "a.cc"};
+  std::shared_ptr<CompilerInvocation> CInvok =
+      createInvocation(Args, std::move(CIOpts));
+  ASSERT_TRUE(CInvok) << "could not create compiler invocation";
+
+  CompilerInstance Instance(std::move(CInvok));
+  Instance.setDiagnostics(Diags.get());
+  Instance.createFileManager(VFS);
+
+  // Run once for `a.cc` and then for `a.h`. This makes sure we get the same
+  // file ID for `b.h` in the second run as `a.h` from first run.
+  const auto &OrigInputKind = Instance.getFrontendOpts().Inputs[0].getKind();
+  Instance.getFrontendOpts().Inputs.emplace_back("a.h", OrigInputKind);
+
+  SyntaxOnlyAction Act;
+  EXPECT_TRUE(Instance.ExecuteAction(Act)) << "Failed to execute action";
+  EXPECT_FALSE(Diags->hasErrorOccurred());
+  EXPECT_EQ(Diags->getNumWarnings(), 0u);
+}
 } // anonymous namespace

From db8d34db26e9ea92c08d6e813eca9cce40c48478 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 12 Jun 2025 10:04:08 +0100
Subject: [PATCH 0108/1322] [VPlan] Set branch weight metadata on middle term
 in VPlan (NFC) (#143035)

Manage branch weights for the BranchOnCond in the middle block in VPlan.
This requires updating VPInstruction to inherit from VPIRMetadata, which
in general makes sense as there are a number of opcodes that could take
metadata.

There are other branches (part of the skeleton) that also need branch
weights adding.

PR: https://github.com/llvm/llvm-project/pull/143035
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 48 ++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 53 ++++++++++---------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  6 ++-
 3 files changed, 62 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d23611183639..93ab3353a296 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7273,6 +7273,33 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
 }
 
+/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
+/// BranchOnCond recipe.
+static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
+                                              Loop *OrigLoop) {
+  // 4. Adjust branch weight of the branch in the middle block.
+  Instruction *LatchTerm = OrigLoop->getLoopLatch()->getTerminator();
+  if (!hasBranchWeightMD(*LatchTerm))
+    return;
+
+  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
+  auto *MiddleTerm =
+      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
+  // Only add branch metadata if there is a (conditional) terminator.
+  if (!MiddleTerm)
+    return;
+
+  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
+         "must have a BranchOnCond");
+  // Assume that `Count % VectorTripCount` is equally distributed.
+  unsigned TripCount = Plan.getUF() * VF.getKnownMinValue();
+  assert(TripCount > 0 && "trip count should not be zero");
+  MDBuilder MDB(LatchTerm->getContext());
+  MDNode *BranchWeights =
+      MDB.createBranchWeights({1, TripCount - 1}, /*IsExpected=*/false);
+  MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
+}
+
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
@@ -7295,11 +7322,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan,
                                             *Legal->getWidestInductionType());
-  // Retrieve and store the middle block before dissolving regions. Regions are
-  // dissolved after optimizing for VF and UF, which completely removes unneeded
-  // loop regions first.
-  VPBasicBlock *MiddleVPBB =
-      BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
+
+  addBranchWeightToMiddleTerminator(BestVPlan, BestVF, OrigLoop);
   VPlanTransforms::dissolveLoopRegions(BestVPlan);
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7442,20 +7466,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   ILV.printDebugTracesAtEnd();
 
-  // 4. Adjust branch weight of the branch in the middle block.
-  if (HeaderVPBB) {
-    auto *MiddleTerm =
-        cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
-    if (MiddleTerm->isConditional() &&
-        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-      // Assume that `Count % VectorTripCount` is equally distributed.
-      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-      assert(TripCount > 0 && "trip count should not be zero");
-      const uint32_t Weights[] = {1, TripCount - 1};
-      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
-    }
-  }
-
   return ExpandedSCEVs;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index acc861b99197..468284168e9c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -882,11 +882,39 @@ protected:
   unsigned getUnrollPart(VPUser &U) const;
 };
 
+/// Helper to manage IR metadata for recipes. It filters out metadata that
+/// cannot be propagated.
+class VPIRMetadata {
+  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
+
+public:
+  VPIRMetadata() {}
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I.
+  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
+  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
+
+  /// Copy constructor for cloning.
+  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
+
+  /// Add all metadata to \p I.
+  void applyMetadata(Instruction &I) const;
+
+  void addMetadata(unsigned Kind, MDNode *Node) {
+    Metadata.emplace_back(Kind, Node);
+  }
+};
+
 /// This is a concrete Recipe that models a single VPlan-level instruction.
 /// While as any Recipe it may generate a sequence of IR instructions when
 /// executed, these instructions would always form a single-def expression as
 /// the VPInstruction is also a single def-use vertex.
 class VPInstruction : public VPRecipeWithIRFlags,
+                      public VPIRMetadata,
                       public VPUnrollPartAccessor<1> {
   friend class VPlanSlp;
 
@@ -976,7 +1004,7 @@ public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
                 const Twine &Name = "")
       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
-        Opcode(Opcode), Name(Name.str()) {}
+        VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                 const VPIRFlags &Flags, DebugLoc DL = {},
@@ -1268,29 +1296,6 @@ protected:
   const VPRecipeBase *getAsRecipe() const override { return this; }
 };
 
-/// Helper to manage IR metadata for recipes. It filters out metadata that
-/// cannot be propagated.
-class VPIRMetadata {
-  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
-
-public:
-  VPIRMetadata() {}
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I.
-  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
-  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
-
-  /// Copy constructor for cloning.
-  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
-
-  /// Add all metadata to \p I.
-  void applyMetadata(Instruction &I) const;
-};
-
 /// VPWidenRecipe is a recipe for producing a widened instruction using the
 /// opcode and operands of the recipe. This recipe covers most of the
 /// traditional vectorization cases where each recipe transforms into a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 62b99d98a2b5..f5a2533727b3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -410,7 +410,7 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                              const VPIRFlags &Flags, DebugLoc DL,
                              const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      Opcode(Opcode), Name(Name.str()) {
+      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
 }
@@ -591,7 +591,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case VPInstruction::BranchOnCond: {
     Value *Cond = State.get(getOperand(0), VPLane(0));
-    return createCondBranch(Cond, getParent(), State);
+    auto *Br = createCondBranch(Cond, getParent(), State);
+    applyMetadata(*Br);
+    return Br;
   }
   case VPInstruction::BranchOnCount: {
     // First create the compare.

From 2a27c059eccd96b6e46464dbdf69fd2f6237a56c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 10:46:08 +0100
Subject: [PATCH 0109/1322] [X86] Use BSR passthrough behaviour to fold (CMOV
 (BSR ?, X), Y, (X == 0)) -> (BSR Y, X) (#143662)

Make use of targets that support BSR "pass through behaviour" on a zero input to remove a CMOV thats performing the same function

BSF will be a trickier patch as we need to make sure it works with the "REP BSF" hack in X86MCInstLower
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++++++++++
 llvm/test/CodeGen/X86/bsr.ll            | 10 ++++------
 llvm/test/CodeGen/X86/pr40090.ll        | 11 ++++-------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b0553aa4b819..f0fbf55e97be 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49398,6 +49398,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
   //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
   // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
   //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
+  // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
+  // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
   if ((CC == X86::COND_NE || CC == X86::COND_E) &&
       Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
     SDValue Add = TrueOp;
@@ -49406,6 +49408,14 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
     if (CC == X86::COND_E)
       std::swap(Add, Const);
 
+    // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
+    if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
+        Add.getResNo() == 0 && Add.hasOneUse() &&
+        Add.getOperand(1) == Cond.getOperand(0)) {
+      return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
+                         Add.getOperand(1));
+    }
+
     // We might have replaced the constant in the cmov with the LHS of the
     // compare. If so change it to the RHS of the compare.
     if (Const == Cond.getOperand(0))
diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll
index 1247b3ec5932..fbca4af425ea 100644
--- a/llvm/test/CodeGen/X86/bsr.ll
+++ b/llvm/test/CodeGen/X86/bsr.ll
@@ -162,9 +162,8 @@ define i32 @cmov_bsr32(i32 %x, i32 %y) nounwind {
 ;
 ; X64-LABEL: cmov_bsr32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $63, %eax
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    bsrl %edi, %eax
-; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
   %2 = xor i32 %1, 31
@@ -188,8 +187,8 @@ define i32 @cmov_bsr32_undef(i32 %x, i32 %y) nounwind {
 ;
 ; X64-LABEL: cmov_bsr32_undef:
 ; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    bsrl %edi, %eax
-; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
   %2 = xor i32 %1, 31
@@ -239,9 +238,8 @@ define i64 @cmov_bsr64(i64 %x, i64 %y) nounwind {
 ;
 ; X64-LABEL: cmov_bsr64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $127, %eax
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    bsrq %rdi, %rax
-; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
   %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
   %2 = xor i64 %1, 63
@@ -279,8 +277,8 @@ define i64 @cmov_bsr64_undef(i64 %x, i64 %y) nounwind {
 ;
 ; X64-LABEL: cmov_bsr64_undef:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    bsrq %rdi, %rax
-; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
   %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
   %2 = xor i64 %1, 63
diff --git a/llvm/test/CodeGen/X86/pr40090.ll b/llvm/test/CodeGen/X86/pr40090.ll
index 24e957ac59f5..af933c950e11 100644
--- a/llvm/test/CodeGen/X86/pr40090.ll
+++ b/llvm/test/CodeGen/X86/pr40090.ll
@@ -4,10 +4,9 @@
 define i64 @foo(i64 %x, i64 %y) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    bsrq %rdi, %rax
-; CHECK-NEXT:    orq $64, %rax
+; CHECK-NEXT:    bsrq %rdi, %rcx
+; CHECK-NEXT:    orq $64, %rcx
 ; CHECK-NEXT:    bsrq %rsi, %rcx
-; CHECK-NEXT:    cmoveq %rax, %rcx
 ; CHECK-NEXT:    movl $63, %eax
 ; CHECK-NEXT:    subq %rcx, %rax
 ; CHECK-NEXT:    retq
@@ -25,11 +24,9 @@ define i64 @bar(i64 %x, i64 %y) {
 ; CHECK-LABEL: bar:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl $127, %ecx
-; CHECK-NEXT:    movl $127, %eax
-; CHECK-NEXT:    bsrq %rdi, %rax
-; CHECK-NEXT:    xorq $64, %rax
+; CHECK-NEXT:    bsrq %rdi, %rcx
+; CHECK-NEXT:    xorq $64, %rcx
 ; CHECK-NEXT:    bsrq %rsi, %rcx
-; CHECK-NEXT:    cmoveq %rax, %rcx
 ; CHECK-NEXT:    movl $63, %eax
 ; CHECK-NEXT:    subq %rcx, %rax
 ; CHECK-NEXT:    retq

From 1d1f9afe911c360b9505b5fd2c712cb112c8aa5f Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 12 Jun 2025 17:42:00 +0800
Subject: [PATCH 0110/1322] [C++20] [Modules] Treat directly imported internal
 partition unit as reachable

Close https://github.com/llvm/llvm-project/issues/143788

See the discussion for details.
---
 clang/lib/Sema/SemaLookup.cpp    | 23 ++++++++++++++++++-----
 clang/lib/Sema/SemaModule.cpp    | 13 +++++++------
 clang/test/Modules/pr143788.cppm | 28 ++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 11 deletions(-)
 create mode 100644 clang/test/Modules/pr143788.cppm

diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index eef134b15843..91822909f1fd 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -1978,6 +1978,8 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) {
   if (D->isModulePrivate())
     return false;
 
+  Module *DeclTopModule = DeclModule->getTopLevelModule();
+
   // [module.reach]/p1
   //   A translation unit U is necessarily reachable from a point P if U is a
   //   module interface unit on which the translation unit containing P has an
@@ -1996,17 +1998,28 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) {
   //
   // Here we only check for the first condition. Since we couldn't see
   // DeclModule if it isn't (transitively) imported.
-  if (DeclModule->getTopLevelModule()->isModuleInterfaceUnit())
+  if (DeclTopModule->isModuleInterfaceUnit())
     return true;
 
-  // [module.reach]/p2
+  // [module.reach]/p1,2
+  //   A translation unit U is necessarily reachable from a point P if U is a
+  //   module interface unit on which the translation unit containing P has an
+  //   interface dependency, or the translation unit containing P imports U, in
+  //   either case prior to P
+  //
   //   Additional translation units on
   //   which the point within the program has an interface dependency may be
   //   considered reachable, but it is unspecified which are and under what
   //   circumstances.
-  //
-  // The decision here is to treat all additional tranditional units as
-  // unreachable.
+  Module *CurrentM = SemaRef.getCurrentModule();
+
+  // Directly imported module are necessarily reachable.
+  // Since we can't export import a module implementation partition unit, we
+  // don't need to count for Exports here.
+  if (CurrentM && CurrentM->getTopLevelModule()->Imports.count(DeclTopModule))
+    return true;
+
+  // Then we treat all module implementation partition unit as unreachable.
   return false;
 }
 
diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index 6c4df0aa35af..9fcaad48d305 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -712,7 +712,13 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc,
       Mod->Kind == Module::ModuleKind::ModulePartitionImplementation) {
     Diag(ExportLoc, diag::err_export_partition_impl)
         << SourceRange(ExportLoc, Path.back().getLoc());
-  } else if (!ModuleScopes.empty() && !currentModuleIsImplementation()) {
+  } else if (ExportLoc.isValid() &&
+             (ModuleScopes.empty() || currentModuleIsImplementation())) {
+    // [module.interface]p1:
+    // An export-declaration shall inhabit a namespace scope and appear in the
+    // purview of a module interface unit.
+    Diag(ExportLoc, diag::err_export_not_in_module_interface);
+  } else if (!ModuleScopes.empty()) {
     // Re-export the module if the imported module is exported.
     // Note that we don't need to add re-exported module to Imports field
     // since `Exports` implies the module is imported already.
@@ -720,11 +726,6 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc,
       getCurrentModule()->Exports.emplace_back(Mod, false);
     else
       getCurrentModule()->Imports.insert(Mod);
-  } else if (ExportLoc.isValid()) {
-    // [module.interface]p1:
-    // An export-declaration shall inhabit a namespace scope and appear in the
-    // purview of a module interface unit.
-    Diag(ExportLoc, diag::err_export_not_in_module_interface);
   }
 
   return Import;
diff --git a/clang/test/Modules/pr143788.cppm b/clang/test/Modules/pr143788.cppm
new file mode 100644
index 000000000000..5ae36d8d0e85
--- /dev/null
+++ b/clang/test/Modules/pr143788.cppm
@@ -0,0 +1,28 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/P.cppm -emit-module-interface -o %t/P.pcm
+// RUN: %clang_cc1 -std=c++20 %t/I.cpp -fmodule-file=M:P=%t/P.pcm -fmodule-file=M=%t/M.pcm -fsyntax-only -verify
+
+//--- H.hpp
+struct S{};
+
+//--- M.cppm
+export module M;
+
+
+//--- P.cppm
+module;
+#include "H.hpp"
+module M:P;
+
+using T = S;
+
+//--- I.cpp
+// expected-no-diagnostics
+module M;
+import :P;
+
+T f() { return {}; }

From 8e4fdff6f02161d878a63900abb35aaa32ff85e9 Mon Sep 17 00:00:00 2001
From: Omair Javaid <omair.javaid@linaro.org>
Date: Thu, 12 Jun 2025 14:48:13 +0500
Subject: [PATCH 0111/1322] [X86] Update tailcc-ssp.ll assertions using
 update_llc_test_checks.py (#143500)

The assertions in llvm/test/CodeGen/X86/tailcc-ssp.ll were outdated. The
initial comment indicated they were generated with
`utils/update_llc_test_checks.py UTC_ARGS: --version 5`, but this was
not accurate based on the file's content.

Running `utils/update_llc_test_checks.py` regenerated the assertions,
aligning them with the current `llc` output.
This commit ensures that the test's claimed behavior accurately reflects
the actual `llc` output, even though the tests were already passing.

This was identified by @efriedma-quic during review of #136290.

Submitting a separate PR to make sure these changes stay isolated.
---
 llvm/test/CodeGen/X86/tailcc-ssp.ll | 55 ++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/X86/tailcc-ssp.ll b/llvm/test/CodeGen/X86/tailcc-ssp.ll
index 5211e4fe9eef..7ea5dd49f024 100644
--- a/llvm/test/CodeGen/X86/tailcc-ssp.ll
+++ b/llvm/test/CodeGen/X86/tailcc-ssp.ll
@@ -78,7 +78,7 @@ define void @tailcall_unrelated_frame() sspreq {
 ; WINDOWS-NEXT:    callq __security_check_cookie
 ; WINDOWS-NEXT:    int3
 ; WINDOWS-NEXT:    .seh_endproc
-
+;
 ; LINUX-LABEL: tailcall_unrelated_frame:
 ; LINUX:       # %bb.0:
 ; LINUX-NEXT:    pushq %rax
@@ -97,6 +97,7 @@ define void @tailcall_unrelated_frame() sspreq {
 ; LINUX-NEXT:    .cfi_def_cfa_offset 16
 ; LINUX-NEXT:    callq __stack_chk_fail@PLT
 
+
   call void @bar()
   tail call void @bar()
   ret void
@@ -105,18 +106,48 @@ define void @tailcall_unrelated_frame() sspreq {
 declare void @callee()
 define void @caller() sspreq {
 ; WINDOWS-LABEL: caller:
-; WINDOWS: callq   callee
-; WINDOWS: callq   callee
-; WINDOWS: cmpq    __security_cookie(%rip), %rcx
-; WINDOWS: jne
-; WINDOWS: callq   __security_check_cookie
-
+; WINDOWS:       # %bb.0:
+; WINDOWS-NEXT:    subq $40, %rsp
+; WINDOWS-NEXT:    .seh_stackalloc 40
+; WINDOWS-NEXT:    .seh_endprologue
+; WINDOWS-NEXT:    movq __security_cookie(%rip), %rax
+; WINDOWS-NEXT:    xorq %rsp, %rax
+; WINDOWS-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; WINDOWS-NEXT:    callq callee
+; WINDOWS-NEXT:    callq callee
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; WINDOWS-NEXT:    xorq %rsp, %rcx
+; WINDOWS-NEXT:    cmpq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    jne .LBB2_2
+; WINDOWS-NEXT:  # %bb.1:
+; WINDOWS-NEXT:    .seh_startepilogue
+; WINDOWS-NEXT:    addq $40, %rsp
+; WINDOWS-NEXT:    .seh_endepilogue
+; WINDOWS-NEXT:    retq
+; WINDOWS-NEXT:  .LBB2_2:
+; WINDOWS-NEXT:    callq __security_check_cookie
+; WINDOWS-NEXT:    int3
+; WINDOWS-NEXT:    .seh_endproc
+;
 ; LINUX-LABEL: caller:
-; LINUX: callq   callee@PLT
-; LINUX: callq   callee@PLT
-; LINUX: cmpq
-; LINUX: jne
-; LINUX: callq   __stack_chk_fail@PLT
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    pushq %rax
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    movq %fs:40, %rax
+; LINUX-NEXT:    movq %rax, (%rsp)
+; LINUX-NEXT:    callq callee@PLT
+; LINUX-NEXT:    callq callee@PLT
+; LINUX-NEXT:    movq %fs:40, %rax
+; LINUX-NEXT:    cmpq (%rsp), %rax
+; LINUX-NEXT:    jne .LBB2_2
+; LINUX-NEXT:  # %bb.1: # %SP_return
+; LINUX-NEXT:    popq %rax
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    retq
+; LINUX-NEXT:  .LBB2_2: # %CallStackCheckFailBlk
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    callq __stack_chk_fail@PLT
+
 
   tail call void @callee()
   call void @callee()

From 3e5d50f9c61bb266ab17919ab5209c7b08520aff Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr@nvidia.com>
Date: Thu, 12 Jun 2025 15:20:39 +0530
Subject: [PATCH 0112/1322] [NVPTX] Add cta_group support to TMA G2S intrinsics
 (#143178)

This patch extends the TMA G2S intrinsics with the
support for cta_group::1/2 available from Blackwell onwards.
The existing intrinsics are auto-upgraded with a default
value of '0' for the `cta_group` flag operand.

* lit tests are added for all combinations of the newer variants.
* Negative tests are added to validate the error-handling
   when the value of the cta_group flag falls out-of-range.
* The generated PTX is verified with a 12.8 ptxas executable.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
---
 llvm/docs/NVPTXUsage.rst                      |  32 +-
 llvm/include/llvm/IR/IntrinsicsNVVM.td        |  32 +-
 llvm/include/llvm/IR/NVVMIntrinsicUtils.h     |   9 +
 llvm/lib/IR/AutoUpgrade.cpp                   | 104 ++++-
 .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp   |  19 +
 .../NVPTX/MCTargetDesc/NVPTXInstPrinter.h     |   1 +
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp   |  19 +-
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      |  17 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h        |   8 +
 .../Assembler/auto_upgrade_nvvm_intrinsics.ll |  16 +-
 .../NVPTX/cp-async-bulk-tensor-g2s-1cta.ll    | 435 ++++++++++++++++++
 .../NVPTX/cp-async-bulk-tensor-g2s-2cta.ll    | 435 ++++++++++++++++++
 .../NVPTX/cp-async-bulk-tensor-g2s-invalid.ll |  15 +
 13 files changed, 1078 insertions(+), 64 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index d51686c0b830..abd7ca545364 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -1016,7 +1016,7 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
@@ -1034,18 +1034,26 @@ source tensor is preserved at the destination. The dimension of the
 tensor data ranges from 1d to 5d with the coordinates specified
 by the ``i32 %d0 ... i32 %d4`` arguments.
 
-* The last two arguments to these intrinsics are boolean flags
-  indicating support for cache_hint and/or multicast modifiers.
-  These flag arguments must be compile-time constants. The backend
-  looks through these flags and lowers the intrinsics appropriately.
+* The last three arguments to these intrinsics are flags
+  indicating support for multicast, cache_hint and cta_group::1/2
+  modifiers. These flag arguments must be compile-time constants.
+  The backend looks through these flags and lowers the intrinsics
+  appropriately.
 
-* The Nth argument (denoted by ``i1 flag_ch``) when set, indicates
+* The argument denoted by ``i1 %flag_ch`` when set, indicates
   a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
   variant of the PTX instruction.
 
-* The [N-1]th argument (denoted by ``i1 flag_mc``) when set, indicates
-  the presence of a multicast mask (``i16 %mc``) and generates the PTX
-  instruction with the ``.multicast::cluster`` modifier.
+* The argument denoted by ``i1 %flag_mc`` when set, indicates
+  the presence of a multicast mask (``i16 %mc``) and generates
+  the PTX instruction with the ``.multicast::cluster`` modifier.
+
+* The argument denoted by ``i32 %flag_cta_group`` takes values within
+  the range [0, 3) i.e. {0,1,2}. When the value of ``%flag_cta_group``
+  is not within the range, it may raise an error from the Verifier.
+  The default value is '0' with no cta_group modifier in the
+  instruction. The values of '1' and '2' lower to ``cta_group::1``
+  and ``cta_group::2`` variants of the PTX instruction respectively.
 
 For more information, refer PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
@@ -1058,7 +1066,7 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
 
@@ -1074,8 +1082,8 @@ are unrolled into a single dimensional column at the destination. In this
 mode, the tensor has to be at least three-dimensional. Along with the tensor
 coordinates, im2col offsets are also specified (denoted by
 ``i16 im2col0...i16 %im2col2``). The number of im2col offsets is two less
-than the number of dimensions of the tensor operation. The last two arguments
-to these intrinsics are boolean flags, with the same functionality as described
+than the number of dimensions of the tensor operation. The last three arguments
+to these intrinsics are flags, with the same functionality as described
 in the ``tile`` mode intrinsics above.
 
 For more information, refer PTX ISA
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 8c8e778b5706..4efdff71c016 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2020,20 +2020,26 @@ foreach dim = 1...5 in {
     defvar num_im2col_offsets = !if(is_im2col, !add(dim, -2), 0);
     defvar im2col_offsets_args = !listsplat(llvm_i16_ty, num_im2col_offsets);
 
+    defvar g2s_params = !listconcat(
+                          [llvm_shared_cluster_ptr_ty, // dst_ptr
+                           llvm_shared_ptr_ty,  // mbarrier_ptr
+                           llvm_ptr_ty],        // tensormap_ptr
+                          tensor_dim_args,      // actual tensor dims
+                          im2col_offsets_args,  // im2col offsets
+                          [llvm_i16_ty,         // cta_mask
+                           llvm_i64_ty]);       // cache_hint
+    defvar g2s_flags = [llvm_i1_ty,             // Flag for cta_mask
+                        llvm_i1_ty,             // Flag for cache_hint
+                        llvm_i32_ty];           // Flag for cta_group
+    defvar cta_group_idx = !add(
+                             !size(g2s_params),
+                             !sub(!size(g2s_flags), 1));
+    defvar g2s_props = [IntrConvergent,
+                        WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
+                        // Allowed values for cta_group are {0,1,2} i.e [0, 3).
+                        Range<ArgIndex<cta_group_idx>, 0, 3>];
     def int_nvvm_cp_async_bulk_tensor_g2s_ # mode # _ # dim # d :
-      DefaultAttrsIntrinsicFlags<[],
-          !listconcat([llvm_shared_cluster_ptr_ty,  // dst_shared_cluster_ptr
-                       llvm_shared_ptr_ty,          // mbarrier_smem_ptr
-                       llvm_ptr_ty],                // tensormap_ptr
-                      tensor_dim_args,              // actual tensor dims
-                      im2col_offsets_args,          // im2col offsets
-                      [llvm_i16_ty,                 // cta_mask
-                       llvm_i64_ty]),               // cache_hint
-          [llvm_i1_ty,                              // Flag for cta_mask
-           llvm_i1_ty],                             // Flag for cache_hint
-          [IntrConvergent,
-           WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
-           NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>]>;
+      DefaultAttrsIntrinsicFlags<[], g2s_params, g2s_flags, g2s_props>;
 
     def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d :
       DefaultAttrsIntrinsicFlags<[],
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
index ce794e257363..737610b73b08 100644
--- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -38,6 +38,15 @@ enum class TMAReductionOp : uint8_t {
   XOR = 7,
 };
 
+// Enum to represent the cta_group::1 and
+// cta_group::2 variants in TMA/TCGEN05 family of
+// PTX instructions.
+enum class CTAGroupKind : uint8_t {
+  CG_NONE = 0, // default with no cta_group modifier
+  CG_1 = 1,    // cta_group::1 modifier
+  CG_2 = 2,    // cta_group::2 modifier
+};
+
 inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
   switch (IntrinsicID) {
   case Intrinsic::nvvm_f2i_rm_ftz:
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index a0886776ff93..6e7254ec3e31 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -945,6 +945,53 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
   return false; // No other 'arm.*', 'aarch64.*'.
 }
 
+static Intrinsic::ID shouldUpgradeNVPTXTMAG2SIntrinsics(Function *F,
+                                                        StringRef Name) {
+  if (Name.consume_front("cp.async.bulk.tensor.g2s.")) {
+    Intrinsic::ID ID =
+        StringSwitch<Intrinsic::ID>(Name)
+            .Case("im2col.3d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
+            .Case("im2col.4d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
+            .Case("im2col.5d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
+            .Case("tile.1d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
+            .Case("tile.2d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
+            .Case("tile.3d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
+            .Case("tile.4d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
+            .Case("tile.5d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
+            .Default(Intrinsic::not_intrinsic);
+
+    if (ID == Intrinsic::not_intrinsic)
+      return ID;
+
+    // These intrinsics may need upgrade for two reasons:
+    // (1) When the address-space of the first argument is shared[AS=3]
+    //     (and we upgrade it to use shared_cluster address-space[AS=7])
+    if (F->getArg(0)->getType()->getPointerAddressSpace() ==
+        NVPTXAS::ADDRESS_SPACE_SHARED)
+      return ID;
+
+    // (2) When there are only two boolean flag arguments at the end:
+    //
+    // The last three parameters of the older version of these
+    // intrinsics are: arg1, arg2, .. i64 ch, i1 mc_flag, i1 ch_flag
+    //
+    // The newer version reads as:
+    // arg1, arg2, .. i64 ch, i1 mc_flag, i1 ch_flag, i32 cta_group_flag
+    //
+    // So, when the type of the [N-3]rd argument is "not i1", then
+    // it is the older version and we need to upgrade.
+    size_t FlagStartIndex = F->getFunctionType()->getNumParams() - 3;
+    Type *ArgType = F->getFunctionType()->getParamType(FlagStartIndex);
+    if (!ArgType->isIntegerTy(1))
+      return ID;
+  }
+
+  return Intrinsic::not_intrinsic;
+}
+
 static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F,
                                                               StringRef Name) {
   if (Name.consume_front("mapa.shared.cluster"))
@@ -959,22 +1006,6 @@ static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F,
                   Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
             .Case("shared.cta.to.cluster",
                   Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
-            .Case("tensor.g2s.im2col.3d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
-            .Case("tensor.g2s.im2col.4d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
-            .Case("tensor.g2s.im2col.5d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
-            .Case("tensor.g2s.tile.1d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
-            .Case("tensor.g2s.tile.2d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
-            .Case("tensor.g2s.tile.3d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
-            .Case("tensor.g2s.tile.4d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
-            .Case("tensor.g2s.tile.5d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
             .Default(Intrinsic::not_intrinsic);
 
     if (ID != Intrinsic::not_intrinsic)
@@ -1339,6 +1370,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
         return true;
       }
 
+      // Upgrade TMA copy G2S Intrinsics
+      IID = shouldUpgradeNVPTXTMAG2SIntrinsics(F, Name);
+      if (IID != Intrinsic::not_intrinsic) {
+        rename(F);
+        NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
+        return true;
+      }
+
       // The following nvvm intrinsics correspond exactly to an LLVM idiom, but
       // not to an intrinsic alone.  We expand them in UpgradeIntrinsicCall.
       //
@@ -4831,7 +4870,18 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     return;
   }
   case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
-  case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
+  case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster: {
+    // Create a new call with the correct address space.
+    SmallVector<Value *, 4> Args(CI->args());
+    Args[0] = Builder.CreateAddrSpaceCast(
+        Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    NewCall->takeName(CI);
+    CI->replaceAllUsesWith(NewCall);
+    CI->eraseFromParent();
+    return;
+  }
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
@@ -4840,10 +4890,22 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
-    // Create a new call with the correct address space.
-    SmallVector<Value *, 4> Args(CI->args());
-    Args[0] = Builder.CreateAddrSpaceCast(
-        Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
+    SmallVector<Value *, 16> Args(CI->args());
+
+    // Create AddrSpaceCast to shared_cluster if needed.
+    // This handles case (1) in shouldUpgradeNVPTXTMAG2SIntrinsics().
+    unsigned AS = CI->getArgOperand(0)->getType()->getPointerAddressSpace();
+    if (AS == NVPTXAS::ADDRESS_SPACE_SHARED)
+      Args[0] = Builder.CreateAddrSpaceCast(
+          Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
+
+    // Attach the flag argument for cta_group, with a
+    // default value of 0. This handles case (2) in
+    // shouldUpgradeNVPTXTMAG2SIntrinsics().
+    size_t NumArgs = CI->arg_size();
+    Value *FlagArg = CI->getArgOperand(NumArgs - 3);
+    if (!FlagArg->getType()->isIntegerTy(1))
+      Args.push_back(ConstantInt::get(Builder.getInt32Ty(), 0));
 
     NewCall = Builder.CreateCall(NewFn, Args);
     NewCall->takeName(CI);
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index b4616b64bad1..732950deca9f 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -437,3 +437,22 @@ void NVPTXInstPrinter::printTmaReductionMode(const MCInst *MI, int OpNum,
   llvm_unreachable(
       "Invalid Reduction Op in printCpAsyncBulkTensorReductionMode");
 }
+
+void NVPTXInstPrinter::printCTAGroup(const MCInst *MI, int OpNum,
+                                     raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  using CGTy = nvvm::CTAGroupKind;
+
+  switch (static_cast<CGTy>(MO.getImm())) {
+  case CGTy::CG_NONE:
+    O << "";
+    return;
+  case CGTy::CG_1:
+    O << ".cta_group::1";
+    return;
+  case CGTy::CG_2:
+    O << ".cta_group::2";
+    return;
+  }
+  llvm_unreachable("Invalid cta_group in printCTAGroup");
+}
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
index a2dd772cd86d..f73af7a3f2c6 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -51,6 +51,7 @@ public:
   void printProtoIdent(const MCInst *MI, int OpNum, raw_ostream &O);
   void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O);
   void printTmaReductionMode(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printCTAGroup(const MCInst *MI, int OpNum, raw_ostream &O);
 };
 
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 32223bf3d601..a20099788d09 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2556,19 +2556,25 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
   // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
   // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
   // multicast, cache_hint,
-  // multicast_flag, cache_hint_flag}
+  // multicast_flag, cache_hint_flag, cta_group_flag}
   // NumOperands = {Chain, IID} + {Actual intrinsic args}
-  //             = {2}          + {7 + dims + im2col_offsets}
+  //             = {2}          + {8 + dims + im2col_offsets}
   size_t NumOps = N->getNumOperands();
   size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
-                            : (NumOps - 9);
+                            : (NumOps - 10);
   // Offsets is always 'NumDims - 2' and only for im2col mode
   size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
-  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
-  bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
+  bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
+  bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
   size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
   size_t MultiCastIdx = NumBaseArgs + 2;         // for Chain and IID
 
+  unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
+  if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
+    report_fatal_error(
+        formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
+                Subtarget->getSmVersion()));
+
   SDLoc DL(N);
   SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
 
@@ -2580,6 +2586,9 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
   if (IsCacheHint)
     Ops.push_back(N->getOperand(MultiCastIdx + 1));
 
+  // Flag for CTA Group
+  Ops.push_back(getI32Imm(CTAGroupVal, DL));
+
   // Finally, the chain operand
   Ops.push_back(N->getOperand(0));
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 83d7defe6d9a..f52ff39c3e1a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -578,10 +578,14 @@ class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> {
                      # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
 }
 
+def CTAGroupFlags : Operand<i32> {
+  let PrintMethod = "printCTAGroup";
+}
+
 multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> {
   defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
   defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
-  defvar asm_str_default = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
+  defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
   defvar rc = !if(is_shared32, Int32Regs, Int64Regs);
 
   defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
@@ -595,19 +599,22 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode>
     !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
 
   def "" : NVPTXInst<(outs),
-            !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag),
+            !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)),
             !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
   def _MC : NVPTXInst<(outs),
-                  !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)),
+                  !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag,
+                       (ins Int16Regs:$mc, CTAGroupFlags:$cg)),
                   !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
   def _CH : NVPTXInst<(outs),
-                  !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
+                  !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag,
+                       (ins Int64Regs:$ch, CTAGroupFlags:$cg)),
                   !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
   def _MC_CH : NVPTXInst<(outs),
-                     !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)),
+                     !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag,
+                          (ins Int16Regs:$mc, Int64Regs:$ch, CTAGroupFlags:$cg)),
                      !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;"), []>,
                      Requires<[hasPTX<80>, hasSM<90>]>;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 5136b1ee2850..d2eae4882682 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -117,6 +117,14 @@ public:
     return HasTcgen05 && PTXVersion >= 86;
   }
 
+  // TMA G2S copy with cta_group::1/2 support
+  bool hasCpAsyncBulkTensorCTAGroupSupport() const {
+    // TODO: Update/tidy-up after the family-conditional support arrives
+    return ((FullSmVersion == 1001 || FullSmVersion == 1011) &&
+            PTXVersion >= 86) ||
+           (FullSmVersion == 1031 && PTXVersion >= 88);
+  }
+
   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
   // terminates a basic block. Instead, it would assume that control flow
   // continued to the next instruction. The next instruction could be in the
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index b7bdca42d559..a17f11a680aa 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -307,9 +307,9 @@ define void @nvvm_cp_async_bulk_intrinsics(ptr addrspace(3) %dst, ptr addrspace(
 
 ; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_im2col
 define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false, i32 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 0, i1 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 0, i1 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 0, i1 0)
@@ -318,11 +318,11 @@ define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrs
 
 ; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_tile
 define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false, i32 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 0, i1 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 0, i1 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 0, i1 0)
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
new file mode 100644
index 000000000000..5cfa25dfe55f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
@@ -0,0 +1,435 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d
+define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d
+define void @test_cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<3>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d
+define void @test_cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d
+define void @test_cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d
+define void @test_cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d
+define void @test_cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d
+define void @test_cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<4>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d
+define void @test_cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
new file mode 100644
index 000000000000..a7e6bec6aef1
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
@@ -0,0 +1,435 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d
+define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d
+define void @test_cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<3>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d
+define void @test_cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d
+define void @test_cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d
+define void @test_cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d
+define void @test_cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d
+define void @test_cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<4>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d
+define void @test_cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll
new file mode 100644
index 000000000000..1c35fbead389
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll
@@ -0,0 +1,15 @@
+; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_100a -o /dev/null 2>&1 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) writeonly, ptr addrspace(3), ptr readonly, i32, i16, i64, i1 immarg, i1 immarg, i32 immarg range(i32 0, 3))
+
+define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) {
+  ; CHECK: immarg value 3 out of range [0, 3)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 3)
+
+  ; CHECK: immarg value -1 out of range [0, 3)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 -1)
+
+  ret void
+}

From a8c6fb4cb8e686f733e022afc549bc085d1558f4 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 11:53:32 +0200
Subject: [PATCH 0113/1322] [MemCpyOpt] Fix lifetime marker sizes in tests
 (NFC)

As pointed out in https://github.com/llvm/llvm-project/pull/143782,
these tests were specifying the size in bits instead of bytes.

In order to preserve the intent of the tests, add a use of %src,
which prevents stack-move optimization. These are supposed to test
the handling of scoped alias metadata in call slot optimization.
---
 .../test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll | 7 +++++--
 llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll         | 9 ++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
index 989049ab67a0..840a5172561d 100644
--- a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
+++ b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
@@ -1,17 +1,20 @@
 ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
 
+declare void @use(ptr)
+
 ; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains
 define i8 @test(i8 %input) {
   %tmp = alloca i8
   %dst = alloca i8
   %src = alloca i8
 ; CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]]
-  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4
+  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !4
   store i8 %input, ptr %src
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
-  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !4
+  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !4
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !4
   %ret_value = load i8, ptr %dst
+  call void @use(ptr %src)
   ret i8 %ret_value
 }
 
diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
index efdbdce401b7..601498e36a7a 100644
--- a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
@@ -1,9 +1,11 @@
 ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
 
+declare void @use(ptr)
+
 ; Make sure callslot optimization merges alias.scope metadata correctly when it merges instructions.
 ; Merging here naively generates:
 ;  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope !3
-;  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !0
+;  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !0
 ;   ...
 ;  !0 = !{!1}
 ;  !1 = distinct !{!1, !2, !"callee1: %a"}
@@ -18,12 +20,13 @@ define i8 @test(i8 %input) {
   %src = alloca i8
 ; NOTE: we're matching the full line and looking for the lack of !alias.scope here
 ; CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false)
-  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !3
+  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !3
   store i8 %input, ptr %src
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
-  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !3
+  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !3
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !3
   %ret_value = load i8, ptr %dst
+  call void @use(ptr %src)
   ret i8 %ret_value
 }
 

From 5987f1ee5cc59a05961156c04010ab0f3c857628 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me@antoniofrighetto.com>
Date: Thu, 12 Jun 2025 11:52:28 +0200
Subject: [PATCH 0114/1322] [InstCombine] Regenerate `narrow-switch.ll` test
 (NFC)

`narrow-switch.ll` test has been regenerated via latest UTC using
`--prefix-filecheck-ir-name _`, so as to avoid conflicts with
scripted variable names.
---
 .../Transforms/InstCombine/narrow-switch.ll   | 194 +++++++++++++-----
 1 file changed, 148 insertions(+), 46 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/narrow-switch.ll b/llvm/test/Transforms/InstCombine/narrow-switch.ll
index 05a30b910e5e..90f56a61fa41 100644
--- a/llvm/test/Transforms/InstCombine/narrow-switch.ll
+++ b/llvm/test/Transforms/InstCombine/narrow-switch.ll
@@ -1,15 +1,27 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name _ --version 5
 ; Vary legal integer types in data layout.
 ; RUN: opt < %s -passes=instcombine -S -data-layout=n32    | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32
 ; RUN: opt < %s -passes=instcombine -S -data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64
 
 define i32 @positive1(i64 %a) {
-; ALL-LABEL: @positive1(
-; ALL:         switch i32
-; ALL-NEXT:    i32 10, label %return
-; ALL-NEXT:    i32 100, label %sw.bb1
-; ALL-NEXT:    i32 1001, label %sw.bb2
+; ALL-LABEL: define i32 @positive1(
+; ALL-SAME: i64 [[A:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*]]:
+; ALL-NEXT:    [[TRUNC:%.*]] = trunc i64 [[A]] to i32
+; ALL-NEXT:    switch i32 [[TRUNC]], label %[[SW_DEFAULT:.*]] [
+; ALL-NEXT:      i32 10, label %[[RETURN:.*]]
+; ALL-NEXT:      i32 100, label %[[SW_BB1:.*]]
+; ALL-NEXT:      i32 1001, label %[[SW_BB2:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[SW_BB1]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_BB2]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_DEFAULT]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[RETURN]]:
+; ALL-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ]
+; ALL-NEXT:    ret i32 [[RETVAL_0]]
 ;
 entry:
   %and = and i64 %a, 4294967295
@@ -34,12 +46,24 @@ return:
 }
 
 define i32 @negative1(i64 %a) {
-; ALL-LABEL: @negative1(
-; ALL:         switch i32
-; ALL-NEXT:    i32 -10, label %return
-; ALL-NEXT:    i32 -100, label %sw.bb1
-; ALL-NEXT:    i32 -1001, label %sw.bb2
+; ALL-LABEL: define i32 @negative1(
+; ALL-SAME: i64 [[A:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*]]:
+; ALL-NEXT:    [[TRUNC:%.*]] = trunc i64 [[A]] to i32
+; ALL-NEXT:    switch i32 [[TRUNC]], label %[[SW_DEFAULT:.*]] [
+; ALL-NEXT:      i32 -10, label %[[RETURN:.*]]
+; ALL-NEXT:      i32 -100, label %[[SW_BB1:.*]]
+; ALL-NEXT:      i32 -1001, label %[[SW_BB2:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[SW_BB1]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_BB2]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_DEFAULT]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[RETURN]]:
+; ALL-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ]
+; ALL-NEXT:    ret i32 [[RETVAL_0]]
 ;
 entry:
   %or = or i64 %a, -4294967296
@@ -67,12 +91,24 @@ return:
 ; assertion.
 
 define i32 @trunc72to68(i72 %a) {
-; ALL-LABEL: @trunc72to68(
-; ALL:         switch i68
-; ALL-NEXT:    i68 10, label %return
-; ALL-NEXT:    i68 100, label %sw.bb1
-; ALL-NEXT:    i68 1001, label %sw.bb2
+; ALL-LABEL: define i32 @trunc72to68(
+; ALL-SAME: i72 [[A:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*]]:
+; ALL-NEXT:    [[TRUNC:%.*]] = trunc i72 [[A]] to i68
+; ALL-NEXT:    switch i68 [[TRUNC]], label %[[SW_DEFAULT:.*]] [
+; ALL-NEXT:      i68 10, label %[[RETURN:.*]]
+; ALL-NEXT:      i68 100, label %[[SW_BB1:.*]]
+; ALL-NEXT:      i68 1001, label %[[SW_BB2:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[SW_BB1]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_BB2]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_DEFAULT]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[RETURN]]:
+; ALL-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ]
+; ALL-NEXT:    ret i32 [[RETVAL_0]]
 ;
 entry:
   %and = and i72 %a, 295147905179352825855
@@ -103,15 +139,38 @@ return:
 ; because both are illegal.
 
 define void @trunc64to58(i64 %a) {
-; ALL-LABEL: @trunc64to58(
-; CHECK32:         switch i58
-; CHECK32-NEXT:    i58 0, label %sw.bb1
-; CHECK32-NEXT:    i58 18717182647723699, label %sw.bb2
+; CHECK32-LABEL: define void @trunc64to58(
+; CHECK32-SAME: i64 [[A:%.*]]) {
+; CHECK32-NEXT:  [[ENTRY:.*:]]
+; CHECK32-NEXT:    [[TMP0:%.*]] = trunc i64 [[A]] to i58
+; CHECK32-NEXT:    [[TMP1:%.*]] = and i58 [[TMP0]], 15
+; CHECK32-NEXT:    [[TRUNC:%.*]] = mul nuw i58 [[TMP1]], 18717182647723699
+; CHECK32-NEXT:    switch i58 [[TRUNC]], label %[[SW_DEFAULT:.*]] [
+; CHECK32-NEXT:      i58 0, label %[[SW_BB1:.*]]
+; CHECK32-NEXT:      i58 18717182647723699, label %[[SW_BB2:.*]]
 ; CHECK32-NEXT:    ]
-; CHECK64:         switch i64
-; CHECK64-NEXT:    i64 0, label %sw.bb1
-; CHECK64-NEXT:    i64 18717182647723699, label %sw.bb2
+; CHECK32:       [[SW_BB1]]:
+; CHECK32-NEXT:    br label %[[SW_DEFAULT]]
+; CHECK32:       [[SW_BB2]]:
+; CHECK32-NEXT:    br label %[[SW_DEFAULT]]
+; CHECK32:       [[SW_DEFAULT]]:
+; CHECK32-NEXT:    ret void
+;
+; CHECK64-LABEL: define void @trunc64to58(
+; CHECK64-SAME: i64 [[A:%.*]]) {
+; CHECK64-NEXT:  [[ENTRY:.*:]]
+; CHECK64-NEXT:    [[_TMP0:%.*]] = and i64 [[A]], 15
+; CHECK64-NEXT:    [[TMP0:%.*]] = mul nuw nsw i64 [[_TMP0]], 18717182647723699
+; CHECK64-NEXT:    switch i64 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+; CHECK64-NEXT:      i64 0, label %[[SW_BB1:.*]]
+; CHECK64-NEXT:      i64 18717182647723699, label %[[SW_BB2:.*]]
 ; CHECK64-NEXT:    ]
+; CHECK64:       [[SW_BB1]]:
+; CHECK64-NEXT:    br label %[[SW_DEFAULT]]
+; CHECK64:       [[SW_BB2]]:
+; CHECK64-NEXT:    br label %[[SW_DEFAULT]]
+; CHECK64:       [[SW_DEFAULT]]:
+; CHECK64-NEXT:    ret void
 ;
 entry:
   %tmp0 = and i64 %a, 15
@@ -136,18 +195,19 @@ sw.default:
 ; https://llvm.org/bugs/show_bug.cgi?id=31260
 
 define i8 @PR31260(i8 %x) {
-; ALL-LABEL: @PR31260(
-; ALL-NEXT:  entry:
-; ALL-NEXT:    [[T4:%.*]] = and i8 [[X:%.*]], 2
-; ALL-NEXT:    switch i8 [[T4]], label [[EXIT:%.*]] [
-; ALL-NEXT:    i8 0, label [[CASE126:%.*]]
-; ALL-NEXT:    i8 2, label [[CASE124:%.*]]
+; ALL-LABEL: define i8 @PR31260(
+; ALL-SAME: i8 [[X:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*:]]
+; ALL-NEXT:    [[T4:%.*]] = and i8 [[X]], 2
+; ALL-NEXT:    switch i8 [[T4]], label %[[EXIT:.*]] [
+; ALL-NEXT:      i8 0, label %[[CASE126:.*]]
+; ALL-NEXT:      i8 2, label %[[CASE124:.*]]
 ; ALL-NEXT:    ]
-; ALL:       exit:
+; ALL:       [[EXIT]]:
 ; ALL-NEXT:    ret i8 1
-; ALL:       case126:
+; ALL:       [[CASE126]]:
 ; ALL-NEXT:    ret i8 3
-; ALL:       case124:
+; ALL:       [[CASE124]]:
 ; ALL-NEXT:    ret i8 5
 ;
 entry:
@@ -169,12 +229,33 @@ case124:
 ; Make sure the arithmetic evaluation of the switch
 ; condition is evaluated on the original type
 define i32 @trunc32to16(i32 %a0) #0 {
-; ALL-LABEL: @trunc32to16(
-; ALL:         switch i16
-; ALL-NEXT:    i16 63, label %sw.bb
-; ALL-NEXT:    i16 1, label %sw.bb1
-; ALL-NEXT:    i16 100, label %sw.bb2
+; ALL-LABEL: define i32 @trunc32to16(
+; ALL-SAME: i32 [[A0:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*:]]
+; ALL-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+; ALL-NEXT:    [[XOR:%.*]] = lshr i32 [[A0]], 16
+; ALL-NEXT:    [[TMP0:%.*]] = trunc nuw i32 [[XOR]] to i16
+; ALL-NEXT:    [[TRUNC:%.*]] = xor i16 [[TMP0]], 15784
+; ALL-NEXT:    switch i16 [[TRUNC]], label %[[SW_EPILOG:.*]] [
+; ALL-NEXT:      i16 63, label %[[SW_BB:.*]]
+; ALL-NEXT:      i16 1, label %[[SW_BB1:.*]]
+; ALL-NEXT:      i16 100, label %[[SW_BB2:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[SW_BB]]:
+; ALL-NEXT:    store i32 90, ptr [[RETVAL]], align 4
+; ALL-NEXT:    br label %[[RETURN:.*]]
+; ALL:       [[SW_BB1]]:
+; ALL-NEXT:    store i32 91, ptr [[RETVAL]], align 4
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_BB2]]:
+; ALL-NEXT:    store i32 92, ptr [[RETVAL]], align 4
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_EPILOG]]:
+; ALL-NEXT:    store i32 113, ptr [[RETVAL]], align 4
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[RETURN]]:
+; ALL-NEXT:    [[RVAL:%.*]] = load i32, ptr [[RETVAL]], align 4
+; ALL-NEXT:    ret i32 [[RVAL]]
 ;
 entry:
   %retval = alloca i32, align 4
@@ -182,9 +263,9 @@ entry:
   %shr = lshr i32 %xor, 16
   %add = add i32 %shr, -917677090
   switch i32 %add, label %sw.epilog [
-    i32 -917677027, label %sw.bb
-    i32 -917677089, label %sw.bb1
-    i32 -917676990, label %sw.bb2
+  i32 -917677027, label %sw.bb
+  i32 -917677089, label %sw.bb1
+  i32 -917676990, label %sw.bb2
   ]
 
 sw.bb:                                            ; preds = %entry
@@ -219,11 +300,32 @@ declare i32 @goo()
 ; if original type is legal (i32 in this case)
 
 define void @PR29009() {
-; ALL-LABEL: @PR29009(
-; ALL:         switch i32
-; ALL-NEXT:    i32 0, label
-; ALL-NEXT:    i32 3, label
+; ALL-LABEL: define void @PR29009() {
+; ALL-NEXT:    br label %[[BB1:.*]]
+; ALL:       [[BB1]]:
+; ALL-NEXT:    [[TMP2:%.*]] = load volatile i32, ptr @njob, align 4
+; ALL-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP2]], 0
+; ALL-NEXT:    br i1 [[DOTNOT]], label %[[BB10:.*]], label %[[BB3:.*]]
+; ALL:       [[BB3]]:
+; ALL-NEXT:    [[TMP4:%.*]] = call i32 @goo()
+; ALL-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], 7
+; ALL-NEXT:    switch i32 [[TMP5]], label %[[BB6:.*]] [
+; ALL-NEXT:      i32 0, label %[[BB7:.*]]
+; ALL-NEXT:      i32 3, label %[[BB8:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[BB6]]:
+; ALL-NEXT:    store i32 6, ptr @a, align 4
+; ALL-NEXT:    br label %[[BB9:.*]]
+; ALL:       [[BB7]]:
+; ALL-NEXT:    store i32 1, ptr @a, align 4
+; ALL-NEXT:    br label %[[BB9]]
+; ALL:       [[BB8]]:
+; ALL-NEXT:    store i32 2, ptr @a, align 4
+; ALL-NEXT:    br label %[[BB9]]
+; ALL:       [[BB9]]:
+; ALL-NEXT:    br label %[[BB1]]
+; ALL:       [[BB10]]:
+; ALL-NEXT:    ret void
 ;
   br label %1
 
@@ -236,8 +338,8 @@ define void @PR29009() {
   %5 = call i32 @goo()
   %6 = and i32 %5, 7
   switch i32 %6, label %7 [
-    i32 0, label %8
-    i32 3, label %9
+  i32 0, label %8
+  i32 3, label %9
   ]
 
 ; <label>:7:                                      ; preds = %4

From 7ef77eb9984d1fb537a409cf4be89560fbb681fe Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Thu, 12 Jun 2025 11:09:09 +0100
Subject: [PATCH 0115/1322] [LV] Support scalable interleave groups for factors
 3,5,6 and 7 (#141865)

Currently the loop vectorizer can only vectorize interleave groups for
power-of-2 factors at scalable VFs by recursively interleaving
[de]interleave2 intrinsics.

However after https://github.com/llvm/llvm-project/pull/124825 and
#139893, we now have [de]interleave intrinsics for all factors up to 8,
which is enough to support all types of segmented loads and stores on
RISC-V.

Now that the interleaved access pass has been taught to lower these in
#139373 and #141512, this patch teaches the loop vectorizer to emit
these intrinsics for factors up to 8, which enables scalable
vectorization for non-power-of-2 factors.

As far as I'm aware, no in-tree target will vectorize a scalable
interelave group above factor 8 because the maximum interleave factor is
capped at 4 on AArch64 and 8 on RISC-V, and the
`-max-interleave-group-factor` CLI option defaults to 8, so the
recursive [de]interleaving code has been removed for now.

Factors of 3 with scalable VFs are also turned off in AArch64 since
there's no lowering for [de]interleave3 just yet either.
---
 llvm/include/llvm/Analysis/VectorUtils.h      |   6 +
 llvm/lib/Analysis/VectorUtils.cpp             |  24 +
 .../AArch64/AArch64TargetTransformInfo.cpp    |   7 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |  14 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  63 +-
 .../AArch64/sve-interleaved-accesses.ll       |  52 +-
 .../sve-interleaved-masked-accesses.ll        |  84 +--
 .../RISCV/interleaved-accesses.ll             | 626 +++++++++---------
 8 files changed, 418 insertions(+), 458 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 52fe6f6cf43f..53ba1e8f7779 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -176,6 +176,12 @@ LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(
 LLVM_ABI Intrinsic::ID
 getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI);
 
+/// Returns the corresponding llvm.vector.interleaveN intrinsic for factor N.
+LLVM_ABI Intrinsic::ID getInterleaveIntrinsicID(unsigned Factor);
+
+/// Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
+LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor);
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 968fd2f8c5d7..63fccee63c0a 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -240,6 +240,30 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
   return Intrinsic::not_intrinsic;
 }
 
+struct InterleaveIntrinsic {
+  Intrinsic::ID Interleave, Deinterleave;
+};
+
+static InterleaveIntrinsic InterleaveIntrinsics[] = {
+    {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
+    {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
+    {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
+    {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
+    {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
+    {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
+    {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
+};
+
+Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) {
+  assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+  return InterleaveIntrinsics[Factor - 2].Interleave;
+}
+
+Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) {
+  assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+  return InterleaveIntrinsics[Factor - 2].Deinterleave;
+}
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index acd37a5ae072..0232ac421aed 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4583,6 +4583,13 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
   if (VecTy->isScalableTy() && !ST->hasSVE())
     return InstructionCost::getInvalid();
 
+  // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
+  // only have lowering for power-of-2 factors.
+  // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
+  // InterleavedAccessPass for ld3/st3
+  if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
+    return InstructionCost::getInvalid();
+
   // Vectorization for masked interleaved accesses is only enabled for scalable
   // VF.
   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 93ab3353a296..474f856d2046 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3166,10 +3166,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   if (hasIrregularType(ScalarTy, DL))
     return false;
 
-  // For scalable vectors, the only interleave factor currently supported
-  // must be power of 2 since we require the (de)interleave2 intrinsics
-  // instead of shufflevectors.
-  if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
+  // For scalable vectors, the interleave factors must be <= 8 since we require
+  // the (de)interleaveN intrinsics instead of shufflevectors.
+  if (VF.isScalable() && InterleaveFactor > 8)
     return false;
 
   // If the group involves a non-integral pointer, we may not be able to
@@ -8718,10 +8717,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
       bool Result = (VF.isVector() && // Query is illegal for VF == 1
                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
                          LoopVectorizationCostModel::CM_Interleave);
-      // For scalable vectors, the only interleave factor currently supported
-      // must be power of 2 since we require the (de)interleave2 intrinsics
-      // instead of shufflevectors.
-      assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
+      // For scalable vectors, the interleave factors must be <= 8 since we
+      // require the (de)interleaveN intrinsics instead of shufflevectors.
+      assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
              "Unsupported interleave factor for scalable vectors");
       return Result;
     };
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f5a2533727b3..8863a3fb4b31 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3296,21 +3296,13 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
-    assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
-                                    "scalable vectors, must be power of 2");
-    SmallVector<Value *> InterleavingValues(Vals);
-    // When interleaving, the number of values will be shrunk until we have the
-    // single final interleaved value.
-    auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
-    for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
-      InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
-      for (unsigned I = 0; I < Midpoint; ++I)
-        InterleavingValues[I] = Builder.CreateIntrinsic(
-            InterleaveTy, Intrinsic::vector_interleave2,
-            {InterleavingValues[I], InterleavingValues[Midpoint + I]},
-            /*FMFSource=*/nullptr, Name);
-    }
-    return InterleavingValues[0];
+    assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
+    VectorType *InterleaveTy =
+        VectorType::get(VecTy->getElementType(),
+                        VecTy->getElementCount().multiplyCoefficientBy(Factor));
+    return Builder.CreateIntrinsic(InterleaveTy,
+                                   getInterleaveIntrinsicID(Factor), Vals,
+                                   /*FMFSource=*/nullptr, Name);
   }
 
   // Fixed length. Start by concatenating all vectors into a wide vector.
@@ -3396,7 +3388,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
     if (State.VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(isPowerOf2_32(InterleaveFactor) &&
+      assert(InterleaveFactor <= 8 &&
              "Unsupported deinterleave factor for scalable vectors");
       auto *ResBlockInMask = State.get(BlockInMask);
       SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
@@ -3440,43 +3432,18 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     ArrayRef<VPValue *> VPDefs = definedValues();
     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
     if (VecTy->isScalableTy()) {
-      assert(isPowerOf2_32(InterleaveFactor) &&
-             "Unsupported deinterleave factor for scalable vectors");
-
       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
       // so must use intrinsics to deinterleave.
-      SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
-      DeinterleavedValues[0] = NewLoad;
-      // For the case of InterleaveFactor > 2, we will have to do recursive
-      // deinterleaving, because the current available deinterleave intrinsic
-      // supports only Factor of 2, otherwise it will bailout after first
-      // iteration.
-      // When deinterleaving, the number of values will double until we
-      // have "InterleaveFactor".
-      for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
-           NumVectors *= 2) {
-        // Deinterleave the elements within the vector
-        SmallVector<Value *> TempDeinterleavedValues(NumVectors);
-        for (unsigned I = 0; I < NumVectors; ++I) {
-          auto *DiTy = DeinterleavedValues[I]->getType();
-          TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
-              Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
-              /*FMFSource=*/nullptr, "strided.vec");
-        }
-        // Extract the deinterleaved values:
-        for (unsigned I = 0; I < 2; ++I)
-          for (unsigned J = 0; J < NumVectors; ++J)
-            DeinterleavedValues[NumVectors * I + J] =
-                State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
-      }
+      assert(InterleaveFactor <= 8 &&
+             "Unsupported deinterleave factor for scalable vectors");
+      Value *Deinterleave = State.Builder.CreateIntrinsic(
+          getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
+          NewLoad,
+          /*FMFSource=*/nullptr, "strided.vec");
 
-#ifndef NDEBUG
-      for (Value *Val : DeinterleavedValues)
-        assert(Val && "NULL Deinterleaved Value");
-#endif
       for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
         Instruction *Member = Group->getMember(I);
-        Value *StridedVec = DeinterleavedValues[I];
+        Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I);
         if (!Member) {
           // This value is not needed as it's not used
           cast<Instruction>(StridedVec)->eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 6861644fc996..77e713256d24 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -375,8 +375,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
@@ -1479,34 +1479,24 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP7]])
-; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <vscale x 16 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 1
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
-; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP15]])
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 0
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 1
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP23]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP24]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC12]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]])
 ; CHECK-NEXT:    store <vscale x 16 x i32> [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1595,18 +1585,14 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP11]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP12]])
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
 ; CHECK-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3
 ; CHECK-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP16]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE3]], [[VEC_IND]]
@@ -1622,9 +1608,7 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
 ; CHECK-NEXT:    [[REVERSE8:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
 ; CHECK-NEXT:    [[REVERSE9:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE8]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE9]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC10]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE8]], <vscale x 4 x i32> [[REVERSE9]])
 ; CHECK-NEXT:    store <vscale x 16 x i32> [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 469faf67a71b..3567aff0ace4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -469,36 +469,26 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -531,37 +521,27 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_TAIL_FOLDING:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index feb3b6d42b65..61a3e3561ad9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -354,32 +354,40 @@ exit:
 define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP7]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
+; CHECK-NEXT:    store <vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -458,32 +466,40 @@ define void @load_store_factor3_i32(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor3_i32(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP1]], align 4
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SCALABLE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP7]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; SCALABLE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[TMP1]], align 4
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
+; SCALABLE-NEXT:    store <vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -541,32 +557,40 @@ exit:
 define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> [[TMP6]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP7]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP8]], splat (i64 1)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
+; CHECK-NEXT:    store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -645,32 +669,40 @@ define void @load_store_factor3_i64(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor3_i64(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> [[TMP6]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP7]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP8]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
+; SCALABLE-NEXT:    store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -745,22 +777,16 @@ define void @load_store_factor4(ptr %p) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP8]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP9]])
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP16]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC3:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP17]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC]], <vscale x 4 x i64> [[INTERLEAVED_VEC3]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
 ; CHECK-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC4]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -877,22 +903,16 @@ define void @load_store_factor4(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = mul i64 [[INDEX]], 4
 ; SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
 ; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP7]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP8]])
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP9]])
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 3
 ; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
 ; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
 ; SCALABLE-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
 ; SCALABLE-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP16]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC3:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP17]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC]], <vscale x 4 x i64> [[INTERLEAVED_VEC3]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
 ; SCALABLE-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC4]], ptr [[TMP7]], align 8
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; SCALABLE-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -970,38 +990,41 @@ exit:
 define void @load_store_factor5(ptr %p) {
 ; CHECK-LABEL: @load_store_factor5(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <10 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 0, i32 5>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 1, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 2, i32 7>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 3, i32 8>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 4, i32 9>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <10 x i64> [[TMP11]], <10 x i64> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
-; CHECK-NEXT:    store <10 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 5 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP10]], <vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]])
+; CHECK-NEXT:    store <vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1106,38 +1129,41 @@ define void @load_store_factor5(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor5(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 5
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <10 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 0, i32 5>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 1, i32 6>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 2, i32 7>
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 3, i32 8>
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 4, i32 9>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <10 x i64> [[TMP11]], <10 x i64> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
-; SCALABLE-NEXT:    store <10 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 5 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT:    [[TMP10:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP10]], <vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]])
+; SCALABLE-NEXT:    store <vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1217,41 +1243,43 @@ exit:
 define void @load_store_factor6(ptr %p) {
 ; CHECK-LABEL: @load_store_factor6(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 0, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 1, i32 7>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 2, i32 8>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 3, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 4, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 5, i32 11>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
-; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]])
+; CHECK-NEXT:    store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1369,41 +1397,43 @@ define void @load_store_factor6(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor6(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 6
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 0, i32 6>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 1, i32 7>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 2, i32 8>
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 3, i32 9>
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 4, i32 10>
-; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 5, i32 11>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; SCALABLE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
-; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]])
+; SCALABLE-NEXT:    store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1494,45 +1524,45 @@ exit:
 define void @load_store_factor7(ptr %p) {
 ; CHECK-LABEL: @load_store_factor7(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 7
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <14 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 0, i32 7>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 1, i32 8>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 2, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 3, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 4, i32 11>
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 5, i32 12>
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 6, i32 13>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <6 x i64> [[TMP14]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP15]], <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <14 x i64> [[TMP16]], <14 x i64> poison, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
-; CHECK-NEXT:    store <14 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 7 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; CHECK-NEXT:    [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 7)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]])
+; CHECK-NEXT:    store <vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1664,45 +1694,45 @@ define void @load_store_factor7(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor7(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 7
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <14 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 0, i32 7>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 1, i32 8>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 2, i32 9>
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 3, i32 10>
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 4, i32 11>
-; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 5, i32 12>
-; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 6, i32 13>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; SCALABLE-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
-; SCALABLE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; SCALABLE-NEXT:    [[TMP15:%.*]] = shufflevector <6 x i64> [[TMP14]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP15]], <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <14 x i64> [[TMP16]], <14 x i64> poison, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
-; SCALABLE-NEXT:    store <14 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 7 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; SCALABLE-NEXT:    [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; SCALABLE-NEXT:    [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 7)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]])
+; SCALABLE-NEXT:    store <vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; SCALABLE-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1818,27 +1848,15 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP5]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP6]])
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP7]])
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP8]])
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP10]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 7
 ; CHECK-NEXT:    [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 3)
@@ -1847,13 +1865,7 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 1 x i64> [[TMP16]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP25:%.*]] = add <vscale x 1 x i64> [[TMP17]], splat (i64 7)
 ; CHECK-NEXT:    [[TMP26:%.*]] = add <vscale x 1 x i64> [[TMP18]], splat (i64 8)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP23]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC7:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP24]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC8:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP25]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC9:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP26]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC]], <vscale x 2 x i64> [[INTERLEAVED_VEC8]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC7]], <vscale x 2 x i64> [[INTERLEAVED_VEC9]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC10]], <vscale x 4 x i64> [[INTERLEAVED_VEC11]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP24]], <vscale x 1 x i64> [[TMP25]], <vscale x 1 x i64> [[TMP26]])
 ; CHECK-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC12]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2019,27 +2031,15 @@ define void @load_store_factor8(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = shl i64 [[INDEX]], 3
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]]
 ; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP4]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP5]])
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP6]])
-; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP7]])
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP8]])
-; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP10]])
-; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 0
-; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 0
-; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 0
-; SCALABLE-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 0
-; SCALABLE-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 1
-; SCALABLE-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 1
-; SCALABLE-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 1
-; SCALABLE-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 1
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; SCALABLE-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; SCALABLE-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 7
 ; SCALABLE-NEXT:    [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 1)
 ; SCALABLE-NEXT:    [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 2)
 ; SCALABLE-NEXT:    [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 3)
@@ -2048,13 +2048,7 @@ define void @load_store_factor8(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP24:%.*]] = add <vscale x 1 x i64> [[TMP16]], splat (i64 6)
 ; SCALABLE-NEXT:    [[TMP25:%.*]] = add <vscale x 1 x i64> [[TMP17]], splat (i64 7)
 ; SCALABLE-NEXT:    [[TMP26:%.*]] = add <vscale x 1 x i64> [[TMP18]], splat (i64 8)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP23]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC7:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP24]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC8:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP25]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC9:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP26]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC]], <vscale x 2 x i64> [[INTERLEAVED_VEC8]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC7]], <vscale x 2 x i64> [[INTERLEAVED_VEC9]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC10]], <vscale x 4 x i64> [[INTERLEAVED_VEC11]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP24]], <vscale x 1 x i64> [[TMP25]], <vscale x 1 x i64> [[TMP26]])
 ; SCALABLE-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC12]], ptr [[TMP4]], align 8
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; SCALABLE-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

From 702b9033c115500a934a6c49c325c112b30fe47f Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Thu, 12 Jun 2025 11:27:30 +0100
Subject: [PATCH 0116/1322] [LLVM][CodeGen][AArch64] Lower
 vector-(de)interleave to multi-register uzp/zip instructions. (#143128)

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  48 ++
 .../AArch64/sve-vector-deinterleave.ll        | 633 ++++++++++++------
 .../CodeGen/AArch64/sve-vector-interleave.ll  | 561 ++++++++++------
 3 files changed, 850 insertions(+), 392 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index af5dfd6c9b8f..ac545534d728 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29451,6 +29451,30 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
 
+  // Are multi-register uzp instructions available?
+  if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
+      OpVT.getVectorElementType() != MVT::i1) {
+    Intrinsic::ID IntID;
+    switch (Op->getNumOperands()) {
+    default:
+      return SDValue();
+    case 2:
+      IntID = Intrinsic::aarch64_sve_uzp_x2;
+      break;
+    case 4:
+      if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
+          OpVT.getScalarSizeInBits() == 64)
+        return SDValue();
+      IntID = Intrinsic::aarch64_sve_uzp_x4;
+      break;
+    }
+
+    SmallVector<SDValue, 5> Ops;
+    Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+    Ops.append(Op->op_values().begin(), Op->op_values().end());
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
+  }
+
   if (Op->getNumOperands() != 2)
     return SDValue();
 
@@ -29468,6 +29492,30 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
 
+  // Are multi-register zip instructions available?
+  if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
+      OpVT.getVectorElementType() != MVT::i1) {
+    Intrinsic::ID IntID;
+    switch (Op->getNumOperands()) {
+    default:
+      return SDValue();
+    case 2:
+      IntID = Intrinsic::aarch64_sve_zip_x2;
+      break;
+    case 4:
+      if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
+          OpVT.getScalarSizeInBits() == 64)
+        return SDValue();
+      IntID = Intrinsic::aarch64_sve_zip_x4;
+      break;
+    }
+
+    SmallVector<SDValue, 5> Ops;
+    Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+    Ops.append(Op->op_values().begin(), Op->op_values().end());
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
+  }
+
   if (Op->getNumOperands() != 2)
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 89fc10b47bb3..139ecafaff0e 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -1,106 +1,166 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s
+; RUN: llc < %s -mattr=+sve | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc < %s -mattr=+sve,+sme2 | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc < %s -mattr=+sme2 -force-streaming | FileCheck %s -check-prefixes=CHECK,SME2,SME2-ALL
+; RUN: llc < %s -mattr=+sme2 -force-streaming -aarch64-sve-vector-bits-min=256 | FileCheck %s -check-prefixes=CHECK,SME2,SME2-256
+
+target triple = "aarch64-unknown-linux-gnu"
 
 define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uunpklo z0.d, z1.s
-; CHECK-NEXT:    uunpklo z1.d, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.s, z0.s, z0.s
+; SVE-NEXT:    uzp2 z2.s, z0.s, z0.s
+; SVE-NEXT:    uunpklo z0.d, z1.s
+; SVE-NEXT:    uunpklo z1.d, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.d, z0.s
+; SME2-NEXT:    uunpklo z0.d, z0.s
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
   ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
 }
 
 define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.h, z0.h, z0.h
-; CHECK-NEXT:    uzp2 z2.h, z0.h, z0.h
-; CHECK-NEXT:    uunpklo z0.s, z1.h
-; CHECK-NEXT:    uunpklo z1.s, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.h, z0.h, z0.h
+; SVE-NEXT:    uzp2 z2.h, z0.h, z0.h
+; SVE-NEXT:    uunpklo z0.s, z1.h
+; SVE-NEXT:    uunpklo z1.s, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.s, z0.h
+; SME2-NEXT:    uunpklo z0.s, z0.h
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
   ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
 }
 
 define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv16f16(<vscale x 16 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv16f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8f16_nxv16f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.h, z0.h, z1.h
+; SVE-NEXT:    uzp2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8f16_nxv16f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
   ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
 }
 
 define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv4f32(<vscale x 4 x float> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uunpklo z0.d, z1.s
-; CHECK-NEXT:    uunpklo z1.d, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.s, z0.s, z0.s
+; SVE-NEXT:    uzp2 z2.s, z0.s, z0.s
+; SVE-NEXT:    uunpklo z0.d, z1.s
+; SVE-NEXT:    uunpklo z1.d, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.d, z0.s
+; SME2-NEXT:    uunpklo z0.d, z0.s
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
   ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
 }
 
 define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv8f32(<vscale x 8 x float> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv8f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.s, z0.s, z1.s
-; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4f32_nxv8f32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.s, z0.s, z1.s
+; SVE-NEXT:    uzp2 z1.s, z0.s, z1.s
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4f32_nxv8f32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
   ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
 }
 
 define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv4f64(<vscale x 4 x double> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2f64_nxv4f64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z1.d, z0.d, z1.d
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2f64_nxv4f64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
   ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
 }
 
 define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uunpklo z0.d, z1.s
-; CHECK-NEXT:    uunpklo z1.d, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.s, z0.s, z0.s
+; SVE-NEXT:    uzp2 z2.s, z0.s, z0.s
+; SVE-NEXT:    uunpklo z0.d, z1.s
+; SVE-NEXT:    uunpklo z1.d, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.d, z0.s
+; SME2-NEXT:    uunpklo z0.d, z0.s
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave2.nxv4bf16(<vscale x 4 x bfloat> %vec)
   ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
 }
 
 define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv8bf16(<vscale x 8 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.h, z0.h, z0.h
-; CHECK-NEXT:    uzp2 z2.h, z0.h, z0.h
-; CHECK-NEXT:    uunpklo z0.s, z1.h
-; CHECK-NEXT:    uunpklo z1.s, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.h, z0.h, z0.h
+; SVE-NEXT:    uzp2 z2.h, z0.h, z0.h
+; SVE-NEXT:    uunpklo z0.s, z1.h
+; SVE-NEXT:    uunpklo z1.s, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.s, z0.h
+; SME2-NEXT:    uunpklo z0.s, z0.h
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave2.nxv8bf16(<vscale x 8 x bfloat> %vec)
   ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %retval
 }
 
 define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv16bf16(<vscale x 16 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv16bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8bf16_nxv16bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.h, z0.h, z1.h
+; SVE-NEXT:    uzp2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8bf16_nxv16bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave2.nxv16bf16(<vscale x 16 x bfloat> %vec)
   ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %retval
 }
@@ -108,141 +168,259 @@ define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8b
 ; Integers
 
 define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv32i8(<vscale x 32 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.b, z0.b, z1.b
-; CHECK-NEXT:    uzp2 z1.b, z0.b, z1.b
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.b, z0.b, z1.b
+; SVE-NEXT:    uzp2 z1.b, z0.b, z1.b
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.b, z1.b }, z0.b, z1.b
+; SME2-NEXT:    ret
   %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
   ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
 }
 
 define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv16i16(<vscale x 16 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.h, z0.h, z1.h
+; SVE-NEXT:    uzp2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
   ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
 }
 
 define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxvv8i32(<vscale x 8 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.s, z0.s, z1.s
-; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.s, z0.s, z1.s
+; SVE-NEXT:    uzp2 z1.s, z0.s, z1.s
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
   ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
 }
 
 define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv4i64(<vscale x 4 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2i64_nxv4i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z1.d, z0.d, z1.d
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2i64_nxv4i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
 define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.b, z2.b, z3.b
-; CHECK-NEXT:    uzp1 z5.b, z0.b, z1.b
-; CHECK-NEXT:    uzp2 z3.b, z2.b, z3.b
-; CHECK-NEXT:    uzp2 z6.b, z0.b, z1.b
-; CHECK-NEXT:    uzp1 z0.b, z5.b, z4.b
-; CHECK-NEXT:    uzp2 z2.b, z5.b, z4.b
-; CHECK-NEXT:    uzp1 z1.b, z6.b, z3.b
-; CHECK-NEXT:    uzp2 z3.b, z6.b, z3.b
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.b, z2.b, z3.b
+; SVE-NEXT:    uzp1 z5.b, z0.b, z1.b
+; SVE-NEXT:    uzp2 z3.b, z2.b, z3.b
+; SVE-NEXT:    uzp2 z6.b, z0.b, z1.b
+; SVE-NEXT:    uzp1 z0.b, z5.b, z4.b
+; SVE-NEXT:    uzp2 z2.b, z5.b, z4.b
+; SVE-NEXT:    uzp1 z1.b, z6.b, z3.b
+; SVE-NEXT:    uzp2 z3.b, z6.b, z3.b
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    uzp { z0.b - z3.b }, { z0.b - z3.b }
+; SME2-NEXT:    ret
   %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %vec)
   ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
 }
 
 define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv32i16(<vscale x 32 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.h, z2.h, z3.h
-; CHECK-NEXT:    uzp1 z5.h, z0.h, z1.h
-; CHECK-NEXT:    uzp2 z3.h, z2.h, z3.h
-; CHECK-NEXT:    uzp2 z6.h, z0.h, z1.h
-; CHECK-NEXT:    uzp1 z0.h, z5.h, z4.h
-; CHECK-NEXT:    uzp2 z2.h, z5.h, z4.h
-; CHECK-NEXT:    uzp1 z1.h, z6.h, z3.h
-; CHECK-NEXT:    uzp2 z3.h, z6.h, z3.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.h, z2.h, z3.h
+; SVE-NEXT:    uzp1 z5.h, z0.h, z1.h
+; SVE-NEXT:    uzp2 z3.h, z2.h, z3.h
+; SVE-NEXT:    uzp2 z6.h, z0.h, z1.h
+; SVE-NEXT:    uzp1 z0.h, z5.h, z4.h
+; SVE-NEXT:    uzp2 z2.h, z5.h, z4.h
+; SVE-NEXT:    uzp1 z1.h, z6.h, z3.h
+; SVE-NEXT:    uzp2 z3.h, z6.h, z3.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    uzp { z0.h - z3.h }, { z0.h - z3.h }
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave4.nxv32i16(<vscale x 32 x i16> %vec)
   ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
 }
 
 define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv16i32(<vscale x 16 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.s, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z5.s, z0.s, z1.s
-; CHECK-NEXT:    uzp2 z3.s, z2.s, z3.s
-; CHECK-NEXT:    uzp2 z6.s, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.s, z5.s, z4.s
-; CHECK-NEXT:    uzp2 z2.s, z5.s, z4.s
-; CHECK-NEXT:    uzp1 z1.s, z6.s, z3.s
-; CHECK-NEXT:    uzp2 z3.s, z6.s, z3.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.s, z2.s, z3.s
+; SVE-NEXT:    uzp1 z5.s, z0.s, z1.s
+; SVE-NEXT:    uzp2 z3.s, z2.s, z3.s
+; SVE-NEXT:    uzp2 z6.s, z0.s, z1.s
+; SVE-NEXT:    uzp1 z0.s, z5.s, z4.s
+; SVE-NEXT:    uzp2 z2.s, z5.s, z4.s
+; SVE-NEXT:    uzp1 z1.s, z6.s, z3.s
+; SVE-NEXT:    uzp2 z3.s, z6.s, z3.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    uzp { z0.s - z3.s }, { z0.s - z3.s }
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %vec)
   ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
 }
 
 define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv8i64(<vscale x 8 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z5.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z3.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z6.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.d, z5.d, z4.d
-; CHECK-NEXT:    uzp2 z2.d, z5.d, z4.d
-; CHECK-NEXT:    uzp1 z1.d, z6.d, z3.d
-; CHECK-NEXT:    uzp2 z3.d, z6.d, z3.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.d, z2.d, z3.d
+; SVE-NEXT:    uzp1 z5.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z3.d, z2.d, z3.d
+; SVE-NEXT:    uzp2 z6.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.d, z5.d, z4.d
+; SVE-NEXT:    uzp2 z2.d, z5.d, z4.d
+; SVE-NEXT:    uzp1 z1.d, z6.d, z3.d
+; SVE-NEXT:    uzp2 z3.d, z6.d, z3.d
+; SVE-NEXT:    ret
+;
+; SME2-ALL-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; SME2-ALL:       // %bb.0:
+; SME2-ALL-NEXT:    uzp { z4.d, z5.d }, z2.d, z3.d
+; SME2-ALL-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-ALL-NEXT:    uzp { z2.d, z3.d }, z0.d, z4.d
+; SME2-ALL-NEXT:    uzp { z4.d, z5.d }, z1.d, z5.d
+; SME2-ALL-NEXT:    mov z0.d, z2.d
+; SME2-ALL-NEXT:    mov z1.d, z4.d
+; SME2-ALL-NEXT:    mov z2.d, z3.d
+; SME2-ALL-NEXT:    mov z3.d, z5.d
+; SME2-ALL-NEXT:    ret
+;
+; SME2-256-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; SME2-256:       // %bb.0:
+; SME2-256-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    uzp { z0.d - z3.d }, { z0.d - z3.d }
+; SME2-256-NEXT:    ret
   %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> %vec)
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
 define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv16i64(<vscale x 16 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z24.d, z6.d, z7.d
-; CHECK-NEXT:    uzp1 z25.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z26.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z27.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z6.d, z6.d, z7.d
-; CHECK-NEXT:    uzp2 z4.d, z4.d, z5.d
-; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z5.d, z25.d, z24.d
-; CHECK-NEXT:    uzp2 z24.d, z25.d, z24.d
-; CHECK-NEXT:    uzp1 z7.d, z27.d, z26.d
-; CHECK-NEXT:    uzp1 z28.d, z4.d, z6.d
-; CHECK-NEXT:    uzp2 z25.d, z27.d, z26.d
-; CHECK-NEXT:    uzp1 z29.d, z0.d, z2.d
-; CHECK-NEXT:    uzp2 z26.d, z4.d, z6.d
-; CHECK-NEXT:    uzp2 z27.d, z0.d, z2.d
-; CHECK-NEXT:    uzp1 z0.d, z7.d, z5.d
-; CHECK-NEXT:    uzp1 z2.d, z25.d, z24.d
-; CHECK-NEXT:    uzp2 z4.d, z7.d, z5.d
-; CHECK-NEXT:    uzp1 z1.d, z29.d, z28.d
-; CHECK-NEXT:    uzp1 z3.d, z27.d, z26.d
-; CHECK-NEXT:    uzp2 z5.d, z29.d, z28.d
-; CHECK-NEXT:    uzp2 z6.d, z25.d, z24.d
-; CHECK-NEXT:    uzp2 z7.d, z27.d, z26.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z24.d, z6.d, z7.d
+; SVE-NEXT:    uzp1 z25.d, z4.d, z5.d
+; SVE-NEXT:    uzp1 z26.d, z2.d, z3.d
+; SVE-NEXT:    uzp1 z27.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z6.d, z6.d, z7.d
+; SVE-NEXT:    uzp2 z4.d, z4.d, z5.d
+; SVE-NEXT:    uzp2 z2.d, z2.d, z3.d
+; SVE-NEXT:    uzp2 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z5.d, z25.d, z24.d
+; SVE-NEXT:    uzp2 z24.d, z25.d, z24.d
+; SVE-NEXT:    uzp1 z7.d, z27.d, z26.d
+; SVE-NEXT:    uzp1 z28.d, z4.d, z6.d
+; SVE-NEXT:    uzp2 z25.d, z27.d, z26.d
+; SVE-NEXT:    uzp1 z29.d, z0.d, z2.d
+; SVE-NEXT:    uzp2 z26.d, z4.d, z6.d
+; SVE-NEXT:    uzp2 z27.d, z0.d, z2.d
+; SVE-NEXT:    uzp1 z0.d, z7.d, z5.d
+; SVE-NEXT:    uzp1 z2.d, z25.d, z24.d
+; SVE-NEXT:    uzp2 z4.d, z7.d, z5.d
+; SVE-NEXT:    uzp1 z1.d, z29.d, z28.d
+; SVE-NEXT:    uzp1 z3.d, z27.d, z26.d
+; SVE-NEXT:    uzp2 z5.d, z29.d, z28.d
+; SVE-NEXT:    uzp2 z6.d, z25.d, z24.d
+; SVE-NEXT:    uzp2 z7.d, z27.d, z26.d
+; SVE-NEXT:    ret
+;
+; SME2-ALL-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
+; SME2-ALL:       // %bb.0:
+; SME2-ALL-NEXT:    uzp { z6.d, z7.d }, z6.d, z7.d
+; SME2-ALL-NEXT:    uzp { z24.d, z25.d }, z4.d, z5.d
+; SME2-ALL-NEXT:    uzp { z26.d, z27.d }, z24.d, z6.d
+; SME2-ALL-NEXT:    uzp { z2.d, z3.d }, z2.d, z3.d
+; SME2-ALL-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-ALL-NEXT:    uzp { z28.d, z29.d }, z0.d, z2.d
+; SME2-ALL-NEXT:    uzp { z4.d, z5.d }, z28.d, z26.d
+; SME2-ALL-NEXT:    uzp { z30.d, z31.d }, z25.d, z7.d
+; SME2-ALL-NEXT:    uzp { z0.d, z1.d }, z1.d, z3.d
+; SME2-ALL-NEXT:    uzp { z6.d, z7.d }, z0.d, z30.d
+; SME2-ALL-NEXT:    uzp { z24.d, z25.d }, z29.d, z27.d
+; SME2-ALL-NEXT:    uzp { z26.d, z27.d }, z1.d, z31.d
+; SME2-ALL-NEXT:    mov z0.d, z4.d
+; SME2-ALL-NEXT:    mov z1.d, z6.d
+; SME2-ALL-NEXT:    mov z2.d, z24.d
+; SME2-ALL-NEXT:    mov z3.d, z26.d
+; SME2-ALL-NEXT:    mov z4.d, z5.d
+; SME2-ALL-NEXT:    mov z5.d, z7.d
+; SME2-ALL-NEXT:    mov z6.d, z25.d
+; SME2-ALL-NEXT:    mov z7.d, z27.d
+; SME2-ALL-NEXT:    ret
+;
+; SME2-256-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
+; SME2-256:       // %bb.0:
+; SME2-256-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; SME2-256-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; SME2-256-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; SME2-256-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; SME2-256-NEXT:    uzp { z28.d - z31.d }, { z4.d - z7.d }
+; SME2-256-NEXT:    uzp { z0.d - z3.d }, { z0.d - z3.d }
+; SME2-256-NEXT:    uzp { z4.d, z5.d }, z0.d, z28.d
+; SME2-256-NEXT:    uzp { z6.d, z7.d }, z1.d, z29.d
+; SME2-256-NEXT:    uzp { z24.d, z25.d }, z2.d, z30.d
+; SME2-256-NEXT:    uzp { z26.d, z27.d }, z3.d, z31.d
+; SME2-256-NEXT:    mov z0.d, z4.d
+; SME2-256-NEXT:    mov z1.d, z6.d
+; SME2-256-NEXT:    mov z2.d, z24.d
+; SME2-256-NEXT:    mov z3.d, z26.d
+; SME2-256-NEXT:    mov z4.d, z5.d
+; SME2-256-NEXT:    mov z5.d, z7.d
+; SME2-256-NEXT:    mov z6.d, z25.d
+; SME2-256-NEXT:    mov z7.d, z27.d
+; SME2-256-NEXT:    ret
   %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave8.nxv16i64(<vscale x 16 x i64> %vec)
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
@@ -299,39 +477,65 @@ define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1
 ; Split illegal types
 
 define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv8i64(<vscale x 8 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i64_nxv8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z5.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z6.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z3.d, z2.d, z3.d
-; CHECK-NEXT:    mov z0.d, z5.d
-; CHECK-NEXT:    mov z1.d, z4.d
-; CHECK-NEXT:    mov z2.d, z6.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4i64_nxv8i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.d, z2.d, z3.d
+; SVE-NEXT:    uzp1 z5.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z6.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z3.d, z2.d, z3.d
+; SVE-NEXT:    mov z0.d, z5.d
+; SVE-NEXT:    mov z1.d, z4.d
+; SVE-NEXT:    mov z2.d, z6.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4i64_nxv8i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z4.d, z5.d }, z0.d, z1.d
+; SME2-NEXT:    uzp { z6.d, z7.d }, z2.d, z3.d
+; SME2-NEXT:    mov z0.d, z4.d
+; SME2-NEXT:    mov z1.d, z6.d
+; SME2-NEXT:    mov z2.d, z5.d
+; SME2-NEXT:    mov z3.d, z7.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
   ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
 }
 
 define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv16i64(<vscale x 16 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i64_nxv16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z24.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z26.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z27.d, z6.d, z7.d
-; CHECK-NEXT:    uzp2 z28.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z29.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z30.d, z4.d, z5.d
-; CHECK-NEXT:    uzp2 z7.d, z6.d, z7.d
-; CHECK-NEXT:    mov z0.d, z25.d
-; CHECK-NEXT:    mov z1.d, z24.d
-; CHECK-NEXT:    mov z2.d, z26.d
-; CHECK-NEXT:    mov z3.d, z27.d
-; CHECK-NEXT:    mov z4.d, z28.d
-; CHECK-NEXT:    mov z5.d, z29.d
-; CHECK-NEXT:    mov z6.d, z30.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8i64_nxv16i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z24.d, z2.d, z3.d
+; SVE-NEXT:    uzp1 z25.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z26.d, z4.d, z5.d
+; SVE-NEXT:    uzp1 z27.d, z6.d, z7.d
+; SVE-NEXT:    uzp2 z28.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z29.d, z2.d, z3.d
+; SVE-NEXT:    uzp2 z30.d, z4.d, z5.d
+; SVE-NEXT:    uzp2 z7.d, z6.d, z7.d
+; SVE-NEXT:    mov z0.d, z25.d
+; SVE-NEXT:    mov z1.d, z24.d
+; SVE-NEXT:    mov z2.d, z26.d
+; SVE-NEXT:    mov z3.d, z27.d
+; SVE-NEXT:    mov z4.d, z28.d
+; SVE-NEXT:    mov z5.d, z29.d
+; SVE-NEXT:    mov z6.d, z30.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8i64_nxv16i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z24.d, z25.d }, z0.d, z1.d
+; SME2-NEXT:    uzp { z26.d, z27.d }, z2.d, z3.d
+; SME2-NEXT:    uzp { z28.d, z29.d }, z4.d, z5.d
+; SME2-NEXT:    uzp { z30.d, z31.d }, z6.d, z7.d
+; SME2-NEXT:    mov z0.d, z24.d
+; SME2-NEXT:    mov z1.d, z26.d
+; SME2-NEXT:    mov z2.d, z28.d
+; SME2-NEXT:    mov z3.d, z30.d
+; SME2-NEXT:    mov z4.d, z25.d
+; SME2-NEXT:    mov z5.d, z27.d
+; SME2-NEXT:    mov z6.d, z29.d
+; SME2-NEXT:    mov z7.d, z31.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
   ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
 }
@@ -340,37 +544,58 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
 ; Promote illegal type size
 
 define {<vscale x 8 x i8>, <vscale x 8 x i8>} @vector_deinterleave_nxv8i8_nxv16i8(<vscale x 16 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i8_nxv16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.h, z0.b
-; CHECK-NEXT:    uunpklo z2.h, z0.b
-; CHECK-NEXT:    uzp1 z0.h, z2.h, z1.h
-; CHECK-NEXT:    uzp2 z1.h, z2.h, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8i8_nxv16i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z1.h, z0.b
+; SVE-NEXT:    uunpklo z2.h, z0.b
+; SVE-NEXT:    uzp1 z0.h, z2.h, z1.h
+; SVE-NEXT:    uzp2 z1.h, z2.h, z1.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8i8_nxv16i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.h, z0.b
+; SME2-NEXT:    uunpklo z0.h, z0.b
+; SME2-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %vec)
   ret {<vscale x 8 x i8>, <vscale x 8 x i8>} %retval
 }
 
 define {<vscale x 4 x i16>, <vscale x 4 x i16>} @vector_deinterleave_nxv4i16_nxv8i16(<vscale x 8 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i16_nxv8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.s, z0.h
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
-; CHECK-NEXT:    uzp2 z1.s, z2.s, z1.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4i16_nxv8i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z1.s, z0.h
+; SVE-NEXT:    uunpklo z2.s, z0.h
+; SVE-NEXT:    uzp1 z0.s, z2.s, z1.s
+; SVE-NEXT:    uzp2 z1.s, z2.s, z1.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4i16_nxv8i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.s, z0.h
+; SME2-NEXT:    uunpklo z0.s, z0.h
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %vec)
   ret {<vscale x 4 x i16>, <vscale x 4 x i16>} %retval
 }
 
 define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv4i32(<vscale x 4 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2i32_nxv4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.d, z0.s
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2i32_nxv4i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z1.d, z0.s
+; SVE-NEXT:    uunpklo z2.d, z0.s
+; SVE-NEXT:    uzp1 z0.d, z2.d, z1.d
+; SVE-NEXT:    uzp2 z1.d, z2.d, z1.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2i32_nxv4i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.d, z0.s
+; SME2-NEXT:    uunpklo z0.d, z0.s
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
   ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index 34d026f43708..52cb2d9ebe34 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -1,101 +1,156 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+; RUN: llc < %s -mattr=+sve | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc < %s -mattr=+sve,+sme2 | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc < %s -mattr=+sme2 -force-streaming | FileCheck %s -check-prefixes=CHECK,SME2,SME-ALL
+; RUN: llc < %s -mattr=+sme2 -force-streaming -aarch64-sve-vector-bits-min=256 | FileCheck %s -check-prefixes=CHECK,SME2,SME2-256
+
+target triple = "aarch64-unknown-linux-gnu"
 
 define <vscale x 4 x half> @interleave2_nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1) {
-; CHECK-LABEL: interleave2_nxv4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip1 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.s, z0.s, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
   ret <vscale x 4 x half> %retval
 }
 
 define <vscale x 8 x half> @interleave2_nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1) {
-; CHECK-LABEL: interleave2_nxv8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
   ret <vscale x 8 x half> %retval
 }
 
 define <vscale x 16 x half> @interleave2_nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1) {
-; CHECK-LABEL: interleave2_nxv16f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv16f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.h, z0.h, z1.h
+; SVE-NEXT:    zip2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv16f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
   ret <vscale x 16 x half> %retval
 }
 
 define <vscale x 4 x float> @interleave2_nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1) {
-; CHECK-LABEL: interleave2_nxv4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4f32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip1 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.s, z0.s, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4f32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
   ret <vscale x 4 x float> %retval
 }
 
 define <vscale x 8 x float> @interleave2_nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) {
-; CHECK-LABEL: interleave2_nxv8f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8f32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip2 z1.s, z0.s, z1.s
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8f32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
   ret <vscale x 8 x float> %retval
 }
 
 define <vscale x 4 x double> @interleave2_nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1) {
-; CHECK-LABEL: interleave2_nxv4f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4f64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip2 z1.d, z0.d, z1.d
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4f64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x double>@llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
   ret <vscale x 4 x double> %retval
 }
 
 define <vscale x 4 x bfloat> @interleave2_nxv4bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1) {
-; CHECK-LABEL: interleave2_nxv4bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip1 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.s, z0.s, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x bfloat> @llvm.vector.interleave2.nxv4bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1)
   ret <vscale x 4 x bfloat> %retval
 }
 
 define <vscale x 8 x bfloat> @interleave2_nxv8bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1) {
-; CHECK-LABEL: interleave2_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x bfloat> @llvm.vector.interleave2.nxv8bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1)
   ret <vscale x 8 x bfloat> %retval
 }
 
 define <vscale x 16 x bfloat> @interleave2_nxv16bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1) {
-; CHECK-LABEL: interleave2_nxv16bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv16bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.h, z0.h, z1.h
+; SVE-NEXT:    zip2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv16bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x bfloat> @llvm.vector.interleave2.nxv16bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1)
   ret <vscale x 16 x bfloat> %retval
 }
@@ -103,141 +158,237 @@ define <vscale x 16 x bfloat> @interleave2_nxv16bf16(<vscale x 8 x bfloat> %vec0
 ; Integers
 
 define <vscale x 32 x i8> @interleave2_nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1) {
-; CHECK-LABEL: interleave2_nxv32i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.b, z0.b, z1.b
-; CHECK-NEXT:    zip2 z1.b, z0.b, z1.b
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv32i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.b, z0.b, z1.b
+; SVE-NEXT:    zip2 z1.b, z0.b, z1.b
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv32i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.b, z1.b }, z0.b, z1.b
+; SME2-NEXT:    ret
   %retval = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
   ret <vscale x 32 x i8> %retval
 }
 
 define <vscale x 16 x i16> @interleave2_nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1) {
-; CHECK-LABEL: interleave2_nxv16i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv16i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.h, z0.h, z1.h
+; SVE-NEXT:    zip2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv16i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
   ret <vscale x 16 x i16> %retval
 }
 
 define <vscale x 8 x i32> @interleave2_nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1) {
-; CHECK-LABEL: interleave2_nxv8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip2 z1.s, z0.s, z1.s
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
   ret <vscale x 8 x i32> %retval
 }
 
 define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1) {
-; CHECK-LABEL: interleave2_nxv4i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip2 z1.d, z0.d, z1.d
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
   ret <vscale x 4 x i64> %retval
 }
 
 define <vscale x 64 x i8> @interleave4_nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3) {
-; CHECK-LABEL: interleave4_nxv16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.b, z1.b, z3.b
-; CHECK-NEXT:    zip1 z5.b, z0.b, z2.b
-; CHECK-NEXT:    zip2 z3.b, z1.b, z3.b
-; CHECK-NEXT:    zip2 z6.b, z0.b, z2.b
-; CHECK-NEXT:    zip1 z0.b, z5.b, z4.b
-; CHECK-NEXT:    zip2 z1.b, z5.b, z4.b
-; CHECK-NEXT:    zip1 z2.b, z6.b, z3.b
-; CHECK-NEXT:    zip2 z3.b, z6.b, z3.b
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave4_nxv16i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.b, z1.b, z3.b
+; SVE-NEXT:    zip1 z5.b, z0.b, z2.b
+; SVE-NEXT:    zip2 z3.b, z1.b, z3.b
+; SVE-NEXT:    zip2 z6.b, z0.b, z2.b
+; SVE-NEXT:    zip1 z0.b, z5.b, z4.b
+; SVE-NEXT:    zip2 z1.b, z5.b, z4.b
+; SVE-NEXT:    zip1 z2.b, z6.b, z3.b
+; SVE-NEXT:    zip2 z3.b, z6.b, z3.b
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave4_nxv16i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    zip { z0.b - z3.b }, { z0.b - z3.b }
+; SME2-NEXT:    ret
   %retval = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3)
   ret <vscale x 64 x i8> %retval
 }
 
 define <vscale x 32 x i16> @interleave4_nxv8i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2, <vscale x 8 x i16> %vec3) {
-; CHECK-LABEL: interleave4_nxv8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.h, z1.h, z3.h
-; CHECK-NEXT:    zip1 z5.h, z0.h, z2.h
-; CHECK-NEXT:    zip2 z3.h, z1.h, z3.h
-; CHECK-NEXT:    zip2 z6.h, z0.h, z2.h
-; CHECK-NEXT:    zip1 z0.h, z5.h, z4.h
-; CHECK-NEXT:    zip2 z1.h, z5.h, z4.h
-; CHECK-NEXT:    zip1 z2.h, z6.h, z3.h
-; CHECK-NEXT:    zip2 z3.h, z6.h, z3.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave4_nxv8i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.h, z1.h, z3.h
+; SVE-NEXT:    zip1 z5.h, z0.h, z2.h
+; SVE-NEXT:    zip2 z3.h, z1.h, z3.h
+; SVE-NEXT:    zip2 z6.h, z0.h, z2.h
+; SVE-NEXT:    zip1 z0.h, z5.h, z4.h
+; SVE-NEXT:    zip2 z1.h, z5.h, z4.h
+; SVE-NEXT:    zip1 z2.h, z6.h, z3.h
+; SVE-NEXT:    zip2 z3.h, z6.h, z3.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave4_nxv8i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    zip { z0.h - z3.h }, { z0.h - z3.h }
+; SME2-NEXT:    ret
   %retval = call <vscale x 32 x i16> @llvm.vector.interleave4.nxv8i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2, <vscale x 8 x i16> %vec3)
   ret <vscale x 32 x i16> %retval
 }
 
 define <vscale x 16 x i32> @interleave4_nxv4i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, <vscale x 4 x i32> %vec3) {
-; CHECK-LABEL: interleave4_nxv4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.s, z1.s, z3.s
-; CHECK-NEXT:    zip1 z5.s, z0.s, z2.s
-; CHECK-NEXT:    zip2 z3.s, z1.s, z3.s
-; CHECK-NEXT:    zip2 z6.s, z0.s, z2.s
-; CHECK-NEXT:    zip1 z0.s, z5.s, z4.s
-; CHECK-NEXT:    zip2 z1.s, z5.s, z4.s
-; CHECK-NEXT:    zip1 z2.s, z6.s, z3.s
-; CHECK-NEXT:    zip2 z3.s, z6.s, z3.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave4_nxv4i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.s, z1.s, z3.s
+; SVE-NEXT:    zip1 z5.s, z0.s, z2.s
+; SVE-NEXT:    zip2 z3.s, z1.s, z3.s
+; SVE-NEXT:    zip2 z6.s, z0.s, z2.s
+; SVE-NEXT:    zip1 z0.s, z5.s, z4.s
+; SVE-NEXT:    zip2 z1.s, z5.s, z4.s
+; SVE-NEXT:    zip1 z2.s, z6.s, z3.s
+; SVE-NEXT:    zip2 z3.s, z6.s, z3.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave4_nxv4i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    zip { z0.s - z3.s }, { z0.s - z3.s }
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv4i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, <vscale x 4 x i32> %vec3)
   ret <vscale x 16 x i32> %retval
 }
 
 define <vscale x 8 x i64> @interleave4_nxv8i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3) {
-; CHECK-LABEL: interleave4_nxv8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
-; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z3.d, z1.d, z3.d
-; CHECK-NEXT:    zip2 z6.d, z0.d, z2.d
-; CHECK-NEXT:    zip1 z0.d, z5.d, z4.d
-; CHECK-NEXT:    zip2 z1.d, z5.d, z4.d
-; CHECK-NEXT:    zip1 z2.d, z6.d, z3.d
-; CHECK-NEXT:    zip2 z3.d, z6.d, z3.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave4_nxv8i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.d, z1.d, z3.d
+; SVE-NEXT:    zip1 z5.d, z0.d, z2.d
+; SVE-NEXT:    zip2 z3.d, z1.d, z3.d
+; SVE-NEXT:    zip2 z6.d, z0.d, z2.d
+; SVE-NEXT:    zip1 z0.d, z5.d, z4.d
+; SVE-NEXT:    zip2 z1.d, z5.d, z4.d
+; SVE-NEXT:    zip1 z2.d, z6.d, z3.d
+; SVE-NEXT:    zip2 z3.d, z6.d, z3.d
+; SVE-NEXT:    ret
+;
+; SME-ALL-LABEL: interleave4_nxv8i64:
+; SME-ALL:       // %bb.0:
+; SME-ALL-NEXT:    zip { z4.d, z5.d }, z1.d, z3.d
+; SME-ALL-NEXT:    zip { z2.d, z3.d }, z0.d, z2.d
+; SME-ALL-NEXT:    zip { z0.d, z1.d }, z2.d, z4.d
+; SME-ALL-NEXT:    zip { z2.d, z3.d }, z3.d, z5.d
+; SME-ALL-NEXT:    ret
+;
+; SME2-256-LABEL: interleave4_nxv8i64:
+; SME2-256:       // %bb.0:
+; SME2-256-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    zip { z0.d - z3.d }, { z0.d - z3.d }
+; SME2-256-NEXT:    ret
   %retval = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3)
   ret <vscale x 8 x i64> %retval
 }
 
 define <vscale x 16 x i64> @interleave8_nxv16i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3, <vscale x 2 x i64> %vec4, <vscale x 2 x i64> %vec5, <vscale x 2 x i64> %vec6, <vscale x 2 x i64> %vec7) {
-; CHECK-LABEL: interleave8_nxv16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z24.d, z3.d, z7.d
-; CHECK-NEXT:    zip1 z25.d, z1.d, z5.d
-; CHECK-NEXT:    zip1 z26.d, z2.d, z6.d
-; CHECK-NEXT:    zip1 z27.d, z0.d, z4.d
-; CHECK-NEXT:    zip2 z3.d, z3.d, z7.d
-; CHECK-NEXT:    zip2 z1.d, z1.d, z5.d
-; CHECK-NEXT:    zip2 z2.d, z2.d, z6.d
-; CHECK-NEXT:    zip2 z0.d, z0.d, z4.d
-; CHECK-NEXT:    zip1 z4.d, z25.d, z24.d
-; CHECK-NEXT:    zip2 z6.d, z25.d, z24.d
-; CHECK-NEXT:    zip1 z5.d, z27.d, z26.d
-; CHECK-NEXT:    zip2 z7.d, z27.d, z26.d
-; CHECK-NEXT:    zip1 z24.d, z1.d, z3.d
-; CHECK-NEXT:    zip1 z25.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z26.d, z1.d, z3.d
-; CHECK-NEXT:    zip2 z27.d, z0.d, z2.d
-; CHECK-NEXT:    zip1 z0.d, z5.d, z4.d
-; CHECK-NEXT:    zip2 z1.d, z5.d, z4.d
-; CHECK-NEXT:    zip1 z2.d, z7.d, z6.d
-; CHECK-NEXT:    zip2 z3.d, z7.d, z6.d
-; CHECK-NEXT:    zip1 z4.d, z25.d, z24.d
-; CHECK-NEXT:    zip2 z5.d, z25.d, z24.d
-; CHECK-NEXT:    zip1 z6.d, z27.d, z26.d
-; CHECK-NEXT:    zip2 z7.d, z27.d, z26.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave8_nxv16i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z24.d, z3.d, z7.d
+; SVE-NEXT:    zip1 z25.d, z1.d, z5.d
+; SVE-NEXT:    zip1 z26.d, z2.d, z6.d
+; SVE-NEXT:    zip1 z27.d, z0.d, z4.d
+; SVE-NEXT:    zip2 z3.d, z3.d, z7.d
+; SVE-NEXT:    zip2 z1.d, z1.d, z5.d
+; SVE-NEXT:    zip2 z2.d, z2.d, z6.d
+; SVE-NEXT:    zip2 z0.d, z0.d, z4.d
+; SVE-NEXT:    zip1 z4.d, z25.d, z24.d
+; SVE-NEXT:    zip2 z6.d, z25.d, z24.d
+; SVE-NEXT:    zip1 z5.d, z27.d, z26.d
+; SVE-NEXT:    zip2 z7.d, z27.d, z26.d
+; SVE-NEXT:    zip1 z24.d, z1.d, z3.d
+; SVE-NEXT:    zip1 z25.d, z0.d, z2.d
+; SVE-NEXT:    zip2 z26.d, z1.d, z3.d
+; SVE-NEXT:    zip2 z27.d, z0.d, z2.d
+; SVE-NEXT:    zip1 z0.d, z5.d, z4.d
+; SVE-NEXT:    zip2 z1.d, z5.d, z4.d
+; SVE-NEXT:    zip1 z2.d, z7.d, z6.d
+; SVE-NEXT:    zip2 z3.d, z7.d, z6.d
+; SVE-NEXT:    zip1 z4.d, z25.d, z24.d
+; SVE-NEXT:    zip2 z5.d, z25.d, z24.d
+; SVE-NEXT:    zip1 z6.d, z27.d, z26.d
+; SVE-NEXT:    zip2 z7.d, z27.d, z26.d
+; SVE-NEXT:    ret
+;
+; SME-ALL-LABEL: interleave8_nxv16i64:
+; SME-ALL:       // %bb.0:
+; SME-ALL-NEXT:    zip { z24.d, z25.d }, z3.d, z7.d
+; SME-ALL-NEXT:    zip { z26.d, z27.d }, z1.d, z5.d
+; SME-ALL-NEXT:    zip { z28.d, z29.d }, z26.d, z24.d
+; SME-ALL-NEXT:    zip { z6.d, z7.d }, z2.d, z6.d
+; SME-ALL-NEXT:    zip { z4.d, z5.d }, z0.d, z4.d
+; SME-ALL-NEXT:    zip { z2.d, z3.d }, z4.d, z6.d
+; SME-ALL-NEXT:    zip { z0.d, z1.d }, z2.d, z28.d
+; SME-ALL-NEXT:    zip { z2.d, z3.d }, z3.d, z29.d
+; SME-ALL-NEXT:    zip { z24.d, z25.d }, z27.d, z25.d
+; SME-ALL-NEXT:    zip { z6.d, z7.d }, z5.d, z7.d
+; SME-ALL-NEXT:    zip { z4.d, z5.d }, z6.d, z24.d
+; SME-ALL-NEXT:    zip { z6.d, z7.d }, z7.d, z25.d
+; SME-ALL-NEXT:    ret
+;
+; SME2-256-LABEL: interleave8_nxv16i64:
+; SME2-256:       // %bb.0:
+; SME2-256-NEXT:    zip { z26.d, z27.d }, z3.d, z7.d
+; SME2-256-NEXT:    zip { z6.d, z7.d }, z2.d, z6.d
+; SME2-256-NEXT:    zip { z24.d, z25.d }, z1.d, z5.d
+; SME2-256-NEXT:    zip { z0.d, z1.d }, z0.d, z4.d
+; SME2-256-NEXT:    mov z28.d, z0.d
+; SME2-256-NEXT:    mov z29.d, z24.d
+; SME2-256-NEXT:    mov z30.d, z6.d
+; SME2-256-NEXT:    mov z31.d, z26.d
+; SME2-256-NEXT:    mov z24.d, z1.d
+; SME2-256-NEXT:    mov z26.d, z7.d
+; SME2-256-NEXT:    zip { z0.d - z3.d }, { z28.d - z31.d }
+; SME2-256-NEXT:    zip { z4.d - z7.d }, { z24.d - z27.d }
+; SME2-256-NEXT:    ret
   %retval = call <vscale x 16 x i64> @llvm.vector.interleave8.nxv16i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3, <vscale x 2 x i64> %vec4, <vscale x 2 x i64> %vec5, <vscale x 2 x i64> %vec6, <vscale x 2 x i64> %vec7)
   ret <vscale x 16 x i64> %retval
 }
@@ -291,31 +442,47 @@ define <vscale x 4 x i1> @interleave2_nxv4i1(<vscale x 2 x i1> %vec0, <vscale x
 ; Split illegal type size
 
 define <vscale x 16 x i32> @interleave2_nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1) {
-; CHECK-LABEL: interleave2_nxv16i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.s, z1.s, z3.s
-; CHECK-NEXT:    zip1 z5.s, z0.s, z2.s
-; CHECK-NEXT:    zip2 z2.s, z0.s, z2.s
-; CHECK-NEXT:    zip2 z3.s, z1.s, z3.s
-; CHECK-NEXT:    mov z0.d, z5.d
-; CHECK-NEXT:    mov z1.d, z2.d
-; CHECK-NEXT:    mov z2.d, z4.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv16i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.s, z1.s, z3.s
+; SVE-NEXT:    zip1 z5.s, z0.s, z2.s
+; SVE-NEXT:    zip2 z2.s, z0.s, z2.s
+; SVE-NEXT:    zip2 z3.s, z1.s, z3.s
+; SVE-NEXT:    mov z0.d, z5.d
+; SVE-NEXT:    mov z1.d, z2.d
+; SVE-NEXT:    mov z2.d, z4.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv16i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z4.s, z5.s }, z0.s, z2.s
+; SME2-NEXT:    zip { z2.s, z3.s }, z1.s, z3.s
+; SME2-NEXT:    mov z0.d, z4.d
+; SME2-NEXT:    mov z1.d, z5.d
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x i32>@llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
   ret <vscale x 16 x i32> %retval
 }
 
 define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1) {
-; CHECK-LABEL: interleave2_nxv8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
-; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z2.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z3.d, z1.d, z3.d
-; CHECK-NEXT:    mov z0.d, z5.d
-; CHECK-NEXT:    mov z1.d, z2.d
-; CHECK-NEXT:    mov z2.d, z4.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.d, z1.d, z3.d
+; SVE-NEXT:    zip1 z5.d, z0.d, z2.d
+; SVE-NEXT:    zip2 z2.d, z0.d, z2.d
+; SVE-NEXT:    zip2 z3.d, z1.d, z3.d
+; SVE-NEXT:    mov z0.d, z5.d
+; SVE-NEXT:    mov z1.d, z2.d
+; SVE-NEXT:    mov z2.d, z4.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z4.d, z5.d }, z0.d, z2.d
+; SME2-NEXT:    zip { z2.d, z3.d }, z1.d, z3.d
+; SME2-NEXT:    mov z0.d, z4.d
+; SME2-NEXT:    mov z1.d, z5.d
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
   ret <vscale x 8 x i64> %retval
 }
@@ -323,34 +490,52 @@ define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale
 ; Promote illegal type size
 
 define <vscale x 16 x i8> @interleave2_nxv8i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1) {
-; CHECK-LABEL: interleave2_nxv8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.h, z0.h, z1.h
-; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z2.b
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.h, z0.h, z1.h
+; SVE-NEXT:    zip1 z0.h, z0.h, z1.h
+; SVE-NEXT:    uzp1 z0.b, z0.b, z2.b
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    uzp1 z0.b, z0.b, z1.b
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1)
   ret <vscale x 16 x i8> %retval
 }
 
 define <vscale x 8 x i16> @interleave2_nxv4i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1) {
-; CHECK-LABEL: interleave2_nxv4i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1)
   ret <vscale x 8 x i16> %retval
 }
 
 define <vscale x 4 x i32> @interleave2_nxv2i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1) {
-; CHECK-LABEL: interleave2_nxv2i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv2i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip1 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.s, z0.s, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv2i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1)
   ret <vscale x 4 x i32> %retval
 }

From d517f15e09e49e172387cb6deb76e4ee2d45d0e4 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 12:30:35 +0200
Subject: [PATCH 0117/1322] [LICM] Regenerate test checks (NFC)

---
 llvm/test/Transforms/LICM/call-hoisting.ll | 247 ++++++++++++++++-----
 1 file changed, 191 insertions(+), 56 deletions(-)

diff --git a/llvm/test/Transforms/LICM/call-hoisting.ll b/llvm/test/Transforms/LICM/call-hoisting.ll
index e6d2e42e34e8..907f13438623 100644
--- a/llvm/test/Transforms/LICM/call-hoisting.ll
+++ b/llvm/test/Transforms/LICM/call-hoisting.ll
@@ -1,13 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=licm %s | FileCheck %s
 ; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<target-ir>,require<scalar-evolution>,require<opt-remark-emit>,loop-mssa(licm)' < %s -S | FileCheck %s
 
 declare i32 @load(ptr %p) argmemonly readonly nounwind
 
 define void @test_load(ptr noalias %loc, ptr noalias %sink) {
-; CHECK-LABEL: @test_load
-; CHECK-LABEL: entry:
-; CHECK: call i32 @load
-; CHECK-LABEL: loop:
+; CHECK-LABEL: define void @test_load(
+; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[SINK:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[RET:%.*]] = call i32 @load(ptr [[LOC]])
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    store volatile i32 [[RET]], ptr [[SINK]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -30,15 +41,26 @@ declare i32 @spec(ptr %p, ptr %q) readonly argmemonly nounwind speculatable
 ; However, we need not strip the nonnull attribute since it just propagates
 ; poison if the parameter was indeed null.
 define void @test_strip_attribute(ptr noalias %loc, ptr noalias %sink, ptr %q) {
-; CHECK-LABEL: @test_strip_attribute(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[RET:%.*]] = call i32 @load(ptr [[LOC:%.*]])
-; CHECK-NEXT:    [[NULLCHK:%.*]] = icmp eq ptr [[Q:%.*]], null
+; CHECK-LABEL: define void @test_strip_attribute(
+; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[SINK:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[RET:%.*]] = call i32 @load(ptr [[LOC]])
+; CHECK-NEXT:    [[NULLCHK:%.*]] = icmp eq ptr [[Q]], null
 ; CHECK-NEXT:    [[RET2:%.*]] = call i32 @spec(ptr nonnull [[Q]], ptr [[LOC]])
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[ISNULL:%.*]] ]
-; CHECK-NEXT:    br i1 [[NULLCHK]], label [[ISNULL]], label [[NONNULLBB:%.*]]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[ISNULL:.*]] ]
+; CHECK-NEXT:    br i1 [[NULLCHK]], label %[[ISNULL]], label %[[NONNULLBB:.*]]
+; CHECK:       [[NONNULLBB]]:
+; CHECK-NEXT:    br label %[[ISNULL]]
+; CHECK:       [[ISNULL]]:
+; CHECK-NEXT:    store volatile i32 [[RET]], ptr [[SINK]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -65,10 +87,19 @@ exit:
 declare void @store(i32 %val, ptr %p) argmemonly writeonly nounwind
 
 define void @test(ptr %loc) {
-; CHECK-LABEL: @test
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -84,10 +115,23 @@ exit:
 }
 
 define void @test_multiexit(ptr %loc, i1 %earlycnd) {
-; CHECK-LABEL: @test_multiexit
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: backedge:
+; CHECK-LABEL: define void @test_multiexit(
+; CHECK-SAME: ptr [[LOC:%.*]], i1 [[EARLYCND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[EARLYCND]], label %[[EXIT1:.*]], label %[[BACKEDGE]]
+; CHECK:       [[BACKEDGE]]:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT2:.*]]
+; CHECK:       [[EXIT1]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[EXIT2]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -108,10 +152,19 @@ exit2:
 }
 
 define void @neg_lv_value(ptr %loc) {
-; CHECK-LABEL: @neg_lv_value
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_lv_value(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 [[IV]], ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -127,10 +180,20 @@ exit:
 }
 
 define void @neg_lv_addr(ptr %loc) {
-; CHECK-LABEL: @neg_lv_addr
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_lv_addr(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[P:%.*]] = getelementptr i32, ptr [[LOC]], i32 [[IV]]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[P]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -147,10 +210,20 @@ exit:
 }
 
 define void @neg_mod(ptr %loc) {
-; CHECK-LABEL: @neg_mod
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_mod(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    store i32 [[IV]], ptr [[LOC]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -167,10 +240,25 @@ exit:
 }
 
 define void @neg_ref(ptr %loc) {
-; CHECK-LABEL: @neg_ref
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit1:
+; CHECK-LABEL: define void @neg_ref(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[LOC]], align 4
+; CHECK-NEXT:    [[EARLYCND:%.*]] = icmp eq i32 [[V]], 198
+; CHECK-NEXT:    br i1 [[EARLYCND]], label %[[EXIT1:.*]], label %[[BACKEDGE]]
+; CHECK:       [[BACKEDGE]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT2:.*]]
+; CHECK:       [[EXIT1]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[EXIT2]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -195,10 +283,20 @@ exit2:
 declare void @modref()
 
 define void @neg_modref(ptr %loc) {
-; CHECK-LABEL: @neg_modref
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_modref(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    call void @modref()
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -215,10 +313,20 @@ exit:
 }
 
 define void @neg_fence(ptr %loc) {
-; CHECK-LABEL: @neg_fence
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_fence(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    fence seq_cst
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -239,10 +347,19 @@ declare void @not_argmemonly(i32 %v, ptr %p) writeonly nounwind
 declare void @not_writeonly(i32 %v, ptr %p) argmemonly nounwind
 
 define void @neg_not_nounwind(ptr %loc) {
-; CHECK-LABEL: @neg_not_nounwind
-; CHECK-LABEL: loop:
-; CHECK: call void @not_nounwind
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_not_nounwind(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @not_nounwind(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -258,10 +375,19 @@ exit:
 }
 
 define void @neg_not_argmemonly(ptr %loc) {
-; CHECK-LABEL: @neg_not_argmemonly
-; CHECK-LABEL: loop:
-; CHECK: call void @not_argmemonly
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_not_argmemonly(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @not_argmemonly(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -277,10 +403,19 @@ exit:
 }
 
 define void @neg_not_writeonly(ptr %loc) {
-; CHECK-LABEL: @neg_not_writeonly
-; CHECK-LABEL: loop:
-; CHECK: call void @not_writeonly
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_not_writeonly(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @not_writeonly(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 

From 971c49fbf361c22ccf20913f61a58c28b26c4e27 Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil2001@gmail.com>
Date: Thu, 12 Jun 2025 16:01:43 +0530
Subject: [PATCH 0118/1322] [InstCombine] Ensure Safe Handling of Flags in
 foldFNegIntoConstant (#94148)

Fix #93769

alive2: https://alive2.llvm.org/ce/z/MHShQY
---
 .../InstCombine/InstCombineAddSub.cpp         |  10 +-
 llvm/test/Transforms/InstCombine/fneg.ll      | 166 +++++++++++++++++-
 llvm/test/Transforms/InstCombine/fsub.ll      |   2 +-
 3 files changed, 173 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index fc7dd302b27a..f0f709bb16d8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2857,8 +2857,14 @@ static Instruction *foldFNegIntoConstant(Instruction &I, const DataLayout &DL) {
   // Fold negation into constant operand.
   // -(X * C) --> X * (-C)
   if (match(FNegOp, m_FMul(m_Value(X), m_Constant(C))))
-    if (Constant *NegC = ConstantFoldUnaryOpOperand(Instruction::FNeg, C, DL))
-      return BinaryOperator::CreateFMulFMF(X, NegC, &I);
+    if (Constant *NegC = ConstantFoldUnaryOpOperand(Instruction::FNeg, C, DL)) {
+      FastMathFlags FNegF = I.getFastMathFlags();
+      FastMathFlags OpF = FNegOp->getFastMathFlags();
+      FastMathFlags FMF = FastMathFlags::unionValue(FNegF, OpF) |
+                          FastMathFlags::intersectRewrite(FNegF, OpF);
+      FMF.setNoInfs(FNegF.noInfs() && OpF.noInfs());
+      return BinaryOperator::CreateFMulFMF(X, NegC, FMF);
+    }
   // -(X / C) --> X / (-C)
   if (match(FNegOp, m_FDiv(m_Value(X), m_Constant(C))))
     if (Constant *NegC = ConstantFoldUnaryOpOperand(Instruction::FNeg, C, DL))
diff --git a/llvm/test/Transforms/InstCombine/fneg.ll b/llvm/test/Transforms/InstCombine/fneg.ll
index a9d1b9a4ab83..39117f56fa4e 100644
--- a/llvm/test/Transforms/InstCombine/fneg.ll
+++ b/llvm/test/Transforms/InstCombine/fneg.ll
@@ -42,7 +42,7 @@ define float @fmul_fneg(float %x) {
 
 define float @fmul_fsub_fmf(float %x) {
 ; CHECK-LABEL: @fmul_fsub_fmf(
-; CHECK-NEXT:    [[R:%.*]] = fmul reassoc nsz float [[X:%.*]], -4.200000e+01
+; CHECK-NEXT:    [[R:%.*]] = fmul nsz float [[X:%.*]], -4.200000e+01
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %m = fmul float %x, 42.0
@@ -52,7 +52,7 @@ define float @fmul_fsub_fmf(float %x) {
 
 define float @fmul_fneg_fmf(float %x) {
 ; CHECK-LABEL: @fmul_fneg_fmf(
-; CHECK-NEXT:    [[R:%.*]] = fmul reassoc nsz float [[X:%.*]], -4.200000e+01
+; CHECK-NEXT:    [[R:%.*]] = fmul nsz float [[X:%.*]], -4.200000e+01
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %m = fmul float %x, 42.0
@@ -1142,4 +1142,166 @@ define <vscale x 2 x double> @test_fneg_select_svec_3(<vscale x 2 x i1> %cond, <
   ret <vscale x 2 x double> %2
 }
 
+define float @test_fneg_ninf_mul_with_anyzero(float %a) {
+; CHECK-LABEL: @test_fneg_ninf_mul_with_anyzero(
+; CHECK-NEXT:    [[F:%.*]] = fmul float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret float [[F]]
+;
+  %mul = fmul float %a, 0.0
+  %f = fneg ninf float %mul
+  ret float %f
+}
+
+define float @test_fsub_ninf_mul_with_anyzero(float %a) {
+; CHECK-LABEL: @test_fsub_ninf_mul_with_anyzero(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nsz float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul nsz float %a, 0.000000
+  %f2 = fsub ninf float -0.000000, %f1
+  ret float %f2
+}
+
+define float @test_fneg_nnan_mul_with_anyzero(float %a) {
+; CHECK-LABEL: @test_fneg_nnan_mul_with_anyzero(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg nnan float [[A:%.*]]
+; CHECK-NEXT:    [[F2:%.*]] = call nnan float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul ninf float %a, 0.000000
+  %f2 = fneg nnan float %f1
+  ret float %f2
+}
+
+define float @test_fneg_nsz_mul_with_anyzero(float %a) {
+; CHECK-LABEL: @test_fneg_nsz_mul_with_anyzero(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nsz float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul ninf float %a, 0.000000
+  %f2 = fneg nsz float %f1
+  ret float %f2
+}
+
+define float @test_fneg_ninf_mul_nnan_with_const(float %a) {
+; CHECK-LABEL: @test_fneg_ninf_mul_nnan_with_const(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg float [[A:%.*]]
+; CHECK-NEXT:    [[F2:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul nnan float %a, 0.000000
+  %f2 = fneg ninf float %f1
+  ret float %f2
+}
+
+define float @test_fneg_ninf_mul_nsz_with_const(float %a) {
+; CHECK-LABEL: @test_fneg_ninf_mul_nsz_with_const(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nsz float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul nsz float %a, 0.000000
+  %f2 = fneg ninf float %f1
+  ret float %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_nnan_ninf_with_vec_const(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_nnan_ninf_with_vec_const(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nnan <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul nnan <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_nsz_ninf_with_vec_const(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_nsz_ninf_with_vec_const(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nsz <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul nsz <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_ninf_nnan_mul_with_vec_const(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_ninf_nnan_mul_with_vec_const(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nnan <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg nnan ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_nnan_ninf_with_vec_const2(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_nnan_ninf_with_vec_const2(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nnan ninf <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul ninf <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg nnan ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_reassoc_ninf_with_vec_const1(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_reassoc_ninf_with_vec_const1(
+; CHECK-NEXT:    [[F2:%.*]] = fmul <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul reassoc <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_reassoc_ninf_with_vec_const2(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_reassoc_ninf_with_vec_const2(
+; CHECK-NEXT:    [[F2:%.*]] = fmul ninf <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul ninf <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg reassoc ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_reassoc_ninf_with_vec_const3(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_reassoc_ninf_with_vec_const3(
+; CHECK-NEXT:    [[F2:%.*]] = fmul reassoc <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul reassoc <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg reassoc ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_contract_ninf_with_vec_const1(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_contract_ninf_with_vec_const1(
+; CHECK-NEXT:    [[F2:%.*]] = fmul <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul contract <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_contract_ninf_with_vec_const2(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_contract_ninf_with_vec_const2(
+; CHECK-NEXT:    [[F2:%.*]] = fmul ninf <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul ninf <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg contract ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_contract_ninf_with_vec_const3(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_contract_ninf_with_vec_const3(
+; CHECK-NEXT:    [[F2:%.*]] = fmul contract <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul contract <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg contract ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
 !0 = !{}
diff --git a/llvm/test/Transforms/InstCombine/fsub.ll b/llvm/test/Transforms/InstCombine/fsub.ll
index cffc63405ddc..28cee50d72c1 100644
--- a/llvm/test/Transforms/InstCombine/fsub.ll
+++ b/llvm/test/Transforms/InstCombine/fsub.ll
@@ -98,7 +98,7 @@ define float @sub_sub_nsz(float %x, float %y, float %z) {
 
 define float @sub_add_neg_x(float %x, float %y) {
 ; CHECK-LABEL: @sub_add_neg_x(
-; CHECK-NEXT:    [[R:%.*]] = fmul reassoc nsz float [[X:%.*]], -5.000000e+00
+; CHECK-NEXT:    [[R:%.*]] = fmul nsz float [[X:%.*]], -5.000000e+00
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %mul = fmul float %x, 5.000000e+00

From 20d5d09e99188dfc7df6e4e4f1c37512e0ab318e Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Thu, 12 Jun 2025 11:37:25 +0100
Subject: [PATCH 0119/1322] [compiler-rt] remove unused default in compiler-rt
 lit tests (#143738)

In https://github.com/llvm/llvm-project/pull/143183 was mistakenly added
a default value to `python_root_dir` in lit tests of compiler-rt.

This is unused by the lit tests of compiler-rt, as it was meant to be
used by `lldb`.

This patch removes this change.
---
 compiler-rt/test/lit.common.configured.in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in
index 8ca47a8df5ae..04d1a4df5a54 100644
--- a/compiler-rt/test/lit.common.configured.in
+++ b/compiler-rt/test/lit.common.configured.in
@@ -25,7 +25,6 @@ set_default("gold_executable", "@GOLD_EXECUTABLE@")
 set_default("clang", "@COMPILER_RT_RESOLVED_TEST_COMPILER@")
 set_default("compiler_id", "@COMPILER_RT_TEST_COMPILER_ID@")
 set_default("python_executable", "@Python3_EXECUTABLE@")
-set_default("python_root_dir", "@Python3_ROOT_DIR@")
 set_default("compiler_rt_debug", @COMPILER_RT_DEBUG_PYBOOL@)
 set_default("compiler_rt_intercept_libdispatch", @COMPILER_RT_INTERCEPT_LIBDISPATCH_PYBOOL@)
 set_default("compiler_rt_output_dir", "@COMPILER_RT_RESOLVED_OUTPUT_DIR@")

From fe28ea37b640ea4842583df3b89e08877220fb8e Mon Sep 17 00:00:00 2001
From: hev <wangrui@loongson.cn>
Date: Thu, 12 Jun 2025 18:39:16 +0800
Subject: [PATCH 0120/1322] [LoongArch] Add demanded bits support for
 [X]VMSKLTZ (#143528)

This patch adds a DAG combine hook for the [X]VMSKLTZ nodes to simplify
their input when possible. It also implements target-specific logic in
SimplifyDemandedBitsForTargetNode to optimize away unnecessary
computations when only a subset of the sign bits in the vector results
is actually used.
---
 .../LoongArch/LoongArchISelLowering.cpp       | 73 +++++++++++++++++++
 .../Target/LoongArch/LoongArchISelLowering.h  |  6 ++
 llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll | 15 +---
 llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll   | 16 ----
 4 files changed, 82 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index b869ad25e785..99dae6ec3eb0 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -5634,6 +5634,21 @@ static SDValue performMOVFR2GR_SCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue performVMSKLTZCombine(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const LoongArchSubtarget &Subtarget) {
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumBits = VT.getScalarSizeInBits();
+
+  // Simplify the inputs.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  APInt DemandedMask(APInt::getAllOnes(NumBits));
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -5658,6 +5673,9 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     return performMOVGR2FR_WCombine(N, DAG, DCI, Subtarget);
   case LoongArchISD::MOVFR2GR_S_LA64:
     return performMOVFR2GR_SCombine(N, DAG, DCI, Subtarget);
+  case LoongArchISD::VMSKLTZ:
+  case LoongArchISD::XVMSKLTZ:
+    return performVMSKLTZCombine(N, DAG, DCI, Subtarget);
   }
   return SDValue();
 }
@@ -8192,3 +8210,58 @@ unsigned LoongArchTargetLowering::getNumRegistersForCallingConv(
 
   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
 }
+
+bool LoongArchTargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &OriginalDemandedBits,
+    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+    unsigned Depth) const {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = OriginalDemandedBits.getBitWidth();
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  default:
+    break;
+  case LoongArchISD::VMSKLTZ:
+  case LoongArchISD::XVMSKLTZ: {
+    SDValue Src = Op.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+    unsigned SrcBits = SrcVT.getScalarSizeInBits();
+    unsigned NumElts = SrcVT.getVectorNumElements();
+
+    // If we don't need the sign bits at all just return zero.
+    if (OriginalDemandedBits.countr_zero() >= NumElts)
+      return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+    // Only demand the vector elements of the sign bits we need.
+    APInt KnownUndef, KnownZero;
+    APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
+    if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+                                   TLO, Depth + 1))
+      return true;
+
+    Known.Zero = KnownZero.zext(BitWidth);
+    Known.Zero.setHighBits(BitWidth - NumElts);
+
+    // [X]VMSKLTZ only uses the MSB from each vector element.
+    KnownBits KnownSrc;
+    APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
+    if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
+                             Depth + 1))
+      return true;
+
+    if (KnownSrc.One[SrcBits - 1])
+      Known.One.setLowBits(NumElts);
+    else if (KnownSrc.Zero[SrcBits - 1])
+      Known.Zero.setLowBits(NumElts);
+
+    // Attempt to avoid multi-use ops if we don't need anything from it.
+    if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+            Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
+    return false;
+  }
+  }
+
+  return TargetLowering::SimplifyDemandedBitsForTargetNode(
+      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 53e3f1adb8d2..79aa89726191 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -314,6 +314,12 @@ public:
   bool isFPImmVLDILegal(const APFloat &Imm, EVT VT) const;
   LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
 
+  bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits,
+                                         const APInt &DemandedElts,
+                                         KnownBits &Known,
+                                         TargetLoweringOpt &TLO,
+                                         unsigned Depth) const override;
+
 private:
   /// Target-specific function used to lower LoongArch calling conventions.
   typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI,
diff --git a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
index 7e015852e0ab..5a861be95977 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
@@ -383,9 +383,8 @@ define i8 @xvmsk_eq_vsel_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
 ; CHECK-LABEL: xvmsk_eq_vsel_slt_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvseq.w $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvslti.w $xr1, $xr2, 0
-; CHECK-NEXT:    xvrepli.b $xr2, -1
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr2, $xr0
+; CHECK-NEXT:    xvrepli.b $xr1, -1
+; CHECK-NEXT:    xvbitsel.v $xr0, $xr2, $xr1, $xr0
 ; CHECK-NEXT:    xvmskltz.w $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
@@ -408,8 +407,7 @@ define i8 @xvmsk_sel_eq_or_eq_or_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
 ; CHECK-NEXT:    xvreplgr2vr.w $xr4, $a0
 ; CHECK-NEXT:    xvand.v $xr2, $xr2, $xr4
 ; CHECK-NEXT:    xvseq.w $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvslti.w $xr1, $xr3, 0
-; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvor.v $xr0, $xr3, $xr0
 ; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr2
 ; CHECK-NEXT:    xvmskltz.w $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
@@ -530,7 +528,6 @@ define i8 @xvmsk_eq_v2i64_concat_poison(<2 x i64> %vec) {
 ; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 0
 ; CHECK-NEXT:    vpackev.h $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 15
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 15
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -558,7 +555,6 @@ define i8 @xvmsk_ne_v4i32_concat_poison(<4 x i32> %vec) {
 ; CHECK-NEXT:    st.h $a0, $sp, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 15
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 15
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -586,7 +582,6 @@ define i8 @xvmsk_ogt_v4f64_concat_poison(<4 x double> %vec) {
 ; CHECK-NEXT:    st.h $a0, $sp, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 15
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 15
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -601,7 +596,6 @@ define i32 @xvmsk_trunc_i8(<32 x i8> %a) {
 ; CHECK-LABEL: xvmsk_trunc_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvslli.b $xr0, $xr0, 7
-; CHECK-NEXT:    xvsrai.b $xr0, $xr0, 7
 ; CHECK-NEXT:    xvmskltz.b $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
@@ -616,7 +610,6 @@ define i16 @xvmsk_trunc_i16(<16 x i16> %a) {
 ; CHECK-LABEL: xvmsk_trunc_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvslli.h $xr0, $xr0, 15
-; CHECK-NEXT:    xvsrai.h $xr0, $xr0, 15
 ; CHECK-NEXT:    xvmskltz.h $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
@@ -631,7 +624,6 @@ define i8 @xvmsk_trunc_i32(<8 x i32> %a) {
 ; CHECK-LABEL: xvmsk_trunc_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvslli.w $xr0, $xr0, 31
-; CHECK-NEXT:    xvsrai.w $xr0, $xr0, 31
 ; CHECK-NEXT:    xvmskltz.w $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
@@ -646,7 +638,6 @@ define i4 @xvmsk_trunc_i64(<4 x i64> %a) {
 ; CHECK-LABEL: xvmsk_trunc_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvslli.d $xr0, $xr0, 63
-; CHECK-NEXT:    xvsrai.d $xr0, $xr0, 63
 ; CHECK-NEXT:    xvmskltz.d $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
index d8098ccc9328..0ee30120f77a 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
@@ -181,7 +181,6 @@ define i2 @vmsk_sgt_v2i8(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 56
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 56
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -197,7 +196,6 @@ define i2 @vmsk_sgt_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 48
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 48
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -212,7 +210,6 @@ define i2 @vmsk_sgt_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-NEXT:    vslt.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 16
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -252,7 +249,6 @@ define i4 @vmsk_sgt_v4i8(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 24
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -267,7 +263,6 @@ define i4 @vmsk_sgt_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-NEXT:    vslt.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -306,7 +301,6 @@ define i8 @vmsk_sgt_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-NEXT:    vslt.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 8
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -349,7 +343,6 @@ define i2 @vmsk_sgt_and_sgt_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8>
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 56
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 56
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -369,7 +362,6 @@ define i2 @vmsk_sgt_and_sgt_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 48
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 48
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -388,7 +380,6 @@ define i2 @vmsk_sgt_and_sgt_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 ; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 16
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -440,7 +431,6 @@ define i4 @vmsk_sgt_and_sgt_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
 ; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 24
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -459,7 +449,6 @@ define i4 @vmsk_sgt_and_sgt_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x
 ; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -510,7 +499,6 @@ define i8 @vmsk_sgt_and_sgt_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8>
 ; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 8
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -557,7 +545,6 @@ define i16 @vmsk_trunc_i8(<16 x i8> %a) {
 ; CHECK-LABEL: vmsk_trunc_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vslli.b $vr0, $vr0, 7
-; CHECK-NEXT:    vsrai.b $vr0, $vr0, 7
 ; CHECK-NEXT:    vmskltz.b $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -570,7 +557,6 @@ define i8 @vmsk_trunc_i16(<8 x i16> %a) {
 ; CHECK-LABEL: vmsk_trunc_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 15
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 15
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -583,7 +569,6 @@ define i4 @vmsk_trunc_i32(<4 x i32> %a) {
 ; CHECK-LABEL: vmsk_trunc_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 31
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 31
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -596,7 +581,6 @@ define i2 @vmsk_trunc_i64(<2 x i64> %a) {
 ; CHECK-LABEL: vmsk_trunc_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 63
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 63
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret

From 97ac6483aaead89897d9bda8a12f1f4c11fad621 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Thu, 12 Jun 2025 11:51:58 +0100
Subject: [PATCH 0121/1322] [DebugInfo][RemoveDIs] Delete debug-info-format
 flag (#143746)

This flag was used to let us incrementally introduce debug records
into LLVM, however everything is now using records. It serves no
purpose now, so delete it.
---
 llvm/include/llvm/IR/BasicBlock.h             |   9 --
 llvm/include/llvm/IR/Function.h               |   9 --
 llvm/include/llvm/IR/Module.h                 |  20 ---
 llvm/lib/AsmParser/LLParser.cpp               |   4 -
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     |   8 --
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |   3 +-
 llvm/lib/IR/BasicBlock.cpp                    |  35 +----
 llvm/lib/IR/Core.cpp                          |   8 +-
 llvm/lib/IR/Function.cpp                      |  18 +--
 llvm/lib/IR/Instruction.cpp                   |  13 +-
 llvm/lib/IR/Module.cpp                        |   4 +-
 llvm/lib/IR/Verifier.cpp                      |  15 +--
 llvm/lib/LTO/LTO.cpp                          |   4 +-
 llvm/lib/Linker/IRMover.cpp                   |   2 -
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |   1 -
 .../AMDGPU/AMDGPUPreloadKernelArguments.cpp   |   1 -
 .../AMDGPU/AMDGPURewriteOutArguments.cpp      |   2 -
 .../WebAssemblyAddMissingPrototypes.cpp       |   1 -
 llvm/lib/Transforms/IPO/ArgumentPromotion.cpp |   1 -
 llvm/lib/Transforms/IPO/Attributor.cpp        |   6 -
 .../IPO/DeadArgumentElimination.cpp           |   2 -
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp   |   2 -
 llvm/lib/Transforms/IPO/MergeFunctions.cpp    |   2 -
 llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 120 +-----------------
 llvm/lib/Transforms/Utils/CloneFunction.cpp   |   7 -
 llvm/lib/Transforms/Utils/CloneModule.cpp     |   1 -
 llvm/lib/Transforms/Utils/CodeExtractor.cpp   |   5 -
 .../Transforms/Utils/LoopRotationUtils.cpp    |   6 +-
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |  12 +-
 llvm/tools/llvm-as/llvm-as.cpp                |   1 -
 llvm/tools/llvm-dis/llvm-dis.cpp              |   1 -
 llvm/tools/llvm-link/llvm-link.cpp            |   8 +-
 llvm/unittests/IR/IRBuilderTest.cpp           |  11 +-
 mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp  |   3 +-
 mlir/lib/Target/LLVMIR/ModuleTranslation.cpp  |   5 +-
 35 files changed, 31 insertions(+), 319 deletions(-)

diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 07444cd6779e..c24f01fe26cc 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -63,9 +63,6 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
 public:
   using InstListType = SymbolTableList<Instruction, ilist_iterator_bits<true>,
                                        ilist_parent<BasicBlock>>;
-  /// Flag recording whether or not this block stores debug-info in the form
-  /// of intrinsic instructions (false) or non-instruction records (true).
-  bool IsNewDbgInfoFormat;
 
 private:
   // Allow Function to renumber blocks.
@@ -95,12 +92,6 @@ public:
   /// IsNewDbgInfoFormat = false.
   LLVM_ABI void convertFromNewDbgValues();
 
-  /// Ensure the block is in "old" dbg.value format (\p NewFlag == false) or
-  /// in the new format (\p NewFlag == true), converting to the desired format
-  /// if necessary.
-  LLVM_ABI void setIsNewDbgInfoFormat(bool NewFlag);
-  LLVM_ABI void setNewDbgInfoFormatFlag(bool NewFlag);
-
   unsigned getNumber() const {
     assert(getParent() && "only basic blocks in functions have valid numbers");
     return Number;
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index f24d03635731..c361be3e752a 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -111,11 +111,6 @@ private:
   friend class SymbolTableListTraits<Function>;
 
 public:
-  /// Is this function using intrinsics to record the position of debugging
-  /// information, or non-intrinsic records? See IsNewDbgInfoFormat in
-  /// \ref BasicBlock.
-  bool IsNewDbgInfoFormat;
-
   /// hasLazyArguments/CheckLazyArguments - The argument list of a function is
   /// built on demand, so that the list isn't allocated until the first client
   /// needs it.  The hasLazyArguments predicate returns true if the arg list
@@ -130,9 +125,6 @@ public:
   /// \see BasicBlock::convertFromNewDbgValues.
   void convertFromNewDbgValues();
 
-  void setIsNewDbgInfoFormat(bool NewVal);
-  void setNewDbgInfoFormatFlag(bool NewVal);
-
 private:
   friend class TargetLibraryInfoImpl;
 
@@ -760,7 +752,6 @@ public:
   /// to the newly inserted BB.
   Function::iterator insert(Function::iterator Position, BasicBlock *BB) {
     Function::iterator FIt = BasicBlocks.insert(Position, BB);
-    BB->setIsNewDbgInfoFormat(IsNewDbgInfoFormat);
     return FIt;
   }
 
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index 7a26efb74b32..f4420f460741 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -215,11 +215,6 @@ private:
 /// @name Constructors
 /// @{
 public:
-  /// Is this Module using intrinsics to record the position of debugging
-  /// information, or non-intrinsic records? See IsNewDbgInfoFormat in
-  /// \ref BasicBlock.
-  bool IsNewDbgInfoFormat;
-
   /// Used when printing this module in the new debug info format; removes all
   /// declarations of debug intrinsics that are replaced by non-intrinsic
   /// records in the new format.
@@ -230,7 +225,6 @@ public:
     for (auto &F : *this) {
       F.convertToNewDbgValues();
     }
-    IsNewDbgInfoFormat = true;
   }
 
   /// \see BasicBlock::convertFromNewDbgValues.
@@ -238,20 +232,6 @@ public:
     for (auto &F : *this) {
       F.convertFromNewDbgValues();
     }
-    IsNewDbgInfoFormat = false;
-  }
-
-  void setIsNewDbgInfoFormat(bool UseNewFormat) {
-    if (UseNewFormat && !IsNewDbgInfoFormat)
-      convertToNewDbgValues();
-    else if (!UseNewFormat && IsNewDbgInfoFormat)
-      convertFromNewDbgValues();
-  }
-  void setNewDbgInfoFormatFlag(bool NewFlag) {
-    for (auto &F : *this) {
-      F.setNewDbgInfoFormatFlag(NewFlag);
-    }
-    IsNewDbgInfoFormat = NewFlag;
   }
 
   /// The Module constructor. Note that there is no default constructor. You
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 5c007dcf0022..926dc6211eb8 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -441,8 +441,6 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
   UpgradeNVVMAnnotations(*M);
   UpgradeSectionAttributes(*M);
 
-  M->setIsNewDbgInfoFormat(true);
-
   if (!Slots)
     return false;
   // Initialize the slot mapping.
@@ -6906,8 +6904,6 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
       if (SeenOldDbgInfoFormat)
         return error(Lex.getLoc(), "debug record should not appear in a module "
                                    "containing debug info intrinsics");
-      if (!SeenNewDbgInfoFormat)
-        M->setNewDbgInfoFormatFlag(true);
       SeenNewDbgInfoFormat = true;
       Lex.Lex();
 
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 31129b7e5cf7..fde934fbb3cf 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -4479,10 +4479,6 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
 Error BitcodeReader::parseModule(uint64_t ResumeBit,
                                  bool ShouldLazyLoadMetadata,
                                  ParserCallbacks Callbacks) {
-  // Don't allow modules to use debug-intrinsics: autoupgrading them is now
-  // mandatory.
-  TheModule->IsNewDbgInfoFormat = true;
-
   this->ValueTypeCallback = std::move(Callbacks.ValueType);
   if (ResumeBit) {
     if (Error JumpFailed = Stream.JumpToBit(ResumeBit))
@@ -6994,10 +6990,6 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
   if (Error JumpFailed = Stream.JumpToBit(DFII->second))
     return JumpFailed;
 
-  // Regardless of the debug info format we want to end up in, we need
-  // IsNewDbgInfoFormat=true to construct any debug records seen in the bitcode.
-  F->IsNewDbgInfoFormat = true;
-
   if (Error Err = parseFunctionBody(F))
     return Err;
   F->setIsMaterializable(false);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 32348a899683..3792b456c836 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -3335,8 +3335,7 @@ class TypePromotionTransaction {
 
       // Record where we would have to re-insert the instruction in the sequence
       // of DbgRecords, if we ended up reinserting.
-      if (BB->IsNewDbgInfoFormat)
-        BeforeDbgRecord = Inst->getDbgReinsertionPosition();
+      BeforeDbgRecord = Inst->getDbgReinsertionPosition();
 
       if (HasPrevInstruction) {
         Point.PrevInst = std::prev(Inst->getIterator());
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 62a75313bb17..8b3e91750f86 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -52,8 +52,6 @@ DbgMarker *BasicBlock::createMarker(InstListType::iterator It) {
 }
 
 void BasicBlock::convertToNewDbgValues() {
-  IsNewDbgInfoFormat = true;
-
   // Iterate over all instructions in the instruction list, collecting debug
   // info intrinsics and converting them to DbgRecords. Once we find a "real"
   // instruction, attach all those DbgRecords to a DbgMarker in that
@@ -91,7 +89,6 @@ void BasicBlock::convertToNewDbgValues() {
 
 void BasicBlock::convertFromNewDbgValues() {
   invalidateOrders();
-  IsNewDbgInfoFormat = false;
 
   // Iterate over the block, finding instructions annotated with DbgMarkers.
   // Convert any attached DbgRecords to debug intrinsics and insert ahead of the
@@ -126,16 +123,6 @@ void BasicBlock::dumpDbgValues() const {
 }
 #endif
 
-void BasicBlock::setIsNewDbgInfoFormat(bool NewFlag) {
-  if (NewFlag && !IsNewDbgInfoFormat)
-    convertToNewDbgValues();
-  else if (!NewFlag && IsNewDbgInfoFormat)
-    convertFromNewDbgValues();
-}
-void BasicBlock::setNewDbgInfoFormatFlag(bool NewFlag) {
-  IsNewDbgInfoFormat = NewFlag;
-}
-
 ValueSymbolTable *BasicBlock::getValueSymbolTable() {
   if (Function *F = getParent())
     return F->getValueSymbolTable();
@@ -157,8 +144,7 @@ template class llvm::SymbolTableListTraits<
 
 BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
                        BasicBlock *InsertBefore)
-    : Value(Type::getLabelTy(C), Value::BasicBlockVal),
-      IsNewDbgInfoFormat(true), Parent(nullptr) {
+    : Value(Type::getLabelTy(C), Value::BasicBlockVal), Parent(nullptr) {
 
   if (NewParent)
     insertInto(NewParent, InsertBefore);
@@ -168,8 +154,6 @@ BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
 
   end().getNodePtr()->setParent(this);
   setName(Name);
-  if (NewParent)
-    setIsNewDbgInfoFormat(NewParent->IsNewDbgInfoFormat);
 }
 
 void BasicBlock::insertInto(Function *NewParent, BasicBlock *InsertBefore) {
@@ -180,8 +164,6 @@ void BasicBlock::insertInto(Function *NewParent, BasicBlock *InsertBefore) {
     NewParent->insert(InsertBefore->getIterator(), this);
   else
     NewParent->insert(NewParent->end(), this);
-
-  setIsNewDbgInfoFormat(NewParent->IsNewDbgInfoFormat);
 }
 
 BasicBlock::~BasicBlock() {
@@ -725,10 +707,6 @@ void BasicBlock::flushTerminatorDbgRecords() {
   // check whether there's anything trailing at the end and move those
   // DbgRecords in front of the terminator.
 
-  // Do nothing if we're not in new debug-info format.
-  if (!IsNewDbgInfoFormat)
-    return;
-
   // If there's no terminator, there's nothing to do.
   Instruction *Term = getTerminator();
   if (!Term)
@@ -765,10 +743,6 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
   // in the iterators whether there was the intention to transfer any debug
   // info.
 
-  // If we're not in "new" debug-info format, do nothing.
-  if (!IsNewDbgInfoFormat)
-    return;
-
   assert(First == Last);
   bool InsertAtHead = Dest.getHeadBit();
   bool ReadFromHead = First.getHeadBit();
@@ -1029,8 +1003,6 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
 
 void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
                         iterator Last) {
-  assert(Src->IsNewDbgInfoFormat == IsNewDbgInfoFormat);
-
 #ifdef EXPENSIVE_CHECKS
   // Check that First is before Last.
   auto FromBBEnd = Src->end();
@@ -1045,9 +1017,7 @@ void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
     return;
   }
 
-  // Handle non-instr debug-info specific juggling.
-  if (IsNewDbgInfoFormat)
-    spliceDebugInfo(Dest, Src, First, Last);
+  spliceDebugInfo(Dest, Src, First, Last);
 
   // And move the instructions.
   getInstList().splice(Dest, Src->getInstList(), First, Last);
@@ -1056,7 +1026,6 @@ void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
 }
 
 void BasicBlock::insertDbgRecordAfter(DbgRecord *DR, Instruction *I) {
-  assert(IsNewDbgInfoFormat);
   assert(I->getParent() == this);
 
   iterator NextIt = std::next(I->getIterator());
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index a7c3a56dcc22..9810f04cc503 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -431,12 +431,12 @@ void LLVMAddModuleFlag(LLVMModuleRef M, LLVMModuleFlagBehavior Behavior,
                            {Key, KeyLen}, unwrap(Val));
 }
 
-LLVMBool LLVMIsNewDbgInfoFormat(LLVMModuleRef M) {
-  return unwrap(M)->IsNewDbgInfoFormat;
-}
+LLVMBool LLVMIsNewDbgInfoFormat(LLVMModuleRef M) { return true; }
 
 void LLVMSetIsNewDbgInfoFormat(LLVMModuleRef M, LLVMBool UseNewFormat) {
-  unwrap(M)->setIsNewDbgInfoFormat(UseNewFormat);
+  if (!UseNewFormat)
+    llvm_unreachable("LLVM no longer supports intrinsic based debug-info");
+  (void)M;
 }
 
 /*--.. Printing modules ....................................................--*/
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 493dec72d45a..28fb81055baf 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -87,32 +87,17 @@ void Function::validateBlockNumbers() const {
 }
 
 void Function::convertToNewDbgValues() {
-  IsNewDbgInfoFormat = true;
   for (auto &BB : *this) {
     BB.convertToNewDbgValues();
   }
 }
 
 void Function::convertFromNewDbgValues() {
-  IsNewDbgInfoFormat = false;
   for (auto &BB : *this) {
     BB.convertFromNewDbgValues();
   }
 }
 
-void Function::setIsNewDbgInfoFormat(bool NewFlag) {
-  if (NewFlag && !IsNewDbgInfoFormat)
-    convertToNewDbgValues();
-  else if (!NewFlag && IsNewDbgInfoFormat)
-    convertFromNewDbgValues();
-}
-void Function::setNewDbgInfoFormatFlag(bool NewFlag) {
-  for (auto &BB : *this) {
-    BB.setNewDbgInfoFormatFlag(NewFlag);
-  }
-  IsNewDbgInfoFormat = NewFlag;
-}
-
 //===----------------------------------------------------------------------===//
 // Argument Implementation
 //===----------------------------------------------------------------------===//
@@ -490,7 +475,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
                    const Twine &name, Module *ParentModule)
     : GlobalObject(Ty, Value::FunctionVal, AllocMarker, Linkage, name,
                    computeAddrSpace(AddrSpace, ParentModule)),
-      NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(true) {
+      NumArgs(Ty->getNumParams()) {
   assert(FunctionType::isValidReturnType(getReturnType()) &&
          "invalid return type");
   setGlobalObjectSubClassData(0);
@@ -505,7 +490,6 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
 
   if (ParentModule) {
     ParentModule->getFunctionList().push_back(this);
-    IsNewDbgInfoFormat = ParentModule->IsNewDbgInfoFormat;
   }
 
   HasLLVMReservedName = getName().starts_with("llvm.");
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 109d516c61b7..1b60caab6c11 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -86,7 +86,7 @@ void Instruction::removeFromParent() {
 }
 
 void Instruction::handleMarkerRemoval() {
-  if (!getParent()->IsNewDbgInfoFormat || !DebugMarker)
+  if (!DebugMarker)
     return;
 
   DebugMarker->removeMarker();
@@ -136,9 +136,6 @@ void Instruction::insertBefore(BasicBlock &BB,
 
   BB.getInstList().insert(InsertPos, this);
 
-  if (!BB.IsNewDbgInfoFormat)
-    return;
-
   // We've inserted "this": if InsertAtHead is set then it comes before any
   // DbgVariableRecords attached to InsertPos. But if it's not set, then any
   // DbgRecords should now come before "this".
@@ -226,7 +223,7 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
 
   // If we've been given the "Preserve" flag, then just move the DbgRecords with
   // the instruction, no more special handling needed.
-  if (BB.IsNewDbgInfoFormat && DebugMarker && !Preserve) {
+  if (DebugMarker && !Preserve) {
     if (I != this->getIterator() || InsertAtHead) {
       // "this" is definitely moving in the list, or it's moving ahead of its
       // attached DbgVariableRecords. Detach any existing DbgRecords.
@@ -238,7 +235,7 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
   // the block splicer, which will do more debug-info things.
   BB.getInstList().splice(I, getParent()->getInstList(), getIterator());
 
-  if (BB.IsNewDbgInfoFormat && !Preserve) {
+  if (!Preserve) {
     DbgMarker *NextMarker = getParent()->getNextMarker(this);
 
     // If we're inserting at point I, and not in front of the DbgRecords
@@ -258,10 +255,6 @@ iterator_range<DbgRecord::self_iterator> Instruction::cloneDebugInfoFrom(
   if (!From->DebugMarker)
     return DbgMarker::getEmptyDbgRecordRange();
 
-  assert(getParent()->IsNewDbgInfoFormat);
-  assert(getParent()->IsNewDbgInfoFormat ==
-         From->getParent()->IsNewDbgInfoFormat);
-
   if (!DebugMarker)
     getParent()->createMarker(this);
 
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 0a47f9861969..37f4a72d8c20 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -71,8 +71,7 @@ template class LLVM_EXPORT_TEMPLATE llvm::SymbolTableListTraits<GlobalIFunc>;
 
 Module::Module(StringRef MID, LLVMContext &C)
     : Context(C), ValSymTab(std::make_unique<ValueSymbolTable>(-1)),
-      ModuleID(std::string(MID)), SourceFileName(std::string(MID)),
-      IsNewDbgInfoFormat(true) {
+      ModuleID(std::string(MID)), SourceFileName(std::string(MID)) {
   Context.addModule(this);
 }
 
@@ -83,7 +82,6 @@ Module &Module::operator=(Module &&Other) {
 
   ModuleID = std::move(Other.ModuleID);
   SourceFileName = std::move(Other.SourceFileName);
-  IsNewDbgInfoFormat = std::move(Other.IsNewDbgInfoFormat);
 
   GlobalList.clear();
   GlobalList.splice(GlobalList.begin(), Other.GlobalList);
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 9ec94a8b8095..1f1041b25973 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2878,11 +2878,6 @@ void Verifier::visitFunction(const Function &F) {
   Check(verifyAttributeCount(Attrs, FT->getNumParams()),
         "Attribute after last parameter!", &F);
 
-  CheckDI(F.IsNewDbgInfoFormat == F.getParent()->IsNewDbgInfoFormat,
-          "Function debug format should match parent module", &F,
-          F.IsNewDbgInfoFormat, F.getParent(),
-          F.getParent()->IsNewDbgInfoFormat);
-
   bool IsIntrinsic = F.isIntrinsic();
 
   // Check function attributes.
@@ -3233,15 +3228,9 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
     Check(I.getParent() == &BB, "Instruction has bogus parent pointer!");
   }
 
-  CheckDI(BB.IsNewDbgInfoFormat == BB.getParent()->IsNewDbgInfoFormat,
-          "BB debug format should match parent function", &BB,
-          BB.IsNewDbgInfoFormat, BB.getParent(),
-          BB.getParent()->IsNewDbgInfoFormat);
-
   // Confirm that no issues arise from the debug program.
-  if (BB.IsNewDbgInfoFormat)
-    CheckDI(!BB.getTrailingDbgRecords(), "Basic Block has trailing DbgRecords!",
-            &BB);
+  CheckDI(!BB.getTrailingDbgRecords(), "Basic Block has trailing DbgRecords!",
+          &BB);
 }
 
 void Verifier::visitTerminator(Instruction &I) {
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index df395073359c..adf995cbc9b1 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -599,9 +599,7 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
                                       const Config &Conf)
     : ParallelCodeGenParallelismLevel(ParallelCodeGenParallelismLevel),
       Ctx(Conf), CombinedModule(std::make_unique<Module>("ld-temp.o", Ctx)),
-      Mover(std::make_unique<IRMover>(*CombinedModule)) {
-  CombinedModule->IsNewDbgInfoFormat = true;
-}
+      Mover(std::make_unique<IRMover>(*CombinedModule)) {}
 
 LTO::ThinLTOState::ThinLTOState(ThinBackend BackendParam)
     : Backend(std::move(BackendParam)), CombinedIndex(/*HaveGVs*/ false) {
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index a449185b2b9b..2a9709050162 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -595,7 +595,6 @@ Function *IRLinker::copyFunctionProto(const Function *SF) {
                              SF->getAddressSpace(), SF->getName(), &DstM);
   F->copyAttributesFrom(SF);
   F->setAttributes(mapAttributeTypes(F->getContext(), F->getAttributes()));
-  F->IsNewDbgInfoFormat = SF->IsNewDbgInfoFormat;
   return F;
 }
 
@@ -1030,7 +1029,6 @@ Error IRLinker::linkFunctionBody(Function &Dst, Function &Src) {
     Dst.setPrologueData(Src.getPrologueData());
   if (Src.hasPersonalityFn())
     Dst.setPersonalityFn(Src.getPersonalityFn());
-  assert(Src.IsNewDbgInfoFormat == Dst.IsNewDbgInfoFormat);
 
   // Copy over the metadata attachments without remapping.
   Dst.copyMetadata(&Src, 0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 0f002b016af0..67db961e60fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2364,7 +2364,6 @@ static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
   bool IsIntrinsic = OldF->isIntrinsic();
   Function *NewF =
       Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace());
-  NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat;
   NewF->copyAttributesFrom(OldF);
   NewF->copyMetadata(OldF, 0);
   NewF->takeName(OldF);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
index 5027705ef61d..984c1ee89309 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
@@ -134,7 +134,6 @@ private:
 
     NF->copyAttributesFrom(&F);
     NF->copyMetadata(&F, 0);
-    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
 
     F.getParent()->getFunctionList().insert(F.getIterator(), NF);
     NF->takeName(&F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index e1008439a33a..4b1f80c77782 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -325,8 +325,6 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
   NewFunc->removeRetAttrs(RetAttrs);
   // TODO: How to preserve metadata?
 
-  NewFunc->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
-
   // Move the body of the function into the new rewritten function, and replace
   // this function with a stub.
   NewFunc->splice(NewFunc->begin(), &F);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
index f02725efc7e0..344a3636b431 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
@@ -135,7 +135,6 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
         Function::Create(NewType, F.getLinkage(), F.getName() + ".fixed_sig");
     NewF->setAttributes(F.getAttributes());
     NewF->removeFnAttr("no-prototype");
-    NewF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
     Replacements.emplace_back(&F, NewF);
   }
 
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 0ec5202b8cfe..262c902d40d2 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -179,7 +179,6 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM,
                                   F->getName());
   NF->copyAttributesFrom(F);
   NF->copyMetadata(F, 0);
-  NF->setIsNewDbgInfoFormat(F->IsNewDbgInfoFormat);
 
   // The new function will have the !dbg metadata copied from the original
   // function. The original function may not be deleted, and dbg metadata need
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index cbdbf9ae1494..050eed376ed3 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -2726,8 +2726,6 @@ void Attributor::createShallowWrapper(Function &F) {
       Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(), F.getName());
   F.setName(""); // set the inside function anonymous
   M.getFunctionList().insert(F.getIterator(), Wrapper);
-  // Flag whether the function is using new-debug-info or not.
-  Wrapper->IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
 
   F.setLinkage(GlobalValue::InternalLinkage);
 
@@ -2808,8 +2806,6 @@ bool Attributor::internalizeFunctions(SmallPtrSetImpl<Function *> &FnSet,
       VMap[&Arg] = &(*NewFArgIt++);
     }
     SmallVector<ReturnInst *, 8> Returns;
-    // Flag whether the function is using new-debug-info or not.
-    Copied->IsNewDbgInfoFormat = F->IsNewDbgInfoFormat;
 
     // Copy the body of the original function to the new one
     CloneFunctionInto(Copied, F, VMap,
@@ -3027,8 +3023,6 @@ ChangeStatus Attributor::rewriteFunctionSignatures(
     OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn);
     NewFn->takeName(OldFn);
     NewFn->copyAttributesFrom(OldFn);
-    // Flag whether the function is using new-debug-info or not.
-    NewFn->IsNewDbgInfoFormat = OldFn->IsNewDbgInfoFormat;
 
     // Patch the pointer to LLVM function in debug info descriptor.
     NewFn->setSubprogram(OldFn->getSubprogram());
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 2e2687a5ff6e..d32b829e2ad7 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -170,7 +170,6 @@ bool DeadArgumentEliminationPass::deleteDeadVarargs(Function &F) {
   NF->setComdat(F.getComdat());
   F.getParent()->getFunctionList().insert(F.getIterator(), NF);
   NF->takeName(&F);
-  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
 
   // Loop over all the callers of the function, transforming the call sites
   // to pass in a smaller number of arguments into the new function.
@@ -884,7 +883,6 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) {
   // it again.
   F->getParent()->getFunctionList().insert(F->getIterator(), NF);
   NF->takeName(F);
-  NF->IsNewDbgInfoFormat = F->IsNewDbgInfoFormat;
 
   // Loop over all the callers of the function, transforming the call sites to
   // pass in a smaller number of arguments into the new function.
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index e25f23107966..16ffd503300e 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -508,7 +508,6 @@ ExpandVariadics::replaceAllUsesWithNewDeclaration(Module &M,
   Function *NF = Function::Create(FTy, F.getLinkage(), F.getAddressSpace());
 
   NF->setName(F.getName() + ".varargs");
-  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
 
   F.getParent()->getFunctionList().insert(F.getIterator(), NF);
 
@@ -550,7 +549,6 @@ ExpandVariadics::deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder,
   NF->setComdat(F.getComdat());
   F.getParent()->getFunctionList().insert(F.getIterator(), NF);
   NF->setName(F.getName() + ".valist");
-  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
 
   AttrBuilder ParamAttrs(Ctx);
 
diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index e5397e94c792..d4555e9435f1 100644
--- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -751,7 +751,6 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
     NewG = Function::Create(G->getFunctionType(), G->getLinkage(),
                             G->getAddressSpace(), "", G->getParent());
     NewG->setComdat(G->getComdat());
-    NewG->IsNewDbgInfoFormat = G->IsNewDbgInfoFormat;
     BB = BasicBlock::Create(F->getContext(), "", NewG);
   }
 
@@ -897,7 +896,6 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
     NewF->takeName(F);
     NewF->setComdat(F->getComdat());
     F->setComdat(nullptr);
-    NewF->IsNewDbgInfoFormat = F->IsNewDbgInfoFormat;
     // Ensure CFI type metadata is propagated to the new function.
     copyMetadataIfPresent(F, NewF, "type");
     copyMetadataIfPresent(F, NewF, "kcfi_type");
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 6608515e1cbb..1feed14b4fed 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -437,45 +437,7 @@ DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
 }
 
 static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
-  if (BB->IsNewDbgInfoFormat)
-    return DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BB);
-
-  SmallVector<DbgValueInst *, 8> ToBeRemoved;
-  SmallDenseSet<DebugVariable> VariableSet;
-  for (auto &I : reverse(*BB)) {
-    if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
-      DebugVariable Key(DVI->getVariable(),
-                        DVI->getExpression(),
-                        DVI->getDebugLoc()->getInlinedAt());
-      auto R = VariableSet.insert(Key);
-      // If the variable fragment hasn't been seen before then we don't want
-      // to remove this dbg intrinsic.
-      if (R.second)
-        continue;
-
-      if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI)) {
-        // Don't delete dbg.assign intrinsics that are linked to instructions.
-        if (!at::getAssignmentInsts(DAI).empty())
-          continue;
-        // Unlinked dbg.assign intrinsics can be treated like dbg.values.
-      }
-
-      // If the same variable fragment is described more than once it is enough
-      // to keep the last one (i.e. the first found since we for reverse
-      // iteration).
-      ToBeRemoved.push_back(DVI);
-      continue;
-    }
-    // Sequence with consecutive dbg.value instrs ended. Clear the map to
-    // restart identifying redundant instructions if case we find another
-    // dbg.value sequence.
-    VariableSet.clear();
-  }
-
-  for (auto &Instr : ToBeRemoved)
-    Instr->eraseFromParent();
-
-  return !ToBeRemoved.empty();
+  return DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BB);
 }
 
 /// Remove redundant dbg.value instructions using a forward scan. This can
@@ -578,49 +540,7 @@ DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
 }
 
 static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
-  if (BB->IsNewDbgInfoFormat)
-    return DbgVariableRecordsRemoveRedundantDbgInstrsUsingForwardScan(BB);
-
-  SmallVector<DbgValueInst *, 8> ToBeRemoved;
-  SmallDenseMap<DebugVariable,
-                std::pair<SmallVector<Value *, 4>, DIExpression *>, 4>
-      VariableMap;
-  for (auto &I : *BB) {
-    if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
-      DebugVariable Key(DVI->getVariable(), std::nullopt,
-                        DVI->getDebugLoc()->getInlinedAt());
-      auto [VMI, Inserted] = VariableMap.try_emplace(Key);
-      auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI);
-      // A dbg.assign with no linked instructions can be treated like a
-      // dbg.value (i.e. can be deleted).
-      bool IsDbgValueKind = (!DAI || at::getAssignmentInsts(DAI).empty());
-
-      // Update the map if we found a new value/expression describing the
-      // variable, or if the variable wasn't mapped already.
-      SmallVector<Value *, 4> Values(DVI->getValues());
-      if (Inserted || VMI->second.first != Values ||
-          VMI->second.second != DVI->getExpression()) {
-        // Use a sentinel value (nullptr) for the DIExpression when we see a
-        // linked dbg.assign so that the next debug intrinsic will never match
-        // it (i.e. always treat linked dbg.assigns as if they're unique).
-        if (IsDbgValueKind)
-          VMI->second = {Values, DVI->getExpression()};
-        else
-          VMI->second = {Values, nullptr};
-        continue;
-      }
-
-      // Don't delete dbg.assign intrinsics that are linked to instructions.
-      if (!IsDbgValueKind)
-        continue;
-      ToBeRemoved.push_back(DVI);
-    }
-  }
-
-  for (auto &Instr : ToBeRemoved)
-    Instr->eraseFromParent();
-
-  return !ToBeRemoved.empty();
+  return DbgVariableRecordsRemoveRedundantDbgInstrsUsingForwardScan(BB);
 }
 
 /// Remove redundant undef dbg.assign intrinsic from an entry block using a
@@ -643,41 +563,7 @@ static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
 /// Possible improvements:
 /// - Keep track of non-overlapping fragments.
 static bool removeUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
-  if (BB->IsNewDbgInfoFormat)
-    return DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BB);
-
-  assert(BB->isEntryBlock() && "expected entry block");
-  SmallVector<DbgAssignIntrinsic *, 8> ToBeRemoved;
-  DenseSet<DebugVariable> SeenDefForAggregate;
-  // Returns the DebugVariable for DVI with no fragment info.
-  auto GetAggregateVariable = [](DbgValueInst *DVI) {
-    return DebugVariable(DVI->getVariable(), std::nullopt,
-                         DVI->getDebugLoc()->getInlinedAt());
-  };
-
-  // Remove undef dbg.assign intrinsics that are encountered before
-  // any non-undef intrinsics from the entry block.
-  for (auto &I : *BB) {
-    DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I);
-    if (!DVI)
-      continue;
-    auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI);
-    bool IsDbgValueKind = (!DAI || at::getAssignmentInsts(DAI).empty());
-    DebugVariable Aggregate = GetAggregateVariable(DVI);
-    if (!SeenDefForAggregate.contains(Aggregate)) {
-      bool IsKill = DVI->isKillLocation() && IsDbgValueKind;
-      if (!IsKill) {
-        SeenDefForAggregate.insert(Aggregate);
-      } else if (DAI) {
-        ToBeRemoved.push_back(DAI);
-      }
-    }
-  }
-
-  for (DbgAssignIntrinsic *DAI : ToBeRemoved)
-    DAI->eraseFromParent();
-
-  return !ToBeRemoved.empty();
+  return DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BB);
 }
 
 bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) {
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 5487dbef8a43..510d9f97bf8c 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -114,7 +114,6 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
                                   const Twine &NameSuffix, Function *F,
                                   ClonedCodeInfo *CodeInfo, bool MapAtoms) {
   BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
-  NewBB->IsNewDbgInfoFormat = BB->IsNewDbgInfoFormat;
   if (BB->hasName())
     NewBB->setName(BB->getName() + NameSuffix);
 
@@ -286,7 +285,6 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
                              const char *NameSuffix, ClonedCodeInfo *CodeInfo,
                              ValueMapTypeRemapper *TypeMapper,
                              ValueMaterializer *Materializer) {
-  NewFunc->setIsNewDbgInfoFormat(OldFunc->IsNewDbgInfoFormat);
   assert(NameSuffix && "NameSuffix cannot be null!");
 
 #ifndef NDEBUG
@@ -391,7 +389,6 @@ Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
   // Create the new function...
   Function *NewF = Function::Create(FTy, F->getLinkage(), F->getAddressSpace(),
                                     F->getName(), F->getParent());
-  NewF->setIsNewDbgInfoFormat(F->IsNewDbgInfoFormat);
 
   // Loop over the arguments, copying the names of the mapped arguments over...
   Function::arg_iterator DestI = NewF->arg_begin();
@@ -525,7 +522,6 @@ void PruningFunctionCloner::CloneBlock(
   BasicBlock *NewBB;
   Twine NewName(BB->hasName() ? Twine(BB->getName()) + NameSuffix : "");
   BBEntry = NewBB = BasicBlock::Create(BB->getContext(), NewName, NewFunc);
-  NewBB->IsNewDbgInfoFormat = BB->IsNewDbgInfoFormat;
 
   // It is only legal to clone a function if a block address within that
   // function is never referenced outside of the function.  Given that, we
@@ -549,9 +545,6 @@ void PruningFunctionCloner::CloneBlock(
   BasicBlock::const_iterator DbgCursor = StartingInst;
   auto CloneDbgRecordsToHere =
       [NewBB, &DbgCursor](Instruction *NewInst, BasicBlock::const_iterator II) {
-        if (!NewBB->IsNewDbgInfoFormat)
-          return;
-
         // Clone debug-info records onto this instruction. Iterate through any
         // source-instructions we've cloned and then subsequently optimised
         // away, so that their debug-info doesn't go missing.
diff --git a/llvm/lib/Transforms/Utils/CloneModule.cpp b/llvm/lib/Transforms/Utils/CloneModule.cpp
index 88e2bfe45d2c..55fb0acd39ea 100644
--- a/llvm/lib/Transforms/Utils/CloneModule.cpp
+++ b/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -61,7 +61,6 @@ std::unique_ptr<Module> llvm::CloneModule(
   New->setDataLayout(M.getDataLayout());
   New->setTargetTriple(M.getTargetTriple());
   New->setModuleInlineAsm(M.getModuleInlineAsm());
-  New->IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
 
   // Loop over all of the global variables, making corresponding globals in the
   // new module.  Here we add them to the VMap and to the new Module.  We
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index c4894c90c127..1210bdf4a1c9 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -792,7 +792,6 @@ void CodeExtractor::severSplitPHINodesOfExits() {
         NewBB = BasicBlock::Create(ExitBB->getContext(),
                                    ExitBB->getName() + ".split",
                                    ExitBB->getParent(), ExitBB);
-        NewBB->IsNewDbgInfoFormat = ExitBB->IsNewDbgInfoFormat;
         SmallVector<BasicBlock *, 4> Preds(predecessors(ExitBB));
         for (BasicBlock *PredBB : Preds)
           if (Blocks.count(PredBB))
@@ -1548,7 +1547,6 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC,
   Function *newFunction = constructFunctionDeclaration(
       inputs, outputs, EntryFreq, oldFunction->getName() + "." + SuffixToUse,
       StructValues, StructTy);
-  newFunction->IsNewDbgInfoFormat = oldFunction->IsNewDbgInfoFormat;
   SmallVector<Value *> NewValues;
 
   emitFunctionBody(inputs, outputs, StructValues, newFunction, StructTy, header,
@@ -1637,7 +1635,6 @@ void CodeExtractor::emitFunctionBody(
   // head of the region, but the entry node of a function cannot have preds.
   BasicBlock *newFuncRoot =
       BasicBlock::Create(Context, "newFuncRoot", newFunction);
-  newFuncRoot->IsNewDbgInfoFormat = oldFunction->IsNewDbgInfoFormat;
 
   // Now sink all instructions which only have non-phi uses inside the region.
   // Group the allocas at the start of the block, so that any bitcast uses of
@@ -1871,10 +1868,8 @@ CallInst *CodeExtractor::emitReplacerCall(
   // This takes place of the original loop
   BasicBlock *codeReplacer =
       BasicBlock::Create(Context, "codeRepl", oldFunction, ReplIP);
-  codeReplacer->IsNewDbgInfoFormat = oldFunction->IsNewDbgInfoFormat;
   BasicBlock *AllocaBlock =
       AllocationBlock ? AllocationBlock : &oldFunction->getEntryBlock();
-  AllocaBlock->IsNewDbgInfoFormat = oldFunction->IsNewDbgInfoFormat;
 
   // Update the entry count of the function.
   if (BFI)
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 693b1f517f8d..6b42503b2e01 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -634,8 +634,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
           // memory access in coroutines.
           !Inst->getFunction()->isPresplitCoroutine()) {
 
-        if (LoopEntryBranch->getParent()->IsNewDbgInfoFormat &&
-            !NextDbgInsts.empty()) {
+        if (!NextDbgInsts.empty()) {
           auto DbgValueRange =
               LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
           RemapDbgRecordRange(M, DbgValueRange, ValueMap,
@@ -664,8 +663,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
       ++NumInstrsDuplicated;
 
-      if (LoopEntryBranch->getParent()->IsNewDbgInfoFormat &&
-          !NextDbgInsts.empty()) {
+      if (!NextDbgInsts.empty()) {
         auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
         RemapDbgRecordRange(M, Range, ValueMap,
                             RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 975ce3bef517..f67a6414ca31 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -4055,13 +4055,11 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
 
   Module *M = BB->getModule();
 
-  if (PredBlock->IsNewDbgInfoFormat) {
-    PredBlock->getTerminator()->cloneDebugInfoFrom(BB->getTerminator());
-    for (DbgVariableRecord &DVR :
-         filterDbgVars(PredBlock->getTerminator()->getDbgRecordRange())) {
-      RemapDbgRecord(M, &DVR, VMap,
-                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-    }
+  PredBlock->getTerminator()->cloneDebugInfoFrom(BB->getTerminator());
+  for (DbgVariableRecord &DVR :
+       filterDbgVars(PredBlock->getTerminator()->getDbgRecordRange())) {
+    RemapDbgRecord(M, &DVR, VMap,
+                   RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
   }
 
   // Now that the Cond was cloned into the predecessor basic block,
diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp
index f42a08e2e9c8..21648674b51f 100644
--- a/llvm/tools/llvm-as/llvm-as.cpp
+++ b/llvm/tools/llvm-as/llvm-as.cpp
@@ -139,7 +139,6 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  M->setIsNewDbgInfoFormat(true);
   M->removeDebugIntrinsicDeclarations();
 
   std::unique_ptr<ModuleSummaryIndex> Index = std::move(ModuleAndIndex.Index);
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index 8937272abb92..422eb855ba2c 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -268,7 +268,6 @@ int main(int argc, char **argv) {
       // All that llvm-dis does is write the assembly to a file.
       if (!DontPrint) {
         if (M) {
-          M->setIsNewDbgInfoFormat(true);
           M->removeDebugIntrinsicDeclarations();
           M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
         }
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index 35b4f0af97f6..22ea54e68358 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -523,16 +523,10 @@ int main(int argc, char **argv) {
 
   if (Verbose)
     errs() << "Writing bitcode...\n";
-  auto SetFormat = [&](bool NewFormat) {
-    Composite->setIsNewDbgInfoFormat(NewFormat);
-    if (NewFormat)
-      Composite->removeDebugIntrinsicDeclarations();
-  };
+  Composite->removeDebugIntrinsicDeclarations();
   if (OutputAssembly) {
-    SetFormat(true);
     Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder);
   } else if (Force || !CheckBitcodeOutputToConsole(Out.os())) {
-    SetFormat(true);
     WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder);
   }
 
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index aadae5287c38..520735dfc326 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -888,14 +888,9 @@ TEST_F(IRBuilderTest, DIBuilder) {
   };
 
   auto ExpectOrder = [&](DbgInstPtr First, BasicBlock::iterator Second) {
-    if (M->IsNewDbgInfoFormat) {
-      EXPECT_TRUE(isa<DbgRecord *>(First));
-      EXPECT_FALSE(Second->getDbgRecordRange().empty());
-      EXPECT_EQ(GetLastDbgRecord(&*Second), cast<DbgRecord *>(First));
-    } else {
-      EXPECT_TRUE(isa<Instruction *>(First));
-      EXPECT_EQ(&*std::prev(Second), cast<Instruction *>(First));
-    }
+    EXPECT_TRUE(isa<DbgRecord *>(First));
+    EXPECT_FALSE(Second->getDbgRecordRange().empty());
+    EXPECT_EQ(GetLastDbgRecord(&*Second), cast<DbgRecord *>(First));
   };
 
   auto RunTest = [&]() {
diff --git a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
index 187e2a9b75a9..2dd0640f794e 100644
--- a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
@@ -70,8 +70,7 @@ void registerFromLLVMIRTranslation() {
           return nullptr;
 
         // Debug records are not currently supported in the LLVM IR translator.
-        if (llvmModule->IsNewDbgInfoFormat)
-          llvmModule->convertFromNewDbgValues();
+        llvmModule->convertFromNewDbgValues();
 
         return translateLLVMIRToModule(
             std::move(llvmModule), context, emitExpensiveWarnings,
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 2702b7aa544d..e5ca147ea98f 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -2231,9 +2231,6 @@ prepareLLVMModule(Operation *m, llvm::LLVMContext &llvmContext,
                   StringRef name) {
   m->getContext()->getOrLoadDialect<LLVM::LLVMDialect>();
   auto llvmModule = std::make_unique<llvm::Module>(name, llvmContext);
-  // ModuleTranslation can currently only construct modules in the old debug
-  // info format, so set the flag accordingly.
-  llvmModule->setNewDbgInfoFormatFlag(false);
   if (auto dataLayoutAttr =
           m->getDiscardableAttr(LLVM::LLVMDialect::getDataLayoutAttrName())) {
     llvmModule->setDataLayout(cast<StringAttr>(dataLayoutAttr).getValue());
@@ -2329,7 +2326,7 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext,
   // Once we've finished constructing elements in the module, we should convert
   // it to use the debug info format desired by LLVM.
   // See https://llvm.org/docs/RemoveDIsDebugInfo.html
-  translator.llvmModule->setIsNewDbgInfoFormat(true);
+  translator.llvmModule->convertToNewDbgValues();
 
   // Add the necessary debug info module flags, if they were not encoded in MLIR
   // beforehand.

From 013034cd0f5ae19ef02fc35a83362874e727f13c Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Thu, 12 Jun 2025 12:04:41 +0100
Subject: [PATCH 0122/1322] Follow-up to 97ac6483aae, squelch an unused lambda
 capture warning

NewBB here was being captured for some code that was deleted in
97ac6483aae, and that leads to some warnings on some compilers.
---
 llvm/lib/Transforms/Utils/CloneFunction.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 510d9f97bf8c..fccb73a36b18 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -544,7 +544,7 @@ void PruningFunctionCloner::CloneBlock(
   // Keep a cursor pointing at the last place we cloned debug-info records from.
   BasicBlock::const_iterator DbgCursor = StartingInst;
   auto CloneDbgRecordsToHere =
-      [NewBB, &DbgCursor](Instruction *NewInst, BasicBlock::const_iterator II) {
+      [&DbgCursor](Instruction *NewInst, BasicBlock::const_iterator II) {
         // Clone debug-info records onto this instruction. Iterate through any
         // source-instructions we've cloned and then subsequently optimised
         // away, so that their debug-info doesn't go missing.

From d698ede748e66f5519cb8481abc2df89a994a059 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Thu, 12 Jun 2025 13:45:19 +0200
Subject: [PATCH 0123/1322] [mlir][amx] Restore conversion interface for AMX
 (#143871)

Restores mistakenly removed AMX interface which ensures that the custom
tile type is converted to its LLVM equivalent within other operations
such as control flow.

Fix after #140559
---
 mlir/include/mlir/Dialect/AMX/Transforms.h    |  3 +++
 mlir/include/mlir/InitAllExtensions.h         |  2 ++
 .../AMX/Transforms/LegalizeForLLVMExport.cpp  | 19 ++++++++++++++++++
 mlir/test/Target/LLVMIR/amx.mlir              | 20 +++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/mlir/include/mlir/Dialect/AMX/Transforms.h b/mlir/include/mlir/Dialect/AMX/Transforms.h
index 4a751d99ceee..7391ec2ff6b1 100644
--- a/mlir/include/mlir/Dialect/AMX/Transforms.h
+++ b/mlir/include/mlir/Dialect/AMX/Transforms.h
@@ -25,6 +25,9 @@ void populateAMXLegalizeForLLVMExportPatterns(LLVMTypeConverter &converter,
 /// intrinsics.
 void configureAMXLegalizeForExportTarget(LLVMConversionTarget &target);
 
+/// Register LLVM conversion interface for AMX dialect.
+void registerConvertAMXToLLVMInterface(DialectRegistry &registry);
+
 } // namespace mlir
 
 #endif // MLIR_DIALECT_AMX_TRANSFORMS_H
diff --git a/mlir/include/mlir/InitAllExtensions.h b/mlir/include/mlir/InitAllExtensions.h
index 7dcbabe8aafa..f356b91b1b6c 100644
--- a/mlir/include/mlir/InitAllExtensions.h
+++ b/mlir/include/mlir/InitAllExtensions.h
@@ -32,6 +32,7 @@
 #include "mlir/Conversion/SCFToEmitC/SCFToEmitC.h"
 #include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
+#include "mlir/Dialect/AMX/Transforms.h"
 #include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
 #include "mlir/Dialect/ArmNeon/TransformOps/ArmNeonVectorTransformOps.h"
 #include "mlir/Dialect/ArmSVE/TransformOps/ArmSVEVectorTransformOps.h"
@@ -85,6 +86,7 @@ inline void registerAllExtensions(DialectRegistry &registry) {
   registerConvertOpenMPToLLVMInterface(registry);
   registerConvertSCFToEmitCInterface(registry);
   ub::registerConvertUBToLLVMInterface(registry);
+  registerConvertAMXToLLVMInterface(registry);
   gpu::registerConvertGpuToLLVMInterface(registry);
   NVVM::registerConvertGpuToNVVMInterface(registry);
   vector::registerConvertVectorToLLVMInterface(registry);
diff --git a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
index 7471dc797e0f..37aebc9fab3e 100644
--- a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
+++ b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
@@ -60,3 +60,22 @@ void mlir::populateAMXLegalizeForLLVMExportPatterns(
 void mlir::configureAMXLegalizeForExportTarget(LLVMConversionTarget &target) {
   target.addIllegalDialect<AMXDialect>();
 }
+
+namespace {
+/// Implement the interface to convert AMX to LLVM.
+struct AMXToLLVMDialectInterface : public ConvertToLLVMPatternInterface {
+  using ConvertToLLVMPatternInterface::ConvertToLLVMPatternInterface;
+
+  void populateConvertToLLVMConversionPatterns(
+      ConversionTarget &target, LLVMTypeConverter &typeConverter,
+      RewritePatternSet &patterns) const final {
+    populateAMXLegalizeForLLVMExportPatterns(typeConverter, patterns);
+  }
+};
+} // namespace
+
+void mlir::registerConvertAMXToLLVMInterface(DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, amx::AMXDialect *dialect) {
+    dialect->addInterfaces<AMXToLLVMDialectInterface>();
+  });
+}
diff --git a/mlir/test/Target/LLVMIR/amx.mlir b/mlir/test/Target/LLVMIR/amx.mlir
index 094475040436..abdf2fe3bd53 100644
--- a/mlir/test/Target/LLVMIR/amx.mlir
+++ b/mlir/test/Target/LLVMIR/amx.mlir
@@ -88,3 +88,23 @@ func.func @amx_tile_muli(%matA: memref<?x?xi8>, %matB: memref<?x?xi8>,
   amx.tile_store %out[%c16, %c16], %res3 : memref<?x?xi8>, !amx.tile<16x16xi32>
   return
 }
+
+// CHECK-LABEL: define void @amx_tile_type_through_cf
+func.func @amx_tile_type_through_cf(%src: memref<?x?xi8>, %out: memref<?x?xi8>,
+    %idx: index, %cond: i1) {
+  cf.cond_br %cond, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  // CHECK: call x86_amx @llvm.x86.tileloadd64.internal
+  %0 = amx.tile_load %src[%idx, %idx] : memref<?x?xi8> into !amx.tile<16x64xi8>
+  cf.br ^bb3(%0 : !amx.tile<16x64xi8>)
+^bb2:  // pred: ^bb0
+  // CHECK: call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
+  %1 = amx.tile_zero : !amx.tile<16x64xi8>
+  cf.br ^bb3(%1 : !amx.tile<16x64xi8>)
+^bb3(%2: !amx.tile<16x64xi8>):  // 2 preds: ^bb1, ^bb2
+  cf.br ^bb4
+^bb4:  // pred: ^bb3
+  // CHECK: call void @llvm.x86.tilestored64.internal
+  amx.tile_store %out[%idx, %idx], %2 : memref<?x?xi8>, !amx.tile<16x64xi8>
+  return
+}

From 0604dc199c019b23746f4a54885ba0c75569cdae Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@chromium.org>
Date: Thu, 12 Jun 2025 13:44:18 +0200
Subject: [PATCH 0124/1322] Revert "[VPlan] Set branch weight metadata on
 middle term in VPlan (NFC) (#143035)"

This caused assertion failures:

  llvm/lib/Transforms/Vectorize/VPlan.h:4021:
  llvm::VPBasicBlock* llvm::VPlan::getMiddleBlock():
  Assertion `LoopRegion && "cannot call the function after vector loop region has been removed"' failed.

See comment on the PR.

> Manage branch weights for the BranchOnCond in the middle block in VPlan.
> This requires updating VPInstruction to inherit from VPIRMetadata, which
> in general makes sense as there are a number of opcodes that could take
> metadata.
>
> There are other branches (part of the skeleton) that also need branch
> weights adding.
>
> PR: https://github.com/llvm/llvm-project/pull/143035

This reverts commit db8d34db26e9ea92c08d6e813eca9cce40c48478.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 48 +++++++----------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 53 +++++++++----------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  6 +--
 3 files changed, 45 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 474f856d2046..8177b76ad5bd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7272,33 +7272,6 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
 }
 
-/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
-/// BranchOnCond recipe.
-static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
-                                              Loop *OrigLoop) {
-  // 4. Adjust branch weight of the branch in the middle block.
-  Instruction *LatchTerm = OrigLoop->getLoopLatch()->getTerminator();
-  if (!hasBranchWeightMD(*LatchTerm))
-    return;
-
-  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
-  auto *MiddleTerm =
-      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
-  // Only add branch metadata if there is a (conditional) terminator.
-  if (!MiddleTerm)
-    return;
-
-  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
-         "must have a BranchOnCond");
-  // Assume that `Count % VectorTripCount` is equally distributed.
-  unsigned TripCount = Plan.getUF() * VF.getKnownMinValue();
-  assert(TripCount > 0 && "trip count should not be zero");
-  MDBuilder MDB(LatchTerm->getContext());
-  MDNode *BranchWeights =
-      MDB.createBranchWeights({1, TripCount - 1}, /*IsExpected=*/false);
-  MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
-}
-
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
@@ -7321,8 +7294,11 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan,
                                             *Legal->getWidestInductionType());
-
-  addBranchWeightToMiddleTerminator(BestVPlan, BestVF, OrigLoop);
+  // Retrieve and store the middle block before dissolving regions. Regions are
+  // dissolved after optimizing for VF and UF, which completely removes unneeded
+  // loop regions first.
+  VPBasicBlock *MiddleVPBB =
+      BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
   VPlanTransforms::dissolveLoopRegions(BestVPlan);
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7465,6 +7441,20 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   ILV.printDebugTracesAtEnd();
 
+  // 4. Adjust branch weight of the branch in the middle block.
+  if (HeaderVPBB) {
+    auto *MiddleTerm =
+        cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
+    if (MiddleTerm->isConditional() &&
+        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+      // Assume that `Count % VectorTripCount` is equally distributed.
+      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+      assert(TripCount > 0 && "trip count should not be zero");
+      const uint32_t Weights[] = {1, TripCount - 1};
+      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+    }
+  }
+
   return ExpandedSCEVs;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 468284168e9c..acc861b99197 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -882,39 +882,11 @@ protected:
   unsigned getUnrollPart(VPUser &U) const;
 };
 
-/// Helper to manage IR metadata for recipes. It filters out metadata that
-/// cannot be propagated.
-class VPIRMetadata {
-  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
-
-public:
-  VPIRMetadata() {}
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I.
-  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
-  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
-
-  /// Copy constructor for cloning.
-  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
-
-  /// Add all metadata to \p I.
-  void applyMetadata(Instruction &I) const;
-
-  void addMetadata(unsigned Kind, MDNode *Node) {
-    Metadata.emplace_back(Kind, Node);
-  }
-};
-
 /// This is a concrete Recipe that models a single VPlan-level instruction.
 /// While as any Recipe it may generate a sequence of IR instructions when
 /// executed, these instructions would always form a single-def expression as
 /// the VPInstruction is also a single def-use vertex.
 class VPInstruction : public VPRecipeWithIRFlags,
-                      public VPIRMetadata,
                       public VPUnrollPartAccessor<1> {
   friend class VPlanSlp;
 
@@ -1004,7 +976,7 @@ public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
                 const Twine &Name = "")
       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
-        VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
+        Opcode(Opcode), Name(Name.str()) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                 const VPIRFlags &Flags, DebugLoc DL = {},
@@ -1296,6 +1268,29 @@ protected:
   const VPRecipeBase *getAsRecipe() const override { return this; }
 };
 
+/// Helper to manage IR metadata for recipes. It filters out metadata that
+/// cannot be propagated.
+class VPIRMetadata {
+  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
+
+public:
+  VPIRMetadata() {}
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I.
+  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
+  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
+
+  /// Copy constructor for cloning.
+  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
+
+  /// Add all metadata to \p I.
+  void applyMetadata(Instruction &I) const;
+};
+
 /// VPWidenRecipe is a recipe for producing a widened instruction using the
 /// opcode and operands of the recipe. This recipe covers most of the
 /// traditional vectorization cases where each recipe transforms into a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8863a3fb4b31..aa6b13c217bd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -410,7 +410,7 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                              const VPIRFlags &Flags, DebugLoc DL,
                              const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
+      Opcode(Opcode), Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
 }
@@ -591,9 +591,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case VPInstruction::BranchOnCond: {
     Value *Cond = State.get(getOperand(0), VPLane(0));
-    auto *Br = createCondBranch(Cond, getParent(), State);
-    applyMetadata(*Br);
-    return Br;
+    return createCondBranch(Cond, getParent(), State);
   }
   case VPInstruction::BranchOnCount: {
     // First create the compare.

From 9f542f14701cdf70023790b206273ae8174e913a Mon Sep 17 00:00:00 2001
From: Ryan Buchner <92571492+bababuck@users.noreply.github.com>
Date: Thu, 12 Jun 2025 05:05:53 -0700
Subject: [PATCH 0125/1322] [RISCV] Add new tests for RISCV zicond extension
 (#143580)

I have a few patches to improve compilation for these tests which I will
be posting as separate MRs.
---
 llvm/test/CodeGen/RISCV/zicond-opts.ll | 289 +++++++++++++++++++++++++
 1 file changed, 289 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/zicond-opts.ll

diff --git a/llvm/test/CodeGen/RISCV/zicond-opts.ll b/llvm/test/CodeGen/RISCV/zicond-opts.ll
new file mode 100644
index 000000000000..f5a25868bd12
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/zicond-opts.ll
@@ -0,0 +1,289 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -O2 -verify-machineinstrs -mattr=+b,+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND
+; RUN: llc -mtriple=riscv64 -O2 -verify-machineinstrs -mattr=+b,+zicond < %s | FileCheck %s -check-prefix=RV64ZICOND
+
+; (and (icmp x. 0, ne), (icmp y, 0, ne)) -> (czero.eqz (icmp x, 0, ne), y)
+define i32 @icmp_and(i64 %x, i64 %y) {
+; RV32ZICOND-LABEL: icmp_and:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    or a2, a2, a3
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    snez a1, a2
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    and a0, a0, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: icmp_and:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    snez a1, a1
+; RV64ZICOND-NEXT:    snez a0, a0
+; RV64ZICOND-NEXT:    and a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+  %3 = icmp ne i64 %y, 0
+  %4 = icmp ne i64 %x, 0
+  %5 = and i1 %4, %3
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+; (and (and (icmp x, 0, ne), (icmp y, 0, ne)), (icmp z, 0, ne)) -> (czero.eqz (czero.eqz (icmp x, 0, ne), y), z)
+define i32 @icmp_and_and(i64 %x, i64 %y, i64 %z) {
+; RV32ZICOND-LABEL: icmp_and_and:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    or a2, a2, a3
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    or a4, a4, a5
+; RV32ZICOND-NEXT:    snez a1, a2
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    and a0, a1, a0
+; RV32ZICOND-NEXT:    snez a1, a4
+; RV32ZICOND-NEXT:    and a0, a1, a0
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: icmp_and_and:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    snez a1, a1
+; RV64ZICOND-NEXT:    snez a0, a0
+; RV64ZICOND-NEXT:    and a0, a1, a0
+; RV64ZICOND-NEXT:    snez a1, a2
+; RV64ZICOND-NEXT:    and a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %4 = icmp ne i64 %y, 0
+  %5 = icmp ne i64 %x, 0
+  %6 = and i1 %4, %5
+  %7 = icmp ne i64 %z, 0
+  %8 = and i1 %7, %6
+  %9 = zext i1 %8 to i32
+  ret i32 %9
+}
+
+; (select cond, x, rotl(x, rot.amt)) -> (rotl x, (czero_nez rot.amt, cond))
+define i64 @rotate_l_nez(i64 %x, i64 %rot.amt, i1 %cond) {
+; RV32ZICOND-LABEL: rotate_l_nez:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    bexti a3, a2, 5
+; RV32ZICOND-NEXT:    not a5, a2
+; RV32ZICOND-NEXT:    czero.nez a6, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a7, a0, a3
+; RV32ZICOND-NEXT:    czero.nez t0, a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a3, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
+; RV32ZICOND-NEXT:    or a6, a7, a6
+; RV32ZICOND-NEXT:    or a3, a3, t0
+; RV32ZICOND-NEXT:    sll a7, a6, a2
+; RV32ZICOND-NEXT:    srli t0, a3, 1
+; RV32ZICOND-NEXT:    sll a2, a3, a2
+; RV32ZICOND-NEXT:    srli a3, a6, 1
+; RV32ZICOND-NEXT:    srl a6, t0, a5
+; RV32ZICOND-NEXT:    srl a3, a3, a5
+; RV32ZICOND-NEXT:    or a5, a7, a6
+; RV32ZICOND-NEXT:    or a2, a2, a3
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
+; RV32ZICOND-NEXT:    czero.nez a3, a5, a4
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: rotate_l_nez:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a2, a2, 1
+; RV64ZICOND-NEXT:    rol a1, a0, a1
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+  %6 = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %rot.amt)
+  %7 = select i1 %cond, i64 %x, i64 %6
+  ret i64 %7
+}
+
+; (select cond, rotl(x, rot.amt), x) -> (rotl x, (czero_eqz rot.amt, cond))
+define i64 @rotate_l_eqz(i64 %x, i64 %rot.amt, i1 %cond) {
+; RV32ZICOND-LABEL: rotate_l_eqz:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    bexti a3, a2, 5
+; RV32ZICOND-NEXT:    not a5, a2
+; RV32ZICOND-NEXT:    czero.nez a6, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a7, a0, a3
+; RV32ZICOND-NEXT:    czero.nez t0, a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a3, a1, a3
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
+; RV32ZICOND-NEXT:    or a6, a7, a6
+; RV32ZICOND-NEXT:    or a3, a3, t0
+; RV32ZICOND-NEXT:    sll a7, a6, a2
+; RV32ZICOND-NEXT:    srli t0, a3, 1
+; RV32ZICOND-NEXT:    sll a2, a3, a2
+; RV32ZICOND-NEXT:    srli a3, a6, 1
+; RV32ZICOND-NEXT:    srl a6, t0, a5
+; RV32ZICOND-NEXT:    srl a3, a3, a5
+; RV32ZICOND-NEXT:    or a5, a7, a6
+; RV32ZICOND-NEXT:    or a2, a2, a3
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
+; RV32ZICOND-NEXT:    czero.eqz a3, a5, a4
+; RV32ZICOND-NEXT:    or a0, a2, a0
+; RV32ZICOND-NEXT:    or a1, a3, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: rotate_l_eqz:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a2, a2, 1
+; RV64ZICOND-NEXT:    rol a1, a0, a1
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV64ZICOND-NEXT:    czero.eqz a1, a1, a2
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %6 = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %rot.amt)
+  %7 = select i1 %cond, i64 %6, i64 %x
+  ret i64 %7
+}
+
+; (select cond, const, t) -> (add (czero_nez t - const, cond), const)
+define i64 @select_imm_reg(i64 %t, i1 %cond) {
+; RV32ZICOND-LABEL: select_imm_reg:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a2, a2, 1
+; RV32ZICOND-NEXT:    li a3, 3
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
+; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: select_imm_reg:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a1, a1, 1
+; RV64ZICOND-NEXT:    li a2, 3
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
+; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %4 = select i1 %cond, i64 3, i64 %t
+  ret i64 %4
+}
+
+; (select cond, t, const) -> (add (czero_eqz t - const, cond), const)
+define i64 @select_reg_imm(i64 %t, i1 %cond) {
+; RV32ZICOND-LABEL: select_reg_imm:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a2, a2, 1
+; RV32ZICOND-NEXT:    li a3, 3
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a2
+; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV32ZICOND-NEXT:    or a0, a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a2
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: select_reg_imm:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a1, a1, 1
+; RV64ZICOND-NEXT:    li a2, 3
+; RV64ZICOND-NEXT:    czero.nez a2, a2, a1
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a1
+; RV64ZICOND-NEXT:    or a0, a0, a2
+; RV64ZICOND-NEXT:    ret
+  %4 = select i1 %cond, i64 %t, i64 3
+  ret i64 %4
+}
+
+; (select cond, -2048, t) -> (xor (czero_nez (xor t, -2048), cond), -2048)
+define i64 @select_imm_reg_neg_2048(i64 %t, i1 %cond) {
+; RV32ZICOND-LABEL: select_imm_reg_neg_2048:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a2, a2, 1
+; RV32ZICOND-NEXT:    li a3, -2048
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
+; RV32ZICOND-NEXT:    neg a2, a2
+; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    or a1, a2, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: select_imm_reg_neg_2048:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a1, a1, 1
+; RV64ZICOND-NEXT:    li a2, -2048
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
+; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %4 = select i1 %cond, i64 -2048, i64 %t
+  ret i64 %4
+}
+
+; (select cond, 2048, t) -> no transform
+define i64 @select_imm_reg_2048(i64 %t, i1 %cond) {
+; RV32ZICOND-LABEL: select_imm_reg_2048:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a2, a2, 1
+; RV32ZICOND-NEXT:    bseti a3, zero, 11
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
+; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: select_imm_reg_2048:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a1, a1, 1
+; RV64ZICOND-NEXT:    bseti a2, zero, 11
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
+; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %4 = select i1 %cond, i64 2048, i64 %t
+  ret i64 %4
+}
+
+; (select cond, (and f, ~x), f) -> (andn f, (czero_eqz x, cond))
+define i64 @test_inv_and_nez(i64 %f, i64 %x, i1 %cond) {
+; RV32ZICOND-LABEL: test_inv_and_nez:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    addi a4, a4, -1
+; RV32ZICOND-NEXT:    orn a3, a4, a3
+; RV32ZICOND-NEXT:    orn a2, a4, a2
+; RV32ZICOND-NEXT:    and a0, a2, a0
+; RV32ZICOND-NEXT:    and a1, a3, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: test_inv_and_nez:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a2, a2, 1
+; RV64ZICOND-NEXT:    andn a1, a0, a1
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %5 = xor i64 %x, -1
+  %6 = select i1 %cond, i64 %5, i64 -1
+  %7 = and i64 %6, %f
+  ret i64 %7
+}
+
+; (select cond, f, (and f, ~x)) -> (andn f, (czero_nez x, cond))
+define i64 @test_inv_and_eqz(i64 %f, i64 %x, i1 %cond) {
+; RV32ZICOND-LABEL: test_inv_and_eqz:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    slli a4, a4, 31
+; RV32ZICOND-NEXT:    srai a4, a4, 31
+; RV32ZICOND-NEXT:    orn a3, a4, a3
+; RV32ZICOND-NEXT:    orn a2, a4, a2
+; RV32ZICOND-NEXT:    and a0, a2, a0
+; RV32ZICOND-NEXT:    and a1, a3, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: test_inv_and_eqz:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a2, a2, 1
+; RV64ZICOND-NEXT:    andn a1, a0, a1
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %5 = xor i64 %x, -1
+  %6 = select i1 %cond, i64 -1, i64 %5
+  %7 = and i64 %6, %f
+  ret i64 %7
+}

From 2ecbfc0beb42abbbd2c3d28bfd576b38c44a5b46 Mon Sep 17 00:00:00 2001
From: Ami-zhang <zhanglimin@loongson.cn>
Date: Thu, 12 Jun 2025 20:11:14 +0800
Subject: [PATCH 0126/1322] [LoongArch] Fix '-mno-lsx' option not disabling
 LASX feature (#143821)

When '-march' with LASX feature and '-mno-lsx' options are used
together, '-mno-lsx' fails to disable LASX, leaving
'HasFeatureLASX=true' and causing incorrect '__loongarch_sx/asx=1' macro
definition.

Fixes https://github.com/loongson-community/discussions/issues/95
---
 clang/lib/Driver/ToolChains/Arch/LoongArch.cpp | 1 +
 clang/test/Preprocessor/init-loongarch.c       | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 3318e498a74f..33a655870b01 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -252,6 +252,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
         Features.push_back("+lsx");
     } else /*-mno-lsx*/ {
       Features.push_back("-lsx");
+      Features.push_back("-lasx");
     }
   }
 
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index ac461b371162..71a266b8a915 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -946,6 +946,10 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
+// RUN: %clang --target=loongarch64 -march=la464 -mno-lsx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
+// RUN: %clang --target=loongarch64 -mno-lsx -march=la464 -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // MNO-LSX-NOT: #define __loongarch_asx
 // MNO-LSX-NOT: #define __loongarch_simd_width
 // MNO-LSX-NOT: #define __loongarch_sx

From bc7fafbeea08bf8cd9a18fa10d3d3bc63f0c45a3 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 14:13:15 +0200
Subject: [PATCH 0127/1322] [AA] Take read-only provenance captures into
 account (#143097)

Update the AA CaptureAnalysis providers to return CaptureComponents, so
we can distinguish between full provenance and read-only provenance
captures.

Use this to restrict "other" memory effects on call from ModRef to Ref.

Ideally we would also apply the same reasoning for escape sources, but
the current API cannot actually convey the necessary information (we can
only say NoAlias or MayAlias, not MayAlias but only via Ref).
---
 llvm/include/llvm/Analysis/AliasAnalysis.h    | 29 ++++---
 llvm/include/llvm/Analysis/CaptureTracking.h  | 26 +++---
 llvm/lib/Analysis/BasicAliasAnalysis.cpp      | 86 +++++++++++--------
 llvm/lib/Analysis/CaptureTracking.cpp         | 11 ++-
 .../Scalar/DeadStoreElimination.cpp           |  3 +-
 llvm/test/Analysis/BasicAA/captures.ll        |  3 +-
 llvm/test/Transforms/GVN/captures.ll          |  3 +-
 7 files changed, 89 insertions(+), 72 deletions(-)

diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h
index 0e736b92e550..b7d1251aeb72 100644
--- a/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -149,23 +149,24 @@ LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, AliasResult AR);
 struct LLVM_ABI CaptureAnalysis {
   virtual ~CaptureAnalysis() = 0;
 
-  /// Check whether Object is not captured before instruction I. If OrAt is
-  /// true, captures by instruction I itself are also considered.
+  /// Return how Object may be captured before instruction I, considering only
+  /// provenance captures. If OrAt is true, captures by instruction I itself
+  /// are also considered.
   ///
   /// If I is nullptr, then captures at any point will be considered.
-  virtual bool isNotCapturedBefore(const Value *Object, const Instruction *I,
-                                   bool OrAt) = 0;
+  virtual CaptureComponents
+  getCapturesBefore(const Value *Object, const Instruction *I, bool OrAt) = 0;
 };
 
 /// Context-free CaptureAnalysis provider, which computes and caches whether an
 /// object is captured in the function at all, but does not distinguish whether
 /// it was captured before or after the context instruction.
 class LLVM_ABI SimpleCaptureAnalysis final : public CaptureAnalysis {
-  SmallDenseMap<const Value *, bool, 8> IsCapturedCache;
+  SmallDenseMap<const Value *, CaptureComponents, 8> IsCapturedCache;
 
 public:
-  bool isNotCapturedBefore(const Value *Object, const Instruction *I,
-                           bool OrAt) override;
+  CaptureComponents getCapturesBefore(const Value *Object, const Instruction *I,
+                                      bool OrAt) override;
 };
 
 /// Context-sensitive CaptureAnalysis provider, which computes and caches the
@@ -176,10 +177,12 @@ class LLVM_ABI EarliestEscapeAnalysis final : public CaptureAnalysis {
   const LoopInfo *LI;
 
   /// Map from identified local object to an instruction before which it does
-  /// not escape, or nullptr if it never escapes. The "earliest" instruction
-  /// may be a conservative approximation, e.g. the first instruction in the
-  /// function is always a legal choice.
-  DenseMap<const Value *, Instruction *> EarliestEscapes;
+  /// not escape (or nullptr if it never escapes) and the possible components
+  /// that may be captured (by any instruction, not necessarily the earliest
+  /// one). The "earliest" instruction may be a conservative approximation,
+  /// e.g. the first instruction in the function is always a legal choice.
+  DenseMap<const Value *, std::pair<Instruction *, CaptureComponents>>
+      EarliestEscapes;
 
   /// Reverse map from instruction to the objects it is the earliest escape for.
   /// This is used for cache invalidation purposes.
@@ -189,8 +192,8 @@ public:
   EarliestEscapeAnalysis(DominatorTree &DT, const LoopInfo *LI = nullptr)
       : DT(DT), LI(LI) {}
 
-  bool isNotCapturedBefore(const Value *Object, const Instruction *I,
-                           bool OrAt) override;
+  CaptureComponents getCapturesBefore(const Value *Object, const Instruction *I,
+                                      bool OrAt) override;
 
   void removeInstruction(Instruction *I);
 };
diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h
index dd6a7f9b14dc..e652bc5a0a5a 100644
--- a/llvm/include/llvm/Analysis/CaptureTracking.h
+++ b/llvm/include/llvm/Analysis/CaptureTracking.h
@@ -95,21 +95,21 @@ namespace llvm {
       function_ref<bool(CaptureComponents)> StopFn = capturesAnything,
       const LoopInfo *LI = nullptr, unsigned MaxUsesToExplore = 0);
 
-  // Returns the 'earliest' instruction that captures \p V in \F. An instruction
-  // A is considered earlier than instruction B, if A dominates B. If 2 escapes
-  // do not dominate each other, the terminator of the common dominator is
-  // chosen. If not all uses can be analyzed, the earliest escape is set to
-  // the first instruction in the function entry block. If \p V does not escape,
-  // nullptr is returned. Note that the caller of the function has to ensure
-  // that the instruction the result value is compared against is not in a
-  // cycle.
+  // Returns the 'earliest' instruction that captures \p V in \F, and which
+  // components may be captured (by any use, not necessarily the earliest one).
+  // An instruction A is considered earlier than instruction B, if A dominates
+  // B. If 2 escapes do not dominate each other, the terminator of the common
+  // dominator is chosen. If not all uses can be analyzed, the earliest escape
+  // is set to the first instruction in the function entry block. If \p V does
+  // not escape, nullptr is returned. Note that the caller of the function has
+  // to ensure that the instruction the result value is compared against is
+  // not in a cycle.
   //
   // Only consider components that are part of \p Mask.
-  LLVM_ABI Instruction *FindEarliestCapture(const Value *V, Function &F,
-                                            bool ReturnCaptures,
-                                            const DominatorTree &DT,
-                                            CaptureComponents Mask,
-                                            unsigned MaxUsesToExplore = 0);
+  LLVM_ABI std::pair<Instruction *, CaptureComponents>
+  FindEarliestCapture(const Value *V, Function &F, bool ReturnCaptures,
+                      const DominatorTree &DT, CaptureComponents Mask,
+                      unsigned MaxUsesToExplore = 0);
 
   /// Capture information for a specific Use.
   struct UseCaptureInfo {
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index f862d6930f54..31611dfe4fd2 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -192,18 +192,20 @@ static bool areBothVScale(const Value *V1, const Value *V2) {
 
 CaptureAnalysis::~CaptureAnalysis() = default;
 
-bool SimpleCaptureAnalysis::isNotCapturedBefore(const Value *Object,
-                                                const Instruction *I,
-                                                bool OrAt) {
+CaptureComponents SimpleCaptureAnalysis::getCapturesBefore(const Value *Object,
+                                                           const Instruction *I,
+                                                           bool OrAt) {
   if (!isIdentifiedFunctionLocal(Object))
-    return false;
+    return CaptureComponents::Provenance;
 
-  auto [CacheIt, Inserted] = IsCapturedCache.insert({Object, false});
+  auto [CacheIt, Inserted] =
+      IsCapturedCache.insert({Object, CaptureComponents::Provenance});
   if (!Inserted)
     return CacheIt->second;
 
-  bool Ret = !capturesAnything(PointerMayBeCaptured(
-      Object, /*ReturnCaptures=*/false, CaptureComponents::Provenance));
+  CaptureComponents Ret = PointerMayBeCaptured(
+      Object, /*ReturnCaptures=*/false, CaptureComponents::Provenance,
+      [](CaptureComponents CC) { return capturesFullProvenance(CC); });
   CacheIt->second = Ret;
   return Ret;
 }
@@ -216,37 +218,44 @@ static bool isNotInCycle(const Instruction *I, const DominatorTree *DT,
          !isPotentiallyReachableFromMany(Succs, BB, nullptr, DT, LI);
 }
 
-bool EarliestEscapeAnalysis::isNotCapturedBefore(const Value *Object,
-                                                 const Instruction *I,
-                                                 bool OrAt) {
+CaptureComponents
+EarliestEscapeAnalysis::getCapturesBefore(const Value *Object,
+                                          const Instruction *I, bool OrAt) {
   if (!isIdentifiedFunctionLocal(Object))
-    return false;
+    return CaptureComponents::Provenance;
 
   auto Iter = EarliestEscapes.try_emplace(Object);
   if (Iter.second) {
-    Instruction *EarliestCapture = FindEarliestCapture(
-        Object, *const_cast<Function *>(DT.getRoot()->getParent()),
-        /*ReturnCaptures=*/false, DT, CaptureComponents::Provenance);
-    if (EarliestCapture)
-      Inst2Obj[EarliestCapture].push_back(Object);
+    std::pair<Instruction *, CaptureComponents> EarliestCapture =
+        FindEarliestCapture(
+            Object, *const_cast<Function *>(DT.getRoot()->getParent()),
+            /*ReturnCaptures=*/false, DT, CaptureComponents::Provenance);
+    if (EarliestCapture.first)
+      Inst2Obj[EarliestCapture.first].push_back(Object);
     Iter.first->second = EarliestCapture;
   }
 
-  // No capturing instruction.
-  if (!Iter.first->second)
-    return true;
+  auto IsNotCapturedBefore = [&]() {
+    // No capturing instruction.
+    Instruction *CaptureInst = Iter.first->second.first;
+    if (!CaptureInst)
+      return true;
 
-  // No context instruction means any use is capturing.
-  if (!I)
-    return false;
-
-  if (I == Iter.first->second) {
-    if (OrAt)
+    // No context instruction means any use is capturing.
+    if (!I)
       return false;
-    return isNotInCycle(I, &DT, LI);
-  }
 
-  return !isPotentiallyReachable(Iter.first->second, I, nullptr, &DT, LI);
+    if (I == CaptureInst) {
+      if (OrAt)
+        return false;
+      return isNotInCycle(I, &DT, LI);
+    }
+
+    return !isPotentiallyReachable(CaptureInst, I, nullptr, &DT, LI);
+  };
+  if (IsNotCapturedBefore())
+    return CaptureComponents::None;
+  return Iter.first->second.second;
 }
 
 void EarliestEscapeAnalysis::removeInstruction(Instruction *I) {
@@ -946,9 +955,14 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
   // As an exception, ignore allocas, as setjmp is not required to preserve
   // non-volatile stores for them.
   if (isModOrRefSet(OtherMR) && !isa<Constant>(Object) && Call != Object &&
-      AAQI.CA->isNotCapturedBefore(Object, Call, /*OrAt=*/false) &&
-      (isa<AllocaInst>(Object) || !Call->hasFnAttr(Attribute::ReturnsTwice)))
-    OtherMR = ModRefInfo::NoModRef;
+      (isa<AllocaInst>(Object) || !Call->hasFnAttr(Attribute::ReturnsTwice))) {
+    CaptureComponents CC =
+        AAQI.CA->getCapturesBefore(Object, Call, /*OrAt=*/false);
+    if (capturesNothing(CC))
+      OtherMR = ModRefInfo::NoModRef;
+    else if (capturesReadProvenanceOnly(CC))
+      OtherMR = ModRefInfo::Ref;
+  }
 
   // Refine the modref info for argument memory. We only bother to do this
   // if ArgMR is not a subset of OtherMR, otherwise this won't have an impact
@@ -1614,11 +1628,13 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
     // temporary store the nocapture argument's value in a temporary memory
     // location if that memory location doesn't escape. Or it may pass a
     // nocapture value to other functions as long as they don't capture it.
-    if (isEscapeSource(O1) && AAQI.CA->isNotCapturedBefore(
-                                  O2, dyn_cast<Instruction>(O1), /*OrAt*/ true))
+    if (isEscapeSource(O1) &&
+        capturesNothing(AAQI.CA->getCapturesBefore(
+            O2, dyn_cast<Instruction>(O1), /*OrAt*/ true)))
       return AliasResult::NoAlias;
-    if (isEscapeSource(O2) && AAQI.CA->isNotCapturedBefore(
-                                  O1, dyn_cast<Instruction>(O2), /*OrAt*/ true))
+    if (isEscapeSource(O2) &&
+        capturesNothing(AAQI.CA->getCapturesBefore(
+            O1, dyn_cast<Instruction>(O2), /*OrAt*/ true)))
       return AliasResult::NoAlias;
   }
 
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
index d08ed17a655e..076f4176c021 100644
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -249,11 +249,10 @@ bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
       capturesAnything, LI, MaxUsesToExplore));
 }
 
-Instruction *llvm::FindEarliestCapture(const Value *V, Function &F,
-                                       bool ReturnCaptures,
-                                       const DominatorTree &DT,
-                                       CaptureComponents Mask,
-                                       unsigned MaxUsesToExplore) {
+std::pair<Instruction *, CaptureComponents>
+llvm::FindEarliestCapture(const Value *V, Function &F, bool ReturnCaptures,
+                          const DominatorTree &DT, CaptureComponents Mask,
+                          unsigned MaxUsesToExplore) {
   assert(!isa<GlobalValue>(V) &&
          "It doesn't make sense to ask whether a global is captured.");
 
@@ -263,7 +262,7 @@ Instruction *llvm::FindEarliestCapture(const Value *V, Function &F,
     ++NumCapturedBefore;
   else
     ++NumNotCapturedBefore;
-  return CB.EarliestCapture;
+  return {CB.EarliestCapture, CB.CC};
 }
 
 UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) {
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 49a0c88922c3..4a2eb9284a6e 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -2345,7 +2345,8 @@ bool isFuncLocalAndNotCaptured(Value *Arg, const CallBase *CB,
                                EarliestEscapeAnalysis &EA) {
   const Value *UnderlyingObj = getUnderlyingObject(Arg);
   return isIdentifiedFunctionLocal(UnderlyingObj) &&
-         EA.isNotCapturedBefore(UnderlyingObj, CB, /*OrAt*/ true);
+         capturesNothing(
+             EA.getCapturesBefore(UnderlyingObj, CB, /*OrAt*/ true));
 }
 
 SmallVector<MemoryLocation, 1>
diff --git a/llvm/test/Analysis/BasicAA/captures.ll b/llvm/test/Analysis/BasicAA/captures.ll
index c212a466f8ed..c9ed1ea74be8 100644
--- a/llvm/test/Analysis/BasicAA/captures.ll
+++ b/llvm/test/Analysis/BasicAA/captures.ll
@@ -17,8 +17,7 @@ define void @address_capture() {
 
 ; CHECK-LABEL: read_only_capture
 ; CHECK: MayAlias:	i32* %a, i32* %p
-; CHECK: Both ModRef:  Ptr: i32* %a	<->  %p = call ptr @get_ptr()
-; TODO: The ModRef could be just Ref.
+; CHECK: Just Ref:  Ptr: i32* %a	<->  %p = call ptr @get_ptr()
 define void @read_only_capture() {
   %a = alloca i32
   call void @capture(ptr captures(address, read_provenance) %a)
diff --git a/llvm/test/Transforms/GVN/captures.ll b/llvm/test/Transforms/GVN/captures.ll
index ae47e92da0f2..96fce438356c 100644
--- a/llvm/test/Transforms/GVN/captures.ll
+++ b/llvm/test/Transforms/GVN/captures.ll
@@ -43,8 +43,7 @@ define i32 @read_provenance_capture() {
 ; CHECK-NEXT:    call void @capture(ptr captures(address, read_provenance) [[A]])
 ; CHECK-NEXT:    store i32 1, ptr [[A]], align 4
 ; CHECK-NEXT:    call void @unknown_call()
-; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT:    ret i32 [[V]]
+; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca i32
   call void @capture(ptr captures(address, read_provenance) %a)

From 3550662c040024597485d1bfac0d733340514ae1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 12 Jun 2025 21:17:48 +0900
Subject: [PATCH 0128/1322] ARM: Avoid using getTargetLowering in
 TargetLowering (#143833)

This is this.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 37 ++++++++++---------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d2e910a248f2..f17eb72bb2e2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2688,8 +2688,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         unsigned RegBegin, RegEnd;
         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
 
-        EVT PtrVT =
-            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+        EVT PtrVT = getPointerTy(DAG.getDataLayout());
         unsigned int i, j;
         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
@@ -5024,7 +5023,7 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
 SDValue
 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
   // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+  if (!isTypeLegal(Op.getValueType()))
     return SDValue();
 
   SDValue Value, OverflowCmp;
@@ -5070,7 +5069,7 @@ static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                              SelectionDAG &DAG) const {
   // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+  if (!isTypeLegal(Op.getValueType()))
     return SDValue();
 
   SDValue LHS = Op.getOperand(0);
@@ -5168,7 +5167,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   if (Cond.getResNo() == 1 &&
       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
        Opc == ISD::USUBO)) {
-    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+    if (!isTypeLegal(Cond->getValueType(0)))
       return SDValue();
 
     SDValue Value, OverflowCmp;
@@ -5530,8 +5529,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (isUnsupportedFloatingType(LHS.getValueType())) {
-    DAG.getTargetLoweringInfo().softenSetCCOperands(
-        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
+    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
 
     // If softenSetCCOperands only returned one value, we should compare it to
     // zero.
@@ -5736,7 +5734,7 @@ SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
        Opc == ISD::USUBO || OptimizeMul)) {
     // Only lower legal XALUO ops.
-    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+    if (!isTypeLegal(Cond->getValueType(0)))
       return SDValue();
 
     // The actual operation with overflow check.
@@ -5766,8 +5764,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   if (isUnsupportedFloatingType(LHS.getValueType())) {
-    DAG.getTargetLoweringInfo().softenSetCCOperands(
-        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
+    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
 
     // If softenSetCCOperands only returned one value, we should compare it to
     // zero.
@@ -5787,7 +5784,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
        Opc == ISD::USUBO || OptimizeMul) &&
       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
     // Only lower legal XALUO ops.
-    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+    if (!isTypeLegal(LHS->getValueType(0)))
       return SDValue();
 
     // The actual operation with overflow check.
@@ -6255,7 +6252,6 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
 /// vectors), since the legalizer won't know what to do with that.
 SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                                          const ARMSubtarget *Subtarget) const {
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc dl(N);
   SDValue Op = N->getOperand(0);
 
@@ -6282,7 +6278,7 @@ SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // Turn i64->f64 into VMOVDRR.
-  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
+  if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
     // if we can combine the bitcast with its source.
     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
@@ -6294,7 +6290,7 @@ SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
   }
 
   // Turn f64->i64 into VMOVRRD.
-  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
+  if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
     SDValue Cvt;
     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
         SrcVT.getVectorNumElements() > 1)
@@ -9931,7 +9927,6 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // Pair of floats / doubles used to pass the result.
   Type *RetTy = StructType::get(ArgTy, ArgTy);
@@ -9945,7 +9940,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
     const Align StackAlign = DL.getPrefTypeAlign(RetTy);
     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
-    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
+    SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
 
     ArgListEntry Entry;
     Entry.Node = SRet;
@@ -10003,7 +9998,6 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
   SDLoc dl(Op);
 
   const auto &DL = DAG.getDataLayout();
-  const auto &TLI = DAG.getTargetLoweringInfo();
 
   const char *Name = nullptr;
   if (Signed)
@@ -10011,7 +10005,7 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
   else
     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
 
-  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
+  SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
 
   ARMTargetLowering::ArgListTy Args;
 
@@ -10101,7 +10095,6 @@ void ARMTargetLowering::ExpandDIV_Windows(
     SDValue Op, SelectionDAG &DAG, bool Signed,
     SmallVectorImpl<SDValue> &Results) const {
   const auto &DL = DAG.getDataLayout();
-  const auto &TLI = DAG.getTargetLoweringInfo();
 
   assert(Op.getValueType() == MVT::i64 &&
          "unexpected type for custom lowering DIV");
@@ -10113,7 +10106,7 @@ void ARMTargetLowering::ExpandDIV_Windows(
 
   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
-                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
+                              DAG.getConstant(32, dl, getPointerTy(DL)));
   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
 
   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
@@ -10525,8 +10518,8 @@ SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
   // If we don't have instructions of this float type then soften to a libcall
   // and use SETCC instead.
   if (isUnsupportedFloatingType(LHS.getValueType())) {
-    DAG.getTargetLoweringInfo().softenSetCCOperands(
-      DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
+    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
+                        Chain, IsSignaling);
     if (!RHS.getNode()) {
       RHS = DAG.getConstant(0, dl, LHS.getValueType());
       CC = ISD::SETNE;

From 633375a29f52504b0b23a30bb767de521dd3e2a8 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 12 Jun 2025 13:20:36 +0100
Subject: [PATCH 0129/1322] [llvm][DWARFLinker] Fix gcc 13 -Wuninitialized
 warnings (#143867)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A bit awkward that we have to switch from public to protected and back
again, but it seemed neater than putting OS all the way down at the
bottom. Since it is a public member that you're more likely to be
looking for.

llvm-project/llvm/lib/DWARFLinker/Parallel/OutputSections.h:157:67:
warning: member
‘llvm::dwarf_linker::parallel::SectionDescriptor::Contents’ is used
uninitialized [-Wuninitialized]

Which refers to the use in the constructor:
```
  SectionDescriptor(DebugSectionKind SectionKind, LinkingGlobalData &GlobalData,
                    dwarf::FormParams Format, llvm::endianness Endianess)
      : SectionDescriptorBase(SectionKind, Format, Endianess), OS(Contents),
```
Where Contents is passed to `OS`, before Contents has been constructed.
---
 llvm/lib/DWARFLinker/Parallel/OutputSections.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/DWARFLinker/Parallel/OutputSections.h b/llvm/lib/DWARFLinker/Parallel/OutputSections.h
index da47f53b6c3d..5043e918013e 100644
--- a/llvm/lib/DWARFLinker/Parallel/OutputSections.h
+++ b/llvm/lib/DWARFLinker/Parallel/OutputSections.h
@@ -181,6 +181,11 @@ struct SectionDescriptor : SectionDescriptorBase {
   /// to the debug section, corresponding to this object.
   uint64_t StartOffset = 0;
 
+protected:
+  /// Section data bits.
+  OutSectionDataTy Contents;
+
+public:
   /// Stream which stores data to the Contents.
   raw_svector_ostream OS;
 
@@ -287,9 +292,6 @@ protected:
 
   LinkingGlobalData &GlobalData;
 
-  /// Section data bits.
-  OutSectionDataTy Contents;
-
   /// Some sections are generated using AsmPrinter. The real section data
   /// located inside elf file in that case. Following fields points to the
   /// real section content inside elf file.

From aac603c47800bf2e167b53ddfd3bb10be292bc53 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 12 Jun 2025 21:20:45 +0900
Subject: [PATCH 0130/1322] ARM: Avoid repeating hardcoded windows division
 libcall names (#143834)

This is properly set in the runtime libcall info, so query
the name.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index f17eb72bb2e2..5b3664c4e961 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -9998,13 +9998,13 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
   SDLoc dl(Op);
 
   const auto &DL = DAG.getDataLayout();
-
-  const char *Name = nullptr;
+  RTLIB::Libcall LC;
   if (Signed)
-    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
+    LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
   else
-    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
+    LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
 
+  const char *Name = getLibcallName(LC);
   SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
 
   ARMTargetLowering::ArgListTy Args;

From b9793118423f928b8dcda933aa581f3904ae2b68 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 12 Jun 2025 14:21:29 +0200
Subject: [PATCH 0131/1322] [libc++] Remove allocator support from
 std::function (#140395)

The allocator support was removed in P0302R1, since it was impossible to
implement. We're currently providing the API for this, but ignore the
allocator in all cases but one (which is almost certainly an oversight).
That case is the `function(allocator_arg_t, const Alloc&, Func)`
constuctor. IMO we should remove the API entirely at a later date, but
this only removes most of the code for now, leaving only the public
functions. This not only simplifies the code quite a bit, but also
results in the constructor being instantiated ~8x faster.

Fixes #133901
---
 libcxx/docs/ReleaseNotes/21.rst        |   5 +
 libcxx/include/__functional/function.h | 238 ++++---------------------
 2 files changed, 39 insertions(+), 204 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 6cbc0baf2948..2a5b90750eaf 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -99,6 +99,11 @@ Potentially breaking changes
 
 - User-defined specializations of ``std::common_reference`` are diagnosed now. To customize the common reference type, ``std::basic_common_reference`` should be specialized instead.
 
+- ``std::function`` used to have allocator support, which was removed from the Standard by `http://wg21.link/p0302r1`
+  due to issues with its design and inconsistent support from implementations. Previously, libc++ would provide
+  allocator-aware APIs in ``std::function`` in C++11 and C++14, but ignores the allocator argument in all places but
+  one. Starting in this release, the allocator argument is always ignored.
+
 Announcements About Future Releases
 -----------------------------------
 
diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index 733f321925a4..e71c778386fd 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -17,13 +17,7 @@
 #include <__functional/binary_function.h>
 #include <__functional/invoke.h>
 #include <__functional/unary_function.h>
-#include <__iterator/iterator_traits.h>
 #include <__memory/addressof.h>
-#include <__memory/allocator.h>
-#include <__memory/allocator_destructor.h>
-#include <__memory/allocator_traits.h>
-#include <__memory/compressed_pair.h>
-#include <__memory/unique_ptr.h>
 #include <__type_traits/aligned_storage.h>
 #include <__type_traits/decay.h>
 #include <__type_traits/is_core_convertible.h>
@@ -34,9 +28,7 @@
 #include <__type_traits/strip_signature.h>
 #include <__utility/forward.h>
 #include <__utility/move.h>
-#include <__utility/piecewise_construct.h>
 #include <__utility/swap.h>
-#include <__verbose_abort>
 #include <tuple>
 #include <typeinfo>
 
@@ -133,71 +125,9 @@ _LIBCPP_HIDE_FROM_ABI bool __not_null(_Rp (^__p)(_Args...)) {
 
 namespace __function {
 
-// __alloc_func holds a functor and an allocator.
-
-template <class _Fp, class _Ap, class _FB>
-class __alloc_func;
 template <class _Fp, class _FB>
 class __default_alloc_func;
 
-template <class _Fp, class _Ap, class _Rp, class... _ArgTypes>
-class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> {
-  _LIBCPP_COMPRESSED_PAIR(_Fp, __func_, _Ap, __alloc_);
-
-public:
-  using _Target _LIBCPP_NODEBUG = _Fp;
-  using _Alloc _LIBCPP_NODEBUG  = _Ap;
-
-  _LIBCPP_HIDE_FROM_ABI const _Target& __target() const { return __func_; }
-
-  // WIN32 APIs may define __allocator, so use __get_allocator instead.
-  _LIBCPP_HIDE_FROM_ABI const _Alloc& __get_allocator() const { return __alloc_; }
-
-  _LIBCPP_HIDE_FROM_ABI explicit __alloc_func(_Target&& __f) : __func_(std::move(__f)), __alloc_() {}
-
-  _LIBCPP_HIDE_FROM_ABI explicit __alloc_func(const _Target& __f, const _Alloc& __a) : __func_(__f), __alloc_(__a) {}
-
-  _LIBCPP_HIDE_FROM_ABI explicit __alloc_func(const _Target& __f, _Alloc&& __a)
-      : __func_(__f), __alloc_(std::move(__a)) {}
-
-  _LIBCPP_HIDE_FROM_ABI explicit __alloc_func(_Target&& __f, _Alloc&& __a)
-      : __func_(std::move(__f)), __alloc_(std::move(__a)) {}
-
-  _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... __arg) {
-    return std::__invoke_r<_Rp>(__func_, std::forward<_ArgTypes>(__arg)...);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI __alloc_func* __clone() const {
-    typedef allocator_traits<_Alloc> __alloc_traits;
-    typedef __rebind_alloc<__alloc_traits, __alloc_func> _AA;
-    _AA __a(__alloc_);
-    typedef __allocator_destructor<_AA> _Dp;
-    unique_ptr<__alloc_func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1));
-    ::new ((void*)__hold.get()) __alloc_func(__func_, _Alloc(__a));
-    return __hold.release();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI void destroy() _NOEXCEPT {
-    __func_.~_Fp();
-    __alloc_.~_Alloc();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI static void __destroy_and_delete(__alloc_func* __f) {
-    typedef allocator_traits<_Alloc> __alloc_traits;
-    typedef __rebind_alloc<__alloc_traits, __alloc_func> _FunAlloc;
-    _FunAlloc __a(__f->__get_allocator());
-    __f->destroy();
-    __a.deallocate(__f, 1);
-  }
-};
-
-template <class _Tp>
-struct __deallocating_deleter {
-  _LIBCPP_HIDE_FROM_ABI void operator()(void* __p) const {
-    std::__libcpp_deallocate<_Tp>(static_cast<_Tp*>(__p), __element_count(1));
-  }
-};
-
 template <class _Fp, class _Rp, class... _ArgTypes>
 class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> {
   _Fp __f_;
@@ -215,20 +145,9 @@ public:
     return std::__invoke_r<_Rp>(__f_, std::forward<_ArgTypes>(__arg)...);
   }
 
-  _LIBCPP_HIDE_FROM_ABI __default_alloc_func* __clone() const {
-    using _Self = __default_alloc_func;
-    unique_ptr<_Self, __deallocating_deleter<_Self>> __hold(std::__libcpp_allocate<_Self>(__element_count(1)));
-    _Self* __res = ::new ((void*)__hold.get()) _Self(__f_);
-    (void)__hold.release();
-    return __res;
-  }
+  _LIBCPP_HIDE_FROM_ABI __default_alloc_func* __clone() const { return new __default_alloc_func(__f_); }
 
   _LIBCPP_HIDE_FROM_ABI void destroy() _NOEXCEPT { __f_.~_Target(); }
-
-  _LIBCPP_HIDE_FROM_ABI static void __destroy_and_delete(__default_alloc_func* __f) {
-    __f->destroy();
-    std::__libcpp_deallocate<__default_alloc_func>(__f, __element_count(1));
-  }
 };
 
 // __base provides an abstract interface for copyable functors.
@@ -257,84 +176,38 @@ public:
 
 // __func implements __base for a given functor type.
 
-template <class _FD, class _Alloc, class _FB>
+template <class _FD, class _FB>
 class __func;
 
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-class __func<_Fp, _Alloc, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> {
-  __alloc_func<_Fp, _Alloc, _Rp(_ArgTypes...)> __f_;
+template <class _Fp, class _Rp, class... _ArgTypes>
+class __func<_Fp, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> {
+  _Fp __func_;
 
 public:
-  _LIBCPP_HIDE_FROM_ABI explicit __func(_Fp&& __f) : __f_(std::move(__f)) {}
+  _LIBCPP_HIDE_FROM_ABI explicit __func(_Fp&& __f) : __func_(std::move(__f)) {}
+  _LIBCPP_HIDE_FROM_ABI explicit __func(const _Fp& __f) : __func_(__f) {}
 
-  _LIBCPP_HIDE_FROM_ABI explicit __func(const _Fp& __f, const _Alloc& __a) : __f_(__f, __a) {}
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL __base<_Rp(_ArgTypes...)>* __clone() const override { return new __func(__func_); }
 
-  _LIBCPP_HIDE_FROM_ABI explicit __func(const _Fp& __f, _Alloc&& __a) : __f_(__f, std::move(__a)) {}
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL void __clone(__base<_Rp(_ArgTypes...)>* __p) const override {
+    ::new ((void*)__p) __func(__func_);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI explicit __func(_Fp&& __f, _Alloc&& __a) : __f_(std::move(__f), std::move(__a)) {}
-
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual __base<_Rp(_ArgTypes...)>* __clone() const;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void __clone(__base<_Rp(_ArgTypes...)>*) const;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void destroy() _NOEXCEPT;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void destroy_deallocate() _NOEXCEPT;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual _Rp operator()(_ArgTypes&&... __arg);
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL void destroy() _NOEXCEPT override { __func_.~_Fp(); }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL void destroy_deallocate() _NOEXCEPT override { delete this; }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL _Rp operator()(_ArgTypes&&... __arg) override {
+    return std::__invoke_r<_Rp>(__func_, std::forward<_ArgTypes>(__arg)...);
+  }
 #  if _LIBCPP_HAS_RTTI
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual const void* target(const type_info&) const _NOEXCEPT;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual const std::type_info& target_type() const _NOEXCEPT;
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL const void* target(const type_info& __ti) const _NOEXCEPT override {
+    if (__ti == typeid(_Fp))
+      return std::addressof(__func_);
+    return nullptr;
+  }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL const std::type_info& target_type() const _NOEXCEPT override { return typeid(_Fp); }
 #  endif // _LIBCPP_HAS_RTTI
 };
 
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-__base<_Rp(_ArgTypes...)>* __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::__clone() const {
-  typedef allocator_traits<_Alloc> __alloc_traits;
-  typedef __rebind_alloc<__alloc_traits, __func> _Ap;
-  _Ap __a(__f_.__get_allocator());
-  typedef __allocator_destructor<_Ap> _Dp;
-  unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1));
-  ::new ((void*)__hold.get()) __func(__f_.__target(), _Alloc(__a));
-  return __hold.release();
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::__clone(__base<_Rp(_ArgTypes...)>* __p) const {
-  ::new ((void*)__p) __func(__f_.__target(), __f_.__get_allocator());
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::destroy() _NOEXCEPT {
-  __f_.destroy();
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::destroy_deallocate() _NOEXCEPT {
-  typedef allocator_traits<_Alloc> __alloc_traits;
-  typedef __rebind_alloc<__alloc_traits, __func> _Ap;
-  _Ap __a(__f_.__get_allocator());
-  __f_.destroy();
-  __a.deallocate(this, 1);
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-_Rp __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::operator()(_ArgTypes&&... __arg) {
-  return __f_(std::forward<_ArgTypes>(__arg)...);
-}
-
-#  if _LIBCPP_HAS_RTTI
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-const void* __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::target(const type_info& __ti) const _NOEXCEPT {
-  if (__ti == typeid(_Fp))
-    return std::addressof(__f_.__target());
-  return nullptr;
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-const std::type_info& __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::target_type() const _NOEXCEPT {
-  return typeid(_Fp);
-}
-
-#  endif // _LIBCPP_HAS_RTTI
-
 // __value_func creates a value-type from a __func.
 
 template <class _Fp>
@@ -354,29 +227,19 @@ class __value_func<_Rp(_ArgTypes...)> {
 public:
   _LIBCPP_HIDE_FROM_ABI __value_func() _NOEXCEPT : __f_(nullptr) {}
 
-  template <class _Fp, class _Alloc>
-  _LIBCPP_HIDE_FROM_ABI __value_func(_Fp&& __f, const _Alloc& __a) : __f_(nullptr) {
-    typedef allocator_traits<_Alloc> __alloc_traits;
-    typedef __function::__func<_Fp, _Alloc, _Rp(_ArgTypes...)> _Fun;
-    typedef __rebind_alloc<__alloc_traits, _Fun> _FunAlloc;
+  template <class _Fp, __enable_if_t<!is_same<__decay_t<_Fp>, __value_func>::value, int> = 0>
+  _LIBCPP_HIDE_FROM_ABI explicit __value_func(_Fp&& __f) : __f_(nullptr) {
+    typedef __function::__func<_Fp, _Rp(_ArgTypes...)> _Fun;
 
     if (__function::__not_null(__f)) {
-      _FunAlloc __af(__a);
-      if (sizeof(_Fun) <= sizeof(__buf_) && is_nothrow_copy_constructible<_Fp>::value &&
-          is_nothrow_copy_constructible<_FunAlloc>::value) {
-        __f_ = ::new ((void*)&__buf_) _Fun(std::move(__f), _Alloc(__af));
+      if (sizeof(_Fun) <= sizeof(__buf_) && is_nothrow_copy_constructible<_Fp>::value) {
+        __f_ = ::new (std::addressof(__buf_)) _Fun(std::move(__f));
       } else {
-        typedef __allocator_destructor<_FunAlloc> _Dp;
-        unique_ptr<__func, _Dp> __hold(__af.allocate(1), _Dp(__af, 1));
-        ::new ((void*)__hold.get()) _Fun(std::move(__f), _Alloc(__a));
-        __f_ = __hold.release();
+        __f_ = new _Fun(std::move(__f));
       }
     }
   }
 
-  template <class _Fp, __enable_if_t<!is_same<__decay_t<_Fp>, __value_func>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI explicit __value_func(_Fp&& __f) : __value_func(std::forward<_Fp>(__f), allocator<_Fp>()) {}
-
   _LIBCPP_HIDE_FROM_ABI __value_func(const __value_func& __f) {
     if (__f.__f_ == nullptr)
       __f_ = nullptr;
@@ -544,7 +407,7 @@ private:
 
   template <typename _Fun>
   _LIBCPP_HIDE_FROM_ABI static void __large_destroy(void* __s) {
-    _Fun::__destroy_and_delete(static_cast<_Fun*>(__s));
+    delete static_cast<_Fun*>(__s);
   }
 
   template <typename _Fun>
@@ -583,7 +446,7 @@ private:
 template <typename _Tp>
 using __fast_forward _LIBCPP_NODEBUG = __conditional_t<is_scalar<_Tp>::value, _Tp, _Tp&&>;
 
-// __policy_invoker calls an instance of __alloc_func held in __policy_storage.
+// __policy_invoker calls an instance of __default_alloc_func held in __policy_storage.
 
 template <class _Fp>
 struct __policy_invoker;
@@ -641,28 +504,6 @@ class __policy_func<_Rp(_ArgTypes...)> {
 public:
   _LIBCPP_HIDE_FROM_ABI __policy_func() : __policy_(__policy::__create_empty()) {}
 
-  template <class _Fp, class _Alloc>
-  _LIBCPP_HIDE_FROM_ABI __policy_func(_Fp&& __f, const _Alloc& __a) : __policy_(__policy::__create_empty()) {
-    typedef __alloc_func<_Fp, _Alloc, _Rp(_ArgTypes...)> _Fun;
-    typedef allocator_traits<_Alloc> __alloc_traits;
-    typedef __rebind_alloc<__alloc_traits, _Fun> _FunAlloc;
-
-    if (__function::__not_null(__f)) {
-      __invoker_ = __invoker::template __create<_Fun>();
-      __policy_  = __policy::__create<_Fun>();
-
-      _FunAlloc __af(__a);
-      if (__use_small_storage<_Fun>()) {
-        ::new ((void*)&__buf_.__small) _Fun(std::move(__f), _Alloc(__af));
-      } else {
-        typedef __allocator_destructor<_FunAlloc> _Dp;
-        unique_ptr<_Fun, _Dp> __hold(__af.allocate(1), _Dp(__af, 1));
-        ::new ((void*)__hold.get()) _Fun(std::move(__f), _Alloc(__af));
-        __buf_.__large = __hold.release();
-      }
-    }
-  }
-
   template <class _Fp, __enable_if_t<!is_same<__decay_t<_Fp>, __policy_func>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI explicit __policy_func(_Fp&& __f) : __policy_(__policy::__create_empty()) {
     typedef __default_alloc_func<_Fp, _Rp(_ArgTypes...)> _Fun;
@@ -673,9 +514,7 @@ public:
       if (__use_small_storage<_Fun>()) {
         ::new ((void*)&__buf_.__small) _Fun(std::move(__f));
       } else {
-        unique_ptr<_Fun, __deallocating_deleter<_Fun>> __hold(std::__libcpp_allocate<_Fun>(__element_count(1)));
-        __buf_.__large = ::new ((void*)__hold.get()) _Fun(std::move(__f));
-        (void)__hold.release();
+        __buf_.__large = ::new _Fun(std::move(__f));
       }
     }
   }
@@ -750,8 +589,8 @@ public:
 extern "C" void* _Block_copy(const void*);
 extern "C" void _Block_release(const void*);
 
-template <class _Rp1, class... _ArgTypes1, class _Alloc, class _Rp, class... _ArgTypes>
-class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> {
+template <class _Rp1, class... _ArgTypes1, class _Rp, class... _ArgTypes>
+class __func<_Rp1 (^)(_ArgTypes1...), _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> {
   typedef _Rp1 (^__block_type)(_ArgTypes1...);
   __block_type __f_;
 
@@ -767,15 +606,6 @@ public:
 
   // [TODO] add && to save on a retain
 
-  _LIBCPP_HIDE_FROM_ABI explicit __func(__block_type __f, const _Alloc& /* unused */)
-#    if __has_feature(objc_arc)
-      : __f_(__f)
-#    else
-      : __f_(reinterpret_cast<__block_type>(__f ? _Block_copy(__f) : nullptr))
-#    endif
-  {
-  }
-
   _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual __base<_Rp(_ArgTypes...)>* __clone() const {
     _LIBCPP_ASSERT_INTERNAL(
         false,
@@ -954,7 +784,7 @@ function<_Rp(_ArgTypes...)>::function(_Fp __f) : __f_(std::move(__f)) {}
 #  if _LIBCPP_STD_VER <= 14
 template <class _Rp, class... _ArgTypes>
 template <class _Fp, class _Alloc, class>
-function<_Rp(_ArgTypes...)>::function(allocator_arg_t, const _Alloc& __a, _Fp __f) : __f_(std::move(__f), __a) {}
+function<_Rp(_ArgTypes...)>::function(allocator_arg_t, const _Alloc&, _Fp __f) : __f_(std::move(__f)) {}
 #  endif
 
 template <class _Rp, class... _ArgTypes>

From 5aed4800f33a72c778f3b49f6389fff099ff4ff6 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim@gymni.ch>
Date: Thu, 12 Jun 2025 14:43:40 +0200
Subject: [PATCH 0132/1322] [GISel] KnownFPClass ValueTracking fix handling of
 vectors (#143372)

---
 .../CodeGen/GlobalISel/GISelValueTracking.cpp | 57 ++++++-----
 .../CodeGen/GlobalISel/KnownFPClassTest.cpp   | 98 +++++++++++++++++++
 2 files changed, 129 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 6650ad25bed0..1286af864fb3 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -1046,7 +1046,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
     //
     if ((Known.KnownFPClasses & fcZero) != fcNone &&
         !Known.isKnownNeverSubnormal()) {
-      DenormalMode Mode = MF->getDenormalMode(getFltSemanticForLLT(DstTy));
+      DenormalMode Mode =
+          MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType()));
       if (Mode != DenormalMode::getIEEE())
         Known.KnownFPClasses |= fcZero;
     }
@@ -1108,8 +1109,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
 
     // If the parent function flushes denormals, the canonical output cannot
     // be a denormal.
-    LLT Ty = MRI.getType(Val);
-    const fltSemantics &FPType = getFltSemanticForLLT(Ty.getScalarType());
+    LLT Ty = MRI.getType(Val).getScalarType();
+    const fltSemantics &FPType = getFltSemanticForLLT(Ty);
     DenormalMode DenormMode = MF->getDenormalMode(FPType);
     if (DenormMode == DenormalMode::getIEEE()) {
       if (KnownSrc.isKnownNever(fcPosZero))
@@ -1219,8 +1220,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
     if (KnownSrc.isKnownNeverNaN() && KnownSrc.cannotBeOrderedLessThanZero())
       Known.knownNot(fcNan);
 
-    LLT Ty = MRI.getType(Val);
-    const fltSemantics &FltSem = getFltSemanticForLLT(Ty.getScalarType());
+    LLT Ty = MRI.getType(Val).getScalarType();
+    const fltSemantics &FltSem = getFltSemanticForLLT(Ty);
     DenormalMode Mode = MF->getDenormalMode(FltSem);
 
     if (KnownSrc.isKnownNeverLogicalZero(Mode))
@@ -1338,19 +1339,19 @@ void GISelValueTracking::computeKnownFPClass(Register R,
           Known.knownNot(KnownFPClass::OrderedLessThanZeroMask);
 
         // (fadd x, 0.0) is guaranteed to return +0.0, not -0.0.
-        if ((KnownLHS.isKnownNeverLogicalNegZero(
-                 MF->getDenormalMode(getFltSemanticForLLT(DstTy))) ||
-             KnownRHS.isKnownNeverLogicalNegZero(
-                 MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) &&
+        if ((KnownLHS.isKnownNeverLogicalNegZero(MF->getDenormalMode(
+                 getFltSemanticForLLT(DstTy.getScalarType()))) ||
+             KnownRHS.isKnownNeverLogicalNegZero(MF->getDenormalMode(
+                 getFltSemanticForLLT(DstTy.getScalarType())))) &&
             // Make sure output negative denormal can't flush to -0
             outputDenormalIsIEEEOrPosZero(*MF, DstTy))
           Known.knownNot(fcNegZero);
       } else {
         // Only fsub -0, +0 can return -0
-        if ((KnownLHS.isKnownNeverLogicalNegZero(
-                 MF->getDenormalMode(getFltSemanticForLLT(DstTy))) ||
-             KnownRHS.isKnownNeverLogicalPosZero(
-                 MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) &&
+        if ((KnownLHS.isKnownNeverLogicalNegZero(MF->getDenormalMode(
+                 getFltSemanticForLLT(DstTy.getScalarType()))) ||
+             KnownRHS.isKnownNeverLogicalPosZero(MF->getDenormalMode(
+                 getFltSemanticForLLT(DstTy.getScalarType())))) &&
             // Make sure output negative denormal can't flush to -0
             outputDenormalIsIEEEOrPosZero(*MF, DstTy))
           Known.knownNot(fcNegZero);
@@ -1396,11 +1397,11 @@ void GISelValueTracking::computeKnownFPClass(Register R,
     }
 
     if ((KnownRHS.isKnownNeverInfinity() ||
-         KnownLHS.isKnownNeverLogicalZero(
-             MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) &&
+         KnownLHS.isKnownNeverLogicalZero(MF->getDenormalMode(
+             getFltSemanticForLLT(DstTy.getScalarType())))) &&
         (KnownLHS.isKnownNeverInfinity() ||
          KnownRHS.isKnownNeverLogicalZero(
-             MF->getDenormalMode(getFltSemanticForLLT(DstTy)))))
+             MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))))
       Known.knownNot(fcNan);
 
     break;
@@ -1452,10 +1453,10 @@ void GISelValueTracking::computeKnownFPClass(Register R,
       if (KnownLHS.isKnownNeverNaN() && KnownRHS.isKnownNeverNaN() &&
           (KnownLHS.isKnownNeverInfinity() ||
            KnownRHS.isKnownNeverInfinity()) &&
-          ((KnownLHS.isKnownNeverLogicalZero(
-               MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) ||
-           (KnownRHS.isKnownNeverLogicalZero(
-               MF->getDenormalMode(getFltSemanticForLLT(DstTy)))))) {
+          ((KnownLHS.isKnownNeverLogicalZero(MF->getDenormalMode(
+               getFltSemanticForLLT(DstTy.getScalarType())))) ||
+           (KnownRHS.isKnownNeverLogicalZero(MF->getDenormalMode(
+               getFltSemanticForLLT(DstTy.getScalarType())))))) {
         Known.knownNot(fcNan);
       }
 
@@ -1468,8 +1469,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
       // Inf REM x and x REM 0 produce NaN.
       if (KnownLHS.isKnownNeverNaN() && KnownRHS.isKnownNeverNaN() &&
           KnownLHS.isKnownNeverInfinity() &&
-          KnownRHS.isKnownNeverLogicalZero(
-              MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) {
+          KnownRHS.isKnownNeverLogicalZero(MF->getDenormalMode(
+              getFltSemanticForLLT(DstTy.getScalarType())))) {
         Known.knownNot(fcNan);
       }
 
@@ -1494,10 +1495,10 @@ void GISelValueTracking::computeKnownFPClass(Register R,
     // Infinity, nan and zero propagate from source.
     computeKnownFPClass(R, DemandedElts, InterestedClasses, Known, Depth + 1);
 
-    LLT DstTy = MRI.getType(Dst);
-    const fltSemantics &DstSem = getFltSemanticForLLT(DstTy.getScalarType());
-    LLT SrcTy = MRI.getType(Src);
-    const fltSemantics &SrcSem = getFltSemanticForLLT(SrcTy.getScalarType());
+    LLT DstTy = MRI.getType(Dst).getScalarType();
+    const fltSemantics &DstSem = getFltSemanticForLLT(DstTy);
+    LLT SrcTy = MRI.getType(Src).getScalarType();
+    const fltSemantics &SrcSem = getFltSemanticForLLT(SrcTy);
 
     // All subnormal inputs should be in the normal range in the result type.
     if (APFloat::isRepresentableAsNormalIn(SrcSem, DstSem)) {
@@ -1690,6 +1691,10 @@ void GISelValueTracking::computeKnownFPClass(Register R,
   }
   case TargetOpcode::COPY: {
     Register Src = MI.getOperand(1).getReg();
+
+    if (!Src.isVirtual())
+      return;
+
     computeKnownFPClass(Src, DemandedElts, InterestedClasses, Known, Depth + 1);
     break;
   }
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownFPClassTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownFPClassTest.cpp
index 6ee571804e69..040f0cfc9207 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownFPClassTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownFPClassTest.cpp
@@ -96,6 +96,104 @@ TEST_F(AArch64GISelMITest, TestFPClassCstVecNegZero) {
   EXPECT_EQ(true, Known.SignBit);
 }
 
+TEST_F(AArch64GISelMITest, TestFPClassCstZeroFPExt) {
+  StringRef MIRString = R"(
+   %c0:_(s32) = G_FCONSTANT float 0.0
+   %ext:_(s64) = nnan ninf G_FPEXT %c0
+   %copy_vector:_(s64) = COPY %ext
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  Register CopyReg = Copies[Copies.size() - 1];
+  MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg);
+  Register SrcReg = FinalCopy->getOperand(1).getReg();
+
+  GISelValueTracking Info(*MF);
+
+  KnownFPClass Known = Info.computeKnownFPClass(SrcReg);
+
+  EXPECT_EQ(fcZero | fcNormal, Known.KnownFPClasses);
+  EXPECT_EQ(std::nullopt, Known.SignBit);
+}
+
+TEST_F(AArch64GISelMITest, TestFPClassCstVecZeroFPExt) {
+  StringRef MIRString = R"(
+   %c0:_(s32) = G_FCONSTANT float 0.0
+   %c1:_(s32) = G_FCONSTANT float 0.0
+   %c2:_(s32) = G_FCONSTANT float 0.0
+   %vector:_(<3 x s32>) = G_BUILD_VECTOR %c0, %c1, %c2
+   %ext:_(<3 x s64>) = nnan ninf G_FPEXT %vector
+   %copy_vector:_(<3 x s64>) = COPY %ext
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  Register CopyReg = Copies[Copies.size() - 1];
+  MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg);
+  Register SrcReg = FinalCopy->getOperand(1).getReg();
+
+  GISelValueTracking Info(*MF);
+
+  KnownFPClass Known = Info.computeKnownFPClass(SrcReg);
+
+  EXPECT_EQ(fcZero | fcNormal, Known.KnownFPClasses);
+  EXPECT_EQ(std::nullopt, Known.SignBit);
+}
+
+TEST_F(AArch64GISelMITest, TestFPClassCstZeroFPTrunc) {
+  StringRef MIRString = R"(
+   %c0:_(s64) = G_FCONSTANT double 0.0
+   %trunc:_(s32) = nnan ninf G_FPTRUNC %c0
+   %copy_vector:_(s32) = COPY %trunc
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  Register CopyReg = Copies[Copies.size() - 1];
+  MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg);
+  Register SrcReg = FinalCopy->getOperand(1).getReg();
+
+  GISelValueTracking Info(*MF);
+
+  KnownFPClass Known = Info.computeKnownFPClass(SrcReg);
+
+  EXPECT_EQ(fcZero | fcPosSubnormal | fcPosNormal, Known.KnownFPClasses);
+  EXPECT_EQ(false, Known.SignBit);
+}
+
+TEST_F(AArch64GISelMITest, TestFPClassCstVecZeroFPTrunc) {
+  StringRef MIRString = R"(
+   %c0:_(s64) = G_FCONSTANT double 0.0
+   %c1:_(s64) = G_FCONSTANT double 0.0
+   %c2:_(s64) = G_FCONSTANT double 0.0
+   %vector:_(<3 x s64>) = G_BUILD_VECTOR %c0, %c1, %c2
+   %trunc:_(<3 x s32>) = nnan ninf G_FPTRUNC %vector
+   %copy_vector:_(<3 x s32>) = COPY %trunc
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  Register CopyReg = Copies[Copies.size() - 1];
+  MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg);
+  Register SrcReg = FinalCopy->getOperand(1).getReg();
+
+  GISelValueTracking Info(*MF);
+
+  KnownFPClass Known = Info.computeKnownFPClass(SrcReg);
+
+  EXPECT_EQ(fcZero | fcPosSubnormal | fcPosNormal, Known.KnownFPClasses);
+  EXPECT_EQ(false, Known.SignBit);
+}
+
 TEST_F(AArch64GISelMITest, TestFPClassSelectPos0) {
   StringRef MIRString = R"(
     %ptr:_(p0) = G_IMPLICIT_DEF

From 41c8df147b83026db8612ad2ca07fc0f007e3448 Mon Sep 17 00:00:00 2001
From: woruyu <99597449+woruyu@users.noreply.github.com>
Date: Thu, 12 Jun 2025 20:46:07 +0800
Subject: [PATCH 0133/1322] [DAG] Convert foldMaskedMerge to SDPatternMatch to
 match (m & x) | (~m & y) (#143855)

This PR resolves https://github.com/llvm/llvm-project/issues/143363

Remove foldMaskedMergeImpl entirely to use SDPatternMatch
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 ++++---------------
 1 file changed, 9 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e79a17e86bc8..5d62ded171f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8128,24 +8128,6 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
   return SDValue();
 }
 
-static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1,
-                                   SDValue AndR1, const SDLoc &DL,
-                                   SelectionDAG &DAG) {
-  if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse())
-    return SDValue();
-  SDValue NotOp = AndL0->getOperand(0);
-  if (NotOp == AndR1)
-    std::swap(AndR1, AndL1);
-  if (NotOp != AndL1)
-    return SDValue();
-
-  EVT VT = AndL1.getValueType();
-  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, AndR0);
-  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
-  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, AndR0);
-  return Xor1;
-}
-
 /// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
 /// equivalent `((x ^ y) & m) ^ y)` pattern.
 /// This is typically a better representation for targets without a fused
@@ -8155,29 +8137,20 @@ static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
   // Note that masked-merge variants using XOR or ADD expressions are
   // normalized to OR by InstCombine so we only check for OR.
   assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
-  SDValue N0 = Node->getOperand(0);
-  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
-    return SDValue();
-  SDValue N1 = Node->getOperand(1);
-  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
-    return SDValue();
 
   // If the target supports and-not, don't fold this.
   if (TLI.hasAndNot(SDValue(Node, 0)))
     return SDValue();
 
-  SDValue N00 = N0->getOperand(0);
-  SDValue N01 = N0->getOperand(1);
-  SDValue N10 = N1->getOperand(0);
-  SDValue N11 = N1->getOperand(1);
-  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
-    return Result;
+  SDValue M, X, Y;
+  if (sd_match(Node,
+               m_Or(m_OneUse(m_And(m_OneUse(m_Not(m_Value(M))), m_Value(Y))),
+                    m_OneUse(m_And(m_Deferred(M), m_Value(X)))))) {
+    EVT VT = M.getValueType();
+    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, Y);
+    SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor, M);
+    return DAG.getNode(ISD::XOR, DL, VT, And, Y);
+  }
   return SDValue();
 }
 

From 36ac72f4e3e4752f85c16363d630f4cfbd682e48 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 12 Jun 2025 12:51:11 +0000
Subject: [PATCH 0134/1322] [llvm][MemProf] Fix unused variable warning in
 release build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

g++-13 warned that:
llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp:1645:8: warning: variable ‘PrevIterCreatedNode’ set but not used [-Wunused-but-set-variable]
 1645 |   bool PrevIterCreatedNode = false;
      |        ^~~~~~~~~~~~~~~~~~~

When asserts were not enabled.
---
 llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index f28fe51fb6a5..10120dd0e10c 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1642,7 +1642,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // this entry.
   DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
 
-  bool PrevIterCreatedNode = false;
+  [[maybe_unused]] bool PrevIterCreatedNode = false;
   bool CreatedNode = false;
   for (unsigned I = 0; I < Calls.size();
        I++, PrevIterCreatedNode = CreatedNode) {

From a08a831515919bcc384b453799f33bc97860c73b Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Thu, 12 Jun 2025 15:06:27 +0200
Subject: [PATCH 0135/1322] [DLCov][NFC] Propagate annotated DebugLocs through
 transformations (#138047)

Part of the coverage-tracking feature, following #107279.

In order for DebugLoc coverage testing to work, we firstly have to set
annotations for intentionally-empty DebugLocs, and secondly we have to
ensure that we do not drop these annotations as we propagate DebugLocs
throughout compilation. As the annotations exist as part of the DebugLoc
class, and not the underlying DILocation, they will not survive a
DebugLoc->DILocation->DebugLoc roundtrip. Therefore this patch modifies
a number of places in the compiler to propagate DebugLocs directly
rather than via the underlying DILocation. This has no effect on the
output of normal builds; it only ensures that during coverage builds, we
do not drop incorrectly annotations and therefore create false
positives.

The bulk of these changes are in replacing
DILocation::getMergedLocation(s) with a DebugLoc equivalent, and in
changing the IRBuilder to store a DebugLoc directly rather than storing
DILocations in its general Metadata array. We also use a new function,
`DebugLoc::orElse`, which selects the "best" DebugLoc out of a pair
(valid location > annotated > empty), preferring the current DebugLoc on
a tie - this encapsulates the existing behaviour at a few sites where we
_may_ assign a DebugLoc to an existing instruction, while extending the
logic to handle annotation DebugLocs at the same time.
---
 .../GlobalISel/LegalizationArtifactCombiner.h |  4 +-
 llvm/include/llvm/IR/DebugInfoMetadata.h      | 24 +++++-----
 llvm/include/llvm/IR/DebugLoc.h               | 45 +++++++++++++++++++
 llvm/include/llvm/IR/IRBuilder.h              | 22 +++++++--
 llvm/include/llvm/IR/Instruction.h            |  2 +-
 llvm/lib/CodeGen/BranchFolding.cpp            |  2 +-
 llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp |  5 +--
 llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp  |  2 +-
 llvm/lib/CodeGen/MachineBasicBlock.cpp        |  2 +-
 llvm/lib/CodeGen/MachineSink.cpp              |  4 +-
 llvm/lib/IR/DebugInfo.cpp                     |  4 +-
 llvm/lib/IR/DebugLoc.cpp                      | 21 +++++++++
 llvm/lib/IR/IRBuilder.cpp                     | 17 +++----
 llvm/lib/IR/Instruction.cpp                   |  5 ++-
 .../Target/BPF/BPFPreserveStaticOffset.cpp    | 11 +++--
 .../InstCombineLoadStoreAlloca.cpp            |  4 +-
 .../InstCombine/InstructionCombining.cpp      |  3 +-
 .../Transforms/Scalar/ConstantHoisting.cpp    |  2 +-
 llvm/lib/Transforms/Scalar/LICM.cpp           |  8 ++--
 .../lib/Transforms/Scalar/SimplifyCFGPass.cpp |  4 +-
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |  6 +--
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  2 +-
 22 files changed, 133 insertions(+), 66 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 3712a7fa06d9..22f6a5fde546 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -100,8 +100,8 @@ public:
       const LLT DstTy = MRI.getType(DstReg);
       if (isInstLegal({TargetOpcode::G_CONSTANT, {DstTy}})) {
         auto &CstVal = SrcMI->getOperand(1);
-        auto *MergedLocation = DILocation::getMergedLocation(
-            MI.getDebugLoc().get(), SrcMI->getDebugLoc().get());
+        auto MergedLocation =
+            DebugLoc::getMergedLocation(MI.getDebugLoc(), SrcMI->getDebugLoc());
         // Set the debug location to the merged location of the SrcMI and the MI
         // if the aext fold is successful.
         Builder.setDebugLoc(MergedLocation);
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 02f0a9f677db..18228b775789 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -2437,25 +2437,21 @@ public:
   inline std::optional<const DILocation *>
   cloneByMultiplyingDuplicationFactor(unsigned DF) const;
 
-  /// When two instructions are combined into a single instruction we also
-  /// need to combine the original locations into a single location.
-  /// When the locations are the same we can use either location.
-  /// When they differ, we need a third location which is distinct from either.
-  /// If they share a common scope, use this scope and compare the line/column
-  /// pair of the locations with the common scope:
-  /// * if both match, keep the line and column;
-  /// * if only the line number matches, keep the line and set the column as 0;
-  /// * otherwise set line and column as 0.
-  /// If they do not share a common scope the location is ambiguous and can't be
-  /// represented in a line entry. In this case, set line and column as 0 and
-  /// use the scope of any location.
-  ///
-  /// \p LocA \p LocB: The locations to be merged.
+  /// Attempts to merge \p LocA and \p LocB into a single location; see
+  /// DebugLoc::getMergedLocation for more details.
+  /// NB: When merging the locations of instructions, prefer to use
+  /// DebugLoc::getMergedLocation(), as an instruction's DebugLoc may contain
+  /// additional metadata that will not be preserved when merging the unwrapped
+  /// DILocations.
   LLVM_ABI static DILocation *getMergedLocation(DILocation *LocA,
                                                 DILocation *LocB);
 
   /// Try to combine the vector of locations passed as input in a single one.
   /// This function applies getMergedLocation() repeatedly left-to-right.
+  /// NB: When merging the locations of instructions, prefer to use
+  /// DebugLoc::getMergedLocations(), as an instruction's DebugLoc may contain
+  /// additional metadata that will not be preserved when merging the unwrapped
+  /// DILocations.
   ///
   /// \p Locs: The locations to be merged.
   LLVM_ABI static DILocation *getMergedLocations(ArrayRef<DILocation *> Locs);
diff --git a/llvm/include/llvm/IR/DebugLoc.h b/llvm/include/llvm/IR/DebugLoc.h
index c3d0fb80354a..2fabae9bfc66 100644
--- a/llvm/include/llvm/IR/DebugLoc.h
+++ b/llvm/include/llvm/IR/DebugLoc.h
@@ -142,6 +142,51 @@ namespace llvm {
     static inline DebugLoc getDropped() { return DebugLoc(); }
 #endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 
+    /// When two instructions are combined into a single instruction we also
+    /// need to combine the original locations into a single location.
+    /// When the locations are the same we can use either location.
+    /// When they differ, we need a third location which is distinct from
+    /// either. If they share a common scope, use this scope and compare the
+    /// line/column pair of the locations with the common scope:
+    /// * if both match, keep the line and column;
+    /// * if only the line number matches, keep the line and set the column as
+    /// 0;
+    /// * otherwise set line and column as 0.
+    /// If they do not share a common scope the location is ambiguous and can't
+    /// be represented in a line entry. In this case, set line and column as 0
+    /// and use the scope of any location.
+    ///
+    /// \p LocA \p LocB: The locations to be merged.
+    LLVM_ABI static DebugLoc getMergedLocation(DebugLoc LocA, DebugLoc LocB);
+
+    /// Try to combine the vector of locations passed as input in a single one.
+    /// This function applies getMergedLocation() repeatedly left-to-right.
+    ///
+    /// \p Locs: The locations to be merged.
+    LLVM_ABI static DebugLoc getMergedLocations(ArrayRef<DebugLoc> Locs);
+
+    /// If this DebugLoc is non-empty, returns this DebugLoc; otherwise, selects
+    /// \p Other.
+    /// In coverage-tracking builds, this also accounts for whether this or
+    /// \p Other have an annotative DebugLocKind applied, such that if both are
+    /// empty but exactly one has an annotation, we prefer that annotated
+    /// location.
+    DebugLoc orElse(DebugLoc Other) const {
+      if (*this)
+        return *this;
+#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+      if (Other)
+        return Other;
+      if (getKind() != DebugLocKind::Normal)
+        return *this;
+      if (Other.getKind() != DebugLocKind::Normal)
+        return Other;
+      return *this;
+#else
+      return Other;
+#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+    }
+
     /// Get the underlying \a DILocation.
     ///
     /// \pre !*this or \c isa<DILocation>(getAsMDNode()).
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index a0cc20d34303..59295089d6e9 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -113,12 +113,19 @@ public:
 /// Common base class shared among various IRBuilders.
 class IRBuilderBase {
   /// Pairs of (metadata kind, MDNode *) that should be added to all newly
-  /// created instructions, like !dbg metadata.
+  /// created instructions, excluding !dbg metadata, which is stored in the
+  /// StoredDL field.
   SmallVector<std::pair<unsigned, MDNode *>, 2> MetadataToCopy;
+  /// The DebugLoc that will be applied to instructions inserted by this
+  /// builder.
+  DebugLoc StoredDL;
 
   /// Add or update the an entry (Kind, MD) to MetadataToCopy, if \p MD is not
   /// null. If \p MD is null, remove the entry with \p Kind.
   void AddOrRemoveMetadataToCopy(unsigned Kind, MDNode *MD) {
+    assert(Kind != LLVMContext::MD_dbg &&
+           "MD_dbg metadata must be stored in StoredDL");
+
     if (!MD) {
       erase_if(MetadataToCopy, [Kind](const std::pair<unsigned, MDNode *> &KV) {
         return KV.first == Kind;
@@ -238,7 +245,9 @@ public:
 
   /// Set location information used by debugging information.
   void SetCurrentDebugLocation(DebugLoc L) {
-    AddOrRemoveMetadataToCopy(LLVMContext::MD_dbg, L.getAsMDNode());
+    // For !dbg metadata attachments, we use DebugLoc instead of the raw MDNode
+    // to include optional introspection data for use in Debugify.
+    StoredDL = std::move(L);
   }
 
   /// Set nosanitize metadata.
@@ -252,8 +261,12 @@ public:
   /// not on \p Src will be dropped from MetadataToCopy.
   void CollectMetadataToCopy(Instruction *Src,
                              ArrayRef<unsigned> MetadataKinds) {
-    for (unsigned K : MetadataKinds)
-      AddOrRemoveMetadataToCopy(K, Src->getMetadata(K));
+    for (unsigned K : MetadataKinds) {
+      if (K == LLVMContext::MD_dbg)
+        SetCurrentDebugLocation(Src->getDebugLoc());
+      else
+        AddOrRemoveMetadataToCopy(K, Src->getMetadata(K));
+    }
   }
 
   /// Get location information used by debugging information.
@@ -267,6 +280,7 @@ public:
   void AddMetadataToInst(Instruction *I) const {
     for (const auto &KV : MetadataToCopy)
       I->setMetadata(KV.first, KV.second);
+    SetInstDebugLocation(I);
   }
 
   /// Get the return type of the current function that we're emitting
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 10fc9c129860..8e1ef2422678 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -698,7 +698,7 @@ public:
   ///     applications, thus the N-way merging should be in code path.
   /// The DebugLoc attached to this instruction will be overwritten by the
   /// merged DebugLoc.
-  LLVM_ABI void applyMergedLocation(DILocation *LocA, DILocation *LocB);
+  LLVM_ABI void applyMergedLocation(DebugLoc LocA, DebugLoc LocB);
 
   /// Updates the debug location given that the instruction has been hoisted
   /// from a block to a predecessor of that block.
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index e0f7466ceacf..ff9f0ff5d5bc 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -862,7 +862,7 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
             "Reached BB end within common tail");
       }
       assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!");
-      DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc());
+      DL = DebugLoc::getMergedLocation(DL, Pos->getDebugLoc());
       NextCommonInsts[i] = ++Pos;
     }
     MI.setDebugLoc(DL);
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index 10c72641ce2d..e3e6c72165eb 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -53,8 +53,7 @@ CSEMIRBuilder::getDominatingInstrForID(FoldingSetNodeID &ID,
     } else if (!dominates(MI, CurrPos)) {
       // Update the spliced machineinstr's debug location by merging it with the
       // debug location of the instruction at the insertion point.
-      auto *Loc = DILocation::getMergedLocation(getDebugLoc().get(),
-                                                MI->getDebugLoc().get());
+      auto Loc = DebugLoc::getMergedLocation(getDebugLoc(), MI->getDebugLoc());
       MI->setDebugLoc(Loc);
       CurMBB->splice(CurrPos, CurMBB, MI);
     }
@@ -170,7 +169,7 @@ CSEMIRBuilder::generateCopiesIfRequired(ArrayRef<DstOp> DstOps,
     if (Observer)
       Observer->changingInstr(*MIB);
     MIB->setDebugLoc(
-        DILocation::getMergedLocation(MIB->getDebugLoc(), getDebugLoc()));
+        DebugLoc::getMergedLocation(MIB->getDebugLoc(), getDebugLoc()));
     if (Observer)
       Observer->changedInstr(*MIB);
   }
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
index 78cd9bc7891e..f68420ed66e4 100644
--- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -370,7 +370,7 @@ bool LoadStoreOpt::doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores) {
   // For each store, compute pairwise merged debug locs.
   DebugLoc MergedLoc = Stores.front()->getDebugLoc();
   for (auto *Store : drop_begin(Stores))
-    MergedLoc = DILocation::getMergedLocation(MergedLoc, Store->getDebugLoc());
+    MergedLoc = DebugLoc::getMergedLocation(MergedLoc, Store->getDebugLoc());
 
   Builder.setInstr(*Stores.back());
   Builder.setDebugLoc(MergedLoc);
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index ccc164a0881e..48b406e016c0 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1574,7 +1574,7 @@ MachineBasicBlock::findBranchDebugLoc() {
     DL = TI->getDebugLoc();
     for (++TI ; TI != end() ; ++TI)
       if (TI->isBranch())
-        DL = DILocation::getMergedLocation(DL, TI->getDebugLoc());
+        DL = DebugLoc::getMergedLocation(DL, TI->getDebugLoc());
   }
   return DL;
 }
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index e3f6eda8ff06..8411d5c4b09c 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -1611,8 +1611,8 @@ static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo,
   // location to prevent debug-info driven tools from potentially reporting
   // wrong location information.
   if (!SuccToSinkTo.empty() && InsertPos != SuccToSinkTo.end())
-    MI.setDebugLoc(DILocation::getMergedLocation(MI.getDebugLoc(),
-                                                 InsertPos->getDebugLoc()));
+    MI.setDebugLoc(DebugLoc::getMergedLocation(MI.getDebugLoc(),
+                                               InsertPos->getDebugLoc()));
   else
     MI.setDebugLoc(DebugLoc());
 
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 2a84e7bae0f1..9527c3e0b5d6 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -960,8 +960,8 @@ unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
   return 0;
 }
 
-void Instruction::applyMergedLocation(DILocation *LocA, DILocation *LocB) {
-  setDebugLoc(DILocation::getMergedLocation(LocA, LocB));
+void Instruction::applyMergedLocation(DebugLoc LocA, DebugLoc LocB) {
+  setDebugLoc(DebugLoc::getMergedLocation(LocA, LocB));
 }
 
 void Instruction::mergeDIAssignID(
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 0e65ddcec893..0be6d55d724e 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -143,6 +143,27 @@ DebugLoc DebugLoc::appendInlinedAt(const DebugLoc &DL, DILocation *InlinedAt,
   return Last;
 }
 
+DebugLoc DebugLoc::getMergedLocations(ArrayRef<DebugLoc> Locs) {
+  if (Locs.empty())
+    return DebugLoc();
+  if (Locs.size() == 1)
+    return Locs[0];
+  DebugLoc Merged = Locs[0];
+  for (const DebugLoc &DL : llvm::drop_begin(Locs)) {
+    Merged = getMergedLocation(Merged, DL);
+    if (!Merged)
+      break;
+  }
+  return Merged;
+}
+DebugLoc DebugLoc::getMergedLocation(DebugLoc LocA, DebugLoc LocB) {
+  if (!LocA)
+    return LocA;
+  if (!LocB)
+    return LocB;
+  return DILocation::getMergedLocation(LocA, LocB);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void DebugLoc::dump() const { print(dbgs()); }
 #endif
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 59623b4295bb..a33ef9c7d4a1 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -61,19 +61,12 @@ Type *IRBuilderBase::getCurrentFunctionReturnType() const {
   return BB->getParent()->getReturnType();
 }
 
-DebugLoc IRBuilderBase::getCurrentDebugLocation() const {
-  for (auto &KV : MetadataToCopy)
-    if (KV.first == LLVMContext::MD_dbg)
-      return {cast<DILocation>(KV.second)};
-
-  return {};
-}
+DebugLoc IRBuilderBase::getCurrentDebugLocation() const { return StoredDL; }
 void IRBuilderBase::SetInstDebugLocation(Instruction *I) const {
-  for (const auto &KV : MetadataToCopy)
-    if (KV.first == LLVMContext::MD_dbg) {
-      I->setDebugLoc(DebugLoc(KV.second));
-      return;
-    }
+  // We prefer to set our current debug location if any has been set, but if
+  // our debug location is empty and I has a valid location, we shouldn't
+  // overwrite it.
+  I->setDebugLoc(StoredDL.orElse(I->getDebugLoc()));
 }
 
 Value *IRBuilderBase::CreateAggregateCast(Value *V, Type *DestTy) {
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 1b60caab6c11..cbf39b8adf1b 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1354,6 +1354,9 @@ void Instruction::swapProfMetadata() {
 
 void Instruction::copyMetadata(const Instruction &SrcInst,
                                ArrayRef<unsigned> WL) {
+  if (WL.empty() || is_contained(WL, LLVMContext::MD_dbg))
+    setDebugLoc(SrcInst.getDebugLoc().orElse(getDebugLoc()));
+
   if (!SrcInst.hasMetadata())
     return;
 
@@ -1367,8 +1370,6 @@ void Instruction::copyMetadata(const Instruction &SrcInst,
     if (WL.empty() || WLS.count(MD.first))
       setMetadata(MD.first, MD.second);
   }
-  if (WL.empty() || WLS.count(LLVMContext::MD_dbg))
-    setDebugLoc(SrcInst.getDebugLoc());
 }
 
 Instruction *Instruction::clone() const {
diff --git a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp
index 77bbeab3c279..222eb19e3eee 100644
--- a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp
+++ b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp
@@ -150,10 +150,10 @@ static CallInst *isGEPAndStore(Value *I) {
 }
 
 template <class T = Instruction>
-static DILocation *mergeDILocations(SmallVector<T *> &Insns) {
-  DILocation *Merged = (*Insns.begin())->getDebugLoc();
+static DebugLoc mergeDebugLocs(SmallVector<T *> &Insns) {
+  DebugLoc Merged = (*Insns.begin())->getDebugLoc();
   for (T *I : Insns)
-    Merged = DILocation::getMergedLocation(Merged, I->getDebugLoc());
+    Merged = DebugLoc::getMergedLocation(Merged, I->getDebugLoc());
   return Merged;
 }
 
@@ -227,7 +227,7 @@ static Instruction *makeGEPAndLoad(Module *M, GEPChainInfo &GEP,
   CallInst *Call = makeIntrinsicCall(M, Intrinsic::bpf_getelementptr_and_load,
                                      {Load->getType()}, Args);
   setParamElementType(Call, 0, GEP.SourceElementType);
-  Call->applyMergedLocation(mergeDILocations(GEP.Members), Load->getDebugLoc());
+  Call->applyMergedLocation(mergeDebugLocs(GEP.Members), Load->getDebugLoc());
   Call->setName((*GEP.Members.rbegin())->getName());
   if (Load->isUnordered()) {
     Call->setOnlyReadsMemory();
@@ -251,8 +251,7 @@ static Instruction *makeGEPAndStore(Module *M, GEPChainInfo &GEP,
   setParamElementType(Call, 1, GEP.SourceElementType);
   if (Store->getValueOperand()->getType()->isPointerTy())
     setParamReadNone(Call, 0);
-  Call->applyMergedLocation(mergeDILocations(GEP.Members),
-                            Store->getDebugLoc());
+  Call->applyMergedLocation(mergeDebugLocs(GEP.Members), Store->getDebugLoc());
   if (Store->isUnordered()) {
     Call->setOnlyWritesMemory();
     Call->setOnlyAccessesArgMemory();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 324e6022f3f0..1d208de75db3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1581,8 +1581,8 @@ bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
   // Insert a PHI node now if we need it.
   Value *MergedVal = OtherStore->getValueOperand();
   // The debug locations of the original instructions might differ. Merge them.
-  DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
-                                                     OtherStore->getDebugLoc());
+  DebugLoc MergedLoc =
+      DebugLoc::getMergedLocation(SI.getDebugLoc(), OtherStore->getDebugLoc());
   if (MergedVal != SI.getValueOperand()) {
     PHINode *PN =
         PHINode::Create(SI.getValueOperand()->getType(), 2, "storemerge");
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index e261807bbc03..dc2a8cb0115e 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -5340,8 +5340,7 @@ bool InstCombinerImpl::run() {
         // We copy the old instruction's DebugLoc to the new instruction, unless
         // InstCombine already assigned a DebugLoc to it, in which case we
         // should trust the more specifically selected DebugLoc.
-        if (!Result->getDebugLoc())
-          Result->setDebugLoc(I->getDebugLoc());
+        Result->setDebugLoc(Result->getDebugLoc().orElse(I->getDebugLoc()));
         // We also copy annotation metadata to the new instruction.
         Result->copyMetadata(*I, LLVMContext::MD_annotation);
         // Everything uses the new instruction now.
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 07bc623c3dea..839f5933e09b 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -883,7 +883,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
         emitBaseConstants(Base, &R);
         ReBasesNum++;
         // Use the same debug location as the last user of the constant.
-        Base->setDebugLoc(DILocation::getMergedLocation(
+        Base->setDebugLoc(DebugLoc::getMergedLocation(
             Base->getDebugLoc(), R.User.Inst->getDebugLoc()));
       }
       assert(!Base->use_empty() && "The use list is empty!?");
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 3024ccb330b1..bd59caa6a959 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2224,10 +2224,10 @@ bool llvm::promoteLoopAccessesToScalars(
   });
 
   // Look at all the loop uses, and try to merge their locations.
-  std::vector<DILocation *> LoopUsesLocs;
-  for (auto *U : LoopUses)
-    LoopUsesLocs.push_back(U->getDebugLoc().get());
-  auto DL = DebugLoc(DILocation::getMergedLocations(LoopUsesLocs));
+  std::vector<DebugLoc> LoopUsesLocs;
+  for (auto U : LoopUses)
+    LoopUsesLocs.push_back(U->getDebugLoc());
+  auto DL = DebugLoc::getMergedLocations(LoopUsesLocs);
 
   // We use the SSAUpdater interface to insert phi nodes as required.
   SmallVector<PHINode *, 16> NewPHIs;
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 4e437e9abeb4..d20378ece4ee 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -128,7 +128,7 @@ performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs,
 
   // Now, go through each block (with the current terminator type)
   // we've recorded, and rewrite it to branch to the new common block.
-  DILocation *CommonDebugLoc = nullptr;
+  DebugLoc CommonDebugLoc;
   for (BasicBlock *BB : BBs) {
     auto *Term = BB->getTerminator();
     assert(Term->getOpcode() == CanonicalTerm->getOpcode() &&
@@ -145,7 +145,7 @@ performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs,
       CommonDebugLoc = Term->getDebugLoc();
     else
       CommonDebugLoc =
-          DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc());
+          DebugLoc::getMergedLocation(CommonDebugLoc, Term->getDebugLoc());
 
     // And turn BB into a block that just unconditionally branches
     // to the canonical block.
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f67a6414ca31..0980f0e57aa6 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2095,11 +2095,11 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf(
 
   // Ensure terminator gets a debug location, even an unknown one, in case
   // it involves inlinable calls.
-  SmallVector<DILocation *, 4> Locs;
+  SmallVector<DebugLoc, 4> Locs;
   Locs.push_back(I1->getDebugLoc());
   for (auto *OtherSuccTI : OtherSuccTIs)
     Locs.push_back(OtherSuccTI->getDebugLoc());
-  NT->setDebugLoc(DILocation::getMergedLocations(Locs));
+  NT->setDebugLoc(DebugLoc::getMergedLocations(Locs));
 
   // PHIs created below will adopt NT's merged DebugLoc.
   IRBuilder<NoFolder> Builder(NT);
@@ -2896,7 +2896,7 @@ static void mergeCompatibleInvokesImpl(ArrayRef<InvokeInst *> Invokes,
       MergedDebugLoc = II->getDebugLoc();
     else
       MergedDebugLoc =
-          DILocation::getMergedLocation(MergedDebugLoc, II->getDebugLoc());
+          DebugLoc::getMergedLocation(MergedDebugLoc, II->getDebugLoc());
 
     // And replace the old `invoke` with an unconditionally branch
     // to the block with the merged `invoke`.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1838562f26b8..b74ef91f26e7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -395,7 +395,7 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
       LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                         << DIL->getFilename() << " Line: " << DIL->getLine());
   } else
-    Builder.SetCurrentDebugLocation(DIL);
+    Builder.SetCurrentDebugLocation(DL);
 }
 
 void VPTransformState::packScalarIntoVectorizedValue(const VPValue *Def,

From ce747a16328b2fbc365e1cb1cb01cb400c2c1b4c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen@sifive.com>
Date: Thu, 12 Jun 2025 21:06:58 +0800
Subject: [PATCH 0136/1322] [LV] Pre-commit test case for support
 VPWidenCastRecipe in isSingleScalar. nfc (#143498)

---
 .../LoopVectorize/single-scalar-cast-minbw.ll | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll

diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
new file mode 100644
index 000000000000..b8da9ac84a80
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) {
+; CHECK-LABEL: define void @minbw_cast(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i1 [[BOOL1:%.*]], i1 [[BOOL2:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[BOOL1_EXT:%.*]] = zext i1 [[BOOL1]] to i32
+; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BOOL2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[BOOL1_EXT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT:    store i8 [[TMP3]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[BOOL2_EXT:%.*]] = zext i1 [[BOOL2]] to i32
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[BOOL1_EXT]], [[BOOL2_EXT]]
+; CHECK-NEXT:    [[XOR_TRUNC:%.*]] = trunc i32 [[XOR]] to i8
+; CHECK-NEXT:    store i8 [[XOR_TRUNC]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %bool1.ext = zext i1 %bool1 to i32
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %bool2.ext = zext i1 %bool2 to i32
+  %xor = xor i32 %bool1.ext, %bool2.ext
+  %xor.trunc = trunc i32 %xor to i8
+  store i8 %xor.trunc, ptr %dst, align 1
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ult i64 %iv.next, %n
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}

From d49bc5e621c8931679b232fa28abfc89a171105e Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 12 Jun 2025 13:10:04 +0000
Subject: [PATCH 0137/1322] [llvm][MemProf] Correct position of LLVM_ABI macro
 in computeFrameHistogram
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous placement resulted in this warning when using g++-13:
/home/david.spickett/llvm-project/llvm/include/llvm/Support/Compiler.h:120:43: warning: attribute ignored [-Wattributes]
  120 | #define LLVM_ATTRIBUTE_VISIBILITY_DEFAULT [[gnu::visibility("default")]]
      |                                           ^
/home/david.spickett/llvm-project/llvm/include/llvm/Support/Compiler.h:213:18: note: in expansion of macro ‘LLVM_ATTRIBUTE_VISIBILITY_DEFAULT’
  213 | #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
      |                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/david.spickett/llvm-project/llvm/lib/ProfileData/MemProfRadixTree.cpp:245:5: note: in expansion of macro ‘LLVM_ABI’
  245 |     LLVM_ABI computeFrameHistogram<FrameId>(
      |     ^~~~~~~~
/home/david.spickett/llvm-project/llvm/include/llvm/Support/Compiler.h:120:43: note: an attribute that appertains to a type-specifier is ignored
  120 | #define LLVM_ATTRIBUTE_VISIBILITY_DEFAULT [[gnu::visibility("default")]]
      |                                           ^

According to the interface guide, that macro should go before the return
type to be effective.

https://llvm.org/docs/InterfaceExportAnnotations.html#specialized-template-functions
---
 llvm/lib/ProfileData/MemProfRadixTree.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/ProfileData/MemProfRadixTree.cpp b/llvm/lib/ProfileData/MemProfRadixTree.cpp
index ea9f5bd25534..c0672eb6da28 100644
--- a/llvm/lib/ProfileData/MemProfRadixTree.cpp
+++ b/llvm/lib/ProfileData/MemProfRadixTree.cpp
@@ -241,13 +241,13 @@ computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
 }
 
 // Explicitly instantiate function with the utilized FrameIdTy.
-template llvm::DenseMap<FrameId, FrameStat>
-    LLVM_ABI computeFrameHistogram<FrameId>(
-        llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
-            &MemProfCallStackData);
-template llvm::DenseMap<LinearFrameId, FrameStat>
-    LLVM_ABI computeFrameHistogram<LinearFrameId>(
-        llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
-            &MemProfCallStackData);
+template LLVM_ABI llvm::DenseMap<FrameId, FrameStat>
+computeFrameHistogram<FrameId>(
+    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+        &MemProfCallStackData);
+template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat>
+computeFrameHistogram<LinearFrameId>(
+    llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
+        &MemProfCallStackData);
 } // namespace memprof
 } // namespace llvm

From 843f256623a68f51a80ae503c08b98433eeda04d Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Thu, 12 Jun 2025 09:23:26 -0400
Subject: [PATCH 0138/1322] [gn] port 20d5d09e99188

---
 llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
index 2ed60b4cc33b..a7ea1cf309b9 100644
--- a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
@@ -50,7 +50,6 @@ write_cmake_config("lit_common_configured") {
         rebase_path("$root_build_dir/bin/clang") + ext,
     "COMPILER_RT_TEST_COMPILER_ID=Clang",
     "Python3_EXECUTABLE=$python_path",
-    "Python3_ROOT_DIR=",  # FIXME
     "COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL=False",
     "COMPILER_RT_DEBUG_PYBOOL=False",
     "COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER_PYBOOL=False",

From 622df892b844749440124167e8eee9e652fba613 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Thu, 12 Jun 2025 15:27:27 +0200
Subject: [PATCH 0139/1322] [lldb/cmake] Remove EXTRA_CXXFLAGS arg (#143731)

We have one library using this and three libraries directly calling
`target_compile_options`. Might as well standardize on the latter.
---
 lldb/cmake/modules/AddLLDB.cmake              |  5 +----
 .../Plugins/Language/ObjC/CMakeLists.txt      | 22 +++++++++----------
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/lldb/cmake/modules/AddLLDB.cmake b/lldb/cmake/modules/AddLLDB.cmake
index 0108fb22e5a0..28bf8d816d89 100644
--- a/lldb/cmake/modules/AddLLDB.cmake
+++ b/lldb/cmake/modules/AddLLDB.cmake
@@ -41,7 +41,7 @@ function(add_lldb_library name)
   cmake_parse_arguments(PARAM
     "MODULE;SHARED;STATIC;OBJECT;PLUGIN;FRAMEWORK;NO_INTERNAL_DEPENDENCIES;NO_PLUGIN_DEPENDENCIES"
     "INSTALL_PREFIX"
-    "EXTRA_CXXFLAGS;LINK_LIBS;CLANG_LIBS"
+    "LINK_LIBS;CLANG_LIBS"
     ${ARGN})
 
   if(PARAM_NO_INTERNAL_DEPENDENCIES)
@@ -130,9 +130,6 @@ function(add_lldb_library name)
     add_dependencies(${name} clang-tablegen-targets)
   endif()
 
-  # Add in any extra C++ compilation flags for this library.
-  target_compile_options(${name} PRIVATE ${PARAM_EXTRA_CXXFLAGS})
-
   if(PARAM_PLUGIN)
     get_property(parent_dir DIRECTORY PROPERTY PARENT_DIRECTORY)
     if(EXISTS ${parent_dir})
diff --git a/lldb/source/Plugins/Language/ObjC/CMakeLists.txt b/lldb/source/Plugins/Language/ObjC/CMakeLists.txt
index 93c23fd32524..b9fc5ce754c4 100644
--- a/lldb/source/Plugins/Language/ObjC/CMakeLists.txt
+++ b/lldb/source/Plugins/Language/ObjC/CMakeLists.txt
@@ -1,13 +1,3 @@
-set(EXTRA_CXXFLAGS "")
-
-if (CXX_SUPPORTS_NO_GNU_ANONYMOUS_STRUCT)
-  set(EXTRA_CXXFLAGS ${EXTRA_CXXFLAGS} -Wno-gnu-anonymous-struct)
-endif ()
-
-if (CXX_SUPPORTS_NO_NESTED_ANON_TYPES)
-  set(EXTRA_CXXFLAGS ${EXTRA_CXXFLAGS} -Wno-nested-anon-types)
-endif ()
-
 add_lldb_library(lldbPluginObjCLanguage PLUGIN
   ObjCLanguage.cpp
   CF.cpp
@@ -36,6 +26,14 @@ add_lldb_library(lldbPluginObjCLanguage PLUGIN
     lldbPluginTypeSystemClang
   CLANG_LIBS
     clangAST
-
-  EXTRA_CXXFLAGS ${EXTRA_CXXFLAGS}
 )
+
+if (CXX_SUPPORTS_NO_GNU_ANONYMOUS_STRUCT)
+  target_compile_options(lldbPluginObjCLanguage
+    PRIVATE -Wno-gnu-anonymous-struct)
+endif ()
+
+if (CXX_SUPPORTS_NO_NESTED_ANON_TYPES)
+  target_compile_options(lldbPluginObjCLanguage
+    PRIVATE -Wno-nested-anon-types)
+endif ()

From b8e3e0749fb62a9845f8790f858e11f2558f94a2 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 15:27:19 +0200
Subject: [PATCH 0140/1322] [InstCombine] Export logic for common base pointer
 (NFC)

Make this available to other parts of InstCombine, to be used for
pointer comparison optimization.
---
 .../InstCombine/InstCombineAddSub.cpp         | 19 +++----------------
 .../InstCombine/InstCombineInternal.h         | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index f0f709bb16d8..86d318967403 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2068,21 +2068,8 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
   return nullptr;
 }
 
-struct CommonBase {
-  /// Common base pointer.
-  Value *Ptr = nullptr;
-  /// LHS GEPs until common base.
-  SmallVector<GEPOperator *> LHSGEPs;
-  /// RHS GEPs until common base.
-  SmallVector<GEPOperator *> RHSGEPs;
-  /// LHS GEP NoWrapFlags until common base.
-  GEPNoWrapFlags LHSNW = GEPNoWrapFlags::all();
-  /// RHS GEP NoWrapFlags until common base.
-  GEPNoWrapFlags RHSNW = GEPNoWrapFlags::all();
-};
-
-static CommonBase computeCommonBase(Value *LHS, Value *RHS) {
-  CommonBase Base;
+CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
+  CommonPointerBase Base;
 
   if (LHS->getType() != RHS->getType())
     return Base;
@@ -2136,7 +2123,7 @@ static CommonBase computeCommonBase(Value *LHS, Value *RHS) {
 /// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
 Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
                                                    Type *Ty, bool IsNUW) {
-  CommonBase Base = computeCommonBase(LHS, RHS);
+  CommonPointerBase Base = CommonPointerBase::compute(LHS, RHS);
   if (!Base.Ptr)
     return nullptr;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 334462d715f9..bf7689bbfde7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -862,6 +862,21 @@ public:
                                      InstCombinerImpl &IC);
 };
 
+struct CommonPointerBase {
+  /// Common base pointer.
+  Value *Ptr = nullptr;
+  /// LHS GEPs until common base.
+  SmallVector<GEPOperator *> LHSGEPs;
+  /// RHS GEPs until common base.
+  SmallVector<GEPOperator *> RHSGEPs;
+  /// LHS GEP NoWrapFlags until common base.
+  GEPNoWrapFlags LHSNW = GEPNoWrapFlags::all();
+  /// RHS GEP NoWrapFlags until common base.
+  GEPNoWrapFlags RHSNW = GEPNoWrapFlags::all();
+
+  static CommonPointerBase compute(Value *LHS, Value *RHS);
+};
+
 } // end namespace llvm
 
 #undef DEBUG_TYPE

From 3100b50f78c06dcd5207140e0d6e5ba6954d8828 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot@amd.com>
Date: Thu, 12 Jun 2025 14:35:28 +0100
Subject: [PATCH 0141/1322] [AMDGPU] Flatten recursive register resource info
 propagation (#142766)

In #112251 I had mentioned I'd follow up with flattening of recursion
for register resource info propagation

Behaviour prior to this patch when a recursive call is used is to take
the module scope worst case function register use (even prior to
AMDGPUMCResourceInfo). With this patch it will, when a cycle is
detected, attempt to do a simple cycle avoidant dfs to find the worst
case constant within the cycle and the cycle's propagates. In other
words, it will attempt to look for the cycle scope worst case rather
than module scope worst case.
---
 .../Target/AMDGPU/AMDGPUMCResourceInfo.cpp    | 100 +++++++++++++++---
 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h |   6 ++
 .../CodeGen/AMDGPU/function-resource-usage.ll |  32 +++---
 .../AMDGPU/recursive-resource-usage-mcexpr.ll |  82 +++++++++++++-
 4 files changed, 188 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index e3e3e411c684..593b3ab22038 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -97,6 +97,86 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
   return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
 }
 
+// Tries to flatten recursive call register resource gathering. Simple cycle
+// avoiding dfs to find the constants in the propagated symbols.
+// Assumes:
+// - RecSym has been confirmed to recurse (this means the callee symbols should
+//   all be populated, started at RecSym).
+// - Shape of the resource symbol's MCExpr (`max` args are order agnostic):
+//   RecSym.MCExpr := max(<constant>+, <callee_symbol>*)
+const MCExpr *MCResourceInfo::flattenedCycleMax(MCSymbol *RecSym,
+                                                ResourceInfoKind RIK,
+                                                MCContext &OutContext) {
+  SmallPtrSet<const MCExpr *, 8> Seen;
+  SmallVector<const MCExpr *, 8> WorkList;
+  int64_t Maximum = 0;
+
+  const MCExpr *RecExpr = RecSym->getVariableValue();
+  WorkList.push_back(RecExpr);
+
+  while (!WorkList.empty()) {
+    const MCExpr *CurExpr = WorkList.pop_back_val();
+    switch (CurExpr->getKind()) {
+    default: {
+      // Assuming the recursion is of shape `max(<constant>, <callee_symbol>)`
+      // where <callee_symbol> will eventually recurse. If this condition holds,
+      // the recursion occurs within some other (possibly unresolvable) MCExpr,
+      // thus using the worst case value then.
+      if (!AMDGPUMCExpr::isSymbolUsedInExpression(RecSym, CurExpr)) {
+        LLVM_DEBUG(dbgs() << "MCResUse:   " << RecSym->getName()
+                          << ": Recursion in unexpected sub-expression, using "
+                             "module maximum\n");
+        switch (RIK) {
+        default:
+          break;
+        case RIK_NumVGPR:
+          return MCSymbolRefExpr::create(getMaxVGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        case RIK_NumSGPR:
+          return MCSymbolRefExpr::create(getMaxSGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        case RIK_NumAGPR:
+          return MCSymbolRefExpr::create(getMaxAGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        }
+      }
+      break;
+    }
+    case MCExpr::ExprKind::Constant: {
+      int64_t Val = cast<MCConstantExpr>(CurExpr)->getValue();
+      Maximum = std::max(Maximum, Val);
+      break;
+    }
+    case MCExpr::ExprKind::SymbolRef: {
+      const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(CurExpr);
+      const MCSymbol &SymRef = SymExpr->getSymbol();
+      if (SymRef.isVariable()) {
+        const MCExpr *SymVal = SymRef.getVariableValue();
+        if (Seen.insert(SymVal).second)
+          WorkList.push_back(SymVal);
+      }
+      break;
+    }
+    case MCExpr::ExprKind::Target: {
+      const AMDGPUMCExpr *TargetExpr = cast<AMDGPUMCExpr>(CurExpr);
+      if (TargetExpr->getKind() == AMDGPUMCExpr::VariantKind::AGVK_Max) {
+        for (auto &Arg : TargetExpr->getArgs())
+          WorkList.push_back(Arg);
+      }
+      break;
+    }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "MCResUse:   " << RecSym->getName()
+                    << ": Using flattened max: << " << Maximum << '\n');
+
+  return MCConstantExpr::create(Maximum, OutContext);
+}
+
 void MCResourceInfo::assignResourceInfoExpr(
     int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
     const MachineFunction &MF, const SmallVectorImpl<const Function *> &Callees,
@@ -133,25 +213,19 @@ void MCResourceInfo::assignResourceInfoExpr(
                           << CalleeValSym->getName() << " as callee\n");
         ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
       } else {
-        LLVM_DEBUG(
-            dbgs() << "MCResUse:   " << Sym->getName()
-                   << ": Recursion found, falling back to module maximum\n");
-        // In case of recursion: make sure to use conservative register counts
-        // (i.e., specifically for VGPR/SGPR/AGPR).
+        LLVM_DEBUG(dbgs() << "MCResUse:   " << Sym->getName()
+                          << ": Recursion found, attempt flattening of cycle "
+                             "for resource usage\n");
+        // In case of recursion for vgpr/sgpr/agpr resource usage: try to
+        // flatten and use the max of the call cycle. May still end up emitting
+        // module max if not fully resolvable.
         switch (RIK) {
         default:
           break;
         case RIK_NumVGPR:
-          ArgExprs.push_back(MCSymbolRefExpr::create(
-              getMaxVGPRSymbol(OutContext), OutContext));
-          break;
         case RIK_NumSGPR:
-          ArgExprs.push_back(MCSymbolRefExpr::create(
-              getMaxSGPRSymbol(OutContext), OutContext));
-          break;
         case RIK_NumAGPR:
-          ArgExprs.push_back(MCSymbolRefExpr::create(
-              getMaxAGPRSymbol(OutContext), OutContext));
+          ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
           break;
         }
       }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index a670878948c3..fa98f82d1102 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -58,6 +58,12 @@ private:
   // Assigns expression for Max S/V/A-GPRs to the referenced symbols.
   void assignMaxRegs(MCContext &OutContext);
 
+  // Take flattened max of cyclic function calls' knowns. For example, for
+  // a cycle A->B->C->D->A, take max(A, B, C, D) for A and have B, C, D have the
+  // propgated value from A.
+  const MCExpr *flattenedCycleMax(MCSymbol *RecSym, ResourceInfoKind RIK,
+                                  MCContext &OutContext);
+
 public:
   MCResourceInfo() = default;
   void addMaxVGPRCandidate(int32_t candidate) {
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index 0a6aa05c2d21..2a18d40e0bd8 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -495,17 +495,17 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; GCN: NumVgprs: max(43, multi_stage_recurse1.num_vgpr)
 ; GCN: ScratchSize: 16+max(multi_stage_recurse1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse1:
-; GCN: .set multi_stage_recurse1.num_vgpr, max(48, amdgpu.max_num_vgpr)
-; GCN: .set multi_stage_recurse1.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, amdgpu.max_num_sgpr)
+; GCN: .set multi_stage_recurse1.num_vgpr, max(48, 43)
+; GCN: .set multi_stage_recurse1.num_agpr, max(0, 0)
+; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, 34)
 ; GCN: .set multi_stage_recurse1.private_seg_size, 16
 ; GCN: .set multi_stage_recurse1.uses_vcc, 1
 ; GCN: .set multi_stage_recurse1.uses_flat_scratch, 0
 ; GCN: .set multi_stage_recurse1.has_dyn_sized_stack, 0
 ; GCN: .set multi_stage_recurse1.has_recursion, 1
 ; GCN: .set multi_stage_recurse1.has_indirect_call, 0
-; GCN: TotalNumSgprs: multi_stage_recurse1.numbered_sgpr+4
-; GCN: NumVgprs: max(48, amdgpu.max_num_vgpr)
+; GCN: TotalNumSgprs: 38
+; GCN: NumVgprs: 48
 ; GCN: ScratchSize: 16
 define void @multi_stage_recurse1(i32 %val) #2 {
   call void @multi_stage_recurse2(i32 %val)
@@ -528,8 +528,8 @@ define void @multi_stage_recurse2(i32 %val) #2 {
 ; GCN: .set usage_multi_stage_recurse.has_dyn_sized_stack, or(0, multi_stage_recurse1.has_dyn_sized_stack)
 ; GCN: .set usage_multi_stage_recurse.has_recursion, or(1, multi_stage_recurse1.has_recursion)
 ; GCN: .set usage_multi_stage_recurse.has_indirect_call, or(0, multi_stage_recurse1.has_indirect_call)
-; GCN: TotalNumSgprs: usage_multi_stage_recurse.numbered_sgpr+6
-; GCN: NumVgprs: usage_multi_stage_recurse.num_vgpr
+; GCN: TotalNumSgprs: 40
+; GCN: NumVgprs: 48
 ; GCN: ScratchSize: 16
 define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
   call void @multi_stage_recurse1(i32 %n)
@@ -550,17 +550,17 @@ define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
 ; GCN: NumVgprs: max(41, multi_stage_recurse_noattr1.num_vgpr)
 ; GCN: ScratchSize: 16+max(multi_stage_recurse_noattr1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse_noattr1:
-; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, amdgpu.max_num_vgpr)
-; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, amdgpu.max_num_sgpr)
+; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, 41)
+; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, 0)
+; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, 54)
 ; GCN: .set multi_stage_recurse_noattr1.private_seg_size, 16
 ; GCN: .set multi_stage_recurse_noattr1.uses_vcc, 1
 ; GCN: .set multi_stage_recurse_noattr1.uses_flat_scratch, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_dyn_sized_stack, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_recursion, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_indirect_call, 0
-; GCN: TotalNumSgprs: multi_stage_recurse_noattr1.numbered_sgpr+4
-; GCN: NumVgprs: max(41, amdgpu.max_num_vgpr)
+; GCN: TotalNumSgprs: 61
+; GCN: NumVgprs: 41
 ; GCN: ScratchSize: 16
 define void @multi_stage_recurse_noattr1(i32 %val) #0 {
   call void @multi_stage_recurse_noattr2(i32 %val)
@@ -583,8 +583,8 @@ define void @multi_stage_recurse_noattr2(i32 %val) #0 {
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_dyn_sized_stack, or(0, multi_stage_recurse_noattr1.has_dyn_sized_stack)
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_recursion, or(0, multi_stage_recurse_noattr1.has_recursion)
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_indirect_call, or(0, multi_stage_recurse_noattr1.has_indirect_call)
-; GCN: TotalNumSgprs: usage_multi_stage_recurse_noattrs.numbered_sgpr+6
-; GCN: NumVgprs: usage_multi_stage_recurse_noattrs.num_vgpr
+; GCN: TotalNumSgprs: 63
+; GCN: NumVgprs: 41
 ; GCN: ScratchSize: 16
 define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
   call void @multi_stage_recurse_noattr1(i32 %n)
@@ -601,8 +601,8 @@ define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
 ; GCN:  .set multi_call_with_multi_stage_recurse.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack, multi_stage_recurse1.has_dyn_sized_stack)
 ; GCN:  .set multi_call_with_multi_stage_recurse.has_recursion, or(1, use_stack0.has_recursion, use_stack1.has_recursion, multi_stage_recurse1.has_recursion)
 ; GCN:  .set multi_call_with_multi_stage_recurse.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call, multi_stage_recurse1.has_indirect_call)
-; GCN: TotalNumSgprs: multi_call_with_multi_stage_recurse.numbered_sgpr+6
-; GCN: NumVgprs:  multi_call_with_multi_stage_recurse.num_vgpr
+; GCN: TotalNumSgprs: 59
+; GCN: NumVgprs:  48
 ; GCN: ScratchSize: 2052
 define amdgpu_kernel void @multi_call_with_multi_stage_recurse(i32 %n) #0 {
   call void @use_stack0()
diff --git a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
index 3093349bff37..a41a06592f62 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s
 
+; Recursion: foo -> bar -> baz -> qux -> foo
+
 ; CHECK-LABEL: {{^}}qux
 ; CHECK: .set qux.num_vgpr, max(71, foo.num_vgpr)
 ; CHECK: .set qux.num_agpr, max(0, foo.num_agpr)
@@ -34,9 +36,9 @@
 ; CHECK: .set bar.has_indirect_call, or(0, baz.has_indirect_call)
 
 ; CHECK-LABEL: {{^}}foo
-; CHECK: .set foo.num_vgpr, max(46, amdgpu.max_num_vgpr)
-; CHECK: .set foo.num_agpr, max(0, amdgpu.max_num_agpr)
-; CHECK: .set foo.numbered_sgpr, max(71, amdgpu.max_num_sgpr)
+; CHECK: .set foo.num_vgpr, max(46, 71)
+; CHECK: .set foo.num_agpr, max(0, 0)
+; CHECK: .set foo.numbered_sgpr, max(71, 61)
 ; CHECK: .set foo.private_seg_size, 16
 ; CHECK: .set foo.uses_vcc, 1
 ; CHECK: .set foo.uses_flat_scratch, 0
@@ -91,3 +93,77 @@ define amdgpu_kernel void @usefoo() {
   ret void
 }
 
+; Recursion: A -> B -> C -> A && C -> D -> C
+
+; CHECK-LABEL: {{^}}D
+; CHECK: .set D.num_vgpr, max(71, C.num_vgpr)
+; CHECK: .set D.num_agpr, max(0, C.num_agpr)
+; CHECK: .set D.numbered_sgpr, max(71, C.numbered_sgpr)
+; CHECK: .set D.private_seg_size, 16+max(C.private_seg_size)
+; CHECK: .set D.uses_vcc, or(1, C.uses_vcc)
+; CHECK: .set D.uses_flat_scratch, or(0, C.uses_flat_scratch)
+; CHECK: .set D.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
+; CHECK: .set D.has_recursion, or(1, C.has_recursion)
+; CHECK: .set D.has_indirect_call, or(0, C.has_indirect_call)
+
+; CHECK-LABEL: {{^}}C
+; CHECK: .set C.num_vgpr, max(42, A.num_vgpr, 71)
+; CHECK: .set C.num_agpr, max(0, A.num_agpr, 0)
+; CHECK: .set C.numbered_sgpr, max(71, A.numbered_sgpr, 71)
+; CHECK: .set C.private_seg_size, 16+max(A.private_seg_size)
+; CHECK: .set C.uses_vcc, or(1, A.uses_vcc)
+; CHECK: .set C.uses_flat_scratch, or(0, A.uses_flat_scratch)
+; CHECK: .set C.has_dyn_sized_stack, or(0, A.has_dyn_sized_stack)
+; CHECK: .set C.has_recursion, or(1, A.has_recursion)
+; CHECK: .set C.has_indirect_call, or(0, A.has_indirect_call)
+
+; CHECK-LABEL: {{^}}B
+; CHECK: .set B.num_vgpr, max(42, C.num_vgpr)
+; CHECK: .set B.num_agpr, max(0, C.num_agpr)
+; CHECK: .set B.numbered_sgpr, max(71, C.numbered_sgpr)
+; CHECK: .set B.private_seg_size, 16+max(C.private_seg_size)
+; CHECK: .set B.uses_vcc, or(1, C.uses_vcc)
+; CHECK: .set B.uses_flat_scratch, or(0, C.uses_flat_scratch)
+; CHECK: .set B.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
+; CHECK: .set B.has_recursion, or(1, C.has_recursion)
+; CHECK: .set B.has_indirect_call, or(0, C.has_indirect_call)
+
+; CHECK-LABEL: {{^}}A
+; CHECK: .set A.num_vgpr, max(42, 71)
+; CHECK: .set A.num_agpr, max(0, 0)
+; CHECK: .set A.numbered_sgpr, max(71, 71)
+; CHECK: .set A.private_seg_size, 16
+; CHECK: .set A.uses_vcc, 1
+; CHECK: .set A.uses_flat_scratch, 0
+; CHECK: .set A.has_dyn_sized_stack, 0
+; CHECK: .set A.has_recursion, 1
+; CHECK: .set A.has_indirect_call, 0
+
+define void @A() {
+  call void @B()
+  call void asm sideeffect "", "~{v10}"()
+  call void asm sideeffect "", "~{s50}"()
+  ret void
+}
+
+define void @B() {
+  call void @C()
+  call void asm sideeffect "", "~{v20}"()
+  call void asm sideeffect "", "~{s30}"()
+  ret void
+}
+
+define void @C() {
+  call void @A()
+  call void @D()
+  call void asm sideeffect "", "~{v30}"()
+  call void asm sideeffect "", "~{s40}"()
+  ret void
+}
+
+define void @D() {
+  call void @C()
+  call void asm sideeffect "", "~{v70}"()
+  call void asm sideeffect "", "~{s70}"()
+  ret void
+}

From 79f4a43839386e785451c8f0a362b2d1e5850b74 Mon Sep 17 00:00:00 2001
From: Shamshura Egor <164661612+egorshamshura@users.noreply.github.com>
Date: Thu, 12 Jun 2025 17:21:05 +0300
Subject: [PATCH 0142/1322] [X86] VPTERNLOG comments - use "mem" just for full
 width loads and "m32bcst" / "m64bcst" for broadcast loads (#143721)

Use "mem" just for full width loads and "m32bcst" / "m64bcst" for 32-bit (D) / 64-bit (Q) broadcasts.

Fixes #143679

---------

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
---
 .../X86/MCTargetDesc/X86InstComments.cpp      | 29 +++++++++++++---
 .../any_extend_vector_inreg_of_broadcast.ll   | 16 ++++-----
 ...d_vector_inreg_of_broadcast_from_memory.ll | 16 ++++-----
 llvm/test/CodeGen/X86/avgfloors.ll            | 12 +++----
 llvm/test/CodeGen/X86/avx512-cvt.ll           |  2 +-
 llvm/test/CodeGen/X86/avx512-logic.ll         |  4 +--
 llvm/test/CodeGen/X86/avx512fp16-arith.ll     |  6 ++--
 llvm/test/CodeGen/X86/avx512vl-logic.ll       |  8 ++---
 llvm/test/CodeGen/X86/combine-bitselect.ll    |  6 ++--
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   |  2 +-
 llvm/test/CodeGen/X86/fp-round.ll             | 34 +++++++++----------
 llvm/test/CodeGen/X86/gfni-funnel-shifts.ll   | 12 +++----
 llvm/test/CodeGen/X86/gfni-shifts.ll          |  6 ++--
 llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 10 +++---
 llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 10 +++---
 .../CodeGen/X86/min-legal-vector-width.ll     | 12 +++----
 llvm/test/CodeGen/X86/pmul.ll                 |  4 +--
 llvm/test/CodeGen/X86/psubus.ll               |  6 ++--
 llvm/test/CodeGen/X86/sadd_sat_vec.ll         |  2 +-
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    |  2 +-
 llvm/test/CodeGen/X86/ssub_sat_vec.ll         |  4 +--
 llvm/test/CodeGen/X86/usub_sat_vec.ll         |  2 +-
 llvm/test/CodeGen/X86/vector-fshl-128.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshl-256.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshl-512.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshl-rot-128.ll  | 12 +++----
 llvm/test/CodeGen/X86/vector-fshl-rot-256.ll  | 22 ++++++------
 llvm/test/CodeGen/X86/vector-fshl-rot-512.ll  | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-128.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-256.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-512.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-rot-128.ll  | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-rot-256.ll  | 24 ++++++-------
 llvm/test/CodeGen/X86/vector-fshr-rot-512.ll  | 12 +++----
 llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll |  6 ++--
 llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll |  2 +-
 .../vector-interleaved-store-i16-stride-5.ll  |  8 ++---
 .../vector-interleaved-store-i16-stride-7.ll  | 16 ++++-----
 llvm/test/CodeGen/X86/vector-rotate-128.ll    | 12 +++----
 llvm/test/CodeGen/X86/vector-rotate-256.ll    | 22 ++++++------
 llvm/test/CodeGen/X86/vector-rotate-512.ll    | 32 ++++++++---------
 .../test/CodeGen/X86/vector-shift-ashr-128.ll |  4 +--
 .../test/CodeGen/X86/vector-shift-ashr-256.ll |  4 +--
 .../test/CodeGen/X86/vector-shift-ashr-512.ll |  2 +-
 .../CodeGen/X86/vector-shift-ashr-sub128.ll   | 12 +++----
 llvm/test/CodeGen/X86/vector-shift-shl-256.ll |  2 +-
 llvm/test/CodeGen/X86/vector-shift-shl-512.ll |  4 +--
 .../test/CodeGen/X86/vector-shuffle-avx512.ll |  2 +-
 llvm/test/CodeGen/X86/vselect-pcmp.ll         |  4 +--
 .../zero_extend_vector_inreg_of_broadcast.ll  |  8 ++---
 ...d_vector_inreg_of_broadcast_from_memory.ll |  8 ++---
 51 files changed, 270 insertions(+), 249 deletions(-)

diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 4c26fc86f954..547745fdba9d 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -931,20 +931,41 @@ static bool printPTERNLOGComments(const MCInst *MI, raw_ostream &OS,
     // dest, src1, mask, src2, memory, tbl
     CASE_PTERNLOG(PTERNLOGD, m)
     CASE_PTERNLOG(PTERNLOGQ, m)
-    CASE_PTERNLOG(PTERNLOGD, mb)
-    CASE_PTERNLOG(PTERNLOGQ, mb)
     Src2Idx = NumOperands - 7;
     Src3Idx = -1;
     break;
 
+    CASE_PTERNLOG(PTERNLOGD, mb)
+    Src2Idx = NumOperands - 7;
+    Src3Idx = -2;
+    break;
+
+    CASE_PTERNLOG(PTERNLOGQ, mb)
+    Src2Idx = NumOperands - 7;
+    Src3Idx = -3;
+    break;
+
   default:
     return false;
   }
   StringRef DestName = getRegName(MI->getOperand(0).getReg());
   StringRef Src1Name = getRegName(MI->getOperand(1).getReg());
   StringRef Src2Name = getRegName(MI->getOperand(Src2Idx).getReg());
-  StringRef Src3Name =
-      Src3Idx != -1 ? getRegName(MI->getOperand(Src3Idx).getReg()) : "mem";
+  StringRef Src3Name;
+  switch (Src3Idx) {
+  case -1:
+    Src3Name = "mem";
+    break;
+  case -2:
+    Src3Name = "m32bcst";
+    break;
+  case -3:
+    Src3Name = "m64bcst";
+    break;
+  default:
+    Src3Name = getRegName(MI->getOperand(Src3Idx).getReg());
+    break;
+  }
   uint8_t TruthTable = MI->getOperand(NumOperands - 1).getImm();
 
   StringRef SrcNames[] = {Src1Name, Src2Name, Src3Name};
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 7d2915ddc75b..dec829fed353 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1235,7 +1235,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1248,7 +1248,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1359,7 +1359,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1372,7 +1372,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -2702,7 +2702,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm0 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
@@ -2717,7 +2717,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm0 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
@@ -2964,7 +2964,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
 ; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
@@ -2979,7 +2979,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
 ; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index dc723eb713c2..3d4cddbb94c7 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1020,7 +1020,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1030,7 +1030,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1116,7 +1116,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1126,7 +1126,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -2125,7 +2125,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rdx)
@@ -2137,7 +2137,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rdx)
@@ -2346,7 +2346,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (m64bcst & (ymm0 ^ ymm1))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rdx)
@@ -2358,7 +2358,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (m64bcst & (ymm0 ^ ymm1))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rdx)
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
index 0508e5ccb543..9cc55c6f7a81 100644
--- a/llvm/test/CodeGen/X86/avgfloors.ll
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -53,7 +53,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
@@ -108,7 +108,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
@@ -405,7 +405,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
@@ -478,7 +478,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
@@ -966,7 +966,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
@@ -1078,7 +1078,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index 3dd7b571b921..76c87900b04d 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -351,7 +351,7 @@ define <8 x double> @ulto8f64(<8 x i64> %a) {
 ; NODQ-LABEL: ulto8f64:
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; NODQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; NODQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & m64bcst)
 ; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
 ; NODQ-NEXT:    vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
 ; NODQ-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index 23f4fcb1c77c..bdcc524545fb 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -889,7 +889,7 @@ define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
 ; ALL-LABEL: ternlog_or_and_mask:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = (zmm0 & mem) | zmm1
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = (zmm0 & m32bcst) | zmm1
 ; ALL-NEXT:    retq
   %a = and <16 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %b = or <16 x i32> %a, %y
@@ -899,7 +899,7 @@ define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
 define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) {
 ; ALL-LABEL: ternlog_xor_and_mask:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; ALL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m64bcst)
 ; ALL-NEXT:    retq
   %a = and <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   %b = xor <8 x i64> %a, %y
diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll
index b264f5fc3468..d19c9bb55017 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll
@@ -384,7 +384,7 @@ declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
 define <8 x half> @fcopysignv8f16(<8 x half> %x, <8 x half> %y) {
 ; CHECK-LABEL: fcopysignv8f16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1))
+; CHECK-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (m32bcst & (xmm0 ^ xmm1))
 ; CHECK-NEXT:    retq
   %a = call <8 x half> @llvm.copysign.v8f16(<8 x half> %x, <8 x half> %y)
   ret <8 x half> %a
@@ -439,7 +439,7 @@ declare <16 x half> @llvm.fabs.v16f16(<16 x half>)
 define <16 x half> @fcopysignv16f16(<16 x half> %x, <16 x half> %y) {
 ; CHECK-LABEL: fcopysignv16f16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-NEXT:    retq
   %a = call <16 x half> @llvm.copysign.v16f16(<16 x half> %x, <16 x half> %y)
   ret <16 x half> %a
@@ -494,7 +494,7 @@ declare <32 x half> @llvm.fabs.v32f16(<32 x half>)
 define <32 x half> @fcopysignv32f16(<32 x half> %x, <32 x half> %y) {
 ; CHECK-LABEL: fcopysignv32f16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1))
+; CHECK-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (m32bcst & (zmm0 ^ zmm1))
 ; CHECK-NEXT:    retq
   %a = call <32 x half> @llvm.copysign.v32f16(<32 x half> %x, <32 x half> %y)
   ret <32 x half> %a
diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll
index 284a0eb33047..c1ae0e36c2c0 100644
--- a/llvm/test/CodeGen/X86/avx512vl-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll
@@ -1039,7 +1039,7 @@ define <4 x i32> @ternlog_xor_andn(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @ternlog_or_and_mask(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: ternlog_or_and_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} xmm0 = (xmm0 & mem) | xmm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} xmm0 = (xmm0 & m32bcst) | xmm1
 ; CHECK-NEXT:    retq
   %a = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
   %b = or <4 x i32> %a, %y
@@ -1049,7 +1049,7 @@ define <4 x i32> @ternlog_or_and_mask(<4 x i32> %x, <4 x i32> %y) {
 define <8 x i32> @ternlog_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: ternlog_or_and_mask_ymm:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} ymm0 = (ymm0 & mem) | ymm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} ymm0 = (ymm0 & m32bcst) | ymm1
 ; CHECK-NEXT:    retq
   %a = and <8 x i32> %x, <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
   %b = or <8 x i32> %a, %y
@@ -1059,7 +1059,7 @@ define <8 x i32> @ternlog_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y) {
 define <2 x i64> @ternlog_xor_and_mask(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: ternlog_xor_and_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; CHECK-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m64bcst)
 ; CHECK-NEXT:    retq
   %a = and <2 x i64> %x, <i64 1099511627775, i64 1099511627775>
   %b = xor <2 x i64> %a, %y
@@ -1069,7 +1069,7 @@ define <2 x i64> @ternlog_xor_and_mask(<2 x i64> %x, <2 x i64> %y) {
 define <4 x i64> @ternlog_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y) {
 ; CHECK-LABEL: ternlog_xor_and_mask_ymm:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; CHECK-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m64bcst)
 ; CHECK-NEXT:    retq
   %a = and <4 x i64> %x, <i64 72057594037927935, i64 72057594037927935, i64 72057594037927935, i64 72057594037927935>
   %b = xor <4 x i64> %a, %y
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index 25c26d598881..4f1c00b64fa9 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -283,7 +283,7 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, pt
 ;
 ; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrm:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (m64bcst & (xmm0 ^ xmm1))
 ; AVX512VL-NEXT:    retq
   %a2 = load i64, ptr %p2
   %1 = insertelement <2 x i64> undef, i64 %a2, i32 0
@@ -604,7 +604,7 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, pt
 ;
 ; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (m64bcst & (ymm0 ^ ymm1))
 ; AVX512VL-NEXT:    retq
   %a2 = load i64, ptr %p2
   %1 = insertelement <4 x i64> undef, i64 %a2, i32 0
@@ -975,7 +975,7 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, pt
 ;
 ; AVX512-LABEL: bitselect_v8i64_broadcast_rrm:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (m64bcst & (zmm0 ^ zmm1))
 ; AVX512-NEXT:    retq
   %a2 = load i64, ptr %p2
   %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 95b5fcf8eac5..54390d8b66f7 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -808,7 +808,7 @@ define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
 ; AVX512-LABEL: or_and_v2i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [7,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 & (xmm0 | m64bcst)
 ; AVX512-NEXT:    retq
   %1 = and <2 x i64> %a0, <i64 7, i64 7>
   %2 = or <2 x i64> %1, <i64 3, i64 3>
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
index e98fb8e374c0..8595b63fc810 100644
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -52,7 +52,7 @@ define half @round_f16(half %h) {
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -98,7 +98,7 @@ define float @round_f32(float %x) {
 ; AVX512F-LABEL: round_f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -106,7 +106,7 @@ define float @round_f32(float %x) {
 ; AVX512FP16-LABEL: round_f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512FP16-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -142,7 +142,7 @@ define double @round_f64(double %x) {
 ; AVX512F-LABEL: round_f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & m64bcst)
 ; AVX512F-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -150,7 +150,7 @@ define double @round_f64(double %x) {
 ; AVX512FP16-LABEL: round_f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & m64bcst)
 ; AVX512FP16-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -208,7 +208,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) {
 ; AVX512F-LABEL: round_v4f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512F-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundps $11, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -216,7 +216,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) {
 ; AVX512FP16-LABEL: round_v4f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512FP16-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundps $11, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -262,7 +262,7 @@ define <2 x double> @round_v2f64(<2 x double> %x) {
 ; AVX512F-LABEL: round_v2f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & m64bcst)
 ; AVX512F-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundpd $11, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -270,7 +270,7 @@ define <2 x double> @round_v2f64(<2 x double> %x) {
 ; AVX512FP16-LABEL: round_v2f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & m64bcst)
 ; AVX512FP16-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundpd $11, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -356,7 +356,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) {
 ; AVX512F-LABEL: round_v8f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & m32bcst)
 ; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vroundps $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -364,7 +364,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) {
 ; AVX512FP16-LABEL: round_v8f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & m32bcst)
 ; AVX512FP16-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    vroundps $11, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    retq
@@ -426,7 +426,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) {
 ; AVX512F-LABEL: round_v4f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & m64bcst)
 ; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vroundpd $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -434,7 +434,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) {
 ; AVX512FP16-LABEL: round_v4f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & m64bcst)
 ; AVX512FP16-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    vroundpd $11, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    retq
@@ -582,7 +582,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
 ; AVX512F-LABEL: round_v16f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & m32bcst)
 ; AVX512F-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vrndscaleps $11, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -590,7 +590,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
 ; AVX512FP16-LABEL: round_v16f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & m32bcst)
 ; AVX512FP16-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    vrndscaleps $11, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    retq
@@ -690,7 +690,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
 ; AVX512F-LABEL: round_v8f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & m64bcst)
 ; AVX512F-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vrndscalepd $11, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -698,7 +698,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
 ; AVX512FP16-LABEL: round_v8f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & m64bcst)
 ; AVX512FP16-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    vrndscalepd $11, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 0ca3380d188b..7001bf7f2807 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -492,7 +492,7 @@ define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsllw $3, %xmm0, %xmm2
 ; GFNIAVX512-NEXT:    vpsrlw $5, %xmm1, %xmm0
-; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
   ret <16 x i8> %res
@@ -518,7 +518,7 @@ define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpaddw %xmm0, %xmm0, %xmm2
 ; GFNIAVX512-NEXT:    vpsrlw $7, %xmm1, %xmm0
-; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
   ret <16 x i8> %res
@@ -1311,7 +1311,7 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; GFNIAVX512-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <32 x i8> %res
@@ -1349,7 +1349,7 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; GFNIAVX512-NEXT:    vpsrlw $6, %ymm1, %ymm0
-; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
   ret <32 x i8> %res
@@ -2775,7 +2775,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ; GFNIAVX512BW:       # %bb.0:
 ; GFNIAVX512BW-NEXT:    vpaddw %zmm0, %zmm0, %zmm2
 ; GFNIAVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; GFNIAVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <64 x i8> %res
@@ -2836,7 +2836,7 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ; GFNIAVX512BW:       # %bb.0:
 ; GFNIAVX512BW-NEXT:    vpsllw $6, %zmm0, %zmm2
 ; GFNIAVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; GFNIAVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index 5cd1a2c76762..cd16651123b0 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -1261,7 +1261,7 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
 ; GFNIAVX512VL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
 ; GFNIAVX512VL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; GFNIAVX512VL-NEXT:    vpsllw $8, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & mem)
+; GFNIAVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: constant_shl_v32i8:
@@ -2634,7 +2634,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX512VL-NEXT:    vpmaddubsw %ymm3, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpsllw $8, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; GFNIAVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem)
+; GFNIAVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & m32bcst)
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: constant_shl_v64i8:
@@ -2642,7 +2642,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
 ; GFNIAVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; GFNIAVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
 ; GFNIAVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index ac6b7e54ca5b..a798f4c38f68 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -2500,7 +2500,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2729,7 +2729,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} xmm2 = ~xmm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2961,7 +2961,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm0 ^ (xmm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm0 ^ (xmm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm0, %xmm2, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -3192,7 +3192,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -3432,7 +3432,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index a8021e3164f3..3a4a638c7330 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -2016,7 +2016,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2194,7 +2194,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} ymm2 = ~ymm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm2 ^ (ymm1 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm2 ^ (ymm1 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2372,7 +2372,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm0 ^ (ymm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm0 ^ (ymm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm0, %ymm2, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2550,7 +2550,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2733,7 +2733,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 24c884211cf9..d752659f94a5 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1993,21 +1993,21 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-v
 ; CHECK-SKX:       # %bb.0:
 ; CHECK-SKX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-SKX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-SKX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-SKX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-SKX-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: splatconstant_rotate_v32i8:
 ; CHECK-AVX512:       # %bb.0:
 ; CHECK-AVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-AVX512-NEXT:    retq
 ;
 ; CHECK-VBMI1-LABEL: splatconstant_rotate_v32i8:
 ; CHECK-VBMI1:       # %bb.0:
 ; CHECK-VBMI1-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-VBMI1-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-VBMI1-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-VBMI1-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-VBMI1-NEXT:    retq
 ;
 ; CHECK-GFNI-LABEL: splatconstant_rotate_v32i8:
@@ -2025,7 +2025,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le
 ; CHECK-SKX:       # %bb.0:
 ; CHECK-SKX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-SKX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-SKX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-SKX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-SKX-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
 ; CHECK-SKX-NEXT:    retq
 ;
@@ -2033,7 +2033,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le
 ; CHECK-AVX512:       # %bb.0:
 ; CHECK-AVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
 ; CHECK-AVX512-NEXT:    retq
 ;
@@ -2041,7 +2041,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le
 ; CHECK-VBMI1:       # %bb.0:
 ; CHECK-VBMI1-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-VBMI1-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-VBMI1-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-VBMI1-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-VBMI1-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
 ; CHECK-VBMI1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index c7cc2acaf262..9aee2f11e9ea 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -832,7 +832,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; AVX512F-NEXT:    vpmaddubsw %ymm3, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsllw $8, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & m32bcst)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v64i8c:
@@ -840,7 +840,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
 ; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
 ; AVX512BW-NEXT:    retq
 entry:
   %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 73ee28a7fd24..e10b360b35b5 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -146,7 +146,7 @@ define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind {
 ; AVX512-LABEL: ashr_xor_and_custom:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm1
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ m32bcst)
 ; AVX512-NEXT:    retq
   %signsplat = ashr <4 x i32> %x, <i32 undef, i32 31, i32 31, i32 31>
   %flipsign = xor <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
@@ -187,7 +187,7 @@ define <4 x i32> @ashr_add_and_custom(<4 x i32> %x) nounwind {
 ; AVX512-LABEL: ashr_add_and_custom:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm1
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ m32bcst)
 ; AVX512-NEXT:    retq
   %signsplat = ashr <4 x i32> %x, <i32 undef, i32 31, i32 31, i32 31>
   %flipsign = add <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
@@ -230,7 +230,7 @@ define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind {
 ; AVX512-LABEL: usubsat_custom:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm1
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ m32bcst)
 ; AVX512-NEXT:    retq
   %res = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> <i32 2147483648, i32 2147483648, i32 2147483648, i32 undef>)
   ret <4 x i32> %res
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index bd563f97b0ac..80b55a364dba 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -567,7 +567,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX512BW-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    retq
   %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 08d9183bd30b..2d0778853fec 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2461,7 +2461,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX512VL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0,171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0]
 ; CHECK-AVX512VL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239,0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221]
 ; CHECK-AVX512VL-NEXT:    vpsllw $8, %ymm3, %ymm3
-; CHECK-AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem)
+; CHECK-AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst)
 ; CHECK-AVX512VL-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2
 ; CHECK-AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; CHECK-AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,128,1,1,1,128,1,64,128,1,128,1,128,32,1,1]
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 88df3c175ec9..eb2ad4fdff92 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -567,7 +567,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX512BW-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    retq
   %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
@@ -601,7 +601,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 ;
 ; AVX512BW-LABEL: v16i1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ m32bcst)
 ; AVX512BW-NEXT:    retq
   %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
index 4e17ca6fbae3..a5f768e48bae 100644
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -543,7 +543,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 ;
 ; AVX512BW-LABEL: v16i1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ m32bcst)
 ; AVX512BW-NEXT:    retq
   %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index fd0525e6d56a..6b8a03ba5eb7 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -2407,7 +2407,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512F-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2416,14 +2416,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VL-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2432,7 +2432,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
 ; AVX512VBMI2-NEXT:    retq
@@ -2441,14 +2441,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index fdd0d68b8900..c6e1aa9cd90c 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -2296,7 +2296,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -2304,14 +2304,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -2319,7 +2319,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -2327,14 +2327,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX10-LABEL: splatconstant_funnnel_v32i8:
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX10-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX10-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 1d807fa85ddc..34ad667f0117 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -1124,7 +1124,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -1137,35 +1137,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VLVBMI2-NEXT:    retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index b763b7bac243..e60b56551e58 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1859,7 +1859,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1868,14 +1868,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1884,14 +1884,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
 ; AVX512VBMI2-NEXT:    retq
@@ -1900,7 +1900,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 9e872cc6d74a..11a02f8cf754 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -443,12 +443,12 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
@@ -463,17 +463,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst)
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -1641,7 +1641,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -1649,14 +1649,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -1664,14 +1664,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -1679,7 +1679,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
index 436fbe31f7a3..4c6680ac4a19 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLVBMI2-NEXT:    retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index a56b0a6351a3..bf525442a419 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -2412,7 +2412,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512F-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2421,14 +2421,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VL-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2437,7 +2437,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
 ; AVX512VBMI2-NEXT:    retq
@@ -2446,14 +2446,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 0fa2c858ff00..9479174d964c 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -2096,7 +2096,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -2104,14 +2104,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -2119,7 +2119,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -2127,14 +2127,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX10-LABEL: splatconstant_funnnel_v32i8:
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX10-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX10-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 665223167fbb..3a522ccb6214 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -1166,7 +1166,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -1179,35 +1179,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VLVBMI2-NEXT:    retq
   %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 9ce682306f18..d9799975cd37 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1928,7 +1928,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1937,14 +1937,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1953,14 +1953,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
 ; AVX512VBMI2-NEXT:    retq
@@ -1969,7 +1969,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 3d4f283260aa..15e09c3b6737 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -469,17 +469,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsllw $6, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -488,17 +488,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -1693,7 +1693,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -1701,14 +1701,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -1716,14 +1716,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -1731,7 +1731,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index 11ea650e1f02..1d089e427bfa 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLVBMI2-NEXT:    retq
   %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 0ea754873d8b..6bc4fcb6cc1e 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -179,7 +179,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
@@ -500,7 +500,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm2 ^ (zmm1 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm2 ^ (zmm1 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpmovb2m %zmm1, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm2
@@ -606,7 +606,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
 ; AVX512BW-NEXT:    vpsllw $8, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %res = srem <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index fd7a4c9b8d5a..9c56894f0c59 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -651,7 +651,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
 ; AVX512BW-NEXT:    vpsllw $8, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %res = urem <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 418c987ab9a3..3311a311c8e4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -1783,7 +1783,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2))
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem))
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = m64bcst ^ (ymm1 & (ymm0 ^ m64bcst))
 ; AVX512-NEXT:    vmovdqa %ymm1, 128(%r9)
 ; AVX512-NEXT:    vmovdqa64 %zmm4, 64(%r9)
 ; AVX512-NEXT:    vmovdqa64 %zmm6, (%r9)
@@ -1856,7 +1856,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2))
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = m64bcst ^ (ymm1 & (ymm0 ^ m64bcst))
 ; AVX512-FCP-NEXT:    vmovdqa %ymm1, 128(%r9)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r9)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%r9)
@@ -1932,7 +1932,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2))
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = m64bcst ^ (ymm1 & (ymm0 ^ m64bcst))
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, 128(%r9)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%r9)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm6, (%r9)
@@ -2005,7 +2005,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2))
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = m64bcst ^ (ymm1 & (ymm0 ^ m64bcst))
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, 128(%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 6f50d61f4d1f..fafb69be0d38 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -1388,7 +1388,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512-NEXT:    vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2))
 ; AVX512-NEXT:    vpsrlq $48, %xmm4, %xmm2
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
@@ -1448,7 +1448,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6))
 ; AVX512-FCP-NEXT:    vpsrlq $48, %xmm3, %xmm3
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
@@ -1511,7 +1511,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2))
 ; AVX512DQ-NEXT:    vpsrlq $48, %xmm4, %xmm2
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
@@ -1571,7 +1571,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6))
 ; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm3, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
@@ -13076,7 +13076,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 32-byte Folded Reload
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm1 ^ (mem & (zmm16 ^ zmm1))
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm18 & (ymm0 ^ mem))
+; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm18 & (ymm0 ^ m32bcst))
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm1 & (ymm21 ^ ymm0))
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
@@ -13752,7 +13752,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm18 & (zmm5 ^ zmm6))
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm1))
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm2 = mem ^ (ymm0 & (ymm2 ^ mem))
+; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm2 = m32bcst ^ (ymm0 & (ymm2 ^ m32bcst))
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm2))
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
@@ -14403,7 +14403,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm1 ^ (mem & (zmm16 ^ zmm1))
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm18 & (ymm0 ^ mem))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm18 & (ymm0 ^ m32bcst))
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm1 & (ymm21 ^ ymm0))
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
@@ -15079,7 +15079,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm18 & (zmm5 ^ zmm6))
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm1))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm2 = mem ^ (ymm0 & (ymm2 ^ mem))
+; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm2 = m32bcst ^ (ymm0 & (ymm2 ^ m32bcst))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm2))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 993e6afc0eaf..6c79be75550e 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1581,7 +1581,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; AVX512NOVLX:       # %bb.0:
 ; AVX512NOVLX-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512NOVLX-NEXT:    vzeroupper
 ; AVX512NOVLX-NEXT:    retq
@@ -1590,7 +1590,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; AVX512VLX:       # %bb.0:
 ; AVX512VLX-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLX-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_rotate_v16i8:
@@ -1739,7 +1739,7 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $5, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsrlw $11, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = m32bcst & (xmm0 | xmm1)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16:
@@ -1754,7 +1754,7 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $5, %xmm0, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlw $11, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1)
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = m32bcst & (xmm0 | xmm1)
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16:
@@ -1819,7 +1819,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
 ; AVX512NOVLX:       # %bb.0:
 ; AVX512NOVLX-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512NOVLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512NOVLX-NEXT:    vzeroupper
 ; AVX512NOVLX-NEXT:    retq
@@ -1828,7 +1828,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
 ; AVX512VLX:       # %bb.0:
 ; AVX512VLX-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLX-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; AVX512VLX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index c2c6a5f7eba5..684721f434eb 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -387,12 +387,12 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
@@ -407,17 +407,17 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst)
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -1392,7 +1392,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX512NOVLX:       # %bb.0:
 ; AVX512NOVLX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512NOVLX-NEXT:    retq
 ;
@@ -1400,7 +1400,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX512VLX:       # %bb.0:
 ; AVX512VLX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLX-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
@@ -1566,7 +1566,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $5, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $11, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = m32bcst & (ymm0 | ymm1)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16:
@@ -1581,7 +1581,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $5, %ymm0, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlw $11, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1)
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = m32bcst & (ymm0 | ymm1)
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16:
@@ -1653,7 +1653,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
 ; AVX512NOVLX:       # %bb.0:
 ; AVX512NOVLX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512NOVLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512NOVLX-NEXT:    retq
 ;
@@ -1661,7 +1661,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
 ; AVX512VLX:       # %bb.0:
 ; AVX512VLX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLX-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512VLX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index 8ac0b178a16d..2cde988ed776 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -754,7 +754,7 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_rotate_v64i8:
@@ -766,35 +766,35 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLVBMI2-NEXT:    retq
   %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
@@ -844,7 +844,7 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
 ; AVX512F-NEXT:    vpsrlw $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $11, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst & (zmm0 | zmm1)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
@@ -856,21 +856,21 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $11, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $11, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst & (zmm0 | zmm1)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $5, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $11, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst & (zmm0 | zmm1)
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $5, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $11, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst & (zmm0 | zmm1)
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16:
@@ -902,7 +902,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -915,7 +915,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
@@ -923,7 +923,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -931,7 +931,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -939,7 +939,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -947,7 +947,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VLVBMI2-NEXT:    retq
   %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 60295f1c145a..02f0f53a0bb3 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -2021,7 +2021,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2029,7 +2029,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index 9f3fff34ea20..15855e3bce46 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -2295,7 +2295,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2303,7 +2303,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 74dbee5e5d2c..ea0745b157f5 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -540,7 +540,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index 4f8cbc07243f..f7de8d427150 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -2344,7 +2344,7 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2352,7 +2352,7 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    retq
 ;
@@ -2414,7 +2414,7 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2422,7 +2422,7 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    retq
 ;
@@ -2484,7 +2484,7 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2492,7 +2492,7 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 4f55f7af20f4..3f238b5739f0 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1369,7 +1369,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512DQVL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
 ; AVX512DQVL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX512DQVL-NEXT:    vpsllw $8, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
 ; AVX512DQVL-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: constant_shift_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index a42056be895e..efd742956ed0 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -324,7 +324,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512DQ-NEXT:    vpmaddubsw %ymm3, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpsllw $8, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem)
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & m32bcst)
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: constant_shift_v64i8:
@@ -332,7 +332,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
 ; AVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 07498c1233b5..db3be98efa53 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -355,7 +355,7 @@ define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
 ;
 ; AVX512F-LABEL: test_mm512_mask_blend_epi16:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32>  <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index ab487ed88898..8543e9fd919b 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -1763,7 +1763,7 @@ define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) {
 ; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst ^ (zmm0 & (zmm1 ^ m32bcst))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: PR110875:
@@ -1780,7 +1780,7 @@ define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) {
 ; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst ^ (zmm0 & (zmm1 ^ m32bcst))
 ; AVX512VL-NEXT:    retq
 ;
 ; XOP-LABEL: PR110875:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 7ad9fb0c2717..45ccc39fb254 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1235,7 +1235,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1248,7 +1248,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1359,7 +1359,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1372,7 +1372,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 3bef834bbd90..31920d8348fb 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1020,7 +1020,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1030,7 +1030,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1116,7 +1116,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1126,7 +1126,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper

From cc17f68e566ab7db4ac8e95dc857e49e10d8366c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes@amd.com>
Date: Thu, 12 Jun 2025 07:23:07 -0700
Subject: [PATCH 0143/1322] [SLP] NFC: Precommit test for pull/137419 (#137730)

Precommit for https://github.com/llvm/llvm-project/pull/137419
---
 .../SLPVectorizer/AMDGPU/external-shuffle.ll  | 261 ++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
new file mode 100644
index 000000000000..ce9e47a03dee
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
@@ -0,0 +1,261 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN %s
+
+; The insertelements in the exit block use the various parts of the vectorized tree. These external uses are just creating an identity vector using a sequence
+;  of insert elements. Since these insertelements are just recreating the same vectors that were produced during vectorization, they should not increase the cost of vectorization.
+
+define void @phi_4(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, ptr %out2, i32 %flag) {
+; GCN-LABEL: define void @phi_4(
+; GCN-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], ptr [[OUT2:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GCN-NEXT:  [[ENTRY:.*]]:
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8
+; GCN-NEXT:    [[GEP2:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 2
+; GCN-NEXT:    [[GEP3:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 3
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[GEP4:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 4
+; GCN-NEXT:    [[GEP5:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 5
+; GCN-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
+; GCN-NEXT:    [[GEP6:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 6
+; GCN-NEXT:    [[GEP7:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 7
+; GCN-NEXT:    [[TMP3:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
+; GCN-NEXT:    [[GEP8:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 8
+; GCN-NEXT:    [[GEP9:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 9
+; GCN-NEXT:    [[TMP4:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
+; GCN-NEXT:    [[GEP10:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 10
+; GCN-NEXT:    [[GEP11:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 11
+; GCN-NEXT:    [[TMP5:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
+; GCN-NEXT:    [[GEP12:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 12
+; GCN-NEXT:    [[GEP13:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 13
+; GCN-NEXT:    [[TMP6:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
+; GCN-NEXT:    [[GEP14:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 14
+; GCN-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2
+; GCN-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
+; GCN-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
+; GCN-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
+; GCN-NEXT:    [[TMP11:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
+; GCN-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
+; GCN-NEXT:    [[TMP13:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
+; GCN-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
+; GCN-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
+; GCN-NEXT:    [[TMP24:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
+; GCN-NEXT:    [[TMP26:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
+; GCN-NEXT:    [[TMP28:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0
+; GCN-NEXT:    [[TMP38:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
+; GCN-NEXT:    br label %[[DO_BODY:.*]]
+; GCN:       [[DO_BODY]]:
+; GCN-NEXT:    [[PHI2:%.*]] = phi i16 [ [[TMP8]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI3:%.*]] = phi i16 [ [[TMP9]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI4:%.*]] = phi i16 [ [[TMP10]], %[[ENTRY]] ], [ [[TMP39:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI5:%.*]] = phi i16 [ [[TMP11]], %[[ENTRY]] ], [ [[OTHERELE5:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI6:%.*]] = phi i16 [ [[TMP12]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI7:%.*]] = phi i16 [ [[TMP13]], %[[ENTRY]] ], [ [[OTHERELE7:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI8:%.*]] = phi i16 [ [[TMP14]], %[[ENTRY]] ], [ [[TMP40:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI9:%.*]] = phi i16 [ [[TMP15]], %[[ENTRY]] ], [ [[OTHERELE9:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI10:%.*]] = phi i16 [ [[TMP24]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI11:%.*]] = phi i16 [ [[TMP26]], %[[ENTRY]] ], [ [[OTHERELE11:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI12:%.*]] = phi i16 [ [[TMP28]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI13:%.*]] = phi i16 [ [[TMP38]], %[[ENTRY]] ], [ [[OTHERELE13:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP41:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP42:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP23:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP16]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8
+; GCN-NEXT:    [[OTHERELE3]] = load i16, ptr addrspace(3) [[GEP3]], align 1
+; GCN-NEXT:    [[TMP17:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[OTHERELE5]] = load i16, ptr addrspace(3) [[GEP5]], align 1
+; GCN-NEXT:    [[TMP18:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
+; GCN-NEXT:    [[OTHERELE7]] = load i16, ptr addrspace(3) [[GEP7]], align 1
+; GCN-NEXT:    [[TMP19:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
+; GCN-NEXT:    [[OTHERELE9]] = load i16, ptr addrspace(3) [[GEP9]], align 1
+; GCN-NEXT:    [[TMP20:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
+; GCN-NEXT:    [[OTHERELE11]] = load i16, ptr addrspace(3) [[GEP11]], align 1
+; GCN-NEXT:    [[TMP21:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
+; GCN-NEXT:    [[OTHERELE13]] = load i16, ptr addrspace(3) [[GEP13]], align 1
+; GCN-NEXT:    [[TMP22:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
+; GCN-NEXT:    [[TMP23]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2
+; GCN-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
+; GCN-NEXT:    [[TMP30]] = extractelement <2 x i16> [[TMP17]], i32 0
+; GCN-NEXT:    [[TMP39]] = extractelement <2 x i16> [[TMP18]], i32 0
+; GCN-NEXT:    [[TMP32]] = extractelement <2 x i16> [[TMP19]], i32 0
+; GCN-NEXT:    [[TMP40]] = extractelement <2 x i16> [[TMP20]], i32 0
+; GCN-NEXT:    [[TMP34]] = extractelement <2 x i16> [[TMP21]], i32 0
+; GCN-NEXT:    [[TMP35]] = extractelement <2 x i16> [[TMP22]], i32 0
+; GCN-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
+; GCN:       [[EXIT]]:
+; GCN-NEXT:    [[TMP36:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP17]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC038:%.*]] = shufflevector <16 x i16> [[TMP36]], <16 x i16> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC059:%.*]] = shufflevector <16 x i16> [[VEC038]], <16 x i16> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i16> [[TMP19]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC0710:%.*]] = shufflevector <16 x i16> [[VEC059]], <16 x i16> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP29:%.*]] = shufflevector <2 x i16> [[TMP20]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC0911:%.*]] = shufflevector <16 x i16> [[VEC0710]], <16 x i16> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i16> [[TMP21]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC01112:%.*]] = shufflevector <16 x i16> [[VEC0911]], <16 x i16> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP33:%.*]] = shufflevector <2 x i16> [[TMP22]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i16> [[VEC01112]], <16 x i16> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; GCN-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2157:%.*]] = shufflevector <16 x i16> [[TMP58]], <16 x i16> [[TMP60]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; GCN-NEXT:    [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC231:%.*]] = shufflevector <16 x i16> [[TMP50]], <16 x i16> [[TMP51]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC252:%.*]] = shufflevector <16 x i16> [[VEC231]], <16 x i16> [[TMP52]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC273:%.*]] = shufflevector <16 x i16> [[VEC252]], <16 x i16> [[TMP53]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC294:%.*]] = shufflevector <16 x i16> [[VEC273]], <16 x i16> [[TMP54]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2115:%.*]] = shufflevector <16 x i16> [[VEC294]], <16 x i16> [[TMP55]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2136:%.*]] = shufflevector <16 x i16> [[VEC2115]], <16 x i16> [[TMP56]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; GCN-NEXT:    [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2151:%.*]] = shufflevector <16 x i16> [[VEC2136]], <16 x i16> [[TMP59]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; GCN-NEXT:    [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP41]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC22:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[PHI2]], i64 2
+; GCN-NEXT:    [[VEC23:%.*]] = insertelement <16 x i16> [[VEC22]], i16 [[PHI3]], i64 3
+; GCN-NEXT:    [[VEC24:%.*]] = insertelement <16 x i16> [[VEC23]], i16 [[PHI4]], i64 4
+; GCN-NEXT:    [[VEC25:%.*]] = insertelement <16 x i16> [[VEC24]], i16 [[PHI5]], i64 5
+; GCN-NEXT:    [[VEC26:%.*]] = insertelement <16 x i16> [[VEC25]], i16 [[PHI6]], i64 6
+; GCN-NEXT:    [[VEC27:%.*]] = insertelement <16 x i16> [[VEC26]], i16 [[PHI7]], i64 7
+; GCN-NEXT:    [[VEC28:%.*]] = insertelement <16 x i16> [[VEC27]], i16 [[PHI8]], i64 8
+; GCN-NEXT:    [[VEC29:%.*]] = insertelement <16 x i16> [[VEC28]], i16 [[PHI9]], i64 9
+; GCN-NEXT:    [[VEC210:%.*]] = insertelement <16 x i16> [[VEC29]], i16 [[PHI10]], i64 10
+; GCN-NEXT:    [[VEC211:%.*]] = insertelement <16 x i16> [[VEC210]], i16 [[PHI11]], i64 11
+; GCN-NEXT:    [[VEC212:%.*]] = insertelement <16 x i16> [[VEC211]], i16 [[PHI12]], i64 12
+; GCN-NEXT:    [[VEC213:%.*]] = insertelement <16 x i16> [[VEC212]], i16 [[PHI13]], i64 13
+; GCN-NEXT:    [[TMP61:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2152:%.*]] = shufflevector <16 x i16> [[VEC213]], <16 x i16> [[TMP61]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; GCN-NEXT:    store <16 x i16> [[VEC2151]], ptr [[OUT]], align 32
+; GCN-NEXT:    store <16 x i16> [[VEC2157]], ptr [[OUT1]], align 32
+; GCN-NEXT:    store <16 x i16> [[VEC2152]], ptr [[OUT2]], align 32
+; GCN-NEXT:    ret void
+;
+entry:
+  %ele0 = load i16, ptr addrspace(3) %inptr0, align 8
+  %gep1 = getelementptr i16, ptr addrspace(3) %inptr0, i32 1
+  %ele1 = load i16, ptr addrspace(3) %gep1, align 1
+  %gep2 = getelementptr i16, ptr addrspace(3) %inptr0, i32 2
+  %ele2 = load i16, ptr addrspace(3) %gep2, align 2
+  %gep3 = getelementptr i16, ptr addrspace(3) %inptr0, i32 3
+  %ele3 = load i16, ptr addrspace(3) %gep3, align 1
+  %gep4 = getelementptr i16, ptr addrspace(3) %inptr0, i32 4
+  %ele4 = load i16, ptr addrspace(3) %gep4, align 8
+  %gep5 = getelementptr i16, ptr addrspace(3) %inptr0, i32 5
+  %ele5 = load i16, ptr addrspace(3) %gep5, align 1
+  %gep6 = getelementptr i16, ptr addrspace(3) %inptr0, i32 6
+  %ele6 = load i16, ptr addrspace(3) %gep6, align 2
+  %gep7 = getelementptr i16, ptr addrspace(3) %inptr0, i32 7
+  %ele7 = load i16, ptr addrspace(3) %gep7, align 1
+  %gep8 = getelementptr i16, ptr addrspace(3) %inptr0, i32 8
+  %ele8 = load i16, ptr addrspace(3) %gep8, align 8
+  %gep9 = getelementptr i16, ptr addrspace(3) %inptr0, i32 9
+  %ele9 = load i16, ptr addrspace(3) %gep9, align 1
+  %gep10 = getelementptr i16, ptr addrspace(3) %inptr0, i32 10
+  %ele10 = load i16, ptr addrspace(3) %gep10, align 2
+  %gep11 = getelementptr i16, ptr addrspace(3) %inptr0, i32 11
+  %ele11 = load i16, ptr addrspace(3) %gep11, align 1
+  %gep12 = getelementptr i16, ptr addrspace(3) %inptr0, i32 12
+  %ele12 = load i16, ptr addrspace(3) %gep12, align 8
+  %gep13 = getelementptr i16, ptr addrspace(3) %inptr0, i32 13
+  %ele13 = load i16, ptr addrspace(3) %gep13, align 1
+  %gep14 = getelementptr i16, ptr addrspace(3) %inptr0, i32 14
+  %ele14 = load i16, ptr addrspace(3) %gep14, align 2
+  %gep15 = getelementptr i16, ptr addrspace(3) %inptr0, i32 15
+  %ele15 = load i16, ptr addrspace(3) %gep15, align 1
+  br label %do.body
+
+do.body:
+  %phi0 = phi i16 [ %ele0, %entry ], [ %otherele0, %do.body ]
+  %phi1 = phi i16 [ %ele1, %entry ], [ %otherele1, %do.body ]
+  %phi2 = phi i16 [ %ele2, %entry ], [ %otherele2, %do.body ]
+  %phi3 = phi i16 [ %ele3, %entry ], [ %otherele3, %do.body ]
+  %phi4 = phi i16 [ %ele4, %entry ], [ %otherele4, %do.body ]
+  %phi5 = phi i16 [ %ele5, %entry ], [ %otherele5, %do.body ]
+  %phi6 = phi i16 [ %ele6, %entry ], [ %otherele6, %do.body ]
+  %phi7 = phi i16 [ %ele7, %entry ], [ %otherele7, %do.body ]
+  %phi8 = phi i16 [ %ele8, %entry ], [ %otherele8, %do.body ]
+  %phi9 = phi i16 [ %ele9, %entry ], [ %otherele9, %do.body ]
+  %phi10 = phi i16 [ %ele10, %entry ], [ %otherele10, %do.body ]
+  %phi11 = phi i16 [ %ele11, %entry ], [ %otherele11, %do.body ]
+  %phi12 = phi i16 [ %ele12, %entry ], [ %otherele12, %do.body ]
+  %phi13 = phi i16 [ %ele13, %entry ], [ %otherele13, %do.body ]
+  %phi14 = phi i16 [ %ele14, %entry ], [ %otherele14, %do.body ]
+  %phi15 = phi i16 [ %ele15, %entry ], [ %otherele15, %do.body ]
+
+  %otherele0 = load i16, ptr addrspace(3) %inptr0, align 8
+  %otherele1 = load i16, ptr addrspace(3) %gep1, align 1
+  %otherele2 = load i16, ptr addrspace(3) %gep2, align 2
+  %otherele3 = load i16, ptr addrspace(3) %gep3, align 1
+  %otherele4 = load i16, ptr addrspace(3) %gep4, align 8
+  %otherele5 = load i16, ptr addrspace(3) %gep5, align 1
+  %otherele6 = load i16, ptr addrspace(3) %gep6, align 2
+  %otherele7 = load i16, ptr addrspace(3) %gep7, align 1
+  %otherele8 = load i16, ptr addrspace(3) %gep8, align 8
+  %otherele9 = load i16, ptr addrspace(3) %gep9, align 1
+  %otherele10 = load i16, ptr addrspace(3) %gep10, align 2
+  %otherele11 = load i16, ptr addrspace(3) %gep11, align 1
+  %otherele12 = load i16, ptr addrspace(3) %gep12, align 8
+  %otherele13 = load i16, ptr addrspace(3) %gep13, align 1
+  %otherele14 = load i16, ptr addrspace(3) %gep14, align 2
+  %otherele15 = load i16, ptr addrspace(3) %gep15, align 1
+  %cmp = icmp eq i32 %flag, 0
+  br i1 %cmp, label %exit, label %do.body
+
+exit:
+  %vec00 = insertelement <16 x i16> poison, i16 %otherele0, i64 0
+  %vec01 = insertelement <16 x i16> %vec00, i16 %otherele1, i64 1
+  %vec02 = insertelement <16 x i16> %vec01, i16 %otherele2, i64 2
+  %vec03 = insertelement <16 x i16> %vec02, i16 %otherele3, i64 3
+  %vec04 = insertelement <16 x i16> %vec03, i16 %otherele4, i64 4
+  %vec05 = insertelement <16 x i16> %vec04, i16 %otherele5, i64 5
+  %vec06 = insertelement <16 x i16> %vec05, i16 %otherele6, i64 6
+  %vec07 = insertelement <16 x i16> %vec06, i16 %otherele7, i64 7
+  %vec08 = insertelement <16 x i16> %vec07, i16 %otherele8, i64 8
+  %vec09 = insertelement <16 x i16> %vec08, i16 %otherele9, i64 9
+  %vec010 = insertelement <16 x i16> %vec09, i16 %otherele10, i64 10
+  %vec011 = insertelement <16 x i16> %vec010, i16 %otherele11, i64 11
+  %vec012 = insertelement <16 x i16> %vec011, i16 %otherele12, i64 12
+  %vec013 = insertelement <16 x i16> %vec012, i16 %otherele13, i64 13
+  %vec014 = insertelement <16 x i16> %vec013, i16 %otherele14, i64 14
+  %vec015 = insertelement <16 x i16> %vec014, i16 %otherele15, i64 15
+
+  %vec10 = insertelement <16 x i16> poison, i16 %ele0, i64 0
+  %vec11 = insertelement <16 x i16> %vec10, i16 %ele1, i64 1
+  %vec12 = insertelement <16 x i16> %vec11, i16 %ele2, i64 2
+  %vec13 = insertelement <16 x i16> %vec12, i16 %ele3, i64 3
+  %vec14 = insertelement <16 x i16> %vec13, i16 %ele4, i64 4
+  %vec15 = insertelement <16 x i16> %vec14, i16 %ele5, i64 5
+  %vec16 = insertelement <16 x i16> %vec15, i16 %ele6, i64 6
+  %vec17 = insertelement <16 x i16> %vec16, i16 %ele7, i64 7
+  %vec18 = insertelement <16 x i16> %vec17, i16 %ele8, i64 8
+  %vec19 = insertelement <16 x i16> %vec18, i16 %ele9, i64 9
+  %vec110 = insertelement <16 x i16> %vec19, i16 %ele10, i64 10
+  %vec111 = insertelement <16 x i16> %vec110, i16 %ele11, i64 11
+  %vec112 = insertelement <16 x i16> %vec111, i16 %ele12, i64 12
+  %vec113 = insertelement <16 x i16> %vec112, i16 %ele13, i64 13
+  %vec114 = insertelement <16 x i16> %vec113, i16 %ele14, i64 14
+  %vec115 = insertelement <16 x i16> %vec114, i16 %ele15, i64 15
+
+  %vec20 = insertelement <16 x i16> poison, i16 %phi0, i64 0
+  %vec21 = insertelement <16 x i16> %vec20, i16 %phi1, i64 1
+  %vec22 = insertelement <16 x i16> %vec21, i16 %phi2, i64 2
+  %vec23 = insertelement <16 x i16> %vec22, i16 %phi3, i64 3
+  %vec24 = insertelement <16 x i16> %vec23, i16 %phi4, i64 4
+  %vec25 = insertelement <16 x i16> %vec24, i16 %phi5, i64 5
+  %vec26 = insertelement <16 x i16> %vec25, i16 %phi6, i64 6
+  %vec27 = insertelement <16 x i16> %vec26, i16 %phi7, i64 7
+  %vec28 = insertelement <16 x i16> %vec27, i16 %phi8, i64 8
+  %vec29 = insertelement <16 x i16> %vec28, i16 %phi9, i64 9
+  %vec210 = insertelement <16 x i16> %vec29, i16 %phi10, i64 10
+  %vec211 = insertelement <16 x i16> %vec210, i16 %phi11, i64 11
+  %vec212 = insertelement <16 x i16> %vec211, i16 %phi12, i64 12
+  %vec213 = insertelement <16 x i16> %vec212, i16 %phi13, i64 13
+  %vec214 = insertelement <16 x i16> %vec213, i16 %phi14, i64 14
+  %vec215 = insertelement <16 x i16> %vec214, i16 %phi15, i64 15
+
+  store <16 x i16> %vec115, ptr %out
+  store <16 x i16> %vec015, ptr %out1
+  store <16 x i16> %vec215, ptr %out2
+
+  ret void
+}

From e1e1836bbd70e4f30bd0be97b9d81eabfd6b45c8 Mon Sep 17 00:00:00 2001
From: Omair Javaid <omair.javaid@linaro.org>
Date: Thu, 12 Jun 2025 19:38:42 +0500
Subject: [PATCH 0144/1322] [CodeGen] Inline stack guard check on Windows
 (#136290)

This patch optimizes the Windows security cookie check mechanism by
moving the comparison inline and only calling __security_check_cookie
when the check fails. This reduces the overhead of making a DLL call
for every function return.

Previously, we implemented this optimization through a machine pass
(X86WinFixupBufferSecurityCheckPass) in PR #95904 submitted by
@mahesh-attarde. We have reverted that pass in favor of this new
approach. Also we have abandoned the AArch64 specific implementation
of same pass in PR #121938 in favor of this more general solution.

The old machine instruction pass approach:
- Scanned the generated code to find __security_check_cookie calls
- Modified these calls by splitting basic blocks
- Added comparison logic and conditional branching
- Required complex block management and live register computation

The new approach:
- Implements the same optimization during instruction selection
- Directly emits the comparison and conditional branching
- No need for post-processing or basic block manipulation
- Disables optimization at -Oz.

Thanks @tamaspetz, @efriedma-quic and @arsenm for their help.
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  95 +++++--
 .../CodeGen/SelectionDAG/SelectionDAGISel.cpp |   8 +-
 llvm/lib/Target/X86/CMakeLists.txt            |   1 -
 llvm/lib/Target/X86/X86.h                     |   4 -
 llvm/lib/Target/X86/X86TargetMachine.cpp      |   1 -
 .../X86/X86WinFixupBufferSecurityCheck.cpp    | 245 ------------------
 .../irtranslator-stack-protector-windows.ll   |  12 +-
 llvm/test/CodeGen/X86/opt-pipeline.ll         |   1 -
 .../CodeGen/X86/stack-protector-msvc-oz.ll    | 119 +++++++++
 llvm/test/CodeGen/X86/stack-protector-msvc.ll | 125 +++++++--
 llvm/test/CodeGen/X86/tailcc-ssp.ll           |  28 +-
 llvm/test/DebugInfo/COFF/fpo-stack-protect.ll |   7 +-
 12 files changed, 333 insertions(+), 313 deletions(-)
 delete mode 100644 llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp
 create mode 100644 llvm/test/CodeGen/X86/stack-protector-msvc-oz.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e6a1dc930685..c63eb7fc6b37 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3037,8 +3037,9 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
 
   // First create the loads to the guard/stack slot for the comparison.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
-  EVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout());
+  auto &DL = DAG.getDataLayout();
+  EVT PtrTy = TLI.getFrameIndexTy(DL);
+  EVT PtrMemTy = TLI.getPointerMemTy(DL, DL.getAllocaAddrSpace());
 
   MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
   int FI = MFI.getStackProtectorIndex();
@@ -3047,8 +3048,8 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
   SDLoc dl = getCurSDLoc();
   SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
   const Module &M = *ParentBB->getParent()->getFunction().getParent();
-  Align Align =
-      DAG.getDataLayout().getPrefTypeAlign(PointerType::get(M.getContext(), 0));
+  Align Align = DL.getPrefTypeAlign(
+      PointerType::get(M.getContext(), DL.getAllocaAddrSpace()));
 
   // Generate code to load the content of the guard slot.
   SDValue GuardVal = DAG.getLoad(
@@ -3059,8 +3060,14 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
   if (TLI.useStackGuardXorFP())
     GuardVal = TLI.emitStackGuardXorFP(DAG, GuardVal, dl);
 
-  // Retrieve guard check function, nullptr if instrumentation is inlined.
-  if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) {
+  // If we're using function-based instrumentation, call the guard check
+  // function
+  if (SPD.shouldEmitFunctionBasedCheckStackProtector()) {
+    // Get the guard check function from the target and verify it exists since
+    // we're using function-based instrumentation
+    const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M);
+    assert(GuardCheckFn && "Guard check function is null");
+
     // The target provides a guard check function to validate the guard value.
     // Generate a call to that function with the content of the guard slot as
     // argument.
@@ -3101,10 +3108,9 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
   }
 
   // Perform the comparison via a getsetcc.
-  SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(DAG.getDataLayout(),
-                                                        *DAG.getContext(),
-                                                        Guard.getValueType()),
-                             Guard, GuardVal, ISD::SETNE);
+  SDValue Cmp = DAG.getSetCC(
+      dl, TLI.getSetCCResultType(DL, *DAG.getContext(), Guard.getValueType()),
+      Guard, GuardVal, ISD::SETNE);
 
   // If the guard/stackslot do not equal, branch to failure MBB.
   SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
@@ -3126,14 +3132,69 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
 /// For a high level explanation of how this fits into the stack protector
 /// generation see the comment on the declaration of class
 /// StackProtectorDescriptor.
-void
-SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
+void SelectionDAGBuilder::visitSPDescriptorFailure(
+    StackProtectorDescriptor &SPD) {
+
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  TargetLowering::MakeLibCallOptions CallOptions;
-  CallOptions.setDiscardResult(true);
-  SDValue Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL,
-                                  MVT::isVoid, {}, CallOptions, getCurSDLoc())
-                      .second;
+  MachineBasicBlock *ParentBB = SPD.getParentMBB();
+  const Module &M = *ParentBB->getParent()->getFunction().getParent();
+  SDValue Chain;
+
+  // For -Oz builds with a guard check function, we use function-based
+  // instrumentation. Otherwise, if we have a guard check function, we call it
+  // in the failure block.
+  auto *GuardCheckFn = TLI.getSSPStackGuardCheck(M);
+  if (GuardCheckFn && !SPD.shouldEmitFunctionBasedCheckStackProtector()) {
+    // First create the loads to the guard/stack slot for the comparison.
+    auto &DL = DAG.getDataLayout();
+    EVT PtrTy = TLI.getFrameIndexTy(DL);
+    EVT PtrMemTy = TLI.getPointerMemTy(DL, DL.getAllocaAddrSpace());
+
+    MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
+    int FI = MFI.getStackProtectorIndex();
+
+    SDLoc dl = getCurSDLoc();
+    SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
+    Align Align = DL.getPrefTypeAlign(
+        PointerType::get(M.getContext(), DL.getAllocaAddrSpace()));
+
+    // Generate code to load the content of the guard slot.
+    SDValue GuardVal = DAG.getLoad(
+        PtrMemTy, dl, DAG.getEntryNode(), StackSlotPtr,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), Align,
+        MachineMemOperand::MOVolatile);
+
+    if (TLI.useStackGuardXorFP())
+      GuardVal = TLI.emitStackGuardXorFP(DAG, GuardVal, dl);
+
+    // The target provides a guard check function to validate the guard value.
+    // Generate a call to that function with the content of the guard slot as
+    // argument.
+    FunctionType *FnTy = GuardCheckFn->getFunctionType();
+    assert(FnTy->getNumParams() == 1 && "Invalid function signature");
+
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = GuardVal;
+    Entry.Ty = FnTy->getParamType(0);
+    if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg))
+      Entry.IsInReg = true;
+    Args.push_back(Entry);
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(DAG.getEntryNode())
+        .setCallee(GuardCheckFn->getCallingConv(), FnTy->getReturnType(),
+                   getValue(GuardCheckFn), std::move(Args));
+
+    Chain = TLI.LowerCallTo(CLI).second;
+  } else {
+    TargetLowering::MakeLibCallOptions CallOptions;
+    CallOptions.setDiscardResult(true);
+    Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid,
+                            {}, CallOptions, getCurSDLoc())
+                .second;
+  }
 
   // Emit a trap instruction if we are required to do so.
   const TargetOptions &TargetOpts = DAG.getTarget().Options;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index f59963756471..b02a03c0b0cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1877,7 +1877,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
 
     if (SP->shouldEmitSDCheck(*LLVMBB)) {
       bool FunctionBasedInstrumentation =
-          TLI->getSSPStackGuardCheck(*Fn.getParent());
+          TLI->getSSPStackGuardCheck(*Fn.getParent()) && Fn.hasMinSize();
       SDB->SPDescriptor.initialize(LLVMBB, FuncInfo->getMBB(LLVMBB),
                                    FunctionBasedInstrumentation);
     }
@@ -1950,8 +1950,7 @@ SelectionDAGISel::FinishBasicBlock() {
 
     // Add load and check to the basicblock.
     FuncInfo->MBB = ParentMBB;
-    FuncInfo->InsertPt =
-        findSplitPointForStackProtector(ParentMBB, *TII);
+    FuncInfo->InsertPt = findSplitPointForStackProtector(ParentMBB, *TII);
     SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
     CurDAG->setRoot(SDB->getRoot());
     SDB->clear();
@@ -1973,8 +1972,7 @@ SelectionDAGISel::FinishBasicBlock() {
         findSplitPointForStackProtector(ParentMBB, *TII);
 
     // Splice the terminator of ParentMBB into SuccessMBB.
-    SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
-                       SplitPoint,
+    SuccessMBB->splice(SuccessMBB->end(), ParentMBB, SplitPoint,
                        ParentMBB->end());
 
     // Add compare/jump on neq/jump to the parent BB.
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 6627e97dd094..1bf9f8b46799 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -85,7 +85,6 @@ set(sources
   X86VZeroUpper.cpp
   X86WinEHState.cpp
   X86WinEHUnwindV2.cpp
-  X86WinFixupBufferSecurityCheck.cpp
   X86InsertWait.cpp
   GISel/X86CallLowering.cpp
   GISel/X86InstructionSelector.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index be2ddac35cab..6261fadf10a7 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -73,9 +73,6 @@ FunctionPass *createX86OptimizeLEAs();
 /// Return a pass that transforms setcc + movzx pairs into xor + setcc.
 FunctionPass *createX86FixupSetCC();
 
-/// Return a pass that transform inline buffer security check into seperate bb
-FunctionPass *createX86WinFixupBufferSecurityCheckPass();
-
 /// Return a pass that avoids creating store forward block issues in the hardware.
 FunctionPass *createX86AvoidStoreForwardingBlocks();
 
@@ -195,7 +192,6 @@ void initializeX86ExpandPseudoPass(PassRegistry &);
 void initializeX86FastPreTileConfigPass(PassRegistry &);
 void initializeX86FastTileConfigPass(PassRegistry &);
 void initializeX86FixupSetCCPassPass(PassRegistry &);
-void initializeX86WinFixupBufferSecurityCheckPassPass(PassRegistry &);
 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
 void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &);
 void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 7e75c0e56586..2d4afc23f1a4 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -554,7 +554,6 @@ bool X86PassConfig::addPreISel() {
 void X86PassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOptLevel::None) {
     addPass(&LiveRangeShrinkID);
-    addPass(createX86WinFixupBufferSecurityCheckPass());
     addPass(createX86FixupSetCC());
     addPass(createX86OptimizeLEAs());
     addPass(createX86CallFrameOptimization());
diff --git a/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp b/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp
deleted file mode 100644
index 5c12af1fee63..000000000000
--- a/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-//===- X86WinFixupBufferSecurityCheck.cpp Fix Buffer Security Check Call -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Buffer Security Check implementation inserts windows specific callback into
-// code. On windows, __security_check_cookie call gets call everytime function
-// is return without fixup. Since this function is defined in runtime library,
-// it incures cost of call in dll which simply does comparison and returns most
-// time. With Fixup, We selective move to call in DLL only if comparison fails.
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86FrameLowering.h"
-#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Module.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-win-fixup-bscheck"
-
-namespace {
-
-class X86WinFixupBufferSecurityCheckPass : public MachineFunctionPass {
-public:
-  static char ID;
-
-  X86WinFixupBufferSecurityCheckPass() : MachineFunctionPass(ID) {}
-
-  StringRef getPassName() const override {
-    return "X86 Windows Fixup Buffer Security Check";
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  std::pair<MachineBasicBlock *, MachineInstr *>
-  getSecurityCheckerBasicBlock(MachineFunction &MF);
-
-  void getGuardCheckSequence(MachineBasicBlock *CurMBB, MachineInstr *CheckCall,
-                             MachineInstr *SeqMI[5]);
-
-  void SplitBasicBlock(MachineBasicBlock *CurMBB, MachineBasicBlock *NewRetMBB,
-                       MachineBasicBlock::iterator SplitIt);
-
-  void FinishBlock(MachineBasicBlock *MBB);
-
-  void FinishFunction(MachineBasicBlock *FailMBB, MachineBasicBlock *NewRetMBB);
-
-  std::pair<MachineInstr *, MachineInstr *>
-  CreateFailCheckSequence(MachineBasicBlock *CurMBB, MachineBasicBlock *FailMBB,
-                          MachineInstr *SeqMI[5]);
-};
-} // end anonymous namespace
-
-char X86WinFixupBufferSecurityCheckPass::ID = 0;
-
-INITIALIZE_PASS(X86WinFixupBufferSecurityCheckPass, DEBUG_TYPE, DEBUG_TYPE,
-                false, false)
-
-FunctionPass *llvm::createX86WinFixupBufferSecurityCheckPass() {
-  return new X86WinFixupBufferSecurityCheckPass();
-}
-
-void X86WinFixupBufferSecurityCheckPass::SplitBasicBlock(
-    MachineBasicBlock *CurMBB, MachineBasicBlock *NewRetMBB,
-    MachineBasicBlock::iterator SplitIt) {
-  NewRetMBB->splice(NewRetMBB->end(), CurMBB, SplitIt, CurMBB->end());
-}
-
-std::pair<MachineBasicBlock *, MachineInstr *>
-X86WinFixupBufferSecurityCheckPass::getSecurityCheckerBasicBlock(
-    MachineFunction &MF) {
-  MachineBasicBlock::reverse_iterator RBegin, REnd;
-
-  for (auto &MBB : llvm::reverse(MF)) {
-    for (RBegin = MBB.rbegin(), REnd = MBB.rend(); RBegin != REnd; ++RBegin) {
-      auto &MI = *RBegin;
-      if (MI.getOpcode() == X86::CALL64pcrel32 &&
-          MI.getNumExplicitOperands() == 1) {
-        auto MO = MI.getOperand(0);
-        if (MO.isGlobal()) {
-          auto Callee = dyn_cast<Function>(MO.getGlobal());
-          if (Callee && Callee->getName() == "__security_check_cookie") {
-            return std::make_pair(&MBB, &MI);
-            break;
-          }
-        }
-      }
-    }
-  }
-  return std::make_pair(nullptr, nullptr);
-}
-
-void X86WinFixupBufferSecurityCheckPass::getGuardCheckSequence(
-    MachineBasicBlock *CurMBB, MachineInstr *CheckCall,
-    MachineInstr *SeqMI[5]) {
-
-  MachineBasicBlock::iterator UIt(CheckCall);
-  MachineBasicBlock::reverse_iterator DIt(CheckCall);
-  // Seq From StackUp to Stack Down Is fixed.
-  // ADJCALLSTACKUP64
-  ++UIt;
-  SeqMI[4] = &*UIt;
-
-  // CALL __security_check_cookie
-  SeqMI[3] = CheckCall;
-
-  // COPY function slot cookie
-  ++DIt;
-  SeqMI[2] = &*DIt;
-
-  // ADJCALLSTACKDOWN64
-  ++DIt;
-  SeqMI[1] = &*DIt;
-
-  MachineBasicBlock::reverse_iterator XIt(SeqMI[1]);
-  for (; XIt != CurMBB->rbegin(); ++XIt) {
-    auto &CI = *XIt;
-    if ((CI.getOpcode() == X86::XOR64_FP) || (CI.getOpcode() == X86::XOR32_FP))
-      break;
-  }
-  SeqMI[0] = &*XIt;
-}
-
-std::pair<MachineInstr *, MachineInstr *>
-X86WinFixupBufferSecurityCheckPass::CreateFailCheckSequence(
-    MachineBasicBlock *CurMBB, MachineBasicBlock *FailMBB,
-    MachineInstr *SeqMI[5]) {
-
-  auto MF = CurMBB->getParent();
-
-  Module &M = *MF->getFunction().getParent();
-  GlobalVariable *GV = M.getGlobalVariable("__security_cookie");
-  assert(GV && " Security Cookie was not installed!");
-
-  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-
-  MachineInstr *GuardXor = SeqMI[0];
-  MachineBasicBlock::iterator InsertPt(GuardXor);
-  ++InsertPt;
-
-  // Compare security_Cookie with XOR_Val, if not same, we have violation
-  auto CMI = BuildMI(*CurMBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rm))
-                 .addReg(GuardXor->getOperand(0).getReg())
-                 .addReg(X86::RIP)
-                 .addImm(1)
-                 .addReg(X86::NoRegister)
-                 .addGlobalAddress(GV)
-                 .addReg(X86::NoRegister);
-
-  BuildMI(*CurMBB, InsertPt, DebugLoc(), TII->get(X86::JCC_1))
-      .addMBB(FailMBB)
-      .addImm(X86::COND_NE);
-
-  auto JMI = BuildMI(*CurMBB, InsertPt, DebugLoc(), TII->get(X86::JMP_1));
-
-  return std::make_pair(CMI.getInstr(), JMI.getInstr());
-}
-
-void X86WinFixupBufferSecurityCheckPass::FinishBlock(MachineBasicBlock *MBB) {
-  LivePhysRegs LiveRegs;
-  computeAndAddLiveIns(LiveRegs, *MBB);
-}
-
-void X86WinFixupBufferSecurityCheckPass::FinishFunction(
-    MachineBasicBlock *FailMBB, MachineBasicBlock *NewRetMBB) {
-  FailMBB->getParent()->RenumberBlocks();
-  // FailMBB includes call to MSCV RT  where is __security_check_cookie
-  // function is called. This function uses regcall and it expects cookie
-  // value from stack slot.( even if this is modified)
-  // Before going further we compute back livein for this block to make sure
-  // it is live and provided.
-  FinishBlock(FailMBB);
-  FinishBlock(NewRetMBB);
-}
-
-bool X86WinFixupBufferSecurityCheckPass::runOnMachineFunction(
-    MachineFunction &MF) {
-  bool Changed = false;
-  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
-
-  if (!(STI.isTargetWindowsItanium() || STI.isTargetWindowsMSVC()))
-    return Changed;
-
-  // Check if security cookie was installed or not
-  Module &M = *MF.getFunction().getParent();
-  GlobalVariable *GV = M.getGlobalVariable("__security_cookie");
-  if (!GV)
-    return Changed;
-
-  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-
-  // Check if security check cookie was installed or not
-  auto [CurMBB, CheckCall] = getSecurityCheckerBasicBlock(MF);
-
-  if (!CheckCall)
-    return Changed;
-
-  MachineBasicBlock *FailMBB = MF.CreateMachineBasicBlock();
-  MachineBasicBlock *NewRetMBB = MF.CreateMachineBasicBlock();
-
-  MF.insert(MF.end(), NewRetMBB);
-  MF.insert(MF.end(), FailMBB);
-
-  MachineInstr *SeqMI[5];
-  getGuardCheckSequence(CurMBB, CheckCall, SeqMI);
-  // MachineInstr * GuardXor  = SeqMI[0];
-
-  auto FailSeqRange = CreateFailCheckSequence(CurMBB, FailMBB, SeqMI);
-  MachineInstrBuilder JMI(MF, FailSeqRange.second);
-
-  // After Inserting JMP_1, we can not have two terminators
-  // in same block, split CurrentMBB after JMP_1
-  MachineBasicBlock::iterator SplitIt(SeqMI[4]);
-  ++SplitIt;
-  SplitBasicBlock(CurMBB, NewRetMBB, SplitIt);
-
-  // Fill up Failure Routine, move Fail Check Squence from CurMBB to FailMBB
-  MachineBasicBlock::iterator U1It(SeqMI[1]);
-  MachineBasicBlock::iterator U2It(SeqMI[4]);
-  ++U2It;
-  FailMBB->splice(FailMBB->end(), CurMBB, U1It, U2It);
-  BuildMI(*FailMBB, FailMBB->end(), DebugLoc(), TII->get(X86::INT3));
-
-  // Move left over instruction after StackUp
-  // from Current Basic BLocks into New Return Block
-  JMI.addMBB(NewRetMBB);
-  MachineBasicBlock::iterator SplicePt(JMI.getInstr());
-  ++SplicePt;
-  if (SplicePt != CurMBB->end())
-    NewRetMBB->splice(NewRetMBB->end(), CurMBB, SplicePt);
-
-  // Restructure Basic Blocks
-  CurMBB->addSuccessor(NewRetMBB);
-  CurMBB->addSuccessor(FailMBB);
-
-  FinishFunction(FailMBB, NewRetMBB);
-  return !Changed;
-}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-protector-windows.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-protector-windows.ll
index 6aefc5341da0..e7f4785d01df 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-protector-windows.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-protector-windows.ll
@@ -17,8 +17,12 @@ define void @caller() sspreq {
 ; CHECK-NEXT:    ldr x8, [x8, :lo12:__security_cookie]
 ; CHECK-NEXT:    str x8, [sp, #8]
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    ldr x0, [sp, #8]
-; CHECK-NEXT:    bl __security_check_cookie
+; CHECK-NEXT:    adrp x8, __security_cookie
+; CHECK-NEXT:    ldr x9, [sp, #8]
+; CHECK-NEXT:    ldr x8, [x8, :lo12:__security_cookie]
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    b.ne .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    .seh_startepilogue
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x30, 16
@@ -26,6 +30,10 @@ define void @caller() sspreq {
 ; CHECK-NEXT:    .seh_stackalloc 32
 ; CHECK-NEXT:    .seh_endepilogue
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %entry
+; CHECK-NEXT:    ldr x0, [sp, #8]
+; CHECK-NEXT:    bl __security_check_cookie
+; CHECK-NEXT:    brk #0x1
 ; CHECK-NEXT:    .seh_endfunclet
 ; CHECK-NEXT:    .seh_endproc
 entry:
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 540046e6a863..8d155bd57df1 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -121,7 +121,6 @@
 ; CHECK-NEXT:       Peephole Optimizations
 ; CHECK-NEXT:       Remove dead machine instructions
 ; CHECK-NEXT:       Live Range Shrink
-; CHECK-NEXT:       X86 Windows Fixup Buffer Security Check
 ; CHECK-NEXT:       X86 Fixup SetCC
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 LEA Optimize
diff --git a/llvm/test/CodeGen/X86/stack-protector-msvc-oz.ll b/llvm/test/CodeGen/X86/stack-protector-msvc-oz.ll
new file mode 100644
index 000000000000..d8a772efbd7e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-protector-msvc-oz.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-X86 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-X64 %s
+
+; Make sure fastisel falls back and does something secure.
+; RUN: llc -mtriple=i686-pc-windows-msvc -O0 < %s -o - | FileCheck -check-prefix=MSVC-X86-O0 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -O0 < %s -o - | FileCheck -check-prefix=MSVC-X64-O0 %s
+
+@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00"    ; <ptr> [#uses=1]
+
+define void @test(ptr %a) nounwind ssp minsize {
+; MSVC-X86-LABEL: test:
+; MSVC-X86:       # %bb.0: # %entry
+; MSVC-X86-NEXT:    pushl %esi
+; MSVC-X86-NEXT:    subl $12, %esp
+; MSVC-X86-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-NEXT:    xorl %esp, %eax
+; MSVC-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; MSVC-X86-NEXT:    movl %esp, %esi
+; MSVC-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; MSVC-X86-NEXT:    pushl %esi
+; MSVC-X86-NEXT:    calll _strcpy
+; MSVC-X86-NEXT:    popl %ecx
+; MSVC-X86-NEXT:    popl %edx
+; MSVC-X86-NEXT:    pushl %esi
+; MSVC-X86-NEXT:    pushl $LC
+; MSVC-X86-NEXT:    calll _printf
+; MSVC-X86-NEXT:    popl %ecx
+; MSVC-X86-NEXT:    popl %edx
+; MSVC-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-NEXT:    xorl %esp, %ecx
+; MSVC-X86-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-NEXT:    addl $12, %esp
+; MSVC-X86-NEXT:    popl %esi
+; MSVC-X86-NEXT:    retl
+;
+; MSVC-X64-LABEL: test:
+; MSVC-X64:       # %bb.0: # %entry
+; MSVC-X64-NEXT:    pushq %rsi
+; MSVC-X64-NEXT:    subq $64, %rsp
+; MSVC-X64-NEXT:    movq %rcx, %rdx
+; MSVC-X64-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-NEXT:    xorq %rsp, %rax
+; MSVC-X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; MSVC-X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; MSVC-X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; MSVC-X64-NEXT:    movq %rsi, %rcx
+; MSVC-X64-NEXT:    callq strcpy
+; MSVC-X64-NEXT:    leaq LC(%rip), %rcx
+; MSVC-X64-NEXT:    movq %rsi, %rdx
+; MSVC-X64-NEXT:    callq printf
+; MSVC-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-NEXT:    xorq %rsp, %rcx
+; MSVC-X64-NEXT:    callq __security_check_cookie
+; MSVC-X64-NEXT:    addq $64, %rsp
+; MSVC-X64-NEXT:    popq %rsi
+; MSVC-X64-NEXT:    retq
+;
+; MSVC-X86-O0-LABEL: test:
+; MSVC-X86-O0:       # %bb.0: # %entry
+; MSVC-X86-O0-NEXT:    subl $20, %esp
+; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MSVC-X86-O0-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-O0-NEXT:    xorl %esp, %eax
+; MSVC-X86-O0-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-O0-NEXT:    movl %esp, %eax
+; MSVC-X86-O0-NEXT:    movl %ecx, 4(%eax)
+; MSVC-X86-O0-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-O0-NEXT:    movl %ecx, (%eax)
+; MSVC-X86-O0-NEXT:    calll _strcpy
+; MSVC-X86-O0-NEXT:    leal LC, %ecx
+; MSVC-X86-O0-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; MSVC-X86-O0-NEXT:    movl %ecx, (%esp)
+; MSVC-X86-O0-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; MSVC-X86-O0-NEXT:    calll _printf
+; MSVC-X86-O0-NEXT:  # %bb.1: # %return
+; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-O0-NEXT:    xorl %esp, %ecx
+; MSVC-X86-O0-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-O0-NEXT:    addl $20, %esp
+; MSVC-X86-O0-NEXT:    retl
+;
+; MSVC-X64-O0-LABEL: test:
+; MSVC-X64-O0:       # %bb.0: # %entry
+; MSVC-X64-O0-NEXT:    subq $56, %rsp
+; MSVC-X64-O0-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-O0-NEXT:    xorq %rsp, %rax
+; MSVC-X64-O0-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; MSVC-X64-O0-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; MSVC-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; MSVC-X64-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-O0-NEXT:    callq strcpy
+; MSVC-X64-O0-NEXT:    leaq LC(%rip), %rcx
+; MSVC-X64-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; MSVC-X64-O0-NEXT:    callq printf
+; MSVC-X64-O0-NEXT:  # %bb.1: # %return
+; MSVC-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-O0-NEXT:    xorq %rsp, %rcx
+; MSVC-X64-O0-NEXT:    callq __security_check_cookie
+; MSVC-X64-O0-NEXT:    addq $56, %rsp
+; MSVC-X64-O0-NEXT:    retq
+entry:
+ %a_addr = alloca ptr    ; <ptr> [#uses=2]
+ %buf = alloca [8 x i8]    ; <ptr> [#uses=2]
+ store ptr %a, ptr %a_addr
+ %0 = load ptr, ptr %a_addr, align 4    ; <ptr> [#uses=1]
+ %1 = call ptr @strcpy(ptr %buf, ptr %0) nounwind   ; <ptr> [#uses=0]
+ %2 = call i32 (ptr, ...) @printf(ptr @"\01LC", ptr %buf) nounwind    ; <i32> [#uses=0]
+ br label %return
+
+return:    ; preds = %entry
+ ret void
+}
+
+declare ptr @strcpy(ptr, ptr) nounwind
+
+declare i32 @printf(ptr, ...) nounwind
+
diff --git a/llvm/test/CodeGen/X86/stack-protector-msvc.ll b/llvm/test/CodeGen/X86/stack-protector-msvc.ll
index d718062d2c48..a868fa549296 100644
--- a/llvm/test/CodeGen/X86/stack-protector-msvc.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-msvc.ll
@@ -25,12 +25,19 @@ define void @test(ptr %a) nounwind ssp {
 ; MSVC-X86-NEXT:    pushl $LC
 ; MSVC-X86-NEXT:    calll _printf
 ; MSVC-X86-NEXT:    addl $8, %esp
-; MSVC-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; MSVC-X86-NEXT:    xorl %esp, %ecx
-; MSVC-X86-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MSVC-X86-NEXT:    xorl %esp, %eax
+; MSVC-X86-NEXT:    movl ___security_cookie, %ecx
+; MSVC-X86-NEXT:    cmpl %eax, %ecx
+; MSVC-X86-NEXT:    jne LBB0_2
+; MSVC-X86-NEXT:  # %bb.1: # %return
 ; MSVC-X86-NEXT:    addl $12, %esp
 ; MSVC-X86-NEXT:    popl %esi
 ; MSVC-X86-NEXT:    retl
+; MSVC-X86-NEXT:  LBB0_2: # %return
+; MSVC-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-NEXT:    xorl %esp, %ecx
+; MSVC-X86-NEXT:    calll @__security_check_cookie@4
 ;
 ; MSVC-X64-LABEL: test:
 ; MSVC-X64:       # %bb.0: # %entry
@@ -47,17 +54,19 @@ define void @test(ptr %a) nounwind ssp {
 ; MSVC-X64-NEXT:    leaq LC(%rip), %rcx
 ; MSVC-X64-NEXT:    movq %rsi, %rdx
 ; MSVC-X64-NEXT:    callq printf
-; MSVC-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; MSVC-X64-NEXT:    xorq %rsp, %rcx
-; MSVC-X64-NEXT:    cmpq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; MSVC-X64-NEXT:    xorq %rsp, %rax
+; MSVC-X64-NEXT:    movq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    cmpq %rax, %rcx
 ; MSVC-X64-NEXT:    jne .LBB0_2
-; MSVC-X64-NEXT:  # %bb.1:
+; MSVC-X64-NEXT:  # %bb.1: # %return
 ; MSVC-X64-NEXT:    addq $64, %rsp
 ; MSVC-X64-NEXT:    popq %rsi
 ; MSVC-X64-NEXT:    retq
-; MSVC-X64-NEXT:  .LBB0_2:
+; MSVC-X64-NEXT:  .LBB0_2: # %return
+; MSVC-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-NEXT:    xorq %rsp, %rcx
 ; MSVC-X64-NEXT:    callq __security_check_cookie
-; MSVC-X64-NEXT:    int3
 ;
 ; MSVC-X86-O0-LABEL: test:
 ; MSVC-X86-O0:       # %bb.0: # %entry
@@ -80,7 +89,15 @@ define void @test(ptr %a) nounwind ssp {
 ; MSVC-X86-O0-NEXT:  # %bb.1: # %return
 ; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; MSVC-X86-O0-NEXT:    xorl %esp, %ecx
+; MSVC-X86-O0-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-O0-NEXT:    subl %ecx, %eax
+; MSVC-X86-O0-NEXT:    jne LBB0_3
+; MSVC-X86-O0-NEXT:    jmp LBB0_2
+; MSVC-X86-O0-NEXT:  LBB0_3: # %return
+; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-O0-NEXT:    xorl %esp, %ecx
 ; MSVC-X86-O0-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-O0-NEXT:  LBB0_2: # %return
 ; MSVC-X86-O0-NEXT:    addl $20, %esp
 ; MSVC-X86-O0-NEXT:    retl
 ;
@@ -100,9 +117,18 @@ define void @test(ptr %a) nounwind ssp {
 ; MSVC-X64-O0-NEXT:  # %bb.1: # %return
 ; MSVC-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; MSVC-X64-O0-NEXT:    xorq %rsp, %rcx
+; MSVC-X64-O0-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-O0-NEXT:    subq %rcx, %rax
+; MSVC-X64-O0-NEXT:    jne .LBB0_3
+; MSVC-X64-O0-NEXT:    jmp .LBB0_2
+; MSVC-X64-O0-NEXT:  .LBB0_3: # %return
+; MSVC-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-O0-NEXT:    xorq %rsp, %rcx
 ; MSVC-X64-O0-NEXT:    callq __security_check_cookie
+; MSVC-X64-O0-NEXT:  .LBB0_2: # %return
 ; MSVC-X64-O0-NEXT:    addq $56, %rsp
 ; MSVC-X64-O0-NEXT:    retq
+
 entry:
  %a_addr = alloca ptr    ; <ptr> [#uses=2]
  %buf = alloca [8 x i8]    ; <ptr> [#uses=2]
@@ -134,12 +160,19 @@ define void @test_vla(i32 %n) nounwind ssp {
 ; MSVC-X86-NEXT:    pushl %eax
 ; MSVC-X86-NEXT:    calll _escape
 ; MSVC-X86-NEXT:    addl $4, %esp
-; MSVC-X86-NEXT:    movl -4(%ebp), %ecx
-; MSVC-X86-NEXT:    xorl %ebp, %ecx
-; MSVC-X86-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-NEXT:    movl -4(%ebp), %eax
+; MSVC-X86-NEXT:    xorl %ebp, %eax
+; MSVC-X86-NEXT:    movl ___security_cookie, %ecx
+; MSVC-X86-NEXT:    cmpl %eax, %ecx
+; MSVC-X86-NEXT:    jne LBB1_2
+; MSVC-X86-NEXT:  # %bb.1:
 ; MSVC-X86-NEXT:    movl %ebp, %esp
 ; MSVC-X86-NEXT:    popl %ebp
 ; MSVC-X86-NEXT:    retl
+; MSVC-X86-NEXT:  LBB1_2:
+; MSVC-X86-NEXT:    movl -4(%ebp), %ecx
+; MSVC-X86-NEXT:    xorl %ebp, %ecx
+; MSVC-X86-NEXT:    calll @__security_check_cookie@4
 ;
 ; MSVC-X64-LABEL: test_vla:
 ; MSVC-X64:       # %bb.0:
@@ -158,19 +191,20 @@ define void @test_vla(i32 %n) nounwind ssp {
 ; MSVC-X64-NEXT:    subq $32, %rsp
 ; MSVC-X64-NEXT:    callq escape
 ; MSVC-X64-NEXT:    addq $32, %rsp
-; MSVC-X64-NEXT:    movq -8(%rbp), %rcx
-; MSVC-X64-NEXT:    xorq %rbp, %rcx
-; MSVC-X64-NEXT:    cmpq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    movq -8(%rbp), %rax
+; MSVC-X64-NEXT:    xorq %rbp, %rax
+; MSVC-X64-NEXT:    movq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    cmpq %rax, %rcx
 ; MSVC-X64-NEXT:    jne .LBB1_2
 ; MSVC-X64-NEXT:  # %bb.1:
 ; MSVC-X64-NEXT:    movq %rbp, %rsp
 ; MSVC-X64-NEXT:    popq %rbp
 ; MSVC-X64-NEXT:    retq
 ; MSVC-X64-NEXT:  .LBB1_2:
+; MSVC-X64-NEXT:    movq -8(%rbp), %rcx
+; MSVC-X64-NEXT:    xorq %rbp, %rcx
 ; MSVC-X64-NEXT:    subq $32, %rsp
 ; MSVC-X64-NEXT:    callq __security_check_cookie
-; MSVC-X64-NEXT:    addq $32, %rsp
-; MSVC-X64-NEXT:    int3
 ;
 ; MSVC-X86-O0-LABEL: test_vla:
 ; MSVC-X86-O0:       # %bb.0:
@@ -190,7 +224,15 @@ define void @test_vla(i32 %n) nounwind ssp {
 ; MSVC-X86-O0-NEXT:    addl $4, %esp
 ; MSVC-X86-O0-NEXT:    movl -4(%ebp), %ecx
 ; MSVC-X86-O0-NEXT:    xorl %ebp, %ecx
+; MSVC-X86-O0-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-O0-NEXT:    subl %ecx, %eax
+; MSVC-X86-O0-NEXT:    jne LBB1_2
+; MSVC-X86-O0-NEXT:    jmp LBB1_1
+; MSVC-X86-O0-NEXT:  LBB1_2:
+; MSVC-X86-O0-NEXT:    movl -4(%ebp), %ecx
+; MSVC-X86-O0-NEXT:    xorl %ebp, %ecx
 ; MSVC-X86-O0-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-O0-NEXT:  LBB1_1:
 ; MSVC-X86-O0-NEXT:    movl %ebp, %esp
 ; MSVC-X86-O0-NEXT:    popl %ebp
 ; MSVC-X86-O0-NEXT:    retl
@@ -215,8 +257,16 @@ define void @test_vla(i32 %n) nounwind ssp {
 ; MSVC-X64-O0-NEXT:    addq $32, %rsp
 ; MSVC-X64-O0-NEXT:    movq -8(%rbp), %rcx
 ; MSVC-X64-O0-NEXT:    xorq %rbp, %rcx
+; MSVC-X64-O0-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-O0-NEXT:    subq %rcx, %rax
+; MSVC-X64-O0-NEXT:    jne .LBB1_2
+; MSVC-X64-O0-NEXT:    jmp .LBB1_1
+; MSVC-X64-O0-NEXT:  .LBB1_2:
+; MSVC-X64-O0-NEXT:    movq -8(%rbp), %rcx
+; MSVC-X64-O0-NEXT:    xorq %rbp, %rcx
 ; MSVC-X64-O0-NEXT:    subq $32, %rsp
 ; MSVC-X64-O0-NEXT:    callq __security_check_cookie
+; MSVC-X64-O0-NEXT:  .LBB1_1:
 ; MSVC-X64-O0-NEXT:    movq %rbp, %rsp
 ; MSVC-X64-O0-NEXT:    popq %rbp
 ; MSVC-X64-O0-NEXT:    retq
@@ -253,14 +303,21 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X86-NEXT:    pushl %edi
 ; MSVC-X86-NEXT:    calll _escape
 ; MSVC-X86-NEXT:    addl $4, %esp
-; MSVC-X86-NEXT:    movl 12(%esi), %ecx
-; MSVC-X86-NEXT:    xorl %ebp, %ecx
-; MSVC-X86-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-NEXT:    movl 12(%esi), %eax
+; MSVC-X86-NEXT:    xorl %ebp, %eax
+; MSVC-X86-NEXT:    movl ___security_cookie, %ecx
+; MSVC-X86-NEXT:    cmpl %eax, %ecx
+; MSVC-X86-NEXT:    jne LBB2_2
+; MSVC-X86-NEXT:  # %bb.1:
 ; MSVC-X86-NEXT:    leal -8(%ebp), %esp
 ; MSVC-X86-NEXT:    popl %esi
 ; MSVC-X86-NEXT:    popl %edi
 ; MSVC-X86-NEXT:    popl %ebp
 ; MSVC-X86-NEXT:    retl
+; MSVC-X86-NEXT:  LBB2_2:
+; MSVC-X86-NEXT:    movl 12(%esi), %ecx
+; MSVC-X86-NEXT:    xorl %ebp, %ecx
+; MSVC-X86-NEXT:    calll @__security_check_cookie@4
 ;
 ; MSVC-X64-LABEL: test_vla_realign:
 ; MSVC-X64:       # %bb.0:
@@ -286,9 +343,10 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X64-NEXT:    movq %rsi, %rcx
 ; MSVC-X64-NEXT:    callq escape
 ; MSVC-X64-NEXT:    addq $32, %rsp
-; MSVC-X64-NEXT:    movq 24(%rbx), %rcx
-; MSVC-X64-NEXT:    xorq %rbp, %rcx
-; MSVC-X64-NEXT:    cmpq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    movq 24(%rbx), %rax
+; MSVC-X64-NEXT:    xorq %rbp, %rax
+; MSVC-X64-NEXT:    movq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    cmpq %rax, %rcx
 ; MSVC-X64-NEXT:    jne .LBB2_2
 ; MSVC-X64-NEXT:  # %bb.1:
 ; MSVC-X64-NEXT:    movq %rbp, %rsp
@@ -297,10 +355,10 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X64-NEXT:    popq %rbp
 ; MSVC-X64-NEXT:    retq
 ; MSVC-X64-NEXT:  .LBB2_2:
+; MSVC-X64-NEXT:    movq 24(%rbx), %rcx
+; MSVC-X64-NEXT:    xorq %rbp, %rcx
 ; MSVC-X64-NEXT:    subq $32, %rsp
 ; MSVC-X64-NEXT:    callq __security_check_cookie
-; MSVC-X64-NEXT:    addq $32, %rsp
-; MSVC-X64-NEXT:    int3
 ;
 ; MSVC-X86-O0-LABEL: test_vla_realign:
 ; MSVC-X86-O0:       # %bb.0:
@@ -328,7 +386,15 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X86-O0-NEXT:    addl $4, %esp
 ; MSVC-X86-O0-NEXT:    movl 48(%esi), %ecx
 ; MSVC-X86-O0-NEXT:    xorl %ebp, %ecx
+; MSVC-X86-O0-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-O0-NEXT:    subl %ecx, %eax
+; MSVC-X86-O0-NEXT:    jne LBB2_2
+; MSVC-X86-O0-NEXT:    jmp LBB2_1
+; MSVC-X86-O0-NEXT:  LBB2_2:
+; MSVC-X86-O0-NEXT:    movl 48(%esi), %ecx
+; MSVC-X86-O0-NEXT:    xorl %ebp, %ecx
 ; MSVC-X86-O0-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-O0-NEXT:  LBB2_1:
 ; MSVC-X86-O0-NEXT:    leal -4(%ebp), %esp
 ; MSVC-X86-O0-NEXT:    popl %esi
 ; MSVC-X86-O0-NEXT:    popl %ebp
@@ -361,8 +427,16 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X64-O0-NEXT:    addq $32, %rsp
 ; MSVC-X64-O0-NEXT:    movq 64(%rbx), %rcx
 ; MSVC-X64-O0-NEXT:    xorq %rbp, %rcx
+; MSVC-X64-O0-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-O0-NEXT:    subq %rcx, %rax
+; MSVC-X64-O0-NEXT:    jne .LBB2_2
+; MSVC-X64-O0-NEXT:    jmp .LBB2_1
+; MSVC-X64-O0-NEXT:  .LBB2_2:
+; MSVC-X64-O0-NEXT:    movq 64(%rbx), %rcx
+; MSVC-X64-O0-NEXT:    xorq %rbp, %rcx
 ; MSVC-X64-O0-NEXT:    subq $32, %rsp
 ; MSVC-X64-O0-NEXT:    callq __security_check_cookie
+; MSVC-X64-O0-NEXT:  .LBB2_1:
 ; MSVC-X64-O0-NEXT:    leaq 8(%rbp), %rsp
 ; MSVC-X64-O0-NEXT:    popq %rbx
 ; MSVC-X64-O0-NEXT:    popq %rbp
@@ -377,3 +451,4 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 declare ptr @strcpy(ptr, ptr) nounwind
 
 declare i32 @printf(ptr, ...) nounwind
+
diff --git a/llvm/test/CodeGen/X86/tailcc-ssp.ll b/llvm/test/CodeGen/X86/tailcc-ssp.ll
index 7ea5dd49f024..ac5dda7d69bd 100644
--- a/llvm/test/CodeGen/X86/tailcc-ssp.ll
+++ b/llvm/test/CodeGen/X86/tailcc-ssp.ll
@@ -13,9 +13,10 @@ define tailcc void @tailcall_frame(ptr %0, i64 %1) sspreq {
 ; WINDOWS-NEXT:    movq __security_cookie(%rip), %rax
 ; WINDOWS-NEXT:    xorq %rsp, %rax
 ; WINDOWS-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; WINDOWS-NEXT:    xorq %rsp, %rcx
-; WINDOWS-NEXT:    cmpq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WINDOWS-NEXT:    xorq %rsp, %rax
+; WINDOWS-NEXT:    movq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    cmpq %rax, %rcx
 ; WINDOWS-NEXT:    jne .LBB0_1
 ; WINDOWS-NEXT:  # %bb.2:
 ; WINDOWS-NEXT:    xorl %ecx, %ecx
@@ -26,6 +27,8 @@ define tailcc void @tailcall_frame(ptr %0, i64 %1) sspreq {
 ; WINDOWS-NEXT:    .seh_endepilogue
 ; WINDOWS-NEXT:    jmp h # TAILCALL
 ; WINDOWS-NEXT:  .LBB0_1:
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; WINDOWS-NEXT:    xorq %rsp, %rcx
 ; WINDOWS-NEXT:    callq __security_check_cookie
 ; WINDOWS-NEXT:    int3
 ; WINDOWS-NEXT:    .seh_endproc
@@ -49,7 +52,6 @@ define tailcc void @tailcall_frame(ptr %0, i64 %1) sspreq {
 ; LINUX-NEXT:  .LBB0_2: # %CallStackCheckFailBlk
 ; LINUX-NEXT:    .cfi_def_cfa_offset 32
 ; LINUX-NEXT:    callq __stack_chk_fail@PLT
-
    tail call tailcc void @h(ptr null, i64 0, ptr null)
    ret void
 }
@@ -65,9 +67,10 @@ define void @tailcall_unrelated_frame() sspreq {
 ; WINDOWS-NEXT:    xorq %rsp, %rax
 ; WINDOWS-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; WINDOWS-NEXT:    callq bar
-; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; WINDOWS-NEXT:    xorq %rsp, %rcx
-; WINDOWS-NEXT:    cmpq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WINDOWS-NEXT:    xorq %rsp, %rax
+; WINDOWS-NEXT:    movq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    cmpq %rax, %rcx
 ; WINDOWS-NEXT:    jne .LBB1_1
 ; WINDOWS-NEXT:  # %bb.2:
 ; WINDOWS-NEXT:    .seh_startepilogue
@@ -75,6 +78,8 @@ define void @tailcall_unrelated_frame() sspreq {
 ; WINDOWS-NEXT:    .seh_endepilogue
 ; WINDOWS-NEXT:    jmp bar # TAILCALL
 ; WINDOWS-NEXT:  .LBB1_1:
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; WINDOWS-NEXT:    xorq %rsp, %rcx
 ; WINDOWS-NEXT:    callq __security_check_cookie
 ; WINDOWS-NEXT:    int3
 ; WINDOWS-NEXT:    .seh_endproc
@@ -115,9 +120,10 @@ define void @caller() sspreq {
 ; WINDOWS-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; WINDOWS-NEXT:    callq callee
 ; WINDOWS-NEXT:    callq callee
-; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; WINDOWS-NEXT:    xorq %rsp, %rcx
-; WINDOWS-NEXT:    cmpq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WINDOWS-NEXT:    xorq %rsp, %rax
+; WINDOWS-NEXT:    movq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    cmpq %rax, %rcx
 ; WINDOWS-NEXT:    jne .LBB2_2
 ; WINDOWS-NEXT:  # %bb.1:
 ; WINDOWS-NEXT:    .seh_startepilogue
@@ -125,6 +131,8 @@ define void @caller() sspreq {
 ; WINDOWS-NEXT:    .seh_endepilogue
 ; WINDOWS-NEXT:    retq
 ; WINDOWS-NEXT:  .LBB2_2:
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; WINDOWS-NEXT:    xorq %rsp, %rcx
 ; WINDOWS-NEXT:    callq __security_check_cookie
 ; WINDOWS-NEXT:    int3
 ; WINDOWS-NEXT:    .seh_endproc
diff --git a/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll b/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll
index 566d36e87d2b..d0d724910faf 100644
--- a/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll
+++ b/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll
@@ -15,7 +15,7 @@
 ; CHECK:         subl    $20, %esp
 ; CHECK:         .cv_fpo_stackalloc      20
 ; CHECK:         .cv_fpo_endprologue
-; CHECK:         ___security_cookie
+; CHECK:         movl    ___security_cookie, %ea
 
 ; CHECK:         movl    28(%esp), %esi
 ; CHECK:         movl    %esi, {{[0-9]*}}(%esp)
@@ -24,13 +24,16 @@
 ; CHECK:         movl    %esi, {{[0-9]*}}(%esp)
 
 ; CHECK:         calll   _escape
-; CHECK:         calll   @__security_check_cookie@4
+
+; CHECK:         movl    ___security_cookie, %ecx
+; CHECK:         cmpl    %eax, %ecx
 
 ; CHECK:         movl    %esi, %eax
 ; CHECK:         addl    $20, %esp
 ; CHECK:         popl    %esi
 ; CHECK:         retl
 ; CHECK: Ltmp4:
+; CHECK:         calll   @__security_check_cookie@4
 ; CHECK:         .cv_fpo_endproc
 
 ; ModuleID = 't.c'

From 36878158586b92e53dd615264f883e9d7530d047 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Thu, 12 Jun 2025 14:39:15 +0000
Subject: [PATCH 0145/1322] [gn build] Port e1e1836bbd70

---
 llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
index f17b9afcbcdd..f22ee4f31741 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
@@ -144,7 +144,6 @@ static_library("LLVMX86CodeGen") {
     "X86VZeroUpper.cpp",
     "X86WinEHState.cpp",
     "X86WinEHUnwindV2.cpp",
-    "X86WinFixupBufferSecurityCheck.cpp",
   ]
 }
 

From b6a56b8ef26a6b612eb5f49d37024666b073481e Mon Sep 17 00:00:00 2001
From: Tobias Stadler <mail@stadler-tobias.de>
Date: Thu, 12 Jun 2025 15:50:31 +0100
Subject: [PATCH 0146/1322] [llvm-remarkutil] bitstream2yaml: Keep output file
 (#143220)

Keep the output file on successful exit, otherwise `llvm-remarkutil
bitstream2yaml -o filename.yaml ...` does not produce any output,
because the output file is deleted when the tool exits.
---
 llvm/test/tools/llvm-remarkutil/convert.test | 7 ++++---
 llvm/tools/llvm-remarkutil/RemarkConvert.cpp | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/test/tools/llvm-remarkutil/convert.test b/llvm/test/tools/llvm-remarkutil/convert.test
index 83023c8ce6a8..0d7ab8e4682a 100644
--- a/llvm/test/tools/llvm-remarkutil/convert.test
+++ b/llvm/test/tools/llvm-remarkutil/convert.test
@@ -1,6 +1,7 @@
-RUN: llvm-remarkutil bitstream2yaml %p/Inputs/two-remarks.bitstream -o - | FileCheck %s -strict-whitespace
-RUN: llvm-remarkutil yaml2bitstream %p/Inputs/two-remarks.yaml -o %t
-RUN: llvm-remarkutil bitstream2yaml %t -o - | FileCheck %s -strict-whitespace
+RUN: llvm-remarkutil bitstream2yaml %p/Inputs/two-remarks.bitstream -o %t.yaml
+RUN: FileCheck %s -strict-whitespace < %t.yaml
+RUN: llvm-remarkutil yaml2bitstream %p/Inputs/two-remarks.yaml -o %t.bitstream
+RUN: llvm-remarkutil bitstream2yaml %t.bitstream -o - | FileCheck %s -strict-whitespace
 
 ; CHECK: --- !Analysis
 ; CHECK-NEXT: Pass:            prologepilog
diff --git a/llvm/tools/llvm-remarkutil/RemarkConvert.cpp b/llvm/tools/llvm-remarkutil/RemarkConvert.cpp
index 35d8dcd99b4a..207c5e0a8048 100644
--- a/llvm/tools/llvm-remarkutil/RemarkConvert.cpp
+++ b/llvm/tools/llvm-remarkutil/RemarkConvert.cpp
@@ -133,6 +133,7 @@ static Error tryBitstream2YAML() {
   if (!E.isA<EndOfFileError>())
     return E;
   consumeError(std::move(E));
+  OF->keep();
   return Error::success();
 }
 } // namespace bitstream2yaml

From ca5b71a4559890a9768558ddea724782fb638bfa Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Thu, 12 Jun 2025 10:52:25 -0400
Subject: [PATCH 0147/1322] [Matrix] Propagate shape information through Select
 insts (#141876)

---
 .../Scalar/LowerMatrixIntrinsics.cpp          |  47 +++++-
 .../LowerMatrixIntrinsics/select.ll           | 146 ++++++++++++++++++
 2 files changed, 188 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/select.ll

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index a7072ea71929..ce6eaa292d8f 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -323,9 +323,11 @@ computeShapeInfoForInst(Instruction *I,
       return OpShape->second;
   }
 
-  if (isUniformShape(I)) {
+  if (isUniformShape(I) || isa<SelectInst>(I)) {
+    auto Ops = I->operands();
+    auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops;
     // Find the first operand that has a known shape and use that.
-    for (auto &Op : I->operands()) {
+    for (auto &Op : ShapedOps) {
       auto OpShape = ShapeMap.find(Op.get());
       if (OpShape != ShapeMap.end())
         return OpShape->second;
@@ -701,7 +703,8 @@ public:
       default:
         return isUniformShape(II);
       }
-    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
+    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) ||
+           isa<SelectInst>(V);
   }
 
   /// Propagate the shape information of instructions to their users.
@@ -788,10 +791,12 @@ public:
       } else if (isa<StoreInst>(V)) {
         // Nothing to do.  We forward-propagated to this so we would just
         // backward propagate to an instruction with an already known shape.
-      } else if (isUniformShape(V)) {
+      } else if (isUniformShape(V) || isa<SelectInst>(V)) {
+        auto Ops = cast<Instruction>(V)->operands();
+        auto ShapedOps = isa<SelectInst>(V) ? drop_begin(Ops) : Ops;
         // Propagate to all operands.
         ShapeInfo Shape = ShapeMap[V];
-        for (Use &U : cast<Instruction>(V)->operands()) {
+        for (Use &U : ShapedOps) {
           if (setShapeInfo(U.get(), Shape))
             pushInstruction(U.get(), WorkList);
         }
@@ -1148,6 +1153,8 @@ public:
         Result = VisitUnaryOperator(UnOp, SI);
       else if (auto *Intr = dyn_cast<IntrinsicInst>(Inst))
         Result = VisitIntrinsicInst(Intr, SI);
+      else if (auto *Select = dyn_cast<SelectInst>(Inst))
+        Result = VisitSelectInst(Select, SI);
       else if (match(Inst, m_Load(m_Value(Op1))))
         Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1);
       else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
@@ -2307,6 +2314,36 @@ public:
                                    Result.getNumVectors());
   }
 
+  /// Lower selects.
+  MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape) {
+    Value *Cond = Inst->getOperand(0);
+    Value *OpA = Inst->getOperand(1);
+    Value *OpB = Inst->getOperand(2);
+
+    IRBuilder<> Builder(Inst);
+
+    MatrixTy Result;
+    MatrixTy A = getMatrix(OpA, Shape, Builder);
+    MatrixTy B = getMatrix(OpB, Shape, Builder);
+
+    Value *CondV[2];
+    if (isa<FixedVectorType>(Cond->getType())) {
+      MatrixTy C = getMatrix(Cond, Shape, Builder);
+      CondV[0] = C.getVector(0);
+      CondV[1] = C.getVector(1);
+    } else {
+      CondV[0] = Cond;
+      CondV[1] = Cond;
+    }
+
+    for (unsigned I = 0, E = Shape.getNumVectors(); I != E; ++I)
+      Result.addVector(
+          Builder.CreateSelect(CondV[I], A.getVector(I), B.getVector(I)));
+
+    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                   Result.getNumVectors());
+  }
+
   /// Helper to linearize a matrix expression tree into a string. Currently
   /// matrix expressions are linarized by starting at an expression leaf and
   /// linearizing bottom up.
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
new file mode 100644
index 000000000000..70b0dfdb3e7e
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define void @select_2x2_bot(i1 %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_bot(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD4]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 4
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %rhsv = load <4 x float>, ptr %rhs
+  %op = select i1 %cond, <4 x float> %lhsv, <4 x float> %rhsv
+  call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @select_2x2_lhs(i1 %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_lhs(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD4]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %lhs, i64 2, i1 false, i32 2, i32 2)
+  %rhsv = load <4 x float>, ptr %rhs
+  %op = select i1 %cond, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_rhs(i1 %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_rhs(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x float>, ptr [[RHS1:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD4]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 2, i1 false, i32 2, i32 2)
+  %op = select i1 %cond, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_vcond_shape1(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape1(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[CONDV:%.*]] = load <4 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x float>, ptr [[RHS1:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr float, ptr [[RHS1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load <2 x float>, ptr [[VEC_GEP6]], align 4
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = shufflevector <4 x i1> [[CONDV]], <4 x i1> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = shufflevector <4 x i1> [[CONDV]], <4 x i1> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[COL_LOAD2]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[COL_LOAD4]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD7]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP8]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = load <4 x i1>, ptr %cond
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 2, i1 false, i32 2, i32 2)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_vcond_shape2(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr i1, ptr [[COND]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x i1>, ptr [[VEC_GEP3]], align 1
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load <2 x float>, ptr [[VEC_GEP6]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[COL_LOAD2]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[COL_LOAD4]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD7]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP8]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = call <4 x i1> @llvm.matrix.column.major.load(ptr %cond, i64 2, i1 false, i32 2, i32 2)
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 2, i1 false, i32 2, i32 2)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_vcond_shape3(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape3(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <4 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP4:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x float>, ptr [[VEC_GEP4]], align 4
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i1> [[COL_LOAD2]], <4 x i1> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT6:%.*]] = shufflevector <4 x i1> [[COL_LOAD2]], <4 x i1> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[SPLIT]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[SPLIT6]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD5]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = call <4 x i1> @llvm.matrix.column.major.load(ptr %cond, i64 4, i1 false, i32 4, i32 1)
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 2, i1 false, i32 2, i32 2)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}

From bba4ded3c2f94fe0de6011a6941b135b3cb0370a Mon Sep 17 00:00:00 2001
From: "A. Jiang" <de34@live.cn>
Date: Thu, 12 Jun 2025 22:53:41 +0800
Subject: [PATCH 0148/1322] [libc++] Fix constructing `bitset` from
 non-null-terminated arrays (#143691)

Unconditional evaluation of `char_traits<_CharT>::length(__str)` is problematic, because it causes
UB when `__str` points to a non-null-terminated array. We should only call `length` (currently, in
`basic_string_view`'s constructor) when `__n == npos` per [bitset.cons]/8.

Drive-by change: Reduction of conditional compilation, given that
- both `basic_string_view<_CharT>::size_type` and `basic_string<_CharT>::size_type` must be
  `size_t`, and thus
- both `basic_string_view<_CharT>::npos` and `basic_string<_CharT>::npos` must be `size_t(-1)`.

For the type sameness in the standard wording, see:
- [string.view.template.general]
- [basic.string.general]
- [allocator.traits.types]/6
- [default.allocator.general]/1

Fixes #143684
---
 libcxx/include/bitset                         | 13 ++++-----
 .../bitset.cons/char_ptr_ctor.pass.cpp        | 29 +++++++++++++++++++
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/libcxx/include/bitset b/libcxx/include/bitset
index 88dc0e08c995..6be476e2b69d 100644
--- a/libcxx/include/bitset
+++ b/libcxx/include/bitset
@@ -645,16 +645,13 @@ public:
   template <class _CharT, __enable_if_t<_IsCharLikeType<_CharT>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 explicit bitset(
       const _CharT* __str,
-#  if _LIBCPP_STD_VER >= 26
-      typename basic_string_view<_CharT>::size_type __n = basic_string_view<_CharT>::npos,
-#  else
-      typename basic_string<_CharT>::size_type __n = basic_string<_CharT>::npos,
-#  endif
+      size_t __n    = basic_string<_CharT>::npos,
       _CharT __zero = _CharT('0'),
       _CharT __one  = _CharT('1')) {
-
-    size_t __rlen = std::min(__n, char_traits<_CharT>::length(__str));
-    __init_from_string_view(basic_string_view<_CharT>(__str, __rlen), __zero, __one);
+    if (__n == basic_string<_CharT>::npos)
+      __init_from_string_view(basic_string_view<_CharT>(__str), __zero, __one);
+    else
+      __init_from_string_view(basic_string_view<_CharT>(__str, __n), __zero, __one);
   }
 #  if _LIBCPP_STD_VER >= 26
   template <class _CharT, class _Traits>
diff --git a/libcxx/test/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp
index 86b144ed87b7..4f9cdaeb38c0 100644
--- a/libcxx/test/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp
+++ b/libcxx/test/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp
@@ -72,6 +72,35 @@ TEST_CONSTEXPR_CXX23 void test_char_pointer_ctor()
     for (std::size_t i = 10; i < v.size(); ++i)
         assert(v[i] == false);
   }
+  // Verify that this constructor doesn't read over the given bound.
+  // See https://github.com/llvm/llvm-project/issues/143684
+  {
+    const char not_null_terminated[] = {'1', '0', '1', '0', '1', '0', '1', '0', '1', '0'};
+    std::bitset<N> v(not_null_terminated, 10);
+    std::size_t M = std::min<std::size_t>(v.size(), 10);
+    for (std::size_t i = 0; i < M; ++i)
+      assert(v[i] == (not_null_terminated[M - 1 - i] == '1'));
+    for (std::size_t i = 10; i < v.size(); ++i)
+      assert(!v[i]);
+  }
+  {
+    const char not_null_terminated[] = {'1', 'a', '1', 'a', '1', 'a', '1', 'a', '1', 'a'};
+    std::bitset<N> v(not_null_terminated, 10, 'a');
+    std::size_t M = std::min<std::size_t>(v.size(), 10);
+    for (std::size_t i = 0; i < M; ++i)
+      assert(v[i] == (not_null_terminated[M - 1 - i] == '1'));
+    for (std::size_t i = 10; i < v.size(); ++i)
+      assert(!v[i]);
+  }
+  {
+    const char not_null_terminated[] = {'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a'};
+    std::bitset<N> v(not_null_terminated, 10, 'a', 'b');
+    std::size_t M = std::min<std::size_t>(v.size(), 10);
+    for (std::size_t i = 0; i < M; ++i)
+      assert(v[i] == (not_null_terminated[M - 1 - i] == 'b'));
+    for (std::size_t i = 10; i < v.size(); ++i)
+      assert(!v[i]);
+  }
 }
 
 TEST_CONSTEXPR_CXX23 bool test() {

From 5c1a021f7f285f702a290d7faaaf0a274b3bf5a1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 12 Jun 2025 07:54:36 -0700
Subject: [PATCH 0149/1322] [libc++] Fix typos in documentation (#143912)

---
 libcxx/docs/ABIGuarantees.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libcxx/docs/ABIGuarantees.rst b/libcxx/docs/ABIGuarantees.rst
index c25aaa8e4233..e6ac4f2b5b23 100644
--- a/libcxx/docs/ABIGuarantees.rst
+++ b/libcxx/docs/ABIGuarantees.rst
@@ -40,7 +40,7 @@ significantly.
 ``_LIBCPP_ABI_NO_ITERATOR_BASES``
 ---------------------------------
 This removes the ``iterator`` base class from ``back_insert_iterator``, ``front_insert_iterator``, ``insert_iterator``,
-``istream_iterator``, ``ostream_iterator``, ``ostreambuf_itreator``, ``reverse_iterator``, and ``raw_storage_iterator``.
+``istream_iterator``, ``ostream_iterator``, ``ostreambuf_iterator``, ``reverse_iterator``, and ``raw_storage_iterator``.
 This doesn't directly affect the layout of these types in most cases, but may result in more padding being used when
 they are used in combination, for example ``reverse_iterator<reverse_iterator<T>>``.
 
@@ -63,7 +63,7 @@ removes these workarounds for platforms that don't care about ABI compatibility.
 
 ``_LIBCPP_ABI_NO_COMPRESSED_PAIR_PADDING``
 ------------------------------------------
-This removes artifical padding from ``_LIBCPP_COMPRESSED_PAIR`` and ``_LIBCPP_COMPRESSED_TRIPLE``.
+This removes artificial padding from ``_LIBCPP_COMPRESSED_PAIR`` and ``_LIBCPP_COMPRESSED_TRIPLE``.
 
 These macros are used inside the associative and unordered containers, ``deque``, ``forward_list``, ``future``,
 ``list``, ``basic_string``, ``function``, ``shared_ptr``, ``unique_ptr``, and ``vector`` to stay ABI compatible with the
@@ -83,7 +83,7 @@ flag removes that artificial padding.
 
 Linking TUs which have been compiled against different releases of libc++
 =========================================================================
-libc++ supports linking TUs which have beeen compiled against different releases of libc++ by marking symbols with
+libc++ supports linking TUs which have been compiled against different releases of libc++ by marking symbols with
 hidden visibility and changing the mangling of header-only functions in every release.
 
 
@@ -104,7 +104,7 @@ behave as the flags say.
 
 Availability of symbols in the built library (both static and shared)
 =====================================================================
-In general, libc++ does not make any guarantees about forwards-compability. That is, a TU compiled against new headers
+In general, libc++ does not make any guarantees about forwards-compatibility. That is, a TU compiled against new headers
 may not work with an older library. Vendors who require such support can leverage availability markup. On the other
 hand, backwards compatibility is generally guaranteed.
 
@@ -166,7 +166,7 @@ There are multiple ABI flags which change which type an alias references:
 
 ``_LIBCPP_ABI_INCOMPLETE_TYPES_IN_DEQUE``
 -----------------------------------------
-This changes ``deque::iterator`` to avoid requring complete types for ``deque``.
+This changes ``deque::iterator`` to avoid requiring complete types for ``deque``.
 
 ``_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE``
 -------------------------------------------------
@@ -198,7 +198,7 @@ This changes the value of ``regex_constants::syntax_option-type::ECMAScript`` to
 ``_LIBCPP_ABI_FIX_CITYHASH_IMPLEMENTATION``
 -------------------------------------------
 This flag fixes the implementation of CityHash used for ``hash<fundamental-type>``. The incorrect implementation of
-CityHash has the roblem that it drops some bits on the floor. Fixing the implementation changes the hash of values,
+CityHash has the problem that it drops some bits on the floor. Fixing the implementation changes the hash of values,
 resulting in an ABI break.
 
 inline namespaces

From 4f60321ca183ebf132e97e54d8d560643c5c3340 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Thu, 12 Jun 2025 15:59:59 +0100
Subject: [PATCH 0150/1322] [Offload] Add `ol_dimensions_t` and convert ranges
 from size_t -> uint32_t (#143901)

This is a three element x, y, z size_t vector that can be used any place
where a 3D vector is required. This ensures that all vectors across
liboffload are the same and don't require any resizing/reordering
dances.
---
 offload/liboffload/API/Common.td                    | 10 ++++++++++
 offload/liboffload/API/Kernel.td                    |  8 ++------
 offload/liboffload/src/OffloadImpl.cpp              | 12 ++++++------
 .../unittests/OffloadAPI/kernel/olLaunchKernel.cpp  | 13 ++++---------
 4 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index 7674da0438c2..8a2ecd6c6e8f 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -148,6 +148,16 @@ def : Struct {
   ];
 }
 
+def : Struct {
+  let name = "ol_dimensions_t";
+  let desc = "A three element vector";
+  let members = [
+    StructMember<"uint32_t", "x", "X">,
+    StructMember<"uint32_t", "y", "Y">,
+    StructMember<"uint32_t", "z", "Z">,
+  ];
+}
+
 def : Function {
   let name = "olInit";
   let desc = "Perform initialization of the Offload library and plugins";
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 45e3d8112791..0913a036fa04 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -29,12 +29,8 @@ def : Struct {
     let desc = "Size-related arguments for a kernel launch.";
     let members = [
         StructMember<"size_t", "Dimensions", "Number of work dimensions">,
-        StructMember<"size_t", "NumGroupsX", "Number of work groups on the X dimension">,
-        StructMember<"size_t", "NumGroupsY", "Number of work groups on the Y dimension">,
-        StructMember<"size_t", "NumGroupsZ", "Number of work groups on the Z dimension">,
-        StructMember<"size_t", "GroupSizeX", "Size of a work group on the X dimension.">,
-        StructMember<"size_t", "GroupSizeY", "Size of a work group on the Y dimension.">,
-        StructMember<"size_t", "GroupSizeZ", "Size of a work group on the Z dimension.">,
+        StructMember<"struct ol_dimensions_t", "NumGroups", "Number of work groups in each dimension">,
+        StructMember<"struct ol_dimensions_t", "GroupSize", "Size of a work group in each dimension">,
         StructMember<"size_t", "DynSharedMemory", "Size of dynamic shared memory in bytes.">
     ];
 }
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index d2b331905ab7..0a784cddeaec 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -499,12 +499,12 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
   auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
   AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
   KernelArgsTy LaunchArgs{};
-  LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
-  LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
-  LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumGroupsZ;
-  LaunchArgs.ThreadLimit[0] = LaunchSizeArgs->GroupSizeX;
-  LaunchArgs.ThreadLimit[1] = LaunchSizeArgs->GroupSizeY;
-  LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSizeZ;
+  LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroups.x;
+  LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroups.y;
+  LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumGroups.z;
+  LaunchArgs.ThreadLimit[0] = LaunchSizeArgs->GroupSize.x;
+  LaunchArgs.ThreadLimit[1] = LaunchSizeArgs->GroupSize.y;
+  LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSize.z;
   LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
 
   KernelLaunchParamsTy Params;
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index d466799c1aca..157f33a36370 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -19,13 +19,8 @@ struct LaunchKernelTestBase : OffloadQueueTest {
                                    DeviceBin->getBufferSize(), &Program));
     ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel));
     LaunchArgs.Dimensions = 1;
-    LaunchArgs.GroupSizeX = 64;
-    LaunchArgs.GroupSizeY = 1;
-    LaunchArgs.GroupSizeZ = 1;
-
-    LaunchArgs.NumGroupsX = 1;
-    LaunchArgs.NumGroupsY = 1;
-    LaunchArgs.NumGroupsZ = 1;
+    LaunchArgs.GroupSize = {64, 1, 1};
+    LaunchArgs.NumGroups = {1, 1, 1};
 
     LaunchArgs.DynSharedMemory = 0;
   }
@@ -60,7 +55,7 @@ OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);
 TEST_P(olLaunchKernelTest, Success) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
-                            LaunchArgs.GroupSizeX * sizeof(uint32_t), &Mem));
+                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
   struct {
     void *Mem;
   } Args{Mem};
@@ -88,7 +83,7 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
 TEST_P(olLaunchKernelTest, SuccessSynchronous) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
-                            LaunchArgs.GroupSizeX * sizeof(uint32_t), &Mem));
+                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
 
   struct {
     void *Mem;

From 4bd0a0e50bcfc3263c219acc9709ae234a334456 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Thu, 12 Jun 2025 17:09:55 +0200
Subject: [PATCH 0151/1322] Revert "[flang] Enable delayed localization by
 default for `do concurrent` (#142567)" (#143905)

This reverts commit 937be177528de156922c1b5f6cab08ba3009dbf2.

Resolves https://github.com/llvm/llvm-project/issues/143897 until the
todo is properly handled.
---
 flang/lib/Lower/Bridge.cpp                            | 6 +++++-
 flang/test/Lower/do_concurrent_delayed_locality.f90   | 2 +-
 flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +-
 flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +-
 flang/test/Lower/loops.f90                            | 2 +-
 flang/test/Lower/loops3.f90                           | 2 +-
 6 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 5ff8101dba09..64b16b3abe99 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2033,7 +2033,11 @@ private:
     fir::LocalitySpecifierOperands privateClauseOps;
     auto doConcurrentLoopOp =
         mlir::dyn_cast_if_present<fir::DoConcurrentLoopOp>(info.loopOp);
-    bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp;
+    // TODO Promote to using `enableDelayedPrivatization` (which is enabled by
+    // default unlike the staging flag) once the implementation of this is more
+    // complete.
+    bool useDelayedPriv =
+        enableDelayedPrivatizationStaging && doConcurrentLoopOp;
     llvm::SetVector<const Fortran::semantics::Symbol *> allPrivatizedSymbols;
     llvm::SmallSet<const Fortran::semantics::Symbol *, 16> mightHaveReadHostSym;
 
diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90
index 039b17808d19..6cae0eb46db1 100644
--- a/flang/test/Lower/do_concurrent_delayed_locality.f90
+++ b/flang/test/Lower/do_concurrent_delayed_locality.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
 
 subroutine do_concurrent_with_locality_specs
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
index 67f080eb2c1c..a3d0c34ed856 100644
--- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90
+++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
 
 subroutine local_assoc
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index 798cbb335c8c..d64321385474 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of DO CONCURRENT LOCAL() entities.
-! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s
 
 subroutine test_ptr(p)
   interface
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 64f14ff97227..60df27a591dc 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 34d7bcfb7d7a..84db1972cca1 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -1,5 +1,5 @@
 ! Test do concurrent reduction
-! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test

From 62b694090093ed34d620dd1129b194fc66fa4bb0 Mon Sep 17 00:00:00 2001
From: Igor Wodiany <igor.wodiany@imgtec.com>
Date: Thu, 12 Jun 2025 16:10:33 +0100
Subject: [PATCH 0152/1322] [mlir][spirv] Add definition for GL
 Pack/UnpackHalf2x16 (#143889)

---
 .../mlir/Dialect/SPIRV/IR/SPIRVGLOps.td       |  84 ++++++++++++++
 mlir/test/Dialect/SPIRV/IR/gl-ops.mlir        | 104 ++++++++++++++++++
 mlir/test/Target/SPIRV/gl-ops.mlir            |  10 +-
 3 files changed, 196 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
index f3f75240e521..7ffe0c8da1ca 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
@@ -1317,4 +1317,88 @@ def SPIRV_GLFractOp : SPIRV_GLUnaryArithmeticOp<"Fract", 10, SPIRV_Float> {
   }];
 }
 
+// -----
+
+def SPIRV_GLPackHalf2x16Op : SPIRV_GLOp<"PackHalf2x16", 58, [Pure]> {
+  let summary = "Pack two-component vector of 32-bit floats into a 32-bit integer";
+
+  let description = [{
+    Result is the unsigned integer obtained by converting the components of a
+    two-component floating-point vector to the 16-bit OpTypeFloat, and then packing
+    these two 16-bit integers into a 32-bit unsigned integer. The first vector
+    component specifies the 16 least-significant bits of the result; the second
+    component specifies the 16 most-significant bits.
+
+    The RelaxedPrecision Decoration only affects the conversion step of the instruction.
+
+    The v operand must be a vector of 2 components whose type is a 32-bit floating-point.
+
+    Result Type must be a 32-bit integer type.
+
+    #### Example:
+
+    ```mlir
+    %1 = spirv.GL.PackHalf2x16 %0 : vector<2xf32> -> i32
+    ```
+  }];
+
+  let arguments = (ins
+    VectorOfLengthAndType<[2], [SPIRV_Float32]>:$operand
+  );
+
+  let results = (outs
+    SPIRV_Int32:$result
+  );
+
+  let assemblyFormat = [{
+    attr-dict $operand `:` type($operand) `->` type($result)
+  }];
+
+  let hasVerifier = 0;
+}
+
+// -----
+
+def SPIRV_GLUnpackHalf2x16Op : SPIRV_GLOp<"UnpackHalf2x16", 62, [Pure]> {
+  let summary = "Unpack 32-bit integer into two-component vector of 32-bit floats";
+
+  let description = [{
+    Result is the two-component floating-point vector with components obtained by
+    unpacking a 32-bit unsigned integer into a pair of 16-bit values, interpreting
+    those values as 16-bit floating-point numbers according to the OpenGL
+    Specification, and converting them to 32-bit floating-point values. Subnormal
+    numbers are either preserved or flushed to zero, consistently within an
+    implementation.
+
+    The first component of the vector is obtained from the 16 least-significant bits
+    of v; the second component is obtained from the 16 most-significant bits of v.
+
+    The RelaxedPrecision Decoration only affects the conversion step of the instruction.
+
+    The v operand must be a scalar with 32-bit integer type.
+
+    Result Type must be a vector of 2 components whose type is 32-bit floating point.
+
+    #### Example:
+
+    ```mlir
+    %1 = spirv.GL.UnpackHalf2x16 %0 : i32 -> vector<2xf32>
+    ```
+  }];
+
+  let arguments = (ins
+    SPIRV_Int32:$operand
+  );
+
+  let results = (outs
+    VectorOfLengthAndType<[2], [SPIRV_Float32]>:$result
+  );
+
+  let assemblyFormat = [{
+    attr-dict $operand `:` type($operand) `->` type($result)
+  }];
+
+  let hasVerifier = 0;
+}
+
 #endif // MLIR_DIALECT_SPIRV_IR_GL_OPS
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index 29beee5aea93..fbcf2095dc60 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -815,3 +815,107 @@ func.func @exp2_invalid_type(%arg0 : i32) -> () {
   %0 = spirv.GL.Exp2 %arg0 : i32
   return
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.PackHalf2x16 
+//===----------------------------------------------------------------------===//
+
+func.func @pack_half_2x16(%arg0 : vector<2xf32>) -> () {
+  // CHECK: spirv.GL.PackHalf2x16 {{%.*}} : vector<2xf32> -> i32
+  %0 = spirv.GL.PackHalf2x16 %arg0 : vector<2xf32> -> i32
+  return
+}
+
+// -----
+
+func.func @pack_half_2x16_i16_output(%arg0 : vector<2xf32>) -> () {
+  // expected-error @+1 {{op result #0 must be Int32, but got 'i16'}}
+  %0 = spirv.GL.PackHalf2x16 %arg0 : vector<2xf32> -> i16
+  return
+}
+
+// -----
+
+func.func @pack_half_2x16_wrong_vec_size(%arg0 : vector<3xf32>) -> () {
+  // expected-error @+1 {{op operand #0 must be vector of Float32 values of length 2, but got 'vector<3xf32>'}}
+  %0 = spirv.GL.PackHalf2x16 %arg0 : vector<3xf32> -> i32
+  return
+}
+
+// -----
+
+func.func @pack_half_2x16_wrong_vec_type(%arg0 : vector<2xi32>) -> () {
+  // expected-error @+1 {{op operand #0 must be vector of Float32 values of length 2, but got 'vector<2xi32>'}}
+  %0 = spirv.GL.PackHalf2x16 %arg0 : vector<2xi32> -> i32
+  return
+}
+
+// -----
+
+func.func @pack_half_2x16_scalar_in(%arg0 : f32) -> () {
+  // expected-error @+1 {{invalid kind of type specified: expected builtin.vector, but found 'f32'}}
+  %0 = spirv.GL.PackHalf2x16 %arg0 : f32 -> i32
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_vector_out(%arg0 : vector<2xf32>) -> () {
+  // expected-error @+1 {{invalid kind of type specified: expected builtin.integer, but found 'vector<2xf32>'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : vector<2xf32> -> vector<2xi32>
+  return
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.UnpackHalf2x16 
+//===----------------------------------------------------------------------===//
+
+func.func @unpack_half_2x16(%arg0 : i32) -> () {
+  // CHECK: spirv.GL.UnpackHalf2x16 {{%.*}} : i32 -> vector<2xf32>
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> vector<2xf32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_i16_input(%arg0 : i16) -> () {
+  // expected-error @+1 {{op operand #0 must be Int32, but got 'i16'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i16 -> vector<2xf32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_wrong_vec_size(%arg0 : i32) -> () {
+  // expected-error @+1 {{op result #0 must be vector of Float32 values of length 2, but got 'vector<3xf32>'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> vector<3xf32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_wrong_vec_type(%arg0 : i32) -> () {
+  // expected-error @+1 {{op result #0 must be vector of Float32 values of length 2, but got 'vector<2xi32>'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> vector<2xi32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_vec_in(%arg0 : vector<2xf32>) -> () {
+  // expected-error @+1 {{invalid kind of type specified: expected builtin.integer, but found 'vector<2xf32>'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : vector<2xf32> -> vector<2xf32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_scalar_out(%arg0 : i32) -> () {
+  // expected-error @+1 {{invalid kind of type specified: expected builtin.vector, but found 'f32'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> f32
+  return
+}
diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir
index 3dee03345e9a..e4a6c6fb5a34 100644
--- a/mlir/test/Target/SPIRV/gl-ops.mlir
+++ b/mlir/test/Target/SPIRV/gl-ops.mlir
@@ -96,7 +96,7 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     spirv.Return
   }
 
-spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>) "None" {
+  spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>) "None" {
     // CHECK: {{%.*}} = spirv.GL.Cross {{%.*}}, {{%.*}} : vector<3xf32>
     %0 = spirv.GL.Cross %arg1, %arg2 : vector<3xf32>
     // CHECK: {{%.*}} = spirv.GL.Normalize {{%.*}} : f32
@@ -114,5 +114,11 @@ spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>) "N
     spirv.Return
   }
 
-
+  spirv.func @pack_half_2x16(%arg0 : i32) "None" {
+    // CHECK: {{%.*}} = spirv.GL.UnpackHalf2x16 {{%.*}} : i32 -> vector<2xf32>
+    %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> vector<2xf32>
+    // CHECK: {{%.*}} = spirv.GL.PackHalf2x16 {{%.*}} : vector<2xf32> -> i32
+    %1 = spirv.GL.PackHalf2x16 %0 : vector<2xf32> -> i32
+    spirv.Return
+  }
 }

From e4de74ba11eadb47cf78afbabffbf2b1a50e7298 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <nicolas.vasilache@gmail.com>
Date: Thu, 12 Jun 2025 17:11:06 +0200
Subject: [PATCH 0153/1322] =?UTF-8?q?[mlir][Vector]=20Tighten=20up=20appli?=
 =?UTF-8?q?cation=20conditions=20in=20TransferReadAfter=E2=80=A6=20(#14386?=
 =?UTF-8?q?9)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…WriteToBroadcast

The pattern would previously apply in spurious cases and generate
incorrect IR.

In the process, we disable the application of this pattern in the case
where there is no broadcast; this should be handled separately and may
more easily support masking.

The case {no-broadcast, yes-transpose} was previously caught by this
pattern and arguably could also generate incorrect IR (and was also
untested): this case does not apply anymore.

The last cast {yes-broadcast, yes-transpose} continues to apply but
should arguably be removed from the future because creating transposes
as part of canonicalization feels dangerous.
There are other patterns that move permutation logic:

- either into the transfer, or
- outside of the transfer

Ideally, this would be target-dependent and not a canonicalization (i.e.
does your DMA HW allow transpose on the fly or not) but this is beyond
the scope of this PR.

Co-authored-by: Nicolas Vasilache <nicolasvasilache@users.noreply.github.com>
---
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp   |  30 ++++--
 mlir/test/Dialect/Vector/canonicalize.mlir | 108 ++++++++++++++++++---
 2 files changed, 117 insertions(+), 21 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index a295bf1eb4d9..2a2357319bd2 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4668,12 +4668,15 @@ struct TransferReadAfterWriteToBroadcast
 
   LogicalResult matchAndRewrite(TransferReadOp readOp,
                                 PatternRewriter &rewriter) const override {
-    if (readOp.hasOutOfBoundsDim() ||
-        !llvm::isa<RankedTensorType>(readOp.getShapedType()))
-      return failure();
     auto defWrite = readOp.getBase().getDefiningOp<vector::TransferWriteOp>();
     if (!defWrite)
       return failure();
+    // Bail if we need an alias analysis.
+    if (!readOp.hasPureTensorSemantics() || !defWrite.hasPureTensorSemantics())
+      return failure();
+    // Bail if we need a bounds analysis.
+    if (readOp.hasOutOfBoundsDim() || defWrite.hasOutOfBoundsDim())
+      return failure();
     // TODO: If the written transfer chunk is a superset of the read transfer
     // chunk we could do an extract_strided_slice.
     if (readOp.getTransferChunkAccessed() !=
@@ -4684,15 +4687,28 @@ struct TransferReadAfterWriteToBroadcast
     if (getUnusedDimsBitVector({readOp.getPermutationMap()}) !=
         getUnusedDimsBitVector({defWrite.getPermutationMap()}))
       return failure();
-    if (readOp.getIndices() != defWrite.getIndices() ||
-        readOp.getMask() != defWrite.getMask())
+    // This pattern should only catch the broadcast case, the non-broadcast case
+    // should be done separately to keep application conditions clean and
+    // separate.
+    AffineMap readMap = compressUnusedDims(readOp.getPermutationMap());
+    AffineMap writeMap = compressUnusedDims(defWrite.getPermutationMap());
+    bool bcast = !readMap.getBroadcastDims().empty() ||
+                 !writeMap.getBroadcastDims().empty();
+    if (!bcast)
       return failure();
+    // At this point, we know we have a bcast.
+    // Bail in the masked case (too complex atm and needed to properly account
+    // for padding).
+    if (readOp.getMask() || defWrite.getMask())
+      return failure();
+    // If indices are not the same a shift may be required, bail.
+    if (readOp.getIndices() != defWrite.getIndices())
+      return failure();
+
     Value vec = defWrite.getVector();
     // TODO: loop through the chain of transfer_write if we can prove that they
     // don't overlap with the transfer_read. This requires improving
     // `isDisjointTransferIndices` helper.
-    AffineMap readMap = compressUnusedDims(readOp.getPermutationMap());
-    AffineMap writeMap = compressUnusedDims(defWrite.getPermutationMap());
     AffineMap map = readMap.compose(writeMap);
     if (map.getNumResults() == 0)
       return failure();
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index a06a9f67d54d..6691cb52acdc 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -408,7 +408,7 @@ func.func @extract_strided_fold_insert(%a: vector<6x4xf32>, %b: vector<8x16xf32>
 // -----
 
 // Negative test where the extract is not a subset of the element inserted.
-// CHECK-LABEL: extract_strided_fold_negative
+// CHECK-LABEL: negative_extract_strided_fold
 //  CHECK-SAME: (%[[ARG0:.*]]: vector<4x4xf32>, %[[ARG1:.*]]: vector<8x16xf32>
 //       CHECK:   %[[INS:.*]] = vector.insert_strided_slice %[[ARG0]], %[[ARG1]]
 //  CHECK-SAME:     {offsets = [2, 2], strides = [1, 1]}
@@ -417,7 +417,7 @@ func.func @extract_strided_fold_insert(%a: vector<6x4xf32>, %b: vector<8x16xf32>
 //  CHECK-SAME:     {offsets = [2, 2], sizes = [6, 4], strides = [1, 1]}
 //  CHECK-SAME:       : vector<8x16xf32> to vector<6x4xf32>
 //  CHECK-NEXT:   return %[[EXT]] : vector<6x4xf32>
-func.func @extract_strided_fold_negative(%a: vector<4x4xf32>, %b: vector<8x16xf32>)
+func.func @negative_extract_strided_fold(%a: vector<4x4xf32>, %b: vector<8x16xf32>)
   -> (vector<6x4xf32>) {
   %0 = vector.insert_strided_slice %a, %b {offsets = [2, 2], strides = [1, 1]}
     : vector<4x4xf32> into vector<8x16xf32>
@@ -753,10 +753,10 @@ func.func @fold_extract_broadcast_0dvec_input_scalar_output(%a : vector<f32>,
 
 // -----
 
-// CHECK-LABEL: fold_extract_broadcast_negative
+// CHECK-LABEL: negative_fold_extract_broadcast
 //       CHECK:   vector.broadcast %{{.*}} : vector<1x1xf32> to vector<1x1x4xf32>
 //       CHECK:   vector.extract %{{.*}}[0, 0] : vector<4xf32> from vector<1x1x4xf32>
-func.func @fold_extract_broadcast_negative(%a : vector<1x1xf32>) -> vector<4xf32> {
+func.func @negative_fold_extract_broadcast(%a : vector<1x1xf32>) -> vector<4xf32> {
   %b = vector.broadcast %a : vector<1x1xf32> to vector<1x1x4xf32>
   %r = vector.extract %b[0, 0] : vector<4xf32> from vector<1x1x4xf32>
   return %r : vector<4xf32>
@@ -895,11 +895,11 @@ func.func @fold_extract_shapecast_0d_source(%arg0 : vector<f32>) -> f32 {
 
 // -----
 
-// CHECK-LABEL: fold_extract_shapecast_negative
+// CHECK-LABEL: negative_fold_extract_shapecast
 //       CHECK:   %[[V:.*]] = vector.shape_cast %{{.*}} : vector<16xf32> to vector<2x4x2xf32>
 //       CHECK:   %[[R:.*]] = vector.extract %[[V]][1] : vector<4x2xf32> from vector<2x4x2xf32>
 //       CHECK:   return %[[R]] : vector<4x2xf32>
-func.func @fold_extract_shapecast_negative(%arg0 : vector<16xf32>) -> vector<4x2xf32> {
+func.func @negative_fold_extract_shapecast(%arg0 : vector<16xf32>) -> vector<4x2xf32> {
   %0 = vector.shape_cast %arg0 : vector<16xf32> to vector<2x4x2xf32>
   %r = vector.extract %0[1] : vector<4x2xf32> from vector<2x4x2xf32>
   return %r : vector<4x2xf32>
@@ -1460,11 +1460,11 @@ func.func @store_after_load_tensor(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
 
 // -----
 
-// CHECK-LABEL: func @store_after_load_tensor_negative
+// CHECK-LABEL: func @negative_store_after_load_tensor
 //       CHECK:   vector.transfer_read
 //       CHECK:   vector.transfer_write
 //       CHECK:   return
-func.func @store_after_load_tensor_negative(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
+func.func @negative_store_after_load_tensor(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
   %c1 = arith.constant 1 : index
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
@@ -1499,12 +1499,12 @@ func.func @store_to_load_tensor(%arg0 : tensor<4x4xf32>,
 
 // -----
 
-// CHECK-LABEL: func @store_to_load_negative_tensor
+// CHECK-LABEL: func @negative_store_to_load_tensor
 //       CHECK:   vector.transfer_write
 //       CHECK:   vector.transfer_write
 //       CHECK:   %[[V:.*]] = vector.transfer_read
 //       CHECK:   return %[[V]] : vector<1x4xf32>
-func.func @store_to_load_negative_tensor(%arg0 : tensor<4x4xf32>,
+func.func @negative_store_to_load_tensor(%arg0 : tensor<4x4xf32>,
   %v0 : vector<1x4xf32>, %v1 : vector<1x4xf32>, %i : index) -> vector<1x4xf32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -1540,6 +1540,86 @@ func.func @store_to_load_tensor_broadcast(%arg0 : tensor<4x4xf32>,
 
 // -----
 
+// CHECK-LABEL: func @negative_store_to_load_tensor_memref
+//   CHECK-NOT:   vector.broadcast
+//   CHECK-NOT:   vector.transpose
+//       CHECK:   vector.transfer_write
+//       CHECK:   vector.transfer_read
+func.func @negative_store_to_load_tensor_memref(
+    %arg0 : tensor<?x?xf32>,
+    %arg1 : memref<?x?xf32>,
+    %v0 : vector<4x2xf32>
+  ) -> vector<4x2xf32> 
+{
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  vector.transfer_write %v0, %arg1[%c0, %c0] {in_bounds = [true, true]} :
+    vector<4x2xf32>, memref<?x?xf32>
+  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [true, true]} :
+    tensor<?x?xf32>, vector<4x2xf32>
+  return %0 : vector<4x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_store_to_load_tensor_no_actual_broadcast
+//   CHECK-NOT:   vector.broadcast
+//   CHECK-NOT:   vector.transpose
+//       CHECK:   vector.transfer_write
+//       CHECK:   vector.transfer_read
+func.func @negative_store_to_load_tensor_no_actual_broadcast(%arg0 : tensor<?x?xf32>,
+  %v0 : vector<4x2xf32>) -> vector<4x2xf32> {
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  %w0 = vector.transfer_write %v0, %arg0[%c0, %c0] :
+    vector<4x2xf32>, tensor<?x?xf32>
+  %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true]} :
+    tensor<?x?xf32>, vector<4x2xf32>
+  return %0 : vector<4x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_store_to_load_tensor_broadcast_out_of_bounds
+//   CHECK-NOT:   vector.broadcast
+//   CHECK-NOT:   vector.transpose
+//       CHECK:   vector.transfer_write
+//       CHECK:   vector.transfer_read
+func.func @negative_store_to_load_tensor_broadcast_out_of_bounds(%arg0 : tensor<?x?xf32>,
+  %v0 : vector<4x2xf32>) -> vector<4x2x6xf32> {
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  %w0 = vector.transfer_write %v0, %arg0[%c0, %c0] :
+    vector<4x2xf32>, tensor<?x?xf32>
+  %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true, true],
+  permutation_map = affine_map<(d0, d1) -> (d0, d1, 0)>} :
+    tensor<?x?xf32>, vector<4x2x6xf32>
+  return %0 : vector<4x2x6xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_store_to_load_tensor_broadcast_masked
+//   CHECK-NOT:   vector.broadcast
+//   CHECK-NOT:   vector.transpose
+//       CHECK:   vector.transfer_write
+//       CHECK:   vector.transfer_read
+func.func @negative_store_to_load_tensor_broadcast_masked(
+    %arg0 : tensor<?x?xf32>, %v0 : vector<4x2xf32>, %mask : vector<4x2xi1>)
+  -> vector<4x2x6xf32> 
+{
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  %w0 = vector.transfer_write %v0, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} :
+    vector<4x2xf32>, tensor<?x?xf32>
+  %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true, true],
+  permutation_map = affine_map<(d0, d1) -> (d0, d1, 0)>} :
+    tensor<?x?xf32>, vector<4x2x6xf32>
+  return %0 : vector<4x2x6xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @store_to_load_tensor_broadcast_scalable
 //  CHECK-SAME: (%[[ARG:.*]]: tensor<?xf32>, %[[V0:.*]]: vector<[4]xf32>)
 //       CHECK:   %[[B:.*]] = vector.broadcast %[[V0]] : vector<[4]xf32> to vector<6x[4]xf32>
@@ -1604,7 +1684,7 @@ func.func @dead_store_tensor(%arg0 : tensor<4x4xf32>,
 
 // -----
 
-// CHECK-LABEL: func @dead_store_tensor_negative
+// CHECK-LABEL: func @negative_dead_store_tensor
 //   CHECK-DAG:      %[[C0:.*]] = arith.constant 0 : index
 //   CHECK-DAG:      %[[C1:.*]] = arith.constant 1 : index
 //       CHECK:   vector.transfer_write
@@ -1612,7 +1692,7 @@ func.func @dead_store_tensor(%arg0 : tensor<4x4xf32>,
 //       CHECK:   vector.transfer_read
 //       CHECK:   %[[VTW:.*]] = vector.transfer_write {{.*}}, {{.*}}[%[[C1]], %[[C0]]]
 //       CHECK:   return %[[VTW]] : tensor<4x4xf32>
-func.func @dead_store_tensor_negative(%arg0 : tensor<4x4xf32>,
+func.func @negative_dead_store_tensor(%arg0 : tensor<4x4xf32>,
   %v0 : vector<1x4xf32>, %v1 : vector<1x4xf32>, %i : index) -> tensor<4x4xf32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -2063,10 +2143,10 @@ func.func @extract_insert_rank_reduce(%a: vector<4xf32>, %b: vector<8x16xf32>)
 
 // -----
 
-// CHECK-LABEL: extract_insert_negative
+// CHECK-LABEL: negative_extract_insert
 //       CHECK: vector.insert_strided_slice
 //       CHECK: vector.extract
-func.func @extract_insert_negative(%a: vector<2x15xf32>, %b: vector<12x8x16xf32>)
+func.func @negative_extract_insert(%a: vector<2x15xf32>, %b: vector<12x8x16xf32>)
   -> vector<16xf32> {
   %0 = vector.insert_strided_slice %a, %b {offsets = [4, 2, 0], strides = [1, 1]}
     : vector<2x15xf32> into vector<12x8x16xf32>

From 2e5fb77ce03748608cfad49fd62479fc3d912372 Mon Sep 17 00:00:00 2001
From: Paul Kirth <paulkirth@google.com>
Date: Thu, 12 Jun 2025 08:22:04 -0700
Subject: [PATCH 0154/1322] [llvm] Make TestData compatible with c++20
 (#143801)

The clang-debian-cpp20 buildbot did not like direct initialization
without a matching constructor. This patch adds a new constructor taking
a json::Object that directly initializes the struct fields. We also
update an internal interface for const correctness.

https://lab.llvm.org/buildbot/#/builders/108/builds/13950
---
 .../llvm-test-mustache-spec.cpp               | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
index 28ed1b876672..1f566e13f070 100644
--- a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
+++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
@@ -146,6 +146,13 @@ static const StringMap<StringSet<>> XFailTestNames = {{
 }};
 
 struct TestData {
+  TestData() = default;
+  explicit TestData(const json::Object &TestCase)
+      : TemplateStr(*TestCase.getString("template")),
+        ExpectedStr(*TestCase.getString("expected")),
+        Name(*TestCase.getString("name")), Data(TestCase.get("data")),
+        Partials(TestCase.get("partials")) {}
+
   static Expected<TestData> createTestData(json::Object *TestCase,
                                            StringRef InputFile) {
     // If any of the needed elements are missing, we cannot continue.
@@ -157,19 +164,14 @@ struct TestData {
           llvm::inconvertibleErrorCode(),
           "invalid JSON schema in test file: " + InputFile + "\n");
 
-    return TestData{TestCase->getString("template").value(),
-                    TestCase->getString("expected").value(),
-                    TestCase->getString("name").value(), TestCase->get("data"),
-                    TestCase->get("partials")};
+    return TestData(*TestCase);
   }
 
-  TestData() = default;
-
   StringRef TemplateStr;
   StringRef ExpectedStr;
   StringRef Name;
-  Value *Data;
-  Value *Partials;
+  const Value *Data;
+  const Value *Partials;
 };
 
 static void reportTestFailure(const TestData &TD, StringRef ActualStr,
@@ -191,7 +193,7 @@ static void reportTestFailure(const TestData &TD, StringRef ActualStr,
   }
 }
 
-static void registerPartials(Value *Partials, Template &T) {
+static void registerPartials(const Value *Partials, Template &T) {
   if (!Partials)
     return;
   for (const auto &[Partial, Str] : *Partials->getAsObject())

From 9b679889b596aa5076062d5fbbdd01e3532b4ff5 Mon Sep 17 00:00:00 2001
From: Cyndy Ishida <cyndy_ishida@apple.com>
Date: Thu, 12 Jun 2025 08:24:50 -0700
Subject: [PATCH 0155/1322] [clang][darwin] Fix assertion failure when
 reporting fatal errors when inferring OS versions (#143817)

---
 .../clang/Basic/DiagnosticDriverKinds.td      |  2 +
 clang/lib/Driver/ToolChains/Darwin.cpp        | 52 +++++++++++++------
 .../Driver/darwin-invalid-version-range.c     | 29 +++++++++++
 3 files changed, 68 insertions(+), 15 deletions(-)
 create mode 100644 clang/test/Driver/darwin-invalid-version-range.c

diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 20fb47237c56..29f6480ba935 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -206,6 +206,8 @@ def err_drv_cannot_open_randomize_layout_seed_file : Error<
   "cannot read randomize layout seed file '%0'">;
 def err_drv_invalid_version_number : Error<
   "invalid version number in '%0'">;
+def err_drv_invalid_version_number_inferred
+    : Error<"invalid version number '%0' inferred from '%1'">;
 def err_drv_missing_version_number : Error<"missing version number in '%0'">;
 def err_drv_kcfi_arity_unsupported_target : Error<
   "target '%0' is unsupported by -fsanitize-kcfi-arity">;
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index e987ef78920e..e5075cbcaf66 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1793,16 +1793,23 @@ struct DarwinPlatform {
     case TargetArg:
     case MTargetOSArg:
     case OSVersionArg:
-    case InferredFromSDK:
-    case InferredFromArch:
       assert(Arg && "OS version argument not yet inferred");
       return Arg->getAsString(Args);
     case DeploymentTargetEnv:
       return (llvm::Twine(EnvVarName) + "=" + OSVersionStr).str();
+    case InferredFromSDK:
+    case InferredFromArch:
+      llvm_unreachable("Cannot print arguments for inferred OS version");
     }
     llvm_unreachable("Unsupported Darwin Source Kind");
   }
 
+  // Returns the inferred source of how the OS version was resolved.
+  std::string getInferredSource() {
+    assert(!isExplicitlySpecified() && "OS version was not inferred");
+    return InferredSource.str();
+  }
+
   void setEnvironment(llvm::Triple::EnvironmentType EnvType,
                       const VersionTuple &OSVersion,
                       const std::optional<DarwinSDKInfo> &SDKInfo) {
@@ -1876,7 +1883,8 @@ struct DarwinPlatform {
     Result.EnvVarName = EnvVarName;
     return Result;
   }
-  static DarwinPlatform createFromSDK(DarwinPlatformKind Platform,
+  static DarwinPlatform createFromSDK(StringRef SDKRoot,
+                                      DarwinPlatformKind Platform,
                                       StringRef Value,
                                       bool IsSimulator = false) {
     DarwinPlatform Result(InferredFromSDK, Platform,
@@ -1884,11 +1892,15 @@ struct DarwinPlatform {
     if (IsSimulator)
       Result.Environment = DarwinEnvironmentKind::Simulator;
     Result.InferSimulatorFromArch = false;
+    Result.InferredSource = SDKRoot;
     return Result;
   }
-  static DarwinPlatform createFromArch(llvm::Triple::OSType OS,
+  static DarwinPlatform createFromArch(StringRef Arch, llvm::Triple::OSType OS,
                                        VersionTuple Version) {
-    return DarwinPlatform(InferredFromArch, getPlatformFromOS(OS), Version);
+    auto Result =
+        DarwinPlatform(InferredFromArch, getPlatformFromOS(OS), Version);
+    Result.InferredSource = Arch;
+    return Result;
   }
 
   /// Constructs an inferred SDKInfo value based on the version inferred from
@@ -1975,6 +1987,9 @@ private:
   bool InferSimulatorFromArch = true;
   std::pair<Arg *, std::string> Arguments;
   StringRef EnvVarName;
+  // If the DarwinPlatform information is derived from an inferred source, this
+  // captures what that source input was for error reporting.
+  StringRef InferredSource;
   // When compiling for a zippered target, this value represents the target
   // triple encoded in the target variant.
   std::optional<llvm::Triple> TargetVariantTriple;
@@ -2143,26 +2158,27 @@ inferDeploymentTargetFromSDK(DerivedArgList &Args,
       [&](StringRef SDK) -> std::optional<DarwinPlatform> {
     if (SDK.starts_with("iPhoneOS") || SDK.starts_with("iPhoneSimulator"))
       return DarwinPlatform::createFromSDK(
-          Darwin::IPhoneOS, Version,
+          isysroot, Darwin::IPhoneOS, Version,
           /*IsSimulator=*/SDK.starts_with("iPhoneSimulator"));
     else if (SDK.starts_with("MacOSX"))
-      return DarwinPlatform::createFromSDK(Darwin::MacOS,
+      return DarwinPlatform::createFromSDK(isysroot, Darwin::MacOS,
                                            getSystemOrSDKMacOSVersion(Version));
     else if (SDK.starts_with("WatchOS") || SDK.starts_with("WatchSimulator"))
       return DarwinPlatform::createFromSDK(
-          Darwin::WatchOS, Version,
+          isysroot, Darwin::WatchOS, Version,
           /*IsSimulator=*/SDK.starts_with("WatchSimulator"));
     else if (SDK.starts_with("AppleTVOS") ||
              SDK.starts_with("AppleTVSimulator"))
       return DarwinPlatform::createFromSDK(
-          Darwin::TvOS, Version,
+          isysroot, Darwin::TvOS, Version,
           /*IsSimulator=*/SDK.starts_with("AppleTVSimulator"));
     else if (SDK.starts_with("XR"))
       return DarwinPlatform::createFromSDK(
-          Darwin::XROS, Version,
+          isysroot, Darwin::XROS, Version,
           /*IsSimulator=*/SDK.contains("Simulator"));
     else if (SDK.starts_with("DriverKit"))
-      return DarwinPlatform::createFromSDK(Darwin::DriverKit, Version);
+      return DarwinPlatform::createFromSDK(isysroot, Darwin::DriverKit,
+                                           Version);
     return std::nullopt;
   };
   if (auto Result = CreatePlatformFromSDKName(SDK))
@@ -2236,7 +2252,7 @@ inferDeploymentTargetFromArch(DerivedArgList &Args, const Darwin &Toolchain,
   if (OSTy == llvm::Triple::UnknownOS)
     return std::nullopt;
   return DarwinPlatform::createFromArch(
-      OSTy, getInferredOSVersion(OSTy, Triple, TheDriver));
+      MachOArchName, OSTy, getInferredOSVersion(OSTy, Triple, TheDriver));
 }
 
 /// Returns the deployment target that's specified using the -target option.
@@ -2455,9 +2471,15 @@ void Darwin::AddDeploymentTarget(DerivedArgList &Args) const {
   }
 
   assert(PlatformAndVersion && "Unable to infer Darwin variant");
-  if (!PlatformAndVersion->isValidOSVersion())
-    getDriver().Diag(diag::err_drv_invalid_version_number)
-        << PlatformAndVersion->getAsString(Args, Opts);
+  if (!PlatformAndVersion->isValidOSVersion()) {
+    if (PlatformAndVersion->isExplicitlySpecified())
+      getDriver().Diag(diag::err_drv_invalid_version_number)
+          << PlatformAndVersion->getAsString(Args, Opts);
+    else
+      getDriver().Diag(diag::err_drv_invalid_version_number_inferred)
+          << PlatformAndVersion->getOSVersion().getAsString()
+          << PlatformAndVersion->getInferredSource();
+  }
   // After the deployment OS version has been resolved, set it to the canonical
   // version before further error detection and converting to a proper target
   // triple.
diff --git a/clang/test/Driver/darwin-invalid-version-range.c b/clang/test/Driver/darwin-invalid-version-range.c
new file mode 100644
index 000000000000..84603aec1d2f
--- /dev/null
+++ b/clang/test/Driver/darwin-invalid-version-range.c
@@ -0,0 +1,29 @@
+/// This test validates that the various ways to assign an invalid deployment version are captured and detected.
+// REQUIRES: system-darwin && native
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// RUN: env SDKROOT=%t/iPhoneOS21.0.sdk not %clang -m64 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=SDKROOT
+
+// RUN: not %clang -isysroot %t/iPhoneOS21.0.sdk -m64 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=SYSROOT
+
+// RUN: not %clang -target arm64-apple-ios21 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=TARGET
+
+// RUN: not %clang -mtargetos=ios21 -arch arm64 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=MTARGET
+
+// RUN: env IPHONEOS_DEPLOYMENT_TARGET=21.0 not %clang -arch arm64 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=DEPLOY_VAR
+
+// SDKROOT:    error: invalid version number '21.0' inferred from '{{.*}}.sdk'
+// SYSROOT:    error: invalid version number '21.0' inferred from '{{.*}}.sdk'
+// TARGET:     error: invalid version number in '-target arm64-apple-ios21'
+// MTARGET:    error: invalid version number in '-mtargetos=ios21'
+// DEPLOY_VAR: error: invalid version number in 'IPHONEOS_DEPLOYMENT_TARGET=21.0'
+
+//--- iPhoneOS21.0.sdk/SDKSettings.json
+{"Version":"21.0", "MaximumDeploymentTarget": "21.0.99"}

From f6eaa2b00cc8d6421934cc92d4b210348809d700 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Thu, 12 Jun 2025 08:29:49 -0700
Subject: [PATCH 0156/1322] Reland "[clang-format][NFC] Clean up fillRanges()
 in ClangFormat.cpp" (#143477)

Reapply https://github.com/llvm/llvm-project/pull/143236 and fix the bug
reported in
https://github.com/llvm/llvm-project/pull/143236#issuecomment-2957102180.
---
 clang/tools/clang-format/ClangFormat.cpp | 52 +++++++++++-------------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index b22d3aaf3183..24ad3cb42254 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -244,17 +244,17 @@ static bool fillRanges(MemoryBuffer *Code,
   DiagnosticsEngine Diagnostics(
       IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs), DiagOpts);
   SourceManager Sources(Diagnostics, Files);
-  FileID ID = createInMemoryFile("<irrelevant>", *Code, Sources, Files,
-                                 InMemoryFileSystem.get());
+  const auto ID = createInMemoryFile("<irrelevant>", *Code, Sources, Files,
+                                     InMemoryFileSystem.get());
   if (!LineRanges.empty()) {
     if (!Offsets.empty() || !Lengths.empty()) {
       errs() << "error: cannot use -lines with -offset/-length\n";
       return true;
     }
 
-    for (unsigned i = 0, e = LineRanges.size(); i < e; ++i) {
+    for (const auto &LineRange : LineRanges) {
       unsigned FromLine, ToLine;
-      if (parseLineRange(LineRanges[i], FromLine, ToLine)) {
+      if (parseLineRange(LineRange, FromLine, ToLine)) {
         errs() << "error: invalid <start line>:<end line> pair\n";
         return true;
       }
@@ -266,12 +266,12 @@ static bool fillRanges(MemoryBuffer *Code,
         errs() << "error: start line should not exceed end line\n";
         return true;
       }
-      SourceLocation Start = Sources.translateLineCol(ID, FromLine, 1);
-      SourceLocation End = Sources.translateLineCol(ID, ToLine, UINT_MAX);
+      const auto Start = Sources.translateLineCol(ID, FromLine, 1);
+      const auto End = Sources.translateLineCol(ID, ToLine, UINT_MAX);
       if (Start.isInvalid() || End.isInvalid())
         return true;
-      unsigned Offset = Sources.getFileOffset(Start);
-      unsigned Length = Sources.getFileOffset(End) - Offset;
+      const auto Offset = Sources.getFileOffset(Start);
+      const auto Length = Sources.getFileOffset(End) - Offset;
       Ranges.push_back(tooling::Range(Offset, Length));
     }
     return false;
@@ -279,32 +279,28 @@ static bool fillRanges(MemoryBuffer *Code,
 
   if (Offsets.empty())
     Offsets.push_back(0);
-  if (Offsets.size() != Lengths.size() &&
-      !(Offsets.size() == 1 && Lengths.empty())) {
+  const bool EmptyLengths = Lengths.empty();
+  unsigned Length = 0;
+  if (Offsets.size() == 1 && EmptyLengths) {
+    Length = Sources.getFileOffset(Sources.getLocForEndOfFile(ID)) - Offsets[0];
+  } else if (Offsets.size() != Lengths.size()) {
     errs() << "error: number of -offset and -length arguments must match.\n";
     return true;
   }
-  for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
-    if (Offsets[i] >= Code->getBufferSize()) {
-      errs() << "error: offset " << Offsets[i] << " is outside the file\n";
+  for (unsigned I = 0, E = Offsets.size(), CodeSize = Code->getBufferSize();
+       I < E; ++I) {
+    const auto Offset = Offsets[I];
+    if (Offset >= CodeSize) {
+      errs() << "error: offset " << Offset << " is outside the file\n";
       return true;
     }
-    SourceLocation Start =
-        Sources.getLocForStartOfFile(ID).getLocWithOffset(Offsets[i]);
-    SourceLocation End;
-    if (i < Lengths.size()) {
-      if (Offsets[i] + Lengths[i] > Code->getBufferSize()) {
-        errs() << "error: invalid length " << Lengths[i]
-               << ", offset + length (" << Offsets[i] + Lengths[i]
-               << ") is outside the file.\n";
-        return true;
-      }
-      End = Start.getLocWithOffset(Lengths[i]);
-    } else {
-      End = Sources.getLocForEndOfFile(ID);
+    if (!EmptyLengths)
+      Length = Lengths[I];
+    if (Offset + Length > CodeSize) {
+      errs() << "error: invalid length " << Length << ", offset + length ("
+             << Offset + Length << ") is outside the file.\n";
+      return true;
     }
-    unsigned Offset = Sources.getFileOffset(Start);
-    unsigned Length = Sources.getFileOffset(End) - Offset;
     Ranges.push_back(tooling::Range(Offset, Length));
   }
   return false;

From f12b1ed11672bc40a53fb1180541b2fda6e7d9fc Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski@arm.com>
Date: Thu, 12 Jun 2025 16:35:36 +0100
Subject: [PATCH 0157/1322] [flang][OpenMP] Add TODOs for target
 [teams|parallel] private (#143706)

Using the private clause on `target teams` or `target parallel` is not
currently implemented and causes crashes during lowering. Add
appropriate TODOs.

Resolves https://github.com/llvm/llvm-project/issues/116428.

Signed-off-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
---
 flang/lib/Lower/OpenMP/OpenMP.cpp                   |  7 +++++++
 .../Lower/OpenMP/Todo/target-parallel-private.f90   | 13 +++++++++++++
 .../test/Lower/OpenMP/Todo/target-teams-private.f90 | 13 +++++++++++++
 3 files changed, 33 insertions(+)
 create mode 100644 flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/target-teams-private.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index c13fa471978d..82673f0948a5 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -4024,6 +4024,13 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
           parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(clause.id));
       TODO(clauseLocation, name + " clause is not implemented yet");
     }
+
+    if (std::holds_alternative<clause::Private>(clause.u) &&
+        origDirective == llvm::omp::Directive::OMPD_target_teams)
+      TODO(clauseLocation, "TARGET TEAMS PRIVATE is not implemented yet");
+    if (std::holds_alternative<clause::Private>(clause.u) &&
+        origDirective == llvm::omp::Directive::OMPD_target_parallel)
+      TODO(clauseLocation, "TARGET PARALLEL PRIVATE is not implemented yet");
   }
 
   llvm::omp::Directive directive =
diff --git a/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90 b/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
new file mode 100644
index 000000000000..e820143021f9
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
@@ -0,0 +1,13 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+!===============================================================================
+! `private` clause on `target parallel`
+!===============================================================================
+
+! CHECK: not yet implemented: TARGET PARALLEL PRIVATE is not implemented yet
+subroutine target_teams_private()
+integer, dimension(3) :: i
+!$omp target parallel private(i)
+!$omp end target parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/Todo/target-teams-private.f90 b/flang/test/Lower/OpenMP/Todo/target-teams-private.f90
new file mode 100644
index 000000000000..c8d998a5cbf9
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/target-teams-private.f90
@@ -0,0 +1,13 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+!===============================================================================
+! `private` clause on `target teams`
+!===============================================================================
+
+! CHECK: not yet implemented: TARGET TEAMS PRIVATE is not implemented yet
+subroutine target_teams_private()
+integer, dimension(3) :: i
+!$omp target teams private(i)
+!$omp end target teams
+end subroutine

From 13fe07d670e8a115929c9e595c4490ef5c75f583 Mon Sep 17 00:00:00 2001
From: tynasello-google <tynasello@google.com>
Date: Thu, 12 Jun 2025 08:39:28 -0700
Subject: [PATCH 0158/1322] [libc++] Expand Android libc++ test config files
 (#142846)

Parameterize (and rename) existing libc++/libc++abi test configuration
files for the Android NDK to work for both the NDK and platform.

Android LLVM downstream seeks to test libc++ for both the NDK and
platform build (currently only testing the NDK), which will use almost
identical test configuration files. The only difference is the name of
the libc++ shared object used. Because of this we parameterize the
current test files (for both libc++ and libc++abi) with the existing
LIBCXX_SHARED_OUTPUT_NAME cmake variable, and rename the file
accordingly.
---
 libcxx/cmake/caches/AndroidNDK.cmake                 |  4 ++--
 ...android-ndk.cfg.in => llvm-libc++-android.cfg.in} | 10 +++++-----
 ...roid-ndk.cfg.in => llvm-libc++abi-android.cfg.in} | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)
 rename libcxx/test/configs/{llvm-libc++-android-ndk.cfg.in => llvm-libc++-android.cfg.in} (83%)
 rename libcxxabi/test/configs/{llvm-libc++abi-android-ndk.cfg.in => llvm-libc++abi-android.cfg.in} (72%)

diff --git a/libcxx/cmake/caches/AndroidNDK.cmake b/libcxx/cmake/caches/AndroidNDK.cmake
index 298518781e9b..1a04b7fbb217 100644
--- a/libcxx/cmake/caches/AndroidNDK.cmake
+++ b/libcxx/cmake/caches/AndroidNDK.cmake
@@ -33,5 +33,5 @@ set(CMAKE_CXX_COMPILER_WORKS ON CACHE BOOL "")
 
 # Use adb to push tests to a locally-connected device (e.g. emulator) and run
 # them.
-set(LIBCXX_TEST_CONFIG "llvm-libc++-android-ndk.cfg.in" CACHE STRING "")
-set(LIBCXXABI_TEST_CONFIG "llvm-libc++abi-android-ndk.cfg.in" CACHE STRING "")
+set(LIBCXX_TEST_CONFIG "llvm-libc++-android.cfg.in" CACHE STRING "")
+set(LIBCXXABI_TEST_CONFIG "llvm-libc++abi-android.cfg.in" CACHE STRING "")
diff --git a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in b/libcxx/test/configs/llvm-libc++-android.cfg.in
similarity index 83%
rename from libcxx/test/configs/llvm-libc++-android-ndk.cfg.in
rename to libcxx/test/configs/llvm-libc++-android.cfg.in
index 31a07f647165..9362c68e8f7a 100644
--- a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-android.cfg.in
@@ -1,5 +1,5 @@
 # This testing configuration handles running the test suite against LLVM's
-# libc++ using adb and a libc++_shared.so library on Android.
+# libc++ using adb on Android.
 
 lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
 
@@ -27,11 +27,11 @@ if re.match(r'i686-linux-android(21|22|23)$', config.target_triple):
     compile_flags += ' -mstackrealign'
 config.substitutions.append(('%{compile_flags}', compile_flags))
 
-# The NDK library is called "libc++_shared.so". Use LD_LIBRARY_PATH to find
-# libc++_shared.so because older Bionic dynamic loaders don't support rpath
-# lookup.
+# The platform library is called "libc++.so" and the NDK library is called "libc++_shared.so". 
+# Use LD_LIBRARY_PATH to find the libcxx shared object because older Bionic dynamic loaders 
+# don't support rpath lookup.
 config.substitutions.append(('%{link_flags}',
-    '-nostdlib++ -L %{lib-dir} -lc++_shared'
+    '-nostdlib++ -L %{lib-dir} -l@LIBCXX_SHARED_OUTPUT_NAME@'
 ))
 config.substitutions.append(('%{exec}',
     '%{executor}' +
diff --git a/libcxxabi/test/configs/llvm-libc++abi-android-ndk.cfg.in b/libcxxabi/test/configs/llvm-libc++abi-android.cfg.in
similarity index 72%
rename from libcxxabi/test/configs/llvm-libc++abi-android-ndk.cfg.in
rename to libcxxabi/test/configs/llvm-libc++abi-android.cfg.in
index f2cb62a32d4e..bc5844661536 100644
--- a/libcxxabi/test/configs/llvm-libc++abi-android-ndk.cfg.in
+++ b/libcxxabi/test/configs/llvm-libc++abi-android.cfg.in
@@ -1,5 +1,5 @@
 # This testing configuration handles running the test suite against LLVM's
-# libc++abi using adb and a libc++_shared.so library on Android.
+# libc++abi using adb on Android.
 
 lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
 
@@ -19,12 +19,12 @@ config.substitutions.append(('%{compile_flags}',
     '-nostdinc++ -I %{include} -I %{cxx-include} -I %{cxx-target-include} %{maybe-include-libunwind} -I %{libcxx}/test/support -I %{libcxx}/src -D_LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS'
 ))
 
-# The NDK library is called "libc++_shared.so". Use LD_LIBRARY_PATH to find
-# libc++_shared.so because older Bionic dynamic loaders don't support rpath
-# lookup. The Android libc++ shared library exports libc++abi, so we don't need
-# to link with -lc++abi.
+# The platform library is called "libc++.so" and the NDK library is called "libc++_shared.so". 
+# Use LD_LIBRARY_PATH to find the libcxx shared object because older Bionic dynamic loaders 
+# don't support rpath lookup. The Android libc++ shared library exports libc++abi, so we 
+# don't need to link with -lc++abi.
 config.substitutions.append(('%{link_flags}',
-    '-nostdlib++ -L %{lib} -lc++_shared'
+    '-nostdlib++ -L %{lib} -l@LIBCXX_SHARED_OUTPUT_NAME@'
 ))
 config.substitutions.append(('%{exec}',
     '%{executor}' +

From 1c1df94d09820959c771cb4aaae4d36cdf5cab5a Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Thu, 12 Jun 2025 16:48:57 +0100
Subject: [PATCH 0159/1322] [lldb][Commands][NFC] Extract memory find
 expression evaluation into helpers (#143686)

This patch factors out the `-e` option logic into two helper functions.
The `EvaluateExpression` helper might seem redundant but I'll be adding
to it in a follow-up patch to fix an issue when running `memory find -e`
for Swift targets.

Also adds test coverage for the error cases that were previously
untested.

rdar://152113525
---
 lldb/source/Commands/CommandObjectMemory.cpp  | 101 ++++++++++--------
 .../memory/find/TestMemoryFind.py             |  41 +++++++
 .../API/functionalities/memory/find/main.cpp  |  15 +++
 3 files changed, 114 insertions(+), 43 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp
index 7140333bb3cd..85ae9f8f9e8c 100644
--- a/lldb/source/Commands/CommandObjectMemory.cpp
+++ b/lldb/source/Commands/CommandObjectMemory.cpp
@@ -885,6 +885,52 @@ protected:
 #define LLDB_OPTIONS_memory_find
 #include "CommandOptions.inc"
 
+static llvm::Error CopyExpressionResult(ValueObject &result,
+                                        DataBufferHeap &buffer) {
+  uint64_t value = result.GetValueAsUnsigned(0);
+  auto size_or_err = result.GetCompilerType().GetByteSize(nullptr);
+  if (!size_or_err)
+    return size_or_err.takeError();
+
+  switch (*size_or_err) {
+  case 1: {
+    uint8_t byte = (uint8_t)value;
+    buffer.CopyData(&byte, 1);
+  } break;
+  case 2: {
+    uint16_t word = (uint16_t)value;
+    buffer.CopyData(&word, 2);
+  } break;
+  case 4: {
+    uint32_t lword = (uint32_t)value;
+    buffer.CopyData(&lword, 4);
+  } break;
+  case 8: {
+    buffer.CopyData(&value, 8);
+  } break;
+  default:
+    return llvm::createStringError(
+        "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are "
+        "supported. For other pattern sizes the --string (-s) option may be "
+        "used.");
+  }
+
+  return llvm::Error::success();
+}
+
+static llvm::Expected<ValueObjectSP>
+EvaluateExpression(llvm::StringRef expression, StackFrame &frame,
+                   Process &process) {
+  ValueObjectSP result_sp;
+  auto status =
+      process.GetTarget().EvaluateExpression(expression, &frame, result_sp);
+  if (status != eExpressionCompleted || !result_sp)
+    return llvm::createStringError(
+        "expression evaluation failed. pass a string instead");
+
+  return result_sp;
+}
+
 // Find the specified data in memory
 class CommandObjectMemoryFind : public CommandObjectParsed {
 public:
@@ -1026,49 +1072,18 @@ protected:
       }
       buffer.CopyData(str);
     } else if (m_memory_options.m_expr.OptionWasSet()) {
-      StackFrame *frame = m_exe_ctx.GetFramePtr();
-      ValueObjectSP result_sp;
-      if ((eExpressionCompleted ==
-           process->GetTarget().EvaluateExpression(
-               m_memory_options.m_expr.GetValueAs<llvm::StringRef>().value_or(
-                   ""),
-               frame, result_sp)) &&
-          result_sp) {
-        uint64_t value = result_sp->GetValueAsUnsigned(0);
-        std::optional<uint64_t> size = llvm::expectedToOptional(
-            result_sp->GetCompilerType().GetByteSize(nullptr));
-        if (!size)
-          return;
-        switch (*size) {
-        case 1: {
-          uint8_t byte = (uint8_t)value;
-          buffer.CopyData(&byte, 1);
-        } break;
-        case 2: {
-          uint16_t word = (uint16_t)value;
-          buffer.CopyData(&word, 2);
-        } break;
-        case 4: {
-          uint32_t lword = (uint32_t)value;
-          buffer.CopyData(&lword, 4);
-        } break;
-        case 8: {
-          buffer.CopyData(&value, 8);
-        } break;
-        case 3:
-        case 5:
-        case 6:
-        case 7:
-          result.AppendError("unknown type. pass a string instead");
-          return;
-        default:
-          result.AppendError(
-              "result size larger than 8 bytes. pass a string instead");
-          return;
-        }
-      } else {
-        result.AppendError(
-            "expression evaluation failed. pass a string instead");
+      auto result_or_err = EvaluateExpression(
+          m_memory_options.m_expr.GetValueAs<llvm::StringRef>().value_or(""),
+          m_exe_ctx.GetFrameRef(), *process);
+      if (!result_or_err) {
+        result.AppendError(llvm::toString(result_or_err.takeError()));
+        return;
+      }
+
+      ValueObjectSP result_sp = *result_or_err;
+
+      if (auto err = CopyExpressionResult(*result_sp, buffer)) {
+        result.AppendError(llvm::toString(std::move(err)));
         return;
       }
     } else {
diff --git a/lldb/test/API/functionalities/memory/find/TestMemoryFind.py b/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
index 09611cc80877..72426e75e013 100644
--- a/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
+++ b/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
@@ -79,3 +79,44 @@ class MemoryFindTestCase(TestBase):
             'memory find -s "nothere" `stringdata` `stringdata+10`',
             substrs=["data not found within the range."],
         )
+
+        # Expression results with unsupported result types.
+        self.expect(
+            'memory find -e "ThreeBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
+
+        self.expect(
+            'memory find -e "FiveBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
+
+        self.expect(
+            'memory find -e "SixBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
+
+        self.expect(
+            'memory find -e "SevenBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
+
+        self.expect(
+            'memory find -e "NineBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
diff --git a/lldb/test/API/functionalities/memory/find/main.cpp b/lldb/test/API/functionalities/memory/find/main.cpp
index e3dcfc762ee0..15c8df1a9fcf 100644
--- a/lldb/test/API/functionalities/memory/find/main.cpp
+++ b/lldb/test/API/functionalities/memory/find/main.cpp
@@ -1,9 +1,24 @@
 #include <stdio.h>
 #include <stdint.h>
 
+template <size_t T> struct [[gnu::packed]] Payload {
+  uint8_t data[T];
+};
+
+using ThreeBytes = Payload<3>;
+using FiveBytes = Payload<5>;
+using SixBytes = Payload<5>;
+using SevenBytes = Payload<7>;
+using NineBytes = Payload<9>;
+
 int main (int argc, char const *argv[])
 {
     const char* stringdata = "hello world; I like to write text in const char pointers";
     uint8_t bytedata[] = {0xAA,0xBB,0xCC,0xDD,0xEE,0xFF,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99};
+    ThreeBytes b1;
+    FiveBytes b2;
+    SixBytes b3;
+    SevenBytes b4;
+    NineBytes b5;
     return 0; // break here
 }

From 2a905dd1ebb46a6865b1f4743589b50cdb2cb4f0 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Thu, 12 Jun 2025 08:41:54 -0700
Subject: [PATCH 0160/1322] [Matrix] Use range-for in Visit* Result
 construction. NFC

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index ce6eaa292d8f..b32160ff275b 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -19,6 +19,7 @@
 
 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -1223,7 +1224,7 @@ public:
       MatrixTy M = getMatrix(Inst->getOperand(0), SI, Builder);
       Builder.setFastMathFlags(getFastMathFlags(Inst));
 
-      for (auto &Vector : M.vectors()) {
+      for (auto *Vector : M.vectors()) {
         switch (Inst->getIntrinsicID()) {
         case Intrinsic::abs:
           Result.addVector(Builder.CreateBinaryIntrinsic(Intrinsic::abs, Vector,
@@ -2256,9 +2257,8 @@ public:
 
     Builder.setFastMathFlags(getFastMathFlags(Inst));
 
-    for (unsigned I = 0; I < SI.getNumVectors(); ++I)
-      Result.addVector(Builder.CreateBinOp(Inst->getOpcode(), A.getVector(I),
-                                           B.getVector(I)));
+    for (auto [AV, BV] : llvm::zip_equal(A.vectors(), B.vectors()))
+      Result.addVector(Builder.CreateBinOp(Inst->getOpcode(), AV, BV));
 
     return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                    Result.getNumVectors());
@@ -2285,8 +2285,8 @@ public:
       }
     };
 
-    for (unsigned I = 0; I < SI.getNumVectors(); ++I)
-      Result.addVector(BuildVectorOp(M.getVector(I)));
+    for (auto *Vector : M.vectors())
+      Result.addVector(BuildVectorOp(Vector));
 
     return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                    Result.getNumVectors());
@@ -2307,7 +2307,7 @@ public:
     auto *NewVTy = VectorType::get(OrigVTy->getElementType(),
                                    ElementCount::getFixed(M.getStride()));
 
-    for (auto &Vector : M.vectors())
+    for (auto *Vector : M.vectors())
       Result.addVector(Builder.CreateCast(Inst->getOpcode(), Vector, NewVTy));
 
     return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
@@ -2336,9 +2336,8 @@ public:
       CondV[1] = Cond;
     }
 
-    for (unsigned I = 0, E = Shape.getNumVectors(); I != E; ++I)
-      Result.addVector(
-          Builder.CreateSelect(CondV[I], A.getVector(I), B.getVector(I)));
+    for (auto [CV, AV, BV] : llvm::zip_equal(CondV, A.vectors(), B.vectors()))
+      Result.addVector(Builder.CreateSelect(CV, AV, BV));
 
     return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                    Result.getNumVectors());

From 316f530724ee2e870886e75729799afbcc1ff8d3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 16:51:09 +0100
Subject: [PATCH 0161/1322] [X86] getTargetConstantBitsFromNode - handle
 EXTRACT_SUBVECTOR through bitcasts (#143886)

Generalize the extraction index/width to account for any changes in type through bitcasts
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 36 ++++++++--------
 .../zero_extend_vector_inreg_of_broadcast.ll  | 41 ++++++++-----------
 ...d_vector_inreg_of_broadcast_from_memory.ll | 30 ++++++--------
 3 files changed, 49 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f0fbf55e97be..b4670e270141 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5242,25 +5242,25 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
   }
 
   // Extract constant bits from a subvector's source.
-  if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-    // TODO - support extract_subvector through bitcasts.
-    if (EltSizeInBits != VT.getScalarSizeInBits())
-      return false;
+  if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
+                                    EltBits, AllowWholeUndefs,
+                                    AllowPartialUndefs)) {
+    EVT SrcVT = Op.getOperand(0).getValueType();
+    unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
+    unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
+    unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
+    unsigned BaseIdx = BaseOfs / EltSizeInBits;
+    assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
+           (VT.getSizeInBits() % EltSizeInBits) == 0 &&
+           (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
 
-    if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
-                                      UndefElts, EltBits, AllowWholeUndefs,
-                                      AllowPartialUndefs)) {
-      EVT SrcVT = Op.getOperand(0).getValueType();
-      unsigned NumSrcElts = SrcVT.getVectorNumElements();
-      unsigned NumSubElts = VT.getVectorNumElements();
-      unsigned BaseIdx = Op.getConstantOperandVal(1);
-      UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
-      if ((BaseIdx + NumSubElts) != NumSrcElts)
-        EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
-      if (BaseIdx != 0)
-        EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
-      return true;
-    }
+    UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
+    if ((BaseIdx + NumSubElts) != NumSrcElts)
+      EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
+    if (BaseIdx != 0)
+      EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
+    return true;
   }
 
   // Extract constant bits from shuffle node sources.
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 45ccc39fb254..ed53c3693c9d 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -3567,14 +3567,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
 ; AVX-NEXT:    retq
 ;
@@ -3757,14 +3756,14 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
@@ -3955,10 +3954,9 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
@@ -4181,17 +4179,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -4379,10 +4376,9 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
 ; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
@@ -4517,10 +4513,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 31920d8348fb..239472c5cd1c 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -2868,14 +2868,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
 ; AVX-NEXT:    vmovdqa %xmm2, 32(%rdx)
 ; AVX-NEXT:    retq
 ;
@@ -2986,7 +2985,8 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
@@ -3135,9 +3135,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX-NEXT:    vpaddb 32(%rsi), %xmm0, %xmm2
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
@@ -3319,13 +3318,12 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
 ; AVX-NEXT:    vmovdqa %xmm2, 32(%rdx)
 ; AVX-NEXT:    retq
 ;
@@ -3469,9 +3467,8 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpaddb 32(%rsi), %xmm0, %xmm2
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
@@ -3584,9 +3581,8 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vmovaps 32(%rsi), %ymm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vmovaps %ymm2, 32(%rdx)

From a53003fe23cb6c871e72d70ff2d3a075a7490da2 Mon Sep 17 00:00:00 2001
From: kotborealis <kotborealis@awooo.ru>
Date: Thu, 12 Jun 2025 18:51:22 +0300
Subject: [PATCH 0162/1322] [libc++] Update GDB pretty-printer to work with GDB
 17 (#142106)

This patch fixes an issue in libcxx/utils/gdb/libcxx/printers.py.

With gdb 17 (binutils 2_44) pretty-printers do not work anymore because
calls to `gdb.printing` requires `import gdb.printing` statement, which
was missing from the `printers.py`.

This was broken after commit https://github.com/bminor/binutils-gdb/commit/fc14343205d3a
and `import gdb.printing` was first referenced in https://github.com/bminor/binutils-gdb/commit/ee06c79b0f.

Co-authored-by: Dmitry Chestnykh <dm.chestnykh@gmail.com>
---
 libcxx/utils/gdb/libcxx/printers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libcxx/utils/gdb/libcxx/printers.py b/libcxx/utils/gdb/libcxx/printers.py
index e3d5d87aca32..90bc54d987ee 100644
--- a/libcxx/utils/gdb/libcxx/printers.py
+++ b/libcxx/utils/gdb/libcxx/printers.py
@@ -14,6 +14,7 @@ from __future__ import print_function
 
 import re
 import gdb
+import gdb.printing
 
 # One under-documented feature of the gdb pretty-printer API
 # is that clients can call any other member of the API

From 882b58a90ae0c4a91e1ecda6df3767b0fc44dab1 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Thu, 12 Jun 2025 12:12:01 -0400
Subject: [PATCH 0163/1322] [DirectX] Reland #142853 with Circular GEP fixes
 (#143747)

This change relands  https://github.com/llvm/llvm-project/pull/142853
It fixes the circular reference issue we were seeing in GEPs
ex `%.flat = getelementptr inbounds [16 x i32], ptr %.flat, i32 0, i32
15`
---
 llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 24 +++--
 llvm/test/CodeGen/DirectX/flatten-array.ll    | 18 ++--
 .../CodeGen/DirectX/flatten-bug-117273.ll     |  4 +-
 .../DirectX/llc-vector-load-scalarize.ll      | 88 +++++++++----------
 .../test/CodeGen/DirectX/scalar-bug-117273.ll |  4 +-
 5 files changed, 74 insertions(+), 64 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index a3163a896964..b1f3f41a28e8 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -42,7 +42,7 @@ public:
 
 struct GEPData {
   ArrayType *ParentArrayType;
-  Value *ParendOperand;
+  Value *ParentOperand;
   SmallVector<Value *> Indices;
   SmallVector<uint64_t> Dims;
   bool AllIndicesAreConstInt;
@@ -211,7 +211,7 @@ bool DXILFlattenArraysVisitor::visitAllocaInst(AllocaInst &AI) {
 
   ArrayType *FattenedArrayType = ArrayType::get(BaseType, TotalElements);
   AllocaInst *FlatAlloca =
-      Builder.CreateAlloca(FattenedArrayType, nullptr, AI.getName() + ".flat");
+      Builder.CreateAlloca(FattenedArrayType, nullptr, AI.getName() + ".1dim");
   FlatAlloca->setAlignment(AI.getAlign());
   AI.replaceAllUsesWith(FlatAlloca);
   AI.eraseFromParent();
@@ -222,6 +222,10 @@ void DXILFlattenArraysVisitor::recursivelyCollectGEPs(
     GetElementPtrInst &CurrGEP, ArrayType *FlattenedArrayType,
     Value *PtrOperand, unsigned &GEPChainUseCount, SmallVector<Value *> Indices,
     SmallVector<uint64_t> Dims, bool AllIndicesAreConstInt) {
+  // Check if this GEP is already in the map to avoid circular references
+  if (GEPChainMap.count(&CurrGEP) > 0)
+    return;
+
   Value *LastIndex = CurrGEP.getOperand(CurrGEP.getNumOperands() - 1);
   AllIndicesAreConstInt &= isa<ConstantInt>(LastIndex);
   Indices.push_back(LastIndex);
@@ -271,9 +275,19 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChainBase(
         genInstructionFlattenIndices(GEPInfo.Indices, GEPInfo.Dims, Builder);
 
   ArrayType *FlattenedArrayType = GEPInfo.ParentArrayType;
-  Value *FlatGEP =
-      Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParendOperand, FlatIndex,
-                        GEP.getName() + ".flat", GEP.isInBounds());
+
+  // Don't append '.flat' to an empty string. If the SSA name isn't available
+  // it could conflict with the ParentOperand's name.
+  std::string FlatName = GEP.hasName() ? GEP.getName().str() + ".flat" : "";
+
+  Value *FlatGEP = Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParentOperand,
+                                     {Builder.getInt32(0), FlatIndex}, FlatName,
+                                     GEP.getNoWrapFlags());
+
+  // Note: Old gep will become an invalid instruction after replaceAllUsesWith.
+  // Erase the old GEP in the map before to avoid invalid instructions
+  // and circular references.
+  GEPChainMap.erase(&GEP);
 
   GEP.replaceAllUsesWith(FlatGEP);
   GEP.eraseFromParent();
diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll
index 754d5a25ca90..5c761014d471 100644
--- a/llvm/test/CodeGen/DirectX/flatten-array.ll
+++ b/llvm/test/CodeGen/DirectX/flatten-array.ll
@@ -31,7 +31,7 @@ define void @alloca_4d_test ()  {
 ; CHECK-LABEL: gep_2d_test
 define void @gep_2d_test ()  {
     ; CHECK: [[a:%.*]] = alloca [9 x i32], align 4
-    ; CHECK-COUNT-9: getelementptr inbounds [9 x i32], ptr [[a]], i32 {{[0-8]}}
+    ; CHECK-COUNT-9: getelementptr inbounds [9 x i32], ptr [[a]], i32 0, i32 {{[0-8]}}
     ; CHECK-NEXT:    ret void
     %1 = alloca [3 x [3 x i32]], align 4
     %g2d0 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* %1, i32 0, i32 0
@@ -53,7 +53,7 @@ define void @gep_2d_test ()  {
 ; CHECK-LABEL: gep_3d_test
 define void @gep_3d_test ()  {
     ; CHECK: [[a:%.*]] = alloca [8 x i32], align 4
-    ; CHECK-COUNT-8: getelementptr inbounds [8 x i32], ptr [[a]], i32 {{[0-7]}}
+    ; CHECK-COUNT-8: getelementptr inbounds [8 x i32], ptr [[a]], i32 0, i32 {{[0-7]}}
     ; CHECK-NEXT:    ret void
     %1 = alloca [2 x[2 x [2 x i32]]], align 4
     %g3d0 = getelementptr inbounds [2 x[2 x [2 x i32]]], [2 x[2 x [2 x i32]]]* %1, i32 0, i32 0
@@ -76,7 +76,7 @@ define void @gep_3d_test ()  {
 ; CHECK-LABEL: gep_4d_test
 define void @gep_4d_test ()  {
     ; CHECK: [[a:%.*]] = alloca [16 x i32], align 4
-    ; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[a]], i32 {{[0-9]|1[0-5]}}
+    ; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[a]], i32 0, i32 {{[0-9]|1[0-5]}}
     ; CHECK-NEXT:    ret void
     %1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4
     %g4d0 = getelementptr inbounds [2x[2 x[2 x [2 x i32]]]], [2x[2 x[2 x [2 x i32]]]]* %1, i32 0, i32 0
@@ -123,8 +123,8 @@ define void @gep_4d_test ()  {
 @b = internal global [2 x [3 x [4 x i32]]] zeroinitializer, align 16
 
 define void @global_gep_load() {
-  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 6
-  ; CHECK: load i32, ptr [[GEP_PTR]], align 4
+  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 6
+  ; CHECK-NEXT: load i32, ptr [[GEP_PTR]], align 4
   ; CHECK-NEXT:    ret void
   %1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @a, i32 0, i32 0
   %2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 1
@@ -142,7 +142,7 @@ define void @global_gep_load_index(i32 %row, i32 %col, i32 %timeIndex) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[ROW]], 12
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 [[TMP6]]
+; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 [[TMP6]]
 ; CHECK-NOT: getelementptr inbounds [2 x [3 x [4 x i32]]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [3 x [4 x i32]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [4 x i32]{{.*}}
@@ -163,7 +163,7 @@ define void @global_incomplete_gep_chain(i32 %row, i32 %col) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 0, [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[ROW]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 [[TMP4]]
+; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 [[TMP4]]
 ; CHECK-NOT: getelementptr inbounds [2 x [3 x [4 x i32]]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [3 x [4 x i32]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [4 x i32]{{.*}}
@@ -177,8 +177,8 @@ define void @global_incomplete_gep_chain(i32 %row, i32 %col) {
 }
 
 define void @global_gep_store() {
-  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @b.1dim, i32 13
-  ; CHECK:  store i32 1, ptr [[GEP_PTR]], align 4
+  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @b.1dim, i32 0, i32 13
+  ; CHECK-NEXT: store i32 1, ptr [[GEP_PTR]], align 4
   ; CHECK-NEXT:    ret void
   %1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @b, i32 0, i32 1
   %2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 0
diff --git a/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll b/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll
index 3ae5832ce832..c73e5017348d 100644
--- a/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll
+++ b/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll
@@ -8,9 +8,9 @@
 define internal void @main() {
 ; CHECK-LABEL: define internal void @main() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 1
 ; CHECK-NEXT:    [[DOTI0:%.*]] = load float, ptr [[TMP0]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 2
 ; CHECK-NEXT:    [[DOTI03:%.*]] = load float, ptr [[TMP1]], align 16
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
index 7e5a92e1311f..c960aad3d262 100644
--- a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
+++ b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
@@ -32,23 +32,23 @@ define <4 x i32> @load_array_vec_test() #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 3) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), i32 1) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4
-; CHECK-NEXT:    [[DOTI12:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 1), i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI13:%.*]] = load i32, ptr addrspace(3) [[DOTI12]], align 4
-; CHECK-NEXT:    [[DOTI24:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 1), i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI25:%.*]] = load i32, ptr addrspace(3) [[DOTI24]], align 4
-; CHECK-NEXT:    [[DOTI36:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 1), i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI37:%.*]] = load i32, ptr addrspace(3) [[DOTI36]], align 4
-; CHECK-NEXT:    [[DOTI08:%.*]] = add i32 [[TMP2]], [[TMP12]]
-; CHECK-NEXT:    [[DOTI19:%.*]] = add i32 [[TMP4]], [[DOTI13]]
-; CHECK-NEXT:    [[DOTI210:%.*]] = add i32 [[TMP6]], [[DOTI25]]
-; CHECK-NEXT:    [[DOTI311:%.*]] = add i32 [[TMP8]], [[DOTI37]]
-; CHECK-NEXT:    [[DOTUPTO015:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI08]], i32 0
-; CHECK-NEXT:    [[DOTUPTO116:%.*]] = insertelement <4 x i32> [[DOTUPTO015]], i32 [[DOTI19]], i32 1
-; CHECK-NEXT:    [[DOTUPTO217:%.*]] = insertelement <4 x i32> [[DOTUPTO116]], i32 [[DOTI210]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[DOTUPTO217]], i32 [[DOTI311]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), i32 2) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), i32 3) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4
+; CHECK-NEXT:    [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]]
+; CHECK-NEXT:    [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[DOTI27:%.*]] = add i32 [[TMP6]], [[TMP14]]
+; CHECK-NEXT:    [[DOTI38:%.*]] = add i32 [[TMP8]], [[TMP16]]
+; CHECK-NEXT:    [[DOTUPTO01215:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI05]], i32 0
+; CHECK-NEXT:    [[DOTUPTO11316:%.*]] = insertelement <4 x i32> [[DOTUPTO01215]], i32 [[DOTI16]], i32 1
+; CHECK-NEXT:    [[DOTUPTO21417:%.*]] = insertelement <4 x i32> [[DOTUPTO11316]], i32 [[DOTI27]], i32 2
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[DOTUPTO21417]], i32 [[DOTI38]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP17]]
 ;
   %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([2 x <4 x i32>], [2 x <4 x i32>] addrspace(3)* @"arrayofVecData", i32 0, i32 0), align 4
   %2 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([2 x <4 x i32>], [2 x <4 x i32>] addrspace(3)* @"arrayofVecData", i32 0, i32 1), align 4
@@ -81,23 +81,19 @@ define <4 x i32> @load_vec_test() #0 {
 define <4 x i32> @load_static_array_of_vec_test(i32 %index) #0 {
 ; CHECK-LABEL: define <4 x i32> @load_static_array_of_vec_test(
 ; CHECK-SAME: i32 [[INDEX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast ptr [[DOTFLAT]] to ptr
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast ptr [[DOTFLAT]] to ptr
-; CHECK-NEXT:    [[DOTFLAT_I1:%.*]] = getelementptr i32, ptr [[TMP3]], i32 1
+; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[INDEX]]
+; CHECK-NEXT:    [[DOTI0:%.*]] = load i32, ptr [[DOTFLAT]], align 4
+; CHECK-NEXT:    [[DOTFLAT_I1:%.*]] = getelementptr i32, ptr [[DOTFLAT]], i32 1
 ; CHECK-NEXT:    [[DOTI1:%.*]] = load i32, ptr [[DOTFLAT_I1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast ptr [[DOTFLAT]] to ptr
-; CHECK-NEXT:    [[DOTFLAT_I2:%.*]] = getelementptr i32, ptr [[TMP4]], i32 2
+; CHECK-NEXT:    [[DOTFLAT_I2:%.*]] = getelementptr i32, ptr [[DOTFLAT]], i32 2
 ; CHECK-NEXT:    [[DOTI2:%.*]] = load i32, ptr [[DOTFLAT_I2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast ptr [[DOTFLAT]] to ptr
-; CHECK-NEXT:    [[DOTFLAT_I3:%.*]] = getelementptr i32, ptr [[TMP5]], i32 3
+; CHECK-NEXT:    [[DOTFLAT_I3:%.*]] = getelementptr i32, ptr [[DOTFLAT]], i32 3
 ; CHECK-NEXT:    [[DOTI3:%.*]] = load i32, ptr [[DOTFLAT_I3]], align 4
-; CHECK-NEXT:    [[DOTUPTO0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[DOTUPTO1:%.*]] = insertelement <4 x i32> [[DOTUPTO0]], i32 [[DOTI1]], i32 1
-; CHECK-NEXT:    [[DOTUPTO2:%.*]] = insertelement <4 x i32> [[DOTUPTO1]], i32 [[DOTI2]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[DOTUPTO2]], i32 [[DOTI3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
+; CHECK-NEXT:    [[DOTUPTO01:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI0]], i32 0
+; CHECK-NEXT:    [[DOTUPTO12:%.*]] = insertelement <4 x i32> [[DOTUPTO01]], i32 [[DOTI1]], i32 1
+; CHECK-NEXT:    [[DOTUPTO23:%.*]] = insertelement <4 x i32> [[DOTUPTO12]], i32 [[DOTI2]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[DOTUPTO23]], i32 [[DOTI3]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %3 = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* @staticArrayOfVecData, i32 0, i32 %index
   %4 = load <4 x i32>, <4 x i32>* %3, align 4
@@ -115,23 +111,23 @@ define <4 x i32> @multid_load_test() #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 1) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4
-; CHECK-NEXT:    [[DOTI12:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI13:%.*]] = load i32, ptr addrspace(3) [[DOTI12]], align 4
-; CHECK-NEXT:    [[DOTI24:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI25:%.*]] = load i32, ptr addrspace(3) [[DOTI24]], align 4
-; CHECK-NEXT:    [[DOTI36:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI37:%.*]] = load i32, ptr addrspace(3) [[DOTI36]], align 4
-; CHECK-NEXT:    [[DOTI08:%.*]] = add i32 [[TMP2]], [[TMP12]]
-; CHECK-NEXT:    [[DOTI19:%.*]] = add i32 [[TMP4]], [[DOTI13]]
-; CHECK-NEXT:    [[DOTI210:%.*]] = add i32 [[TMP6]], [[DOTI25]]
-; CHECK-NEXT:    [[DOTI311:%.*]] = add i32 [[TMP8]], [[DOTI37]]
-; CHECK-NEXT:    [[DOTUPTO015:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI08]], i32 0
-; CHECK-NEXT:    [[DOTUPTO116:%.*]] = insertelement <4 x i32> [[DOTUPTO015]], i32 [[DOTI19]], i32 1
-; CHECK-NEXT:    [[DOTUPTO217:%.*]] = insertelement <4 x i32> [[DOTUPTO116]], i32 [[DOTI210]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[DOTUPTO217]], i32 [[DOTI311]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 2) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 3) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4
+; CHECK-NEXT:    [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]]
+; CHECK-NEXT:    [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[DOTI27:%.*]] = add i32 [[TMP6]], [[TMP14]]
+; CHECK-NEXT:    [[DOTI38:%.*]] = add i32 [[TMP8]], [[TMP16]]
+; CHECK-NEXT:    [[DOTUPTO01215:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI05]], i32 0
+; CHECK-NEXT:    [[DOTUPTO11316:%.*]] = insertelement <4 x i32> [[DOTUPTO01215]], i32 [[DOTI16]], i32 1
+; CHECK-NEXT:    [[DOTUPTO21417:%.*]] = insertelement <4 x i32> [[DOTUPTO11316]], i32 [[DOTI27]], i32 2
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[DOTUPTO21417]], i32 [[DOTI38]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP17]]
 ;
   %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 0, i32 0), align 4
   %2 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 1, i32 1), align 4
diff --git a/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll b/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll
index 2676abec1d8a..a07ce2c24f7a 100644
--- a/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll
+++ b/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll
@@ -8,13 +8,13 @@
 define internal void @main() #1 {
 ; CHECK-LABEL: define internal void @main() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 1
 ; CHECK-NEXT:    [[DOTI0:%.*]] = load float, ptr [[TMP0]], align 16
 ; CHECK-NEXT:    [[DOTI1:%.*]] = getelementptr float, ptr [[TMP0]], i32 1
 ; CHECK-NEXT:    [[DOTI11:%.*]] = load float, ptr [[DOTI1]], align 4
 ; CHECK-NEXT:    [[DOTI2:%.*]] = getelementptr float, ptr [[TMP0]], i32 2
 ; CHECK-NEXT:    [[DOTI22:%.*]] = load float, ptr [[DOTI2]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 2
 ; CHECK-NEXT:    [[DOTI03:%.*]] = load float, ptr [[TMP1]], align 16
 ; CHECK-NEXT:    [[DOTI14:%.*]] = getelementptr float, ptr [[TMP1]], i32 1
 ; CHECK-NEXT:    [[DOTI15:%.*]] = load float, ptr [[DOTI14]], align 4

From ef1cb8277ac3cb34ce9700a313ed60410dd9f84b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche@google.com>
Date: Thu, 12 Jun 2025 18:13:29 +0200
Subject: [PATCH 0164/1322] [SPIR-V] Fix ExecutionMode generation (#143888)

PR #141787 added code to emit the Fragment execution model. This
required emitting the OriginUpperLeft ExecutionMode. But this was done
by using the same codepath used for OpEntrypoint.

This has 2 issues:
- the interface variables were added to both OpEntryPoint and
OpExecutionMode.
- the existing OpExecutionMode logic was not used.

This commit fixes this, regrouping OpExecutionMode handling in one
place, and fixing bad codegen issue when interface variiables are added.
---
 llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp        | 16 ++++++++++++++++
 llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp      | 13 +------------
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp    |  2 --
 .../test/CodeGen/SPIRV/ExecutionMode_Fragment.ll | 14 +++++++++++---
 4 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index d4becc286504..26b94788b810 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -510,6 +510,22 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
       continue;
     MCRegister FReg = MAI->getFuncReg(&F);
     assert(FReg.isValid());
+
+    if (Attribute Attr = F.getFnAttribute("hlsl.shader"); Attr.isValid()) {
+      // SPIR-V common validation: Fragment requires OriginUpperLeft or
+      // OriginLowerLeft.
+      // VUID-StandaloneSpirv-OriginLowerLeft-04653: Fragment must declare
+      // OriginUpperLeft.
+      if (Attr.getValueAsString() == "pixel") {
+        MCInst Inst;
+        Inst.setOpcode(SPIRV::OpExecutionMode);
+        Inst.addOperand(MCOperand::createReg(FReg));
+        unsigned EM =
+            static_cast<unsigned>(SPIRV::ExecutionMode::OriginUpperLeft);
+        Inst.addOperand(MCOperand::createImm(EM));
+        outputMCInst(Inst);
+      }
+    }
     if (MDNode *Node = F.getMetadata("reqd_work_group_size"))
       outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::LocalSize,
                                     3, 1);
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 091368a309a8..36cc5cbe655b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -475,21 +475,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
     // environment if we need to.
     const SPIRVSubtarget *ST =
         static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
-    SPIRV::ExecutionModel::ExecutionModel ExecutionModel =
-        getExecutionModel(*ST, F);
     auto MIB = MIRBuilder.buildInstr(SPIRV::OpEntryPoint)
-                   .addImm(static_cast<uint32_t>(ExecutionModel))
+                   .addImm(static_cast<uint32_t>(getExecutionModel(*ST, F)))
                    .addUse(FuncVReg);
     addStringImm(F.getName(), MIB);
-
-    if (ExecutionModel == SPIRV::ExecutionModel::Fragment) {
-      // SPIR-V common validation: Fragment requires OriginUpperLeft or
-      // OriginLowerLeft VUID-StandaloneSpirv-OriginLowerLeft-04653: Fragment
-      // must declare OriginUpperLeft.
-      MIRBuilder.buildInstr(SPIRV::OpExecutionMode)
-          .addUse(FuncVReg)
-          .addImm(static_cast<uint32_t>(SPIRV::ExecutionMode::OriginUpperLeft));
-    }
   } else if (F.getLinkage() != GlobalValue::InternalLinkage &&
              F.getLinkage() != GlobalValue::PrivateLinkage) {
     SPIRV::LinkageType::LinkageType LnkTy =
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 2ddd028c7941..b71a9dd68dd4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -595,8 +595,6 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
           collectOtherInstr(MI, MAI, SPIRV::MB_DebugNames, IS);
         } else if (OpCode == SPIRV::OpEntryPoint) {
           collectOtherInstr(MI, MAI, SPIRV::MB_EntryPoints, IS);
-        } else if (OpCode == SPIRV::OpExecutionMode) {
-          collectOtherInstr(MI, MAI, SPIRV::MB_EntryPoints, IS);
         } else if (TII->isAliasingInstr(MI)) {
           collectOtherInstr(MI, MAI, SPIRV::MB_AliasingInsts, IS);
         } else if (TII->isDecorationInstr(MI)) {
diff --git a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
index 0a62db446cc1..4fa764fe192d 100644
--- a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
+++ b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
@@ -1,12 +1,20 @@
 ; RUN: llc -O0 -mtriple=spirv-unknown-vulkan1.3-pixel %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan1.3-pixel %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %}
 
-; CHECK-DAG: OpEntryPoint Fragment %[[#entry:]] "main"
+; CHECK-DAG: OpEntryPoint Fragment %[[#entry:]] "main" {{.*}}
 ; CHECK-DAG: OpExecutionMode %[[#entry]] OriginUpperLeft
 
-define void @main() #1 {
+
+define void @main() #0 {
 entry:
+  %0 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %1 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1) %0, i32 0)
+  store i32 1, ptr addrspace(11) %1, align 4
+
   ret void
 }
 
-attributes #1 = { "hlsl.shader"="pixel" }
+declare target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32, i32, i32, i32, i1) #1
+
+attributes #0 = { "hlsl.shader"="pixel" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }

From daee5eee8562d26d234f85152e803b6571b15ee2 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough@gmail.com>
Date: Thu, 12 Jun 2025 11:14:21 -0500
Subject: [PATCH 0165/1322] [Offload][PGO] Fix new GPU PGO tests (#143645)

`pgo_atomic_teams.c` and `pgo_atomic_threads.c` currently are set to run
on NVPTX despite the changes for that target not being upstreamed yet.
This patch also replaces instances of `llvm-profdata` with `%profdata`
in those tests.
---
 offload/test/offloading/gpupgo/pgo_atomic_teams.c   | 6 +++---
 offload/test/offloading/gpupgo/pgo_atomic_threads.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/offload/test/offloading/gpupgo/pgo_atomic_teams.c b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
index 7bf3b1c11f28..b3b72db08039 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_teams.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
@@ -3,7 +3,7 @@
 // RUN:     -Xarch_device -fprofile-update=atomic
 // RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \
 // RUN:     %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.%basename_t.llvm.profraw | \
 // RUN:     %fcheck-generic --check-prefix="LLVM-PGO"
 
@@ -12,11 +12,11 @@
 // RUN:     -Xarch_device -fprofile-update=atomic
 // RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \
 // RUN:     %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.%basename_t.clang.profraw | \
 // RUN:     %fcheck-generic --check-prefix="CLANG-PGO"
 
-// REQUIRES: gpu
+// REQUIRES: amdgpu
 // REQUIRES: pgo
 
 int test1(int a) { return a / 2; }
diff --git a/offload/test/offloading/gpupgo/pgo_atomic_threads.c b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
index f0e7111f7a64..440a6b533317 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_threads.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
@@ -3,7 +3,7 @@
 // RUN:     -Xarch_device -fprofile-update=atomic
 // RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \
 // RUN:     %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.%basename_t.llvm.profraw | \
 // RUN:     %fcheck-generic --check-prefix="LLVM-PGO"
 
@@ -12,11 +12,11 @@
 // RUN:     -Xarch_device -fprofile-update=atomic
 // RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \
 // RUN:     %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.%basename_t.clang.profraw | \
 // RUN:     %fcheck-generic --check-prefix="CLANG-PGO"
 
-// REQUIRES: gpu
+// REQUIRES: amdgpu
 // REQUIRES: pgo
 
 int test1(int a) { return a / 2; }

From c6da2c877cb407c0404e58c5ca257d12036ed164 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Thu, 12 Jun 2025 17:14:31 +0100
Subject: [PATCH 0166/1322] [lldb][Commands] Fix memory find for Swift
 expressions (#143860)

(depends on https://github.com/llvm/llvm-project/pull/143686)

There were two issues previously preventing `memory find -e` expressions
to succeed when stopped in Swift frames:
1. We weren't getting the dynamic type of the result `ValueObject`.
   For Swift this would fail when we tried to produce a scalar value
   out of it because the static VO wasn't sufficient to get to the
integer value. Hence we add a call to
`GetQualifiedRepresentationIfAvailable`
(which is what we do for expressions in `OptionArgParser::ToAddress`
too).
2. We weren't passing an `ExecutionContextScope` to `GetByteSize`, which
   Swift relied on to get the size of the result type.

My plan is to add an API test for this on the Apple
`swiftlang/llvm-project` fork.

I considered an alternative where we use `OptionArgParser::ToAddress`
for `memory find -e` expressions, but it got a bit icky when trying to
figure out how many bytes we should copy out of the result into the
`DataBufferHeap` (currently we rely on the size of the result variable
type). This gets even trickier when we were to pass an expression that
was actually a hex digit or a number into `ToAddress`.

rdar://152113525
---
 lldb/source/Commands/CommandObjectMemory.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp
index 85ae9f8f9e8c..ccb06d8ff4d5 100644
--- a/lldb/source/Commands/CommandObjectMemory.cpp
+++ b/lldb/source/Commands/CommandObjectMemory.cpp
@@ -886,9 +886,10 @@ protected:
 #include "CommandOptions.inc"
 
 static llvm::Error CopyExpressionResult(ValueObject &result,
-                                        DataBufferHeap &buffer) {
+                                        DataBufferHeap &buffer,
+                                        ExecutionContextScope *scope) {
   uint64_t value = result.GetValueAsUnsigned(0);
-  auto size_or_err = result.GetCompilerType().GetByteSize(nullptr);
+  auto size_or_err = result.GetCompilerType().GetByteSize(scope);
   if (!size_or_err)
     return size_or_err.takeError();
 
@@ -928,6 +929,11 @@ EvaluateExpression(llvm::StringRef expression, StackFrame &frame,
     return llvm::createStringError(
         "expression evaluation failed. pass a string instead");
 
+  result_sp = result_sp->GetQualifiedRepresentationIfAvailable(
+      result_sp->GetDynamicValueType(), /*synthValue=*/true);
+  if (!result_sp)
+    return llvm::createStringError("failed to get dynamic result type");
+
   return result_sp;
 }
 
@@ -1082,7 +1088,8 @@ protected:
 
       ValueObjectSP result_sp = *result_or_err;
 
-      if (auto err = CopyExpressionResult(*result_sp, buffer)) {
+      if (auto err = CopyExpressionResult(*result_sp, buffer,
+                                          m_exe_ctx.GetFramePtr())) {
         result.AppendError(llvm::toString(std::move(err)));
         return;
       }

From 4039fdb7ba5a0d9ead5bdc0404f036063a4ca95d Mon Sep 17 00:00:00 2001
From: "W. Turner Abney" <weebney@gmail.com>
Date: Thu, 12 Jun 2025 12:20:32 -0400
Subject: [PATCH 0167/1322] [libc] add ioctl (#141393)

Closes #85275
Closes #90317
Updates #97191

---------

Co-authored-by: Joseph Huber <huberjn@outlook.com>
Co-authored-by: Michael Jones <michaelrj@google.com>
---
 libc/config/linux/aarch64/entrypoints.txt     |  3 +
 libc/config/linux/arm/entrypoints.txt         |  3 +
 libc/config/linux/riscv/entrypoints.txt       |  3 +
 libc/config/linux/x86_64/entrypoints.txt      |  3 +
 libc/hdr/CMakeLists.txt                       |  9 +++
 libc/hdr/sys_ioctl_macros.h                   | 22 ++++++
 .../llvm-libc-macros/linux/sys-ioctl-macros.h |  1 +
 libc/src/sys/CMakeLists.txt                   |  1 +
 libc/src/sys/ioctl/CMakeLists.txt             | 10 +++
 libc/src/sys/ioctl/ioctl.h                    | 20 +++++
 libc/src/sys/ioctl/linux/CMakeLists.txt       | 12 +++
 libc/src/sys/ioctl/linux/ioctl.cpp            | 36 +++++++++
 libc/test/src/sys/CMakeLists.txt              |  1 +
 libc/test/src/sys/ioctl/CMakeLists.txt        |  3 +
 libc/test/src/sys/ioctl/linux/CMakeLists.txt  | 17 +++++
 libc/test/src/sys/ioctl/linux/ioctl_test.cpp  | 75 +++++++++++++++++++
 16 files changed, 219 insertions(+)
 create mode 100644 libc/hdr/sys_ioctl_macros.h
 create mode 100644 libc/src/sys/ioctl/CMakeLists.txt
 create mode 100644 libc/src/sys/ioctl/ioctl.h
 create mode 100644 libc/src/sys/ioctl/linux/CMakeLists.txt
 create mode 100644 libc/src/sys/ioctl/linux/ioctl.cpp
 create mode 100644 libc/test/src/sys/ioctl/CMakeLists.txt
 create mode 100644 libc/test/src/sys/ioctl/linux/CMakeLists.txt
 create mode 100644 libc/test/src/sys/ioctl/linux/ioctl_test.cpp

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 520046f768b5..fcf1278eae72 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -245,6 +245,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     # https://github.com/llvm/llvm-project/issues/80060
     # libc.src.sys.epoll.epoll_pwait2
 
+    # sys/ioctl.h entrypoints
+    libc.src.sys.ioctl.ioctl
+
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
     libc.src.sys.mman.mincore
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 7432a7e912e8..1161ae260be2 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -172,6 +172,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.free
     libc.src.stdlib.malloc
 
+    # sys/ioctl.h entrypoints
+    libc.src.sys.ioctl.ioctl
+
     # sys/mman.h entrypoints
     libc.src.sys.mman.mmap
     libc.src.sys.mman.munmap
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 0b645a2d2fb8..050fc2672a57 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -246,6 +246,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     # https://github.com/llvm/llvm-project/issues/80060
     # libc.src.sys.epoll.epoll_pwait2
 
+    # sys/ioctl.h entrypoints
+    libc.src.sys.ioctl.ioctl
+
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
     libc.src.sys.mman.mincore
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 959bdbf08dbe..6c9d83708b92 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -246,6 +246,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     # https://github.com/llvm/llvm-project/issues/80060
     # libc.src.sys.epoll.epoll_pwait2
 
+    # sys/ioctl.h entrypoints
+    libc.src.sys.ioctl.ioctl
+
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
     libc.src.sys.mman.mincore
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 209fcb965242..1e9f59621a8e 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -126,6 +126,15 @@ add_proxy_header_library(
     libc.include.llvm-libc-macros.sys_epoll_macros
 )
 
+add_proxy_header_library(
+  sys_ioctl_macros
+  HDRS
+    sys_ioctl_macros.h
+  FULL_BUILD_DEPENDS
+    libc.include.sys_ioctl
+    libc.include.llvm-libc-macros.sys_ioctl_macros
+)
+
 add_proxy_header_library(
   sys_stat_macros
   HDRS
diff --git a/libc/hdr/sys_ioctl_macros.h b/libc/hdr/sys_ioctl_macros.h
new file mode 100644
index 000000000000..935d43627346
--- /dev/null
+++ b/libc/hdr/sys_ioctl_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from sys/ioctl.h -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_SYS_IOCTL_MACROS_H
+#define LLVM_LIBC_HDR_SYS_IOCTL_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/sys-ioctl-macros.h"
+
+#else // Overlay mode
+
+#include <sys/ioctl.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_SYS_IOCTL_MACROS_H
diff --git a/libc/include/llvm-libc-macros/linux/sys-ioctl-macros.h b/libc/include/llvm-libc-macros/linux/sys-ioctl-macros.h
index 5eb779aeeca5..41226080084c 100644
--- a/libc/include/llvm-libc-macros/linux/sys-ioctl-macros.h
+++ b/libc/include/llvm-libc-macros/linux/sys-ioctl-macros.h
@@ -15,5 +15,6 @@
 // around the definitions of macros like _IO, _IOR, _IOW, and _IOWR that I don't
 // think is worth digging into right now.
 #define TIOCGETD 0x5424
+#define FIONREAD 0x541B
 
 #endif // LLVM_LIBC_MACROS_LINUX_SYS_IOCTL_MACROS_H
diff --git a/libc/src/sys/CMakeLists.txt b/libc/src/sys/CMakeLists.txt
index 9a73b80d35d2..0fa11e9eee69 100644
--- a/libc/src/sys/CMakeLists.txt
+++ b/libc/src/sys/CMakeLists.txt
@@ -13,3 +13,4 @@ add_subdirectory(utsname)
 add_subdirectory(wait)
 add_subdirectory(prctl)
 add_subdirectory(uio)
+add_subdirectory(ioctl)
diff --git a/libc/src/sys/ioctl/CMakeLists.txt b/libc/src/sys/ioctl/CMakeLists.txt
new file mode 100644
index 000000000000..099a1b96389f
--- /dev/null
+++ b/libc/src/sys/ioctl/CMakeLists.txt
@@ -0,0 +1,10 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+endif()
+
+add_entrypoint_object(
+  ioctl
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.ioctl
+)
diff --git a/libc/src/sys/ioctl/ioctl.h b/libc/src/sys/ioctl/ioctl.h
new file mode 100644
index 000000000000..62323ba7dd4d
--- /dev/null
+++ b/libc/src/sys/ioctl/ioctl.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for ioctl ---------------------------*-C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SYS_IOCTL_IOCTL_H
+#define LLVM_LIBC_SRC_SYS_IOCTL_IOCTL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int ioctl(int fd, unsigned long request, ...);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_SYS_IOCTL_IOCTL_H
diff --git a/libc/src/sys/ioctl/linux/CMakeLists.txt b/libc/src/sys/ioctl/linux/CMakeLists.txt
new file mode 100644
index 000000000000..876f35aaee66
--- /dev/null
+++ b/libc/src/sys/ioctl/linux/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_entrypoint_object(
+  ioctl
+  SRCS
+    ioctl.cpp
+  HDRS
+    ../ioctl.h
+  DEPENDS
+    libc.include.sys_ioctl
+    libc.include.sys_syscall
+    libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
+)
diff --git a/libc/src/sys/ioctl/linux/ioctl.cpp b/libc/src/sys/ioctl/linux/ioctl.cpp
new file mode 100644
index 000000000000..f03fea21c75b
--- /dev/null
+++ b/libc/src/sys/ioctl/linux/ioctl.cpp
@@ -0,0 +1,36 @@
+//===---------- Linux implementation of the ioctl function ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/sys/ioctl/ioctl.h"
+
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/common.h"
+#include "src/errno/libc_errno.h"
+#include <stdarg.h>
+#include <sys/syscall.h> // For syscall numbers.
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, ioctl, (int fd, unsigned long request, ...)) {
+  va_list vargs;
+  va_start(vargs, request);
+  void *data_pointer = va_arg(vargs, void *);
+  int ret =
+      LIBC_NAMESPACE::syscall_impl<int>(SYS_ioctl, fd, request, data_pointer);
+  va_end(vargs);
+
+  // Some ioctls can be expected to return positive values
+  if (ret >= 0)
+    return ret;
+
+  // If there is an error, errno is set and -1 is returned.
+  libc_errno = -ret;
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/sys/CMakeLists.txt b/libc/test/src/sys/CMakeLists.txt
index 224cc7905ad3..13bf91eef04b 100644
--- a/libc/test/src/sys/CMakeLists.txt
+++ b/libc/test/src/sys/CMakeLists.txt
@@ -13,3 +13,4 @@ add_subdirectory(auxv)
 add_subdirectory(epoll)
 add_subdirectory(uio)
 add_subdirectory(time)
+add_subdirectory(ioctl)
diff --git a/libc/test/src/sys/ioctl/CMakeLists.txt b/libc/test/src/sys/ioctl/CMakeLists.txt
new file mode 100644
index 000000000000..b4bbe81c92ff
--- /dev/null
+++ b/libc/test/src/sys/ioctl/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+endif()
diff --git a/libc/test/src/sys/ioctl/linux/CMakeLists.txt b/libc/test/src/sys/ioctl/linux/CMakeLists.txt
new file mode 100644
index 000000000000..e5095c54a729
--- /dev/null
+++ b/libc/test/src/sys/ioctl/linux/CMakeLists.txt
@@ -0,0 +1,17 @@
+add_custom_target(libc_sys_ioctl_unittests)
+
+add_libc_unittest(
+  ioctl_test
+  SUITE
+    libc_sys_ioctl_unittests
+  SRCS
+    ioctl_test.cpp
+  DEPENDS
+    libc.hdr.ioctl_macros
+    libc.src.sys.ioctl.ioctl
+    libc.src.errno.errno
+    libc.src.fcntl.open
+    libc.src.unistd.close
+    libc.src.unistd.read
+    libc.src.unistd.write
+)
diff --git a/libc/test/src/sys/ioctl/linux/ioctl_test.cpp b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
new file mode 100644
index 000000000000..9c56a4689b18
--- /dev/null
+++ b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
@@ -0,0 +1,75 @@
+//===-- Unittests for ioctl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/errno/libc_errno.h"
+#include "src/fcntl/open.h"
+#include "src/sys/ioctl/ioctl.h"
+#include "src/unistd/close.h"
+#include "src/unistd/read.h"
+#include "src/unistd/write.h"
+
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include "hdr/sys_stat_macros.h"
+
+#include "hdr/sys_ioctl_macros.h"
+
+using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+
+TEST(LlvmLibcSysIoctlTest, InvalidCommandAndFIONREAD) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  // Setup the test file
+  constexpr const char *TEST_FILE_NAME = "ioctl.test";
+  constexpr const char TEST_MSG[] = "ioctl test";
+  constexpr int TEST_MSG_SIZE = sizeof(TEST_MSG) - 1;
+  auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+  int new_test_file_fd = LIBC_NAMESPACE::open(
+      TEST_FILE, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+  ASSERT_THAT(
+      (int)LIBC_NAMESPACE::write(new_test_file_fd, TEST_MSG, TEST_MSG_SIZE),
+      Succeeds(TEST_MSG_SIZE));
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_THAT(LIBC_NAMESPACE::close(new_test_file_fd), Succeeds(0));
+  ASSERT_ERRNO_SUCCESS();
+
+  // Reopen the file for testing
+  int fd = LIBC_NAMESPACE::open(TEST_FILE, O_RDONLY);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(fd, 0);
+
+  // FIONREAD reports the number of available bytes to read for the passed fd
+  // This will report the full size of the file, as we haven't read anything yet
+  int n = -1;
+  int ret = LIBC_NAMESPACE::ioctl(fd, FIONREAD, &n);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(ret, -1);
+  ASSERT_EQ(n, TEST_MSG_SIZE);
+
+  // But if we read some bytes...
+  constexpr int READ_COUNT = 5;
+  char read_buffer[READ_COUNT];
+  ASSERT_THAT((int)LIBC_NAMESPACE::read(fd, read_buffer, READ_COUNT),
+              Succeeds(READ_COUNT));
+
+  // ... n should have decreased by the number of bytes we've read
+  int n_after_reading = -1;
+  ret = LIBC_NAMESPACE::ioctl(fd, FIONREAD, &n_after_reading);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(ret, -1);
+  ASSERT_EQ(n - READ_COUNT, n_after_reading);
+
+  // 0xDEADBEEF is just a random nonexistent command;
+  // calling this should always fail with ENOTTY
+  ret = LIBC_NAMESPACE::ioctl(fd, 0xDEADBEEF, NULL);
+  ASSERT_ERRNO_EQ(ENOTTY);
+  ASSERT_EQ(ret, -1);
+
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+}

From 77834a40cf350d2fe63fac26222c3918f5f348fd Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Thu, 12 Jun 2025 09:24:26 -0700
Subject: [PATCH 0168/1322] [CIR] Upstream support for emitting constructors
 (#143639)

This change upstreams the code to emit simple constructor defintions.
---
 clang/include/clang/CIR/MissingFeatures.h     |  4 +
 clang/lib/CIR/CodeGen/CIRGenCXX.cpp           | 40 +++++++++
 clang/lib/CIR/CodeGen/CIRGenCXXABI.h          | 11 +++
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          | 41 ++++++++++
 clang/lib/CIR/CodeGen/CIRGenClass.cpp         | 81 +++++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp      | 60 ++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenFunction.h        | 18 +++++
 clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp | 65 ++++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 15 ++--
 clang/lib/CIR/CodeGen/CIRGenModule.h          |  5 ++
 clang/lib/CIR/CodeGen/CMakeLists.txt          |  1 +
 clang/test/CIR/CodeGen/ctor.cpp               | 54 ++++++++++++-
 12 files changed, 379 insertions(+), 16 deletions(-)
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenCXX.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 87908e2ec08a..fbd15d5c886d 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -81,6 +81,7 @@ struct MissingFeatures {
   static bool opFuncCPUAndFeaturesAttributes() { return false; }
   static bool opFuncSection() { return false; }
   static bool opFuncSetComdat() { return false; }
+  static bool opFuncAttributesForDefinition() { return false; }
 
   // CallOp handling
   static bool opCallPseudoDtor() { return false; }
@@ -226,6 +227,9 @@ struct MissingFeatures {
   static bool implicitConstructorArgs() { return false; }
   static bool intrinsics() { return false; }
   static bool attributeNoBuiltin() { return false; }
+  static bool emitCtorPrologue() { return false; }
+  static bool thunks() { return false; }
+  static bool runCleanupsScope() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
new file mode 100644
index 000000000000..51751483d34e
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This contains code dealing with C++ code generation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenFunction.h"
+#include "CIRGenModule.h"
+
+#include "clang/AST/GlobalDecl.h"
+#include "clang/CIR/MissingFeatures.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+cir::FuncOp CIRGenModule::codegenCXXStructor(GlobalDecl gd) {
+  const CIRGenFunctionInfo &fnInfo =
+      getTypes().arrangeCXXStructorDeclaration(gd);
+  cir::FuncType funcType = getTypes().getFunctionType(fnInfo);
+  cir::FuncOp fn = getAddrOfCXXStructor(gd, &fnInfo, /*FnType=*/nullptr,
+                                        /*DontDefer=*/true, ForDefinition);
+  assert(!cir::MissingFeatures::opFuncLinkage());
+  CIRGenFunction cgf{*this, builder};
+  curCGF = &cgf;
+  {
+    mlir::OpBuilder::InsertionGuard guard(builder);
+    cgf.generateCode(gd, fn, funcType);
+  }
+  curCGF = nullptr;
+
+  setNonAliasAttributes(gd, fn);
+  assert(!cir::MissingFeatures::opFuncAttributesForDefinition());
+  return fn;
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
index 107535ebc727..2d967fd307e0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
@@ -37,6 +37,10 @@ public:
 
   void setCXXABIThisValue(CIRGenFunction &cgf, mlir::Value thisPtr);
 
+  /// Emit a single constructor/destructor with the gen type from a C++
+  /// constructor/destructor Decl.
+  virtual void emitCXXStructor(clang::GlobalDecl gd) = 0;
+
 public:
   clang::ImplicitParamDecl *getThisDecl(CIRGenFunction &cgf) {
     return cgf.cxxabiThisDecl;
@@ -55,12 +59,19 @@ public:
     return md->getParent();
   }
 
+  /// Return whether the given global decl needs a VTT (virtual table table)
+  /// parameter.
+  virtual bool needsVTTParameter(clang::GlobalDecl gd) { return false; }
+
   /// Build a parameter variable suitable for 'this'.
   void buildThisParam(CIRGenFunction &cgf, FunctionArgList &params);
 
   /// Loads the incoming C++ this pointer as it was passed by the caller.
   mlir::Value loadIncomingCXXThis(CIRGenFunction &cgf);
 
+  /// Emit constructor variants required by this ABI.
+  virtual void emitCXXConstructors(const clang::CXXConstructorDecl *d) = 0;
+
   /// Returns true if the given constructor or destructor is one of the kinds
   /// that the ABI says returns 'this' (only applies when called non-virtually
   /// for destructors).
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 9d25eea9e413..da754e0806b2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -162,6 +162,47 @@ arrangeCIRFunctionInfo(CIRGenTypes &cgt, SmallVectorImpl<CanQualType> &prefix,
   return cgt.arrangeCIRFunctionInfo(resultType, prefix, required);
 }
 
+void CIRGenFunction::emitDelegateCallArg(CallArgList &args,
+                                         const VarDecl *param,
+                                         SourceLocation loc) {
+  // StartFunction converted the ABI-lowered parameter(s) into a local alloca.
+  // We need to turn that into an r-value suitable for emitCall
+  Address local = getAddrOfLocalVar(param);
+
+  QualType type = param->getType();
+
+  if (const auto *rd = type->getAsCXXRecordDecl()) {
+    cgm.errorNYI(param->getSourceRange(),
+                 "emitDelegateCallArg: record argument");
+    return;
+  }
+
+  // GetAddrOfLocalVar returns a pointer-to-pointer for references, but the
+  // argument needs to be the original pointer.
+  if (type->isReferenceType()) {
+    args.add(
+        RValue::get(builder.createLoad(getLoc(param->getSourceRange()), local)),
+        type);
+  } else if (getLangOpts().ObjCAutoRefCount) {
+    cgm.errorNYI(param->getSourceRange(),
+                 "emitDelegateCallArg: ObjCAutoRefCount");
+    // For the most part, we just need to load the alloca, except that aggregate
+    // r-values are actually pointers to temporaries.
+  } else {
+    cgm.errorNYI(param->getSourceRange(),
+                 "emitDelegateCallArg: convertTempToRValue");
+  }
+
+  // Deactivate the cleanup for the callee-destructed param that was pushed.
+  assert(!cir::MissingFeatures::thunks());
+  if (type->isRecordType() &&
+      type->castAs<RecordType>()->getDecl()->isParamDestroyedInCallee() &&
+      param->needsDestruction(getContext())) {
+    cgm.errorNYI(param->getSourceRange(),
+                 "emitDelegateCallArg: callee-destructed param");
+  }
+}
+
 static const CIRGenFunctionInfo &
 arrangeFreeFunctionLikeCall(CIRGenTypes &cgt, CIRGenModule &cgm,
                             const CallArgList &args,
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index 8491a66ea6cb..bb4b451c9924 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -21,6 +21,87 @@
 using namespace clang;
 using namespace clang::CIRGen;
 
+/// Checks whether the given constructor is a valid subject for the
+/// complete-to-base constructor delegation optimization, i.e. emitting the
+/// complete constructor as a simple call to the base constructor.
+bool CIRGenFunction::isConstructorDelegationValid(
+    const CXXConstructorDecl *ctor) {
+  // Currently we disable the optimization for classes with virtual bases
+  // because (1) the address of parameter variables need to be consistent across
+  // all initializers but (2) the delegate function call necessarily creates a
+  // second copy of the parameter variable.
+  //
+  // The limiting example (purely theoretical AFAIK):
+  //   struct A { A(int &c) { c++; } };
+  //   struct A : virtual A {
+  //     B(int count) : A(count) { printf("%d\n", count); }
+  //   };
+  // ...although even this example could in principle be emitted as a delegation
+  // since the address of the parameter doesn't escape.
+  if (ctor->getParent()->getNumVBases())
+    return false;
+
+  // We also disable the optimization for variadic functions because it's
+  // impossible to "re-pass" varargs.
+  if (ctor->getType()->castAs<FunctionProtoType>()->isVariadic())
+    return false;
+
+  // FIXME: Decide if we can do a delegation of a delegating constructor.
+  if (ctor->isDelegatingConstructor())
+    return false;
+
+  return true;
+}
+
+Address CIRGenFunction::loadCXXThisAddress() {
+  assert(curFuncDecl && "loading 'this' without a func declaration?");
+  assert(isa<CXXMethodDecl>(curFuncDecl));
+
+  // Lazily compute CXXThisAlignment.
+  if (cxxThisAlignment.isZero()) {
+    // Just use the best known alignment for the parent.
+    // TODO: if we're currently emitting a complete-object ctor/dtor, we can
+    // always use the complete-object alignment.
+    auto rd = cast<CXXMethodDecl>(curFuncDecl)->getParent();
+    cxxThisAlignment = cgm.getClassPointerAlignment(rd);
+  }
+
+  return Address(loadCXXThis(), cxxThisAlignment);
+}
+
+void CIRGenFunction::emitDelegateCXXConstructorCall(
+    const CXXConstructorDecl *ctor, CXXCtorType ctorType,
+    const FunctionArgList &args, SourceLocation loc) {
+  CallArgList delegateArgs;
+
+  FunctionArgList::const_iterator i = args.begin(), e = args.end();
+  assert(i != e && "no parameters to constructor");
+
+  // this
+  Address thisAddr = loadCXXThisAddress();
+  delegateArgs.add(RValue::get(thisAddr.getPointer()), (*i)->getType());
+  ++i;
+
+  // FIXME: The location of the VTT parameter in the parameter list is specific
+  // to the Itanium ABI and shouldn't be hardcoded here.
+  if (cgm.getCXXABI().needsVTTParameter(curGD)) {
+    cgm.errorNYI(loc, "emitDelegateCXXConstructorCall: VTT parameter");
+    return;
+  }
+
+  // Explicit arguments.
+  for (; i != e; ++i) {
+    const VarDecl *param = *i;
+    // FIXME: per-argument source location
+    emitDelegateCallArg(delegateArgs, param, loc);
+  }
+
+  assert(!cir::MissingFeatures::sanitizers());
+
+  emitCXXConstructorCall(ctor, ctorType, /*ForVirtualBase=*/false,
+                         /*Delegating=*/true, thisAddr, delegateArgs, loc);
+}
+
 Address CIRGenFunction::getAddressOfBaseClass(
     Address value, const CXXRecordDecl *derived,
     llvm::iterator_range<CastExpr::path_const_iterator> path,
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index e32a5c836be0..53c44c6cc768 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -465,7 +465,7 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
     if (isa<CXXDestructorDecl>(funcDecl))
       getCIRGenModule().errorNYI(bodyRange, "C++ destructor definition");
     else if (isa<CXXConstructorDecl>(funcDecl))
-      getCIRGenModule().errorNYI(bodyRange, "C++ constructor definition");
+      emitConstructorBody(args);
     else if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
              funcDecl->hasAttr<CUDAGlobalAttr>())
       getCIRGenModule().errorNYI(bodyRange, "CUDA kernel");
@@ -496,6 +496,54 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
   return fn;
 }
 
+void CIRGenFunction::emitConstructorBody(FunctionArgList &args) {
+  assert(!cir::MissingFeatures::sanitizers());
+  const auto *ctor = cast<CXXConstructorDecl>(curGD.getDecl());
+  CXXCtorType ctorType = curGD.getCtorType();
+
+  assert((cgm.getTarget().getCXXABI().hasConstructorVariants() ||
+          ctorType == Ctor_Complete) &&
+         "can only generate complete ctor for this ABI");
+
+  if (ctorType == Ctor_Complete && isConstructorDelegationValid(ctor) &&
+      cgm.getTarget().getCXXABI().hasConstructorVariants()) {
+    emitDelegateCXXConstructorCall(ctor, Ctor_Base, args, ctor->getEndLoc());
+    return;
+  }
+
+  const FunctionDecl *definition = nullptr;
+  Stmt *body = ctor->getBody(definition);
+  assert(definition == ctor && "emitting wrong constructor body");
+
+  if (isa_and_nonnull<CXXTryStmt>(body)) {
+    cgm.errorNYI(ctor->getSourceRange(), "emitConstructorBody: try body");
+    return;
+  }
+
+  assert(!cir::MissingFeatures::incrementProfileCounter());
+  assert(!cir::MissingFeatures::runCleanupsScope());
+
+  // TODO: in restricted cases, we can emit the vbase initializers of a
+  // complete ctor and then delegate to the base ctor.
+
+  assert(!cir::MissingFeatures::emitCtorPrologue());
+  if (ctor->isDelegatingConstructor()) {
+    // This will be handled in emitCtorPrologue, but we should emit a diagnostic
+    // rather than silently fail to delegate.
+    cgm.errorNYI(ctor->getSourceRange(),
+                 "emitConstructorBody: delegating ctor");
+    return;
+  }
+
+  // TODO(cir): propagate this result via mlir::logical result. Just unreachable
+  // now just to have it handled.
+  if (mlir::failed(emitStmt(body, true))) {
+    cgm.errorNYI(ctor->getSourceRange(),
+                 "emitConstructorBody: emit body statement failed.");
+    return;
+  }
+}
+
 /// Given a value of type T* that may not be to a complete object, construct
 /// an l-vlaue withi the natural pointee alignment of T.
 LValue CIRGenFunction::makeNaturalAlignPointeeAddrLValue(mlir::Value val,
@@ -522,16 +570,16 @@ clang::QualType CIRGenFunction::buildFunctionArgList(clang::GlobalDecl gd,
     cgm.getCXXABI().buildThisParam(*this, args);
   }
 
-  if (isa<CXXConstructorDecl>(fd))
-    cgm.errorNYI(fd->getSourceRange(),
-                 "buildFunctionArgList: CXXConstructorDecl");
+  if (const auto *cd = dyn_cast<CXXConstructorDecl>(fd))
+    if (cd->getInheritedConstructor())
+      cgm.errorNYI(fd->getSourceRange(),
+                   "buildFunctionArgList: inherited constructor");
 
   for (auto *param : fd->parameters())
     args.push_back(param);
 
   if (md && (isa<CXXConstructorDecl>(md) || isa<CXXDestructorDecl>(md)))
-    cgm.errorNYI(fd->getSourceRange(),
-                 "buildFunctionArgList: implicit structor params");
+    assert(!cir::MissingFeatures::cxxabiStructorImplicitParam());
 
   return retTy;
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 682d59d63faa..361dcd5ef1c3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -66,6 +66,7 @@ public:
   ImplicitParamDecl *cxxabiThisDecl = nullptr;
   mlir::Value cxxabiThisValue = nullptr;
   mlir::Value cxxThisValue = nullptr;
+  clang::CharUnits cxxThisAlignment;
 
   // Holds the Decl for the current outermost non-closure context
   const clang::Decl *curFuncDecl = nullptr;
@@ -473,6 +474,9 @@ public:
 
   bool shouldNullCheckClassCastValue(const CastExpr *ce);
 
+  static bool
+  isConstructorDelegationValid(const clang::CXXConstructorDecl *ctor);
+
   LValue makeNaturalAlignPointeeAddrLValue(mlir::Value v, clang::QualType t);
 
   /// Construct an address with the natural alignment of T. If a pointer to T
@@ -517,6 +521,7 @@ public:
     assert(cxxThisValue && "no 'this' value for this function");
     return cxxThisValue;
   }
+  Address loadCXXThisAddress();
 
   /// Get an appropriate 'undef' rvalue for the given type.
   /// TODO: What's the equivalent for MLIR? Currently we're only using this for
@@ -753,6 +758,8 @@ public:
 
   LValue emitCompoundAssignmentLValue(const clang::CompoundAssignOperator *e);
 
+  void emitConstructorBody(FunctionArgList &args);
+
   mlir::LogicalResult emitContinueStmt(const clang::ContinueStmt &s);
 
   void emitCXXConstructExpr(const clang::CXXConstructExpr *e,
@@ -841,6 +848,17 @@ public:
                                       mlir::Type condType,
                                       bool buildingTopLevelCase);
 
+  void emitDelegateCXXConstructorCall(const clang::CXXConstructorDecl *ctor,
+                                      clang::CXXCtorType ctorType,
+                                      const FunctionArgList &args,
+                                      clang::SourceLocation loc);
+
+  /// We are performing a delegate call; that is, the current function is
+  /// delegating to another one. Produce a r-value suitable for passing the
+  /// given parameter.
+  void emitDelegateCallArg(CallArgList &args, const clang::VarDecl *param,
+                           clang::SourceLocation loc);
+
   /// Emit an `if` on a boolean condition to the specified blocks.
   /// FIXME: Based on the condition, this might try to simplify the codegen of
   /// the conditional based on the branch.
diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
index fdd8b63fb6da..cd9096a0188a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
@@ -20,7 +20,9 @@
 #include "CIRGenCXXABI.h"
 #include "CIRGenFunction.h"
 
+#include "clang/AST/ExprCXX.h"
 #include "clang/AST/GlobalDecl.h"
+#include "clang/CIR/MissingFeatures.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace clang;
@@ -35,8 +37,13 @@ public:
     assert(!cir::MissingFeatures::cxxabiUseARMGuardVarABI());
   }
 
-  void emitInstanceFunctionProlog(SourceLocation Loc,
-                                  CIRGenFunction &CGF) override;
+  bool needsVTTParameter(clang::GlobalDecl gd) override;
+
+  void emitInstanceFunctionProlog(SourceLocation loc,
+                                  CIRGenFunction &cgf) override;
+
+  void emitCXXConstructors(const clang::CXXConstructorDecl *d) override;
+  void emitCXXStructor(clang::GlobalDecl gd) override;
 };
 
 } // namespace
@@ -72,6 +79,60 @@ void CIRGenItaniumCXXABI::emitInstanceFunctionProlog(SourceLocation loc,
   }
 }
 
+void CIRGenItaniumCXXABI::emitCXXStructor(GlobalDecl gd) {
+  auto *md = cast<CXXMethodDecl>(gd.getDecl());
+  auto *cd = dyn_cast<CXXConstructorDecl>(md);
+
+  if (!cd) {
+    cgm.errorNYI(md->getSourceRange(), "CXCABI emit destructor");
+    return;
+  }
+
+  if (cgm.getCodeGenOpts().CXXCtorDtorAliases)
+    cgm.errorNYI(md->getSourceRange(), "Ctor/Dtor aliases");
+
+  auto fn = cgm.codegenCXXStructor(gd);
+
+  cgm.maybeSetTrivialComdat(*md, fn);
+}
+
+void CIRGenItaniumCXXABI::emitCXXConstructors(const CXXConstructorDecl *d) {
+  // Just make sure we're in sync with TargetCXXABI.
+  assert(cgm.getTarget().getCXXABI().hasConstructorVariants());
+
+  // The constructor used for constructing this as a base class;
+  // ignores virtual bases.
+  cgm.emitGlobal(GlobalDecl(d, Ctor_Base));
+
+  // The constructor used for constructing this as a complete class;
+  // constructs the virtual bases, then calls the base constructor.
+  if (!d->getParent()->isAbstract()) {
+    // We don't need to emit the complete ctro if the class is abstract.
+    cgm.emitGlobal(GlobalDecl(d, Ctor_Complete));
+  }
+}
+
+/// Return whether the given global decl needs a VTT (virtual table table)
+/// parameter, which it does if it's a base constructor or destructor with
+/// virtual bases.
+bool CIRGenItaniumCXXABI::needsVTTParameter(GlobalDecl gd) {
+  auto *md = cast<CXXMethodDecl>(gd.getDecl());
+
+  // We don't have any virtual bases, just return early.
+  if (!md->getParent()->getNumVBases())
+    return false;
+
+  // Check if we have a base constructor.
+  if (isa<CXXConstructorDecl>(md) && gd.getCtorType() == Ctor_Base)
+    return true;
+
+  // Check if we have a base destructor.
+  if (isa<CXXDestructorDecl>(md) && gd.getDtorType() == Dtor_Base)
+    return true;
+
+  return false;
+}
+
 CIRGenCXXABI *clang::CIRGen::CreateCIRGenItaniumCXXABI(CIRGenModule &cgm) {
   switch (cgm.getASTContext().getCXXABIKind()) {
   case TargetCXXABI::GenericItanium:
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 8407f8fad06b..434dd376208e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -226,11 +226,9 @@ mlir::Operation *
 CIRGenModule::getAddrOfGlobal(GlobalDecl gd, ForDefinition_t isForDefinition) {
   const Decl *d = gd.getDecl();
 
-  if (isa<CXXConstructorDecl>(d) || isa<CXXDestructorDecl>(d)) {
-    errorNYI(d->getSourceRange(),
-             "getAddrOfGlobal: C++ constructor/destructor");
-    return nullptr;
-  }
+  if (isa<CXXConstructorDecl>(d) || isa<CXXDestructorDecl>(d))
+    return getAddrOfCXXStructor(gd, /*FnInfo=*/nullptr, /*FnType=*/nullptr,
+                                /*DontDefer=*/false, isForDefinition);
 
   if (isa<CXXMethodDecl>(d)) {
     const CIRGenFunctionInfo &fi =
@@ -411,6 +409,7 @@ void CIRGenModule::emitGlobalFunctionDefinition(clang::GlobalDecl gd,
     cgf.generateCode(gd, funcOp, funcType);
   }
   curCGF = nullptr;
+  assert(!cir::MissingFeatures::opFuncAttributesForDefinition());
 }
 
 mlir::Operation *CIRGenModule::getGlobalValue(StringRef name) {
@@ -771,7 +770,7 @@ void CIRGenModule::emitGlobalDefinition(clang::GlobalDecl gd,
       // Make sure to emit the definition(s) before we emit the thunks. This is
       // necessary for the generation of certain thunks.
       if (isa<CXXConstructorDecl>(method) || isa<CXXDestructorDecl>(method))
-        errorNYI(method->getSourceRange(), "C++ ctor/dtor");
+        abi->emitCXXStructor(gd);
       else if (fd->isMultiVersion())
         errorNYI(method->getSourceRange(), "multiversion functions");
       else
@@ -1173,6 +1172,10 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) {
   case Decl::Empty:
     break;
 
+  case Decl::CXXConstructor:
+    getCXXABI().emitCXXConstructors(cast<CXXConstructorDecl>(decl));
+    break;
+
   // C++ Decls
   case Decl::LinkageSpec:
   case Decl::Namespace:
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 9748c0b3ed43..f76fd8e73364 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -267,6 +267,11 @@ public:
   // Make sure that this type is translated.
   void updateCompletedType(const clang::TagDecl *td);
 
+  // Produce code for this constructor/destructor. This method doesn't try to
+  // apply any ABI rules about which other constructors/destructors are needed
+  // or if they are alias to each other.
+  cir::FuncOp codegenCXXStructor(clang::GlobalDecl gd);
+
   bool supportsCOMDAT() const;
   void maybeSetTrivialComdat(const clang::Decl &d, mlir::Operation *op);
 
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index beaa9afb31f9..217609687eab 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -11,6 +11,7 @@ add_clang_library(clangCIR
   CIRGenBuilder.cpp
   CIRGenCall.cpp
   CIRGenClass.cpp
+  CIRGenCXX.cpp
   CIRGenCXXABI.cpp
   CIRGenCXXExpr.cpp
   CIRGenBuiltin.cpp
diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp
index 3a1e82e338c1..3b4191fd74c9 100644
--- a/clang/test/CIR/CodeGen/ctor.cpp
+++ b/clang/test/CIR/CodeGen/ctor.cpp
@@ -3,7 +3,7 @@
 
 struct Struk {
   int a;
-  Struk();
+  Struk() {}
 };
 
 void baz() {
@@ -12,8 +12,58 @@ void baz() {
 
 // CHECK: !rec_Struk = !cir.record<struct "Struk" {!s32i}>
 
-// CHECK:   cir.func @_ZN5StrukC1Ev(!cir.ptr<!rec_Struk>)
+// Note: In the absence of the '-mconstructor-aliases' option, we emit two
+//       constructors here. The handling of constructor aliases is currently
+//       NYI, but when it is added this test should be updated to add a RUN
+//       line that passes '-mconstructor-aliases' to clang_cc1.
+// CHECK:   cir.func @_ZN5StrukC2Ev(%arg0: !cir.ptr<!rec_Struk>
+// CHECK-NEXT:     %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr<!rec_Struk>, !cir.ptr<!cir.ptr<!rec_Struk>>, ["this", init] {alignment = 8 : i64}
+// CHECK-NEXT:     cir.store %arg0, %[[THIS_ADDR]] : !cir.ptr<!rec_Struk>, !cir.ptr<!cir.ptr<!rec_Struk>>
+// CHECK-NEXT:     %[[THIS:.*]] = cir.load %[[THIS_ADDR]] : !cir.ptr<!cir.ptr<!rec_Struk>>, !cir.ptr<!rec_Struk>
+// CHECK-NEXT:     cir.return
+
+// CHECK:   cir.func @_ZN5StrukC1Ev(%arg0: !cir.ptr<!rec_Struk>
+// CHECK-NEXT:     %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr<!rec_Struk>, !cir.ptr<!cir.ptr<!rec_Struk>>, ["this", init] {alignment = 8 : i64}
+// CHECK-NEXT:     cir.store %arg0, %[[THIS_ADDR]] : !cir.ptr<!rec_Struk>, !cir.ptr<!cir.ptr<!rec_Struk>>
+// CHECK-NEXT:     %[[THIS:.*]] = cir.load %[[THIS_ADDR]] : !cir.ptr<!cir.ptr<!rec_Struk>>, !cir.ptr<!rec_Struk>
+// CHECK-NEXT:     cir.call @_ZN5StrukC2Ev(%[[THIS]]) : (!cir.ptr<!rec_Struk>) -> ()
+// CHECK-NEXT:     cir.return
+
 // CHECK:   cir.func @_Z3bazv()
 // CHECK-NEXT:     %[[S_ADDR:.*]] = cir.alloca !rec_Struk, !cir.ptr<!rec_Struk>, ["s", init] {alignment = 4 : i64}
 // CHECK-NEXT:     cir.call @_ZN5StrukC1Ev(%[[S_ADDR]]) : (!cir.ptr<!rec_Struk>) -> ()
 // CHECK-NEXT:     cir.return
+
+struct VariadicStruk {
+  int a;
+  VariadicStruk(int n, ...) { a = n;}
+};
+
+void bar() {
+  VariadicStruk s(1, 2, 3);
+}
+
+// When a variadic constructor is present, we call the C2 constructor directly.
+
+// CHECK-NOT: cir.func @_ZN13VariadicStrukC2Eiz
+
+// CHECK:      cir.func @_ZN13VariadicStrukC1Eiz(%arg0: !cir.ptr<!rec_VariadicStruk>
+// CHECK-SAME:                                   %arg1: !s32i
+// CHECK-SAME:                                   ...) {
+// CHECK-NEXT:   %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init]
+// CHECK-NEXT:   %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init]
+// CHECK-NEXT:   cir.store %arg0, %[[THIS_ADDR]]
+// CHECK-NEXT:   cir.store %arg1, %[[N_ADDR]]
+// CHECK-NEXT:   %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[N:.*]] = cir.load{{.*}} %[[N_ADDR]]
+// CHECK-NEXT:   %[[A_ADDR:.*]] = cir.get_member %[[THIS]][0] {name = "a"}
+// CHECK-NEXT:   cir.store{{.*}} %[[N]], %[[A_ADDR]]
+// CHECK-NEXT:   cir.return
+
+// CHECK:  cir.func @_Z3barv
+// CHECK-NEXT:    %[[S_ADDR:.*]] = cir.alloca !rec_VariadicStruk, !cir.ptr<!rec_VariadicStruk>, ["s", init]
+// CHECK-NEXT:    %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+// CHECK-NEXT:    %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+// CHECK-NEXT:    %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+// CHECK-NEXT:    cir.call @_ZN13VariadicStrukC1Eiz(%[[S_ADDR]], %[[ONE]], %[[TWO]], %[[THREE]])
+// CHECK-NEXT:    cir.return

From 639c19ddb688595a69ad9f83a40aa32e2187134c Mon Sep 17 00:00:00 2001
From: "long.chen" <lipracer@gmail.com>
Date: Fri, 13 Jun 2025 00:26:26 +0800
Subject: [PATCH 0169/1322] [NFC][mlir] make the assert consistent with the
 declared behavior (#143874)

---
 mlir/include/mlir/ExecutionEngine/MemRefUtils.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
index 918647d9feac..f355dfb8648e 100644
--- a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
@@ -48,7 +48,8 @@ inline std::array<int64_t, N> makeStrides(ArrayRef<int64_t> shape) {
   std::array<int64_t, N> res;
   int64_t running = 1;
   for (int64_t idx = N - 1; idx >= 0; --idx) {
-    assert(shape[idx] && "size must be non-negative for all shape dimensions");
+    assert(shape[idx] >= 0 &&
+           "size must be non-negative for all shape dimensions");
     res[idx] = running;
     running *= shape[idx];
   }

From 56548e1d9b2ed4f5d2fe3913c27af770cf0e06e5 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Thu, 12 Jun 2025 09:19:58 -0700
Subject: [PATCH 0170/1322] [Matrix] Fix a crash in VisitSelectInst due to
 iteration length mismatch

---
 .../Scalar/LowerMatrixIntrinsics.cpp          |  9 ++-
 .../LowerMatrixIntrinsics/select.ll           | 61 +++++++++++++++++++
 2 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index b32160ff275b..1e37f40fa9d5 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -2326,14 +2326,13 @@ public:
     MatrixTy A = getMatrix(OpA, Shape, Builder);
     MatrixTy B = getMatrix(OpB, Shape, Builder);
 
-    Value *CondV[2];
+    SmallVector<Value*> CondV;
     if (isa<FixedVectorType>(Cond->getType())) {
       MatrixTy C = getMatrix(Cond, Shape, Builder);
-      CondV[0] = C.getVector(0);
-      CondV[1] = C.getVector(1);
+      llvm::copy(C.vectors(), std::back_inserter(CondV));
     } else {
-      CondV[0] = Cond;
-      CondV[1] = Cond;
+      CondV.resize(A.getNumVectors());
+      std::fill(CondV.begin(), CondV.end(), Cond);
     }
 
     for (auto [CV, AV, BV] : llvm::zip_equal(CondV, A.vectors(), B.vectors()))
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
index 70b0dfdb3e7e..bd97915759aa 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
@@ -144,3 +144,64 @@ define void @select_2x2_vcond_shape3(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
   store <4 x float> %op, ptr %out
   ret void
 }
+
+define void @select_2x2_vcond_shape4(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape4(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <4 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <4 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <4 x float>, ptr [[RHS:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[COL_LOAD1]], <4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD2]]
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = call <4 x i1> @llvm.matrix.column.major.load(ptr %cond, i64 4, i1 false, i32 4, i32 1)
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 4, i1 false, i32 4, i32 1)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_vcond_shape5(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape5(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <1 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 1
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <1 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <1 x float>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    [[VEC_GEP4:%.*]] = getelementptr float, ptr [[LHS]], i64 3
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <1 x float>, ptr [[VEC_GEP4]], align 4
+; CHECK-NEXT:    [[COL_LOAD6:%.*]] = load <1 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr i1, ptr [[COND]], i64 1
+; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <1 x i1>, ptr [[VEC_GEP7]], align 1
+; CHECK-NEXT:    [[VEC_GEP9:%.*]] = getelementptr i1, ptr [[COND]], i64 2
+; CHECK-NEXT:    [[COL_LOAD10:%.*]] = load <1 x i1>, ptr [[VEC_GEP9]], align 1
+; CHECK-NEXT:    [[VEC_GEP11:%.*]] = getelementptr i1, ptr [[COND]], i64 3
+; CHECK-NEXT:    [[COL_LOAD12:%.*]] = load <1 x i1>, ptr [[VEC_GEP11]], align 1
+; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load <1 x float>, ptr [[RHS:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP14:%.*]] = getelementptr float, ptr [[RHS]], i64 1
+; CHECK-NEXT:    [[COL_LOAD15:%.*]] = load <1 x float>, ptr [[VEC_GEP14]], align 4
+; CHECK-NEXT:    [[VEC_GEP16:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD17:%.*]] = load <1 x float>, ptr [[VEC_GEP16]], align 4
+; CHECK-NEXT:    [[VEC_GEP18:%.*]] = getelementptr float, ptr [[RHS]], i64 3
+; CHECK-NEXT:    [[COL_LOAD19:%.*]] = load <1 x float>, ptr [[VEC_GEP18]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select <1 x i1> [[COL_LOAD6]], <1 x float> [[COL_LOAD]], <1 x float> [[COL_LOAD13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <1 x i1> [[COL_LOAD8]], <1 x float> [[COL_LOAD1]], <1 x float> [[COL_LOAD15]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select <1 x i1> [[COL_LOAD10]], <1 x float> [[COL_LOAD3]], <1 x float> [[COL_LOAD17]]
+; CHECK-NEXT:    [[TMP4:%.*]] = select <1 x i1> [[COL_LOAD12]], <1 x float> [[COL_LOAD5]], <1 x float> [[COL_LOAD19]]
+; CHECK-NEXT:    store <1 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP20:%.*]] = getelementptr float, ptr [[OUT]], i64 1
+; CHECK-NEXT:    store <1 x float> [[TMP2]], ptr [[VEC_GEP20]], align 4
+; CHECK-NEXT:    [[VEC_GEP21:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <1 x float> [[TMP3]], ptr [[VEC_GEP21]], align 8
+; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <1 x float> [[TMP4]], ptr [[VEC_GEP22]], align 4
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = call <4 x i1> @llvm.matrix.column.major.load(ptr %cond, i64 1, i1 false, i32 1, i32 4)
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 1, i1 false, i32 1, i32 4)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}

From 31daed868d69ac1ac6f6a29340d0b5e0e6dc39ab Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Thu, 12 Jun 2025 22:01:11 +0530
Subject: [PATCH 0171/1322] [RISCV] Prefer QC_EXTU to ANDI for certain 12-bit
 mask immediates (#143838)

`QC_EXTU` can be compressed to `QC_C_EXTU` when the immediate is a `mask
>=63`. We currently only handle masks that don't fit in 12-bits in
`RISCVISelDAGToDAG`.

I have added ISEL patterns in `RISCVInstrInfoXqci.td` instead of
changing code in `RISCVISelDAGToDAG` since the other extract
instructions ( in `XTHeadbb` and `XAndesPerf`) don't have compressed
versions and it is a lot easier to maintain things this way.
---
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td |  8 ++++
 llvm/test/CodeGen/RISCV/xqcibm-extract.ll   | 42 +++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index dba035bab928..9f96a3ed8056 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1441,6 +1441,14 @@ let Predicates = [HasVendorXqcibm, IsRV32] in {
 def : Pat<(sext_inreg (i32 GPR:$rs1), i16), (QC_EXT GPR:$rs1, 16, 0)>;
 def : Pat<(sext_inreg (i32 GPR:$rs1), i8), (QC_EXT GPR:$rs1, 8, 0)>;
 def : Pat<(sext_inreg (i32 GPR:$rs1), i1), (QC_EXT GPR:$rs1, 1, 0)>;
+
+// Prefer qc.extu to andi for the following cases since the former can be compressed
+def : Pat<(i32 (and GPRNoX0:$rs, 63)), (QC_EXTU GPRNoX0:$rs, 6, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 127)), (QC_EXTU GPRNoX0:$rs, 7, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 255)), (QC_EXTU GPRNoX0:$rs, 8, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 511)), (QC_EXTU GPRNoX0:$rs, 9, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 1023)), (QC_EXTU GPRNoX0:$rs, 10, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 2047)), (QC_EXTU GPRNoX0:$rs, 11, 0)>;
 } // Predicates = [HasVendorXqcibm, IsRV32]
 
 let Predicates = [HasVendorXqciint, IsRV32] in
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
index cb01510058da..edf6e9a2d501 100644
--- a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
+++ b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
@@ -247,6 +247,48 @@ define i32 @extu_from_and_i32(i32 %x) {
   ret i32 %a
 }
 
+define i32 @no_extu_from_and_i32(i32 %x) {
+; RV32I-LABEL: no_extu_from_and_i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a0, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: no_extu_from_and_i32:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    andi a0, a0, 31
+; RV32XQCIBM-NEXT:    ret
+  %a = and i32 %x, 31
+  ret i32 %a
+}
+
+define i32 @extu_from_and_i32_simm12_lb(i32 %x) {
+; RV32I-LABEL: extu_from_and_i32_simm12_lb:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a0, a0, 63
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: extu_from_and_i32_simm12_lb:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.extu a0, a0, 6, 0
+; RV32XQCIBM-NEXT:    ret
+  %a = and i32 %x, 63
+  ret i32 %a
+}
+
+define i32 @extu_from_and_i32_simm12_ub(i32 %x) {
+; RV32I-LABEL: extu_from_and_i32_simm12_ub:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a0, a0, 2047
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: extu_from_and_i32_simm12_ub:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.extu a0, a0, 11, 0
+; RV32XQCIBM-NEXT:    ret
+  %a = and i32 %x, 2047
+  ret i32 %a
+}
+
 define i64 @extu_from_and_i64(i64 %x) {
 ; RV32I-LABEL: extu_from_and_i64:
 ; RV32I:       # %bb.0:

From cd8facebabab9b61c6af1313cd1fd1e586bc2ba6 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Thu, 12 Jun 2025 18:36:03 +0200
Subject: [PATCH 0172/1322] [CIR] Implement folder for VecCreateOp (#143355)

This change adds a folder for the VecCreateOp

Issue https://github.com/llvm/llvm-project/issues/136487
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |   1 +
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |  10 ++
 .../Dialect/Transforms/CIRCanonicalize.cpp    |   3 +-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |   4 +
 clang/test/CIR/CodeGen/vector-ext.cpp         | 136 ++++++------------
 clang/test/CIR/CodeGen/vector.cpp             | 127 ++++++----------
 .../CIR/Transforms/vector-create-fold.cir     |  19 +++
 7 files changed, 128 insertions(+), 172 deletions(-)
 create mode 100644 clang/test/CIR/Transforms/vector-create-fold.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 634f0dd554c7..194153caa927 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2059,6 +2059,7 @@ def VecCreateOp : CIR_Op<"vec.create", [Pure]> {
   }];
 
   let hasVerifier = 1;
+  let hasFolder = 1;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index a6cf0a6b5d75..8ed0ee92574d 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1533,6 +1533,16 @@ LogicalResult cir::GetMemberOp::verify() {
 // VecCreateOp
 //===----------------------------------------------------------------------===//
 
+OpFoldResult cir::VecCreateOp::fold(FoldAdaptor adaptor) {
+  if (llvm::any_of(getElements(), [](mlir::Value value) {
+        return !mlir::isa<cir::ConstantOp>(value.getDefiningOp());
+      }))
+    return {};
+
+  return cir::ConstVectorAttr::get(
+      getType(), mlir::ArrayAttr::get(getContext(), adaptor.getElements()));
+}
+
 LogicalResult cir::VecCreateOp::verify() {
   // Verify that the number of arguments matches the number of elements in the
   // vector, and that the type of all the arguments matches the type of the
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
index 29f994263896..6f8a64ce0251 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
@@ -142,7 +142,8 @@ void CIRCanonicalizePass::runOnOperation() {
     // Many operations are here to perform a manual `fold` in
     // applyOpPatternsGreedily.
     if (isa<BrOp, BrCondOp, CastOp, ScopeOp, SwitchOp, SelectOp, UnaryOp,
-            VecExtractOp, VecShuffleOp, VecShuffleDynamicOp, VecTernaryOp>(op))
+            VecCreateOp, VecExtractOp, VecShuffleOp, VecShuffleDynamicOp,
+            VecTernaryOp>(op))
       ops.push_back(op);
   });
 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 1642d10d427b..619e113202c9 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -901,6 +901,10 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
       rewriter.eraseOp(op);
       return mlir::success();
     }
+  } else if (const auto vecTy = mlir::dyn_cast<cir::VectorType>(op.getType())) {
+    rewriter.replaceOp(op, lowerCirAttrAsValue(op, op.getValue(), rewriter,
+                                               getTypeConverter()));
+    return mlir::success();
   } else {
     return op.emitError() << "unsupported constant type " << op.getType();
   }
diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp
index 965c44c9461a..fe4919ec0478 100644
--- a/clang/test/CIR/CodeGen/vector-ext.cpp
+++ b/clang/test/CIR/CodeGen/vector-ext.cpp
@@ -77,12 +77,8 @@ void foo() {
 // CIR: %[[VEC_F:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["f", init]
 // CIR: %[[VEC_G:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["g", init]
 // CIR: %[[VEC_H:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["h", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_E_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_E_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_E_VAL]], %[[VEC_E]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[GLOBAL_X:.*]] = cir.get_global @x : !cir.ptr<!s32i>
 // CIR: %[[X_VAL:.*]] = cir.load{{.*}} %[[GLOBAL_X]] : !cir.ptr<!s32i>, !s32i
@@ -95,13 +91,11 @@ void foo() {
 // CIR: %[[VEC_F_VAL:.*]] = cir.vec.create(%[[X_VAL]], %[[CONST_5]], %[[CONST_6]], %[[X_PLUS_1]] :
 // CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_F_VAL]], %[[VEC_F]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_5:.*]] = cir.const #cir.int<5> : !s32i
-// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
-// CIR: %[[VEC_G_VAL:.*]] = cir.vec.create(%[[CONST_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_G_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<0> : !s32i,
+// CIR-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_G_VAL]], %[[VEC_G]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CIR: %[[VEC_H_VAL:.*]] = cir.vec.create(%[[ZERO]], %[[ZERO]], %[[ZERO]], %[[ZERO]] : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_H_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+// CIR-SAME; #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_H_VAL]], %[[VEC_H]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 
 // LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
@@ -148,12 +142,8 @@ void foo3() {
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[IDX:.*]] = cir.const #cir.int<1> : !s32i
@@ -184,12 +174,8 @@ void foo4() {
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
 // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
 // CIR: cir.store{{.*}} %[[CONST_IDX]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
@@ -225,12 +211,8 @@ void foo5() {
 }
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_VAL:.*]] = cir.const #cir.int<5> : !s32i
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
@@ -260,12 +242,8 @@ void foo6() {
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
 // CIR: %[[VAL:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["value", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
 // CIR: cir.store{{.*}} %[[CONST_IDX]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
@@ -307,12 +285,8 @@ void foo7() {
 }
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_VAL:.*]] = cir.const #cir.int<5> : !s32i
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
@@ -353,12 +327,8 @@ void foo8() {
 // CIR: %[[PLUS_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["plus_res", init]
 // CIR: %[[MINUS_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["minus_res", init]
 // CIR: %[[NOT_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["not_res", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP1:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[PLUS:.*]] = cir.unary(plus, %[[TMP1]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
@@ -410,19 +380,11 @@ void foo9() {
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shr", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_5:.*]] = cir.const #cir.int<5> : !s32i
-// CIR: %[[CONST_6:.*]] = cir.const #cir.int<6> : !s32i
-// CIR: %[[CONST_7:.*]] = cir.const #cir.int<7> : !s32i
-// CIR: %[[CONST_8:.*]] = cir.const #cir.int<8> : !s32i
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_5]], %[[CONST_6]], %[[CONST_7]], %[[CONST_8]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -475,9 +437,11 @@ void foo10() {
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} :  !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !u32i, #cir.int<6> : !u32i,
+// CIR-SAME: #cir.int<7> : !u32i, #cir.int<8> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
@@ -534,11 +498,11 @@ void foo11() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -661,11 +625,11 @@ void foo12() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -776,11 +740,11 @@ void foo13() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !u32i, !u32i, !u32i, !u32i) :
-// CIR-SAME: !cir.vector<4 x !u32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !u32i, #cir.int<2> : !u32i,
+// CIR-SAME: #cir.int<3> : !u32i, #cir.int<4> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !u32i, !u32i, !u32i, !u32i) :
-// CIR-SAME: !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !u32i, #cir.int<6> : !u32i,
+// CIR-SAME: #cir.int<7> : !u32i, #cir.int<8> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
@@ -891,11 +855,11 @@ void foo14() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !cir.float, !cir.float, !cir.float, !cir.float) :
-// CIR-SAME: !cir.vector<4 x !cir.float>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> : !cir.float,
+// CIR-SAME: #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !cir.float, !cir.float, !cir.float, !cir.float) :
-// CIR-SAME: !cir.vector<4 x !cir.float>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> : !cir.float,
+// CIR-SAME: #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
@@ -1105,24 +1069,16 @@ void foo18() {
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i,
+// CIR-SAME: #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i
 // CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i>
 // CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !u32i, #cir.int<2> : !u32i,
+// CIR-SAME: #cir.int<3> : !u32i, #cir.int<4> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
 // CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i
diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp
index 23e91724dc0f..d0c5b83cd5b0 100644
--- a/clang/test/CIR/CodeGen/vector.cpp
+++ b/clang/test/CIR/CodeGen/vector.cpp
@@ -66,12 +66,8 @@ void foo() {
 // CIR: %[[VEC_E:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["e", init]
 // CIR: %[[VEC_F:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["f", init]
 // CIR: %[[VEC_G:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["g", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_D_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_D_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_D_VAL]], %[[VEC_D]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[GLOBAL_X:.*]] = cir.get_global @x : !cir.ptr<!s32i>
 // CIR: %[[X_VAL:.*]] = cir.load{{.*}} %[[GLOBAL_X]] : !cir.ptr<!s32i>, !s32i
@@ -84,14 +80,11 @@ void foo() {
 // CIR: %[[VEC_E_VAL:.*]] = cir.vec.create(%[[X_VAL]], %[[CONST_5]], %[[CONST_6]], %[[X_PLUS_1]] :
 // CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_E_VAL]], %[[VEC_E]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_5:.*]] = cir.const #cir.int<5> : !s32i
-// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
-// CIR: %[[VEC_F_VAL:.*]] = cir.vec.create(%[[CONST_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_F_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<0> : !s32i,
+// CIR-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_F_VAL]], %[[VEC_F]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
-// CIR: %[[VEC_G_VAL:.*]] = cir.vec.create(%[[CONST_0]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_G_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+// CIR-SAME; #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_G_VAL]], %[[VEC_G]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 
 // LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
@@ -136,12 +129,8 @@ void foo3() {
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[IDX:.*]] = cir.const #cir.int<1> : !s32i
@@ -172,12 +161,8 @@ void foo4() {
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
 // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
 // CIR: cir.store{{.*}} %[[CONST_IDX]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
@@ -213,12 +198,8 @@ void foo5() {
 }
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_VAL:.*]] = cir.const #cir.int<5> : !s32i
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
@@ -248,12 +229,8 @@ void foo6() {
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
 // CIR: %[[VAL:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["value", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
 // CIR: cir.store{{.*}} %[[CONST_IDX]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
@@ -295,12 +272,8 @@ void foo7() {
 }
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_VAL:.*]] = cir.const #cir.int<5> : !s32i
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
@@ -341,12 +314,8 @@ void foo8() {
 // CIR: %[[PLUS_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["plus_res", init]
 // CIR: %[[MINUS_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["minus_res", init]
 // CIR: %[[NOT_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["not_res", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP1:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[PLUS:.*]] = cir.unary(plus, %[[TMP1]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
@@ -398,9 +367,11 @@ void foo9() {
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shr", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} :  !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -453,9 +424,11 @@ void foo10() {
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} :  !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !u32i, #cir.int<6> : !u32i,
+// CIR-SAME: #cir.int<7> : !u32i, #cir.int<8> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
@@ -512,11 +485,11 @@ void foo11() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -639,11 +612,11 @@ void foo12() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -754,11 +727,11 @@ void foo13() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !u32i, !u32i, !u32i, !u32i) :
-// CIR-SAME: !cir.vector<4 x !u32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !u32i, #cir.int<2> : !u32i,
+// CIR-SAME: #cir.int<3> : !u32i, #cir.int<4> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !u32i, !u32i, !u32i, !u32i) :
-// CIR-SAME: !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !u32i, #cir.int<6> : !u32i,
+// CIR-SAME: #cir.int<7> : !u32i, #cir.int<8> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
@@ -869,11 +842,11 @@ void foo14() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !cir.float, !cir.float, !cir.float, !cir.float) :
-// CIR-SAME: !cir.vector<4 x !cir.float>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> : !cir.float,
+// CIR-SAME: #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !cir.float, !cir.float, !cir.float, !cir.float) :
-// CIR-SAME: !cir.vector<4 x !cir.float>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> : !cir.float,
+// CIR-SAME: #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
@@ -1083,24 +1056,16 @@ void foo18() {
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i,
+// CIR-SAME: #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i
 // CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i>
 // CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !u32i, #cir.int<2> : !u32i,
+// CIR-SAME: #cir.int<3> : !u32i, #cir.int<4> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
 // CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i
diff --git a/clang/test/CIR/Transforms/vector-create-fold.cir b/clang/test/CIR/Transforms/vector-create-fold.cir
new file mode 100644
index 000000000000..fb8f66dc4deb
--- /dev/null
+++ b/clang/test/CIR/Transforms/vector-create-fold.cir
@@ -0,0 +1,19 @@
+// RUN: cir-opt %s -cir-canonicalize -o - | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_create_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %2 = cir.const #cir.int<1> : !s32i
+    %3 = cir.const #cir.int<2> : !s32i
+    %4 = cir.const #cir.int<3> : !s32i
+    %5 = cir.const #cir.int<4> : !s32i
+    %vec = cir.vec.create(%2, %3, %4, %5 : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+    cir.return %vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_create_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[VEC:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+  // CHECK-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[VEC]] : !cir.vector<4 x !s32i>
+}

From ae7ea6e3a28c017485cc2401703d6fab1549123d Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Thu, 12 Jun 2025 09:38:43 -0700
Subject: [PATCH 0173/1322] [libc] Fix ioctl errno inclusion (#143928)

Since errno was moved in
https://github.com/llvm/llvm-project/pull/143187 the code including it
in https://github.com/llvm/llvm-project/pull/141393 was rendered
incorrect. This patch fixes the include and the cmake depends.
---
 libc/src/sys/ioctl/linux/ioctl.cpp           | 2 +-
 libc/test/src/sys/ioctl/linux/CMakeLists.txt | 3 ++-
 libc/test/src/sys/ioctl/linux/ioctl_test.cpp | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/libc/src/sys/ioctl/linux/ioctl.cpp b/libc/src/sys/ioctl/linux/ioctl.cpp
index f03fea21c75b..9bb669c6a6f6 100644
--- a/libc/src/sys/ioctl/linux/ioctl.cpp
+++ b/libc/src/sys/ioctl/linux/ioctl.cpp
@@ -10,7 +10,7 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/test/src/sys/ioctl/linux/CMakeLists.txt b/libc/test/src/sys/ioctl/linux/CMakeLists.txt
index e5095c54a729..2df67e9d9cbd 100644
--- a/libc/test/src/sys/ioctl/linux/CMakeLists.txt
+++ b/libc/test/src/sys/ioctl/linux/CMakeLists.txt
@@ -7,7 +7,7 @@ add_libc_unittest(
   SRCS
     ioctl_test.cpp
   DEPENDS
-    libc.hdr.ioctl_macros
+    libc.hdr.sys_ioctl_macros
     libc.src.sys.ioctl.ioctl
     libc.src.errno.errno
     libc.src.fcntl.open
@@ -15,3 +15,4 @@ add_libc_unittest(
     libc.src.unistd.read
     libc.src.unistd.write
 )
+
diff --git a/libc/test/src/sys/ioctl/linux/ioctl_test.cpp b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
index 9c56a4689b18..b76dc14824c9 100644
--- a/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
+++ b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/sys/ioctl/ioctl.h"
 #include "src/unistd/close.h"

From e65131a56335fc6b8e47c609f17df50ea65577b4 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 12 Jun 2025 09:49:06 -0700
Subject: [PATCH 0174/1322] MC,test: Specify explicit triple for include.ll

The output is subject to .set or = difference.
---
 llvm/test/MC/AsmParser/include.ll | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/test/MC/AsmParser/include.ll b/llvm/test/MC/AsmParser/include.ll
index 22c9eaf7a36e..625fdd9e1e59 100644
--- a/llvm/test/MC/AsmParser/include.ll
+++ b/llvm/test/MC/AsmParser/include.ll
@@ -1,6 +1,4 @@
-; RUN: llc -I %p/Inputs -filetype asm -o - %s | FileCheck %s
-; UNSUPPORTED: target={{.*}}-zos{{.*}},target=nvptx{{.*}}
-; REQUIRES: default_triple
+; RUN: llc -mtriple=x86_64 -I %p/Inputs -filetype asm -o - %s | FileCheck %s
 
 module asm ".include \22module.x\22"
 

From 2c20bc5112a18a8a893e8caea6fd59c097754d74 Mon Sep 17 00:00:00 2001
From: fairywreath <65404740+fairywreath@users.noreply.github.com>
Date: Thu, 12 Jun 2025 12:54:42 -0400
Subject: [PATCH 0175/1322] [mlir][spirv] Add definitions for GL FindILsb and
 FindSMsb (#143916)

Adds SPIRV GL FindILsb and FindSMsb instructions which correspond to GL
instruction numbers 73 and 74.
---
 .../mlir/Dialect/SPIRV/IR/SPIRVGLOps.td       | 35 ++++++++++
 mlir/test/Dialect/SPIRV/IR/gl-ops.mlir        | 66 ++++++++++++++++++-
 mlir/test/Target/SPIRV/gl-ops.mlir            | 19 +++++-
 3 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
index 7ffe0c8da1ca..2ec61758ba8e 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
@@ -1274,6 +1274,41 @@ def SPIRV_GLReflectOp : SPIRV_GLBinaryArithmeticOp<"Reflect", 71, SPIRV_Float> {
 
 // ----
 
+def SPIRV_GLFindILsbOp : SPIRV_GLUnaryArithmeticOp<"FindILsb", 73, SPIRV_Integer> {
+  let summary = "Integer least-significant bit";
+
+  let description = [{
+    Results in the bit number of the least-significant 1-bit in the binary
+    representation of Value. If Value is 0, the result is -1.
+
+    Result Type and the type of Value must both be integer scalar or
+    integer vector types. Result Type and operand types must have the
+    same number of components with the same component width. Results are
+    computed per component.
+  }];
+}
+
+// ----
+
+def SPIRV_GLFindSMsbOp : SPIRV_GLUnaryArithmeticOp<"FindSMsb", 74, SPIRV_Int32> {
+  let summary = "Signed-integer most-significant bit, with Value interpreted as a signed integer";
+
+  let description = [{
+    For positive numbers, the result will be the bit number of the most significant
+    1-bit. For negative numbers, the result will be the bit number of the most
+    significant 0-bit. For a Value of 0 or -1, the result is -1.
+
+    Result Type and the type of Value must both be integer scalar or
+    integer vector types. Result Type and operand types must have the
+    same number of components with the same component width. Results are
+    computed per component.
+
+    This instruction is currently limited to 32-bit width components.
+  }];
+}
+
+// ----
+
 def SPIRV_GLFindUMsbOp : SPIRV_GLUnaryArithmeticOp<"FindUMsb", 75, SPIRV_Int32> {
   let summary = "Unsigned-integer most-significant bit";
 
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index fbcf2095dc60..2b75767feaf9 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -567,7 +567,71 @@ func.func @fmix_vector(%arg0 : vector<3xf32>, %arg1 : vector<3xf32>, %arg2 : vec
 // -----
 
 //===----------------------------------------------------------------------===//
-// spirv.GL.Exp
+// spirv.GL.FindILsb
+//===----------------------------------------------------------------------===//
+
+func.func @findimsb_scalar_i32(%arg0 : i32) -> () {
+  // CHECK: spirv.GL.FindILsb {{%.*}} : i32
+  %2 = spirv.GL.FindILsb %arg0 : i32
+  return
+}
+
+func.func @findimsb_vector_i32(%arg0 : vector<3xi32>) -> () {
+  // CHECK: spirv.GL.FindILsb {{%.*}} : vector<3xi32>
+  %2 = spirv.GL.FindILsb %arg0 : vector<3xi32>
+  return
+}
+
+func.func @findimsb_scalar_i16(%arg0 : i16) -> () {
+  // CHECK: spirv.GL.FindILsb {{%.*}} : i16
+  %2 = spirv.GL.FindILsb %arg0 : i16
+  return
+}
+
+func.func @findimsb_vector_i64(%arg0 : vector<3xi64>) -> () {
+  // CHECK: spirv.GL.FindILsb {{%.*}} : vector<3xi64>
+  %2 = spirv.GL.FindILsb %arg0 : vector<3xi64>
+  return
+}
+
+// -----
+
+func.func @findimsb_error_scalar_float(%arg0 : f32) -> () {
+  // expected-error @+1 {{operand #0 must be 8/16/32/64-bit integer or vector of 8/16/32/64-bit integer values of length 2/3/4/8/1}}
+  %2 = spirv.GL.FindILsb %arg0 : f32
+  return
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.FindSMsb
+//===----------------------------------------------------------------------===//
+
+func.func @findsmsb_scalar(%arg0 : i32) -> () {
+  // CHECK: spirv.GL.FindSMsb {{%.*}} : i32
+  %2 = spirv.GL.FindSMsb %arg0 : i32
+  return
+}
+
+func.func @findsmsb_vector(%arg0 : vector<3xi32>) -> () {
+  // CHECK: spirv.GL.FindSMsb {{%.*}} : vector<3xi32>
+  %2 = spirv.GL.FindSMsb %arg0 : vector<3xi32>
+  return
+}
+
+// -----
+
+func.func @findsmsb_error_scalar_i64(%arg0 : i64) -> () {
+  // expected-error @+1 {{operand #0 must be Int32 or vector of Int32}}
+  %2 = spirv.GL.FindSMsb %arg0 : i64
+  return
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.FindUMsb
 //===----------------------------------------------------------------------===//
 
 func.func @findumsb(%arg0 : i32) -> () {
diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir
index e4a6c6fb5a34..eacf36bfba9c 100644
--- a/mlir/test/Target/SPIRV/gl-ops.mlir
+++ b/mlir/test/Target/SPIRV/gl-ops.mlir
@@ -90,13 +90,24 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     spirv.Return
   }
 
+  spirv.func @findilsb(%arg0 : i32) "None" {
+    // CHECK: spirv.GL.FindILsb {{%.*}} : i32
+    %2 = spirv.GL.FindILsb %arg0 : i32
+    spirv.Return
+  }
+  spirv.func @findsmsb(%arg0 : i32) "None" {
+    // CHECK: spirv.GL.FindSMsb {{%.*}} : i32
+    %2 = spirv.GL.FindSMsb %arg0 : i32
+    spirv.Return
+  }
+
   spirv.func @findumsb(%arg0 : i32) "None" {
     // CHECK: spirv.GL.FindUMsb {{%.*}} : i32
     %2 = spirv.GL.FindUMsb %arg0 : i32
     spirv.Return
   }
 
-  spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>) "None" {
+  spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>, %arg3: vector<3xi32>) "None" {
     // CHECK: {{%.*}} = spirv.GL.Cross {{%.*}}, {{%.*}} : vector<3xf32>
     %0 = spirv.GL.Cross %arg1, %arg2 : vector<3xf32>
     // CHECK: {{%.*}} = spirv.GL.Normalize {{%.*}} : f32
@@ -111,6 +122,12 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %5 = spirv.GL.Distance %arg0, %arg0 : f32, f32 -> f32
     // CHECK: {{%.*}} = spirv.GL.Distance {{%.*}}, {{%.*}} : vector<3xf32>, vector<3xf32> -> f32
     %6 = spirv.GL.Distance %arg1, %arg2 : vector<3xf32>, vector<3xf32> -> f32
+    // CHECK: {{%.*}} = spirv.GL.FindILsb {{%.*}} : vector<3xi32>
+    %7 = spirv.GL.FindILsb %arg3 : vector<3xi32>
+    // CHECK: {{%.*}} = spirv.GL.FindSMsb {{%.*}} : vector<3xi32>
+    %8 = spirv.GL.FindSMsb %arg3 : vector<3xi32>
+    // CHECK: {{%.*}} = spirv.GL.FindUMsb {{%.*}} : vector<3xi32>
+    %9 = spirv.GL.FindUMsb %arg3 : vector<3xi32>
     spirv.Return
   }
 

From 1a4cf1d3edff2d4c790f597834301702cfc6dc15 Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Thu, 12 Jun 2025 10:07:45 -0700
Subject: [PATCH 0176/1322] [lldb][headers] Create Python script to fix up
 framework headers (#142051)

This commit replaces the shell script that fixes up includes for the
LLDB framework with a Python script. This script will also be used when
fixing up includes for the LLDBRPC.framework.
---
 lldb/cmake/modules/LLDBFramework.cmake        |  42 +++---
 lldb/scripts/framework-header-fix.py          | 126 ++++++++++++++++++
 .../Shell/Scripts/Inputs/Main/SBAddress.h     |  13 ++
 .../Shell/Scripts/Inputs/RPC/RPCSBAddress.h   |   9 ++
 .../Shell/Scripts/TestFrameworkFixScript.test |  11 ++
 .../Scripts/TestFrameworkFixUnifdef.test      |  12 ++
 .../Scripts/TestRPCFrameworkFixScript.test    |  14 ++
 7 files changed, 206 insertions(+), 21 deletions(-)
 create mode 100755 lldb/scripts/framework-header-fix.py
 create mode 100644 lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
 create mode 100644 lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
 create mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixScript.test
 create mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
 create mode 100644 lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test

diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake
index 8961b1afe93a..70010ffbf738 100644
--- a/lldb/cmake/modules/LLDBFramework.cmake
+++ b/lldb/cmake/modules/LLDBFramework.cmake
@@ -68,24 +68,17 @@ if(NOT APPLE_EMBEDDED)
   )
 endif()
 
-# At configuration time, collect headers for the framework bundle and copy them
-# into a staging directory. Later we can copy over the entire folder.
-file(GLOB public_headers ${LLDB_SOURCE_DIR}/include/lldb/API/*.h)
-set(generated_public_headers ${LLDB_OBJ_DIR}/include/lldb/API/SBLanguages.h)
-file(GLOB root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-*.h)
-file(GLOB root_private_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-private*.h)
-list(REMOVE_ITEM root_public_headers ${root_private_headers})
-
 find_program(unifdef_EXECUTABLE unifdef)
 
-set(lldb_header_staging ${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders)
-foreach(header
-    ${public_headers}
-    ${generated_public_headers}
-    ${root_public_headers})
+# All necessary header files will be staged in the include directory in the build directory,
+# so just copy the files from there into the framework's staging directory.
+set(lldb_build_dir_header_staging "${CMAKE_BINARY_DIR}/include/lldb")
+set(lldb_framework_header_staging "${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders")
+file(GLOB lldb_build_dir_header_staging_list ${lldb_build_dir_header_staging}/*)
+foreach(header ${lldb_build_dir_header_staging_list})
 
   get_filename_component(basename ${header} NAME)
-  set(staged_header ${lldb_header_staging}/${basename})
+  set(staged_header ${lldb_framework_header_staging}/${basename})
 
   if(unifdef_EXECUTABLE)
     # unifdef returns 0 when the file is unchanged and 1 if something was changed.
@@ -112,13 +105,20 @@ set_target_properties(liblldb-resource-headers PROPERTIES FOLDER "LLDB/Resources
 add_dependencies(liblldb-resource-headers liblldb-header-staging)
 add_dependencies(liblldb liblldb-resource-headers)
 
-# At build time, copy the staged headers into the framework bundle (and do
-# some post-processing in-place).
-add_custom_command(TARGET liblldb POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy_directory ${lldb_header_staging} $<TARGET_FILE_DIR:liblldb>/Headers
-  COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.sh $<TARGET_FILE_DIR:liblldb>/Headers ${LLDB_VERSION}
-  COMMENT "LLDB.framework: copy framework headers"
-)
+# Take the headers from the staging directory and fix up their includes for the framework.
+# Then write them to the output directory.
+# Also, run unifdef to remove any specified guards from the header files.
+file(GLOB lldb_framework_header_staging_list ${lldb_framework_header_staging}/*)
+foreach(header ${lldb_framework_header_staging_list})
+
+  set(input_header ${header})
+  set(output_header $<TARGET_FILE_DIR:liblldb>/Headers/${input_header})
+
+  add_custom_command(TARGET liblldb POST_BUILD
+    COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.py -f lldb_main -i ${input_header} -o ${output_header} -p ${unifdef_EXECUTABLE} USWIG
+    COMMENT "LLDB.framework: Fix up and copy framework headers"
+  )
+endforeach()
 
 # Copy vendor-specific headers from clang (without staging).
 if(NOT APPLE_EMBEDDED)
diff --git a/lldb/scripts/framework-header-fix.py b/lldb/scripts/framework-header-fix.py
new file mode 100755
index 000000000000..9e4e5f860a2c
--- /dev/null
+++ b/lldb/scripts/framework-header-fix.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+
+"""
+Usage: <path/to/input-directory> <path/to/output-directory>
+
+This script is used when building LLDB.framework or LLDBRPC.framework. For each framework, local includes are converted to their respective framework includes.
+
+This script is used in 2 ways:
+1. It is used on header files that are copied into LLDB.framework. For these files, local LLDB includes are converted into framework includes, e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>.
+
+2. It is used on header files for LLDBRPC.framework. For these files, includes of RPC common files will be converted to framework includes, e.g. #include <lldb-rpc/common/RPCCommon.h> -> #include <LLDBRPC/RPCCommon.h>. It will also change local includes to framework includes, e.g. #include "SBAddress.h" -> #include <LLDBRPC/SBAddress.h>
+"""
+
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+
+# Main header regexes
+INCLUDE_FILENAME_REGEX = re.compile(
+    r'#include "lldb/API/(?P<include_filename>.*){0,1}"'
+)
+
+# RPC header regexes
+RPC_COMMON_REGEX = re.compile(r"#include <lldb-rpc/common/(?P<include_filename>.*)>")
+RPC_INCLUDE_FILENAME_REGEX = re.compile(r'#include "(?P<include_filename>.*)"')
+
+
+def modify_rpc_includes(input_file_path, output_file_path):
+    with open(input_file_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+        with open(output_file_path, "w") as output_file:
+            # Local includes must be changed to RPC framework level includes.
+            # e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
+            # Also, RPC common code includes must change to RPC framework level includes.
+            # e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
+            rpc_common_matches = RPC_COMMON_REGEX.finditer(file_buffer)
+            rpc_include_filename_matches = RPC_INCLUDE_FILENAME_REGEX.finditer(
+                file_buffer
+            )
+            for match in rpc_common_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+            for match in rpc_include_filename_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+            output_file.write(file_buffer)
+
+
+def modify_main_includes(input_file_path, output_file_path):
+    with open(input_file_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+        with open(output_file_path, "w") as output_file:
+            # Local includes must be changed to framework level includes.
+            # e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+            regex_matches = INCLUDE_FILENAME_REGEX.finditer(file_buffer)
+            for match in regex_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDB/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+                output_file.write(file_buffer)
+
+
+def remove_guards(output_file_path, unifdef_path, unifdef_guards):
+    # The unifdef path should be passed in from CMake. If it wasn't there in CMake or is incorrect,
+    # find it using shutil. If shutil can't find it, then exit.
+    if not shutil.which(unifdef_path):
+        unifdef_path = shutil.which("unifdef")
+    if not unifdef_path:
+        print(
+            "Unable to find unifdef executable. Guards will not be removed from input files. Exiting..."
+        )
+        sys.exit(1)
+
+    subprocess_command = (
+        [unifdef_path, "-o", output_file_path] + unifdef_guards + [output_file_path]
+    )
+    subprocess.run(subprocess_command)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f", "--framework", choices=["lldb_main", "lldb_rpc"])
+    parser.add_argument("-i", "--input_file")
+    parser.add_argument("-o", "--output_file")
+    parser.add_argument("-p", "--unifdef_path")
+    parser.add_argument(
+        "unifdef_guards",
+        nargs="+",
+        type=str,
+        help="Guards to be removed with unifdef. These must be specified in the same way as they would be when passed directly into unifdef.",
+    )
+    args = parser.parse_args()
+    input_file_path = str(args.input_file)
+    output_file_path = str(args.output_file)
+    framework_version = args.framework
+    unifdef_path = str(args.unifdef_path)
+    # Prepend dashes to the list of guards passed in from the command line.
+    # unifdef takes the guards to remove as arguments in their own right (e.g. -USWIG)
+    # but passing them in with dashes for this script causes argparse to think that they're
+    # arguments in and of themself, so they need to passed in without dashes.
+    unifdef_guards = ["-" + guard for guard in args.unifdef_guards]
+
+    if framework_version == "lldb_main":
+        modify_main_includes(input_file_path, output_file_path)
+    if framework_version == "lldb_rpc":
+        modify_rpc_includes(input_file_path, output_file_path)
+    # After the incldues have been modified, run unifdef on the headers to remove any guards
+    # specified at the command line.
+    remove_guards(output_file_path, unifdef_path, unifdef_guards)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
new file mode 100644
index 000000000000..fecc69687cd7
--- /dev/null
+++ b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
@@ -0,0 +1,13 @@
+// This is a truncated version of an SB API file
+// used to test framework-header-fix.py to make sure the includes are correctly fixed
+// up for the LLDB.framework.
+
+// Local includes must be changed to framework level includes.
+// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+#include "lldb/API/SBDefines.h"
+#include "lldb/API/SBModule.h"
+
+// Any include guards specified at the command line must be removed.
+#ifndef SWIG
+int a = 10
+#endif
diff --git a/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
new file mode 100644
index 000000000000..556afa38a922
--- /dev/null
+++ b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
@@ -0,0 +1,9 @@
+// This is a truncated version of an SB API file generated by lldb-rpc-gen
+// used to test framework-header-fix.py to make sure the includes are correctly fixed
+// up for the LLDBRPC.framework.
+
+// Local includes must be changed to framework level includes.
+// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+#include "LLDBRPC.h"
+#include "SBDefines.h"
+#include <lldb-rpc/common/RPCPublic.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
new file mode 100644
index 000000000000..e90c3bdfc5ad
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
@@ -0,0 +1,11 @@
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
+
+# Local includes must be changed to framework level includes.
+# e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+CHECK: #include <LLDB/SBDefines.h>
+CHECK: #include <LLDB/SBModule.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
new file mode 100644
index 000000000000..a7e82d2f3640
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
@@ -0,0 +1,12 @@
+# REQUIRES: system-darwin
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
+
+# Any include guards specified at the command line must be removed.
+CHECK-NOT: #ifndef SWIG
+CHECK: int a = 10
+CHECK-NOT: #endif
diff --git a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
new file mode 100644
index 000000000000..8ba03a8c2afa
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
@@ -0,0 +1,14 @@
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_rpc -i %p/Inputs/Main/RPCSBAddress.h -o %t/Outputs/RPCSBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/RPCSBAddress.h | FileCheck %s
+
+# Local includes must be changed to RPC framework level includes.
+# e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
+# Also, RPC common code includes must change to RPC framework level includes.
+# e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
+CHECK: #include <LLDBRPC/RPCPublic.h>
+CHECK: #include <LLDBRPC/SBDefines.h>
+CHECK: #include <LLDBRPC/LLDBRPC.h>

From 217304a09949de73a8def5ee4c7ed9510449ce4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=83=E5=9C=8B=E5=BA=AD?= <we3223@gmail.com>
Date: Fri, 13 Jun 2025 01:08:07 +0800
Subject: [PATCH 0177/1322] [X86] Use X86FixupInstTunings to select between
 (V)MOVSS/D and (V)BLENDPS/D (#143895)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix https://github.com/llvm/llvm-project/issues/142588
Following @RKSimon’s suggestion, the transformation applies only when
the blend mask is exactly 1, indicating that the instruction behaves
like a move. Additionally, the conversion will only be performed when
optimizing for size or when the target prefers MOVSS/D over BLENDPS/D
for performance reasons.

The switch-case instructions were identified with GPT O.O .

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp    | 15 ++++
 .../test/CodeGen/X86/2012-01-12-extract-sv.ll |  4 +-
 llvm/test/CodeGen/X86/avx-insertelt.ll        |  6 +-
 .../X86/avx512-intrinsics-fast-isel.ll        |  4 +-
 .../CodeGen/X86/avx512-intrinsics-upgrade.ll  |  2 +-
 llvm/test/CodeGen/X86/avx512-intrinsics.ll    |  2 +-
 .../test/CodeGen/X86/avx512copy-intrinsics.ll |  2 +-
 llvm/test/CodeGen/X86/build-vector-512.ll     |  6 +-
 llvm/test/CodeGen/X86/buildvec-extract.ll     |  6 +-
 .../CodeGen/X86/canonicalize-vars-f16-type.ll |  8 +-
 .../CodeGen/X86/coalesce_commute_movsd.ll     |  4 +-
 llvm/test/CodeGen/X86/combine-and.ll          |  4 +-
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   | 80 ++++++++++++++-----
 .../CodeGen/X86/fminimumnum-fmaximumnum.ll    |  2 +-
 llvm/test/CodeGen/X86/fmsubadd-combine.ll     |  4 +-
 .../test/CodeGen/X86/fp-strict-scalar-fp16.ll | 14 ++--
 .../X86/fp-strict-scalar-inttofp-fp16.ll      | 24 +++---
 .../X86/fp-strict-scalar-round-fp16.ll        | 14 ++--
 llvm/test/CodeGen/X86/half-constrained.ll     |  6 +-
 llvm/test/CodeGen/X86/half-darwin.ll          |  2 +-
 llvm/test/CodeGen/X86/insertelement-zero.ll   |  4 +-
 llvm/test/CodeGen/X86/masked_expandload.ll    |  2 +-
 llvm/test/CodeGen/X86/masked_gather.ll        | 12 +--
 .../test/CodeGen/X86/masked_gather_scatter.ll |  2 +-
 llvm/test/CodeGen/X86/masked_load.ll          |  2 +-
 llvm/test/CodeGen/X86/pr40730.ll              |  4 +-
 llvm/test/CodeGen/X86/scalarize-fp.ll         |  2 +-
 .../CodeGen/X86/sse-insertelt-from-mem.ll     |  2 +-
 llvm/test/CodeGen/X86/sse-insertelt.ll        |  2 +-
 .../CodeGen/X86/sse-intrinsics-fast-isel.ll   | 16 ++--
 llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll  | 16 ++--
 .../CodeGen/X86/sse2-intrinsics-fast-isel.ll  |  8 +-
 llvm/test/CodeGen/X86/sse2.ll                 |  2 +-
 llvm/test/CodeGen/X86/sse41.ll                | 12 +--
 .../test/CodeGen/X86/stack-folding-fp-avx1.ll |  4 +-
 llvm/test/CodeGen/X86/vec-strict-128-fp16.ll  |  2 +-
 .../X86/vec-strict-fptoint-128-fp16.ll        | 32 ++++----
 llvm/test/CodeGen/X86/vec_extract-avx.ll      |  8 +-
 llvm/test/CodeGen/X86/vec_floor.ll            | 68 ++++++++--------
 llvm/test/CodeGen/X86/vec_ss_load_fold.ll     |  8 +-
 llvm/test/CodeGen/X86/vector-blend.ll         |  2 +-
 .../CodeGen/X86/vector-half-conversions.ll    |  4 +-
 .../vector-interleaved-store-i32-stride-5.ll  |  4 +-
 .../vector-interleaved-store-i32-stride-7.ll  |  2 +-
 .../test/CodeGen/X86/vector-shuffle-128-v2.ll |  2 +-
 .../test/CodeGen/X86/vector-shuffle-128-v4.ll | 16 ++--
 .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 32 +++-----
 .../test/CodeGen/X86/vector-shuffle-256-v8.ll |  4 +-
 .../X86/vector-shuffle-combining-avx2.ll      |  2 +-
 .../X86/vector-shuffle-combining-ssse3.ll     |  4 +-
 .../X86/vector-shuffle-combining-xop.ll       |  2 +-
 llvm/test/CodeGen/X86/vector-zmov.ll          |  2 +-
 llvm/test/CodeGen/X86/vselect.ll              |  8 +-
 53 files changed, 272 insertions(+), 229 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 6bb7600dedca..fd13305d8a73 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -222,7 +222,22 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessUNPCKToIntDomain(NewOpc);
   };
 
+  auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
+    if (MI.getOperand(NumOperands - 1).getImm() != 1)
+      return false;
+    bool Force = MF.getFunction().hasOptSize();
+    if (!Force && !NewOpcPreferable(MovOpc))
+      return false;
+    MI.setDesc(TII->get(MovOpc));
+    MI.removeOperand(NumOperands - 1);
+    return true;
+  };
+
   switch (Opc) {
+  case X86::VBLENDPSrri:
+    return ProcessBLENDToMOV(X86::VMOVSSrr);
+  case X86::VBLENDPDrri:
+    return ProcessBLENDToMOV(X86::VMOVSDrr);
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
   case X86::VPERMILPDYri:
diff --git a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 254a53fcac4d..65273870c3df 100644
--- a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -11,7 +11,7 @@ define void @endless_loop() {
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vmovaps %ymm0, (%eax)
 ; AVX1-NEXT:    vmovaps %ymm1, (%eax)
 ; AVX1-NEXT:    vzeroupper
@@ -21,7 +21,7 @@ define void @endless_loop() {
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vbroadcastss (%eax), %xmm0
 ; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 95a3169a5b16..02e6c9649c9a 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -94,7 +94,7 @@ define <8 x float> @insert_f32_firstelt_of_high_subvector(<8 x float> %x, float
 ; AVX-LABEL: insert_f32_firstelt_of_high_subvector:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
 ;
@@ -202,9 +202,9 @@ define <4 x i64> @insert_i64_firstelt_of_high_subvector(<4 x i64> %x, i64 %s) {
 define <8 x float> @insert_f32_firstelts(<8 x float> %x, float %s) {
 ; AVX-LABEL: insert_f32_firstelts:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index a8574c0b7516..30bf1a261f4b 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1843,7 +1843,7 @@ define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
 ; X86-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
 ; X86-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
 ; X86-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
-; X86-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_cvtu64_sd:
@@ -1891,7 +1891,7 @@ define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
 ; X86-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index c1ef500d9d3d..aae48aba93be 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -10483,7 +10483,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; CHECK-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; CHECK-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %q = load float, ptr %ptr_b
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 926af4e9957a..f9b5994a18d3 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -6505,7 +6505,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %q = load float, ptr %ptr_b
   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
diff --git a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
index a7ca23792e6f..a2af7df44010 100644
--- a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
 ; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
 ; NOAVX512MOVZXC:       # %bb.0:
 ; NOAVX512MOVZXC-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; NOAVX512MOVZXC-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; NOAVX512MOVZXC-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
 ; NOAVX512MOVZXC-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOAVX512MOVZXC-NEXT:    retq # encoding: [0xc3]
   %res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index 789196c5e484..69d17fe3ab69 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -578,7 +578,7 @@ define <16 x float> @test_buildvector_16f32_2_var(float %a0, float %a1) {
 ; AVX-32-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,17,0,0]
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
-; AVX-32-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
 ; AVX-32-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm2[0]
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
@@ -626,7 +626,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
 ; AVX-32-NEXT:    vbroadcastss (%ecx), %xmm1
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
-; AVX-32-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
 ; AVX-32-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
@@ -640,7 +640,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
 ; AVX-64-NEXT:    vbroadcastss (%rdi), %xmm1
 ; AVX-64-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-64-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
-; AVX-64-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
+; AVX-64-NEXT:    vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 545c57fed4b2..9d856ed7647c 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -42,7 +42,7 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; AVX-LABEL: extract0_i32_zext_insert0_i64_zero:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
   %z = zext i32 %e to i64
@@ -85,7 +85,7 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 1
   %z = zext i32 %e to i64
@@ -130,7 +130,7 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index 556b0deaf4c8..8b3aa2964db0 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -51,7 +51,7 @@ define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
@@ -149,7 +149,7 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
@@ -235,12 +235,12 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind
 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm2
 ; AVX512-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX512-NEXT:    vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
index b42fd957d7f4..086df87d1d5f 100644
--- a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -44,12 +44,12 @@ define <4 x float> @insert_f32(float %a0, <4 x float> %a1) {
 ;
 ; AVX-LABEL: insert_f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: insert_f32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    retq
  %1 = insertelement <4 x float> %a1, float %a0, i32 0
  ret <4 x float> %1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index e5594dc9c5e3..173457ff4667 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -37,7 +37,7 @@ define <4 x i32> @test1(<4 x i32> %A) {
 ; AVX-LABEL: test1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0>
   ret <4 x i32> %1
@@ -195,7 +195,7 @@ define <4 x i32> @test11(<4 x i32> %A) {
 ; AVX-LABEL: test11:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1>
   ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 54390d8b66f7..2b5f09113ca6 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -86,10 +86,20 @@ define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
   %or = or <4 x i32> %shuf1, %shuf2
@@ -108,10 +118,20 @@ define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test5:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test5:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test5:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test5:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
@@ -241,10 +261,20 @@ define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test11:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test11:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test11:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test11:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
   %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
   %or = or <4 x i32> %and1, %and2
@@ -263,10 +293,20 @@ define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test12:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test12:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test12:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test12:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
   %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
   %or = or <4 x i32> %and1, %and2
@@ -395,18 +435,18 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
 ; AVX1-LABEL: test18:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test18:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 33bc93d0fe4d..95d350d45d90 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1343,7 +1343,7 @@ define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
 ; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0]
 ; AVX-NEXT:    vminpd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX10_2-LABEL: test_fminimumnum_vector_nan:
diff --git a/llvm/test/CodeGen/X86/fmsubadd-combine.ll b/llvm/test/CodeGen/X86/fmsubadd-combine.ll
index ddf51b858cdd..674a1d5ad779 100644
--- a/llvm/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/llvm/test/CodeGen/X86/fmsubadd-combine.ll
@@ -12,7 +12,7 @@ define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x dou
 ; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; NOFMA-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
 ; NOFMA-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
-; NOFMA-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; NOFMA-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; NOFMA-NEXT:    retq
 ;
 ; FMA3-LABEL: mul_subadd_pd128:
@@ -191,7 +191,7 @@ define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2
 ; CHECK-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vsubpd %xmm0, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; CHECK-NEXT:    retq
 entry:
   %AB = fmul <2 x double> %A, %B
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index fbc798d8bbe4..b013ddad19a9 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -44,7 +44,7 @@ define half @fadd_f16(half %a, half %b) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -92,7 +92,7 @@ define half @fsub_f16(half %a, half %b) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -140,7 +140,7 @@ define half @fmul_f16(half %a, half %b) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -188,7 +188,7 @@ define half @fdiv_f16(half %a, half %b) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -400,7 +400,7 @@ define void @fsqrt_f16(ptr %a) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; AVX-NEXT:    retq
@@ -469,7 +469,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq fmaf@PLT
 ; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    popq %rax
 ; F16C-NEXT:    retq
@@ -490,7 +490,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
index c31bee5ff103..6312a26db9bf 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
@@ -35,7 +35,7 @@ define half @sitofp_i1tof16(i1 %x) #0 {
 ; AVX-NEXT:    movsbl %dil, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -76,7 +76,7 @@ define half @sitofp_i8tof16(i8 %x) #0 {
 ; AVX-NEXT:    movsbl %dil, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -112,7 +112,7 @@ define half @sitofp_i16tof16(i16 %x) #0 {
 ; AVX-NEXT:    movswl %di, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -146,7 +146,7 @@ define half @sitofp_i32tof16(i32 %x) #0 {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -178,7 +178,7 @@ define half @sitofp_i64tof16(i64 %x) #0 {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -213,7 +213,7 @@ define half @uitofp_i1tof16(i1 %x) #0 {
 ; AVX-NEXT:    andl $1, %edi
 ; AVX-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -251,7 +251,7 @@ define half @uitofp_i8tof16(i8 %x) #0 {
 ; AVX-NEXT:    movzbl %dil, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -287,7 +287,7 @@ define half @uitofp_i16tof16(i16 %x) #0 {
 ; AVX-NEXT:    movzwl %di, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -323,7 +323,7 @@ define half @uitofp_i32tof16(i32 %x) #0 {
 ; F16C-NEXT:    movl %edi, %eax
 ; F16C-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
@@ -331,7 +331,7 @@ define half @uitofp_i32tof16(i32 %x) #0 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
@@ -387,7 +387,7 @@ define half @uitofp_i64tof16(i64 %x) #0 {
 ; F16C-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:  .LBB9_2:
 ; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
@@ -395,7 +395,7 @@ define half @uitofp_i64tof16(i64 %x) #0 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
index c834ddbf46f7..85a43394a1dc 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
@@ -31,7 +31,7 @@ define half @fceil32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -67,7 +67,7 @@ define half @ffloor32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -103,7 +103,7 @@ define half @ftrunc32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -139,7 +139,7 @@ define half @frint32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -176,7 +176,7 @@ define half @fnearbyint32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -213,7 +213,7 @@ define half @froundeven16(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -251,7 +251,7 @@ define half @fround16(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    callq roundf@PLT
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    popq %rax
 ; AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll
index f1874cc03000..d5f2060ca20e 100644
--- a/llvm/test/CodeGen/X86/half-constrained.ll
+++ b/llvm/test/CodeGen/X86/half-constrained.ll
@@ -194,7 +194,7 @@ define void @float_to_half(float %0) strictfp {
 ; X64-F16C-LABEL: float_to_half:
 ; X64-F16C:       # %bb.0:
 ; X64-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; X64-F16C-NEXT:    movq a@GOTPCREL(%rip), %rax
 ; X64-F16C-NEXT:    vpextrw $0, %xmm0, (%rax)
@@ -350,7 +350,7 @@ define void @add() strictfp {
 ; X86-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; X86-F16C-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; X86-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; X86-F16C-NEXT:    vpextrw $0, %xmm0, c
 ; X86-F16C-NEXT:    retl
@@ -387,7 +387,7 @@ define void @add() strictfp {
 ; X64-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; X64-F16C-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; X64-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; X64-F16C-NEXT:    movq c@GOTPCREL(%rip), %rax
 ; X64-F16C-NEXT:    vpextrw $0, %xmm0, (%rax)
diff --git a/llvm/test/CodeGen/X86/half-darwin.ll b/llvm/test/CodeGen/X86/half-darwin.ll
index 3cbf5c11235e..8765f7dbe6d3 100644
--- a/llvm/test/CodeGen/X86/half-darwin.ll
+++ b/llvm/test/CodeGen/X86/half-darwin.ll
@@ -105,7 +105,7 @@ define void @strict_truncsfhf(float %in, ptr %ptr) nounwind strictfp {
 ; CHECK-F16C-LABEL: strict_truncsfhf:
 ; CHECK-F16C:       ## %bb.0:
 ; CHECK-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; CHECK-F16C-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index 952940d14180..31551360be48 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -508,8 +508,8 @@ define <8 x float> @PR41512_v8f32(float %x, float %y) {
 ; AVX-LABEL: PR41512_v8f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %ins1 = insertelement <8 x float> zeroinitializer, float %x, i32 0
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 4c5b67962a58..b7fe8e053fa1 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1120,7 +1120,7 @@ define <2 x float> @expandload_v2f32_v2i1(ptr %base, <2 x float> %src0, <2 x i32
 ; AVX1OR2-NEXT:    retq
 ; AVX1OR2-NEXT:  LBB4_1: ## %cond.load
 ; AVX1OR2-NEXT:    vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1OR2-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX1OR2-NEXT:    addq $4, %rdi
 ; AVX1OR2-NEXT:    testb $2, %al
 ; AVX1OR2-NEXT:    je LBB4_4
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index 559a7ec0930b..324a371632c4 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -65,7 +65,7 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, <
 ; AVX1-NEXT:  # %bb.1: # %cond.load
 ; AVX1-NEXT:    vmovq %xmm0, %rcx
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT:  .LBB0_2: # %else
 ; AVX1-NEXT:    testb $2, %al
 ; AVX1-NEXT:    je .LBB0_4
@@ -105,7 +105,7 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, <
 ; AVX2-NEXT:  # %bb.1: # %cond.load
 ; AVX2-NEXT:    vmovq %xmm0, %rcx
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT:  .LBB0_2: # %else
 ; AVX2-NEXT:    testb $2, %al
 ; AVX2-NEXT:    je .LBB0_4
@@ -254,7 +254,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(ptr %base, <4 x i32> %idx, <4 x i32
 ; AVX1-NEXT:  # %bb.1: # %cond.load
 ; AVX1-NEXT:    vmovq %xmm0, %rcx
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT:  .LBB1_2: # %else
 ; AVX1-NEXT:    testb $2, %al
 ; AVX1-NEXT:    je .LBB1_4
@@ -299,7 +299,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(ptr %base, <4 x i32> %idx, <4 x i32
 ; AVX2-NEXT:  # %bb.1: # %cond.load
 ; AVX2-NEXT:    vmovq %xmm0, %rcx
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT:  .LBB1_2: # %else
 ; AVX2-NEXT:    testb $2, %al
 ; AVX2-NEXT:    je .LBB1_4
@@ -451,7 +451,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(ptr %base, <4 x i64> %idx, <4 x i32
 ; AVX1-NEXT:  # %bb.1: # %cond.load
 ; AVX1-NEXT:    vmovq %xmm0, %rcx
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT:  .LBB2_2: # %else
 ; AVX1-NEXT:    testb $2, %al
 ; AVX1-NEXT:    je .LBB2_4
@@ -495,7 +495,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(ptr %base, <4 x i64> %idx, <4 x i32
 ; AVX2-NEXT:  # %bb.1: # %cond.load
 ; AVX2-NEXT:    vmovq %xmm0, %rcx
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT:  .LBB2_2: # %else
 ; AVX2-NEXT:    testb $2, %al
 ; AVX2-NEXT:    je .LBB2_4
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index af018d83d520..4e6f666fa05d 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -3481,7 +3481,7 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
 ; X86-SKX-NEXT:  .LBB47_1: # %cond.load
 ; X86-SKX-NEXT:    vmovd %xmm0, %ecx
 ; X86-SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SKX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; X86-SKX-NEXT:    vmovss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; X86-SKX-NEXT:    testb $2, %al
 ; X86-SKX-NEXT:    je .LBB47_4
 ; X86-SKX-NEXT:  .LBB47_3: # %cond.load1
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 89459a2d1017..e2e26da95b87 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6424,7 +6424,7 @@ define <4 x i32> @mload_constmask_v4i32(ptr %addr, <4 x i32> %dst) {
 ; AVX1:       ## %bb.0:
 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
 ; AVX1-NEXT:    vmaskmovps (%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: mload_constmask_v4i32:
diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll
index 164bf203d054..304d071e5d6e 100644
--- a/llvm/test/CodeGen/X86/pr40730.ll
+++ b/llvm/test/CodeGen/X86/pr40730.ll
@@ -5,7 +5,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: shuffle_v8i32_0dcd3f14:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
@@ -26,7 +26,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14_constant(<8 x i32> %a0)  {
 ; CHECK-LABEL: shuffle_v8i32_0dcd3f14_constant:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7]
diff --git a/llvm/test/CodeGen/X86/scalarize-fp.ll b/llvm/test/CodeGen/X86/scalarize-fp.ll
index ea9b69f8f5b8..ae24d3487c4b 100644
--- a/llvm/test/CodeGen/X86/scalarize-fp.ll
+++ b/llvm/test/CodeGen/X86/scalarize-fp.ll
@@ -911,7 +911,7 @@ define <4 x float> @merge_fcmp_cmpeqss_v4f32(<4 x float> %x, <4 x float> %y) {
 ; AVX1-LABEL: merge_fcmp_cmpeqss_v4f32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vcmpeqss %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: merge_fcmp_cmpeqss_v4f32:
diff --git a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
index f0af8fc29969..5ae905583571 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
@@ -22,7 +22,7 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, ptr %s.addr) {
 ; AVX-LABEL: insert_f32_firstelt:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %s = load float, ptr %s.addr
   %i0 = insertelement <4 x float> %x, float %s, i32 0
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index 34de7e65465d..1e4fe81abc13 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -19,7 +19,7 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, float %s) {
 ;
 ; AVX-LABEL: insert_f32_firstelt:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %i0 = insertelement <4 x float> %x, float %s, i32 0
   ret <4 x float> %i0
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index b1f9872f7b6e..2e2e78a6da51 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -190,7 +190,7 @@ define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; AVX-LABEL: test_mm_cmpge_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
@@ -232,7 +232,7 @@ define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; AVX-LABEL: test_mm_cmpgt_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
@@ -382,7 +382,7 @@ define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; AVX-LABEL: test_mm_cmpnge_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
@@ -424,7 +424,7 @@ define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; AVX-LABEL: test_mm_cmpngt_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
@@ -1603,7 +1603,7 @@ define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
 ;
 ; AVX-LABEL: test_mm_move_ss:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
@@ -2219,7 +2219,7 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ; X86-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X86-AVX1-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; X86-AVX1-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2228,7 +2228,7 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ; X86-AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X86-AVX512-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; X86-AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2243,7 +2243,7 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ; X64-AVX-LABEL: test_mm_set_ss:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X64-AVX-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; X64-AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
 ; X64-AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <4 x float> undef, float %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 47d35f3636d4..006c3006350c 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1165,7 +1165,7 @@ define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
 ; AVX-LABEL: insert_test5_sub_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}}
   %1 = fsub <4 x float> %b, %a
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1203,7 +1203,7 @@ define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
 ; AVX-LABEL: insert_test5_div_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}}
   %1 = fdiv <4 x float> %b, %a
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1241,7 +1241,7 @@ define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
 ; AVX-LABEL: insert_test5_sub_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}}
   %1 = fsub <2 x double> %b, %a
   %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
@@ -1279,7 +1279,7 @@ define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
 ; AVX-LABEL: insert_test5_div_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}}
   %1 = fdiv <2 x double> %b, %a
   %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
@@ -1318,7 +1318,7 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 ; X86-AVX1-NEXT:  # %bb.1:
 ; X86-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
 ; X86-AVX1-NEXT:  .LBB70_2:
-; X86-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X86-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: add_ss_mask:
@@ -1360,7 +1360,7 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 ; X64-AVX1-NEXT:  # %bb.1:
 ; X64-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
 ; X64-AVX1-NEXT:  .LBB70_2:
-; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X64-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX512-LABEL: add_ss_mask:
@@ -1412,7 +1412,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X86-AVX1-NEXT:  # %bb.1:
 ; X86-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
 ; X86-AVX1-NEXT:  .LBB71_2:
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X86-AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: add_sd_mask:
@@ -1454,7 +1454,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X64-AVX1-NEXT:  # %bb.1:
 ; X64-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
 ; X64-AVX1-NEXT:  .LBB71_2:
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X64-AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX512-LABEL: add_sd_mask:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index dbdc45abb24d..18a6be8aaf0b 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -631,7 +631,7 @@ define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwi
 ; AVX-LABEL: test_mm_cmpge_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02]
-; AVX-NEXT:    vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
@@ -748,7 +748,7 @@ define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwi
 ; AVX-LABEL: test_mm_cmpgt_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01]
-; AVX-NEXT:    vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
@@ -976,7 +976,7 @@ define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounw
 ; AVX-LABEL: test_mm_cmpnge_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06]
-; AVX-NEXT:    vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
@@ -1021,7 +1021,7 @@ define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounw
 ; AVX-LABEL: test_mm_cmpngt_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05]
-; AVX-NEXT:    vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index 3e5d76eae0bb..e1d91b407fc2 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -670,7 +670,7 @@ define <4 x i32> @PR19721(<4 x i32> %i) {
 ; AVX-LABEL: PR19721:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}}
 ;
 ; X64-SSE-LABEL: PR19721:
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 2d7258a49f5d..53a10ab0c26f 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -353,7 +353,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -361,7 +361,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -373,7 +373,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ;
 ; X64-AVX-LABEL: blendps_not_insertps_1:
 ; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X64-AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; X64-AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
@@ -440,7 +440,7 @@ define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nou
 ;
 ; AVX-LABEL: blendps_not_insertps_2:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %tmp2 = extractelement <4 x float> %t2, i32 0
@@ -1207,7 +1207,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; AVX1-LABEL: i32_shuf_X00A:
 ; AVX1:       ## %bb.0:
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX1-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
+; AVX1-NEXT:    vmovss %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xea,0x10,0xc0]
 ; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vshufps $0, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0x00]
 ; AVX1-NEXT:    ## xmm1 = xmm1[0,0,0,0]
@@ -1218,7 +1218,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; AVX512-LABEL: i32_shuf_X00A:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX512-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xea,0x10,0xc0]
 ; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9]
 ; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
index d7404c9e7c7d..665a84a26fea 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1216,7 +1216,7 @@ define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
@@ -1307,7 +1307,7 @@ define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
index 2c3d7ceb37d0..a6e288608c87 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
@@ -154,7 +154,7 @@ define <4 x float> @f18(<4 x float> %a0, <8 x half> %a1) #0 {
 ; CHECK-LABEL: f18:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ext = extractelement <8 x half> %a1, i32 0
   %cvt = call float @llvm.experimental.constrained.fpext.f32.f16(half %ext,
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
index 0126685f2bb3..bde14e75dfc0 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
@@ -32,7 +32,7 @@ define <2 x i64> @strict_vector_fptosi_v2f16_to_v2i64(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2qq %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
@@ -54,7 +54,7 @@ define <2 x i64> @strict_vector_fptoui_v2f16_to_v2i64(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2uqq %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
@@ -76,14 +76,14 @@ define <2 x i32> @strict_vector_fptosi_v2f16_to_v2i32(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2dq %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
 ; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i32:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; NOVL-NEXT:    vcvttph2dq %ymm0, %zmm0
@@ -99,14 +99,14 @@ define <2 x i32> @strict_vector_fptoui_v2f16_to_v2i32(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2udq %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
 ; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i32:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; NOVL-NEXT:    vcvttph2udq %ymm0, %zmm0
@@ -122,14 +122,14 @@ define <2 x i16> @strict_vector_fptosi_v2f16_to_v2i16(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2w %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
 ; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i16:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
@@ -145,14 +145,14 @@ define <2 x i16> @strict_vector_fptoui_v2f16_to_v2i16(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2uw %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
 ; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i16:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; NOVL-NEXT:    vcvttph2uw %zmm0, %zmm0
@@ -168,7 +168,7 @@ define <2 x i8> @strict_vector_fptosi_v2f16_to_v2i8(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2w %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovwb %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
@@ -176,7 +176,7 @@ define <2 x i8> @strict_vector_fptosi_v2f16_to_v2i8(<2 x half> %a) #0 {
 ; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i8:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
@@ -192,7 +192,7 @@ define <2 x i8> @strict_vector_fptoui_v2f16_to_v2i8(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2uw %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovwb %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
@@ -200,7 +200,7 @@ define <2 x i8> @strict_vector_fptoui_v2f16_to_v2i8(<2 x half> %a) #0 {
 ; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i8:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; NOVL-NEXT:    vcvttph2uw %zmm0, %zmm0
@@ -216,7 +216,7 @@ define <2 x i1> @strict_vector_fptosi_v2f16_to_v2i1(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2w %xmm0, %xmm0
 ; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovw2m %xmm0, %k1
@@ -247,7 +247,7 @@ define <2 x i1> @strict_vector_fptoui_v2f16_to_v2i1(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2uw %xmm0, %xmm0
 ; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovw2m %xmm0, %k1
diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll
index 341a703a21bd..4b70933334fb 100644
--- a/llvm/test/CodeGen/X86/vec_extract-avx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll
@@ -119,7 +119,7 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -128,7 +128,7 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -169,7 +169,7 @@ define void @legal_vzmovl_2f32_8f32(ptr %in, ptr %out) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -178,7 +178,7 @@ define void @legal_vzmovl_2f32_8f32(ptr %in, ptr %out) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index abb85ac83464..0538cac12cbf 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -828,13 +828,13 @@ define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
 ; AVX-LABEL: floor_ss:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_ss:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    retq
   %s = extractelement <4 x float> %x, i32 0
   %call = call float @llvm.floor.f32(float %s)
@@ -853,13 +853,13 @@ define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
 ; AVX-LABEL: floor_sd:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_sd:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX512-NEXT:    retq
   %s = extractelement <2 x double> %x, i32 0
   %call = call double @llvm.floor.f64(double %s)
@@ -1372,7 +1372,7 @@ define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB52_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_ss:
@@ -1414,7 +1414,7 @@ define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwi
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB53_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_ss:
@@ -1452,7 +1452,7 @@ define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB54_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_sd:
@@ -1494,7 +1494,7 @@ define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nou
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB55_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_sd:
@@ -1532,7 +1532,7 @@ define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x flo
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB56_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_ss_trunc:
@@ -1572,11 +1572,11 @@ define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
 ; AVX-NEXT:    jne LBB57_1
 ; AVX-NEXT:  ## %bb.2:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB57_1:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_ss_trunc:
@@ -1613,7 +1613,7 @@ define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB58_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_sd_trunc:
@@ -1657,7 +1657,7 @@ define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB59_1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_sd_trunc:
@@ -1689,7 +1689,7 @@ define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x flo
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm3
 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_ss_mask8:
@@ -1723,7 +1723,7 @@ define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwin
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_ss_mask8:
@@ -1756,7 +1756,7 @@ define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm3
 ; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_sd_mask8:
@@ -1790,7 +1790,7 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_sd_mask8:
@@ -1818,13 +1818,13 @@ define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
 ; AVX-LABEL: ceil_ss:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_ss:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    retq
   %s = extractelement <4 x float> %x, i32 0
   %call = call float @llvm.ceil.f32(float %s)
@@ -1843,13 +1843,13 @@ define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
 ; AVX-LABEL: ceil_sd:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_sd:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX512-NEXT:    retq
   %s = extractelement <2 x double> %x, i32 0
   %call = call double @llvm.ceil.f64(double %s)
@@ -2362,7 +2362,7 @@ define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w,
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB78_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_ss:
@@ -2404,7 +2404,7 @@ define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwin
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB79_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_ss:
@@ -2442,7 +2442,7 @@ define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double>
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB80_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_sd:
@@ -2484,7 +2484,7 @@ define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) noun
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB81_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_sd:
@@ -2522,7 +2522,7 @@ define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x floa
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB82_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_ss_trunc:
@@ -2562,11 +2562,11 @@ define <4 x float> @ceil_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
 ; AVX-NEXT:    jne LBB83_1
 ; AVX-NEXT:  ## %bb.2:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB83_1:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_ss_trunc:
@@ -2603,7 +2603,7 @@ define <2 x double> @ceil_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x d
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB84_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_sd_trunc:
@@ -2647,7 +2647,7 @@ define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB85_1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_sd_trunc:
@@ -2679,7 +2679,7 @@ define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x floa
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm3
 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_ss_mask8:
@@ -2713,7 +2713,7 @@ define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_ss_mask8:
@@ -2746,7 +2746,7 @@ define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x d
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm3
 ; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_sd_mask8:
@@ -2780,7 +2780,7 @@ define <2 x double> @ceil_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounw
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_sd_mask8:
diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
index e4304f2cc214..e73d345d0fcd 100644
--- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -38,7 +38,7 @@ define i16 @test1(float %f) nounwind {
 ; X86_AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86_AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86_AVX1-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; X86_AVX1-NEXT:    vcvttss2si %xmm0, %eax
@@ -50,7 +50,7 @@ define i16 @test1(float %f) nounwind {
 ; X64_AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64_AVX1-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; X64_AVX1-NEXT:    vcvttss2si %xmm0, %eax
@@ -63,7 +63,7 @@ define i16 @test1(float %f) nounwind {
 ; X86_AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86_AVX512-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
@@ -76,7 +76,7 @@ define i16 @test1(float %f) nounwind {
 ; X64_AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64_AVX512-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index bd5c9363794a..a38028e87532 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -54,7 +54,7 @@ define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
 ;
 ; AVX-LABEL: vsel_float2:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 entry:
   %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 1bbf92e45fc6..01159d4135d8 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -5034,7 +5034,7 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
 ; F16C-LABEL: fptoui_2f16_to_4i32:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vcvttps2dq %xmm0, %xmm1
 ; F16C-NEXT:    vpsrad $31, %xmm1, %xmm2
@@ -5048,7 +5048,7 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
 ; AVX512F-LABEL: fptoui_2f16_to_4i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index b1986e7af3ec..d83f969dd033 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -51,7 +51,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm0[1,1]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
@@ -452,7 +452,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [7,3,7,3,7,3,7,3]
 ; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX2-FCP-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%r9)
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, (%r9)
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm3[3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 769f0ec47db0..bfd1e3ece200 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -651,7 +651,7 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3]
 ; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm3, %ymm3
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
+; AVX2-FCP-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
 ; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 1d389f981722..8679c262e0bf 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -877,7 +877,7 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
 ; AVX-LABEL: shuffle_v2i64_bitcast_z123:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %bitcast32 = bitcast <2 x i64> %x to <4 x float>
   %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> <float 1.000000e+00, float poison, float poison, float poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 0eb72c8bc0be..e1eb1a6704e3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -506,7 +506,7 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
 ; AVX1OR2-LABEL: shuffle_v4i32_4012:
 ; AVX1OR2:       # %bb.0:
 ; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,1,2]
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1OR2-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i32_4012:
@@ -618,7 +618,7 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
 ; AVX-LABEL: shuffle_v4f32_4zzz:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %shuffle
@@ -1164,7 +1164,7 @@ define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
 ; AVX-LABEL: shuffle_v4i32_4zzz:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x i32> %shuffle
@@ -1202,14 +1202,14 @@ define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
 ; AVX1-LABEL: shuffle_v4i32_z4zz:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
 ; AVX2-SLOW:       # %bb.0:
 ; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-SLOW-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -1258,14 +1258,14 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
 ; AVX1-LABEL: shuffle_v4i32_zz4z:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
 ; AVX2-SLOW:       # %bb.0:
 ; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-SLOW-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -2138,7 +2138,7 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
 ; AVX-LABEL: insert_reg_and_zero_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %v = insertelement <4 x float> poison, float %a, i32 0
   %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index d848a8b87921..94fc982d87e5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2376,33 +2376,21 @@ define <4 x i64> @shuffle_v4i64_0zzz_pgso(<4 x i64> %a) !prof !14 {
 }
 
 define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 {
-; AVX1OR2-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT:    retq
   %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <8 x float> %b
 }
 
 define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 {
-; AVX1OR2-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT:    retq
   %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i32> %b
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index be3258765d87..950683cbfaee 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2132,7 +2132,7 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_08991abb:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
@@ -3402,7 +3402,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_0dcd3f14:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 38920aa5d7a1..f4f4842e4c69 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -435,7 +435,7 @@ define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
 ; CHECK-LABEL: combine_pshufb_as_vzmovl_32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = bitcast <8 x float> %a0 to <32 x i8>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 12d494c32b65..0570e2f580c1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -75,7 +75,7 @@ define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
 ;
 ; AVX-LABEL: combine_pshufb_as_movss:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 3, i32 2, i32 1>
   %2 = bitcast <4 x float> %1 to <16 x i8>
@@ -137,7 +137,7 @@ define <4 x float> @combine_pshufb_as_vzmovl_32(<4 x float> %a0) {
 ; AVX-LABEL: combine_pshufb_as_vzmovl_32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %1 = bitcast <4 x float> %a0 to <16 x i8>
   %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index f53c7a337017..e8bf5ec2b49a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -106,7 +106,7 @@ define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x flo
 ; CHECK-LABEL: combine_vpermil2ps_blend_with_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 8, i32 1, i32 2, i32 3>, i8 2)
   ret <4 x float> %res0
diff --git a/llvm/test/CodeGen/X86/vector-zmov.ll b/llvm/test/CodeGen/X86/vector-zmov.ll
index 09835d797d17..2f84723b3c08 100644
--- a/llvm/test/CodeGen/X86/vector-zmov.ll
+++ b/llvm/test/CodeGen/X86/vector-zmov.ll
@@ -63,7 +63,7 @@ define <4 x i32> @load_zmov_4i32_to_0zzz_volatile(ptr%ptr) {
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vmovaps (%rdi), %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 entry:
   %X = load volatile <4 x i32>, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index be6ee8f68995..9851fe64847d 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -313,7 +313,7 @@ define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: test18:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -332,7 +332,7 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; AVX-LABEL: test19:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %1
@@ -390,7 +390,7 @@ define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: test22:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -410,7 +410,7 @@ define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; AVX-LABEL: test23:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %1

From 53e50472ff445bb946a53aba30649ae65f3534b1 Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Thu, 12 Jun 2025 10:09:02 -0700
Subject: [PATCH 0178/1322] [Clang][NFC] Move FatbinFileName instead of copy
 (#143827)

Static analysis flagged FatbinFileName since we can move it instead of
copying it.
---
 clang/lib/Interpreter/DeviceOffload.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Interpreter/DeviceOffload.cpp b/clang/lib/Interpreter/DeviceOffload.cpp
index 05625ddedb72..9a25a264b2d5 100644
--- a/clang/lib/Interpreter/DeviceOffload.cpp
+++ b/clang/lib/Interpreter/DeviceOffload.cpp
@@ -151,7 +151,7 @@ llvm::Error IncrementalCUDADeviceParser::GenerateFatbinary() {
                    llvm::StringRef(FatbinContent.data(), FatbinContent.size()),
                    "", false));
 
-  CodeGenOpts.CudaGpuBinaryFileName = FatbinFileName;
+  CodeGenOpts.CudaGpuBinaryFileName = std::move(FatbinFileName);
 
   FatbinContent.clear();
 

From 82f19674bff578b9afd164144fd6b75d042ac932 Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Thu, 12 Jun 2025 13:11:42 -0400
Subject: [PATCH 0179/1322] [libc] Update size_t and ssize_t definitions to use
 __SIZE_TYPE__ and __PTRDIFF_TYPE__ respectively. (#143921)

The current definition of `ssize_t` does not have the same bit width as
`size_t` on 32-bit platforms.
---
 libc/include/llvm-libc-types/size_t.h  | 7 +------
 libc/include/llvm-libc-types/ssize_t.h | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/libc/include/llvm-libc-types/size_t.h b/libc/include/llvm-libc-types/size_t.h
index 3b31b0820f23..26ae68abe0ee 100644
--- a/libc/include/llvm-libc-types/size_t.h
+++ b/libc/include/llvm-libc-types/size_t.h
@@ -9,11 +9,6 @@
 #ifndef LLVM_LIBC_TYPES_SIZE_T_H
 #define LLVM_LIBC_TYPES_SIZE_T_H
 
-// Since __need_size_t is defined, we get the definition of size_t from the
-// standalone C header stddef.h. Also, because __need_size_t is defined,
-// including stddef.h will pull only the type size_t and nothing else.
-#define __need_size_t
-#include <stddef.h>
-#undef __need_size_t
+typedef __SIZE_TYPE__ size_t;
 
 #endif // LLVM_LIBC_TYPES_SIZE_T_H
diff --git a/libc/include/llvm-libc-types/ssize_t.h b/libc/include/llvm-libc-types/ssize_t.h
index 41e4b6d2c500..8f579e2749ba 100644
--- a/libc/include/llvm-libc-types/ssize_t.h
+++ b/libc/include/llvm-libc-types/ssize_t.h
@@ -9,6 +9,6 @@
 #ifndef LLVM_LIBC_TYPES_SSIZE_T_H
 #define LLVM_LIBC_TYPES_SSIZE_T_H
 
-typedef __INT64_TYPE__ ssize_t;
+typedef __PTRDIFF_TYPE__ ssize_t;
 
 #endif // LLVM_LIBC_TYPES_SSIZE_T_H

From cbc2ef0e890e6c700023fe00c7166554f2f5ad14 Mon Sep 17 00:00:00 2001
From: Dave Lee <davelee.com@gmail.com>
Date: Thu, 12 Jun 2025 10:13:30 -0700
Subject: [PATCH 0180/1322] [llvm][utils] Add synthetic provider for
 llvm::DenseSet (#143631)

Add a synthetic child provider for `DenseSet`, which is a wrapper around
`DenseMap`. This provider leverages the existing `DenseMap` provider,
reshaping its dictionary structured children into a set.
---
 llvm/utils/lldbDataFormatters.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/llvm/utils/lldbDataFormatters.py b/llvm/utils/lldbDataFormatters.py
index 988827ab4aa5..c5cd627c5314 100644
--- a/llvm/utils/lldbDataFormatters.py
+++ b/llvm/utils/lldbDataFormatters.py
@@ -3,6 +3,7 @@ LLDB Formatters for LLVM data types.
 
 Load into LLDB with 'command script import /path/to/lldbDataFormatters.py'
 """
+
 from __future__ import annotations
 
 import collections
@@ -82,6 +83,11 @@ def __lldb_init_module(debugger, internal_dict):
         f"-l {__name__}.DenseMapSynthetic "
         '-x "^llvm::DenseMap<.+>$"'
     )
+    debugger.HandleCommand(
+        "type synthetic add -w llvm "
+        f"-l {__name__}.DenseSetSynthetic "
+        '-x "^llvm::DenseSet<.+>$"'
+    )
 
     debugger.HandleCommand(
         "type synthetic add -w llvm "
@@ -372,7 +378,8 @@ class DenseMapSynthetic:
         # For each key, collect a list of buckets it appears in.
         key_buckets: dict[str, list[int]] = collections.defaultdict(list)
         for index in range(num_buckets):
-            key = buckets.GetValueForExpressionPath(f"[{index}].first")
+            bucket = buckets.GetValueForExpressionPath(f"[{index}]")
+            key = bucket.GetChildAtIndex(0)
             key_buckets[str(key.data)].append(index)
 
         # Heuristic: This is not a multi-map, any repeated (non-unique) keys are
@@ -383,6 +390,26 @@ class DenseMapSynthetic:
                 self.child_buckets.append(indexes[0])
 
 
+class DenseSetSynthetic:
+    valobj: lldb.SBValue
+    map: lldb.SBValue
+
+    def __init__(self, valobj: lldb.SBValue, _) -> None:
+        self.valobj = valobj
+
+    def num_children(self) -> int:
+        return self.map.num_children
+
+    def get_child_at_index(self, idx: int) -> lldb.SBValue:
+        map_entry = self.map.child[idx]
+        set_entry = map_entry.GetChildAtIndex(0)
+        return set_entry.Clone(f"[{idx}]")
+
+    def update(self):
+        raw_map = self.valobj.GetChildMemberWithName("TheMap")
+        self.map = raw_map.GetSyntheticValue()
+
+
 class ExpectedSynthetic:
     # The llvm::Expected<T> value.
     expected: lldb.SBValue

From eab1a1d4914a51de8383b818bf595125fb830c51 Mon Sep 17 00:00:00 2001
From: halbi2 <hehiralbi@gmail.com>
Date: Thu, 12 Jun 2025 13:15:41 -0400
Subject: [PATCH 0181/1322] [libc++][test] Improve test coverage for flat_set
 (lack of) SCARY iterators (#139649)

Missing from 5e94e26a7afb8db00cc123e5fc5471c1125596e3.
---
 .../flat.set/scary.compile.pass.cpp           | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.set/scary.compile.pass.cpp

diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.set/scary.compile.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.set/scary.compile.pass.cpp
new file mode 100644
index 000000000000..99e93fc3b08b
--- /dev/null
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.set/scary.compile.pass.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// class flat_set
+// class flat_multiset
+
+// Extension: SCARY/N2913 iterator compatibility between flat_set and flat_multiset
+// Test for the absence of this feature
+
+#include <flat_set>
+#include <type_traits>
+
+#include "test_macros.h"
+
+void test() {
+  typedef std::flat_set<int, int> M1;
+  typedef std::flat_multiset<int, int> M2;
+
+  static_assert(!std::is_convertible_v<M1::iterator, M2::iterator>);
+  static_assert(!std::is_convertible_v<M2::iterator, M1::iterator>);
+
+  static_assert(!std::is_convertible_v<M1::const_iterator, M2::const_iterator>);
+  static_assert(!std::is_convertible_v<M2::const_iterator, M1::const_iterator>);
+}

From d1ca8d891ff038ec29e67065a446aa2f2043325e Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Thu, 12 Jun 2025 13:18:30 -0400
Subject: [PATCH 0182/1322] [libc][math] Refactor expf implementation to
 header-only in src/__support/math folder. (#143790)

This is a step in preparation for:
https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450
---
 libc/shared/math.h                            |  16 ++
 libc/shared/math/expf.h                       |  23 +++
 libc/src/__support/CMakeLists.txt             |   2 +
 libc/src/__support/math/CMakeLists.txt        |  24 +++
 libc/src/__support/math/exp_float_constants.h | 145 ++++++++++++++++++
 libc/src/__support/math/expf.h                | 116 ++++++++++++++
 libc/src/math/generic/CMakeLists.txt          |  10 +-
 libc/src/math/generic/expf.cpp                |  97 +-----------
 .../llvm-project-overlay/libc/BUILD.bazel     |  39 ++++-
 .../libc/test/libc_test_rules.bzl             |   1 +
 10 files changed, 361 insertions(+), 112 deletions(-)
 create mode 100644 libc/shared/math.h
 create mode 100644 libc/shared/math/expf.h
 create mode 100644 libc/src/__support/math/CMakeLists.txt
 create mode 100644 libc/src/__support/math/exp_float_constants.h
 create mode 100644 libc/src/__support/math/expf.h

diff --git a/libc/shared/math.h b/libc/shared/math.h
new file mode 100644
index 000000000000..4ddc29c7ae83
--- /dev/null
+++ b/libc/shared/math.h
@@ -0,0 +1,16 @@
+//===-- Floating point math functions ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_H
+#define LLVM_LIBC_SHARED_MATH_H
+
+#include "libc_common.h"
+
+#include "math/expf.h"
+
+#endif // LLVM_LIBC_SHARED_MATH_H
diff --git a/libc/shared/math/expf.h b/libc/shared/math/expf.h
new file mode 100644
index 000000000000..a4e8b0751bb4
--- /dev/null
+++ b/libc/shared/math/expf.h
@@ -0,0 +1,23 @@
+//===-- Shared expf function ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_EXPF_H
+#define LLVM_LIBC_SHARED_MATH_EXPF_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/expf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::expf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_EXPF_H
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 327ff5e0c6a3..8cf2b0cdcdcc 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -390,3 +390,5 @@ add_subdirectory(HashTable)
 add_subdirectory(fixed_point)
 
 add_subdirectory(time)
+
+add_subdirectory(math)
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
new file mode 100644
index 000000000000..66c1d19a1cab
--- /dev/null
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -0,0 +1,24 @@
+add_header_library(
+  exp_float_constants
+  HDRS
+    exp_float_constants.h
+  DEPENDS
+    libc.src.__support.macros.config
+)
+
+add_header_library(
+  expf
+  HDRS
+    expf.h
+  DEPENDS
+    .exp_float_constants
+    libc.src.__support.common
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.config
+    libc.src.__support.macros.optimization
+)
diff --git a/libc/src/__support/math/exp_float_constants.h b/libc/src/__support/math/exp_float_constants.h
new file mode 100644
index 000000000000..cabb227a034b
--- /dev/null
+++ b/libc/src/__support/math/exp_float_constants.h
@@ -0,0 +1,145 @@
+//===-- Look-up tables for exp*f functions ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+// Lookup table for exp(m) with m = -104, ..., 89.
+//   -104 = floor(log(single precision's min denormal))
+//     89 = ceil(log(single precision's max normal))
+// Table is generated with Sollya as follow:
+// > display = hexadecimal;
+// > for i from -104 to 89 do { D(exp(i)); };
+static constexpr double EXP_M1[195] = {
+    0x1.f1e6b68529e33p-151, 0x1.525be4e4e601dp-149, 0x1.cbe0a45f75eb1p-148,
+    0x1.3884e838aea68p-146, 0x1.a8c1f14e2af5dp-145, 0x1.20a717e64a9bdp-143,
+    0x1.8851d84118908p-142, 0x1.0a9bdfb02d240p-140, 0x1.6a5bea046b42ep-139,
+    0x1.ec7f3b269efa8p-138, 0x1.4eafb87eab0f2p-136, 0x1.c6e2d05bbc000p-135,
+    0x1.35208867c2683p-133, 0x1.a425b317eeacdp-132, 0x1.1d8508fa8246ap-130,
+    0x1.840fbc08fdc8ap-129, 0x1.07b7112bc1ffep-127, 0x1.666d0dad2961dp-126,
+    0x1.e726c3f64d0fep-125, 0x1.4b0dc07cabf98p-123, 0x1.c1f2daf3b6a46p-122,
+    0x1.31c5957a47de2p-120, 0x1.9f96445648b9fp-119, 0x1.1a6baeadb4fd1p-117,
+    0x1.7fd974d372e45p-116, 0x1.04da4d1452919p-114, 0x1.62891f06b3450p-113,
+    0x1.e1dd273aa8a4ap-112, 0x1.4775e0840bfddp-110, 0x1.bd109d9d94bdap-109,
+    0x1.2e73f53fba844p-107, 0x1.9b138170d6bfep-106, 0x1.175af0cf60ec5p-104,
+    0x1.7baee1bffa80bp-103, 0x1.02057d1245cebp-101, 0x1.5eafffb34ba31p-100,
+    0x1.dca23bae16424p-99,  0x1.43e7fc88b8056p-97,  0x1.b83bf23a9a9ebp-96,
+    0x1.2b2b8dd05b318p-94,  0x1.969d47321e4ccp-93,  0x1.1452b7723aed2p-91,
+    0x1.778fe2497184cp-90,  0x1.fe7116182e9ccp-89,  0x1.5ae191a99585ap-87,
+    0x1.d775d87da854dp-86,  0x1.4063f8cc8bb98p-84,  0x1.b374b315f87c1p-83,
+    0x1.27ec458c65e3cp-81,  0x1.923372c67a074p-80,  0x1.1152eaeb73c08p-78,
+    0x1.737c5645114b5p-77,  0x1.f8e6c24b5592ep-76,  0x1.571db733a9d61p-74,
+    0x1.d257d547e083fp-73,  0x1.3ce9b9de78f85p-71,  0x1.aebabae3a41b5p-70,
+    0x1.24b6031b49bdap-68,  0x1.8dd5e1bb09d7ep-67,  0x1.0e5b73d1ff53dp-65,
+    0x1.6f741de1748ecp-64,  0x1.f36bd37f42f3ep-63,  0x1.536452ee2f75cp-61,
+    0x1.cd480a1b74820p-60,  0x1.39792499b1a24p-58,  0x1.aa0de4bf35b38p-57,
+    0x1.2188ad6ae3303p-55,  0x1.898471fca6055p-54,  0x1.0b6c3afdde064p-52,
+    0x1.6b7719a59f0e0p-51,  0x1.ee001eed62aa0p-50,  0x1.4fb547c775da8p-48,
+    0x1.c8464f7616468p-47,  0x1.36121e24d3bbap-45,  0x1.a56e0c2ac7f75p-44,
+    0x1.1e642baeb84a0p-42,  0x1.853f01d6d53bap-41,  0x1.0885298767e9ap-39,
+    0x1.67852a7007e42p-38,  0x1.e8a37a45fc32ep-37,  0x1.4c1078fe9228ap-35,
+    0x1.c3527e433fab1p-34,  0x1.32b48bf117da2p-32,  0x1.a0db0d0ddb3ecp-31,
+    0x1.1b48655f37267p-29,  0x1.81056ff2c5772p-28,  0x1.05a628c699fa1p-26,
+    0x1.639e3175a689dp-25,  0x1.e355bbaee85cbp-24,  0x1.4875ca227ec38p-22,
+    0x1.be6c6fdb01612p-21,  0x1.2f6053b981d98p-19,  0x1.9c54c3b43bc8bp-18,
+    0x1.18354238f6764p-16,  0x1.7cd79b5647c9bp-15,  0x1.02cf22526545ap-13,
+    0x1.5fc21041027adp-12,  0x1.de16b9c24a98fp-11,  0x1.44e51f113d4d6p-9,
+    0x1.b993fe00d5376p-8,   0x1.2c155b8213cf4p-6,   0x1.97db0ccceb0afp-5,
+    0x1.152aaa3bf81ccp-3,   0x1.78b56362cef38p-2,   0x1.0000000000000p+0,
+    0x1.5bf0a8b145769p+1,   0x1.d8e64b8d4ddaep+2,   0x1.415e5bf6fb106p+4,
+    0x1.b4c902e273a58p+5,   0x1.28d389970338fp+7,   0x1.936dc5690c08fp+8,
+    0x1.122885aaeddaap+10,  0x1.749ea7d470c6ep+11,  0x1.fa7157c470f82p+12,
+    0x1.5829dcf950560p+14,  0x1.d3c4488ee4f7fp+15,  0x1.3de1654d37c9ap+17,
+    0x1.b00b5916ac955p+18,  0x1.259ac48bf05d7p+20,  0x1.8f0ccafad2a87p+21,
+    0x1.0f2ebd0a80020p+23,  0x1.709348c0ea4f9p+24,  0x1.f4f22091940bdp+25,
+    0x1.546d8f9ed26e1p+27,  0x1.ceb088b68e804p+28,  0x1.3a6e1fd9eecfdp+30,
+    0x1.ab5adb9c43600p+31,  0x1.226af33b1fdc1p+33,  0x1.8ab7fb5475fb7p+34,
+    0x1.0c3d3920962c9p+36,  0x1.6c932696a6b5dp+37,  0x1.ef822f7f6731dp+38,
+    0x1.50bba3796379ap+40,  0x1.c9aae4631c056p+41,  0x1.370470aec28edp+43,
+    0x1.a6b765d8cdf6dp+44,  0x1.1f43fcc4b662cp+46,  0x1.866f34a725782p+47,
+    0x1.0953e2f3a1ef7p+49,  0x1.689e221bc8d5bp+50,  0x1.ea215a1d20d76p+51,
+    0x1.4d13fbb1a001ap+53,  0x1.c4b334617cc67p+54,  0x1.33a43d282a519p+56,
+    0x1.a220d397972ebp+57,  0x1.1c25c88df6862p+59,  0x1.8232558201159p+60,
+    0x1.0672a3c9eb871p+62,  0x1.64b41c6d37832p+63,  0x1.e4cf766fe49bep+64,
+    0x1.49767bc0483e3p+66,  0x1.bfc951eb8bb76p+67,  0x1.304d6aeca254bp+69,
+    0x1.9d97010884251p+70,  0x1.19103e4080b45p+72,  0x1.7e013cd114461p+73,
+    0x1.03996528e074cp+75,  0x1.60d4f6fdac731p+76,  0x1.df8c5af17ba3bp+77,
+    0x1.45e3076d61699p+79,  0x1.baed16a6e0da7p+80,  0x1.2cffdfebde1a1p+82,
+    0x1.9919cabefcb69p+83,  0x1.160345c9953e3p+85,  0x1.79dbc9dc53c66p+86,
+    0x1.00c810d464097p+88,  0x1.5d009394c5c27p+89,  0x1.da57de8f107a8p+90,
+    0x1.425982cf597cdp+92,  0x1.b61e5ca3a5e31p+93,  0x1.29bb825dfcf87p+95,
+    0x1.94a90db0d6fe2p+96,  0x1.12fec759586fdp+98,  0x1.75c1dc469e3afp+99,
+    0x1.fbfd219c43b04p+100, 0x1.5936d44e1a146p+102, 0x1.d531d8a7ee79cp+103,
+    0x1.3ed9d24a2d51bp+105, 0x1.b15cfe5b6e17bp+106, 0x1.268038c2c0e00p+108,
+    0x1.9044a73545d48p+109, 0x1.1002ab6218b38p+111, 0x1.71b3540cbf921p+112,
+    0x1.f6799ea9c414ap+113, 0x1.55779b984f3ebp+115, 0x1.d01a210c44aa4p+116,
+    0x1.3b63da8e91210p+118, 0x1.aca8d6b0116b8p+119, 0x1.234de9e0c74e9p+121,
+    0x1.8bec7503ca477p+122, 0x1.0d0eda9796b90p+124, 0x1.6db0118477245p+125,
+    0x1.f1056dc7bf22dp+126, 0x1.51c2cc3433801p+128, 0x1.cb108ffbec164p+129,
+};
+
+// Lookup table for exp(m * 2^(-7)) with m = 0, ..., 127.
+// Table is generated with Sollya as follow:
+// > display = hexadecimal;
+// > for i from 0 to 127 do { D(exp(i / 128)); };
+static constexpr double EXP_M2[128] = {
+    0x1.0000000000000p0, 0x1.0202015600446p0, 0x1.04080ab55de39p0,
+    0x1.06122436410ddp0, 0x1.08205601127edp0, 0x1.0a32a84e9c1f6p0,
+    0x1.0c49236829e8cp0, 0x1.0e63cfa7ab09dp0, 0x1.1082b577d34edp0,
+    0x1.12a5dd543ccc5p0, 0x1.14cd4fc989cd6p0, 0x1.16f9157587069p0,
+    0x1.192937074e0cdp0, 0x1.1b5dbd3f68122p0, 0x1.1d96b0eff0e79p0,
+    0x1.1fd41afcba45ep0, 0x1.2216045b6f5cdp0, 0x1.245c7613b8a9bp0,
+    0x1.26a7793f60164p0, 0x1.28f7170a755fdp0, 0x1.2b4b58b372c79p0,
+    0x1.2da4478b620c7p0, 0x1.3001ecf601af7p0, 0x1.32645269ea829p0,
+    0x1.34cb8170b5835p0, 0x1.373783a722012p0, 0x1.39a862bd3c106p0,
+    0x1.3c1e2876834aap0, 0x1.3e98deaa11dccp0, 0x1.41188f42c3e32p0,
+    0x1.439d443f5f159p0, 0x1.462707b2bac21p0, 0x1.48b5e3c3e8186p0,
+    0x1.4b49e2ae5ac67p0, 0x1.4de30ec211e60p0, 0x1.50817263c13cdp0,
+    0x1.5325180cfacf7p0, 0x1.55ce0a4c58c7cp0, 0x1.587c53c5a7af0p0,
+    0x1.5b2fff3210fd9p0, 0x1.5de9176045ff5p0, 0x1.60a7a734ab0e8p0,
+    0x1.636bb9a983258p0, 0x1.663559cf1bc7cp0, 0x1.690492cbf9433p0,
+    0x1.6bd96fdd034a2p0, 0x1.6eb3fc55b1e76p0, 0x1.719443a03acb9p0,
+    0x1.747a513dbef6ap0, 0x1.776630c678bc1p0, 0x1.7a57ede9ea23ep0,
+    0x1.7d4f946f0ba8dp0, 0x1.804d30347b546p0, 0x1.8350cd30ac390p0,
+    0x1.865a7772164c5p0, 0x1.896a3b1f66a0ep0, 0x1.8c802477b0010p0,
+    0x1.8f9c3fd29beafp0, 0x1.92be99a09bf00p0, 0x1.95e73e6b1b75ep0,
+    0x1.99163ad4b1dccp0, 0x1.9c4b9b995509bp0, 0x1.9f876d8e8c566p0,
+    0x1.a2c9bda3a3e78p0, 0x1.a61298e1e069cp0, 0x1.a9620c6cb3374p0,
+    0x1.acb82581eee54p0, 0x1.b014f179fc3b8p0, 0x1.b3787dc80f95fp0,
+    0x1.b6e2d7fa5eb18p0, 0x1.ba540dba56e56p0, 0x1.bdcc2cccd3c85p0,
+    0x1.c14b431256446p0, 0x1.c4d15e873c193p0, 0x1.c85e8d43f7cd0p0,
+    0x1.cbf2dd7d490f2p0, 0x1.cf8e5d84758a9p0, 0x1.d3311bc7822b4p0,
+    0x1.d6db26d16cd67p0, 0x1.da8c8d4a66969p0, 0x1.de455df80e3c0p0,
+    0x1.e205a7bdab73ep0, 0x1.e5cd799c6a54ep0, 0x1.e99ce2b397649p0,
+    0x1.ed73f240dc142p0, 0x1.f152b7a07bb76p0, 0x1.f539424d90f5ep0,
+    0x1.f927a1e24bb76p0, 0x1.fd1de6182f8c9p0, 0x1.008e0f64294abp1,
+    0x1.02912df5ce72ap1, 0x1.049856cd84339p1, 0x1.06a39207f0a09p1,
+    0x1.08b2e7d2035cfp1, 0x1.0ac6606916501p1, 0x1.0cde041b0e9aep1,
+    0x1.0ef9db467dcf8p1, 0x1.1119ee5ac36b6p1, 0x1.133e45d82e952p1,
+    0x1.1566ea50201d7p1, 0x1.1793e4652cc50p1, 0x1.19c53ccb3fc6bp1,
+    0x1.1bfafc47bda73p1, 0x1.1e352bb1a74adp1, 0x1.2073d3f1bd518p1,
+    0x1.22b6fe02a3b9cp1, 0x1.24feb2f105cb8p1, 0x1.274afbdbba4a6p1,
+    0x1.299be1f3e7f1cp1, 0x1.2bf16e7d2a38cp1, 0x1.2e4baacdb6614p1,
+    0x1.30aaa04e80d05p1, 0x1.330e587b62b28p1, 0x1.3576dce33feadp1,
+    0x1.37e437282d4eep1, 0x1.3a5670ff972edp1, 0x1.3ccd9432682b4p1,
+    0x1.3f49aa9d30590p1, 0x1.41cabe304cb34p1, 0x1.4450d8f00edd4p1,
+    0x1.46dc04f4e5338p1, 0x1.496c4c6b832dap1, 0x1.4c01b9950a111p1,
+    0x1.4e9c56c731f5dp1, 0x1.513c2e6c731d7p1, 0x1.53e14b042f9cap1,
+    0x1.568bb722dd593p1, 0x1.593b7d72305bbp1,
+};
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H
diff --git a/libc/src/__support/math/expf.h b/libc/src/__support/math/expf.h
new file mode 100644
index 000000000000..88c151492a04
--- /dev/null
+++ b/libc/src/__support/math/expf.h
@@ -0,0 +1,116 @@
+//===-- Implementation header for expf --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXPF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXPF_H
+
+#include "exp_float_constants.h" // Lookup tables EXP_M1 and EXP_M2.
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+static constexpr float expf(float x) {
+  using FPBits = typename fputil::FPBits<float>;
+  FPBits xbits(x);
+
+  uint32_t x_u = xbits.uintval();
+  uint32_t x_abs = x_u & 0x7fff'ffffU;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  // Exceptional values
+  if (LIBC_UNLIKELY(x_u == 0xc236'bd8cU)) { // x = -0x1.6d7b18p+5f
+    return 0x1.108a58p-66f - x * 0x1.0p-95f;
+  }
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  // When |x| >= 89, |x| < 2^-25, or x is nan
+  if (LIBC_UNLIKELY(x_abs >= 0x42b2'0000U || x_abs <= 0x3280'0000U)) {
+    // |x| < 2^-25
+    if (xbits.get_biased_exponent() <= 101) {
+      return 1.0f + x;
+    }
+
+    // When x < log(2^-150) or nan
+    if (xbits.uintval() >= 0xc2cf'f1b5U) {
+      // exp(-Inf) = 0
+      if (xbits.is_inf())
+        return 0.0f;
+      // exp(nan) = nan
+      if (xbits.is_nan())
+        return x;
+      if (fputil::fenv_is_round_up())
+        return FPBits::min_subnormal().get_val();
+      fputil::set_errno_if_required(ERANGE);
+      fputil::raise_except_if_required(FE_UNDERFLOW);
+      return 0.0f;
+    }
+    // x >= 89 or nan
+    if (xbits.is_pos() && (xbits.uintval() >= 0x42b2'0000)) {
+      // x is finite
+      if (xbits.uintval() < 0x7f80'0000U) {
+        int rounding = fputil::quick_get_round();
+        if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
+          return FPBits::max_normal().get_val();
+
+        fputil::set_errno_if_required(ERANGE);
+        fputil::raise_except_if_required(FE_OVERFLOW);
+      }
+      // x is +inf or nan
+      return x + FPBits::inf().get_val();
+    }
+  }
+  // For -104 < x < 89, to compute exp(x), we perform the following range
+  // reduction: find hi, mid, lo such that:
+  //   x = hi + mid + lo, in which
+  //     hi is an integer,
+  //     mid * 2^7 is an integer
+  //     -2^(-8) <= lo < 2^-8.
+  // In particular,
+  //   hi + mid = round(x * 2^7) * 2^(-7).
+  // Then,
+  //   exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo).
+  // We store exp(hi) and exp(mid) in the lookup tables EXP_M1 and EXP_M2
+  // respectively.  exp(lo) is computed using a degree-4 minimax polynomial
+  // generated by Sollya.
+
+  // x_hi = (hi + mid) * 2^7 = round(x * 2^7).
+  float kf = fputil::nearest_integer(x * 0x1.0p7f);
+  // Subtract (hi + mid) from x to get lo.
+  double xd = static_cast<double>(fputil::multiply_add(kf, -0x1.0p-7f, x));
+  int x_hi = static_cast<int>(kf);
+  x_hi += 104 << 7;
+  // hi = x_hi >> 7
+  double exp_hi = EXP_M1[x_hi >> 7];
+  // mid * 2^7 = x_hi & 0x0000'007fU;
+  double exp_mid = EXP_M2[x_hi & 0x7f];
+  // Degree-4 minimax polynomial generated by Sollya with the following
+  // commands:
+  //   > display = hexadecimal;
+  //   > Q = fpminimax(expm1(x)/x, 3, [|D...|], [-2^-8, 2^-8]);
+  //   > Q;
+  double exp_lo =
+      fputil::polyeval(xd, 0x1p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1,
+                       0x1.555566668e5e7p-3, 0x1.55555555ef243p-5);
+  return static_cast<float>(exp_hi * exp_mid * exp_lo);
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXPF_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index db3ef8886b52..fd1e6c0d648a 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1321,15 +1321,7 @@ add_entrypoint_object(
   HDRS
     ../expf.h
   DEPENDS
-    .common_constants
-    libc.src.__support.FPUtil.basic_operations
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.FPUtil.polyeval
-    libc.src.__support.FPUtil.rounding_mode
-    libc.src.__support.macros.optimization
+    libc.src.__support.math.expf
     libc.src.errno.errno
 )
 
diff --git a/libc/src/math/generic/expf.cpp b/libc/src/math/generic/expf.cpp
index fa507d4d9322..de11f51ac64a 100644
--- a/libc/src/math/generic/expf.cpp
+++ b/libc/src/math/generic/expf.cpp
@@ -7,103 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/expf.h"
-#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
-#include "src/__support/FPUtil/BasicOperations.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/math/expf.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(float, expf, (float x)) {
-  using FPBits = typename fputil::FPBits<float>;
-  FPBits xbits(x);
-
-  uint32_t x_u = xbits.uintval();
-  uint32_t x_abs = x_u & 0x7fff'ffffU;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  // Exceptional values
-  if (LIBC_UNLIKELY(x_u == 0xc236'bd8cU)) { // x = -0x1.6d7b18p+5f
-    return 0x1.108a58p-66f - x * 0x1.0p-95f;
-  }
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-  // When |x| >= 89, |x| < 2^-25, or x is nan
-  if (LIBC_UNLIKELY(x_abs >= 0x42b2'0000U || x_abs <= 0x3280'0000U)) {
-    // |x| < 2^-25
-    if (xbits.get_biased_exponent() <= 101) {
-      return 1.0f + x;
-    }
-
-    // When x < log(2^-150) or nan
-    if (xbits.uintval() >= 0xc2cf'f1b5U) {
-      // exp(-Inf) = 0
-      if (xbits.is_inf())
-        return 0.0f;
-      // exp(nan) = nan
-      if (xbits.is_nan())
-        return x;
-      if (fputil::fenv_is_round_up())
-        return FPBits::min_subnormal().get_val();
-      fputil::set_errno_if_required(ERANGE);
-      fputil::raise_except_if_required(FE_UNDERFLOW);
-      return 0.0f;
-    }
-    // x >= 89 or nan
-    if (xbits.is_pos() && (xbits.uintval() >= 0x42b2'0000)) {
-      // x is finite
-      if (xbits.uintval() < 0x7f80'0000U) {
-        int rounding = fputil::quick_get_round();
-        if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-          return FPBits::max_normal().get_val();
-
-        fputil::set_errno_if_required(ERANGE);
-        fputil::raise_except_if_required(FE_OVERFLOW);
-      }
-      // x is +inf or nan
-      return x + FPBits::inf().get_val();
-    }
-  }
-  // For -104 < x < 89, to compute exp(x), we perform the following range
-  // reduction: find hi, mid, lo such that:
-  //   x = hi + mid + lo, in which
-  //     hi is an integer,
-  //     mid * 2^7 is an integer
-  //     -2^(-8) <= lo < 2^-8.
-  // In particular,
-  //   hi + mid = round(x * 2^7) * 2^(-7).
-  // Then,
-  //   exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo).
-  // We store exp(hi) and exp(mid) in the lookup tables EXP_M1 and EXP_M2
-  // respectively.  exp(lo) is computed using a degree-4 minimax polynomial
-  // generated by Sollya.
-
-  // x_hi = (hi + mid) * 2^7 = round(x * 2^7).
-  float kf = fputil::nearest_integer(x * 0x1.0p7f);
-  // Subtract (hi + mid) from x to get lo.
-  double xd = static_cast<double>(fputil::multiply_add(kf, -0x1.0p-7f, x));
-  int x_hi = static_cast<int>(kf);
-  x_hi += 104 << 7;
-  // hi = x_hi >> 7
-  double exp_hi = EXP_M1[x_hi >> 7];
-  // mid * 2^7 = x_hi & 0x0000'007fU;
-  double exp_mid = EXP_M2[x_hi & 0x7f];
-  // Degree-4 minimax polynomial generated by Sollya with the following
-  // commands:
-  //   > display = hexadecimal;
-  //   > Q = fpminimax(expm1(x)/x, 3, [|D...|], [-2^-8, 2^-8]);
-  //   > Q;
-  double exp_lo =
-      fputil::polyeval(xd, 0x1p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1,
-                       0x1.555566668e5e7p-3, 0x1.55555555ef243p-5);
-  return static_cast<float>(exp_hi * exp_mid * exp_lo);
-}
+LLVM_LIBC_FUNCTION(float, expf, (float x)) { return math::expf(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 123d9ccc8310..0cedad285924 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1616,13 +1616,18 @@ libc_header_library(
 
 ############################### errno ########################################
 
+libc_support_library(
+    name = "__support_libc_errno",
+    hdrs = ["src/__support/libc_errno.h"],
+)
+
 libc_support_library(
     name = "errno",
     srcs = ["src/errno/libc_errno.cpp"],
-    hdrs = ["src/__support/libc_errno.h"],
     deps = [
         ":__support_common",
         ":__support_cpp_atomic",
+        ":__support_libc_errno",
         ":__support_macros_attributes",
         ":__support_macros_properties_architectures",
         ":hdr_errno_macros",
@@ -1973,6 +1978,29 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_math_exp_float_constants",
+    hdrs = ["src/__support/math/exp_float_constants.h"],
+)
+
+libc_support_library(
+    name = "__support_math_expf",
+    hdrs = ["src/__support/math/expf.h"],
+    deps = [
+        ":__support_common",
+        ":__support_fputil_fenv_impl",
+        ":__support_fputil_fp_bits",
+        ":__support_fputil_multiply_add",
+        ":__support_fputil_nearest_integer",
+        ":__support_fputil_polyeval",
+        ":__support_fputil_rounding_mode",
+        ":__support_libc_errno",
+        ":__support_macros_config",
+        ":__support_macros_optimization",
+        ":__support_math_exp_float_constants",
+    ],
+)
+
 ############################### complex targets ################################
 
 libc_function(
@@ -2570,13 +2598,8 @@ libc_math_function(
 libc_math_function(
     name = "expf",
     additional_deps = [
-        ":__support_fputil_fma",
-        ":__support_fputil_multiply_add",
-        ":__support_fputil_nearest_integer",
-        ":__support_fputil_polyeval",
-        ":__support_fputil_rounding_mode",
-        ":__support_macros_optimization",
-        ":common_constants",
+        ":__support_math_expf",
+        ":errno",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl
index 123e05727aef..ba8753a17a85 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl
@@ -31,6 +31,7 @@ def libc_test(name, copts = [], deps = [], local_defines = [], **kwargs):
         deps = [
             "//libc/test/UnitTest:LibcUnitTest",
             "//libc:__support_macros_config",
+            "//libc:__support_libc_errno",
             "//libc:errno",
             "//libc:func_aligned_alloc",
             "//libc:func_free",

From 6311f039b2678f0a1367a88679efb7b2e37949dc Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Thu, 12 Jun 2025 17:34:45 +0000
Subject: [PATCH 0183/1322] [libc] Build fixes for widechar characterconverter
 (#143805)

Build fixes for wchar CharacterConverter class
---
 libc/hdr/CMakeLists.txt                |  2 ++
 libc/hdr/types/CMakeLists.txt          | 22 ++++++++++++++++++++++
 libc/include/llvm-libc-types/char8_t.h |  3 +--
 libc/src/__support/CMakeLists.txt      |  2 ++
 libc/src/__support/wchar/mbstate.h     |  1 +
 libc/src/__support/wchar/utf_ret.h     |  5 ++++-
 6 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 1e9f59621a8e..052a773a4fce 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -221,6 +221,8 @@ add_proxy_header_library(
 
 add_header_library(wchar_overlay HDRS wchar_overlay.h)
 
+add_header_library(uchar_overlay HDRS uchar_overlay.h)
+
 add_proxy_header_library(
   wchar_macros
   HDRS
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index 5f6197c93d44..c88c35700907 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -1,3 +1,25 @@
+add_proxy_header_library(
+  char8_t 
+  HDRS
+    char8_t.h
+  DEPENDS
+    libc.hdr.uchar_overlay
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.char8_t
+    libc.include.uchar
+)
+
+add_proxy_header_library(
+  char32_t 
+  HDRS
+    char32_t.h
+  DEPENDS
+    libc.hdr.uchar_overlay
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.char32_t
+    libc.include.uchar
+)
+
 add_proxy_header_library(
   div_t
   HDRS
diff --git a/libc/include/llvm-libc-types/char8_t.h b/libc/include/llvm-libc-types/char8_t.h
index ddadab1afa21..a343be77d810 100644
--- a/libc/include/llvm-libc-types/char8_t.h
+++ b/libc/include/llvm-libc-types/char8_t.h
@@ -9,8 +9,7 @@
 #ifndef LLVM_LIBC_TYPES_CHAR8_T_H
 #define LLVM_LIBC_TYPES_CHAR8_T_H
 
-#if !defined(__cplusplus) && defined(__STDC_VERSION__) &&                      \
-    __STDC_VERSION__ >= 202311L
+#if !(defined(__cplusplus) && defined(__cpp_char8_t))
 typedef unsigned char char8_t;
 #endif
 
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 8cf2b0cdcdcc..309cde76370f 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -391,4 +391,6 @@ add_subdirectory(fixed_point)
 
 add_subdirectory(time)
 
+add_subdirectory(wchar)
+
 add_subdirectory(math)
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index 72ec72756000..cb8950374de4 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 
 #include "hdr/types/char32_t.h"
+#include "src/__support/common.h"
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
index b8a8f6f09414..fa99b76159bd 100644
--- a/libc/src/__support/wchar/utf_ret.h
+++ b/libc/src/__support/wchar/utf_ret.h
@@ -9,13 +9,16 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 #define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 
-namespace LIBC_NAMESPACE_DECL {
+#include "src/__support/common.h"
 
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
 template <typename T> struct utf_ret {
   T out;
   int error;
 };
 
+} // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H

From 9208b343e962b9f1140ee345c0050a3920bdcbf2 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Thu, 12 Jun 2025 13:38:13 -0400
Subject: [PATCH 0184/1322]  [PowerPC] frontend get target feature from backend
 with cpu name (#137670)

1. The PR proceeds with a backend target hook to allow front-ends to
determine what target features are available in a compilation based on
the CPU name.
2. Fix a backend target feature bug that supports HTM for
Power8/9/10/11. However, HTM is only supported on Power8/9 according to
the ISA.
3. All target features that are hardcoded in PPC.cpp can be retrieved
from the backend target feature. I have double-checked that the
hardcoded logic for inferring target features from the CPU in the
frontend(PPC.cpp) is the same as in PPC.td.
---
 clang/lib/Basic/Targets/PPC.cpp               | 148 +-----------------
 .../cxx11-thread-local-reference.cpp          |   2 +-
 .../Driver/aix-shared-lib-tls-model-opt.c     |   7 +-
 .../Driver/aix-small-local-exec-dynamic-tls.c |  39 +++--
 clang/test/Driver/ppc-crbits.cpp              |   4 -
 clang/test/Driver/ppc-isa-features.cpp        |  22 +--
 .../llvm/TargetParser/PPCTargetParser.h       |   6 +
 llvm/include/llvm/TargetParser/TargetParser.h |  27 ++++
 llvm/lib/Target/PowerPC/PPC.td                |   4 +-
 llvm/lib/TargetParser/CMakeLists.txt          |   8 +
 llvm/lib/TargetParser/PPCTargetParser.cpp     |  25 +++
 llvm/lib/TargetParser/TargetParser.cpp        |  47 ++++++
 llvm/utils/TableGen/SubtargetEmitter.cpp      |  50 ++++--
 13 files changed, 191 insertions(+), 198 deletions(-)

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index e6ef0ecc526b..77145e2891a8 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -15,6 +15,7 @@
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/TargetParser/PPCTargetParser.h"
+#include <optional>
 
 using namespace clang;
 using namespace clang::targets;
@@ -516,129 +517,14 @@ static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags,
 bool PPCTargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeaturesVec) const {
-  Features["altivec"] = llvm::StringSwitch<bool>(CPU)
-                            .Case("7400", true)
-                            .Case("g4", true)
-                            .Case("7450", true)
-                            .Case("g4+", true)
-                            .Case("970", true)
-                            .Case("g5", true)
-                            .Case("pwr6", true)
-                            .Case("pwr7", true)
-                            .Case("pwr8", true)
-                            .Case("pwr9", true)
-                            .Case("ppc64", true)
-                            .Case("ppc64le", true)
-                            .Default(false);
 
-  Features["power9-vector"] = (CPU == "pwr9");
-  Features["crypto"] = llvm::StringSwitch<bool>(CPU)
-                           .Case("ppc64le", true)
-                           .Case("pwr9", true)
-                           .Case("pwr8", true)
-                           .Default(false);
-  Features["power8-vector"] = llvm::StringSwitch<bool>(CPU)
-                                  .Case("ppc64le", true)
-                                  .Case("pwr9", true)
-                                  .Case("pwr8", true)
-                                  .Default(false);
-  Features["bpermd"] = llvm::StringSwitch<bool>(CPU)
-                           .Case("ppc64le", true)
-                           .Case("pwr9", true)
-                           .Case("pwr8", true)
-                           .Case("pwr7", true)
-                           .Default(false);
-  Features["extdiv"] = llvm::StringSwitch<bool>(CPU)
-                           .Case("ppc64le", true)
-                           .Case("pwr9", true)
-                           .Case("pwr8", true)
-                           .Case("pwr7", true)
-                           .Default(false);
-  Features["direct-move"] = llvm::StringSwitch<bool>(CPU)
-                                .Case("ppc64le", true)
-                                .Case("pwr9", true)
-                                .Case("pwr8", true)
-                                .Default(false);
-  Features["crbits"] = llvm::StringSwitch<bool>(CPU)
-                                .Case("ppc64le", true)
-                                .Case("pwr9", true)
-                                .Case("pwr8", true)
-                                .Default(false);
-  Features["vsx"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("ppc64le", true)
-                        .Case("pwr9", true)
-                        .Case("pwr8", true)
-                        .Case("pwr7", true)
-                        .Default(false);
-  Features["htm"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("ppc64le", true)
-                        .Case("pwr9", true)
-                        .Case("pwr8", true)
-                        .Default(false);
+  const llvm::Triple &TheTriple = getTriple();
 
-  // ROP Protect is off by default.
-  Features["rop-protect"] = false;
-  // Privileged instructions are off by default.
-  Features["privileged"] = false;
-
-  if (getTriple().isOSAIX()) {
-    // The code generated by the -maix-small-local-[exec|dynamic]-tls option is
-    // turned off by default.
-    Features["aix-small-local-exec-tls"] = false;
-    Features["aix-small-local-dynamic-tls"] = false;
-
-    // Turn off TLS model opt by default.
-    Features["aix-shared-lib-tls-model-opt"] = false;
-  }
-
-  Features["spe"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("8548", true)
-                        .Case("e500", true)
-                        .Default(false);
-
-  Features["isa-v206-instructions"] = llvm::StringSwitch<bool>(CPU)
-                                          .Case("ppc64le", true)
-                                          .Case("pwr9", true)
-                                          .Case("pwr8", true)
-                                          .Case("pwr7", true)
-                                          .Case("a2", true)
-                                          .Default(false);
-
-  Features["isa-v207-instructions"] = llvm::StringSwitch<bool>(CPU)
-                                          .Case("ppc64le", true)
-                                          .Case("pwr9", true)
-                                          .Case("pwr8", true)
-                                          .Default(false);
-
-  Features["isa-v30-instructions"] =
-      llvm::StringSwitch<bool>(CPU).Case("pwr9", true).Default(false);
-
-  Features["quadword-atomics"] =
-      getTriple().isArch64Bit() && llvm::StringSwitch<bool>(CPU)
-                                       .Case("pwr9", true)
-                                       .Case("pwr8", true)
-                                       .Default(false);
-
-  // Power10 includes all the same features as Power9 plus any features specific
-  // to the Power10 core.
-  if (CPU == "pwr10" || CPU == "power10") {
-    initFeatureMap(Features, Diags, "pwr9", FeaturesVec);
-    addP10SpecificFeatures(Features);
-  }
-
-  // Power11 includes all the same features as Power10 plus any features
-  // specific to the Power11 core.
-  if (CPU == "pwr11" || CPU == "power11") {
-    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
-    addP11SpecificFeatures(Features);
-  }
-
-  // Future CPU should include all of the features of Power 11 as well as any
-  // additional features (yet to be determined) specific to it.
-  if (CPU == "future") {
-    initFeatureMap(Features, Diags, "pwr11", FeaturesVec);
-    addFutureSpecificFeatures(Features);
-  }
+  std::optional<llvm::StringMap<bool>> FeaturesOpt =
+      llvm::PPC::getPPCDefaultTargetFeatures(TheTriple,
+                                             llvm::PPC::normalizeCPUName(CPU));
+  if (FeaturesOpt)
+    Features = FeaturesOpt.value();
 
   if (!ppcUserFeaturesCheck(Diags, FeaturesVec))
     return false;
@@ -700,26 +586,6 @@ bool PPCTargetInfo::initFeatureMap(
   return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
 }
 
-// Add any Power10 specific features.
-void PPCTargetInfo::addP10SpecificFeatures(
-    llvm::StringMap<bool> &Features) const {
-  Features["htm"] = false; // HTM was removed for P10.
-  Features["paired-vector-memops"] = true;
-  Features["mma"] = true;
-  Features["power10-vector"] = true;
-  Features["pcrelative-memops"] = true;
-  Features["prefix-instrs"] = true;
-  Features["isa-v31-instructions"] = true;
-}
-
-// Add any Power11 specific features.
-void PPCTargetInfo::addP11SpecificFeatures(
-    llvm::StringMap<bool> &Features) const {}
-
-// Add features specific to the "Future" CPU.
-void PPCTargetInfo::addFutureSpecificFeatures(
-    llvm::StringMap<bool> &Features) const {}
-
 bool PPCTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Case("powerpc", true)
diff --git a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
index cd5a18f39060..a0e76e8a9a0b 100644
--- a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
+++ b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
@@ -35,5 +35,5 @@ int &g() { return r; }
 // DARWIN-LABEL: define internal cxx_fast_tlscc void @__tls_init()
 // CHECK: call void @[[R_INIT]]()
 
-// LINUX_AIX: attributes [[ATTR0]] = { {{.*}}"target-features"{{.*}} }
+// LINUX_AIX: attributes [[ATTR0]] = { {{.*}} }
 // DARWIN: attributes [[ATTR1]] = { {{.*}}nounwind{{.*}}"target-features"{{.*}}  }
diff --git a/clang/test/Driver/aix-shared-lib-tls-model-opt.c b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
index 7acf091f0a04..891caf4ed3fc 100644
--- a/clang/test/Driver/aix-shared-lib-tls-model-opt.c
+++ b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
@@ -1,5 +1,5 @@
-// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
-// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
+// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
+// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 // RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 
@@ -19,9 +19,8 @@ int test(void) {
 
 // CHECK-AIX: test() #0 {
 // CHECK-AIX: attributes #0 = {
-// CHECK-AIX-OFF-SAME: -aix-shared-lib-tls-model-opt
 // CHECK-AIX-ON-SAME: +aix-shared-lib-tls-model-opt
 
-// CHECK-LINUX-NOT: {{[-+]aix-shared-lib-tls-model-opt}}
+// CHECK-LINUX-NOT: {{[+]aix-shared-lib-tls-model-opt}}
 
 // CHECK-UNSUPPORTED-TARGET: option '-maix-shared-lib-tls-model-opt' cannot be specified on this target
diff --git a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
index 1a0619b58e89..6fc2b8efb4ae 100644
--- a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
+++ b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
@@ -1,37 +1,37 @@
-// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
-// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
-// RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
-// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
+// RUN: %clang --target=powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang --target=powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang --target=powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang --target=powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
 
-// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
+// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALEXEC_TLS
 
-// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
+// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALDYNAMIC_TLS
 
-// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
-// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
@@ -39,10 +39,9 @@ int test(void) {
   return 0;
 }
 
-// CHECK-AIX-DEFAULT: test() #0 {
-// CHECK-AIX-DEFAULT: attributes #0 = {
-// CHECK-AIX-DEFAULT-SAME: {{-aix-small-local-exec-tls,.*-aix-small-local-dynamic-tls|-aix-small-local-dynamic-tls,.*-aix-small-local-exec-tls}}
-// CHECK-LINUX-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
+// CHECK-DEFAULT: test() #0 {
+// CHECK-DEFAULT: attributes #0 = {
+// CHECK-DEFAULT-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
 
 // CHECK-UNSUPPORTED-AIX32: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
 // CHECK-UNSUPPORTED-LINUX: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
diff --git a/clang/test/Driver/ppc-crbits.cpp b/clang/test/Driver/ppc-crbits.cpp
index 3ed56308cb52..62893d3d0e87 100644
--- a/clang/test/Driver/ppc-crbits.cpp
+++ b/clang/test/Driver/ppc-crbits.cpp
@@ -64,8 +64,6 @@
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
-// RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -emit-llvm \
-// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mno-crbits \
@@ -92,8 +90,6 @@
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
-// RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -emit-llvm \
-// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mno-crbits \
diff --git a/clang/test/Driver/ppc-isa-features.cpp b/clang/test/Driver/ppc-isa-features.cpp
index 92c5bc82f72b..35dbfbcdf569 100644
--- a/clang/test/Driver/ppc-isa-features.cpp
+++ b/clang/test/Driver/ppc-isa-features.cpp
@@ -5,20 +5,20 @@
 // RUN: %clang -target powerpc64-unknown-aix -mcpu=pwr9 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR9
 // RUN: %clang -target powerpc-unknown-aix -mcpu=pwr10 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR10
 
-// CHECK-PWR6: -isa-v206-instructions
-// CHECK-PWR6: -isa-v207-instructions
-// CHECK-PWR6: -isa-v30-instructions
+// CHECK-PWR6-NOT: isa-v206-instructions
+// CHECK-PWR6-NOT: isa-v207-instructions
+// CHECK-PWR6-NOT: isa-v30-instructions
 
-// CHECK-A2: +isa-v206-instructions
-// CHECK-A2: -isa-v207-instructions
-// CHECK-A2: -isa-v30-instructions
+// CHECK-A2:     +isa-v206-instructions
+// CHECK-A2-NOT: isa-v207-instructions
+// CHECK-A2-NOT: isa-v30-instructions
 
-// CHECK-PWR7: +isa-v206-instructions
-// CHECK-PWR7: -isa-v207-instructions
-// CHECK-PWR7: -isa-v30-instructions
+// CHECK-PWR7:     +isa-v206-instructions
+// CHECK-PWR7-NOT: isa-v207-instructions
+// CHECK-PWR7-NOT: isa-v30-instructions
 
-// CHECK-PWR8: +isa-v207-instructions
-// CHECK-PWR8: -isa-v30-instructions
+// CHECK-PWR8:     +isa-v207-instructions
+// CHECK-PWR8-NOT: isa-v30-instructions
 
 // CHECK-PWR9: +isa-v207-instructions
 // CHECK-PWR9: +isa-v30-instructions
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h
index 59d9f867005a..d3d44afb5f54 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.h
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TARGETPARSER_PPCTARGETPARSER_H
 #define LLVM_TARGETPARSER_PPCTARGETPARSER_H
 
+#include "TargetParser.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
@@ -37,6 +39,10 @@ LLVM_ABI StringRef getNormalizedPPCTuneCPU(const Triple &T,
 // For PPC, there are some cpu names for same CPU, like pwr10 and power10,
 // normalize them.
 LLVM_ABI StringRef normalizeCPUName(StringRef CPUName);
+
+LLVM_ABI std::optional<llvm::StringMap<bool>>
+getPPCDefaultTargetFeatures(const Triple &T, StringRef CPUName);
+
 } // namespace PPC
 } // namespace llvm
 
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index 176205e17ae0..b4a92cc6b6c4 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TARGETPARSER_TARGETPARSER_H
 #define LLVM_TARGETPARSER_TARGETPARSER_H
 
+#include "SubtargetFeature.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
@@ -190,6 +192,31 @@ insertWaveSizeFeature(StringRef GPU, const Triple &T,
                       StringMap<bool> &Features);
 
 } // namespace AMDGPU
+
+struct BasicSubtargetFeatureKV {
+  const char *Key;         ///< K-V key string
+  unsigned Value;          ///< K-V integer value
+  FeatureBitArray Implies; ///< K-V bit mask
+};
+
+/// Used to provide key value pairs for feature and CPU bit flags.
+struct BasicSubtargetSubTypeKV {
+  const char *Key;         ///< K-V key string
+  FeatureBitArray Implies; ///< K-V bit mask
+
+  /// Compare routine for std::lower_bound
+  bool operator<(StringRef S) const { return StringRef(Key) < S; }
+
+  /// Compare routine for std::is_sorted.
+  bool operator<(const BasicSubtargetSubTypeKV &Other) const {
+    return StringRef(Key) < StringRef(Other.Key);
+  }
+};
+
+std::optional<llvm::StringMap<bool>>
+getCPUDefaultTargetFeatures(StringRef CPU,
+                            ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
+                            ArrayRef<BasicSubtargetFeatureKV> ProcFeatures);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index fd850faf7b2f..ea7c2203662b 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -411,7 +411,6 @@ def ProcessorFeatures {
      FeatureP8Altivec,
      FeatureP8Vector,
      FeatureP8Crypto,
-     FeatureHTM,
      FeatureDirectMove,
      FeatureICBT,
      FeaturePartwordAtomic,
@@ -422,6 +421,7 @@ def ProcessorFeatures {
     ];
 
   list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion,
+                                               FeatureHTM,
                                                FeatureAddisLoadFusion];
   list<SubtargetFeature> P8InheritableFeatures =
     !listconcat(P7InheritableFeatures, P8AdditionalFeatures);
@@ -443,7 +443,7 @@ def ProcessorFeatures {
   // dispatch for vector operations than scalar ones. For the time being,
   // this list also includes scheduling-related features since we do not have
   // enough info to create custom scheduling strategies for future CPUs.
-  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits];
+  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits, FeatureHTM];
   list<SubtargetFeature> P9InheritableFeatures =
     !listconcat(P8InheritableFeatures, P9AdditionalFeatures);
   list<SubtargetFeature> P9Features =
diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt
index 8f8b3a578a1d..66aed45ff18c 100644
--- a/llvm/lib/TargetParser/CMakeLists.txt
+++ b/llvm/lib/TargetParser/CMakeLists.txt
@@ -8,6 +8,12 @@ if (HAS_WERROR_GLOBAL_CTORS AND NOT LLVM_HAS_NOGLOBAL_CTOR_MUTEX)
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=global-constructors")
 endif()
 
+set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC/PPC.td)
+
+tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget -I${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC)
+add_public_tablegen_target(PPCGenSubtargetInfo)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
 # Solaris code uses kstat, so specify dependency explicitly for shared builds.
 if (${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
   set(system_libs kstat)
@@ -41,3 +47,5 @@ add_llvm_component_library(LLVMTargetParser
   DEPENDS
   target_parser_gen
   )
+
+add_dependencies(LLVMTargetParser PPCGenSubtargetInfo)
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index 422d758c772e..1b637b27be3d 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -15,6 +15,10 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/TargetParser/Host.h"
 
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETFEATURES_KV
+#include "PPCGenSubtargetInfo.inc"
+
 namespace llvm {
 namespace PPC {
 
@@ -117,5 +121,26 @@ StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName) {
   return getNormalizedPPCTargetCPU(T, CPUName);
 }
 
+std::optional<StringMap<bool>> getPPCDefaultTargetFeatures(const Triple &T,
+                                                           StringRef CPU) {
+  std::optional<StringMap<bool>> FeaturesOpt =
+      getCPUDefaultTargetFeatures(CPU, BasicPPCSubTypeKV, BasicPPCFeatureKV);
+
+  if (!FeaturesOpt.has_value())
+    return std::nullopt;
+
+  StringMap<bool> Features = FeaturesOpt.value();
+  // FIXME: We need to check for the processor model 8548, since the backend
+  // does not support this processor. When this processor model is implemented
+  // within the backend, the following code can be removed.
+  if (CPU == "8548")
+    Features["spe"] = true;
+
+  // The target feature `quadword-atomics` is only supported for 64-bit
+  // POWER8 and above.
+  if (Features.find("quadword-atomics") != Features.end() && !T.isArch64Bit())
+    Features["quadword-atomics"] = false;
+  return Features;
+}
 } // namespace PPC
 } // namespace llvm
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 7c54901dae47..03f7d3899c2e 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -18,6 +18,53 @@
 using namespace llvm;
 using namespace AMDGPU;
 
+/// Find KV in array using binary search.
+static const BasicSubtargetSubTypeKV *
+find(StringRef S, ArrayRef<BasicSubtargetSubTypeKV> A) {
+  // Binary search the array
+  auto F = llvm::lower_bound(A, S);
+  // If not found then return NULL
+  if (F == A.end() || StringRef(F->Key) != S)
+    return nullptr;
+  // Return the found array item
+  return F;
+}
+
+/// For each feature that is (transitively) implied by this feature, set it.
+static void setImpliedBits(FeatureBitset &Bits, const FeatureBitset &Implies,
+                           ArrayRef<BasicSubtargetFeatureKV> FeatureTable) {
+  // OR the Implies bits in outside the loop. This allows the Implies for CPUs
+  // which might imply features not in FeatureTable to use this.
+  Bits |= Implies;
+  for (const auto &FE : FeatureTable)
+    if (Implies.test(FE.Value))
+      setImpliedBits(Bits, FE.Implies.getAsBitset(), FeatureTable);
+}
+
+std::optional<llvm::StringMap<bool>> llvm::getCPUDefaultTargetFeatures(
+    StringRef CPU, ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
+    ArrayRef<BasicSubtargetFeatureKV> ProcFeatures) {
+  if (CPU.empty())
+    return std::nullopt;
+
+  const BasicSubtargetSubTypeKV *CPUEntry = ::find(CPU, ProcDesc);
+  if (!CPUEntry)
+    return std::nullopt;
+
+  // Set the features implied by this CPU feature if there is a match.
+  FeatureBitset Bits;
+  llvm::StringMap<bool> DefaultFeatures;
+  setImpliedBits(Bits, CPUEntry->Implies.getAsBitset(), ProcFeatures);
+
+  unsigned BitSize = Bits.size();
+  for (const BasicSubtargetFeatureKV &FE : ProcFeatures) {
+    assert(FE.Value < BitSize && "Target Feature is out of range");
+    if (Bits[FE.Value])
+      DefaultFeatures[FE.Key] = true;
+  }
+  return DefaultFeatures;
+}
+
 namespace {
 
 struct GPUInfo {
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index ca008e256a70..da41e981888a 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -89,8 +89,10 @@ class SubtargetEmitter {
 
   FeatureMapTy enumeration(raw_ostream &OS);
   void emitSubtargetInfoMacroCalls(raw_ostream &OS);
-  unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
-  unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
+  unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap,
+                            bool IsEmitBasic = false);
+  unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap,
+                        bool IsEmitBasic = false);
   unsigned cpuNames(raw_ostream &OS);
   void formItineraryStageString(const std::string &Names,
                                 const Record *ItinData, std::string &ItinString,
@@ -254,7 +256,8 @@ void SubtargetEmitter::emitSubtargetInfoMacroCalls(raw_ostream &OS) {
 // command line.
 //
 unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
-                                            const FeatureMapTy &FeatureMap) {
+                                            const FeatureMapTy &FeatureMap,
+                                            bool IsEmitBasic) {
   std::vector<const Record *> FeatureList =
       Records.getAllDerivedDefinitions("SubtargetFeature");
 
@@ -270,7 +273,8 @@ unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
 
   // Begin feature table.
   OS << "// Sorted (by key) array of values for CPU features.\n"
-     << "extern const llvm::SubtargetFeatureKV " << Target
+     << "extern const llvm::" << (IsEmitBasic ? "Basic" : "")
+     << "SubtargetFeatureKV " << (IsEmitBasic ? "Basic" : "") << Target
      << "FeatureKV[] = {\n";
 
   for (const Record *Feature : FeatureList) {
@@ -281,9 +285,11 @@ unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
 
     // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in
     // } }
-    OS << "  { "
-       << "\"" << CommandLineName << "\", "
-       << "\"" << Desc << "\", " << Target << "::" << Name << ", ";
+    OS << "  { " << "\"" << CommandLineName << "\", ";
+    if (!IsEmitBasic)
+      OS << "\"" << Desc << "\", ";
+
+    OS << Target << "::" << Name << ", ";
 
     ConstRecVec ImpliesList = Feature->getValueAsListOfDefs("Implies");
 
@@ -361,7 +367,8 @@ static void checkDuplicateCPUFeatures(StringRef CPUName,
 // line.
 //
 unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
-                                        const FeatureMapTy &FeatureMap) {
+                                        const FeatureMapTy &FeatureMap,
+                                        bool IsEmitBasic) {
   // Gather and sort processor information
   std::vector<const Record *> ProcessorList =
       Records.getAllDerivedDefinitions("Processor");
@@ -374,7 +381,8 @@ unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
 
   // Begin processor table.
   OS << "// Sorted (by key) array of values for CPU subtype.\n"
-     << "extern const llvm::SubtargetSubTypeKV " << Target
+     << "extern const llvm::" << (IsEmitBasic ? "Basic" : "")
+     << "SubtargetSubTypeKV " << (IsEmitBasic ? "Basic" : "") << Target
      << "SubTypeKV[] = {\n";
 
   for (const Record *Processor : ProcessorList) {
@@ -392,13 +400,17 @@ unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
        << "\"" << Name << "\", ";
 
     printFeatureMask(OS, FeatureList, FeatureMap);
-    OS << ", ";
-    printFeatureMask(OS, TuneFeatureList, FeatureMap);
 
-    // Emit the scheduler model pointer.
-    const std::string &ProcModelName =
-        SchedModels.getModelForProc(Processor).ModelName;
-    OS << ", &" << ProcModelName << " },\n";
+    if (!IsEmitBasic) {
+      OS << ", ";
+      printFeatureMask(OS, TuneFeatureList, FeatureMap);
+
+      // Emit the scheduler model pointer.
+      const std::string &ProcModelName =
+          SchedModels.getModelForProc(Processor).ModelName;
+      OS << ", &" << ProcModelName;
+    }
+    OS << " },\n";
   }
 
   // End processor table.
@@ -2040,6 +2052,14 @@ void SubtargetEmitter::run(raw_ostream &OS) {
   OS << "} // end namespace llvm\n\n";
   OS << "#endif // GET_SUBTARGETINFO_ENUM\n\n";
 
+  OS << "\n#ifdef GET_SUBTARGETFEATURES_KV\n";
+  OS << "#undef GET_SUBTARGETFEATURES_KV\n\n";
+  OS << "namespace llvm {\n";
+  featureKeyValues(OS, FeatureMap, true);
+  cpuKeyValues(OS, FeatureMap, true);
+  OS << "} // end namespace llvm\n\n";
+  OS << "#endif // GET_SUBTARGETFEATURES_KV\n\n";
+
   emitSubtargetInfoMacroCalls(OS);
 
   OS << "namespace llvm {\n";

From 06dad352dba16fd9afa89be7abf9bb46f7552b48 Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Thu, 12 Jun 2025 10:39:53 -0700
Subject: [PATCH 0185/1322] Revert "[lldb][headers] Create Python script to fix
 up framework headers" (#143941)

Reverts llvm/llvm-project#142051
---
 lldb/cmake/modules/LLDBFramework.cmake        |  42 +++---
 lldb/scripts/framework-header-fix.py          | 126 ------------------
 .../Shell/Scripts/Inputs/Main/SBAddress.h     |  13 --
 .../Shell/Scripts/Inputs/RPC/RPCSBAddress.h   |   9 --
 .../Shell/Scripts/TestFrameworkFixScript.test |  11 --
 .../Scripts/TestFrameworkFixUnifdef.test      |  12 --
 .../Scripts/TestRPCFrameworkFixScript.test    |  14 --
 7 files changed, 21 insertions(+), 206 deletions(-)
 delete mode 100755 lldb/scripts/framework-header-fix.py
 delete mode 100644 lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
 delete mode 100644 lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
 delete mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixScript.test
 delete mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
 delete mode 100644 lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test

diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake
index 70010ffbf738..8961b1afe93a 100644
--- a/lldb/cmake/modules/LLDBFramework.cmake
+++ b/lldb/cmake/modules/LLDBFramework.cmake
@@ -68,17 +68,24 @@ if(NOT APPLE_EMBEDDED)
   )
 endif()
 
+# At configuration time, collect headers for the framework bundle and copy them
+# into a staging directory. Later we can copy over the entire folder.
+file(GLOB public_headers ${LLDB_SOURCE_DIR}/include/lldb/API/*.h)
+set(generated_public_headers ${LLDB_OBJ_DIR}/include/lldb/API/SBLanguages.h)
+file(GLOB root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-*.h)
+file(GLOB root_private_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-private*.h)
+list(REMOVE_ITEM root_public_headers ${root_private_headers})
+
 find_program(unifdef_EXECUTABLE unifdef)
 
-# All necessary header files will be staged in the include directory in the build directory,
-# so just copy the files from there into the framework's staging directory.
-set(lldb_build_dir_header_staging "${CMAKE_BINARY_DIR}/include/lldb")
-set(lldb_framework_header_staging "${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders")
-file(GLOB lldb_build_dir_header_staging_list ${lldb_build_dir_header_staging}/*)
-foreach(header ${lldb_build_dir_header_staging_list})
+set(lldb_header_staging ${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders)
+foreach(header
+    ${public_headers}
+    ${generated_public_headers}
+    ${root_public_headers})
 
   get_filename_component(basename ${header} NAME)
-  set(staged_header ${lldb_framework_header_staging}/${basename})
+  set(staged_header ${lldb_header_staging}/${basename})
 
   if(unifdef_EXECUTABLE)
     # unifdef returns 0 when the file is unchanged and 1 if something was changed.
@@ -105,20 +112,13 @@ set_target_properties(liblldb-resource-headers PROPERTIES FOLDER "LLDB/Resources
 add_dependencies(liblldb-resource-headers liblldb-header-staging)
 add_dependencies(liblldb liblldb-resource-headers)
 
-# Take the headers from the staging directory and fix up their includes for the framework.
-# Then write them to the output directory.
-# Also, run unifdef to remove any specified guards from the header files.
-file(GLOB lldb_framework_header_staging_list ${lldb_framework_header_staging}/*)
-foreach(header ${lldb_framework_header_staging_list})
-
-  set(input_header ${header})
-  set(output_header $<TARGET_FILE_DIR:liblldb>/Headers/${input_header})
-
-  add_custom_command(TARGET liblldb POST_BUILD
-    COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.py -f lldb_main -i ${input_header} -o ${output_header} -p ${unifdef_EXECUTABLE} USWIG
-    COMMENT "LLDB.framework: Fix up and copy framework headers"
-  )
-endforeach()
+# At build time, copy the staged headers into the framework bundle (and do
+# some post-processing in-place).
+add_custom_command(TARGET liblldb POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy_directory ${lldb_header_staging} $<TARGET_FILE_DIR:liblldb>/Headers
+  COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.sh $<TARGET_FILE_DIR:liblldb>/Headers ${LLDB_VERSION}
+  COMMENT "LLDB.framework: copy framework headers"
+)
 
 # Copy vendor-specific headers from clang (without staging).
 if(NOT APPLE_EMBEDDED)
diff --git a/lldb/scripts/framework-header-fix.py b/lldb/scripts/framework-header-fix.py
deleted file mode 100755
index 9e4e5f860a2c..000000000000
--- a/lldb/scripts/framework-header-fix.py
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Usage: <path/to/input-directory> <path/to/output-directory>
-
-This script is used when building LLDB.framework or LLDBRPC.framework. For each framework, local includes are converted to their respective framework includes.
-
-This script is used in 2 ways:
-1. It is used on header files that are copied into LLDB.framework. For these files, local LLDB includes are converted into framework includes, e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>.
-
-2. It is used on header files for LLDBRPC.framework. For these files, includes of RPC common files will be converted to framework includes, e.g. #include <lldb-rpc/common/RPCCommon.h> -> #include <LLDBRPC/RPCCommon.h>. It will also change local includes to framework includes, e.g. #include "SBAddress.h" -> #include <LLDBRPC/SBAddress.h>
-"""
-
-import argparse
-import os
-import re
-import shutil
-import subprocess
-import sys
-
-# Main header regexes
-INCLUDE_FILENAME_REGEX = re.compile(
-    r'#include "lldb/API/(?P<include_filename>.*){0,1}"'
-)
-
-# RPC header regexes
-RPC_COMMON_REGEX = re.compile(r"#include <lldb-rpc/common/(?P<include_filename>.*)>")
-RPC_INCLUDE_FILENAME_REGEX = re.compile(r'#include "(?P<include_filename>.*)"')
-
-
-def modify_rpc_includes(input_file_path, output_file_path):
-    with open(input_file_path, "r") as input_file:
-        lines = input_file.readlines()
-        file_buffer = "".join(lines)
-        with open(output_file_path, "w") as output_file:
-            # Local includes must be changed to RPC framework level includes.
-            # e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
-            # Also, RPC common code includes must change to RPC framework level includes.
-            # e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
-            rpc_common_matches = RPC_COMMON_REGEX.finditer(file_buffer)
-            rpc_include_filename_matches = RPC_INCLUDE_FILENAME_REGEX.finditer(
-                file_buffer
-            )
-            for match in rpc_common_matches:
-                file_buffer = re.sub(
-                    match.group(),
-                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
-                    file_buffer,
-                )
-            for match in rpc_include_filename_matches:
-                file_buffer = re.sub(
-                    match.group(),
-                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
-                    file_buffer,
-                )
-            output_file.write(file_buffer)
-
-
-def modify_main_includes(input_file_path, output_file_path):
-    with open(input_file_path, "r") as input_file:
-        lines = input_file.readlines()
-        file_buffer = "".join(lines)
-        with open(output_file_path, "w") as output_file:
-            # Local includes must be changed to framework level includes.
-            # e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
-            regex_matches = INCLUDE_FILENAME_REGEX.finditer(file_buffer)
-            for match in regex_matches:
-                file_buffer = re.sub(
-                    match.group(),
-                    r"#include <LLDB/" + match.group("include_filename") + ">",
-                    file_buffer,
-                )
-                output_file.write(file_buffer)
-
-
-def remove_guards(output_file_path, unifdef_path, unifdef_guards):
-    # The unifdef path should be passed in from CMake. If it wasn't there in CMake or is incorrect,
-    # find it using shutil. If shutil can't find it, then exit.
-    if not shutil.which(unifdef_path):
-        unifdef_path = shutil.which("unifdef")
-    if not unifdef_path:
-        print(
-            "Unable to find unifdef executable. Guards will not be removed from input files. Exiting..."
-        )
-        sys.exit(1)
-
-    subprocess_command = (
-        [unifdef_path, "-o", output_file_path] + unifdef_guards + [output_file_path]
-    )
-    subprocess.run(subprocess_command)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-f", "--framework", choices=["lldb_main", "lldb_rpc"])
-    parser.add_argument("-i", "--input_file")
-    parser.add_argument("-o", "--output_file")
-    parser.add_argument("-p", "--unifdef_path")
-    parser.add_argument(
-        "unifdef_guards",
-        nargs="+",
-        type=str,
-        help="Guards to be removed with unifdef. These must be specified in the same way as they would be when passed directly into unifdef.",
-    )
-    args = parser.parse_args()
-    input_file_path = str(args.input_file)
-    output_file_path = str(args.output_file)
-    framework_version = args.framework
-    unifdef_path = str(args.unifdef_path)
-    # Prepend dashes to the list of guards passed in from the command line.
-    # unifdef takes the guards to remove as arguments in their own right (e.g. -USWIG)
-    # but passing them in with dashes for this script causes argparse to think that they're
-    # arguments in and of themself, so they need to passed in without dashes.
-    unifdef_guards = ["-" + guard for guard in args.unifdef_guards]
-
-    if framework_version == "lldb_main":
-        modify_main_includes(input_file_path, output_file_path)
-    if framework_version == "lldb_rpc":
-        modify_rpc_includes(input_file_path, output_file_path)
-    # After the incldues have been modified, run unifdef on the headers to remove any guards
-    # specified at the command line.
-    remove_guards(output_file_path, unifdef_path, unifdef_guards)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
deleted file mode 100644
index fecc69687cd7..000000000000
--- a/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
+++ /dev/null
@@ -1,13 +0,0 @@
-// This is a truncated version of an SB API file
-// used to test framework-header-fix.py to make sure the includes are correctly fixed
-// up for the LLDB.framework.
-
-// Local includes must be changed to framework level includes.
-// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
-#include "lldb/API/SBDefines.h"
-#include "lldb/API/SBModule.h"
-
-// Any include guards specified at the command line must be removed.
-#ifndef SWIG
-int a = 10
-#endif
diff --git a/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
deleted file mode 100644
index 556afa38a922..000000000000
--- a/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
+++ /dev/null
@@ -1,9 +0,0 @@
-// This is a truncated version of an SB API file generated by lldb-rpc-gen
-// used to test framework-header-fix.py to make sure the includes are correctly fixed
-// up for the LLDBRPC.framework.
-
-// Local includes must be changed to framework level includes.
-// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
-#include "LLDBRPC.h"
-#include "SBDefines.h"
-#include <lldb-rpc/common/RPCPublic.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
deleted file mode 100644
index e90c3bdfc5ad..000000000000
--- a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
+++ /dev/null
@@ -1,11 +0,0 @@
-# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
-RUN: mkdir -p %t/Outputs
-RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
-
-# Check the output
-RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
-
-# Local includes must be changed to framework level includes.
-# e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
-CHECK: #include <LLDB/SBDefines.h>
-CHECK: #include <LLDB/SBModule.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
deleted file mode 100644
index a7e82d2f3640..000000000000
--- a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
+++ /dev/null
@@ -1,12 +0,0 @@
-# REQUIRES: system-darwin
-# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
-RUN: mkdir -p %t/Outputs
-RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
-
-# Check the output
-RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
-
-# Any include guards specified at the command line must be removed.
-CHECK-NOT: #ifndef SWIG
-CHECK: int a = 10
-CHECK-NOT: #endif
diff --git a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
deleted file mode 100644
index 8ba03a8c2afa..000000000000
--- a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
+++ /dev/null
@@ -1,14 +0,0 @@
-# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
-RUN: mkdir -p %t/Outputs
-RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_rpc -i %p/Inputs/Main/RPCSBAddress.h -o %t/Outputs/RPCSBAddress.h -p /usr/bin/unifdef USWIG
-
-# Check the output
-RUN: cat %t/Outputs/RPCSBAddress.h | FileCheck %s
-
-# Local includes must be changed to RPC framework level includes.
-# e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
-# Also, RPC common code includes must change to RPC framework level includes.
-# e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
-CHECK: #include <LLDBRPC/RPCPublic.h>
-CHECK: #include <LLDBRPC/SBDefines.h>
-CHECK: #include <LLDBRPC/LLDBRPC.h>

From 4e765b7a6b93b5d82e90f9a112b3eca4f873f005 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 18:40:26 +0100
Subject: [PATCH 0186/1322] [x86] dpbusd_i4.ll - regenerate VPTERNLOGD asm
 comment

---
 llvm/test/CodeGen/X86/dpbusd_i4.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll
index a212f99680ef..9fbac111ee16 100644
--- a/llvm/test/CodeGen/X86/dpbusd_i4.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll
@@ -54,7 +54,7 @@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) {
 ; CHECK-NEXT:    vpsllw $4, %xmm1, %xmm1
 ; CHECK-NEXT:    vpsrlw $4, %xmm1, %xmm1
 ; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; CHECK-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & m32bcst)
 ; CHECK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2

From 5a6a4b6ba6945363bf366a885103a4adca11b5ef Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Thu, 12 Jun 2025 10:45:47 -0700
Subject: [PATCH 0187/1322] [libc] Implement perror (#143624)

The perror function writes an error message directly to stderr. This
patch adds an implementation, tests, and header generation details.
---
 libc/config/linux/aarch64/entrypoints.txt |  1 +
 libc/config/linux/riscv/entrypoints.txt   |  1 +
 libc/config/linux/x86_64/entrypoints.txt  |  1 +
 libc/include/stdio.yaml                   |  6 ++
 libc/src/stdio/CMakeLists.txt             |  1 +
 libc/src/stdio/generic/CMakeLists.txt     | 15 +++++
 libc/src/stdio/generic/perror.cpp         | 81 +++++++++++++++++++++++
 libc/src/stdio/perror.h                   | 20 ++++++
 libc/test/src/stdio/CMakeLists.txt        | 12 ++++
 libc/test/src/stdio/perror_test.cpp       | 32 +++++++++
 10 files changed, 170 insertions(+)
 create mode 100644 libc/src/stdio/generic/perror.cpp
 create mode 100644 libc/src/stdio/perror.h
 create mode 100644 libc/test/src/stdio/perror_test.cpp

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index fcf1278eae72..9e042cd4a8ac 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -972,6 +972,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.getc_unlocked
     libc.src.stdio.getchar
     libc.src.stdio.getchar_unlocked
+    libc.src.stdio.perror
     libc.src.stdio.putc
     libc.src.stdio.putchar
     libc.src.stdio.puts
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 050fc2672a57..db8f8a7cf0b7 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -1098,6 +1098,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.getc_unlocked
     libc.src.stdio.getchar
     libc.src.stdio.getchar_unlocked
+    libc.src.stdio.perror
     libc.src.stdio.putc
     libc.src.stdio.putchar
     libc.src.stdio.puts
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 6c9d83708b92..c993ef8303a5 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1116,6 +1116,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.getc_unlocked
     libc.src.stdio.getchar
     libc.src.stdio.getchar_unlocked
+    libc.src.stdio.perror
     libc.src.stdio.putc
     libc.src.stdio.putchar
     libc.src.stdio.puts
diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml
index 3d5164fa10ff..2a0c56370998 100644
--- a/libc/include/stdio.yaml
+++ b/libc/include/stdio.yaml
@@ -249,6 +249,12 @@ functions:
       - POSIX
     return_type: int
     arguments: []
+  - name: perror
+    standards:
+      - stdc
+    return_type: void
+    arguments:
+      - type: const char *
   - name: printf
     standards:
       - stdc
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index 63f6ed8a11f1..b0a6ef1e291b 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -221,6 +221,7 @@ add_stdio_entrypoint_object(fopen)
 add_stdio_entrypoint_object(fclose)
 add_stdio_entrypoint_object(fread_unlocked)
 add_stdio_entrypoint_object(fread)
+add_stdio_entrypoint_object(perror)
 add_stdio_entrypoint_object(puts)
 add_stdio_entrypoint_object(fputs)
 add_stdio_entrypoint_object(fwrite_unlocked)
diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt
index e1f4ed5c1949..6361822b6199 100644
--- a/libc/src/stdio/generic/CMakeLists.txt
+++ b/libc/src/stdio/generic/CMakeLists.txt
@@ -206,6 +206,21 @@ add_generic_entrypoint_object(
     libc.src.__support.File.platform_file
 )
 
+add_generic_entrypoint_object(
+  perror
+  SRCS
+    perror.cpp
+  HDRS
+    ../perror.h
+  DEPENDS
+    libc.src.errno.errno
+    libc.src.__support.StringUtil.error_to_string
+    libc.src.__support.CPP.string_view
+    libc.src.__support.File.file
+    libc.src.__support.File.platform_file
+    libc.src.__support.File.platform_stderr
+)
+
 add_generic_entrypoint_object(
   fputs
   SRCS
diff --git a/libc/src/stdio/generic/perror.cpp b/libc/src/stdio/generic/perror.cpp
new file mode 100644
index 000000000000..68b4ad644caa
--- /dev/null
+++ b/libc/src/stdio/generic/perror.cpp
@@ -0,0 +1,81 @@
+//===-- Implementation of perror ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/perror.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/File/file.h"
+#include "src/__support/StringUtil/error_to_string.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+static int write_out(cpp::string_view str_view, File *f) {
+  if (str_view.size() > 0) {
+    auto result = f->write_unlocked(str_view.data(), str_view.size());
+    if (result.has_error())
+      return result.error;
+  }
+  return 0;
+}
+
+// separate function so that we can return early on error but still get the
+// unlock. This function sets errno and should not be called elsewhere.
+static void write_sequence(cpp::string_view str_view,
+                           cpp::string_view err_str) {
+  int write_err;
+  // TODO: this seems like there should be some sort of queue system to
+  // deduplicate this code.
+
+  // FORMAT:
+  // if str != nullptr and doesn't start with a null byte:
+  //   "[str]: [strerror(errno)]\n"
+  // else
+  //   "[strerror(errno)]\n"
+  if (str_view.size() > 0) {
+    write_err = write_out(str_view, LIBC_NAMESPACE::stderr);
+    if (write_err != 0) {
+      libc_errno = write_err;
+      return;
+    }
+
+    write_err = write_out(": ", LIBC_NAMESPACE::stderr);
+    if (write_err != 0) {
+      libc_errno = write_err;
+      return;
+    }
+  }
+
+  write_err = write_out(err_str, LIBC_NAMESPACE::stderr);
+  if (write_err != 0) {
+    libc_errno = write_err;
+    return;
+  }
+
+  write_err = write_out("\n", LIBC_NAMESPACE::stderr);
+  if (write_err != 0) {
+    libc_errno = write_err;
+    return;
+  }
+}
+
+LLVM_LIBC_FUNCTION(void, perror, (const char *str)) {
+  const char empty_str[1] = {'\0'};
+  if (str == nullptr)
+    str = empty_str;
+  cpp::string_view str_view(str);
+
+  cpp::string_view err_str = get_error_string(libc_errno);
+
+  // We need to lock the stream to ensure the newline is always appended.
+  LIBC_NAMESPACE::stderr->lock();
+  write_sequence(str_view, err_str);
+  LIBC_NAMESPACE::stderr->unlock();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/perror.h b/libc/src/stdio/perror.h
new file mode 100644
index 000000000000..bf8d0af1df5d
--- /dev/null
+++ b/libc/src/stdio/perror.h
@@ -0,0 +1,20 @@
+//===-- Implementation header of perror -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PERROR_H
+#define LLVM_LIBC_SRC_STDIO_PERROR_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+void perror(const char *s);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PERROR_H
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 01904a30504e..ce2171f19597 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -357,6 +357,18 @@ add_libc_test(
     libc.src.stdio.puts
 )
 
+add_libc_test(
+  perror_test
+  HERMETIC_TEST_ONLY # writes to libc's stderr
+  SUITE
+    libc_stdio_unittests
+  SRCS
+    perror_test.cpp
+  DEPENDS
+    libc.src.stdio.perror
+    libc.src.errno.errno
+)
+
 add_libc_test(
   fputs_test
   HERMETIC_TEST_ONLY # writes to libc's stdout and stderr
diff --git a/libc/test/src/stdio/perror_test.cpp b/libc/test/src/stdio/perror_test.cpp
new file mode 100644
index 000000000000..9a97be2eff21
--- /dev/null
+++ b/libc/test/src/stdio/perror_test.cpp
@@ -0,0 +1,32 @@
+//===-- Unittests for perror ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/perror.h"
+
+#include "src/__support/libc_errno.h"
+#include "test/UnitTest/Test.h"
+
+// The standard says perror prints directly to stderr and returns nothing. This
+// makes it rather difficult to test automatically.
+
+// TODO: figure out redirecting stderr so this test can check correctness.
+TEST(LlvmLibcPerrorTest, PrintOut) {
+  LIBC_NAMESPACE::libc_errno = 0;
+  constexpr char simple[] = "A simple string";
+  LIBC_NAMESPACE::perror(simple);
+
+  // stick to stdc errno values, specifically 0, EDOM, ERANGE, and EILSEQ.
+  LIBC_NAMESPACE::libc_errno = EDOM;
+  LIBC_NAMESPACE::perror("Print this and an error");
+
+  LIBC_NAMESPACE::libc_errno = EILSEQ;
+  LIBC_NAMESPACE::perror("\0 shouldn't print this.");
+
+  LIBC_NAMESPACE::libc_errno = ERANGE;
+  LIBC_NAMESPACE::perror(nullptr);
+}

From f94950db89a905309ec9ea2245889df88ffd0690 Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Thu, 12 Jun 2025 18:04:26 +0000
Subject: [PATCH 0188/1322] [libc] Changed mbstate struct (#143942)

Changed the mbstate variable from bits processed to bytes processed and
implemented isComplete().

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 libc/src/__support/wchar/character_converter.cpp | 4 +++-
 libc/src/__support/wchar/mbstate.h               | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3cdb8ca83b7f..f09c7815a6cc 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -18,7 +18,9 @@ namespace internal {
 
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
-bool CharacterConverter::isComplete() {}
+bool CharacterConverter::isComplete() {
+  return state->bytes_processed == state->total_bytes;
+}
 
 int CharacterConverter::push(char8_t utf8_byte) {}
 
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index cb8950374de4..d33ee354a544 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -18,7 +18,7 @@ namespace internal {
 
 struct mbstate {
   char32_t partial;
-  uint8_t bits_processed;
+  uint8_t bytes_processed;
   uint8_t total_bytes;
 };
 

From fd88aef21bae75b4641472badeb2abe3757872ac Mon Sep 17 00:00:00 2001
From: Qinkun Bao <qinkun@google.com>
Date: Thu, 12 Jun 2025 14:08:36 -0400
Subject: [PATCH 0189/1322] [Doc][NFC] Fix Sanitizer Ignore list example
 errors. (#143755)

---
 clang/docs/SanitizerSpecialCaseList.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/docs/SanitizerSpecialCaseList.rst b/clang/docs/SanitizerSpecialCaseList.rst
index 61b6c55d8e6e..2c50778d0f49 100644
--- a/clang/docs/SanitizerSpecialCaseList.rst
+++ b/clang/docs/SanitizerSpecialCaseList.rst
@@ -109,13 +109,13 @@ precedence. Here are a few examples.
 .. code-block:: bash
 
   $ cat ignorelist1.txt
-  # test.cc will be instrumented.
+  # test.cc will not be instrumented.
   src:*
   src:*/mylib/*=sanitize
   src:*/mylib/test.cc
 
   $ cat ignorelist2.txt
-  # test.cc will not be instrumented.
+  # test.cc will be instrumented.
   src:*
   src:*/mylib/test.cc
   src:*/mylib/*=sanitize

From 639e811434d2c21b9161fe9955acdea28ce33c7b Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Thu, 12 Jun 2025 20:10:05 +0200
Subject: [PATCH 0190/1322] [CIR][NFC] Fix an unused variable warning (#143933)

This fixes a warning where a variable assigned in 'if' statement wasn't
referenced again.
---
 clang/lib/CIR/CodeGen/CIRGenCall.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index da754e0806b2..67c6a8dd3ef5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -171,7 +171,7 @@ void CIRGenFunction::emitDelegateCallArg(CallArgList &args,
 
   QualType type = param->getType();
 
-  if (const auto *rd = type->getAsCXXRecordDecl()) {
+  if (type->getAsCXXRecordDecl()) {
     cgm.errorNYI(param->getSourceRange(),
                  "emitDelegateCallArg: record argument");
     return;

From 4a58a63280a673142fc674db1fb668b7bae00420 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Thu, 12 Jun 2025 19:26:51 +0100
Subject: [PATCH 0191/1322] [mlir][linalg] Remove the
 `test-linalg-to-vector-patterns` option (#142116)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch removes the `test-linalg-to-vector-patterns` option from the
`-test-linalg-transform-patterns=` test flag. It was only used in one
test, where a more specialized transform dialect op can be used instead:

* `transform.apply_patterns.linalg.pad_vectorization`

While we could preserve `test-linalg-to-vector-patterns`, it's better to
rely on finer-grained transformations — this way, we know exactly what
is being run and tested. Now that its only use has been removed, it
feels natural to delete `test-linalg-to-vector-patterns`.
---
 .../Dialect/Linalg/CPU/test-padtensor.mlir       | 13 ++++++++++++-
 .../lib/Dialect/Linalg/TestLinalgTransforms.cpp  | 16 ----------------
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
index 1361d21e7d94..63db0def1cbc 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-linalg-to-vector-patterns \
+// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule \
 // RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -buffer-deallocation-pipeline -convert-bufferization-to-memref \
 // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \
@@ -34,4 +34,15 @@ func.func @main() {
   return
 }
 
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.linalg.pad_vectorization
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
+
 func.func private @printMemrefF32(%ptr : tensor<*xf32>)
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
index 046b9a65f335..738648b8ccdc 100644
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -65,11 +65,6 @@ struct TestLinalgTransforms
       llvm::cl::desc(
           "Test a fused pass that forwards memref.copy to vector.transfer"),
       llvm::cl::init(false)};
-  Option<bool> testGenericToVectorPattern{
-      *this, "test-linalg-to-vector-patterns",
-      llvm::cl::desc("Test a set of patterns that rewrite a linalg contraction "
-                     "in vector.contract form"),
-      llvm::cl::init(false)};
   Option<bool> testDecomposePadTensor{
       *this, "test-decompose-pad-tensor",
       llvm::cl::desc("Test transform pad tensor by copying with generic ops"),
@@ -166,15 +161,6 @@ static void applyVectorTransferForwardingPatterns(func::FuncOp funcOp) {
   (void)applyPatternsGreedily(funcOp, std::move(forwardPattern));
 }
 
-static void applyLinalgToVectorPatterns(func::FuncOp funcOp) {
-  RewritePatternSet patterns(funcOp.getContext());
-  auto *ctx = funcOp.getContext();
-  patterns.add<CopyVectorizationPattern>(ctx);
-  populatePadOpVectorizationPatterns(patterns);
-  populateConvolutionVectorizationPatterns(patterns);
-  (void)applyPatternsGreedily(funcOp, std::move(patterns));
-}
-
 static void applyDecomposePadPatterns(func::FuncOp funcOp) {
   RewritePatternSet patterns(funcOp.getContext());
   patterns.add<DecomposePadOpPattern>(funcOp.getContext());
@@ -254,8 +240,6 @@ void TestLinalgTransforms::runOnOperation() {
     return applyPatterns(getOperation());
   if (testVectorTransferForwardingPatterns)
     return applyVectorTransferForwardingPatterns(getOperation());
-  if (testGenericToVectorPattern)
-    return applyLinalgToVectorPatterns(getOperation());
   if (testDecomposePadTensor)
     return applyDecomposePadPatterns(getOperation());
   if (testDecomposeTensorPackOp)

From 3c1053811e6925e8b9f7a044f3a18bfda1d7ccfe Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg@gmail.com>
Date: Thu, 12 Jun 2025 14:33:06 -0400
Subject: [PATCH 0192/1322] Revert "[clang-tidy] Improve integer comparison by
 matching valid expressions outside implicitCastExpr" (#143944)

Reverts llvm/llvm-project#134188
related: https://github.com/llvm/llvm-project/issues/143927
---
 .../UseIntegerSignComparisonCheck.cpp         | 21 ++---
 clang-tools-extra/docs/ReleaseNotes.rst       |  4 -
 .../modernize/use-integer-sign-comparison.cpp | 78 -------------------
 3 files changed, 7 insertions(+), 96 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp
index c02c5dfa8756..eeba5cce80da 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp
@@ -39,28 +39,21 @@ intCastExpression(bool IsSigned,
   // std::cmp_{} functions trigger a compile-time error if either LHS or RHS
   // is a non-integer type, char, enum or bool
   // (unsigned char/ signed char are Ok and can be used).
-  const auto HasIntegerType = hasType(hasCanonicalType(qualType(
+  auto IntTypeExpr = expr(hasType(hasCanonicalType(qualType(
       isInteger(), IsSigned ? isSignedInteger() : isUnsignedInteger(),
-      unless(isActualChar()), unless(booleanType()), unless(enumType()))));
-
-  const auto IntTypeExpr = expr(HasIntegerType);
+      unless(isActualChar()), unless(booleanType()), unless(enumType())))));
 
   const auto ImplicitCastExpr =
       CastBindName.empty() ? implicitCastExpr(hasSourceExpression(IntTypeExpr))
                            : implicitCastExpr(hasSourceExpression(IntTypeExpr))
                                  .bind(CastBindName);
 
-  const auto ExplicitCastExpr =
-      anyOf(explicitCastExpr(has(ImplicitCastExpr)),
-            ignoringImpCasts(explicitCastExpr(has(ImplicitCastExpr))));
+  const auto CStyleCastExpr = cStyleCastExpr(has(ImplicitCastExpr));
+  const auto StaticCastExpr = cxxStaticCastExpr(has(ImplicitCastExpr));
+  const auto FunctionalCastExpr = cxxFunctionalCastExpr(has(ImplicitCastExpr));
 
-  // Match function calls or variable references not directly wrapped by an
-  // implicit cast
-  const auto CallIntExpr = CastBindName.empty()
-                               ? callExpr(HasIntegerType)
-                               : callExpr(HasIntegerType).bind(CastBindName);
-
-  return expr(anyOf(ImplicitCastExpr, ExplicitCastExpr, CallIntExpr));
+  return expr(anyOf(ImplicitCastExpr, CStyleCastExpr, StaticCastExpr,
+                    FunctionalCastExpr));
 }
 
 static StringRef parseOpCode(BinaryOperator::Opcode Code) {
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 882ee0015df1..19ccd1790e75 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -237,10 +237,6 @@ Changes in existing checks
   <clang-tidy/checks/modernize/use-designated-initializers>` check by avoiding
   diagnosing designated initializers for ``std::array`` initializations.
 
-- Improved :doc:`modernize-use-integer-sign-comparison
-  <clang-tidy/checks/modernize/use-integer-sign-comparison>` check by matching
-  valid integer expressions not directly wrapped around an implicit cast.
-
 - Improved :doc:`modernize-use-ranges
   <clang-tidy/checks/modernize/use-ranges>` check by updating suppress
   warnings logic for ``nullptr`` in ``std::find``.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp
index d93a05ac3805..e0a84ef5aed2 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp
@@ -121,81 +121,3 @@ int AllComparisons() {
 
     return 0;
 }
-
-namespace PR127471 {
-    int getSignedValue();
-    unsigned int getUnsignedValue();
-
-    void callExprTest() {
-
-        if (getSignedValue() < getUnsignedValue())
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES:  if (std::cmp_less(getSignedValue() , getUnsignedValue()))
-
-        int sVar = 0;
-        if (getUnsignedValue() > sVar)
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_greater(getUnsignedValue() , sVar))
-
-        unsigned int uVar = 0;
-        if (getSignedValue() > uVar)
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_greater(getSignedValue() , uVar))
-
-    }
-
-    // Add a class with member functions for testing member function calls
-    class TestClass {
-    public:
-        int getSignedValue() { return -5; }
-        unsigned int getUnsignedValue() { return 5; }
-    };
-
-    void memberFunctionTests() {
-        TestClass obj;
-
-        if (obj.getSignedValue() < obj.getUnsignedValue())
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_less(obj.getSignedValue() , obj.getUnsignedValue()))
-    }
-
-    void castFunctionTests() {
-        // C-style casts with function calls
-        if ((int)getUnsignedValue() < (unsigned int)getSignedValue())
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_less(getUnsignedValue(),getSignedValue()))
-
-
-        // Static casts with function calls
-        if (static_cast<int>(getUnsignedValue()) < static_cast<unsigned int>(getSignedValue()))
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_less(getUnsignedValue(),getSignedValue()))
-    }
-
-    // Define tests
-    #define SIGNED_FUNC getSignedValue()
-    #define UNSIGNED_FUNC getUnsignedValue()
-
-    void defineTests() {
-        if (SIGNED_FUNC < UNSIGNED_FUNC)
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_less(SIGNED_FUNC , UNSIGNED_FUNC))
-    }
-
-    // Template tests (should not warn)
-    template <typename T1>
-    void templateFunctionTest(T1 value) {
-        if (value() < getUnsignedValue())
-            return;
-
-        if (value() < (getSignedValue() || getUnsignedValue()))
-          return;
-    }
-} // namespace PR127471

From edf636afe405ff90da7bf1834aa334bd52bc861e Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Thu, 12 Jun 2025 14:38:54 -0400
Subject: [PATCH 0193/1322] [PowerPC][NFC] Update lowering STXVP to STXV in Oct
 word spilling (#142220)

Remove explicit register arithmetic from spilling ACC and STXVP code.
---
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 93 ++++++++++-----------
 1 file changed, 43 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 45183af0b798..9dc69e203b0d 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1238,42 +1238,6 @@ static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
 #endif
 }
 
-static void spillRegPairs(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator II, DebugLoc DL,
-                          const TargetInstrInfo &TII, Register SrcReg,
-                          unsigned FrameIndex, bool IsLittleEndian,
-                          bool IsKilled, bool TwoPairs) {
-  unsigned Offset = 0;
-  // The register arithmetic in this function does not support virtual
-  // registers.
-  assert(!SrcReg.isVirtual() &&
-         "Spilling register pairs does not support virtual registers.");
-
-  if (TwoPairs)
-    Offset = IsLittleEndian ? 48 : 0;
-  else
-    Offset = IsLittleEndian ? 16 : 0;
-  Register Reg = (SrcReg > PPC::VSRp15) ? PPC::V0 + (SrcReg - PPC::VSRp16) * 2
-                                        : PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2;
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                        .addReg(Reg, getKillRegState(IsKilled)),
-                    FrameIndex, Offset);
-  Offset += IsLittleEndian ? -16 : 16;
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                        .addReg(Reg + 1, getKillRegState(IsKilled)),
-                    FrameIndex, Offset);
-  if (TwoPairs) {
-    Offset += IsLittleEndian ? -16 : 16;
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                          .addReg(Reg + 2, getKillRegState(IsKilled)),
-                      FrameIndex, Offset);
-    Offset += IsLittleEndian ? -16 : 16;
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                          .addReg(Reg + 3, getKillRegState(IsKilled)),
-                      FrameIndex, Offset);
-  }
-}
-
 /// Remove any STXVP[X] instructions and split them out into a pair of
 /// STXV[X] instructions if --disable-auto-paired-vec-st is specified on
 /// the command line.
@@ -1290,8 +1254,21 @@ void PPCRegisterInfo::lowerOctWordSpilling(MachineBasicBlock::iterator II,
   Register SrcReg = MI.getOperand(0).getReg();
   bool IsLittleEndian = Subtarget.isLittleEndian();
   bool IsKilled = MI.getOperand(0).isKill();
-  spillRegPairs(MBB, II, DL, TII, SrcReg, FrameIndex, IsLittleEndian, IsKilled,
-                /* TwoPairs */ false);
+
+  assert(PPC::VSRpRCRegClass.contains(SrcReg) &&
+          "Expecting STXVP to be utilizing a VSRp register.");
+
+  addFrameReference(
+      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+          .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_vsx0),
+                  getKillRegState(IsKilled)),
+      FrameIndex, IsLittleEndian ? 16 : 0);
+  addFrameReference(
+      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+          .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_vsx1),
+                  getKillRegState(IsKilled)),
+      FrameIndex, IsLittleEndian ? 0 : 16);
+
   // Discard the original instruction.
   MBB.erase(II);
 }
@@ -1325,8 +1302,6 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   bool IsKilled = MI.getOperand(0).isKill();
 
   bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
-  Register Reg =
-      PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
   bool IsLittleEndian = Subtarget.isLittleEndian();
 
   emitAccSpillRestoreInfo(MBB, IsPrimed, false);
@@ -1337,16 +1312,34 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   // adjust the offset of the store that is within the 64-byte stack slot.
   if (IsPrimed)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
-  if (DisableAutoPairedVecSt)
-    spillRegPairs(MBB, II, DL, TII, Reg, FrameIndex, IsLittleEndian, IsKilled,
-                  /* TwoPairs */ true);
-  else {
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                          .addReg(Reg, getKillRegState(IsKilled)),
-                      FrameIndex, IsLittleEndian ? 32 : 0);
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                          .addReg(Reg + 1, getKillRegState(IsKilled)),
-                      FrameIndex, IsLittleEndian ? 0 : 32);
+  if (DisableAutoPairedVecSt) {
+    auto spillPair = [&](Register Reg, int Offset) {
+      addFrameReference(
+          BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+              .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx0),
+                      getKillRegState(IsKilled)),
+          FrameIndex, Offset);
+      addFrameReference(
+          BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+              .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx1),
+                      getKillRegState(IsKilled)),
+          FrameIndex, IsLittleEndian ? Offset - 16 : Offset + 16);
+    };
+    spillPair(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
+              IsLittleEndian ? 48 : 0);
+    spillPair(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
+              IsLittleEndian ? 16 : 32);
+  } else {
+    addFrameReference(
+        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
+                    getKillRegState(IsKilled)),
+        FrameIndex, IsLittleEndian ? 32 : 0);
+    addFrameReference(
+        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
+                    getKillRegState(IsKilled)),
+        FrameIndex, IsLittleEndian ? 0 : 32);
   }
   if (IsPrimed && !IsKilled)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);

From 46085d8f83623f6ea2921459de9f731d7df762d4 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Thu, 12 Jun 2025 11:41:16 -0700
Subject: [PATCH 0194/1322] [lld/ELF][x86-64] Place large executable sections
 at the edges of binary (#70358)

So that when mixing small and large text, large text stays out of the
way of the rest of the binary.

Place large RX sections at the beginning rather than at the end so that
with `--no-rosegment`, the large text and rodata share a single PT_LOAD
segment. Place large RWX sections at the end to keep writable and
readonly sections separate.

Clang started emitting the large section flag for `.ltext` sections in
#73037.
---
 lld/ELF/Writer.cpp                   | 23 +++++++++++-----
 lld/test/ELF/x86-64-section-layout.s | 41 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 10dc688160d1..3d9888f576f0 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -653,15 +653,17 @@ enum RankFlags {
   RF_NOT_ADDR_SET = 1 << 27,
   RF_NOT_ALLOC = 1 << 26,
   RF_PARTITION = 1 << 18, // Partition number (8 bits)
+  RF_LARGE_EXEC_WRITE = 1 << 16,
   RF_LARGE_ALT = 1 << 15,
   RF_WRITE = 1 << 14,
   RF_EXEC_WRITE = 1 << 13,
   RF_EXEC = 1 << 12,
   RF_RODATA = 1 << 11,
-  RF_LARGE = 1 << 10,
-  RF_NOT_RELRO = 1 << 9,
-  RF_NOT_TLS = 1 << 8,
-  RF_BSS = 1 << 7,
+  RF_LARGE_EXEC = 1 << 10,
+  RF_LARGE = 1 << 9,
+  RF_NOT_RELRO = 1 << 8,
+  RF_NOT_TLS = 1 << 7,
+  RF_BSS = 1 << 6,
 };
 
 unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
@@ -691,6 +693,7 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
   // places.
   bool isExec = osec.flags & SHF_EXECINSTR;
   bool isWrite = osec.flags & SHF_WRITE;
+  bool isLarge = osec.flags & SHF_X86_64_LARGE && ctx.arg.emachine == EM_X86_64;
 
   if (!isWrite && !isExec) {
     // Among PROGBITS sections, place .lrodata further from .text.
@@ -698,7 +701,7 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
     // layout has one extra PT_LOAD, but alleviates relocation overflow
     // pressure for absolute relocations referencing small data from -fno-pic
     // relocatable files.
-    if (osec.flags & SHF_X86_64_LARGE && ctx.arg.emachine == EM_X86_64)
+    if (isLarge)
       rank |= ctx.arg.zLrodataAfterBss ? RF_LARGE_ALT : 0;
     else
       rank |= ctx.arg.zLrodataAfterBss ? 0 : RF_LARGE;
@@ -722,7 +725,13 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
     else
       rank |= RF_RODATA;
   } else if (isExec) {
-    rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC;
+    // Place readonly .ltext before .lrodata and writable .ltext after .lbss to
+    // keep writable and readonly segments separate.
+    if (isLarge) {
+      rank |= isWrite ? RF_LARGE_EXEC_WRITE : RF_LARGE_EXEC;
+    } else {
+      rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC;
+    }
   } else {
     rank |= RF_WRITE;
     // The TLS initialization block needs to be a single contiguous block. Place
@@ -737,7 +746,7 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
     // alleviates relocation overflow pressure.
     // For -z lrodata-after-bss, place .lbss/.lrodata/.ldata after .bss.
     // .bss/.lbss being adjacent reuses the NOBITS size optimization.
-    if (osec.flags & SHF_X86_64_LARGE && ctx.arg.emachine == EM_X86_64) {
+    if (isLarge) {
       rank |= ctx.arg.zLrodataAfterBss
                   ? (osec.type == SHT_NOBITS ? 1 : RF_LARGE_ALT)
                   : RF_LARGE;
diff --git a/lld/test/ELF/x86-64-section-layout.s b/lld/test/ELF/x86-64-section-layout.s
index b03d3e6c2b99..1432271b885a 100644
--- a/lld/test/ELF/x86-64-section-layout.s
+++ b/lld/test/ELF/x86-64-section-layout.s
@@ -18,6 +18,10 @@
 # RUN: ld.lld --section-start=.note=0x200300 a.o -z lrodata-after-bss -o a3
 # RUN: llvm-readelf -S -l -sX a3 | FileCheck %s --check-prefix=CHECK3
 
+# RUN: llvm-mc -filetype=obj -triple=x86_64 c.s -o c.o
+# RUN: ld.lld c.o -o c
+# RUN: llvm-readelf -S -l c | FileCheck %s --check-prefix=CHECK4
+
 # CHECK:       Name              Type            Address          Off    Size   ES Flg Lk Inf Al
 # CHECK-NEXT:                    NULL            0000000000000000 000000 000000 00      0   0  0
 # CHECK-NEXT:  .note             NOTE            0000000000200300 000300 000001 00   A  0   0  1
@@ -116,6 +120,18 @@
 # CHECK3-NEXT:  0000000000203307     0 NOTYPE  GLOBAL DEFAULT [[#]] (.data)   _edata
 # CHECK3-NEXT:  0000000000207d0d     0 NOTYPE  GLOBAL DEFAULT [[#]] (.ldata2) _end
 
+# CHECK4:      .note      NOTE
+# CHECK4-NEXT: .ltext     PROGBITS
+# CHECK4-NEXT: .lrodata   PROGBITS
+# CHECK4-NEXT: .rodata    PROGBITS
+# CHECK4-NEXT: .text      PROGBITS
+# CHECK4-NEXT: .data      PROGBITS
+# CHECK4-NEXT: .bss       NOBITS
+# CHECK4-NEXT: .ldata     PROGBITS
+# CHECK4-NEXT: .lbss      NOBITS
+# CHECK4-NEXT: .ltext_w   PROGBITS
+# CHECK4-NEXT: .comment   PROGBITS
+
 #--- a.s
 .globl _start, _etext, _edata, _end
 _start:
@@ -155,3 +171,28 @@ SECTIONS {
   .ldata2 : {}
   .lbss : { *(.lbss .lbss.*) }
 }
+
+#--- c.s
+## Test .ltext layout
+.section .ltext,"axl",@progbits
+.globl f
+f:
+  ret
+
+.section .ltext_w,"awxl",@progbits
+.globl g
+g:
+  ret
+
+.section .text,"ax",@progbits
+.globl h
+h:
+  ret
+
+.section .note,"a",@note; .space 1
+.section .rodata,"a",@progbits; .space 1
+.section .data,"aw",@progbits; .space 1
+.section .bss,"aw",@nobits; .space 1
+.section .lrodata,"al"; .space 1
+.section .ldata,"awl"; .space 1
+.section .lbss,"awl",@nobits; .space 1

From df7db441d4e97568a5cbf830b0810512bb702159 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Thu, 12 Jun 2025 14:56:25 -0400
Subject: [PATCH 0195/1322] =?UTF-8?q?Revert=20"[PowerPC][NFC]=20Update=20l?=
 =?UTF-8?q?owering=20STXVP=20to=20STXV=20in=20Oct=20word=20spil=E2=80=A6?=
 =?UTF-8?q?=20(#143948)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ling (#142220)"

This reverts commit edf636afe405ff90da7bf1834aa334bd52bc861e.
checked in wrong branch.
---
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 93 +++++++++++----------
 1 file changed, 50 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9dc69e203b0d..45183af0b798 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1238,6 +1238,42 @@ static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
 #endif
 }
 
+static void spillRegPairs(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator II, DebugLoc DL,
+                          const TargetInstrInfo &TII, Register SrcReg,
+                          unsigned FrameIndex, bool IsLittleEndian,
+                          bool IsKilled, bool TwoPairs) {
+  unsigned Offset = 0;
+  // The register arithmetic in this function does not support virtual
+  // registers.
+  assert(!SrcReg.isVirtual() &&
+         "Spilling register pairs does not support virtual registers.");
+
+  if (TwoPairs)
+    Offset = IsLittleEndian ? 48 : 0;
+  else
+    Offset = IsLittleEndian ? 16 : 0;
+  Register Reg = (SrcReg > PPC::VSRp15) ? PPC::V0 + (SrcReg - PPC::VSRp16) * 2
+                                        : PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2;
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+                        .addReg(Reg, getKillRegState(IsKilled)),
+                    FrameIndex, Offset);
+  Offset += IsLittleEndian ? -16 : 16;
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+                        .addReg(Reg + 1, getKillRegState(IsKilled)),
+                    FrameIndex, Offset);
+  if (TwoPairs) {
+    Offset += IsLittleEndian ? -16 : 16;
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+                          .addReg(Reg + 2, getKillRegState(IsKilled)),
+                      FrameIndex, Offset);
+    Offset += IsLittleEndian ? -16 : 16;
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+                          .addReg(Reg + 3, getKillRegState(IsKilled)),
+                      FrameIndex, Offset);
+  }
+}
+
 /// Remove any STXVP[X] instructions and split them out into a pair of
 /// STXV[X] instructions if --disable-auto-paired-vec-st is specified on
 /// the command line.
@@ -1254,21 +1290,8 @@ void PPCRegisterInfo::lowerOctWordSpilling(MachineBasicBlock::iterator II,
   Register SrcReg = MI.getOperand(0).getReg();
   bool IsLittleEndian = Subtarget.isLittleEndian();
   bool IsKilled = MI.getOperand(0).isKill();
-
-  assert(PPC::VSRpRCRegClass.contains(SrcReg) &&
-          "Expecting STXVP to be utilizing a VSRp register.");
-
-  addFrameReference(
-      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-          .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_vsx0),
-                  getKillRegState(IsKilled)),
-      FrameIndex, IsLittleEndian ? 16 : 0);
-  addFrameReference(
-      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-          .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_vsx1),
-                  getKillRegState(IsKilled)),
-      FrameIndex, IsLittleEndian ? 0 : 16);
-
+  spillRegPairs(MBB, II, DL, TII, SrcReg, FrameIndex, IsLittleEndian, IsKilled,
+                /* TwoPairs */ false);
   // Discard the original instruction.
   MBB.erase(II);
 }
@@ -1302,6 +1325,8 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   bool IsKilled = MI.getOperand(0).isKill();
 
   bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
+  Register Reg =
+      PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
   bool IsLittleEndian = Subtarget.isLittleEndian();
 
   emitAccSpillRestoreInfo(MBB, IsPrimed, false);
@@ -1312,34 +1337,16 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   // adjust the offset of the store that is within the 64-byte stack slot.
   if (IsPrimed)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
-  if (DisableAutoPairedVecSt) {
-    auto spillPair = [&](Register Reg, int Offset) {
-      addFrameReference(
-          BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-              .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx0),
-                      getKillRegState(IsKilled)),
-          FrameIndex, Offset);
-      addFrameReference(
-          BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-              .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx1),
-                      getKillRegState(IsKilled)),
-          FrameIndex, IsLittleEndian ? Offset - 16 : Offset + 16);
-    };
-    spillPair(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
-              IsLittleEndian ? 48 : 0);
-    spillPair(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
-              IsLittleEndian ? 16 : 32);
-  } else {
-    addFrameReference(
-        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
-                    getKillRegState(IsKilled)),
-        FrameIndex, IsLittleEndian ? 32 : 0);
-    addFrameReference(
-        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
-                    getKillRegState(IsKilled)),
-        FrameIndex, IsLittleEndian ? 0 : 32);
+  if (DisableAutoPairedVecSt)
+    spillRegPairs(MBB, II, DL, TII, Reg, FrameIndex, IsLittleEndian, IsKilled,
+                  /* TwoPairs */ true);
+  else {
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                          .addReg(Reg, getKillRegState(IsKilled)),
+                      FrameIndex, IsLittleEndian ? 32 : 0);
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                          .addReg(Reg + 1, getKillRegState(IsKilled)),
+                      FrameIndex, IsLittleEndian ? 0 : 32);
   }
   if (IsPrimed && !IsKilled)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);

From c317eda6e3785037f16a746a1096c2cca82d9455 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Thu, 12 Jun 2025 15:47:02 -0400
Subject: [PATCH 0196/1322] [PowerPC][NFC] Update lowering STXVP to STXV in Oct
 word spilling (#143953)

Simpliy handling for spilling of acc reg with stx by removing explicit
register arithmetic and clean up code gen for register mapping used in
stxvp spilling.

Relanding: https://github.com/llvm/llvm-project/pull/142220
---
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 88 ++++++++++-----------
 llvm/lib/Target/PowerPC/PPCRegisterInfo.h   |  5 ++
 2 files changed, 47 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 45183af0b798..ea34c1aba82e 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1238,40 +1238,28 @@ static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
 #endif
 }
 
-static void spillRegPairs(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator II, DebugLoc DL,
-                          const TargetInstrInfo &TII, Register SrcReg,
-                          unsigned FrameIndex, bool IsLittleEndian,
-                          bool IsKilled, bool TwoPairs) {
-  unsigned Offset = 0;
-  // The register arithmetic in this function does not support virtual
-  // registers.
-  assert(!SrcReg.isVirtual() &&
+void PPCRegisterInfo::spillRegPair(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator II, DebugLoc DL,
+                                   const TargetInstrInfo &TII,
+                                   unsigned FrameIndex, bool IsLittleEndian,
+                                   bool IsKilled, Register Reg,
+                                   int Offset) const {
+
+  // This function does not support virtual registers.
+  assert(!Reg.isVirtual() &&
          "Spilling register pairs does not support virtual registers.");
 
-  if (TwoPairs)
-    Offset = IsLittleEndian ? 48 : 0;
-  else
-    Offset = IsLittleEndian ? 16 : 0;
-  Register Reg = (SrcReg > PPC::VSRp15) ? PPC::V0 + (SrcReg - PPC::VSRp16) * 2
-                                        : PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2;
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                        .addReg(Reg, getKillRegState(IsKilled)),
-                    FrameIndex, Offset);
-  Offset += IsLittleEndian ? -16 : 16;
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                        .addReg(Reg + 1, getKillRegState(IsKilled)),
-                    FrameIndex, Offset);
-  if (TwoPairs) {
-    Offset += IsLittleEndian ? -16 : 16;
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                          .addReg(Reg + 2, getKillRegState(IsKilled)),
-                      FrameIndex, Offset);
-    Offset += IsLittleEndian ? -16 : 16;
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                          .addReg(Reg + 3, getKillRegState(IsKilled)),
-                      FrameIndex, Offset);
-  }
+  addFrameReference(
+      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+          .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx0),
+                  getKillRegState(IsKilled)),
+      FrameIndex, Offset);
+
+  addFrameReference(
+      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+          .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx1),
+                  getKillRegState(IsKilled)),
+      FrameIndex, IsLittleEndian ? Offset - 16 : Offset + 16);
 }
 
 /// Remove any STXVP[X] instructions and split them out into a pair of
@@ -1290,8 +1278,10 @@ void PPCRegisterInfo::lowerOctWordSpilling(MachineBasicBlock::iterator II,
   Register SrcReg = MI.getOperand(0).getReg();
   bool IsLittleEndian = Subtarget.isLittleEndian();
   bool IsKilled = MI.getOperand(0).isKill();
-  spillRegPairs(MBB, II, DL, TII, SrcReg, FrameIndex, IsLittleEndian, IsKilled,
-                /* TwoPairs */ false);
+
+  spillRegPair(MBB, II, DL, TII, FrameIndex, IsLittleEndian, IsKilled, SrcReg,
+               IsLittleEndian ? 16 : 0);
+
   // Discard the original instruction.
   MBB.erase(II);
 }
@@ -1325,8 +1315,6 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   bool IsKilled = MI.getOperand(0).isKill();
 
   bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
-  Register Reg =
-      PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
   bool IsLittleEndian = Subtarget.isLittleEndian();
 
   emitAccSpillRestoreInfo(MBB, IsPrimed, false);
@@ -1337,16 +1325,24 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   // adjust the offset of the store that is within the 64-byte stack slot.
   if (IsPrimed)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
-  if (DisableAutoPairedVecSt)
-    spillRegPairs(MBB, II, DL, TII, Reg, FrameIndex, IsLittleEndian, IsKilled,
-                  /* TwoPairs */ true);
-  else {
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                          .addReg(Reg, getKillRegState(IsKilled)),
-                      FrameIndex, IsLittleEndian ? 32 : 0);
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                          .addReg(Reg + 1, getKillRegState(IsKilled)),
-                      FrameIndex, IsLittleEndian ? 0 : 32);
+  if (DisableAutoPairedVecSt) {
+    spillRegPair(MBB, II, DL, TII, FrameIndex, IsLittleEndian, IsKilled,
+                 TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
+                 IsLittleEndian ? 48 : 0);
+    spillRegPair(MBB, II, DL, TII, FrameIndex, IsLittleEndian, IsKilled,
+                 TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
+                 IsLittleEndian ? 16 : 32);
+  } else {
+    addFrameReference(
+        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
+                    getKillRegState(IsKilled)),
+        FrameIndex, IsLittleEndian ? 32 : 0);
+    addFrameReference(
+        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
+                    getKillRegState(IsKilled)),
+        FrameIndex, IsLittleEndian ? 0 : 32);
   }
   if (IsPrimed && !IsKilled)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 4b66ece53411..849f856b5419 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -58,6 +58,11 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
   DenseMap<unsigned, unsigned> ImmToIdxMap;
   const PPCTargetMachine &TM;
 
+  void spillRegPair(MachineBasicBlock &MBB, MachineBasicBlock::iterator II,
+                    DebugLoc DL, const TargetInstrInfo &TII,
+                    unsigned FrameIndex, bool IsLittleEndian, bool IsKilled,
+                    Register Reg, int Offset) const;
+
 public:
   PPCRegisterInfo(const PPCTargetMachine &TM);
 

From 030a471753421477c7ef345cc60091788252fabc Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 12 Jun 2025 20:51:58 +0100
Subject: [PATCH 0197/1322] [AArch64][Clang] Exclude address spaces from
 pointer-only coercion types.

As reported on #135064, the generic pointer coercion code in
CoerceIntOrPtrToIntOrPtr cannot handle address space casts (it tries to bitcast
the pointers). This bails out if an address space qualifier is found on the
pointer.
---
 clang/lib/CodeGen/Targets/AArch64.cpp         |   3 +-
 .../AArch64/struct-coerce-using-ptr.cpp       | 181 +++++++++++++++++-
 2 files changed, 181 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 3efe6ab4ea9c..b82c46966cf0 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -507,7 +507,8 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
         if (FDTy->isArrayType())
           FDTy = getContext().getBaseElementType(FDTy);
         return (FDTy->isPointerOrReferenceType() &&
-                getContext().getTypeSize(FDTy) == 64) ||
+                getContext().getTypeSize(FDTy) == 64 &&
+                !FDTy->getPointeeType().hasAddressSpace()) ||
                Self(Self, FDTy);
       });
     };
diff --git a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
index a41f315340b5..b1232921df36 100644
--- a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
+++ b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
@@ -139,7 +139,7 @@ struct Srp {
 // CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRP:%.*]], align 8
 // CHECK-A64-NEXT:    store [2 x ptr] [[S_COERCE]], ptr [[S]], align 8
 // CHECK-A64-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRP]], ptr [[S]], i32 0, i32 0
-// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X]], align 8
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X]], align 8, !nonnull [[META2:![0-9]+]], !align [[META3:![0-9]+]]
 // CHECK-A64-NEXT:    store i32 1, ptr [[TMP0]], align 4
 // CHECK-A64-NEXT:    ret void
 //
@@ -149,7 +149,7 @@ struct Srp {
 // CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRP:%.*]], align 4
 // CHECK-A64_32-NEXT:    store i64 [[S_COERCE]], ptr [[S]], align 4
 // CHECK-A64_32-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRP]], ptr [[S]], i32 0, i32 0
-// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X]], align 4
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X]], align 4, !nonnull [[META2:![0-9]+]], !align [[META3:![0-9]+]]
 // CHECK-A64_32-NEXT:    store i32 1, ptr [[TMP0]], align 4
 // CHECK-A64_32-NEXT:    ret void
 //
@@ -618,3 +618,180 @@ struct SpSempty {
 // CHECK-A64_32-NEXT:    ret void
 //
 void TpSempty(SpSempty s) { *s.x = 1; }
+
+
+struct Spaddrspace {
+    __attribute__((address_space(100))) int *x;
+};
+// CHECK-A64-LABEL: define dso_local void @_Z11Tpaddrspace11Spaddrspace(
+// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SPADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[S_COERCE]] to ptr addrspace(100)
+// CHECK-A64-NEXT:    store ptr addrspace(100) [[COERCE_VAL_IP]], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[X]], align 8
+// CHECK-A64-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z11Tpaddrspace11Spaddrspace(
+// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SPADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32
+// CHECK-A64_32-NEXT:    store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[X]], align 4
+// CHECK-A64_32-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64_32-NEXT:    ret void
+//
+void Tpaddrspace(Spaddrspace s) { *s.x = 1; }
+// CHECK-A64-LABEL: define dso_local void @_Z11Cpaddrspacev(
+// CHECK-A64-SAME: ) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SPADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SPADDRSPACE]], align 8
+// CHECK-A64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[AGG_TMP]], ptr align 8 [[S]], i64 8, i1 false)
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    [[COERCE_VAL_PI:%.*]] = ptrtoint ptr addrspace(100) [[TMP0]] to i64
+// CHECK-A64-NEXT:    call void @_Z11Tpaddrspace11Spaddrspace(i64 [[COERCE_VAL_PI]])
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z11Cpaddrspacev(
+// CHECK-A64_32-SAME: ) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SPADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SPADDRSPACE]], align 4
+// CHECK-A64_32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_TMP]], ptr align 4 [[S]], i32 4, i1 false)
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_PI:%.*]] = ptrtoint ptr addrspace(100) [[TMP0]] to i32
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = zext i32 [[COERCE_VAL_PI]] to i64
+// CHECK-A64_32-NEXT:    call void @_Z11Tpaddrspace11Spaddrspace(i64 [[COERCE_VAL_II]])
+// CHECK-A64_32-NEXT:    ret void
+//
+void Cpaddrspace() { Spaddrspace s; Tpaddrspace(s); }
+
+struct Sp2addrspace {
+    __attribute__((address_space(100))) int *x[2];
+};
+// CHECK-A64-LABEL: define dso_local void @_Z12Tp2addrspace12Sp2addrspace(
+// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SP2ADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    store [2 x i64] [[S_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr addrspace(100)], ptr [[X]], i64 0, i64 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[ARRAYIDX]], align 8
+// CHECK-A64-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z12Tp2addrspace12Sp2addrspace(
+// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SP2ADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    store i64 [[S_COERCE]], ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr addrspace(100)], ptr [[X]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[ARRAYIDX]], align 4
+// CHECK-A64_32-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64_32-NEXT:    ret void
+//
+void Tp2addrspace(Sp2addrspace s) { *s.x[0] = 1; }
+// CHECK-A64-LABEL: define dso_local void @_Z12Cp2addrspacev(
+// CHECK-A64-SAME: ) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SP2ADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SP2ADDRSPACE]], align 8
+// CHECK-A64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[AGG_TMP]], ptr align 8 [[S]], i64 16, i1 false)
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    call void @_Z12Tp2addrspace12Sp2addrspace([2 x i64] [[TMP0]])
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z12Cp2addrspacev(
+// CHECK-A64_32-SAME: ) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SP2ADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SP2ADDRSPACE]], align 4
+// CHECK-A64_32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_TMP]], ptr align 4 [[S]], i32 8, i1 false)
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load i64, ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    call void @_Z12Tp2addrspace12Sp2addrspace(i64 [[TMP0]])
+// CHECK-A64_32-NEXT:    ret void
+//
+void Cp2addrspace() { Sp2addrspace s; Tp2addrspace(s); }
+
+struct Sraddrspace {
+    __attribute__((address_space(100))) int &x;
+};
+// CHECK-A64-LABEL: define dso_local void @_Z11Traddrspace11Sraddrspace(
+// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[S_COERCE]] to ptr addrspace(100)
+// CHECK-A64-NEXT:    store ptr addrspace(100) [[COERCE_VAL_IP]], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[X]], align 8, !align [[META3]]
+// CHECK-A64-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z11Traddrspace11Sraddrspace(
+// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32
+// CHECK-A64_32-NEXT:    store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[X]], align 4, !align [[META3]]
+// CHECK-A64_32-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64_32-NEXT:    ret void
+//
+void Traddrspace(Sraddrspace s) { s.x = 1; }
+// CHECK-A64-LABEL: define dso_local void @_Z11Craddrspace11Sraddrspace(
+// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SRADDRSPACE]], align 8
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[S_COERCE]] to ptr addrspace(100)
+// CHECK-A64-NEXT:    store ptr addrspace(100) [[COERCE_VAL_IP]], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[AGG_TMP]], ptr align 8 [[S]], i64 8, i1 false)
+// CHECK-A64-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[COERCE_DIVE1]], align 8
+// CHECK-A64-NEXT:    [[COERCE_VAL_PI:%.*]] = ptrtoint ptr addrspace(100) [[TMP0]] to i64
+// CHECK-A64-NEXT:    call void @_Z11Traddrspace11Sraddrspace(i64 [[COERCE_VAL_PI]])
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z11Craddrspace11Sraddrspace(
+// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SRADDRSPACE]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32
+// CHECK-A64_32-NEXT:    store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_TMP]], ptr align 4 [[S]], i32 4, i1 false)
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[COERCE_DIVE1]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_PI:%.*]] = ptrtoint ptr addrspace(100) [[TMP0]] to i32
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II2:%.*]] = zext i32 [[COERCE_VAL_PI]] to i64
+// CHECK-A64_32-NEXT:    call void @_Z11Traddrspace11Sraddrspace(i64 [[COERCE_VAL_II2]])
+// CHECK-A64_32-NEXT:    ret void
+//
+void Craddrspace(Sraddrspace s) { Traddrspace(s); }
+
+//.
+// CHECK-A64: [[META2]] = !{}
+// CHECK-A64: [[META3]] = !{i64 4}
+//.
+// CHECK-A64_32: [[META2]] = !{}
+// CHECK-A64_32: [[META3]] = !{i64 4}
+//.

From 891f6ae783b36122b0f2fadc0c2d95d7dd590415 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 12 Jun 2025 12:50:02 -0700
Subject: [PATCH 0198/1322] [instcombine] Add test coverage for vp.reverse
 elimination combines

---
 .../test/Transforms/InstCombine/vp-reverse.ll | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/vp-reverse.ll

diff --git a/llvm/test/Transforms/InstCombine/vp-reverse.ll b/llvm/test/Transforms/InstCombine/vp-reverse.ll
new file mode 100644
index 000000000000..79e6c47bdf1b
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/vp-reverse.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim(
+; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_elim2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim2(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> %m, i32 %evl)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_elim_diffmask(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m1, <vscale x 4 x i1> %m2, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim_diffmask(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> [[M1:%.*]], i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> [[M1]], i32 [[EVL]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> [[M2:%.*]], i32 10)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> %m1, i32 %evl)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> %m1, i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> %m2, i32 10)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_elim_diffevl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim_diffevl(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> splat (i1 true), i32 10)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 10)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim(
+; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, splat (i32 22)
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim2(
+; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> splat (i32 22), %a.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x float> @unop_reverse_splat_elim(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 %evl) {
+; CHECK-LABEL: @unop_reverse_splat_elim(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV]]
+; CHECK-NEXT:    [[OP_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[OP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OP_REV]]
+;
+  %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %op = fneg <vscale x 4 x float> %a.rev
+  %op.rev = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> %op, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x float> %op.rev
+}

From cbf27bf711c08c34185f05ca5edbfa61bd3786e2 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 12 Jun 2025 19:53:04 +0000
Subject: [PATCH 0199/1322] Revert " [PowerPC] frontend get target feature from
 backend with cpu name (#137670)"

This reverts commit 9208b343e962b9f1140ee345c0050a3920bdcbf2.

TargetParser shouldn't re-run the PPC subtarget tablegen target, it
should define its own `-gen-ppc-target-def` rule like all the other
targets do in llvm/include/llvm/TargetParser/CMakeLists.txt .

One user reported that there are incorrect CMake dependencies after this
change, so I will roll this back in the meantime.
---
 clang/lib/Basic/Targets/PPC.cpp               | 148 +++++++++++++++++-
 .../cxx11-thread-local-reference.cpp          |   2 +-
 .../Driver/aix-shared-lib-tls-model-opt.c     |   7 +-
 .../Driver/aix-small-local-exec-dynamic-tls.c |  39 ++---
 clang/test/Driver/ppc-crbits.cpp              |   4 +
 clang/test/Driver/ppc-isa-features.cpp        |  22 +--
 .../llvm/TargetParser/PPCTargetParser.h       |   6 -
 llvm/include/llvm/TargetParser/TargetParser.h |  27 ----
 llvm/lib/Target/PowerPC/PPC.td                |   4 +-
 llvm/lib/TargetParser/CMakeLists.txt          |   8 -
 llvm/lib/TargetParser/PPCTargetParser.cpp     |  25 ---
 llvm/lib/TargetParser/TargetParser.cpp        |  47 ------
 llvm/utils/TableGen/SubtargetEmitter.cpp      |  50 ++----
 13 files changed, 198 insertions(+), 191 deletions(-)

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index 77145e2891a8..e6ef0ecc526b 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -15,7 +15,6 @@
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/TargetParser/PPCTargetParser.h"
-#include <optional>
 
 using namespace clang;
 using namespace clang::targets;
@@ -517,14 +516,129 @@ static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags,
 bool PPCTargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeaturesVec) const {
+  Features["altivec"] = llvm::StringSwitch<bool>(CPU)
+                            .Case("7400", true)
+                            .Case("g4", true)
+                            .Case("7450", true)
+                            .Case("g4+", true)
+                            .Case("970", true)
+                            .Case("g5", true)
+                            .Case("pwr6", true)
+                            .Case("pwr7", true)
+                            .Case("pwr8", true)
+                            .Case("pwr9", true)
+                            .Case("ppc64", true)
+                            .Case("ppc64le", true)
+                            .Default(false);
 
-  const llvm::Triple &TheTriple = getTriple();
+  Features["power9-vector"] = (CPU == "pwr9");
+  Features["crypto"] = llvm::StringSwitch<bool>(CPU)
+                           .Case("ppc64le", true)
+                           .Case("pwr9", true)
+                           .Case("pwr8", true)
+                           .Default(false);
+  Features["power8-vector"] = llvm::StringSwitch<bool>(CPU)
+                                  .Case("ppc64le", true)
+                                  .Case("pwr9", true)
+                                  .Case("pwr8", true)
+                                  .Default(false);
+  Features["bpermd"] = llvm::StringSwitch<bool>(CPU)
+                           .Case("ppc64le", true)
+                           .Case("pwr9", true)
+                           .Case("pwr8", true)
+                           .Case("pwr7", true)
+                           .Default(false);
+  Features["extdiv"] = llvm::StringSwitch<bool>(CPU)
+                           .Case("ppc64le", true)
+                           .Case("pwr9", true)
+                           .Case("pwr8", true)
+                           .Case("pwr7", true)
+                           .Default(false);
+  Features["direct-move"] = llvm::StringSwitch<bool>(CPU)
+                                .Case("ppc64le", true)
+                                .Case("pwr9", true)
+                                .Case("pwr8", true)
+                                .Default(false);
+  Features["crbits"] = llvm::StringSwitch<bool>(CPU)
+                                .Case("ppc64le", true)
+                                .Case("pwr9", true)
+                                .Case("pwr8", true)
+                                .Default(false);
+  Features["vsx"] = llvm::StringSwitch<bool>(CPU)
+                        .Case("ppc64le", true)
+                        .Case("pwr9", true)
+                        .Case("pwr8", true)
+                        .Case("pwr7", true)
+                        .Default(false);
+  Features["htm"] = llvm::StringSwitch<bool>(CPU)
+                        .Case("ppc64le", true)
+                        .Case("pwr9", true)
+                        .Case("pwr8", true)
+                        .Default(false);
 
-  std::optional<llvm::StringMap<bool>> FeaturesOpt =
-      llvm::PPC::getPPCDefaultTargetFeatures(TheTriple,
-                                             llvm::PPC::normalizeCPUName(CPU));
-  if (FeaturesOpt)
-    Features = FeaturesOpt.value();
+  // ROP Protect is off by default.
+  Features["rop-protect"] = false;
+  // Privileged instructions are off by default.
+  Features["privileged"] = false;
+
+  if (getTriple().isOSAIX()) {
+    // The code generated by the -maix-small-local-[exec|dynamic]-tls option is
+    // turned off by default.
+    Features["aix-small-local-exec-tls"] = false;
+    Features["aix-small-local-dynamic-tls"] = false;
+
+    // Turn off TLS model opt by default.
+    Features["aix-shared-lib-tls-model-opt"] = false;
+  }
+
+  Features["spe"] = llvm::StringSwitch<bool>(CPU)
+                        .Case("8548", true)
+                        .Case("e500", true)
+                        .Default(false);
+
+  Features["isa-v206-instructions"] = llvm::StringSwitch<bool>(CPU)
+                                          .Case("ppc64le", true)
+                                          .Case("pwr9", true)
+                                          .Case("pwr8", true)
+                                          .Case("pwr7", true)
+                                          .Case("a2", true)
+                                          .Default(false);
+
+  Features["isa-v207-instructions"] = llvm::StringSwitch<bool>(CPU)
+                                          .Case("ppc64le", true)
+                                          .Case("pwr9", true)
+                                          .Case("pwr8", true)
+                                          .Default(false);
+
+  Features["isa-v30-instructions"] =
+      llvm::StringSwitch<bool>(CPU).Case("pwr9", true).Default(false);
+
+  Features["quadword-atomics"] =
+      getTriple().isArch64Bit() && llvm::StringSwitch<bool>(CPU)
+                                       .Case("pwr9", true)
+                                       .Case("pwr8", true)
+                                       .Default(false);
+
+  // Power10 includes all the same features as Power9 plus any features specific
+  // to the Power10 core.
+  if (CPU == "pwr10" || CPU == "power10") {
+    initFeatureMap(Features, Diags, "pwr9", FeaturesVec);
+    addP10SpecificFeatures(Features);
+  }
+
+  // Power11 includes all the same features as Power10 plus any features
+  // specific to the Power11 core.
+  if (CPU == "pwr11" || CPU == "power11") {
+    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
+    addP11SpecificFeatures(Features);
+  }
+
+  // Future CPU should include all of the features of Power 11 as well as any
+  // additional features (yet to be determined) specific to it.
+  if (CPU == "future") {
+    initFeatureMap(Features, Diags, "pwr11", FeaturesVec);
+    addFutureSpecificFeatures(Features);
+  }
 
   if (!ppcUserFeaturesCheck(Diags, FeaturesVec))
     return false;
@@ -586,6 +700,26 @@ bool PPCTargetInfo::initFeatureMap(
   return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
 }
 
+// Add any Power10 specific features.
+void PPCTargetInfo::addP10SpecificFeatures(
+    llvm::StringMap<bool> &Features) const {
+  Features["htm"] = false; // HTM was removed for P10.
+  Features["paired-vector-memops"] = true;
+  Features["mma"] = true;
+  Features["power10-vector"] = true;
+  Features["pcrelative-memops"] = true;
+  Features["prefix-instrs"] = true;
+  Features["isa-v31-instructions"] = true;
+}
+
+// Add any Power11 specific features.
+void PPCTargetInfo::addP11SpecificFeatures(
+    llvm::StringMap<bool> &Features) const {}
+
+// Add features specific to the "Future" CPU.
+void PPCTargetInfo::addFutureSpecificFeatures(
+    llvm::StringMap<bool> &Features) const {}
+
 bool PPCTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Case("powerpc", true)
diff --git a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
index a0e76e8a9a0b..cd5a18f39060 100644
--- a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
+++ b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
@@ -35,5 +35,5 @@ int &g() { return r; }
 // DARWIN-LABEL: define internal cxx_fast_tlscc void @__tls_init()
 // CHECK: call void @[[R_INIT]]()
 
-// LINUX_AIX: attributes [[ATTR0]] = { {{.*}} }
+// LINUX_AIX: attributes [[ATTR0]] = { {{.*}}"target-features"{{.*}} }
 // DARWIN: attributes [[ATTR1]] = { {{.*}}nounwind{{.*}}"target-features"{{.*}}  }
diff --git a/clang/test/Driver/aix-shared-lib-tls-model-opt.c b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
index 891caf4ed3fc..7acf091f0a04 100644
--- a/clang/test/Driver/aix-shared-lib-tls-model-opt.c
+++ b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
@@ -1,5 +1,5 @@
-// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
-// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
+// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
+// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 // RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 
@@ -19,8 +19,9 @@ int test(void) {
 
 // CHECK-AIX: test() #0 {
 // CHECK-AIX: attributes #0 = {
+// CHECK-AIX-OFF-SAME: -aix-shared-lib-tls-model-opt
 // CHECK-AIX-ON-SAME: +aix-shared-lib-tls-model-opt
 
-// CHECK-LINUX-NOT: {{[+]aix-shared-lib-tls-model-opt}}
+// CHECK-LINUX-NOT: {{[-+]aix-shared-lib-tls-model-opt}}
 
 // CHECK-UNSUPPORTED-TARGET: option '-maix-shared-lib-tls-model-opt' cannot be specified on this target
diff --git a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
index 6fc2b8efb4ae..1a0619b58e89 100644
--- a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
+++ b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
@@ -1,37 +1,37 @@
-// RUN: %clang --target=powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
-// RUN: %clang --target=powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
-// RUN: %clang --target=powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
-// RUN: %clang --target=powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
+// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
+// RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 
-// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
+// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALEXEC_TLS
 
-// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
+// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALDYNAMIC_TLS
 
-// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
-// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
@@ -39,9 +39,10 @@ int test(void) {
   return 0;
 }
 
-// CHECK-DEFAULT: test() #0 {
-// CHECK-DEFAULT: attributes #0 = {
-// CHECK-DEFAULT-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
+// CHECK-AIX-DEFAULT: test() #0 {
+// CHECK-AIX-DEFAULT: attributes #0 = {
+// CHECK-AIX-DEFAULT-SAME: {{-aix-small-local-exec-tls,.*-aix-small-local-dynamic-tls|-aix-small-local-dynamic-tls,.*-aix-small-local-exec-tls}}
+// CHECK-LINUX-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
 
 // CHECK-UNSUPPORTED-AIX32: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
 // CHECK-UNSUPPORTED-LINUX: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
diff --git a/clang/test/Driver/ppc-crbits.cpp b/clang/test/Driver/ppc-crbits.cpp
index 62893d3d0e87..3ed56308cb52 100644
--- a/clang/test/Driver/ppc-crbits.cpp
+++ b/clang/test/Driver/ppc-crbits.cpp
@@ -64,6 +64,8 @@
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
+// RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -emit-llvm \
+// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mno-crbits \
@@ -90,6 +92,8 @@
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
+// RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -emit-llvm \
+// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mno-crbits \
diff --git a/clang/test/Driver/ppc-isa-features.cpp b/clang/test/Driver/ppc-isa-features.cpp
index 35dbfbcdf569..92c5bc82f72b 100644
--- a/clang/test/Driver/ppc-isa-features.cpp
+++ b/clang/test/Driver/ppc-isa-features.cpp
@@ -5,20 +5,20 @@
 // RUN: %clang -target powerpc64-unknown-aix -mcpu=pwr9 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR9
 // RUN: %clang -target powerpc-unknown-aix -mcpu=pwr10 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR10
 
-// CHECK-PWR6-NOT: isa-v206-instructions
-// CHECK-PWR6-NOT: isa-v207-instructions
-// CHECK-PWR6-NOT: isa-v30-instructions
+// CHECK-PWR6: -isa-v206-instructions
+// CHECK-PWR6: -isa-v207-instructions
+// CHECK-PWR6: -isa-v30-instructions
 
-// CHECK-A2:     +isa-v206-instructions
-// CHECK-A2-NOT: isa-v207-instructions
-// CHECK-A2-NOT: isa-v30-instructions
+// CHECK-A2: +isa-v206-instructions
+// CHECK-A2: -isa-v207-instructions
+// CHECK-A2: -isa-v30-instructions
 
-// CHECK-PWR7:     +isa-v206-instructions
-// CHECK-PWR7-NOT: isa-v207-instructions
-// CHECK-PWR7-NOT: isa-v30-instructions
+// CHECK-PWR7: +isa-v206-instructions
+// CHECK-PWR7: -isa-v207-instructions
+// CHECK-PWR7: -isa-v30-instructions
 
-// CHECK-PWR8:     +isa-v207-instructions
-// CHECK-PWR8-NOT: isa-v30-instructions
+// CHECK-PWR8: +isa-v207-instructions
+// CHECK-PWR8: -isa-v30-instructions
 
 // CHECK-PWR9: +isa-v207-instructions
 // CHECK-PWR9: +isa-v30-instructions
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h
index d3d44afb5f54..59d9f867005a 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.h
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h
@@ -14,8 +14,6 @@
 #ifndef LLVM_TARGETPARSER_PPCTARGETPARSER_H
 #define LLVM_TARGETPARSER_PPCTARGETPARSER_H
 
-#include "TargetParser.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
@@ -39,10 +37,6 @@ LLVM_ABI StringRef getNormalizedPPCTuneCPU(const Triple &T,
 // For PPC, there are some cpu names for same CPU, like pwr10 and power10,
 // normalize them.
 LLVM_ABI StringRef normalizeCPUName(StringRef CPUName);
-
-LLVM_ABI std::optional<llvm::StringMap<bool>>
-getPPCDefaultTargetFeatures(const Triple &T, StringRef CPUName);
-
 } // namespace PPC
 } // namespace llvm
 
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index b4a92cc6b6c4..176205e17ae0 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -14,8 +14,6 @@
 #ifndef LLVM_TARGETPARSER_TARGETPARSER_H
 #define LLVM_TARGETPARSER_TARGETPARSER_H
 
-#include "SubtargetFeature.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
@@ -192,31 +190,6 @@ insertWaveSizeFeature(StringRef GPU, const Triple &T,
                       StringMap<bool> &Features);
 
 } // namespace AMDGPU
-
-struct BasicSubtargetFeatureKV {
-  const char *Key;         ///< K-V key string
-  unsigned Value;          ///< K-V integer value
-  FeatureBitArray Implies; ///< K-V bit mask
-};
-
-/// Used to provide key value pairs for feature and CPU bit flags.
-struct BasicSubtargetSubTypeKV {
-  const char *Key;         ///< K-V key string
-  FeatureBitArray Implies; ///< K-V bit mask
-
-  /// Compare routine for std::lower_bound
-  bool operator<(StringRef S) const { return StringRef(Key) < S; }
-
-  /// Compare routine for std::is_sorted.
-  bool operator<(const BasicSubtargetSubTypeKV &Other) const {
-    return StringRef(Key) < StringRef(Other.Key);
-  }
-};
-
-std::optional<llvm::StringMap<bool>>
-getCPUDefaultTargetFeatures(StringRef CPU,
-                            ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
-                            ArrayRef<BasicSubtargetFeatureKV> ProcFeatures);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index ea7c2203662b..fd850faf7b2f 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -411,6 +411,7 @@ def ProcessorFeatures {
      FeatureP8Altivec,
      FeatureP8Vector,
      FeatureP8Crypto,
+     FeatureHTM,
      FeatureDirectMove,
      FeatureICBT,
      FeaturePartwordAtomic,
@@ -421,7 +422,6 @@ def ProcessorFeatures {
     ];
 
   list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion,
-                                               FeatureHTM,
                                                FeatureAddisLoadFusion];
   list<SubtargetFeature> P8InheritableFeatures =
     !listconcat(P7InheritableFeatures, P8AdditionalFeatures);
@@ -443,7 +443,7 @@ def ProcessorFeatures {
   // dispatch for vector operations than scalar ones. For the time being,
   // this list also includes scheduling-related features since we do not have
   // enough info to create custom scheduling strategies for future CPUs.
-  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits, FeatureHTM];
+  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits];
   list<SubtargetFeature> P9InheritableFeatures =
     !listconcat(P8InheritableFeatures, P9AdditionalFeatures);
   list<SubtargetFeature> P9Features =
diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt
index 66aed45ff18c..8f8b3a578a1d 100644
--- a/llvm/lib/TargetParser/CMakeLists.txt
+++ b/llvm/lib/TargetParser/CMakeLists.txt
@@ -8,12 +8,6 @@ if (HAS_WERROR_GLOBAL_CTORS AND NOT LLVM_HAS_NOGLOBAL_CTOR_MUTEX)
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=global-constructors")
 endif()
 
-set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC/PPC.td)
-
-tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget -I${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC)
-add_public_tablegen_target(PPCGenSubtargetInfo)
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
 # Solaris code uses kstat, so specify dependency explicitly for shared builds.
 if (${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
   set(system_libs kstat)
@@ -47,5 +41,3 @@ add_llvm_component_library(LLVMTargetParser
   DEPENDS
   target_parser_gen
   )
-
-add_dependencies(LLVMTargetParser PPCGenSubtargetInfo)
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index 1b637b27be3d..422d758c772e 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -15,10 +15,6 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/TargetParser/Host.h"
 
-#define GET_SUBTARGETINFO_ENUM
-#define GET_SUBTARGETFEATURES_KV
-#include "PPCGenSubtargetInfo.inc"
-
 namespace llvm {
 namespace PPC {
 
@@ -121,26 +117,5 @@ StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName) {
   return getNormalizedPPCTargetCPU(T, CPUName);
 }
 
-std::optional<StringMap<bool>> getPPCDefaultTargetFeatures(const Triple &T,
-                                                           StringRef CPU) {
-  std::optional<StringMap<bool>> FeaturesOpt =
-      getCPUDefaultTargetFeatures(CPU, BasicPPCSubTypeKV, BasicPPCFeatureKV);
-
-  if (!FeaturesOpt.has_value())
-    return std::nullopt;
-
-  StringMap<bool> Features = FeaturesOpt.value();
-  // FIXME: We need to check for the processor model 8548, since the backend
-  // does not support this processor. When this processor model is implemented
-  // within the backend, the following code can be removed.
-  if (CPU == "8548")
-    Features["spe"] = true;
-
-  // The target feature `quadword-atomics` is only supported for 64-bit
-  // POWER8 and above.
-  if (Features.find("quadword-atomics") != Features.end() && !T.isArch64Bit())
-    Features["quadword-atomics"] = false;
-  return Features;
-}
 } // namespace PPC
 } // namespace llvm
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 03f7d3899c2e..7c54901dae47 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -18,53 +18,6 @@
 using namespace llvm;
 using namespace AMDGPU;
 
-/// Find KV in array using binary search.
-static const BasicSubtargetSubTypeKV *
-find(StringRef S, ArrayRef<BasicSubtargetSubTypeKV> A) {
-  // Binary search the array
-  auto F = llvm::lower_bound(A, S);
-  // If not found then return NULL
-  if (F == A.end() || StringRef(F->Key) != S)
-    return nullptr;
-  // Return the found array item
-  return F;
-}
-
-/// For each feature that is (transitively) implied by this feature, set it.
-static void setImpliedBits(FeatureBitset &Bits, const FeatureBitset &Implies,
-                           ArrayRef<BasicSubtargetFeatureKV> FeatureTable) {
-  // OR the Implies bits in outside the loop. This allows the Implies for CPUs
-  // which might imply features not in FeatureTable to use this.
-  Bits |= Implies;
-  for (const auto &FE : FeatureTable)
-    if (Implies.test(FE.Value))
-      setImpliedBits(Bits, FE.Implies.getAsBitset(), FeatureTable);
-}
-
-std::optional<llvm::StringMap<bool>> llvm::getCPUDefaultTargetFeatures(
-    StringRef CPU, ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
-    ArrayRef<BasicSubtargetFeatureKV> ProcFeatures) {
-  if (CPU.empty())
-    return std::nullopt;
-
-  const BasicSubtargetSubTypeKV *CPUEntry = ::find(CPU, ProcDesc);
-  if (!CPUEntry)
-    return std::nullopt;
-
-  // Set the features implied by this CPU feature if there is a match.
-  FeatureBitset Bits;
-  llvm::StringMap<bool> DefaultFeatures;
-  setImpliedBits(Bits, CPUEntry->Implies.getAsBitset(), ProcFeatures);
-
-  unsigned BitSize = Bits.size();
-  for (const BasicSubtargetFeatureKV &FE : ProcFeatures) {
-    assert(FE.Value < BitSize && "Target Feature is out of range");
-    if (Bits[FE.Value])
-      DefaultFeatures[FE.Key] = true;
-  }
-  return DefaultFeatures;
-}
-
 namespace {
 
 struct GPUInfo {
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index da41e981888a..ca008e256a70 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -89,10 +89,8 @@ class SubtargetEmitter {
 
   FeatureMapTy enumeration(raw_ostream &OS);
   void emitSubtargetInfoMacroCalls(raw_ostream &OS);
-  unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap,
-                            bool IsEmitBasic = false);
-  unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap,
-                        bool IsEmitBasic = false);
+  unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
+  unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
   unsigned cpuNames(raw_ostream &OS);
   void formItineraryStageString(const std::string &Names,
                                 const Record *ItinData, std::string &ItinString,
@@ -256,8 +254,7 @@ void SubtargetEmitter::emitSubtargetInfoMacroCalls(raw_ostream &OS) {
 // command line.
 //
 unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
-                                            const FeatureMapTy &FeatureMap,
-                                            bool IsEmitBasic) {
+                                            const FeatureMapTy &FeatureMap) {
   std::vector<const Record *> FeatureList =
       Records.getAllDerivedDefinitions("SubtargetFeature");
 
@@ -273,8 +270,7 @@ unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
 
   // Begin feature table.
   OS << "// Sorted (by key) array of values for CPU features.\n"
-     << "extern const llvm::" << (IsEmitBasic ? "Basic" : "")
-     << "SubtargetFeatureKV " << (IsEmitBasic ? "Basic" : "") << Target
+     << "extern const llvm::SubtargetFeatureKV " << Target
      << "FeatureKV[] = {\n";
 
   for (const Record *Feature : FeatureList) {
@@ -285,11 +281,9 @@ unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
 
     // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in
     // } }
-    OS << "  { " << "\"" << CommandLineName << "\", ";
-    if (!IsEmitBasic)
-      OS << "\"" << Desc << "\", ";
-
-    OS << Target << "::" << Name << ", ";
+    OS << "  { "
+       << "\"" << CommandLineName << "\", "
+       << "\"" << Desc << "\", " << Target << "::" << Name << ", ";
 
     ConstRecVec ImpliesList = Feature->getValueAsListOfDefs("Implies");
 
@@ -367,8 +361,7 @@ static void checkDuplicateCPUFeatures(StringRef CPUName,
 // line.
 //
 unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
-                                        const FeatureMapTy &FeatureMap,
-                                        bool IsEmitBasic) {
+                                        const FeatureMapTy &FeatureMap) {
   // Gather and sort processor information
   std::vector<const Record *> ProcessorList =
       Records.getAllDerivedDefinitions("Processor");
@@ -381,8 +374,7 @@ unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
 
   // Begin processor table.
   OS << "// Sorted (by key) array of values for CPU subtype.\n"
-     << "extern const llvm::" << (IsEmitBasic ? "Basic" : "")
-     << "SubtargetSubTypeKV " << (IsEmitBasic ? "Basic" : "") << Target
+     << "extern const llvm::SubtargetSubTypeKV " << Target
      << "SubTypeKV[] = {\n";
 
   for (const Record *Processor : ProcessorList) {
@@ -400,17 +392,13 @@ unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
        << "\"" << Name << "\", ";
 
     printFeatureMask(OS, FeatureList, FeatureMap);
+    OS << ", ";
+    printFeatureMask(OS, TuneFeatureList, FeatureMap);
 
-    if (!IsEmitBasic) {
-      OS << ", ";
-      printFeatureMask(OS, TuneFeatureList, FeatureMap);
-
-      // Emit the scheduler model pointer.
-      const std::string &ProcModelName =
-          SchedModels.getModelForProc(Processor).ModelName;
-      OS << ", &" << ProcModelName;
-    }
-    OS << " },\n";
+    // Emit the scheduler model pointer.
+    const std::string &ProcModelName =
+        SchedModels.getModelForProc(Processor).ModelName;
+    OS << ", &" << ProcModelName << " },\n";
   }
 
   // End processor table.
@@ -2052,14 +2040,6 @@ void SubtargetEmitter::run(raw_ostream &OS) {
   OS << "} // end namespace llvm\n\n";
   OS << "#endif // GET_SUBTARGETINFO_ENUM\n\n";
 
-  OS << "\n#ifdef GET_SUBTARGETFEATURES_KV\n";
-  OS << "#undef GET_SUBTARGETFEATURES_KV\n\n";
-  OS << "namespace llvm {\n";
-  featureKeyValues(OS, FeatureMap, true);
-  cpuKeyValues(OS, FeatureMap, true);
-  OS << "} // end namespace llvm\n\n";
-  OS << "#endif // GET_SUBTARGETFEATURES_KV\n\n";
-
   emitSubtargetInfoMacroCalls(OS);
 
   OS << "namespace llvm {\n";

From c19e900ce8b422f6b8c028fbbd9ef7e9d3720236 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234@gmail.com>
Date: Thu, 12 Jun 2025 16:02:51 -0400
Subject: [PATCH 0200/1322] [AArch64] Signed comparison using CMN is safe when
 the subtraction is nsw (#141993)

nsw means no signed wrap, and 0 - INT_MIN is a signed wrap.

Now, this is going to be a point I need to get out of the way:

So is it okay to always transform a > -b into cmn if it is a signed
comparison, even if b is INT_MIN because -INT_MIN is undefined, at least
in C, because unless fwrapv is specified, opt puts nsw on signed integer
operations, allowing for more folds anyway.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 19 ++++++--
 llvm/test/CodeGen/AArch64/cmp-to-cmn.ll       | 46 +++++++++++++++++++
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ac545534d728..5b9e699eaa40 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3392,8 +3392,19 @@ bool isLegalCmpImmed(APInt C) {
   return isLegalArithImmed(C.abs().getZExtValue());
 }
 
-static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
-  KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
+static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG) {
+  // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
+  if (Op->getFlags().hasNoSignedWrap())
+    return true;
+
+  // We can still figure out if the second operand is safe to use
+  // in a CMN instruction by checking if it is known to be not the minimum
+  // signed value. If it is not, then we can safely use CMN.
+  // Note: We can eventually remove this check and simply rely on
+  // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
+  // consistently sets them appropriately when making said nodes.
+
+  KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
   return !KnownSrc.getSignedMinValue().isMinSignedValue();
 }
 
@@ -3402,7 +3413,7 @@ static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
 // can be set differently by this operation. It comes down to whether
 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
 // everything is fine. If not then the optimization is wrong. Thus general
-// comparisons are only valid if op2 != 0.
+// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
 //
 // So, finally, the only LLVM-native comparisons that don't mention C or V
 // are the ones that aren't unsigned comparisons. They're the only ones we can
@@ -3411,7 +3422,7 @@ static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
          (isIntEqualitySetCC(CC) ||
           (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
-          (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
+          (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
 }
 
 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
diff --git a/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll b/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll
index c5fd9b63cce9..5765e0acae26 100644
--- a/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll
@@ -602,3 +602,49 @@ define i1 @almost_immediate_neg_ugt_64(i64 %x) {
   %cmp = icmp ugt i64 %x, -16773121
   ret i1 %cmp
 }
+
+define i1 @cmn_nsw(i32 %a, i32 %b) {
+; CHECK-LABEL: cmn_nsw:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmn w0, w1
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %sub = sub nsw i32 0, %b
+  %cmp = icmp sgt i32 %a, %sub
+  ret i1 %cmp
+}
+
+define i1 @cmn_nsw_64(i64 %a, i64 %b) {
+; CHECK-LABEL: cmn_nsw_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmn x0, x1
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %sub = sub nsw i64 0, %b
+  %cmp = icmp sgt i64 %a, %sub
+  ret i1 %cmp
+}
+
+define i1 @cmn_nsw_neg(i32 %a, i32 %b) {
+; CHECK-LABEL: cmn_nsw_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg w8, w1
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %sub = sub i32 0, %b
+  %cmp = icmp sgt i32 %a, %sub
+  ret i1 %cmp
+}
+
+define i1 @cmn_nsw_neg_64(i64 %a, i64 %b) {
+; CHECK-LABEL: cmn_nsw_neg_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg x8, x1
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %sub = sub i64 0, %b
+  %cmp = icmp sgt i64 %a, %sub
+  ret i1 %cmp
+}

From b1f5e26b78a9550a22ee2f24bb3f220d396c452f Mon Sep 17 00:00:00 2001
From: GeorgeHuyubo <113479859+GeorgeHuyubo@users.noreply.github.com>
Date: Thu, 12 Jun 2025 13:23:26 -0700
Subject: [PATCH 0201/1322] [lldb] Properly handle locate module callback when
 Target change arch (#143793)

Since this PR: https://github.com/llvm/llvm-project/pull/141670/ We
started to override the Platform/Arch for a target if needed. However we
may have already registered locate module callback with the old
platform.

This PR will move the locate module callback to the new Platform
whenever Target changes architecture.

Co-authored-by: George Hu <georgehuyubo@gmail.com>
---
 lldb/source/Target/Target.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 9660fc97970b..45a9e1196a04 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -1706,6 +1706,8 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform,
         if (PlatformSP arch_platform_sp =
                 GetDebugger().GetPlatformList().GetOrCreate(other, {},
                                                             &platform_arch)) {
+          arch_platform_sp->SetLocateModuleCallback(
+              platform_sp->GetLocateModuleCallback());
           SetPlatform(arch_platform_sp);
           if (platform_arch.IsValid())
             other = platform_arch;

From d65904675ea106713937c9cce24e3d1ec0bc570a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 12 Jun 2025 21:35:56 +0100
Subject: [PATCH 0202/1322] [LV] Move logic to create trip count check to
 helper (NFC).

Move the logic to create the iteration count check to a separate helper,
so it can be re-used by when creating the skeleton for epilogue
vectorization as well.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8177b76ad5bd..404ee6874d2a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -530,6 +530,9 @@ protected:
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
 
+  // Create a check to see if the vector loop should be executed
+  Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
+
   /// Emit a bypass check to see if the vector trip count is zero, including if
   /// it overflows.
   void emitIterationCountCheck(BasicBlock *Bypass);
@@ -2370,13 +2373,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
   }
 }
 
-void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
-  Value *Count = getTripCount();
-  // Reuse existing vector loop preheader for TC checks.
-  // Note that new preheader block is generated for vector loop.
-  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
-  IRBuilder<> Builder(TCCheckBlock->getTerminator());
-
+Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
+                                                      unsigned UF) const {
   // Generate code to check if the loop's trip count is less than VF * UF, or
   // equal to it in case a scalar epilogue is required; this implies that the
   // vector trip count is zero. This check also covers the case where adding one
@@ -2385,7 +2383,13 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
                                                        : ICmpInst::ICMP_ULT;
 
+  // Reuse existing vector loop preheader for TC checks.
+  // Note that new preheader block is generated for vector loop.
+  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  IRBuilder<> Builder(TCCheckBlock->getTerminator());
+
   // If tail is to be folded, vector loop takes care of all iterations.
+  Value *Count = getTripCount();
   Type *CountTy = Count->getType();
   Value *CheckMinIters = Builder.getFalse();
   auto CreateStep = [&]() -> Value * {
@@ -2434,7 +2438,12 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
     // Don't execute the vector loop if (UMax - n) < (VF * UF).
     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
   }
+  return CheckMinIters;
+}
 
+void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
+  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  Value *CheckMinIters = createIterationCountCheck(VF, UF);
   // Create new preheader for vector loop.
   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                    static_cast<DominatorTree *>(nullptr), LI,

From 8ee9646b06cd128a6c55f375e4df431aee053c76 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 12 Jun 2025 13:46:06 -0700
Subject: [PATCH 0203/1322] [LV] Simplify creation of
 vp.load/vp.store/vp.reduce intrinsics (#143804)

The use of VectorBuilder here was simply obscuring what was actually
going on. For vp.load and vp.store, the resulting code is significantly
more idiomatic. For the vp.reduce cases, we remove several layers of
indirection, including passing parameters via implicit state on the
builder. In both cases, the code is significantly easier to follow.
---
 llvm/include/llvm/IR/VectorBuilder.h          | 120 --------
 .../include/llvm/Transforms/Utils/LoopUtils.h |  11 +-
 llvm/lib/IR/CMakeLists.txt                    |   1 -
 llvm/lib/IR/VectorBuilder.cpp                 | 116 --------
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |  31 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  23 +-
 llvm/unittests/IR/CMakeLists.txt              |   1 -
 llvm/unittests/IR/VectorBuilderTest.cpp       | 279 ------------------
 8 files changed, 31 insertions(+), 551 deletions(-)
 delete mode 100644 llvm/include/llvm/IR/VectorBuilder.h
 delete mode 100644 llvm/lib/IR/VectorBuilder.cpp
 delete mode 100644 llvm/unittests/IR/VectorBuilderTest.cpp

diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
deleted file mode 100644
index bc23842d8e6b..000000000000
--- a/llvm/include/llvm/IR/VectorBuilder.h
+++ /dev/null
@@ -1,120 +0,0 @@
-//===- llvm/VectorBuilder.h - Builder for VP Intrinsics ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the VectorBuilder class, which is used as a convenient way
-// to create VP intrinsics as if they were LLVM instructions with a consistent
-// and simplified interface.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_VECTORBUILDER_H
-#define LLVM_IR_VECTORBUILDER_H
-
-#include "llvm/Support/Compiler.h"
-#include <llvm/IR/IRBuilder.h>
-#include <llvm/IR/InstrTypes.h>
-#include <llvm/IR/Instruction.h>
-#include <llvm/IR/Value.h>
-
-namespace llvm {
-
-class VectorBuilder {
-public:
-  enum class Behavior {
-    // Abort if the requested VP intrinsic could not be created.
-    // This is useful for strict consistency.
-    ReportAndAbort = 0,
-
-    // Return a default-initialized value if the requested VP intrinsic could
-    // not be created.
-    // This is useful for a defensive fallback to non-VP code.
-    SilentlyReturnNone = 1,
-  };
-
-private:
-  IRBuilderBase &Builder;
-  Behavior ErrorHandling;
-
-  // Explicit mask parameter.
-  Value *Mask;
-  // Explicit vector length parameter.
-  Value *ExplicitVectorLength;
-  // Compile-time vector length.
-  ElementCount StaticVectorLength;
-
-  // Get mask/evl value handles for the current configuration.
-  Value &requestMask();
-  Value &requestEVL();
-
-  LLVM_ABI void handleError(const char *ErrorMsg) const;
-  template <typename RetType>
-  RetType returnWithError(const char *ErrorMsg) const {
-    handleError(ErrorMsg);
-    return RetType();
-  }
-
-  /// Helper function for creating VP intrinsic call.
-  Value *createVectorInstructionImpl(Intrinsic::ID VPID, Type *ReturnTy,
-                                     ArrayRef<Value *> VecOpArray,
-                                     const Twine &Name = Twine());
-
-public:
-  VectorBuilder(IRBuilderBase &Builder,
-                Behavior ErrorHandling = Behavior::ReportAndAbort)
-      : Builder(Builder), ErrorHandling(ErrorHandling), Mask(nullptr),
-        ExplicitVectorLength(nullptr),
-        StaticVectorLength(ElementCount::getFixed(0)) {}
-
-  LLVM_ABI Module &getModule() const;
-  LLVMContext &getContext() const { return Builder.getContext(); }
-
-  // All-true mask for the currently configured explicit vector length.
-  LLVM_ABI Value *getAllTrueMask();
-
-  VectorBuilder &setMask(Value *NewMask) {
-    Mask = NewMask;
-    return *this;
-  }
-  VectorBuilder &setEVL(Value *NewExplicitVectorLength) {
-    ExplicitVectorLength = NewExplicitVectorLength;
-    return *this;
-  }
-  VectorBuilder &setStaticVL(unsigned NewFixedVL) {
-    StaticVectorLength = ElementCount::getFixed(NewFixedVL);
-    return *this;
-  }
-
-  /// Get the flags to be applied to created floating point ops.
-  const FastMathFlags &getFastMathFlags() const {
-    return Builder.getFastMathFlags();
-  }
-
-  // TODO: setStaticVL(ElementCount) for scalable types.
-
-  // Emit a VP intrinsic call that mimics a regular instruction.
-  // This operation behaves according to the VectorBuilderBehavior.
-  // \p Opcode      The functional instruction opcode of the emitted intrinsic.
-  // \p ReturnTy    The return type of the operation.
-  // \p VecOpArray  The operand list.
-  LLVM_ABI Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy,
-                                          ArrayRef<Value *> VecOpArray,
-                                          const Twine &Name = Twine());
-
-  /// Emit a VP reduction intrinsic call for recurrence kind.
-  /// \param RdxID       The intrinsic ID of llvm.vector.reduce.*
-  /// \param ValTy       The type of operand which the reduction operation is
-  ///                    performed.
-  /// \param VecOpArray  The operand list.
-  LLVM_ABI Value *createSimpleReduction(Intrinsic::ID RdxID, Type *ValTy,
-                                        ArrayRef<Value *> VecOpArray,
-                                        const Twine &Name = Twine());
-};
-
-} // namespace llvm
-
-#endif // LLVM_IR_VECTORBUILDER_H
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 6c0e06482a6d..12be3bad04d3 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -16,7 +16,6 @@
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/VectorBuilder.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
@@ -423,8 +422,9 @@ LLVM_ABI Value *createSimpleReduction(IRBuilderBase &B, Value *Src,
                                       RecurKind RdxKind);
 /// Overloaded function to generate vector-predication intrinsics for
 /// reduction.
-LLVM_ABI Value *createSimpleReduction(VectorBuilder &VB, Value *Src,
-                                      RecurKind RdxKind);
+LLVM_ABI Value *createSimpleReduction(IRBuilderBase &B, Value *Src,
+                                      RecurKind RdxKind, Value *Mask,
+                                      Value *EVL);
 
 /// Create a reduction of the given vector \p Src for a reduction of kind
 /// RecurKind::AnyOf. The start value of the reduction is \p InitVal.
@@ -442,8 +442,9 @@ LLVM_ABI Value *createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind,
                                        Value *Src, Value *Start);
 /// Overloaded function to generate vector-predication intrinsics for ordered
 /// reduction.
-LLVM_ABI Value *createOrderedReduction(VectorBuilder &VB, RecurKind RdxKind,
-                                       Value *Src, Value *Start);
+LLVM_ABI Value *createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind,
+                                       Value *Src, Value *Start, Value *Mask,
+                                       Value *EVL);
 
 /// Get the intersection (logical and) of all of the potential IR flags
 /// of each scalar operation (VL) that will be converted into a vector (I).
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index eb00829fd8c7..10572ff708bd 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -74,7 +74,6 @@ add_llvm_component_library(LLVMCore
   User.cpp
   Value.cpp
   ValueSymbolTable.cpp
-  VectorBuilder.cpp
   VectorTypeUtils.cpp
   Verifier.cpp
   VFABIDemangler.cpp
diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp
deleted file mode 100644
index 737f49b1334d..000000000000
--- a/llvm/lib/IR/VectorBuilder.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-//===- VectorBuilder.cpp - Builder for VP Intrinsics ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the VectorBuilder class, which is used as a convenient
-// way to create VP intrinsics as if they were LLVM instructions with a
-// consistent and simplified interface.
-//
-//===----------------------------------------------------------------------===//
-
-#include <llvm/ADT/SmallVector.h>
-#include <llvm/IR/FPEnv.h>
-#include <llvm/IR/Instructions.h>
-#include <llvm/IR/IntrinsicInst.h>
-#include <llvm/IR/Intrinsics.h>
-#include <llvm/IR/VectorBuilder.h>
-
-namespace llvm {
-
-void VectorBuilder::handleError(const char *ErrorMsg) const {
-  if (ErrorHandling == Behavior::SilentlyReturnNone)
-    return;
-  report_fatal_error(ErrorMsg);
-}
-
-Module &VectorBuilder::getModule() const {
-  return *Builder.GetInsertBlock()->getModule();
-}
-
-Value *VectorBuilder::getAllTrueMask() {
-  return Builder.getAllOnesMask(StaticVectorLength);
-}
-
-Value &VectorBuilder::requestMask() {
-  if (Mask)
-    return *Mask;
-
-  return *getAllTrueMask();
-}
-
-Value &VectorBuilder::requestEVL() {
-  if (ExplicitVectorLength)
-    return *ExplicitVectorLength;
-
-  assert(!StaticVectorLength.isScalable() && "TODO vscale lowering");
-  auto *IntTy = Builder.getInt32Ty();
-  return *ConstantInt::get(IntTy, StaticVectorLength.getFixedValue());
-}
-
-Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
-                                              ArrayRef<Value *> InstOpArray,
-                                              const Twine &Name) {
-  auto VPID = VPIntrinsic::getForOpcode(Opcode);
-  if (VPID == Intrinsic::not_intrinsic)
-    return returnWithError<Value *>("No VPIntrinsic for this opcode");
-  return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
-}
-
-Value *VectorBuilder::createSimpleReduction(Intrinsic::ID RdxID,
-                                            Type *ValTy,
-                                            ArrayRef<Value *> InstOpArray,
-                                            const Twine &Name) {
-  auto VPID = VPIntrinsic::getForIntrinsic(RdxID);
-  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
-         "No VPIntrinsic for this reduction");
-  return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
-}
-
-Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID,
-                                                  Type *ReturnTy,
-                                                  ArrayRef<Value *> InstOpArray,
-                                                  const Twine &Name) {
-  auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID);
-  auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID);
-  size_t NumInstParams = InstOpArray.size();
-  size_t NumVPParams =
-      NumInstParams + MaskPosOpt.has_value() + VLenPosOpt.has_value();
-
-  SmallVector<Value *, 6> IntrinParams;
-
-  // Whether the mask and vlen parameter are at the end of the parameter list.
-  bool TrailingMaskAndVLen =
-      std::min<size_t>(MaskPosOpt.value_or(NumInstParams),
-                       VLenPosOpt.value_or(NumInstParams)) >= NumInstParams;
-
-  if (TrailingMaskAndVLen) {
-    // Fast path for trailing mask, vector length.
-    IntrinParams.append(InstOpArray.begin(), InstOpArray.end());
-    IntrinParams.resize(NumVPParams);
-  } else {
-    IntrinParams.resize(NumVPParams);
-    // Insert mask and evl operands in between the instruction operands.
-    for (size_t VPParamIdx = 0, ParamIdx = 0; VPParamIdx < NumVPParams;
-         ++VPParamIdx) {
-      if (MaskPosOpt == VPParamIdx || VLenPosOpt == VPParamIdx)
-        continue;
-      assert(ParamIdx < NumInstParams);
-      IntrinParams[VPParamIdx] = InstOpArray[ParamIdx++];
-    }
-  }
-
-  if (MaskPosOpt)
-    IntrinParams[*MaskPosOpt] = &requestMask();
-  if (VLenPosOpt)
-    IntrinParams[*VLenPosOpt] = &requestEVL();
-
-  auto *VPDecl = VPIntrinsic::getOrInsertDeclarationForParams(
-      &getModule(), VPID, ReturnTy, IntrinParams);
-  return Builder.CreateCall(VPDecl, IntrinParams, Name);
-}
-
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index ff69fa9f70c4..cf6b183c78ac 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1319,18 +1319,19 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
   }
 }
 
-Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src,
-                                   RecurKind Kind) {
+Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
+                                   RecurKind Kind, Value *Mask, Value *EVL) {
   assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
          !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
          "AnyOf or FindLastIV reductions are not supported.");
   Intrinsic::ID Id = getReductionIntrinsicID(Kind);
-  auto *SrcTy = cast<VectorType>(Src->getType());
-  Type *SrcEltTy = SrcTy->getElementType();
-  Value *Iden =
-      getRecurrenceIdentity(Kind, SrcEltTy, VBuilder.getFastMathFlags());
-  Value *Ops[] = {Iden, Src};
-  return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+  auto VPID = VPIntrinsic::getForIntrinsic(Id);
+  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
+         "No VPIntrinsic for this reduction");
+  auto *EltTy = cast<VectorType>(Src->getType())->getElementType();
+  Value *Iden = getRecurrenceIdentity(Kind, EltTy, Builder.getFastMathFlags());
+  Value *Ops[] = {Iden, Src, Mask, EVL};
+  return Builder.CreateIntrinsic(EltTy, VPID, Ops);
 }
 
 Value *llvm::createOrderedReduction(IRBuilderBase &B, RecurKind Kind,
@@ -1343,17 +1344,21 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B, RecurKind Kind,
   return B.CreateFAddReduce(Start, Src);
 }
 
-Value *llvm::createOrderedReduction(VectorBuilder &VBuilder, RecurKind Kind,
-                                    Value *Src, Value *Start) {
+Value *llvm::createOrderedReduction(IRBuilderBase &Builder, RecurKind Kind,
+                                    Value *Src, Value *Start, Value *Mask,
+                                    Value *EVL) {
   assert((Kind == RecurKind::FAdd || Kind == RecurKind::FMulAdd) &&
          "Unexpected reduction kind");
   assert(Src->getType()->isVectorTy() && "Expected a vector type");
   assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
 
   Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd);
-  auto *SrcTy = cast<VectorType>(Src->getType());
-  Value *Ops[] = {Start, Src};
-  return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+  auto VPID = VPIntrinsic::getForIntrinsic(Id);
+  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
+         "No VPIntrinsic for this reduction");
+  auto *EltTy = cast<VectorType>(Src->getType())->getElementType();
+  Value *Ops[] = {Start, Src, Mask, EVL};
+  return Builder.CreateIntrinsic(EltTy, VPID, Ops);
 }
 
 void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa6b13c217bd..74472aaeb167 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -30,7 +30,6 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
-#include "llvm/IR/VectorBuilder.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -2524,21 +2523,17 @@ void VPReductionEVLRecipe::execute(VPTransformState &State) {
   Value *VecOp = State.get(getVecOp());
   Value *EVL = State.get(getEVL(), VPLane(0));
 
-  VectorBuilder VBuilder(Builder);
-  VBuilder.setEVL(EVL);
   Value *Mask;
-  // TODO: move the all-true mask generation into VectorBuilder.
   if (VPValue *CondOp = getCondOp())
     Mask = State.get(CondOp);
   else
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  VBuilder.setMask(Mask);
 
   Value *NewRed;
   if (isOrdered()) {
-    NewRed = createOrderedReduction(VBuilder, Kind, VecOp, Prev);
+    NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
   } else {
-    NewRed = createSimpleReduction(VBuilder, VecOp, Kind);
+    NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
     else
@@ -3086,10 +3081,8 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                 nullptr, "wide.masked.gather");
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Load, DataTy, Addr, "vp.op.load"));
+    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
+                                    {Addr, Mask, EVL}, nullptr, "vp.op.load");
   }
   NewLI->addParamAttr(
       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
@@ -3204,11 +3197,9 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
                                     Intrinsic::vp_scatter,
                                     {StoredVal, Addr, Mask, EVL});
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Store, Type::getVoidTy(EVL->getContext()),
-        {StoredVal, Addr}));
+    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+                                    Intrinsic::vp_store,
+                                    {StoredVal, Addr, Mask, EVL});
   }
   NewSI->addParamAttr(
       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt
index bea6b1b46f57..b66eae93f933 100644
--- a/llvm/unittests/IR/CMakeLists.txt
+++ b/llvm/unittests/IR/CMakeLists.txt
@@ -50,7 +50,6 @@ add_llvm_unittest(IRTests
   ValueHandleTest.cpp
   ValueMapTest.cpp
   ValueTest.cpp
-  VectorBuilderTest.cpp
   VectorTypeUtilsTest.cpp
   VectorTypesTest.cpp
   VerifierTest.cpp
diff --git a/llvm/unittests/IR/VectorBuilderTest.cpp b/llvm/unittests/IR/VectorBuilderTest.cpp
deleted file mode 100644
index e01378a2755f..000000000000
--- a/llvm/unittests/IR/VectorBuilderTest.cpp
+++ /dev/null
@@ -1,279 +0,0 @@
-//===--------- VectorBuilderTest.cpp - VectorBuilder unit tests -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/VectorBuilder.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-
-namespace {
-
-static unsigned VectorNumElements = 8;
-
-class VectorBuilderTest : public testing::Test {
-protected:
-  LLVMContext Context;
-
-  VectorBuilderTest() : Context() {}
-
-  std::unique_ptr<Module> createBuilderModule(Function *&Func, BasicBlock *&BB,
-                                              Value *&Mask, Value *&EVL) {
-    auto Mod = std::make_unique<Module>("TestModule", Context);
-    auto *Int32Ty = Type::getInt32Ty(Context);
-    auto *Mask8Ty =
-        FixedVectorType::get(Type::getInt1Ty(Context), VectorNumElements);
-    auto *VoidFuncTy =
-        FunctionType::get(Type::getVoidTy(Context), {Mask8Ty, Int32Ty}, false);
-    Func =
-        Function::Create(VoidFuncTy, GlobalValue::ExternalLinkage, "bla", *Mod);
-    Mask = Func->getArg(0);
-    EVL = Func->getArg(1);
-    BB = BasicBlock::Create(Context, "entry", Func);
-
-    return Mod;
-  }
-};
-
-/// Check that creating binary arithmetic VP intrinsics works.
-TEST_F(VectorBuilderTest, TestCreateBinaryInstructions) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setMask(Mask).setEVL(EVL);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-  auto *IntVecTy =
-      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
-
-#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
-  {                                                                            \
-    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
-    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
-    auto *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
-    Value *Op = PoisonValue::get(ValueTy);                                      \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
-                                             {Op, Op});                        \
-    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
-    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
-    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
-    ASSERT_EQ(VPIntrin->getMaskParam(), Mask);                                 \
-    ASSERT_EQ(VPIntrin->getVectorLengthParam(), EVL);                          \
-  }
-#include "llvm/IR/Instruction.def"
-}
-
-static bool isAllTrueMask(Value *Val, unsigned NumElements) {
-  auto *ConstMask = dyn_cast<Constant>(Val);
-  if (!ConstMask)
-    return false;
-
-  // Structure check.
-  if (!ConstMask->isAllOnesValue())
-    return false;
-
-  // Type check.
-  auto *MaskVecTy = cast<FixedVectorType>(ConstMask->getType());
-  if (MaskVecTy->getNumElements() != NumElements)
-    return false;
-
-  return MaskVecTy->getElementType()->isIntegerTy(1);
-}
-
-/// Check that creating binary arithmetic VP intrinsics works.
-TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoMask) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setEVL(EVL).setStaticVL(VectorNumElements);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-  auto *IntVecTy =
-      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
-
-#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
-  {                                                                            \
-    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
-    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
-    Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
-    Value *Op = PoisonValue::get(ValueTy);                                      \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
-                                             {Op, Op});                        \
-    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
-    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
-    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
-    ASSERT_TRUE(isAllTrueMask(VPIntrin->getMaskParam(), VectorNumElements));   \
-    ASSERT_EQ(VPIntrin->getVectorLengthParam(), EVL);                          \
-  }
-#include "llvm/IR/Instruction.def"
-}
-
-static bool isLegalConstEVL(Value *Val, unsigned ExpectedEVL) {
-  auto *ConstEVL = dyn_cast<ConstantInt>(Val);
-  if (!ConstEVL)
-    return false;
-
-  // Value check.
-  if (ConstEVL->getZExtValue() != ExpectedEVL)
-    return false;
-
-  // Type check.
-  return ConstEVL->getType()->isIntegerTy(32);
-}
-
-/// Check that creating binary arithmetic VP intrinsics works.
-TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoEVL) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setMask(Mask).setStaticVL(VectorNumElements);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-  auto *IntVecTy =
-      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
-
-#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
-  {                                                                            \
-    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
-    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
-    Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
-    Value *Op = PoisonValue::get(ValueTy);                                      \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
-                                             {Op, Op});                        \
-    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
-    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
-    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
-    ASSERT_EQ(VPIntrin->getMaskParam(), Mask);                                 \
-    ASSERT_TRUE(                                                               \
-        isLegalConstEVL(VPIntrin->getVectorLengthParam(), VectorNumElements)); \
-  }
-#include "llvm/IR/Instruction.def"
-}
-
-/// Check that creating binary arithmetic VP intrinsics works.
-TEST_F(VectorBuilderTest,
-       TestCreateBinaryInstructions_FixedVector_NoMask_NoEVL) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setStaticVL(VectorNumElements);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-  auto *IntVecTy =
-      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
-
-#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
-  {                                                                            \
-    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
-    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
-    Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
-    Value *Op = PoisonValue::get(ValueTy);                                      \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
-                                             {Op, Op});                        \
-    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
-    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
-    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
-    ASSERT_TRUE(isAllTrueMask(VPIntrin->getMaskParam(), VectorNumElements));   \
-    ASSERT_TRUE(                                                               \
-        isLegalConstEVL(VPIntrin->getVectorLengthParam(), VectorNumElements)); \
-  }
-#include "llvm/IR/Instruction.def"
-}
-/// Check that creating vp.load/vp.store works.
-TEST_F(VectorBuilderTest, TestCreateLoadStore) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setMask(Mask).setEVL(EVL);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-
-  Value *FloatVecPtr = PoisonValue::get(Builder.getPtrTy(0));
-  Value *FloatVec = PoisonValue::get(FloatVecTy);
-
-  // vp.load
-  auto LoadVPID = VPIntrinsic::getForOpcode(Instruction::Load);
-  auto *LoadIntrin = VBuild.createVectorInstruction(Instruction::Load,
-                                                    FloatVecTy, {FloatVecPtr});
-  ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin));
-  auto *VPLoad = cast<VPIntrinsic>(LoadIntrin);
-  ASSERT_EQ(VPLoad->getIntrinsicID(), LoadVPID);
-  ASSERT_EQ(VPLoad->getMemoryPointerParam(), FloatVecPtr);
-
-  // vp.store
-  auto *VoidTy = Builder.getVoidTy();
-  auto StoreVPID = VPIntrinsic::getForOpcode(Instruction::Store);
-  auto *StoreIntrin = VBuild.createVectorInstruction(Instruction::Store, VoidTy,
-                                                     {FloatVec, FloatVecPtr});
-  ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin));
-  auto *VPStore = cast<VPIntrinsic>(StoreIntrin);
-  ASSERT_EQ(VPStore->getIntrinsicID(), StoreVPID);
-  ASSERT_EQ(VPStore->getMemoryPointerParam(), FloatVecPtr);
-  ASSERT_EQ(VPStore->getMemoryDataParam(), FloatVec);
-}
-
-/// Check that the SilentlyReturnNone error handling mode works.
-TEST_F(VectorBuilderTest, TestFail_SilentlyReturnNone) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  auto *VoidTy = Builder.getVoidTy();
-  VectorBuilder VBuild(Builder, VectorBuilder::Behavior::SilentlyReturnNone);
-  VBuild.setMask(Mask).setEVL(EVL);
-  auto *Val = VBuild.createVectorInstruction(Instruction::Br, VoidTy, {});
-  ASSERT_EQ(Val, nullptr);
-}
-
-/// Check that the ReportAndFail error handling mode aborts as advertised.
-TEST_F(VectorBuilderTest, TestFail_ReportAndAbort) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  auto *VoidTy = Builder.getVoidTy();
-  VectorBuilder VBuild(Builder, VectorBuilder::Behavior::ReportAndAbort);
-  VBuild.setMask(Mask).setEVL(EVL);
-  ASSERT_DEATH({ VBuild.createVectorInstruction(Instruction::Br, VoidTy, {}); },
-               "No VPIntrinsic for this opcode");
-}
-
-} // end anonymous namespace

From 741ea80446e21b4052d723765011fe3583d3fc7f Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Thu, 12 Jun 2025 20:46:30 +0000
Subject: [PATCH 0204/1322] [gn build] Port 8ee9646b06cd

---
 llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn       | 1 -
 llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
index 4f103d30f300..22aa0b641813 100644
--- a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
@@ -90,7 +90,6 @@ static_library("IR") {
     "VFABIDemangler.cpp",
     "Value.cpp",
     "ValueSymbolTable.cpp",
-    "VectorBuilder.cpp",
     "VectorTypeUtils.cpp",
     "Verifier.cpp",
   ]
diff --git a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
index 0f34231ae321..0d162ff0f9d5 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
@@ -54,7 +54,6 @@ unittest("IRTests") {
     "ValueHandleTest.cpp",
     "ValueMapTest.cpp",
     "ValueTest.cpp",
-    "VectorBuilderTest.cpp",
     "VectorTypeUtilsTest.cpp",
     "VectorTypesTest.cpp",
     "VerifierTest.cpp",

From 8a8ea8fec063bd64c17e463e7c3eaae5cdb4a645 Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Thu, 12 Jun 2025 13:55:44 -0700
Subject: [PATCH 0205/1322] =?UTF-8?q?Reland=20"[lldb][headers]=20Create=20?=
 =?UTF-8?q?Python=20script=20to=20fix=20up=20framework=20head=E2=80=A6=20(?=
 =?UTF-8?q?#143945)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ers" (#143941)

Reland the script that converts lldb headers to RPC headers. The RPC
test was failing due to the incorrect input filepath being used.

Original commit message:
This commit replaces the shell script that fixes up includes for the
LLDB framework with a Python script. This script will also be used when
fixing up includes for the LLDBRPC.framework.
---
 lldb/cmake/modules/LLDBFramework.cmake        |  42 +++---
 lldb/scripts/framework-header-fix.py          | 126 ++++++++++++++++++
 .../Shell/Scripts/Inputs/Main/SBAddress.h     |  13 ++
 .../Shell/Scripts/Inputs/RPC/RPCSBAddress.h   |   9 ++
 .../Shell/Scripts/TestFrameworkFixScript.test |  11 ++
 .../Scripts/TestFrameworkFixUnifdef.test      |  12 ++
 .../Scripts/TestRPCFrameworkFixScript.test    |  14 ++
 7 files changed, 206 insertions(+), 21 deletions(-)
 create mode 100755 lldb/scripts/framework-header-fix.py
 create mode 100644 lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
 create mode 100644 lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
 create mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixScript.test
 create mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
 create mode 100644 lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test

diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake
index 8961b1afe93a..70010ffbf738 100644
--- a/lldb/cmake/modules/LLDBFramework.cmake
+++ b/lldb/cmake/modules/LLDBFramework.cmake
@@ -68,24 +68,17 @@ if(NOT APPLE_EMBEDDED)
   )
 endif()
 
-# At configuration time, collect headers for the framework bundle and copy them
-# into a staging directory. Later we can copy over the entire folder.
-file(GLOB public_headers ${LLDB_SOURCE_DIR}/include/lldb/API/*.h)
-set(generated_public_headers ${LLDB_OBJ_DIR}/include/lldb/API/SBLanguages.h)
-file(GLOB root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-*.h)
-file(GLOB root_private_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-private*.h)
-list(REMOVE_ITEM root_public_headers ${root_private_headers})
-
 find_program(unifdef_EXECUTABLE unifdef)
 
-set(lldb_header_staging ${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders)
-foreach(header
-    ${public_headers}
-    ${generated_public_headers}
-    ${root_public_headers})
+# All necessary header files will be staged in the include directory in the build directory,
+# so just copy the files from there into the framework's staging directory.
+set(lldb_build_dir_header_staging "${CMAKE_BINARY_DIR}/include/lldb")
+set(lldb_framework_header_staging "${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders")
+file(GLOB lldb_build_dir_header_staging_list ${lldb_build_dir_header_staging}/*)
+foreach(header ${lldb_build_dir_header_staging_list})
 
   get_filename_component(basename ${header} NAME)
-  set(staged_header ${lldb_header_staging}/${basename})
+  set(staged_header ${lldb_framework_header_staging}/${basename})
 
   if(unifdef_EXECUTABLE)
     # unifdef returns 0 when the file is unchanged and 1 if something was changed.
@@ -112,13 +105,20 @@ set_target_properties(liblldb-resource-headers PROPERTIES FOLDER "LLDB/Resources
 add_dependencies(liblldb-resource-headers liblldb-header-staging)
 add_dependencies(liblldb liblldb-resource-headers)
 
-# At build time, copy the staged headers into the framework bundle (and do
-# some post-processing in-place).
-add_custom_command(TARGET liblldb POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy_directory ${lldb_header_staging} $<TARGET_FILE_DIR:liblldb>/Headers
-  COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.sh $<TARGET_FILE_DIR:liblldb>/Headers ${LLDB_VERSION}
-  COMMENT "LLDB.framework: copy framework headers"
-)
+# Take the headers from the staging directory and fix up their includes for the framework.
+# Then write them to the output directory.
+# Also, run unifdef to remove any specified guards from the header files.
+file(GLOB lldb_framework_header_staging_list ${lldb_framework_header_staging}/*)
+foreach(header ${lldb_framework_header_staging_list})
+
+  set(input_header ${header})
+  set(output_header $<TARGET_FILE_DIR:liblldb>/Headers/${input_header})
+
+  add_custom_command(TARGET liblldb POST_BUILD
+    COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.py -f lldb_main -i ${input_header} -o ${output_header} -p ${unifdef_EXECUTABLE} USWIG
+    COMMENT "LLDB.framework: Fix up and copy framework headers"
+  )
+endforeach()
 
 # Copy vendor-specific headers from clang (without staging).
 if(NOT APPLE_EMBEDDED)
diff --git a/lldb/scripts/framework-header-fix.py b/lldb/scripts/framework-header-fix.py
new file mode 100755
index 000000000000..9528fdb7e30b
--- /dev/null
+++ b/lldb/scripts/framework-header-fix.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+
+"""
+Usage: <path/to/input-directory> <path/to/output-directory>
+
+This script is used when building LLDB.framework or LLDBRPC.framework. For each framework, local includes are converted to their respective framework includes.
+
+This script is used in 2 ways:
+1. It is used on header files that are copied into LLDB.framework. For these files, local LLDB includes are converted into framework includes, e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>.
+
+2. It is used on header files for LLDBRPC.framework. For these files, includes of RPC common files will be converted to framework includes, e.g. #include <lldb-rpc/common/RPCCommon.h> -> #include <LLDBRPC/RPCCommon.h>. It will also change local includes to framework includes, e.g. #include "SBAddress.h" -> #include <LLDBRPC/SBAddress.h>
+"""
+
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+
+# Main header regexes
+INCLUDE_FILENAME_REGEX = re.compile(
+    r'#include "lldb/API/(?P<include_filename>.*){0,1}"'
+)
+
+# RPC header regexes
+RPC_COMMON_REGEX = re.compile(r"#include <lldb-rpc/common/(?P<include_filename>.*)>")
+RPC_INCLUDE_FILENAME_REGEX = re.compile(r'#include "(?P<include_filename>.*)"')
+
+
+def modify_rpc_includes(input_file_path, output_file_path):
+    with open(input_file_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+        with open(output_file_path, "w") as output_file:
+            # Local includes must be changed to RPC framework level includes.
+            # e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
+            # Also, RPC common code includes must change to RPC framework level includes.
+            # e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
+            rpc_common_matches = RPC_COMMON_REGEX.finditer(file_buffer)
+            rpc_include_filename_matches = RPC_INCLUDE_FILENAME_REGEX.finditer(
+                file_buffer
+            )
+            for match in rpc_common_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+            for match in rpc_include_filename_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+            output_file.write(file_buffer)
+
+
+def modify_main_includes(input_file_path, output_file_path):
+    with open(input_file_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+        with open(output_file_path, "w") as output_file:
+            # Local includes must be changed to framework level includes.
+            # e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+            regex_matches = INCLUDE_FILENAME_REGEX.finditer(file_buffer)
+            for match in regex_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDB/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+                output_file.write(file_buffer)
+
+
+def remove_guards(output_file_path, unifdef_path, unifdef_guards):
+    # The unifdef path should be passed in from CMake. If it wasn't there in CMake or is incorrect,
+    # find it using shutil. If shutil can't find it, then exit.
+    if not shutil.which(unifdef_path):
+        unifdef_path = shutil.which("unifdef")
+    if not unifdef_path:
+        print(
+            "Unable to find unifdef executable. Guards will not be removed from input files. Exiting..."
+        )
+        sys.exit()
+
+    subprocess_command = (
+        [unifdef_path, "-o", output_file_path] + unifdef_guards + [output_file_path]
+    )
+    subprocess.run(subprocess_command)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f", "--framework", choices=["lldb_main", "lldb_rpc"])
+    parser.add_argument("-i", "--input_file")
+    parser.add_argument("-o", "--output_file")
+    parser.add_argument("-p", "--unifdef_path")
+    parser.add_argument(
+        "unifdef_guards",
+        nargs="+",
+        type=str,
+        help="Guards to be removed with unifdef. These must be specified in the same way as they would be when passed directly into unifdef.",
+    )
+    args = parser.parse_args()
+    input_file_path = str(args.input_file)
+    output_file_path = str(args.output_file)
+    framework_version = args.framework
+    unifdef_path = str(args.unifdef_path)
+    # Prepend dashes to the list of guards passed in from the command line.
+    # unifdef takes the guards to remove as arguments in their own right (e.g. -USWIG)
+    # but passing them in with dashes for this script causes argparse to think that they're
+    # arguments in and of themself, so they need to passed in without dashes.
+    unifdef_guards = ["-" + guard for guard in args.unifdef_guards]
+
+    if framework_version == "lldb_main":
+        modify_main_includes(input_file_path, output_file_path)
+    if framework_version == "lldb_rpc":
+        modify_rpc_includes(input_file_path, output_file_path)
+    # After the incldues have been modified, run unifdef on the headers to remove any guards
+    # specified at the command line.
+    remove_guards(output_file_path, unifdef_path, unifdef_guards)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
new file mode 100644
index 000000000000..fecc69687cd7
--- /dev/null
+++ b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
@@ -0,0 +1,13 @@
+// This is a truncated version of an SB API file
+// used to test framework-header-fix.py to make sure the includes are correctly fixed
+// up for the LLDB.framework.
+
+// Local includes must be changed to framework level includes.
+// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+#include "lldb/API/SBDefines.h"
+#include "lldb/API/SBModule.h"
+
+// Any include guards specified at the command line must be removed.
+#ifndef SWIG
+int a = 10
+#endif
diff --git a/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
new file mode 100644
index 000000000000..556afa38a922
--- /dev/null
+++ b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
@@ -0,0 +1,9 @@
+// This is a truncated version of an SB API file generated by lldb-rpc-gen
+// used to test framework-header-fix.py to make sure the includes are correctly fixed
+// up for the LLDBRPC.framework.
+
+// Local includes must be changed to framework level includes.
+// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+#include "LLDBRPC.h"
+#include "SBDefines.h"
+#include <lldb-rpc/common/RPCPublic.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
new file mode 100644
index 000000000000..e90c3bdfc5ad
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
@@ -0,0 +1,11 @@
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
+
+# Local includes must be changed to framework level includes.
+# e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+CHECK: #include <LLDB/SBDefines.h>
+CHECK: #include <LLDB/SBModule.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
new file mode 100644
index 000000000000..a7e82d2f3640
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
@@ -0,0 +1,12 @@
+# REQUIRES: system-darwin
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
+
+# Any include guards specified at the command line must be removed.
+CHECK-NOT: #ifndef SWIG
+CHECK: int a = 10
+CHECK-NOT: #endif
diff --git a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
new file mode 100644
index 000000000000..d01594265396
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
@@ -0,0 +1,14 @@
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_rpc -i %p/Inputs/RPC/RPCSBAddress.h -o %t/Outputs/RPCSBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/RPCSBAddress.h | FileCheck %s
+
+# Local includes must be changed to RPC framework level includes.
+# e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
+# Also, RPC common code includes must change to RPC framework level includes.
+# e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
+CHECK: #include <LLDBRPC/LLDBRPC.h>
+CHECK: #include <LLDBRPC/SBDefines.h>
+CHECK: #include <LLDBRPC/RPCPublic.h>

From 6f3e2c076d6e3abac9cfd756e95a1ebb5979dd88 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Thu, 12 Jun 2025 14:08:50 -0700
Subject: [PATCH 0206/1322] [MSAN] fork avx512vl-intrinsics and x86-vpermi2
 tests (#143643)

---
 .../X86/avx512vl-intrinsics.ll                | 12306 ++++++++++++++++
 .../MemorySanitizer/X86/x86-vpermi2.ll        |   722 +
 2 files changed, 13028 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
new file mode 100644
index 000000000000..14d68b449a7b
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
@@ -0,0 +1,12306 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mattr=+avx512f -passes=msan -S | FileCheck %s
+
+; Forked from llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <2 x double> @test_mask_compress_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_compress_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> [[DATA]], <2 x double> [[PASSTHRU]], <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> %passthru, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_maskz_compress_pd_128(<2 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_maskz_compress_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> [[DATA]], <2 x double> zeroinitializer, <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> zeroinitializer, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_compress_pd_128(<2 x double> %data, <2 x double> %data2) #0 {
+; CHECK-LABEL: define <2 x double> @test_compress_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> [[DATA]], <2 x double> [[DATA2]], <2 x i1> splat (i1 true))
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> %data2, <2 x i1> <i1 true, i1 true>)
+  ret <2 x double> %1
+}
+
+define <4 x float> @test_mask_compress_ps_128(<4 x float> %data, <4 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_compress_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> [[DATA]], <4 x float> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> %passthru, <4 x i1> %extract)
+  ret <4 x float> %2
+}
+
+define <4 x float> @test_maskz_compress_ps_128(<4 x float> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_maskz_compress_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> [[DATA]], <4 x float> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> zeroinitializer, <4 x i1> %extract)
+  ret <4 x float> %2
+}
+
+define <4 x float> @test_compress_ps_128(<4 x float> %data, <4 x float> %data2) #0 {
+; CHECK-LABEL: define <4 x float> @test_compress_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> [[DATA]], <4 x float> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x float> %1
+}
+
+define <2 x i64> @test_mask_compress_q_128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_mask_compress_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> [[DATA]], <2 x i64> [[PASSTHRU]], <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> %passthru, <2 x i1> %extract)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_maskz_compress_q_128(<2 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_maskz_compress_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> [[DATA]], <2 x i64> zeroinitializer, <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> zeroinitializer, <2 x i1> %extract)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_compress_q_128(<2 x i64> %data, <2 x i64> %data2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_compress_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> [[DATA]], <2 x i64> [[DATA2]], <2 x i1> splat (i1 true))
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> %data2, <2 x i1> <i1 true, i1 true>)
+  ret <2 x i64> %1
+}
+
+define <4 x i32> @test_mask_compress_d_128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_mask_compress_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> [[DATA]], <4 x i32> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> %passthru, <4 x i1> %extract)
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @test_maskz_compress_d_128(<4 x i32> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_maskz_compress_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> [[DATA]], <4 x i32> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> zeroinitializer, <4 x i1> %extract)
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @test_compress_d_128(<4 x i32> %data, <4 x i32> %data2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_compress_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> [[DATA]], <4 x i32> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x i32> %1
+}
+
+define <2 x double> @test_expand_pd_128(<2 x double> %data, <2 x double> %data2) #0 {
+; CHECK-LABEL: define <2 x double> @test_expand_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> [[DATA]], <2 x double> [[DATA2]], <2 x i1> splat (i1 true))
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> %data2, <2 x i1> <i1 true, i1 true>)
+  ret <2 x double> %1
+}
+
+define <2 x double> @test_mask_expand_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_expand_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> [[DATA]], <2 x double> [[PASSTHRU]], <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> %passthru, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_maskz_expand_pd_128(<2 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_maskz_expand_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> [[DATA]], <2 x double> zeroinitializer, <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> zeroinitializer, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <4 x float> @test_expand_ps_128(<4 x float> %data, <4 x float> %data2) #0 {
+; CHECK-LABEL: define <4 x float> @test_expand_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> [[DATA]], <4 x float> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_expand_ps_128(<4 x float> %data, <4 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_expand_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> [[DATA]], <4 x float> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> %passthru, <4 x i1> %extract)
+  ret <4 x float> %2
+}
+
+define <4 x float> @test_maskz_expand_ps_128(<4 x float> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_maskz_expand_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> [[DATA]], <4 x float> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> zeroinitializer, <4 x i1> %extract)
+  ret <4 x float> %2
+}
+
+define <2 x i64> @test_expand_q_128(<2 x i64> %data, <2 x i64> %data2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_expand_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> [[DATA]], <2 x i64> [[DATA2]], <2 x i1> splat (i1 true))
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> %data2, <2 x i1> <i1 true, i1 true>)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_mask_expand_q_128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_mask_expand_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> [[DATA]], <2 x i64> [[PASSTHRU]], <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> %passthru, <2 x i1> %extract)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_maskz_expand_q_128(<2 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_maskz_expand_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> [[DATA]], <2 x i64> zeroinitializer, <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> zeroinitializer, <2 x i1> %extract)
+  ret <2 x i64> %2
+}
+
+define <4 x i32> @test_expand_d_128(<4 x i32> %data, <4 x i32> %data2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_expand_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> [[DATA]], <4 x i32> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @test_mask_expand_d_128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_mask_expand_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> [[DATA]], <4 x i32> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> %passthru, <4 x i1> %extract)
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @test_maskz_expand_d_128(<4 x i32> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_maskz_expand_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> [[DATA]], <4 x i32> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> zeroinitializer, <4 x i1> %extract)
+  ret <4 x i32> %2
+}
+
+define <4 x double> @test_mask_compress_pd_256(<4 x double> %data, <4 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_compress_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> [[DATA]], <4 x double> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> %passthru, <4 x i1> %extract)
+  ret <4 x double> %2
+}
+
+define <4 x double> @test_maskz_compress_pd_256(<4 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_maskz_compress_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> [[DATA]], <4 x double> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> zeroinitializer, <4 x i1> %extract)
+  ret <4 x double> %2
+}
+
+define <4 x double> @test_compress_pd_256(<4 x double> %data, <4 x double> %data2) #0 {
+; CHECK-LABEL: define <4 x double> @test_compress_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> [[DATA]], <4 x double> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x double> %1
+}
+
+define <8 x float> @test_mask_compress_ps_256(<8 x float> %data, <8 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_compress_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> [[DATA]], <8 x float> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> %passthru, <8 x i1> %1)
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_maskz_compress_ps_256(<8 x float> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_maskz_compress_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> [[DATA]], <8 x float> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> zeroinitializer, <8 x i1> %1)
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_compress_ps_256(<8 x float> %data, <8 x float> %data2) #0 {
+; CHECK-LABEL: define <8 x float> @test_compress_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> [[DATA]], <8 x float> [[DATA2]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> %data2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <8 x float> %1
+}
+
+define <4 x i64> @test_mask_compress_q_256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_mask_compress_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> [[DATA]], <4 x i64> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> %passthru, <4 x i1> %extract)
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_maskz_compress_q_256(<4 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_maskz_compress_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> [[DATA]], <4 x i64> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> zeroinitializer, <4 x i1> %extract)
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_compress_q_256(<4 x i64> %data, <4 x i64> %data2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_compress_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> [[DATA]], <4 x i64> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x i64> %1
+}
+
+define <8 x i32> @test_mask_compress_d_256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_mask_compress_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> [[DATA]], <8 x i32> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> %passthru, <8 x i1> %1)
+  ret <8 x i32> %2
+}
+
+define <8 x i32> @test_maskz_compress_d_256(<8 x i32> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_maskz_compress_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> [[DATA]], <8 x i32> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> zeroinitializer, <8 x i1> %1)
+  ret <8 x i32> %2
+}
+
+define <8 x i32> @test_compress_d_256(<8 x i32> %data, <8 x i32> %data2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_compress_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> [[DATA]], <8 x i32> [[DATA2]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> %data2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <8 x i32> %1
+}
+
+define <4 x double> @test_expand_pd_256(<4 x double> %data, <4 x double> %data2) #0 {
+; CHECK-LABEL: define <4 x double> @test_expand_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> [[DATA]], <4 x double> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x double> %1
+}
+
+define <4 x double> @test_mask_expand_pd_256(<4 x double> %data, <4 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_expand_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> [[DATA]], <4 x double> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> %passthru, <4 x i1> %extract)
+  ret <4 x double> %2
+}
+
+define <4 x double> @test_maskz_expand_pd_256(<4 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_maskz_expand_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> [[DATA]], <4 x double> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> zeroinitializer, <4 x i1> %extract)
+  ret <4 x double> %2
+}
+
+define <8 x float> @test_expand_ps_256(<8 x float> %data, <8 x float> %data2) #0 {
+; CHECK-LABEL: define <8 x float> @test_expand_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> [[DATA]], <8 x float> [[DATA2]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> %data2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <8 x float> %1
+}
+
+define <8 x float> @test_mask_expand_ps_256(<8 x float> %data, <8 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_expand_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> [[DATA]], <8 x float> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> %passthru, <8 x i1> %1)
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_maskz_expand_ps_256(<8 x float> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_maskz_expand_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> [[DATA]], <8 x float> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> zeroinitializer, <8 x i1> %1)
+  ret <8 x float> %2
+}
+
+define <4 x i64> @test_expand_q_256(<4 x i64> %data, <4 x i64> %data2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_expand_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> [[DATA]], <4 x i64> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @test_mask_expand_q_256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_mask_expand_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> [[DATA]], <4 x i64> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> %passthru, <4 x i1> %extract)
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_maskz_expand_q_256(<4 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_maskz_expand_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> [[DATA]], <4 x i64> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> zeroinitializer, <4 x i1> %extract)
+  ret <4 x i64> %2
+}
+
+define <8 x i32> @test_expand_d_256(<8 x i32> %data, <8 x i32> %data2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_expand_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> [[DATA]], <8 x i32> [[DATA2]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> %data2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @test_mask_expand_d_256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_mask_expand_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> [[DATA]], <8 x i32> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> %passthru, <8 x i1> %1)
+  ret <8 x i32> %2
+}
+
+define <8 x i32> @test_maskz_expand_d_256(<8 x i32> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_maskz_expand_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> [[DATA]], <8 x i32> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> zeroinitializer, <8 x i1> %1)
+  ret <8 x i32> %2
+}
+
+define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmpps_256(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A]], <8 x float> [[B]], i32 2, <8 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[RES]] to i8
+; CHECK-NEXT:    store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = bitcast <8 x i1> %res to i8
+  ret i8 %1
+}
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>)
+
+define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmpps_128(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP6]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A]], <4 x float> [[B]], i32 2, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[RES]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
+}
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, <4 x i1>)
+
+define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmppd_256(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP6]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A]], <4 x double> [[B]], i32 2, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[RES]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
+}
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, <4 x i1>)
+
+define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmppd_128(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP6]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A]], <2 x double> [[B]], i32 2, <2 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i1> [[RES]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, <2 x i1> <i1 true, i1 true>)
+  %1 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
+}
+declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, <2 x i1>)
+
+define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_maskz_max_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP7]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP8]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_mask_max_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[SRC]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[SRC]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP9]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <8 x float> @test_mm512_max_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
+
+define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_maskz_max_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP7]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_mask_max_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[SRC]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[SRC]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <4 x float> @test_mm512_max_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
+
+define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_maskz_min_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP7]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP8]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_mask_min_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[SRC]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[SRC]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP9]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <8 x float> @test_mm512_min_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
+
+define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_maskz_min_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP8]], <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = bitcast i8 %mask2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_mask_min_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[SRC:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP8]], [[TMP12]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP14]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[SRC]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[SRC]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = bitcast i8 %mask2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <4 x float> @test_mm512_min_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
+
+define <4 x double> @test_getexp_pd_256(<4 x double> %a0) #0 {
+; CHECK-LABEL: define <4 x double> @test_getexp_pd_256(
+; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> %a0,  <4 x double> zeroinitializer, i8 -1)
+  ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+define <8 x float> @test_getexp_ps_256(<8 x float> %a0) #0 {
+; CHECK-LABEL: define <8 x float> @test_getexp_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermi2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermi2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP5]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP15]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> [[X1]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1
+  ret <4 x i32> %3
+}
+
+define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermt2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
+  ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermt2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP5]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP15]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> [[X1]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1
+  ret <4 x i32> %3
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP13]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP6]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
+  ret <4 x i32> %3
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpermi2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  ret <8 x i32> %1
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermi2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP5]], <8 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP15]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[X1]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
+  ret <8 x i32> %3
+}
+
+define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_ask_vpermt2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
+; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
+  ret <8 x i32> %1
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermt2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP5]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP15]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[X1]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
+  ret <8 x i32> %3
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP13]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP6]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
+  ret <8 x i32> %3
+}
+
+declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
+
+define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) #0 {
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_vpermi2var_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2)
+  ret <2 x double> %1
+}
+
+define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_vpermi2var_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP13]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[X1]] to <2 x double>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <2 x i64> [[TMP20]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP21]], <2 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP10]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2)
+  %2 = bitcast <2 x i64> %x1 to <2 x double>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> <i32 0, i32 1>
+  %4 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %2
+  ret <2 x double> %4
+}
+
+declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
+
+define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) #0 {
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_vpermi2var_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP4]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2)
+  ret <4 x double> %1
+}
+
+define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_vpermi2var_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP13]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[X1]] to <4 x double>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i64> [[TMP20]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP21]], <4 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[TMP2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP10]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2)
+  %2 = bitcast <4 x i64> %x1 to <4 x double>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %2
+  ret <4 x double> %4
+}
+
+declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
+
+define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) #0 {
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_vpermi2var_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2)
+  ret <4 x float> %1
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[X1]] to <4 x float>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP20]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP21]], <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2)
+  %2 = bitcast <4 x i32> %x1 to <4 x float>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2
+  ret <4 x float> %4
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %x0, <2 x i64> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128_cast(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <2 x i64> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    [[X1CAST:%.*]] = bitcast <2 x i64> [[X1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i32> [[TMP12]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP14]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1CAST]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[X1CAST]] to <4 x float>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP14]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i32> [[TMP21]], [[TMP14]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP22]], <4 x i32> [[TMP17]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %x1cast = bitcast <2 x i64> %x1 to <4 x i32>
+  %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1cast, <4 x float> %x2)
+  %2 = bitcast <4 x i32> %x1cast to <4 x float>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2
+  ret <4 x float> %4
+}
+
+declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
+
+define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) #0 {
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_vpermi2var_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP4]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2)
+  ret <8 x float> %1
+}
+
+define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_vpermi2var_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP13]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[X1]] to <8 x float>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> zeroinitializer, <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP21]], <8 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP1]], <8 x float> [[TMP2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP10]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2)
+  %2 = bitcast <8 x i32> %x1 to <8 x float>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2
+  ret <8 x float> %4
+}
+
+declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
+
+define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermi2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermi2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP5]], <2 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP15]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> [[X1]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1
+  ret <2 x i64> %3
+}
+
+define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermt2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermt2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP5]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP15]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> [[X1]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1
+  ret <2 x i64> %3
+}
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP13]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer
+  ret <2 x i64> %3
+}
+
+declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
+
+define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermi2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  ret <4 x i64> %1
+}
+
+define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermi2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP5]], <4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP15]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[X1]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1
+  ret <4 x i64> %3
+}
+
+define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermt2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
+  ret <4 x i64> %1
+}
+
+define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermt2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP5]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP15]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[X1]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1
+  ret <4 x i64> %3
+}
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP13]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer
+  ret <4 x i64> %3
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) #0 {
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_scalef_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+  ret <2 x double> %res
+}
+
+define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_scalef_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2) #0 {
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_scalef_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+  ret <4 x double> %res
+}
+
+define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_scalef_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+  ret <4 x double> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) #0 {
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_scalef_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+  ret <4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_scalef_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) #0 {
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_scalef_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+  ret <8 x float> %res
+}
+
+define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_scalef_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+  ret <8 x float> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_qb_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qb_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_qb_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_qb_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_qb_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qb_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_qb_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_qb_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_qw_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_qw_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_qw_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_qw_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_qw_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_qw_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmov_qd_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <4 x i32> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES4]]
+;
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  %res3 = add <4 x i32> %res0, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qd_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovs_qd_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <4 x i32> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES4]]
+;
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  %res3 = add <4 x i32> %res0, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovus_qd_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <4 x i32> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES4]]
+;
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  %res3 = add <4 x i32> %res0, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+define <4 x i32>@test_int_x86_avx512_pmov_qd_256(<4 x i64> %x0) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmov_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = trunc <4 x i64> %x0 to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmov_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i32> [[TMP9]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP1]], <4 x i32> [[X1]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = trunc <4 x i64> %x0 to <4 x i32>
+  %2 = bitcast i8 %x2 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x1
+  ret <4 x i32> %3
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_pmov_qd_256(<4 x i64> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmov_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP6]], <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = trunc <4 x i64> %x0 to <4 x i32>
+  %2 = bitcast i8 %x2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
+  ret <4 x i32> %3
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qd_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmovs_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovs_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_pmovs_qd_256(<4 x i64> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmovs_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmovus_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovus_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_pmovus_qd_256(<4 x i64> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmovus_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_db_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_db_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_db_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_db_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_db_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_db_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_db_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_db_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_db_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_db_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_db_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_db_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_dw_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_dw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_dw_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_dw_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_dw_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_dw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_dw_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_dw_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2dq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = add <4 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES2]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res2 = add <4 x i32> %res, %res1
+  ret <4 x i32> %res2
+}
+
+define <4 x i32>@test_int_x86_avx512_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2dq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res3
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2dq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES1]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res1
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1) #0 {
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_cvt_pd2ps(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1)
+  ret <4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_cvt_pd2ps(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2)
+  ret <4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1) #0 {
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_cvt_pd2ps_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x float> [[RES2]], <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES3]]
+;
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1)
+  %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x float> %res3
+}
+
+define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_cvt_pd2ps_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x float> [[RES]], <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES1]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2)
+  %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x float> %res1
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res3
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES1]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res1
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_ps2dq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_ps2dq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvt_ps2dq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
+  ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvt_ps2dq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_ps2udq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_ps2udq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvt_ps2udq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
+  ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvt_ps2udq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_ask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_ask_cvtt_pd2dq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2dq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2dq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res3
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES1]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res1
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res3
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES1]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res1
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_ps2udq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_ps2udq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvtt_ps2udq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
+  ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvtt_ps2udq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
+  ret <8 x i32> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_rndscale_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> [[X0]], i32 4, <2 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> [[X0]], i32 88, <2 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES2]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 88, <2 x double> %x2, i8 -1)
+  %res2 = fadd <2 x double> %res, %res1
+  ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_rndscale_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> [[X0]], i32 4, <4 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> [[X0]], i32 88, <4 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <4 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES2]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 88, <4 x double> %x2, i8 -1)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_rndscale_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> [[X0]], i32 88, <4 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> [[X0]], i32 4, <4 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES2]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 -1)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_rndscale_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> [[X0]], i32 5, <8 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> [[X0]], i32 66, <8 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES2]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, i8 %x3)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 66, <8 x float> %x2, i8 -1)
+  %res2 = fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_getmant_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> [[X0]], i32 11, <2 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> [[X0]], i32 12, <2 x double> zeroinitializer, i8 [[X3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> [[X0]], i32 13, <2 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <2 x double> [[RES2]], [[RES3]]
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES4]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 %x3)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 12, <2 x double> zeroinitializer, i8 %x3)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 13, <2 x double> %x2, i8 -1)
+  %res3 = fadd <2 x double> %res, %res1
+  %res4 = fadd <2 x double> %res2, %res3
+  ret <2 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_getmant_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> [[X0]], i32 11, <4 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> [[X0]], i32 12, <4 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <4 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES2]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 %x3)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 12, <4 x double> %x2, i8 -1)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float>, i32, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_getmant_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> [[X0]], i32 11, <4 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> [[X0]], i32 12, <4 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES2]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 %x3)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 12, <4 x float> %x2, i8 -1)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_getmant_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> [[X0]], i32 11, <8 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> [[X0]], i32 12, <8 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES2]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 12, <8 x float> %x2, i8 -1)
+  %res2 = fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32)
+
+define <4 x i32>@test_int_x86_avx512_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pternlog_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33)
+  ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pternlog_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP16]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> [[X0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
+  ret <4 x i32> %3
+}
+
+declare <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8)
+
+define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pternlog_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP10]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP6]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
+  ret <4 x i32> %3
+}
+
+declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32)
+
+define <8 x i32>@test_int_x86_avx512_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_pternlog_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33)
+  ret <8 x i32> %1
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_pternlog_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> zeroinitializer, <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], [[X0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP16]], <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[X0]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
+  ret <8 x i32> %3
+}
+
+declare <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32, i8)
+
+define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_pternlog_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i32> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP10]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP6]], <8 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
+  ret <8 x i32> %3
+}
+
+declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32)
+
+define <2 x i64>@test_int_x86_avx512_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_pternlog_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33)
+  ret <2 x i64> %1
+}
+
+define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_pternlog_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], [[X0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP16]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> [[X0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x0
+  ret <2 x i64> %3
+}
+
+define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_pternlog_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <2 x i64> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP10]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer
+  ret <2 x i64> %3
+}
+
+declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32)
+
+define <4 x i64>@test_int_x86_avx512_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_pternlog_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33)
+  ret <4 x i64> %1
+}
+
+define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_pternlog_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP16]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[X0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x0
+  ret <4 x i64> %3
+}
+
+define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_pternlog_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP10]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer
+  ret <4 x i64> %3
+}
+
+define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %src) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_x86_vcvtps2ph_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]], <8 x i16> [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 2, <8 x i16> zeroinitializer, i8 -1)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 10, <8 x i16> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES3:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 11, <8 x i16> [[SRC]], i8 [[MASK]])
+; CHECK-NEXT:    [[RES0:%.*]] = add <8 x i16> [[RES1]], [[RES2]]
+; CHECK-NEXT:    [[RES:%.*]] = add <8 x i16> [[RES3]], [[RES0]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES]]
+;
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 11, <8 x i16> %src, i8 %mask)
+  %res0 = add <8 x i16> %res1, %res2
+  %res = add <8 x i16> %res3, %res0
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16>, i8) nounwind readonly
+
+define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %src) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_x86_vcvtps2ph_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]], <8 x i16> [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 2, <8 x i16> zeroinitializer, i8 -1)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 11, <8 x i16> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES3:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 12, <8 x i16> [[SRC]], i8 [[MASK]])
+; CHECK-NEXT:    [[RES0:%.*]] = add <8 x i16> [[RES1]], [[RES2]]
+; CHECK-NEXT:    [[RES:%.*]] = add <8 x i16> [[RES3]], [[RES0]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES]]
+;
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 11, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 12, <8 x i16> %src, i8 %mask)
+  %res0 = add <8 x i16> %res1, %res2
+  %res = add <8 x i16> %res3, %res0
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16>, i8) nounwind readonly
+
+define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) #0 {
+; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rr(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrkz(
+; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrk(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) #0 {
+; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rr(
+; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrkz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrk(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) #0 {
+; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rr(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrkz(
+; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrk(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) #0 {
+; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rr(
+; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrkz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrk(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) #0 {
+; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rr(
+; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrkz(
+; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrk(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) #0 {
+; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rr(
+; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrkz(
+; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrk(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) #0 {
+; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rr(
+; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrkz(
+; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrk(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) #0 {
+; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rr(
+; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrkz(
+; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrk(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone
+
+declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>)
+
+define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) #0 {
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_permvar_df_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1)
+  ret <4 x double> %1
+}
+
+define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_permvar_df_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i64> [[TMP7]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP17]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP9]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> %x2
+  ret <4 x double> %3
+}
+
+define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_permvar_df_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP10]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP7]], <4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP8]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> zeroinitializer
+  ret <4 x double> %3
+}
+
+declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>)
+
+define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_permvar_di_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1)
+  ret <4 x i64> %1
+}
+
+define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_permvar_di_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[_MSPROP]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i64> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP11]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP1]], <4 x i64> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> %x2
+  ret <4 x i64> %3
+}
+
+define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_permvar_di_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[_MSPROP]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP6]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> zeroinitializer
+  ret <4 x i64> %3
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_fixupimm_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> [[X1]], <2 x i64> [[X2]], i32 4, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 3, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES4]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 3, i8 -1)
+  %res3 = fadd <2 x double> %res, %res1
+  %res4 = fadd <2 x double> %res3, %res2
+  ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8)
+
+define <2 x double>@test_int_x86_avx512_maskz_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_fixupimm_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 3, i8 [[X4]])
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES3]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4)
+  %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 3, i8 %x4)
+  ;%res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 -1)
+  %res3 = fadd <2 x double> %res, %res1
+  ;%res4 = fadd <2 x double> %res3, %res2
+  ret <2 x double> %res3
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_fixupimm_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 4, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> [[X1]], <4 x i64> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 3, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <4 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <4 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES4]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
+  %res2 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1)
+  %res3 = fadd <4 x double> %res, %res1
+  %res4 = fadd <4 x double> %res3, %res2
+  ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8)
+
+define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_fixupimm_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> zeroinitializer, i32 4, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 3, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <4 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <4 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES4]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
+  %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
+  %res2 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1)
+  %res3 = fadd <4 x double> %res, %res1
+  %res4 = fadd <4 x double> %res3, %res2
+  ret <4 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_fixupimm_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 6, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES4]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 6, i8 %x4)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 7, i8 -1)
+  %res3 = fadd <4 x float> %res, %res1
+  %res4 = fadd <4 x float> %res3, %res2
+  ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8)
+
+define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_fixupimm_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 6, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES4]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
+  %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 6, i8 %x4)
+  %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 7, i8 -1)
+  %res3 = fadd <4 x float> %res, %res1
+  %res4 = fadd <4 x float> %res3, %res2
+  ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_fixupimm_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> zeroinitializer, i32 6, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <8 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES4]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 6, i8 %x4)
+  %res2 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 7, i8 -1)
+  %res3 = fadd <8 x float> %res, %res1
+  %res4 = fadd <8 x float> %res3, %res2
+  ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8)
+
+define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_fixupimm_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> zeroinitializer, i32 6, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <8 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES4]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+  %res1 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 6, i8 %x4)
+  %res2 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 7, i8 -1)
+  %res3 = fadd <8 x float> %res, %res1
+  %res4 = fadd <8 x float> %res3, %res2
+  ret <8 x float> %res4
+}
+
+define <2 x i64> @test_x86_avx512_psra_q_128(<2 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psra_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i128 [[TMP6]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+define <2 x i64> @test_x86_avx512_mask_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psra_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i128 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i1 [[TMP8]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i128 [[TMP9]] to <2 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <2 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP16]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP12]], <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <2 x i64> [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <2 x i64> [[TMP17]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP18]], <2 x i64> [[TMP15]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> [[PASSTHRU]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
+  ret <2 x i64> %res2
+}
+define <2 x i64> @test_x86_avx512_maskz_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psra_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i128 [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i1 [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i128 [[TMP8]] to <2 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP15]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP13]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP14:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP11]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP11]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP14]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
+  ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx512_psra_q_256(<4 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psra_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    store <4 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+define <4 x i64> @test_x86_avx512_mask_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, <4 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psra_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i128 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i1 [[TMP8]] to i256
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i256 [[TMP9]] to <4 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP16]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP12]], <4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i64> [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i64> [[TMP17]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP18]], <4 x i64> [[TMP15]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> [[PASSTHRU]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
+  ret <4 x i64> %res2
+}
+define <4 x i64> @test_x86_avx512_maskz_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, <4 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psra_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i128 [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i1 [[TMP7]] to i256
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i256 [[TMP8]] to <4 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP15]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP11]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP11]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP14]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
+  ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_avx512_psrai_q_128(<2 x i64> %a0) #0 {
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psrai_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP3]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    store <2 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+define <2 x i64> @test_x86_avx512_mask_psrai_q_128(<2 x i64> %a0, <2 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psrai_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP5]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP6]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP7]], <8 x i1> [[TMP8]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP2]], <2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <2 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP12]], <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> [[PASSTHRU]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
+  ret <2 x i64> %res2
+}
+define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(<2 x i64> %a0, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP9]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP7]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP2]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
+  ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx512_psrai_q_256(<4 x i64> %a0) #0 {
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psrai_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP3]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+define <4 x i64> @test_x86_avx512_mask_psrai_q_256(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psrai_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP5]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP6]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP7]], <8 x i1> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP2]], <4 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> [[PASSTHRU]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
+  ret <4 x i64> %res2
+}
+define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(<4 x i64> %a0, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP9]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
+  ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_x86_avx512_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psrav_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_x86_avx512_mask_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psrav_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <2 x i1> [[TMP6]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP9]], <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], [[A2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP15]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> [[A2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %a2
+  ret <2 x i64> %res2
+}
+
+define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <2 x i1> [[TMP13]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP7]], [[TMP14]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP10]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP8]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
+  ret <2 x i64> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_x86_avx512_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psrav_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT:    store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_x86_avx512_mask_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psrav_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP9]], <4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], [[A2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP15]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> [[A2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast , <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %a2
+  ret <4 x i64> %res2
+}
+
+define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask,  i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i1> [[TMP13]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i64> [[TMP7]], [[TMP14]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP8]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast , <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
+  ret <4 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <8 x float> @test_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: define <8 x float> @test_vfmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[A1]], <8 x float> [[A2]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  ret <8 x float> %1
+}
+
+define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_vfmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[A1]], <8 x float> [[A2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[A0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[A0]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP9]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %a0
+  ret <8 x float> %3
+}
+
+define <4 x float> @test_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: define <4 x float> @test_vfmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x double> @test_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 {
+; CHECK-LABEL: define <4 x double> @test_fmadd256_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x double> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A]], <4 x double> [[B]], <4 x double> [[C]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c)
+  ret <4 x double> %1
+}
+
+define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_fmadd256_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x double> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A]], <4 x double> [[B]], <4 x double> [[C]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[A]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP12]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP13]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[A]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP9]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %a
+  ret <4 x double> %3
+}
+
+define <2 x double> @test_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
+; CHECK-LABEL: define <2 x double> @test_fmadd128_pd(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %1
+}
+
+define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_fmadd128_pd(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[TMP12]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP13]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[A]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP9]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %a
+  ret <2 x double> %3
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmadd_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP13]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP9]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %x2
+  ret <2 x double> %3
+}
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_vfmadd_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <2 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP7]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP8]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> zeroinitializer
+  ret <2 x double> %3
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmadd_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP13]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP9]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %x2
+  ret <4 x double> %3
+}
+
+define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_vfmadd_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP7]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP8]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> zeroinitializer
+  ret <4 x double> %3
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %x2
+  ret <4 x float> %3
+}
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP7]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+  ret <4 x float> %3
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmadd_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP9]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %x2
+  ret <8 x float> %3
+}
+
+define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_vfmadd_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP7]], <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP8]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
+  ret <8 x float> %3
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP2]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP13]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP3]], <2 x i64> [[TMP14]], <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP2]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP10]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %1)
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> <i32 0, i32 1>
+  %4 = select <2 x i1> %extract, <2 x double> %2, <2 x double> %x2
+  ret <2 x double> %4
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP2]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP13]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i64> [[TMP14]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP2]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP10]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %1)
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x double> %2, <4 x double> %x2
+  ret <4 x double> %4
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP2]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP13]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP14]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP2]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %1)
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %2, <4 x float> %x2
+  ret <4 x float> %4
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[_MSPROP2]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP13]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP15]], <8 x i32> [[TMP14]], <8 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP2]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP10]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %1)
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %x2
+  ret <8 x float> %4
+}
+
+define <8 x float> @test_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: define <8 x float> @test_vfnmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[A2]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %a2)
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_vfnmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[A2]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[_MSPROP2]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[A0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP13]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP15]], <8 x i32> [[TMP14]], <8 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP2]], <8 x float> [[A0]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP10]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %a2)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %a0
+  ret <8 x float> %4
+}
+
+define <4 x float> @test_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: define <4 x float> @test_vfnmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[A2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %a2)
+  ret <4 x float> %2
+}
+
+define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfnmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[A2]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP2]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP13]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP14]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP2]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %a2)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %2, <4 x float> %a0
+  ret <4 x float> %4
+}
+
+define <4 x double> @test_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: define <4 x double> @test_vfnmadd256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[A2]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %a2)
+  ret <4 x double> %2
+}
+
+define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfnmadd256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP2]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP13]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i64> [[TMP14]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP2]], <4 x double> [[A0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP10]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %a2)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x double> %2, <4 x double> %a0
+  ret <4 x double> %4
+}
+
+define <2 x double> @test_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: define <2 x double> @test_vfnmadd128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[A2]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %a2)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfnmadd128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP2]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP13]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP3]], <2 x i64> [[TMP14]], <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP2]], <2 x double> [[A0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP10]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %a2)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> <i32 0, i32 1>
+  %4 = select <2 x i1> %extract, <2 x double> %2, <2 x double> %a0
+  ret <2 x double> %4
+}
+
+define <8 x float> @test_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: define <8 x float> @test_vfnmsub256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %2)
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_vfnmsub256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[_MSPROP3]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP3]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[A0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i32> [[TMP15]], <8 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x float> [[TMP3]], <8 x float> [[A0]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP11]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %2)
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %a0
+  ret <8 x float> %5
+}
+
+define <4 x float> @test_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: define <4 x float> @test_vfnmsub128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfnmsub128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP3]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i32> [[TMP15]], <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP3]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %2)
+  %4 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x float> %3, <4 x float> %a0
+  ret <4 x float> %5
+}
+
+define <4 x double> @test_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: define <4 x double> @test_vfnmsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %2)
+  ret <4 x double> %3
+}
+
+define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfnmsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP3]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i64> [[TMP15]], <4 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP3]], <4 x double> [[A0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %2)
+  %4 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x double> %3, <4 x double> %a0
+  ret <4 x double> %5
+}
+
+define <2 x double> @test_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: define <2 x double> @test_vfnmsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfnmsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP3]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP4]], <2 x i64> [[TMP15]], <2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP3]], <2 x double> [[A0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP11]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %2)
+  %4 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1>
+  %5 = select <2 x i1> %extract, <2 x double> %3, <2 x double> %a0
+  ret <2 x double> %5
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP3]], <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP4]], <2 x i64> [[TMP15]], <2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP3]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP11]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %x1, <2 x double> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1>
+  %5 = select <2 x i1> %extract, <2 x double> %3, <2 x double> %x2
+  ret <2 x double> %5
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP3]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i64> [[TMP15]], <4 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP3]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x0
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %x1, <4 x double> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x double> %3, <4 x double> %x2
+  ret <4 x double> %5
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP3]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i32> [[TMP15]], <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP3]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %x1, <4 x float> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x float> %3, <4 x float> %x2
+  ret <4 x float> %5
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[_MSPROP3]], <8 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP3]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i32> [[TMP15]], <8 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x float> [[TMP3]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP11]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %x1, <8 x float> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %x2
+  ret <8 x float> %5
+}
+
+define <8 x float> @test_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) #0 {
+; CHECK-LABEL: define <8 x float> @test_fmaddsub256_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP4]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x float> %4
+}
+
+define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_fmaddsub256_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> [[A]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP12]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> %a
+  ret <8 x float> %6
+}
+
+define <4 x float> @test_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+; CHECK-LABEL: define <4 x float> @test_fmaddsub128_ps(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_fmaddsub128_ps(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[A]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP12]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> %a
+  ret <4 x float> %6
+}
+
+define <4 x double> @test_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: define <4 x double> @test_vfmaddsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x double> %4
+}
+
+define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfmaddsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP16]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> [[A0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP12]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> %a0
+  ret <4 x double> %6
+}
+
+define <2 x double> @test_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: define <2 x double> @test_vfmaddsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %4
+}
+
+define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfmaddsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP16]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> [[A0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP12]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> %a0
+  ret <2 x double> %6
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP16]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP12]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> %x2
+  ret <2 x double> %6
+}
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <2 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP10]], <2 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP11]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> zeroinitializer
+  ret <2 x double> %6
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP16]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP12]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> %x2
+  ret <4 x double> %6
+}
+
+define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP10]], <4 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> zeroinitializer
+  ret <4 x double> %6
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP12]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> %x2
+  ret <4 x float> %6
+}
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP10]], <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> zeroinitializer
+  ret <4 x float> %6
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP12]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> %x2
+  ret <8 x float> %6
+}
+
+define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP15]], <8 x i32> [[TMP10]], <8 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP11]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> zeroinitializer
+  ret <8 x float> %6
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP1]], <2 x i64> [[_MSPROP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP16]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP12]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %1, <2 x double> %3, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> %x2
+  ret <2 x double> %6
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP1]], <4 x i64> [[_MSPROP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP16]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP12]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %1, <4 x double> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> %x2
+  ret <4 x double> %6
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmsubadd_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP1]], <4 x i32> [[_MSPROP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP12]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %2)
+  %4 = shufflevector <4 x float> %1, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> %x2
+  ret <4 x float> %6
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmsubadd_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP1]], <8 x i32> [[_MSPROP4]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP12]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %2)
+  %4 = shufflevector <8 x float> %1, <8 x float> %3, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> %x2
+  ret <8 x float> %6
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmk(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 16
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %a2 = load <4 x float>, ptr %ptr_a2
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmka(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 8
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %a2 = load <4 x float>, ptr %ptr_a2, align 8
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmkz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %a2 = load <4 x float>, ptr %ptr_a2
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmkza(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %a2 = load <4 x float>, ptr %ptr_a2, align 4
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmb(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[Q]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %q = load float, ptr %ptr_a2
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmba(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[Q]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %q = load float, ptr %ptr_a2, align 4
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmbz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[Q]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %q = load float, ptr %ptr_a2
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmbza(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[Q]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %q = load float, ptr %ptr_a2, align 4
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+  ret <4 x float> %1
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfmadd128_pd_rmk(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <2 x double>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <2 x i64> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP18]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[A0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP9]]
+;
+  %a2 = load <2 x double>, ptr %ptr_a2
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %a0
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfmadd128_pd_rmkz(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <2 x double>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 16
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %a2 = load <2 x double>, ptr %ptr_a2
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %1
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfmadd256_pd_rmk(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x double>, ptr [[PTR_A2]], align 32
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP15]], align 32
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i64> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP18]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[A0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP9]]
+;
+  %a2 = load <4 x double>, ptr %ptr_a2
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %a0
+  ret <4 x double> %3
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfmadd256_pd_rmkz(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x double>, ptr [[PTR_A2]], align 32
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %a2 = load <4 x double>, ptr %ptr_a2
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  ret <4 x double> %1
+}
+
+define <8 x i32> @combine_vpermi2d_vpermps(<16 x i32> noundef %a) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @combine_vpermi2d_vpermps(
+; CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> splat (i32 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> splat (i32 -1), <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> [[TMP2]])
+; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %1 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %1, <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> %2)
+  ret <8 x i32> %3
+}
+
+declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double>, <2 x double>, <2 x i1>)
+declare <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float>, <4 x float>, <4 x i1>)
+declare <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64>, <2 x i64>, <2 x i1>)
+declare <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32>, <4 x i32>, <4 x i1>)
+declare <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double>, <2 x double>, <2 x i1>)
+declare <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float>, <4 x float>, <4 x i1>)
+declare <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64>, <2 x i64>, <2 x i1>)
+declare <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32>, <4 x i32>, <4 x i1>)
+declare <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double>, <4 x double>, <4 x i1>)
+declare <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float>, <8 x float>, <8 x i1>)
+declare <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64>, <4 x i64>, <4 x i1>)
+declare <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32>, <8 x i32>, <8 x i1>)
+declare <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double>, <4 x double>, <4 x i1>)
+declare <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float>, <8 x float>, <8 x i1>)
+declare <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64>, <4 x i64>, <4 x i1>)
+declare <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32>, <8 x i32>, <8 x i1>)
+
+attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
new file mode 100644
index 000000000000..2350d75b29b4
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mattr=+avx512f -passes=msan -S | FileCheck %s
+
+; Forked from llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;
+; vXi64
+;
+
+define <2 x i64> @shuffle_vpermv3_v2i64(<2 x i64> %x0, <2 x i64> %x1) #0 {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X1]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> <i64 2, i64 0>, <2 x i64> %x1)
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) #0 {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_unary(
+; CHECK-SAME: <2 x i64> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X0]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> <i64 2, i64 0>, <2 x i64> %x0)
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) #0 {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i64> [[M]], splat (i64 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP4]], <i64 -1, i64 -5>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 4>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %t = or <2 x i64> %m, <i64 0, i64 4>
+  %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1)
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) #0 {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i64> [[M]], splat (i64 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP4]], <i64 -1, i64 -3>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 2>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %t = or <2 x i64> %m, <i64 0, i64 2>
+  %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1)
+  ret <2 x i64> %r
+}
+
+define <4 x i64> @shuffle_vpermv3_v4i64(<4 x i64> %x0, <4 x i64> %x1) #0 {
+; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X1]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[R]]
+;
+  %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> %x1)
+  ret <4 x i64> %r
+}
+
+define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) #0 {
+; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_unary(
+; CHECK-SAME: <4 x i64> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X0]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[R]]
+;
+  %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> %x0)
+  ret <4 x i64> %r
+}
+
+define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %m) #0 {
+; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i64> [[M]], splat (i64 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i64> [[TMP4]], <i64 -1, i64 -9, i64 -17, i64 -33>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <4 x i64> [[M]], <i64 0, i64 8, i64 16, i64 32>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[R]]
+;
+  %t = or <4 x i64> %m, <i64 0, i64 8, i64 16, i64 32>
+  %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %t, <4 x i64> %x1)
+  ret <4 x i64> %r
+}
+
+define <8 x i64> @shuffle_vpermv3_v8i64(<8 x i64> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64(
+; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X1]])
+; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i64> [[R]]
+;
+  %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> %x1)
+  ret <8 x i64> %r
+}
+
+define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) #0 {
+; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_unary(
+; CHECK-SAME: <8 x i64> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X0]])
+; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i64> [[R]]
+;
+  %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> %x0)
+  ret <8 x i64> %r
+}
+
+define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %m) #0 {
+; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(
+; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <8 x i64> [[M]], splat (i64 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i64> [[TMP4]], <i64 -1, i64 -17, i64 -33, i64 -65, i64 -257, i64 -513, i64 -1025, i64 15>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <8 x i64> [[M]], <i64 0, i64 16, i64 32, i64 64, i64 256, i64 512, i64 1024, i64 -16>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x i64> [[X1]])
+; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i64> [[R]]
+;
+  %t = or <8 x i64> %m, <i64 0, i64 16, i64 32, i64 64, i64 256, i64 512, i64 1024, i64 -16>
+  %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %t, <8 x i64> %x1)
+  ret <8 x i64> %r
+}
+
+;
+; vXi32
+;
+
+define <4 x i32> @shuffle_vpermv3_v4i32(<4 x i32> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X1]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> %x1)
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) #0 {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_unary(
+; CHECK-SAME: <4 x i32> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X0]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> %x0)
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %m) #0 {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i32> [[M]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP4]], <i32 -1, i32 -9, i32 -17, i32 -33>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <4 x i32> [[M]], <i32 0, i32 8, i32 16, i32 32>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %t = or <4 x i32> %m, <i32 0, i32 8, i32 16, i32 32>
+  %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %t, <4 x i32> %x1)
+  ret <4 x i32> %r
+}
+
+define <8 x i32> @shuffle_vpermv3_v8i32(<8 x i32> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X1]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+  %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> %x1)
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) #0 {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_unary(
+; CHECK-SAME: <8 x i32> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X0]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+  %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> %x0)
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %m) #0 {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <8 x i32> [[M]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i32> [[TMP4]], <i32 -1, i32 -17, i32 -33, i32 -65, i32 -257, i32 -513, i32 15, i32 31>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <8 x i32> [[M]], <i32 0, i32 16, i32 32, i32 64, i32 256, i32 512, i32 -16, i32 -32>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+  %t = or <8 x i32> %m, <i32 0, i32 16, i32 32, i32 64, i32 256, i32 512, i32 -16, i32 -32>
+  %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %t, <8 x i32> %x1)
+  ret <8 x i32> %r
+}
+
+define <16 x i32> @shuffle_vpermv3_v16i32(<16 x i32> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32(
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> %x1)
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) #0 {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_unary(
+; CHECK-SAME: <16 x i32> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X0]])
+; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> %x0)
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %m) #0 {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <16 x i32> [[M]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i32> [[TMP4]], <i32 -1, i32 -33, i32 -65, i32 -257, i32 -513, i32 -1025, i32 -2049, i32 -4097, i32 -8193, i32 31, i32 63, i32 127, i32 255, i32 511, i32 1023, i32 2047>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <16 x i32> [[M]], <i32 0, i32 32, i32 64, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 -32, i32 -64, i32 -128, i32 -256, i32 -512, i32 -1024, i32 -2048>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %t = or <16 x i32> %m, <i32 0, i32 32, i32 64, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 -32, i32 -64, i32 -128, i32 -256, i32 -512, i32 -1024, i32 -2048>
+  %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %t, <16 x i32> %x1)
+  ret <16 x i32> %r
+}
+
+;
+; vXi16
+;
+
+define <8 x i16> @shuffle_vpermv3_v8i16(<8 x i16> %x0, <8 x i16> %x1) #0 {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16(
+; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X1]])
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[R]]
+;
+  %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> %x1)
+  ret <8 x i16> %r
+}
+
+define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) #0 {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_unary(
+; CHECK-SAME: <8 x i16> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X0]])
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[R]]
+;
+  %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> %x0)
+  ret <8 x i16> %r
+}
+
+define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %m) #0 {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(
+; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]], <8 x i16> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <8 x i16> [[M]], splat (i16 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i16> [[TMP4]], <i16 -1, i16 -17, i16 -33, i16 -65, i16 -257, i16 -513, i16 15, i16 31>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <8 x i16> [[M]], <i16 0, i16 16, i16 32, i16 64, i16 256, i16 512, i16 -16, i16 -32>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[T]], <8 x i16> [[X1]])
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[R]]
+;
+  %t = or <8 x i16> %m, <i16 0, i16 16, i16 32, i16 64, i16 256, i16 512, i16 -16, i16 -32>
+  %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %t, <8 x i16> %x1)
+  ret <8 x i16> %r
+}
+
+define <16 x i16> @shuffle_vpermv3_v16i16(<16 x i16> %x0, <16 x i16> %x1) #0 {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16(
+; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X1]])
+; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i16> [[R]]
+;
+  %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> %x1)
+  ret <16 x i16> %r
+}
+
+define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) #0 {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_unary(
+; CHECK-SAME: <16 x i16> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X0]])
+; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i16> [[R]]
+;
+  %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> %x0)
+  ret <16 x i16> %r
+}
+
+define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %m) #0 {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(
+; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]], <16 x i16> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <16 x i16> [[M]], splat (i16 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i16> [[TMP4]], <i16 -1, i16 -33, i16 -65, i16 -257, i16 -513, i16 -1025, i16 -2049, i16 -4097, i16 31, i16 63, i16 127, i16 255, i16 511, i16 1023, i16 2047, i16 4095>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <16 x i16> [[M]], <i16 0, i16 32, i16 64, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 -32, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[T]], <16 x i16> [[X1]])
+; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i16> [[R]]
+;
+  %t = or <16 x i16> %m, <i16 0, i16 32, i16 64, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 -32, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
+  %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %t, <16 x i16> %x1)
+  ret <16 x i16> %r
+}
+
+define <32 x i16> @shuffle_vpermv3_v32i16(<32 x i16> %x0, <32 x i16> %x1) #0 {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16(
+; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X1]])
+; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i16> [[R]]
+;
+  %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> %x1)
+  ret <32 x i16> %r
+}
+
+define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) #0 {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_unary(
+; CHECK-SAME: <32 x i16> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X0]])
+; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i16> [[R]]
+;
+  %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> %x0)
+  ret <32 x i16> %r
+}
+
+define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %m) #0 {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(
+; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i16> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <32 x i16> [[M]], splat (i16 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <32 x i16> [[TMP4]], <i16 -1, i16 -65, i16 -129, i16 -257, i16 -513, i16 -1025, i16 -2049, i16 -4097, i16 -1, i16 63, i16 127, i16 255, i16 511, i16 1023, i16 2047, i16 4095, i16 -1, i16 -65, i16 -129, i16 -257, i16 -513, i16 -1025, i16 -2049, i16 -4097, i16 -1, i16 63, i16 127, i16 255, i16 511, i16 1023, i16 2047, i16 4095>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <32 x i16> [[M]], <i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096, i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[T]], <32 x i16> [[X1]])
+; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i16> [[R]]
+;
+  %t = or <32 x i16> %m, <i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096, i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
+  %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %t, <32 x i16> %x1)
+  ret <32 x i16> %r
+}
+
+;
+; vXi8
+;
+
+define <16 x i8> @shuffle_vpermv3_v16i8(<16 x i8> %x0, <16 x i8> %x1) #0 {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8(
+; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X1]])
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> %x1)
+  ret <16 x i8> %r
+}
+
+define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) #0 {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_unary(
+; CHECK-SAME: <16 x i8> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X0]])
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> %x0)
+  ret <16 x i8> %r
+}
+
+define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %m) #0 {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(
+; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <16 x i8> [[M]], splat (i8 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i8> [[TMP4]], <i8 -1, i8 -33, i8 -65, i8 127, i8 -1, i8 31, i8 63, i8 127, i8 -1, i8 -33, i8 -65, i8 127, i8 -1, i8 31, i8 63, i8 127>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i8> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <16 x i8> [[M]], <i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128, i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[T]], <16 x i8> [[X1]])
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %t = or <16 x i8> %m, <i8 0, i8 32, i8 64, i8 128, i8 0, i8 -32, i8 -64, i8 -128, i8 0, i8 32, i8 64, i8 128, i8 0, i8 -32, i8 -64, i8 -128>
+  %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %t, <16 x i8> %x1)
+  ret <16 x i8> %r
+}
+
+define <32 x i8> @shuffle_vpermv3_v32i8(<32 x i8> %x0, <32 x i8> %x1) #0 {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8(
+; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X1]])
+; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i8> [[R]]
+;
+  %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> %x1)
+  ret <32 x i8> %r
+}
+
+define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) #0 {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_unary(
+; CHECK-SAME: <32 x i8> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X0]])
+; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i8> [[R]]
+;
+  %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> %x0)
+  ret <32 x i8> %r
+}
+
+define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %m) #0 {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(
+; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <32 x i8> [[M]], splat (i8 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <32 x i8> [[TMP4]], <i8 -1, i8 -1, i8 -65, i8 127, i8 -1, i8 -1, i8 63, i8 127, i8 -1, i8 -1, i8 -65, i8 127, i8 -1, i8 -1, i8 63, i8 127, i8 -1, i8 -1, i8 -65, i8 127, i8 -1, i8 -1, i8 63, i8 127, i8 -1, i8 -1, i8 -65, i8 127, i8 -1, i8 -1, i8 63, i8 127>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <32 x i8> [[M]], <i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[T]], <32 x i8> [[X1]])
+; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i8> [[R]]
+;
+  %t = or <32 x i8> %m, <i8 0, i8 0, i8 64, i8 128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 128, i8 0, i8 0, i8 -64, i8 -128>
+  %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %t, <32 x i8> %x1)
+  ret <32 x i8> %r
+}
+
+define <64 x i8> @shuffle_vpermv3_v64i8(<64 x i8> %x0, <64 x i8> %x1) #0 {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8(
+; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X1]])
+; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <64 x i8> [[R]]
+;
+  %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> <i8 128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> %x1)
+  ret <64 x i8> %r
+}
+
+define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) #0 {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_unary(
+; CHECK-SAME: <64 x i8> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X0]])
+; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <64 x i8> [[R]]
+;
+  %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> <i8 128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> %x0)
+  ret <64 x i8> %r
+}
+
+define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %m) #0 {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(
+; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <64 x i8> [[M]], splat (i8 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <64 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <64 x i8> [[TMP4]], <i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <64 x i8> [[M]], <i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[T]], <64 x i8> [[X1]])
+; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <64 x i8> [[R]]
+;
+  %t = or <64 x i8> %m, <i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128>
+  %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %t, <64 x i8> %x1)
+  ret <64 x i8> %r
+}
+
+attributes #0 = { sanitize_memory }

From ee6362515dfa4fe4531c7a7690c270313669195b Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 12 Jun 2025 14:22:50 -0700
Subject: [PATCH 0207/1322] [RISCV][CostModel] Add additional high LMUL reverse
 tests

---
 llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll     | 6 ++++++
 llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll | 9 +++++++++
 2 files changed, 15 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
index e068ab638d3a..e1bca7161412 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -86,6 +86,8 @@ define void @vector_reverse() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
@@ -106,6 +108,8 @@ define void @vector_reverse() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
@@ -125,6 +129,8 @@ define void @vector_reverse() {
   %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
   %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
   %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+  %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+  %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
   %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
   %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
   %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
index 7e92d8203a13..8f3219861f2f 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
@@ -26,8 +26,11 @@ define void @reverse() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -55,8 +58,11 @@ define void @reverse() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -85,9 +91,12 @@ define void @reverse() {
   %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
   %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
   %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
   %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
   %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
   %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>

From e4c32a4147012da735205eb44a45b8be5eea048d Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Thu, 12 Jun 2025 14:30:59 -0700
Subject: [PATCH 0208/1322] [Clang][NFC] Move Input into SmallVector instead of
 copy (#143830)

Static analysis flagged Input as a large object that would benefit from
being moved over being copied.
---
 clang/lib/Frontend/CompilerInstance.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 587b0d1af9c8..09a66b652518 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -1191,7 +1191,7 @@ std::unique_ptr<CompilerInstance> CompilerInstance::cloneForModuleCompileImpl(
   FrontendOpts.OriginalModuleMap = std::string(OriginalModuleMapFile);
   // Force implicitly-built modules to hash the content of the module file.
   HSOpts.ModulesHashContent = true;
-  FrontendOpts.Inputs = {Input};
+  FrontendOpts.Inputs = {std::move(Input)};
 
   // Don't free the remapped file buffers; they are owned by our caller.
   PPOpts.RetainRemappedFileBuffers = true;

From 902a991e1245537f5fc11e031409fdd69fba1c06 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Thu, 12 Jun 2025 14:46:37 -0700
Subject: [PATCH 0209/1322] [BOLT] Make memory profile parsing optional
 (#129585)

Introduce `parse-mem-profile` option to limit overheads processing
tracing data (Intel PT or ARM ETM). By default, it's enabled for
perf data (existing behavior), unless `itrace` is passed to parse
tracing data where it's extremely expensive. In this case, the flag
needs to be set explicitly if needed.
---
 bolt/lib/Profile/DataAggregator.cpp | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 308346e5d02c..ade8478f556e 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -61,6 +61,12 @@ FilterMemProfile("filter-mem-profile",
   cl::init(true),
   cl::cat(AggregatorCategory));
 
+static cl::opt<bool> ParseMemProfile(
+    "parse-mem-profile",
+    cl::desc("enable memory profile parsing if it's present in the input data, "
+             "on by default unless `--itrace` is set."),
+    cl::init(true), cl::cat(AggregatorCategory));
+
 static cl::opt<unsigned long long>
 FilterPID("pid",
   cl::desc("only use samples from process with specified PID"),
@@ -181,6 +187,10 @@ void DataAggregator::start() {
                       "script -F pid,event,ip",
                       /*Wait = */false);
   } else if (!opts::ITraceAggregation.empty()) {
+    // Disable parsing memory profile from trace data, unless requested by user.
+    if (!opts::ParseMemProfile.getNumOccurrences())
+      opts::ParseMemProfile = false;
+
     std::string ItracePerfScriptArgs = llvm::formatv(
         "script -F pid,brstack --itrace={0}", opts::ITraceAggregation);
     launchPerfProcess("branch events with itrace", MainEventsPPI,
@@ -191,12 +201,9 @@ void DataAggregator::start() {
                       /*Wait = */ false);
   }
 
-  // Note: we launch script for mem events regardless of the option, as the
-  //       command fails fairly fast if mem events were not collected.
-  launchPerfProcess("mem events",
-                    MemEventsPPI,
-                    "script -F pid,event,addr,ip",
-                    /*Wait = */false);
+  if (opts::ParseMemProfile)
+    launchPerfProcess("mem events", MemEventsPPI, "script -F pid,event,addr,ip",
+                      /*Wait = */ false);
 
   launchPerfProcess("process events", MMapEventsPPI,
                     "script --show-mmap-events --no-itrace",
@@ -217,7 +224,8 @@ void DataAggregator::abort() {
   sys::Wait(TaskEventsPPI.PI, 1, &Error);
   sys::Wait(MMapEventsPPI.PI, 1, &Error);
   sys::Wait(MainEventsPPI.PI, 1, &Error);
-  sys::Wait(MemEventsPPI.PI, 1, &Error);
+  if (opts::ParseMemProfile)
+    sys::Wait(MemEventsPPI.PI, 1, &Error);
 
   deleteTempFiles();
 
@@ -506,7 +514,8 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
     errs() << "PERF2BOLT: failed to parse samples\n";
 
   // Special handling for memory events
-  if (!prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback))
+  if (opts::ParseMemProfile &&
+      !prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback))
     if (const std::error_code EC = parseMemEvents())
       errs() << "PERF2BOLT: failed to parse memory events: " << EC.message()
              << '\n';

From 1ac61c8334782629462e6bf7c91b3fc8f4e663e8 Mon Sep 17 00:00:00 2001
From: Diego Caballero <dieg0ca6aller0@gmail.com>
Date: Thu, 12 Jun 2025 14:49:00 -0700
Subject: [PATCH 0210/1322] [mlir][Vector] Remove
 `vector.extractelement/insertelement` from sparse vectorizer (#143270)

This PR is part of the last step to remove `vector.extractelement` and `vector.insertelement` ops.
RFC: https://discourse.llvm.org/t/rfc-psa-remove-vector-extractelement-and-vector-insertelement-ops-in-favor-of-vector-extract-and-vector-insert-ops

It updates the Sparse Vectorizer to use `vector.extract` and `vector.insert` instead of `vector.extractelement` and `vector.insertelement`.
---
 .../Transforms/SparseVectorization.cpp        | 74 ++++++++++++-------
 .../SparseTensor/minipipeline_vector.mlir     |  2 +-
 .../Dialect/SparseTensor/sparse_vector.mlir   |  6 +-
 .../SparseTensor/sparse_vector_chain.mlir     |  2 +-
 .../SparseTensor/vectorize_reduction.mlir     | 10 +--
 5 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
index 3d963dea2f57..359590f2434d 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
@@ -198,14 +198,14 @@ static Value genVectorReducInit(PatternRewriter &rewriter, Location loc,
   case vector::CombiningKind::ADD:
   case vector::CombiningKind::XOR:
     // Initialize reduction vector to: | 0 | .. | 0 | r |
-    return rewriter.create<vector::InsertElementOp>(
-        loc, r, constantZero(rewriter, loc, vtp),
-        constantIndex(rewriter, loc, 0));
+    return rewriter.create<vector::InsertOp>(loc, r,
+                                             constantZero(rewriter, loc, vtp),
+                                             constantIndex(rewriter, loc, 0));
   case vector::CombiningKind::MUL:
     // Initialize reduction vector to: | 1 | .. | 1 | r |
-    return rewriter.create<vector::InsertElementOp>(
-        loc, r, constantOne(rewriter, loc, vtp),
-        constantIndex(rewriter, loc, 0));
+    return rewriter.create<vector::InsertOp>(loc, r,
+                                             constantOne(rewriter, loc, vtp),
+                                             constantIndex(rewriter, loc, 0));
   case vector::CombiningKind::AND:
   case vector::CombiningKind::OR:
     // Initialize reduction vector to: | r | .. | r | r |
@@ -628,31 +628,49 @@ private:
   const VL vl;
 };
 
-/// Reduction chain cleanup.
-///   v = for { }
-///   s = vsum(v)               v = for { }
-///   u = expand(s)       ->    for (v) { }
-///   for (u) { }
-template <typename VectorOp>
-struct ReducChainRewriter : public OpRewritePattern<VectorOp> {
-public:
-  using OpRewritePattern<VectorOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(VectorOp op,
-                                PatternRewriter &rewriter) const override {
-    Value inp = op.getSource();
-    if (auto redOp = inp.getDefiningOp<vector::ReductionOp>()) {
-      if (auto forOp = redOp.getVector().getDefiningOp<scf::ForOp>()) {
-        if (forOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName())) {
-          rewriter.replaceOp(op, redOp.getVector());
-          return success();
-        }
+static LogicalResult cleanReducChain(PatternRewriter &rewriter, Operation *op,
+                                     Value inp) {
+  if (auto redOp = inp.getDefiningOp<vector::ReductionOp>()) {
+    if (auto forOp = redOp.getVector().getDefiningOp<scf::ForOp>()) {
+      if (forOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName())) {
+        rewriter.replaceOp(op, redOp.getVector());
+        return success();
       }
     }
-    return failure();
+  }
+  return failure();
+}
+
+/// Reduction chain cleanup.
+///   v = for { }
+///   s = vsum(v)                  v = for { }
+///   u = broadcast(s)       ->    for (v) { }
+///   for (u) { }
+struct ReducChainBroadcastRewriter
+    : public OpRewritePattern<vector::BroadcastOp> {
+public:
+  using OpRewritePattern<vector::BroadcastOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::BroadcastOp op,
+                                PatternRewriter &rewriter) const override {
+    return cleanReducChain(rewriter, op, op.getSource());
   }
 };
 
+/// Reduction chain cleanup.
+///   v = for { }
+///   s = vsum(v)               v = for { }
+///   u = insert(s)       ->    for (v) { }
+///   for (u) { }
+struct ReducChainInsertRewriter : public OpRewritePattern<vector::InsertOp> {
+public:
+  using OpRewritePattern<vector::InsertOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::InsertOp op,
+                                PatternRewriter &rewriter) const override {
+    return cleanReducChain(rewriter, op, op.getValueToStore());
+  }
+};
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -668,6 +686,6 @@ void mlir::populateSparseVectorizationPatterns(RewritePatternSet &patterns,
   vector::populateVectorStepLoweringPatterns(patterns);
   patterns.add<ForOpRewriter>(patterns.getContext(), vectorLength,
                               enableVLAVectorization, enableSIMDIndex32);
-  patterns.add<ReducChainRewriter<vector::InsertElementOp>,
-               ReducChainRewriter<vector::BroadcastOp>>(patterns.getContext());
+  patterns.add<ReducChainInsertRewriter, ReducChainBroadcastRewriter>(
+      patterns.getContext());
 }
diff --git a/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir b/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir
index 2475aa5139da..b2dfbeb53fde 100755
--- a/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir
+++ b/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir
@@ -22,7 +22,7 @@
 // CHECK-NOVEC:       }
 //
 // CHECK-VEC-LABEL: func.func @sum_reduction
-// CHECK-VEC:       vector.insertelement
+// CHECK-VEC:       vector.insert
 // CHECK-VEC:       scf.for
 // CHECK-VEC:         vector.create_mask
 // CHECK-VEC:         vector.maskedload
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
index 364ba6e71ff3..64235c722780 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
@@ -241,7 +241,7 @@ func.func @mul_s(%arga: tensor<1024xf32, #SparseVector>,
 // CHECK-VEC16-DAG:   %[[c1024:.*]] = arith.constant 1024 : index
 // CHECK-VEC16-DAG:   %[[v0:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
 // CHECK-VEC16:       %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
-// CHECK-VEC16:       %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[c0]] : index] : vector<16xf32>
+// CHECK-VEC16:       %[[r:.*]] = vector.insert %[[l]], %[[v0]] [0] : f32 into vector<16xf32>
 // CHECK-VEC16:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
 // CHECK-VEC16:         %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
 // CHECK-VEC16:         %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
@@ -258,7 +258,7 @@ func.func @mul_s(%arga: tensor<1024xf32, #SparseVector>,
 // CHECK-VEC16-IDX32-DAG:   %[[c1024:.*]] = arith.constant 1024 : index
 // CHECK-VEC16-IDX32-DAG:   %[[v0:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
 // CHECK-VEC16-IDX32:       %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
-// CHECK-VEC16-IDX32:       %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[c0]] : index] : vector<16xf32>
+// CHECK-VEC16-IDX32:       %[[r:.*]] = vector.insert %[[l]], %[[v0]] [0] : f32 into vector<16xf32>
 // CHECK-VEC16-IDX32:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
 // CHECK-VEC16-IDX32:         %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
 // CHECK-VEC16-IDX32:         %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
@@ -278,7 +278,7 @@ func.func @mul_s(%arga: tensor<1024xf32, #SparseVector>,
 // CHECK-VEC4-SVE:       %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
 // CHECK-VEC4-SVE:       %[[vscale:.*]] = vector.vscale
 // CHECK-VEC4-SVE:       %[[step:.*]] = arith.muli %[[vscale]], %[[c4]] : index
-// CHECK-VEC4-SVE:       %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[c0]] : index] : vector<[4]xf32>
+// CHECK-VEC4-SVE:       %[[r:.*]] = vector.insert %[[l]], %[[v0]] [0] : f32 into vector<[4]xf32>
 // CHECK-VEC4-SVE:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[step]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<[4]xf32>) {
 // CHECK-VEC4-SVE:         %[[sub:.*]] = affine.min #[[$map]](%[[c1024]], %[[i]])[%[[step]]]
 // CHECK-VEC4-SVE:         %[[mask:.*]] = vector.create_mask %[[sub]] : vector<[4]xi1>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir
index f4b565c7f9c8..0ab72897d7bc 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir
@@ -82,7 +82,7 @@
 // CHECK:               %[[VAL_57:.*]] = arith.select %[[VAL_39]], %[[VAL_56]], %[[VAL_32]] : index
 // CHECK:               scf.yield %[[VAL_55]], %[[VAL_57]], %[[VAL_58:.*]] : index, index, f64
 // CHECK:             } attributes {"Emitted from" = "linalg.generic"}
-// CHECK:             %[[VAL_59:.*]] = vector.insertelement %[[VAL_60:.*]]#2, %[[VAL_4]]{{\[}}%[[VAL_6]] : index] : vector<8xf64>
+// CHECK:             %[[VAL_59:.*]] = vector.insert %[[VAL_60:.*]]#2, %[[VAL_4]] [0] : f64 into vector<8xf64>
 // CHECK:             %[[VAL_61:.*]] = scf.for %[[VAL_62:.*]] = %[[VAL_60]]#0 to %[[VAL_21]] step %[[VAL_3]] iter_args(%[[VAL_63:.*]] = %[[VAL_59]]) -> (vector<8xf64>) {
 // CHECK:               %[[VAL_64:.*]] = affine.min #map(%[[VAL_21]], %[[VAL_62]]){{\[}}%[[VAL_3]]]
 // CHECK:               %[[VAL_65:.*]] = vector.create_mask %[[VAL_64]] : vector<8xi1>
diff --git a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
index 01b717090e87..6effbbf98abb 100644
--- a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
+++ b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
@@ -172,7 +172,7 @@ func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor<i13>,
 // CHECK-ON:           %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
 // CHECK-ON:           %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK-ON:           %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:           %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_4]]{{\[}}%[[VAL_3]] : index] : vector<8xi32>
+// CHECK-ON:           %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_4]] [0] : i32 into vector<8xi32>
 // CHECK-ON:           %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xi32>) {
 // CHECK-ON:             %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:             %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>
@@ -247,7 +247,7 @@ func.func @sparse_reduction_subi(%argx: tensor<i32>,
 // CHECK-ON:  %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
 // CHECK-ON:  %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:  %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:  %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xi32>
+// CHECK-ON:  %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_3]] [0] : i32 into vector<8xi32>
 // CHECK-ON:  %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xi32>) {
 // CHECK-ON:    %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:    %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>
@@ -323,7 +323,7 @@ func.func @sparse_reduction_xor(%argx: tensor<i32>,
 // CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
 // CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xi32>
+// CHECK-ON:   %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_3]] [0] : i32 into vector<8xi32>
 // CHECK-ON:   %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xi32>) {
 // CHECK-ON:     %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:     %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>
@@ -399,7 +399,7 @@ func.func @sparse_reduction_addi(%argx: tensor<i32>,
 // CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<f32>
 // CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xf32>
+// CHECK-ON:   %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_3]] [0] : f32 into vector<8xf32>
 // CHECK-ON:   %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xf32>) {
 // CHECK-ON:     %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:     %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>
@@ -475,7 +475,7 @@ func.func @sparse_reduction_subf(%argx: tensor<f32>,
 // CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<f32>
 // CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xf32>
+// CHECK-ON:   %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_3]] [0] : f32 into vector<8xf32>
 // CHECK-ON:   %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xf32>) {
 // CHECK-ON:     %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:     %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>

From 4a4035c86b0dd2b1aa09bb2ff4b6788c2bf88745 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Thu, 12 Jun 2025 14:52:07 -0700
Subject: [PATCH 0211/1322] [CIR] Add support for delegating constructors
 (#143932)

This change adds the necessary support for handling delegating
constructors in ClangIR. The implementation is kept as small as possible
by not handling any other sort of initialization (members, base classes,
etc.). That will be added in a future commit.
---
 clang/include/clang/CIR/MissingFeatures.h     |  2 +-
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          |  3 +-
 clang/lib/CIR/CodeGen/CIRGenClass.cpp         | 38 +++++++++++++
 clang/lib/CIR/CodeGen/CIRGenDecl.cpp          |  7 ++-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          | 21 ++++++++
 clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 10 +++-
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp      | 10 +---
 clang/lib/CIR/CodeGen/CIRGenFunction.h        | 13 +++++
 clang/lib/CIR/CodeGen/CIRGenValue.h           | 53 ++++++++++++++++---
 clang/test/CIR/CodeGen/ctor.cpp               | 46 ++++++++++++++++
 10 files changed, 183 insertions(+), 20 deletions(-)

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index fbd15d5c886d..97b933657d74 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -172,6 +172,7 @@ struct MissingFeatures {
   static bool astVarDeclInterface() { return false; }
   static bool stackSaveOp() { return false; }
   static bool aggValueSlot() { return false; }
+  static bool aggValueSlotMayOverlap() { return false; }
   static bool generateDebugInfo() { return false; }
   static bool pointerOverflowSanitizer() { return false; }
   static bool fpConstraints() { return false; }
@@ -227,7 +228,6 @@ struct MissingFeatures {
   static bool implicitConstructorArgs() { return false; }
   static bool intrinsics() { return false; }
   static bool attributeNoBuiltin() { return false; }
-  static bool emitCtorPrologue() { return false; }
   static bool thunks() { return false; }
   static bool runCleanupsScope() { return false; }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 67c6a8dd3ef5..5ec720ffd54f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -189,8 +189,7 @@ void CIRGenFunction::emitDelegateCallArg(CallArgList &args,
     // For the most part, we just need to load the alloca, except that aggregate
     // r-values are actually pointers to temporaries.
   } else {
-    cgm.errorNYI(param->getSourceRange(),
-                 "emitDelegateCallArg: convertTempToRValue");
+    args.add(convertTempToRValue(local, type, loc), type);
   }
 
   // Deactivate the cleanup for the callee-destructed param that was pushed.
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index bb4b451c9924..e59a1fdb837c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -53,6 +53,21 @@ bool CIRGenFunction::isConstructorDelegationValid(
   return true;
 }
 
+/// This routine generates necessary code to initialize base classes and
+/// non-static data members belonging to this constructor.
+void CIRGenFunction::emitCtorPrologue(const CXXConstructorDecl *cd,
+                                      CXXCtorType ctorType,
+                                      FunctionArgList &args) {
+  if (cd->isDelegatingConstructor())
+    return emitDelegatingCXXConstructorCall(cd, args);
+
+  if (cd->getNumCtorInitializers() != 0) {
+    // There's much more to do here.
+    cgm.errorNYI(cd->getSourceRange(), "emitCtorPrologue: any initializer");
+    return;
+  }
+}
+
 Address CIRGenFunction::loadCXXThisAddress() {
   assert(curFuncDecl && "loading 'this' without a func declaration?");
   assert(isa<CXXMethodDecl>(curFuncDecl));
@@ -102,6 +117,29 @@ void CIRGenFunction::emitDelegateCXXConstructorCall(
                          /*Delegating=*/true, thisAddr, delegateArgs, loc);
 }
 
+void CIRGenFunction::emitDelegatingCXXConstructorCall(
+    const CXXConstructorDecl *ctor, const FunctionArgList &args) {
+  assert(ctor->isDelegatingConstructor());
+
+  Address thisPtr = loadCXXThisAddress();
+
+  assert(!cir::MissingFeatures::objCGC());
+  assert(!cir::MissingFeatures::sanitizers());
+  AggValueSlot aggSlot = AggValueSlot::forAddr(
+      thisPtr, Qualifiers(), AggValueSlot::IsDestructed,
+      AggValueSlot::IsNotAliased, AggValueSlot::MayOverlap,
+      AggValueSlot::IsNotZeroed);
+
+  emitAggExpr(ctor->init_begin()[0]->getInit(), aggSlot);
+
+  const CXXRecordDecl *classDecl = ctor->getParent();
+  if (cgm.getLangOpts().Exceptions && !classDecl->hasTrivialDestructor()) {
+    cgm.errorNYI(ctor->getSourceRange(),
+                 "emitDelegatingCXXConstructorCall: exception");
+    return;
+  }
+}
+
 Address CIRGenFunction::getAddressOfBaseClass(
     Address value, const CXXRecordDecl *derived,
     llvm::iterator_range<CastExpr::path_const_iterator> path,
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 80b0172090aa..748c2b5f6fce 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -259,7 +259,12 @@ void CIRGenFunction::emitExprAsInit(const Expr *init, const ValueDecl *d,
     return;
   }
   case cir::TEK_Aggregate:
-    emitAggExpr(init, AggValueSlot::forLValue(lvalue));
+    // The overlap flag here should be calculated.
+    assert(!cir::MissingFeatures::aggValueSlotMayOverlap());
+    emitAggExpr(init,
+                AggValueSlot::forLValue(lvalue, AggValueSlot::IsDestructed,
+                                        AggValueSlot::IsNotAliased,
+                                        AggValueSlot::MayOverlap));
     return;
   }
   llvm_unreachable("bad evaluation kind");
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index f1f86509c9a9..5d04faf443b8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1261,6 +1261,23 @@ Address CIRGenFunction::emitArrayToPointerDecay(const Expr *e) {
   return Address(ptr, addr.getAlignment());
 }
 
+/// Given the address of a temporary variable, produce an r-value of its type.
+RValue CIRGenFunction::convertTempToRValue(Address addr, clang::QualType type,
+                                           clang::SourceLocation loc) {
+  LValue lvalue = makeAddrLValue(addr, type, AlignmentSource::Decl);
+  switch (getEvaluationKind(type)) {
+  case cir::TEK_Complex:
+    cgm.errorNYI(loc, "convertTempToRValue: complex type");
+    return RValue::get(nullptr);
+  case cir::TEK_Aggregate:
+    cgm.errorNYI(loc, "convertTempToRValue: aggregate type");
+    return RValue::get(nullptr);
+  case cir::TEK_Scalar:
+    return RValue::get(emitLoadOfScalar(lvalue, loc));
+  }
+  llvm_unreachable("bad evaluation kind");
+}
+
 /// Emit an `if` on a boolean condition, filling `then` and `else` into
 /// appropriated regions.
 mlir::LogicalResult CIRGenFunction::emitIfOnBoolExpr(const Expr *cond,
@@ -1473,6 +1490,10 @@ void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e,
     type = Ctor_Complete;
     break;
   case CXXConstructionKind::Delegating:
+    // We should be emitting a constructor; GlobalDecl will assert this
+    type = curGD.getCtorType();
+    delegating = true;
+    break;
   case CXXConstructionKind::VirtualBase:
   case CXXConstructionKind::NonVirtualBase:
     cgm.errorNYI(e->getSourceRange(),
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index f1df1b79fc48..061123d55b88 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -203,7 +203,11 @@ void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) {
     cgf.cgm.errorNYI("emitInitializationToLValue TEK_Complex");
     break;
   case cir::TEK_Aggregate:
-    cgf.emitAggExpr(e, AggValueSlot::forLValue(lv));
+    cgf.emitAggExpr(e, AggValueSlot::forLValue(lv, AggValueSlot::IsDestructed,
+                                               AggValueSlot::IsNotAliased,
+                                               AggValueSlot::MayOverlap,
+                                               dest.isZeroed()));
+
     return;
   case cir::TEK_Scalar:
     if (lv.isSimple())
@@ -284,6 +288,8 @@ LValue CIRGenFunction::emitAggExprToLValue(const Expr *e) {
   assert(hasAggregateEvaluationKind(e->getType()) && "Invalid argument!");
   Address temp = createMemTemp(e->getType(), getLoc(e->getSourceRange()));
   LValue lv = makeAddrLValue(temp, e->getType());
-  emitAggExpr(e, AggValueSlot::forLValue(lv));
+  emitAggExpr(e, AggValueSlot::forLValue(lv, AggValueSlot::IsNotDestructed,
+                                         AggValueSlot::IsNotAliased,
+                                         AggValueSlot::DoesNotOverlap));
   return lv;
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 53c44c6cc768..c5bd5109343d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -526,14 +526,8 @@ void CIRGenFunction::emitConstructorBody(FunctionArgList &args) {
   // TODO: in restricted cases, we can emit the vbase initializers of a
   // complete ctor and then delegate to the base ctor.
 
-  assert(!cir::MissingFeatures::emitCtorPrologue());
-  if (ctor->isDelegatingConstructor()) {
-    // This will be handled in emitCtorPrologue, but we should emit a diagnostic
-    // rather than silently fail to delegate.
-    cgm.errorNYI(ctor->getSourceRange(),
-                 "emitConstructorBody: delegating ctor");
-    return;
-  }
+  // Emit the constructor prologue, i.e. the base and member initializers.
+  emitCtorPrologue(ctor, ctorType, args);
 
   // TODO(cir): propagate this result via mlir::logical result. Just unreachable
   // now just to have it handled.
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 361dcd5ef1c3..cf672b0c90e6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -474,6 +474,9 @@ public:
 
   bool shouldNullCheckClassCastValue(const CastExpr *ce);
 
+  RValue convertTempToRValue(Address addr, clang::QualType type,
+                             clang::SourceLocation loc);
+
   static bool
   isConstructorDelegationValid(const clang::CXXConstructorDecl *ctor);
 
@@ -797,6 +800,16 @@ public:
                                        const CXXMethodDecl *md,
                                        ReturnValueSlot returnValue);
 
+  void emitCtorPrologue(const clang::CXXConstructorDecl *ctor,
+                        clang::CXXCtorType ctorType, FunctionArgList &args);
+
+  // It's important not to confuse this and emitDelegateCXXConstructorCall.
+  // Delegating constructors are the C++11 feature. The constructor delegate
+  // optimization is used to reduce duplication in the base and complete
+  // constructors where they are substantially the same.
+  void emitDelegatingCXXConstructorCall(const CXXConstructorDecl *ctor,
+                                        const FunctionArgList &args);
+
   mlir::LogicalResult emitDoStmt(const clang::DoStmt &s);
 
   /// Emit an expression as an initializer for an object (variable, field, etc.)
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 208247e16e53..8f52fea31750 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -267,23 +267,64 @@ class AggValueSlot {
   Address addr;
   clang::Qualifiers quals;
 
+  /// This is set to true if some external code is responsible for setting up a
+  /// destructor for the slot.  Otherwise the code which constructs it should
+  /// push the appropriate cleanup.
+  LLVM_PREFERRED_TYPE(bool)
+  [[maybe_unused]] unsigned destructedFlag : 1;
+
   /// This is set to true if the memory in the slot is known to be zero before
   /// the assignment into it.  This means that zero fields don't need to be set.
-  bool zeroedFlag : 1;
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned zeroedFlag : 1;
+
+  /// This is set to true if the slot might be aliased and it's not undefined
+  /// behavior to access it through such an alias.  Note that it's always
+  /// undefined behavior to access a C++ object that's under construction
+  /// through an alias derived from outside the construction process.
+  ///
+  /// This flag controls whether calls that produce the aggregate
+  /// value may be evaluated directly into the slot, or whether they
+  /// must be evaluated into an unaliased temporary and then memcpy'ed
+  /// over.  Since it's invalid in general to memcpy a non-POD C++
+  /// object, it's important that this flag never be set when
+  /// evaluating an expression which constructs such an object.
+  LLVM_PREFERRED_TYPE(bool)
+  [[maybe_unused]] unsigned aliasedFlag : 1;
+
+  /// This is set to true if the tail padding of this slot might overlap
+  /// another object that may have already been initialized (and whose
+  /// value must be preserved by this initialization). If so, we may only
+  /// store up to the dsize of the type. Otherwise we can widen stores to
+  /// the size of the type.
+  LLVM_PREFERRED_TYPE(bool)
+  [[maybe_unused]] unsigned overlapFlag : 1;
 
 public:
+  enum IsDestructed_t { IsNotDestructed, IsDestructed };
   enum IsZeroed_t { IsNotZeroed, IsZeroed };
+  enum IsAliased_t { IsNotAliased, IsAliased };
+  enum Overlap_t { MayOverlap, DoesNotOverlap };
 
-  AggValueSlot(Address addr, clang::Qualifiers quals, bool zeroedFlag)
-      : addr(addr), quals(quals), zeroedFlag(zeroedFlag) {}
+  AggValueSlot(Address addr, clang::Qualifiers quals, bool destructedFlag,
+               bool zeroedFlag, bool aliasedFlag, bool overlapFlag)
+      : addr(addr), quals(quals), destructedFlag(destructedFlag),
+        zeroedFlag(zeroedFlag), aliasedFlag(aliasedFlag),
+        overlapFlag(overlapFlag) {}
 
   static AggValueSlot forAddr(Address addr, clang::Qualifiers quals,
+                              IsDestructed_t isDestructed,
+                              IsAliased_t isAliased, Overlap_t mayOverlap,
                               IsZeroed_t isZeroed = IsNotZeroed) {
-    return AggValueSlot(addr, quals, isZeroed);
+    return AggValueSlot(addr, quals, isDestructed, isZeroed, isAliased,
+                        mayOverlap);
   }
 
-  static AggValueSlot forLValue(const LValue &lv) {
-    return forAddr(lv.getAddress(), lv.getQuals());
+  static AggValueSlot forLValue(const LValue &LV, IsDestructed_t isDestructed,
+                                IsAliased_t isAliased, Overlap_t mayOverlap,
+                                IsZeroed_t isZeroed = IsNotZeroed) {
+    return forAddr(LV.getAddress(), LV.getQuals(), isDestructed, isAliased,
+                   mayOverlap, isZeroed);
   }
 
   clang::Qualifiers getQualifiers() const { return quals; }
diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp
index 3b4191fd74c9..1a36eb0d9d3a 100644
--- a/clang/test/CIR/CodeGen/ctor.cpp
+++ b/clang/test/CIR/CodeGen/ctor.cpp
@@ -67,3 +67,49 @@ void bar() {
 // CHECK-NEXT:    %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
 // CHECK-NEXT:    cir.call @_ZN13VariadicStrukC1Eiz(%[[S_ADDR]], %[[ONE]], %[[TWO]], %[[THREE]])
 // CHECK-NEXT:    cir.return
+
+struct DelegatingStruk {
+  int a;
+  DelegatingStruk(int n) { a = n; }
+  DelegatingStruk() : DelegatingStruk(0) {}
+};
+
+void bam() {
+  DelegatingStruk s;
+}
+
+// CHECK:       cir.func @_ZN15DelegatingStrukC2Ei(%arg0: !cir.ptr<!rec_DelegatingStruk>
+// CHECK-SAME:                                     %arg1: !s32i
+// CHECK-NEXT:   %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init]
+// CHECK-NEXT:   %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init]
+// CHECK-NEXT:   cir.store %arg0, %[[THIS_ADDR]]
+// CHECK-NEXT:   cir.store %arg1, %[[N_ADDR]]
+// CHECK-NEXT:   %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[N:.*]] = cir.load{{.*}} %[[N_ADDR]]
+// CHECK-NEXT:   %[[A_ADDR:.*]] = cir.get_member %[[THIS]][0] {name = "a"}
+// CHECK-NEXT:   cir.store{{.*}} %[[N]], %[[A_ADDR]]
+// CHECK-NEXT:   cir.return
+
+// CHECK:       cir.func @_ZN15DelegatingStrukC1Ei(%arg0: !cir.ptr<!rec_DelegatingStruk>
+// CHECK-SAME:                                     %arg1: !s32i
+// CHECK-NEXT:   %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init]
+// CHECK-NEXT:   %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init]
+// CHECK-NEXT:   cir.store %arg0, %[[THIS_ADDR]]
+// CHECK-NEXT:   cir.store %arg1, %[[N_ADDR]]
+// CHECK-NEXT:   %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[N:.*]] = cir.load{{.*}} %[[N_ADDR]]
+// CHECK-NEXT:   cir.call @_ZN15DelegatingStrukC2Ei(%[[THIS]], %[[N]])
+// CHECK-NEXT:   cir.return
+
+// CHECK: cir.func @_ZN15DelegatingStrukC1Ev(%arg0: !cir.ptr<!rec_DelegatingStruk>
+// CHECK-NEXT:   %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init]
+// CHECK-NEXT:   cir.store %arg0, %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CHECK-NEXT:   cir.call @_ZN15DelegatingStrukC1Ei(%[[THIS]], %[[ZERO]])
+// CHECK-NEXT:   cir.return
+
+// CHECK: cir.func @_Z3bamv
+// CHECK-NEXT:    %[[S_ADDR:.*]] = cir.alloca {{.*}} ["s", init]
+// CHECK-NEXT:    cir.call @_ZN15DelegatingStrukC1Ev(%[[S_ADDR]])
+// CHECK-NEXT:    cir.return

From 8a2895ad89793591cd3f0114bc56cd345f651823 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 12 Jun 2025 14:52:43 -0700
Subject: [PATCH 0212/1322] [lldb] Implement JSON RPC (newline delimited)
 Transport (#143946)

This PR implements JSON RPC-style (i.e. newline delimited) JSON
transport. I moved the existing transport tests from DAP to Host and
moved the PipeTest base class into TestingSupport so it can be shared by
both.
---
 lldb/include/lldb/Host/JSONTransport.h        |  23 ++-
 lldb/source/Host/common/JSONTransport.cpp     |  37 +++-
 lldb/unittests/DAP/CMakeLists.txt             |   1 -
 lldb/unittests/DAP/TestBase.cpp               |   7 +-
 lldb/unittests/DAP/TestBase.h                 |  13 +-
 lldb/unittests/DAP/TransportTest.cpp          |  98 ----------
 lldb/unittests/Host/CMakeLists.txt            |   1 +
 lldb/unittests/Host/JSONTransportTest.cpp     | 176 ++++++++++++++++++
 .../TestingSupport/Host/PipeTestUtilities.h   |  28 +++
 9 files changed, 260 insertions(+), 124 deletions(-)
 delete mode 100644 lldb/unittests/DAP/TransportTest.cpp
 create mode 100644 lldb/unittests/Host/JSONTransportTest.cpp
 create mode 100644 lldb/unittests/TestingSupport/Host/PipeTestUtilities.h

diff --git a/lldb/include/lldb/Host/JSONTransport.h b/lldb/include/lldb/Host/JSONTransport.h
index 4db5e417ea85..4087cdf2b42f 100644
--- a/lldb/include/lldb/Host/JSONTransport.h
+++ b/lldb/include/lldb/Host/JSONTransport.h
@@ -51,17 +51,17 @@ public:
   }
 };
 
-class TransportClosedError : public llvm::ErrorInfo<TransportClosedError> {
+class TransportInvalidError : public llvm::ErrorInfo<TransportInvalidError> {
 public:
   static char ID;
 
-  TransportClosedError() = default;
+  TransportInvalidError() = default;
 
   void log(llvm::raw_ostream &OS) const override {
-    OS << "transport is closed";
+    OS << "transport IO object invalid";
   }
   std::error_code convertToErrorCode() const override {
-    return llvm::inconvertibleErrorCode();
+    return std::make_error_code(std::errc::not_connected);
   }
 };
 
@@ -121,6 +121,21 @@ protected:
   static constexpr llvm::StringLiteral kHeaderSeparator = "\r\n\r\n";
 };
 
+/// A transport class for JSON RPC.
+class JSONRPCTransport : public JSONTransport {
+public:
+  JSONRPCTransport(lldb::IOObjectSP input, lldb::IOObjectSP output)
+      : JSONTransport(input, output) {}
+  virtual ~JSONRPCTransport() = default;
+
+protected:
+  virtual llvm::Error WriteImpl(const std::string &message) override;
+  virtual llvm::Expected<std::string>
+  ReadImpl(const std::chrono::microseconds &timeout) override;
+
+  static constexpr llvm::StringLiteral kMessageSeparator = "\n";
+};
+
 } // namespace lldb_private
 
 #endif
diff --git a/lldb/source/Host/common/JSONTransport.cpp b/lldb/source/Host/common/JSONTransport.cpp
index 103c76d25daf..1a0851d5c436 100644
--- a/lldb/source/Host/common/JSONTransport.cpp
+++ b/lldb/source/Host/common/JSONTransport.cpp
@@ -31,7 +31,7 @@ static Expected<std::string>
 ReadFull(IOObject &descriptor, size_t length,
          std::optional<std::chrono::microseconds> timeout = std::nullopt) {
   if (!descriptor.IsValid())
-    return llvm::make_error<TransportClosedError>();
+    return llvm::make_error<TransportInvalidError>();
 
   bool timeout_supported = true;
   // FIXME: SelectHelper does not work with NativeFile on Win32.
@@ -92,7 +92,7 @@ void JSONTransport::Log(llvm::StringRef message) {
 Expected<std::string>
 HTTPDelimitedJSONTransport::ReadImpl(const std::chrono::microseconds &timeout) {
   if (!m_input || !m_input->IsValid())
-    return createStringError("transport output is closed");
+    return llvm::make_error<TransportInvalidError>();
 
   IOObject *input = m_input.get();
   Expected<std::string> message_header =
@@ -131,7 +131,7 @@ HTTPDelimitedJSONTransport::ReadImpl(const std::chrono::microseconds &timeout) {
 
 Error HTTPDelimitedJSONTransport::WriteImpl(const std::string &message) {
   if (!m_output || !m_output->IsValid())
-    return llvm::make_error<TransportClosedError>();
+    return llvm::make_error<TransportInvalidError>();
 
   Log(llvm::formatv("<-- {0}", message).str());
 
@@ -142,6 +142,35 @@ Error HTTPDelimitedJSONTransport::WriteImpl(const std::string &message) {
   return m_output->Write(Output.data(), num_bytes).takeError();
 }
 
+Expected<std::string>
+JSONRPCTransport::ReadImpl(const std::chrono::microseconds &timeout) {
+  if (!m_input || !m_input->IsValid())
+    return make_error<TransportInvalidError>();
+
+  IOObject *input = m_input.get();
+  Expected<std::string> raw_json =
+      ReadUntil(*input, kMessageSeparator, timeout);
+  if (!raw_json)
+    return raw_json.takeError();
+
+  Log(llvm::formatv("--> {0}", *raw_json).str());
+
+  return *raw_json;
+}
+
+Error JSONRPCTransport::WriteImpl(const std::string &message) {
+  if (!m_output || !m_output->IsValid())
+    return llvm::make_error<TransportInvalidError>();
+
+  Log(llvm::formatv("<-- {0}", message).str());
+
+  std::string Output;
+  llvm::raw_string_ostream OS(Output);
+  OS << message << kMessageSeparator;
+  size_t num_bytes = Output.size();
+  return m_output->Write(Output.data(), num_bytes).takeError();
+}
+
 char TransportEOFError::ID;
 char TransportTimeoutError::ID;
-char TransportClosedError::ID;
+char TransportInvalidError::ID;
diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt
index 37a6a81ad12a..ee623d341ec6 100644
--- a/lldb/unittests/DAP/CMakeLists.txt
+++ b/lldb/unittests/DAP/CMakeLists.txt
@@ -7,7 +7,6 @@ add_lldb_unittest(DAPTests
   LLDBUtilsTest.cpp
   ProtocolTypesTest.cpp
   TestBase.cpp
-  TransportTest.cpp
   VariablesTest.cpp
 
   LINK_COMPONENTS
diff --git a/lldb/unittests/DAP/TestBase.cpp b/lldb/unittests/DAP/TestBase.cpp
index 4063b3425031..27ad42686fbb 100644
--- a/lldb/unittests/DAP/TestBase.cpp
+++ b/lldb/unittests/DAP/TestBase.cpp
@@ -28,13 +28,8 @@ using lldb_private::File;
 using lldb_private::NativeFile;
 using lldb_private::Pipe;
 
-void PipeBase::SetUp() {
-  ASSERT_THAT_ERROR(input.CreateNew(false).ToError(), Succeeded());
-  ASSERT_THAT_ERROR(output.CreateNew(false).ToError(), Succeeded());
-}
-
 void TransportBase::SetUp() {
-  PipeBase::SetUp();
+  PipeTest::SetUp();
   to_dap = std::make_unique<Transport>(
       "to_dap", nullptr,
       std::make_shared<NativeFile>(input.GetReadFileDescriptor(),
diff --git a/lldb/unittests/DAP/TestBase.h b/lldb/unittests/DAP/TestBase.h
index 70b3985271a9..25d37013954d 100644
--- a/lldb/unittests/DAP/TestBase.h
+++ b/lldb/unittests/DAP/TestBase.h
@@ -8,26 +8,17 @@
 
 #include "DAP.h"
 #include "Protocol/ProtocolBase.h"
+#include "TestingSupport/Host/PipeTestUtilities.h"
 #include "Transport.h"
-#include "lldb/Host/Pipe.h"
 #include "llvm/ADT/StringRef.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
 namespace lldb_dap_tests {
 
-/// A base class for tests that need a pair of pipes for communication.
-class PipeBase : public testing::Test {
-protected:
-  lldb_private::Pipe input;
-  lldb_private::Pipe output;
-
-  void SetUp() override;
-};
-
 /// A base class for tests that need transport configured for communicating DAP
 /// messages.
-class TransportBase : public PipeBase {
+class TransportBase : public PipeTest {
 protected:
   std::unique_ptr<lldb_dap::Transport> to_dap;
   std::unique_ptr<lldb_dap::Transport> from_dap;
diff --git a/lldb/unittests/DAP/TransportTest.cpp b/lldb/unittests/DAP/TransportTest.cpp
deleted file mode 100644
index aaf257993af2..000000000000
--- a/lldb/unittests/DAP/TransportTest.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-//===-- TransportTest.cpp -------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Transport.h"
-#include "Protocol/ProtocolBase.h"
-#include "TestBase.h"
-#include "lldb/Host/File.h"
-#include "lldb/Host/Pipe.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Testing/Support/Error.h"
-#include "gtest/gtest.h"
-#include <chrono>
-#include <memory>
-#include <optional>
-
-using namespace llvm;
-using namespace lldb;
-using namespace lldb_dap;
-using namespace lldb_dap_tests;
-using namespace lldb_dap::protocol;
-using lldb_private::File;
-using lldb_private::NativeFile;
-using lldb_private::Pipe;
-using lldb_private::TransportEOFError;
-using lldb_private::TransportTimeoutError;
-
-class TransportTest : public PipeBase {
-protected:
-  std::unique_ptr<Transport> transport;
-
-  void SetUp() override {
-    PipeBase::SetUp();
-    transport = std::make_unique<Transport>(
-        "stdio", nullptr,
-        std::make_shared<NativeFile>(input.GetReadFileDescriptor(),
-                                     File::eOpenOptionReadOnly,
-                                     NativeFile::Unowned),
-        std::make_shared<NativeFile>(output.GetWriteFileDescriptor(),
-                                     File::eOpenOptionWriteOnly,
-                                     NativeFile::Unowned));
-  }
-};
-
-TEST_F(TransportTest, MalformedRequests) {
-  std::string malformed_header = "COnTent-LenGth: -1{}\r\n\r\nnotjosn";
-  ASSERT_THAT_EXPECTED(
-      input.Write(malformed_header.data(), malformed_header.size()),
-      Succeeded());
-  ASSERT_THAT_EXPECTED(
-      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
-      FailedWithMessage(
-          "expected 'Content-Length: ' and got 'COnTent-LenGth: '"));
-}
-
-TEST_F(TransportTest, Read) {
-  std::string json =
-      R"json({"seq": 1, "type": "request", "command": "abc"})json";
-  std::string message =
-      formatv("Content-Length: {0}\r\n\r\n{1}", json.size(), json).str();
-  ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()),
-                       Succeeded());
-  ASSERT_THAT_EXPECTED(
-      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
-      HasValue(testing::VariantWith<Request>(testing::FieldsAre(
-          /*seq=*/1, /*command=*/"abc", /*arguments=*/std::nullopt))));
-}
-
-TEST_F(TransportTest, ReadWithTimeout) {
-  ASSERT_THAT_EXPECTED(
-      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
-      Failed<TransportTimeoutError>());
-}
-
-TEST_F(TransportTest, ReadWithEOF) {
-  input.CloseWriteFileDescriptor();
-  ASSERT_THAT_EXPECTED(
-      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
-      Failed<TransportEOFError>());
-}
-
-TEST_F(TransportTest, Write) {
-  ASSERT_THAT_ERROR(transport->Write(Event{"my-event", std::nullopt}),
-                    Succeeded());
-  output.CloseWriteFileDescriptor();
-  char buf[1024];
-  Expected<size_t> bytes_read =
-      output.Read(buf, sizeof(buf), std::chrono::milliseconds(1));
-  ASSERT_THAT_EXPECTED(bytes_read, Succeeded());
-  ASSERT_EQ(
-      StringRef(buf, *bytes_read),
-      StringRef("Content-Length: 43\r\n\r\n"
-                R"json({"event":"my-event","seq":0,"type":"event"})json"));
-}
diff --git a/lldb/unittests/Host/CMakeLists.txt b/lldb/unittests/Host/CMakeLists.txt
index 5b8deed00af8..3b20f1d723d1 100644
--- a/lldb/unittests/Host/CMakeLists.txt
+++ b/lldb/unittests/Host/CMakeLists.txt
@@ -6,6 +6,7 @@ set (FILES
   HostInfoTest.cpp
   HostTest.cpp
   MainLoopTest.cpp
+  JSONTransportTest.cpp
   NativeProcessProtocolTest.cpp
   PipeTest.cpp
   ProcessLaunchInfoTest.cpp
diff --git a/lldb/unittests/Host/JSONTransportTest.cpp b/lldb/unittests/Host/JSONTransportTest.cpp
new file mode 100644
index 000000000000..f1ec5e03bbec
--- /dev/null
+++ b/lldb/unittests/Host/JSONTransportTest.cpp
@@ -0,0 +1,176 @@
+//===-- JSONTransportTest.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/JSONTransport.h"
+#include "TestingSupport/Host/PipeTestUtilities.h"
+#include "lldb/Host/File.h"
+
+using namespace llvm;
+using namespace lldb_private;
+
+namespace {
+template <typename T> class JSONTransportTest : public PipeTest {
+protected:
+  std::unique_ptr<JSONTransport> transport;
+
+  void SetUp() override {
+    PipeTest::SetUp();
+    transport = std::make_unique<T>(
+        std::make_shared<NativeFile>(input.GetReadFileDescriptor(),
+                                     File::eOpenOptionReadOnly,
+                                     NativeFile::Unowned),
+        std::make_shared<NativeFile>(output.GetWriteFileDescriptor(),
+                                     File::eOpenOptionWriteOnly,
+                                     NativeFile::Unowned));
+  }
+};
+
+class HTTPDelimitedJSONTransportTest
+    : public JSONTransportTest<HTTPDelimitedJSONTransport> {
+public:
+  using JSONTransportTest::JSONTransportTest;
+};
+
+class JSONRPCTransportTest : public JSONTransportTest<JSONRPCTransport> {
+public:
+  using JSONTransportTest::JSONTransportTest;
+};
+
+struct JSONTestType {
+  std::string str;
+};
+
+llvm::json::Value toJSON(const JSONTestType &T) {
+  return llvm::json::Object{{"str", T.str}};
+}
+
+bool fromJSON(const llvm::json::Value &V, JSONTestType &T, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  return O && O.map("str", T.str);
+}
+} // namespace
+
+TEST_F(HTTPDelimitedJSONTransportTest, MalformedRequests) {
+  std::string malformed_header = "COnTent-LenGth: -1{}\r\n\r\nnotjosn";
+  ASSERT_THAT_EXPECTED(
+      input.Write(malformed_header.data(), malformed_header.size()),
+      Succeeded());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      FailedWithMessage(
+          "expected 'Content-Length: ' and got 'COnTent-LenGth: '"));
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, Read) {
+  std::string json = R"json({"str": "foo"})json";
+  std::string message =
+      formatv("Content-Length: {0}\r\n\r\n{1}", json.size(), json).str();
+  ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()),
+                       Succeeded());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      HasValue(testing::FieldsAre(/*str=*/"foo")));
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, ReadWithEOF) {
+  input.CloseWriteFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportEOFError>());
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, ReadAfterClosed) {
+  input.CloseReadFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, InvalidTransport) {
+  transport = std::make_unique<HTTPDelimitedJSONTransport>(nullptr, nullptr);
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportInvalidError>());
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, Write) {
+  ASSERT_THAT_ERROR(transport->Write(JSONTestType{"foo"}), Succeeded());
+  output.CloseWriteFileDescriptor();
+  char buf[1024];
+  Expected<size_t> bytes_read =
+      output.Read(buf, sizeof(buf), std::chrono::milliseconds(1));
+  ASSERT_THAT_EXPECTED(bytes_read, Succeeded());
+  ASSERT_EQ(StringRef(buf, *bytes_read), StringRef("Content-Length: 13\r\n\r\n"
+                                                   R"json({"str":"foo"})json"));
+}
+
+TEST_F(JSONRPCTransportTest, MalformedRequests) {
+  std::string malformed_header = "notjson\n";
+  ASSERT_THAT_EXPECTED(
+      input.Write(malformed_header.data(), malformed_header.size()),
+      Succeeded());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
+
+TEST_F(JSONRPCTransportTest, Read) {
+  std::string json = R"json({"str": "foo"})json";
+  std::string message = formatv("{0}\n", json).str();
+  ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()),
+                       Succeeded());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      HasValue(testing::FieldsAre(/*str=*/"foo")));
+}
+
+TEST_F(JSONRPCTransportTest, ReadWithEOF) {
+  input.CloseWriteFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportEOFError>());
+}
+
+TEST_F(JSONRPCTransportTest, ReadAfterClosed) {
+  input.CloseReadFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
+
+TEST_F(JSONRPCTransportTest, Write) {
+  ASSERT_THAT_ERROR(transport->Write(JSONTestType{"foo"}), Succeeded());
+  output.CloseWriteFileDescriptor();
+  char buf[1024];
+  Expected<size_t> bytes_read =
+      output.Read(buf, sizeof(buf), std::chrono::milliseconds(1));
+  ASSERT_THAT_EXPECTED(bytes_read, Succeeded());
+  ASSERT_EQ(StringRef(buf, *bytes_read), StringRef(R"json({"str":"foo"})json"
+                                                   "\n"));
+}
+
+TEST_F(JSONRPCTransportTest, InvalidTransport) {
+  transport = std::make_unique<JSONRPCTransport>(nullptr, nullptr);
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportInvalidError>());
+}
+
+#ifndef _WIN32
+TEST_F(HTTPDelimitedJSONTransportTest, ReadWithTimeout) {
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportTimeoutError>());
+}
+
+TEST_F(JSONRPCTransportTest, ReadWithTimeout) {
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportTimeoutError>());
+}
+#endif
diff --git a/lldb/unittests/TestingSupport/Host/PipeTestUtilities.h b/lldb/unittests/TestingSupport/Host/PipeTestUtilities.h
new file mode 100644
index 000000000000..50d5d4117c89
--- /dev/null
+++ b/lldb/unittests/TestingSupport/Host/PipeTestUtilities.h
@@ -0,0 +1,28 @@
+//===-- PipeTestUtilities.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_UNITTESTS_TESTINGSUPPORT_PIPETESTUTILITIES_H
+#define LLDB_UNITTESTS_TESTINGSUPPORT_PIPETESTUTILITIES_H
+
+#include "lldb/Host/Pipe.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+/// A base class for tests that need a pair of pipes for communication.
+class PipeTest : public testing::Test {
+protected:
+  lldb_private::Pipe input;
+  lldb_private::Pipe output;
+
+  void SetUp() override {
+    ASSERT_THAT_ERROR(input.CreateNew(false).ToError(), llvm::Succeeded());
+    ASSERT_THAT_ERROR(output.CreateNew(false).ToError(), llvm::Succeeded());
+  }
+};
+
+#endif

From 26f91610011f1a23cb306d61bbc1fafded7d077d Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Thu, 12 Jun 2025 23:13:13 +0100
Subject: [PATCH 0213/1322] [lit] cleanup unused imports (#143930)

Remove imports that are not used in some lit test files.
---
 lld/test/Unit/lit.cfg.py         | 1 -
 lldb/test/API/lit.cfg.py         | 2 --
 lldb/test/Shell/lit.cfg.py       | 5 +----
 lldb/test/lit.cfg.py             | 3 ---
 llvm/utils/lit/lit/LitConfig.py  | 6 ++----
 llvm/utils/lit/lit/TestRunner.py | 6 ------
 llvm/utils/lit/lit/discovery.py  | 2 +-
 llvm/utils/lit/lit/worker.py     | 2 --
 8 files changed, 4 insertions(+), 23 deletions(-)

diff --git a/lld/test/Unit/lit.cfg.py b/lld/test/Unit/lit.cfg.py
index 1cf890a05cb2..47375db517e9 100644
--- a/lld/test/Unit/lit.cfg.py
+++ b/lld/test/Unit/lit.cfg.py
@@ -3,7 +3,6 @@
 # Configuration file for the 'lit' test runner.
 
 import os
-import subprocess
 
 import lit.formats
 
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 646a446c86fd..04b360e8d330 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -9,8 +9,6 @@ import shutil
 import subprocess
 import sys
 
-import lit.formats
-
 # name: The name of this test suite.
 config.name = "lldb-api"
 
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py
index ab6113767187..6f0e017fb7cb 100644
--- a/lldb/test/Shell/lit.cfg.py
+++ b/lldb/test/Shell/lit.cfg.py
@@ -7,12 +7,9 @@ import re
 import shutil
 import site
 import subprocess
-import sys
 
-import lit.formats
+import lit.util
 from lit.llvm import llvm_config
-from lit.llvm.subst import FindTool
-from lit.llvm.subst import ToolSubst
 
 site.addsitedir(os.path.dirname(__file__))
 from helper import toolchain
diff --git a/lldb/test/lit.cfg.py b/lldb/test/lit.cfg.py
index eefc32aabd16..6a4255c2627d 100644
--- a/lldb/test/lit.cfg.py
+++ b/lldb/test/lit.cfg.py
@@ -2,9 +2,6 @@
 
 import os
 
-import lit.formats
-from lit.llvm import llvm_config
-
 # This is the top level configuration. Most of these configuration options will
 # be overriden by individual lit configuration files in the test
 # subdirectories. Anything configured here will *not* be loaded when pointing
diff --git a/llvm/utils/lit/lit/LitConfig.py b/llvm/utils/lit/lit/LitConfig.py
index cb4aef6f72a8..5bb2d3c5c986 100644
--- a/llvm/utils/lit/lit/LitConfig.py
+++ b/llvm/utils/lit/lit/LitConfig.py
@@ -1,14 +1,12 @@
 from __future__ import absolute_import
+
 import inspect
 import os
-import platform
 import sys
 
-import lit.Test
-import lit.formats
-import lit.TestingConfig
 import lit.util
 
+
 # LitConfig must be a new style class for properties to work
 class LitConfig(object):
     """LitConfig - Configuration data for a 'lit' test runner instance, shared
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 73db67aedb73..1d3bf8e4e8df 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -1,7 +1,4 @@
 from __future__ import absolute_import
-import errno
-import io
-import itertools
 import getopt
 import os, signal, subprocess, sys
 import re
@@ -12,11 +9,8 @@ import shlex
 import shutil
 import tempfile
 import threading
-import typing
 from typing import Optional, Tuple
 
-import io
-
 try:
     from StringIO import StringIO
 except ImportError:
diff --git a/llvm/utils/lit/lit/discovery.py b/llvm/utils/lit/lit/discovery.py
index 2e7f90c6bb0c..2e93bacc1236 100644
--- a/llvm/utils/lit/lit/discovery.py
+++ b/llvm/utils/lit/lit/discovery.py
@@ -6,8 +6,8 @@ import copy
 import os
 import sys
 
+from lit import Test, util
 from lit.TestingConfig import TestingConfig
-from lit import LitConfig, Test, util
 
 
 def chooseConfigFileFromDir(dir, config_names):
diff --git a/llvm/utils/lit/lit/worker.py b/llvm/utils/lit/lit/worker.py
index 8e78bfd45d38..dbc3ab53bc62 100644
--- a/llvm/utils/lit/lit/worker.py
+++ b/llvm/utils/lit/lit/worker.py
@@ -12,8 +12,6 @@ import time
 import traceback
 
 import lit.Test
-import lit.util
-
 
 _lit_config = None
 _parallelism_semaphores = None

From 2ee8fdbfddcca86ac079104718e6fda3aabed0eb Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Thu, 12 Jun 2025 22:14:28 +0000
Subject: [PATCH 0214/1322] [libc] Prevent building wchar on MacOS (#143978)

Prevent building wchar on macos as it depends on uchar.h which isn't
available
---
 libc/src/__support/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 309cde76370f..7e85136c0885 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -391,6 +391,10 @@ add_subdirectory(fixed_point)
 
 add_subdirectory(time)
 
-add_subdirectory(wchar)
+# Requires access to uchar header which is not on macos
+# Therefore, cannot currently build this on macos in overlay mode
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+  add_subdirectory(wchar)
+endif()
 
 add_subdirectory(math)

From 2b8f82b8308fc9df0a74cdd61a1257d9eb51189c Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Thu, 12 Jun 2025 23:40:57 +0100
Subject: [PATCH 0215/1322] [RISCV] Remove implicit $vl def on vleNff pseudos
 (#143935)

In #90049 we removed the side effect flag on the vleNff pseudos with the
reasoning that we modelled the effect of setting vl as an output
operand.

This extends this further by removing the implicit def on vl, inserting
it back in RISCVInsertVSETVLI when we also emit the PseudoReadVL.

The motiviation for this is to make it easier to handle vleff in more
places in RISCVVectorPeephole in a follow up patch, which in turn will
make migrating the last vmerge peephole over from RISCVISelDAGToDAG
easier.

Some of these tests claim that the vleff shouldn't be deleted when none
of its values are used, but these are from the initial commit in
3b5430eb0dad5. I'm not sure if these still hold today?

This also moves the fault-only-first predicate to
RISCVInstrPredicates.td since we can't rely on the implicit vl operand
anymore.
---
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp            |  2 +-
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp         |  5 +++--
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp             |  5 -----
 llvm/lib/Target/RISCV/RISCVInstrInfo.h               |  2 --
 llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td      |  6 ------
 llvm/lib/Target/RISCV/RISCVInstrPredicates.td        |  7 +++++++
 llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll | 12 ++++++------
 llvm/test/CodeGen/RISCV/rvv/vleff.ll                 |  5 -----
 llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll     |  4 ----
 llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll     |  4 ----
 10 files changed, 17 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 72f1596d79a0..4fb71a3ed000 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -1101,7 +1101,7 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
   if (RISCVII::hasRoundModeOp(TSFlags))
     --NumOps;
 
-  bool hasVLOutput = RISCV::isFaultFirstLoad(*MI);
+  bool hasVLOutput = RISCVInstrInfo::isFaultOnlyFirstLoad(*MI);
   for (unsigned OpNo = 0; OpNo != NumOps; ++OpNo) {
     const MachineOperand &MO = MI->getOperand(OpNo);
     // Skip vl output. It should be the second output.
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 53192e9dfe6c..9a513891b765 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1253,7 +1253,7 @@ void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
     return;
   }
 
-  if (RISCV::isFaultFirstLoad(MI)) {
+  if (RISCVInstrInfo::isFaultOnlyFirstLoad(MI)) {
     // Update AVL to vl-output of the fault first load.
     assert(MI.getOperand(1).getReg().isVirtual());
     if (LIS) {
@@ -1756,7 +1756,7 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
 void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
   for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
     MachineInstr &MI = *I++;
-    if (RISCV::isFaultFirstLoad(MI)) {
+    if (RISCVInstrInfo::isFaultOnlyFirstLoad(MI)) {
       Register VLOutput = MI.getOperand(1).getReg();
       assert(VLOutput.isVirtual());
       if (!MI.getOperand(1).isDead()) {
@@ -1774,6 +1774,7 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
       }
       // We don't use the vl output of the VLEFF/VLSEGFF anymore.
       MI.getOperand(1).setReg(RISCV::X0);
+      MI.addRegisterDefined(RISCV::VL, MRI->getTargetRegisterInfo());
     }
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 86a4e8e370ee..e5d29e1a8b47 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4568,11 +4568,6 @@ RISCV::isRVVSpillForZvlsseg(unsigned Opcode) {
   }
 }
 
-bool RISCV::isFaultFirstLoad(const MachineInstr &MI) {
-  return MI.getNumExplicitDefs() == 2 &&
-         MI.modifiesRegister(RISCV::VL, /*TRI=*/nullptr) && !MI.isInlineAsm();
-}
-
 bool RISCV::hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2) {
   int16_t MI1FrmOpIdx =
       RISCV::getNamedOperandIdx(MI1.getOpcode(), RISCV::OpName::frm);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index b099acd81e99..8260949cf918 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -357,8 +357,6 @@ bool isRVVSpill(const MachineInstr &MI);
 std::optional<std::pair<unsigned, unsigned>>
 isRVVSpillForZvlsseg(unsigned Opcode);
 
-bool isFaultFirstLoad(const MachineInstr &MI);
-
 // Return true if both input instructions have equal rounding mode. If at least
 // one of the instructions does not have rounding mode, false will be returned.
 bool hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 281f8d55932b..f9fc6f0be380 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -6144,8 +6144,6 @@ defm PseudoVSUX : VPseudoIStore<Ordered=false>;
 // 7.7. Unit-stride Fault-Only-First Loads
 //===----------------------------------------------------------------------===//
 
-// vleff may update VL register
-let Defs = [VL] in
 defm PseudoVL : VPseudoFFLoad;
 
 //===----------------------------------------------------------------------===//
@@ -6159,11 +6157,7 @@ defm PseudoVSSEG : VPseudoUSSegStore;
 defm PseudoVSSSEG : VPseudoSSegStore;
 defm PseudoVSOXSEG : VPseudoISegStore<Ordered=true>;
 defm PseudoVSUXSEG : VPseudoISegStore<Ordered=false>;
-
-// vlseg<nf>e<eew>ff.v may update VL register
-let Defs = [VL] in {
 defm PseudoVLSEG : VPseudoUSSegLoadFF;
-}
 
 //===----------------------------------------------------------------------===//
 // 11. Vector Integer Arithmetic Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 4c37cb7e393b..1057eeee31d6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -129,6 +129,13 @@ def isVSlideInstr
                       !instances<Pseudo>("^PseudoVSLIDEUP_VI.*")
                      ])>>>;
 
+def isFaultOnlyFirstLoad
+    : TIIPredicate<"isFaultOnlyFirstLoad",
+                    MCReturnStatement<
+                      CheckOpcode<
+                       !instances<Pseudo>(
+                          "^PseudoVL(SEG[2-8])?E(8|16|32|64)FF_V.*")>>>;
+
 def isNonZeroLoadImmediate
     : TIIPredicate<"isNonZeroLoadImmediate",
                    MCReturnStatement<CheckAll<[
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
index e4235d03cda3..db31866b5637 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
@@ -14,7 +14,7 @@ define i64 @test_vleff_nxv8i8(ptr %p, i64 %vl) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_:%[0-9]+]]:vr, [[PseudoVLE8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.p, align 1)
+  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_:%[0-9]+]]:vr, [[PseudoVLE8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */ :: (load unknown-size from %ir.p, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLE8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -31,7 +31,7 @@ define i64 @test_vleff_nxv8i8_tu(<vscale x 8 x i8> %passthru, ptr %p, i64 %vl) {
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vr = COPY $v8
-  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_:%[0-9]+]]:vr, [[PseudoVLE8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1 [[COPY2]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.p, align 1)
+  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_:%[0-9]+]]:vr, [[PseudoVLE8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1 [[COPY2]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */ :: (load unknown-size from %ir.p, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLE8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -50,7 +50,7 @@ define i64 @test_vleff_nxv8i8_mask(<vscale x 8 x i8> %maskedoff, ptr %p, <vscale
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_MASK:%[0-9]+]]:vrnov0, [[PseudoVLE8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vl :: (load unknown-size from %ir.p, align 1)
+  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_MASK:%[0-9]+]]:vrnov0, [[PseudoVLE8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLE8FF_V_M1_MASK1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -66,7 +66,7 @@ define i64 @test_vlseg2ff_nxv8i8(ptr %base, i64 %vl, ptr %outvl) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
+  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */ :: (load unknown-size from %ir.base, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLSEG2E8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -83,7 +83,7 @@ define i64 @test_vlseg2ff_nxv8i8_tu(target("riscv.vector.tuple", <vscale x 8 x i
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vrn2m1 = COPY $v8_v9
-  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 [[COPY2]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
+  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 [[COPY2]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */ :: (load unknown-size from %ir.base, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLSEG2E8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -102,7 +102,7 @@ define i64 @test_vlseg2ff_nxv8i8_mask(target("riscv.vector.tuple", <vscale x 8 x
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrn2m1nov0 = COPY $v8_v9
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_MASK:%[0-9]+]]:vrn2m1nov0, [[PseudoVLSEG2E8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
+  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_MASK:%[0-9]+]]:vrn2m1nov0, [[PseudoVLSEG2E8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */ :: (load unknown-size from %ir.base, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLSEG2E8FF_V_M1_MASK1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff.ll b/llvm/test/CodeGen/RISCV/rvv/vleff.ll
index 1f3959c1eac8..4c989ce87290 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff.ll
@@ -3016,12 +3016,9 @@ entry:
   ret void
 }
 
-; Test with both outputs dead. Make sure the vleff isn't deleted.
 define void @intrinsic_vleff_dead_all(ptr %0, iXLen %1, ptr %2) nounwind {
 ; CHECK-LABEL: intrinsic_vleff_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vle64ff.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %a = call { <vscale x 1 x double>, iXLen } @llvm.riscv.vleff.nxv1f64(
@@ -3034,8 +3031,6 @@ entry:
 define void @intrinsic_vleff_mask_dead_all(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vleff_mask_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %a = call { <vscale x 1 x double>, iXLen } @llvm.riscv.vleff.mask.nxv1f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
index d8bff08ea551..333ba83f69ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
@@ -66,8 +66,6 @@ entry:
 define void @test_vlseg2ff_dead_all(ptr %base, i32 %vl) {
 ; CHECK-LABEL: test_vlseg2ff_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vlseg2e16ff.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   tail call {target("riscv.vector.tuple", <vscale x 32 x i8>, 2), i32} @llvm.riscv.vlseg2ff.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
@@ -77,8 +75,6 @@ entry:
 define void @test_vlseg2ff_mask_dead_all(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %val, ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg2ff_mask_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vlseg2e16ff.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   tail call {target("riscv.vector.tuple", <vscale x 32 x i8>, 2), i32} @llvm.riscv.vlseg2ff.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %val, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
index 05a5be295cc7..b9e45cc190a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
@@ -66,8 +66,6 @@ entry:
 define void @test_vlseg2ff_dead_all(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg2ff_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vlseg2e16ff.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   tail call {target("riscv.vector.tuple", <vscale x 32 x i8>, 2), i64} @llvm.riscv.vlseg2ff.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
@@ -77,8 +75,6 @@ entry:
 define void @test_vlseg2ff_mask_dead_all(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %val, ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg2ff_mask_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vlseg2e16ff.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   tail call {target("riscv.vector.tuple", <vscale x 32 x i8>, 2), i64} @llvm.riscv.vlseg2ff.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %val, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)

From 703e4460228fa5893dd0dff514ce44442b310b5e Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou@intel.com>
Date: Fri, 13 Jun 2025 06:45:28 +0800
Subject: [PATCH 0216/1322] [Clang] Add check for -mstack-alignment (#143124)

Currently the assertion in Alignment.h is triggered if a wrong value is
passed -mstack-alignment option:
```
Assertion `(Value == 0 || llvm::isPowerOf2_64(Value)) && "Alignment is neither 0 nor
a power of 2"' failed.
```

Added check in clang driver for the value of -mstack-alignment option,
and emitted an error message when the wrong value was passed.
---
 clang/lib/Driver/ToolChains/Clang.cpp | 14 +++++++++++---
 clang/test/Driver/stack-alignment.c   | 11 +++++++++++
 2 files changed, 22 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Driver/stack-alignment.c

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 1d11be1d82be..15acb88c1a8f 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6907,9 +6907,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   Args.addOptInFlag(CmdArgs, options::OPT_mstackrealign,
                     options::OPT_mno_stackrealign);
 
-  if (Args.hasArg(options::OPT_mstack_alignment)) {
-    StringRef alignment = Args.getLastArgValue(options::OPT_mstack_alignment);
-    CmdArgs.push_back(Args.MakeArgString("-mstack-alignment=" + alignment));
+  if (const Arg *A = Args.getLastArg(options::OPT_mstack_alignment)) {
+    StringRef Value = A->getValue();
+    int64_t Alignment = 0;
+    if (Value.getAsInteger(10, Alignment) || Alignment < 0)
+      D.Diag(diag::err_drv_invalid_argument_to_option)
+          << Value << A->getOption().getName();
+    else if (Alignment & (Alignment - 1))
+      D.Diag(diag::err_drv_alignment_not_power_of_two)
+          << A->getAsString(Args) << Value;
+    else
+      CmdArgs.push_back(Args.MakeArgString("-mstack-alignment=" + Value));
   }
 
   if (Args.hasArg(options::OPT_mstack_probe_size)) {
diff --git a/clang/test/Driver/stack-alignment.c b/clang/test/Driver/stack-alignment.c
new file mode 100644
index 000000000000..e1e62c05c32a
--- /dev/null
+++ b/clang/test/Driver/stack-alignment.c
@@ -0,0 +1,11 @@
+// RUN: not %clang -### -mstack-alignment=-1 %s 2>&1 | FileCheck %s --check-prefix=CHECK_NEG_1
+// RUN: %clang -### -mstack-alignment=0 %s 2>&1 | FileCheck %s --check-prefix=CHECK_0
+// RUN: %clang -### -mstack-alignment=1 %s 2>&1 | FileCheck %s --check-prefix=CHECK_1
+// RUN: %clang -### -mstack-alignment=4 %s 2>&1 | FileCheck %s --check-prefix=CHECK_4
+// RUN: not %clang -### -mstack-alignment=5 %s 2>&1 | FileCheck %s --check-prefix=CHECK_5
+
+// CHECK_NEG_1: error: invalid argument '-1' to -mstack-alignment=
+// CHECK_0: -mstack-alignment=0
+// CHECK_1: -mstack-alignment=1
+// CHECK_4: -mstack-alignment=4
+// CHECK_5: error: alignment is not a power of 2 in '-mstack-alignment=5'

From 28c14d475fbd16d07db88c8d12edddfe9cc226ab Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Thu, 12 Jun 2025 15:57:58 -0700
Subject: [PATCH 0217/1322] [libc] Independent strcat/strncat/stpcpy (#142643)

The previous implementations called other entrypoints. This patch fixes
strcat, strncat, and stpcpy to be properly independent.
---
 libc/src/string/CMakeLists.txt |  3 ---
 libc/src/string/stpcpy.cpp     |  5 ++---
 libc/src/string/strcat.cpp     |  9 +++++----
 libc/src/string/strncat.cpp    | 10 +++++-----
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index c3b414d87285..8784bc3750cb 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -87,7 +87,6 @@ add_entrypoint_object(
   HDRS
     stpcpy.h
   DEPENDS
-    .mempcpy
     .string_utils
 )
 
@@ -108,7 +107,6 @@ add_entrypoint_object(
   HDRS
     strcat.h
   DEPENDS
-    .strcpy
     .string_utils
     libc.include.llvm-libc-types.size_t
 )
@@ -265,7 +263,6 @@ add_entrypoint_object(
   HDRS
     strncat.h
   DEPENDS
-    .strncpy
     .string_utils
     libc.include.llvm-libc-types.size_t
 )
diff --git a/libc/src/string/stpcpy.cpp b/libc/src/string/stpcpy.cpp
index 979edd72c1f1..48c0db950ace 100644
--- a/libc/src/string/stpcpy.cpp
+++ b/libc/src/string/stpcpy.cpp
@@ -8,7 +8,6 @@
 
 #include "src/string/stpcpy.h"
 #include "src/__support/macros/config.h"
-#include "src/string/mempcpy.h"
 #include "src/string/string_utils.h"
 
 #include "src/__support/common.h"
@@ -18,8 +17,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(char *, stpcpy,
                    (char *__restrict dest, const char *__restrict src)) {
   size_t size = internal::string_length(src) + 1;
-  char *result =
-      reinterpret_cast<char *>(LIBC_NAMESPACE::mempcpy(dest, src, size));
+  __builtin_memcpy(dest, src, size);
+  char *result = dest + size;
 
   if (result != nullptr)
     return result - 1;
diff --git a/libc/src/string/strcat.cpp b/libc/src/string/strcat.cpp
index 6a6f068bd475..7dce6d15a65c 100644
--- a/libc/src/string/strcat.cpp
+++ b/libc/src/string/strcat.cpp
@@ -9,7 +9,6 @@
 #include "src/string/strcat.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/null_check.h"
-#include "src/string/strcpy.h"
 #include "src/string/string_utils.h"
 
 #include "src/__support/common.h"
@@ -21,9 +20,11 @@ LLVM_LIBC_FUNCTION(char *, strcat,
   LIBC_CRASH_ON_NULLPTR(dest);
   LIBC_CRASH_ON_NULLPTR(src);
   size_t dest_length = internal::string_length(dest);
-  size_t src_length = internal::string_length(src);
-  LIBC_NAMESPACE::strcpy(dest + dest_length, src);
-  dest[dest_length + src_length] = '\0';
+  size_t i;
+  for (i = 0; src[i] != '\0'; ++i)
+    dest[dest_length + i] = src[i];
+
+  dest[dest_length + i] = '\0';
   return dest;
 }
 
diff --git a/libc/src/string/strncat.cpp b/libc/src/string/strncat.cpp
index 4926b7d244d1..6d8bb6960748 100644
--- a/libc/src/string/strncat.cpp
+++ b/libc/src/string/strncat.cpp
@@ -10,7 +10,6 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/null_check.h"
 #include "src/string/string_utils.h"
-#include "src/string/strncpy.h"
 
 #include "src/__support/common.h"
 
@@ -23,11 +22,12 @@ LLVM_LIBC_FUNCTION(char *, strncat,
     LIBC_CRASH_ON_NULLPTR(dest);
     LIBC_CRASH_ON_NULLPTR(src);
   }
-  size_t src_length = internal::string_length(src);
-  size_t copy_amount = src_length > count ? count : src_length;
   size_t dest_length = internal::string_length(dest);
-  LIBC_NAMESPACE::strncpy(dest + dest_length, src, copy_amount);
-  dest[dest_length + copy_amount] = '\0';
+  size_t i;
+  for (i = 0; i < count && src[i] != '\0'; ++i)
+    dest[dest_length + i] = src[i];
+
+  dest[dest_length + i] = '\0';
   return dest;
 }
 

From 32e1360aaa9fbf5e388f9d061fa004b02c0a1359 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Thu, 12 Jun 2025 16:16:14 -0700
Subject: [PATCH 0218/1322] [CIR][NFC] Fix build problems with [[maybe_unused]]
 (#143994)

A recent commit introduced the use of [[maybe_unused]] following
LLVM_PREFFERED_TYPE(bool) on a member variable declaration. I compiled
it with clang 14.0, which doesn't support the `preferred_type` attribute
so I didn't notice a problem. However, starting with clang 18.0, this
reports an error ("an attribute list cannot appear here") because of the
mixing of attribute styles.

This change fixes the problem by replacing [[maybe_unused]] with
LLVM_ATTRIBUTE_UNUSED.
---
 clang/lib/CIR/CodeGen/CIRGenValue.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 8f52fea31750..258ae306f693 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -271,7 +271,7 @@ class AggValueSlot {
   /// destructor for the slot.  Otherwise the code which constructs it should
   /// push the appropriate cleanup.
   LLVM_PREFERRED_TYPE(bool)
-  [[maybe_unused]] unsigned destructedFlag : 1;
+  LLVM_ATTRIBUTE_UNUSED unsigned destructedFlag : 1;
 
   /// This is set to true if the memory in the slot is known to be zero before
   /// the assignment into it.  This means that zero fields don't need to be set.
@@ -290,7 +290,7 @@ class AggValueSlot {
   /// object, it's important that this flag never be set when
   /// evaluating an expression which constructs such an object.
   LLVM_PREFERRED_TYPE(bool)
-  [[maybe_unused]] unsigned aliasedFlag : 1;
+  LLVM_ATTRIBUTE_UNUSED unsigned aliasedFlag : 1;
 
   /// This is set to true if the tail padding of this slot might overlap
   /// another object that may have already been initialized (and whose
@@ -298,7 +298,7 @@ class AggValueSlot {
   /// store up to the dsize of the type. Otherwise we can widen stores to
   /// the size of the type.
   LLVM_PREFERRED_TYPE(bool)
-  [[maybe_unused]] unsigned overlapFlag : 1;
+  LLVM_ATTRIBUTE_UNUSED unsigned overlapFlag : 1;
 
 public:
   enum IsDestructed_t { IsNotDestructed, IsDestructed };

From 70f44ec6feba56b076cf65e02b8876f185efdab9 Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Thu, 12 Jun 2025 16:49:25 -0700
Subject: [PATCH 0219/1322] [libc][NFC] Accept doc fix (#143996)

Docgen updates the docs when the config options are changed. This update
has been waiting since https://github.com/llvm/llvm-project/pull/143187.
---
 libc/docs/configure.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 8d53390ae19b..109412225634 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -29,7 +29,7 @@ to learn about the defaults for your platform and target.
     - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack.
     - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
 * **"errno" options**
-    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
+    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE.
 * **"general" options**
     - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
 * **"math" options**

From e1bb35d067568794585544b8942638c467d13bea Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 12 Jun 2025 16:52:32 -0700
Subject: [PATCH 0220/1322] [bazel] Fix modules build for llvm-libc
 (speculative) (#143995)

---
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel          | 7 +++++++
 .../llvm-project-overlay/libc/test/UnitTest/BUILD.bazel    | 1 +
 2 files changed, 8 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 0cedad285924..84a6b7d23044 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1619,6 +1619,10 @@ libc_header_library(
 libc_support_library(
     name = "__support_libc_errno",
     hdrs = ["src/__support/libc_errno.h"],
+    deps = [
+        ":__support_macros_config",
+        ":hdr_errno_macros",
+    ],
 )
 
 libc_support_library(
@@ -1981,6 +1985,9 @@ libc_support_library(
 libc_support_library(
     name = "__support_math_exp_float_constants",
     hdrs = ["src/__support/math/exp_float_constants.h"],
+    deps = [
+        ":__support_macros_config",
+    ],
 )
 
 libc_support_library(
diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
index b37ec1933023..2354337da2dc 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
@@ -58,6 +58,7 @@ libc_test_library(
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_fputil_fpbits_str",
         "//libc:__support_fputil_rounding_mode",
+        "//libc:__support_libc_errno",
         "//libc:__support_macros_config",
         "//libc:__support_macros_properties_architectures",
         "//libc:__support_macros_properties_types",

From 3ddd137332237918fbb6175c20327fe765d2c4ad Mon Sep 17 00:00:00 2001
From: Zhen Wang <37195552+wangzpgi@users.noreply.github.com>
Date: Thu, 12 Jun 2025 17:08:49 -0700
Subject: [PATCH 0221/1322] [flang] [cuda] Move SetImplicityCUDADevice after
 symbols in block construct are converted to objects (#143791)

`SetImplicitCUDADevice` looks for `symbol.has<ObjectEntityDetails>()` to
set the device attribute before symbols inside block constructs are
converted to ObjectEntity. Fix is to move the call to
`SetImplicitCUDADevice` after those symbols are converted.
---
 flang/lib/Semantics/resolve-names.cpp | 74 ++++++++++++++-------------
 flang/test/Semantics/cuf21.cuf        | 13 +++--
 2 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 7db447aee002..e23e91b674a7 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -2828,6 +2828,16 @@ Scope &ScopeHandler::NonDerivedTypeScope() {
   return currScope_->IsDerivedType() ? currScope_->parent() : *currScope_;
 }
 
+static void SetImplicitCUDADevice(Symbol &symbol) {
+  if (auto *object{symbol.detailsIf<ObjectEntityDetails>()}) {
+    if (!object->cudaDataAttr() && !IsValue(symbol) &&
+        !IsFunctionResult(symbol)) {
+      // Implicitly set device attribute if none is set in device context.
+      object->set_cudaDataAttr(common::CUDADataAttr::Device);
+    }
+  }
+}
+
 void ScopeHandler::PushScope(Scope::Kind kind, Symbol *symbol) {
   PushScope(currScope().MakeScope(kind, symbol));
 }
@@ -2867,9 +2877,35 @@ void ScopeHandler::PopScope() {
   // Entities that are not yet classified as objects or procedures are now
   // assumed to be objects.
   // TODO: Statement functions
+  bool inDeviceSubprogram{false};
+  const Symbol *scopeSym{currScope().GetSymbol()};
+  if (currScope().kind() == Scope::Kind::BlockConstruct) {
+    scopeSym = GetProgramUnitContaining(currScope()).GetSymbol();
+  }
+  if (scopeSym) {
+    if (auto *details{scopeSym->detailsIf<SubprogramDetails>()}) {
+      // Check the current procedure is a device procedure to apply implicit
+      // attribute at the end.
+      if (auto attrs{details->cudaSubprogramAttrs()}) {
+        if (*attrs == common::CUDASubprogramAttrs::Device ||
+            *attrs == common::CUDASubprogramAttrs::Global ||
+            *attrs == common::CUDASubprogramAttrs::Grid_Global) {
+          inDeviceSubprogram = true;
+        }
+      }
+    }
+  }
   for (auto &pair : currScope()) {
     ConvertToObjectEntity(*pair.second);
   }
+
+  // Apply CUDA device attributes if in a device subprogram
+  if (inDeviceSubprogram && currScope().kind() == Scope::Kind::BlockConstruct) {
+    for (auto &pair : currScope()) {
+      SetImplicitCUDADevice(*pair.second);
+    }
+  }
+
   funcResultStack_.Pop();
   // If popping back into a global scope, pop back to the top scope.
   Scope *hermetic{context().currentHermeticModuleFileScope()};
@@ -9555,40 +9591,11 @@ void ResolveNamesVisitor::CreateGeneric(const parser::GenericSpec &x) {
   info.Resolve(&MakeSymbol(symbolName, Attrs{}, std::move(genericDetails)));
 }
 
-static void SetImplicitCUDADevice(bool inDeviceSubprogram, Symbol &symbol) {
-  if (inDeviceSubprogram && symbol.has<ObjectEntityDetails>()) {
-    auto *object{symbol.detailsIf<ObjectEntityDetails>()};
-    if (!object->cudaDataAttr() && !IsValue(symbol) &&
-        !IsFunctionResult(symbol)) {
-      // Implicitly set device attribute if none is set in device context.
-      object->set_cudaDataAttr(common::CUDADataAttr::Device);
-    }
-  }
-}
-
 void ResolveNamesVisitor::FinishSpecificationPart(
     const std::list<parser::DeclarationConstruct> &decls) {
   misparsedStmtFuncFound_ = false;
   funcResultStack().CompleteFunctionResultType();
   CheckImports();
-  bool inDeviceSubprogram{false};
-  Symbol *scopeSym{currScope().symbol()};
-  if (currScope().kind() == Scope::Kind::BlockConstruct) {
-    scopeSym = currScope().parent().symbol();
-  }
-  if (scopeSym) {
-    if (auto *details{scopeSym->detailsIf<SubprogramDetails>()}) {
-      // Check the current procedure is a device procedure to apply implicit
-      // attribute at the end.
-      if (auto attrs{details->cudaSubprogramAttrs()}) {
-        if (*attrs == common::CUDASubprogramAttrs::Device ||
-            *attrs == common::CUDASubprogramAttrs::Global ||
-            *attrs == common::CUDASubprogramAttrs::Grid_Global) {
-          inDeviceSubprogram = true;
-        }
-      }
-    }
-  }
   for (auto &pair : currScope()) {
     auto &symbol{*pair.second};
     if (inInterfaceBlock()) {
@@ -9623,11 +9630,6 @@ void ResolveNamesVisitor::FinishSpecificationPart(
         SetBindNameOn(symbol);
       }
     }
-    if (currScope().kind() == Scope::Kind::BlockConstruct) {
-      // Only look for specification in BlockConstruct. Other cases are done in
-      // ResolveSpecificationParts.
-      SetImplicitCUDADevice(inDeviceSubprogram, symbol);
-    }
   }
   currScope().InstantiateDerivedTypes();
   for (const auto &decl : decls) {
@@ -10187,7 +10189,9 @@ void ResolveNamesVisitor::ResolveSpecificationParts(ProgramTree &node) {
     }
     ApplyImplicitRules(symbol);
     // Apply CUDA implicit attributes if needed.
-    SetImplicitCUDADevice(inDeviceSubprogram, symbol);
+    if (inDeviceSubprogram) {
+      SetImplicitCUDADevice(symbol);
+    }
     // Main program local objects usually don't have an implied SAVE attribute,
     // as one might think, but in the exceptional case of a derived type
     // local object that contains a coarray, we have to mark it as an
diff --git a/flang/test/Semantics/cuf21.cuf b/flang/test/Semantics/cuf21.cuf
index 077657c8a52d..db32f1dbd0e7 100644
--- a/flang/test/Semantics/cuf21.cuf
+++ b/flang/test/Semantics/cuf21.cuf
@@ -13,18 +13,21 @@ contains
     implicit none
     logical, intent(in), value :: back
     real(4) :: mval
-
-    call maxlocUpdate(mval, back)
-
+  block
+    integer(8) :: xloc
+    call maxlocUpdate(mval, xloc, back)
+  end block
   end subroutine maxlocPartialMaskR_32F1D
 
-  attributes(device) subroutine maxlocUpdateR_32F(mval, back)
+  attributes(device) subroutine maxlocUpdateR_32F(mval, xloc, back)
     real(4) :: mval
+    integer(8) :: xloc
     logical :: back
   end subroutine maxlocUpdateR_32F
 
-  attributes(device) subroutine maxlocUpdateR_64F(mval, back)
+  attributes(device) subroutine maxlocUpdateR_64F(mval, xloc, back)
     real(8) :: mval
+    integer(8) :: xloc
     logical :: back
   end subroutine maxlocUpdateR_64F
 end module

From 22f9b4aa1dad597d908be77be1e10ba4c77330ce Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Thu, 12 Jun 2025 20:08:55 -0400
Subject: [PATCH 0222/1322] Reland [HIP] use offload wrapper for
 non-device-only non-rdc (#132869) (#143964)

Fixed two issues:

1. assertion with -flto. the linker wrapper action is missing for
wrapping the device binary. Added it for -flto.

2. when there are two HIP files, the kernels in the second file were not
found. This is because the -r option of linker wrapper assumes offload
entries section of HIP to be hip_offloading_entries but it is actually
llvm_offload_entries, causing the offload entries sections not made
unique for different object files. Fixed and tested working for both
-fgpu-rdc and -fno-gpu-rdc case with and without -r
---
 clang/lib/CodeGen/CGCUDANV.cpp                |  3 +-
 clang/lib/Driver/Driver.cpp                   | 62 +++++++++-----
 clang/lib/Driver/ToolChains/Clang.cpp         | 18 ++++-
 clang/test/Driver/hip-binding.hip             |  6 +-
 clang/test/Driver/hip-phases.hip              | 56 ++++++++-----
 clang/test/Driver/hip-toolchain-no-rdc.hip    | 81 ++++++++++++-------
 clang/test/Driver/linker-wrapper.c            |  1 +
 .../ClangLinkerWrapper.cpp                    | 29 ++++---
 8 files changed, 166 insertions(+), 90 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 38f514304df5..dd26be74e561 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1280,7 +1280,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
     return nullptr;
   }
   if (CGM.getLangOpts().OffloadViaLLVM ||
-      (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
+      (CGM.getLangOpts().OffloadingNewDriver &&
+       (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
     createOffloadingEntries();
   else
     return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index eb60d907d221..060f76fb653c 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4423,6 +4423,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
                    options::OPT_no_offload_new_driver,
                    C.isOffloadingHostKind(Action::OFK_Cuda));
 
+  bool HIPNoRDC =
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
   // Builder to be used to build offloading actions.
   std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
       !UseNewOffloadingDriver
@@ -4556,7 +4560,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     // Check if this Linker Job should emit a static library.
     if (ShouldEmitStaticLibrary(Args)) {
       LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
-    } else if (UseNewOffloadingDriver ||
+    } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
                Args.hasArg(options::OPT_offload_link)) {
       LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
       LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4867,10 +4871,31 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
                                        const InputTy &Input, StringRef CUID,
                                        Action *HostAction) const {
   // Don't build offloading actions if explicitly disabled or we do not have a
-  // valid source input and compile action to embed it in. If preprocessing only
-  // ignore embedding.
-  if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
-      !(isa<CompileJobAction>(HostAction) ||
+  // valid source input.
+  if (offloadHostOnly() || !types::isSrcFile(Input.first))
+    return HostAction;
+
+  bool HIPNoRDC =
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
+  // For HIP non-rdc non-device-only compilation, create a linker wrapper
+  // action for each host object to link, bundle and wrap device files in
+  // it.
+  if ((isa<AssembleJobAction>(HostAction) ||
+       (isa<BackendJobAction>(HostAction) &&
+        HostAction->getType() == types::TY_LTO_BC)) &&
+      HIPNoRDC && !offloadDeviceOnly()) {
+    ActionList AL{HostAction};
+    HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
+    HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
+                                         /*BoundArch=*/nullptr);
+    return HostAction;
+  }
+
+  // Don't build offloading actions if we do not have a compile action. If
+  // preprocessing only ignore embedding.
+  if (!(isa<CompileJobAction>(HostAction) ||
         getFinalPhase(Args) == phases::Preprocess))
     return HostAction;
 
@@ -4966,12 +4991,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
       }
     }
 
-    // Compiling HIP in non-RDC mode requires linking each action individually.
+    // Compiling HIP in device-only non-RDC mode requires linking each action
+    // individually.
     for (Action *&A : DeviceActions) {
       if ((A->getType() != types::TY_Object &&
            A->getType() != types::TY_LTO_BC) ||
-          Kind != Action::OFK_HIP ||
-          Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+          !HIPNoRDC || !offloadDeviceOnly())
         continue;
       ActionList LinkerInput = {A};
       A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -4995,12 +5020,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     }
   }
 
-  // HIP code in non-RDC mode will bundle the output if it invoked the linker.
+  // HIP code in device-only non-RDC mode will bundle the output if it invoked
+  // the linker.
   bool ShouldBundleHIP =
-      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      HIPNoRDC && offloadDeviceOnly() &&
       Args.hasFlag(options::OPT_gpu_bundle_output,
                    options::OPT_no_gpu_bundle_output, true) &&
-      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
       !llvm::any_of(OffloadActions,
                     [](Action *A) { return A->getType() != types::TY_Image; });
 
@@ -5020,11 +5045,9 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
              nullptr, Action::OFK_Cuda);
-  } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
-             !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                           false)) {
-    // If we are not in RDC-mode we just emit the final HIP fatbinary for each
-    // translation unit, linking each input individually.
+  } else if (HIPNoRDC && offloadDeviceOnly()) {
+    // If we are in device-only non-RDC-mode we just emit the final HIP
+    // fatbinary for each translation unit, linking each input individually.
     Action *FatbinAction =
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5177,8 +5200,11 @@ Action *Driver::ConstructPhaseAction(
         (((Input->getOffloadingToolChain() &&
            Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
           TargetDeviceOffloadKind == Action::OFK_HIP) &&
-         (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                       false) ||
+         ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                        false) ||
+           (Args.hasFlag(options::OPT_offload_new_driver,
+                         options::OPT_no_offload_new_driver, false) &&
+            !offloadDeviceOnly())) ||
           TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
       types::ID Output =
           Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 15acb88c1a8f..8556bcadf091 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7710,7 +7710,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fcuda-include-gpubinary");
     CmdArgs.push_back(CudaDeviceInput->getFilename());
   } else if (!HostOffloadingInputs.empty()) {
-    if ((IsCuda || IsHIP) && !IsRDCMode) {
+    if (IsCuda && !IsRDCMode) {
       assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
       CmdArgs.push_back("-fcuda-include-gpubinary");
       CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9257,8 +9257,20 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   // Add the linker arguments to be forwarded by the wrapper.
   CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
                                        LinkCommand->getExecutable()));
-  for (const char *LinkArg : LinkCommand->getArguments())
-    CmdArgs.push_back(LinkArg);
+
+  // We use action type to differentiate two use cases of the linker wrapper.
+  // TY_Image for normal linker wrapper work.
+  // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
+  // object.
+  assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
+  if (JA.getType() == types::TY_Object) {
+    CmdArgs.append({"-o", Output.getFilename()});
+    for (auto Input : Inputs)
+      CmdArgs.push_back(Input.getFilename());
+    CmdArgs.push_back("-r");
+  } else
+    for (const char *LinkArg : LinkCommand->getArguments())
+      CmdArgs.push_back(LinkArg);
 
   addOffloadCompressArgs(Args, CmdArgs);
 
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 57e57194ec87..d8b3f1e24201 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
 // RUN:        -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
 //      LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
 // LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 5fd2c0216ccc..d8a58b78d6d5 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,39 +8,57 @@
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -flto -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWLTO %s
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,RDC %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
 //
 // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
 // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
 // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
 
 // BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
 // BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
+// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
+// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
 
-// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
+// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
+// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
 
 //
 // Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 6c69d1d51a26..ddd251b67cc5 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
 
 // RUN: touch %t/a.o %t/b.o
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,22 +47,23 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// OLD-SAME: "-emit-obj"
+// NEW-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
 
 //
 // Compile device code in a.cu to code object for gfx900.
@@ -70,62 +71,71 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
 
 //
 // Bundle and embed device code in host object for a.cu.
 //
 
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
+// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
+// NEW:   "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
+
 //
 // Compile device code in b.hip to code object for gfx803.
 //
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" "[[OBJ_DEV_B_803]]"
 
 //
 // Compile device code in b.hip to code object for gfx900.
@@ -133,40 +143,49 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_900:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" "[[OBJ_DEV_B_900]]"
 
 //
 // Bundle and embed device code in host object for b.hip.
 //
 
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_B:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_B:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
+// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
+// NEW:   "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
+
 //
 // Link host objects.
 //
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index a7e98e7351d9..80b1a5745a12 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -223,6 +223,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx90a -input={{/dev/null|NUL}} -input={{.*}} -output={{.*}}
 // RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r
 // RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
+// RELOCATABLE-LINK-HIP: --rename-section llvm_offload_entries
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 \
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 95b6f74af1f1..b8019fac4c2e 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -310,22 +310,21 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) {
   // Remove the old .llvm.offloading section to prevent further linking.
   ObjcopyArgs.emplace_back("--remove-section");
   ObjcopyArgs.emplace_back(".llvm.offloading");
-  for (StringRef Prefix : {"omp", "cuda", "hip"}) {
-    auto Section = (Prefix + "_offloading_entries").str();
-    // Rename the offloading entires to make them private to this link unit.
-    ObjcopyArgs.emplace_back("--rename-section");
-    ObjcopyArgs.emplace_back(
-        Args.MakeArgString(Section + "=" + Section + Suffix));
+  StringRef Prefix = "llvm";
+  auto Section = (Prefix + "llvm_offload_entries").str();
+  // Rename the offloading entires to make them private to this link unit.
+  ObjcopyArgs.emplace_back("--rename-section");
+  ObjcopyArgs.emplace_back(
+      Args.MakeArgString(Section + "=" + Section + Suffix));
 
-    // Rename the __start_ / __stop_ symbols appropriately to iterate over the
-    // newly renamed section containing the offloading entries.
-    ObjcopyArgs.emplace_back("--redefine-sym");
-    ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
-                                                "__start_" + Section + Suffix));
-    ObjcopyArgs.emplace_back("--redefine-sym");
-    ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
-                                                "__stop_" + Section + Suffix));
-  }
+  // Rename the __start_ / __stop_ symbols appropriately to iterate over the
+  // newly renamed section containing the offloading entries.
+  ObjcopyArgs.emplace_back("--redefine-sym");
+  ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
+                                              "__start_" + Section + Suffix));
+  ObjcopyArgs.emplace_back("--redefine-sym");
+  ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
+                                              "__stop_" + Section + Suffix));
 
   if (Error Err = executeCommands(*ObjcopyPath, ObjcopyArgs))
     return Err;

From 029f8892a500594bd044507352503249fd641e6c Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Fri, 13 Jun 2025 02:00:14 +0100
Subject: [PATCH 0223/1322] [RISCV] Fold vmv.v.v into vleNff.v (#143981)

We currently already fold vmerge.vvm into vleNff.v via
RISCVDAGToDAGISel::performCombineVMergeAndVOps, so this teaches
RISCVVectorPeephole::foldVMV_V_V to do the same.
---
 llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp   |  7 ++++---
 llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll | 16 ++++++++++++++++
 .../test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 17 +++++++++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 6bb026378274..c9c2413d009b 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -611,7 +611,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
 
   MachineInstr *Src = MRI->getVRegDef(MI.getOperand(2).getReg());
   if (!Src || Src->hasUnmodeledSideEffects() ||
-      Src->getParent() != MI.getParent() || Src->getNumDefs() != 1 ||
+      Src->getParent() != MI.getParent() ||
       !RISCVII::isFirstDefTiedToFirstUse(Src->getDesc()) ||
       !RISCVII::hasVLOp(Src->getDesc().TSFlags) ||
       !RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags))
@@ -622,7 +622,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
     return false;
 
   // Src needs to have the same passthru as VMV_V_V
-  MachineOperand &SrcPassthru = Src->getOperand(1);
+  MachineOperand &SrcPassthru = Src->getOperand(Src->getNumExplicitDefs());
   if (SrcPassthru.getReg() != RISCV::NoRegister &&
       SrcPassthru.getReg() != Passthru.getReg())
     return false;
@@ -643,7 +643,8 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
     // If Src is masked then its passthru needs to be in VRNoV0.
     if (Passthru.getReg() != RISCV::NoRegister)
       MRI->constrainRegClass(Passthru.getReg(),
-                             TII->getRegClass(Src->getDesc(), 1, TRI,
+                             TII->getRegClass(Src->getDesc(),
+                                              SrcPassthru.getOperandNo(), TRI,
                                               *Src->getParent()->getParent()));
   }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
index 6345b90db23b..1e2e7795f654 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
@@ -206,3 +206,19 @@ define <vscale x 1 x i64> @undef_passthru(<vscale x 1 x i64> %passthru, <vscale
   %b = call <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %a, iXLen %avl)
   ret <vscale x 1 x i64> %b
 }
+
+; Check that we can fold into vle64ff.v even if we need to move it past the
+; passthru and it's safe.
+define <vscale x 1 x i64> @vleff_move_past_passthru(ptr %p, ptr %q, iXLen %avl) {
+; CHECK-LABEL: vleff_move_past_passthru:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl1re64.v v8, (a1)
+; CHECK-NEXT:    vsetvli zero, a2, e64, m1, tu, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    ret
+  %a = call { <vscale x 1 x i64>, iXLen } @llvm.riscv.vleff(<vscale x 1 x i64> poison, ptr %p, iXLen %avl)
+  %vec = extractvalue { <vscale x 1 x i64>, iXLen } %a, 0
+  %passthru = load <vscale x 1 x i64>, ptr %q
+  %b = call <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(<vscale x 1 x i64> %passthru, <vscale x 1 x i64> %vec, iXLen %avl)
+  ret <vscale x 1 x i64> %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
index f545ecc5e53d..6e106e50634f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
@@ -135,3 +135,20 @@ body: |
     %3:vrnov0 = PseudoVMV_V_V_MF2 $noreg, %2, 0, 5 /* e32 */, 0 /* tu, mu */
     %7:vmv0 = COPY $v8
     %6:vrnov0 = PseudoVLSE32_V_MF2_MASK %3, $noreg, $noreg, %7, 0, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 4)
+...
+---
+name: move_vleff
+body: |
+  bb.0:
+    liveins: $v8
+    ; CHECK-LABEL: name: move_vleff
+    ; CHECK: liveins: $v8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %passthru:vr = COPY $v8
+    ; CHECK-NEXT: %x:vr, %vl:gpr = PseudoVLE32FF_V_M1 %passthru, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 1)
+    ; CHECK-NEXT: %y:gpr = ADDI $x0, 1
+    %x:vr, %vl:gpr = PseudoVLE32FF_V_M1 $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size)
+    %passthru:vr = COPY $v8
+    %y:gpr = ADDI $x0, 1
+    %z:vr = PseudoVMV_V_V_M1 %passthru, %x, 4, 5 /* e32 */, 0 /* tu, mu */
+...

From 8890706db67384a423773cc921302dd63d950ef5 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Thu, 12 Jun 2025 21:33:05 -0400
Subject: [PATCH 0224/1322] Revert "Reland [HIP] use offload wrapper for
 non-device-only non-rdc (#132869) (#143964)"

This reverts commit 22f9b4aa1dad597d908be77be1e10ba4c77330ce.
---
 clang/lib/CodeGen/CGCUDANV.cpp                |  3 +-
 clang/lib/Driver/Driver.cpp                   | 62 +++++---------
 clang/lib/Driver/ToolChains/Clang.cpp         | 18 +----
 clang/test/Driver/hip-binding.hip             |  6 +-
 clang/test/Driver/hip-phases.hip              | 56 +++++--------
 clang/test/Driver/hip-toolchain-no-rdc.hip    | 81 +++++++------------
 clang/test/Driver/linker-wrapper.c            |  1 -
 .../ClangLinkerWrapper.cpp                    | 29 +++----
 8 files changed, 90 insertions(+), 166 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index dd26be74e561..38f514304df5 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
     return nullptr;
   }
   if (CGM.getLangOpts().OffloadViaLLVM ||
-      (CGM.getLangOpts().OffloadingNewDriver &&
-       (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
+      (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
     createOffloadingEntries();
   else
     return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 060f76fb653c..eb60d907d221 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4423,10 +4423,6 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
                    options::OPT_no_offload_new_driver,
                    C.isOffloadingHostKind(Action::OFK_Cuda));
 
-  bool HIPNoRDC =
-      C.isOffloadingHostKind(Action::OFK_HIP) &&
-      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
-
   // Builder to be used to build offloading actions.
   std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
       !UseNewOffloadingDriver
@@ -4560,7 +4556,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     // Check if this Linker Job should emit a static library.
     if (ShouldEmitStaticLibrary(Args)) {
       LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
-    } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
+    } else if (UseNewOffloadingDriver ||
                Args.hasArg(options::OPT_offload_link)) {
       LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
       LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4871,31 +4867,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
                                        const InputTy &Input, StringRef CUID,
                                        Action *HostAction) const {
   // Don't build offloading actions if explicitly disabled or we do not have a
-  // valid source input.
-  if (offloadHostOnly() || !types::isSrcFile(Input.first))
-    return HostAction;
-
-  bool HIPNoRDC =
-      C.isOffloadingHostKind(Action::OFK_HIP) &&
-      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
-
-  // For HIP non-rdc non-device-only compilation, create a linker wrapper
-  // action for each host object to link, bundle and wrap device files in
-  // it.
-  if ((isa<AssembleJobAction>(HostAction) ||
-       (isa<BackendJobAction>(HostAction) &&
-        HostAction->getType() == types::TY_LTO_BC)) &&
-      HIPNoRDC && !offloadDeviceOnly()) {
-    ActionList AL{HostAction};
-    HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
-    HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
-                                         /*BoundArch=*/nullptr);
-    return HostAction;
-  }
-
-  // Don't build offloading actions if we do not have a compile action. If
-  // preprocessing only ignore embedding.
-  if (!(isa<CompileJobAction>(HostAction) ||
+  // valid source input and compile action to embed it in. If preprocessing only
+  // ignore embedding.
+  if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
+      !(isa<CompileJobAction>(HostAction) ||
         getFinalPhase(Args) == phases::Preprocess))
     return HostAction;
 
@@ -4991,12 +4966,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
       }
     }
 
-    // Compiling HIP in device-only non-RDC mode requires linking each action
-    // individually.
+    // Compiling HIP in non-RDC mode requires linking each action individually.
     for (Action *&A : DeviceActions) {
       if ((A->getType() != types::TY_Object &&
            A->getType() != types::TY_LTO_BC) ||
-          !HIPNoRDC || !offloadDeviceOnly())
+          Kind != Action::OFK_HIP ||
+          Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
         continue;
       ActionList LinkerInput = {A};
       A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -5020,12 +4995,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     }
   }
 
-  // HIP code in device-only non-RDC mode will bundle the output if it invoked
-  // the linker.
+  // HIP code in non-RDC mode will bundle the output if it invoked the linker.
   bool ShouldBundleHIP =
-      HIPNoRDC && offloadDeviceOnly() &&
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
       Args.hasFlag(options::OPT_gpu_bundle_output,
                    options::OPT_no_gpu_bundle_output, true) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
       !llvm::any_of(OffloadActions,
                     [](Action *A) { return A->getType() != types::TY_Image; });
 
@@ -5045,9 +5020,11 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
              nullptr, Action::OFK_Cuda);
-  } else if (HIPNoRDC && offloadDeviceOnly()) {
-    // If we are in device-only non-RDC-mode we just emit the final HIP
-    // fatbinary for each translation unit, linking each input individually.
+  } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
+             !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                           false)) {
+    // If we are not in RDC-mode we just emit the final HIP fatbinary for each
+    // translation unit, linking each input individually.
     Action *FatbinAction =
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5200,11 +5177,8 @@ Action *Driver::ConstructPhaseAction(
         (((Input->getOffloadingToolChain() &&
            Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
           TargetDeviceOffloadKind == Action::OFK_HIP) &&
-         ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                        false) ||
-           (Args.hasFlag(options::OPT_offload_new_driver,
-                         options::OPT_no_offload_new_driver, false) &&
-            !offloadDeviceOnly())) ||
+         (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                       false) ||
           TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
       types::ID Output =
           Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 8556bcadf091..15acb88c1a8f 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7710,7 +7710,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fcuda-include-gpubinary");
     CmdArgs.push_back(CudaDeviceInput->getFilename());
   } else if (!HostOffloadingInputs.empty()) {
-    if (IsCuda && !IsRDCMode) {
+    if ((IsCuda || IsHIP) && !IsRDCMode) {
       assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
       CmdArgs.push_back("-fcuda-include-gpubinary");
       CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9257,20 +9257,8 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   // Add the linker arguments to be forwarded by the wrapper.
   CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
                                        LinkCommand->getExecutable()));
-
-  // We use action type to differentiate two use cases of the linker wrapper.
-  // TY_Image for normal linker wrapper work.
-  // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
-  // object.
-  assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
-  if (JA.getType() == types::TY_Object) {
-    CmdArgs.append({"-o", Output.getFilename()});
-    for (auto Input : Inputs)
-      CmdArgs.push_back(Input.getFilename());
-    CmdArgs.push_back("-r");
-  } else
-    for (const char *LinkArg : LinkCommand->getArguments())
-      CmdArgs.push_back(LinkArg);
+  for (const char *LinkArg : LinkCommand->getArguments())
+    CmdArgs.push_back(LinkArg);
 
   addOffloadCompressArgs(Args, CmdArgs);
 
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index d8b3f1e24201..57e57194ec87 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
 // RUN:        -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
 //      LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
 // LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index d8a58b78d6d5..5fd2c0216ccc 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,57 +8,39 @@
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
+// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
-// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
-// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -flto -c %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWLTO %s
+// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
-// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
-// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
+// RUN: | FileCheck -check-prefixes=BIN,RDC %s
 //
 // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
 // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
 // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
 
 // BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
 // BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
-// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
-// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
 
-// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
-// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
-// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
-// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
-// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
-// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
+// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
 
 //
 // Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index ddd251b67cc5..6c69d1d51a26 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
 
 // RUN: touch %t/a.o %t/b.o
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,23 +47,22 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// OLD-SAME: "-emit-obj"
-// NEW-SAME: "-emit-llvm-bc"
+// CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
+// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
 
 //
 // Compile device code in a.cu to code object for gfx900.
@@ -71,71 +70,62 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
+// CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
 
 //
 // Bundle and embed device code in host object for a.cu.
 //
 
-// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// OLD-SAME: "-bundle-align=4096"
-// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
-
-// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
-// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
-// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-bundle-align=4096"
+// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
-// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW:   "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
-
 //
 // Compile device code in b.hip to code object for gfx803.
 //
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
+// CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" "[[OBJ_DEV_B_803]]"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
 
 //
 // Compile device code in b.hip to code object for gfx900.
@@ -143,49 +133,40 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
+// CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_900:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_900:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" "[[OBJ_DEV_B_900]]"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
 
 //
 // Bundle and embed device code in host object for b.hip.
 //
 
-// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// OLD-SAME: "-bundle-align=4096"
-// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_B:.*hipfb]]"
-
-// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_B:.*.out]]"
-// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
-// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-bundle-align=4096"
+// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
-// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
-// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW:   "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
-
 //
 // Link host objects.
 //
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 80b1a5745a12..a7e98e7351d9 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -223,7 +223,6 @@ __attribute__((visibility("protected"), used)) int x;
 // RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx90a -input={{/dev/null|NUL}} -input={{.*}} -output={{.*}}
 // RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r
 // RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
-// RELOCATABLE-LINK-HIP: --rename-section llvm_offload_entries
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 \
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index b8019fac4c2e..95b6f74af1f1 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -310,21 +310,22 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) {
   // Remove the old .llvm.offloading section to prevent further linking.
   ObjcopyArgs.emplace_back("--remove-section");
   ObjcopyArgs.emplace_back(".llvm.offloading");
-  StringRef Prefix = "llvm";
-  auto Section = (Prefix + "llvm_offload_entries").str();
-  // Rename the offloading entires to make them private to this link unit.
-  ObjcopyArgs.emplace_back("--rename-section");
-  ObjcopyArgs.emplace_back(
-      Args.MakeArgString(Section + "=" + Section + Suffix));
+  for (StringRef Prefix : {"omp", "cuda", "hip"}) {
+    auto Section = (Prefix + "_offloading_entries").str();
+    // Rename the offloading entires to make them private to this link unit.
+    ObjcopyArgs.emplace_back("--rename-section");
+    ObjcopyArgs.emplace_back(
+        Args.MakeArgString(Section + "=" + Section + Suffix));
 
-  // Rename the __start_ / __stop_ symbols appropriately to iterate over the
-  // newly renamed section containing the offloading entries.
-  ObjcopyArgs.emplace_back("--redefine-sym");
-  ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
-                                              "__start_" + Section + Suffix));
-  ObjcopyArgs.emplace_back("--redefine-sym");
-  ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
-                                              "__stop_" + Section + Suffix));
+    // Rename the __start_ / __stop_ symbols appropriately to iterate over the
+    // newly renamed section containing the offloading entries.
+    ObjcopyArgs.emplace_back("--redefine-sym");
+    ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
+                                                "__start_" + Section + Suffix));
+    ObjcopyArgs.emplace_back("--redefine-sym");
+    ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
+                                                "__stop_" + Section + Suffix));
+  }
 
   if (Error Err = executeCommands(*ObjcopyPath, ObjcopyArgs))
     return Err;

From 7232c07eb97d5c21d47a661c9cca8981c7f91698 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Thu, 12 Jun 2025 21:35:22 -0400
Subject: [PATCH 0225/1322] Reland [HIP] use offload wrapper for
 non-device-only non-rdc (#143964)

Fixed a typo:

-  auto Section = (Prefix + "llvm_offload_entries").str();
+  auto Section = (Prefix + "_offload_entries").str();

which broke buildbot e.g.

https://lab.llvm.org/buildbot/#/builders/208/builds/1948
---
 clang/lib/CodeGen/CGCUDANV.cpp                |  3 +-
 clang/lib/Driver/Driver.cpp                   | 62 +++++++++-----
 clang/lib/Driver/ToolChains/Clang.cpp         | 18 ++++-
 clang/test/Driver/hip-binding.hip             |  6 +-
 clang/test/Driver/hip-phases.hip              | 56 ++++++++-----
 clang/test/Driver/hip-toolchain-no-rdc.hip    | 81 ++++++++++++-------
 clang/test/Driver/linker-wrapper.c            |  1 +
 .../ClangLinkerWrapper.cpp                    | 29 ++++---
 8 files changed, 166 insertions(+), 90 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 38f514304df5..dd26be74e561 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1280,7 +1280,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
     return nullptr;
   }
   if (CGM.getLangOpts().OffloadViaLLVM ||
-      (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
+      (CGM.getLangOpts().OffloadingNewDriver &&
+       (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
     createOffloadingEntries();
   else
     return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index eb60d907d221..060f76fb653c 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4423,6 +4423,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
                    options::OPT_no_offload_new_driver,
                    C.isOffloadingHostKind(Action::OFK_Cuda));
 
+  bool HIPNoRDC =
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
   // Builder to be used to build offloading actions.
   std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
       !UseNewOffloadingDriver
@@ -4556,7 +4560,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     // Check if this Linker Job should emit a static library.
     if (ShouldEmitStaticLibrary(Args)) {
       LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
-    } else if (UseNewOffloadingDriver ||
+    } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
                Args.hasArg(options::OPT_offload_link)) {
       LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
       LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4867,10 +4871,31 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
                                        const InputTy &Input, StringRef CUID,
                                        Action *HostAction) const {
   // Don't build offloading actions if explicitly disabled or we do not have a
-  // valid source input and compile action to embed it in. If preprocessing only
-  // ignore embedding.
-  if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
-      !(isa<CompileJobAction>(HostAction) ||
+  // valid source input.
+  if (offloadHostOnly() || !types::isSrcFile(Input.first))
+    return HostAction;
+
+  bool HIPNoRDC =
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
+  // For HIP non-rdc non-device-only compilation, create a linker wrapper
+  // action for each host object to link, bundle and wrap device files in
+  // it.
+  if ((isa<AssembleJobAction>(HostAction) ||
+       (isa<BackendJobAction>(HostAction) &&
+        HostAction->getType() == types::TY_LTO_BC)) &&
+      HIPNoRDC && !offloadDeviceOnly()) {
+    ActionList AL{HostAction};
+    HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
+    HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
+                                         /*BoundArch=*/nullptr);
+    return HostAction;
+  }
+
+  // Don't build offloading actions if we do not have a compile action. If
+  // preprocessing only ignore embedding.
+  if (!(isa<CompileJobAction>(HostAction) ||
         getFinalPhase(Args) == phases::Preprocess))
     return HostAction;
 
@@ -4966,12 +4991,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
       }
     }
 
-    // Compiling HIP in non-RDC mode requires linking each action individually.
+    // Compiling HIP in device-only non-RDC mode requires linking each action
+    // individually.
     for (Action *&A : DeviceActions) {
       if ((A->getType() != types::TY_Object &&
            A->getType() != types::TY_LTO_BC) ||
-          Kind != Action::OFK_HIP ||
-          Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+          !HIPNoRDC || !offloadDeviceOnly())
         continue;
       ActionList LinkerInput = {A};
       A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -4995,12 +5020,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     }
   }
 
-  // HIP code in non-RDC mode will bundle the output if it invoked the linker.
+  // HIP code in device-only non-RDC mode will bundle the output if it invoked
+  // the linker.
   bool ShouldBundleHIP =
-      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      HIPNoRDC && offloadDeviceOnly() &&
       Args.hasFlag(options::OPT_gpu_bundle_output,
                    options::OPT_no_gpu_bundle_output, true) &&
-      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
       !llvm::any_of(OffloadActions,
                     [](Action *A) { return A->getType() != types::TY_Image; });
 
@@ -5020,11 +5045,9 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
              nullptr, Action::OFK_Cuda);
-  } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
-             !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                           false)) {
-    // If we are not in RDC-mode we just emit the final HIP fatbinary for each
-    // translation unit, linking each input individually.
+  } else if (HIPNoRDC && offloadDeviceOnly()) {
+    // If we are in device-only non-RDC-mode we just emit the final HIP
+    // fatbinary for each translation unit, linking each input individually.
     Action *FatbinAction =
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5177,8 +5200,11 @@ Action *Driver::ConstructPhaseAction(
         (((Input->getOffloadingToolChain() &&
            Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
           TargetDeviceOffloadKind == Action::OFK_HIP) &&
-         (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                       false) ||
+         ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                        false) ||
+           (Args.hasFlag(options::OPT_offload_new_driver,
+                         options::OPT_no_offload_new_driver, false) &&
+            !offloadDeviceOnly())) ||
           TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
       types::ID Output =
           Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 15acb88c1a8f..8556bcadf091 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7710,7 +7710,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fcuda-include-gpubinary");
     CmdArgs.push_back(CudaDeviceInput->getFilename());
   } else if (!HostOffloadingInputs.empty()) {
-    if ((IsCuda || IsHIP) && !IsRDCMode) {
+    if (IsCuda && !IsRDCMode) {
       assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
       CmdArgs.push_back("-fcuda-include-gpubinary");
       CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9257,8 +9257,20 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   // Add the linker arguments to be forwarded by the wrapper.
   CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
                                        LinkCommand->getExecutable()));
-  for (const char *LinkArg : LinkCommand->getArguments())
-    CmdArgs.push_back(LinkArg);
+
+  // We use action type to differentiate two use cases of the linker wrapper.
+  // TY_Image for normal linker wrapper work.
+  // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
+  // object.
+  assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
+  if (JA.getType() == types::TY_Object) {
+    CmdArgs.append({"-o", Output.getFilename()});
+    for (auto Input : Inputs)
+      CmdArgs.push_back(Input.getFilename());
+    CmdArgs.push_back("-r");
+  } else
+    for (const char *LinkArg : LinkCommand->getArguments())
+      CmdArgs.push_back(LinkArg);
 
   addOffloadCompressArgs(Args, CmdArgs);
 
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 57e57194ec87..d8b3f1e24201 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
 // RUN:        -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
 //      LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
 // LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 5fd2c0216ccc..d8a58b78d6d5 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,39 +8,57 @@
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -flto -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWLTO %s
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,RDC %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
 //
 // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
 // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
 // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
 
 // BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
 // BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
+// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
+// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
 
-// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
+// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
+// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
 
 //
 // Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 6c69d1d51a26..ddd251b67cc5 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
 
 // RUN: touch %t/a.o %t/b.o
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,22 +47,23 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// OLD-SAME: "-emit-obj"
+// NEW-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
 
 //
 // Compile device code in a.cu to code object for gfx900.
@@ -70,62 +71,71 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
 
 //
 // Bundle and embed device code in host object for a.cu.
 //
 
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
+// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
+// NEW:   "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
+
 //
 // Compile device code in b.hip to code object for gfx803.
 //
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" "[[OBJ_DEV_B_803]]"
 
 //
 // Compile device code in b.hip to code object for gfx900.
@@ -133,40 +143,49 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_900:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" "[[OBJ_DEV_B_900]]"
 
 //
 // Bundle and embed device code in host object for b.hip.
 //
 
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_B:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_B:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
+// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
+// NEW:   "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
+
 //
 // Link host objects.
 //
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index a7e98e7351d9..80b1a5745a12 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -223,6 +223,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx90a -input={{/dev/null|NUL}} -input={{.*}} -output={{.*}}
 // RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r
 // RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
+// RELOCATABLE-LINK-HIP: --rename-section llvm_offload_entries
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 \
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 95b6f74af1f1..7a1007d03737 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -310,22 +310,21 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) {
   // Remove the old .llvm.offloading section to prevent further linking.
   ObjcopyArgs.emplace_back("--remove-section");
   ObjcopyArgs.emplace_back(".llvm.offloading");
-  for (StringRef Prefix : {"omp", "cuda", "hip"}) {
-    auto Section = (Prefix + "_offloading_entries").str();
-    // Rename the offloading entires to make them private to this link unit.
-    ObjcopyArgs.emplace_back("--rename-section");
-    ObjcopyArgs.emplace_back(
-        Args.MakeArgString(Section + "=" + Section + Suffix));
+  StringRef Prefix = "llvm";
+  auto Section = (Prefix + "_offload_entries").str();
+  // Rename the offloading entires to make them private to this link unit.
+  ObjcopyArgs.emplace_back("--rename-section");
+  ObjcopyArgs.emplace_back(
+      Args.MakeArgString(Section + "=" + Section + Suffix));
 
-    // Rename the __start_ / __stop_ symbols appropriately to iterate over the
-    // newly renamed section containing the offloading entries.
-    ObjcopyArgs.emplace_back("--redefine-sym");
-    ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
-                                                "__start_" + Section + Suffix));
-    ObjcopyArgs.emplace_back("--redefine-sym");
-    ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
-                                                "__stop_" + Section + Suffix));
-  }
+  // Rename the __start_ / __stop_ symbols appropriately to iterate over the
+  // newly renamed section containing the offloading entries.
+  ObjcopyArgs.emplace_back("--redefine-sym");
+  ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
+                                              "__start_" + Section + Suffix));
+  ObjcopyArgs.emplace_back("--redefine-sym");
+  ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
+                                              "__stop_" + Section + Suffix));
 
   if (Error Err = executeCommands(*ObjcopyPath, ObjcopyArgs))
     return Err;

From 07dad4ecba43bcd92453a0cd4c351025126db683 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 12 Jun 2025 19:50:41 -0700
Subject: [PATCH 0226/1322] [ELF] Implement -z dynamic-undefined-weak

The behavior of an undefined weak reference is implementation defined.
For static -no-pie linking, dynamic relocations are generally avoided (except
IRELATIVE). -shared linking generally emits dynamic relocations.

Dynamic -no-pie linking and -pie allow flexibility. Changes adjust the
behavior for better consistency and simpler internal representation,
e.g. https://reviews.llvm.org/D63003 https://reviews.llvm.org/D105164
(generalized to undefined non-weak in
2fcaa00d1e2317a90c9071b735eb0e758b5dd58b).

GNU ld introduced -z [no]dynamic-undefined-weak option to fine-tune the
behavior. (The option is not very effective with -no-pie, e.g. on
x86-64, `ld.bfd a.o s.so -z dynamic-undefined-weak` generates
R_X86_64_NONE relocations instead of GLOB_DAT/JUMP_SLOT)

This patch implements -z [no]dynamic-undefined-weak option.
The effects are summarized as follows:

* Static -no-pie: no-op
* Dynamic -no-pie: nodynamic-undefined-weak suppresses GLOB_DAT/JUMP_SLOT
* Static -pie: dynamic-undefined-weak generates ABS/GLOB_DAT/JUMP_SLOT.
  https://discourse.llvm.org/t/lld-weak-undefined-symbols-in-vdso-only/86749
* Dynamic -pie: nodynamic-undefined-weak suppresses ABS/GLOB_DAT/JUMP_SLOT

The -pie behavior likely stays stable while -no-pie (`!ctx.arg.isPic` in
`isStaticLinkTimeConstant`) behavior will likely change in the future.
The current default value of ctx.arg.zDynamicUndefined is selected to
prevent behavior changes.

Pull Request: https://github.com/llvm/llvm-project/pull/143831
---
 lld/ELF/Config.h                  |  1 +
 lld/ELF/Driver.cpp                |  8 ++++++++
 lld/ELF/Symbols.cpp               | 14 ++++++++------
 lld/ELF/Writer.cpp                |  6 ++----
 lld/docs/ReleaseNotes.rst         |  4 ++++
 lld/docs/ld.lld.1                 |  8 ++++++++
 lld/test/ELF/driver.test          |  3 ++-
 lld/test/ELF/weak-undef-got-plt.s |  6 ++++++
 lld/test/ELF/weak-undef-hidden.s  |  4 ++++
 lld/test/ELF/weak-undef-rw.s      | 19 ++++++++++++++++---
 10 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 0a52dfe6901b..3a9001d2cc8b 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -368,6 +368,7 @@ struct Config {
   bool writeAddends;
   bool zCombreloc;
   bool zCopyreloc;
+  bool zDynamicUndefined;
   bool zForceBti;
   bool zForceIbt;
   bool zGlobal;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 12dac82c614a..87b19cf543d9 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -591,6 +591,7 @@ static void checkZOptions(Ctx &ctx, opt::InputArgList &args) {
   args::getZOptionValue(args, OPT_z, "max-page-size", 0);
   args::getZOptionValue(args, OPT_z, "common-page-size", 0);
   getZFlag(args, "rel", "rela", false);
+  getZFlag(args, "dynamic-undefined-weak", "nodynamic-undefined-weak", false);
   for (auto *arg : args.filtered(OPT_z))
     if (!arg->isClaimed())
       Warn(ctx) << "unknown -z value: " << StringRef(arg->getValue());
@@ -3058,6 +3059,13 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
   ctx.hasDynsym = !ctx.sharedFiles.empty() || ctx.arg.isPic;
   ctx.arg.exportDynamic &= ctx.hasDynsym;
 
+  // Preemptibility of undefined symbols when ctx.hasDynsym is true. Default is
+  // true for dynamic linking.
+  ctx.arg.zDynamicUndefined =
+      getZFlag(args, "dynamic-undefined-weak", "nodynamic-undefined-weak",
+               ctx.sharedFiles.size() || ctx.arg.shared) &&
+      ctx.hasDynsym;
+
   // If an entry symbol is in a static archive, pull out that file now.
   if (Symbol *sym = ctx.symtab->find(ctx.arg.entry))
     handleUndefined(ctx, sym, "--entry");
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index c461dfed0d74..de839795c50d 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -333,10 +333,13 @@ bool elf::computeIsPreemptible(Ctx &ctx, const Symbol &sym) {
   if (sym.visibility() != STV_DEFAULT)
     return false;
 
-  // At this point copy relocations have not been created yet, so any
-  // symbol that is not defined locally is preemptible.
+  // At this point copy relocations have not been created yet.
+  // Shared symbols are preemptible. Undefined symbols are preemptible
+  // when zDynamicUndefined (default in dynamic linking). Weakness is not
+  // checked, though undefined non-weak would typically trigger relocation
+  // errors unless options like -z undefs are used.
   if (!sym.isDefined())
-    return true;
+    return !sym.isUndefined() || ctx.arg.zDynamicUndefined;
 
   if (!ctx.arg.shared)
     return false;
@@ -360,7 +363,6 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) {
   // can contain versions in the form of <name>@<version>.
   // Let them parse and update their names to exclude version suffix.
   // In addition, compute isExported and isPreemptible.
-  bool maybePreemptible = ctx.sharedFiles.size() || ctx.arg.shared;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (sym->hasVersionSuffix)
       sym->parseSymbolVersion(ctx);
@@ -369,11 +371,11 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) {
       continue;
     }
     if (!sym->isDefined() && !sym->isCommon()) {
-      sym->isPreemptible = maybePreemptible && computeIsPreemptible(ctx, *sym);
+      sym->isPreemptible = computeIsPreemptible(ctx, *sym);
     } else if (ctx.arg.exportDynamic &&
                (sym->isUsedInRegularObj || !sym->ltoCanOmit)) {
       sym->isExported = true;
-      sym->isPreemptible = maybePreemptible && computeIsPreemptible(ctx, *sym);
+      sym->isPreemptible = computeIsPreemptible(ctx, *sym);
     }
   }
 }
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 3d9888f576f0..15909daf51ab 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -285,7 +285,6 @@ static void demoteDefined(Defined &sym, DenseMap<SectionBase *, size_t> &map) {
 static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
   llvm::TimeTraceScope timeScope("Demote symbols");
   DenseMap<InputFile *, DenseMap<SectionBase *, size_t>> sectionIndexMap;
-  bool maybePreemptible = ctx.sharedFiles.size() || ctx.arg.shared;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (auto *d = dyn_cast<Defined>(sym)) {
       if (d->section && !d->section->isLive())
@@ -301,9 +300,8 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
       }
     }
 
-    if (maybePreemptible)
-      sym->isPreemptible = (sym->isUndefined() || sym->isExported) &&
-                           computeIsPreemptible(ctx, *sym);
+    sym->isPreemptible = (sym->isUndefined() || sym->isExported) &&
+                         computeIsPreemptible(ctx, *sym);
   }
 }
 
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 5c180fd8fbee..064ed0828c31 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -25,6 +25,10 @@ Non-comprehensive list of changes in this release
 
 ELF Improvements
 ----------------
+* Added ``-z dynamic-undefined-weak`` to make undefined weak symbols dynamic
+  when the dynamic symbol table is present.
+  (`#143831 <https://github.com/llvm/llvm-project/pull/143831>`_)
+
 * For AArch64, added support for ``-zgcs-report-dynamic``, enabling checks for
   GNU GCS Attribute Flags in Dynamic Objects when GCS is enabled. Inherits value
   from ``-zgcs-report`` (capped at ``warning`` level) unless user-defined,
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index 57aa2be5907b..cfacdb081a80 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -793,6 +793,14 @@ Specify how to report the missing GNU_PROPERTY_X86_FEATURE_1_IBT or GNU_PROPERTY
 .Cm none
 is the default, linker will not report the missing property otherwise will be reported as a warning or an error.
 .Pp
+.It Cm dynamic-undefined-weak
+Make undefined weak symbols dynamic when the dynamic symbol table is present, if they are referenced from
+relocatable object files and not forced local by symbol visibility or versioning. Do not make them dynamic when
+.Cm nodynamic-undefined-weak
+is specified.
+.Cm dynamic-undefined-weak
+is the default when building a shared object, or when an input shared object is present.
+.Pp
 .It Cm pauth-report Ns = Ns Ar [none|warning|error]
 Specify how to report the missing GNU_PROPERTY_AARCH64_FEATURE_PAUTH property.
 .Cm none
diff --git a/lld/test/ELF/driver.test b/lld/test/ELF/driver.test
index 45d73607c8ac..6d5761212cc3 100644
--- a/lld/test/ELF/driver.test
+++ b/lld/test/ELF/driver.test
@@ -47,7 +47,8 @@
 # ERR9: error: cannot open output file utput=/no/such/file
 
 # RUN: ld.lld %t -z foo -o /dev/null 2>&1 | FileCheck -check-prefix=ERR10 %s --implicit-check-not=warning:
-# RUN: ld.lld %t -z foo -z rel -z rela -z max-page-size=1 -z common-page-size=1 -o /dev/null --version 2>&1 | \
+# RUN: ld.lld %t -z foo -z rel -z rela -z max-page-size=1 -z common-page-size=1 -z dynamic-undefined-weak \
+# RUN:   -z nodynamic-undefined-weak -o /dev/null --version 2>&1 | \
 # RUN:   FileCheck -check-prefix=ERR10 %s --implicit-check-not=warning:
 # ERR10: warning: unknown -z value: foo
 
diff --git a/lld/test/ELF/weak-undef-got-plt.s b/lld/test/ELF/weak-undef-got-plt.s
index 0ee3da2cd3b4..48a7914e5b98 100644
--- a/lld/test/ELF/weak-undef-got-plt.s
+++ b/lld/test/ELF/weak-undef-got-plt.s
@@ -6,11 +6,17 @@
 
 # RUN: ld.lld a.o -o a
 # RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC
+# RUN: ld.lld a.o -o a -z dynamic-undefined-weak
+# RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC
 # RUN: ld.lld a.o s.so -o as
 # RUN: llvm-objdump -dR as | FileCheck %s
+# RUN: ld.lld a.o s.so -o as -z nodynamic-undefined-weak
+# RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC
 
 # RUN: ld.lld -pie a.o s.so -o as.pie
 # RUN: llvm-objdump -dR as.pie | FileCheck %s
+# RUN: ld.lld -pie a.o s.so -o as.pie -z nodynamic-undefined-weak
+# RUN: llvm-readelf -r as.pie | FileCheck --check-prefix=NORELOC %s
 
 # RUN: ld.lld -shared a.o -o a.so
 # RUN: llvm-objdump -dR a.so | FileCheck %s
diff --git a/lld/test/ELF/weak-undef-hidden.s b/lld/test/ELF/weak-undef-hidden.s
index 2baad5738c36..ad2ba29ec27a 100644
--- a/lld/test/ELF/weak-undef-hidden.s
+++ b/lld/test/ELF/weak-undef-hidden.s
@@ -5,6 +5,10 @@
 // RUN: ld.lld %t.o -o %t -pie
 // RUN: llvm-readobj -r -S --section-data %t | FileCheck %s
 
+/// -z dynamic-undefined-weak does not affect hidden undefined symbols.
+// RUN: ld.lld %t.o -o %t.so -shared -z dynamic-undefined-weak
+// RUN: llvm-readobj -r -S --section-data %t.so | FileCheck %s
+
 /// This is usually guarded with a comparison. Don't report an error.
 call g
 
diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s
index 497228a3cf90..8d777669b7e1 100644
--- a/lld/test/ELF/weak-undef-rw.s
+++ b/lld/test/ELF/weak-undef-rw.s
@@ -18,9 +18,22 @@
 ## gABI leaves the behavior of weak undefined references implementation defined.
 ## We choose to resolve them statically for static linking and produce dynamic relocations
 ## for dynamic linking (-shared or at least one input DSO).
-##
-## Note: Some ports of GNU ld support -z nodynamic-undefined-weak that we don't
-## implement.
+
+## -z dynamic-undefined-weak is ignored if .dynsym is absent (-no-pie without DSO)
+# RUN: ld.lld a.o -o a.d -z dynamic-undefined-weak 2>&1 | count 0
+# RUN: llvm-readelf -r --hex-dump=.data a.d | FileCheck %s --check-prefix=STATIC
+
+## Currently no effect for S+A relocations.
+# RUN: ld.lld a.o s.so -o as.d -z dynamic-undefined-weak
+# RUN: llvm-readelf -r --hex-dump=.data as.d | FileCheck %s --check-prefix=STATIC
+
+## -z dynamic-undefined-weak forces dynamic relocations if .dynsym is present.
+# RUN: ld.lld a.o -o a.pie.d -pie -z dynamic-undefined-weak
+# RUN: llvm-readelf -r a.pie.d | FileCheck %s --check-prefix=DYN
+
+## -z nodynamic-undefined-weak suppresses dynamic relocations.
+# RUN: ld.lld a.o -o a.so.n -shared -z dynamic-undefined-weak -z nodynamic-undefined-weak
+# RUN: llvm-readelf -r --hex-dump=.data a.so.n | FileCheck %s --check-prefix=STATIC
 
 # STATIC:      no relocations
 # STATIC:      Hex dump of section '.data':

From 9992668404cfb2302f7a62f01884c210642caea1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Thu, 12 Jun 2025 20:47:58 -0700
Subject: [PATCH 0227/1322] [flang][cuda] Add runtime check for passing device
 arrays (#144003)

---
 flang-rt/lib/cuda/descriptor.cpp              |  8 +++++++
 flang/include/flang/Lower/LoweringOptions.def |  3 +++
 .../Builder/Runtime/CUDA/Descriptor.h         |  5 +++++
 flang/include/flang/Runtime/CUDA/descriptor.h |  4 ++++
 flang/lib/Lower/ConvertCall.cpp               | 14 ++++++++++++
 .../Builder/Runtime/CUDA/Descriptor.cpp       | 15 +++++++++++++
 flang/test/Lower/CUDA/cuda-runtime-check.cuf  | 22 +++++++++++++++++++
 flang/tools/bbc/bbc.cpp                       |  2 ++
 8 files changed, 73 insertions(+)
 create mode 100644 flang/test/Lower/CUDA/cuda-runtime-check.cuf

diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
index 7b768f91af29..aa75d4eff051 100644
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -54,6 +54,14 @@ void RTDEF(CUFSyncGlobalDescriptor)(
   ((Descriptor *)devAddr, (Descriptor *)hostPtr, sourceFile, sourceLine);
 }
 
+void RTDEF(CUFDescriptorCheckSection)(
+    const Descriptor *desc, const char *sourceFile, int sourceLine) {
+  if (desc && !desc->IsContiguous()) {
+    Terminator terminator{sourceFile, sourceLine};
+    terminator.Crash("device array section argument is not contiguous");
+  }
+}
+
 RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime::cuda
diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def
index b062ea1a805a..d97abf4d864b 100644
--- a/flang/include/flang/Lower/LoweringOptions.def
+++ b/flang/include/flang/Lower/LoweringOptions.def
@@ -63,5 +63,8 @@ ENUM_LOWERINGOPT(StackRepackArrays, unsigned, 1, 0)
 /// in the leading dimension.
 ENUM_LOWERINGOPT(RepackArraysWhole, unsigned, 1, 0)
 
+/// If true, CUDA Fortran runtime check is inserted.
+ENUM_LOWERINGOPT(CUDARuntimeCheck, unsigned, 1, 0)
+
 #undef LOWERINGOPT
 #undef ENUM_LOWERINGOPT
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
index 14d262bf22a7..bdeb7574012c 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
@@ -26,6 +26,11 @@ namespace fir::runtime::cuda {
 void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
                              mlir::Value hostPtr);
 
+/// Generate runtime call to check the section of a descriptor and raise an
+/// error if it is not contiguous.
+void genDescriptorCheckSection(fir::FirOpBuilder &builder, mlir::Location loc,
+                               mlir::Value desc);
+
 } // namespace fir::runtime::cuda
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
index 0ee7feca10e4..06e4a4649db1 100644
--- a/flang/include/flang/Runtime/CUDA/descriptor.h
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -37,6 +37,10 @@ void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
 void RTDECL(CUFSyncGlobalDescriptor)(
     void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0);
 
+/// Check descriptor passed to a kernel.
+void RTDECL(CUFDescriptorCheckSection)(
+    const Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
+
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index 7378118cfef7..864499e6c343 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -26,6 +26,7 @@
 #include "flang/Optimizer/Builder/IntrinsicCall.h"
 #include "flang/Optimizer/Builder/LowLevelIntrinsics.h"
 #include "flang/Optimizer/Builder/MutableBox.h"
+#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
 #include "flang/Optimizer/Builder/Runtime/Derived.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
@@ -543,6 +544,19 @@ Fortran::lower::genCallOpAndResult(
   fir::FortranProcedureFlagsEnumAttr procAttrs =
       caller.getProcedureAttrs(builder.getContext());
 
+  if (converter.getLoweringOptions().getCUDARuntimeCheck()) {
+    if (caller.getCallDescription().chevrons().empty()) {
+      for (auto [oper, arg] :
+           llvm::zip(operands, caller.getPassedArguments())) {
+        if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(oper.getType())) {
+          const Fortran::semantics::Symbol *sym = caller.getDummySymbol(arg);
+          if (sym && Fortran::evaluate::IsCUDADeviceSymbol(*sym))
+            fir::runtime::cuda::genDescriptorCheckSection(builder, loc, oper);
+        }
+      }
+    }
+  }
+
   if (!caller.getCallDescription().chevrons().empty()) {
     // A call to a CUDA kernel with the chevron syntax.
 
diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
index 90662c094c65..a943469a7672 100644
--- a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
@@ -32,3 +32,18 @@ void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder,
       builder, loc, fTy, hostPtr, sourceFile, sourceLine)};
   builder.create<fir::CallOp>(loc, callee, args);
 }
+
+void fir::runtime::cuda::genDescriptorCheckSection(fir::FirOpBuilder &builder,
+                                                   mlir::Location loc,
+                                                   mlir::Value desc) {
+  mlir::func::FuncOp func =
+      fir::runtime::getRuntimeFunc<mkRTKey(CUFDescriptorCheckSection)>(loc,
+                                                                       builder);
+  auto fTy = func.getFunctionType();
+  mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+  mlir::Value sourceLine =
+      fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+      builder, loc, fTy, desc, sourceFile, sourceLine)};
+  builder.create<fir::CallOp>(loc, func, args);
+}
diff --git a/flang/test/Lower/CUDA/cuda-runtime-check.cuf b/flang/test/Lower/CUDA/cuda-runtime-check.cuf
new file mode 100644
index 000000000000..f26d372769ca
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-runtime-check.cuf
@@ -0,0 +1,22 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Check insertion of runtime checks
+
+interface
+  subroutine foo(a)
+    real, device, dimension(:,:) :: a
+  end subroutine
+end interface
+
+  real, device, allocatable, dimension(:,:) :: a
+  allocate(a(10,10))
+  call foo(a(1:10,1:10:2))
+end
+
+subroutine foo(a)
+  real, device, dimension(:,:) :: a
+end subroutine
+
+! CHECK-LABEL: func.func @_QQmain()
+! CHECK: fir.call @_FortranACUFDescriptorCheckSection
+! CHECK: fir.call @_QPfoo
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index c80872108ac8..015c86604a1f 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -434,6 +434,8 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
   loweringOptions.setStackRepackArrays(stackRepackArrays);
   loweringOptions.setRepackArrays(repackArrays);
   loweringOptions.setRepackArraysWhole(repackArraysWhole);
+  if (enableCUDA)
+    loweringOptions.setCUDARuntimeCheck(true);
   std::vector<Fortran::lower::EnvironmentDefault> envDefaults = {};
   Fortran::frontend::TargetOptions targetOpts;
   Fortran::frontend::CodeGenOptions cgOpts;

From 4268360003e2dc6721469aa5ccab7efbb29dcbfd Mon Sep 17 00:00:00 2001
From: Thirumalai Shaktivel
 <74826228+Thirumalai-Shaktivel@users.noreply.github.com>
Date: Fri, 13 Jun 2025 09:35:48 +0530
Subject: [PATCH 0228/1322] [Flang] [OpenMP] Allow any type as argument to the
 FlushOp (#143844)

Fixes: #143842
---
 flang/test/Lower/OpenMP/flush02.f90           | 32 +++++++++++++++++++
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |  2 +-
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Lower/OpenMP/flush02.f90

diff --git a/flang/test/Lower/OpenMP/flush02.f90 b/flang/test/Lower/OpenMP/flush02.f90
new file mode 100644
index 000000000000..b372e700e1a1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/flush02.f90
@@ -0,0 +1,32 @@
+! This test checks lowering of OpenMP Flush Directive.
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+module flush02_mod
+    type t1
+       integer(kind=4) :: x = 4
+    end type t1
+
+    type :: t2
+       type(t1) :: y = t1(2)
+    end type t2
+
+
+contains
+
+    subroutine sub01(pt)
+        class(t1), intent(inout) :: pt
+        type(t2)                 :: dt
+        integer, allocatable     :: a(:)
+        integer, pointer         :: b(:)
+
+        ! CHECK: omp.flush({{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+        ! CHECK: omp.flush({{.*}} : !fir.ref<f32>)
+        ! CHECK: omp.flush({{.*}} : !fir.ref<!fir.type<_QMflush02_modTt2{y:!fir.type<_QMflush02_modTt1{x:i32}>}>>)
+        ! CHECK: omp.flush({{.*}} : !fir.class<!fir.type<_QMflush02_modTt1{x:i32}>>)
+        !$omp flush(a)
+        !$omp flush(p)
+        !$omp flush(dt)
+        !$omp flush(pt)
+    end subroutine
+end module flush02_mod
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 036c6a6e350a..ac80926053a2 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -889,7 +889,7 @@ def FlushOp : OpenMP_Op<"flush", clauses = [
     specified or implied.
   }] # clausesDescription;
 
-  let arguments = !con((ins Variadic<OpenMP_PointerLikeType>:$varList),
+  let arguments = !con((ins Variadic<AnyType>:$varList),
                        clausesArgs);
 
   // Override inherited assembly format to include `varList`.

From cd573e0a547dba18e2a960967c1f24f124c6cb26 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 12 Jun 2025 22:45:08 -0700
Subject: [PATCH 0229/1322] [compiler-rt] Remove unused local variables (NFC)
 (#144010)

---
 compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp b/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp
index 93bf817a857b..c9210c78a063 100644
--- a/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp
+++ b/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp
@@ -265,8 +265,6 @@ int CollectDataFlow(const std::string &DFTBinary, const std::string &DirPath,
     // we then request tags in [0,Size/2) and [Size/2, Size), and so on.
     // Function number => DFT.
     auto OutPath = DirPlusFile(DirPath, Hash(FileToVector(F.File)));
-    std::unordered_map<size_t, std::vector<uint8_t>> DFTMap;
-    std::unordered_set<std::string> Cov;
     Command Cmd;
     Cmd.addArgument(DFTBinary);
     Cmd.addArgument(F.File);

From 752538c12cf4b37499f73e1bf05ea421ab055665 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 12 Jun 2025 22:45:15 -0700
Subject: [PATCH 0230/1322] [llvm-pdbutil] Remove an unused local variable
 (NFC) (#144011)

---
 llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
index 4cb64bdbe8ef..e50d19a994b6 100644
--- a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -1375,7 +1375,6 @@ static void mergePdbs() {
 }
 
 static void explain() {
-  std::unique_ptr<IPDBSession> Session;
   InputFile IF =
       ExitOnErr(InputFile::open(opts::explain::InputFilename.front(), true));
 

From 054f4a50bb2ec1e535111d779bc5fdc93314c55a Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 12 Jun 2025 22:45:23 -0700
Subject: [PATCH 0231/1322] [polly] Remove an unused local variable (NFC)
 (#144012)

---
 polly/lib/Support/RegisterPasses.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp
index 503c3ae1e07c..56cb8aadce3b 100644
--- a/polly/lib/Support/RegisterPasses.cpp
+++ b/polly/lib/Support/RegisterPasses.cpp
@@ -541,7 +541,6 @@ static bool
 parseTopLevelPipeline(llvm::ModulePassManager &MPM,
                       PassInstrumentationCallbacks *PIC,
                       ArrayRef<PassBuilder::PipelineElement> Pipeline) {
-  std::vector<PassBuilder::PipelineElement> FullPipeline;
   StringRef FirstName = Pipeline.front().Name;
 
   if (!isScopPassName(FirstName))

From dfc5125946ade289840fa119716957ebce2d31d2 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Thu, 12 Jun 2025 23:00:33 -0700
Subject: [PATCH 0232/1322] [NVPTX] Consistently check fast-math flags when
 lowering fsqrt (#143776)

Ensure that we check the global, function-level, and instruction-level
flags when considering whether to use `sqrt.rn` or `sqrt.approx` to
lower either `@llvm.sqrt.f32` or `@llvm.nvvm.sqrt.f`
---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp |   4 +-
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h   |   2 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp |  24 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h   |   3 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     |   3 -
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td    |  35 +-
 llvm/test/CodeGen/NVPTX/fast-math.ll        | 467 ++++++++++++++++----
 llvm/test/CodeGen/NVPTX/sqrt-approx.ll      | 339 +++++++++++---
 8 files changed, 695 insertions(+), 182 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index a20099788d09..79b1bfbc8072 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -71,8 +71,8 @@ NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
   return Subtarget->getTargetLowering()->getDivF32Level(*MF, *N);
 }
 
-bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
-  return Subtarget->getTargetLowering()->usePrecSqrtF32();
+bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
+  return Subtarget->getTargetLowering()->usePrecSqrtF32(*MF, N);
 }
 
 bool NVPTXDAGToDAGISel::useF32FTZ() const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 71a5b7ff8cd3..473f4781a6c3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -44,7 +44,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   bool doMulWide;
 
   NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const;
-  bool usePrecSqrtF32() const;
+  bool usePrecSqrtF32(const SDNode *N) const;
   bool useF32FTZ() const;
   bool allowFMA() const;
   bool allowUnsafeFPMath() const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d6a134d9abaf..492f4ab76fdb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -134,14 +134,23 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
   return NVPTX::DivPrecisionLevel::IEEE754;
 }
 
-bool NVPTXTargetLowering::usePrecSqrtF32() const {
-  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
-    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+bool NVPTXTargetLowering::usePrecSqrtF32(const MachineFunction &MF,
+                                         const SDNode *N) const {
+  // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+  if (UsePrecSqrtF32.getNumOccurrences() > 0)
     return UsePrecSqrtF32;
-  } else {
-    // Otherwise, use sqrt.approx if fast math is enabled
-    return !getTargetMachine().Options.UnsafeFPMath;
+
+  // Otherwise, use sqrt.approx if fast math is enabled
+  if (allowUnsafeFPMath(MF))
+    return false;
+
+  if (N) {
+    const SDNodeFlags Flags = N->getFlags();
+    if (Flags.hasApproximateFuncs())
+      return false;
   }
+
+  return true;
 }
 
 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
@@ -1134,7 +1143,8 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                              bool &UseOneConst,
                                              bool Reciprocal) const {
   if (!(Enabled == ReciprocalEstimate::Enabled ||
-        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
+        (Enabled == ReciprocalEstimate::Unspecified &&
+         !usePrecSqrtF32(DAG.getMachineFunction()))))
     return SDValue();
 
   if (ExtraSteps == ReciprocalEstimate::Unspecified)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 8d71022a1f10..0a54a8fd71f3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -225,7 +225,8 @@ public:
 
   // Get whether we should use a precise or approximate 32-bit floating point
   // sqrt instruction.
-  bool usePrecSqrtF32() const;
+  bool usePrecSqrtF32(const MachineFunction &MF,
+                      const SDNode *N = nullptr) const;
 
   // Get whether we should use instructions that flush floating-point denormals
   // to sign-preserving zero.
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 9ca4e8d20650..fa521c040e8e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -151,9 +151,6 @@ def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
 
 def doMulWide      : Predicate<"doMulWide">;
 
-def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
-def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
-
 def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
 def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
 def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f52ff39c3e1a..b3c1296cf0ca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1520,15 +1520,18 @@ def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64", F64RT, F64RT, int_nvvm_sqrt_rz_
 def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64", F64RT, F64RT, int_nvvm_sqrt_rm_d>;
 def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64", F64RT, F64RT, int_nvvm_sqrt_rp_d>;
 
+def fsqrt_approx : PatFrags<(ops node:$a),
+                            [(fsqrt node:$a),
+                             (int_nvvm_sqrt_f node:$a)], [{
+  return !usePrecSqrtF32(N);
+}]>;
+
 // nvvm_sqrt intrinsic
-def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_RN_FTZ_F $a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
-def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_RN_F $a)>, Requires<[do_SQRTF32_RN]>;
-def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>;
-def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_APPROX_F $a)>;
+def : Pat<(int_nvvm_sqrt_f f32:$a), (INT_NVVM_SQRT_RN_FTZ_F $a)>, Requires<[doF32FTZ]>;
+def : Pat<(int_nvvm_sqrt_f f32:$a), (INT_NVVM_SQRT_RN_F $a)>;
+
+def : Pat<(fsqrt_approx f32:$a), (INT_NVVM_SQRT_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>;
+def : Pat<(fsqrt_approx f32:$a), (INT_NVVM_SQRT_APPROX_F $a)>;
 
 //
 // Rsqrt
@@ -1551,20 +1554,14 @@ def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_f f32:$a)),
 def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
          (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt]>;
-// same for int_nvvm_sqrt_f when non-precision sqrt is requested
-def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_F $a)>,
-         Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
-def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
-         Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
 
-def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)),
+// same for int_nvvm_sqrt_f when non-precision sqrt is requested
+def: Pat<(fdiv f32imm_1, (fsqrt_approx f32:$a)),
          (INT_NVVM_RSQRT_APPROX_F $a)>,
-         Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
-def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)),
+         Requires<[doRsqrtOpt, doNoF32FTZ]>;
+def: Pat<(fdiv f32imm_1, (fsqrt_approx f32:$a)),
          (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
-         Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
+         Requires<[doRsqrtOpt, doF32FTZ]>;
 //
 // Add
 //
diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll
index 4cb6a35e796f..bc48d242f88f 100644
--- a/llvm/test/CodeGen/NVPTX/fast-math.ll
+++ b/llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -1,58 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 declare float @llvm.sqrt.f32(float)
 declare double @llvm.sqrt.f64(double)
 
-; CHECK-LABEL: sqrt_div(
-; CHECK: sqrt.rn.f32
-; CHECK: div.rn.f32
 define float @sqrt_div(float %a, float %b) {
+; CHECK-LABEL: sqrt_div(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_param_0];
+; CHECK-NEXT:    sqrt.rn.f32 %r2, %r1;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_param_1];
+; CHECK-NEXT:    div.rn.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast(
-; CHECK: sqrt.rn.f32
-; CHECK: div.approx.f32
 define float @sqrt_div_fast(float %a, float %b) #0 {
+; CHECK-LABEL: sqrt_div_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_fast_param_1];
+; CHECK-NEXT:    div.approx.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast_ninf(
-; CHECK: sqrt.approx.f32
-; CHECK: div.approx.f32
 define float @sqrt_div_fast_ninf(float %a, float %b) #0 {
+; CHECK-LABEL: sqrt_div_fast_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_ninf_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    abs.f32 %r3, %r1;
+; CHECK-NEXT:    setp.lt.f32 %p1, %r3, 0f00800000;
+; CHECK-NEXT:    selp.f32 %r4, 0f00000000, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r5, [sqrt_div_fast_ninf_param_1];
+; CHECK-NEXT:    div.approx.f32 %r6, %r4, %r5;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %t1 = tail call ninf afn float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_ftz(
-; CHECK: sqrt.rn.ftz.f32
-; CHECK: div.rn.ftz.f32
 define float @sqrt_div_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: sqrt_div_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_ftz_param_0];
+; CHECK-NEXT:    sqrt.rn.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_ftz_param_1];
+; CHECK-NEXT:    div.rn.ftz.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast_ftz(
-; CHECK: sqrt.rn.ftz.f32
-; CHECK: div.approx.ftz.f32
 define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 {
+; CHECK-LABEL: sqrt_div_fast_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_ftz_param_0];
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_fast_ftz_param_1];
+; CHECK-NEXT:    div.approx.ftz.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast_ftz_ninf(
-; CHECK: sqrt.approx.ftz.f32
-; CHECK: div.approx.ftz.f32
 define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #0 #1 {
+; CHECK-LABEL: sqrt_div_fast_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_ftz_ninf_param_0];
+; CHECK-NEXT:    setp.eq.ftz.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    selp.f32 %r3, 0f00000000, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [sqrt_div_fast_ftz_ninf_param_1];
+; CHECK-NEXT:    div.approx.ftz.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %t1 = tail call ninf afn float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
@@ -61,69 +117,117 @@ define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #0 #1 {
 ; There are no fast-math or ftz versions of sqrt and div for f64.  We use
 ; reciprocal(rsqrt(x)) for sqrt(x), and emit a vanilla divide.
 
-; CHECK-LABEL: sqrt_div_fast_ftz_f64(
-; CHECK: sqrt.rn.f64
-; CHECK: div.rn.f64
 define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 {
+; CHECK-LABEL: sqrt_div_fast_ftz_f64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [sqrt_div_fast_ftz_f64_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [sqrt_div_fast_ftz_f64_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd4, %rd2, %rd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT:    ret;
   %t1 = tail call double @llvm.sqrt.f64(double %a)
   %t2 = fdiv double %t1, %b
   ret double %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast_ftz_f64_ninf(
-; CHECK: rsqrt.approx.f64
-; CHECK: rcp.approx.ftz.f64
-; CHECK: div.rn.f64
 define double @sqrt_div_fast_ftz_f64_ninf(double %a, double %b) #0 #1 {
+; CHECK-LABEL: sqrt_div_fast_ftz_f64_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [sqrt_div_fast_ftz_f64_ninf_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd2, 0d0010000000000000;
+; CHECK-NEXT:    rsqrt.approx.f64 %rd3, %rd1;
+; CHECK-NEXT:    rcp.approx.ftz.f64 %rd4, %rd3;
+; CHECK-NEXT:    selp.f64 %rd5, 0d0000000000000000, %rd4, %p1;
+; CHECK-NEXT:    ld.param.b64 %rd6, [sqrt_div_fast_ftz_f64_ninf_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd7, %rd5, %rd6;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
+; CHECK-NEXT:    ret;
   %t1 = tail call ninf afn double @llvm.sqrt.f64(double %a)
   %t2 = fdiv double %t1, %b
   ret double %t2
 }
 
-; CHECK-LABEL: rsqrt(
-; CHECK-NOT: rsqrt.approx
-; CHECK: sqrt.rn.f32
-; CHECK-NOT: rsqrt.approx
 define float @rsqrt(float %a) {
+; CHECK-LABEL: rsqrt(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [rsqrt_param_0];
+; CHECK-NEXT:    sqrt.rn.f32 %r2, %r1;
+; CHECK-NEXT:    rcp.rn.f32 %r3, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %b = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %b
   ret float %ret
 }
 
-; CHECK-LABEL: rsqrt_fast(
-; CHECK-NOT: div.
-; CHECK-NOT: sqrt.
-; CHECK: rsqrt.approx.f32
-; CHECK-NOT: div.
-; CHECK-NOT: sqrt.
 define float @rsqrt_fast(float %a) #0 {
+; CHECK-LABEL: rsqrt_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [rsqrt_fast_param_0];
+; CHECK-NEXT:    rsqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %b = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %b
   ret float %ret
 }
 
-; CHECK-LABEL: rsqrt_fast_ftz(
-; CHECK-NOT: div.
-; CHECK-NOT: sqrt.
-; CHECK: rsqrt.approx.ftz.f32
-; CHECK-NOT: div.
-; CHECK-NOT: sqrt.
 define float @rsqrt_fast_ftz(float %a) #0 #1 {
+; CHECK-LABEL: rsqrt_fast_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [rsqrt_fast_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %b = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %b
   ret float %ret
 }
 
-; CHECK-LABEL: fadd
-; CHECK: add.rn.f32
 define float @fadd(float %a, float %b) {
+; CHECK-LABEL: fadd(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fadd_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fadd_param_1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %t1 = fadd float %a, %b
   ret float %t1
 }
 
-; CHECK-LABEL: fadd_ftz
-; CHECK: add.rn.ftz.f32
 define float @fadd_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: fadd_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fadd_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fadd_ftz_param_1];
+; CHECK-NEXT:    add.rn.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %t1 = fadd float %a, %b
   ret float %t1
 }
@@ -131,41 +235,83 @@ define float @fadd_ftz(float %a, float %b) #1 {
 declare float @llvm.sin.f32(float)
 declare float @llvm.cos.f32(float)
 
-; CHECK-LABEL: fsin_approx_afn
-; CHECK:       sin.approx.f32
 define float @fsin_approx_afn(float %a) {
+; CHECK-LABEL: fsin_approx_afn(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fsin_approx_afn_param_0];
+; CHECK-NEXT:    sin.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = tail call afn float @llvm.sin.f32(float %a)
   ret float %r
 }
 
-; CHECK-LABEL: fcos_approx_afn
-; CHECK:       cos.approx.f32
 define float @fcos_approx_afn(float %a) {
+; CHECK-LABEL: fcos_approx_afn(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fcos_approx_afn_param_0];
+; CHECK-NEXT:    cos.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = tail call afn float @llvm.cos.f32(float %a)
   ret float %r
 }
 
-; CHECK-LABEL: fsin_approx
-; CHECK:       sin.approx.f32
 define float @fsin_approx(float %a) #0 {
+; CHECK-LABEL: fsin_approx(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fsin_approx_param_0];
+; CHECK-NEXT:    sin.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = tail call float @llvm.sin.f32(float %a)
   ret float %r
 }
 
-; CHECK-LABEL: fcos_approx
-; CHECK:       cos.approx.f32
 define float @fcos_approx(float %a) #0 {
+; CHECK-LABEL: fcos_approx(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fcos_approx_param_0];
+; CHECK-NEXT:    cos.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = tail call float @llvm.cos.f32(float %a)
   ret float %r
 }
 
-; CHECK-LABEL: repeated_div_recip_allowed
 define float @repeated_div_recip_allowed(i1 %pred, float %a, float %b, float %divisor) {
-; CHECK: rcp.rn.f32
-; CHECK: mul.rn.f32
-; CHECK: mul.rn.f32
-; CHECK: mul.rn.f32
-; CHECK: selp.f32
+; CHECK-LABEL: repeated_div_recip_allowed(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_param_3];
+; CHECK-NEXT:    rcp.rn.f32 %r3, %r2;
+; CHECK-NEXT:    mul.rn.f32 %r4, %r1, %r3;
+; CHECK-NEXT:    ld.param.b32 %r5, [repeated_div_recip_allowed_param_2];
+; CHECK-NEXT:    mul.rn.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    mul.rn.f32 %r7, %r4, %r6;
+; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
   %z = fmul float %x, %y
@@ -173,23 +319,51 @@ define float @repeated_div_recip_allowed(i1 %pred, float %a, float %b, float %di
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_recip_allowed_sel
 define float @repeated_div_recip_allowed_sel(i1 %pred, float %a, float %b, float %divisor) {
-; CHECK: selp.f32
-; CHECK: div.rn.f32
+; CHECK-LABEL: repeated_div_recip_allowed_sel(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_sel_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_sel_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_sel_param_2];
+; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_recip_allowed_sel_param_3];
+; CHECK-NEXT:    div.rn.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_recip_allowed_ftz
 define float @repeated_div_recip_allowed_ftz(i1 %pred, float %a, float %b, float %divisor) #1 {
-; CHECK: rcp.rn.ftz.f32
-; CHECK: mul.rn.ftz.f32
-; CHECK: mul.rn.ftz.f32
-; CHECK: mul.rn.ftz.f32
-; CHECK: selp.f32
+; CHECK-LABEL: repeated_div_recip_allowed_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_ftz_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_ftz_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_param_3];
+; CHECK-NEXT:    rcp.rn.ftz.f32 %r3, %r2;
+; CHECK-NEXT:    mul.rn.ftz.f32 %r4, %r1, %r3;
+; CHECK-NEXT:    ld.param.b32 %r5, [repeated_div_recip_allowed_ftz_param_2];
+; CHECK-NEXT:    mul.rn.ftz.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    mul.rn.ftz.f32 %r7, %r4, %r6;
+; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
   %z = fmul float %x, %y
@@ -197,23 +371,51 @@ define float @repeated_div_recip_allowed_ftz(i1 %pred, float %a, float %b, float
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_recip_allowed_ftz_sel
 define float @repeated_div_recip_allowed_ftz_sel(i1 %pred, float %a, float %b, float %divisor) #1 {
-; CHECK: selp.f32
-; CHECK: div.rn.ftz.f32
+; CHECK-LABEL: repeated_div_recip_allowed_ftz_sel(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_ftz_sel_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_ftz_sel_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_sel_param_2];
+; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_recip_allowed_ftz_sel_param_3];
+; CHECK-NEXT:    div.rn.ftz.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_fast
 define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0 {
-; CHECK: rcp.approx.f32
-; CHECK: mul.f32
-; CHECK: mul.f32
-; CHECK: mul.f32
-; CHECK: selp.f32
+; CHECK-LABEL: repeated_div_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_param_3];
+; CHECK-NEXT:    rcp.approx.f32 %r3, %r2;
+; CHECK-NEXT:    mul.f32 %r4, %r1, %r3;
+; CHECK-NEXT:    ld.param.b32 %r5, [repeated_div_fast_param_2];
+; CHECK-NEXT:    mul.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    mul.f32 %r7, %r4, %r6;
+; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %x = fdiv float %a, %divisor
   %y = fdiv float %b, %divisor
   %z = fmul float %x, %y
@@ -221,23 +423,51 @@ define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_fast_sel
 define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor) #0 {
-; CHECK: selp.f32
-; CHECK: div.approx.f32
+; CHECK-LABEL: repeated_div_fast_sel(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_sel_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_sel_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_sel_param_2];
+; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_fast_sel_param_3];
+; CHECK-NEXT:    div.approx.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %x = fdiv float %a, %divisor
   %y = fdiv float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_fast_ftz
 define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor) #0 #1 {
-; CHECK: rcp.approx.ftz.f32
-; CHECK: mul.ftz.f32
-; CHECK: mul.ftz.f32
-; CHECK: mul.ftz.f32
-; CHECK: selp.f32
+; CHECK-LABEL: repeated_div_fast_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_ftz_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_ftz_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_ftz_param_3];
+; CHECK-NEXT:    rcp.approx.ftz.f32 %r3, %r2;
+; CHECK-NEXT:    mul.ftz.f32 %r4, %r1, %r3;
+; CHECK-NEXT:    ld.param.b32 %r5, [repeated_div_fast_ftz_param_2];
+; CHECK-NEXT:    mul.ftz.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    mul.ftz.f32 %r7, %r4, %r6;
+; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %x = fdiv float %a, %divisor
   %y = fdiv float %b, %divisor
   %z = fmul float %x, %y
@@ -245,33 +475,80 @@ define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_fast_ftz_sel
 define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %divisor) #0 #1 {
-; CHECK: selp.f32
-; CHECK: div.approx.ftz.f32
+; CHECK-LABEL: repeated_div_fast_ftz_sel(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_ftz_sel_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_ftz_sel_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_ftz_sel_param_2];
+; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_fast_ftz_sel_param_3];
+; CHECK-NEXT:    div.approx.ftz.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %x = fdiv float %a, %divisor
   %y = fdiv float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-; CHECK-LABEL: frem
 define float @frem(float %a, float %b) #0 {
-  ; CHECK-NOT: testp.infinite
+; CHECK-LABEL: frem(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [frem_param_1];
+; CHECK-NEXT:    div.approx.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %rem = frem float %a, %b
   ret float %rem
 }
 
-; CHECK-LABEL: frem_ftz
 define float @frem_ftz(float %a, float %b) #0 #1 {
-  ; CHECK-NOT: testp.infinite
+; CHECK-LABEL: frem_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [frem_ftz_param_1];
+; CHECK-NEXT:    div.approx.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.ftz.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.ftz.f32 %r6, %r5, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %rem = frem float %a, %b
   ret float %rem
 }
 
-; CHECK-LABEL: frem_f64
 define double @frem_f64(double %a, double %b) #0 {
-  ; CHECK-NOT: testp.infinite
+; CHECK-LABEL: frem_f64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [frem_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [frem_f64_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; CHECK-NEXT:    neg.f64 %rd5, %rd4;
+; CHECK-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ret;
   %rem = frem double %a, %b
   ret double %rem
 }
diff --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
index e2a3f1cf0d2d..a28d264cd8ec 100644
--- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
 ; RUN:   | FileCheck %s
 ; RUN: %if ptxas %{                                                                   \
@@ -12,34 +13,62 @@ declare double @llvm.sqrt.f64(double)
 
 ; -- reciprocal sqrt --
 
-; CHECK-LABEL: test_rsqrt32
 define float @test_rsqrt32(float %a) #0 {
-; CHECK: rsqrt.approx.f32
+; CHECK-LABEL: test_rsqrt32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rsqrt32_param_0];
+; CHECK-NEXT:    rsqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %val
   ret float %ret
 }
 
-; CHECK-LABEL: test_rsqrt_ftz
 define float @test_rsqrt_ftz(float %a) #0 #1 {
-; CHECK: rsqrt.approx.ftz.f32
+; CHECK-LABEL: test_rsqrt_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rsqrt_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %val
   ret float %ret
 }
 
-; CHECK-LABEL: test_rsqrt64
 define double @test_rsqrt64(double %a) #0 {
-; CHECK: rsqrt.approx.f64
+; CHECK-LABEL: test_rsqrt64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rsqrt64_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
   %ret = fdiv double 1.0, %val
   ret double %ret
 }
 
-; CHECK-LABEL: test_rsqrt64_ftz
-define double @test_rsqrt64_ftz(double %a) #0 #1 {
 ; There's no rsqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
-; CHECK: rsqrt.approx.f64
+define double @test_rsqrt64_ftz(double %a) #0 #1 {
+; CHECK-LABEL: test_rsqrt64_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rsqrt64_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
   %ret = fdiv double 1.0, %val
   ret double %ret
@@ -47,64 +76,135 @@ define double @test_rsqrt64_ftz(double %a) #0 #1 {
 
 ; -- sqrt --
 
-; CHECK-LABEL: test_sqrt32
 define float @test_sqrt32(float %a) #0 {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_sqrt32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_ninf
 define float @test_sqrt32_ninf(float %a) #0 {
-; CHECK: sqrt.approx.f32
+; CHECK-LABEL: test_sqrt32_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_ninf_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    abs.f32 %r3, %r1;
+; CHECK-NEXT:    setp.lt.f32 %p1, %r3, 0f00800000;
+; CHECK-NEXT:    selp.f32 %r4, 0f00000000, %r2, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt_ftz
 define float @test_sqrt_ftz(float %a) #0 #1 {
-; CHECK: sqrt.rn.ftz.f32
+; CHECK-LABEL: test_sqrt_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_ftz_param_0];
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt_ftz_ninf
 define float @test_sqrt_ftz_ninf(float %a) #0 #1 {
-; CHECK: sqrt.approx.ftz.f32
+; CHECK-LABEL: test_sqrt_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_ftz_ninf_param_0];
+; CHECK-NEXT:    setp.eq.ftz.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    selp.f32 %r3, 0f00000000, %r2, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt64
 define double @test_sqrt64(double %a) #0 {
-; CHECK: sqrt.rn.f64
+; CHECK-LABEL: test_sqrt64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_ninf
-define double @test_sqrt64_ninf(double %a) #0 {
 ; There's no sqrt.approx.f64 instruction; we emit
 ; reciprocal(rsqrt.approx.f64(x)).  There's no non-ftz approximate reciprocal,
 ; so we just use the ftz version.
-; CHECK: rsqrt.approx.f64
-; CHECK: rcp.approx.ftz.f64
+define double @test_sqrt64_ninf(double %a) #0 {
+; CHECK-LABEL: test_sqrt64_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_ninf_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd2, 0d0010000000000000;
+; CHECK-NEXT:    rsqrt.approx.f64 %rd3, %rd1;
+; CHECK-NEXT:    rcp.approx.ftz.f64 %rd4, %rd3;
+; CHECK-NEXT:    selp.f64 %rd5, 0d0000000000000000, %rd4, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd5;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_ftz
 define double @test_sqrt64_ftz(double %a) #0 #1 {
-; CHECK: sqrt.rn.f64
+; CHECK-LABEL: test_sqrt64_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_ftz_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_ftz_ninf
-define double @test_sqrt64_ftz_ninf(double %a) #0 #1 {
 ; There's no sqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
-; CHECK: rsqrt.approx.f64
-; CHECK: rcp.approx.ftz.f64
+define double @test_sqrt64_ftz_ninf(double %a) #0 #1 {
+; CHECK-LABEL: test_sqrt64_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_ftz_ninf_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd2, 0d0010000000000000;
+; CHECK-NEXT:    rsqrt.approx.f64 %rd3, %rd1;
+; CHECK-NEXT:    rcp.approx.ftz.f64 %rd4, %rd3;
+; CHECK-NEXT:    selp.f64 %rd5, 0d0000000000000000, %rd4, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd5;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
@@ -114,93 +214,224 @@ define double @test_sqrt64_ftz_ninf(double %a) #0 #1 {
 ; The sqrt and rsqrt refinement algorithms both emit an rsqrt.approx, followed
 ; by some math.
 
-; CHECK-LABEL: test_rsqrt32_refined
 define float @test_rsqrt32_refined(float %a) #0 #2 {
-; CHECK: rsqrt.approx.f32
+; CHECK-LABEL: test_rsqrt32_refined(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rsqrt32_refined_param_0];
+; CHECK-NEXT:    rsqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    mul.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    fma.rn.f32 %r4, %r3, %r2, 0fC0400000;
+; CHECK-NEXT:    mul.f32 %r5, %r2, 0fBF000000;
+; CHECK-NEXT:    mul.f32 %r6, %r5, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %val
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_refined
 define float @test_sqrt32_refined(float %a) #0 #2 {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_sqrt32_refined(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_refined_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_refined_ninf
 define float @test_sqrt32_refined_ninf(float %a) #0 #2 {
-; CHECK: rsqrt.approx.f32
+; CHECK-LABEL: test_sqrt32_refined_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_refined_ninf_param_0];
+; CHECK-NEXT:    rsqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    mul.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    fma.rn.f32 %r4, %r3, %r2, 0fC0400000;
+; CHECK-NEXT:    mul.f32 %r5, %r3, 0fBF000000;
+; CHECK-NEXT:    mul.f32 %r6, %r5, %r4;
+; CHECK-NEXT:    abs.f32 %r7, %r1;
+; CHECK-NEXT:    setp.lt.f32 %p1, %r7, 0f00800000;
+; CHECK-NEXT:    selp.f32 %r8, 0f00000000, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_rsqrt64_refined
 define double @test_rsqrt64_refined(double %a) #0 #2 {
-; CHECK: rsqrt.approx.f64
+; CHECK-LABEL: test_rsqrt64_refined(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rsqrt64_refined_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    mul.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, 0dC008000000000000;
+; CHECK-NEXT:    mul.f64 %rd5, %rd2, 0dBFE0000000000000;
+; CHECK-NEXT:    mul.f64 %rd6, %rd5, %rd4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
   %ret = fdiv double 1.0, %val
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_refined
 define double @test_sqrt64_refined(double %a) #0 #2 {
-; CHECK: sqrt.rn.f64
+; CHECK-LABEL: test_sqrt64_refined(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_refined_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_refined_ninf
 define double @test_sqrt64_refined_ninf(double %a) #0 #2 {
-; CHECK: rsqrt.approx.f64
+; CHECK-LABEL: test_sqrt64_refined_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_refined_ninf_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    mul.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, 0dC008000000000000;
+; CHECK-NEXT:    mul.f64 %rd5, %rd3, 0dBFE0000000000000;
+; CHECK-NEXT:    mul.f64 %rd6, %rd5, %rd4;
+; CHECK-NEXT:    abs.f64 %rd7, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd7, 0d0010000000000000;
+; CHECK-NEXT:    selp.f64 %rd8, 0d0000000000000000, %rd6, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd8;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
 ; -- refined sqrt and rsqrt with ftz enabled --
 
-; CHECK-LABEL: test_rsqrt32_refined_ftz
 define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 {
-; CHECK: rsqrt.approx.ftz.f32
+; CHECK-LABEL: test_rsqrt32_refined_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rsqrt32_refined_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    mul.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    fma.rn.ftz.f32 %r4, %r3, %r2, 0fC0400000;
+; CHECK-NEXT:    mul.ftz.f32 %r5, %r2, 0fBF000000;
+; CHECK-NEXT:    mul.ftz.f32 %r6, %r5, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %val
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_refined_ftz
 define float @test_sqrt32_refined_ftz(float %a) #0 #1 #2 {
-; CHECK: sqrt.rn.ftz.f32
+; CHECK-LABEL: test_sqrt32_refined_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_refined_ftz_param_0];
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_refined_ftz_ninf
 define float @test_sqrt32_refined_ftz_ninf(float %a) #0 #1 #2 {
-; CHECK: rsqrt.approx.ftz.f32
+; CHECK-LABEL: test_sqrt32_refined_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_refined_ftz_ninf_param_0];
+; CHECK-NEXT:    rsqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    mul.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    fma.rn.ftz.f32 %r4, %r3, %r2, 0fC0400000;
+; CHECK-NEXT:    mul.ftz.f32 %r5, %r3, 0fBF000000;
+; CHECK-NEXT:    mul.ftz.f32 %r6, %r5, %r4;
+; CHECK-NEXT:    setp.eq.ftz.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT:    selp.f32 %r7, 0f00000000, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_rsqrt64_refined_ftz
-define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 {
 ; There's no rsqrt.approx.ftz.f64, so we just use the non-ftz version.
-; CHECK: rsqrt.approx.f64
+define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 {
+; CHECK-LABEL: test_rsqrt64_refined_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rsqrt64_refined_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    mul.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, 0dC008000000000000;
+; CHECK-NEXT:    mul.f64 %rd5, %rd2, 0dBFE0000000000000;
+; CHECK-NEXT:    mul.f64 %rd6, %rd5, %rd4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
   %ret = fdiv double 1.0, %val
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_refined_ftz
 define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 {
-; CHECK: sqrt.rn.f64
+; CHECK-LABEL: test_sqrt64_refined_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_refined_ftz_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_refined_ftz_ninf
 define double @test_sqrt64_refined_ftz_ninf(double %a) #0 #1 #2 {
-; CHECK: rsqrt.approx.f64
+; CHECK-LABEL: test_sqrt64_refined_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_refined_ftz_ninf_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    mul.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, 0dC008000000000000;
+; CHECK-NEXT:    mul.f64 %rd5, %rd3, 0dBFE0000000000000;
+; CHECK-NEXT:    mul.f64 %rd6, %rd5, %rd4;
+; CHECK-NEXT:    abs.f64 %rd7, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd7, 0d0010000000000000;
+; CHECK-NEXT:    selp.f64 %rd8, 0d0000000000000000, %rd6, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd8;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
   ret double %ret
 }

From 432d06ab919ae18c4ed1e94148448501578a6c85 Mon Sep 17 00:00:00 2001
From: Saiyedul Islam <Saiyedul.Islam@amd.com>
Date: Fri, 13 Jun 2025 11:33:52 +0530
Subject: [PATCH 0233/1322] [NFC][AMDGPU] Fix stale links to ROCm repositories
 (#143949)

Following GitHub organizations were merged into the ROCm org:
  * ROCm-Developer-Tools
  * RadeonOpenCompute
  * ROCmSoftwarePlatform

Ensure that all hyperlinks to the old organizations now point to the new
organization at https://github.com/ROCm.
---
 clang/docs/HIPSupport.rst                          | 14 +++++++-------
 ...GPUDwarfExtensionsForHeterogeneousDebugging.rst |  2 +-
 llvm/docs/AMDGPUUsage.rst                          |  4 ++--
 llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll     |  2 +-
 mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h    |  4 ++--
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index 051a25396994..406e1c8e5a2f 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -17,7 +17,7 @@
 HIP Support
 =============
 
-HIP (Heterogeneous-Compute Interface for Portability) `<https://github.com/ROCm-Developer-Tools/HIP>`_ is
+HIP (Heterogeneous-Compute Interface for Portability) `<https://github.com/ROCm/HIP>`_ is
 a C++ Runtime API and Kernel Language. It enables developers to create portable applications for
 offloading computation to different hardware platforms from a single source code.
 
@@ -41,9 +41,9 @@ backend or the out-of-tree LLVM-SPIRV translator. The SPIR-V is then bundled and
 .. note::
    While Clang does not directly provide HIP support for NVIDIA GPUs and CPUs, these platforms are supported via other means:
 
-   - NVIDIA GPUs: HIP support is offered through the HIP project `<https://github.com/ROCm-Developer-Tools/HIP>`_, which provides a header-only library for translating HIP runtime APIs into CUDA runtime APIs. The code is subsequently compiled using NVIDIA's `nvcc`.
+   - NVIDIA GPUs: HIP support is offered through the HIP project `<https://github.com/ROCm/HIP>`_, which provides a header-only library for translating HIP runtime APIs into CUDA runtime APIs. The code is subsequently compiled using NVIDIA's `nvcc`.
 
-   - CPUs: HIP support is available through the HIP-CPU runtime library `<https://github.com/ROCm-Developer-Tools/HIP-CPU>`_. This header-only library enables CPUs to execute unmodified HIP code.
+   - CPUs: HIP support is available through the HIP-CPU runtime library `<https://github.com/ROCm/HIP-CPU>`_. This header-only library enables CPUs to execute unmodified HIP code.
 
 
 Example Usage
@@ -328,7 +328,7 @@ The `parallel_unsequenced_policy <https://en.cppreference.com/w/cpp/algorithm/ex
 maps relatively well to the execution model of AMD GPUs. This, coupled with the
 the availability and maturity of GPU accelerated algorithm libraries that
 implement most / all corresponding algorithms in the standard library
-(e.g. `rocThrust <https://github.com/ROCmSoftwarePlatform/rocThrust>`__), makes
+(e.g. `rocThrust <https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocthrust>`__), makes
 it feasible to provide seamless accelerator offload for supported algorithms,
 when an accelerated version exists. Thus, it becomes possible to easily access
 the computational resources of an AMD accelerator, via a well specified,
@@ -483,7 +483,7 @@ such as GPUs, work.
      allocation / deallocation functions with accelerator-aware equivalents,
      based on a pre-established table; the list of functions that can be
      interposed is available
-     `here <https://github.com/ROCmSoftwarePlatform/roc-stdpar#allocation--deallocation-interposition-status>`__;
+     `here <https://github.com/ROCm/roc-stdpar#allocation--deallocation-interposition-status>`__;
    - This is only run when compiling for the host.
 
 The second pass is optional.
@@ -627,7 +627,7 @@ Linux operating system. Support is synthesised in the following table:
 The minimum Linux kernel version for running in HMM mode is 6.4.
 
 The forwarding header can be obtained from
-`its GitHub repository <https://github.com/ROCmSoftwarePlatform/roc-stdpar>`_.
+`its GitHub repository <https://github.com/ROCm/roc-stdpar>`_.
 It will be packaged with a future `ROCm <https://rocm.docs.amd.com/en/latest/>`_
 release. Because accelerated algorithms are provided via
 `rocThrust <https://rocm.docs.amd.com/projects/rocThrust/en/latest/>`_, a
@@ -636,7 +636,7 @@ transitive dependency on
 can be obtained either by installing their associated components of the
 `ROCm <https://rocm.docs.amd.com/en/latest/>`_ stack, or from their respective
 repositories. The list algorithms that can be offloaded is available
-`here <https://github.com/ROCmSoftwarePlatform/roc-stdpar#algorithm-support-status>`_.
+`here <https://github.com/ROCm/roc-stdpar#algorithm-support-status>`_.
 
 HIP Specific Elements
 ---------------------
diff --git a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
index 0249c580964a..95ae4f74e0ea 100644
--- a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
+++ b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
@@ -5323,7 +5323,7 @@ D. References
 
     .. _amdgpu-dwarf-AMD-ROCgdb:
 
-2.  [AMD-ROCgdb] `AMD ROCm Debugger (ROCgdb) <https://github.com/ROCm-Developer-Tools/ROCgdb>`__
+2.  [AMD-ROCgdb] `AMD ROCm Debugger (ROCgdb) <https://github.com/ROCm/ROCgdb>`__
 
     .. _amdgpu-dwarf-AMD-ROCm:
 
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 174a497c51b2..3aa8773fa506 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -18435,8 +18435,8 @@ Additional Documentation
 .. [AMD-RADEON-HD-5000] `AMD Evergreen shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_Evergreen-Family_Instruction_Set_Architecture.pdf>`__
 .. [AMD-RADEON-HD-6000] `AMD Cayman/Trinity shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_HD_6900_Series_Instruction_Set_Architecture.pdf>`__
 .. [AMD-ROCm] `AMD ROCm™ Platform <https://rocmdocs.amd.com/>`__
-.. [AMD-ROCm-github] `AMD ROCm™ github <http://github.com/RadeonOpenCompute>`__
-.. [AMD-ROCm-Release-Notes] `AMD ROCm Release Notes <https://github.com/RadeonOpenCompute/ROCm>`__
+.. [AMD-ROCm-github] `AMD ROCm™ github <http://github.com/ROCm>`__
+.. [AMD-ROCm-Release-Notes] `AMD ROCm Release Notes <https://github.com/ROCm/ROCm>`__
 .. [CLANG-ATTR] `Attributes in Clang <https://clang.llvm.org/docs/AttributeReference.html>`__
 .. [DWARF] `DWARF Debugging Information Format <http://dwarfstd.org/>`__
 .. [ELF] `Executable and Linkable Format (ELF) <http://www.sco.com/developers/gabi/>`__
diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
index 049142732aa1..a84e261357de 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
@@ -182,7 +182,7 @@ attributes #5 = { nounwind }
 !10 = !{i32 7, !"frame-pointer", i32 2}
 !11 = !{i32 4, !"amdgpu_hostcall", i32 1}
 !12 = !{!"clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)"}
-!13 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"}
+!13 = !{!"AMD clang version 17.0.0 (https://github.com/ROCm/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"}
 !14 = !{i32 2, i32 0}
 !15 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12_debug__", scope: !16, file: !16, line: 13, type: !17, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22)
 !16 = !DIFile(filename: "test.c", directory: "/tmp")
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
index c2a82ffc1c43..ce1fe5a03c49 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
@@ -13,8 +13,8 @@
 // pointed to here. However the following links contain more information about
 // ROCDL (ROCm-Device-Library)
 //
-// https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/amd-stg-open/doc/OCML.md
-// https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/amd-stg-open/doc/OCKL.md
+// https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/doc/OCML.md
+// https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/doc/OCKL.md
 // https://llvm.org/docs/AMDGPUUsage.html
 //
 //===----------------------------------------------------------------------===//

From 4e80a033a1bade55bca8a32e267cf1b06d05b1ed Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Thu, 12 Jun 2025 23:09:55 -0700
Subject: [PATCH 0234/1322] [NVPTX] Use prmt.f4e to lower pointer alignment
 fshr idiom (#143407)

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td |  4 ++++
 llvm/test/CodeGen/NVPTX/prmt.ll         | 21 +++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index fa521c040e8e..4c3501df57f8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1621,6 +1621,10 @@ let hasSideEffects = false in {
 
 }
 
+// PRMT folding patterns
+def : Pat<(fshr i32:$hi, i32:$lo, (shl i32:$amt, (i32 3))),
+          (PRMT_B32rrr $lo, $hi, $amt, PrmtF4E)>;
+
 
 // byte extraction + signed/unsigned extension to i32.
 def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),
diff --git a/llvm/test/CodeGen/NVPTX/prmt.ll b/llvm/test/CodeGen/NVPTX/prmt.ll
index 271e4c86cd23..48b9eefb9fb3 100644
--- a/llvm/test/CodeGen/NVPTX/prmt.ll
+++ b/llvm/test/CodeGen/NVPTX/prmt.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -verify-machineinstrs -mcpu=sm_50 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -verify-machineinstrs -mcpu=sm_50 | %ptxas-verify %}
 
 target triple = "nvptx64-nvidia-cuda"
 
@@ -111,3 +111,20 @@ define i32 @test_prmt_rc16(i32 %lo, i32 %selector) {
   %val = call i32 @llvm.nvvm.prmt.rc16(i32 %lo, i32 %selector)
   ret i32 %val
 }
+
+define i32 @test_prmt_f4e_folding(i32 %lo, i32 %hi, i32 %ptr) {
+; CHECK-LABEL: test_prmt_f4e_folding(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_prmt_f4e_folding_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_prmt_f4e_folding_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_prmt_f4e_folding_param_2];
+; CHECK-NEXT:    prmt.b32.f4e %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
+  %sh_amt = shl i32 %ptr, 3
+  %val = call i32 @llvm.fshr.i32(i32 %hi, i32 %lo, i32 %sh_amt)
+  ret i32 %val
+}

From f64b3bb276e820f00911dbf6ecc484751daeb5f1 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Fri, 13 Jun 2025 08:21:56 +0200
Subject: [PATCH 0235/1322] [mlir][llvm] Op interface LLVM converter (#143922)

Adds a utility conversion class for rewriting op interface instances
targeting LLVM dialect.
---
 .../mlir/Conversion/LLVMCommon/Pattern.h      | 45 +++++++++++++++++++
 .../AMX/Transforms/LegalizeForLLVMExport.cpp  | 15 ++-----
 .../Transforms/LegalizeForLLVMExport.cpp      | 15 ++-----
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h
index 7e946495e3e7..503a2a7e6f0c 100644
--- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h
@@ -92,6 +92,10 @@ public:
                        PatternBenefit benefit = 1);
 
 protected:
+  /// See `ConversionPattern::ConversionPattern` for information on the other
+  /// available constructors.
+  using ConversionPattern::ConversionPattern;
+
   /// Returns the LLVM dialect.
   LLVM::LLVMDialect &getDialect() const;
 
@@ -234,6 +238,47 @@ private:
   using ConvertToLLVMPattern::matchAndRewrite;
 };
 
+/// Utility class for operation conversions targeting the LLVM dialect that
+/// allows for matching and rewriting against an instance of an OpInterface
+/// class.
+template <typename SourceOp>
+class ConvertOpInterfaceToLLVMPattern : public ConvertToLLVMPattern {
+public:
+  explicit ConvertOpInterfaceToLLVMPattern(
+      const LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1)
+      : ConvertToLLVMPattern(typeConverter, Pattern::MatchInterfaceOpTypeTag(),
+                             SourceOp::getInterfaceID(), benefit,
+                             &typeConverter.getContext()) {}
+
+  /// Wrappers around the RewritePattern methods that pass the derived op type.
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    return matchAndRewrite(cast<SourceOp>(op), operands, rewriter);
+  }
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<ValueRange> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    return matchAndRewrite(cast<SourceOp>(op), operands, rewriter);
+  }
+
+  /// Methods that operate on the SourceOp type. One of these must be
+  /// overridden by the derived pattern class.
+  virtual LogicalResult
+  matchAndRewrite(SourceOp op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const {
+    llvm_unreachable("matchAndRewrite is not implemented");
+  }
+  virtual LogicalResult
+  matchAndRewrite(SourceOp op, ArrayRef<ValueRange> operands,
+                  ConversionPatternRewriter &rewriter) const {
+    return matchAndRewrite(op, getOneToOneAdaptorOperands(operands), rewriter);
+  }
+
+private:
+  using ConvertToLLVMPattern::matchAndRewrite;
+};
+
 /// Generic implementation of one-to-one conversion from "SourceOp" to
 /// "TargetOp" where the latter belongs to the LLVM dialect or an equivalent.
 /// Upholds a convention that multi-result operations get converted into an
diff --git a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
index 37aebc9fab3e..06e5f7c2196d 100644
--- a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
+++ b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
@@ -24,27 +24,18 @@ namespace {
 /// Generic one-to-one conversion of simply mappable operations into calls
 /// to their respective LLVM intrinsics.
 struct AMXIntrinsicOpConversion
-    : public OpInterfaceConversionPattern<amx::AMXIntrinsicOp> {
-  using OpInterfaceConversionPattern<
-      amx::AMXIntrinsicOp>::OpInterfaceConversionPattern;
-
-  AMXIntrinsicOpConversion(const LLVMTypeConverter &typeConverter,
-                           PatternBenefit benefit = 1)
-      : OpInterfaceConversionPattern(typeConverter, &typeConverter.getContext(),
-                                     benefit),
-        typeConverter(typeConverter) {}
+    : public ConvertOpInterfaceToLLVMPattern<amx::AMXIntrinsicOp> {
+  using ConvertOpInterfaceToLLVMPattern::ConvertOpInterfaceToLLVMPattern;
 
   LogicalResult
   matchAndRewrite(amx::AMXIntrinsicOp op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override {
+    const LLVMTypeConverter &typeConverter = *getTypeConverter();
     return LLVM::detail::intrinsicRewrite(
         op, rewriter.getStringAttr(op.getIntrinsicName()),
         op.getIntrinsicOperands(operands, typeConverter, rewriter),
         typeConverter, rewriter);
   }
-
-private:
-  const LLVMTypeConverter &typeConverter;
 };
 
 } // namespace
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp
index b2fc2f3f40e8..8e062488f58c 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp
+++ b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp
@@ -23,27 +23,18 @@ namespace {
 /// Generic one-to-one conversion of simply mappable operations into calls
 /// to their respective LLVM intrinsics.
 struct X86IntrinsicOpConversion
-    : public OpInterfaceConversionPattern<x86vector::X86IntrinsicOp> {
-  using OpInterfaceConversionPattern<
-      x86vector::X86IntrinsicOp>::OpInterfaceConversionPattern;
-
-  X86IntrinsicOpConversion(const LLVMTypeConverter &typeConverter,
-                           PatternBenefit benefit = 1)
-      : OpInterfaceConversionPattern(typeConverter, &typeConverter.getContext(),
-                                     benefit),
-        typeConverter(typeConverter) {}
+    : public ConvertOpInterfaceToLLVMPattern<x86vector::X86IntrinsicOp> {
+  using ConvertOpInterfaceToLLVMPattern::ConvertOpInterfaceToLLVMPattern;
 
   LogicalResult
   matchAndRewrite(x86vector::X86IntrinsicOp op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override {
+    const LLVMTypeConverter &typeConverter = *getTypeConverter();
     return LLVM::detail::intrinsicRewrite(
         op, rewriter.getStringAttr(op.getIntrinsicName()),
         op.getIntrinsicOperands(operands, typeConverter, rewriter),
         typeConverter, rewriter);
   }
-
-private:
-  const LLVMTypeConverter &typeConverter;
 };
 
 } // namespace

From 483d19619c3221c1d54080e57e43052eb863436a Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Fri, 13 Jun 2025 14:26:50 +0800
Subject: [PATCH 0236/1322] [RISCV] Add tune features for Andes 45 series cpus
 (#143899)

Add tune features TuneNoDefaultUnroll, TuneShortForwardBranchOpt and
TunePostRAScheduler for Andes 45 series cpus.
---
 llvm/lib/Target/RISCV/RISCVFeatures.td   |  3 +++
 llvm/lib/Target/RISCV/RISCVProcessors.td | 19 ++++++++++++++-----
 llvm/lib/Target/RISCV/RISCVSubtarget.h   |  1 +
 llvm/test/CodeGen/RISCV/features-info.ll |  1 +
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 83eefc0858d4..940caa4f4044 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1742,6 +1742,9 @@ def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
 def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
                                          "Ventana Veyron-Series processors">;
 
+def TuneAndes45 : SubtargetFeature<"andes45", "RISCVProcFamily", "Andes45",
+                                   "Andes 45-Series processors">;
+
 def TuneVXRMPipelineFlush : SubtargetFeature<"vxrm-pipeline-flush", "HasVXRMPipelineFlush",
                                              "true", "VXRM writes causes pipeline flush">;
 
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index de6f0ecfce73..32f4ab607a34 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -722,8 +722,13 @@ def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
                                       FeatureStdExtZbc,
                                       FeatureVendorXAndesPerf]>;
 
+defvar Andes45TuneFeatures = [TuneAndes45,
+                              TuneNoDefaultUnroll,
+                              TuneShortForwardBranchOpt,
+                              TunePostRAScheduler];
+
 def ANDES_45 : RISCVTuneProcessorModel<"andes-45-series",
-                                       Andes45Model>;
+                                       Andes45Model, Andes45TuneFeatures>;
 
 def ANDES_N45 : RISCVProcessorModel<"andes-n45",
                                     Andes45Model,
@@ -737,7 +742,8 @@ def ANDES_N45 : RISCVProcessorModel<"andes-n45",
                                      FeatureStdExtD,
                                      FeatureStdExtC,
                                      FeatureStdExtB,
-                                     FeatureVendorXAndesPerf]>;
+                                     FeatureVendorXAndesPerf],
+                                    Andes45TuneFeatures>;
 
 def ANDES_NX45 : RISCVProcessorModel<"andes-nx45",
                                      Andes45Model,
@@ -751,7 +757,8 @@ def ANDES_NX45 : RISCVProcessorModel<"andes-nx45",
                                       FeatureStdExtD,
                                       FeatureStdExtC,
                                       FeatureStdExtB,
-                                      FeatureVendorXAndesPerf]>;
+                                      FeatureVendorXAndesPerf],
+                                     Andes45TuneFeatures>;
 
 def ANDES_A45 : RISCVProcessorModel<"andes-a45",
                                     Andes45Model,
@@ -765,7 +772,8 @@ def ANDES_A45 : RISCVProcessorModel<"andes-a45",
                                      FeatureStdExtD,
                                      FeatureStdExtC,
                                      FeatureStdExtB,
-                                     FeatureVendorXAndesPerf]>;
+                                     FeatureVendorXAndesPerf],
+                                    Andes45TuneFeatures>;
 
 def ANDES_AX45 : RISCVProcessorModel<"andes-ax45",
                                      Andes45Model,
@@ -779,4 +787,5 @@ def ANDES_AX45 : RISCVProcessorModel<"andes-ax45",
                                       FeatureStdExtD,
                                       FeatureStdExtC,
                                       FeatureStdExtB,
-                                      FeatureVendorXAndesPerf]>;
+                                      FeatureVendorXAndesPerf],
+                                     Andes45TuneFeatures>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 0eef7b1feaf5..04c7ca7d0572 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -83,6 +83,7 @@ public:
     SiFive7,
     VentanaVeyron,
     MIPSP8700,
+    Andes45,
   };
   enum RISCVVRGatherCostModelEnum : uint8_t {
     Quadratic,
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index b7b27cd579fb..fab2e9495930 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -6,6 +6,7 @@
 ; CHECK-NEXT:   32bit                            - Implements RV32.
 ; CHECK-NEXT:   64bit                            - Implements RV64.
 ; CHECK-NEXT:   a                                - 'A' (Atomic Instructions).
+; CHECK-NEXT:   andes45                          - Andes 45-Series processors.
 ; CHECK-NEXT:   auipc-addi-fusion                - Enable AUIPC+ADDI macrofusion.
 ; CHECK-NEXT:   b                                - 'B' (the collection of the Zba, Zbb, Zbs extensions).
 ; CHECK-NEXT:   c                                - 'C' (Compressed Instructions).

From 4903c11a7e144d63635b115d97936a7aecf7a2f6 Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp@bytedance.com>
Date: Fri, 13 Jun 2025 14:31:48 +0800
Subject: [PATCH 0237/1322] [RISCV] Support memcmp expansion for vectors

This patch adds the support of generating vector instructions for
`memcmp`. This implementation is inspired by X86's.

We convert integer comparisons (eq/ne only) into vector comparisons
and do a vector reduction and to get the result.

The range of supported load sizes is (XLEN, VLEN * LMUL8] and
non-power-of-2 types are not supported.

Fixes #143294.

Reviewers: lukel97, asb, preames, topperc, dtcxzyw

Reviewed By: topperc, lukel97

Pull Request: https://github.com/llvm/llvm-project/pull/114517
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |   69 +-
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   17 +
 llvm/test/CodeGen/RISCV/memcmp-optsize.ll     | 1348 ++++++++++++--
 llvm/test/CodeGen/RISCV/memcmp.ll             | 1556 +++++++++++++----
 4 files changed, 2456 insertions(+), 534 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a157c94849f3..7839af5c1691 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16147,17 +16147,80 @@ static bool narrowIndex(SDValue &N, ISD::MemIndexType IndexType, SelectionDAG &D
   return true;
 }
 
+/// Try to map an integer comparison with size > XLEN to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue
+combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
+                                const SDLoc &DL, SelectionDAG &DAG,
+                                const RISCVSubtarget &Subtarget) {
+  assert(ISD::isIntEqualitySetCC(CC) && "Bad comparison predicate");
+
+  if (!Subtarget.hasVInstructions())
+    return SDValue();
+
+  MVT XLenVT = Subtarget.getXLenVT();
+  EVT OpVT = X.getValueType();
+  // We're looking for an oversized integer equality comparison.
+  if (!OpVT.isScalarInteger())
+    return SDValue();
+
+  unsigned OpSize = OpVT.getSizeInBits();
+  // TODO: Support non-power-of-2 types.
+  if (!isPowerOf2_32(OpSize))
+    return SDValue();
+
+  // The size should be larger than XLen and smaller than the maximum vector
+  // size.
+  if (OpSize <= Subtarget.getXLen() ||
+      OpSize > Subtarget.getRealMinVLen() *
+                   Subtarget.getMaxLMULForFixedLengthVectors())
+    return SDValue();
+
+  // Don't perform this combine if constructing the vector will be expensive.
+  auto IsVectorBitCastCheap = [](SDValue X) {
+    X = peekThroughBitcasts(X);
+    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+           X.getOpcode() == ISD::LOAD;
+  };
+  if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y))
+    return SDValue();
+
+  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat))
+    return SDValue();
+
+  unsigned VecSize = OpSize / 8;
+  EVT VecVT = MVT::getVectorVT(MVT::i8, VecSize);
+  EVT CmpVT = MVT::getVectorVT(MVT::i1, VecSize);
+
+  SDValue VecX = DAG.getBitcast(VecVT, X);
+  SDValue VecY = DAG.getBitcast(VecVT, Y);
+  SDValue Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
+  return DAG.getSetCC(DL, VT, DAG.getNode(ISD::VECREDUCE_OR, DL, XLenVT, Cmp),
+                      DAG.getConstant(0, DL, XLenVT), CC);
+}
+
 // Replace (seteq (i64 (and X, 0xffffffff)), C1) with
 // (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
 // bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
 // can become a sext.w instead of a shift pair.
 static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
+  SDLoc dl(N);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  // Looking for an equality compare.
+  if (!isIntEqualitySetCC(Cond))
+    return SDValue();
+
+  if (SDValue V =
+          combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG, Subtarget))
+    return V;
+
   if (OpVT != MVT::i64 || !Subtarget.is64Bit())
     return SDValue();
 
@@ -16172,11 +16235,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
       N0.getConstantOperandVal(1) != UINT64_C(0xffffffff))
     return SDValue();
 
-  // Looking for an equality compare.
-  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
-  if (!isIntEqualitySetCC(Cond))
-    return SDValue();
-
   // Don't do this if the sign bit is provably zero, it will be turned back into
   // an AND.
   APInt SignMask = APInt::getOneBitSet(64, 31);
@@ -16185,7 +16243,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
 
   const APInt &C1 = N1C->getAPIntValue();
 
-  SDLoc dl(N);
   // If the constant is larger than 2^32 - 1 it is impossible for both sides
   // to be equal.
   if (C1.getActiveBits() > 32)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index d5ea0c5d5229..bee47527cf42 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2952,5 +2952,22 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
     Options.LoadSizes = {4, 2, 1};
     Options.AllowedTailExpansions = {3};
   }
+
+  if (IsZeroCmp && ST->hasVInstructions()) {
+    unsigned RealMinVLen = ST->getRealMinVLen();
+    // Support Fractional LMULs if the lengths are larger than XLen.
+    // TODO: Support non-power-of-2 types.
+    for (unsigned FLMUL = 8; FLMUL >= 2; FLMUL /= 2) {
+      unsigned Len = RealMinVLen / FLMUL;
+      if (Len > ST->getXLen())
+        Options.LoadSizes.insert(Options.LoadSizes.begin(), Len / 8);
+    }
+    for (unsigned LMUL = 1; LMUL <= ST->getMaxLMULForFixedLengthVectors();
+         LMUL *= 2) {
+      unsigned Len = RealMinVLen * LMUL;
+      if (Len > ST->getXLen())
+        Options.LoadSizes.insert(Options.LoadSizes.begin(), Len / 8);
+    }
+  }
   return Options;
 }
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index 38cd51c07459..3742383675b9 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -870,13 +870,11 @@ define i32 @bcmp_size_8(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_8:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
@@ -1073,18 +1071,18 @@ define i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 7(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 11(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 7(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 11(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a3, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
@@ -1284,33 +1282,21 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1319,15 +1305,15 @@ entry:
 }
 
 define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_31:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 31
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -1339,6 +1325,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
@@ -1349,6 +1345,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
@@ -1359,6 +1365,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
@@ -1369,6 +1385,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_31:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 31
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
@@ -1389,6 +1415,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_31:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 31
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
@@ -1409,6 +1445,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_31:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 31
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
@@ -1429,22 +1475,58 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_31:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 15(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 19(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 23(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 15(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 19(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 23(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 27(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, t3, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, t2, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t4, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a2, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a5, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a3, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a1, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 15(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 23(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 15(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 23(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a3, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
@@ -1454,15 +1536,15 @@ entry:
 }
 
 define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_32:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 32
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_32:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 32
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_32:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -1474,6 +1556,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_32:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 32
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_32:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
@@ -1484,6 +1576,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_32:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 32
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_32:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
@@ -1494,6 +1596,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_32:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 32
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_32:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
@@ -1504,6 +1616,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_32:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
@@ -1524,6 +1646,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_32:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
@@ -1544,6 +1676,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_32:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
@@ -1564,23 +1706,25 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_32:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v12
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1589,100 +1733,1020 @@ entry:
 }
 
 define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_63:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 63
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_63:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 63
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -48
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 28(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 28(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 47(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 51(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 55(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 59(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 31(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 35(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 39(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 43(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, t1, s2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 31(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 35(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 39(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 43(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, s1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 47(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 51(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 55(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 59(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, s6, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, s8, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, s4, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, s9, s2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s5, s1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, s7, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, s3, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, a6, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t0, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, a7, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a7, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t0, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a6, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 48
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 24(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 31(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 39(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 47(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 31(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 39(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 47(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 55(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, t3, t1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, t2, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t4, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a2, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a5, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a3, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a1, a4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 63)
   ret i32 %bcmp
 }
 
 define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_64:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 64
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_64:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 64
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v16
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 64)
   ret i32 %bcmp
 }
 
 define i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_127:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 127
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_127:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 127
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -96
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 84(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 80(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 76(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 72(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 68(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 64(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 60(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 56(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 52(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 48(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s11, 44(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 32(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 36(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 40(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 44(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 48(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 52(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 56(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 60(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 28(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 60(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 28(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 36(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 40(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 44(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 40(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a3, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 36(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a4, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 32(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 56(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 48(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a6, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 28(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a7, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 24(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t0, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 20(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t1, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 16(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 107(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t5, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 75(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, t6, s8
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 123(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s2, s2, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s0, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, t4, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 83(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 87(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 91(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s1, s1, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 107(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, t3, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 91(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, t2, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 123(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, a5, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 75(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s5, s11, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s7, s8, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 87(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 83(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s3, s10, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 115(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s6, ra, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 115(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s4, s4, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 119(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 119(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s10, s10, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 71(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 67(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 67(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 71(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 99(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 99(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, a5, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 103(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 103(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, a4, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, s9, s8
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, s11, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, ra, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a5, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 95(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 63(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 111(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 79(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 79(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 111(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 63(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 95(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s9, s9, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, s8, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t2, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t3, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t4, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, s1, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t5, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, s0, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, s2, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, t6, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t2, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t1, t2, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t2, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t3, t3, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 28(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t4, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 32(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t5, t5, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 36(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, t6, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or s0, s0, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, s0, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t5, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t3, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t1, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a6, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t4, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t0, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 84(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 80(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 76(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 72(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 68(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 64(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 60(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 56(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 48(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 44(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 96
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -96
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s0, 88(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s1, 80(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s2, 72(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s3, 64(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s4, 56(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s5, 48(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s6, 40(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s7, 32(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s8, 24(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s9, 16(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s10, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 32(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 40(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 56(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 32(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 40(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 56(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t6, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 8(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 24(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 95(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 103(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 111(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 119(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 63(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 71(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 79(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 87(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t1, t1, s2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 63(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 71(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 79(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 87(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, s1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 95(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 103(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 111(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 119(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t6
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, t2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, s10
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, s6, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t2, s8, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t4, s4, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t6, s9, s2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor s0, s5, s1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t5, s7, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t3, s3, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a6, a6, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or t0, t0, t6
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, a3, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a7, a7, t2
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a7, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, t0, a4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a6, a2
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 88(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 72(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 56(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 48(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 40(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 32(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 24(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 16(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 96
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 127)
   ret i32 %bcmp
 }
 
 define i32 @bcmp_size_128(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_128:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 128
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_128:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 128
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v24
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v24
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 128)
   ret i32 %bcmp
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index df9d781a4536..f9a6dbba04fc 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -870,13 +870,11 @@ define i32 @bcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_8:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
@@ -1073,18 +1071,18 @@ define i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 7(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 11(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 7(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 11(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a3, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
@@ -1284,33 +1282,21 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1577,29 +1563,29 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 15(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 19(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 23(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 15(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 19(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 23(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t2, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t3, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, t3, t1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a6, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t0, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, t2, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t4, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a2, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a5, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a3, a6
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a1, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
@@ -1607,18 +1593,18 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 15(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 23(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 15(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 23(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a3, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
@@ -1878,57 +1864,23 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 24(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t2, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t3, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a6, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t0, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1937,15 +1889,15 @@ entry:
 }
 
 define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
-; CHECK-RV32-LABEL: bcmp_size_63:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 63
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_63:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -1957,6 +1909,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_63:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
@@ -1967,6 +1929,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_63:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
@@ -1977,6 +1949,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_63:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
@@ -1987,6 +1969,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
@@ -2023,6 +2015,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
@@ -2059,6 +2061,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
@@ -2095,6 +2107,98 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -48
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 28(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 28(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 47(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 51(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 55(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 59(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 31(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 35(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 39(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 43(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, t1, s2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 31(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 35(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 39(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 43(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, s1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 47(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 51(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 55(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 59(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, s6, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, s8, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, s4, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, s9, s2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s5, s1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, s7, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, s3, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, a6, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t0, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, a7, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a7, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t0, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a6, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 48
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
@@ -2105,29 +2209,29 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 31(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 39(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 47(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 31(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 39(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 47(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t2, t0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t3, t1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, t3, t1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a6, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t0, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, t2, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t4, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a2, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a5, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a3, a6
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a1, a4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -2136,15 +2240,15 @@ entry:
 }
 
 define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
-; CHECK-RV32-LABEL: bcmp_size_64:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 64
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_64:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -2156,6 +2260,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_64:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
@@ -2166,6 +2280,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_64:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
@@ -2176,6 +2300,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_64:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
@@ -2186,6 +2320,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_64:
 ; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
@@ -2222,6 +2366,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_64:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
@@ -2258,6 +2412,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_64:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
@@ -2294,39 +2458,25 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_64:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 48(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t2, t0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t3, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a6, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t0, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v16
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -2335,50 +2485,580 @@ entry:
 }
 
 define i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind {
-; CHECK-RV32-LABEL: bcmp_size_127:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 127
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_127:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 127
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -96
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 84(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 80(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 76(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 72(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 68(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 64(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 60(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 56(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 52(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 48(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s11, 44(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 32(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 36(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 40(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 44(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 48(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 52(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 56(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 60(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 28(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 60(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 28(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 36(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 40(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 44(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 40(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a3, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 36(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a4, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 32(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 56(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 48(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a6, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 28(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a7, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 24(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t0, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 20(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t1, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 16(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 107(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t5, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 75(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, t6, s8
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 123(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s2, s2, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s0, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, t4, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 83(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 87(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 91(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s1, s1, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 107(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, t3, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 91(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, t2, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 123(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, a5, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 75(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s5, s11, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s7, s8, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 87(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 83(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s3, s10, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 115(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s6, ra, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 115(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s4, s4, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 119(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 119(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s10, s10, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 71(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 67(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 67(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 71(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 99(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 99(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, a5, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 103(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 103(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, a4, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, s9, s8
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, s11, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, ra, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a5, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 95(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 63(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 111(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 79(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 79(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 111(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 63(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 95(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s9, s9, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, s8, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t2, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t3, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t4, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, s1, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t5, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, s0, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, s2, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, t6, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t2, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t1, t2, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t2, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t3, t3, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 28(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t4, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 32(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t5, t5, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 36(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, t6, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or s0, s0, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, s0, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t5, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t3, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t1, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a6, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t4, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t0, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 84(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 80(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 76(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 72(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 68(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 64(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 60(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 56(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 48(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 44(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 96
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -96
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s0, 88(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s1, 80(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s2, 72(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s3, 64(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s4, 56(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s5, 48(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s6, 40(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s7, 32(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s8, 24(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s9, 16(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s10, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 32(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 40(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 56(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 32(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 40(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 56(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t6, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 8(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 24(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 95(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 103(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 111(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 119(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 63(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 71(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 79(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 87(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t1, t1, s2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 63(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 71(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 79(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 87(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, s1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 95(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 103(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 111(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 119(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t6
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, t2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, s10
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, s6, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t2, s8, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t4, s4, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t6, s9, s2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor s0, s5, s1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t5, s7, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t3, s3, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a6, a6, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or t0, t0, t6
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, a3, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a7, a7, t2
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a7, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, t0, a4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a6, a2
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 88(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 72(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 56(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 48(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 40(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 32(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 24(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 16(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 96
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 127)
   ret i32 %bcmp
 }
 
 define i32 @bcmp_size_128(ptr %s1, ptr %s2) nounwind {
-; CHECK-RV32-LABEL: bcmp_size_128:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 128
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_128:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 128
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v24
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v24
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 128)
   ret i32 %bcmp
@@ -2412,7 +3092,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV32-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2423,7 +3103,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV64-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -2434,7 +3114,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2445,7 +3125,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -2456,7 +3136,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2467,7 +3147,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -2478,7 +3158,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2489,22 +3169,130 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: bcmp_eq_zero:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a0, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lw a1, 0(a1)
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    seqz a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV64-V-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
-  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4)
+  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 16)
   %ret = icmp eq i32 %bcmp, 0
   ret i1 %ret
 }
@@ -5980,213 +6768,209 @@ entry:
 define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV32-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV64-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a2, 0(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 1(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a1, 3(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 2(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 3(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 0(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a1, a4, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a2, a3
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a6, a7
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a0, a5
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a0, a3
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a2, 0(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 1(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a1, 3(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 0(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 1(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a2, a3
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a5, a5, a6
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: memcmp_eq_zero:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a0, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lw a1, 0(a1)
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    seqz a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV64-V-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
-  %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4)
+  %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 16)
   %ret = icmp eq i32 %memcmp, 0
   ret i1 %ret
 }

From 43be31e35ab0985ec381041762586902c2718751 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 00:12:07 -0700
Subject: [PATCH 0238/1322] SPARC: Simplify SparcMCExpr

Reduce direct uses of SparcMCExpr, facilitating transition to
MCSpecifierExpr in the future.
---
 .../Target/Sparc/AsmParser/SparcAsmParser.cpp | 14 ++++++------
 .../Sparc/MCTargetDesc/SparcMCAsmInfo.cpp     |  6 ++---
 .../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp |  6 ++---
 .../Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 22 +++++++++----------
 .../Target/Sparc/MCTargetDesc/SparcMCExpr.h   | 22 +++++++++----------
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp     |  8 +++----
 .../Target/Sparc/SparcTargetObjectFile.cpp    |  3 +--
 7 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 3deeac38e560..187ecbaad4bb 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -848,14 +848,14 @@ bool SparcAsmParser::expandSETX(MCInst &Inst, SMLoc IDLoc,
   // sethi %hh(val), tmp
   Instructions.push_back(MCInstBuilder(SP::SETHIi)
                              .addReg(MCTmpOp.getReg())
-                             .addExpr(SparcMCExpr::create(
-                                 ELF::R_SPARC_HH22, ValExpr, getContext())));
+                             .addExpr(Sparc::createSpecifierExpr(
+                                 getContext(), ValExpr, ELF::R_SPARC_HH22)));
   // or    tmp, %hm(val), tmp
   Instructions.push_back(MCInstBuilder(SP::ORri)
                              .addReg(MCTmpOp.getReg())
                              .addReg(MCTmpOp.getReg())
-                             .addExpr(SparcMCExpr::create(
-                                 ELF::R_SPARC_HM10, ValExpr, getContext())));
+                             .addExpr(Sparc::createSpecifierExpr(
+                                 getContext(), ValExpr, ELF::R_SPARC_HM10)));
   // sllx  tmp, 32, tmp
   Instructions.push_back(MCInstBuilder(SP::SLLXri)
                              .addReg(MCTmpOp.getReg())
@@ -1165,7 +1165,7 @@ ParseStatus SparcAsmParser::parseTailRelocSym(OperandVector &Operands) {
     return Error(getLoc(), "expected valid identifier for operand modifier");
 
   StringRef Name = getParser().getTok().getIdentifier();
-  uint16_t RelType = SparcMCExpr::parseSpecifier(Name);
+  uint16_t RelType = Sparc::parseSpecifier(Name);
   if (RelType == 0)
     return Error(getLoc(), "invalid relocation specifier");
 
@@ -1689,7 +1689,7 @@ const SparcMCExpr *SparcAsmParser::adjustPICRelocation(uint16_t RelType,
     }
   }
 
-  return SparcMCExpr::create(RelType, subExpr, getContext());
+  return Sparc::createSpecifierExpr(getContext(), subExpr, RelType);
 }
 
 bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
@@ -1700,7 +1700,7 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
 
   StringRef name = Tok.getString();
 
-  auto VK = SparcMCExpr::parseSpecifier(name);
+  auto VK = Sparc::parseSpecifier(name);
   switch (VK) {
   case 0:
     Error(getLoc(), "invalid relocation specifier");
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 8de9a789a63b..3049072b001c 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -50,8 +50,7 @@ SparcELFMCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
                                                MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return SparcMCExpr::create(ELF::R_SPARC_DISP32,
-                               MCSymbolRefExpr::create(Sym, Ctx), Ctx);
+    return Sparc::createSpecifierExpr(Ctx, Sym, ELF::R_SPARC_DISP32);
   }
 
   return MCAsmInfo::getExprForPersonalitySymbol(Sym, Encoding, Streamer);
@@ -63,8 +62,7 @@ SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
                                        MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return SparcMCExpr::create(ELF::R_SPARC_DISP32,
-                               MCSymbolRefExpr::create(Sym, Ctx), Ctx);
+    return Sparc::createSpecifierExpr(Ctx, Sym, ELF::R_SPARC_DISP32);
   }
   return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
 }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 2c8dbaa5aba6..4ce9bea5d795 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -135,7 +135,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
   assert(MO.isExpr());
   const MCExpr *Expr = MO.getExpr();
   if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
-    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getFixupKind()));
+    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
 
@@ -165,7 +165,7 @@ unsigned SparcMCCodeEmitter::getSImm5OpValue(const MCInst &MI, unsigned OpNo,
     return CE->getValue();
 
   if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
-    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getFixupKind()));
+    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
   Fixups.push_back(MCFixup::create(0, Expr, ELF::R_SPARC_5));
@@ -191,7 +191,7 @@ SparcMCCodeEmitter::getSImm13OpValue(const MCInst &MI, unsigned OpNo,
     return CE->getValue();
 
   if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
-    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getFixupKind()));
+    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
   Fixups.push_back(MCFixup::create(0, Expr, Sparc::fixup_sparc_13));
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 09e55a66fcc6..2e03e4739986 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -22,13 +22,18 @@ using namespace llvm;
 
 #define DEBUG_TYPE "sparcmcexpr"
 
-const SparcMCExpr *SparcMCExpr::create(uint16_t S, const MCExpr *Expr,
-                                       MCContext &Ctx) {
-  return new (Ctx) SparcMCExpr(S, Expr);
+const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
+                                              const MCExpr *Expr, uint16_t S) {
+  return new (Ctx) SparcMCExpr(Expr, S);
+}
+
+const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
+                                              const MCSymbol *Sym, uint16_t S) {
+  return new (Ctx) SparcMCExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
 }
 
 void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  StringRef S = getSpecifierName(specifier);
+  StringRef S = Sparc::getSpecifierName(specifier);
   if (!S.empty())
     OS << '%' << S << '(';
   getSubExpr()->print(OS, MAI);
@@ -36,7 +41,7 @@ void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
     OS << ')';
 }
 
-StringRef SparcMCExpr::getSpecifierName(uint16_t S) {
+StringRef Sparc::getSpecifierName(uint16_t S) {
   // clang-format off
   switch (uint16_t(S)) {
   case 0:                          return {};
@@ -83,7 +88,7 @@ StringRef SparcMCExpr::getSpecifierName(uint16_t S) {
   llvm_unreachable("Unhandled SparcMCExpr::Specifier");
 }
 
-uint16_t SparcMCExpr::parseSpecifier(StringRef name) {
+uint16_t Sparc::parseSpecifier(StringRef name) {
   return StringSwitch<uint16_t>(name)
       .Case("lo", ELF::R_SPARC_LO10)
       .Case("hi", ELF::R_SPARC_HI22)
@@ -128,8 +133,3 @@ uint16_t SparcMCExpr::parseSpecifier(StringRef name) {
       .Case("gdop", ELF::R_SPARC_GOTDATA_OP)
       .Default(0);
 }
-
-uint16_t SparcMCExpr::getFixupKind() const {
-  assert(uint16_t(specifier) < FirstTargetFixupKind);
-  return specifier;
-}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 8368e8ff8795..612b439bfc74 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -21,21 +21,21 @@ namespace llvm {
 
 class StringRef;
 class SparcMCExpr : public MCSpecifierExpr {
-private:
-  explicit SparcMCExpr(uint16_t S, const MCExpr *Expr)
-      : MCSpecifierExpr(Expr, S) {}
-
 public:
-  static const SparcMCExpr *create(uint16_t S, const MCExpr *Expr,
-                                   MCContext &Ctx);
-  uint16_t getFixupKind() const;
-
+  explicit SparcMCExpr(const MCExpr *Expr, uint16_t S)
+      : MCSpecifierExpr(Expr, S) {}
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-
-  static uint16_t parseSpecifier(StringRef name);
-  static StringRef getSpecifierName(uint16_t S);
 };
 
+namespace Sparc {
+const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCExpr *Expr,
+                                       uint16_t S);
+const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCSymbol *Sym,
+                                       uint16_t S);
+uint16_t parseSpecifier(StringRef name);
+StringRef getSpecifierName(uint16_t S);
+} // namespace Sparc
+
 } // end namespace llvm.
 
 #endif
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index a30cf5a661bb..ffefdf97edab 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -82,7 +82,7 @@ public:
 static MCOperand createSparcMCOperand(uint16_t Kind, MCSymbol *Sym,
                                       MCContext &OutContext) {
   const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext);
-  const SparcMCExpr *expr = SparcMCExpr::create(Kind, MCSym, OutContext);
+  const SparcMCExpr *expr = Sparc::createSpecifierExpr(OutContext, MCSym, Kind);
   return MCOperand::createExpr(expr);
 }
 static MCOperand createPCXCallOP(MCSymbol *Label,
@@ -101,7 +101,7 @@ static MCOperand createPCXRelExprOp(uint16_t Spec, MCSymbol *GOTLabel,
 
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext);
   const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext);
-  const SparcMCExpr *expr = SparcMCExpr::create(Spec, Add, OutContext);
+  const SparcMCExpr *expr = Sparc::createSpecifierExpr(OutContext, Add, Spec);
   return MCOperand::createExpr(expr);
 }
 
@@ -302,7 +302,7 @@ MCOperand SparcAsmPrinter::lowerOperand(const MachineOperand &MO) const {
 
     const MCExpr *expr = MCSymbolRefExpr::create(Symbol, OutContext);
     if (RelType)
-      expr = SparcMCExpr::create(RelType, expr, OutContext);
+      expr = Sparc::createSpecifierExpr(OutContext, expr, RelType);
     return MCOperand::createExpr(expr);
   }
 
@@ -374,7 +374,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
   const MachineOperand &MO = MI->getOperand (opNum);
   auto TF = MO.getTargetFlags();
 
-  StringRef Spec = SparcMCExpr::getSpecifierName(TF);
+  StringRef Spec = Sparc::getSpecifierName(TF);
   O << Spec;
   switch (MO.getType()) {
   case MachineOperand::MO_Register:
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index 668e6eab4e1b..be11ea272ed1 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -39,8 +39,7 @@ const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
     }
 
     MCContext &Ctx = getContext();
-    return SparcMCExpr::create(ELF::R_SPARC_DISP32,
-                               MCSymbolRefExpr::create(SSym, Ctx), Ctx);
+    return Sparc::createSpecifierExpr(Ctx, SSym, ELF::R_SPARC_DISP32);
   }
 
   return TargetLoweringObjectFileELF::getTTypeGlobalReference(GV, Encoding, TM,

From 1fae5918b3d6fbed8ce6d8a2edf31bdf304ca8db Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Fri, 13 Jun 2025 00:45:52 -0700
Subject: [PATCH 0239/1322] [clang-format] Fix an off-by-1 bug with -length
 option (#143302)

Also validate the argument value.

Fixes #56245
---
 clang/test/Format/multiple-inputs-error.cpp |  2 +-
 clang/test/Format/ranges.cpp                | 11 ++++++++++-
 clang/tools/clang-format/ClangFormat.cpp    | 14 +++++++++-----
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/clang/test/Format/multiple-inputs-error.cpp b/clang/test/Format/multiple-inputs-error.cpp
index 1aa9c9f3e2fa..7cb835d39f23 100644
--- a/clang/test/Format/multiple-inputs-error.cpp
+++ b/clang/test/Format/multiple-inputs-error.cpp
@@ -1,6 +1,6 @@
 // RUN: cp %s %t-1.cpp
 // RUN: cp %s %t-2.cpp
-// RUN: not clang-format 2>&1 >/dev/null -offset=1 -length=0 %t-1.cpp %t-2.cpp |FileCheck %s
+// RUN: not clang-format 2>&1 >/dev/null -offset=1 -length=1 %t-1.cpp %t-2.cpp |FileCheck %s
 // RUN: not clang-format 2>&1 >/dev/null -lines=1:1 %t-1.cpp %t-2.cpp |FileCheck %s -check-prefix=CHECK-LINE
 // CHECK: error: -offset, -length and -lines can only be used for single file.
 // CHECK-LINE: error: -offset, -length and -lines can only be used for single file.
diff --git a/clang/test/Format/ranges.cpp b/clang/test/Format/ranges.cpp
index 66b984e037b3..f42492e43f84 100644
--- a/clang/test/Format/ranges.cpp
+++ b/clang/test/Format/ranges.cpp
@@ -1,5 +1,5 @@
 // RUN: grep -Ev "// *[A-Z-]+:" %s \
-// RUN:   | clang-format -style=LLVM -offset=2 -length=0 -offset=28 -length=0 \
+// RUN:   | clang-format -style=LLVM -offset=2 -length=1 -offset=28 -length=1 -offset=35 -length=8 \
 // RUN:   | FileCheck -strict-whitespace %s
 // CHECK: {{^int\ \*i;$}}
 int*i;
@@ -9,3 +9,12 @@ int  *  i;
 
 // CHECK: {{^int\ \*i;$}}
 int   *   i;
+
+// CHECK: int I;
+// CHECK-NEXT: int J ;
+int I ;
+int J ;
+
+// RUN: not clang-format -length=0 %s 2>&1 \
+// RUN:   | FileCheck -strict-whitespace -check-prefix=CHECK0 %s
+// CHECK0: error: length should be at least 1
diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index 24ad3cb42254..c0efbb7588cc 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -284,7 +284,7 @@ static bool fillRanges(MemoryBuffer *Code,
   if (Offsets.size() == 1 && EmptyLengths) {
     Length = Sources.getFileOffset(Sources.getLocForEndOfFile(ID)) - Offsets[0];
   } else if (Offsets.size() != Lengths.size()) {
-    errs() << "error: number of -offset and -length arguments must match.\n";
+    errs() << "error: number of -offset and -length arguments must match\n";
     return true;
   }
   for (unsigned I = 0, E = Offsets.size(), CodeSize = Code->getBufferSize();
@@ -296,12 +296,16 @@ static bool fillRanges(MemoryBuffer *Code,
     }
     if (!EmptyLengths)
       Length = Lengths[I];
-    if (Offset + Length > CodeSize) {
-      errs() << "error: invalid length " << Length << ", offset + length ("
-             << Offset + Length << ") is outside the file.\n";
+    if (Length == 0) {
+      errs() << "error: length should be at least 1\n";
       return true;
     }
-    Ranges.push_back(tooling::Range(Offset, Length));
+    if (Offset + Length > CodeSize) {
+      errs() << "error: invalid length " << Length << ", offset + length ("
+             << Offset + Length << ") is outside the file\n";
+      return true;
+    }
+    Ranges.push_back(tooling::Range(Offset, Length - 1));
   }
   return false;
 }

From 1f4b1729851bcada646be75c2bc90e0d012525dd Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 13 Jun 2025 16:46:20 +0900
Subject: [PATCH 0240/1322] GVN: Fix trying to inspect uselist of constants
 when emitting remark (#144009)

---
 llvm/lib/Transforms/Scalar/GVN.cpp            | 75 +++++++++++--------
 ...opt-remark-assert-constant-uselistorder.ll | 26 +++++++
 2 files changed, 69 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/Transforms/GVN/opt-remark-assert-constant-uselistorder.ll

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index a0eed31fde79..c8a0479358ea 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1165,7 +1165,7 @@ static bool isLifetimeStart(const Instruction *Inst) {
 /// Assuming To can be reached from both From and Between, does Between lie on
 /// every path from From to To?
 static bool liesBetween(const Instruction *From, Instruction *Between,
-                        const Instruction *To, DominatorTree *DT) {
+                        const Instruction *To, const DominatorTree *DT) {
   if (From->getParent() == Between->getParent())
     return DT->dominates(From, Between);
   SmallSet<BasicBlock *, 1> Exclusion;
@@ -1173,20 +1173,15 @@ static bool liesBetween(const Instruction *From, Instruction *Between,
   return !isPotentiallyReachable(From, To, &Exclusion, DT);
 }
 
-/// Try to locate the three instruction involved in a missed
-/// load-elimination case that is due to an intervening store.
-static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo,
-                                   DominatorTree *DT,
-                                   OptimizationRemarkEmitter *ORE) {
-  using namespace ore;
+static const Instruction *findMayClobberedPtrAccess(LoadInst *Load,
+                                                    const DominatorTree *DT) {
+  Value *PtrOp = Load->getPointerOperand();
+  if (!PtrOp->hasUseList())
+    return nullptr;
 
   Instruction *OtherAccess = nullptr;
 
-  OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", Load);
-  R << "load of type " << NV("Type", Load->getType()) << " not eliminated"
-    << setExtraArgs();
-
-  for (auto *U : Load->getPointerOperand()->users()) {
+  for (auto *U : PtrOp->users()) {
     if (U != Load && (isa<LoadInst>(U) || isa<StoreInst>(U))) {
       auto *I = cast<Instruction>(U);
       if (I->getFunction() == Load->getFunction() && DT->dominates(I, Load)) {
@@ -1202,32 +1197,48 @@ static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo,
     }
   }
 
-  if (!OtherAccess) {
-    // There is no dominating use, check if we can find a closest non-dominating
-    // use that lies between any other potentially available use and Load.
-    for (auto *U : Load->getPointerOperand()->users()) {
-      if (U != Load && (isa<LoadInst>(U) || isa<StoreInst>(U))) {
-        auto *I = cast<Instruction>(U);
-        if (I->getFunction() == Load->getFunction() &&
-            isPotentiallyReachable(I, Load, nullptr, DT)) {
-          if (OtherAccess) {
-            if (liesBetween(OtherAccess, I, Load, DT)) {
-              OtherAccess = I;
-            } else if (!liesBetween(I, OtherAccess, Load, DT)) {
-              // These uses are both partially available at Load were it not for
-              // the clobber, but neither lies strictly after the other.
-              OtherAccess = nullptr;
-              break;
-            } // else: keep current OtherAccess since it lies between U and
-              // Load.
-          } else {
+  if (OtherAccess)
+    return OtherAccess;
+
+  // There is no dominating use, check if we can find a closest non-dominating
+  // use that lies between any other potentially available use and Load.
+  for (auto *U : PtrOp->users()) {
+    if (U != Load && (isa<LoadInst>(U) || isa<StoreInst>(U))) {
+      auto *I = cast<Instruction>(U);
+      if (I->getFunction() == Load->getFunction() &&
+          isPotentiallyReachable(I, Load, nullptr, DT)) {
+        if (OtherAccess) {
+          if (liesBetween(OtherAccess, I, Load, DT)) {
             OtherAccess = I;
-          }
+          } else if (!liesBetween(I, OtherAccess, Load, DT)) {
+            // These uses are both partially available at Load were it not for
+            // the clobber, but neither lies strictly after the other.
+            OtherAccess = nullptr;
+            break;
+          } // else: keep current OtherAccess since it lies between U and
+          // Load.
+        } else {
+          OtherAccess = I;
         }
       }
     }
   }
 
+  return OtherAccess;
+}
+
+/// Try to locate the three instruction involved in a missed
+/// load-elimination case that is due to an intervening store.
+static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo,
+                                   const DominatorTree *DT,
+                                   OptimizationRemarkEmitter *ORE) {
+  using namespace ore;
+
+  OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", Load);
+  R << "load of type " << NV("Type", Load->getType()) << " not eliminated"
+    << setExtraArgs();
+
+  const Instruction *OtherAccess = findMayClobberedPtrAccess(Load, DT);
   if (OtherAccess)
     R << " in favor of " << NV("OtherAccess", OtherAccess);
 
diff --git a/llvm/test/Transforms/GVN/opt-remark-assert-constant-uselistorder.ll b/llvm/test/Transforms/GVN/opt-remark-assert-constant-uselistorder.ll
new file mode 100644
index 000000000000..e793728815a8
--- /dev/null
+++ b/llvm/test/Transforms/GVN/opt-remark-assert-constant-uselistorder.ll
@@ -0,0 +1,26 @@
+; RUN: opt -passes='gvn' -pass-remarks-output=%t.yaml %s
+; RUN: FileCheck %s < %t.yaml
+
+; Check that there's no assert from trying to the uses of the constant
+; null.
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            gvn
+; CHECK-NEXT: Name:            LoadClobbered
+; CHECK-NEXT: Function:        c
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          'load of type '
+; CHECK-NEXT:   - Type:            i64
+; CHECK-NEXT:   - String:          ' not eliminated'
+; CHECK-NEXT:   - String:          ' because it is clobbered by '
+; CHECK-NEXT:   - ClobberedBy:     store
+; CHECK-NEXT: ...
+define void @c(ptr addrspace(21) %a) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %load = load i64, ptr addrspace(21) null, align 1
+  store i64 %load, ptr addrspace(21) %a, align 1
+  br label %for.cond
+}

From 02f1f6967a847bba35fc207d61732f3466f39403 Mon Sep 17 00:00:00 2001
From: Longsheng Mou <longshengmou@gmail.com>
Date: Fri, 13 Jun 2025 15:49:54 +0800
Subject: [PATCH 0241/1322] [mlir][linalg] Add pure tensor check for
 `winogradConv2DHelper` (#142299)

This PR adds pure tensor semantics check for `winogradConv2DHelper` to
prevent a crash. Fixes #141566.
---
 .../Dialect/Linalg/Transforms/WinogradConv2D.cpp |  4 ++++
 .../Linalg/transform-winograd-conv2d.mlir        | 16 ++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
index c6ebd3a53d98..e4221d474841 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
@@ -904,6 +904,10 @@ static bool hasAllOneValues(DenseIntElementsAttr attr) {
 static FailureOr<Operation *>
 winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp,
                      int64_t m, int64_t r) {
+  if (!convOp.hasPureTensorSemantics())
+    return rewriter.notifyMatchFailure(
+        convOp, "expected pure tensor semantics for linalg.conv_2d_nhwc_fhwc");
+
   Value input = convOp.getInputs()[0];
   Value filter = convOp.getInputs()[1];
   Value output = convOp.getOutputs()[0];
diff --git a/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir b/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir
index c10e0ccebfd7..1de861e65300 100644
--- a/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir
+++ b/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir
@@ -61,6 +61,22 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+func.func @conv2d_unsupported_type(%arg0: memref<2x10x10x5xf32>, %arg1: memref<2x3x3x5xf32>, %arg2: memref<2x8x8x2xf32>) {
+  linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : memref<2x10x10x5xf32>, memref<2x3x3x5xf32>) outs(%arg2 : memref<2x8x8x2xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_fhwc"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @+1 {{apply Winograd Conv2D failed}}
+    %1 = transform.structured.winograd_conv2d %0 { m = 4, r = 3 } : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
+  }
+}
+
+// -----
+
 func.func @conv2d(%arg0: tensor<2x?x?x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<1xf32>, %arg3: tensor<2x?x?x2xf32>) -> tensor<2x?x?x2xf32> {
   %0 = linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<2x?x?x5xf32>, tensor<2x3x3x5xf32>) outs(%arg3 : tensor<2x?x?x2xf32>) -> tensor<2x?x?x2xf32>
   return %0 : tensor<2x?x?x2xf32>

From cd3d234868cad8b42e2a09a570e3e229d5ecfb08 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 08:52:48 +0100
Subject: [PATCH 0242/1322] [X86] X86FixupInstTuning - extend BLENDPD/S ->
 MOVSD/S handling to SSE variant (#143961)

---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp    |  10 +-
 llvm/test/CodeGen/X86/combine-and.ll          |   2 +-
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   |  26 +---
 llvm/test/CodeGen/X86/insertelement-zero.ll   |   4 +-
 llvm/test/CodeGen/X86/masked_expandload.ll    |   2 +-
 llvm/test/CodeGen/X86/masked_load.ll          |   4 +-
 .../CodeGen/X86/sse-insertelt-from-mem.ll     |  16 +-
 llvm/test/CodeGen/X86/sse-insertelt.ll        |  13 +-
 llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll  | 144 ++++++------------
 llvm/test/CodeGen/X86/sse41.ll                |   6 +-
 llvm/test/CodeGen/X86/vec_floor.ll            |  32 ++--
 .../test/CodeGen/X86/vector-shuffle-128-v2.ll |   2 +-
 .../X86/vector-shuffle-combining-ssse3.ll     |  13 +-
 llvm/test/CodeGen/X86/vector-zmov.ll          |  32 ++--
 llvm/test/CodeGen/X86/vselect.ll              |  26 +---
 15 files changed, 120 insertions(+), 212 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index fd13305d8a73..be0a8c23ea5c 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -234,10 +234,16 @@ bool X86FixupInstTuningPass::processInstruction(
   };
 
   switch (Opc) {
-  case X86::VBLENDPSrri:
-    return ProcessBLENDToMOV(X86::VMOVSSrr);
+  case X86::BLENDPDrri:
+    return ProcessBLENDToMOV(X86::MOVSDrr);
   case X86::VBLENDPDrri:
     return ProcessBLENDToMOV(X86::VMOVSDrr);
+
+  case X86::BLENDPSrri:
+    return ProcessBLENDToMOV(X86::MOVSSrr);
+  case X86::VBLENDPSrri:
+    return ProcessBLENDToMOV(X86::VMOVSSrr);
+
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
   case X86::VPERMILPDYri:
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index 173457ff4667..9ca4ebfec277 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -189,7 +189,7 @@ define <4 x i32> @test11(<4 x i32> %A) {
 ; SSE-LABEL: test11:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test11:
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 2b5f09113ca6..2f2a05fa6939 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -108,15 +108,10 @@ define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test5:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test5:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test5:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test5:
 ; AVX1:       # %bb.0:
@@ -283,15 +278,10 @@ define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test12:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test12:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test12:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test12:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index 31551360be48..6036eddb0ca8 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -214,7 +214,7 @@ define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
 ; SSE41-LABEL: insert_v8f32_z12345z7:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
 ; SSE41-NEXT:    retq
 ;
@@ -287,7 +287,7 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
 ; SSE41-LABEL: insert_v8i32_z12345z7:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
 ; SSE41-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index b7fe8e053fa1..e81a983c0701 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1097,7 +1097,7 @@ define <2 x float> @expandload_v2f32_v2i1(ptr %base, <2 x float> %src0, <2 x i32
 ; SSE42-NEXT:    retq
 ; SSE42-NEXT:  LBB4_1: ## %cond.load
 ; SSE42-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE42-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE42-NEXT:    addq $4, %rdi
 ; SSE42-NEXT:    testb $2, %al
 ; SSE42-NEXT:    je LBB4_4
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index e2e26da95b87..37ab4276fbcc 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -817,7 +817,7 @@ define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float>
 ; SSE42-NEXT:    retq
 ; SSE42-NEXT:  LBB7_1: ## %cond.load
 ; SSE42-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE42-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE42-NEXT:    testb $2, %al
 ; SSE42-NEXT:    je LBB7_4
 ; SSE42-NEXT:  LBB7_3: ## %cond.load1
@@ -1220,7 +1220,7 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
 ; SSE42-NEXT:    je LBB10_10
 ; SSE42-NEXT:  LBB10_9: ## %cond.load10
 ; SSE42-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE42-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE42-NEXT:    testb $32, %al
 ; SSE42-NEXT:    je LBB10_12
 ; SSE42-NEXT:  LBB10_11: ## %cond.load13
diff --git a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
index 5ae905583571..1c3cfd079e9e 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
@@ -7,17 +7,11 @@
 ; 0'th element insertion into an SSE register.
 
 define <4 x float> @insert_f32_firstelt(<4 x float> %x, ptr %s.addr) {
-; SSE2-LABEL: insert_f32_firstelt:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: insert_f32_firstelt:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: insert_f32_firstelt:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_f32_firstelt:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index 1e4fe81abc13..f174eaaca38c 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -7,15 +7,10 @@
 ; 0'th element insertion into an SSE register.
 
 define <4 x float> @insert_f32_firstelt(<4 x float> %x, float %s) {
-; SSE2-LABEL: insert_f32_firstelt:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: insert_f32_firstelt:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: insert_f32_firstelt:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_f32_firstelt:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 006c3006350c..12bfb8d4fc9c 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE2,X86-SSE2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE41,X86-SSE41
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE41
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX1
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE2,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE41,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX512
 
@@ -1150,17 +1150,11 @@ define <4 x float> @insert_test5_add_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: insert_test5_sub_ss:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    subps %xmm0, %xmm1
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    ret{{[l|q]}}
-;
-; SSE41-LABEL: insert_test5_sub_ss:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    subps %xmm0, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    ret{{[l|q]}}
+; SSE-LABEL: insert_test5_sub_ss:
+; SSE:       # %bb.0:
+; SSE-NEXT:    subps %xmm0, %xmm1
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX-LABEL: insert_test5_sub_ss:
 ; AVX:       # %bb.0:
@@ -1188,17 +1182,11 @@ define <4 x float> @insert_test5_mul_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: insert_test5_div_ss:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    divps %xmm0, %xmm1
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    ret{{[l|q]}}
-;
-; SSE41-LABEL: insert_test5_div_ss:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    divps %xmm0, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    ret{{[l|q]}}
+; SSE-LABEL: insert_test5_div_ss:
+; SSE:       # %bb.0:
+; SSE-NEXT:    divps %xmm0, %xmm1
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX-LABEL: insert_test5_div_ss:
 ; AVX:       # %bb.0:
@@ -1226,17 +1214,11 @@ define <2 x double> @insert_test5_add_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: insert_test5_sub_sd:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    subpd %xmm0, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    ret{{[l|q]}}
-;
-; SSE41-LABEL: insert_test5_sub_sd:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    subpd %xmm0, %xmm1
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE41-NEXT:    ret{{[l|q]}}
+; SSE-LABEL: insert_test5_sub_sd:
+; SSE:       # %bb.0:
+; SSE-NEXT:    subpd %xmm0, %xmm1
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX-LABEL: insert_test5_sub_sd:
 ; AVX:       # %bb.0:
@@ -1264,17 +1246,11 @@ define <2 x double> @insert_test5_mul_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: insert_test5_div_sd:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    divpd %xmm0, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    ret{{[l|q]}}
-;
-; SSE41-LABEL: insert_test5_div_sd:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    divpd %xmm0, %xmm1
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE41-NEXT:    ret{{[l|q]}}
+; SSE-LABEL: insert_test5_div_sd:
+; SSE:       # %bb.0:
+; SSE-NEXT:    divpd %xmm0, %xmm1
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX-LABEL: insert_test5_div_sd:
 ; AVX:       # %bb.0:
@@ -1287,29 +1263,17 @@ define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
-; X86-SSE2-LABEL: add_ss_mask:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    jne .LBB70_1
-; X86-SSE2-NEXT:  # %bb.2:
-; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X86-SSE2-NEXT:    retl
-; X86-SSE2-NEXT:  .LBB70_1:
-; X86-SSE2-NEXT:    addss %xmm0, %xmm1
-; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: add_ss_mask:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE41-NEXT:    jne .LBB70_1
-; X86-SSE41-NEXT:  # %bb.2:
-; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X86-SSE41-NEXT:    retl
-; X86-SSE41-NEXT:  .LBB70_1:
-; X86-SSE41-NEXT:    addss %xmm0, %xmm1
-; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X86-SSE41-NEXT:    retl
+; X86-SSE-LABEL: add_ss_mask:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    jne .LBB70_1
+; X86-SSE-NEXT:  # %bb.2:
+; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X86-SSE-NEXT:    retl
+; X86-SSE-NEXT:  .LBB70_1:
+; X86-SSE-NEXT:    addss %xmm0, %xmm1
+; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: add_ss_mask:
 ; X86-AVX1:       # %bb.0:
@@ -1329,29 +1293,17 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 ; X86-AVX512-NEXT:    vmovaps %xmm2, %xmm0
 ; X86-AVX512-NEXT:    retl
 ;
-; X64-SSE2-LABEL: add_ss_mask:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    testb $1, %dil
-; X64-SSE2-NEXT:    jne .LBB70_1
-; X64-SSE2-NEXT:  # %bb.2:
-; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X64-SSE2-NEXT:    retq
-; X64-SSE2-NEXT:  .LBB70_1:
-; X64-SSE2-NEXT:    addss %xmm0, %xmm1
-; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: add_ss_mask:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    testb $1, %dil
-; X64-SSE41-NEXT:    jne .LBB70_1
-; X64-SSE41-NEXT:  # %bb.2:
-; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X64-SSE41-NEXT:    retq
-; X64-SSE41-NEXT:  .LBB70_1:
-; X64-SSE41-NEXT:    addss %xmm0, %xmm1
-; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-SSE41-NEXT:    retq
+; X64-SSE-LABEL: add_ss_mask:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    testb $1, %dil
+; X64-SSE-NEXT:    jne .LBB70_1
+; X64-SSE-NEXT:  # %bb.2:
+; X64-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X64-SSE-NEXT:    retq
+; X64-SSE-NEXT:  .LBB70_1:
+; X64-SSE-NEXT:    addss %xmm0, %xmm1
+; X64-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: add_ss_mask:
 ; X64-AVX1:       # %bb.0:
@@ -1402,7 +1354,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X86-SSE41-NEXT:    retl
 ; X86-SSE41-NEXT:  .LBB71_1:
 ; X86-SSE41-NEXT:    addsd %xmm0, %xmm1
-; X86-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X86-SSE41-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: add_sd_mask:
@@ -1444,7 +1396,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X64-SSE41-NEXT:    retq
 ; X64-SSE41-NEXT:  .LBB71_1:
 ; X64-SSE41-NEXT:    addsd %xmm0, %xmm1
-; X64-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X64-SSE41-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: add_sd_mask:
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 53a10ab0c26f..4f5b7ee0eaea 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -345,7 +345,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
-; X86-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; X86-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
 ; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -367,7 +367,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ;
 ; X64-SSE-LABEL: blendps_not_insertps_1:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; X64-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
 ; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
@@ -434,7 +434,7 @@ define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize noun
 define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
 ; SSE-LABEL: blendps_not_insertps_2:
 ; SSE:       ## %bb.0:
-; SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
 ; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 0538cac12cbf..1007969b6c6d 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -1361,7 +1361,7 @@ define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB52_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1402,7 +1402,7 @@ define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwi
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB53_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1441,7 +1441,7 @@ define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB54_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1482,7 +1482,7 @@ define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nou
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB55_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1521,7 +1521,7 @@ define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x flo
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB56_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1562,7 +1562,7 @@ define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
 ; SSE41-NEXT:  LBB57_1:
 ; SSE41-NEXT:    roundss $9, %xmm0, %xmm0
 ; SSE41-NEXT:  LBB57_3:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1602,7 +1602,7 @@ define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB58_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1643,7 +1643,7 @@ define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16
 ; SSE41-NEXT:  LBB59_1:
 ; SSE41-NEXT:    roundsd $9, %xmm0, %xmm0
 ; SSE41-NEXT:  LBB59_3:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2351,7 +2351,7 @@ define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w,
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB78_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2392,7 +2392,7 @@ define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwin
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB79_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2431,7 +2431,7 @@ define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double>
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB80_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2472,7 +2472,7 @@ define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) noun
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB81_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2511,7 +2511,7 @@ define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x floa
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB82_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2552,7 +2552,7 @@ define <4 x float> @ceil_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
 ; SSE41-NEXT:  LBB83_1:
 ; SSE41-NEXT:    roundss $10, %xmm0, %xmm0
 ; SSE41-NEXT:  LBB83_3:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2592,7 +2592,7 @@ define <2 x double> @ceil_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x d
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB84_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2633,7 +2633,7 @@ define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %
 ; SSE41-NEXT:  LBB85_1:
 ; SSE41-NEXT:    roundsd $10, %xmm0, %xmm0
 ; SSE41-NEXT:  LBB85_3:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 8679c262e0bf..2d3dc4c593c1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -871,7 +871,7 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
 ; SSE41-LABEL: shuffle_v2i64_bitcast_z123:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_bitcast_z123:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 0570e2f580c1..002a3b77dc35 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -63,15 +63,10 @@ define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1)
 }
 
 define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
-; SSSE3-LABEL: combine_pshufb_as_movss:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: combine_pshufb_as_movss:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_pshufb_as_movss:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_movss:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-zmov.ll b/llvm/test/CodeGen/X86/vector-zmov.ll
index 2f84723b3c08..9d84ff8c01ab 100644
--- a/llvm/test/CodeGen/X86/vector-zmov.ll
+++ b/llvm/test/CodeGen/X86/vector-zmov.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
 
@@ -38,26 +38,12 @@ entry:
 }
 
 define <4 x i32> @load_zmov_4i32_to_0zzz_volatile(ptr%ptr) {
-; SSE2-LABEL: load_zmov_4i32_to_0zzz_volatile:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movaps (%rdi), %xmm1
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: load_zmov_4i32_to_0zzz_volatile:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movaps (%rdi), %xmm1
-; SSSE3-NEXT:    xorps %xmm0, %xmm0
-; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: load_zmov_4i32_to_0zzz_volatile:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movaps (%rdi), %xmm1
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: load_zmov_4i32_to_0zzz_volatile:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movaps (%rdi), %xmm1
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_zmov_4i32_to_0zzz_volatile:
 ; AVX:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 9851fe64847d..18a060ad910b 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -301,15 +301,10 @@ define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
 }
 
 define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: test18:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test18:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test18:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test18:
 ; AVX:       # %bb.0:
@@ -320,15 +315,10 @@ define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test19:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test19:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test19:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test19:
 ; AVX:       # %bb.0:

From 02b6ed0bf139518c704a2996418e66f3a93260a1 Mon Sep 17 00:00:00 2001
From: Shamshura Egor <164661612+egorshamshura@users.noreply.github.com>
Date: Fri, 13 Jun 2025 10:53:15 +0300
Subject: [PATCH 0243/1322] [Clang] Added explanation why `is_constructible`
 evaluated to false.  (#143309)

Added explanation why a is constructible evaluated to false. Also fixed
problem with ```ExtractTypeTraitFromExpression```. In case
```std::is_xxx_v<>``` with variadic pack it tries to get template
argument, but fails in expression ```Arg.getAsType()``` due to
```Arg.getKind() == TemplateArgument::ArgKind::Pack```, but not
```TemplateArgument::ArgKind::Type```.
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  8 ++-
 clang/lib/Sema/SemaTypeTraits.cpp             | 71 ++++++++++++++++++-
 clang/test/CXX/drs/cwg18xx.cpp                |  3 +-
 ...overload-resolution-deferred-templates.cpp | 19 +++--
 .../type-traits-unsatisfied-diags-std.cpp     | 66 +++++++++++++++++
 .../SemaCXX/type-traits-unsatisfied-diags.cpp | 62 ++++++++++++++++
 6 files changed, 219 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 0f77083dac9d..a2cf84d02419 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1767,7 +1767,8 @@ def note_unsatisfied_trait
     : Note<"%0 is not %enum_select<TraitName>{"
            "%TriviallyRelocatable{trivially relocatable}|"
            "%Replaceable{replaceable}|"
-           "%TriviallyCopyable{trivially copyable}"
+           "%TriviallyCopyable{trivially copyable}|"
+           "%Constructible{constructible with provided types}"
            "}1">;
 
 def note_unsatisfied_trait_reason
@@ -1797,7 +1798,10 @@ def note_unsatisfied_trait_reason
            "%DeletedAssign{has a deleted %select{copy|move}1 "
            "assignment operator}|"
            "%UnionWithUserDeclaredSMF{is a union with a user-declared "
-           "%sub{select_special_member_kind}1}"
+           "%sub{select_special_member_kind}1}|"
+           "%FunctionType{is a function type}|"
+           "%CVVoidType{is a cv void type}|"
+           "%IncompleteArrayType{is an incomplete array type}"
            "}0">;
 
 def warn_consteval_if_always_true : Warning<
diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index 1738ab446600..22c690bedc1e 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/DeclCXX.h"
+#include "clang/AST/TemplateBase.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/DiagnosticParse.h"
 #include "clang/Basic/DiagnosticSema.h"
@@ -1947,6 +1948,7 @@ static std::optional<TypeTrait> StdNameToTypeTrait(StringRef Name) {
             TypeTrait::UTT_IsCppTriviallyRelocatable)
       .Case("is_replaceable", TypeTrait::UTT_IsReplaceable)
       .Case("is_trivially_copyable", TypeTrait::UTT_IsTriviallyCopyable)
+      .Case("is_constructible", TypeTrait::TT_IsConstructible)
       .Default(std::nullopt);
 }
 
@@ -1983,8 +1985,16 @@ static ExtractedTypeTraitInfo ExtractTypeTraitFromExpression(const Expr *E) {
     Trait = StdNameToTypeTrait(Name);
     if (!Trait)
       return std::nullopt;
-    for (const auto &Arg : VD->getTemplateArgs().asArray())
-      Args.push_back(Arg.getAsType());
+    for (const auto &Arg : VD->getTemplateArgs().asArray()) {
+      if (Arg.getKind() == TemplateArgument::ArgKind::Pack) {
+        for (const auto &InnerArg : Arg.pack_elements())
+          Args.push_back(InnerArg.getAsType());
+      } else if (Arg.getKind() == TemplateArgument::ArgKind::Type) {
+        Args.push_back(Arg.getAsType());
+      } else {
+        llvm_unreachable("Unexpected kind");
+      }
+    }
     return {{Trait.value(), std::move(Args)}};
   }
 
@@ -2257,6 +2267,60 @@ static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef,
   }
 }
 
+static void DiagnoseNonConstructibleReason(
+    Sema &SemaRef, SourceLocation Loc,
+    const llvm::SmallVector<clang::QualType, 1> &Ts) {
+  if (Ts.empty()) {
+    return;
+  }
+
+  bool ContainsVoid = false;
+  for (const QualType &ArgTy : Ts) {
+    ContainsVoid |= ArgTy->isVoidType();
+  }
+
+  if (ContainsVoid)
+    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+        << diag::TraitNotSatisfiedReason::CVVoidType;
+
+  QualType T = Ts[0];
+  if (T->isFunctionType())
+    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+        << diag::TraitNotSatisfiedReason::FunctionType;
+
+  if (T->isIncompleteArrayType())
+    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+        << diag::TraitNotSatisfiedReason::IncompleteArrayType;
+
+  const CXXRecordDecl *D = T->getAsCXXRecordDecl();
+  if (!D || D->isInvalidDecl() || !D->hasDefinition())
+    return;
+
+  llvm::BumpPtrAllocator OpaqueExprAllocator;
+  SmallVector<Expr *, 2> ArgExprs;
+  ArgExprs.reserve(Ts.size() - 1);
+  for (unsigned I = 1, N = Ts.size(); I != N; ++I) {
+    QualType ArgTy = Ts[I];
+    if (ArgTy->isObjectType() || ArgTy->isFunctionType())
+      ArgTy = SemaRef.Context.getRValueReferenceType(ArgTy);
+    ArgExprs.push_back(
+        new (OpaqueExprAllocator.Allocate<OpaqueValueExpr>())
+            OpaqueValueExpr(Loc, ArgTy.getNonLValueExprType(SemaRef.Context),
+                            Expr::getValueKindForType(ArgTy)));
+  }
+
+  EnterExpressionEvaluationContext Unevaluated(
+      SemaRef, Sema::ExpressionEvaluationContext::Unevaluated);
+  Sema::ContextRAII TUContext(SemaRef,
+                              SemaRef.Context.getTranslationUnitDecl());
+  InitializedEntity To(InitializedEntity::InitializeTemporary(T));
+  InitializationKind InitKind(InitializationKind::CreateDirect(Loc, Loc, Loc));
+  InitializationSequence Init(SemaRef, To, InitKind, ArgExprs);
+
+  Init.Diagnose(SemaRef, To, InitKind, ArgExprs);
+  SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D;
+}
+
 static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef,
                                                SourceLocation Loc, QualType T) {
   SemaRef.Diag(Loc, diag::note_unsatisfied_trait)
@@ -2296,6 +2360,9 @@ void Sema::DiagnoseTypeTraitDetails(const Expr *E) {
   case UTT_IsTriviallyCopyable:
     DiagnoseNonTriviallyCopyableReason(*this, E->getBeginLoc(), Args[0]);
     break;
+  case TT_IsConstructible:
+    DiagnoseNonConstructibleReason(*this, E->getBeginLoc(), Args);
+    break;
   default:
     break;
   }
diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 5b4551ba0143..994807585213 100644
--- a/clang/test/CXX/drs/cwg18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -564,11 +564,12 @@ struct A {
 namespace ex2 {
 #if __cplusplus >= 201103L
 struct Bar {
-  struct Baz {
+  struct Baz { // #cwg1890-Baz
     int a = 0;
   };
   static_assert(__is_constructible(Baz), "");
   // since-cxx11-error@-1 {{static assertion failed due to requirement '__is_constructible(cwg1890::ex2::Bar::Baz)'}}
+  // since-cxx11-note@#cwg1890-Baz {{'Baz' defined here}}
 };
 #endif
 } // namespace ex2
diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
index 7cb71e075d50..46c367084852 100644
--- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
+++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
@@ -80,21 +80,30 @@ struct ImplicitlyCopyable {
 static_assert(__is_constructible(ImplicitlyCopyable, const ImplicitlyCopyable&));
 
 
-struct Movable {
+struct Movable { // #Movable
   template <typename T>
   requires __is_constructible(Movable, T) // #err-self-constraint-1
-  explicit Movable(T op) noexcept; // #1
-  Movable(Movable&&) noexcept = default; // #2
+  explicit Movable(T op) noexcept; // #Movable1
+  Movable(Movable&&) noexcept = default; // #Movable2
 };
 static_assert(__is_constructible(Movable, Movable&&));
 static_assert(__is_constructible(Movable, const Movable&));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}}
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}} \
+// expected-error@-1 {{call to implicitly-deleted copy constructor of 'Movable'}} \
+// expected-note@#Movable  {{'Movable' defined here}} \
+// expected-note@#Movable  {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const Movable' for 1st argument}} \
+// expected-note@#Movable2  {{copy constructor is implicitly deleted because 'Movable' has a user-declared move constructor}} \
+// expected-note@#Movable2  {{candidate constructor not viable: no known conversion from 'int' to 'Movable' for 1st argument}} \
+// expected-note@#Movable1  {{candidate template ignored: constraints not satisfied [with T = int]}}
+
 
 static_assert(__is_constructible(Movable, int));
-// expected-error@-1{{static assertion failed due to requirement '__is_constructible(Movable, int)'}} \
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, int)'}} \
+// expected-error@-1 {{no matching constructor for initialization of 'Movable'}} \
 // expected-note@-1 2{{}}
 // expected-error@#err-self-constraint-1{{satisfaction of constraint '__is_constructible(Movable, T)' depends on itself}}
 // expected-note@#err-self-constraint-1 4{{}}
+// expected-note@#Movable  {{'Movable' defined here}}
 
 template <typename T>
 struct Members {
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
index 329b611110c1..a403a0450607 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
@@ -20,6 +20,14 @@ struct is_trivially_copyable {
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T);
+
+template <typename... Args>
+struct is_constructible {
+    static constexpr bool value = __is_constructible(Args...);
+};
+
+template <typename... Args>
+constexpr bool is_constructible_v = __is_constructible(Args...);
 #endif
 
 #ifdef STD2
@@ -44,6 +52,17 @@ using is_trivially_copyable  = __details_is_trivially_copyable<T>;
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T);
+
+template <typename... Args>
+struct __details_is_constructible{
+    static constexpr bool value = __is_constructible(Args...);
+};
+
+template <typename... Args>
+using is_constructible  = __details_is_constructible<Args...>;
+
+template <typename... Args>
+constexpr bool is_constructible_v = __is_constructible(Args...);
 #endif
 
 
@@ -73,6 +92,15 @@ using is_trivially_copyable  = __details_is_trivially_copyable<T>;
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
+
+template <typename... Args>
+struct __details_is_constructible : bool_constant<__is_constructible(Args...)> {};
+
+template <typename... Args>
+using is_constructible  = __details_is_constructible<Args...>;
+
+template <typename... Args>
+constexpr bool is_constructible_v = is_constructible<Args...>::value;
 #endif
 
 }
@@ -100,6 +128,15 @@ static_assert(std::is_trivially_copyable_v<int&>);
 // expected-note@-1 {{because it is a reference type}}
 
 
+static_assert(std::is_constructible<int, int>::value);
+
+static_assert(std::is_constructible<void>::value);
+// expected-error-re@-1 {{static assertion failed due to requirement 'std::{{.*}}is_constructible<void>::value'}} \
+// expected-note@-1 {{because it is a cv void type}}
+static_assert(std::is_constructible_v<void>);
+// expected-error@-1 {{static assertion failed due to requirement 'std::is_constructible_v<void>'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
 namespace test_namespace {
     using namespace std;
     static_assert(is_trivially_relocatable<int&>::value);
@@ -119,6 +156,13 @@ namespace test_namespace {
     // expected-error@-1 {{static assertion failed due to requirement 'is_trivially_copyable_v<int &>'}} \
     // expected-note@-1 {{'int &' is not trivially copyable}} \
     // expected-note@-1 {{because it is a reference type}}
+
+    static_assert(is_constructible<void>::value);
+    // expected-error-re@-1 {{static assertion failed due to requirement '{{.*}}is_constructible<void>::value'}} \
+    // expected-note@-1 {{because it is a cv void type}}
+    static_assert(is_constructible_v<void>);
+    // expected-error@-1 {{static assertion failed due to requirement 'is_constructible_v<void>'}} \
+    // expected-note@-1 {{because it is a cv void type}}
 }
 
 
@@ -139,6 +183,15 @@ concept C2 = std::is_trivially_copyable_v<T>; // #concept4
 
 template <C2 T> void g2();  // #cand4
 
+template <typename... Args>
+requires std::is_constructible<Args...>::value void f3();  // #cand5
+
+template <typename... Args>
+concept C3 = std::is_constructible_v<Args...>; // #concept6
+
+template <C3 T> void g3();  // #cand6
+
+
 void test() {
     f<int&>();
     // expected-error@-1 {{no matching function for call to 'f'}} \
@@ -169,6 +222,19 @@ void test() {
     // expected-note@#concept4 {{because 'std::is_trivially_copyable_v<int &>' evaluated to false}} \
     // expected-note@#concept4 {{'int &' is not trivially copyable}} \
     // expected-note@#concept4 {{because it is a reference type}}
+
+    f3<void>();
+    // expected-error@-1 {{no matching function for call to 'f3'}} \
+    // expected-note@#cand5 {{candidate template ignored: constraints not satisfied [with Args = <void>]}} \
+    // expected-note-re@#cand5 {{because '{{.*}}is_constructible<void>::value' evaluated to false}} \
+    // expected-note@#cand5 {{because it is a cv void type}}
+
+    g3<void>();
+    // expected-error@-1 {{no matching function for call to 'g3'}} \
+    // expected-note@#cand6 {{candidate template ignored: constraints not satisfied [with T = void]}} \
+    // expected-note@#cand6 {{because 'void' does not satisfy 'C3'}} \
+    // expected-note@#concept6 {{because 'std::is_constructible_v<void>' evaluated to false}} \
+    // expected-note@#concept6 {{because it is a cv void type}}
 }
 }
 
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
index a8c78f6304ca..d0b3f294fbca 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
@@ -488,3 +488,65 @@ static_assert(__is_trivially_copyable(S12));
 // expected-note@-1 {{'S12' is not trivially copyable}} \
 // expected-note@#tc-S12 {{'S12' defined here}}
 }
+
+namespace constructible {
+
+struct S1 {  // #c-S1
+    S1(int); // #cc-S1
+};
+static_assert(__is_constructible(S1, char*));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S1, char *)'}} \
+// expected-error@-1 {{no matching constructor for initialization of 'S1'}} \
+// expected-note@#c-S1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'char *' to 'const S1' for 1st argument}} \
+// expected-note@#c-S1 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'char *' to 'S1' for 1st argument}} \
+// expected-note@#cc-S1 {{candidate constructor not viable: no known conversion from 'char *' to 'int' for 1st argument; dereference the argument with *}} \
+// expected-note@#c-S1 {{'S1' defined here}}
+
+struct S2 { // #c-S2
+    S2(int, float, double); // #cc-S2
+};
+static_assert(__is_constructible(S2, float));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S2, float)'}} \
+// expected-note@#c-S2 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'float' to 'const S2' for 1st argument}} \
+// expected-note@#c-S2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'float' to 'S2' for 1st argument}} \
+// expected-error@-1 {{no matching constructor for initialization of 'S2'}} \
+// expected-note@#cc-S2 {{candidate constructor not viable: requires 3 arguments, but 1 was provided}} \
+// expected-note@#c-S2 {{'S2' defined here}}
+
+static_assert(__is_constructible(S2, float, void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S2, float, void)'}} \
+// expected-note@#c-S2 {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}} \
+// expected-note@#c-S2 {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 2 were provided}} \
+// expected-note@-1{{because it is a cv void type}} \
+// expected-error@-1 {{no matching constructor for initialization of 'S2'}} \
+// expected-note@#cc-S2 {{candidate constructor not viable: requires 3 arguments, but 2 were provided}} \
+// expected-note@#c-S2 {{'S2' defined here}}
+
+static_assert(__is_constructible(int[]));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(int[])'}} \
+// expected-note@-1 {{because it is an incomplete array type}}
+
+static_assert(__is_constructible(void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void)'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
+static_assert(__is_constructible(void, void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void, void)'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
+static_assert(__is_constructible(const void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(const void)'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
+static_assert(__is_constructible(volatile void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(volatile void)'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
+static_assert(__is_constructible(int ()));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(int ())'}} \
+// expected-note@-1 {{because it is a function type}}
+
+static_assert(__is_constructible(void (int, float)));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void (int, float))'}} \
+// expected-note@-1 {{because it is a function type}}
+}

From c4caf00bfbf10caa88f1c46a561564b4f0f723af Mon Sep 17 00:00:00 2001
From: LU-JOHN <John.Lu@amd.com>
Date: Fri, 13 Jun 2025 03:03:06 -0500
Subject: [PATCH 0244/1322] [AMDGPU] Convert more 64-bit lshr to 32-bit if
 shift amt>=32 (#138204)

Convert vector 64-bit lshr to 32-bit if shift amt is known to be >= 32.
Also convert scalar 64-bit lshr to 32-bit if shift amt is variable but
known to be >=32.

---------

Signed-off-by: John Lu <John.Lu@amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 130 ++++++++++-----
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll         |  77 ++++-----
 llvm/test/CodeGen/AMDGPU/srl64_reduce.ll      | 150 +++++++++---------
 3 files changed, 196 insertions(+), 161 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5f41bd7d8a61..c51cc2a2fe52 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4097,7 +4097,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
+  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
 
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
@@ -4117,12 +4117,12 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
     ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                                TargetType);
   } else {
-    SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
+    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
     const SDValue ShiftMask =
         DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
     // This AND instruction will clamp out of bounds shift values.
     // It will also be removed during later instruction selection.
-    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
+    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
   }
 
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
@@ -4181,50 +4181,105 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
 
 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
-  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!RHS)
-    return SDValue();
-
+  SDValue RHS = N->getOperand(1);
+  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
   EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
-  unsigned ShiftAmt = RHS->getZExtValue();
   SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
+  unsigned RHSVal;
 
-  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
-  // this improves the ability to match BFE patterns in isel.
-  if (LHS.getOpcode() == ISD::AND) {
-    if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
-      unsigned MaskIdx, MaskLen;
-      if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
-          MaskIdx == ShiftAmt) {
-        return DAG.getNode(
-            ISD::AND, SL, VT,
-            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
-            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
+  if (CRHS) {
+    RHSVal = CRHS->getZExtValue();
+
+    // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
+    // this improves the ability to match BFE patterns in isel.
+    if (LHS.getOpcode() == ISD::AND) {
+      if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
+        unsigned MaskIdx, MaskLen;
+        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
+            MaskIdx == RHSVal) {
+          return DAG.getNode(ISD::AND, SL, VT,
+                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
+                                         N->getOperand(1)),
+                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
+                                         N->getOperand(1)));
+        }
       }
     }
   }
 
-  if (VT != MVT::i64)
+  if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  if (ShiftAmt < 32)
+  // for C >= 32
+  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C -32), 0)
+
+  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+  // common case, splitting this into a move and a 32-bit shift is faster and
+  // the same code size.
+  KnownBits Known = DAG.computeKnownBits(RHS);
+
+  EVT ElementType = VT.getScalarType();
+  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
+  EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
+                                 : TargetScalarType;
+
+  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
     return SDValue();
 
-  // srl i64:x, C for C >= 32
-  // =>
-  //   build_pair (srl hi_32(x), C - 32), 0
-  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  SDValue ShiftAmt;
+  if (CRHS) {
+    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
+                               TargetType);
+  } else {
+    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
+    const SDValue ShiftMask =
+        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
+    // This AND instruction will clamp out of bounds shift values.
+    // It will also be removed during later instruction selection.
+    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
+  }
 
-  SDValue Hi = getHiHalf64(LHS, DAG);
+  const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
+  EVT ConcatType;
+  SDValue Hi;
+  SDLoc LHSSL(LHS);
+  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
+  if (VT.isVector()) {
+    unsigned NElts = TargetType.getVectorNumElements();
+    ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
+    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
+    SmallVector<SDValue, 8> HiOps(NElts);
+    SmallVector<SDValue, 16> HiAndLoOps;
 
-  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
-  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
+    DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
+    for (unsigned I = 0; I != NElts; ++I)
+      HiOps[I] = HiAndLoOps[2 * I + 1];
+    Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
+  } else {
+    const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
+    ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
+    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
+    Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
+  }
 
-  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
+  SDValue NewShift = DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt);
 
-  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
+  SDValue Vec;
+  if (VT.isVector()) {
+    unsigned NElts = TargetType.getVectorNumElements();
+    SmallVector<SDValue, 8> LoOps;
+    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
+
+    DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
+    for (unsigned I = 0; I != NElts; ++I)
+      HiAndLoOps[2 * I] = LoOps[I];
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
+  } else {
+    Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
+  }
+  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
 }
 
 SDValue AMDGPUTargetLowering::performTruncateCombine(
@@ -5209,21 +5264,18 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
 
     break;
   }
-  case ISD::SHL: {
+  case ISD::SHL:
+  case ISD::SRL: {
     // Range metadata can be invalidated when loads are converted to legal types
     // (e.g. v2i64 -> v4i32).
-    // Try to convert vector shl before type legalization so that range metadata
-    // can be utilized.
+    // Try to convert vector shl/srl before type legalization so that range
+    // metadata can be utilized.
     if (!(N->getValueType(0).isVector() &&
           DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
         DCI.getDAGCombineLevel() < AfterLegalizeDAG)
       break;
-    return performShlCombine(N, DCI);
-  }
-  case ISD::SRL: {
-    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
-      break;
-
+    if (N->getOpcode() == ISD::SHL)
+      return performShlCombine(N, DCI);
     return performSrlCombine(N, DCI);
   }
   case ISD::SRA: {
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index bb642155cd0a..117f359be0c3 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1945,16 +1945,14 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
 ; CI-LABEL: lshr_mad_i64_vec:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v6, v3
-; CI-NEXT:    v_mov_b32_e32 v3, v1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s4, 0xffff1c18
-; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
-; CI-NEXT:    v_mov_b32_e32 v3, v1
+; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
 ; CI-NEXT:    s_mov_b32 s4, 0xffff1118
-; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
+; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v7, v3
 ; CI-NEXT:    v_mov_b32_e32 v0, v4
-; CI-NEXT:    v_mov_b32_e32 v1, v5
+; CI-NEXT:    v_mov_b32_e32 v2, v6
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: lshr_mad_i64_vec:
@@ -1977,44 +1975,28 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
 ; GFX9-LABEL: lshr_mad_i64_vec:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v6, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff1c18
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff1118
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: lshr_mad_i64_vec:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v8, v3
-; GFX1100-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v6, v[0:1]
-; GFX1100-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, v4
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v8, v[2:3]
-; GFX1100-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, v7
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1150-LABEL: lshr_mad_i64_vec:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1150-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
-; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1150-NEXT:    v_mad_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
-; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: lshr_mad_i64_vec:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX11-NEXT:    v_mov_b32_e32 v2, v6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: lshr_mad_i64_vec:
 ; GFX12:       ; %bb.0:
@@ -2023,13 +2005,14 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v3, v1
-; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX12-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %lsh = lshr <2 x i64> %arg0, <i64 32, i64 32>
   %mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118>
diff --git a/llvm/test/CodeGen/AMDGPU/srl64_reduce.ll b/llvm/test/CodeGen/AMDGPU/srl64_reduce.ll
index 09538c624de7..3567bafe5b1c 100644
--- a/llvm/test/CodeGen/AMDGPU/srl64_reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl64_reduce.ll
@@ -17,9 +17,9 @@ define i64 @srl_metadata(i64 %arg0, ptr %arg1.ptr) {
 ; CHECK-LABEL: srl_metadata:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_load_dword v2, v[2:3]
+; CHECK-NEXT:    flat_load_dword v0, v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load i64, ptr %arg1.ptr, !range !0, !noundef !{}
@@ -30,9 +30,9 @@ define i64 @srl_metadata(i64 %arg0, ptr %arg1.ptr) {
 define amdgpu_ps i64 @srl_metadata_sgpr_return(i64 inreg %arg0, ptr addrspace(1) inreg %arg1.ptr) {
 ; CHECK-LABEL: srl_metadata_sgpr_return:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dword s2, s[2:3], 0x0
+; CHECK-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT:    s_lshr_b32 s0, s1, s0
 ; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    ; return to shader part epilog
   %shift.amt = load i64, ptr addrspace(1) %arg1.ptr, !range !0, !noundef !{}
@@ -45,9 +45,9 @@ define i64 @srl_exact_metadata(i64 %arg0, ptr %arg1.ptr) {
 ; CHECK-LABEL: srl_exact_metadata:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_load_dword v2, v[2:3]
+; CHECK-NEXT:    flat_load_dword v0, v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load i64, ptr %arg1.ptr, !range !0, !noundef !{}
@@ -59,9 +59,9 @@ define i64 @srl_metadata_two_ranges(i64 %arg0, ptr %arg1.ptr) {
 ; CHECK-LABEL: srl_metadata_two_ranges:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_load_dword v2, v[2:3]
+; CHECK-NEXT:    flat_load_dword v0, v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load i64, ptr %arg1.ptr, !range !1, !noundef !{}
@@ -106,8 +106,10 @@ define <2 x i64> @srl_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v4, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
   %srl = lshr <2 x i64> %arg0, %shift.amt
@@ -121,8 +123,10 @@ define <2 x i64> @srl_exact_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v4, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
   %srl = lshr exact <2 x i64> %arg0, %shift.amt
@@ -133,12 +137,15 @@ define <3 x i64> @srl_v3_metadata(<3 x i64> %arg0, ptr %arg1.ptr) {
 ; CHECK-LABEL: srl_v3_metadata:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_load_dword v12, v[6:7] offset:16
+; CHECK-NEXT:    flat_load_dword v0, v[6:7] offset:16
 ; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[6:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], v12, v[4:5]
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v8, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v10, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, v0, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v8, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v10, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load <3 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
   %srl = lshr <3 x i64> %arg0, %shift.amt
@@ -153,11 +160,15 @@ define <4 x i64> @srl_v4_metadata(<4 x i64> %arg0, ptr %arg1.ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_load_dwordx4 v[13:16], v[8:9] offset:16
 ; CHECK-NEXT:    ; kill: killed $vgpr8 killed $vgpr9
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v10, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v12, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v10, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v12, v3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], v13, v[4:5]
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], v15, v[6:7]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, v13, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v6, v15, v7
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load <4 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
   %srl = lshr <4 x i64> %arg0, %shift.amt
@@ -337,8 +348,7 @@ define i64 @srl_or32(i64 %arg0, i64 %shift_amt) {
 ; CHECK-LABEL: srl_or32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v2, 32, v2
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v2, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or i64 %shift_amt, 32
@@ -350,10 +360,10 @@ define <2 x i64> @srl_v2_or32(<2 x i64> %arg0, <2 x i64> %shift_amt) {
 ; CHECK-LABEL: srl_v2_or32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v5, 32, v6
-; CHECK-NEXT:    v_or_b32_e32 v4, 32, v4
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v4, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <2 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <2 x i64> %arg0, %or
@@ -364,12 +374,12 @@ define <3 x i64> @srl_v3_or32(<3 x i64> %arg0, <3 x i64> %shift_amt) {
 ; CHECK-LABEL: srl_v3_or32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v7, 32, v10
-; CHECK-NEXT:    v_or_b32_e32 v8, 32, v8
-; CHECK-NEXT:    v_or_b32_e32 v6, 32, v6
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v6, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v8, v[2:3]
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], v7, v[4:5]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v6, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v8, v3
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, v10, v5
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <3 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <3 x i64> %arg0, %or
@@ -380,14 +390,14 @@ define <4 x i64> @srl_v4_or32(<4 x i64> %arg0, <4 x i64> %shift_amt) {
 ; CHECK-LABEL: srl_v4_or32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v9, 32, v14
-; CHECK-NEXT:    v_or_b32_e32 v11, 32, v12
-; CHECK-NEXT:    v_or_b32_e32 v10, 32, v10
-; CHECK-NEXT:    v_or_b32_e32 v8, 32, v8
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v8, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v10, v[2:3]
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], v11, v[4:5]
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], v9, v[6:7]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v8, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v10, v3
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, v12, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v6, v14, v7
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <4 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <4 x i64> %arg0, %or
@@ -400,8 +410,7 @@ define i64 @srl_or32_sgpr(i64 inreg %arg0, i64 inreg %shift_amt) {
 ; CHECK-LABEL: srl_or32_sgpr:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_or_b32 s4, s18, 32
-; CHECK-NEXT:    s_lshr_b64 s[4:5], s[16:17], s4
+; CHECK-NEXT:    s_lshr_b32 s4, s17, s18
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -413,8 +422,7 @@ define i64 @srl_or32_sgpr(i64 inreg %arg0, i64 inreg %shift_amt) {
 define amdgpu_ps i64 @srl_or32_sgpr_return(i64 inreg %arg0, i64 inreg %shift_amt) {
 ; CHECK-LABEL: srl_or32_sgpr_return:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_or_b32 s2, s2, 32
-; CHECK-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT:    s_lshr_b32 s0, s1, s2
 ; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    ; return to shader part epilog
   %or = or i64 %shift_amt, 32
@@ -426,14 +434,12 @@ define <2 x i64> @srl_v2_or32_sgpr(<2 x i64> inreg %arg0, <2 x i64> inreg %shift
 ; CHECK-LABEL: srl_v2_or32_sgpr:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_or_b32 s6, s22, 32
-; CHECK-NEXT:    s_or_b32 s4, s20, 32
-; CHECK-NEXT:    s_lshr_b64 s[4:5], s[16:17], s4
-; CHECK-NEXT:    s_lshr_b64 s[6:7], s[18:19], s6
+; CHECK-NEXT:    s_lshr_b32 s4, s17, s20
+; CHECK-NEXT:    s_lshr_b32 s5, s19, s22
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s5
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s5
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <2 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <2 x i64> %arg0, %or
@@ -444,18 +450,15 @@ define <3 x i64> @srl_v3_or32_sgpr(<3 x i64> inreg %arg0, <3 x i64> inreg %shift
 ; CHECK-LABEL: srl_v3_or32_sgpr:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_or_b32 s8, s26, 32
-; CHECK-NEXT:    s_or_b32 s6, s24, 32
-; CHECK-NEXT:    s_or_b32 s4, s22, 32
-; CHECK-NEXT:    s_lshr_b64 s[4:5], s[16:17], s4
-; CHECK-NEXT:    s_lshr_b64 s[6:7], s[18:19], s6
-; CHECK-NEXT:    s_lshr_b64 s[8:9], s[20:21], s8
+; CHECK-NEXT:    s_lshr_b32 s4, s17, s22
+; CHECK-NEXT:    s_lshr_b32 s5, s19, s24
+; CHECK-NEXT:    s_lshr_b32 s6, s21, s26
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s5
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
-; CHECK-NEXT:    v_mov_b32_e32 v4, s8
-; CHECK-NEXT:    v_mov_b32_e32 v5, s9
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s5
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v4, s6
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <3 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <3 x i64> %arg0, %or
@@ -466,20 +469,17 @@ define <4 x i64> @srl_v4_or32_sgpr(<4 x i64> inreg %arg0, <4 x i64> inreg %shift
 ; CHECK-LABEL: srl_v4_or32_sgpr:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v0, 32, v0
-; CHECK-NEXT:    s_or_b32 s8, s28, 32
-; CHECK-NEXT:    s_or_b32 s6, s26, 32
-; CHECK-NEXT:    s_or_b32 s4, s24, 32
-; CHECK-NEXT:    s_lshr_b64 s[4:5], s[16:17], s4
-; CHECK-NEXT:    s_lshr_b64 s[6:7], s[18:19], s6
-; CHECK-NEXT:    s_lshr_b64 s[8:9], s[20:21], s8
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], v0, s[22:23]
+; CHECK-NEXT:    s_lshr_b32 s4, s17, s24
+; CHECK-NEXT:    s_lshr_b32 s5, s19, s26
+; CHECK-NEXT:    s_lshr_b32 s6, s21, s28
+; CHECK-NEXT:    v_lshrrev_b32_e64 v6, v0, s23
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s5
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
-; CHECK-NEXT:    v_mov_b32_e32 v4, s8
-; CHECK-NEXT:    v_mov_b32_e32 v5, s9
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s5
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v4, s6
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <4 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <4 x i64> %arg0, %or

From d4826cd324d9a10abdc67c973affa62d36dff4ee Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Fri, 13 Jun 2025 09:07:09 +0100
Subject: [PATCH 0245/1322] [AArch64] Observe Z-reg inline asm clobbers without
 SVE (#143742)

inline asm that clobbers any of the z-registers when not in streaming
mode, should still observe that the lower 128 bits of those registers
are clobbered.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 20 +++-
 llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll | 91 ++++++++++++++++++-
 2 files changed, 102 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5b9e699eaa40..781a1281db40 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12281,13 +12281,14 @@ enum class PredicateConstraint { Uph, Upl, Upa };
 // not what we want. The code here pre-empts this by matching the register
 // explicitly.
 static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
-parsePredicateRegAsConstraint(StringRef Constraint) {
+parseSVERegAsConstraint(StringRef Constraint) {
   if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
-      Constraint[1] != 'p')
+      (Constraint[1] != 'p' && Constraint[1] != 'z'))
     return std::nullopt;
 
+  bool IsPredicate = Constraint[1] == 'p';
   Constraint = Constraint.substr(2, Constraint.size() - 3);
-  bool IsPredicateAsCount = Constraint.starts_with("n");
+  bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
   if (IsPredicateAsCount)
     Constraint = Constraint.drop_front(1);
 
@@ -12297,8 +12298,9 @@ parsePredicateRegAsConstraint(StringRef Constraint) {
 
   if (IsPredicateAsCount)
     return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
-  else
+  if (IsPredicate)
     return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
+  return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
 }
 
 static std::optional<PredicateConstraint>
@@ -12548,8 +12550,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
       break;
     }
   } else {
-    if (const auto P = parsePredicateRegAsConstraint(Constraint))
+    if (const auto P = parseSVERegAsConstraint(Constraint)) {
+      // SME functions that are not in streaming mode, should
+      // still observe clobbers of Z-registers by clobbering
+      // the lower 128bits of those registers.
+      if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
+          !Subtarget->isSVEorStreamingSVEAvailable())
+        return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
+                              &AArch64::FPR128RegClass);
       return *P;
+    }
     if (const auto PC = parsePredicateConstraint(Constraint))
       if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
         return std::make_pair(0U, RegClass);
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll
index 63cc061cb618..b92a52403698 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sme2 -force-streaming -stop-after=finalize-isel | FileCheck %s
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -stop-after=finalize-isel | FileCheck %s
 
-define void @UphPNR(target("aarch64.svcount") %predcnt) {
+define void @UphPNR(target("aarch64.svcount") %predcnt) "target-features"="+sme2" "aarch64_pstate_sm_enabled" {
 entry:
 ; CHECK:  %0:ppr = COPY $p0
 ; CHECK:  STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store (<vscale x 1 x s16>) into %ir.predcnt.addr)
@@ -14,7 +14,7 @@ entry:
   ret void
 }
 
-define void @UpaPNR(target("aarch64.svcount") %predcnt) {
+define void @UpaPNR(target("aarch64.svcount") %predcnt) "target-features"="+sme2" "aarch64_pstate_sm_enabled" {
 entry:
 ; CHECK:  %0:ppr = COPY $p0
 ; CHECK:  STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store (<vscale x 1 x s16>) into %ir.predcnt.addr)
@@ -28,7 +28,7 @@ entry:
   ret void
 }
 
-define void @UplPNR(target("aarch64.svcount") %predcnt) {
+define void @UplPNR(target("aarch64.svcount") %predcnt) "target-features"="+sme2" "aarch64_pstate_sm_enabled" {
 entry:
 ; CHECK:  %0:ppr = COPY $p0
 ; CHECK:  STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store (<vscale x 1 x s16>) into %ir.predcnt.addr)
@@ -41,3 +41,86 @@ entry:
   call void asm sideeffect "fadd z0.h, $0/m, z0.h, #0.5", "@3Upl"(target("aarch64.svcount") %0)
   ret void
 }
+
+; Test that the z-register clobbers result in preserving %0 across the inline asm call.
+define <2 x float> @sme_nosve_nonstreaming(ptr %in) "target-features"="+sme,-sve" {
+entry:
+; CHECK-LABEL: name: sme_nosve_nonstreaming
+; CHECK:  INLINEASM &"smstart sm; smstop sm;"
+; CHECK-SAME: implicit-def early-clobber $q0
+; CHECK-SAME: implicit-def early-clobber $q1
+; CHECK-SAME: implicit-def early-clobber $q2
+; CHECK-SAME: implicit-def early-clobber $q3
+; CHECK-SAME: implicit-def early-clobber $q4
+; CHECK-SAME: implicit-def early-clobber $q5
+; CHECK-SAME: implicit-def early-clobber $q6
+; CHECK-SAME: implicit-def early-clobber $q7
+; CHECK-SAME: implicit-def early-clobber $q8
+; CHECK-SAME: implicit-def early-clobber $q9
+; CHECK-SAME: implicit-def early-clobber $q10
+; CHECK-SAME: implicit-def early-clobber $q11
+; CHECK-SAME: implicit-def early-clobber $q12
+; CHECK-SAME: implicit-def early-clobber $q13
+; CHECK-SAME: implicit-def early-clobber $q14
+; CHECK-SAME: implicit-def early-clobber $q15
+; CHECK-SAME: implicit-def early-clobber $q16
+; CHECK-SAME: implicit-def early-clobber $q17
+; CHECK-SAME: implicit-def early-clobber $q18
+; CHECK-SAME: implicit-def early-clobber $q19
+; CHECK-SAME: implicit-def early-clobber $q20
+; CHECK-SAME: implicit-def early-clobber $q21
+; CHECK-SAME: implicit-def early-clobber $q22
+; CHECK-SAME: implicit-def early-clobber $q23
+; CHECK-SAME: implicit-def early-clobber $q24
+; CHECK-SAME: implicit-def early-clobber $q25
+; CHECK-SAME: implicit-def early-clobber $q26
+; CHECK-SAME: implicit-def early-clobber $q27
+; CHECK-SAME: implicit-def early-clobber $q28
+; CHECK-SAME: implicit-def early-clobber $q29
+; CHECK-SAME: implicit-def early-clobber $q30
+; CHECK-SAME: implicit-def early-clobber $q31
+  %0 = load <2 x float>, ptr %in, align 8
+  call void asm sideeffect "smstart sm; smstop sm;", "~{z0},~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"()
+  ret <2 x float> %0
+}
+
+define <2 x float> @sme_nosve_streaming(ptr %in) "target-features"="+sme,-sve" "aarch64_pstate_sm_enabled" {
+entry:
+; CHECK-LABEL: name: sme_nosve_streaming
+; CHECK:  INLINEASM &"smstart sm; smstop sm;"
+; CHECK-SAME: implicit-def early-clobber $z0
+; CHECK-SAME: implicit-def early-clobber $z1
+; CHECK-SAME: implicit-def early-clobber $z2
+; CHECK-SAME: implicit-def early-clobber $z3
+; CHECK-SAME: implicit-def early-clobber $z4
+; CHECK-SAME: implicit-def early-clobber $z5
+; CHECK-SAME: implicit-def early-clobber $z6
+; CHECK-SAME: implicit-def early-clobber $z7
+; CHECK-SAME: implicit-def early-clobber $z8
+; CHECK-SAME: implicit-def early-clobber $z9
+; CHECK-SAME: implicit-def early-clobber $z10
+; CHECK-SAME: implicit-def early-clobber $z11
+; CHECK-SAME: implicit-def early-clobber $z12
+; CHECK-SAME: implicit-def early-clobber $z13
+; CHECK-SAME: implicit-def early-clobber $z14
+; CHECK-SAME: implicit-def early-clobber $z15
+; CHECK-SAME: implicit-def early-clobber $z16
+; CHECK-SAME: implicit-def early-clobber $z17
+; CHECK-SAME: implicit-def early-clobber $z18
+; CHECK-SAME: implicit-def early-clobber $z19
+; CHECK-SAME: implicit-def early-clobber $z20
+; CHECK-SAME: implicit-def early-clobber $z21
+; CHECK-SAME: implicit-def early-clobber $z22
+; CHECK-SAME: implicit-def early-clobber $z23
+; CHECK-SAME: implicit-def early-clobber $z24
+; CHECK-SAME: implicit-def early-clobber $z25
+; CHECK-SAME: implicit-def early-clobber $z26
+; CHECK-SAME: implicit-def early-clobber $z27
+; CHECK-SAME: implicit-def early-clobber $z28
+; CHECK-SAME: implicit-def early-clobber $z29
+; CHECK-SAME: implicit-def early-clobber $z30
+; CHECK-SAME: implicit-def early-clobber $z31
+  %0 = load <2 x float>, ptr %in, align 8
+  call void asm sideeffect "smstart sm; smstop sm;", "~{z0},~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"()
+  ret <2 x float> %0
+}

From 0cf333878d310bf9bbc8156cb7d8a0e271fb2c6f Mon Sep 17 00:00:00 2001
From: Orlando Cazalet-Hyams <orlando.hyams@sony.com>
Date: Fri, 13 Jun 2025 09:26:08 +0100
Subject: [PATCH 0246/1322] [NFC] Pack MDNodeKeyImpl<DILocation> from 40 to 32
 bytes (#143891)

---
 llvm/lib/IR/LLVMContextImpl.h | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 87cd52e357be..ef279721b964 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -310,36 +310,33 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey {
 
 /// DenseMapInfo for DILocation.
 template <> struct MDNodeKeyImpl<DILocation> {
-  unsigned Line;
-  uint16_t Column;
   Metadata *Scope;
   Metadata *InlinedAt;
-  bool ImplicitCode;
 #ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
   uint64_t AtomGroup : 61;
   uint64_t AtomRank : 3;
 #endif
+  unsigned Line;
+  uint16_t Column;
+  bool ImplicitCode;
 
   MDNodeKeyImpl(unsigned Line, uint16_t Column, Metadata *Scope,
                 Metadata *InlinedAt, bool ImplicitCode, uint64_t AtomGroup,
                 uint8_t AtomRank)
-      : Line(Line), Column(Column), Scope(Scope), InlinedAt(InlinedAt),
-        ImplicitCode(ImplicitCode)
+      : Scope(Scope), InlinedAt(InlinedAt),
 #ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
-        ,
-        AtomGroup(AtomGroup), AtomRank(AtomRank)
+        AtomGroup(AtomGroup), AtomRank(AtomRank),
 #endif
-  {
+        Line(Line), Column(Column), ImplicitCode(ImplicitCode) {
   }
 
   MDNodeKeyImpl(const DILocation *L)
-      : Line(L->getLine()), Column(L->getColumn()), Scope(L->getRawScope()),
-        InlinedAt(L->getRawInlinedAt()), ImplicitCode(L->isImplicitCode())
+      : Scope(L->getRawScope()), InlinedAt(L->getRawInlinedAt()),
 #ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
-        ,
-        AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank())
+        AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank()),
 #endif
-  {
+        Line(L->getLine()), Column(L->getColumn()),
+        ImplicitCode(L->isImplicitCode()) {
   }
 
   bool isKeyOf(const DILocation *RHS) const {

From addd98f7a5b964a5a5860d65f327f3fc3b7e0a42 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Fri, 13 Jun 2025 09:31:57 +0100
Subject: [PATCH 0247/1322] [lldb][test] Don't call SBDebugger::Terminate if
 TestMultipleDebuggers times out (#143732)

Fixes #101162

This test did this:
* SBDebugger::Initialize
* Spawn a bunch of threads that do:
  * SBDebugger::Create
  * some work
  * SBDebugger::Destroy
* Wait on those threads to finish then call SBDebugger::Terminate and
exit, or -
* Reach a time limit before all the threads finish, call
SBDebugger::Terminate and exit.

The problem was that in the timeout case, calling SBDebugger::Terminate
destroys data being used by threads that are still running. I expect
this test was expecting said threads to be so broken they were probably
stuck, but when the machine is just heavily loaded, one of them might
read that data before the whole program exits.

This means what should have been a timeout becomes a crash. Sometimes.
Which explains why we saw both timeouts and various signals on the
AArch64 Linux bot. It depends on the timings.

So I'm changing it not to call SBDebugger::Terminate in the timeout
case. We will have to tweak the timeout value based on what happens on
the buildbot, but we will know it's machine load not an lldb bug.

Also use _exit instead of exit, to skip more cleanup that might cause a
crash.
---
 .../API/api/multiple-debuggers/TestMultipleDebuggers.py    | 2 --
 .../API/api/multiple-debuggers/multi-process-driver.cpp    | 7 +++++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
index 1fd4806cd74f..f0a3893f53aa 100644
--- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
+++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
@@ -12,8 +12,6 @@ from lldbsuite.test import lldbutil
 class TestMultipleSimultaneousDebuggers(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
-    # Sometimes times out on Linux, see https://github.com/llvm/llvm-project/issues/101162.
-    @skipIfLinux
     @skipIfNoSBHeaders
     @skipIfWindows
     @skipIfHostIncompatibleWithTarget
diff --git a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
index 64728fb7c29a..5ad75e3c1e47 100644
--- a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
+++ b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
@@ -296,6 +296,9 @@ int main (int argc, char **argv)
                  NUMBER_OF_SIMULTANEOUS_DEBUG_SESSIONS);
     }
 
-    SBDebugger::Terminate();
-    exit (1);
+    // We do not call SBDebugger::Terminate() here because it will destroy
+    // data that might be being used by threads that are still running. Which
+    // would change the timeout into an unrelated crash.
+    // _exit instead of exit, to skip more things that could cause a crash.
+    _exit(1);
 }

From 8ba62fdb3d2da2f5f199ee7a07222620a451293f Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern@gmail.com>
Date: Fri, 13 Jun 2025 16:47:56 +0800
Subject: [PATCH 0248/1322] [CIR] Function calls with aggregate arguments and
 return values (#143377)

This patch updates cir.call operation and allows function calls with
aggregate arguments and return values.

It seems that C++ class support is still at a minimum now. I tried to
make a call to a C++ function with an argument of aggregate type but it
failed because the initialization of C++ class / struct is NYI. I also
tried to inline this part of support into this patch, but the mixed
patch quickly blows in size and becomes unsuitable for review. Thus,
tests for calling functions with aggregate arguments are added only for
C for now.
---
 clang/include/clang/CIR/MissingFeatures.h     |   6 +
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |  12 ++
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          | 103 ++++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenCall.h            |  22 +++-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          |  12 +-
 clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp |  82 +++++++++++++
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp      |  11 ++
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |  20 +++-
 clang/lib/CIR/CodeGen/CIRGenValue.h           |  16 +++
 clang/test/CIR/CodeGen/call.c                 | 111 ++++++++++++++++++
 clang/test/CIR/CodeGen/call.cpp               |  32 +++++
 11 files changed, 411 insertions(+), 16 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/call.c

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 97b933657d74..225e9ec89a82 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -173,6 +173,10 @@ struct MissingFeatures {
   static bool stackSaveOp() { return false; }
   static bool aggValueSlot() { return false; }
   static bool aggValueSlotMayOverlap() { return false; }
+  static bool aggValueSlotVolatile() { return false; }
+  static bool aggValueSlotDestructedFlag() { return false; }
+  static bool aggValueSlotAlias() { return false; }
+  static bool aggValueSlotGC() { return false; }
   static bool generateDebugInfo() { return false; }
   static bool pointerOverflowSanitizer() { return false; }
   static bool fpConstraints() { return false; }
@@ -230,6 +234,8 @@ struct MissingFeatures {
   static bool attributeNoBuiltin() { return false; }
   static bool thunks() { return false; }
   static bool runCleanupsScope() { return false; }
+  static bool lowerAggregateLoadStore() { return false; }
+  static bool dataLayoutTypeAllocSize() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index fb1a290c18fa..36c89809b4d9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -332,6 +332,18 @@ public:
     return Address(baseAddr, destType, addr.getAlignment());
   }
 
+  /// Cast the element type of the given address to a different type,
+  /// preserving information like the alignment.
+  Address createElementBitCast(mlir::Location loc, Address addr,
+                               mlir::Type destType) {
+    if (destType == addr.getElementType())
+      return addr;
+
+    auto ptrTy = getPointerTo(destType);
+    return Address(createBitcast(loc, addr.getPointer(), ptrTy), destType,
+                   addr.getAlignment());
+  }
+
   cir::LoadOp createLoad(mlir::Location loc, Address addr,
                          bool isVolatile = false) {
     mlir::IntegerAttr align = getAlignmentAttr(addr.getAlignment());
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 5ec720ffd54f..0d9064425fa9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -60,6 +60,23 @@ CIRGenCallee CIRGenCallee::prepareConcreteCallee(CIRGenFunction &cgf) const {
   return *this;
 }
 
+void CIRGenFunction::emitAggregateStore(mlir::Value value, Address dest) {
+  // In classic codegen:
+  // Function to store a first-class aggregate into memory. We prefer to
+  // store the elements rather than the aggregate to be more friendly to
+  // fast-isel.
+  // In CIR codegen:
+  // Emit the most simple cir.store possible (e.g. a store for a whole
+  // record), which can later be broken down in other CIR levels (or prior
+  // to dialect codegen).
+
+  // Stored result for the callers of this function expected to be in the same
+  // scope as the value, don't make assumptions about current insertion point.
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointAfter(value.getDefiningOp());
+  builder.createStore(*currSrcLoc, value, dest);
+}
+
 /// Returns the canonical formal type of the given C++ method.
 static CanQual<FunctionProtoType> getFormalType(const CXXMethodDecl *md) {
   return md->getType()
@@ -439,8 +456,49 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
       assert(!cir::MissingFeatures::opCallBitcastArg());
       cirCallArgs[argNo] = v;
     } else {
-      assert(!cir::MissingFeatures::opCallAggregateArgs());
-      cgm.errorNYI("emitCall: aggregate function call argument");
+      Address src = Address::invalid();
+      if (!arg.isAggregate())
+        cgm.errorNYI(loc, "emitCall: non-aggregate call argument");
+      else
+        src = arg.hasLValue() ? arg.getKnownLValue().getAddress()
+                              : arg.getKnownRValue().getAggregateAddress();
+
+      // Fast-isel and the optimizer generally like scalar values better than
+      // FCAs, so we flatten them if this is safe to do for this argument.
+      auto argRecordTy = cast<cir::RecordType>(argType);
+      mlir::Type srcTy = src.getElementType();
+      // FIXME(cir): get proper location for each argument.
+      mlir::Location argLoc = loc;
+
+      // If the source type is smaller than the destination type of the
+      // coerce-to logic, copy the source value into a temp alloca the size
+      // of the destination type to allow loading all of it. The bits past
+      // the source value are left undef.
+      // FIXME(cir): add data layout info and compare sizes instead of
+      // matching the types.
+      //
+      // uint64_t SrcSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
+      // uint64_t DstSize = CGM.getDataLayout().getTypeAllocSize(STy);
+      // if (SrcSize < DstSize) {
+      assert(!cir::MissingFeatures::dataLayoutTypeAllocSize());
+      if (srcTy != argRecordTy) {
+        cgm.errorNYI(loc, "emitCall: source type does not match argument type");
+      } else {
+        // FIXME(cir): this currently only runs when the types are exactly the
+        // same, but should be when alloc sizes are the same, fix this as soon
+        // as datalayout gets introduced.
+        assert(!cir::MissingFeatures::dataLayoutTypeAllocSize());
+      }
+
+      // assert(NumCIRArgs == STy.getMembers().size());
+      // In LLVMGen: Still only pass the struct without any gaps but mark it
+      // as such somehow.
+      //
+      // In CIRGen: Emit a load from the "whole" struct,
+      // which shall be broken later by some lowering step into multiple
+      // loads.
+      assert(!cir::MissingFeatures::lowerAggregateLoadStore());
+      cirCallArgs[argNo] = builder.createLoad(argLoc, src);
     }
   }
 
@@ -479,6 +537,7 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
 
   assert(!cir::MissingFeatures::opCallAttrs());
 
+  mlir::Location callLoc = loc;
   cir::CIRCallOpInterface theCall = emitCallLikeOp(
       *this, loc, indirectFuncTy, indirectFuncVal, directFuncOp, cirCallArgs);
 
@@ -492,6 +551,19 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
   if (isa<cir::VoidType>(retCIRTy))
     return getUndefRValue(retTy);
   switch (getEvaluationKind(retTy)) {
+  case cir::TEK_Aggregate: {
+    Address destPtr = returnValue.getValue();
+
+    if (!destPtr.isValid())
+      destPtr = createMemTemp(retTy, callLoc, getCounterAggTmpAsString());
+
+    mlir::ResultRange results = theCall->getOpResults();
+    assert(results.size() <= 1 && "multiple returns from a call");
+
+    SourceLocRAIIObject loc{*this, callLoc};
+    emitAggregateStore(results[0], destPtr);
+    return RValue::getAggregate(destPtr);
+  }
   case cir::TEK_Scalar: {
     mlir::ResultRange results = theCall->getOpResults();
     assert(results.size() == 1 && "unexpected number of returns");
@@ -508,7 +580,6 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
     return RValue::get(results[0]);
   }
   case cir::TEK_Complex:
-  case cir::TEK_Aggregate:
     cgm.errorNYI(loc, "unsupported evaluation kind of function call result");
     return getUndefRValue(retTy);
   }
@@ -527,10 +598,21 @@ void CIRGenFunction::emitCallArg(CallArgList &args, const clang::Expr *e,
 
   bool hasAggregateEvalKind = hasAggregateEvaluationKind(argType);
 
-  if (hasAggregateEvalKind) {
-    assert(!cir::MissingFeatures::opCallAggregateArgs());
-    cgm.errorNYI(e->getSourceRange(),
-                 "emitCallArg: aggregate function call argument");
+  // In the Microsoft C++ ABI, aggregate arguments are destructed by the callee.
+  // However, we still have to push an EH-only cleanup in case we unwind before
+  // we make it to the call.
+  if (argType->isRecordType() &&
+      argType->castAs<RecordType>()->getDecl()->isParamDestroyedInCallee()) {
+    assert(!cir::MissingFeatures::msabi());
+    cgm.errorNYI(e->getSourceRange(), "emitCallArg: msabi is NYI");
+  }
+
+  if (hasAggregateEvalKind && isa<ImplicitCastExpr>(e) &&
+      cast<CastExpr>(e)->getCastKind() == CK_LValueToRValue) {
+    LValue lv = emitLValue(cast<CastExpr>(e)->getSubExpr());
+    assert(lv.isSimple());
+    args.addUncopiedAggregate(lv, argType);
+    return;
   }
 
   args.add(emitAnyExprToTemp(e), argType);
@@ -551,12 +633,13 @@ QualType CIRGenFunction::getVarArgType(const Expr *arg) {
 /// Similar to emitAnyExpr(), however, the result will always be accessible
 /// even if no aggregate location is provided.
 RValue CIRGenFunction::emitAnyExprToTemp(const Expr *e) {
-  assert(!cir::MissingFeatures::opCallAggregateArgs());
+  AggValueSlot aggSlot = AggValueSlot::ignored();
 
   if (hasAggregateEvaluationKind(e->getType()))
-    cgm.errorNYI(e->getSourceRange(), "emit aggregate value to temp");
+    aggSlot = createAggTemp(e->getType(), getLoc(e->getSourceRange()),
+                            getCounterAggTmpAsString());
 
-  return emitAnyExpr(e);
+  return emitAnyExpr(e, aggSlot);
 }
 
 void CIRGenFunction::emitCallArgs(
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.h b/clang/lib/CIR/CodeGen/CIRGenCall.h
index 15c9080448c8..0353848f3ec0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.h
@@ -133,8 +133,16 @@ public:
   CallArg(RValue rv, clang::QualType ty)
       : rv(rv), hasLV(false), isUsed(false), ty(ty) {}
 
+  CallArg(LValue lv, clang::QualType ty)
+      : lv(lv), hasLV(true), isUsed(false), ty(ty) {}
+
   bool hasLValue() const { return hasLV; }
 
+  LValue getKnownLValue() const {
+    assert(hasLV && !isUsed);
+    return lv;
+  }
+
   RValue getKnownRValue() const {
     assert(!hasLV && !isUsed);
     return rv;
@@ -147,6 +155,10 @@ class CallArgList : public llvm::SmallVector<CallArg, 8> {
 public:
   void add(RValue rvalue, clang::QualType type) { emplace_back(rvalue, type); }
 
+  void addUncopiedAggregate(LValue lvalue, clang::QualType type) {
+    emplace_back(lvalue, type);
+  }
+
   /// Add all the arguments from another CallArgList to this one. After doing
   /// this, the old CallArgList retains its list of arguments, but must not
   /// be used to emit a call.
@@ -162,7 +174,15 @@ public:
 
 /// Contains the address where the return value of a function can be stored, and
 /// whether the address is volatile or not.
-class ReturnValueSlot {};
+class ReturnValueSlot {
+  Address addr = Address::invalid();
+
+public:
+  ReturnValueSlot() = default;
+  ReturnValueSlot(Address addr) : addr(addr) {}
+
+  Address getValue() const { return addr; }
+};
 
 } // namespace clang::CIRGen
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 5d04faf443b8..99f942fcf2cd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1010,16 +1010,20 @@ LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) {
 
 /// Emit code to compute the specified expression which
 /// can have any type.  The result is returned as an RValue struct.
-RValue CIRGenFunction::emitAnyExpr(const Expr *e) {
+RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot) {
   switch (CIRGenFunction::getEvaluationKind(e->getType())) {
   case cir::TEK_Scalar:
     return RValue::get(emitScalarExpr(e));
   case cir::TEK_Complex:
     cgm.errorNYI(e->getSourceRange(), "emitAnyExpr: complex type");
     return RValue::get(nullptr);
-  case cir::TEK_Aggregate:
-    cgm.errorNYI(e->getSourceRange(), "emitAnyExpr: aggregate type");
-    return RValue::get(nullptr);
+  case cir::TEK_Aggregate: {
+    if (aggSlot.isIgnored())
+      aggSlot = createAggTemp(e->getType(), getLoc(e->getSourceRange()),
+                              getCounterAggTmpAsString());
+    emitAggExpr(e, aggSlot);
+    return aggSlot.asRValue();
+  }
   }
   llvm_unreachable("bad evaluation kind");
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index 061123d55b88..ffe1b701b244 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -28,6 +28,15 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
   CIRGenFunction &cgf;
   AggValueSlot dest;
 
+  // Calls `fn` with a valid return value slot, potentially creating a temporary
+  // to do so. If a temporary is created, an appropriate copy into `Dest` will
+  // be emitted, as will lifetime markers.
+  //
+  // The given function should take a ReturnValueSlot, and return an RValue that
+  // points to said slot.
+  void withReturnValueSlot(const Expr *e,
+                           llvm::function_ref<RValue(ReturnValueSlot)> fn);
+
   AggValueSlot ensureSlot(mlir::Location loc, QualType t) {
     if (!dest.isIgnored())
       return dest;
@@ -40,16 +49,28 @@ public:
   AggExprEmitter(CIRGenFunction &cgf, AggValueSlot dest)
       : cgf(cgf), dest(dest) {}
 
+  /// Given an expression with aggregate type that represents a value lvalue,
+  /// this method emits the address of the lvalue, then loads the result into
+  /// DestPtr.
+  void emitAggLoadOfLValue(const Expr *e);
+
   void emitArrayInit(Address destPtr, cir::ArrayType arrayTy, QualType arrayQTy,
                      Expr *exprToVisit, ArrayRef<Expr *> args,
                      Expr *arrayFiller);
 
+  /// Perform the final copy to DestPtr, if desired.
+  void emitFinalDestCopy(QualType type, const LValue &src);
+
   void emitInitializationToLValue(Expr *e, LValue lv);
 
   void emitNullInitializationToLValue(mlir::Location loc, LValue lv);
 
   void Visit(Expr *e) { StmtVisitor<AggExprEmitter>::Visit(e); }
 
+  void VisitCallExpr(const CallExpr *e);
+
+  void VisitDeclRefExpr(DeclRefExpr *e) { emitAggLoadOfLValue(e); }
+
   void VisitInitListExpr(InitListExpr *e);
   void VisitCXXConstructExpr(const CXXConstructExpr *e);
 
@@ -80,6 +101,17 @@ static bool isTrivialFiller(Expr *e) {
   return false;
 }
 
+/// Given an expression with aggregate type that represents a value lvalue, this
+/// method emits the address of the lvalue, then loads the result into DestPtr.
+void AggExprEmitter::emitAggLoadOfLValue(const Expr *e) {
+  LValue lv = cgf.emitLValue(e);
+
+  // If the type of the l-value is atomic, then do an atomic load.
+  assert(!cir::MissingFeatures::opLoadStoreAtomic());
+
+  emitFinalDestCopy(e->getType(), lv);
+}
+
 void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
                                    QualType arrayQTy, Expr *e,
                                    ArrayRef<Expr *> args, Expr *arrayFiller) {
@@ -182,6 +214,18 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
   }
 }
 
+/// Perform the final copy to destPtr, if desired.
+void AggExprEmitter::emitFinalDestCopy(QualType type, const LValue &src) {
+  // If dest is ignored, then we're evaluating an aggregate expression
+  // in a context that doesn't care about the result.  Note that loads
+  // from volatile l-values force the existence of a non-ignored
+  // destination.
+  if (dest.isIgnored())
+    return;
+
+  cgf.cgm.errorNYI("emitFinalDestCopy: non-ignored dest is NYI");
+}
+
 void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) {
   const QualType type = lv.getType();
 
@@ -250,6 +294,44 @@ void AggExprEmitter::emitNullInitializationToLValue(mlir::Location loc,
   cgf.emitNullInitialization(loc, lv.getAddress(), lv.getType());
 }
 
+void AggExprEmitter::VisitCallExpr(const CallExpr *e) {
+  if (e->getCallReturnType(cgf.getContext())->isReferenceType()) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "reference return type");
+    return;
+  }
+
+  withReturnValueSlot(
+      e, [&](ReturnValueSlot slot) { return cgf.emitCallExpr(e, slot); });
+}
+
+void AggExprEmitter::withReturnValueSlot(
+    const Expr *e, llvm::function_ref<RValue(ReturnValueSlot)> fn) {
+  QualType retTy = e->getType();
+
+  assert(!cir::MissingFeatures::aggValueSlotDestructedFlag());
+  bool requiresDestruction =
+      retTy.isDestructedType() == QualType::DK_nontrivial_c_struct;
+  if (requiresDestruction)
+    cgf.cgm.errorNYI(
+        e->getSourceRange(),
+        "withReturnValueSlot: return value requiring destruction is NYI");
+
+  // If it makes no observable difference, save a memcpy + temporary.
+  //
+  // We need to always provide our own temporary if destruction is required.
+  // Otherwise, fn will emit its own, notice that it's "unused", and end its
+  // lifetime before we have the chance to emit a proper destructor call.
+  assert(!cir::MissingFeatures::aggValueSlotAlias());
+  assert(!cir::MissingFeatures::aggValueSlotGC());
+
+  Address retAddr = dest.getAddress();
+  assert(!cir::MissingFeatures::emitLifetimeMarkers());
+
+  assert(!cir::MissingFeatures::aggValueSlotVolatile());
+  assert(!cir::MissingFeatures::aggValueSlotDestructedFlag());
+  fn(ReturnValueSlot(retAddr));
+}
+
 void AggExprEmitter::VisitInitListExpr(InitListExpr *e) {
   if (e->hadArrayRangeDesignator())
     llvm_unreachable("GNU array range designator extension");
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index c5bd5109343d..fd413fe86383 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -629,6 +629,17 @@ LValue CIRGenFunction::emitLValue(const Expr *e) {
   }
 }
 
+static std::string getVersionedTmpName(llvm::StringRef name, unsigned cnt) {
+  SmallString<256> buffer;
+  llvm::raw_svector_ostream out(buffer);
+  out << name << cnt;
+  return std::string(out.str());
+}
+
+std::string CIRGenFunction::getCounterAggTmpAsString() {
+  return getVersionedTmpName("agg.tmp", counterAggTmp++);
+}
+
 void CIRGenFunction::emitNullInitialization(mlir::Location loc, Address destPtr,
                                             QualType ty) {
   // Ignore empty classes in C++.
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index cf672b0c90e6..9421ea26a429 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -316,6 +316,10 @@ public:
     ~SourceLocRAIIObject() { restore(); }
   };
 
+  /// Hold counters for incrementally naming temporaries
+  unsigned counterAggTmp = 0;
+  std::string getCounterAggTmpAsString();
+
   /// Helpers to convert Clang's SourceLocation to a MLIR Location.
   mlir::Location getLoc(clang::SourceLocation srcLoc);
   mlir::Location getLoc(clang::SourceRange srcLoc);
@@ -695,6 +699,8 @@ public:
                          mlir::OpBuilder::InsertPoint ip,
                          mlir::Value arraySize = nullptr);
 
+  void emitAggregateStore(mlir::Value value, Address dest);
+
   void emitAggExpr(const clang::Expr *e, AggValueSlot slot);
 
   LValue emitAggExprToLValue(const Expr *e);
@@ -703,7 +709,8 @@ public:
   /// result is returned as an RValue struct. If this is an aggregate
   /// expression, the aggloc/agglocvolatile arguments indicate where the result
   /// should be returned.
-  RValue emitAnyExpr(const clang::Expr *e);
+  RValue emitAnyExpr(const clang::Expr *e,
+                     AggValueSlot aggSlot = AggValueSlot::ignored());
 
   /// Similarly to emitAnyExpr(), however, the result will always be accessible
   /// even if no aggregate location is provided.
@@ -1152,6 +1159,17 @@ public:
   void emitOpenACCDeclare(const OpenACCDeclareDecl &d);
   void emitOpenACCRoutine(const OpenACCRoutineDecl &d);
 
+  /// Create a temporary memory object for the given aggregate type.
+  AggValueSlot createAggTemp(QualType ty, mlir::Location loc,
+                             const Twine &name = "tmp",
+                             Address *alloca = nullptr) {
+    assert(!cir::MissingFeatures::aggValueSlot());
+    return AggValueSlot::forAddr(
+        createMemTemp(ty, loc, name, alloca), ty.getQualifiers(),
+        AggValueSlot::IsNotDestructed, AggValueSlot::IsNotAliased,
+        AggValueSlot::DoesNotOverlap);
+  }
+
 private:
   QualType getVarArgType(const Expr *arg);
 };
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 258ae306f693..c1e08ba1e9b6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -306,6 +306,13 @@ public:
   enum IsAliased_t { IsNotAliased, IsAliased };
   enum Overlap_t { MayOverlap, DoesNotOverlap };
 
+  /// Returns an aggregate value slot indicating that the aggregate
+  /// value is being ignored.
+  static AggValueSlot ignored() {
+    return forAddr(Address::invalid(), clang::Qualifiers(), IsNotDestructed,
+                   IsNotAliased, DoesNotOverlap);
+  }
+
   AggValueSlot(Address addr, clang::Qualifiers quals, bool destructedFlag,
                bool zeroedFlag, bool aliasedFlag, bool overlapFlag)
       : addr(addr), quals(quals), destructedFlag(destructedFlag),
@@ -333,7 +340,16 @@ public:
 
   bool isIgnored() const { return !addr.isValid(); }
 
+  mlir::Value getPointer() const { return addr.getPointer(); }
+
   IsZeroed_t isZeroed() const { return IsZeroed_t(zeroedFlag); }
+
+  RValue asRValue() const {
+    if (isIgnored())
+      return RValue::getIgnored();
+    assert(!cir::MissingFeatures::aggValueSlot());
+    return RValue::getAggregate(getAddress());
+  }
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/test/CIR/CodeGen/call.c b/clang/test/CIR/CodeGen/call.c
new file mode 100644
index 000000000000..13f3c5a21ceb
--- /dev/null
+++ b/clang/test/CIR/CodeGen/call.c
@@ -0,0 +1,111 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+struct S {
+  int x;
+  int y;
+};
+
+void f1(struct S);
+void f2() {
+  struct S s;
+  f1(s);
+}
+
+// CIR-LABEL: cir.func @f2()
+// CIR:         %[[S:.+]] = cir.load align(4) %{{.+}} : !cir.ptr<!rec_S>, !rec_S
+// CIR-NEXT:    cir.call @f1(%[[S]]) : (!rec_S) -> ()
+
+// LLVM-LABEL: define void @f2()
+// LLVM:         %[[S:.+]] = load %struct.S, ptr %{{.+}}, align 4
+// LLVM-NEXT:    call void @f1(%struct.S %[[S]])
+
+// OGCG-LABEL: define dso_local void @f2()
+// OGCG:         %[[S:.+]] = load i64, ptr %{{.+}}, align 4
+// OGCG-NEXT:    call void @f1(i64 %[[S]])
+
+struct S f3();
+void f4() {
+  struct S s = f3();
+}
+
+// CIR-LABEL: cir.func @f4() {
+// CIR:         %[[S:.+]] = cir.call @f3() : () -> !rec_S
+// CIR-NEXT:    cir.store align(4) %[[S]], %{{.+}} : !rec_S, !cir.ptr<!rec_S>
+
+// LLVM-LABEL: define void @f4() {
+// LLVM:         %[[S:.+]] = call %struct.S (...) @f3()
+// LLVM-NEXT:    store %struct.S %[[S]], ptr %{{.+}}, align 4
+
+// OGCG-LABEL: define dso_local void @f4() #0 {
+// OGCG:         %[[S:.+]] = call i64 (...) @f3()
+// OGCG-NEXT:    store i64 %[[S]], ptr %{{.+}}, align 4
+
+struct Big {
+  int data[10];
+};
+
+void f5(struct Big);
+struct Big f6();
+
+void f7() {
+  struct Big b;
+  f5(b);
+}
+
+// CIR-LABEL: cir.func @f7()
+// CIR:         %[[B:.+]] = cir.load align(4) %{{.+}} : !cir.ptr<!rec_Big>, !rec_Big
+// CIR-NEXT:    cir.call @f5(%[[B]]) : (!rec_Big) -> ()
+
+// LLVM-LABEL: define void @f7() {
+// LLVM:         %[[B:.+]] = load %struct.Big, ptr %{{.+}}, align 4
+// LLVM-NEXT:    call void @f5(%struct.Big %[[B]])
+
+// OGCG-LABEL: define dso_local void @f7() #0 {
+// OGCG:         %[[B:.+]] = alloca %struct.Big, align 8
+// OGCG-NEXT:    call void @f5(ptr noundef byval(%struct.Big) align 8 %[[B]])
+
+void f8() {
+  struct Big b = f6();
+}
+
+// CIR-LABEL: cir.func @f8()
+// CIR:         %[[B:.+]] = cir.call @f6() : () -> !rec_Big
+// CIR:         cir.store align(4) %[[B]], %{{.+}} : !rec_Big, !cir.ptr<!rec_Big>
+
+// LLVM-LABEL: define void @f8() {
+// LLVM:        %[[B:.+]] = call %struct.Big (...) @f6()
+// LLVM-NEXT:   store %struct.Big %[[B]], ptr %{{.+}}, align 4
+
+// OGCG-LABEL: define dso_local void @f8() #0 {
+// OGCG:         %[[B:.+]] = alloca %struct.Big, align 4
+// OGCG-NEXT:    call void (ptr, ...) @f6(ptr dead_on_unwind writable sret(%struct.Big) align 4 %[[B]])
+
+void f9() {
+  f1(f3());
+}
+
+// CIR-LABEL: cir.func @f9()
+// CIR:         %[[SLOT:.+]] = cir.alloca !rec_S, !cir.ptr<!rec_S>, ["agg.tmp0"] {alignment = 4 : i64}
+// CIR-NEXT:    %[[RET:.+]] = cir.call @f3() : () -> !rec_S
+// CIR-NEXT:    cir.store align(4) %[[RET]], %[[SLOT]] : !rec_S, !cir.ptr<!rec_S>
+// CIR-NEXT:    %[[ARG:.+]] = cir.load align(4) %[[SLOT]] : !cir.ptr<!rec_S>, !rec_S
+// CIR-NEXT:    cir.call @f1(%[[ARG]]) : (!rec_S) -> ()
+
+// LLVM-LABEL: define void @f9() {
+// LLVM:         %[[SLOT:.+]] = alloca %struct.S, i64 1, align 4
+// LLVM-NEXT:    %[[RET:.+]] = call %struct.S (...) @f3()
+// LLVM-NEXT:    store %struct.S %[[RET]], ptr %[[SLOT]], align 4
+// LLVM-NEXT:    %[[ARG:.+]] = load %struct.S, ptr %[[SLOT]], align 4
+// LLVM-NEXT:    call void @f1(%struct.S %[[ARG]])
+
+// OGCG-LABEL: define dso_local void @f9() #0 {
+// OGCG:         %[[SLOT:.+]] = alloca %struct.S, align 4
+// OGCG-NEXT:    %[[RET:.+]] = call i64 (...) @f3()
+// OGCG-NEXT:    store i64 %[[RET]], ptr %[[SLOT]], align 4
+// OGCG-NEXT:    %[[ARG:.+]] = load i64, ptr %[[SLOT]], align 4
+// OGCG-NEXT:    call void @f1(i64 %[[ARG]])
diff --git a/clang/test/CIR/CodeGen/call.cpp b/clang/test/CIR/CodeGen/call.cpp
index 741cadeb5c76..cc25afce1e5a 100644
--- a/clang/test/CIR/CodeGen/call.cpp
+++ b/clang/test/CIR/CodeGen/call.cpp
@@ -70,3 +70,35 @@ void f9() {
 // LLVM-LABEL: define void @_Z2f9v()
 // LLVM:         call void (i32, ...) @_Z2f8iz(i32 1)
 // LLVM:         call void (i32, ...) @_Z2f8iz(i32 1, i32 2, i32 3, i32 4)
+
+struct S {
+  int x;
+  int y;
+};
+
+S f10();
+void f11() {
+  S s = f10();
+}
+
+// CIR-LABEL: cir.func @_Z3f11v()
+// CIR:         %[[#s:]] = cir.call @_Z3f10v() : () -> !rec_S
+// CIR-NEXT:    cir.store align(4) %[[#s]], %{{.+}} : !rec_S, !cir.ptr<!rec_S>
+
+// LLVM-LABEL: define void @_Z3f11v()
+// LLVM:         %[[#s:]] = call %struct.S @_Z3f10v()
+// LLVM-NEXT:    store %struct.S %[[#s]], ptr %{{.+}}, align 4
+
+void f12() {
+  f10();
+}
+
+// CIR-LABEL: cir.func @_Z3f12v()
+// CIR:         %[[#slot:]] = cir.alloca !rec_S, !cir.ptr<!rec_S>, ["agg.tmp0"]
+// CIR-NEXT:    %[[#ret:]] = cir.call @_Z3f10v() : () -> !rec_S
+// CIR-NEXT:    cir.store align(4) %[[#ret]], %[[#slot]] : !rec_S, !cir.ptr<!rec_S>
+
+// LLVM-LABEL: define void @_Z3f12v() {
+// LLVM:         %[[#slot:]] = alloca %struct.S, i64 1, align 4
+// LLVM-NEXT:    %[[#ret:]] = call %struct.S @_Z3f10v()
+// LLVM-NEXT:    store %struct.S %[[#ret]], ptr %[[#slot]], align 4

From 2d49bc01cf07434138ea01ef7b9ba4b646b54183 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Fri, 13 Jun 2025 10:02:27 +0100
Subject: [PATCH 0249/1322] [LV][NFC] Tidy up check-prof-info.ll test (#143884)

---
 .../LoopVectorize/check-prof-info.ll          | 144 ++++++++++--------
 1 file changed, 83 insertions(+), 61 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
index 17013c590806..87c1ccb70227 100644
--- a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -1,24 +1,43 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:"
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s |  FileCheck %s
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-MASKED
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-@a = dso_local global [1024 x i32] zeroinitializer, align 16
-@b = dso_local global [1024 x i32] zeroinitializer, align 16
+@a = global [1024 x i32] zeroinitializer, align 16
+@b = global [1024 x i32] zeroinitializer, align 16
 
 ; Check correctness of profile info for vectorization without epilog.
-; Function Attrs: nofree norecurse nounwind uwtable
-define dso_local void @_Z3foov() local_unnamed_addr #0 {
+define void @_Z3foov() {
 ; CHECK-LABEL: @_Z3foov(
-; CHECK:  [[VECTOR_BODY:vector\.body]]:
-; CHECK:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
-; CHECK:  [[FOR_BODY:for\.body]]:
-; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
-; CHECK-MASKED:  [[VECTOR_BODY:vector\.body]]:
-; CHECK-MASKED:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
-; CHECK-MASKED:  [[FOR_BODY:for\.body]]:
-; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+; CHECK:  entry:
+; CHECK:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:  vector.ph:
+; CHECK:    br label [[VECTOR_BODY:%.*]]
+; CHECK:  vector.body:
+; CHECK:    br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:  middle.block:
+; CHECK:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK:  scalar.ph:
+; CHECK:    br label [[FOR_BODY:%.*]]
+; CHECK:  for.cond.cleanup:
+; CHECK:  for.body:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; CHECK-MASKED-LABEL: @_Z3foov(
+; CHECK-MASKED:  entry:
+; CHECK-MASKED:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-MASKED:  vector.ph:
+; CHECK-MASKED:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MASKED:  vector.body:
+; CHECK-MASKED:    br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-MASKED:  middle.block:
+; CHECK-MASKED:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-MASKED:  scalar.ph:
+; CHECK-MASKED:    br label [[FOR_BODY:%.*]]
+; CHECK-MASKED:  for.cond.cleanup:
+; CHECK-MASKED:  for.body:
+; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -27,32 +46,51 @@ for.cond.cleanup:                                 ; preds = %for.body
   ret void
 
 for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4, !tbaa !2
-  %1 = trunc i64 %indvars.iv to i32
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = trunc i64 %iv to i32
   %mul = mul nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx2, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
   %add = add nsw i32 %2, %mul
-  store i32 %add, ptr %arrayidx2, align 4, !tbaa !2
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 1024
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !6
+  store i32 %add, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !0
 }
 
 ; Check correctness of profile info for vectorization with epilog.
-; Function Attrs: nofree norecurse nounwind uwtable
-define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
+define void @_Z3foo2v() {
 ; CHECK-LABEL: @_Z3foo2v(
-; CHECK:  [[VECTOR_BODY:vector\.body]]:
-; CHECK:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
-; CHECK:  [[FOR_BODY:for\.body]]:
-; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
-; CHECK-MASKED:  [[VECTOR_BODY:vector\.body]]:
-; CHECK-MASKED:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
-; CHECK-MASKED:  [[FOR_BODY:for\.body]]:
-; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+; CHECK:  entry:
+; CHECK:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0]]
+; CHECK:  vector.ph:
+; CHECK:    br label [[VECTOR_BODY:%.*]]
+; CHECK:  vector.body:
+; CHECK:    br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:  middle.block:
+; CHECK:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
+; CHECK:  scalar.ph:
+; CHECK:    br label [[FOR_BODY:%.*]]
+; CHECK:  for.cond.cleanup:
+; CHECK:  for.body:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-MASKED-LABEL: @_Z3foo2v(
+; CHECK-MASKED:  entry:
+; CHECK-MASKED:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0]]
+; CHECK-MASKED:  vector.ph:
+; CHECK-MASKED:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MASKED:  vector.body:
+; CHECK-MASKED:    br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-MASKED:  middle.block:
+; CHECK-MASKED:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
+; CHECK-MASKED:  scalar.ph:
+; CHECK-MASKED:    br label [[FOR_BODY:%.*]]
+; CHECK-MASKED:  for.cond.cleanup:
+; CHECK-MASKED:  for.body:
+; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -61,36 +99,20 @@ for.cond.cleanup:                                 ; preds = %for.body
   ret void
 
 for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4, !tbaa !2
-  %1 = trunc i64 %indvars.iv to i32
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = trunc i64 %iv to i32
   %mul = mul nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx2, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
   %add = add nsw i32 %2, %mul
-  store i32 %add, ptr %arrayidx2, align 4, !tbaa !2
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 1027
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !7
+  store i32 %add, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1027
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !1
 }
 
-attributes #0 = { "use-soft-float"="false" }
 
-!llvm.module.flags = !{!0}
-!llvm.ident = !{!1}
-
-; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255}
-; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63}
-; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
-!2 = !{!3, !3, i64 0}
-!3 = !{!"int", !4, i64 0}
-!4 = !{!"omnipotent char", !5, i64 0}
-!5 = !{!"Simple C++ TBAA"}
-!6 = !{!"branch_weights", i32 1, i32 1023}
-!7 = !{!"branch_weights", i32 1, i32 1026}
+!0 = !{!"branch_weights", i32 1, i32 1023}
+!1 = !{!"branch_weights", i32 1, i32 1026}

From 4b59b7b94608ddbd21d14bec68400f2eb21f510d Mon Sep 17 00:00:00 2001
From: Simone Pellegrini <simone.pellegrini@arm.com>
Date: Fri, 13 Jun 2025 11:03:09 +0200
Subject: [PATCH 0250/1322] [mlir][Linalg] Fix fusing of indexed linalg
 consumer with different axes (#140892)

When fusing two `linalg.genericOp`, where the producer has index
semantics, invalid `affine.apply` ops can be generated where the number
of indices do not match the number of loops in the fused genericOp.

This patch fixes the issue by directly using the number of loops from
the generated fused op.
---
 .../Linalg/Transforms/ElementwiseOpFusion.cpp |  3 +-
 .../Linalg/fusion-elementwise-ops.mlir        | 37 +++++++++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 1f5af39e604e..f97ed3d6d511 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -231,8 +231,7 @@ static void generateFusedElementwiseOpRegion(
   // `consumerToProducerLoopsMap` to map the producer indices.
   if (producer.hasIndexSemantics()) {
     // Add an index operation for every fused loop dimension.
-    unsigned numFusedOpLoops =
-        std::max(producer.getNumLoops(), consumer.getNumLoops());
+    unsigned numFusedOpLoops = fusedOp.getNumLoops();
     SmallVector<Value> fusedIndices;
     fusedIndices.reserve(numFusedOpLoops);
     llvm::transform(llvm::seq<uint64_t>(0, numFusedOpLoops),
diff --git a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
index 28e1291bce1f..66fc55fadf8f 100644
--- a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
@@ -860,6 +860,43 @@ func.func @fusion_different_axes(%arg0 : tensor<5000xi64>, %arg1 : tensor<5000xi
 
 // -----
 
+func.func @fusion_different_axes_indexed(%arg0: tensor<2x2xi32>) ->  tensor<2xi32> {
+  %0 = tensor.empty() : tensor<2x2xi32>
+  %1 = linalg.generic {
+        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+        iterator_types = ["parallel", "parallel"]}
+        ins(%arg0 : tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
+          ^bb0(%in: i32, %out: i32):
+            %2 = linalg.index 1 : index
+            %3 = arith.index_cast %2 : index to i32
+            linalg.yield %3 : i32
+        } -> tensor<2x2xi32>
+  %4 = tensor.empty() : tensor<2xi32>
+  %5 = linalg.generic {
+        indexing_maps = [affine_map<(d0) -> (d0, 1)>, affine_map<(d0) -> (d0)>],
+        iterator_types = ["parallel"]}
+        ins(%1 : tensor<2x2xi32>) outs(%4 : tensor<2xi32>) {
+          ^bb0(%in: i32, %out: i32):
+            linalg.yield %in : i32
+        } -> tensor<2xi32>
+  return %5 : tensor<2xi32>
+}
+
+//  CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> (d0)>
+//      CHECK: func @fusion_different_axes_indexed(
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<2x2xi32>
+//  CHECK-DAG:   %[[CST:.+]] = arith.constant 1 : i32
+//  CHECK-DAG:   %[[INIT:.+]] = tensor.empty() : tensor<2xi32>
+//      CHECK:   %[[RESULT:.+]] = linalg.generic
+// CHECK-SAME:       indexing_maps = [#[[MAP]]]
+// CHECK-SAME:       outs(%[[INIT]] :
+// CHECK-NEXT:   ^bb0(
+// CHECK-SAME:       %[[B0:.+]]: i32
+//      CHECK:     linalg.yield %[[CST]] : i32
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
 // CHECK-LABEL: func @fold_fill_generic_basic
 //  CHECK-SAME: (%[[ARG0:.*]]: tensor<?xf32>) -> tensor<?xf32> {
 //   CHECK-NOT: linalg.fill

From 67c590004d055b7aeb0f82787041a114c3a136b3 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim@gymni.ch>
Date: Fri, 13 Jun 2025 11:09:11 +0200
Subject: [PATCH 0251/1322] [mlir][AMDGPU] Add scaled floating point conversion
 ops (#141554)

implement `ScaledExtPackedOp` and `PackedScaledTruncOp`
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  62 ++
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 189 ++++++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |   9 +
 .../Conversion/AMDGPUToROCDL/packed-ext.mlir  | 492 ++++++++++++++++
 .../AMDGPUToROCDL/packed-trunc.mlir           | 535 ++++++++++++++++++
 mlir/test/Dialect/AMDGPU/ops.mlir             | 315 +++++++++++
 6 files changed, 1601 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 02308568c1ad..d58558ac3288 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -112,6 +112,38 @@ def AMDGPU_ExtPackedFp8Op :
   }];
 }
 
+def AMDGPU_ScaledExtPackedOp
+    : AMDGPU_Op<"scaled_ext_packed", [Pure]>,
+      Arguments<(
+          ins AnyTypeOf<[VectorOfLengthAndType<[1, 2, 3, 4], [F8E5M2, F8E4M3FN]>,
+                         VectorOfLengthAndType<[1, 2, 3, 4, 5, 6, 7, 8],
+                                               [F4E2M1FN]>]>:$source,
+          F32:$scale,
+          ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+      Results<(
+          outs AnyTypeOf<[FixedVectorOfLengthAndType<[2], [F32]>,
+                          FixedVectorOfLengthAndType<[2], [F16]>,
+                          FixedVectorOfLengthAndType<[2], [BF16]>]>:$res)> {
+  let summary = "Extend a vector of packed floating point values";
+
+  let description = [{
+    Extend and scale two packed floats in `source[index]` to two floats and 
+    return them.
+
+    This rather unusual signature arises from the fact that AMD GPUs cannot
+    easily work with sub 32-bit quantities, so the compiler intrinsics for
+    extending 8-bit floats (which are, currently, the only way to work with
+    this operation) take packed vectors of 2 such floats.
+
+    If the passed-in vector has fewer than two elements, or the input is scalar,
+    the remaining values in the <2 x i8> will be filled with
+    undefined values as needed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $source `[` $index `]` `,` $scale `:` type($source) `to` type($res)
+  }];
+}
+
 def AMDGPU_PackedTrunc2xFp8Op :
     AMDGPU_Op<"packed_trunc_2xfp8", [Pure, AttrSizedOperandSegments]>,
     Arguments<(ins F32:$sourceA,
@@ -139,6 +171,36 @@ def AMDGPU_PackedTrunc2xFp8Op :
   let hasVerifier = 1;
 }
 
+def AMDGPU_PackedScaledTruncOp
+    : AMDGPU_Op<"packed_scaled_trunc", [Pure]>,
+      Arguments<(ins VectorOfLengthAndType<[1, 2], [F32, F16, BF16]>:$source,
+          F32:$scale,
+          ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index,
+          Optional<AnyTypeOf<
+              [FixedVectorOfLengthAndType<[4], [F8E5M2, F8E4M3FN]>,
+               FixedVectorOfLengthAndType<[8], [F4E2M1FN]>]>>:$existing)>,
+      Results<(
+          outs AnyTypeOf<[FixedVectorOfLengthAndType<[4], [F8E5M2, F8E4M3FN]>,
+                          FixedVectorOfLengthAndType<[8], [F4E2M1FN]>]>:$res)> {
+  let summary = "Round two floats into a packed vector of floats";
+  let description = [{
+    Scale and round the inputs `source` (which is undefined if not
+    specified) into the low or high word (bottom two or top two) elements
+    of the returned vector, keeping the other two elements of `existing`
+    unchanged if present (or undefined if it was not passed in).
+
+    The reason for this odd signature is that AMD GPUs cannot easily work with
+    sub-registers, and so the conversion intrinsics take 32-bit wide
+    packed vectors of float values.
+  }];
+  let assemblyFormat = [{
+    attr-dict $source `into` ($existing^):(`undef`)? `[` $index `]`
+    `,` $scale
+    `:` type($source) `to` type($res) (`into` type($existing)^)?
+  }];
+  let hasVerifier = 1;
+}
+
 def AMDGPU_PackedStochRoundFp8Op :
     AMDGPU_Op<"packed_stoch_round_fp8", [Pure]>,
     Arguments<(ins F32:$source,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index c5094799bbef..5e6f675a6414 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
 #include <optional>
 
 namespace mlir {
@@ -1174,6 +1175,32 @@ struct PackedStochRoundFp8OpLowering final
                   PackedStochRoundFp8OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override;
 };
+
+struct ScaledExtPackedOpLowering final
+    : public ConvertOpToLLVMPattern<ScaledExtPackedOp> {
+  ScaledExtPackedOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<amdgpu::ScaledExtPackedOp>(converter),
+        chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(ScaledExtPackedOp op, ScaledExtPackedOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+struct PackedScaledTruncOpLowering final
+    : public ConvertOpToLLVMPattern<PackedScaledTruncOp> {
+  PackedScaledTruncOpLowering(const LLVMTypeConverter &converter,
+                              Chipset chipset)
+      : ConvertOpToLLVMPattern<amdgpu::PackedScaledTruncOp>(converter),
+        chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(PackedScaledTruncOp op, PackedScaledTruncOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 } // end namespace
 
 LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
@@ -1230,6 +1257,165 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
   return success();
 }
 
+LogicalResult ScaledExtPackedOpLowering::matchAndRewrite(
+    ScaledExtPackedOp op, ScaledExtPackedOpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  Location loc = op.getLoc();
+  if (chipset != kGfx950)
+    return rewriter.notifyMatchFailure(
+        loc, "Scaled fp conversion instructions are not available on target "
+             "architecture and their emulation is not implemented");
+  Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());
+
+  Value source = adaptor.getSource();
+  Value scale = adaptor.getScale();
+
+  VectorType sourceVecType = cast<VectorType>(op.getSource().getType());
+  Type sourceElemType = sourceVecType.getElementType();
+  VectorType destVecType = cast<VectorType>(op.getResult().getType());
+  Type destElemType = destVecType.getElementType();
+
+  VectorType packedVecType;
+  if (isa<Float8E5M2Type, Float8E4M3FNType>(sourceElemType)) {
+    VectorType v4i8 = VectorType::get(4, rewriter.getI8Type());
+    packedVecType = cast<VectorType>(getTypeConverter()->convertType(v4i8));
+  } else if (isa<Float4E2M1FNType>(sourceElemType)) {
+    VectorType v8i4 = VectorType::get(8, rewriter.getI4Type());
+    packedVecType = cast<VectorType>(getTypeConverter()->convertType(v8i4));
+  } else {
+    llvm_unreachable("invalid element type for scaled ext");
+  }
+
+  // Extend to a packedVectorType
+  if (sourceVecType.getNumElements() < packedVecType.getNumElements()) {
+    Value longVec = rewriter.create<LLVM::ZeroOp>(loc, packedVecType);
+    if (!sourceVecType) {
+      longVec = rewriter.create<LLVM::InsertElementOp>(
+          loc, longVec, source, createI32Constant(rewriter, loc, 0));
+    } else {
+      for (int32_t i = 0, e = sourceVecType.getNumElements(); i < e; ++i) {
+        Value idx = createI32Constant(rewriter, loc, i);
+        Value elem = rewriter.create<LLVM::ExtractElementOp>(loc, source, idx);
+        longVec =
+            rewriter.create<LLVM::InsertElementOp>(loc, longVec, elem, idx);
+      }
+    }
+    source = longVec;
+  }
+  Value i32Source = rewriter.create<LLVM::BitcastOp>(loc, i32, source);
+
+  if (isa<Float8E5M2Type>(sourceElemType) && destElemType.isF32())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF32Bf8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E5M2Type>(sourceElemType) && destElemType.isF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF16Bf8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E5M2Type>(sourceElemType) && destElemType.isBF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkBf16Bf8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E4M3FNType>(sourceElemType) && destElemType.isF32())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF32Fp8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E4M3FNType>(sourceElemType) && destElemType.isF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF16Fp8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E4M3FNType>(sourceElemType) && destElemType.isBF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkBf16Fp8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float4E2M1FNType>(sourceElemType) && destElemType.isF32())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF32Fp4Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float4E2M1FNType>(sourceElemType) && destElemType.isF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF16Fp4Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float4E2M1FNType>(sourceElemType) && destElemType.isBF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkBf16Fp4Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else
+    return failure();
+
+  return success();
+}
+
+LogicalResult PackedScaledTruncOpLowering::matchAndRewrite(
+    PackedScaledTruncOp op, PackedScaledTruncOpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  Location loc = op.getLoc();
+  if (chipset != kGfx950)
+    return rewriter.notifyMatchFailure(
+        loc, "Scaled fp conversion instructions are not available on target "
+             "architecture and their emulation is not implemented");
+  Type v2i16 = getTypeConverter()->convertType(
+      VectorType::get(2, rewriter.getI16Type()));
+  Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());
+
+  Type resultType = op.getResult().getType();
+  Type resultElemType = getElementTypeOrSelf(resultType);
+  VectorType sourceVecType = cast<VectorType>(op.getSource().getType());
+  Type sourceElemType = sourceVecType.getElementType();
+
+  Type intResultType = isa<Float4E2M1FNType>(resultElemType) ? i32 : v2i16;
+
+  Value source = adaptor.getSource();
+  Value scale = adaptor.getScale();
+  Value existing = adaptor.getExisting();
+  if (existing)
+    existing = rewriter.create<LLVM::BitcastOp>(loc, intResultType, existing);
+  else
+    existing = rewriter.create<LLVM::ZeroOp>(loc, intResultType);
+
+  if (sourceVecType.getNumElements() < 2) {
+    Value c0 = createI32Constant(rewriter, loc, 0);
+    Value elem0 = rewriter.create<LLVM::ExtractElementOp>(loc, source, c0);
+    VectorType v2 = VectorType::get(2, sourceElemType);
+    source = rewriter.create<LLVM::ZeroOp>(loc, v2);
+    source = rewriter.create<LLVM::InsertElementOp>(loc, source, elem0, c0);
+  }
+
+  Value sourceA, sourceB;
+  if (sourceElemType.isF32()) {
+    Value c0 = createI32Constant(rewriter, loc, 0);
+    Value c1 = createI32Constant(rewriter, loc, 1);
+    sourceA = rewriter.create<LLVM::ExtractElementOp>(loc, source, c0);
+    sourceB = rewriter.create<LLVM::ExtractElementOp>(loc, source, c1);
+  }
+
+  Value result;
+  if (sourceElemType.isF32() && isa<Float8E5M2Type>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkBf8F32Op>(
+        loc, intResultType, existing, sourceA, sourceB, scale, op.getIndex());
+  else if (sourceElemType.isF16() && isa<Float8E5M2Type>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkBf8F16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isBF16() && isa<Float8E5M2Type>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkBf8Bf16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isF32() && isa<Float8E4M3FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp8F32Op>(
+        loc, intResultType, existing, sourceA, sourceB, scale, op.getIndex());
+  else if (sourceElemType.isF16() && isa<Float8E4M3FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp8F16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isBF16() && isa<Float8E4M3FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp8Bf16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isF32() && isa<Float4E2M1FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp4F32Op>(
+        loc, intResultType, existing, sourceA, sourceB, scale, op.getIndex());
+  else if (sourceElemType.isF16() && isa<Float4E2M1FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp4F16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isBF16() && isa<Float4E2M1FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp4Bf16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else
+    return failure();
+
+  result = rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(
+      op, getTypeConverter()->convertType(resultType), result);
+  return success();
+}
+
 LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite(
     PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1547,7 +1733,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                ROCDL::RawPtrBufferAtomicCmpSwap>,
            AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering,
            MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
-           ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
+           ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
+           PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
            PackedStochRoundFp8OpLowering, GatherToLDSOpLowering>(converter,
                                                                  chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index a0a98a4e8672..0d0add309466 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -60,6 +60,15 @@ LogicalResult PackedStochRoundFp8Op::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// mxfp float ops
+//===----------------------------------------------------------------------===//
+LogicalResult PackedScaledTruncOp::verify() {
+  if (getExisting() && getExisting().getType() != getResult().getType())
+    return emitOpError("existing values must have same type as result");
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // FatRawBufferCastOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir b/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
new file mode 100644
index 000000000000..ad2e7684afc4
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
@@ -0,0 +1,492 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f32
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_full_f8e4m3_f32(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_full_f8e4m3_f16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_bf16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_full_f8e4m3_bf16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_half_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_half_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_half_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_scalar_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_scalar_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_scalar_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f32
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_full_f8e5m2_f32(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_full_f8e5m2_f16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_bf16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_full_f8e5m2_bf16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_half_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_half_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_half_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_scalar_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_scalar_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_scalar_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f32
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_full_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_full_f4e2m1_f16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_bf16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_full_f4e2m1_bf16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f32
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_half_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf4E2M1FN> to vector<4xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C2:%.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK:       [[ELEM_2:%.+]] = llvm.extractelement [[V]]{{\[}}[[C2]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_2:%.+]] = llvm.insertelement [[ELEM_2]], [[VEC_1]]{{\[}}[[C2]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C3:%.+]] = llvm.mlir.constant(3 : i32) : i32
+// CHECK:       [[ELEM_3:%.+]] = llvm.extractelement [[V]]{{\[}}[[C3]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_3:%.+]] = llvm.insertelement [[ELEM_3]], [[VEC_2]]{{\[}}[[C3]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_3]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_half_f4e2m1_f16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf4E2M1FN> to vector<4xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C2:%.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK:       [[ELEM_2:%.+]] = llvm.extractelement [[V]]{{\[}}[[C2]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_2:%.+]] = llvm.insertelement [[ELEM_2]], [[VEC_1]]{{\[}}[[C2]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C3:%.+]] = llvm.mlir.constant(3 : i32) : i32
+// CHECK:       [[ELEM_3:%.+]] = llvm.extractelement [[V]]{{\[}}[[C3]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_3:%.+]] = llvm.insertelement [[ELEM_3]], [[VEC_2]]{{\[}}[[C3]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_3]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_half_f4e2m1_bf16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_scalar_f4e2m1_f32(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_scalar_f4e2m1_f16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e4m3_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E4M3FN> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_one_f8e4m3_f32(%v: vector<1xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e4m3_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E4M3FN> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_one_f8e4m3_f16(%v: vector<1xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e4m3_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E4M3FN> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_one_f8e4m3_bf16(%v: vector<1xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e5m2_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E5M2> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_one_f8e5m2_f32(%v: vector<1xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e5m2_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E5M2> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_one_f8e5m2_f16(%v: vector<1xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e5m2_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E5M2> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_one_f8e5m2_bf16(%v: vector<1xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f4e2m1_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf4E2M1FN> to vector<1xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_one_f4e2m1_f32(%v: vector<1xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f4e2m1_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf4E2M1FN> to vector<1xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_one_f4e2m1_f16(%v: vector<1xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f4e2m1_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf4E2M1FN> to vector<1xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_one_f4e2m1_bf16(%v: vector<1xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
new file mode 100644
index 000000000000..e9764d34cefa
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
@@ -0,0 +1,535 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f32_vec1(%v: vector<1xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf32> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f32
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f32_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f32_vec1(%v: vector<1xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 [[INSERT]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f16_vec1(%v: vector<1xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f16
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f16_vec1(%v: vector<1xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 [[INSERT]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_bf16_vec1(%v: vector<1xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xbf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_bf16
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_bf16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_bf16_vec1(%v: vector<1xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f32_vec1(%v: vector<1xf32>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf32> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f32
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f32_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f32_vec1(%v: vector<1xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 [[INSERT]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f16_vec1(%v: vector<1xf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f16
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f16_vec1(%v: vector<1xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 [[INSERT]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_bf16_vec1(%v: vector<1xbf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xbf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_bf16
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_bf16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_bf16_vec1(%v: vector<1xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : i32
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f32(%v: vector<2xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32_vec1
+// CHECK-DAG:   [[ZERO_I32:%.+]] = llvm.mlir.zero : i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f32_vec1(%v: vector<1xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf32> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f32
+// CHECK-DAG:   [[BITCAST_I4:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST_I32:%.+]] = llvm.bitcast [[BITCAST_I4]] : vector<8xi4> to i32
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[BITCAST_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f32_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<8xi4> to i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f32_vec1(%v: vector<1xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : i32
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 %arg0, %arg1 -> [[ZERO]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f16(%v: vector<2xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16_vec1
+// CHECK-DAG:   [[ZERO_I32:%.+]] = llvm.mlir.zero : i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 [[INSERT]], %arg1 -> [[ZERO_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f16_vec1(%v: vector<1xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f16
+// CHECK-DAG:   [[BITCAST_I4:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST_I32:%.+]] = llvm.bitcast [[BITCAST_I4]] : vector<8xi4> to i32
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 %arg0, %arg2 -> [[BITCAST_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<8xi4> to i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f16_vec1(%v: vector<1xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : i32
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 %arg0, %arg1 -> [[ZERO]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16_vec1
+// CHECK-DAG:   [[ZERO_I32:%.+]] = llvm.mlir.zero : i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 [[INSERT]], %arg1 -> [[ZERO_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_bf16_vec1(%v: vector<1xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xbf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_bf16
+// CHECK-DAG:   [[BITCAST_I4:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST_I32:%.+]] = llvm.bitcast [[BITCAST_I4]] : vector<8xi4> to i32
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 %arg0, %arg2 -> [[BITCAST_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_bf16(%v: vector<2xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_bf16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<8xi4> to i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_bf16_vec1(%v: vector<1xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 188cfcc4eb38..6c3ffb575f7c 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -32,6 +32,321 @@ func.func @packed_stoch_round_fp8(%v1: f32, %stoch: i32, %others: vector<4xf8E5M
   func.return %ret : vector<4xf8E5M2FNUZ>
 }
 
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e4m3_f32(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e4m3_f16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e4m3_bf16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e5m2_f32(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e5m2_f16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e5m2_bf16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f4e2m1_f16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f4e2m1_bf16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f4e2m1_f16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f4e2m1_bf16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f4e2m1_f32(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f4e2m1_f16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e4m3_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e4m3_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e5m2_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e5m2_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e5m2_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f4e2m1_f32(%v: vector<2xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f4e2m1_f16(%v: vector<2xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f4e2m1_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f4e2m1_bf16(%v: vector<2xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
 // CHECK-LABEL: func @fat_raw_buffer_cast_easy
 // CHECK: amdgpu.fat_raw_buffer_cast
 func.func @fat_raw_buffer_cast_easy(%m: memref<8xi32>) -> memref<8xi32, #amdgpu.address_space<fat_raw_buffer>> {

From 06c783567069db169ee2d1545a4bd3ffd0e3fec0 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Fri, 13 Jun 2025 09:10:30 +0000
Subject: [PATCH 0252/1322] [lldb][test] Disable TestMultipleDebuggers again

I did manage to turn a crash into a non-zero return code,
but on the very first build it managed to time out.

I thought I had the appetite to tweak timeouts but
on second thought, I don't want yet another test to look
out for.

The test is not wrong, but on heavily loaded machines
it's always going to be inherently unstable.
---
 lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
index f0a3893f53aa..7d6fdd444791 100644
--- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
+++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
@@ -12,6 +12,10 @@ from lldbsuite.test import lldbutil
 class TestMultipleSimultaneousDebuggers(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
+    # Times out on heavily loaded Linux buildbots, don't want to get into tweaking
+    # the timeout per bot. Does work when run alone. See:
+    # https://github.com/llvm/llvm-project/issues/101162
+    @skipIfLinux
     @skipIfNoSBHeaders
     @skipIfWindows
     @skipIfHostIncompatibleWithTarget

From 5762491e2a1935911c1e998a4865591d429f8559 Mon Sep 17 00:00:00 2001
From: SivanShani-Arm <sivan.shani@arm.com>
Date: Fri, 13 Jun 2025 11:02:33 +0100
Subject: [PATCH 0253/1322] [lld] Refactor storage of PAuth ABI core info
 (#141920)

Previously, the AArch64 PAuth ABI core values were stored as an
ArrayRef<uint8_t>, introducing unnecessary indirection.

This patch replaces the ArrayRef with two explicit uint64_t fields:
aarch64PauthAbiPlatform and aarch64PauthAbiVersion. This simplifies the
representation and improves readability.

No functional change intended, aside from improved error messages.
---
 lld/ELF/Arch/AArch64.cpp             |  3 +--
 lld/ELF/Config.h                     | 19 ++++++++++++++++++-
 lld/ELF/Driver.cpp                   | 27 +++++++++++++++++----------
 lld/ELF/InputFiles.cpp               |  6 ++++--
 lld/ELF/InputFiles.h                 |  2 +-
 lld/ELF/SyntheticSections.cpp        | 14 +++++++-------
 lld/test/ELF/aarch64-feature-pauth.s |  8 ++++++--
 7 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 2b5d5e90573f..8a225ed103ee 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -1043,8 +1043,7 @@ AArch64BtiPac::AArch64BtiPac(Ctx &ctx) : AArch64(ctx) {
   // instructions.
 
   if (ctx.arg.zPacPlt) {
-    if (llvm::any_of(ctx.aarch64PauthAbiCoreInfo,
-                     [](uint8_t c) { return c != 0; }))
+    if (ctx.aarch64PauthAbiCoreInfo && ctx.aarch64PauthAbiCoreInfo->isValid())
       pacEntryKind = PEK_Auth;
     else
       pacEntryKind = PEK_AuthHint;
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 3a9001d2cc8b..a2f7759fb7d3 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -139,6 +139,23 @@ enum class GcsPolicy { Implicit, Never, Always };
 // For some options that resemble -z bti-report={none,warning,error}
 enum class ReportPolicy { None, Warning, Error };
 
+// Describes the signing schema for a file using the PAuth ABI extension.
+// Two files are considered compatible when both `platform` and `version` match.
+// The pair (0, 0) is reserved to indicate incompatibility with the PAuth ABI.
+struct AArch64PauthAbiCoreInfo {
+  uint64_t platform;
+  uint64_t version;
+  // Returns true if the core info is not the reserved (0, 0) value.
+  bool isValid() const { return platform || version; }
+  static constexpr size_t size() { return sizeof(platform) + sizeof(version); }
+  bool operator==(const AArch64PauthAbiCoreInfo &other) const {
+    return platform == other.platform && version == other.version;
+  }
+  bool operator!=(const AArch64PauthAbiCoreInfo &other) const {
+    return !(*this == other);
+  }
+};
+
 struct SymbolVersion {
   llvm::StringRef name;
   bool isExternCpp;
@@ -699,7 +716,7 @@ struct Ctx : CommonLinkerContext {
 
   llvm::raw_fd_ostream openAuxiliaryFile(llvm::StringRef, std::error_code &);
 
-  ArrayRef<uint8_t> aarch64PauthAbiCoreInfo;
+  std::optional<AArch64PauthAbiCoreInfo> aarch64PauthAbiCoreInfo;
 };
 
 // The first two elements of versionDefinitions represent VER_NDX_LOCAL and
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 87b19cf543d9..c9ac71f7236f 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2857,15 +2857,15 @@ static void readSecurityNotes(Ctx &ctx) {
   StringRef referenceFileName;
   if (ctx.arg.emachine == EM_AARCH64) {
     auto it = llvm::find_if(ctx.objectFiles, [](const ELFFileBase *f) {
-      return !f->aarch64PauthAbiCoreInfo.empty();
+      return f->aarch64PauthAbiCoreInfo.has_value();
     });
     if (it != ctx.objectFiles.end()) {
       ctx.aarch64PauthAbiCoreInfo = (*it)->aarch64PauthAbiCoreInfo;
       referenceFileName = (*it)->getName();
     }
   }
-  bool hasValidPauthAbiCoreInfo = llvm::any_of(
-      ctx.aarch64PauthAbiCoreInfo, [](uint8_t c) { return c != 0; });
+  bool hasValidPauthAbiCoreInfo =
+      ctx.aarch64PauthAbiCoreInfo && ctx.aarch64PauthAbiCoreInfo->isValid();
 
   auto report = [&](ReportPolicy policy) -> ELFSyncStream {
     return {ctx, toDiagLevel(policy)};
@@ -2952,10 +2952,10 @@ static void readSecurityNotes(Ctx &ctx) {
     }
     ctx.arg.andFeatures &= features;
 
-    if (ctx.aarch64PauthAbiCoreInfo.empty())
+    if (!ctx.aarch64PauthAbiCoreInfo)
       continue;
 
-    if (f->aarch64PauthAbiCoreInfo.empty()) {
+    if (!f->aarch64PauthAbiCoreInfo) {
       report(ctx.arg.zPauthReport)
           << f
           << ": -z pauth-report: file does not have AArch64 "
@@ -2965,11 +2965,18 @@ static void readSecurityNotes(Ctx &ctx) {
     }
 
     if (ctx.aarch64PauthAbiCoreInfo != f->aarch64PauthAbiCoreInfo)
-      Err(ctx) << "incompatible values of AArch64 PAuth core info found\n>>> "
-               << referenceFileName << ": 0x"
-               << toHex(ctx.aarch64PauthAbiCoreInfo, /*LowerCase=*/true)
-               << "\n>>> " << f << ": 0x"
-               << toHex(f->aarch64PauthAbiCoreInfo, /*LowerCase=*/true);
+      Err(ctx)
+          << "incompatible values of AArch64 PAuth core info found\n"
+          << "platform:\n"
+          << ">>> " << referenceFileName << ": 0x"
+          << toHex(ctx.aarch64PauthAbiCoreInfo->platform, /*LowerCase=*/true)
+          << "\n>>> " << f << ": 0x"
+          << toHex(f->aarch64PauthAbiCoreInfo->platform, /*LowerCase=*/true)
+          << "\nversion:\n"
+          << ">>> " << referenceFileName << ": 0x"
+          << toHex(ctx.aarch64PauthAbiCoreInfo->version, /*LowerCase=*/true)
+          << "\n>>> " << f << ": 0x"
+          << toHex(f->aarch64PauthAbiCoreInfo->version, /*LowerCase=*/true);
   }
 
   // Force enable Shadow Stack.
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 44e77bf57183..71e72e7184b9 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -948,7 +948,7 @@ static void parseGnuPropertyNote(Ctx &ctx, ELFFileBase &f,
     } else if (ctx.arg.emachine == EM_AARCH64 &&
                type == GNU_PROPERTY_AARCH64_FEATURE_PAUTH) {
       ArrayRef<uint8_t> contents = data ? *data : desc;
-      if (!f.aarch64PauthAbiCoreInfo.empty()) {
+      if (f.aarch64PauthAbiCoreInfo) {
         return void(
             err(contents.data())
             << "multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are "
@@ -959,7 +959,9 @@ static void parseGnuPropertyNote(Ctx &ctx, ELFFileBase &f,
                        "is invalid: expected 16 bytes, but got "
                     << size);
       }
-      f.aarch64PauthAbiCoreInfo = desc;
+      f.aarch64PauthAbiCoreInfo = {
+          support::endian::read64<ELFT::Endianness>(&desc[0]),
+          support::endian::read64<ELFT::Endianness>(&desc[8])};
     }
 
     // Padding is present in the note descriptor, if necessary.
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
index 808cb5d24079..ba844ad18f63 100644
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -241,7 +241,7 @@ public:
   StringRef sourceFile;
   uint32_t andFeatures = 0;
   bool hasCommonSyms = false;
-  ArrayRef<uint8_t> aarch64PauthAbiCoreInfo;
+  std::optional<AArch64PauthAbiCoreInfo> aarch64PauthAbiCoreInfo;
 };
 
 // .o file.
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 0a9c7a081eb8..051e5cd04ef5 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -354,11 +354,11 @@ void GnuPropertySection::writeTo(uint8_t *buf) {
     offset += 16;
   }
 
-  if (!ctx.aarch64PauthAbiCoreInfo.empty()) {
+  if (ctx.aarch64PauthAbiCoreInfo) {
     write32(ctx, buf + offset + 0, GNU_PROPERTY_AARCH64_FEATURE_PAUTH);
-    write32(ctx, buf + offset + 4, ctx.aarch64PauthAbiCoreInfo.size());
-    memcpy(buf + offset + 8, ctx.aarch64PauthAbiCoreInfo.data(),
-           ctx.aarch64PauthAbiCoreInfo.size());
+    write32(ctx, buf + offset + 4, AArch64PauthAbiCoreInfo::size());
+    write64(ctx, buf + offset + 8, ctx.aarch64PauthAbiCoreInfo->platform);
+    write64(ctx, buf + offset + 16, ctx.aarch64PauthAbiCoreInfo->version);
   }
 }
 
@@ -366,8 +366,8 @@ size_t GnuPropertySection::getSize() const {
   uint32_t contentSize = 0;
   if (ctx.arg.andFeatures != 0)
     contentSize += ctx.arg.is64 ? 16 : 12;
-  if (!ctx.aarch64PauthAbiCoreInfo.empty())
-    contentSize += 4 + 4 + ctx.aarch64PauthAbiCoreInfo.size();
+  if (ctx.aarch64PauthAbiCoreInfo)
+    contentSize += 4 + 4 + AArch64PauthAbiCoreInfo::size();
   assert(contentSize != 0);
   return contentSize + 16;
 }
@@ -4967,7 +4967,7 @@ template <class ELFT> void elf::createSyntheticSections(Ctx &ctx) {
   ctx.in.iplt = std::make_unique<IpltSection>(ctx);
   add(*ctx.in.iplt);
 
-  if (ctx.arg.andFeatures || !ctx.aarch64PauthAbiCoreInfo.empty()) {
+  if (ctx.arg.andFeatures || ctx.aarch64PauthAbiCoreInfo) {
     ctx.in.gnuProperty = std::make_unique<GnuPropertySection>(ctx);
     add(*ctx.in.gnuProperty);
   }
diff --git a/lld/test/ELF/aarch64-feature-pauth.s b/lld/test/ELF/aarch64-feature-pauth.s
index bc58f69d32f2..e8c900b9cb13 100644
--- a/lld/test/ELF/aarch64-feature-pauth.s
+++ b/lld/test/ELF/aarch64-feature-pauth.s
@@ -13,8 +13,12 @@
 # RUN: not ld.lld tag1.o tag1a.o tag2.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR1 %s
 
 # ERR1:      error: incompatible values of AArch64 PAuth core info found
-# ERR1-NEXT: >>> tag1.o: 0x2a000000000000000{{1|2}}00000000000000
-# ERR1-NEXT: >>> tag2.o: 0x2a000000000000000{{1|2}}00000000000000
+# ERR1-NEXT: platform:
+# ERR1-NEXT: >>> tag1.o: 0x2a
+# ERR1-NEXT: >>> tag2.o: 0x2a
+# ERR1-NEXT: version:
+# ERR1-NEXT: >>> tag1.o: 0x01
+# ERR1-NEXT: >>> tag2.o: 0x02
 
 # RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-short.s -o short.o
 # RUN: not ld.lld short.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR2 %s

From 058602372e2bb7460469c5c53cc36f0a4b131f54 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 11:05:57 +0100
Subject: [PATCH 0254/1322] [X86] X86FixupInstTuning - fold BLENDPS -> MOVSD
 (#144029)

Reduces codesize - make use of free PS<->PD domain transfers (like we do in many other places) and replace a suitable BLENDPS mask with MOVSD if OptSize or the scheduler prefers it
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp    |  15 +-
 llvm/test/CodeGen/X86/avx-insertelt.ll        |   2 +-
 .../CodeGen/X86/avx-intrinsics-x86-upgrade.ll |   4 +-
 .../CodeGen/X86/coalesce_commute_movsd.ll     |   4 +-
 llvm/test/CodeGen/X86/combine-and.ll          |   2 +-
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   | 175 +++++++-----------
 llvm/test/CodeGen/X86/commute-blend-sse41.ll  |   2 +-
 llvm/test/CodeGen/X86/horizontal-sum.ll       |   4 +-
 llvm/test/CodeGen/X86/insertelement-zero.ll   |  10 +-
 llvm/test/CodeGen/X86/masked_load.ll          |   2 +-
 llvm/test/CodeGen/X86/sse-insertelt.ll        |  15 +-
 llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll  |  76 +++-----
 .../CodeGen/X86/sse2-intrinsics-fast-isel.ll  |   4 +-
 .../X86/sse2-intrinsics-x86-upgrade.ll        |  16 +-
 llvm/test/CodeGen/X86/sse2.ll                 |   2 +-
 .../CodeGen/X86/sse41-intrinsics-fast-isel.ll |   2 +-
 .../X86/sse41-intrinsics-x86-upgrade.ll       |   4 +-
 llvm/test/CodeGen/X86/vec-strict-128-fp16.ll  |   2 +-
 llvm/test/CodeGen/X86/vec_floor.ll            |   4 +-
 llvm/test/CodeGen/X86/vector-blend.ll         |  24 +--
 .../vector-interleaved-load-i32-stride-4.ll   |  64 +++----
 .../vector-interleaved-load-i32-stride-5.ll   |  40 ++--
 llvm/test/CodeGen/X86/vector-mul.ll           |   2 +-
 .../test/CodeGen/X86/vector-shuffle-128-v2.ll |  62 ++-----
 .../test/CodeGen/X86/vector-shuffle-128-v4.ll |   6 +-
 .../test/CodeGen/X86/vector-shuffle-256-v8.ll |   2 +-
 .../X86/vector-shuffle-combining-ssse3.ll     |   2 +-
 .../X86/vector-shuffle-concatenation.ll       |   4 +-
 llvm/test/CodeGen/X86/vselect-2.ll            |  38 ++--
 llvm/test/CodeGen/X86/vselect.ll              |  60 ++----
 30 files changed, 258 insertions(+), 391 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index be0a8c23ea5c..ce1e4966553f 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -222,8 +222,9 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessUNPCKToIntDomain(NewOpc);
   };
 
-  auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
-    if (MI.getOperand(NumOperands - 1).getImm() != 1)
+  auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
+                               unsigned MovImm) -> bool {
+    if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
       return false;
     bool Force = MF.getFunction().hasOptSize();
     if (!Force && !NewOpcPreferable(MovOpc))
@@ -235,14 +236,16 @@ bool X86FixupInstTuningPass::processInstruction(
 
   switch (Opc) {
   case X86::BLENDPDrri:
-    return ProcessBLENDToMOV(X86::MOVSDrr);
+    return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
   case X86::VBLENDPDrri:
-    return ProcessBLENDToMOV(X86::VMOVSDrr);
+    return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1);
 
   case X86::BLENDPSrri:
-    return ProcessBLENDToMOV(X86::MOVSSrr);
+    return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) ||
+           ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3);
   case X86::VBLENDPSrri:
-    return ProcessBLENDToMOV(X86::VMOVSSrr);
+    return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
+           ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);
 
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 02e6c9649c9a..f8feceb0404b 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -111,7 +111,7 @@ define <4 x double> @insert_f64_firstelt_of_high_subvector(<4 x double> %x, doub
 ; AVX-LABEL: insert_f64_firstelt_of_high_subvector:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 966662f5f9f8..f0203b3b889e 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -300,8 +300,8 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: test_x86_sse41_blendpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vblendps $3, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03]
-; CHECK-NEXT:    # xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
+; CHECK-NEXT:    # xmm0 = xmm0[0],xmm1[1]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
index 086df87d1d5f..441c79b3fc31 100644
--- a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -19,12 +19,12 @@ define <2 x double> @insert_f64(double %a0, <2 x double> %a1) {
 ;
 ; AVX-LABEL: insert_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: insert_f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX512-NEXT:    retq
  %1 = insertelement <2 x double> %a1, double %a0, i32 0
  ret <2 x double> %1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index 9ca4ebfec277..a476b21979ce 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -127,7 +127,7 @@ define <4 x i32> @test7(<4 x i32> %A) {
 ; SSE-LABEL: test7:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test7:
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 2f2a05fa6939..14e3767f6556 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -31,15 +31,10 @@ define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
 
 
 define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2:
 ; AVX:       # %bb.0:
@@ -53,15 +48,10 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test3:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test3:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test3:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test3:
 ; AVX:       # %bb.0:
@@ -201,15 +191,10 @@ define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
 
 
 define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test9:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test9:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test9:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test9:
 ; AVX:       # %bb.0:
@@ -223,15 +208,10 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test10:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test10:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test10:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test10:
 ; AVX:       # %bb.0:
@@ -563,20 +543,25 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
 ; bitcast to use the mask-or blend combine.
 
 define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
-; SSE2-LABEL: test22:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
+; SSE-LABEL: test22:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
-; SSE4-LABEL: test22:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; AVX1-LABEL: test22:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    retq
 ;
-; AVX-LABEL: test22:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT:    retq
+; AVX2-LABEL: test22:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test22:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT:    retq
   %bc1 = bitcast <2 x double> %a0 to <2 x i64>
   %bc2 = bitcast <2 x double> %a1 to <2 x i64>
   %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -614,20 +599,25 @@ define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
 
 
 define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
-; SSE2-LABEL: test24:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
+; SSE-LABEL: test24:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
-; SSE4-LABEL: test24:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; AVX1-LABEL: test24:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    retq
 ;
-; AVX-LABEL: test24:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT:    retq
+; AVX2-LABEL: test24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test24:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT:    retq
   %bc1 = bitcast <4 x float> %a0 to <2 x i64>
   %bc2 = bitcast <4 x float> %a1 to <2 x i64>
   %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -707,15 +697,10 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
 ; Verify that we can fold regardless of which operand is the zeroinitializer
 
 define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2b:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2b:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2b:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2b:
 ; AVX:       # %bb.0:
@@ -728,15 +713,10 @@ define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2c:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2c:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2c:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2c:
 ; AVX:       # %bb.0:
@@ -750,15 +730,10 @@ define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2d:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2d:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2d:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2d:
 ; AVX:       # %bb.0:
@@ -773,15 +748,10 @@ define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
 ; Make sure we can have an undef where an index pointing to the zero vector should be
 
 define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2e:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2e:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2e:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2e:
 ; AVX:       # %bb.0:
@@ -794,15 +764,10 @@ define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2f:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2f:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2f:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2f:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
index 07d6a8ba22bb..4740bf59a69e 100644
--- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll
+++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
@@ -57,7 +57,7 @@ define void @baz(ptr %arg, ptr %arg1) optsize {
 ; CHECK-NEXT:    movaps (%rdi), %xmm0
 ; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [3,3]
 ; CHECK-NEXT:    andps %xmm0, %xmm1
-; CHECK-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; CHECK-NEXT:    movups %xmm1, (%rsi)
 ; CHECK-NEXT:    retq
 bb:
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 443275e11459..0afc4f784bc5 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -577,7 +577,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
 ; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
 ; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-SLOW-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-SLOW-NEXT:    vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
 ; AVX-SLOW-NEXT:    vaddps %xmm3, %xmm4, %xmm4
 ; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
@@ -596,7 +596,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
 ; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
 ; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
 ; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-FAST-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-FAST-NEXT:    vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
 ; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm4
 ; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
 ; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index 6036eddb0ca8..b66ad07c466e 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -30,13 +30,13 @@ define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
 ; SSE41-LABEL: insert_v2f64_z1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_v2f64_z1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %1 = insertelement <2 x double> %a, double 0.0, i32 0
   ret <2 x double> %1
@@ -68,7 +68,7 @@ define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_v4f64_0zz3:
@@ -103,7 +103,7 @@ define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
 ; SSE41-LABEL: insert_v2i64_z1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_v2i64_z1:
@@ -137,7 +137,7 @@ define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
 ; SSE41-LABEL: insert_v4i64_01z3:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_v4i64_01z3:
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 37ab4276fbcc..8c4bab99a5b7 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6512,7 +6512,7 @@ define <8 x float> @mload_constmask_v8f32(ptr %addr, <8 x float> %dst) {
 ; SSE42-LABEL: mload_constmask_v8f32:
 ; SSE42:       ## %bb.0:
 ; SSE42-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; SSE42-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; SSE42-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index f174eaaca38c..72e002ed6b7d 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -21,19 +21,14 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, float %s) {
 }
 
 define <2 x double> @insert_f64_firstelt(<2 x double> %x, double %s) {
-; SSE2-LABEL: insert_f64_firstelt:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: insert_f64_firstelt:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: insert_f64_firstelt:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_f64_firstelt:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %i0 = insertelement <2 x double> %x, double %s, i32 0
   ret <2 x double> %i0
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 12bfb8d4fc9c..325f735b09cd 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE41
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX1
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX512
 
@@ -1333,29 +1333,17 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 }
 
 define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
-; X86-SSE2-LABEL: add_sd_mask:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    jne .LBB71_1
-; X86-SSE2-NEXT:  # %bb.2:
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X86-SSE2-NEXT:    retl
-; X86-SSE2-NEXT:  .LBB71_1:
-; X86-SSE2-NEXT:    addsd %xmm0, %xmm1
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: add_sd_mask:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE41-NEXT:    jne .LBB71_1
-; X86-SSE41-NEXT:  # %bb.2:
-; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; X86-SSE41-NEXT:    retl
-; X86-SSE41-NEXT:  .LBB71_1:
-; X86-SSE41-NEXT:    addsd %xmm0, %xmm1
-; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE41-NEXT:    retl
+; X86-SSE-LABEL: add_sd_mask:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    jne .LBB71_1
+; X86-SSE-NEXT:  # %bb.2:
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X86-SSE-NEXT:    retl
+; X86-SSE-NEXT:  .LBB71_1:
+; X86-SSE-NEXT:    addsd %xmm0, %xmm1
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: add_sd_mask:
 ; X86-AVX1:       # %bb.0:
@@ -1375,29 +1363,17 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X86-AVX512-NEXT:    vmovapd %xmm2, %xmm0
 ; X86-AVX512-NEXT:    retl
 ;
-; X64-SSE2-LABEL: add_sd_mask:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    testb $1, %dil
-; X64-SSE2-NEXT:    jne .LBB71_1
-; X64-SSE2-NEXT:  # %bb.2:
-; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X64-SSE2-NEXT:    retq
-; X64-SSE2-NEXT:  .LBB71_1:
-; X64-SSE2-NEXT:    addsd %xmm0, %xmm1
-; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: add_sd_mask:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    testb $1, %dil
-; X64-SSE41-NEXT:    jne .LBB71_1
-; X64-SSE41-NEXT:  # %bb.2:
-; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; X64-SSE41-NEXT:    retq
-; X64-SSE41-NEXT:  .LBB71_1:
-; X64-SSE41-NEXT:    addsd %xmm0, %xmm1
-; X64-SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X64-SSE41-NEXT:    retq
+; X64-SSE-LABEL: add_sd_mask:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    testb $1, %dil
+; X64-SSE-NEXT:    jne .LBB71_1
+; X64-SSE-NEXT:  # %bb.2:
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X64-SSE-NEXT:    retq
+; X64-SSE-NEXT:  .LBB71_1:
+; X64-SSE-NEXT:    addsd %xmm0, %xmm1
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: add_sd_mask:
 ; X64-AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 18a6be8aaf0b..3f48b22e2b9f 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -3010,8 +3010,8 @@ define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwin
 ;
 ; AVX-LABEL: test_mm_move_sd:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; AVX-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ext0 = extractelement <2 x double> %a1, i32 0
   %res0 = insertelement <2 x double> undef, double %ext0, i32 0
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 6dd75c8c09ce..413b4e79257a 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -724,8 +724,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X86-AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x08]
 ; X86-AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0xc9]
-; X86-AVX1-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX1-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_x86_sse2_cvtss2sd_load:
@@ -734,8 +734,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X86-AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
 ; X86-AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X86-AVX512-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX512-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_x86_sse2_cvtss2sd_load:
@@ -752,8 +752,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X64-AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x0f]
 ; X64-AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0xc9]
-; X64-AVX1-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X64-AVX1-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X64-AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: test_x86_sse2_cvtss2sd_load:
@@ -761,8 +761,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X64-AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
 ; X64-AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X64-AVX512-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X64-AVX512-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X64-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %a1 = load <4 x float>, ptr %p1
   %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index e1d91b407fc2..6e77d3e4fd13 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -417,7 +417,7 @@ define void @test12() nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovaps 0, %xmm0
 ; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index 47630501864a..c6f0ec493a36 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -33,7 +33,7 @@ define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
 ;
 ; AVX-LABEL: test_mm_blend_pd:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
index bdf8033a00b0..137606b7cfee 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -18,8 +18,8 @@ define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1)
 ;
 ; AVX-LABEL: test_x86_sse41_blendpd:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vblendps $3, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03]
-; AVX-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX-NEXT:    ## xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
index a6e288608c87..35688e59fc9f 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
@@ -93,7 +93,7 @@ define <2 x double> @f12(<2 x double> %a0, <8 x half> %a1) #0 {
 ; CHECK-LABEL: f12:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvtsh2sd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ext = extractelement <8 x half> %a1, i32 0
   %cvt = call double @llvm.experimental.constrained.fpext.f64.f16(half %ext,
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 1007969b6c6d..7f4ed3394d10 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -1653,7 +1653,7 @@ define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16
 ; AVX-NEXT:    jne LBB59_1
 ; AVX-NEXT:  ## %bb.2:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB59_1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
@@ -2643,7 +2643,7 @@ define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %
 ; AVX-NEXT:    jne LBB85_1
 ; AVX-NEXT:  ## %bb.2:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB85_1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index a38028e87532..2d2fc6b6ee0d 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -172,7 +172,7 @@ define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
 ;
 ; AVX-LABEL: vsel_double:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 entry:
   %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
@@ -732,23 +732,11 @@ entry:
 }
 
 define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
-; SSE2-LABEL: blend_shufflevector_4xi64:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movaps %xmm3, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: blend_shufflevector_4xi64:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movaps %xmm3, %xmm1
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: blend_shufflevector_4xi64:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movaps %xmm3, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: blend_shufflevector_4xi64:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: blend_shufflevector_4xi64:
 ; AVX:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 0bf126073843..822d31eb4513 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -250,10 +250,10 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm5 = xmm5[0],xmm6[1]
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm2[2],xmm3[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
@@ -584,14 +584,14 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[2,0],ymm9[2,3],ymm10[6,4],ymm9[6,7]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm10 = xmm5[1],xmm6[1],zero,zero
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm10 = xmm10[0],xmm11[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
 ; AVX-NEXT:    vunpckhps {{.*#+}} ymm11 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm11 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm12 = zero,zero,xmm7[2],xmm8[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm11 = xmm11[0],xmm12[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
 ; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[3,0],ymm0[7,4],ymm4[7,4]
@@ -1080,7 +1080,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm2[0],xmm10[1],xmm2[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm3[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps %ymm12, %ymm0
@@ -1094,7 +1094,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -1105,7 +1105,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm3[2]
 ; AVX-NEXT:    vmovaps %xmm10, %xmm14
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
@@ -1115,7 +1115,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm9[2],xmm1[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
@@ -2120,7 +2120,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = xmm9[0],mem[0],xmm9[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[4],ymm13[4],ymm8[5],ymm13[5]
@@ -2131,7 +2131,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -2147,7 +2147,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = xmm5[0],mem[0],xmm5[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -2164,7 +2164,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -2176,7 +2176,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = zero,zero,xmm9[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
@@ -2187,7 +2187,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = zero,zero,xmm5[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -2203,7 +2203,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,zero,xmm2[2],xmm4[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
@@ -2215,7 +2215,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,zero,xmm7[2],xmm13[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -4239,7 +4239,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm13[0],xmm3[1],xmm13[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -4253,7 +4253,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4269,7 +4269,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4285,7 +4285,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4301,7 +4301,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4317,7 +4317,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4333,7 +4333,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4349,7 +4349,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3]
@@ -4358,7 +4358,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm7[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
@@ -4368,7 +4368,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm11[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -4384,7 +4384,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
@@ -4400,7 +4400,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4416,7 +4416,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4432,7 +4432,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4448,7 +4448,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4464,7 +4464,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index c08442f9d9d0..4f80140bc6c1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -359,7 +359,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
 ; AVX-NEXT:    vmovaps (%rdi), %xmm3
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm5 = xmm3[0],xmm4[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,2,3,3]
 ; AVX-NEXT:    vmovaps 64(%rdi), %xmm6
@@ -369,7 +369,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3]
 ; AVX-NEXT:    vshufpd {{.*#+}} xmm7 = xmm7[1,0]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,0],mem[1,3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
@@ -787,7 +787,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[3,0],ymm9[6,4],ymm7[7,4]
 ; AVX-NEXT:    vmovaps (%rdi), %xmm9
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm10
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm10[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm11 = xmm9[0],xmm10[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7]
@@ -806,7 +806,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,2],ymm12[6,4],ymm1[6,6]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[3,0],mem[1,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3,4,5,6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm11[1,0],ymm0[6,4],ymm11[5,4]
@@ -1552,7 +1552,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4]
 ; AVX-NEXT:    vmovaps (%rdi), %xmm15
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm10
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm15[0,1],xmm10[2,3]
+; AVX-NEXT:    vmovsd %xmm15, %xmm10, %xmm4
 ; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -1565,7 +1565,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[3,0],ymm1[6,4],ymm4[7,4]
 ; AVX-NEXT:    vmovaps 160(%rdi), %xmm9
 ; AVX-NEXT:    vmovaps 192(%rdi), %xmm8
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3]
+; AVX-NEXT:    vmovsd %xmm9, %xmm8, %xmm4
 ; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -1597,7 +1597,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm10 = xmm10[0],xmm15[1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm5[2,2],ymm13[6,4],ymm5[6,6]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3]
@@ -1605,7 +1605,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm8 = xmm8[0],xmm9[1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3]
@@ -3086,7 +3086,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 192(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7]
@@ -3102,7 +3102,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 512(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3118,7 +3118,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3134,7 +3134,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6148,7 +6148,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 192(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6165,7 +6165,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 512(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6182,7 +6182,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 832(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6198,7 +6198,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 1152(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6214,7 +6214,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6230,7 +6230,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6247,7 +6247,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 672(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6265,7 +6265,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 992(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 98b5bab98c4f..13b21a747878 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -1579,7 +1579,7 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
 ; SSE4-LABEL: mul_v2i64_0_1:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    xorps %xmm1, %xmm1
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE4-NEXT:    ret{{[l|q]}}
 ;
 ; X64-AVX-LABEL: mul_v2i64_0_1:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 2d3dc4c593c1..baaae507ae15 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -242,35 +242,20 @@ define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
 ;
 ; AVX-LABEL: shuffle_v2f64_03:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: shuffle_v2f64_21:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_v2f64_21:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v2f64_21:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v2f64_21:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_21:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_21:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x double> %shuffle
@@ -523,25 +508,10 @@ define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: shuffle_v2i64_21:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_v2i64_21:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v2i64_21:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v2i64_21:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_21:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_21:
 ; AVX:       # %bb.0:
@@ -572,7 +542,7 @@ define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
 ; SSE41-LABEL: shuffle_v2i64_21_copy:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_21_copy:
@@ -740,7 +710,7 @@ define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
 ; SSE41-LABEL: shuffle_v2i64_z1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_z1:
@@ -821,13 +791,13 @@ define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
 ; SSE41-LABEL: shuffle_v2f64_z1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_z1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
   ret <2 x double> %shuffle
@@ -1102,7 +1072,7 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
 ;
 ; AVX-LABEL: insert_reg_lo_v2f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %v = insertelement <2 x double> poison, double %a, i32 0
   %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
@@ -1334,7 +1304,7 @@ define <2 x double> @shuffle_mem_v2f64_21(<2 x double> %a, ptr %pb) {
 ; SSE41-LABEL: shuffle_mem_v2f64_21:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movups (%rdi), %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_mem_v2f64_21:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index e1eb1a6704e3..9ec24c447c2c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2222,7 +2222,7 @@ define <4 x i32> @insert_mem_lo_v4i32(ptr %ptr, <4 x i32> %b) {
 ; SSE41-LABEL: insert_mem_lo_v4i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_lo_v4i32:
@@ -2295,7 +2295,7 @@ define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: insert_reg_lo_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %a.cast = bitcast double %a to <2 x float>
   %v = shufflevector <2 x float> %a.cast, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -2489,7 +2489,7 @@ define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, ptr %pb) {
 ; SSE41-LABEL: shuffle_mem_v4f32_4523:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movups (%rdi), %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_mem_v4f32_4523:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 950683cbfaee..bce50db4d952 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -468,7 +468,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 002a3b77dc35..bd2710139d58 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -53,7 +53,7 @@ define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1)
 ;
 ; AVX-LABEL: combine_pshufb_as_movsd:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 3, i32 0>
   %2 = bitcast <2 x double> %1 to <16 x i8>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
index 2812bf348910..925f8d510451 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
@@ -173,7 +173,7 @@ define void @concat_a_to_shuf_of_ab(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movaps (%rdi), %xmm0
 ; SSE42-NEXT:    movaps (%rsi), %xmm1
-; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; SSE42-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE42-NEXT:    movaps %xmm0, 16(%rdx)
 ; SSE42-NEXT:    movaps %xmm1, (%rdx)
 ; SSE42-NEXT:    retq
@@ -288,7 +288,7 @@ define void @concat_shuf_of_ab_to_a(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movaps (%rdi), %xmm0
 ; SSE42-NEXT:    movaps (%rsi), %xmm1
-; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; SSE42-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE42-NEXT:    movaps %xmm1, 16(%rdx)
 ; SSE42-NEXT:    movaps %xmm0, (%rdx)
 ; SSE42-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vselect-2.ll b/llvm/test/CodeGen/X86/vselect-2.ll
index c02cbcf55408..429ae88fe6d6 100644
--- a/llvm/test/CodeGen/X86/vselect-2.ll
+++ b/llvm/test/CodeGen/X86/vselect-2.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
 
 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE2-LABEL: test1:
@@ -24,15 +24,10 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
-; SSE2-LABEL: test2:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test2:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2:
 ; AVX:       # %bb.0:
@@ -55,26 +50,21 @@ define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
 ;
 ; AVX-LABEL: test3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
   ret <4 x float> %select
 }
 
 define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
-; SSE2-LABEL: test4:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test4:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test4:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
   ret <4 x float> %select
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 18a060ad910b..f70145d6b21c 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -69,26 +69,21 @@ define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: test2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
 
 define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: test3:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test3:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test3:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -152,15 +147,10 @@ define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
 }
 
 define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: test8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test8:
 ; AVX:       # %bb.0:
@@ -329,34 +319,24 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: test20:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test20:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test20:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test20:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1
 }
 
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test21:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test21:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test21:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test21:
 ; AVX:       # %bb.0:
@@ -419,7 +399,7 @@ define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
 ;
 ; AVX-LABEL: test24:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1

From e2c27fd66a13c7a37cccbf4309532fcbce86c09b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 11:11:01 +0100
Subject: [PATCH 0255/1322] [X86] X86FixupInstTuning - hoist OptSize flag. NFC.

Allow reuse in a future patch.
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index ce1e4966553f..8c1ff523c975 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -81,6 +81,7 @@ bool X86FixupInstTuningPass::processInstruction(
   MachineInstr &MI = *I;
   unsigned Opc = MI.getOpcode();
   unsigned NumOperands = MI.getDesc().getNumOperands();
+  bool OptSize = MF.getFunction().hasOptSize();
 
   auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
     // We already checked that SchedModel exists in `NewOpcPreferable`.
@@ -226,8 +227,7 @@ bool X86FixupInstTuningPass::processInstruction(
                                unsigned MovImm) -> bool {
     if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
       return false;
-    bool Force = MF.getFunction().hasOptSize();
-    if (!Force && !NewOpcPreferable(MovOpc))
+    if (!OptSize && !NewOpcPreferable(MovOpc))
       return false;
     MI.setDesc(TII->get(MovOpc));
     MI.removeOperand(NumOperands - 1);

From 6fc8ec720ea590bbdb94e19acefaf5bafdfcf817 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 13 Jun 2025 12:29:50 +0200
Subject: [PATCH 0256/1322] [InstCombine] Restore splat gep support in
 OptimizePointerDifference() (#143906)

When looking for the common base pointer, support the case where the
type changes because the GEP goes from pointer to vector of pointers.
This was supported prior to #142958.
---
 .../InstCombine/InstCombineAddSub.cpp         |  7 +++--
 llvm/test/Transforms/InstCombine/sub-gep.ll   | 30 +++++++++++++++++++
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 86d318967403..0d91e7d77e4a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2088,8 +2088,6 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
   // Find common base and collect RHS GEPs.
   while (true) {
     if (Ptrs.contains(RHS)) {
-      if (LHS->getType() != RHS->getType())
-        return Base;
       Base.Ptr = RHS;
       break;
     }
@@ -2132,12 +2130,15 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
   // TODO: We should probably do this even if there is only one GEP.
   bool RewriteGEPs = !Base.LHSGEPs.empty() && !Base.RHSGEPs.empty();
 
-  Type *IdxTy = DL.getIndexType(Base.Ptr->getType());
+  Type *IdxTy = DL.getIndexType(LHS->getType());
   auto EmitOffsetFromBase = [&](ArrayRef<GEPOperator *> GEPs,
                                 GEPNoWrapFlags NW) -> Value * {
     Value *Sum = nullptr;
     for (GEPOperator *GEP : reverse(GEPs)) {
       Value *Offset = EmitGEPOffset(GEP, RewriteGEPs);
+      if (Offset->getType() != IdxTy)
+        Offset = Builder.CreateVectorSplat(
+            cast<VectorType>(IdxTy)->getElementCount(), Offset);
       if (Sum)
         Sum = Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
                                 NW.isInBounds());
diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll
index 9444fef1887d..375be8a3d69c 100644
--- a/llvm/test/Transforms/InstCombine/sub-gep.ll
+++ b/llvm/test/Transforms/InstCombine/sub-gep.ll
@@ -995,3 +995,33 @@ define i64 @multiple_geps_inbounds_nuw(ptr %base, i64 %idx, i64 %idx2) {
   %d = sub i64 %i2, %i1
   ret i64 %d
 }
+
+define <2 x i64> @splat_geps(ptr %base, <2 x i64> %idx1, <2 x i64> %idx2) {
+; CHECK-LABEL: @splat_geps(
+; CHECK-NEXT:    [[D:%.*]] = sub nsw <2 x i64> [[IDX2:%.*]], [[IDX1:%.*]]
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %gep1 = getelementptr inbounds i8, ptr %base, <2 x i64> %idx1
+  %gep2 = getelementptr inbounds i8, ptr %base, <2 x i64> %idx2
+  %gep1.int = ptrtoint <2 x ptr> %gep1 to <2 x i64>
+  %gep2.int = ptrtoint <2 x ptr> %gep2 to <2 x i64>
+  %d = sub <2 x i64> %gep2.int, %gep1.int
+  ret <2 x i64> %d
+}
+
+define <2 x i64> @splat_geps_multiple(ptr %base, i64 %idx0, <2 x i64> %idx1, <2 x i64> %idx2) {
+; CHECK-LABEL: @splat_geps_multiple(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[IDX0:%.*]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i64> [[DOTSPLAT]], [[IDX1:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = sub nsw <2 x i64> [[IDX2:%.*]], [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %gep0 = getelementptr inbounds i8, ptr %base, i64 %idx0
+  %gep1 = getelementptr inbounds i8, ptr %gep0, <2 x i64> %idx1
+  %gep2 = getelementptr inbounds i8, ptr %base, <2 x i64> %idx2
+  %gep1.int = ptrtoint <2 x ptr> %gep1 to <2 x i64>
+  %gep2.int = ptrtoint <2 x ptr> %gep2 to <2 x i64>
+  %d = sub <2 x i64> %gep2.int, %gep1.int
+  ret <2 x i64> %d
+}

From 2019553a0b8811a23d7546cbace52a8e241a3b37 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 13 Jun 2025 12:34:18 +0200
Subject: [PATCH 0257/1322] [InstCombine] Extract EmitGEPOffsets() helper (NFC)

Extract a reusable helper for emitting a sum of multiple GEP
offsets.
---
 .../InstCombine/InstCombineAddSub.cpp         | 23 ++-----------------
 .../InstCombine/InstCombineInternal.h         |  4 ++++
 .../InstCombine/InstructionCombining.cpp      | 20 ++++++++++++++++
 3 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 0d91e7d77e4a..c1ce364eb179 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2131,27 +2131,8 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
   bool RewriteGEPs = !Base.LHSGEPs.empty() && !Base.RHSGEPs.empty();
 
   Type *IdxTy = DL.getIndexType(LHS->getType());
-  auto EmitOffsetFromBase = [&](ArrayRef<GEPOperator *> GEPs,
-                                GEPNoWrapFlags NW) -> Value * {
-    Value *Sum = nullptr;
-    for (GEPOperator *GEP : reverse(GEPs)) {
-      Value *Offset = EmitGEPOffset(GEP, RewriteGEPs);
-      if (Offset->getType() != IdxTy)
-        Offset = Builder.CreateVectorSplat(
-            cast<VectorType>(IdxTy)->getElementCount(), Offset);
-      if (Sum)
-        Sum = Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
-                                NW.isInBounds());
-      else
-        Sum = Offset;
-    }
-    if (!Sum)
-      return Constant::getNullValue(IdxTy);
-    return Sum;
-  };
-
-  Value *Result = EmitOffsetFromBase(Base.LHSGEPs, Base.LHSNW);
-  Value *Offset2 = EmitOffsetFromBase(Base.RHSGEPs, Base.RHSNW);
+  Value *Result = EmitGEPOffsets(Base.LHSGEPs, Base.LHSNW, IdxTy, RewriteGEPs);
+  Value *Offset2 = EmitGEPOffsets(Base.RHSGEPs, Base.RHSNW, IdxTy, RewriteGEPs);
 
   // If this is a single inbounds GEP and the original sub was nuw,
   // then the final multiplication is also nuw.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index bf7689bbfde7..ce0e843437b5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -378,6 +378,10 @@ private:
   }
 
   Value *EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP = false);
+  /// Emit sum of multiple GEP offsets. The GEPs are processed in reverse
+  /// order.
+  Value *EmitGEPOffsets(ArrayRef<GEPOperator *> GEPs, GEPNoWrapFlags NW,
+                        Type *IdxTy, bool RewriteGEPs);
   Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
   Instruction *foldBitcastExtElt(ExtractElementInst &ExtElt);
   Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index dc2a8cb0115e..29582939fa06 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -217,6 +217,26 @@ Value *InstCombinerImpl::EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP) {
   return Offset;
 }
 
+Value *InstCombinerImpl::EmitGEPOffsets(ArrayRef<GEPOperator *> GEPs,
+                                        GEPNoWrapFlags NW, Type *IdxTy,
+                                        bool RewriteGEPs) {
+  Value *Sum = nullptr;
+  for (GEPOperator *GEP : reverse(GEPs)) {
+    Value *Offset = EmitGEPOffset(GEP, RewriteGEPs);
+    if (Offset->getType() != IdxTy)
+      Offset = Builder.CreateVectorSplat(
+          cast<VectorType>(IdxTy)->getElementCount(), Offset);
+    if (Sum)
+      Sum = Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
+                              NW.isInBounds());
+    else
+      Sum = Offset;
+  }
+  if (!Sum)
+    return Constant::getNullValue(IdxTy);
+  return Sum;
+}
+
 /// Legal integers and common types are considered desirable. This is used to
 /// avoid creating instructions with types that may not be supported well by the
 /// the backend.

From 541e5118ce570c9bed74cb5ff836f88cf1c0e644 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Fri, 13 Jun 2025 11:43:50 +0100
Subject: [PATCH 0258/1322] [LV] Use getFixedValue instead of getKnownMinValue
 when appropriate (#143526)

There are many places in VPlan and LoopVectorize where we use
getKnownMinValue to discover the number of elements in a vector. Where
we expect the vector to have a fixed length, I have used the stronger
getFixedValue call. I believe this is clearer and adds extra protection
in the form of an assert in getFixedValue that the vector is not
scalable.

While looking at VPFirstOrderRecurrencePHIRecipe::computeCost I also
took the liberty of simplifying the code.

In theory I believe this patch should be NFC, but I'm reluctant to add
that to the title in case we're just missing tests for some of the VPlan
changes. I built and ran the LLVM test suite when targeting neoverse-v1
and it seemed ok.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 33 +++++++++++--------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  7 ++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +++++------
 3 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 404ee6874d2a..fa313243a57d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3116,12 +3116,13 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // that we will create. This cost is likely to be zero. The phi node
     // cost, if any, should be scaled by the block probability because it
     // models a copy at the end of each predicated block.
-    ScalarizationCost += VF.getKnownMinValue() *
-      TTI.getCFInstrCost(Instruction::PHI, CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
 
     // The cost of the non-predicated instruction.
-    ScalarizationCost += VF.getKnownMinValue() *
-      TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() *
+        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
 
     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
@@ -4289,7 +4290,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
           return NumLegalParts <= VF.getKnownMinValue();
         }
         // Two or more elements that share a register - are vectorized.
-        return NumLegalParts < VF.getKnownMinValue();
+        return NumLegalParts < VF.getFixedValue();
       };
 
       // If no def nor is a store, e.g., branches, continue - no value to check.
@@ -4574,8 +4575,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
         assert(!isa<SCEVCouldNotCompute>(TC) &&
                "Trip count SCEV must be computable");
         RemainingIterations = SE.getURemExpr(
-            TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
-        MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
+            TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
+        MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
                                 SE.getConstant(TCType, MaxTripCount))) {
           MaxTripCount =
@@ -4586,7 +4587,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
       }
       if (SE.isKnownPredicate(
               CmpInst::ICMP_UGT,
-              SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+              SE.getConstant(TCType, NextVF.Width.getFixedValue()),
               RemainingIterations))
         continue;
     }
@@ -5257,14 +5258,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 
   // Get the cost of the scalar memory instruction and address computation.
   InstructionCost Cost =
-      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+      VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
 
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
   const Align Alignment = getLoadStoreAlignment(I);
-  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
-                                                      ValTy->getScalarType(),
-                                                      Alignment, AS, CostKind);
+  Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
+                                                   ValTy->getScalarType(),
+                                                   Alignment, AS, CostKind);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
@@ -5280,7 +5281,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
     auto *VecI1Ty =
         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
     Cost += TTI.getScalarizationOverhead(
-        VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
+        VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
         /*Insert=*/false, /*Extract=*/true, CostKind);
     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
 
@@ -5341,6 +5342,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   StoreInst *SI = cast<StoreInst>(I);
 
   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
+  // TODO: We have existing tests that request the cost of extracting element
+  // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
+  // the actual generated code, which involves extracting the last element of
+  // a scalable vector where the lane to extract is unknown at compile time.
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                              CostKind) +
@@ -5623,7 +5628,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
 
     for (Type *VectorTy : getContainedTypes(RetTy)) {
       Cost += TTI.getScalarizationOverhead(
-          cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+          cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
           /*Insert=*/true,
           /*Extract=*/false, CostKind);
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b74ef91f26e7..10906d9a30df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -331,7 +331,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
 
   bool IsSingleScalar = vputils::isSingleScalar(Def);
 
-  VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1);
+  VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
   // Check if there is a scalar value for the selected lane.
   if (!hasScalarValue(Def, LastLane)) {
     // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
@@ -368,7 +368,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
     Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     set(Def, Undef);
-    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+    for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
       packScalarIntoVectorizedValue(Def, Lane);
     VectorValue = get(Def);
   }
@@ -789,8 +789,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Entry);
   State->Lane = VPLane(0);
-  for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
-       ++Lane) {
+  for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
     State->Lane = VPLane(Lane, VPLane::Kind::First);
     // Visit the VPBlocks connected to \p this, starting from it.
     for (VPBlockBase *Block : RPOT) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 74472aaeb167..ccce0e07e4d0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -871,7 +871,7 @@ void VPInstruction::execute(VPTransformState &State) {
                                     isVectorToScalar() || isSingleScalar());
   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
   if (GeneratesPerAllLanes) {
-    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
+    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
          Lane != NumLanes; ++Lane) {
       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
       assert(GeneratedValue && "generatePerLane must produce a value");
@@ -2787,8 +2787,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   }
 
   // Generate scalar instances for all VF lanes.
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  const unsigned EndLane = State.VF.getKnownMinValue();
+  const unsigned EndLane = State.VF.getFixedValue();
   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
     scalarizeInstruction(UI, this, VPLane(Lane), State);
 }
@@ -2841,7 +2840,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                UI->getOpcode(), ResultTy, CostKind,
                {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
                Op2Info, Operands, UI, &Ctx.TLI) *
-           (isSingleScalar() ? 1 : VF.getKnownMinValue());
+           (isSingleScalar() ? 1 : VF.getFixedValue());
   }
   }
 
@@ -3390,7 +3389,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     Value *ResBlockInMask = State.get(BlockInMask);
     Value *ShuffledMask = State.Builder.CreateShuffleVector(
         ResBlockInMask,
-        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
+        createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
         "interleaved.mask");
     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                    ShuffledMask, MaskForGaps)
@@ -3402,8 +3401,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
     if (NeedsMaskForGaps) {
-      MaskForGaps = createBitMaskForGaps(State.Builder,
-                                         State.VF.getKnownMinValue(), *Group);
+      MaskForGaps =
+          createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
       assert(MaskForGaps && "Mask for Gaps is required but it is null");
     }
 
@@ -3454,6 +3453,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
 
       return;
     }
+    assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
 
     // For each member in the group, shuffle out the appropriate data from the
     // wide loads.
@@ -3466,13 +3466,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
         continue;
 
       auto StrideMask =
-          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
+          createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
       Value *StridedVec =
           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
 
       // If this member has different type, cast the result type.
       if (Member->getType() != ScalarTy) {
-        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
         StridedVec =
             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
@@ -3808,7 +3807,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
   if (VF.isScalar())
     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 
-  if (VF.isScalable() && VF.getKnownMinValue() == 1)
+  if (VF == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
   return 0;

From 9eef4d1c5fa6b1bcbbe675c14ca8301d5d346f7b Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Fri, 13 Jun 2025 06:45:40 -0400
Subject: [PATCH 0259/1322] Remove delayed typo expressions (#143423)

This removes the delayed typo correction functionality from Clang
(regular typo correction still remains) due to fragility of the
solution.

An RFC was posted here:
https://discourse.llvm.org/t/rfc-removing-support-for-delayed-typo-correction/86631
and while that RFC was asking for folks to consider stepping up to be
maintainers, and we did have a few new contributors show some interest,
experiments show that it's likely worth it to remove this functionality
entirely and focus efforts on improving regular typo correction.

This removal fixes ~20 open issues (quite possibly more), improves
compile time performance by roughly .3-.4%
(https://llvm-compile-time-tracker.com/?config=Overview&stat=instructions%3Au&remote=AaronBallman&sortBy=date),
and does not appear to regress diagnostic behavior in a way we wouldn't
find acceptable.

Fixes #142457
Fixes #139913
Fixes #138850
Fixes #137867
Fixes #137860
Fixes #107840
Fixes #93308
Fixes #69470
Fixes #59391
Fixes #58172
Fixes #46215
Fixes #45915
Fixes #45891
Fixes #44490
Fixes #36703
Fixes #32903
Fixes #23312
Fixes #69874
---
 .../clangd/unittests/HoverTests.cpp           |   2 +-
 clang/docs/ReleaseNotes.rst                   |   8 +
 clang/include/clang/AST/Expr.h                |  33 +-
 clang/include/clang/AST/RecursiveASTVisitor.h |   1 -
 clang/include/clang/Basic/StmtNodes.td        |   1 -
 clang/include/clang/Parse/Parser.h            |   3 +-
 clang/include/clang/Sema/Sema.h               | 126 +-----
 clang/include/clang/Sema/SemaInternal.h       |  14 -
 clang/lib/AST/Expr.cpp                        |   1 -
 clang/lib/AST/ExprClassification.cpp          |   1 -
 clang/lib/AST/ExprConstant.cpp                |   1 -
 clang/lib/AST/ItaniumMangle.cpp               |   1 -
 clang/lib/AST/StmtPrinter.cpp                 |   5 -
 clang/lib/AST/StmtProfile.cpp                 |   4 -
 clang/lib/Parse/ParseCXXInlineMethods.cpp     |   1 -
 clang/lib/Parse/ParseDecl.cpp                 |  33 +-
 clang/lib/Parse/ParseDeclCXX.cpp              |   8 +-
 clang/lib/Parse/ParseExpr.cpp                 |  94 +----
 clang/lib/Parse/ParseExprCXX.cpp              |  13 +-
 clang/lib/Parse/ParseInit.cpp                 |   2 -
 clang/lib/Parse/ParseObjc.cpp                 |  23 +-
 clang/lib/Parse/ParseOpenACC.cpp              |  28 +-
 clang/lib/Parse/ParseOpenMP.cpp               |  27 +-
 clang/lib/Parse/ParseStmt.cpp                 |  16 +-
 clang/lib/Parse/ParseStmtAsm.cpp              |   2 +-
 clang/lib/Parse/ParseTemplate.cpp             |   3 +-
 clang/lib/Sema/Sema.cpp                       |   9 -
 clang/lib/Sema/SemaChecking.cpp               |   2 -
 clang/lib/Sema/SemaCoroutine.cpp              |  12 -
 clang/lib/Sema/SemaDecl.cpp                   |  39 +-
 clang/lib/Sema/SemaDeclCXX.cpp                |  27 +-
 clang/lib/Sema/SemaExceptionSpec.cpp          |   1 -
 clang/lib/Sema/SemaExpr.cpp                   | 156 +-------
 clang/lib/Sema/SemaExprCXX.cpp                | 376 +-----------------
 clang/lib/Sema/SemaExprMember.cpp             | 116 +-----
 clang/lib/Sema/SemaLookup.cpp                 |  60 ---
 clang/lib/Sema/SemaObjC.cpp                   |   7 +-
 clang/lib/Sema/SemaOverload.cpp               |   6 +-
 clang/lib/Sema/SemaStmt.cpp                   |  14 +-
 clang/lib/Sema/SemaStmtAttr.cpp               |   5 +-
 clang/lib/Sema/SemaTemplateVariadic.cpp       |  12 +-
 clang/lib/Sema/TreeTransform.h                |   6 -
 clang/lib/Serialization/ASTReaderStmt.cpp     |   4 -
 clang/lib/Serialization/ASTWriterStmt.cpp     |   6 -
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp  |   1 -
 clang/test/AST/ByteCode/literals.cpp          |   8 +-
 clang/test/AST/ast-dump-recovery.c            |  30 --
 clang/test/AST/ast-dump-recovery.cpp          | 214 ----------
 clang/test/AST/ast-dump-recovery.m            |  32 --
 clang/test/CXX/drs/cwg1xx.cpp                 |   3 +-
 clang/test/CXX/drs/cwg26xx.cpp                |   1 -
 .../test/CXX/module/basic/basic.link/p2.cppm  |   4 +-
 clang/test/FixIt/typo.cpp                     | 137 -------
 clang/test/Index/complete-switch.c            |  10 -
 clang/test/Index/fix-its.c                    |  19 +-
 clang/test/Lexer/raw-string-ext.c             |  10 +-
 clang/test/Modules/diagnose-missing-import.m  |   2 -
 .../OpenMP/begin_declare_variant_messages.c   |   2 +-
 .../OpenMP/declare_reduction_messages.cpp     |   2 +-
 clang/test/OpenMP/declare_variant_messages.c  |   6 +-
 .../test/OpenMP/declare_variant_messages.cpp  |   4 +-
 clang/test/OpenMP/target_update_messages.cpp  |  15 +-
 clang/test/Parser/cxx1z-decomposition.cpp     |   6 +-
 clang/test/Parser/cxx1z-fold-expressions.cpp  |   6 +-
 clang/test/Parser/cxx2c-pack-indexing.cpp     |   3 +-
 clang/test/Parser/objc-foreach-syntax.m       |   3 +-
 clang/test/Parser/opencl-atomics-cl20.cl      |  18 +-
 clang/test/Parser/recovery.c                  |   2 +-
 clang/test/Parser/switch-recovery.cpp         |   2 +-
 clang/test/Parser/switch-typo-correction.cpp  |   4 +-
 .../ParserOpenACC/parse-cache-construct.cpp   |  10 +-
 clang/test/ParserOpenACC/parse-clauses.c      |  24 +-
 clang/test/ParserOpenACC/parse-constructs.cpp |   4 +-
 clang/test/ParserOpenACC/parse-wait-clause.c  |   5 +-
 .../test/ParserOpenACC/parse-wait-construct.c |   9 +-
 clang/test/Sema/PR28181.c                     |   8 +-
 clang/test/Sema/builtin-unary-fp.c            |   1 -
 .../c23-delayed-typo-correction-crashes.c     |  18 +
 .../Sema/delayed-typo-correction-crashes.c    |  18 +
 clang/test/Sema/invalid-member.cpp            |   6 +-
 clang/test/Sema/typo-correction-ambiguity.cpp |   4 +-
 clang/test/Sema/typo-correction-no-hang.c     |   9 +-
 clang/test/Sema/typo-correction-no-hang.cpp   |  12 +-
 clang/test/Sema/typo-correction-recursive.cpp |  28 +-
 clang/test/Sema/typo-correction.c             |  26 +-
 clang/test/SemaCXX/arrow-operator.cpp         |   9 +-
 .../SemaCXX/constant-expression-cxx11.cpp     |   7 +-
 clang/test/SemaCXX/conversion-function.cpp    |   2 +-
 clang/test/SemaCXX/coroutines.cpp             |  20 +-
 .../cxx-delayed-typo-correction-crashes.cpp   |  67 ++++
 clang/test/SemaCXX/cxx1z-decomposition.cpp    |   3 +-
 .../cxx20-delayed-typo-correction-crashes.cpp |  19 +
 .../SemaCXX/cxx2a-adl-only-template-id.cpp    |   2 +-
 clang/test/SemaCXX/destructor.cpp             |  15 +-
 clang/test/SemaCXX/invalid-if-constexpr.cpp   |  10 +-
 clang/test/SemaCXX/member-expr.cpp            |   8 +-
 clang/test/SemaCXX/nested-name-spec.cpp       |   6 +-
 .../test/SemaCXX/pr13394-crash-on-invalid.cpp |  29 --
 clang/test/SemaCXX/return.cpp                 |   2 +-
 clang/test/SemaCXX/typo-correction-crash.cpp  |  19 +-
 clang/test/SemaCXX/typo-correction-cxx11.cpp  |  11 +-
 .../test/SemaCXX/typo-correction-delayed.cpp  | 216 ----------
 clang/test/SemaCXX/typo-correction.cpp        |  38 +-
 clang/test/SemaCXX/virtuals.cpp               |   4 +-
 clang/test/SemaObjC/call-super-2.m            |   2 +-
 .../test/SemaObjC/typo-correction-subscript.m |   3 +-
 .../SemaObjC/undef-arg-super-method-call.m    |   8 +-
 .../SemaObjCXX/block-for-lambda-conversion.mm |   7 +-
 .../compute-construct-num_gangs-clause.cpp    |   6 +-
 clang/test/SemaOpenCL/atomic-ops.cl           |   2 +-
 .../test/SemaOpenCL/clang-builtin-version.cl  |   8 +-
 .../SemaTemplate/concepts-recovery-expr.cpp   |   4 +-
 clang/test/SemaTemplate/concepts.cpp          |   6 +-
 clang/test/SemaTemplate/typo-variadic.cpp     |   2 +-
 clang/tools/libclang/CXCursor.cpp             |   1 -
 .../unittests/Sema/ExternalSemaSourceTest.cpp |  14 -
 116 files changed, 438 insertions(+), 2147 deletions(-)
 delete mode 100644 clang/test/AST/ast-dump-recovery.m
 delete mode 100644 clang/test/FixIt/typo.cpp
 delete mode 100644 clang/test/Index/complete-switch.c
 create mode 100644 clang/test/Sema/c23-delayed-typo-correction-crashes.c
 create mode 100644 clang/test/Sema/delayed-typo-correction-crashes.c
 create mode 100644 clang/test/SemaCXX/cxx-delayed-typo-correction-crashes.cpp
 create mode 100644 clang/test/SemaCXX/cxx20-delayed-typo-correction-crashes.cpp
 delete mode 100644 clang/test/SemaCXX/pr13394-crash-on-invalid.cpp
 delete mode 100644 clang/test/SemaCXX/typo-correction-delayed.cpp

diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp
index 69f6df46c87c..775278ccf694 100644
--- a/clang-tools-extra/clangd/unittests/HoverTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp
@@ -974,7 +974,7 @@ class Foo final {})cpp";
          HI.Name = "abc";
          HI.Kind = index::SymbolKind::Variable;
          HI.NamespaceScope = "";
-         HI.Definition = "int abc = <recovery - expr>()";
+         HI.Definition = "int abc";
          HI.Type = "int";
          HI.AccessSpecifier = "public";
        }},
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b42d5f8425af..9ab8031b9ea8 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -622,6 +622,14 @@ Improvements to Clang's diagnostics
 
 - Improved the FixIts for unused lambda captures.
 
+- Delayed typo correction was removed from the compiler; immediate typo
+  correction behavior remains the same. Delayed typo correction facilities were
+  fragile and unmaintained, and the removal closed the following issues:
+  #GH142457, #GH139913, #GH138850, #GH137867, #GH137860, #GH107840, #GH93308,
+  #GH69470, #GH59391, #GH58172, #GH46215, #GH45915, #GH45891, #GH44490,
+  #GH36703, #GH32903, #GH23312, #GH69874.
+
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 43c28c8bf649..9fc23d30b733 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -240,8 +240,7 @@ public:
     return static_cast<bool>(getDependence() & ExprDependence::UnexpandedPack);
   }
 
-  /// Whether this expression contains subexpressions which had errors, e.g. a
-  /// TypoExpr.
+  /// Whether this expression contains subexpressions which had errors.
   bool containsErrors() const {
     return static_cast<bool>(getDependence() & ExprDependence::Error);
   }
@@ -6965,36 +6964,6 @@ public:
   }
 };
 
-/// TypoExpr - Internal placeholder for expressions where typo correction
-/// still needs to be performed and/or an error diagnostic emitted.
-class TypoExpr : public Expr {
-  // The location for the typo name.
-  SourceLocation TypoLoc;
-
-public:
-  TypoExpr(QualType T, SourceLocation TypoLoc)
-      : Expr(TypoExprClass, T, VK_LValue, OK_Ordinary), TypoLoc(TypoLoc) {
-    assert(T->isDependentType() && "TypoExpr given a non-dependent type");
-    setDependence(ExprDependence::TypeValueInstantiation |
-                  ExprDependence::Error);
-  }
-
-  child_range children() {
-    return child_range(child_iterator(), child_iterator());
-  }
-  const_child_range children() const {
-    return const_child_range(const_child_iterator(), const_child_iterator());
-  }
-
-  SourceLocation getBeginLoc() const LLVM_READONLY { return TypoLoc; }
-  SourceLocation getEndLoc() const LLVM_READONLY { return TypoLoc; }
-
-  static bool classof(const Stmt *T) {
-    return T->getStmtClass() == TypoExprClass;
-  }
-
-};
-
 /// This class represents BOTH the OpenMP Array Section and OpenACC 'subarray',
 /// with a boolean differentiator.
 /// OpenMP 5.0 [2.1.5, Array Sections].
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index b0f8ae621cf6..5cb2f57edffe 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2956,7 +2956,6 @@ DEF_TRAVERSE_STMT(CXXRewrittenBinaryOperator, {
   }
 })
 DEF_TRAVERSE_STMT(OpaqueValueExpr, {})
-DEF_TRAVERSE_STMT(TypoExpr, {})
 DEF_TRAVERSE_STMT(RecoveryExpr, {})
 DEF_TRAVERSE_STMT(CUDAKernelCallExpr, {})
 
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index 9526fa5808aa..c9c173f5c746 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -202,7 +202,6 @@ def ShuffleVectorExpr : StmtNode<Expr>;
 def ConvertVectorExpr : StmtNode<Expr>;
 def BlockExpr : StmtNode<Expr>;
 def OpaqueValueExpr : StmtNode<Expr>;
-def TypoExpr : StmtNode<Expr>;
 def RecoveryExpr : StmtNode<Expr>;
 def BuiltinBitCastExpr : StmtNode<ExplicitCastExpr>;
 def EmbedExpr : StmtNode<Expr>;
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index d99de77a5291..3243b94c5e5e 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -4169,8 +4169,7 @@ private:
   bool ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
                            llvm::function_ref<void()> ExpressionStarts =
                                llvm::function_ref<void()>(),
-                           bool FailImmediatelyOnInvalidExpr = false,
-                           bool EarlyTypoCorrection = false);
+                           bool FailImmediatelyOnInvalidExpr = false);
 
   /// ParseSimpleExpressionList - A simple comma-separated list of expressions,
   /// used for misc language extensions.
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 0dad07e55a82..29452bb37260 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -6713,10 +6713,6 @@ public:
     /// this expression evaluation context.
     unsigned NumCleanupObjects;
 
-    /// The number of typos encountered during this expression evaluation
-    /// context (i.e. the number of TypoExprs created).
-    unsigned NumTypos;
-
     MaybeODRUseExprSet SavedMaybeODRUseExprs;
 
     /// The lambdas that are present within this context, if it
@@ -6813,7 +6809,7 @@ public:
                                       Decl *ManglingContextDecl,
                                       ExpressionKind ExprContext)
         : Context(Context), ParentCleanup(ParentCleanup),
-          NumCleanupObjects(NumCleanupObjects), NumTypos(0),
+          NumCleanupObjects(NumCleanupObjects),
           ManglingContextDecl(ManglingContextDecl), ExprContext(ExprContext),
           InDiscardedStatement(false), InImmediateFunctionContext(false),
           InImmediateEscalatingFunctionContext(false) {}
@@ -7146,8 +7142,7 @@ public:
                       CorrectionCandidateCallback &CCC,
                       TemplateArgumentListInfo *ExplicitTemplateArgs = nullptr,
                       ArrayRef<Expr *> Args = {},
-                      DeclContext *LookupCtx = nullptr,
-                      TypoExpr **Out = nullptr);
+                      DeclContext *LookupCtx = nullptr);
 
   /// If \p D cannot be odr-used in the current expression evaluation context,
   /// return a reason explaining why. Otherwise, return NOUR_None.
@@ -8748,40 +8743,6 @@ public:
 
   ExprResult CheckUnevaluatedOperand(Expr *E);
 
-  /// Process any TypoExprs in the given Expr and its children,
-  /// generating diagnostics as appropriate and returning a new Expr if there
-  /// were typos that were all successfully corrected and ExprError if one or
-  /// more typos could not be corrected.
-  ///
-  /// \param E The Expr to check for TypoExprs.
-  ///
-  /// \param InitDecl A VarDecl to avoid because the Expr being corrected is its
-  /// initializer.
-  ///
-  /// \param RecoverUncorrectedTypos If true, when typo correction fails, it
-  /// will rebuild the given Expr with all TypoExprs degraded to RecoveryExprs.
-  ///
-  /// \param Filter A function applied to a newly rebuilt Expr to determine if
-  /// it is an acceptable/usable result from a single combination of typo
-  /// corrections. As long as the filter returns ExprError, different
-  /// combinations of corrections will be tried until all are exhausted.
-  ExprResult CorrectDelayedTyposInExpr(
-      Expr *E, VarDecl *InitDecl = nullptr,
-      bool RecoverUncorrectedTypos = false,
-      llvm::function_ref<ExprResult(Expr *)> Filter =
-          [](Expr *E) -> ExprResult { return E; });
-
-  ExprResult CorrectDelayedTyposInExpr(
-      ExprResult ER, VarDecl *InitDecl = nullptr,
-      bool RecoverUncorrectedTypos = false,
-      llvm::function_ref<ExprResult(Expr *)> Filter =
-          [](Expr *E) -> ExprResult { return E; }) {
-    return ER.isInvalid()
-               ? ER
-               : CorrectDelayedTyposInExpr(ER.get(), InitDecl,
-                                           RecoverUncorrectedTypos, Filter);
-  }
-
   IfExistsResult
   CheckMicrosoftIfExistsSymbol(Scope *S, CXXScopeSpec &SS,
                                const DeclarationNameInfo &TargetNameInfo);
@@ -9283,12 +9244,6 @@ public:
   /// for C++ records.
   llvm::FoldingSet<SpecialMemberOverloadResultEntry> SpecialMemberCache;
 
-  /// Holds TypoExprs that are created from `createDelayedTypo`. This is used by
-  /// `TransformTypos` in order to keep track of any TypoExprs that are created
-  /// recursively during typo correction and wipe them away if the correction
-  /// fails.
-  llvm::SmallVector<TypoExpr *, 2> TypoExprs;
-
   enum class AcceptableKind { Visible, Reachable };
 
   // Members have to be NamespaceDecl* or TranslationUnitDecl*.
@@ -9376,10 +9331,6 @@ public:
                       bool VolatileArg, bool RValueThis, bool ConstThis,
                       bool VolatileThis);
 
-  typedef std::function<void(const TypoCorrection &)> TypoDiagnosticGenerator;
-  typedef std::function<ExprResult(Sema &, TypoExpr *, TypoCorrection)>
-      TypoRecoveryCallback;
-
   RedeclarationKind forRedeclarationInCurContext() const;
 
   /// Look up a name, looking for a single declaration.  Return
@@ -9733,51 +9684,6 @@ public:
                              const ObjCObjectPointerType *OPT = nullptr,
                              bool RecordFailure = true);
 
-  /// Try to "correct" a typo in the source code by finding
-  /// visible declarations whose names are similar to the name that was
-  /// present in the source code.
-  ///
-  /// \param TypoName the \c DeclarationNameInfo structure that contains
-  /// the name that was present in the source code along with its location.
-  ///
-  /// \param LookupKind the name-lookup criteria used to search for the name.
-  ///
-  /// \param S the scope in which name lookup occurs.
-  ///
-  /// \param SS the nested-name-specifier that precedes the name we're
-  /// looking for, if present.
-  ///
-  /// \param CCC A CorrectionCandidateCallback object that provides further
-  /// validation of typo correction candidates. It also provides flags for
-  /// determining the set of keywords permitted.
-  ///
-  /// \param TDG A TypoDiagnosticGenerator functor that will be used to print
-  /// diagnostics when the actual typo correction is attempted.
-  ///
-  /// \param TRC A TypoRecoveryCallback functor that will be used to build an
-  /// Expr from a typo correction candidate.
-  ///
-  /// \param MemberContext if non-NULL, the context in which to look for
-  /// a member access expression.
-  ///
-  /// \param EnteringContext whether we're entering the context described by
-  /// the nested-name-specifier SS.
-  ///
-  /// \param OPT when non-NULL, the search for visible declarations will
-  /// also walk the protocols in the qualified interfaces of \p OPT.
-  ///
-  /// \returns a new \c TypoExpr that will later be replaced in the AST with an
-  /// Expr representing the result of performing typo correction, or nullptr if
-  /// typo correction is not possible. If nullptr is returned, no diagnostics
-  /// will be emitted and it is the responsibility of the caller to emit any
-  /// that are needed.
-  TypoExpr *CorrectTypoDelayed(
-      const DeclarationNameInfo &Typo, Sema::LookupNameKind LookupKind,
-      Scope *S, CXXScopeSpec *SS, CorrectionCandidateCallback &CCC,
-      TypoDiagnosticGenerator TDG, TypoRecoveryCallback TRC,
-      CorrectTypoKind Mode, DeclContext *MemberContext = nullptr,
-      bool EnteringContext = false, const ObjCObjectPointerType *OPT = nullptr);
-
   /// Kinds of missing import. Note, the values of these enumerators correspond
   /// to %select values in diagnostics.
   enum class MissingImportKind {
@@ -9796,20 +9702,6 @@ public:
                              SourceLocation DeclLoc, ArrayRef<Module *> Modules,
                              MissingImportKind MIK, bool Recover);
 
-  struct TypoExprState {
-    std::unique_ptr<TypoCorrectionConsumer> Consumer;
-    TypoDiagnosticGenerator DiagHandler;
-    TypoRecoveryCallback RecoveryHandler;
-    TypoExprState();
-    TypoExprState(TypoExprState &&other) noexcept;
-    TypoExprState &operator=(TypoExprState &&other) noexcept;
-  };
-
-  const TypoExprState &getTypoExprState(TypoExpr *TE) const;
-
-  /// Clears the state of the given TypoExpr.
-  void clearDelayedTypo(TypoExpr *TE);
-
   /// Called on #pragma clang __debug dump II
   void ActOnPragmaDump(Scope *S, SourceLocation Loc, IdentifierInfo *II);
 
@@ -9832,23 +9724,15 @@ private:
   /// Determine if we could use all the declarations in the module.
   bool isUsableModule(const Module *M);
 
-  /// Helper for CorrectTypo and CorrectTypoDelayed used to create and
-  /// populate a new TypoCorrectionConsumer. Returns nullptr if typo correction
-  /// should be skipped entirely.
+  /// Helper for CorrectTypo used to create and populate a new
+  /// TypoCorrectionConsumer. Returns nullptr if typo correction should be
+  /// skipped entirely.
   std::unique_ptr<TypoCorrectionConsumer> makeTypoCorrectionConsumer(
       const DeclarationNameInfo &Typo, Sema::LookupNameKind LookupKind,
       Scope *S, CXXScopeSpec *SS, CorrectionCandidateCallback &CCC,
       DeclContext *MemberContext, bool EnteringContext,
       const ObjCObjectPointerType *OPT, bool ErrorRecovery);
 
-  /// The set of unhandled TypoExprs and their associated state.
-  llvm::MapVector<TypoExpr *, TypoExprState> DelayedTypos;
-
-  /// Creates a new TypoExpr AST node.
-  TypoExpr *createDelayedTypo(std::unique_ptr<TypoCorrectionConsumer> TCC,
-                              TypoDiagnosticGenerator TDG,
-                              TypoRecoveryCallback TRC, SourceLocation TypoLoc);
-
   /// Cache for module units which is usable for current module.
   llvm::DenseSet<const Module *> UsableModuleUnitsCache;
 
diff --git a/clang/include/clang/Sema/SemaInternal.h b/clang/include/clang/Sema/SemaInternal.h
index 95874077050a..4d0da1102bb5 100644
--- a/clang/include/clang/Sema/SemaInternal.h
+++ b/clang/include/clang/Sema/SemaInternal.h
@@ -314,20 +314,6 @@ private:
   bool SearchNamespaces;
 };
 
-inline Sema::TypoExprState::TypoExprState() {}
-
-inline Sema::TypoExprState::TypoExprState(TypoExprState &&other) noexcept {
-  *this = std::move(other);
-}
-
-inline Sema::TypoExprState &Sema::TypoExprState::
-operator=(Sema::TypoExprState &&other) noexcept {
-  Consumer = std::move(other.Consumer);
-  DiagHandler = std::move(other.DiagHandler);
-  RecoveryHandler = std::move(other.RecoveryHandler);
-  return *this;
-}
-
 } // end namespace clang
 
 #endif
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 17d2cb4a30f3..c3722c65abf6 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -3611,7 +3611,6 @@ bool Expr::HasSideEffects(const ASTContext &Ctx,
   case PackExpansionExprClass:
   case SubstNonTypeTemplateParmPackExprClass:
   case FunctionParmPackExprClass:
-  case TypoExprClass:
   case RecoveryExprClass:
   case CXXFoldExprClass:
     // Make a conservative assumption for dependent nodes.
diff --git a/clang/lib/AST/ExprClassification.cpp b/clang/lib/AST/ExprClassification.cpp
index 3f37d06cc8f3..ad66335138a4 100644
--- a/clang/lib/AST/ExprClassification.cpp
+++ b/clang/lib/AST/ExprClassification.cpp
@@ -129,7 +129,6 @@ static Cl::Kinds ClassifyInternal(ASTContext &Ctx, const Expr *E) {
     // FIXME: Is this wise? Should they get their own kind?
   case Expr::UnresolvedLookupExprClass:
   case Expr::UnresolvedMemberExprClass:
-  case Expr::TypoExprClass:
   case Expr::DependentCoawaitExprClass:
   case Expr::CXXDependentScopeMemberExprClass:
   case Expr::DependentScopeDeclRefExprClass:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 27ea55e98144..f1580255a462 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -17327,7 +17327,6 @@ static ICEDiag CheckICE(const Expr* E, const ASTContext &Ctx) {
   case Expr::CXXDeleteExprClass:
   case Expr::CXXPseudoDestructorExprClass:
   case Expr::UnresolvedLookupExprClass:
-  case Expr::TypoExprClass:
   case Expr::RecoveryExprClass:
   case Expr::DependentScopeDeclRefExprClass:
   case Expr::CXXConstructExprClass:
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index ecf5be220439..487933a748ab 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -4994,7 +4994,6 @@ recurse:
   case Expr::ParenListExprClass:
   case Expr::MSPropertyRefExprClass:
   case Expr::MSPropertySubscriptExprClass:
-  case Expr::TypoExprClass: // This should no longer exist in the AST by now.
   case Expr::RecoveryExprClass:
   case Expr::ArraySectionExprClass:
   case Expr::OMPArrayShapingExprClass:
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 13c3bc038789..28317911d825 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -2914,11 +2914,6 @@ void StmtPrinter::VisitOpaqueValueExpr(OpaqueValueExpr *Node) {
   PrintExpr(Node->getSourceExpr());
 }
 
-void StmtPrinter::VisitTypoExpr(TypoExpr *Node) {
-  // TODO: Print something reasonable for a TypoExpr, if necessary.
-  llvm_unreachable("Cannot print TypoExpr nodes");
-}
-
 void StmtPrinter::VisitRecoveryExpr(RecoveryExpr *Node) {
   OS << "<recovery-expr>(";
   const char *Sep = "";
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index f7d1655f67ed..c666d966a6e5 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -2361,10 +2361,6 @@ void StmtProfiler::VisitOpaqueValueExpr(const OpaqueValueExpr *E) {
   VisitExpr(E);
 }
 
-void StmtProfiler::VisitTypoExpr(const TypoExpr *E) {
-  VisitExpr(E);
-}
-
 void StmtProfiler::VisitSourceLocExpr(const SourceLocExpr *E) {
   VisitExpr(E);
 }
diff --git a/clang/lib/Parse/ParseCXXInlineMethods.cpp b/clang/lib/Parse/ParseCXXInlineMethods.cpp
index e215c64cccd1..9a010fb5f342 100644
--- a/clang/lib/Parse/ParseCXXInlineMethods.cpp
+++ b/clang/lib/Parse/ParseCXXInlineMethods.cpp
@@ -422,7 +422,6 @@ void Parser::ParseLexedMethodDeclaration(LateParsedMethodDeclaration &LM) {
         DefArgResult = ParseBraceInitializer();
       } else
         DefArgResult = ParseAssignmentExpression();
-      DefArgResult = Actions.CorrectDelayedTyposInExpr(DefArgResult, Param);
       if (DefArgResult.isInvalid()) {
         Actions.ActOnParamDefaultArgumentError(Param, EqualLoc,
                                                /*DefaultArg=*/nullptr);
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index f469e466e463..647ee34efcab 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -436,7 +436,6 @@ bool Parser::ParseAttributeArgumentList(
     } else {
       Expr = ParseAssignmentExpression();
     }
-    Expr = Actions.CorrectDelayedTyposInExpr(Expr);
 
     if (Tok.is(tok::ellipsis))
       Expr = Actions.ActOnPackExpansion(Expr.get(), ConsumeToken());
@@ -472,15 +471,6 @@ bool Parser::ParseAttributeArgumentList(
     Arg++;
   }
 
-  if (SawError) {
-    // Ensure typos get diagnosed when errors were encountered while parsing the
-    // expression list.
-    for (auto &E : Exprs) {
-      ExprResult Expr = Actions.CorrectDelayedTyposInExpr(E);
-      if (Expr.isUsable())
-        E = Expr.get();
-    }
-  }
   return SawError;
 }
 
@@ -565,9 +555,7 @@ unsigned Parser::ParseAttributeArgsCommon(
               nullptr,
               Sema::ExpressionEvaluationContextRecord::EK_AttrArgument);
 
-          ExprResult ArgExpr(
-              Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression()));
-
+          ExprResult ArgExpr = ParseAssignmentExpression();
           if (ArgExpr.isInvalid()) {
             SkipUntil(tok::r_paren, StopAtSemi);
             return 0;
@@ -3212,9 +3200,7 @@ void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
       Actions, Sema::ExpressionEvaluationContext::PotentiallyEvaluated, nullptr,
       ExpressionKind::EK_AttrArgument);
 
-  ExprResult ArgExpr(
-      Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression()));
-
+  ExprResult ArgExpr = ParseAssignmentExpression();
   if (ArgExpr.isInvalid()) {
     Parens.skipToEnd();
     return;
@@ -6890,8 +6876,8 @@ void Parser::ParseDirectDeclarator(Declarator &D) {
       //   void (f()) requires true;
       Diag(Tok, diag::err_requires_clause_inside_parens);
       ConsumeToken();
-      ExprResult TrailingRequiresClause = Actions.CorrectDelayedTyposInExpr(
-         ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true));
+      ExprResult TrailingRequiresClause =
+          ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true);
       if (TrailingRequiresClause.isUsable() && D.isFunctionDeclarator() &&
           !D.hasTrailingRequiresClause())
         // We're already ill-formed if we got here but we'll accept it anyway.
@@ -7538,8 +7524,7 @@ void Parser::ParseParameterDeclarationClause(
       Diag(Tok,
            diag::err_requires_clause_on_declarator_not_declaring_a_function);
       ConsumeToken();
-      Actions.CorrectDelayedTyposInExpr(
-         ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true));
+      ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true);
     }
 
     // Remember this parsed parameter in ParamInfo.
@@ -7653,7 +7638,6 @@ void Parser::ParseParameterDeclarationClause(
             }
             DefArgResult = ParseAssignmentExpression();
           }
-          DefArgResult = Actions.CorrectDelayedTyposInExpr(DefArgResult);
           if (DefArgResult.isInvalid()) {
             Actions.ActOnParamDefaultArgumentError(Param, EqualLoc,
                                                    /*DefaultArg=*/nullptr);
@@ -7799,8 +7783,7 @@ void Parser::ParseBracketDeclarator(Declarator &D) {
     } else {
       EnterExpressionEvaluationContext Unevaluated(
           Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated);
-      NumElements =
-          Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+      NumElements = ParseAssignmentExpression();
     }
   } else {
     if (StaticLoc.isValid()) {
@@ -7937,8 +7920,8 @@ void Parser::ParseTypeofSpecifier(DeclSpec &DS) {
   bool isCastExpr;
   ParsedType CastTy;
   SourceRange CastRange;
-  ExprResult Operand = Actions.CorrectDelayedTyposInExpr(
-      ParseExprAfterUnaryExprOrTypeTrait(OpTok, isCastExpr, CastTy, CastRange));
+  ExprResult Operand =
+      ParseExprAfterUnaryExprOrTypeTrait(OpTok, isCastExpr, CastTy, CastRange);
   if (HasParens)
     DS.setTypeArgumentRange(CastRange);
 
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 0b5f56fea0b1..5f34370aeeb2 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1071,10 +1071,7 @@ SourceLocation Parser::ParseDecltypeSpecifier(DeclSpec &DS) {
       EnterExpressionEvaluationContext Unevaluated(
           Actions, Sema::ExpressionEvaluationContext::Unevaluated, nullptr,
           Sema::ExpressionEvaluationContextRecord::EK_Decltype);
-      Result = Actions.CorrectDelayedTyposInExpr(
-          ParseExpression(), /*InitDecl=*/nullptr,
-          /*RecoverUncorrectedTypos=*/false,
-          [](Expr *E) { return E->hasPlaceholderType() ? ExprError() : E; });
+      Result = ParseExpression();
       if (Result.isInvalid()) {
         DS.SetTypeSpecError();
         if (SkipUntil(tok::r_paren, StopAtSemi | StopBeforeMatch)) {
@@ -4465,8 +4462,7 @@ bool Parser::ParseCXXAssumeAttributeArg(
       Actions, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
 
   TentativeParsingAction TPA(*this);
-  ExprResult Res(
-      Actions.CorrectDelayedTyposInExpr(ParseConditionalExpression()));
+  ExprResult Res = ParseConditionalExpression();
   if (Res.isInvalid()) {
     TPA.Commit();
     SkipUntil(tok::r_paren, tok::r_square, StopAtSemi | StopBeforeMatch);
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 951a157305dd..a27a44455b62 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -183,7 +183,6 @@ ExprResult Parser::ParseConstraintExpression() {
   ExprResult LHS(ParseCastExpression(CastParseKind::AnyCastExpr));
   ExprResult Res(ParseRHSOfBinaryExpression(LHS, prec::LogicalOr));
   if (Res.isUsable() && !Actions.CheckConstraintExpression(Res.get())) {
-    Actions.CorrectDelayedTyposInExpr(Res);
     return ExprError();
   }
   return Res;
@@ -244,7 +243,6 @@ Parser::ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause) {
       // the rest of the addition expression). Try to parse the rest of it here.
       if (PossibleNonPrimary)
         E = RecoverFromNonPrimary(E, /*Note=*/!IsConstraintExpr);
-      Actions.CorrectDelayedTyposInExpr(E);
       return ExprError();
     }
     return E;
@@ -256,14 +254,11 @@ Parser::ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause) {
     SourceLocation LogicalAndLoc = ConsumeToken();
     ExprResult RHS = ParsePrimary();
     if (RHS.isInvalid()) {
-      Actions.CorrectDelayedTyposInExpr(LHS);
       return ExprError();
     }
     ExprResult Op = Actions.ActOnBinOp(getCurScope(), LogicalAndLoc,
                                        tok::ampamp, LHS.get(), RHS.get());
     if (!Op.isUsable()) {
-      Actions.CorrectDelayedTyposInExpr(RHS);
-      Actions.CorrectDelayedTyposInExpr(LHS);
       return ExprError();
     }
     LHS = Op;
@@ -281,14 +276,11 @@ Parser::ParseConstraintLogicalOrExpression(bool IsTrailingRequiresClause) {
     ExprResult RHS =
         ParseConstraintLogicalAndExpression(IsTrailingRequiresClause);
     if (!RHS.isUsable()) {
-      Actions.CorrectDelayedTyposInExpr(LHS);
       return ExprError();
     }
     ExprResult Op = Actions.ActOnBinOp(getCurScope(), LogicalOrLoc,
                                        tok::pipepipe, LHS.get(), RHS.get());
     if (!Op.isUsable()) {
-      Actions.CorrectDelayedTyposInExpr(RHS);
-      Actions.CorrectDelayedTyposInExpr(LHS);
       return ExprError();
     }
     LHS = Op;
@@ -408,7 +400,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
       }
 
       if (TernaryMiddle.isInvalid()) {
-        Actions.CorrectDelayedTyposInExpr(LHS);
         LHS = ExprError();
         TernaryMiddle = nullptr;
       }
@@ -466,11 +457,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
       RHS = ParseCastExpression(CastParseKind::AnyCastExpr);
 
     if (RHS.isInvalid()) {
-      // FIXME: Errors generated by the delayed typo correction should be
-      // printed before errors from parsing the RHS, not after.
-      Actions.CorrectDelayedTyposInExpr(LHS);
-      if (TernaryMiddle.isUsable())
-        TernaryMiddle = Actions.CorrectDelayedTyposInExpr(TernaryMiddle);
       LHS = ExprError();
     }
 
@@ -503,11 +489,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
       RHSIsInitList = false;
 
       if (RHS.isInvalid()) {
-        // FIXME: Errors generated by the delayed typo correction should be
-        // printed before errors from ParseRHSOfBinaryExpression, not after.
-        Actions.CorrectDelayedTyposInExpr(LHS);
-        if (TernaryMiddle.isUsable())
-          TernaryMiddle = Actions.CorrectDelayedTyposInExpr(TernaryMiddle);
         LHS = ExprError();
       }
 
@@ -571,17 +552,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
 
         LHS = CondOp;
       }
-      // In this case, ActOnBinOp or ActOnConditionalOp performed the
-      // CorrectDelayedTyposInExpr check.
-      if (!getLangOpts().CPlusPlus)
-        continue;
-    }
-
-    // Ensure potential typos aren't left undiagnosed.
-    if (LHS.isInvalid()) {
-      Actions.CorrectDelayedTyposInExpr(OrigLHS);
-      Actions.CorrectDelayedTyposInExpr(TernaryMiddle);
-      Actions.CorrectDelayedTyposInExpr(RHS);
     }
   }
 }
@@ -1711,7 +1681,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
       // Reject array indices starting with a lambda-expression. '[[' is
       // reserved for attributes.
       if (CheckProhibitedCXX11Attribute()) {
-        (void)Actions.CorrectDelayedTyposInExpr(LHS);
         return ExprError();
       }
       BalancedDelimiterTracker T(*this, tok::l_square);
@@ -1737,8 +1706,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
           } else {
             Idx = ParseExpression(); // May be a comma expression
           }
-          LHS = Actions.CorrectDelayedTyposInExpr(LHS);
-          Idx = Actions.CorrectDelayedTyposInExpr(Idx);
           if (Idx.isInvalid()) {
             HasError = true;
           } else {
@@ -1746,7 +1713,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
           }
         } else if (Tok.isNot(tok::r_square)) {
           if (ParseExpressionList(ArgExprs)) {
-            LHS = Actions.CorrectDelayedTyposInExpr(LHS);
             HasError = true;
           }
         }
@@ -1762,7 +1728,7 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
           // Consume ':'
           ColonLocFirst = ConsumeToken();
           if (Tok.isNot(tok::r_square))
-            Length = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+            Length = ParseExpression();
         }
       } else if (ArgExprs.size() <= 1 && getLangOpts().OpenMP) {
         ColonProtectionRAIIObject RAII(*this);
@@ -1773,7 +1739,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
               (getLangOpts().OpenMP < 50 ||
                ((Tok.isNot(tok::colon) && getLangOpts().OpenMP >= 50)))) {
             Length = ParseExpression();
-            Length = Actions.CorrectDelayedTyposInExpr(Length);
           }
         }
         if (getLangOpts().OpenMP >= 50 &&
@@ -1789,8 +1754,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
       }
 
       SourceLocation RLoc = Tok.getLocation();
-      LHS = Actions.CorrectDelayedTyposInExpr(LHS);
-
       if (!LHS.isInvalid() && !HasError && !Length.isInvalid() &&
           !Stride.isInvalid() && Tok.is(tok::r_square)) {
         if (ColonLocFirst.isValid() || ColonLocSecond.isValid()) {
@@ -1838,7 +1801,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
         SourceLocation OpenLoc = ConsumeToken();
 
         if (ParseSimpleExpressionList(ExecConfigExprs)) {
-          (void)Actions.CorrectDelayedTyposInExpr(LHS);
           LHS = ExprError();
         }
 
@@ -1889,16 +1851,12 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
                  PreferredType.enterFunctionArgument(Tok.getLocation(),
                                                      RunSignatureHelp);
                }))) {
-            (void)Actions.CorrectDelayedTyposInExpr(LHS);
             // If we got an error when parsing expression list, we don't call
             // the CodeCompleteCall handler inside the parser. So call it here
             // to make sure we get overload suggestions even when we are in the
             // middle of a parameter.
             if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
               RunSignatureHelp();
-          } else if (LHS.isInvalid()) {
-            for (auto &E : ArgExprs)
-              Actions.CorrectDelayedTyposInExpr(E);
           }
         }
       }
@@ -1913,16 +1871,16 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
                                          ArgExprs);
         SkipUntil(tok::r_paren, StopAtSemi);
       } else if (Tok.isNot(tok::r_paren)) {
-        bool HadDelayedTypo = false;
-        if (Actions.CorrectDelayedTyposInExpr(LHS).get() != LHS.get())
-          HadDelayedTypo = true;
+        bool HadErrors = false;
+        if (LHS.get()->containsErrors())
+          HadErrors = true;
         for (auto &E : ArgExprs)
-          if (Actions.CorrectDelayedTyposInExpr(E).get() != E)
-            HadDelayedTypo = true;
-        // If there were delayed typos in the LHS or ArgExprs, call SkipUntil
-        // instead of PT.consumeClose() to avoid emitting extra diagnostics for
-        // the unmatched l_paren.
-        if (HadDelayedTypo)
+          if (E->containsErrors())
+            HadErrors = true;
+        // If there were errors in the LHS or ArgExprs, call SkipUntil instead
+        // of PT.consumeClose() to avoid emitting extra diagnostics for the
+        // unmatched l_paren.
+        if (HadErrors)
           SkipUntil(tok::r_paren, StopAtSemi);
         else
           PT.consumeClose();
@@ -2050,7 +2008,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
                      /*AllowConstructorName=*/
                      getLangOpts().MicrosoftExt && SS.isNotEmpty(),
                      /*AllowDeductionGuide=*/false, &TemplateKWLoc, Name)) {
-        (void)Actions.CorrectDelayedTyposInExpr(LHS);
         LHS = ExprError();
       }
 
@@ -2921,8 +2878,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr,
     do {
       BalancedDelimiterTracker TS(*this, tok::l_square);
       TS.consumeOpen();
-      ExprResult NumElements =
-          Actions.CorrectDelayedTyposInExpr(ParseExpression());
+      ExprResult NumElements = ParseExpression();
       if (!NumElements.isUsable()) {
         ErrorFound = true;
         while (!SkipUntil(tok::r_square, tok::r_paren,
@@ -2936,7 +2892,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr,
     // Match the ')'.
     T.consumeClose();
     RParenLoc = T.getCloseLocation();
-    Result = Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+    Result = ParseAssignmentExpression();
     if (ErrorFound) {
       Result = ExprError();
     } else if (!Result.isInvalid()) {
@@ -2948,12 +2904,6 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr,
     InMessageExpressionRAIIObject InMessage(*this, false);
 
     Result = ParseExpression(TypeCastState::MaybeTypeCast);
-    if (!getLangOpts().CPlusPlus && Result.isUsable()) {
-      // Correct typos in non-C++ code earlier so that implicit-cast-like
-      // expressions are parsed correctly.
-      Result = Actions.CorrectDelayedTyposInExpr(Result);
-    }
-
     if (ExprType >= ParenParseOption::FoldExpr &&
         isFoldOperator(Tok.getKind()) && NextToken().is(tok::ellipsis)) {
       ExprType = ParenParseOption::FoldExpr;
@@ -3057,8 +3007,7 @@ ExprResult Parser::ParseGenericSelectionExpression() {
     // not evaluated."
     EnterExpressionEvaluationContext Unevaluated(
         Actions, Sema::ExpressionEvaluationContext::Unevaluated);
-    ControllingExpr =
-        Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+    ControllingExpr = ParseAssignmentExpression();
     if (ControllingExpr.isInvalid()) {
       SkipUntil(tok::r_paren, StopAtSemi);
       return ExprError();
@@ -3104,8 +3053,7 @@ ExprResult Parser::ParseGenericSelectionExpression() {
 
     // FIXME: These expressions should be parsed in a potentially potentially
     // evaluated context.
-    ExprResult ER(
-        Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression()));
+    ExprResult ER = ParseAssignmentExpression();
     if (ER.isInvalid()) {
       SkipUntil(tok::r_paren, StopAtSemi);
       return ExprError();
@@ -3199,8 +3147,7 @@ void Parser::injectEmbedTokens() {
 
 bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
                                  llvm::function_ref<void()> ExpressionStarts,
-                                 bool FailImmediatelyOnInvalidExpr,
-                                 bool EarlyTypoCorrection) {
+                                 bool FailImmediatelyOnInvalidExpr) {
   bool SawError = false;
   while (true) {
     if (ExpressionStarts)
@@ -3213,9 +3160,6 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
     } else
       Expr = ParseAssignmentExpression();
 
-    if (EarlyTypoCorrection)
-      Expr = Actions.CorrectDelayedTyposInExpr(Expr);
-
     if (Tok.is(tok::ellipsis))
       Expr = Actions.ActOnPackExpansion(Expr.get(), ConsumeToken());
     else if (Tok.is(tok::code_completion)) {
@@ -3244,14 +3188,6 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
     ConsumeToken();
     checkPotentialAngleBracketDelimiter(Comma);
   }
-  if (SawError) {
-    // Ensure typos get diagnosed when errors were encountered while parsing the
-    // expression list.
-    for (auto &E : Exprs) {
-      ExprResult Expr = Actions.CorrectDelayedTyposInExpr(E);
-      if (Expr.isUsable()) E = Expr.get();
-    }
-  }
   return SawError;
 }
 
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 55ad7f256fa8..329572047da0 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -972,8 +972,6 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
           SourceLocation StartLoc = Tok.getLocation();
           InMessageExpressionRAIIObject MaybeInMessageExpression(*this, true);
           Init = ParseInitializer();
-          if (!Init.isInvalid())
-            Init = Actions.CorrectDelayedTyposInExpr(Init.get());
 
           if (Tok.getLocation() != StartLoc) {
             // Back out the lexing of the token after the initializer.
@@ -1065,8 +1063,6 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
     // enclosing the lambda-expression, rather than in the context of the
     // lambda-expression itself.
     ParsedType InitCaptureType;
-    if (Init.isUsable())
-      Init = Actions.CorrectDelayedTyposInExpr(Init.get());
     if (Init.isUsable()) {
       NonTentativeAction([&] {
         // Get the pointer and store it in an lvalue, so we can use it as an
@@ -3202,8 +3198,7 @@ ExprResult Parser::ParseRequiresExpression() {
         //             cv-qualifier-seq[opt] abstract-declarator[opt]
         BalancedDelimiterTracker ExprBraces(*this, tok::l_brace);
         ExprBraces.consumeOpen();
-        ExprResult Expression =
-            Actions.CorrectDelayedTyposInExpr(ParseExpression());
+        ExprResult Expression = ParseExpression();
         if (!Expression.isUsable()) {
           ExprBraces.skipToEnd();
           SkipUntil(tok::semi, tok::r_brace, SkipUntilFlags::StopBeforeMatch);
@@ -3306,8 +3301,7 @@ ExprResult Parser::ParseRequiresExpression() {
             // C++ [expr.prim.req.nested]
             //     nested-requirement:
             //         'requires' constraint-expression ';'
-            ExprResult ConstraintExpr =
-                Actions.CorrectDelayedTyposInExpr(ParseConstraintExpression());
+            ExprResult ConstraintExpr = ParseConstraintExpression();
             if (ConstraintExpr.isInvalid() || !ConstraintExpr.isUsable()) {
               SkipUntil(tok::semi, tok::r_brace,
                         SkipUntilFlags::StopBeforeMatch);
@@ -3373,8 +3367,7 @@ ExprResult Parser::ParseRequiresExpression() {
         //     simple-requirement:
         //         expression ';'
         SourceLocation StartLoc = Tok.getLocation();
-        ExprResult Expression =
-            Actions.CorrectDelayedTyposInExpr(ParseExpression());
+        ExprResult Expression = ParseExpression();
         if (!Expression.isUsable()) {
           SkipUntil(tok::semi, tok::r_brace, SkipUntilFlags::StopBeforeMatch);
           break;
diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp
index df8372b995e5..a3be3744a932 100644
--- a/clang/lib/Parse/ParseInit.cpp
+++ b/clang/lib/Parse/ParseInit.cpp
@@ -477,8 +477,6 @@ ExprResult Parser::ParseBraceInitializer() {
     if (Tok.is(tok::ellipsis))
       SubElt = Actions.ActOnPackExpansion(SubElt.get(), ConsumeToken());
 
-    SubElt = Actions.CorrectDelayedTyposInExpr(SubElt.get());
-
     // If we couldn't parse the subelement, bail out.
     if (SubElt.isUsable()) {
       InitExprs.push_back(SubElt.get());
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp
index 6afb7809d3cd..8ef16a4d3808 100644
--- a/clang/lib/Parse/ParseObjc.cpp
+++ b/clang/lib/Parse/ParseObjc.cpp
@@ -2629,10 +2629,7 @@ bool Parser::ParseObjCXXMessageReceiver(bool &IsExpr, void *&TypeOrExpr) {
   if (!Tok.isSimpleTypeSpecifier(getLangOpts())) {
     //   objc-receiver:
     //     expression
-    // Make sure any typos in the receiver are corrected or diagnosed, so that
-    // proper recovery can happen. FIXME: Perhaps filter the corrected expr to
-    // only the things that are valid ObjC receivers?
-    ExprResult Receiver = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+    ExprResult Receiver = ParseExpression();
     if (Receiver.isInvalid())
       return true;
 
@@ -2809,7 +2806,7 @@ ExprResult Parser::ParseObjCMessageExpression() {
   }
 
   // Otherwise, an arbitrary expression can be the receiver of a send.
-  ExprResult Res = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+  ExprResult Res = ParseExpression();
   if (Res.isInvalid()) {
     SkipUntil(tok::r_square, StopAtSemi);
     return Res;
@@ -2930,8 +2927,6 @@ Parser::ParseObjCMessageExpressionBody(SourceLocation LBracLoc,
       SourceLocation commaLoc = ConsumeToken(); // Eat the ','.
       ///  Parse the expression after ','
       ExprResult Res(ParseAssignmentExpression());
-      if (Tok.is(tok::colon))
-        Res = Actions.CorrectDelayedTyposInExpr(Res);
       if (Res.isInvalid()) {
         if (Tok.is(tok::colon)) {
           Diag(commaLoc, diag::note_extra_comma_message_arg) <<
@@ -3078,10 +3073,6 @@ ExprResult Parser::ParseObjCArrayLiteral(SourceLocation AtLoc) {
       return Res;
     }
 
-    Res = Actions.CorrectDelayedTyposInExpr(Res.get());
-    if (Res.isInvalid())
-      HasInvalidEltExpr = true;
-
     // Parse the ellipsis that indicates a pack expansion.
     if (Tok.is(tok::ellipsis))
       Res = Actions.ActOnPackExpansion(Res.get(), ConsumeToken());
@@ -3108,7 +3099,6 @@ ExprResult Parser::ParseObjCArrayLiteral(SourceLocation AtLoc) {
 ExprResult Parser::ParseObjCDictionaryLiteral(SourceLocation AtLoc) {
   SmallVector<ObjCDictionaryElement, 4> Elements; // dictionary elements.
   ConsumeBrace(); // consume the l_square.
-  bool HasInvalidEltExpr = false;
   while (Tok.isNot(tok::r_brace)) {
     // Parse the comma separated key : value expressions.
     ExprResult KeyExpr;
@@ -3138,12 +3128,6 @@ ExprResult Parser::ParseObjCDictionaryLiteral(SourceLocation AtLoc) {
       return ValueExpr;
     }
 
-    // Check the key and value for possible typos
-    KeyExpr = Actions.CorrectDelayedTyposInExpr(KeyExpr.get());
-    ValueExpr = Actions.CorrectDelayedTyposInExpr(ValueExpr.get());
-    if (KeyExpr.isInvalid() || ValueExpr.isInvalid())
-      HasInvalidEltExpr = true;
-
     // Parse the ellipsis that designates this as a pack expansion. Do not
     // ActOnPackExpansion here, leave it to template instantiation time where
     // we can get better diagnostics.
@@ -3163,9 +3147,6 @@ ExprResult Parser::ParseObjCDictionaryLiteral(SourceLocation AtLoc) {
   }
   SourceLocation EndLoc = ConsumeBrace();
 
-  if (HasInvalidEltExpr)
-    return ExprError();
-
   // Create the ObjCDictionaryLiteral.
   return Actions.ObjC().BuildObjCDictionaryLiteral(SourceRange(AtLoc, EndLoc),
                                                    Elements);
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index ca4f878464c4..f2849c4eac7c 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -653,7 +653,7 @@ ExprResult Parser::ParseOpenACCConditionExpr() {
   // it does in an if/while/etc (See ParseCXXCondition), however as it was
   // written with Fortran/C in mind, we're going to assume it just means an
   // 'expression evaluating to boolean'.
-  ExprResult ER = getActions().CorrectDelayedTyposInExpr(ParseExpression());
+  ExprResult ER = ParseExpression();
 
   if (!ER.isUsable())
     return ER;
@@ -761,12 +761,6 @@ Parser::ParseOpenACCIntExpr(OpenACCDirectiveKind DK, OpenACCClauseKind CK,
   if (!ER.isUsable())
     return {ER, OpenACCParseCanContinue::Cannot};
 
-  // Parsing can continue after the initial assignment expression parsing, so
-  // even if there was a typo, we can continue.
-  ER = getActions().CorrectDelayedTyposInExpr(ER);
-  if (!ER.isUsable())
-    return {ER, OpenACCParseCanContinue::Can};
-
   return {getActions().OpenACC().ActOnIntExpr(DK, CK, Loc, ER.get()),
           OpenACCParseCanContinue::Can};
 }
@@ -836,8 +830,7 @@ ExprResult Parser::ParseOpenACCSizeExpr(OpenACCClauseKind CK) {
     return getActions().OpenACC().ActOnOpenACCAsteriskSizeExpr(AsteriskLoc);
   }
 
-  ExprResult SizeExpr =
-      getActions().CorrectDelayedTyposInExpr(ParseConstantExpression());
+  ExprResult SizeExpr = ParseConstantExpression();
 
   if (!SizeExpr.isUsable())
     return SizeExpr;
@@ -891,8 +884,7 @@ Parser::OpenACCGangArgRes Parser::ParseOpenACCGangArg(SourceLocation GangLoc) {
     ConsumeToken();
     // Parse this as a const-expression, and we'll check its integer-ness/value
     // in CheckGangExpr.
-    ExprResult Res =
-        getActions().CorrectDelayedTyposInExpr(ParseConstantExpression());
+    ExprResult Res = ParseConstantExpression();
     return {OpenACCGangKind::Dim, Res};
   }
 
@@ -1089,8 +1081,7 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
     case OpenACCClauseKind::Collapse: {
       bool HasForce = tryParseAndConsumeSpecialTokenKind(
           *this, OpenACCSpecialTokenKind::Force, ClauseKind);
-      ExprResult LoopCount =
-          getActions().CorrectDelayedTyposInExpr(ParseConstantExpression());
+      ExprResult LoopCount = ParseConstantExpression();
       if (LoopCount.isInvalid()) {
         Parens.skipToEnd();
         return OpenACCCanContinue();
@@ -1387,7 +1378,7 @@ ExprResult Parser::ParseOpenACCIDExpression() {
                                     /*isAddressOfOperand=*/false);
   }
 
-  return getActions().CorrectDelayedTyposInExpr(Res);
+  return Res;
 }
 
 std::variant<std::monostate, clang::StringLiteral *, IdentifierInfo *>
@@ -1414,9 +1405,8 @@ Parser::ParseOpenACCBindClauseArgument() {
     return std::monostate{};
   }
 
-  ExprResult Res =
-      getActions().CorrectDelayedTyposInExpr(ParseStringLiteralExpression(
-          /*AllowUserDefinedLiteral=*/false, /*Unevaluated=*/true));
+  ExprResult Res = ParseStringLiteralExpression(
+      /*AllowUserDefinedLiteral=*/false, /*Unevaluated=*/true);
   if (!Res.isUsable())
     return std::monostate{};
   return cast<StringLiteral>(Res.get());
@@ -1430,10 +1420,6 @@ Parser::OpenACCVarParseResult Parser::ParseOpenACCVar(OpenACCDirectiveKind DK,
   if (!Res.isUsable())
     return {Res, OpenACCParseCanContinue::Cannot};
 
-  Res = getActions().CorrectDelayedTyposInExpr(Res.get());
-  if (!Res.isUsable())
-    return {Res, OpenACCParseCanContinue::Can};
-
   Res = getActions().OpenACC().ActOnVar(DK, CK, Res.get());
 
   return {Res, OpenACCParseCanContinue::Can};
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index b69c3abe0b32..def1a52ba7d4 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3600,8 +3600,7 @@ bool Parser::ParseOMPInteropInfo(OMPInteropInfo &InteropInfo,
       while (Tok.isNot(tok::r_paren)) {
         SourceLocation Loc = Tok.getLocation();
         ExprResult LHS = ParseCastExpression(CastParseKind::AnyCastExpr);
-        ExprResult PTExpr = Actions.CorrectDelayedTyposInExpr(
-            ParseRHSOfBinaryExpression(LHS, prec::Conditional));
+        ExprResult PTExpr = ParseRHSOfBinaryExpression(LHS, prec::Conditional);
         PTExpr = Actions.ActOnFinishFullExpr(PTExpr.get(), Loc,
                                              /*DiscardedValue=*/false);
         if (PTExpr.isUsable()) {
@@ -3662,8 +3661,7 @@ OMPClause *Parser::ParseOpenMPInteropClause(OpenMPClauseKind Kind,
 
   // Parse the variable.
   SourceLocation VarLoc = Tok.getLocation();
-  ExprResult InteropVarExpr =
-      Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+  ExprResult InteropVarExpr = ParseAssignmentExpression();
   if (!InteropVarExpr.isUsable()) {
     SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end,
               StopBeforeMatch);
@@ -4288,8 +4286,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() {
     // Parse <begin>
     SourceLocation Loc = Tok.getLocation();
     ExprResult LHS = ParseCastExpression(CastParseKind::AnyCastExpr);
-    ExprResult Begin = Actions.CorrectDelayedTyposInExpr(
-        ParseRHSOfBinaryExpression(LHS, prec::Conditional));
+    ExprResult Begin = ParseRHSOfBinaryExpression(LHS, prec::Conditional);
     Begin = Actions.ActOnFinishFullExpr(Begin.get(), Loc,
                                         /*DiscardedValue=*/false);
     // Parse ':'.
@@ -4300,8 +4297,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() {
     // Parse <end>
     Loc = Tok.getLocation();
     LHS = ParseCastExpression(CastParseKind::AnyCastExpr);
-    ExprResult End = Actions.CorrectDelayedTyposInExpr(
-        ParseRHSOfBinaryExpression(LHS, prec::Conditional));
+    ExprResult End = ParseRHSOfBinaryExpression(LHS, prec::Conditional);
     End = Actions.ActOnFinishFullExpr(End.get(), Loc,
                                       /*DiscardedValue=*/false);
 
@@ -4314,8 +4310,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() {
       // Parse <step>
       Loc = Tok.getLocation();
       LHS = ParseCastExpression(CastParseKind::AnyCastExpr);
-      Step = Actions.CorrectDelayedTyposInExpr(
-          ParseRHSOfBinaryExpression(LHS, prec::Conditional));
+      Step = ParseRHSOfBinaryExpression(LHS, prec::Conditional);
       Step = Actions.ActOnFinishFullExpr(Step.get(), Loc,
                                          /*DiscardedValue=*/false);
     }
@@ -4797,7 +4792,6 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
       EnterScope(Scope::OpenMPDirectiveScope | Scope::DeclScope);
       Tail = ParseOpenMPIteratorsExpr();
     }
-    Tail = Actions.CorrectDelayedTyposInExpr(Tail);
     Tail = Actions.ActOnFinishFullExpr(Tail.get(), T.getOpenLocation(),
                                        /*DiscardedValue=*/false);
     if (Tail.isUsable() || Data.AllocateAlignment) {
@@ -4858,8 +4852,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
     ColonProtectionRAIIObject ColonRAII(*this, MayHaveTail);
     if (!ParseOpenMPReservedLocator(Kind, Data, getLangOpts())) {
       // Parse variable
-      ExprResult VarExpr =
-          Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+      ExprResult VarExpr = ParseAssignmentExpression();
       if (VarExpr.isUsable()) {
         Vars.push_back(VarExpr.get());
       } else {
@@ -4896,6 +4889,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
     SourceLocation ELoc = ConsumeToken();
 
     if (getLangOpts().OpenMP >= 52 && Kind == OMPC_linear) {
+      bool Malformed = false;
       while (Tok.isNot(tok::r_paren)) {
         if (Tok.is(tok::identifier)) {
           // identifier could be a linear kind (val, uval, ref) or step
@@ -4932,6 +4926,11 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
             ModifierFound = true;
           } else {
             StepFound = parseStepSize(*this, Data, Kind, Tok.getLocation());
+            if (!StepFound) {
+              Malformed = true;
+              SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end,
+                        StopBeforeMatch);
+            }
           }
         } else {
           // parse an integer expression as step size
@@ -4943,7 +4942,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
         if (Tok.is(tok::r_paren) || Tok.is(tok::annot_pragma_openmp_end))
           break;
       }
-      if (!StepFound && !ModifierFound)
+      if (!Malformed && !StepFound && !ModifierFound)
         Diag(ELoc, diag::err_expected_expression);
     } else {
       // for OMPC_aligned and OMPC_linear (with OpenMP <= 5.1)
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index c00759893b0c..434ea6844281 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -602,7 +602,7 @@ StmtResult Parser::ParseSEHExceptBlock(SourceLocation ExceptLoc) {
   {
     ParseScopeFlags FilterScope(this, getCurScope()->getFlags() |
                                           Scope::SEHFilterScope);
-    FilterExpr = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+    FilterExpr = ParseExpression();
   }
 
   if (getLangOpts().Borland) {
@@ -1832,11 +1832,7 @@ StmtResult Parser::ParseDoStatement() {
 
   SourceLocation Start = Tok.getLocation();
   ExprResult Cond = ParseExpression();
-  // Correct the typos in condition before closing the scope.
-  if (Cond.isUsable())
-    Cond = Actions.CorrectDelayedTyposInExpr(Cond, /*InitDecl=*/nullptr,
-                                             /*RecoverUncorrectedTypos=*/true);
-  else {
+  if (!Cond.isUsable()) {
     if (!Tok.isOneOf(tok::r_paren, tok::r_square, tok::r_brace))
       SkipUntil(tok::semi);
     Cond = Actions.CreateRecoveryExpr(
@@ -2018,7 +2014,7 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) {
     }
   } else {
     ProhibitAttributes(attrs);
-    Value = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+    Value = ParseExpression();
 
     ForEach = isTokIdentifier_in();
 
@@ -2177,12 +2173,10 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) {
   StmtResult ForEachStmt;
 
   if (ForRangeInfo.ParsedForRangeDecl()) {
-    ExprResult CorrectedRange =
-        Actions.CorrectDelayedTyposInExpr(ForRangeInfo.RangeExpr.get());
     ForRangeStmt = Actions.ActOnCXXForRangeStmt(
         getCurScope(), ForLoc, CoawaitLoc, FirstPart.get(),
-        ForRangeInfo.LoopVar.get(), ForRangeInfo.ColonLoc, CorrectedRange.get(),
-        T.getCloseLocation(), Sema::BFRK_Build,
+        ForRangeInfo.LoopVar.get(), ForRangeInfo.ColonLoc,
+        ForRangeInfo.RangeExpr.get(), T.getCloseLocation(), Sema::BFRK_Build,
         ForRangeInfo.LifetimeExtendTemps);
   } else if (ForEach) {
     // Similarly, we need to do the semantic analysis for a for-range
diff --git a/clang/lib/Parse/ParseStmtAsm.cpp b/clang/lib/Parse/ParseStmtAsm.cpp
index f2417479a0e7..182907df5607 100644
--- a/clang/lib/Parse/ParseStmtAsm.cpp
+++ b/clang/lib/Parse/ParseStmtAsm.cpp
@@ -864,7 +864,7 @@ bool Parser::ParseAsmOperandsOpt(SmallVectorImpl<IdentifierInfo *> &Names,
     // Read the parenthesized expression.
     BalancedDelimiterTracker T(*this, tok::l_paren);
     T.consumeOpen();
-    ExprResult Res = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+    ExprResult Res = ParseExpression();
     T.consumeClose();
     if (Res.isInvalid()) {
       SkipUntil(tok::r_paren, StopAtSemi);
diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp
index d3c9ca029c9a..a16dbe95b788 100644
--- a/clang/lib/Parse/ParseTemplate.cpp
+++ b/clang/lib/Parse/ParseTemplate.cpp
@@ -296,8 +296,7 @@ Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo,
     return nullptr;
   }
 
-  ExprResult ConstraintExprResult =
-      Actions.CorrectDelayedTyposInExpr(ParseConstraintExpression());
+  ExprResult ConstraintExprResult = ParseConstraintExpression();
   if (ConstraintExprResult.isInvalid()) {
     SkipUntil(tok::semi);
     if (D)
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 9826abc0c3b4..42ebf2a508a2 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1227,15 +1227,6 @@ void Sema::ActOnEndOfTranslationUnitFragment(TUFragmentKind Kind) {
   assert(LateParsedInstantiations.empty() &&
          "end of TU template instantiation should not create more "
          "late-parsed templates");
-
-  // Report diagnostics for uncorrected delayed typos. Ideally all of them
-  // should have been corrected by that time, but it is very hard to cover all
-  // cases in practice.
-  for (const auto &Typo : DelayedTypos) {
-    // We pass an empty TypoCorrection to indicate no correction was performed.
-    Typo.second.DiagHandler(TypoCorrection());
-  }
-  DelayedTypos.clear();
 }
 
 void Sema::ActOnEndOfTranslationUnit() {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 8f8e1ceb7197..69276ce418fa 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2648,8 +2648,6 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     bool IsDelete = BuiltinID == Builtin::BI__builtin_operator_delete;
     ExprResult Res =
         BuiltinOperatorNewDeleteOverloaded(TheCallResult, IsDelete);
-    if (Res.isInvalid())
-      CorrectDelayedTyposInExpr(TheCallResult.get());
     return Res;
   }
   case Builtin::BI__builtin_dump_struct:
diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index 425b32e53a7b..a1389c6c034b 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -309,15 +309,6 @@ static ExprResult buildMemberCall(Sema &S, Expr *Base, SourceLocation Loc,
   if (Result.isInvalid())
     return ExprError();
 
-  // We meant exactly what we asked for. No need for typo correction.
-  if (auto *TE = dyn_cast<TypoExpr>(Result.get())) {
-    S.clearDelayedTypo(TE);
-    S.Diag(Loc, diag::err_no_member)
-        << NameInfo.getName() << Base->getType()->getAsCXXRecordDecl()
-        << Base->getSourceRange();
-    return ExprError();
-  }
-
   auto EndLoc = Args.empty() ? Loc : Args.back()->getEndLoc();
   return S.BuildCallExpr(nullptr, Result.get(), Loc, Args, EndLoc, nullptr);
 }
@@ -811,7 +802,6 @@ ExprResult Sema::ActOnCoawaitExpr(Scope *S, SourceLocation Loc, Expr *E) {
     return ExprError();
 
   if (!ActOnCoroutineBodyStart(S, Loc, "co_await")) {
-    CorrectDelayedTyposInExpr(E);
     return ExprError();
   }
 
@@ -970,7 +960,6 @@ ExprResult Sema::ActOnCoyieldExpr(Scope *S, SourceLocation Loc, Expr *E) {
     return ExprError();
 
   if (!ActOnCoroutineBodyStart(S, Loc, "co_yield")) {
-    CorrectDelayedTyposInExpr(E);
     return ExprError();
   }
 
@@ -1025,7 +1014,6 @@ ExprResult Sema::BuildCoyieldExpr(SourceLocation Loc, Expr *E) {
 
 StmtResult Sema::ActOnCoreturnStmt(Scope *S, SourceLocation Loc, Expr *E) {
   if (!ActOnCoroutineBodyStart(S, Loc, "co_return")) {
-    CorrectDelayedTyposInExpr(E);
     return StmtError();
   }
   return BuildCoreturnStmt(Loc, E);
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index bbd63372c168..c152f406b497 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13584,7 +13584,6 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
   // If there is no declaration, there was an error parsing it.  Just ignore
   // the initializer.
   if (!RealDecl) {
-    CorrectDelayedTyposInExpr(Init, dyn_cast_or_null<VarDecl>(RealDecl));
     return;
   }
 
@@ -13607,12 +13606,8 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
   }
 
   if (VDecl->isInvalidDecl()) {
-    ExprResult Res = CorrectDelayedTyposInExpr(Init, VDecl);
-    SmallVector<Expr *> SubExprs;
-    if (Res.isUsable())
-      SubExprs.push_back(Res.get());
     ExprResult Recovery =
-        CreateRecoveryExpr(Init->getBeginLoc(), Init->getEndLoc(), SubExprs);
+        CreateRecoveryExpr(Init->getBeginLoc(), Init->getEndLoc(), {Init});
     if (Expr *E = Recovery.get())
       VDecl->setInit(E);
     return;
@@ -13627,23 +13622,12 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
 
   // C++11 [decl.spec.auto]p6. Deduce the type which 'auto' stands in for.
   if (VDecl->getType()->isUndeducedType()) {
-    // Attempt typo correction early so that the type of the init expression can
-    // be deduced based on the chosen correction if the original init contains a
-    // TypoExpr.
-    ExprResult Res = CorrectDelayedTyposInExpr(Init, VDecl);
-    if (!Res.isUsable()) {
-      // There are unresolved typos in Init, just drop them.
-      // FIXME: improve the recovery strategy to preserve the Init.
-      RealDecl->setInvalidDecl();
-      return;
-    }
-    if (Res.get()->containsErrors()) {
+    if (Init->containsErrors()) {
       // Invalidate the decl as we don't know the type for recovery-expr yet.
       RealDecl->setInvalidDecl();
-      VDecl->setInit(Res.get());
+      VDecl->setInit(Init);
       return;
     }
-    Init = Res.get();
 
     if (DeduceVariableDeclarationType(VDecl, DirectInit, Init))
       return;
@@ -13789,23 +13773,6 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
       InitializedFromParenListExpr = true;
     }
 
-    // Try to correct any TypoExprs in the initialization arguments.
-    for (size_t Idx = 0; Idx < Args.size(); ++Idx) {
-      ExprResult Res = CorrectDelayedTyposInExpr(
-          Args[Idx], VDecl, /*RecoverUncorrectedTypos=*/true,
-          [this, Entity, Kind](Expr *E) {
-            InitializationSequence Init(*this, Entity, Kind, MultiExprArg(E));
-            return Init.Failed() ? ExprError() : E;
-          });
-      if (!Res.isUsable()) {
-        VDecl->setInvalidDecl();
-      } else if (Res.get() != Args[Idx]) {
-        Args[Idx] = Res.get();
-      }
-    }
-    if (VDecl->isInvalidDecl())
-      return;
-
     InitializationSequence InitSeq(*this, Entity, Kind, Args,
                                    /*TopLevelOfInitList=*/false,
                                    /*TreatUnavailableAsInvalid=*/false);
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 39d4d49a0fe7..31e283433674 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -4154,10 +4154,6 @@ ExprResult Sema::ActOnRequiresClause(ExprResult ConstraintExpr) {
   if (ConstraintExpr.isInvalid())
     return ExprError();
 
-  ConstraintExpr = CorrectDelayedTyposInExpr(ConstraintExpr);
-  if (ConstraintExpr.isInvalid())
-    return ExprError();
-
   if (DiagnoseUnexpandedParameterPack(ConstraintExpr.get(),
                                       UPPC_RequiresClause))
     return ExprError();
@@ -4207,23 +4203,20 @@ void Sema::ActOnFinishCXXInClassMemberInitializer(Decl *D,
     return;
   }
 
-  ExprResult Init = CorrectDelayedTyposInExpr(InitExpr, /*InitDecl=*/nullptr,
-                                              /*RecoverUncorrectedTypos=*/true);
-  assert(Init.isUsable() && "Init should at least have a RecoveryExpr");
-  if (!FD->getType()->isDependentType() && !Init.get()->isTypeDependent()) {
-    Init = ConvertMemberDefaultInitExpression(FD, Init.get(), InitLoc);
+  if (!FD->getType()->isDependentType() && !InitExpr.get()->isTypeDependent()) {
+    InitExpr = ConvertMemberDefaultInitExpression(FD, InitExpr.get(), InitLoc);
     // C++11 [class.base.init]p7:
     //   The initialization of each base and member constitutes a
     //   full-expression.
-    if (!Init.isInvalid())
-      Init = ActOnFinishFullExpr(Init.get(), /*DiscarededValue=*/false);
-    if (Init.isInvalid()) {
+    if (!InitExpr.isInvalid())
+      InitExpr = ActOnFinishFullExpr(InitExpr.get(), /*DiscarededValue=*/false);
+    if (InitExpr.isInvalid()) {
       FD->setInvalidDecl();
       return;
     }
   }
 
-  FD->setInClassInitializer(Init.get());
+  FD->setInClassInitializer(InitExpr.get());
 }
 
 /// Find the direct and/or virtual base specifiers that
@@ -4393,13 +4386,7 @@ Sema::BuildMemInitializer(Decl *ConstructorD,
                           SourceLocation IdLoc,
                           Expr *Init,
                           SourceLocation EllipsisLoc) {
-  ExprResult Res = CorrectDelayedTyposInExpr(Init, /*InitDecl=*/nullptr,
-                                             /*RecoverUncorrectedTypos=*/true);
-  if (!Res.isUsable())
-    return true;
-  Init = Res.get();
-
-  if (!ConstructorD)
+  if (!ConstructorD || !Init)
     return true;
 
   AdjustDeclIfTemplate(ConstructorD);
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index c692f824da42..0a6cea8869c1 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1368,7 +1368,6 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
   case Expr::UnaryExprOrTypeTraitExprClass:
   case Expr::UnresolvedLookupExprClass:
   case Expr::UnresolvedMemberExprClass:
-  case Expr::TypoExprClass:
     // FIXME: Many of the above can throw.
     return CT_Cannot;
 
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index c7abbbd6993d..b7031bc8c022 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2544,8 +2544,7 @@ bool Sema::DiagnoseDependentMemberLookup(const LookupResult &R) {
 bool Sema::DiagnoseEmptyLookup(Scope *S, CXXScopeSpec &SS, LookupResult &R,
                                CorrectionCandidateCallback &CCC,
                                TemplateArgumentListInfo *ExplicitTemplateArgs,
-                               ArrayRef<Expr *> Args, DeclContext *LookupCtx,
-                               TypoExpr **Out) {
+                               ArrayRef<Expr *> Args, DeclContext *LookupCtx) {
   DeclarationName Name = R.getLookupName();
   SourceRange NameRange = R.getLookupNameInfo().getSourceRange();
 
@@ -2604,21 +2603,9 @@ bool Sema::DiagnoseEmptyLookup(Scope *S, CXXScopeSpec &SS, LookupResult &R,
 
   // We didn't find anything, so try to correct for a typo.
   TypoCorrection Corrected;
-  if (S && Out) {
-    assert(!ExplicitTemplateArgs &&
-           "Diagnosing an empty lookup with explicit template args!");
-    *Out = CorrectTypoDelayed(
-        R.getLookupNameInfo(), R.getLookupKind(), S, &SS, CCC,
-        [=](const TypoCorrection &TC) {
-          emitEmptyLookupTypoDiagnostic(TC, *this, SS, Name, NameRange,
-                                        diagnostic, diagnostic_suggest);
-        },
-        nullptr, CorrectTypoKind::ErrorRecovery, LookupCtx);
-    if (*Out)
-      return true;
-  } else if (S && (Corrected = CorrectTypo(
-                       R.getLookupNameInfo(), R.getLookupKind(), S, &SS, CCC,
-                       CorrectTypoKind::ErrorRecovery, LookupCtx))) {
+  if (S && (Corrected =
+                CorrectTypo(R.getLookupNameInfo(), R.getLookupKind(), S, &SS,
+                            CCC, CorrectTypoKind::ErrorRecovery, LookupCtx))) {
     std::string CorrectedStr(Corrected.getAsString(getLangOpts()));
     bool DroppedSpecifier =
         Corrected.WillReplaceSpecifier() && Name.getAsString() == CorrectedStr;
@@ -2880,7 +2867,6 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
 
     // If this name wasn't predeclared and if this is not a function
     // call, diagnose the problem.
-    TypoExpr *TE = nullptr;
     DefaultFilterCCC DefaultValidator(II, SS.isValid() ? SS.getScopeRep()
                                                        : nullptr);
     DefaultValidator.IsAddressOfOperand = IsAddressOfOperand;
@@ -2896,29 +2882,8 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
     // a template name, but we happen to have always already looked up the name
     // before we get here if it must be a template name.
     if (DiagnoseEmptyLookup(S, SS, R, CCC ? *CCC : DefaultValidator, nullptr,
-                            {}, nullptr, &TE)) {
-      if (TE && KeywordReplacement) {
-        auto &State = getTypoExprState(TE);
-        auto BestTC = State.Consumer->getNextCorrection();
-        if (BestTC.isKeyword()) {
-          auto *II = BestTC.getCorrectionAsIdentifierInfo();
-          if (State.DiagHandler)
-            State.DiagHandler(BestTC);
-          KeywordReplacement->startToken();
-          KeywordReplacement->setKind(II->getTokenID());
-          KeywordReplacement->setIdentifierInfo(II);
-          KeywordReplacement->setLocation(BestTC.getCorrectionRange().getBegin());
-          // Clean up the state associated with the TypoExpr, since it has
-          // now been diagnosed (without a call to CorrectDelayedTyposInExpr).
-          clearDelayedTypo(TE);
-          // Signal that a correction to a keyword was performed by returning a
-          // valid-but-null ExprResult.
-          return (Expr*)nullptr;
-        }
-        State.Consumer->resetCorrectionStream();
-      }
-      return TE ? TE : ExprError();
-    }
+                            {}, nullptr))
+      return ExprError();
 
     assert(!R.empty() &&
            "DiagnoseEmptyLookup returned false but added no results");
@@ -7009,40 +6974,6 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl,
                          CurFPFeatureOverrides(), NumParams, UsesADL);
   }
 
-  if (!Context.isDependenceAllowed()) {
-    // Forget about the nulled arguments since typo correction
-    // do not handle them well.
-    TheCall->shrinkNumArgs(Args.size());
-    // C cannot always handle TypoExpr nodes in builtin calls and direct
-    // function calls as their argument checking don't necessarily handle
-    // dependent types properly, so make sure any TypoExprs have been
-    // dealt with.
-    ExprResult Result = CorrectDelayedTyposInExpr(TheCall);
-    if (!Result.isUsable()) return ExprError();
-    CallExpr *TheOldCall = TheCall;
-    TheCall = dyn_cast<CallExpr>(Result.get());
-    bool CorrectedTypos = TheCall != TheOldCall;
-    if (!TheCall) return Result;
-    Args = llvm::ArrayRef(TheCall->getArgs(), TheCall->getNumArgs());
-
-    // A new call expression node was created if some typos were corrected.
-    // However it may not have been constructed with enough storage. In this
-    // case, rebuild the node with enough storage. The waste of space is
-    // immaterial since this only happens when some typos were corrected.
-    if (CorrectedTypos && Args.size() < NumParams) {
-      if (Config)
-        TheCall = CUDAKernelCallExpr::Create(
-            Context, Fn, cast<CallExpr>(Config), Args, ResultTy, VK_PRValue,
-            RParenLoc, CurFPFeatureOverrides(), NumParams);
-      else
-        TheCall =
-            CallExpr::Create(Context, Fn, Args, ResultTy, VK_PRValue, RParenLoc,
-                             CurFPFeatureOverrides(), NumParams, UsesADL);
-    }
-    // We can now handle the nulled arguments for the default arguments.
-    TheCall->setNumArgsUnsafe(std::max<unsigned>(Args.size(), NumParams));
-  }
-
   // Bail out early if calling a builtin with custom type checking.
   if (BuiltinID && Context.BuiltinInfo.hasCustomTypechecking(BuiltinID)) {
     ExprResult E = CheckBuiltinFunctionCall(FDecl, BuiltinID, TheCall);
@@ -7933,12 +7864,6 @@ Sema::ActOnCastExpr(Scope *S, SourceLocation LParenLoc,
   if (getLangOpts().CPlusPlus) {
     // Check that there are no default arguments (C++ only).
     CheckExtraCXXDefaultArguments(D);
-  } else {
-    // Make sure any TypoExprs have been dealt with.
-    ExprResult Res = CorrectDelayedTyposInExpr(CastExpr);
-    if (!Res.isUsable())
-      return ExprError();
-    CastExpr = Res.get();
   }
 
   checkUnusedDeclAttributes(D);
@@ -8984,30 +8909,6 @@ ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc,
                                     SourceLocation ColonLoc,
                                     Expr *CondExpr, Expr *LHSExpr,
                                     Expr *RHSExpr) {
-  if (!Context.isDependenceAllowed()) {
-    // C cannot handle TypoExpr nodes in the condition because it
-    // doesn't handle dependent types properly, so make sure any TypoExprs have
-    // been dealt with before checking the operands.
-    ExprResult CondResult = CorrectDelayedTyposInExpr(CondExpr);
-    ExprResult LHSResult = CorrectDelayedTyposInExpr(LHSExpr);
-    ExprResult RHSResult = CorrectDelayedTyposInExpr(RHSExpr);
-
-    if (!CondResult.isUsable())
-      return ExprError();
-
-    if (LHSExpr) {
-      if (!LHSResult.isUsable())
-        return ExprError();
-    }
-
-    if (!RHSResult.isUsable())
-      return ExprError();
-
-    CondExpr = CondResult.get();
-    LHSExpr = LHSResult.get();
-    RHSExpr = RHSResult.get();
-  }
-
   // If this is the gnu "x ?: y" extension, analyze the types as though the LHS
   // was the condition.
   OpaqueValueExpr *opaqueValue = nullptr;
@@ -15068,28 +14969,6 @@ static ExprResult convertHalfVecBinOp(Sema &S, ExprResult LHS, ExprResult RHS,
   return convertVector(BO, ResultTy->castAs<VectorType>()->getElementType(), S);
 }
 
-static std::pair<ExprResult, ExprResult>
-CorrectDelayedTyposInBinOp(Sema &S, BinaryOperatorKind Opc, Expr *LHSExpr,
-                           Expr *RHSExpr) {
-  ExprResult LHS = LHSExpr, RHS = RHSExpr;
-  if (!S.Context.isDependenceAllowed()) {
-    // C cannot handle TypoExpr nodes on either side of a binop because it
-    // doesn't handle dependent types properly, so make sure any TypoExprs have
-    // been dealt with before checking the operands.
-    LHS = S.CorrectDelayedTyposInExpr(LHS);
-    RHS = S.CorrectDelayedTyposInExpr(
-        RHS, /*InitDecl=*/nullptr, /*RecoverUncorrectedTypos=*/false,
-        [Opc, LHS](Expr *E) {
-          if (Opc != BO_Assign)
-            return ExprResult(E);
-          // Avoid correcting the RHS to the same Expr as the LHS.
-          Decl *D = getDeclFromExpr(E);
-          return (D && D == getDeclFromExpr(LHS.get())) ? ExprError() : E;
-        });
-  }
-  return std::make_pair(LHS, RHS);
-}
-
 /// Returns true if conversion between vectors of halfs and vectors of floats
 /// is needed.
 static bool needsConversionOfHalfVec(bool OpRequiresConversion, ASTContext &Ctx,
@@ -15146,7 +15025,6 @@ ExprResult Sema::CreateBuiltinBinOp(SourceLocation OpLoc,
   ExprObjectKind OK = OK_Ordinary;
   bool ConvertHalfVec = false;
 
-  std::tie(LHS, RHS) = CorrectDelayedTyposInBinOp(*this, Opc, LHSExpr, RHSExpr);
   if (!LHS.isUsable() || !RHS.isUsable())
     return ExprError();
 
@@ -15662,12 +15540,8 @@ static ExprResult BuildOverloadedBinOp(Sema &S, Scope *Sc, SourceLocation OpLoc,
 ExprResult Sema::BuildBinOp(Scope *S, SourceLocation OpLoc,
                             BinaryOperatorKind Opc, Expr *LHSExpr,
                             Expr *RHSExpr, bool ForFoldExpression) {
-  ExprResult LHS, RHS;
-  std::tie(LHS, RHS) = CorrectDelayedTyposInBinOp(*this, Opc, LHSExpr, RHSExpr);
-  if (!LHS.isUsable() || !RHS.isUsable())
+  if (!LHSExpr || !RHSExpr)
     return ExprError();
-  LHSExpr = LHS.get();
-  RHSExpr = RHS.get();
 
   // We want to end up calling one of SemaPseudoObject::checkAssignment
   // (if the LHS is a pseudo-object), BuildOverloadedBinOp (if
@@ -18194,8 +18068,6 @@ HandleImmediateInvocations(Sema &SemaRef,
 
 void Sema::PopExpressionEvaluationContext() {
   ExpressionEvaluationContextRecord& Rec = ExprEvalContexts.back();
-  unsigned NumTypos = Rec.NumTypos;
-
   if (!Rec.Lambdas.empty()) {
     using ExpressionKind = ExpressionEvaluationContextRecord::ExpressionKind;
     if (!getLangOpts().CPlusPlus20 &&
@@ -18263,9 +18135,6 @@ void Sema::PopExpressionEvaluationContext() {
 
   // Pop the current expression evaluation context off the stack.
   ExprEvalContexts.pop_back();
-
-  // The global expression evaluation context record is never popped.
-  ExprEvalContexts.back().NumTypos += NumTypos;
 }
 
 void Sema::DiscardCleanupsInEvaluationContext() {
@@ -20023,8 +19892,6 @@ ExprResult Sema::CheckLValueToRValueConversionOperand(Expr *E) {
 }
 
 ExprResult Sema::ActOnConstantExpression(ExprResult Res) {
-  Res = CorrectDelayedTyposInExpr(Res);
-
   if (!Res.isUsable())
     return Res;
 
@@ -21350,15 +21217,6 @@ static ExprResult diagnoseUnknownAnyExpr(Sema &S, Expr *E) {
 }
 
 ExprResult Sema::CheckPlaceholderExpr(Expr *E) {
-  if (!Context.isDependenceAllowed()) {
-    // C cannot handle TypoExpr nodes on either side of a binop because it
-    // doesn't handle dependent types properly, so make sure any TypoExprs have
-    // been dealt with before checking the operands.
-    ExprResult Result = CorrectDelayedTyposInExpr(E);
-    if (!Result.isUsable()) return ExprError();
-    E = Result.get();
-  }
-
   const BuiltinType *placeholderType = E->getType()->getAsPlaceholderType();
   if (!placeholderType) return E;
 
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index c106ea749170..c653cb56351c 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1500,13 +1500,7 @@ Sema::ActOnCXXTypeConstructExpr(ParsedType TypeRep,
 
   auto Result = BuildCXXTypeConstructExpr(TInfo, LParenOrBraceLoc, exprs,
                                           RParenOrBraceLoc, ListInitialization);
-  // Avoid creating a non-type-dependent expression that contains typos.
-  // Non-type-dependent expressions are liable to be discarded without
-  // checking for embedded typos.
-  if (!Result.isInvalid() && Result.get()->isInstantiationDependent() &&
-      !Result.get()->isTypeDependent())
-    Result = CorrectDelayedTyposInExpr(Result.get());
-  else if (Result.isInvalid())
+  if (Result.isInvalid())
     Result = CreateRecoveryExpr(TInfo->getTypeLoc().getBeginLoc(),
                                 RParenOrBraceLoc, exprs, Ty);
   return Result;
@@ -7698,372 +7692,6 @@ static ExprResult attemptRecovery(Sema &SemaRef,
                                           /*AcceptInvalidDecl*/ true);
 }
 
-namespace {
-class FindTypoExprs : public DynamicRecursiveASTVisitor {
-  llvm::SmallSetVector<TypoExpr *, 2> &TypoExprs;
-
-public:
-  explicit FindTypoExprs(llvm::SmallSetVector<TypoExpr *, 2> &TypoExprs)
-      : TypoExprs(TypoExprs) {}
-  bool VisitTypoExpr(TypoExpr *TE) override {
-    TypoExprs.insert(TE);
-    return true;
-  }
-};
-
-class TransformTypos : public TreeTransform<TransformTypos> {
-  typedef TreeTransform<TransformTypos> BaseTransform;
-
-  VarDecl *InitDecl; // A decl to avoid as a correction because it is in the
-                     // process of being initialized.
-  llvm::function_ref<ExprResult(Expr *)> ExprFilter;
-  llvm::SmallSetVector<TypoExpr *, 2> TypoExprs, AmbiguousTypoExprs;
-  llvm::SmallDenseMap<TypoExpr *, ExprResult, 2> TransformCache;
-  llvm::SmallDenseMap<OverloadExpr *, Expr *, 4> OverloadResolution;
-
-  /// Emit diagnostics for all of the TypoExprs encountered.
-  ///
-  /// If the TypoExprs were successfully corrected, then the diagnostics should
-  /// suggest the corrections. Otherwise the diagnostics will not suggest
-  /// anything (having been passed an empty TypoCorrection).
-  ///
-  /// If we've failed to correct due to ambiguous corrections, we need to
-  /// be sure to pass empty corrections and replacements. Otherwise it's
-  /// possible that the Consumer has a TypoCorrection that failed to ambiguity
-  /// and we don't want to report those diagnostics.
-  void EmitAllDiagnostics(bool IsAmbiguous) {
-    for (TypoExpr *TE : TypoExprs) {
-      auto &State = SemaRef.getTypoExprState(TE);
-      if (State.DiagHandler) {
-        TypoCorrection TC = IsAmbiguous
-            ? TypoCorrection() : State.Consumer->getCurrentCorrection();
-        ExprResult Replacement = IsAmbiguous ? ExprError() : TransformCache[TE];
-
-        // Extract the NamedDecl from the transformed TypoExpr and add it to the
-        // TypoCorrection, replacing the existing decls. This ensures the right
-        // NamedDecl is used in diagnostics e.g. in the case where overload
-        // resolution was used to select one from several possible decls that
-        // had been stored in the TypoCorrection.
-        if (auto *ND = getDeclFromExpr(
-                Replacement.isInvalid() ? nullptr : Replacement.get()))
-          TC.setCorrectionDecl(ND);
-
-        State.DiagHandler(TC);
-      }
-      SemaRef.clearDelayedTypo(TE);
-    }
-  }
-
-  /// Try to advance the typo correction state of the first unfinished TypoExpr.
-  /// We allow advancement of the correction stream by removing it from the
-  /// TransformCache which allows `TransformTypoExpr` to advance during the
-  /// next transformation attempt.
-  ///
-  /// Any substitution attempts for the previous TypoExprs (which must have been
-  /// finished) will need to be retried since it's possible that they will now
-  /// be invalid given the latest advancement.
-  ///
-  /// We need to be sure that we're making progress - it's possible that the
-  /// tree is so malformed that the transform never makes it to the
-  /// `TransformTypoExpr`.
-  ///
-  /// Returns true if there are any untried correction combinations.
-  bool CheckAndAdvanceTypoExprCorrectionStreams() {
-    for (auto *TE : TypoExprs) {
-      auto &State = SemaRef.getTypoExprState(TE);
-      TransformCache.erase(TE);
-      if (!State.Consumer->hasMadeAnyCorrectionProgress())
-        return false;
-      if (!State.Consumer->finished())
-        return true;
-      State.Consumer->resetCorrectionStream();
-    }
-    return false;
-  }
-
-  NamedDecl *getDeclFromExpr(Expr *E) {
-    if (auto *OE = dyn_cast_or_null<OverloadExpr>(E))
-      E = OverloadResolution[OE];
-
-    if (!E)
-      return nullptr;
-    if (auto *DRE = dyn_cast<DeclRefExpr>(E))
-      return DRE->getFoundDecl();
-    if (auto *ME = dyn_cast<MemberExpr>(E))
-      return ME->getFoundDecl();
-    // FIXME: Add any other expr types that could be seen by the delayed typo
-    // correction TreeTransform for which the corresponding TypoCorrection could
-    // contain multiple decls.
-    return nullptr;
-  }
-
-  ExprResult TryTransform(Expr *E) {
-    Sema::SFINAETrap Trap(SemaRef);
-    ExprResult Res = TransformExpr(E);
-    if (Trap.hasErrorOccurred() || Res.isInvalid())
-      return ExprError();
-
-    return ExprFilter(Res.get());
-  }
-
-  // Since correcting typos may intoduce new TypoExprs, this function
-  // checks for new TypoExprs and recurses if it finds any. Note that it will
-  // only succeed if it is able to correct all typos in the given expression.
-  ExprResult CheckForRecursiveTypos(ExprResult Res, bool &IsAmbiguous) {
-    if (Res.isInvalid()) {
-      return Res;
-    }
-    // Check to see if any new TypoExprs were created. If so, we need to recurse
-    // to check their validity.
-    Expr *FixedExpr = Res.get();
-
-    auto SavedTypoExprs = std::move(TypoExprs);
-    auto SavedAmbiguousTypoExprs = std::move(AmbiguousTypoExprs);
-    TypoExprs.clear();
-    AmbiguousTypoExprs.clear();
-
-    FindTypoExprs(TypoExprs).TraverseStmt(FixedExpr);
-    if (!TypoExprs.empty()) {
-      // Recurse to handle newly created TypoExprs. If we're not able to
-      // handle them, discard these TypoExprs.
-      ExprResult RecurResult =
-          RecursiveTransformLoop(FixedExpr, IsAmbiguous);
-      if (RecurResult.isInvalid()) {
-        Res = ExprError();
-        // Recursive corrections didn't work, wipe them away and don't add
-        // them to the TypoExprs set. Remove them from Sema's TypoExpr list
-        // since we don't want to clear them twice. Note: it's possible the
-        // TypoExprs were created recursively and thus won't be in our
-        // Sema's TypoExprs - they were created in our `RecursiveTransformLoop`.
-        auto &SemaTypoExprs = SemaRef.TypoExprs;
-        for (auto *TE : TypoExprs) {
-          TransformCache.erase(TE);
-          SemaRef.clearDelayedTypo(TE);
-
-          auto SI = find(SemaTypoExprs, TE);
-          if (SI != SemaTypoExprs.end()) {
-            SemaTypoExprs.erase(SI);
-          }
-        }
-      } else {
-        // TypoExpr is valid: add newly created TypoExprs since we were
-        // able to correct them.
-        Res = RecurResult;
-        SavedTypoExprs.set_union(TypoExprs);
-      }
-    }
-
-    TypoExprs = std::move(SavedTypoExprs);
-    AmbiguousTypoExprs = std::move(SavedAmbiguousTypoExprs);
-
-    return Res;
-  }
-
-  // Try to transform the given expression, looping through the correction
-  // candidates with `CheckAndAdvanceTypoExprCorrectionStreams`.
-  //
-  // If valid ambiguous typo corrections are seen, `IsAmbiguous` is set to
-  // true and this method immediately will return an `ExprError`.
-  ExprResult RecursiveTransformLoop(Expr *E, bool &IsAmbiguous) {
-    ExprResult Res;
-    auto SavedTypoExprs = std::move(SemaRef.TypoExprs);
-    SemaRef.TypoExprs.clear();
-
-    while (true) {
-      Res = CheckForRecursiveTypos(TryTransform(E), IsAmbiguous);
-
-      // Recursion encountered an ambiguous correction. This means that our
-      // correction itself is ambiguous, so stop now.
-      if (IsAmbiguous)
-        break;
-
-      // If the transform is still valid after checking for any new typos,
-      // it's good to go.
-      if (!Res.isInvalid())
-        break;
-
-      // The transform was invalid, see if we have any TypoExprs with untried
-      // correction candidates.
-      if (!CheckAndAdvanceTypoExprCorrectionStreams())
-        break;
-    }
-
-    // If we found a valid result, double check to make sure it's not ambiguous.
-    if (!IsAmbiguous && !Res.isInvalid() && !AmbiguousTypoExprs.empty()) {
-      auto SavedTransformCache =
-          llvm::SmallDenseMap<TypoExpr *, ExprResult, 2>(TransformCache);
-
-      // Ensure none of the TypoExprs have multiple typo correction candidates
-      // with the same edit length that pass all the checks and filters.
-      while (!AmbiguousTypoExprs.empty()) {
-        auto TE  = AmbiguousTypoExprs.back();
-
-        // TryTransform itself can create new Typos, adding them to the TypoExpr map
-        // and invalidating our TypoExprState, so always fetch it instead of storing.
-        SemaRef.getTypoExprState(TE).Consumer->saveCurrentPosition();
-
-        TypoCorrection TC = SemaRef.getTypoExprState(TE).Consumer->peekNextCorrection();
-        TypoCorrection Next;
-        do {
-          // Fetch the next correction by erasing the typo from the cache and calling
-          // `TryTransform` which will iterate through corrections in
-          // `TransformTypoExpr`.
-          TransformCache.erase(TE);
-          ExprResult AmbigRes = CheckForRecursiveTypos(TryTransform(E), IsAmbiguous);
-
-          if (!AmbigRes.isInvalid() || IsAmbiguous) {
-            SemaRef.getTypoExprState(TE).Consumer->resetCorrectionStream();
-            SavedTransformCache.erase(TE);
-            Res = ExprError();
-            IsAmbiguous = true;
-            break;
-          }
-        } while ((Next = SemaRef.getTypoExprState(TE).Consumer->peekNextCorrection()) &&
-                 Next.getEditDistance(false) == TC.getEditDistance(false));
-
-        if (IsAmbiguous)
-          break;
-
-        AmbiguousTypoExprs.remove(TE);
-        SemaRef.getTypoExprState(TE).Consumer->restoreSavedPosition();
-        TransformCache[TE] = SavedTransformCache[TE];
-      }
-      TransformCache = std::move(SavedTransformCache);
-    }
-
-    // Wipe away any newly created TypoExprs that we don't know about. Since we
-    // clear any invalid TypoExprs in `CheckForRecursiveTypos`, this is only
-    // possible if a `TypoExpr` is created during a transformation but then
-    // fails before we can discover it.
-    auto &SemaTypoExprs = SemaRef.TypoExprs;
-    for (auto Iterator = SemaTypoExprs.begin(); Iterator != SemaTypoExprs.end();) {
-      auto TE = *Iterator;
-      auto FI = find(TypoExprs, TE);
-      if (FI != TypoExprs.end()) {
-        Iterator++;
-        continue;
-      }
-      SemaRef.clearDelayedTypo(TE);
-      Iterator = SemaTypoExprs.erase(Iterator);
-    }
-    SemaRef.TypoExprs = std::move(SavedTypoExprs);
-
-    return Res;
-  }
-
-public:
-  TransformTypos(Sema &SemaRef, VarDecl *InitDecl, llvm::function_ref<ExprResult(Expr *)> Filter)
-      : BaseTransform(SemaRef), InitDecl(InitDecl), ExprFilter(Filter) {}
-
-  ExprResult RebuildCallExpr(Expr *Callee, SourceLocation LParenLoc,
-                                   MultiExprArg Args,
-                                   SourceLocation RParenLoc,
-                                   Expr *ExecConfig = nullptr) {
-    auto Result = BaseTransform::RebuildCallExpr(Callee, LParenLoc, Args,
-                                                 RParenLoc, ExecConfig);
-    if (auto *OE = dyn_cast<OverloadExpr>(Callee)) {
-      if (Result.isUsable()) {
-        Expr *ResultCall = Result.get();
-        if (auto *BE = dyn_cast<CXXBindTemporaryExpr>(ResultCall))
-          ResultCall = BE->getSubExpr();
-        if (auto *CE = dyn_cast<CallExpr>(ResultCall))
-          OverloadResolution[OE] = CE->getCallee();
-      }
-    }
-    return Result;
-  }
-
-  ExprResult TransformLambdaExpr(LambdaExpr *E) { return Owned(E); }
-
-  ExprResult TransformBlockExpr(BlockExpr *E) { return Owned(E); }
-
-  ExprResult Transform(Expr *E) {
-    bool IsAmbiguous = false;
-    ExprResult Res = RecursiveTransformLoop(E, IsAmbiguous);
-
-    if (!Res.isUsable())
-      FindTypoExprs(TypoExprs).TraverseStmt(E);
-
-    EmitAllDiagnostics(IsAmbiguous);
-
-    return Res;
-  }
-
-  ExprResult TransformTypoExpr(TypoExpr *E) {
-    // If the TypoExpr hasn't been seen before, record it. Otherwise, return the
-    // cached transformation result if there is one and the TypoExpr isn't the
-    // first one that was encountered.
-    auto &CacheEntry = TransformCache[E];
-    if (!TypoExprs.insert(E) && !CacheEntry.isUnset()) {
-      return CacheEntry;
-    }
-
-    auto &State = SemaRef.getTypoExprState(E);
-    assert(State.Consumer && "Cannot transform a cleared TypoExpr");
-
-    // For the first TypoExpr and an uncached TypoExpr, find the next likely
-    // typo correction and return it.
-    while (TypoCorrection TC = State.Consumer->getNextCorrection()) {
-      if (InitDecl && TC.getFoundDecl() == InitDecl)
-        continue;
-      // FIXME: If we would typo-correct to an invalid declaration, it's
-      // probably best to just suppress all errors from this typo correction.
-      ExprResult NE = State.RecoveryHandler ?
-          State.RecoveryHandler(SemaRef, E, TC) :
-          attemptRecovery(SemaRef, *State.Consumer, TC);
-      if (!NE.isInvalid()) {
-        // Check whether there may be a second viable correction with the same
-        // edit distance; if so, remember this TypoExpr may have an ambiguous
-        // correction so it can be more thoroughly vetted later.
-        TypoCorrection Next;
-        if ((Next = State.Consumer->peekNextCorrection()) &&
-            Next.getEditDistance(false) == TC.getEditDistance(false)) {
-          AmbiguousTypoExprs.insert(E);
-        } else {
-          AmbiguousTypoExprs.remove(E);
-        }
-        assert(!NE.isUnset() &&
-               "Typo was transformed into a valid-but-null ExprResult");
-        return CacheEntry = NE;
-      }
-    }
-    return CacheEntry = ExprError();
-  }
-};
-}
-
-ExprResult
-Sema::CorrectDelayedTyposInExpr(Expr *E, VarDecl *InitDecl,
-                                bool RecoverUncorrectedTypos,
-                                llvm::function_ref<ExprResult(Expr *)> Filter) {
-  // If the current evaluation context indicates there are uncorrected typos
-  // and the current expression isn't guaranteed to not have typos, try to
-  // resolve any TypoExpr nodes that might be in the expression.
-  if (E && !ExprEvalContexts.empty() && ExprEvalContexts.back().NumTypos &&
-      (E->isTypeDependent() || E->isValueDependent() ||
-       E->isInstantiationDependent())) {
-    auto TyposResolved = DelayedTypos.size();
-    auto Result = TransformTypos(*this, InitDecl, Filter).Transform(E);
-    TyposResolved -= DelayedTypos.size();
-    if (Result.isInvalid() || Result.get() != E) {
-      ExprEvalContexts.back().NumTypos -= TyposResolved;
-      if (Result.isInvalid() && RecoverUncorrectedTypos) {
-        struct TyposReplace : TreeTransform<TyposReplace> {
-          TyposReplace(Sema &SemaRef) : TreeTransform(SemaRef) {}
-          ExprResult TransformTypoExpr(clang::TypoExpr *E) {
-            return this->SemaRef.CreateRecoveryExpr(E->getBeginLoc(),
-                                                    E->getEndLoc(), {});
-          }
-        } TT(*this);
-        return TT.TransformExpr(E);
-      }
-      return Result;
-    }
-    assert(TyposResolved == 0 && "Corrected typo but got same Expr back?");
-  }
-  return E;
-}
-
 ExprResult Sema::ActOnFinishFullExpr(Expr *FE, SourceLocation CC,
                                      bool DiscardedValue, bool IsConstexpr,
                                      bool IsTemplateArgument) {
@@ -8095,8 +7723,6 @@ ExprResult Sema::ActOnFinishFullExpr(Expr *FE, SourceLocation CC,
     DiagnoseUnusedExprResult(FullExpr.get(), diag::warn_unused_expr);
   }
 
-  FullExpr = CorrectDelayedTyposInExpr(FullExpr.get(), /*InitDecl=*/nullptr,
-                                       /*RecoverUncorrectedTypos=*/true);
   if (FullExpr.isInvalid())
     return ExprError();
 
diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp
index 39c162c3b835..5dca509d46fd 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -650,64 +650,11 @@ bool Sema::CheckQualifiedMemberReference(Expr *BaseExpr,
   return true;
 }
 
-namespace {
-
-// Callback to only accept typo corrections that are either a ValueDecl or a
-// FunctionTemplateDecl and are declared in the current record or, for a C++
-// classes, one of its base classes.
-class RecordMemberExprValidatorCCC final : public CorrectionCandidateCallback {
-public:
-  explicit RecordMemberExprValidatorCCC(QualType RTy)
-      : Record(RTy->getAsRecordDecl()) {
-    // Don't add bare keywords to the consumer since they will always fail
-    // validation by virtue of not being associated with any decls.
-    WantTypeSpecifiers = false;
-    WantExpressionKeywords = false;
-    WantCXXNamedCasts = false;
-    WantFunctionLikeCasts = false;
-    WantRemainingKeywords = false;
-  }
-
-  bool ValidateCandidate(const TypoCorrection &candidate) override {
-    NamedDecl *ND = candidate.getCorrectionDecl();
-    // Don't accept candidates that cannot be member functions, constants,
-    // variables, or templates.
-    if (!ND || !(isa<ValueDecl>(ND) || isa<FunctionTemplateDecl>(ND)))
-      return false;
-
-    // Accept candidates that occur in the current record.
-    if (Record->containsDecl(ND))
-      return true;
-
-    if (const auto *RD = dyn_cast<CXXRecordDecl>(Record)) {
-      // Accept candidates that occur in any of the current class' base classes.
-      for (const auto &BS : RD->bases()) {
-        if (const auto *BSTy = BS.getType()->getAs<RecordType>()) {
-          if (BSTy->getDecl()->containsDecl(ND))
-            return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
-  std::unique_ptr<CorrectionCandidateCallback> clone() override {
-    return std::make_unique<RecordMemberExprValidatorCCC>(*this);
-  }
-
-private:
-  const RecordDecl *const Record;
-};
-
-}
-
 static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R,
                                      Expr *BaseExpr, QualType RTy,
                                      SourceLocation OpLoc, bool IsArrow,
                                      CXXScopeSpec &SS, bool HasTemplateArgs,
-                                     SourceLocation TemplateKWLoc,
-                                     TypoExpr *&TE) {
+                                     SourceLocation TemplateKWLoc) {
   SourceRange BaseRange = BaseExpr ? BaseExpr->getSourceRange() : SourceRange();
   if (!RTy->isDependentType() &&
       !SemaRef.isThisOutsideMemberFunctionBody(RTy) &&
@@ -724,56 +671,6 @@ static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R,
                                       /*EnteringContext=*/false, TemplateKWLoc);
 
   SemaRef.LookupParsedName(R, /*S=*/nullptr, &SS, ObjectType);
-
-  if (!R.empty() || R.wasNotFoundInCurrentInstantiation())
-    return false;
-
-  DeclarationName Typo = R.getLookupName();
-  SourceLocation TypoLoc = R.getNameLoc();
-  // Recompute the lookup context.
-  DeclContext *DC = SS.isSet() ? SemaRef.computeDeclContext(SS)
-                               : SemaRef.computeDeclContext(RTy);
-
-  struct QueryState {
-    Sema &SemaRef;
-    DeclarationNameInfo NameInfo;
-    Sema::LookupNameKind LookupKind;
-    RedeclarationKind Redecl;
-  };
-  QueryState Q = {R.getSema(), R.getLookupNameInfo(), R.getLookupKind(),
-                  R.redeclarationKind()};
-  RecordMemberExprValidatorCCC CCC(RTy);
-  TE = SemaRef.CorrectTypoDelayed(
-      R.getLookupNameInfo(), R.getLookupKind(), nullptr, &SS, CCC,
-      [=, &SemaRef](const TypoCorrection &TC) {
-        if (TC) {
-          assert(!TC.isKeyword() &&
-                 "Got a keyword as a correction for a member!");
-          bool DroppedSpecifier =
-              TC.WillReplaceSpecifier() &&
-              Typo.getAsString() == TC.getAsString(SemaRef.getLangOpts());
-          SemaRef.diagnoseTypo(TC, SemaRef.PDiag(diag::err_no_member_suggest)
-                                       << Typo << DC << DroppedSpecifier
-                                       << SS.getRange());
-        } else {
-          SemaRef.Diag(TypoLoc, diag::err_no_member)
-              << Typo << DC << (SS.isSet() ? SS.getRange() : BaseRange);
-        }
-      },
-      [=](Sema &SemaRef, TypoExpr *TE, TypoCorrection TC) mutable {
-        LookupResult R(Q.SemaRef, Q.NameInfo, Q.LookupKind, Q.Redecl);
-        R.clear(); // Ensure there's no decls lingering in the shared state.
-        R.suppressDiagnostics();
-        R.setLookupName(TC.getCorrection());
-        for (NamedDecl *ND : TC)
-          R.addDecl(ND);
-        R.resolveKind();
-        return SemaRef.BuildMemberReferenceExpr(
-            BaseExpr, BaseExpr->getType(), OpLoc, IsArrow, SS, SourceLocation(),
-            nullptr, R, nullptr, nullptr);
-      },
-      CorrectTypoKind::ErrorRecovery, DC);
-
   return false;
 }
 
@@ -793,15 +690,11 @@ ExprResult Sema::BuildMemberReferenceExpr(
 
   // Implicit member accesses.
   if (!Base) {
-    TypoExpr *TE = nullptr;
     QualType RecordTy = BaseType;
     if (IsArrow) RecordTy = RecordTy->castAs<PointerType>()->getPointeeType();
     if (LookupMemberExprInRecord(*this, R, nullptr, RecordTy, OpLoc, IsArrow,
-                                 SS, TemplateArgs != nullptr, TemplateKWLoc,
-                                 TE))
+                                 SS, TemplateArgs != nullptr, TemplateKWLoc))
       return ExprError();
-    if (TE)
-      return TE;
 
   // Explicit member accesses.
   } else {
@@ -1396,16 +1289,15 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
 
   // Handle field access to simple records.
   if (BaseType->getAsRecordDecl()) {
-    TypoExpr *TE = nullptr;
     if (LookupMemberExprInRecord(S, R, BaseExpr.get(), BaseType, OpLoc, IsArrow,
-                                 SS, HasTemplateArgs, TemplateKWLoc, TE))
+                                 SS, HasTemplateArgs, TemplateKWLoc))
       return ExprError();
 
     // Returning valid-but-null is how we indicate to the caller that
     // the lookup result was filled in. If typo correction was attempted and
     // failed, the lookup result will have been cleared--that combined with the
     // valid-but-null ExprResult will trigger the appropriate diagnostics.
-    return ExprResult(TE);
+    return ExprResult{};
   } else if (BaseType->isDependentType()) {
     R.setNotFoundInCurrentInstantiation();
     return ExprEmpty();
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index 91822909f1fd..5ad9dd8ed0d3 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -5444,40 +5444,6 @@ TypoCorrection Sema::CorrectTypo(const DeclarationNameInfo &TypoName,
   return FailedCorrection(Typo, TypoName.getLoc(), RecordFailure && !SecondBestTC);
 }
 
-TypoExpr *Sema::CorrectTypoDelayed(
-    const DeclarationNameInfo &TypoName, Sema::LookupNameKind LookupKind,
-    Scope *S, CXXScopeSpec *SS, CorrectionCandidateCallback &CCC,
-    TypoDiagnosticGenerator TDG, TypoRecoveryCallback TRC, CorrectTypoKind Mode,
-    DeclContext *MemberContext, bool EnteringContext,
-    const ObjCObjectPointerType *OPT) {
-  auto Consumer = makeTypoCorrectionConsumer(
-      TypoName, LookupKind, S, SS, CCC, MemberContext, EnteringContext, OPT,
-      Mode == CorrectTypoKind::ErrorRecovery);
-
-  // Give the external sema source a chance to correct the typo.
-  TypoCorrection ExternalTypo;
-  if (ExternalSource && Consumer) {
-    ExternalTypo = ExternalSource->CorrectTypo(
-        TypoName, LookupKind, S, SS, *Consumer->getCorrectionValidator(),
-        MemberContext, EnteringContext, OPT);
-    if (ExternalTypo)
-      Consumer->addCorrection(ExternalTypo);
-  }
-
-  if (!Consumer || Consumer->empty())
-    return nullptr;
-
-  // Make sure the best edit distance (prior to adding any namespace qualifiers)
-  // is not more that about a third of the length of the typo's identifier.
-  unsigned ED = Consumer->getBestEditDistance(true);
-  IdentifierInfo *Typo = TypoName.getName().getAsIdentifierInfo();
-  if (!ExternalTypo && ED > 0 && Typo->getName().size() / ED < 3)
-    return nullptr;
-  ExprEvalContexts.back().NumTypos++;
-  return createDelayedTypo(std::move(Consumer), std::move(TDG), std::move(TRC),
-                           TypoName.getLoc());
-}
-
 void TypoCorrection::addCorrectionDecl(NamedDecl *CDecl) {
   if (!CDecl) return;
 
@@ -5802,32 +5768,6 @@ void Sema::diagnoseTypo(const TypoCorrection &Correction,
     Diag(Correction.getCorrectionRange().getBegin(), PD);
 }
 
-TypoExpr *Sema::createDelayedTypo(std::unique_ptr<TypoCorrectionConsumer> TCC,
-                                  TypoDiagnosticGenerator TDG,
-                                  TypoRecoveryCallback TRC,
-                                  SourceLocation TypoLoc) {
-  assert(TCC && "createDelayedTypo requires a valid TypoCorrectionConsumer");
-  auto TE = new (Context) TypoExpr(Context.DependentTy, TypoLoc);
-  auto &State = DelayedTypos[TE];
-  State.Consumer = std::move(TCC);
-  State.DiagHandler = std::move(TDG);
-  State.RecoveryHandler = std::move(TRC);
-  if (TE)
-    TypoExprs.push_back(TE);
-  return TE;
-}
-
-const Sema::TypoExprState &Sema::getTypoExprState(TypoExpr *TE) const {
-  auto Entry = DelayedTypos.find(TE);
-  assert(Entry != DelayedTypos.end() &&
-         "Failed to get the state for a TypoExpr!");
-  return Entry->second;
-}
-
-void Sema::clearDelayedTypo(TypoExpr *TE) {
-  DelayedTypos.erase(TE);
-}
-
 void Sema::ActOnPragmaDump(Scope *S, SourceLocation IILoc, IdentifierInfo *II) {
   DeclarationNameInfo Name(II, IILoc);
   LookupResult R(*this, Name, LookupAnyName,
diff --git a/clang/lib/Sema/SemaObjC.cpp b/clang/lib/Sema/SemaObjC.cpp
index 56815cd2731a..0f39a9817ce7 100644
--- a/clang/lib/Sema/SemaObjC.cpp
+++ b/clang/lib/Sema/SemaObjC.cpp
@@ -124,17 +124,12 @@ ExprResult SemaObjC::CheckObjCForCollectionOperand(SourceLocation forLoc,
   if (!collection)
     return ExprError();
 
-  ExprResult result = SemaRef.CorrectDelayedTyposInExpr(collection);
-  if (!result.isUsable())
-    return ExprError();
-  collection = result.get();
-
   // Bail out early if we've got a type-dependent expression.
   if (collection->isTypeDependent())
     return collection;
 
   // Perform normal l-value conversion.
-  result = SemaRef.DefaultFunctionArrayLvalueConversion(collection);
+  ExprResult result = SemaRef.DefaultFunctionArrayLvalueConversion(collection);
   if (result.isInvalid())
     return ExprError();
   collection = result.get();
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 89e86f49a3ca..49e5a311e239 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -14055,8 +14055,10 @@ FunctionDecl *Sema::ResolveSingleFunctionTemplateSpecialization(
     //   specified and it, along with any default template arguments,
     //   identifies a single function template specialization, then the
     //   template-id is an lvalue for the function template specialization.
-    FunctionTemplateDecl *FunctionTemplate
-      = cast<FunctionTemplateDecl>((*I)->getUnderlyingDecl());
+    FunctionTemplateDecl *FunctionTemplate =
+        dyn_cast<FunctionTemplateDecl>((*I)->getUnderlyingDecl());
+    if (!FunctionTemplate)
+      continue;
 
     // C++ [over.over]p2:
     //   If the name is a function template, template argument deduction is
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 50f5757dff5b..923a9e81fbd6 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -535,12 +535,7 @@ Sema::ActOnCaseExpr(SourceLocation CaseLoc, ExprResult Val) {
     return ER;
   };
 
-  ExprResult Converted = CorrectDelayedTyposInExpr(
-      Val, /*InitDecl=*/nullptr, /*RecoverUncorrectedTypos=*/false,
-      CheckAndFinish);
-  if (Converted.get() == Val.get())
-    Converted = CheckAndFinish(Val.get());
-  return Converted;
+  return CheckAndFinish(Val.get());
 }
 
 StmtResult
@@ -2344,7 +2339,7 @@ StmtResult Sema::ActOnForEachLValueExpr(Expr *E) {
 static bool FinishForRangeVarDecl(Sema &SemaRef, VarDecl *Decl, Expr *Init,
                                   SourceLocation Loc, int DiagID) {
   if (Decl->getType()->isUndeducedType()) {
-    ExprResult Res = SemaRef.CorrectDelayedTyposInExpr(Init);
+    ExprResult Res = Init;
     if (!Res.isUsable()) {
       Decl->setInvalidDecl();
       return true;
@@ -3845,10 +3840,7 @@ bool Sema::DeduceFunctionTypeFromReturnExpr(FunctionDecl *FD,
 StmtResult
 Sema::ActOnReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp,
                       Scope *CurScope) {
-  // Correct typos, in case the containing function returns 'auto' and
-  // RetValExp should determine the deduced type.
-  ExprResult RetVal = CorrectDelayedTyposInExpr(
-      RetValExp, nullptr, /*RecoverUncorrectedTypos=*/true);
+  ExprResult RetVal = RetValExp;
   if (RetVal.isInvalid())
     return StmtError();
 
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 17da5fd8325b..b78080c99176 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -782,11 +782,10 @@ ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A,
 ExprResult Sema::BuildCXXAssumeExpr(Expr *Assumption,
                                     const IdentifierInfo *AttrName,
                                     SourceRange Range) {
-  ExprResult Res = CorrectDelayedTyposInExpr(Assumption);
-  if (Res.isInvalid())
+  if (!Assumption)
     return ExprError();
 
-  Res = CheckPlaceholderExpr(Res.get());
+  ExprResult Res = CheckPlaceholderExpr(Assumption);
   if (Res.isInvalid())
     return ExprError();
 
diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp
index 5f0e968ff18c..572dbf2e7393 100644
--- a/clang/lib/Sema/SemaTemplateVariadic.cpp
+++ b/clang/lib/Sema/SemaTemplateVariadic.cpp
@@ -741,7 +741,6 @@ ExprResult Sema::CheckPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc,
   if (!Pattern->containsUnexpandedParameterPack()) {
     Diag(EllipsisLoc, diag::err_pack_expansion_without_parameter_packs)
     << Pattern->getSourceRange();
-    CorrectDelayedTyposInExpr(Pattern);
     return ExprError();
   }
 
@@ -1201,11 +1200,9 @@ ExprResult Sema::ActOnPackIndexingExpr(Scope *S, Expr *PackExpression,
                                        SourceLocation RSquareLoc) {
   bool isParameterPack = ::isParameterPack(PackExpression);
   if (!isParameterPack) {
-    if (!PackExpression->containsErrors()) {
-      CorrectDelayedTyposInExpr(IndexExpr);
+    if (!PackExpression->containsErrors())
       Diag(PackExpression->getBeginLoc(), diag::err_expected_name_of_pack)
           << PackExpression;
-    }
     return ExprError();
   }
   ExprResult Res =
@@ -1403,11 +1400,6 @@ ExprResult Sema::ActOnCXXFoldExpr(Scope *S, SourceLocation LParenLoc, Expr *LHS,
   CheckFoldOperand(*this, LHS);
   CheckFoldOperand(*this, RHS);
 
-  auto DiscardOperands = [&] {
-    CorrectDelayedTyposInExpr(LHS);
-    CorrectDelayedTyposInExpr(RHS);
-  };
-
   // [expr.prim.fold]p3:
   //   In a binary fold, op1 and op2 shall be the same fold-operator, and
   //   either e1 shall contain an unexpanded parameter pack or e2 shall contain
@@ -1415,7 +1407,6 @@ ExprResult Sema::ActOnCXXFoldExpr(Scope *S, SourceLocation LParenLoc, Expr *LHS,
   if (LHS && RHS &&
       LHS->containsUnexpandedParameterPack() ==
           RHS->containsUnexpandedParameterPack()) {
-    DiscardOperands();
     return Diag(EllipsisLoc,
                 LHS->containsUnexpandedParameterPack()
                     ? diag::err_fold_expression_packs_both_sides
@@ -1430,7 +1421,6 @@ ExprResult Sema::ActOnCXXFoldExpr(Scope *S, SourceLocation LParenLoc, Expr *LHS,
     Expr *Pack = LHS ? LHS : RHS;
     assert(Pack && "fold expression with neither LHS nor RHS");
     if (!Pack->containsUnexpandedParameterPack()) {
-      DiscardOperands();
       return Diag(EllipsisLoc, diag::err_pack_expansion_without_parameter_packs)
              << Pack->getSourceRange();
     }
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index c8d29f0a625f..3e33fb73e01b 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -13121,12 +13121,6 @@ TreeTransform<Derived>::TransformOpaqueValueExpr(OpaqueValueExpr *E) {
   return E;
 }
 
-template<typename Derived>
-ExprResult
-TreeTransform<Derived>::TransformTypoExpr(TypoExpr *E) {
-  return E;
-}
-
 template <typename Derived>
 ExprResult TreeTransform<Derived>::TransformRecoveryExpr(RecoveryExpr *E) {
   llvm::SmallVector<Expr *, 8> Children;
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 01c838b95575..65102b64030c 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2310,10 +2310,6 @@ void ASTStmtReader::VisitOpaqueValueExpr(OpaqueValueExpr *E) {
   E->setIsUnique(Record.readInt());
 }
 
-void ASTStmtReader::VisitTypoExpr(TypoExpr *E) {
-  llvm_unreachable("Cannot read TypoExpr nodes");
-}
-
 void ASTStmtReader::VisitRecoveryExpr(RecoveryExpr *E) {
   VisitExpr(E);
   unsigned NumArgs = Record.readInt();
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 767e7405752c..a6e320c7f3eb 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2314,12 +2314,6 @@ void ASTStmtWriter::VisitOpaqueValueExpr(OpaqueValueExpr *E) {
   Code = serialization::EXPR_OPAQUE_VALUE;
 }
 
-void ASTStmtWriter::VisitTypoExpr(TypoExpr *E) {
-  VisitExpr(E);
-  // TODO: Figure out sane writer behavior for a TypoExpr, if necessary
-  llvm_unreachable("Cannot write TypoExpr nodes");
-}
-
 //===----------------------------------------------------------------------===//
 // CUDA Expressions and Statements.
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index b28deee41d1c..c77ef26da568 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1732,7 +1732,6 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
     case Stmt::ExpressionTraitExprClass:
     case Stmt::UnresolvedLookupExprClass:
     case Stmt::UnresolvedMemberExprClass:
-    case Stmt::TypoExprClass:
     case Stmt::RecoveryExprClass:
     case Stmt::CXXNoexceptExprClass:
     case Stmt::PackExpansionExprClass:
diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp
index 2fa7b69b9347..699746c0b2c4 100644
--- a/clang/test/AST/ByteCode/literals.cpp
+++ b/clang/test/AST/ByteCode/literals.cpp
@@ -910,7 +910,8 @@ namespace CompoundLiterals {
   constexpr int f2(int *x =(int[]){1,2,3}) {
     return x[0];
   }
-  constexpr int g = f2(); // Should evaluate to 1?
+  // Should evaluate to 1?
+  constexpr int g = f2(); // #g_decl
   static_assert(g == 1, "");
 
   // This example should be rejected because the lifetime of the compound
@@ -1347,7 +1348,10 @@ namespace NTTP {
 namespace UnaryOpError {
   constexpr int foo() {
     int f = 0;
-    ++g; // both-error {{use of undeclared identifier 'g'}}
+    ++g; // both-error {{use of undeclared identifier 'g'}} \
+            both-error {{cannot assign to variable 'g' with const-qualified type 'const int'}} \
+            both-note@#g_decl {{'CompoundLiterals::g' declared here}} \
+            both-note@#g_decl {{variable 'g' declared const here}}
     return f;
   }
 }
diff --git a/clang/test/AST/ast-dump-recovery.c b/clang/test/AST/ast-dump-recovery.c
index 68d3f182dd9f..09a03fb9d6fd 100644
--- a/clang/test/AST/ast-dump-recovery.c
+++ b/clang/test/AST/ast-dump-recovery.c
@@ -23,13 +23,6 @@ int postfix_inc = a++;
 // CHECK-NEXT:      `-IntegerLiteral {{.*}} 'int'
 int unary_address = &(a + 1);
 
-// CHECK:       VarDecl {{.*}} ternary 'int' cinit
-// CHECK-NEXT:  `-ConditionalOperator {{.*}}
-// CHECK-NEXT:    |-DeclRefExpr {{.*}} 'a'
-// CHECK-NEXT:    |-RecoveryExpr {{.*}}
-// CHECK-NEXT:    `-DeclRefExpr {{.*}} 'a'
-int ternary = a ? undef : a;
-
 void test1() {
   // CHECK:     `-RecoveryExpr {{.*}} contains-errors
   // CHECK-NEXT:  `-DeclRefExpr {{.*}} 'a' 'const int'
@@ -91,12 +84,6 @@ void test3() {
   // CHECK-NEXT: |   `-DeclRefExpr {{.*}} '__builtin_classify_type'
   // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1
   (*__builtin_classify_type)(1);
-
-  extern void ext();
-  // CHECK:     CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: |-DeclRefExpr {{.*}} 'ext'
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>'
-  ext(undef_var);
 }
 
 // Verify no crash.
@@ -110,23 +97,6 @@ void test4() {
   };
 }
 
-// Verify no crash
-void test5_GH62711() {
-  // CHECK:      VAArgExpr {{.*}} 'int' contains-errors
-  // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  if (__builtin_va_arg(undef, int) << 1);
-}
-
-void test6_GH50244() {
-  double array[16];
-  // CHECK:      UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' contains-errors sizeof
-  // CHECK-NEXT: `-CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-DeclRefExpr {{.*}} 'int ()'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} '<dependent type>'
-  sizeof array / sizeof foo(undef);
-}
-
 // No crash on DeclRefExpr that refers to ValueDecl with invalid initializers.
 void test7() {
   int b[] = {""()};
diff --git a/clang/test/AST/ast-dump-recovery.cpp b/clang/test/AST/ast-dump-recovery.cpp
index b8195950f2fa..a8e30f1759e9 100644
--- a/clang/test/AST/ast-dump-recovery.cpp
+++ b/clang/test/AST/ast-dump-recovery.cpp
@@ -9,28 +9,6 @@ int some_func(int *);
 // CHECK-NEXT:    `-IntegerLiteral {{.*}} 123
 // DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors
 int invalid_call = some_func(123);
-void test_invalid_call_1(int s) {
-  // CHECK:      CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: |-UnresolvedLookupExpr {{.*}} 'some_func'
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} <col:13>
-  // CHECK-NEXT: `-BinaryOperator {{.*}}
-  // CHECK-NEXT:   |-RecoveryExpr {{.*}}
-  // CHECK-NEXT:   `-IntegerLiteral {{.*}} <col:28> 'int' 1
-  some_func(undef1, undef2+1);
-
-  // CHECK:      BinaryOperator {{.*}} '<dependent type>' contains-errors '='
-  // CHECK-NEXT: |-DeclRefExpr {{.*}} 's'
-  // CHECK-NEXT: `-CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-UnresolvedLookupExpr {{.*}} 'some_func'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-  s = some_func(undef1);
-
-  // CHECK:     VarDecl {{.*}} var 'int'
-  // CHECK-NEXT: `-CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-UnresolvedLookupExpr {{.*}} 'some_func'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-  int var = some_func(undef1);
-}
 
 int some_func2(int a, int b);
 void test_invalid_call_2() {
@@ -63,22 +41,6 @@ int ambig_func(float);
 // DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors
 int ambig_call = ambig_func(123);
 
-// CHECK:     VarDecl {{.*}} unresolved_call1
-// CHECK-NEXT:`-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-// CHECK-NEXT:  `-UnresolvedLookupExpr {{.*}} 'bar'
-// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors
-int unresolved_call1 = bar();
-
-// CHECK:     VarDecl {{.*}} unresolved_call2
-// CHECK-NEXT:`-CallExpr {{.*}} contains-errors
-// CHECK-NEXT:  |-UnresolvedLookupExpr {{.*}} 'bar'
-// CHECK-NEXT:  |-RecoveryExpr {{.*}} contains-errors
-// CHECK-NEXT:  | `-UnresolvedLookupExpr {{.*}} 'baz'
-// CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-// CHECK-NEXT:     `-UnresolvedLookupExpr {{.*}} 'qux'
-// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors
-int unresolved_call2 = bar(baz(), qux());
-
 constexpr int a = 10;
 
 // CHECK:     VarDecl {{.*}} postfix_inc
@@ -177,11 +139,6 @@ void test2(Foo2 f) {
   f.overload(1);
 }
 
-// CHECK:     |-AlignedAttr {{.*}} alignas
-// CHECK-NEXT:| `-RecoveryExpr {{.*}} contains-errors
-// CHECK-NEXT:|   `-UnresolvedLookupExpr {{.*}} 'invalid'
-struct alignas(invalid()) Aligned {};
-
 auto f();
 int f(double);
 // CHECK:      VarDecl {{.*}} unknown_type_call 'int'
@@ -203,16 +160,6 @@ void InvalidInitalizer(int x) {
   // CHECK-NEXT:  `-InitListExpr
   // CHECK-NEDT:   `-DeclRefExpr {{.*}} 'x'
   Bar a3{x};
-  // CHECK:     `-VarDecl {{.*}} a4 'Bar'
-  // CHECK-NEXT: `-ParenListExpr {{.*}} 'NULL TYPE' contains-errors
-  // CHECK-NEXT:  `-RecoveryExpr {{.*}} contains-errors
-  // CHECK-NEXT:   `-UnresolvedLookupExpr {{.*}} 'invalid'
-  Bar a4(invalid());
-  // CHECK:     `-VarDecl {{.*}} a5 'Bar'
-  // CHECK-NEXT: `-InitListExpr {{.*}} contains-errors
-  // CHECK-NEXT:  `-RecoveryExpr {{.*}} contains-errors
-  // CHECK-NEXT:   `-UnresolvedLookupExpr {{.*}} 'invalid'
-  Bar a5{invalid()};
 
   // CHECK:     `-VarDecl {{.*}} b1 'Bar'
   // CHECK-NEXT: `-RecoveryExpr {{.*}} contains-errors
@@ -231,51 +178,11 @@ void InvalidInitalizer(int x) {
   // CHECK-NEXT:    `-InitListExpr {{.*}} 'void'
   // CHECK-NEXT:      `-DeclRefExpr {{.*}} 'x' 'int'
   Bar b4 = Bar{x};
-  // CHECK:     `-VarDecl {{.*}} b5 'Bar'
-  // CHECK-NEXT: `-CXXUnresolvedConstructExpr {{.*}} 'Bar' contains-errors 'Bar'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-  // CHECK-NEXT:     `-UnresolvedLookupExpr {{.*}} 'invalid'
-  Bar b5 = Bar(invalid());
-  // CHECK:     `-VarDecl {{.*}} b6 'Bar'
-  // CHECK-NEXT: `-CXXUnresolvedConstructExpr {{.*}} 'Bar' contains-errors 'Bar'
-  // CHECK-NEXT:  `-InitListExpr {{.*}} contains-errors
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-  // CHECK-NEXT:     `-UnresolvedLookupExpr {{.*}} 'invalid'
-  Bar b6 = Bar{invalid()};
 
   // CHECK:     RecoveryExpr {{.*}} 'Bar' contains-errors
   // CHECK-NEXT:  `-IntegerLiteral {{.*}} 'int' 1
   Bar(1);
-
-  // CHECK:     `-VarDecl {{.*}} var1
-  // CHECK-NEXT: `-BinaryOperator {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   `-IntegerLiteral {{.*}} 'int' 1
-  int var1 = undef + 1;
 }
-void InitializerForAuto() {
-  // CHECK:     `-VarDecl {{.*}} invalid a 'auto'
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   `-UnresolvedLookupExpr {{.*}} 'invalid'
-  auto a = invalid();
-
-  // CHECK:     `-VarDecl {{.*}} invalid b 'auto'
-  // CHECK-NEXT: `-CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-UnresolvedLookupExpr {{.*}} 'some_func'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:     `-UnresolvedLookupExpr {{.*}} 'invalid'
-  auto b = some_func(invalid());
-
-  decltype(ned);
-  // very bad initailizer: there is an unresolved typo expr internally, we just
-  // drop it.
-  // CHECK: `-VarDecl {{.*}} invalid unresolved_typo 'auto'
-  auto unresolved_typo = gned.*[] {};
-}
-
-// Verified that the generated call operator is invalid.
-// CHECK: |-CXXMethodDecl {{.*}} invalid operator() 'auto () const -> auto'
-using Escape = decltype([] { return undef(); }());
 
 // CHECK:      VarDecl {{.*}} NoCrashOnInvalidInitList
 // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors lvalue
@@ -301,56 +208,8 @@ void ValueCategory() {
   xvalue(); // call to a function (rvalue reference return type) yields an xvalue.
 }
 
-void InvalidCondition() {
-  // CHECK:      IfStmt {{.*}}
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} <col:7, col:15> '<dependent type>' contains-errors
-  // CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}} <col:7>
-  if (invalid()) {}
-
-  // CHECK:      WhileStmt {{.*}}
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} <col:10, col:18> '<dependent type>' contains-errors
-  // CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}} <col:10>
-  while (invalid()) {}
-
-  // CHECK:      SwitchStmt {{.*}}
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}} <col:10>
-  switch(invalid()) {
-    case 1:
-      break;
-  }
-  // FIXME: figure out why the type of ConditionalOperator is not int.
-  // CHECK:      ConditionalOperator {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}}
-  // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1
-  // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2
-  invalid() ? 1 : 2;
-}
-
 void CtorInitializer() {
   struct S{int m};
-  class MemberInit {
-    int x, y, z;
-    S s;
-    MemberInit() : x(invalid), y(invalid, invalid), z(invalid()), s(1,2) {}
-    // CHECK:      CXXConstructorDecl {{.*}} MemberInit 'void ()'
-    // CHECK-NEXT: |-CXXCtorInitializer Field {{.*}} 'x' 'int'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
-    // CHECK-NEXT: |-CXXCtorInitializer Field {{.*}} 'y' 'int'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   |-RecoveryExpr {{.*}} '<dependent type>'
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
-    // CHECK-NEXT: |-CXXCtorInitializer Field {{.*}} 'z' 'int'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
-    // CHECK-NEXT: |     `-UnresolvedLookupExpr {{.*}} '<overloaded function type>'
-    // CHECK-NEXT: |-CXXCtorInitializer Field {{.*}} 's' 'S'
-    // CHECK-NEXT: | `-RecoveryExpr {{.*}} 'S' contains-errors
-    // CHECK-NEXT: |   |-IntegerLiteral {{.*}} 1
-    // CHECK-NEXT: |   `-IntegerLiteral {{.*}} 2
-  };
   class BaseInit : S {
     BaseInit(float) : S("no match") {}
     // CHECK:      CXXConstructorDecl {{.*}} BaseInit 'void (float)'
@@ -358,13 +217,6 @@ void CtorInitializer() {
     // CHECK-NEXT: |-CXXCtorInitializer 'S'
     // CHECK-NEXT: | `-RecoveryExpr {{.*}} 'S'
     // CHECK-NEXT: |   `-StringLiteral
-
-    BaseInit(double) : S(invalid) {}
-    // CHECK:      CXXConstructorDecl {{.*}} BaseInit 'void (double)'
-    // CHECK-NEXT: |-ParmVarDecl
-    // CHECK-NEXT: |-CXXCtorInitializer 'S'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
   };
   class DelegatingInit {
     DelegatingInit(float) : DelegatingInit("no match") {}
@@ -373,13 +225,6 @@ void CtorInitializer() {
     // CHECK-NEXT: |-CXXCtorInitializer 'DelegatingInit'
     // CHECK-NEXT: | `-RecoveryExpr {{.*}} 'DelegatingInit'
     // CHECK-NEXT: |   `-StringLiteral
-
-    DelegatingInit(double) : DelegatingInit(invalid) {}
-    // CHECK:      CXXConstructorDecl {{.*}} DelegatingInit 'void (double)'
-    // CHECK-NEXT: |-ParmVarDecl
-    // CHECK-NEXT: |-CXXCtorInitializer 'DelegatingInit'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
   };
 }
 
@@ -423,65 +268,6 @@ void returnInitListFromVoid() {
   // CHECK-NEXT:   `-IntegerLiteral {{.*}} 'int' 8
 }
 
-void RecoveryExprForInvalidDecls(Unknown InvalidDecl) {
-  InvalidDecl + 1;
-  // CHECK:      BinaryOperator {{.*}}
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} '<dependent type>'
-  // CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'InvalidDecl' 'int'
-  // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1
-  InvalidDecl();
-  // CHECK:      CallExpr {{.*}}
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>'
-}
-
-void InitializerOfInvalidDecl() {
-  int ValidDecl;
-  Unkown InvalidDecl = ValidDecl;
-  // CHECK:      VarDecl {{.*}} invalid InvalidDecl
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   `-DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'ValidDecl'
-
-  Unknown InvalidDeclWithInvalidInit = Invalid;
-  // CHECK:      VarDecl {{.*}} invalid InvalidDeclWithInvalidInit
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NOT:    `-TypoExpr
-}
-
-void RecoverToAnInvalidDecl() {
-  Unknown* foo; // invalid decl
-  goo; // the typo was correct to the invalid foo.
-  // Verify that RecoveryExpr has an inner DeclRefExpr.
-  // CHECK:      RecoveryExpr {{.*}} '<dependent type>' contains-errors lvalue
-  // CHECK-NEXT: `-DeclRefExpr {{.*}} 'foo' 'int *'
-}
-
-void RecoveryToDoWhileStmtCond() {
-  // CHECK:       FunctionDecl {{.*}} RecoveryToDoWhileStmtCond
-  // CHECK:       `-DoStmt {{.*}}
-  // CHECK-NEXT:    |-CompoundStmt {{.*}}
-  // CHECK-NEXT:    `-BinaryOperator {{.*}} '<dependent type>' contains-errors '<'
-  // CHECK-NEXT:      |-BinaryOperator {{.*}} '<dependent type>' contains-errors '+'
-  // CHECK-NEXT:      | |-RecoveryExpr {{.*}} '<dependent type>' contains-errors lvalue
-  // CHECK-NEXT:      | `-IntegerLiteral {{.*}} 'int' 1
-  // CHECK-NEXT:      `-IntegerLiteral {{.*}} 'int' 10
-  do {} while (some_invalid_val + 1 < 10);
-}
-
-void RecoveryForStmtCond() {
-  // CHECK:FunctionDecl {{.*}} RecoveryForStmtCond
-  // CHECK-NEXT:`-CompoundStmt {{.*}}
-  // CHECK-NEXT:  `-ForStmt {{.*}}
-  // CHECK-NEXT:    |-DeclStmt {{.*}}
-  // CHECK-NEXT:    | `-VarDecl {{.*}}
-  // CHECK-NEXT:    |   `-IntegerLiteral {{.*}} <col:16> 'int' 0
-  // CHECK-NEXT:    |-<<<NULL>>>
-  // CHECK-NEXT:    |-RecoveryExpr {{.*}} 'bool' contains-errors
-  // CHECK-NEXT:    |-UnaryOperator {{.*}} 'int' lvalue prefix '++'
-  // CHECK-NEXT:    | `-DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'i' 'int'
-  // CHECK-NEXT:    `-CompoundStmt {{.*}}
-  for (int i = 0; i < invalid; ++i) {}
-}
-
 // Fix crash issue https://github.com/llvm/llvm-project/issues/112560.
 // Make sure clang compiles the following code without crashing:
 
diff --git a/clang/test/AST/ast-dump-recovery.m b/clang/test/AST/ast-dump-recovery.m
deleted file mode 100644
index 37fa8045c0b9..000000000000
--- a/clang/test/AST/ast-dump-recovery.m
+++ /dev/null
@@ -1,32 +0,0 @@
-// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -frecovery-ast -frecovery-ast-type -fblocks -ast-dump %s | FileCheck -strict-whitespace %s
-
-@interface Foo
-- (void)method:(int)n;
-@end
-
-void k(Foo *foo) {
-  // CHECK:       ObjCMessageExpr {{.*}} 'void' contains-errors
-  // CHECK-CHECK:  |-ImplicitCastExpr {{.*}} 'Foo *' <LValueToRValue>
-  // CHECK-CHECK:  | `-DeclRefExpr {{.*}} 'foo'
-  // CHECK-CHECK:  `-RecoveryExpr {{.*}}
-  [foo method:undef];
-
-  // CHECK:      ImplicitCastExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   `-DeclRefExpr {{.*}} 'foo'
-  foo.undef;
-}
-
-// CHECK:      |-VarDecl {{.*}} 'int (^)()' cinit
-// CHECK-NEXT: | `-RecoveryExpr {{.*}} '<dependent type> (^)(void)' contains-errors lvalue
-// CHECK-NEXT: |   `-BlockExpr {{.*}} '<dependent type> (^)(void)'
-// CHECK-NEXT: |     `-BlockDecl {{.*}} invalid
-int (^gh63863)() = ^() {
-  return undef;
-};
-
-// CHECK:      `-BlockExpr {{.*}} 'int (^)(int, int)'
-// CHECK-NEXT:   `-BlockDecl {{.*}} invalid
-int (^gh64005)(int, int) = ^(int, undefined b) {
-   return 1;
-};
diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp
index 8b84de0ab5a9..c9dce77b772d 100644
--- a/clang/test/CXX/drs/cwg1xx.cpp
+++ b/clang/test/CXX/drs/cwg1xx.cpp
@@ -702,8 +702,7 @@ namespace cwg141 { // cwg141: 3.1
     // cxx98-error@#cwg141-a {{lookup of 'S' in member access expression is ambiguous; using member of 'struct A'}}
     //   cxx98-note@#cwg141-A-S {{lookup in the object type 'struct A' refers here}}
     //   cxx98-note@#cwg141-S {{lookup from the current scope refers here}}
-    // expected-error@#cwg141-a {{no member named 'n' in 'cwg141::A::S<int>'; did you mean '::cwg141::S<int>::n'?}}
-    //   expected-note@#cwg141-S {{'::cwg141::S<int>::n' declared here}}
+    // expected-error@#cwg141-a {{no member named 'n' in 'cwg141::A::S<int>'}}
     // FIXME: we issue a useful diagnostic first, then some bogus ones.
     b.f<int>();
     // expected-error@-1 {{no member named 'f' in 'cwg141::B'}}
diff --git a/clang/test/CXX/drs/cwg26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp
index ab4d3695b6e2..60d896443ecd 100644
--- a/clang/test/CXX/drs/cwg26xx.cpp
+++ b/clang/test/CXX/drs/cwg26xx.cpp
@@ -220,7 +220,6 @@ int x = cwg2640_a\N{abc});
 int y = cwg2640_a\N{LOTUS});
 // expected-error@-1 {{character <U+1FAB7> not allowed in an identifier}}
 // expected-error@-2 {{use of undeclared identifier 'cwg2640_a🪷'}}
-// expected-error@-3 {{extraneous ')' before ';'}}
 } // namespace cwg2640
 
 // cwg2642: na
diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm
index d7d2b5992a23..6a2c67526c9a 100644
--- a/clang/test/CXX/module/basic/basic.link/p2.cppm
+++ b/clang/test/CXX/module/basic/basic.link/p2.cppm
@@ -51,7 +51,7 @@ void use_from_module_impl() {
   (void)external_linkage_var;
   (void)module_linkage_var;
 
-  (void)internal_linkage_class{}; // expected-error {{use of undeclared identifier 'internal_linkage_class'}} //expected-error{{}}
+  (void)internal_linkage_class{}; // expected-error {{use of undeclared identifier 'internal_linkage_class'}} // expected-note@* {{}}
   (void)internal_linkage_var; // expected-error {{use of undeclared identifier 'internal_linkage_var'}}
 }
 
@@ -64,7 +64,7 @@ void use_from_module_impl() {
   internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}}
   (void)external_linkage_class{};
   (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} // expected-note@* {{}}
-  (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}}
+  (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} // expected-note@* {{}}
   (void)external_linkage_var;
   (void)module_linkage_var; // expected-error {{undeclared identifier}}
   (void)internal_linkage_var; // expected-error {{undeclared identifier}}
diff --git a/clang/test/FixIt/typo.cpp b/clang/test/FixIt/typo.cpp
deleted file mode 100644
index e489fbbcaa1d..000000000000
--- a/clang/test/FixIt/typo.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: cp %s %t
-// RUN: not %clang_cc1 -fixit -x c++ %t
-// RUN: %clang_cc1 -fsyntax-only -pedantic -Werror -x c++ %t
-// RUN: grep test_string %t
-
-namespace std {
-  template<typename T> class basic_string { // expected-note 3{{'basic_string' declared here}}
-  public:
-    int find(const char *substr); // expected-note{{'find' declared here}}
-    static const int npos = -1; // expected-note{{'npos' declared here}}
-  };
-
-  typedef basic_string<char> string; // expected-note 2{{'string' declared here}}
-}
-
-namespace otherstd { // expected-note 2{{'otherstd' declared here}} \
-                     // expected-note{{namespace 'otherstd' defined here}}
-  using namespace std;
-}
-
-using namespace std;
-
-other_std::strng str1; // expected-error{{use of undeclared identifier 'other_std'; did you mean 'otherstd'?}} \
-// expected-error{{no type named 'strng' in namespace 'otherstd'; did you mean 'string'?}}
-tring str2; // expected-error{{unknown type name 'tring'; did you mean 'string'?}}
-
-::other_std::string str3; // expected-error{{no member named 'other_std' in the global namespace; did you mean 'otherstd'?}}
-
-float area(float radius, // expected-note{{'radius' declared here}}
-           float pi) {
-  return radious * pi; // expected-error{{did you mean 'radius'?}}
-}
-
-using namespace othestd; // expected-error{{no namespace named 'othestd'; did you mean 'otherstd'?}}
-namespace blargh = otherstd; // expected-note 3{{namespace 'blargh' defined here}}
-using namespace ::blarg; // expected-error{{no namespace named 'blarg' in the global namespace; did you mean 'blargh'?}}
-
-namespace wibble = blarg; // expected-error{{no namespace named 'blarg'; did you mean 'blargh'?}}
-namespace wobble = ::blarg; // expected-error{{no namespace named 'blarg' in the global namespace; did you mean 'blargh'?}}
-
-bool test_string(std::string s) {
-  basc_string<char> b1; // expected-error{{no template named 'basc_string'; did you mean 'basic_string'?}}
-  std::basic_sting<char> b2; // expected-error{{no template named 'basic_sting' in namespace 'std'; did you mean 'basic_string'?}}
-  (void)b1;
-  (void)b2;
-  return s.fnd("hello") // expected-error{{no member named 'fnd' in 'std::basic_string<char>'; did you mean 'find'?}}
-    == std::string::pos; // expected-error{{no member named 'pos' in 'std::basic_string<char>'; did you mean 'npos'?}}
-}
-
-struct Base { };
-struct Derived : public Base { // expected-note{{base class 'Base' specified here}}
-  int member; // expected-note 3{{'member' declared here}}
-
-  Derived() : base(), // expected-error{{initializer 'base' does not name a non-static data member or base class; did you mean the base class 'Base'?}}
-              ember() { } // expected-error{{initializer 'ember' does not name a non-static data member or base class; did you mean the member 'member'?}}
-
-  int getMember() const {
-    return ember; // expected-error{{use of undeclared identifier 'ember'; did you mean 'member'?}}
-  }
-
-  int &getMember();
-};
-
-int &Derived::getMember() {
-  return ember; // expected-error{{use of undeclared identifier 'ember'; did you mean 'member'?}}
-}
-
-typedef int Integer; // expected-note{{'Integer' declared here}}
-int global_value; // expected-note{{'global_value' declared here}}
-
-int foo() {
-  integer * i = 0; // expected-error{{unknown type name 'integer'; did you mean 'Integer'?}}
-  unsinged *ptr = 0; // expected-error{{use of undeclared identifier 'unsinged'; did you mean 'unsigned'?}}
-  return *i + *ptr + global_val; // expected-error{{use of undeclared identifier 'global_val'; did you mean 'global_value'?}}
-}
-
-namespace nonstd {
-  typedef std::basic_string<char> yarn; // expected-note 2 {{'nonstd::yarn' declared here}}
-  int narf; // expected-note{{'nonstd::narf' declared here}}
-}
-
-yarn str4; // expected-error{{unknown type name 'yarn'; did you mean 'nonstd::yarn'?}}
-wibble::yarn str5; // expected-error{{no type named 'yarn' in namespace 'otherstd'; did you mean 'nonstd::yarn'?}}
-
-namespace another {
-  template<typename T> class wide_string {}; // expected-note {{'another::wide_string' declared here}}
-}
-int poit() {
-  nonstd::basic_string<char> str; // expected-error{{no template named 'basic_string' in namespace 'nonstd'; did you mean simply 'basic_string'?}}
-  nonstd::wide_string<char> str2; // expected-error{{no template named 'wide_string' in namespace 'nonstd'; did you mean 'another::wide_string'?}}
-  return wibble::narf; // expected-error{{no member named 'narf' in namespace 'otherstd'; did you mean 'nonstd::narf'?}}
-}
-
-namespace check_bool {
-  void f() {
-    Bool b; // expected-error{{use of undeclared identifier 'Bool'; did you mean 'bool'?}}
-  }
-}
-
-namespace outr {
-}
-namespace outer {
-  namespace inner { // expected-note{{'outer::inner' declared here}} \
-                    // expected-note{{namespace 'outer::inner' defined here}} \
-                    // expected-note{{'inner' declared here}}
-    int i;
-  }
-}
-
-using namespace outr::inner; // expected-error{{no namespace named 'inner' in namespace 'outr'; did you mean 'outer::inner'?}}
-
-void func() {
-  outr::inner::i = 3; // expected-error{{no member named 'inner' in namespace 'outr'; did you mean 'outer::inner'?}}
-  outer::innr::i = 4; // expected-error{{no member named 'innr' in namespace 'outer'; did you mean 'inner'?}}
-}
-
-struct base {
-};
-struct derived : base {
-  int i;
-};
-
-void func2() {
-  derived d;
-  // FIXME: we should offer a fix here. We do if the 'i' is misspelled, but we don't do name qualification changes
-  //        to replace base::i with derived::i as we would for other qualified name misspellings.
-  // d.base::i = 3;
-}
-
-class A {
-  void bar(int);
-};
-void bar(int, int);  // expected-note{{'::bar' declared here}}
-void A::bar(int x) {
-  bar(x, 5);  // expected-error{{too many arguments to function call, expected 1, have 2; did you mean '::bar'?}}
-}
diff --git a/clang/test/Index/complete-switch.c b/clang/test/Index/complete-switch.c
deleted file mode 100644
index 4a7885459554..000000000000
--- a/clang/test/Index/complete-switch.c
+++ /dev/null
@@ -1,10 +0,0 @@
-void f() {
-  auto foo = bar;
-  switch(foo) {
-    case x:
-      break;
-  }
-}
-
-// RUN: not %clang_cc1 -fsyntax-only -fno-recovery-ast -code-completion-at=%s:4:10 %s | FileCheck %s -allow-empty
-// CHECK-NOT: COMPLETION: foo
diff --git a/clang/test/Index/fix-its.c b/clang/test/Index/fix-its.c
index 1e710c28afcc..8378fd9da9b4 100644
--- a/clang/test/Index/fix-its.c
+++ b/clang/test/Index/fix-its.c
@@ -1,27 +1,12 @@
-// RUN: c-index-test -test-load-source all -fspell-checking %s 2> %t  
+// RUN: c-index-test -test-load-source all -fspell-checking %s 2> %t
 // RUN: FileCheck %s < %t
-struct X {
-  int wibble;
-};
-
 #define MACRO(X) X
 
-void f(struct X *x) {
-  // CHECK: error: no member named 'wobble' in 'struct X'; did you mean 'wibble'?
-  // CHECK: FIX-IT: Replace [13:12 - 13:18] with "wibble"
-  // CHECK: note: 'wibble' declared here
-  MACRO(x->wobble = 17);
-  // CHECK: error: no member named 'wabble' in 'struct X'; did you mean 'wibble'?
-  // CHECK: FIX-IT: Replace [17:6 - 17:12] with "wibble"
-  // CHECK: note: 'wibble' declared here
-  x->wabble = 17;
-}
-
 int printf(const char *restrict, ...);
 
 void f2() {
   unsigned long index;
   // CHECK: warning: format specifies type 'int' but the argument has type 'unsigned long'
-  // CHECK: FIX-IT: Replace [26:17 - 26:19] with "%lu"
+  // CHECK: FIX-IT: Replace [11:17 - 11:19] with "%lu"
   MACRO(printf("%d", index));
 }
diff --git a/clang/test/Lexer/raw-string-ext.c b/clang/test/Lexer/raw-string-ext.c
index de318b616df7..8ed96e5c19f0 100644
--- a/clang/test/Lexer/raw-string-ext.c
+++ b/clang/test/Lexer/raw-string-ext.c
@@ -27,13 +27,13 @@
 // no-warning@* {{ignoring '-fno-raw-string-literals'}}
 
 void f() {
-  (void) R"foo()foo"; // unsupported-error {{use of undeclared identifier 'R'}} cxx-unsupported-error {{expected ';' after expression}}
-  (void) LR"foo()foo"; // unsupported-error {{use of undeclared identifier 'LR'}} cxx-unsupported-error {{expected ';' after expression}}
+  (void) R"foo()foo"; // unsupported-error {{use of undeclared identifier 'R'}}
+  (void) LR"foo()foo"; // unsupported-error {{use of undeclared identifier 'LR'}}
 
 #ifdef UNICODE
-  (void) uR"foo()foo"; // unsupported-error {{use of undeclared identifier 'uR'}} cxx-unsupported-error {{expected ';' after expression}}
-  (void) u8R"foo()foo"; // unsupported-error {{use of undeclared identifier 'u8R'}} cxx-unsupported-error {{expected ';' after expression}}
-  (void) UR"foo()foo"; // unsupported-error {{use of undeclared identifier 'UR'}} cxx-unsupported-error {{expected ';' after expression}}
+  (void) uR"foo()foo"; // unsupported-error {{use of undeclared identifier 'uR'}}
+  (void) u8R"foo()foo"; // unsupported-error {{use of undeclared identifier 'u8R'}}
+  (void) UR"foo()foo"; // unsupported-error {{use of undeclared identifier 'UR'}}
 #endif
 }
 
diff --git a/clang/test/Modules/diagnose-missing-import.m b/clang/test/Modules/diagnose-missing-import.m
index 8fb8e6b25f68..b34bc1a62b6b 100644
--- a/clang/test/Modules/diagnose-missing-import.m
+++ b/clang/test/Modules/diagnose-missing-import.m
@@ -7,11 +7,9 @@
 void foo(void) {
   XYZLogEvent(xyzRiskyCloseOpenParam, xyzRiskyCloseOpenParam); // expected-error {{call to undeclared function 'XYZLogEvent'; ISO C99 and later do not support implicit function declarations}} \
                                                                   expected-error {{declaration of 'XYZLogEvent' must be imported}} \
-                                                                  expected-error {{declaration of 'xyzRiskyCloseOpenParam' must be imported from module 'NCI.A'}} \
                                                                   expected-error {{declaration of 'xyzRiskyCloseOpenParam' must be imported from module 'NCI.A'}}
 }
 
-// expected-note@Inputs/diagnose-missing-import/a.h:5 {{declaration here is not visible}}
 // expected-note@Inputs/diagnose-missing-import/a.h:5 {{declaration here is not visible}}
 // expected-note@Inputs/diagnose-missing-import/a.h:6 {{declaration here is not visible}}
 
diff --git a/clang/test/OpenMP/begin_declare_variant_messages.c b/clang/test/OpenMP/begin_declare_variant_messages.c
index d8d8f4211678..8878188e7ceb 100644
--- a/clang/test/OpenMP/begin_declare_variant_messages.c
+++ b/clang/test/OpenMP/begin_declare_variant_messages.c
@@ -83,7 +83,7 @@ const int var;
 #pragma omp end declare variant
 #pragma omp begin declare variant match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<invalid>'); score ignored}}
 #pragma omp end declare variant
-#pragma omp begin declare variant match(device = {kind(score(ibm) }) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<recovery-expr>()'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
+#pragma omp begin declare variant match(device = {kind(score(ibm) }) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<invalid>'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp end declare variant
 #pragma omp begin declare variant match(device={kind(score(2 gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('2'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp end declare variant
diff --git a/clang/test/OpenMP/declare_reduction_messages.cpp b/clang/test/OpenMP/declare_reduction_messages.cpp
index 752cc4fb05a1..f91d952dfa14 100644
--- a/clang/test/OpenMP/declare_reduction_messages.cpp
+++ b/clang/test/OpenMP/declare_reduction_messages.cpp
@@ -69,7 +69,7 @@ class Class2 : public Class1<T> {
 #pragma omp declare reduction(fun77 : long : omp_out += omp_in) initializer(omp_priv Class2 < int > ()) // expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare reduction(fun8 : long : omp_out += omp_in) initializer(omp_priv 23)                 // expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare reduction(fun88 : long : omp_out += omp_in) initializer(omp_priv 23))               // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
-#pragma omp declare reduction(fun9 : long : omp_out += omp_priv) initializer(omp_in = 23)               // expected-error {{use of undeclared identifier 'omp_priv'; did you mean 'omp_in'?}} expected-note {{'omp_in' declared here}}
+#pragma omp declare reduction(fun9 : long : omp_out += omp_priv) initializer(omp_in = 23)               // expected-error {{use of undeclared identifier 'omp_priv'}}
 #pragma omp declare reduction(fun10 : long : omp_out += omp_in) initializer(omp_priv = 23)
 
 template <typename T>
diff --git a/clang/test/OpenMP/declare_variant_messages.c b/clang/test/OpenMP/declare_variant_messages.c
index 32e365cc415b..d1e36e5d1e7e 100644
--- a/clang/test/OpenMP/declare_variant_messages.c
+++ b/clang/test/OpenMP/declare_variant_messages.c
@@ -11,7 +11,7 @@ int foo(void);
 #pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}}
 #pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo // expected-error {{expected ')'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}} expected-note {{to match this '('}}
-#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} omp50-error {{expected 'match' clause on}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
+#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}}
 #pragma omp declare variant(foo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foo) xxx // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
@@ -42,7 +42,7 @@ int foo(void);
 #pragma omp declare variant(foo) match(device={kind(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo) match(device={kind()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}}
 #pragma omp declare variant(foo) match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<invalid>'); score ignored}}
-#pragma omp declare variant(foo) match(device = {kind(score(ibm) }) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<recovery-expr>()'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
+#pragma omp declare variant(foo) match(device = {kind(score(ibm) }) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<invalid>'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo) match(device={kind(score(2 gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('2'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo) match(device={kind(score(foo()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('foo()'); score ignored}} expected-warning {{'ibm' is not a valid context property for the context selector 'kind' and the context set 'device'; property ignored}} expected-note {{try 'match(implementation={vendor(ibm)})'}} expected-note {{the ignored property spans until here}}
 #pragma omp declare variant(foo) match(device={kind(score(5): host), kind(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'kind' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'kind' used here}} expected-note {{the ignored selector spans until here}}
@@ -56,7 +56,7 @@ int foo(void);
 #pragma omp declare variant(foo) match(target_device={device_num}) // expected-warning {{the context selector 'device_num' in context set 'target_device' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}}
 #pragma omp declare variant(foo) match(target_device={device_num()}) // expected-error {{expected expression}}
 #pragma omp declare variant(foo) match(target_device={device_num(-1)}) // expected-error {{argument to 'device_num' clause must be a non-negative integer value}}
-#pragma omp declare variant(foo) match(target_device={device_num(abc)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'abc'}}
+#pragma omp declare variant(foo) match(target_device={device_num(abc)}) // expected-error {{use of undeclared identifier 'abc'}}
 int bar(void);
 
 
diff --git a/clang/test/OpenMP/declare_variant_messages.cpp b/clang/test/OpenMP/declare_variant_messages.cpp
index 8eb37bc64cbc..06da8a8e5b05 100644
--- a/clang/test/OpenMP/declare_variant_messages.cpp
+++ b/clang/test/OpenMP/declare_variant_messages.cpp
@@ -16,7 +16,7 @@ T foofoo();
 #pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}}
 #pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo // expected-error {{expected ')'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}} expected-note {{to match this '('}}
-#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
+#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}}
 #pragma omp declare variant(foo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foofoo <int>) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foofoo <int>) xxx // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
@@ -57,7 +57,7 @@ int bar();
 #pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}}
 #pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foofoo <T> // expected-error {{expected ')'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}} expected-note {{to match this '('}}
-#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
+#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}}
 #pragma omp declare variant(foo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foofoo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foofoo <T>) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
diff --git a/clang/test/OpenMP/target_update_messages.cpp b/clang/test/OpenMP/target_update_messages.cpp
index 83191059202c..000cc80e513e 100644
--- a/clang/test/OpenMP/target_update_messages.cpp
+++ b/clang/test/OpenMP/target_update_messages.cpp
@@ -113,9 +113,11 @@ int main(int argc, char **argv) {
   // Check parsing with two modifiers.
   // lt51-warning@+1 {{missing ':' after ) - ignoring}}
   #pragma omp target update to(mapper(id), present: s)
-  // lt51-error@+3 {{use of undeclared identifier 'present'}}
-  // lt51-error@+2 {{use of undeclared identifier 'id'}}
-  // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+  // lt51-error@+5 {{use of undeclared identifier 'present'}}
+  // lt51-error@+4 {{use of undeclared identifier 'id'}}
+  // lt51-error@+3 {{expected ',' or ')' in 'to' clause}}
+  // lt51-error@+2 {{expected ')'}}
+  // lt51-note@+1 {{to match this '('}}
   #pragma omp target update to(present, mapper(id): s)
   // lt51-warning@+1 {{missing ':' after ) - ignoring}}
   #pragma omp target update to(mapper(id) present: s)
@@ -141,10 +143,9 @@ int main(int argc, char **argv) {
   #pragma omp target update to(present,,: s)
   // lt51-warning@+1 {{missing ':' after ) - ignoring}}
   #pragma omp target update to(mapper(id), present,: s)
-  // lt51-error@+4 {{use of undeclared identifier 'present'}}
-  // lt51-error@+3 {{use of undeclared identifier 'id'}}
-  // lt51-error@+2 {{expected expression}}
-  // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+  // lt51-error@+3 {{use of undeclared identifier 'present'}}
+  // lt51-error@+2 {{use of undeclared identifier 'id'}}
+  // lt51-error@+1 {{expected expression}}
   #pragma omp target update to(present, mapper(id),: s)
 
   #pragma omp target update from(m) allocate(m) // expected-error {{unexpected OpenMP clause 'allocate' in directive '#pragma omp target update'}}
diff --git a/clang/test/Parser/cxx1z-decomposition.cpp b/clang/test/Parser/cxx1z-decomposition.cpp
index 3e2526979be8..b7a8d30bd16c 100644
--- a/clang/test/Parser/cxx1z-decomposition.cpp
+++ b/clang/test/Parser/cxx1z-decomposition.cpp
@@ -3,7 +3,7 @@
 // RUN: %clang_cc1 -std=c++2c %s -triple x86_64-unknown-linux-gnu -verify=expected,cxx2c,post2b -fcxx-exceptions
 // RUN: not %clang_cc1 -std=c++17 %s -triple x86_64-unknown-linux-gnu -emit-llvm-only -fcxx-exceptions
 
-struct S { int a, b, c; };
+struct S { int a, b, c; }; // expected-note 2 {{'S::a' declared here}}
 
 // A simple-declaration can be a decompsition declaration.
 namespace SimpleDecl {
@@ -32,7 +32,7 @@ namespace ForRangeDecl {
 namespace OtherDecl {
   // A parameter-declaration is not a simple-declaration.
   // This parses as an array declaration.
-  void f(auto [a, b, c]); // cxx17-error {{'auto' not allowed in function prototype}} expected-error {{'a'}}
+  void f(auto [a, b, c]); // cxx17-error {{'auto' not allowed in function prototype}} expected-error 1+{{'a'}}
 
   void g() {
     // A condition is allowed as a Clang extension.
@@ -46,7 +46,7 @@ namespace OtherDecl {
 
     // An exception-declaration is not a simple-declaration.
     try {}
-    catch (auto [a, b, c]) {} // expected-error {{'auto' not allowed in exception declaration}} expected-error {{'a'}}
+    catch (auto [a, b, c]) {} // expected-error {{'auto' not allowed in exception declaration}} expected-error 1+{{'a'}}
   }
 
   // A member-declaration is not a simple-declaration.
diff --git a/clang/test/Parser/cxx1z-fold-expressions.cpp b/clang/test/Parser/cxx1z-fold-expressions.cpp
index 4a329646b799..d798a9cbb99b 100644
--- a/clang/test/Parser/cxx1z-fold-expressions.cpp
+++ b/clang/test/Parser/cxx1z-fold-expressions.cpp
@@ -37,14 +37,14 @@ template<int ...N> int bad12() { return (... N); } // expected-error {{expected
 
 template<typename ...T> void as_operand_of_cast(int a, T ...t) {
   return
-    (int)(a + ... + undeclared_junk) + // expected-error {{undeclared}} expected-error {{does not contain any unexpanded}}
+    (int)(a + ... + undeclared_junk) + // expected-error {{undeclared}}
     (int)(t + ... + undeclared_junk) + // expected-error {{undeclared}}
-    (int)(... + undeclared_junk) + // expected-error {{undeclared}} expected-error {{does not contain any unexpanded}}
+    (int)(... + undeclared_junk) + // expected-error {{undeclared}}
     (int)(undeclared_junk + ...) + // expected-error {{undeclared}}
     (int)(a + ...) + // expected-error {{does not contain any unexpanded}}
     (int)(a, ...) + // expected-error {{does not contain any unexpanded}}
     (int)(..., a) + // expected-error {{does not contain any unexpanded}}
-    (int)(a, ..., undeclared_junk) + // expected-error {{undeclared}} expected-error {{does not contain any unexpanded}}
+    (int)(a, ..., undeclared_junk) + // expected-error {{undeclared}}
     (int)(t, ...) +
     (int)(..., t) +
     (int)(t, ..., a);
diff --git a/clang/test/Parser/cxx2c-pack-indexing.cpp b/clang/test/Parser/cxx2c-pack-indexing.cpp
index 72e286322fa9..79069a86ea70 100644
--- a/clang/test/Parser/cxx2c-pack-indexing.cpp
+++ b/clang/test/Parser/cxx2c-pack-indexing.cpp
@@ -69,7 +69,8 @@ template <typename... T>
 requires( ); // expected-error {{expected expression}}
 struct SS {
     void f( ) {
-        (*p).~T...[](); // expected-error {{use of undeclared identifier 'p'}}
+        (*p).~T...[](); // expected-error {{use of undeclared identifier 'p'}} \
+                           expected-error {{undeclared identifier 'T' in destructor name}}
     }
 };
 }
diff --git a/clang/test/Parser/objc-foreach-syntax.m b/clang/test/Parser/objc-foreach-syntax.m
index 2158d8062f6c..1ff84f393b9f 100644
--- a/clang/test/Parser/objc-foreach-syntax.m
+++ b/clang/test/Parser/objc-foreach-syntax.m
@@ -21,6 +21,5 @@ MyList * el;
 
 
 static int test7(id keys) {
-  for (id key; in keys) ;  // expected-error {{use of undeclared identifier 'in'}} \
-                           // expected-error {{expected ';' in 'for' statement specifier}}
+  for (id key; in keys) ;  // expected-error {{use of undeclared identifier 'in'}}
 }
diff --git a/clang/test/Parser/opencl-atomics-cl20.cl b/clang/test/Parser/opencl-atomics-cl20.cl
index 2648142f28e7..2cd2c6ca133e 100644
--- a/clang/test/Parser/opencl-atomics-cl20.cl
+++ b/clang/test/Parser/opencl-atomics-cl20.cl
@@ -39,23 +39,17 @@ void atomic_types_test(void) {
 // expected-error@-11 {{use of undeclared identifier 'atomic_ulong'}}
 // expected-error@-11 {{use of undeclared identifier 'atomic_double'}}
 #if defined(LANG_VER_OK)
-// expected-error@-15 {{expected ';' after expression}}
-// expected-error@-16 {{use of undeclared identifier 'l'}}
-// expected-error@-16 {{expected ';' after expression}}
-// expected-error@-17 {{use of undeclared identifier 'ul'}}
 #endif
 #if !defined(LANG_VER_OK) || defined(__SPIR64__)
-// expected-error@-18 {{use of undeclared identifier 'atomic_size_t'}}
-// expected-error@-16 {{use of undeclared identifier 'atomic_ptrdiff_t'}}
+// expected-error@-14 {{use of undeclared identifier 'atomic_size_t'}}
+// expected-error@-12 {{use of undeclared identifier 'atomic_ptrdiff_t'}}
 #if !defined(LANG_VER_OK)
-// expected-error@-20 {{use of undeclared identifier 'atomic_intptr_t'}}
-// expected-error@-20 {{use of undeclared identifier 'atomic_uintptr_t'}}
+// expected-error@-16 {{use of undeclared identifier 'atomic_intptr_t'}}
+// expected-error@-16 {{use of undeclared identifier 'atomic_uintptr_t'}}
 #else
-// expected-error@-24 {{expected ';' after expression}}
-// expected-error@-25 {{use of undeclared identifier 's'}}
-// expected-error@-25 {{unknown type name 'atomic_intptr_t'; did you mean 'atomic_int'?}}
+// expected-error@-19 {{unknown type name 'atomic_intptr_t'; did you mean 'atomic_int'?}}
 // expected-note@* {{'atomic_int' declared here}}
-// expected-error@-26 {{unknown type name 'atomic_uintptr_t'; did you mean 'atomic_uint'?}}
+// expected-error@-20 {{unknown type name 'atomic_uintptr_t'; did you mean 'atomic_uint'?}}
 // expected-note@* {{'atomic_uint' declared here}}
 #endif
 #endif
diff --git a/clang/test/Parser/recovery.c b/clang/test/Parser/recovery.c
index 6fdbedffd236..0d86bd0608bf 100644
--- a/clang/test/Parser/recovery.c
+++ b/clang/test/Parser/recovery.c
@@ -11,7 +11,7 @@ float test2241[2] = {
 static void f (char * (*g) (char **, int), char **p, ...) {
   char *s;
   va_list v;                              // expected-error {{identifier}}
-  s = g (p, __builtin_va_arg(v, int));    // expected-error {{identifier}}
+  s = g (p, __builtin_va_arg(v, int));    // expected-error {{identifier}} expected-error {{extraneous ')' before ';'}}
 }
 
 
diff --git a/clang/test/Parser/switch-recovery.cpp b/clang/test/Parser/switch-recovery.cpp
index 7b3909e3b0d3..40712799933c 100644
--- a/clang/test/Parser/switch-recovery.cpp
+++ b/clang/test/Parser/switch-recovery.cpp
@@ -104,7 +104,7 @@ void test9(int x) { // expected-note {{'x' declared here}}
               expected-error {{expected expression}}
     8:: x; // expected-error {{expected ';' after expression}} \
               expected-error {{no member named 'x' in the global namespace; did you mean simply 'x'?}} \
-              expected-warning {{expression result unused}}
+              expected-warning 2 {{expression result unused}}
     9:: :y; // expected-error {{expected ';' after expression}} \
                expected-error {{expected unqualified-id}} \
                expected-warning {{expression result unused}}
diff --git a/clang/test/Parser/switch-typo-correction.cpp b/clang/test/Parser/switch-typo-correction.cpp
index ebf1c18f2b86..95d610b9cdd2 100644
--- a/clang/test/Parser/switch-typo-correction.cpp
+++ b/clang/test/Parser/switch-typo-correction.cpp
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 
-namespace c { double xxx; } // expected-note{{'c::xxx' declared here}}
+namespace c { double xxx; }
 namespace d { float xxx; }
 namespace z { namespace xxx {} }
 
 void crash() {
-  switch (xxx) {} // expected-error{{use of undeclared identifier 'xxx'; did you mean }}
+  switch (xxx) {} // expected-error{{use of undeclared identifier 'xxx'}}
 }
diff --git a/clang/test/ParserOpenACC/parse-cache-construct.cpp b/clang/test/ParserOpenACC/parse-cache-construct.cpp
index a5a1e58028c3..948f2e30f149 100644
--- a/clang/test/ParserOpenACC/parse-cache-construct.cpp
+++ b/clang/test/ParserOpenACC/parse-cache-construct.cpp
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 %s -verify -fopenacc
 
 namespace NS {
-  static char* NSArray;// expected-note{{declared here}}
-  static int NSInt;// expected-note 2{{declared here}}
+  static char* NSArray; // expected-note {{'NS::NSArray' declared here}}
+  static int NSInt;     // expected-note 2 {{'NS::NSInt' declared here}}
 }
 char *getArrayPtr();
 template<typename T, int I>
@@ -21,17 +21,17 @@ void func() {
   }
 
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+1{{use of undeclared identifier 'NSArray'; did you mean 'NS::NSArray'}}
+    // expected-error@+1{{use of undeclared identifier 'NSArray'}}
     #pragma acc cache(NSArray[NS::NSInt : NS::NSInt])
   }
 
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+1{{use of undeclared identifier 'NSInt'; did you mean 'NS::NSInt'}}
+    // expected-error@+1{{use of undeclared identifier 'NSInt'}}
     #pragma acc cache(NS::NSArray[NSInt : NS::NSInt])
   }
 
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+1{{use of undeclared identifier 'NSInt'; did you mean 'NS::NSInt'}}
+    // expected-error@+1{{use of undeclared identifier 'NSInt'}}
     #pragma acc cache(NS::NSArray[NS::NSInt : NSInt])
   }
 }
diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c
index 6d771e858d24..a9ad7ab176cb 100644
--- a/clang/test/ParserOpenACC/parse-clauses.c
+++ b/clang/test/ParserOpenACC/parse-clauses.c
@@ -347,9 +347,7 @@ void SelfUpdate() {
 #pragma acc update host(s) self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+3{{use of undeclared identifier 'zero'}}
-  // expected-error@+2{{expected ','}}
-  // expected-error@+1{{expected expression}}
+  // expected-error@+1{{use of undeclared identifier 'zero'}}
 #pragma acc update self(zero : s.array[s.value : 5], s.value), if_present
   for(int i = 0; i < 5;++i) {}
 
@@ -453,8 +451,6 @@ void VarListClauses() {
 #pragma acc parallel copy(always, alwaysin, always: HasMem.MemArr[3:]) self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+3{{use of undeclared identifier 'always'}}
-  // expected-error@+2{{use of undeclared identifier 'alwaysin'}}
   // expected-error@+1{{use of undeclared identifier 'always'}}
 #pragma acc parallel copy(always, alwaysin, always, HasMem.MemArr[3:]) self
   for(int i = 0; i < 5;++i) {}
@@ -591,8 +587,7 @@ void VarListClauses() {
 #pragma acc serial copyout(zero : s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'zero'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'zero'}}
 #pragma acc serial copyout(zero s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -608,8 +603,7 @@ void VarListClauses() {
 #pragma acc serial copyout(invalid:s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'invalid'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'invalid'}}
 #pragma acc serial copyout(invalid s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -657,8 +651,7 @@ void VarListClauses() {
 #pragma acc serial create(zero : s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'zero'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'zero'}}
 #pragma acc serial create(zero s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -674,8 +667,7 @@ void VarListClauses() {
 #pragma acc serial create(invalid:s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'invalid'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'invalid'}}
 #pragma acc serial create(invalid s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -700,8 +692,7 @@ void VarListClauses() {
 #pragma acc serial copyin(readonly : s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'readonly'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'readonly'}}
 #pragma acc serial copyin(readonly s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -717,8 +708,7 @@ void VarListClauses() {
 #pragma acc serial copyin(invalid:s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'invalid'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'invalid'}}
 #pragma acc serial copyin(invalid s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
diff --git a/clang/test/ParserOpenACC/parse-constructs.cpp b/clang/test/ParserOpenACC/parse-constructs.cpp
index 814f6a1fd09f..69b04bcbad9e 100644
--- a/clang/test/ParserOpenACC/parse-constructs.cpp
+++ b/clang/test/ParserOpenACC/parse-constructs.cpp
@@ -18,13 +18,13 @@ namespace NS {
 #pragma acc routine(NS::foo) seq
 
 // expected-error@+2{{use of undeclared identifier 'templ'; did you mean 'NS::templ'?}}
-// expected-error@+1{{OpenACC routine name 'NS::templ' names a set of overloads}}
+// expected-error@+1{{OpenACC routine name 'templ' names a set of overloads}}
 #pragma acc routine(templ) seq
 // expected-error@+1{{OpenACC routine name 'NS::templ' names a set of overloads}}
 #pragma acc routine(NS::templ) seq
 
 // expected-error@+2{{use of undeclared identifier 'templ'; did you mean 'NS::templ'?}}
-// expected-error@+1{{OpenACC routine name 'NS::templ' names a set of overloads}}
+// expected-error@+1{{OpenACC routine name 'templ<int>' names a set of overloads}}
 #pragma acc routine(templ<int>) seq
 // expected-error@+1{{OpenACC routine name 'NS::templ<int>' names a set of overloads}}
 #pragma acc routine(NS::templ<int>) seq
diff --git a/clang/test/ParserOpenACC/parse-wait-clause.c b/clang/test/ParserOpenACC/parse-wait-clause.c
index 16e31a67c094..5c006b4379a2 100644
--- a/clang/test/ParserOpenACC/parse-wait-clause.c
+++ b/clang/test/ParserOpenACC/parse-wait-clause.c
@@ -85,19 +85,16 @@ void func() {
   #pragma acc parallel wait (devnum: i + j:queues:) clause-list
     {}
 
-  // expected-error@+4{{use of undeclared identifier 'devnum'}}
-  // expected-error@+3{{expected ','}}
+  // expected-error@+3{{use of undeclared identifier 'devnum'}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc parallel wait (queues:devnum: i + j
     {}
 
-  // expected-error@+2{{expected ','}}
   // expected-error@+1{{use of undeclared identifier 'devnum'}}
   #pragma acc parallel wait (queues:devnum: i + j)
     {}
 
-  // expected-error@+3{{expected ','}}
   // expected-error@+2{{use of undeclared identifier 'devnum'}}
   // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait (queues:devnum: i + j) clause-list
diff --git a/clang/test/ParserOpenACC/parse-wait-construct.c b/clang/test/ParserOpenACC/parse-wait-construct.c
index 491c3bee4ac5..27a3a02dc263 100644
--- a/clang/test/ParserOpenACC/parse-wait-construct.c
+++ b/clang/test/ParserOpenACC/parse-wait-construct.c
@@ -68,18 +68,15 @@ void func() {
   // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc wait (devnum: i + j:queues:) clause-list
 
-  // expected-error@+4{{use of undeclared identifier 'devnum'}}
-  // expected-error@+3{{expected ','}}
+  // expected-error@+3{{use of undeclared identifier 'devnum'}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc wait (queues:devnum: i + j
 
-  // expected-error@+2{{use of undeclared identifier 'devnum'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'devnum'}}
   #pragma acc wait (queues:devnum: i + j)
 
-  // expected-error@+3{{use of undeclared identifier 'devnum'}}
-  // expected-error@+2{{expected ','}}
+  // expected-error@+2{{use of undeclared identifier 'devnum'}}
   // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc wait (queues:devnum: i + j) clause-list
 
diff --git a/clang/test/Sema/PR28181.c b/clang/test/Sema/PR28181.c
index 8d0a4ad33562..7e9d5cc91038 100644
--- a/clang/test/Sema/PR28181.c
+++ b/clang/test/Sema/PR28181.c
@@ -5,9 +5,9 @@ struct spinlock_t {
 } audit_skb_queue;
 
 void fn1(void) {
-  audit_skb_queue = (lock); // expected-error {{use of undeclared identifier 'lock'; did you mean 'long'?}}
-}                           // expected-error@-1 {{assigning to 'struct spinlock_t' from incompatible type '<overloaded function type>'}}
+  audit_skb_queue = (lock); // expected-error {{use of undeclared identifier 'lock'}}
+}
 
 void fn2(void) {
-  audit_skb_queue + (lock); // expected-error {{use of undeclared identifier 'lock'; did you mean 'long'?}}
-}                           // expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
+  audit_skb_queue + (lock); // expected-error {{use of undeclared identifier 'lock'}}
+}
diff --git a/clang/test/Sema/builtin-unary-fp.c b/clang/test/Sema/builtin-unary-fp.c
index fb8e341156a5..9bfcb30b9eba 100644
--- a/clang/test/Sema/builtin-unary-fp.c
+++ b/clang/test/Sema/builtin-unary-fp.c
@@ -17,5 +17,4 @@ void a(void) {
 
   check(__builtin_fpclassify(0,0,0,0,0, (invalid))); // expected-error{{use of undeclared identifier 'invalid'}}
   check(__builtin_fpclassify(0,0,0,0,0, (inf))); // expected-error{{use of undeclared identifier 'inf'}}
-                                                // expected-error@-1{{reference to overloaded function could not be resolved}}
 }
diff --git a/clang/test/Sema/c23-delayed-typo-correction-crashes.c b/clang/test/Sema/c23-delayed-typo-correction-crashes.c
new file mode 100644
index 000000000000..6afd3fd32c36
--- /dev/null
+++ b/clang/test/Sema/c23-delayed-typo-correction-crashes.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -std=c23 -fsyntax-only -verify %s
+
+void GH139913(...);
+void GH139913_test() {
+  GH139913(CONCAT(foo, )); // expected-error {{use of undeclared identifier 'CONCAT'}} \
+                              expected-error {{use of undeclared identifier 'foo'}} \
+                              expected-error {{expected expression}}
+}
+
+struct GH137867 {
+ char value;
+};
+void GH137867_test() {
+  _Atomic(struct GH137867) t;
+  while (!atomic_load(&t.value)->value) // expected-error {{use of undeclared identifier 'atomic_load'}} \
+                                           expected-error {{accessing a member of an atomic structure or union is undefined behavior}}
+    ;
+}
diff --git a/clang/test/Sema/delayed-typo-correction-crashes.c b/clang/test/Sema/delayed-typo-correction-crashes.c
new file mode 100644
index 000000000000..81c966789ccb
--- /dev/null
+++ b/clang/test/Sema/delayed-typo-correction-crashes.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -fsyntax-only -fblocks -ffixed-point -verify %s
+
+void GH137860_test(void) {
+  struct S {
+    char h;
+  };
+  _Atomic struct S s = { .h = UINT8_MIN }; // expected-error {{use of undeclared identifier 'UINT8_MIN'}}
+  __c11_atomic_fetch_add(&s.h, UINT8_MIN); // expected-error {{use of undeclared identifier 'UINT8_MIN'}} \
+                                              expected-error {{accessing a member of an atomic structure or union is undefined behavior}}
+}
+
+int (^GH69470) (int i, int j) = ^(int i, int j)
+{ return i / j; }/ j; // expected-error {{use of undeclared identifier 'j'}}
+
+void GH69874(void) {
+  *a = (a_struct){0}; // expected-error {{use of undeclared identifier 'a'}} \
+                         expected-error {{use of undeclared identifier 'a_struct'}}
+}
diff --git a/clang/test/Sema/invalid-member.cpp b/clang/test/Sema/invalid-member.cpp
index 57ee187ccf4d..0e3fec1b18ee 100644
--- a/clang/test/Sema/invalid-member.cpp
+++ b/clang/test/Sema/invalid-member.cpp
@@ -20,10 +20,12 @@ class Z {
 // Should be able to evaluate sizeof without crashing.
 static_assert(sizeof(Z) == 1, "No valid members");
 
-constexpr int N = undef; // expected-error {{use of undeclared identifier}}
+constexpr int N = undef; // expected-error {{use of undeclared identifier}} \
+                            expected-note {{declared here}}
 template<int a>
 class ABC {};
 class T {
-  ABC<N> abc;
+  ABC<N> abc; // expected-error {{non-type template argument is not a constant expression}} \
+                 expected-note {{initializer of 'N' is unknown}}
 };
 static_assert(sizeof(T) == 1, "No valid members");
diff --git a/clang/test/Sema/typo-correction-ambiguity.cpp b/clang/test/Sema/typo-correction-ambiguity.cpp
index 9dcff3d68c82..b2dae1d7696c 100644
--- a/clang/test/Sema/typo-correction-ambiguity.cpp
+++ b/clang/test/Sema/typo-correction-ambiguity.cpp
@@ -18,12 +18,12 @@ void testAmbiguousNoSuggestions()
 
 namespace MultipleCorrectionsButNotAmbiguous
 {
-  int PrefixType_Name(int value);  // expected-note {{'PrefixType_Name' declared here}}
+  int PrefixType_Name(int value);
   int PrefixType_MIN();
   int PrefixType_MAX();
 };
 
 int testMultipleCorrectionsButNotAmbiguous() {
-  int val = MultipleCorrectionsButNotAmbiguous::PrefixType_Enum(0);  // expected-error {{no member named 'PrefixType_Enum' in namespace 'MultipleCorrectionsButNotAmbiguous'; did you mean 'PrefixType_Name'?}}
+  int val = MultipleCorrectionsButNotAmbiguous::PrefixType_Enum(0);  // expected-error {{no member named 'PrefixType_Enum' in namespace 'MultipleCorrectionsButNotAmbiguous'}}
   return val;
 }
diff --git a/clang/test/Sema/typo-correction-no-hang.c b/clang/test/Sema/typo-correction-no-hang.c
index e6041704ff32..da234a2c7373 100644
--- a/clang/test/Sema/typo-correction-no-hang.c
+++ b/clang/test/Sema/typo-correction-no-hang.c
@@ -2,16 +2,15 @@
 
 // PR50797
 struct a {
-  int xxx; // expected-note {{'xxx' declared here}}
+  int xxx;
 };
 
 int g_107;
 int g_108;
 int g_109;
 
-struct a g_999; // expected-note 4{{'g_999' declared here}}
+struct a g_999;
 
-void b(void) { (g_910.xxx = g_910.xxx); } //expected-error 2{{use of undeclared identifier 'g_910'; did you mean 'g_999'}}
+void b(void) { (g_910.xxx = g_910.xxx); } //expected-error 2{{use of undeclared identifier 'g_910'}}
 
-void c(void) { (g_910.xxx = g_910.xxx1); } //expected-error 2{{use of undeclared identifier 'g_910'; did you mean 'g_999'}} \
-                                             expected-error {{no member named 'xxx1' in 'struct a'; did you mean 'xxx'}}
+void c(void) { (g_910.xxx = g_910.xxx1); } //expected-error 2{{use of undeclared identifier 'g_910'}}
diff --git a/clang/test/Sema/typo-correction-no-hang.cpp b/clang/test/Sema/typo-correction-no-hang.cpp
index 3c591645be25..34b8486bed90 100644
--- a/clang/test/Sema/typo-correction-no-hang.cpp
+++ b/clang/test/Sema/typo-correction-no-hang.cpp
@@ -8,10 +8,12 @@ struct rdar38642201 {
 
 void rdar38642201_callee(int x, int y);
 void rdar38642201_caller() {
-  struct rdar38642201 structVar;
+  struct rdar38642201 structVar;      //expected-note 2{{'structVar' declared here}}
   rdar38642201_callee(
-      structVar1.fieldName1.member1,  //expected-error{{use of undeclared identifier 'structVar1'}}
-      structVar2.fieldName2.member2); //expected-error{{use of undeclared identifier 'structVar2'}}
+      structVar1.fieldName1.member1,  //expected-error{{use of undeclared identifier 'structVar1'}} \
+                                        expected-error{{no member named 'fieldName1' in 'rdar38642201'}}
+      structVar2.fieldName2.member2); //expected-error{{use of undeclared identifier 'structVar2'}} \
+                                        expected-error{{no member named 'fieldName2' in 'rdar38642201'}}
 }
 
 // Similar reproducer.
@@ -20,7 +22,7 @@ public:
   int minut() const = delete;
   int hour() const = delete;
 
-  int longit() const; //expected-note{{'longit' declared here}}
+  int longit() const;
   int latit() const;
 };
 
@@ -35,6 +37,6 @@ int Foo(const B &b) {
 }
 
 int Bar(const B &b) {
-  return b.depar().longitude() + //expected-error{{no member named 'longitude' in 'A'; did you mean 'longit'?}}
+  return b.depar().longitude() + //expected-error{{no member named 'longitude' in 'A'}}
          b.depar().latitude();   //expected-error{{no member named 'latitude' in 'A'}}
 }
diff --git a/clang/test/Sema/typo-correction-recursive.cpp b/clang/test/Sema/typo-correction-recursive.cpp
index b39beb5493f6..a7d7127564b7 100644
--- a/clang/test/Sema/typo-correction-recursive.cpp
+++ b/clang/test/Sema/typo-correction-recursive.cpp
@@ -8,13 +8,13 @@
 class DeepClass
 {
 public:
-  void trigger() const;  // expected-note {{'trigger' declared here}}
+  void trigger() const;
 };
 
 class Y
 {
 public:
-  const DeepClass& getX() const { return m_deepInstance; }  // expected-note {{'getX' declared here}}
+  const DeepClass& getX() const { return m_deepInstance; }
 private:
   DeepClass m_deepInstance;
   int m_n;
@@ -23,7 +23,7 @@ private:
 class Z
 {
 public:
-  const Y& getY0() const { return m_y0; }  // expected-note {{'getY0' declared here}}
+  const Y& getY0() const { return m_y0; }
   const Y& getActiveY() const { return m_y0; }
 
 private:
@@ -35,9 +35,9 @@ Z z_obj;
 
 void testMultipleCorrections()
 {
-  z_obj.getY2().  // expected-error {{no member named 'getY2' in 'Z'; did you mean 'getY0'}}
-      getM().     // expected-error {{no member named 'getM' in 'Y'; did you mean 'getX'}}
-      triggee();  // expected-error {{no member named 'triggee' in 'DeepClass'; did you mean 'trigger'}}
+  z_obj.getY2().  // expected-error {{no member named 'getY2' in 'Z'}}
+      getM().
+      triggee();
 }
 
 void testNoCorrections()
@@ -53,19 +53,19 @@ struct A {
   C get_me_a_C();
 };
 struct B {
-  D get_me_a_D();  // expected-note {{'get_me_a_D' declared here}}
+  D get_me_a_D();
 };
 class Scope {
 public:
   A make_an_A();
-  B make_a_B();  // expected-note {{'make_a_B' declared here}}
+  B make_a_B();
 };
 
 Scope scope_obj;
 
 int testDiscardedCorrections() {
-  return scope_obj.make_an_E().  // expected-error {{no member named 'make_an_E' in 'Scope'; did you mean 'make_a_B'}}
-      get_me_a_Z().value;        // expected-error {{no member named 'get_me_a_Z' in 'B'; did you mean 'get_me_a_D'}}
+  return scope_obj.make_an_E().  // expected-error {{no member named 'make_an_E' in 'Scope'}}
+      get_me_a_Z().value;
 }
 
 class AmbiguousHelper {
@@ -120,13 +120,13 @@ int testDeepAmbiguity() {
 }
 
 struct Dog {
-  int age;  //expected-note{{'age' declared here}}
-  int size; //expected-note{{'size' declared here}}
+  int age;
+  int size;
 };
 
 int from_dog_years(int DogYears, int DogSize);
 int get_dog_years() {
   struct Dog doggo;
-  return from_dog_years(doggo.agee,   //expected-error{{no member named 'agee' in 'Dog'; did you mean 'age'}}
-                        doggo.sizee); //expected-error{{no member named 'sizee' in 'Dog'; did you mean 'size'}}
+  return from_dog_years(doggo.agee,   //expected-error{{no member named 'agee' in 'Dog'}}
+                        doggo.sizee); //expected-error{{no member named 'sizee' in 'Dog'}}
 }
diff --git a/clang/test/Sema/typo-correction.c b/clang/test/Sema/typo-correction.c
index 4157207a9ac4..510a67e725f9 100644
--- a/clang/test/Sema/typo-correction.c
+++ b/clang/test/Sema/typo-correction.c
@@ -50,10 +50,12 @@ void fn1(void) {
   cabs(errij);  // expected-error {{use of undeclared identifier 'errij'}}
 }
 
-extern long afunction(int);
+extern long afunction(int); // expected-note {{'afunction' declared here}} \
+                               expected-note {{passing argument to parameter here}}
 void fn2(void) {
   f(THIS_IS_AN_ERROR,       // expected-error {{use of undeclared identifier 'THIS_IS_AN_ERROR'}}
-    afunction(afunction_)); // expected-error {{use of undeclared identifier 'afunction_'}}
+    afunction(afunction_)); // expected-error {{use of undeclared identifier 'afunction_'}} \
+                               expected-error {{incompatible pointer to integer conversion passing 'long (int)' to parameter of type 'int'}}
 }
 
 int d = X ? d : L; // expected-error 2 {{use of undeclared identifier}}
@@ -94,22 +96,24 @@ struct rdar38642201 {
 
 void rdar38642201_callee(int x, int y);
 void rdar38642201_caller(void) {
-  struct rdar38642201 structVar;
+  struct rdar38642201 structVar;     // expected-note 2{{'structVar' declared here}}
   rdar38642201_callee(
-      structVar1.fieldName1.member1, //expected-error{{use of undeclared identifier 'structVar1'}}
-      structVar2.fieldName2.member2); //expected-error{{use of undeclared identifier 'structVar2'}}
+      structVar1.fieldName1.member1, //expected-error{{use of undeclared identifier 'structVar1'}} \
+                                       expected-error{{no member named 'fieldName1' in 'struct rdar38642201'}}
+      structVar2.fieldName2.member2); //expected-error{{use of undeclared identifier 'structVar2'}} \
+                                        expected-error{{no member named 'fieldName2' in 'struct rdar38642201'}}
 }
 
 void PR40286_g(int x, int y);
 void PR40286_h(int x, int y, int z);
-void PR40286_1(int the_value) {
-  PR40286_g(the_walue); // expected-error {{use of undeclared identifier 'the_walue'}}
+void PR40286_1(int the_value) { // expected-note {{'the_value' declared here}}
+  PR40286_g(the_walue, 0); // expected-error {{use of undeclared identifier 'the_walue'}}
 }
-void PR40286_2(int the_value) {
-  PR40286_h(the_value, the_walue); // expected-error {{use of undeclared identifier 'the_walue'}}
+void PR40286_2(int the_value) { // expected-note {{'the_value' declared here}}
+  PR40286_h(the_value, the_walue, 0); // expected-error {{use of undeclared identifier 'the_walue'}}
 }
-void PR40286_3(int the_value) {
-  PR40286_h(the_walue); // expected-error {{use of undeclared identifier 'the_walue'}}
+void PR40286_3(int the_value) { // expected-note {{'the_value' declared here}}
+  PR40286_h(the_walue, 0, 0); // expected-error {{use of undeclared identifier 'the_walue'}}
 }
 void PR40286_4(int the_value) { // expected-note {{'the_value' declared here}}
   PR40286_h(the_value, the_value, the_walue); // expected-error {{use of undeclared identifier 'the_walue'; did you mean 'the_value'?}}
diff --git a/clang/test/SemaCXX/arrow-operator.cpp b/clang/test/SemaCXX/arrow-operator.cpp
index 295dea3c1756..a789c4e36e4c 100644
--- a/clang/test/SemaCXX/arrow-operator.cpp
+++ b/clang/test/SemaCXX/arrow-operator.cpp
@@ -47,23 +47,22 @@ class wrapped_ptr {
  public:
   wrapped_ptr(T* ptr) : ptr_(ptr) {}
   T* operator->() { return ptr_; }
-  void Check(); // expected-note {{'Check' declared here}}
+  void Check();
  private:
   T *ptr_;
 };
 
 class Worker {
  public:
-  void DoSomething(); // expected-note {{'DoSomething' declared here}}
+  void DoSomething();
   void Chuck();
 };
 
 void test() {
   wrapped_ptr<Worker> worker(new Worker);
   worker.DoSomething(); // expected-error {{no member named 'DoSomething' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'; did you mean to use '->' instead of '.'?}}
-  worker.DoSamething(); // expected-error {{no member named 'DoSamething' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'; did you mean to use '->' instead of '.'?}} \
-                        // expected-error {{no member named 'DoSamething' in 'arrow_suggest::Worker'; did you mean 'DoSomething'?}}
-  worker.Chuck(); // expected-error {{no member named 'Chuck' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'; did you mean 'Check'?}}
+  worker.DoSamething(); // expected-error {{no member named 'DoSamething' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'}}
+  worker.Chuck(); // expected-error {{no member named 'Chuck' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'}}
 }
 
 } // namespace arrow_suggest
diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp
index eeeb58f1a771..ab4e50072f65 100644
--- a/clang/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp
@@ -1888,10 +1888,11 @@ namespace PR15884 {
 }
 
 namespace AfterError {
-  constexpr int error() {
+  constexpr int error() { // pre-cxx23-error {{no return statement in constexpr function}}
     return foobar; // expected-error {{undeclared identifier}}
-  }
-  constexpr int k = error(); // expected-error {{constexpr variable 'k' must be initialized by a constant expression}}
+  } // cxx23-note {{control reached end of constexpr function}}
+  constexpr int k = error(); // cxx23-error {{constexpr variable 'k' must be initialized by a constant expression}} \
+                                cxx23-note {{in call to 'error()'}}
 }
 
 namespace std {
diff --git a/clang/test/SemaCXX/conversion-function.cpp b/clang/test/SemaCXX/conversion-function.cpp
index b653a3bf1a1d..717c73c4786e 100644
--- a/clang/test/SemaCXX/conversion-function.cpp
+++ b/clang/test/SemaCXX/conversion-function.cpp
@@ -458,7 +458,7 @@ namespace PR18234 {
 #endif
   } a;
   A::S s = a; // expected-error {{no viable conversion from 'struct A' to 'A::S'}}
-  A::E e = a;
+  A::E e = a; // expected-note {{'e' declared here}}
   bool k1 = e == A::e; // expected-error {{no member named 'e'}}
   bool k2 = e.n == 0;
 }
diff --git a/clang/test/SemaCXX/coroutines.cpp b/clang/test/SemaCXX/coroutines.cpp
index 068fdab4bfe3..c9cefeb30c15 100644
--- a/clang/test/SemaCXX/coroutines.cpp
+++ b/clang/test/SemaCXX/coroutines.cpp
@@ -8,19 +8,16 @@
 // RUN: not %clang_cc1 -std=c++20 -fsyntax-only %s -fcxx-exceptions -fexceptions -Wunused-result 2>&1 | FileCheck %s
 
 void no_coroutine_traits_bad_arg_await() {
-  co_await a; // expected-error {{include <coroutine>}}
-  // expected-error@-1 {{use of undeclared identifier 'a'}}
+  co_await a; // expected-error {{use of undeclared identifier 'a'}}
 }
 
 void no_coroutine_traits_bad_arg_yield() {
-  co_yield a; // expected-error {{include <coroutine>}}
-  // expected-error@-1 {{use of undeclared identifier 'a'}}
+  co_yield a; // expected-error {{use of undeclared identifier 'a'}}
 }
 
 
 void no_coroutine_traits_bad_arg_return() {
-  co_return a; // expected-error {{include <coroutine>}}
-  // expected-error@-1 {{use of undeclared identifier 'a'}}
+  co_return a; // expected-error {{use of undeclared identifier 'a'}}
 }
 
 void no_coroutine_traits() {
@@ -208,8 +205,7 @@ void mixed_yield() {
 
 void mixed_yield_invalid() {
   co_yield blah; // expected-error {{use of undeclared identifier}}
-  // expected-note@-1 {{function is a coroutine due to use of 'co_yield'}}
-  return; // expected-error {{return statement not allowed in coroutine}}
+  return;
 }
 
 void mixed_yield_return_first(bool b) {
@@ -231,8 +227,7 @@ void mixed_return_for_range(bool b, T t) {
 template <class T>
 void mixed_yield_template(T) {
   co_yield blah; // expected-error {{use of undeclared identifier}}
-  // expected-note@-1 {{function is a coroutine due to use of 'co_yield'}}
-  return; // expected-error {{return statement not allowed in coroutine}}
+  return;
 }
 
 template <class T>
@@ -314,10 +309,9 @@ template void mixed_coreturn_template(void_tag, bool, int); // expected-note {{r
 template <class T>
 void mixed_coreturn_template2(bool b, T) {
   if (b)
-    co_return v; // expected-note {{use of 'co_return'}}
-    // expected-error@-1 {{use of undeclared identifier 'v'}}
+    co_return v; // expected-error {{use of undeclared identifier 'v'}}
   else
-    return; // expected-error {{not allowed in coroutine}}
+    return;
 }
 
 struct promise_handle;
diff --git a/clang/test/SemaCXX/cxx-delayed-typo-correction-crashes.cpp b/clang/test/SemaCXX/cxx-delayed-typo-correction-crashes.cpp
new file mode 100644
index 000000000000..f3aa05153281
--- /dev/null
+++ b/clang/test/SemaCXX/cxx-delayed-typo-correction-crashes.cpp
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+namespace GH138850 {
+void test() {
+  int tmp = add(int, 0, 0);    // expected-error {{expected '(' for function-style cast or type construction}} \
+                                  expected-note {{previous definition is here}}
+  uint tmp = add(uint, 1, 1);  // expected-error {{use of undeclared identifier 'uint'; did you mean 'int'?}} \
+                                  expected-error {{redefinition of 'tmp'}} \
+                                  expected-error {{use of undeclared identifier 'uint'}}
+  call(void, f, (int)tmp);     // expected-error {{expected '(' for function-style cast or type construction}} \
+                                  expected-error {{use of undeclared identifier 'f'}}
+}
+}
+
+namespace GH107840 {
+struct tm {};          // expected-note {{'tm' declared here}}
+
+auto getCache = [&] {  // expected-error {{non-local lambda expression cannot have a capture-default}}
+  ::foo([=] {          // expected-error {{no member named 'foo' in the global namespace}}
+    tms time;          // expected-error {{unknown type name 'tms'; did you mean 'tm'?}}
+    (void)time;
+  });
+};
+}
+
+namespace GH59391 {
+template <typename b> class c {
+  c(b);
+  b e;
+  void f() {
+    for (auto core : a::c(cores)) { // expected-error {{use of undeclared identifier 'cores'}} \
+                                       expected-error {{use of undeclared identifier 'a'}}
+    }
+  }
+};
+}
+
+namespace GH45915 {
+short g_volatile_ushort;                   // expected-note {{'g_volatile_ushort' declared here}}
+namespace a {
+   int b = l_volatile_uwchar.a ::c ::~d<>; // expected-error {{use of undeclared identifier 'l_volatile_uwchar'}} \
+                                              expected-error {{no member named 'd' in namespace 'GH45915::a'}}
+}
+}
+
+namespace GH45891 {
+int a = b.c < enum , > :: template ~d < > [ e; // expected-error {{use of undeclared identifier 'b'}} \
+                                                  expected-error {{expected identifier or '{'}} \
+                                                  expected-error {{expected ';' after top level declarator}}
+}
+
+namespace GH32903 {
+void
+B(
+  char cat_dog_3, char cat_dog_2, char cat_dog_1, char cat_dog_0, char pigeon_dog_3, char pigeon_dog_2,
+  char pigeon_dog_1, char pigeon_dog_0, short &elefant15_lion, short &elefant14_lion, short &elefant13_lion,       // expected-note 3 {{declared here}}
+  short &elefant12_lion, short &elefant11_lion, short &elefant10_lion, short &elefant9_lion, short &elefant8_lion, // expected-note 5 {{declared here}}
+  short &elefant7_lion, short &elefant6_lion, short &elefant5_lion, short &elefant4_lion, short &elefant3_lion,    // expected-note 2 {{declared here}}
+  short &elefant2_lion, short &elefant1_lion, short &elefant0_lion, char& no_animal)
+{
+
+    A(  // FIXME: it's surprising that we don't issue a "use of undeclared identifier" diagnostic for the call itself.
+        elefant_15_lion, elefant_14_lion, elefant_13_lion, elefant_12_lion, elefant_11_lion, elefant_10_lion, elefant_9_lion, // expected-error 7 {{use of undeclared identifier}}
+        elefant_8_lion, elefant_7_lion, elefant_6_lion, elefant_5_lion, elefant_4_lion, elefant_3_lion, elefant_2_lion,       // expected-error 7 {{use of undeclared identifier}}
+        elefant_1_lion, elefant_0_lion, no_animal, other_mammal);                                                             // expected-error 3 {{use of undeclared identifier}}
+}
+}
diff --git a/clang/test/SemaCXX/cxx1z-decomposition.cpp b/clang/test/SemaCXX/cxx1z-decomposition.cpp
index 95c64bc3b8bf..6ee1249a66c3 100644
--- a/clang/test/SemaCXX/cxx1z-decomposition.cpp
+++ b/clang/test/SemaCXX/cxx1z-decomposition.cpp
@@ -121,7 +121,8 @@ void for_range() {
 }
 
 int error_recovery() {
-  auto [foobar]; // expected-error {{requires an initializer}}
+  auto [foobar]; // expected-error {{requires an initializer}} \
+                    expected-note {{'foobar' declared here}}
   return foobar_; // expected-error {{undeclared identifier 'foobar_'}}
 }
 
diff --git a/clang/test/SemaCXX/cxx20-delayed-typo-correction-crashes.cpp b/clang/test/SemaCXX/cxx20-delayed-typo-correction-crashes.cpp
new file mode 100644
index 000000000000..a16a7f8255f7
--- /dev/null
+++ b/clang/test/SemaCXX/cxx20-delayed-typo-correction-crashes.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s
+
+#include "Inputs/std-coroutine.h"
+
+namespace GH58172 {
+template<typename Fn>
+int f2(int, Fn&&)
+{
+  return 0;
+}
+
+int f1()
+{
+  return f2(v1, []() -> task<int> {   // expected-error {{no template named 'task'}} \
+                                         expected-error {{use of undeclared identifier 'v1'}}
+    co_return v2;                     // expected-error {{use of undeclared identifier 'v2'}}
+  });
+}
+}
diff --git a/clang/test/SemaCXX/cxx2a-adl-only-template-id.cpp b/clang/test/SemaCXX/cxx2a-adl-only-template-id.cpp
index 5c0d89d9125f..1bc7f2cce3c9 100644
--- a/clang/test/SemaCXX/cxx2a-adl-only-template-id.cpp
+++ b/clang/test/SemaCXX/cxx2a-adl-only-template-id.cpp
@@ -61,7 +61,7 @@ struct A : X<int> { // expected-error {{no template named 'X'}}
 // Similarly for treating overload sets of functions as template names.
 struct g<int> {}; // expected-error {{'g' refers to a function template}}
 g<int>::Y xy; // expected-error {{no template named 'g'}} FIXME lies
-void xf(g<int> x); // expected-error {{variable has incomplete type 'void'}} expected-error 1+{{}} expected-note {{}}
+void xf(g<int> x); // expected-error {{variable has incomplete type 'void'}} expected-error 1+{{}}
 struct B : g<int> { // expected-error {{expected class name}}
   B() : g<int>() {} // expected-error {{expected class member or base class name}}
 };
diff --git a/clang/test/SemaCXX/destructor.cpp b/clang/test/SemaCXX/destructor.cpp
index ed4802943ad3..b9e0b17d510a 100644
--- a/clang/test/SemaCXX/destructor.cpp
+++ b/clang/test/SemaCXX/destructor.cpp
@@ -553,14 +553,11 @@ namespace crash_on_invalid_base_dtor {
 struct Test {
   virtual ~Test();
 };
-struct Baz : public Test { // expected-warning {{non-virtual destructor}}
+struct Baz : public Test {
   Baz() {}
-  ~Baz() = defaul; // expected-error {{undeclared identifier 'defaul'}} \
-                   // expected-error {{initializer on function}} \
-                   // expected-note {{overridden virtual function is here}}
+  ~Baz() = defaul; // expected-error {{undeclared identifier 'defaul'}}
 };
-struct Foo : public Baz { // expected-error {{cannot override a non-deleted function}} \
-                          // expected-note {{destructor of 'Foo' is implicitly deleted}}
+struct Foo : public Baz {
   Foo() {}
 };
 }
@@ -579,11 +576,9 @@ static_assert(!__is_trivially_constructible(Foo, Foo &&), "");
 
 namespace GH97230 {
 struct X {
-  ~X() = defaul; // expected-error {{initializer on function does not look like a pure-specifier}} \
-                 // expected-error {{use of undeclared identifier 'defaul'}}
+  ~X() = defaul; // expected-error {{use of undeclared identifier 'defaul'}}
 };
-struct Y : X {} y1{ }; // expected-error {{call to implicitly-deleted default constructor of 'struct Y'}} \
-                       // expected-note {{default constructor of 'Y' is implicitly deleted because base class 'X' has no destructor}}
+struct Y : X {} y1{ };
 }
 
 namespace GH121706 {
diff --git a/clang/test/SemaCXX/invalid-if-constexpr.cpp b/clang/test/SemaCXX/invalid-if-constexpr.cpp
index 0007f2739cbb..9f2774187148 100644
--- a/clang/test/SemaCXX/invalid-if-constexpr.cpp
+++ b/clang/test/SemaCXX/invalid-if-constexpr.cpp
@@ -2,12 +2,16 @@
 
 namespace GH61885 {
 void similar() { // expected-note {{'similar' declared here}}
-  if constexpr (similer<>) {} // expected-error {{use of undeclared identifier 'similer'; did you mean 'similar'?}}
+  if constexpr (similer<>) {} // expected-error {{use of undeclared identifier 'similer'; did you mean 'similar'?}} \
+                                 expected-warning {{address of function 'similar<>' will always evaluate to 'true'}} \
+                                 expected-note {{prefix with the address-of operator to silence this warning}}
 }
-void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'; did you mean '__sync_swap'?}}
+void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'}}
 
 int AA() { return true;} // expected-note {{'AA' declared here}}
 
-void b() { if constexpr (AAA<>) {}} // expected-error {{use of undeclared identifier 'AAA'; did you mean 'AA'?}}
+void b() { if constexpr (AAA<>) {}} // expected-error {{use of undeclared identifier 'AAA'; did you mean 'AA'?}} \
+                                       expected-warning {{address of function 'AA<>' will always evaluate to 'true'}} \
+                                       expected-note {{prefix with the address-of operator to silence this warning}}
 }
 
diff --git a/clang/test/SemaCXX/member-expr.cpp b/clang/test/SemaCXX/member-expr.cpp
index 0596e40f6c2f..902b09097a12 100644
--- a/clang/test/SemaCXX/member-expr.cpp
+++ b/clang/test/SemaCXX/member-expr.cpp
@@ -96,11 +96,11 @@ namespace test5 {
 namespace PR7508 {
   struct A {
     struct CleanupScope {};
-    void PopCleanupBlock(); // expected-note{{'PopCleanupBlock' declared here}}
+    void PopCleanupBlock();
   };
 
   void foo(A &a) {
-    a.PopCleanupScope(); // expected-error{{no member named 'PopCleanupScope' in 'PR7508::A'; did you mean 'PopCleanupBlock'?}}
+    a.PopCleanupScope(); // expected-error{{no member named 'PopCleanupScope' in 'PR7508::A'}}
   }
 }
 
@@ -189,7 +189,7 @@ namespace PR15045 {
   }
 
   struct bar {
-    void func();  // expected-note {{'func' declared here}}
+    void func();
   };
 
   struct foo {
@@ -207,7 +207,7 @@ namespace PR15045 {
 
     // Show that recovery has happened by also triggering typo correction
     e->Func();  // expected-error {{member reference type 'bar' is not a pointer; did you mean to use '.'?}} \
-                // expected-error {{no member named 'Func' in 'PR15045::bar'; did you mean 'func'?}}
+                // expected-error {{no member named 'Func' in 'PR15045::bar'}}
 
     // Make sure a fixit isn't given in the case that the '->' isn't actually
     // the problem (the problem is with the return value of an operator->).
diff --git a/clang/test/SemaCXX/nested-name-spec.cpp b/clang/test/SemaCXX/nested-name-spec.cpp
index 36398aed7ac5..abeaba9d8dde 100644
--- a/clang/test/SemaCXX/nested-name-spec.cpp
+++ b/clang/test/SemaCXX/nested-name-spec.cpp
@@ -409,7 +409,8 @@ T1<C2::N1> var_1a;
 T1<C2:N1> var_1b;  // expected-error{{unexpected ':' in nested name specifier; did you mean '::'?}}
 template<int N> int F() {}
 int (*X1)() = (B1::B2 ? F<1> : F<2>);
-int (*X2)() = (B1:B2 ? F<1> : F<2>);  // expected-error{{unexpected ':' in nested name specifier; did you mean '::'?}}
+int (*X2)() = (B1:B2 ? F<1> : F<2>);  // expected-error{{unexpected ':' in nested name specifier; did you mean '::'?}} \
+                                         expected-note{{'PR18587::X2' declared here}}
 
 // Bit fields + templates
 struct S7a {
@@ -445,7 +446,8 @@ namespace PR16951 {
 
   int x4 = enumerator_2::ENUMERATOR_2; // expected-warning{{use of enumeration in a nested name specifier is a C++11 extension}}
   int x5 = enumerator_2::X2; // expected-warning{{use of enumeration in a nested name specifier is a C++11 extension}} \
-                             // expected-error{{no member named 'X2' in 'PR16951::enumerator_2'}}
+                             // expected-error{{no member named 'X2' in 'PR16951::enumerator_2'}} \
+                             // expected-error{{cannot initialize a variable of type 'int' with an lvalue of type 'int (*)()'}}
 
 }
 
diff --git a/clang/test/SemaCXX/pr13394-crash-on-invalid.cpp b/clang/test/SemaCXX/pr13394-crash-on-invalid.cpp
deleted file mode 100644
index 304ee92f6a8d..000000000000
--- a/clang/test/SemaCXX/pr13394-crash-on-invalid.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
-// Don't crash (PR13394).
-
-namespace stretch_v1 {
-  struct closure_t {
-    const stretch_v1::ops_t* d_methods; // expected-error {{no type named 'ops_t' in namespace 'stretch_v1'}}
-  };
-}
-namespace gatekeeper_v1 {
-  namespace gatekeeper_factory_v1 {
-    struct closure_t { // expected-note {{'closure_t' declared here}} expected-note {{'gatekeeper_factory_v1::closure_t' declared here}}
-      gatekeeper_v1::closure_t* create(); // expected-error {{no type named 'closure_t' in namespace 'gatekeeper_v1'; did you mean simply 'closure_t'?}}
-    };
-  }
-  // FIXME: Typo correction should remove the 'gatekeeper_v1::' name specifier
-  gatekeeper_v1::closure_t *x; // expected-error {{no type named 'closure_t' in namespace 'gatekeeper_v1'; did you mean 'gatekeeper_factory_v1::closure_t'}}
-}
-
-namespace Foo {
-struct Base {
-  void Bar() {} // expected-note{{'Bar' declared here}}
-};
-}
-
-struct Derived : public Foo::Base {
-  void test() {
-    Foo::Bar(); // expected-error{{no member named 'Bar' in namespace 'Foo'; did you mean simply 'Bar'?}}
-  }
-};
diff --git a/clang/test/SemaCXX/return.cpp b/clang/test/SemaCXX/return.cpp
index 17d7892d8dbd..796c9ae91ded 100644
--- a/clang/test/SemaCXX/return.cpp
+++ b/clang/test/SemaCXX/return.cpp
@@ -130,5 +130,5 @@ void cxx_unresolved_expr() {
   // CXXUnresolvedConstructExpr, and the missing ')' gives it an invalid source
   // location for its rparen.  Check that emitting a diag on the range of the
   // expr doesn't assert.
-  return int(undeclared, 4; // expected-error {{expected ')'}} expected-note{{to match this '('}} expected-error {{use of undeclared identifier 'undeclared'}}
+  return int(undeclared, 4; // expected-error {{use of undeclared identifier 'undeclared'}}
 }
diff --git a/clang/test/SemaCXX/typo-correction-crash.cpp b/clang/test/SemaCXX/typo-correction-crash.cpp
index 2a77c9df505e..434b70e3c509 100644
--- a/clang/test/SemaCXX/typo-correction-crash.cpp
+++ b/clang/test/SemaCXX/typo-correction-crash.cpp
@@ -4,10 +4,10 @@ auto check1() {
   return s; // expected-error {{use of undeclared identifier 's'}}
 }
 
-int test = 11; // expected-note 2 {{'test' declared here}}
+int test = 11; // expected-note 3 {{'test' declared here}}
 auto check2() {
   return "s";
-  return tes; // expected-error {{use of undeclared identifier 'tes'; did you mean 'test'?}}
+  return tes; // expected-error {{use of undeclared identifier 'tes'}}
               // expected-error@-1 {{deduced as 'int' here but deduced as 'const char *' in earlier}}
 }
 
@@ -16,9 +16,8 @@ template <class A> struct is_same<A,A> { static constexpr bool value = true; };
 
 auto L1 = [] { return s; }; // expected-error {{use of undeclared identifier 's'}}
 using T1 = decltype(L1());
-// FIXME: Suppress the 'undeclared identifier T1' diagnostic, the UsingDecl T1 is discarded because of an invalid L1().
-static_assert(is_same<T1, void>::value, "Return statement should be discarded"); // expected-error {{use of undeclared identifier 'T1'}}
-auto L2 = [] { return tes; }; // expected-error {{use of undeclared identifier 'tes'; did you mean 'test'?}}
+static_assert(is_same<T1, void>::value, "Return statement should be discarded");
+auto L2 = [] { return tes; }; // expected-error {{use of undeclared identifier 'tes'}}
 using T2 = decltype(L2());
 static_assert(is_same<T2, int>::value, "Return statement was corrected");
 
@@ -32,13 +31,13 @@ FooRecord::NestedNamespace::type x; // expected-error {{no member named 'NestedN
 
 void cast_expr(int g) { +int(n)(g); } // expected-error {{undeclared identifier 'n'}}
 
-void bind() { for (const auto& [test,_] : _test_) { }; } // expected-error {{undeclared identifier '_test_'}}
+void bind() { for (const auto& [test,_] : _test_) { }; } // expected-error {{undeclared identifier '_test_'}} \
+                                                            expected-error {{invalid range expression of type 'int'; no viable 'begin' function available}}
 
 namespace NoCrash {
 class S {
   void Function(int a) {
-    unknown1(unknown2, Function, unknown3); // expected-error 2{{use of undeclared identifier}} \
-                                               expected-error {{reference to non-static member function must be called}}
+    unknown1(unknown2, Function, unknown3); // expected-error 2{{use of undeclared identifier}}
   }
 };
 }
@@ -46,8 +45,6 @@ class S {
 namespace NoCrashOnCheckArgAlignment {
 template <typename a> void b(a &);
 void test() {
-  for (auto file_data :b(files_db_data)); // expected-error {{use of undeclared identifier 'files_db_data'; did you mean 'file_data'?}} \
-                                          // expected-note {{'file_data' declared here}} \
-                                          // expected-error {{cannot use type 'void' as a range}}
+  for (auto file_data :b(files_db_data)); // expected-error {{use of undeclared identifier 'files_db_data'}}
 }
 }
diff --git a/clang/test/SemaCXX/typo-correction-cxx11.cpp b/clang/test/SemaCXX/typo-correction-cxx11.cpp
index 8c588203cc12..9eb5f9c29962 100644
--- a/clang/test/SemaCXX/typo-correction-cxx11.cpp
+++ b/clang/test/SemaCXX/typo-correction-cxx11.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
 
 namespace PR23186 {
-decltype(ned);  // expected-error-re {{use of undeclared identifier 'ned'{{$}}}}
+decltype(ned);  // expected-error {{use of undeclared identifier 'ned'}}
 // The code below was triggering an UNREACHABLE in ASTContext::getTypeInfoImpl
 // once the above code failed to recover properly after making the bogus
 // correction of 'ned' to 'new'.
@@ -19,8 +19,9 @@ struct S {
 namespace PR23140 {
 auto lneed = gned.*[] {};  // expected-error-re {{use of undeclared identifier 'gned'{{$}}}}
 
-void test(int aaa, int bbb, int thisvar) {  // expected-note {{'thisvar' declared here}}
-  int thatval = aaa * (bbb + thatvar);  // expected-error {{use of undeclared identifier 'thatvar'; did you mean 'thisvar'?}}
+void test(int aaa, int bbb, int thisvar) {
+  int thatval = aaa * (bbb + thatvar);  // expected-error {{use of undeclared identifier 'thatvar'; did you mean 'thatval'}} \
+                                           expected-note {{'thatval' declared here}}
 }
 }
 
@@ -54,7 +55,7 @@ void run(A *annotations) {
 
   auto &annotation = *annotations;
   auto new_it = new_annotations.find(5);
-  auto &new_anotation = new_it.second;  // expected-note {{'new_anotation' declared here}}
-  new_annotation->Swap(&annotation);  // expected-error {{use of undeclared identifier 'new_annotation'; did you mean 'new_anotation'?}}
+  auto &new_anotation = new_it.second;
+  new_annotation->Swap(&annotation);  // expected-error {{use of undeclared identifier 'new_annotation'}}
 }
 }
diff --git a/clang/test/SemaCXX/typo-correction-delayed.cpp b/clang/test/SemaCXX/typo-correction-delayed.cpp
deleted file mode 100644
index fdb1f740fda6..000000000000
--- a/clang/test/SemaCXX/typo-correction-delayed.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -Wno-c++11-extensions %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 -Wno-c++11-extensions %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
-
-struct A {};
-struct B {};
-struct D {
-  A fizbin;  // expected-note 2 {{declared here}}
-  A foobar;  // expected-note 2 {{declared here}}
-  B roxbin;  // expected-note 2 {{declared here}}
-  B toobad;  // expected-note 2 {{declared here}}
-  void BooHoo();
-  void FoxBox();
-};
-
-void something(A, B);
-void test() {
-  D obj;
-  something(obj.fixbin,   // expected-error {{did you mean 'fizbin'?}}
-            obj.toobat);  // expected-error {{did you mean 'toobad'?}}
-  something(obj.toobat,   // expected-error {{did you mean 'foobar'?}}
-            obj.fixbin);  // expected-error {{did you mean 'roxbin'?}}
-  something(obj.fixbin,   // expected-error {{did you mean 'fizbin'?}}
-            obj.fixbin);  // expected-error {{did you mean 'roxbin'?}}
-  something(obj.toobat,   // expected-error {{did you mean 'foobar'?}}
-            obj.toobat);  // expected-error {{did you mean 'toobad'?}}
-  // Both members could be corrected to methods, but that isn't valid.
-  something(obj.boohoo,   // expected-error-re {{no member named 'boohoo' in 'D'{{$}}}}
-            obj.foxbox);  // expected-error-re {{no member named 'foxbox' in 'D'{{$}}}}
-  // The first argument has a usable correction but the second doesn't.
-  something(obj.boobar,   // expected-error-re {{no member named 'boobar' in 'D'{{$}}}}
-            obj.foxbox);  // expected-error-re {{no member named 'foxbox' in 'D'{{$}}}}
-}
-
-// Ensure the delayed typo correction does the right thing when trying to
-// recover using a seemingly-valid correction for which a valid expression to
-// replace the TypoExpr cannot be created (but which does have a second
-// correction candidate that would be a valid and usable correction).
-class Foo {
-public:
-  template <> void testIt();  // expected-error {{no function template matches}}
-  void textIt();  // expected-note {{'textIt' declared here}}
-};
-void testMemberExpr(Foo *f) {
-  f->TestIt();  // expected-error {{no member named 'TestIt' in 'Foo'; did you mean 'textIt'?}}
-}
-
-void callee(double, double);
-void testNoCandidates() {
-  callee(xxxxxx,   // expected-error-re {{use of undeclared identifier 'xxxxxx'{{$}}}}
-         zzzzzz);  // expected-error-re {{use of undeclared identifier 'zzzzzz'{{$}}}}
-}
-
-class string {};
-struct Item {
-  void Nest();
-  string text();
-  Item* next();  // expected-note {{'next' declared here}}
-};
-void testExprFilter(Item *i) {
-  Item *j;
-  j = i->Next();  // expected-error {{no member named 'Next' in 'Item'; did you mean 'next'?}}
-}
-
-// Test that initializer expressions are handled correctly and that the type
-// being initialized is taken into account when choosing a correction.
-namespace initializerCorrections {
-struct Node {
-  string text() const;
-  // Node* Next() is not implemented yet
-};
-void f(Node *node) {
-  // text is only an edit distance of 1 from Next, but would trigger type
-  // conversion errors if used in this initialization expression.
-  Node *next = node->Next();  // expected-error-re {{no member named 'Next' in 'initializerCorrections::Node'{{$}}}}
-}
-
-struct LinkedNode {
-  LinkedNode* next();  // expected-note {{'next' declared here}}
-  string text() const;
-};
-void f(LinkedNode *node) {
-  // text and next are equidistant from Next, but only one results in a valid
-  // initialization expression.
-  LinkedNode *next = node->Next();  // expected-error {{no member named 'Next' in 'initializerCorrections::LinkedNode'; did you mean 'next'?}}
-}
-
-struct NestedNode {
-  NestedNode* Nest();
-  NestedNode* next();
-  string text() const;
-};
-void f(NestedNode *node) {
-  // There are two equidistant, usable corrections for Next: next and Nest
-  NestedNode *next = node->Next();  // expected-error-re {{no member named 'Next' in 'initializerCorrections::NestedNode'{{$}}}}
-}
-}
-
-namespace PR21669 {
-void f(int *i) {
-  // Check that arguments to a builtin with custom type checking are corrected
-  // properly, since calls to such builtins bypass much of the normal code path
-  // for building and checking the call.
-  __atomic_load(i, i, something_something);  // expected-error-re {{use of undeclared identifier 'something_something'{{$}}}}
-}
-}
-
-const int DefaultArg = 9;  // expected-note {{'DefaultArg' declared here}}
-template <int I = defaultArg> struct S {};  // expected-error {{use of undeclared identifier 'defaultArg'; did you mean 'DefaultArg'?}}
-S<1> s;
-
-namespace foo {}
-void test_paren_suffix() {
-  foo::bar({5, 6});  // expected-error-re {{no member named 'bar' in namespace 'foo'{{$}}}}
-#if __cplusplus <= 199711L
-  // expected-error@-2 {{expected expression}}
-#endif
-}
-
-const int kNum = 10;  // expected-note {{'kNum' declared here}}
-class SomeClass {
-  int Kind;
-public:
-  explicit SomeClass() : Kind(kSum) {}  // expected-error {{use of undeclared identifier 'kSum'; did you mean 'kNum'?}}
-};
-
-// There used to be an issue with typo resolution inside overloads.
-struct AssertionResult { ~AssertionResult(); };
-AssertionResult Overload(const char *a);
-AssertionResult Overload(int a);
-void UseOverload() {
-  // expected-note@+1 {{'result' declared here}}
-  const char *result;
-  // expected-error@+1 {{use of undeclared identifier 'resulta'; did you mean 'result'?}}
-  Overload(resulta);
-}
-
-namespace PR21925 {
-struct X {
-  int get() { return 7; }  // expected-note {{'get' declared here}}
-};
-void test() {
-  X variable;  // expected-note {{'variable' declared here}}
-
-  // expected-error@+2 {{use of undeclared identifier 'variableX'; did you mean 'variable'?}}
-  // expected-error@+1 {{no member named 'getX' in 'PR21925::X'; did you mean 'get'?}}
-  int x = variableX.getX();
-}
-}
-
-namespace PR21905 {
-int (*a)() = (void)Z; // expected-error-re {{use of undeclared identifier 'Z'{{$}}}} \
-                      // expected-error {{cannot initialize a variable of type 'int (*)()' with an rvalue of type 'void'}}
-}
-
-namespace PR21947 {
-int blue;  // expected-note {{'blue' declared here}}
-__typeof blur y;  // expected-error {{use of undeclared identifier 'blur'; did you mean 'blue'?}}
-}
-
-namespace PR22092 {
-a = b ? : 0;  // expected-error {{a type specifier is required for all declarations}} \
-              // expected-error-re {{use of undeclared identifier 'b'{{$}}}}
-}
-
-extern long clock (void);
-struct Pointer {
-  void set_xpos(int);
-  void set_ypos(int);
-};
-void MovePointer(Pointer &Click, int x, int y) {  // expected-note 2 {{'Click' declared here}}
-  click.set_xpos(x);  // expected-error {{use of undeclared identifier 'click'; did you mean 'Click'?}}
-  click.set_ypos(x);  // expected-error {{use of undeclared identifier 'click'; did you mean 'Click'?}}
-}
-
-namespace PR22250 {
-// expected-error@+4 {{use of undeclared identifier 'size_t'; did you mean 'sizeof'?}}
-// expected-error-re@+3 {{use of undeclared identifier 'y'{{$}}}}
-// expected-error-re@+2 {{use of undeclared identifier 'z'{{$}}}}
-// expected-error@+1 {{expected ';' after top level declarator}}
-int getenv_s(size_t *y, char(&z)) {}
-}
-
-namespace PR22291 {
-template <unsigned I> void f() {
-  unsigned *prio_bits_array;  // expected-note {{'prio_bits_array' declared here}}
-  // expected-error@+1 {{use of undeclared identifier 'prio_op_array'; did you mean 'prio_bits_array'?}}
-  __atomic_store_n(prio_op_array + I, false, __ATOMIC_RELAXED);
-}
-}
-
-namespace PR22297 {
-double pow(double x, double y);
-struct TimeTicks {
-  static void Now();  // expected-note {{'Now' declared here}}
-};
-void f() {
-  TimeTicks::now();  // expected-error {{no member named 'now' in 'PR22297::TimeTicks'; did you mean 'Now'?}}
-}
-}
-
-namespace PR23005 {
-void f() { int a = Unknown::b(c); }  // expected-error {{use of undeclared identifier 'Unknown'}}
-// expected-error@-1 {{use of undeclared identifier 'c'}}
-}
-
-namespace PR23350 {
-int z = 1 ? N : ;  // expected-error {{expected expression}}
-// expected-error-re@-1 {{use of undeclared identifier 'N'{{$}}}}
-}
-
-// PR 23285. This test must be at the end of the file to avoid additional,
-// unwanted diagnostics.
-// expected-error-re@+2 {{use of undeclared identifier 'uintmax_t'{{$}}}}
-// expected-error@+1 {{expected ';' after top level declarator}}
-unsigned int a = 0(uintmax_t
diff --git a/clang/test/SemaCXX/typo-correction.cpp b/clang/test/SemaCXX/typo-correction.cpp
index 45f42c426035..e4dadf83e0a0 100644
--- a/clang/test/SemaCXX/typo-correction.cpp
+++ b/clang/test/SemaCXX/typo-correction.cpp
@@ -3,7 +3,6 @@
 
 namespace PR21817{
 int a(-rsing[2]); // expected-error {{undeclared identifier 'rsing'; did you mean 'using'?}}
-                  // expected-error@-1 {{expected expression}}
 }
 
 struct errc {
@@ -43,14 +42,14 @@ inline error_condition make_error_condition(errc _e) {
 // refer to a base class or non-static data member.
 struct BaseType { };
 struct Derived : public BaseType { // expected-note {{base class 'BaseType' specified here}}
-  static int base_type; // expected-note {{'base_type' declared here}}
+  static int base_type;
   Derived() : basetype() {} // expected-error{{initializer 'basetype' does not name a non-static data member or base class; did you mean the base class 'BaseType'?}}
 };
 
 // Test the improvement from passing a callback object to CorrectTypo in
 // the helper function LookupMemberExprInRecord.
 int get_type(struct Derived *st) {
-  return st->Base_Type; // expected-error{{no member named 'Base_Type' in 'Derived'; did you mean 'base_type'?}}
+  return st->Base_Type; // expected-error{{no member named 'Base_Type' in 'Derived'}}
 }
 
 // In this example, somename should not be corrected to the cached correction
@@ -212,12 +211,11 @@ namespace PR13051 {
   };
 
   void foo(); // expected-note{{'foo' declared here}}
-  void g(void(*)()); // expected-note{{candidate function not viable}}
-  void g(bool(S<int>::*)() const); // expected-note{{candidate function not viable}}
+  void g(void(*)());
+  void g(bool(S<int>::*)() const);
 
   void test() {
-    g(&S<int>::tempalte f<int>); // expected-error{{did you mean 'template'?}} \
-                                 // expected-error{{no matching function for call to 'g'}}
+    g(&S<int>::tempalte f<int>); // expected-error{{did you mean 'template'?}}
     g(&S<int>::opeartor bool); // expected-error{{did you mean 'operator'?}}
     g(&S<int>::foo); // expected-error{{no member named 'foo' in 'PR13051::S<int>'; did you mean simply 'foo'?}}
   }
@@ -251,13 +249,13 @@ namespace b6956809_test1 {
 
   struct S1 {
     void method(A*);  // no note here
-    void method(B*);  // expected-note{{'method' declared here}}
+    void method(B*);
   };
 
   void test1() {
     B b;
     S1 s;
-    s.methodd(&b);  // expected-error{{no member named 'methodd' in 'b6956809_test1::S1'; did you mean 'method'}}
+    s.methodd(&b);  // expected-error{{no member named 'methodd' in 'b6956809_test1::S1'}}
   }
 
   struct S2 {
@@ -275,15 +273,15 @@ namespace b6956809_test1 {
 }
 
 namespace b6956809_test2 {
-  template<typename T> struct Err { typename T::error n; };  // expected-error{{type 'void *' cannot be used prior to '::' because it has no members}}
+  template<typename T> struct Err { typename T::error n; };
   struct S {
-    template<typename T> typename Err<T>::type method(T);  // expected-note{{in instantiation of template class 'b6956809_test2::Err<void *>' requested here}}
-    template<typename T> int method(T *);  // expected-note{{'method' declared here}}
+    template<typename T> typename Err<T>::type method(T);
+    template<typename T> int method(T *);
   };
 
   void test() {
     S s;
-    int k = s.methodd((void*)0);  // expected-error{{no member named 'methodd' in 'b6956809_test2::S'; did you mean 'method'?}} expected-note{{while substituting deduced template arguments into function template 'method' [with T = void *]}}
+    int k = s.methodd((void*)0);  // expected-error{{no member named 'methodd' in 'b6956809_test2::S'}}
   }
 }
 
@@ -309,12 +307,12 @@ struct A {
   void CreateBar(float, float);
 };
 struct B : A {
-  using A::CreateFoo; // expected-note {{'CreateFoo' declared here}}
-  void CreateFoo(int, int);  // expected-note {{'CreateFoo' declared here}}
+  using A::CreateFoo;
+  void CreateFoo(int, int);
 };
 void f(B &x) {
-  x.Createfoo(0,0);  // expected-error {{no member named 'Createfoo' in 'PR13387::B'; did you mean 'CreateFoo'?}}
-  x.Createfoo(0.f,0.f);  // expected-error {{no member named 'Createfoo' in 'PR13387::B'; did you mean 'CreateFoo'?}}
+  x.Createfoo(0,0);  // expected-error {{no member named 'Createfoo' in 'PR13387::B'}}
+  x.Createfoo(0.f,0.f);  // expected-error {{no member named 'Createfoo' in 'PR13387::B'}}
 }
 }
 
@@ -649,12 +647,12 @@ class AddObservation { // expected-note {{declared here}}
 
 namespace testNonStaticMemberHandling {
 struct Foo {
-  bool usesMetadata;  // expected-note {{'usesMetadata' declared here}}
+  bool usesMetadata;
 };
 int test(Foo f) {
   if (UsesMetadata)  // expected-error-re {{use of undeclared identifier 'UsesMetadata'{{$}}}}
     return 5;
-  if (f.UsesMetadata)  // expected-error {{no member named 'UsesMetadata' in 'testNonStaticMemberHandling::Foo'; did you mean 'usesMetadata'?}}
+  if (f.UsesMetadata)  // expected-error {{no member named 'UsesMetadata' in 'testNonStaticMemberHandling::Foo'}}
     return 11;
   return 0;
 }
@@ -707,7 +705,7 @@ using C::D::Foofoo;  // expected-error {{no member named 'Foofoo' in namespace '
 int d = ? L : d; // expected-error {{expected expression}} expected-error {{undeclared identifier}}
 
 struct B0 {
-  int : 0 |         // expected-error {{invalid operands to binary expression}}
+  int : 0 |
       (struct B0)e; // expected-error {{use of undeclared identifier}}
 };
 
diff --git a/clang/test/SemaCXX/virtuals.cpp b/clang/test/SemaCXX/virtuals.cpp
index 2a22ab9fc2b0..f6f52d51f650 100644
--- a/clang/test/SemaCXX/virtuals.cpp
+++ b/clang/test/SemaCXX/virtuals.cpp
@@ -58,10 +58,8 @@ struct Base {
 };
 
 struct Derived final : Base {
-  virtual ~Derived() = defaul; // #default
+  virtual ~Derived() = defaul; // expected-error {{use of undeclared identifier 'defaul'}}
 } do_not_crash;
-// expected-error@#default {{initializer on function does not look like a pure-specifier}}
-// expected-error@#default {{use of undeclared identifier 'defaul'}}
 }
 
 namespace VirtualFriend {
diff --git a/clang/test/SemaObjC/call-super-2.m b/clang/test/SemaObjC/call-super-2.m
index 01acff70c230..885f392e353a 100644
--- a/clang/test/SemaObjC/call-super-2.m
+++ b/clang/test/SemaObjC/call-super-2.m
@@ -115,7 +115,7 @@ id objc_getClass(const char *s);
 @end
 
 @implementation B
-- (instancetype)initWithCoder:(C *)coder {
+- (instancetype)initWithCoder:(C *)coder {     // expected-note {{'coder' declared here}}
   if (0 != (self = [super initWithCode:code])) // expected-error {{use of undeclared identifier 'code'}} expected-warning {{instance method '-initWithCode:' not found}}
     return (void *)0;
   return (void *)0;
diff --git a/clang/test/SemaObjC/typo-correction-subscript.m b/clang/test/SemaObjC/typo-correction-subscript.m
index 340f3cfe2743..6c09127dbb8d 100644
--- a/clang/test/SemaObjC/typo-correction-subscript.m
+++ b/clang/test/SemaObjC/typo-correction-subscript.m
@@ -7,8 +7,7 @@
 @implementation Test
 - (void)rdar47403222:(Dictionary *)opts {
   [self undeclaredMethod:undeclaredArg];
-  // expected-error@-1{{no visible @interface for 'Test' declares the selector 'undeclaredMethod:'}}
-  // expected-error@-2{{use of undeclared identifier 'undeclaredArg}}
+  // expected-error@-1{{use of undeclared identifier 'undeclaredArg}}
   opts[(__bridge id)undeclaredKey] = 0;
   // expected-error@-1{{use of undeclared identifier 'undeclaredKey'}}
 }
diff --git a/clang/test/SemaObjC/undef-arg-super-method-call.m b/clang/test/SemaObjC/undef-arg-super-method-call.m
index 11fd97f2c00d..b8cbe7f69f2f 100644
--- a/clang/test/SemaObjC/undef-arg-super-method-call.m
+++ b/clang/test/SemaObjC/undef-arg-super-method-call.m
@@ -11,12 +11,12 @@
 @end
 
 @implementation DBGViewDebuggerSupport_iOS
-+ (void)addViewLayerInfo:(id)aView; // expected-note {{'aView' declared here}}
++ (void)addViewLayerInfo:(id)aView;
 {
-    [super addViewLayerInfo:view]; // expected-error {{use of undeclared identifier 'view'; did you mean 'aView'?}}
+    [super addViewLayerInfo:view]; // expected-error {{use of undeclared identifier 'view'}}
 }
-- (void)addInstViewLayerInfo:(id)aView; // expected-note {{'aView' declared here}}
+- (void)addInstViewLayerInfo:(id)aView;
 {
-    [super addInstViewLayerInfo:view]; // expected-error {{use of undeclared identifier 'view'; did you mean 'aView'?}}
+    [super addInstViewLayerInfo:view]; // expected-error {{use of undeclared identifier 'view'}}
 }
 @end
diff --git a/clang/test/SemaObjCXX/block-for-lambda-conversion.mm b/clang/test/SemaObjCXX/block-for-lambda-conversion.mm
index 671e83dc2201..a3bcfab67719 100644
--- a/clang/test/SemaObjCXX/block-for-lambda-conversion.mm
+++ b/clang/test/SemaObjCXX/block-for-lambda-conversion.mm
@@ -8,19 +8,20 @@ enum NSEventMask {
   NSEventMaskLeftMouseDown = 1
 };
 
-static const NSEventType NSFlagsChanged = NSEventTypeFlagsChanged;
+static const NSEventType NSFlagsChanged = NSEventTypeFlagsChanged; // expected-note {{'NSFlagsChanged' declared here}}
 
 @interface NSObject
 @end
 @interface NSEvent : NSObject {
 }
 + (nullable id)
-addMonitor:(NSEventMask)mask handler:(NSEvent *_Nullable (^)(NSEvent *))block;
+addMonitor:(NSEventMask)mask handler:(NSEvent *_Nullable (^)(NSEvent *))block; // expected-note {{passing argument to parameter 'mask' here}}
 @end
 
 void test(id weakThis) {
   id m_flagsChangedEventMonitor = [NSEvent
-      addMonitor:NSFlagsChangedMask //expected-error {{use of undeclared identifier 'NSFlagsChangedMask'}}
+      addMonitor:NSFlagsChangedMask //expected-error {{use of undeclared identifier 'NSFlagsChangedMask'}} \
+                                      expected-error {{cannot initialize a parameter of type 'NSEventMask' with an lvalue of type 'const NSEventType'}}
          handler:[weakThis](NSEvent *flagsChangedEvent) {
              return flagsChangedEvent;
          }];
diff --git a/clang/test/SemaOpenACC/compute-construct-num_gangs-clause.cpp b/clang/test/SemaOpenACC/compute-construct-num_gangs-clause.cpp
index c6dbe4db2be6..0cf27666dd03 100644
--- a/clang/test/SemaOpenACC/compute-construct-num_gangs-clause.cpp
+++ b/clang/test/SemaOpenACC/compute-construct-num_gangs-clause.cpp
@@ -119,8 +119,7 @@ struct HasInt {
 
 template <typename T>
 void TestInst() {
-  // expected-error@+2{{no member named 'Invalid' in 'HasInt'}}
-  // expected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'serial' directive}}
+  // expected-error@+1{{no member named 'Invalid' in 'HasInt'}}
 #pragma acc serial num_gangs(HasInt::Invalid)
   while(1);
 
@@ -137,8 +136,7 @@ void TestInst() {
 #pragma acc parallel num_gangs(T::Invalid, 1)
   while(1);
 
-  // expected-error@+2{{no member named 'Invalid' in 'HasInt'}}
-  // expected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'serial' directive}}
+  // expected-error@+1{{no member named 'Invalid' in 'HasInt'}}
 #pragma acc serial num_gangs(1, HasInt::Invalid)
   while(1);
 
diff --git a/clang/test/SemaOpenCL/atomic-ops.cl b/clang/test/SemaOpenCL/atomic-ops.cl
index 7a273546db77..babebba31e82 100644
--- a/clang/test/SemaOpenCL/atomic-ops.cl
+++ b/clang/test/SemaOpenCL/atomic-ops.cl
@@ -167,7 +167,7 @@ void syncscope_checks(atomic_int *Ap, int scope) {
   (void)__opencl_atomic_load(Ap, memory_order_relaxed, memory_scope_all_devices);
 #if __OPENCL_C_VERSION__ < CL_VERSION_3_0
   // expected-error@-2{{use of undeclared identifier 'memory_scope_all_devices'}}
-  // expected-note@* {{'memory_scope_all_svm_devices' declared here}}
+  // expected-note@opencl-c-base.h:*{{'memory_scope_all_svm_devices' declared here}}
 #endif
   (void)__opencl_atomic_load(Ap, memory_order_relaxed, memory_scope_sub_group);
   (void)__opencl_atomic_load(Ap, memory_order_relaxed, scope);
diff --git a/clang/test/SemaOpenCL/clang-builtin-version.cl b/clang/test/SemaOpenCL/clang-builtin-version.cl
index ec6eecee3106..21cbf2d8f28d 100644
--- a/clang/test/SemaOpenCL/clang-builtin-version.cl
+++ b/clang/test/SemaOpenCL/clang-builtin-version.cl
@@ -17,12 +17,8 @@ kernel void dse_builtins(void) {
   });
 #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0) && !defined(__opencl_c_device_enqueue)
 // expected-error@-10{{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
-// FIXME: the typo correction for the undeclared identifiers finds alternative
-// suggestions, but instantiating the typo correction causes us to
-// re-instantiate the argument to the call, which triggers the support
-// diagnostic a second time.
-// expected-error@-12 2{{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
-// expected-error@-10 2{{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
+// expected-error@-8 {{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
+// expected-error@-6 {{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
 #endif
 }
 
diff --git a/clang/test/SemaTemplate/concepts-recovery-expr.cpp b/clang/test/SemaTemplate/concepts-recovery-expr.cpp
index b338f3bc271b..6bed1790051f 100644
--- a/clang/test/SemaTemplate/concepts-recovery-expr.cpp
+++ b/clang/test/SemaTemplate/concepts-recovery-expr.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=c++20 -verify %s
 
-// expected-error@+1{{use of undeclared identifier 'b'}}
-constexpr bool CausesRecoveryExpr = b;
+// expected-error@+1 {{invalid operands to binary expression ('const char[5]' and 'float')}}
+constexpr bool CausesRecoveryExpr = "test" + 1.0f;
 
 template<typename T>
 concept ReferencesCRE = CausesRecoveryExpr;
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index a99df2390a55..62a4f95d79c7 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -814,11 +814,7 @@ static_assert(invalid<int> also here ; // expected-error{{use of undeclared iden
 
 int foo() {
     bool b;
-    b = invalid<int> not just in declarations; // expected-error{{expected ';' after expression}}
-                                               // expected-error@-1{{use of undeclared identifier 'invalid'}}
-                                               // expected-error@-2{{expected ';' after expression}}
-                                               // expected-error@-3{{use of undeclared identifier 'just'}}
-                                               // expected-error@-4{{unknown type name 'in'}}
+    b = invalid<int> not just in declarations; // expected-error{{use of undeclared identifier 'invalid'}}
     return b;
 }
 } // namespace GH48182
diff --git a/clang/test/SemaTemplate/typo-variadic.cpp b/clang/test/SemaTemplate/typo-variadic.cpp
index c9b777aebbe9..48306fb9ce80 100644
--- a/clang/test/SemaTemplate/typo-variadic.cpp
+++ b/clang/test/SemaTemplate/typo-variadic.cpp
@@ -1,2 +1,2 @@
 // RUN: %clang_cc1 -fsyntax-only %s -verify
-int x = m(s...); // expected-error{{pack expansion does not}} expected-error{{undeclared identifier}}
+int x = m(s...); // expected-error{{undeclared identifier}}
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 635d03a88d10..a6301daa672c 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -598,7 +598,6 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
   case Stmt::SubstNonTypeTemplateParmPackExprClass:
   case Stmt::FunctionParmPackExprClass:
   case Stmt::UnresolvedLookupExprClass:
-  case Stmt::TypoExprClass: // A typo could actually be a DeclRef or a MemberRef
     K = CXCursor_DeclRefExpr;
     break;
 
diff --git a/clang/unittests/Sema/ExternalSemaSourceTest.cpp b/clang/unittests/Sema/ExternalSemaSourceTest.cpp
index 2b271d4bf782..cc9dd4175af5 100644
--- a/clang/unittests/Sema/ExternalSemaSourceTest.cpp
+++ b/clang/unittests/Sema/ExternalSemaSourceTest.cpp
@@ -268,20 +268,6 @@ TEST(ExternalSemaSource, ExternalTypoCorrectionOrdering) {
   ASSERT_EQ(1, Watcher.SeenCount);
 }
 
-TEST(ExternalSemaSource, ExternalDelayedTypoCorrection) {
-  auto Installer = std::make_unique<ExternalSemaSourceInstaller>();
-  auto Provider = makeIntrusiveRefCnt<FunctionTypoProvider>("aaa", "bbb");
-  DiagnosticWatcher Watcher("aaa", "bbb");
-  Installer->PushSource(Provider.get());
-  Installer->PushWatcher(&Watcher);
-  std::vector<std::string> Args(1, "-std=c++11");
-  ASSERT_TRUE(clang::tooling::runToolOnCodeWithArgs(
-      std::move(Installer), "namespace AAA { } void foo() { AAA::aaa(); }",
-      Args));
-  ASSERT_LE(0, Provider->CallCount);
-  ASSERT_EQ(1, Watcher.SeenCount);
-}
-
 // We should only try MaybeDiagnoseMissingCompleteType if we can't otherwise
 // solve the problem.
 TEST(ExternalSemaSource, TryOtherTacticsBeforeDiagnosing) {

From a5cbd2ab0bebc722f836cd3b04dbab691ef9ed2f Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus@amd.com>
Date: Fri, 13 Jun 2025 12:48:24 +0200
Subject: [PATCH 0260/1322] =?UTF-8?q?Revert=20"[AMDGPU]=20Skip=20register?=
 =?UTF-8?q?=20uses=20in=20AMDGPUResourceUsageAnalysis=20(#=E2=80=A6=20(#14?=
 =?UTF-8?q?4039)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…133242)"

This reverts commit 130080fab11cde5efcb338b77f5c3b31097df6e6 because it
causes issues in testcases similar to coalescer_remat.ll [1], i.e. when
we use a VGPR tuple but only write to its lower parts. The high VGPRs
would then not be included in the vgpr_count, and accessing them would
be an out of bounds violation.

[1]
https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
---
 llvm/docs/AMDGPUUsage.rst                     |  11 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |  11 +-
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    | 281 +++++++++++++++++-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  15 -
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |  14 -
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |   5 -
 .../AMDGPU/GlobalISel/extractelement.ll       |  38 +--
 .../AMDGPU/amdgpu-no-agprs-violations.ll      |   7 +-
 .../amdhsa-kernarg-preload-num-sgprs.ll       |  28 +-
 llvm/test/CodeGen/AMDGPU/amdpal-callable.ll   |  12 +-
 llvm/test/CodeGen/AMDGPU/amdpal-elf.ll        |  16 +-
 .../attr-amdgpu-flat-work-group-size.ll       |   8 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        |  24 +-
 .../AMDGPU/call-alias-register-usage-agpr.ll  |   2 +-
 .../AMDGPU/call-alias-register-usage0.ll      |   2 +-
 .../AMDGPU/call-alias-register-usage1.ll      |   2 +-
 .../AMDGPU/call-alias-register-usage2.ll      |   2 +-
 .../AMDGPU/call-alias-register-usage3.ll      |   2 +-
 .../AMDGPU/call-graph-register-usage.ll       |  10 +-
 llvm/test/CodeGen/AMDGPU/coalescer_remat.ll   |   2 +-
 llvm/test/CodeGen/AMDGPU/code-object-v3.ll    |   6 +-
 llvm/test/CodeGen/AMDGPU/elf-notes.ll         |   2 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll  | 106 +++----
 .../CodeGen/AMDGPU/function-resource-usage.ll |  24 +-
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/hsa.ll               |   2 +-
 .../init-whole-wave-vgpr-count-large.ll       |  72 -----
 .../AMDGPU/init-whole-wave-vgpr-count-leaf.ll |  46 ---
 ...init-whole-wave-vgpr-count-use-inactive.ll |  74 -----
 .../AMDGPU/init-whole-wave-vgpr-count.ll      |  71 -----
 llvm/test/CodeGen/AMDGPU/ipra.ll              |   2 +-
 ...-knownbits-assign-crash-gh-issue-110930.ll |  24 +-
 .../multi-call-resource-usage-mcexpr.ll       |   2 +-
 .../AMDGPU/pal-metadata-3.0-callable.ll       |   8 +-
 .../CodeGen/AMDGPU/ps-shader-arg-count.ll     |   6 +-
 .../CodeGen/AMDGPU/register-count-comments.ll |   4 +-
 .../AMDGPU/resource-optimization-remarks.ll   |   4 +-
 .../AMDGPU/schedule-amdgpu-tracker-physreg.ll |   4 +-
 .../AMDGPU/schedule-amdgpu-trackers.ll        |   4 +-
 .../AMDGPU/schedule-regpressure-limit2.ll     |   6 +-
 .../CodeGen/AMDGPU/stack-realign-kernel.ll    |  12 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll  |   4 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll  |   4 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll   |   4 +-
 .../AMDGPU/unnamed-function-resource-info.ll  |   4 +-
 .../CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll  |   4 +-
 .../test/CodeGen/AMDGPU/vgpr-count-compute.ll |  30 --
 .../CodeGen/AMDGPU/vgpr-count-graphics.ll     |  35 ---
 48 files changed, 472 insertions(+), 586 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 3aa8773fa506..e0a43225e81b 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4263,9 +4263,10 @@ same *vendor-name*.
                                                                   wavefront for
                                                                   GFX6-GFX9. A register
                                                                   is required if it is
-                                                                  written to, or
+                                                                  used explicitly, or
                                                                   if a higher numbered
-                                                                  register is written to. This
+                                                                  register is used
+                                                                  explicitly. This
                                                                   includes the special
                                                                   SGPRs for VCC, Flat
                                                                   Scratch (GFX7-GFX9)
@@ -4283,10 +4284,10 @@ same *vendor-name*.
                                                                   each work-item for
                                                                   GFX6-GFX9. A register
                                                                   is required if it is
-                                                                  written to, or
+                                                                  used explicitly, or
                                                                   if a higher numbered
-                                                                  register is
-                                                                  written to.
+                                                                  register is used
+                                                                  explicitly.
      ".agpr_count"                       integer        Required  Number of accumulator
                                                                   registers required by
                                                                   each work-item for
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d4fea30f21f4..491314daf2d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // dispatch registers are function args.
   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
 
-  if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
+  if (isShader(F.getCallingConv())) {
     bool IsPixelShader =
         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
 
@@ -1060,6 +1060,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 
     ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
         ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+  } else if (isKernel(F.getCallingConv()) &&
+             MFI->getNumKernargPreloadedSGPRs()) {
+    // Consider cases where the total number of UserSGPRs with trailing
+    // allocated preload SGPRs, is greater than the number of explicitly
+    // referenced SGPRs.
+    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
+        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
+    ProgInfo.NumSGPR =
+        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 7bde59412d90..9a609a1752de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -137,29 +137,274 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   if (MFI->isStackRealigned())
     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
 
-  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
+  Info.UsesVCC =
+      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
 
-  Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
-  Info.NumExplicitSGPR =
-      TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
-  if (ST.hasMAIInsts())
-    Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
-
-  // Preloaded registers are written by the hardware, not defined in the
-  // function body, so they need special handling.
-  if (MFI->isEntryFunction()) {
-    Info.NumExplicitSGPR =
-        std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
-    Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
+  // If there are no calls, MachineRegisterInfo can tell us the used register
+  // count easily.
+  // A tail call isn't considered a call for MachineFrameInfo's purposes.
+  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
+    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
+    if (ST.hasMAIInsts())
+      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+    return Info;
   }
 
-  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
-    return Info;
-
+  int32_t MaxVGPR = -1;
+  int32_t MaxAGPR = -1;
+  int32_t MaxSGPR = -1;
   Info.CalleeSegmentSize = 0;
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
+      // TODO: Check regmasks? Do they occur anywhere except calls?
+      for (const MachineOperand &MO : MI.operands()) {
+        unsigned Width = 0;
+        bool IsSGPR = false;
+        bool IsAGPR = false;
+
+        if (!MO.isReg())
+          continue;
+
+        Register Reg = MO.getReg();
+        switch (Reg) {
+        case AMDGPU::EXEC:
+        case AMDGPU::EXEC_LO:
+        case AMDGPU::EXEC_HI:
+        case AMDGPU::SCC:
+        case AMDGPU::M0:
+        case AMDGPU::M0_LO16:
+        case AMDGPU::M0_HI16:
+        case AMDGPU::SRC_SHARED_BASE_LO:
+        case AMDGPU::SRC_SHARED_BASE:
+        case AMDGPU::SRC_SHARED_LIMIT_LO:
+        case AMDGPU::SRC_SHARED_LIMIT:
+        case AMDGPU::SRC_PRIVATE_BASE_LO:
+        case AMDGPU::SRC_PRIVATE_BASE:
+        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
+        case AMDGPU::SRC_PRIVATE_LIMIT:
+        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+        case AMDGPU::SGPR_NULL:
+        case AMDGPU::SGPR_NULL64:
+        case AMDGPU::MODE:
+          continue;
+
+        case AMDGPU::NoRegister:
+          assert(MI.isDebugInstr() &&
+                 "Instruction uses invalid noreg register");
+          continue;
+
+        case AMDGPU::VCC:
+        case AMDGPU::VCC_LO:
+        case AMDGPU::VCC_HI:
+        case AMDGPU::VCC_LO_LO16:
+        case AMDGPU::VCC_LO_HI16:
+        case AMDGPU::VCC_HI_LO16:
+        case AMDGPU::VCC_HI_HI16:
+          Info.UsesVCC = true;
+          continue;
+
+        case AMDGPU::FLAT_SCR:
+        case AMDGPU::FLAT_SCR_LO:
+        case AMDGPU::FLAT_SCR_HI:
+          continue;
+
+        case AMDGPU::XNACK_MASK:
+        case AMDGPU::XNACK_MASK_LO:
+        case AMDGPU::XNACK_MASK_HI:
+          llvm_unreachable("xnack_mask registers should not be used");
+
+        case AMDGPU::LDS_DIRECT:
+          llvm_unreachable("lds_direct register should not be used");
+
+        case AMDGPU::TBA:
+        case AMDGPU::TBA_LO:
+        case AMDGPU::TBA_HI:
+        case AMDGPU::TMA:
+        case AMDGPU::TMA_LO:
+        case AMDGPU::TMA_HI:
+          llvm_unreachable("trap handler registers should not be used");
+
+        case AMDGPU::SRC_VCCZ:
+          llvm_unreachable("src_vccz register should not be used");
+
+        case AMDGPU::SRC_EXECZ:
+          llvm_unreachable("src_execz register should not be used");
+
+        case AMDGPU::SRC_SCC:
+          llvm_unreachable("src_scc register should not be used");
+
+        default:
+          break;
+        }
+
+        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
+            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
+            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 1;
+        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 1;
+        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 1;
+        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 2;
+        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 3;
+        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 3;
+        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 3;
+        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 4;
+        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 5;
+        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 5;
+        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 5;
+        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 6;
+        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 6;
+        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 6;
+        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 7;
+        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 7;
+        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 7;
+        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 8;
+        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 8;
+        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 8;
+        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 9;
+        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 9;
+        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 9;
+        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 10;
+        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 10;
+        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 10;
+        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 11;
+        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 11;
+        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 11;
+        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 12;
+        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 12;
+        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 12;
+        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 16;
+        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 16;
+        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 16;
+        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 32;
+        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 32;
+        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 32;
+        } else {
+          // We only expect TTMP registers or registers that do not belong to
+          // any RC.
+          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
+                  !TRI.getPhysRegBaseClass(Reg)) &&
+                 "Unknown register class");
+        }
+        unsigned HWReg = TRI.getHWRegIndex(Reg);
+        int MaxUsed = HWReg + Width - 1;
+        if (IsSGPR) {
+          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+        } else if (IsAGPR) {
+          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+        } else {
+          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+        }
+      }
+
       if (MI.isCall()) {
         // Pseudo used just to encode the underlying global. Is there a better
         // way to track this?
@@ -219,5 +464,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
     }
   }
 
+  Info.NumExplicitSGPR = MaxSGPR + 1;
+  Info.NumVGPR = MaxVGPR + 1;
+  Info.NumAGPR = MaxAGPR + 1;
+
   return Info;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 01718faaf5c2..0e7635a04558 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -970,25 +970,10 @@ public:
     return NumUserSGPRs;
   }
 
-  // Get the number of preloaded SGPRs for compute kernels.
   unsigned getNumPreloadedSGPRs() const {
     return NumUserSGPRs + NumSystemSGPRs;
   }
 
-  // Get the number of preloaded VGPRs for compute kernels.
-  unsigned getNumPreloadedVGPRs() const {
-    if (hasWorkItemIDZ())
-      return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
-
-    if (hasWorkItemIDY())
-      return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
-
-    if (hasWorkItemIDX())
-      return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
-
-    return 0;
-  }
-
   unsigned getNumKernargPreloadedSGPRs() const {
     return UserSGPRInfo.getNumKernargPreloadSGPRs();
   }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index b76823a128e0..e41189adfb46 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4055,20 +4055,6 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
   return 0;
 }
 
-unsigned
-SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
-                                      const TargetRegisterClass &RC) const {
-  for (MCPhysReg Reg : reverse(RC.getRegisters())) {
-    for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) {
-      if (llvm::any_of(MRI.def_instructions(*AI), [](const MachineInstr &MI) {
-            return !MI.isImplicitDef();
-          }))
-        return getHWRegIndex(Reg) + 1;
-    }
-  }
-  return 0;
-}
-
 SmallVector<StringLiteral>
 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
                                   const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7726762ad0e6..a4b135d5e0b5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -486,11 +486,6 @@ public:
   unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass &RC) const;
 
-  // \returns the number of registers of a given \p RC defined in a function.
-  // Does not go inside function calls.
-  unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
-                                 const TargetRegisterClass &RC) const;
-
   std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
     return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
                              : std::optional<uint8_t>{};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index bdd86c1af624..9b35920f8547 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     wavefront_sgpr_count = 17
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -3245,7 +3245,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 18
+; GFX10-NEXT:     wavefront_sgpr_count = 10
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -3294,7 +3294,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 16
+; GFX11-NEXT:     wavefront_sgpr_count = 7
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4034,7 +4034,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4077,8 +4077,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
-; GPRIDX-NEXT:     workitem_vgpr_count = 3
+; GPRIDX-NEXT:     wavefront_sgpr_count = 16
+; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4249,8 +4249,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 18
-; GFX10-NEXT:     workitem_vgpr_count = 3
+; GFX10-NEXT:     wavefront_sgpr_count = 10
+; GFX10-NEXT:     workitem_vgpr_count = 2
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
 ; GFX10-NEXT:     reserved_sgpr_first = 0
@@ -4291,7 +4291,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4334,7 +4334,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 16
+; GFX11-NEXT:     wavefront_sgpr_count = 6
 ; GFX11-NEXT:     workitem_vgpr_count = 2
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4382,7 +4382,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4425,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     wavefront_sgpr_count = 16
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4603,7 +4603,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 18
+; GFX10-NEXT:     wavefront_sgpr_count = 10
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4691,7 +4691,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 16
+; GFX11-NEXT:     wavefront_sgpr_count = 7
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
index cc614bb24839..7bf9a29e9ff4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
@@ -13,9 +13,8 @@
 ; CHECK: {{^}}kernel_illegal_agpr_use_asm:
 ; CHECK: ; use a0
 
-; GFX908: NumVgprs: 3
-; GFX90A: NumVgprs: 1
-; CHECK: NumAgprs: 0
+; CHECK: NumVgprs: 0
+; CHECK: NumAgprs: 1
 define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
@@ -25,7 +24,7 @@ define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
 ; CHECK: ; use a0
 
 ; CHECK: NumVgprs: 0
-; CHECK: NumAgprs: 0
+; CHECK: NumAgprs: 1
 define void @func_illegal_agpr_use_asm() #0 {
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index 7851de641c5a..dd760c2a215c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -10,9 +10,9 @@
 
 ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
 ; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 15
-; ASM: ; TotalNumSgprs: 21
-; ASM: ; NumSGPRsForWavesPerEU: 21
+; ASM: .amdhsa_next_free_sgpr 12
+; ASM: ; TotalNumSgprs: 18
+; ASM: ; NumSGPRsForWavesPerEU: 18
 
 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
 ; feild that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -26,13 +26,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret
 ; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
 ; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0070 8000af00 94000000 08000800 00000000  ................
+; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............
 
 ; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 11
-; ASM: ; TotalNumSgprs: 17
-; ASM: ; NumSGPRsForWavesPerEU: 17
+; ASM: .amdhsa_next_free_sgpr 10
+; ASM: ; TotalNumSgprs: 16
+; ASM: ; NumSGPRsForWavesPerEU: 16
 
 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
 ; implicit, and 6 extra.
@@ -46,9 +46,9 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 {
 
 ; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 4
-; ASM: ; TotalNumSgprs: 10
-; ASM: ; NumSGPRsForWavesPerEU: 10
+; ASM: .amdhsa_next_free_sgpr 3
+; ASM: ; TotalNumSgprs: 9
+; ASM: ; NumSGPRsForWavesPerEU: 9
 
 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
 
@@ -57,13 +57,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
 ; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
 ; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00f0 4000af00 84000000 08000000 00000000  @...............
+; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
 
 ; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM: .amdhsa_next_free_sgpr 0
+; ASM: ; TotalNumSgprs: 6
+; ASM: ; NumSGPRsForWavesPerEU: 6
 
 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
 ; Encoded like '00'.
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 494ade73cb5f..f4d17e50cf18 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; GFX8-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf010a{{$}}
-; GFX9-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf014a{{$}}
+; SDAG-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GISEL-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
 ; GCN-NEXT:      '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
@@ -164,13 +164,13 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      multiple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x24{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x21{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x24{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      no_stack:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x20{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      no_stack_call:
@@ -203,7 +203,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_lds:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x20{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      simple_lds_recurse:
@@ -215,7 +215,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x14{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x21{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      simple_stack_call:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index 5ccf41c408b7..f52ba7000ede 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX11W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX11W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
 
 ; ELF: Section {
 ; ELF: Name: .text
@@ -23,16 +23,8 @@
 ; ELF: Section: .text (0x2)
 ; ELF: }
 
-; GFX10: NumSGPRsForWavesPerEU: 12
-; GFX10: NumVGPRsForWavesPerEU: 3
-
-; Wave32 and 64 behave differently due to the UserSGPRInit16Bug,
-; which only affects Wave32.
-; GFX11W32: NumSGPRsForWavesPerEU: 16
-; GFX11W32: NumVGPRsForWavesPerEU: 1
-
-; GFX11W64: NumSGPRsForWavesPerEU: 11
-; GFX11W64: NumVGPRsForWavesPerEU: 1
+; GFX10: NumSGPRsForWavesPerEU: 6
+; GFX10: NumVGPRsForWavesPerEU: 1
 
 define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 0e0a81d4657d..616867481d17 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s
 
 ; CHECK-LABEL: {{^}}min_64_max_64:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @min_64_max_64() #0 {
 entry:
   ret void
@@ -13,10 +13,10 @@ entry:
 attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
 
 ; CHECK-LABEL: {{^}}min_64_max_128:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @min_64_max_128() #1 {
 entry:
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 5617a80fc94b..e9fe4f3c618c 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -26,10 +26,10 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
 
 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_exactly_10() #2 {
 entry:
   ret void
@@ -38,10 +38,10 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
 
 ; At least 1 wave per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_1:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_least_1() #3 {
 entry:
   ret void
@@ -50,10 +50,10 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
 
 ; At least 5 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_5:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_least_5() #4 {
 entry:
   ret void
@@ -62,10 +62,10 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
 
 ; At least 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_least_10() #5 {
 entry:
   ret void
@@ -88,10 +88,10 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64
 
 ; At most 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_most_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_most_10() #7 {
 entry:
   ret void
@@ -102,10 +102,10 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
 
 ; Between 5 and 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_between_5_and_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_between_5_and_10() #8 {
 entry:
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index efa416e301cc..2e79d8bab46a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -28,7 +28,7 @@ bb:
 }
 ; ALL:      .set .Laliasee_default.num_vgpr, 0
 ; ALL-NEXT: .set .Laliasee_default.num_agpr, 27
-; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 0
+; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 32
 
 attributes #0 = { noinline norecurse nounwind optnone }
 attributes #1 = { noinline norecurse nounwind readnone willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index 62ca985bc640..337da5d0ecbe 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -18,7 +18,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_default_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
 define internal void @aliasee_default_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index 344f8200608f..075eddd2763d 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -24,7 +24,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr32_sgpr76.num_vgpr, 27
 ; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 32
 define internal void @aliasee_vgpr32_sgpr76() #1 {
 bb:
   call void asm sideeffect "; clobber v26 ", "~{v26}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index 3d36f8a514c4..4fd181d3c0f4 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -21,7 +21,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 32
 define internal void @aliasee_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index 2274c437daf6..00f72d5d8b1d 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -21,7 +21,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr256_sgpr102.num_vgpr, 253
 ; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 33
 define internal void @aliasee_vgpr256_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v252 ", "~{v252}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index db1269e8e95c..dbd00f09943c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -7,7 +7,7 @@
 ; Make sure to run a GPU with the SGPR allocation bug.
 
 ; GCN-LABEL: {{^}}use_vcc:
-; GCN: ; TotalNumSgprs: 2
+; GCN: ; TotalNumSgprs: 34
 ; GCN: ; NumVgprs: 0
 define void @use_vcc() #1 {
   call void asm sideeffect "", "~{vcc}" () #0
@@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 }
 
 ; GCN-LABEL: {{^}}use_flat_scratch:
-; CI: ; TotalNumSgprs: 4
-; VI: ; TotalNumSgprs: 6
+; CI: ; TotalNumSgprs: 36
+; VI: ; TotalNumSgprs: 38
 ; GCN: ; NumVgprs: 0
 define void @use_flat_scratch() #1 {
   call void asm sideeffect "", "~{flat_scratch}" () #0
@@ -234,7 +234,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96
@@ -249,7 +249,7 @@ entry:
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
index 55dc39462817..61830f18ad7a 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 5
+; CHECK: ; NumVgprs: 8
 define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) {
 entry:
   %cmpflag = icmp eq i32 %flag, 1
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index d8d7494d0dc1..3fe3cafd729a 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -16,7 +16,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -35,7 +35,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -93,7 +93,7 @@ entry:
 ; registers used.
 ;
 ; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM:     .amdhsa_next_free_vgpr 3
+; ALL-ASM:     .amdhsa_next_free_vgpr 1
 ; ALL-ASM:     .amdhsa_next_free_sgpr 1
 define amdgpu_kernel void @empty(
     i32 %i,
diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
index 59cf9825116f..22d699a8f480 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
@@ -43,7 +43,7 @@
 ; OSABI-HSA-ELF:     .sgpr_count:     96
 ; OSABI-HSA-ELF:     .sgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .symbol:         elf_notes.kd
-; OSABI-HSA-ELF:     .vgpr_count:     1
+; OSABI-HSA-ELF:     .vgpr_count:     0
 ; OSABI-HSA-ELF:     .vgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .wavefront_size: 64
 ; OSABI-HSA-ELF: amdhsa.target:   amdgcn-amd-amdhsa--gfx802
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index ed1f3e1397ab..a59382ba20dc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -27,15 +27,15 @@
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 12
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 18
-; VI-XNACK: ; TotalNumSgprs: 16
-; HSA-VI-XNACK: ; TotalNumSgprs: 22
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; CI: ; TotalNumSgprs: 8
+; VI-NOXNACK: ; TotalNumSgprs: 8
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
+; VI-XNACK: ; TotalNumSgprs: 12
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7}"()
@@ -50,15 +50,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 14
-; VI-NOXNACK: ; TotalNumSgprs: 14
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 20
-; VI-XNACK: ; TotalNumSgprs: 16
-; HSA-VI-XNACK: ; TotalNumSgprs: 22
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
+; CI: ; TotalNumSgprs: 10
+; VI-NOXNACK: ; TotalNumSgprs: 10
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
+; VI-XNACK: ; TotalNumSgprs: 12
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -73,15 +73,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 16
-; VI-NOXNACK: ; TotalNumSgprs: 18
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 14
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -96,15 +96,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 16
-; VI-NOXNACK: ; TotalNumSgprs: 18
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 14
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -122,15 +122,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -143,15 +143,15 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -166,15 +166,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index 2a18d40e0bd8..59bcc5d8be9b 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -5,14 +5,14 @@
 ; GCN-LABEL: {{^}}use_vcc:
 ; GCN: .set use_vcc.num_vgpr, 0
 ; GCN: .set use_vcc.num_agpr, 0
-; GCN: .set use_vcc.numbered_sgpr, 0
+; GCN: .set use_vcc.numbered_sgpr, 32
 ; GCN: .set use_vcc.private_seg_size, 0
 ; GCN: .set use_vcc.uses_vcc, 1
 ; GCN: .set use_vcc.uses_flat_scratch, 0
 ; GCN: .set use_vcc.has_dyn_sized_stack, 0
 ; GCN: .set use_vcc.has_recursion, 0
 ; GCN: .set use_vcc.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_vcc() #1 {
@@ -59,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 ; GCN-LABEL: {{^}}use_flat_scratch:
 ; GCN: .set use_flat_scratch.num_vgpr, 0
 ; GCN: .set use_flat_scratch.num_agpr, 0
-; GCN: .set use_flat_scratch.numbered_sgpr, 0
+; GCN: .set use_flat_scratch.numbered_sgpr, 32
 ; GCN: .set use_flat_scratch.private_seg_size, 0
 ; GCN: .set use_flat_scratch.uses_vcc, 0
 ; GCN: .set use_flat_scratch.uses_flat_scratch, 1
 ; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
 ; GCN: .set use_flat_scratch.has_recursion, 0
 ; GCN: .set use_flat_scratch.has_indirect_call, 0
-; GCN: TotalNumSgprs: 6
+; GCN: TotalNumSgprs: 38
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_flat_scratch() #1 {
@@ -113,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace
 ; GCN-LABEL: {{^}}use_10_vgpr:
 ; GCN: .set use_10_vgpr.num_vgpr, 10
 ; GCN: .set use_10_vgpr.num_agpr, 0
-; GCN: .set use_10_vgpr.numbered_sgpr, 0
+; GCN: .set use_10_vgpr.numbered_sgpr, 32
 ; GCN: .set use_10_vgpr.private_seg_size, 0
 ; GCN: .set use_10_vgpr.uses_vcc, 0
 ; GCN: .set use_10_vgpr.uses_flat_scratch, 0
 ; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
 ; GCN: .set use_10_vgpr.has_recursion, 0
 ; GCN: .set use_10_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
 ; GCN: NumVgprs: 10
 ; GCN: ScratchSize: 0
 define void @use_10_vgpr() #1 {
@@ -168,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
 ; GCN-LABEL: {{^}}use_50_vgpr:
 ; GCN:	.set use_50_vgpr.num_vgpr, 50
 ; GCN:	.set use_50_vgpr.num_agpr, 0
-; GCN:	.set use_50_vgpr.numbered_sgpr, 0
+; GCN:	.set use_50_vgpr.numbered_sgpr, 32
 ; GCN:	.set use_50_vgpr.private_seg_size, 0
 ; GCN:	.set use_50_vgpr.uses_vcc, 0
 ; GCN:	.set use_50_vgpr.uses_flat_scratch, 0
 ; GCN:	.set use_50_vgpr.has_dyn_sized_stack, 0
 ; GCN:	.set use_50_vgpr.has_recursion, 0
 ; GCN:	.set use_50_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
 ; GCN: NumVgprs: 50
 ; GCN: ScratchSize: 0
 define void @use_50_vgpr() #1 {
@@ -258,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
 ; GCN-LABEL: {{^}}use_stack0:
 ; GCN:	.set use_stack0.num_vgpr, 1
 ; GCN:	.set use_stack0.num_agpr, 0
-; GCN:	.set use_stack0.numbered_sgpr, 0
+; GCN:	.set use_stack0.numbered_sgpr, 33
 ; GCN:	.set use_stack0.private_seg_size, 2052
 ; GCN:	.set use_stack0.uses_vcc, 0
 ; GCN:	.set use_stack0.uses_flat_scratch, 0
 ; GCN:	.set use_stack0.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack0.has_recursion, 0
 ; GCN:	.set use_stack0.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 37
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 2052
 define void @use_stack0() #1 {
@@ -277,14 +277,14 @@ define void @use_stack0() #1 {
 ; GCN-LABEL: {{^}}use_stack1:
 ; GCN:	.set use_stack1.num_vgpr, 1
 ; GCN:	.set use_stack1.num_agpr, 0
-; GCN:	.set use_stack1.numbered_sgpr, 0
+; GCN:	.set use_stack1.numbered_sgpr, 33
 ; GCN:	.set use_stack1.private_seg_size, 404
 ; GCN:	.set use_stack1.uses_vcc, 0
 ; GCN:	.set use_stack1.uses_flat_scratch, 0
 ; GCN:	.set use_stack1.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack1.has_recursion, 0
 ; GCN:	.set use_stack1.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 37
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 404
 define void @use_stack1() #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index bf452a9e38e0..cd89a36fe538 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,7 @@
 ; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
-; CHECK:     .sgpr_count:     16
+; CHECK:     .sgpr_count:     10
 ; CHECK:     .symbol:         test.kd
 ; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index f7e349890700..024593c49dba 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -63,7 +63,7 @@
 ; ELF:   0220: 70725F73 70696C6C 5F636F75 6E7400A7
 ; ELF:   0230: 2E73796D 626F6CB5 73696D70 6C655F6E
 ; ELF:   0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
-; ELF:   0250: 6770725F 636F756E 7401B12E 76677072
+; ELF:   0250: 6770725F 636F756E 7402B12E 76677072
 ; ELF:   0260: 5F737069 6C6C5F63 6F756E74 00AF2E77
 ; ELF:   0270: 61766566 726F6E74 5F73697A 6540AD61
 ; ELF:   0280: 6D646873 612E7461 72676574 BD616D64
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
deleted file mode 100644
index 45de8a79fe88..000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Use VGPRs above the input arguments.
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0x1d{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
-                                    i32 %vcr, { i32 } %system.data,
-                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
-                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
-                                    local_unnamed_addr {
-entry:
-  %system.data.value = extractvalue { i32 } %system.data, 0
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
-  %system.data.extract = extractvalue { i32 } %system.data, 0
-  %data.mul = mul i32 %system.data.extract, 2
-  %data.add = add i32 %data.mul, 1
-  call void asm sideeffect "; clobber v28", "~{v28}"()
-  br label %tail
-
-tail:
-  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
-  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
-  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
-  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
-  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
-  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
-  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
-  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
-  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
-  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
-  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
-  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
-  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
-  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
-  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
-  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
-  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
-  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
-  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
-  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
-  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
-  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
-  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
-        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
-        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
-        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
-        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
-  unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
deleted file mode 100644
index 9c636d4516a8..000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; CHECK-LABEL: leaf_shader:
-; CHECK: .vgpr_count:{{.*}}0x1{{$}}
-
-; Function without calls.
-define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
-                              i32 %active.vgpr1, i32 %active.vgpr2,
-                              i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                              i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
-                              local_unnamed_addr {
-entry:
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %compute, label %merge
-
-compute:
-  ; Perform a more complex computation using active VGPRs
-  %square = mul i32 %active.vgpr1, %active.vgpr1
-  %product = mul i32 %square, %active.vgpr2
-  %sum = add i32 %product, %input.value
-  %result = add i32 %sum, 42
-  br label %merge
-
-merge:
-  %final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
-
-  store i32 %final.result, ptr %output.ptr, align 4
-
-  ret void
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
deleted file mode 100644
index 1b0d33cec705..000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
-; In that case, the VGPR should be included in the .vgpr_count
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0xd{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
-                                    i32 %vcr, { i32 } %system.data,
-                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
-                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
-                                    local_unnamed_addr {
-entry:
-  %system.data.value = extractvalue { i32 } %system.data, 0
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
-  %system.data.extract = extractvalue { i32 } %system.data, 0
-  %data.mul = mul i32 %system.data.extract, 2
-  %data.add = add i32 %data.mul, 1
-  call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
-  br label %tail
-
-tail:
-  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
-  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
-  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
-  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
-  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
-  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
-  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
-  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
-  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
-  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
-  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
-  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
-  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
-  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
-  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
-  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
-  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
-  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
-  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
-  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
-  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
-  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
-  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
-        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
-        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
-        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
-        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
-  unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
deleted file mode 100644
index 940850171878..000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0xa{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
-                                    i32 %vcr, { i32 } %system.data,
-                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
-                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
-                                    local_unnamed_addr {
-entry:
-  %system.data.value = extractvalue { i32 } %system.data, 0
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
-  %system.data.extract = extractvalue { i32 } %system.data, 0
-  %data.mul = mul i32 %system.data.extract, 2
-  %data.add = add i32 %data.mul, 1
-  br label %tail
-
-tail:
-  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
-  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
-  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
-  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
-  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
-  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
-  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
-  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
-  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
-  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
-  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
-  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
-  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
-  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
-  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
-  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
-  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
-  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
-  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
-  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
-  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
-  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
-  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
-        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
-        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
-        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
-        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
-  unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index c3b033113431..464cd820028c 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
 ; GCN-NEXT: s_addc_u32 s17,
 ; GCN-NEXT: s_setpc_b64 s[16:17]
 
-; GCN: ; TotalNumSgprs: 18
+; GCN: ; TotalNumSgprs: 32
 ; GCN: ; NumVgprs: 8
 define void @func_tail_call() #1 {
   tail call void @func()
diff --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
index 03694b913d6e..60bbf4646ee0 100644
--- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
@@ -24,7 +24,7 @@ define void @I_Quit() {
 ; CHECK-LABEL: P_RemoveMobj:
 ; CHECK: .set P_RemoveMobj.num_vgpr, 0
 ; CHECK: .set P_RemoveMobj.num_agpr, 0
-; CHECK: .set P_RemoveMobj.numbered_sgpr, 0
+; CHECK: .set P_RemoveMobj.numbered_sgpr, 32
 ; CHECK: .set P_RemoveMobj.private_seg_size, 0
 ; CHECK: .set P_RemoveMobj.uses_vcc, 0
 ; CHECK: .set P_RemoveMobj.uses_flat_scratch, 0
@@ -38,7 +38,7 @@ define void @P_RemoveMobj() {
 ; CHECK-LABEL: P_SpawnMobj:
 ; CHECK: .set P_SpawnMobj.num_vgpr, 0
 ; CHECK: .set P_SpawnMobj.num_agpr, 0
-; CHECK: .set P_SpawnMobj.numbered_sgpr, 0
+; CHECK: .set P_SpawnMobj.numbered_sgpr, 32
 ; CHECK: .set P_SpawnMobj.private_seg_size, 0
 ; CHECK: .set P_SpawnMobj.uses_vcc, 0
 ; CHECK: .set P_SpawnMobj.uses_flat_scratch, 0
@@ -52,7 +52,7 @@ define void @P_SpawnMobj() {
 ; CHECK-LABEL: G_PlayerReborn:
 ; CHECK: .set G_PlayerReborn.num_vgpr, 0
 ; CHECK: .set G_PlayerReborn.num_agpr, 0
-; CHECK: .set G_PlayerReborn.numbered_sgpr, 0
+; CHECK: .set G_PlayerReborn.numbered_sgpr, 32
 ; CHECK: .set G_PlayerReborn.private_seg_size, 0
 ; CHECK: .set G_PlayerReborn.uses_vcc, 0
 ; CHECK: .set G_PlayerReborn.uses_flat_scratch, 0
@@ -66,7 +66,7 @@ define void @G_PlayerReborn() {
 ; CHECK-LABEL: P_SetThingPosition:
 ; CHECK: .set P_SetThingPosition.num_vgpr, 0
 ; CHECK: .set P_SetThingPosition.num_agpr, 0
-; CHECK: .set P_SetThingPosition.numbered_sgpr, 0
+; CHECK: .set P_SetThingPosition.numbered_sgpr, 32
 ; CHECK: .set P_SetThingPosition.private_seg_size, 0
 ; CHECK: .set P_SetThingPosition.uses_vcc, 0
 ; CHECK: .set P_SetThingPosition.uses_flat_scratch, 0
@@ -96,7 +96,7 @@ define void @P_SetupPsprites(ptr addrspace(1) %i) {
 ; CHECK-LABEL: HU_Start:
 ; CHECK: .set HU_Start.num_vgpr, 0
 ; CHECK: .set HU_Start.num_agpr, 0
-; CHECK: .set HU_Start.numbered_sgpr, 0
+; CHECK: .set HU_Start.numbered_sgpr, 32
 ; CHECK: .set HU_Start.private_seg_size, 0
 ; CHECK: .set HU_Start.uses_vcc, 0
 ; CHECK: .set HU_Start.uses_flat_scratch, 0
@@ -162,7 +162,7 @@ define void @G_DoReborn() {
 ; CHECK-LABEL: AM_Stop:
 ; CHECK: .set AM_Stop.num_vgpr, 0
 ; CHECK: .set AM_Stop.num_agpr, 0
-; CHECK: .set AM_Stop.numbered_sgpr, 0
+; CHECK: .set AM_Stop.numbered_sgpr, 32
 ; CHECK: .set AM_Stop.private_seg_size, 0
 ; CHECK: .set AM_Stop.uses_vcc, 0
 ; CHECK: .set AM_Stop.uses_flat_scratch, 0
@@ -176,7 +176,7 @@ define void @AM_Stop() {
 ; CHECK-LABEL: D_AdvanceDemo:
 ; CHECK: .set D_AdvanceDemo.num_vgpr, 0
 ; CHECK: .set D_AdvanceDemo.num_agpr, 0
-; CHECK: .set D_AdvanceDemo.numbered_sgpr, 0
+; CHECK: .set D_AdvanceDemo.numbered_sgpr, 32
 ; CHECK: .set D_AdvanceDemo.private_seg_size, 0
 ; CHECK: .set D_AdvanceDemo.uses_vcc, 0
 ; CHECK: .set D_AdvanceDemo.uses_flat_scratch, 0
@@ -190,7 +190,7 @@ define void @D_AdvanceDemo() {
 ; CHECK-LABEL: F_StartFinale:
 ; CHECK: .set F_StartFinale.num_vgpr, 0
 ; CHECK: .set F_StartFinale.num_agpr, 0
-; CHECK: .set F_StartFinale.numbered_sgpr, 0
+; CHECK: .set F_StartFinale.numbered_sgpr, 32
 ; CHECK: .set F_StartFinale.private_seg_size, 0
 ; CHECK: .set F_StartFinale.uses_vcc, 0
 ; CHECK: .set F_StartFinale.uses_flat_scratch, 0
@@ -204,7 +204,7 @@ define void @F_StartFinale() {
 ; CHECK-LABEL: F_Ticker:
 ; CHECK: .set F_Ticker.num_vgpr, 0
 ; CHECK: .set F_Ticker.num_agpr, 0
-; CHECK: .set F_Ticker.numbered_sgpr, 0
+; CHECK: .set F_Ticker.numbered_sgpr, 32
 ; CHECK: .set F_Ticker.private_seg_size, 0
 ; CHECK: .set F_Ticker.uses_vcc, 0
 ; CHECK: .set F_Ticker.uses_flat_scratch, 0
@@ -236,7 +236,7 @@ define i32 @G_CheckDemoStatus() {
 ; CHECK-LABEL: P_TempSaveGameFile:
 ; CHECK: .set P_TempSaveGameFile.num_vgpr, 2
 ; CHECK: .set P_TempSaveGameFile.num_agpr, 0
-; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 0
+; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 32
 ; CHECK: .set P_TempSaveGameFile.private_seg_size, 0
 ; CHECK: .set P_TempSaveGameFile.uses_vcc, 0
 ; CHECK: .set P_TempSaveGameFile.uses_flat_scratch, 0
@@ -250,7 +250,7 @@ define ptr @P_TempSaveGameFile() {
 ; CHECK-LABEL: P_SaveGameFile:
 ; CHECK: .set P_SaveGameFile.num_vgpr, 2
 ; CHECK: .set P_SaveGameFile.num_agpr, 0
-; CHECK: .set P_SaveGameFile.numbered_sgpr, 0
+; CHECK: .set P_SaveGameFile.numbered_sgpr, 32
 ; CHECK: .set P_SaveGameFile.private_seg_size, 0
 ; CHECK: .set P_SaveGameFile.uses_vcc, 0
 ; CHECK: .set P_SaveGameFile.uses_flat_scratch, 0
@@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() {
 ; CHECK-LABEL: R_FlatNumForName:
 ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
 ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
-; CHECK: .set R_FlatNumForName.numbered_sgpr, max(34, I_Error.numbered_sgpr)
+; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
 ; CHECK: .set R_FlatNumForName.private_seg_size, 16+max(I_Error.private_seg_size)
 ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
 ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)
diff --git a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
index 83f58db1aa67..7a810d0067c1 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: {{^}}qux
 ; CHECK: .set qux.num_vgpr, 13
 ; CHECK: .set qux.num_agpr, 0
-; CHECK: .set qux.numbered_sgpr, 0
+; CHECK: .set qux.numbered_sgpr, 32
 ; CHECK: .set qux.private_seg_size, 0
 ; CHECK: .set qux.uses_vcc, 0
 ; CHECK: .set qux.uses_flat_scratch, 0
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index 28c3131302a3..638dc8965987 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -83,13 +83,13 @@
 ; CHECK-NEXT:      multiple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x24
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x21
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x24
 ; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      no_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x20
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      no_stack_call:
@@ -122,7 +122,7 @@
 ; CHECK-NEXT:      simple_lds:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0x100
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x20
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      simple_lds_recurse:
@@ -134,7 +134,7 @@
 ; CHECK-NEXT:      simple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x14
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x21
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x14
 ; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:      simple_stack_call:
diff --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
index a71fd7fe782f..5b9b0feea990 100644
--- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -2,7 +2,7 @@
 ;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
 
 ; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
-; ;CHECK: NumVgprs: 2
+; ;CHECK: NumVgprs: 4
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
   %i1 = extractelement <2 x float> %arg3, i32 1
@@ -193,7 +193,7 @@ define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float>
 
 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
-; CHECK: NumVgprs: 2
+; CHECK: NumVgprs: 4
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
   ret { <4 x float> } undef
@@ -202,7 +202,7 @@ define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg
 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; Additionally set the PSInputAddr to 0 via the metadata
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
-; CHECK: NumVgprs: 2
+; CHECK: NumVgprs: 4
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
 .entry:
   ret { <4 x float> } undef
diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index bfcf90037bfd..35e11ad6a648 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -24,9 +24,7 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %
 
 ; SI-LABEL: {{^}}one_vgpr_used:
 ; SI: NumVgprs: 1
-define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) #0 {
+define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind {
   store i32 %x, ptr addrspace(1) %out, align 4
   ret void
 }
-
-attributes #0 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index a2470a60cb19..afb77ed19089 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -122,8 +122,8 @@ define void @test_func() !dbg !6 {
 }
 
 ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 22
-; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 3
+; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 4
+; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     AGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     ScratchSize [bytes/lane]: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     Dynamic Stack: False
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
index 557ffd27a07f..0d25bc97ff77 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -4,8 +4,8 @@
 ; CHECK-LABEL: {{^}}spill:
 ; GCN:    NumSgprs: 104
 ; GCN-GCNTRACKERS:    NumSgprs: 104
-; GCN:    NumVgprs: 3
-; GCN-GCNTRACKERS:    NumVgprs: 3
+; GCN:    NumVgprs: 1
+; GCN-GCNTRACKERS:    NumVgprs: 2
 ; GCN:    ScratchSize: 0
 ; GCN-GCNTRACKERS:    ScratchSize: 0
 ; GCN:    Occupancy: 5
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index 95d707aee566..c5732531f542 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -11,8 +11,8 @@
 ; allow scheduling of other instructions which reduce RP
 
 ; CHECK-LABEL: {{^}}return_72xi32:
-; GFX11-PAL:    NumSgprs: 0
-; GFX11-PAL-GCNTRACKERS:    NumSgprs: 0
+; GFX11-PAL:    NumSgprs: 33
+; GFX11-PAL-GCNTRACKERS:    NumSgprs: 33
 ; GFX11-PAL:    NumVgprs: 64
 ; GFX11-PAL-GCNTRACKERS:    NumVgprs: 64
 ; GFX11-PAL:    ScratchSize: 220
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 8300a52955b9..462ac23ec7e0 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -7,14 +7,14 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s
 
-; SI-MINREG: NumSgprs: {{[1]?[1-9]$}}
-; SI-MINREG: NumVgprs: {{[1]?[1-9]$}}
+; SI-MINREG: NumSgprs: {{[1-9]$}}
+; SI-MINREG: NumVgprs: {{[1-9]$}}
 
 ; SI-MAXOCC: NumSgprs: {{[1-4]?[0-9]$}}
 ; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}}
 
 ; stores may alias loads
-; VI-MINREG: NumSgprs: {{[1]?[0-9]$}}
+; VI-MINREG: NumSgprs: {{[0-9]$}}
 ; VI-MINREG: NumVgprs: {{[1-3][0-9]$}}
 
 ; stores may alias loads
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 682bbdedb37a..6ddf0986755f 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 3
+; VI-NEXT:     .amdhsa_next_free_vgpr 1
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -86,7 +86,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -146,7 +146,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 3
+; VI-NEXT:     .amdhsa_next_free_vgpr 1
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -197,7 +197,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -257,7 +257,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 3
+; VI-NEXT:     .amdhsa_next_free_vgpr 1
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index d3def45c4f9d..30accc846d2b 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
 ; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     12
+; ELF: .sgpr_count:     9
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index ad831e040d72..4f84b31f1877 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
 ; ASM: .amdhsa_reserve_xnack_mask 0
 
 ; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     8
+; ELF: .sgpr_count:     5
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index d1e28e11601c..644f43492336 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
 ; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     12
+; ELF: .sgpr_count:     9
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
index 4802ec861d68..cf5b95a72997 100644
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: __unnamed_1:
 ; CHECK: .set __unnamed_1.num_vgpr, 0
 ; CHECK: .set __unnamed_1.num_agpr, 0
-; CHECK: .set __unnamed_1.numbered_sgpr, 0
+; CHECK: .set __unnamed_1.numbered_sgpr, 32
 ; CHECK: .set __unnamed_1.private_seg_size, 0
 ; CHECK: .set __unnamed_1.uses_vcc, 0
 ; CHECK: .set __unnamed_1.uses_flat_scratch, 0
@@ -16,7 +16,7 @@ entry:
 }
 
 ; CHECK-LABEL: __unnamed_2:
-; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
+; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
 ; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
 ; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
 ; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
index ee35dc4cddad..2cb5e309c8c2 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
 }
 
 ; GCN-LABEL: {{^}}f1024_0:
-; GFX90A: NumVgprs: 1
+; GFX90A: NumVgprs: 32
 ; GFX90A: NumAgprs: 1
-; GFX90A: TotalNumVgprs: 5
+; GFX90A: TotalNumVgprs: 33
 define void @f1024_0() #1024 {
   call void @foo()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
deleted file mode 100644
index 8c8182db7b47..000000000000
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
-; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
-target triple = "amdgcn-amd-amdhsa"
-
-@global = addrspace(1) global i32 poison, align 4
-
-; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
-; Only hardware-initialized VGPRs (v0) are read in this kernel.
-
-; CHECK-LABEL: amdhsa.kernels:
-; CHECK-LABEL: kernel_x
-; CHECK: .vgpr_count:     1
-define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
-entry:
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: kernel_z
-; PACKED: .vgpr_count:     1
-; NOTPACKED: .vgpr_count:     3
-define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
-entry:
-  %id = call i32 @llvm.amdgcn.workitem.id.z()
-  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
-  ret void
-}
-
-attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
deleted file mode 100644
index f5d28a0ae162..000000000000
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
-; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
-target triple = "amdgcn--amdpal"
-
-@global = addrspace(1) global i32 poison, align 4
-
-; CHECK-LABEL: amdpal.pipelines:
-
-; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
-; CHECK-LABEL: .entry_point_symbol: kernel_use
-; CHECK: .vgpr_count:     0x20
-define amdgpu_cs void @kernel_use([32 x i32] %args) {
-entry:
-  %a = extractvalue [32 x i32] %args, 14
-  store i32 %a, ptr addrspace(1) @global
-  ret void
-}
-
-; Neither uses not writes a VGPR
-; CHECK-LABEL: chain_func:
-; CHECK: .vgpr_count:     0x1
-define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
-entry:
-  call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
-        ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
-  unreachable
-}
-
-; Neither uses not writes a VGPR
-; CHECK-LABEL: gfx_func:
-; CHECK: .vgpr_count:     0x1
-define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
-entry:
-  ret [32 x i32] %args
-}

From be9994b09206a84a32c3029b409587008d179b95 Mon Sep 17 00:00:00 2001
From: Abhina Sree <Abhina.Sreeskantharajan@ibm.com>
Date: Fri, 13 Jun 2025 07:00:36 -0400
Subject: [PATCH 0261/1322] [SystemZ][z/OS] Refactor AutoConvert more (#143955)

This patch removes the C++
disablezOSAutoConversion,enablezOSAutoConversion declarations and also
updates Path.inc to use the common function.
---
 llvm/include/llvm/Support/AutoConvert.h | 45 +++++++++----------------
 llvm/lib/Support/AutoConvert.cpp        |  1 -
 llvm/lib/Support/Unix/Path.inc          |  4 +--
 llvm/lib/Support/Unix/Program.inc       |  2 +-
 4 files changed, 19 insertions(+), 33 deletions(-)

diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h
index 56ad91425bcc..1e6792636e16 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -18,7 +18,7 @@
 #include <_Ccsid.h>
 #endif
 #ifdef __cplusplus
-#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Error.h"
 #include <system_error>
 #endif /* __cplusplus */
 
@@ -41,6 +41,21 @@ int restorezOSStdHandleAutoConversion(int FD);
 #ifdef __cplusplus
 namespace llvm {
 
+#ifdef __MVS__
+
+/** \brief Set the tag information for a file descriptor. */
+std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
+
+/** \brief Get the the tag ccsid for a file name or a file descriptor. */
+ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
+
+/** \brief Query the file tag to determine if it needs conversion to UTF-8
+ *  codepage.
+ */
+ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
+
+#endif /* __MVS__*/
+
 inline std::error_code disableAutoConversion(int FD) {
 #ifdef __MVS__
   if (::disablezOSAutoConversion(FD) == -1)
@@ -79,34 +94,6 @@ inline ErrorOr<bool> needConversion(const char *FileName, const int FD = -1) {
   return false;
 }
 
-#ifdef __MVS__
-
-/** \brief Disable the z/OS enhanced ASCII auto-conversion for the file
- * descriptor.
- */
-std::error_code disablezOSAutoConversion(int FD);
-
-/** \brief Query the z/OS enhanced ASCII auto-conversion status of a file
- * descriptor and force the conversion if the file is not tagged with a
- * codepage.
- */
-std::error_code enablezOSAutoConversion(int FD);
-
-/** Restore the z/OS enhanced ASCII auto-conversion for the std handle. */
-std::error_code restorezOSStdHandleAutoConversion(int FD);
-
-/** \brief Set the tag information for a file descriptor. */
-std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
-
-/** \brief Get the the tag ccsid for a file name or a file descriptor. */
-ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
-
-/** \brief Query the file tag to determine if it needs conversion to UTF-8
- *  codepage.
- */
-ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
-
-#endif /* __MVS__*/
 } /* namespace llvm */
 #endif /* __cplusplus */
 
diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp
index c69e9a8f97c0..0b6928e10ef5 100644
--- a/llvm/lib/Support/AutoConvert.cpp
+++ b/llvm/lib/Support/AutoConvert.cpp
@@ -14,7 +14,6 @@
 #ifdef __MVS__
 
 #include "llvm/Support/AutoConvert.h"
-#include "llvm/Support/Error.h"
 #include <cassert>
 #include <fcntl.h>
 #include <sys/stat.h>
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 0728413f4db6..277247e3cc23 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -1097,14 +1097,14 @@ std::error_code openFile(const Twine &Name, int &ResultFD,
                     !Stat.st_tag.ft_txtflag && !Stat.st_tag.ft_ccsid &&
                     Stat.st_size == 0;
     if (Flags & OF_Text) {
-      if (auto EC = llvm::enablezOSAutoConversion(ResultFD))
+      if (auto EC = llvm::enableAutoConversion(ResultFD))
         return EC;
       if (DoSetTag) {
         if (auto EC = llvm::setzOSFileTag(ResultFD, CCSID_IBM_1047, true))
           return EC;
       }
     } else {
-      if (auto EC = llvm::disablezOSAutoConversion(ResultFD))
+      if (auto EC = llvm::disableAutoConversion(ResultFD))
         return EC;
       if (DoSetTag) {
         if (auto EC = llvm::setzOSFileTag(ResultFD, FT_BINARY, false))
diff --git a/llvm/lib/Support/Unix/Program.inc b/llvm/lib/Support/Unix/Program.inc
index 6d68369ad191..4f17b2257a75 100644
--- a/llvm/lib/Support/Unix/Program.inc
+++ b/llvm/lib/Support/Unix/Program.inc
@@ -516,7 +516,7 @@ std::error_code llvm::sys::ChangeStdoutMode(fs::OpenFlags Flags) {
 
 std::error_code llvm::sys::ChangeStdinToBinary() {
 #ifdef __MVS__
-  return disablezOSAutoConversion(STDIN_FILENO);
+  return disableAutoConversion(STDIN_FILENO);
 #else
   // Do nothing, as Unix doesn't differentiate between text and binary.
   return std::error_code();

From 30725efe671bc82bf9095a575aece60fc40fbef5 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Fri, 13 Jun 2025 07:12:41 -0400
Subject: [PATCH 0262/1322] Fix build after removing delayed typo expression

This addresses issues found by:
  https://lab.llvm.org/buildbot/#/builders/64/builds/4220
  https://lab.llvm.org/buildbot/#/builders/51/builds/17890
---
 clang/lib/Parse/ParseExpr.cpp  |  1 -
 clang/lib/Sema/SemaExpr.cpp    | 48 ----------------------------------
 clang/lib/Sema/SemaExprCXX.cpp | 45 -------------------------------
 3 files changed, 94 deletions(-)

diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index a27a44455b62..3cf3d4ea7d70 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -513,7 +513,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
       }
     }
 
-    ExprResult OrigLHS = LHS;
     if (!LHS.isInvalid()) {
       // Combine the LHS and RHS into the LHS (e.g. build AST).
       if (TernaryMiddle.isInvalid()) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index b7031bc8c022..413eff4aa294 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2446,42 +2446,6 @@ Sema::DecomposeUnqualifiedId(const UnqualifiedId &Id,
   }
 }
 
-static void emitEmptyLookupTypoDiagnostic(const TypoCorrection &TC,
-                                          Sema &SemaRef, const CXXScopeSpec &SS,
-                                          DeclarationName Typo,
-                                          SourceRange TypoRange,
-                                          unsigned DiagnosticID,
-                                          unsigned DiagnosticSuggestID) {
-  DeclContext *Ctx =
-      SS.isEmpty() ? nullptr : SemaRef.computeDeclContext(SS, false);
-  if (!TC) {
-    // Emit a special diagnostic for failed member lookups.
-    // FIXME: computing the declaration context might fail here (?)
-    if (Ctx)
-      SemaRef.Diag(TypoRange.getBegin(), diag::err_no_member)
-          << Typo << Ctx << TypoRange;
-    else
-      SemaRef.Diag(TypoRange.getBegin(), DiagnosticID) << Typo << TypoRange;
-    return;
-  }
-
-  std::string CorrectedStr = TC.getAsString(SemaRef.getLangOpts());
-  bool DroppedSpecifier =
-      TC.WillReplaceSpecifier() && Typo.getAsString() == CorrectedStr;
-  unsigned NoteID = TC.getCorrectionDeclAs<ImplicitParamDecl>()
-                        ? diag::note_implicit_param_decl
-                        : diag::note_previous_decl;
-  if (!Ctx)
-    SemaRef.diagnoseTypo(
-        TC, SemaRef.PDiag(DiagnosticSuggestID) << Typo << TypoRange,
-        SemaRef.PDiag(NoteID));
-  else
-    SemaRef.diagnoseTypo(TC,
-                         SemaRef.PDiag(diag::err_no_member_suggest)
-                             << Typo << Ctx << DroppedSpecifier << TypoRange,
-                         SemaRef.PDiag(NoteID));
-}
-
 bool Sema::DiagnoseDependentMemberLookup(const LookupResult &R) {
   // During a default argument instantiation the CurContext points
   // to a CXXMethodDecl; but we can't apply a this-> fixit inside a
@@ -14922,18 +14886,6 @@ static void checkObjCPointerIntrospection(Sema &S, ExprResult &L, ExprResult &R,
   }
 }
 
-static NamedDecl *getDeclFromExpr(Expr *E) {
-  if (!E)
-    return nullptr;
-  if (auto *DRE = dyn_cast<DeclRefExpr>(E))
-    return DRE->getDecl();
-  if (auto *ME = dyn_cast<MemberExpr>(E))
-    return ME->getMemberDecl();
-  if (auto *IRE = dyn_cast<ObjCIvarRefExpr>(E))
-    return IRE->getDecl();
-  return nullptr;
-}
-
 // This helper function promotes a binary operator's operands (which are of a
 // half vector type) to a vector of floats and then truncates the result to
 // a vector of either half or short.
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index c653cb56351c..ba52e8f8932d 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -7647,51 +7647,6 @@ static void CheckIfAnyEnclosingLambdasMustCaptureAnyPotentialCaptures(
   CurrentLSI->clearPotentialCaptures();
 }
 
-static ExprResult attemptRecovery(Sema &SemaRef,
-                                  const TypoCorrectionConsumer &Consumer,
-                                  const TypoCorrection &TC) {
-  LookupResult R(SemaRef, Consumer.getLookupResult().getLookupNameInfo(),
-                 Consumer.getLookupResult().getLookupKind());
-  const CXXScopeSpec *SS = Consumer.getSS();
-  CXXScopeSpec NewSS;
-
-  // Use an approprate CXXScopeSpec for building the expr.
-  if (auto *NNS = TC.getCorrectionSpecifier())
-    NewSS.MakeTrivial(SemaRef.Context, NNS, TC.getCorrectionRange());
-  else if (SS && !TC.WillReplaceSpecifier())
-    NewSS = *SS;
-
-  if (auto *ND = TC.getFoundDecl()) {
-    R.setLookupName(ND->getDeclName());
-    R.addDecl(ND);
-    if (ND->isCXXClassMember()) {
-      // Figure out the correct naming class to add to the LookupResult.
-      CXXRecordDecl *Record = nullptr;
-      if (auto *NNS = TC.getCorrectionSpecifier())
-        Record = NNS->getAsType()->getAsCXXRecordDecl();
-      if (!Record)
-        Record =
-            dyn_cast<CXXRecordDecl>(ND->getDeclContext()->getRedeclContext());
-      if (Record)
-        R.setNamingClass(Record);
-
-      // Detect and handle the case where the decl might be an implicit
-      // member.
-      if (SemaRef.isPotentialImplicitMemberAccess(
-              NewSS, R, Consumer.isAddressOfOperand()))
-        return SemaRef.BuildPossibleImplicitMemberExpr(
-            NewSS, /*TemplateKWLoc*/ SourceLocation(), R,
-            /*TemplateArgs*/ nullptr, /*S*/ nullptr);
-    } else if (auto *Ivar = dyn_cast<ObjCIvarDecl>(ND)) {
-      return SemaRef.ObjC().LookupInObjCMethod(R, Consumer.getScope(),
-                                               Ivar->getIdentifier());
-    }
-  }
-
-  return SemaRef.BuildDeclarationNameExpr(NewSS, R, /*NeedsADL*/ false,
-                                          /*AcceptInvalidDecl*/ true);
-}
-
 ExprResult Sema::ActOnFinishFullExpr(Expr *FE, SourceLocation CC,
                                      bool DiscardedValue, bool IsConstexpr,
                                      bool IsTemplateArgument) {

From 4236423ee863be5903819db57205fc83a4bd21e1 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Fri, 13 Jun 2025 16:31:25 +0500
Subject: [PATCH 0263/1322] [LLDB] Add bit extraction to DIL (#141422)

---
 lldb/include/lldb/ValueObject/DILAST.h        | 27 +++++++++
 lldb/include/lldb/ValueObject/DILEval.h       |  2 +
 lldb/include/lldb/ValueObject/DILLexer.h      |  1 +
 lldb/source/ValueObject/DILAST.cpp            |  5 ++
 lldb/source/ValueObject/DILEval.cpp           | 32 +++++++++++
 lldb/source/ValueObject/DILLexer.cpp          |  7 ++-
 lldb/source/ValueObject/DILParser.cpp         | 22 ++++++--
 .../TestFrameVarDILArraySubscript.py          |  2 +-
 .../basics/BitFieldExtraction/Makefile        |  3 +
 .../TestFrameVarDILBitFieldExtraction.py      | 56 +++++++++++++++++++
 .../basics/BitFieldExtraction/main.cpp        |  9 +++
 11 files changed, 159 insertions(+), 7 deletions(-)
 create mode 100644 lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/Makefile
 create mode 100644 lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/TestFrameVarDILBitFieldExtraction.py
 create mode 100644 lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/main.cpp

diff --git a/lldb/include/lldb/ValueObject/DILAST.h b/lldb/include/lldb/ValueObject/DILAST.h
index 6c7838e05c93..709f0639135f 100644
--- a/lldb/include/lldb/ValueObject/DILAST.h
+++ b/lldb/include/lldb/ValueObject/DILAST.h
@@ -19,6 +19,7 @@ namespace lldb_private::dil {
 /// The various types DIL AST nodes (used by the DIL parser).
 enum class NodeKind {
   eArraySubscriptNode,
+  eBitExtractionNode,
   eErrorNode,
   eIdentifierNode,
   eMemberOfNode,
@@ -153,6 +154,30 @@ private:
   int64_t m_index;
 };
 
+class BitFieldExtractionNode : public ASTNode {
+public:
+  BitFieldExtractionNode(uint32_t location, ASTNodeUP base, int64_t first_index,
+                         int64_t last_index)
+      : ASTNode(location, NodeKind::eBitExtractionNode),
+        m_base(std::move(base)), m_first_index(first_index),
+        m_last_index(last_index) {}
+
+  llvm::Expected<lldb::ValueObjectSP> Accept(Visitor *v) const override;
+
+  ASTNode *GetBase() const { return m_base.get(); }
+  int64_t GetFirstIndex() const { return m_first_index; }
+  int64_t GetLastIndex() const { return m_last_index; }
+
+  static bool classof(const ASTNode *node) {
+    return node->GetKind() == NodeKind::eBitExtractionNode;
+  }
+
+private:
+  ASTNodeUP m_base;
+  int64_t m_first_index;
+  int64_t m_last_index;
+};
+
 /// This class contains one Visit method for each specialized type of
 /// DIL AST node. The Visit methods are used to dispatch a DIL AST node to
 /// the correct function in the DIL expression evaluator for evaluating that
@@ -168,6 +193,8 @@ public:
   Visit(const UnaryOpNode *node) = 0;
   virtual llvm::Expected<lldb::ValueObjectSP>
   Visit(const ArraySubscriptNode *node) = 0;
+  virtual llvm::Expected<lldb::ValueObjectSP>
+  Visit(const BitFieldExtractionNode *node) = 0;
 };
 
 } // namespace lldb_private::dil
diff --git a/lldb/include/lldb/ValueObject/DILEval.h b/lldb/include/lldb/ValueObject/DILEval.h
index 9d0fa53c6622..2a0cb548a810 100644
--- a/lldb/include/lldb/ValueObject/DILEval.h
+++ b/lldb/include/lldb/ValueObject/DILEval.h
@@ -54,6 +54,8 @@ private:
   llvm::Expected<lldb::ValueObjectSP> Visit(const UnaryOpNode *node) override;
   llvm::Expected<lldb::ValueObjectSP>
   Visit(const ArraySubscriptNode *node) override;
+  llvm::Expected<lldb::ValueObjectSP>
+  Visit(const BitFieldExtractionNode *node) override;
 
   // Used by the interpreter to create objects, perform casts, etc.
   lldb::TargetSP m_target;
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
index 7d70f88f9a71..9c1ba9768025 100644
--- a/lldb/include/lldb/ValueObject/DILLexer.h
+++ b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -31,6 +31,7 @@ public:
     identifier,
     l_paren,
     l_square,
+    minus,
     numeric_constant,
     period,
     r_paren,
diff --git a/lldb/source/ValueObject/DILAST.cpp b/lldb/source/ValueObject/DILAST.cpp
index 8b5e64ad462c..b1cd824c2299 100644
--- a/lldb/source/ValueObject/DILAST.cpp
+++ b/lldb/source/ValueObject/DILAST.cpp
@@ -32,4 +32,9 @@ ArraySubscriptNode::Accept(Visitor *v) const {
   return v->Visit(this);
 }
 
+llvm::Expected<lldb::ValueObjectSP>
+BitFieldExtractionNode::Accept(Visitor *v) const {
+  return v->Visit(this);
+}
+
 } // namespace lldb_private::dil
diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp
index c8cb54aa18a9..b2bb4e20ddc2 100644
--- a/lldb/source/ValueObject/DILEval.cpp
+++ b/lldb/source/ValueObject/DILEval.cpp
@@ -430,4 +430,36 @@ Interpreter::Visit(const ArraySubscriptNode *node) {
   return base->GetSyntheticArrayMember(signed_child_idx, true);
 }
 
+llvm::Expected<lldb::ValueObjectSP>
+Interpreter::Visit(const BitFieldExtractionNode *node) {
+  auto lhs_or_err = Evaluate(node->GetBase());
+  if (!lhs_or_err)
+    return lhs_or_err;
+  lldb::ValueObjectSP base = *lhs_or_err;
+  int64_t first_index = node->GetFirstIndex();
+  int64_t last_index = node->GetLastIndex();
+
+  // if the format given is [high-low], swap range
+  if (first_index > last_index)
+    std::swap(first_index, last_index);
+
+  Status error;
+  if (base->GetCompilerType().IsReferenceType()) {
+    base = base->Dereference(error);
+    if (error.Fail())
+      return error.ToError();
+  }
+  lldb::ValueObjectSP child_valobj_sp =
+      base->GetSyntheticBitFieldChild(first_index, last_index, true);
+  if (!child_valobj_sp) {
+    std::string message = llvm::formatv(
+        "bitfield range {0}-{1} is not valid for \"({2}) {3}\"", first_index,
+        last_index, base->GetTypeName().AsCString("<invalid type>"),
+        base->GetName().AsCString());
+    return llvm::make_error<DILDiagnosticError>(m_expr, message,
+                                                node->GetLocation());
+  }
+  return child_valobj_sp;
+}
+
 } // namespace lldb_private::dil
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index 99182d2da113..eaefaf484bc1 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -34,6 +34,8 @@ llvm::StringRef Token::GetTokenName(Kind kind) {
     return "l_paren";
   case Kind::l_square:
     return "l_square";
+  case Kind::minus:
+    return "minus";
   case Kind::numeric_constant:
     return "numeric_constant";
   case Kind::period:
@@ -113,8 +115,9 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
 
   constexpr std::pair<Token::Kind, const char *> operators[] = {
       {Token::amp, "&"},     {Token::arrow, "->"},   {Token::coloncolon, "::"},
-      {Token::l_paren, "("}, {Token::l_square, "["}, {Token::period, "."},
-      {Token::r_paren, ")"}, {Token::r_square, "]"}, {Token::star, "*"},
+      {Token::l_paren, "("}, {Token::l_square, "["}, {Token::minus, "-"},
+      {Token::period, "."},  {Token::r_paren, ")"},  {Token::r_square, "]"},
+      {Token::star, "*"},
   };
   for (auto [kind, str] : operators) {
     if (remainder.consume_front(str))
diff --git a/lldb/source/ValueObject/DILParser.cpp b/lldb/source/ValueObject/DILParser.cpp
index 9667885734f2..32af0820acb9 100644
--- a/lldb/source/ValueObject/DILParser.cpp
+++ b/lldb/source/ValueObject/DILParser.cpp
@@ -120,6 +120,7 @@ ASTNodeUP DILParser::ParseUnaryExpression() {
 //  postfix_expression:
 //    primary_expression
 //    postfix_expression "[" integer_literal "]"
+//    postfix_expression "[" integer_literal "-" integer_literal "]"
 //    postfix_expression "." id_expression
 //    postfix_expression "->" id_expression
 //
@@ -131,17 +132,30 @@ ASTNodeUP DILParser::ParsePostfixExpression() {
     switch (token.GetKind()) {
     case Token::l_square: {
       m_dil_lexer.Advance();
-      std::optional<int64_t> rhs = ParseIntegerConstant();
-      if (!rhs) {
+      std::optional<int64_t> index = ParseIntegerConstant();
+      if (!index) {
         BailOut(
             llvm::formatv("failed to parse integer constant: {0}", CurToken()),
             CurToken().GetLocation(), CurToken().GetSpelling().length());
         return std::make_unique<ErrorNode>();
       }
+      if (CurToken().GetKind() == Token::minus) {
+        m_dil_lexer.Advance();
+        std::optional<int64_t> last_index = ParseIntegerConstant();
+        if (!last_index) {
+          BailOut(llvm::formatv("failed to parse integer constant: {0}",
+                                CurToken()),
+                  CurToken().GetLocation(), CurToken().GetSpelling().length());
+          return std::make_unique<ErrorNode>();
+        }
+        lhs = std::make_unique<BitFieldExtractionNode>(
+            loc, std::move(lhs), std::move(*index), std::move(*last_index));
+      } else {
+        lhs = std::make_unique<ArraySubscriptNode>(loc, std::move(lhs),
+                                                   std::move(*index));
+      }
       Expect(Token::r_square);
       m_dil_lexer.Advance();
-      lhs = std::make_unique<ArraySubscriptNode>(loc, std::move(lhs),
-                                                 std::move(*rhs));
       break;
     }
     case Token::period:
diff --git a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py
index 0d91f804ce56..c90e0eaa6363 100644
--- a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py
+++ b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py
@@ -63,7 +63,7 @@ class TestFrameVarDILArraySubscript(TestBase):
         self.expect(
             "frame var 'int_arr[-1]'",
             error=True,
-            substrs=["unrecognized token"],
+            substrs=["failed to parse integer constant"],
         )
 
         # Test for floating point index
diff --git a/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/Makefile b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/Makefile
new file mode 100644
index 000000000000..99998b20bcb0
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/TestFrameVarDILBitFieldExtraction.py b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/TestFrameVarDILBitFieldExtraction.py
new file mode 100644
index 000000000000..7b5ef0650b6e
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/TestFrameVarDILBitFieldExtraction.py
@@ -0,0 +1,56 @@
+"""
+Test DIL BifField extraction.
+"""
+
+import lldb
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+from lldbsuite.test import lldbutil
+
+
+class TestFrameVarDILBitFieldExtraction(TestBase):
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def expect_var_path(self, expr, compare_to_framevar=False, value=None, type=None):
+        value_dil = super().expect_var_path(expr, value=value, type=type)
+        if compare_to_framevar:
+            self.runCmd("settings set target.experimental.use-DIL false")
+            value_frv = super().expect_var_path(expr, value=value, type=type)
+            self.runCmd("settings set target.experimental.use-DIL true")
+            self.assertEqual(value_dil.GetValue(), value_frv.GetValue())
+
+    def test_bitfield_extraction(self):
+        self.build()
+        lldbutil.run_to_source_breakpoint(
+            self, "Set a breakpoint here", lldb.SBFileSpec("main.cpp")
+        )
+
+        self.runCmd("settings set target.experimental.use-DIL true")
+
+        # Test ranges and type
+        self.expect_var_path("value[0-1]", True, value="3", type="int:2")
+        self.expect_var_path("value[4-7]", True, value="7", type="int:4")
+        self.expect_var_path("value[7-0]", True, value="115", type="int:8")
+
+        # Test reference and dereferenced pointer
+        self.expect_var_path("value_ref[0-1]", value="3", type="int:2")
+        self.expect_var_path("(*value_ptr)[0-1]", value="3", type="int:2")
+
+        # Test array and pointer
+        self.expect(
+            "frame var 'int_arr[0-2]'",
+            error=True,
+            substrs=["bitfield range 0-2 is not valid"],
+        )
+        self.expect(
+            "frame var 'value_ptr[0-1]'",
+            error=True,
+            substrs=["bitfield range 0-1 is not valid"],
+        )
+
+        # Test invalid input
+        self.expect(
+            "frame var 'value[1-]'",
+            error=True,
+            substrs=["failed to parse integer constant"],
+        )
diff --git a/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/main.cpp b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/main.cpp
new file mode 100644
index 000000000000..a35f68a9e30a
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/main.cpp
@@ -0,0 +1,9 @@
+int main(int argc, char **argv) {
+  int value = 0b01110011;
+  int &value_ref = value;
+  int *value_ptr = &value;
+
+  int int_arr[] = {7, 3, 1};
+
+  return 0; // Set a breakpoint here
+}

From 41b37f05554ae59974675ae219430b5598c6159f Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Fri, 13 Jun 2025 12:43:27 +0100
Subject: [PATCH 0264/1322] [lldb] CommandObjectMemoryFind: Improve expression
 evaluation error messages (#144036)

We now bubble up the expression evaluation diagnostics to the user and
also distinguish between "expression failed to parse/run" versus other
ways in which expressions didn't complete (e.g., setup errors, etc.).

Before:
```
(lldb) memory find -e "" 0x16fdfedc0 0x16fdfede0
error: expression evaluation failed. pass a string instead
(lldb) memory find -e "invalid" 0x16fdfedc0 0x16fdfede0
error: expression evaluation failed. pass a string instead
```

After:
```
(lldb) memory find -e "" 0x16fdfedc0 0x16fdfede0
error: Expression evaluation failed:
error: No result returned from expression. Exit status: 1
(lldb) memory find -e "invalid" 0x16fdfedc0 0x16fdfede0
error: Expression evaluation failed:
error: <user expression 0>:1:1: use of undeclared identifier 'invalid'
    1 | invalid
      | ^~~~~~~
```
---
 lldb/source/Commands/CommandObjectMemory.cpp      |  8 ++++++--
 .../functionalities/memory/find/TestMemoryFind.py | 15 ++++++++++++++-
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp
index ccb06d8ff4d5..5792c13373c1 100644
--- a/lldb/source/Commands/CommandObjectMemory.cpp
+++ b/lldb/source/Commands/CommandObjectMemory.cpp
@@ -925,9 +925,12 @@ EvaluateExpression(llvm::StringRef expression, StackFrame &frame,
   ValueObjectSP result_sp;
   auto status =
       process.GetTarget().EvaluateExpression(expression, &frame, result_sp);
-  if (status != eExpressionCompleted || !result_sp)
+  if (!result_sp)
     return llvm::createStringError(
-        "expression evaluation failed. pass a string instead");
+        "No result returned from expression. Exit status: %d", status);
+
+  if (status != eExpressionCompleted)
+    return result_sp->GetError().ToError();
 
   result_sp = result_sp->GetQualifiedRepresentationIfAvailable(
       result_sp->GetDynamicValueType(), /*synthValue=*/true);
@@ -1082,6 +1085,7 @@ protected:
           m_memory_options.m_expr.GetValueAs<llvm::StringRef>().value_or(""),
           m_exe_ctx.GetFrameRef(), *process);
       if (!result_or_err) {
+        result.AppendError("Expression evaluation failed: ");
         result.AppendError(llvm::toString(result_or_err.takeError()));
         return;
       }
diff --git a/lldb/test/API/functionalities/memory/find/TestMemoryFind.py b/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
index 72426e75e013..a06b0d960889 100644
--- a/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
+++ b/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
@@ -56,10 +56,23 @@ class MemoryFindTestCase(TestBase):
         # Invalid expr is an error.
         self.expect(
             'memory find -e "not_a_symbol" `&bytedata[0]` `&bytedata[15]`',
+            substrs=[
+                "Expression evaluation failed:",
+                "use of undeclared identifier 'not_a_symbol'",
+            ],
             error=True,
-            substrs=["error: expression evaluation failed. pass a string instead"],
         )
 
+        self.expect(
+            'memory find -e "" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Expression evaluation failed:",
+                "No result returned from expression. Exit status: 1",
+            ],
+            error=True,
+        )
+
+        # Valid expressions/strings
         self.expect(
             'memory find -e "(uint8_t)0x22" `&bytedata[0]` `&bytedata[15]`',
             substrs=["data found at location: 0x", "22 33 44 55 66"],

From f1036d844e4b886ac702859ccf8a19cf2153c7f7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 12:49:22 +0100
Subject: [PATCH 0265/1322] [X86] X86InstrInfo::commuteInstructionImpl - remove
 (V)BLENDPD/S commutation to (V)MOVSD/S optsize handling (#144051)

Just commute with (V)BLENDPD/S like all other BLEND instructions

This is now handled more generally by the X86FixupInstTuningPass (OptSize fold occurs even without a scheduler model).

First step towards #142972
---
 llvm/lib/Target/X86/X86InstrInfo.cpp | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 212e134c512a..abf365eedec3 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2353,33 +2353,9 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     break;
   case X86::BLENDPDrri:
   case X86::BLENDPSrri:
+  case X86::PBLENDWrri:
   case X86::VBLENDPDrri:
   case X86::VBLENDPSrri:
-    // If we're optimizing for size, try to use MOVSD/MOVSS.
-    if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
-      unsigned Mask = (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 0x03: 0x0F;
-      if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
-#define FROM_TO(FROM, TO)                                                      \
-  case X86::FROM:                                                              \
-    Opc = X86::TO;                                                             \
-    break;
-        switch (Opc) {
-        default:
-          llvm_unreachable("Unreachable!");
-        FROM_TO(BLENDPDrri, MOVSDrr)
-        FROM_TO(BLENDPSrri, MOVSSrr)
-        FROM_TO(VBLENDPDrri, VMOVSDrr)
-        FROM_TO(VBLENDPSrri, VMOVSSrr)
-        }
-        WorkingMI = CloneIfNew(MI);
-        WorkingMI->setDesc(get(Opc));
-        WorkingMI->removeOperand(3);
-        break;
-      }
-#undef FROM_TO
-    }
-    [[fallthrough]];
-  case X86::PBLENDWrri:
   case X86::VBLENDPDYrri:
   case X86::VBLENDPSYrri:
   case X86::VPBLENDDrri:

From cc365331af423de99ae98655d035e4892842fe97 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Fri, 13 Jun 2025 13:54:30 +0200
Subject: [PATCH 0266/1322] [DLCov] Origin-Tracking: Add config options
 (#143590)

This patch is part of a series that adds origin-tracking to the debugify
source location coverage checks, allowing us to report symbolized stack
traces of the point where missing source locations appear.

This patch adds the configuration options needed to enable this feature,
in the form of a new CMake option that enables a flag in
`llvm-config.h`; this is not an entirely new CMake flag, but a new
option, `COVERAGE_AND_ORIGIN`, for the existing flag
`LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING`. This patch contains
documentation, but no actual implementation for the flag itself.
---
 llvm/CMakeLists.txt                          |  4 ++--
 llvm/cmake/modules/HandleLLVMOptions.cmake   |  3 +++
 llvm/docs/CMake.rst                          | 13 ++++++++-----
 llvm/include/llvm/Config/llvm-config.h.cmake |  4 ++++
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index cfb67472aa71..0849bec26d56 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -569,8 +569,8 @@ endif()
 option(LLVM_ENABLE_CRASH_DUMPS "Turn on memory dumps on crashes. Currently only implemented on Windows." OFF)
 
 set(LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING "DISABLED" CACHE STRING
-  "Enhance Debugify's line number coverage tracking; enabling this is ABI-breaking. Can be DISABLED, or COVERAGE.")
-set_property(CACHE LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING PROPERTY STRINGS DISABLED COVERAGE)
+  "Enhance Debugify's line number coverage tracking; enabling this is ABI-breaking. Can be DISABLED, COVERAGE, or COVERAGE_AND_ORIGIN.")
+set_property(CACHE LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING PROPERTY STRINGS DISABLED COVERAGE COVERAGE_AND_ORIGIN)
 
 option(LLVM_EXPERIMENTAL_KEY_INSTRUCTIONS
   "Add additional fields to DILocations to support Key Instructions" OFF)
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 9721dacbcbe8..c35d9763a330 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -200,6 +200,9 @@ string(TOUPPER "${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}" uppercase_LLVM_ENABLE
 
 if( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE" )
   set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
+elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE_AND_ORIGIN" )
+  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
+  set( LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 1 )
 elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "DISABLED" OR NOT DEFINED LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING )
   # The DISABLED setting is default and requires no additional defines.
 else()
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 674e4969c691..72f19fd35392 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -482,11 +482,14 @@ enabled sub-projects. Nearly all of these variable names begin with
 **LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING**:STRING
   Enhances Debugify's ability to detect line number errors by storing extra
   information inside Instructions, removing false positives from Debugify's
-  results at the cost of performance. Allowed values are `DISABLED` (default)
-  and `COVERAGE`. `COVERAGE` tracks whether and why a line number was
-  intentionally dropped or not generated for an instruction, allowing Debugify
-  to avoid reporting these as errors; this comes with a small performance cost
-  of ~0.1%. `COVERAGE` is an ABI-breaking option.
+  results at the cost of performance. Allowed values are `DISABLED` (default),
+  `COVERAGE`, and `COVERAGE_AND_ORIGIN`. `COVERAGE` tracks whether and why a
+  line number was intentionally dropped or not generated for an instruction,
+  allowing Debugify to avoid reporting these as errors; this comes with a small
+  performance cost of ~0.1%. `COVERAGE_AND_ORIGIN` additionally stores a
+  stacktrace of the point where each DebugLoc is unintentionally dropped,
+  allowing for much easier bug triaging at the cost of a ~10x performance
+  slowdown. `COVERAGE` and `COVERAGE_AND_ORIGIN` are ABI-breaking options.
 
 **LLVM_ENABLE_DIA_SDK**:BOOL
   Enable building with MSVC DIA SDK for PDB debugging support. Available
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index dbc882937b4f..6d3c37cc8b19 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -133,4 +133,8 @@
    and to 0 otherwise. */
 #cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 
+/* Define to 1 to enable expensive tracking of the origin of debug location
+   coverage bugs, and to 0 otherwise. */
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+
 #endif

From fbea0fc5c77713a4d62db2512b1b51cc76ed6a25 Mon Sep 17 00:00:00 2001
From: Martin Wehking <martin.wehking@arm.com>
Date: Fri, 13 Jun 2025 13:33:46 +0100
Subject: [PATCH 0267/1322] Add Macro for CSSC Feature (#143148)

Add a new __ARM_FEATURE_CSSC macro that can be utilized during the
preprocessing stage.

__ARM_FEATURE_CSSC is defined to 1 if there is hardware support for
CSSC.

Implements the ACLE change:
https://github.com/ARM-software/acle/pull/394
---
 clang/lib/Basic/Targets/AArch64.cpp               | 6 ++++++
 clang/lib/Basic/Targets/AArch64.h                 | 1 +
 clang/test/Preprocessor/aarch64-target-features.c | 5 ++++-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index e8abdf9aafd8..124b340b62d9 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -625,6 +625,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasCRC)
     Builder.defineMacro("__ARM_FEATURE_CRC32", "1");
 
+  if (HasCSSC)
+    Builder.defineMacro("__ARM_FEATURE_CSSC", "1");
+
   if (HasRCPC3)
     Builder.defineMacro("__ARM_FEATURE_RCPC", "3");
   else if (HasRCPC)
@@ -874,6 +877,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
       .Case("rdm", HasRDM)
       .Case("lse", HasLSE)
       .Case("crc", HasCRC)
+      .Case("cssc", HasCSSC)
       .Case("sha2", HasSHA2)
       .Case("sha3", HasSHA3)
       .Cases("aes", "pmull", HasAES)
@@ -1249,6 +1253,8 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasPAuthLR = true;
       HasPAuth = true;
     }
+    if (Feature == "+cssc")
+      HasCSSC = true;
   }
 
   // Check features that are manually disabled by command line options.
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index a4c65361105e..1951e0679d2e 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -66,6 +66,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
 
   unsigned FPU = FPUMode;
   bool HasCRC = false;
+  bool HasCSSC = false;
   bool HasAES = false;
   bool HasSHA2 = false;
   bool HasSHA3 = false;
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 4cb9b6ce53b0..fd83e4b689a2 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -744,7 +744,10 @@
 // CHECK-SMEB16B16: __ARM_FEATURE_SME2 1
 // CHECK-SMEB16B16: __ARM_FEATURE_SME_B16B16 1
 // CHECK-SMEB16B16: __ARM_FEATURE_SVE_B16B16 1
-//
+
+// RUN: %clang --target=aarch64 -march=armv9-a+cssc -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-CSSC %s
+// CHECK-CSSC: __ARM_FEATURE_CSSC 1
+
 //  RUN: %clang --target=aarch64 -march=armv9-a+fp8 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-FP8 %s
 // CHECK-FP8: __ARM_FEATURE_FP8 1
 

From 9a237f35ef58c838a461d560908e380c481aadad Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Fri, 13 Jun 2025 08:39:00 -0400
Subject: [PATCH 0268/1322] [AMDGPU][AsmParser] Support true16 register suffix
 for valid register range (#143997)

---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 19 ++++++++++++++++---
 llvm/test/MC/AMDGPU/gfx11_asm_vop1.s          |  6 ++++++
 llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s  |  6 ++++++
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b43876582daa..0dc1d1377322 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1395,7 +1395,7 @@ private:
   MCRegister ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
                           unsigned &RegWidth,
                           SmallVectorImpl<AsmToken> &Tokens);
-  bool ParseRegRange(unsigned& Num, unsigned& Width);
+  bool ParseRegRange(unsigned &Num, unsigned &Width, unsigned &SubReg);
   MCRegister getRegularReg(RegisterKind RegKind, unsigned RegNum,
                            unsigned SubReg, unsigned RegWidth, SMLoc Loc);
 
@@ -2857,7 +2857,8 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
   return Reg;
 }
 
-bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) {
+bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth,
+                                    unsigned &SubReg) {
   int64_t RegLo, RegHi;
   if (!skipToken(AsmToken::LBrac, "missing register index"))
     return false;
@@ -2894,8 +2895,20 @@ bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) {
     return false;
   }
 
+  if (RegHi == RegLo) {
+    StringRef RegSuffix = getTokenStr();
+    if (RegSuffix == ".l") {
+      SubReg = AMDGPU::lo16;
+      lex();
+    } else if (RegSuffix == ".h") {
+      SubReg = AMDGPU::hi16;
+      lex();
+    }
+  }
+
   Num = static_cast<unsigned>(RegLo);
   RegWidth = 32 * ((RegHi - RegLo) + 1);
+
   return true;
 }
 
@@ -2949,7 +2962,7 @@ MCRegister AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
     RegWidth = 32;
   } else {
     // Range of registers: v[XX:YY]. ":YY" is optional.
-    if (!ParseRegRange(RegNum, RegWidth))
+    if (!ParseRegRange(RegNum, RegWidth, SubReg))
       return MCRegister();
   }
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index 93e01954bea5..f1438532d7c5 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -3808,3 +3808,9 @@ v_trunc_f64 v[5:6], src_scc
 
 v_trunc_f64 v[254:255], 0xaf123456
 // GFX11: v_trunc_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_trunc_f16 v[5].l, v[1].h
+// GFX11: v_trunc_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v[5:5].l, v[1:1].h
+// GFX11: v_trunc_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbb,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
index 55a25ad3ec81..d19220867f29 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
@@ -1231,3 +1231,9 @@ v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 
 v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_ceil_f16_e32 v[5:5].s, 0xfe0b
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_ceil_f16_e32 v[6:7].l, 0xfe0b
+// GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction

From d7ddd461162cc5585408417f64dd160929dd0691 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 13:47:45 +0100
Subject: [PATCH 0269/1322] [X86] Add start/end debug messages for the
 X86CompressEVEXPass and X86PadShortFunctionPass (#144056)

---
 llvm/lib/Target/X86/X86CompressEVEX.cpp     | 3 ++-
 llvm/lib/Target/X86/X86PadShortFunction.cpp | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index fe593aa307df..4ea30de78402 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -300,6 +300,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
 }
 
 bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "Start X86CompressEVEXPass\n";);
 #ifndef NDEBUG
   // Make sure the tables are sorted.
   static std::atomic<bool> TableChecked(false);
@@ -320,7 +321,7 @@ bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
     for (MachineInstr &MI : MBB)
       Changed |= CompressEVEXImpl(MI, ST);
   }
-
+  LLVM_DEBUG(dbgs() << "End X86CompressEVEXPass\n";);
   return Changed;
 }
 
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
index 170ca2a93250..049384eefa18 100644
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -100,6 +100,7 @@ FunctionPass *llvm::createX86PadShortFunctions() {
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// NOOP instructions before early exits.
 bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "Start X86PadShortFunctionPass\n";);
   if (skipFunction(MF.getFunction()))
     return false;
 
@@ -149,7 +150,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
       MadeChange = true;
     }
   }
-
+  LLVM_DEBUG(dbgs() << "End X86PadShortFunctionPass\n";);
   return MadeChange;
 }
 

From 4f8187c0dc6e7a818ebf3272a0c022203f901e96 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 13 Jun 2025 14:53:58 +0200
Subject: [PATCH 0270/1322] [TSan] Regenerate test checks (NFC)

---
 .../ThreadSanitizer/atomic-non-integer.ll     | 76 ++++++++++++++-----
 1 file changed, 58 insertions(+), 18 deletions(-)

diff --git a/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll b/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll
index 40c4bef3bff9..8bcabaecf0fd 100644
--- a/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll
+++ b/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll
@@ -1,51 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=tsan -S | FileCheck %s
 ; Check that atomic memory operations on floating-point types are converted to calls into ThreadSanitizer runtime.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 define float @load_float(ptr %fptr) {
+; CHECK-LABEL: define float @load_float(
+; CHECK-SAME: ptr [[FPTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__tsan_atomic32_load(ptr [[FPTR]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
+; CHECK-NEXT:    [[V:%.*]] = load atomic float, ptr [[FPTR]] unordered, align 4
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret float [[TMP3]]
+;
   %v = load atomic float, ptr %fptr unordered, align 4
   ret float %v
-  ; CHECK-LABEL: load_float
-  ; CHECK: call i32 @__tsan_atomic32_load(ptr %{{.+}}, i32 0)
-  ; CHECK: bitcast i32 {{.+}} to float
 }
 
 define double @load_double(ptr %fptr) {
+; CHECK-LABEL: define double @load_double(
+; CHECK-SAME: ptr [[FPTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @__tsan_atomic64_load(ptr [[FPTR]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[TMP2]] to double
+; CHECK-NEXT:    [[V:%.*]] = load atomic double, ptr [[FPTR]] unordered, align 8
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret double [[TMP3]]
+;
   %v = load atomic double, ptr %fptr unordered, align 8
   ret double %v
-  ; CHECK-LABEL: load_double
-  ; CHECK: call i64 @__tsan_atomic64_load(ptr %{{.+}}, i32 0)
-  ; CHECK: bitcast i64 {{.+}} to double
 }
 
 define fp128 @load_fp128(ptr %fptr) {
+; CHECK-LABEL: define fp128 @load_fp128(
+; CHECK-SAME: ptr [[FPTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i128 @__tsan_atomic128_load(ptr [[FPTR]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i128 [[TMP2]] to fp128
+; CHECK-NEXT:    [[V:%.*]] = load atomic fp128, ptr [[FPTR]] unordered, align 16
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret fp128 [[TMP3]]
+;
   %v = load atomic fp128, ptr %fptr unordered, align 16
   ret fp128 %v
-  ; CHECK-LABEL: load_fp128
-  ; CHECK: call i128 @__tsan_atomic128_load(ptr %{{.+}}, i32 0)
-  ; CHECK: bitcast i128 {{.+}} to fp128
 }
 
 define void @store_float(ptr %fptr, float %v) {
+; CHECK-LABEL: define void @store_float(
+; CHECK-SAME: ptr [[FPTR:%.*]], float [[V:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[V]] to i32
+; CHECK-NEXT:    call void @__tsan_atomic32_store(ptr [[FPTR]], i32 [[TMP2]], i32 0)
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret void
+;
   store atomic float %v, ptr %fptr unordered, align 4
   ret void
-  ; CHECK-LABEL: store_float
-  ; CHECK: bitcast float %v to i32
-  ; CHECK: call void @__tsan_atomic32_store(ptr %{{.+}}, i32 %{{.+}}, i32 0)
 }
 
 define void @store_double(ptr %fptr, double %v) {
+; CHECK-LABEL: define void @store_double(
+; CHECK-SAME: ptr [[FPTR:%.*]], double [[V:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[V]] to i64
+; CHECK-NEXT:    call void @__tsan_atomic64_store(ptr [[FPTR]], i64 [[TMP2]], i32 0)
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret void
+;
   store atomic double %v, ptr %fptr unordered, align 8
   ret void
-  ; CHECK-LABEL: store_double
-  ; CHECK: bitcast double %v to i64
-  ; CHECK: call void @__tsan_atomic64_store(ptr %{{.+}}, i64 %{{.+}}, i32 0)
 }
 
 define void @store_fp128(ptr %fptr, fp128 %v) {
+; CHECK-LABEL: define void @store_fp128(
+; CHECK-SAME: ptr [[FPTR:%.*]], fp128 [[V:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast fp128 [[V]] to i128
+; CHECK-NEXT:    call void @__tsan_atomic128_store(ptr [[FPTR]], i128 [[TMP2]], i32 0)
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret void
+;
   store atomic fp128 %v, ptr %fptr unordered, align 16
   ret void
-  ; CHECK-LABEL: store_fp128
-  ; CHECK: bitcast fp128 %v to i128
-  ; CHECK: call void @__tsan_atomic128_store(ptr %{{.+}}, i128 %{{.+}}, i32 0)
 }

From a59e4acd753007c83594a6a56654025d4202a528 Mon Sep 17 00:00:00 2001
From: Ryan Buchner <92571492+bababuck@users.noreply.github.com>
Date: Fri, 13 Jun 2025 05:57:46 -0700
Subject: [PATCH 0271/1322] [RISCV] Lower SELECT's with one constant more
 efficiently using Zicond (#143581)

See #143580 for MR with the test commit.

Performs the following transformations:
(select c, c1, t) -> (add (czero_nez t - c1, c), c1)
(select c, t, c1) -> (add (czero_eqz t - c1, c), c1)


@mgudim
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 26 +++++++++++
 .../CodeGen/RISCV/short-forward-branch-opt.ll | 46 ++++++++++++-------
 llvm/test/CodeGen/RISCV/zicond-opts.ll        | 38 +++++++--------
 3 files changed, 72 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7839af5c1691..7cfada6c0601 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9096,6 +9096,32 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal);
     }
 
+    // (select c, c1, t) -> (add (czero_nez t - c1, c), c1)
+    // (select c, t, c1) -> (add (czero_eqz t - c1, c), c1)
+    if (isa<ConstantSDNode>(TrueV) != isa<ConstantSDNode>(FalseV)) {
+      bool IsCZERO_NEZ = isa<ConstantSDNode>(TrueV);
+      SDValue ConstVal = IsCZERO_NEZ ? TrueV : FalseV;
+      SDValue RegV = IsCZERO_NEZ ? FalseV : TrueV;
+      int64_t RawConstVal = cast<ConstantSDNode>(ConstVal)->getSExtValue();
+      // Fall back to XORI if Const == -0x800
+      if (RawConstVal == -0x800) {
+        SDValue XorOp = DAG.getNode(ISD::XOR, DL, VT, RegV, ConstVal);
+        SDValue CMOV =
+            DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ,
+                        DL, VT, XorOp, CondV);
+        return DAG.getNode(ISD::XOR, DL, VT, CMOV, ConstVal);
+      }
+      // Efficient only if the constant and its negation fit into `ADDI`
+      // Prefer Add/Sub over Xor since can be compressed for small immediates
+      if (isInt<12>(RawConstVal)) {
+        SDValue SubOp = DAG.getNode(ISD::SUB, DL, VT, RegV, ConstVal);
+        SDValue CMOV =
+            DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ,
+                        DL, VT, SubOp, CondV);
+        return DAG.getNode(ISD::ADD, DL, VT, CMOV, ConstVal);
+      }
+    }
+
     // (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c))
     // Unless we have the short forward branch optimization.
     if (!Subtarget.hasConditionalMoveFusion())
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
index b7b88584f3bd..13c43a3875a0 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
@@ -173,14 +173,21 @@ define signext i32 @test6(i32 signext %x, i32 signext %z) {
 ; NOSFB-NEXT:    or a0, a0, a1
 ; NOSFB-NEXT:    ret
 ;
-; SFB-LABEL: test6:
-; SFB:       # %bb.0:
-; SFB-NEXT:    li a2, -1
-; SFB-NEXT:    beqz a1, .LBB5_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    mv a0, a2
-; SFB-NEXT:  .LBB5_2:
-; SFB-NEXT:    ret
+; NOZICOND-LABEL: test6:
+; NOZICOND:       # %bb.0:
+; NOZICOND-NEXT:    li a2, -1
+; NOZICOND-NEXT:    beqz a1, .LBB5_2
+; NOZICOND-NEXT:  # %bb.1:
+; NOZICOND-NEXT:    mv a0, a2
+; NOZICOND-NEXT:  .LBB5_2:
+; NOZICOND-NEXT:    ret
+;
+; ZICOND-LABEL: test6:
+; ZICOND:       # %bb.0:
+; ZICOND-NEXT:    addi a0, a0, 1
+; ZICOND-NEXT:    czero.nez a0, a0, a1
+; ZICOND-NEXT:    addi a0, a0, -1
+; ZICOND-NEXT:    ret
   %c = icmp eq i32 %z, 0
   %b = select i1 %c, i32 %x, i32 -1
   ret i32 %b
@@ -195,14 +202,21 @@ define signext i32 @test7(i32 signext %x, i32 signext %z) {
 ; NOSFB-NEXT:    or a0, a0, a1
 ; NOSFB-NEXT:    ret
 ;
-; SFB-LABEL: test7:
-; SFB:       # %bb.0:
-; SFB-NEXT:    li a2, -1
-; SFB-NEXT:    bnez a1, .LBB6_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    mv a0, a2
-; SFB-NEXT:  .LBB6_2:
-; SFB-NEXT:    ret
+; NOZICOND-LABEL: test7:
+; NOZICOND:       # %bb.0:
+; NOZICOND-NEXT:    li a2, -1
+; NOZICOND-NEXT:    bnez a1, .LBB6_2
+; NOZICOND-NEXT:  # %bb.1:
+; NOZICOND-NEXT:    mv a0, a2
+; NOZICOND-NEXT:  .LBB6_2:
+; NOZICOND-NEXT:    ret
+;
+; ZICOND-LABEL: test7:
+; ZICOND:       # %bb.0:
+; ZICOND-NEXT:    addi a0, a0, 1
+; ZICOND-NEXT:    czero.eqz a0, a0, a1
+; ZICOND-NEXT:    addi a0, a0, -1
+; ZICOND-NEXT:    ret
   %c = icmp eq i32 %z, 0
   %b = select i1 %c, i32 -1, i32 %x
   ret i32 %b
diff --git a/llvm/test/CodeGen/RISCV/zicond-opts.ll b/llvm/test/CodeGen/RISCV/zicond-opts.ll
index f5a25868bd12..2512ba803cf4 100644
--- a/llvm/test/CodeGen/RISCV/zicond-opts.ll
+++ b/llvm/test/CodeGen/RISCV/zicond-opts.ll
@@ -146,20 +146,18 @@ define i64 @select_imm_reg(i64 %t, i1 %cond) {
 ; RV32ZICOND-LABEL: select_imm_reg:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a2, a2, 1
-; RV32ZICOND-NEXT:    li a3, 3
-; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
-; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    addi a0, a0, -3
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV32ZICOND-NEXT:    addi a0, a0, 3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: select_imm_reg:
 ; RV64ZICOND:       # %bb.0:
 ; RV64ZICOND-NEXT:    andi a1, a1, 1
-; RV64ZICOND-NEXT:    li a2, 3
+; RV64ZICOND-NEXT:    addi a0, a0, -3
 ; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
-; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
-; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    addi a0, a0, 3
 ; RV64ZICOND-NEXT:    ret
   %4 = select i1 %cond, i64 3, i64 %t
   ret i64 %4
@@ -170,20 +168,18 @@ define i64 @select_reg_imm(i64 %t, i1 %cond) {
 ; RV32ZICOND-LABEL: select_reg_imm:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a2, a2, 1
-; RV32ZICOND-NEXT:    li a3, 3
-; RV32ZICOND-NEXT:    czero.nez a3, a3, a2
-; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV32ZICOND-NEXT:    or a0, a0, a3
+; RV32ZICOND-NEXT:    addi a0, a0, -3
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a2
+; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV32ZICOND-NEXT:    addi a0, a0, 3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: select_reg_imm:
 ; RV64ZICOND:       # %bb.0:
 ; RV64ZICOND-NEXT:    andi a1, a1, 1
-; RV64ZICOND-NEXT:    li a2, 3
-; RV64ZICOND-NEXT:    czero.nez a2, a2, a1
+; RV64ZICOND-NEXT:    addi a0, a0, -3
 ; RV64ZICOND-NEXT:    czero.eqz a0, a0, a1
-; RV64ZICOND-NEXT:    or a0, a0, a2
+; RV64ZICOND-NEXT:    addi a0, a0, 3
 ; RV64ZICOND-NEXT:    ret
   %4 = select i1 %cond, i64 %t, i64 3
   ret i64 %4
@@ -194,21 +190,19 @@ define i64 @select_imm_reg_neg_2048(i64 %t, i1 %cond) {
 ; RV32ZICOND-LABEL: select_imm_reg_neg_2048:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a2, a2, 1
-; RV32ZICOND-NEXT:    li a3, -2048
+; RV32ZICOND-NEXT:    xori a0, a0, -2048
+; RV32ZICOND-NEXT:    neg a3, a2
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
-; RV32ZICOND-NEXT:    neg a2, a2
-; RV32ZICOND-NEXT:    or a0, a3, a0
-; RV32ZICOND-NEXT:    or a1, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a1
+; RV32ZICOND-NEXT:    xori a0, a0, -2048
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: select_imm_reg_neg_2048:
 ; RV64ZICOND:       # %bb.0:
 ; RV64ZICOND-NEXT:    andi a1, a1, 1
-; RV64ZICOND-NEXT:    li a2, -2048
+; RV64ZICOND-NEXT:    xori a0, a0, -2048
 ; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
-; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
-; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    xori a0, a0, -2048
 ; RV64ZICOND-NEXT:    ret
   %4 = select i1 %cond, i64 -2048, i64 %t
   ret i64 %4

From 85a9f2e14859b472750f13fb441291e6e9c893a0 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Fri, 13 Jun 2025 09:14:48 -0400
Subject: [PATCH 0272/1322] [PowerPC] enable
 AtomicExpandImpl::expandAtomicCmpXchg for powerpc (#142395)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In PowerPC, the AtomicCmpXchgInst is lowered to
ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS. However, this node does not handle
the weak attribute of AtomicCmpXchgInst. As a result, when compiling C++
atomic_compare_exchange_weak_explicit, the generated assembly includes a
"reservation lost" loop — i.e., it branches back and retries if the
stwcx. (store-conditional) fails. This differs from GCC’s codegen, which
does not include that loop for weak compare-exchange.

Since PowerPC uses LL/SC-style atomic instructions, the patch enables
AtomicExpandImpl::expandAtomicCmpXchg for PowerPC. With this, the weak
attribute is properly respected, and the "reservation lost" loop is
removed for weak operations.

---------

Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   20 +-
 llvm/include/llvm/IR/IntrinsicsPowerPC.td     |   15 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |   73 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |    6 +
 llvm/lib/Target/PowerPC/PPCInstr64Bit.td      |    2 +
 llvm/lib/Target/PowerPC/PPCInstrInfo.td       |    8 +-
 .../CodeGen/PowerPC/PR35812-neg-cmpxchg.ll    |  142 +-
 llvm/test/CodeGen/PowerPC/all-atomics.ll      | 1672 +++++++++-------
 llvm/test/CodeGen/PowerPC/atomic-2.ll         |    4 +-
 .../PowerPC/atomic-compare-exchange-weak.ll   |   52 +-
 llvm/test/CodeGen/PowerPC/atomic-float.ll     |  108 +-
 .../PowerPC/atomicrmw-cond-sub-clamp.ll       |  526 +++--
 .../PowerPC/atomicrmw-uinc-udec-wrap.ll       |  524 +++--
 .../CodeGen/PowerPC/atomics-regression.ll     | 1740 +++++++++++------
 llvm/test/CodeGen/PowerPC/atomics.ll          |  227 +--
 llvm/test/CodeGen/PowerPC/loop-comment.ll     |    9 +-
 .../AtomicExpand/PowerPC/atomicrmw-fp.ll      |  116 +-
 17 files changed, 3133 insertions(+), 2111 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 04bc0e935310..4ed81d25e8e2 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -254,20 +254,20 @@ public:
   /// support for these atomic instructions, and also have different options
   /// w.r.t. what they should expand to.
   enum class AtomicExpansionKind {
-    None,    // Don't expand the instruction.
-    CastToInteger,    // Cast the atomic instruction to another type, e.g. from
-                      // floating-point to integer type.
+    None,          // Don't expand the instruction.
+    CastToInteger, // Cast the atomic instruction to another type, e.g. from
+                   // floating-point to integer type.
     LLSC,    // Expand the instruction into loadlinked/storeconditional; used
-             // by ARM/AArch64.
+             // by ARM/AArch64/PowerPC.
     LLOnly,  // Expand the (load) instruction into just a load-linked, which has
              // greater atomic guarantees than a normal load.
     CmpXChg, // Expand the instruction into cmpxchg; used by at least X86.
-    MaskedIntrinsic,  // Use a target-specific intrinsic for the LL/SC loop.
-    BitTestIntrinsic, // Use a target-specific intrinsic for special bit
-                      // operations; used by X86.
-    CmpArithIntrinsic,// Use a target-specific intrinsic for special compare
-                      // operations; used by X86.
-    Expand,           // Generic expansion in terms of other atomic operations.
+    MaskedIntrinsic,   // Use a target-specific intrinsic for the LL/SC loop.
+    BitTestIntrinsic,  // Use a target-specific intrinsic for special bit
+                       // operations; used by X86.
+    CmpArithIntrinsic, // Use a target-specific intrinsic for special compare
+                       // operations; used by X86.
+    Expand,            // Generic expansion in terms of other atomic operations.
 
     // Rewrite to a non-atomic form for use in a known non-preemptible
     // environment.
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 751628cee58c..84c26599b5b7 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1835,6 +1835,19 @@ let TargetPrefix = "ppc" in {
                       Intrinsic<[],[],[]>;
   def int_ppc_iospace_eieio : ClangBuiltin<"__builtin_ppc_iospace_eieio">,
                               Intrinsic<[],[],[]>;
+  def int_ppc_lbarx :
+    Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+              [IntrReadMem, IntrArgMemOnly]>;
+  def int_ppc_lharx :
+    Intrinsic<[llvm_i32_ty],[llvm_ptr_ty],
+              [IntrReadMem, IntrArgMemOnly]>;
+  def int_ppc_lwarx :
+    Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+              [IntrReadMem, IntrArgMemOnly]>;
+  def int_ppc_ldarx :
+    Intrinsic<[llvm_i64_ty],[llvm_ptr_ty],
+              [IntrReadMem, IntrArgMemOnly]>;
+
   def int_ppc_stdcx :
     ClangBuiltin<"__builtin_ppc_stdcx">,
     Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i64_ty],
@@ -1844,7 +1857,7 @@ let TargetPrefix = "ppc" in {
     Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
               [IntrWriteMem, IntrArgMemOnly]>;
   def int_ppc_sthcx :
-    Intrinsic<[llvm_i32_ty], [ llvm_ptr_ty, llvm_i32_ty ],
+    Intrinsic<[llvm_i32_ty], [ llvm_ptr_ty, llvm_i32_ty],
               [IntrWriteMem, IntrArgMemOnly, IntrNoDuplicate]>;
   def int_ppc_stbcx :
     ClangBuiltin<"__builtin_ppc_stbcx">,
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59c89985c6cf..0f8e5e57c58b 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1442,6 +1442,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
 
   setMinFunctionAlignment(Align(4));
+  setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
 
   auto CPUDirective = Subtarget.getCPUDirective();
   switch (CPUDirective) {
@@ -12690,6 +12691,76 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
   return Builder.CreateIntrinsic(Id, {});
 }
 
+Value *PPCTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
+                                         Value *Addr,
+                                         AtomicOrdering Ord) const {
+  unsigned SZ = ValueTy->getPrimitiveSizeInBits();
+
+  assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
+         "Only 8/16/32/64-bit atomic loads supported");
+  Intrinsic::ID IntID;
+  switch (SZ) {
+  default:
+    llvm_unreachable("Unexpected PrimitiveSize");
+  case 8:
+    IntID = Intrinsic::ppc_lbarx;
+    assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+    break;
+  case 16:
+    IntID = Intrinsic::ppc_lharx;
+    assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+    break;
+  case 32:
+    IntID = Intrinsic::ppc_lwarx;
+    break;
+  case 64:
+    IntID = Intrinsic::ppc_ldarx;
+    break;
+  }
+  Value *Call =
+      Builder.CreateIntrinsic(IntID, Addr, /*FMFSource=*/nullptr, "larx");
+
+  return Builder.CreateTruncOrBitCast(Call, ValueTy);
+}
+
+// Perform a store-conditional operation to Addr. Return the status of the
+// store. This should be 0 if the store succeeded, non-zero otherwise.
+Value *PPCTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
+                                               Value *Val, Value *Addr,
+                                               AtomicOrdering Ord) const {
+  Type *Ty = Val->getType();
+  unsigned SZ = Ty->getPrimitiveSizeInBits();
+
+  assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
+         "Only 8/16/32/64-bit atomic loads supported");
+  Intrinsic::ID IntID;
+  switch (SZ) {
+  default:
+    llvm_unreachable("Unexpected PrimitiveSize");
+  case 8:
+    IntID = Intrinsic::ppc_stbcx;
+    assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+    break;
+  case 16:
+    IntID = Intrinsic::ppc_sthcx;
+    assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+    break;
+  case 32:
+    IntID = Intrinsic::ppc_stwcx;
+    break;
+  case 64:
+    IntID = Intrinsic::ppc_stdcx;
+    break;
+  }
+
+  if (SZ == 8 || SZ == 16)
+    Val = Builder.CreateZExt(Val, Builder.getInt32Ty());
+
+  Value *Call = Builder.CreateIntrinsic(IntID, {Addr, Val},
+                                        /*FMFSource=*/nullptr, "stcx");
+  return Builder.CreateXor(Call, Builder.getInt32(1));
+}
+
 // The mappings for emitLeading/TrailingFence is taken from
 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
 Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
@@ -19651,7 +19722,7 @@ PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
   unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits();
   if (shouldInlineQuadwordAtomics() && Size == 128)
     return AtomicExpansionKind::MaskedIntrinsic;
-  return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI);
+  return AtomicExpansionKind::LLSC;
 }
 
 static Intrinsic::ID
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 2c55b5427297..4c88bd372b10 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,6 +927,12 @@ namespace llvm {
       return true;
     }
 
+    Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr,
+                          AtomicOrdering Ord) const override;
+
+    Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr,
+                                AtomicOrdering Ord) const override;
+
     Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
                                   AtomicOrdering Ord) const override;
     Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 659c1a9079c3..fd2084398c85 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -2023,6 +2023,8 @@ def SLBSYNC : XForm_0<31, 338, (outs), (ins), "slbsync", IIC_SprSLBSYNC, []>;
 
 } // IsISA3_0
 
+def : Pat<(int_ppc_ldarx ForceXForm:$ptr),
+          (LDARX ForceXForm:$ptr)>;
 def : Pat<(int_ppc_stdcx ForceXForm:$dst, g8rc:$A),
           (RLWINM (STDCX g8rc:$A, ForceXForm:$dst), 31, 31, 31)>;
 def : Pat<(PPCStoreCond ForceXForm:$dst, g8rc:$A, 8),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index b70290df07b1..99ef89a7fdc0 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -5143,7 +5143,6 @@ def : Pat<(int_ppc_store2r gprc:$a, ForceXForm:$ptr),
 def : Pat<(int_ppc_store4r gprc:$a, ForceXForm:$ptr),
           (STWBRX gprc:$a, ForceXForm:$ptr)>;
 
-
 // Fast 32-bit reverse bits algorithm:
 // Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
 // n = ((n >> 1) & 0x55555555) | ((n << 1) & 0xAAAAAAAA);
@@ -5324,10 +5323,14 @@ def CFENCE : PPCPostRAExpPseudo<(outs), (ins gprc:$cr), "#CFENCE", []>;
 def : Pat<(i64 (bitreverse i64:$A)),
   (OR8 (RLDICR DWBytes7654.DWord, 32, 31), DWBytes3210.DWord)>;
 
+def : Pat<(int_ppc_lwarx ForceXForm:$ptr),
+          (LWARX ForceXForm:$ptr)>;
 def : Pat<(int_ppc_stwcx ForceXForm:$dst, gprc:$A),
           (RLWINM (STWCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>;
 def : Pat<(PPCStoreCond ForceXForm:$dst, gprc:$A, 4),
           (RLWINM (STWCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>;
+def : Pat<(int_ppc_lbarx ForceXForm:$ptr),
+          (LBARX ForceXForm:$ptr)>;
 def : Pat<(int_ppc_stbcx ForceXForm:$dst, gprc:$A),
           (RLWINM (STBCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>;
 def : Pat<(PPCStoreCond ForceXForm:$dst, gprc:$A, 1),
@@ -5360,6 +5363,9 @@ def : Pat<(int_ppc_mtmsr gprc:$RS),
           (MTMSR $RS, 0)>;
 
 let Predicates = [IsISA2_07] in {
+  def : Pat<(int_ppc_lharx ForceXForm:$ptr),
+          (LHARX ForceXForm:$ptr)>;
+
   def : Pat<(int_ppc_sthcx ForceXForm:$dst, gprc:$A),
             (RLWINM (STHCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>;
   def : Pat<(PPCStoreCond ForceXForm:$dst, gprc:$A, 2),
diff --git a/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll b/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
index 8517783e3ebd..1a8dabc5ad71 100644
--- a/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
+++ b/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
@@ -15,50 +15,57 @@ define signext i32 @main() nounwind {
 ; CHECK-NEXT:    stdu 1, -48(1)
 ; CHECK-NEXT:    li 3, -32477
 ; CHECK-NEXT:    std 0, 64(1)
-; CHECK-NEXT:    li 4, 234
-; CHECK-NEXT:    addi 6, 1, 46
 ; CHECK-NEXT:    sth 3, 46(1)
-; CHECK-NEXT:    lis 3, 0
+; CHECK-NEXT:    addi 3, 1, 46
+; CHECK-NEXT:    lharx 4, 0, 3
+; CHECK-NEXT:    clrlwi  4, 4, 16
+; CHECK-NEXT:    cmplwi  4, 33059
+; CHECK-NEXT:    bne     0, .LBB0_4
+; CHECK-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    ori 3, 3, 33059
-; CHECK-NEXT:  .LBB0_1: # %L.entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 5, 0, 6
-; CHECK-NEXT:    cmpw 5, 3
-; CHECK-NEXT:    bne 0, .LBB0_3
-; CHECK-NEXT:  # %bb.2: # %L.entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 4, 0, 6
-; CHECK-NEXT:    bne 0, .LBB0_1
-; CHECK-NEXT:  .LBB0_3: # %L.entry
-; CHECK-NEXT:    cmplwi 5, 33059
+; CHECK-NEXT:    li 4, 234
+; CHECK-NEXT:    .p2align        5
+; CHECK-NEXT:  .LBB0_2:                                # %cmpxchg.trystore
+; CHECK-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sthcx. 4, 0, 3
+; CHECK-NEXT:    beq     0, .LBB0_7
+; CHECK-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    lharx 5, 0, 3
+; CHECK-NEXT:    clrlwi  5, 5, 16
+; CHECK-NEXT:    cmplwi  5, 33059
+; CHECK-NEXT:    beq     0, .LBB0_2
+; CHECK-NEXT:  .LBB0_4:                                # %cmpxchg.nostore
 ; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    bne 0, .LBB0_6
-; CHECK-NEXT:  # %bb.4: # %L.B0000
+; CHECK-NEXT:    b .LBB0_8
+; CHECK-NEXT:  .LBB0_5:                                # %L.B0000
 ; CHECK-NEXT:    lhz 3, 46(1)
-; CHECK-NEXT:    cmplwi 3, 234
-; CHECK-NEXT:    bne 0, .LBB0_7
-; CHECK-NEXT:  # %bb.5: # %L.B0001
+; CHECK-NEXT:    cmplwi  3, 234
+; CHECK-NEXT:    bne     0, .LBB0_9
+; CHECK-NEXT:  # %bb.6:                                # %L.B0001
 ; CHECK-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-NEXT:    bl puts
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    b .LBB0_9
-; CHECK-NEXT:  .LBB0_6: # %L.B0003
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_7:                                # %cmpxchg.success
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    b .LBB0_5
+; CHECK-NEXT:  .LBB0_8:                                # %L.B0003
 ; CHECK-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-NEXT:    addi 3, 3, 16
-; CHECK-NEXT:    b .LBB0_8
-; CHECK-NEXT:  .LBB0_7: # %L.B0005
+; CHECK-NEXT:    b .LBB0_10
+; CHECK-NEXT:  .LBB0_9:                                # %L.B0005
 ; CHECK-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-NEXT:    addi 3, 3, 64
-; CHECK-NEXT:  .LBB0_8: # %L.B0003
+; CHECK-NEXT:  .LBB0_10:                               # %L.B0003
 ; CHECK-NEXT:    bl puts
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:  .LBB0_9: # %L.B0003
+; CHECK-NEXT:  .LBB0_11:                               # %L.B0003
 ; CHECK-NEXT:    addi 1, 1, 48
 ; CHECK-NEXT:    ld 0, 16(1)
 ; CHECK-NEXT:    mtlr 0
@@ -69,64 +76,69 @@ define signext i32 @main() nounwind {
 ; CHECK-P7-NEXT:    mflr 0
 ; CHECK-P7-NEXT:    stdu 1, -48(1)
 ; CHECK-P7-NEXT:    li 3, -32477
-; CHECK-P7-NEXT:    std 0, 64(1)
 ; CHECK-P7-NEXT:    addi 4, 1, 46
-; CHECK-P7-NEXT:    li 6, 234
+; CHECK-P7-NEXT:    std 0, 64(1)
 ; CHECK-P7-NEXT:    sth 3, 46(1)
-; CHECK-P7-NEXT:    lis 3, 0
+; CHECK-P7-NEXT:    rldicr 3, 4, 0, 61
+; CHECK-P7-NEXT:    rlwinm 4, 4, 3, 27, 27
+; CHECK-P7-NEXT:    lwarx 5, 0, 3
+; CHECK-P7-NEXT:    srw 6, 5, 4
+; CHECK-P7-NEXT:    clrlwi  6, 6, 16
+; CHECK-P7-NEXT:    cmplwi  6, 33059
+; CHECK-P7-NEXT:    bne     0, .LBB0_4
+; CHECK-P7-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
+; CHECK-P7-NEXT:    lis 6, 0
+; CHECK-P7-NEXT:    li 7, 234
 ; CHECK-P7-NEXT:    sync
-; CHECK-P7-NEXT:    ori 5, 3, 33059
-; CHECK-P7-NEXT:    rlwinm 3, 4, 3, 27, 27
-; CHECK-P7-NEXT:    rldicr 4, 4, 0, 61
-; CHECK-P7-NEXT:    slw 7, 5, 3
-; CHECK-P7-NEXT:    li 5, 0
-; CHECK-P7-NEXT:    slw 6, 6, 3
-; CHECK-P7-NEXT:    ori 5, 5, 65535
-; CHECK-P7-NEXT:    slw 5, 5, 3
-; CHECK-P7-NEXT:    and 6, 6, 5
-; CHECK-P7-NEXT:    and 7, 7, 5
-; CHECK-P7-NEXT:  .LBB0_1: # %L.entry
-; CHECK-P7-NEXT:    #
-; CHECK-P7-NEXT:    lwarx 9, 0, 4
-; CHECK-P7-NEXT:    and 8, 9, 5
-; CHECK-P7-NEXT:    cmpw 8, 7
-; CHECK-P7-NEXT:    bne 0, .LBB0_3
-; CHECK-P7-NEXT:  # %bb.2: # %L.entry
-; CHECK-P7-NEXT:    #
-; CHECK-P7-NEXT:    andc 9, 9, 5
-; CHECK-P7-NEXT:    or 9, 9, 6
-; CHECK-P7-NEXT:    stwcx. 9, 0, 4
-; CHECK-P7-NEXT:    bne 0, .LBB0_1
-; CHECK-P7-NEXT:  .LBB0_3: # %L.entry
-; CHECK-P7-NEXT:    srw 3, 8, 3
+; CHECK-P7-NEXT:    ori 6, 6, 65535
+; CHECK-P7-NEXT:    slw 7, 7, 4
+; CHECK-P7-NEXT:    slw 6, 6, 4
+; CHECK-P7-NEXT:    not     6, 6
+; CHECK-P7-NEXT:    .p2align        4
+; CHECK-P7-NEXT:  .LBB0_2:                                # %cmpxchg.trystore
+; CHECK-P7-NEXT:                                        # =>This Inner Loop Header: Depth=1
+; CHECK-P7-NEXT:    and 5, 5, 6
+; CHECK-P7-NEXT:    or 5, 5, 7
+; CHECK-P7-NEXT:    stwcx. 5, 0, 3
+; CHECK-P7-NEXT:    beq     0, .LBB0_7
+; CHECK-P7-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; CHECK-P7-NEXT:                                        #   in Loop: Header=BB0_2 Depth=1
+; CHECK-P7-NEXT:    lwarx 5, 0, 3
+; CHECK-P7-NEXT:    srw 8, 5, 4
+; CHECK-P7-NEXT:    clrlwi  8, 8, 16
+; CHECK-P7-NEXT:    cmplwi  8, 33059
+; CHECK-P7-NEXT:    beq     0, .LBB0_2
+; CHECK-P7-NEXT:  .LBB0_4:                                # %cmpxchg.nostore
 ; CHECK-P7-NEXT:    lwsync
-; CHECK-P7-NEXT:    cmplwi 3, 33059
-; CHECK-P7-NEXT:    bne 0, .LBB0_6
-; CHECK-P7-NEXT:  # %bb.4: # %L.B0000
+; CHECK-P7-NEXT:    b .LBB0_8
+; CHECK-P7-NEXT:  .LBB0_5:                                # %L.B0000
 ; CHECK-P7-NEXT:    lhz 3, 46(1)
-; CHECK-P7-NEXT:    cmplwi 3, 234
-; CHECK-P7-NEXT:    bne 0, .LBB0_7
-; CHECK-P7-NEXT:  # %bb.5: # %L.B0001
+; CHECK-P7-NEXT:    cmplwi  3, 234
+; CHECK-P7-NEXT:    bne     0, .LBB0_9
+; CHECK-P7-NEXT:  # %bb.6:                                # %L.B0001
 ; CHECK-P7-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-P7-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-P7-NEXT:    bl puts
 ; CHECK-P7-NEXT:    nop
 ; CHECK-P7-NEXT:    li 3, 0
-; CHECK-P7-NEXT:    b .LBB0_9
-; CHECK-P7-NEXT:  .LBB0_6: # %L.B0003
+; CHECK-P7-NEXT:    b .LBB0_11
+; CHECK-P7-NEXT:  .LBB0_7:                                # %cmpxchg.success
+; CHECK-P7-NEXT:    lwsync
+; CHECK-P7-NEXT:    b .LBB0_5
+; CHECK-P7-NEXT:  .LBB0_8:                                # %L.B0003
 ; CHECK-P7-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-P7-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-P7-NEXT:    addi 3, 3, 16
-; CHECK-P7-NEXT:    b .LBB0_8
-; CHECK-P7-NEXT:  .LBB0_7: # %L.B0005
+; CHECK-P7-NEXT:    b .LBB0_10
+; CHECK-P7-NEXT:  .LBB0_9:                                # %L.B0005
 ; CHECK-P7-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-P7-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-P7-NEXT:    addi 3, 3, 64
-; CHECK-P7-NEXT:  .LBB0_8: # %L.B0003
+; CHECK-P7-NEXT:  .LBB0_10:                               # %L.B0003
 ; CHECK-P7-NEXT:    bl puts
 ; CHECK-P7-NEXT:    nop
 ; CHECK-P7-NEXT:    li 3, 1
-; CHECK-P7-NEXT:  .LBB0_9: # %L.B0003
+; CHECK-P7-NEXT:  .LBB0_11:                               # %L.B0003
 ; CHECK-P7-NEXT:    addi 1, 1, 48
 ; CHECK-P7-NEXT:    ld 0, 16(1)
 ; CHECK-P7-NEXT:    mtlr 0
diff --git a/llvm/test/CodeGen/PowerPC/all-atomics.ll b/llvm/test/CodeGen/PowerPC/all-atomics.ll
index 531e559ea730..67cee358882f 100644
--- a/llvm/test/CodeGen/PowerPC/all-atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/all-atomics.ll
@@ -4336,704 +4336,959 @@ entry:
 define dso_local void @test_compare_and_swap() local_unnamed_addr #0 {
 ; CHECK-LABEL: test_compare_and_swap:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis 3, 2, uc@toc@ha
-; CHECK-NEXT:    addis 4, 2, sc@toc@ha
-; CHECK-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; CHECK-NEXT:    lbz 5, uc@toc@l(3)
-; CHECK-NEXT:    lbz 8, sc@toc@l(4)
-; CHECK-NEXT:    addi 6, 3, uc@toc@l
-; CHECK-NEXT:    addi 0, 4, sc@toc@l
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_1: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lbarx 7, 0, 0
-; CHECK-NEXT:    cmpw 7, 5
-; CHECK-NEXT:    bne 0, .LBB3_3
-; CHECK-NEXT:  # %bb.2: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stbcx. 8, 0, 0
-; CHECK-NEXT:    bne 0, .LBB3_1
-; CHECK-NEXT:  .LBB3_3: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    stb 7, sc@toc@l(4)
-; CHECK-NEXT:    lbz 8, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_4: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lbarx 5, 0, 6
-; CHECK-NEXT:    cmpw 5, 8
-; CHECK-NEXT:    bne 0, .LBB3_6
-; CHECK-NEXT:  # %bb.5: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stbcx. 7, 0, 6
-; CHECK-NEXT:    bne 0, .LBB3_4
-; CHECK-NEXT:  .LBB3_6: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    stb 5, uc@toc@l(3)
-; CHECK-NEXT:    lbz 7, sc@toc@l(4)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 8, 7
-; CHECK-NEXT:    addis 7, 2, ss@toc@ha
-; CHECK-NEXT:    addi 12, 7, ss@toc@l
-; CHECK-NEXT:  .LBB3_7: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 9, 0, 12
-; CHECK-NEXT:    cmpw 9, 5
-; CHECK-NEXT:    bne 0, .LBB3_9
-; CHECK-NEXT:  # %bb.8: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 8, 0, 12
-; CHECK-NEXT:    bne 0, .LBB3_7
-; CHECK-NEXT:  .LBB3_9: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    sth 9, ss@toc@l(7)
-; CHECK-NEXT:    lbz 7, sc@toc@l(4)
-; CHECK-NEXT:    lbz 5, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 8, 7
-; CHECK-NEXT:    addis 7, 2, us@toc@ha
-; CHECK-NEXT:    addi 11, 7, us@toc@l
-; CHECK-NEXT:  .LBB3_10: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 9, 0, 11
-; CHECK-NEXT:    cmpw 9, 5
-; CHECK-NEXT:    bne 0, .LBB3_12
-; CHECK-NEXT:  # %bb.11: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 8, 0, 11
-; CHECK-NEXT:    bne 0, .LBB3_10
-; CHECK-NEXT:  .LBB3_12: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    sth 9, us@toc@l(7)
-; CHECK-NEXT:    lbz 7, sc@toc@l(4)
-; CHECK-NEXT:    lbz 5, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 8, 7
-; CHECK-NEXT:    addis 7, 2, si@toc@ha
-; CHECK-NEXT:    addi 10, 7, si@toc@l
-; CHECK-NEXT:  .LBB3_13: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lwarx 9, 0, 10
-; CHECK-NEXT:    cmpw 9, 5
-; CHECK-NEXT:    bne 0, .LBB3_15
-; CHECK-NEXT:  # %bb.14: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. 8, 0, 10
-; CHECK-NEXT:    bne 0, .LBB3_13
-; CHECK-NEXT:  .LBB3_15: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    stw 9, si@toc@l(7)
-; CHECK-NEXT:    lbz 5, sc@toc@l(4)
-; CHECK-NEXT:    lbz 7, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 8, 5
-; CHECK-NEXT:    addis 5, 2, ui@toc@ha
-; CHECK-NEXT:    addi 9, 5, ui@toc@l
-; CHECK-NEXT:  .LBB3_16: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lwarx 30, 0, 9
-; CHECK-NEXT:    cmpw 30, 7
-; CHECK-NEXT:    bne 0, .LBB3_18
-; CHECK-NEXT:  # %bb.17: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. 8, 0, 9
-; CHECK-NEXT:    bne 0, .LBB3_16
-; CHECK-NEXT:  .LBB3_18: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    stw 30, ui@toc@l(5)
-; CHECK-NEXT:    addis 30, 2, sll@toc@ha
-; CHECK-NEXT:    lbz 8, sc@toc@l(4)
-; CHECK-NEXT:    lbz 7, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 29, 8
-; CHECK-NEXT:    addi 8, 30, sll@toc@l
-; CHECK-NEXT:  .LBB3_19: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 28, 0, 8
-; CHECK-NEXT:    cmpd 28, 7
-; CHECK-NEXT:    bne 0, .LBB3_21
-; CHECK-NEXT:  # %bb.20: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 29, 0, 8
-; CHECK-NEXT:    bne 0, .LBB3_19
-; CHECK-NEXT:  .LBB3_21: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    addis 29, 2, ull@toc@ha
-; CHECK-NEXT:    std 28, sll@toc@l(30)
-; CHECK-NEXT:    lbz 7, sc@toc@l(4)
-; CHECK-NEXT:    lbz 30, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 28, 7
-; CHECK-NEXT:    addi 7, 29, ull@toc@l
-; CHECK-NEXT:  .LBB3_22: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 27, 0, 7
-; CHECK-NEXT:    cmpd 27, 30
-; CHECK-NEXT:    bne 0, .LBB3_24
-; CHECK-NEXT:  # %bb.23: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 28, 0, 7
-; CHECK-NEXT:    bne 0, .LBB3_22
-; CHECK-NEXT:  .LBB3_24: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    std 27, ull@toc@l(29)
-; CHECK-NEXT:    lbz 30, uc@toc@l(3)
-; CHECK-NEXT:    lbz 29, sc@toc@l(4)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_25: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lbarx 28, 0, 0
-; CHECK-NEXT:    cmpw 28, 30
-; CHECK-NEXT:    bne 0, .LBB3_27
-; CHECK-NEXT:  # %bb.26: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stbcx. 29, 0, 0
-; CHECK-NEXT:    bne 0, .LBB3_25
-; CHECK-NEXT:  .LBB3_27: # %entry
-; CHECK-NEXT:    xor 0, 28, 30
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 30, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 0, 0
-; CHECK-NEXT:    srwi 0, 0, 5
-; CHECK-NEXT:    stw 0, ui@toc@l(5)
-; CHECK-NEXT:    lbz 0, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_28: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lbarx 29, 0, 6
-; CHECK-NEXT:    cmpw 29, 0
-; CHECK-NEXT:    bne 0, .LBB3_30
-; CHECK-NEXT:  # %bb.29: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stbcx. 30, 0, 6
-; CHECK-NEXT:    bne 0, .LBB3_28
-; CHECK-NEXT:  .LBB3_30: # %entry
-; CHECK-NEXT:    xor 6, 29, 0
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 0, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 0, 0
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_31: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 30, 0, 12
-; CHECK-NEXT:    cmpw 30, 6
-; CHECK-NEXT:    bne 0, .LBB3_33
-; CHECK-NEXT:  # %bb.32: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 0, 0, 12
-; CHECK-NEXT:    bne 0, .LBB3_31
-; CHECK-NEXT:  .LBB3_33: # %entry
-; CHECK-NEXT:    xor 6, 30, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 12, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 12, 12
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_34: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 0, 0, 11
-; CHECK-NEXT:    cmpw 0, 6
-; CHECK-NEXT:    bne 0, .LBB3_36
-; CHECK-NEXT:  # %bb.35: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 12, 0, 11
-; CHECK-NEXT:    bne 0, .LBB3_34
-; CHECK-NEXT:  .LBB3_36: # %entry
-; CHECK-NEXT:    xor 6, 0, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 11, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 11, 11
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_37: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lwarx 12, 0, 10
-; CHECK-NEXT:    cmpw 12, 6
-; CHECK-NEXT:    bne 0, .LBB3_39
-; CHECK-NEXT:  # %bb.38: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. 11, 0, 10
-; CHECK-NEXT:    bne 0, .LBB3_37
-; CHECK-NEXT:  .LBB3_39: # %entry
-; CHECK-NEXT:    xor 6, 12, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 10, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 10, 10
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_40: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lwarx 11, 0, 9
-; CHECK-NEXT:    cmpw 11, 6
-; CHECK-NEXT:    bne 0, .LBB3_42
-; CHECK-NEXT:  # %bb.41: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. 10, 0, 9
-; CHECK-NEXT:    bne 0, .LBB3_40
-; CHECK-NEXT:  .LBB3_42: # %entry
-; CHECK-NEXT:    xor 6, 11, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 9, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 9, 9
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_43: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 10, 0, 8
-; CHECK-NEXT:    cmpd 10, 6
-; CHECK-NEXT:    bne 0, .LBB3_45
-; CHECK-NEXT:  # %bb.44: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 9, 0, 8
-; CHECK-NEXT:    bne 0, .LBB3_43
-; CHECK-NEXT:  .LBB3_45: # %entry
-; CHECK-NEXT:    xor 6, 10, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 4, sc@toc@l(4)
-; CHECK-NEXT:    lbz 3, uc@toc@l(3)
-; CHECK-NEXT:    cntlzd 6, 6
-; CHECK-NEXT:    extsb 4, 4
-; CHECK-NEXT:    rldicl 6, 6, 58, 63
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_46: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 6, 0, 7
-; CHECK-NEXT:    cmpd 6, 3
-; CHECK-NEXT:    bne 0, .LBB3_48
-; CHECK-NEXT:  # %bb.47: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 4, 0, 7
-; CHECK-NEXT:    bne 0, .LBB3_46
-; CHECK-NEXT:  .LBB3_48: # %entry
-; CHECK-NEXT:    xor 3, 6, 3
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; CHECK-NEXT:    cntlzd 3, 3
-; CHECK-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; CHECK-NEXT:    rldicl 3, 3, 58, 63
-; CHECK-NEXT:    stw 3, ui@toc@l(5)
-; CHECK-NEXT:    blr
+; CHECK-NEXT:   addis 4, 2, sc@toc@ha
+; CHECK-NEXT:   addis 3, 2, uc@toc@ha
+; CHECK-NEXT:   std 27, -40(1)                          # 8-byte Folded Spill
+; CHECK-NEXT:   std 28, -32(1)                          # 8-byte Folded Spill
+; CHECK-NEXT:   std 29, -24(1)                          # 8-byte Folded Spill
+; CHECK-NEXT:   std 30, -16(1)                          # 8-byte Folded Spill
+; CHECK-NEXT:   addi 6, 4, sc@toc@l
+; CHECK-NEXT:   lbz 7, uc@toc@l(3)
+; CHECK-NEXT:   lbz 8, sc@toc@l(4)
+; CHECK-NEXT:   lbarx 5, 0, 6
+; CHECK-NEXT:   clrlwi  9, 5, 24
+; CHECK-NEXT:   cmplw   9, 7
+; CHECK-NEXT:   bne     0, .LBB3_4
+; CHECK-NEXT: # %bb.1:                                # %cmpxchg.fencedstore276
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_2:                                # %cmpxchg.trystore275
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stbcx. 8, 0, 6
+; CHECK-NEXT:   beq     0, .LBB3_4
+; CHECK-NEXT: # %bb.3:                                # %cmpxchg.releasedload274
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:   lbarx 5, 0, 6
+; CHECK-NEXT:   clrlwi  9, 5, 24
+; CHECK-NEXT:   cmplw   9, 7
+; CHECK-NEXT:   beq     0, .LBB3_2
+; CHECK-NEXT: .LBB3_4:                                # %cmpxchg.nostore272
+; CHECK-NEXT:   addi 7, 3, uc@toc@l
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   stb 5, sc@toc@l(4)
+; CHECK-NEXT:   lbz 9, uc@toc@l(3)
+; CHECK-NEXT:   lbarx 8, 0, 7
+; CHECK-NEXT:   clrlwi  10, 8, 24
+; CHECK-NEXT:   cmplw   10, 9
+; CHECK-NEXT:   bne     0, .LBB3_8
+; CHECK-NEXT: # %bb.5:                                # %cmpxchg.fencedstore257
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  5, 5, 24
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_6:                                # %cmpxchg.trystore256
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stbcx. 5, 0, 7
+; CHECK-NEXT:   beq     0, .LBB3_8
+; CHECK-NEXT: # %bb.7:                                # %cmpxchg.releasedload255
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_6 Depth=1
+; CHECK-NEXT:   lbarx 8, 0, 7
+; CHECK-NEXT:   clrlwi  10, 8, 24
+; CHECK-NEXT:   cmplw   10, 9
+; CHECK-NEXT:   beq     0, .LBB3_6
+; CHECK-NEXT: .LBB3_8:                                # %cmpxchg.nostore253
+; CHECK-NEXT:   addis 5, 2, ss@toc@ha
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   stb 8, uc@toc@l(3)
+; CHECK-NEXT:   clrlwi  10, 8, 24
+; CHECK-NEXT:   lbz 11, sc@toc@l(4)
+; CHECK-NEXT:   addi 8, 5, ss@toc@l
+; CHECK-NEXT:   lharx 9, 0, 8
+; CHECK-NEXT:   clrlwi  12, 9, 16
+; CHECK-NEXT:   cmplw   12, 10
+; CHECK-NEXT:   bne     0, .LBB3_12
+; CHECK-NEXT: # %bb.9:                                # %cmpxchg.fencedstore238
+; CHECK-NEXT:   extsb 11, 11
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  11, 11, 16
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_10:                               # %cmpxchg.trystore237
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   sthcx. 11, 0, 8
+; CHECK-NEXT:   beq     0, .LBB3_12
+; CHECK-NEXT: # %bb.11:                               # %cmpxchg.releasedload236
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_10 Depth=1
+; CHECK-NEXT:   lharx 9, 0, 8
+; CHECK-NEXT:   clrlwi  12, 9, 16
+; CHECK-NEXT:   cmplw   12, 10
+; CHECK-NEXT:   beq     0, .LBB3_10
+; CHECK-NEXT: .LBB3_12:                               # %cmpxchg.nostore234
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   sth 9, ss@toc@l(5)
+; CHECK-NEXT:   addis 5, 2, us@toc@ha
+; CHECK-NEXT:   lbz 11, uc@toc@l(3)
+; CHECK-NEXT:   lbz 12, sc@toc@l(4)
+; CHECK-NEXT:   addi 9, 5, us@toc@l
+; CHECK-NEXT:   lharx 10, 0, 9
+; CHECK-NEXT:   clrlwi  0, 10, 16
+; CHECK-NEXT:   cmplw   0, 11
+; CHECK-NEXT:   bne     0, .LBB3_16
+; CHECK-NEXT: # %bb.13:                               # %cmpxchg.fencedstore219
+; CHECK-NEXT:   extsb 12, 12
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  12, 12, 16
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_14:                               # %cmpxchg.trystore218
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   sthcx. 12, 0, 9
+; CHECK-NEXT:   beq     0, .LBB3_16
+; CHECK-NEXT: # %bb.15:                               # %cmpxchg.releasedload217
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_14 Depth=1
+; CHECK-NEXT:   lharx 10, 0, 9
+; CHECK-NEXT:   clrlwi  0, 10, 16
+; CHECK-NEXT:   cmplw   0, 11
+; CHECK-NEXT:   beq     0, .LBB3_14
+; CHECK-NEXT: .LBB3_16:                               # %cmpxchg.nostore215
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   sth 10, us@toc@l(5)
+; CHECK-NEXT:   addis 5, 2, si@toc@ha
+; CHECK-NEXT:   lbz 12, uc@toc@l(3)
+; CHECK-NEXT:   lbz 0, sc@toc@l(4)
+; CHECK-NEXT:   addi 10, 5, si@toc@l
+; CHECK-NEXT:   lwarx 11, 0, 10
+; CHECK-NEXT:   cmplw   11, 12
+; CHECK-NEXT:   bne     0, .LBB3_20
+; CHECK-NEXT: # %bb.17:                               # %cmpxchg.fencedstore200
+; CHECK-NEXT:   extsb 0, 0
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_18:                               # %cmpxchg.trystore199
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stwcx. 0, 0, 10
+; CHECK-NEXT:   beq     0, .LBB3_20
+; CHECK-NEXT: # %bb.19:                               # %cmpxchg.releasedload198
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_18 Depth=1
+; CHECK-NEXT:   lwarx 11, 0, 10
+; CHECK-NEXT:   cmplw   11, 12
+; CHECK-NEXT:   beq     0, .LBB3_18
+; CHECK-NEXT: .LBB3_20:                               # %cmpxchg.nostore196
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   stw 11, si@toc@l(5)
+; CHECK-NEXT:   addis 5, 2, ui@toc@ha
+; CHECK-NEXT:   lbz 0, uc@toc@l(3)
+; CHECK-NEXT:   lbz 30, sc@toc@l(4)
+; CHECK-NEXT:   addi 11, 5, ui@toc@l
+; CHECK-NEXT:   lwarx 12, 0, 11
+; CHECK-NEXT:   cmplw   12, 0
+; CHECK-NEXT:   bne     0, .LBB3_24
+; CHECK-NEXT: # %bb.21:                               # %cmpxchg.fencedstore181
+; CHECK-NEXT:   extsb 30, 30
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_22:                               # %cmpxchg.trystore180
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stwcx. 30, 0, 11
+; CHECK-NEXT:   beq     0, .LBB3_24
+; CHECK-NEXT: # %bb.23:                               # %cmpxchg.releasedload179
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_22 Depth=1
+; CHECK-NEXT:   lwarx 12, 0, 11
+; CHECK-NEXT:   cmplw   12, 0
+; CHECK-NEXT:   beq     0, .LBB3_22
+; CHECK-NEXT: .LBB3_24:                               # %cmpxchg.nostore177
+; CHECK-NEXT:   addis 30, 2, sll@toc@ha
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   stw 12, ui@toc@l(5)
+; CHECK-NEXT:   lbz 29, uc@toc@l(3)
+; CHECK-NEXT:   lbz 28, sc@toc@l(4)
+; CHECK-NEXT:   addi 12, 30, sll@toc@l
+; CHECK-NEXT:   ldarx 0, 0, 12
+; CHECK-NEXT:   cmpld   0, 29
+; CHECK-NEXT:   bne     0, .LBB3_28
+; CHECK-NEXT: # %bb.25:                               # %cmpxchg.fencedstore162
+; CHECK-NEXT:   extsb 28, 28
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_26:                               # %cmpxchg.trystore161
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stdcx. 28, 0, 12
+; CHECK-NEXT:   beq     0, .LBB3_28
+; CHECK-NEXT: # %bb.27:                               # %cmpxchg.releasedload160
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_26 Depth=1
+; CHECK-NEXT:   ldarx 0, 0, 12
+; CHECK-NEXT:   cmpld   0, 29
+; CHECK-NEXT:   beq     0, .LBB3_26
+; CHECK-NEXT: .LBB3_28:                               # %cmpxchg.nostore158
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   std 0, sll@toc@l(30)
+; CHECK-NEXT:   addis 30, 2, ull@toc@ha
+; CHECK-NEXT:   lbz 28, uc@toc@l(3)
+; CHECK-NEXT:   lbz 27, sc@toc@l(4)
+; CHECK-NEXT:   addi 0, 30, ull@toc@l
+; CHECK-NEXT:   ldarx 29, 0, 0
+; CHECK-NEXT:   cmpld   29, 28
+; CHECK-NEXT:   bne     0, .LBB3_32
+; CHECK-NEXT: # %bb.29:                               # %cmpxchg.fencedstore143
+; CHECK-NEXT:   extsb 27, 27
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_30:                               # %cmpxchg.trystore142
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stdcx. 27, 0, 0
+; CHECK-NEXT:   beq     0, .LBB3_32
+; CHECK-NEXT: # %bb.31:                               # %cmpxchg.releasedload141
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_30 Depth=1
+; CHECK-NEXT:   ldarx 29, 0, 0
+; CHECK-NEXT:   cmpld   29, 28
+; CHECK-NEXT:   beq     0, .LBB3_30
+; CHECK-NEXT: .LBB3_32:                               # %cmpxchg.nostore139
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   std 29, ull@toc@l(30)
+; CHECK-NEXT:   lbz 30, uc@toc@l(3)
+; CHECK-NEXT:   lbz 29, sc@toc@l(4)
+; CHECK-NEXT:   lbarx 28, 0, 6
+; CHECK-NEXT:   clrlwi  28, 28, 24
+; CHECK-NEXT:   cmplw   28, 30
+; CHECK-NEXT:   bne     0, .LBB3_36
+; CHECK-NEXT: # %bb.33:                               # %cmpxchg.fencedstore124
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_34:                               # %cmpxchg.trystore123
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stbcx. 29, 0, 6
+; CHECK-NEXT:   beq     0, .LBB3_37
+; CHECK-NEXT: # %bb.35:                               # %cmpxchg.releasedload122
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_34 Depth=1
+; CHECK-NEXT:   lbarx 28, 0, 6
+; CHECK-NEXT:   clrlwi  28, 28, 24
+; CHECK-NEXT:   cmplw   28, 30
+; CHECK-NEXT:   beq     0, .LBB3_34
+; CHECK-NEXT: .LBB3_36:                               # %cmpxchg.nostore120
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_38
+; CHECK-NEXT: .LBB3_37:                               # %cmpxchg.success121
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_38:                               # %cmpxchg.end118
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 30, 1
+; CHECK-NEXT:   isel 6, 30, 6, 20
+; CHECK-NEXT:   lbz 30, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lbarx 29, 0, 7
+; CHECK-NEXT:   clrlwi  29, 29, 24
+; CHECK-NEXT:   cmplw   29, 6
+; CHECK-NEXT:   bne     0, .LBB3_42
+; CHECK-NEXT: # %bb.39:                               # %cmpxchg.fencedstore105
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_40:                               # %cmpxchg.trystore104
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stbcx. 30, 0, 7
+; CHECK-NEXT:   beq     0, .LBB3_43
+; CHECK-NEXT: # %bb.41:                               # %cmpxchg.releasedload103
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_40 Depth=1
+; CHECK-NEXT:   lbarx 29, 0, 7
+; CHECK-NEXT:   clrlwi  29, 29, 24
+; CHECK-NEXT:   cmplw   29, 6
+; CHECK-NEXT:   beq     0, .LBB3_40
+; CHECK-NEXT: .LBB3_42:                               # %cmpxchg.nostore101
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_44
+; CHECK-NEXT: .LBB3_43:                               # %cmpxchg.success102
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_44:                               # %cmpxchg.end99
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lharx 30, 0, 8
+; CHECK-NEXT:   clrlwi  30, 30, 16
+; CHECK-NEXT:   cmplw   30, 6
+; CHECK-NEXT:   bne     0, .LBB3_48
+; CHECK-NEXT: # %bb.45:                               # %cmpxchg.fencedstore86
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  7, 7, 16
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_46:                               # %cmpxchg.trystore85
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   sthcx. 7, 0, 8
+; CHECK-NEXT:   beq     0, .LBB3_49
+; CHECK-NEXT: # %bb.47:                               # %cmpxchg.releasedload84
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_46 Depth=1
+; CHECK-NEXT:   lharx 30, 0, 8
+; CHECK-NEXT:   clrlwi  30, 30, 16
+; CHECK-NEXT:   cmplw   30, 6
+; CHECK-NEXT:   beq     0, .LBB3_46
+; CHECK-NEXT: .LBB3_48:                               # %cmpxchg.nostore82
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_50
+; CHECK-NEXT: .LBB3_49:                               # %cmpxchg.success83
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_50:                               # %cmpxchg.end80
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lharx 8, 0, 9
+; CHECK-NEXT:   clrlwi  8, 8, 16
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   bne     0, .LBB3_54
+; CHECK-NEXT: # %bb.51:                               # %cmpxchg.fencedstore67
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  7, 7, 16
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_52:                               # %cmpxchg.trystore66
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   sthcx. 7, 0, 9
+; CHECK-NEXT:   beq     0, .LBB3_55
+; CHECK-NEXT: # %bb.53:                               # %cmpxchg.releasedload65
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_52 Depth=1
+; CHECK-NEXT:   lharx 8, 0, 9
+; CHECK-NEXT:   clrlwi  8, 8, 16
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   beq     0, .LBB3_52
+; CHECK-NEXT: .LBB3_54:                               # %cmpxchg.nostore63
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_56
+; CHECK-NEXT: .LBB3_55:                               # %cmpxchg.success64
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_56:                               # %cmpxchg.end61
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lwarx 8, 0, 10
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   bne     0, .LBB3_60
+; CHECK-NEXT: # %bb.57:                               # %cmpxchg.fencedstore48
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_58:                               # %cmpxchg.trystore47
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stwcx. 7, 0, 10
+; CHECK-NEXT:   beq     0, .LBB3_61
+; CHECK-NEXT: # %bb.59:                               # %cmpxchg.releasedload46
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_58 Depth=1
+; CHECK-NEXT:   lwarx 8, 0, 10
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   beq     0, .LBB3_58
+; CHECK-NEXT: .LBB3_60:                               # %cmpxchg.nostore44
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_62
+; CHECK-NEXT: .LBB3_61:                               # %cmpxchg.success45
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_62:                               # %cmpxchg.end42
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lwarx 8, 0, 11
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   bne     0, .LBB3_66
+; CHECK-NEXT: # %bb.63:                               # %cmpxchg.fencedstore29
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_64:                               # %cmpxchg.trystore28
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stwcx. 7, 0, 11
+; CHECK-NEXT:   beq     0, .LBB3_67
+; CHECK-NEXT: # %bb.65:                               # %cmpxchg.releasedload27
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_64 Depth=1
+; CHECK-NEXT:   lwarx 8, 0, 11
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   beq     0, .LBB3_64
+; CHECK-NEXT: .LBB3_66:                               # %cmpxchg.nostore25
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_68
+; CHECK-NEXT: .LBB3_67:                               # %cmpxchg.success26
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_68:                               # %cmpxchg.end23
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   ldarx 8, 0, 12
+; CHECK-NEXT:   cmpld   8, 6
+; CHECK-NEXT:   bne     0, .LBB3_72
+; CHECK-NEXT: # %bb.69:                               # %cmpxchg.fencedstore10
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_70:                               # %cmpxchg.trystore9
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stdcx. 7, 0, 12
+; CHECK-NEXT:   beq     0, .LBB3_73
+; CHECK-NEXT: # %bb.71:                               # %cmpxchg.releasedload8
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_70 Depth=1
+; CHECK-NEXT:   ldarx 8, 0, 12
+; CHECK-NEXT:   cmpld   8, 6
+; CHECK-NEXT:   beq     0, .LBB3_70
+; CHECK-NEXT: .LBB3_72:                               # %cmpxchg.nostore6
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_74
+; CHECK-NEXT: .LBB3_73:                               # %cmpxchg.success7
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_74:                               # %cmpxchg.end4
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   lbz 3, uc@toc@l(3)
+; CHECK-NEXT:   lbz 4, sc@toc@l(4)
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   ldarx 6, 0, 0
+; CHECK-NEXT:   cmpld   6, 3
+; CHECK-NEXT:   bne     0, .LBB3_78
+; CHECK-NEXT: # %bb.75:                               # %cmpxchg.fencedstore
+; CHECK-NEXT:   extsb 4, 4
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_76:                               # %cmpxchg.trystore
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stdcx. 4, 0, 0
+; CHECK-NEXT:   beq     0, .LBB3_79
+; CHECK-NEXT: # %bb.77:                               # %cmpxchg.releasedload
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_76 Depth=1
+; CHECK-NEXT:   ldarx 6, 0, 0
+; CHECK-NEXT:   cmpld   6, 3
+; CHECK-NEXT:   beq     0, .LBB3_76
+; CHECK-NEXT: .LBB3_78:                               # %cmpxchg.nostore
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_80
+; CHECK-NEXT: .LBB3_79:                               # %cmpxchg.success
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_80:                               # %cmpxchg.end
+; CHECK-NEXT:   li 3, 0
+; CHECK-NEXT:   li 4, 1
+; CHECK-NEXT:   ld 30, -16(1)                           # 8-byte Folded Reload
+; CHECK-NEXT:   ld 29, -24(1)                           # 8-byte Folded Reload
+; CHECK-NEXT:   ld 28, -32(1)                           # 8-byte Folded Reload
+; CHECK-NEXT:   ld 27, -40(1)                           # 8-byte Folded Reload
+; CHECK-NEXT:   isel 3, 4, 3, 20
+; CHECK-NEXT:   stw 3, ui@toc@l(5)
+; CHECK-NEXT:   blr
 ;
 ; AIX32-LABEL: test_compare_and_swap:
 ; AIX32:       # %bb.0: # %entry
 ; AIX32-NEXT:    mflr 0
-; AIX32-NEXT:    stwu 1, -128(1)
-; AIX32-NEXT:    stw 0, 136(1)
-; AIX32-NEXT:    stw 28, 112(1) # 4-byte Folded Spill
-; AIX32-NEXT:    lwz 28, L..C0(2) # @sc
-; AIX32-NEXT:    stw 29, 116(1) # 4-byte Folded Spill
-; AIX32-NEXT:    lwz 29, L..C1(2) # @uc
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    rlwinm 5, 28, 3, 27, 28
-; AIX32-NEXT:    stw 21, 84(1) # 4-byte Folded Spill
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    stw 17, 68(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 18, 72(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 19, 76(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 20, 80(1) # 4-byte Folded Spill
-; AIX32-NEXT:    xori 21, 5, 24
-; AIX32-NEXT:    stw 22, 88(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 23, 92(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 24, 96(1) # 4-byte Folded Spill
-; AIX32-NEXT:    slw 5, 3, 21
-; AIX32-NEXT:    li 3, 255
-; AIX32-NEXT:    slw 4, 4, 21
-; AIX32-NEXT:    stw 25, 100(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 26, 104(1) # 4-byte Folded Spill
-; AIX32-NEXT:    slw 3, 3, 21
-; AIX32-NEXT:    stw 27, 108(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 30, 120(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 31, 124(1) # 4-byte Folded Spill
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    rlwinm 18, 28, 0, 0, 29
-; AIX32-NEXT:    and 4, 4, 3
-; AIX32-NEXT:    and 5, 5, 3
-; AIX32-NEXT:  L..BB3_1: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 7, 0, 18
-; AIX32-NEXT:    and 6, 7, 3
-; AIX32-NEXT:    cmpw 6, 5
-; AIX32-NEXT:    bne 0, L..BB3_3
-; AIX32-NEXT:  # %bb.2: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 7, 7, 3
-; AIX32-NEXT:    or 7, 7, 4
-; AIX32-NEXT:    stwcx. 7, 0, 18
-; AIX32-NEXT:    bne 0, L..BB3_1
-; AIX32-NEXT:  L..BB3_3: # %entry
-; AIX32-NEXT:    rlwinm 5, 29, 3, 27, 28
-; AIX32-NEXT:    srw 3, 6, 21
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    rlwinm 20, 29, 0, 0, 29
-; AIX32-NEXT:    xori 25, 5, 24
-; AIX32-NEXT:    slw 5, 3, 25
-; AIX32-NEXT:    stb 3, 0(28)
-; AIX32-NEXT:    li 3, 255
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 6, 4, 25
-; AIX32-NEXT:    slw 3, 3, 25
-; AIX32-NEXT:    and 4, 5, 3
-; AIX32-NEXT:    and 5, 6, 3
-; AIX32-NEXT:  L..BB3_4: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 7, 0, 20
-; AIX32-NEXT:    and 6, 7, 3
-; AIX32-NEXT:    cmpw 6, 5
-; AIX32-NEXT:    bne 0, L..BB3_6
-; AIX32-NEXT:  # %bb.5: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 7, 7, 3
-; AIX32-NEXT:    or 7, 7, 4
-; AIX32-NEXT:    stwcx. 7, 0, 20
-; AIX32-NEXT:    bne 0, L..BB3_4
-; AIX32-NEXT:  L..BB3_6: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    srw 4, 6, 25
-; AIX32-NEXT:    lbz 3, 0(28)
-; AIX32-NEXT:    extsb 5, 3
-; AIX32-NEXT:    lwz 3, L..C2(2) # @ss
-; AIX32-NEXT:    stb 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    rlwinm 6, 3, 3, 27, 27
-; AIX32-NEXT:    rlwinm 22, 3, 0, 0, 29
-; AIX32-NEXT:    xori 26, 6, 16
-; AIX32-NEXT:    slw 6, 4, 26
-; AIX32-NEXT:    li 4, 0
-; AIX32-NEXT:    slw 5, 5, 26
-; AIX32-NEXT:    ori 4, 4, 65535
-; AIX32-NEXT:    slw 4, 4, 26
-; AIX32-NEXT:    and 5, 5, 4
-; AIX32-NEXT:    and 6, 6, 4
-; AIX32-NEXT:  L..BB3_7: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 8, 0, 22
-; AIX32-NEXT:    and 7, 8, 4
-; AIX32-NEXT:    cmpw 7, 6
-; AIX32-NEXT:    bne 0, L..BB3_9
-; AIX32-NEXT:  # %bb.8: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 8, 8, 4
-; AIX32-NEXT:    or 8, 8, 5
-; AIX32-NEXT:    stwcx. 8, 0, 22
-; AIX32-NEXT:    bne 0, L..BB3_7
-; AIX32-NEXT:  L..BB3_9: # %entry
-; AIX32-NEXT:    srw 4, 7, 26
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    sth 4, 0(3)
-; AIX32-NEXT:    lbz 3, 0(28)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 5, 3
-; AIX32-NEXT:    lwz 3, L..C3(2) # @us
-; AIX32-NEXT:    rlwinm 6, 3, 3, 27, 27
-; AIX32-NEXT:    rlwinm 19, 3, 0, 0, 29
-; AIX32-NEXT:    xori 24, 6, 16
-; AIX32-NEXT:    slw 6, 4, 24
-; AIX32-NEXT:    li 4, 0
-; AIX32-NEXT:    slw 5, 5, 24
-; AIX32-NEXT:    ori 4, 4, 65535
-; AIX32-NEXT:    slw 4, 4, 24
-; AIX32-NEXT:    and 5, 5, 4
-; AIX32-NEXT:    and 6, 6, 4
-; AIX32-NEXT:  L..BB3_10: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 8, 0, 19
-; AIX32-NEXT:    and 7, 8, 4
-; AIX32-NEXT:    cmpw 7, 6
-; AIX32-NEXT:    bne 0, L..BB3_12
-; AIX32-NEXT:  # %bb.11: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 8, 8, 4
-; AIX32-NEXT:    or 8, 8, 5
-; AIX32-NEXT:    stwcx. 8, 0, 19
-; AIX32-NEXT:    bne 0, L..BB3_10
-; AIX32-NEXT:  L..BB3_12: # %entry
-; AIX32-NEXT:    srw 4, 7, 24
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    lwz 17, L..C4(2) # @si
-; AIX32-NEXT:    sth 4, 0(3)
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 4, 4
-; AIX32-NEXT:  L..BB3_13: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 5, 0, 17
-; AIX32-NEXT:    cmpw 5, 3
-; AIX32-NEXT:    bne 0, L..BB3_15
-; AIX32-NEXT:  # %bb.14: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    stwcx. 4, 0, 17
-; AIX32-NEXT:    bne 0, L..BB3_13
-; AIX32-NEXT:  L..BB3_15: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    stw 5, 0(17)
-; AIX32-NEXT:    lwz 27, L..C5(2) # @ui
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 4, 4
-; AIX32-NEXT:  L..BB3_16: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 5, 0, 27
-; AIX32-NEXT:    cmpw 5, 3
-; AIX32-NEXT:    bne 0, L..BB3_18
-; AIX32-NEXT:  # %bb.17: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    stwcx. 4, 0, 27
-; AIX32-NEXT:    bne 0, L..BB3_16
-; AIX32-NEXT:  L..BB3_18: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    lwz 31, L..C6(2) # @sll
-; AIX32-NEXT:    stw 5, 0(27)
-; AIX32-NEXT:    lbz 3, 0(28)
-; AIX32-NEXT:    li 23, 0
-; AIX32-NEXT:    addi 4, 1, 56
-; AIX32-NEXT:    li 7, 5
-; AIX32-NEXT:    li 8, 5
-; AIX32-NEXT:    stw 23, 56(1)
-; AIX32-NEXT:    extsb 6, 3
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    srawi 5, 6, 31
-; AIX32-NEXT:    stw 3, 60(1)
-; AIX32-NEXT:    mr 3, 31
-; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:    nop
-; AIX32-NEXT:    lwz 3, 60(1)
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    lwz 30, L..C7(2) # @ull
-; AIX32-NEXT:    li 7, 5
-; AIX32-NEXT:    li 8, 5
-; AIX32-NEXT:    stw 3, 4(31)
-; AIX32-NEXT:    lwz 3, 56(1)
-; AIX32-NEXT:    extsb 6, 4
-; AIX32-NEXT:    addi 4, 1, 56
-; AIX32-NEXT:    srawi 5, 6, 31
-; AIX32-NEXT:    stw 23, 56(1)
-; AIX32-NEXT:    stw 3, 0(31)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    stw 3, 60(1)
-; AIX32-NEXT:    mr 3, 30
-; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:    nop
-; AIX32-NEXT:    lwz 4, 60(1)
-; AIX32-NEXT:    lwz 3, 56(1)
-; AIX32-NEXT:    stw 4, 4(30)
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    stw 3, 0(30)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 5, 4, 21
-; AIX32-NEXT:    li 4, 255
-; AIX32-NEXT:    slw 6, 3, 21
-; AIX32-NEXT:    slw 4, 4, 21
-; AIX32-NEXT:    and 5, 5, 4
-; AIX32-NEXT:    and 6, 6, 4
-; AIX32-NEXT:  L..BB3_19: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 8, 0, 18
-; AIX32-NEXT:    and 7, 8, 4
-; AIX32-NEXT:    cmpw 7, 6
-; AIX32-NEXT:    bne 0, L..BB3_21
-; AIX32-NEXT:  # %bb.20: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 8, 8, 4
-; AIX32-NEXT:    or 8, 8, 5
-; AIX32-NEXT:    stwcx. 8, 0, 18
-; AIX32-NEXT:    bne 0, L..BB3_19
-; AIX32-NEXT:  L..BB3_21: # %entry
-; AIX32-NEXT:    srw 4, 7, 21
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    cmpw 4, 3
-; AIX32-NEXT:    li 3, 1
-; AIX32-NEXT:    iseleq 4, 3, 23
-; AIX32-NEXT:    slw 6, 5, 25
-; AIX32-NEXT:    li 5, 255
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    slw 5, 5, 25
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 7, 4, 25
-; AIX32-NEXT:    and 6, 6, 5
-; AIX32-NEXT:    and 7, 7, 5
-; AIX32-NEXT:  L..BB3_22: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 9, 0, 20
-; AIX32-NEXT:    and 8, 9, 5
-; AIX32-NEXT:    cmpw 8, 7
-; AIX32-NEXT:    bne 0, L..BB3_24
-; AIX32-NEXT:  # %bb.23: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 9, 9, 5
-; AIX32-NEXT:    or 9, 9, 6
-; AIX32-NEXT:    stwcx. 9, 0, 20
-; AIX32-NEXT:    bne 0, L..BB3_22
-; AIX32-NEXT:  L..BB3_24: # %entry
-; AIX32-NEXT:    srw 5, 8, 25
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    cmpw 5, 4
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    iseleq 4, 3, 23
-; AIX32-NEXT:    extsb 5, 5
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 6, 5, 26
-; AIX32-NEXT:    li 5, 0
-; AIX32-NEXT:    slw 7, 4, 26
-; AIX32-NEXT:    ori 5, 5, 65535
-; AIX32-NEXT:    slw 5, 5, 26
-; AIX32-NEXT:    and 6, 6, 5
-; AIX32-NEXT:    and 7, 7, 5
-; AIX32-NEXT:  L..BB3_25: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 9, 0, 22
-; AIX32-NEXT:    and 8, 9, 5
-; AIX32-NEXT:    cmpw 8, 7
-; AIX32-NEXT:    bne 0, L..BB3_27
-; AIX32-NEXT:  # %bb.26: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 9, 9, 5
-; AIX32-NEXT:    or 9, 9, 6
-; AIX32-NEXT:    stwcx. 9, 0, 22
-; AIX32-NEXT:    bne 0, L..BB3_25
-; AIX32-NEXT:  L..BB3_27: # %entry
-; AIX32-NEXT:    srw 5, 8, 26
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    cmpw 5, 4
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    iseleq 4, 3, 23
-; AIX32-NEXT:    extsb 5, 5
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 6, 5, 24
-; AIX32-NEXT:    li 5, 0
-; AIX32-NEXT:    slw 7, 4, 24
-; AIX32-NEXT:    ori 5, 5, 65535
-; AIX32-NEXT:    slw 5, 5, 24
-; AIX32-NEXT:    and 6, 6, 5
-; AIX32-NEXT:    and 7, 7, 5
-; AIX32-NEXT:  L..BB3_28: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 9, 0, 19
-; AIX32-NEXT:    and 8, 9, 5
-; AIX32-NEXT:    cmpw 8, 7
-; AIX32-NEXT:    bne 0, L..BB3_30
-; AIX32-NEXT:  # %bb.29: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 9, 9, 5
-; AIX32-NEXT:    or 9, 9, 6
-; AIX32-NEXT:    stwcx. 9, 0, 19
-; AIX32-NEXT:    bne 0, L..BB3_28
-; AIX32-NEXT:  L..BB3_30: # %entry
-; AIX32-NEXT:    srw 5, 8, 24
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    cmpw 5, 4
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    iseleq 4, 3, 23
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 5, 5
-; AIX32-NEXT:  L..BB3_31: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 6, 0, 17
-; AIX32-NEXT:    cmpw 1, 6, 4
-; AIX32-NEXT:    bne 1, L..BB3_33
-; AIX32-NEXT:  # %bb.32: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    stwcx. 5, 0, 17
-; AIX32-NEXT:    bne 0, L..BB3_31
-; AIX32-NEXT:  L..BB3_33: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    isel 4, 3, 23, 6
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 5, 5
-; AIX32-NEXT:  L..BB3_34: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 6, 0, 27
-; AIX32-NEXT:    cmpw 1, 6, 4
-; AIX32-NEXT:    bne 1, L..BB3_36
-; AIX32-NEXT:  # %bb.35: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    stwcx. 5, 0, 27
-; AIX32-NEXT:    bne 0, L..BB3_34
-; AIX32-NEXT:  L..BB3_36: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    isel 3, 3, 23, 6
-; AIX32-NEXT:    li 7, 5
-; AIX32-NEXT:    li 8, 5
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    stw 3, 0(27)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    stw 23, 56(1)
-; AIX32-NEXT:    extsb 6, 4
-; AIX32-NEXT:    addi 4, 1, 56
-; AIX32-NEXT:    stw 3, 60(1)
-; AIX32-NEXT:    mr 3, 31
-; AIX32-NEXT:    srawi 5, 6, 31
-; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:    nop
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    stw 3, 0(27)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    li 7, 5
-; AIX32-NEXT:    li 8, 5
-; AIX32-NEXT:    extsb 6, 4
-; AIX32-NEXT:    addi 4, 1, 56
-; AIX32-NEXT:    stw 3, 60(1)
-; AIX32-NEXT:    mr 3, 30
-; AIX32-NEXT:    stw 23, 56(1)
-; AIX32-NEXT:    srawi 5, 6, 31
-; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:    nop
-; AIX32-NEXT:    stw 3, 0(27)
-; AIX32-NEXT:    lwz 31, 124(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 30, 120(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 29, 116(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 28, 112(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 27, 108(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 26, 104(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 25, 100(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 24, 96(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 23, 92(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 22, 88(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 21, 84(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 20, 80(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 19, 76(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 18, 72(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 17, 68(1) # 4-byte Folded Reload
-; AIX32-NEXT:    addi 1, 1, 128
-; AIX32-NEXT:    lwz 0, 8(1)
-; AIX32-NEXT:    mtlr 0
+; AIX32-NEXT:   stwu 1, -144(1)
+; AIX32-NEXT:   stw 0, 152(1)
+; AIX32-NEXT:   stw 29, 132(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   lwz 29, L..C0(2)                        # @sc
+; AIX32-NEXT:   stw 26, 120(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   not     3, 29
+; AIX32-NEXT:   stw 30, 136(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   lwz 30, L..C1(2)                        # @uc
+; AIX32-NEXT:   lbz 4, 0(30)
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   stw 27, 124(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   rlwinm 27, 29, 0, 0, 29
+; AIX32-NEXT:   stw 14, 72(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 15, 76(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   rlwinm 26, 3, 3, 27, 28
+; AIX32-NEXT:   li 3, 255
+; AIX32-NEXT:   slw 3, 3, 26
+; AIX32-NEXT:   stw 16, 80(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 17, 84(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 18, 88(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 19, 92(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 20, 96(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 21, 100(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 22, 104(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 23, 108(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 24, 112(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 25, 116(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 28, 128(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 31, 140(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   not     25, 3
+; AIX32-NEXT:   lwarx 3, 0, 27
+; AIX32-NEXT:   srw 6, 3, 26
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 4
+; AIX32-NEXT:   bne     0, L..BB3_4
+; AIX32-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore289
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   slw 5, 5, 26
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_2:                               # %cmpxchg.trystore288
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 6, 3, 25
+; AIX32-NEXT:   or 6, 6, 5
+; AIX32-NEXT:   stwcx. 6, 0, 27
+; AIX32-NEXT:   beq     0, L..BB3_4
+; AIX32-NEXT:  # %bb.3:                                # %cmpxchg.releasedload287
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
+; AIX32-NEXT:   lwarx 3, 0, 27
+; AIX32-NEXT:   srw 6, 3, 26
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 4
+; AIX32-NEXT:   beq     0, L..BB3_2
+; AIX32-NEXT:  L..BB3_4:                               # %cmpxchg.nostore285
+; AIX32-NEXT:   not     4, 30
+; AIX32-NEXT:   srw 5, 3, 26
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   rlwinm 24, 30, 0, 0, 29
+; AIX32-NEXT:   rlwinm 23, 4, 3, 27, 28
+; AIX32-NEXT:   li 4, 255
+; AIX32-NEXT:   stb 5, 0(29)
+; AIX32-NEXT:   slw 4, 4, 23
+; AIX32-NEXT:   not     22, 4
+; AIX32-NEXT:   lwarx 4, 0, 24
+; AIX32-NEXT:   srw 6, 4, 23
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_8
+; AIX32-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore256
+; AIX32-NEXT:   clrlwi  5, 5, 24
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   slw 5, 5, 23
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_6:                               # %cmpxchg.trystore255
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 6, 4, 22
+; AIX32-NEXT:   or 6, 6, 5
+; AIX32-NEXT:   stwcx. 6, 0, 24
+; AIX32-NEXT:   beq     0, L..BB3_8
+; AIX32-NEXT:  # %bb.7:                                # %cmpxchg.releasedload254
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_6 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 24
+; AIX32-NEXT:   srw 6, 4, 23
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_6
+; AIX32-NEXT:  L..BB3_8:                               # %cmpxchg.nostore252
+; AIX32-NEXT:   srw 4, 4, 23
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lis 3, 0
+; AIX32-NEXT:   lbz 7, 0(29)
+; AIX32-NEXT:   stb 4, 0(30)
+; AIX32-NEXT:   clrlwi  6, 4, 24
+; AIX32-NEXT:   lwz 4, L..C2(2)                         # @ss
+; AIX32-NEXT:   ori 3, 3, 65535
+; AIX32-NEXT:   clrlwi  5, 4, 30
+; AIX32-NEXT:   rlwinm 21, 4, 0, 0, 29
+; AIX32-NEXT:   xori 5, 5, 2
+; AIX32-NEXT:   slwi 20, 5, 3
+; AIX32-NEXT:   slw 5, 3, 20
+; AIX32-NEXT:   not     19, 5
+; AIX32-NEXT:   lwarx 5, 0, 21
+; AIX32-NEXT:   srw 8, 5, 20
+; AIX32-NEXT:   clrlwi  8, 8, 16
+; AIX32-NEXT:   cmplw   8, 6
+; AIX32-NEXT:   bne     0, L..BB3_12
+; AIX32-NEXT:  # %bb.9:                                # %cmpxchg.fencedstore223
+; AIX32-NEXT:   extsb 7, 7
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   clrlwi  7, 7, 16
+; AIX32-NEXT:   slw 7, 7, 20
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_10:                              # %cmpxchg.trystore222
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 8, 5, 19
+; AIX32-NEXT:   or 8, 8, 7
+; AIX32-NEXT:   stwcx. 8, 0, 21
+; AIX32-NEXT:   beq     0, L..BB3_12
+; AIX32-NEXT:  # %bb.11:                               # %cmpxchg.releasedload221
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_10 Depth=1
+; AIX32-NEXT:   lwarx 5, 0, 21
+; AIX32-NEXT:   srw 8, 5, 20
+; AIX32-NEXT:   clrlwi  8, 8, 16
+; AIX32-NEXT:   cmplw   8, 6
+; AIX32-NEXT:   beq     0, L..BB3_10
+; AIX32-NEXT:  L..BB3_12:                              # %cmpxchg.nostore219
+; AIX32-NEXT:   srw 5, 5, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lbz 6, 0(29)
+; AIX32-NEXT:   sth 5, 0(4)
+; AIX32-NEXT:   lwz 4, L..C3(2)                         # @us
+; AIX32-NEXT:   lbz 5, 0(30)
+; AIX32-NEXT:   clrlwi  7, 4, 30
+; AIX32-NEXT:   rlwinm 18, 4, 0, 0, 29
+; AIX32-NEXT:   xori 7, 7, 2
+; AIX32-NEXT:   slwi 17, 7, 3
+; AIX32-NEXT:   slw 3, 3, 17
+; AIX32-NEXT:   not     16, 3
+; AIX32-NEXT:   lwarx 3, 0, 18
+; AIX32-NEXT:   srw 7, 3, 17
+; AIX32-NEXT:   clrlwi  7, 7, 16
+; AIX32-NEXT:   cmplw   7, 5
+; AIX32-NEXT:   bne     0, L..BB3_16
+; AIX32-NEXT:  # %bb.13:                               # %cmpxchg.fencedstore190
+; AIX32-NEXT:   extsb 6, 6
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   slw 6, 6, 17
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_14:                              # %cmpxchg.trystore189
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 7, 3, 16
+; AIX32-NEXT:   or 7, 7, 6
+; AIX32-NEXT:   stwcx. 7, 0, 18
+; AIX32-NEXT:   beq     0, L..BB3_16
+; AIX32-NEXT:  # %bb.15:                               # %cmpxchg.releasedload188
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_14 Depth=1
+; AIX32-NEXT:   lwarx 3, 0, 18
+; AIX32-NEXT:   srw 7, 3, 17
+; AIX32-NEXT:   clrlwi  7, 7, 16
+; AIX32-NEXT:   cmplw   7, 5
+; AIX32-NEXT:   beq     0, L..BB3_14
+; AIX32-NEXT:  L..BB3_16:                              # %cmpxchg.nostore186
+; AIX32-NEXT:   srw 3, 3, 17
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lwz 15, L..C4(2)                        # @si
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   sth 3, 0(4)
+; AIX32-NEXT:   lbz 4, 0(30)
+; AIX32-NEXT:   lwarx 3, 0, 15
+; AIX32-NEXT:   cmplw   3, 4
+; AIX32-NEXT:   bne     0, L..BB3_20
+; AIX32-NEXT:  # %bb.17:                               # %cmpxchg.fencedstore171
+; AIX32-NEXT:   extsb 5, 5
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   .align  5
+; AIX32-NEXT:  L..BB3_18:                              # %cmpxchg.trystore170
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   stwcx. 5, 0, 15
+; AIX32-NEXT:   beq     0, L..BB3_20
+; AIX32-NEXT:  # %bb.19:                               # %cmpxchg.releasedload169
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_18 Depth=1
+; AIX32-NEXT:   lwarx 3, 0, 15
+; AIX32-NEXT:   cmplw   3, 4
+; AIX32-NEXT:   beq     0, L..BB3_18
+; AIX32-NEXT:  L..BB3_20:                              # %cmpxchg.nostore167
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lwz 28, L..C5(2)                        # @ui
+; AIX32-NEXT:   stw 3, 0(15)
+; AIX32-NEXT:   lbz 4, 0(30)
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   lwarx 3, 0, 28
+; AIX32-NEXT:   cmplw   3, 4
+; AIX32-NEXT:   bne     0, L..BB3_24
+; AIX32-NEXT:  # %bb.21:                               # %cmpxchg.fencedstore152
+; AIX32-NEXT:   extsb 5, 5
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   .align  5
+; AIX32-NEXT:  L..BB3_22:                              # %cmpxchg.trystore151
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   stwcx. 5, 0, 28
+; AIX32-NEXT:   beq     0, L..BB3_24
+; AIX32-NEXT:  # %bb.23:                               # %cmpxchg.releasedload150
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_22 Depth=1
+; AIX32-NEXT:   lwarx 3, 0, 28
+; AIX32-NEXT:   cmplw   3, 4
+; AIX32-NEXT:   beq     0, L..BB3_22
+; AIX32-NEXT:  L..BB3_24:                              # %cmpxchg.nostore148
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lwz 31, L..C6(2)                        # @sll
+; AIX32-NEXT:   lbz 3, 0(29)
+; AIX32-NEXT:   li 14, 0
+; AIX32-NEXT:   addi 4, 1, 64
+; AIX32-NEXT:   li 7, 5
+; AIX32-NEXT:   li 8, 5
+; AIX32-NEXT:   stw 14, 64(1)
+; AIX32-NEXT:   extsb 6, 3
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   srawi 5, 6, 31
+; AIX32-NEXT:   stw 3, 68(1)
+; AIX32-NEXT:   mr      3, 31
+; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:   nop
+; AIX32-NEXT:   lwz 3, 68(1)
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   li 7, 5
+; AIX32-NEXT:   li 8, 5
+; AIX32-NEXT:   stw 3, 4(31)
+; AIX32-NEXT:   lwz 3, 64(1)
+; AIX32-NEXT:   extsb 6, 4
+; AIX32-NEXT:   addi 4, 1, 64
+; AIX32-NEXT:   stw 14, 64(1)
+; AIX32-NEXT:   srawi 5, 6, 31
+; AIX32-NEXT:   stw 3, 0(31)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwz 31, L..C7(2)                        # @ull
+; AIX32-NEXT:   stw 3, 68(1)
+; AIX32-NEXT:   mr      3, 31
+; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:   nop
+; AIX32-NEXT:   lwz 3, 64(1)
+; AIX32-NEXT:   lwz 4, 68(1)
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   stw 4, 4(31)
+; AIX32-NEXT:   stw 3, 0(31)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 4, 0, 27
+; AIX32-NEXT:   srw 6, 4, 26
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_28
+; AIX32-NEXT:  # %bb.25:                               # %cmpxchg.fencedstore119
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   slw 5, 5, 26
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_26:                              # %cmpxchg.trystore118
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 4, 4, 25
+; AIX32-NEXT:   or 4, 4, 5
+; AIX32-NEXT:   stwcx. 4, 0, 27
+; AIX32-NEXT:   beq     0, L..BB3_29
+; AIX32-NEXT:  # %bb.27:                               # %cmpxchg.releasedload117
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_26 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 27
+; AIX32-NEXT:   srw 6, 4, 26
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_26
+; AIX32-NEXT:  L..BB3_28:                              # %cmpxchg.nostore115
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_30
+; AIX32-NEXT:  L..BB3_29:                              # %cmpxchg.success116
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_30:                              # %cmpxchg.end113
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 4, 0, 24
+; AIX32-NEXT:   srw 6, 4, 23
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_34
+; AIX32-NEXT:  # %bb.31:                               # %cmpxchg.fencedstore86
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   slw 5, 5, 23
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_32:                              # %cmpxchg.trystore85
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 4, 4, 22
+; AIX32-NEXT:   or 4, 4, 5
+; AIX32-NEXT:   stwcx. 4, 0, 24
+; AIX32-NEXT:   beq     0, L..BB3_35
+; AIX32-NEXT:  # %bb.33:                               # %cmpxchg.releasedload84
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_32 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 24
+; AIX32-NEXT:   srw 6, 4, 23
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_32
+; AIX32-NEXT:  L..BB3_34:                              # %cmpxchg.nostore82
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_36
+; AIX32-NEXT:  L..BB3_35:                              # %cmpxchg.success83
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_36:                              # %cmpxchg.end80
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 4, 0, 21
+; AIX32-NEXT:   srw 6, 4, 20
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_40
+; AIX32-NEXT:  # %bb.37:                               # %cmpxchg.fencedstore53
+; AIX32-NEXT:   extsb 5, 5
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   clrlwi  5, 5, 16
+; AIX32-NEXT:   slw 5, 5, 20
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_38:                              # %cmpxchg.trystore52
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 4, 4, 19
+; AIX32-NEXT:   or 4, 4, 5
+; AIX32-NEXT:   stwcx. 4, 0, 21
+; AIX32-NEXT:   beq     0, L..BB3_41
+; AIX32-NEXT:  # %bb.39:                               # %cmpxchg.releasedload51
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_38 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 21
+; AIX32-NEXT:   srw 6, 4, 20
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_38
+; AIX32-NEXT:  L..BB3_40:                              # %cmpxchg.nostore49
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_42
+; AIX32-NEXT:  L..BB3_41:                              # %cmpxchg.success50
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_42:                              # %cmpxchg.end47
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 4, 0, 18
+; AIX32-NEXT:   srw 6, 4, 17
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_46
+; AIX32-NEXT:  # %bb.43:                               # %cmpxchg.fencedstore29
+; AIX32-NEXT:   extsb 5, 5
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   clrlwi  5, 5, 16
+; AIX32-NEXT:   slw 5, 5, 17
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_44:                              # %cmpxchg.trystore28
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 4, 4, 16
+; AIX32-NEXT:   or 4, 4, 5
+; AIX32-NEXT:   stwcx. 4, 0, 18
+; AIX32-NEXT:   beq     0, L..BB3_47
+; AIX32-NEXT:  # %bb.45:                               # %cmpxchg.releasedload27
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_44 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 18
+; AIX32-NEXT:   srw 6, 4, 17
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_44
+; AIX32-NEXT:  L..BB3_46:                              # %cmpxchg.nostore25
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_48
+; AIX32-NEXT:  L..BB3_47:                              # %cmpxchg.success26
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_48:                              # %cmpxchg.end23
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 5, 0, 15
+; AIX32-NEXT:   cmplw   5, 3
+; AIX32-NEXT:   bne     0, L..BB3_52
+; AIX32-NEXT:  # %bb.49:                               # %cmpxchg.fencedstore10
+; AIX32-NEXT:   extsb 4, 4
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   .align  5
+; AIX32-NEXT:  L..BB3_50:                              # %cmpxchg.trystore9
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   stwcx. 4, 0, 15
+; AIX32-NEXT:   beq     0, L..BB3_53
+; AIX32-NEXT:  # %bb.51:                               # %cmpxchg.releasedload8
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_50 Depth=1
+; AIX32-NEXT:   lwarx 5, 0, 15
+; AIX32-NEXT:   cmplw   5, 3
+; AIX32-NEXT:   beq     0, L..BB3_50
+; AIX32-NEXT:  L..BB3_52:                              # %cmpxchg.nostore6
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_54
+; AIX32-NEXT:  L..BB3_53:                              # %cmpxchg.success7
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_54:                              # %cmpxchg.end4
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 5, 0, 28
+; AIX32-NEXT:   cmplw   5, 3
+; AIX32-NEXT:   bne     0, L..BB3_58
+; AIX32-NEXT:  # %bb.55:                               # %cmpxchg.fencedstore
+; AIX32-NEXT:   extsb 4, 4
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   .align  5
+; AIX32-NEXT:  L..BB3_56:                              # %cmpxchg.trystore
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   stwcx. 4, 0, 28
+; AIX32-NEXT:   beq     0, L..BB3_59
+; AIX32-NEXT:  # %bb.57:                               # %cmpxchg.releasedload
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_56 Depth=1
+; AIX32-NEXT:   lwarx 5, 0, 28
+; AIX32-NEXT:   cmplw   5, 3
+; AIX32-NEXT:   beq     0, L..BB3_56
+; AIX32-NEXT:  L..BB3_58:                              # %cmpxchg.nostore
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_60
+; AIX32-NEXT:  L..BB3_59:                              # %cmpxchg.success
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_60:                              # %cmpxchg.end
+; AIX32-NEXT:   li 3, 1
+; AIX32-NEXT:   li 31, 0
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   isel 3, 3, 31, 20
+; AIX32-NEXT:   li 7, 5
+; AIX32-NEXT:   li 8, 5
+; AIX32-NEXT:   extsb 6, 4
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   addi 4, 1, 64
+; AIX32-NEXT:   stw 31, 64(1)
+; AIX32-NEXT:   srawi 5, 6, 31
+; AIX32-NEXT:   stw 3, 68(1)
+; AIX32-NEXT:   lwz 3, L..C6(2)                         # @sll
+; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:   nop
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   li 7, 5
+; AIX32-NEXT:   li 8, 5
+; AIX32-NEXT:   extsb 6, 4
+; AIX32-NEXT:   stw 3, 68(1)
+; AIX32-NEXT:   lwz 3, L..C7(2)                         # @ull
+; AIX32-NEXT:   addi 4, 1, 64
+; AIX32-NEXT:   stw 31, 64(1)
+; AIX32-NEXT:   srawi 5, 6, 31
+; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:   nop
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lwz 31, 140(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 30, 136(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 29, 132(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 28, 128(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 27, 124(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 26, 120(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 25, 116(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 24, 112(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 23, 108(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 22, 104(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 21, 100(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 20, 96(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 19, 92(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 18, 88(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 17, 84(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 16, 80(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 15, 76(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 14, 72(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   addi 1, 1, 144
+; AIX32-NEXT:   lwz 0, 8(1)
+; AIX32-NEXT:   mtlr 0
 ; AIX32-NEXT:    blr
 entry:
   %0 = load i8, ptr @uc, align 1
@@ -5597,21 +5852,20 @@ entry:
 define dso_local i64 @cmpswplp(ptr noundef %ptr, ptr nocapture noundef readnone %oldval, i64 noundef %newval) local_unnamed_addr #0 {
 ; CHECK-LABEL: cmpswplp:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi 4, 5, 1
-; CHECK-NEXT:  .LBB6_1: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 6, 0, 3
-; CHECK-NEXT:    cmpd 1, 6, 5
-; CHECK-NEXT:    bne 1, .LBB6_3
-; CHECK-NEXT:  # %bb.2: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 4, 0, 3
-; CHECK-NEXT:    bne 0, .LBB6_1
-; CHECK-NEXT:  .LBB6_3: # %entry
-; CHECK-NEXT:    li 3, 66
-; CHECK-NEXT:    li 4, 55
-; CHECK-NEXT:    isel 3, 4, 3, 6
-; CHECK-NEXT:    blr
+; CHECK-NEXT:   ldarx 4, 0, 3
+; CHECK-NEXT:   cmpld   4, 5
+; CHECK-NEXT:   bne     0, .LBB6_2
+; CHECK-NEXT: # %bb.1:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:   addi 4, 5, 1
+; CHECK-NEXT:   stdcx. 4, 0, 3
+; CHECK-NEXT:   beq     0, .LBB6_4
+; CHECK-NEXT: .LBB6_2:                                # %cmpxchg.failure
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT: .LBB6_3:                                # %cmpxchg.end
+; CHECK-NEXT:   li 3, 66
+; CHECK-NEXT:   li 4, 55
+; CHECK-NEXT:   isel 3, 4, 3, 20
+; CHECK-NEXT:   blr
 ;
 ; AIX32-LABEL: cmpswplp:
 ; AIX32:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/PowerPC/atomic-2.ll b/llvm/test/CodeGen/PowerPC/atomic-2.ll
index 10476541870f..8fa0d767b329 100644
--- a/llvm/test/CodeGen/PowerPC/atomic-2.ll
+++ b/llvm/test/CodeGen/PowerPC/atomic-2.ll
@@ -42,8 +42,8 @@ define i64 @exchange_and_cmp(ptr %mem) nounwind {
 
 define i8 @exchange_and_cmp8(ptr %mem) nounwind {
 ; CHECK-LABEL: exchange_and_cmp8:
-; CHECK-BE: xori
-; CHECK-LE-NOT: xori
+; CHECK-BE: or r{{.*}} r{{.*}} r{{.*}}
+; CHECK-LE-NOT: or r{{.*}} r{{.*}} r{{.*}}
 ; CHECK-P8U: lbarx
   %tmppair = cmpxchg ptr %mem, i8 0, i8 1 monotonic monotonic
   %tmp = extractvalue { i8, i1 } %tmppair, 0
diff --git a/llvm/test/CodeGen/PowerPC/atomic-compare-exchange-weak.ll b/llvm/test/CodeGen/PowerPC/atomic-compare-exchange-weak.ll
index 399645f671f7..65a12a6222f2 100644
--- a/llvm/test/CodeGen/PowerPC/atomic-compare-exchange-weak.ll
+++ b/llvm/test/CodeGen/PowerPC/atomic-compare-exchange-weak.ll
@@ -12,62 +12,60 @@
 define i32 @foo(ptr noundef %cp, ptr noundef %old, i32 noundef %c)  {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lwz r7, 0(r4)
 ; CHECK-NEXT:    stw r3, -4(r1)
 ; CHECK-NEXT:    stw r4, -8(r1)
+; CHECK-NEXT:    lwz r7, 0(r4)
 ; CHECK-NEXT:    stw r5, -12(r1)
 ; CHECK-NEXT:    stw r5, -16(r1)
-; CHECK-NEXT:  L..BB0_1: # %entry
-; CHECK-NEXT:    #
 ; CHECK-NEXT:    lwarx r6, 0, r3
-; CHECK-NEXT:    cmpw cr1, r6, r7
-; CHECK-NEXT:    bne cr1, L..BB0_3
-; CHECK-NEXT:  # %bb.2: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. r5, 0, r3
-; CHECK-NEXT:    bne cr0, L..BB0_1
-; CHECK-NEXT:  L..BB0_3: # %entry
 ; CHECK-NEXT:    cmplw r6, r7
+; CHECK-NEXT:    bne cr0, L..BB0_2
+; CHECK-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; CHECK-NEXT:    stwcx. r5, 0, r3
 ; CHECK-NEXT:    beq cr0, L..BB0_5
-; CHECK-NEXT:  # %bb.4: # %cmpxchg.store_expected
+; CHECK-NEXT:  L..BB0_2: # %cmpxchg.failure
+; CHECK-NEXT:    crxor 4*cr5+lt, 4*cr5+lt, 4*cr5+lt
+; CHECK-NEXT:  # %bb.3: # %cmpxchg.store_expected
 ; CHECK-NEXT:    stw r6, 0(r4)
-; CHECK-NEXT:  L..BB0_5: # %cmpxchg.continue
+; CHECK-NEXT:  L..BB0_4: # %cmpxchg.continue
 ; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    li r4, 1
-; CHECK-NEXT:    isel r3, r4, r3, 4*cr1+eq
+; CHECK-NEXT:    isel r3, r4, r3, 4*cr5+lt
 ; CHECK-NEXT:    stb r3, -17(r1)
 ; CHECK-NEXT:    blr
+; CHECK-NEXT:  L..BB0_5:
+; CHECK-NEXT:    creqv 4*cr5+lt, 4*cr5+lt, 4*cr5+lt
+; CHECK-NEXT:    b L..BB0_4
 ;
 ; CHECK64-LABEL: foo:
 ; CHECK64:       # %bb.0: # %entry
-; CHECK64-NEXT:    lwz r7, 0(r4)
 ; CHECK64-NEXT:    std r3, -8(r1)
 ; CHECK64-NEXT:    std r4, -16(r1)
+; CHECK64-NEXT:    lwz r7, 0(r4)
 ; CHECK64-NEXT:    stw r5, -20(r1)
 ; CHECK64-NEXT:    stw r5, -24(r1)
-; CHECK64-NEXT:  L..BB0_1: # %entry
-; CHECK64-NEXT:    #
 ; CHECK64-NEXT:    lwarx r6, 0, r3
-; CHECK64-NEXT:    cmpw cr1, r6, r7
-; CHECK64-NEXT:    bne cr1, L..BB0_3
-; CHECK64-NEXT:  # %bb.2: # %entry
-; CHECK64-NEXT:    #
-; CHECK64-NEXT:    stwcx. r5, 0, r3
-; CHECK64-NEXT:    bne cr0, L..BB0_1
-; CHECK64-NEXT:  L..BB0_3: # %entry
 ; CHECK64-NEXT:    cmplw r6, r7
+; CHECK64-NEXT:    bne cr0, L..BB0_2
+; CHECK64-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; CHECK64-NEXT:    stwcx. r5, 0, r3
 ; CHECK64-NEXT:    beq cr0, L..BB0_5
-; CHECK64-NEXT:  # %bb.4: # %cmpxchg.store_expected
+; CHECK64-NEXT:  L..BB0_2: # %cmpxchg.failure
+; CHECK64-NEXT:    crxor 4*cr5+lt, 4*cr5+lt, 4*cr5+lt
+; CHECK64-NEXT:  # %bb.3: # %cmpxchg.store_expected
 ; CHECK64-NEXT:    stw r6, 0(r4)
-; CHECK64-NEXT:  L..BB0_5: # %cmpxchg.continue
+; CHECK64-NEXT:  L..BB0_4: # %cmpxchg.continue
 ; CHECK64-NEXT:    li r3, 0
 ; CHECK64-NEXT:    li r4, 1
-; CHECK64-NEXT:    isel r3, r4, r3, 4*cr1+eq
+; CHECK64-NEXT:    isel r3, r4, r3, 4*cr5+lt
 ; CHECK64-NEXT:    li r4, 1
 ; CHECK64-NEXT:    stb r3, -25(r1)
 ; CHECK64-NEXT:    li r3, 0
-; CHECK64-NEXT:    isel r3, r4, r3, 4*cr1+eq
+; CHECK64-NEXT:    isel r3, r4, r3, 4*cr5+lt
 ; CHECK64-NEXT:    blr
+; CHECK64-NEXT:  L..BB0_5:
+; CHECK64-NEXT:    creqv 4*cr5+lt, 4*cr5+lt, 4*cr5+lt
+; CHECK64-NEXT:    b L..BB0_4
 entry:
   %cp.addr = alloca ptr, align 4
   %old.addr = alloca ptr, align 4
diff --git a/llvm/test/CodeGen/PowerPC/atomic-float.ll b/llvm/test/CodeGen/PowerPC/atomic-float.ll
index e2a46130ab79..600d28936c16 100644
--- a/llvm/test/CodeGen/PowerPC/atomic-float.ll
+++ b/llvm/test/CodeGen/PowerPC/atomic-float.ll
@@ -9,33 +9,37 @@ define float @test_add(ptr %ptr, float %incr) {
 ; CHECK-64:       # %bb.0: # %entry
 ; CHECK-64-NEXT:    sync
 ; CHECK-64-NEXT:    lfs 0, 0(3)
-; CHECK-64-NEXT:    b .LBB0_2
-; CHECK-64-NEXT:  .LBB0_1: # %atomicrmw.start
-; CHECK-64-NEXT:    #
-; CHECK-64-NEXT:    stw 6, -4(1)
-; CHECK-64-NEXT:    cmplw 6, 4
-; CHECK-64-NEXT:    lfs 0, -4(1)
-; CHECK-64-NEXT:    beq 0, .LBB0_5
-; CHECK-64-NEXT:  .LBB0_2: # %atomicrmw.start
-; CHECK-64-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-64-NEXT:    # Child Loop BB0_3 Depth 2
+; CHECK-64-NEXT:    b .LBB0_3
+; CHECK-64-NEXT:  .LBB0_1:                                # %cmpxchg.nostore
+; CHECK-64-NEXT:                                          #   in Loop: Header=BB0_3 Depth=1
+; CHECK-64-NEXT:    crxor 20, 20, 20
+; CHECK-64-NEXT:  .LBB0_2:                                # %cmpxchg.end
+; CHECK-64-NEXT:                                          #   in Loop: Header=BB0_3 Depth=1
+; CHECK-64-NEXT:    stw 4, -12(1)
+; CHECK-64-NEXT:    lfs 0, -12(1)
+; CHECK-64-NEXT:    bc 12, 20, .LBB0_7
+; CHECK-64-NEXT:  .LBB0_3:                                # %atomicrmw.start
+; CHECK-64-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-64-NEXT:                                          #     Child Loop BB0_4 Depth 2
 ; CHECK-64-NEXT:    fadds 2, 0, 1
-; CHECK-64-NEXT:    stfs 2, -8(1)
-; CHECK-64-NEXT:    stfs 0, -12(1)
-; CHECK-64-NEXT:    lwz 5, -8(1)
-; CHECK-64-NEXT:    lwz 4, -12(1)
-; CHECK-64-NEXT:  .LBB0_3: # %atomicrmw.start
-; CHECK-64-NEXT:    # Parent Loop BB0_2 Depth=1
-; CHECK-64-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-64-NEXT:    lwarx 6, 0, 3
-; CHECK-64-NEXT:    cmpw 6, 4
-; CHECK-64-NEXT:    bne 0, .LBB0_1
-; CHECK-64-NEXT:  # %bb.4: # %atomicrmw.start
-; CHECK-64-NEXT:    #
+; CHECK-64-NEXT:    stfs 2, -4(1)
+; CHECK-64-NEXT:    stfs 0, -8(1)
+; CHECK-64-NEXT:    lwz 5, -4(1)
+; CHECK-64-NEXT:    lwz 6, -8(1)
+; CHECK-64-NEXT:  .LBB0_4:                                # %cmpxchg.start
+; CHECK-64-NEXT:                                          #   Parent Loop BB0_3 Depth=1
+; CHECK-64-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-64-NEXT:    lwarx 4, 0, 3
+; CHECK-64-NEXT:    cmplw   4, 6
+; CHECK-64-NEXT:    bne     0, .LBB0_1
+; CHECK-64-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-64-NEXT:                                          #   in Loop: Header=BB0_4 Depth=2
 ; CHECK-64-NEXT:    stwcx. 5, 0, 3
-; CHECK-64-NEXT:    bne 0, .LBB0_3
-; CHECK-64-NEXT:    b .LBB0_1
-; CHECK-64-NEXT:  .LBB0_5: # %atomicrmw.end
+; CHECK-64-NEXT:    bne     0, .LBB0_4
+; CHECK-64-NEXT:  # %bb.6:                                #   in Loop: Header=BB0_3 Depth=1
+; CHECK-64-NEXT:    creqv 20, 20, 20
+; CHECK-64-NEXT:    b .LBB0_2
+; CHECK-64-NEXT:  .LBB0_7:                                # %atomicrmw.end
 ; CHECK-64-NEXT:    fmr 1, 0
 ; CHECK-64-NEXT:    lwsync
 ; CHECK-64-NEXT:    blr
@@ -46,33 +50,37 @@ define float @test_add(ptr %ptr, float %incr) {
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-32-NEXT:    sync
 ; CHECK-32-NEXT:    lfs 0, 0(3)
-; CHECK-32-NEXT:    b .LBB0_2
-; CHECK-32-NEXT:  .LBB0_1: # %atomicrmw.start
-; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stw 6, 28(1)
-; CHECK-32-NEXT:    cmplw 6, 4
-; CHECK-32-NEXT:    lfs 0, 28(1)
-; CHECK-32-NEXT:    beq 0, .LBB0_5
-; CHECK-32-NEXT:  .LBB0_2: # %atomicrmw.start
-; CHECK-32-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-32-NEXT:    # Child Loop BB0_3 Depth 2
+; CHECK-32-NEXT:    b .LBB0_3
+; CHECK-32-NEXT:  .LBB0_1:                                # %cmpxchg.nostore
+; CHECK-32-NEXT:                                          #   in Loop: Header=BB0_3 Depth=1
+; CHECK-32-NEXT:    crxor 20, 20, 20
+; CHECK-32-NEXT:  .LBB0_2:                                # %cmpxchg.end
+; CHECK-32-NEXT:                                          #   in Loop: Header=BB0_3 Depth=1
+; CHECK-32-NEXT:    stw 4, 20(1)
+; CHECK-32-NEXT:    lfs 0, 20(1)
+; CHECK-32-NEXT:    bc 12, 20, .LBB0_7
+; CHECK-32-NEXT:  .LBB0_3:                                # %atomicrmw.start
+; CHECK-32-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-32-NEXT:                                          #     Child Loop BB0_4 Depth 2
 ; CHECK-32-NEXT:    fadds 2, 0, 1
-; CHECK-32-NEXT:    stfs 2, 24(1)
-; CHECK-32-NEXT:    stfs 0, 20(1)
-; CHECK-32-NEXT:    lwz 5, 24(1)
-; CHECK-32-NEXT:    lwz 4, 20(1)
-; CHECK-32-NEXT:  .LBB0_3: # %atomicrmw.start
-; CHECK-32-NEXT:    # Parent Loop BB0_2 Depth=1
-; CHECK-32-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-32-NEXT:    lwarx 6, 0, 3
-; CHECK-32-NEXT:    cmpw 6, 4
-; CHECK-32-NEXT:    bne 0, .LBB0_1
-; CHECK-32-NEXT:  # %bb.4: # %atomicrmw.start
-; CHECK-32-NEXT:    #
+; CHECK-32-NEXT:    stfs 2, 28(1)
+; CHECK-32-NEXT:    stfs 0, 24(1)
+; CHECK-32-NEXT:    lwz 5, 28(1)
+; CHECK-32-NEXT:    lwz 6, 24(1)
+; CHECK-32-NEXT:  .LBB0_4:                                # %cmpxchg.start
+; CHECK-32-NEXT:                                          #   Parent Loop BB0_3 Depth=1
+; CHECK-32-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-32-NEXT:    lwarx 4, 0, 3
+; CHECK-32-NEXT:    cmplw   4, 6
+; CHECK-32-NEXT:    bne     0, .LBB0_1
+; CHECK-32-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-32-NEXT:                                          #   in Loop: Header=BB0_4 Depth=2
 ; CHECK-32-NEXT:    stwcx. 5, 0, 3
-; CHECK-32-NEXT:    bne 0, .LBB0_3
-; CHECK-32-NEXT:    b .LBB0_1
-; CHECK-32-NEXT:  .LBB0_5: # %atomicrmw.end
+; CHECK-32-NEXT:    bne     0, .LBB0_4
+; CHECK-32-NEXT:  # %bb.6:                                #   in Loop: Header=BB0_3 Depth=1
+; CHECK-32-NEXT:    creqv 20, 20, 20
+; CHECK-32-NEXT:    b .LBB0_2
+; CHECK-32-NEXT:  .LBB0_7:                                # %atomicrmw.end
 ; CHECK-32-NEXT:    fmr 1, 0
 ; CHECK-32-NEXT:    lwsync
 ; CHECK-32-NEXT:    addi 1, 1, 32
diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
index 4f00cff83942..27a26aaca8b2 100644
--- a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
@@ -5,49 +5,47 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_usub_cond_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    rlwinm 7, 5, 3, 27, 28
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    xori 7, 7, 24
-; CHECK-NEXT:    li 8, 255
-; CHECK-NEXT:    clrlwi 6, 4, 24
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 8, 8, 7
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    not     3, 3
+; CHECK-NEXT:    li 6, 255
+; CHECK-NEXT:    lwz 8, 0(5)
+; CHECK-NEXT:    rlwinm 3, 3, 3, 27, 28
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  7, 4, 24
 ; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 11, 7
-; CHECK-NEXT:    cmplw 3, 9
-; CHECK-NEXT:    beq 0, .LBB0_7
-; CHECK-NEXT:  .LBB0_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB0_5 Depth 2
-; CHECK-NEXT:    clrlwi 9, 3, 24
-; CHECK-NEXT:    cmplw 9, 6
-; CHECK-NEXT:    blt 0, .LBB0_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:  .LBB0_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 3, 7
-; CHECK-NEXT:    slw 10, 9, 7
-; CHECK-NEXT:    and 3, 3, 8
-; CHECK-NEXT:    and 10, 10, 8
-; CHECK-NEXT:  .LBB0_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB0_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 12, 0, 5
-; CHECK-NEXT:    and 11, 12, 8
-; CHECK-NEXT:    cmpw 11, 10
-; CHECK-NEXT:    bne 0, .LBB0_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 12, 12, 8
-; CHECK-NEXT:    or 12, 12, 3
-; CHECK-NEXT:    stwcx. 12, 0, 5
-; CHECK-NEXT:    bne 0, .LBB0_5
-; CHECK-NEXT:    b .LBB0_1
-; CHECK-NEXT:  .LBB0_7: # %atomicrmw.end
+; CHECK-NEXT:  .LBB0_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  .LBB0_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB0_5 Depth 2
+; CHECK-NEXT:    srw 9, 8, 3
+; CHECK-NEXT:    clrlwi  10, 9, 24
+; CHECK-NEXT:    cmplw   10, 7
+; CHECK-NEXT:    blt     0, .LBB0_4
+; CHECK-NEXT:  # %bb.3:                                #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    sub     9, 9, 4
+; CHECK-NEXT:  .LBB0_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    clrlwi  9, 9, 24
+; CHECK-NEXT:    slw 9, 9, 3
+; CHECK-NEXT:    and 10, 8, 6
+; CHECK-NEXT:    or 10, 10, 9
+; CHECK-NEXT:  .LBB0_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 9, 0, 5
+; CHECK-NEXT:    cmplw   9, 8
+; CHECK-NEXT:    bne     0, .LBB0_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_5 Depth=2
+; CHECK-NEXT:    stwcx. 10, 0, 5
+; CHECK-NEXT:    bne     0, .LBB0_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 9, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw usub_cond ptr %ptr, i8 %val seq_cst
@@ -58,50 +56,49 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_usub_cond_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    li 8, 0
-; CHECK-NEXT:    lhz 3, 0(3)
-; CHECK-NEXT:    rlwinm 7, 5, 3, 27, 27
-; CHECK-NEXT:    xori 7, 7, 16
-; CHECK-NEXT:    ori 8, 8, 65535
-; CHECK-NEXT:    clrlwi 6, 4, 16
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 8, 8, 7
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    clrlwi  3, 3, 30
+; CHECK-NEXT:    lis 6, 0
+; CHECK-NEXT:    xori 3, 3, 2
+; CHECK-NEXT:    lwz 8, 0(5)
+; CHECK-NEXT:    ori 6, 6, 65535
+; CHECK-NEXT:    slwi 3, 3, 3
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  7, 4, 16
 ; CHECK-NEXT:    b .LBB1_2
-; CHECK-NEXT:  .LBB1_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 11, 7
-; CHECK-NEXT:    cmplw 3, 9
-; CHECK-NEXT:    beq 0, .LBB1_7
-; CHECK-NEXT:  .LBB1_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB1_5 Depth 2
-; CHECK-NEXT:    clrlwi 9, 3, 16
-; CHECK-NEXT:    cmplw 9, 6
-; CHECK-NEXT:    blt 0, .LBB1_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:  .LBB1_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 3, 7
-; CHECK-NEXT:    slw 10, 9, 7
-; CHECK-NEXT:    and 3, 3, 8
-; CHECK-NEXT:    and 10, 10, 8
-; CHECK-NEXT:  .LBB1_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB1_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 12, 0, 5
-; CHECK-NEXT:    and 11, 12, 8
-; CHECK-NEXT:    cmpw 11, 10
-; CHECK-NEXT:    bne 0, .LBB1_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 12, 12, 8
-; CHECK-NEXT:    or 12, 12, 3
-; CHECK-NEXT:    stwcx. 12, 0, 5
-; CHECK-NEXT:    bne 0, .LBB1_5
-; CHECK-NEXT:    b .LBB1_1
-; CHECK-NEXT:  .LBB1_7: # %atomicrmw.end
+; CHECK-NEXT:  .LBB1_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  .LBB1_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB1_5 Depth 2
+; CHECK-NEXT:    srw 9, 8, 3
+; CHECK-NEXT:    clrlwi  10, 9, 16
+; CHECK-NEXT:    cmplw   10, 7
+; CHECK-NEXT:    blt     0, .LBB1_4
+; CHECK-NEXT:  # %bb.3:                                #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    sub     9, 9, 4
+; CHECK-NEXT:  .LBB1_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    clrlwi  9, 9, 16
+; CHECK-NEXT:    slw 9, 9, 3
+; CHECK-NEXT:    and 10, 8, 6
+; CHECK-NEXT:    or 10, 10, 9
+; CHECK-NEXT:  .LBB1_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB1_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 9, 0, 5
+; CHECK-NEXT:    cmplw   9, 8
+; CHECK-NEXT:    bne     0, .LBB1_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_5 Depth=2
+; CHECK-NEXT:    stwcx. 10, 0, 5
+; CHECK-NEXT:    bne     0, .LBB1_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 9, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw usub_cond ptr %ptr, i16 %val seq_cst
@@ -114,34 +111,33 @@ define i32 @atomicrmw_usub_cond_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    lwz 6, 0(3)
 ; CHECK-NEXT:    b .LBB2_2
-; CHECK-NEXT:  .LBB2_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB2_7
-; CHECK-NEXT:  .LBB2_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB2_5 Depth 2
-; CHECK-NEXT:    cmplw 6, 4
+; CHECK-NEXT:  .LBB2_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB2_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB2_5 Depth 2
+; CHECK-NEXT:    cmplw   6, 4
 ; CHECK-NEXT:    bge 0, .LBB2_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 7, 6
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    mr      7, 6
 ; CHECK-NEXT:    b .LBB2_5
-; CHECK-NEXT:  .LBB2_4:
-; CHECK-NEXT:    sub 7, 6, 4
-; CHECK-NEXT:  .LBB2_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB2_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB2_4:                                #   in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    sub     7, 6, 4
+; CHECK-NEXT:  .LBB2_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB2_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lwarx 5, 0, 3
-; CHECK-NEXT:    cmpw 5, 6
-; CHECK-NEXT:    bne 0, .LBB2_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmplw   5, 6
+; CHECK-NEXT:    bne     0, .LBB2_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_5 Depth=2
 ; CHECK-NEXT:    stwcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB2_5
-; CHECK-NEXT:    b .LBB2_1
-; CHECK-NEXT:  .LBB2_7: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB2_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -155,34 +151,33 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    ld 6, 0(3)
 ; CHECK-NEXT:    b .LBB3_2
-; CHECK-NEXT:  .LBB3_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB3_7
-; CHECK-NEXT:  .LBB3_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB3_5 Depth 2
-; CHECK-NEXT:    cmpld 6, 4
+; CHECK-NEXT:  .LBB3_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB3_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB3_5 Depth 2
+; CHECK-NEXT:    cmpld   6, 4
 ; CHECK-NEXT:    bge 0, .LBB3_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 7, 6
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    mr      7, 6
 ; CHECK-NEXT:    b .LBB3_5
-; CHECK-NEXT:  .LBB3_4:
-; CHECK-NEXT:    sub 7, 6, 4
-; CHECK-NEXT:  .LBB3_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB3_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB3_4:                                #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    sub     7, 6, 4
+; CHECK-NEXT:  .LBB3_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB3_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldarx 5, 0, 3
-; CHECK-NEXT:    cmpd 5, 6
-; CHECK-NEXT:    bne 0, .LBB3_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld   5, 6
+; CHECK-NEXT:    bne     0, .LBB3_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_5 Depth=2
 ; CHECK-NEXT:    stdcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB3_5
-; CHECK-NEXT:    b .LBB3_1
-; CHECK-NEXT:  .LBB3_7: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB3_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -194,52 +189,49 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_usub_sat_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    rlwinm 6, 5, 3, 27, 28
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    xori 6, 6, 24
-; CHECK-NEXT:    li 7, 255
-; CHECK-NEXT:    clrlwi 4, 4, 24
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 7, 7, 6
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    not     3, 3
+; CHECK-NEXT:    li 6, 255
+; CHECK-NEXT:    lwz 7, 0(5)
+; CHECK-NEXT:    rlwinm 3, 3, 3, 27, 28
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  4, 4, 24
 ; CHECK-NEXT:    b .LBB4_2
-; CHECK-NEXT:  .LBB4_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 10, 6
-; CHECK-NEXT:    cmplw 3, 8
-; CHECK-NEXT:    beq 0, .LBB4_7
-; CHECK-NEXT:  .LBB4_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB4_5 Depth 2
-; CHECK-NEXT:    clrlwi 8, 3, 24
-; CHECK-NEXT:    sub 3, 8, 4
-; CHECK-NEXT:    cmplw 3, 8
+; CHECK-NEXT:  .LBB4_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  .LBB4_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB4_5 Depth 2
+; CHECK-NEXT:    srw 8, 7, 3
+; CHECK-NEXT:    clrlwi  9, 8, 24
+; CHECK-NEXT:    sub     8, 9, 4
+; CHECK-NEXT:    cmplw   8, 9
 ; CHECK-NEXT:    li 9, 0
-; CHECK-NEXT:    bgt 0, .LBB4_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 9, 3
-; CHECK-NEXT:  .LBB4_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 9, 6
-; CHECK-NEXT:    slw 9, 8, 6
-; CHECK-NEXT:    and 3, 3, 7
-; CHECK-NEXT:    and 9, 9, 7
-; CHECK-NEXT:  .LBB4_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB4_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 11, 0, 5
-; CHECK-NEXT:    and 10, 11, 7
-; CHECK-NEXT:    cmpw 10, 9
-; CHECK-NEXT:    bne 0, .LBB4_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 11, 11, 7
-; CHECK-NEXT:    or 11, 11, 3
-; CHECK-NEXT:    stwcx. 11, 0, 5
-; CHECK-NEXT:    bne 0, .LBB4_5
-; CHECK-NEXT:    b .LBB4_1
-; CHECK-NEXT:  .LBB4_7: # %atomicrmw.end
+; CHECK-NEXT:    bgt     0, .LBB4_4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mr      9, 8
+; CHECK-NEXT:  .LBB4_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    slw 8, 9, 3
+; CHECK-NEXT:    and 9, 7, 6
+; CHECK-NEXT:    or 9, 9, 8
+; CHECK-NEXT:  .LBB4_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB4_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 8, 0, 5
+; CHECK-NEXT:    cmplw   8, 7
+; CHECK-NEXT:    bne     0, .LBB4_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_5 Depth=2
+; CHECK-NEXT:    stwcx. 9, 0, 5
+; CHECK-NEXT:    bne     0, .LBB4_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 8, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw usub_sat ptr %ptr, i8 %val seq_cst
@@ -250,53 +242,51 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_usub_sat_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    li 7, 0
-; CHECK-NEXT:    lhz 3, 0(3)
-; CHECK-NEXT:    rlwinm 6, 5, 3, 27, 27
-; CHECK-NEXT:    xori 6, 6, 16
-; CHECK-NEXT:    ori 7, 7, 65535
-; CHECK-NEXT:    clrlwi 4, 4, 16
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 7, 7, 6
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    clrlwi  3, 3, 30
+; CHECK-NEXT:    lis 6, 0
+; CHECK-NEXT:    xori 3, 3, 2
+; CHECK-NEXT:    lwz 7, 0(5)
+; CHECK-NEXT:    ori 6, 6, 65535
+; CHECK-NEXT:    slwi 3, 3, 3
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  4, 4, 16
 ; CHECK-NEXT:    b .LBB5_2
-; CHECK-NEXT:  .LBB5_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 10, 6
-; CHECK-NEXT:    cmplw 3, 8
-; CHECK-NEXT:    beq 0, .LBB5_7
-; CHECK-NEXT:  .LBB5_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB5_5 Depth 2
-; CHECK-NEXT:    clrlwi 8, 3, 16
-; CHECK-NEXT:    sub 3, 8, 4
-; CHECK-NEXT:    cmplw 3, 8
+; CHECK-NEXT:  .LBB5_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  .LBB5_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB5_5 Depth 2
+; CHECK-NEXT:    srw 8, 7, 3
+; CHECK-NEXT:    clrlwi  9, 8, 16
+; CHECK-NEXT:    sub     8, 9, 4
+; CHECK-NEXT:    cmplw   8, 9
 ; CHECK-NEXT:    li 9, 0
-; CHECK-NEXT:    bgt 0, .LBB5_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 9, 3
-; CHECK-NEXT:  .LBB5_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 9, 6
-; CHECK-NEXT:    slw 9, 8, 6
-; CHECK-NEXT:    and 3, 3, 7
-; CHECK-NEXT:    and 9, 9, 7
-; CHECK-NEXT:  .LBB5_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB5_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 11, 0, 5
-; CHECK-NEXT:    and 10, 11, 7
-; CHECK-NEXT:    cmpw 10, 9
-; CHECK-NEXT:    bne 0, .LBB5_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 11, 11, 7
-; CHECK-NEXT:    or 11, 11, 3
-; CHECK-NEXT:    stwcx. 11, 0, 5
-; CHECK-NEXT:    bne 0, .LBB5_5
-; CHECK-NEXT:    b .LBB5_1
-; CHECK-NEXT:  .LBB5_7: # %atomicrmw.end
+; CHECK-NEXT:    bgt     0, .LBB5_4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    mr      9, 8
+; CHECK-NEXT:  .LBB5_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    slw 8, 9, 3
+; CHECK-NEXT:    and 9, 7, 6
+; CHECK-NEXT:    or 9, 9, 8
+; CHECK-NEXT:  .LBB5_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB5_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 8, 0, 5
+; CHECK-NEXT:    cmplw   8, 7
+; CHECK-NEXT:    bne     0, .LBB5_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_5 Depth=2
+; CHECK-NEXT:    stwcx. 9, 0, 5
+; CHECK-NEXT:    bne     0, .LBB5_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 8, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw usub_sat ptr %ptr, i16 %val seq_cst
@@ -309,33 +299,32 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    lwz 6, 0(3)
 ; CHECK-NEXT:    b .LBB6_2
-; CHECK-NEXT:  .LBB6_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB6_6
-; CHECK-NEXT:  .LBB6_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB6_4 Depth 2
-; CHECK-NEXT:    sub 5, 6, 4
-; CHECK-NEXT:    cmplw 5, 6
+; CHECK-NEXT:  .LBB6_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB6_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB6_4 Depth 2
+; CHECK-NEXT:    sub     5, 6, 4
+; CHECK-NEXT:    cmplw   5, 6
 ; CHECK-NEXT:    li 7, 0
-; CHECK-NEXT:    bgt 0, .LBB6_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 7, 5
-; CHECK-NEXT:  .LBB6_4: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB6_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    bgt     0, .LBB6_4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
+; CHECK-NEXT:    mr      7, 5
+; CHECK-NEXT:  .LBB6_4:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB6_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lwarx 5, 0, 3
-; CHECK-NEXT:    cmpw 5, 6
-; CHECK-NEXT:    bne 0, .LBB6_1
-; CHECK-NEXT:  # %bb.5: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmplw   5, 6
+; CHECK-NEXT:    bne     0, .LBB6_1
+; CHECK-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_4 Depth=2
 ; CHECK-NEXT:    stwcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB6_4
-; CHECK-NEXT:    b .LBB6_1
-; CHECK-NEXT:  .LBB6_6: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB6_4
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.7:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -349,33 +338,32 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    ld 6, 0(3)
 ; CHECK-NEXT:    b .LBB7_2
-; CHECK-NEXT:  .LBB7_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB7_6
-; CHECK-NEXT:  .LBB7_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB7_4 Depth 2
-; CHECK-NEXT:    subc 5, 6, 4
+; CHECK-NEXT:  .LBB7_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB7_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB7_4 Depth 2
+; CHECK-NEXT:    subc    5, 6, 4
 ; CHECK-NEXT:    li 7, 0
 ; CHECK-NEXT:    addze. 8, 7
-; CHECK-NEXT:    beq 0, .LBB7_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 7, 5
-; CHECK-NEXT:  .LBB7_4: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB7_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    beq     0, .LBB7_4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    mr      7, 5
+; CHECK-NEXT:  .LBB7_4:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB7_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldarx 5, 0, 3
-; CHECK-NEXT:    cmpd 5, 6
-; CHECK-NEXT:    bne 0, .LBB7_1
-; CHECK-NEXT:  # %bb.5: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld   5, 6
+; CHECK-NEXT:    bne     0, .LBB7_1
+; CHECK-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_4 Depth=2
 ; CHECK-NEXT:    stdcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB7_4
-; CHECK-NEXT:    b .LBB7_1
-; CHECK-NEXT:  .LBB7_6: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB7_4
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.7:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll
index 2882dc420b60..6ced47bd6bcb 100644
--- a/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll
@@ -5,51 +5,49 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_uinc_wrap_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    rlwinm 6, 5, 3, 27, 28
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    xori 6, 6, 24
-; CHECK-NEXT:    li 7, 255
-; CHECK-NEXT:    clrlwi 4, 4, 24
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 7, 7, 6
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    not     3, 3
+; CHECK-NEXT:    li 6, 255
+; CHECK-NEXT:    lwz 7, 0(5)
+; CHECK-NEXT:    rlwinm 3, 3, 3, 27, 28
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  4, 4, 24
 ; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 10, 6
-; CHECK-NEXT:    cmplw 3, 8
-; CHECK-NEXT:    beq 0, .LBB0_7
-; CHECK-NEXT:  .LBB0_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB0_5 Depth 2
-; CHECK-NEXT:    clrlwi 8, 3, 24
-; CHECK-NEXT:    cmplw 8, 4
+; CHECK-NEXT:  .LBB0_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  .LBB0_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB0_5 Depth 2
+; CHECK-NEXT:    srw 8, 7, 3
+; CHECK-NEXT:    clrlwi  9, 8, 24
+; CHECK-NEXT:    cmplw   9, 4
 ; CHECK-NEXT:    li 9, 0
 ; CHECK-NEXT:    bge 0, .LBB0_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 9, 3, 1
-; CHECK-NEXT:  .LBB0_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 9, 6
-; CHECK-NEXT:    slw 9, 8, 6
-; CHECK-NEXT:    and 3, 3, 7
-; CHECK-NEXT:    and 9, 9, 7
-; CHECK-NEXT:  .LBB0_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB0_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 11, 0, 5
-; CHECK-NEXT:    and 10, 11, 7
-; CHECK-NEXT:    cmpw 10, 9
-; CHECK-NEXT:    bne 0, .LBB0_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 11, 11, 7
-; CHECK-NEXT:    or 11, 11, 3
-; CHECK-NEXT:    stwcx. 11, 0, 5
-; CHECK-NEXT:    bne 0, .LBB0_5
-; CHECK-NEXT:    b .LBB0_1
-; CHECK-NEXT:  .LBB0_7: # %atomicrmw.end
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    addi 9, 8, 1
+; CHECK-NEXT:  .LBB0_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    clrlwi  8, 9, 24
+; CHECK-NEXT:    slw 8, 8, 3
+; CHECK-NEXT:    and 9, 7, 6
+; CHECK-NEXT:    or 9, 9, 8
+; CHECK-NEXT:  .LBB0_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 8, 0, 5
+; CHECK-NEXT:    cmplw   8, 7
+; CHECK-NEXT:    bne     0, .LBB0_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_5 Depth=2
+; CHECK-NEXT:    stwcx. 9, 0, 5
+; CHECK-NEXT:    bne     0, .LBB0_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 8, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
@@ -60,52 +58,51 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_uinc_wrap_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    li 7, 0
-; CHECK-NEXT:    lhz 3, 0(3)
-; CHECK-NEXT:    rlwinm 6, 5, 3, 27, 27
-; CHECK-NEXT:    xori 6, 6, 16
-; CHECK-NEXT:    ori 7, 7, 65535
-; CHECK-NEXT:    clrlwi 4, 4, 16
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 7, 7, 6
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    clrlwi  3, 3, 30
+; CHECK-NEXT:    lis 6, 0
+; CHECK-NEXT:    xori 3, 3, 2
+; CHECK-NEXT:    lwz 7, 0(5)
+; CHECK-NEXT:    ori 6, 6, 65535
+; CHECK-NEXT:    slwi 3, 3, 3
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  4, 4, 16
 ; CHECK-NEXT:    b .LBB1_2
-; CHECK-NEXT:  .LBB1_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 10, 6
-; CHECK-NEXT:    cmplw 3, 8
-; CHECK-NEXT:    beq 0, .LBB1_7
-; CHECK-NEXT:  .LBB1_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB1_5 Depth 2
-; CHECK-NEXT:    clrlwi 8, 3, 16
-; CHECK-NEXT:    cmplw 8, 4
+; CHECK-NEXT:  .LBB1_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  .LBB1_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB1_5 Depth 2
+; CHECK-NEXT:    srw 8, 7, 3
+; CHECK-NEXT:    clrlwi  9, 8, 16
+; CHECK-NEXT:    cmplw   9, 4
 ; CHECK-NEXT:    li 9, 0
 ; CHECK-NEXT:    bge 0, .LBB1_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 9, 3, 1
-; CHECK-NEXT:  .LBB1_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 9, 6
-; CHECK-NEXT:    slw 9, 8, 6
-; CHECK-NEXT:    and 3, 3, 7
-; CHECK-NEXT:    and 9, 9, 7
-; CHECK-NEXT:  .LBB1_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB1_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 11, 0, 5
-; CHECK-NEXT:    and 10, 11, 7
-; CHECK-NEXT:    cmpw 10, 9
-; CHECK-NEXT:    bne 0, .LBB1_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 11, 11, 7
-; CHECK-NEXT:    or 11, 11, 3
-; CHECK-NEXT:    stwcx. 11, 0, 5
-; CHECK-NEXT:    bne 0, .LBB1_5
-; CHECK-NEXT:    b .LBB1_1
-; CHECK-NEXT:  .LBB1_7: # %atomicrmw.end
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    addi 9, 8, 1
+; CHECK-NEXT:  .LBB1_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    clrlwi  8, 9, 16
+; CHECK-NEXT:    slw 8, 8, 3
+; CHECK-NEXT:    and 9, 7, 6
+; CHECK-NEXT:    or 9, 9, 8
+; CHECK-NEXT:  .LBB1_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB1_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 8, 0, 5
+; CHECK-NEXT:    cmplw   8, 7
+; CHECK-NEXT:    bne     0, .LBB1_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_5 Depth=2
+; CHECK-NEXT:    stwcx. 9, 0, 5
+; CHECK-NEXT:    bne     0, .LBB1_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 8, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
@@ -118,32 +115,31 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    lwz 6, 0(3)
 ; CHECK-NEXT:    b .LBB2_2
-; CHECK-NEXT:  .LBB2_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB2_6
-; CHECK-NEXT:  .LBB2_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB2_4 Depth 2
-; CHECK-NEXT:    cmplw 6, 4
+; CHECK-NEXT:  .LBB2_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB2_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB2_4 Depth 2
+; CHECK-NEXT:    cmplw   6, 4
 ; CHECK-NEXT:    li 7, 0
 ; CHECK-NEXT:    bge 0, .LBB2_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_2 Depth=1
 ; CHECK-NEXT:    addi 7, 6, 1
-; CHECK-NEXT:  .LBB2_4: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB2_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB2_4:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB2_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lwarx 5, 0, 3
-; CHECK-NEXT:    cmpw 5, 6
-; CHECK-NEXT:    bne 0, .LBB2_1
-; CHECK-NEXT:  # %bb.5: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmplw   5, 6
+; CHECK-NEXT:    bne     0, .LBB2_1
+; CHECK-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_4 Depth=2
 ; CHECK-NEXT:    stwcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB2_4
-; CHECK-NEXT:    b .LBB2_1
-; CHECK-NEXT:  .LBB2_6: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB2_4
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.7:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -157,32 +153,31 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    ld 6, 0(3)
 ; CHECK-NEXT:    b .LBB3_2
-; CHECK-NEXT:  .LBB3_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB3_6
-; CHECK-NEXT:  .LBB3_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB3_4 Depth 2
-; CHECK-NEXT:    cmpld 6, 4
+; CHECK-NEXT:  .LBB3_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB3_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB3_4 Depth 2
+; CHECK-NEXT:    cmpld   6, 4
 ; CHECK-NEXT:    li 7, 0
 ; CHECK-NEXT:    bge 0, .LBB3_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
 ; CHECK-NEXT:    addi 7, 6, 1
-; CHECK-NEXT:  .LBB3_4: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB3_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB3_4:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB3_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldarx 5, 0, 3
-; CHECK-NEXT:    cmpd 5, 6
-; CHECK-NEXT:    bne 0, .LBB3_1
-; CHECK-NEXT:  # %bb.5: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld   5, 6
+; CHECK-NEXT:    bne     0, .LBB3_1
+; CHECK-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_4 Depth=2
 ; CHECK-NEXT:    stdcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB3_4
-; CHECK-NEXT:    b .LBB3_1
-; CHECK-NEXT:  .LBB3_6: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB3_4
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.7:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -194,52 +189,50 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_udec_wrap_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    rlwinm 7, 5, 3, 27, 28
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    xori 7, 7, 24
-; CHECK-NEXT:    li 8, 255
-; CHECK-NEXT:    clrlwi 6, 4, 24
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 8, 8, 7
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    not     3, 3
+; CHECK-NEXT:    li 6, 255
+; CHECK-NEXT:    lwz 8, 0(5)
+; CHECK-NEXT:    rlwinm 3, 3, 3, 27, 28
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  7, 4, 24
 ; CHECK-NEXT:    b .LBB4_2
-; CHECK-NEXT:  .LBB4_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 11, 7
-; CHECK-NEXT:    cmplw 3, 9
-; CHECK-NEXT:    beq 0, .LBB4_7
-; CHECK-NEXT:  .LBB4_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB4_5 Depth 2
-; CHECK-NEXT:    andi. 9, 3, 255
-; CHECK-NEXT:    cmplw 1, 9, 6
+; CHECK-NEXT:  .LBB4_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  .LBB4_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB4_5 Depth 2
+; CHECK-NEXT:    srw 9, 8, 3
+; CHECK-NEXT:    andi. 10, 9, 255
+; CHECK-NEXT:    cmplw 1, 10, 7
 ; CHECK-NEXT:    cror 20, 2, 5
-; CHECK-NEXT:    mr 10, 4
+; CHECK-NEXT:    mr      10, 4
 ; CHECK-NEXT:    bc 12, 20, .LBB4_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 10, 3, -1
-; CHECK-NEXT:  .LBB4_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 10, 7
-; CHECK-NEXT:    slw 10, 9, 7
-; CHECK-NEXT:    and 3, 3, 8
-; CHECK-NEXT:    and 10, 10, 8
-; CHECK-NEXT:  .LBB4_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB4_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 12, 0, 5
-; CHECK-NEXT:    and 11, 12, 8
-; CHECK-NEXT:    cmpw 11, 10
-; CHECK-NEXT:    bne 0, .LBB4_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 12, 12, 8
-; CHECK-NEXT:    or 12, 12, 3
-; CHECK-NEXT:    stwcx. 12, 0, 5
-; CHECK-NEXT:    bne 0, .LBB4_5
-; CHECK-NEXT:    b .LBB4_1
-; CHECK-NEXT:  .LBB4_7: # %atomicrmw.end
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    addi 10, 9, -1
+; CHECK-NEXT:  .LBB4_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    clrlwi  9, 10, 24
+; CHECK-NEXT:    slw 9, 9, 3
+; CHECK-NEXT:    and 10, 8, 6
+; CHECK-NEXT:    or 10, 10, 9
+; CHECK-NEXT:  .LBB4_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB4_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 9, 0, 5
+; CHECK-NEXT:    cmplw   9, 8
+; CHECK-NEXT:    bne     0, .LBB4_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_5 Depth=2
+; CHECK-NEXT:    stwcx. 10, 0, 5
+; CHECK-NEXT:    bne     0, .LBB4_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 9, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
@@ -250,53 +243,52 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_udec_wrap_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    li 8, 0
-; CHECK-NEXT:    lhz 3, 0(3)
-; CHECK-NEXT:    rlwinm 7, 5, 3, 27, 27
-; CHECK-NEXT:    xori 7, 7, 16
-; CHECK-NEXT:    ori 8, 8, 65535
-; CHECK-NEXT:    clrlwi 6, 4, 16
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 8, 8, 7
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    clrlwi  3, 3, 30
+; CHECK-NEXT:    lis 6, 0
+; CHECK-NEXT:    xori 3, 3, 2
+; CHECK-NEXT:    lwz 8, 0(5)
+; CHECK-NEXT:    ori 6, 6, 65535
+; CHECK-NEXT:    slwi 3, 3, 3
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  7, 4, 16
 ; CHECK-NEXT:    b .LBB5_2
-; CHECK-NEXT:  .LBB5_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 11, 7
-; CHECK-NEXT:    cmplw 3, 9
-; CHECK-NEXT:    beq 0, .LBB5_7
-; CHECK-NEXT:  .LBB5_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB5_5 Depth 2
-; CHECK-NEXT:    andi. 9, 3, 65535
-; CHECK-NEXT:    cmplw 1, 9, 6
+; CHECK-NEXT:  .LBB5_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  .LBB5_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB5_5 Depth 2
+; CHECK-NEXT:    srw 9, 8, 3
+; CHECK-NEXT:    andi. 10, 9, 65535
+; CHECK-NEXT:    cmplw 1, 10, 7
 ; CHECK-NEXT:    cror 20, 2, 5
-; CHECK-NEXT:    mr 10, 4
+; CHECK-NEXT:    mr      10, 4
 ; CHECK-NEXT:    bc 12, 20, .LBB5_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 10, 3, -1
-; CHECK-NEXT:  .LBB5_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 10, 7
-; CHECK-NEXT:    slw 10, 9, 7
-; CHECK-NEXT:    and 3, 3, 8
-; CHECK-NEXT:    and 10, 10, 8
-; CHECK-NEXT:  .LBB5_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB5_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 12, 0, 5
-; CHECK-NEXT:    and 11, 12, 8
-; CHECK-NEXT:    cmpw 11, 10
-; CHECK-NEXT:    bne 0, .LBB5_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 12, 12, 8
-; CHECK-NEXT:    or 12, 12, 3
-; CHECK-NEXT:    stwcx. 12, 0, 5
-; CHECK-NEXT:    bne 0, .LBB5_5
-; CHECK-NEXT:    b .LBB5_1
-; CHECK-NEXT:  .LBB5_7: # %atomicrmw.end
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    addi 10, 9, -1
+; CHECK-NEXT:  .LBB5_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    clrlwi  9, 10, 16
+; CHECK-NEXT:    slw 9, 9, 3
+; CHECK-NEXT:    and 10, 8, 6
+; CHECK-NEXT:    or 10, 10, 9
+; CHECK-NEXT:  .LBB5_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB5_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 9, 0, 5
+; CHECK-NEXT:    cmplw   9, 8
+; CHECK-NEXT:    bne     0, .LBB5_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_5 Depth=2
+; CHECK-NEXT:    stwcx. 10, 0, 5
+; CHECK-NEXT:    bne     0, .LBB5_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 9, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
@@ -309,37 +301,36 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    lwz 6, 0(3)
 ; CHECK-NEXT:    b .LBB6_2
-; CHECK-NEXT:  .LBB6_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB6_7
-; CHECK-NEXT:  .LBB6_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB6_5 Depth 2
-; CHECK-NEXT:    cmpwi 6, 0
-; CHECK-NEXT:    mr 7, 4
+; CHECK-NEXT:  .LBB6_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB6_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB6_5 Depth 2
+; CHECK-NEXT:    cmpwi   6, 0
+; CHECK-NEXT:    mr      7, 4
 ; CHECK-NEXT:    bc 12, 2, .LBB6_5
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 6, 4
-; CHECK-NEXT:    mr 7, 4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
+; CHECK-NEXT:    cmplw   6, 4
+; CHECK-NEXT:    mr      7, 4
 ; CHECK-NEXT:    bc 12, 1, .LBB6_5
-; CHECK-NEXT:  # %bb.4: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:  # %bb.4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
 ; CHECK-NEXT:    addi 7, 6, -1
-; CHECK-NEXT:  .LBB6_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB6_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB6_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB6_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lwarx 5, 0, 3
-; CHECK-NEXT:    cmpw 5, 6
-; CHECK-NEXT:    bne 0, .LBB6_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmplw   5, 6
+; CHECK-NEXT:    bne     0, .LBB6_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_5 Depth=2
 ; CHECK-NEXT:    stwcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB6_5
-; CHECK-NEXT:    b .LBB6_1
-; CHECK-NEXT:  .LBB6_7: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB6_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -353,38 +344,37 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    ld 6, 0(3)
 ; CHECK-NEXT:    b .LBB7_2
-; CHECK-NEXT:  .LBB7_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB7_7
-; CHECK-NEXT:  .LBB7_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB7_5 Depth 2
-; CHECK-NEXT:    cmpdi 6, 0
-; CHECK-NEXT:    mr 7, 4
+; CHECK-NEXT:  .LBB7_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB7_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB7_5 Depth 2
+; CHECK-NEXT:    cmpdi   6, 0
+; CHECK-NEXT:    mr      7, 4
 ; CHECK-NEXT:    bc 12, 2, .LBB7_5
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 6, 4
-; CHECK-NEXT:    mr 7, 4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    cmpld   6, 4
+; CHECK-NEXT:    mr      7, 4
 ; CHECK-NEXT:    bc 12, 1, .LBB7_5
-; CHECK-NEXT:  # %bb.4: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:  # %bb.4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
 ; CHECK-NEXT:    addi 7, 6, -1
-; CHECK-NEXT:  .LBB7_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB7_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB7_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB7_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldarx 5, 0, 3
-; CHECK-NEXT:    cmpd 5, 6
-; CHECK-NEXT:    bne 0, .LBB7_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld   5, 6
+; CHECK-NEXT:    bne     0, .LBB7_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_5 Depth=2
 ; CHECK-NEXT:    stdcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB7_5
-; CHECK-NEXT:    b .LBB7_1
-; CHECK-NEXT:  .LBB7_7: # %atomicrmw.end
-; CHECK-NEXT:    mr 3, 5
+; CHECK-NEXT:    bne     0, .LBB7_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    mr      3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw udec_wrap ptr %ptr, i64 %val seq_cst
diff --git a/llvm/test/CodeGen/PowerPC/atomics-regression.ll b/llvm/test/CodeGen/PowerPC/atomics-regression.ll
index b31be701454d..280c4299c30b 100644
--- a/llvm/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics-regression.ll
@@ -400,15 +400,20 @@ define void @test39() {
 define void @test40(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test40:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB40_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB40_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB40_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val monotonic monotonic
   ret void
@@ -417,15 +422,20 @@ define void @test40(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test41(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test41:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB41_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB41_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB41_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB41_1
-; PPC64LE-NEXT:  .LBB41_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire monotonic
@@ -435,15 +445,20 @@ define void @test41(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test42(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test42:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB42_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB42_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB42_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB42_1
-; PPC64LE-NEXT:  .LBB42_3:
+; PPC64LE-NEXT:  .LBB42_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire acquire
@@ -452,17 +467,26 @@ define void @test42(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test43(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test43:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB43_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB43_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB43_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB43_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val release monotonic
   ret void
@@ -470,17 +494,27 @@ define void @test43(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test44(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test44:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB44_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB44_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB44_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB44_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB44_1
-; PPC64LE-NEXT:  .LBB44_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB44_2
+; PPC64LE-NEXT:  .LBB44_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val release acquire
@@ -489,17 +523,29 @@ define void @test44(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test45(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test45:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB45_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB45_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB45_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB45_1
-; PPC64LE-NEXT:  .LBB45_3:
+; PPC64LE-NEXT:    beq 0, .LBB45_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB45_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB45_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acq_rel monotonic
@@ -508,17 +554,27 @@ define void @test45(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test46(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test46:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB46_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB46_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB46_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB46_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB46_1
-; PPC64LE-NEXT:  .LBB46_3:
+; PPC64LE-NEXT:    beq 0, .LBB46_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB46_2
+; PPC64LE-NEXT:  .LBB46_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acq_rel acquire
@@ -527,17 +583,29 @@ define void @test46(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test47(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test47:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB47_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB47_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB47_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB47_1
-; PPC64LE-NEXT:  .LBB47_3:
+; PPC64LE-NEXT:    beq 0, .LBB47_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB47_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB47_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val seq_cst monotonic
@@ -546,17 +614,27 @@ define void @test47(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test48(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test48:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB48_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB48_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB48_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB48_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB48_1
-; PPC64LE-NEXT:  .LBB48_3:
+; PPC64LE-NEXT:    beq 0, .LBB48_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB48_2
+; PPC64LE-NEXT:  .LBB48_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val seq_cst acquire
@@ -565,17 +643,27 @@ define void @test48(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test49(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test49:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB49_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB49_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB49_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB49_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB49_1
-; PPC64LE-NEXT:  .LBB49_3:
+; PPC64LE-NEXT:    beq 0, .LBB49_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB49_2
+; PPC64LE-NEXT:  .LBB49_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val seq_cst seq_cst
@@ -585,15 +673,20 @@ define void @test49(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test50(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test50:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB50_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB50_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB50_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val monotonic monotonic
   ret void
@@ -602,15 +695,20 @@ define void @test50(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test51(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test51:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB51_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB51_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB51_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB51_1
-; PPC64LE-NEXT:  .LBB51_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire monotonic
@@ -620,15 +718,20 @@ define void @test51(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test52(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test52:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB52_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB52_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB52_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB52_1
-; PPC64LE-NEXT:  .LBB52_3:
+; PPC64LE-NEXT:  .LBB52_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire acquire
@@ -637,17 +740,26 @@ define void @test52(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test53(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test53:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB53_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB53_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB53_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB53_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val release monotonic
   ret void
@@ -655,17 +767,27 @@ define void @test53(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test54(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test54:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB54_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB54_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB54_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB54_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB54_1
-; PPC64LE-NEXT:  .LBB54_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB54_2
+; PPC64LE-NEXT:  .LBB54_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val release acquire
@@ -674,17 +796,29 @@ define void @test54(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test55(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test55:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB55_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB55_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB55_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB55_1
-; PPC64LE-NEXT:  .LBB55_3:
+; PPC64LE-NEXT:    beq 0, .LBB55_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB55_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB55_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acq_rel monotonic
@@ -693,17 +827,27 @@ define void @test55(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test56(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test56:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB56_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB56_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB56_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB56_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB56_1
-; PPC64LE-NEXT:  .LBB56_3:
+; PPC64LE-NEXT:    beq 0, .LBB56_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB56_2
+; PPC64LE-NEXT:  .LBB56_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acq_rel acquire
@@ -712,17 +856,29 @@ define void @test56(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test57(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test57:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB57_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB57_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB57_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB57_1
-; PPC64LE-NEXT:  .LBB57_3:
+; PPC64LE-NEXT:    beq 0, .LBB57_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB57_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB57_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val seq_cst monotonic
@@ -731,17 +887,27 @@ define void @test57(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test58(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test58:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB58_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB58_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB58_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB58_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB58_1
-; PPC64LE-NEXT:  .LBB58_3:
+; PPC64LE-NEXT:    beq 0, .LBB58_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB58_2
+; PPC64LE-NEXT:  .LBB58_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val seq_cst acquire
@@ -750,17 +916,27 @@ define void @test58(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test59(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test59:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB59_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB59_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB59_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB59_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB59_1
-; PPC64LE-NEXT:  .LBB59_3:
+; PPC64LE-NEXT:    beq 0, .LBB59_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB59_2
+; PPC64LE-NEXT:  .LBB59_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val seq_cst seq_cst
@@ -770,14 +946,17 @@ define void @test59(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test60(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test60:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB60_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB60_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB60_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic
   ret void
@@ -786,14 +965,17 @@ define void @test60(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test61(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test61:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB61_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB61_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB61_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB61_1
-; PPC64LE-NEXT:  .LBB61_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire monotonic
@@ -803,14 +985,17 @@ define void @test61(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test62(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test62:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB62_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB62_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB62_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB62_1
-; PPC64LE-NEXT:  .LBB62_3:
+; PPC64LE-NEXT:  .LBB62_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire
@@ -819,16 +1004,22 @@ define void @test62(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test63(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test63:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB63_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB63_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB63_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB63_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val release monotonic
   ret void
@@ -836,16 +1027,23 @@ define void @test63(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test64(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test64:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB64_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB64_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB64_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB64_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB64_1
-; PPC64LE-NEXT:  .LBB64_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB64_2
+; PPC64LE-NEXT:  .LBB64_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val release acquire
@@ -854,16 +1052,25 @@ define void @test64(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test65(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test65:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB65_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB65_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB65_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB65_1
-; PPC64LE-NEXT:  .LBB65_3:
+; PPC64LE-NEXT:    beq 0, .LBB65_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB65_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB65_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acq_rel monotonic
@@ -872,16 +1079,23 @@ define void @test65(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test66(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test66:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB66_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB66_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB66_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB66_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB66_1
-; PPC64LE-NEXT:  .LBB66_3:
+; PPC64LE-NEXT:    beq 0, .LBB66_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB66_2
+; PPC64LE-NEXT:  .LBB66_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acq_rel acquire
@@ -890,16 +1104,25 @@ define void @test66(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test67(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test67:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB67_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB67_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB67_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB67_1
-; PPC64LE-NEXT:  .LBB67_3:
+; PPC64LE-NEXT:    beq 0, .LBB67_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB67_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB67_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val seq_cst monotonic
@@ -908,16 +1131,23 @@ define void @test67(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test68(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test68:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB68_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB68_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB68_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB68_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB68_1
-; PPC64LE-NEXT:  .LBB68_3:
+; PPC64LE-NEXT:    beq 0, .LBB68_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB68_2
+; PPC64LE-NEXT:  .LBB68_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val seq_cst acquire
@@ -926,16 +1156,23 @@ define void @test68(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test69(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test69:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB69_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB69_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB69_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB69_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB69_1
-; PPC64LE-NEXT:  .LBB69_3:
+; PPC64LE-NEXT:    beq 0, .LBB69_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB69_2
+; PPC64LE-NEXT:  .LBB69_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val seq_cst seq_cst
@@ -945,14 +1182,17 @@ define void @test69(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test70(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test70:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB70_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB70_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB70_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val monotonic monotonic
   ret void
@@ -961,14 +1201,17 @@ define void @test70(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test71(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test71:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB71_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB71_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB71_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB71_1
-; PPC64LE-NEXT:  .LBB71_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire monotonic
@@ -978,14 +1221,17 @@ define void @test71(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test72(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test72:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB72_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB72_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB72_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB72_1
-; PPC64LE-NEXT:  .LBB72_3:
+; PPC64LE-NEXT:  .LBB72_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire acquire
@@ -994,16 +1240,22 @@ define void @test72(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test73(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test73:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB73_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB73_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB73_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB73_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val release monotonic
   ret void
@@ -1011,16 +1263,23 @@ define void @test73(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test74(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test74:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB74_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB74_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB74_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB74_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB74_1
-; PPC64LE-NEXT:  .LBB74_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB74_2
+; PPC64LE-NEXT:  .LBB74_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val release acquire
@@ -1029,16 +1288,25 @@ define void @test74(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test75(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test75:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB75_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB75_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB75_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB75_1
-; PPC64LE-NEXT:  .LBB75_3:
+; PPC64LE-NEXT:    beq 0, .LBB75_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB75_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB75_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acq_rel monotonic
@@ -1047,16 +1315,23 @@ define void @test75(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test76(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test76:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB76_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB76_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB76_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB76_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB76_1
-; PPC64LE-NEXT:  .LBB76_3:
+; PPC64LE-NEXT:    beq 0, .LBB76_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB76_2
+; PPC64LE-NEXT:  .LBB76_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acq_rel acquire
@@ -1065,16 +1340,25 @@ define void @test76(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test77(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test77:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB77_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB77_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB77_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB77_1
-; PPC64LE-NEXT:  .LBB77_3:
+; PPC64LE-NEXT:    beq 0, .LBB77_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB77_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB77_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val seq_cst monotonic
@@ -1083,16 +1367,23 @@ define void @test77(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test78(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test78:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB78_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB78_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB78_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB78_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB78_1
-; PPC64LE-NEXT:  .LBB78_3:
+; PPC64LE-NEXT:    beq 0, .LBB78_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB78_2
+; PPC64LE-NEXT:  .LBB78_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val seq_cst acquire
@@ -1101,16 +1392,23 @@ define void @test78(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test79(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test79:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB79_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB79_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB79_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB79_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB79_1
-; PPC64LE-NEXT:  .LBB79_3:
+; PPC64LE-NEXT:    beq 0, .LBB79_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB79_2
+; PPC64LE-NEXT:  .LBB79_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val seq_cst seq_cst
@@ -1120,15 +1418,20 @@ define void @test79(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test80(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test80:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB80_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB80_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB80_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") monotonic monotonic
   ret void
@@ -1137,15 +1440,20 @@ define void @test80(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test81(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test81:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB81_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB81_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB81_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB81_1
-; PPC64LE-NEXT:  .LBB81_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire monotonic
@@ -1155,15 +1463,20 @@ define void @test81(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test82(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test82:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB82_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB82_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB82_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB82_1
-; PPC64LE-NEXT:  .LBB82_3:
+; PPC64LE-NEXT:  .LBB82_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire acquire
@@ -1172,17 +1485,26 @@ define void @test82(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test83(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test83:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB83_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB83_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB83_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB83_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") release monotonic
   ret void
@@ -1190,17 +1512,27 @@ define void @test83(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test84(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test84:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB84_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB84_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB84_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB84_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB84_1
-; PPC64LE-NEXT:  .LBB84_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB84_2
+; PPC64LE-NEXT:  .LBB84_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") release acquire
@@ -1209,17 +1541,29 @@ define void @test84(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test85(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test85:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB85_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB85_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB85_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB85_1
-; PPC64LE-NEXT:  .LBB85_3:
+; PPC64LE-NEXT:    beq 0, .LBB85_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB85_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB85_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel monotonic
@@ -1228,17 +1572,27 @@ define void @test85(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test86(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test86:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB86_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB86_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB86_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB86_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB86_1
-; PPC64LE-NEXT:  .LBB86_3:
+; PPC64LE-NEXT:    beq 0, .LBB86_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB86_2
+; PPC64LE-NEXT:  .LBB86_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel acquire
@@ -1247,17 +1601,29 @@ define void @test86(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test87(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test87:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB87_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB87_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB87_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB87_1
-; PPC64LE-NEXT:  .LBB87_3:
+; PPC64LE-NEXT:    beq 0, .LBB87_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB87_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB87_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst monotonic
@@ -1266,17 +1632,27 @@ define void @test87(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test88(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test88:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB88_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB88_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB88_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB88_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB88_1
-; PPC64LE-NEXT:  .LBB88_3:
+; PPC64LE-NEXT:    beq 0, .LBB88_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB88_2
+; PPC64LE-NEXT:  .LBB88_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst acquire
@@ -1285,17 +1661,27 @@ define void @test88(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test89(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test89:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB89_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB89_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB89_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB89_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB89_1
-; PPC64LE-NEXT:  .LBB89_3:
+; PPC64LE-NEXT:    beq 0, .LBB89_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB89_2
+; PPC64LE-NEXT:  .LBB89_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst seq_cst
@@ -1305,15 +1691,20 @@ define void @test89(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test90(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test90:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB90_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB90_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB90_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") monotonic monotonic
   ret void
@@ -1322,15 +1713,20 @@ define void @test90(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test91(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test91:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB91_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB91_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB91_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB91_1
-; PPC64LE-NEXT:  .LBB91_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire monotonic
@@ -1340,15 +1736,20 @@ define void @test91(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test92(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test92:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB92_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB92_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB92_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB92_1
-; PPC64LE-NEXT:  .LBB92_3:
+; PPC64LE-NEXT:  .LBB92_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire acquire
@@ -1357,17 +1758,26 @@ define void @test92(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test93(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test93:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB93_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB93_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB93_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB93_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") release monotonic
   ret void
@@ -1375,17 +1785,27 @@ define void @test93(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test94(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test94:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB94_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB94_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB94_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB94_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB94_1
-; PPC64LE-NEXT:  .LBB94_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB94_2
+; PPC64LE-NEXT:  .LBB94_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") release acquire
@@ -1394,17 +1814,29 @@ define void @test94(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test95(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test95:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB95_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB95_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB95_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB95_1
-; PPC64LE-NEXT:  .LBB95_3:
+; PPC64LE-NEXT:    beq 0, .LBB95_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB95_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB95_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel monotonic
@@ -1413,17 +1845,27 @@ define void @test95(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test96(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test96:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB96_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB96_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB96_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB96_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB96_1
-; PPC64LE-NEXT:  .LBB96_3:
+; PPC64LE-NEXT:    beq 0, .LBB96_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB96_2
+; PPC64LE-NEXT:  .LBB96_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel acquire
@@ -1432,17 +1874,29 @@ define void @test96(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test97(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test97:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB97_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB97_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB97_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB97_1
-; PPC64LE-NEXT:  .LBB97_3:
+; PPC64LE-NEXT:    beq 0, .LBB97_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB97_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB97_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst monotonic
@@ -1451,17 +1905,27 @@ define void @test97(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test98(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test98:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB98_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB98_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB98_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB98_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB98_1
-; PPC64LE-NEXT:  .LBB98_3:
+; PPC64LE-NEXT:    beq 0, .LBB98_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB98_2
+; PPC64LE-NEXT:  .LBB98_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst acquire
@@ -1470,17 +1934,27 @@ define void @test98(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test99(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test99:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB99_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB99_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB99_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB99_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB99_1
-; PPC64LE-NEXT:  .LBB99_3:
+; PPC64LE-NEXT:    beq 0, .LBB99_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB99_2
+; PPC64LE-NEXT:  .LBB99_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst seq_cst
@@ -1490,14 +1964,17 @@ define void @test99(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test100(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test100:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB100_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB100_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB100_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") monotonic monotonic
   ret void
@@ -1506,14 +1983,17 @@ define void @test100(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test101(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test101:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB101_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB101_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB101_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB101_1
-; PPC64LE-NEXT:  .LBB101_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire monotonic
@@ -1523,14 +2003,17 @@ define void @test101(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test102(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test102:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB102_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB102_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB102_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB102_1
-; PPC64LE-NEXT:  .LBB102_3:
+; PPC64LE-NEXT:  .LBB102_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire acquire
@@ -1539,16 +2022,22 @@ define void @test102(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test103(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test103:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB103_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB103_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB103_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB103_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") release monotonic
   ret void
@@ -1556,16 +2045,23 @@ define void @test103(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test104(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test104:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB104_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB104_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB104_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB104_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB104_1
-; PPC64LE-NEXT:  .LBB104_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB104_2
+; PPC64LE-NEXT:  .LBB104_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") release acquire
@@ -1574,16 +2070,25 @@ define void @test104(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test105(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test105:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB105_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB105_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB105_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB105_1
-; PPC64LE-NEXT:  .LBB105_3:
+; PPC64LE-NEXT:    beq 0, .LBB105_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB105_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB105_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel monotonic
@@ -1592,16 +2097,23 @@ define void @test105(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test106(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test106:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB106_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB106_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB106_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB106_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB106_1
-; PPC64LE-NEXT:  .LBB106_3:
+; PPC64LE-NEXT:    beq 0, .LBB106_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB106_2
+; PPC64LE-NEXT:  .LBB106_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel acquire
@@ -1610,16 +2122,25 @@ define void @test106(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test107(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test107:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB107_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB107_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB107_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB107_1
-; PPC64LE-NEXT:  .LBB107_3:
+; PPC64LE-NEXT:    beq 0, .LBB107_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB107_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB107_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst monotonic
@@ -1628,16 +2149,23 @@ define void @test107(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test108(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test108:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB108_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB108_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB108_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB108_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB108_1
-; PPC64LE-NEXT:  .LBB108_3:
+; PPC64LE-NEXT:    beq 0, .LBB108_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB108_2
+; PPC64LE-NEXT:  .LBB108_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst acquire
@@ -1646,16 +2174,23 @@ define void @test108(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test109(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test109:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB109_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB109_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB109_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB109_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB109_1
-; PPC64LE-NEXT:  .LBB109_3:
+; PPC64LE-NEXT:    beq 0, .LBB109_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB109_2
+; PPC64LE-NEXT:  .LBB109_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst seq_cst
@@ -1665,14 +2200,17 @@ define void @test109(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test110(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test110:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB110_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB110_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB110_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") monotonic monotonic
   ret void
@@ -1681,14 +2219,17 @@ define void @test110(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test111(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test111:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB111_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB111_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB111_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB111_1
-; PPC64LE-NEXT:  .LBB111_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire monotonic
@@ -1698,14 +2239,17 @@ define void @test111(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test112(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test112:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB112_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB112_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB112_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB112_1
-; PPC64LE-NEXT:  .LBB112_3:
+; PPC64LE-NEXT:  .LBB112_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire acquire
@@ -1714,16 +2258,22 @@ define void @test112(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test113(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test113:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB113_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB113_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB113_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB113_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") release monotonic
   ret void
@@ -1731,16 +2281,23 @@ define void @test113(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test114(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test114:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB114_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB114_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB114_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB114_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB114_1
-; PPC64LE-NEXT:  .LBB114_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB114_2
+; PPC64LE-NEXT:  .LBB114_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") release acquire
@@ -1749,16 +2306,25 @@ define void @test114(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test115(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test115:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB115_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB115_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB115_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB115_1
-; PPC64LE-NEXT:  .LBB115_3:
+; PPC64LE-NEXT:    beq 0, .LBB115_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB115_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB115_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel monotonic
@@ -1767,16 +2333,23 @@ define void @test115(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test116(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test116:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB116_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB116_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB116_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB116_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB116_1
-; PPC64LE-NEXT:  .LBB116_3:
+; PPC64LE-NEXT:    beq 0, .LBB116_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB116_2
+; PPC64LE-NEXT:  .LBB116_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel acquire
@@ -1785,16 +2358,25 @@ define void @test116(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test117(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test117:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB117_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB117_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB117_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB117_1
-; PPC64LE-NEXT:  .LBB117_3:
+; PPC64LE-NEXT:    beq 0, .LBB117_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB117_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB117_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst monotonic
@@ -1803,16 +2385,23 @@ define void @test117(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test118(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test118:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB118_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB118_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB118_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB118_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB118_1
-; PPC64LE-NEXT:  .LBB118_3:
+; PPC64LE-NEXT:    beq 0, .LBB118_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB118_2
+; PPC64LE-NEXT:  .LBB118_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst acquire
@@ -1821,16 +2410,23 @@ define void @test118(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test119(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test119:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB119_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB119_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB119_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB119_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB119_1
-; PPC64LE-NEXT:  .LBB119_3:
+; PPC64LE-NEXT:    beq 0, .LBB119_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB119_2
+; PPC64LE-NEXT:  .LBB119_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst seq_cst
diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll
index 24e71c87414e..40786057ead5 100644
--- a/llvm/test/CodeGen/PowerPC/atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics.ll
@@ -139,59 +139,67 @@ define void @store_i64_seq_cst(ptr %mem) {
 define i8 @cas_strong_i8_sc_sc(ptr %mem) {
 ; PPC32-LABEL: cas_strong_i8_sc_sc:
 ; PPC32:       # %bb.0:
-; PPC32-NEXT:    rlwinm r8, r3, 3, 27, 28
-; PPC32-NEXT:    li r5, 1
-; PPC32-NEXT:    li r6, 0
-; PPC32-NEXT:    li r7, 255
-; PPC32-NEXT:    rlwinm r4, r3, 0, 0, 29
-; PPC32-NEXT:    xori r3, r8, 24
-; PPC32-NEXT:    slw r8, r5, r3
-; PPC32-NEXT:    slw r9, r6, r3
-; PPC32-NEXT:    slw r5, r7, r3
-; PPC32-NEXT:    and r6, r8, r5
-; PPC32-NEXT:    and r7, r9, r5
+; PPC32-NEXT:    rlwinm r5, r3, 0, 0, 29
+; PPC32-NEXT:    lwarx r4, 0, r5
+; PPC32-NEXT:    not     r3, r3
+; PPC32-NEXT:    rlwinm r3, r3, 3, 27, 28
+; PPC32-NEXT:    srw r6, r4, r3
+; PPC32-NEXT:    andi. r6, r6, 255
+; PPC32-NEXT:    bne     cr0, .LBB8_4
+; PPC32-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
+; PPC32-NEXT:    li r6, 255
+; PPC32-NEXT:    li r7, 1
+; PPC32-NEXT:    slw r6, r6, r3
+; PPC32-NEXT:    not     r6, r6
+; PPC32-NEXT:    slw r7, r7, r3
 ; PPC32-NEXT:    sync
-; PPC32-NEXT:  .LBB8_1:
-; PPC32-NEXT:    lwarx r9, 0, r4
-; PPC32-NEXT:    and r8, r9, r5
-; PPC32-NEXT:    cmpw r8, r7
-; PPC32-NEXT:    bne cr0, .LBB8_3
-; PPC32-NEXT:  # %bb.2:
-; PPC32-NEXT:    andc r9, r9, r5
-; PPC32-NEXT:    or r9, r9, r6
-; PPC32-NEXT:    stwcx. r9, 0, r4
-; PPC32-NEXT:    bne cr0, .LBB8_1
-; PPC32-NEXT:  .LBB8_3:
-; PPC32-NEXT:    srw r3, r8, r3
+; PPC32-NEXT:  .LBB8_2:                                # %cmpxchg.trystore
+; PPC32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; PPC32-NEXT:    and r8, r4, r6
+; PPC32-NEXT:    or r8, r8, r7
+; PPC32-NEXT:    stwcx. r8, 0, r5
+; PPC32-NEXT:    beq     cr0, .LBB8_4
+; PPC32-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; PPC32-NEXT:                                          #   in Loop: Header=BB8_2 Depth=1
+; PPC32-NEXT:    lwarx r4, 0, r5
+; PPC32-NEXT:    srw r8, r4, r3
+; PPC32-NEXT:    andi. r8, r8, 255
+; PPC32-NEXT:    beq     cr0, .LBB8_2
+; PPC32-NEXT:  .LBB8_4:                                # %cmpxchg.nostore
+; PPC32-NEXT:    srw r3, r4, r3
 ; PPC32-NEXT:    lwsync
 ; PPC32-NEXT:    blr
 ;
 ; PPC64-LABEL: cas_strong_i8_sc_sc:
 ; PPC64:       # %bb.0:
-; PPC64-NEXT:    rlwinm r8, r3, 3, 27, 28
-; PPC64-NEXT:    li r5, 1
-; PPC64-NEXT:    li r6, 0
-; PPC64-NEXT:    li r7, 255
-; PPC64-NEXT:    rldicr r4, r3, 0, 61
-; PPC64-NEXT:    xori r3, r8, 24
-; PPC64-NEXT:    slw r8, r5, r3
-; PPC64-NEXT:    slw r9, r6, r3
-; PPC64-NEXT:    slw r5, r7, r3
-; PPC64-NEXT:    and r6, r8, r5
-; PPC64-NEXT:    and r7, r9, r5
+; PPC64-NEXT:    rldicr r5, r3, 0, 61
+; PPC64-NEXT:    not     r3, r3
+; PPC64-NEXT:    lwarx r4, 0, r5
+; PPC64-NEXT:    rlwinm r3, r3, 3, 27, 28
+; PPC64-NEXT:    srw r6, r4, r3
+; PPC64-NEXT:    andi. r6, r6, 255
+; PPC64-NEXT:    bne     cr0, .LBB8_4
+; PPC64-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
+; PPC64-NEXT:    li r6, 255
+; PPC64-NEXT:    li r7, 1
+; PPC64-NEXT:    slw r6, r6, r3
+; PPC64-NEXT:    not     r6, r6
+; PPC64-NEXT:    slw r7, r7, r3
 ; PPC64-NEXT:    sync
-; PPC64-NEXT:  .LBB8_1:
-; PPC64-NEXT:    lwarx r9, 0, r4
-; PPC64-NEXT:    and r8, r9, r5
-; PPC64-NEXT:    cmpw r8, r7
-; PPC64-NEXT:    bne cr0, .LBB8_3
-; PPC64-NEXT:  # %bb.2:
-; PPC64-NEXT:    andc r9, r9, r5
-; PPC64-NEXT:    or r9, r9, r6
-; PPC64-NEXT:    stwcx. r9, 0, r4
-; PPC64-NEXT:    bne cr0, .LBB8_1
-; PPC64-NEXT:  .LBB8_3:
-; PPC64-NEXT:    srw r3, r8, r3
+; PPC64-NEXT:  .LBB8_2:                                # %cmpxchg.trystore
+; PPC64-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; PPC64-NEXT:    and r8, r4, r6
+; PPC64-NEXT:    or r8, r8, r7
+; PPC64-NEXT:    stwcx. r8, 0, r5
+; PPC64-NEXT:    beq     cr0, .LBB8_4
+; PPC64-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; PPC64-NEXT:                                          #   in Loop: Header=BB8_2 Depth=1
+; PPC64-NEXT:    lwarx r4, 0, r5
+; PPC64-NEXT:    srw r8, r4, r3
+; PPC64-NEXT:    andi. r8, r8, 255
+; PPC64-NEXT:    beq     cr0, .LBB8_2
+; PPC64-NEXT:  .LBB8_4:                                # %cmpxchg.nostore
+; PPC64-NEXT:    srw r3, r4, r3
 ; PPC64-NEXT:    lwsync
 ; PPC64-NEXT:    blr
   %val = cmpxchg ptr %mem, i8 0, i8 1 seq_cst seq_cst
@@ -201,57 +209,53 @@ define i8 @cas_strong_i8_sc_sc(ptr %mem) {
 define i16 @cas_weak_i16_acquire_acquire(ptr %mem) {
 ; PPC32-LABEL: cas_weak_i16_acquire_acquire:
 ; PPC32:       # %bb.0:
-; PPC32-NEXT:    li r6, 0
-; PPC32-NEXT:    rlwinm r4, r3, 3, 27, 27
-; PPC32-NEXT:    li r5, 1
-; PPC32-NEXT:    ori r7, r6, 65535
-; PPC32-NEXT:    xori r4, r4, 16
-; PPC32-NEXT:    slw r8, r5, r4
-; PPC32-NEXT:    slw r9, r6, r4
-; PPC32-NEXT:    slw r5, r7, r4
-; PPC32-NEXT:    rlwinm r3, r3, 0, 0, 29
-; PPC32-NEXT:    and r6, r8, r5
-; PPC32-NEXT:    and r7, r9, r5
-; PPC32-NEXT:  .LBB9_1:
-; PPC32-NEXT:    lwarx r9, 0, r3
-; PPC32-NEXT:    and r8, r9, r5
-; PPC32-NEXT:    cmpw r8, r7
-; PPC32-NEXT:    bne cr0, .LBB9_3
-; PPC32-NEXT:  # %bb.2:
-; PPC32-NEXT:    andc r9, r9, r5
-; PPC32-NEXT:    or r9, r9, r6
-; PPC32-NEXT:    stwcx. r9, 0, r3
-; PPC32-NEXT:    bne cr0, .LBB9_1
-; PPC32-NEXT:  .LBB9_3:
-; PPC32-NEXT:    srw r3, r8, r4
+; PPC32-NEXT:    rlwinm r4, r3, 0, 0, 29
+; PPC32-NEXT:    lwarx r5, 0, r4
+; PPC32-NEXT:    clrlwi  r3, r3, 30
+; PPC32-NEXT:    xori r3, r3, 2
+; PPC32-NEXT:    slwi r6, r3, 3
+; PPC32-NEXT:    srw r3, r5, r6
+; PPC32-NEXT:    andi. r7, r3, 65535
+; PPC32-NEXT:    beq     cr0, .LBB9_2
+; PPC32-NEXT:  # %bb.1:                                # %cmpxchg.failure
+; PPC32-NEXT:    lwsync
+; PPC32-NEXT:    blr
+; PPC32-NEXT:  .LBB9_2:                                # %cmpxchg.fencedstore
+; PPC32-NEXT:    lis r7, 0
+; PPC32-NEXT:    ori r7, r7, 65535
+; PPC32-NEXT:    slw r7, r7, r6
+; PPC32-NEXT:    li r8, 1
+; PPC32-NEXT:    not     r7, r7
+; PPC32-NEXT:    slw r6, r8, r6
+; PPC32-NEXT:    and r5, r5, r7
+; PPC32-NEXT:    or r5, r5, r6
+; PPC32-NEXT:    stwcx. r5, 0, r4
 ; PPC32-NEXT:    lwsync
 ; PPC32-NEXT:    blr
 ;
 ; PPC64-LABEL: cas_weak_i16_acquire_acquire:
 ; PPC64:       # %bb.0:
-; PPC64-NEXT:    li r6, 0
-; PPC64-NEXT:    rlwinm r4, r3, 3, 27, 27
-; PPC64-NEXT:    li r5, 1
-; PPC64-NEXT:    ori r7, r6, 65535
-; PPC64-NEXT:    xori r4, r4, 16
-; PPC64-NEXT:    slw r8, r5, r4
-; PPC64-NEXT:    slw r9, r6, r4
-; PPC64-NEXT:    slw r5, r7, r4
-; PPC64-NEXT:    rldicr r3, r3, 0, 61
-; PPC64-NEXT:    and r6, r8, r5
-; PPC64-NEXT:    and r7, r9, r5
-; PPC64-NEXT:  .LBB9_1:
-; PPC64-NEXT:    lwarx r9, 0, r3
-; PPC64-NEXT:    and r8, r9, r5
-; PPC64-NEXT:    cmpw r8, r7
-; PPC64-NEXT:    bne cr0, .LBB9_3
-; PPC64-NEXT:  # %bb.2:
-; PPC64-NEXT:    andc r9, r9, r5
-; PPC64-NEXT:    or r9, r9, r6
-; PPC64-NEXT:    stwcx. r9, 0, r3
-; PPC64-NEXT:    bne cr0, .LBB9_1
-; PPC64-NEXT:  .LBB9_3:
-; PPC64-NEXT:    srw r3, r8, r4
+; PPC64-NEXT:   rldicr r4, r3, 0, 61
+; PPC64-NEXT:    clrlwi  r3, r3, 30
+; PPC64-NEXT:    lwarx r5, 0, r4
+; PPC64-NEXT:    xori r3, r3, 2
+; PPC64-NEXT:    slwi r6, r3, 3
+; PPC64-NEXT:    srw r3, r5, r6
+; PPC64-NEXT:    andi. r7, r3, 65535
+; PPC64-NEXT:    beq     cr0, .LBB9_2
+; PPC64-NEXT:  # %bb.1:                                # %cmpxchg.failure
+; PPC64-NEXT:    lwsync
+; PPC64-NEXT:    blr
+; PPC64-NEXT:  .LBB9_2:                                # %cmpxchg.fencedstore
+; PPC64-NEXT:    lis r7, 0
+; PPC64-NEXT:    ori r7, r7, 65535
+; PPC64-NEXT:    slw r7, r7, r6
+; PPC64-NEXT:    li r8, 1
+; PPC64-NEXT:    not     r7, r7
+; PPC64-NEXT:    slw r6, r8, r6
+; PPC64-NEXT:    and r5, r5, r7
+; PPC64-NEXT:    or r5, r5, r6
+; PPC64-NEXT:    stwcx. r5, 0, r4
 ; PPC64-NEXT:    lwsync
 ; PPC64-NEXT:    blr
   %val = cmpxchg weak ptr %mem, i16 0, i16 1 acquire acquire
@@ -261,17 +265,23 @@ define i16 @cas_weak_i16_acquire_acquire(ptr %mem) {
 define i32 @cas_strong_i32_acqrel_acquire(ptr %mem) {
 ; CHECK-LABEL: cas_strong_i32_acqrel_acquire:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    mr      r4, r3
+; CHECK-NEXT:    lwarx r3, 0, r3
+; CHECK-NEXT:    cmplwi  r3, 0
+; CHECK-NEXT:    bne     cr0, .LBB10_4
+; CHECK-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
 ; CHECK-NEXT:    li r5, 1
 ; CHECK-NEXT:    lwsync
-; CHECK-NEXT:  .LBB10_1:
-; CHECK-NEXT:    lwarx r4, 0, r3
-; CHECK-NEXT:    cmpwi r4, 0
-; CHECK-NEXT:    bne cr0, .LBB10_3
-; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    stwcx. r5, 0, r3
-; CHECK-NEXT:    bne cr0, .LBB10_1
-; CHECK-NEXT:  .LBB10_3:
-; CHECK-NEXT:    mr r3, r4
+; CHECK-NEXT:  .LBB10_2:                               # %cmpxchg.trystore
+; CHECK-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    stwcx. r5, 0, r4
+; CHECK-NEXT:    beq     cr0, .LBB10_4
+; CHECK-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; CHECK-NEXT:                                          #   in Loop: Header=BB10_2 Depth=1
+; CHECK-NEXT:    lwarx r3, 0, r4
+; CHECK-NEXT:    cmplwi  r3, 0
+; CHECK-NEXT:    beq     cr0, .LBB10_2
+; CHECK-NEXT:  .LBB10_4:                               # %cmpxchg.nostore
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %val = cmpxchg ptr %mem, i32 0, i32 1 acq_rel acquire
@@ -304,17 +314,14 @@ define i64 @cas_weak_i64_release_monotonic(ptr %mem) {
 ;
 ; PPC64-LABEL: cas_weak_i64_release_monotonic:
 ; PPC64:       # %bb.0:
+; PPC64-NEXT:    mr      r4, r3
+; PPC64-NEXT:    ldarx r3, 0, r3
+; PPC64-NEXT:    cmpldi  r3, 0
+; PPC64-NEXT:    bnelr   cr0
+; PPC64-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
 ; PPC64-NEXT:    li r5, 1
 ; PPC64-NEXT:    lwsync
-; PPC64-NEXT:  .LBB11_1:
-; PPC64-NEXT:    ldarx r4, 0, r3
-; PPC64-NEXT:    cmpdi r4, 0
-; PPC64-NEXT:    bne cr0, .LBB11_3
-; PPC64-NEXT:  # %bb.2:
-; PPC64-NEXT:    stdcx. r5, 0, r3
-; PPC64-NEXT:    bne cr0, .LBB11_1
-; PPC64-NEXT:  .LBB11_3:
-; PPC64-NEXT:    mr r3, r4
+; PPC64-NEXT:    stdcx. r5, 0, r4
 ; PPC64-NEXT:    blr
   %val = cmpxchg weak ptr %mem, i64 0, i64 1 release monotonic
   %loaded = extractvalue { i64, i1} %val, 0
diff --git a/llvm/test/CodeGen/PowerPC/loop-comment.ll b/llvm/test/CodeGen/PowerPC/loop-comment.ll
index 14f6791fc779..1fa9dda51ef9 100644
--- a/llvm/test/CodeGen/PowerPC/loop-comment.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-comment.ll
@@ -4,12 +4,17 @@
 define void @test(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB0_1:
+; PPC64LE-NEXT:    .p2align        5
+; PPC64LE-NEXT:  .LBB0_1:                                # %cmpxchg.start
+; PPC64LE-NEXT:                                          # =>This Inner Loop Header: Depth=1
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi  6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
 ; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:                                          #   in Loop: Header=BB0_1 Depth=1
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB0_1
 ; PPC64LE-NEXT:  # %bb.3:
diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
index f787aa7f6a42..840e2d3eee55 100644
--- a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
@@ -7,19 +7,51 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    %loaded = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %cmpxchg.end ]
+; CHECK-NEXT:    %new = fadd float %loaded, %value
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float %new to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float %loaded to i32
+; CHECK-NEXT:    br label %cmpxchg.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.start:                                    ; preds = %cmpxchg.trystore, %atomicrmw.start
+; CHECK-NEXT:    %larx = call i32 @llvm.ppc.lwarx(ptr %ptr)
+; CHECK-NEXT:    %should_store = icmp eq i32 %larx, [[TMP3]]
+; CHECK-NEXT:    br i1 %should_store, label %cmpxchg.fencedstore, label %cmpxchg.nostore
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.fencedstore:                              ; preds = %cmpxchg.start
+; CHECK-NEXT:    br label %cmpxchg.trystore
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.trystore:                                 ; preds = %cmpxchg.fencedstore
+; CHECK-NEXT:    %loaded.trystore = phi i32 [ %larx, %cmpxchg.fencedstore ]
+; CHECK-NEXT:    %stcx = call i32 @llvm.ppc.stwcx(ptr %ptr, i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i32 %stcx, 1
+; CHECK-NEXT:    %success1 = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 %success1, label %cmpxchg.success, label %cmpxchg.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.releasedload:                             ; No predecessors!
+; CHECK-NEXT:    unreachable
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.success:                                  ; preds = %cmpxchg.trystore
+; CHECK-NEXT:    br label %cmpxchg.end
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.nostore:                                  ; preds = %cmpxchg.start
+; CHECK-NEXT:    %loaded.nostore = phi i32 [ %larx, %cmpxchg.start ]
+; CHECK-NEXT:    br label %cmpxchg.failure
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.failure:                                  ; preds = %cmpxchg.nostore
+; CHECK-NEXT:    %loaded.failure = phi i32 [ %loaded.nostore, %cmpxchg.nostore ]
+; CHECK-NEXT:    br label %cmpxchg.end
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.end:                                      ; preds = %cmpxchg.failure, %cmpxchg.success
+; CHECK-NEXT:    %loaded.exit = phi i32 [ %loaded.trystore, %cmpxchg.success ], [ %loaded.failure, %cmpxchg.failure ]
+; CHECK-NEXT:    %success2 = phi i1 [ true, %cmpxchg.success ], [ false, %cmpxchg.failure ]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 %loaded.exit to float
+; CHECK-NEXT:    br i1 %success2, label %atomicrmw.end, label %atomicrmw.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  atomicrmw.end:                                    ; preds = %cmpxchg.end
 ; CHECK-NEXT:    call void @llvm.ppc.lwsync()
 ; CHECK-NEXT:    ret float [[TMP5]]
-;
+; CHECK-NEXT:  }
   %res = atomicrmw fadd ptr %ptr, float %value seq_cst
   ret float %res
 }
@@ -28,22 +60,56 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
 ; CHECK-LABEL: @test_atomicrmw_fsub_f32(
 ; CHECK-NEXT:    call void @llvm.ppc.sync()
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    br label %atomicrmw.start 
+; CHECK-EMPTY:
+; CHECK-NEXT:  atomicrmw.start:
+; CHECK-NEXT:    %loaded = phi float [ [[TMP1]], %0 ], [ [[TMP5:%.*]], %cmpxchg.end ]
+; CHECK-NEXT:    %new = fsub float %loaded, %value
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float %new to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float %loaded to i32
+; CHECK-NEXT:    br label %cmpxchg.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.start:
+; CHECK-NEXT:    %larx = call i32 @llvm.ppc.lwarx(ptr %ptr)
+; CHECK-NEXT:    %should_store = icmp eq i32 %larx, [[TMP3]]
+; CHECK-NEXT:    br i1 %should_store, label %cmpxchg.fencedstore, label %cmpxchg.nostore
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.fencedstore:                              ; preds = %cmpxchg.start
+; CHECK-NEXT:    br label %cmpxchg.trystore
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.trystore:                                 ; preds = %cmpxchg.fencedstore
+; CHECK-NEXT:    %loaded.trystore = phi i32 [ %larx, %cmpxchg.fencedstore ]
+; CHECK-NEXT:    %stcx = call i32 @llvm.ppc.stwcx(ptr %ptr, i32 %2)
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i32 %stcx, 1
+; CHECK-NEXT:    %success1 = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 %success1, label %cmpxchg.success, label %cmpxchg.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.releasedload:                             ; No predecessors!
+; CHECK-NEXT:    unreachable
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.success:                                  ; preds = %cmpxchg.trystore
+; CHECK-NEXT:    br label %cmpxchg.end
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.nostore:                                  ; preds = %cmpxchg.start
+; CHECK-NEXT:    %loaded.nostore = phi i32 [ %larx, %cmpxchg.start ]
+; CHECK-NEXT:    br label %cmpxchg.failure
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.failure:                                  ; preds = %cmpxchg.nostore
+; CHECK-NEXT:    %loaded.failure = phi i32 [ %loaded.nostore, %cmpxchg.nostore ]
+; CHECK-NEXT:    br label %cmpxchg.end
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.end:                                      ; preds = %cmpxchg.failure, %cmpxchg.success
+; CHECK-NEXT:    %loaded.exit = phi i32 [ %loaded.trystore, %cmpxchg.success ], [ %loaded.failure, %cmpxchg.failure ]
+; CHECK-NEXT:    %success2 = phi i1 [ true, %cmpxchg.success ], [ false, %cmpxchg.failure ]
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 %loaded.exit to float
+; CHECK-NEXT:    br i1 %success2, label %atomicrmw.end, label %atomicrmw.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  atomicrmw.end:                                    ; preds = %cmpxchg.end
 ; CHECK-NEXT:    call void @llvm.ppc.lwsync()
 ; CHECK-NEXT:    ret float [[TMP5]]
-;
-  %res = atomicrmw fsub ptr %ptr, float %value seq_cst
+; CHECK-NEXT:  }
+
+%res = atomicrmw fsub ptr %ptr, float %value seq_cst
   ret float %res
 }
 

From 4a47634a0075c49051cb4708a7f54577ecb080f4 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Fri, 13 Jun 2025 14:16:58 +0100
Subject: [PATCH 0273/1322] [flang][OpenMP] Support substrings and complex part
 refs for DEPEND (#143907)

Fixes #142404

The parser can't tell the difference between array indexing and a
substring: that has to be done in semantics once we have types.
Substrings can only be in the form string([lower]:[higher]) not
string(index) or string(lower:higher:step). I added semantic checks to
catch this for the DEPEND clause.

This patch also adds lowering for correct substrings and for complex
part references.
---
 flang/include/flang/Evaluate/tools.h          |  18 +--
 flang/lib/Lower/OpenMP/ClauseProcessor.cpp    |  13 +--
 flang/lib/Lower/OpenMP/Clauses.cpp            |  11 +-
 flang/lib/Semantics/check-omp-structure.cpp   |  34 ++++++
 flang/test/Lower/OpenMP/depend-complex.f90    |  22 ++++
 flang/test/Lower/OpenMP/depend-substring.f90  | 108 ++++++++++++++++++
 .../Semantics/OpenMP/depend-substring.f90     |  65 +++++++++++
 7 files changed, 250 insertions(+), 21 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/depend-complex.f90
 create mode 100644 flang/test/Lower/OpenMP/depend-substring.f90
 create mode 100644 flang/test/Semantics/OpenMP/depend-substring.f90

diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 4dce1257a650..1959d5f3a589 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -490,26 +490,30 @@ template <typename A> std::optional<CoarrayRef> ExtractCoarrayRef(const A &x) {
   }
 }
 
-struct ExtractSubstringHelper {
-  template <typename T> static std::optional<Substring> visit(T &&) {
+template <typename TARGET> struct ExtractFromExprDesignatorHelper {
+  template <typename T> static std::optional<TARGET> visit(T &&) {
     return std::nullopt;
   }
 
-  static std::optional<Substring> visit(const Substring &e) { return e; }
+  static std::optional<TARGET> visit(const TARGET &t) { return t; }
 
   template <typename T>
-  static std::optional<Substring> visit(const Designator<T> &e) {
+  static std::optional<TARGET> visit(const Designator<T> &e) {
     return common::visit([](auto &&s) { return visit(s); }, e.u);
   }
 
-  template <typename T>
-  static std::optional<Substring> visit(const Expr<T> &e) {
+  template <typename T> static std::optional<TARGET> visit(const Expr<T> &e) {
     return common::visit([](auto &&s) { return visit(s); }, e.u);
   }
 };
 
 template <typename A> std::optional<Substring> ExtractSubstring(const A &x) {
-  return ExtractSubstringHelper::visit(x);
+  return ExtractFromExprDesignatorHelper<Substring>::visit(x);
+}
+
+template <typename A>
+std::optional<ComplexPart> ExtractComplexPart(const A &x) {
+  return ExtractFromExprDesignatorHelper<ComplexPart>::visit(x);
 }
 
 // If an expression is simply a whole symbol data designator,
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 88baad8827e9..b5c8de8c2ce8 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -926,14 +926,10 @@ bool ClauseProcessor::processDepend(lower::SymMap &symMap,
     for (const omp::Object &object : objects) {
       assert(object.ref() && "Expecting designator");
       mlir::Value dependVar;
+      SomeExpr expr = *object.ref();
 
-      if (evaluate::ExtractSubstring(*object.ref())) {
-        TODO(converter.getCurrentLocation(),
-             "substring not supported for task depend");
-      } else if (evaluate::IsArrayElement(*object.ref())) {
-        // Array Section
-        SomeExpr expr = *object.ref();
-
+      if (evaluate::IsArrayElement(expr) || evaluate::ExtractSubstring(expr)) {
+        // Array Section or character (sub)string
         if (isVectorSubscript(expr)) {
           // OpenMP needs the address of the first indexed element (required by
           // the standard to be the lowest index) to identify the dependency. We
@@ -947,7 +943,8 @@ bool ClauseProcessor::processDepend(lower::SymMap &symMap,
               converter.getCurrentLocation(), converter, expr, symMap, stmtCtx);
           dependVar = entity.getBase();
         }
-      } else if (evaluate::isStructureComponent(*object.ref())) {
+      } else if (evaluate::isStructureComponent(expr) ||
+                 evaluate::ExtractComplexPart(expr)) {
         SomeExpr expr = *object.ref();
         hlfir::EntityWithAttributes entity = convertExprToHLFIR(
             converter.getCurrentLocation(), converter, expr, symMap, stmtCtx);
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index f3088b18b77f..4d0f5c3a127e 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -70,19 +70,18 @@ struct SymbolAndDesignatorExtractor {
 
   static void verify(const SymbolWithDesignator &sd) {
     const semantics::Symbol *symbol = std::get<0>(sd);
-    assert(symbol && "Expecting symbol");
-    auto &maybeDsg = std::get<1>(sd);
+    const std::optional<evaluate::Expr<evaluate::SomeType>> &maybeDsg =
+        std::get<1>(sd);
     if (!maybeDsg)
       return; // Symbol with no designator -> OK
-    std::optional<evaluate::DataRef> maybeRef =
-        evaluate::ExtractDataRef(*maybeDsg);
+    assert(symbol && "Expecting symbol");
+    std::optional<evaluate::DataRef> maybeRef = evaluate::ExtractDataRef(
+        *maybeDsg, /*intoSubstring=*/true, /*intoComplexPart=*/true);
     if (maybeRef) {
       if (&maybeRef->GetLastSymbol() == symbol)
         return; // Symbol with a designator for it -> OK
       llvm_unreachable("Expecting designator for given symbol");
     } else {
-      // This could still be a Substring or ComplexPart, but at least Substring
-      // is not allowed in OpenMP.
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
       maybeDsg->dump();
 #endif
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 4dccb0e88e32..58d28dce7094 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -11,6 +11,7 @@
 #include "resolve-names-utils.h"
 #include "flang/Evaluate/check-expression.h"
 #include "flang/Evaluate/expression.h"
+#include "flang/Evaluate/shape.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/expression.h"
@@ -6524,6 +6525,29 @@ void OmpStructureChecker::CheckDependList(const parser::DataRef &d) {
 void OmpStructureChecker::CheckArraySection(
     const parser::ArrayElement &arrayElement, const parser::Name &name,
     const llvm::omp::Clause clause) {
+  // Sometimes substring operations are incorrectly parsed as array accesses.
+  // Detect this by looking for array accesses on character variables which are
+  // not arrays.
+  bool isSubstring{false};
+  evaluate::ExpressionAnalyzer ea{context_};
+  if (MaybeExpr expr = ea.Analyze(arrayElement.base)) {
+    std::optional<evaluate::Shape> shape = evaluate::GetShape(expr);
+    // Not an array: rank 0
+    if (shape && shape->size() == 0) {
+      if (std::optional<evaluate::DynamicType> type = expr->GetType()) {
+        if (type->category() == evaluate::TypeCategory::Character) {
+          // Substrings are explicitly denied by the standard [6.0:163:9-11].
+          // This is supported as an extension. This restriction was added in
+          // OpenMP 5.2.
+          isSubstring = true;
+          context_.Say(GetContext().clauseSource,
+              "The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2."_port_en_US);
+        } else {
+          llvm_unreachable("Array indexing on a variable that isn't an array");
+        }
+      }
+    }
+  }
   if (!arrayElement.subscripts.empty()) {
     for (const auto &subscript : arrayElement.subscripts) {
       if (const auto *triplet{
@@ -6541,6 +6565,10 @@ void OmpStructureChecker::CheckArraySection(
                   name.ToString(),
                   parser::ToUpperCaseLetters(getClauseName(clause).str()));
             }
+            if (isSubstring) {
+              context_.Say(GetContext().clauseSource,
+                  "Cannot specify a step for a substring"_err_en_US);
+            }
           }
           const auto &lower{std::get<0>(triplet->t)};
           const auto &upper{std::get<1>(triplet->t)};
@@ -6564,6 +6592,12 @@ void OmpStructureChecker::CheckArraySection(
             }
           }
         }
+      } else if (std::get_if<parser::IntExpr>(&subscript.u)) {
+        // base(n) is valid as an array index but not as a substring operation
+        if (isSubstring) {
+          context_.Say(GetContext().clauseSource,
+              "Substrings must be in the form parent-string(lb:ub)"_err_en_US);
+        }
       }
     }
   }
diff --git a/flang/test/Lower/OpenMP/depend-complex.f90 b/flang/test/Lower/OpenMP/depend-complex.f90
new file mode 100644
index 000000000000..488696b56507
--- /dev/null
+++ b/flang/test/Lower/OpenMP/depend-complex.f90
@@ -0,0 +1,22 @@
+! RUN: %flang_fc1 -fopenmp -emit-hlfir -o - %s | FileCheck %s
+
+subroutine depend_complex(z)
+! CHECK-LABEL:   func.func @_QPdepend_complex(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<complex<f32>> {fir.bindc_name = "z"}) {
+  complex :: z
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFdepend_complexEz"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
+  !$omp task depend(in:z%re)
+! CHECK:           %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0  real : (!fir.ref<complex<f32>>) -> !fir.ref<f32>
+! CHECK:           omp.task depend(taskdependin -> %[[VAL_2]] : !fir.ref<f32>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+  !$omp end task
+  !$omp task depend(in:z%im)
+! CHECK:           %[[VAL_3:.*]] = hlfir.designate %[[VAL_1]]#0  imag : (!fir.ref<complex<f32>>) -> !fir.ref<f32>
+! CHECK:           omp.task depend(taskdependin -> %[[VAL_3]] : !fir.ref<f32>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+  !$omp end task
+end subroutine
+
diff --git a/flang/test/Lower/OpenMP/depend-substring.f90 b/flang/test/Lower/OpenMP/depend-substring.f90
new file mode 100644
index 000000000000..5de11e06cc10
--- /dev/null
+++ b/flang/test/Lower/OpenMP/depend-substring.f90
@@ -0,0 +1,108 @@
+! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+subroutine substring_0(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c(:))
+  !$omp end task
+end
+! CHECK-LABEL:   func.func @_QPsubstring_0(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_0Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_5:.*]] = fir.box_elesize %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_6:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_5]] : (!fir.ptr<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_9:.*]] = fir.box_elesize %[[VAL_8]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i64) -> index
+! CHECK:           %[[VAL_12:.*]] = arith.constant 1 : index
+! CHECK:           %[[VAL_13:.*]] = arith.subi %[[VAL_11]], %[[VAL_7]] : index
+! CHECK:           %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index
+! CHECK:           %[[VAL_15:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_16:.*]] = arith.cmpi sgt, %[[VAL_14]], %[[VAL_15]] : index
+! CHECK:           %[[VAL_17:.*]] = arith.select %[[VAL_16]], %[[VAL_14]], %[[VAL_15]] : index
+! CHECK:           %[[VAL_18:.*]] = hlfir.designate %[[VAL_6]]  substr %[[VAL_7]], %[[VAL_11]]  typeparams %[[VAL_17]] : (!fir.boxchar<1>, index, index, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_19:.*]] = fir.box_addr %[[VAL_18]] : (!fir.boxchar<1>) -> !fir.ref<!fir.char<1,?>>
+! CHECK:           omp.task depend(taskdependout -> %[[VAL_19]] : !fir.ref<!fir.char<1,?>>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine substring_1(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c(2:))
+  !$omp end task
+end
+! CHECK-LABEL:   func.func @_QPsubstring_1(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_1Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_5:.*]] = fir.box_elesize %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_6:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_5]] : (!fir.ptr<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_9:.*]] = fir.box_elesize %[[VAL_8]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i64) -> index
+! CHECK:           %[[VAL_12:.*]] = arith.constant 1 : index
+! CHECK:           %[[VAL_13:.*]] = arith.subi %[[VAL_11]], %[[VAL_7]] : index
+! CHECK:           %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index
+! CHECK:           %[[VAL_15:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_16:.*]] = arith.cmpi sgt, %[[VAL_14]], %[[VAL_15]] : index
+! CHECK:           %[[VAL_17:.*]] = arith.select %[[VAL_16]], %[[VAL_14]], %[[VAL_15]] : index
+! CHECK:           %[[VAL_18:.*]] = hlfir.designate %[[VAL_6]]  substr %[[VAL_7]], %[[VAL_11]]  typeparams %[[VAL_17]] : (!fir.boxchar<1>, index, index, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_19:.*]] = fir.box_addr %[[VAL_18]] : (!fir.boxchar<1>) -> !fir.ref<!fir.char<1,?>>
+! CHECK:           omp.task depend(taskdependout -> %[[VAL_19]] : !fir.ref<!fir.char<1,?>>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine substring_2(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c(:2))
+  !$omp end task
+end
+! CHECK-LABEL:   func.func @_QPsubstring_2(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_5:.*]] = fir.box_elesize %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_6:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_5]] : (!fir.ptr<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK:           %[[VAL_8:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_9:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_10:.*]] = hlfir.designate %[[VAL_6]]  substr %[[VAL_7]], %[[VAL_8]]  typeparams %[[VAL_9]] : (!fir.boxchar<1>, index, index, index) -> !fir.ref<!fir.char<1,2>>
+! CHECK:           omp.task depend(taskdependout -> %[[VAL_10]] : !fir.ref<!fir.char<1,2>>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine substring_4(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c)
+  !$omp end task
+end
+! CHECK-LABEL:   func.func @_QPsubstring_4(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_4Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
+! CHECK:           omp.task depend(taskdependout -> %[[VAL_3]] : !fir.ptr<!fir.char<1,?>>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
diff --git a/flang/test/Semantics/OpenMP/depend-substring.f90 b/flang/test/Semantics/OpenMP/depend-substring.f90
new file mode 100644
index 000000000000..23d6bb4c0b7b
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/depend-substring.f90
@@ -0,0 +1,65 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! Test for parsing confusion between array indexing and string subscripts
+
+! This is okay: selects the whole substring
+subroutine substring_0(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !$omp task depend(out:c(:))
+  !$omp end task
+end
+
+! This is okay: selects from the second character onwards
+subroutine substring_1(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !$omp task depend(out:c(2:))
+  !$omp end task
+end
+
+! This is okay: selects the first 2 characters
+subroutine substring_2(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !$omp task depend(out:c(:2))
+  !$omp end task
+end
+
+! Error
+subroutine substring_3(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !ERROR: Substrings must be in the form parent-string(lb:ub)
+  !$omp task depend(out:c(2))
+  !$omp end task
+end
+
+! This is okay: interpreted as indexing into the array not as a substring
+subroutine substring_3b(c)
+  character(:), pointer :: c(:)
+  !$omp task depend(out:c(2))
+  !$omp end task
+end
+
+! This is okay: no indexing or substring at all
+subroutine substring_4(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c)
+  !$omp end task
+end
+
+! This is not okay: substrings can't have a stride
+subroutine substring_5(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !ERROR: Cannot specify a step for a substring
+  !$omp task depend(out:c(1:20:5))
+  !$omp end task
+end
+
+! This is okay: interpreted as indexing the array
+subroutine substring_5b(c)
+  character(:), pointer :: c(:)
+  !$omp task depend(out:c(1:20:5))
+  !$omp end task
+end

From 6ca31ad720ba32bff3664af218ec2d3c29bdd1b0 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Fri, 13 Jun 2025 14:17:39 +0100
Subject: [PATCH 0274/1322] [flang][OpenMP] improve semantic check for invalid
 goto (#144040)

Fixes #143229
---
 flang/lib/Semantics/resolve-directives.cpp        |  8 ++++++--
 .../Semantics/OpenMP/parallel-master-goto.f90     | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Semantics/OpenMP/parallel-master-goto.f90

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 93bf510fbc3c..b5f8667fe36f 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -3023,10 +3023,14 @@ void OmpAttributeVisitor::CheckSourceLabel(const parser::Label &label) {
 void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
     const parser::CharBlock target, std::optional<DirContext> sourceContext,
     std::optional<DirContext> targetContext) {
+  auto dirContextsSame = [](DirContext &lhs, DirContext &rhs) -> bool {
+    // Sometimes nested constructs share a scope but are different contexts
+    return (lhs.scope == rhs.scope) && (lhs.directive == rhs.directive);
+  };
   unsigned version{context_.langOptions().OpenMPVersion};
   if (targetContext &&
       (!sourceContext ||
-          (sourceContext->scope != targetContext->scope &&
+          (!dirContextsSame(*targetContext, *sourceContext) &&
               !DoesScopeContain(
                   &targetContext->scope, sourceContext->scope)))) {
     context_
@@ -3038,7 +3042,7 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
   }
   if (sourceContext &&
       (!targetContext ||
-          (sourceContext->scope != targetContext->scope &&
+          (!dirContextsSame(*sourceContext, *targetContext) &&
               !DoesScopeContain(
                   &sourceContext->scope, targetContext->scope)))) {
     context_
diff --git a/flang/test/Semantics/OpenMP/parallel-master-goto.f90 b/flang/test/Semantics/OpenMP/parallel-master-goto.f90
new file mode 100644
index 000000000000..72c8002ab4c5
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/parallel-master-goto.f90
@@ -0,0 +1,15 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! Regression test for #143229
+
+!$omp parallel
+do i = 1, 2
+!ERROR: invalid branch into an OpenMP structured block
+!ERROR: invalid branch leaving an OpenMP structured block
+  goto 10
+end do
+!WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
+!$omp master
+10 print *, i
+!$omp end master
+!$omp end parallel
+end

From 9c2e0bd59ce0438fcad61b0468fd939c6282d048 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Fri, 13 Jun 2025 09:19:10 -0400
Subject: [PATCH 0275/1322] [PowerPC][NFC] Pre-commit test case for checking
 whether  `mtvsrbmi` power10 instruction not used (#143956)

Verify whether the generated assembly for the following function
includes the mtvsrbmi instruction.
 vector unsigned char v00FF()
{
 vector unsigned char x = { 0xFF, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
 return x;
 }
---
 llvm/test/CodeGen/PowerPC/mtvsrbmi.ll | 44 +++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/mtvsrbmi.ll

diff --git a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
new file mode 100644
index 000000000000..7ed57c300ec7
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Verify whether the generated assembly for the following function includes the mtvsrbmi instruction.
+; vector unsigned char v00FF()
+; {
+; vector unsigned char x = { 0xFF, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
+; return x;
+; }
+
+; RUN: llc < %s -ppc-asm-full-reg-names  -mtriple=powerpc-ibm-aix -mcpu=pwr10  -verify-machineinstrs \
+; RUN:   | FileCheck %s --check-prefix=CHECK
+
+define dso_local noundef range(i8 -1, 1) <16 x i8> @_Z5v00FFv() {
+; CHECK-LABEL: _Z5v00FFv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lwz r3, L..C0(r2) # %const.0
+; CHECK-NEXT:    lxv vs34, 0(r3)
+; CHECK-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+; CHECK:      L..CPI0_0:
+; CHECK-NEXT:   .byte   255                             # 0xff
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+
+; CHECK:      ._Z5v00FFv:
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT:   lwz r3, L..C0(r2)
+; CHECK-NEXT:   lxv vs34, 0(r3)
+; CHECK-NEXT:   blr

From 7e0bb2b0b9f66715c07c5eeaadb367d1a084d4c7 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Fri, 13 Jun 2025 15:21:23 +0200
Subject: [PATCH 0276/1322] [flang][fir] Extend locality specs lowering to
 support `init` and `dealloc` regions (#144027)

Extending `fir.do_concurrent` to `fir.do_loop ... unordered` lowering by
adding support for lowring/inlining non-empty `init` and `dealloc`
regions.

Resolves https://github.com/llvm/llvm-project/issues/143897 (actually
handles the todo).
---
 .../Transforms/SimplifyFIROperations.cpp      |  49 ++++---
 ...do-concurrent-localizer-dealloc-region.fir | 126 ++++++++++++++++++
 .../do-concurrent-localizer-init-region.fir   | 102 ++++++++++++++
 3 files changed, 257 insertions(+), 20 deletions(-)
 create mode 100644 flang/test/Transforms/do-concurrent-localizer-dealloc-region.fir
 create mode 100644 flang/test/Transforms/do-concurrent-localizer-init-region.fir

diff --git a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp
index cb9e48cced2a..e440852b3103 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp
@@ -180,41 +180,50 @@ public:
 
       std::optional<mlir::ArrayAttr> localSyms = loop.getLocalSyms();
 
-      for (auto [localVar, localArg, localizerSym] : llvm::zip_equal(
+      for (auto localInfo : llvm::zip_equal(
                loop.getLocalVars(), loop.getRegionLocalArgs(), *localSyms)) {
+        mlir::Value localVar = std::get<0>(localInfo);
+        mlir::BlockArgument localArg = std::get<1>(localInfo);
+        mlir::Attribute localizerSym = std::get<2>(localInfo);
         mlir::SymbolRefAttr localizerName =
             llvm::cast<mlir::SymbolRefAttr>(localizerSym);
         fir::LocalitySpecifierOp localizer = findLocalizer(loop, localizerName);
 
-        if (!localizer.getInitRegion().empty() ||
-            !localizer.getDeallocRegion().empty())
-          TODO(localizer.getLoc(), "localizers with `init` and `dealloc` "
-                                   "regions are not handled yet.");
-
         // TODO Should this be a heap allocation instead? For now, we allocate
         // on the stack for each loop iteration.
         mlir::Value localAlloc =
             rewriter.create<fir::AllocaOp>(loop.getLoc(), localizer.getType());
 
-        if (localizer.getLocalitySpecifierType() ==
-            fir::LocalitySpecifierType::LocalInit) {
+        auto cloneLocalizerRegion = [&](mlir::Region &region,
+                                        mlir::ValueRange regionArgs,
+                                        mlir::Block::iterator insertionPoint) {
           // It is reasonable to make this assumption since, at this stage,
           // control-flow ops are not converted yet. Therefore, things like `if`
           // conditions will still be represented by their encapsulating `fir`
           // dialect ops.
-          assert(localizer.getCopyRegion().hasOneBlock() &&
-                 "Expected localizer to have a single block.");
-          mlir::Block *beforeLocalInit = rewriter.getInsertionBlock();
-          mlir::Block *afterLocalInit = rewriter.splitBlock(
-              rewriter.getInsertionBlock(), rewriter.getInsertionPoint());
-          rewriter.cloneRegionBefore(localizer.getCopyRegion(), afterLocalInit);
-          mlir::Block *copyRegionBody = beforeLocalInit->getNextNode();
+          assert(region.hasOneBlock() &&
+                 "Expected localizer region to have a single block.");
+          mlir::OpBuilder::InsertionGuard guard(rewriter);
+          rewriter.setInsertionPoint(rewriter.getInsertionBlock(),
+                                     insertionPoint);
+          mlir::IRMapping mapper;
+          mapper.map(region.getArguments(), regionArgs);
+          for (mlir::Operation &op : region.front().without_terminator())
+            (void)rewriter.clone(op, mapper);
+        };
 
-          rewriter.eraseOp(copyRegionBody->getTerminator());
-          rewriter.mergeBlocks(afterLocalInit, copyRegionBody);
-          rewriter.mergeBlocks(copyRegionBody, beforeLocalInit,
-                               {localVar, localArg});
-        }
+        if (!localizer.getInitRegion().empty())
+          cloneLocalizerRegion(localizer.getInitRegion(), {localVar, localArg},
+                               rewriter.getInsertionPoint());
+
+        if (localizer.getLocalitySpecifierType() ==
+            fir::LocalitySpecifierType::LocalInit)
+          cloneLocalizerRegion(localizer.getCopyRegion(), {localVar, localArg},
+                               rewriter.getInsertionPoint());
+
+        if (!localizer.getDeallocRegion().empty())
+          cloneLocalizerRegion(localizer.getDeallocRegion(), {localArg},
+                               rewriter.getInsertionBlock()->end());
 
         rewriter.replaceAllUsesWith(localArg, localAlloc);
       }
diff --git a/flang/test/Transforms/do-concurrent-localizer-dealloc-region.fir b/flang/test/Transforms/do-concurrent-localizer-dealloc-region.fir
new file mode 100644
index 000000000000..b59ffdfb34ad
--- /dev/null
+++ b/flang/test/Transforms/do-concurrent-localizer-dealloc-region.fir
@@ -0,0 +1,126 @@
+// Tests converting `fir.local` ops that have `dealloc` regions.
+
+// RUN: fir-opt --split-input-file --simplify-fir-operations %s | FileCheck %s
+
+fir.local {type = local} @_QFlocalizer_with_dealloc_regionEa_private_box_Uxi32 : !fir.box<!fir.array<?xi32>> init {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.array<?xi32>>>, %arg1: !fir.ref<!fir.box<!fir.array<?xi32>>>):
+  %c0 = arith.constant 0 : index
+  %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+  %1:3 = fir.box_dims %0, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+  %2 = fir.shape %1#1 : (index) -> !fir.shape<1>
+  %3 = fir.allocmem !fir.array<?xi32>, %1#1 {bindc_name = ".tmp", uniq_name = ""}
+  %4 = fir.declare %3(%2) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xi32>>
+  %5 = fir.embox %4(%2) : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+  %6 = fir.shape_shift %1#0, %1#1 : (index, index) -> !fir.shapeshift<1>
+  %7 = fir.rebox %5(%6) : (!fir.box<!fir.array<?xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<?xi32>>
+  fir.store %7 to %arg1 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+  fir.yield(%arg1 : !fir.ref<!fir.box<!fir.array<?xi32>>>)
+} dealloc {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.array<?xi32>>>):
+  %c0_i64 = arith.constant 0 : i64
+  %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+  %1 = fir.box_addr %0 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+  %2 = fir.convert %1 : (!fir.ref<!fir.array<?xi32>>) -> i64
+  %3 = arith.cmpi ne, %2, %c0_i64 : i64
+  fir.if %3 {
+    %4 = fir.convert %1 : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+    fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
+  }
+  fir.yield
+}
+
+func.func @_QPlocalizer_with_dealloc_region(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}) {
+  %c42_i32 = arith.constant 42 : i32
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = fir.alloca !fir.box<!fir.array<?xi32>>
+  %1 = fir.dummy_scope : !fir.dscope
+  %2 = fir.declare %arg0 dummy_scope %1 {uniq_name = "_QFlocalizer_with_dealloc_regionEn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+  %3 = fir.load %2 : !fir.ref<i32>
+  %4 = fir.convert %3 : (i32) -> index
+  %5 = arith.cmpi sgt, %4, %c0 : index
+  %6 = arith.select %5, %4, %c0 : index
+  %7 = fir.alloca !fir.array<?xi32>, %6 {bindc_name = "a", uniq_name = "_QFlocalizer_with_dealloc_regionEa"}
+  %8 = fir.shape %6 : (index) -> !fir.shape<1>
+  %9 = fir.declare %7(%8) {uniq_name = "_QFlocalizer_with_dealloc_regionEa"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+  %10 = fir.embox %9(%8) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+  fir.store %10 to %0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+  fir.do_concurrent {
+    %11 = fir.alloca i32 {bindc_name = "i"}
+    %12 = fir.declare %11 {uniq_name = "_QFlocalizer_with_dealloc_regionEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    fir.do_concurrent.loop (%arg1) = (%c1) to (%4) step (%c1) local(@_QFlocalizer_with_dealloc_regionEa_private_box_Uxi32 %0 -> %arg2 : !fir.ref<!fir.box<!fir.array<?xi32>>>) {
+      %13 = fir.convert %arg1 : (index) -> i32
+      fir.store %13 to %12 : !fir.ref<i32>
+      %14 = fir.declare %arg2 {uniq_name = "_QFlocalizer_with_dealloc_regionEa"} : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.array<?xi32>>>
+      %15 = fir.load %14 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+      %16 = fir.load %12 : !fir.ref<i32>
+      %17 = fir.convert %16 : (i32) -> i64
+      %18:3 = fir.box_dims %15, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+      %19 = fir.shift %18#0 : (index) -> !fir.shift<1>
+      %20 = fir.array_coor %15(%19) %17 : (!fir.box<!fir.array<?xi32>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+      fir.store %c42_i32 to %20 : !fir.ref<i32>
+    }
+  }
+  return
+}
+
+// CHECK-LABEL:   func.func @_QPlocalizer_with_dealloc_region(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 42 : i32
+// CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i"}
+// CHECK:           %[[VAL_5:.*]] = fir.declare %[[VAL_4]] {uniq_name = "_QFlocalizer_with_dealloc_regionEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_8:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_7]] {uniq_name = "_QFlocalizer_with_dealloc_regionEn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+// CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
+// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index
+// CHECK:           %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_1]] : index
+// CHECK:           %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_1]] : index
+// CHECK:           %[[VAL_13:.*]] = fir.alloca !fir.array<?xi32>, %[[VAL_12]] {bindc_name = "a", uniq_name = "_QFlocalizer_with_dealloc_regionEa"}
+// CHECK:           %[[VAL_14:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_15:.*]] = fir.declare %[[VAL_13]](%[[VAL_14]]) {uniq_name = "_QFlocalizer_with_dealloc_regionEa"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+// CHECK:           %[[VAL_16:.*]] = fir.embox %[[VAL_15]](%[[VAL_14]]) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           fir.store %[[VAL_16]] to %[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:           fir.do_loop %[[VAL_17:.*]] = %[[VAL_2]] to %[[VAL_10]] step %[[VAL_2]] unordered {
+
+// Local allocation
+// CHECK:             %[[VAL_18:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+
+// `init` region body
+// CHECK:             %[[VAL_19:.*]] = fir.load %[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:             %[[VAL_20:.*]]:3 = fir.box_dims %[[VAL_19]], %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_21:.*]] = fir.shape %[[VAL_20]]#1 : (index) -> !fir.shape<1>
+// CHECK:             %[[VAL_22:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_20]]#1 {bindc_name = ".tmp", uniq_name = ""}
+// CHECK:             %[[VAL_23:.*]] = fir.declare %[[VAL_22]](%[[VAL_21]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xi32>>
+// CHECK:             %[[VAL_24:.*]] = fir.embox %[[VAL_23]](%[[VAL_21]]) : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:             %[[VAL_25:.*]] = fir.shape_shift %[[VAL_20]]#0, %[[VAL_20]]#1 : (index, index) -> !fir.shapeshift<1>
+// CHECK:             %[[VAL_26:.*]] = fir.rebox %[[VAL_24]](%[[VAL_25]]) : (!fir.box<!fir.array<?xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:             fir.store %[[VAL_26]] to %[[VAL_18]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+
+// Loop body
+// CHECK:             %[[VAL_27:.*]] = fir.convert %[[VAL_17]] : (index) -> i32
+// CHECK:             fir.store %[[VAL_27]] to %[[VAL_5]] : !fir.ref<i32>
+// CHECK:             %[[VAL_28:.*]] = fir.declare %[[VAL_18]] {uniq_name = "_QFlocalizer_with_dealloc_regionEa"} : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:             %[[VAL_29:.*]] = fir.load %[[VAL_28]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:             %[[VAL_30:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+// CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i32) -> i64
+// CHECK:             %[[VAL_32:.*]]:3 = fir.box_dims %[[VAL_29]], %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_33:.*]] = fir.shift %[[VAL_32]]#0 : (index) -> !fir.shift<1>
+// CHECK:             %[[VAL_34:.*]] = fir.array_coor %[[VAL_29]](%[[VAL_33]]) %[[VAL_31]] : (!fir.box<!fir.array<?xi32>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+// CHECK:             fir.store %[[VAL_3]] to %[[VAL_34]] : !fir.ref<i32>
+
+// `dealloc` region
+// CHECK:             %[[VAL_35:.*]] = fir.load %[[VAL_18]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:             %[[VAL_36:.*]] = fir.box_addr %[[VAL_35]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+// CHECK:             %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (!fir.ref<!fir.array<?xi32>>) -> i64
+// CHECK:             %[[VAL_38:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_0]] : i64
+// CHECK:             fir.if %[[VAL_38]] {
+// CHECK:               %[[VAL_39:.*]] = fir.convert %[[VAL_36]] : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+// CHECK:               fir.freemem %[[VAL_39]] : !fir.heap<!fir.array<?xi32>>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
diff --git a/flang/test/Transforms/do-concurrent-localizer-init-region.fir b/flang/test/Transforms/do-concurrent-localizer-init-region.fir
new file mode 100644
index 000000000000..ebb56aec278f
--- /dev/null
+++ b/flang/test/Transforms/do-concurrent-localizer-init-region.fir
@@ -0,0 +1,102 @@
+// Tests converting `fir.local` ops that have `init` regions.
+
+// RUN: fir-opt --split-input-file --simplify-fir-operations %s | FileCheck %s
+
+fir.local {type = local_init} @_QFlocalizer_with_init_regionEp_firstprivate_box_ptr_Uxi32 : !fir.box<!fir.ptr<!fir.array<?xi32>>> init {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, %arg1: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
+  %c0 = arith.constant 0 : index
+  %0 = fir.shape %c0 : (index) -> !fir.shape<1>
+  %1 = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+  %2 = fir.embox %1(%0) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+  fir.store %2 to %arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  fir.yield(%arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+} copy {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, %arg1: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
+  %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  fir.store %0 to %arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  fir.yield(%arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+}
+
+func.func @_QPlocalizer_with_init_region() {
+  %c42_i32 = arith.constant 42 : i32
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFlocalizer_with_init_regionEn"}
+  %2 = fir.declare %1 {uniq_name = "_QFlocalizer_with_init_regionEn"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "p", uniq_name = "_QFlocalizer_with_init_regionEp"}
+  %4 = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+  %5 = fir.shape %c0 : (index) -> !fir.shape<1>
+  %6 = fir.embox %4(%5) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+  fir.store %6 to %3 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  %7 = fir.declare %3 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFlocalizer_with_init_regionEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  %8 = fir.load %2 : !fir.ref<i32>
+  %9 = fir.convert %8 : (i32) -> index
+
+  fir.do_concurrent {
+    %10 = fir.alloca i32 {bindc_name = "i"}
+    %11 = fir.declare %10 {uniq_name = "_QFlocalizer_with_init_regionEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    fir.do_concurrent.loop (%arg0) = (%c1) to (%9) step (%c1) local(@_QFlocalizer_with_init_regionEp_firstprivate_box_ptr_Uxi32 %7 -> %arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) {
+      %12 = fir.convert %arg0 : (index) -> i32
+      fir.store %12 to %11 : !fir.ref<i32>
+      %13 = fir.declare %arg1 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFlocalizer_with_init_regionEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      %14 = fir.load %13 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      %15 = fir.load %11 : !fir.ref<i32>
+      %16 = fir.convert %15 : (i32) -> i64
+      %17:3 = fir.box_dims %14, %c0 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+      %18 = fir.shift %17#0 : (index) -> !fir.shift<1>
+      %19 = fir.array_coor %14(%18) %16 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+      fir.store %c42_i32 to %19 : !fir.ref<i32>
+    }
+  }
+
+  return
+}
+
+// CHECK-LABEL:   func.func @_QPlocalizer_with_init_region() {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 42 : i32
+// CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i"}
+// CHECK:           %[[VAL_4:.*]] = fir.declare %[[VAL_3]] {uniq_name = "_QFlocalizer_with_init_regionEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFlocalizer_with_init_regionEn"}
+// CHECK:           %[[VAL_7:.*]] = fir.declare %[[VAL_6]] {uniq_name = "_QFlocalizer_with_init_regionEn"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "p", uniq_name = "_QFlocalizer_with_init_regionEp"}
+// CHECK:           %[[VAL_9:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+// CHECK:           %[[VAL_10:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_11:.*]] = fir.embox %[[VAL_9]](%[[VAL_10]]) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:           fir.store %[[VAL_11]] to %[[VAL_8]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:           %[[VAL_12:.*]] = fir.declare %[[VAL_8]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFlocalizer_with_init_regionEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:           %[[VAL_13:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+// CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> index
+// CHECK:           fir.do_loop %[[VAL_15:.*]] = %[[VAL_1]] to %[[VAL_14]] step %[[VAL_1]] unordered {
+
+// Local allocation
+// CHECK:             %[[VAL_16:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+
+// `init` region body
+// CHECK:             %[[VAL_17:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1>
+// CHECK:             %[[VAL_18:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+// CHECK:             %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_17]]) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:             fir.store %[[VAL_19]] to %[[VAL_16]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+
+// `copy` region body
+// CHECK:             %[[VAL_20:.*]] = fir.load %[[VAL_12]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:             fir.store %[[VAL_20]] to %[[VAL_16]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+
+// loop body
+// CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_15]] : (index) -> i32
+// CHECK:             fir.store %[[VAL_21]] to %[[VAL_4]] : !fir.ref<i32>
+// CHECK:             %[[VAL_22:.*]] = fir.declare %[[VAL_16]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFlocalizer_with_init_regionEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:             %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:             %[[VAL_24:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
+// CHECK:             %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> i64
+// CHECK:             %[[VAL_26:.*]]:3 = fir.box_dims %[[VAL_23]], %[[VAL_0]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_27:.*]] = fir.shift %[[VAL_26]]#0 : (index) -> !fir.shift<1>
+// CHECK:             %[[VAL_28:.*]] = fir.array_coor %[[VAL_23]](%[[VAL_27]]) %[[VAL_25]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+// CHECK:             fir.store %[[VAL_2]] to %[[VAL_28]] : !fir.ref<i32>
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+

From ea73fc5f079d1849ca3bed902e598191105a95dc Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Fri, 13 Jun 2025 09:38:54 -0400
Subject: [PATCH 0277/1322] [PowerPC] fixed mtvsrbmi.ll test case error caused
 by run the update_llc_test_checks.py (#144075)

fixed mtvsrbmi.ll test case error which caused by run the
update_llc_test_checks.py
---
 llvm/test/CodeGen/PowerPC/mtvsrbmi.ll | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
index 7ed57c300ec7..5486dc02faf9 100644
--- a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
+++ b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
@@ -10,15 +10,6 @@
 ; RUN:   | FileCheck %s --check-prefix=CHECK
 
 define dso_local noundef range(i8 -1, 1) <16 x i8> @_Z5v00FFv() {
-; CHECK-LABEL: _Z5v00FFv:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lwz r3, L..C0(r2) # %const.0
-; CHECK-NEXT:    lxv vs34, 0(r3)
-; CHECK-NEXT:    blr
-entry:
-  ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
-}
-
 ; CHECK:      L..CPI0_0:
 ; CHECK-NEXT:   .byte   255                             # 0xff
 ; CHECK-NEXT:   .byte   0                               # 0x0
@@ -37,8 +28,11 @@ entry:
 ; CHECK-NEXT:   .byte   0                               # 0x0
 ; CHECK-NEXT:   .byte   0                               # 0x0
 
-; CHECK:      ._Z5v00FFv:
-; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT:   lwz r3, L..C0(r2)
-; CHECK-NEXT:   lxv vs34, 0(r3)
-; CHECK-NEXT:   blr
+; CHECK-LABEL: _Z5v00FFv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lwz r3, L..C0(r2) # %const.0
+; CHECK-NEXT:    lxv vs34, 0(r3)
+; CHECK-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}

From c3ec9e3f6553b43caf2b9d754f128abbf44cf80e Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Fri, 13 Jun 2025 14:40:27 +0100
Subject: [PATCH 0278/1322] [lldb][DWARF] Don't try to compute address range
 information of forward declarations (#144059)

This fixes the error reported in
https://github.com/llvm/llvm-project/pull/144037.

When computing the aranges table of a CU, LLDB would currently visit all
`DW_TAG_subprogram` DIEs and check their
`DW_AT_low_pc`/`DW_AT_high_pc`/`DW_AT_ranges` attributes. If those don't
exist it would error out and spam the console. Some subprograms
(particularly forward declarations) don't have low/high pc attributes,
so it's not really an "error". See DWARFv5 spec section `3.3.3
Subroutine and Entry Point Locations`:
```
A subroutine entry may have either a DW_AT_low_pc and DW_AT_high_pc
pair of attributes or a DW_AT_ranges attribute whose values encode the
contiguous or non-contiguous address ranges, respectively, of the machine
instructions generated for the subroutine (see Section 2.17 on page 51).
...
A subroutine entry representing a subroutine declaration that is not also a
definition does not have code address or range attributes.
```

We should just ignore those DIEs.
---
 .../source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
index 5196ce89a2c1..8217c85f8601 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
@@ -611,7 +611,11 @@ void DWARFDebugInfoEntry::BuildFunctionAddressRangeTable(
     DWARFUnit *cu, DWARFDebugAranges *debug_aranges) const {
   Log *log = GetLog(DWARFLog::DebugInfo);
   if (m_tag) {
-    if (m_tag == DW_TAG_subprogram) {
+    // Subprogram forward declarations don't have
+    // DW_AT_ranges/DW_AT_low_pc/DW_AT_high_pc attributes, so don't even try
+    // getting address range information for them.
+    if (m_tag == DW_TAG_subprogram &&
+        !GetAttributeValueAsOptionalUnsigned(cu, DW_AT_declaration)) {
       if (llvm::Expected<llvm::DWARFAddressRangesVector> ranges =
               GetAttributeAddressRanges(cu, /*check_hi_lo_pc=*/true)) {
         for (const auto &r : *ranges)

From 6f999a5d99e5cb21520d8a7878ed0d3a32971af6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 14:52:17 +0100
Subject: [PATCH 0279/1322] [x86] vector-pcmp.ll - regenerate VPTERNLOGD asm
 comment

---
 llvm/test/CodeGen/X86/vector-pcmp.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index 5b43acbe5237..30eb2279bda8 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -1749,7 +1749,7 @@ define <16 x i1> @is_positive_mask_v16i16_v16i1(<16 x i16> %x, <16 x i1> %y) {
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq

From a361a3dc7a12b776507f48035f245e764c45455d Mon Sep 17 00:00:00 2001
From: Yash Solanki <67216443+yashnator@users.noreply.github.com>
Date: Fri, 13 Jun 2025 19:23:34 +0530
Subject: [PATCH 0280/1322] [llvm][InstCombine] Fold select to cmp for weak and
 inverted inequalities (#143445)

---
 .../InstCombine/InstCombineSelect.cpp         |  22 ++
 .../Transforms/InstCombine/select-to-cmp.ll   | 293 ++++++++++++++++++
 2 files changed, 315 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/select-to-cmp.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 979a803a79ed..320b827bdbe8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3631,6 +3631,28 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) {
   if (!LHS->getType()->isIntOrIntVectorTy())
     return nullptr;
 
+  // If there is no -1, 0 or 1 at TV, then invert the select statement and try
+  // to canonicalize to one of the forms above
+  if (!isa<Constant>(TV)) {
+    if (!isa<Constant>(FV))
+      return nullptr;
+    Pred = ICmpInst::getInverseCmpPredicate(Pred);
+    std::swap(TV, FV);
+  }
+
+  if (ICmpInst::isNonStrictPredicate(Pred)) {
+    if (Constant *C = dyn_cast<Constant>(RHS)) {
+      auto FlippedPredAndConst =
+          getFlippedStrictnessPredicateAndConstant(Pred, C);
+      if (!FlippedPredAndConst)
+        return nullptr;
+      Pred = FlippedPredAndConst->first;
+      RHS = FlippedPredAndConst->second;
+    } else {
+      return nullptr;
+    }
+  }
+
   // Try to swap operands and the predicate. We need to be careful when doing
   // so because two of the patterns have opposite predicates, so use the
   // constant inside select to determine if swapping operands would be
diff --git a/llvm/test/Transforms/InstCombine/select-to-cmp.ll b/llvm/test/Transforms/InstCombine/select-to-cmp.ll
new file mode 100644
index 000000000000..a76d4b138686
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-to-cmp.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+; Tests for select to scmp
+
+define i32 @scmp_x_0_inverted(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 0)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %2 = icmp ne i32 %x, 0
+  %3 = zext i1 %2 to i32
+  %4 = icmp sgt i32 %x, -1
+  %5 = select i1 %4, i32 %3, i32 -1
+  ret i32 %5
+}
+
+; y = -10
+define i32 @scmp_x_0_inverted_const_neg10(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_const_neg10(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 -10)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp ne i32 %x, -10
+  %2 = zext i1 %1 to i32
+  %3 = icmp sgt i32 %x, -11
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; y = 7 (i8)
+define i8 @scmp_x_0_inverted_i8(i8 %x) {
+; CHECK-LABEL: define i8 @scmp_x_0_inverted_i8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[X]], i8 7)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = icmp ne i8 %x, 7
+  %2 = zext i1 %1 to i8
+  %3 = icmp sgt i8 %x, 6
+  %4 = select i1 %3, i8 %2, i8 -1
+  ret i8 %4
+}
+
+; scmp using ints of two kinds- i32 and i64
+define i32 @scmp_x_0_inverted_i64_neq(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_i64_neq(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = call i64 @llvm.scmp.i64.i32(i32 [[X]], i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = trunc i64 [[SEL]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %x64 = sext i32 %x to i64
+  %cmp1 = icmp ne i64 %x64, 0
+  %zext = zext i1 %cmp1 to i64
+  %cmp2 = icmp sgt i64 %x64, -1
+  %sel = select i1 %cmp2, i64 %zext, i64 -1
+  %ret = trunc i64 %sel to i32
+  ret i32 %ret
+}
+
+; Same example as previous but with inequality
+define i32 @scmp_x_0_inverted_i64_sgt(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_i64_sgt(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = call i64 @llvm.scmp.i64.i32(i32 [[X]], i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = trunc i64 [[SEL]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %x64 = sext i32 %x to i64
+  %cmp1 = icmp sgt i64 %x64, 0
+  %zext = zext i1 %cmp1 to i64
+  %cmp2 = icmp sgt i64 %x64, -1
+  %sel = select i1 %cmp2, i64 %zext, i64 -1
+  %ret = trunc i64 %sel to i32
+  ret i32 %ret
+}
+
+; y = -1000
+define i32 @scmp_x_0_inverted_const_neg1000(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_const_neg1000(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 -1000)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp sgt i32 %x, -1000
+  %2 = zext i1 %1 to i32
+  %3 = icmp sgt i32 %x, -1001
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; y = 1729
+define i32 @scmp_x_0_inverted_const_1729_sgt(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_const_1729_sgt(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 1729)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp sgt i32 %x, 1729
+  %2 = zext i1 %1 to i32
+  %3 = icmp sgt i32 %x, 1728
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; ucmp with 10
+define i32 @ucmp_x_10_inverted(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_10_inverted(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[X]], i32 10)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp ne i32 %x, 10
+  %2 = zext i1 %1 to i32
+  %3 = icmp ugt i32 %x, 9
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; ucmp with -3, wraps around
+define i32 @ucmp_x_neg1_inverted(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_neg1_inverted(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[X]], i32 -3)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp ne i32 %x, -3
+  %2 = zext i1 %1 to i32
+  %3 = icmp ugt i32 %x, -4
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; ucmp with -4, wraps around
+define i8 @ucmp_x_neg4_i8_ugt(i8 %x) {
+; CHECK-LABEL: define i8 @ucmp_x_neg4_i8_ugt(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.ucmp.i8.i8(i8 [[X]], i8 -4)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = icmp ugt i8 %x, -4
+  %2 = zext i1 %1 to i8
+  %3 = icmp ugt i8 %x, -5
+  %4 = select i1 %3, i8 %2, i8 -1
+  ret i8 %4
+}
+
+; Vector tests
+
+; Test with splat vec
+define <4 x i32> @scmp_x_0_inverted_splat_vec(<4 x i32> %x) {
+; CHECK-LABEL: define <4 x i32> @scmp_x_0_inverted_splat_vec(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> [[X]], <4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %2 = icmp ne <4 x i32> %x, zeroinitializer
+  %3 = zext <4 x i1> %2 to <4 x i32>
+  %4 = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %5
+}
+
+; Test with non-splat vector and different bitwidth
+define <4 x i32> @non_splat_vec_scmp_diff_bitwidth(<4 x i32> %x) {
+; CHECK-LABEL: define <4 x i32> @non_splat_vec_scmp_diff_bitwidth(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = call <4 x i64> @llvm.scmp.v4i64.v4i32(<4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 -1, i32 5>)
+; CHECK-NEXT:    [[RET:%.*]] = trunc <4 x i64> [[SEL]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[RET]]
+;
+  %x64 = sext <4 x i32> %x to <4 x i64>
+  %cmp1 = icmp slt <4 x i64> %x64, <i64 0, i64 1, i64 -1, i64 5>
+  %sext = sext <4 x i1> %cmp1 to <4 x i64>
+  %cmp2 = icmp slt <4 x i64> %x64, <i64 1, i64 2, i64 0, i64 6>
+  %sel = select <4 x i1> %cmp2, <4 x i64> %sext, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
+  %ret = trunc <4 x i64> %sel to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Negative examples
+
+; Not scmp due to wrong RHS of the predicate
+define i32 @scmp_ne_0(i32 %0) {
+; CHECK-LABEL: define i32 @scmp_ne_0(
+; CHECK-SAME: i32 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 -1
+; CHECK-NEXT:    ret i32 [[TMP5]]
+;
+  %2 = icmp ne i32 %0, 0
+  %3 = zext i1 %2 to i32
+  %4 = icmp sgt i32 %0, 1
+  %5 = select i1 %4, i32 %3, i32 -1
+  ret i32 %5
+}
+
+; y = 0 with unsigned compare but RHS wraps
+define i32 @ucmp_x_0_inverted(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_0_inverted(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    ret i32 -1
+;
+  %1 = icmp ne i32 %x, 0
+  %2 = zext i1 %1 to i32
+  %3 = icmp ugt i32 %x, -1
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; Don't fold with INT32_MIN
+define i32 @scmp_x_0_inverted_const_min(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_const_min(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = icmp ne i32 %x, -2147483648
+  %2 = zext i1 %1 to i32
+  %3 = icmp sge i32 %x, -2147483648
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; Unsigned cmp of zext of i32 with i64 -1 should always be -1
+define i32 @ucmp_x_0_inverted_i64_ugt(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_0_inverted_i64_ugt(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    ret i32 -1
+;
+  %x64 = zext i32 %x to i64
+  %cmp1 = icmp ugt i64 %x64, 0
+  %zext = zext i1 %cmp1 to i64
+  %cmp2 = icmp ugt i64 %x64, -1
+  %sel = select i1 %cmp2, i64 %zext, i64 -1
+  %ret = trunc i64 %sel to i32
+  ret i32 %ret
+}
+
+; y = 4294967295 (UINT32_MAX), simply sign extend neq
+define i32 @ucmp_x_const_u32max(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_const_u32max(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 [[X]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i1 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = icmp ugt i32 %x, 4294967295
+  %2 = zext i1 %1 to i32
+  %3 = icmp ugt i32 %x, 4294967294
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; Don't fold with different signedness
+define i32 @different_signedness_neg(i32 %x) {
+; CHECK-LABEL: define i32 @different_signedness_neg(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[X]], -10
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[X]], -11
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 -1
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %1 = icmp ugt i32 %x, -10
+  %2 = zext i1 %1 to i32
+  %3 = icmp sgt i32 %x, -11
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; Test with wrong false value
+define <4 x i32> @scmp_x_0_inverted_vec(<4 x i32> %x) {
+; CHECK-LABEL: define <4 x i32> @scmp_x_0_inverted_vec(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[X]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP2]], <4 x i32> <i32 -1, i32 -2, i32 -1, i32 -1>
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %2 = icmp ne <4 x i32> %x, zeroinitializer
+  %3 = zext <4 x i1> %2 to <4 x i32>
+  %4 = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -1, i32 -2, i32 -1, i32 -1>
+  ret <4 x i32> %5
+}

From 8b11de70681355d7e7a4f8f3da85afa31fa7fc74 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter@amd.com>
Date: Fri, 13 Jun 2025 15:59:58 +0200
Subject: [PATCH 0281/1322] [AMDGPU][SDAG] Initial support for ISD::PTRADD
 (#141725)

Enable generation of PTRADD SelectionDAG nodes for pointer arithmetic for SI,
for now behind an internal CLI option. Also add basic patterns to match these
nodes. Optimizations will come in follow-up PRs. Basic tests for SDAG codegen
with PTRADD are in test/CodeGen/AMDGPU/ptradd-sdag.ll

Only affects 64-bit address spaces for now, since the immediate use case only
affects the flat address space.

For SWDEV-516125.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  13 +
 llvm/lib/Target/AMDGPU/SIISelLowering.h   |   2 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  14 +
 llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll   | 537 ++++++++++++++++++++++
 4 files changed, 566 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 53dc540cbd63..30535ae88f7b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -61,6 +61,14 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
     cl::desc("Use indirect register addressing for divergent indexes"),
     cl::init(false));
 
+// TODO: This option should be removed once we switch to always using PTRADD in
+// the SelectionDAG.
+static cl::opt<bool> UseSelectionDAGPTRADD(
+    "amdgpu-use-sdag-ptradd", cl::Hidden,
+    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
+             "SelectionDAG ISel"),
+    cl::init(false));
+
 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
@@ -10457,6 +10465,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   }
 }
 
+bool SITargetLowering::shouldPreservePtrArith(const Function &F,
+                                              EVT PtrVT) const {
+  return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
+}
+
 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
 // offset (the offset that is included in bounds checking and swizzling, to be
 // split between the instruction's voffset and immoffset fields) and soffset
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index e14611d99964..d71a22722129 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -260,6 +260,8 @@ public:
 
   bool shouldExpandVectorDynExt(SDNode *N) const;
 
+  bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
+
 private:
   // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
   // the three offsets (voffset, soffset and instoffset) into the SDValue[3]
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 360fd05cb3d9..1419f63202a7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1376,6 +1376,20 @@ def : GCNPat <
       (i32 (V_MOV_B32_e32 (i32 0))), sub1)
 >;
 
+//===----------------------------------------------------------------------===//
+// PTRADD Patterns
+//===----------------------------------------------------------------------===//
+
+// GlobalISel shouldn't generate 64-bit addition pseudos.
+let GISelShouldIgnore = 1 in {
+def : GCNPat<
+  (DivergentBinFrag<ptradd> i64:$src0, i64:$src1),
+  (V_ADD_U64_PSEUDO $src0, $src1)>;
+def : GCNPat<
+  (UniformBinFrag<ptradd> i64:$src0, i64:$src1),
+  (S_ADD_U64_PSEUDO $src0, $src1)>;
+}
+
 /********** ============================================ **********/
 /********** Extraction, Insertion, Building and Casting  **********/
 /********** ============================================ **********/
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
new file mode 100644
index 000000000000..653d4b85a9a5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
@@ -0,0 +1,537 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_LEGACY
+
+; Tests for the ISD::PTRADD SelectionDAG opcode. This only tests 64-bit address
+; spaces since PTRADD is currently only used for these.
+
+; Check that basic pointer arithmetic can be lowered.
+define ptr @gep_as0(ptr %p, i64 %offset) {
+; GFX8-LABEL: gep_as0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942_PTRADD-LABEL: gep_as0:
+; GFX942_PTRADD:       ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: gep_as0:
+; GFX942_LEGACY:       ; %bb.0: ; %entry
+; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: gep_as0:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: gep_as0:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: gep_as0:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep1 = getelementptr inbounds i32, ptr %p, i64 %offset
+  %gep2 = getelementptr inbounds i8, ptr %gep1, i64 5
+  ret ptr %gep2
+}
+
+define amdgpu_kernel void @gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) {
+; GFX8-LABEL: gep_as0_uniform:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT:    s_add_i32 s12, s12, s17
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX8-NEXT:    s_add_u32 s0, s0, s2
+; GFX8-NEXT:    s_addc_u32 s1, s1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    flat_store_dword v[2:3], v5
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_endpgm
+;
+; GFX942-LABEL: gep_as0_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, s1, s3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: gep_as0_uniform:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_add_u32 s0, s0, s2
+; GFX10-NEXT:    s_addc_u32 s1, s1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: gep_as0_uniform:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    flat_store_b64 v[0:1], v[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: gep_as0_uniform:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_store_b64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr inbounds i32, ptr %p, i64 %offset
+  store ptr %gep, ptr %ret
+  ret void
+}
+
+; Check that pointer arithmetic with multiple indexing steps can be lowered.
+define ptr @multi_gep_as0(ptr %p, i64 %offset) {
+; GFX8-LABEL: multi_gep_as0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942_PTRADD-LABEL: multi_gep_as0:
+; GFX942_PTRADD:       ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: multi_gep_as0:
+; GFX942_LEGACY:       ; %bb.0: ; %entry
+; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: multi_gep_as0:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: multi_gep_as0:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: multi_gep_as0:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep1 = getelementptr inbounds i32, ptr %p, i64 %offset
+  %gep2 = getelementptr inbounds i8, ptr %gep1, i64 5
+  ret ptr %gep2
+}
+
+define amdgpu_kernel void @multi_gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) {
+; GFX8-LABEL: multi_gep_as0_uniform:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT:    s_add_i32 s12, s12, s17
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX8-NEXT:    s_add_u32 s0, s0, s2
+; GFX8-NEXT:    s_addc_u32 s1, s1, s3
+; GFX8-NEXT:    s_add_u32 s0, s0, 5
+; GFX8-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    flat_store_dword v[2:3], v5
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_endpgm
+;
+; GFX942-LABEL: multi_gep_as0_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, s1, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, 5
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: multi_gep_as0_uniform:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_add_u32 s0, s0, s2
+; GFX10-NEXT:    s_addc_u32 s1, s1, s3
+; GFX10-NEXT:    s_add_u32 s0, s0, 5
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: multi_gep_as0_uniform:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_add_u32 s0, s0, 5
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    flat_store_b64 v[0:1], v[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: multi_gep_as0_uniform:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 5
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_store_b64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_endpgm
+entry:
+  %gep1 = getelementptr inbounds i32, ptr %p, i64 %offset
+  %gep2 = getelementptr inbounds i8, ptr %gep1, i64 5
+  store ptr %gep2, ptr %ret
+  ret void
+}
+
+; Check that constant offsets are folded into memory instructions.
+
+define void @fold_as0(ptr %from, ptr %to) {
+; GFX8-LABEL: fold_as0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: fold_as0:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_load_dword v0, v[0:1] offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_store_dword v[2:3], v0 offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fold_as0:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v0, v[0:1] offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_store_dword v[2:3], v0 offset:8
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fold_as0:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v0, v[0:1] offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_store_b32 v[2:3], v0 offset:8
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fold_as0:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v0, v[0:1] offset:8
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[2:3], v0 offset:8
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep.from = getelementptr inbounds i8, ptr %from, i64 8
+  %val = load i32, ptr %gep.from, align 4
+  %gep.to = getelementptr inbounds i8, ptr %to, i64 8
+  store i32 %val, ptr %gep.to, align 4
+  ret void
+}
+
+define void @fold_as1(ptr addrspace(1) %from, ptr addrspace(1) %to) {
+; GFX8-LABEL: fold_as1:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: fold_as1:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v0, v[0:1], off offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_store_dword v[2:3], v0, off offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fold_as1:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off offset:8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fold_as1:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off offset:8
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fold_as1:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:8
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[2:3], v0, off offset:8
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep.from = getelementptr inbounds i8, ptr addrspace(1) %from, i64 8
+  %val = load i32, ptr addrspace(1) %gep.from, align 4
+  %gep.to = getelementptr inbounds i8, ptr addrspace(1) %to, i64 8
+  store i32 %val, ptr addrspace(1) %gep.to, align 4
+  ret void
+}
+
+define void @fold_as4(ptr addrspace(4) %from, ptr addrspace(1) %to) {
+; GFX8-LABEL: fold_as4:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: fold_as4:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v0, v[0:1], off offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_store_dword v[2:3], v0, off offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fold_as4:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off offset:8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fold_as4:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off offset:8
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fold_as4:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:8
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[2:3], v0, off offset:8
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep.from = getelementptr inbounds i8, ptr addrspace(4) %from, i64 8
+  %val = load i32, ptr addrspace(4) %gep.from, align 4
+  %gep.to = getelementptr inbounds i8, ptr addrspace(1) %to, i64 8
+  store i32 %val, ptr addrspace(1) %gep.to, align 4
+  ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10_LEGACY: {{.*}}
+; GFX10_PTRADD: {{.*}}
+; GFX11_LEGACY: {{.*}}
+; GFX11_PTRADD: {{.*}}
+; GFX12_LEGACY: {{.*}}
+; GFX12_PTRADD: {{.*}}
+; GFX8_LEGACY: {{.*}}
+; GFX8_PTRADD: {{.*}}

From 0a0960dac69fc88a3c8bd5e2099f8d45b0292c78 Mon Sep 17 00:00:00 2001
From: Darren Wihandi <65404740+fairywreath@users.noreply.github.com>
Date: Fri, 13 Jun 2025 10:14:45 -0400
Subject: [PATCH 0282/1322] [mlir][spirv] Add bfloat16 support (#141458)

Adds bf16 support to SPIRV by using the `SPV_KHR_bfloat16` extension.
Only a few operations are supported, including loading from and storing
to memory, conversion to/from other types, cooperative matrix operations
(including coop matrix arithmetic ops) and dot product support.

This PR adds the type definition and implements the basic cast
operations. Arithmetic/coop matrix ops will be added in a separate PR.
---
 .../mlir/Dialect/SPIRV/IR/SPIRVBase.td        | 41 ++++++++++--
 .../mlir/Dialect/SPIRV/IR/SPIRVCastOps.td     | 12 ++--
 mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp    |  5 +-
 mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp      | 18 +++++-
 .../SPIRV/Deserialization/Deserializer.cpp    | 27 ++++++--
 .../Target/SPIRV/Serialization/Serializer.cpp | 11 +++-
 .../FuncToSPIRV/types-to-spirv.mlir           | 18 ++----
 .../test/Dialect/SPIRV/IR/arithmetic-ops.mlir | 64 +++++++++++++++++++
 mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir    |  8 +++
 mlir/test/Dialect/SPIRV/IR/cast-ops.mlir      | 56 ++++++++++++++++
 mlir/test/Dialect/SPIRV/IR/composite-ops.mlir |  7 ++
 mlir/test/Dialect/SPIRV/IR/gl-ops.mlir        | 17 +++++
 mlir/test/Dialect/SPIRV/IR/logical-ops.mlir   |  8 +++
 .../Dialect/SPIRV/IR/non-uniform-ops.mlir     | 16 +++++
 mlir/test/Dialect/SPIRV/IR/types.mlir         |  8 +--
 .../SPIRV/Transforms/vce-deduction.mlir       | 14 ++++
 mlir/test/Target/SPIRV/cast-ops.mlir          | 32 +++++++++-
 mlir/test/Target/SPIRV/logical-ops.mlir       | 23 +++++++
 18 files changed, 343 insertions(+), 42 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
index b143cf9a5f50..e413503bbd67 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
@@ -344,6 +344,7 @@ def SPV_KHR_subgroup_rotate                  : I32EnumAttrCase<"SPV_KHR_subgroup
 def SPV_KHR_non_semantic_info                : I32EnumAttrCase<"SPV_KHR_non_semantic_info", 29>;
 def SPV_KHR_terminate_invocation             : I32EnumAttrCase<"SPV_KHR_terminate_invocation", 30>;
 def SPV_KHR_cooperative_matrix               : I32EnumAttrCase<"SPV_KHR_cooperative_matrix", 31>;
+def SPV_KHR_bfloat16                         : I32EnumAttrCase<"SPV_KHR_bfloat16", 32>;
 
 def SPV_EXT_demote_to_helper_invocation  : I32EnumAttrCase<"SPV_EXT_demote_to_helper_invocation", 1000>;
 def SPV_EXT_descriptor_indexing          : I32EnumAttrCase<"SPV_EXT_descriptor_indexing", 1001>;
@@ -436,7 +437,7 @@ def SPIRV_ExtensionAttr :
       SPV_KHR_fragment_shader_barycentric, SPV_KHR_ray_cull_mask,
       SPV_KHR_uniform_group_instructions, SPV_KHR_subgroup_rotate,
       SPV_KHR_non_semantic_info, SPV_KHR_terminate_invocation,
-      SPV_KHR_cooperative_matrix,
+      SPV_KHR_cooperative_matrix, SPV_KHR_bfloat16,
       SPV_EXT_demote_to_helper_invocation, SPV_EXT_descriptor_indexing,
       SPV_EXT_fragment_fully_covered, SPV_EXT_fragment_invocation_density,
       SPV_EXT_fragment_shader_interlock, SPV_EXT_physical_storage_buffer,
@@ -1412,6 +1413,23 @@ def SPIRV_C_ShaderStereoViewNV                          : I32EnumAttrCase<"Shade
     Extension<[SPV_NV_stereo_view_rendering]>
   ];
 }
+def SPIRV_C_BFloat16TypeKHR                             : I32EnumAttrCase<"BFloat16TypeKHR", 5116> {
+  list<Availability> availability = [
+    Extension<[SPV_KHR_bfloat16]>
+  ];
+}
+def SPIRV_C_BFloat16DotProductKHR                       : I32EnumAttrCase<"BFloat16DotProductKHR", 5117> {
+  list<I32EnumAttrCase> implies = [SPIRV_C_BFloat16TypeKHR];
+  list<Availability> availability = [
+    Extension<[SPV_KHR_bfloat16]>
+  ];
+}
+def SPIRV_C_BFloat16CooperativeMatrixKHR                : I32EnumAttrCase<"BFloat16CooperativeMatrixKHR", 5118> {
+  list<I32EnumAttrCase> implies = [SPIRV_C_BFloat16TypeKHR, SPIRV_C_CooperativeMatrixKHR];
+  list<Availability> availability = [
+    Extension<[SPV_KHR_bfloat16]>
+  ];
+}
 
 def SPIRV_C_Bfloat16ConversionINTEL                         : I32EnumAttrCase<"Bfloat16ConversionINTEL", 6115> {
   list<Availability> availability = [
@@ -1518,7 +1536,8 @@ def SPIRV_CapabilityAttr :
       SPIRV_C_StorageTexelBufferArrayNonUniformIndexing,
       SPIRV_C_ShaderViewportIndexLayerEXT, SPIRV_C_ShaderViewportMaskNV,
       SPIRV_C_ShaderStereoViewNV, SPIRV_C_Bfloat16ConversionINTEL,
-      SPIRV_C_CacheControlsINTEL
+      SPIRV_C_CacheControlsINTEL, SPIRV_C_BFloat16TypeKHR,
+      SPIRV_C_BFloat16DotProductKHR, SPIRV_C_BFloat16CooperativeMatrixKHR
     ]>;
 
 def SPIRV_AM_Logical                 : I32EnumAttrCase<"Logical", 0>;
@@ -3217,6 +3236,16 @@ def SPIRV_ExecutionModelAttr :
       SPIRV_EM_TaskEXT, SPIRV_EM_MeshEXT
     ]>;
 
+def SPIRV_FPE_BFloat16KHR : I32EnumAttrCase<"BFloat16KHR", 0> {
+  list<Availability> availability = [
+    Capability<[SPIRV_C_BFloat16TypeKHR]>
+  ];
+}
+def SPIRV_FPEncodingAttr :
+    SPIRV_I32EnumAttr<"FPEncoding", "valid SPIR-V FPEncoding", "f_p_encoding", [
+      SPIRV_FPE_BFloat16KHR
+    ]>;
+
 def SPIRV_FC_None         : I32BitEnumAttrCaseNone<"None">;
 def SPIRV_FC_Inline       : I32BitEnumAttrCaseBit<"Inline", 0>;
 def SPIRV_FC_DontInline   : I32BitEnumAttrCaseBit<"DontInline", 1>;
@@ -4161,10 +4190,12 @@ def SPIRV_Integer : AnyIntOfWidths<[8, 16, 32, 64]>;
 def SPIRV_Int16 : TypeAlias<I16, "Int16">;
 def SPIRV_Int32 : TypeAlias<I32, "Int32">;
 def SPIRV_Float32 : TypeAlias<F32, "Float32">;
+def SPIRV_BFloat16KHR : TypeAlias<BF16, "BFloat16">;
 def SPIRV_Float : FloatOfWidths<[16, 32, 64]>;
 def SPIRV_Float16or32 : FloatOfWidths<[16, 32]>;
+def SPIRV_AnyFloat : AnyTypeOf<[SPIRV_Float, SPIRV_BFloat16KHR]>;
 def SPIRV_Vector : VectorOfLengthAndType<[2, 3, 4, 8, 16],
-                                       [SPIRV_Bool, SPIRV_Integer, SPIRV_Float]>;
+                                       [SPIRV_Bool, SPIRV_Integer, SPIRV_AnyFloat]>;
 // Component type check is done in the type parser for the following SPIR-V
 // dialect-specific types so we use "Any" here.
 def SPIRV_AnyPtr : DialectType<SPIRV_Dialect, SPIRV_IsPtrType,
@@ -4187,14 +4218,14 @@ def SPIRV_AnyStruct : DialectType<SPIRV_Dialect, SPIRV_IsStructType,
 def SPIRV_AnySampledImage : DialectType<SPIRV_Dialect, SPIRV_IsSampledImageType,
                                 "any SPIR-V sampled image type">;
 
-def SPIRV_Numerical : AnyTypeOf<[SPIRV_Integer, SPIRV_Float]>;
+def SPIRV_Numerical : AnyTypeOf<[SPIRV_Integer, SPIRV_AnyFloat]>;
 def SPIRV_Scalar : AnyTypeOf<[SPIRV_Numerical, SPIRV_Bool]>;
 def SPIRV_Aggregate : AnyTypeOf<[SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct]>;
 def SPIRV_Composite :
     AnyTypeOf<[SPIRV_Vector, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct,
                SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix]>;
 def SPIRV_Type : AnyTypeOf<[
-    SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_Float, SPIRV_Vector,
+    SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_AnyFloat, SPIRV_Vector,
     SPIRV_AnyPtr, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct,
     SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix, SPIRV_AnySampledImage,
     SPIRV_AnyImage
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td
index b05ee0251df5..a5c8aa8fb450 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td
@@ -86,7 +86,7 @@ def SPIRV_BitcastOp : SPIRV_Op<"Bitcast", [Pure]> {
 
 // -----
 
-def SPIRV_ConvertFToSOp : SPIRV_CastOp<"ConvertFToS", SPIRV_Integer, SPIRV_Float, []> {
+def SPIRV_ConvertFToSOp : SPIRV_CastOp<"ConvertFToS", SPIRV_Integer, SPIRV_AnyFloat, []> {
   let summary = [{
     Convert value numerically from floating point to signed integer, with
     round toward 0.0.
@@ -111,7 +111,7 @@ def SPIRV_ConvertFToSOp : SPIRV_CastOp<"ConvertFToS", SPIRV_Integer, SPIRV_Float
 
 // -----
 
-def SPIRV_ConvertFToUOp : SPIRV_CastOp<"ConvertFToU", SPIRV_Integer, SPIRV_Float, []> {
+def SPIRV_ConvertFToUOp : SPIRV_CastOp<"ConvertFToU", SPIRV_Integer, SPIRV_AnyFloat, []> {
   let summary = [{
     Convert value numerically from floating point to unsigned integer, with
     round toward 0.0.
@@ -138,7 +138,7 @@ def SPIRV_ConvertFToUOp : SPIRV_CastOp<"ConvertFToU", SPIRV_Integer, SPIRV_Float
 // -----
 
 def SPIRV_ConvertSToFOp : SPIRV_CastOp<"ConvertSToF",
-                                   SPIRV_Float,
+                                   SPIRV_AnyFloat,
                                    SPIRV_Integer,
                                    [SignedOp]> {
   let summary = [{
@@ -165,7 +165,7 @@ def SPIRV_ConvertSToFOp : SPIRV_CastOp<"ConvertSToF",
 // -----
 
 def SPIRV_ConvertUToFOp : SPIRV_CastOp<"ConvertUToF",
-                                   SPIRV_Float,
+                                   SPIRV_AnyFloat,
                                    SPIRV_Integer,
                                    [UnsignedOp]> {
   let summary = [{
@@ -192,8 +192,8 @@ def SPIRV_ConvertUToFOp : SPIRV_CastOp<"ConvertUToF",
 // -----
 
 def SPIRV_FConvertOp : SPIRV_CastOp<"FConvert",
-                                SPIRV_Float,
-                                SPIRV_Float,
+                                SPIRV_AnyFloat,
+                                SPIRV_AnyFloat,
                                 [UsableInSpecConstantOp]> {
   let summary = [{
     Convert value numerically from one floating-point width to another
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
index 0cf5f0823be6..a21acef1c4b4 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
@@ -175,10 +175,7 @@ static Type parseAndVerifyType(SPIRVDialect const &dialect,
 
   // Check other allowed types
   if (auto t = llvm::dyn_cast<FloatType>(type)) {
-    if (type.isBF16()) {
-      parser.emitError(typeLoc, "cannot use 'bf16' to compose SPIR-V types");
-      return Type();
-    }
+    // TODO: All float types are allowed for now, but this should be fixed.
   } else if (auto t = llvm::dyn_cast<IntegerType>(type)) {
     if (!ScalarType::isValid(t)) {
       parser.emitError(typeLoc,
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
index 1aff43c30133..93e0c9b33c54 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
@@ -526,7 +526,7 @@ bool ScalarType::classof(Type type) {
 }
 
 bool ScalarType::isValid(FloatType type) {
-  return llvm::is_contained({16u, 32u, 64u}, type.getWidth()) && !type.isBF16();
+  return llvm::is_contained({16u, 32u, 64u}, type.getWidth());
 }
 
 bool ScalarType::isValid(IntegerType type) {
@@ -535,6 +535,11 @@ bool ScalarType::isValid(IntegerType type) {
 
 void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
                                std::optional<StorageClass> storage) {
+  if (isa<BFloat16Type>(*this)) {
+    static const Extension ext = Extension::SPV_KHR_bfloat16;
+    extensions.push_back(ext);
+  }
+
   // 8- or 16-bit integer/floating-point numbers will require extra extensions
   // to appear in interface storage classes. See SPV_KHR_16bit_storage and
   // SPV_KHR_8bit_storage for more details.
@@ -640,7 +645,16 @@ void ScalarType::getCapabilities(
   } else {
     assert(llvm::isa<FloatType>(*this));
     switch (bitwidth) {
-      WIDTH_CASE(Float, 16);
+    case 16: {
+      if (isa<BFloat16Type>(*this)) {
+        static const Capability cap = Capability::BFloat16TypeKHR;
+        capabilities.push_back(cap);
+      } else {
+        static const Capability cap = Capability::Float16;
+        capabilities.push_back(cap);
+      }
+      break;
+    }
       WIDTH_CASE(Float, 64);
     case 32:
       break;
diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
index c43d584d7b91..b9d9a9015eb6 100644
--- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
+++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
@@ -867,11 +867,15 @@ LogicalResult spirv::Deserializer::processType(spirv::Opcode opcode,
     typeMap[operands[0]] = IntegerType::get(context, operands[1], sign);
   } break;
   case spirv::Opcode::OpTypeFloat: {
-    if (operands.size() != 2)
-      return emitError(unknownLoc, "OpTypeFloat must have bitwidth parameter");
+    if (operands.size() != 2 && operands.size() != 3)
+      return emitError(unknownLoc,
+                       "OpTypeFloat expects either 2 operands (type, bitwidth) "
+                       "or 3 operands (type, bitwidth, encoding), but got ")
+             << operands.size();
+    uint32_t bitWidth = operands[1];
 
     Type floatTy;
-    switch (operands[1]) {
+    switch (bitWidth) {
     case 16:
       floatTy = opBuilder.getF16Type();
       break;
@@ -883,8 +887,20 @@ LogicalResult spirv::Deserializer::processType(spirv::Opcode opcode,
       break;
     default:
       return emitError(unknownLoc, "unsupported OpTypeFloat bitwidth: ")
-             << operands[1];
+             << bitWidth;
     }
+
+    if (operands.size() == 3) {
+      if (spirv::FPEncoding(operands[2]) != spirv::FPEncoding::BFloat16KHR)
+        return emitError(unknownLoc, "unsupported OpTypeFloat FP encoding: ")
+               << operands[2];
+      if (bitWidth != 16)
+        return emitError(unknownLoc,
+                         "invalid OpTypeFloat bitwidth for bfloat16 encoding: ")
+               << bitWidth << " (expected 16)";
+      floatTy = opBuilder.getBF16Type();
+    }
+
     typeMap[operands[0]] = floatTy;
   } break;
   case spirv::Opcode::OpTypeVector: {
@@ -1399,6 +1415,9 @@ LogicalResult spirv::Deserializer::processConstant(ArrayRef<uint32_t> operands,
     } else if (floatType.isF16()) {
       APInt data(16, operands[2]);
       value = APFloat(APFloat::IEEEhalf(), data);
+    } else if (floatType.isBF16()) {
+      APInt data(16, operands[2]);
+      value = APFloat(APFloat::BFloat(), data);
     }
 
     auto attr = opBuilder.getFloatAttr(floatType, value);
diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
index 647535809554..d258bfd85296 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
@@ -523,6 +523,9 @@ LogicalResult Serializer::prepareBasicType(
   if (auto floatType = dyn_cast<FloatType>(type)) {
     typeEnum = spirv::Opcode::OpTypeFloat;
     operands.push_back(floatType.getWidth());
+    if (floatType.isBF16()) {
+      operands.push_back(static_cast<uint32_t>(spirv::FPEncoding::BFloat16KHR));
+    }
     return success();
   }
 
@@ -1022,21 +1025,23 @@ uint32_t Serializer::prepareConstantFp(Location loc, FloatAttr floatAttr,
 
   auto resultID = getNextID();
   APFloat value = floatAttr.getValue();
+  const llvm::fltSemantics *semantics = &value.getSemantics();
 
   auto opcode =
       isSpec ? spirv::Opcode::OpSpecConstant : spirv::Opcode::OpConstant;
 
-  if (&value.getSemantics() == &APFloat::IEEEsingle()) {
+  if (semantics == &APFloat::IEEEsingle()) {
     uint32_t word = llvm::bit_cast<uint32_t>(value.convertToFloat());
     encodeInstructionInto(typesGlobalValues, opcode, {typeID, resultID, word});
-  } else if (&value.getSemantics() == &APFloat::IEEEdouble()) {
+  } else if (semantics == &APFloat::IEEEdouble()) {
     struct DoubleWord {
       uint32_t word1;
       uint32_t word2;
     } words = llvm::bit_cast<DoubleWord>(value.convertToDouble());
     encodeInstructionInto(typesGlobalValues, opcode,
                           {typeID, resultID, words.word1, words.word2});
-  } else if (&value.getSemantics() == &APFloat::IEEEhalf()) {
+  } else if (semantics == &APFloat::IEEEhalf() ||
+             semantics == &APFloat::BFloat()) {
     uint32_t word =
         static_cast<uint32_t>(value.bitcastToAPInt().getZExtValue());
     encodeInstructionInto(typesGlobalValues, opcode, {typeID, resultID, word});
diff --git a/mlir/test/Conversion/FuncToSPIRV/types-to-spirv.mlir b/mlir/test/Conversion/FuncToSPIRV/types-to-spirv.mlir
index 82d750755ffe..1737f4a906bf 100644
--- a/mlir/test/Conversion/FuncToSPIRV/types-to-spirv.mlir
+++ b/mlir/test/Conversion/FuncToSPIRV/types-to-spirv.mlir
@@ -173,6 +173,12 @@ func.func @float16(%arg0: f16) { return }
 // NOEMU-SAME: f64
 func.func @float64(%arg0: f64) { return }
 
+// CHECK-LABEL: spirv.func @bfloat16
+// CHECK-SAME: f32
+// NOEMU-LABEL: func.func @bfloat16
+// NOEMU-SAME: bf16
+func.func @bfloat16(%arg0: bf16) { return }
+
 // f80 is not supported by SPIR-V.
 // CHECK-LABEL: func.func @float80
 // CHECK-SAME: f80
@@ -206,18 +212,6 @@ func.func @float64(%arg0: f64) { return }
 
 // -----
 
-// Check that bf16 is not supported.
-module attributes {
-  spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [], []>, #spirv.resource_limits<>>
-} {
-
-// CHECK-NOT: spirv.func @bf16_type
-func.func @bf16_type(%arg0: bf16) { return }
-
-} // end module
-
-// -----
-
 //===----------------------------------------------------------------------===//
 // Complex types
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir b/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir
index 2d0c86e08de5..d58c27598f2b 100644
--- a/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir
@@ -12,6 +12,14 @@ func.func @fadd_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fadd_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FAdd %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FDiv
 //===----------------------------------------------------------------------===//
@@ -24,6 +32,14 @@ func.func @fdiv_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fdiv_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FDiv %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FMod
 //===----------------------------------------------------------------------===//
@@ -36,6 +52,14 @@ func.func @fmod_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fmod_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FMod %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FMul
 //===----------------------------------------------------------------------===//
@@ -70,6 +94,14 @@ func.func @fmul_bf16(%arg: bf16) -> bf16 {
 
 // -----
 
+func.func @fmul_bf16_vector(%arg: vector<4xbf16>) -> vector<4xbf16> {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FMul %arg, %arg : vector<4xbf16>
+  return %0 : vector<4xbf16>
+}
+
+// -----
+
 func.func @fmul_tensor(%arg: tensor<4xf32>) -> tensor<4xf32> {
   // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
   %0 = spirv.FMul %arg, %arg : tensor<4xf32>
@@ -90,6 +122,14 @@ func.func @fnegate_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fnegate_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FNegate %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FRem
 //===----------------------------------------------------------------------===//
@@ -102,6 +142,14 @@ func.func @frem_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @frem_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FRem %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FSub
 //===----------------------------------------------------------------------===//
@@ -114,6 +162,14 @@ func.func @fsub_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fsub_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FSub %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.IAdd
 //===----------------------------------------------------------------------===//
@@ -489,3 +545,11 @@ func.func @vector_times_scalar(%vector: vector<4xf32>, %scalar: f32) -> vector<3
   %0 = spirv.VectorTimesScalar %vector, %scalar : (vector<4xf32>, f32) -> vector<3xf32>
   return %0 : vector<3xf32>
 }
+
+// -----
+
+func.func @vector_bf16_times_scalar_bf16(%vector: vector<4xbf16>, %scalar: bf16) -> vector<4xbf16> {
+  // expected-error @+1 {{op operand #0 must be vector of 16/32/64-bit float values of length 2/3/4}}
+  %0 = spirv.VectorTimesScalar %vector, %scalar : (vector<4xbf16>, bf16) -> vector<4xbf16>
+  return %0 : vector<4xbf16>
+}
diff --git a/mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir b/mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir
index cc0abd3a42dc..661497d5fff3 100644
--- a/mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir
@@ -272,3 +272,11 @@ func.func @atomic_fadd(%ptr : !spirv.ptr<f32, StorageBuffer>, %value : f32) -> f
   %0 = spirv.EXT.AtomicFAdd <Device> <Acquire|Release> %ptr, %value : !spirv.ptr<f32, StorageBuffer>
   return %0 : f32
 }
+
+// -----
+
+func.func @atomic_bf16_fadd(%ptr : !spirv.ptr<bf16, StorageBuffer>, %value : bf16) -> bf16 {
+  // expected-error @+1 {{op operand #1 must be 16/32/64-bit float, but got 'bf16'}}
+  %0 = spirv.EXT.AtomicFAdd <Device> <None> %ptr, %value : !spirv.ptr<bf16, StorageBuffer>
+  return %0 : bf16
+}
diff --git a/mlir/test/Dialect/SPIRV/IR/cast-ops.mlir b/mlir/test/Dialect/SPIRV/IR/cast-ops.mlir
index 34d0109e6bb4..4480a1f3720f 100644
--- a/mlir/test/Dialect/SPIRV/IR/cast-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/cast-ops.mlir
@@ -110,6 +110,14 @@ func.func @convert_f_to_s_vector(%arg0 : vector<3xf32>) -> vector<3xi32> {
 
 // -----
 
+func.func @convert_bf16_to_s32_scalar(%arg0 : bf16) -> i32 {
+  // CHECK: {{%.*}} = spirv.ConvertFToS {{%.*}} : bf16 to i32
+  %0 = spirv.ConvertFToS %arg0 : bf16 to i32
+  spirv.ReturnValue %0 : i32
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.ConvertFToU
 //===----------------------------------------------------------------------===//
@@ -146,6 +154,14 @@ func.func @convert_f_to_u.coopmatrix(%arg0 : !spirv.coopmatrix<8x16xf32, Subgrou
 
 // -----
 
+func.func @convert_bf16_to_u32_scalar(%arg0 : bf16) -> i32 {
+  // CHECK: {{%.*}} = spirv.ConvertFToU {{%.*}} : bf16 to i32
+  %0 = spirv.ConvertFToU %arg0 : bf16 to i32
+  spirv.ReturnValue %0 : i32
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.ConvertSToF
 //===----------------------------------------------------------------------===//
@@ -174,6 +190,14 @@ func.func @convert_s_to_f_vector(%arg0 : vector<3xi32>) -> vector<3xf32> {
 
 // -----
 
+func.func @convert_s32_to_bf16_scalar(%arg0 : i32) -> bf16 {
+  // CHECK: {{%.*}} = spirv.ConvertSToF {{%.*}} : i32 to bf16
+  %0 = spirv.ConvertSToF %arg0 : i32 to bf16
+  spirv.ReturnValue %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.ConvertUToF
 //===----------------------------------------------------------------------===//
@@ -202,6 +226,14 @@ func.func @convert_u_to_f_vector(%arg0 : vector<3xi32>) -> vector<3xf32> {
 
 // -----
 
+func.func @convert_u32_to_bf16_scalar(%arg0 : i32) -> bf16 {
+  // CHECK: {{%.*}} = spirv.ConvertUToF {{%.*}} : i32 to bf16
+  %0 = spirv.ConvertUToF %arg0 : i32 to bf16
+  spirv.ReturnValue %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FConvert
 //===----------------------------------------------------------------------===//
@@ -238,6 +270,30 @@ func.func @f_convert_vector(%arg0 : f32) -> f32 {
 
 // -----
 
+func.func @f_convert_bf16_to_f32_scalar(%arg0 : bf16) -> f32 {
+  // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : bf16 to f32
+  %0 = spirv.FConvert %arg0 : bf16 to f32
+  spirv.ReturnValue %0 : f32
+}
+
+// -----
+
+func.func @f_convert_f32_to_bf16_vector(%arg0 : vector<3xf32>) -> vector<3xbf16> {
+  // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : vector<3xf32> to vector<3xbf16>
+  %0 = spirv.FConvert %arg0 : vector<3xf32> to vector<3xbf16>
+  spirv.ReturnValue %0 : vector<3xbf16>
+}
+
+// -----
+
+func.func @f_convert_f32_to_bf16_coop_matrix(%arg0 : !spirv.coopmatrix<8x16xf32, Subgroup, MatrixA>) -> !spirv.coopmatrix<8x16xbf16, Subgroup, MatrixA> {
+  // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : !spirv.coopmatrix<8x16xf32, Subgroup, MatrixA> to !spirv.coopmatrix<8x16xbf16, Subgroup, MatrixA>
+  %0 = spirv.FConvert %arg0 : !spirv.coopmatrix<8x16xf32, Subgroup, MatrixA> to !spirv.coopmatrix<8x16xbf16, Subgroup, MatrixA>
+  spirv.ReturnValue %0 : !spirv.coopmatrix<8x16xbf16, Subgroup, MatrixA>
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.SConvert
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/composite-ops.mlir b/mlir/test/Dialect/SPIRV/IR/composite-ops.mlir
index 3fc8dfb2767d..e71b545de11d 100644
--- a/mlir/test/Dialect/SPIRV/IR/composite-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/composite-ops.mlir
@@ -11,6 +11,13 @@ func.func @composite_construct_vector(%arg0: f32, %arg1: f32, %arg2 : f32) -> ve
   return %0: vector<3xf32>
 }
 
+// CHECK-LABEL: func @composite_construct_bf16_vector
+func.func @composite_construct_bf16_vector(%arg0: bf16, %arg1: bf16, %arg2 : bf16) -> vector<3xbf16> {
+  // CHECK: spirv.CompositeConstruct {{%.*}}, {{%.*}}, {{%.*}} : (bf16, bf16, bf16) -> vector<3xbf16>
+  %0 = spirv.CompositeConstruct %arg0, %arg1, %arg2 : (bf16, bf16, bf16) -> vector<3xbf16>
+  return %0: vector<3xbf16>
+}
+
 // CHECK-LABEL: func @composite_construct_struct
 func.func @composite_construct_struct(%arg0: vector<3xf32>, %arg1: !spirv.array<4xf32>, %arg2 : !spirv.struct<(f32)>) -> !spirv.struct<(vector<3xf32>, !spirv.array<4xf32>, !spirv.struct<(f32)>)> {
   // CHECK: spirv.CompositeConstruct
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index 2b75767feaf9..642346cc40b0 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -50,6 +50,14 @@ func.func @exp(%arg0 : i32) -> () {
 
 // -----
 
+func.func @exp_bf16(%arg0 : bf16) -> () {
+  // expected-error @+1 {{op operand #0 must be 16/32-bit float or vector of 16/32-bit float values of length 2/3/4}}
+  %2 = spirv.GL.Exp %arg0 : bf16
+  return
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.GL.{F|S|U}{Max|Min}
 //===----------------------------------------------------------------------===//
@@ -92,6 +100,15 @@ func.func @iminmax(%arg0: i32, %arg1: i32) {
 
 // -----
 
+func.func @fmaxminbf16vec(%arg0 : vector<3xbf16>, %arg1 : vector<3xbf16>) {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %1 = spirv.GL.FMax %arg0, %arg1 : vector<3xbf16>
+  %2 = spirv.GL.FMin %arg0, %arg1 : vector<3xbf16>
+  return
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.GL.InverseSqrt
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
index 5c24f0e6a7d3..d6c34645f574 100644
--- a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
@@ -201,6 +201,14 @@ func.func @select_op_float(%arg0: i1) -> () {
   return
 }
 
+func.func @select_op_bfloat16(%arg0: i1) -> () {
+  %0 = spirv.Constant 2.0 : bf16
+  %1 = spirv.Constant 3.0 : bf16
+  // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, bf16
+  %2 = spirv.Select %arg0, %0, %1 : i1, bf16
+  return
+}
+
 func.func @select_op_ptr(%arg0: i1) -> () {
   %0 = spirv.Variable : !spirv.ptr<f32, Function>
   %1 = spirv.Variable : !spirv.ptr<f32, Function>
diff --git a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir
index 5f56de6ad1fa..7ab94f17360d 100644
--- a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir
@@ -184,6 +184,14 @@ func.func @group_non_uniform_fmul_clustered_reduce(%val: vector<2xf32>) -> vecto
 
 // -----
 
+func.func @group_non_uniform_bf16_fmul_reduce(%val: bf16) -> bf16 {
+  // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'bf16'}}
+  %0 = spirv.GroupNonUniformFMul <Workgroup> <Reduce> %val : bf16 -> bf16
+  return %0: bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.GroupNonUniformFMax
 //===----------------------------------------------------------------------===//
@@ -197,6 +205,14 @@ func.func @group_non_uniform_fmax_reduce(%val: f32) -> f32 {
 
 // -----
 
+func.func @group_non_uniform_bf16_fmax_reduce(%val: bf16) -> bf16 {
+  // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'bf16'}}
+  %0 = spirv.GroupNonUniformFMax <Workgroup> <Reduce> %val : bf16 -> bf16
+  return %0: bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.GroupNonUniformFMin
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/types.mlir b/mlir/test/Dialect/SPIRV/IR/types.mlir
index b63a08d96e6a..c23894c62826 100644
--- a/mlir/test/Dialect/SPIRV/IR/types.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/types.mlir
@@ -15,6 +15,9 @@ func.func private @vector_array_type(!spirv.array< 32 x vector<4xf32> >) -> ()
 // CHECK: func private @array_type_stride(!spirv.array<4 x !spirv.array<4 x f32, stride=4>, stride=128>)
 func.func private @array_type_stride(!spirv.array< 4 x !spirv.array<4 x f32, stride=4>, stride = 128>) -> ()
 
+// CHECK: func private @vector_array_type_bf16(!spirv.array<32 x vector<4xbf16>>)
+func.func private @vector_array_type_bf16(!spirv.array<32 x vector<4xbf16> >) -> ()
+
 // -----
 
 // expected-error @+1 {{expected '<'}}
@@ -57,11 +60,6 @@ func.func private @tensor_type(!spirv.array<4xtensor<4xf32>>) -> ()
 
 // -----
 
-// expected-error @+1 {{cannot use 'bf16' to compose SPIR-V types}}
-func.func private @bf16_type(!spirv.array<4xbf16>) -> ()
-
-// -----
-
 // expected-error @+1 {{only 1/8/16/32/64-bit integer type allowed but found 'i256'}}
 func.func private @i256_type(!spirv.array<4xi256>) -> ()
 
diff --git a/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir b/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir
index ff5ac7cea8fc..2b237665ffc4 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir
@@ -217,3 +217,17 @@ spirv.module Logical GLSL450 attributes {
   spirv.GlobalVariable @data : !spirv.ptr<!spirv.struct<(i8 [0], f16 [2], i64 [4])>, Uniform>
   spirv.GlobalVariable @img  : !spirv.ptr<!spirv.image<f32, Buffer, NoDepth, NonArrayed, SingleSampled, SamplerUnknown, Rg32f>, UniformConstant>
 }
+
+// Using bfloat16 requires BFloat16TypeKHR capability and SPV_KHR_bfloat16 extension.
+// CHECK: requires #spirv.vce<v1.0, [StorageBuffer16BitAccess, Shader, BFloat16TypeKHR], [SPV_KHR_bfloat16, SPV_KHR_16bit_storage, SPV_KHR_storage_buffer_storage_class]>
+spirv.module Logical GLSL450 attributes {
+  spirv.target_env = #spirv.target_env<
+    #spirv.vce<v1.0, [Shader, StorageBuffer16BitAccess, BFloat16TypeKHR], [SPV_KHR_bfloat16, SPV_KHR_16bit_storage, SPV_KHR_storage_buffer_storage_class]>,
+    #spirv.resource_limits<>
+  >
+} {
+  spirv.func @load_bf16(%ptr : !spirv.ptr<bf16, StorageBuffer>) -> bf16 "None" {
+    %val = spirv.Load "StorageBuffer" %ptr : bf16
+    spirv.ReturnValue %val : bf16
+  }
+}
diff --git a/mlir/test/Target/SPIRV/cast-ops.mlir b/mlir/test/Target/SPIRV/cast-ops.mlir
index ede0bf30511e..04a468b39b64 100644
--- a/mlir/test/Target/SPIRV/cast-ops.mlir
+++ b/mlir/test/Target/SPIRV/cast-ops.mlir
@@ -25,6 +25,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %0 = spirv.ConvertFToS %arg0 : f64 to i32
     spirv.ReturnValue %0 : i32
   }
+  spirv.func @convert_bf16_to_s32(%arg0 : bf16) -> i32 "None" {
+    // CHECK: {{%.*}} = spirv.ConvertFToS {{%.*}} : bf16 to i32
+    %0 = spirv.ConvertFToS %arg0 : bf16 to i32
+    spirv.ReturnValue %0 : i32
+  }
   spirv.func @convert_f_to_u(%arg0 : f32) -> i32 "None" {
     // CHECK: {{%.*}} = spirv.ConvertFToU {{%.*}} : f32 to i32
     %0 = spirv.ConvertFToU %arg0 : f32 to i32
@@ -35,6 +40,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %0 = spirv.ConvertFToU %arg0 : f64 to i32
     spirv.ReturnValue %0 : i32
   }
+  spirv.func @convert_bf16_to_u32(%arg0 : bf16) -> i32 "None" {
+    // CHECK: {{%.*}} = spirv.ConvertFToU {{%.*}} : bf16 to i32
+    %0 = spirv.ConvertFToU %arg0 : bf16 to i32
+    spirv.ReturnValue %0 : i32
+  }
   spirv.func @convert_s_to_f(%arg0 : i32) -> f32 "None" {
     // CHECK: {{%.*}} = spirv.ConvertSToF {{%.*}} : i32 to f32
     %0 = spirv.ConvertSToF %arg0 : i32 to f32
@@ -45,6 +55,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %0 = spirv.ConvertSToF %arg0 : i64 to f32
     spirv.ReturnValue %0 : f32
   }
+  spirv.func @convert_s64_to_bf16(%arg0 : i64) -> bf16 "None" {
+    // CHECK: {{%.*}} = spirv.ConvertSToF {{%.*}} : i64 to bf16
+    %0 = spirv.ConvertSToF %arg0 : i64 to bf16
+    spirv.ReturnValue %0 : bf16
+  }
   spirv.func @convert_u_to_f(%arg0 : i32) -> f32 "None" {
     // CHECK: {{%.*}} = spirv.ConvertUToF {{%.*}} : i32 to f32
     %0 = spirv.ConvertUToF %arg0 : i32 to f32
@@ -55,11 +70,26 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %0 = spirv.ConvertUToF %arg0 : i64 to f32
     spirv.ReturnValue %0 : f32
   }
-  spirv.func @f_convert(%arg0 : f32) -> f64 "None" {
+  spirv.func @convert_u64_to_bf16(%arg0 : i64) -> bf16 "None" {
+    // CHECK: {{%.*}} = spirv.ConvertUToF {{%.*}} : i64 to bf16
+    %0 = spirv.ConvertUToF %arg0 : i64 to bf16
+    spirv.ReturnValue %0 : bf16
+  }
+  spirv.func @convert_f32_to_f64(%arg0 : f32) -> f64 "None" {
     // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : f32 to f64
     %0 = spirv.FConvert %arg0 : f32 to f64
     spirv.ReturnValue %0 : f64
   }
+  spirv.func @convert_f32_to_bf16(%arg0 : f32) -> bf16 "None" {
+    // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : f32 to bf16
+    %0 = spirv.FConvert %arg0 : f32 to bf16
+    spirv.ReturnValue %0 : bf16
+  }
+  spirv.func @convert_bf16_to_f32(%arg0 : bf16) -> f32 "None" {
+    // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : bf16 to f32
+    %0 = spirv.FConvert %arg0 : bf16 to f32
+    spirv.ReturnValue %0 : f32
+  }
   spirv.func @s_convert(%arg0 : i32) -> i64 "None" {
     // CHECK: {{%.*}} = spirv.SConvert {{%.*}} : i32 to i64
     %0 = spirv.SConvert %arg0 : i32 to i64
diff --git a/mlir/test/Target/SPIRV/logical-ops.mlir b/mlir/test/Target/SPIRV/logical-ops.mlir
index 16846ac84e38..b2008719b021 100644
--- a/mlir/test/Target/SPIRV/logical-ops.mlir
+++ b/mlir/test/Target/SPIRV/logical-ops.mlir
@@ -108,3 +108,26 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     spirv.Return
   }
 }
+
+// -----
+
+// Test select works with bf16 scalar and vectors.
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+  spirv.SpecConstant @condition_scalar = true
+  spirv.func @select_bf16() -> () "None" {
+    %0 = spirv.Constant 4.0 : bf16
+    %1 = spirv.Constant 5.0 : bf16
+    %2 = spirv.mlir.referenceof @condition_scalar : i1
+    // CHECK: spirv.Select {{.*}}, {{.*}}, {{.*}} : i1, bf16
+    %3 = spirv.Select %2, %0, %1 : i1, bf16
+    %4 = spirv.Constant dense<[2.0, 3.0, 4.0, 5.0]> : vector<4xbf16>
+    %5 = spirv.Constant dense<[6.0, 7.0, 8.0, 9.0]> : vector<4xbf16>
+    // CHECK: spirv.Select {{.*}}, {{.*}}, {{.*}} : i1, vector<4xbf16>
+    %6 = spirv.Select %2, %4, %5 : i1, vector<4xbf16>
+    %7 = spirv.Constant dense<[true, true, true, true]> : vector<4xi1>
+    // CHECK: spirv.Select {{.*}}, {{.*}}, {{.*}} : vector<4xi1>, vector<4xbf16>
+    %8 = spirv.Select %7, %4, %5 : vector<4xi1>, vector<4xbf16>
+    spirv.Return
+  }
+}

From e6a3579653196af337f191ed2a3acbbf0e6d01bb Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Fri, 13 Jun 2025 15:22:47 +0100
Subject: [PATCH 0283/1322] [Offload] Replace device info queue with a tree
 (#144050)

Previously, device info was returned as a queue with each element having
a "Level" field indicating its nesting level. This replaces this queue
with a more traditional tree-like structure.

This should not result in a change to the output of
`llvm-offload-device-info`.
---
 offload/liboffload/src/OffloadImpl.cpp        |  15 +-
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    |  45 +++---
 .../common/include/PluginInterface.h          | 133 ++++++++++--------
 .../common/src/PluginInterface.cpp            |   6 +-
 offload/plugins-nextgen/cuda/src/rtl.cpp      |  21 +--
 offload/plugins-nextgen/host/src/rtl.cpp      |   5 +-
 6 files changed, 122 insertions(+), 103 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 0a784cddeaec..770c212d804d 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -229,26 +229,19 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
 
   // Find the info if it exists under any of the given names
   auto GetInfo = [&](std::vector<std::string> Names) {
-    InfoQueueTy DevInfo;
     if (Device == HostDevice())
       return std::string("Host");
 
     if (!Device->Device)
       return std::string("");
 
-    if (auto Err = Device->Device->obtainInfoImpl(DevInfo))
+    auto Info = Device->Device->obtainInfoImpl();
+    if (auto Err = Info.takeError())
       return std::string("");
 
     for (auto Name : Names) {
-      auto InfoKeyMatches = [&](const InfoQueueTy::InfoQueueEntryTy &Info) {
-        return Info.Key == Name;
-      };
-      auto Item = std::find_if(DevInfo.getQueue().begin(),
-                               DevInfo.getQueue().end(), InfoKeyMatches);
-
-      if (Item != std::end(DevInfo.getQueue())) {
-        return Item->Value;
-      }
+      if (auto Entry = Info->get(Name))
+        return (*Entry)->Value;
     }
 
     return std::string("");
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e4c32713e2c1..73e1e66928fa 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2551,7 +2551,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// Print information about the device.
-  Error obtainInfoImpl(InfoQueueTy &Info) override {
+  Expected<InfoTreeNode> obtainInfoImpl() override {
     char TmpChar[1000];
     const char *TmpCharPtr = "Unknown";
     uint16_t Major, Minor;
@@ -2562,6 +2562,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     uint16_t WorkgrpMaxDim[3];
     hsa_dim3_t GridMaxDim;
     hsa_status_t Status, Status2;
+    InfoTreeNode Info;
 
     Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
     Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
@@ -2617,11 +2618,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // runtime.
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
     if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Cache");
+      auto &Cache = *Info.add("Cache");
 
       for (int I = 0; I < 4; I++)
         if (CacheSize[I])
-          Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
+          Cache.add("L" + std::to_string(I), CacheSize[I]);
     }
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
@@ -2654,10 +2655,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
     if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Workgroup Max Size per Dimension");
-      Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
-      Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
-      Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
+      auto &MaxSize = *Info.add("Workgroup Max Size per Dimension");
+      MaxSize.add("x", WorkgrpMaxDim[0]);
+      MaxSize.add("y", WorkgrpMaxDim[1]);
+      MaxSize.add("z", WorkgrpMaxDim[2]);
     }
 
     Status = getDeviceAttrRaw(
@@ -2673,17 +2674,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
     if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Grid Max Size per Dimension");
-      Info.add<InfoLevel2>("x", GridMaxDim.x);
-      Info.add<InfoLevel2>("y", GridMaxDim.y);
-      Info.add<InfoLevel2>("z", GridMaxDim.z);
+      auto &MaxDim = *Info.add("Grid Max Size per Dimension");
+      MaxDim.add("x", GridMaxDim.x);
+      MaxDim.add("y", GridMaxDim.y);
+      MaxDim.add("z", GridMaxDim.z);
     }
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Max fbarriers/Workgrp", TmpUInt);
 
-    Info.add("Memory Pools");
+    auto &RootPool = *Info.add("Memory Pools");
     for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
       std::string TmpStr, TmpStr2;
 
@@ -2698,7 +2699,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       else
         TmpStr = "Unknown";
 
-      Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
+      auto &PoolNode = *RootPool.add(std::string("Pool ") + TmpStr);
 
       if (Pool->isGlobal()) {
         if (Pool->isFineGrained())
@@ -2708,39 +2709,39 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
         if (Pool->supportsKernelArgs())
           TmpStr2 += "Kernarg ";
 
-        Info.add<InfoLevel3>("Flags", TmpStr2);
+        PoolNode.add("Flags", TmpStr2);
       }
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Size", TmpSt, "bytes");
+        PoolNode.add("Size", TmpSt, "bytes");
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
                                 TmpBool);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Allocatable", TmpBool);
+        PoolNode.add("Allocatable", TmpBool);
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
                                 TmpSt);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
+        PoolNode.add("Runtime Alloc Granule", TmpSt, "bytes");
 
       Status = Pool->getAttrRaw(
           HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
+        PoolNode.add("Runtime Alloc Alignment", TmpSt, "bytes");
 
       Status =
           Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Accessible by all", TmpBool);
+        PoolNode.add("Accessible by all", TmpBool);
     }
 
-    Info.add("ISAs");
+    auto &ISAs = *Info.add("ISAs");
     auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
       Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel2>("Name", TmpChar);
+        ISAs.add("Name", TmpChar);
 
       return Status;
     });
@@ -2749,7 +2750,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (Err)
       consumeError(std::move(Err));
 
-    return Plugin::success();
+    return Info;
   }
 
   /// Returns true if auto zero-copy the best configuration for the current
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d2437908a0a6..f5d995532b7a 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -112,77 +112,100 @@ private:
   __tgt_async_info *AsyncInfoPtr;
 };
 
-/// The information level represents the level of a key-value property in the
-/// info tree print (i.e. indentation). The first level should be the default.
-enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };
+/// Tree node for device information
+///
+/// This information is either printed or used by liboffload to extract certain
+/// device queries. Each property has an optional key, an optional value
+/// and optional children. The children can be used to store additional
+/// information (such as x, y and z components of ranges).
+struct InfoTreeNode {
+  static constexpr uint64_t IndentSize = 4;
 
-/// Class for storing device information and later be printed. An object of this
-/// type acts as a queue of key-value properties. Each property has a key, a
-/// a value, and an optional unit for the value. For printing purposes, the
-/// information can be classified into several levels. These levels are useful
-/// for defining sections and subsections. Thus, each key-value property also
-/// has an additional field indicating to which level belongs to. Notice that
-/// we use the level to determine the indentation of the key-value property at
-/// printing time. See the enum InfoLevelKind for the list of accepted levels.
-class InfoQueueTy {
-public:
-  struct InfoQueueEntryTy {
-    std::string Key;
-    std::string Value;
-    std::string Units;
-    uint64_t Level;
-  };
+  std::string Key;
+  std::string Value;
+  std::string Units;
+  // Need to specify a default value number of elements here as `InfoTreeNode`'s
+  // size is unknown. This is a vector (rather than a Key->Value map) since:
+  // * The keys need to be owned and thus `std::string`s
+  // * The order of keys is important
+  // * The same key can appear multiple times
+  std::unique_ptr<llvm::SmallVector<InfoTreeNode, 8>> Children;
 
-private:
-  std::deque<InfoQueueEntryTy> Queue;
+  InfoTreeNode() : InfoTreeNode("", "", "") {}
+  InfoTreeNode(std::string Key, std::string Value, std::string Units)
+      : Key(Key), Value(Value), Units(Units) {}
 
-public:
-  /// Add a new info entry to the queue. The entry requires at least a key
-  /// string in \p Key. The value in \p Value is optional and can be any type
-  /// that is representable as a string. The units in \p Units is optional and
-  /// must be a string. The info level is a template parameter that defaults to
-  /// the first level (top level).
-  template <InfoLevelKind L = InfoLevel1, typename T = std::string>
-  void add(const std::string &Key, T Value = T(),
-           const std::string &Units = std::string()) {
+  /// Add a new info entry as a child of this node. The entry requires at least
+  /// a key string in \p Key. The value in \p Value is optional and can be any
+  /// type that is representable as a string. The units in \p Units is optional
+  /// and must be a string.
+  template <typename T = std::string>
+  InfoTreeNode *add(std::string Key, T Value = T(),
+                    const std::string &Units = std::string()) {
     assert(!Key.empty() && "Invalid info key");
 
-    // Convert the value to a string depending on its type.
+    if (!Children)
+      Children = std::make_unique<llvm::SmallVector<InfoTreeNode, 8>>();
+
+    std::string ValueStr;
     if constexpr (std::is_same_v<T, bool>)
-      Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
+      ValueStr = Value ? "Yes" : "No";
     else if constexpr (std::is_arithmetic_v<T>)
-      Queue.push_back({Key, std::to_string(Value), Units, L});
+      ValueStr = std::to_string(Value);
     else
-      Queue.push_back({Key, Value, Units, L});
+      ValueStr = Value;
+
+    return &Children->emplace_back(Key, ValueStr, Units);
   }
 
-  const std::deque<InfoQueueEntryTy> &getQueue() const { return Queue; }
+  std::optional<InfoTreeNode *> get(StringRef Key) {
+    if (!Children)
+      return std::nullopt;
 
-  /// Print all info entries added to the queue.
+    auto It = std::find_if(Children->begin(), Children->end(),
+                           [&](auto &V) { return V.Key == Key; });
+    if (It == Children->end())
+      return std::nullopt;
+    return It;
+  }
+
+  /// Print all info entries in the tree
   void print() const {
-    // We print four spances for each level.
-    constexpr uint64_t IndentSize = 4;
+    // Fake an additional indent so that values are offset from the keys
+    doPrint(0, maxKeySize(1));
+  }
 
-    // Find the maximum key length (level + key) to compute the individual
-    // indentation of each entry.
-    uint64_t MaxKeySize = 0;
-    for (const auto &Entry : Queue) {
-      uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
-      if (KeySize > MaxKeySize)
-        MaxKeySize = KeySize;
-    }
-
-    // Print all info entries.
-    for (const auto &Entry : Queue) {
+private:
+  void doPrint(int Level, uint64_t MaxKeySize) const {
+    if (Key.size()) {
       // Compute the indentations for the current entry.
-      uint64_t KeyIndentSize = Entry.Level * IndentSize;
+      uint64_t KeyIndentSize = Level * IndentSize;
       uint64_t ValIndentSize =
-          MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
+          MaxKeySize - (Key.size() + KeyIndentSize) + IndentSize;
 
-      llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
-                   << std::string(ValIndentSize, ' ') << Entry.Value
-                   << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
+      llvm::outs() << std::string(KeyIndentSize, ' ') << Key
+                   << std::string(ValIndentSize, ' ') << Value
+                   << (Units.empty() ? "" : " ") << Units << "\n";
     }
+
+    // Print children
+    if (Children)
+      for (const auto &Entry : *Children)
+        Entry.doPrint(Level + 1, MaxKeySize);
+  }
+
+  // Recursively calculates the maximum width of each key, including indentation
+  uint64_t maxKeySize(int Level) const {
+    uint64_t MaxKeySize = 0;
+
+    if (Children)
+      for (const auto &Entry : *Children) {
+        uint64_t KeySize = Entry.Key.size() + Level * IndentSize;
+        MaxKeySize = std::max(MaxKeySize, KeySize);
+        MaxKeySize = std::max(MaxKeySize, Entry.maxKeySize(Level + 1));
+      }
+
+    return MaxKeySize;
   }
 };
 
@@ -871,7 +894,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   /// Print information about the device.
   Error printInfo();
-  virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
+  virtual Expected<InfoTreeNode> obtainInfoImpl() = 0;
 
   /// Getters of the grid values.
   uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f9a6b3c1f432..6fd3405d03af 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1578,14 +1578,14 @@ Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
 }
 
 Error GenericDeviceTy::printInfo() {
-  InfoQueueTy InfoQueue;
+  auto Info = obtainInfoImpl();
 
   // Get the vendor-specific info entries describing the device properties.
-  if (auto Err = obtainInfoImpl(InfoQueue))
+  if (auto Err = Info.takeError())
     return Err;
 
   // Print all info entries.
-  InfoQueue.print();
+  Info->print();
 
   return Plugin::success();
 }
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c..9943f533ef5a 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -922,11 +922,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Print information about the device.
-  Error obtainInfoImpl(InfoQueueTy &Info) override {
+  Expected<InfoTreeNode> obtainInfoImpl() override {
     char TmpChar[1000];
     const char *TmpCharPtr;
     size_t TmpSt;
     int TmpInt;
+    InfoTreeNode Info;
 
     CUresult Res = cuDriverGetVersion(&TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -971,27 +972,27 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_SUCCESS)
       Info.add("Maximum Threads per Block", TmpInt);
 
-    Info.add("Maximum Block Dimensions", "");
+    auto &MaxBlock = *Info.add("Maximum Block Dimensions", "");
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("x", TmpInt);
+      MaxBlock.add("x", TmpInt);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("y", TmpInt);
+      MaxBlock.add("y", TmpInt);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("z", TmpInt);
+      MaxBlock.add("z", TmpInt);
 
-    Info.add("Maximum Grid Dimensions", "");
+    auto &MaxGrid = *Info.add("Maximum Grid Dimensions", "");
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("x", TmpInt);
+      MaxGrid.add("x", TmpInt);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("y", TmpInt);
+      MaxGrid.add("y", TmpInt);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("z", TmpInt);
+      MaxGrid.add("z", TmpInt);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -1087,7 +1088,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
     Info.add("Compute Capabilities", ComputeCapability.str());
 
-    return Plugin::success();
+    return Info;
   }
 
   virtual bool shouldSetupDeviceMemoryPool() const override {
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9916f4d0ab25..ced9208acaed 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -326,9 +326,10 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
 
   /// Print information about the device.
-  Error obtainInfoImpl(InfoQueueTy &Info) override {
+  Expected<InfoTreeNode> obtainInfoImpl() override {
+    InfoTreeNode Info;
     Info.add("Device Type", "Generic-elf-64bit");
-    return Plugin::success();
+    return Info;
   }
 
   /// This plugin should not setup the device environment or memory pool.

From 82911f188be7ce7cb0a04b7fd648ea8b4aad2e59 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Fri, 13 Jun 2025 14:23:02 +0000
Subject: [PATCH 0284/1322] [lldb][test] Skip ReadAfterClose JSON Transport
 tests on Windows

These were failing on our Windows on Arm bot, or more precisely,
not even completing.

This is because Microsoft's C runtime does extra parameter validation.
So when we called _read with an invalid fd, it called an invalid
parameter handler instead of returning an error.

https://learn.microsoft.com/en-us/%20cpp/c-runtime-library/reference/read?view=msvc-170
https://learn.microsoft.com/en-us/%20cpp/c-runtime-library/parameter-validation?view=msvc-170

(lldb) run
Process 8440 launched: 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\unittests\Host\HostTests.exe' (aarch64)
Process 8440 stopped
* thread #1, stop reason = Exception 0xc0000409 encountered at address 0x7ffb7453564c
    frame #0: 0x00007ffb7453564c ucrtbase.dll`_get_thread_local_invalid_parameter_handler + 652
ucrtbase.dll`_get_thread_local_invalid_parameter_handler:
->  0x7ffb7453564c <+652>: brk    #0xf003

ucrtbase.dll`_invalid_parameter_noinfo:
    0x7ffb74535650 <+0>:   b      0x7ffb745354d8 ; _get_thread_local_invalid_parameter_handler + 280
    0x7ffb74535654 <+4>:   nop
    0x7ffb74535658 <+8>:   nop

You can override this handler but I'm assuming that this reading
after close isn't a crucial feature, so disabling the tests seems
like the way to go.

If it is crucial, we can check the fd before we use it.

Tests added by https://github.com/llvm/llvm-project/pull/143946.
---
 lldb/unittests/Host/JSONTransportTest.cpp | 31 +++++++++++++----------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/lldb/unittests/Host/JSONTransportTest.cpp b/lldb/unittests/Host/JSONTransportTest.cpp
index f1ec5e03bbec..4621869887ac 100644
--- a/lldb/unittests/Host/JSONTransportTest.cpp
+++ b/lldb/unittests/Host/JSONTransportTest.cpp
@@ -84,12 +84,6 @@ TEST_F(HTTPDelimitedJSONTransportTest, ReadWithEOF) {
       Failed<TransportEOFError>());
 }
 
-TEST_F(HTTPDelimitedJSONTransportTest, ReadAfterClosed) {
-  input.CloseReadFileDescriptor();
-  ASSERT_THAT_EXPECTED(
-      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
-      llvm::Failed());
-}
 
 TEST_F(HTTPDelimitedJSONTransportTest, InvalidTransport) {
   transport = std::make_unique<HTTPDelimitedJSONTransport>(nullptr, nullptr);
@@ -136,13 +130,6 @@ TEST_F(JSONRPCTransportTest, ReadWithEOF) {
       Failed<TransportEOFError>());
 }
 
-TEST_F(JSONRPCTransportTest, ReadAfterClosed) {
-  input.CloseReadFileDescriptor();
-  ASSERT_THAT_EXPECTED(
-      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
-      llvm::Failed());
-}
-
 TEST_F(JSONRPCTransportTest, Write) {
   ASSERT_THAT_ERROR(transport->Write(JSONTestType{"foo"}), Succeeded());
   output.CloseWriteFileDescriptor();
@@ -173,4 +160,22 @@ TEST_F(JSONRPCTransportTest, ReadWithTimeout) {
       transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
       Failed<TransportTimeoutError>());
 }
+
+// Windows CRT _read checks that the file descriptor is valid and calls a
+// handler if not. This handler is normally a breakpoint, which looks like a
+// crash when not handled by a debugger.
+// https://learn.microsoft.com/en-us/%20cpp/c-runtime-library/reference/read?view=msvc-170
+TEST_F(HTTPDelimitedJSONTransportTest, ReadAfterClosed) {
+  input.CloseReadFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
+
+TEST_F(JSONRPCTransportTest, ReadAfterClosed) {
+  input.CloseReadFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
 #endif

From 9670e09d0eac596fba6bf03ef1a6f3229dddee46 Mon Sep 17 00:00:00 2001
From: Devon Loehr <DKLoehr@users.noreply.github.com>
Date: Fri, 13 Jun 2025 10:29:42 -0400
Subject: [PATCH 0285/1322] Enable unique-object-duplication warning for
 windows (#143537)

Followup to #125526. This expands the logic of the
unique-object-duplication warning so that it also works for windows
code.

For the most part, the logic is unchanged, merely substituting "has no
import/export annotation" in place of "has hidden visibility". However,
there are some small inconsistencies between the two; namely, visibility
is propagated through nested classes, while import/export annotations
aren't.

This PR:
1. Updates the logic for the warning to account for the differences
between posix and windows
2. Changes the warning message and documentation appropriately
3. Updates the tests to cover windows, and adds new test cases for the
places where behavior differs.

This PR was tested by building chromium (cross compiling linux->windows)
with the changes in place. After accounting for the differences in
semantics, no new warnings were discovered.
---
 clang/include/clang/Basic/DiagnosticGroups.td | 16 ++--
 .../clang/Basic/DiagnosticSemaKinds.td        | 21 +++--
 clang/lib/Sema/SemaDecl.cpp                   | 31 +++++--
 .../SemaCXX/unique_object_duplication.cpp     | 10 ++-
 .../test/SemaCXX/unique_object_duplication.h  | 90 +++++++++++++------
 5 files changed, 114 insertions(+), 54 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index beda73e675fc..38b4f581fa5c 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -806,7 +806,9 @@ def UniqueObjectDuplication : DiagGroup<"unique-object-duplication"> {
 Warns when objects which are supposed to be globally unique might get duplicated
 when built into a shared library.
 
-If an object with hidden visibility is built into a shared library, each instance
+This can occur to objects which are hidden from the dynamic linker, due to
+having hidden visibility (on posix) or lacking a dllimport/dllexport attribute
+(on windows). If such an object is built into a shared library, each instance
 of the library will get its own copy. This can cause very subtle bugs if there was
 only supposed to be one copy of the object in question: singletons aren't single,
 changes to one object won't affect the others, the object's initializer will run
@@ -815,7 +817,7 @@ once per copy, etc.
 Specifically, this warning fires when it detects an object which:
 1. Is defined as ``inline`` in a header file (so it might get compiled into multiple libaries), and
 2. Has external linkage (otherwise it's supposed to be duplicated), and
-3. Has hidden visibility.
+3. Has hidden visibility (posix) or lacks a dllimport/dllexport attribute (windows).
 
 As well as one of the following:
 1. The object is mutable, or
@@ -825,13 +827,15 @@ The warning can be resolved by removing one of the conditions above. In rough
 order of preference, this may be done by:
 1. Marking the object ``const`` (if possible)
 2. Moving the object's definition to a source file
-3. Giving the object non-hidden visibility, e.g. using ``__attribute((visibility("default")))``.
+3. Making the object visible using ``__attribute((visibility("default")))``,
+   ``__declspec(dllimport)``, or ``__declspec(dllexport)``.
+
+When annotating an object with ``__declspec(dllimport)`` or ``__declspec(dllexport)``,
+take care to ensure that the object is only exported from one dll, and is imported
+everywhere else.
 
 Note that for (2), all levels of a pointer variable must be constant;
 ``const int*`` will trigger the warning because the pointer itself is mutable.
-
-This warning is not yet implemented for Windows, since Windows uses
-import/export rules instead of visibility.
 }];
 }
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a2cf84d02419..95d24e9f1e6b 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6267,14 +6267,19 @@ def warn_static_local_in_extern_inline : Warning<
 def note_convert_inline_to_static : Note<
   "use 'static' to give inline function %0 internal linkage">;
 
-def warn_possible_object_duplication_mutable : Warning<
-  "%0 may be duplicated when built into a shared library: "
-  "it is mutable, has hidden visibility, and external linkage">,
-  InGroup<UniqueObjectDuplication>, DefaultIgnore;
-def warn_possible_object_duplication_init : Warning<
-  "initialization of %0 may run twice when built into a shared library: "
-  "it has hidden visibility and external linkage">,
-  InGroup<UniqueObjectDuplication>, DefaultIgnore;
+def warn_possible_object_duplication_mutable
+    : Warning<"%0 may be duplicated when built into a shared library: "
+              "it is mutable, with external linkage and "
+              "%select{hidden visibility|no import/export annotation}1">,
+      InGroup<UniqueObjectDuplication>,
+      DefaultIgnore;
+def warn_possible_object_duplication_init
+    : Warning<"initialization of %0 may run twice when built into a shared "
+              "library: "
+              "it has external linkage and "
+              "%select{hidden visibility|no import/export annotation}1">,
+      InGroup<UniqueObjectDuplication>,
+      DefaultIgnore;
 
 def ext_redefinition_of_typedef : ExtWarn<
   "redefinition of typedef %0 is a C11 feature">,
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index c152f406b497..5cffd82e3372 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13518,8 +13518,28 @@ bool Sema::GloballyUniqueObjectMightBeAccidentallyDuplicated(
 
   // If the object isn't hidden, the dynamic linker will prevent duplication.
   clang::LinkageInfo Lnk = Target->getLinkageAndVisibility();
-  if (Lnk.getVisibility() != HiddenVisibility)
+
+  // The target is "hidden" (from the dynamic linker) if:
+  // 1. On posix, it has hidden visibility, or
+  // 2. On windows, it has no import/export annotation
+  if (Context.getTargetInfo().shouldDLLImportComdatSymbols()) {
+    if (Target->hasAttr<DLLExportAttr>() || Target->hasAttr<DLLImportAttr>())
+      return false;
+
+    // If the variable isn't directly annotated, check to see if it's a member
+    // of an annotated class.
+    const VarDecl *VD = dyn_cast<VarDecl>(Target);
+
+    if (VD && VD->isStaticDataMember()) {
+      const CXXRecordDecl *Ctx = dyn_cast<CXXRecordDecl>(VD->getDeclContext());
+      if (Ctx &&
+          (Ctx->hasAttr<DLLExportAttr>() || Ctx->hasAttr<DLLImportAttr>()))
+        return false;
+    }
+  } else if (Lnk.getVisibility() != HiddenVisibility) {
+    // Posix case
     return false;
+  }
 
   // If the obj doesn't have external linkage, it's supposed to be duplicated.
   if (!isExternalFormalLinkage(Lnk.getLinkage()))
@@ -13550,19 +13570,16 @@ void Sema::DiagnoseUniqueObjectDuplication(const VarDecl *VD) {
   // duplicated when built into a shared library, which causes problems if it's
   // mutable (since the copies won't be in sync) or its initialization has side
   // effects (since it will run once per copy instead of once globally).
-  // FIXME: Windows uses dllexport/dllimport instead of visibility, and we don't
-  // handle that yet. Disable the warning on Windows for now.
 
   // Don't diagnose if we're inside a template, because it's not practical to
   // fix the warning in most cases.
-  if (!Context.getTargetInfo().shouldDLLImportComdatSymbols() &&
-      !VD->isTemplated() &&
+  if (!VD->isTemplated() &&
       GloballyUniqueObjectMightBeAccidentallyDuplicated(VD)) {
 
     QualType Type = VD->getType();
     if (looksMutable(Type, VD->getASTContext())) {
       Diag(VD->getLocation(), diag::warn_possible_object_duplication_mutable)
-          << VD;
+          << VD << Context.getTargetInfo().shouldDLLImportComdatSymbols();
     }
 
     // To keep false positives low, only warn if we're certain that the
@@ -13575,7 +13592,7 @@ void Sema::DiagnoseUniqueObjectDuplication(const VarDecl *VD) {
                              /*IncludePossibleEffects=*/false) &&
         !isa<CXXNewExpr>(Init->IgnoreParenImpCasts())) {
       Diag(Init->getExprLoc(), diag::warn_possible_object_duplication_init)
-          << VD;
+          << VD << Context.getTargetInfo().shouldDLLImportComdatSymbols();
     }
   }
 }
diff --git a/clang/test/SemaCXX/unique_object_duplication.cpp b/clang/test/SemaCXX/unique_object_duplication.cpp
index 4b41bfbfdc2f..ff3b85d19fa6 100644
--- a/clang/test/SemaCXX/unique_object_duplication.cpp
+++ b/clang/test/SemaCXX/unique_object_duplication.cpp
@@ -1,7 +1,9 @@
-// RUN: %clang_cc1 -fsyntax-only -verify=hidden -Wunique-object-duplication -fvisibility=hidden -Wno-unused-value %s
-// RUN: %clang_cc1 -fsyntax-only -verify -Wunique-object-duplication -Wno-unused-value %s
-// The check is currently disabled on windows in MSVC-like environments. The test should fail because we're not getting the expected warnings.
-// XFAIL: target={{.*}}-windows-msvc, {{.*}}-ps{{(4|5)(-.+)?}}
+// RUN: %clang_cc1 -fsyntax-only -Wunique-object-duplication -Wno-unused-value \
+// RUN:   -verify -triple=x86_64-pc-linux-gnu %s
+// RUN: %clang_cc1 -fsyntax-only -Wunique-object-duplication -Wno-unused-value \
+// RUN:   -verify=hidden -triple=x86_64-pc-linux-gnu -fvisibility=hidden  %s
+// RUN: %clang_cc1 -fsyntax-only -Wunique-object-duplication -Wno-unused-value \
+// RUN:   -verify=windows -triple=x86_64-windows-msvc -DWINDOWS_TEST -fdeclspec %s
 
 #include "unique_object_duplication.h"
 
diff --git a/clang/test/SemaCXX/unique_object_duplication.h b/clang/test/SemaCXX/unique_object_duplication.h
index 537429d9ebda..bd0ee6bd14d6 100644
--- a/clang/test/SemaCXX/unique_object_duplication.h
+++ b/clang/test/SemaCXX/unique_object_duplication.h
@@ -3,8 +3,14 @@
  * See the warning's documentation for more information.
  */
 
+#ifdef WINDOWS_TEST
+#define HIDDEN
+// dllimport also suffices for visibility, but those can't have definitions
+#define VISIBLE __declspec(dllexport)
+#else
 #define HIDDEN __attribute__((visibility("hidden")))
-#define DEFAULT __attribute__((visibility("default")))
+#define VISIBLE __attribute__((visibility("default")))
+#endif
 
 // Helper functions
 constexpr int init_constexpr(int x) { return x; };
@@ -17,10 +23,11 @@ namespace StaticLocalTest {
 
 inline void has_static_locals_external() {
   // Mutable
-  static int disallowedStatic1 = 0; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  static int disallowedStatic1 = 0; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                    // windows-warning@-1 {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
   // Initialization might run more than once
-  static const double disallowedStatic2 = disallowedStatic1++; // hidden-warning {{initialization of 'disallowedStatic2' may run twice when built into a shared library: it has hidden visibility and external linkage}}
-  
+  static const double disallowedStatic2 = disallowedStatic1++; // hidden-warning {{initialization of 'disallowedStatic2' may run twice when built into a shared library: it has external linkage and hidden visibility}}
+                                                               // windows-warning@-1 {{initialization of 'disallowedStatic2' may run twice when built into a shared library: it has external linkage and no import/export annotation}}
   // OK, because immutable and compile-time-initialized
   static constexpr int allowedStatic1 = 0;
   static const float allowedStatic2 = 1;
@@ -53,29 +60,33 @@ void has_static_locals_anon() {
   static double allowedStatic2 = init_dynamic(2);
   static char allowedStatic3 = []() { return allowedStatic1++; }();
   static constexpr int allowedStatic4 = init_constexpr(3);
-} 
+}
 
 } // Anonymous namespace
 
 HIDDEN inline void static_local_always_hidden() {
-    static int disallowedStatic1 = 3; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-                                      // expected-warning@-1 {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+    static int disallowedStatic1 = 3; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                      // expected-warning@-1 {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                      // windows-warning@-2 {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
     {
-      static int disallowedStatic2 = 3; // hidden-warning {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-                                        // expected-warning@-1 {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+      static int disallowedStatic2 = 3; // hidden-warning {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                        // expected-warning@-1 {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                        // windows-warning@-2 {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
     }
 
     auto lmb = []() {
-      static int disallowedStatic3 = 3; // hidden-warning {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-                                        // expected-warning@-1 {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+      static int disallowedStatic3 = 3; // hidden-warning {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                        // expected-warning@-1 {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                        // windows-warning@-2 {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
     };
 }
 
-DEFAULT void static_local_never_hidden() {
-    static int allowedStatic1 = 3; 
+// Always visible
+VISIBLE void static_local_never_hidden() {
+    static int allowedStatic1 = 3;
 
     {
-      static int allowedStatic2 = 3; 
+      static int allowedStatic2 = 3;
     }
 
     auto lmb = []() {
@@ -96,7 +107,8 @@ inline void has_regular_local() {
 
 inline void has_thread_local() {
   // thread_local variables are static by default
-  thread_local int disallowedThreadLocal = 0; // hidden-warning {{'disallowedThreadLocal' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  thread_local int disallowedThreadLocal = 0; // hidden-warning {{'disallowedThreadLocal' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                              // windows-warning@-1 {{'disallowedThreadLocal' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
 }
 
 // Functions themselves are always immutable, so referencing them is okay
@@ -109,11 +121,13 @@ inline auto& allowedFunctionReference = has_static_locals_external;
  ******************************************************************************/
 namespace GlobalTest {
   // Mutable
-  inline float disallowedGlobal1 = 3.14; // hidden-warning {{'disallowedGlobal1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-  
-  // Initialization might run more than once
-  inline const double disallowedGlobal5 = disallowedGlobal1++; // hidden-warning {{initialization of 'disallowedGlobal5' may run twice when built into a shared library: it has hidden visibility and external linkage}}
+  inline float disallowedGlobal1 = 3.14; // hidden-warning {{'disallowedGlobal1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                         // windows-warning@-1 {{'disallowedGlobal1' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
 
+
+  // Initialization might run more than once
+  inline const double disallowedGlobal5 = disallowedGlobal1++; // hidden-warning {{initialization of 'disallowedGlobal5' may run twice when built into a shared library: it has external linkage and hidden visibility}}
+                                                               // windows-warning@-1 {{initialization of 'disallowedGlobal5' may run twice when built into a shared library: it has external linkage and no import/export annotation}}
   // OK because internal linkage, so duplication is intended
   static float allowedGlobal1 = 3.14;
   const double allowedGlobal2 = init_dynamic(2);
@@ -129,34 +143,52 @@ namespace GlobalTest {
   // We don't warn on this because non-inline variables can't (legally) appear
   // in more than one TU.
   float allowedGlobal9 = 3.14;
-  
+
   // Pointers need to be double-const-qualified
-  inline float& nonConstReference = disallowedGlobal1; // hidden-warning {{'nonConstReference' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline float& nonConstReference = disallowedGlobal1; // hidden-warning {{'nonConstReference' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                       // windows-warning@-1 {{'nonConstReference' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
   const inline int& constReference = allowedGlobal5;
 
-  inline int* nonConstPointerToNonConst = nullptr; // hidden-warning {{'nonConstPointerToNonConst' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-  inline int const* nonConstPointerToConst = nullptr; // hidden-warning {{'nonConstPointerToConst' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-  inline int* const constPointerToNonConst = nullptr; // hidden-warning {{'constPointerToNonConst' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline int* nonConstPointerToNonConst = nullptr; // hidden-warning {{'nonConstPointerToNonConst' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                   // windows-warning@-1 {{'nonConstPointerToNonConst' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
+  inline int const* nonConstPointerToConst = nullptr; // hidden-warning {{'nonConstPointerToConst' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                      // windows-warning@-1 {{'nonConstPointerToConst' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
+  inline int* const constPointerToNonConst = nullptr; // hidden-warning {{'constPointerToNonConst' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                      // windows-warning@-1 {{'constPointerToNonConst' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
   inline int const* const constPointerToConst = nullptr;
   // Don't warn on new because it tends to generate false positives
   inline int const* const constPointerToConstNew = new int(7);
 
   inline int const * const * const * const nestedConstPointer = nullptr;
-  inline int const * const ** const * const nestedNonConstPointer = nullptr; // hidden-warning {{'nestedNonConstPointer' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline int const * const ** const * const nestedNonConstPointer = nullptr; // hidden-warning {{'nestedNonConstPointer' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                                             // windows-warning@-1 {{'nestedNonConstPointer' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
 
   struct Test {
-    static inline float disallowedStaticMember1; // hidden-warning {{'disallowedStaticMember1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}       
+    static inline float disallowedStaticMember1; // hidden-warning {{'disallowedStaticMember1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                 // windows-warning@-1 {{'disallowedStaticMember1' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
     // Defined below, in the header file
-    static float disallowedStaticMember2;                                       
+    static float disallowedStaticMember2;
     // Defined in the cpp file, so won't get duplicated
     static float allowedStaticMember1;
 
+    // Always visible
+    VISIBLE static inline float allowedStaticMember2 = 0.0;
+
     // Tests here are sparse because the AddrTest case below will define plenty
     // more, which aren't problematic to define (because they're immutable), but
     // may still cause problems if their address is taken.
   };
 
-  inline float Test::disallowedStaticMember2 = 2.3; // hidden-warning {{'disallowedStaticMember2' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline float Test::disallowedStaticMember2 = 2.3; // hidden-warning {{'disallowedStaticMember2' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                    // windows-warning@-1 {{'disallowedStaticMember2' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
+
+  // This is always visible, so nothing inside it will get duplicated
+  struct VISIBLE NeverHidden {
+    static inline float allowedStaticMember3;
+    static float allowedStaticMember4;
+  };
+
+  inline float NeverHidden::allowedStaticMember4 = 3.4;
 } // namespace GlobalTest
 
 /******************************************************************************
@@ -165,7 +197,7 @@ namespace GlobalTest {
 
 namespace TemplateTest {
 
-// We never warn inside templates because it's frequently infeasible to actually
+// We never warn inside templates because it's usually infeasible to actually
 // fix the warning.
 
 template <typename T>

From cf6ae065a042aae6324b28e99628c40bc53be0b7 Mon Sep 17 00:00:00 2001
From: nicebert <110385235+nicebert@users.noreply.github.com>
Date: Fri, 13 Jun 2025 16:46:36 +0200
Subject: [PATCH 0286/1322] [OpenMP] Remove declaration and usage of
 __AMDGCN_WAVEFRONT_SIZE (#143761)

Removes usage of __AMDGCN_WAVEFRONT_SIZE as compile time constant.

---------

Co-authored-by: Shilei Tian <i@tianshilei.me>
---
 openmp/runtime/src/include/ompx.h.var | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 623f0b9c315b..6884745f4240 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -9,13 +9,21 @@
 #ifndef __OMPX_H
 #define __OMPX_H
 
-#ifdef __AMDGCN_WAVEFRONT_SIZE
-#define __WARP_SIZE __AMDGCN_WAVEFRONT_SIZE
-#else
-#define __WARP_SIZE 32
+#if (defined(__NVPTX__) || defined(__AMDGPU__))
+#include <gpuintrin.h>
+#define __OMPX_TARGET_IS_GPU
 #endif
 
 typedef unsigned long uint64_t;
+typedef unsigned int uint32_t;
+
+static inline uint32_t __warpSize(void) {
+#ifdef __OMPX_TARGET_IS_GPU
+  return __gpu_num_lanes();
+#else
+  __builtin_trap();
+#endif
+}
 
 #ifdef __cplusplus
 extern "C" {
@@ -212,7 +220,7 @@ static inline uint64_t ballot_sync(uint64_t mask, int pred) {
 ///{
 #define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(TYPE, TY)                          \
   static inline TYPE shfl_down_sync(uint64_t mask, TYPE var, unsigned delta,   \
-                                    int width = __WARP_SIZE) {                 \
+                                    int width = __warpSize()) {                \
     return ompx_shfl_down_sync_##TY(mask, var, delta, width);                  \
   }
 

From ebd7f7539b1c2bc7d5e391bbb00cb56dc245b2dd Mon Sep 17 00:00:00 2001
From: Orlando Cazalet-Hyams <orlando.hyams@sony.com>
Date: Fri, 13 Jun 2025 13:26:29 +0100
Subject: [PATCH 0287/1322] [KeyInstr][NFC] Fix incorrect atomGroup/rank uint
 size in computeKeyInstructions

---
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 5fb74a016a75..0edfca78b088 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2368,8 +2368,8 @@ void DwarfDebug::computeKeyInstructions(const MachineFunction *MF) {
   // Map {(InlinedAt, Group): (Rank, Instructions)}.
   // NOTE: Anecdotally, for a large C++ blob, 99% of the instruction
   // SmallVectors contain 2 or fewer elements; use 2 inline elements.
-  DenseMap<std::pair<DILocation *, uint32_t>,
-           std::pair<uint16_t, SmallVector<const MachineInstr *, 2>>>
+  DenseMap<std::pair<DILocation *, uint64_t>,
+           std::pair<uint8_t, SmallVector<const MachineInstr *, 2>>>
       GroupCandidates;
 
   // For each instruction:

From 9e622986526a35f3f8bc60a7fc756b5c7bf825c0 Mon Sep 17 00:00:00 2001
From: Darren Wihandi <65404740+fairywreath@users.noreply.github.com>
Date: Fri, 13 Jun 2025 11:06:31 -0400
Subject: [PATCH 0288/1322] [mlir][spirv] Fix FuncOpVectorUnroll to process
 placeholder values in all blocks (#142339)

`FuncOpVectorUnroll` contains logic that replaces function arguments by
placeholders values. These replacements also involve changing all
instructions in the function that use the arguments to use these
placeholders. These placeholder values will later be changed back to use
the function arguments (either new or original if already legal).

The current implementation however only replaces back (the second
replacement, i.e. replacing the placeholder values to new/legal
arguments) the first block of instructions and not all of the blocks.
This may leave some instructions to use these placeholder values (which
for already legal arguments are just zeroattr values that will get
DCE'd) instead of the arguments, which is incorrect.

Closes #132158.
---
 .../SPIRV/Transforms/SPIRVConversion.cpp      | 26 +++----
 .../func-signature-vector-unroll.mlir         | 73 +++++++++++++++++++
 2 files changed, 86 insertions(+), 13 deletions(-)

diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
index 62a24646d066..f5a58c58e05d 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
@@ -1020,22 +1020,22 @@ struct FuncOpVectorUnroll final : OpRewritePattern<func::FuncOp> {
     SmallVector<Location> locs(convertedTypes.size(), newFuncOp.getLoc());
     entryBlock.addArguments(convertedTypes, locs);
 
-    // Replace the placeholder values with the new arguments. We assume there is
-    // only one block for now.
+    // Replace all uses of placeholders for initially legal arguments with their
+    // original function arguments (that were added to `newFuncOp`).
+    for (auto &[placeholderOp, argIdx] : tmpOps) {
+      if (!placeholderOp)
+        continue;
+      Value replacement = newFuncOp.getArgument(argIdx);
+      rewriter.replaceAllUsesWith(placeholderOp->getResult(0), replacement);
+    }
+
+    // Replace dummy operands of new `vector.insert_strided_slice` ops with
+    // their corresponding new function arguments. The new
+    // `vector.insert_strided_slice` ops are inserted only into the entry block,
+    // so iterating over that block is sufficient.
     size_t unrolledInputIdx = 0;
     for (auto [count, op] : enumerate(entryBlock.getOperations())) {
-      // We first look for operands that are placeholders for initially legal
-      // arguments.
       Operation &curOp = op;
-      for (auto [operandIdx, operandVal] : llvm::enumerate(op.getOperands())) {
-        Operation *operandOp = operandVal.getDefiningOp();
-        if (auto it = tmpOps.find(operandOp); it != tmpOps.end()) {
-          size_t idx = operandIdx;
-          rewriter.modifyOpInPlace(&curOp, [&curOp, &newFuncOp, it, idx] {
-            curOp.setOperand(idx, newFuncOp.getArgument(it->second));
-          });
-        }
-      }
       // Since all newly created operations are in the beginning, reaching the
       // end of them means that any later `vector.insert_strided_slice` should
       // not be touched.
diff --git a/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
index c018ccb92498..211d6c90243b 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
@@ -189,3 +189,76 @@ func.func @unsupported_scalable(%arg0 : vector<[8]xi32>) -> (vector<[8]xi32>) {
   return %arg0 : vector<[8]xi32>
 }
 
+// -----
+
+// Check that already legal function parameters are properly preserved across multiple blocks.
+
+// CHECK-LABEL: func.func @legal_params_multiple_blocks_simple
+// CHECK-SAME: (%[[ARG0:.+]]: i32, %[[ARG1:.+]]: i32) -> i32
+func.func @legal_params_multiple_blocks_simple(%arg0: i32, %arg1: i32) -> i32 {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ADD0]], %[[ARG1]] : i32
+  // CHECK: return %[[ADD1]] : i32
+  cf.br ^bb1(%arg0 : i32)
+^bb1(%acc0: i32):
+  %acc1_val = arith.addi %acc0, %arg1 : i32
+  cf.br ^bb2(%acc1_val : i32)
+^bb2(%acc1: i32):
+  %acc2_val = arith.addi %acc1, %arg1 : i32
+  cf.br ^bb3(%acc2_val : i32)
+^bb3(%acc_final: i32):
+  return %acc_final : i32
+}
+
+// -----
+
+// Check that legal parameters and existing `vector.insert_strided_slice`s are properly preserved across multiple blocks.
+
+// CHECK-LABEL: func.func @legal_params_with_vec_insert_multiple_blocks
+// CHECK-SAME: (%[[ARG0:.+]]: i32, %[[ARG1:.+]]: i32, %[[ARG2:.+]]: vector<4xi32>) -> vector<4xi32>
+func.func @legal_params_with_vec_insert_multiple_blocks(%arg0: i32, %arg1: i32, %arg2: vector<4xi32>) -> vector<4xi32> {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ADD0]], %[[ARG1]] : i32
+  // CHECK: %[[VEC1D:.*]] = vector.broadcast %[[ADD1]] : i32 to vector<1xi32>
+  // CHECK: %[[VEC0:.*]] = vector.insert_strided_slice %[[VEC1D]], %[[ARG2]] {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
+  // CHECK: %[[VEC1:.*]] = vector.insert_strided_slice %[[VEC1D]], %[[VEC0]] {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
+  // CHECK: %[[RESULT:.*]] = vector.insert_strided_slice %[[VEC1D]], %[[VEC1]] {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
+  // CHECK: return %[[RESULT]] : vector<4xi32>
+  cf.br ^bb1(%arg0 : i32)
+^bb1(%acc0: i32):
+  %acc1_val = arith.addi %acc0, %arg1 : i32
+  cf.br ^bb2(%acc1_val : i32)
+^bb2(%acc1: i32):
+  %acc2_val = arith.addi %acc1, %arg1 : i32
+  cf.br ^bb3(%acc2_val : i32)
+^bb3(%acc_final: i32):
+  %scalar_vec = vector.broadcast %acc_final : i32 to vector<1xi32>
+  %vec0 = vector.insert_strided_slice %scalar_vec, %arg2 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
+  %vec1 = vector.insert_strided_slice %scalar_vec, %vec0 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
+  %result = vector.insert_strided_slice %scalar_vec, %vec1 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
+  return %result : vector<4xi32>
+}
+
+// -----
+
+// Check that already legal function parameters are preserved across a loop (which contains multiple blocks).
+
+// CHECK-LABEL: @legal_params_for_loop
+// CHECK-SAME: (%[[ARG0:.+]]: i32, %[[ARG1:.+]]: i32, %[[ARG2:.+]]: i32)
+func.func @legal_params_for_loop(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 {
+  // CHECK: %[[CST0:.*]] = arith.constant 0 : index
+  // CHECK: %[[CST1:.*]] = arith.constant 1 : index
+  // CHECK: %[[UB:.*]] = arith.index_cast %[[ARG2]] : i32 to index
+  // CHECK: %[[RESULT:.*]] = scf.for %[[STEP:.*]] = %[[CST0]] to %[[UB]] step %[[CST1]] iter_args(%[[ACC:.*]] = %[[ARG0]]) -> (i32) {
+  // CHECK:   %[[ADD:.*]] = arith.addi %[[ACC]], %[[ARG1]] : i32
+  // CHECK:   scf.yield %[[ADD]] : i32
+  // CHECK: return %[[RESULT]] : i32
+  %zero = arith.constant 0 : index
+  %one = arith.constant 1 : index
+  %ub = arith.index_cast %arg2 : i32 to index
+  %result = scf.for %i = %zero to %ub step %one iter_args(%acc = %arg0) -> (i32) {
+    %new_acc = arith.addi %acc, %arg1 : i32
+    scf.yield %new_acc : i32
+  }
+  return %result : i32
+}

From bcfbba12e6754e0a2a5a1c8e3aac3a24316bba2d Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 08:11:20 -0700
Subject: [PATCH 0289/1322] [llvm] Compare std::optional<T> to values directly
 (NFC) (#143913)

This patch transforms:

  X && *X == Y

to:

  X == Y

where X is of std::optional<T>, and Y is of T or similar.
---
 llvm/lib/Analysis/ConstantFolding.cpp | 4 ++--
 llvm/lib/IR/Attributes.cpp            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 139a0b81e299..64a0f4641250 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2132,7 +2132,7 @@ static bool mayFoldConstrained(ConstrainedFPIntrinsic *CI,
 
   // If evaluation raised FP exception, the result can depend on rounding
   // mode. If the latter is unknown, folding is not possible.
-  if (ORM && *ORM == RoundingMode::Dynamic)
+  if (ORM == RoundingMode::Dynamic)
     return false;
 
   // If FP exceptions are ignored, fold the call, even if such exception is
@@ -2418,7 +2418,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
         if (IntrinsicID == Intrinsic::experimental_constrained_rint &&
             St == APFloat::opInexact) {
           std::optional<fp::ExceptionBehavior> EB = CI->getExceptionBehavior();
-          if (EB && *EB == fp::ebStrict)
+          if (EB == fp::ebStrict)
             return nullptr;
         }
       } else if (U.isSignaling()) {
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index ed485f965699..bfb32ff9995d 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -295,7 +295,7 @@ Attribute Attribute::getWithCaptureInfo(LLVMContext &Context, CaptureInfo CI) {
 Attribute
 Attribute::getWithAllocSizeArgs(LLVMContext &Context, unsigned ElemSizeArg,
                                 const std::optional<unsigned> &NumElemsArg) {
-  assert(!(ElemSizeArg == 0 && NumElemsArg && *NumElemsArg == 0) &&
+  assert(!(ElemSizeArg == 0 && NumElemsArg == 0) &&
          "Invalid allocsize arguments -- given allocsize(0, 0)");
   return get(Context, AllocSize, packAllocSizeArgs(ElemSizeArg, NumElemsArg));
 }

From 6751b3a549ebef78a7e75b100d61742c20945592 Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Fri, 13 Jun 2025 16:16:09 +0100
Subject: [PATCH 0290/1322] Revert "[lit] cleanup unused imports" (#144054)

Reverts llvm/llvm-project#143930 as it causes build failures:
https://github.com/llvm/llvm-project/pull/143930#issuecomment-2969115461
---
 lld/test/Unit/lit.cfg.py         | 1 +
 lldb/test/API/lit.cfg.py         | 2 ++
 lldb/test/Shell/lit.cfg.py       | 5 ++++-
 lldb/test/lit.cfg.py             | 3 +++
 llvm/utils/lit/lit/LitConfig.py  | 6 ++++--
 llvm/utils/lit/lit/TestRunner.py | 6 ++++++
 llvm/utils/lit/lit/discovery.py  | 2 +-
 llvm/utils/lit/lit/worker.py     | 2 ++
 8 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/lld/test/Unit/lit.cfg.py b/lld/test/Unit/lit.cfg.py
index 47375db517e9..1cf890a05cb2 100644
--- a/lld/test/Unit/lit.cfg.py
+++ b/lld/test/Unit/lit.cfg.py
@@ -3,6 +3,7 @@
 # Configuration file for the 'lit' test runner.
 
 import os
+import subprocess
 
 import lit.formats
 
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 04b360e8d330..646a446c86fd 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -9,6 +9,8 @@ import shutil
 import subprocess
 import sys
 
+import lit.formats
+
 # name: The name of this test suite.
 config.name = "lldb-api"
 
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py
index 6f0e017fb7cb..ab6113767187 100644
--- a/lldb/test/Shell/lit.cfg.py
+++ b/lldb/test/Shell/lit.cfg.py
@@ -7,9 +7,12 @@ import re
 import shutil
 import site
 import subprocess
+import sys
 
-import lit.util
+import lit.formats
 from lit.llvm import llvm_config
+from lit.llvm.subst import FindTool
+from lit.llvm.subst import ToolSubst
 
 site.addsitedir(os.path.dirname(__file__))
 from helper import toolchain
diff --git a/lldb/test/lit.cfg.py b/lldb/test/lit.cfg.py
index 6a4255c2627d..eefc32aabd16 100644
--- a/lldb/test/lit.cfg.py
+++ b/lldb/test/lit.cfg.py
@@ -2,6 +2,9 @@
 
 import os
 
+import lit.formats
+from lit.llvm import llvm_config
+
 # This is the top level configuration. Most of these configuration options will
 # be overriden by individual lit configuration files in the test
 # subdirectories. Anything configured here will *not* be loaded when pointing
diff --git a/llvm/utils/lit/lit/LitConfig.py b/llvm/utils/lit/lit/LitConfig.py
index 5bb2d3c5c986..cb4aef6f72a8 100644
--- a/llvm/utils/lit/lit/LitConfig.py
+++ b/llvm/utils/lit/lit/LitConfig.py
@@ -1,12 +1,14 @@
 from __future__ import absolute_import
-
 import inspect
 import os
+import platform
 import sys
 
+import lit.Test
+import lit.formats
+import lit.TestingConfig
 import lit.util
 
-
 # LitConfig must be a new style class for properties to work
 class LitConfig(object):
     """LitConfig - Configuration data for a 'lit' test runner instance, shared
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 1d3bf8e4e8df..73db67aedb73 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -1,4 +1,7 @@
 from __future__ import absolute_import
+import errno
+import io
+import itertools
 import getopt
 import os, signal, subprocess, sys
 import re
@@ -9,8 +12,11 @@ import shlex
 import shutil
 import tempfile
 import threading
+import typing
 from typing import Optional, Tuple
 
+import io
+
 try:
     from StringIO import StringIO
 except ImportError:
diff --git a/llvm/utils/lit/lit/discovery.py b/llvm/utils/lit/lit/discovery.py
index 2e93bacc1236..2e7f90c6bb0c 100644
--- a/llvm/utils/lit/lit/discovery.py
+++ b/llvm/utils/lit/lit/discovery.py
@@ -6,8 +6,8 @@ import copy
 import os
 import sys
 
-from lit import Test, util
 from lit.TestingConfig import TestingConfig
+from lit import LitConfig, Test, util
 
 
 def chooseConfigFileFromDir(dir, config_names):
diff --git a/llvm/utils/lit/lit/worker.py b/llvm/utils/lit/lit/worker.py
index dbc3ab53bc62..8e78bfd45d38 100644
--- a/llvm/utils/lit/lit/worker.py
+++ b/llvm/utils/lit/lit/worker.py
@@ -12,6 +12,8 @@ import time
 import traceback
 
 import lit.Test
+import lit.util
+
 
 _lit_config = None
 _parallelism_semaphores = None

From 3ea45a65edb2f033e59a12f71a8241f220791ac8 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Fri, 13 Jun 2025 16:18:54 +0100
Subject: [PATCH 0291/1322] [AArch64] Add fixed-length SVE USDOT support
 (#143730)

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  11 +
 .../sve-fixed-length-partial-reduce.ll        | 230 +++++++++++++++++-
 2 files changed, 238 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 781a1281db40..7519ac5260a6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2272,6 +2272,17 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
       setPartialReduceMLAAction(MLAOps, VT,
                                 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
     }
+
+    if (Subtarget->hasMatMulInt8()) {
+      if (VT.getVectorElementType() == MVT::i32)
+        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
+                                  MVT::getVectorVT(MVT::i8, NumElts * 4),
+                                  Custom);
+      else if (VT.getVectorElementType() == MVT::i64)
+        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
+                                  MVT::getVectorVT(MVT::i8, NumElts * 8),
+                                  Custom);
+    }
   }
 
   // Lower fixed length vector operations to scalable equivalents.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
index 79d766d1b990..af813ff16a20 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+dotprod -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,NEON
-; RUN: llc -mattr=+sve,+dotprod -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,SVE
-; RUN: llc -mattr=+sme -aarch64-enable-partial-reduce-nodes=true -force-streaming < %s | FileCheck %s --check-prefix=SME
+; RUN: llc -mattr=+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,NEON
+; RUN: llc -mattr=+sve,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,SVE
+; RUN: llc -mattr=+sme,+i8mm -aarch64-enable-partial-reduce-nodes=true -force-streaming < %s | FileCheck %s --check-prefix=SME
 
 target triple = "aarch64"
 
@@ -407,6 +407,154 @@ define <4 x i32> @four_way_i8_i32_vl128(ptr %accptr, ptr %uptr, ptr %sptr) {
   ret <4 x i32> %partial.reduce
 }
 
+define <4 x i32> @four_way_i8_i32_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; COMMON-LABEL: four_way_i8_i32_vl128_usdot:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    ldr q0, [x0]
+; COMMON-NEXT:    ldr q1, [x1]
+; COMMON-NEXT:    ldr q2, [x2]
+; COMMON-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; COMMON-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i32_vl128_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    ldr q0, [x0]
+; SME-NEXT:    ldr q1, [x1]
+; SME-NEXT:    ldr q2, [x2]
+; SME-NEXT:    usdot z0.s, z1.b, z2.b
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    ret
+  %acc = load <4 x i32>, ptr %accptr
+  %u = load <16 x i8>, ptr %uptr
+  %s = load <16 x i8>, ptr %sptr
+  %u.wide = zext <16 x i8> %u to <16 x i32>
+  %s.wide = sext <16 x i8> %s to <16 x i32>
+  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
+  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult)
+  ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @four_way_i8_i32_vl128_sudot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; COMMON-LABEL: four_way_i8_i32_vl128_sudot:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    ldr q0, [x0]
+; COMMON-NEXT:    ldr q1, [x1]
+; COMMON-NEXT:    ldr q2, [x2]
+; COMMON-NEXT:    usdot v0.4s, v2.16b, v1.16b
+; COMMON-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i32_vl128_sudot:
+; SME:       // %bb.0:
+; SME-NEXT:    ldr q0, [x0]
+; SME-NEXT:    ldr q1, [x1]
+; SME-NEXT:    ldr q2, [x2]
+; SME-NEXT:    usdot z0.s, z2.b, z1.b
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    ret
+  %acc = load <4 x i32>, ptr %accptr
+  %u = load <16 x i8>, ptr %uptr
+  %s = load <16 x i8>, ptr %sptr
+  %u.wide = sext <16 x i8> %u to <16 x i32>
+  %s.wide = zext <16 x i8> %s to <16 x i32>
+  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
+  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult)
+  ret <4 x i32> %partial.reduce
+}
+
+define <2 x i64> @four_way_i8_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; NEON-LABEL: four_way_i8_i64_vl128_usdot:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v0.2d, #0000000000000000
+; NEON-NEXT:    ldr q1, [x1]
+; NEON-NEXT:    ldr q2, [x2]
+; NEON-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; NEON-NEXT:    ldr q1, [x0]
+; NEON-NEXT:    saddw v1.2d, v1.2d, v0.2s
+; NEON-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: four_way_i8_i64_vl128_usdot:
+; SVE:       // %bb.0:
+; SVE-NEXT:    movi v0.2d, #0000000000000000
+; SVE-NEXT:    ldr q1, [x1]
+; SVE-NEXT:    ldr q2, [x2]
+; SVE-NEXT:    usdot z0.s, z1.b, z2.b
+; SVE-NEXT:    ldr q2, [x0]
+; SVE-NEXT:    sunpklo z1.d, z0.s
+; SVE-NEXT:    sunpkhi z0.d, z0.s
+; SVE-NEXT:    add z1.d, z2.d, z1.d
+; SVE-NEXT:    add z0.d, z1.d, z0.d
+; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i64_vl128_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    mov z0.s, #0 // =0x0
+; SME-NEXT:    ldr q1, [x1]
+; SME-NEXT:    ldr q2, [x2]
+; SME-NEXT:    usdot z0.s, z1.b, z2.b
+; SME-NEXT:    ldr q1, [x0]
+; SME-NEXT:    saddwb z1.d, z1.d, z0.s
+; SME-NEXT:    saddwt z0.d, z1.d, z0.s
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    ret
+  %acc = load <2 x i64>, ptr %accptr
+  %u = load <16 x i8>, ptr %uptr
+  %s = load <16 x i8>, ptr %sptr
+  %u.wide = zext <16 x i8> %u to <16 x i64>
+  %s.wide = sext <16 x i8> %s to <16 x i64>
+  %mult = mul nuw nsw <16 x i64> %s.wide, %u.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <16 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
+define <2 x i64> @four_way_i16_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; COMMON-LABEL: four_way_i16_i64_vl128_usdot:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    ldr q1, [x1]
+; COMMON-NEXT:    ldr q2, [x2]
+; COMMON-NEXT:    ldr q0, [x0]
+; COMMON-NEXT:    ushll v3.4s, v1.4h, #0
+; COMMON-NEXT:    sshll v4.4s, v2.4h, #0
+; COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; COMMON-NEXT:    sshll2 v2.4s, v2.8h, #0
+; COMMON-NEXT:    smlal v0.2d, v4.2s, v3.2s
+; COMMON-NEXT:    smlal2 v0.2d, v4.4s, v3.4s
+; COMMON-NEXT:    smlal v0.2d, v2.2s, v1.2s
+; COMMON-NEXT:    smlal2 v0.2d, v2.4s, v1.4s
+; COMMON-NEXT:    ret
+;
+; SME-LABEL: four_way_i16_i64_vl128_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    ptrue p0.d, vl2
+; SME-NEXT:    ldr q2, [x0]
+; SME-NEXT:    mov x8, #2 // =0x2
+; SME-NEXT:    ld1h { z0.d }, p0/z, [x1]
+; SME-NEXT:    ld1sh { z1.d }, p0/z, [x2]
+; SME-NEXT:    mad z0.d, p0/m, z1.d, z2.d
+; SME-NEXT:    ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
+; SME-NEXT:    ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
+; SME-NEXT:    mov x8, #4 // =0x4
+; SME-NEXT:    mla z0.d, p0/m, z2.d, z1.d
+; SME-NEXT:    ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
+; SME-NEXT:    ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
+; SME-NEXT:    mov x8, #6 // =0x6
+; SME-NEXT:    mla z0.d, p0/m, z2.d, z1.d
+; SME-NEXT:    ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
+; SME-NEXT:    ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
+; SME-NEXT:    mla z0.d, p0/m, z2.d, z1.d
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    ret
+  %acc = load <2 x i64>, ptr %accptr
+  %u = load <8 x i16>, ptr %uptr
+  %s = load <8 x i16>, ptr %sptr
+  %u.wide = zext <8 x i16> %u to <8 x i64>
+  %s.wide = sext <8 x i16> %s to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %s.wide, %u.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
 define <8 x i32> @four_way_i8_i32_vl128_double_width(ptr %accptr, ptr %uptr, ptr %sptr) {
 ;
 ; COMMON-LABEL: four_way_i8_i32_vl128_double_width:
@@ -438,6 +586,37 @@ define <8 x i32> @four_way_i8_i32_vl128_double_width(ptr %accptr, ptr %uptr, ptr
   ret <8 x i32> %partial.reduce
 }
 
+define <8 x i32> @four_way_i8_i32_vl128_double_width_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+;
+; COMMON-LABEL: four_way_i8_i32_vl128_double_width_usdot:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    ldp q0, q1, [x0]
+; COMMON-NEXT:    ldp q3, q2, [x1]
+; COMMON-NEXT:    ldp q5, q4, [x2]
+; COMMON-NEXT:    usdot v0.4s, v3.16b, v5.16b
+; COMMON-NEXT:    usdot v1.4s, v2.16b, v4.16b
+; COMMON-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i32_vl128_double_width_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    ldp q0, q1, [x0]
+; SME-NEXT:    ldp q3, q2, [x1]
+; SME-NEXT:    ldp q5, q4, [x2]
+; SME-NEXT:    usdot z0.s, z3.b, z5.b
+; SME-NEXT:    usdot z1.s, z2.b, z4.b
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
+; SME-NEXT:    ret
+  %acc = load <8 x i32>, ptr %accptr
+  %u = load <32 x i8>, ptr %uptr
+  %s = load <32 x i8>, ptr %sptr
+  %u.wide = zext <32 x i8> %u to <32 x i32>
+  %s.wide = sext <32 x i8> %s to <32 x i32>
+  %mult = mul nuw nsw <32 x i32> %s.wide, %u.wide
+  %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult)
+  ret <8 x i32> %partial.reduce
+}
+
 define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
 ;
 ;
@@ -483,6 +662,51 @@ define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
   ret <8 x i32> %partial.reduce
 }
 
+define <8 x i32> @four_way_i8_i32_vl256_usdot(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
+;
+;
+; NEON-LABEL: four_way_i8_i32_vl256_usdot:
+; NEON:       // %bb.0:
+; NEON-NEXT:    ldp q0, q1, [x0]
+; NEON-NEXT:    ldp q3, q2, [x1]
+; NEON-NEXT:    ldp q5, q4, [x2]
+; NEON-NEXT:    usdot v0.4s, v3.16b, v5.16b
+; NEON-NEXT:    usdot v1.4s, v2.16b, v4.16b
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: four_way_i8_i32_vl256_usdot:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ldr z0, [x0]
+; SVE-NEXT:    ldr z1, [x1]
+; SVE-NEXT:    ldr z2, [x2]
+; SVE-NEXT:    usdot z0.s, z1.b, z2.b
+; SVE-NEXT:    mov z1.d, z0.d
+; SVE-NEXT:    ext z1.b, z1.b, z0.b, #16
+; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE-NEXT:    // kill: def $q1 killed $q1 killed $z1
+; SVE-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i32_vl256_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    ldr z0, [x0]
+; SME-NEXT:    ldr z1, [x1]
+; SME-NEXT:    ldr z2, [x2]
+; SME-NEXT:    usdot z0.s, z1.b, z2.b
+; SME-NEXT:    mov z1.d, z0.d
+; SME-NEXT:    ext z1.b, z1.b, z0.b, #16
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
+; SME-NEXT:    ret
+  %acc = load <8 x i32>, ptr %accptr
+  %u = load <32 x i8>, ptr %uptr
+  %s = load <32 x i8>, ptr %sptr
+  %u.wide = zext <32 x i8> %u to <32 x i32>
+  %s.wide = sext <32 x i8> %s to <32 x i32>
+  %mult = mul nuw nsw <32 x i32> %s.wide, %u.wide
+  %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult)
+  ret <8 x i32> %partial.reduce
+}
+
 ;
 ; Four-way dot (i16 -> i64)
 ;

From eba63cd76f7ba7f9e9964b1263f76409d08fcd04 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Fri, 13 Jun 2025 10:29:31 -0500
Subject: [PATCH 0292/1322] [flang][OpenMP] Improve handling of REQUIRES
 ATOMIC_DEFAULT_MEM_ORDER (#143917)

According to OpenMP 5.0 rules, the ACQ_REL ordering coming from a
REQUIRES directive may need to be replaced with ACQUIRE or RELEASE
depending on the directive in the ATOMIC construct. This was not done,
leading to an incorrect "memory-order" clause appearing in the generated
HLFIR.

This may need to be relaxed a bit to fully comply with later spec
versions, that will be done in a future PR.
---
 flang/lib/Semantics/rewrite-directives.cpp    | 15 ++++++++++++-
 .../requires-atomic-default-mem-order.f90     | 22 +++++++++++++++++++
 .../Semantics/OpenMP/requires-atomic02.f90    |  8 +++----
 3 files changed, 40 insertions(+), 5 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/requires-atomic-default-mem-order.f90

diff --git a/flang/lib/Semantics/rewrite-directives.cpp b/flang/lib/Semantics/rewrite-directives.cpp
index b4fef2c881b6..91b60ea151de 100644
--- a/flang/lib/Semantics/rewrite-directives.cpp
+++ b/flang/lib/Semantics/rewrite-directives.cpp
@@ -112,9 +112,22 @@ bool OmpRewriteMutator::Pre(parser::OpenMPAtomicConstruct &x) {
 
   // Add a memory order clause to the atomic directive.
   atomicDirectiveDefaultOrderFound_ = true;
+  llvm::omp::Clause kind{x.GetKind()};
   switch (*defaultMemOrder) {
   case common::OmpMemoryOrderType::Acq_Rel:
-    clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::AcqRel{}});
+    // FIXME: Implement 5.0 rules, pending clarification on later spec
+    // versions.
+    // [5.0:62:22-26]
+    if (kind == llvm::omp::Clause::OMPC_read) {
+      clauseList->v.emplace_back(
+          parser::OmpClause{parser::OmpClause::Acquire{}});
+    } else if (kind == llvm::omp::Clause::OMPC_update && x.IsCapture()) {
+      clauseList->v.emplace_back(
+          parser::OmpClause{parser::OmpClause::AcqRel{}});
+    } else {
+      clauseList->v.emplace_back(
+          parser::OmpClause{parser::OmpClause::Release{}});
+    }
     break;
   case common::OmpMemoryOrderType::Relaxed:
     clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::Relaxed{}});
diff --git a/flang/test/Lower/OpenMP/requires-atomic-default-mem-order.f90 b/flang/test/Lower/OpenMP/requires-atomic-default-mem-order.f90
new file mode 100644
index 000000000000..91cb654aeeb3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/requires-atomic-default-mem-order.f90
@@ -0,0 +1,22 @@
+!RUN: %flang_fc1 -emit-hlfir %openmp_flags -fopenmp-version=50 %s -o - | FileCheck %s
+
+module m
+!$omp requires atomic_default_mem_order(acq_rel)
+
+contains
+
+!CHECK: %[[V:[0-9]+]]:2 = hlfir.declare {{.*}} {uniq_name = "_QMmFf00Ev"}
+!CHECK: %[[X:[0-9]+]]:2 = hlfir.declare {{.*}} {uniq_name = "_QMmFf00Ex"}
+!CHECK: omp.atomic.read %[[V]]#0 = %[[X]]#0 memory_order(acquire)
+!CHECK: omp.atomic.write %[[X]]#0 = %{{[0-9]+}} memory_order(release)
+
+subroutine f00(x, v)
+  integer :: x, v
+  !$omp atomic read
+    v = x
+
+  !$omp atomic write
+    x = v
+end
+
+end module
diff --git a/flang/test/Semantics/OpenMP/requires-atomic02.f90 b/flang/test/Semantics/OpenMP/requires-atomic02.f90
index a3724a83456f..04a9b7a09aa9 100644
--- a/flang/test/Semantics/OpenMP/requires-atomic02.f90
+++ b/flang/test/Semantics/OpenMP/requires-atomic02.f90
@@ -12,7 +12,7 @@ program requires
 
   ! CHECK-LABEL: OpenMPAtomicConstruct
   ! CHECK: OmpClause -> Read
-  ! CHECK: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Acquire
   !$omp atomic read
   i = j
 
@@ -36,7 +36,7 @@ program requires
 
   ! CHECK-LABEL: OpenMPAtomicConstruct
   ! CHECK: OmpClause -> Write
-  ! CHECK: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Release
   !$omp atomic write
   i = j
 
@@ -60,7 +60,7 @@ program requires
 
   ! CHECK-LABEL: OpenMPAtomicConstruct
   ! CHECK: OmpClause -> Update
-  ! CHECK: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Release
   !$omp atomic update
   i = i + j
 
@@ -79,7 +79,7 @@ program requires
   i = i + j
 
   ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Release
   !$omp atomic
   i = i + j
 

From ec21b0fc9f64e8cffe689699d1e39533c62fcfc3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 16:26:10 +0100
Subject: [PATCH 0293/1322] [X86] Add X86FixupInstTuning test coverage for
 (V)BLENDPD/S <-> (V)MOVSD/S patterns for various scheduler models

---
 llvm/test/CodeGen/X86/fixup-blend.ll | 713 +++++++++++++++++++++++++++
 1 file changed, 713 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/fixup-blend.ll

diff --git a/llvm/test/CodeGen/X86/fixup-blend.ll b/llvm/test/CodeGen/X86/fixup-blend.ll
new file mode 100644
index 000000000000..3126e4823bee
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fixup-blend.ll
@@ -0,0 +1,713 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64      | FileCheck %s -check-prefixes=SSE,SSE-MOV,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2   | FileCheck %s -check-prefixes=SSE,SSE4,SSE4-BLEND
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=slm         | FileCheck %s -check-prefixes=SSE,SSE-MOV,SSE4,SSE4-MOV
+
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s -check-prefixes=AVX,AVX1,AVX-BLEND,AVX1-BLEND
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2      | FileCheck %s -check-prefixes=AVX,AVX1,AVX-MOV,AVX1-MOV
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3   | FileCheck %s -check-prefixes=AVX,AVX2,AVX-BLEND,AVX2-BLEND
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4   | FileCheck %s -check-prefixes=AVX,AVX2,AVX-BLEND,AVX2-BLEND
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=alderlake   | FileCheck %s -check-prefixes=AVX,AVX2,AVX-MOV,AVX2-MOV
+
+;
+; v2f64 patterns
+;
+
+define <2 x double> @test_v2f64_blend_movsd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; SSE-MOV-LABEL: test_v2f64_blend_movsd:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-MOV-NEXT:    addpd %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v2f64_blend_movsd:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE4-BLEND-NEXT:    addpd %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v2f64_blend_movsd:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-BLEND-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v2f64_blend_movsd:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-MOV-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_v2f64_blend_movsd_optsize(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) optsize {
+; SSE-LABEL: test_v2f64_blend_movsd_optsize:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2f64_blend_movsd_optsize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %s = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_v2f64_blend_movsd_load(<2 x double> %a0, ptr %p1, <2 x double> %a2) {
+; SSE2-LABEL: test_v2f64_blend_movsd_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2f64_blend_movsd_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE4-NEXT:    addpd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v2f64_blend_movsd_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a1 = load <2 x double>, ptr %p1
+  %s = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_v2f64_blend_movsd_zero(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; SSE-MOV-LABEL: test_v2f64_blend_movsd_zero:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    xorpd %xmm1, %xmm1
+; SSE-MOV-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-MOV-NEXT:    addpd %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v2f64_blend_movsd_zero:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    xorpd %xmm1, %xmm1
+; SSE4-BLEND-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE4-BLEND-NEXT:    addpd %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v2f64_blend_movsd_zero:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-BLEND-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-BLEND-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v2f64_blend_movsd_zero:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-MOV-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-MOV-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+;
+; v2i64 patterns
+;
+
+define <2 x i64> @test_v2i64_blend_movsd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; SSE2-LABEL: test_v2i64_blend_movsd:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @test_v2i64_blend_movsd_optsize(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) optsize {
+; SSE2-LABEL: test_v2i64_blend_movsd_optsize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd_optsize:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd_optsize:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd_optsize:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @test_v2i64_blend_movsd_load(<2 x i64> %a0, ptr %p1, <2 x i64> %a2) {
+; SSE2-LABEL: test_v2i64_blend_movsd_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd_load:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a1 = load <2 x i64>, ptr %p1
+  %s = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @test_v2i64_blend_movsd_zero(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; SSE2-LABEL: test_v2i64_blend_movsd_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd_zero:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd_zero:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd_zero:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+;
+; v4f32 patterns
+;
+
+define <4 x float> @test_v4f32_blend_movss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; SSE-MOV-LABEL: test_v4f32_blend_movss:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-MOV-NEXT:    addps %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v4f32_blend_movss:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE4-BLEND-NEXT:    addps %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v4f32_blend_movss:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-BLEND-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v4f32_blend_movss:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-MOV-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; SSE-MOV-LABEL: test_v4f32_blend_movsd:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-MOV-NEXT:    addps %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v4f32_blend_movsd:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-BLEND-NEXT:    addps %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v4f32_blend_movsd:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-BLEND-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v4f32_blend_movsd:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-MOV-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movss_optsize(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) optsize {
+; SSE-LABEL: test_v4f32_blend_movss_optsize:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    addps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movss_optsize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd_optsize(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) optsize {
+; SSE-LABEL: test_v4f32_blend_movsd_optsize:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    addps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movsd_optsize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movss_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movss_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps (%rdi), %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4f32_blend_movss_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; SSE4-NEXT:    addps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movss_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a1 = load <4 x float>, ptr %p1
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movsd_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4f32_blend_movsd_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE4-NEXT:    addps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movsd_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a1 = load <4 x float>, ptr %p1
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movss_zero(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; SSE-MOV-LABEL: test_v4f32_blend_movss_zero:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    xorps %xmm1, %xmm1
+; SSE-MOV-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-MOV-NEXT:    addps %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v4f32_blend_movss_zero:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    xorps %xmm1, %xmm1
+; SSE4-BLEND-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE4-BLEND-NEXT:    addps %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v4f32_blend_movss_zero:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-BLEND-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-BLEND-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v4f32_blend_movss_zero:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-MOV-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-MOV-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd_zero(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movsd_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    addps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v4f32_blend_movsd_zero:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    xorps %xmm1, %xmm1
+; SSE4-BLEND-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-BLEND-NEXT:    addps %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; SSE4-MOV-LABEL: test_v4f32_blend_movsd_zero:
+; SSE4-MOV:       # %bb.0:
+; SSE4-MOV-NEXT:    xorps %xmm1, %xmm1
+; SSE4-MOV-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE4-MOV-NEXT:    addps %xmm2, %xmm0
+; SSE4-MOV-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v4f32_blend_movsd_zero:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-BLEND-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-BLEND-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v4f32_blend_movsd_zero:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-MOV-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-MOV-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+;
+; v4i32 patterns
+;
+
+define <4 x i32> @test_v4i32_blend_movss(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movss:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movsd:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movss_optsize(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) optsize {
+; SSE2-LABEL: test_v4i32_blend_movss_optsize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss_optsize:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss_optsize:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss_optsize:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd_optsize(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) optsize {
+; SSE2-LABEL: test_v4i32_blend_movsd_optsize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd_optsize:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd_optsize:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd_optsize:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movss_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movss_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps (%rdi), %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss_load:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a1 = load <4 x i32>, ptr %p1
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movsd_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd_load:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a1 = load <4 x i32>, ptr %p1
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movss_zero(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movss_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss_zero:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss_zero:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss_zero:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd_zero(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movsd_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd_zero:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd_zero:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd_zero:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX1-BLEND: {{.*}}
+; AVX1-MOV: {{.*}}
+; AVX2-BLEND: {{.*}}
+; AVX2-MOV: {{.*}}

From ca5040990ed17fa444d30c22fffcfa7ddc72612f Mon Sep 17 00:00:00 2001
From: Aleksandr Platonov <platonov.aleksandr@huawei.com>
Date: Fri, 13 Jun 2025 18:32:42 +0300
Subject: [PATCH 0294/1322] [clangd] Collect references in array designators
 (#140356)

---
 clang-tools-extra/clangd/unittests/XRefsTests.cpp | 8 ++++++++
 clang/lib/Index/IndexBody.cpp                     | 7 +++++++
 2 files changed, 15 insertions(+)

diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
index 1892f87c8e82..b04d6431f89f 100644
--- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
@@ -2311,6 +2311,14 @@ TEST(FindReferences, WithinAST) {
             $(S::deleteObject)[[de^lete]] S;
           }
         };
+      )cpp",
+      // Array designators
+      R"cpp(
+        const int $def[[F^oo]] = 0;
+        int Bar[] = {
+          [$(Bar)[[F^oo]]...$(Bar)[[Fo^o]] + 1] = 0,
+          [$(Bar)[[^Foo]] + 2] = 1
+        };
       )cpp"};
   for (const char *Test : Tests)
     checkFindRefs(Test);
diff --git a/clang/lib/Index/IndexBody.cpp b/clang/lib/Index/IndexBody.cpp
index 2ed20df22bda..98ce6f73ec84 100644
--- a/clang/lib/Index/IndexBody.cpp
+++ b/clang/lib/Index/IndexBody.cpp
@@ -435,6 +435,13 @@ public:
                                             ParentDC, SymbolRoleSet(),
                                             /*Relations=*/{}, E);
           }
+        } else {
+          if (D.isArrayDesignator())
+            TraverseStmt(E->getArrayIndex(D));
+          else if (D.isArrayRangeDesignator()) {
+            TraverseStmt(E->getArrayRangeStart(D));
+            TraverseStmt(E->getArrayRangeEnd(D));
+          }
         }
       }
       return true;

From dc9e300f12f3b9c8160dbfb0bc32252ad99c3ba7 Mon Sep 17 00:00:00 2001
From: Fabian Meumertzheim <fabian@meumertzhe.im>
Date: Fri, 13 Jun 2025 17:49:30 +0200
Subject: [PATCH 0295/1322] [llvm-cov] Add support for baseline coverage
 (#117910)

When no profile is provided, but the new --empty-profile option is
specifed, the export/report/show commands now emit coverage data
equivalent to that obtained from a profile with all zero counters
("baseline coverage").

This is useful for build systems (e.g. Bazel) that can track coverage
information for each build target, even those that are never linked into
tests and thus don't have runtime coverage data recorded. By merging in
baseline coverage, lines in files that aren't linked into tests are
correctly reported as uncovered.
---
 llvm/docs/CommandGuide/llvm-cov.rst           |  15 +++
 .../ProfileData/Coverage/CoverageMapping.h    |  24 ++--
 .../ProfileData/Coverage/CoverageMapping.cpp  | 123 +++++++++++-------
 ...showLineExecutionCounts-lcov-baseline.test |  37 ++++++
 llvm/tools/llvm-cov/CodeCoverage.cpp          |  78 +++++++----
 .../ProfileData/CoverageMappingTest.cpp       |   4 +-
 6 files changed, 195 insertions(+), 86 deletions(-)
 create mode 100644 llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test

diff --git a/llvm/docs/CommandGuide/llvm-cov.rst b/llvm/docs/CommandGuide/llvm-cov.rst
index 968f3c452f55..f4db60cf06fa 100644
--- a/llvm/docs/CommandGuide/llvm-cov.rst
+++ b/llvm/docs/CommandGuide/llvm-cov.rst
@@ -380,6 +380,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Display the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 .. program:: llvm-cov report
 
 .. _llvm-cov-report:
@@ -470,6 +475,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Display the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 .. program:: llvm-cov export
 
 .. _llvm-cov-export:
@@ -562,6 +572,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Export the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 CONVERT-FOR-TESTING COMMAND
 ---------------------------
 
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index e62ce5e3d8fa..d1230b0ba7c5 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -991,18 +991,23 @@ class CoverageMapping {
   // Load coverage records from readers.
   static Error loadFromReaders(
       ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-      IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage);
+      std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+          &ProfileReader,
+      CoverageMapping &Coverage);
 
   // Load coverage records from file.
   static Error
   loadFromFile(StringRef Filename, StringRef Arch, StringRef CompilationDir,
-               IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
-               bool &DataFound,
+               std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+                   &ProfileReader,
+               CoverageMapping &Coverage, bool &DataFound,
                SmallVectorImpl<object::BuildID> *FoundBinaryIDs = nullptr);
 
   /// Add a function record corresponding to \p Record.
-  Error loadFunctionRecord(const CoverageMappingRecord &Record,
-                           IndexedInstrProfReader &ProfileReader);
+  Error loadFunctionRecord(
+      const CoverageMappingRecord &Record,
+      const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+          &ProfileReader);
 
   /// Look up the indices for function records which are at least partially
   /// defined in the specified file. This is guaranteed to return a superset of
@@ -1018,15 +1023,16 @@ public:
   /// Load the coverage mapping using the given readers.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
   load(ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-       IndexedInstrProfReader &ProfileReader);
+       std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+           &ProfileReader);
 
   /// Load the coverage mapping from the given object files and profile. If
   /// \p Arches is non-empty, it must specify an architecture for each object.
   /// Ignores non-instrumented object files unless all are not instrumented.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
-  load(ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
-       vfs::FileSystem &FS, ArrayRef<StringRef> Arches = {},
-       StringRef CompilationDir = "",
+  load(ArrayRef<StringRef> ObjectFilenames,
+       std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
+       ArrayRef<StringRef> Arches = {}, StringRef CompilationDir = "",
        const object::BuildIDFetcher *BIDFetcher = nullptr,
        bool CheckBinaryIDs = false);
 
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index dd74eb054a34..429ec5c19f1f 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -823,7 +823,8 @@ public:
 
 Error CoverageMapping::loadFunctionRecord(
     const CoverageMappingRecord &Record,
-    IndexedInstrProfReader &ProfileReader) {
+    const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader) {
   StringRef OrigFuncName = Record.FunctionName;
   if (OrigFuncName.empty())
     return make_error<CoverageMapError>(coveragemap_error::malformed,
@@ -837,35 +838,44 @@ Error CoverageMapping::loadFunctionRecord(
   CounterMappingContext Ctx(Record.Expressions);
 
   std::vector<uint64_t> Counts;
-  if (Error E = ProfileReader.getFunctionCounts(Record.FunctionName,
-                                                Record.FunctionHash, Counts)) {
-    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-    if (IPE == instrprof_error::hash_mismatch) {
-      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                      Record.FunctionHash);
-      return Error::success();
+  if (ProfileReader) {
+    if (Error E = ProfileReader.value().get().getFunctionCounts(
+            Record.FunctionName, Record.FunctionHash, Counts)) {
+      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+      if (IPE == instrprof_error::hash_mismatch) {
+        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                        Record.FunctionHash);
+        return Error::success();
+      }
+      if (IPE != instrprof_error::unknown_function)
+        return make_error<InstrProfError>(IPE);
+      Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
     }
-    if (IPE != instrprof_error::unknown_function)
-      return make_error<InstrProfError>(IPE);
+  } else {
     Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
   }
   Ctx.setCounts(Counts);
 
   bool IsVersion11 =
-      ProfileReader.getVersion() < IndexedInstrProf::ProfVersion::Version12;
+      ProfileReader && ProfileReader.value().get().getVersion() <
+                           IndexedInstrProf::ProfVersion::Version12;
 
   BitVector Bitmap;
-  if (Error E = ProfileReader.getFunctionBitmap(Record.FunctionName,
-                                                Record.FunctionHash, Bitmap)) {
-    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-    if (IPE == instrprof_error::hash_mismatch) {
-      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                      Record.FunctionHash);
-      return Error::success();
+  if (ProfileReader) {
+    if (Error E = ProfileReader.value().get().getFunctionBitmap(
+            Record.FunctionName, Record.FunctionHash, Bitmap)) {
+      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+      if (IPE == instrprof_error::hash_mismatch) {
+        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                        Record.FunctionHash);
+        return Error::success();
+      }
+      if (IPE != instrprof_error::unknown_function)
+        return make_error<InstrProfError>(IPE);
+      Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
     }
-    if (IPE != instrprof_error::unknown_function)
-      return make_error<InstrProfError>(IPE);
-    Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
+  } else {
+    Bitmap = BitVector(getMaxBitmapSize(Record, false));
   }
   Ctx.setBitmap(std::move(Bitmap));
 
@@ -959,10 +969,14 @@ Error CoverageMapping::loadFunctionRecord(
 // of CoverageMappingReader instances.
 Error CoverageMapping::loadFromReaders(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage) {
-  assert(!Coverage.SingleByteCoverage ||
-         *Coverage.SingleByteCoverage == ProfileReader.hasSingleByteCoverage());
-  Coverage.SingleByteCoverage = ProfileReader.hasSingleByteCoverage();
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader,
+    CoverageMapping &Coverage) {
+  assert(!Coverage.SingleByteCoverage || !ProfileReader ||
+         *Coverage.SingleByteCoverage ==
+             ProfileReader.value().get().hasSingleByteCoverage());
+  Coverage.SingleByteCoverage =
+      !ProfileReader || ProfileReader.value().get().hasSingleByteCoverage();
   for (const auto &CoverageReader : CoverageReaders) {
     for (auto RecordOrErr : *CoverageReader) {
       if (Error E = RecordOrErr.takeError())
@@ -977,7 +991,8 @@ Error CoverageMapping::loadFromReaders(
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    IndexedInstrProfReader &ProfileReader) {
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader) {
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   if (Error E = loadFromReaders(CoverageReaders, ProfileReader, *Coverage))
     return std::move(E);
@@ -986,18 +1001,19 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
 // If E is a no_data_found error, returns success. Otherwise returns E.
 static Error handleMaybeNoDataFoundError(Error E) {
-  return handleErrors(
-      std::move(E), [](const CoverageMapError &CME) {
-        if (CME.get() == coveragemap_error::no_data_found)
-          return static_cast<Error>(Error::success());
-        return make_error<CoverageMapError>(CME.get(), CME.getMessage());
-      });
+  return handleErrors(std::move(E), [](const CoverageMapError &CME) {
+    if (CME.get() == coveragemap_error::no_data_found)
+      return static_cast<Error>(Error::success());
+    return make_error<CoverageMapError>(CME.get(), CME.getMessage());
+  });
 }
 
 Error CoverageMapping::loadFromFile(
     StringRef Filename, StringRef Arch, StringRef CompilationDir,
-    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
-    bool &DataFound, SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader,
+    CoverageMapping &Coverage, bool &DataFound,
+    SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
   auto CovMappingBufOrErr = MemoryBuffer::getFileOrSTDIN(
       Filename, /*IsText=*/false, /*RequiresNullTerminator=*/false);
   if (std::error_code EC = CovMappingBufOrErr.getError())
@@ -1033,13 +1049,23 @@ Error CoverageMapping::loadFromFile(
 }
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
-    ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
-    vfs::FileSystem &FS, ArrayRef<StringRef> Arches, StringRef CompilationDir,
+    ArrayRef<StringRef> ObjectFilenames,
+    std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
+    ArrayRef<StringRef> Arches, StringRef CompilationDir,
     const object::BuildIDFetcher *BIDFetcher, bool CheckBinaryIDs) {
-  auto ProfileReaderOrErr = IndexedInstrProfReader::create(ProfileFilename, FS);
-  if (Error E = ProfileReaderOrErr.takeError())
-    return createFileError(ProfileFilename, std::move(E));
-  auto ProfileReader = std::move(ProfileReaderOrErr.get());
+  std::unique_ptr<IndexedInstrProfReader> ProfileReader;
+  if (ProfileFilename) {
+    auto ProfileReaderOrErr =
+        IndexedInstrProfReader::create(ProfileFilename.value(), FS);
+    if (Error E = ProfileReaderOrErr.takeError())
+      return createFileError(ProfileFilename.value(), std::move(E));
+    ProfileReader = std::move(ProfileReaderOrErr.get());
+  }
+  auto ProfileReaderRef =
+      ProfileReader
+          ? std::optional<std::reference_wrapper<IndexedInstrProfReader>>(
+                *ProfileReader)
+          : std::nullopt;
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   bool DataFound = false;
 
@@ -1053,16 +1079,17 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
   SmallVector<object::BuildID> FoundBinaryIDs;
   for (const auto &File : llvm::enumerate(ObjectFilenames)) {
-    if (Error E =
-            loadFromFile(File.value(), GetArch(File.index()), CompilationDir,
-                         *ProfileReader, *Coverage, DataFound, &FoundBinaryIDs))
+    if (Error E = loadFromFile(File.value(), GetArch(File.index()),
+                               CompilationDir, ProfileReaderRef, *Coverage,
+                               DataFound, &FoundBinaryIDs))
       return std::move(E);
   }
 
   if (BIDFetcher) {
     std::vector<object::BuildID> ProfileBinaryIDs;
-    if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
-      return createFileError(ProfileFilename, std::move(E));
+    if (ProfileReader)
+      if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
+        return createFileError(ProfileFilename.value(), std::move(E));
 
     SmallVector<object::BuildIDRef> BinaryIDsToFetch;
     if (!ProfileBinaryIDs.empty()) {
@@ -1082,12 +1109,12 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
       if (PathOpt) {
         std::string Path = std::move(*PathOpt);
         StringRef Arch = Arches.size() == 1 ? Arches.front() : StringRef();
-        if (Error E = loadFromFile(Path, Arch, CompilationDir, *ProfileReader,
-                                  *Coverage, DataFound))
+        if (Error E = loadFromFile(Path, Arch, CompilationDir, ProfileReaderRef,
+                                   *Coverage, DataFound))
           return std::move(E);
       } else if (CheckBinaryIDs) {
         return createFileError(
-            ProfileFilename,
+            ProfileFilename.value(),
             createStringError(errc::no_such_file_or_directory,
                               "Missing binary ID: " +
                                   llvm::toHex(BinaryID, /*LowerCase=*/true)));
diff --git a/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
new file mode 100644
index 000000000000..bce886bdf510
--- /dev/null
+++ b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
@@ -0,0 +1,37 @@
+// FULL: SF:{{.*}}showLineExecutionCounts.cpp
+// FULL: FN:6,main
+// FULL: FNDA:0,main
+// FULL: FNF:1
+// FULL: FNH:0
+int main() {                              // FULL: DA:[[@LINE]],0
+  int x = 0;                              // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  if (x) {                                // FULL: DA:[[@LINE]],0
+    x = 0;                                // FULL: DA:[[@LINE]],0
+  } else {                                // FULL: DA:[[@LINE]],0
+    x = 1;                                // FULL: DA:[[@LINE]],0
+  }                                       // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  for (int i = 0; i < 100; ++i) {         // FULL: DA:[[@LINE]],0
+    x = 1;                                // FULL: DA:[[@LINE]],0
+  }                                       // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  x = x < 10 ? x + 1 : x - 1;             // FULL: DA:[[@LINE]],0
+  x = x > 10 ?                            // FULL: DA:[[@LINE]],0
+        x - 1:                            // FULL: DA:[[@LINE]],0
+        x + 1;                            // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  return 0;                               // FULL: DA:[[@LINE]],0
+}                                         // FULL: DA:[[@LINE]],0
+// FULL: LF:20
+// FULL: LH:0
+// FULL: end_of_record
+// RUN: llvm-cov export -format=lcov %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=FULL %s
+
+// RUN: llvm-cov export -format=lcov -summary-only %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=SUMMARYONLY %s
+// SUMMARYONLY: SF:{{.*}}showLineExecutionCounts.cpp
+// SUMMARYONLY: FNF:1
+// SUMMARYONLY: FNH:0
+// SUMMARYONLY: LF:20
+// SUMMARYONLY: LH:0
+// SUMMARYONLY: end_of_record
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index 1f2484cd4dda..6c66858c4de8 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -153,7 +153,7 @@ private:
   bool HadSourceFiles = false;
 
   /// The path to the indexed profile.
-  std::string PGOFilename;
+  std::optional<std::string> PGOFilename;
 
   /// A list of input source files.
   std::vector<std::string> SourceFiles;
@@ -455,10 +455,12 @@ static bool modifiedTimeGT(StringRef LHS, StringRef RHS) {
 }
 
 std::unique_ptr<CoverageMapping> CodeCoverageTool::load() {
-  for (StringRef ObjectFilename : ObjectFilenames)
-    if (modifiedTimeGT(ObjectFilename, PGOFilename))
-      warning("profile data may be out of date - object is newer",
-              ObjectFilename);
+  if (PGOFilename) {
+    for (StringRef ObjectFilename : ObjectFilenames)
+      if (modifiedTimeGT(ObjectFilename, PGOFilename.value()))
+        warning("profile data may be out of date - object is newer",
+                ObjectFilename);
+  }
   auto FS = vfs::getRealFileSystem();
   auto CoverageOrErr = CoverageMapping::load(
       ObjectFilenames, PGOFilename, *FS, CoverageArches,
@@ -668,11 +670,16 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
       "dump-collected-paths", cl::Optional, cl::Hidden,
       cl::desc("Show the collected paths to source files"));
 
-  cl::opt<std::string, true> PGOFilename(
-      "instr-profile", cl::Required, cl::location(this->PGOFilename),
+  cl::opt<std::string> PGOFilename(
+      "instr-profile", cl::Optional,
       cl::desc(
           "File with the profile data obtained after an instrumented run"));
 
+  cl::opt<bool> EmptyProfile(
+      "empty-profile", cl::Optional,
+      cl::desc("Use a synthetic profile with no data to generate "
+               "baseline coverage"));
+
   cl::list<std::string> Arches(
       "arch", cl::desc("architectures of the coverage mapping binaries"));
 
@@ -805,6 +812,15 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
     }
     this->CheckBinaryIDs = CheckBinaryIDs;
 
+    if (!PGOFilename.empty() == EmptyProfile) {
+      error(
+          "exactly one of -instr-profile and -empty-profile must be specified");
+      return 1;
+    }
+    if (!PGOFilename.empty()) {
+      this->PGOFilename = std::make_optional(PGOFilename.getValue());
+    }
+
     if (!CovFilename.empty())
       ObjectFilenames.emplace_back(CovFilename);
     for (const std::string &Filename : CovFilenames)
@@ -1116,20 +1132,22 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
     }
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
-  }
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
 
-  if (ShowCreatedTime) {
-    auto ModifiedTime = Status.getLastModificationTime();
-    std::string ModifiedTimeStr = to_string(ModifiedTime);
-    size_t found = ModifiedTimeStr.rfind(':');
-    ViewOpts.CreatedTimeStr =
-        (found != std::string::npos)
-            ? "Created: " + ModifiedTimeStr.substr(0, found)
-            : "Created: " + ModifiedTimeStr;
+    if (ShowCreatedTime) {
+      auto ModifiedTime = Status.getLastModificationTime();
+      std::string ModifiedTimeStr = to_string(ModifiedTime);
+      size_t found = ModifiedTimeStr.rfind(':');
+      ViewOpts.CreatedTimeStr =
+          (found != std::string::npos)
+              ? "Created: " + ModifiedTimeStr.substr(0, found)
+              : "Created: " + ModifiedTimeStr;
+    }
   }
 
   auto Coverage = load();
@@ -1238,10 +1256,12 @@ int CodeCoverageTool::doReport(int argc, const char **argv,
     return 1;
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
   }
 
   auto Coverage = load();
@@ -1303,10 +1323,12 @@ int CodeCoverageTool::doExport(int argc, const char **argv,
     return 1;
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
   }
 
   auto Coverage = load();
diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
index 46f881ecddb5..c0e99cf80b94 100644
--- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
@@ -277,7 +277,9 @@ struct CoverageMappingTest : ::testing::TestWithParam<std::tuple<bool, bool>> {
       CoverageReaders.push_back(
           std::make_unique<CoverageMappingReaderMock>(Funcs));
     }
-    return CoverageMapping::load(CoverageReaders, *ProfileReader);
+    auto ProfileReaderRef =
+        std::make_optional(std::reference_wrapper(*ProfileReader));
+    return CoverageMapping::load(CoverageReaders, ProfileReaderRef);
   }
 
   Error loadCoverageMapping(bool EmitFilenames = true) {

From 18b67a7a102c0052e5ae0e76ef1297902ffeb22d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 08:58:19 -0700
Subject: [PATCH 0296/1322] MC: Add MCAsmInfo::printExpr to replace
 MCExpr::print

* Make relocation specifier code closer (MCAsmInfo defines specifiers).
* MCExpr::print has an optional MCAsmInfo argument, which is
  error-prone when omitted.
* Enable MCSpecifierExpr
---
 llvm/include/llvm/MC/MCAsmInfo.h |  3 +++
 llvm/lib/MC/MCAsmInfo.cpp        |  5 +++++
 llvm/lib/MC/MCAsmStreamer.cpp    | 24 ++++++++++++------------
 llvm/lib/MC/MCStreamer.cpp       |  2 +-
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index e98cd17a9df5..18303e028f62 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -32,6 +32,7 @@ class MCSection;
 class MCStreamer;
 class MCSubtargetInfo;
 class MCSymbol;
+class raw_ostream;
 
 namespace WinEH {
 
@@ -709,6 +710,8 @@ public:
 
   StringRef getSpecifierName(uint32_t S) const;
   std::optional<uint32_t> getSpecifierForName(StringRef Name) const;
+
+  void printExpr(raw_ostream &, const MCExpr &) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp
index 86759c32bb75..fbacca4f5679 100644
--- a/llvm/lib/MC/MCAsmInfo.cpp
+++ b/llvm/lib/MC/MCAsmInfo.cpp
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 
 using namespace llvm;
@@ -148,3 +149,7 @@ std::optional<uint32_t> MCAsmInfo::getSpecifierForName(StringRef Name) const {
     return It->second;
   return {};
 }
+
+void MCAsmInfo::printExpr(raw_ostream &OS, const MCExpr &Expr) const {
+  Expr.print(OS, this);
+}
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 4380f74318e7..c43619d71217 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -700,7 +700,7 @@ void MCAsmStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
       OS << ".set ";
     Symbol->print(OS, MAI);
     OS << (UseSet ? ", " : " = ");
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
 
     EmitEOL();
   }
@@ -713,7 +713,7 @@ void MCAsmStreamer::emitConditionalAssignment(MCSymbol *Symbol,
   OS << ".lto_set_conditional ";
   Symbol->print(OS, MAI);
   OS << ", ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   EmitEOL();
 }
 
@@ -1065,7 +1065,7 @@ void MCAsmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
   OS << "\t.size\t";
   Symbol->print(OS, MAI);
   OS << ", ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   EmitEOL();
 }
 
@@ -1399,7 +1399,7 @@ void MCAsmStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
   if (MCTargetStreamer *TS = getTargetStreamer()) {
     TS->emitValue(Value);
   } else {
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
     EmitEOL();
   }
 }
@@ -1411,7 +1411,7 @@ void MCAsmStreamer::emitULEB128Value(const MCExpr *Value) {
     return;
   }
   OS << "\t.uleb128 ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   EmitEOL();
 }
 
@@ -1422,7 +1422,7 @@ void MCAsmStreamer::emitSLEB128Value(const MCExpr *Value) {
     return;
   }
   OS << "\t.sleb128 ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   EmitEOL();
 }
 
@@ -1437,7 +1437,7 @@ void MCAsmStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
     if (!MAI->isAIX() || FillValue == 0) {
       // FIXME: Emit location directives
       OS << ZeroDirective;
-      NumBytes.print(OS, MAI);
+      MAI->printExpr(OS, NumBytes);
       if (FillValue != 0)
         OS << ',' << (int)FillValue;
       EmitEOL();
@@ -1460,7 +1460,7 @@ void MCAsmStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
                              int64_t Expr, SMLoc Loc) {
   // FIXME: Emit location directives
   OS << "\t.fill\t";
-  NumValues.print(OS, MAI);
+  MAI->printExpr(OS, NumValues);
   OS << ", " << Size << ", 0x";
   OS.write_hex(truncateToSize(Expr, 4));
   EmitEOL();
@@ -1558,7 +1558,7 @@ void MCAsmStreamer::emitValueToOffset(const MCExpr *Offset,
                                       SMLoc Loc) {
   // FIXME: Verify that Offset is associated with the current section.
   OS << ".org ";
-  Offset->print(OS, MAI);
+  MAI->printExpr(OS, *Offset);
   OS << ", " << (unsigned)Value;
   EmitEOL();
 }
@@ -2417,7 +2417,7 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
     MCFixup &F = Fixups[i];
     OS << "  fixup " << char('A' + i) << " - "
        << "offset: " << F.getOffset() << ", value: ";
-    F.getValue()->print(OS, MAI);
+    MAI->printExpr(OS, *F.getValue());
     auto Kind = F.getKind();
     if (mc::isRelocation(Kind))
       OS << ", relocation type: " << Kind;
@@ -2496,11 +2496,11 @@ MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name,
                                   const MCExpr *Expr, SMLoc,
                                   const MCSubtargetInfo &STI) {
   OS << "\t.reloc ";
-  Offset.print(OS, MAI);
+  MAI->printExpr(OS, Offset);
   OS << ", " << Name;
   if (Expr) {
     OS << ", ";
-    Expr->print(OS, MAI);
+    MAI->printExpr(OS, *Expr);
   }
   EmitEOL();
   return std::nullopt;
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index d70639b7bfe2..5f1fd57802c7 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -72,7 +72,7 @@ void MCTargetStreamer::emitValue(const MCExpr *Value) {
   SmallString<128> Str;
   raw_svector_ostream OS(Str);
 
-  Value->print(OS, Streamer.getContext().getAsmInfo());
+  Streamer.getContext().getAsmInfo()->printExpr(OS, *Value);
   Streamer.emitRawText(OS.str());
 }
 

From d688df52ba9012197b3716ae85f818fafee7cf62 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Fri, 13 Jun 2025 08:56:49 -0700
Subject: [PATCH 0297/1322] [instsimplify] Add tests for missing vp.reverse
 simplifications

---
 .../Transforms/InstSimplify/vp-reverse.ll     | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 llvm/test/Transforms/InstSimplify/vp-reverse.ll

diff --git a/llvm/test/Transforms/InstSimplify/vp-reverse.ll b/llvm/test/Transforms/InstSimplify/vp-reverse.ll
new file mode 100644
index 000000000000..3c3bb871dc61
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/vp-reverse.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+
+define <vscale x 4 x i32> @rev_of_rev(<vscale x 4 x i32> %a, i32 %evl) {
+; CHECK-LABEL: @rev_of_rev(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_REV]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %res = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.rev, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @rev_of_rev_diffevl(<vscale x 4 x i32> %a, i32 %evl) {
+; CHECK-LABEL: @rev_of_rev_diffevl(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_REV]], <vscale x 4 x i1> splat (i1 true), i32 10)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %res = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.rev, <vscale x 4 x i1> splat (i1 true), i32 10)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @rev_of_poison(i32 %evl) {
+; CHECK-LABEL: @rev_of_poison(
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> poison, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}
+
+define <vscale x 4 x i32> @rev_of_undef(i32 %evl) {
+; CHECK-LABEL: @rev_of_undef(
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}
+
+define <vscale x 4 x i32> @rev_of_zero(i32 %evl) {
+; CHECK-LABEL: @rev_of_zero(
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}
+
+define <vscale x 4 x i32> @rev_of_splat(i32 %a, i32 %evl) {
+; CHECK-LABEL: @rev_of_splat(
+; CHECK-NEXT:    [[A_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[A_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_VEC]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %a.ins = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+  %a.vec = shufflevector <vscale x 4 x i32> %a.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.vec, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}
+
+define <vscale x 4 x i32> @rev_of_splat2(i32 %a, <vscale x 4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: @rev_of_splat2(
+; CHECK-NEXT:    [[A_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[A_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_VEC]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %a.ins = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+  %a.vec = shufflevector <vscale x 4 x i32> %a.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.vec, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}

From dec576514cb7106c59a5059ac6d52ebdf5de5275 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 17:11:18 +0100
Subject: [PATCH 0298/1322] [X86] X86FixupInstTuning - add dbg message for each
 instruction replacement (#144083)

Help debug the changes the pass makes
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp | 57 +++++++++++++++-------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 8c1ff523c975..89093b2e1a3f 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -132,11 +132,15 @@ bool X86FixupInstTuningPass::processInstruction(
   auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool {
     if (!NewOpcPreferable(NewOpc))
       return false;
-    unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
-    MI.removeOperand(NumOperands - 1);
-    MI.addOperand(MI.getOperand(NumOperands - 2));
-    MI.setDesc(TII->get(NewOpc));
-    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
+      MI.removeOperand(NumOperands - 1);
+      MI.addOperand(MI.getOperand(NumOperands - 2));
+      MI.setDesc(TII->get(NewOpc));
+      MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -147,11 +151,15 @@ bool X86FixupInstTuningPass::processInstruction(
   auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
     if (!NewOpcPreferable(NewOpc))
       return false;
-    unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
-    MI.removeOperand(NumOperands - 1);
-    MI.addOperand(MI.getOperand(NumOperands - 2));
-    MI.setDesc(TII->get(NewOpc));
-    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
+      MI.removeOperand(NumOperands - 1);
+      MI.addOperand(MI.getOperand(NumOperands - 2));
+      MI.setDesc(TII->get(NewOpc));
+      MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -164,7 +172,11 @@ bool X86FixupInstTuningPass::processInstruction(
     if (!ST->hasNoDomainDelayShuffle() ||
         !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
       return false;
-    MI.setDesc(TII->get(NewOpc));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(NewOpc));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -185,9 +197,12 @@ bool X86FixupInstTuningPass::processInstruction(
   auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
     if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
       return false;
-
-    MI.setDesc(TII->get(NewOpc));
-    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(NewOpc));
+      MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -198,7 +213,11 @@ bool X86FixupInstTuningPass::processInstruction(
     if (!ST->hasNoDomainDelayShuffle() ||
         !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
       return false;
-    MI.setDesc(TII->get(NewOpc));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(NewOpc));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -229,8 +248,12 @@ bool X86FixupInstTuningPass::processInstruction(
       return false;
     if (!OptSize && !NewOpcPreferable(MovOpc))
       return false;
-    MI.setDesc(TII->get(MovOpc));
-    MI.removeOperand(NumOperands - 1);
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(MovOpc));
+      MI.removeOperand(NumOperands - 1);
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 

From bd33eef7f1013bea24289a898f788a2efe9d8282 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Fri, 13 Jun 2025 12:21:38 -0400
Subject: [PATCH 0299/1322] [HLSL][SPIRV] Use resource names (#143412)

The SPIR-V backend does not have access to the original name of a
resource in the source, so it tries to create a name. This leads to some
problems with reflection.

That is why start to pass the name of the resource from Clang to the
SPIR-V backend.

Fixes #138533
---
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          | 17 ++--
 clang/lib/CodeGen/CGHLSLRuntime.cpp           | 49 ++--------
 clang/lib/CodeGen/CGHLSLRuntime.h             | 13 +--
 llvm/include/llvm/IR/IntrinsicsSPIRV.td       | 16 ++--
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 94 +------------------
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h   |  1 +
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 16 ++--
 llvm/lib/Target/SPIRV/SPIRVUtils.cpp          | 10 ++
 llvm/lib/Target/SPIRV/SPIRVUtils.h            |  4 +
 .../SPIRV/hlsl-resources/BufferLoad.ll        |  8 +-
 .../SPIRV/hlsl-resources/BufferLoadStore.ll   | 14 +--
 .../SPIRV/hlsl-resources/BufferStore.ll       |  4 +-
 .../CodeGen/SPIRV/hlsl-resources/Packed.ll    |  8 +-
 .../hlsl-resources/ScalarResourceType.ll      | 11 ++-
 .../hlsl-resources/StorageImageDynIdx.ll      |  6 +-
 .../StorageImageNonUniformIdx.ll              |  6 +-
 .../SPIRV/hlsl-resources/StructuredBuffer.ll  | 13 +--
 .../SPIRV/hlsl-resources/UnknownBufferLoad.ll |  7 +-
 .../hlsl-resources/UnknownBufferStore.ll      |  4 +-
 .../SPIRV/hlsl-resources/spirv.layout.type.ll | 16 +++-
 .../pointers/resource-addrspacecast-2.ll      |  6 +-
 .../SPIRV/pointers/resource-addrspacecast.ll  |  6 +-
 .../CodeGen/SPIRV/spirv-explicit-layout.ll    | 29 +++---
 23 files changed, 135 insertions(+), 223 deletions(-)

diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index abebc201808b..ccf45c0c6ff1 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -295,17 +295,16 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     Value *SpaceOp = EmitScalarExpr(E->getArg(2));
     Value *RangeOp = EmitScalarExpr(E->getArg(3));
     Value *IndexOp = EmitScalarExpr(E->getArg(4));
+    Value *Name = EmitScalarExpr(E->getArg(5));
     // FIXME: NonUniformResourceIndex bit is not yet implemented
     // (llvm/llvm-project#135452)
     Value *NonUniform =
         llvm::ConstantInt::get(llvm::Type::getInt1Ty(getLLVMContext()), false);
 
-    auto [IntrinsicID, HasNameArg] =
+    llvm::Intrinsic::ID IntrinsicID =
         CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic();
-    SmallVector<Value *> Args{SpaceOp, RegisterOp, RangeOp, IndexOp,
-                              NonUniform};
-    if (HasNameArg)
-      Args.push_back(EmitScalarExpr(E->getArg(5)));
+    SmallVector<Value *> Args{SpaceOp, RegisterOp, RangeOp,
+                              IndexOp, NonUniform, Name};
     return Builder.CreateIntrinsic(HandleTy, IntrinsicID, Args);
   }
   case Builtin::BI__builtin_hlsl_resource_handlefromimplicitbinding: {
@@ -314,16 +313,16 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     Value *RangeOp = EmitScalarExpr(E->getArg(2));
     Value *IndexOp = EmitScalarExpr(E->getArg(3));
     Value *OrderID = EmitScalarExpr(E->getArg(4));
+    Value *Name = EmitScalarExpr(E->getArg(5));
     // FIXME: NonUniformResourceIndex bit is not yet implemented
     // (llvm/llvm-project#135452)
     Value *NonUniform =
         llvm::ConstantInt::get(llvm::Type::getInt1Ty(getLLVMContext()), false);
 
-    auto [IntrinsicID, HasNameArg] =
+    llvm::Intrinsic::ID IntrinsicID =
         CGM.getHLSLRuntime().getCreateHandleFromImplicitBindingIntrinsic();
-    SmallVector<Value *> Args{OrderID, SpaceOp, RangeOp, IndexOp, NonUniform};
-    if (HasNameArg)
-      Args.push_back(EmitScalarExpr(E->getArg(5)));
+    SmallVector<Value *> Args{OrderID, SpaceOp,    RangeOp,
+                              IndexOp, NonUniform, Name};
     return Builder.CreateIntrinsic(HandleTy, IntrinsicID, Args);
   }
   case Builtin::BI__builtin_hlsl_all: {
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 720dac8383c0..977ff792bae2 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -237,35 +237,6 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl,
   }
 }
 
-std::pair<llvm::Intrinsic::ID, bool>
-CGHLSLRuntime::getCreateHandleFromBindingIntrinsic() {
-  switch (getArch()) {
-  case llvm::Triple::dxil:
-    return std::pair(llvm::Intrinsic::dx_resource_handlefrombinding, true);
-  case llvm::Triple::spirv:
-    return std::pair(llvm::Intrinsic::spv_resource_handlefrombinding, false);
-  default:
-    llvm_unreachable("Intrinsic resource_handlefrombinding not supported by "
-                     "target architecture");
-  }
-}
-
-std::pair<llvm::Intrinsic::ID, bool>
-CGHLSLRuntime::getCreateHandleFromImplicitBindingIntrinsic() {
-  switch (getArch()) {
-  case llvm::Triple::dxil:
-    return std::pair(llvm::Intrinsic::dx_resource_handlefromimplicitbinding,
-                     true);
-  case llvm::Triple::spirv:
-    return std::pair(llvm::Intrinsic::spv_resource_handlefromimplicitbinding,
-                     false);
-  default:
-    llvm_unreachable(
-        "Intrinsic resource_handlefromimplicitbinding not supported by "
-        "target architecture");
-  }
-}
-
 // Codegen for HLSLBufferDecl
 void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *BufDecl) {
 
@@ -625,31 +596,27 @@ void CGHLSLRuntime::initializeBufferFromBinding(const HLSLBufferDecl *BufDecl,
       llvm::ConstantInt::get(CGM.IntTy, RBA ? RBA->getSpaceNumber() : 0);
   Value *Name = nullptr;
 
-  auto [IntrinsicID, HasNameArg] =
+  llvm::Intrinsic::ID IntrinsicID =
       RBA->hasRegisterSlot()
           ? CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic()
           : CGM.getHLSLRuntime().getCreateHandleFromImplicitBindingIntrinsic();
 
-  if (HasNameArg) {
-    std::string Str(BufDecl->getName());
-    std::string GlobalName(Str + ".str");
-    Name = CGM.GetAddrOfConstantCString(Str, GlobalName.c_str()).getPointer();
-  }
+  std::string Str(BufDecl->getName());
+  std::string GlobalName(Str + ".str");
+  Name = CGM.GetAddrOfConstantCString(Str, GlobalName.c_str()).getPointer();
 
   // buffer with explicit binding
   if (RBA->hasRegisterSlot()) {
     auto *RegSlot = llvm::ConstantInt::get(CGM.IntTy, RBA->getSlotNumber());
-    SmallVector<Value *> Args{Space, RegSlot, RangeSize, Index, NonUniform};
-    if (Name)
-      Args.push_back(Name);
+    SmallVector<Value *> Args{Space, RegSlot,    RangeSize,
+                              Index, NonUniform, Name};
     initializeBuffer(CGM, GV, IntrinsicID, Args);
   } else {
     // buffer with implicit binding
     auto *OrderID =
         llvm::ConstantInt::get(CGM.IntTy, RBA->getImplicitBindingOrderID());
-    SmallVector<Value *> Args{OrderID, Space, RangeSize, Index, NonUniform};
-    if (Name)
-      Args.push_back(Name);
+    SmallVector<Value *> Args{OrderID, Space,      RangeSize,
+                              Index,   NonUniform, Name};
     initializeBuffer(CGM, GV, IntrinsicID, Args);
   }
 }
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index bb2b82fa1f5a..89d2aff85d91 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -118,6 +118,10 @@ public:
 
   GENERATE_HLSL_INTRINSIC_FUNCTION(CreateResourceGetPointer,
                                    resource_getpointer)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromBinding,
+                                   resource_handlefrombinding)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromImplicitBinding,
+                                   resource_handlefromimplicitbinding)
   GENERATE_HLSL_INTRINSIC_FUNCTION(BufferUpdateCounter, resource_updatecounter)
   GENERATE_HLSL_INTRINSIC_FUNCTION(GroupMemoryBarrierWithGroupSync,
                                    group_memory_barrier_with_group_sync)
@@ -126,15 +130,6 @@ public:
   // End of reserved area for HLSL intrinsic getters.
   //===----------------------------------------------------------------------===//
 
-  // Returns ID of the intrinsic that initializes resource handle from binding
-  // and a bool value indicating whether the last argument of the intrinsic is
-  // the resource name (not all targets need that).
-  std::pair<llvm::Intrinsic::ID, bool> getCreateHandleFromBindingIntrinsic();
-
-  // Same as above but for implicit binding.
-  std::pair<llvm::Intrinsic::ID, bool>
-  getCreateHandleFromImplicitBindingIntrinsic();
-
 protected:
   CodeGenModule &CGM;
 
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index e1c4a7aaf5a2..43335f81ed87 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -117,15 +117,15 @@ let TargetPrefix = "spv" in {
   // array size of the binding, as well as an index and an indicator
   // whether that index may be non-uniform.
   def int_spv_resource_handlefrombinding
-      : DefaultAttrsIntrinsic<
-            [llvm_any_ty],
-            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
-            [IntrNoMem]>;
+      : DefaultAttrsIntrinsic<[llvm_any_ty],
+                              [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                               llvm_i32_ty, llvm_i1_ty, llvm_ptr_ty],
+                              [IntrNoMem]>;
   def int_spv_resource_handlefromimplicitbinding
-      : DefaultAttrsIntrinsic<
-            [llvm_any_ty],
-            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
-            [IntrNoMem]>;
+      : DefaultAttrsIntrinsic<[llvm_any_ty],
+                              [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                               llvm_i32_ty, llvm_i1_ty, llvm_ptr_ty],
+                              [IntrNoMem]>;
 
   def int_spv_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
   def int_spv_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index c5e8269efd25..292b83e05b56 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -799,107 +799,15 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
   return Reg;
 }
 
-static std::string GetSpirvImageTypeName(const SPIRVType *Type,
-                                         MachineIRBuilder &MIRBuilder,
-                                         const std::string &Prefix,
-                                         SPIRVGlobalRegistry &GR);
-
 // Returns a name based on the Type. Notes that this does not look at
 // decorations, and will return the same string for two types that are the same
 // except for decorations.
-static std::string buildSpirvTypeName(const SPIRVType *Type,
-                                      MachineIRBuilder &MIRBuilder,
-                                      SPIRVGlobalRegistry &GR) {
-  switch (Type->getOpcode()) {
-  case SPIRV::OpTypeSampledImage: {
-    return GetSpirvImageTypeName(Type, MIRBuilder, "sampled_image_", GR);
-  }
-  case SPIRV::OpTypeImage: {
-    return GetSpirvImageTypeName(Type, MIRBuilder, "image_", GR);
-  }
-  case SPIRV::OpTypeArray: {
-    MachineRegisterInfo *MRI = MIRBuilder.getMRI();
-    Register ElementTypeReg = Type->getOperand(1).getReg();
-    auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg);
-    uint32_t ArraySize = getArrayComponentCount(MRI, Type);
-    return (buildSpirvTypeName(ElementType, MIRBuilder, GR) + Twine("[") +
-            Twine(ArraySize) + Twine("]"))
-        .str();
-  }
-  case SPIRV::OpTypeFloat:
-    return ("f" + Twine(Type->getOperand(1).getImm())).str();
-  case SPIRV::OpTypeSampler:
-    return ("sampler");
-  case SPIRV::OpTypeInt:
-    if (Type->getOperand(2).getImm())
-      return ("i" + Twine(Type->getOperand(1).getImm())).str();
-    return ("u" + Twine(Type->getOperand(1).getImm())).str();
-  case SPIRV::OpTypePointer: {
-    uint32_t StorageClass = GR.getPointerStorageClass(Type);
-    SPIRVType *PointeeType = GR.getPointeeType(Type);
-    return ("p_" + Twine(StorageClass) + Twine("_") +
-            buildSpirvTypeName(PointeeType, MIRBuilder, GR))
-        .str();
-  }
-  case SPIRV::OpTypeStruct: {
-    std::string TypeName = "{";
-    for (uint32_t I = 1; I < Type->getNumOperands(); ++I) {
-      SPIRVType *MemberType =
-          GR.getSPIRVTypeForVReg(Type->getOperand(I).getReg());
-      TypeName += '_' + buildSpirvTypeName(MemberType, MIRBuilder, GR);
-    }
-    return TypeName + "}";
-  }
-  case SPIRV::OpTypeVector: {
-    MachineRegisterInfo *MRI = MIRBuilder.getMRI();
-    Register ElementTypeReg = Type->getOperand(1).getReg();
-    auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg);
-    uint32_t VectorSize = GR.getScalarOrVectorComponentCount(Type);
-    return (buildSpirvTypeName(ElementType, MIRBuilder, GR) + Twine("[") +
-            Twine(VectorSize) + Twine("]"))
-        .str();
-  }
-  case SPIRV::OpTypeRuntimeArray: {
-    MachineRegisterInfo *MRI = MIRBuilder.getMRI();
-    Register ElementTypeReg = Type->getOperand(1).getReg();
-    auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg);
-    uint32_t ArraySize = 0;
-    return (buildSpirvTypeName(ElementType, MIRBuilder, GR) + Twine("[") +
-            Twine(ArraySize) + Twine("]"))
-        .str();
-  }
-  default:
-    llvm_unreachable("Trying to the the name of an unknown type.");
-  }
-}
-
-static std::string GetSpirvImageTypeName(const SPIRVType *Type,
-                                         MachineIRBuilder &MIRBuilder,
-                                         const std::string &Prefix,
-                                         SPIRVGlobalRegistry &GR) {
-  Register SampledTypeReg = Type->getOperand(1).getReg();
-  auto *SampledType = MIRBuilder.getMRI()->getUniqueVRegDef(SampledTypeReg);
-  std::string TypeName =
-      Prefix + buildSpirvTypeName(SampledType, MIRBuilder, GR);
-  for (uint32_t I = 2; I < Type->getNumOperands(); ++I) {
-    TypeName = (TypeName + '_' + Twine(Type->getOperand(I).getImm())).str();
-  }
-  return TypeName;
-}
-
 Register SPIRVGlobalRegistry::getOrCreateGlobalVariableWithBinding(
-    const SPIRVType *VarType, uint32_t Set, uint32_t Binding,
+    const SPIRVType *VarType, uint32_t Set, uint32_t Binding, StringRef Name,
     MachineIRBuilder &MIRBuilder) {
   Register VarReg =
       MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass);
 
-  // TODO(138533): The name should come from the llvm-ir, but how that name will
-  // be passed from the HLSL to the backend has not been decided. Using this
-  // place holder for now.
-  std::string Name =
-      ("__resource_" + buildSpirvTypeName(VarType, MIRBuilder, *this) + "_" +
-       Twine(Set) + "_" + Twine(Binding))
-          .str();
   buildGlobalVariable(VarReg, VarType, Name, nullptr,
                       getPointerStorageClass(VarType), nullptr, false, false,
                       SPIRV::LinkageType::Import, MIRBuilder, false);
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 3b481b3aba0c..35f616a1981d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -548,6 +548,7 @@ public:
                                bool IsInstSelector);
   Register getOrCreateGlobalVariableWithBinding(const SPIRVType *VarType,
                                                 uint32_t Set, uint32_t Binding,
+                                                StringRef Name,
                                                 MachineIRBuilder &MIRBuilder);
 
   // Convenient helpers for getting types with check for duplicates.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 2dae0721886c..8edd0b533b9f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -322,7 +322,7 @@ private:
                                   SPIRV::StorageClass::StorageClass SC,
                                   uint32_t Set, uint32_t Binding,
                                   uint32_t ArraySize, Register IndexReg,
-                                  bool IsNonUniform,
+                                  bool IsNonUniform, StringRef Name,
                                   MachineIRBuilder MIRBuilder) const;
   SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const;
   bool extractSubvector(Register &ResVReg, const SPIRVType *ResType,
@@ -3380,14 +3380,14 @@ bool SPIRVInstructionSelector::selectImageWriteIntrinsic(
 Register SPIRVInstructionSelector::buildPointerToResource(
     const SPIRVType *SpirvResType, SPIRV::StorageClass::StorageClass SC,
     uint32_t Set, uint32_t Binding, uint32_t ArraySize, Register IndexReg,
-    bool IsNonUniform, MachineIRBuilder MIRBuilder) const {
+    bool IsNonUniform, StringRef Name, MachineIRBuilder MIRBuilder) const {
   const Type *ResType = GR.getTypeForSPIRVType(SpirvResType);
   if (ArraySize == 1) {
     SPIRVType *PtrType =
         GR.getOrCreateSPIRVPointerType(ResType, MIRBuilder, SC);
     assert(GR.getPointeeType(PtrType) == SpirvResType &&
            "SpirvResType did not have an explicit layout.");
-    return GR.getOrCreateGlobalVariableWithBinding(PtrType, Set, Binding,
+    return GR.getOrCreateGlobalVariableWithBinding(PtrType, Set, Binding, Name,
                                                    MIRBuilder);
   }
 
@@ -3395,7 +3395,7 @@ Register SPIRVInstructionSelector::buildPointerToResource(
   SPIRVType *VarPointerType =
       GR.getOrCreateSPIRVPointerType(VarType, MIRBuilder, SC);
   Register VarReg = GR.getOrCreateGlobalVariableWithBinding(
-      VarPointerType, Set, Binding, MIRBuilder);
+      VarPointerType, Set, Binding, Name, MIRBuilder);
 
   SPIRVType *ResPointerType =
       GR.getOrCreateSPIRVPointerType(ResType, MIRBuilder, SC);
@@ -4081,6 +4081,9 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition(
   uint32_t ArraySize = foldImm(HandleDef.getOperand(4), MRI);
   Register IndexReg = HandleDef.getOperand(5).getReg();
   bool IsNonUniform = ArraySize > 1 && foldImm(HandleDef.getOperand(6), MRI);
+  std::string Name =
+      getStringValueFromReg(HandleDef.getOperand(7).getReg(), *MRI);
+
   bool IsStructuredBuffer = ResType->getOpcode() == SPIRV::OpTypePointer;
   MachineIRBuilder MIRBuilder(HandleDef);
   SPIRVType *VarType = ResType;
@@ -4091,8 +4094,9 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition(
     SC = GR.getPointerStorageClass(ResType);
   }
 
-  Register VarReg = buildPointerToResource(VarType, SC, Set, Binding, ArraySize,
-                                           IndexReg, IsNonUniform, MIRBuilder);
+  Register VarReg =
+      buildPointerToResource(VarType, SC, Set, Binding, ArraySize, IndexReg,
+                             IsNonUniform, Name, MIRBuilder);
 
   if (IsNonUniform)
     buildOpDecorate(HandleReg, HandleDef, TII, SPIRV::Decoration::NonUniformEXT,
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 725a7979d3e5..768efb96a53e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -80,6 +80,16 @@ std::string getStringImm(const MachineInstr &MI, unsigned StartIndex) {
   return getSPIRVStringOperand(MI, StartIndex);
 }
 
+std::string getStringValueFromReg(Register Reg, MachineRegisterInfo &MRI) {
+  MachineInstr *Def = getVRegDef(MRI, Reg);
+  assert(Def && Def->getOpcode() == TargetOpcode::G_GLOBAL_VALUE &&
+         "Expected G_GLOBAL_VALUE");
+  const GlobalValue *GV = Def->getOperand(1).getGlobal();
+  Value *V = GV->getOperand(0);
+  const ConstantDataArray *CDA = cast<ConstantDataArray>(V);
+  return CDA->getAsCString().str();
+}
+
 void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB) {
   const auto Bitwidth = Imm.getBitWidth();
   if (Bitwidth == 1)
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index f14a7d356ea5..d732188f9289 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -125,6 +125,10 @@ void addStringImm(const StringRef &Str, IRBuilder<> &B,
 // the reverse of the logic in addStringImm.
 std::string getStringImm(const MachineInstr &MI, unsigned StartIndex);
 
+// Returns the string constant that the register refers to. It is assumed that
+// Reg is a global value that contains a string.
+std::string getStringValueFromReg(Register Reg, MachineRegisterInfo &MRI);
+
 // Add the given numerical immediate to MIB.
 void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB);
 
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
index 58252fe297f3..b14b6af156ca 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 ; CHECK-NOT: OpCapability StorageImageReadWithoutFormat
 
 ; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
@@ -20,7 +22,7 @@ define void @RWBufferLoad_Vec4_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_i32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: OpImageRead [[v4_int]] [[buffer]] [[zero]]
   %data0 = call <4 x i32> @llvm.spv.resource.load.typedbuffer(
@@ -35,7 +37,7 @@ define void @RWBufferLoad_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_i32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[V:%[0-9]+]] = OpImageRead [[v4_int]] [[buffer]] [[zero]]
 ; CHECK: OpCompositeExtract [[int]] [[V]] 0
@@ -51,7 +53,7 @@ define void @RWBufferLoad_Vec2_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_i32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[V:%[0-9]+]] = OpImageRead [[v4_int]] [[buffer]] [[zero]]
 ; CHECK: [[e0:%[0-9]+]] = OpCompositeExtract [[int]] [[V]] 0
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll
index d810ef9ccecc..22fb4c3e78dc 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 ; CHECK-DAG: [[float:%[0-9]+]] = OpTypeFloat 32
 ; CHECK-DAG: [[v2float:%[0-9]+]] = OpTypeVector [[float]] 2
 ; CHECK-DAG: [[v4float:%[0-9]+]] = OpTypeVector [[float]] 4
@@ -18,7 +20,7 @@
 define void @main_scalar() local_unnamed_addr #0 {
 entry:
 ; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]]
-  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false)
+  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[R:%[0-9]+]] = OpImageRead [[v4float]] [[H]] [[one]]
 ; CHECK: [[V:%[0-9]+]] = OpCompositeExtract [[float]] [[R]] 0
@@ -57,7 +59,7 @@ bb_both:
 define void @main_vector2() local_unnamed_addr #0 {
 entry:
 ; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]]
-  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false)
+  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[R:%[0-9]+]] = OpImageRead [[v4float]] [[H]] [[one]]
 ; CHECK: [[E0:%[0-9]+]] = OpCompositeExtract [[float]] [[R]] 0
@@ -100,7 +102,7 @@ bb_both:
 define void @main_vector4() local_unnamed_addr #0 {
 entry:
 ; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]]
-  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false)
+  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[R:%[0-9]+]] = OpImageRead [[v4float]] [[H]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 1) %s_h.i, i32 1)
@@ -132,11 +134,5 @@ bb_both:
   ret void
 }
 
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-declare ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 1), i32) #1
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-declare target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32, i32, i32, i32, i1) #1
-
 attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll
index 812e20e45565..ee976f1a4110 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O3 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b = private unnamed_addr constant [2 x i8] c"B\00", align 1
+
 ; CHECK-NOT: OpCapability StorageImageReadWithoutFormat
 
 ; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
@@ -22,7 +24,7 @@ declare <4 x i32> @get_data() #1
 define void @RWBufferStore_Vec4_I32() #0 {
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_i32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b)
 
 ; CHECK: [[data:%[0-9]+]] = OpFunctionCall
   %data = call <4 x i32> @get_data()
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll
index d5f654518014..5e9d88fd9af0 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll
@@ -3,6 +3,10 @@
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
 
+
+@.str.unpacked = private unnamed_addr constant [12 x i8] c"UnpackedRes\00", align 1
+@.str.packed = private unnamed_addr constant [10 x i8] c"PackedRes\00", align 1
+
 ; CHECK-DAG: OpName [[unpacked:%[0-9]+]] "unpacked"
 ; CHECK-DAG: OpName [[packed:%[0-9]+]] "packed"
 
@@ -22,7 +26,7 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:
 
 define external i32 @unpacked_vulkan_buffer_load() {
 entry:
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %unpacked], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %unpacked], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.unpacked)
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %unpacked], 12, 0) %handle, i32 1)
   %1 = load i32, ptr addrspace(11) %0, align 4
   ret i32 %1
@@ -30,7 +34,7 @@ entry:
 
 define external i32 @packed_vulkan_buffer_load() {
 entry:
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %packed], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %packed], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.packed)
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %packed], 12, 0) %handle, i32 1)
   %1 = load i32, ptr addrspace(11) %0, align 4
   ret i32 %1
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll
index f52fd44bf380..03b41ae0df31 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll
@@ -1,6 +1,9 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.int_buf = private unnamed_addr constant [7 x i8] c"IntBuf\00", align 1
+@.str.float_buf = private unnamed_addr constant [9 x i8] c"FloatBuf\00", align 1
+
 ; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
 ; CHECK-DAG: OpDecorate [[IntBufferVar]] Binding 7
 ; CHECK-DAG: OpDecorate [[FloatBufferVar:%[0-9]+]] DescriptorSet 16
@@ -21,7 +24,7 @@ define void @RWBufferLoad() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.int_buf)
   %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer0, i32 0)
   store i32 0, ptr %ptr0, align 4
 
@@ -29,7 +32,7 @@ define void @RWBufferLoad() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.int_buf)
   %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer1, i32 0)
   store i32 0, ptr %ptr1, align 4
   ret void
@@ -43,7 +46,7 @@ define void @UseDifferentGlobalVar() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeFloat]] [[FloatBufferVar]]
   %buffer0 = call target("spirv.Image", float, 5, 2, 0, 0, 2, 3)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_3(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.float_buf )
   %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 3) %buffer0, i32 0)
   store float 0.0, ptr %ptr0, align 4
   ret void
@@ -57,7 +60,7 @@ define void @ReuseGlobalVarFromFirstFunction() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.int_buf)
   %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer1, i32 0)
   store i32 0, ptr %ptr1, align 4
   ret void
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll
index 6a6d810e6bab..236c5e4ea56a 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 ; CHECK-DAG: OpCapability Shader
 ; CHECK-DAG: OpCapability StorageImageArrayDynamicIndexing
 ; CHECK-DAG: OpCapability Image1D
@@ -26,7 +28,7 @@ define void @main() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]]
   %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24(
-          i32 3, i32 4, i32 3, i32 0, i1 false)
+          i32 3, i32 4, i32 3, i32 0, i1 false, ptr nonnull @.str.b0)
   %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer0, i32 0)
   store i32 0, ptr %ptr0, align 4
 
@@ -34,7 +36,7 @@ define void @main() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]]
   %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24(
-          i32 3, i32 4, i32 3, i32 1, i1 false)
+          i32 3, i32 4, i32 3, i32 1, i1 false, ptr nonnull @.str.b0)
   %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer1, i32 0)
   store i32 0, ptr %ptr1, align 4
   ret void
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll
index 16f3724d5d10..5693f797c798 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 ; CHECK-DAG: OpCapability Shader
 ; CHECK-DAG: OpCapability ShaderNonUniformEXT
 ; CHECK-DAG: OpCapability StorageImageArrayNonUniformIndexing
@@ -33,7 +35,7 @@ define void @main() #0 {
 ; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]]
   %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24(
-          i32 3, i32 4, i32 3, i32 0, i1 true)
+          i32 3, i32 4, i32 3, i32 0, i1 true, ptr nonnull @.str.b0)
   %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer0, i32 0)
   store i32 0, ptr %ptr0, align 4
 
@@ -41,7 +43,7 @@ define void @main() #0 {
 ; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]]
   %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24(
-          i32 3, i32 4, i32 3, i32 1, i1 true)
+          i32 3, i32 4, i32 3, i32 1, i1 true, ptr nonnull @.str.b0)
   %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer1, i32 0)
   store i32 0, ptr %ptr1, align 4
   ret void
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll
index f539fdefa3fa..e47685cd38a2 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll
@@ -3,11 +3,8 @@
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
 
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-declare target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32, i32, i32, i32, i1) #0
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-declare target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32, i32, i32, i32, i1) #0
+@.str.b = private unnamed_addr constant [2 x i8] c"B\00", align 1
+@.str.rwb = private unnamed_addr constant [4 x i8] c"RWB\00", align 1
 
 ; CHECK: OpDecorate [[BufferVar:%.+]] DescriptorSet 0
 ; CHECK: OpDecorate [[BufferVar]] Binding 0
@@ -40,9 +37,9 @@ entry:
 ; CHECK-DAG: [[BufferHandle:%.+]] = OpCopyObject [[BufferPtrType]] [[BufferVar]]
 ; CHECK-DAG: [[BufferHandle2:%.+]] = OpCopyObject [[BufferPtrType]] [[BufferVar]]
 ; CHECK-DAG: [[RWBufferHandle:%.+]] = OpCopyObject [[RWBufferPtrType]] [[RWBufferVar]]
-  %BufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %BufferHandle2 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %RWBufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %BufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.b)
+  %BufferHandle2 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.b)
+  %RWBufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.rwb)
 
 ; CHECK: [[AC:%.+]] = OpAccessChain {{.*}} [[BufferHandle]] [[zero]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_0t(target("spirv.VulkanBuffer", [0 x i32], 12, 0) %BufferHandle,  i32 1)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll
index 4ec8605f6813..704665d7e52e 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll
@@ -1,8 +1,11 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-library %s -o - -filetype=obj | spirv-val %}
 
+@.str = private unnamed_addr constant [4 x i8] c"Buf\00", align 1
+
 ; CHECK: OpCapability StorageImageReadWithoutFormat
-; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
+; CHECK: OpName [[IntBufferVar:%[0-9]+]] "Buf"
+; CHECK-DAG: OpDecorate [[IntBufferVar]] DescriptorSet 16
 ; CHECK-DAG: OpDecorate [[IntBufferVar]] Binding 7
 
 ; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0
@@ -18,7 +21,7 @@ define void @RWBufferLoad_Vec4_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 0)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str)
 
 ; CHECK: OpImageRead [[v4_int]] [[buffer]] [[zero]]
   %data0 = call <4 x i32> @llvm.spv.resource.load.typedbuffer(
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll
index a4123c36a448..27ae6a03797c 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b = private unnamed_addr constant [2 x i8] c"B\00", align 1
+
 ; CHECK: OpCapability StorageImageWriteWithoutFormat
 ; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
 ; CHECK-DAG: OpDecorate [[IntBufferVar]] Binding 7
@@ -20,7 +22,7 @@ declare <4 x i32> @get_data() #1
 define void @RWBufferLoad_Vec4_I32() #0 {
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 0)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b)
 
 ; CHECK: [[data:%[0-9]+]] = OpFunctionCall
   %data = call <4 x i32> @get_data()
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/spirv.layout.type.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/spirv.layout.type.ll
index 14c98b2fd55a..064251a57dfc 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/spirv.layout.type.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/spirv.layout.type.ll
@@ -3,6 +3,12 @@
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G10"
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+@.str.b1 = private unnamed_addr constant [3 x i8] c"B1\00", align 1
+@.str.b2 = private unnamed_addr constant [3 x i8] c"B2\00", align 1
+@.str.b3 = private unnamed_addr constant [3 x i8] c"B3\00", align 1
+@.str.b4 = private unnamed_addr constant [3 x i8] c"B4\00", align 1
+
 ; CHECK-DAG: OpName [[standard_layout:%[0-9]+]] "standard_layout"
 ; CHECK-DAG: OpMemberDecorate [[standard_layout]] 0 Offset 0
 ; CHECK-DAG: OpMemberDecorate [[standard_layout]] 1 Offset 4
@@ -33,11 +39,11 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:
 
 define void @main() local_unnamed_addr #1 {
 entry:
-  %standard_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %standard_layout, 8, 0, 4), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
-  %standard_handle_with_different_offset = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %standard_layout, 12, 0, 8), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
-  %backwards_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %backwards_layout, 8, 4, 0), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
-  %large_gap_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %large_gap, 1024, 0, 64, 1020, 4), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
-  %mixed_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %mixed_layout, 16, 0, 8, 4, 12), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %standard_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %standard_layout, 8, 0, 4), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
+  %standard_handle_with_different_offset = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %standard_layout, 12, 0, 8), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b1)
+  %backwards_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %backwards_layout, 8, 4, 0), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b2)
+  %large_gap_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %large_gap, 1024, 0, 64, 1020, 4), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b3)
+  %mixed_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %mixed_layout, 16, 0, 8, 4, 12), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b4)
   ret void
 }
 
diff --git a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll
index d608529b421c..d87c175c3691 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s --match-full-lines
 ; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
 
+@.str = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 %S2 = type { { [10 x { i32, i32 } ] }, i32 }
 
 ; CHECK-DAG:                     %[[#uint:]] = OpTypeInt 32 0
@@ -21,11 +23,9 @@
 ; CHECK-DAG:              %[[#rarr_struct:]] = OpTypeStruct %[[#rarr]]
 ; CHECK-DAG:       %[[#spirv_VulkanBuffer:]] = OpTypePointer StorageBuffer %[[#rarr_struct]]
 
-declare target("spirv.VulkanBuffer", [0 x %S2], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_Ss_12_1t(i32, i32, i32, i32, i1)
-
 define void @main() "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" {
 entry:
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %S2], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false)
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %S2], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str)
 ; CHECK:      %[[#resource:]] = OpVariable %[[#spirv_VulkanBuffer]] StorageBuffer
 
   %ptr = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0s_Ss_12_1t(target("spirv.VulkanBuffer", [0 x %S2], 12, 1) %handle, i32 0)
diff --git a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll
index b1446b7529ea..5a469a4515b7 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
 
+@.str = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 %struct.S = type { i32 }
 
 ; CHECK-DAG:                     %[[#uint:]] = OpTypeInt 32 0
@@ -13,11 +15,9 @@
 ; CHECK-DAG:              %[[#rarr_struct:]] = OpTypeStruct %[[#rarr]]
 ; CHECK-DAG:       %[[#spirv_VulkanBuffer:]] = OpTypePointer StorageBuffer %[[#rarr_struct]]
 
-declare target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(i32, i32, i32, i32, i1)
-
 define void @main() "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" {
 entry:
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false)
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str)
 ; CHECK:      %[[#resource:]] = OpVariable %[[#spirv_VulkanBuffer]] StorageBuffer
 
   %ptr = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) %handle, i32 0)
diff --git a/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll b/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll
index 7303471c9929..4cc07c249be9 100644
--- a/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll
+++ b/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll
@@ -3,9 +3,14 @@
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
 
-; CHECK-DAG: OpName [[ScalarBlock_var:%[0-9]+]] "__resource_p_12_{_u32[0]}_0_0"
-; CHECK-DAG: OpName [[buffer_var:%[0-9]+]] "__resource_p_12_{_{_{_u32_f32[3]}[10]}[0]}_0_0"
-; CHECK-DAG: OpName [[array_buffer_var:%[0-9]+]] "__resource_p_12_{_{_{_u32_f32[3]}[10]}[0]}[10]_0_0"
+@.str.scalarblock = private unnamed_addr constant [12 x i8] c"ScalarBlock\00", align 1
+@.str.buffervar = private unnamed_addr constant [10 x i8] c"BufferVar\00", align 1
+@.str.arraybuffervar = private unnamed_addr constant [15 x i8] c"ArrayBufferVar\00", align 1
+
+
+; CHECK-DAG: OpName [[ScalarBlock_var:%[0-9]+]] "ScalarBlock"
+; CHECK-DAG: OpName [[buffer_var:%[0-9]+]] "BufferVar"
+; CHECK-DAG: OpName [[array_buffer_var:%[0-9]+]] "ArrayBufferVar"
 
 ; CHECK-DAG: OpMemberDecorate [[ScalarBlock:%[0-9]+]] 0 Offset 0
 ; CHECK-DAG: OpDecorate [[ScalarBlock]] Block
@@ -63,8 +68,8 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:
 define external i32 @scalar_vulkan_buffer_load() {
 ; CHECK-NEXT: OpLabel
 entry:
-; CHECK-NEXT: [[handle:%[0-9]+]] = OpCopyObject [[ScalarBlock_ptr]] [[ScalarBlock_var]]
-  %handle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+; CHECK: [[handle:%[0-9]+]] = OpCopyObject [[ScalarBlock_ptr]] [[ScalarBlock_var]]
+  %handle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.scalarblock)
 
 ; CHECK-NEXT: [[ptr:%[0-9]+]] = OpAccessChain [[storagebuffer_int_ptr]] [[handle]] [[zero]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x i32], 12, 0) %handle, i32 1)
@@ -83,7 +88,7 @@ define external %struct.S @private_load() {
 ; CHECK-NEXT: OpLabel
 entry:
 
-; CHECK-NEXT: [[ld:%[0-9]+]] = OpLoad [[S]] [[private_var]] Aligned 4
+; CHECK: [[ld:%[0-9]+]] = OpLoad [[S]] [[private_var]] Aligned 4
   %1 = load %struct.S, ptr addrspace(10) @private, align 4
 
 ; CHECK-NEXT: OpReturnValue [[ld]]
@@ -97,7 +102,7 @@ define external %struct.S @storage_buffer_load() {
 ; CHECK-NEXT: OpLabel
 entry:
 
-; CHECK-NEXT: [[ld:%[0-9]+]] = OpLoad [[S_explicit]] [[storage_buffer]] Aligned 4
+; CHECK: [[ld:%[0-9]+]] = OpLoad [[S_explicit]] [[storage_buffer]] Aligned 4
 ; CHECK-NEXT: [[copy:%[0-9]+]] = OpCopyLogical [[S]] [[ld]]
   %1 = load %struct.S, ptr addrspace(11) @storage_buffer, align 4
 
@@ -111,8 +116,8 @@ entry:
 define external %struct.S @vulkan_buffer_load() {
 ; CHECK-NEXT: OpLabel
 entry:
-; CHECK-NEXT: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[buffer_var]]
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+; CHECK: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[buffer_var]]
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.buffervar)
 
 ; CHECK-NEXT: [[ptr:%[0-9]+]] = OpAccessChain [[storagebuffer_S_ptr]] [[handle]] [[zero]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) %handle, i32 1)
@@ -131,9 +136,9 @@ entry:
 define external %struct.S @array_of_vulkan_buffers_load() {
 ; CHECK-NEXT: OpLabel
 entry:
-; CHECK-NEXT: [[h:%[0-9]+]] = OpAccessChain [[buffer_ptr]] [[array_buffer_var]] [[one]]
-; CHECK-NEXT: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[h]]
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 1, i1 false)
+; CHECK: [[h:%[0-9]+]] = OpAccessChain [[buffer_ptr]] [[array_buffer_var]] [[one]]
+; CHECK: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[h]]
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 1, i1 false, ptr nonnull @.str.arraybuffervar)
 
 ; CHECK-NEXT: [[ptr:%[0-9]+]] = OpAccessChain [[storagebuffer_S_ptr]] [[handle]] [[zero]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) %handle, i32 1)

From 68b6f392ed446ff8edfbb2a52899c9361d45ba28 Mon Sep 17 00:00:00 2001
From: Daniel Hernandez-Juarez <danherna@amd.com>
Date: Fri, 13 Jun 2025 18:33:51 +0200
Subject: [PATCH 0300/1322] [MLIR][AMDGPU] Fix bug in GatherToLDSOpLowering,
 get the correct MemRefType for destination (#142915)

This PR fixes a bug in GatherToLDSOpLowering, we were getting the
MemRefType of source for the destination. Additionally, some related
typos are corrected.

CC: @krzysz00 @umangyadav @lialan
---
 llvm/docs/AMDGPUUsage.rst                     |  4 ++--
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           |  2 +-
 .../Conversion/AMDGPUToROCDL/load_lds.mlir    | 20 ++++++++++---------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e0a43225e81b..39f04f8e01b8 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1215,12 +1215,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    denormalization mode, enabled traps, and floating point exceptions.
                                                    The format is a 64-bit concatenation of the MODE and TRAPSTS registers.
 
-  :ref:`llvm.set.fpenv<int_set_fpenv>`             Sets the floating point environment to the specifies state.
+  :ref:`llvm.set.fpenv<int_set_fpenv>`             Sets the floating point environment to the specified state.
   llvm.amdgcn.load.to.lds.p<1/7>                   Loads values from global memory (either in the form of a global
                                                    a raw fat buffer pointer) to LDS. The size of the data copied can be 1, 2,
                                                    or 4 bytes (and gfx950 also allows 12 or 16 bytes). The LDS pointer
                                                    argument should be wavefront-uniform; the global pointer need not be.
-                                                   The LDS pointer is implicitly offset by 4 * lane_id bytes for sies <= 4 bytes
+                                                   The LDS pointer is implicitly offset by 4 * lane_id bytes for size <= 4 bytes
                                                    and 16 * lane_id bytes for larger sizes. This lowers to `global_load_lds`,
                                                    `buffer_load_* ... lds`, or `global_load__* ... lds` depending on address
                                                    space and architecture. `amdgcn.global.load.lds` has the same semantics as
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5e6f675a6414..074404add47f 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1101,7 +1101,7 @@ struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
     Location loc = op.getLoc();
 
     auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
-    auto dstMemRefType = cast<MemRefType>(op.getSrc().getType());
+    auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
 
     // TODO: instead of only transfering one element per thread, we could
     // augment it to transfer multiple elements per thread by issuing multiple
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
index cb3539dd11be..581346e03b89 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
@@ -31,8 +31,8 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
 
-  // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64
-  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
   // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
 
   // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
@@ -65,8 +65,8 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
 
-  // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64
-  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
   // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
 
   // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
@@ -103,8 +103,8 @@ func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_add
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
 
-  // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64
-  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64
+  // CHECK: %[[C128:.*]] = llvm.mlir.constant(128 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C128]] : i64
   // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
 
   // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
@@ -130,7 +130,9 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g
   // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
-  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX_CAST]]]
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[DSTIDX:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64
+  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX]]]
   // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
   %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
   %c0 = arith.constant 0 : index
@@ -166,8 +168,8 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
 
-  // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64
-  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
   // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
 
   // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]

From 3b09a3d5ae41faac3c0046b93a9c6e0297cc860b Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 09:36:38 -0700
Subject: [PATCH 0301/1322] MC,SPARC: Replace SparcMCExpr with MCSpecifierExpr

Add a hook printSpecifierExpr so that targets can implement
relocation specifier printing without inheriting from MCSpecifierExpr.
---
 llvm/include/llvm/MC/MCAsmInfo.h                    |  2 ++
 llvm/include/llvm/MC/MCExpr.h                       |  7 +++++--
 llvm/lib/MC/MCAsmInfo.cpp                           | 12 +++++++++++-
 llvm/lib/MC/MCExpr.cpp                              |  2 ++
 .../Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp  |  5 +++--
 .../Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp    | 10 ++++++++++
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h |  2 ++
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp  | 13 ++-----------
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h    |  7 +------
 9 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 18303e028f62..1f2ea0cfaaff 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -28,6 +28,7 @@ namespace llvm {
 class MCContext;
 class MCCFIInstruction;
 class MCExpr;
+class MCSpecifierExpr;
 class MCSection;
 class MCStreamer;
 class MCSubtargetInfo;
@@ -712,6 +713,7 @@ public:
   std::optional<uint32_t> getSpecifierForName(StringRef Name) const;
 
   void printExpr(raw_ostream &, const MCExpr &) const;
+  virtual void printSpecifierExpr(raw_ostream &, const MCSpecifierExpr &) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index 0b8af09fe1c2..1c72269e53e2 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -82,6 +82,7 @@ public:
   /// \name Utility Methods
   /// @{
 
+  // TODO: Make this private. Users should call MCAsmInfo::printExpr instead.
   LLVM_ABI void print(raw_ostream &OS, const MCAsmInfo *MAI,
                       int SurroundingPrec = 0) const;
   LLVM_ABI void dump() const;
@@ -509,7 +510,7 @@ protected:
   // Target-specific relocation specifier code
   const Spec specifier;
 
-protected:
+public:
   explicit MCSpecifierExpr(const MCExpr *Expr, Spec S)
       : MCExpr(Specifier, SMLoc()), Expr(Expr), specifier(S) {}
   virtual ~MCSpecifierExpr() = default;
@@ -518,7 +519,9 @@ public:
   Spec getSpecifier() const { return specifier; }
   const MCExpr *getSubExpr() const { return Expr; }
 
-  virtual void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const = 0;
+  virtual void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+    llvm_unreachable("Replace MCExpr::print calls with MCAsmInfo::printExpr");
+  }
   virtual bool evaluateAsRelocatableImpl(MCValue &Res,
                                          const MCAssembler *Asm) const;
 
diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp
index fbacca4f5679..13b077349a58 100644
--- a/llvm/lib/MC/MCAsmInfo.cpp
+++ b/llvm/lib/MC/MCAsmInfo.cpp
@@ -151,5 +151,15 @@ std::optional<uint32_t> MCAsmInfo::getSpecifierForName(StringRef Name) const {
 }
 
 void MCAsmInfo::printExpr(raw_ostream &OS, const MCExpr &Expr) const {
-  Expr.print(OS, this);
+  if (auto *SE = dyn_cast<MCSpecifierExpr>(&Expr))
+    printSpecifierExpr(OS, *SE);
+  else
+    Expr.print(OS, this);
+}
+
+void MCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                   const MCSpecifierExpr &Expr) const {
+  // TODO: Switch to unreachable after all targets that use MCSpecifierExpr
+  // migrate to MCAsmInfo::printSpecifierExpr.
+  Expr.printImpl(OS, this);
 }
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index aec698721d9d..2ae440cba46f 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -174,6 +174,8 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI,
   }
 
   case MCExpr::Specifier:
+    // TODO: Remove after all targets that use MCSpecifierExpr migrate to
+    // MCAsmInfo::printSpecifierExpr.
     return cast<MCSpecifierExpr>(this)->printImpl(OS, MAI);
   }
 
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
index 936518da3511..2d1a4fe94d18 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
@@ -12,6 +12,7 @@
 
 #include "SparcInstPrinter.h"
 #include "Sparc.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -142,7 +143,7 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum,
@@ -288,5 +289,5 @@ void SparcInstPrinter::printCTILabel(const MCInst *MI, uint64_t Address,
   }
 
   // Otherwise, just print the expression.
-  Op.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *Op.getExpr());
 }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 3049072b001c..4156780e962d 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -66,3 +66,13 @@ SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
   }
   return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
 }
+
+void SparcELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                           const MCSpecifierExpr &Expr) const {
+  StringRef S = Sparc::getSpecifierName(Expr.getSpecifier());
+  if (!S.empty())
+    OS << '%' << S << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  if (!S.empty())
+    OS << ')';
+}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index c9162f2dc8a5..7ea800f11917 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -32,6 +32,8 @@ public:
                                     unsigned Encoding,
                                     MCStreamer &Streamer) const override;
 
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 2e03e4739986..6a08fa5c9f3f 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -24,21 +24,12 @@ using namespace llvm;
 
 const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
                                               const MCExpr *Expr, uint16_t S) {
-  return new (Ctx) SparcMCExpr(Expr, S);
+  return new (Ctx) MCSpecifierExpr(Expr, S);
 }
 
 const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
                                               const MCSymbol *Sym, uint16_t S) {
-  return new (Ctx) SparcMCExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
-}
-
-void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  StringRef S = Sparc::getSpecifierName(specifier);
-  if (!S.empty())
-    OS << '%' << S << '(';
-  getSubExpr()->print(OS, MAI);
-  if (!S.empty())
-    OS << ')';
+  return new (Ctx) MCSpecifierExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
 }
 
 StringRef Sparc::getSpecifierName(uint16_t S) {
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 612b439bfc74..78af9a815020 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -20,12 +20,7 @@
 namespace llvm {
 
 class StringRef;
-class SparcMCExpr : public MCSpecifierExpr {
-public:
-  explicit SparcMCExpr(const MCExpr *Expr, uint16_t S)
-      : MCSpecifierExpr(Expr, S) {}
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-};
+using SparcMCExpr = MCSpecifierExpr;
 
 namespace Sparc {
 const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCExpr *Expr,

From 36c710c40e8a59f74f56eb0e04e438cec5532ec5 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 09:42:58 -0700
Subject: [PATCH 0302/1322] [CIR] Change default assumption about allowing
 builtins (#144004)

The code to read the "nobuiltins" attributes hasn't been implemented
yet, but we were defaulting to the assumption that use of builtins is
allowed for function calls that we recognize as standard C library calls
and have builtin equivalents of. This change reverses that assumption so
that when such calls are encountered, we just emit the call. This is a
better default assumption, and since our builtin handling for these
functions isn't implemented yet, it also allows us to compile more
programs.
---
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp |  3 +-
 clang/test/CIR/CodeGen/libc.c        | 55 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CIR/CodeGen/libc.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 99f942fcf2cd..42d0c78013f5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1056,7 +1056,8 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) {
 
     bool isPredefinedLibFunction =
         cgm.getASTContext().BuiltinInfo.isPredefinedLibFunction(builtinID);
-    bool hasAttributeNoBuiltin = false;
+    // Assume nobuiltins everywhere until we actually read the attributes.
+    bool hasAttributeNoBuiltin = true;
     assert(!cir::MissingFeatures::attributeNoBuiltin());
 
     // When directing calling an inline builtin, call it through it's mangled
diff --git a/clang/test/CIR/CodeGen/libc.c b/clang/test/CIR/CodeGen/libc.c
new file mode 100644
index 000000000000..f65fe92cd36a
--- /dev/null
+++ b/clang/test/CIR/CodeGen/libc.c
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+// Note: In the final implementation, we will want these to generate
+//       CIR-specific libc operations. This test is just a placeholder
+//       to make sure we can compile these to normal function calls
+//       until the special handling is implemented.
+
+void *memcpy(void *, const void *, unsigned long);
+void testMemcpy(void *dst, const void *src, unsigned long size) {
+  memcpy(dst, src, size);
+  // CHECK: cir.call @memcpy
+}
+
+void *memmove(void *, const void *, unsigned long);
+void testMemmove(void *src, const void *dst, unsigned long size) {
+  memmove(dst, src, size);
+  // CHECK: cir.call @memmove
+}
+
+void *memset(void *, int, unsigned long);
+void testMemset(void *dst, int val, unsigned long size) {
+  memset(dst, val, size);
+  // CHECK: cir.call @memset
+}
+
+double fabs(double);
+double testFabs(double x) {
+  return fabs(x);
+  // CHECK: cir.call @fabs
+}
+
+float fabsf(float);
+float testFabsf(float x) {
+  return fabsf(x);
+  // CHECK: cir.call @fabsf
+}
+
+int abs(int);
+int testAbs(int x) {
+  return abs(x);
+  // CHECK: cir.call @abs
+}
+
+long labs(long);
+long testLabs(long x) {
+  return labs(x);
+  // CHECK: cir.call @labs
+}
+
+long long llabs(long long);
+long long testLlabs(long long x) {
+  return llabs(x);
+  // CHECK: cir.call @llabs
+}

From 3bf1e1f79ce5b4921586b24014acf5888c35e03f Mon Sep 17 00:00:00 2001
From: Igor Wodiany <igor.wodiany@imgtec.com>
Date: Fri, 13 Jun 2025 17:47:06 +0100
Subject: [PATCH 0303/1322] [mlir][spirv] Add definition of OpImageRead
 (#144038)

---
 .../mlir/Dialect/SPIRV/IR/SPIRVBase.td        |  4 +-
 .../mlir/Dialect/SPIRV/IR/SPIRVImageOps.td    | 57 +++++++++++++++++++
 mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp        | 17 ++++++
 mlir/test/Dialect/SPIRV/IR/image-ops.mlir     | 28 +++++++++
 mlir/test/Target/SPIRV/image-ops.mlir         | 11 +++-
 5 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
index e413503bbd67..d2ba76cdad90 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
@@ -4370,6 +4370,7 @@ def SPIRV_OC_OpImageSampleImplicitLod         : I32EnumAttrCase<"OpImageSampleIm
 def SPIRV_OC_OpImageSampleExplicitLod         : I32EnumAttrCase<"OpImageSampleExplicitLod", 88>;
 def SPIRV_OC_OpImageSampleProjDrefImplicitLod : I32EnumAttrCase<"OpImageSampleProjDrefImplicitLod", 93>;
 def SPIRV_OC_OpImageDrefGather                : I32EnumAttrCase<"OpImageDrefGather", 97>;
+def SPIRV_OC_OpImageRead                      : I32EnumAttrCase<"OpImageRead", 98>;
 def SPIRV_OC_OpImageWrite                     : I32EnumAttrCase<"OpImageWrite", 99>;
 def SPIRV_OC_OpImage                          : I32EnumAttrCase<"OpImage", 100>;
 def SPIRV_OC_OpImageQuerySize                 : I32EnumAttrCase<"OpImageQuerySize", 104>;
@@ -4577,7 +4578,8 @@ def SPIRV_OpcodeAttr :
       SPIRV_OC_OpCompositeInsert, SPIRV_OC_OpTranspose,
       SPIRV_OC_OpImageSampleImplicitLod, SPIRV_OC_OpImageSampleExplicitLod,
       SPIRV_OC_OpImageSampleProjDrefImplicitLod, SPIRV_OC_OpImageDrefGather,
-      SPIRV_OC_OpImageWrite, SPIRV_OC_OpImage, SPIRV_OC_OpImageQuerySize,
+      SPIRV_OC_OpImageRead, SPIRV_OC_OpImageWrite, SPIRV_OC_OpImage,
+      SPIRV_OC_OpImageQuerySize,
       SPIRV_OC_OpConvertFToU, SPIRV_OC_OpConvertFToS, SPIRV_OC_OpConvertSToF,
       SPIRV_OC_OpConvertUToF, SPIRV_OC_OpUConvert, SPIRV_OC_OpSConvert,
       SPIRV_OC_OpFConvert, SPIRV_OC_OpConvertPtrToU, SPIRV_OC_OpConvertUToPtr,
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td
index 9999e5cc07b8..7610966b84be 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td
@@ -186,6 +186,63 @@ def SPIRV_ImageQuerySizeOp : SPIRV_Op<"ImageQuerySize", [Pure]> {
 
 // -----
 
+def SPIRV_ImageReadOp : SPIRV_Op<"ImageRead",
+    [SPIRV_SampledOperandIs<"image", ["SamplerUnknown", "NoSampler"]>,
+     SPIRV_NoneOrElementMatchImage<"result", "image">]> {
+  let summary = "Read a texel from an image without a sampler.";
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type or integer
+    type. It must be a scalar or vector with component type the same as Sampled
+    Type of the OpTypeImage (unless that Sampled Type is OpTypeVoid).
+
+    Image must be an object whose type is OpTypeImage with a Sampled operand of
+    0 or 2. If the Arrayed operand is 1, then additional capabilities may be
+    required; e.g., ImageCubeArray, or ImageMSArray.
+
+    Coordinate must be a scalar or vector of floating-point type or integer
+    type. It contains non-normalized texel coordinates (u[, v] ... [, array
+    layer]) as needed by the definition of Image. See the client API
+    specification for handling of coordinates outside the image.
+
+    If the Image Dim operand is SubpassData, Coordinate is relative to the
+    current fragment location. See the client API specification for more detail
+    on how these coordinates are applied.
+
+    If the Image Dim operand is not SubpassData, the Image Format must not be
+    Unknown, unless the StorageImageReadWithoutFormat Capability was declared.
+
+    Image Operands encodes what operands follow, as per Image Operands.
+
+    <!-- End of AutoGen section -->
+
+    #### Example:
+
+    ```mlir
+    %0 = spirv.ImageRead %1, %2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, R32f>, vector<2xsi32> -> vector<4xf32>
+    ```
+  }];
+
+  let arguments = (ins
+    SPIRV_AnyImage:$image,
+    AnyTypeOf<[SPIRV_ScalarOrVectorOf<SPIRV_Float>, SPIRV_ScalarOrVectorOf<SPIRV_Integer>]>:$coordinate,
+    OptionalAttr<SPIRV_ImageOperandsAttr>:$image_operands,
+    Variadic<SPIRV_Type>:$operand_arguments
+  );
+
+  let results = (outs
+    AnyTypeOf<[SPIRV_ScalarOrVectorOf<SPIRV_Float>, SPIRV_ScalarOrVectorOf<SPIRV_Integer>]>:$result
+  );
+
+  let assemblyFormat = [{
+    $image `,` $coordinate custom<ImageOperands>($image_operands) ( `,` $operand_arguments^ )? attr-dict
+    `:` type($image) `,` type($coordinate) ( `,` type($operand_arguments)^ )?
+    `->` type($result)
+  }];
+}
+
+// -----
+
 def SPIRV_ImageWriteOp : SPIRV_Op<"ImageWrite",
     [SPIRV_SampledOperandIs<"image", ["SamplerUnknown", "NoSampler"]>,
      SPIRV_DimIsNot<"image", ["SubpassData"]>,
diff --git a/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp b/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp
index a021931425fb..f7af79ceefa8 100644
--- a/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp
@@ -204,6 +204,23 @@ LogicalResult spirv::ImageDrefGatherOp::verify() {
                              getOperandArguments());
 }
 
+//===----------------------------------------------------------------------===//
+// spirv.ImageReadOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult spirv::ImageReadOp::verify() {
+  // TODO: Do we need check for: "If the Arrayed operand is 1, then additional
+  // capabilities may be required; e.g., ImageCubeArray, or ImageMSArray."?
+
+  // TODO: Ideally it should be somewhere verified that "If the Image Dim
+  // operand is not SubpassData, the Image Format must not be Unknown, unless
+  // the StorageImageReadWithoutFormat Capability was declared." This function
+  // however may not be the suitable place for such verification.
+
+  return verifyImageOperands(getOperation(), getImageOperandsAttr(),
+                             getOperandArguments());
+}
+
 //===----------------------------------------------------------------------===//
 // spirv.ImageWriteOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/image-ops.mlir b/mlir/test/Dialect/SPIRV/IR/image-ops.mlir
index 1ebdfdb41de1..484a54023edc 100644
--- a/mlir/test/Dialect/SPIRV/IR/image-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/image-ops.mlir
@@ -116,6 +116,34 @@ func.func @image_query_size_error_result2(%arg0 : !spirv.image<f32, Buffer, NoDe
 
 // -----
 
+//===----------------------------------------------------------------------===//
+// spirv.ImageRead
+//===----------------------------------------------------------------------===//
+
+func.func @image_read(%arg0: !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1: vector<2xsi32>) -> () {
+  // CHECK: {{%.*}} = spirv.ImageRead {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf32>
+  %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf32>
+  spirv.Return
+}
+
+// -----
+
+func.func @image_read_type_mismatch(%arg0: !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1: vector<2xsi32>) -> () {
+  // expected-error @+1 {{op failed to verify that the result component type must match the image sampled type}}
+  %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf16>
+  spirv.Return
+}
+
+// -----
+
+func.func @image_read_need_sampler(%arg0: !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Rgba8>, %arg1: vector<2xsi32>) -> () {
+  // expected-error @+1 {{op failed to verify that the sampled operand of the underlying image must be SamplerUnknown or NoSampler}}
+  %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Rgba8>, vector<2xsi32> -> vector<4xf16>
+  spirv.Return
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.ImageWrite
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Target/SPIRV/image-ops.mlir b/mlir/test/Target/SPIRV/image-ops.mlir
index 6dd23844d46a..b8d19f0f9a7d 100644
--- a/mlir/test/Target/SPIRV/image-ops.mlir
+++ b/mlir/test/Target/SPIRV/image-ops.mlir
@@ -13,6 +13,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, ImageQuery, Link
     %0 = spirv.ImageQuerySize %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown> -> vector<2xi32>
     spirv.Return
   }
+  spirv.func @image_read(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1 : vector<2xsi32>) "None" {
+    // CHECK: {{.*}} = spirv.ImageRead {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf32>
+    %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf32>
+    spirv.Return
+  }
   spirv.func @image_write(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>) "None" {
     // CHECK: spirv.ImageWrite {{%.*}}, {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32>, vector<4xf32>
     spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32>, vector<4xf32>
@@ -38,9 +43,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, ImageQuery, Link
 // -----
 
 spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, StorageImageWriteWithoutFormat, Linkage], []> {
-  spirv.func @image_write(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>) "None" {
+  spirv.func @image_read_write(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, %arg1 : vector<2xsi32>) "None" {
+    // CHECK: spirv.ImageRead {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32> -> vector<4xf32>
+    %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32> -> vector<4xf32>
     // CHECK: spirv.ImageWrite {{%.*}}, {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32>, vector<4xf32>
-    spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32>, vector<4xf32>
+    spirv.ImageWrite %arg0, %arg1, %0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32>, vector<4xf32>
     spirv.Return
   }
 }

From b184672ec7f1433e5dc698cda7e61be8a6085aa6 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Fri, 13 Jun 2025 16:48:24 +0000
Subject: [PATCH 0304/1322] [libc] Implemented wmemmove (#142245)

Implemented wmemmove and added tests
---
 libc/config/linux/x86_64/entrypoints.txt |   1 +
 libc/include/wchar.yaml                  |   8 ++
 libc/src/wchar/CMakeLists.txt            |  12 +++
 libc/src/wchar/wmemmove.cpp              |  27 ++++++
 libc/src/wchar/wmemmove.h                |  22 +++++
 libc/test/src/wchar/CMakeLists.txt       |  10 ++
 libc/test/src/wchar/wmemmove_test.cpp    | 111 +++++++++++++++++++++++
 7 files changed, 191 insertions(+)
 create mode 100644 libc/src/wchar/wmemmove.cpp
 create mode 100644 libc/src/wchar/wmemmove.h
 create mode 100644 libc/test/src/wchar/wmemmove_test.cpp

diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index c993ef8303a5..aa2079faed40 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -367,6 +367,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.wchar.btowc
     libc.src.wchar.wcslen
     libc.src.wchar.wctob
+    libc.src.wchar.wmemmove
     libc.src.wchar.wmemset
     libc.src.wchar.wcschr
     libc.src.wchar.wcsncmp
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 57f4f6660827..1af15a6c112b 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -112,6 +112,14 @@ functions:
       - type: __restrict wchar_t *
       - type: const __restrict wchar_t *
       - type: size_t
+  - name: wmemmove
+    standards:
+      - stdc
+    return_type: wchar_t *
+    arguments:
+      - type: wchar_t *
+      - type: const wchar_t *
+      - type: size_t
   - name: wcsncpy
     standards:
       - stdc
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 4b8802ede5f5..491dd5b34340 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -131,6 +131,18 @@ add_entrypoint_object(
     libc.hdr.wchar_macros
 )
 
+add_entrypoint_object(
+  wmemmove
+  SRCS
+    wmemmove.cpp
+  HDRS
+    wmemmove.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.wchar_macros
+    libc.src.__support.macros.null_check
+)
+
 add_entrypoint_object(
   wcsncpy
   SRCS
diff --git a/libc/src/wchar/wmemmove.cpp b/libc/src/wchar/wmemmove.cpp
new file mode 100644
index 000000000000..3282077003bd
--- /dev/null
+++ b/libc/src/wchar/wmemmove.cpp
@@ -0,0 +1,27 @@
+//===-- Implementation of wmemmove ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/wmemmove.h"
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/null_check.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(wchar_t *, wmemmove,
+                   (wchar_t * dest, const wchar_t *src, size_t n)) {
+  LIBC_CRASH_ON_NULLPTR(dest);
+  LIBC_CRASH_ON_NULLPTR(src);
+
+  __builtin_memmove(dest, src, n * sizeof(wchar_t));
+  return dest;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/wmemmove.h b/libc/src/wchar/wmemmove.h
new file mode 100644
index 000000000000..b4c31ac7b397
--- /dev/null
+++ b/libc/src/wchar/wmemmove.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for wmemmove --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_WMEMMOVE_H
+#define LLVM_LIBC_SRC_WCHAR_WMEMMOVE_H
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+wchar_t *wmemmove(wchar_t *dest, const wchar_t *src, size_t n);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_WMEMMOVE_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index 6293e8e3d55c..4990b6953348 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -145,6 +145,16 @@ add_libc_test(
     libc.src.wchar.wmemcpy
 )
 
+add_libc_test(
+  wmemmove_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    wmemmove_test.cpp
+  DEPENDS
+    libc.src.wchar.wmemmove
+)
+
 add_libc_test(
   wcsncpy_test
   SUITE
diff --git a/libc/test/src/wchar/wmemmove_test.cpp b/libc/test/src/wchar/wmemmove_test.cpp
new file mode 100644
index 000000000000..d23aa0f0b3af
--- /dev/null
+++ b/libc/test/src/wchar/wmemmove_test.cpp
@@ -0,0 +1,111 @@
+//===-- Unittests for wmemmove --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/wchar/wmemmove.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcWMemmoveTest, MoveZeroByte) {
+  wchar_t buffer[] = {L'a', L'b', L'y', L'z'};
+
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(buffer, buffer + 2, 0);
+  EXPECT_EQ(ret, buffer);
+
+  const wchar_t expected[] = {L'a', L'b', L'y', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+}
+
+TEST(LlvmLibcWMemmoveTest, DstAndSrcPointToSameAddress) {
+  wchar_t buffer[] = {L'a', L'b'};
+
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(buffer, buffer, 1);
+  EXPECT_EQ(ret, buffer);
+
+  const wchar_t expected[] = {L'a', L'b'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+}
+
+TEST(LlvmLibcWMemmoveTest, DstStartsBeforeSrc) {
+  // Set boundary at beginning and end for not overstepping when
+  // copy forward or backward.
+  wchar_t buffer[] = {L'z', L'a', L'b', L'c', L'z'};
+
+  wchar_t *dst = buffer + 1;
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(dst, buffer + 2, 2);
+  EXPECT_EQ(ret, dst);
+
+  const wchar_t expected[] = {L'z', L'b', L'c', L'c', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+  EXPECT_TRUE(buffer[4] == expected[4]);
+}
+
+TEST(LlvmLibcWMemmoveTest, DstStartsAfterSrc) {
+  wchar_t buffer[] = {L'z', L'a', L'b', L'c', L'z'};
+
+  wchar_t *dst = buffer + 2;
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(dst, buffer + 1, 2);
+  EXPECT_EQ(ret, dst);
+
+  const wchar_t expected[] = {L'z', L'a', L'a', L'b', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+  EXPECT_TRUE(buffer[4] == expected[4]);
+}
+
+// e.g. `Dst` follow `src`.
+// str: [abcdefghij]
+//      [__src_____]
+//      [_____Dst__]
+TEST(LlvmLibcWMemmoveTest, SrcFollowDst) {
+  wchar_t buffer[] = {L'z', L'a', L'b', L'z'};
+
+  wchar_t *dst = buffer + 1;
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(dst, buffer + 2, 1);
+  EXPECT_EQ(ret, dst);
+
+  const char expected[] = {L'z', L'b', L'b', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+}
+
+TEST(LlvmLibcWMemmoveTest, DstFollowSrc) {
+  wchar_t buffer[] = {L'z', L'a', L'b', L'z'};
+
+  wchar_t *dst = buffer + 2;
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(dst, buffer + 1, 1);
+  EXPECT_EQ(ret, dst);
+
+  const char expected[] = {L'z', L'a', L'a', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+}
+
+#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER)
+TEST(LlvmLibcWMemmoveTest, NullptrCrash) {
+  wchar_t buffer[] = {L'a', L'b'};
+  // Passing in a nullptr should crash the program.
+  EXPECT_DEATH([&buffer] { LIBC_NAMESPACE::wmemmove(buffer, nullptr, 2); },
+               WITH_SIGNAL(-1));
+  EXPECT_DEATH([&buffer] { LIBC_NAMESPACE::wmemmove(nullptr, buffer, 2); },
+               WITH_SIGNAL(-1));
+}
+#endif // LIBC_HAS_ADDRESS_SANITIZER

From c403cf1e38faa456fdd6f1301efabea3f36c3e6b Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 09:49:55 -0700
Subject: [PATCH 0305/1322] VE: Replace VEMCExpr::printImpl with
 printSpecifierExpr

Prepare for removing the VEMCExpr subclass.
VEMCExpr overrides evaluateAsRelocatableImpl, so it cannot be removed
yet.
---
 llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp | 3 ++-
 llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp   | 8 ++++++++
 llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h     | 2 ++
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp      | 8 --------
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h        | 1 -
 5 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp
index 77bd30e96f7b..b78b86f70f39 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp
@@ -12,6 +12,7 @@
 
 #include "VEInstPrinter.h"
 #include "VE.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -61,7 +62,7 @@ void VEInstPrinter::printOperand(const MCInst *MI, int OpNum,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void VEInstPrinter::printMemASXOperand(const MCInst *MI, int OpNum,
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
index f290804ae449..fdde46f09d5b 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
@@ -56,3 +56,11 @@ VEELFMCAsmInfo::VEELFMCAsmInfo(const Triple &TheTriple) {
 
   initializeVariantKinds(variantKindDescs);
 }
+
+void VEELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                        const MCSpecifierExpr &Expr) const {
+  printExpr(OS, *Expr.getSubExpr());
+  auto specifier = Expr.getSpecifier();
+  if (specifier && specifier != VEMCExpr::VK_REFLONG)
+    OS << '@' << getSpecifierName(specifier);
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
index 6557d68b383c..444f422c7ec1 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
@@ -24,6 +24,8 @@ class VEELFMCAsmInfo : public MCAsmInfoELF {
 
 public:
   explicit VEELFMCAsmInfo(const Triple &TheTriple);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
index a7986ab9006d..fa4d9b18a9ad 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
@@ -27,14 +27,6 @@ const VEMCExpr *VEMCExpr::create(Specifier S, const MCExpr *Expr,
   return new (Ctx) VEMCExpr(Expr, S);
 }
 
-void VEMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-
-  const MCExpr *Expr = getSubExpr();
-  Expr->print(OS, MAI);
-  if (specifier != VK_None && specifier != VK_REFLONG)
-    OS << '@' << MAI->getSpecifierName(specifier);
-}
-
 VE::Fixups VEMCExpr::getFixupKind(MCSpecifierExpr::Spec S) {
   switch (S) {
   default:
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
index 80ea350a6166..4d191149d4aa 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
@@ -50,7 +50,6 @@ public:
   static const VEMCExpr *create(Specifier Kind, const MCExpr *Expr,
                                 MCContext &Ctx);
 
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
   bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAssembler *Asm) const override;
 

From 6e988bd33f5fa8a529ef9208d3e147945b7bb7ed Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 09:56:57 -0700
Subject: [PATCH 0306/1322] [mlir] Forward **kwargs through gentbl_shard_rule
 (#144001)

This allows clients to pass additional cc_library arguments through this
macro to the build rules it calls.
---
 utils/bazel/llvm-project-overlay/mlir/tblgen.bzl | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
index 16a7ecadeaff..884d6f381b02 100644
--- a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
+++ b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
@@ -484,7 +484,8 @@ def gentbl_sharded_ops(
         test = False,
         includes = [],
         strip_include_prefix = None,
-        deps = []):
+        deps = [],
+        **kwargs):
     """Generate sharded op declarations and definitions.
 
     This special build rule shards op definitions in a TableGen file and generates multiple copies
@@ -524,6 +525,7 @@ def gentbl_sharded_ops(
         td_file = td_file,
         test = test,
         deps = deps,
+        **kwargs,
     )
     all_files = [hdr_out, src_out]
     for i in range(0, shard_count):
@@ -535,9 +537,14 @@ def gentbl_sharded_ops(
             out = out_file,
             sharder = sharder,
             src_file = src_file,
+            **kwargs,
         )
         all_files.append(out_file)
-    native.filegroup(name = name, srcs = all_files)
+    native.filegroup(
+        name = name,
+        srcs = all_files,
+        **kwargs,
+    )
 
 def gentbl_sharded_op_defs(name, source_file, shard_count):
     """Generates multiple copies of a source file that includes sharded op definitions.

From 2704b27a0b452f4aaf87ab26d315fdc92857373a Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 13 Jun 2025 10:02:41 -0700
Subject: [PATCH 0307/1322] [lldb] Include unistd.h for _exit  in
 multi-process-driver.cpp

This test fails to build on macOS without the correct header include.
---
 lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
index 5ad75e3c1e47..68d73f1dee01 100644
--- a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
+++ b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
@@ -13,10 +13,11 @@
 // that are hit when lldb is being used to debug multiple processes
 // simultaneously.
 
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <inttypes.h>
+#include <unistd.h>
 
 #include "lldb/API/LLDB.h"
 #include "lldb/API/SBCommandInterpreter.h"

From 65d88d31ea279bbab8a0fa2c8abfb3f723a1715b Mon Sep 17 00:00:00 2001
From: Keith Smiley <keithbsmiley@gmail.com>
Date: Fri, 13 Jun 2025 10:04:45 -0700
Subject: [PATCH 0308/1322] Revert "[llvm-cov] Add support for baseline
 coverage" (#144121)

Reverts llvm/llvm-project#117910

```
/home/buildbots/llvm-external-buildbots/workers/ppc64le-lld-multistage-test/ppc64le-lld-multistage-test/llvm-project/llvm/unittests/ProfileData/CoverageMappingTest.cpp
/home/buildbots/llvm-external-buildbots/workers/ppc64le-lld-multistage-test/ppc64le-lld-multistage-test/llvm-project/llvm/unittests/ProfileData/CoverageMappingTest.cpp:281:28: error: 'std::reference_wrapper' may not intend to support class template argument deduction [-Werror,-Wctad-maybe-unsupported]
  281 |         std::make_optional(std::reference_wrapper(*ProfileReader));
      |                            ^
/usr/lib/gcc/ppc64le-redhat-linux/8/../../../../include/c++/8/bits/refwrap.h:289:11: note: add a deduction guide to suppress this warning
  289 |     class reference_wrapper
      |           ^
```
---
 llvm/docs/CommandGuide/llvm-cov.rst           |  15 ---
 .../ProfileData/Coverage/CoverageMapping.h    |  24 ++--
 .../ProfileData/Coverage/CoverageMapping.cpp  | 123 +++++++-----------
 ...showLineExecutionCounts-lcov-baseline.test |  37 ------
 llvm/tools/llvm-cov/CodeCoverage.cpp          |  78 ++++-------
 .../ProfileData/CoverageMappingTest.cpp       |   4 +-
 6 files changed, 86 insertions(+), 195 deletions(-)
 delete mode 100644 llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test

diff --git a/llvm/docs/CommandGuide/llvm-cov.rst b/llvm/docs/CommandGuide/llvm-cov.rst
index f4db60cf06fa..968f3c452f55 100644
--- a/llvm/docs/CommandGuide/llvm-cov.rst
+++ b/llvm/docs/CommandGuide/llvm-cov.rst
@@ -380,11 +380,6 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
-.. option:: -empty-profile
-
- Display the baseline coverage of the binaries with all zero execution counts.
- Mutually exclusive with -instr-profile.
-
 .. program:: llvm-cov report
 
 .. _llvm-cov-report:
@@ -475,11 +470,6 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
-.. option:: -empty-profile
-
- Display the baseline coverage of the binaries with all zero execution counts.
- Mutually exclusive with -instr-profile.
-
 .. program:: llvm-cov export
 
 .. _llvm-cov-export:
@@ -572,11 +562,6 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
-.. option:: -empty-profile
-
- Export the baseline coverage of the binaries with all zero execution counts.
- Mutually exclusive with -instr-profile.
-
 CONVERT-FOR-TESTING COMMAND
 ---------------------------
 
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index d1230b0ba7c5..e62ce5e3d8fa 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -991,23 +991,18 @@ class CoverageMapping {
   // Load coverage records from readers.
   static Error loadFromReaders(
       ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-      std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-          &ProfileReader,
-      CoverageMapping &Coverage);
+      IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage);
 
   // Load coverage records from file.
   static Error
   loadFromFile(StringRef Filename, StringRef Arch, StringRef CompilationDir,
-               std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-                   &ProfileReader,
-               CoverageMapping &Coverage, bool &DataFound,
+               IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
+               bool &DataFound,
                SmallVectorImpl<object::BuildID> *FoundBinaryIDs = nullptr);
 
   /// Add a function record corresponding to \p Record.
-  Error loadFunctionRecord(
-      const CoverageMappingRecord &Record,
-      const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-          &ProfileReader);
+  Error loadFunctionRecord(const CoverageMappingRecord &Record,
+                           IndexedInstrProfReader &ProfileReader);
 
   /// Look up the indices for function records which are at least partially
   /// defined in the specified file. This is guaranteed to return a superset of
@@ -1023,16 +1018,15 @@ public:
   /// Load the coverage mapping using the given readers.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
   load(ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-       std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-           &ProfileReader);
+       IndexedInstrProfReader &ProfileReader);
 
   /// Load the coverage mapping from the given object files and profile. If
   /// \p Arches is non-empty, it must specify an architecture for each object.
   /// Ignores non-instrumented object files unless all are not instrumented.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
-  load(ArrayRef<StringRef> ObjectFilenames,
-       std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
-       ArrayRef<StringRef> Arches = {}, StringRef CompilationDir = "",
+  load(ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
+       vfs::FileSystem &FS, ArrayRef<StringRef> Arches = {},
+       StringRef CompilationDir = "",
        const object::BuildIDFetcher *BIDFetcher = nullptr,
        bool CheckBinaryIDs = false);
 
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 429ec5c19f1f..dd74eb054a34 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -823,8 +823,7 @@ public:
 
 Error CoverageMapping::loadFunctionRecord(
     const CoverageMappingRecord &Record,
-    const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-        &ProfileReader) {
+    IndexedInstrProfReader &ProfileReader) {
   StringRef OrigFuncName = Record.FunctionName;
   if (OrigFuncName.empty())
     return make_error<CoverageMapError>(coveragemap_error::malformed,
@@ -838,44 +837,35 @@ Error CoverageMapping::loadFunctionRecord(
   CounterMappingContext Ctx(Record.Expressions);
 
   std::vector<uint64_t> Counts;
-  if (ProfileReader) {
-    if (Error E = ProfileReader.value().get().getFunctionCounts(
-            Record.FunctionName, Record.FunctionHash, Counts)) {
-      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-      if (IPE == instrprof_error::hash_mismatch) {
-        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                        Record.FunctionHash);
-        return Error::success();
-      }
-      if (IPE != instrprof_error::unknown_function)
-        return make_error<InstrProfError>(IPE);
-      Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
+  if (Error E = ProfileReader.getFunctionCounts(Record.FunctionName,
+                                                Record.FunctionHash, Counts)) {
+    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+    if (IPE == instrprof_error::hash_mismatch) {
+      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                      Record.FunctionHash);
+      return Error::success();
     }
-  } else {
+    if (IPE != instrprof_error::unknown_function)
+      return make_error<InstrProfError>(IPE);
     Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
   }
   Ctx.setCounts(Counts);
 
   bool IsVersion11 =
-      ProfileReader && ProfileReader.value().get().getVersion() <
-                           IndexedInstrProf::ProfVersion::Version12;
+      ProfileReader.getVersion() < IndexedInstrProf::ProfVersion::Version12;
 
   BitVector Bitmap;
-  if (ProfileReader) {
-    if (Error E = ProfileReader.value().get().getFunctionBitmap(
-            Record.FunctionName, Record.FunctionHash, Bitmap)) {
-      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-      if (IPE == instrprof_error::hash_mismatch) {
-        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                        Record.FunctionHash);
-        return Error::success();
-      }
-      if (IPE != instrprof_error::unknown_function)
-        return make_error<InstrProfError>(IPE);
-      Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
+  if (Error E = ProfileReader.getFunctionBitmap(Record.FunctionName,
+                                                Record.FunctionHash, Bitmap)) {
+    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+    if (IPE == instrprof_error::hash_mismatch) {
+      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                      Record.FunctionHash);
+      return Error::success();
     }
-  } else {
-    Bitmap = BitVector(getMaxBitmapSize(Record, false));
+    if (IPE != instrprof_error::unknown_function)
+      return make_error<InstrProfError>(IPE);
+    Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
   }
   Ctx.setBitmap(std::move(Bitmap));
 
@@ -969,14 +959,10 @@ Error CoverageMapping::loadFunctionRecord(
 // of CoverageMappingReader instances.
 Error CoverageMapping::loadFromReaders(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-        &ProfileReader,
-    CoverageMapping &Coverage) {
-  assert(!Coverage.SingleByteCoverage || !ProfileReader ||
-         *Coverage.SingleByteCoverage ==
-             ProfileReader.value().get().hasSingleByteCoverage());
-  Coverage.SingleByteCoverage =
-      !ProfileReader || ProfileReader.value().get().hasSingleByteCoverage();
+    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage) {
+  assert(!Coverage.SingleByteCoverage ||
+         *Coverage.SingleByteCoverage == ProfileReader.hasSingleByteCoverage());
+  Coverage.SingleByteCoverage = ProfileReader.hasSingleByteCoverage();
   for (const auto &CoverageReader : CoverageReaders) {
     for (auto RecordOrErr : *CoverageReader) {
       if (Error E = RecordOrErr.takeError())
@@ -991,8 +977,7 @@ Error CoverageMapping::loadFromReaders(
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-        &ProfileReader) {
+    IndexedInstrProfReader &ProfileReader) {
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   if (Error E = loadFromReaders(CoverageReaders, ProfileReader, *Coverage))
     return std::move(E);
@@ -1001,19 +986,18 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
 // If E is a no_data_found error, returns success. Otherwise returns E.
 static Error handleMaybeNoDataFoundError(Error E) {
-  return handleErrors(std::move(E), [](const CoverageMapError &CME) {
-    if (CME.get() == coveragemap_error::no_data_found)
-      return static_cast<Error>(Error::success());
-    return make_error<CoverageMapError>(CME.get(), CME.getMessage());
-  });
+  return handleErrors(
+      std::move(E), [](const CoverageMapError &CME) {
+        if (CME.get() == coveragemap_error::no_data_found)
+          return static_cast<Error>(Error::success());
+        return make_error<CoverageMapError>(CME.get(), CME.getMessage());
+      });
 }
 
 Error CoverageMapping::loadFromFile(
     StringRef Filename, StringRef Arch, StringRef CompilationDir,
-    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-        &ProfileReader,
-    CoverageMapping &Coverage, bool &DataFound,
-    SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
+    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
+    bool &DataFound, SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
   auto CovMappingBufOrErr = MemoryBuffer::getFileOrSTDIN(
       Filename, /*IsText=*/false, /*RequiresNullTerminator=*/false);
   if (std::error_code EC = CovMappingBufOrErr.getError())
@@ -1049,23 +1033,13 @@ Error CoverageMapping::loadFromFile(
 }
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
-    ArrayRef<StringRef> ObjectFilenames,
-    std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
-    ArrayRef<StringRef> Arches, StringRef CompilationDir,
+    ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
+    vfs::FileSystem &FS, ArrayRef<StringRef> Arches, StringRef CompilationDir,
     const object::BuildIDFetcher *BIDFetcher, bool CheckBinaryIDs) {
-  std::unique_ptr<IndexedInstrProfReader> ProfileReader;
-  if (ProfileFilename) {
-    auto ProfileReaderOrErr =
-        IndexedInstrProfReader::create(ProfileFilename.value(), FS);
-    if (Error E = ProfileReaderOrErr.takeError())
-      return createFileError(ProfileFilename.value(), std::move(E));
-    ProfileReader = std::move(ProfileReaderOrErr.get());
-  }
-  auto ProfileReaderRef =
-      ProfileReader
-          ? std::optional<std::reference_wrapper<IndexedInstrProfReader>>(
-                *ProfileReader)
-          : std::nullopt;
+  auto ProfileReaderOrErr = IndexedInstrProfReader::create(ProfileFilename, FS);
+  if (Error E = ProfileReaderOrErr.takeError())
+    return createFileError(ProfileFilename, std::move(E));
+  auto ProfileReader = std::move(ProfileReaderOrErr.get());
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   bool DataFound = false;
 
@@ -1079,17 +1053,16 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
   SmallVector<object::BuildID> FoundBinaryIDs;
   for (const auto &File : llvm::enumerate(ObjectFilenames)) {
-    if (Error E = loadFromFile(File.value(), GetArch(File.index()),
-                               CompilationDir, ProfileReaderRef, *Coverage,
-                               DataFound, &FoundBinaryIDs))
+    if (Error E =
+            loadFromFile(File.value(), GetArch(File.index()), CompilationDir,
+                         *ProfileReader, *Coverage, DataFound, &FoundBinaryIDs))
       return std::move(E);
   }
 
   if (BIDFetcher) {
     std::vector<object::BuildID> ProfileBinaryIDs;
-    if (ProfileReader)
-      if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
-        return createFileError(ProfileFilename.value(), std::move(E));
+    if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
+      return createFileError(ProfileFilename, std::move(E));
 
     SmallVector<object::BuildIDRef> BinaryIDsToFetch;
     if (!ProfileBinaryIDs.empty()) {
@@ -1109,12 +1082,12 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
       if (PathOpt) {
         std::string Path = std::move(*PathOpt);
         StringRef Arch = Arches.size() == 1 ? Arches.front() : StringRef();
-        if (Error E = loadFromFile(Path, Arch, CompilationDir, ProfileReaderRef,
-                                   *Coverage, DataFound))
+        if (Error E = loadFromFile(Path, Arch, CompilationDir, *ProfileReader,
+                                  *Coverage, DataFound))
           return std::move(E);
       } else if (CheckBinaryIDs) {
         return createFileError(
-            ProfileFilename.value(),
+            ProfileFilename,
             createStringError(errc::no_such_file_or_directory,
                               "Missing binary ID: " +
                                   llvm::toHex(BinaryID, /*LowerCase=*/true)));
diff --git a/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
deleted file mode 100644
index bce886bdf510..000000000000
--- a/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
+++ /dev/null
@@ -1,37 +0,0 @@
-// FULL: SF:{{.*}}showLineExecutionCounts.cpp
-// FULL: FN:6,main
-// FULL: FNDA:0,main
-// FULL: FNF:1
-// FULL: FNH:0
-int main() {                              // FULL: DA:[[@LINE]],0
-  int x = 0;                              // FULL: DA:[[@LINE]],0
-                                          // FULL: DA:[[@LINE]],0
-  if (x) {                                // FULL: DA:[[@LINE]],0
-    x = 0;                                // FULL: DA:[[@LINE]],0
-  } else {                                // FULL: DA:[[@LINE]],0
-    x = 1;                                // FULL: DA:[[@LINE]],0
-  }                                       // FULL: DA:[[@LINE]],0
-                                          // FULL: DA:[[@LINE]],0
-  for (int i = 0; i < 100; ++i) {         // FULL: DA:[[@LINE]],0
-    x = 1;                                // FULL: DA:[[@LINE]],0
-  }                                       // FULL: DA:[[@LINE]],0
-                                          // FULL: DA:[[@LINE]],0
-  x = x < 10 ? x + 1 : x - 1;             // FULL: DA:[[@LINE]],0
-  x = x > 10 ?                            // FULL: DA:[[@LINE]],0
-        x - 1:                            // FULL: DA:[[@LINE]],0
-        x + 1;                            // FULL: DA:[[@LINE]],0
-                                          // FULL: DA:[[@LINE]],0
-  return 0;                               // FULL: DA:[[@LINE]],0
-}                                         // FULL: DA:[[@LINE]],0
-// FULL: LF:20
-// FULL: LH:0
-// FULL: end_of_record
-// RUN: llvm-cov export -format=lcov %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=FULL %s
-
-// RUN: llvm-cov export -format=lcov -summary-only %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=SUMMARYONLY %s
-// SUMMARYONLY: SF:{{.*}}showLineExecutionCounts.cpp
-// SUMMARYONLY: FNF:1
-// SUMMARYONLY: FNH:0
-// SUMMARYONLY: LF:20
-// SUMMARYONLY: LH:0
-// SUMMARYONLY: end_of_record
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index 6c66858c4de8..1f2484cd4dda 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -153,7 +153,7 @@ private:
   bool HadSourceFiles = false;
 
   /// The path to the indexed profile.
-  std::optional<std::string> PGOFilename;
+  std::string PGOFilename;
 
   /// A list of input source files.
   std::vector<std::string> SourceFiles;
@@ -455,12 +455,10 @@ static bool modifiedTimeGT(StringRef LHS, StringRef RHS) {
 }
 
 std::unique_ptr<CoverageMapping> CodeCoverageTool::load() {
-  if (PGOFilename) {
-    for (StringRef ObjectFilename : ObjectFilenames)
-      if (modifiedTimeGT(ObjectFilename, PGOFilename.value()))
-        warning("profile data may be out of date - object is newer",
-                ObjectFilename);
-  }
+  for (StringRef ObjectFilename : ObjectFilenames)
+    if (modifiedTimeGT(ObjectFilename, PGOFilename))
+      warning("profile data may be out of date - object is newer",
+              ObjectFilename);
   auto FS = vfs::getRealFileSystem();
   auto CoverageOrErr = CoverageMapping::load(
       ObjectFilenames, PGOFilename, *FS, CoverageArches,
@@ -670,16 +668,11 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
       "dump-collected-paths", cl::Optional, cl::Hidden,
       cl::desc("Show the collected paths to source files"));
 
-  cl::opt<std::string> PGOFilename(
-      "instr-profile", cl::Optional,
+  cl::opt<std::string, true> PGOFilename(
+      "instr-profile", cl::Required, cl::location(this->PGOFilename),
       cl::desc(
           "File with the profile data obtained after an instrumented run"));
 
-  cl::opt<bool> EmptyProfile(
-      "empty-profile", cl::Optional,
-      cl::desc("Use a synthetic profile with no data to generate "
-               "baseline coverage"));
-
   cl::list<std::string> Arches(
       "arch", cl::desc("architectures of the coverage mapping binaries"));
 
@@ -812,15 +805,6 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
     }
     this->CheckBinaryIDs = CheckBinaryIDs;
 
-    if (!PGOFilename.empty() == EmptyProfile) {
-      error(
-          "exactly one of -instr-profile and -empty-profile must be specified");
-      return 1;
-    }
-    if (!PGOFilename.empty()) {
-      this->PGOFilename = std::make_optional(PGOFilename.getValue());
-    }
-
     if (!CovFilename.empty())
       ObjectFilenames.emplace_back(CovFilename);
     for (const std::string &Filename : CovFilenames)
@@ -1132,22 +1116,20 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
     }
   }
 
-  if (PGOFilename) {
-    sys::fs::file_status Status;
-    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
-      error("could not read profile data!" + EC.message(), PGOFilename.value());
-      return 1;
-    }
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
+    error("could not read profile data!" + EC.message(), PGOFilename);
+    return 1;
+  }
 
-    if (ShowCreatedTime) {
-      auto ModifiedTime = Status.getLastModificationTime();
-      std::string ModifiedTimeStr = to_string(ModifiedTime);
-      size_t found = ModifiedTimeStr.rfind(':');
-      ViewOpts.CreatedTimeStr =
-          (found != std::string::npos)
-              ? "Created: " + ModifiedTimeStr.substr(0, found)
-              : "Created: " + ModifiedTimeStr;
-    }
+  if (ShowCreatedTime) {
+    auto ModifiedTime = Status.getLastModificationTime();
+    std::string ModifiedTimeStr = to_string(ModifiedTime);
+    size_t found = ModifiedTimeStr.rfind(':');
+    ViewOpts.CreatedTimeStr =
+        (found != std::string::npos)
+            ? "Created: " + ModifiedTimeStr.substr(0, found)
+            : "Created: " + ModifiedTimeStr;
   }
 
   auto Coverage = load();
@@ -1256,12 +1238,10 @@ int CodeCoverageTool::doReport(int argc, const char **argv,
     return 1;
   }
 
-  if (PGOFilename) {
-    sys::fs::file_status Status;
-    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
-      error("could not read profile data!" + EC.message(), PGOFilename.value());
-      return 1;
-    }
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
+    error("could not read profile data!" + EC.message(), PGOFilename);
+    return 1;
   }
 
   auto Coverage = load();
@@ -1323,12 +1303,10 @@ int CodeCoverageTool::doExport(int argc, const char **argv,
     return 1;
   }
 
-  if (PGOFilename) {
-    sys::fs::file_status Status;
-    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
-      error("could not read profile data!" + EC.message(), PGOFilename.value());
-      return 1;
-    }
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
+    error("could not read profile data!" + EC.message(), PGOFilename);
+    return 1;
   }
 
   auto Coverage = load();
diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
index c0e99cf80b94..46f881ecddb5 100644
--- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
@@ -277,9 +277,7 @@ struct CoverageMappingTest : ::testing::TestWithParam<std::tuple<bool, bool>> {
       CoverageReaders.push_back(
           std::make_unique<CoverageMappingReaderMock>(Funcs));
     }
-    auto ProfileReaderRef =
-        std::make_optional(std::reference_wrapper(*ProfileReader));
-    return CoverageMapping::load(CoverageReaders, ProfileReaderRef);
+    return CoverageMapping::load(CoverageReaders, *ProfileReader);
   }
 
   Error loadCoverageMapping(bool EmitFilenames = true) {

From 9e23e85d6597bd59ff316a3ce93bb8ec41919b19 Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Sat, 14 Jun 2025 02:10:56 +0900
Subject: [PATCH 0309/1322] [LLD][Cygwin] Implement --dll-search-prefix
 (#143263)

GCC on Cygwin environment invokes linker with passing
`--dll-search-prefix=cyg`.
Implementing this option makes lld-mingw invokable by `gcc -fuse-ld=lld`.

---------

Co-authored-by: jeremyd2019 <github@jdrake.com>
---
 lld/MinGW/Driver.cpp    | 14 ++++++++++----
 lld/MinGW/Options.td    |  2 ++
 lld/test/MinGW/lib.test |  7 +++++++
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp
index 8996293fdfa1..98d48bdfcf31 100644
--- a/lld/MinGW/Driver.cpp
+++ b/lld/MinGW/Driver.cpp
@@ -138,8 +138,9 @@ static std::optional<std::string> findFile(StringRef path1,
 }
 
 // This is for -lfoo. We'll look for libfoo.dll.a or libfoo.a from search paths.
-static std::string
-searchLibrary(StringRef name, ArrayRef<StringRef> searchPaths, bool bStatic) {
+static std::string searchLibrary(StringRef name,
+                                 ArrayRef<StringRef> searchPaths, bool bStatic,
+                                 StringRef prefix) {
   if (name.starts_with(":")) {
     for (StringRef dir : searchPaths)
       if (std::optional<std::string> s = findFile(dir, name.substr(1)))
@@ -160,7 +161,7 @@ searchLibrary(StringRef name, ArrayRef<StringRef> searchPaths, bool bStatic) {
     if (std::optional<std::string> s = findFile(dir, name + ".lib"))
       return *s;
     if (!bStatic) {
-      if (std::optional<std::string> s = findFile(dir, "lib" + name + ".dll"))
+      if (std::optional<std::string> s = findFile(dir, prefix + name + ".dll"))
         return *s;
       if (std::optional<std::string> s = findFile(dir, name + ".dll"))
         return *s;
@@ -554,6 +555,10 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     add("-libpath:" + StringRef(a->getValue()));
   }
 
+  StringRef dllPrefix = "lib";
+  if (auto *arg = args.getLastArg(OPT_dll_search_prefix))
+    dllPrefix = arg->getValue();
+
   StringRef prefix = "";
   bool isStatic = false;
   for (auto *a : args) {
@@ -565,7 +570,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
         add(prefix + StringRef(a->getValue()));
       break;
     case OPT_l:
-      add(prefix + searchLibrary(a->getValue(), searchPaths, isStatic));
+      add(prefix +
+          searchLibrary(a->getValue(), searchPaths, isStatic, dllPrefix));
       break;
     case OPT_whole_archive:
       prefix = "-wholearchive:";
diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td
index 01b01972112a..e6cf48e685b7 100644
--- a/lld/MinGW/Options.td
+++ b/lld/MinGW/Options.td
@@ -79,6 +79,8 @@ defm exclude_symbols: Eq<"exclude-symbols",
     "Exclude symbols from automatic export">, MetaVarName<"<symbol[,symbol,...]>">;
 def export_all_symbols: F<"export-all-symbols">,
     HelpText<"Export all symbols even if a def file or dllexport attributes are used">;
+defm dll_search_prefix:Eq<"dll-search-prefix", "Specify DLL prefix instead of 'lib'">,
+    MetaVarName<"<dll_search_prefix>">;
 defm fatal_warnings: B<"fatal-warnings",
     "Treat warnings as errors",
     "Do not treat warnings as errors (default)">;
diff --git a/lld/test/MinGW/lib.test b/lld/test/MinGW/lib.test
index 8bd8a0e9304d..ac002e7549b4 100644
--- a/lld/test/MinGW/lib.test
+++ b/lld/test/MinGW/lib.test
@@ -5,6 +5,7 @@ LIB1: unable to find library -lfoo
 
 RUN: echo > %t/lib/libfoo.dll.a
 RUN: ld.lld -### -m i386pep -lfoo -L%t/lib 2>&1 | FileCheck -check-prefix=LIB2 %s
+RUN: ld.lld -### -m i386pep -lfoo --dll-search-prefix=cyg -L%t/lib 2>&1 | FileCheck -check-prefix=LIB2 %s
 LIB2: libfoo.dll.a
 
 RUN: not ld.lld -### -m i386pep -l:barefilename -L%t/lib 2>&1 | FileCheck -check-prefix=LIB-LITERAL-FAIL %s
@@ -22,6 +23,7 @@ LIB3: unable to find library -lfoo
 
 RUN: echo > %t/lib/libfoo.a
 RUN: ld.lld -### -m i386pep -Bstatic -lfoo -L%t/lib 2>&1 | FileCheck -check-prefix=LIB4 %s
+RUN: ld.lld -### -m i386pep -Bstatic -lfoo --dll-search-prefix=cyg -L%t/lib 2>&1 | FileCheck -check-prefix=LIB4 %s
 LIB4: libfoo.a
 
 RUN: echo > %t/lib/libbar.dll.a
@@ -46,12 +48,17 @@ MSVCSTYLE: msvcstyle.lib
 
 RUN: echo > %t/lib/libnoimplib.dll
 RUN: echo > %t/lib/noprefix_noimplib.dll
+RUN: echo > %t/lib/cygnoimplib2.dll
 RUN: ld.lld -### -m i386pep -L%t/lib -lnoimplib 2>&1 | FileCheck -check-prefix=DLL1 %s
 RUN: ld.lld -### -m i386pep -L%t/lib -lnoprefix_noimplib 2>&1 | FileCheck -check-prefix=DLL2 %s
+RUN: ld.lld -### -m i386pep -L%t/lib -lnoimplib2 --dll-search-prefix=cyg 2>&1 | FileCheck -check-prefix=DLL3 %s
 DLL1: libnoimplib.dll
 DLL2: noprefix_noimplib.dll
+DLL3: cygnoimplib2.dll
 
 RUN: not ld.lld -### -m i386pep -L%t/lib -static -lnoimplib 2>&1 | FileCheck -check-prefix=ERROR-NOIMPLIB %s
 RUN: not ld.lld -### -m i386pep -L%t/lib -static -lnoprefix_noimplib 2>&1 | FileCheck -check-prefix=ERROR-NOPREFIX-NOIMPLIB %s
+RUN: not ld.lld -### -m i386pep -L%t/lib -static -lnoimplib2 --dll-search-prefix=cyg 2>&1 | FileCheck -check-prefix=ERROR-CYG-NOIMPLIB %s
 ERROR-NOIMPLIB: unable to find library -lnoimplib
 ERROR-NOPREFIX-NOIMPLIB: unable to find library -lnoprefix_noimplib
+ERROR-CYG-NOIMPLIB: unable to find library -lnoimplib2

From 1072196c2737fcf921ad52e9a44c13423789111b Mon Sep 17 00:00:00 2001
From: Tai Ly <tai.ly@arm.com>
Date: Fri, 13 Jun 2025 12:12:25 -0500
Subject: [PATCH 0310/1322] [tosa] Add duplicate indices check for Scatter
 (#143736)

Tosa scatter operator disallow duplicate indices (per batch)
This patch adds, to the validation pass, checking for duplicate values
in scatter operator's constant indices values.

Signed-off-by: Tai Ly <tai.ly@arm.com>
---
 .../mlir/Dialect/Tosa/Utils/ConversionUtils.h |  5 ++++
 .../Tosa/Transforms/TosaValidation.cpp        | 28 ++++++++++++++++++-
 .../Dialect/Tosa/Utils/ConversionUtils.cpp    | 27 ++++++++++++++++++
 mlir/test/Dialect/Tosa/invalid.mlir           | 10 +++++++
 4 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
index 096510a09e32..6f3b0916a7a6 100644
--- a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
+++ b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
@@ -243,6 +243,11 @@ bool getConstShapeValues(Operation *op,
 // returns a small vector of int64_t values that attr contains
 SmallVector<int64_t> convertFromIntAttr(const DenseElementsAttr &attr,
                                         const int rank);
+
+// returns true iff constant indices for scatter op contains unique indices
+// per batch
+bool hasUniqueConstantScatterIndices(ShapedType indicesType,
+                                     DenseIntElementsAttr indicesAttr);
 } // namespace tosa
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index d33fc902de3a..229f42d3178b 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -1244,10 +1244,36 @@ bool checkErrorIfCondIf(Operation *op) {
   return true;
 }
 
+bool checkErrorIfScatter(Operation *op) {
+  auto scatterOp = dyn_cast<tosa::ScatterOp>(op);
+  if (!scatterOp)
+    return true;
+
+  // for constant indices, check that there are no duplicate values
+  DenseIntElementsAttr indicesAttr;
+  if (!matchPattern(scatterOp.getIndices(), m_Constant(&indicesAttr)))
+    return true;
+
+  auto const indicesType =
+      dyn_cast<ShapedType>(scatterOp.getIndices().getType());
+  if (!indicesType || !indicesType.hasRank()) {
+    op->emitOpError("expect ranked indices tensor");
+    return false;
+  }
+
+  if (!hasUniqueConstantScatterIndices(indicesType, indicesAttr)) {
+    op->emitOpError("indices values contain duplicates");
+    return false;
+  }
+
+  return true;
+}
+
 LogicalResult TosaValidation::applyErrorIfCheck(Operation *op) {
   if (!checkErrorIfResize(op) || !checkErrorIfMul(op) ||
       !checkErrorIfTable(op) || !checkErrorIfRescale(op) ||
-      !checkErrorIfPad(op) || !checkErrorIfCondIf(op))
+      !checkErrorIfPad(op) || !checkErrorIfCondIf(op) ||
+      !checkErrorIfScatter(op))
     return failure();
   return success();
 }
diff --git a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
index e1b3be74b50f..9844abcc34cb 100644
--- a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
+++ b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
@@ -213,3 +213,30 @@ mlir::tosa::convertFromIntAttr(const DenseElementsAttr &attr, const int rank) {
   }
   return {};
 }
+
+bool mlir::tosa::hasUniqueConstantScatterIndices(
+    ShapedType indicesType, DenseIntElementsAttr indicesAttr) {
+  llvm::ArrayRef<int64_t> const indicesShape = indicesType.getShape();
+  const unsigned int indicesRank = indicesShape.size();
+  const unsigned int lastDimSize = indicesShape[indicesRank - 1];
+
+  // check each batch of indices from the flat indicesAttr values
+  // for duplicates
+  auto const indicesValues = indicesAttr.getValues<int32_t>();
+  assert(
+      (indicesValues.size() % lastDimSize == 0) &&
+      "Constant indices data length should be a multiple of indicesShape[-1]");
+
+  std::vector<uint64_t> indices(lastDimSize);
+  for (auto beg = indicesValues.begin(); beg < indicesValues.end();
+       beg += lastDimSize) {
+    std::copy(beg, beg + lastDimSize, indices.begin());
+    std::sort(indices.begin(), indices.end());
+    if (std::adjacent_find(indices.begin(), indices.end()) != indices.end()) {
+      // found duplicate values in indices in batch
+      return false;
+    }
+  }
+
+  return true;
+}
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index a4617fc6fba8..805522799a6d 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -2015,3 +2015,13 @@ func.func @test_rescale_output_unsigned(%arg0: tensor<1x1xi8>) -> (tensor<1x1xui
   %r = tosa.rescale %arg0, %1, %0, %3, %2 {input_unsigned = false, output_unsigned = true, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<1x1xi8>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x1xui8>
   return %r : tensor<1x1xui8>
 }
+
+// -----
+
+// CHECK-LABEL: test_scatter_duplicate_indices
+func.func @test_scatter_duplicate_indices(%arg0: tensor<2x52x3xf32>, %arg2: tensor<2x12x3xf32>) -> tensor<2x52x3xf32> {
+  %indices = "tosa.const"() { values = dense<[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 3, 11, 12]]> : tensor<2x12xi32> } : () -> tensor<2x12xi32>
+  // expected-error@+1 {{'tosa.scatter' op indices values contain duplicates}}
+  %0 = tosa.scatter %arg0, %indices, %arg2 : (tensor<2x52x3xf32>, tensor<2x12xi32>, tensor<2x12x3xf32>) -> tensor<2x52x3xf32>
+  return %0 : tensor<2x52x3xf32>
+}

From b81d5e06c7cba8c9f1f5380daed4b9ee139214ba Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Fri, 13 Jun 2025 18:25:07 +0100
Subject: [PATCH 0311/1322] [InstCombine] Fold shuffles through all trivially
 vectorizable intrinsics (#141979)

This addresses a TODO in foldShuffledIntrinsicOperands to use
isTriviallyVectorizable instead of a hardcoded list of intrinsics, which
in turn allows more intriniscs to be scalarized by VectorCombine.

From what I can tell every intrinsic here should be speculatable so an
assertion was added.

Because this enables intrinsics like abs which have a scalar operand, we
need to also check isVectorIntrinsicWithScalarOpAtArg.
---
 .../InstCombine/InstCombineCalls.cpp          | 52 +++++++++++--------
 llvm/test/Transforms/InstCombine/abs-1.ll     | 11 ++++
 llvm/test/Transforms/InstCombine/fma.ll       | 13 +++++
 .../InstCombine/minmax-intrinsics.ll          | 15 ++++++
 llvm/test/Transforms/InstCombine/powi.ll      | 26 ++++++++++
 llvm/test/Transforms/InstCombine/scmp.ll      | 13 +++++
 llvm/test/Transforms/InstCombine/sqrt.ll      | 11 ++++
 .../AMDGPU/add_sub_sat-inseltpoison.ll        | 30 ++++-------
 .../SLPVectorizer/AMDGPU/add_sub_sat.ll       | 30 ++++-------
 .../X86/alternate-calls-inseltpoison.ll       | 36 ++++++-------
 .../SLPVectorizer/X86/alternate-calls.ll      | 36 ++++++-------
 11 files changed, 175 insertions(+), 98 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c169ab25b210..8c8cc0859e4a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1400,42 +1400,46 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) {
 /// try to shuffle after the intrinsic.
 Instruction *
 InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
-  // TODO: This should be extended to handle other intrinsics like fshl, ctpop,
-  //       etc. Use llvm::isTriviallyVectorizable() and related to determine
-  //       which intrinsics are safe to shuffle?
-  switch (II->getIntrinsicID()) {
-  case Intrinsic::smax:
-  case Intrinsic::smin:
-  case Intrinsic::umax:
-  case Intrinsic::umin:
-  case Intrinsic::fma:
-  case Intrinsic::fshl:
-  case Intrinsic::fshr:
-    break;
-  default:
+  if (!isTriviallyVectorizable(II->getIntrinsicID()) ||
+      !II->getCalledFunction()->isSpeculatable())
+    return nullptr;
+
+  // fabs is canonicalized to fabs (shuffle ...) in foldShuffleOfUnaryOps, so
+  // avoid undoing it.
+  if (match(II, m_FAbs(m_Value())))
     return nullptr;
-  }
 
   Value *X;
   Constant *C;
   ArrayRef<int> Mask;
-  auto *NonConstArg = find_if_not(II->args(), IsaPred<Constant>);
+  auto *NonConstArg = find_if_not(II->args(), [&II](Use &Arg) {
+    return isa<Constant>(Arg.get()) ||
+           isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
+                                              Arg.getOperandNo(), nullptr);
+  });
   if (!NonConstArg ||
       !match(NonConstArg, m_Shuffle(m_Value(X), m_Poison(), m_Mask(Mask))))
     return nullptr;
 
-  // At least 1 operand must have 1 use because we are creating 2 instructions.
-  if (none_of(II->args(), [](Value *V) { return V->hasOneUse(); }))
+  // At least 1 operand must be a shuffle with 1 use because we are creating 2
+  // instructions.
+  if (none_of(II->args(), [](Value *V) {
+        return isa<ShuffleVectorInst>(V) && V->hasOneUse();
+      }))
     return nullptr;
 
   // See if all arguments are shuffled with the same mask.
   SmallVector<Value *, 4> NewArgs;
   Type *SrcTy = X->getType();
-  for (Value *Arg : II->args()) {
-    if (match(Arg, m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
-        X->getType() == SrcTy)
+  for (Use &Arg : II->args()) {
+    if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
+                                           Arg.getOperandNo(), nullptr))
+      NewArgs.push_back(Arg);
+    else if (match(&Arg,
+                   m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
+             X->getType() == SrcTy)
       NewArgs.push_back(X);
-    else if (match(Arg, m_ImmConstant(C))) {
+    else if (match(&Arg, m_ImmConstant(C))) {
       // If it's a constant, try find the constant that would be shuffled to C.
       if (Constant *ShuffledC =
               unshuffleConstant(Mask, C, cast<VectorType>(SrcTy)))
@@ -1448,8 +1452,12 @@ InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
 
   // intrinsic (shuf X, M), (shuf Y, M), ... --> shuf (intrinsic X, Y, ...), M
   Instruction *FPI = isa<FPMathOperator>(II) ? II : nullptr;
+  // Result type might be a different vector width.
+  // TODO: Check that the result type isn't widened?
+  VectorType *ResTy =
+      VectorType::get(II->getType()->getScalarType(), cast<VectorType>(SrcTy));
   Value *NewIntrinsic =
-      Builder.CreateIntrinsic(II->getIntrinsicID(), SrcTy, NewArgs, FPI);
+      Builder.CreateIntrinsic(ResTy, II->getIntrinsicID(), NewArgs, FPI);
   return new ShuffleVectorInst(NewIntrinsic, Mask);
 }
 
diff --git a/llvm/test/Transforms/InstCombine/abs-1.ll b/llvm/test/Transforms/InstCombine/abs-1.ll
index 7037647d116b..fd67fc342149 100644
--- a/llvm/test/Transforms/InstCombine/abs-1.ll
+++ b/llvm/test/Transforms/InstCombine/abs-1.ll
@@ -978,3 +978,14 @@ define i32 @abs_diff_signed_slt_no_nsw_swap(i32 %a, i32 %b) {
   %cond = select i1 %cmp, i32 %sub_ba, i32 %sub_ab
   ret i32 %cond
 }
+
+define <2 x i32> @abs_unary_shuffle_ops(<2 x i32> %x) {
+; CHECK-LABEL: @abs_unary_shuffle_ops(
+; CHECK-NEXT:    [[R2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[R1:%.*]], i1 false)
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[R2]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %a = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+  %r = call <2 x i32> @llvm.abs(<2 x i32> %a, i1 false)
+  ret <2 x i32> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/fma.ll b/llvm/test/Transforms/InstCombine/fma.ll
index f0d4f776a5d9..e3d3e722bcc2 100644
--- a/llvm/test/Transforms/InstCombine/fma.ll
+++ b/llvm/test/Transforms/InstCombine/fma.ll
@@ -972,6 +972,19 @@ define <2 x half> @fma_negone_vec_partial_undef(<2 x half> %x, <2 x half> %y) {
   ret <2 x half> %sub
 }
 
+define <2 x float> @fmuladd_unary_shuffle_ops(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
+; CHECK-LABEL: @fmuladd_unary_shuffle_ops(
+; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x float> [[R1]]
+;
+  %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %b = shufflevector <2 x float> %y, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %c = shufflevector <2 x float> %z, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+  ret <2 x float> %r
+}
+
 ; negative tests
 
 define half @fma_non_negone(half %x, half %y) {
diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
index 38930956eda2..52bc3636be35 100644
--- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
@@ -2511,6 +2511,21 @@ define <3 x i8> @smin_unary_shuffle_ops_uses(<3 x i8> %x, <3 x i8> %y) {
   ret <3 x i8> %r
 }
 
+; negative test - too many uses
+
+define <3 x i8> @smin_unary_shuffle_ops_uses_const(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @smin_unary_shuffle_ops_uses_const(
+; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; CHECK-NEXT:    call void @use_vec(<3 x i8> [[SX]])
+; CHECK-NEXT:    [[R:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[SX]], <3 x i8> <i8 1, i8 2, i8 3>)
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  call void @use_vec(<3 x i8> %sx)
+  %r = call <3 x i8> @llvm.smin.v3i8(<3 x i8> %sx, <3 x i8> <i8 1, i8 2, i8 3>)
+  ret <3 x i8> %r
+}
+
 ; This would assert/crash because we tried to zext to i1.
 
 @g = external dso_local global i32, align 4
diff --git a/llvm/test/Transforms/InstCombine/powi.ll b/llvm/test/Transforms/InstCombine/powi.ll
index d76f92c1849a..422792a5a2c2 100644
--- a/llvm/test/Transforms/InstCombine/powi.ll
+++ b/llvm/test/Transforms/InstCombine/powi.ll
@@ -564,3 +564,29 @@ define double @powi_fmul_powi_x_overflow(double noundef %x) {
   %mul = fmul reassoc double %p1, %x
   ret double %mul
 }
+
+define <3 x float> @powi_unary_shuffle_ops(<3 x float> %x, i32 %power) {
+; CHECK-LABEL: @powi_unary_shuffle_ops(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> [[X:%.*]], i32 [[POWER:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %sx = shufflevector <3 x float> %x, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  %r = call <3 x float> @llvm.powi(<3 x float> %sx, i32 %power)
+  ret <3 x float> %r
+}
+
+; Negative test - multiple uses
+
+define <3 x float> @powi_unary_shuffle_ops_use(<3 x float> %x, i32 %power, ptr %p) {
+; CHECK-LABEL: @powi_unary_shuffle_ops_use(
+; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x float> [[X:%.*]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; CHECK-NEXT:    store <3 x float> [[SX]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> [[SX]], i32 [[POWER:%.*]])
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %sx = shufflevector <3 x float> %x, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  store <3 x float> %sx, ptr %p
+  %r = call <3 x float> @llvm.powi(<3 x float> %sx, i32 %power)
+  ret <3 x float> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll
index 2140a59de3fa..2bf22aeb7a6e 100644
--- a/llvm/test/Transforms/InstCombine/scmp.ll
+++ b/llvm/test/Transforms/InstCombine/scmp.ll
@@ -423,6 +423,19 @@ define i8 @scmp_from_select_eq_and_gt_commuted3(i32 %x, i32 %y) {
   ret i8 %r
 }
 
+define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: define <3 x i2> @scmp_unary_shuffle_ops(
+; CHECK-SAME: <3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x i2> @llvm.scmp.v3i2.v3i8(<3 x i8> [[X]], <3 x i8> [[Y]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i2> [[TMP1]], <3 x i2> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; CHECK-NEXT:    ret <3 x i2> [[R]]
+;
+  %sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  %sy = shufflevector <3 x i8> %y, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  %r = call <3 x i2> @llvm.scmp(<3 x i8> %sx, <3 x i8> %sy)
+  ret <3 x i2> %r
+}
+
 ; Negative test: true value of outer select is not zero
 define i8 @scmp_from_select_eq_and_gt_neg1(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_neg1(
diff --git a/llvm/test/Transforms/InstCombine/sqrt.ll b/llvm/test/Transforms/InstCombine/sqrt.ll
index 0f4db3b3a65a..2fda5bc37d02 100644
--- a/llvm/test/Transforms/InstCombine/sqrt.ll
+++ b/llvm/test/Transforms/InstCombine/sqrt.ll
@@ -201,6 +201,17 @@ define <2 x float> @sqrt_exp_vec(<2 x float> %x) {
   ret <2 x float> %res
 }
 
+define <2 x float> @sqrt_unary_shuffle_ops(<2 x float> %x) {
+; CHECK-LABEL: @sqrt_unary_shuffle_ops(
+; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x float> [[R1]]
+;
+  %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %r = call <2 x float> @llvm.sqrt(<2 x float> %a)
+  ret <2 x float> %r
+}
+
 declare i32 @foo(double)
 declare double @sqrt(double) readnone
 declare float @sqrtf(float)
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index 3749bdf1bba3..783a1e83c672 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -264,11 +264,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -276,11 +273,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 0bb641371825..7e31ec9a0b39 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -264,11 +264,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -276,11 +273,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
index b790e6f3c99c..77d36f010766 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -38,13 +38,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -59,13 +59,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP8]])
+; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -80,13 +80,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
index ef1a67032c23..18d79752b0b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -38,13 +38,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -59,13 +59,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP8]])
+; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -80,13 +80,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>

From c609112a5383c10272e3afceedd4d03f26437cf0 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Fri, 13 Jun 2025 10:25:26 -0700
Subject: [PATCH 0312/1322] Fix/reapply "[libc] Migrate stdio tests to
 ErrnoCheckingTest." (#143972)

This reverts commit a93e55e57ed00a55f822c64e3520c7c732b58480 and fixes
build and test failures:

* Proper include added to setvbuf_test.cpp
* fgetc/fgetc_unlocked/fgets tests are ported to ErrnoSetterMatcher and
are made more precise. This fixes inconsistencies between expectations
in regular and GPU builds - ErrnoSetterMatcher is configured to omit
errno matching on GPUs, as fgetc implementation on GPU doesn't set
errno, in contrast to Linux.
---
 libc/test/src/stdio/CMakeLists.txt           | 13 ++++++++++++
 libc/test/src/stdio/fdopen_test.cpp          | 14 ++++++-------
 libc/test/src/stdio/fgetc_test.cpp           | 22 +++++++++++---------
 libc/test/src/stdio/fgetc_unlocked_test.cpp  | 22 +++++++++++---------
 libc/test/src/stdio/fgets_test.cpp           | 18 +++++++++-------
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++-------------
 libc/test/src/stdio/fopencookie_test.cpp     | 15 +++++++------
 libc/test/src/stdio/remove_test.cpp          | 10 ++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 ++++----
 libc/test/src/stdio/setvbuf_test.cpp         |  9 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 +++----
 libc/test/src/stdlib/StrtolTest.h            |  1 -
 libc/test/src/stdlib/strtold_test.cpp        |  1 -
 13 files changed, 84 insertions(+), 77 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index ce2171f19597..4aa8b9588001 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,6 +20,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -68,6 +69,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -88,6 +90,7 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -109,6 +112,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -438,6 +442,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -452,6 +457,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -468,6 +474,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -488,6 +495,8 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -510,6 +519,8 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -527,6 +538,8 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index 104fc478b100..b53184c30be3 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,20 +9,21 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -52,8 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,8 +64,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -83,7 +82,6 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
-  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 56bde5f0099a..be2e50271b51 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,12 +14,15 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -27,29 +30,28 @@ public:
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+                Succeeds(WRITE_SIZE));
     // This is a write-only file so reads should fail.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Fails(EBADF, EOF));
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      int c = func(file);
-      ASSERT_EQ(c, int('1' + i));
+      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Succeeds(EOF));
     ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
   }
 };
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 90429ecf4e82..bef9dafd3d87 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,12 +17,15 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -30,31 +33,30 @@ public:
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+                Succeeds(WRITE_SIZE));
     // This is a write-only file so reads should fail.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Fails(EBADF, EOF));
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     LIBC_NAMESPACE::flockfile(file);
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      int c = func(file);
-      ASSERT_EQ(c, int('1' + i));
+      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Succeeds(EOF));
     ASSERT_NE(LIBC_NAMESPACE::feof_unlocked(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(file), 0);
 
     LIBC_NAMESPACE::funlockfile(file);
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
   }
 };
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index abed3d405293..ca8d4d454663 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,11 +12,14 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
 
-TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -29,15 +32,15 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   char buff[8];
   char *output;
 
-  ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+  ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+              Succeeds(WRITE_SIZE));
   // This is a write-only file so reads should fail.
-  ASSERT_TRUE(LIBC_NAMESPACE::fgets(buff, 8, file) == nullptr);
+  ASSERT_THAT(LIBC_NAMESPACE::fgets(buff, 8, file), Fails(EBADF, nullptr));
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
-  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
   file = LIBC_NAMESPACE::fopen(FILENAME, "r");
   ASSERT_FALSE(file == nullptr);
@@ -55,6 +58,7 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is also implementation defined.
   output = LIBC_NAMESPACE::fgets(buff, 0, file);
   ASSERT_TRUE(output == nullptr);
+  ASSERT_ERRNO_SUCCESS();
 #endif
 
   const char *output_arr[] = {
@@ -86,5 +90,5 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_ERRNO_SUCCESS();
 
-  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 }
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e624181c795b..e097785832d5 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,17 +17,18 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST(LlvmLibcFILETest, SimpleFileOperations) {
+TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -41,7 +42,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,7 +72,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -80,15 +79,12 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -103,10 +99,8 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -121,21 +115,18 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST(LlvmLibcFILETest, FFlush) {
+TEST_F(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -156,7 +147,7 @@ TEST(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -165,7 +156,6 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
-  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index 03e1ac286b64..bcf5e674141a 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,6 +15,7 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -22,6 +23,7 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
+using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -88,7 +90,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -115,7 +117,6 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -124,7 +125,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -149,7 +150,6 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,7 +178,6 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -192,7 +191,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -223,7 +222,7 @@ TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 84984e26398c..296bff1f5dc1 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,16 +11,17 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -36,10 +37,9 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index ac494a4ecaf8..135fb98c07fb 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,18 +8,19 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
+using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -40,7 +41,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRenameTest, RenameNonExistent) {
+TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 5872943c1bb4..a0936ba79ef7 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,12 +11,14 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
+using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -52,7 +54,7 @@ TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -102,6 +104,5 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
-  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index 5d482b70064b..e99b382d1211 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,11 +15,12 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 
-TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -36,7 +37,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,7 +57,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 3eeccc5727e7..03f0a6539c78 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,7 +9,6 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index c2f2b9c9a11c..eb4056dc7ba6 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 493c1612d6f8f7a40d0bf0ba28fb753be83fac1c Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Fri, 13 Jun 2025 13:26:26 -0400
Subject: [PATCH 0313/1322] [SPIRV] Fix ExecutionMode_fragment.ll test
 (#144116)

Fix test broken by https://github.com/llvm/llvm-project/pull/143412.
---
 llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
index 4fa764fe192d..aab0ae05753f 100644
--- a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
+++ b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
@@ -4,17 +4,16 @@
 ; CHECK-DAG: OpEntryPoint Fragment %[[#entry:]] "main" {{.*}}
 ; CHECK-DAG: OpExecutionMode %[[#entry]] OriginUpperLeft
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
 
 define void @main() #0 {
 entry:
-  %0 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %0 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
   %1 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1) %0, i32 0)
   store i32 1, ptr addrspace(11) %1, align 4
 
   ret void
 }
 
-declare target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32, i32, i32, i32, i1) #1
-
 attributes #0 = { "hlsl.shader"="pixel" }
 attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }

From fd432151a607a997c417f32cb70650fc7728629a Mon Sep 17 00:00:00 2001
From: William Huynh <113542065+saturn691@users.noreply.github.com>
Date: Fri, 13 Jun 2025 18:26:40 +0100
Subject: [PATCH 0314/1322] [libc] Fix bugs found when testing with all headers
 (#144049)

Fixes a couple of bugs found when building. The PR to enable the headers
can be found here: #144114.

- math.yaml: float128 guard
- wchar.yaml: __restrict keyword order
---
 libc/include/math.yaml                        |  2 +-
 libc/include/wchar.yaml                       | 20 +++++++++----------
 .../src/stdio/printf_core/converter_test.cpp  |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libc/include/math.yaml b/libc/include/math.yaml
index 466c08ade6fc..11bead074595 100644
--- a/libc/include/math.yaml
+++ b/libc/include/math.yaml
@@ -734,7 +734,7 @@ functions:
       - type: float128
       - type: float128
       - type: float128
-    guards: LIBC_TYPES_HAS_FLOAT128
+    guard: LIBC_TYPES_HAS_FLOAT128
   - name: ffmal
     standards:
       - stdc
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 1af15a6c112b..84db73d8f01e 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -109,8 +109,8 @@ functions:
       - stdc
     return_type: wchar_t *
     arguments: 
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict 
+      - type: const wchar_t *__restrict
       - type: size_t
   - name: wmemmove
     standards:
@@ -125,16 +125,16 @@ functions:
       - stdc
     return_type: wchar_t *
     arguments:
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict
+      - type: const wchar_t *__restrict
       - type: size_t
   - name: wcscat
     standards:
       - stdc
     return_type: wchar_t *
     arguments: 
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict
+      - type: const wchar_t *__restrict
   - name: wcsstr
     standards:
       - stdc
@@ -147,13 +147,13 @@ functions:
       - stdc
     return_type: wchar_t *
     arguments:
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict
+      - type: const wchar_t *__restrict
       - type: size_t
   - name: wcscpy
     standards:
       - stdc
     return_type: wchar_t *
     arguments:
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict
+      - type: const wchar_t *__restrict
diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp
index 96a00ae598ec..bf088937e410 100644
--- a/libc/test/src/stdio/printf_core/converter_test.cpp
+++ b/libc/test/src/stdio/printf_core/converter_test.cpp
@@ -124,7 +124,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) {
 TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) {
   LIBC_NAMESPACE::printf_core::FormatSection high_precision_conv;
   high_precision_conv.has_conv = true;
-  high_precision_conv.raw_string = "%4s";
+  high_precision_conv.raw_string = "%.4s";
   high_precision_conv.conv_name = 's';
   high_precision_conv.precision = 4;
   high_precision_conv.conv_val_ptr = const_cast<char *>("456");

From 9a3082276d21873a37925d0c6ad89bd28d065cea Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 10:28:03 -0700
Subject: [PATCH 0315/1322] [CIR][NFC] Fix forrange.cpp test (#144123)

A recent change has cause the begin and end iterators in the
forrange.cpp CIR codegen test to be marked as 'init' causing the test to
fail. This change fixes the checks in the test.
---
 clang/test/CIR/CodeGen/forrange.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/CIR/CodeGen/forrange.cpp b/clang/test/CIR/CodeGen/forrange.cpp
index 6b6ccc79e59d..45e146e9091d 100644
--- a/clang/test/CIR/CodeGen/forrange.cpp
+++ b/clang/test/CIR/CodeGen/forrange.cpp
@@ -115,8 +115,8 @@ void for_range3() {
 // CIR:    %[[C_ADDR:.*]] = cir.alloca !rec_C3{{.*}} ["c"]
 // CIR:    cir.scope {
 // CIR:      %[[RANGE_ADDR:.*]] = cir.alloca !cir.ptr<!rec_C3>{{.*}} ["__range1", init, const]
-// CIR:      %[[BEGIN_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr<!rec_Iterator>{{.*}} ["__begin1"]
-// CIR:      %[[END_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr<!rec_Iterator>{{.*}} ["__end1"]
+// CIR:      %[[BEGIN_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr<!rec_Iterator>{{.*}} ["__begin1", init]
+// CIR:      %[[END_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr<!rec_Iterator>{{.*}} ["__end1", init]
 // CIR:      %[[E_ADDR:.*]] = cir.alloca !cir.ptr<!rec_Element>{{.*}} ["e", init, const]
 // CIR:      cir.store{{.*}} %[[C_ADDR]], %[[RANGE_ADDR]]
 // CIR:      cir.for : cond {

From 62eea86424c4eacd38ad8a03f4bdae78687e3ade Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Fri, 13 Jun 2025 19:29:21 +0200
Subject: [PATCH 0316/1322] [CIR] Update isSized with upstreamed types
 (#143960)

Update `isSized` function with the upstreamed types
---
 clang/lib/CIR/CodeGen/CIRGenBuilder.h |  5 +++--
 clang/lib/CIR/CodeGen/CIRGenTypes.cpp | 13 +++++++++++--
 clang/test/CIR/CodeGen/array.cpp      | 23 +++++++++++++++++++++++
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 36c89809b4d9..a4bc69619d60 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -139,8 +139,9 @@ public:
   }
 
   bool isSized(mlir::Type ty) {
-    if (mlir::isa<cir::PointerType, cir::ArrayType, cir::BoolType,
-                  cir::IntType>(ty))
+    if (mlir::isa<cir::PointerType, cir::ArrayType, cir::BoolType, cir::IntType,
+                  cir::CIRFPTypeInterface, cir::ComplexType, cir::RecordType>(
+            ty))
       return true;
 
     if (const auto vt = mlir::dyn_cast<cir::VectorType>(ty))
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index eaba3dfd1105..bab47924dd71 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -419,6 +419,15 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
   case Type::ConstantArray: {
     const ConstantArrayType *arrTy = cast<ConstantArrayType>(ty);
     mlir::Type elemTy = convertTypeForMem(arrTy->getElementType());
+
+    // TODO(CIR): In LLVM, "lower arrays of undefined struct type to arrays of
+    // i8 just to have a concrete type"
+    if (!builder.isSized(elemTy)) {
+      cgm.errorNYI(SourceLocation(), "arrays of undefined struct type", type);
+      resultType = cgm.UInt32Ty;
+      break;
+    }
+
     resultType = cir::ArrayType::get(elemTy, arrTy->getSize().getZExtValue());
     break;
   }
@@ -432,8 +441,8 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
   }
 
   case Type::Enum: {
-    const EnumDecl *ED = cast<EnumType>(ty)->getDecl();
-    if (auto integerType = ED->getIntegerType(); !integerType.isNull())
+    const EnumDecl *ed = cast<EnumType>(ty)->getDecl();
+    if (auto integerType = ed->getIntegerType(); !integerType.isNull())
       return convertType(integerType);
     // Return a placeholder 'i32' type.  This can be changed later when the
     // type is defined (see UpdateCompletedType), but is likely to be the
diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp
index 7b90c1682ec4..26e172a00645 100644
--- a/clang/test/CIR/CodeGen/array.cpp
+++ b/clang/test/CIR/CodeGen/array.cpp
@@ -473,3 +473,26 @@ void func10(int *a) {
 // OGCG:  %[[ELE:.*]] = getelementptr inbounds i32, ptr %[[TMP_1]], i64 5
 // OGCG:  %[[TMP_2:.*]] = load i32, ptr %[[ELE]], align 4
 // OGCG:  store i32 %[[TMP_2]], ptr %[[INIT]], align 4
+
+void func11() { int _Complex a[4]; }
+
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.complex<!s32i> x 4>, !cir.ptr<!cir.array<!cir.complex<!s32i> x 4>>, ["a"]
+
+// LLVM: %[[ARR:.*]] = alloca [4 x { i32, i32 }], i64 1, align 16
+
+// OGCG: %[[ARR:.*]] = alloca [4 x { i32, i32 }], align 16
+
+void func12() {
+  struct Point {
+    int x;
+    int y;
+  };
+
+  Point a[4];
+}
+
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!rec_Point x 4>, !cir.ptr<!cir.array<!rec_Point x 4>>, ["a"]
+
+// LLVM: %[[ARR:.*]] = alloca [4 x %struct.Point], i64 1, align 16
+
+// OGCG: %[[ARR:.*]] = alloca [4 x %struct.Point], align 16

From ec330cf6701793525da9eb471e7ff796938ab54a Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 10:31:35 -0700
Subject: [PATCH 0317/1322] [bazel] Update llvm-config.h and disable DebugLoc
 tracking (#144125)

In c588224ca797886064a7a79f6c0114a6963c325e, @chapuni set
LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING to 1, but from what I can tell,
this is not the default setting for CMake builds. I think the intention
was mostly just to update llvm-config.h to fix the Bazel build.

I'm adding LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING as well to fix the build
for the same purpose.
---
 .../llvm/include/llvm/Config/llvm-config.h                  | 6 +++++-
 utils/bazel/llvm_configs/llvm-config.h.cmake                | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
index 97626d466509..5dd53cffb7bd 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
@@ -132,6 +132,10 @@
 
 /* Define to 1 to enable expensive checks for debug location coverage checking,
    and to 0 otherwise. */
-#define LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1
+#define LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0
+
+/* Define to 1 to enable expensive tracking of the origin of debug location
+   coverage bugs, and to 0 otherwise. */
+#define LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 0
 
 #endif
diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake
index dbc882937b4f..6d3c37cc8b19 100644
--- a/utils/bazel/llvm_configs/llvm-config.h.cmake
+++ b/utils/bazel/llvm_configs/llvm-config.h.cmake
@@ -133,4 +133,8 @@
    and to 0 otherwise. */
 #cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 
+/* Define to 1 to enable expensive tracking of the origin of debug location
+   coverage bugs, and to 0 otherwise. */
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+
 #endif

From 51689c9df2fbb81aab1ff802f3efb86cac926853 Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Fri, 13 Jun 2025 10:31:47 -0700
Subject: [PATCH 0318/1322] [libc][NFC] clean internal fd handling (#143991)

The previous internal fcntl implementation modified errno directly, this
patch fixes that. This patch also moves open and close into OSUtil since
they are used in multiple places. There are more places that need
similar cleanup but only got comments in this patch to keep it
relatively reviewable.

Related to: https://github.com/llvm/llvm-project/issues/143937
---
 libc/src/__support/File/linux/file.cpp        | 14 ++--
 libc/src/__support/OSUtil/fcntl.h             |  8 +-
 .../src/__support/OSUtil/linux/CMakeLists.txt |  1 -
 libc/src/__support/OSUtil/linux/fcntl.cpp     | 83 ++++++++++++-------
 libc/src/fcntl/linux/CMakeLists.txt           |  1 +
 libc/src/fcntl/linux/fcntl.cpp                | 10 ++-
 libc/src/fcntl/linux/open.cpp                 | 24 ++----
 libc/src/sys/auxv/linux/getauxval.cpp         | 37 ++++++---
 libc/src/sys/mman/linux/shm_common.h          |  5 ++
 libc/src/sys/mman/linux/shm_open.cpp          | 16 +++-
 libc/src/sys/mman/linux/shm_unlink.cpp        |  9 +-
 libc/src/unistd/linux/close.cpp               | 12 +--
 .../llvm-project-overlay/libc/BUILD.bazel     | 58 ++++++++++++-
 13 files changed, 197 insertions(+), 81 deletions(-)

diff --git a/libc/src/__support/File/linux/file.cpp b/libc/src/__support/File/linux/file.cpp
index 761e352f74ea..4594dadf1ccd 100644
--- a/libc/src/__support/File/linux/file.cpp
+++ b/libc/src/__support/File/linux/file.cpp
@@ -19,8 +19,8 @@
 #include "src/__support/macros/config.h"
 
 #include "hdr/fcntl_macros.h" // For mode_t and other flags to the open syscall
-#include <sys/stat.h>    // For S_IS*, S_IF*, and S_IR* flags.
-#include <sys/syscall.h> // For syscall numbers
+#include <sys/stat.h>         // For S_IS*, S_IF*, and S_IR* flags.
+#include <sys/syscall.h>      // For syscall numbers
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -128,10 +128,11 @@ ErrorOr<LinuxFile *> create_file_from_fd(int fd, const char *mode) {
     return Error(EINVAL);
   }
 
-  int fd_flags = internal::fcntl(fd, F_GETFL);
-  if (fd_flags == -1) {
+  auto result = internal::fcntl(fd, F_GETFL);
+  if (!result.has_value()) {
     return Error(EBADF);
   }
+  int fd_flags = result.value();
 
   using OpenMode = File::OpenMode;
   if (((fd_flags & O_ACCMODE) == O_RDONLY &&
@@ -145,8 +146,9 @@ ErrorOr<LinuxFile *> create_file_from_fd(int fd, const char *mode) {
   if ((modeflags & static_cast<ModeFlags>(OpenMode::APPEND)) &&
       !(fd_flags & O_APPEND)) {
     do_seek = true;
-    if (internal::fcntl(fd, F_SETFL,
-                        reinterpret_cast<void *>(fd_flags | O_APPEND)) == -1) {
+    if (!internal::fcntl(fd, F_SETFL,
+                         reinterpret_cast<void *>(fd_flags | O_APPEND))
+             .has_value()) {
       return Error(EBADF);
     }
   }
diff --git a/libc/src/__support/OSUtil/fcntl.h b/libc/src/__support/OSUtil/fcntl.h
index 46f7d2813239..3983d78f7f89 100644
--- a/libc/src/__support/OSUtil/fcntl.h
+++ b/libc/src/__support/OSUtil/fcntl.h
@@ -8,12 +8,18 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_FCNTL_H
 #define LLVM_LIBC_SRC___SUPPORT_OSUTIL_FCNTL_H
 
+#include "hdr/types/mode_t.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-int fcntl(int fd, int cmd, void *arg = nullptr);
+ErrorOr<int> fcntl(int fd, int cmd, void *arg = nullptr);
+
+ErrorOr<int> open(const char *path, int flags, mode_t mode_flags = 0);
+
+ErrorOr<int> close(int fd);
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/OSUtil/linux/CMakeLists.txt b/libc/src/__support/OSUtil/linux/CMakeLists.txt
index b9704d42cd33..4681d8c2bb73 100644
--- a/libc/src/__support/OSUtil/linux/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/CMakeLists.txt
@@ -16,7 +16,6 @@ add_object_library(
     .${LIBC_TARGET_ARCHITECTURE}.linux_${LIBC_TARGET_ARCHITECTURE}_util
     libc.src.__support.common
     libc.src.__support.CPP.string_view
-    libc.src.errno.errno
     libc.hdr.fcntl_macros
     libc.hdr.types.struct_flock
     libc.hdr.types.struct_flock64
diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp
index 99e16ad58c91..bb76eee90efd 100644
--- a/libc/src/__support/OSUtil/linux/fcntl.cpp
+++ b/libc/src/__support/OSUtil/linux/fcntl.cpp
@@ -8,23 +8,24 @@
 
 #include "src/__support/OSUtil/fcntl.h"
 
+#include "hdr/errno_macros.h"
 #include "hdr/fcntl_macros.h"
+#include "hdr/types/mode_t.h"
 #include "hdr/types/off_t.h"
 #include "hdr/types/struct_f_owner_ex.h"
 #include "hdr/types/struct_flock.h"
 #include "hdr/types/struct_flock64.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/__support/libc_errno.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 
-#include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-int fcntl(int fd, int cmd, void *arg) {
+ErrorOr<int> fcntl(int fd, int cmd, void *arg) {
 #if SYS_fcntl
   constexpr auto FCNTL_SYSCALL_ID = SYS_fcntl;
 #elif defined(SYS_fcntl64)
@@ -33,8 +34,7 @@ int fcntl(int fd, int cmd, void *arg) {
 #error "fcntl and fcntl64 syscalls not available."
 #endif
 
-  int new_cmd = cmd;
-  switch (new_cmd) {
+  switch (cmd) {
   case F_OFD_SETLKW: {
     struct flock *flk = reinterpret_cast<struct flock *>(arg);
     // convert the struct to a flock64
@@ -45,8 +45,11 @@ int fcntl(int fd, int cmd, void *arg) {
     flk64.l_len = flk->l_len;
     flk64.l_pid = flk->l_pid;
     // create a syscall
-    return LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, new_cmd,
-                                             &flk64);
+    int ret =
+        LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
+    if (ret < 0)
+      return Error(-ret);
+    return ret;
   }
   case F_OFD_GETLK:
   case F_OFD_SETLK: {
@@ -59,60 +62,80 @@ int fcntl(int fd, int cmd, void *arg) {
     flk64.l_len = flk->l_len;
     flk64.l_pid = flk->l_pid;
     // create a syscall
-    int retVal = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd,
-                                                   new_cmd, &flk64);
+    int ret =
+        LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
     // On failure, return
-    if (retVal == -1)
-      return -1;
+    if (ret < 0)
+      return Error(-1);
     // Check for overflow, i.e. the offsets are not the same when cast
     // to off_t from off64_t.
     if (static_cast<off_t>(flk64.l_len) != flk64.l_len ||
-        static_cast<off_t>(flk64.l_start) != flk64.l_start) {
-      libc_errno = EOVERFLOW;
-      return -1;
-    }
+        static_cast<off_t>(flk64.l_start) != flk64.l_start)
+      return Error(EOVERFLOW);
+
     // Now copy back into flk, in case flk64 got modified
     flk->l_type = flk64.l_type;
     flk->l_whence = flk64.l_whence;
     flk->l_start = static_cast<decltype(flk->l_start)>(flk64.l_start);
     flk->l_len = static_cast<decltype(flk->l_len)>(flk64.l_len);
     flk->l_pid = flk64.l_pid;
-    return retVal;
+    return ret;
   }
   case F_GETOWN: {
     struct f_owner_ex fex;
     int ret = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd,
                                                 F_GETOWN_EX, &fex);
-    if (ret >= 0)
-      return fex.type == F_OWNER_PGRP ? -fex.pid : fex.pid;
-    libc_errno = -ret;
-    return -1;
+    if (ret < 0)
+      return Error(-ret);
+    return fex.type == F_OWNER_PGRP ? -fex.pid : fex.pid;
   }
 #ifdef SYS_fcntl64
   case F_GETLK: {
     if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
-      new_cmd = F_GETLK64;
+      cmd = F_GETLK64;
     break;
   }
   case F_SETLK: {
     if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
-      new_cmd = F_SETLK64;
+      cmd = F_SETLK64;
     break;
   }
   case F_SETLKW: {
     if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
-      new_cmd = F_SETLKW64;
+      cmd = F_SETLKW64;
     break;
   }
 #endif
   }
-  int retVal = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, new_cmd,
-                                                 reinterpret_cast<void *>(arg));
-  if (retVal >= 0) {
-    return retVal;
-  }
-  libc_errno = -retVal;
-  return -1;
+
+  // default, but may use rewritten cmd from above.
+  int ret = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd,
+                                              reinterpret_cast<void *>(arg));
+  if (ret < 0)
+    return Error(-ret);
+  return ret;
+}
+
+ErrorOr<int> open(const char *path, int flags, mode_t mode_flags) {
+#ifdef SYS_open
+  int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_open, path, flags, mode_flags);
+#else
+  int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_openat, AT_FDCWD, path, flags,
+                                             mode_flags);
+#endif
+  if (fd < 0)
+    return Error(-fd);
+
+  return fd;
+}
+
+ErrorOr<int> close(int fd) {
+  int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_close, fd);
+
+  if (ret < 0)
+    return Error(-ret);
+
+  return ret;
 }
 
 } // namespace internal
diff --git a/libc/src/fcntl/linux/CMakeLists.txt b/libc/src/fcntl/linux/CMakeLists.txt
index 580db16cd413..c31eb3f438c1 100644
--- a/libc/src/fcntl/linux/CMakeLists.txt
+++ b/libc/src/fcntl/linux/CMakeLists.txt
@@ -19,6 +19,7 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.fcntl_macros
     libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
 )
 
 add_entrypoint_object(
diff --git a/libc/src/fcntl/linux/fcntl.cpp b/libc/src/fcntl/linux/fcntl.cpp
index a0c8459ced34..fd9c48eb562f 100644
--- a/libc/src/fcntl/linux/fcntl.cpp
+++ b/libc/src/fcntl/linux/fcntl.cpp
@@ -10,6 +10,7 @@
 
 #include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 
 #include <stdarg.h>
@@ -22,7 +23,14 @@ LLVM_LIBC_FUNCTION(int, fcntl, (int fd, int cmd, ...)) {
   va_start(varargs, cmd);
   arg = va_arg(varargs, void *);
   va_end(varargs);
-  return LIBC_NAMESPACE::internal::fcntl(fd, cmd, arg);
+
+  auto result = LIBC_NAMESPACE::internal::fcntl(fd, cmd, arg);
+
+  if (!result.has_value()) {
+    libc_errno = result.error();
+    return -1;
+  }
+  return result.value();
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/fcntl/linux/open.cpp b/libc/src/fcntl/linux/open.cpp
index a21a03788dea..3a56d1055419 100644
--- a/libc/src/fcntl/linux/open.cpp
+++ b/libc/src/fcntl/linux/open.cpp
@@ -8,15 +8,13 @@
 
 #include "src/fcntl/open.h"
 
-#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "hdr/fcntl_macros.h"
+#include "hdr/types/mode_t.h"
+#include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/common.h"
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-
-#include "hdr/fcntl_macros.h"
-#include "hdr/types/mode_t.h"
 #include <stdarg.h>
-#include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -31,17 +29,13 @@ LLVM_LIBC_FUNCTION(int, open, (const char *path, int flags, ...)) {
     va_end(varargs);
   }
 
-#ifdef SYS_open
-  int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_open, path, flags, mode_flags);
-#else
-  int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_openat, AT_FDCWD, path, flags,
-                                             mode_flags);
-#endif
-  if (fd > 0)
-    return fd;
+  auto result = internal::open(path, flags, mode_flags);
 
-  libc_errno = -fd;
-  return -1;
+  if (!result.has_value()) {
+    libc_errno = result.error();
+    return -1;
+  }
+  return result.value();
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp
index f3ae7c5c4e07..b50c5845bcc2 100644
--- a/libc/src/sys/auxv/linux/getauxval.cpp
+++ b/libc/src/sys/auxv/linux/getauxval.cpp
@@ -8,6 +8,8 @@
 
 #include "src/sys/auxv/getauxval.h"
 #include "config/app.h"
+#include "hdr/fcntl_macros.h"
+#include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/common.h"
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
@@ -17,14 +19,18 @@
 #include "src/__support/threads/callonce.h"
 #include "src/__support/threads/linux/futex_word.h"
 
+// -----------------------------------------------------------------------------
+// TODO: This file should not include other public libc functions. Calling other
+// public libc functions is an antipattern within LLVM-libc. This needs to be
+// cleaned up. DO NOT COPY THIS.
+// -----------------------------------------------------------------------------
+
 // for mallocing the global auxv
 #include "src/sys/mman/mmap.h"
 #include "src/sys/mman/munmap.h"
 
 // for reading /proc/self/auxv
-#include "src/fcntl/open.h"
 #include "src/sys/prctl/prctl.h"
-#include "src/unistd/close.h"
 #include "src/unistd/read.h"
 
 // getauxval will work either with or without __cxa_atexit support.
@@ -60,17 +66,18 @@ public:
   constexpr static size_t AUXV_MMAP_SIZE = sizeof(AuxEntry) * MAX_AUXV_ENTRIES;
 
   AuxvMMapGuard()
-      : ptr(mmap(nullptr, AUXV_MMAP_SIZE, PROT_READ | PROT_WRITE,
-                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) {}
+      : ptr(LIBC_NAMESPACE::mmap(nullptr, AUXV_MMAP_SIZE,
+                                 PROT_READ | PROT_WRITE,
+                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) {}
   ~AuxvMMapGuard() {
     if (ptr != MAP_FAILED)
-      munmap(ptr, AUXV_MMAP_SIZE);
+      LIBC_NAMESPACE::munmap(ptr, AUXV_MMAP_SIZE);
   }
   void submit_to_global() {
     // atexit may fail, we do not set it to global in that case.
     int ret = __cxa_atexit(
         [](void *) {
-          munmap(auxv, AUXV_MMAP_SIZE);
+          LIBC_NAMESPACE::munmap(auxv, AUXV_MMAP_SIZE);
           auxv = nullptr;
         },
         nullptr, nullptr);
@@ -90,10 +97,16 @@ private:
 
 class AuxvFdGuard {
 public:
-  AuxvFdGuard() : fd(open("/proc/self/auxv", O_RDONLY | O_CLOEXEC)) {}
+  AuxvFdGuard() {
+    auto result = internal::open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
+    if (!result.has_value())
+      fd = -1;
+
+    fd = result.value();
+  }
   ~AuxvFdGuard() {
     if (fd != -1)
-      close(fd);
+      internal::close(fd);
   }
   bool valid() const { return fd != -1; }
   int get() const { return fd; }
@@ -135,7 +148,8 @@ static void initialize_auxv_once(void) {
   bool error_detected = false;
   // Read until we use up all the available space or we finish reading the file.
   while (available_size != 0) {
-    ssize_t bytes_read = read(fd_guard.get(), buf, available_size);
+    ssize_t bytes_read =
+        LIBC_NAMESPACE::read(fd_guard.get(), buf, available_size);
     if (bytes_read <= 0) {
       if (libc_errno == EINTR)
         continue;
@@ -158,7 +172,7 @@ static AuxEntry read_entry(int fd) {
   size_t size = sizeof(AuxEntry);
   char *ptr = reinterpret_cast<char *>(&buf);
   while (size > 0) {
-    ssize_t ret = read(fd, ptr, size);
+    ssize_t ret = LIBC_NAMESPACE::read(fd, ptr, size);
     if (ret < 0) {
       if (libc_errno == EINTR)
         continue;
@@ -195,7 +209,8 @@ LLVM_LIBC_FUNCTION(unsigned long, getauxval, (unsigned long id)) {
     return search_auxv(app.auxv_ptr, id);
 
   static FutexWordType once_flag;
-  callonce(reinterpret_cast<CallOnceFlag *>(&once_flag), initialize_auxv_once);
+  LIBC_NAMESPACE::callonce(reinterpret_cast<CallOnceFlag *>(&once_flag),
+                           initialize_auxv_once);
   if (auxv != nullptr)
     return search_auxv(auxv, id);
 
diff --git a/libc/src/sys/mman/linux/shm_common.h b/libc/src/sys/mman/linux/shm_common.h
index 69911012ff7e..29d1401821e4 100644
--- a/libc/src/sys/mman/linux/shm_common.h
+++ b/libc/src/sys/mman/linux/shm_common.h
@@ -13,6 +13,11 @@
 #include "src/__support/macros/config.h"
 #include "src/string/memory_utils/inline_memcpy.h"
 
+// TODO: clean this up.
+//  1. Change from optional to ErrorOr, and return the errno instead of setting
+//    it here.
+//  2. Replace inline memcpy with __builtin_memcpy
+
 // TODO: Get PATH_MAX via https://github.com/llvm/llvm-project/issues/85121
 #include <linux/limits.h>
 
diff --git a/libc/src/sys/mman/linux/shm_open.cpp b/libc/src/sys/mman/linux/shm_open.cpp
index 11de482272d0..3099062eace9 100644
--- a/libc/src/sys/mman/linux/shm_open.cpp
+++ b/libc/src/sys/mman/linux/shm_open.cpp
@@ -7,9 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/sys/mman/shm_open.h"
+#include "hdr/fcntl_macros.h"
 #include "hdr/types/mode_t.h"
+#include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/macros/config.h"
-#include "src/fcntl/open.h"
 #include "src/sys/mman/linux/shm_common.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -17,9 +18,16 @@ namespace LIBC_NAMESPACE_DECL {
 static constexpr int DEFAULT_OFLAGS = O_NOFOLLOW | O_CLOEXEC | O_NONBLOCK;
 
 LLVM_LIBC_FUNCTION(int, shm_open, (const char *name, int oflags, mode_t mode)) {
-  using namespace shm_common;
-  if (cpp::optional<SHMPath> buffer = translate_name(name))
-    return open(buffer->data(), oflags | DEFAULT_OFLAGS, mode);
+  if (cpp::optional<shm_common::SHMPath> buffer =
+          shm_common::translate_name(name)) {
+    auto result = internal::open(buffer->data(), oflags | DEFAULT_OFLAGS, mode);
+
+    if (!result.has_value()) {
+      libc_errno = result.error();
+      return -1;
+    }
+    return result.value();
+  }
   return -1;
 }
 
diff --git a/libc/src/sys/mman/linux/shm_unlink.cpp b/libc/src/sys/mman/linux/shm_unlink.cpp
index 6a7630151220..4c61c7cd16ba 100644
--- a/libc/src/sys/mman/linux/shm_unlink.cpp
+++ b/libc/src/sys/mman/linux/shm_unlink.cpp
@@ -13,10 +13,13 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
+// TODO: stop calling the public unlink function. It should be calling an
+// internal shared utility.
+
 LLVM_LIBC_FUNCTION(int, shm_unlink, (const char *name)) {
-  using namespace shm_common;
-  if (cpp::optional<SHMPath> buffer = translate_name(name))
-    return unlink(buffer->data());
+  if (cpp::optional<shm_common::SHMPath> buffer =
+          shm_common::translate_name(name))
+    return LIBC_NAMESPACE::unlink(buffer->data());
   return -1;
 }
 
diff --git a/libc/src/unistd/linux/close.cpp b/libc/src/unistd/linux/close.cpp
index b5842f2b64d2..6ef3a3c6d63f 100644
--- a/libc/src/unistd/linux/close.cpp
+++ b/libc/src/unistd/linux/close.cpp
@@ -8,9 +8,8 @@
 
 #include "src/unistd/close.h"
 
-#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/common.h"
-
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include <sys/syscall.h> // For syscall numbers.
@@ -18,12 +17,13 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, close, (int fd)) {
-  int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_close, fd);
-  if (ret < 0) {
-    libc_errno = -ret;
+  auto result = internal::close(fd);
+
+  if (!result.has_value()) {
+    libc_errno = result.error();
     return -1;
   }
-  return ret;
+  return result.value();
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 84a6b7d23044..7901de161b7a 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -344,6 +344,21 @@ libc_support_library(
     hdrs = ["hdr/types/struct_epoll_event.h"],
 )
 
+libc_support_library(
+    name = "types_struct_f_owner_ex",
+    hdrs = ["hdr/types/struct_f_owner_ex.h"],
+)
+
+libc_support_library(
+    name = "types_struct_flock",
+    hdrs = ["hdr/types/struct_flock.h"],
+)
+
+libc_support_library(
+    name = "types_struct_flock64",
+    hdrs = ["hdr/types/struct_flock64.h"],
+)
+
 libc_support_library(
     name = "types_struct_timespec",
     hdrs = ["hdr/types/struct_timespec.h"],
@@ -1380,6 +1395,28 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_osutil_fcntl",
+    srcs = ["src/__support/OSUtil/linux/fcntl.cpp"],
+    hdrs = ["src/__support/OSUtil/fcntl.h"],
+    target_compatible_with = select({
+        "@platforms//os:linux": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    deps = [
+        ":__support_common",
+        ":__support_error_or",
+        ":__support_osutil_syscall",
+        ":hdr_errno_macros",
+        ":hdr_fcntl_macros",
+        ":types_mode_t",
+        ":types_off_t",
+        ":types_struct_f_owner_ex",
+        ":types_struct_flock",
+        ":types_struct_flock64",
+    ],
+)
+
 libc_support_library(
     name = "__support_osutil_exit",
     srcs = ["src/__support/OSUtil/linux/exit.cpp"],
@@ -1601,8 +1638,8 @@ libc_support_library(
 libc_header_library(
     name = "libcxx_shared_headers",
     hdrs = [
-        "shared/libc_common.h",
         "shared/fp_bits.h",
+        "shared/libc_common.h",
         "shared/str_to_float.h",
         "shared/str_to_integer.h",
     ],
@@ -4475,13 +4512,28 @@ libc_function(
     }),
     deps = [
         ":__support_common",
-        ":__support_osutil_syscall",
+        ":__support_osutil_fcntl",
         ":errno",
         ":hdr_fcntl_macros",
         ":types_mode_t",
     ],
 )
 
+libc_function(
+    name = "fcntl",
+    srcs = ["src/fcntl/linux/fcntl.cpp"],
+    hdrs = ["src/fcntl/fcntl.h"],
+    target_compatible_with = select({
+        "@platforms//os:linux": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    deps = [
+        ":__support_common",
+        ":__support_osutil_fcntl",
+        ":errno",
+    ],
+)
+
 libc_function(
     name = "openat",
     srcs = ["src/fcntl/linux/openat.cpp"],
@@ -4542,7 +4594,7 @@ libc_function(
     hdrs = ["src/unistd/close.h"],
     deps = [
         ":__support_common",
-        ":__support_osutil_syscall",
+        ":__support_osutil_fcntl",
         ":errno",
     ],
 )

From 5578bcbcfd25c797d4d14b8dfb3f83360712513d Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Fri, 13 Jun 2025 12:32:46 -0500
Subject: [PATCH 0319/1322] [mlir][xegpu] add support for structure control
 flow ops in workgroup to subgroup distribution (#142618)

This PR introduces support for `scf::ForOp`, `scf::WhileOp`, `scf::If`,
and `scf::Condition` within the workgroup-subgroup-distribution pass,
leveraging the `SCFStructuralTypeConversionsAndLegality`.
---
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     |   3 +
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 220 ++++++++++++++++--
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |   6 +-
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir |  98 +++++++-
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   | 134 ++++++++++-
 5 files changed, 430 insertions(+), 31 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index f9327d63869c..6fea10185402 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -26,6 +26,9 @@ class TensorDescType;
 
 namespace xegpu {
 
+/// Flatten a set of ValueRange into a single SmallVector<Value>
+SmallVector<Value> flattenValues(ArrayRef<ValueRange> values);
+
 /// If tensor descriptor has a layout attribute it is used in SIMT mode.
 /// In this mode, the distributed vector shape is determined as follows:
 /// Definitions:
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 3bf76af674ba..a26c6b52f0dd 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -13,9 +13,11 @@
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Transforms/DialectConversion.h"
 
 namespace mlir {
@@ -29,6 +31,29 @@ using namespace mlir;
 
 namespace {
 
+static std::pair<SmallVector<int64_t>, int>
+getSgShapeAndCount(ArrayRef<int64_t> shape, xegpu::LayoutAttr layout) {
+  int count = 1;
+  SmallVector<int64_t> sgShape(shape);
+
+  if (layout && layout.isWgLayout()) {
+    DenseI32ArrayAttr sgLayoutAttr = layout.getSgLayout();
+    auto sgLayout = llvm::to_vector_of<int64_t>(sgLayoutAttr.asArrayRef());
+    if (DenseI32ArrayAttr sgDataAttr = layout.getSgData())
+      sgShape = llvm::to_vector_of<int64_t>(sgDataAttr.asArrayRef());
+    else
+      sgShape = computeShapeRatio(shape, sgLayout).value_or(sgShape);
+    SmallVector<int64_t> distUnit = computeElementwiseMul(sgLayout, sgShape);
+    // Clamp distUnit to the original shape to handle cases where data is
+    // shared among subgroups, which may cause distUnit to exceed the original
+    // shape.
+    for (size_t i = 0; i < distUnit.size(); ++i)
+      distUnit[i] = std::min(shape[i], distUnit[i]);
+    count = computeProduct(shape) / computeProduct(distUnit);
+  }
+  return std::make_pair(sgShape, count);
+}
+
 /// This pattern transforms the CreateNdDescOp to create a subgroup descriptor
 /// from a workgroup descriptor. It replaces the offsets and sizes with
 /// appropriate values for the subgroup.
@@ -129,18 +154,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       return rewriter.notifyMatchFailure(
           op, "sgLayout attribute is required in layout");
 
-    SmallVector<int64_t> sgShape;
-    if (auto sgDataAttr = layout.getSgData()) {
-      sgShape = llvm::to_vector_of<int64_t>(sgDataAttr.asArrayRef());
-    } else {
-      assert(wgShape.size() == sgLayout.size() &&
-             "sgLayout and wgShape must have the same rank");
-      sgShape.reserve(wgShape.size());
-      for (size_t i = 0; i < wgShape.size(); ++i) {
-        assert(sgLayout[i] != 0 && "sgLayout elements must be non-zero");
-        sgShape.push_back(wgShape[i] / sgLayout[i]);
-      }
-    }
+    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
 
     // TODO : Handle order attribute
     // Get the subgroup ID
@@ -266,15 +280,15 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
     if (resultTy.getRank() != 2)
       return failure();
 
-    auto originalLayout =
-        llvm::dyn_cast_or_null<xegpu::LayoutAttr>(op->getAttr("layout"));
+    auto originalLayout = xegpu::getLayoutAttr(op.getResult());
     if (!originalLayout)
       return failure();
 
-    SmallVector<Value> newDpasOps;
     size_t i = 0;
+    SmallVector<Value> newDpasOps;
     for (auto aVec : adaptor.getLhs()) {
       for (auto bVec : adaptor.getRhs()) {
+
         llvm::SmallVector<Value> operands({aVec, bVec});
         Value tmpC;
         if (op.getAcc()) {
@@ -288,10 +302,10 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
             llvm::cast<VectorType>(bVec.getType()).getShape();
         VectorType resTy = VectorType::get({aVecShape[0], bVecShape[1]},
                                            resultTy.getElementType());
-        tmpC = rewriter.create<xegpu::DpasOp>(
-            loc, resTy, operands,
-            llvm::ArrayRef<NamedAttribute>(
-                {"layout_result_0", originalLayout.dropSgLayoutAndData()}));
+        tmpC = rewriter.create<xegpu::DpasOp>(loc, resTy, operands);
+        xegpu::setLayoutAttr(cast<OpResult>(tmpC),
+                             originalLayout.dropSgLayoutAndData());
+
         newDpasOps.push_back(tmpC);
       }
     }
@@ -314,14 +328,90 @@ struct WgToSgPrefetchNdOp : public OpConversionPattern<xegpu::PrefetchNdOp> {
   }
 };
 
+// Handles UnrealizedConversionCastOp generated during
+// SCFStructuralTypeConversions (step 1). This op may appear as either a
+// target or source materialization for Vector values, e.g.:
+// 1. unrealized_cast %1 : vector<256xf32> to vector<16xf32>, ...
+// 2. unrealized_cast %1 : vector<16xf32>, ... to vector<256xf32>
+// it could be either 1:N or N:1 cast. In both cases, the pattern
+// simply forwards the inputs to the outputs using 1:1 or 1:N interface.
+// for example, the following scf::forOp
+// ```
+// %for = scf.for ... iter_args(%arg1 = %0)->(vector<128x128xf16>) {
+//     %n = use(%arg1): vector<128x128xf16>
+//     scf.yield %n : vector<128x128xf16>
+// }
+// ```
+// Could be converted to:
+// ```
+// %1 = unrealized_conversion_cast %0
+//          : vector<128x128xf16> to vector<16x16xf16>, vector<16x16xf16>
+// %for:2 = scf.for ... iter_args(%arg1 = %1#1, %arg2 = %1#2)
+//                    -> (vector<16x16xf16>, vector<16x16xf16) {
+//     %m = unrealized_conversion_cast %arg1, %arg2
+//            : vector<16x16xf16>, vector<16x16xf16> to vector<128x128xf16>
+//     %n = use(%m): vector<128x128xf16>
+//     %b = unrealized_conversion_cast %n
+//            : vector<128x128xf16> to vector<16x16xf16>, vector<16x16xf16>
+//     scf.yield %b#1, %b#2 : vector<16x16xf16>, vector<16x16xf16>
+// }
+// %cast = unrealized_conversion_cast %for:2
+//          : vector<16x16xf16>, vector<16x16xf16> to vector<128x128xf16>
+// ```
+// TODO: remove it when context-aware type converter is ready.
+struct UnrealizedConversionCastOpPattern
+    : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
+  using OpConversionPattern<
+      mlir::UnrealizedConversionCastOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(mlir::UnrealizedConversionCastOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    SmallVector<Value> inputs = xegpu::flattenValues(adaptor.getInputs());
+
+    auto inputTy = dyn_cast<VectorType>(inputs[0].getType());
+    auto outputTy = dyn_cast<VectorType>(op->getOpResult(0).getType());
+
+    if (!inputTy || !outputTy || !llvm::all_equal(op->getResultTypes()) ||
+        !llvm::all_equal(ValueRange(inputs).getTypes()))
+      return failure();
+
+    // Handles the case "cast %1 : vector<256xf32> to vector<16xf32>, ...".
+    // It is generated by source materialization (e.g., inits to scf forOp).
+    // The input values provided by the adaptor should already be distributed,
+    // and their types should correspond exactly to the result types of the
+    // operation.
+    if (op.getNumOperands() == 1 &&
+        llvm::equal(ValueRange(inputs).getTypes(), op->getResultTypes())) {
+      rewriter.replaceOp(op, inputs);
+      return success();
+    }
+
+    // Handles the case "cast %1 : vector<16xf32>, ... to vector<256xf32>".
+    // It is generated by target materialization (e.g., arguments/results
+    // of scf forOp). All input values must have the same vector type, and
+    // their shape must be evenly divisible by the output vector's shape
+    // (determined by the nature of the workgroup to subgroup distribution).
+    // TODO: it is not safe to do such forward, since such N:1 cast could be
+    // from others.
+    if (op.getNumResults() == 1 &&
+        computeShapeRatio(outputTy.getShape(), inputTy.getShape())) {
+      rewriter.replaceOpWithMultiple(op, {inputs});
+      return success();
+    }
+
+    return mlir::failure();
+  }
+};
+
 } // namespace
 
 namespace mlir {
 namespace xegpu {
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
   patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
-               WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp>(
-      patterns.getContext());
+               WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
+               UnrealizedConversionCastOpPattern>(patterns.getContext());
 }
 } // namespace xegpu
 } // namespace mlir
@@ -334,9 +424,68 @@ struct XeGPUWgToSgDistributePass
 } // namespace
 
 void XeGPUWgToSgDistributePass::runOnOperation() {
+  // Track existing UnrealizedConversionCastOps
+  SmallVector<Operation *> existingCastOps;
+  getOperation()->walk([&](UnrealizedConversionCastOp castOp) {
+    existingCastOps.push_back(castOp.getOperation());
+  });
+
+  {
+    // Step 1: Apply SCFStructuralTypeConversions to SCF operations with
+    // VectorType operands. This first converts such operands to
+    // RankedTensorType, propagates the layout attribute into the encoding
+    // attribute, and finally converts the RankedTensorType to VectorType based
+    // on the encoding.
+
+    TypeConverter converter;
+    converter.addConversion([&](Type type) -> Type { return type; });
+    converter.addConversion(
+        [&](RankedTensorType type,
+            SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+          Type elemTy = type.getElementType();
+          ArrayRef<int64_t> shape = type.getShape();
+
+          int count;
+          SmallVector<int64_t> subShape;
+          std::tie(subShape, count) = getSgShapeAndCount(
+              shape,
+              dyn_cast_if_present<xegpu::LayoutAttr>(type.getEncoding()));
+
+          auto newTy = VectorType::get(subShape, elemTy);
+          result.append(count, newTy);
+          return success();
+        });
+
+    xegpu::doSCFStructuralTypeConversionWithTensorType(getOperation(),
+                                                       converter);
+  }
+
+  // Step 2: Perform workgroup to subgroup distribution for TensorDesc values,
+  // as well as XeGPU, Arith, and Vector operations.
   MLIRContext *ctx = &getContext();
   RewritePatternSet patterns(ctx);
   ConversionTarget target(*ctx);
+  TypeConverter converter;
+  converter.addConversion([&](Type type) -> Type { return type; });
+  converter.addConversion(
+      [&](xegpu::TensorDescType type,
+          SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+        Type elemTy = type.getElementType();
+        ArrayRef<int64_t> shape = type.getShape();
+
+        int count;
+        SmallVector<int64_t> subShape;
+        xegpu::LayoutAttr layout = type.getLayoutAttr();
+        std::tie(subShape, count) = getSgShapeAndCount(shape, layout);
+
+        if (layout)
+          layout = layout.dropSgLayoutAndData();
+
+        auto newTy = xegpu::TensorDescType::get(
+            type.getContext(), subShape, elemTy, type.getEncoding(), layout);
+        result.append(count, newTy);
+        return success();
+      });
 
   auto getTensorDescType = [](Operation *op) -> xegpu::TensorDescType {
     if (auto createOp = dyn_cast<xegpu::CreateNdDescOp>(op))
@@ -353,26 +502,49 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
   };
 
   auto isLegal = [&](xegpu::LayoutAttr layout) -> bool {
-    return !layout || layout.getSgLayout() == nullptr;
+    return !layout || !layout.isWgLayout();
   };
 
   target.addDynamicallyLegalOp<xegpu::CreateNdDescOp, xegpu::LoadNdOp,
                                xegpu::StoreNdOp, xegpu::UpdateNdOffsetOp,
                                xegpu::PrefetchNdOp>([=](Operation *op) -> bool {
     auto tdescTy = getTensorDescType(op);
-    auto layout = dyn_cast_or_null<xegpu::LayoutAttr>(tdescTy.getLayout());
+    auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(tdescTy.getLayout());
     return isLegal(layout);
   });
 
   target.addDynamicallyLegalOp<xegpu::DpasOp>([=](xegpu::DpasOp op) -> bool {
-    auto layout = dyn_cast_or_null<xegpu::LayoutAttr>(op->getAttr("layout"));
+    auto layout = xegpu::getLayoutAttr(op.getResult());
     return isLegal(layout);
   });
 
+  target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
+      [=](UnrealizedConversionCastOp op) {
+        return llvm::is_contained(existingCastOps, op.getOperation());
+      });
+
   target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
 
+  scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
+                                                       target);
   xegpu::populateXeGPUWgToSgDistributePatterns(patterns);
   if (failed(
           applyPartialConversion(getOperation(), target, std::move(patterns))))
     return signalPassFailure();
+
+  // Remove sg_layout and sg_data attributes from the Layout
+  // attribute for each VectorType result of the operation.
+  // For Structured Control Flow ops, the layout is simply removed,
+  // since in 1:N case, the layout for new results are missing.
+  // Layout propagation pass will activated.
+  getOperation()->walk([](Operation *op) {
+    for (OpResult result : op->getOpResults()) {
+      std::string name = xegpu::getLayoutName(result);
+      if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
+        op->removeAttr(name);
+        if (!isa<scf::IfOp, scf::ForOp, scf::WhileOp, scf::ConditionOp>(op))
+          op->setAttr(name, layout.dropSgLayoutAndData());
+      }
+    }
+  });
 }
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index dcaf4e85a82c..6b85a66a8bd3 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -27,7 +27,7 @@
 using namespace mlir;
 
 /// convert ArrayRef<ValueRange> into SmallVector<Value>
-static SmallVector<Value> flattenValues(ArrayRef<ValueRange> values) {
+SmallVector<Value> xegpu::flattenValues(ArrayRef<ValueRange> values) {
   SmallVector<Value> result;
   for (const auto &vals : values)
     llvm::append_range(result, vals);
@@ -271,7 +271,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
       auto resultTy = dyn_cast<RankedTensorType>(result.getType());
 
       // Only look at ops casting from VectorType to RankedTensorType
-      if (!isa<VectorType>(inputTy) || !isa<RankedTensorType>(resultTy))
+      if (!inputTy || !resultTy)
         return WalkResult::skip();
 
       xegpu::LayoutAttr layout = xegpu::getLayoutAttr(input);
@@ -342,7 +342,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
         }
 
         if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
-          SmallVector<Value> values = flattenValues(adaptor.getInputs());
+          SmallVector<Value> values = xegpu::flattenValues(adaptor.getInputs());
           auto newOp = rewriter.create<UnrealizedConversionCastOp>(
               op.getLoc(), outputTy, values);
           rewriter.replaceOp(op, newOp);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index bee026eb2084..35ad16d8cd9a 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -85,7 +85,7 @@ gpu.module @test_round_robin_assignment {
     %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<8x8xf32>
       -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
     %dpas = xegpu.dpas %load_a, %load_b
-      {layout =  #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      {layout_result_0 =  #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<8x8xf32>, vector<8x8xf32> -> vector<8x8xf32>
     gpu.return
   }
@@ -102,4 +102,100 @@ gpu.module @test_round_robin_assignment {
       : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
     gpu.return
   }
+
+  gpu.func @test_scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c1 = arith.constant 1 : index
+    %c10 = arith.constant 10 : index
+    %c0 = arith.constant 0 : index
+    %c256 = arith.constant 256 : index
+    %c1024 = arith.constant 1024 : index
+    %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %1 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    // CHECK-LABEL: scf.for
+    // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
+    %2:2 = scf.for %arg2 = %c0 to %c1024 step %c256 iter_args(%arg3 = %0, %arg4 = %1)
+        -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) {
+      %3 = xegpu.load_nd %0  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      xegpu.store_nd %3, %arg3  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      %4 = xegpu.update_nd_offset %arg3, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      %5 = xegpu.update_nd_offset %arg4, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
+      scf.yield %4, %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    }
+    gpu.return
+  }
+
+  gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+    %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    //CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32)
+    %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
+      %4 = arith.cmpi slt, %arg3, %c10_i32 : i32
+      //CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32
+      scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32
+    } do {
+    // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32)
+    ^bb0(%arg2: vector<256xf32>, %arg3: i32):
+      xegpu.store_nd %arg2, %2  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      %4 = arith.addi %arg3, %c1_i32 : i32
+      %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      %6 = xegpu.load_nd %5  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      scf.yield %6, %4 : vector<256xf32>, i32
+    }
+    gpu.return
+  }
+
+  gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c10 = arith.constant 10 : index
+    %0 = gpu.subgroup_id : index
+    %1 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %3 = arith.cmpi eq, %0, %c10 : index
+    // CHECK-LABEL: scf.if
+    //  CHECK-SAME: (vector<16xf32>, vector<16xf32>)
+    %4 = scf.if %3 -> (vector<256xf32>) {
+      %5 = xegpu.load_nd %1  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: vector<16xf32>, vector<16xf32>
+      scf.yield %5 : vector<256xf32>
+    } else {
+      %5 = xegpu.load_nd %2  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: vector<16xf32>, vector<16xf32>
+      scf.yield %5 : vector<256xf32>
+    } {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [16]>}
+    xegpu.store_nd %4, %1  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    gpu.return
+  }
+
+  gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c10 = arith.constant 10 : index
+    %id = gpu.subgroup_id : index
+
+    %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+
+    %0 = arith.cmpi eq, %id, %c10 : index
+    // CHECK-LABEL: scf.if
+    //  CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
+    %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) {
+      %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
+      scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    } else {
+      %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
+      scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    }
+    xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    gpu.return
+  }
+
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 7e89ada93407..466842c96844 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -5,7 +5,7 @@
 gpu.module @test_1_1_assignment {
   // CHECK-LABEL: test_create_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {  
+  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
   // CHECK: %[[SGID:.*]] = gpu.subgroup_id
   // CHECK: %[[C12:.*]] = arith.constant 12 : index
   // CHECK: %[[C4:.*]] = arith.constant 4 : index
@@ -108,7 +108,7 @@ gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
       : !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 12], lane_layout = [8, 2], lane_data = [1, 1]>>
       -> vector<32x24xf32>
     %dpas = xegpu.dpas %load_a, %load_b
-      {layout =  #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
+      {layout_result_0 =  #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
     gpu.return
   }
@@ -142,7 +142,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
       : !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], lane_layout = [8, 2], lane_data = [1, 1]>>
       -> vector<32x24xf32>
     %dpas = xegpu.dpas %load_a, %load_b
-      {layout =  #xegpu.layout<sg_layout = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      {layout_result_0 =  #xegpu.layout<sg_layout = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
     gpu.return
   }
@@ -169,4 +169,132 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
       : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
     gpu.return
   }
+
+  gpu.func @test_scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
+    //CHECK: [[c0:%.+]] = arith.constant 0 : index
+    //CHECK: [[c128:%.+]] = arith.constant 128 : index
+    //CHECK: [[c1024:%.+]] = arith.constant 1024 : index
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id  x
+    %block_id_y = gpu.block_id  y
+    %0 = arith.muli %block_id_x, %c128 : index
+    %1 = arith.muli %block_id_y, %c128 : index
+    %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+    %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
+    %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
+    %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
+
+    //      CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]]
+    // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) ->
+    // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>)
+    //      CHECK: [[a:%.+]] = xegpu.load_nd [[arg4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
+    //      CHECK: [[b:%.+]] = xegpu.load_nd [[arg5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
+    //      CHECK: [[c:%.+]] = xegpu.dpas [[a]], [[b]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32>
+    //      CHECK: [[at:%.+]] = xegpu.update_nd_offset [[arg4]], [[[c0]], [[c128]]] : !xegpu.tensor_desc<16x128xf16>
+    //      CHECK: [[bt:%.+]] = xegpu.update_nd_offset [[arg5]], [[[c128]], [[c0]]] : !xegpu.tensor_desc<128x16xf16>
+    //      CHECK: scf.yield [[at]], [[bt]], [[c]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>
+    %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3)
+        -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>,
+            !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) {
+      %8 = xegpu.load_nd %arg4  : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
+      %9 = xegpu.load_nd %arg5  : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+      %10 = xegpu.dpas %8, %9, %arg6 {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+                          : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
+      %11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
+      %12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
+      scf.yield %11, %12, %10 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>,
+                                !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>
+    }
+    %7 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32>
+            -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+    xegpu.store_nd %6#2, %7  : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+    gpu.return
+  }
+
+  gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+    %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+
+    // CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32)
+    %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
+      %4 = arith.cmpi slt, %arg3, %c10_i32 : i32
+      // CHECK: scf.condition{{.*}} : vector<16xf32>, i32
+      scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32
+    } do {
+    // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: i32)
+    ^bb0(%arg2: vector<256xf32>, %arg3: i32):
+      xegpu.store_nd %arg2, %2  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      %4 = arith.addi %arg3, %c1_i32 : i32
+      %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      %6 = xegpu.load_nd %5  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+      scf.yield %6, %4 : vector<256xf32>, i32
+    }
+    gpu.return
+  }
+
+  gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c10 = arith.constant 10 : index
+    %id = gpu.subgroup_id : index
+
+    %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    %1 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+
+    %4 = arith.cmpi eq, %id, %c10 : index
+    // CHECK-LABEL: scf.if
+    //  CHECK-SAME: (vector<16xf32>)
+    %5 = scf.if %4 -> (vector<256xf32>) {
+      // CHECK-LABEL: xegpu.load_nd
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: vector<16xf32>
+      scf.yield %2 : vector<256xf32>
+    } else {
+      // CHECK-LABEL: xegpu.load_nd
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: vector<16xf32>
+      scf.yield %3 : vector<256xf32>
+    } {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [16]>}
+    xegpu.store_nd %5, %0 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    gpu.return
+  }
+
+  gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c10 = arith.constant 10 : index
+    %id = gpu.subgroup_id : index
+
+    %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+
+    %0 = arith.cmpi eq, %id, %c10 : index
+    // CHECK-LABEL: scf.if
+    //  CHECK-SAME: (!xegpu.tensor_desc<16xf32>)
+    %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>) {
+      // CHECK-LABEL: xegpu.create_nd_tdesc
+      //  CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
+      %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32>
+      scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    } else {
+      // CHECK-LABEL: xegpu.create_nd_tdesc
+      //  CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
+      %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32>
+      scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    }
+    xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    gpu.return
+  }
+
+
 }

From ecdb549e6de60b3211cfa860eec498270e3980f1 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Fri, 13 Jun 2025 10:36:09 -0700
Subject: [PATCH 0320/1322] [TableGen] Avoid evaluating RHS of a BinOp until
 short-circuit is complete (#144021)

This patch adds an even more aggressive short-circuit on `!and` and
`!or` that completely avoids the evaluation of RHS operand until short
circuiting decisions are made.
---
 llvm/lib/TableGen/Record.cpp     | 11 ++++++-----
 llvm/test/TableGen/true-false.td |  9 +++++++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 4c8b41237c60..7f2ed77a7409 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -1557,8 +1557,7 @@ unresolved:
 }
 
 const Init *BinOpInit::resolveReferences(Resolver &R) const {
-  const Init *lhs = LHS->resolveReferences(R);
-  const Init *rhs = RHS->resolveReferences(R);
+  const Init *NewLHS = LHS->resolveReferences(R);
 
   unsigned Opc = getOpcode();
   if (Opc == AND || Opc == OR) {
@@ -1570,15 +1569,17 @@ const Init *BinOpInit::resolveReferences(Resolver &R) const {
     // limited version of short-circuit against all ones (`true` is casted
     // to 1 rather than all ones before we evaluate `!or`).
     if (const auto *LHSi = dyn_cast_or_null<IntInit>(
-            lhs->convertInitializerTo(IntRecTy::get(getRecordKeeper())))) {
+            NewLHS->convertInitializerTo(IntRecTy::get(getRecordKeeper())))) {
       if ((Opc == AND && !LHSi->getValue()) ||
           (Opc == OR && LHSi->getValue() == -1))
         return LHSi;
     }
   }
 
-  if (LHS != lhs || RHS != rhs)
-    return (BinOpInit::get(getOpcode(), lhs, rhs, getType()))
+  const Init *NewRHS = RHS->resolveReferences(R);
+
+  if (LHS != NewLHS || RHS != NewRHS)
+    return (BinOpInit::get(getOpcode(), NewLHS, NewRHS, getType()))
         ->Fold(R.getCurrentRecord());
   return this;
 }
diff --git a/llvm/test/TableGen/true-false.td b/llvm/test/TableGen/true-false.td
index 5a59f20b21d2..5fa570231448 100644
--- a/llvm/test/TableGen/true-false.td
+++ b/llvm/test/TableGen/true-false.td
@@ -67,13 +67,18 @@ def rec7 {
   bits<3> flags = { true, false, true };
 }
 
-// `!and` and `!or` should be short-circuit such that `!tail` on empty list will never
-// be evaluated.
+// `!and` and `!or` should be short-circuited such that any of the `!head` or
+// `!tail` on empty list below will never be evaluated.
 // CHECK: def rec8
+// CHECK:   bit v = 0;
+// CHECK:   int v2 = -1;
 // CHECK:   list<int> newSeq = [];
 // CHECK:   list<int> newSeq2 = [];
 
 class Foo <list<int> seq = []> {
+  bit v = !and(false, !head(seq));
+  int v2 = !or(-1, !head(seq));
+
   bit unresolved = !ne(!find(NAME, "BAR"), -1);
   list<int> newSeq  = !if(!and(false, unresolved), !tail(seq), seq);
   list<int> newSeq2 = !if(!or(-1, unresolved), seq, !tail(seq));

From 09c54c2e9e044fa0857831e6ce1bf77c8ce16ecc Mon Sep 17 00:00:00 2001
From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com>
Date: Fri, 13 Jun 2025 10:43:22 -0700
Subject: [PATCH 0321/1322] [IR2Vec] Minor vocab changes and exposing weights
 (#143200)

This PR changes some asserts in Vocab to hard checks that emit error and exposes flags and constructor to help in unit tests.

(Tracking issue - #141817)
---
 llvm/include/llvm/Analysis/IR2Vec.h    |  11 ++
 llvm/lib/Analysis/IR2Vec.cpp           |  82 +++++++++------
 llvm/unittests/Analysis/IR2VecTest.cpp | 137 ++++++++++++++++++-------
 3 files changed, 164 insertions(+), 66 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 8bf21b0e75d6..de67955d85d7 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -31,7 +31,9 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/JSON.h"
 #include <map>
 
 namespace llvm {
@@ -43,6 +45,7 @@ class Function;
 class Type;
 class Value;
 class raw_ostream;
+class LLVMContext;
 
 /// IR2Vec computes two kinds of embeddings: Symbolic and Flow-aware.
 /// Symbolic embeddings capture the "syntactic" and "statistical correlation"
@@ -53,6 +56,11 @@ class raw_ostream;
 enum class IR2VecKind { Symbolic };
 
 namespace ir2vec {
+
+extern cl::opt<float> OpcWeight;
+extern cl::opt<float> TypeWeight;
+extern cl::opt<float> ArgWeight;
+
 /// Embedding is a datatype that wraps std::vector<double>. It provides
 /// additional functionality for arithmetic and comparison operations.
 /// It is meant to be used *like* std::vector<double> but is more restrictive
@@ -226,10 +234,13 @@ public:
 class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> {
   ir2vec::Vocab Vocabulary;
   Error readVocabulary();
+  void emitError(Error Err, LLVMContext &Ctx);
 
 public:
   static AnalysisKey Key;
   IR2VecVocabAnalysis() = default;
+  explicit IR2VecVocabAnalysis(const ir2vec::Vocab &Vocab);
+  explicit IR2VecVocabAnalysis(ir2vec::Vocab &&Vocab);
   using Result = IR2VecVocabResult;
   Result run(Module &M, ModuleAnalysisManager &MAM);
 };
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 25ce35d4ace3..0f7303c1b091 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -16,13 +16,11 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
-#include "llvm/Support/JSON.h"
 #include "llvm/Support/MemoryBuffer.h"
 
 using namespace llvm;
@@ -33,6 +31,8 @@ using namespace ir2vec;
 STATISTIC(VocabMissCounter,
           "Number of lookups to entites not present in the vocabulary");
 
+namespace llvm {
+namespace ir2vec {
 static cl::OptionCategory IR2VecCategory("IR2Vec Options");
 
 // FIXME: Use a default vocab when not specified
@@ -40,18 +40,17 @@ static cl::opt<std::string>
     VocabFile("ir2vec-vocab-path", cl::Optional,
               cl::desc("Path to the vocabulary file for IR2Vec"), cl::init(""),
               cl::cat(IR2VecCategory));
-static cl::opt<float> OpcWeight("ir2vec-opc-weight", cl::Optional,
-                                cl::init(1.0),
-                                cl::desc("Weight for opcode embeddings"),
-                                cl::cat(IR2VecCategory));
-static cl::opt<float> TypeWeight("ir2vec-type-weight", cl::Optional,
-                                 cl::init(0.5),
-                                 cl::desc("Weight for type embeddings"),
-                                 cl::cat(IR2VecCategory));
-static cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional,
-                                cl::init(0.2),
-                                cl::desc("Weight for argument embeddings"),
-                                cl::cat(IR2VecCategory));
+cl::opt<float> OpcWeight("ir2vec-opc-weight", cl::Optional, cl::init(1.0),
+                         cl::desc("Weight for opcode embeddings"),
+                         cl::cat(IR2VecCategory));
+cl::opt<float> TypeWeight("ir2vec-type-weight", cl::Optional, cl::init(0.5),
+                          cl::desc("Weight for type embeddings"),
+                          cl::cat(IR2VecCategory));
+cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional, cl::init(0.2),
+                         cl::desc("Weight for argument embeddings"),
+                         cl::cat(IR2VecCategory));
+} // namespace ir2vec
+} // namespace llvm
 
 AnalysisKey IR2VecVocabAnalysis::Key;
 
@@ -251,9 +250,9 @@ bool IR2VecVocabResult::invalidate(
 // by auto-generating a default vocabulary during the build time.
 Error IR2VecVocabAnalysis::readVocabulary() {
   auto BufOrError = MemoryBuffer::getFileOrSTDIN(VocabFile, /*IsText=*/true);
-  if (!BufOrError) {
+  if (!BufOrError)
     return createFileError(VocabFile, BufOrError.getError());
-  }
+
   auto Content = BufOrError.get()->getBuffer();
   json::Path::Root Path("");
   Expected<json::Value> ParsedVocabValue = json::parse(Content);
@@ -261,39 +260,60 @@ Error IR2VecVocabAnalysis::readVocabulary() {
     return ParsedVocabValue.takeError();
 
   bool Res = json::fromJSON(*ParsedVocabValue, Vocabulary, Path);
-  if (!Res) {
+  if (!Res)
     return createStringError(errc::illegal_byte_sequence,
                              "Unable to parse the vocabulary");
-  }
-  assert(Vocabulary.size() > 0 && "Vocabulary is empty");
+
+  if (Vocabulary.empty())
+    return createStringError(errc::illegal_byte_sequence,
+                             "Vocabulary is empty");
 
   unsigned Dim = Vocabulary.begin()->second.size();
-  assert(Dim > 0 && "Dimension of vocabulary is zero");
-  (void)Dim;
-  assert(std::all_of(Vocabulary.begin(), Vocabulary.end(),
-                     [Dim](const std::pair<StringRef, Embedding> &Entry) {
-                       return Entry.second.size() == Dim;
-                     }) &&
-         "All vectors in the vocabulary are not of the same dimension");
+  if (Dim == 0)
+    return createStringError(errc::illegal_byte_sequence,
+                             "Dimension of vocabulary is zero");
+
+  if (!std::all_of(Vocabulary.begin(), Vocabulary.end(),
+                   [Dim](const std::pair<StringRef, Embedding> &Entry) {
+                     return Entry.second.size() == Dim;
+                   }))
+    return createStringError(
+        errc::illegal_byte_sequence,
+        "All vectors in the vocabulary are not of the same dimension");
+
   return Error::success();
 }
 
+IR2VecVocabAnalysis::IR2VecVocabAnalysis(const Vocab &Vocabulary)
+    : Vocabulary(Vocabulary) {}
+
+IR2VecVocabAnalysis::IR2VecVocabAnalysis(Vocab &&Vocabulary)
+    : Vocabulary(std::move(Vocabulary)) {}
+
+void IR2VecVocabAnalysis::emitError(Error Err, LLVMContext &Ctx) {
+  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
+    Ctx.emitError("Error reading vocabulary: " + EI.message());
+  });
+}
+
 IR2VecVocabAnalysis::Result
 IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
   auto Ctx = &M.getContext();
+  // FIXME: Scale the vocabulary once. This would avoid scaling per use later.
+  // If vocabulary is already populated by the constructor, use it.
+  if (!Vocabulary.empty())
+    return IR2VecVocabResult(std::move(Vocabulary));
+
+  // Otherwise, try to read from the vocabulary file.
   if (VocabFile.empty()) {
     // FIXME: Use default vocabulary
     Ctx->emitError("IR2Vec vocabulary file path not specified");
     return IR2VecVocabResult(); // Return invalid result
   }
   if (auto Err = readVocabulary()) {
-    handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-      Ctx->emitError("Error reading vocabulary: " + EI.message());
-    });
+    emitError(std::move(Err), *Ctx);
     return IR2VecVocabResult();
   }
-  // FIXME: Scale the vocabulary here once. This would avoid scaling per use
-  // later.
   return IR2VecVocabResult(std::move(Vocabulary));
 }
 
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 053b9f75e7a6..90d07d080443 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -281,25 +281,30 @@ TEST(IR2VecTest, IR2VecVocabResultValidity) {
   EXPECT_EQ(validResult.getDimension(), 2u);
 }
 
-// Helper to create a minimal function and embedder for getter tests
-struct GetterTestEnv {
-  Vocab V = {};
+// Fixture for IR2Vec tests requiring IR setup and weight management.
+class IR2VecTestFixture : public ::testing::Test {
+protected:
+  Vocab V;
   LLVMContext Ctx;
-  std::unique_ptr<Module> M = nullptr;
+  std::unique_ptr<Module> M;
   Function *F = nullptr;
   BasicBlock *BB = nullptr;
-  Instruction *Add = nullptr;
-  Instruction *Ret = nullptr;
-  std::unique_ptr<Embedder> Emb = nullptr;
+  Instruction *AddInst = nullptr;
+  Instruction *RetInst = nullptr;
 
-  GetterTestEnv() {
+  float OriginalOpcWeight = ::OpcWeight;
+  float OriginalTypeWeight = ::TypeWeight;
+  float OriginalArgWeight = ::ArgWeight;
+
+  void SetUp() override {
     V = {{"add", {1.0, 2.0}},
          {"integerTy", {0.5, 0.5}},
          {"constant", {0.2, 0.3}},
          {"variable", {0.0, 0.0}},
          {"unknownTy", {0.0, 0.0}}};
 
-    M = std::make_unique<Module>("M", Ctx);
+    // Setup IR
+    M = std::make_unique<Module>("TestM", Ctx);
     FunctionType *FTy = FunctionType::get(
         Type::getInt32Ty(Ctx), {Type::getInt32Ty(Ctx), Type::getInt32Ty(Ctx)},
         false);
@@ -308,61 +313,82 @@ struct GetterTestEnv {
     Argument *Arg = F->getArg(0);
     llvm::Value *Const = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
 
-    Add = BinaryOperator::CreateAdd(Arg, Const, "add", BB);
-    Ret = ReturnInst::Create(Ctx, Add, BB);
+    AddInst = BinaryOperator::CreateAdd(Arg, Const, "add", BB);
+    RetInst = ReturnInst::Create(Ctx, AddInst, BB);
+  }
 
-    auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
-    EXPECT_TRUE(static_cast<bool>(Result));
-    Emb = std::move(*Result);
+  void setWeights(float OpcWeight, float TypeWeight, float ArgWeight) {
+    ::OpcWeight = OpcWeight;
+    ::TypeWeight = TypeWeight;
+    ::ArgWeight = ArgWeight;
+  }
+
+  void TearDown() override {
+    // Restore original global weights
+    ::OpcWeight = OriginalOpcWeight;
+    ::TypeWeight = OriginalTypeWeight;
+    ::ArgWeight = OriginalArgWeight;
   }
 };
 
-TEST(IR2VecTest, GetInstVecMap) {
-  GetterTestEnv Env;
-  const auto &InstMap = Env.Emb->getInstVecMap();
+TEST_F(IR2VecTestFixture, GetInstVecMap) {
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &InstMap = Emb->getInstVecMap();
 
   EXPECT_EQ(InstMap.size(), 2u);
-  EXPECT_TRUE(InstMap.count(Env.Add));
-  EXPECT_TRUE(InstMap.count(Env.Ret));
+  EXPECT_TRUE(InstMap.count(AddInst));
+  EXPECT_TRUE(InstMap.count(RetInst));
 
-  EXPECT_EQ(InstMap.at(Env.Add).size(), 2u);
-  EXPECT_EQ(InstMap.at(Env.Ret).size(), 2u);
+  EXPECT_EQ(InstMap.at(AddInst).size(), 2u);
+  EXPECT_EQ(InstMap.at(RetInst).size(), 2u);
 
   // Check values for add: {1.29, 2.31}
-  EXPECT_THAT(InstMap.at(Env.Add),
+  EXPECT_THAT(InstMap.at(AddInst),
               ElementsAre(DoubleNear(1.29, 1e-6), DoubleNear(2.31, 1e-6)));
 
   // Check values for ret: {0.0, 0.}; Neither ret nor voidTy are present in
   // vocab
-  EXPECT_THAT(InstMap.at(Env.Ret), ElementsAre(0.0, 0.0));
+  EXPECT_THAT(InstMap.at(RetInst), ElementsAre(0.0, 0.0));
 }
 
-TEST(IR2VecTest, GetBBVecMap) {
-  GetterTestEnv Env;
-  const auto &BBMap = Env.Emb->getBBVecMap();
+TEST_F(IR2VecTestFixture, GetBBVecMap) {
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &BBMap = Emb->getBBVecMap();
 
   EXPECT_EQ(BBMap.size(), 1u);
-  EXPECT_TRUE(BBMap.count(Env.BB));
-  EXPECT_EQ(BBMap.at(Env.BB).size(), 2u);
+  EXPECT_TRUE(BBMap.count(BB));
+  EXPECT_EQ(BBMap.at(BB).size(), 2u);
 
   // BB vector should be sum of add and ret: {1.29, 2.31} + {0.0, 0.0} =
   // {1.29, 2.31}
-  EXPECT_THAT(BBMap.at(Env.BB),
+  EXPECT_THAT(BBMap.at(BB),
               ElementsAre(DoubleNear(1.29, 1e-6), DoubleNear(2.31, 1e-6)));
 }
 
-TEST(IR2VecTest, GetBBVector) {
-  GetterTestEnv Env;
-  const auto &BBVec = Env.Emb->getBBVector(*Env.BB);
+TEST_F(IR2VecTestFixture, GetBBVector) {
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &BBVec = Emb->getBBVector(*BB);
 
   EXPECT_EQ(BBVec.size(), 2u);
   EXPECT_THAT(BBVec,
               ElementsAre(DoubleNear(1.29, 1e-6), DoubleNear(2.31, 1e-6)));
 }
 
-TEST(IR2VecTest, GetFunctionVector) {
-  GetterTestEnv Env;
-  const auto &FuncVec = Env.Emb->getFunctionVector();
+TEST_F(IR2VecTestFixture, GetFunctionVector) {
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &FuncVec = Emb->getFunctionVector();
 
   EXPECT_EQ(FuncVec.size(), 2u);
 
@@ -371,4 +397,45 @@ TEST(IR2VecTest, GetFunctionVector) {
               ElementsAre(DoubleNear(1.29, 1e-6), DoubleNear(2.31, 1e-6)));
 }
 
+TEST_F(IR2VecTestFixture, GetFunctionVectorWithCustomWeights) {
+  setWeights(1.0, 1.0, 1.0);
+
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &FuncVec = Emb->getFunctionVector();
+
+  EXPECT_EQ(FuncVec.size(), 2u);
+
+  // Expected: 1*([1.0 2.0] + [0.0 0.0]) + 1*([0.5 0.5] + [0.0 0.0]) + 1*([0.2
+  // 0.3] + [0.0 0.0])
+  EXPECT_THAT(FuncVec,
+              ElementsAre(DoubleNear(1.7, 1e-6), DoubleNear(2.8, 1e-6)));
+}
+
+TEST(IR2VecTest, IR2VecVocabAnalysisWithPrepopulatedVocab) {
+  Vocab InitialVocab = {{"key1", {1.1, 2.2}}, {"key2", {3.3, 4.4}}};
+  Vocab ExpectedVocab = InitialVocab;
+  unsigned ExpectedDim = InitialVocab.begin()->second.size();
+
+  IR2VecVocabAnalysis VocabAnalysis(std::move(InitialVocab));
+
+  LLVMContext TestCtx;
+  Module TestMod("TestModuleForVocabAnalysis", TestCtx);
+  ModuleAnalysisManager MAM;
+  IR2VecVocabResult Result = VocabAnalysis.run(TestMod, MAM);
+
+  EXPECT_TRUE(Result.isValid());
+  ASSERT_FALSE(Result.getVocabulary().empty());
+  EXPECT_EQ(Result.getDimension(), ExpectedDim);
+
+  const auto &ResultVocab = Result.getVocabulary();
+  EXPECT_EQ(ResultVocab.size(), ExpectedVocab.size());
+  for (const auto &pair : ExpectedVocab) {
+    EXPECT_TRUE(ResultVocab.count(pair.first));
+    EXPECT_THAT(ResultVocab.at(pair.first), ElementsAreArray(pair.second));
+  }
+}
+
 } // end anonymous namespace

From 9d49b82de077c730d687593604dfa00770f11965 Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig <naveen.hanig@outlook.com>
Date: Fri, 13 Jun 2025 19:48:05 +0200
Subject: [PATCH 0322/1322] [clang-scan-deps] Implement P2223R2 for
 DependencyDirectiveScanner.cpp (#143950)

P2223R2 allows the line-continuation slash `\` to be followed by
additional whitespace. The Clang lexer already follows this behavior,
also for versions prior to C++23. The dependency directive scanner
however only implements it for `#define` directives (15d5f5d).

This fully implements P2223R2 for the dependency directive scanner (for
any C++ standard) and aligns the dependency directive scanner's splicing
behavior with that of the Clang lexer.

For example, the following code was previously not scanned correctly by
`clang-scan-deps` but now works as expected:

```cpp
import \<whitespace here>
A;
```
---
 clang/lib/Lex/DependencyDirectivesScanner.cpp | 32 +++++--
 .../Lex/DependencyDirectivesScannerTest.cpp   | 91 +++++++++++++++++++
 2 files changed, 113 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 4606b85d42fe..1b6b16c56114 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -323,10 +323,6 @@ static unsigned skipNewline(const char *&First, const char *End) {
   return Len;
 }
 
-static bool wasLineContinuation(const char *First, unsigned EOLLen) {
-  return *(First - (int)EOLLen - 1) == '\\';
-}
-
 static void skipToNewlineRaw(const char *&First, const char *const End) {
   for (;;) {
     if (First == End)
@@ -336,13 +332,16 @@ static void skipToNewlineRaw(const char *&First, const char *const End) {
     if (Len)
       return;
 
+    char LastNonWhitespace = ' ';
     do {
+      if (!isHorizontalWhitespace(*First))
+        LastNonWhitespace = *First;
       if (++First == End)
         return;
       Len = isEOL(First, End);
     } while (!Len);
 
-    if (First[-1] != '\\')
+    if (LastNonWhitespace != '\\')
       return;
 
     First += Len;
@@ -394,6 +393,7 @@ static bool isQuoteCppDigitSeparator(const char *const Start,
 }
 
 void Scanner::skipLine(const char *&First, const char *const End) {
+  char LastNonWhitespace = ' ';
   for (;;) {
     assert(First <= End);
     if (First == End)
@@ -419,6 +419,8 @@ void Scanner::skipLine(const char *&First, const char *const End) {
       // Iterate over comments correctly.
       if (*First != '/' || End - First < 2) {
         LastTokenPtr = First;
+        if (!isWhitespace(*First))
+          LastNonWhitespace = *First;
         ++First;
         continue;
       }
@@ -431,6 +433,8 @@ void Scanner::skipLine(const char *&First, const char *const End) {
 
       if (First[1] != '*') {
         LastTokenPtr = First;
+        if (!isWhitespace(*First))
+          LastNonWhitespace = *First;
         ++First;
         continue;
       }
@@ -442,8 +446,9 @@ void Scanner::skipLine(const char *&First, const char *const End) {
       return;
 
     // Skip over the newline.
-    unsigned Len = skipNewline(First, End);
-    if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
+    skipNewline(First, End);
+
+    if (LastNonWhitespace != '\\')
       break;
   }
 }
@@ -468,9 +473,16 @@ static void skipWhitespace(const char *&First, const char *const End) {
     if (End - First < 2)
       return;
 
-    if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
-      skipNewline(++First, End);
-      continue;
+    if (*First == '\\') {
+      const char *Ptr = First + 1;
+      while (Ptr < End && isHorizontalWhitespace(*Ptr))
+        ++Ptr;
+      if (Ptr != End && isVerticalWhitespace(*Ptr)) {
+        skipNewline(Ptr, End);
+        First = Ptr;
+        continue;
+      }
+      return;
     }
 
     // Check for a non-comment character.
diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
index 377c066f031d..61f74929c1e9 100644
--- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -789,6 +789,97 @@ TEST(MinimizeSourceToDependencyDirectivesTest,
                Out.data());
 }
 
+TEST(MinimizeSourceToDependencyDirectivesTest,
+     WhitespaceAfterLineContinuationSlashLineComment) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("// some comment \\  \n"
+                                                    "module A;\n",
+                                                    Out));
+  EXPECT_STREQ("", Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest,
+     WhitespaceAfterLineContinuationSlashAllDirectives) {
+  SmallVector<char, 512> Out;
+  SmallVector<dependency_directives_scan::Token, 16> Tokens;
+  SmallVector<Directive, 16> Directives;
+
+  StringRef Input = "#define \\   \n"
+                    "A\n"
+                    "#undef\t\\   \n"
+                    "A\n"
+                    "#endif \\\t\t\n"
+                    "\n"
+                    "#if \\     \t\n"
+                    "A\n"
+                    "#ifdef\t\\   \n"
+                    "A\n"
+                    "#ifndef \\ \t\n"
+                    "A\n"
+                    "#elifdef \\  \n"
+                    "A\n"
+                    "#elifndef \\ \n"
+                    "A\n"
+                    "#elif \\\t\t \n"
+                    "A\n"
+                    "#else \\\t \t\n"
+                    "\n"
+                    "#include \\  \n"
+                    "<A>\n"
+                    "#include_next \\    \n"
+                    "<A>\n"
+                    "#__include_macros\\ \n"
+                    "<A>\n"
+                    "#import \\ \t\n"
+                    "<A>\n"
+                    "@import \\\t \n"
+                    "A;\n"
+                    "#pragma clang \\   \n"
+                    "module \\    \n"
+                    "import A\n"
+                    "#pragma \\   \n"
+                    "push_macro(A)\n"
+                    "#pragma \\\t \n"
+                    "pop_macro(A)\n"
+                    "#pragma \\   \n"
+                    "include_alias(<A>,\\ \n"
+                    "<B>)\n"
+                    "export \\    \n"
+                    "module m;\n"
+                    "import\t\\\t \n"
+                    "m;\n"
+                    "#pragma\t\\  \n"
+                    "clang\t\\  \t\n"
+                    "system_header\n";
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives(Input, Out, Tokens, Directives));
+
+  EXPECT_EQ(pp_define, Directives[0].Kind);
+  EXPECT_EQ(pp_undef, Directives[1].Kind);
+  EXPECT_EQ(pp_endif, Directives[2].Kind);
+  EXPECT_EQ(pp_if, Directives[3].Kind);
+  EXPECT_EQ(pp_ifdef, Directives[4].Kind);
+  EXPECT_EQ(pp_ifndef, Directives[5].Kind);
+  EXPECT_EQ(pp_elifdef, Directives[6].Kind);
+  EXPECT_EQ(pp_elifndef, Directives[7].Kind);
+  EXPECT_EQ(pp_elif, Directives[8].Kind);
+  EXPECT_EQ(pp_else, Directives[9].Kind);
+  EXPECT_EQ(pp_include, Directives[10].Kind);
+  EXPECT_EQ(pp_include_next, Directives[11].Kind);
+  EXPECT_EQ(pp___include_macros, Directives[12].Kind);
+  EXPECT_EQ(pp_import, Directives[13].Kind);
+  EXPECT_EQ(decl_at_import, Directives[14].Kind);
+  EXPECT_EQ(pp_pragma_import, Directives[15].Kind);
+  EXPECT_EQ(pp_pragma_push_macro, Directives[16].Kind);
+  EXPECT_EQ(pp_pragma_pop_macro, Directives[17].Kind);
+  EXPECT_EQ(pp_pragma_include_alias, Directives[18].Kind);
+  EXPECT_EQ(cxx_export_module_decl, Directives[19].Kind);
+  EXPECT_EQ(cxx_import_decl, Directives[20].Kind);
+  EXPECT_EQ(pp_pragma_system_header, Directives[21].Kind);
+  EXPECT_EQ(pp_eof, Directives[22].Kind);
+}
+
 TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) {
   SmallVector<char, 128> Out;
 

From 92a116c4ef822950f8c57eaa5164c844c73a1f7e Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Fri, 13 Jun 2025 10:48:34 -0700
Subject: [PATCH 0323/1322] Revert "Fix/reapply "[libc] Migrate stdio tests to
 ErrnoCheckingTest."" (#144129)

Reverts llvm/llvm-project#143972 - matcher seems to be pedantic for
fgets tests, reverting to verify and fix.
---
 libc/test/src/stdio/CMakeLists.txt           | 13 ------------
 libc/test/src/stdio/fdopen_test.cpp          | 14 +++++++------
 libc/test/src/stdio/fgetc_test.cpp           | 22 +++++++++-----------
 libc/test/src/stdio/fgetc_unlocked_test.cpp  | 22 +++++++++-----------
 libc/test/src/stdio/fgets_test.cpp           | 18 +++++++---------
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++++++++++-----
 libc/test/src/stdio/fopencookie_test.cpp     | 15 ++++++-------
 libc/test/src/stdio/remove_test.cpp          | 10 ++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 ++++----
 libc/test/src/stdio/setvbuf_test.cpp         |  9 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 ++++---
 libc/test/src/stdlib/StrtolTest.h            |  1 +
 libc/test/src/stdlib/strtold_test.cpp        |  1 +
 13 files changed, 77 insertions(+), 84 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 4aa8b9588001..ce2171f19597 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,7 +20,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -69,7 +68,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -90,7 +88,6 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -112,7 +109,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -442,7 +438,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -457,7 +452,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -474,7 +468,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -495,8 +488,6 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
-    libc.test.UnitTest.ErrnoCheckingTest
-    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -519,8 +510,6 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
-    libc.test.UnitTest.ErrnoCheckingTest
-    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -538,8 +527,6 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
-    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index b53184c30be3..104fc478b100 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,21 +9,20 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -53,7 +52,8 @@ TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
+TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,7 +64,8 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
+TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -82,6 +83,7 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
+  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index be2e50271b51..56bde5f0099a 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,15 +14,12 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
-#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
-
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -30,28 +27,29 @@ public:
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
-                Succeeds(WRITE_SIZE));
+    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
     // This is a write-only file so reads should fail.
-    ASSERT_THAT(func(file), Fails(EBADF, EOF));
+    ASSERT_EQ(func(file), EOF);
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    libc_errno = 0;
 
-    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
+      int c = func(file);
+      ASSERT_EQ(c, int('1' + i));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_THAT(func(file), Succeeds(EOF));
+    ASSERT_EQ(func(file), EOF);
     ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
   }
 };
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index bef9dafd3d87..90429ecf4e82 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,15 +17,12 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
-#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
-
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -33,30 +30,31 @@ public:
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
-                Succeeds(WRITE_SIZE));
+    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
     // This is a write-only file so reads should fail.
-    ASSERT_THAT(func(file), Fails(EBADF, EOF));
+    ASSERT_EQ(func(file), EOF);
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    libc_errno = 0;
 
-    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     LIBC_NAMESPACE::flockfile(file);
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
+      int c = func(file);
+      ASSERT_EQ(c, int('1' + i));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_THAT(func(file), Succeeds(EOF));
+    ASSERT_EQ(func(file), EOF);
     ASSERT_NE(LIBC_NAMESPACE::feof_unlocked(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(file), 0);
 
     LIBC_NAMESPACE::funlockfile(file);
-    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
   }
 };
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index ca8d4d454663..abed3d405293 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,14 +12,11 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
-#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+#include "src/__support/libc_errno.h"
 
-TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -32,15 +29,15 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   char buff[8];
   char *output;
 
-  ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
-              Succeeds(WRITE_SIZE));
+  ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
   // This is a write-only file so reads should fail.
-  ASSERT_THAT(LIBC_NAMESPACE::fgets(buff, 8, file), Fails(EBADF, nullptr));
+  ASSERT_TRUE(LIBC_NAMESPACE::fgets(buff, 8, file) == nullptr);
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
-  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
   file = LIBC_NAMESPACE::fopen(FILENAME, "r");
   ASSERT_FALSE(file == nullptr);
@@ -58,7 +55,6 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is also implementation defined.
   output = LIBC_NAMESPACE::fgets(buff, 0, file);
   ASSERT_TRUE(output == nullptr);
-  ASSERT_ERRNO_SUCCESS();
 #endif
 
   const char *output_arr[] = {
@@ -90,5 +86,5 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_ERRNO_SUCCESS();
 
-  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 }
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e097785832d5..e624181c795b 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,18 +17,17 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
+TEST(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -42,6 +41,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,6 +72,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -79,12 +80,15 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -99,8 +103,10 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -115,18 +121,21 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
+  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST_F(LlvmLibcFILETest, FFlush) {
+TEST(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -147,7 +156,7 @@ TEST_F(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -156,6 +165,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
+  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index bcf5e674141a..03e1ac286b64 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,7 +15,6 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -23,7 +22,6 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
-using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -90,7 +88,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -117,6 +115,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -125,7 +124,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -150,6 +149,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,6 +178,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -191,7 +192,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
+TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -222,7 +223,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
+TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 296bff1f5dc1..84984e26398c 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,17 +11,16 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
+#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -37,9 +36,10 @@ TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index 135fb98c07fb..ac494a4ecaf8 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,19 +8,18 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
+TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -41,7 +40,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
+TEST(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index a0936ba79ef7..5872943c1bb4 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,14 +11,12 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
+TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -54,7 +52,7 @@ TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -104,5 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
+  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index e99b382d1211..5d482b70064b 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,12 +15,11 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+#include "src/__support/libc_errno.h"
 
-TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -37,6 +36,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,6 +57,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 03f0a6539c78..3eeccc5727e7 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,6 +9,7 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index eb4056dc7ba6..c2f2b9c9a11c 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 452276ecc0f5d1cb9bf5e1655e422a68eafdb8b9 Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Fri, 13 Jun 2025 11:00:08 -0700
Subject: [PATCH 0324/1322] [libc] Fix missing errno include in fuzzer
 (#144132)

The printf parser uses errno for setting up the %m conversion. It was
presumably getting this include indirectly until a recent change. This
patch adds a direct dependency to fix it.
---
 libc/fuzzing/stdio/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libc/fuzzing/stdio/CMakeLists.txt b/libc/fuzzing/stdio/CMakeLists.txt
index 8f89baa70200..401785a30469 100644
--- a/libc/fuzzing/stdio/CMakeLists.txt
+++ b/libc/fuzzing/stdio/CMakeLists.txt
@@ -4,6 +4,7 @@ add_libc_fuzzer(
     printf_parser_fuzz.cpp
   DEPENDS
     libc.src.stdio.printf_core.parser
+    libc.src.errno.errno # needed for the strerror conversion
 )
 
 add_libc_fuzzer(

From 0c7ce6883a04dadd9daf0d41cba58c2f9eec19ad Mon Sep 17 00:00:00 2001
From: Charitha Saumya <136391709+charithaintc@users.noreply.github.com>
Date: Fri, 13 Jun 2025 11:02:05 -0700
Subject: [PATCH 0325/1322] Revert "[mlir][vector] Fix for WarpOpScfForOp
 failure when scf.for has results that are unused." (#144124)

Reverts llvm/llvm-project#141853

Reverting the bug fix because it does not handle all cases correctly.
---
 .../Vector/Transforms/VectorDistribute.cpp    | 39 +++++--------------
 .../Vector/vector-warp-distribute.mlir        | 36 -----------------
 2 files changed, 10 insertions(+), 65 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 52a9cedb43cc..045c192787f1 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1554,36 +1554,22 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
     llvm::SmallSetVector<Value, 32> escapingValues;
     SmallVector<Type> inputTypes;
     SmallVector<Type> distTypes;
-    auto collectEscapingValues = [&](Value value) {
-      if (!escapingValues.insert(value))
-        return;
-      Type distType = value.getType();
-      if (auto vecType = dyn_cast<VectorType>(distType)) {
-        AffineMap map = distributionMapFn(value);
-        distType = getDistributedType(vecType, map, warpOp.getWarpSize());
-      }
-      inputTypes.push_back(value.getType());
-      distTypes.push_back(distType);
-    };
-
     mlir::visitUsedValuesDefinedAbove(
         forOp.getBodyRegion(), [&](OpOperand *operand) {
           Operation *parent = operand->get().getParentRegion()->getParentOp();
           if (warpOp->isAncestor(parent)) {
-            collectEscapingValues(operand->get());
+            if (!escapingValues.insert(operand->get()))
+              return;
+            Type distType = operand->get().getType();
+            if (auto vecType = dyn_cast<VectorType>(distType)) {
+              AffineMap map = distributionMapFn(operand->get());
+              distType = getDistributedType(vecType, map, warpOp.getWarpSize());
+            }
+            inputTypes.push_back(operand->get().getType());
+            distTypes.push_back(distType);
           }
         });
 
-    // Any forOp result that is not already yielded by the warpOp
-    // region is also considered escaping and must be returned by the
-    // original warpOp.
-    for (OpResult forResult : forOp.getResults()) {
-      // Check if this forResult is already yielded by the yield op.
-      if (llvm::is_contained(yield->getOperands(), forResult))
-        continue;
-      collectEscapingValues(forResult);
-    }
-
     if (llvm::is_contained(distTypes, Type{}))
       return failure();
 
@@ -1623,12 +1609,7 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
                                     forOp.getResultTypes().end());
     llvm::SmallDenseMap<Value, int64_t> argIndexMapping;
     for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) {
-      auto newWarpResult = newWarpOp.getResult(retIdx);
-      // Unused forOp results yielded by the warpOp region are already included
-      // in the new ForOp.
-      if (llvm::is_contained(newOperands, newWarpResult))
-        continue;
-      warpInput.push_back(newWarpResult);
+      warpInput.push_back(newWarpOp.getResult(retIdx));
       argIndexMapping[escapingValues[i]] = warpInputType.size();
       warpInputType.push_back(inputTypes[i]);
     }
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 6c7ac7a5196a..38771f259344 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -584,42 +584,6 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2
   return
 }
 
-// -----
-// CHECK-PROP-LABEL: func.func @warp_scf_for_unused_yield(
-//       CHECK-PROP: %[[W0:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
-//       CHECK-PROP: %[[INI0:.*]] = "some_def"() : () -> vector<128xf32>
-//       CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
-//       CHECK-PROP: gpu.yield %[[INI0]], %[[INI1]] : vector<128xf32>, vector<128xf32>
-//       CHECK-PROP: }
-//       CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1) -> (vector<4xf32>, vector<4xf32>) {
-//       CHECK-PROP: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) {
-//       CHECK-PROP: %[[ACC0:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>, index) -> vector<128xf32>
-//       CHECK-PROP: %[[ACC1:.*]] = "some_def"(%{{.*}}) : (index, vector<128xf32>, vector<128xf32>) -> vector<128xf32>
-//       CHECK-PROP: gpu.yield %[[ACC1]], %[[ACC0]] : vector<128xf32>, vector<128xf32>
-//       CHECK-PROP: }
-//       CHECK-PROP: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<4xf32>, vector<4xf32>
-//       CHECK-PROP: }
-//       CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> ()
-func.func @warp_scf_for_unused_yield(%arg0: index) {
-  %c128 = arith.constant 128 : index
-  %c1 = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-  %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
-    %ini = "some_def"() : () -> (vector<128xf32>)
-    %ini1 = "some_def"() : () -> (vector<128xf32>)
-    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini, %arg5 = %ini1) -> (vector<128xf32>, vector<128xf32>) {
-      %add = arith.addi %arg3, %c1 : index
-      %1  = "some_def"(%arg5, %add) : (vector<128xf32>, index) -> (vector<128xf32>)
-      %acc = "some_def"(%add, %arg4, %1) : (index, vector<128xf32>, vector<128xf32>) -> (vector<128xf32>)
-      scf.yield %acc, %1 : vector<128xf32>, vector<128xf32>
-    }
-    gpu.yield %3#0 : vector<128xf32>
-  }
-  "some_use"(%0) : (vector<4xf32>) -> ()
-  return
-}
-
-
 // -----
 
 // CHECK-PROP-LABEL: func @vector_reduction(

From f82cf7442029d3376813db82eca60800e999bfb9 Mon Sep 17 00:00:00 2001
From: Artem Gindinson <gindinson@roofline.ai>
Date: Fri, 13 Jun 2025 20:03:24 +0200
Subject: [PATCH 0326/1322] =?UTF-8?q?[mlir][tensor]=20Fix=20`getReassociat?=
 =?UTF-8?q?ionForCollapse`=20for=20tensor/scalar=20re=E2=80=A6=20(#144118)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…shapes

Commit 6e5a142 changed the behavior of the function when computing
reassociations between tensors (consisting of unit/dynamic dimensions)
and scalars/0d vectors. The IR representation for such reshapes actually
expects an empty reassociation, like so:
```
func.func @example(%arg0 : tensor<?x?x?xf32>) -> tensor<f32> {
  %0 = tensor.collapse_shape %arg0 [] : tensor<?x?x?xf32> into tensor<f32>
}
```

Restore the original behavior - the routine should resort to reporting
failures when compile time-known non-unit dimensions are part of the
attempted reassociation.

Signed-off-by: Artem Gindinson <gindinson@roofline.ai>
---
 mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp           | 10 ++++------
 mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp |  8 ++++----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
index 3b1fdb69e8ef..aa566c0086a2 100644
--- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
@@ -299,19 +299,17 @@ mlir::getReassociationIndicesForCollapse(ArrayRef<int64_t> sourceShape,
   // this utility).
   if (numSourceDims <= numTargetDims)
     return std::nullopt;
-  // Early handling for scalar target types.
+  // Early handling for scalar target types. We should report an invalid
+  // reassociation for non-unit static dimensions - no chance to collapse these
+  // into a scalar.
   if (numTargetDims == 0) {
-    ReassociationIndices allSourceIndices;
-    allSourceIndices.reserve(numSourceDims);
     for (unsigned sourceDimIdx = 0; sourceDimIdx < numSourceDims;
          ++sourceDimIdx) {
       int64_t sourceSize = sourceShape[sourceDimIdx];
-      // All source dimensions must be unit or dynamic.
       if (sourceSize != 1 && sourceSize != ShapedType::kDynamic)
         return std::nullopt;
-      allSourceIndices.push_back(sourceDimIdx);
     }
-    return SmallVector<ReassociationIndices>{allSourceIndices};
+    return SmallVector<ReassociationIndices>{};
   }
 
   // Collect source ranges by iterating over the target shape left-to-right.
diff --git a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
index db1a87a4de2d..05f97e875e2d 100644
--- a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
+++ b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
@@ -23,16 +23,16 @@ makeOptionalIndices(std::initializer_list<ReassociationIndices> list) {
 
 TEST(ReassociationIndicesForCollapse, ScalarTest) {
   EXPECT_EQ(getReassociationIndicesForCollapse({1}, {}),
-            makeOptionalIndices({{0}}));
+            makeOptionalIndices({}));
   EXPECT_EQ(getReassociationIndicesForCollapse({1, 1}, {}),
-            makeOptionalIndices({{0, 1}}));
+            makeOptionalIndices({}));
   EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic}, {}),
-            makeOptionalIndices({{0}}));
+            makeOptionalIndices({}));
   EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic,
                                                 ShapedType::kDynamic, 1,
                                                 ShapedType::kDynamic},
                                                {}),
-            makeOptionalIndices({{0, 1, 2, 3, 4}}));
+            makeOptionalIndices({}));
 }
 
 TEST(ReassociationIndicesForCollapse, ScalarTestFailure) {

From 52d34865b9db3485c8a671a88cc571270349f720 Mon Sep 17 00:00:00 2001
From: FYK <fanju110@163.com>
Date: Sat, 14 Jun 2025 02:05:16 +0800
Subject: [PATCH 0327/1322] Fix and reapply IR PGO support for Flang (#142892)

This PR resubmits the changes from #136098, which was previously
reverted due to a build failure during the linking stage:

```
undefined reference to `llvm::DebugInfoCorrelate'
undefined reference to `llvm::ProfileCorrelate'
```

The root cause was that `llvm/lib/Frontend/Driver/CodeGenOptions.cpp`
references symbols from the `Instrumentation` component, but the
`LINK_COMPONENTS` in the `llvm/lib/Frontend/CMakeLists.txt` for
`LLVMFrontendDriver` did not include it. As a result, linking failed in
configurations where these components were not transitively linked.

### Fix:

This updated patch explicitly adds `Instrumentation` to
`LINK_COMPONENTS` in the relevant `llvm/lib/Frontend/CMakeLists.txt`
file to ensure the required symbols are properly resolved.

---------

Co-authored-by: ict-ql <168183727+ict-ql@users.noreply.github.com>
Co-authored-by: Chyaka <52224511+liliumshade@users.noreply.github.com>
Co-authored-by: Tarun Prabhu <tarunprabhu@gmail.com>
---
 clang/include/clang/Basic/CodeGenOptions.def  |  6 ++-
 clang/include/clang/Basic/CodeGenOptions.h    | 32 +++++++---------
 clang/include/clang/Basic/ProfileList.h       |  9 ++---
 clang/include/clang/Driver/Options.td         |  6 +--
 clang/lib/Basic/ProfileList.cpp               | 22 +++++------
 clang/lib/CodeGen/BackendUtil.cpp             |  9 +----
 clang/lib/CodeGen/CodeGenAction.cpp           |  4 +-
 clang/lib/CodeGen/CodeGenFunction.cpp         |  3 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |  2 +-
 clang/lib/Driver/ToolChains/Flang.cpp         |  4 ++
 clang/lib/Frontend/CompilerInvocation.cpp     |  6 +--
 .../include/flang/Frontend/CodeGenOptions.def |  7 ++++
 flang/include/flang/Frontend/CodeGenOptions.h | 38 +++++++++++++++++++
 flang/lib/Frontend/CompilerInvocation.cpp     | 10 +++++
 flang/lib/Frontend/FrontendActions.cpp        | 26 +++++++++++++
 flang/test/Driver/flang-f-opts.f90            |  5 +++
 .../Inputs/gcc-flag-compatibility_IR.proftext | 18 +++++++++
 .../gcc-flag-compatibility_IR_entry.proftext  | 11 ++++++
 flang/test/Profile/gcc-flag-compatibility.f90 | 32 ++++++++++++++++
 .../llvm/Frontend/Driver/CodeGenOptions.h     | 13 +++++++
 llvm/lib/Frontend/Driver/CMakeLists.txt       |  1 +
 llvm/lib/Frontend/Driver/CodeGenOptions.cpp   | 13 +++++++
 22 files changed, 223 insertions(+), 54 deletions(-)
 create mode 100644 flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext
 create mode 100644 flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext
 create mode 100644 flang/test/Profile/gcc-flag-compatibility.f90

diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index fa9474d63ae4..2a30ff11464d 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -223,9 +223,11 @@ AFFECTING_VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os (==1) or -Oz (==2) is
 CODEGENOPT(AtomicProfileUpdate , 1, 0) ///< Set -fprofile-update=atomic
 CODEGENOPT(ContinuousProfileSync, 1, 0) ///< Enable continuous instrumentation profiling
 /// Choose profile instrumenation kind or no instrumentation.
-ENUM_CODEGENOPT(ProfileInstr, ProfileInstrKind, 4, ProfileNone)
+
+ENUM_CODEGENOPT(ProfileInstr, llvm::driver::ProfileInstrKind, 4, llvm::driver::ProfileInstrKind::ProfileNone)
+
 /// Choose profile kind for PGO use compilation.
-ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone)
+ENUM_CODEGENOPT(ProfileUse, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone)
 /// Partition functions into N groups and select only functions in group i to be
 /// instrumented. Selected group numbers can be 0 to N-1 inclusive.
 VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1)
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index a77232c281f7..7ba21fca6dd6 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -80,16 +80,6 @@ public:
     SRCK_InRegs    // Small structs in registers (-freg-struct-return).
   };
 
-  enum ProfileInstrKind {
-    ProfileNone,       // Profile instrumentation is turned off.
-    ProfileClangInstr, // Clang instrumentation to generate execution counts
-                       // to use with PGO.
-    ProfileIRInstr,    // IR level PGO instrumentation in LLVM.
-    ProfileCSIRInstr, // IR level PGO context sensitive instrumentation in LLVM.
-    ProfileIRSampleColdCov, // IR level sample pgo based cold function coverage
-                            // instrumentation in LLVM.
-  };
-
   enum EmbedBitcodeKind {
     Embed_Off,      // No embedded bitcode.
     Embed_All,      // Embed both bitcode and commandline in the output.
@@ -522,35 +512,41 @@ public:
 
   /// Check if Clang profile instrumenation is on.
   bool hasProfileClangInstr() const {
-    return getProfileInstr() == ProfileClangInstr;
+    return getProfileInstr() ==
+           llvm::driver::ProfileInstrKind::ProfileClangInstr;
   }
 
   /// Check if IR level profile instrumentation is on.
   bool hasProfileIRInstr() const {
-    return getProfileInstr() == ProfileIRInstr;
+    return getProfileInstr() == llvm::driver::ProfileInstrKind::ProfileIRInstr;
   }
 
   /// Check if CS IR level profile instrumentation is on.
   bool hasProfileCSIRInstr() const {
-    return getProfileInstr() == ProfileCSIRInstr;
+    return getProfileInstr() ==
+           llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
   }
 
   /// Check if any form of instrumentation is on.
-  bool hasProfileInstr() const { return getProfileInstr() != ProfileNone; }
+  bool hasProfileInstr() const {
+    return getProfileInstr() != llvm::driver::ProfileInstrKind::ProfileNone;
+  }
 
   /// Check if Clang profile use is on.
   bool hasProfileClangUse() const {
-    return getProfileUse() == ProfileClangInstr;
+    return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileClangInstr;
   }
 
   /// Check if IR level profile use is on.
   bool hasProfileIRUse() const {
-    return getProfileUse() == ProfileIRInstr ||
-           getProfileUse() == ProfileCSIRInstr;
+    return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileIRInstr ||
+           getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
   }
 
   /// Check if CSIR profile use is on.
-  bool hasProfileCSIRUse() const { return getProfileUse() == ProfileCSIRInstr; }
+  bool hasProfileCSIRUse() const {
+    return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
+  }
 
   /// Check if type and variable info should be emitted.
   bool hasReducedDebugInfo() const {
diff --git a/clang/include/clang/Basic/ProfileList.h b/clang/include/clang/Basic/ProfileList.h
index b4217e49c18a..5338ef3992ad 100644
--- a/clang/include/clang/Basic/ProfileList.h
+++ b/clang/include/clang/Basic/ProfileList.h
@@ -49,17 +49,16 @@ public:
   ~ProfileList();
 
   bool isEmpty() const { return Empty; }
-  ExclusionType getDefault(CodeGenOptions::ProfileInstrKind Kind) const;
+  ExclusionType getDefault(llvm::driver::ProfileInstrKind Kind) const;
 
   std::optional<ExclusionType>
   isFunctionExcluded(StringRef FunctionName,
-                     CodeGenOptions::ProfileInstrKind Kind) const;
+                     llvm::driver::ProfileInstrKind Kind) const;
   std::optional<ExclusionType>
   isLocationExcluded(SourceLocation Loc,
-                     CodeGenOptions::ProfileInstrKind Kind) const;
+                     llvm::driver::ProfileInstrKind Kind) const;
   std::optional<ExclusionType>
-  isFileExcluded(StringRef FileName,
-                 CodeGenOptions::ProfileInstrKind Kind) const;
+  isFileExcluded(StringRef FileName, llvm::driver::ProfileInstrKind Kind) const;
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 152df89118a6..5951687b095e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1772,7 +1772,7 @@ def fmcdc_max_test_vectors_EQ : Joined<["-"], "fmcdc-max-test-vectors=">,
   HelpText<"Maximum number of test vectors in MC/DC coverage">,
   MarshallingInfoInt<CodeGenOpts<"MCDCMaxTVs">, "0x7FFFFFFE">;
 def fprofile_generate : Flag<["-"], "fprofile-generate">,
-    Group<f_Group>, Visibility<[ClangOption, CLOption]>,
+    Group<f_Group>, Visibility<[ClangOption, CLOption, FlangOption, FC1Option]>,
     HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
 def fprofile_generate_EQ : Joined<["-"], "fprofile-generate=">,
     Group<f_Group>, Visibility<[ClangOption, CLOption]>,
@@ -1789,7 +1789,7 @@ def fprofile_use : Flag<["-"], "fprofile-use">, Group<f_Group>,
     Visibility<[ClangOption, CLOption]>, Alias<fprofile_instr_use>;
 def fprofile_use_EQ : Joined<["-"], "fprofile-use=">,
     Group<f_Group>,
-    Visibility<[ClangOption, CLOption]>,
+    Visibility<[ClangOption, CLOption, FlangOption, FC1Option]>,
     MetaVarName<"<pathname>">,
     HelpText<"Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from <pathname>/default.profdata. Otherwise, it reads from file <pathname>.">;
 def fno_profile_instr_generate : Flag<["-"], "fno-profile-instr-generate">,
@@ -7761,7 +7761,7 @@ def fpatchable_function_entry_section_EQ
       MarshallingInfoString<CodeGenOpts<"PatchableFunctionEntrySection">>;
 def fprofile_instrument_EQ : Joined<["-"], "fprofile-instrument=">,
     HelpText<"Enable PGO instrumentation">, Values<"none,clang,llvm,csllvm,sample-coldcov">,
-    NormalizedValuesScope<"CodeGenOptions">,
+    NormalizedValuesScope<"llvm::driver::ProfileInstrKind">,
     NormalizedValues<["ProfileNone", "ProfileClangInstr", "ProfileIRInstr", "ProfileCSIRInstr", "ProfileIRSampleColdCov"]>,
     MarshallingInfoEnum<CodeGenOpts<"ProfileInstr">, "ProfileNone">;
 def fprofile_instrument_path_EQ : Joined<["-"], "fprofile-instrument-path=">,
diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp
index aaea5a00ab6a..8481deffe2a7 100644
--- a/clang/lib/Basic/ProfileList.cpp
+++ b/clang/lib/Basic/ProfileList.cpp
@@ -69,24 +69,24 @@ ProfileList::ProfileList(ArrayRef<std::string> Paths, SourceManager &SM)
 
 ProfileList::~ProfileList() = default;
 
-static StringRef getSectionName(CodeGenOptions::ProfileInstrKind Kind) {
+static StringRef getSectionName(llvm::driver::ProfileInstrKind Kind) {
   switch (Kind) {
-  case CodeGenOptions::ProfileNone:
+  case llvm::driver::ProfileInstrKind::ProfileNone:
     return "";
-  case CodeGenOptions::ProfileClangInstr:
+  case llvm::driver::ProfileInstrKind::ProfileClangInstr:
     return "clang";
-  case CodeGenOptions::ProfileIRInstr:
+  case llvm::driver::ProfileInstrKind::ProfileIRInstr:
     return "llvm";
-  case CodeGenOptions::ProfileCSIRInstr:
+  case llvm::driver::ProfileInstrKind::ProfileCSIRInstr:
     return "csllvm";
-  case CodeGenOptions::ProfileIRSampleColdCov:
+  case llvm::driver::ProfileInstrKind::ProfileIRSampleColdCov:
     return "sample-coldcov";
   }
-  llvm_unreachable("Unhandled CodeGenOptions::ProfileInstrKind enum");
+  llvm_unreachable("Unhandled llvm::driver::ProfileInstrKind enum");
 }
 
 ProfileList::ExclusionType
-ProfileList::getDefault(CodeGenOptions::ProfileInstrKind Kind) const {
+ProfileList::getDefault(llvm::driver::ProfileInstrKind Kind) const {
   StringRef Section = getSectionName(Kind);
   // Check for "default:<type>"
   if (SCL->inSection(Section, "default", "allow"))
@@ -117,7 +117,7 @@ ProfileList::inSection(StringRef Section, StringRef Prefix,
 
 std::optional<ProfileList::ExclusionType>
 ProfileList::isFunctionExcluded(StringRef FunctionName,
-                                CodeGenOptions::ProfileInstrKind Kind) const {
+                                llvm::driver::ProfileInstrKind Kind) const {
   StringRef Section = getSectionName(Kind);
   // Check for "function:<regex>=<case>"
   if (auto V = inSection(Section, "function", FunctionName))
@@ -131,13 +131,13 @@ ProfileList::isFunctionExcluded(StringRef FunctionName,
 
 std::optional<ProfileList::ExclusionType>
 ProfileList::isLocationExcluded(SourceLocation Loc,
-                                CodeGenOptions::ProfileInstrKind Kind) const {
+                                llvm::driver::ProfileInstrKind Kind) const {
   return isFileExcluded(SM.getFilename(SM.getFileLoc(Loc)), Kind);
 }
 
 std::optional<ProfileList::ExclusionType>
 ProfileList::isFileExcluded(StringRef FileName,
-                            CodeGenOptions::ProfileInstrKind Kind) const {
+                            llvm::driver::ProfileInstrKind Kind) const {
   StringRef Section = getSectionName(Kind);
   // Check for "source:<regex>=<case>"
   if (auto V = inSection(Section, "source", FileName))
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 88b3a4943e0d..7e0a3cf5591c 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -124,17 +124,10 @@ namespace clang {
 extern llvm::cl::opt<bool> ClSanitizeGuardChecks;
 }
 
-// Default filename used for profile generation.
-static std::string getDefaultProfileGenName() {
-  return DebugInfoCorrelate || ProfileCorrelate != InstrProfCorrelator::NONE
-             ? "default_%m.proflite"
-             : "default_%m.profraw";
-}
-
 // Path and name of file used for profile generation
 static std::string getProfileGenName(const CodeGenOptions &CodeGenOpts) {
   std::string FileName = CodeGenOpts.InstrProfileOutput.empty()
-                             ? getDefaultProfileGenName()
+                             ? llvm::driver::getDefaultProfileGenName()
                              : CodeGenOpts.InstrProfileOutput;
   if (CodeGenOpts.ContinuousProfileSync)
     FileName = "%c" + FileName;
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index 1f5eb427b566..5493cc92bd8b 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -273,8 +273,8 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) {
   std::unique_ptr<llvm::ToolOutputFile> OptRecordFile =
     std::move(*OptRecordFileOrErr);
 
-  if (OptRecordFile &&
-      CodeGenOpts.getProfileUse() != CodeGenOptions::ProfileNone)
+  if (OptRecordFile && CodeGenOpts.getProfileUse() !=
+                           llvm::driver::ProfileInstrKind::ProfileNone)
     Ctx.setDiagnosticsHotnessRequested(true);
 
   if (CodeGenOpts.MisExpect) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 56562002e719..13d0633e9b1c 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -943,7 +943,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
     }
   }
 
-  if (CGM.getCodeGenOpts().getProfileInstr() != CodeGenOptions::ProfileNone) {
+  if (CGM.getCodeGenOpts().getProfileInstr() !=
+      llvm::driver::ProfileInstrKind::ProfileNone) {
     switch (CGM.isFunctionBlockedFromProfileInstr(Fn, Loc)) {
     case ProfileList::Skip:
       Fn->addFnAttr(llvm::Attribute::SkipProfile);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 16e49aab4fe6..451792dca40c 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -3608,7 +3608,7 @@ CodeGenModule::isFunctionBlockedByProfileList(llvm::Function *Fn,
   // If the profile list is empty, then instrument everything.
   if (ProfileList.isEmpty())
     return ProfileList::Allow;
-  CodeGenOptions::ProfileInstrKind Kind = getCodeGenOpts().getProfileInstr();
+  llvm::driver::ProfileInstrKind Kind = getCodeGenOpts().getProfileInstr();
   // First, check the function name.
   if (auto V = ProfileList.isFunctionExcluded(Fn->getName(), Kind))
     return *V;
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index a20879dad94d..47d0e345086b 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -887,6 +887,10 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
   // TODO: Handle interactions between -w, -pedantic, -Wall, -WOption
   Args.AddLastArg(CmdArgs, options::OPT_w);
 
+  // recognise options: fprofile-generate -fprofile-use=
+  Args.addAllArgs(
+      CmdArgs, {options::OPT_fprofile_generate, options::OPT_fprofile_use_EQ});
+
   // Forward flags for OpenMP. We don't do this if the current action is an
   // device offloading action other than OpenMP.
   if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 2c02719121c7..dd021ad2e441 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1492,11 +1492,11 @@ static void setPGOUseInstrumentor(CodeGenOptions &Opts,
   // which is available (might be one or both).
   if (PGOReader->isIRLevelProfile() || PGOReader->hasMemoryProfile()) {
     if (PGOReader->hasCSIRLevelProfile())
-      Opts.setProfileUse(CodeGenOptions::ProfileCSIRInstr);
+      Opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileCSIRInstr);
     else
-      Opts.setProfileUse(CodeGenOptions::ProfileIRInstr);
+      Opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr);
   } else
-    Opts.setProfileUse(CodeGenOptions::ProfileClangInstr);
+    Opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileClangInstr);
 }
 
 void CompilerInvocation::setDefaultPointerAuthOptions(
diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def
index a69787283656..ae12aec51810 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.def
+++ b/flang/include/flang/Frontend/CodeGenOptions.def
@@ -24,8 +24,15 @@ CODEGENOPT(OptimizationLevel, 2, 0) ///< The -O[0-3] option specified.
 CODEGENOPT(DebugPassManager, 1, 0) ///< Prints debug information for the new
                                    ///< pass manager.
 
+
+/// Choose profile instrumenation kind or no instrumentation.
+ENUM_CODEGENOPT(ProfileInstr, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone)
+/// Choose profile kind for PGO use compilation.
+ENUM_CODEGENOPT(ProfileUse, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone)
+
 CODEGENOPT(InstrumentFunctions, 1, 0) ///< Set when -finstrument_functions is
                                       ///< enabled on the compile step.
+
 CODEGENOPT(IsPIE, 1, 0) ///< PIE level is the same as PIC Level.
 CODEGENOPT(PICLevel, 2, 0) ///< PIC level of the LLVM module.
 CODEGENOPT(PrepareForFullLTO , 1, 0) ///< Set when -flto is enabled on the
diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h
index e939f10f3c3e..bad17c8309eb 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.h
+++ b/flang/include/flang/Frontend/CodeGenOptions.h
@@ -154,6 +154,44 @@ public:
   /// OpenMP is enabled.
   using DoConcurrentMappingKind = flangomp::DoConcurrentMappingKind;
 
+  /// Name of the profile file to use as output for -fprofile-instr-generate,
+  /// -fprofile-generate, and -fcs-profile-generate.
+  std::string InstrProfileOutput;
+
+  /// Name of the profile file to use as input for -fmemory-profile-use.
+  std::string MemoryProfileUsePath;
+
+  /// Name of the profile file to use as input for -fprofile-instr-use
+  std::string ProfileInstrumentUsePath;
+
+  /// Name of the profile remapping file to apply to the profile data supplied
+  /// by -fprofile-sample-use or -fprofile-instr-use.
+  std::string ProfileRemappingFile;
+
+  /// Check if Clang profile instrumenation is on.
+  bool hasProfileClangInstr() const {
+    return getProfileInstr() == llvm::driver::ProfileClangInstr;
+  }
+
+  /// Check if IR level profile instrumentation is on.
+  bool hasProfileIRInstr() const {
+    return getProfileInstr() == llvm::driver::ProfileIRInstr;
+  }
+
+  /// Check if CS IR level profile instrumentation is on.
+  bool hasProfileCSIRInstr() const {
+    return getProfileInstr() == llvm::driver::ProfileCSIRInstr;
+  }
+  /// Check if IR level profile use is on.
+  bool hasProfileIRUse() const {
+    return getProfileUse() == llvm::driver::ProfileIRInstr ||
+           getProfileUse() == llvm::driver::ProfileCSIRInstr;
+  }
+  /// Check if CSIR profile use is on.
+  bool hasProfileCSIRUse() const {
+    return getProfileUse() == llvm::driver::ProfileCSIRInstr;
+  }
+
   // Define accessors/mutators for code generation options of enumeration type.
 #define CODEGENOPT(Name, Bits, Default)
 #define ENUM_CODEGENOPT(Name, Type, Bits, Default)                             \
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 15bcff254756..147849b0b7d2 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Frontend/Debug/Options.h"
+#include "llvm/Frontend/Driver/CodeGenOptions.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptTable.h"
@@ -441,6 +442,15 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
       opts.IsPIE = 1;
   }
 
+  if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) {
+    opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr);
+  }
+
+  if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) {
+    opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr);
+    opts.ProfileInstrumentUsePath = A->getValue();
+  }
+
   // -mcmodel option.
   if (const llvm::opt::Arg *a =
           args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) {
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 1c8a419188b8..d684eeb69675 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -56,10 +56,12 @@
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Passes/PassPlugin.h"
 #include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/ProfileData/InstrProfCorrelator.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/PGOOptions.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/ToolOutputFile.h"
@@ -67,6 +69,7 @@
 #include "llvm/TargetParser/RISCVISAInfo.h"
 #include "llvm/TargetParser/RISCVTargetParser.h"
 #include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <memory>
 #include <system_error>
@@ -919,6 +922,29 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) {
   llvm::PassInstrumentationCallbacks pic;
   llvm::PipelineTuningOptions pto;
   std::optional<llvm::PGOOptions> pgoOpt;
+
+  if (opts.hasProfileIRInstr()) {
+    // -fprofile-generate.
+    pgoOpt = llvm::PGOOptions(opts.InstrProfileOutput.empty()
+                                  ? llvm::driver::getDefaultProfileGenName()
+                                  : opts.InstrProfileOutput,
+                              "", "", opts.MemoryProfileUsePath, nullptr,
+                              llvm::PGOOptions::IRInstr,
+                              llvm::PGOOptions::NoCSAction,
+                              llvm::PGOOptions::ColdFuncOpt::Default, false,
+                              /*PseudoProbeForProfiling=*/false, false);
+  } else if (opts.hasProfileIRUse()) {
+    llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS =
+        llvm::vfs::getRealFileSystem();
+    // -fprofile-use.
+    auto CSAction = opts.hasProfileCSIRUse() ? llvm::PGOOptions::CSIRUse
+                                             : llvm::PGOOptions::NoCSAction;
+    pgoOpt = llvm::PGOOptions(
+        opts.ProfileInstrumentUsePath, "", opts.ProfileRemappingFile,
+        opts.MemoryProfileUsePath, VFS, llvm::PGOOptions::IRUse, CSAction,
+        llvm::PGOOptions::ColdFuncOpt::Default, false);
+  }
+
   llvm::StandardInstrumentations si(llvmModule->getContext(),
                                     opts.DebugPassManager);
   si.registerCallbacks(pic, &mam);
diff --git a/flang/test/Driver/flang-f-opts.f90 b/flang/test/Driver/flang-f-opts.f90
index 4493a519e201..b972b9b7b2a5 100644
--- a/flang/test/Driver/flang-f-opts.f90
+++ b/flang/test/Driver/flang-f-opts.f90
@@ -8,3 +8,8 @@
 ! CHECK-LABEL: "-fc1"
 ! CHECK: -ffp-contract=off
 ! CHECK: -O3
+
+! RUN: %flang -### -S -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-LLVM %s
+! CHECK-PROFILE-GENERATE-LLVM: "-fprofile-generate"
+! RUN: %flang -### -S -fprofile-use=%S %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-USE-DIR %s
+! CHECK-PROFILE-USE-DIR: "-fprofile-use={{.*}}"
diff --git a/flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext
new file mode 100644
index 000000000000..2650fb5ebfd3
--- /dev/null
+++ b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext
@@ -0,0 +1,18 @@
+# IR level Instrumentation Flag
+:ir
+_QQmain
+# Func Hash:
+146835646621254984
+# Num Counters:
+2
+# Counter Values:
+100
+1
+
+main
+# Func Hash:
+742261418966908927
+# Num Counters:
+1
+# Counter Values:
+1
\ No newline at end of file
diff --git a/flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext
new file mode 100644
index 000000000000..c4a2a26557e8
--- /dev/null
+++ b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext
@@ -0,0 +1,11 @@
+# IR level Instrumentation Flag
+:ir
+:entry_first
+_QQmain
+# Func Hash:
+146835646621254984
+# Num Counters:
+2
+# Counter Values:
+100
+1
\ No newline at end of file
diff --git a/flang/test/Profile/gcc-flag-compatibility.f90 b/flang/test/Profile/gcc-flag-compatibility.f90
new file mode 100644
index 000000000000..4490c45232d2
--- /dev/null
+++ b/flang/test/Profile/gcc-flag-compatibility.f90
@@ -0,0 +1,32 @@
+! Tests for -fprofile-generate and -fprofile-use flag compatibility. These two
+! flags behave similarly to their GCC counterparts:
+!
+! -fprofile-generate         Generates the profile file ./default.profraw
+! -fprofile-use=<dir>/file   Uses the profile file <dir>/file
+
+! On AIX, -flto used to be required with -fprofile-generate. gcc-flag-compatibility-aix.c is used to do the testing on AIX with -flto
+! RUN: %flang %s -c -S -o - -emit-llvm -fprofile-generate | FileCheck -check-prefix=PROFILE-GEN %s
+! PROFILE-GEN: @__profc_{{_?}}main = {{(private|internal)}} global [1 x i64] zeroinitializer, section
+! PROFILE-GEN: @__profd_{{_?}}main =
+
+! Check that -fprofile-use=some/path/file.prof reads some/path/file.prof
+! This uses LLVM IR format profile.
+! RUN: rm -rf %t.dir
+! RUN: mkdir -p %t.dir/some/path
+! RUN: llvm-profdata merge %S/Inputs/gcc-flag-compatibility_IR.proftext -o %t.dir/some/path/file.prof
+! RUN: %flang %s -o - -emit-llvm -S -fprofile-use=%t.dir/some/path/file.prof | FileCheck -check-prefix=PROFILE-USE-IR1 %s
+! RUN: llvm-profdata merge %S/Inputs/gcc-flag-compatibility_IR_entry.proftext -o %t.dir/some/path/file.prof
+! RUN: %flang %s -o - -emit-llvm -S -fprofile-use=%t.dir/some/path/file.prof | FileCheck -check-prefix=PROFILE-USE-IR2 %s
+! PROFILE-USE-IR1: = !{!"branch_weights", i32 100, i32 1}
+! PROFILE-USE-IR2: = !{!"branch_weights", i32 1, i32 100}
+
+program main
+  implicit none
+  integer :: i
+  integer :: X = 0
+
+  do i = 0, 99
+     X = X + i
+  end do
+
+end program main
diff --git a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
index e8e70c0e126a..f0168c040788 100644
--- a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
+++ b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
@@ -14,6 +14,7 @@
 #define LLVM_FRONTEND_DRIVER_CODEGENOPTIONS_H
 
 #include "llvm/Support/Compiler.h"
+#include <string>
 
 namespace llvm {
 class Triple;
@@ -51,6 +52,18 @@ enum class VectorLibrary {
 LLVM_ABI TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple,
                                            VectorLibrary Veclib);
 
+enum ProfileInstrKind {
+  ProfileNone,       // Profile instrumentation is turned off.
+  ProfileClangInstr, // Clang instrumentation to generate execution counts
+                     // to use with PGO.
+  ProfileIRInstr,    // IR level PGO instrumentation in LLVM.
+  ProfileCSIRInstr,  // IR level PGO context sensitive instrumentation in LLVM.
+  ProfileIRSampleColdCov, // IR level sample pgo based cold function coverage
+                          // instrumentation in LLVM.
+};
+
+// Default filename used for profile generation.
+std::string getDefaultProfileGenName();
 } // end namespace llvm::driver
 
 #endif
diff --git a/llvm/lib/Frontend/Driver/CMakeLists.txt b/llvm/lib/Frontend/Driver/CMakeLists.txt
index 23de4994a300..9feee6fe6929 100644
--- a/llvm/lib/Frontend/Driver/CMakeLists.txt
+++ b/llvm/lib/Frontend/Driver/CMakeLists.txt
@@ -12,4 +12,5 @@ add_llvm_component_library(LLVMFrontendDriver
   Core
   Support
   Analysis
+  Instrumentation
   )
diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
index 52080dea93c9..df884908845d 100644
--- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
+++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
@@ -8,8 +8,15 @@
 
 #include "llvm/Frontend/Driver/CodeGenOptions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/ProfileData/InstrProfCorrelator.h"
 #include "llvm/TargetParser/Triple.h"
 
+namespace llvm {
+extern llvm::cl::opt<bool> DebugInfoCorrelate;
+extern llvm::cl::opt<llvm::InstrProfCorrelator::ProfCorrelatorKind>
+    ProfileCorrelate;
+} // namespace llvm
+
 namespace llvm::driver {
 
 TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple,
@@ -56,4 +63,10 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple,
   return TLII;
 }
 
+std::string getDefaultProfileGenName() {
+  return llvm::DebugInfoCorrelate ||
+                 llvm::ProfileCorrelate != InstrProfCorrelator::NONE
+             ? "default_%m.proflite"
+             : "default_%m.profraw";
+}
 } // namespace llvm::driver

From f6bf3bd5e001918780e7b1e8fceeb02604d65783 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 11:08:15 -0700
Subject: [PATCH 0328/1322] [bazel] Fix XeGpu deps for 5578bcbcfd25c (#144133)

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 7bcb1d4ca883..b62d5595fe94 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3506,6 +3506,7 @@ cc_library(
         ":LoopLikeInterface",
         ":MemRefDialect",
         ":Pass",
+        ":SCFTransforms",
         ":TransformUtils",
         ":VectorDialect",
         ":VectorTransforms",

From 59388fb0b92d7efd5737efd6c7b6d5c82f1bc6a8 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Fri, 13 Jun 2025 11:16:44 -0700
Subject: [PATCH 0329/1322] [InstCombine] Preserve NSW/NUW flags when folding
 const BOp with min/max (#143471)

When folding `X Pred C2 ? X BOp C1 : C2 BOp C1` to `min/max(X, C2) BOp
C1`, if NUW/NSW flags are present on `X BOp C1` and could be safely
applied to `C2 BOp C1`, then they may be added on the BOp after the fold
is complete. https://alive2.llvm.org/ce/z/n_3aNJ

Preserving these flags can allow subsequent transforms to re-order the
min/max and BOp, which in the case of NVPTX would allow for some
potential future transformations which would improve
instruction-selection.
---
 .../InstCombine/InstCombineInternal.h         |  2 +
 .../InstCombine/InstCombineSelect.cpp         | 36 ++++++--
 .../InstCombine/canonicalize-const-to-bop.ll  | 83 ++++++++++++++++++-
 3 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index ce0e843437b5..8c9de862fe8f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -771,6 +771,8 @@ public:
                             Value *A, Value *B, Instruction &Outer,
                             SelectPatternFlavor SPF2, Value *C);
   Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+  Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal,
+                                      Value *FalseVal);
   Instruction *foldSelectValueEquivalence(SelectInst &SI, CmpInst &CI);
   bool replaceInInstruction(Value *V, Value *Old, Value *New,
                             unsigned Depth = 0);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 320b827bdbe8..73ba0f78e805 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1879,9 +1879,9 @@ static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI,
 
 /// Fold `X Pred C1 ? X BOp C2 : C1 BOp C2` to `min/max(X, C1) BOp C2`.
 /// This allows for better canonicalization.
-static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal,
-                                           Value *FalseVal,
-                                           IRBuilderBase &Builder) {
+Value *InstCombinerImpl::foldSelectWithConstOpToBinOp(ICmpInst *Cmp,
+                                                      Value *TrueVal,
+                                                      Value *FalseVal) {
   Constant *C1, *C2, *C3;
   Value *X;
   CmpPredicate Predicate;
@@ -1945,11 +1945,29 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal,
     return nullptr;
   }
 
-  Intrinsic::ID IntrinsicID = getMinMaxIntrinsic(SPF);
-  Value *Intrinsic = Builder.CreateBinaryIntrinsic(IntrinsicID, X, RHS);
-  return IsIntrinsic ? Builder.CreateBinaryIntrinsic(Opcode, Intrinsic, C2)
-                     : Builder.CreateBinOp(Instruction::BinaryOps(Opcode),
-                                           Intrinsic, C2);
+  Intrinsic::ID MinMaxID = getMinMaxIntrinsic(SPF);
+  Value *MinMax = Builder.CreateBinaryIntrinsic(MinMaxID, X, RHS);
+  if (IsIntrinsic)
+    return Builder.CreateBinaryIntrinsic(Opcode, MinMax, C2);
+
+  const auto BinOpc = Instruction::BinaryOps(Opcode);
+  Value *BinOp = Builder.CreateBinOp(BinOpc, MinMax, C2);
+
+  // If we can attach no-wrap flags to the new instruction, do so if the
+  // old instruction had them and C1 BinOp C2 does not overflow.
+  if (Instruction *BinOpInst = dyn_cast<Instruction>(BinOp)) {
+    if (BinOpc == Instruction::Add || BinOpc == Instruction::Sub ||
+        BinOpc == Instruction::Mul) {
+      Instruction *OldBinOp = cast<BinaryOperator>(TrueVal);
+      if (OldBinOp->hasNoSignedWrap() &&
+          willNotOverflow(BinOpc, RHS, C2, *BinOpInst, /*IsSigned=*/true))
+        BinOpInst->setHasNoSignedWrap();
+      if (OldBinOp->hasNoUnsignedWrap() &&
+          willNotOverflow(BinOpc, RHS, C2, *BinOpInst, /*IsSigned=*/false))
+        BinOpInst->setHasNoUnsignedWrap();
+    }
+  }
+  return BinOp;
 }
 
 /// Visit a SelectInst that has an ICmpInst as its first operand.
@@ -2027,7 +2045,7 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
   if (Value *V = foldAbsDiff(ICI, TrueVal, FalseVal, Builder))
     return replaceInstUsesWith(SI, V);
 
-  if (Value *V = foldSelectWithConstOpToBinOp(ICI, TrueVal, FalseVal, Builder))
+  if (Value *V = foldSelectWithConstOpToBinOp(ICI, TrueVal, FalseVal))
     return replaceInstUsesWith(SI, V);
 
   return Changed ? &SI : nullptr;
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll b/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll
index c08ec1bb7de0..b3093a92624a 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll
@@ -5,7 +5,7 @@ define i8 @add_and_sgt(i8 %x) {
 ; CHECK-LABEL: define i8 @add_and_sgt(
 ; CHECK-SAME: i8 [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 8)
-; CHECK-NEXT:    [[S:%.*]] = add nuw i8 [[TMP1]], 16
+; CHECK-NEXT:    [[S:%.*]] = add nuw nsw i8 [[TMP1]], 16
 ; CHECK-NEXT:    ret i8 [[S]]
 ;
   %add = add nsw i8 %x, 16
@@ -155,7 +155,7 @@ define i8 @multi_use_cond_and_sel(i8 %x) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[X]], 8
 ; CHECK-NEXT:    call void @use(i1 [[CMP]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 8)
-; CHECK-NEXT:    [[S:%.*]] = add nuw i8 [[TMP1]], 16
+; CHECK-NEXT:    [[S:%.*]] = add nuw nsw i8 [[TMP1]], 16
 ; CHECK-NEXT:    call void @use_byte(i8 [[S]])
 ; CHECK-NEXT:    ret i8 [[S]]
 ;
@@ -450,3 +450,82 @@ define i8 @umax_sgt(i8 %x) {
   %s = select i1 %cmp, i8 100, i8 %umax
   ret i8 %s
 }
+
+define i8 @add_sgt_nuw_nsw_safe(i8 %x) {
+; CHECK-LABEL: define i8 @add_sgt_nuw_nsw_safe(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 100)
+; CHECK-NEXT:    [[S:%.*]] = add nuw nsw i8 [[TMP1]], 1
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %add = add nuw nsw i8 %x, 1
+  %cmp = icmp sgt i8 %x, 100
+  %s = select i1 %cmp, i8 101, i8 %add
+  ret i8 %s
+}
+
+define i8 @add_sgt_nuw_only(i8 %x) {
+; CHECK-LABEL: define i8 @add_sgt_nuw_only(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 100)
+; CHECK-NEXT:    [[S:%.*]] = add nuw i8 [[TMP1]], 50
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %add = add nuw nsw i8 %x, 50
+  %cmp = icmp sgt i8 %x, 100
+  %s = select i1 %cmp, i8 150, i8 %add
+  ret i8 %s
+}
+
+define i8 @add_sgt_nsw_only(i8 %x) {
+; CHECK-LABEL: define i8 @add_sgt_nsw_only(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 100)
+; CHECK-NEXT:    [[S:%.*]] = add nsw i8 [[TMP1]], -99
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %add = add nuw nsw i8 %x, -99
+  %cmp = icmp sgt i8 %x, 100
+  %s = select i1 %cmp, i8 1, i8 %add
+  ret i8 %s
+}
+
+
+define i8 @mul_ult_nuw_nsw_safe(i8 %x) {
+; CHECK-LABEL: define i8 @mul_ult_nuw_nsw_safe(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 10)
+; CHECK-NEXT:    [[S:%.*]] = mul nuw nsw i8 [[TMP1]], 3
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %mul = mul nuw nsw i8 %x, 3
+  %cmp = icmp ult i8 %x, 10
+  %s = select i1 %cmp, i8 30, i8 %mul
+  ret i8 %s
+}
+
+define i8 @mul_ult_nuw_only(i8 %x) {
+; CHECK-LABEL: define i8 @mul_ult_nuw_only(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 10)
+; CHECK-NEXT:    [[S:%.*]] = mul nuw i8 [[TMP1]], 25
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %mul = mul nuw nsw i8 %x, 25
+  %cmp = icmp ult i8 %x, 10
+  %s = select i1 %cmp, i8 250, i8 %mul
+  ret i8 %s
+}
+
+define i8 @mul_ult_nsw_only(i8 %x) {
+; CHECK-LABEL: define i8 @mul_ult_nsw_only(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 40)
+; CHECK-NEXT:    [[S:%.*]] = mul nsw i8 [[TMP1]], -2
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %mul = mul nuw nsw i8 %x, -2
+  %cmp = icmp ult i8 %x, 40
+  %s = select i1 %cmp, i8 -80, i8 %mul
+  ret i8 %s
+}

From f68848015f62156b8c3539b44f16d9c8b0a93a89 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 13 Jun 2025 19:17:01 +0100
Subject: [PATCH 0330/1322] [VPlan] Manage Sentinel value for FindLastIV in
 VPlan. (#142291)

Similar to modeling the start value as operand, also model the sentinel
value as operand explicitly. This makes all require information for
code-gen available directly in VPlan.

PR: https://github.com/llvm/llvm-project/pull/142291
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 20 +++++++++++--------
 .../Transforms/Vectorize/VPlanPatternMatch.h  | 19 ++++++++++++++++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  8 ++++----
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp |  2 +-
 .../vplan-printing-reductions.ll              |  2 +-
 5 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fa313243a57d..69b60c7b9320 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7266,9 +7266,11 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
     using namespace llvm::PatternMatch;
     Value *Cmp, *OrigResumeV, *CmpOp;
     bool IsExpectedPattern =
-        match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
-                                        m_Specific(RdxDesc.getSentinelValue()),
-                                        m_Value(OrigResumeV))) &&
+        match(MainResumeValue,
+              m_Select(
+                  m_OneUse(m_Value(Cmp)),
+                  m_Specific(EpiRedResult->getOperand(2)->getLiveInIRValue()),
+                  m_Value(OrigResumeV))) &&
         (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
                                    m_Value(CmpOp))) &&
          ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
@@ -9235,9 +9237,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
             RdxDesc.getRecurrenceKind())) {
       VPValue *Start = PhiR->getStartValue();
-      FinalReductionResult =
-          Builder.createNaryOp(VPInstruction::ComputeFindLastIVResult,
-                               {PhiR, Start, NewExitingVPV}, ExitDL);
+      FinalReductionResult = Builder.createNaryOp(
+          VPInstruction::ComputeFindLastIVResult,
+          {PhiR, Start, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()),
+           NewExitingVPV},
+          ExitDL);
     } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                    RdxDesc.getRecurrenceKind())) {
       VPValue *Start = PhiR->getStartValue();
@@ -9825,8 +9829,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
         BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
         IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
         Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
-        ResumeV =
-            Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
+        ResumeV = Builder.CreateSelect(
+            Cmp, RdxResult->getOperand(2)->getLiveInIRValue(), ResumeV);
       } else {
         VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
         auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index dfd9fc3d4d71..b2535fe3aa57 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -318,6 +318,25 @@ m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
       {Op0, Op1, Op2});
 }
 
+template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t,
+          unsigned Opcode, bool Commutative, typename... RecipeTys>
+using Recipe4Op_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t, Op3_t>,
+                                     Opcode, Commutative, RecipeTys...>;
+
+template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t,
+          unsigned Opcode>
+using VPInstruction4Op_match =
+    Recipe4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode, /*Commutative*/ false,
+                    VPInstruction>;
+
+template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t,
+          typename Op3_t>
+inline VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>
+m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2,
+                const Op3_t &Op3) {
+  return VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>(
+      {Op0, Op1, Op2, Op3});
+}
 template <typename Op0_t>
 inline UnaryVPInstruction_match<Op0_t, Instruction::Freeze>
 m_Freeze(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ccce0e07e4d0..d59cec892d40 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -645,16 +645,16 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
     // The recipe's operands are the reduction phi, followed by one operand for
     // each part of the reduction.
-    unsigned UF = getNumOperands() - 2;
-    Value *ReducedPartRdx = State.get(getOperand(2));
+    unsigned UF = getNumOperands() - 3;
+    Value *ReducedPartRdx = State.get(getOperand(3));
     for (unsigned Part = 1; Part < UF; ++Part) {
       ReducedPartRdx = createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx,
-                                      State.get(getOperand(2 + Part)));
+                                      State.get(getOperand(3 + Part)));
     }
 
     return createFindLastIVReduction(Builder, ReducedPartRdx,
                                      State.get(getOperand(1), true),
-                                     RdxDesc.getSentinelValue());
+                                     getOperand(2)->getLiveInIRValue());
   }
   case VPInstruction::ComputeReductionResult: {
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index e4c068ef175b..dfb5bfabd22b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -347,7 +347,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
         match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                       m_VPValue(), m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeFindLastIVResult>(
-                      m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {
+                      m_VPValue(), m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {
       addUniformForAllParts(cast<VPInstruction>(&R));
       for (unsigned Part = 1; Part != UF; ++Part)
         R.addOperand(getValueForPart(Op1, Part));
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 95fbc4260587..978f1b80d26d 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -240,7 +240,7 @@ define i64 @find_last_iv(ptr %a, i64 %n, i64 %start) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[RDX_RES:%.+]]> = compute-find-last-iv-result ir<%rdx>, ir<%start>, ir<%cond>
+; CHECK-NEXT:   EMIT vp<[[RDX_RES:%.+]]> = compute-find-last-iv-result ir<%rdx>, ir<%start>, ir<-9223372036854775808>, ir<%cond>
 ; CHECK-NEXT:   EMIT vp<[[EXT:%.+]]> = extract-last-element vp<[[RDX_RES]]>
 ; CHECK-NEXT:   EMIT vp<%cmp.n> = icmp eq ir<%n>, vp<{{.+}}>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<%cmp.n>

From 24bbc820701b49ab8bc7b9670034e39e11da8a16 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 11:20:32 -0700
Subject: [PATCH 0331/1322] [CIR] Support for static variables (#143980)

This adds support for emitting static variables and their initializers.
---
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      |  14 +
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |  18 ++
 clang/lib/CIR/CodeGen/CIRGenDecl.cpp          | 248 +++++++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |   6 +
 clang/lib/CIR/CodeGen/CIRGenModule.h          |  13 +
 clang/test/CIR/CodeGen/static-vars.c          |  37 +++
 clang/test/CIR/CodeGen/static-vars.cpp        |  49 ++++
 7 files changed, 383 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/static-vars.c
 create mode 100644 clang/test/CIR/CodeGen/static-vars.cpp

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index a3754f4de66b..502d58d7db8b 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -185,11 +185,25 @@ public:
                                     global.getSymName());
   }
 
+  mlir::Value createGetGlobal(cir::GlobalOp global) {
+    return createGetGlobal(global.getLoc(), global);
+  }
+
   cir::StoreOp createStore(mlir::Location loc, mlir::Value val, mlir::Value dst,
                            mlir::IntegerAttr align = {}) {
     return create<cir::StoreOp>(loc, val, dst, align);
   }
 
+  [[nodiscard]] cir::GlobalOp createGlobal(mlir::ModuleOp mlirModule,
+                                           mlir::Location loc,
+                                           mlir::StringRef name,
+                                           mlir::Type type,
+                                           cir::GlobalLinkageKind linkage) {
+    mlir::OpBuilder::InsertionGuard guard(*this);
+    setInsertionPointToStart(mlirModule.getBody());
+    return create<cir::GlobalOp>(loc, name, type, linkage);
+  }
+
   cir::GetMemberOp createGetMember(mlir::Location loc, mlir::Type resultTy,
                                    mlir::Value base, llvm::StringRef name,
                                    unsigned index) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index a4bc69619d60..adf7cb77f1a5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -24,6 +24,7 @@ namespace clang::CIRGen {
 class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
   const CIRGenTypeCache &typeCache;
   llvm::StringMap<unsigned> recordNames;
+  llvm::StringMap<unsigned> globalsVersioning;
 
 public:
   CIRGenBuilderTy(mlir::MLIRContext &mlirContext, const CIRGenTypeCache &tc)
@@ -371,6 +372,23 @@ public:
   /// pointed to by \p arrayPtr.
   mlir::Value maybeBuildArrayDecay(mlir::Location loc, mlir::Value arrayPtr,
                                    mlir::Type eltTy);
+
+  /// Creates a versioned global variable. If the symbol is already taken, an ID
+  /// will be appended to the symbol. The returned global must always be queried
+  /// for its name so it can be referenced correctly.
+  [[nodiscard]] cir::GlobalOp
+  createVersionedGlobal(mlir::ModuleOp module, mlir::Location loc,
+                        mlir::StringRef name, mlir::Type type,
+                        cir::GlobalLinkageKind linkage) {
+    // Create a unique name if the given name is already taken.
+    std::string uniqueName;
+    if (unsigned version = globalsVersioning[name.str()]++)
+      uniqueName = name.str() + "." + std::to_string(version);
+    else
+      uniqueName = name.str();
+
+    return createGlobal(module, loc, uniqueName, type, linkage);
+  }
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 748c2b5f6fce..1941b5066edb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -208,8 +208,25 @@ void CIRGenFunction::emitVarDecl(const VarDecl &d) {
   if (d.hasExternalStorage())
     return;
 
-  if (d.getStorageDuration() != SD_Automatic)
-    cgm.errorNYI(d.getSourceRange(), "emitVarDecl automatic storage duration");
+  if (d.getStorageDuration() != SD_Automatic) {
+    // Static sampler variables translated to function calls.
+    if (d.getType()->isSamplerT()) {
+      // Nothing needs to be done here, but let's flag it as an error until we
+      // have a test. It requires OpenCL support.
+      cgm.errorNYI(d.getSourceRange(), "emitVarDecl static sampler type");
+      return;
+    }
+
+    cir::GlobalLinkageKind linkage =
+        cgm.getCIRLinkageVarDefinition(&d, /*IsConstant=*/false);
+
+    // FIXME: We need to force the emission/use of a guard variable for
+    // some variables even if we can constant-evaluate them because
+    // we can't guarantee every translation unit will constant-evaluate them.
+
+    return emitStaticVarDecl(d, linkage);
+  }
+
   if (d.getType().getAddressSpace() == LangAS::opencl_local)
     cgm.errorNYI(d.getSourceRange(), "emitVarDecl openCL address space");
 
@@ -219,6 +236,233 @@ void CIRGenFunction::emitVarDecl(const VarDecl &d) {
   return emitAutoVarDecl(d);
 }
 
+static std::string getStaticDeclName(CIRGenModule &cgm, const VarDecl &d) {
+  if (cgm.getLangOpts().CPlusPlus)
+    return cgm.getMangledName(&d).str();
+
+  // If this isn't C++, we don't need a mangled name, just a pretty one.
+  assert(!d.isExternallyVisible() && "name shouldn't matter");
+  std::string contextName;
+  const DeclContext *dc = d.getDeclContext();
+  if (auto *cd = dyn_cast<CapturedDecl>(dc))
+    dc = cast<DeclContext>(cd->getNonClosureContext());
+  if (const auto *fd = dyn_cast<FunctionDecl>(dc))
+    contextName = std::string(cgm.getMangledName(fd));
+  else if (isa<BlockDecl>(dc))
+    cgm.errorNYI(d.getSourceRange(), "block decl context for static var");
+  else if (isa<ObjCMethodDecl>(dc))
+    cgm.errorNYI(d.getSourceRange(), "ObjC decl context for static var");
+  else
+    cgm.errorNYI(d.getSourceRange(), "Unknown context for static var decl");
+
+  contextName += "." + d.getNameAsString();
+  return contextName;
+}
+
+// TODO(cir): LLVM uses a Constant base class. Maybe CIR could leverage an
+// interface for all constants?
+cir::GlobalOp
+CIRGenModule::getOrCreateStaticVarDecl(const VarDecl &d,
+                                       cir::GlobalLinkageKind linkage) {
+  // In general, we don't always emit static var decls once before we reference
+  // them. It is possible to reference them before emitting the function that
+  // contains them, and it is possible to emit the containing function multiple
+  // times.
+  if (cir::GlobalOp existingGV = getStaticLocalDeclAddress(&d))
+    return existingGV;
+
+  QualType ty = d.getType();
+  assert(ty->isConstantSizeType() && "VLAs can't be static");
+
+  // Use the label if the variable is renamed with the asm-label extension.
+  if (d.hasAttr<AsmLabelAttr>())
+    errorNYI(d.getSourceRange(), "getOrCreateStaticVarDecl: asm label");
+
+  std::string name = getStaticDeclName(*this, d);
+
+  mlir::Type lty = getTypes().convertTypeForMem(ty);
+  assert(!cir::MissingFeatures::addressSpace());
+
+  if (d.hasAttr<LoaderUninitializedAttr>() || d.hasAttr<CUDASharedAttr>())
+    errorNYI(d.getSourceRange(),
+             "getOrCreateStaticVarDecl: LoaderUninitializedAttr");
+  assert(!cir::MissingFeatures::addressSpace());
+
+  mlir::Attribute init = builder.getZeroInitAttr(convertType(ty));
+
+  cir::GlobalOp gv = builder.createVersionedGlobal(
+      getModule(), getLoc(d.getLocation()), name, lty, linkage);
+  // TODO(cir): infer visibility from linkage in global op builder.
+  gv.setVisibility(getMLIRVisibilityFromCIRLinkage(linkage));
+  gv.setInitialValueAttr(init);
+  gv.setAlignment(getASTContext().getDeclAlign(&d).getAsAlign().value());
+
+  if (supportsCOMDAT() && gv.isWeakForLinker())
+    gv.setComdat(true);
+
+  assert(!cir::MissingFeatures::opGlobalThreadLocal());
+
+  setGVProperties(gv, &d);
+
+  // OG checks if the expected address space, denoted by the type, is the
+  // same as the actual address space indicated by attributes. If they aren't
+  // the same, an addrspacecast is emitted when this variable is accessed.
+  // In CIR however, cir.get_global already carries that information in
+  // !cir.ptr type - if this global is in OpenCL local address space, then its
+  // type would be !cir.ptr<..., addrspace(offload_local)>. Therefore we don't
+  // need an explicit address space cast in CIR: they will get emitted when
+  // lowering to LLVM IR.
+
+  // Ensure that the static local gets initialized by making sure the parent
+  // function gets emitted eventually.
+  const Decl *dc = cast<Decl>(d.getDeclContext());
+
+  // We can't name blocks or captured statements directly, so try to emit their
+  // parents.
+  if (isa<BlockDecl>(dc) || isa<CapturedDecl>(dc)) {
+    dc = dc->getNonClosureContext();
+    // FIXME: Ensure that global blocks get emitted.
+    if (!dc)
+      errorNYI(d.getSourceRange(), "non-closure context");
+  }
+
+  GlobalDecl gd;
+  if (isa<CXXConstructorDecl>(dc))
+    errorNYI(d.getSourceRange(), "C++ constructors static var context");
+  else if (isa<CXXDestructorDecl>(dc))
+    errorNYI(d.getSourceRange(), "C++ destructors static var context");
+  else if (const auto *fd = dyn_cast<FunctionDecl>(dc))
+    gd = GlobalDecl(fd);
+  else {
+    // Don't do anything for Obj-C method decls or global closures. We should
+    // never defer them.
+    assert(isa<ObjCMethodDecl>(dc) && "unexpected parent code decl");
+  }
+  if (gd.getDecl() && cir::MissingFeatures::openMP()) {
+    // Disable emission of the parent function for the OpenMP device codegen.
+    errorNYI(d.getSourceRange(), "OpenMP");
+  }
+
+  return gv;
+}
+
+/// Add the initializer for 'd' to the global variable that has already been
+/// created for it. If the initializer has a different type than gv does, this
+/// may free gv and return a different one. Otherwise it just returns gv.
+cir::GlobalOp CIRGenFunction::addInitializerToStaticVarDecl(
+    const VarDecl &d, cir::GlobalOp gv, cir::GetGlobalOp gvAddr) {
+  ConstantEmitter emitter(*this);
+  mlir::TypedAttr init =
+      mlir::cast<mlir::TypedAttr>(emitter.tryEmitForInitializer(d));
+
+  // If constant emission failed, then this should be a C++ static
+  // initializer.
+  if (!init) {
+    cgm.errorNYI(d.getSourceRange(), "static var without initializer");
+    return gv;
+  }
+
+  // TODO(cir): There should be debug code here to assert that the decl size
+  // matches the CIR data layout type alloc size, but the code for calculating
+  // the type alloc size is not implemented yet.
+  assert(!cir::MissingFeatures::dataLayoutTypeAllocSize());
+
+  // The initializer may differ in type from the global. Rewrite
+  // the global to match the initializer.  (We have to do this
+  // because some types, like unions, can't be completely represented
+  // in the LLVM type system.)
+  if (gv.getSymType() != init.getType()) {
+    gv.setSymType(init.getType());
+
+    // Normally this should be done with a call to cgm.replaceGlobal(oldGV, gv),
+    // but since at this point the current block hasn't been really attached,
+    // there's no visibility into the GetGlobalOp corresponding to this Global.
+    // Given those constraints, thread in the GetGlobalOp and update it
+    // directly.
+    assert(!cir::MissingFeatures::addressSpace());
+    gvAddr.getAddr().setType(builder.getPointerTo(init.getType()));
+  }
+
+  bool needsDtor =
+      d.needsDestruction(getContext()) == QualType::DK_cxx_destructor;
+
+  assert(!cir::MissingFeatures::opGlobalConstant());
+  gv.setInitialValueAttr(init);
+
+  emitter.finalize(gv);
+
+  if (needsDtor) {
+    // We have a constant initializer, but a nontrivial destructor. We still
+    // need to perform a guarded "initialization" in order to register the
+    // destructor.
+    cgm.errorNYI(d.getSourceRange(), "C++ guarded init");
+  }
+
+  return gv;
+}
+
+void CIRGenFunction::emitStaticVarDecl(const VarDecl &d,
+                                       cir::GlobalLinkageKind linkage) {
+  // Check to see if we already have a global variable for this
+  // declaration.  This can happen when double-emitting function
+  // bodies, e.g. with complete and base constructors.
+  cir::GlobalOp globalOp = cgm.getOrCreateStaticVarDecl(d, linkage);
+  // TODO(cir): we should have a way to represent global ops as values without
+  // having to emit a get global op. Sometimes these emissions are not used.
+  mlir::Value addr = builder.createGetGlobal(globalOp);
+  auto getAddrOp = mlir::cast<cir::GetGlobalOp>(addr.getDefiningOp());
+
+  CharUnits alignment = getContext().getDeclAlign(&d);
+
+  // Store into LocalDeclMap before generating initializer to handle
+  // circular references.
+  mlir::Type elemTy = convertTypeForMem(d.getType());
+  setAddrOfLocalVar(&d, Address(addr, elemTy, alignment));
+
+  // We can't have a VLA here, but we can have a pointer to a VLA,
+  // even though that doesn't really make any sense.
+  // Make sure to evaluate VLA bounds now so that we have them for later.
+  if (d.getType()->isVariablyModifiedType()) {
+    cgm.errorNYI(d.getSourceRange(),
+                 "emitStaticVarDecl: variably modified type");
+  }
+
+  // Save the type in case adding the initializer forces a type change.
+  mlir::Type expectedType = addr.getType();
+
+  cir::GlobalOp var = globalOp;
+
+  assert(!cir::MissingFeatures::cudaSupport());
+
+  // If this value has an initializer, emit it.
+  if (d.getInit())
+    var = addInitializerToStaticVarDecl(d, var, getAddrOp);
+
+  var.setAlignment(alignment.getAsAlign().value());
+
+  // There are a lot of attributes that need to be handled here. Until
+  // we start to support them, we just report an error if there are any.
+  if (d.hasAttrs())
+    cgm.errorNYI(d.getSourceRange(), "static var with attrs");
+
+  if (cgm.getCodeGenOpts().KeepPersistentStorageVariables)
+    cgm.errorNYI(d.getSourceRange(), "static var keep persistent storage");
+
+  // From traditional codegen:
+  // We may have to cast the constant because of the initializer
+  // mismatch above.
+  //
+  // FIXME: It is really dangerous to store this in the map; if anyone
+  // RAUW's the GV uses of this constant will be invalid.
+  mlir::Value castedAddr =
+      builder.createBitcast(getAddrOp.getAddr(), expectedType);
+  localDeclMap.find(&d)->second = Address(castedAddr, elemTy, alignment);
+  cgm.setStaticLocalDeclAddress(&d, var);
+
+  assert(!cir::MissingFeatures::sanitizers());
+  assert(!cir::MissingFeatures::generateDebugInfo());
+}
+
 void CIRGenFunction::emitScalarInit(const Expr *init, mlir::Location loc,
                                     LValue lvalue, bool capturedByInit) {
   assert(!cir::MissingFeatures::objCLifetime());
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 9421ea26a429..318d3fbf3f9e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -469,6 +469,10 @@ public:
   /// compare the result against zero, returning an Int1Ty value.
   mlir::Value evaluateExprAsBool(const clang::Expr *e);
 
+  cir::GlobalOp addInitializerToStaticVarDecl(const VarDecl &d,
+                                              cir::GlobalOp gv,
+                                              cir::GetGlobalOp gvAddr);
+
   /// Set the address of a local variable.
   void setAddrOfLocalVar(const clang::VarDecl *vd, Address addr) {
     assert(!localDeclMap.count(vd) && "Decl already exists in LocalDeclMap!");
@@ -955,6 +959,8 @@ public:
   void emitScalarInit(const clang::Expr *init, mlir::Location loc,
                       LValue lvalue, bool capturedByInit = false);
 
+  void emitStaticVarDecl(const VarDecl &d, cir::GlobalLinkageKind linkage);
+
   void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile,
                          clang::QualType ty, bool isInit = false,
                          bool isNontemporal = false);
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index f76fd8e73364..03606dba200f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -113,8 +113,21 @@ public:
 
   mlir::Operation *lastGlobalOp = nullptr;
 
+  llvm::DenseMap<const Decl *, cir::GlobalOp> staticLocalDeclMap;
+
   mlir::Operation *getGlobalValue(llvm::StringRef ref);
 
+  cir::GlobalOp getStaticLocalDeclAddress(const VarDecl *d) {
+    return staticLocalDeclMap[d];
+  }
+
+  void setStaticLocalDeclAddress(const VarDecl *d, cir::GlobalOp c) {
+    staticLocalDeclMap[d] = c;
+  }
+
+  cir::GlobalOp getOrCreateStaticVarDecl(const VarDecl &d,
+                                         cir::GlobalLinkageKind linkage);
+
   /// If the specified mangled name is not in the module, create and return an
   /// mlir::GlobalOp value
   cir::GlobalOp getOrCreateCIRGlobal(llvm::StringRef mangledName, mlir::Type ty,
diff --git a/clang/test/CIR/CodeGen/static-vars.c b/clang/test/CIR/CodeGen/static-vars.c
new file mode 100644
index 000000000000..f45a41d9a00f
--- /dev/null
+++ b/clang/test/CIR/CodeGen/static-vars.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+void func1(void) {
+  // Should lower default-initialized static vars.
+  static int i;
+  // CHECK-DAG: cir.global "private" internal dsolocal @func1.i = #cir.int<0> : !s32i
+
+  // Should lower constant-initialized static vars.
+  static int j = 1;
+  // CHECK-DAG: cir.global "private" internal dsolocal @func1.j = #cir.int<1> : !s32i
+
+  // Should properly shadow static vars in nested scopes.
+  {
+    static int j = 2;
+    // CHECK-DAG: cir.global "private" internal dsolocal @func1.j.1 = #cir.int<2> : !s32i
+  }
+  {
+    static int j = 3;
+    // CHECK-DAG: cir.global "private" internal dsolocal @func1.j.2 = #cir.int<3> : !s32i
+  }
+
+  // Should lower basic static vars arithmetics.
+  j++;
+  // CHECK-DAG: %[[#V2:]] = cir.get_global @func1.j : !cir.ptr<!s32i>
+  // CHECK-DAG: %[[#V3:]] = cir.load{{.*}} %[[#V2]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-DAG: %[[#V4:]] = cir.unary(inc, %[[#V3]]) nsw : !s32i, !s32i
+  // CHECK-DAG: cir.store{{.*}} %[[#V4]], %[[#V2]] : !s32i, !cir.ptr<!s32i>
+}
+
+// Should shadow static vars on different functions.
+void func2(void) {
+  static char i;
+  // CHECK-DAG: cir.global "private" internal dsolocal @func2.i = #cir.int<0> : !s8i
+  static float j;
+  // CHECK-DAG: cir.global "private" internal dsolocal @func2.j = #cir.fp<0.000000e+00> : !cir.float
+}
diff --git a/clang/test/CIR/CodeGen/static-vars.cpp b/clang/test/CIR/CodeGen/static-vars.cpp
new file mode 100644
index 000000000000..9b892c69a6fe
--- /dev/null
+++ b/clang/test/CIR/CodeGen/static-vars.cpp
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t1.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t1.ll %s
+
+void func1(void) {
+  // Should lower default-initialized static vars.
+  static int i;
+  // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1i = #cir.int<0> : !s32i
+
+  // Should lower constant-initialized static vars.
+  static int j = 1;
+  // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1j = #cir.int<1> : !s32i
+
+  // Should properly shadow static vars in nested scopes.
+  {
+    static int j = 2;
+    // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1j_0 = #cir.int<2> : !s32i
+  }
+  {
+    static int j = 3;
+    // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1j_1 = #cir.int<3> : !s32i
+  }
+
+  // Should lower basic static vars arithmetics.
+  j++;
+  // CHECK-DAG: %[[#V2:]] = cir.get_global @_ZZ5func1vE1j : !cir.ptr<!s32i>
+  // CHECK-DAG: %[[#V3:]] = cir.load{{.*}} %[[#V2]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-DAG: %[[#V4:]] = cir.unary(inc, %[[#V3]]) nsw : !s32i, !s32i
+  // CHECK-DAG: cir.store{{.*}} %[[#V4]], %[[#V2]] : !s32i, !cir.ptr<!s32i>
+}
+
+// Should shadow static vars on different functions.
+void func2(void) {
+  static char i;
+  // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func2vE1i = #cir.int<0> : !s8i
+  static float j;
+  // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func2vE1j = #cir.fp<0.000000e+00> : !cir.float
+}
+
+// CHECK-DAG: cir.global linkonce_odr comdat @_ZZ4testvE1c = #cir.int<0> : !s32i
+
+// LLVM-DAG: $_ZZ4testvE1c = comdat any
+// LLVM-DAG: @_ZZ4testvE1c = linkonce_odr global i32 0, comdat, align 4
+
+inline void test() { static int c; }
+// CHECK-LABEL: @_Z4testv
+// CHECK: {{%.*}} = cir.get_global @_ZZ4testvE1c : !cir.ptr<!s32i>
+void foo() { test(); }

From 79e06bf1ae9961c5045134288fd8acc9173f6be2 Mon Sep 17 00:00:00 2001
From: zGoldthorpe <Zach.Goldthorpe@amd.com>
Date: Fri, 13 Jun 2025 12:22:21 -0600
Subject: [PATCH 0332/1322] [AMDGPU] Extended vector promotion to aggregate
 types. (#143784)

Extends the `amdgpu-promote-alloca-to-vector` pass to also promote
aggregate types whose elements are all the same type to vector
registers.

The motivation for this extension was to account for IR generated by the
frontend containing several singleton struct types containing vectors or
vector-like elements, though the implementation is strictly more
general.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 108 ++++---
 .../CodeGen/AMDGPU/promote-alloca-structs.ll  | 286 ++++++++++++++++++
 2 files changed, 352 insertions(+), 42 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 700dc87d2f82..e90a3a275f67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -818,6 +818,39 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
   return I;
 }
 
+/// Get the underlying type of a homogeneous aggregate type, or nullptr if the
+/// type is non-homogeneous.
+static Type *getHomogeneousType(Type *Ty) {
+  Type *ElemTy = nullptr;
+  SmallVector<Type *> WorkList;
+  WorkList.push_back(Ty);
+  while (!WorkList.empty()) {
+    Type *CurTy = WorkList.pop_back_val();
+
+    // Check if the current type is an aggregate type.
+    if (auto *VectorTy = dyn_cast<FixedVectorType>(CurTy)) {
+      WorkList.push_back(VectorTy->getElementType());
+      continue;
+    }
+    if (auto *ArrayTy = dyn_cast<ArrayType>(CurTy)) {
+      WorkList.push_back(ArrayTy->getElementType());
+      continue;
+    }
+    if (auto *StructTy = dyn_cast<StructType>(CurTy)) {
+      WorkList.append(StructTy->element_begin(), StructTy->element_end());
+      continue;
+    }
+
+    // If not, it must be the same as all other non-aggregate types.
+    if (!ElemTy)
+      ElemTy = CurTy;
+    else if (ElemTy != CurTy)
+      return nullptr;
+  }
+
+  return ElemTy;
+}
+
 // FIXME: Should try to pick the most likely to be profitable allocas first.
 bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -828,43 +861,43 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   }
 
   Type *AllocaTy = Alloca.getAllocatedType();
-  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
-  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
-    uint64_t NumElems = 1;
-    Type *ElemTy;
-    do {
-      NumElems *= ArrayTy->getNumElements();
-      ElemTy = ArrayTy->getElementType();
-    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
+  Type *ElemTy = getHomogeneousType(AllocaTy);
 
-    // Check for array of vectors
-    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
-    if (InnerVectorTy) {
-      NumElems *= InnerVectorTy->getNumElements();
-      ElemTy = InnerVectorTy->getElementType();
-    }
-
-    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
-      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
-      if (ElementSize > 0) {
-        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
-        // Expand vector if required to match padding of inner type,
-        // i.e. odd size subvectors.
-        // Storage size of new vector must match that of alloca for correct
-        // behaviour of byte offsets and GEP computation.
-        if (NumElems * ElementSize != AllocaSize)
-          NumElems = AllocaSize / ElementSize;
-        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
-          VectorTy = FixedVectorType::get(ElemTy, NumElems);
-      }
-    }
-  }
-
-  if (!VectorTy) {
+  if (!ElemTy || !VectorType::isValidElementType(ElemTy)) {
     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
     return false;
   }
 
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return false;
+  }
+  unsigned ElementSize = ElementSizeInBits / 8;
+  if (ElementSize == 0) {
+    LLVM_DEBUG(dbgs() << "  Cannot create vector of zero-sized elements\n");
+    return false;
+  }
+
+  // Calculate the size of the corresponding vector, accounting for padding of
+  // inner types, e.g., odd-sized subvectors. Storage size of new vector must
+  // match that of alloca for correct behaviour of byte offsets and GEP
+  // computation.
+  unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+  unsigned NumElems = AllocaSize / ElementSize;
+  if (NumElems == 0) {
+    LLVM_DEBUG(dbgs() << "  Cannot vectorize an empty aggregate type\n");
+    return false;
+  }
+  if (NumElems * ElementSize != AllocaSize) {
+    LLVM_DEBUG(
+        dbgs() << "  Cannot convert type into vector of the same size\n");
+    return false;
+  }
+  auto *VectorTy = FixedVectorType::get(ElemTy, NumElems);
+  assert(VectorTy && "Failed to create vector type.");
+
   const unsigned MaxElements =
       (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
 
@@ -895,15 +928,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
-  Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
-  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
-    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
-                         "does not match the type's size\n");
-    return false;
-  }
-  unsigned ElementSize = ElementSizeInBits / 8;
-  assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
@@ -943,7 +967,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
+      Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
new file mode 100644
index 000000000000..1cdd027fef89
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
@@ -0,0 +1,286 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 %s | FileCheck %s
+
+define i8 @test_v4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_v4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca <4 x i8>, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [4 x i8], align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a2v4i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a2v4i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a2v3i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a2v3i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [2 x <3 x i8>], align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a2a4i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a2a4i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [2 x [4 x i8]], align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a2a3i8(i48 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a2a3i8(
+; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [2 x [3 x i8]], align 4, addrspace(5)
+  store i48 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s1v4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s1v4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {<4 x i8>}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s1a4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s1a4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {[4 x i8]}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2v4i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2v4i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2v2i8v4i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2v2i8v4i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2v2i8v3i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2v2i8v3i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2s2i8s4i8(i48 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2s2i8s4i8(
+; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5)
+  store i48 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2s2i8s3i8(i40 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2s2i8s3i8(
+; CHECK-SAME: i40 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <5 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i40 [[BITS]] to <5 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <5 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5)
+  store i40 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s3i8s1i8v2i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s3i8s1i8v2i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s3i8i8s0(i16 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s3i8i8s0(
+; CHECK-SAME: i16 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <2 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[BITS]] to <2 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {i8, i8, {}}, align 4, addrspace(5)
+  store i16 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+; heterogeneous element types are not supported
+define i8 @test_heterogeneous(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_heterogeneous(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5)
+; CHECK-NEXT:    store i32 [[BITS]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT:    ret i8 [[VAL]]
+;
+  %stack = alloca {i8, i8, i16}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+; empty types are not supported
+define void @test_empty() {
+; CHECK-LABEL: define void @test_empty() {
+; CHECK-NEXT:    [[STACK:%.*]] = alloca {}, align 4, addrspace(5)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca {}, align 4, addrspace(5)
+  ret void
+}
+
+; singleton types are not supported
+define i8 @test_singleton(i8 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_singleton(
+; CHECK-SAME: i8 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5)
+; CHECK-NEXT:    store i8 [[BITS]], ptr addrspace(5) [[STACK]], align 1
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT:    ret i8 [[VAL]]
+;
+  %stack = alloca {i8, {}}, align 4, addrspace(5)
+  store i8 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}

From a08de429e4ae0baaed23060cbae5c73dc6ffcc5d Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Fri, 13 Jun 2025 14:46:54 -0400
Subject: [PATCH 0333/1322] [gn] port cc365331af42

---
 llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index fec917c25b19..ca05ac1b2464 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -299,6 +299,7 @@ write_cmake_config("llvm-config") {
     "LLVM_ENABLE_TELEMETRY=",
     "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple",
     "LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING=",
+    "LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING=",
     "LLVM_ENABLE_DUMP=",
     "LLVM_ENABLE_HTTPLIB=",
     "LLVM_FORCE_USE_OLD_TOOLCHAIN=",

From 2f1e6eb6c3e731266052536c3f98cce3a71a316e Mon Sep 17 00:00:00 2001
From: yonghong-song <yhs@fb.com>
Date: Fri, 13 Jun 2025 11:58:48 -0700
Subject: [PATCH 0334/1322] [BPF] Report an warning if certain insn imm operand
 cannot fit in 32bit (#142989)

Ihor Solodrai reported a case ([1]) where gcc reports an error but clang
ignores that error and proceeds to generate incorrect code. More
specifically, the problematic code looks like:
   if r1 == 0xcafefeeddeadbeef goto <label>

Here, 0xcafefeeddeadbeef needs to be encoded in a 32-bit imm field
of the insns and the 32-bit imm allows sign extenstion to 64-bit imm.
Obviously, 0xcafefeeddeadbeef cannot encode properly.

The compilation failed for gcc with the following error:
  Error: immediate out of range, shall fit in 32 bits

Given a 64-bit imm value, converting to the proper 32-bit imm value
must satisfy the following 64-bit patterns:
  00000000 00000000 00000000 00000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
  11111111 11111111 11111111 11111111 1xxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx

So if the top 32-bits is 0 or the top 33-bits is 0x1ffffffff, then the 64-bit imm
value can be truncated into proper 32-bit imm. Otherwise, a warning
message, the same as gcc, will be issued. If -Werror is enabled during
compilation, the warning will turn into an error.

[1] https://lore.kernel.org/bpf/70affb12-327b-4882-bd1d-afda8b8c6f56@linux.dev/
---
 .../BPF/MCTargetDesc/BPFMCCodeEmitter.cpp     | 22 ++++++++++++++-----
 llvm/test/CodeGen/BPF/warn-cmp.ll             | 15 +++++++++++++
 2 files changed, 31 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/BPF/warn-cmp.ll

diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index 10a46f100bbe..bd9d2de58c8b 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -14,6 +14,7 @@
 #include "MCTargetDesc/BPFMCTargetDesc.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCInst.h"
@@ -33,11 +34,12 @@ namespace {
 class BPFMCCodeEmitter : public MCCodeEmitter {
   const MCRegisterInfo &MRI;
   bool IsLittleEndian;
+  MCContext &Ctx;
 
 public:
   BPFMCCodeEmitter(const MCInstrInfo &, const MCRegisterInfo &mri,
-                   bool IsLittleEndian)
-      : MRI(mri), IsLittleEndian(IsLittleEndian) { }
+                   bool IsLittleEndian, MCContext &ctx)
+      : MRI(mri), IsLittleEndian(IsLittleEndian), Ctx(ctx) {}
   BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
   void operator=(const BPFMCCodeEmitter &) = delete;
   ~BPFMCCodeEmitter() override = default;
@@ -67,12 +69,12 @@ public:
 
 MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII,
                                             MCContext &Ctx) {
-  return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), true);
+  return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), true, Ctx);
 }
 
 MCCodeEmitter *llvm::createBPFbeMCCodeEmitter(const MCInstrInfo &MCII,
                                               MCContext &Ctx) {
-  return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), false);
+  return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), false, Ctx);
 }
 
 unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI,
@@ -81,8 +83,16 @@ unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI,
                                              const MCSubtargetInfo &STI) const {
   if (MO.isReg())
     return MRI.getEncodingValue(MO.getReg());
-  if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
+  if (MO.isImm()) {
+    uint64_t Imm = MO.getImm();
+    uint64_t High32Bits = Imm >> 32, High33Bits = Imm >> 31;
+    if (MI.getOpcode() != BPF::LD_imm64 && High32Bits != 0 &&
+        High33Bits != 0x1FFFFFFFFULL) {
+      Ctx.reportWarning(MI.getLoc(),
+                        "immediate out of range, shall fit in 32 bits");
+    }
+    return static_cast<unsigned>(Imm);
+  }
 
   assert(MO.isExpr());
 
diff --git a/llvm/test/CodeGen/BPF/warn-cmp.ll b/llvm/test/CodeGen/BPF/warn-cmp.ll
new file mode 100644
index 000000000000..109d177b0fb4
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/warn-cmp.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=bpfel -filetype=obj < %s 2>&1 >/dev/null | FileCheck %s
+
+; CHECK: warning: immediate out of range, shall fit in 32 bits
+define dso_local void @test_1() naked {
+  tail call void asm sideeffect
+    "r1 = 40; if r1 == 0x1deadbeef goto +0; r0 = 0; exit;", "~{r0},~{r1}"()
+  unreachable
+}
+
+; CHECK: warning: immediate out of range, shall fit in 32 bits
+define dso_local void @test_2() naked {
+  tail call void asm sideeffect
+    "r1 = 40; if r1 == 0xffffffff00000000 goto +0; r0 = 0; exit;", "~{r0},~{r1}"()
+  unreachable
+}

From 90d98a38b273f5d62424a3815447675860947927 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot@gmail.com>
Date: Fri, 13 Jun 2025 21:05:44 +0200
Subject: [PATCH 0335/1322] Revert "[Clang] Added explanation why
 `is_constructible` evaluated to false. " (#144127)

Reverts llvm/llvm-project#143309

Someone needs to go through the libc++ tests and update the diagnostics
checks in those tests (ie, i don't believe there was anything wrong with
the PR, but it impacts libc++ tests nonetheless
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  8 +--
 clang/lib/Sema/SemaTypeTraits.cpp             | 71 +------------------
 clang/test/CXX/drs/cwg18xx.cpp                |  3 +-
 ...overload-resolution-deferred-templates.cpp | 19 ++---
 .../type-traits-unsatisfied-diags-std.cpp     | 66 -----------------
 .../SemaCXX/type-traits-unsatisfied-diags.cpp | 62 ----------------
 6 files changed, 10 insertions(+), 219 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 95d24e9f1e6b..8fe7ad6138aa 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1767,8 +1767,7 @@ def note_unsatisfied_trait
     : Note<"%0 is not %enum_select<TraitName>{"
            "%TriviallyRelocatable{trivially relocatable}|"
            "%Replaceable{replaceable}|"
-           "%TriviallyCopyable{trivially copyable}|"
-           "%Constructible{constructible with provided types}"
+           "%TriviallyCopyable{trivially copyable}"
            "}1">;
 
 def note_unsatisfied_trait_reason
@@ -1798,10 +1797,7 @@ def note_unsatisfied_trait_reason
            "%DeletedAssign{has a deleted %select{copy|move}1 "
            "assignment operator}|"
            "%UnionWithUserDeclaredSMF{is a union with a user-declared "
-           "%sub{select_special_member_kind}1}|"
-           "%FunctionType{is a function type}|"
-           "%CVVoidType{is a cv void type}|"
-           "%IncompleteArrayType{is an incomplete array type}"
+           "%sub{select_special_member_kind}1}"
            "}0">;
 
 def warn_consteval_if_always_true : Warning<
diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index 22c690bedc1e..1738ab446600 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/DeclCXX.h"
-#include "clang/AST/TemplateBase.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/DiagnosticParse.h"
 #include "clang/Basic/DiagnosticSema.h"
@@ -1948,7 +1947,6 @@ static std::optional<TypeTrait> StdNameToTypeTrait(StringRef Name) {
             TypeTrait::UTT_IsCppTriviallyRelocatable)
       .Case("is_replaceable", TypeTrait::UTT_IsReplaceable)
       .Case("is_trivially_copyable", TypeTrait::UTT_IsTriviallyCopyable)
-      .Case("is_constructible", TypeTrait::TT_IsConstructible)
       .Default(std::nullopt);
 }
 
@@ -1985,16 +1983,8 @@ static ExtractedTypeTraitInfo ExtractTypeTraitFromExpression(const Expr *E) {
     Trait = StdNameToTypeTrait(Name);
     if (!Trait)
       return std::nullopt;
-    for (const auto &Arg : VD->getTemplateArgs().asArray()) {
-      if (Arg.getKind() == TemplateArgument::ArgKind::Pack) {
-        for (const auto &InnerArg : Arg.pack_elements())
-          Args.push_back(InnerArg.getAsType());
-      } else if (Arg.getKind() == TemplateArgument::ArgKind::Type) {
-        Args.push_back(Arg.getAsType());
-      } else {
-        llvm_unreachable("Unexpected kind");
-      }
-    }
+    for (const auto &Arg : VD->getTemplateArgs().asArray())
+      Args.push_back(Arg.getAsType());
     return {{Trait.value(), std::move(Args)}};
   }
 
@@ -2267,60 +2257,6 @@ static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef,
   }
 }
 
-static void DiagnoseNonConstructibleReason(
-    Sema &SemaRef, SourceLocation Loc,
-    const llvm::SmallVector<clang::QualType, 1> &Ts) {
-  if (Ts.empty()) {
-    return;
-  }
-
-  bool ContainsVoid = false;
-  for (const QualType &ArgTy : Ts) {
-    ContainsVoid |= ArgTy->isVoidType();
-  }
-
-  if (ContainsVoid)
-    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
-        << diag::TraitNotSatisfiedReason::CVVoidType;
-
-  QualType T = Ts[0];
-  if (T->isFunctionType())
-    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
-        << diag::TraitNotSatisfiedReason::FunctionType;
-
-  if (T->isIncompleteArrayType())
-    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
-        << diag::TraitNotSatisfiedReason::IncompleteArrayType;
-
-  const CXXRecordDecl *D = T->getAsCXXRecordDecl();
-  if (!D || D->isInvalidDecl() || !D->hasDefinition())
-    return;
-
-  llvm::BumpPtrAllocator OpaqueExprAllocator;
-  SmallVector<Expr *, 2> ArgExprs;
-  ArgExprs.reserve(Ts.size() - 1);
-  for (unsigned I = 1, N = Ts.size(); I != N; ++I) {
-    QualType ArgTy = Ts[I];
-    if (ArgTy->isObjectType() || ArgTy->isFunctionType())
-      ArgTy = SemaRef.Context.getRValueReferenceType(ArgTy);
-    ArgExprs.push_back(
-        new (OpaqueExprAllocator.Allocate<OpaqueValueExpr>())
-            OpaqueValueExpr(Loc, ArgTy.getNonLValueExprType(SemaRef.Context),
-                            Expr::getValueKindForType(ArgTy)));
-  }
-
-  EnterExpressionEvaluationContext Unevaluated(
-      SemaRef, Sema::ExpressionEvaluationContext::Unevaluated);
-  Sema::ContextRAII TUContext(SemaRef,
-                              SemaRef.Context.getTranslationUnitDecl());
-  InitializedEntity To(InitializedEntity::InitializeTemporary(T));
-  InitializationKind InitKind(InitializationKind::CreateDirect(Loc, Loc, Loc));
-  InitializationSequence Init(SemaRef, To, InitKind, ArgExprs);
-
-  Init.Diagnose(SemaRef, To, InitKind, ArgExprs);
-  SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D;
-}
-
 static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef,
                                                SourceLocation Loc, QualType T) {
   SemaRef.Diag(Loc, diag::note_unsatisfied_trait)
@@ -2360,9 +2296,6 @@ void Sema::DiagnoseTypeTraitDetails(const Expr *E) {
   case UTT_IsTriviallyCopyable:
     DiagnoseNonTriviallyCopyableReason(*this, E->getBeginLoc(), Args[0]);
     break;
-  case TT_IsConstructible:
-    DiagnoseNonConstructibleReason(*this, E->getBeginLoc(), Args);
-    break;
   default:
     break;
   }
diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 994807585213..5b4551ba0143 100644
--- a/clang/test/CXX/drs/cwg18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -564,12 +564,11 @@ struct A {
 namespace ex2 {
 #if __cplusplus >= 201103L
 struct Bar {
-  struct Baz { // #cwg1890-Baz
+  struct Baz {
     int a = 0;
   };
   static_assert(__is_constructible(Baz), "");
   // since-cxx11-error@-1 {{static assertion failed due to requirement '__is_constructible(cwg1890::ex2::Bar::Baz)'}}
-  // since-cxx11-note@#cwg1890-Baz {{'Baz' defined here}}
 };
 #endif
 } // namespace ex2
diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
index 46c367084852..7cb71e075d50 100644
--- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
+++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
@@ -80,30 +80,21 @@ struct ImplicitlyCopyable {
 static_assert(__is_constructible(ImplicitlyCopyable, const ImplicitlyCopyable&));
 
 
-struct Movable { // #Movable
+struct Movable {
   template <typename T>
   requires __is_constructible(Movable, T) // #err-self-constraint-1
-  explicit Movable(T op) noexcept; // #Movable1
-  Movable(Movable&&) noexcept = default; // #Movable2
+  explicit Movable(T op) noexcept; // #1
+  Movable(Movable&&) noexcept = default; // #2
 };
 static_assert(__is_constructible(Movable, Movable&&));
 static_assert(__is_constructible(Movable, const Movable&));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}} \
-// expected-error@-1 {{call to implicitly-deleted copy constructor of 'Movable'}} \
-// expected-note@#Movable  {{'Movable' defined here}} \
-// expected-note@#Movable  {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const Movable' for 1st argument}} \
-// expected-note@#Movable2  {{copy constructor is implicitly deleted because 'Movable' has a user-declared move constructor}} \
-// expected-note@#Movable2  {{candidate constructor not viable: no known conversion from 'int' to 'Movable' for 1st argument}} \
-// expected-note@#Movable1  {{candidate template ignored: constraints not satisfied [with T = int]}}
-
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}}
 
 static_assert(__is_constructible(Movable, int));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, int)'}} \
-// expected-error@-1 {{no matching constructor for initialization of 'Movable'}} \
+// expected-error@-1{{static assertion failed due to requirement '__is_constructible(Movable, int)'}} \
 // expected-note@-1 2{{}}
 // expected-error@#err-self-constraint-1{{satisfaction of constraint '__is_constructible(Movable, T)' depends on itself}}
 // expected-note@#err-self-constraint-1 4{{}}
-// expected-note@#Movable  {{'Movable' defined here}}
 
 template <typename T>
 struct Members {
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
index a403a0450607..329b611110c1 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
@@ -20,14 +20,6 @@ struct is_trivially_copyable {
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T);
-
-template <typename... Args>
-struct is_constructible {
-    static constexpr bool value = __is_constructible(Args...);
-};
-
-template <typename... Args>
-constexpr bool is_constructible_v = __is_constructible(Args...);
 #endif
 
 #ifdef STD2
@@ -52,17 +44,6 @@ using is_trivially_copyable  = __details_is_trivially_copyable<T>;
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T);
-
-template <typename... Args>
-struct __details_is_constructible{
-    static constexpr bool value = __is_constructible(Args...);
-};
-
-template <typename... Args>
-using is_constructible  = __details_is_constructible<Args...>;
-
-template <typename... Args>
-constexpr bool is_constructible_v = __is_constructible(Args...);
 #endif
 
 
@@ -92,15 +73,6 @@ using is_trivially_copyable  = __details_is_trivially_copyable<T>;
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
-
-template <typename... Args>
-struct __details_is_constructible : bool_constant<__is_constructible(Args...)> {};
-
-template <typename... Args>
-using is_constructible  = __details_is_constructible<Args...>;
-
-template <typename... Args>
-constexpr bool is_constructible_v = is_constructible<Args...>::value;
 #endif
 
 }
@@ -128,15 +100,6 @@ static_assert(std::is_trivially_copyable_v<int&>);
 // expected-note@-1 {{because it is a reference type}}
 
 
-static_assert(std::is_constructible<int, int>::value);
-
-static_assert(std::is_constructible<void>::value);
-// expected-error-re@-1 {{static assertion failed due to requirement 'std::{{.*}}is_constructible<void>::value'}} \
-// expected-note@-1 {{because it is a cv void type}}
-static_assert(std::is_constructible_v<void>);
-// expected-error@-1 {{static assertion failed due to requirement 'std::is_constructible_v<void>'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
 namespace test_namespace {
     using namespace std;
     static_assert(is_trivially_relocatable<int&>::value);
@@ -156,13 +119,6 @@ namespace test_namespace {
     // expected-error@-1 {{static assertion failed due to requirement 'is_trivially_copyable_v<int &>'}} \
     // expected-note@-1 {{'int &' is not trivially copyable}} \
     // expected-note@-1 {{because it is a reference type}}
-
-    static_assert(is_constructible<void>::value);
-    // expected-error-re@-1 {{static assertion failed due to requirement '{{.*}}is_constructible<void>::value'}} \
-    // expected-note@-1 {{because it is a cv void type}}
-    static_assert(is_constructible_v<void>);
-    // expected-error@-1 {{static assertion failed due to requirement 'is_constructible_v<void>'}} \
-    // expected-note@-1 {{because it is a cv void type}}
 }
 
 
@@ -183,15 +139,6 @@ concept C2 = std::is_trivially_copyable_v<T>; // #concept4
 
 template <C2 T> void g2();  // #cand4
 
-template <typename... Args>
-requires std::is_constructible<Args...>::value void f3();  // #cand5
-
-template <typename... Args>
-concept C3 = std::is_constructible_v<Args...>; // #concept6
-
-template <C3 T> void g3();  // #cand6
-
-
 void test() {
     f<int&>();
     // expected-error@-1 {{no matching function for call to 'f'}} \
@@ -222,19 +169,6 @@ void test() {
     // expected-note@#concept4 {{because 'std::is_trivially_copyable_v<int &>' evaluated to false}} \
     // expected-note@#concept4 {{'int &' is not trivially copyable}} \
     // expected-note@#concept4 {{because it is a reference type}}
-
-    f3<void>();
-    // expected-error@-1 {{no matching function for call to 'f3'}} \
-    // expected-note@#cand5 {{candidate template ignored: constraints not satisfied [with Args = <void>]}} \
-    // expected-note-re@#cand5 {{because '{{.*}}is_constructible<void>::value' evaluated to false}} \
-    // expected-note@#cand5 {{because it is a cv void type}}
-
-    g3<void>();
-    // expected-error@-1 {{no matching function for call to 'g3'}} \
-    // expected-note@#cand6 {{candidate template ignored: constraints not satisfied [with T = void]}} \
-    // expected-note@#cand6 {{because 'void' does not satisfy 'C3'}} \
-    // expected-note@#concept6 {{because 'std::is_constructible_v<void>' evaluated to false}} \
-    // expected-note@#concept6 {{because it is a cv void type}}
 }
 }
 
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
index d0b3f294fbca..a8c78f6304ca 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
@@ -488,65 +488,3 @@ static_assert(__is_trivially_copyable(S12));
 // expected-note@-1 {{'S12' is not trivially copyable}} \
 // expected-note@#tc-S12 {{'S12' defined here}}
 }
-
-namespace constructible {
-
-struct S1 {  // #c-S1
-    S1(int); // #cc-S1
-};
-static_assert(__is_constructible(S1, char*));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S1, char *)'}} \
-// expected-error@-1 {{no matching constructor for initialization of 'S1'}} \
-// expected-note@#c-S1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'char *' to 'const S1' for 1st argument}} \
-// expected-note@#c-S1 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'char *' to 'S1' for 1st argument}} \
-// expected-note@#cc-S1 {{candidate constructor not viable: no known conversion from 'char *' to 'int' for 1st argument; dereference the argument with *}} \
-// expected-note@#c-S1 {{'S1' defined here}}
-
-struct S2 { // #c-S2
-    S2(int, float, double); // #cc-S2
-};
-static_assert(__is_constructible(S2, float));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S2, float)'}} \
-// expected-note@#c-S2 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'float' to 'const S2' for 1st argument}} \
-// expected-note@#c-S2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'float' to 'S2' for 1st argument}} \
-// expected-error@-1 {{no matching constructor for initialization of 'S2'}} \
-// expected-note@#cc-S2 {{candidate constructor not viable: requires 3 arguments, but 1 was provided}} \
-// expected-note@#c-S2 {{'S2' defined here}}
-
-static_assert(__is_constructible(S2, float, void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S2, float, void)'}} \
-// expected-note@#c-S2 {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}} \
-// expected-note@#c-S2 {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 2 were provided}} \
-// expected-note@-1{{because it is a cv void type}} \
-// expected-error@-1 {{no matching constructor for initialization of 'S2'}} \
-// expected-note@#cc-S2 {{candidate constructor not viable: requires 3 arguments, but 2 were provided}} \
-// expected-note@#c-S2 {{'S2' defined here}}
-
-static_assert(__is_constructible(int[]));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(int[])'}} \
-// expected-note@-1 {{because it is an incomplete array type}}
-
-static_assert(__is_constructible(void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void)'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
-static_assert(__is_constructible(void, void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void, void)'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
-static_assert(__is_constructible(const void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(const void)'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
-static_assert(__is_constructible(volatile void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(volatile void)'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
-static_assert(__is_constructible(int ()));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(int ())'}} \
-// expected-note@-1 {{because it is a function type}}
-
-static_assert(__is_constructible(void (int, float)));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void (int, float))'}} \
-// expected-note@-1 {{because it is a function type}}
-}

From 83f215b0350289f3bd349c1f85826a58d8d80f03 Mon Sep 17 00:00:00 2001
From: Fabian Meumertzheim <fabian@meumertzhe.im>
Date: Fri, 13 Jun 2025 21:09:58 +0200
Subject: [PATCH 0336/1322] Reland "[llvm-cov] Add support for baseline
 coverage" (#144130)

When no profile is provided, but the new --empty-profile option is
specified, the export/report/show commands now emit coverage data
equivalent to that obtained from a profile with all zero counters
("baseline coverage").

This is useful for build systems (e.g. Bazel) that can track coverage
information for each build target, even those that are never linked into
tests and thus don't have runtime coverage data recorded. By merging in
baseline coverage, lines in files that aren't linked into tests are
correctly reported as uncovered.

Reland with fixes to `CoverageMappingTest.cpp`.

Reverts llvm/llvm-project#144121
---
 llvm/docs/CommandGuide/llvm-cov.rst           |  15 +++
 .../ProfileData/Coverage/CoverageMapping.h    |  24 ++--
 .../ProfileData/Coverage/CoverageMapping.cpp  | 123 +++++++++++-------
 ...showLineExecutionCounts-lcov-baseline.test |  37 ++++++
 llvm/tools/llvm-cov/CodeCoverage.cpp          |  78 +++++++----
 .../ProfileData/CoverageMappingTest.cpp       |   4 +-
 6 files changed, 195 insertions(+), 86 deletions(-)
 create mode 100644 llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test

diff --git a/llvm/docs/CommandGuide/llvm-cov.rst b/llvm/docs/CommandGuide/llvm-cov.rst
index 968f3c452f55..f4db60cf06fa 100644
--- a/llvm/docs/CommandGuide/llvm-cov.rst
+++ b/llvm/docs/CommandGuide/llvm-cov.rst
@@ -380,6 +380,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Display the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 .. program:: llvm-cov report
 
 .. _llvm-cov-report:
@@ -470,6 +475,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Display the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 .. program:: llvm-cov export
 
 .. _llvm-cov-export:
@@ -562,6 +572,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Export the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 CONVERT-FOR-TESTING COMMAND
 ---------------------------
 
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index e62ce5e3d8fa..d1230b0ba7c5 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -991,18 +991,23 @@ class CoverageMapping {
   // Load coverage records from readers.
   static Error loadFromReaders(
       ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-      IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage);
+      std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+          &ProfileReader,
+      CoverageMapping &Coverage);
 
   // Load coverage records from file.
   static Error
   loadFromFile(StringRef Filename, StringRef Arch, StringRef CompilationDir,
-               IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
-               bool &DataFound,
+               std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+                   &ProfileReader,
+               CoverageMapping &Coverage, bool &DataFound,
                SmallVectorImpl<object::BuildID> *FoundBinaryIDs = nullptr);
 
   /// Add a function record corresponding to \p Record.
-  Error loadFunctionRecord(const CoverageMappingRecord &Record,
-                           IndexedInstrProfReader &ProfileReader);
+  Error loadFunctionRecord(
+      const CoverageMappingRecord &Record,
+      const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+          &ProfileReader);
 
   /// Look up the indices for function records which are at least partially
   /// defined in the specified file. This is guaranteed to return a superset of
@@ -1018,15 +1023,16 @@ public:
   /// Load the coverage mapping using the given readers.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
   load(ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-       IndexedInstrProfReader &ProfileReader);
+       std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+           &ProfileReader);
 
   /// Load the coverage mapping from the given object files and profile. If
   /// \p Arches is non-empty, it must specify an architecture for each object.
   /// Ignores non-instrumented object files unless all are not instrumented.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
-  load(ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
-       vfs::FileSystem &FS, ArrayRef<StringRef> Arches = {},
-       StringRef CompilationDir = "",
+  load(ArrayRef<StringRef> ObjectFilenames,
+       std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
+       ArrayRef<StringRef> Arches = {}, StringRef CompilationDir = "",
        const object::BuildIDFetcher *BIDFetcher = nullptr,
        bool CheckBinaryIDs = false);
 
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index dd74eb054a34..429ec5c19f1f 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -823,7 +823,8 @@ public:
 
 Error CoverageMapping::loadFunctionRecord(
     const CoverageMappingRecord &Record,
-    IndexedInstrProfReader &ProfileReader) {
+    const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader) {
   StringRef OrigFuncName = Record.FunctionName;
   if (OrigFuncName.empty())
     return make_error<CoverageMapError>(coveragemap_error::malformed,
@@ -837,35 +838,44 @@ Error CoverageMapping::loadFunctionRecord(
   CounterMappingContext Ctx(Record.Expressions);
 
   std::vector<uint64_t> Counts;
-  if (Error E = ProfileReader.getFunctionCounts(Record.FunctionName,
-                                                Record.FunctionHash, Counts)) {
-    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-    if (IPE == instrprof_error::hash_mismatch) {
-      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                      Record.FunctionHash);
-      return Error::success();
+  if (ProfileReader) {
+    if (Error E = ProfileReader.value().get().getFunctionCounts(
+            Record.FunctionName, Record.FunctionHash, Counts)) {
+      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+      if (IPE == instrprof_error::hash_mismatch) {
+        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                        Record.FunctionHash);
+        return Error::success();
+      }
+      if (IPE != instrprof_error::unknown_function)
+        return make_error<InstrProfError>(IPE);
+      Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
     }
-    if (IPE != instrprof_error::unknown_function)
-      return make_error<InstrProfError>(IPE);
+  } else {
     Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
   }
   Ctx.setCounts(Counts);
 
   bool IsVersion11 =
-      ProfileReader.getVersion() < IndexedInstrProf::ProfVersion::Version12;
+      ProfileReader && ProfileReader.value().get().getVersion() <
+                           IndexedInstrProf::ProfVersion::Version12;
 
   BitVector Bitmap;
-  if (Error E = ProfileReader.getFunctionBitmap(Record.FunctionName,
-                                                Record.FunctionHash, Bitmap)) {
-    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-    if (IPE == instrprof_error::hash_mismatch) {
-      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                      Record.FunctionHash);
-      return Error::success();
+  if (ProfileReader) {
+    if (Error E = ProfileReader.value().get().getFunctionBitmap(
+            Record.FunctionName, Record.FunctionHash, Bitmap)) {
+      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+      if (IPE == instrprof_error::hash_mismatch) {
+        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                        Record.FunctionHash);
+        return Error::success();
+      }
+      if (IPE != instrprof_error::unknown_function)
+        return make_error<InstrProfError>(IPE);
+      Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
     }
-    if (IPE != instrprof_error::unknown_function)
-      return make_error<InstrProfError>(IPE);
-    Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
+  } else {
+    Bitmap = BitVector(getMaxBitmapSize(Record, false));
   }
   Ctx.setBitmap(std::move(Bitmap));
 
@@ -959,10 +969,14 @@ Error CoverageMapping::loadFunctionRecord(
 // of CoverageMappingReader instances.
 Error CoverageMapping::loadFromReaders(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage) {
-  assert(!Coverage.SingleByteCoverage ||
-         *Coverage.SingleByteCoverage == ProfileReader.hasSingleByteCoverage());
-  Coverage.SingleByteCoverage = ProfileReader.hasSingleByteCoverage();
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader,
+    CoverageMapping &Coverage) {
+  assert(!Coverage.SingleByteCoverage || !ProfileReader ||
+         *Coverage.SingleByteCoverage ==
+             ProfileReader.value().get().hasSingleByteCoverage());
+  Coverage.SingleByteCoverage =
+      !ProfileReader || ProfileReader.value().get().hasSingleByteCoverage();
   for (const auto &CoverageReader : CoverageReaders) {
     for (auto RecordOrErr : *CoverageReader) {
       if (Error E = RecordOrErr.takeError())
@@ -977,7 +991,8 @@ Error CoverageMapping::loadFromReaders(
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    IndexedInstrProfReader &ProfileReader) {
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader) {
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   if (Error E = loadFromReaders(CoverageReaders, ProfileReader, *Coverage))
     return std::move(E);
@@ -986,18 +1001,19 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
 // If E is a no_data_found error, returns success. Otherwise returns E.
 static Error handleMaybeNoDataFoundError(Error E) {
-  return handleErrors(
-      std::move(E), [](const CoverageMapError &CME) {
-        if (CME.get() == coveragemap_error::no_data_found)
-          return static_cast<Error>(Error::success());
-        return make_error<CoverageMapError>(CME.get(), CME.getMessage());
-      });
+  return handleErrors(std::move(E), [](const CoverageMapError &CME) {
+    if (CME.get() == coveragemap_error::no_data_found)
+      return static_cast<Error>(Error::success());
+    return make_error<CoverageMapError>(CME.get(), CME.getMessage());
+  });
 }
 
 Error CoverageMapping::loadFromFile(
     StringRef Filename, StringRef Arch, StringRef CompilationDir,
-    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
-    bool &DataFound, SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader,
+    CoverageMapping &Coverage, bool &DataFound,
+    SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
   auto CovMappingBufOrErr = MemoryBuffer::getFileOrSTDIN(
       Filename, /*IsText=*/false, /*RequiresNullTerminator=*/false);
   if (std::error_code EC = CovMappingBufOrErr.getError())
@@ -1033,13 +1049,23 @@ Error CoverageMapping::loadFromFile(
 }
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
-    ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
-    vfs::FileSystem &FS, ArrayRef<StringRef> Arches, StringRef CompilationDir,
+    ArrayRef<StringRef> ObjectFilenames,
+    std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
+    ArrayRef<StringRef> Arches, StringRef CompilationDir,
     const object::BuildIDFetcher *BIDFetcher, bool CheckBinaryIDs) {
-  auto ProfileReaderOrErr = IndexedInstrProfReader::create(ProfileFilename, FS);
-  if (Error E = ProfileReaderOrErr.takeError())
-    return createFileError(ProfileFilename, std::move(E));
-  auto ProfileReader = std::move(ProfileReaderOrErr.get());
+  std::unique_ptr<IndexedInstrProfReader> ProfileReader;
+  if (ProfileFilename) {
+    auto ProfileReaderOrErr =
+        IndexedInstrProfReader::create(ProfileFilename.value(), FS);
+    if (Error E = ProfileReaderOrErr.takeError())
+      return createFileError(ProfileFilename.value(), std::move(E));
+    ProfileReader = std::move(ProfileReaderOrErr.get());
+  }
+  auto ProfileReaderRef =
+      ProfileReader
+          ? std::optional<std::reference_wrapper<IndexedInstrProfReader>>(
+                *ProfileReader)
+          : std::nullopt;
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   bool DataFound = false;
 
@@ -1053,16 +1079,17 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
   SmallVector<object::BuildID> FoundBinaryIDs;
   for (const auto &File : llvm::enumerate(ObjectFilenames)) {
-    if (Error E =
-            loadFromFile(File.value(), GetArch(File.index()), CompilationDir,
-                         *ProfileReader, *Coverage, DataFound, &FoundBinaryIDs))
+    if (Error E = loadFromFile(File.value(), GetArch(File.index()),
+                               CompilationDir, ProfileReaderRef, *Coverage,
+                               DataFound, &FoundBinaryIDs))
       return std::move(E);
   }
 
   if (BIDFetcher) {
     std::vector<object::BuildID> ProfileBinaryIDs;
-    if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
-      return createFileError(ProfileFilename, std::move(E));
+    if (ProfileReader)
+      if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
+        return createFileError(ProfileFilename.value(), std::move(E));
 
     SmallVector<object::BuildIDRef> BinaryIDsToFetch;
     if (!ProfileBinaryIDs.empty()) {
@@ -1082,12 +1109,12 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
       if (PathOpt) {
         std::string Path = std::move(*PathOpt);
         StringRef Arch = Arches.size() == 1 ? Arches.front() : StringRef();
-        if (Error E = loadFromFile(Path, Arch, CompilationDir, *ProfileReader,
-                                  *Coverage, DataFound))
+        if (Error E = loadFromFile(Path, Arch, CompilationDir, ProfileReaderRef,
+                                   *Coverage, DataFound))
           return std::move(E);
       } else if (CheckBinaryIDs) {
         return createFileError(
-            ProfileFilename,
+            ProfileFilename.value(),
             createStringError(errc::no_such_file_or_directory,
                               "Missing binary ID: " +
                                   llvm::toHex(BinaryID, /*LowerCase=*/true)));
diff --git a/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
new file mode 100644
index 000000000000..bce886bdf510
--- /dev/null
+++ b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
@@ -0,0 +1,37 @@
+// FULL: SF:{{.*}}showLineExecutionCounts.cpp
+// FULL: FN:6,main
+// FULL: FNDA:0,main
+// FULL: FNF:1
+// FULL: FNH:0
+int main() {                              // FULL: DA:[[@LINE]],0
+  int x = 0;                              // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  if (x) {                                // FULL: DA:[[@LINE]],0
+    x = 0;                                // FULL: DA:[[@LINE]],0
+  } else {                                // FULL: DA:[[@LINE]],0
+    x = 1;                                // FULL: DA:[[@LINE]],0
+  }                                       // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  for (int i = 0; i < 100; ++i) {         // FULL: DA:[[@LINE]],0
+    x = 1;                                // FULL: DA:[[@LINE]],0
+  }                                       // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  x = x < 10 ? x + 1 : x - 1;             // FULL: DA:[[@LINE]],0
+  x = x > 10 ?                            // FULL: DA:[[@LINE]],0
+        x - 1:                            // FULL: DA:[[@LINE]],0
+        x + 1;                            // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  return 0;                               // FULL: DA:[[@LINE]],0
+}                                         // FULL: DA:[[@LINE]],0
+// FULL: LF:20
+// FULL: LH:0
+// FULL: end_of_record
+// RUN: llvm-cov export -format=lcov %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=FULL %s
+
+// RUN: llvm-cov export -format=lcov -summary-only %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=SUMMARYONLY %s
+// SUMMARYONLY: SF:{{.*}}showLineExecutionCounts.cpp
+// SUMMARYONLY: FNF:1
+// SUMMARYONLY: FNH:0
+// SUMMARYONLY: LF:20
+// SUMMARYONLY: LH:0
+// SUMMARYONLY: end_of_record
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index 1f2484cd4dda..6c66858c4de8 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -153,7 +153,7 @@ private:
   bool HadSourceFiles = false;
 
   /// The path to the indexed profile.
-  std::string PGOFilename;
+  std::optional<std::string> PGOFilename;
 
   /// A list of input source files.
   std::vector<std::string> SourceFiles;
@@ -455,10 +455,12 @@ static bool modifiedTimeGT(StringRef LHS, StringRef RHS) {
 }
 
 std::unique_ptr<CoverageMapping> CodeCoverageTool::load() {
-  for (StringRef ObjectFilename : ObjectFilenames)
-    if (modifiedTimeGT(ObjectFilename, PGOFilename))
-      warning("profile data may be out of date - object is newer",
-              ObjectFilename);
+  if (PGOFilename) {
+    for (StringRef ObjectFilename : ObjectFilenames)
+      if (modifiedTimeGT(ObjectFilename, PGOFilename.value()))
+        warning("profile data may be out of date - object is newer",
+                ObjectFilename);
+  }
   auto FS = vfs::getRealFileSystem();
   auto CoverageOrErr = CoverageMapping::load(
       ObjectFilenames, PGOFilename, *FS, CoverageArches,
@@ -668,11 +670,16 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
       "dump-collected-paths", cl::Optional, cl::Hidden,
       cl::desc("Show the collected paths to source files"));
 
-  cl::opt<std::string, true> PGOFilename(
-      "instr-profile", cl::Required, cl::location(this->PGOFilename),
+  cl::opt<std::string> PGOFilename(
+      "instr-profile", cl::Optional,
       cl::desc(
           "File with the profile data obtained after an instrumented run"));
 
+  cl::opt<bool> EmptyProfile(
+      "empty-profile", cl::Optional,
+      cl::desc("Use a synthetic profile with no data to generate "
+               "baseline coverage"));
+
   cl::list<std::string> Arches(
       "arch", cl::desc("architectures of the coverage mapping binaries"));
 
@@ -805,6 +812,15 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
     }
     this->CheckBinaryIDs = CheckBinaryIDs;
 
+    if (!PGOFilename.empty() == EmptyProfile) {
+      error(
+          "exactly one of -instr-profile and -empty-profile must be specified");
+      return 1;
+    }
+    if (!PGOFilename.empty()) {
+      this->PGOFilename = std::make_optional(PGOFilename.getValue());
+    }
+
     if (!CovFilename.empty())
       ObjectFilenames.emplace_back(CovFilename);
     for (const std::string &Filename : CovFilenames)
@@ -1116,20 +1132,22 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
     }
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
-  }
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
 
-  if (ShowCreatedTime) {
-    auto ModifiedTime = Status.getLastModificationTime();
-    std::string ModifiedTimeStr = to_string(ModifiedTime);
-    size_t found = ModifiedTimeStr.rfind(':');
-    ViewOpts.CreatedTimeStr =
-        (found != std::string::npos)
-            ? "Created: " + ModifiedTimeStr.substr(0, found)
-            : "Created: " + ModifiedTimeStr;
+    if (ShowCreatedTime) {
+      auto ModifiedTime = Status.getLastModificationTime();
+      std::string ModifiedTimeStr = to_string(ModifiedTime);
+      size_t found = ModifiedTimeStr.rfind(':');
+      ViewOpts.CreatedTimeStr =
+          (found != std::string::npos)
+              ? "Created: " + ModifiedTimeStr.substr(0, found)
+              : "Created: " + ModifiedTimeStr;
+    }
   }
 
   auto Coverage = load();
@@ -1238,10 +1256,12 @@ int CodeCoverageTool::doReport(int argc, const char **argv,
     return 1;
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
   }
 
   auto Coverage = load();
@@ -1303,10 +1323,12 @@ int CodeCoverageTool::doExport(int argc, const char **argv,
     return 1;
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
   }
 
   auto Coverage = load();
diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
index 46f881ecddb5..ec81e5f274ef 100644
--- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
@@ -277,7 +277,9 @@ struct CoverageMappingTest : ::testing::TestWithParam<std::tuple<bool, bool>> {
       CoverageReaders.push_back(
           std::make_unique<CoverageMappingReaderMock>(Funcs));
     }
-    return CoverageMapping::load(CoverageReaders, *ProfileReader);
+    auto ProfileReaderRef = std::make_optional(
+        std::reference_wrapper<IndexedInstrProfReader>(*ProfileReader));
+    return CoverageMapping::load(CoverageReaders, ProfileReaderRef);
   }
 
   Error loadCoverageMapping(bool EmitFilenames = true) {

From f952af30fd2efbf6effa3e845f0e49a9f0e2302d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 12:25:31 -0700
Subject: [PATCH 0337/1322] [clang][docs][RISCV] Prepend the HelpText for
 -mrvv-vector-bits into the DocBrief. (#144128)

The DocBrief is used to generate the webpage description of the option.
The current text only talks about the possible values, but not what the
option does.
---
 clang/include/clang/Driver/Options.td | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 5951687b095e..1b07deb4a848 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5056,9 +5056,10 @@ def mrvv_vector_bits_EQ : Joined<["-"], "mrvv-vector-bits=">, Group<m_Group>,
   Visibility<[ClangOption, FlangOption]>,
   HelpText<"Specify the size in bits of an RVV vector register">,
   DocBrief<!strconcat(
-    "Defaults to the vector length agnostic value of \"scalable\". "
-    "Accepts power of 2 values between 64 and 65536. Also accepts "
-    "\"zvl\" to use the value implied by -march/-mcpu.",
+    "Specify the size in bits of an RVV vector register. Defaults to the "
+    "vector length agnostic value of \"scalable\". Accepts power of 2 values "
+    "between 64 and 65536. Also accepts \"zvl\" to use the value implied by "
+    "-march/-mcpu.",
     !cond(
       // Flang does not set the preprocessor define.
       !eq(GlobalDocumentation.Program, "Flang") : "",

From acc58ac8bf792d78233daf913565e2cbb61a8f5c Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 12:37:20 -0700
Subject: [PATCH 0338/1322] [bazel] Add missing dep for 52d34865b9db3485c
 (#144147)

---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index b54ac1728a59..31855cd5444c 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -1631,6 +1631,7 @@ cc_library(
     copts = llvm_copts,
     deps = [
         ":Analysis",
+        ":ProfileData",
         ":Support",
         ":TargetParser",
     ],

From b7cb34840cd1e8cea932f04d5b4e34b4056cb6de Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 12:40:44 -0700
Subject: [PATCH 0339/1322] [CIR] Enable floating point casts (#144142)

We already had the code in place to emit CIR floating point cast ops
that get lowered to fpext or fptrunc, but we weren't calling the
function to emit that cast from ScalarExprEmitter::emitScalarCast. This
change adds that call.
---
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 4 ++--
 clang/test/CIR/CodeGen/cast.cpp            | 8 ++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 30d231e2c61d..baaef022ccc6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -346,8 +346,8 @@ public:
         assert(!cir::MissingFeatures::fpConstraints());
         castKind = cir::CastKind::float_to_int;
       } else if (mlir::isa<cir::CIRFPTypeInterface>(dstTy)) {
-        cgf.getCIRGenModule().errorNYI("floating point casts");
-        return cgf.createDummyValue(src.getLoc(), dstType);
+        // TODO: split this to createFPExt/createFPTrunc
+        return builder.createFloatingCast(src, fullDstTy);
       } else {
         llvm_unreachable("Internal error: Cast to unexpected type");
       }
diff --git a/clang/test/CIR/CodeGen/cast.cpp b/clang/test/CIR/CodeGen/cast.cpp
index a7c11b1939ba..84f55242a611 100644
--- a/clang/test/CIR/CodeGen/cast.cpp
+++ b/clang/test/CIR/CodeGen/cast.cpp
@@ -73,6 +73,14 @@ int cStyleCasts_0(unsigned x1, int x2, float x3, short x4, double x5) {
   // LLVM: %{{[0-9]+}} = fcmp une float %{{[0-9]+}}, 0.000000e+00
   // LLVM: %{{[0-9]+}} = zext i1 %{{[0-9]+}} to i8
 
+  double d2 = f; // float to double
+  // CIR: %{{[0-9]+}} = cir.cast(floating, %{{[0-9]+}} : !cir.float), !cir.double
+  // LLVM: %{{[0-9]+}} = fpext float %{{[0-9]+}} to double
+
+  f = d2; // double to float
+  // CIR: %{{[0-9]+}} = cir.cast(floating, %{{[0-9]+}} : !cir.double), !cir.float
+  // LLVM: %{{[0-9]+}} = fptrunc double %{{[0-9]+}} to float
+
   return 0;
 }
 

From 65eaed7d5a08210cd5b419f45845d5de81435d7e Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 12:40:57 -0700
Subject: [PATCH 0340/1322] [CIR] Handle character literal values (#144141)

This change adds a handler for emitting a cir.constant op when a
character literal is encountered outside an initializer expression.
---
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp |  6 ++++++
 clang/test/CIR/CodeGen/basic.c             | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index baaef022ccc6..75b4d2a637e6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -162,6 +162,12 @@ public:
         builder.getAttr<cir::FPAttr>(type, e->getValue()));
   }
 
+  mlir::Value VisitCharacterLiteral(const CharacterLiteral *e) {
+    mlir::Type ty = cgf.convertType(e->getType());
+    auto init = cir::IntAttr::get(ty, e->getValue());
+    return builder.create<cir::ConstantOp>(cgf.getLoc(e->getExprLoc()), init);
+  }
+
   mlir::Value VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *e) {
     return builder.getBool(e->getValue(), cgf.getLoc(e->getExprLoc()));
   }
diff --git a/clang/test/CIR/CodeGen/basic.c b/clang/test/CIR/CodeGen/basic.c
index abc1a45fd433..7ff73ee95f79 100644
--- a/clang/test/CIR/CodeGen/basic.c
+++ b/clang/test/CIR/CodeGen/basic.c
@@ -309,3 +309,17 @@ size_type max_size(void) {
 // CHECK:   %6 = cir.load{{.*}} %0 : !cir.ptr<!u64i>, !u64i
 // CHECK:   cir.return %6 : !u64i
 // CHECK:   }
+
+void test_char_literal() {
+  char c;
+  c = 'X';
+}
+
+// CIR: cir.func @test_char_literal
+// CIR:   cir.const #cir.int<88>
+
+// LLVM: define void @test_char_literal()
+// LLVM:   store i8 88, ptr %{{.*}}, align 1
+
+// OGCG: define{{.*}} void @test_char_literal()
+// OGCG:   store i8 88, ptr %{{.*}}, align 1

From f5df231d8caece81fd800b921cf4fbd7774e2885 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Fri, 13 Jun 2025 12:45:34 -0700
Subject: [PATCH 0341/1322] [LV] Fix test line and regen an autogen test

---
 .../RISCV/riscv-vector-reverse.ll             | 611 +++++++++++++-----
 1 file changed, 466 insertions(+), 145 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 7ec9749eb87e..b026e6868581 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; This is the loop in c++ being vectorize in this file with
 ;vector.reverse
 ;  #pragma clang loop vectorize_width(4, scalable)
@@ -46,66 +46,100 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV: Using user VF vscale x 4.
-; CHECK:       LV: Loop does not require scalar epilogue
-; CHECK:       LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  Creating VPBasicBlock for for.body
+; CHECK-NEXT:  VPlan 'Plain CFG
+; CHECK-NEXT:   for UF>=1' {
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    IR %0 = zext i32 %n to i64
+; CHECK-NEXT:  Successor(s): for.body
+; CHECK-EMPTY:
+; CHECK-NEXT:  for.body:
+; CHECK-NEXT:    WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
+; CHECK-NEXT:    EMIT ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:    EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
+; CHECK-NEXT:    EMIT ir<%1> = load ir<%arrayidx>
+; CHECK-NEXT:    EMIT ir<%add9> = add ir<%1>, ir<1>
+; CHECK-NEXT:    EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom>
+; CHECK-NEXT:    EMIT store ir<%add9>, ir<%arrayidx3>
+; CHECK-NEXT:    EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1>
+; CHECK-NEXT:    EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1>
+; CHECK-NEXT:    EMIT branch-on-cond ir<%cmp>
+; CHECK-NEXT:  Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit>
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
 ; CHECK-NEXT:  LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
+; CHECK-NEXT:  Live-in vp<%0> = VF
+; CHECK-NEXT:  Live-in vp<%1> = VF * UF
+; CHECK-NEXT:  Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT:  vp<%3> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
 ; CHECK-NEXT:    IR %0 = zext i32 %n to i64
-; CHECK-NEXT:    EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:    EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1>
+; CHECK-NEXT:    vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:      vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT:      EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
+; CHECK-NEXT:      vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
+; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
 ; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<%1> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
+; CHECK-NEXT:      WIDEN ir<%1> = load vp<%9>
 ; CHECK-NEXT:      WIDEN ir<%add9> = add ir<%1>, ir<1>
 ; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
-; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:      vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
+; CHECK-NEXT:      WIDEN store vp<%10>, ir<%add9>
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<%2>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
-; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT:    IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph)
+; CHECK-NEXT:    IR %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    IR %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+; CHECK-NEXT:    IR %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:    IR %add9 = add i32 %1, 1
+; CHECK-NEXT:    IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT:    IR store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:    IR %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
@@ -151,85 +185,212 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
-; CHECK:       Executing best plan with VF=vscale x 4, UF=1
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT:  Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
-; CHECK-NEXT:  Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  ir<%0> = original trip-count
+; CHECK-NEXT:  Live-in ir<%18> = VF
+; CHECK-NEXT:  Live-in ir<%18>.1 = VF * UF
+; CHECK-NEXT:  Live-in ir<%n.vec> = vector-trip-count
+; CHECK-NEXT:  Live-in ir<%0> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
 ; CHECK-NEXT:    IR %0 = zext i32 %n to i64
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.scevcheck>:
-; CHECK-NEXT:    IR   %3 = add nsw i64 %0, -1
-; CHECK-NEXT:    IR   %4 = add i32 %n, -1
-; CHECK-NEXT:    IR   %5 = trunc i64 %3 to i32
-; CHECK-NEXT:    IR   %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
-; CHECK-NEXT:    IR   %mul.result = extractvalue { i32, i1 } %mul, 0
-; CHECK-NEXT:    IR   %mul.overflow = extractvalue { i32, i1 } %mul, 1
-; CHECK-NEXT:    IR   %6 = sub i32 %4, %mul.result
-; CHECK-NEXT:    IR   %7 = icmp ugt i32 %6, %4
-; CHECK-NEXT:    IR   %8 = or i1 %7, %mul.overflow
-; CHECK-NEXT:    IR   %9 = icmp ugt i64 %3, 4294967295
-; CHECK-NEXT:    IR   %10 = or i1 %8, %9
+; CHECK-NEXT:    IR %3 = add nsw i64 %0, -1
+; CHECK-NEXT:    IR %4 = add i32 %n, -1
+; CHECK-NEXT:    IR %5 = trunc i64 %3 to i32
+; CHECK-NEXT:    IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+; CHECK-NEXT:    IR %mul.result = extractvalue { i32, i1 } %mul, 0
+; CHECK-NEXT:    IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
+; CHECK-NEXT:    IR %6 = sub i32 %4, %mul.result
+; CHECK-NEXT:    IR %7 = icmp ugt i32 %6, %4
+; CHECK-NEXT:    IR %8 = or i1 %7, %mul.overflow
+; CHECK-NEXT:    IR %9 = icmp ugt i64 %3, 4294967295
+; CHECK-NEXT:    IR %10 = or i1 %8, %9
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.memcheck>:
-; CHECK-NEXT:    IR   %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %12 = mul i64 %11, 4
-; CHECK-NEXT:    IR   %13 = mul i64 %12, 4
-; CHECK-NEXT:    IR   %14 = sub i64 %B1, %A2
-; CHECK-NEXT:    IR   %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT:    IR %11 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %12 = mul i64 %11, 4
+; CHECK-NEXT:    IR %13 = mul i64 %12, 4
+; CHECK-NEXT:    IR %14 = sub i64 %B1, %A2
+; CHECK-NEXT:    IR %diff.check = icmp ult i64 %14, %13
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.ph>:
-; CHECK-NEXT:    IR   %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
-; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
-; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
-; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    IR %15 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %16 = mul i64 %15, 4
+; CHECK-NEXT:    IR %n.mod.vf = urem i64 %0, %16
+; CHECK-NEXT:    IR %n.vec = sub i64 %0, %n.mod.vf
+; CHECK-NEXT:    IR %17 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT:    vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.body:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT:    vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT:    EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT:    vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
+; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
 ; CHECK-NEXT:    CLONE ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT:    WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:    WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
+; CHECK-NEXT:    vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
+; CHECK-NEXT:    WIDEN ir<%19> = load vp<%4>
+; CHECK-NEXT:    WIDEN ir<%add9> = add ir<%19>, ir<1>
 ; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
-; CHECK-NEXT:    EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
-; CHECK-NEXT:    EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
+; CHECK-NEXT:    vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT:    WIDEN store vp<%5>, ir<%add9>
+; CHECK-NEXT:    EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
+; CHECK-NEXT:    EMIT branch-on-count vp<%index.next>, ir<%n.vec>
 ; CHECK-NEXT:  Successor(s): middle.block, vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME_1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME_2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME_1]]> from ir-bb<scalar.ph>)
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME_2]]>.1 from ir-bb<scalar.ph>)
-; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT:    IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>)
+; CHECK-NEXT:    IR %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    IR %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+; CHECK-NEXT:    IR %19 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:    IR %add9 = add i32 %19, 1
+; CHECK-NEXT:    IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT:    IR store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:    IR %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.body.preheader> in BB:for.body.preheader
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.body.preheader: ; preds = %entry
+; CHECK-NEXT:    %0 = zext i32 %n to i64
+; CHECK-NEXT:    %1 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %2 = mul i64 %1, 4
+; CHECK-NEXT:    %min.iters.check = icmp ult i64 %0, %2
+; CHECK-NEXT:    br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.scevcheck> in BB:vector.scevcheck
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.scevcheck: ; preds = %for.body.preheader
+; CHECK-NEXT:    %3 = add nsw i64 %0, -1
+; CHECK-NEXT:    %4 = add i32 %n, -1
+; CHECK-NEXT:    %5 = trunc i64 %3 to i32
+; CHECK-NEXT:    %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+; CHECK-NEXT:    %mul.result = extractvalue { i32, i1 } %mul, 0
+; CHECK-NEXT:    %mul.overflow = extractvalue { i32, i1 } %mul, 1
+; CHECK-NEXT:    %6 = sub i32 %4, %mul.result
+; CHECK-NEXT:    %7 = icmp ugt i32 %6, %4
+; CHECK-NEXT:    %8 = or i1 %7, %mul.overflow
+; CHECK-NEXT:    %9 = icmp ugt i64 %3, 4294967295
+; CHECK-NEXT:    %10 = or i1 %8, %9
+; CHECK-NEXT:    br i1 %10, label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT:  LV: draw edge fromfor.body.preheader
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.memcheck> in BB:vector.memcheck
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.memcheck: ; preds = %vector.scevcheck
+; CHECK-NEXT:    %11 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %12 = mul i64 %11, 4
+; CHECK-NEXT:    %13 = mul i64 %12, 4
+; CHECK-NEXT:    %14 = sub i64 %B1, %A2
+; CHECK-NEXT:    %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT:    br i1 %diff.check, label %scalar.ph, label %vector.ph
+; CHECK-NEXT:  LV: draw edge fromvector.scevcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.ph> in BB:vector.ph
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.ph: ; preds = %vector.memcheck
+; CHECK-NEXT:    %15 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %16 = mul i64 %15, 4
+; CHECK-NEXT:    %n.mod.vf = urem i64 %0, %16
+; CHECK-NEXT:    %n.vec = sub i64 %0, %n.mod.vf
+; CHECK-NEXT:    %17 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %18 = mul i64 %17, 4
+; CHECK-NEXT:    %19 = sub i64 %0, %n.vec
+; CHECK-NEXT:    %.cast = trunc i64 %n.vec to i32
+; CHECK-NEXT:    %20 = sub i32 %n, %.cast
+; CHECK-NEXT:    br
+; CHECK-NEXT:  LV: draw edge fromvector.memcheck
+; CHECK-NEXT:  LV: created vector.body
+; CHECK-NEXT:  LV: draw edge fromvector.ph
+; CHECK-NEXT:  LV: vectorizing VPBB:vector.body in BB:vector.body
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.body: ; preds = %vector.body, %vector.ph
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ]
+; CHECK-NEXT:    %.cast3 = trunc i64 %index to i32
+; CHECK-NEXT:    %offset.idx = sub i32 %n, %.cast3
+; CHECK-NEXT:    %21 = add nsw i32 %offset.idx, -1
+; CHECK-NEXT:    %22 = zext i32 %21 to i64
+; CHECK-NEXT:    %23 = getelementptr inbounds i32, ptr %B, i64 %22
+; CHECK-NEXT:    %24 = mul i64 0, %18
+; CHECK-NEXT:    %25 = sub i64 1, %18
+; CHECK-NEXT:    %26 = getelementptr inbounds i32, ptr %23, i64 %24
+; CHECK-NEXT:    %27 = getelementptr inbounds i32, ptr %26, i64 %25
+; CHECK-NEXT:    %wide.load = load <vscale x 4 x i32>, ptr %27, align 4
+; CHECK-NEXT:    %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)
+; CHECK-NEXT:    %28 = add <vscale x 4 x i32> %reverse, splat (i32 1)
+; CHECK-NEXT:    %29 = getelementptr inbounds i32, ptr %A, i64 %22
+; CHECK-NEXT:    %30 = mul i64 0, %18
+; CHECK-NEXT:    %31 = sub i64 1, %18
+; CHECK-NEXT:    %32 = getelementptr inbounds i32, ptr %29, i64 %30
+; CHECK-NEXT:    %33 = getelementptr inbounds i32, ptr %32, i64 %31
+; CHECK-NEXT:    %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %28)
+; CHECK-NEXT:    store <vscale x 4 x i32> %reverse4, ptr %33, align 4
+; CHECK-NEXT:    %index.next = add nuw i64 %index, %18
+; CHECK-NEXT:    %34 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT:    br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT:  LV: created middle.block
+; CHECK-NEXT:  LV: draw edge fromvector.body
+; CHECK-NEXT:  LV: vectorizing VPBB:middle.block in BB:middle.block
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  middle.block: ; preds = %vector.body
+; CHECK-NEXT:    %cmp.n = icmp eq i64 %0, %n.vec
+; CHECK-NEXT:    br i1 %cmp.n, <null operand!>, <null operand!>
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.cond.cleanup.loopexit> in BB:for.cond.cleanup.loopexit
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.cond.cleanup.loopexit: ; preds = %for.body
+; CHECK-NEXT:    br label %for.cond.cleanup
+; CHECK-NEXT:  LV: draw edge frommiddle.block
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<scalar.ph> in BB:scalar.ph
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
+; CHECK-NEXT:    %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
+; CHECK-NEXT:    %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT:    br label %for.body
+; CHECK-NEXT:  LV: draw edge frommiddle.block
+; CHECK-NEXT:  LV: draw edge fromfor.body.preheader
+; CHECK-NEXT:  LV: draw edge fromvector.scevcheck
+; CHECK-NEXT:  LV: draw edge fromvector.memcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.body> in BB:for.body
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.body: ; preds = %for.body, %scalar.ph
+; CHECK-NEXT:    %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:    %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT:    %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+; CHECK-NEXT:    %35 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:    %add9 = add i32 %35, 1
+; CHECK-NEXT:    %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT:    store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:    %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+; CHECK-NEXT:  LV: draw edge fromscalar.ph
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Vectorizing: innermost loop.
+; CHECK-EMPTY:
 ;
 entry:
   %cmp7 = icmp sgt i32 %n, 0
@@ -294,66 +455,100 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV: Using user VF vscale x 4.
-; CHECK:       LV: Loop does not require scalar epilogue
-; CHECK:       LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  Creating VPBasicBlock for for.body
+; CHECK-NEXT:  VPlan 'Plain CFG
+; CHECK-NEXT:   for UF>=1' {
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    IR %0 = zext i32 %n to i64
+; CHECK-NEXT:  Successor(s): for.body
+; CHECK-EMPTY:
+; CHECK-NEXT:  for.body:
+; CHECK-NEXT:    WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
+; CHECK-NEXT:    EMIT ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:    EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
+; CHECK-NEXT:    EMIT ir<%1> = load ir<%arrayidx>
+; CHECK-NEXT:    EMIT ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
+; CHECK-NEXT:    EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom>
+; CHECK-NEXT:    EMIT store ir<%conv1>, ir<%arrayidx3>
+; CHECK-NEXT:    EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1>
+; CHECK-NEXT:    EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1>
+; CHECK-NEXT:    EMIT branch-on-cond ir<%cmp>
+; CHECK-NEXT:  Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit>
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
 ; CHECK-NEXT:  LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
+; CHECK-NEXT:  Live-in vp<%0> = VF
+; CHECK-NEXT:  Live-in vp<%1> = VF * UF
+; CHECK-NEXT:  Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT:  vp<%3> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
 ; CHECK-NEXT:    IR %0 = zext i32 %n to i64
-; CHECK-NEXT:    EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:    EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1>
+; CHECK-NEXT:    vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:      vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT:      EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
+; CHECK-NEXT:      vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
+; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
 ; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<%1> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
+; CHECK-NEXT:      WIDEN ir<%1> = load vp<%9>
 ; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
 ; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
-; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:      vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
+; CHECK-NEXT:      WIDEN store vp<%10>, ir<%conv1>
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<%2>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
-; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT:    IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph)
+; CHECK-NEXT:    IR %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    IR %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+; CHECK-NEXT:    IR %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:    IR %conv1 = fadd float %1, 1.000000e+00
+; CHECK-NEXT:    IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT:    IR store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:    IR %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
@@ -399,85 +594,211 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
-; CHECK:       Executing best plan with VF=vscale x 4, UF=1
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT:  Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
-; CHECK-NEXT:  Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  ir<%0> = original trip-count
+; CHECK-NEXT:  Live-in ir<%18> = VF
+; CHECK-NEXT:  Live-in ir<%18>.1 = VF * UF
+; CHECK-NEXT:  Live-in ir<%n.vec> = vector-trip-count
+; CHECK-NEXT:  Live-in ir<%0> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
 ; CHECK-NEXT:    IR %0 = zext i32 %n to i64
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.scevcheck>:
-; CHECK-NEXT:    IR   %3 = add nsw i64 %0, -1
-; CHECK-NEXT:    IR   %4 = add i32 %n, -1
-; CHECK-NEXT:    IR   %5 = trunc i64 %3 to i32
-; CHECK-NEXT:    IR   %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
-; CHECK-NEXT:    IR   %mul.result = extractvalue { i32, i1 } %mul, 0
-; CHECK-NEXT:    IR   %mul.overflow = extractvalue { i32, i1 } %mul, 1
-; CHECK-NEXT:    IR   %6 = sub i32 %4, %mul.result
-; CHECK-NEXT:    IR   %7 = icmp ugt i32 %6, %4
-; CHECK-NEXT:    IR   %8 = or i1 %7, %mul.overflow
-; CHECK-NEXT:    IR   %9 = icmp ugt i64 %3, 4294967295
-; CHECK-NEXT:    IR   %10 = or i1 %8, %9
+; CHECK-NEXT:    IR %3 = add nsw i64 %0, -1
+; CHECK-NEXT:    IR %4 = add i32 %n, -1
+; CHECK-NEXT:    IR %5 = trunc i64 %3 to i32
+; CHECK-NEXT:    IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+; CHECK-NEXT:    IR %mul.result = extractvalue { i32, i1 } %mul, 0
+; CHECK-NEXT:    IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
+; CHECK-NEXT:    IR %6 = sub i32 %4, %mul.result
+; CHECK-NEXT:    IR %7 = icmp ugt i32 %6, %4
+; CHECK-NEXT:    IR %8 = or i1 %7, %mul.overflow
+; CHECK-NEXT:    IR %9 = icmp ugt i64 %3, 4294967295
+; CHECK-NEXT:    IR %10 = or i1 %8, %9
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.memcheck>:
-; CHECK-NEXT:    IR   %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %12 = mul i64 %11, 4
-; CHECK-NEXT:    IR   %13 = mul i64 %12, 4
-; CHECK-NEXT:    IR   %14 = sub i64 %B1, %A2
-; CHECK-NEXT:    IR   %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT:    IR %11 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %12 = mul i64 %11, 4
+; CHECK-NEXT:    IR %13 = mul i64 %12, 4
+; CHECK-NEXT:    IR %14 = sub i64 %B1, %A2
+; CHECK-NEXT:    IR %diff.check = icmp ult i64 %14, %13
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.ph>:
-; CHECK-NEXT:    IR   %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
-; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
-; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
-; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    IR %15 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %16 = mul i64 %15, 4
+; CHECK-NEXT:    IR %n.mod.vf = urem i64 %0, %16
+; CHECK-NEXT:    IR %n.vec = sub i64 %0, %n.mod.vf
+; CHECK-NEXT:    IR %17 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT:    vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.body:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT:    vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT:    EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT:    vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
+; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
 ; CHECK-NEXT:    CLONE ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT:    WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:    WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
+; CHECK-NEXT:    vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
+; CHECK-NEXT:    WIDEN ir<%19> = load vp<%4>
+; CHECK-NEXT:    WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
 ; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
-; CHECK-NEXT:    EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
-; CHECK-NEXT:    EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
+; CHECK-NEXT:    vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT:    WIDEN store vp<%5>, ir<%conv1>
+; CHECK-NEXT:    EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
+; CHECK-NEXT:    EMIT branch-on-count vp<%index.next>, ir<%n.vec>
 ; CHECK-NEXT:  Successor(s): middle.block, vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from ir-bb<scalar.ph>)
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from ir-bb<scalar.ph>)
-; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT:    IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>)
+; CHECK-NEXT:    IR %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    IR %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+; CHECK-NEXT:    IR %19 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:    IR %conv1 = fadd float %19, 1.000000e+00
+; CHECK-NEXT:    IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT:    IR store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:    IR %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.body.preheader> in BB:for.body.preheader
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.body.preheader: ; preds = %entry
+; CHECK-NEXT:    %0 = zext i32 %n to i64
+; CHECK-NEXT:    %1 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %2 = mul i64 %1, 4
+; CHECK-NEXT:    %min.iters.check = icmp ult i64 %0, %2
+; CHECK-NEXT:    br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.scevcheck> in BB:vector.scevcheck
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.scevcheck: ; preds = %for.body.preheader
+; CHECK-NEXT:    %3 = add nsw i64 %0, -1
+; CHECK-NEXT:    %4 = add i32 %n, -1
+; CHECK-NEXT:    %5 = trunc i64 %3 to i32
+; CHECK-NEXT:    %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+; CHECK-NEXT:    %mul.result = extractvalue { i32, i1 } %mul, 0
+; CHECK-NEXT:    %mul.overflow = extractvalue { i32, i1 } %mul, 1
+; CHECK-NEXT:    %6 = sub i32 %4, %mul.result
+; CHECK-NEXT:    %7 = icmp ugt i32 %6, %4
+; CHECK-NEXT:    %8 = or i1 %7, %mul.overflow
+; CHECK-NEXT:    %9 = icmp ugt i64 %3, 4294967295
+; CHECK-NEXT:    %10 = or i1 %8, %9
+; CHECK-NEXT:    br i1 %10, label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT:  LV: draw edge fromfor.body.preheader
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.memcheck> in BB:vector.memcheck
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.memcheck: ; preds = %vector.scevcheck
+; CHECK-NEXT:    %11 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %12 = mul i64 %11, 4
+; CHECK-NEXT:    %13 = mul i64 %12, 4
+; CHECK-NEXT:    %14 = sub i64 %B1, %A2
+; CHECK-NEXT:    %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT:    br i1 %diff.check, label %scalar.ph, label %vector.ph
+; CHECK-NEXT:  LV: draw edge fromvector.scevcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.ph> in BB:vector.ph
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.ph: ; preds = %vector.memcheck
+; CHECK-NEXT:    %15 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %16 = mul i64 %15, 4
+; CHECK-NEXT:    %n.mod.vf = urem i64 %0, %16
+; CHECK-NEXT:    %n.vec = sub i64 %0, %n.mod.vf
+; CHECK-NEXT:    %17 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %18 = mul i64 %17, 4
+; CHECK-NEXT:    %19 = sub i64 %0, %n.vec
+; CHECK-NEXT:    %.cast = trunc i64 %n.vec to i32
+; CHECK-NEXT:    %20 = sub i32 %n, %.cast
+; CHECK-NEXT:    br
+; CHECK-NEXT:  LV: draw edge fromvector.memcheck
+; CHECK-NEXT:  LV: created vector.body
+; CHECK-NEXT:  LV: draw edge fromvector.ph
+; CHECK-NEXT:  LV: vectorizing VPBB:vector.body in BB:vector.body
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.body: ; preds = %vector.body, %vector.ph
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ]
+; CHECK-NEXT:    %.cast3 = trunc i64 %index to i32
+; CHECK-NEXT:    %offset.idx = sub i32 %n, %.cast3
+; CHECK-NEXT:    %21 = add nsw i32 %offset.idx, -1
+; CHECK-NEXT:    %22 = zext i32 %21 to i64
+; CHECK-NEXT:    %23 = getelementptr inbounds float, ptr %B, i64 %22
+; CHECK-NEXT:    %24 = mul i64 0, %18
+; CHECK-NEXT:    %25 = sub i64 1, %18
+; CHECK-NEXT:    %26 = getelementptr inbounds float, ptr %23, i64 %24
+; CHECK-NEXT:    %27 = getelementptr inbounds float, ptr %26, i64 %25
+; CHECK-NEXT:    %wide.load = load <vscale x 4 x float>, ptr %27, align 4
+; CHECK-NEXT:    %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load)
+; CHECK-NEXT:    %28 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
+; CHECK-NEXT:    %29 = getelementptr inbounds float, ptr %A, i64 %22
+; CHECK-NEXT:    %30 = mul i64 0, %18
+; CHECK-NEXT:    %31 = sub i64 1, %18
+; CHECK-NEXT:    %32 = getelementptr inbounds float, ptr %29, i64 %30
+; CHECK-NEXT:    %33 = getelementptr inbounds float, ptr %32, i64 %31
+; CHECK-NEXT:    %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %28)
+; CHECK-NEXT:    store <vscale x 4 x float> %reverse4, ptr %33, align 4
+; CHECK-NEXT:    %index.next = add nuw i64 %index, %18
+; CHECK-NEXT:    %34 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT:    br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT:  LV: created middle.block
+; CHECK-NEXT:  LV: draw edge fromvector.body
+; CHECK-NEXT:  LV: vectorizing VPBB:middle.block in BB:middle.block
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  middle.block: ; preds = %vector.body
+; CHECK-NEXT:    %cmp.n = icmp eq i64 %0, %n.vec
+; CHECK-NEXT:    br i1 %cmp.n, <null operand!>, <null operand!>
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.cond.cleanup.loopexit> in BB:for.cond.cleanup.loopexit
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.cond.cleanup.loopexit: ; preds = %for.body
+; CHECK-NEXT:    br label %for.cond.cleanup
+; CHECK-NEXT:  LV: draw edge frommiddle.block
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<scalar.ph> in BB:scalar.ph
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
+; CHECK-NEXT:    %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
+; CHECK-NEXT:    %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT:    br label %for.body
+; CHECK-NEXT:  LV: draw edge frommiddle.block
+; CHECK-NEXT:  LV: draw edge fromfor.body.preheader
+; CHECK-NEXT:  LV: draw edge fromvector.scevcheck
+; CHECK-NEXT:  LV: draw edge fromvector.memcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.body> in BB:for.body
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.body: ; preds = %for.body, %scalar.ph
+; CHECK-NEXT:    %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:    %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT:    %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+; CHECK-NEXT:    %35 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:    %conv1 = fadd float %35, 1.000000e+00
+; CHECK-NEXT:    %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT:    store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:    %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+; CHECK-NEXT:  LV: draw edge fromscalar.ph
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ;
 entry:
   %cmp7 = icmp sgt i32 %n, 0

From 1ded2c599fd230b2d355386c019a3054f5745d55 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 13 Jun 2025 21:01:11 +0100
Subject: [PATCH 0342/1322] [LV] Use createIterationCountCheck during epilogue
 skeleton creation.

Use helper already used for minimum trip count checks for the regular
ILV skeleton creation also for epilogue skeleton creation.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 21 ++++---------------
 .../partial-reduce-dot-product-epilogue.ll    |  3 +--
 ...ctor-loop-backedge-elimination-epilogue.ll |  3 +--
 3 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 69b60c7b9320..93f53996425d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7535,25 +7535,12 @@ BasicBlock *
 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                                     bool ForEpilogue) {
   assert(Bypass && "Expected valid bypass basic block.");
-  ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
-  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
   Value *Count = getTripCount();
-  // Reuse existing vector loop preheader for TC checks.
-  // Note that new preheader block is generated for vector loop.
+  MinProfitableTripCount = ElementCount::getFixed(0);
+  Value *CheckMinIters = createIterationCountCheck(
+      ForEpilogue ? EPI.EpilogueVF : VF, ForEpilogue ? EPI.EpilogueUF : UF);
+
   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
-  IRBuilder<> Builder(TCCheckBlock->getTerminator());
-
-  // Generate code to check if the loop's trip count is less than VF * UF of the
-  // main vector loop.
-  auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
-                                                    : VF.isVector())
-               ? ICmpInst::ICMP_ULE
-               : ICmpInst::ICMP_ULT;
-
-  Value *CheckMinIters = Builder.CreateICmp(
-      P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
-      "min.iters.check");
-
   if (!ForEpilogue)
     TCCheckBlock->setName("vector.main.loop.iter.check");
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index c3fc91c4574f..66dbcff2c123 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -10,8 +10,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
index cb966e4088db..2705d6910bb2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
@@ -11,8 +11,7 @@ define void @test_remove_vector_loop_region_epilogue(ptr %dst, i1 %c)  {
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
 ; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TC]], 64
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TC]], 64
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]

From c42912b8c96ff1130437e47c163aeb5c1191fe5d Mon Sep 17 00:00:00 2001
From: Amy Huang <akhuang@google.com>
Date: Fri, 13 Jun 2025 13:07:39 -0700
Subject: [PATCH 0343/1322] Fix string_length function so that it always
 returns. (#144148)

Previously setting LIBC_COPT_STRING_UNSAFE_WIDE_READ would cause a build
error because there is a path in the ifdef that doesn't return anything.
---
 libc/src/string/string_utils.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index dcbfc7584a30..4f56263fce8e 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -90,12 +90,11 @@ template <typename T> LIBC_INLINE size_t string_length(const T *src) {
   // string a block at a time.
   if constexpr (cpp::is_same_v<T, char>)
     return string_length_wide_read<unsigned int>(src);
-#else
+#endif
   size_t length;
   for (length = 0; *src; ++src, ++length)
     ;
   return length;
-#endif
 }
 
 template <typename Word>

From 938e91e4fe10a9ff810b41ee74f5c0af8d3ac490 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 13:16:08 -0700
Subject: [PATCH 0344/1322] [memprof] Use testing::IsEmpty (NFC) (#144096)

This patch replaces testing::IsEmpty with IsEmpty because we already
have:

  using ::testing::IsEmpty;

near the beginning of the file.
---
 .../ProfileData/DataAccessProfTest.cpp        | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/llvm/unittests/ProfileData/DataAccessProfTest.cpp b/llvm/unittests/ProfileData/DataAccessProfTest.cpp
index 13af3390557d..2f93e16f28c1 100644
--- a/llvm/unittests/ProfileData/DataAccessProfTest.cpp
+++ b/llvm/unittests/ProfileData/DataAccessProfTest.cpp
@@ -92,10 +92,10 @@ TEST(MemProf, DataAccessProfile) {
 
     EXPECT_THAT(
         Data.getProfileRecord("foo.llvm.123"),
-        ValueIs(AllOf(
-            Field(&DataAccessProfRecord::SymHandle,
-                  testing::VariantWith<std::string>(testing::Eq("foo"))),
-            Field(&DataAccessProfRecord::Locations, testing::IsEmpty()))));
+        ValueIs(
+            AllOf(Field(&DataAccessProfRecord::SymHandle,
+                        testing::VariantWith<std::string>(testing::Eq("foo"))),
+                  Field(&DataAccessProfRecord::Locations, IsEmpty()))));
     EXPECT_THAT(
         Data.getProfileRecord("bar.__uniq.321"),
         ValueIs(AllOf(
@@ -130,7 +130,7 @@ TEST(MemProf, DataAccessProfile) {
         reinterpret_cast<const unsigned char *>(serializedData.data());
     ASSERT_THAT(llvm::to_vector(llvm::make_first_range(
                     deserializedData.getStrToIndexMapRef())),
-                testing::IsEmpty());
+                IsEmpty());
     EXPECT_FALSE(deserializedData.deserialize(p));
 
     EXPECT_THAT(
@@ -153,11 +153,10 @@ TEST(MemProf, DataAccessProfile) {
     EXPECT_THAT(
         Records,
         ElementsAre(
-            AllOf(
-                Field(&DataAccessProfRecordRef::SymbolID, 0),
-                Field(&DataAccessProfRecordRef::AccessCount, 100),
-                Field(&DataAccessProfRecordRef::IsStringLiteral, false),
-                Field(&DataAccessProfRecordRef::Locations, testing::IsEmpty())),
+            AllOf(Field(&DataAccessProfRecordRef::SymbolID, 0),
+                  Field(&DataAccessProfRecordRef::AccessCount, 100),
+                  Field(&DataAccessProfRecordRef::IsStringLiteral, false),
+                  Field(&DataAccessProfRecordRef::Locations, IsEmpty())),
             AllOf(Field(&DataAccessProfRecordRef::SymbolID, 2),
                   Field(&DataAccessProfRecordRef::AccessCount, 123),
                   Field(&DataAccessProfRecordRef::IsStringLiteral, false),

From 6d0cfc2ca51e9365f1c6f216df30a612958aca70 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 13:16:15 -0700
Subject: [PATCH 0345/1322] [Vectorize] Use llvm::drop_begin (NFC) (#144098)

We can pass a range to llvm::drop_begin.
---
 llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index acc861b99197..53619b39219e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2114,7 +2114,7 @@ public:
   VPWidenPHIRecipe *clone() override {
     auto *C = new VPWidenPHIRecipe(cast<PHINode>(getUnderlyingValue()),
                                    getOperand(0), getDebugLoc(), Name);
-    for (VPValue *Op : make_range(std::next(op_begin()), op_end()))
+    for (VPValue *Op : llvm::drop_begin(operands()))
       C->addOperand(Op);
     return C;
   }

From 2a805589f56b30b27057c7549dd0ad2963ae16b1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 13:16:22 -0700
Subject: [PATCH 0346/1322] [SPIRV] Use llvm::all_of (NFC) (#144099)

We can pass a range to llvm::all_of.
---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp   | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 8edd0b533b9f..911a6966aaef 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -1564,15 +1564,14 @@ static bool isUSMStorageClass(SPIRV::StorageClass::StorageClass SC) {
 static bool isASCastInGVar(MachineRegisterInfo *MRI, Register ResVReg) {
   bool IsGRef = false;
   bool IsAllowedRefs =
-      std::all_of(MRI->use_instr_begin(ResVReg), MRI->use_instr_end(),
-                  [&IsGRef](auto const &It) {
-                    unsigned Opcode = It.getOpcode();
-                    if (Opcode == SPIRV::OpConstantComposite ||
-                        Opcode == SPIRV::OpVariable ||
-                        isSpvIntrinsic(It, Intrinsic::spv_init_global))
-                      return IsGRef = true;
-                    return Opcode == SPIRV::OpName;
-                  });
+      llvm::all_of(MRI->use_instructions(ResVReg), [&IsGRef](auto const &It) {
+        unsigned Opcode = It.getOpcode();
+        if (Opcode == SPIRV::OpConstantComposite ||
+            Opcode == SPIRV::OpVariable ||
+            isSpvIntrinsic(It, Intrinsic::spv_init_global))
+          return IsGRef = true;
+        return Opcode == SPIRV::OpName;
+      });
   return IsAllowedRefs && IsGRef;
 }
 

From 5064a5bc3e958aeb18bf3f8c7144c99cc3103a91 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 13:16:31 -0700
Subject: [PATCH 0347/1322] [IR] Remove a redundant control flow statement
 (NFC) (#144100)

---
 llvm/lib/IR/DebugInfo.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 9527c3e0b5d6..e6b1f76dfacf 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -2127,7 +2127,6 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
       &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
   (void)Assign;
   LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
-  return;
 }
 
 #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h).

From a89df72ec0864301f102296dcf7b3bd22844adf5 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Fri, 13 Jun 2025 13:30:18 -0700
Subject: [PATCH 0348/1322] WholeProgramDevirt: Fix importing in
 llvm.type.checked.load case.

We were clearing SummaryTypeCheckedLoadUsers to prevent devirtualized
llvm.type.checked.load calls from being converted to llvm.type.test,
which meant that AddCalls would not see them in the list of
callsites and they would not get imported. Fix that by not clearing
SummaryTypeCheckedLoadUsers so that the list survives to AddCalls and
using AllCallSitesDevirted to control whether to convert them instead.

Reviewers: teresajohnson

Reviewed By: teresajohnson

Pull Request: https://github.com/llvm/llvm-project/pull/144019
---
 .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 39 +++++++++----------
 .../Inputs/devirt_single_hybrid_foo_tcl.ll    | 31 +++++++++++++++
 llvm/test/ThinLTO/X86/devirt_single_hybrid.ll | 26 ++++++++++++-
 3 files changed, 74 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_foo_tcl.ll

diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index a7d9f3ba24b2..30e1dc7167a3 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -494,28 +494,28 @@ struct CallSiteInfo {
   /// Whether all call sites represented by this CallSiteInfo, including those
   /// in summaries, have been devirtualized. This starts off as true because a
   /// default constructed CallSiteInfo represents no call sites.
+  ///
+  /// If at the end of the pass there are still undevirtualized calls, we will
+  /// need to add a use of llvm.type.test to each of the function summaries in
+  /// the vector.
   bool AllCallSitesDevirted = true;
 
   // These fields are used during the export phase of ThinLTO and reflect
   // information collected from function summaries.
 
-  /// Whether any function summary contains an llvm.assume(llvm.type.test) for
-  /// this slot.
-  bool SummaryHasTypeTestAssumeUsers = false;
-
   /// CFI-specific: a vector containing the list of function summaries that use
   /// the llvm.type.checked.load intrinsic and therefore will require
   /// resolutions for llvm.type.test in order to implement CFI checks if
-  /// devirtualization was unsuccessful. If devirtualization was successful, the
-  /// pass will clear this vector by calling markDevirt(). If at the end of the
-  /// pass the vector is non-empty, we will need to add a use of llvm.type.test
-  /// to each of the function summaries in the vector.
+  /// devirtualization was unsuccessful.
   std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
+
+  /// A vector containing the list of function summaries that use
+  /// assume(llvm.type.test).
   std::vector<FunctionSummary *> SummaryTypeTestAssumeUsers;
 
   bool isExported() const {
-    return SummaryHasTypeTestAssumeUsers ||
-           !SummaryTypeCheckedLoadUsers.empty();
+    return !SummaryTypeCheckedLoadUsers.empty() ||
+           !SummaryTypeTestAssumeUsers.empty();
   }
 
   void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) {
@@ -525,16 +525,10 @@ struct CallSiteInfo {
 
   void addSummaryTypeTestAssumeUser(FunctionSummary *FS) {
     SummaryTypeTestAssumeUsers.push_back(FS);
-    SummaryHasTypeTestAssumeUsers = true;
     AllCallSitesDevirted = false;
   }
 
-  void markDevirt() {
-    AllCallSitesDevirted = true;
-
-    // As explained in the comment for SummaryTypeCheckedLoadUsers.
-    SummaryTypeCheckedLoadUsers.clear();
-  }
+  void markDevirt() { AllCallSitesDevirted = true; }
 };
 
 // Call site information collected for a specific VTableSlot.
@@ -2465,11 +2459,14 @@ bool DevirtModule::run() {
     if (ExportSummary && isa<MDString>(S.first.TypeID)) {
       auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
           cast<MDString>(S.first.TypeID)->getString());
-      for (auto *FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
-        FS->addTypeTest(GUID);
+      auto AddTypeTestsForTypeCheckedLoads = [&](CallSiteInfo &CSI) {
+        if (!CSI.AllCallSitesDevirted)
+          for (auto *FS : CSI.SummaryTypeCheckedLoadUsers)
+            FS->addTypeTest(GUID);
+      };
+      AddTypeTestsForTypeCheckedLoads(S.second.CSInfo);
       for (auto &CCS : S.second.ConstCSInfo)
-        for (auto *FS : CCS.second.SummaryTypeCheckedLoadUsers)
-          FS->addTypeTest(GUID);
+        AddTypeTestsForTypeCheckedLoads(CCS.second);
     }
   }
 
diff --git a/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_foo_tcl.ll b/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_foo_tcl.ll
new file mode 100644
index 000000000000..4a696837bc8e
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_foo_tcl.ll
@@ -0,0 +1,31 @@
+; ModuleID = 'foo.cpp'
+source_filename = "foo.cpp"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+
+; Function Attrs: uwtable
+define hidden i32 @_Z3fooP1A(ptr %pA) local_unnamed_addr {
+entry:
+  %vtable = load ptr, ptr %pA, align 8, !tbaa !2
+  %0 = call { ptr, i1 } @llvm.type.checked.load(ptr %vtable, i32 0, metadata !"_ZTS1A")
+  %1 = extractvalue { ptr, i1 } %0, 0
+  %call = tail call i32 %1(ptr %pA)
+  %add = add nsw i32 %call, 10
+  ret i32 %add
+}
+
+declare { ptr, i1 } @llvm.type.checked.load(ptr, i32, metadata)
+
+; Function Attrs: nounwind willreturn
+declare void @llvm.assume(i1)
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 10.0.0 (trunk 373596)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"vtable pointer", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}
diff --git a/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll b/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll
index 90fdf0d7dfa0..53c001efc132 100644
--- a/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll
@@ -2,8 +2,32 @@
 ; when we're running hybrid LTO.
 ;
 ; RUN: opt -thinlto-bc -thinlto-split-lto-unit %s -o %t-main.bc
-; RUN: opt -thinlto-bc -thinlto-split-lto-unit %p/Inputs/devirt_single_hybrid_foo.ll -o %t-foo.bc
 ; RUN: opt -thinlto-bc -thinlto-split-lto-unit %p/Inputs/devirt_single_hybrid_bar.ll -o %t-bar.bc
+
+; Test the assume(type.test) case.
+ 
+; RUN: opt -thinlto-bc -thinlto-split-lto-unit %p/Inputs/devirt_single_hybrid_foo.ll -o %t-foo.bc
+; RUN: llvm-lto2 run -save-temps %t-main.bc %t-foo.bc %t-bar.bc -pass-remarks=. -o %t \
+; RUN:   -whole-program-visibility \
+; RUN:    -r=%t-foo.bc,_Z3fooP1A,pl \
+; RUN:    -r=%t-main.bc,main,plx \
+; RUN:    -r=%t-main.bc,_Z3barv,l \
+; RUN:    -r=%t-bar.bc,_Z3barv,pl \
+; RUN:    -r=%t-bar.bc,_Z3fooP1A, \
+; RUN:    -r=%t-bar.bc,_ZNK1A1fEv,pl \
+; RUN:    -r=%t-bar.bc,_ZTV1A,l \
+; RUN:    -r=%t-bar.bc,_ZTVN10__cxxabiv117__class_type_infoE, \
+; RUN:    -r=%t-bar.bc,_ZTS1A,pl \
+; RUN:    -r=%t-bar.bc,_ZTI1A,pl \
+; RUN:    -r=%t-bar.bc,_ZNK1A1fEv, \
+; RUN:    -r=%t-bar.bc,_ZTV1A,pl \
+; RUN:    -r=%t-bar.bc,_ZTI1A, 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
+; RUN: llvm-dis %t.1.5.precodegen.bc -o - | FileCheck %s --check-prefix=CODEGEN
+
+; Test the type.checked.load case.
+ 
+; RUN: opt -thinlto-bc -thinlto-split-lto-unit %p/Inputs/devirt_single_hybrid_foo_tcl.ll -o %t-foo.bc
 ; RUN: llvm-lto2 run -save-temps %t-main.bc %t-foo.bc %t-bar.bc -pass-remarks=. -o %t \
 ; RUN:   -whole-program-visibility \
 ; RUN:    -r=%t-foo.bc,_Z3fooP1A,pl \

From 52a6492136ef43462c68efa88a0276bb66ee8c52 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 20:30:12 +0000
Subject: [PATCH 0349/1322] [bazel] Add missing errno deps one more time

---
 utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel  | 1 +
 .../bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
index 2354337da2dc..2c5ad7d27ce8 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
@@ -81,6 +81,7 @@ libc_test_library(
     ],
     deps = [
         ":LibcUnitTest",
+        "//libc:__support_libc_errno",
         "//libc:__support_macros_config",
         "//libc:errno",
     ],
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index 610978059d7e..4f66793d44df 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -204,6 +204,7 @@ libc_test_library(
         "//libc:__support_cpp_limits",
         "//libc:__support_cpp_type_traits",
         "//libc:__support_ctype_utils",
+        "//libc:__support_libc_errno",
         "//libc:__support_macros_properties_architectures",
         "//libc:errno",
         "//libc/test/UnitTest:LibcUnitTest",

From 60d000496b5485c89c51e64b2b339210d48263be Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Sat, 14 Jun 2025 05:44:08 +0900
Subject: [PATCH 0350/1322] [Cygwin] Define LLVM_ABI for Cygwin (#143222)

592243c1cb3ea53b34033132a87b0d14af9d1079 should be also applied to
LLVM_ABI.
---
 llvm/include/llvm/Support/Compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h
index f6bc05011e3c..0de789ec68c4 100644
--- a/llvm/include/llvm/Support/Compiler.h
+++ b/llvm/include/llvm/Support/Compiler.h
@@ -209,7 +209,7 @@
 #define LLVM_ABI_FRIEND LLVM_ABI
 #define LLVM_ABI_EXPORT __declspec(dllexport)
 #elif defined(__ELF__) || defined(__MINGW32__) || defined(_AIX) ||             \
-    defined(__MVS__)
+    defined(__MVS__) || defined(__CYGWIN__)
 #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #define LLVM_ABI_FRIEND
 #define LLVM_TEMPLATE_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT

From be5c96bfac328fed548c532bbe1710fe23460a85 Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek@codeweavers.com>
Date: Fri, 13 Jun 2025 13:48:29 -0700
Subject: [PATCH 0351/1322] [CodeGen][COFF] Always emit CodeView compiler info
 on Windows targets (#142970)

MSVC always emits minimal CodeView metadata with compiler information,
even when debug info is otherwise disabled. Other tools may rely on this
metadata being present. For example, linkers use it to determine whether
hotpatching is enabled for the object file.
---
 clang/lib/CodeGen/CodeGenModule.cpp           |  7 +++-
 clang/test/CodeGen/debug-info-version-coff.c  |  8 ++++
 clang/test/CodeGen/debug-info-version.c       |  1 +
 clang/test/CodeGenCXX/debug-info-coff.cpp     | 37 ++++++++++++++++++
 .../debug-info-hotpatch-aarch64.cpp           |  7 +---
 .../CodeGenCXX/debug-info-hotpatch-arm.cpp    |  7 +---
 clang/test/Frontend/ast-main.c                |  4 +-
 clang/test/Frontend/ast-main.cpp              |  4 +-
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |  7 +++-
 llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 34 ++++++++++++-----
 llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h   |  4 ++
 .../Generic/selection-dag-determinism.ll      | 10 ++---
 llvm/test/DebugInfo/COFF/dwarf-headers.ll     | 27 +++++++++++++
 .../COFF/emission-kind-no-codeview.ll         | 38 +++++++++++++++++++
 .../DebugInfo/COFF/emission-kind-no-debug.ll  | 28 ++++++++++++--
 llvm/test/DebugInfo/COFF/fission-cu.ll        | 10 ++---
 llvm/test/DebugInfo/COFF/fission-sections.ll  | 15 ++++----
 llvm/test/DebugInfo/COFF/uefi-nodebug.ll      | 16 ++++++++
 .../test/DebugInfo/Generic/directives-only.ll |  2 +-
 19 files changed, 219 insertions(+), 47 deletions(-)
 create mode 100644 clang/test/CodeGen/debug-info-version-coff.c
 create mode 100644 clang/test/CodeGenCXX/debug-info-coff.cpp
 create mode 100644 llvm/test/DebugInfo/COFF/emission-kind-no-codeview.ll
 create mode 100644 llvm/test/DebugInfo/COFF/uefi-nodebug.ll

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 451792dca40c..c036902b0b13 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -414,6 +414,11 @@ CodeGenModule::CodeGenModule(ASTContext &C,
       CodeGenOpts.CoverageNotesFile.size() ||
       CodeGenOpts.CoverageDataFile.size())
     DebugInfo.reset(new CGDebugInfo(*this));
+  else if (getTriple().isOSWindows())
+    // On Windows targets, we want to emit compiler info even if debug info is
+    // otherwise disabled. Use a temporary CGDebugInfo instance to emit only
+    // basic compiler metadata.
+    CGDebugInfo(*this);
 
   Block.GlobalUniqueCount = 0;
 
@@ -1051,7 +1056,7 @@ void CodeGenModule::Release() {
                               "StrictVTablePointersRequirement",
                               llvm::MDNode::get(VMContext, Ops));
   }
-  if (getModuleDebugInfo())
+  if (getModuleDebugInfo() || getTriple().isOSWindows())
     // We support a single version in the linked module. The LLVM
     // parser will drop debug info with a different version number
     // (and warn about it, too).
diff --git a/clang/test/CodeGen/debug-info-version-coff.c b/clang/test/CodeGen/debug-info-version-coff.c
new file mode 100644
index 000000000000..6497a5829236
--- /dev/null
+++ b/clang/test/CodeGen/debug-info-version-coff.c
@@ -0,0 +1,8 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang --target=x86_64-windows -g -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang --target=x86_64-windows -S -emit-llvm -o - %s | FileCheck %s
+int main (void) {
+  return 0;
+}
+
+// CHECK:  i32 2, !"Debug Info Version", i32 3}
diff --git a/clang/test/CodeGen/debug-info-version.c b/clang/test/CodeGen/debug-info-version.c
index fa7e20e7f527..c7c2bb95017a 100644
--- a/clang/test/CodeGen/debug-info-version.c
+++ b/clang/test/CodeGen/debug-info-version.c
@@ -1,3 +1,4 @@
+// REQUIRES: !system-windows
 // RUN: %clang -g -S -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang -S -emit-llvm -o - %s | FileCheck %s --check-prefix=NO_DEBUG
 int main (void) {
diff --git a/clang/test/CodeGenCXX/debug-info-coff.cpp b/clang/test/CodeGenCXX/debug-info-coff.cpp
new file mode 100644
index 000000000000..4507f5f40d41
--- /dev/null
+++ b/clang/test/CodeGenCXX/debug-info-coff.cpp
@@ -0,0 +1,37 @@
+// REQUIRES: x86-registered-target
+
+// Check that CodeView compiler version is emitted even when debug info is otherwise disabled.
+
+// RUN: %clang --target=i686-pc-windows-msvc -S -emit-llvm %s -o - | FileCheck --check-prefix=IR %s
+// IR: !llvm.dbg.cu = !{!0}
+// IR: !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, splitDebugInlining: false, nameTableKind: None)
+
+// RUN: %clang --target=i686-pc-windows-msvc -c %s -o %t.o
+// RUN: llvm-readobj --codeview %t.o | FileCheck %s
+// CHECK:      CodeViewDebugInfo [
+// CHECK-NEXT:   Section: .debug$S (4)
+// CHECK-NEXT:   Magic: 0x4
+// CHECK-NEXT:   Subsection [
+// CHECK-NEXT:     SubSectionType: Symbols (0xF1)
+// CHECK-NEXT:     SubSectionSize:
+// CHECK-NEXT:     ObjNameSym {
+// CHECK-NEXT:       Kind: S_OBJNAME (0x1101)
+// CHECK-NEXT:       Signature: 0x0
+// CHECK-NEXT:       ObjectName:
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Compile3Sym {
+// CHECK-NEXT:       Kind: S_COMPILE3 (0x113C)
+// CHECK-NEXT:       Language: Cpp (0x1)
+// CHECK-NEXT:       Flags [ (0x0)
+// CHECK-NEXT:       ]
+// CHECK-NEXT:       Machine: Pentium3 (0x7)
+// CHECK-NEXT:       FrontendVersion:
+// CHECK-NEXT:       BackendVersion:
+// CHECK-NEXT:       VersionName: clang version
+// CHECK-NEXT:     }
+// CHECK-NEXT:   ]
+// CHECK-NEXT: ]
+
+int main() {
+  return 0;
+}
diff --git a/clang/test/CodeGenCXX/debug-info-hotpatch-aarch64.cpp b/clang/test/CodeGenCXX/debug-info-hotpatch-aarch64.cpp
index 10fb1750f2c5..ff2dfc19961c 100644
--- a/clang/test/CodeGenCXX/debug-info-hotpatch-aarch64.cpp
+++ b/clang/test/CodeGenCXX/debug-info-hotpatch-aarch64.cpp
@@ -11,12 +11,9 @@
 // RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=HOTPATCH
 // HOTPATCH: S_COMPILE3 [size = [[#]]]
 // HOTPATCH: flags = hot patchable
-///
-/// Unfortunately we need /Z7, Clang does not systematically generate S_COMPILE3.
-///
+//
 // RUN: %clang_cl --target=aarch64-pc-windows-msvc /c -o %t.obj -- %s
-// RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=NO-HOTPATCH
-// NO-HOTPATCH-NOT: flags = hot patchable
+// RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=HOTPATCH
 
 int main() {
   return 0;
diff --git a/clang/test/CodeGenCXX/debug-info-hotpatch-arm.cpp b/clang/test/CodeGenCXX/debug-info-hotpatch-arm.cpp
index 48a61f7fb197..e31c762b0887 100644
--- a/clang/test/CodeGenCXX/debug-info-hotpatch-arm.cpp
+++ b/clang/test/CodeGenCXX/debug-info-hotpatch-arm.cpp
@@ -11,12 +11,9 @@
 // RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=HOTPATCH
 // HOTPATCH: S_COMPILE3 [size = [[#]]]
 // HOTPATCH: flags = hot patchable
-///
-/// Unfortunately we need /Z7, Clang does not systematically generate S_COMPILE3.
-///
+//
 // RUN: %clang_cl --target=arm-pc-windows-msvc /c -o %t.obj -- %s
-// RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=NO-HOTPATCH
-// NO-HOTPATCH-NOT: flags = hot patchable
+// RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=HOTPATCH
 
 int main() {
   return 0;
diff --git a/clang/test/Frontend/ast-main.c b/clang/test/Frontend/ast-main.c
index cdc74219f73a..6a64497f4109 100644
--- a/clang/test/Frontend/ast-main.c
+++ b/clang/test/Frontend/ast-main.c
@@ -1,6 +1,6 @@
-// RUN: env SDKROOT="/" %clang -emit-llvm -S -o %t1.ll -x c - < %s
+// RUN: env SDKROOT="/" %clang -emit-llvm -S -o - -x c - < %s | grep -v DIFile > %t1.ll
 // RUN: env SDKROOT="/" %clang -emit-ast -o %t.ast %s
-// RUN: env SDKROOT="/" %clang -emit-llvm -S -o %t2.ll -x ast - < %t.ast
+// RUN: env SDKROOT="/" %clang -emit-llvm -S -o - -x ast - < %t.ast | grep -v DIFile > %t2.ll
 // RUN: diff %t1.ll %t2.ll
 
 int main(void) {
diff --git a/clang/test/Frontend/ast-main.cpp b/clang/test/Frontend/ast-main.cpp
index fe47ce435f06..fc09e6437f93 100644
--- a/clang/test/Frontend/ast-main.cpp
+++ b/clang/test/Frontend/ast-main.cpp
@@ -1,6 +1,6 @@
-// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o %t1.ll -x c++ - < %s
+// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o - -x c++ - < %s | grep -v DIFile > %t1.ll
 // RUN: env SDKROOT="/" %clang -Wno-error=return-type -fno-delayed-template-parsing -emit-ast -o %t.ast %s
-// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o %t2.ll -x ast - < %t.ast
+// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o - -x ast - < %t.ast | grep -v DIFile > %t2.ll
 // RUN: diff %t1.ll %t2.ll
 
 // http://llvm.org/bugs/show_bug.cgi?id=15377
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index e13e92378d4a..a2c3b50b2467 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -565,8 +565,11 @@ bool AsmPrinter::doInitialization(Module &M) {
 
   if (MAI->doesSupportDebugInformation()) {
     bool EmitCodeView = M.getCodeViewFlag();
-    if (EmitCodeView &&
-        (TM.getTargetTriple().isOSWindows() || TM.getTargetTriple().isUEFI()))
+    // On Windows targets, emit minimal CodeView compiler info even when debug
+    // info is disabled.
+    if ((TM.getTargetTriple().isOSWindows() &&
+         M.getNamedMetadata("llvm.dbg.cu")) ||
+        (TM.getTargetTriple().isUEFI() && EmitCodeView))
       Handlers.push_back(std::make_unique<CodeViewDebug>(this));
     if (!EmitCodeView || M.getDwarfVersion()) {
       if (hasDebugInfo()) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index fc43bc6f7776..ea57a8fa1f79 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -125,6 +125,8 @@ static CPUType mapArchToCVCPUType(Triple::ArchType Type) {
     return CPUType::ARM64;
   case Triple::ArchType::mipsel:
     return CPUType::MIPS;
+  case Triple::ArchType::UnknownArch:
+    return CPUType::Unknown;
   default:
     report_fatal_error("target architecture doesn't map to a CodeView CPUType");
   }
@@ -611,21 +613,33 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
 }
 
 void CodeViewDebug::beginModule(Module *M) {
-  // If module doesn't have named metadata anchors or COFF debug section
-  // is not available, skip any debug info related stuff.
-  if (!Asm->hasDebugInfo() ||
-      !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) {
+  // If COFF debug section is not available, skip any debug info related stuff.
+  if (!Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) {
     Asm = nullptr;
     return;
   }
 
+  CompilerInfoAsm = Asm;
   TheCPU = mapArchToCVCPUType(M->getTargetTriple().getArch());
 
   // Get the current source language.
-  const MDNode *Node = *M->debug_compile_units_begin();
+  const MDNode *Node;
+  if (Asm->hasDebugInfo()) {
+    Node = *M->debug_compile_units_begin();
+  } else {
+    // When emitting only compiler information, we may have only NoDebug CUs,
+    // which would be skipped by debug_compile_units_begin.
+    NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+    Node = *CUs->operands().begin();
+  }
   const auto *CU = cast<DICompileUnit>(Node);
 
   CurrentSourceLanguage = MapDWLangToCVLang(CU->getSourceLanguage());
+  if (!M->getCodeViewFlag() ||
+      CU->getEmissionKind() == DICompileUnit::NoDebug) {
+    Asm = nullptr;
+    return;
+  }
 
   collectGlobalVariableInfo();
 
@@ -636,7 +650,7 @@ void CodeViewDebug::beginModule(Module *M) {
 }
 
 void CodeViewDebug::endModule() {
-  if (!Asm || !Asm->hasDebugInfo())
+  if (!CompilerInfoAsm)
     return;
 
   // The COFF .debug$S section consists of several subsections, each starting
@@ -652,6 +666,8 @@ void CodeViewDebug::endModule() {
   emitObjName();
   emitCompilerInformation();
   endCVSubsection(CompilerInfo);
+  if (!Asm)
+    return;
 
   emitInlineeLinesSubsection();
 
@@ -788,7 +804,7 @@ void CodeViewDebug::emitTypeGlobalHashes() {
 void CodeViewDebug::emitObjName() {
   MCSymbol *CompilerEnd = beginSymbolRecord(SymbolKind::S_OBJNAME);
 
-  StringRef PathRef(Asm->TM.Options.ObjectFilenameForDebug);
+  StringRef PathRef(CompilerInfoAsm->TM.Options.ObjectFilenameForDebug);
   llvm::SmallString<256> PathStore(PathRef);
 
   if (PathRef.empty() || PathRef == "-") {
@@ -846,7 +862,7 @@ void CodeViewDebug::emitCompilerInformation() {
   }
   using ArchType = llvm::Triple::ArchType;
   ArchType Arch = MMI->getModule()->getTargetTriple().getArch();
-  if (Asm->TM.Options.Hotpatch || Arch == ArchType::thumb ||
+  if (CompilerInfoAsm->TM.Options.Hotpatch || Arch == ArchType::thumb ||
       Arch == ArchType::aarch64) {
     Flags |= static_cast<uint32_t>(CompileSym3Flags::HotPatch);
   }
@@ -1015,7 +1031,7 @@ void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
   const MCSymbol *KeySym = GVSec ? GVSec->getCOMDATSymbol() : nullptr;
 
   MCSectionCOFF *DebugSec = cast<MCSectionCOFF>(
-      Asm->getObjFileLowering().getCOFFDebugSymbolsSection());
+      CompilerInfoAsm->getObjFileLowering().getCOFFDebugSymbolsSection());
   DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym);
 
   OS.switchSection(DebugSec);
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index d13b315135ad..5f4f30271d9c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -98,6 +98,10 @@ private:
   /// The codeview CPU type used by the translation unit.
   codeview::CPUType TheCPU;
 
+  /// The AsmPrinter used for emitting compiler metadata. When only compiler
+  /// info is being emitted, DebugHandlerBase::Asm may be null.
+  AsmPrinter *CompilerInfoAsm = nullptr;
+
   static LocalVarDef createDefRangeMem(uint16_t CVRegister, int Offset);
 
   /// Similar to DbgVariable in DwarfDebug, but not dwarf-specific.
diff --git a/llvm/test/CodeGen/Generic/selection-dag-determinism.ll b/llvm/test/CodeGen/Generic/selection-dag-determinism.ll
index 1adff3d61ba2..522894236807 100644
--- a/llvm/test/CodeGen/Generic/selection-dag-determinism.ll
+++ b/llvm/test/CodeGen/Generic/selection-dag-determinism.ll
@@ -1,8 +1,8 @@
-; RUN: llc -O2 -o %t1.o < %s
-; RUN: llc -O2 -o %t2.o < %s
-; RUN: llc -O2 -o %t3.o < %s
-; RUN: llc -O2 -o %t4.o < %s
-; RUN: llc -O2 -o %t5.o < %s
+; RUN: llc -O2 < %s > %t1.o
+; RUN: llc -O2 < %s > %t2.o
+; RUN: llc -O2 < %s > %t3.o
+; RUN: llc -O2 < %s > %t4.o
+; RUN: llc -O2 < %s > %t5.o
 ; RUN: cmp %t1.o %t2.o
 ; RUN: cmp %t1.o %t3.o
 ; RUN: cmp %t1.o %t4.o
diff --git a/llvm/test/DebugInfo/COFF/dwarf-headers.ll b/llvm/test/DebugInfo/COFF/dwarf-headers.ll
index 9d515f6cec64..919068e96604 100644
--- a/llvm/test/DebugInfo/COFF/dwarf-headers.ll
+++ b/llvm/test/DebugInfo/COFF/dwarf-headers.ll
@@ -43,6 +43,33 @@
 ; DWO-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004, abbr_offset
 ; DWO-4: 0x0000000b: DW_TAG_compile_unit
 
+; Check that basic CodeView compiler info is emitted even when the DWARF debug format is used.
+; RUN: llc -dwarf-version=4 \
+; RUN:     -filetype=obj -O0 -mtriple=x86_64-unknown-windows-msvc < %s \
+; RUN:     | llvm-readobj --codeview - | FileCheck %s --check-prefix=CODEVIEW
+; CODEVIEW:      CodeViewDebugInfo [
+; CODEVIEW-NEXT:   Section: .debug$S (4)
+; CODEVIEW-NEXT:   Magic: 0x4
+; CODEVIEW-NEXT:   Subsection [
+; CODEVIEW-NEXT:     SubSectionType: Symbols (0xF1)
+; CODEVIEW-NEXT:     SubSectionSize: 0x90
+; CODEVIEW-NEXT:     ObjNameSym {
+; CODEVIEW-NEXT:       Kind: S_OBJNAME (0x1101)
+; CODEVIEW-NEXT:       Signature: 0x0
+; CODEVIEW-NEXT:       ObjectName:
+; CODEVIEW-NEXT:     }
+; CODEVIEW-NEXT:     Compile3Sym {
+; CODEVIEW-NEXT:       Kind: S_COMPILE3 (0x113C)
+; CODEVIEW-NEXT:       Language: Cpp (0x1)
+; CODEVIEW-NEXT:       Flags [ (0x0)
+; CODEVIEW-NEXT:       ]
+; CODEVIEW-NEXT:       Machine: X64 (0xD0)
+; CODEVIEW-NEXT:       FrontendVersion: 17.0.0.0
+; CODEVIEW-NEXT:       BackendVersion:
+; CODEVIEW-NEXT:       VersionName: clang version 17.0.0
+; CODEVIEW-NEXT:     }
+; CODEVIEW-NEXT:   ]
+; CODEVIEW-NEXT: ]
 
 ; ModuleID = 't.cpp'
 source_filename = "t.cpp"
diff --git a/llvm/test/DebugInfo/COFF/emission-kind-no-codeview.ll b/llvm/test/DebugInfo/COFF/emission-kind-no-codeview.ll
new file mode 100644
index 000000000000..792aaeef483f
--- /dev/null
+++ b/llvm/test/DebugInfo/COFF/emission-kind-no-codeview.ll
@@ -0,0 +1,38 @@
+; RUN: llc -filetype=obj -o - < %s | llvm-readobj --codeview - | FileCheck %s
+; Check that basic CodeView compiler info is emitted even when the CodeView flag is not set.
+
+; CHECK-NOT:  CodeViewTypes
+; CHECK:      CodeViewDebugInfo [
+; CHECK-NEXT:   Section: .debug$S (4)
+; CHECK-NEXT:   Magic: 0x4
+; CHECK-NEXT:   Subsection [
+; CHECK-NEXT:     SubSectionType: Symbols (0xF1)
+; CHECK-NEXT:     SubSectionSize: 0x2C
+; CHECK-NEXT:     ObjNameSym {
+; CHECK-NEXT:       Kind: S_OBJNAME (0x1101)
+; CHECK-NEXT:       Signature: 0x0
+; CHECK-NEXT:       ObjectName:
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Compile3Sym {
+; CHECK-NEXT:       Kind: S_COMPILE3 (0x113C)
+; CHECK-NEXT:       Language: C (0x0)
+; CHECK-NEXT:       Flags [ (0x0)
+; CHECK-NEXT:       ]
+; CHECK-NEXT:       Machine: X64 (0xD0)
+; CHECK-NEXT:       FrontendVersion:
+; CHECK-NEXT:       BackendVersion:
+; CHECK-NEXT:       VersionName: clang
+; CHECK-NEXT:     }
+; CHECK-NEXT:   ]
+; CHECK-NEXT: ]
+
+source_filename = "empty"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "clang", emissionKind: NoDebug)
+!1 = !DIFile(filename: "empty", directory: "path/to")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/DebugInfo/COFF/emission-kind-no-debug.ll b/llvm/test/DebugInfo/COFF/emission-kind-no-debug.ll
index 4204df512ac3..94fee0e1812f 100644
--- a/llvm/test/DebugInfo/COFF/emission-kind-no-debug.ll
+++ b/llvm/test/DebugInfo/COFF/emission-kind-no-debug.ll
@@ -1,8 +1,30 @@
 ; RUN: llc -filetype=obj -o - < %s | llvm-readobj --codeview - | FileCheck %s
-; Check that debug info isn't emitted for CodeView with emissionKind NoDebug
+; Check that only basic compiler info is emitted for CodeView with emissionKind NoDebug
 
-; CHECK-NOT:      CodeViewTypes
-; CHECK-NOT:      CodeViewDebugInfo
+; CHECK-NOT:  CodeViewTypes
+; CHECK:      CodeViewDebugInfo [
+; CHECK-NEXT:   Section: .debug$S (4)
+; CHECK-NEXT:   Magic: 0x4
+; CHECK-NEXT:   Subsection [
+; CHECK-NEXT:     SubSectionType: Symbols (0xF1)
+; CHECK-NEXT:     SubSectionSize: 0x2C
+; CHECK-NEXT:     ObjNameSym {
+; CHECK-NEXT:       Kind: S_OBJNAME (0x1101)
+; CHECK-NEXT:       Signature: 0x0
+; CHECK-NEXT:       ObjectName:
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Compile3Sym {
+; CHECK-NEXT:       Kind: S_COMPILE3 (0x113C)
+; CHECK-NEXT:       Language: C (0x0)
+; CHECK-NEXT:       Flags [ (0x0)
+; CHECK-NEXT:       ]
+; CHECK-NEXT:       Machine: X64 (0xD0)
+; CHECK-NEXT:       FrontendVersion:
+; CHECK-NEXT:       BackendVersion:
+; CHECK-NEXT:       VersionName: clang
+; CHECK-NEXT:     }
+; CHECK-NEXT:   ]
+; CHECK-NEXT: ]
 
 source_filename = "empty"
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/DebugInfo/COFF/fission-cu.ll b/llvm/test/DebugInfo/COFF/fission-cu.ll
index 3afcb8717e31..dcc3fdd2efa7 100644
--- a/llvm/test/DebugInfo/COFF/fission-cu.ll
+++ b/llvm/test/DebugInfo/COFF/fission-cu.ll
@@ -107,11 +107,11 @@ source_filename = "test/DebugInfo/X86/fission-cu.ll"
 ; For COFF we should have this set of relocations for the debug info section
 ;
 ; OBJ: .debug_info
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_abbrev (6)
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_line (26)
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_str (10)
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_str (10)
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_addr (20)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_abbrev (8)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_line (28)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_str (12)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_str (12)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_addr (22)
 ; OBJ-NEXT: }
 
 ; HDR-NOT: .debug_aranges
diff --git a/llvm/test/DebugInfo/COFF/fission-sections.ll b/llvm/test/DebugInfo/COFF/fission-sections.ll
index 754e2b888c20..c16a4d072909 100644
--- a/llvm/test/DebugInfo/COFF/fission-sections.ll
+++ b/llvm/test/DebugInfo/COFF/fission-sections.ll
@@ -27,13 +27,14 @@ source_filename = "test/DebugInfo/X86/fission-cu.ll"
 ; OBJ-NEXT:  0 .text
 ; OBJ-NEXT:  1 .data
 ; OBJ-NEXT:  2 .bss
-; OBJ-NEXT:  3 .debug_abbrev
-; OBJ-NEXT:  4 .debug_info
-; OBJ-NEXT:  5 .debug_str
-; OBJ-NEXT:  6 .debug_addr
-; OBJ-NEXT:  7 .debug_pubnames
-; OBJ-NEXT:  8 .debug_pubtypes
-; OBJ-NEXT:  9 .debug_line
+; OBJ-NEXT:  3 .debug$S
+; OBJ-NEXT:  4 .debug_abbrev
+; OBJ-NEXT:  5 .debug_info
+; OBJ-NEXT:  6 .debug_str
+; OBJ-NEXT:  7 .debug_addr
+; OBJ-NEXT:  8 .debug_pubnames
+; OBJ-NEXT:  9 .debug_pubtypes
+; OBJ-NEXT: 10 .debug_line
 
 ; OBJ:     .debug_abbrev
 ; OBJ:     .debug_info
diff --git a/llvm/test/DebugInfo/COFF/uefi-nodebug.ll b/llvm/test/DebugInfo/COFF/uefi-nodebug.ll
new file mode 100644
index 000000000000..92e5fd6b5796
--- /dev/null
+++ b/llvm/test/DebugInfo/COFF/uefi-nodebug.ll
@@ -0,0 +1,16 @@
+; RUN: llc -filetype=obj -o - < %s | llvm-readobj --codeview - | FileCheck %s
+; Check that compiler info is not emitted when CodeView flag is not specified
+
+; CHECK-NOT:  CodeViewTypes
+; CHECK-NOT:  CodeViewDebugInfo
+
+source_filename = "empty"
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-uefi"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "clang", emissionKind: NoDebug)
+!1 = !DIFile(filename: "empty", directory: "path/to")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/DebugInfo/Generic/directives-only.ll b/llvm/test/DebugInfo/Generic/directives-only.ll
index ff9393221e2f..4754df7186fa 100644
--- a/llvm/test/DebugInfo/Generic/directives-only.ll
+++ b/llvm/test/DebugInfo/Generic/directives-only.ll
@@ -18,7 +18,7 @@
 ; CHECK: .loc 1 4 15
 ; CHECK: .loc 1 5 1
 
-; CHECK-NOT: .section .{{debug.*}}
+; CHECK-NOT: .section .{{debug_.*}}
 
 ; Function Attrs: nounwind uwtable
 define void @f2() #0 !dbg !4 {

From f62a8ab9304fb8b8b3ac3519a7addd7d3d234b04 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Fri, 13 Jun 2025 22:51:33 +0200
Subject: [PATCH 0352/1322] [CIR] Extend VecShuffleOp verifier to catch invalid
 index (#143262)

Extend the verifier to catch index larger than the size of vector
elements in VecShuffleOp

Issue https://github.com/llvm/llvm-project/issues/136487
---
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp          |  9 +++++++++
 .../IR/invalid-vector-shuffle-wrong-index.cir    | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 clang/test/CIR/IR/invalid-vector-shuffle-wrong-index.cir

diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 8ed0ee92574d..a685253b7d82 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1643,6 +1643,15 @@ LogicalResult cir::VecShuffleOp::verify() {
                          << " and " << getResult().getType() << " don't match";
   }
 
+  const uint64_t maxValidIndex =
+      getVec1().getType().getSize() + getVec2().getType().getSize() - 1;
+  if (llvm::any_of(
+          getIndices().getAsRange<cir::IntAttr>(), [&](cir::IntAttr idxAttr) {
+            return idxAttr.getSInt() != -1 && idxAttr.getUInt() > maxValidIndex;
+          })) {
+    return emitOpError() << ": index for __builtin_shufflevector must be "
+                            "less than the total number of vector elements";
+  }
   return success();
 }
 
diff --git a/clang/test/CIR/IR/invalid-vector-shuffle-wrong-index.cir b/clang/test/CIR/IR/invalid-vector-shuffle-wrong-index.cir
new file mode 100644
index 000000000000..375b2d3dc563
--- /dev/null
+++ b/clang/test/CIR/IR/invalid-vector-shuffle-wrong-index.cir
@@ -0,0 +1,16 @@
+// RUN: cir-opt %s -verify-diagnostics -split-input-file
+
+!s32i = !cir.int<s, 32>
+!s64i = !cir.int<s, 64>
+
+module  {
+  cir.func @fold_shuffle_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+
+    // expected-error @below {{index for __builtin_shufflevector must be less than the total number of vector elements}}
+    %new_vec = cir.vec.shuffle(%vec_1, %vec_2 : !cir.vector<4 x !s32i>) [#cir.int<9> : !s64i, #cir.int<4> : !s64i,
+      #cir.int<1> : !s64i, #cir.int<5> : !s64i] : !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+}

From 5ab285e0a60ad914bda893dbe18b6c1c562f3db6 Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek@codeweavers.com>
Date: Fri, 13 Jun 2025 14:10:30 -0700
Subject: [PATCH 0353/1322] [LLD][COFF] Fix ARM64X CHPE exception data size
 relocation when no x86 .pdata is present (#144085)

Fixes an issue where we incorrectly skip setting the relocation value if
`hybridPdata.first` is null.
---
 lld/COFF/Writer.cpp              | 16 +++-------
 lld/test/COFF/pdata-arm64ec.test | 53 ++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index cb9d0001015b..5f1da5e79dac 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -2495,22 +2495,16 @@ void Writer::setECSymbols() {
               offsetof(data_directory, Size),
           ctx.symtab.edataEnd->getRVA() - ctx.symtab.edataStart->getRVA() +
               ctx.symtab.edataEnd->getSize());
-    if (hybridPdata.first) {
+    if (hybridPdata.first)
       ctx.dynamicRelocs->set(
           dataDirOffset64 + EXCEPTION_TABLE * sizeof(data_directory) +
               offsetof(data_directory, Size),
           hybridPdata.last->getRVA() - hybridPdata.first->getRVA() +
               hybridPdata.last->getSize());
-      if (chpeSym) {
-        size_t size = 0;
-        if (pdata.first)
-          size = pdata.last->getRVA() + pdata.last->getSize() -
-                 pdata.first->getRVA();
-        ctx.dynamicRelocs->set(chpeSym->getRVA() +
-                                   offsetof(chpe_metadata, ExtraRFETableSize),
-                               size);
-      }
-    }
+    if (chpeSym && pdata.first)
+      ctx.dynamicRelocs->set(
+          chpeSym->getRVA() + offsetof(chpe_metadata, ExtraRFETableSize),
+          pdata.last->getRVA() + pdata.last->getSize() - pdata.first->getRVA());
   }
 }
 
diff --git a/lld/test/COFF/pdata-arm64ec.test b/lld/test/COFF/pdata-arm64ec.test
index cf59330b2354..6bdcc5c5682b 100644
--- a/lld/test/COFF/pdata-arm64ec.test
+++ b/lld/test/COFF/pdata-arm64ec.test
@@ -80,10 +80,63 @@ DIR3-NEXT:     ExtraRFETableSize: 0x10
 DIR3:        ]
 DIR3:      }
 
+arm64x with no x86 .pdata:
+
 RUN: llvm-objdump -s --section=.pdata test4.dll | FileCheck -check-prefix=DATA4 %s
 DATA4: 180006000 00100000 11000001 00200000 11000001  ......... ......
 DATA4: 180006010 00300000 0e300000
 
+RUN: lld-link -out:testx2.dll -machine:arm64x arm64-func-sym.obj arm64ec-func-sym.obj \
+RUN:          loadconfig-arm64.obj loadconfig-arm64ec.obj -dll -noentry
+
+RUN: llvm-readobj --headers --coff-load-config testx2.dll | FileCheck -check-prefix=DIR4 %s
+DIR4:      ImageOptionalHeader {
+DIR4:        DataDirectory {
+DIR4:          ExceptionTableRVA: 0x5000
+DIR4-NEXT:     ExceptionTableSize: 0x10
+DIR4:        }
+DIR4:      }
+DIR4:      CHPEMetadata [
+DIR4:        ExtraRFETable: 0x0
+DIR4-NEXT:   ExtraRFETableSize: 0x0
+DIR4:      ]
+DIR4:      HybridObject {
+DIR4:        ImageOptionalHeader {
+DIR4:          ExceptionTableRVA: 0x0
+DIR4-NEXT:     ExceptionTableSize: 0x0
+DIR4:        }
+DIR4:        CHPEMetadata [
+DIR4:          ExtraRFETable: 0x5000
+DIR4-NEXT:     ExtraRFETableSize: 0x10
+DIR4:        ]
+DIR4:      }
+
+arm64x with no ARM .pdata:
+
+RUN: lld-link -out:testx3.dll -machine:arm64x x86_64-func-sym.obj loadconfig-arm64.obj loadconfig-arm64ec.obj -dll -noentry
+
+RUN: llvm-readobj --headers --coff-load-config testx3.dll | FileCheck -check-prefix=DIR5 %s
+DIR5:      ImageOptionalHeader {
+DIR5:        DataDirectory {
+DIR5:          ExceptionTableRVA: 0x0
+DIR5-NEXT:     ExceptionTableSize: 0x0
+DIR5:        }
+DIR5:      }
+DIR5:      CHPEMetadata [
+DIR5:        ExtraRFETable: 0x4000
+DIR5-NEXT:   ExtraRFETableSize: 0xC
+DIR5:      ]
+DIR5:      HybridObject {
+DIR5:        ImageOptionalHeader {
+DIR5:          ExceptionTableRVA: 0x4000
+DIR5-NEXT:     ExceptionTableSize: 0xC
+DIR5:        }
+DIR5:        CHPEMetadata [
+DIR5:          ExtraRFETable: 0x0
+DIR5-NEXT:     ExtraRFETableSize: 0x0
+DIR5:        ]
+DIR5:      }
+
 Order of inputs doesn't matter, the data is sorted by type and RVA:
 
 RUN: lld-link -out:test5.dll -machine:arm64ec x86_64-func-sym.obj arm64ec-func-sym.obj \

From 8229628cf1812e126ff72ee9f4b5f267db4c91da Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek@codeweavers.com>
Date: Fri, 13 Jun 2025 23:22:37 +0200
Subject: [PATCH 0354/1322] [Clang] Relax DICompileUnit producer check in
 debug-info-coff.cpp test (NFC)

Fixes test from #142970 on Fuchsia CI, which uses "Fuchsia clang version" prefix.
---
 clang/test/CodeGenCXX/debug-info-coff.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/CodeGenCXX/debug-info-coff.cpp b/clang/test/CodeGenCXX/debug-info-coff.cpp
index 4507f5f40d41..2535c5cc7511 100644
--- a/clang/test/CodeGenCXX/debug-info-coff.cpp
+++ b/clang/test/CodeGenCXX/debug-info-coff.cpp
@@ -4,7 +4,7 @@
 
 // RUN: %clang --target=i686-pc-windows-msvc -S -emit-llvm %s -o - | FileCheck --check-prefix=IR %s
 // IR: !llvm.dbg.cu = !{!0}
-// IR: !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, splitDebugInlining: false, nameTableKind: None)
+// IR: !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "{{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, splitDebugInlining: false, nameTableKind: None)
 
 // RUN: %clang --target=i686-pc-windows-msvc -c %s -o %t.o
 // RUN: llvm-readobj --codeview %t.o | FileCheck %s
@@ -27,7 +27,7 @@
 // CHECK-NEXT:       Machine: Pentium3 (0x7)
 // CHECK-NEXT:       FrontendVersion:
 // CHECK-NEXT:       BackendVersion:
-// CHECK-NEXT:       VersionName: clang version
+// CHECK-NEXT:       VersionName: {{.*}}clang version
 // CHECK-NEXT:     }
 // CHECK-NEXT:   ]
 // CHECK-NEXT: ]

From 3afc2be1f0a4d3e3f646403a7495bcb12ef94246 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Fri, 13 Jun 2025 14:35:40 -0700
Subject: [PATCH 0355/1322] llvm-lto2: Add print-guid subcommand.

This is useful for debugging ThinLTO issues.

Reviewers: teresajohnson

Reviewed By: teresajohnson

Pull Request: https://github.com/llvm/llvm-project/pull/143992
---
 llvm/test/tools/llvm-lto2/print-guid.test | 2 ++
 llvm/tools/llvm-lto2/llvm-lto2.cpp        | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/tools/llvm-lto2/print-guid.test

diff --git a/llvm/test/tools/llvm-lto2/print-guid.test b/llvm/test/tools/llvm-lto2/print-guid.test
new file mode 100644
index 000000000000..a3d3f202ea43
--- /dev/null
+++ b/llvm/test/tools/llvm-lto2/print-guid.test
@@ -0,0 +1,2 @@
+# RUN: llvm-lto2 print-guid foo | FileCheck %s
+# CHECK: 6699318081062747564
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index 2bbb65be3b31..fbde66666a59 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -247,7 +247,7 @@ template <typename T> static T check(ErrorOr<T> E, std::string Msg) {
 }
 
 static int usage() {
-  errs() << "Available subcommands: dump-symtab run\n";
+  errs() << "Available subcommands: dump-symtab run print-guid\n";
   return 1;
 }
 
@@ -610,5 +610,11 @@ int main(int argc, char **argv) {
     return dumpSymtab(argc - 1, argv + 1);
   if (Subcommand == "run")
     return run(argc - 1, argv + 1);
+  if (Subcommand == "print-guid" && argc > 2) {
+    // Note the name of the function we're calling: this won't return the right
+    // answer for internal linkage symbols.
+    outs() << GlobalValue::getGUIDAssumingExternalLinkage(argv[2]) << '\n';
+    return 0;
+  }
   return usage();
 }

From 473dea9b0b86d48db805079fa3e68b37e1dbcdd9 Mon Sep 17 00:00:00 2001
From: William Huynh <William.Huynh@arm.com>
Date: Fri, 13 Jun 2025 22:37:25 +0100
Subject: [PATCH 0356/1322] [libc] Output all headers with
 LIBC_CONF_OUTPUT_ALL_HEADERS (#144114)

Following discussion from
https://discourse.llvm.org/t/missing-declarations-in-header-files/86678,
we decided to add a flag to output all headers. Requires #144049.

- Allows outputting all headers
- Minor whitespace change for alignment

---------

Co-authored-by: Michael Jones <michaelrj@google.com>
---
 libc/CMakeLists.txt                          | 1 +
 libc/cmake/modules/LLVMLibCHeaderRules.cmake | 7 ++++++-
 libc/test/UnitTest/CMakeLists.txt            | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index f21fc2fba730..9907adfc55a5 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -135,6 +135,7 @@ endif()
 option(LLVM_LIBC_FULL_BUILD "Build and test LLVM libc as if it is the full libc" ${default_to_full_build})
 option(LLVM_LIBC_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR "Build LLVM libc tests assuming our implementation-defined behavior" ON)
 option(LLVM_LIBC_ENABLE_LINTING "Enables linting of libc source files" OFF)
+option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless of whether they are enabled on this target" OFF)
 
 option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." OFF)
 
diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index 99f90244e013..01c288f0b919 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -97,8 +97,13 @@ function(add_gen_header target_name)
   set(out_file ${LIBC_INCLUDE_DIR}/${relative_path})
   set(dep_file "${out_file}.d")
   set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR_YAML_FILE})
+  
+  if(LLVM_LIBC_ALL_HEADERS)
+    set(entry_points "")
+  else()
+    set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}")
+  endif()
 
-  set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}")
   list(TRANSFORM entry_points PREPEND "--entry-point=")
 
   add_custom_command(
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index b0a3a7431c22..c32809da577d 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -35,7 +35,7 @@ function(add_unittest_framework_library name)
   else()
     _get_common_test_compile_options(compile_options "" "")
     target_compile_options(${name}.unit PRIVATE ${compile_options})
-endif()
+  endif()
 
   _get_hermetic_test_compile_options(compile_options "")
   target_include_directories(${name}.hermetic PRIVATE ${LIBC_INCLUDE_DIR})

From 2c440232e261746970cdf6f74d6588464eecd48b Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Fri, 13 Jun 2025 15:07:06 -0700
Subject: [PATCH 0357/1322] [bazel][libc] Add missing deps after
 51689c9df2fbb81aab1ff802f3efb86cac926853

---
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 7901de161b7a..8e629270c89d 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -347,16 +347,19 @@ libc_support_library(
 libc_support_library(
     name = "types_struct_f_owner_ex",
     hdrs = ["hdr/types/struct_f_owner_ex.h"],
+    deps = [":hdr_fcntl_overlay"],
 )
 
 libc_support_library(
     name = "types_struct_flock",
     hdrs = ["hdr/types/struct_flock.h"],
+    deps = [":hdr_fcntl_overlay"],
 )
 
 libc_support_library(
     name = "types_struct_flock64",
     hdrs = ["hdr/types/struct_flock64.h"],
+    deps = [":hdr_fcntl_overlay"],
 )
 
 libc_support_library(

From a591bd222b2e0356b8132b515422fe480b87322b Mon Sep 17 00:00:00 2001
From: Amy Huang <akhuang@google.com>
Date: Fri, 13 Jun 2025 15:09:57 -0700
Subject: [PATCH 0358/1322] Turn LIBC_COPT_STRING_UNSAFE_WIDE_READ on by
 default (#144163)

Configure strlen to use unsafe implementation because it is faster.

Because this is undefined behavior it could cause sanitizers to fail.
---
 libc/config/config.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/config/config.json b/libc/config/config.json
index d53b2936edb0..0354b16997cd 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -59,7 +59,7 @@
   },
   "string": {
     "LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
-      "value": false,
+      "value": true,
       "doc": "Read more than a byte at a time to perform byte-string operations like strlen."
     },
     "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {

From ca38027c036593ae487ccef250ebd5133803bb55 Mon Sep 17 00:00:00 2001
From: Amy Huang <akhuang@google.com>
Date: Fri, 13 Jun 2025 15:30:28 -0700
Subject: [PATCH 0359/1322] Revert "Turn LIBC_COPT_STRING_UNSAFE_WIDE_READ on
 by default" (#144167)

Reverts llvm/llvm-project#144163 because for some reason I didn't
realize there are ASan tests.
---
 libc/config/config.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/config/config.json b/libc/config/config.json
index 0354b16997cd..d53b2936edb0 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -59,7 +59,7 @@
   },
   "string": {
     "LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
-      "value": true,
+      "value": false,
       "doc": "Read more than a byte at a time to perform byte-string operations like strlen."
     },
     "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {

From d7e64d9594d241d6a9186fadad2b0d40a8fba8a7 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Fri, 13 Jun 2025 15:48:46 -0700
Subject: [PATCH 0360/1322] [MSAN] handle assorted AVX permutations (#143462)

---
 .../Instrumentation/MemorySanitizer.cpp       |  82 ++-
 .../X86/avx2-intrinsics-x86.ll                |  38 +-
 .../X86/avx512-intrinsics-upgrade.ll          | 430 +++++++------
 .../MemorySanitizer/X86/avx512-intrinsics.ll  | 428 +++++++------
 .../X86/avx512vl-intrinsics.ll                | 595 ++++++++++++------
 .../MemorySanitizer/X86/x86-vpermi2.ll        | 205 +++---
 .../i386/avx2-intrinsics-i386.ll              |  38 +-
 7 files changed, 1164 insertions(+), 652 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index d3c6a7151ec3..fb55bd7bfe56 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4173,7 +4173,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   // Instrument AVX permutation intrinsic.
   // We apply the same permutation (argument index 1) to the shadow.
-  void handleAVXVpermilvar(IntrinsicInst &I) {
+  void handleAVXPermutation(IntrinsicInst &I) {
+    assert(I.arg_size() == 2);
+    assert(isa<FixedVectorType>(I.getArgOperand(0)->getType()));
+    assert(isa<FixedVectorType>(I.getArgOperand(1)->getType()));
+    [[maybe_unused]] auto ArgVectorSize =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements();
+    assert(cast<FixedVectorType>(I.getArgOperand(1)->getType())
+               ->getNumElements() == ArgVectorSize);
+    assert(I.getType() == I.getArgOperand(0)->getType());
     IRBuilder<> IRB(&I);
     Value *Shadow = getShadow(&I, 0);
     insertShadowCheck(I.getArgOperand(1), &I);
@@ -4187,6 +4195,38 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I)));
     setOriginForNaryOp(I);
   }
+  // Instrument AVX permutation intrinsic.
+  // We apply the same permutation (argument index 1) to the shadows.
+  void handleAVXVpermil2var(IntrinsicInst &I) {
+    assert(I.arg_size() == 3);
+    assert(isa<FixedVectorType>(I.getArgOperand(0)->getType()));
+    assert(isa<FixedVectorType>(I.getArgOperand(1)->getType()));
+    assert(isa<FixedVectorType>(I.getArgOperand(2)->getType()));
+    [[maybe_unused]] auto ArgVectorSize =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements();
+    assert(cast<FixedVectorType>(I.getArgOperand(1)->getType())
+               ->getNumElements() == ArgVectorSize);
+    assert(cast<FixedVectorType>(I.getArgOperand(2)->getType())
+               ->getNumElements() == ArgVectorSize);
+    assert(I.getArgOperand(0)->getType() == I.getArgOperand(2)->getType());
+    assert(I.getType() == I.getArgOperand(0)->getType());
+    assert(I.getArgOperand(1)->getType()->isIntOrIntVectorTy());
+    IRBuilder<> IRB(&I);
+    Value *AShadow = getShadow(&I, 0);
+    Value *Idx = I.getArgOperand(1);
+    Value *BShadow = getShadow(&I, 2);
+    insertShadowCheck(Idx, &I);
+
+    // Shadows are integer-ish types but some intrinsics require a
+    // different (e.g., floating-point) type.
+    AShadow = IRB.CreateBitCast(AShadow, I.getArgOperand(0)->getType());
+    BShadow = IRB.CreateBitCast(BShadow, I.getArgOperand(2)->getType());
+    CallInst *CI = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+                                       {AShadow, Idx, BShadow});
+
+    setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I)));
+    setOriginForNaryOp(I);
+  }
 
   // Instrument BMI / BMI2 intrinsics.
   // All of these intrinsics are Z = I(X, Y)
@@ -5132,16 +5172,52 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       assert(Success);
       break;
     }
-
+    case Intrinsic::x86_avx2_permd:
+    case Intrinsic::x86_avx2_permps:
+    case Intrinsic::x86_ssse3_pshuf_b_128:
+    case Intrinsic::x86_avx2_pshuf_b:
+    case Intrinsic::x86_avx512_pshuf_b_512:
+    case Intrinsic::x86_avx512_permvar_df_256:
+    case Intrinsic::x86_avx512_permvar_df_512:
+    case Intrinsic::x86_avx512_permvar_di_256:
+    case Intrinsic::x86_avx512_permvar_di_512:
+    case Intrinsic::x86_avx512_permvar_hi_128:
+    case Intrinsic::x86_avx512_permvar_hi_256:
+    case Intrinsic::x86_avx512_permvar_hi_512:
+    case Intrinsic::x86_avx512_permvar_qi_128:
+    case Intrinsic::x86_avx512_permvar_qi_256:
+    case Intrinsic::x86_avx512_permvar_qi_512:
+    case Intrinsic::x86_avx512_permvar_sf_512:
+    case Intrinsic::x86_avx512_permvar_si_512:
     case Intrinsic::x86_avx_vpermilvar_pd:
     case Intrinsic::x86_avx_vpermilvar_pd_256:
     case Intrinsic::x86_avx512_vpermilvar_pd_512:
     case Intrinsic::x86_avx_vpermilvar_ps:
     case Intrinsic::x86_avx_vpermilvar_ps_256:
     case Intrinsic::x86_avx512_vpermilvar_ps_512: {
-      handleAVXVpermilvar(I);
+      handleAVXPermutation(I);
       break;
     }
+    case Intrinsic::x86_avx512_vpermi2var_d_128:
+    case Intrinsic::x86_avx512_vpermi2var_d_256:
+    case Intrinsic::x86_avx512_vpermi2var_d_512:
+    case Intrinsic::x86_avx512_vpermi2var_hi_128:
+    case Intrinsic::x86_avx512_vpermi2var_hi_256:
+    case Intrinsic::x86_avx512_vpermi2var_hi_512:
+    case Intrinsic::x86_avx512_vpermi2var_pd_128:
+    case Intrinsic::x86_avx512_vpermi2var_pd_256:
+    case Intrinsic::x86_avx512_vpermi2var_pd_512:
+    case Intrinsic::x86_avx512_vpermi2var_ps_128:
+    case Intrinsic::x86_avx512_vpermi2var_ps_256:
+    case Intrinsic::x86_avx512_vpermi2var_ps_512:
+    case Intrinsic::x86_avx512_vpermi2var_q_128:
+    case Intrinsic::x86_avx512_vpermi2var_q_256:
+    case Intrinsic::x86_avx512_vpermi2var_q_512:
+    case Intrinsic::x86_avx512_vpermi2var_qi_128:
+    case Intrinsic::x86_avx512_vpermi2var_qi_256:
+    case Intrinsic::x86_avx512_vpermi2var_qi_512:
+      handleAVXVpermil2var(I);
+      break;
 
     case Intrinsic::x86_avx512fp16_mask_add_sh_round:
     case Intrinsic::x86_avx512fp16_mask_sub_sh_round:
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
index f916130fe53e..9649f2dc71f1 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
@@ -740,8 +740,15 @@ define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[TMP1]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[RES]]
 ;
@@ -969,8 +976,15 @@ define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[TMP1]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -985,18 +999,18 @@ define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[TMP3]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP6]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
-; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1]])
+; CHECK-NEXT:    store <8 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x float> [[RES]]
 ;
   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
index 5aeaa1221cd2..3eeb5886b5fc 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
@@ -13171,18 +13171,18 @@ define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i6
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP3]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
+; CHECK-NEXT:    store <8 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP7]]
 ;
   %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -13197,24 +13197,24 @@ define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP7]], <8 x i64> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
@@ -13232,23 +13232,23 @@ define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP6]], <8 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
@@ -13266,8 +13266,15 @@ define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP3]]
 ;
@@ -13283,8 +13290,15 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
@@ -13307,8 +13321,15 @@ define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i6
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
@@ -13331,18 +13352,18 @@ define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP3]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP7]]
 ;
   %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -13357,24 +13378,24 @@ define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP7]], <16 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
@@ -13392,23 +13413,23 @@ define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
@@ -13426,8 +13447,15 @@ define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP3]]
 ;
@@ -13443,8 +13471,15 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
@@ -13467,8 +13502,15 @@ define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
@@ -13700,8 +13742,8 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -13714,9 +13756,15 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X4:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[TMP14]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X4:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP10]]
 ;
@@ -13744,9 +13792,15 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
@@ -13768,25 +13822,23 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x
 define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)  #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP8]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
-; CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP9]]
 ;
   %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -13797,32 +13849,30 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0,
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP8]], <8 x i64> [[TMP2]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]]
@@ -13838,25 +13888,23 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x
 define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)  #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP8]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP9]]
 ;
   %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -13867,32 +13915,30 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0,
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP8]], <16 x i32> [[TMP2]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]]
@@ -13908,12 +13954,18 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <
 define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)  #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP8]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP4]]
 ;
@@ -13925,13 +13977,19 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]]
@@ -13968,9 +14026,15 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X0:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
@@ -13999,7 +14063,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
@@ -14013,26 +14077,24 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 ; CHECK-NEXT:    [[X2INS:%.*]] = insertelement <8 x double> [[EXTRA_PARAM:%.*]], double [[X2S]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> [[TMP5]], <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> [[EXTRA_PARAM]], <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to <8 x double>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP11]], <8 x i64> [[X0:%.*]], <8 x double> [[TMP24]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x double> [[TMP13]] to <8 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
 ; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
-; CHECK:       14:
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
+; CHECK:       16:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       15:
-; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]])
+; CHECK:       17:
+; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0]], <8 x double> [[X2]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP14]], <8 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer
@@ -14052,30 +14114,28 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0,
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X0:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP9]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer
@@ -14093,13 +14153,19 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i64> [[TMP13]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
@@ -14120,12 +14186,18 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>
 define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)  #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP8]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
 ;
@@ -14137,13 +14209,19 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i32> [[TMP13]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
index 1644a5e3a045..4b559bc9fb8e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
@@ -5467,9 +5467,15 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP9]]
 ;
@@ -5496,9 +5502,15 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
@@ -5522,24 +5534,22 @@ declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>,
 define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP8]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP12:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
-; CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
+; CHECK-NEXT:    store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP9]]
 ;
   %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
@@ -5549,32 +5559,30 @@ define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x
 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP8]], <8 x i64> [[TMP2]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]]
@@ -5593,24 +5601,22 @@ declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>
 define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP8]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP12:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT:    store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP9]]
 ;
   %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
@@ -5620,32 +5626,30 @@ define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16
 define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP8]], <16 x i32> [[TMP2]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]]
@@ -5664,12 +5668,18 @@ declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i
 define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP8]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP4]]
 ;
@@ -5680,13 +5690,19 @@ define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %
 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]]
@@ -5722,9 +5738,15 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X0:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
@@ -5753,7 +5775,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
 ; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
@@ -5767,26 +5789,24 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 ; CHECK-NEXT:    [[X2INS:%.*]] = insertelement <8 x double> [[EXTRA_PARAM:%.*]], double [[X2S]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> [[TMP6]], <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> [[EXTRA_PARAM2:%.*]], <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i64> [[TMP2]] to <8 x double>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP24]], <8 x i64> [[X0:%.*]], <8 x double> [[TMP13]])
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x double> [[TMP14]] to <8 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
 ; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK:       15:
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
+; CHECK:       17:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       16:
-; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]])
+; CHECK:       18:
+; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0]], <8 x double> [[X2]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer
@@ -5805,30 +5825,28 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X0:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP9]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer
@@ -5844,13 +5862,19 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0,
 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i64> [[TMP13]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
@@ -5871,12 +5895,18 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x
 define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP8]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
 ;
@@ -5887,13 +5917,19 @@ define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32
 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i32> [[TMP13]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
@@ -9441,18 +9477,18 @@ define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i6
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP3]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
+; CHECK-NEXT:    store <8 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP7]]
 ;
   %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
@@ -9466,24 +9502,24 @@ define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP7]], <8 x i64> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
@@ -9502,23 +9538,23 @@ define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP6]], <8 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
@@ -9538,8 +9574,15 @@ define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP3]]
 ;
@@ -9554,8 +9597,15 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
@@ -9579,8 +9629,15 @@ define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i6
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
@@ -9605,18 +9662,18 @@ define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP3]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP7]]
 ;
   %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
@@ -9630,24 +9687,24 @@ define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP7]], <16 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
@@ -9666,23 +9723,23 @@ define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
@@ -9702,8 +9759,15 @@ define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP3]]
 ;
@@ -9718,8 +9782,15 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
@@ -9743,8 +9814,15 @@ define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
index 14d68b449a7b..40b5e9338e45 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
@@ -3,6 +3,79 @@
 
 ; Forked from llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
 
+; Strictly handled instructions:
+; * llvm.x86.avx512.mask.cmp.pd
+; * llvm.x86.avx512.mask.cmp.ps
+; * llvm.x86.avx512.mask.compress
+; * llvm.x86.avx512.mask.cvtpd2dq
+; * llvm.x86.avx512.mask.cvtp
+; * llvm.x86.avx512.mask.cvtpd2udq
+; * llvm.x86.avx512.mask.cvtps2dq
+; * llvm.x86.avx512.mask.cvtps2udq
+; * llvm.x86.avx512.mask.cvttpd2dq
+; * llvm.x86.avx512.mask.cvttpd2udq
+; * llvm.x86.avx512.mask.cvttps2udq
+; * llvm.x86.avx512.mask.expand
+; * llvm.x86.avx512.mask.fixupimm.pd
+; * llvm.x86.avx512.mask.fixupimm.ps
+; * llvm.x86.avx512.mask.getexp.pd
+; * llvm.x86.avx512.mask.getexp.ps
+; * llvm.x86.avx512.mask.getmant.pd
+; * llvm.x86.avx512.mask.getmant.ps
+; * llvm.x86.avx512.mask.pmov.db
+; * llvm.x86.avx512.mask.pmov.db.mem
+; * llvm.x86.avx512.mask.pmov.dw
+; * llvm.x86.avx512.mask.pmov.dw.mem
+; * llvm.x86.avx512.mask.pmov.qb
+; * llvm.x86.avx512.mask.pmov.qb.mem
+; * llvm.x86.avx512.mask.pmov.qd
+; * llvm.x86.avx512.mask.pmov.qd.mem
+; * llvm.x86.avx512.mask.pmov.qw
+; * llvm.x86.avx512.mask.pmov.qw.mem
+; * llvm.x86.avx512.mask.pmovs.db
+; * llvm.x86.avx512.mask.pmovs.db.mem
+; * llvm.x86.avx512.mask.pmovs.dw
+; * llvm.x86.avx512.mask.pmovs.dw.mem
+; * llvm.x86.avx512.mask.pmovs.qb
+; * llvm.x86.avx512.mask.pmovs.qb.mem
+; * llvm.x86.avx512.mask.pmovs.qd
+; * llvm.x86.avx512.mask.pmovs.qd.mem
+; * llvm.x86.avx512.mask.pmovs.qw
+; * llvm.x86.avx512.mask.pmovs.qw.mem
+; * llvm.x86.avx512.mask.pmovus.db
+; * llvm.x86.avx512.mask.pmovus.db.mem
+; * llvm.x86.avx512.mask.pmovus.dw
+; * llvm.x86.avx512.mask.pmovus.dw.mem
+; * llvm.x86.avx512.mask.pmovus.qb
+; * llvm.x86.avx512.mask.pmovus.qb.mem
+; * llvm.x86.avx512.mask.pmovus.qd
+; * llvm.x86.avx512.mask.pmovus.qd.mem
+; * llvm.x86.avx512.mask.pmovus.qw
+; * llvm.x86.avx512.mask.pmovus.qw.mem
+; * llvm.x86.avx512.mask.rndscale.pd
+; * llvm.x86.avx512.mask.rndscale.ps
+; * llvm.x86.avx512.mask.scalef.pd
+; * llvm.x86.avx512.mask.scalef.ps
+; * llvm.x86.avx512.mask.vcvtps2ph
+; * llvm.x86.avx512.maskz.fixupimm.pd
+; * llvm.x86.avx512.maskz.fixupimm.ps
+; * llvm.x86.avx512.pternlog.d
+; * llvm.x86.avx512.pternlog.q
+; * llvm.x86.avx512.rcp14.pd
+; * llvm.x86.avx512.rcp14.ps
+; * llvm.x86.avx512.rsqrt14.pd
+; * llvm.x86.avx512.rsqrt14.ps
+;
+; Heuristically handled instructions:
+; * llvm.fma.v2f64
+; * llvm.fma.v4f32
+; * llvm.fma.v4f64
+; * llvm.fma.v8f32
+; * llvm.x86.avx.max.ps.256
+; * llvm.x86.avx.min.ps.256
+; * llvm.x86.sse.max.ps
+; * llvm.x86.sse.min.ps
+
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -1901,11 +1974,17 @@ define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermi2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[X1]], <4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
@@ -1919,12 +1998,18 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermi2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X1]], <4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -1950,11 +2035,17 @@ define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermt2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[X0]], <4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
@@ -1968,12 +2059,18 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermt2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2000,12 +2097,18 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2033,11 +2136,17 @@ define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpermi2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[X1]], <8 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -2051,12 +2160,18 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermi2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X1]], <8 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2079,11 +2194,17 @@ define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i3
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_ask_vpermt2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[X0]], <8 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -2097,12 +2218,18 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermt2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2126,12 +2253,18 @@ define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2156,24 +2289,22 @@ define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_vpermi2var_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to <2 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <2 x double>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[TMP8]], <2 x i64> [[X1]], <2 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP10]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
-; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2)
@@ -2185,34 +2316,32 @@ define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0,
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_vpermi2var_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP13]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP8]] to i128
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to <2 x double>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP8]] to <2 x double>
+; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[TMP9]], <2 x i64> [[X1]], <2 x double> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <2 x double> [[TMP17]] to <2 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP13]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK:       [[BB8]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB9]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[X1]] to <2 x double>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP18]], <2 x i64> [[TMP13]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = or <2 x i64> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = or <2 x i64> [[TMP20]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP21]], <2 x i64> [[TMP16]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]
@@ -2233,24 +2362,22 @@ define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_vpermi2var_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP4]] to i256
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to <4 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP4]] to <4 x double>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[TMP8]], <4 x i64> [[X1]], <4 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP10]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
-; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2)
@@ -2262,34 +2389,32 @@ define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0,
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_vpermi2var_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP13]] to i256
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to <4 x double>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP8]] to <4 x double>
+; CHECK-NEXT:    [[TMP17:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[TMP9]], <4 x i64> [[X1]], <4 x double> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x double> [[TMP17]] to <4 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP13]] to i256
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK:       [[BB8]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB9]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[X1]] to <4 x double>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP18]], <4 x i64> [[TMP13]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i64> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i64> [[TMP20]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP21]], <4 x i64> [[TMP16]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[TMP2]]
@@ -2310,24 +2435,22 @@ define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_vpermi2var_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to <4 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP8]], <4 x i32> [[X1]], <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
-; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2)
@@ -2339,34 +2462,32 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP8]] to i128
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to <4 x float>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+; CHECK-NEXT:    [[TMP17:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP9]], <4 x i32> [[X1]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x float> [[TMP17]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK:       [[BB8]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB9]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[X1]] to <4 x float>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP20]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP21]], <4 x i32> [[TMP16]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
@@ -2392,30 +2513,28 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP11]] to <4 x i32>
 ; CHECK-NEXT:    [[X1CAST:%.*]] = bitcast <2 x i64> [[X1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i32> [[TMP12]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP19]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP14]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x i32> [[TMP13]] to <4 x float>
+; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP16]], <4 x i32> [[X1CAST]], <4 x float> [[TMP18]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x float> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP14]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
-; CHECK:       [[BB9]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]]
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB10]]:
+; CHECK:       [[BB12]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1CAST]], <4 x float> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[X1CAST]] to <4 x float>
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP9]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP7]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i32> [[TMP21]], [[TMP14]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP22]], <4 x i32> [[TMP17]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
@@ -2437,24 +2556,22 @@ define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_vpermi2var_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP4]] to i256
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to <8 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[TMP8]], <8 x i32> [[X1]], <8 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP10]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
-; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2)
@@ -2466,32 +2583,30 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_vpermi2var_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP13]] to i256
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP8]] to i256
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to <8 x float>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
+; CHECK-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[TMP9]], <8 x i32> [[X1]], <8 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x float> [[TMP17]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP13]] to i256
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK:       [[BB8]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB9]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[X1]] to <8 x float>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> zeroinitializer, <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP18]], <8 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = or <8 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i32> [[TMP20]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP21]], <8 x i32> [[TMP16]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP1]], <8 x float> [[TMP2]]
@@ -2511,11 +2626,17 @@ define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermi2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[X1]], <2 x i64> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
@@ -2529,12 +2650,18 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermi2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X1]], <2 x i64> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2560,11 +2687,17 @@ define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermt2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[X0]], <2 x i64> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
@@ -2578,12 +2711,18 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermt2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2610,12 +2749,18 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP9]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2643,11 +2788,17 @@ define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermi2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[X1]], <4 x i64> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -2661,12 +2812,18 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermi2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X1]], <4 x i64> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2692,11 +2849,17 @@ define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermt2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[X0]], <4 x i64> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -2710,12 +2873,18 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermt2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2742,12 +2911,18 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP9]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -8458,18 +8633,18 @@ define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i6
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to <4 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[TMP3]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP6]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
-; CHECK:       [[BB5]]:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB6]]:
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
-; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1)
@@ -8485,26 +8660,26 @@ define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to <4 x double>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[TMP14]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x double> [[TMP16]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP18]], <4 x i64> [[TMP13]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i64> [[TMP7]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP17]], <4 x i64> [[TMP12]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> [[X2]]
@@ -8526,25 +8701,25 @@ define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP10]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP10]] to <4 x double>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[TMP13]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x double> [[TMP15]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB7]]:
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP16]], <4 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP7]], <4 x i64> [[TMP3]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> zeroinitializer
@@ -8566,7 +8741,14 @@ define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[TMP3]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -8584,7 +8766,14 @@ define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[TMP5]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -8614,7 +8803,14 @@ define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i6
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[TMP8]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -12267,8 +12463,7 @@ define <8 x i32> @combine_vpermi2d_vpermps(<16 x i32> noundef %a) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> splat (i32 -1), <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[_MSPROP]], <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> [[_MSPROP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> [[TMP2]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
index 2350d75b29b4..35e1feb3aa20 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
@@ -16,8 +16,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64(<2 x i64> %x0, <2 x i64> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP1]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X1]])
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
@@ -31,8 +30,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) #0 {
 ; CHECK-SAME: <2 x i64> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP1]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X0]])
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
@@ -55,8 +53,14 @@ define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 4>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[T]], <2 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
@@ -80,8 +84,14 @@ define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 2>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[T]], <2 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
@@ -97,8 +107,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64(<4 x i64> %x0, <4 x i64> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP1]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X1]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
@@ -112,8 +121,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) #0 {
 ; CHECK-SAME: <4 x i64> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP1]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X0]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
@@ -136,8 +144,14 @@ define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i64> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <4 x i64> [[M]], <i64 0, i64 8, i64 16, i64 32>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[T]], <4 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
@@ -153,8 +167,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64(<8 x i64> %x0, <8 x i64> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X1]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[R]]
@@ -168,8 +181,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) #0 {
 ; CHECK-SAME: <8 x i64> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X0]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[R]]
@@ -192,8 +204,14 @@ define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <8 x i64> [[M]], <i64 0, i64 16, i64 32, i64 64, i64 256, i64 512, i64 1024, i64 -16>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP6]], <8 x i64> [[T]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[TMP9]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[R]]
@@ -213,8 +231,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32(<4 x i32> %x0, <4 x i32> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP1]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X1]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
@@ -228,8 +245,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) #0 {
 ; CHECK-SAME: <4 x i32> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP1]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X0]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
@@ -252,8 +268,14 @@ define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <4 x i32> [[M]], <i32 0, i32 8, i32 16, i32 32>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[T]], <4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
@@ -269,8 +291,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32(<8 x i32> %x0, <8 x i32> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X1]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
@@ -284,8 +305,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) #0 {
 ; CHECK-SAME: <8 x i32> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X0]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
@@ -308,8 +328,14 @@ define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i32> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <8 x i32> [[M]], <i32 0, i32 16, i32 32, i32 64, i32 256, i32 512, i32 -16, i32 -32>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[T]], <8 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
@@ -325,8 +351,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32(<16 x i32> %x0, <16 x i32> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X1]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
@@ -340,8 +365,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) #0 {
 ; CHECK-SAME: <16 x i32> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X0]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
@@ -364,8 +388,14 @@ define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <16 x i32> [[M]], <i32 0, i32 32, i32 64, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 -32, i32 -64, i32 -128, i32 -256, i32 -512, i32 -1024, i32 -2048>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP6]], <16 x i32> [[T]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[TMP9]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
@@ -385,8 +415,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16(<8 x i16> %x0, <8 x i16> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP1]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X1]])
 ; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
@@ -400,8 +429,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) #0 {
 ; CHECK-SAME: <8 x i16> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP1]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X0]])
 ; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
@@ -424,8 +452,14 @@ define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i16> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <8 x i16> [[M]], <i16 0, i16 16, i16 32, i16 64, i16 256, i16 512, i16 -16, i16 -32>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP6]], <8 x i16> [[T]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[T]], <8 x i16> [[X1]])
 ; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
@@ -441,8 +475,7 @@ define <16 x i16> @shuffle_vpermv3_v16i16(<16 x i16> %x0, <16 x i16> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP1]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X1]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[R]]
@@ -456,8 +489,7 @@ define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) #0 {
 ; CHECK-SAME: <16 x i16> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP1]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X0]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[R]]
@@ -480,8 +512,14 @@ define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i16> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <16 x i16> [[M]], <i16 0, i16 32, i16 64, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 -32, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP6]], <16 x i16> [[T]], <16 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i16> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[T]], <16 x i16> [[X1]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[R]]
@@ -497,8 +535,7 @@ define <32 x i16> @shuffle_vpermv3_v32i16(<32 x i16> %x0, <32 x i16> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X1]])
 ; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[R]]
@@ -512,8 +549,7 @@ define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) #0 {
 ; CHECK-SAME: <32 x i16> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X0]])
 ; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[R]]
@@ -536,8 +572,14 @@ define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <32 x i16> [[M]], <i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096, i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP6]], <32 x i16> [[T]], <32 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <32 x i16> [[TMP9]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[T]], <32 x i16> [[X1]])
 ; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[R]]
@@ -557,8 +599,7 @@ define <16 x i8> @shuffle_vpermv3_v16i8(<16 x i8> %x0, <16 x i8> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP1]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X1]])
 ; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
@@ -572,8 +613,7 @@ define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) #0 {
 ; CHECK-SAME: <16 x i8> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP1]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X0]])
 ; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
@@ -596,8 +636,14 @@ define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i8> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i8> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <16 x i8> [[M]], <i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128, i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP6]], <16 x i8> [[T]], <16 x i8> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[T]], <16 x i8> [[X1]])
 ; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
@@ -613,8 +659,7 @@ define <32 x i8> @shuffle_vpermv3_v32i8(<32 x i8> %x0, <32 x i8> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP1]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X1]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[R]]
@@ -628,8 +673,7 @@ define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) #0 {
 ; CHECK-SAME: <32 x i8> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP1]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X0]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[R]]
@@ -652,8 +696,14 @@ define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i8> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <32 x i8> [[M]], <i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP6]], <32 x i8> [[T]], <32 x i8> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <32 x i8> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[T]], <32 x i8> [[X1]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[R]]
@@ -669,8 +719,7 @@ define <64 x i8> @shuffle_vpermv3_v64i8(<64 x i8> %x0, <64 x i8> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP1]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X1]])
 ; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <64 x i8> [[R]]
@@ -684,8 +733,7 @@ define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) #0 {
 ; CHECK-SAME: <64 x i8> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP1]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X0]])
 ; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <64 x i8> [[R]]
@@ -708,8 +756,14 @@ define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <64 x i8> [[M]], <i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP6]], <64 x i8> [[T]], <64 x i8> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <64 x i8> [[TMP9]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[T]], <64 x i8> [[X1]])
 ; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <64 x i8> [[R]]
@@ -720,3 +774,6 @@ define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x
 }
 
 attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
index 5cc56baf0e0d..9d3e9d63eed2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
@@ -780,8 +780,15 @@ define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[TMP1]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[RES]]
 ;
@@ -1021,8 +1028,15 @@ define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[TMP1]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -1038,18 +1052,18 @@ define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[TMP7]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP10]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
-; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       9:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1]])
+; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x float> [[RES]]
 ;
   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]

From 7f69cd578de899f8b00525a02d1fe25dab567bcf Mon Sep 17 00:00:00 2001
From: Erick Velez <erickvelez7@gmail.com>
Date: Fri, 13 Jun 2025 16:35:30 -0700
Subject: [PATCH 0361/1322] [clang-doc] remove default label on some switches
 (#143919)

LLVM style prefers no default label on fully covered switches to warn if
new enums are added. This patch removes the default label for that
purpose or uses IT_default instead of default if that was the only enum
not covered.
---
 clang-tools-extra/clang-doc/BitcodeReader.cpp         |  4 +---
 clang-tools-extra/clang-doc/BitcodeWriter.cpp         |  2 +-
 clang-tools-extra/clang-doc/Representation.cpp        |  2 +-
 clang-tools-extra/clang-doc/Serialize.cpp             | 11 ++++++++---
 clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp |  2 +-
 5 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/clang-tools-extra/clang-doc/BitcodeReader.cpp b/clang-tools-extra/clang-doc/BitcodeReader.cpp
index 57dd514b90a2..35058abab066 100644
--- a/clang-tools-extra/clang-doc/BitcodeReader.cpp
+++ b/clang-tools-extra/clang-doc/BitcodeReader.cpp
@@ -54,10 +54,8 @@ static llvm::Error decodeRecord(const Record &R, AccessSpecifier &Field,
   case AS_none:
     Field = (AccessSpecifier)R[0];
     return llvm::Error::success();
-  default:
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "invalid value for AccessSpecifier");
   }
+  llvm_unreachable("invalid value for AccessSpecifier");
 }
 
 static llvm::Error decodeRecord(const Record &R, TagTypeKind &Field,
diff --git a/clang-tools-extra/clang-doc/BitcodeWriter.cpp b/clang-tools-extra/clang-doc/BitcodeWriter.cpp
index 708ce09d9e5b..f8a6859169b0 100644
--- a/clang-tools-extra/clang-doc/BitcodeWriter.cpp
+++ b/clang-tools-extra/clang-doc/BitcodeWriter.cpp
@@ -664,7 +664,7 @@ bool ClangDocBitcodeWriter::dispatchInfoForWrite(Info *I) {
   case InfoType::IT_typedef:
     emitBlock(*static_cast<clang::doc::TypedefInfo *>(I));
     break;
-  default:
+  case InfoType::IT_default:
     llvm::errs() << "Unexpected info, unable to write.\n";
     return true;
   }
diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp
index 3ce930c6965d..820d644ef8b8 100644
--- a/clang-tools-extra/clang-doc/Representation.cpp
+++ b/clang-tools-extra/clang-doc/Representation.cpp
@@ -143,7 +143,7 @@ mergeInfos(std::vector<std::unique_ptr<Info>> &Values) {
     return reduce<FunctionInfo>(Values);
   case InfoType::IT_typedef:
     return reduce<TypedefInfo>(Values);
-  default:
+  case InfoType::IT_default:
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "unexpected info type");
   }
diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp
index 3cda38115ff7..e8f1a9cee267 100644
--- a/clang-tools-extra/clang-doc/Serialize.cpp
+++ b/clang-tools-extra/clang-doc/Serialize.cpp
@@ -388,7 +388,8 @@ std::string serialize(std::unique_ptr<Info> &I) {
     return serialize(*static_cast<EnumInfo *>(I.get()));
   case InfoType::IT_function:
     return serialize(*static_cast<FunctionInfo *>(I.get()));
-  default:
+  case InfoType::IT_typedef:
+  case InfoType::IT_default:
     return "";
   }
 }
@@ -525,9 +526,13 @@ static std::unique_ptr<Info> makeAndInsertIntoParent(ChildType Child) {
     InsertChild(ParentRec->Children, std::forward<ChildType>(Child));
     return ParentRec;
   }
-  default:
-    llvm_unreachable("Invalid reference type for parent namespace");
+  case InfoType::IT_default:
+  case InfoType::IT_enum:
+  case InfoType::IT_function:
+  case InfoType::IT_typedef:
+    break;
   }
+  llvm_unreachable("Invalid reference type for parent namespace");
 }
 
 // There are two uses for this function.
diff --git a/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp b/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
index bbe158ed50e2..659870d2a5c0 100644
--- a/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
@@ -37,7 +37,7 @@ static std::string writeInfo(Info *I) {
     return writeInfo(*static_cast<FunctionInfo *>(I));
   case InfoType::IT_typedef:
     return writeInfo(*static_cast<TypedefInfo *>(I));
-  default:
+  case InfoType::IT_default:
     return "";
   }
 }

From 417ab37d85ad1bb3e5623dff487ef108404e37f5 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 17:11:52 -0700
Subject: [PATCH 0362/1322] [ConstantFolding] Fold deinterleave2 of any splat
 vector not just zeroinitializer (#144144)

While there remove an unnecessary dyn_cast from Constant to Constant.
Reverse a branch condition into an early out to reduce nesting.
---
 llvm/lib/Analysis/ConstantFolding.cpp         | 43 +++++++++----------
 .../InstSimplify/ConstProp/vector-calls.ll    | 16 +++++++
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 64a0f4641250..2b7a438a9ef0 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -3990,31 +3990,30 @@ ConstantFoldStructCall(StringRef Name, Intrinsic::ID IntrinsicID,
     return ConstantStruct::get(StTy, SinResult, CosResult);
   }
   case Intrinsic::vector_deinterleave2: {
-    auto *Vec = dyn_cast<Constant>(Operands[0]);
-    if (!Vec)
+    auto *Vec = Operands[0];
+    auto *VecTy = cast<VectorType>(Vec->getType());
+
+    if (auto *EltC = Vec->getSplatValue()) {
+      ElementCount HalfEC = VecTy->getElementCount().divideCoefficientBy(2);
+      auto *HalfVec = ConstantVector::getSplat(HalfEC, EltC);
+      return ConstantStruct::get(StTy, HalfVec, HalfVec);
+    }
+
+    if (!isa<FixedVectorType>(Vec->getType()))
       return nullptr;
 
-    auto *VecTy = cast<VectorType>(Vec->getType());
-    unsigned NumElements = VecTy->getElementCount().getKnownMinValue() / 2;
-    if (isa<ConstantAggregateZero>(Vec)) {
-      auto *HalfVecTy = VectorType::getHalfElementsVectorType(VecTy);
-      return ConstantStruct::get(StTy, ConstantAggregateZero::get(HalfVecTy),
-                                 ConstantAggregateZero::get(HalfVecTy));
+    unsigned NumElements = VecTy->getElementCount().getFixedValue() / 2;
+    SmallVector<Constant *, 4> Res0(NumElements), Res1(NumElements);
+    for (unsigned I = 0; I < NumElements; ++I) {
+      Constant *Elt0 = Vec->getAggregateElement(2 * I);
+      Constant *Elt1 = Vec->getAggregateElement(2 * I + 1);
+      if (!Elt0 || !Elt1)
+        return nullptr;
+      Res0[I] = Elt0;
+      Res1[I] = Elt1;
     }
-    if (isa<FixedVectorType>(Vec->getType())) {
-      SmallVector<Constant *, 4> Res0(NumElements), Res1(NumElements);
-      for (unsigned I = 0; I < NumElements; ++I) {
-        Constant *Elt0 = Vec->getAggregateElement(2 * I);
-        Constant *Elt1 = Vec->getAggregateElement(2 * I + 1);
-        if (!Elt0 || !Elt1)
-          return nullptr;
-        Res0[I] = Elt0;
-        Res1[I] = Elt1;
-      }
-      return ConstantStruct::get(StTy, ConstantVector::get(Res0),
-                                 ConstantVector::get(Res1));
-    }
-    return nullptr;
+    return ConstantStruct::get(StTy, ConstantVector::get(Res0),
+                               ConstantVector::get(Res1));
   }
   default:
     // TODO: Constant folding of vector intrinsics that fall through here does
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
index 9dbe3d4e50ee..14543f339db5 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
@@ -66,3 +66,19 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @fold_scalable_vector_deinterlea
   %1 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<vscale x 8 x i32> zeroinitializer)
   ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %1
 }
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @fold_scalable_vector_deinterleave2_splat() {
+; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @fold_scalable_vector_deinterleave2_splat() {
+; CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } { <vscale x 4 x i32> splat (i32 1), <vscale x 4 x i32> splat (i32 1) }
+;
+  %1 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<vscale x 8 x i32> splat (i32 1))
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %1
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>} @fold_scalable_vector_deinterleave2_splatfp() {
+; CHECK-LABEL: define { <vscale x 4 x float>, <vscale x 4 x float> } @fold_scalable_vector_deinterleave2_splatfp() {
+; CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float> } { <vscale x 4 x float> splat (float 1.000000e+00), <vscale x 4 x float> splat (float 1.000000e+00) }
+;
+  %1 = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.v4f32.v8f32(<vscale x 8 x float> splat (float 1.0))
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %1
+}

From 15f100d1445846cdb55c24e588a74fde522fc9c9 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Fri, 13 Jun 2025 17:17:20 -0700
Subject: [PATCH 0363/1322] [bazel] fix mlir/tblgen.bzl formatting after
 6e988bd33f5fa8a529ef9208d3e147945b7bb7ed

---
 utils/bazel/llvm-project-overlay/mlir/tblgen.bzl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
index 884d6f381b02..89b17735e005 100644
--- a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
+++ b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
@@ -525,7 +525,7 @@ def gentbl_sharded_ops(
         td_file = td_file,
         test = test,
         deps = deps,
-        **kwargs,
+        **kwargs
     )
     all_files = [hdr_out, src_out]
     for i in range(0, shard_count):
@@ -537,13 +537,13 @@ def gentbl_sharded_ops(
             out = out_file,
             sharder = sharder,
             src_file = src_file,
-            **kwargs,
+            **kwargs
         )
         all_files.append(out_file)
     native.filegroup(
         name = name,
         srcs = all_files,
-        **kwargs,
+        **kwargs
     )
 
 def gentbl_sharded_op_defs(name, source_file, shard_count):

From bd319d9071fb0c6e1bda9db500d039d32a49c28a Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Sat, 14 Jun 2025 09:42:39 +0900
Subject: [PATCH 0364/1322] [Cygwin] CYGWIN is not WIN32 in current CMake
 (#143130)

On old CMake, Cygwin were also WIN32 but currently not. LLVM_ON_UNIX=1
and LLVM_HAVE_LINK_VERSION_SCRIPT=0 should be defined for Cygwin target.
---
 llvm/cmake/config-ix.cmake                 | 16 +++++-----------
 llvm/cmake/modules/HandleLLVMOptions.cmake |  6 +++---
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 687f5077cbfd..9895469973e4 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -1,8 +1,3 @@
-if( WIN32 AND NOT CYGWIN )
-  # We consider Cygwin as another Unix
-  set(PURE_WINDOWS 1)
-endif()
-
 include(CheckIncludeFile)
 include(CheckLibraryExists)
 include(CheckSymbolExists)
@@ -31,7 +26,7 @@ elseif (APPLE)
   set(HAVE_SYS_MMAN_H 1)
   set(HAVE_SYSEXITS_H 1)
   set(HAVE_UNISTD_H 1)
-elseif (PURE_WINDOWS)
+elseif (WIN32)
   set(HAVE_MACH_MACH_H 0)
   set(HAVE_MALLOC_MALLOC_H 0)
   set(HAVE_PTHREAD_H 0)
@@ -132,7 +127,7 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
 endif()
 
 # library checks
-if( NOT PURE_WINDOWS )
+if(NOT WIN32)
   check_library_exists(pthread pthread_create "" HAVE_LIBPTHREAD)
   if (HAVE_LIBPTHREAD)
     check_library_exists(pthread pthread_rwlock_init "" HAVE_PTHREAD_RWLOCK_INIT)
@@ -275,7 +270,7 @@ endif()
 # party code may call MSan interceptors like strlen, leading to false positives.
 if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*")
   # Don't look for these libraries on Windows.
-  if (NOT PURE_WINDOWS)
+  if (NOT WIN32)
     # Skip libedit if using ASan as it contains memory leaks.
     if (LLVM_ENABLE_LIBEDIT AND NOT LLVM_USE_SANITIZER MATCHES ".*Address.*")
       if(LLVM_ENABLE_LIBEDIT STREQUAL FORCE_ON)
@@ -384,7 +379,7 @@ check_symbol_exists(sbrk unistd.h HAVE_SBRK)
 check_symbol_exists(strerror_r string.h HAVE_STRERROR_R)
 check_symbol_exists(strerror_s string.h HAVE_DECL_STRERROR_S)
 check_symbol_exists(setenv stdlib.h HAVE_SETENV)
-if( PURE_WINDOWS )
+if(WIN32)
   check_symbol_exists(_chsize_s io.h HAVE__CHSIZE_S)
 
   check_function_exists(_alloca HAVE__ALLOCA)
@@ -420,8 +415,7 @@ else()
       "sys/types.h;sys/stat.h" HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC)
 endif()
 
-# This check requires _GNU_SOURCE.
-if (NOT PURE_WINDOWS)
+if (NOT WIN32)
   if (LLVM_PTHREAD_LIB)
     list(APPEND CMAKE_REQUIRED_LIBRARIES ${LLVM_PTHREAD_LIB})
   endif()
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index c35d9763a330..e2f9826d3981 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -217,15 +217,15 @@ if( LLVM_REVERSE_ITERATION )
   set( LLVM_ENABLE_REVERSE_ITERATION 1 )
 endif()
 
-if(WIN32)
+if(WIN32 OR CYGWIN)
   set(LLVM_HAVE_LINK_VERSION_SCRIPT 0)
   if(CYGWIN)
     set(LLVM_ON_WIN32 0)
     set(LLVM_ON_UNIX 1)
-  else(CYGWIN)
+  else()
     set(LLVM_ON_WIN32 1)
     set(LLVM_ON_UNIX 0)
-  endif(CYGWIN)
+  endif()
 elseif(FUCHSIA OR UNIX)
   set(LLVM_ON_WIN32 0)
   set(LLVM_ON_UNIX 1)

From e37707b1e85cfc07fe75fd6b7e5d41963c52a8ec Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 18:06:03 -0700
Subject: [PATCH 0365/1322] [RISCV] Use unsigned instead of uint16_t for the
 Opcode argument to getVectorLowDemandedScalarBits. NFC

All the callers pass an unsigned and uint16_t arguments are unusual.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index e5d29e1a8b47..107f645709c7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4581,7 +4581,7 @@ bool RISCV::hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2) {
 }
 
 std::optional<unsigned>
-RISCV::getVectorLowDemandedScalarBits(uint16_t Opcode, unsigned Log2SEW) {
+RISCV::getVectorLowDemandedScalarBits(unsigned Opcode, unsigned Log2SEW) {
   switch (Opcode) {
   default:
     return std::nullopt;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 8260949cf918..020be91e90e0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -364,7 +364,7 @@ bool hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2);
 // If \p Opcode is a .vx vector instruction, returns the lower number of bits
 // that are used from the scalar .x operand for a given \p Log2SEW. Otherwise
 // returns null.
-std::optional<unsigned> getVectorLowDemandedScalarBits(uint16_t Opcode,
+std::optional<unsigned> getVectorLowDemandedScalarBits(unsigned Opcode,
                                                        unsigned Log2SEW);
 
 // Returns the MC opcode of RVV pseudo instruction.

From d4c7d0be1f5235555393313bb1f8e46c97f76766 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 19:12:12 -0700
Subject: [PATCH 0366/1322] MCObjectStreamer: Replace getAssemblerPtr with
 getAssembler

In general getAssemblerPtr should only be called by MCParse.
Revert some changes from https://reviews.llvm.org/D45164?id=143128
---
 llvm/lib/MC/MCObjectStreamer.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index e3d5a5a9a132..1bb2143ed6ab 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -261,7 +261,7 @@ void MCObjectStreamer::emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc,
 
 void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) {
   int64_t IntValue;
-  if (Value->evaluateAsAbsolute(IntValue, getAssemblerPtr())) {
+  if (Value->evaluateAsAbsolute(IntValue, getAssembler())) {
     emitULEB128IntValue(IntValue);
     return;
   }
@@ -270,7 +270,7 @@ void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) {
 
 void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) {
   int64_t IntValue;
-  if (Value->evaluateAsAbsolute(IntValue, getAssemblerPtr())) {
+  if (Value->evaluateAsAbsolute(IntValue, getAssembler())) {
     emitSLEB128IntValue(IntValue);
     return;
   }
@@ -727,7 +727,7 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
                                 int64_t Expr, SMLoc Loc) {
   int64_t IntNumValues;
   // Do additional checking now if we can resolve the value.
-  if (NumValues.evaluateAsAbsolute(IntNumValues, getAssemblerPtr())) {
+  if (NumValues.evaluateAsAbsolute(IntNumValues, getAssembler())) {
     if (IntNumValues < 0) {
       getContext().getSourceManager()->PrintMessage(
           Loc, SourceMgr::DK_Warning,

From 709ba084c5632b786f2e6c503d3f9f27e1f1c433 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 22:18:12 -0700
Subject: [PATCH 0367/1322] [RISCV] Use RISCVII::getVecPolicyOpNum instead of
 making assumptions.  NFC (#144175)

---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 107f645709c7..7d868bf6e2ab 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3696,7 +3696,8 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   case CASE_VMA_OPCODE_LMULS(NMSAC, VV): {
     // If the tail policy is undisturbed we can't commute.
     assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags));
-    if ((MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 1) == 0)
+    if ((MI.getOperand(RISCVII::getVecPolicyOpNum(MI.getDesc())).getImm() &
+         1) == 0)
       return false;
 
     // For these instructions we can only swap operand 1 and operand 3 by
@@ -3716,7 +3717,8 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   case CASE_VMA_OPCODE_LMULS(NMSUB, VV): {
     // If the tail policy is undisturbed we can't commute.
     assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags));
-    if ((MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 1) == 0)
+    if ((MI.getOperand(RISCVII::getVecPolicyOpNum(MI.getDesc())).getImm() &
+         1) == 0)
       return false;
 
     // For these instructions we have more freedom. We can commute with the
@@ -4331,7 +4333,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
     // If the tail policy is undisturbed we can't convert.
     assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags) &&
            MI.getNumExplicitOperands() == 6);
-    if ((MI.getOperand(5).getImm() & 1) == 0)
+    if ((MI.getOperand(RISCVII::getVecPolicyOpNum(MI.getDesc())).getImm() &
+         1) == 0)
       return nullptr;
 
     // clang-format off

From ef265ed23038a3719829a08fcbf7384fbdfe0451 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 22:19:04 -0700
Subject: [PATCH 0368/1322] [RISCV] Simplify macros used by
 RISCVInstrInfo::convertToThreeAddress. NFC (#144173)

Merge some macros that are only used once by another macro.
Rename macros to remove _MF4 where not needed.

I suspect these are artifacts from FP being split from integer in the
past.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 36 ++++++++----------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 7d868bf6e2ab..949d78b3940e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4226,38 +4226,32 @@ bool RISCVInstrInfo::simplifyInstruction(MachineInstr &MI) const {
 #define CASE_WIDEOP_OPCODE_COMMON(OP, LMUL)                                    \
   RISCV::PseudoV##OP##_##LMUL##_TIED
 
-#define CASE_WIDEOP_OPCODE_LMULS_MF4(OP)                                       \
-  CASE_WIDEOP_OPCODE_COMMON(OP, MF4):                                          \
+#define CASE_WIDEOP_OPCODE_LMULS(OP)                                           \
+  CASE_WIDEOP_OPCODE_COMMON(OP, MF8):                                          \
+  case CASE_WIDEOP_OPCODE_COMMON(OP, MF4):                                     \
   case CASE_WIDEOP_OPCODE_COMMON(OP, MF2):                                     \
   case CASE_WIDEOP_OPCODE_COMMON(OP, M1):                                      \
   case CASE_WIDEOP_OPCODE_COMMON(OP, M2):                                      \
   case CASE_WIDEOP_OPCODE_COMMON(OP, M4)
 
-#define CASE_WIDEOP_OPCODE_LMULS(OP)                                           \
-  CASE_WIDEOP_OPCODE_COMMON(OP, MF8):                                          \
-  case CASE_WIDEOP_OPCODE_LMULS_MF4(OP)
-
 #define CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, LMUL)                             \
   case RISCV::PseudoV##OP##_##LMUL##_TIED:                                     \
     NewOpc = RISCV::PseudoV##OP##_##LMUL;                                      \
     break;
 
-#define CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP)                                \
+#define CASE_WIDEOP_CHANGE_OPCODE_LMULS(OP)                                    \
+  CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF8)                                    \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4)                                    \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2)                                    \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M1)                                     \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2)                                     \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4)
 
-#define CASE_WIDEOP_CHANGE_OPCODE_LMULS(OP)                                    \
-  CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF8)                                    \
-  CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP)
-
 // FP Widening Ops may by SEW aware. Create SEW aware cases for these cases.
 #define CASE_FP_WIDEOP_OPCODE_COMMON(OP, LMUL, SEW)                            \
   RISCV::PseudoV##OP##_##LMUL##_##SEW##_TIED
 
-#define CASE_FP_WIDEOP_OPCODE_LMULS_MF4(OP)                                    \
+#define CASE_FP_WIDEOP_OPCODE_LMULS(OP)                                        \
   CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF4, E16):                                  \
   case CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF2, E16):                             \
   case CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF2, E32):                             \
@@ -4273,7 +4267,7 @@ bool RISCVInstrInfo::simplifyInstruction(MachineInstr &MI) const {
     NewOpc = RISCV::PseudoV##OP##_##LMUL##_##SEW;                              \
     break;
 
-#define CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP)                             \
+#define CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(OP)                                 \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4, E16)                            \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2, E16)                            \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2, E32)                            \
@@ -4283,9 +4277,6 @@ bool RISCVInstrInfo::simplifyInstruction(MachineInstr &MI) const {
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2, E32)                             \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E16)                             \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E32)                             \
-
-#define CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(OP)                                 \
-  CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP)
 // clang-format on
 
 MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
@@ -4295,8 +4286,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
   switch (MI.getOpcode()) {
   default:
     return nullptr;
-  case CASE_FP_WIDEOP_OPCODE_LMULS_MF4(FWADD_WV):
-  case CASE_FP_WIDEOP_OPCODE_LMULS_MF4(FWSUB_WV): {
+  case CASE_FP_WIDEOP_OPCODE_LMULS(FWADD_WV):
+  case CASE_FP_WIDEOP_OPCODE_LMULS(FWSUB_WV): {
     assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags) &&
            MI.getNumExplicitOperands() == 7 &&
            "Expect 7 explicit operands rd, rs2, rs1, rm, vl, sew, policy");
@@ -4309,8 +4300,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
     switch (MI.getOpcode()) {
     default:
       llvm_unreachable("Unexpected opcode");
-    CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4(FWADD_WV)
-    CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4(FWSUB_WV)
+    CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWADD_WV)
+    CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWSUB_WV)
     }
     // clang-format on
 
@@ -4390,15 +4381,12 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
 }
 
 #undef CASE_WIDEOP_OPCODE_COMMON
-#undef CASE_WIDEOP_OPCODE_LMULS_MF4
 #undef CASE_WIDEOP_OPCODE_LMULS
 #undef CASE_WIDEOP_CHANGE_OPCODE_COMMON
-#undef CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4
 #undef CASE_WIDEOP_CHANGE_OPCODE_LMULS
 #undef CASE_FP_WIDEOP_OPCODE_COMMON
-#undef CASE_FP_WIDEOP_OPCODE_LMULS_MF4
+#undef CASE_FP_WIDEOP_OPCODE_LMULS
 #undef CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON
-#undef CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4
 #undef CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS
 
 void RISCVInstrInfo::mulImm(MachineFunction &MF, MachineBasicBlock &MBB,

From 35e3c50731870cc37a73ef1286a92f49347ccea4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 22:19:25 -0700
Subject: [PATCH 0369/1322] [RISCV] Simplify macros used for commuting vector
 multiply-accumulate instructions. NFC (#144169)

Inline some macros that were only instantiated once.
Remove unused macros.
#undef macros when finished with them
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 65 ++++++++----------------
 1 file changed, 22 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 949d78b3940e..d9ef911b9a32 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3573,24 +3573,15 @@ std::string RISCVInstrInfo::createMIROperandComment(
 #define CASE_VMA_OPCODE_COMMON(OP, TYPE, LMUL)                                 \
   RISCV::PseudoV##OP##_##TYPE##_##LMUL
 
-#define CASE_VMA_OPCODE_LMULS_M1(OP, TYPE)                                     \
-  CASE_VMA_OPCODE_COMMON(OP, TYPE, M1):                                        \
+#define CASE_VMA_OPCODE_LMULS(OP, TYPE)                                        \
+  CASE_VMA_OPCODE_COMMON(OP, TYPE, MF8):                                       \
+  case CASE_VMA_OPCODE_COMMON(OP, TYPE, MF4):                                  \
+  case CASE_VMA_OPCODE_COMMON(OP, TYPE, MF2):                                  \
+  case CASE_VMA_OPCODE_COMMON(OP, TYPE, M1):                                   \
   case CASE_VMA_OPCODE_COMMON(OP, TYPE, M2):                                   \
   case CASE_VMA_OPCODE_COMMON(OP, TYPE, M4):                                   \
   case CASE_VMA_OPCODE_COMMON(OP, TYPE, M8)
 
-#define CASE_VMA_OPCODE_LMULS_MF2(OP, TYPE)                                    \
-  CASE_VMA_OPCODE_COMMON(OP, TYPE, MF2):                                       \
-  case CASE_VMA_OPCODE_LMULS_M1(OP, TYPE)
-
-#define CASE_VMA_OPCODE_LMULS_MF4(OP, TYPE)                                    \
-  CASE_VMA_OPCODE_COMMON(OP, TYPE, MF4):                                       \
-  case CASE_VMA_OPCODE_LMULS_MF2(OP, TYPE)
-
-#define CASE_VMA_OPCODE_LMULS(OP, TYPE)                                        \
-  CASE_VMA_OPCODE_COMMON(OP, TYPE, MF8):                                       \
-  case CASE_VMA_OPCODE_LMULS_MF4(OP, TYPE)
-
 // VFMA instructions are SEW specific.
 #define CASE_VFMA_OPCODE_COMMON(OP, TYPE, LMUL, SEW)                           \
   RISCV::PseudoV##OP##_##TYPE##_##LMUL##_##SEW
@@ -3790,29 +3781,15 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
     Opc = RISCV::PseudoV##NEWOP##_##TYPE##_##LMUL;                             \
     break;
 
-#define CASE_VMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE)                    \
+#define CASE_VMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE)                       \
+  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8)                       \
+  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4)                       \
+  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2)                       \
   CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M1)                        \
   CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M2)                        \
   CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M4)                        \
   CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M8)
 
-#define CASE_VMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE)                   \
-  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE)
-
-#define CASE_VMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE)                   \
-  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE)
-
-#define CASE_VMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE)                       \
-  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE)
-
-#define CASE_VMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP)                            \
-  CASE_VMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VFPR16)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VFPR32)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VFPR64)
-
 // VFMA depends on SEW.
 #define CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, LMUL, SEW)          \
   case RISCV::PseudoV##OLDOP##_##TYPE##_##LMUL##_##SEW:                        \
@@ -3829,18 +3806,14 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2, SEW)                 \
   CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE, SEW)
 
-#define CASE_VFMA_CHANGE_OPCODE_VV(OLDOP, NEWOP)                               \
-  CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VV, E16)                     \
-  CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E32)                     \
-  CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VV, E64)
-
 #define CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE, SEW)             \
   CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4, SEW)                 \
   CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE, SEW)
 
-#define CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE, SEW)                 \
-  CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8, SEW)                 \
-  CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE, SEW)
+#define CASE_VFMA_CHANGE_OPCODE_VV(OLDOP, NEWOP)                               \
+  CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VV, E16)                     \
+  CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E32)                     \
+  CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VV, E64)
 
 #define CASE_VFMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP)                           \
   CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VFPR16, E16)                 \
@@ -3963,6 +3936,15 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
   return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
 }
 
+#undef CASE_VMA_CHANGE_OPCODE_COMMON
+#undef CASE_VMA_CHANGE_OPCODE_LMULS
+#undef CASE_VFMA_CHANGE_OPCODE_COMMON
+#undef CASE_VFMA_CHANGE_OPCODE_LMULS_M1
+#undef CASE_VFMA_CHANGE_OPCODE_LMULS_MF2
+#undef CASE_VFMA_CHANGE_OPCODE_LMULS_MF4
+#undef CASE_VFMA_CHANGE_OPCODE_VV
+#undef CASE_VFMA_CHANGE_OPCODE_SPLATS
+
 #undef CASE_RVV_OPCODE_UNMASK_LMUL
 #undef CASE_RVV_OPCODE_MASK_LMUL
 #undef CASE_RVV_OPCODE_LMUL
@@ -3974,9 +3956,6 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
 #undef CASE_RVV_OPCODE
 
 #undef CASE_VMA_OPCODE_COMMON
-#undef CASE_VMA_OPCODE_LMULS_M1
-#undef CASE_VMA_OPCODE_LMULS_MF2
-#undef CASE_VMA_OPCODE_LMULS_MF4
 #undef CASE_VMA_OPCODE_LMULS
 #undef CASE_VFMA_OPCODE_COMMON
 #undef CASE_VFMA_OPCODE_LMULS_M1

From 0bd614a8ee11cfc5cee8719b3209f40b163d5a62 Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Sat, 14 Jun 2025 14:36:14 +0900
Subject: [PATCH 0370/1322] [Cygwin] Don't use version script for Cygwin target
 (#143133)

Cygwin is a COFF platform and does not support version-script.
I guess I should use LLVM_HAVE_LINK_VERSION_SCRIPT here, but I don't
know why this is not currently the case.
---
 llvm/tools/llvm-shlib/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-shlib/CMakeLists.txt b/llvm/tools/llvm-shlib/CMakeLists.txt
index 089255f36104..9a2015f61f2b 100644
--- a/llvm/tools/llvm-shlib/CMakeLists.txt
+++ b/llvm/tools/llvm-shlib/CMakeLists.txt
@@ -66,7 +66,7 @@ if(LLVM_BUILD_LLVM_DYLIB)
     else()
       # GNU ld doesn't resolve symbols in the version script.
       set(LIB_NAMES -Wl,--whole-archive ${LIB_NAMES} -Wl,--no-whole-archive)
-      if (NOT LLVM_LINKER_IS_SOLARISLD AND NOT MINGW)
+      if (NOT LLVM_LINKER_IS_SOLARISLD AND NOT MINGW AND NOT CYGWIN)
         # Solaris ld does not accept global: *; so there is no way to version *all* global symbols
         set(LIB_NAMES -Wl,--version-script,${LLVM_LIBRARY_DIR}/tools/llvm-shlib/simple_version_script.map ${LIB_NAMES})
       endif()

From 07fa6d1d90c714fa269529c3e5004a063d814c4a Mon Sep 17 00:00:00 2001
From: Konstantin Bogdanov <thevar1able@users.noreply.github.com>
Date: Sat, 14 Jun 2025 09:32:54 +0300
Subject: [PATCH 0371/1322] [InstCombine] Avoid folding `select(umin(X, Y), X)`
 with min/max values in false arm (#143020)

Fixes https://github.com/llvm/llvm-project/issues/139050.

This patch adds a check to avoid folding min/max reduction into select, which may block loop vectorization.

The issue is that the following snippet:
```
declare i8 @llvm.umin.i8(i8, i8)

define i8 @masked_min_fold_bug(i8 %acc, i8 %val, i8 %mask) {
; CHECK-LABEL: @masked_min_fold_bug(
; CHECK:       %cond = icmp eq i8 %mask, 0
; CHECK:       %masked_val = select i1 %cond, i8 %val, i8 255
; CHECK:       call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
;
  %cond = icmp eq i8 %mask, 0
  %masked_val = select i1 %cond, i8 %val, i8 255
  %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
  ret i8 %res
}
```

is being optimized to the following code, which can not be vectorized
later.
```
declare i8 @llvm.umin.i8(i8, i8) #0

define i8 @masked_min_fold_bug(i8 %acc, i8 %val, i8 %mask) {
  %cond = icmp eq i8 %mask, 0
  %1 = call i8 @llvm.umin.i8(i8 %acc, i8 %val)
  %res = select i1 %cond, i8 %1, i8 %acc
  ret i8 %res
}

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
```

Expected:
```
declare i8 @llvm.umin.i8(i8, i8) #0

define i8 @masked_min_fold_bug(i8 %acc, i8 %val, i8 %mask) {
  %cond = icmp eq i8 %mask, 0
  %masked_val = select i1 %cond, i8 %val, i8 -1
  %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
  ret i8 %res
}

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
```

https://godbolt.org/z/cYMheKE5r
---
 .../InstCombine/InstructionCombining.cpp      |  9 ++++
 llvm/test/Transforms/InstCombine/select.ll    | 47 +++++++++++++++++
 .../PhaseOrdering/X86/vector-reductions.ll    | 50 ++++++++++++++-----
 3 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 29582939fa06..4fe900e9421f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1739,6 +1739,15 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
   if (SI->getType()->isIntOrIntVectorTy(1))
     return nullptr;
 
+  // Avoid breaking min/max reduction pattern,
+  // which is necessary for vectorization later.
+  if (isa<MinMaxIntrinsic>(&Op))
+    for (Value *IntrinOp : Op.operands())
+      if (auto *PN = dyn_cast<PHINode>(IntrinOp))
+        for (Value *PhiOp : PN->operands())
+          if (PhiOp == &Op)
+            return nullptr;
+
   // Test if a FCmpInst instruction is used exclusively by a select as
   // part of a minimum or maximum operation. If so, refrain from doing
   // any other folding. This helps out other analyses which understand
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index e16f6ad2cfc9..ef5874ffd46a 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -5047,3 +5047,50 @@ define <2 x ptr> @select_freeze_constant_expression_vector_gep(i1 %cond, <2 x pt
   %sel = select i1 %cond, <2 x ptr> %y, <2 x ptr> %freeze
   ret <2 x ptr> %sel
 }
+
+define void @no_fold_masked_min_loop(ptr nocapture readonly %vals, ptr nocapture readonly %masks, ptr nocapture %out, i64 %n) {
+; CHECK-LABEL: @no_fold_masked_min_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXT_INDEX:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ACC:%.*]] = phi i8 [ -1, [[ENTRY]] ], [ [[RES:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[VAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[VALS:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[MASK_PTR:%.*]] = getelementptr inbounds i8, ptr [[MASKS:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[VAL_PTR]], align 1
+; CHECK-NEXT:    [[MASK:%.*]] = load i8, ptr [[MASK_PTR]], align 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[MASK]], 0
+; CHECK-NEXT:    [[MASKED_VAL:%.*]] = select i1 [[COND]], i8 [[VAL]], i8 -1
+; CHECK-NEXT:    [[RES]] = call i8 @llvm.umin.i8(i8 [[ACC]], i8 [[MASKED_VAL]])
+; CHECK-NEXT:    [[NEXT_INDEX]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXT_INDEX]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store i8 [[RES]], ptr [[OUT:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [0, %entry], [%next_index, %loop]
+  %acc = phi i8 [255, %entry], [%res, %loop]
+
+  %val_ptr = getelementptr inbounds i8, ptr %vals, i64 %index
+  %mask_ptr = getelementptr inbounds i8, ptr %masks, i64 %index
+
+  %val = load i8, ptr %val_ptr, align 1
+  %mask = load i8, ptr %mask_ptr, align 1
+
+  %cond = icmp eq i8 %mask, 0
+  %masked_val = select i1 %cond, i8 %val, i8 -1
+  %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
+
+  %next_index = add i64 %index, 1
+  %done = icmp eq i64 %next_index, %n
+  br i1 %done, label %exit, label %loop
+
+exit:
+  store i8 %res, ptr %out, align 1
+  ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index f8450766037b..2ec48a8637da 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -326,26 +326,52 @@ cleanup:
   ret i1 %retval.0
 }
 
-; From https://github.com/llvm/llvm-project/issues/139050.
-; FIXME: This should be vectorized.
 define i8 @masked_min_reduction(ptr %data, ptr %mask) {
 ; CHECK-LABEL: @masked_min_reduction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       loop:
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACC:%.*]] = phi i8 [ -1, [[ENTRY]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[DATA:%.*]] = getelementptr i8, ptr [[DATA1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[DATA]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[DATA]], i64 32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DATA]], i64 64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DATA]], i64 96
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[DATA]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <32 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[MASK:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[M:%.*]] = load i8, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[M]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.umin.i8(i8 [[ACC]], i8 [[VAL]])
-; CHECK-NEXT:    [[TMP21]] = select i1 [[COND]], i8 [[TMP0]], i8 [[ACC]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <32 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD10]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP8]], <32 x i8> [[WIDE_LOAD]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP13:%.*]] = select <32 x i1> [[TMP9]], <32 x i8> [[WIDE_LOAD4]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP14:%.*]] = select <32 x i1> [[TMP10]], <32 x i8> [[WIDE_LOAD5]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP15:%.*]] = select <32 x i1> [[TMP11]], <32 x i8> [[WIDE_LOAD6]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP16]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI]], <32 x i8> [[TMP12]])
+; CHECK-NEXT:    [[TMP17]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI1]], <32 x i8> [[TMP13]])
+; CHECK-NEXT:    [[TMP18]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI2]], <32 x i8> [[TMP14]])
+; CHECK-NEXT:    [[TMP19]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI3]], <32 x i8> [[TMP15]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP20]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
-; CHECK:       exit:
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP16]], <32 x i8> [[TMP17]])
+; CHECK-NEXT:    [[RDX_MINMAX11:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[RDX_MINMAX]], <32 x i8> [[TMP18]])
+; CHECK-NEXT:    [[RDX_MINMAX12:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[RDX_MINMAX11]], <32 x i8> [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[RDX_MINMAX12]])
 ; CHECK-NEXT:    ret i8 [[TMP21]]
 ;
 entry:

From 2796c412499a276ad23ae184daac33175c32424f Mon Sep 17 00:00:00 2001
From: Kunqiu Chen <camsyn@foxmail.com>
Date: Sat, 14 Jun 2025 14:59:36 +0800
Subject: [PATCH 0372/1322] [MSan] Fix minor issues in testcases (#144073)

Previously,
1. ifaddrs.cpp : mistake `size_t (xxx)` as `sizeof (xxx)`, resulting in
inadequate checks.
2. qsort.cpp : mistake `kSize2` as `kSize1`, resulting in an unexpected
buffer overlow issue.
---
 compiler-rt/test/msan/ifaddrs.cpp | 10 +++++-----
 compiler-rt/test/msan/qsort.cpp   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/compiler-rt/test/msan/ifaddrs.cpp b/compiler-rt/test/msan/ifaddrs.cpp
index 91730a01f2d8..e06775db3251 100644
--- a/compiler-rt/test/msan/ifaddrs.cpp
+++ b/compiler-rt/test/msan/ifaddrs.cpp
@@ -16,10 +16,10 @@
 
 #include <sanitizer/msan_interface.h>
 
-#define CHECK_AND_PUSH(addr, size)                                \
-  if (addr) {                                                     \
-    assert(-1 == __msan_test_shadow(addr, sizeof(size)));         \
-    ranges.push_back(std::make_pair((void *)addr, (size_t)size)); \
+#define CHECK_AND_PUSH(addr, size)                                             \
+  if (addr) {                                                                  \
+    assert(-1 == __msan_test_shadow(addr, (size_t)(size)));                    \
+    ranges.push_back(std::make_pair((void *)addr, (size_t)size));              \
   }
 
 int main(int argc, char *argv[]) {
@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
   assert(res == 0);
   assert(-1 == __msan_test_shadow(&ifas, sizeof(ifaddrs *)));
 
-  std::vector<std::pair<void *, size_t> > ranges;
+  std::vector<std::pair<void *, size_t>> ranges;
   ifaddrs *p = ifas;
   while (p) {
     CHECK_AND_PUSH(p, sizeof(ifaddrs));
diff --git a/compiler-rt/test/msan/qsort.cpp b/compiler-rt/test/msan/qsort.cpp
index af287ed64357..93e6845e1ea7 100644
--- a/compiler-rt/test/msan/qsort.cpp
+++ b/compiler-rt/test/msan/qsort.cpp
@@ -52,7 +52,7 @@ int compar1(const void *a, const void *b) {
   // kind of random
   for (int i = 0; i < kSize2; ++i)
     p[i] = i * 2 + (i % 3 - 1) * 3;
-  qsort(p, kSize1, sizeof(long), compar2);
+  qsort(p, kSize2, sizeof(long), compar2);
   __msan_check_mem_is_initialized(p, sizeof(long) * kSize2);
   delete[] p;
 

From 2e7fbb94bc268d37996408a525781961989d8627 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Sat, 14 Jun 2025 00:21:39 -0700
Subject: [PATCH 0373/1322] [clang-format] Fix a bug in annotating braces
 (#144095)

Stop looking for function decls after hitting a BK_BracedInit brace.

Fixes #144057.
---
 clang/lib/Format/TokenAnnotator.cpp           | 2 +-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index aed1672afac6..d2f8b2703a9a 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -3978,7 +3978,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const {
   for (auto *Tok = FirstNonComment && FirstNonComment->isNot(tok::kw_using)
                        ? FirstNonComment->Next
                        : nullptr;
-       Tok; Tok = Tok->Next) {
+       Tok && Tok->isNot(BK_BracedInit); Tok = Tok->Next) {
     if (Tok->is(TT_StartOfName))
       SeenName = true;
     if (Tok->Previous->EndsCppAttributeGroup)
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 873c6c492d18..a1285e4bc9bf 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3754,6 +3754,13 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   ASSERT_EQ(Tokens.size(), 9u) << Tokens;
   EXPECT_BRACE_KIND(Tokens[4], BK_BracedInit);
   EXPECT_BRACE_KIND(Tokens[6], BK_BracedInit);
+
+  Tokens = annotate("auto f1{&T::operator()};");
+  ASSERT_EQ(Tokens.size(), 12u) << Tokens;
+  EXPECT_BRACE_KIND(Tokens[2], BK_BracedInit);
+  // Not TT_FunctionDeclarationName.
+  EXPECT_TOKEN(Tokens[6], tok::kw_operator, TT_Unknown);
+  EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsElaboratedTypeSpecifier) {

From f46c44dbc0d225277178cf5b6646a96f591fdeaa Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002@gmail.com>
Date: Sat, 14 Jun 2025 10:55:42 +0300
Subject: [PATCH 0374/1322] [clang-tidy][NFC] change patterns 'anyOf(...,
 anything())' to 'optionally(...)' (#143558)

Writing `optionally()` instead of `anyOf(..., anything())` lowers code
size and gives the author's intention better.
---
 .../bugprone/NotNullTerminatedResultCheck.cpp | 21 +++++++++----------
 .../hicpp/ExceptionBaseclassCheck.cpp         | 12 +++++------
 .../clang-tidy/misc/StaticAssertCheck.cpp     | 13 +++++-------
 .../modernize/UseBoolLiteralsCheck.cpp        | 13 ++++++------
 .../ImplicitBoolConversionCheck.cpp           |  4 ++--
 5 files changed, 28 insertions(+), 35 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
index bedecb60569e..203170d55f69 100644
--- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
@@ -702,17 +702,16 @@ void NotNullTerminatedResultCheck::registerMatchers(MatchFinder *Finder) {
     return hasArgument(
         CC.LengthPos,
         allOf(
-            anyOf(
-                ignoringImpCasts(integerLiteral().bind(WrongLengthExprName)),
-                allOf(unless(hasDefinition(SizeOfCharExpr)),
-                      allOf(CC.WithIncrease
-                                ? ignoringImpCasts(hasDefinition(HasIncOp))
-                                : ignoringImpCasts(allOf(
-                                      unless(hasDefinition(HasIncOp)),
-                                      anyOf(hasDefinition(binaryOperator().bind(
-                                                UnknownLengthName)),
-                                            hasDefinition(anything())))),
-                            AnyOfWrongLengthInit))),
+            anyOf(ignoringImpCasts(integerLiteral().bind(WrongLengthExprName)),
+                  allOf(unless(hasDefinition(SizeOfCharExpr)),
+                        allOf(CC.WithIncrease
+                                  ? ignoringImpCasts(hasDefinition(HasIncOp))
+                                  : ignoringImpCasts(
+                                        allOf(unless(hasDefinition(HasIncOp)),
+                                              hasDefinition(optionally(
+                                                  binaryOperator().bind(
+                                                      UnknownLengthName))))),
+                              AnyOfWrongLengthInit))),
             expr().bind(LengthExprName)));
   };
 
diff --git a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp
index 2b2acfdf5b08..ed39568ea554 100644
--- a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp
+++ b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp
@@ -24,14 +24,12 @@ void ExceptionBaseclassCheck::registerMatchers(MatchFinder *Finder) {
                   isSameOrDerivedFrom(hasName("::std::exception")))))))))),
           // This condition is always true, but will bind to the
           // template value if the thrown type is templated.
-          anyOf(has(expr(
-                    hasType(substTemplateTypeParmType().bind("templ_type")))),
-                anything()),
+          optionally(has(
+              expr(hasType(substTemplateTypeParmType().bind("templ_type"))))),
           // Bind to the declaration of the type of the value that
-          // is thrown. 'anything()' is necessary to always succeed
-          // in the 'eachOf' because builtin types are not
-          // 'namedDecl'.
-          eachOf(has(expr(hasType(namedDecl().bind("decl")))), anything()))
+          // is thrown. 'optionally' is necessary because builtin types
+          // are not 'namedDecl'.
+          optionally(has(expr(hasType(namedDecl().bind("decl"))))))
           .bind("bad_throw"),
       this);
 }
diff --git a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp
index faff1c17fc61..37fbd8c0d725 100644
--- a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp
@@ -38,8 +38,7 @@ void StaticAssertCheck::registerMatchers(MatchFinder *Finder) {
       binaryOperator(
           hasAnyOperatorName("&&", "=="),
           hasEitherOperand(ignoringImpCasts(stringLiteral().bind("assertMSG"))),
-          anyOf(binaryOperator(hasEitherOperand(IsAlwaysFalseWithCast)),
-                anything()))
+          optionally(binaryOperator(hasEitherOperand(IsAlwaysFalseWithCast))))
           .bind("assertExprRoot"),
       IsAlwaysFalse);
   auto NonConstexprFunctionCall =
@@ -52,12 +51,10 @@ void StaticAssertCheck::registerMatchers(MatchFinder *Finder) {
   auto NonConstexprCode =
       expr(anyOf(NonConstexprFunctionCall, NonConstexprVariableReference));
   auto AssertCondition =
-      expr(
-          anyOf(expr(ignoringParenCasts(anyOf(
-                    AssertExprRoot, unaryOperator(hasUnaryOperand(
-                                        ignoringParenCasts(AssertExprRoot)))))),
-                anything()),
-          unless(NonConstexprCode), unless(hasDescendant(NonConstexprCode)))
+      expr(optionally(expr(ignoringParenCasts(anyOf(
+               AssertExprRoot, unaryOperator(hasUnaryOperand(
+                                   ignoringParenCasts(AssertExprRoot))))))),
+           unless(NonConstexprCode), unless(hasDescendant(NonConstexprCode)))
           .bind("condition");
   auto Condition =
       anyOf(ignoringParenImpCasts(callExpr(
diff --git a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp
index c8e6bf47bb82..339462093a6d 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp
@@ -26,13 +26,12 @@ void UseBoolLiteralsCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
 
 void UseBoolLiteralsCheck::registerMatchers(MatchFinder *Finder) {
   Finder->addMatcher(
-      traverse(
-          TK_AsIs,
-          implicitCastExpr(
-              has(ignoringParenImpCasts(integerLiteral().bind("literal"))),
-              hasImplicitDestinationType(qualType(booleanType())),
-              unless(isInTemplateInstantiation()),
-              anyOf(hasParent(explicitCastExpr().bind("cast")), anything()))),
+      traverse(TK_AsIs,
+               implicitCastExpr(
+                   has(ignoringParenImpCasts(integerLiteral().bind("literal"))),
+                   hasImplicitDestinationType(qualType(booleanType())),
+                   unless(isInTemplateInstantiation()),
+                   optionally(hasParent(explicitCastExpr().bind("cast"))))),
       this);
 
   Finder->addMatcher(
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index f9fd1d903e23..20c73299915a 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -348,8 +348,8 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
               implicitCastExpr().bind("implicitCastFromBool"),
               unless(hasParent(BitfieldConstruct)),
               // Check also for nested casts, for example: bool -> int -> float.
-              anyOf(hasParent(implicitCastExpr().bind("furtherImplicitCast")),
-                    anything()),
+              optionally(
+                  hasParent(implicitCastExpr().bind("furtherImplicitCast"))),
               unless(isInTemplateInstantiation()),
               unless(IsInCompilerGeneratedFunction))),
       this);

From 892513e51864f3e21120eab87c0c5a6aa37cae31 Mon Sep 17 00:00:00 2001
From: Zhikai Zeng <backlight.zzk@gmail.com>
Date: Sat, 14 Jun 2025 17:14:16 +0800
Subject: [PATCH 0375/1322] [clang] fix infinite recursion (#143244)

fix https://github.com/llvm/llvm-project/issues/141789

The direct cause of infinite recursion is that `T` is changing from
`struct X` and `S<X>` infinitely, this pr add a check that if `T`
visited before then return false directly.

```plaintext
/home/backlight/llvm-project/clang/lib/Sema/SemaDeclCXX.cpp:7196] FD->getType().getAsString()=struct X, T.getAsString()=S<X>, FD->getType().getCanonicalType().getUnqualifiedType().getAsString()=struct X, CanUnqualT.getAsString()=struct S<struct X>,
/home/backlight/llvm-project/clang/lib/Sema/SemaDeclCXX.cpp:7196] FD->getType().getAsString()=S<X>, T.getAsString()=struct X, FD->getType().getCanonicalType().getUnqualifiedType().getAsString()=struct S<struct X>, CanUnqualT.getAsString()=struct X,
```

https://github.com/llvm/llvm-project/pull/104829 fix similar infinite
recursion, but I think it is no longer needed so I kind of revert it.
---
 clang/docs/ReleaseNotes.rst     |  1 +
 clang/lib/Sema/SemaDeclCXX.cpp  | 14 ++++++--------
 clang/test/SemaCXX/gh102293.cpp | 17 +++++++++++++++++
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9ab8031b9ea8..33ee8a53b5f3 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -703,6 +703,7 @@ Bug Fixes in This Version
   the second clause of a C-style ``for`` loop. (#GH139818)
 - Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. (#GH143168)
 - Fixed incorrect token location when emitting diagnostics for tokens expanded from macros. (#GH143216)
+- Fixed an infinite recursion when checking constexpr destructors. (#GH141789)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 31e283433674..6f62c53aaf04 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -7159,7 +7159,10 @@ void Sema::CheckCompletedCXXClass(Scope *S, CXXRecordDecl *Record) {
     // "effectively constexpr" for better compatibility.
     // See https://github.com/llvm/llvm-project/issues/102293 for more info.
     if (isa<CXXDestructorDecl>(M)) {
-      auto Check = [](QualType T, auto &&Check) -> bool {
+      llvm::SmallDenseSet<QualType> Visited;
+      auto Check = [&Visited](QualType T, auto &&Check) -> bool {
+        if (!Visited.insert(T->getCanonicalTypeUnqualified()).second)
+          return false;
         const CXXRecordDecl *RD =
             T->getBaseElementTypeUnsafe()->getAsCXXRecordDecl();
         if (!RD || !RD->isCompleteDefinition())
@@ -7168,16 +7171,11 @@ void Sema::CheckCompletedCXXClass(Scope *S, CXXRecordDecl *Record) {
         if (!RD->hasConstexprDestructor())
           return false;
 
-        QualType CanUnqualT = T.getCanonicalType().getUnqualifiedType();
         for (const CXXBaseSpecifier &B : RD->bases())
-          if (B.getType().getCanonicalType().getUnqualifiedType() !=
-                  CanUnqualT &&
-              !Check(B.getType(), Check))
+          if (!Check(B.getType(), Check))
             return false;
         for (const FieldDecl *FD : RD->fields())
-          if (FD->getType().getCanonicalType().getUnqualifiedType() !=
-                  CanUnqualT &&
-              !Check(FD->getType(), Check))
+          if (!Check(FD->getType(), Check))
             return false;
         return true;
       };
diff --git a/clang/test/SemaCXX/gh102293.cpp b/clang/test/SemaCXX/gh102293.cpp
index d4218cc13dce..fe417e697841 100644
--- a/clang/test/SemaCXX/gh102293.cpp
+++ b/clang/test/SemaCXX/gh102293.cpp
@@ -45,3 +45,20 @@ class quux : quux { // expected-error {{base class has incomplete type}} \
   virtual int c();
 };
 }
+
+// Ensure we don't get infinite recursion from the check, however. See GH141789
+namespace GH141789 {
+template <typename Ty>
+struct S {
+  Ty t; // expected-error {{field has incomplete type 'GH141789::X'}}
+};
+
+struct T {
+  ~T();
+};
+
+struct X { // expected-note {{definition of 'GH141789::X' is not complete until the closing '}'}}
+  S<X> next; // expected-note {{in instantiation of template class 'GH141789::S<GH141789::X>' requested here}}
+  T m;
+};
+}

From 732ebf803b80a8a3fc3aaaceb600cebdf659118e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 14 Jun 2025 10:44:20 +0100
Subject: [PATCH 0376/1322] [VPlan] Address post-commit comments for
 f68848015f62.

Assign sentinel value to named variable to clarify naming and update
comments.

Addresses post-commit comments from
https://github.com/llvm/llvm-project/pull/142291.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 20 +++++++++----------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 +++++-----
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 93f53996425d..7c006ae326ec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7263,14 +7263,13 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
                  RdxDesc.getRecurrenceKind())) {
     Value *StartV = getStartValueFromReductionResult(EpiRedResult);
+    Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
     using namespace llvm::PatternMatch;
     Value *Cmp, *OrigResumeV, *CmpOp;
     bool IsExpectedPattern =
         match(MainResumeValue,
-              m_Select(
-                  m_OneUse(m_Value(Cmp)),
-                  m_Specific(EpiRedResult->getOperand(2)->getLiveInIRValue()),
-                  m_Value(OrigResumeV))) &&
+              m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
+                       m_Value(OrigResumeV))) &&
         (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
                                    m_Value(CmpOp))) &&
          ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
@@ -9224,11 +9223,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
             RdxDesc.getRecurrenceKind())) {
       VPValue *Start = PhiR->getStartValue();
-      FinalReductionResult = Builder.createNaryOp(
-          VPInstruction::ComputeFindLastIVResult,
-          {PhiR, Start, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()),
-           NewExitingVPV},
-          ExitDL);
+      VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
+      FinalReductionResult =
+          Builder.createNaryOp(VPInstruction::ComputeFindLastIVResult,
+                               {PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
     } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                    RdxDesc.getRecurrenceKind())) {
       VPValue *Start = PhiR->getStartValue();
@@ -9816,8 +9814,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
         BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
         IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
         Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
-        ResumeV = Builder.CreateSelect(
-            Cmp, RdxResult->getOperand(2)->getLiveInIRValue(), ResumeV);
+        Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
+        ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
       } else {
         VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
         auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d59cec892d40..c64bda167b85 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -643,8 +643,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
     assert(!PhiR->isInLoop() &&
            "In-loop FindLastIV reduction is not supported yet");
 
-    // The recipe's operands are the reduction phi, followed by one operand for
-    // each part of the reduction.
+    // The recipe's operands are the reduction phi, the start value, the
+    // sentinel value, followed by one operand for each part of the reduction.
     unsigned UF = getNumOperands() - 3;
     Value *ReducedPartRdx = State.get(getOperand(3));
     for (unsigned Part = 1; Part < UF; ++Part) {
@@ -652,9 +652,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                       State.get(getOperand(3 + Part)));
     }
 
-    return createFindLastIVReduction(Builder, ReducedPartRdx,
-                                     State.get(getOperand(1), true),
-                                     getOperand(2)->getLiveInIRValue());
+    Value *Start = State.get(getOperand(1), true);
+    Value *Sentinel = getOperand(2)->getLiveInIRValue();
+    return createFindLastIVReduction(Builder, ReducedPartRdx, Start, Sentinel);
   }
   case VPInstruction::ComputeReductionResult: {
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary

From 1bc0b08e19788f2b34f46b183e89f5049468da2a Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Sat, 14 Jun 2025 19:02:42 +0900
Subject: [PATCH 0377/1322] CMake: Fix LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 to be 1 or 0.

It has been introduced in #107278 but it was passing
"DISABLED" of LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING to cmakedefine01.

cmakadefine01 treats non-false-like strings as 1.
"DISABLED" is replaced with 1.
---
 llvm/cmake/modules/HandleLLVMOptions.cmake | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index e2f9826d3981..743eb6f5529f 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -204,10 +204,13 @@ elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE_AND_
   set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
   set( LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 1 )
 elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "DISABLED" OR NOT DEFINED LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING )
-  # The DISABLED setting is default and requires no additional defines.
+  # The DISABLED setting is default.
+  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0 )
 else()
   message(FATAL_ERROR "Unknown value for LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING: \"${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}\"!")
 endif()
+# LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING (non-cached) is expected to be
+# 1 or 0 here, assuming referenced in #cmakedefine01.
 
 if(LLVM_EXPERIMENTAL_KEY_INSTRUCTIONS)
   add_compile_definitions(EXPERIMENTAL_KEY_INSTRUCTIONS)

From 64640667871990e4d73ae6221b9c4f05d0b36ea6 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Sat, 14 Jun 2025 13:26:03 +0200
Subject: [PATCH 0378/1322] [CIR] Upstream CreateOp for ComplexType with folder
 (#143192)

This change adds support for the create op for ComplexType with folder
and support for empty init list

https://github.com/llvm/llvm-project/issues/141365
---
 .../include/clang/CIR/Dialect/IR/CIRAttrs.td  |   4 +-
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |  32 ++++
 .../include/clang/CIR/Dialect/IR/CIRTypes.td  |   3 +-
 clang/include/clang/CIR/MissingFeatures.h     |   1 -
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |   6 +
 clang/lib/CIR/CodeGen/CIRGenDecl.cpp          |  12 +-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          |  11 ++
 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp   |  79 ++++++++++
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |   9 ++
 clang/lib/CIR/CodeGen/CMakeLists.txt          |   1 +
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |  27 ++++
 .../Dialect/Transforms/CIRCanonicalize.cpp    |   5 +-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  48 +++++-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   |  10 ++
 clang/test/CIR/CodeGen/complex.cpp            | 149 ++++++++++++++++++
 .../CIR/Transforms/complex-create-fold.cir    |  30 ++++
 16 files changed, 415 insertions(+), 12 deletions(-)
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
 create mode 100644 clang/test/CIR/Transforms/complex-create-fold.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index d22d265e8242..b48f4ed461cc 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -307,9 +307,9 @@ def ConstComplexAttr : CIR_Attr<"ConstComplex", "const_complex",
   );
 
   let builders = [
-    AttrBuilderWithInferredContext<(ins "cir::ComplexType":$type,
-                                        "mlir::TypedAttr":$real,
+    AttrBuilderWithInferredContext<(ins "mlir::TypedAttr":$real,
                                         "mlir::TypedAttr":$imag), [{
+      auto type = cir::ComplexType::get(real.getType());
       return $_get(type.getContext(), type, real, imag);
     }]>,
   ];
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 194153caa927..bd36d228578b 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2353,4 +2353,36 @@ def BaseClassAddrOp : CIR_Op<"base_class_addr"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// ComplexCreateOp
+//===----------------------------------------------------------------------===//
+
+def ComplexCreateOp : CIR_Op<"complex.create", [Pure, SameTypeOperands]> {
+  let summary = "Create a complex value from its real and imaginary parts";
+  let description = [{
+    The `cir.complex.create` operation takes two operands that represent the
+    real and imaginary part of a complex number, and yields the complex number.
+
+    ```mlir
+    %0 = cir.const #cir.fp<1.000000e+00> : !cir.double
+    %1 = cir.const #cir.fp<2.000000e+00> : !cir.double
+    %2 = cir.complex.create %0, %1 : !cir.double -> !cir.complex<!cir.double>
+    ```
+  }];
+
+  let results = (outs CIR_ComplexType:$result);
+  let arguments = (ins
+    CIR_AnyIntOrFloatType:$real,
+    CIR_AnyIntOrFloatType:$imag
+  );
+
+  let assemblyFormat = [{
+    $real `,` $imag
+    `:` qualified(type($real)) `->` qualified(type($result)) attr-dict
+  }];
+
+  let hasVerifier = 1;
+  let hasFolder = 1;
+}
+
 #endif // CLANG_CIR_DIALECT_IR_CIROPS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
index fb9697607513..41d7d725a09e 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
@@ -600,7 +600,8 @@ def CIRRecordType : Type<
 
 def CIR_AnyType : AnyTypeOf<[
   CIR_VoidType, CIR_BoolType, CIR_ArrayType, CIR_VectorType, CIR_IntType,
-  CIR_AnyFloatType, CIR_PointerType, CIR_FuncType, CIR_RecordType
+  CIR_AnyFloatType, CIR_PointerType, CIR_FuncType, CIR_RecordType,
+  CIR_ComplexType
 ]>;
 
 #endif // MLIR_CIR_DIALECT_CIR_TYPES
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 225e9ec89a82..13ddc77835fb 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -251,7 +251,6 @@ struct MissingFeatures {
   // Future CIR operations
   static bool awaitOp() { return false; }
   static bool callOp() { return false; }
-  static bool complexCreateOp() { return false; }
   static bool complexImagOp() { return false; }
   static bool complexRealOp() { return false; }
   static bool ifOp() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index adf7cb77f1a5..e38faba83b80 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -360,6 +360,12 @@ public:
     return CIRBaseBuilderTy::createStore(loc, val, dst.getPointer(), align);
   }
 
+  mlir::Value createComplexCreate(mlir::Location loc, mlir::Value real,
+                                  mlir::Value imag) {
+    auto resultComplexTy = cir::ComplexType::get(real.getType());
+    return create<cir::ComplexCreateOp>(loc, resultComplexTy, real, imag);
+  }
+
   /// Create a cir.ptr_stride operation to get access to an array element.
   /// \p idx is the index of the element to access, \p shouldDecay is true if
   /// the result should decay to a pointer to the element type.
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 1941b5066edb..afbe92aded80 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -499,7 +499,13 @@ void CIRGenFunction::emitExprAsInit(const Expr *init, const ValueDecl *d,
     emitScalarInit(init, getLoc(d->getSourceRange()), lvalue);
     return;
   case cir::TEK_Complex: {
-    cgm.errorNYI(init->getSourceRange(), "emitExprAsInit: complex type");
+    mlir::Value complex = emitComplexExpr(init);
+    if (capturedByInit)
+      cgm.errorNYI(init->getSourceRange(),
+                   "emitExprAsInit: complex type captured by init");
+    mlir::Location loc = getLoc(init->getExprLoc());
+    emitStoreOfComplex(loc, complex, lvalue,
+                       /*isInit*/ true);
     return;
   }
   case cir::TEK_Aggregate:
@@ -593,8 +599,8 @@ void CIRGenFunction::emitDecl(const Decl &d) {
     // None of these decls require codegen support.
     return;
 
-  case Decl::Enum:   // enum X;
-  case Decl::Record: // struct/union/class X;
+  case Decl::Enum:      // enum X;
+  case Decl::Record:    // struct/union/class X;
   case Decl::CXXRecord: // struct/union/class X; [C++]
   case Decl::NamespaceAlias:
   case Decl::Using:          // using X; [C++]
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 42d0c78013f5..2e43f10be132 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1690,3 +1690,14 @@ mlir::Value CIRGenFunction::emitScalarConstant(
   }
   return builder.getConstant(getLoc(e->getSourceRange()), constant.getValue());
 }
+
+/// An LValue is a candidate for having its loads and stores be made atomic if
+/// we are operating under /volatile:ms *and* the LValue itself is volatile and
+/// performing such an operation can be performed without a libcall.
+bool CIRGenFunction::isLValueSuitableForInlineAtomic(LValue lv) {
+  if (!cgm.getLangOpts().MSVolatile)
+    return false;
+
+  cgm.errorNYI("LValueSuitableForInlineAtomic LangOpts MSVolatile");
+  return false;
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
new file mode 100644
index 000000000000..2ffe75a388e9
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -0,0 +1,79 @@
+#include "CIRGenBuilder.h"
+#include "CIRGenFunction.h"
+
+#include "clang/AST/StmtVisitor.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+namespace {
+class ComplexExprEmitter : public StmtVisitor<ComplexExprEmitter, mlir::Value> {
+  CIRGenFunction &cgf;
+  CIRGenBuilderTy &builder;
+
+public:
+  explicit ComplexExprEmitter(CIRGenFunction &cgf)
+      : cgf(cgf), builder(cgf.getBuilder()) {}
+
+  /// Store the specified real/imag parts into the
+  /// specified value pointer.
+  void emitStoreOfComplex(mlir::Location loc, mlir::Value val, LValue lv,
+                          bool isInit);
+
+  mlir::Value VisitInitListExpr(InitListExpr *e);
+};
+
+} // namespace
+
+static const ComplexType *getComplexType(QualType type) {
+  type = type.getCanonicalType();
+  if (const ComplexType *comp = dyn_cast<ComplexType>(type))
+    return comp;
+  return cast<ComplexType>(cast<AtomicType>(type)->getValueType());
+}
+
+void ComplexExprEmitter::emitStoreOfComplex(mlir::Location loc, mlir::Value val,
+                                            LValue lv, bool isInit) {
+  if (lv.getType()->isAtomicType() ||
+      (!isInit && cgf.isLValueSuitableForInlineAtomic(lv))) {
+    cgf.cgm.errorNYI("StoreOfComplex with Atomic LV");
+    return;
+  }
+
+  const Address destAddr = lv.getAddress();
+  builder.createStore(loc, val, destAddr);
+}
+
+mlir::Value ComplexExprEmitter::VisitInitListExpr(InitListExpr *e) {
+  mlir::Location loc = cgf.getLoc(e->getExprLoc());
+  if (e->getNumInits() == 2) {
+    mlir::Value real = cgf.emitScalarExpr(e->getInit(0));
+    mlir::Value imag = cgf.emitScalarExpr(e->getInit(1));
+    return builder.createComplexCreate(loc, real, imag);
+  }
+
+  if (e->getNumInits() == 1) {
+    cgf.cgm.errorNYI("Create Complex with InitList with size 1");
+    return {};
+  }
+
+  assert(e->getNumInits() == 0 && "Unexpected number of inits");
+  QualType complexElemTy =
+      e->getType()->castAs<clang::ComplexType>()->getElementType();
+  mlir::Type complexElemLLVMTy = cgf.convertType(complexElemTy);
+  mlir::TypedAttr defaultValue = builder.getZeroInitAttr(complexElemLLVMTy);
+  auto complexAttr = cir::ConstComplexAttr::get(defaultValue, defaultValue);
+  return builder.create<cir::ConstantOp>(loc, complexAttr);
+}
+
+mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) {
+  assert(e && getComplexType(e->getType()) &&
+         "Invalid complex expression to emit");
+
+  return ComplexExprEmitter(*this).Visit(const_cast<Expr *>(e));
+}
+
+void CIRGenFunction::emitStoreOfComplex(mlir::Location loc, mlir::Value v,
+                                        LValue dest, bool isInit) {
+  ComplexExprEmitter(*this).emitStoreOfComplex(loc, v, dest, isInit);
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 318d3fbf3f9e..de6ef2a69faf 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -338,6 +338,8 @@ public:
     PrototypeWrapper(const clang::ObjCMethodDecl *md) : p(md) {}
   };
 
+  bool isLValueSuitableForInlineAtomic(LValue lv);
+
   /// An abstract representation of regular/ObjC call/message targets.
   class AbstractCallee {
     /// The function declaration of the callee.
@@ -860,6 +862,10 @@ public:
 
   mlir::LogicalResult emitForStmt(const clang::ForStmt &s);
 
+  /// Emit the computation of the specified expression of complex type,
+  /// returning the result.
+  mlir::Value emitComplexExpr(const Expr *e);
+
   void emitCompoundStmt(const clang::CompoundStmt &s);
 
   void emitCompoundStmtWithoutScope(const clang::CompoundStmt &s);
@@ -961,6 +967,9 @@ public:
 
   void emitStaticVarDecl(const VarDecl &d, cir::GlobalLinkageKind linkage);
 
+  void emitStoreOfComplex(mlir::Location loc, mlir::Value v, LValue dest,
+                          bool isInit);
+
   void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile,
                          clang::QualType ty, bool isInit = false,
                          bool isNontemporal = false);
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 217609687eab..385bea066c61 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -19,6 +19,7 @@ add_clang_library(clangCIR
   CIRGenDeclOpenACC.cpp
   CIRGenExpr.cpp
   CIRGenExprAggregate.cpp
+  CIRGenExprComplex.cpp
   CIRGenExprConstant.cpp
   CIRGenExprScalar.cpp
   CIRGenFunction.cpp
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index a685253b7d82..5578d4f5825a 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1748,6 +1748,33 @@ OpFoldResult cir::VecTernaryOp::fold(FoldAdaptor adaptor) {
       vecTy, mlir::ArrayAttr::get(getContext(), elements));
 }
 
+//===----------------------------------------------------------------------===//
+// ComplexCreateOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult cir::ComplexCreateOp::verify() {
+  if (getType().getElementType() != getReal().getType()) {
+    emitOpError()
+        << "operand type of cir.complex.create does not match its result type";
+    return failure();
+  }
+
+  return success();
+}
+
+OpFoldResult cir::ComplexCreateOp::fold(FoldAdaptor adaptor) {
+  mlir::Attribute real = adaptor.getReal();
+  mlir::Attribute imag = adaptor.getImag();
+  if (!real || !imag)
+    return {};
+
+  // When both of real and imag are constants, we can fold the operation into an
+  // `#cir.const_complex` operation.
+  auto realAttr = mlir::cast<mlir::TypedAttr>(real);
+  auto imagAttr = mlir::cast<mlir::TypedAttr>(imag);
+  return cir::ConstComplexAttr::get(realAttr, imagAttr);
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
index 6f8a64ce0251..20c634d6c66f 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
@@ -134,7 +134,6 @@ void CIRCanonicalizePass::runOnOperation() {
   getOperation()->walk([&](Operation *op) {
     assert(!cir::MissingFeatures::switchOp());
     assert(!cir::MissingFeatures::tryOp());
-    assert(!cir::MissingFeatures::complexCreateOp());
     assert(!cir::MissingFeatures::complexRealOp());
     assert(!cir::MissingFeatures::complexImagOp());
     assert(!cir::MissingFeatures::callOp());
@@ -142,8 +141,8 @@ void CIRCanonicalizePass::runOnOperation() {
     // Many operations are here to perform a manual `fold` in
     // applyOpPatternsGreedily.
     if (isa<BrOp, BrCondOp, CastOp, ScopeOp, SwitchOp, SelectOp, UnaryOp,
-            VecCreateOp, VecExtractOp, VecShuffleOp, VecShuffleDynamicOp,
-            VecTernaryOp>(op))
+            ComplexCreateOp, VecCreateOp, VecExtractOp, VecShuffleOp,
+            VecShuffleDynamicOp, VecTernaryOp>(op))
       ops.push_back(op);
   });
 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 619e113202c9..6a4e4e4a7df3 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -905,7 +905,32 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
     rewriter.replaceOp(op, lowerCirAttrAsValue(op, op.getValue(), rewriter,
                                                getTypeConverter()));
     return mlir::success();
-  } else {
+  } else if (auto complexTy = mlir::dyn_cast<cir::ComplexType>(op.getType())) {
+    auto complexAttr = mlir::cast<cir::ConstComplexAttr>(op.getValue());
+    mlir::Type complexElemTy = complexTy.getElementType();
+    mlir::Type complexElemLLVMTy = typeConverter->convertType(complexElemTy);
+
+    mlir::Attribute components[2];
+    if (mlir::isa<cir::IntType>(complexElemTy)) {
+      components[0] = rewriter.getIntegerAttr(
+          complexElemLLVMTy,
+          mlir::cast<cir::IntAttr>(complexAttr.getReal()).getValue());
+      components[1] = rewriter.getIntegerAttr(
+          complexElemLLVMTy,
+          mlir::cast<cir::IntAttr>(complexAttr.getImag()).getValue());
+    } else {
+      components[0] = rewriter.getFloatAttr(
+          complexElemLLVMTy,
+          mlir::cast<cir::FPAttr>(complexAttr.getReal()).getValue());
+      components[1] = rewriter.getFloatAttr(
+          complexElemLLVMTy,
+          mlir::cast<cir::FPAttr>(complexAttr.getImag()).getValue());
+    }
+
+    attr = rewriter.getArrayAttr(components);
+  }
+
+  else {
     return op.emitError() << "unsupported constant type " << op.getType();
   }
 
@@ -1810,7 +1835,8 @@ void ConvertCIRToLLVMPass::runOnOperation() {
                CIRToLLVMVecSplatOpLowering,
                CIRToLLVMVecShuffleOpLowering,
                CIRToLLVMVecShuffleDynamicOpLowering,
-               CIRToLLVMVecTernaryOpLowering
+               CIRToLLVMVecTernaryOpLowering,
+               CIRToLLVMComplexCreateOpLowering
       // clang-format on
       >(converter, patterns.getContext());
 
@@ -2096,6 +2122,24 @@ mlir::LogicalResult CIRToLLVMVecTernaryOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMComplexCreateOpLowering::matchAndRewrite(
+    cir::ComplexCreateOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  mlir::Type complexLLVMTy =
+      getTypeConverter()->convertType(op.getResult().getType());
+  auto initialComplex =
+      rewriter.create<mlir::LLVM::UndefOp>(op->getLoc(), complexLLVMTy);
+
+  auto realComplex = rewriter.create<mlir::LLVM::InsertValueOp>(
+      op->getLoc(), initialComplex, adaptor.getReal(), 0);
+
+  auto complex = rewriter.create<mlir::LLVM::InsertValueOp>(
+      op->getLoc(), realComplex, adaptor.getImag(), 1);
+
+  rewriter.replaceOp(op, complex);
+  return mlir::success();
+}
+
 std::unique_ptr<mlir::Pass> createConvertCIRToLLVMPass() {
   return std::make_unique<ConvertCIRToLLVMPass>();
 }
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index 2eda568c84bd..a80981806354 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -408,6 +408,16 @@ public:
                   mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMComplexCreateOpLowering
+    : public mlir::OpConversionPattern<cir::ComplexCreateOp> {
+public:
+  using mlir::OpConversionPattern<cir::ComplexCreateOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::ComplexCreateOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 } // namespace direct
 } // namespace cir
 
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index 6fa7bca3749c..d193b9f32efb 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -27,3 +27,152 @@ float _Complex cf2 = { 1.0f, 2.0f };
 // OGCG: {{.*}} = global { float, float } zeroinitializer, align 4
 // OGCG: {{.*}} = global { i32, i32 } { i32 1, i32 2 }, align 4
 // OGCG: {{.*}} = global { float, float } { float 1.000000e+00, float 2.000000e+00 }, align 4
+
+void foo() { int _Complex c = {}; }
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<0> : !s32i> : !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: store { i32, i32 } zeroinitializer, ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 0, ptr %[[C_IMAG_PTR]], align 4
+
+void foo2() { int _Complex c = {1, 2}; }
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s32i> : !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: store { i32, i32 } { i32 1, i32 2 }, ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 1, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 2, ptr %[[C_IMAG_PTR]], align 4
+
+void foo3() {
+  int a;
+  int b;
+  int _Complex c = {a, b};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!s32i>
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!s32i>
+// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[TMP_A]], %[[TMP_B]] : !s32i -> !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: %[[TMP_A:.*]] = load i32, ptr {{.*}}, align 4
+// LLVM: %[[TMP_B:.*]] = load i32, ptr {{.*}}, align 4
+// LLVM: %[[TMP:.*]] = insertvalue { i32, i32 } undef, i32 %[[TMP_A]], 0
+// LLVM: %[[TMP_2:.*]] = insertvalue { i32, i32 } %[[TMP]], i32 %[[TMP_B]], 1
+// LLVM: store { i32, i32 } %[[TMP_2]], ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[REAL_VAL:.*]] = load i32, ptr {{.*}}, align 4
+// OGCG: %[[IMAG_VAL:.*]] = load i32, ptr {{.*}}, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 %[[REAL_VAL]], ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 %[[IMAG_VAL]], ptr %[[C_IMAG_PTR]], align 4
+
+void foo4() {
+  int a;
+  int _Complex c = {1, a};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!s32i>, !s32i
+// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[CONST_1]], %[[TMP_A]] : !s32i -> !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: %[[TMP_A:.*]] = load i32, ptr {{.*}}, align 4
+// LLVM: %[[COMPLEX:.*]] = insertvalue { i32, i32 } { i32 1, i32 undef }, i32 %[[TMP_A]], 1
+// LLVM: store { i32, i32 } %[[COMPLEX]], ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[TMP_A:.*]] = load i32, ptr {{.*}}, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 1, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 %[[TMP_A]], ptr %[[C_IMAG_PTR]], align 4
+
+void foo5() {
+  float _Complex c = {1.0f, 2.0f};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["c", init]
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> : !cir.float> : !cir.complex<!cir.float>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+
+// LLVM: %[[INIT:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: store { float, float } { float 1.000000e+00, float 2.000000e+00 }, ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX]] = alloca { float, float }, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store float 1.000000e+00, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store float 2.000000e+00, ptr %[[C_IMAG_PTR]], align 4
+
+void foo6() {
+  float a;
+  float b;
+  float _Complex c = {a, b};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["c", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!cir.float>, !cir.float
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!cir.float>, !cir.float
+// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[TMP_A]], %[[TMP_B]] : !cir.float -> !cir.complex<!cir.float>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: %[[TMP_A:.*]] = load float, ptr {{.*}}, align 4
+// LLVM: %[[TMP_B:.*]] = load float, ptr {{.*}}, align 4
+// LLVM: %[[TMP:.*]] = insertvalue { float, float } undef, float %[[TMP_A]], 0
+// LLVM: %[[TMP_2:.*]] = insertvalue { float, float } %[[TMP]], float %[[TMP_B]], 1
+// LLVM: store { float, float } %[[TMP_2]], ptr %[[COMPLEX]], align 4
+
+// OGCG: %[[COMPLEX]] = alloca { float, float }, align 4
+// OGCG: %[[TMP_A:.*]] = load float, ptr {{.*}}, align 4
+// OGCG: %[[TMP_B:.*]] = load float, ptr {{.*}}, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store float %[[TMP_A]], ptr %[[C_REAL_PTR]], align 4
+// OGCG: store float %[[TMP_B]], ptr %[[C_IMAG_PTR]], align 4
+
+void foo7() {
+  float a;
+  float _Complex c = {a, 2.0f};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["c", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!cir.float>, !cir.float
+// CIR: %[[CONST_2F:.*]] = cir.const #cir.fp<2.000000e+00> : !cir.float
+// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[TMP_A]], %[[CONST_2F]] : !cir.float -> !cir.complex<!cir.float>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: %[[TMP_A:.*]] = load float, ptr {{.*}}, align 4
+// LLVM: %[[TMP:.*]] = insertvalue { float, float } undef, float %[[TMP_A]], 0
+// LLVM: %[[TMP_2:.*]] = insertvalue { float, float } %[[TMP]], float 2.000000e+00, 1
+// LLVM: store { float, float } %[[TMP_2]], ptr %[[COMPLEX]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { float, float }, align 4
+// OGCG: %[[TMP_A:.*]] = load float, ptr {{.*}}, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store float %[[TMP_A]], ptr %[[C_REAL_PTR]], align 4
+// OGCG: store float 2.000000e+00, ptr %[[C_IMAG_PTR]], align 4
+
diff --git a/clang/test/CIR/Transforms/complex-create-fold.cir b/clang/test/CIR/Transforms/complex-create-fold.cir
new file mode 100644
index 000000000000..5d9d22112c8b
--- /dev/null
+++ b/clang/test/CIR/Transforms/complex-create-fold.cir
@@ -0,0 +1,30 @@
+// RUN: cir-opt %s -cir-canonicalize -o - | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+module {
+  cir.func @fold_complex_create_test() -> !cir.complex<!s32i>  {
+    %0 = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["__retval"]
+    %1 = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+    %2 = cir.const #cir.int<1> : !s32i
+    %3 = cir.const #cir.int<2> : !s32i
+    %4 = cir.complex.create %2, %3 : !s32i -> !cir.complex<!s32i>
+    cir.store align(4) %4, %1 : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+    %5 = cir.load align(4) %1 : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
+    cir.store align(4) %5, %0 : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+    %6 = cir.load %0 : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
+    cir.return %6 : !cir.complex<!s32i>
+  }
+
+// CHECK: cir.func @fold_complex_create_test() -> !cir.complex<!s32i> {
+// CHECK:   %[[RET:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["__retval"]
+// CHECK:   %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CHECK:   %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s32i> : !cir.complex<!s32i>
+// CHECK:   cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+// CHECK:   %[[TMP:.*]] = cir.load{{.*}} %[[INIT]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
+// CHECK:   cir.store{{.*}} %[[TMP:.*]], %[[RET]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+// CHECK:   %[[TMP_2:.*]] = cir.load %[[RET]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
+// CHECK:   cir.return %[[TMP_2]] : !cir.complex<!s32i>
+// CHECK: }
+
+}

From 2cb32e29408a6c598072ea0f066a246957be69f9 Mon Sep 17 00:00:00 2001
From: Ross Kirsling <rkirsling@gmail.com>
Date: Sat, 14 Jun 2025 22:03:23 +0900
Subject: [PATCH 0379/1322] [Clang] Fix fix-it hint regression from #143460
 (#144069)

Following #143460, `:` began displaying as `colon` in the fix-it hint
for a `case` with a missing colon, as is visible in the description of
(the separate bug) #144052.

This PR simply reverts a line that didn't need to be changed.
---
 clang/lib/Parse/ParseStmt.cpp                  |  3 +--
 clang/test/FixIt/fixit-punctuator-spelling.cpp | 10 ++++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/FixIt/fixit-punctuator-spelling.cpp

diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index 434ea6844281..c0c9bbc2e15c 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -836,8 +836,7 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx,
 
       Diag(ExpectedLoc, diag::err_expected_after)
           << "'case'" << tok::colon
-          << FixItHint::CreateInsertion(ExpectedLoc,
-                                        tok::getTokenName(tok::colon));
+          << FixItHint::CreateInsertion(ExpectedLoc, ":");
 
       ColonLoc = ExpectedLoc;
     }
diff --git a/clang/test/FixIt/fixit-punctuator-spelling.cpp b/clang/test/FixIt/fixit-punctuator-spelling.cpp
new file mode 100644
index 000000000000..3cba0e7b6459
--- /dev/null
+++ b/clang/test/FixIt/fixit-punctuator-spelling.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+
+void f(int x) {
+  switch (x) {
+    case 1 // expected-error {{expected ':' after 'case'}}
+      break;
+  }
+}
+// CHECK: fix-it:"{{.*}}":{6:11-6:11}:":"

From 42595d34bda74e0d6e3b6ec0cf253875330f9c42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Sat, 14 Jun 2025 16:11:41 +0200
Subject: [PATCH 0380/1322] [llvm] [cmake] Use pkg-config to obtain libffi
 search hints (#144221)

Extend `FindFFI.cmake` to include the paths obtained from pkg-config
when searching for libffi. This is going to help systems where libffi is
installed in nonstandard directory such as Gentoo, saving us from having
to copy the paths from pkg-config to `FFI_*` variables explicitly. The
logic is inspired by `FindLibEdit.cmake`.
---
 llvm/cmake/modules/FindFFI.cmake | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/cmake/modules/FindFFI.cmake b/llvm/cmake/modules/FindFFI.cmake
index 8e67c5d8c6d1..b1f64522b268 100644
--- a/llvm/cmake/modules/FindFFI.cmake
+++ b/llvm/cmake/modules/FindFFI.cmake
@@ -23,7 +23,10 @@
 # Additionally, the following import target will be defined:
 # FFI::ffi
 
-find_path(FFI_INCLUDE_DIRS ffi.h PATHS ${FFI_INCLUDE_DIR})
+find_package(PkgConfig QUIET)
+pkg_check_modules(PC_LIBFFI QUIET libffi)
+
+find_path(FFI_INCLUDE_DIRS ffi.h PATHS ${FFI_INCLUDE_DIR} ${PC_LIBFFI_INCLUDE_DIRS})
 if( EXISTS "${FFI_INCLUDE_DIRS}/ffi.h" )
   set(FFI_HEADER ffi.h CACHE INTERNAL "")
   set(HAVE_FFI_H 1 CACHE INTERNAL "")
@@ -35,8 +38,8 @@ else()
   endif()
 endif()
 
-find_library(FFI_LIBRARIES NAMES ffi PATHS ${FFI_LIBRARY_DIR})
-find_library(FFI_STATIC_LIBRARIES NAMES libffi.a PATHS ${FFI_LIBRARY_DIR})
+find_library(FFI_LIBRARIES NAMES ffi PATHS ${FFI_LIBRARY_DIR} ${PC_LIBFFI_LIBRARY_DIRS})
+find_library(FFI_STATIC_LIBRARIES NAMES libffi.a PATHS ${FFI_LIBRARY_DIR} ${PC_LIBFFI_LIBRARY_DIRS})
 
 if(FFI_LIBRARIES)
   include(CMakePushCheckState)

From ff295d2f3429a5a2a93b2c86099af40544f467d4 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Sat, 14 Jun 2025 16:17:08 +0200
Subject: [PATCH 0381/1322] [OpenMP][clang] declare mapper: fix handling of
 nested types (#143504)

Fix a crash that happened during parsing of a "declare mapper" construct
for a struct that contains an element for which we also declared a
custom default mapper.
---
 clang/include/clang/Sema/SemaOpenMP.h         |  2 +-
 clang/lib/Parse/ParseOpenMP.cpp               | 12 ++++++---
 clang/test/OpenMP/declare_mapper_ast_print.c  | 25 ++++++++++++++++++
 .../test/OpenMP/declare_mapper_ast_print.cpp  | 26 +++++++++++++++++++
 4 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index be6bec206878..7b169f56b680 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -283,7 +283,7 @@ public:
   /// mapper' construct.
   QualType ActOnOpenMPDeclareMapperType(SourceLocation TyLoc,
                                         TypeResult ParsedType);
-  /// Called on start of '#pragma omp declare mapper'.
+  /// Called for '#pragma omp declare mapper'.
   DeclGroupPtrTy ActOnOpenMPDeclareMapperDirective(
       Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType,
       SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS,
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index def1a52ba7d4..78d3503d8eb6 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -576,6 +576,7 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) {
     return DeclGroupPtrTy();
   }
 
+  Scope *OuterScope = getCurScope();
   // Enter scope.
   DeclarationNameInfo DirName;
   SourceLocation Loc = Tok.getLocation();
@@ -614,12 +615,17 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) {
     IsCorrect = false;
   }
 
+  // This needs to be called within the scope because
+  // processImplicitMapsWithDefaultMappers may add clauses when analyzing nested
+  // types. The scope used for calling ActOnOpenMPDeclareMapperDirective,
+  // however, needs to be the outer one, otherwise declared mappers don't become
+  // visible.
+  DeclGroupPtrTy DG = Actions.OpenMP().ActOnOpenMPDeclareMapperDirective(
+      OuterScope, Actions.getCurLexicalContext(), MapperId, MapperType,
+      Range.getBegin(), VName, AS, MapperVarRef.get(), Clauses);
   // Exit scope.
   Actions.OpenMP().EndOpenMPDSABlock(nullptr);
   OMPDirectiveScope.Exit();
-  DeclGroupPtrTy DG = Actions.OpenMP().ActOnOpenMPDeclareMapperDirective(
-      getCurScope(), Actions.getCurLexicalContext(), MapperId, MapperType,
-      Range.getBegin(), VName, AS, MapperVarRef.get(), Clauses);
   if (!IsCorrect)
     return DeclGroupPtrTy();
 
diff --git a/clang/test/OpenMP/declare_mapper_ast_print.c b/clang/test/OpenMP/declare_mapper_ast_print.c
index 3c554a106fe4..bb83f23a0c18 100644
--- a/clang/test/OpenMP/declare_mapper_ast_print.c
+++ b/clang/test/OpenMP/declare_mapper_ast_print.c
@@ -49,6 +49,23 @@ struct dat {
 #pragma omp declare mapper(struct dat d) map(to: d.d)
 // CHECK: #pragma omp declare mapper (default : struct dat d) map(to: d.d){{$}}
 
+// Verify that nested default mappers do not lead to a crash during parsing / sema.
+// CHECK: struct inner {
+struct inner {
+  int size;
+  int *data;
+};
+#pragma omp declare mapper(struct inner i) map(i, i.data[0 : i.size])
+// CHECK: #pragma omp declare mapper (default : struct inner i) map(tofrom: default::i,i.data[0:i.size]){{$}}
+
+// CHECK: struct outer {
+struct outer {
+  int a;
+  struct inner i;
+};
+#pragma omp declare mapper(struct outer o) map(o)
+// CHECK: #pragma omp declare mapper (default : struct outer o) map(tofrom: default::o) map(tofrom: o.i){{$}}
+
 // CHECK: int main(void) {
 int main(void) {
 #pragma omp declare mapper(id: struct vec v) map(v.len)
@@ -77,6 +94,14 @@ int main(void) {
 #pragma omp declare mapper(id1: struct vec vvec) map(iterator(it=0:vvec.len:2), tofrom:vvec.data[it])
 // OMP52: #pragma omp declare mapper (id1 : struct vec vvec) map(iterator(int it = 0:vvec.len:2),tofrom: vvec.data[it]);
 #endif
+
+  {
+    struct outer outer;
+#pragma omp target map(outer)
+// CHECK: #pragma omp target map(tofrom: outer)
+    { }
+  }
+
   return 0;
 }
 // CHECK: }
diff --git a/clang/test/OpenMP/declare_mapper_ast_print.cpp b/clang/test/OpenMP/declare_mapper_ast_print.cpp
index 422fa9981672..9ca3412e3e3d 100644
--- a/clang/test/OpenMP/declare_mapper_ast_print.cpp
+++ b/clang/test/OpenMP/declare_mapper_ast_print.cpp
@@ -34,6 +34,28 @@ public:
 // CHECK: }
 // CHECK: ;
 
+// Verify that nested default mappers do not lead to a crash during parsing / sema.
+// CHECK: namespace N2 {
+namespace N2
+{
+// CHECK: struct inner {
+struct inner {
+  int size;
+  int *data;
+};
+#pragma omp declare mapper(struct inner i) map(i, i.data[0 : i.size])
+// CHECK: #pragma omp declare mapper (default : struct inner i) map(tofrom: N2::default::i,i.data[0:i.size]){{$}}
+
+// CHECK: struct outer {
+struct outer {
+  int a;
+  struct inner i;
+};
+#pragma omp declare mapper(struct outer o) map(o)
+// CHECK: #pragma omp declare mapper (default : struct outer o) map(tofrom: N2::default::o) map(tofrom: o.i){{$}}
+} // namespace N2
+// CHECK: }
+
 template <class T>
 class dat {
 public:
@@ -122,6 +144,7 @@ T foo(T a) {
 int main() {
   N1::vec vv, vvv;
   N1::vecchild vc;
+  N2::outer outer;
   dat<double> dd;
 #pragma omp target map(mapper(N1::id) tofrom: vv) map(mapper(dat<double>::id) alloc: vvv)
 // CHECK: #pragma omp target map(mapper(N1::id),tofrom: vv) map(mapper(dat<double>::id),alloc: vvv)
@@ -132,6 +155,9 @@ int main() {
 #pragma omp target map(mapper(default) tofrom: dd)
 // CHECK: #pragma omp target map(mapper(default),tofrom: dd)
   { dd.d++; }
+#pragma omp target map(outer)
+// CHECK: #pragma omp target map(tofrom: outer)
+  { }
 
 #pragma omp target update to(mapper(N1::id) : vc)
 // CHECK: #pragma omp target update to(mapper(N1::id): vc)

From 10bc17fc3676b82c7240046a948d2925dd2045d3 Mon Sep 17 00:00:00 2001
From: Tom Vijlbrief <tvijlbrief@gmail.com>
Date: Sat, 14 Jun 2025 17:10:04 +0200
Subject: [PATCH 0382/1322] [AVR] Add support for many new AVR MCUs (#143914)

fixes https://github.com/llvm/llvm-project/issues/116116
---
 clang/lib/Basic/Targets/AVR.cpp     | 69 ++++++++++++++++++++++++++
 clang/lib/Driver/ToolChains/AVR.cpp | 70 ++++++++++++++++++++++++++
 llvm/lib/Target/AVR/AVRDevices.td   | 76 +++++++++++++++++++++++++++++
 3 files changed, 215 insertions(+)

diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
index 85ca4bc30c46..bbe7b01ca036 100644
--- a/clang/lib/Basic/Targets/AVR.cpp
+++ b/clang/lib/Basic/Targets/AVR.cpp
@@ -336,6 +336,9 @@ static MCUInfo AVRMcus[] = {
     {"attiny1624", "__AVR_ATtiny1624__", "103", 1},
     {"attiny1626", "__AVR_ATtiny1626__", "103", 1},
     {"attiny1627", "__AVR_ATtiny1627__", "103", 1},
+    {"attiny3224", "__AVR_ATtiny3224__", "103", 1},
+    {"attiny3226", "__AVR_ATtiny3226__", "103", 1},
+    {"attiny3227", "__AVR_ATtiny3227__", "103", 1},
     {"atmega808", "__AVR_ATmega808__", "103", 1},
     {"atmega809", "__AVR_ATmega809__", "103", 1},
     {"atmega1608", "__AVR_ATmega1608__", "103", 1},
@@ -344,6 +347,72 @@ static MCUInfo AVRMcus[] = {
     {"atmega3209", "__AVR_ATmega3209__", "103", 1},
     {"atmega4808", "__AVR_ATmega4808__", "103", 1},
     {"atmega4809", "__AVR_ATmega4809__", "103", 1},
+
+    // gcc 14 additions:
+
+    {"avr64da28", "__AVR_AVR64DA28__", "102", 1},
+    {"avr64da32", "__AVR_AVR64DA32__", "102", 1},
+    {"avr64da48", "__AVR_AVR64DA48__", "102", 1},
+    {"avr64da64", "__AVR_AVR64DA64__", "102", 1},
+    {"avr64db28", "__AVR_AVR64DB28__", "102", 1},
+    {"avr64db32", "__AVR_AVR64DB32__", "102", 1},
+    {"avr64db48", "__AVR_AVR64DB48__", "102", 1},
+    {"avr64db64", "__AVR_AVR64DB64__", "102", 1},
+    {"avr64dd14", "__AVR_AVR64DD14__", "102", 1},
+    {"avr64dd20", "__AVR_AVR64DD20__", "102", 1},
+    {"avr64dd28", "__AVR_AVR64DD28__", "102", 1},
+    {"avr64dd32", "__AVR_AVR64DD32__", "102", 1},
+    {"avr64du28", "__AVR_AVR64DU28__", "102", 1},
+    {"avr64du32", "__AVR_AVR64DU32__", "102", 1},
+    {"avr64ea28", "__AVR_AVR64EA28__", "102", 1},
+    {"avr64ea32", "__AVR_AVR64EA32__", "102", 1},
+    {"avr64ea48", "__AVR_AVR64EA48__", "102", 1},
+    {"avr64sd28", "__AVR_AVR64SD28__", "102", 1},
+    {"avr64sd32", "__AVR_AVR64SD32__", "102", 1},
+    {"avr64sd48", "__AVR_AVR64SD48__", "102", 1},
+
+    {"avr16dd20", "__AVR_AVR16DD20__", "103", 1},
+    {"avr16dd28", "__AVR_AVR16DD28__", "103", 1},
+    {"avr16dd32", "__AVR_AVR16DD32__", "103", 1},
+    {"avr16du14", "__AVR_AVR16DU14__", "103", 1},
+    {"avr16du20", "__AVR_AVR16DU20__", "103", 1},
+    {"avr16du28", "__AVR_AVR16DU28__", "103", 1},
+    {"avr16du32", "__AVR_AVR16DU32__", "103", 1},
+    {"avr32da28", "__AVR_AVR32DA28__", "103", 1},
+    {"avr32da32", "__AVR_AVR32DA32__", "103", 1},
+    {"avr32da48", "__AVR_AVR32DA48__", "103", 1},
+    {"avr32db28", "__AVR_AVR32DB28__", "103", 1},
+    {"avr32db32", "__AVR_AVR32DB32__", "103", 1},
+    {"avr32db48", "__AVR_AVR32DB48__", "103", 1},
+    {"avr32dd14", "__AVR_AVR32DD14__", "103", 1},
+    {"avr32dd20", "__AVR_AVR32DD20__", "103", 1},
+    {"avr32dd28", "__AVR_AVR32DD28__", "103", 1},
+    {"avr32dd32", "__AVR_AVR32DD32__", "103", 1},
+    {"avr32du14", "__AVR_AVR32DU14__", "103", 1},
+    {"avr32du20", "__AVR_AVR32DU20__", "103", 1},
+    {"avr32du28", "__AVR_AVR32DU28__", "103", 1},
+    {"avr32du32", "__AVR_AVR32DU32__", "103", 1},
+    {"avr16eb14", "__AVR_AVR16EB14__", "103", 1},
+    {"avr16eb20", "__AVR_AVR16EB20__", "103", 1},
+    {"avr16eb28", "__AVR_AVR16EB28__", "103", 1},
+    {"avr16eb32", "__AVR_AVR16EB32__", "103", 1},
+    {"avr16ea28", "__AVR_AVR16EA28__", "103", 1},
+    {"avr16ea32", "__AVR_AVR16EA32__", "103", 1},
+    {"avr16ea48", "__AVR_AVR16EA48__", "103", 1},
+    {"avr32ea28", "__AVR_AVR32EA28__", "103", 1},
+    {"avr32ea32", "__AVR_AVR32EA32__", "103", 1},
+    {"avr32ea48", "__AVR_AVR32EA48__", "103", 1},
+    {"avr32sd20", "__AVR_AVR32SD20__", "103", 1},
+    {"avr32sd28", "__AVR_AVR32SD28__", "103", 1},
+    {"avr32sd32", "__AVR_AVR32SD32__", "103", 1},
+    {"avr128da28", "__AVR_AVR128DA28__", "104", 2},
+    {"avr128da32", "__AVR_AVR128DA32__", "104", 2},
+    {"avr128da48", "__AVR_AVR128DA48__", "104", 2},
+    {"avr128da64", "__AVR_AVR128DA64__", "104", 2},
+    {"avr128db28", "__AVR_AVR128DB28__", "104", 2},
+    {"avr128db32", "__AVR_AVR128DB32__", "104", 2},
+    {"avr128db48", "__AVR_AVR128DB48__", "104", 2},
+    {"avr128db64", "__AVR_AVR128DB64__", "104", 2},
 };
 
 } // namespace targets
diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
index b0523a7f4e40..731076d9754a 100644
--- a/clang/lib/Driver/ToolChains/AVR.cpp
+++ b/clang/lib/Driver/ToolChains/AVR.cpp
@@ -326,8 +326,78 @@ constexpr struct {
     {"attiny1624", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1626", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1627", "avrxmega3", "avrxmega3", 0x803800},
+    {"attiny3224", "avrxmega3", "avrxmega3", 0x803400},
+    {"attiny3226", "avrxmega3", "avrxmega3", 0x803400},
+    {"attiny3227", "avrxmega3", "avrxmega3", 0x803400},
     {"attiny3216", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny3217", "avrxmega3", "avrxmega3", 0x803800},
+
+    // gcc 14 additions:
+
+    {"avr64da28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da48", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da64", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db48", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db64", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd14", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd20", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64du28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64du32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64ea28", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64ea32", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64ea48", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64sd28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64sd32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64sd48", "avrxmega2", "avrxmega2", 0x806000},
+
+    {"avr16dd20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16dd28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16dd32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du14", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr32da28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32da32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32da48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd14", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du14", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr16eb14", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea48", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr32ea28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32ea32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32ea48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr128da28", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da32", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da48", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da64", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db28", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db32", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db48", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db64", "avrxmega4", "avrxmega4", 0x804000},
+
 };
 
 std::string GetMCUSubPath(StringRef MCUName) {
diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td
index 56147bb473bc..efe78391f731 100644
--- a/llvm/lib/Target/AVR/AVRDevices.td
+++ b/llvm/lib/Target/AVR/AVRDevices.td
@@ -215,6 +215,13 @@ def FamilyXMEGA3 : Family<"xmega3",
                            FeatureMultiplication, FeatureMOVW, FeatureLPMX,
                            FeatureBREAK, FeatureLowByteFirst]>;
 
+def FamilyXMEGA4 : Family<"xmega4",
+                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
+                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
+                           FeatureMultiplication, FeatureMOVW, FeatureLPMX,
+                           FeatureELPM,
+                           FeatureBREAK, FeatureLowByteFirst]>;
+
 def FamilyXMEGA : Family<"xmega",
                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
@@ -567,6 +574,9 @@ def : Device<"attiny3217", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1624", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1626", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1627", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3224", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3226", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3227", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega809", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega1608", FamilyXMEGA3, ELFArchXMEGA3>;
@@ -575,3 +585,69 @@ def : Device<"atmega3208", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega3209", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4809", FamilyXMEGA3, ELFArchXMEGA3>;
+
+// Additions from gcc 14:
+
+def : Device<"avr64da28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da64", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db64", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd14", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd20", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64du28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64du32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd48", FamilyXMEGA2, ELFArchXMEGA2>;
+
+def : Device<"avr16dd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16dd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16dd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr128da28", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da32", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da48", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da64", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db28", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db32", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db48", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db64", FamilyXMEGA4, ELFArchXMEGA4>;

From 62d8e001dac4b1a68f5b33c8784adba1335003f4 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 14 Jun 2025 08:18:50 -0700
Subject: [PATCH 0383/1322] Revert "[AVR] Add support for many new AVR MCUs
 (#143914)"

This reverts commit 10bc17fc3676b82c7240046a948d2925dd2045d3.

Multiple buildbot failures have been reported:
https://github.com/llvm/llvm-project/pull/143914
---
 clang/lib/Basic/Targets/AVR.cpp     | 69 --------------------------
 clang/lib/Driver/ToolChains/AVR.cpp | 70 --------------------------
 llvm/lib/Target/AVR/AVRDevices.td   | 76 -----------------------------
 3 files changed, 215 deletions(-)

diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
index bbe7b01ca036..85ca4bc30c46 100644
--- a/clang/lib/Basic/Targets/AVR.cpp
+++ b/clang/lib/Basic/Targets/AVR.cpp
@@ -336,9 +336,6 @@ static MCUInfo AVRMcus[] = {
     {"attiny1624", "__AVR_ATtiny1624__", "103", 1},
     {"attiny1626", "__AVR_ATtiny1626__", "103", 1},
     {"attiny1627", "__AVR_ATtiny1627__", "103", 1},
-    {"attiny3224", "__AVR_ATtiny3224__", "103", 1},
-    {"attiny3226", "__AVR_ATtiny3226__", "103", 1},
-    {"attiny3227", "__AVR_ATtiny3227__", "103", 1},
     {"atmega808", "__AVR_ATmega808__", "103", 1},
     {"atmega809", "__AVR_ATmega809__", "103", 1},
     {"atmega1608", "__AVR_ATmega1608__", "103", 1},
@@ -347,72 +344,6 @@ static MCUInfo AVRMcus[] = {
     {"atmega3209", "__AVR_ATmega3209__", "103", 1},
     {"atmega4808", "__AVR_ATmega4808__", "103", 1},
     {"atmega4809", "__AVR_ATmega4809__", "103", 1},
-
-    // gcc 14 additions:
-
-    {"avr64da28", "__AVR_AVR64DA28__", "102", 1},
-    {"avr64da32", "__AVR_AVR64DA32__", "102", 1},
-    {"avr64da48", "__AVR_AVR64DA48__", "102", 1},
-    {"avr64da64", "__AVR_AVR64DA64__", "102", 1},
-    {"avr64db28", "__AVR_AVR64DB28__", "102", 1},
-    {"avr64db32", "__AVR_AVR64DB32__", "102", 1},
-    {"avr64db48", "__AVR_AVR64DB48__", "102", 1},
-    {"avr64db64", "__AVR_AVR64DB64__", "102", 1},
-    {"avr64dd14", "__AVR_AVR64DD14__", "102", 1},
-    {"avr64dd20", "__AVR_AVR64DD20__", "102", 1},
-    {"avr64dd28", "__AVR_AVR64DD28__", "102", 1},
-    {"avr64dd32", "__AVR_AVR64DD32__", "102", 1},
-    {"avr64du28", "__AVR_AVR64DU28__", "102", 1},
-    {"avr64du32", "__AVR_AVR64DU32__", "102", 1},
-    {"avr64ea28", "__AVR_AVR64EA28__", "102", 1},
-    {"avr64ea32", "__AVR_AVR64EA32__", "102", 1},
-    {"avr64ea48", "__AVR_AVR64EA48__", "102", 1},
-    {"avr64sd28", "__AVR_AVR64SD28__", "102", 1},
-    {"avr64sd32", "__AVR_AVR64SD32__", "102", 1},
-    {"avr64sd48", "__AVR_AVR64SD48__", "102", 1},
-
-    {"avr16dd20", "__AVR_AVR16DD20__", "103", 1},
-    {"avr16dd28", "__AVR_AVR16DD28__", "103", 1},
-    {"avr16dd32", "__AVR_AVR16DD32__", "103", 1},
-    {"avr16du14", "__AVR_AVR16DU14__", "103", 1},
-    {"avr16du20", "__AVR_AVR16DU20__", "103", 1},
-    {"avr16du28", "__AVR_AVR16DU28__", "103", 1},
-    {"avr16du32", "__AVR_AVR16DU32__", "103", 1},
-    {"avr32da28", "__AVR_AVR32DA28__", "103", 1},
-    {"avr32da32", "__AVR_AVR32DA32__", "103", 1},
-    {"avr32da48", "__AVR_AVR32DA48__", "103", 1},
-    {"avr32db28", "__AVR_AVR32DB28__", "103", 1},
-    {"avr32db32", "__AVR_AVR32DB32__", "103", 1},
-    {"avr32db48", "__AVR_AVR32DB48__", "103", 1},
-    {"avr32dd14", "__AVR_AVR32DD14__", "103", 1},
-    {"avr32dd20", "__AVR_AVR32DD20__", "103", 1},
-    {"avr32dd28", "__AVR_AVR32DD28__", "103", 1},
-    {"avr32dd32", "__AVR_AVR32DD32__", "103", 1},
-    {"avr32du14", "__AVR_AVR32DU14__", "103", 1},
-    {"avr32du20", "__AVR_AVR32DU20__", "103", 1},
-    {"avr32du28", "__AVR_AVR32DU28__", "103", 1},
-    {"avr32du32", "__AVR_AVR32DU32__", "103", 1},
-    {"avr16eb14", "__AVR_AVR16EB14__", "103", 1},
-    {"avr16eb20", "__AVR_AVR16EB20__", "103", 1},
-    {"avr16eb28", "__AVR_AVR16EB28__", "103", 1},
-    {"avr16eb32", "__AVR_AVR16EB32__", "103", 1},
-    {"avr16ea28", "__AVR_AVR16EA28__", "103", 1},
-    {"avr16ea32", "__AVR_AVR16EA32__", "103", 1},
-    {"avr16ea48", "__AVR_AVR16EA48__", "103", 1},
-    {"avr32ea28", "__AVR_AVR32EA28__", "103", 1},
-    {"avr32ea32", "__AVR_AVR32EA32__", "103", 1},
-    {"avr32ea48", "__AVR_AVR32EA48__", "103", 1},
-    {"avr32sd20", "__AVR_AVR32SD20__", "103", 1},
-    {"avr32sd28", "__AVR_AVR32SD28__", "103", 1},
-    {"avr32sd32", "__AVR_AVR32SD32__", "103", 1},
-    {"avr128da28", "__AVR_AVR128DA28__", "104", 2},
-    {"avr128da32", "__AVR_AVR128DA32__", "104", 2},
-    {"avr128da48", "__AVR_AVR128DA48__", "104", 2},
-    {"avr128da64", "__AVR_AVR128DA64__", "104", 2},
-    {"avr128db28", "__AVR_AVR128DB28__", "104", 2},
-    {"avr128db32", "__AVR_AVR128DB32__", "104", 2},
-    {"avr128db48", "__AVR_AVR128DB48__", "104", 2},
-    {"avr128db64", "__AVR_AVR128DB64__", "104", 2},
 };
 
 } // namespace targets
diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
index 731076d9754a..b0523a7f4e40 100644
--- a/clang/lib/Driver/ToolChains/AVR.cpp
+++ b/clang/lib/Driver/ToolChains/AVR.cpp
@@ -326,78 +326,8 @@ constexpr struct {
     {"attiny1624", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1626", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1627", "avrxmega3", "avrxmega3", 0x803800},
-    {"attiny3224", "avrxmega3", "avrxmega3", 0x803400},
-    {"attiny3226", "avrxmega3", "avrxmega3", 0x803400},
-    {"attiny3227", "avrxmega3", "avrxmega3", 0x803400},
     {"attiny3216", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny3217", "avrxmega3", "avrxmega3", 0x803800},
-
-    // gcc 14 additions:
-
-    {"avr64da28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64da32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64da48", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64da64", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64db28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64db32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64db48", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64db64", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64dd14", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64dd20", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64dd28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64dd32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64du28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64du32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64ea28", "avrxmega2", "avrxmega2", 0x806800},
-    {"avr64ea32", "avrxmega2", "avrxmega2", 0x806800},
-    {"avr64ea48", "avrxmega2", "avrxmega2", 0x806800},
-    {"avr64sd28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64sd32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64sd48", "avrxmega2", "avrxmega2", 0x806000},
-
-    {"avr16dd20", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16dd28", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16dd32", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16du14", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16du20", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16du28", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16du32", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr32da28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32da32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32da48", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32db28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32db32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32db48", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32dd14", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32dd20", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32dd28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32dd32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32du14", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32du20", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32du28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32du32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr16eb14", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16eb20", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16eb28", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16eb32", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16ea28", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16ea32", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16ea48", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr32ea28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32ea32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32ea48", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32sd20", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32sd28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32sd32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr128da28", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128da32", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128da48", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128da64", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128db28", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128db32", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128db48", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128db64", "avrxmega4", "avrxmega4", 0x804000},
-
 };
 
 std::string GetMCUSubPath(StringRef MCUName) {
diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td
index efe78391f731..56147bb473bc 100644
--- a/llvm/lib/Target/AVR/AVRDevices.td
+++ b/llvm/lib/Target/AVR/AVRDevices.td
@@ -215,13 +215,6 @@ def FamilyXMEGA3 : Family<"xmega3",
                            FeatureMultiplication, FeatureMOVW, FeatureLPMX,
                            FeatureBREAK, FeatureLowByteFirst]>;
 
-def FamilyXMEGA4 : Family<"xmega4",
-                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
-                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
-                           FeatureMultiplication, FeatureMOVW, FeatureLPMX,
-                           FeatureELPM,
-                           FeatureBREAK, FeatureLowByteFirst]>;
-
 def FamilyXMEGA : Family<"xmega",
                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
@@ -574,9 +567,6 @@ def : Device<"attiny3217", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1624", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1626", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1627", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"attiny3224", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"attiny3226", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"attiny3227", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega809", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega1608", FamilyXMEGA3, ELFArchXMEGA3>;
@@ -585,69 +575,3 @@ def : Device<"atmega3208", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega3209", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4809", FamilyXMEGA3, ELFArchXMEGA3>;
-
-// Additions from gcc 14:
-
-def : Device<"avr64da28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64da32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64da48", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64da64", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64db28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64db32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64db48", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64db64", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64dd14", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64dd20", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64dd28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64dd32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64du28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64du32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64ea28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64ea32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64ea48", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64sd28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64sd32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64sd48", FamilyXMEGA2, ELFArchXMEGA2>;
-
-def : Device<"avr16dd20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16dd28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16dd32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16du14", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16du20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16du28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16du32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32da28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32da32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32da48", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32db28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32db32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32db48", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32dd14", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32dd20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32dd28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32dd32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32du14", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32du20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32du28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32du32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16eb14", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16eb20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16eb28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16eb32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16ea28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16ea32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16ea48", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32ea28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32ea32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32ea48", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32sd20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32sd28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32sd32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr128da28", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128da32", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128da48", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128da64", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128db28", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128db32", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128db48", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128db64", FamilyXMEGA4, ELFArchXMEGA4>;

From 72f99b75afc12bb15a7730544339bcc1ca11e8ee Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 14 Jun 2025 16:48:44 +0100
Subject: [PATCH 0384/1322] [LV] Add test case with branch weights.

Add test case with branch weights where the vector loop can
be removed. Exposed a crash with db8d34db26e9
(https://github.com/llvm/llvm-project/pull/143035).
---
 ...oop-backedge-elimination-branch-weights.ll | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll

diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll
new file mode 100644
index 000000000000..d5acf5c38f76
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF8UF1 %s
+; RUN: opt -p loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF8UF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF16UF1 %s
+
+; Check if the vector loop condition can be simplified to true for a given
+; VF/IC combination.
+define void @test_tc_between_8_and_17(ptr %A, i64 range(i64 8, 17) %N) {
+; VF8UF1-LABEL: define void @test_tc_between_8_and_17(
+; VF8UF1-SAME: ptr [[A:%.*]], i64 range(i64 8, 17) [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF1-NEXT:    [[TMP2:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF1-NEXT:    store <8 x i8> [[TMP2]], ptr [[TMP1]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF8UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @test_tc_between_8_and_17(
+; VF8UF2-SAME: ptr [[A:%.*]], i64 range(i64 8, 17) [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF8UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i32 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[A]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[TMP3:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i32 0
+; VF8UF2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[TMP3]], ptr [[TMP5]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF2-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF8UF2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @test_tc_between_8_and_17(
+; VF16UF1-SAME: ptr [[A:%.*]], i64 range(i64 8, 17) [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF16UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF16UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[TMP2]], ptr [[TMP3]], align 1
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF16UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF16UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ]
+  %p.src.next = getelementptr inbounds i8, ptr %p.src, i64 1
+  %l = load i8, ptr %p.src, align 1
+  %add = add nsw i8 %l, 10
+  store i8 %add, ptr %p.src
+  %iv.next = add nsw i64 %iv, 1
+  %cmp = icmp eq i64 %iv.next, %N
+  br i1 %cmp, label %exit, label %loop, !prof !0
+
+exit:
+  ret void
+}
+
+!0 = !{!"branch_weights", !"expected", i32 1, i32 2000}

From 577199f9221ebc805a69372a2b19f4c8ebaf1daf Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 14 Jun 2025 17:18:36 +0100
Subject: [PATCH 0385/1322] Reapply "[VPlan] Set branch weight metadata on
 middle term in VPlan (NFC) (#143035)"

This reverts commit 0604dc199c019b23746f4a54885ba0c75569cdae.

The recommitted version addresses post-commit comments and adjusts the
place the branch weights are added. It now runs before VPlans are optimized
for VF and UF, which may remove the vector loop region, causing a crash
trying to get the middle block after that. Test case added in
72f99b75afc12bb.

Original message:
Manage branch weights for the BranchOnCond in the middle block in VPlan.
This requires updating VPInstruction to inherit from VPIRMetadata, which
in general makes sense as there are a number of opcodes that could take
metadata.

There are other branches (part of the skeleton) that also need branch
weights adding.

PR: https://github.com/llvm/llvm-project/pull/143035
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 24 ++-------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 54 ++++++++++---------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  6 ++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 23 ++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |  4 ++
 5 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7c006ae326ec..9b5ad1658953 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7300,6 +7300,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
                            OrigLoop->getHeader()->getContext());
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
+  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+    VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
+                             BestVPlan, BestVF);
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
   VPlanTransforms::narrowInterleaveGroups(
@@ -7309,11 +7312,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan,
                                             *Legal->getWidestInductionType());
-  // Retrieve and store the middle block before dissolving regions. Regions are
-  // dissolved after optimizing for VF and UF, which completely removes unneeded
-  // loop regions first.
-  VPBasicBlock *MiddleVPBB =
-      BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
+  // Regions are dissolved after optimizing for VF and UF, which completely
+  // removes unneeded loop regions first.
   VPlanTransforms::dissolveLoopRegions(BestVPlan);
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7456,20 +7456,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   ILV.printDebugTracesAtEnd();
 
-  // 4. Adjust branch weight of the branch in the middle block.
-  if (HeaderVPBB) {
-    auto *MiddleTerm =
-        cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
-    if (MiddleTerm->isConditional() &&
-        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-      // Assume that `Count % VectorTripCount` is equally distributed.
-      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-      assert(TripCount > 0 && "trip count should not be zero");
-      const uint32_t Weights[] = {1, TripCount - 1};
-      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
-    }
-  }
-
   return ExpandedSCEVs;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 53619b39219e..5a3c4a514a5d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -882,11 +882,40 @@ protected:
   unsigned getUnrollPart(VPUser &U) const;
 };
 
+/// Helper to manage IR metadata for recipes. It filters out metadata that
+/// cannot be propagated.
+class VPIRMetadata {
+  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
+
+public:
+  VPIRMetadata() {}
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I.
+  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
+  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
+
+  /// Copy constructor for cloning.
+  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
+
+  /// Add all metadata to \p I.
+  void applyMetadata(Instruction &I) const;
+
+  /// Add metadata with kind \p Kind and \p Node.
+  void addMetadata(unsigned Kind, MDNode *Node) {
+    Metadata.emplace_back(Kind, Node);
+  }
+};
+
 /// This is a concrete Recipe that models a single VPlan-level instruction.
 /// While as any Recipe it may generate a sequence of IR instructions when
 /// executed, these instructions would always form a single-def expression as
 /// the VPInstruction is also a single def-use vertex.
 class VPInstruction : public VPRecipeWithIRFlags,
+                      public VPIRMetadata,
                       public VPUnrollPartAccessor<1> {
   friend class VPlanSlp;
 
@@ -976,7 +1005,7 @@ public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
                 const Twine &Name = "")
       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
-        Opcode(Opcode), Name(Name.str()) {}
+        VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                 const VPIRFlags &Flags, DebugLoc DL = {},
@@ -1268,29 +1297,6 @@ protected:
   const VPRecipeBase *getAsRecipe() const override { return this; }
 };
 
-/// Helper to manage IR metadata for recipes. It filters out metadata that
-/// cannot be propagated.
-class VPIRMetadata {
-  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
-
-public:
-  VPIRMetadata() {}
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I.
-  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
-  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
-
-  /// Copy constructor for cloning.
-  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
-
-  /// Add all metadata to \p I.
-  void applyMetadata(Instruction &I) const;
-};
-
 /// VPWidenRecipe is a recipe for producing a widened instruction using the
 /// opcode and operands of the recipe. This recipe covers most of the
 /// traditional vectorization cases where each recipe transforms into a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c64bda167b85..3bdfa6724f69 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -409,7 +409,7 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                              const VPIRFlags &Flags, DebugLoc DL,
                              const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      Opcode(Opcode), Name(Name.str()) {
+      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
 }
@@ -590,7 +590,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case VPInstruction::BranchOnCond: {
     Value *Cond = State.get(getOperand(0), VPLane(0));
-    return createCondBranch(Cond, getParent(), State);
+    auto *Br = createCondBranch(Cond, getParent(), State);
+    applyMetadata(*Br);
+    return Br;
   }
   case VPInstruction::BranchOnCount: {
     // First create the compare.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dc3c7bfe5cd1..44a72755b9cf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/TypeSize.h"
@@ -3203,3 +3204,25 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   removeDeadRecipes(Plan);
 }
+
+/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
+/// BranchOnCond recipe.
+void VPlanTransforms::addBranchWeightToMiddleTerminator(VPlan &Plan,
+                                                        ElementCount VF) {
+  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
+  auto *MiddleTerm =
+      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
+  // Only add branch metadata if there is a (conditional) terminator.
+  if (!MiddleTerm)
+    return;
+
+  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
+         "must have a BranchOnCond");
+  // Assume that `TripCount % VectorStep ` is equally distributed.
+  unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
+  assert(VectorStep > 0 && "trip count should not be zero");
+  MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
+  MDNode *BranchWeights =
+      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
+  MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 34e2de4eb3b7..5a03bdb7c688 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -234,6 +234,10 @@ struct VPlanTransforms {
   /// removed in the future.
   static DenseMap<VPBasicBlock *, VPValue *>
   introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);
+
+  /// Add branch weight metadata, if the \p Plan's middle block is terminated by
+  /// a BranchOnCond recipe.
+  static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF);
 };
 
 } // namespace llvm

From d6e25c4d21ebe20aaa6cbf6e2b9afde8f6713160 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Sat, 14 Jun 2025 10:21:08 -0700
Subject: [PATCH 0386/1322] [SelectionDAG] Take passthru into account when
 widening ISD::MLOAD (#144170)

#140595 used vp.load in the cases where we need to widen masked.load.
However, we didn't account for the passthru operand so it might
miscompile when the passthru is not undef. While we can simply avoid
using vp.load to widen when passthru is not undef, doing so will ran
into the exact same crash described in #140198 , so for scalable vector,
this patch manually merges the vp.load result with passthru when the
latter is not undef.
---
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 17 ++++++++++++--
 .../rvv/fixed-vectors-masked-load-int.ll      | 13 +++++++++++
 .../test/CodeGen/RISCV/rvv/masked-load-int.ll | 22 ++++++++++++++++++-
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f63fe17da51f..c56cfec81acd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6149,7 +6149,12 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
 
   if (ExtType == ISD::NON_EXTLOAD &&
       TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WidenVT) &&
-      TLI.isTypeLegal(WideMaskVT)) {
+      TLI.isTypeLegal(WideMaskVT) &&
+      // If there is a passthru, we shouldn't use vp.load. However,
+      // type legalizer will struggle on masked.load with
+      // scalable vectors, so for scalable vectors, we still use vp.load
+      // but manually merge the load result with the passthru using vp.select.
+      (N->getPassThru()->isUndef() || VT.isScalableVector())) {
     Mask = DAG.getInsertSubvector(dl, DAG.getUNDEF(WideMaskVT), Mask, 0);
     SDValue EVL = DAG.getElementCount(dl, TLI.getVPExplicitVectorLengthTy(),
                                       VT.getVectorElementCount());
@@ -6157,12 +6162,20 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
         DAG.getLoadVP(N->getAddressingMode(), ISD::NON_EXTLOAD, WidenVT, dl,
                       N->getChain(), N->getBasePtr(), N->getOffset(), Mask, EVL,
                       N->getMemoryVT(), N->getMemOperand());
+    SDValue NewVal = NewLoad;
+
+    // Manually merge with vp.select
+    if (!N->getPassThru()->isUndef()) {
+      assert(WidenVT.isScalableVector());
+      NewVal =
+          DAG.getNode(ISD::VP_SELECT, dl, WidenVT, Mask, NewVal, PassThru, EVL);
+    }
 
     // Modified the chain - switch anything that used the old chain to use
     // the new one.
     ReplaceValueWith(SDValue(N, 1), NewLoad.getValue(1));
 
-    return NewLoad;
+    return NewVal;
   }
 
   // The mask should be widened as well
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
index 545c89495e62..ed60d9130849 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
@@ -341,3 +341,16 @@ define <7 x i8> @masked_load_v7i8(ptr %a, <7 x i1> %mask) {
   ret <7 x i8> %load
 }
 
+define <7 x i8> @masked_load_passthru_v7i8(ptr %a, <7 x i1> %mask) {
+; CHECK-LABEL: masked_load_passthru_v7i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 127
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vmand.mm v0, v0, v8
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vle8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+  %load = call <7 x i8> @llvm.masked.load.v7i8(ptr %a, i32 8, <7 x i1> %mask, <7 x i8> zeroinitializer)
+  ret <7 x i8> %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
index d992669306fb..75537406f351 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
@@ -21,7 +21,27 @@ define <vscale x 1 x i8> @masked_load_nxv1i8(ptr %a, <vscale x 1 x i1> %mask) no
   %load = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr %a, i32 1, <vscale x 1 x i1> %mask, <vscale x 1 x i8> undef)
   ret <vscale x 1 x i8> %load
 }
-declare <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr, i32, <vscale x 1 x i1>, <vscale x 1 x i8>)
+
+define <vscale x 1 x i8> @masked_load_passthru_nxv1i8(ptr %a, <vscale x 1 x i1> %mask) nounwind {
+; V-LABEL: masked_load_passthru_nxv1i8:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a1, zero, e8, mf8, ta, mu
+; V-NEXT:    vmv.v.i v8, 0
+; V-NEXT:    vle8.v v8, (a0), v0.t
+; V-NEXT:    ret
+;
+; ZVE32-LABEL: masked_load_passthru_nxv1i8:
+; ZVE32:       # %bb.0:
+; ZVE32-NEXT:    csrr a1, vlenb
+; ZVE32-NEXT:    srli a1, a1, 3
+; ZVE32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
+; ZVE32-NEXT:    vmv.v.i v8, 0
+; ZVE32-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
+; ZVE32-NEXT:    vle8.v v8, (a0), v0.t
+; ZVE32-NEXT:    ret
+  %load = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr %a, i32 1, <vscale x 1 x i1> %mask, <vscale x 1 x i8> zeroinitializer)
+  ret <vscale x 1 x i8> %load
+}
 
 define <vscale x 1 x i16> @masked_load_nxv1i16(ptr %a, <vscale x 1 x i1> %mask) nounwind {
 ; V-LABEL: masked_load_nxv1i16:

From db682a721aabf3c33dfda471bf6a7908fbf656b4 Mon Sep 17 00:00:00 2001
From: Tomer Shafir <tomer.shafir8@gmail.com>
Date: Sat, 14 Jun 2025 21:06:43 +0300
Subject: [PATCH 0387/1322] [utils] Add "aarch64-apple-macosx" triple to
 update_llc_test_checks.py (#144023)

Add a missing valid triple "aarch64-apple-macosx" for usability.
---
 llvm/utils/UpdateTestChecks/asm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py
index da7e7ecc24bd..3754aa2eeba8 100644
--- a/llvm/utils/UpdateTestChecks/asm.py
+++ b/llvm/utils/UpdateTestChecks/asm.py
@@ -561,6 +561,7 @@ def get_run_handler(triple):
         "aarch64": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_RE),
         "aarch64-apple-darwin": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE),
         "aarch64-apple-ios": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE),
+        "aarch64-apple-macosx": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE),
         "bpf": (scrub_asm_bpf, ASM_FUNCTION_BPF_RE),
         "bpfel": (scrub_asm_bpf, ASM_FUNCTION_BPF_RE),
         "bpfeb": (scrub_asm_bpf, ASM_FUNCTION_BPF_RE),

From 0ff95c9eb1e3b0785724d3e33df1e1f77f2c7473 Mon Sep 17 00:00:00 2001
From: "Oleksandr T." <oleksandr.tarasiuk@outlook.com>
Date: Sun, 15 Jun 2025 00:01:25 +0300
Subject: [PATCH 0388/1322] [Clang] add fix-it hints for unknown attributes
 (#141305)

This patch adds fix-it hints for unknown attribute names when Clang
suggests a correction
---
 .../include/clang/Basic/AttributeCommonInfo.h |  50 +++--
 .../include/clang/Basic/AttributeScopeInfo.h  |  48 +++++
 clang/include/clang/Sema/ParsedAttr.h         | 184 +++++++++---------
 clang/lib/AST/ASTImporter.cpp                 |   5 +-
 clang/lib/Basic/Attributes.cpp                |  55 ++++--
 clang/lib/Parse/ParseDecl.cpp                 |  96 ++++-----
 clang/lib/Parse/ParseDeclCXX.cpp              |  37 ++--
 clang/lib/Parse/ParseExprCXX.cpp              |   4 +-
 clang/lib/Parse/ParseHLSL.cpp                 |   4 +-
 clang/lib/Parse/ParseObjc.cpp                 |   2 +-
 clang/lib/Parse/ParsePragma.cpp               |   2 +-
 clang/lib/Parse/ParseStmt.cpp                 |   4 +-
 clang/lib/Sema/SemaAPINotes.cpp               |   7 +-
 clang/lib/Sema/SemaDeclAttr.cpp               |  58 ++++--
 clang/lib/Sema/SemaDeclCXX.cpp                |   3 +-
 clang/lib/Sema/SemaStmtAttr.cpp               |  14 +-
 clang/lib/Sema/SemaType.cpp                   |  10 +-
 clang/lib/Serialization/ASTReaderDecl.cpp     |   4 +-
 .../dcl.module/dcl.module.import/p1.cppm      |   2 +-
 clang/test/FixIt/fixit-unknown-attributes.cpp |  74 +++++++
 .../Parser/cxx11-base-spec-attributes.cpp     |   2 +-
 clang/test/Parser/objcxx11-attributes.mm      |   2 +-
 clang/test/Sema/unknown-attributes.c          |  11 +-
 ...attr-non-x86-no_caller_saved_registers.cpp |   2 +-
 24 files changed, 432 insertions(+), 248 deletions(-)
 create mode 100644 clang/include/clang/Basic/AttributeScopeInfo.h
 create mode 100644 clang/test/FixIt/fixit-unknown-attributes.cpp

diff --git a/clang/include/clang/Basic/AttributeCommonInfo.h b/clang/include/clang/Basic/AttributeCommonInfo.h
index 34fc77436255..21a7a88a3fb9 100644
--- a/clang/include/clang/Basic/AttributeCommonInfo.h
+++ b/clang/include/clang/Basic/AttributeCommonInfo.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_CLANG_BASIC_ATTRIBUTECOMMONINFO_H
 #define LLVM_CLANG_BASIC_ATTRIBUTECOMMONINFO_H
 
+#include "clang/Basic/AttributeScopeInfo.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/TokenKinds.h"
 
@@ -61,6 +62,7 @@ public:
     /// implicitly.
     AS_Implicit
   };
+
   enum Kind {
 #define PARSED_ATTR(NAME) AT_##NAME,
 #include "clang/Basic/AttrParsedAttrList.inc"
@@ -78,9 +80,9 @@ public:
 
 private:
   const IdentifierInfo *AttrName = nullptr;
-  const IdentifierInfo *ScopeName = nullptr;
+  AttributeScopeInfo AttrScope;
   SourceRange AttrRange;
-  const SourceLocation ScopeLoc;
+
   // Corresponds to the Kind enum.
   LLVM_PREFERRED_TYPE(Kind)
   unsigned AttrKind : 16;
@@ -146,11 +148,10 @@ public:
   };
 
   AttributeCommonInfo(const IdentifierInfo *AttrName,
-                      const IdentifierInfo *ScopeName, SourceRange AttrRange,
-                      SourceLocation ScopeLoc, Kind AttrKind, Form FormUsed)
-      : AttrName(AttrName), ScopeName(ScopeName), AttrRange(AttrRange),
-        ScopeLoc(ScopeLoc), AttrKind(AttrKind),
-        SyntaxUsed(FormUsed.getSyntax()),
+                      AttributeScopeInfo AttrScope, SourceRange AttrRange,
+                      Kind AttrKind, Form FormUsed)
+      : AttrName(AttrName), AttrScope(AttrScope), AttrRange(AttrRange),
+        AttrKind(AttrKind), SyntaxUsed(FormUsed.getSyntax()),
         SpellingIndex(FormUsed.getSpellingIndex()),
         IsAlignas(FormUsed.isAlignas()),
         IsRegularKeywordAttribute(FormUsed.isRegularKeywordAttribute()) {
@@ -158,21 +159,20 @@ public:
            "Invalid syntax!");
   }
 
-  AttributeCommonInfo(const IdentifierInfo *AttrName,
-                      const IdentifierInfo *ScopeName, SourceRange AttrRange,
-                      SourceLocation ScopeLoc, Form FormUsed)
+  AttributeCommonInfo(const IdentifierInfo *AttrName, AttributeScopeInfo Scope,
+                      SourceRange AttrRange, Form FormUsed)
       : AttributeCommonInfo(
-            AttrName, ScopeName, AttrRange, ScopeLoc,
-            getParsedKind(AttrName, ScopeName, FormUsed.getSyntax()),
+            AttrName, Scope, AttrRange,
+            getParsedKind(AttrName, Scope.getName(), FormUsed.getSyntax()),
             FormUsed) {}
 
   AttributeCommonInfo(const IdentifierInfo *AttrName, SourceRange AttrRange,
                       Form FormUsed)
-      : AttributeCommonInfo(AttrName, nullptr, AttrRange, SourceLocation(),
+      : AttributeCommonInfo(AttrName, AttributeScopeInfo(), AttrRange,
                             FormUsed) {}
 
   AttributeCommonInfo(SourceRange AttrRange, Kind K, Form FormUsed)
-      : AttributeCommonInfo(nullptr, nullptr, AttrRange, SourceLocation(), K,
+      : AttributeCommonInfo(nullptr, AttributeScopeInfo(), AttrRange, K,
                             FormUsed) {}
 
   AttributeCommonInfo(AttributeCommonInfo &&) = default;
@@ -190,17 +190,27 @@ public:
   SourceRange getRange() const { return AttrRange; }
   void setRange(SourceRange R) { AttrRange = R; }
 
-  bool hasScope() const { return ScopeName; }
-  const IdentifierInfo *getScopeName() const { return ScopeName; }
-  SourceLocation getScopeLoc() const { return ScopeLoc; }
+  bool hasScope() const { return AttrScope.isValid(); }
+  bool isExplicitScope() const { return AttrScope.isExplicit(); }
+
+  const IdentifierInfo *getScopeName() const { return AttrScope.getName(); }
+  SourceLocation getScopeLoc() const { return AttrScope.getNameLoc(); }
 
   /// Gets the normalized full name, which consists of both scope and name and
   /// with surrounding underscores removed as appropriate (e.g.
   /// __gnu__::__attr__ will be normalized to gnu::attr).
   std::string getNormalizedFullName() const;
-  std::optional<std::string>
-  getCorrectedFullName(const TargetInfo &Target,
-                       const LangOptions &LangOpts) const;
+  std::string getNormalizedFullName(StringRef ScopeName,
+                                    StringRef AttrName) const;
+  StringRef getNormalizedScopeName() const;
+  StringRef getNormalizedAttrName(StringRef ScopeName) const;
+
+  std::optional<StringRef> tryGetCorrectedScopeName(StringRef ScopeName) const;
+  std::optional<StringRef>
+  tryGetCorrectedAttrName(StringRef ScopeName, StringRef AttrName,
+                          const TargetInfo &Target,
+                          const LangOptions &LangOpts) const;
+
   SourceRange getNormalizedRange() const;
 
   bool isDeclspecAttribute() const { return SyntaxUsed == AS_Declspec; }
diff --git a/clang/include/clang/Basic/AttributeScopeInfo.h b/clang/include/clang/Basic/AttributeScopeInfo.h
new file mode 100644
index 000000000000..cca4df7c11b0
--- /dev/null
+++ b/clang/include/clang/Basic/AttributeScopeInfo.h
@@ -0,0 +1,48 @@
+//==- AttributeScopeInfo.h - Base info about an Attribute Scope --*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AttributeScopeInfo type, which represents information
+// about the scope of an attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_ATTRIBUTESCOPEINFO_H
+#define LLVM_CLANG_BASIC_ATTRIBUTESCOPEINFO_H
+
+#include "clang/Basic/SourceLocation.h"
+
+namespace clang {
+
+class IdentifierInfo;
+
+class AttributeScopeInfo {
+public:
+  AttributeScopeInfo() = default;
+
+  AttributeScopeInfo(const IdentifierInfo *Name, SourceLocation NameLoc)
+      : Name(Name), NameLoc(NameLoc) {}
+
+  AttributeScopeInfo(const IdentifierInfo *Name, SourceLocation NameLoc,
+                     SourceLocation CommonScopeLoc)
+      : Name(Name), NameLoc(NameLoc), CommonScopeLoc(CommonScopeLoc) {}
+
+  const IdentifierInfo *getName() const { return Name; }
+  SourceLocation getNameLoc() const { return NameLoc; }
+
+  bool isValid() const { return Name != nullptr; }
+  bool isExplicit() const { return CommonScopeLoc.isInvalid(); }
+
+private:
+  const IdentifierInfo *Name = nullptr;
+  SourceLocation NameLoc;
+  SourceLocation CommonScopeLoc;
+};
+
+} // namespace clang
+
+#endif // LLVM_CLANG_BASIC_ATTRIBUTESCOPEINFO_H
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h
index 9e050ab9a620..6b3c5a173417 100644
--- a/clang/include/clang/Sema/ParsedAttr.h
+++ b/clang/include/clang/Sema/ParsedAttr.h
@@ -204,10 +204,9 @@ private:
 
   /// Constructor for attributes with expression arguments.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             ArgsUnion *args, unsigned numArgs, Form formUsed,
-             SourceLocation ellipsisLoc)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
+             AttributeScopeInfo scope, ArgsUnion *args, unsigned numArgs,
+             Form formUsed, SourceLocation ellipsisLoc)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed),
         EllipsisLoc(ellipsisLoc), NumArgs(numArgs), Invalid(false),
         UsedAsTypeAttr(false), IsAvailability(false),
         IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(false),
@@ -219,14 +218,14 @@ private:
 
   /// Constructor for availability attributes.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             IdentifierLoc *Parm, const AvailabilityChange &introduced,
+             AttributeScopeInfo scope, IdentifierLoc *Parm,
+             const AvailabilityChange &introduced,
              const AvailabilityChange &deprecated,
              const AvailabilityChange &obsoleted, SourceLocation unavailable,
              const Expr *messageExpr, Form formUsed, SourceLocation strict,
              const Expr *replacementExpr, const IdentifierLoc *environmentLoc)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
-        NumArgs(1), Invalid(false), UsedAsTypeAttr(false), IsAvailability(true),
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed), NumArgs(1),
+        Invalid(false), UsedAsTypeAttr(false), IsAvailability(true),
         IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(false),
         HasProcessingCache(false), IsPragmaClangAttribute(false),
         UnavailableLoc(unavailable), MessageExpr(messageExpr),
@@ -240,14 +239,13 @@ private:
 
   /// Constructor for objc_bridge_related attributes.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             IdentifierLoc *Parm1, IdentifierLoc *Parm2, IdentifierLoc *Parm3,
-             Form formUsed)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
-        NumArgs(3), Invalid(false), UsedAsTypeAttr(false),
-        IsAvailability(false), IsTypeTagForDatatype(false), IsProperty(false),
-        HasParsedType(false), HasProcessingCache(false),
-        IsPragmaClangAttribute(false), Info(ParsedAttrInfo::get(*this)) {
+             AttributeScopeInfo scope, IdentifierLoc *Parm1,
+             IdentifierLoc *Parm2, IdentifierLoc *Parm3, Form formUsed)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed), NumArgs(3),
+        Invalid(false), UsedAsTypeAttr(false), IsAvailability(false),
+        IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(false),
+        HasProcessingCache(false), IsPragmaClangAttribute(false),
+        Info(ParsedAttrInfo::get(*this)) {
     ArgsUnion *Args = getArgsBuffer();
     Args[0] = Parm1;
     Args[1] = Parm2;
@@ -256,14 +254,14 @@ private:
 
   /// Constructor for type_tag_for_datatype attribute.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             IdentifierLoc *ArgKind, ParsedType matchingCType,
-             bool layoutCompatible, bool mustBeNull, Form formUsed)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
-        NumArgs(1), Invalid(false), UsedAsTypeAttr(false),
-        IsAvailability(false), IsTypeTagForDatatype(true), IsProperty(false),
-        HasParsedType(false), HasProcessingCache(false),
-        IsPragmaClangAttribute(false), Info(ParsedAttrInfo::get(*this)) {
+             AttributeScopeInfo scope, IdentifierLoc *ArgKind,
+             ParsedType matchingCType, bool layoutCompatible, bool mustBeNull,
+             Form formUsed)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed), NumArgs(1),
+        Invalid(false), UsedAsTypeAttr(false), IsAvailability(false),
+        IsTypeTagForDatatype(true), IsProperty(false), HasParsedType(false),
+        HasProcessingCache(false), IsPragmaClangAttribute(false),
+        Info(ParsedAttrInfo::get(*this)) {
     ArgsUnion PVal(ArgKind);
     memcpy(getArgsBuffer(), &PVal, sizeof(ArgsUnion));
     detail::TypeTagForDatatypeData &ExtraData = getTypeTagForDatatypeDataSlot();
@@ -274,9 +272,9 @@ private:
 
   /// Constructor for attributes with a single type argument.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             ParsedType typeArg, Form formUsed, SourceLocation ellipsisLoc)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
+             AttributeScopeInfo scope, ParsedType typeArg, Form formUsed,
+             SourceLocation ellipsisLoc)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed),
         EllipsisLoc(ellipsisLoc), NumArgs(0), Invalid(false),
         UsedAsTypeAttr(false), IsAvailability(false),
         IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(true),
@@ -287,13 +285,13 @@ private:
 
   /// Constructor for microsoft __declspec(property) attribute.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             IdentifierInfo *getterId, IdentifierInfo *setterId, Form formUsed)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
-        NumArgs(0), Invalid(false), UsedAsTypeAttr(false),
-        IsAvailability(false), IsTypeTagForDatatype(false), IsProperty(true),
-        HasParsedType(false), HasProcessingCache(false),
-        IsPragmaClangAttribute(false), Info(ParsedAttrInfo::get(*this)) {
+             AttributeScopeInfo scope, IdentifierInfo *getterId,
+             IdentifierInfo *setterId, Form formUsed)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed), NumArgs(0),
+        Invalid(false), UsedAsTypeAttr(false), IsAvailability(false),
+        IsTypeTagForDatatype(false), IsProperty(true), HasParsedType(false),
+        HasProcessingCache(false), IsPragmaClangAttribute(false),
+        Info(ParsedAttrInfo::get(*this)) {
     new (&getPropertyDataBuffer()) detail::PropertyData(getterId, setterId);
   }
 
@@ -735,21 +733,21 @@ public:
   void takeFrom(ParsedAttributesView &List, AttributePool &Pool);
 
   ParsedAttr *create(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     ArgsUnion *args, unsigned numArgs, ParsedAttr::Form form,
+                     AttributeScopeInfo scope, ArgsUnion *args,
+                     unsigned numArgs, ParsedAttr::Form form,
                      SourceLocation ellipsisLoc = SourceLocation()) {
     void *memory = allocate(
         ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::AvailabilityData,
                                      detail::TypeTagForDatatypeData, ParsedType,
                                      detail::PropertyData>(numArgs, 0, 0, 0,
                                                            0));
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       args, numArgs, form, ellipsisLoc));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, args,
+                                       numArgs, form, ellipsisLoc));
   }
 
   ParsedAttr *create(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierLoc *Param, const AvailabilityChange &introduced,
+                     AttributeScopeInfo scope, IdentifierLoc *Param,
+                     const AvailabilityChange &introduced,
                      const AvailabilityChange &deprecated,
                      const AvailabilityChange &obsoleted,
                      SourceLocation unavailable, const Expr *MessageExpr,
@@ -757,58 +755,54 @@ public:
                      const Expr *ReplacementExpr,
                      IdentifierLoc *EnvironmentLoc) {
     void *memory = allocate(AttributeFactory::AvailabilityAllocSize);
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       Param, introduced, deprecated, obsoleted,
-                                       unavailable, MessageExpr, form, strict,
-                                       ReplacementExpr, EnvironmentLoc));
+    return add(new (memory)
+                   ParsedAttr(attrName, attrRange, scope, Param, introduced,
+                              deprecated, obsoleted, unavailable, MessageExpr,
+                              form, strict, ReplacementExpr, EnvironmentLoc));
   }
 
   ParsedAttr *create(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierLoc *Param1, IdentifierLoc *Param2,
-                     IdentifierLoc *Param3, ParsedAttr::Form form) {
+                     AttributeScopeInfo scope, IdentifierLoc *Param1,
+                     IdentifierLoc *Param2, IdentifierLoc *Param3,
+                     ParsedAttr::Form form) {
     void *memory = allocate(
         ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::AvailabilityData,
                                      detail::TypeTagForDatatypeData, ParsedType,
                                      detail::PropertyData>(3, 0, 0, 0, 0));
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       Param1, Param2, Param3, form));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, Param1,
+                                       Param2, Param3, form));
   }
 
-  ParsedAttr *
-  createTypeTagForDatatype(IdentifierInfo *attrName, SourceRange attrRange,
-                           IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                           IdentifierLoc *argumentKind,
-                           ParsedType matchingCType, bool layoutCompatible,
-                           bool mustBeNull, ParsedAttr::Form form) {
+  ParsedAttr *createTypeTagForDatatype(
+      IdentifierInfo *attrName, SourceRange attrRange, AttributeScopeInfo scope,
+      IdentifierLoc *argumentKind, ParsedType matchingCType,
+      bool layoutCompatible, bool mustBeNull, ParsedAttr::Form form) {
     void *memory = allocate(AttributeFactory::TypeTagForDatatypeAllocSize);
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       argumentKind, matchingCType,
-                                       layoutCompatible, mustBeNull, form));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, argumentKind,
+                                       matchingCType, layoutCompatible,
+                                       mustBeNull, form));
   }
 
   ParsedAttr *createTypeAttribute(IdentifierInfo *attrName,
                                   SourceRange attrRange,
-                                  IdentifierInfo *scopeName,
-                                  SourceLocation scopeLoc, ParsedType typeArg,
+                                  AttributeScopeInfo scope, ParsedType typeArg,
                                   ParsedAttr::Form formUsed,
                                   SourceLocation ellipsisLoc) {
     void *memory = allocate(
         ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::AvailabilityData,
                                      detail::TypeTagForDatatypeData, ParsedType,
                                      detail::PropertyData>(0, 0, 0, 1, 0));
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       typeArg, formUsed, ellipsisLoc));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, typeArg,
+                                       formUsed, ellipsisLoc));
   }
 
   ParsedAttr *
   createPropertyAttribute(IdentifierInfo *attrName, SourceRange attrRange,
-                          IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                          IdentifierInfo *getterId, IdentifierInfo *setterId,
-                          ParsedAttr::Form formUsed) {
+                          AttributeScopeInfo scope, IdentifierInfo *getterId,
+                          IdentifierInfo *setterId, ParsedAttr::Form formUsed) {
     void *memory = allocate(AttributeFactory::PropertyAllocSize);
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       getterId, setterId, formUsed));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, getterId,
+                                       setterId, formUsed));
   }
 };
 
@@ -982,19 +976,19 @@ public:
 
   /// Add attribute with expression arguments.
   ParsedAttr *addNew(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     ArgsUnion *args, unsigned numArgs, ParsedAttr::Form form,
+                     AttributeScopeInfo scope, ArgsUnion *args,
+                     unsigned numArgs, ParsedAttr::Form form,
                      SourceLocation ellipsisLoc = SourceLocation()) {
-    ParsedAttr *attr = pool.create(attrName, attrRange, scopeName, scopeLoc,
-                                   args, numArgs, form, ellipsisLoc);
+    ParsedAttr *attr = pool.create(attrName, attrRange, scope, args, numArgs,
+                                   form, ellipsisLoc);
     addAtEnd(attr);
     return attr;
   }
 
   /// Add availability attribute.
   ParsedAttr *addNew(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierLoc *Param, const AvailabilityChange &introduced,
+                     AttributeScopeInfo scope, IdentifierLoc *Param,
+                     const AvailabilityChange &introduced,
                      const AvailabilityChange &deprecated,
                      const AvailabilityChange &obsoleted,
                      SourceLocation unavailable, const Expr *MessageExpr,
@@ -1002,33 +996,31 @@ public:
                      const Expr *ReplacementExpr,
                      IdentifierLoc *EnvironmentLoc) {
     ParsedAttr *attr =
-        pool.create(attrName, attrRange, scopeName, scopeLoc, Param, introduced,
-                    deprecated, obsoleted, unavailable, MessageExpr, form,
-                    strict, ReplacementExpr, EnvironmentLoc);
+        pool.create(attrName, attrRange, scope, Param, introduced, deprecated,
+                    obsoleted, unavailable, MessageExpr, form, strict,
+                    ReplacementExpr, EnvironmentLoc);
     addAtEnd(attr);
     return attr;
   }
 
   /// Add objc_bridge_related attribute.
   ParsedAttr *addNew(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierLoc *Param1, IdentifierLoc *Param2,
-                     IdentifierLoc *Param3, ParsedAttr::Form form) {
-    ParsedAttr *attr = pool.create(attrName, attrRange, scopeName, scopeLoc,
-                                   Param1, Param2, Param3, form);
+                     AttributeScopeInfo scope, IdentifierLoc *Param1,
+                     IdentifierLoc *Param2, IdentifierLoc *Param3,
+                     ParsedAttr::Form form) {
+    ParsedAttr *attr =
+        pool.create(attrName, attrRange, scope, Param1, Param2, Param3, form);
     addAtEnd(attr);
     return attr;
   }
 
   /// Add type_tag_for_datatype attribute.
-  ParsedAttr *
-  addNewTypeTagForDatatype(IdentifierInfo *attrName, SourceRange attrRange,
-                           IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                           IdentifierLoc *argumentKind,
-                           ParsedType matchingCType, bool layoutCompatible,
-                           bool mustBeNull, ParsedAttr::Form form) {
+  ParsedAttr *addNewTypeTagForDatatype(
+      IdentifierInfo *attrName, SourceRange attrRange, AttributeScopeInfo scope,
+      IdentifierLoc *argumentKind, ParsedType matchingCType,
+      bool layoutCompatible, bool mustBeNull, ParsedAttr::Form form) {
     ParsedAttr *attr = pool.createTypeTagForDatatype(
-        attrName, attrRange, scopeName, scopeLoc, argumentKind, matchingCType,
+        attrName, attrRange, scope, argumentKind, matchingCType,
         layoutCompatible, mustBeNull, form);
     addAtEnd(attr);
     return attr;
@@ -1036,12 +1028,11 @@ public:
 
   /// Add an attribute with a single type argument.
   ParsedAttr *addNewTypeAttr(IdentifierInfo *attrName, SourceRange attrRange,
-                             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                             ParsedType typeArg, ParsedAttr::Form formUsed,
+                             AttributeScopeInfo scope, ParsedType typeArg,
+                             ParsedAttr::Form formUsed,
                              SourceLocation ellipsisLoc = SourceLocation()) {
-    ParsedAttr *attr =
-        pool.createTypeAttribute(attrName, attrRange, scopeName, scopeLoc,
-                                 typeArg, formUsed, ellipsisLoc);
+    ParsedAttr *attr = pool.createTypeAttribute(attrName, attrRange, scope,
+                                                typeArg, formUsed, ellipsisLoc);
     addAtEnd(attr);
     return attr;
   }
@@ -1049,11 +1040,10 @@ public:
   /// Add microsoft __delspec(property) attribute.
   ParsedAttr *
   addNewPropertyAttr(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierInfo *getterId, IdentifierInfo *setterId,
-                     ParsedAttr::Form formUsed) {
+                     AttributeScopeInfo scope, IdentifierInfo *getterId,
+                     IdentifierInfo *setterId, ParsedAttr::Form formUsed) {
     ParsedAttr *attr = pool.createPropertyAttribute(
-        attrName, attrRange, scopeName, scopeLoc, getterId, setterId, formUsed);
+        attrName, attrRange, scope, getterId, setterId, formUsed);
     addAtEnd(attr);
     return attr;
   }
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 003bad225e30..5c44353d8b98 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -9333,8 +9333,9 @@ public:
     if (Err)
       return;
 
-    AttributeCommonInfo ToI(ToAttrName, ToScopeName, ToAttrRange, ToScopeLoc,
-                            FromAttr->getParsedKind(), FromAttr->getForm());
+    AttributeCommonInfo ToI(
+        ToAttrName, AttributeScopeInfo(ToScopeName, ToScopeLoc), ToAttrRange,
+        FromAttr->getParsedKind(), FromAttr->getForm());
     // The "SemanticSpelling" is not needed to be passed to the constructor.
     // That value is recalculated from the SpellingListIndex if needed.
     ToAttr = T::Create(Importer.getToContext(),
diff --git a/clang/lib/Basic/Attributes.cpp b/clang/lib/Basic/Attributes.cpp
index 905046685934..004e5209a44a 100644
--- a/clang/lib/Basic/Attributes.cpp
+++ b/clang/lib/Basic/Attributes.cpp
@@ -119,7 +119,6 @@ normalizeAttrScopeName(const IdentifierInfo *ScopeName,
                        AttributeCommonInfo::Syntax SyntaxUsed) {
   if (ScopeName)
     return normalizeAttrScopeName(ScopeName->getName(), SyntaxUsed);
-
   return "";
 }
 
@@ -141,12 +140,23 @@ static StringRef normalizeAttrName(StringRef AttrName,
   return AttrName;
 }
 
+StringRef AttributeCommonInfo::getNormalizedScopeName() const {
+  return normalizeAttrScopeName(getScopeName(), getSyntax());
+}
+
+StringRef
+AttributeCommonInfo::getNormalizedAttrName(StringRef ScopeName) const {
+  return normalizeAttrName(getAttrName()->getName(), ScopeName, getSyntax());
+}
+
 bool AttributeCommonInfo::isGNUScope() const {
-  return ScopeName && (ScopeName->isStr("gnu") || ScopeName->isStr("__gnu__"));
+  return AttrScope.isValid() && (AttrScope.getName()->isStr("gnu") ||
+                                 AttrScope.getName()->isStr("__gnu__"));
 }
 
 bool AttributeCommonInfo::isClangScope() const {
-  return ScopeName && (ScopeName->isStr("clang") || ScopeName->isStr("_Clang"));
+  return AttrScope.isValid() && (AttrScope.getName()->isStr("clang") ||
+                                 AttrScope.getName()->isStr("_Clang"));
 }
 
 #include "clang/Sema/AttrParsedAttrKinds.inc"
@@ -198,8 +208,16 @@ std::string AttributeCommonInfo::getNormalizedFullName() const {
       normalizeName(getAttrName(), getScopeName(), getSyntax()));
 }
 
+std::string
+AttributeCommonInfo::getNormalizedFullName(StringRef ScopeName,
+                                           StringRef AttrName) const {
+  return static_cast<std::string>(
+      normalizeName(AttrName, ScopeName, getSyntax()));
+}
+
 SourceRange AttributeCommonInfo::getNormalizedRange() const {
-  return hasScope() ? SourceRange(ScopeLoc, AttrRange.getEnd()) : AttrRange;
+  return hasScope() ? SourceRange(AttrScope.getNameLoc(), AttrRange.getEnd())
+                    : AttrRange;
 }
 
 static AttributeCommonInfo::Scope
@@ -239,10 +257,8 @@ static constexpr const char *AttrScopeSpellingList[] = {
 #include "clang/Basic/AttributeSpellingList.inc"
 };
 
-std::optional<std::string>
-AttributeCommonInfo::getCorrectedFullName(const TargetInfo &Target,
-                                          const LangOptions &LangOpts) const {
-  StringRef ScopeName = normalizeAttrScopeName(getScopeName(), getSyntax());
+std::optional<StringRef>
+AttributeCommonInfo::tryGetCorrectedScopeName(StringRef ScopeName) const {
   if (ScopeName.size() > 0 &&
       llvm::none_of(AttrScopeSpellingList,
                     [&](const char *S) { return S == ScopeName; })) {
@@ -251,25 +267,26 @@ AttributeCommonInfo::getCorrectedFullName(const TargetInfo &Target,
       STC.add(Scope);
 
     if (auto CorrectedScopeName = STC.getCorrection())
-      ScopeName = *CorrectedScopeName;
+      return CorrectedScopeName;
   }
+  return std::nullopt;
+}
 
-  StringRef AttrName =
-      normalizeAttrName(getAttrName()->getName(), ScopeName, getSyntax());
+std::optional<StringRef> AttributeCommonInfo::tryGetCorrectedAttrName(
+    StringRef ScopeName, StringRef AttrName, const TargetInfo &Target,
+    const LangOptions &LangOpts) const {
   if (llvm::none_of(AttrSpellingList,
                     [&](const char *A) { return A == AttrName; })) {
     SimpleTypoCorrection STC(AttrName);
     for (const auto &Attr : AttrSpellingList)
       STC.add(Attr);
 
-    if (auto CorrectedAttrName = STC.getCorrection())
-      AttrName = *CorrectedAttrName;
+    if (auto CorrectedAttrName = STC.getCorrection()) {
+      if (hasAttribute(getSyntax(), ScopeName, *CorrectedAttrName, Target,
+                       LangOpts,
+                       /*CheckPlugins=*/true))
+        return CorrectedAttrName;
+    }
   }
-
-  if (hasAttribute(getSyntax(), ScopeName, AttrName, Target, LangOpts,
-                   /*CheckPlugins=*/true))
-    return static_cast<std::string>(
-        normalizeName(AttrName, ScopeName, getSyntax()));
-
   return std::nullopt;
 }
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 647ee34efcab..02f33511dbd6 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -151,7 +151,7 @@ bool Parser::ParseSingleGNUAttribute(ParsedAttributes &Attrs,
   SourceLocation AttrNameLoc = ConsumeToken();
 
   if (Tok.isNot(tok::l_paren)) {
-    Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+    Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                  ParsedAttr::Form::GNU());
     return false;
   }
@@ -396,12 +396,12 @@ void Parser::ParseAttributeWithTypeArg(IdentifierInfo &AttrName,
     return;
 
   if (T.isUsable())
-    Attrs.addNewTypeAttr(&AttrName,
-                         SourceRange(AttrNameLoc, Parens.getCloseLocation()),
-                         ScopeName, ScopeLoc, T.get(), Form);
+    Attrs.addNewTypeAttr(
+        &AttrName, SourceRange(AttrNameLoc, Parens.getCloseLocation()),
+        AttributeScopeInfo(ScopeName, ScopeLoc), T.get(), Form);
   else
     Attrs.addNew(&AttrName, SourceRange(AttrNameLoc, Parens.getCloseLocation()),
-                 ScopeName, ScopeLoc, nullptr, 0, Form);
+                 AttributeScopeInfo(ScopeName, ScopeLoc), nullptr, 0, Form);
 }
 
 ExprResult
@@ -609,10 +609,12 @@ unsigned Parser::ParseAttributeArgsCommon(
 
     if (AttributeIsTypeArgAttr && !TheParsedType.get().isNull()) {
       Attrs.addNewTypeAttr(AttrName, SourceRange(AttrNameLoc, RParen),
-                           ScopeName, ScopeLoc, TheParsedType, Form);
+                           AttributeScopeInfo(ScopeName, ScopeLoc),
+                           TheParsedType, Form);
     } else {
-      Attrs.addNew(AttrName, SourceRange(AttrLoc, RParen), ScopeName, ScopeLoc,
-                   ArgExprs.data(), ArgExprs.size(), Form);
+      Attrs.addNew(AttrName, SourceRange(AttrLoc, RParen),
+                   AttributeScopeInfo(ScopeName, ScopeLoc), ArgExprs.data(),
+                   ArgExprs.size(), Form);
     }
   }
 
@@ -854,7 +856,7 @@ bool Parser::ParseMicrosoftDeclSpecArgs(IdentifierInfo *AttrName,
 
     // Only add the property attribute if it was well-formed.
     if (!HasInvalidAccessor)
-      Attrs.addNewPropertyAttr(AttrName, AttrNameLoc, nullptr, SourceLocation(),
+      Attrs.addNewPropertyAttr(AttrName, AttrNameLoc, AttributeScopeInfo(),
                                AccessorNames[AK_Get], AccessorNames[AK_Put],
                                ParsedAttr::Form::Declspec());
     T.skipToEnd();
@@ -940,7 +942,7 @@ void Parser::ParseMicrosoftDeclSpecs(ParsedAttributes &Attrs) {
             << AttrName->getName();
 
       if (!AttrHandled)
-        Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+        Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                      ParsedAttr::Form::Declspec());
     }
     T.consumeClose();
@@ -968,7 +970,7 @@ void Parser::ParseMicrosoftTypeAttributes(ParsedAttributes &attrs) {
     case tok::kw___uptr: {
       IdentifierInfo *AttrName = Tok.getIdentifierInfo();
       SourceLocation AttrNameLoc = ConsumeToken();
-      attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+      attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                    Kind);
       break;
     }
@@ -989,9 +991,8 @@ void Parser::ParseWebAssemblyFuncrefTypeAttribute(ParsedAttributes &attrs) {
 
   IdentifierInfo *AttrName = Tok.getIdentifierInfo();
   SourceLocation AttrNameLoc = ConsumeToken();
-  attrs.addNew(AttrName, AttrNameLoc, /*ScopeName=*/nullptr,
-               /*ScopeLoc=*/SourceLocation{}, /*Args=*/nullptr, /*numArgs=*/0,
-               tok::kw___funcref);
+  attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), /*Args=*/nullptr,
+               /*numArgs=*/0, tok::kw___funcref);
 }
 
 void Parser::DiagnoseAndSkipExtendedMicrosoftTypeAttributes() {
@@ -1035,7 +1036,7 @@ void Parser::ParseBorlandTypeAttributes(ParsedAttributes &attrs) {
   while (Tok.is(tok::kw___pascal)) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                  tok::kw___pascal);
   }
 }
@@ -1045,7 +1046,7 @@ void Parser::ParseOpenCLKernelAttributes(ParsedAttributes &attrs) {
   while (Tok.is(tok::kw___kernel)) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                  tok::kw___kernel);
   }
 }
@@ -1054,7 +1055,7 @@ void Parser::ParseCUDAFunctionAttributes(ParsedAttributes &attrs) {
   while (Tok.is(tok::kw___noinline__)) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                  tok::kw___noinline__);
   }
 }
@@ -1062,7 +1063,7 @@ void Parser::ParseCUDAFunctionAttributes(ParsedAttributes &attrs) {
 void Parser::ParseOpenCLQualifiers(ParsedAttributes &Attrs) {
   IdentifierInfo *AttrName = Tok.getIdentifierInfo();
   SourceLocation AttrNameLoc = Tok.getLocation();
-  Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+  Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                Tok.getKind());
 }
 
@@ -1074,7 +1075,7 @@ void Parser::ParseHLSLQualifiers(ParsedAttributes &Attrs) {
   IdentifierInfo *AttrName = Tok.getIdentifierInfo();
   auto Kind = Tok.getKind();
   SourceLocation AttrNameLoc = ConsumeToken();
-  Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0, Kind);
+  Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0, Kind);
 }
 
 void Parser::ParseNullabilityTypeSpecifiers(ParsedAttributes &attrs) {
@@ -1091,7 +1092,7 @@ void Parser::ParseNullabilityTypeSpecifiers(ParsedAttributes &attrs) {
       if (!getLangOpts().ObjC)
         Diag(AttrNameLoc, diag::ext_nullability)
           << AttrName;
-      attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+      attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                    Kind);
       break;
     }
@@ -1435,10 +1436,11 @@ void Parser::ParseAvailabilityAttribute(
 
   // Record this attribute
   attrs.addNew(&Availability,
-               SourceRange(AvailabilityLoc, T.getCloseLocation()), ScopeName,
-               ScopeLoc, Platform, Changes[Introduced], Changes[Deprecated],
-               Changes[Obsoleted], UnavailableLoc, MessageExpr.get(), Form,
-               StrictLoc, ReplacementExpr.get(), EnvironmentLoc);
+               SourceRange(AvailabilityLoc, T.getCloseLocation()),
+               AttributeScopeInfo(ScopeName, ScopeLoc), Platform,
+               Changes[Introduced], Changes[Deprecated], Changes[Obsoleted],
+               UnavailableLoc, MessageExpr.get(), Form, StrictLoc,
+               ReplacementExpr.get(), EnvironmentLoc);
 }
 
 void Parser::ParseExternalSourceSymbolAttribute(
@@ -1556,7 +1558,8 @@ void Parser::ParseExternalSourceSymbolAttribute(
   ArgsUnion Args[] = {Language.get(), DefinedInExpr.get(), GeneratedDeclaration,
                       USR.get()};
   Attrs.addNew(&ExternalSourceSymbol, SourceRange(Loc, T.getCloseLocation()),
-               ScopeName, ScopeLoc, Args, std::size(Args), Form);
+               AttributeScopeInfo(ScopeName, ScopeLoc), Args, std::size(Args),
+               Form);
 }
 
 void Parser::ParseObjCBridgeRelatedAttribute(
@@ -1624,8 +1627,8 @@ void Parser::ParseObjCBridgeRelatedAttribute(
   // Record this attribute
   Attrs.addNew(&ObjCBridgeRelated,
                SourceRange(ObjCBridgeRelatedLoc, T.getCloseLocation()),
-               ScopeName, ScopeLoc, RelatedClass, ClassMethod, InstanceMethod,
-               Form);
+               AttributeScopeInfo(ScopeName, ScopeLoc), RelatedClass,
+               ClassMethod, InstanceMethod, Form);
 }
 
 void Parser::ParseSwiftNewTypeAttribute(
@@ -1666,7 +1669,8 @@ void Parser::ParseSwiftNewTypeAttribute(
 
   ArgsUnion Args[] = {SwiftType};
   Attrs.addNew(&AttrName, SourceRange(AttrNameLoc, T.getCloseLocation()),
-               ScopeName, ScopeLoc, Args, std::size(Args), Form);
+               AttributeScopeInfo(ScopeName, ScopeLoc), Args, std::size(Args),
+               Form);
 }
 
 void Parser::ParseTypeTagForDatatypeAttribute(
@@ -1719,9 +1723,9 @@ void Parser::ParseTypeTagForDatatypeAttribute(
   }
 
   if (!T.consumeClose()) {
-    Attrs.addNewTypeTagForDatatype(&AttrName, AttrNameLoc, ScopeName, ScopeLoc,
-                                   ArgumentKind, MatchingCType.get(),
-                                   LayoutCompatible, MustBeNull, Form);
+    Attrs.addNewTypeTagForDatatype(
+        &AttrName, AttrNameLoc, AttributeScopeInfo(ScopeName, ScopeLoc),
+        ArgumentKind, MatchingCType.get(), LayoutCompatible, MustBeNull, Form);
   }
 
   if (EndLoc)
@@ -1828,9 +1832,10 @@ void Parser::ProhibitCXX11Attributes(ParsedAttributes &Attrs,
     if (!AL.isStandardAttributeSyntax())
       continue;
     if (AL.getKind() == ParsedAttr::UnknownAttribute) {
-      if (WarnOnUnknownAttrs)
-        Diag(AL.getLoc(), diag::warn_unknown_attribute_ignored)
-            << AL << AL.getRange();
+      if (WarnOnUnknownAttrs) {
+        Actions.DiagnoseUnknownAttribute(AL);
+        AL.setInvalid();
+      }
     } else {
       Diag(AL.getLoc(), AttrDiagID) << AL;
       AL.setInvalid();
@@ -3117,12 +3122,12 @@ void Parser::ParseAlignmentSpecifier(ParsedAttributes &Attrs,
     *EndLoc = T.getCloseLocation();
 
   if (IsType) {
-    Attrs.addNewTypeAttr(KWName, KWLoc, nullptr, KWLoc, TypeResult, Kind,
+    Attrs.addNewTypeAttr(KWName, KWLoc, AttributeScopeInfo(), TypeResult, Kind,
                          EllipsisLoc);
   } else {
     ArgsVector ArgExprs;
     ArgExprs.push_back(ArgExpr.get());
-    Attrs.addNew(KWName, KWLoc, nullptr, KWLoc, ArgExprs.data(), 1, Kind,
+    Attrs.addNew(KWName, KWLoc, AttributeScopeInfo(), ArgExprs.data(), 1, Kind,
                  EllipsisLoc);
   }
 }
@@ -3168,9 +3173,8 @@ void Parser::ParsePtrauthQualifier(ParsedAttributes &Attrs) {
     return;
   }
 
-  Attrs.addNew(KwName, SourceRange(KwLoc, EndLoc),
-               /*scope*/ nullptr, SourceLocation(), ArgExprs.data(),
-               ArgExprs.size(),
+  Attrs.addNew(KwName, SourceRange(KwLoc, EndLoc), AttributeScopeInfo(),
+               ArgExprs.data(), ArgExprs.size(),
                ParsedAttr::Form::Keyword(/*IsAlignAs=*/false,
                                          /*IsRegularKeywordAttribute=*/false));
 }
@@ -3216,7 +3220,7 @@ void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
       Ctx.getSizeType(), SourceLocation()));
 
   Attrs.addNew(&AttrName, SourceRange(AttrNameLoc, Parens.getCloseLocation()),
-               ScopeName, ScopeLoc, ArgExprs.data(), ArgExprs.size(), Form);
+               AttributeScopeInfo(), ArgExprs.data(), ArgExprs.size(), Form);
 }
 
 ExprResult Parser::ParseExtIntegerArgument() {
@@ -3995,7 +3999,7 @@ void Parser::ParseDeclarationSpecifiers(
       isInvalid = DS.setFunctionSpecForceInline(Loc, PrevSpec, DiagID);
       IdentifierInfo *AttrName = Tok.getIdentifierInfo();
       SourceLocation AttrNameLoc = Tok.getLocation();
-      DS.getAttributes().addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc,
+      DS.getAttributes().addNew(AttrName, AttrNameLoc, AttributeScopeInfo(),
                                 nullptr, 0, tok::kw___forceinline);
       break;
     }
@@ -4053,8 +4057,9 @@ void Parser::ParseDeclarationSpecifiers(
 
     // Objective-C 'kindof' types.
     case tok::kw___kindof:
-      DS.getAttributes().addNew(Tok.getIdentifierInfo(), Loc, nullptr, Loc,
-                                nullptr, 0, tok::kw___kindof);
+      DS.getAttributes().addNew(Tok.getIdentifierInfo(), Loc,
+                                AttributeScopeInfo(), nullptr, 0,
+                                tok::kw___kindof);
       (void)ConsumeToken();
       continue;
 
@@ -6238,8 +6243,9 @@ void Parser::ParseTypeQualifierListOpt(
 
     // Objective-C 'kindof' types.
     case tok::kw___kindof:
-      DS.getAttributes().addNew(Tok.getIdentifierInfo(), Loc, nullptr, Loc,
-                                nullptr, 0, tok::kw___kindof);
+      DS.getAttributes().addNew(Tok.getIdentifierInfo(), Loc,
+                                AttributeScopeInfo(), nullptr, 0,
+                                tok::kw___kindof);
       (void)ConsumeToken();
       continue;
 
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 5f34370aeeb2..f31c9265a007 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1430,7 +1430,7 @@ void Parser::ParseMicrosoftInheritanceClassAttributes(ParsedAttributes &attrs) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     auto Kind = Tok.getKind();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0, Kind);
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0, Kind);
   }
 }
 
@@ -1439,7 +1439,7 @@ void Parser::ParseNullabilityClassAttributes(ParsedAttributes &attrs) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     auto Kind = Tok.getKind();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0, Kind);
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0, Kind);
   }
 }
 
@@ -4493,8 +4493,8 @@ bool Parser::ParseCXXAssumeAttributeArg(
   ArgsUnion Assumption = Res.get();
   auto RParen = Tok.getLocation();
   T.consumeClose();
-  Attrs.addNew(AttrName, SourceRange(AttrNameLoc, RParen), ScopeName, ScopeLoc,
-               &Assumption, 1, Form);
+  Attrs.addNew(AttrName, SourceRange(AttrNameLoc, RParen),
+               AttributeScopeInfo(ScopeName, ScopeLoc), &Assumption, 1, Form);
 
   if (EndLoc)
     *EndLoc = RParen;
@@ -4574,7 +4574,7 @@ bool Parser::ParseCXX11AttributeArgs(
 
     // Ignore attributes that don't exist for the target.
     if (!Attr.existsInTarget(getTargetInfo())) {
-      Diag(LParenLoc, diag::warn_unknown_attribute_ignored) << AttrName;
+      Actions.DiagnoseUnknownAttribute(Attr);
       Attr.setInvalid(true);
       return true;
     }
@@ -4629,7 +4629,7 @@ void Parser::ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs,
                                  /*ScopeName*/ nullptr,
                                  /*ScopeLoc*/ Loc, Form);
     } else
-      Attrs.addNew(AttrName, Loc, nullptr, Loc, nullptr, 0, Form);
+      Attrs.addNew(AttrName, Loc, AttributeScopeInfo(), nullptr, 0, Form);
     return;
   }
 
@@ -4724,12 +4724,15 @@ void Parser::ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs,
                                            ScopeName, ScopeLoc, OpenMPTokens);
 
     if (!AttrParsed) {
-      Attrs.addNew(
-          AttrName,
-          SourceRange(ScopeLoc.isValid() ? ScopeLoc : AttrLoc, AttrLoc),
-          ScopeName, ScopeLoc, nullptr, 0,
-          getLangOpts().CPlusPlus ? ParsedAttr::Form::CXX11()
-                                  : ParsedAttr::Form::C23());
+      Attrs.addNew(AttrName,
+                   SourceRange(ScopeLoc.isValid() && CommonScopeLoc.isInvalid()
+                                   ? ScopeLoc
+                                   : AttrLoc,
+                               AttrLoc),
+                   AttributeScopeInfo(ScopeName, ScopeLoc, CommonScopeLoc),
+                   nullptr, 0,
+                   getLangOpts().CPlusPlus ? ParsedAttr::Form::CXX11()
+                                           : ParsedAttr::Form::C23());
       AttrParsed = true;
     }
 
@@ -4890,8 +4893,8 @@ void Parser::ParseMicrosoftUuidAttributeArgs(ParsedAttributes &Attrs) {
   }
 
   if (!T.consumeClose()) {
-    Attrs.addNew(UuidIdent, SourceRange(UuidLoc, T.getCloseLocation()), nullptr,
-                 SourceLocation(), ArgExprs.data(), ArgExprs.size(),
+    Attrs.addNew(UuidIdent, SourceRange(UuidLoc, T.getCloseLocation()),
+                 AttributeScopeInfo(), ArgExprs.data(), ArgExprs.size(),
                  ParsedAttr::Form::Microsoft());
   }
 }
@@ -4975,8 +4978,8 @@ void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
 
   if (!T.consumeClose())
     Attrs.addNew(RootSignatureIdent,
-                 SourceRange(RootSignatureLoc, T.getCloseLocation()), nullptr,
-                 SourceLocation(), Args.data(), Args.size(),
+                 SourceRange(RootSignatureLoc, T.getCloseLocation()),
+                 AttributeScopeInfo(), Args.data(), Args.size(),
                  ParsedAttr::Form::Microsoft());
 }
 
@@ -5026,7 +5029,7 @@ void Parser::ParseMicrosoftAttributes(ParsedAttributes &Attrs) {
             ReplayOpenMPAttributeTokens(OpenMPTokens);
           }
           if (!AttrParsed) {
-            Attrs.addNew(II, NameLoc, nullptr, SourceLocation(), nullptr, 0,
+            Attrs.addNew(II, NameLoc, AttributeScopeInfo(), nullptr, 0,
                          ParsedAttr::Form::Microsoft());
           }
         }
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 329572047da0..1ea0cf52933f 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -1234,8 +1234,8 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
       if (Tok.is(tok::kw___noinline__)) {
         IdentifierInfo *AttrName = Tok.getIdentifierInfo();
         SourceLocation AttrNameLoc = ConsumeToken();
-        Attributes.addNew(AttrName, AttrNameLoc, /*ScopeName=*/nullptr,
-                          AttrNameLoc, /*ArgsUnion=*/nullptr,
+        Attributes.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(),
+                          /*ArgsUnion=*/nullptr,
                           /*numArgs=*/0, tok::kw___noinline__);
       } else if (Tok.is(tok::kw___attribute))
         ParseGNUAttributes(Attributes, /*LatePArsedAttrList=*/nullptr, &D);
diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp
index 53d46465e336..e6caa81b309c 100644
--- a/clang/lib/Parse/ParseHLSL.cpp
+++ b/clang/lib/Parse/ParseHLSL.cpp
@@ -296,6 +296,6 @@ void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs,
     break;
   }
 
-  Attrs.addNew(II, Loc, nullptr, SourceLocation(), ArgExprs.data(),
-               ArgExprs.size(), ParsedAttr::Form::HLSLAnnotation());
+  Attrs.addNew(II, Loc, AttributeScopeInfo(), ArgExprs.data(), ArgExprs.size(),
+               ParsedAttr::Form::HLSLAnnotation());
 }
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp
index 8ef16a4d3808..291c70e7bad4 100644
--- a/clang/lib/Parse/ParseObjc.cpp
+++ b/clang/lib/Parse/ParseObjc.cpp
@@ -370,7 +370,7 @@ static void addContextSensitiveTypeNullability(Parser &P,
   // Create the attribute.
   auto getNullabilityAttr = [&](AttributePool &Pool) -> ParsedAttr * {
     return Pool.create(P.getNullabilityKeyword(nullability),
-                       SourceRange(nullabilityLoc), nullptr, SourceLocation(),
+                       SourceRange(nullabilityLoc), AttributeScopeInfo(),
                        nullptr, 0, ParsedAttr::Form::ContextSensitiveKeyword());
   };
 
diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp
index 6341e565b504..98933811265e 100644
--- a/clang/lib/Parse/ParsePragma.cpp
+++ b/clang/lib/Parse/ParsePragma.cpp
@@ -1926,7 +1926,7 @@ void Parser::HandlePragmaAttribute() {
       SourceLocation AttrNameLoc = ConsumeToken();
 
       if (Tok.isNot(tok::l_paren))
-        Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+        Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                      ParsedAttr::Form::GNU());
       else
         ParseGNUAttributeArgs(AttrName, AttrNameLoc, Attrs, /*EndLoc=*/nullptr,
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index c0c9bbc2e15c..bc40b726bf41 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -2345,8 +2345,8 @@ StmtResult Parser::ParsePragmaLoopHint(StmtVector &Stmts,
     ArgsUnion ArgHints[] = {Hint.PragmaNameLoc, Hint.OptionLoc, Hint.StateLoc,
                             ArgsUnion(Hint.ValueExpr)};
     TempAttrs.addNew(Hint.PragmaNameLoc->getIdentifierInfo(), Hint.Range,
-                     /*scopeName=*/nullptr, Hint.PragmaNameLoc->getLoc(),
-                     ArgHints, /*numArgs=*/4, ParsedAttr::Form::Pragma());
+                     AttributeScopeInfo(), ArgHints, /*numArgs=*/4,
+                     ParsedAttr::Form::Pragma());
   }
 
   // Get the next statement.
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index def909fc2478..f21cbbbdb44e 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -303,10 +303,9 @@ static void ProcessAPINotes(Sema &S, Decl *D,
           AttributeFactory AF{};
           AttributePool AP{AF};
           auto &C = S.getASTContext();
-          ParsedAttr *SNA =
-              AP.create(&C.Idents.get("swift_name"), SourceRange(), nullptr,
-                        SourceLocation(), nullptr, nullptr, nullptr,
-                        ParsedAttr::Form::GNU());
+          ParsedAttr *SNA = AP.create(
+              &C.Idents.get("swift_name"), SourceRange(), AttributeScopeInfo(),
+              nullptr, nullptr, nullptr, ParsedAttr::Form::GNU());
 
           if (!S.Swift().DiagnoseName(D, Info.SwiftName, D->getLocation(), *SNA,
                                       /*IsAsync=*/false))
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 1aeae41042a1..9c985e6bd5e0 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1986,14 +1986,13 @@ bool Sema::CheckAttrNoArgs(const ParsedAttr &Attrs) {
 bool Sema::CheckAttrTarget(const ParsedAttr &AL) {
   // Check whether the attribute is valid on the current target.
   if (!AL.existsInTarget(Context.getTargetInfo())) {
-    Diag(AL.getLoc(), AL.isRegularKeywordAttribute()
-                          ? diag::err_keyword_not_supported_on_target
-                          : diag::warn_unknown_attribute_ignored)
-        << AL << AL.getRange();
+    if (AL.isRegularKeywordAttribute())
+      Diag(AL.getLoc(), diag::err_keyword_not_supported_on_target);
+    else
+      DiagnoseUnknownAttribute(AL);
     AL.setInvalid();
     return true;
   }
-
   return false;
 }
 
@@ -7956,8 +7955,7 @@ static void checkUnusedDeclAttributes(Sema &S, const ParsedAttributesView &A) {
       continue;
 
     if (AL.getKind() == ParsedAttr::UnknownAttribute) {
-      S.Diag(AL.getLoc(), diag::warn_unknown_attribute_ignored)
-          << AL << AL.getRange();
+      S.DiagnoseUnknownAttribute(AL);
     } else {
       S.Diag(AL.getLoc(), diag::warn_attribute_not_on_decl) << AL
                                                             << AL.getRange();
@@ -7975,15 +7973,45 @@ void Sema::checkUnusedDeclAttributes(Declarator &D) {
 
 void Sema::DiagnoseUnknownAttribute(const ParsedAttr &AL) {
   std::string NormalizedFullName = '\'' + AL.getNormalizedFullName() + '\'';
-  if (auto CorrectedFullName =
-          AL.getCorrectedFullName(Context.getTargetInfo(), getLangOpts())) {
-    Diag(AL.getNormalizedRange().getBegin(),
-         diag::warn_unknown_attribute_ignored_suggestion)
-        << NormalizedFullName << *CorrectedFullName << AL.getNormalizedRange();
+  SourceRange NR = AL.getNormalizedRange();
+
+  StringRef ScopeName = AL.getNormalizedScopeName();
+  std::optional<StringRef> CorrectedScopeName =
+      AL.tryGetCorrectedScopeName(ScopeName);
+  if (CorrectedScopeName) {
+    ScopeName = *CorrectedScopeName;
+  }
+
+  StringRef AttrName = AL.getNormalizedAttrName(ScopeName);
+  std::optional<StringRef> CorrectedAttrName = AL.tryGetCorrectedAttrName(
+      ScopeName, AttrName, Context.getTargetInfo(), getLangOpts());
+  if (CorrectedAttrName) {
+    AttrName = *CorrectedAttrName;
+  }
+
+  if (CorrectedScopeName || CorrectedAttrName) {
+    std::string CorrectedFullName =
+        AL.getNormalizedFullName(ScopeName, AttrName);
+    SemaDiagnosticBuilder D =
+        Diag(CorrectedScopeName ? NR.getBegin() : AL.getRange().getBegin(),
+             diag::warn_unknown_attribute_ignored_suggestion);
+
+    D << NormalizedFullName << CorrectedFullName;
+
+    if (AL.isExplicitScope()) {
+      D << FixItHint::CreateReplacement(NR, CorrectedFullName) << NR;
+    } else {
+      if (CorrectedScopeName) {
+        D << FixItHint::CreateReplacement(SourceRange(AL.getScopeLoc()),
+                                          ScopeName);
+      }
+      if (CorrectedAttrName) {
+        D << FixItHint::CreateReplacement(AL.getRange(), AttrName);
+      }
+    }
   } else {
-    Diag(AL.getNormalizedRange().getBegin(),
-         diag::warn_unknown_attribute_ignored)
-        << NormalizedFullName << AL.getNormalizedRange();
+    Diag(NR.getBegin(), diag::warn_unknown_attribute_ignored)
+        << NormalizedFullName << NR;
   }
 }
 
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 6f62c53aaf04..16645ecf411e 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -2865,8 +2865,7 @@ BaseResult Sema::ActOnBaseSpecifier(Decl *classdecl, SourceRange SpecifierRange,
     if (AL.isInvalid() || AL.getKind() == ParsedAttr::IgnoredAttribute)
       continue;
     if (AL.getKind() == ParsedAttr::UnknownAttribute)
-      Diag(AL.getLoc(), diag::warn_unknown_attribute_ignored)
-          << AL << AL.getRange();
+      DiagnoseUnknownAttribute(AL);
     else
       Diag(AL.getLoc(), diag::err_base_specifier_attribute)
           << AL << AL.isRegularKeywordAttribute() << AL.getRange();
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index b78080c99176..857d46af9ada 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -672,12 +672,14 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
       !(A.existsInTarget(S.Context.getTargetInfo()) ||
         (S.Context.getLangOpts().SYCLIsDevice && Aux &&
          A.existsInTarget(*Aux)))) {
-    S.Diag(A.getLoc(), A.isRegularKeywordAttribute()
-                           ? (unsigned)diag::err_keyword_not_supported_on_target
-                       : A.isDeclspecAttribute()
-                           ? (unsigned)diag::warn_unhandled_ms_attribute_ignored
-                           : (unsigned)diag::warn_unknown_attribute_ignored)
-        << A << A.getRange();
+    if (A.isRegularKeywordAttribute() || A.isDeclspecAttribute()) {
+      S.Diag(A.getLoc(), A.isRegularKeywordAttribute()
+                             ? diag::err_keyword_not_supported_on_target
+                             : diag::warn_unhandled_ms_attribute_ignored)
+          << A << A.getRange();
+    } else {
+      S.DiagnoseUnknownAttribute(A);
+    }
     return nullptr;
   }
 
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index a0cd2d161524..785d7b89e778 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -4552,7 +4552,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
                                           false /*IsRegularKeywordAttribute*/);
       ParsedAttr *nullabilityAttr = Pool.create(
           S.getNullabilityKeyword(*inferNullability), SourceRange(pointerLoc),
-          nullptr, SourceLocation(), nullptr, 0, form);
+          AttributeScopeInfo(), nullptr, 0, form);
 
       attrs.addAtEnd(nullabilityAttr);
 
@@ -5735,10 +5735,10 @@ static void transferARCOwnershipToDeclaratorChunk(TypeProcessingState &state,
 
   // If there wasn't one, add one (with an invalid source location
   // so that we don't make an AttributedType for it).
-  ParsedAttr *attr = D.getAttributePool().create(
-      &S.Context.Idents.get("objc_ownership"), SourceLocation(),
-      /*scope*/ nullptr, SourceLocation(),
-      /*args*/ &Args, 1, ParsedAttr::Form::GNU());
+  ParsedAttr *attr =
+      D.getAttributePool().create(&S.Context.Idents.get("objc_ownership"),
+                                  SourceLocation(), AttributeScopeInfo(),
+                                  /*args*/ &Args, 1, ParsedAttr::Form::GNU());
   chunk.getAttrs().addAtEnd(attr);
   // TODO: mark whether we did this inference?
 }
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 8dafefb9696b..a1368a48351c 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -3198,8 +3198,8 @@ Attr *ASTRecordReader::readAttr() {
                     SpellingIndex == AlignedAttr::Keyword_alignas);
   bool IsRegularKeywordAttribute = Record.readBool();
 
-  AttributeCommonInfo Info(AttrName, ScopeName, AttrRange, ScopeLoc,
-                           AttributeCommonInfo::Kind(ParsedKind),
+  AttributeCommonInfo Info(AttrName, AttributeScopeInfo(ScopeName, ScopeLoc),
+                           AttrRange, AttributeCommonInfo::Kind(ParsedKind),
                            {AttributeCommonInfo::Syntax(Syntax), SpellingIndex,
                             IsAlignas, IsRegularKeywordAttribute});
 
diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
index 873e4c0edeac..3670f9430ed4 100644
--- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
+++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
@@ -45,7 +45,7 @@ import x;
 import x [[]];
 import x [[foo]]; // expected-warning {{unknown attribute 'foo' ignored}}
 import x [[noreturn]]; // expected-error {{'noreturn' attribute cannot be applied to a module import}}
-import x [[blarg::noreturn]]; // expected-warning {{unknown attribute 'noreturn' ignored}}
+import x [[blarg::noreturn]]; // expected-warning {{unknown attribute 'blarg::noreturn' ignored}}
 
 import x.y;
 import x.; // expected-error {{expected a module name after 'import'}}
diff --git a/clang/test/FixIt/fixit-unknown-attributes.cpp b/clang/test/FixIt/fixit-unknown-attributes.cpp
new file mode 100644
index 000000000000..7dff510f5ddf
--- /dev/null
+++ b/clang/test/FixIt/fixit-unknown-attributes.cpp
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -Wunknown-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -Wunknown-attributes -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+
+[[gmu::deprected]] // expected-warning {{unknown attribute 'gmu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f1(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:3-[[@LINE-4]]:17}:"gnu::deprecated"
+
+[[gmu::deprecated]] // expected-warning {{unknown attribute 'gmu::deprecated' ignored; did you mean 'gnu::deprecated'?}}
+int f2(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:3-[[@LINE-4]]:18}:"gnu::deprecated"
+
+[[gnu::deprected]] // expected-warning {{unknown attribute 'gnu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f3(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:3-[[@LINE-4]]:17}:"gnu::deprecated"
+
+[[deprected]] // expected-warning {{unknown attribute 'deprected' ignored; did you mean 'deprecated'?}}
+int f4(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:3-[[@LINE-4]]:12}:"deprecated"
+
+[[using gmu : deprected]] // expected-warning {{unknown attribute 'gmu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f5(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:9-[[@LINE-4]]:12}:"gnu"
+// CHECK: fix-it:"{{.*}}":{[[@LINE-5]]:15-[[@LINE-5]]:24}:"deprecated"
+
+[[using gmu : deprecated]] // expected-warning {{unknown attribute 'gmu::deprecated' ignored; did you mean 'gnu::deprecated'?}}
+int f6(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:9-[[@LINE-4]]:12}:"gnu"
+
+[[using gnu : deprected]] // expected-warning {{unknown attribute 'gnu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f7(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:15-[[@LINE-4]]:24}:"deprecated"
+
+[[using gnu : deprecated, noretyrn]] // expected-warning {{unknown attribute 'gnu::noretyrn' ignored; did you mean 'gnu::noreturn'?}}
+void f8(void) {
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:27-[[@LINE-3]]:35}:"noreturn"
+
+[[using gmu : deprected, noretyrn]] // expected-warning {{unknown attribute 'gmu::deprected' ignored; did you mean 'gnu::deprecated'?}} \
+                                    // expected-warning {{unknown attribute 'gmu::noretyrn' ignored; did you mean 'gnu::noreturn'?}}
+void f9(void) {
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:9-[[@LINE-4]]:12}:"gnu"
+// CHECK: fix-it:"{{.*}}":{[[@LINE-5]]:15-[[@LINE-5]]:24}:"deprecated"
+
+// CHECK: fix-it:"{{.*}}":{[[@LINE-7]]:9-[[@LINE-7]]:12}:"gnu"
+// CHECK: fix-it:"{{.*}}":{[[@LINE-8]]:26-[[@LINE-8]]:34}:"noreturn"
+
+__attribute__((cont, deprected)) // expected-warning {{unknown attribute 'cont' ignored; did you mean 'const'?}} \
+                                 // expected-warning {{unknown attribute 'deprected' ignored; did you mean 'deprecated'?}}
+int f10(int) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-5]]:16-[[@LINE-5]]:20}:"const"
+// CHECK: fix-it:"{{.*}}":{[[@LINE-6]]:22-[[@LINE-6]]:31}:"deprecated"
+
+[[using gnu: noretyrn, address_spaci(0)]] // expected-warning {{unknown attribute 'gnu::noretyrn' ignored; did you mean 'gnu::noreturn'?}} \
+                                          // expected-warning {{unknown attribute 'gnu::address_spaci' ignored}}
+void f11(void) {
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:14-[[@LINE-4]]:22}:"noreturn"
diff --git a/clang/test/Parser/cxx11-base-spec-attributes.cpp b/clang/test/Parser/cxx11-base-spec-attributes.cpp
index 7338c5116c16..6f2f54ead62b 100644
--- a/clang/test/Parser/cxx11-base-spec-attributes.cpp
+++ b/clang/test/Parser/cxx11-base-spec-attributes.cpp
@@ -7,4 +7,4 @@ struct D : [[]] public virtual A {};
 struct E : public [[]] virtual A {}; // expected-error {{an attribute list cannot appear here}}
 struct F : virtual [[]] public A {}; // expected-error {{an attribute list cannot appear here}}
 struct G : [[noreturn]] A {}; // expected-error {{'noreturn' attribute cannot be applied to a base specifier}}
-struct H : [[unknown::foobar]] A {}; // expected-warning {{unknown attribute 'foobar' ignored}}
+struct H : [[unknown::foobar]] A {}; // expected-warning {{unknown attribute 'unknown::foobar' ignored}}
diff --git a/clang/test/Parser/objcxx11-attributes.mm b/clang/test/Parser/objcxx11-attributes.mm
index d7ba609ebd74..88fa3103593e 100644
--- a/clang/test/Parser/objcxx11-attributes.mm
+++ b/clang/test/Parser/objcxx11-attributes.mm
@@ -57,7 +57,7 @@ void f(X *noreturn) {
 
 template<typename...Ts> void f(Ts ...x) {
   [[test::foo(bar, baz)...]]; // expected-error {{attribute 'foo' cannot be used as an attribute pack}} \
-  // expected-warning {{unknown attribute 'foo' ignored}}
+  // expected-warning {{unknown attribute 'test::foo' ignored}}
 
   [[used(x)...]]; // expected-error {{attribute 'used' cannot be used as an attribute pack}} \
   // expected-warning {{unknown attribute 'used' ignored}}
diff --git a/clang/test/Sema/unknown-attributes.c b/clang/test/Sema/unknown-attributes.c
index a701650c9e05..4711c9fa667b 100644
--- a/clang/test/Sema/unknown-attributes.c
+++ b/clang/test/Sema/unknown-attributes.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -Wunknown-attributes -fsyntax-only -verify %s
-// RUN: %clang_cc1 -x c++ -Wunknown-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -Wunknown-attributes -fsyntax-only -verify=expected,c %s
+// RUN: %clang_cc1 -x c++ -Wunknown-attributes -fsyntax-only -verify=expected,cxx %s
 
 [[gmu::deprected]] // expected-warning {{unknown attribute 'gmu::deprected' ignored; did you mean 'gnu::deprecated'?}}
 int f1(void) {
@@ -20,3 +20,10 @@ int f3(void) {
 int f4(void) {
   return 0;
 }
+
+[[using gnu : deprected]] // c-error {{expected ','}} \
+                          // c-warning {{unknown attribute 'using' ignored}} \
+                          // cxx-warning {{unknown attribute 'gnu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f5(void) {
+  return 0;
+}
diff --git a/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp b/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp
index 00fa5bd7336b..acd9846bb20f 100644
--- a/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp
+++ b/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp
@@ -11,7 +11,7 @@ __attribute__((no_caller_saved_registers(999))) void bar(int *) {} // expected-w
 
 __attribute__((no_caller_saved_registers)) void foo(int *){} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
 
-[[gnu::no_caller_saved_registers]] void foo2(int *) {} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+[[gnu::no_caller_saved_registers]] void foo2(int *) {} // expected-warning {{unknown attribute 'gnu::no_caller_saved_registers' ignored}}
 
 typedef __attribute__((no_caller_saved_registers)) void (*foo3)(int *); // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
 

From 951ea8b681451ff2db8b895f1dcfe0fbc91d939a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Sat, 14 Jun 2025 18:20:47 -0700
Subject: [PATCH 0389/1322] [mlir][nvvm][NFC] Fix typo in TargetAttr (#144159)

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 026c1fae0eb8..2dd7ac29cfed 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -3835,7 +3835,7 @@ def NVVM_DotAccumulate2WayOp : NVVM_Op<"dot.accumulate.2way"> {
 // NVVM target attribute.
 //===----------------------------------------------------------------------===//
 
-def NVVM_TargettAttr : NVVM_Attr<"NVVMTarget", "target", 
+def NVVM_TargetAttr : NVVM_Attr<"NVVMTarget", "target", 
   [DeclareAttrInterfaceMethods<GPUTargetAttrVerifyInterface>]> {
   let description = [{
     GPU target attribute for controlling compilation of NVIDIA targets. All

From 4ed10db85919d3d87bf0b3353340b58354a75994 Mon Sep 17 00:00:00 2001
From: Sam James <sam@gentoo.org>
Date: Sat, 14 Jun 2025 14:07:14 +0100
Subject: [PATCH 0390/1322] [clang][cmake] Don't pass -fno-strict-aliasing for
 GCC

This was added a long time ago..
* to the Makefiles in 40fee6313df688d43d1f8bbe85bc35161689afca;
* first to CMake in b3ce035c7155644d5bced46c45ae5ac865b7aedc;
* then moved to only apply when building Clang with GCC in
  c5635a6af7c643169f81145bfae8c895f2207792.

This shouldn't be needed these days. If an issue does arise, it really
ought to be documented better and the cause will certainly be different
than it was back then.

The two GCC bugs cited in 40fee6313df688d43d1f8bbe85bc35161689afca were:
* https://gcc.gnu.org/PR41874
* https://gcc.gnu.org/PR41838
and both are long-fixed. Not only that, if those issues did come up again,
we'd be better off doing -Wno-strict-aliasing where appropriate if there
weren't a real code issue or some suppression that was tighter in scope
wasn't appropriate.
---
 clang/CMakeLists.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index ab2ac9bc6b9a..94607a8e8473 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -345,9 +345,6 @@ configure_file(
 # Add appropriate flags for GCC
 if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-common -Woverloaded-virtual")
-  if (NOT "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-strict-aliasing")
-  endif ()
 
   # Enable -pedantic for Clang even if it's not enabled for LLVM.
   if (NOT LLVM_ENABLE_PEDANTIC)

From 24c8d900c47edeefb85643a06bc32235d9f42ea3 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Sun, 15 Jun 2025 11:38:04 +0800
Subject: [PATCH 0391/1322] [RISCV] Remove B and Zbc extension from Andes
 series cpus. (#144022)

The Andes CPU is configurable with optional extensions. The minimal
required extension set does not include `B` and `Zbc` extensions. So we
decided to remove them.
---
 .../Driver/print-enabled-extensions/riscv-andes-a25.c     | 7 +------
 .../Driver/print-enabled-extensions/riscv-andes-a45.c     | 6 +-----
 .../Driver/print-enabled-extensions/riscv-andes-ax25.c    | 7 +------
 .../Driver/print-enabled-extensions/riscv-andes-ax45.c    | 6 +-----
 .../Driver/print-enabled-extensions/riscv-andes-n45.c     | 6 +-----
 .../Driver/print-enabled-extensions/riscv-andes-nx45.c    | 6 +-----
 llvm/lib/Target/RISCV/RISCVProcessors.td                  | 8 --------
 llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s              | 2 +-
 8 files changed, 7 insertions(+), 41 deletions(-)

diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
index d8b3848d8452..cfb4d0ed58d1 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -19,12 +18,8 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
index a0a1c3591140..3c3c554dffc5 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -19,11 +18,8 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
index 3f933ecd8ac8..70100a0a8df1 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,12 +17,8 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
index 6460d701411b..d2b1a32e321e 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,11 +17,8 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
index 4d9c514b756e..1a2c30bfc7a2 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -19,11 +18,8 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
index 5eaada3f9e16..50c38da3bd03 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,11 +17,8 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 32f4ab607a34..d7e6c71ea062 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -703,8 +703,6 @@ def ANDES_A25 : RISCVProcessorModel<"andes-a25",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
-                                     FeatureStdExtB,
-                                     FeatureStdExtZbc,
                                      FeatureVendorXAndesPerf]>;
 
 def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
@@ -718,8 +716,6 @@ def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
-                                      FeatureStdExtB,
-                                      FeatureStdExtZbc,
                                       FeatureVendorXAndesPerf]>;
 
 defvar Andes45TuneFeatures = [TuneAndes45,
@@ -741,7 +737,6 @@ def ANDES_N45 : RISCVProcessorModel<"andes-n45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
-                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -756,7 +751,6 @@ def ANDES_NX45 : RISCVProcessorModel<"andes-nx45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
-                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
 
@@ -771,7 +765,6 @@ def ANDES_A45 : RISCVProcessorModel<"andes-a45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
-                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -786,6 +779,5 @@ def ANDES_AX45 : RISCVProcessorModel<"andes-ax45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
-                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
diff --git a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
index f6dc6eef3f0f..d90dce8c5c3f 100644
--- a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
+++ b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+zbc -timeline -iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+b,+zbc -timeline -iterations=1 < %s | FileCheck %s
 
 # Two ALUs without dependency can be dispatched in the same cycle.
 add a0, a0, a0

From a0c00ccd5ff180c721def8001c870338d5de319e Mon Sep 17 00:00:00 2001
From: Hristo Hristov <hghristov.rmm@gmail.com>
Date: Sun, 15 Jun 2025 07:45:48 +0300
Subject: [PATCH 0392/1322] [libc++] P2944R3: Constrained comparisons - update
 `reference_wrapper` implementation (#139368)

Updates the implementation `std::reference_wrapper` -
[P2944R3](https://wg21.link/P2944R3) as discussed in
https://github.com/llvm/llvm-project/pull/117664#discussion_r1857826166
This PR also refactors the tests in preparation to implements the
constrained comparisons for `optional`, `variant` etc.

- Moves the test helpers (concepts and types) for testing constrained
comparisons to `test_comparisons.h`.
- Updates the `std::reference_wrapper` implementation to use the concept
`__core_convertible_to<bool>` as per comments in #135759.

Closes #138233

# References:
- [refwrap.comparisons](https://wg21.link/refwrap.comparisons)

---------

Co-authored-by: Hristo Hristov <zingam@outlook.com>
Co-authored-by: Nikolas Klauser <nikolasklauser@berlin.de>
---
 libcxx/docs/Status/Cxx2cPapers.csv            |  2 +-
 .../include/__functional/reference_wrapper.h  |  8 ++--
 .../array/compare.three_way.pass.cpp          |  1 -
 ...mpare.three_way.refwrap.const_ref.pass.cpp | 13 +++----
 ...compare.three_way.refwrap.refwrap.pass.cpp | 14 +++----
 ...e.three_way.refwrap.refwrap_const.pass.cpp | 17 ++++-----
 .../equal.refwrap.const_ref.pass.cpp          | 11 +++---
 .../equal.refwrap.refwrap.pass.cpp            |  9 ++---
 .../equal.refwrap.refwrap_const.pass.cpp      | 13 +++----
 .../refwrap.comparissons/helper_concepts.h    | 38 -------------------
 .../refwrap.comparissons/helper_types.h       | 30 ---------------
 libcxx/test/support/test_comparisons.h        | 25 +++++++++++-
 .../test/support/test_container_comparisons.h |  4 --
 13 files changed, 59 insertions(+), 126 deletions(-)
 delete mode 100644 libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h
 delete mode 100644 libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h

diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index 3809446a5789..8a0417e120d7 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -59,7 +59,7 @@
 "`P2248R8 <https://wg21.link/P2248R8>`__","Enabling list-initialization for algorithms","2024-03 (Tokyo)","","",""
 "`P2810R4 <https://wg21.link/P2810R4>`__","``is_debugger_present`` ``is_replaceable``","2024-03 (Tokyo)","","",""
 "`P1068R11 <https://wg21.link/P1068R11>`__","Vector API for random number generation","2024-03 (Tokyo)","","",""
-"`P2944R3 <https://wg21.link/P2944R3>`__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Partial|","","Implemented changes to ``reference_wrapper`` and ``pair``"
+"`P2944R3 <https://wg21.link/P2944R3>`__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Partial|","","The changes to ``optional``, ``tuple`` and ``variant`` are not yet implemented"
 "`P2642R6 <https://wg21.link/P2642R6>`__","Padded ``mdspan`` layouts","2024-03 (Tokyo)","","",""
 "`P3029R1 <https://wg21.link/P3029R1>`__","Better ``mdspan``'s CTAD","2024-03 (Tokyo)","|Complete|","19",""
 "","","","","",""
diff --git a/libcxx/include/__functional/reference_wrapper.h b/libcxx/include/__functional/reference_wrapper.h
index b409ad7511f6..c46203a4ca9a 100644
--- a/libcxx/include/__functional/reference_wrapper.h
+++ b/libcxx/include/__functional/reference_wrapper.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FUNCTIONAL_REFERENCE_WRAPPER_H
 
 #include <__compare/synth_three_way.h>
-#include <__concepts/boolean_testable.h>
 #include <__config>
 #include <__functional/weak_result_type.h>
 #include <__memory/addressof.h>
@@ -19,6 +18,7 @@
 #include <__type_traits/enable_if.h>
 #include <__type_traits/invoke.h>
 #include <__type_traits/is_const.h>
+#include <__type_traits/is_core_convertible.h>
 #include <__type_traits/remove_cvref.h>
 #include <__type_traits/void_t.h>
 #include <__utility/declval.h>
@@ -75,7 +75,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(reference_wrapper __x, reference_wrapper __y)
     requires requires {
-      { __x.get() == __y.get() } -> __boolean_testable;
+      { __x.get() == __y.get() } -> __core_convertible_to<bool>;
     }
   {
     return __x.get() == __y.get();
@@ -83,7 +83,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(reference_wrapper __x, const _Tp& __y)
     requires requires {
-      { __x.get() == __y } -> __boolean_testable;
+      { __x.get() == __y } -> __core_convertible_to<bool>;
     }
   {
     return __x.get() == __y;
@@ -91,7 +91,7 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(reference_wrapper __x, reference_wrapper<const _Tp> __y)
     requires(!is_const_v<_Tp>) && requires {
-      { __x.get() == __y.get() } -> __boolean_testable;
+      { __x.get() == __y.get() } -> __core_convertible_to<bool>;
     }
   {
     return __x.get() == __y.get();
diff --git a/libcxx/test/std/containers/sequences/array/compare.three_way.pass.cpp b/libcxx/test/std/containers/sequences/array/compare.three_way.pass.cpp
index 01be1db73041..671747f89a82 100644
--- a/libcxx/test/std/containers/sequences/array/compare.three_way.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/compare.three_way.pass.cpp
@@ -26,7 +26,6 @@ constexpr std::size_t N{1};
 static_assert(std::three_way_comparable<std::array<int, N>>);
 
 // Thanks to SFINAE, the following is not a compiler error but returns `false`
-struct NonComparable {};
 static_assert(!std::three_way_comparable<std::array<NonComparable, N>>);
 
 // Implementation detail of `test_sequence_container_array_spaceship`
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp
index 85106c18ec35..4a2ae963e3bd 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -23,16 +23,13 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(HasSpaceshipOperatorWithInt<std::reference_wrapper<StrongOrder>>);
-static_assert(HasSpaceshipOperatorWithInt<std::reference_wrapper<WeakOrder>>);
-static_assert(HasSpaceshipOperatorWithInt<std::reference_wrapper<PartialOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<StrongOrder>, int>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<WeakOrder>, int>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<PartialOrder>, int>);
 
-static_assert(!HasSpaceshipOperatorWithInt<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<NonComparable>, int>);
 
 // Test comparisons.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp
index 794fac00de8a..3d72459bc5a1 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -22,17 +22,13 @@
 
 #include "test_comparisons.h"
 #include "test_macros.h"
-
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(std::three_way_comparable<std::reference_wrapper<StrongOrder>>);
-static_assert(std::three_way_comparable<std::reference_wrapper<WeakOrder>>);
-static_assert(std::three_way_comparable<std::reference_wrapper<PartialOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<StrongOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<WeakOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<PartialOrder>>);
 
-static_assert(!std::three_way_comparable<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<NonComparable>>);
 
 // Test comparisons.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp
index 9b1302affa85..1ae22b4ac58e 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -23,18 +23,15 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(std::three_way_comparable_with<std::reference_wrapper<StrongOrder>, const StrongOrder>);
-static_assert(std::three_way_comparable_with<std::reference_wrapper<WeakOrder>, const WeakOrder>);
-static_assert(std::three_way_comparable_with<std::reference_wrapper<PartialOrder>, const PartialOrder>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<StrongOrder>, std::reference_wrapper<const StrongOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<WeakOrder>, std::reference_wrapper<const WeakOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<PartialOrder>, std::reference_wrapper<const PartialOrder>>);
 
-static_assert(!std::three_way_comparable_with<std::reference_wrapper<StrongOrder>, const NonComparable>);
-static_assert(!std::three_way_comparable_with<std::reference_wrapper<WeakOrder>, const NonComparable>);
-static_assert(!std::three_way_comparable_with<std::reference_wrapper<PartialOrder>, const NonComparable>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<StrongOrder>, std::reference_wrapper<const NonComparable>>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<WeakOrder>, std::reference_wrapper<const NonComparable>>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<PartialOrder>, std::reference_wrapper<const NonComparable>>);
 
 // Test comparisons.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp
index 465326818f17..316ff7c30331 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -23,14 +23,13 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(HasEqualityOperatorWithInt<std::reference_wrapper<EqualityComparable>>);
+static_assert(HasOperatorEqual<std::reference_wrapper<EqualityComparable>>);
+static_assert(HasOperatorEqual<std::reference_wrapper<EqualityComparable>, int>);
 
-static_assert(!HasEqualityOperatorWithInt<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorEqual<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorEqual<std::reference_wrapper<NonComparable>, int>);
 
 // Test equality.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp
index a50b530bbc6e..70e79d399861 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -22,14 +22,11 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(std::equality_comparable<std::reference_wrapper<EqualityComparable>>);
+static_assert(HasOperatorEqual<std::reference_wrapper<EqualityComparable>>);
 
-static_assert(!std::equality_comparable<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorEqual<std::reference_wrapper<NonComparable>>);
 
 // Test equality.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp
index 10f017742a87..c68ad5c4aa52 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -23,16 +23,13 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(std::equality_comparable_with<std::reference_wrapper<EqualityComparable>,
-                                            std::reference_wrapper<const EqualityComparable>>);
+static_assert(
+    HasOperatorEqual<std::reference_wrapper<EqualityComparable>, std::reference_wrapper<const EqualityComparable>>);
 
-static_assert(!std::equality_comparable_with<std::reference_wrapper<EqualityComparable>,
-                                             std::reference_wrapper<const NonComparable>>);
+static_assert(
+    !HasOperatorEqual<std::reference_wrapper<EqualityComparable>, std::reference_wrapper<const NonComparable>>);
 
 // Test equality.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h
deleted file mode 100644
index 2dbb304f8af6..000000000000
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_CONCEPTS_H
-#define TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_CONCEPTS_H
-
-#include <concepts>
-#include <utility>
-
-// Equality
-
-template <typename T>
-concept HasEqualityOperatorWithInt = requires(T t, int i) {
-  { t.get() == i } -> std::convertible_to<bool>;
-};
-
-// Spaceship
-
-template <class T>
-concept BooleanTestableImpl = std::convertible_to<T, bool>;
-
-template <class T>
-concept BooleanTestable = BooleanTestableImpl<T> && requires(T&& t) {
-  { !std::forward<T>(t) } -> BooleanTestableImpl;
-};
-
-template <typename T>
-concept HasSpaceshipOperatorWithInt = requires(T t, int i) {
-  { t < i } -> BooleanTestable;
-  { i < t } -> BooleanTestable;
-};
-
-#endif // TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_CONCEPTS_H
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h
deleted file mode 100644
index cf5e568dbf93..000000000000
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_TYPES_H
-#define TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_TYPES_H
-
-#include <concepts>
-
-struct EqualityComparable {
-  constexpr EqualityComparable(int value) : value_{value} {};
-
-  friend constexpr bool operator==(const EqualityComparable&, const EqualityComparable&) noexcept = default;
-
-  int value_;
-};
-
-static_assert(std::equality_comparable<EqualityComparable>);
-static_assert(EqualityComparable{94} == EqualityComparable{94});
-static_assert(EqualityComparable{94} != EqualityComparable{82});
-
-struct NonComparable {};
-
-static_assert(!std::three_way_comparable<NonComparable>);
-
-#endif // TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_TYPES_H
diff --git a/libcxx/test/support/test_comparisons.h b/libcxx/test/support/test_comparisons.h
index db6977a96a2f..d9729e0451b4 100644
--- a/libcxx/test/support/test_comparisons.h
+++ b/libcxx/test/support/test_comparisons.h
@@ -268,6 +268,29 @@ struct PartialOrder {
   }
 };
 
-#endif
+template <typename T1, typename T2 = T1>
+concept HasOperatorEqual = requires(T1 t1, T2 t2) { t1 == t2; };
+
+template <typename T1, typename T2 = T1>
+concept HasOperatorSpaceship = requires(T1 t1, T2 t2) { t1 <=> t2; };
+
+struct NonComparable {};
+static_assert(!std::equality_comparable<NonComparable>);
+static_assert(!HasOperatorEqual<NonComparable>);
+static_assert(!HasOperatorSpaceship<NonComparable>);
+
+class EqualityComparable {
+public:
+  constexpr EqualityComparable(int value) : value_{value} {};
+
+  friend constexpr bool operator==(const EqualityComparable&, const EqualityComparable&) noexcept = default;
+
+private:
+  int value_;
+};
+static_assert(std::equality_comparable<EqualityComparable>);
+static_assert(HasOperatorEqual<EqualityComparable>);
+
+#endif // TEST_STD_VER >= 20
 
 #endif // TEST_COMPARISONS_H
diff --git a/libcxx/test/support/test_container_comparisons.h b/libcxx/test/support/test_container_comparisons.h
index f7bf78e48a1f..53db5ba99ce4 100644
--- a/libcxx/test/support/test_container_comparisons.h
+++ b/libcxx/test/support/test_container_comparisons.h
@@ -88,7 +88,6 @@ constexpr bool test_sequence_container_spaceship() {
                                               std::weak_ordering>();
 
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
-  struct NonComparable {};
   static_assert(!std::three_way_comparable<Container<NonComparable>>);
 
   return true;
@@ -163,7 +162,6 @@ constexpr void test_sequence_container_adaptor_spaceship_with_type() {
 template <template <typename...> typename ContainerAdaptor, template <typename...> typename Container>
 constexpr bool test_sequence_container_adaptor_spaceship() {
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
-  struct NonComparable {};
   static_assert(!std::three_way_comparable<ContainerAdaptor<NonComparable>>);
 
   // The container should fulfill `std::three_way_comparable`
@@ -301,7 +299,6 @@ constexpr void test_ordered_map_container_spaceship_with_type(Compare comp) {
 template <template <typename...> typename Container>
 constexpr bool test_ordered_map_container_spaceship() {
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
-  struct NonComparable {};
   static_assert(!std::three_way_comparable<Container<int, NonComparable>>);
 
   // The container should fulfill `std::three_way_comparable`
@@ -444,7 +441,6 @@ constexpr void test_ordered_set_spaceship_with_type(Compare comp) {
 template <template <typename...> typename Container>
 constexpr bool test_ordered_set_container_spaceship() {
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
-  struct NonComparable {};
   static_assert(!std::three_way_comparable<Container<NonComparable>>);
 
   // The container should fulfill `std::three_way_comparable`

From c4ba734993ac7ca39cc101db62797aad3a2a265a Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 14 Jun 2025 23:23:42 -0700
Subject: [PATCH 0393/1322] [mlir] Compare std::optional<T> to values directly
 (NFC) (#144241)

This patch transforms:

  X && *X == Y

to:

  X == Y

where X is of std::optional<T>, and Y is of T or similar.
---
 mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp  | 2 +-
 mlir/lib/Dialect/Affine/IR/AffineOps.cpp              | 9 ++++-----
 mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp           | 6 ++----
 mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp   | 3 +--
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp               | 2 +-
 mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp | 2 +-
 mlir/lib/Dialect/Utils/StaticValueUtils.cpp           | 3 +--
 7 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp b/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp
index dab15d23f6e0..ac8ed4fdff7c 100644
--- a/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp
+++ b/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp
@@ -173,7 +173,7 @@ getTreePredicates(std::vector<PositionalPredicate> &predList, Value val,
 
       // Ignore the specified operand, usually because this position was
       // visited in an upward traversal via an iterative choice.
-      if (ignoreOperand && *ignoreOperand == operandIt.index())
+      if (ignoreOperand == operandIt.index())
         continue;
 
       Position *pos =
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 8a708eb29210..3d09c6a9b2c2 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -2367,7 +2367,7 @@ struct AffineForEmptyLoopFolder : public OpRewritePattern<AffineForOp> {
     if (forOp.getNumResults() == 0)
       return success();
     std::optional<uint64_t> tripCount = getTrivialConstantTripCount(forOp);
-    if (tripCount && *tripCount == 0) {
+    if (tripCount == 0) {
       // The initial values of the iteration arguments would be the op's
       // results.
       rewriter.replaceOp(forOp, forOp.getInits());
@@ -2447,7 +2447,7 @@ void AffineForOp::getSuccessorRegions(
 
   // From the loop body, if the trip count is one, we can only branch back to
   // the parent.
-  if (!point.isParent() && tripCount && *tripCount == 1) {
+  if (!point.isParent() && tripCount == 1) {
     regions.push_back(RegionSuccessor(getResults()));
     return;
   }
@@ -2460,8 +2460,7 @@ void AffineForOp::getSuccessorRegions(
 
 /// Returns true if the affine.for has zero iterations in trivial cases.
 static bool hasTrivialZeroTripCount(AffineForOp op) {
-  std::optional<uint64_t> tripCount = getTrivialConstantTripCount(op);
-  return tripCount && *tripCount == 0;
+  return getTrivialConstantTripCount(op) == 0;
 }
 
 LogicalResult AffineForOp::fold(FoldAdaptor adaptor,
@@ -4789,7 +4788,7 @@ struct DropUnitExtentBasis
          llvm::enumerate(delinearizeOp.getPaddedBasis())) {
       std::optional<int64_t> basisVal =
           basis ? getConstantIntValue(basis) : std::nullopt;
-      if (basisVal && *basisVal == 1)
+      if (basisVal == 1)
         replacements[index] = getZero();
       else
         newBasis.push_back(basis);
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 0d4ba3940c48..4aa1fe318efa 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -1015,8 +1015,7 @@ LogicalResult mlir::affine::loopUnrollByFactor(
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   if (unrollFactor == 1) {
-    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
-        failed(promoteIfSingleIteration(forOp)))
+    if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();
   }
@@ -1103,8 +1102,7 @@ LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   if (unrollJamFactor == 1) {
-    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
-        failed(promoteIfSingleIteration(forOp)))
+    if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();
   }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
index bd4ffabfbb92..5e6dde36d7f9 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
@@ -606,8 +606,7 @@ struct DropPadUnitDims : public OpRewritePattern<tensor::PadOp> {
     int64_t padRank = sourceShape.size();
 
     auto isStaticZero = [](OpFoldResult f) {
-      std::optional<int64_t> maybeInt = getConstantIntValue(f);
-      return maybeInt && *maybeInt == 0;
+      return getConstantIntValue(f) == 0;
     };
 
     llvm::SmallDenseSet<unsigned> unitDimsFilter(allowedUnitDims.begin(),
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index bae06c003fd9..2527d90cfa2e 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -688,7 +688,7 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
     //    tensors with "0" dimensions would never be constructed.
     int64_t shapeSize = shape[r];
     std::optional<int64_t> sizeCst = getConstantIntValue(size);
-    auto hasTileSizeOne = sizeCst && *sizeCst == 1;
+    auto hasTileSizeOne = sizeCst == 1;
     auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) &&
                          ((shapeSize % *sizeCst) == 0);
     if (!hasTileSizeOne && !dividesEvenly) {
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
index f5a58c58e05d..1e7bb046d375 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
@@ -737,7 +737,7 @@ static spirv::GlobalVariableOp getBuiltinVariable(Block &body,
             spirv::SPIRVDialect::getAttributeName(
                 spirv::Decoration::BuiltIn))) {
       auto varBuiltIn = spirv::symbolizeBuiltIn(builtinAttr.getValue());
-      if (varBuiltIn && *varBuiltIn == builtin) {
+      if (varBuiltIn == builtin) {
         return varOp;
       }
     }
diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
index 29f7bd6857c2..8e3f796af54d 100644
--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
@@ -142,8 +142,7 @@ getConstantIntValues(ArrayRef<OpFoldResult> ofrs) {
 }
 
 bool isConstantIntValue(OpFoldResult ofr, int64_t value) {
-  auto val = getConstantIntValue(ofr);
-  return val && *val == value;
+  return getConstantIntValue(ofr) == value;
 }
 
 bool areAllConstantIntValue(ArrayRef<OpFoldResult> ofrs, int64_t value) {

From 84ff1bda2977e580265997ad2d4c47b18cd3bf9f Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 14 Jun 2025 23:23:49 -0700
Subject: [PATCH 0394/1322] [RISCV] Use StringRef in a range-based for loop
 (NFC) (#144243)

When we iterate over std::vector<std::string>, we can directly assign
each element to StringRef.  We do not need to go through a separate
statement.
---
 llvm/lib/TargetParser/RISCVISAInfo.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index e76ddd4b648d..17c98332ab0a 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -449,8 +449,7 @@ RISCVISAInfo::parseFeatures(unsigned XLen,
   assert(XLen == 32 || XLen == 64);
   std::unique_ptr<RISCVISAInfo> ISAInfo(new RISCVISAInfo(XLen));
 
-  for (auto &Feature : Features) {
-    StringRef ExtName = Feature;
+  for (StringRef ExtName : Features) {
     assert(ExtName.size() > 1 && (ExtName[0] == '+' || ExtName[0] == '-'));
     bool Add = ExtName[0] == '+';
     ExtName = ExtName.drop_front(1); // Drop '+' or '-'

From 9e16792639242a86314e5d6531010953a0a96216 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Sun, 15 Jun 2025 02:35:20 -0400
Subject: [PATCH 0395/1322] [mlir][bzl] Add CAPIIndex rule. (#144248)

---
 .../bazel/llvm-project-overlay/mlir/BUILD.bazel | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index b62d5595fe94..e7398a696bea 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -535,6 +535,23 @@ mlir_c_api_cc_library(
     ],
 )
 
+mlir_c_api_cc_library(
+    name = "CAPIIndex",
+    srcs = [
+        "lib/CAPI/Dialect/Index.cpp",
+    ],
+    hdrs = [
+        "include/mlir-c/Dialect/Index.h",
+    ],
+    capi_deps = [
+        ":CAPIIR",
+    ],
+    includes = ["include"],
+    deps = [
+        ":IndexDialect",
+    ],
+)
+
 mlir_c_api_cc_library(
     name = "CAPILinalg",
     srcs = [

From 149cb5c43c3a75ecb827b8b7ae853250c3c09449 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sun, 15 Jun 2025 15:17:53 +0800
Subject: [PATCH 0396/1322] [ValueTracking] Infer `X | Y != 0` from `X != Y`
 (#117443)

Alive2: https://alive2.llvm.org/ce/z/cJ75Ya

Closes https://github.com/llvm/llvm-project/issues/117436.
---
 llvm/lib/Analysis/ValueTracking.cpp          | 4 ++++
 llvm/test/Transforms/InstCombine/icmp-dom.ll | 5 +----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index d8c1096049dc..99670b92187c 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3043,6 +3043,10 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
     // (X | (X != 0)) is non zero
     if (matchOpWithOpEqZero(I->getOperand(0), I->getOperand(1)))
       return true;
+    // X | Y != 0 if X != Y.
+    if (isKnownNonEqual(I->getOperand(0), I->getOperand(1), DemandedElts, Q,
+                        Depth))
+      return true;
     // X | Y != 0 if X != 0 or Y != 0.
     return isKnownNonZero(I->getOperand(1), DemandedElts, Q, Depth) ||
            isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth);
diff --git a/llvm/test/Transforms/InstCombine/icmp-dom.ll b/llvm/test/Transforms/InstCombine/icmp-dom.ll
index 6613bbeb8d6a..a72b5e0bbfa0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-dom.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-dom.ll
@@ -535,16 +535,13 @@ else:
   ret i1 %cmp1
 }
 
-; TODO: X != Y implies X | Y != 0
 define i1 @or_nonzero_from_nonequal(i8 %x, i8 %y) {
 ; CHECK-LABEL: @or_nonzero_from_nonequal(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[OR:%.*]] = or i8 [[X]], [[Y]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[OR]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ; CHECK:       if.else:
 ; CHECK-NEXT:    ret i1 false
 ;

From 30a41a642358d0f427c3cbc0299ea48fbc0cf79e Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234@gmail.com>
Date: Sun, 15 Jun 2025 03:32:34 -0400
Subject: [PATCH 0397/1322] [ValueTracking] Add subtraction support for
 setLimitsForBinOp (#143618)

We can determine the range from a subtraction if it has nsw or nuw.

https://alive2.llvm.org/ce/z/tXAKVV
---
 llvm/lib/Analysis/ValueTracking.cpp          | 36 ++++++++++++++++++--
 llvm/test/Transforms/InstCombine/div.ll      |  8 ++---
 llvm/test/Transforms/InstCombine/icmp-sub.ll |  3 +-
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 99670b92187c..e7a1f07c0270 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9580,15 +9580,45 @@ static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
   unsigned Width = Lower.getBitWidth();
   const APInt *C;
   switch (BO.getOpcode()) {
-  case Instruction::Add:
-    if (match(BO.getOperand(1), m_APInt(C)) && !C->isZero()) {
+  case Instruction::Sub:
+    if (match(BO.getOperand(0), m_APInt(C))) {
       bool HasNSW = IIQ.hasNoSignedWrap(&BO);
       bool HasNUW = IIQ.hasNoUnsignedWrap(&BO);
 
       // If the caller expects a signed compare, then try to use a signed range.
       // Otherwise if both no-wraps are set, use the unsigned range because it
       // is never larger than the signed range. Example:
-      // "add nuw nsw i8 X, -2" is unsigned [254,255] vs. signed [-128, 125].
+      // "sub nuw nsw i8 -2, x" is unsigned [0, 254] vs. signed [-128, 126].
+      // "sub nuw nsw i8 2, x" is unsigned [0, 2] vs. signed [-125, 127].
+      if (PreferSignedRange && HasNSW && HasNUW)
+        HasNUW = false;
+
+      if (HasNUW) {
+        // 'sub nuw c, x' produces [0, C].
+        Upper = *C + 1;
+      } else if (HasNSW) {
+        if (C->isNegative()) {
+          // 'sub nsw -C, x' produces [SINT_MIN, -C - SINT_MIN].
+          Lower = APInt::getSignedMinValue(Width);
+          Upper = *C - APInt::getSignedMaxValue(Width);
+        } else {
+          // Note that sub 0, INT_MIN is not NSW. It techically is a signed wrap
+          // 'sub nsw C, x' produces [C - SINT_MAX, SINT_MAX].
+          Lower = *C - APInt::getSignedMaxValue(Width);
+          Upper = APInt::getSignedMinValue(Width);
+        }
+      }
+    }
+    break;
+  case Instruction::Add:
+    if (match(BO.getOperand(1), m_APInt(C)) && !C->isZero()) {
+      bool HasNSW = IIQ.hasNoSignedWrap(&BO);
+      bool HasNUW = IIQ.hasNoUnsignedWrap(&BO);
+
+      // If the caller expects a signed compare, then try to use a signed
+      // range. Otherwise if both no-wraps are set, use the unsigned range
+      // because it is never larger than the signed range. Example: "add nuw
+      // nsw i8 X, -2" is unsigned [254,255] vs. signed [-128, 125].
       if (PreferSignedRange && HasNSW && HasNUW)
         HasNUW = false;
 
diff --git a/llvm/test/Transforms/InstCombine/div.ll b/llvm/test/Transforms/InstCombine/div.ll
index 7e93612150e8..f0fdc5f54366 100644
--- a/llvm/test/Transforms/InstCombine/div.ll
+++ b/llvm/test/Transforms/InstCombine/div.ll
@@ -494,9 +494,7 @@ define <2 x i8> @sdiv_exact_negated_dividend_constant_divisor_vec_splat(<2 x i8>
 
 define i8 @sdiv_negated_dividend_constant_divisor_smin(i8 %x) {
 ; CHECK-LABEL: @sdiv_negated_dividend_constant_divisor_smin(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i8 [[X:%.*]], -128
-; CHECK-NEXT:    [[D:%.*]] = zext i1 [[TMP1]] to i8
-; CHECK-NEXT:    ret i8 [[D]]
+; CHECK-NEXT:    ret i8 0
 ;
   %neg = sub nsw i8 0, %x
   %d = sdiv i8 %neg, -128
@@ -505,9 +503,7 @@ define i8 @sdiv_negated_dividend_constant_divisor_smin(i8 %x) {
 
 define <2 x i8> @sdiv_negated_dividend_constant_divisor_vec_splat_smin(<2 x i8> %x) {
 ; CHECK-LABEL: @sdiv_negated_dividend_constant_divisor_vec_splat_smin(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[X:%.*]], splat (i8 -128)
-; CHECK-NEXT:    [[D:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8>
-; CHECK-NEXT:    ret <2 x i8> [[D]]
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
 ;
   %neg = sub nsw <2 x i8> zeroinitializer, %x
   %d = sdiv <2 x i8> %neg, <i8 -128, i8 -128>
diff --git a/llvm/test/Transforms/InstCombine/icmp-sub.ll b/llvm/test/Transforms/InstCombine/icmp-sub.ll
index 4143902bc9c4..13ed7ba0c170 100644
--- a/llvm/test/Transforms/InstCombine/icmp-sub.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-sub.ll
@@ -290,8 +290,7 @@ define i1 @subC_nsw_ne(i32 %x) {
 ; CHECK-LABEL: @subC_nsw_ne(
 ; CHECK-NEXT:    [[SUBX:%.*]] = sub nsw i32 -2147483647, [[X:%.*]]
 ; CHECK-NEXT:    call void @use(i32 [[SUBX]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i32 [[X]], 2147483603
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %subx = sub nsw i32 -2147483647, %x
   call void @use(i32 %subx)

From 48e54f3a225062b5d229e6fd3b06140f76c0613b Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sun, 15 Jun 2025 08:51:59 +0100
Subject: [PATCH 0398/1322] [CostModel] Mark all TTIImpls as final. NFC
 (#143404)

In the AArch64 version this helps reduce the number of blr instruction
(indirect jumps) in from 325 to 87, and reduces the size of the object
file by 4%. It seems to help make the code more efficient even if it
doesn't greatly affect compile time.

The AMDGPU variants are already marked as final.
---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2 +-
 llvm/lib/Target/ARC/ARCTargetTransformInfo.h         | 2 +-
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h         | 2 +-
 llvm/lib/Target/BPF/BPFTargetTransformInfo.h         | 2 +-
 llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h | 2 +-
 llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h | 2 +-
 llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/Mips/MipsTargetTransformInfo.h       | 2 +-
 llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h | 2 +-
 llvm/lib/Target/VE/VETargetTransformInfo.h           | 2 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.h         | 2 +-
 llvm/lib/Target/XCore/XCoreTargetTransformInfo.h     | 2 +-
 16 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea..0184e748b3d8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -39,7 +39,7 @@ class Type;
 class Value;
 class VectorType;
 
-class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
+class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
   using BaseT = BasicTTIImplBase<AArch64TTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/ARC/ARCTargetTransformInfo.h b/llvm/lib/Target/ARC/ARCTargetTransformInfo.h
index bb7f2a0a459c..3d5ff6dc256d 100644
--- a/llvm/lib/Target/ARC/ARCTargetTransformInfo.h
+++ b/llvm/lib/Target/ARC/ARCTargetTransformInfo.h
@@ -26,7 +26,7 @@ class ARCSubtarget;
 class ARCTargetLowering;
 class ARCTargetMachine;
 
-class ARCTTIImpl : public BasicTTIImplBase<ARCTTIImpl> {
+class ARCTTIImpl final : public BasicTTIImplBase<ARCTTIImpl> {
   using BaseT = BasicTTIImplBase<ARCTTIImpl>;
   friend BaseT;
 
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 20a2c5951108..c1af4e3dc5da 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -54,7 +54,7 @@ namespace TPLoop {
 enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
 }
 
-class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
+class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
   using BaseT = BasicTTIImplBase<ARMTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
index e94497896f68..d7b2ceff105c 100644
--- a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
+++ b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
@@ -21,7 +21,7 @@
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 
 namespace llvm {
-class BPFTTIImpl : public BasicTTIImplBase<BPFTTIImpl> {
+class BPFTTIImpl final : public BasicTTIImplBase<BPFTTIImpl> {
   typedef BasicTTIImplBase<BPFTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
index 9f344d7d52ba..e2dd4354a816 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
@@ -17,7 +17,7 @@
 #include "llvm/IR/Function.h"
 
 namespace llvm {
-class DirectXTTIImpl : public BasicTTIImplBase<DirectXTTIImpl> {
+class DirectXTTIImpl final : public BasicTTIImplBase<DirectXTTIImpl> {
   using BaseT = BasicTTIImplBase<DirectXTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index d7509c3bb1d2..c03cad4713e4 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -30,7 +30,7 @@ class ScalarEvolution;
 class User;
 class Value;
 
-class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
+class HexagonTTIImpl final : public BasicTTIImplBase<HexagonTTIImpl> {
   using BaseT = BasicTTIImplBase<HexagonTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
index f17abf4c8af0..0342af65c1ef 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -25,7 +25,7 @@
 #include "llvm/Support/MathExtras.h"
 
 namespace llvm {
-class LanaiTTIImpl : public BasicTTIImplBase<LanaiTTIImpl> {
+class LanaiTTIImpl final : public BasicTTIImplBase<LanaiTTIImpl> {
   typedef BasicTTIImplBase<LanaiTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/Mips/MipsTargetTransformInfo.h b/llvm/lib/Target/Mips/MipsTargetTransformInfo.h
index 5e3884cd8016..8f8173915b2f 100644
--- a/llvm/lib/Target/Mips/MipsTargetTransformInfo.h
+++ b/llvm/lib/Target/Mips/MipsTargetTransformInfo.h
@@ -15,7 +15,7 @@
 
 namespace llvm {
 
-class MipsTTIImpl : public BasicTTIImplBase<MipsTTIImpl> {
+class MipsTTIImpl final : public BasicTTIImplBase<MipsTTIImpl> {
   using BaseT = BasicTTIImplBase<MipsTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 98aea4e535f0..aa7850acbd64 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -26,7 +26,7 @@
 
 namespace llvm {
 
-class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
+class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
   typedef BasicTTIImplBase<NVPTXTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 361b2ff223ea..8618f3064c18 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -24,7 +24,7 @@
 
 namespace llvm {
 
-class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
+class PPCTTIImpl final : public BasicTTIImplBase<PPCTTIImpl> {
   typedef BasicTTIImplBase<PPCTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 0a784461d67b..dd7e9f7709f8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -25,7 +25,7 @@
 
 namespace llvm {
 
-class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
+class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
   using BaseT = BasicTTIImplBase<RISCVTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
index 3f211b5a8b16..40e561ba3888 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
@@ -22,7 +22,7 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 
 namespace llvm {
-class SPIRVTTIImpl : public BasicTTIImplBase<SPIRVTTIImpl> {
+class SPIRVTTIImpl final : public BasicTTIImplBase<SPIRVTTIImpl> {
   using BaseT = BasicTTIImplBase<SPIRVTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index b4bc41974b70..368a4af768b3 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -15,7 +15,7 @@
 
 namespace llvm {
 
-class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
+class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
   typedef BasicTTIImplBase<SystemZTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/VE/VETargetTransformInfo.h b/llvm/lib/Target/VE/VETargetTransformInfo.h
index 5cb028608782..5c0ddca62c76 100644
--- a/llvm/lib/Target/VE/VETargetTransformInfo.h
+++ b/llvm/lib/Target/VE/VETargetTransformInfo.h
@@ -49,7 +49,7 @@ static bool isVectorLaneType(llvm::Type &ElemTy) {
 
 namespace llvm {
 
-class VETTIImpl : public BasicTTIImplBase<VETTIImpl> {
+class VETTIImpl final : public BasicTTIImplBase<VETTIImpl> {
   using BaseT = BasicTTIImplBase<VETTIImpl>;
   friend BaseT;
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 72673d6fbd80..8045f1b1d663 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -25,7 +25,7 @@ namespace llvm {
 
 class InstCombiner;
 
-class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
+class X86TTIImpl final : public BasicTTIImplBase<X86TTIImpl> {
   typedef BasicTTIImplBase<X86TTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
index f2c10518109d..cb809b992396 100644
--- a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -24,7 +24,7 @@
 
 namespace llvm {
 
-class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> {
+class XCoreTTIImpl final : public BasicTTIImplBase<XCoreTTIImpl> {
   typedef BasicTTIImplBase<XCoreTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;

From 89f692a24f6a13ae5cf9e37f91abe6f34c403258 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sun, 15 Jun 2025 09:43:18 +0100
Subject: [PATCH 0399/1322] [GlobalISel] Split Legalizer debug ouput into
 paragraphs. NFC (#143427)

This helps keep the legalizer output easier to read, splitting each
instructions legalization into a separate block.
---
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 83ba71e4c9d4..028bffd1bf5a 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -118,7 +118,7 @@ LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
 LegalizerHelper::LegalizeResult
 LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                    LostDebugLocObserver &LocObserver) {
-  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
+  LLVM_DEBUG(dbgs() << "\nLegalizing: " << MI);
 
   MIRBuilder.setInstrAndDebugLoc(MI);
 

From 147a4c7743c44af3537bae69dcf513153b03b00e Mon Sep 17 00:00:00 2001
From: Chris Apple <cja-private@pm.me>
Date: Sun, 15 Jun 2025 06:54:11 -0700
Subject: [PATCH 0400/1322] [rtsan] Fix issue where close test would lead to
 crash (#144017)

---
 .../tests/rtsan_test_interceptors_posix.cpp   | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index c2d07400593d..2ee35555c24d 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -449,12 +449,6 @@ TEST_F(RtsanFileTest, FcntlSetFdDiesWhenRealtime) {
   close(fd);
 }
 
-TEST(TestRtsanInterceptors, CloseDiesWhenRealtime) {
-  auto Func = []() { close(0); };
-  ExpectRealtimeDeath(Func, "close");
-  ExpectNonRealtimeSurvival(Func);
-}
-
 TEST(TestRtsanInterceptors, ChdirDiesWhenRealtime) {
   auto Func = []() { chdir("."); };
   ExpectRealtimeDeath(Func, "chdir");
@@ -606,8 +600,10 @@ protected:
   }
 
   void TearDown() override {
-    if (file != nullptr)
+    const bool is_open = fcntl(fd, F_GETFD) != -1;
+    if (is_open && file != nullptr)
       fclose(file);
+
     RtsanFileTest::TearDown();
   }
 
@@ -620,6 +616,16 @@ private:
   int fd = -1;
 };
 
+TEST_F(RtsanOpenedFileTest, CloseDiesWhenRealtime) {
+  auto Func = [this]() { close(GetOpenFd()); };
+  ExpectRealtimeDeath(Func, "close");
+}
+
+TEST_F(RtsanOpenedFileTest, CloseSurvivesWhenNotRealtime) {
+  auto Func = [this]() { close(GetOpenFd()); };
+  ExpectNonRealtimeSurvival(Func);
+}
+
 #if SANITIZER_INTERCEPT_FSEEK
 TEST_F(RtsanOpenedFileTest, FgetposDieWhenRealtime) {
   auto Func = [this]() {

From b983431c281a0acb9e446c7c9d72474f4d09e8e0 Mon Sep 17 00:00:00 2001
From: Chris Apple <cja-private@pm.me>
Date: Sun, 15 Jun 2025 06:55:22 -0700
Subject: [PATCH 0401/1322] [rtsan] Fix issue when intercepted function was not
 execve in test (#144018)

---
 compiler-rt/test/rtsan/fork_exec.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/test/rtsan/fork_exec.cpp b/compiler-rt/test/rtsan/fork_exec.cpp
index 3b2d2e5ca2f5..5890a0936a2f 100644
--- a/compiler-rt/test/rtsan/fork_exec.cpp
+++ b/compiler-rt/test/rtsan/fork_exec.cpp
@@ -45,7 +45,12 @@ int main() MAYBE_NONBLOCKING {
 }
 
 // CHECK-NOHALT: Intercepted call to {{.*}} `fork` {{.*}}
-// CHECK-NOHALT: Intercepted call to {{.*}} `execve` {{.*}}
+
+// We should also get some other intercepted call. On some systems this
+// is `execve`, on others, it's a lock to set up `execve`. In either
+// case, just check that we get a second intercepted call, don't sweat
+// the name.
+// CHECK-NOHALT: Intercepted call to {{.*}}
 
 // usleep checks that rtsan is still enabled in the parent process
 // See note in our interceptors file for why we don't look for `wait`

From 567647888ea3dd292827bbac445d316d6a6b0ecb Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern@gmail.com>
Date: Sun, 15 Jun 2025 23:00:16 +0800
Subject: [PATCH 0402/1322] [clang][bytecode] Avoid revisiting decomposition
 decl in visitDeclRef (#144226)

This simple patch removes the code to revisit `DecompositionDecl` in
`visitDeclRef`. The revisit will try to emit the initializer of the
`DecompositionDecl`, which could result in evaluation errors if the
`DecompositionDecl` is not within a constexpr context.
---
 clang/lib/AST/ByteCode/Compiler.cpp | 4 ----
 clang/test/AST/ByteCode/cxx17.cpp   | 8 ++++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index bf38b2e5d537..9fe4803ce98e 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -6591,10 +6591,6 @@ bool Compiler<Emitter>::visitDeclRef(const ValueDecl *D, const Expr *E) {
     return T->isReferenceType();
   };
 
-  // DecompositionDecls are just proxies for us.
-  if (isa<DecompositionDecl>(VD))
-    return revisit(VD);
-
   if ((VD->hasGlobalStorage() || VD->isStaticDataMember()) &&
       typeShouldBeVisited(VD->getType())) {
     if (const Expr *Init = VD->getAnyInitializer();
diff --git a/clang/test/AST/ByteCode/cxx17.cpp b/clang/test/AST/ByteCode/cxx17.cpp
index 08a40e0a9286..0cf3a4f666d6 100644
--- a/clang/test/AST/ByteCode/cxx17.cpp
+++ b/clang/test/AST/ByteCode/cxx17.cpp
@@ -141,3 +141,11 @@ template <int x> constexpr auto c() {
 }
 
 auto y = c<1>(); // both-note {{in instantiation of function template specialization 'c<1>' requested here}}
+
+namespace NonConstexprStructuredBinding {
+  void f1() {
+    int arr[2] = {};
+    auto [a, b] = arr;
+    static_assert(&a != &b);
+  }
+}

From 886174a835208ecd2d06b378d2094b10611030d5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 15 Jun 2025 17:43:14 +0100
Subject: [PATCH 0403/1322] [X86] shuffle-blend.ll - regenerate test checks

---
 .../X86/avx512-shuffles/shuffle-blend.ll      | 35 +++++++------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
index 59e9fb1c4a9f..78957d10301c 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X86-AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X64-AVX512F
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X86-AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X64-AVX512BW
 
@@ -61,27 +61,16 @@ entry:
 }
 
 define <64 x i8> @addb_selectw_64xi8(<64 x i8> %t0, <64 x i8> %t1) {
-; X86-AVX512F-LABEL: addb_selectw_64xi8:
-; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; X86-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; X86-AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
-; X86-AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm3
-; X86-AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; X86-AVX512F-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
-; X86-AVX512F-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}, %zmm2, %zmm0
-; X86-AVX512F-NEXT:    retl
-;
-; X64-AVX512F-LABEL: addb_selectw_64xi8:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; X64-AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
-; X64-AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm3
-; X64-AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; X64-AVX512F-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
-; X64-AVX512F-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; X64-AVX512F-NEXT:    retq
+; AVX512F-LABEL: addb_selectw_64xi8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    ret{{[l|q]}}
 ;
 ; X86-AVX512BW-LABEL: addb_selectw_64xi8:
 ; X86-AVX512BW:       # %bb.0:

From 2669664605d00e1b3a9c479545b95a6844786d0c Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 10:32:30 -0700
Subject: [PATCH 0404/1322] [modularize] Use range-based for loops (NFC)
 (#144244)

---
 clang-tools-extra/modularize/CoverageChecker.cpp     | 6 ++----
 clang-tools-extra/modularize/Modularize.cpp          | 4 ++--
 clang-tools-extra/modularize/ModularizeUtilities.cpp | 3 +--
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp
index fe6711398ab7..1345a6ef8f48 100644
--- a/clang-tools-extra/modularize/CoverageChecker.cpp
+++ b/clang-tools-extra/modularize/CoverageChecker.cpp
@@ -329,10 +329,8 @@ bool CoverageChecker::collectFileSystemHeaders() {
   else {
     // Otherwise we only look at the sub-trees specified by the
     // include paths.
-    for (std::vector<std::string>::const_iterator I = IncludePaths.begin(),
-      E = IncludePaths.end();
-      I != E; ++I) {
-      if (!collectFileSystemHeaders(*I))
+    for (const std::string &IncludePath : IncludePaths) {
+      if (!collectFileSystemHeaders(IncludePath))
         return false;
     }
   }
diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp
index 7f8a19280b11..2a90c5e3f678 100644
--- a/clang-tools-extra/modularize/Modularize.cpp
+++ b/clang-tools-extra/modularize/Modularize.cpp
@@ -339,8 +339,8 @@ static std::string findInputFile(const CommandLineArguments &CLArgs) {
   llvm::opt::Visibility VisibilityMask(options::CC1Option);
   unsigned MissingArgIndex, MissingArgCount;
   SmallVector<const char *, 256> Argv;
-  for (auto I = CLArgs.begin(), E = CLArgs.end(); I != E; ++I)
-    Argv.push_back(I->c_str());
+  for (const std::string &CLArg : CLArgs)
+    Argv.push_back(CLArg.c_str());
   InputArgList Args = getDriverOptTable().ParseArgs(
       Argv, MissingArgIndex, MissingArgCount, VisibilityMask);
   std::vector<std::string> Inputs = Args.getAllArgValues(OPT_INPUT);
diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp
index 9ad1731915a8..8a24f21d658d 100644
--- a/clang-tools-extra/modularize/ModularizeUtilities.cpp
+++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp
@@ -69,8 +69,7 @@ ModularizeUtilities *ModularizeUtilities::createModularizeUtilities(
 // Load all header lists and dependencies.
 std::error_code ModularizeUtilities::loadAllHeaderListsAndDependencies() {
   // For each input file.
-  for (auto I = InputFilePaths.begin(), E = InputFilePaths.end(); I != E; ++I) {
-    llvm::StringRef InputPath = *I;
+  for (llvm::StringRef InputPath : InputFilePaths) {
     // If it's a module map.
     if (InputPath.ends_with(".modulemap")) {
       // Load the module map.

From fef5df9d843745b2c4ed163911ed1305028350ca Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 10:32:37 -0700
Subject: [PATCH 0405/1322] [TableGen] Use range-based for loops (NFC)
 (#144250)

---
 .../utils/TableGen/ClangCommentCommandInfoEmitter.cpp | 11 +++++------
 ...ClangCommentHTMLNamedCharacterReferenceEmitter.cpp |  4 ++--
 clang/utils/TableGen/ClangDiagnosticsEmitter.cpp      |  4 ++--
 clang/utils/TableGen/ClangOpcodesEmitter.cpp          |  3 +--
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
index f15e30cd3f8f..161dd425fbc7 100644
--- a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
@@ -78,10 +78,10 @@ void clang::EmitClangCommentCommandInfo(const RecordKeeper &Records,
 
 static std::string MangleName(StringRef Str) {
   std::string Mangled;
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-    switch (Str[i]) {
+  for (char C : Str) {
+    switch (C) {
     default:
-      Mangled += Str[i];
+      Mangled += C;
       break;
     case '(':
       Mangled += "lparen";
@@ -122,9 +122,8 @@ void clang::EmitClangCommentCommandList(const RecordKeeper &Records,
      << "#endif\n";
 
   ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Command");
-  for (size_t i = 0, e = Tags.size(); i != e; ++i) {
-    const Record &Tag = *Tags[i];
-    std::string MangledName = MangleName(Tag.getValueAsString("Name"));
+  for (const Record *Tag : Tags) {
+    std::string MangledName = MangleName(Tag->getValueAsString("Name"));
 
     OS << "COMMENT_COMMAND(" << MangledName << ")\n";
   }
diff --git a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
index b8d8ac853a5c..e5eec5e7ca8d 100644
--- a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
@@ -37,9 +37,9 @@ static bool translateCodePointToUTF8(unsigned CodePoint,
 
   raw_svector_ostream OS(CLiteral);
   OS << "\"";
-  for (size_t i = 0, e = UTF8.size(); i != e; ++i) {
+  for (char C : UTF8) {
     OS << "\\x";
-    OS.write_hex(static_cast<unsigned char>(UTF8[i]));
+    OS.write_hex(static_cast<unsigned char>(C));
   }
   OS << "\"";
 
diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
index e347b89a85d4..bfc60f485cd3 100644
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -1794,8 +1794,8 @@ static std::string getDiagCategoryEnum(StringRef name) {
   if (name.empty())
     return "DiagCat_None";
   SmallString<256> enumName = StringRef("DiagCat_");
-  for (StringRef::iterator I = name.begin(), E = name.end(); I != E; ++I)
-    enumName += isalnum(*I) ? *I : '_';
+  for (char C : name)
+    enumName += isalnum(C) ? C : '_';
   return std::string(enumName);
 }
 
diff --git a/clang/utils/TableGen/ClangOpcodesEmitter.cpp b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
index 5d6d90994cf3..9d0773e1aff8 100644
--- a/clang/utils/TableGen/ClangOpcodesEmitter.cpp
+++ b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
@@ -224,8 +224,7 @@ void ClangOpcodesEmitter::EmitProto(raw_ostream &OS, StringRef N,
   auto Args = R->getValueAsListOfDefs("Args");
   Enumerate(R, N, [&OS, &Args](ArrayRef<const Record *> TS, const Twine &ID) {
     OS << "bool emit" << ID << "(";
-    for (size_t I = 0, N = Args.size(); I < N; ++I) {
-      const auto *Arg = Args[I];
+    for (const Record *Arg : Args) {
       bool AsRef = Arg->getValueAsBit("AsRef");
       auto Name = Arg->getValueAsString("Name");
 

From d78eec864c60729685487c884724f27edd53b3b8 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 10:32:45 -0700
Subject: [PATCH 0406/1322] [lld] Use range-based for loops (NFC) (#144251)

---
 lld/ELF/Arch/ARM.cpp            | 6 +++---
 lld/ELF/SyntheticSections.cpp   | 7 +++----
 lld/MachO/SyntheticSections.cpp | 4 +---
 lld/wasm/Driver.cpp             | 6 +++---
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index aa90fecc533e..91a673f13d68 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -1317,11 +1317,11 @@ void elf::processArmCmseSymbols(Ctx &ctx) {
   // with its corresponding special symbol __acle_se_<sym>.
   parallelForEach(ctx.objectFiles, [&](InputFile *file) {
     MutableArrayRef<Symbol *> syms = file->getMutableSymbols();
-    for (size_t i = 0, e = syms.size(); i != e; ++i) {
-      StringRef symName = syms[i]->getName();
+    for (Symbol *&sym : syms) {
+      StringRef symName = sym->getName();
       auto it = ctx.symtab->cmseSymMap.find(symName);
       if (it != ctx.symtab->cmseSymMap.end())
-        syms[i] = it->second.acleSeSym;
+        sym = it->second.acleSeSym;
     }
   });
 }
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 051e5cd04ef5..efec41a737b6 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -4026,10 +4026,9 @@ void MergeNoTailSection::finalizeContents() {
   // So far, section pieces have offsets from beginning of shards, but
   // we want offsets from beginning of the whole section. Fix them.
   parallelForEach(sections, [&](MergeInputSection *sec) {
-    for (size_t i = 0, e = sec->pieces.size(); i != e; ++i)
-      if (sec->pieces[i].live)
-        sec->pieces[i].outputOff +=
-            shardOffsets[getShardId(sec->pieces[i].hash)];
+    for (SectionPiece &piece : sec->pieces)
+      if (piece.live)
+        piece.outputOff += shardOffsets[getShardId(piece.hash)];
   });
 }
 
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 0b7f23304248..979a4ee6d813 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -947,9 +947,7 @@ uint64_t ObjCStubsSection::getSize() const {
 
 void ObjCStubsSection::writeTo(uint8_t *buf) const {
   uint64_t stubOffset = 0;
-  for (size_t i = 0, n = symbols.size(); i < n; ++i) {
-    Defined *sym = symbols[i];
-
+  for (Defined *sym : symbols) {
     auto methname = getMethname(sym);
     InputSection *selRef = ObjCSelRefsHelper::getSelRef(methname);
     assert(selRef != nullptr && "no selref for methname");
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index 2b1fb945f41c..1c5d21c06f5a 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -1226,9 +1226,9 @@ static void wrapSymbols(ArrayRef<WrappedSymbol> wrapped) {
   // Update pointers in input files.
   parallelForEach(ctx.objectFiles, [&](InputFile *file) {
     MutableArrayRef<Symbol *> syms = file->getMutableSymbols();
-    for (size_t i = 0, e = syms.size(); i != e; ++i)
-      if (Symbol *s = map.lookup(syms[i]))
-        syms[i] = s;
+    for (Symbol *&sym : syms)
+      if (Symbol *s = map.lookup(sym))
+        sym = s;
   });
 
   // Update pointers in the symbol table.

From 8f5c338b89a22abc3191a0d931071c09630d6195 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 10:32:52 -0700
Subject: [PATCH 0407/1322] [Sema] Use a range-based for loop (NFC) (#144252)

Note that LLVM Coding Standards discourages for_each.
---
 clang/lib/Sema/SemaOverload.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 49e5a311e239..8c5f81f126c7 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -9272,11 +9272,10 @@ class BuiltinOperatorOverloadBuilder {
     /// the candidates into a unique set, then move from that set into the list
     /// of arithmetic types.
     llvm::SmallSetVector<CanQualType, 2> BitIntCandidates;
-    llvm::for_each(CandidateTypes, [&BitIntCandidates](
-                                       BuiltinCandidateTypeSet &Candidate) {
+    for (BuiltinCandidateTypeSet &Candidate : CandidateTypes) {
       for (QualType BitTy : Candidate.bitint_types())
         BitIntCandidates.insert(CanQualType::CreateUnsafe(BitTy));
-    });
+    }
     llvm::move(BitIntCandidates, std::back_inserter(ArithmeticTypes));
     LastPromotedIntegralType = ArithmeticTypes.size();
     LastPromotedArithmeticType = ArithmeticTypes.size();

From b16d43a874748a496da5cd774dd864c95b78d6b0 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 10:46:47 -0700
Subject: [PATCH 0408/1322] VE: Rename VEMCExpr::VK_ to VE::S_

Prepare for removing VEMCExpr. Adopt the newer naming convention adopted
by AMDGPU/WebAssembly.
---
 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp  | 66 +++++++++----------
 .../VE/MCTargetDesc/VEELFObjectWriter.cpp     | 10 +--
 .../Target/VE/MCTargetDesc/VEMCAsmInfo.cpp    | 30 ++++-----
 .../VE/MCTargetDesc/VEMCCodeEmitter.cpp       |  2 +-
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp  | 32 ++++-----
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h    | 46 +++++++------
 llvm/lib/Target/VE/VEAsmPrinter.cpp           | 26 +++-----
 llvm/lib/Target/VE/VEISelLowering.cpp         | 49 +++++++-------
 8 files changed, 128 insertions(+), 133 deletions(-)

diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index a58ef127bbd5..418587947e1e 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -1042,7 +1042,7 @@ bool VEAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
 const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
                                             VEMCExpr::Specifier &Variant) {
   MCContext &Context = getParser().getContext();
-  Variant = VEMCExpr::VK_None;
+  Variant = VE::S_None;
 
   switch (E->getKind()) {
   case MCExpr::Target:
@@ -1055,51 +1055,51 @@ const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
     const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
 
     switch (SRE->getSpecifier()) {
-    case VEMCExpr::VK_None:
+    case VE::S_None:
       // Use VK_REFLONG to a symbol without modifiers.
-      Variant = VEMCExpr::VK_REFLONG;
+      Variant = VE::S_REFLONG;
       break;
-    case VEMCExpr::VK_HI32:
-      Variant = VEMCExpr::VK_HI32;
+    case VE::S_HI32:
+      Variant = VE::S_HI32;
       break;
-    case VEMCExpr::VK_LO32:
-      Variant = VEMCExpr::VK_LO32;
+    case VE::S_LO32:
+      Variant = VE::S_LO32;
       break;
-    case VEMCExpr::VK_PC_HI32:
-      Variant = VEMCExpr::VK_PC_HI32;
+    case VE::S_PC_HI32:
+      Variant = VE::S_PC_HI32;
       break;
-    case VEMCExpr::VK_PC_LO32:
-      Variant = VEMCExpr::VK_PC_LO32;
+    case VE::S_PC_LO32:
+      Variant = VE::S_PC_LO32;
       break;
-    case VEMCExpr::VK_GOT_HI32:
-      Variant = VEMCExpr::VK_GOT_HI32;
+    case VE::S_GOT_HI32:
+      Variant = VE::S_GOT_HI32;
       break;
-    case VEMCExpr::VK_GOT_LO32:
-      Variant = VEMCExpr::VK_GOT_LO32;
+    case VE::S_GOT_LO32:
+      Variant = VE::S_GOT_LO32;
       break;
-    case VEMCExpr::VK_GOTOFF_HI32:
-      Variant = VEMCExpr::VK_GOTOFF_HI32;
+    case VE::S_GOTOFF_HI32:
+      Variant = VE::S_GOTOFF_HI32;
       break;
-    case VEMCExpr::VK_GOTOFF_LO32:
-      Variant = VEMCExpr::VK_GOTOFF_LO32;
+    case VE::S_GOTOFF_LO32:
+      Variant = VE::S_GOTOFF_LO32;
       break;
-    case VEMCExpr::VK_PLT_HI32:
-      Variant = VEMCExpr::VK_PLT_HI32;
+    case VE::S_PLT_HI32:
+      Variant = VE::S_PLT_HI32;
       break;
-    case VEMCExpr::VK_PLT_LO32:
-      Variant = VEMCExpr::VK_PLT_LO32;
+    case VE::S_PLT_LO32:
+      Variant = VE::S_PLT_LO32;
       break;
-    case VEMCExpr::VK_TLS_GD_HI32:
-      Variant = VEMCExpr::VK_TLS_GD_HI32;
+    case VE::S_TLS_GD_HI32:
+      Variant = VE::S_TLS_GD_HI32;
       break;
-    case VEMCExpr::VK_TLS_GD_LO32:
-      Variant = VEMCExpr::VK_TLS_GD_LO32;
+    case VE::S_TLS_GD_LO32:
+      Variant = VE::S_TLS_GD_LO32;
       break;
-    case VEMCExpr::VK_TPOFF_HI32:
-      Variant = VEMCExpr::VK_TPOFF_HI32;
+    case VE::S_TPOFF_HI32:
+      Variant = VE::S_TPOFF_HI32;
       break;
-    case VEMCExpr::VK_TPOFF_LO32:
-      Variant = VEMCExpr::VK_TPOFF_LO32;
+    case VE::S_TPOFF_LO32:
+      Variant = VE::S_TPOFF_LO32;
       break;
     default:
       return nullptr;
@@ -1130,9 +1130,9 @@ const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
     if (!RHS)
       RHS = BE->getRHS();
 
-    if (LHSVariant == VEMCExpr::VK_None)
+    if (LHSVariant == VE::S_None)
       Variant = RHSVariant;
-    else if (RHSVariant == VEMCExpr::VK_None)
+    else if (RHSVariant == VE::S_None)
       Variant = LHSVariant;
     else if (LHSVariant == RHSVariant)
       Variant = LHSVariant;
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
index e707bb2fe3e1..bdedde505295 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
@@ -40,10 +40,10 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                          const MCValue &Target,
                                          bool IsPCRel) const {
   switch (Target.getSpecifier()) {
-  case VEMCExpr::VK_TLS_GD_HI32:
-  case VEMCExpr::VK_TLS_GD_LO32:
-  case VEMCExpr::VK_TPOFF_HI32:
-  case VEMCExpr::VK_TPOFF_LO32:
+  case VE::S_TLS_GD_HI32:
+  case VE::S_TLS_GD_LO32:
+  case VE::S_TPOFF_HI32:
+  case VE::S_TPOFF_LO32:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -51,7 +51,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
     break;
   }
   if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Fixup.getValue())) {
-    if (SExpr->getSpecifier() == VEMCExpr::VK_PC_LO32)
+    if (SExpr->getSpecifier() == VE::S_PC_LO32)
       return ELF::R_VE_PC_LO32;
   }
 
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
index fdde46f09d5b..ac580f79a77b 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
@@ -19,20 +19,20 @@
 using namespace llvm;
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {VEMCExpr::VK_HI32, "hi"},
-    {VEMCExpr::VK_LO32, "lo"},
-    {VEMCExpr::VK_PC_HI32, "pc_hi"},
-    {VEMCExpr::VK_PC_LO32, "pc_lo"},
-    {VEMCExpr::VK_GOT_HI32, "got_hi"},
-    {VEMCExpr::VK_GOT_LO32, "got_lo"},
-    {VEMCExpr::VK_GOTOFF_HI32, "gotoff_hi"},
-    {VEMCExpr::VK_GOTOFF_LO32, "gotoff_lo"},
-    {VEMCExpr::VK_PLT_HI32, "plt_hi"},
-    {VEMCExpr::VK_PLT_LO32, "plt_lo"},
-    {VEMCExpr::VK_TLS_GD_HI32, "tls_gd_hi"},
-    {VEMCExpr::VK_TLS_GD_LO32, "tls_gd_lo"},
-    {VEMCExpr::VK_TPOFF_HI32, "tpoff_hi"},
-    {VEMCExpr::VK_TPOFF_LO32, "tpoff_lo"},
+    {VE::S_HI32, "hi"},
+    {VE::S_LO32, "lo"},
+    {VE::S_PC_HI32, "pc_hi"},
+    {VE::S_PC_LO32, "pc_lo"},
+    {VE::S_GOT_HI32, "got_hi"},
+    {VE::S_GOT_LO32, "got_lo"},
+    {VE::S_GOTOFF_HI32, "gotoff_hi"},
+    {VE::S_GOTOFF_LO32, "gotoff_lo"},
+    {VE::S_PLT_HI32, "plt_hi"},
+    {VE::S_PLT_LO32, "plt_lo"},
+    {VE::S_TLS_GD_HI32, "tls_gd_hi"},
+    {VE::S_TLS_GD_LO32, "tls_gd_lo"},
+    {VE::S_TPOFF_HI32, "tpoff_hi"},
+    {VE::S_TPOFF_LO32, "tpoff_lo"},
 };
 
 void VEELFMCAsmInfo::anchor() {}
@@ -61,6 +61,6 @@ void VEELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
                                         const MCSpecifierExpr &Expr) const {
   printExpr(OS, *Expr.getSubExpr());
   auto specifier = Expr.getSpecifier();
-  if (specifier && specifier != VEMCExpr::VK_REFLONG)
+  if (specifier && specifier != VE::S_REFLONG)
     OS << '@' << getSpecifierName(specifier);
 }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
index 7dece1b309a9..c3fae1a0c77d 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
@@ -99,7 +99,7 @@ unsigned VEMCCodeEmitter::getMachineOpValue(const MCInst &MI,
 
   const MCExpr *Expr = MO.getExpr();
   if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Expr)) {
-    auto Kind = VEMCExpr::getFixupKind(SExpr->getSpecifier());
+    auto Kind = VE::getFixupKind(SExpr->getSpecifier());
     Fixups.push_back(MCFixup::create(0, Expr, Kind));
     return 0;
   }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
index fa4d9b18a9ad..ed0eafc75888 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
@@ -27,39 +27,39 @@ const VEMCExpr *VEMCExpr::create(Specifier S, const MCExpr *Expr,
   return new (Ctx) VEMCExpr(Expr, S);
 }
 
-VE::Fixups VEMCExpr::getFixupKind(MCSpecifierExpr::Spec S) {
+VE::Fixups VE::getFixupKind(uint8_t S) {
   switch (S) {
   default:
     llvm_unreachable("Unhandled VEMCExpr::Specifier");
-  case VK_REFLONG:
+  case VE::S_REFLONG:
     return VE::fixup_ve_reflong;
-  case VK_HI32:
+  case VE::S_HI32:
     return VE::fixup_ve_hi32;
-  case VK_LO32:
+  case VE::S_LO32:
     return VE::fixup_ve_lo32;
-  case VK_PC_HI32:
+  case VE::S_PC_HI32:
     return VE::fixup_ve_pc_hi32;
-  case VK_PC_LO32:
+  case VE::S_PC_LO32:
     return VE::fixup_ve_pc_lo32;
-  case VK_GOT_HI32:
+  case VE::S_GOT_HI32:
     return VE::fixup_ve_got_hi32;
-  case VK_GOT_LO32:
+  case VE::S_GOT_LO32:
     return VE::fixup_ve_got_lo32;
-  case VK_GOTOFF_HI32:
+  case VE::S_GOTOFF_HI32:
     return VE::fixup_ve_gotoff_hi32;
-  case VK_GOTOFF_LO32:
+  case VE::S_GOTOFF_LO32:
     return VE::fixup_ve_gotoff_lo32;
-  case VK_PLT_HI32:
+  case VE::S_PLT_HI32:
     return VE::fixup_ve_plt_hi32;
-  case VK_PLT_LO32:
+  case VE::S_PLT_LO32:
     return VE::fixup_ve_plt_lo32;
-  case VK_TLS_GD_HI32:
+  case VE::S_TLS_GD_HI32:
     return VE::fixup_ve_tls_gd_hi32;
-  case VK_TLS_GD_LO32:
+  case VE::S_TLS_GD_LO32:
     return VE::fixup_ve_tls_gd_lo32;
-  case VK_TPOFF_HI32:
+  case VE::S_TPOFF_HI32:
     return VE::fixup_ve_tpoff_hi32;
-  case VK_TPOFF_LO32:
+  case VE::S_TPOFF_LO32:
     return VE::fixup_ve_tpoff_lo32;
   }
 }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
index 4d191149d4aa..d4e0f77c8ece 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
@@ -22,25 +22,7 @@ namespace llvm {
 class StringRef;
 class VEMCExpr : public MCSpecifierExpr {
 public:
-  enum Specifier {
-    VK_None,
-
-    VK_REFLONG = MCSymbolRefExpr::FirstTargetSpecifier,
-    VK_HI32,        // @hi
-    VK_LO32,        // @lo
-    VK_PC_HI32,     // @pc_hi
-    VK_PC_LO32,     // @pc_lo
-    VK_GOT_HI32,    // @got_hi
-    VK_GOT_LO32,    // @got_lo
-    VK_GOTOFF_HI32, // @gotoff_hi
-    VK_GOTOFF_LO32, // @gotoff_lo
-    VK_PLT_HI32,    // @plt_hi
-    VK_PLT_LO32,    // @plt_lo
-    VK_TLS_GD_HI32, // @tls_gd_hi
-    VK_TLS_GD_LO32, // @tls_gd_lo
-    VK_TPOFF_HI32,  // @tpoff_hi
-    VK_TPOFF_LO32,  // @tpoff_lo
-  };
+  using Specifier = uint8_t;
 
 private:
   explicit VEMCExpr(const MCExpr *Expr, Specifier S)
@@ -52,10 +34,32 @@ public:
 
   bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAssembler *Asm) const override;
-
-  static VE::Fixups getFixupKind(Spec S);
 };
 
+namespace VE {
+enum Specifier {
+  S_None,
+
+  S_REFLONG = MCSymbolRefExpr::FirstTargetSpecifier,
+  S_HI32,        // @hi
+  S_LO32,        // @lo
+  S_PC_HI32,     // @pc_hi
+  S_PC_LO32,     // @pc_lo
+  S_GOT_HI32,    // @got_hi
+  S_GOT_LO32,    // @got_lo
+  S_GOTOFF_HI32, // @gotoff_hi
+  S_GOTOFF_LO32, // @gotoff_lo
+  S_PLT_HI32,    // @plt_hi
+  S_PLT_LO32,    // @plt_lo
+  S_TLS_GD_HI32, // @tls_gd_hi
+  S_TLS_GD_LO32, // @tls_gd_lo
+  S_TPOFF_HI32,  // @tpoff_hi
+  S_TPOFF_LO32,  // @tpoff_lo
+};
+
+VE::Fixups getFixupKind(uint8_t S);
+} // namespace VE
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index ee347cda0521..f0d6f5226854 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -194,8 +194,8 @@ void VEAsmPrinter::lowerGETGOTAndEmitMCInsts(const MachineInstr *MI,
     case CodeModel::Small:
     case CodeModel::Medium:
     case CodeModel::Large:
-      emitHiLo(*OutStreamer, GOTLabel, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32,
-               MCRegOP, OutContext, STI);
+      emitHiLo(*OutStreamer, GOTLabel, VE::S_HI32, VE::S_LO32, MCRegOP,
+               OutContext, STI);
       break;
     }
     return;
@@ -209,14 +209,12 @@ void VEAsmPrinter::lowerGETGOTAndEmitMCInsts(const MachineInstr *MI,
   // sic %plt
   // lea.sl %got, _GLOBAL_OFFSET_TABLE_@PC_HI(%plt, %got)
   MCOperand cim24 = MCOperand::createImm(-24);
-  MCOperand loImm =
-      createGOTRelExprOp(VEMCExpr::VK_PC_LO32, GOTLabel, OutContext);
+  MCOperand loImm = createGOTRelExprOp(VE::S_PC_LO32, GOTLabel, OutContext);
   emitLEAzii(*OutStreamer, cim24, loImm, MCRegOP, STI);
   MCOperand M032 = MCOperand::createImm(M0(32));
   emitANDrm(*OutStreamer, MCRegOP, M032, MCRegOP, STI);
   emitSIC(*OutStreamer, RegPLT, STI);
-  MCOperand hiImm =
-      createGOTRelExprOp(VEMCExpr::VK_PC_HI32, GOTLabel, OutContext);
+  MCOperand hiImm = createGOTRelExprOp(VE::S_PC_HI32, GOTLabel, OutContext);
   emitLEASLrri(*OutStreamer, RegGOT, RegPLT, hiImm, MCRegOP, STI);
 }
 
@@ -257,14 +255,12 @@ void VEAsmPrinter::lowerGETFunPLTAndEmitMCInsts(const MachineInstr *MI,
   // sic %plt                            ; FIXME: is it safe to use %plt here?
   // lea.sl %dst, func@plt_hi(%plt, %dst)
   MCOperand cim24 = MCOperand::createImm(-24);
-  MCOperand loImm =
-      createGOTRelExprOp(VEMCExpr::VK_PLT_LO32, AddrSym, OutContext);
+  MCOperand loImm = createGOTRelExprOp(VE::S_PLT_LO32, AddrSym, OutContext);
   emitLEAzii(*OutStreamer, cim24, loImm, MCRegOP, STI);
   MCOperand M032 = MCOperand::createImm(M0(32));
   emitANDrm(*OutStreamer, MCRegOP, M032, MCRegOP, STI);
   emitSIC(*OutStreamer, RegPLT, STI);
-  MCOperand hiImm =
-      createGOTRelExprOp(VEMCExpr::VK_PLT_HI32, AddrSym, OutContext);
+  MCOperand hiImm = createGOTRelExprOp(VE::S_PLT_HI32, AddrSym, OutContext);
   emitLEASLrri(*OutStreamer, MCRegOP, RegPLT, hiImm, MCRegOP, STI);
 }
 
@@ -305,22 +301,20 @@ void VEAsmPrinter::lowerGETTLSAddrAndEmitMCInsts(const MachineInstr *MI,
   // lea.sl %s12, __tls_get_addr@plt_hi(%s12, %lr)
   // bsic %lr, (, %s12)
   MCOperand cim24 = MCOperand::createImm(-24);
-  MCOperand loImm =
-      createGOTRelExprOp(VEMCExpr::VK_TLS_GD_LO32, AddrSym, OutContext);
+  MCOperand loImm = createGOTRelExprOp(VE::S_TLS_GD_LO32, AddrSym, OutContext);
   emitLEAzii(*OutStreamer, cim24, loImm, RegS0, STI);
   MCOperand M032 = MCOperand::createImm(M0(32));
   emitANDrm(*OutStreamer, RegS0, M032, RegS0, STI);
   emitSIC(*OutStreamer, RegLR, STI);
-  MCOperand hiImm =
-      createGOTRelExprOp(VEMCExpr::VK_TLS_GD_HI32, AddrSym, OutContext);
+  MCOperand hiImm = createGOTRelExprOp(VE::S_TLS_GD_HI32, AddrSym, OutContext);
   emitLEASLrri(*OutStreamer, RegS0, RegLR, hiImm, RegS0, STI);
   MCOperand ci8 = MCOperand::createImm(8);
   MCOperand loImm2 =
-      createGOTRelExprOp(VEMCExpr::VK_PLT_LO32, GetTLSLabel, OutContext);
+      createGOTRelExprOp(VE::S_PLT_LO32, GetTLSLabel, OutContext);
   emitLEAzii(*OutStreamer, ci8, loImm2, RegS12, STI);
   emitANDrm(*OutStreamer, RegS12, M032, RegS12, STI);
   MCOperand hiImm2 =
-      createGOTRelExprOp(VEMCExpr::VK_PLT_HI32, GetTLSLabel, OutContext);
+      createGOTRelExprOp(VE::S_PLT_HI32, GetTLSLabel, OutContext);
   emitLEASLrri(*OutStreamer, RegS12, RegLR, hiImm2, RegS12, STI);
   emitBSIC(*OutStreamer, RegLR, RegS12, STI);
 }
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 313c894cafa8..b5a0d26abbf8 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -664,7 +664,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
-      Callee = makeHiLoPair(Callee, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32, DAG);
+      Callee = makeHiLoPair(Callee, VE::S_HI32, VE::S_LO32, DAG);
     }
   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     if (IsPICCall) {
@@ -673,7 +673,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
-      Callee = makeHiLoPair(Callee, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32, DAG);
+      Callee = makeHiLoPair(Callee, VE::S_HI32, VE::S_LO32, DAG);
     }
   }
 
@@ -1020,8 +1020,8 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
       //     lea %reg, label@gotoff_lo
       //     and %reg, %reg, (32)0
       //     lea.sl %reg, label@gotoff_hi(%reg, %got)
-      SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_GOTOFF_HI32,
-                                  VEMCExpr::VK_GOTOFF_LO32, DAG);
+      SDValue HiLo =
+          makeHiLoPair(Op, VE::S_GOTOFF_HI32, VE::S_GOTOFF_LO32, DAG);
       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
     }
@@ -1030,8 +1030,7 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
     //     and %reg, %reg, (32)0
     //     lea.sl %reg, label@got_hi(%reg)
     //     ld %reg, (%reg, %got)
-    SDValue HiLo =
-        makeHiLoPair(Op, VEMCExpr::VK_GOT_HI32, VEMCExpr::VK_GOT_LO32, DAG);
+    SDValue HiLo = makeHiLoPair(Op, VE::S_GOT_HI32, VE::S_GOT_LO32, DAG);
     SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
     return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
@@ -1046,7 +1045,7 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
   case CodeModel::Medium:
   case CodeModel::Large:
     // abs64.
-    return makeHiLoPair(Op, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32, DAG);
+    return makeHiLoPair(Op, VE::S_HI32, VE::S_LO32, DAG);
   }
 }
 
@@ -1782,12 +1781,11 @@ SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
     SDValue Addr =
         DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
     if (isPositionIndependent()) {
-      Addr = makeHiLoPair(Addr, VEMCExpr::VK_GOTOFF_HI32,
-                          VEMCExpr::VK_GOTOFF_LO32, DAG);
+      Addr = makeHiLoPair(Addr, VE::S_GOTOFF_HI32, VE::S_GOTOFF_LO32, DAG);
       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
       return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
     }
-    return makeHiLoPair(Addr, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32, DAG);
+    return makeHiLoPair(Addr, VE::S_HI32, VE::S_LO32, DAG);
   }
   }
 }
@@ -2011,8 +2009,7 @@ SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
   // In order to do so, we need to genarate correctly marked DAG node using
   // makeHiLoPair.
   SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
-  SDValue HiLo =
-      makeHiLoPair(Op, VEMCExpr::VK_GOTOFF_HI32, VEMCExpr::VK_GOTOFF_LO32, DAG);
+  SDValue HiLo = makeHiLoPair(Op, VE::S_GOTOFF_HI32, VE::S_GOTOFF_LO32, DAG);
   SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
   return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
 }
@@ -2038,14 +2035,14 @@ Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addMBB(TargetBB, VEMCExpr::VK_GOTOFF_LO32);
+        .addMBB(TargetBB, VE::S_GOTOFF_LO32);
     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
         .addReg(VE::SX15)
         .addReg(Tmp2, getKillRegState(true))
-        .addMBB(TargetBB, VEMCExpr::VK_GOTOFF_HI32);
+        .addMBB(TargetBB, VE::S_GOTOFF_HI32);
   } else {
     // Create following instructions for non-PIC code.
     //     lea     %Tmp1, TargetBB@lo
@@ -2054,14 +2051,14 @@ Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addMBB(TargetBB, VEMCExpr::VK_LO32);
+        .addMBB(TargetBB, VE::S_LO32);
     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
         .addReg(Tmp2, getKillRegState(true))
         .addImm(0)
-        .addMBB(TargetBB, VEMCExpr::VK_HI32);
+        .addMBB(TargetBB, VE::S_HI32);
   }
   return Result;
 }
@@ -2099,14 +2096,14 @@ Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
           .addImm(0)
           .addImm(0)
-          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_GOTOFF_LO32);
+          .addExternalSymbol(Symbol.data(), VE::S_GOTOFF_LO32);
       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
           .addReg(Tmp1, getKillRegState(true))
           .addImm(M0(32));
       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
           .addReg(VE::SX15)
           .addReg(Tmp2, getKillRegState(true))
-          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_GOTOFF_HI32);
+          .addExternalSymbol(Symbol.data(), VE::S_GOTOFF_HI32);
     } else {
       Register Tmp1 = MRI.createVirtualRegister(RC);
       Register Tmp2 = MRI.createVirtualRegister(RC);
@@ -2119,14 +2116,14 @@ Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
           .addImm(0)
           .addImm(0)
-          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_GOT_LO32);
+          .addExternalSymbol(Symbol.data(), VE::S_GOT_LO32);
       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
           .addReg(Tmp1, getKillRegState(true))
           .addImm(M0(32));
       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
           .addReg(VE::SX15)
           .addReg(Tmp2, getKillRegState(true))
-          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_GOT_HI32);
+          .addExternalSymbol(Symbol.data(), VE::S_GOT_HI32);
       BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
           .addReg(Tmp3, getKillRegState(true))
           .addImm(0)
@@ -2142,14 +2139,14 @@ Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addExternalSymbol(Symbol.data(), VEMCExpr::VK_LO32);
+        .addExternalSymbol(Symbol.data(), VE::S_LO32);
     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
         .addReg(Tmp2, getKillRegState(true))
         .addImm(0)
-        .addExternalSymbol(Symbol.data(), VEMCExpr::VK_HI32);
+        .addExternalSymbol(Symbol.data(), VE::S_HI32);
   }
   return Result;
 }
@@ -2528,14 +2525,14 @@ VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addJumpTableIndex(MJTI, VEMCExpr::VK_GOTOFF_LO32);
+        .addJumpTableIndex(MJTI, VE::S_GOTOFF_LO32);
     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
         .addReg(VE::SX15)
         .addReg(Tmp2, getKillRegState(true))
-        .addJumpTableIndex(MJTI, VEMCExpr::VK_GOTOFF_HI32);
+        .addJumpTableIndex(MJTI, VE::S_GOTOFF_HI32);
   } else {
     // Create following instructions for non-PIC code.
     //     lea     %Tmp1, .LJTI0_0@lo
@@ -2544,14 +2541,14 @@ VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addJumpTableIndex(MJTI, VEMCExpr::VK_LO32);
+        .addJumpTableIndex(MJTI, VE::S_LO32);
     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
         .addReg(Tmp2, getKillRegState(true))
         .addImm(0)
-        .addJumpTableIndex(MJTI, VEMCExpr::VK_HI32);
+        .addJumpTableIndex(MJTI, VE::S_HI32);
   }
 
   switch (JTE) {

From df54a2d9357fe7f56ca3c6fa2f07889449b50325 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 15 Jun 2025 19:31:30 +0100
Subject: [PATCH 0409/1322] [VPlan] Only skip induction phis in
 planContainsAdditionalSimps (NFC).

Skip induction phis when checking for simplifications, as they may not
be lowered directly be lowered to a corresponding PHI recipe. Reductions
and first-order recurrences will get lowered to phi recipes, unless they
are removed. Considering them for simplifications allows removing them
if there are no remaining users.

NFC as currently reduction and recurrence phis are not
simplified/removed if dead.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9b5ad1658953..eb04e2d5ca7b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7061,7 +7061,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
   return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                     TheLoop](BasicBlock *BB) {
     return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
-      if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
+      // Skip induction phis when checking for simplifications, as they may not
+      // be lowered directly be lowered to a corresponding PHI recipe.
+      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
+          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
         return false;
       return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
     });

From 254a92d49a4c1e1f7f747b1c2f1ccbfd7f217880 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 11:41:33 -0700
Subject: [PATCH 0410/1322] MC: Add MCSpecifierExpr::create

as a target-agnostic implementation to replace target-specific
XXXMCExpr::create.
---
 llvm/include/llvm/MC/MCExpr.h                         | 10 +++++++---
 llvm/lib/MC/MCExpr.cpp                                | 10 ++++++++++
 llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp    | 10 +++++-----
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp |  4 ++--
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp    | 10 ----------
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h      |  4 ----
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp             |  6 +++---
 llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp       |  2 +-
 8 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index 1c72269e53e2..cd57fafc50b5 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -510,12 +510,16 @@ protected:
   // Target-specific relocation specifier code
   const Spec specifier;
 
-public:
-  explicit MCSpecifierExpr(const MCExpr *Expr, Spec S)
-      : MCExpr(Specifier, SMLoc()), Expr(Expr), specifier(S) {}
+  explicit MCSpecifierExpr(const MCExpr *Expr, Spec S, SMLoc Loc = SMLoc())
+      : MCExpr(Specifier, Loc), Expr(Expr), specifier(S) {}
   virtual ~MCSpecifierExpr() = default;
 
 public:
+  LLVM_ABI static const MCSpecifierExpr *
+  create(const MCExpr *Expr, Spec S, MCContext &Ctx, SMLoc Loc = SMLoc());
+  LLVM_ABI static const MCSpecifierExpr *
+  create(const MCSymbol *Sym, Spec S, MCContext &Ctx, SMLoc Loc = SMLoc());
+
   Spec getSpecifier() const { return specifier; }
   const MCExpr *getSubExpr() const { return Expr; }
 
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 2ae440cba46f..e83ce05b37a8 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -737,6 +737,16 @@ MCFragment *MCExpr::findAssociatedFragment() const {
   llvm_unreachable("Invalid assembly expression kind!");
 }
 
+const MCSpecifierExpr *MCSpecifierExpr::create(const MCExpr *Expr, Spec S,
+                                               MCContext &Ctx, SMLoc Loc) {
+  return new (Ctx) MCSpecifierExpr(Expr, S, Loc);
+}
+
+const MCSpecifierExpr *MCSpecifierExpr::create(const MCSymbol *Sym, Spec S,
+                                               MCContext &Ctx, SMLoc Loc) {
+  return new (Ctx) MCSpecifierExpr(MCSymbolRefExpr::create(Sym, Ctx), S, Loc);
+}
+
 bool MCSpecifierExpr::evaluateAsRelocatableImpl(MCValue &Res,
                                                 const MCAssembler *Asm) const {
   if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 187ecbaad4bb..90aacacd8ed2 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -848,14 +848,14 @@ bool SparcAsmParser::expandSETX(MCInst &Inst, SMLoc IDLoc,
   // sethi %hh(val), tmp
   Instructions.push_back(MCInstBuilder(SP::SETHIi)
                              .addReg(MCTmpOp.getReg())
-                             .addExpr(Sparc::createSpecifierExpr(
-                                 getContext(), ValExpr, ELF::R_SPARC_HH22)));
+                             .addExpr(MCSpecifierExpr::create(
+                                 ValExpr, ELF::R_SPARC_HH22, getContext())));
   // or    tmp, %hm(val), tmp
   Instructions.push_back(MCInstBuilder(SP::ORri)
                              .addReg(MCTmpOp.getReg())
                              .addReg(MCTmpOp.getReg())
-                             .addExpr(Sparc::createSpecifierExpr(
-                                 getContext(), ValExpr, ELF::R_SPARC_HM10)));
+                             .addExpr(MCSpecifierExpr::create(
+                                 ValExpr, ELF::R_SPARC_HM10, getContext())));
   // sllx  tmp, 32, tmp
   Instructions.push_back(MCInstBuilder(SP::SLLXri)
                              .addReg(MCTmpOp.getReg())
@@ -1689,7 +1689,7 @@ const SparcMCExpr *SparcAsmParser::adjustPICRelocation(uint16_t RelType,
     }
   }
 
-  return Sparc::createSpecifierExpr(getContext(), subExpr, RelType);
+  return MCSpecifierExpr::create(subExpr, RelType, getContext());
 }
 
 bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 4156780e962d..800567bf58ff 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -50,7 +50,7 @@ SparcELFMCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
                                                MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return Sparc::createSpecifierExpr(Ctx, Sym, ELF::R_SPARC_DISP32);
+    return MCSpecifierExpr::create(Sym, ELF::R_SPARC_DISP32, Ctx);
   }
 
   return MCAsmInfo::getExprForPersonalitySymbol(Sym, Encoding, Streamer);
@@ -62,7 +62,7 @@ SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
                                        MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return Sparc::createSpecifierExpr(Ctx, Sym, ELF::R_SPARC_DISP32);
+    return MCSpecifierExpr::create(Sym, ELF::R_SPARC_DISP32, Ctx);
   }
   return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
 }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 6a08fa5c9f3f..6d43b9371390 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -22,16 +22,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "sparcmcexpr"
 
-const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
-                                              const MCExpr *Expr, uint16_t S) {
-  return new (Ctx) MCSpecifierExpr(Expr, S);
-}
-
-const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
-                                              const MCSymbol *Sym, uint16_t S) {
-  return new (Ctx) MCSpecifierExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
-}
-
 StringRef Sparc::getSpecifierName(uint16_t S) {
   // clang-format off
   switch (uint16_t(S)) {
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 78af9a815020..8e7c173c70cc 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -23,10 +23,6 @@ class StringRef;
 using SparcMCExpr = MCSpecifierExpr;
 
 namespace Sparc {
-const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCExpr *Expr,
-                                       uint16_t S);
-const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCSymbol *Sym,
-                                       uint16_t S);
 uint16_t parseSpecifier(StringRef name);
 StringRef getSpecifierName(uint16_t S);
 } // namespace Sparc
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index ffefdf97edab..dab2de7d56c0 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -82,7 +82,7 @@ public:
 static MCOperand createSparcMCOperand(uint16_t Kind, MCSymbol *Sym,
                                       MCContext &OutContext) {
   const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext);
-  const SparcMCExpr *expr = Sparc::createSpecifierExpr(OutContext, MCSym, Kind);
+  const SparcMCExpr *expr = MCSpecifierExpr::create(MCSym, Kind, OutContext);
   return MCOperand::createExpr(expr);
 }
 static MCOperand createPCXCallOP(MCSymbol *Label,
@@ -101,7 +101,7 @@ static MCOperand createPCXRelExprOp(uint16_t Spec, MCSymbol *GOTLabel,
 
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext);
   const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext);
-  const SparcMCExpr *expr = Sparc::createSpecifierExpr(OutContext, Add, Spec);
+  const SparcMCExpr *expr = MCSpecifierExpr::create(Add, Spec, OutContext);
   return MCOperand::createExpr(expr);
 }
 
@@ -302,7 +302,7 @@ MCOperand SparcAsmPrinter::lowerOperand(const MachineOperand &MO) const {
 
     const MCExpr *expr = MCSymbolRefExpr::create(Symbol, OutContext);
     if (RelType)
-      expr = Sparc::createSpecifierExpr(OutContext, expr, RelType);
+      expr = MCSpecifierExpr::create(expr, RelType, OutContext);
     return MCOperand::createExpr(expr);
   }
 
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index be11ea272ed1..a42a67d91d84 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -39,7 +39,7 @@ const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
     }
 
     MCContext &Ctx = getContext();
-    return Sparc::createSpecifierExpr(Ctx, SSym, ELF::R_SPARC_DISP32);
+    return MCSpecifierExpr::create(SSym, ELF::R_SPARC_DISP32, Ctx);
   }
 
   return TargetLoweringObjectFileELF::getTTypeGlobalReference(GV, Encoding, TM,

From 72de33a406383cb8555234c40e7b31db593e164f Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 11:52:43 -0700
Subject: [PATCH 0411/1322] MC: Add MCAsmInfo::evaluateAsRelocatableImpl and
 replace VEMCExpr with MCSpecifierExpr

Expressions with specifier can only be folded during relocation
generatin. At parse time the `MCAssembler *` argument might be null, and
targets should not rely on the evaluateAsRelocatable result.

Therefore, we can move evaluateAsRelocatableImpl from MCSpecifierExpr to
MCAsmInfo, so that targets do not need to inherit from MCSpecifierExpr.
---
 llvm/include/llvm/MC/MCAsmInfo.h               |  4 ++++
 llvm/lib/MC/MCAsmInfo.cpp                      |  8 ++++++++
 llvm/lib/MC/MCExpr.cpp                         |  5 ++++-
 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp   | 14 +++++++-------
 .../VE/MCTargetDesc/VEELFObjectWriter.cpp      |  2 +-
 .../lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp | 10 ++++++++++
 llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h  |  2 ++
 .../Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp |  2 +-
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp   | 13 -------------
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h     | 15 ---------------
 llvm/lib/Target/VE/VEAsmPrinter.cpp            | 18 ++++++++----------
 llvm/lib/Target/VE/VEMCInstLower.cpp           |  4 ++--
 12 files changed, 47 insertions(+), 50 deletions(-)

diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 1f2ea0cfaaff..a7bf1b965bf2 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -25,6 +25,7 @@
 
 namespace llvm {
 
+class MCAssembler;
 class MCContext;
 class MCCFIInstruction;
 class MCExpr;
@@ -33,6 +34,7 @@ class MCSection;
 class MCStreamer;
 class MCSubtargetInfo;
 class MCSymbol;
+class MCValue;
 class raw_ostream;
 
 namespace WinEH {
@@ -714,6 +716,8 @@ public:
 
   void printExpr(raw_ostream &, const MCExpr &) const;
   virtual void printSpecifierExpr(raw_ostream &, const MCSpecifierExpr &) const;
+  virtual bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &Res,
+                                         const MCAssembler *Asm) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp
index 13b077349a58..e8eaf4619df5 100644
--- a/llvm/lib/MC/MCAsmInfo.cpp
+++ b/llvm/lib/MC/MCAsmInfo.cpp
@@ -163,3 +163,11 @@ void MCAsmInfo::printSpecifierExpr(raw_ostream &OS,
   // migrate to MCAsmInfo::printSpecifierExpr.
   Expr.printImpl(OS, this);
 }
+
+bool MCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                          MCValue &Res,
+                                          const MCAssembler *Asm) const {
+  // TODO: Remove after all targets that use MCSpecifierExpr migrate to
+  // MCAsmInfo::evaluateAsRelocatableImpl.
+  return Expr.evaluateAsRelocatableImpl(Res, Asm);
+}
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index e83ce05b37a8..5ccad6d48797 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -680,7 +680,10 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
     return true;
   }
   case Specifier:
-    return cast<MCSpecifierExpr>(this)->evaluateAsRelocatableImpl(Res, Asm);
+    // Fold the expression during relocation generation. As parse time Asm might
+    // be null, and targets should not rely on the folding.
+    return Asm && Asm->getContext().getAsmInfo()->evaluateAsRelocatableImpl(
+                      cast<MCSpecifierExpr>(*this), Res, Asm);
   }
 
   llvm_unreachable("Invalid assembly expression kind!");
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index 418587947e1e..c54ce40de45f 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -73,7 +73,7 @@ class VEAsmParser : public MCTargetAsmParser {
   ParseStatus parseVEAsmOperand(std::unique_ptr<VEOperand> &Operand);
 
   // Helper function to parse expression with a symbol.
-  const MCExpr *extractSpecifier(const MCExpr *E, VEMCExpr::Specifier &Variant);
+  const MCExpr *extractSpecifier(const MCExpr *E, VE::Specifier &Variant);
   bool parseExpression(const MCExpr *&EVal);
 
   // Split the mnemonic stripping conditional code and quantifiers
@@ -1036,11 +1036,11 @@ bool VEAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
 /// Extract \code @lo32/@hi32/etc \endcode specifier from expression.
 /// Recursively scan the expression and check for VK_HI32/LO32/etc
 /// symbol variants.  If all symbols with modifier use the same
-/// variant, return the corresponding VEMCExpr::Specifier,
+/// variant, return the corresponding VE::Specifier,
 /// and a modified expression using the default symbol variant.
 /// Otherwise, return NULL.
 const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
-                                            VEMCExpr::Specifier &Variant) {
+                                            VE::Specifier &Variant) {
   MCContext &Context = getParser().getContext();
   Variant = VE::S_None;
 
@@ -1118,7 +1118,7 @@ const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
 
   case MCExpr::Binary: {
     const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
-    VEMCExpr::Specifier LHSVariant, RHSVariant;
+    VE::Specifier LHSVariant, RHSVariant;
     const MCExpr *LHS = extractSpecifier(BE->getLHS(), LHSVariant);
     const MCExpr *RHS = extractSpecifier(BE->getRHS(), RHSVariant);
 
@@ -1153,11 +1153,11 @@ bool VEAsmParser::parseExpression(const MCExpr *&EVal) {
   if (getParser().parseExpression(EVal))
     return true;
 
-  // Convert MCSymbolRefExpr with VK_* to MCExpr with VK_*.
-  VEMCExpr::Specifier Specifier;
+  // Convert MCSymbolRefExpr with specifier to MCSpecifierExpr.
+  VE::Specifier Specifier;
   const MCExpr *E = extractSpecifier(EVal, Specifier);
   if (E)
-    EVal = VEMCExpr::create(Specifier, E, getParser().getContext());
+    EVal = MCSpecifierExpr::create(E, Specifier, getParser().getContext());
 
   return false;
 }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
index bdedde505295..0e3f5d18de07 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
@@ -50,7 +50,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
   default:
     break;
   }
-  if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Fixup.getValue())) {
+  if (auto *SExpr = dyn_cast<MCSpecifierExpr>(Fixup.getValue())) {
     if (SExpr->getSpecifier() == VE::S_PC_LO32)
       return ELF::R_VE_PC_LO32;
   }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
index ac580f79a77b..8eb3aedd668e 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
@@ -14,6 +14,7 @@
 #include "VEMCExpr.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -64,3 +65,12 @@ void VEELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
   if (specifier && specifier != VE::S_REFLONG)
     OS << '@' << getSpecifierName(specifier);
 }
+
+bool VEELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                               MCValue &Res,
+                                               const MCAssembler *Asm) const {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(Expr.getSpecifier());
+  return true;
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
index 444f422c7ec1..2d73c94e2113 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
@@ -26,6 +26,8 @@ public:
   explicit VEELFMCAsmInfo(const Triple &TheTriple);
   void printSpecifierExpr(raw_ostream &OS,
                           const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
index c3fae1a0c77d..712de5accce5 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
@@ -98,7 +98,7 @@ unsigned VEMCCodeEmitter::getMachineOpValue(const MCInst &MI,
   assert(MO.isExpr());
 
   const MCExpr *Expr = MO.getExpr();
-  if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Expr)) {
+  if (const auto *SExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
     auto Kind = VE::getFixupKind(SExpr->getSpecifier());
     Fixups.push_back(MCFixup::create(0, Expr, Kind));
     return 0;
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
index ed0eafc75888..ca13aba095e2 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
@@ -22,11 +22,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "vemcexpr"
 
-const VEMCExpr *VEMCExpr::create(Specifier S, const MCExpr *Expr,
-                                 MCContext &Ctx) {
-  return new (Ctx) VEMCExpr(Expr, S);
-}
-
 VE::Fixups VE::getFixupKind(uint8_t S) {
   switch (S) {
   default:
@@ -63,11 +58,3 @@ VE::Fixups VE::getFixupKind(uint8_t S) {
     return VE::fixup_ve_tpoff_lo32;
   }
 }
-
-bool VEMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                         const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(specifier);
-  return true;
-}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
index d4e0f77c8ece..b7913513bd51 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
@@ -20,21 +20,6 @@
 namespace llvm {
 
 class StringRef;
-class VEMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint8_t;
-
-private:
-  explicit VEMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const VEMCExpr *create(Specifier Kind, const MCExpr *Expr,
-                                MCContext &Ctx);
-
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-};
 
 namespace VE {
 enum Specifier {
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index f0d6f5226854..af0dc0404d3c 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -67,18 +67,17 @@ public:
 };
 } // end of anonymous namespace
 
-static MCOperand createVEMCOperand(VEMCExpr::Specifier Kind, MCSymbol *Sym,
+static MCOperand createVEMCOperand(VE::Specifier Kind, MCSymbol *Sym,
                                    MCContext &OutContext) {
   const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext);
-  const VEMCExpr *expr = VEMCExpr::create(Kind, MCSym, OutContext);
-  return MCOperand::createExpr(expr);
+  return MCOperand::createExpr(
+      MCSpecifierExpr::create(MCSym, Kind, OutContext));
 }
 
-static MCOperand createGOTRelExprOp(VEMCExpr::Specifier Kind,
-                                    MCSymbol *GOTLabel, MCContext &OutContext) {
+static MCOperand createGOTRelExprOp(VE::Specifier Kind, MCSymbol *GOTLabel,
+                                    MCContext &OutContext) {
   const MCSymbolRefExpr *GOT = MCSymbolRefExpr::create(GOTLabel, OutContext);
-  const VEMCExpr *expr = VEMCExpr::create(Kind, GOT, OutContext);
-  return MCOperand::createExpr(expr);
+  return MCOperand::createExpr(MCSpecifierExpr::create(GOT, Kind, OutContext));
 }
 
 static void emitSIC(MCStreamer &OutStreamer, MCOperand &RD,
@@ -166,9 +165,8 @@ static void emitANDrm(MCStreamer &OutStreamer, MCOperand &RS1, MCOperand &Imm,
 }
 
 static void emitHiLo(MCStreamer &OutStreamer, MCSymbol *GOTSym,
-                     VEMCExpr::Specifier HiKind, VEMCExpr::Specifier LoKind,
-                     MCOperand &RD, MCContext &OutContext,
-                     const MCSubtargetInfo &STI) {
+                     VE::Specifier HiKind, VE::Specifier LoKind, MCOperand &RD,
+                     MCContext &OutContext, const MCSubtargetInfo &STI) {
 
   MCOperand hi = createVEMCOperand(HiKind, GOTSym, OutContext);
   MCOperand lo = createVEMCOperand(LoKind, GOTSym, OutContext);
diff --git a/llvm/lib/Target/VE/VEMCInstLower.cpp b/llvm/lib/Target/VE/VEMCInstLower.cpp
index bed71df3921c..a438d8740cd0 100644
--- a/llvm/lib/Target/VE/VEMCInstLower.cpp
+++ b/llvm/lib/Target/VE/VEMCInstLower.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
 static MCOperand LowerSymbolOperand(const MachineInstr *MI,
                                     const MachineOperand &MO,
                                     const MCSymbol *Symbol, AsmPrinter &AP) {
-  VEMCExpr::Specifier Kind = (VEMCExpr::Specifier)MO.getTargetFlags();
+  auto Kind = (VE::Specifier)MO.getTargetFlags();
 
   const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, AP.OutContext);
   // Add offset iff MO is not jump table info or machine basic block.
@@ -36,7 +36,7 @@ static MCOperand LowerSymbolOperand(const MachineInstr *MI,
     Expr = MCBinaryExpr::createAdd(
         Expr, MCConstantExpr::create(MO.getOffset(), AP.OutContext),
         AP.OutContext);
-  Expr = VEMCExpr::create(Kind, Expr, AP.OutContext);
+  Expr = MCSpecifierExpr::create(Expr, Kind, AP.OutContext);
   return MCOperand::createExpr(Expr);
 }
 

From 490d7bb89a029edd037ed5e46747d0085a649ee8 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 12:18:46 -0700
Subject: [PATCH 0412/1322] Xtensa: Remove unneeded XtensaMCExpr::create calls

MCSpecifierExpr and its subclasses should only be used with the
relocation specifier is not zero.
---
 llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp | 12 +++---------
 llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp          |  1 -
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
index e0bbbc79b201..1f6cfec8edf4 100644
--- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
+++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
@@ -393,9 +393,7 @@ bool XtensaAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
   case Xtensa::L32R: {
     const MCSymbolRefExpr *OpExpr =
         static_cast<const MCSymbolRefExpr *>(Inst.getOperand(1).getExpr());
-    XtensaMCExpr::Specifier Kind = XtensaMCExpr::VK_None;
-    const MCExpr *NewOpExpr = XtensaMCExpr::create(OpExpr, Kind, getContext());
-    Inst.getOperand(1).setExpr(NewOpExpr);
+    Inst.getOperand(1).setExpr(OpExpr);
     break;
   }
   case Xtensa::MOVI: {
@@ -413,10 +411,8 @@ bool XtensaAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
         const MCExpr *Value = MCConstantExpr::create(ImmOp64, getContext());
         MCSymbol *Sym = getContext().createTempSymbol();
         const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
-        const MCExpr *OpExpr =
-            XtensaMCExpr::create(Expr, XtensaMCExpr::VK_None, getContext());
         TmpInst.addOperand(Inst.getOperand(0));
-        MCOperand Op1 = MCOperand::createExpr(OpExpr);
+        MCOperand Op1 = MCOperand::createExpr(Expr);
         TmpInst.addOperand(Op1);
         TS.emitLiteral(Sym, Value, true, IDLoc);
         Inst = TmpInst;
@@ -428,10 +424,8 @@ bool XtensaAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
       const MCExpr *Value = Inst.getOperand(1).getExpr();
       MCSymbol *Sym = getContext().createTempSymbol();
       const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
-      const MCExpr *OpExpr =
-          XtensaMCExpr::create(Expr, XtensaMCExpr::VK_None, getContext());
       TmpInst.addOperand(Inst.getOperand(0));
-      MCOperand Op1 = MCOperand::createExpr(OpExpr);
+      MCOperand Op1 = MCOperand::createExpr(Expr);
       TmpInst.addOperand(Op1);
       Inst = TmpInst;
       TS.emitLiteral(Sym, Value, true, IDLoc);
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
index 9182ea272bef..4f3a2e791a3c 100644
--- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
@@ -257,7 +257,6 @@ XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO,
   }
 
   const MCExpr *ME = MCSymbolRefExpr::create(Symbol, OutContext);
-  ME = XtensaMCExpr::create(ME, Kind, OutContext);
 
   if (Offset) {
     // Assume offset is never negative.

From cf9665dd2bcef3ff2f3e22d3f44e8603f4ba9577 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 12:32:10 -0700
Subject: [PATCH 0413/1322] Xtensa: Migrate to newer relocation specifier
 representation

* Rename specifier constants from XtensaMCExpr::Specifier::VK_ to
  Xtensa::S_, following Sparc and VE.
* Use MCAsmInfo::printSpecifierExpr instead of MCExpr::print.
* Remove unneeded XtensaMCExpr. Just use MCSpecifierExpr when a
  specifier is needed.
---
 .../Target/Xtensa/MCTargetDesc/CMakeLists.txt |  1 -
 .../Xtensa/MCTargetDesc/XtensaInstPrinter.cpp | 21 +++-----
 .../Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp   | 21 ++++++++
 .../Xtensa/MCTargetDesc/XtensaMCAsmInfo.h     |  3 ++
 .../Xtensa/MCTargetDesc/XtensaMCExpr.cpp      | 52 -------------------
 .../Target/Xtensa/MCTargetDesc/XtensaMCExpr.h | 21 ++------
 llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp   | 13 ++---
 7 files changed, 40 insertions(+), 92 deletions(-)
 delete mode 100644 llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp

diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Xtensa/MCTargetDesc/CMakeLists.txt
index dc12863394c7..6c5a6bef5e24 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/CMakeLists.txt
@@ -4,7 +4,6 @@ add_llvm_component_library(LLVMXtensaDesc
   XtensaInstPrinter.cpp
   XtensaMCAsmInfo.cpp
   XtensaMCCodeEmitter.cpp
-  XtensaMCExpr.cpp
   XtensaMCTargetDesc.cpp
   XtensaTargetStreamer.cpp
 
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
index fc5e1780de2e..408a6ac01de9 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
@@ -15,6 +15,7 @@
 #include "XtensaInstPrinter.h"
 #include "MCTargetDesc/XtensaMCExpr.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegister.h"
@@ -35,14 +36,7 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
   if (!(SRE = cast<MCSymbolRefExpr>(Expr)))
     assert(false && "Unexpected MCExpr type.");
 
-  auto Spec = XtensaMCExpr::Specifier(SRE->getKind());
-  switch (Spec) {
-  case XtensaMCExpr::VK_None:
-    break;
-  // TODO
-  default:
-    report_fatal_error("Invalid kind!");
-  }
+  assert(SRE->getSpecifier() == 0);
 
   OS << SRE->getSymbol();
 
@@ -51,9 +45,6 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
       OS << '+';
     OS << Offset;
   }
-
-  if (Spec != XtensaMCExpr::VK_None)
-    OS << ')';
 }
 
 void XtensaInstPrinter::printOperand(const MCOperand &MC, raw_ostream &O) {
@@ -97,7 +88,7 @@ void XtensaInstPrinter::printBranchTarget(const MCInst *MI, uint64_t Address,
     int64_t Val = MC.getImm() + 4;
     printPCRelImm(Address, Val, O);
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }
@@ -109,7 +100,7 @@ void XtensaInstPrinter::printLoopTarget(const MCInst *MI, uint64_t Address,
     int64_t Val = MC.getImm() + 4;
     printPCRelImm(Address, Val, O);
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI, true);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }
@@ -121,7 +112,7 @@ void XtensaInstPrinter::printJumpTarget(const MCInst *MI, uint64_t Address,
     int64_t Val = MC.getImm() + 4;
     printPCRelImm(Address, Val, O);
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
   ;
@@ -168,7 +159,7 @@ void XtensaInstPrinter::printL32RTarget(const MCInst *MI, uint64_t Address,
       printPCRelImm(Address, Value, O);
     }
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp
index 28764d369247..0b20f2e14a84 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp
@@ -13,6 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "XtensaMCAsmInfo.h"
+#include "XtensaMCExpr.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -30,3 +32,22 @@ XtensaMCAsmInfo::XtensaMCAsmInfo(const Triple &TT) {
   ExceptionsType = ExceptionHandling::DwarfCFI;
   AlignmentIsInBytes = false;
 }
+
+void XtensaMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                         const MCSpecifierExpr &Expr) const {
+  StringRef S = Xtensa::getSpecifierName(Expr.getSpecifier());
+  if (!S.empty())
+    OS << '%' << S << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  if (!S.empty())
+    OS << ')';
+}
+
+uint8_t Xtensa::parseSpecifier(StringRef name) { return 0; }
+
+StringRef Xtensa::getSpecifierName(uint8_t S) {
+  switch (S) {
+  default:
+    llvm_unreachable("Invalid ELF symbol kind");
+  }
+}
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.h
index a86a95f6be37..6f6f4bcb7047 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.h
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.h
@@ -23,6 +23,9 @@ class Triple;
 class XtensaMCAsmInfo : public MCAsmInfoELF {
 public:
   explicit XtensaMCAsmInfo(const Triple &TT);
+
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp
deleted file mode 100644
index f7f92e1646c3..000000000000
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===-- XtensaMCExpr.cpp - Xtensa specific MC expression classes ----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the assembly expression modifiers
-// accepted by the Xtensa architecture
-//
-//===----------------------------------------------------------------------===//
-
-#include "XtensaMCExpr.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Object/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "xtensamcexpr"
-
-const XtensaMCExpr *XtensaMCExpr::create(const MCExpr *Expr, Specifier S,
-                                         MCContext &Ctx) {
-  return new (Ctx) XtensaMCExpr(Expr, S);
-}
-
-void XtensaMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  bool HasSpecifier = getSpecifier() != VK_None;
-  if (HasSpecifier)
-    OS << '%' << getSpecifierName(getSpecifier()) << '(';
-  Expr->print(OS, MAI);
-  if (HasSpecifier)
-    OS << ')';
-}
-
-XtensaMCExpr::Specifier XtensaMCExpr::parseSpecifier(StringRef name) {
-  return StringSwitch<XtensaMCExpr::Specifier>(name).Default(VK_None);
-}
-
-StringRef XtensaMCExpr::getSpecifierName(Specifier S) {
-  switch (S) {
-  default:
-    llvm_unreachable("Invalid ELF symbol kind");
-  }
-}
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.h
index 54b5ad30516b..5a7b1ee9880f 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.h
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.h
@@ -20,24 +20,13 @@
 namespace llvm {
 
 class StringRef;
-class XtensaMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint16_t;
-  enum { VK_None, VK_TPOFF };
 
-private:
-  explicit XtensaMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
+namespace Xtensa {
+enum Specifier { S_None, S_TPOFF };
 
-public:
-  static const XtensaMCExpr *create(const MCExpr *Expr, Specifier,
-                                    MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-
-  static Specifier parseSpecifier(StringRef name);
-  static StringRef getSpecifierName(Specifier Kind);
-};
+uint8_t parseSpecifier(StringRef name);
+StringRef getSpecifierName(uint8_t S);
+} // namespace Xtensa
 
 } // end namespace llvm.
 
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
index 4f3a2e791a3c..4e3ed4b9e8ee 100644
--- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
@@ -32,13 +32,13 @@
 
 using namespace llvm;
 
-static XtensaMCExpr::Specifier
+static Xtensa::Specifier
 getModifierSpecifier(XtensaCP::XtensaCPModifier Modifier) {
   switch (Modifier) {
   case XtensaCP::no_modifier:
-    return XtensaMCExpr::VK_None;
+    return Xtensa::S_None;
   case XtensaCP::TPOFF:
-    return XtensaMCExpr::VK_TPOFF;
+    return Xtensa::S_TPOFF;
   }
   report_fatal_error("Invalid XtensaCPModifier!");
 }
@@ -92,7 +92,7 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue(
   MCSymbol *LblSym = GetCPISymbol(ACPV->getLabelId());
   auto *TS =
       static_cast<XtensaTargetStreamer *>(OutStreamer->getTargetStreamer());
-  XtensaMCExpr::Specifier VK = getModifierSpecifier(ACPV->getModifier());
+  auto Spec = getModifierSpecifier(ACPV->getModifier());
 
   if (ACPV->getModifier() != XtensaCP::no_modifier) {
     std::string SymName(MCSym->getName());
@@ -101,7 +101,7 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue(
     MCSym = OutContext.getOrCreateSymbol(SymName);
   }
 
-  const MCExpr *Expr = MCSymbolRefExpr::create(MCSym, VK, OutContext);
+  const MCExpr *Expr = MCSymbolRefExpr::create(MCSym, Spec, OutContext);
   TS->emitLiteral(LblSym, Expr, false);
 }
 
@@ -227,8 +227,6 @@ XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO,
                                      MachineOperand::MachineOperandType MOTy,
                                      unsigned Offset) const {
   const MCSymbol *Symbol;
-  XtensaMCExpr::Specifier Kind = XtensaMCExpr::VK_None;
-
   switch (MOTy) {
   case MachineOperand::MO_GlobalAddress:
     Symbol = getSymbol(MO.getGlobal());
@@ -257,7 +255,6 @@ XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO,
   }
 
   const MCExpr *ME = MCSymbolRefExpr::create(Symbol, OutContext);
-
   if (Offset) {
     // Assume offset is never negative.
     assert(Offset > 0);

From 7c22612b2948d8657b4a22ce59870ddd708c4677 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 12:43:37 -0700
Subject: [PATCH 0414/1322] SPARC: Remove dead specifier code from asm operand
 printer

We don't currently print %specifier( ) for asm operands.
The old code was also incorrect - as it did not print "(".
---
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index dab2de7d56c0..f4201f9a8dc1 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -371,11 +371,7 @@ void SparcAsmPrinter::emitFunctionBodyStart() {
 void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                    raw_ostream &O) {
   const DataLayout &DL = getDataLayout();
-  const MachineOperand &MO = MI->getOperand (opNum);
-  auto TF = MO.getTargetFlags();
-
-  StringRef Spec = Sparc::getSpecifierName(TF);
-  O << Spec;
+  const MachineOperand &MO = MI->getOperand(opNum);
   switch (MO.getType()) {
   case MachineOperand::MO_Register:
     O << "%" << StringRef(getRegisterName(MO.getReg())).lower();
@@ -406,8 +402,6 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
   default:
     llvm_unreachable("<unknown operand type>");
   }
-  if (!Spec.empty())
-    O << ")";
 }
 
 void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,

From d3e9e2d433a666d6620afb00a1533ef4937c667f Mon Sep 17 00:00:00 2001
From: Ross Kirsling <ross.kirsling@sony.com>
Date: Sun, 15 Jun 2025 12:45:08 -0700
Subject: [PATCH 0415/1322] [Clang] Fix typo in is_replaceable diagnostic
 (#144247)

Adjustment to #143265; `because it not` should be `because it is not`.
---
 clang/include/clang/Basic/DiagnosticSemaKinds.td     | 2 +-
 clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 8fe7ad6138aa..979ff60b73b7 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1779,7 +1779,7 @@ def note_unsatisfied_trait_reason
            "%HasArcLifetime{has an ARC lifetime qualifier}|"
            "%VLA{is a variably-modified type}|"
            "%VBase{has a virtual base %1}|"
-           "%NotScalarOrClass{not %select{a|an array of objects of}1 scalar or "
+           "%NotScalarOrClass{is not %select{a|an array of objects of}1 scalar or "
            "class type}|"
            "%NTRBase{has a non-trivially-relocatable base %1}|"
            "%NTRField{has a non-trivially-relocatable member %1 of type %2}|"
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
index a8c78f6304ca..5210354a66d4 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
@@ -166,7 +166,7 @@ static_assert(__builtin_is_replaceable(const volatile int));
 static_assert(__builtin_is_replaceable(void()));
 // expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(void ())}} \
 // expected-note@-1 {{'void ()' is not replaceable}} \
-// expected-note@-1 {{because it not a scalar or class type}}
+// expected-note@-1 {{because it is not a scalar or class type}}
 
 struct B {
  virtual ~B();

From 5cf138a68744904562e81436181df668b00cdb1f Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 12:53:30 -0700
Subject: [PATCH 0416/1322] M68k: Replace M68kMCExpr::VK_ to M68k::S_

Prepare for removing VEMCExpr. Adopt the newer naming convention adopted
by AMDGPU/WebAssembly/VE.
---
 llvm/lib/Target/M68k/M68kISelLowering.cpp     |  2 +-
 llvm/lib/Target/M68k/M68kMCInstLower.cpp      | 20 +++++------
 .../M68k/MCTargetDesc/M68kELFObjectWriter.cpp | 32 ++++++++---------
 .../M68k/MCTargetDesc/M68kMCAsmInfo.cpp       | 12 +++----
 .../Target/M68k/MCTargetDesc/M68kMCExpr.cpp   |  2 +-
 .../lib/Target/M68k/MCTargetDesc/M68kMCExpr.h | 34 +++++++++----------
 6 files changed, 48 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index 9d3ab606ab8c..c1860fa88a83 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -2833,7 +2833,7 @@ unsigned M68kTargetLowering::getJumpTableEncoding() const {
 const MCExpr *M68kTargetLowering::LowerCustomJumpTableEntry(
     const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
     unsigned uid, MCContext &Ctx) const {
-  return MCSymbolRefExpr::create(MBB->getSymbol(), M68kMCExpr::VK_GOTOFF, Ctx);
+  return MCSymbolRefExpr::create(MBB->getSymbol(), M68k::S_GOTOFF, Ctx);
 }
 
 SDValue M68kTargetLowering::getPICJumpTableRelocBase(SDValue Table,
diff --git a/llvm/lib/Target/M68k/M68kMCInstLower.cpp b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
index 8698fc0de471..b256d56c032c 100644
--- a/llvm/lib/Target/M68k/M68kMCInstLower.cpp
+++ b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
@@ -76,7 +76,7 @@ MCOperand M68kMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   // FIXME We would like an efficient form for this, so we don't have to do a
   // lot of extra uniquing. This fixme is originally from X86
   const MCExpr *Expr = nullptr;
-  M68kMCExpr::Specifier RefKind = M68kMCExpr::VK_None;
+  M68k::Specifier RefKind = M68k::S_None;
 
   switch (MO.getTargetFlags()) {
   default:
@@ -86,31 +86,31 @@ MCOperand M68kMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   case M68kII::MO_PC_RELATIVE_ADDRESS:
     break;
   case M68kII::MO_GOTPCREL:
-    RefKind = M68kMCExpr::VK_GOTPCREL;
+    RefKind = M68k::S_GOTPCREL;
     break;
   case M68kII::MO_GOT:
-    RefKind = M68kMCExpr::VK_GOT;
+    RefKind = M68k::S_GOT;
     break;
   case M68kII::MO_GOTOFF:
-    RefKind = M68kMCExpr::VK_GOTOFF;
+    RefKind = M68k::S_GOTOFF;
     break;
   case M68kII::MO_PLT:
-    RefKind = M68kMCExpr::VK_PLT;
+    RefKind = M68k::S_PLT;
     break;
   case M68kII::MO_TLSGD:
-    RefKind = M68kMCExpr::VK_TLSGD;
+    RefKind = M68k::S_TLSGD;
     break;
   case M68kII::MO_TLSLD:
-    RefKind = M68kMCExpr::VK_TLSLD;
+    RefKind = M68k::S_TLSLD;
     break;
   case M68kII::MO_TLSLDM:
-    RefKind = M68kMCExpr::VK_TLSLDM;
+    RefKind = M68k::S_TLSLDM;
     break;
   case M68kII::MO_TLSIE:
-    RefKind = M68kMCExpr::VK_GOTTPOFF;
+    RefKind = M68k::S_GOTTPOFF;
     break;
   case M68kII::MO_TLSLE:
-    RefKind = M68kMCExpr::VK_TPOFF;
+    RefKind = M68k::S_TPOFF;
     break;
   }
 
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
index 1a61325008aa..3f7593cf4352 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
@@ -45,7 +45,7 @@ M68kELFObjectWriter::~M68kELFObjectWriter() {}
 
 enum M68kRelType { RT_32, RT_16, RT_8 };
 
-static M68kRelType getType(unsigned Kind, M68kMCExpr::Specifier &Modifier,
+static M68kRelType getType(unsigned Kind, M68k::Specifier &Modifier,
                            bool &IsPCRel) {
   switch (Kind) {
   case FK_Data_4:
@@ -64,15 +64,15 @@ static M68kRelType getType(unsigned Kind, M68kMCExpr::Specifier &Modifier,
 unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                            const MCValue &Target,
                                            bool IsPCRel) const {
-  auto Specifier = M68kMCExpr::Specifier(Target.getSpecifier());
+  auto Specifier = M68k::Specifier(Target.getSpecifier());
   unsigned Kind = Fixup.getKind();
   M68kRelType Type = getType(Kind, Specifier, IsPCRel);
   switch (Specifier) {
-  case M68kMCExpr::VK_GOTTPOFF:
-  case M68kMCExpr::VK_TLSGD:
-  case M68kMCExpr::VK_TLSLD:
-  case M68kMCExpr::VK_TLSLDM:
-  case M68kMCExpr::VK_TPOFF:
+  case M68k::S_GOTTPOFF:
+  case M68k::S_TLSGD:
+  case M68k::S_TLSLD:
+  case M68k::S_TLSLDM:
+  case M68k::S_TPOFF:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -84,7 +84,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
   default:
     llvm_unreachable("Unimplemented");
 
-  case M68kMCExpr::VK_TLSGD:
+  case M68k::S_TLSGD:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_GD32;
@@ -94,7 +94,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_GD8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_TLSLDM:
+  case M68k::S_TLSLDM:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_LDM32;
@@ -104,7 +104,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_LDM8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_TLSLD:
+  case M68k::S_TLSLD:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_LDO32;
@@ -114,7 +114,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_LDO8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_GOTTPOFF:
+  case M68k::S_GOTTPOFF:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_IE32;
@@ -124,7 +124,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_IE8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_TPOFF:
+  case M68k::S_TPOFF:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_LE32;
@@ -134,7 +134,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_LE8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_None:
+  case M68k::S_None:
     switch (Type) {
     case RT_32:
       return IsPCRel ? ELF::R_68K_PC32 : ELF::R_68K_32;
@@ -144,7 +144,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return IsPCRel ? ELF::R_68K_PC8 : ELF::R_68K_8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_GOTPCREL:
+  case M68k::S_GOTPCREL:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_GOTPCREL32;
@@ -154,7 +154,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_GOTPCREL8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_GOTOFF:
+  case M68k::S_GOTOFF:
     assert(!IsPCRel);
     switch (Type) {
     case RT_32:
@@ -165,7 +165,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_GOTOFF8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_PLT:
+  case M68k::S_PLT:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_PLT32;
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
index ba1b0dc2bb09..8259546fbae5 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
@@ -20,14 +20,10 @@
 using namespace llvm;
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {M68kMCExpr::VK_GOTOFF, "GOTOFF"},
-    {M68kMCExpr::VK_GOTPCREL, "GOTPCREL"},
-    {M68kMCExpr::VK_GOTTPOFF, "GOTTPOFF"},
-    {M68kMCExpr::VK_PLT, "PLT"},
-    {M68kMCExpr::VK_TLSGD, "TLSGD"},
-    {M68kMCExpr::VK_TLSLD, "TLSLD"},
-    {M68kMCExpr::VK_TLSLDM, "TLSLDM"},
-    {M68kMCExpr::VK_TPOFF, "TPOFF"},
+    {M68k::S_GOTOFF, "GOTOFF"},     {M68k::S_GOTPCREL, "GOTPCREL"},
+    {M68k::S_GOTTPOFF, "GOTTPOFF"}, {M68k::S_PLT, "PLT"},
+    {M68k::S_TLSGD, "TLSGD"},       {M68k::S_TLSLD, "TLSLD"},
+    {M68k::S_TLSLDM, "TLSLDM"},     {M68k::S_TPOFF, "TPOFF"},
 };
 
 void M68kELFMCAsmInfo::anchor() {}
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
index 22d8da263cea..18301d7ea9b3 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
@@ -13,7 +13,7 @@
 
 using namespace llvm;
 
-const M68kMCExpr *M68kMCExpr::create(const MCExpr *Expr, Specifier S,
+const M68kMCExpr *M68kMCExpr::create(const MCExpr *Expr, Spec S,
                                      MCContext &Ctx) {
   return new (Ctx) M68kMCExpr(Expr, S);
 }
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
index 02bffdcb2889..39a2898e2eda 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
@@ -19,30 +19,28 @@
 namespace llvm {
 
 class M68kMCExpr : public MCSpecifierExpr {
-public:
-  enum Specifier {
-    VK_None,
-
-    VK_GOT = MCSymbolRefExpr::FirstTargetSpecifier,
-    VK_GOTOFF,
-    VK_GOTPCREL,
-    VK_GOTTPOFF,
-    VK_PLT,
-    VK_TLSGD,
-    VK_TLSLD,
-    VK_TLSLDM,
-    VK_TPOFF,
-  };
-
 protected:
-  explicit M68kMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
+  explicit M68kMCExpr(const MCExpr *Expr, Spec S) : MCSpecifierExpr(Expr, S) {}
 
 public:
-  static const M68kMCExpr *create(const MCExpr *, Specifier, MCContext &);
+  static const M68kMCExpr *create(const MCExpr *, Spec, MCContext &);
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
 };
+namespace M68k {
+enum Specifier {
+  S_None,
+  S_GOT,
+  S_GOTOFF,
+  S_GOTPCREL,
+  S_GOTTPOFF,
+  S_PLT,
+  S_TLSGD,
+  S_TLSLD,
+  S_TLSLDM,
+  S_TPOFF,
+};
+}
 } // namespace llvm
 
 #endif

From 444c6ae530e4814af2cfd6918e3f852ef14ff50d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 13:02:41 -0700
Subject: [PATCH 0417/1322] M68k: Remove M68kMCExpr

---
 llvm/lib/Target/M68k/M68kISelLowering.cpp     |  2 +-
 llvm/lib/Target/M68k/M68kMCInstLower.cpp      |  2 +-
 .../Target/M68k/MCTargetDesc/CMakeLists.txt   |  1 -
 .../M68k/MCTargetDesc/M68kELFObjectWriter.cpp |  2 +-
 .../M68k/MCTargetDesc/M68kMCAsmInfo.cpp       |  1 -
 .../Target/M68k/MCTargetDesc/M68kMCAsmInfo.h  | 15 ++++++
 .../Target/M68k/MCTargetDesc/M68kMCExpr.cpp   | 21 ---------
 .../lib/Target/M68k/MCTargetDesc/M68kMCExpr.h | 46 -------------------
 8 files changed, 18 insertions(+), 72 deletions(-)
 delete mode 100644 llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
 delete mode 100644 llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h

diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index c1860fa88a83..594ea9f48c20 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -19,7 +19,7 @@
 #include "M68kSubtarget.h"
 #include "M68kTargetMachine.h"
 #include "M68kTargetObjectFile.h"
-#include "MCTargetDesc/M68kMCExpr.h"
+#include "MCTargetDesc/M68kMCAsmInfo.h"
 
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/CallingConvLower.h"
diff --git a/llvm/lib/Target/M68k/M68kMCInstLower.cpp b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
index b256d56c032c..301112c41efb 100644
--- a/llvm/lib/Target/M68k/M68kMCInstLower.cpp
+++ b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
@@ -18,7 +18,7 @@
 #include "M68kInstrInfo.h"
 
 #include "MCTargetDesc/M68kBaseInfo.h"
-#include "MCTargetDesc/M68kMCExpr.h"
+#include "MCTargetDesc/M68kMCAsmInfo.h"
 
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt
index 0146e21acf36..1127b3b547f1 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt
@@ -4,7 +4,6 @@ add_llvm_component_library(LLVMM68kDesc
   M68kInstPrinter.cpp
   M68kMCAsmInfo.cpp
   M68kMCCodeEmitter.cpp
-  M68kMCExpr.cpp
   M68kMCTargetDesc.cpp
 
   LINK_COMPONENTS
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
index 3f7593cf4352..03416df639cf 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/M68kFixupKinds.h"
-#include "MCTargetDesc/M68kMCExpr.h"
+#include "MCTargetDesc/M68kMCAsmInfo.h"
 #include "MCTargetDesc/M68kMCTargetDesc.h"
 
 #include "llvm/BinaryFormat/ELF.h"
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
index 8259546fbae5..b0a19309b50f 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "M68kMCAsmInfo.h"
-#include "MCTargetDesc/M68kMCExpr.h"
 
 #include "llvm/MC/MCExpr.h"
 #include "llvm/TargetParser/Triple.h"
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
index 873264d88674..1ab36260cef1 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
@@ -26,6 +26,21 @@ public:
   explicit M68kELFMCAsmInfo(const Triple &Triple);
 };
 
+namespace M68k {
+enum Specifier {
+  S_None,
+  S_GOT,
+  S_GOTOFF,
+  S_GOTPCREL,
+  S_GOTTPOFF,
+  S_PLT,
+  S_TLSGD,
+  S_TLSLD,
+  S_TLSLDM,
+  S_TPOFF,
+};
+}
+
 } // namespace llvm
 
 #endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCASMINFO_H
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
deleted file mode 100644
index 18301d7ea9b3..000000000000
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//===- M68k specific MC expression classes ----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "M68kMCExpr.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-
-using namespace llvm;
-
-const M68kMCExpr *M68kMCExpr::create(const MCExpr *Expr, Spec S,
-                                     MCContext &Ctx) {
-  return new (Ctx) M68kMCExpr(Expr, S);
-}
-
-void M68kMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {}
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
deleted file mode 100644
index 39a2898e2eda..000000000000
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
+++ /dev/null
@@ -1,46 +0,0 @@
-//===- M68k specific MC expression classes ----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The MCTargetExpr subclass describes a relocatable expression with a
-// M68k-specific relocation specifier.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCEXPR_H
-#define LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-
-namespace llvm {
-
-class M68kMCExpr : public MCSpecifierExpr {
-protected:
-  explicit M68kMCExpr(const MCExpr *Expr, Spec S) : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const M68kMCExpr *create(const MCExpr *, Spec, MCContext &);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-};
-namespace M68k {
-enum Specifier {
-  S_None,
-  S_GOT,
-  S_GOTOFF,
-  S_GOTPCREL,
-  S_GOTTPOFF,
-  S_PLT,
-  S_TLSGD,
-  S_TLSLD,
-  S_TLSLDM,
-  S_TPOFF,
-};
-}
-} // namespace llvm
-
-#endif

From b839632bf44f56e6f17777857f4b23d4eccb6f33 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 13:17:22 -0700
Subject: [PATCH 0418/1322] PowerPC: Rename PPCMCExpr::VK_ to PPC::S_

Prepare for removing PPCMCExpr. Adopt the newer naming convention with
AMDGPU/WebAssembly/VE/M68k.
---
 .../Target/PowerPC/AsmParser/PPCAsmParser.cpp |  60 ++--
 .../PowerPC/MCTargetDesc/PPCAsmBackend.cpp    |   2 +-
 .../MCTargetDesc/PPCELFObjectWriter.cpp       | 272 +++++++++---------
 .../PowerPC/MCTargetDesc/PPCELFStreamer.cpp   |   8 +-
 .../PowerPC/MCTargetDesc/PPCInstPrinter.cpp   |   8 +-
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp     | 152 +++++-----
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.h       |  84 ++++++
 .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp |  16 +-
 .../Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp |  23 +-
 .../Target/PowerPC/MCTargetDesc/PPCMCExpr.h   |  96 +------
 .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp  |   6 +-
 .../MCTargetDesc/PPCXCOFFObjectWriter.cpp     |  34 +--
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     | 118 ++++----
 llvm/lib/Target/PowerPC/PPCMCInstLower.cpp    |  44 +--
 .../Target/PowerPC/PPCTargetObjectFile.cpp    |   6 +-
 15 files changed, 459 insertions(+), 470 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 9d3d04e6b8ad..7e79d85d6017 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -745,8 +745,8 @@ public:
       return CreateImm(CE->getValue(), S, E, IsPPC64);
 
     if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val))
-      if (getSpecifier(SRE) == PPCMCExpr::VK_TLS ||
-          getSpecifier(SRE) == PPCMCExpr::VK_TLS_PCREL)
+      if (getSpecifier(SRE) == PPC::S_TLS ||
+          getSpecifier(SRE) == PPC::S_TLS_PCREL)
         return CreateTLSReg(SRE, S, E, IsPPC64);
 
     if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
@@ -1378,25 +1378,25 @@ const MCExpr *PPCAsmParser::extractSpecifier(const MCExpr *E,
     auto *TE = cast<PPCMCExpr>(E);
     Spec = TE->getSpecifier();
     (void)extractSpecifier(TE->getSubExpr(), Spec);
-    Spec = PPCMCExpr::VK_None;
+    Spec = PPC::S_None;
   } break;
 
   case MCExpr::SymbolRef: {
     const auto *SRE = cast<MCSymbolRefExpr>(E);
     switch (getSpecifier(SRE)) {
-    case PPCMCExpr::VK_None:
+    case PPC::S_None:
     default:
       break;
-    case PPCMCExpr::VK_LO:
-    case PPCMCExpr::VK_HI:
-    case PPCMCExpr::VK_HA:
-    case PPCMCExpr::VK_HIGH:
-    case PPCMCExpr::VK_HIGHA:
-    case PPCMCExpr::VK_HIGHER:
-    case PPCMCExpr::VK_HIGHERA:
-    case PPCMCExpr::VK_HIGHEST:
-    case PPCMCExpr::VK_HIGHESTA:
-      if (Spec == PPCMCExpr::VK_None)
+    case PPC::S_LO:
+    case PPC::S_HI:
+    case PPC::S_HA:
+    case PPC::S_HIGH:
+    case PPC::S_HIGHA:
+    case PPC::S_HIGHER:
+    case PPC::S_HIGHERA:
+    case PPC::S_HIGHEST:
+    case PPC::S_HIGHESTA:
+      if (Spec == PPC::S_None)
         Spec = getSpecifier(SRE);
       else
         Error(E->getLoc(), "cannot contain more than one relocation specifier");
@@ -1408,7 +1408,7 @@ const MCExpr *PPCAsmParser::extractSpecifier(const MCExpr *E,
   case MCExpr::Unary: {
     const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
     const MCExpr *Sub = extractSpecifier(UE->getSubExpr(), Spec);
-    if (Spec != PPCMCExpr::VK_None)
+    if (Spec != PPC::S_None)
       return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
     break;
   }
@@ -1417,7 +1417,7 @@ const MCExpr *PPCAsmParser::extractSpecifier(const MCExpr *E,
     const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
     const MCExpr *LHS = extractSpecifier(BE->getLHS(), Spec);
     const MCExpr *RHS = extractSpecifier(BE->getRHS(), Spec);
-    if (Spec != PPCMCExpr::VK_None)
+    if (Spec != PPC::S_None)
       return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
     break;
   }
@@ -1436,9 +1436,9 @@ bool PPCAsmParser::parseExpression(const MCExpr *&EVal) {
   if (getParser().parseExpression(EVal))
     return true;
 
-  uint16_t Spec = PPCMCExpr::VK_None;
+  uint16_t Spec = PPC::S_None;
   const MCExpr *E = extractSpecifier(EVal, Spec);
-  if (Spec != PPCMCExpr::VK_None)
+  if (Spec != PPC::S_None)
     EVal = PPCMCExpr::create(Spec, E, getParser().getContext());
 
   return false;
@@ -1512,9 +1512,9 @@ bool PPCAsmParser::parseOperand(OperandVector &Operands) {
       if (!(parseOptionalToken(AsmToken::Identifier) &&
             Tok.getString().compare_insensitive("plt") == 0))
         return Error(Tok.getLoc(), "expected 'plt'");
-      EVal = MCSymbolRefExpr::create(
-          getContext().getOrCreateSymbol(TlsGetAddr),
-          MCSymbolRefExpr::VariantKind(PPCMCExpr::VK_PLT), getContext());
+      EVal = MCSymbolRefExpr::create(getContext().getOrCreateSymbol(TlsGetAddr),
+                                     MCSymbolRefExpr::VariantKind(PPC::S_PLT),
+                                     getContext());
       if (parseOptionalToken(AsmToken::Plus)) {
         const MCExpr *Addend = nullptr;
         SMLoc EndLoc;
@@ -1826,15 +1826,15 @@ const MCExpr *PPCAsmParser::applySpecifier(const MCExpr *E, uint32_t Spec,
                                            MCContext &Ctx) {
   if (isa<MCConstantExpr>(E)) {
     switch (PPCMCExpr::Specifier(Spec)) {
-    case PPCMCExpr::VK_LO:
-    case PPCMCExpr::VK_HI:
-    case PPCMCExpr::VK_HA:
-    case PPCMCExpr::VK_HIGH:
-    case PPCMCExpr::VK_HIGHA:
-    case PPCMCExpr::VK_HIGHER:
-    case PPCMCExpr::VK_HIGHERA:
-    case PPCMCExpr::VK_HIGHEST:
-    case PPCMCExpr::VK_HIGHESTA:
+    case PPC::S_LO:
+    case PPC::S_HI:
+    case PPC::S_HA:
+    case PPC::S_HIGH:
+    case PPC::S_HIGHA:
+    case PPC::S_HIGHER:
+    case PPC::S_HIGHERA:
+    case PPC::S_HIGHEST:
+    case PPC::S_HIGHESTA:
       break;
     default:
       return nullptr;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 84cd12248842..d4b86d5e2811 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -140,7 +140,7 @@ public:
     // In PPC64 ELFv1, .quad .TOC.@tocbase in the .opd section is expected to
     // reference the null symbol.
     auto Target = TargetVal;
-    if (Target.getSpecifier() == PPCMCExpr::VK_TOCBASE)
+    if (Target.getSpecifier() == PPC::S_TOCBASE)
       Target.setAddSym(nullptr);
     return MCAsmBackend::addReloc(F, Fixup, Target, FixedValue, IsResolved);
   }
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 096c019f8556..8e885c3d86a0 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -43,49 +43,49 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
   SMLoc Loc = Fixup.getValue()->getLoc();
   auto Spec = static_cast<PPCMCExpr::Specifier>(Target.getSpecifier());
   switch (Spec) {
-  case PPCMCExpr::VK_DTPMOD:
-  case PPCMCExpr::VK_DTPREL:
-  case PPCMCExpr::VK_DTPREL_HA:
-  case PPCMCExpr::VK_DTPREL_HI:
-  case PPCMCExpr::VK_DTPREL_HIGH:
-  case PPCMCExpr::VK_DTPREL_HIGHA:
-  case PPCMCExpr::VK_DTPREL_HIGHER:
-  case PPCMCExpr::VK_DTPREL_HIGHERA:
-  case PPCMCExpr::VK_DTPREL_HIGHEST:
-  case PPCMCExpr::VK_DTPREL_HIGHESTA:
-  case PPCMCExpr::VK_DTPREL_LO:
-  case PPCMCExpr::VK_GOT_DTPREL:
-  case PPCMCExpr::VK_GOT_DTPREL_HA:
-  case PPCMCExpr::VK_GOT_DTPREL_HI:
-  case PPCMCExpr::VK_GOT_DTPREL_LO:
-  case PPCMCExpr::VK_GOT_TLSGD:
-  case PPCMCExpr::VK_GOT_TLSGD_HA:
-  case PPCMCExpr::VK_GOT_TLSGD_HI:
-  case PPCMCExpr::VK_GOT_TLSGD_LO:
-  case PPCMCExpr::VK_GOT_TLSGD_PCREL:
-  case PPCMCExpr::VK_GOT_TLSLD:
-  case PPCMCExpr::VK_GOT_TLSLD_HA:
-  case PPCMCExpr::VK_GOT_TLSLD_HI:
-  case PPCMCExpr::VK_GOT_TLSLD_LO:
-  case PPCMCExpr::VK_GOT_TPREL:
-  case PPCMCExpr::VK_GOT_TPREL_HA:
-  case PPCMCExpr::VK_GOT_TPREL_HI:
-  case PPCMCExpr::VK_GOT_TPREL_LO:
-  case PPCMCExpr::VK_GOT_TPREL_PCREL:
-  case PPCMCExpr::VK_TLS:
-  case PPCMCExpr::VK_TLSGD:
-  case PPCMCExpr::VK_TLSLD:
-  case PPCMCExpr::VK_TLS_PCREL:
-  case PPCMCExpr::VK_TPREL:
-  case PPCMCExpr::VK_TPREL_HA:
-  case PPCMCExpr::VK_TPREL_HI:
-  case PPCMCExpr::VK_TPREL_HIGH:
-  case PPCMCExpr::VK_TPREL_HIGHA:
-  case PPCMCExpr::VK_TPREL_HIGHER:
-  case PPCMCExpr::VK_TPREL_HIGHERA:
-  case PPCMCExpr::VK_TPREL_HIGHEST:
-  case PPCMCExpr::VK_TPREL_HIGHESTA:
-  case PPCMCExpr::VK_TPREL_LO:
+  case PPC::S_DTPMOD:
+  case PPC::S_DTPREL:
+  case PPC::S_DTPREL_HA:
+  case PPC::S_DTPREL_HI:
+  case PPC::S_DTPREL_HIGH:
+  case PPC::S_DTPREL_HIGHA:
+  case PPC::S_DTPREL_HIGHER:
+  case PPC::S_DTPREL_HIGHERA:
+  case PPC::S_DTPREL_HIGHEST:
+  case PPC::S_DTPREL_HIGHESTA:
+  case PPC::S_DTPREL_LO:
+  case PPC::S_GOT_DTPREL:
+  case PPC::S_GOT_DTPREL_HA:
+  case PPC::S_GOT_DTPREL_HI:
+  case PPC::S_GOT_DTPREL_LO:
+  case PPC::S_GOT_TLSGD:
+  case PPC::S_GOT_TLSGD_HA:
+  case PPC::S_GOT_TLSGD_HI:
+  case PPC::S_GOT_TLSGD_LO:
+  case PPC::S_GOT_TLSGD_PCREL:
+  case PPC::S_GOT_TLSLD:
+  case PPC::S_GOT_TLSLD_HA:
+  case PPC::S_GOT_TLSLD_HI:
+  case PPC::S_GOT_TLSLD_LO:
+  case PPC::S_GOT_TPREL:
+  case PPC::S_GOT_TPREL_HA:
+  case PPC::S_GOT_TPREL_HI:
+  case PPC::S_GOT_TPREL_LO:
+  case PPC::S_GOT_TPREL_PCREL:
+  case PPC::S_TLS:
+  case PPC::S_TLSGD:
+  case PPC::S_TLSLD:
+  case PPC::S_TLS_PCREL:
+  case PPC::S_TPREL:
+  case PPC::S_TPREL_HA:
+  case PPC::S_TPREL_HI:
+  case PPC::S_TPREL_HIGH:
+  case PPC::S_TPREL_HIGHA:
+  case PPC::S_TPREL_HIGHER:
+  case PPC::S_TPREL_HIGHERA:
+  case PPC::S_TPREL_HIGHEST:
+  case PPC::S_TPREL_HIGHESTA:
+  case PPC::S_TPREL_LO:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -106,16 +106,16 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         Type = ELF::R_PPC_REL24;
         break;
-      case PPCMCExpr::VK_PLT:
+      case PPC::S_PLT:
         Type = ELF::R_PPC_PLTREL24;
         break;
-      case PPCMCExpr::VK_LOCAL:
+      case PPC::S_LOCAL:
         Type = ELF::R_PPC_LOCAL24PC;
         break;
-      case PPCMCExpr::VK_NOTOC:
+      case PPC::S_NOTOC:
         Type = ELF::R_PPC64_REL24_NOTOC;
         break;
       }
@@ -129,13 +129,13 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         return ELF::R_PPC_NONE;
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         return ELF::R_PPC_REL16;
-      case PPCMCExpr::VK_LO:
+      case PPC::S_LO:
         return ELF::R_PPC_REL16_LO;
-      case PPCMCExpr::VK_HI:
+      case PPC::S_HI:
         return ELF::R_PPC_REL16_HI;
-      case PPCMCExpr::VK_HA:
+      case PPC::S_HA:
         return ELF::R_PPC_REL16_HA;
       }
       break;
@@ -148,19 +148,19 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_PCREL:
+      case PPC::S_PCREL:
         Type = ELF::R_PPC64_PCREL34;
         break;
-      case PPCMCExpr::VK_GOT_PCREL:
+      case PPC::S_GOT_PCREL:
         Type = ELF::R_PPC64_GOT_PCREL34;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD_PCREL:
+      case PPC::S_GOT_TLSGD_PCREL:
         Type = ELF::R_PPC64_GOT_TLSGD_PCREL34;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD_PCREL:
+      case PPC::S_GOT_TLSLD_PCREL:
         Type = ELF::R_PPC64_GOT_TLSLD_PCREL34;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_PCREL:
+      case PPC::S_GOT_TPREL_PCREL:
         Type = ELF::R_PPC64_GOT_TPREL_PCREL34;
         break;
       }
@@ -186,172 +186,172 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_LO:
+      case PPC::S_LO:
         return ELF::R_PPC_ADDR16_LO;
-      case PPCMCExpr::VK_HI:
+      case PPC::S_HI:
         return ELF::R_PPC_ADDR16_HI;
-      case PPCMCExpr::VK_HA:
+      case PPC::S_HA:
         return ELF::R_PPC_ADDR16_HA;
-      case PPCMCExpr::VK_HIGH:
+      case PPC::S_HIGH:
         return ELF::R_PPC64_ADDR16_HIGH;
-      case PPCMCExpr::VK_HIGHA:
+      case PPC::S_HIGHA:
         return ELF::R_PPC64_ADDR16_HIGHA;
-      case PPCMCExpr::VK_HIGHER:
+      case PPC::S_HIGHER:
         return ELF::R_PPC64_ADDR16_HIGHER;
-      case PPCMCExpr::VK_HIGHERA:
+      case PPC::S_HIGHERA:
         return ELF::R_PPC64_ADDR16_HIGHERA;
-      case PPCMCExpr::VK_HIGHEST:
+      case PPC::S_HIGHEST:
         return ELF::R_PPC64_ADDR16_HIGHEST;
-      case PPCMCExpr::VK_HIGHESTA:
+      case PPC::S_HIGHESTA:
         return ELF::R_PPC64_ADDR16_HIGHESTA;
 
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         Type = ELF::R_PPC_ADDR16;
         break;
-      case PPCMCExpr::VK_GOT:
+      case PPC::S_GOT:
         Type = ELF::R_PPC_GOT16;
         break;
-      case PPCMCExpr::VK_GOT_LO:
+      case PPC::S_GOT_LO:
         Type = ELF::R_PPC_GOT16_LO;
         break;
-      case PPCMCExpr::VK_GOT_HI:
+      case PPC::S_GOT_HI:
         Type = ELF::R_PPC_GOT16_HI;
         break;
-      case PPCMCExpr::VK_GOT_HA:
+      case PPC::S_GOT_HA:
         Type = ELF::R_PPC_GOT16_HA;
         break;
-      case PPCMCExpr::VK_TOC:
+      case PPC::S_TOC:
         Type = ELF::R_PPC64_TOC16;
         break;
-      case PPCMCExpr::VK_TOC_LO:
+      case PPC::S_TOC_LO:
         Type = ELF::R_PPC64_TOC16_LO;
         break;
-      case PPCMCExpr::VK_TOC_HI:
+      case PPC::S_TOC_HI:
         Type = ELF::R_PPC64_TOC16_HI;
         break;
-      case PPCMCExpr::VK_TOC_HA:
+      case PPC::S_TOC_HA:
         Type = ELF::R_PPC64_TOC16_HA;
         break;
-      case PPCMCExpr::VK_TPREL:
+      case PPC::S_TPREL:
         Type = ELF::R_PPC_TPREL16;
         break;
-      case PPCMCExpr::VK_TPREL_LO:
+      case PPC::S_TPREL_LO:
         Type = ELF::R_PPC_TPREL16_LO;
         break;
-      case PPCMCExpr::VK_TPREL_HI:
+      case PPC::S_TPREL_HI:
         Type = ELF::R_PPC_TPREL16_HI;
         break;
-      case PPCMCExpr::VK_TPREL_HA:
+      case PPC::S_TPREL_HA:
         Type = ELF::R_PPC_TPREL16_HA;
         break;
-      case PPCMCExpr::VK_TPREL_HIGH:
+      case PPC::S_TPREL_HIGH:
         Type = ELF::R_PPC64_TPREL16_HIGH;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHA:
+      case PPC::S_TPREL_HIGHA:
         Type = ELF::R_PPC64_TPREL16_HIGHA;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHER:
+      case PPC::S_TPREL_HIGHER:
         Type = ELF::R_PPC64_TPREL16_HIGHER;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHERA:
+      case PPC::S_TPREL_HIGHERA:
         Type = ELF::R_PPC64_TPREL16_HIGHERA;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHEST:
+      case PPC::S_TPREL_HIGHEST:
         Type = ELF::R_PPC64_TPREL16_HIGHEST;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHESTA:
+      case PPC::S_TPREL_HIGHESTA:
         Type = ELF::R_PPC64_TPREL16_HIGHESTA;
         break;
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC64_DTPREL16;
         break;
-      case PPCMCExpr::VK_DTPREL_LO:
+      case PPC::S_DTPREL_LO:
         Type = ELF::R_PPC64_DTPREL16_LO;
         break;
-      case PPCMCExpr::VK_DTPREL_HI:
+      case PPC::S_DTPREL_HI:
         Type = ELF::R_PPC64_DTPREL16_HI;
         break;
-      case PPCMCExpr::VK_DTPREL_HA:
+      case PPC::S_DTPREL_HA:
         Type = ELF::R_PPC64_DTPREL16_HA;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGH:
+      case PPC::S_DTPREL_HIGH:
         Type = ELF::R_PPC64_DTPREL16_HIGH;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHA:
+      case PPC::S_DTPREL_HIGHA:
         Type = ELF::R_PPC64_DTPREL16_HIGHA;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHER:
+      case PPC::S_DTPREL_HIGHER:
         Type = ELF::R_PPC64_DTPREL16_HIGHER;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHERA:
+      case PPC::S_DTPREL_HIGHERA:
         Type = ELF::R_PPC64_DTPREL16_HIGHERA;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHEST:
+      case PPC::S_DTPREL_HIGHEST:
         Type = ELF::R_PPC64_DTPREL16_HIGHEST;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHESTA:
+      case PPC::S_DTPREL_HIGHESTA:
         Type = ELF::R_PPC64_DTPREL16_HIGHESTA;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD:
+      case PPC::S_GOT_TLSGD:
         if (is64Bit())
           Type = ELF::R_PPC64_GOT_TLSGD16;
         else
           Type = ELF::R_PPC_GOT_TLSGD16;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD_LO:
+      case PPC::S_GOT_TLSGD_LO:
         Type = ELF::R_PPC64_GOT_TLSGD16_LO;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD_HI:
+      case PPC::S_GOT_TLSGD_HI:
         Type = ELF::R_PPC64_GOT_TLSGD16_HI;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD_HA:
+      case PPC::S_GOT_TLSGD_HA:
         Type = ELF::R_PPC64_GOT_TLSGD16_HA;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD:
+      case PPC::S_GOT_TLSLD:
         if (is64Bit())
           Type = ELF::R_PPC64_GOT_TLSLD16;
         else
           Type = ELF::R_PPC_GOT_TLSLD16;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD_LO:
+      case PPC::S_GOT_TLSLD_LO:
         Type = ELF::R_PPC64_GOT_TLSLD16_LO;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD_HI:
+      case PPC::S_GOT_TLSLD_HI:
         Type = ELF::R_PPC64_GOT_TLSLD16_HI;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD_HA:
+      case PPC::S_GOT_TLSLD_HA:
         Type = ELF::R_PPC64_GOT_TLSLD16_HA;
         break;
-      case PPCMCExpr::VK_GOT_TPREL:
+      case PPC::S_GOT_TPREL:
         /* We don't have R_PPC64_GOT_TPREL16, but since GOT offsets
            are always 4-aligned, we can use R_PPC64_GOT_TPREL16_DS.  */
         Type = ELF::R_PPC64_GOT_TPREL16_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_LO:
+      case PPC::S_GOT_TPREL_LO:
         /* We don't have R_PPC64_GOT_TPREL16_LO, but since GOT offsets
            are always 4-aligned, we can use R_PPC64_GOT_TPREL16_LO_DS.  */
         Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_HI:
+      case PPC::S_GOT_TPREL_HI:
         Type = ELF::R_PPC64_GOT_TPREL16_HI;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL:
+      case PPC::S_GOT_DTPREL:
         /* We don't have R_PPC64_GOT_DTPREL16, but since GOT offsets
            are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_DS.  */
         Type = ELF::R_PPC64_GOT_DTPREL16_DS;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL_LO:
+      case PPC::S_GOT_DTPREL_LO:
         /* We don't have R_PPC64_GOT_DTPREL16_LO, but since GOT offsets
            are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_LO_DS.  */
         Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_HA:
+      case PPC::S_GOT_TPREL_HA:
         Type = ELF::R_PPC64_GOT_TPREL16_HA;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL_HI:
+      case PPC::S_GOT_DTPREL_HI:
         Type = ELF::R_PPC64_GOT_DTPREL16_HI;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL_HA:
+      case PPC::S_GOT_DTPREL_HA:
         Type = ELF::R_PPC64_GOT_DTPREL16_HA;
         break;
       }
@@ -362,45 +362,45 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_LO:
+      case PPC::S_LO:
         return ELF::R_PPC64_ADDR16_LO_DS;
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         Type = ELF::R_PPC64_ADDR16_DS;
         break;
-      case PPCMCExpr::VK_GOT:
+      case PPC::S_GOT:
         Type = ELF::R_PPC64_GOT16_DS;
         break;
-      case PPCMCExpr::VK_GOT_LO:
+      case PPC::S_GOT_LO:
         Type = ELF::R_PPC64_GOT16_LO_DS;
         break;
-      case PPCMCExpr::VK_TOC:
+      case PPC::S_TOC:
         Type = ELF::R_PPC64_TOC16_DS;
         break;
-      case PPCMCExpr::VK_TOC_LO:
+      case PPC::S_TOC_LO:
         Type = ELF::R_PPC64_TOC16_LO_DS;
         break;
-      case PPCMCExpr::VK_TPREL:
+      case PPC::S_TPREL:
         Type = ELF::R_PPC64_TPREL16_DS;
         break;
-      case PPCMCExpr::VK_TPREL_LO:
+      case PPC::S_TPREL_LO:
         Type = ELF::R_PPC64_TPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC64_DTPREL16_DS;
         break;
-      case PPCMCExpr::VK_DTPREL_LO:
+      case PPC::S_DTPREL_LO:
         Type = ELF::R_PPC64_DTPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL:
+      case PPC::S_GOT_TPREL:
         Type = ELF::R_PPC64_GOT_TPREL16_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_LO:
+      case PPC::S_GOT_TPREL_LO:
         Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL:
+      case PPC::S_GOT_DTPREL:
         Type = ELF::R_PPC64_GOT_DTPREL16_DS;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL_LO:
+      case PPC::S_GOT_DTPREL_LO:
         Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
         break;
       }
@@ -410,25 +410,25 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_TLSGD:
+      case PPC::S_TLSGD:
         if (is64Bit())
           Type = ELF::R_PPC64_TLSGD;
         else
           Type = ELF::R_PPC_TLSGD;
         break;
-      case PPCMCExpr::VK_TLSLD:
+      case PPC::S_TLSLD:
         if (is64Bit())
           Type = ELF::R_PPC64_TLSLD;
         else
           Type = ELF::R_PPC_TLSLD;
         break;
-      case PPCMCExpr::VK_TLS:
+      case PPC::S_TLS:
         if (is64Bit())
           Type = ELF::R_PPC64_TLS;
         else
           Type = ELF::R_PPC_TLS;
         break;
-      case PPCMCExpr::VK_TLS_PCREL:
+      case PPC::S_TLS_PCREL:
         Type = ELF::R_PPC64_TLS;
         break;
       }
@@ -438,10 +438,10 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC64_DTPREL34;
         break;
-      case PPCMCExpr::VK_TPREL:
+      case PPC::S_TPREL:
         Type = ELF::R_PPC64_TPREL34;
         break;
       }
@@ -451,26 +451,26 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_TOCBASE:
+      case PPC::S_TOCBASE:
         Type = ELF::R_PPC64_TOC;
         break;
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         Type = ELF::R_PPC64_ADDR64;
         break;
-      case PPCMCExpr::VK_DTPMOD:
+      case PPC::S_DTPMOD:
         Type = ELF::R_PPC64_DTPMOD64;
         break;
-      case PPCMCExpr::VK_TPREL:
+      case PPC::S_TPREL:
         Type = ELF::R_PPC64_TPREL64;
         break;
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC64_DTPREL64;
         break;
       }
       break;
     case FK_Data_4:
       switch (Spec) {
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC_DTPREL32;
         break;
       default:
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
index 501ef460b693..78065541f0d0 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
@@ -139,7 +139,7 @@ void PPCELFStreamer::emitGOTToPCRelReloc(const MCInst &Inst) {
   // Cast the last operand to MCSymbolRefExpr to get the symbol.
   const MCExpr *Expr = Operand.getExpr();
   const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr);
-  assert(getSpecifier(SymExpr) == PPCMCExpr::VK_PCREL_OPT &&
+  assert(getSpecifier(SymExpr) == PPC::S_PCREL_OPT &&
          "Expecting a symbol of type VK_PCREL_OPT");
   MCSymbol *LabelSym =
       getContext().getOrCreateSymbol(SymExpr->getSymbol().getName());
@@ -174,7 +174,7 @@ void PPCELFStreamer::emitGOTToPCRelLabel(const MCInst &Inst) {
   // Cast the last operand to MCSymbolRefExpr to get the symbol.
   const MCExpr *Expr = Operand.getExpr();
   const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr);
-  assert(getSpecifier(SymExpr) == PPCMCExpr::VK_PCREL_OPT &&
+  assert(getSpecifier(SymExpr) == PPC::S_PCREL_OPT &&
          "Expecting a symbol of type VK_PCREL_OPT");
   MCSymbol *LabelSym =
       getContext().getOrCreateSymbol(SymExpr->getSymbol().getName());
@@ -190,7 +190,7 @@ void PPCELFStreamer::emitGOTToPCRelLabel(const MCInst &Inst) {
 // The above is a pair of such instructions and this function will not return
 // std::nullopt for either one of them. In both cases we are looking for the
 // last operand <MCOperand Expr:(.Lpcrel@<<invalid>>)> which needs to be an
-// MCExpr and has the flag PPCMCExpr::VK_PCREL_OPT. After that we just
+// MCExpr and has the flag PPC::S_PCREL_OPT. After that we just
 // look at the opcode and in the case of PLDpc we will return true. For the load
 // (or store) this function will return false indicating it has found the second
 // instruciton in the pair.
@@ -212,7 +212,7 @@ std::optional<bool> llvm::isPartOfGOTToPCRelPair(const MCInst &Inst,
   // Check for the variant kind VK_PCREL_OPT in this expression.
   const MCExpr *Expr = Operand.getExpr();
   const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr);
-  if (!SymExpr || getSpecifier(SymExpr) != PPCMCExpr::VK_PCREL_OPT)
+  if (!SymExpr || getSpecifier(SymExpr) != PPC::S_PCREL_OPT)
     return std::nullopt;
 
   return (Inst.getOpcode() == PPC::PLDpc);
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index 0e1b28af691d..bd01767f41bd 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -92,7 +92,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
       const MCSymbolRefExpr *SymExpr =
           static_cast<const MCSymbolRefExpr *>(Expr);
 
-      if (SymExpr && getSpecifier(SymExpr) == PPCMCExpr::VK_PCREL_OPT) {
+      if (SymExpr && getSpecifier(SymExpr) == PPC::S_PCREL_OPT) {
         const MCSymbol &Symbol = SymExpr->getSymbol();
         if (MI->getOpcode() == PPC::PLDpc) {
           printInstruction(MI, Address, STI, O);
@@ -579,13 +579,13 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
   // because we do not want the assembly to print out the @notoc at the
   // end like __tls_get_addr(x@tlsgd)@notoc. Instead we want it to look
   // like __tls_get_addr@notoc(x@tlsgd).
-  if (getSpecifier(RefExp) == PPCMCExpr::VK_NOTOC)
+  if (getSpecifier(RefExp) == PPC::S_NOTOC)
     O << '@' << MAI.getSpecifierName(RefExp->getKind());
   O << '(';
   printOperand(MI, OpNo + 1, STI, O);
   O << ')';
-  if (getSpecifier(RefExp) != PPCMCExpr::VK_None &&
-      getSpecifier(RefExp) != PPCMCExpr::VK_NOTOC)
+  if (getSpecifier(RefExp) != PPC::S_None &&
+      getSpecifier(RefExp) != PPC::S_NOTOC)
     O << '@' << MAI.getSpecifierName(RefExp->getKind());
   if (Rhs) {
     SmallString<0> Buf;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index b5be23c5a96a..bb1f21d8f032 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -20,82 +20,82 @@ using namespace llvm;
 void PPCELFMCAsmInfo::anchor() { }
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {PPCMCExpr::VK_DTPREL, "DTPREL"},
-    {PPCMCExpr::VK_GOT, "GOT"},
-    {PPCMCExpr::VK_GOT_HA, "got@ha"},
-    {PPCMCExpr::VK_GOT_HI, "got@h"},
-    {PPCMCExpr::VK_GOT_LO, "got@l"},
-    {PPCMCExpr::VK_HA, "ha"},
-    {PPCMCExpr::VK_HI, "h"},
-    {PPCMCExpr::VK_HIGH, "high"},
-    {PPCMCExpr::VK_HIGHA, "higha"},
-    {PPCMCExpr::VK_HIGHER, "higher"},
-    {PPCMCExpr::VK_HIGHERA, "highera"},
-    {PPCMCExpr::VK_HIGHEST, "highest"},
-    {PPCMCExpr::VK_HIGHESTA, "highesta"},
-    {PPCMCExpr::VK_LO, "l"},
-    {PPCMCExpr::VK_L, "l"}, // FIXME: share the name with VK_LO
-    {PPCMCExpr::VK_PCREL, "PCREL"},
-    {PPCMCExpr::VK_PLT, "PLT"},
-    {PPCMCExpr::VK_TLSGD, "tlsgd"},
-    {PPCMCExpr::VK_TLSLD, "tlsld"},
-    {PPCMCExpr::VK_TOC, "toc"},
-    {PPCMCExpr::VK_TOCBASE, "tocbase"},
-    {PPCMCExpr::VK_TOC_HA, "toc@ha"},
-    {PPCMCExpr::VK_TOC_HI, "toc@h"},
-    {PPCMCExpr::VK_TOC_LO, "toc@l"},
-    {PPCMCExpr::VK_TPREL, "TPREL"},
-    {PPCMCExpr::VK_AIX_TLSGD, "gd"},
-    {PPCMCExpr::VK_AIX_TLSGDM, "m"},
-    {PPCMCExpr::VK_AIX_TLSIE, "ie"},
-    {PPCMCExpr::VK_AIX_TLSLD, "ld"},
-    {PPCMCExpr::VK_AIX_TLSLE, "le"},
-    {PPCMCExpr::VK_AIX_TLSML, "ml"},
-    {PPCMCExpr::VK_DTPMOD, "dtpmod"},
-    {PPCMCExpr::VK_DTPREL_HA, "dtprel@ha"},
-    {PPCMCExpr::VK_DTPREL_HI, "dtprel@h"},
-    {PPCMCExpr::VK_DTPREL_HIGH, "dtprel@high"},
-    {PPCMCExpr::VK_DTPREL_HIGHA, "dtprel@higha"},
-    {PPCMCExpr::VK_DTPREL_HIGHER, "dtprel@higher"},
-    {PPCMCExpr::VK_DTPREL_HIGHERA, "dtprel@highera"},
-    {PPCMCExpr::VK_DTPREL_HIGHEST, "dtprel@highest"},
-    {PPCMCExpr::VK_DTPREL_HIGHESTA, "dtprel@highesta"},
-    {PPCMCExpr::VK_DTPREL_LO, "dtprel@l"},
-    {PPCMCExpr::VK_GOT_DTPREL, "got@dtprel"},
-    {PPCMCExpr::VK_GOT_DTPREL_HA, "got@dtprel@ha"},
-    {PPCMCExpr::VK_GOT_DTPREL_HI, "got@dtprel@h"},
-    {PPCMCExpr::VK_GOT_DTPREL_LO, "got@dtprel@l"},
-    {PPCMCExpr::VK_GOT_PCREL, "got@pcrel"},
-    {PPCMCExpr::VK_GOT_TLSGD, "got@tlsgd"},
-    {PPCMCExpr::VK_GOT_TLSGD_HA, "got@tlsgd@ha"},
-    {PPCMCExpr::VK_GOT_TLSGD_HI, "got@tlsgd@h"},
-    {PPCMCExpr::VK_GOT_TLSGD_LO, "got@tlsgd@l"},
-    {PPCMCExpr::VK_GOT_TLSGD_PCREL, "got@tlsgd@pcrel"},
-    {PPCMCExpr::VK_GOT_TLSLD, "got@tlsld"},
-    {PPCMCExpr::VK_GOT_TLSLD_HA, "got@tlsld@ha"},
-    {PPCMCExpr::VK_GOT_TLSLD_HI, "got@tlsld@h"},
-    {PPCMCExpr::VK_GOT_TLSLD_LO, "got@tlsld@l"},
-    {PPCMCExpr::VK_GOT_TLSLD_PCREL, "got@tlsld@pcrel"},
-    {PPCMCExpr::VK_GOT_TPREL, "got@tprel"},
-    {PPCMCExpr::VK_GOT_TPREL_HA, "got@tprel@ha"},
-    {PPCMCExpr::VK_GOT_TPREL_HI, "got@tprel@h"},
-    {PPCMCExpr::VK_GOT_TPREL_LO, "got@tprel@l"},
-    {PPCMCExpr::VK_GOT_TPREL_PCREL, "got@tprel@pcrel"},
-    {PPCMCExpr::VK_LOCAL, "local"},
-    {PPCMCExpr::VK_NOTOC, "notoc"},
-    {PPCMCExpr::VK_PCREL_OPT, "<<invalid>>"},
-    {PPCMCExpr::VK_TLS, "tls"},
-    {PPCMCExpr::VK_TLS_PCREL, "tls@pcrel"},
-    {PPCMCExpr::VK_TPREL_HA, "tprel@ha"},
-    {PPCMCExpr::VK_TPREL_HI, "tprel@h"},
-    {PPCMCExpr::VK_TPREL_HIGH, "tprel@high"},
-    {PPCMCExpr::VK_TPREL_HIGHA, "tprel@higha"},
-    {PPCMCExpr::VK_TPREL_HIGHER, "tprel@higher"},
-    {PPCMCExpr::VK_TPREL_HIGHERA, "tprel@highera"},
-    {PPCMCExpr::VK_TPREL_HIGHEST, "tprel@highest"},
-    {PPCMCExpr::VK_TPREL_HIGHESTA, "tprel@highesta"},
-    {PPCMCExpr::VK_TPREL_LO, "tprel@l"},
-    {PPCMCExpr::VK_U, "u"},
+    {PPC::S_DTPREL, "DTPREL"},
+    {PPC::S_GOT, "GOT"},
+    {PPC::S_GOT_HA, "got@ha"},
+    {PPC::S_GOT_HI, "got@h"},
+    {PPC::S_GOT_LO, "got@l"},
+    {PPC::S_HA, "ha"},
+    {PPC::S_HI, "h"},
+    {PPC::S_HIGH, "high"},
+    {PPC::S_HIGHA, "higha"},
+    {PPC::S_HIGHER, "higher"},
+    {PPC::S_HIGHERA, "highera"},
+    {PPC::S_HIGHEST, "highest"},
+    {PPC::S_HIGHESTA, "highesta"},
+    {PPC::S_LO, "l"},
+    {PPC::S_L, "l"}, // FIXME: share the name with VK_LO
+    {PPC::S_PCREL, "PCREL"},
+    {PPC::S_PLT, "PLT"},
+    {PPC::S_TLSGD, "tlsgd"},
+    {PPC::S_TLSLD, "tlsld"},
+    {PPC::S_TOC, "toc"},
+    {PPC::S_TOCBASE, "tocbase"},
+    {PPC::S_TOC_HA, "toc@ha"},
+    {PPC::S_TOC_HI, "toc@h"},
+    {PPC::S_TOC_LO, "toc@l"},
+    {PPC::S_TPREL, "TPREL"},
+    {PPC::S_AIX_TLSGD, "gd"},
+    {PPC::S_AIX_TLSGDM, "m"},
+    {PPC::S_AIX_TLSIE, "ie"},
+    {PPC::S_AIX_TLSLD, "ld"},
+    {PPC::S_AIX_TLSLE, "le"},
+    {PPC::S_AIX_TLSML, "ml"},
+    {PPC::S_DTPMOD, "dtpmod"},
+    {PPC::S_DTPREL_HA, "dtprel@ha"},
+    {PPC::S_DTPREL_HI, "dtprel@h"},
+    {PPC::S_DTPREL_HIGH, "dtprel@high"},
+    {PPC::S_DTPREL_HIGHA, "dtprel@higha"},
+    {PPC::S_DTPREL_HIGHER, "dtprel@higher"},
+    {PPC::S_DTPREL_HIGHERA, "dtprel@highera"},
+    {PPC::S_DTPREL_HIGHEST, "dtprel@highest"},
+    {PPC::S_DTPREL_HIGHESTA, "dtprel@highesta"},
+    {PPC::S_DTPREL_LO, "dtprel@l"},
+    {PPC::S_GOT_DTPREL, "got@dtprel"},
+    {PPC::S_GOT_DTPREL_HA, "got@dtprel@ha"},
+    {PPC::S_GOT_DTPREL_HI, "got@dtprel@h"},
+    {PPC::S_GOT_DTPREL_LO, "got@dtprel@l"},
+    {PPC::S_GOT_PCREL, "got@pcrel"},
+    {PPC::S_GOT_TLSGD, "got@tlsgd"},
+    {PPC::S_GOT_TLSGD_HA, "got@tlsgd@ha"},
+    {PPC::S_GOT_TLSGD_HI, "got@tlsgd@h"},
+    {PPC::S_GOT_TLSGD_LO, "got@tlsgd@l"},
+    {PPC::S_GOT_TLSGD_PCREL, "got@tlsgd@pcrel"},
+    {PPC::S_GOT_TLSLD, "got@tlsld"},
+    {PPC::S_GOT_TLSLD_HA, "got@tlsld@ha"},
+    {PPC::S_GOT_TLSLD_HI, "got@tlsld@h"},
+    {PPC::S_GOT_TLSLD_LO, "got@tlsld@l"},
+    {PPC::S_GOT_TLSLD_PCREL, "got@tlsld@pcrel"},
+    {PPC::S_GOT_TPREL, "got@tprel"},
+    {PPC::S_GOT_TPREL_HA, "got@tprel@ha"},
+    {PPC::S_GOT_TPREL_HI, "got@tprel@h"},
+    {PPC::S_GOT_TPREL_LO, "got@tprel@l"},
+    {PPC::S_GOT_TPREL_PCREL, "got@tprel@pcrel"},
+    {PPC::S_LOCAL, "local"},
+    {PPC::S_NOTOC, "notoc"},
+    {PPC::S_PCREL_OPT, "<<invalid>>"},
+    {PPC::S_TLS, "tls"},
+    {PPC::S_TLS_PCREL, "tls@pcrel"},
+    {PPC::S_TPREL_HA, "tprel@ha"},
+    {PPC::S_TPREL_HI, "tprel@h"},
+    {PPC::S_TPREL_HIGH, "tprel@high"},
+    {PPC::S_TPREL_HIGHA, "tprel@higha"},
+    {PPC::S_TPREL_HIGHER, "tprel@higher"},
+    {PPC::S_TPREL_HIGHERA, "tprel@highera"},
+    {PPC::S_TPREL_HIGHEST, "tprel@highest"},
+    {PPC::S_TPREL_HIGHESTA, "tprel@highesta"},
+    {PPC::S_TPREL_LO, "tprel@l"},
+    {PPC::S_U, "u"},
 };
 
 PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 48806051f581..9fbb73c2e318 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -33,6 +33,90 @@ public:
   explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &);
 };
 
+namespace PPC {
+enum Specifier {
+  S_None,
+
+  S_LO,
+  S_HI,
+  S_HA,
+  S_HIGH,
+  S_HIGHA,
+  S_HIGHER,
+  S_HIGHERA,
+  S_HIGHEST,
+  S_HIGHESTA,
+
+  S_AIX_TLSGD,       // symbol@gd
+  S_AIX_TLSGDM,      // symbol@m
+  S_AIX_TLSIE,       // symbol@ie
+  S_AIX_TLSLD,       // symbol@ld
+  S_AIX_TLSLE,       // symbol@le
+  S_AIX_TLSML,       // symbol@ml
+  S_DTPMOD,          // symbol@dtpmod
+  S_DTPREL,          // symbol@dprel
+  S_DTPREL_HA,       // symbol@dtprel@ha
+  S_DTPREL_HI,       // symbol@dtprel@h
+  S_DTPREL_HIGH,     // symbol@dtprel@high
+  S_DTPREL_HIGHA,    // symbol@dtprel@higha
+  S_DTPREL_HIGHER,   // symbol@dtprel@higher
+  S_DTPREL_HIGHERA,  // symbol@dtprel@highera
+  S_DTPREL_HIGHEST,  // symbol@dtprel@highest
+  S_DTPREL_HIGHESTA, // symbol@dtprel@highesta
+  S_DTPREL_LO,       // symbol@dtprel@l
+  S_GOT,             // symbol@got
+  S_GOT_DTPREL,      // symbol@got@dtprel
+  S_GOT_DTPREL_HA,   // symbol@got@dtprel@ha
+  S_GOT_DTPREL_HI,   // symbol@got@dtprel@h
+  S_GOT_DTPREL_LO,   // symbol@got@dtprel@l
+  S_GOT_HA,          // symbol@got@ha
+  S_GOT_HI,          // symbol@got@h
+  S_GOT_LO,          // symbol@got@l
+  S_GOT_PCREL,       // symbol@got@pcrel
+  S_GOT_TLSGD,       // symbol@got@tlsgd
+  S_GOT_TLSGD_HA,    // symbol@got@tlsgd@ha
+  S_GOT_TLSGD_HI,    // symbol@got@tlsgd@h
+  S_GOT_TLSGD_LO,    // symbol@got@tlsgd@l
+  S_GOT_TLSGD_PCREL, // symbol@got@tlsgd@pcrel
+  S_GOT_TLSLD,       // symbol@got@tlsld
+  S_GOT_TLSLD_HA,    // symbol@got@tlsld@ha
+  S_GOT_TLSLD_HI,    // symbol@got@tlsld@h
+  S_GOT_TLSLD_LO,    // symbol@got@tlsld@l
+  S_GOT_TLSLD_PCREL, // symbol@got@tlsld@pcrel
+  S_GOT_TPREL,       // symbol@got@tprel
+  S_GOT_TPREL_HA,    // symbol@got@tprel@ha
+  S_GOT_TPREL_HI,    // symbol@got@tprel@h
+  S_GOT_TPREL_LO,    // symbol@got@tprel@l
+  S_GOT_TPREL_PCREL, // symbol@got@tprel@pcrel
+  S_L,               // symbol@l
+  S_LOCAL,           // symbol@local
+  S_NOTOC,           // symbol@notoc
+  S_PCREL,
+  S_PCREL_OPT,      // .reloc expr, R_PPC64_PCREL_OPT, expr
+  S_PLT,            // symbol@plt
+  S_TLS,            // symbol@tls
+  S_TLSGD,          // symbol@tlsgd
+  S_TLSLD,          // symbol@tlsld
+  S_TLS_PCREL,      // symbol@tls@pcrel
+  S_TOC,            // symbol@toc
+  S_TOCBASE,        // symbol@tocbase
+  S_TOC_HA,         // symbol@toc@ha
+  S_TOC_HI,         // symbol@toc@h
+  S_TOC_LO,         // symbol@toc@l
+  S_TPREL,          // symbol@tprel
+  S_TPREL_HA,       // symbol@tprel@ha
+  S_TPREL_HI,       // symbol@tprel@h
+  S_TPREL_HIGH,     // symbol@tprel@high
+  S_TPREL_HIGHA,    // symbol@tprel@higha
+  S_TPREL_HIGHER,   // symbol@tprel@higher
+  S_TPREL_HIGHERA,  // symbol@tprel@highera
+  S_TPREL_HIGHEST,  // symbol@tprel@highest
+  S_TPREL_HIGHESTA, // symbol@tprel@highesta
+  S_TPREL_LO,       // symbol@tprel@l
+  S_U,              // symbol@u
+};
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index ef067f745239..b1b1c5280f2a 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -329,11 +329,11 @@ PPCMCCodeEmitter::getDispRI34PCRelEncoding(const MCInst &MI, unsigned OpNo,
     const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(Expr);
     (void)SRE;
     // Currently these are the only valid PCRelative Relocations.
-    assert((getSpecifier(SRE) == PPCMCExpr::VK_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_TLSGD_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_TLSLD_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_TPREL_PCREL) &&
+    assert((getSpecifier(SRE) == PPC::S_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_TLSGD_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_TLSLD_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_TPREL_PCREL) &&
            "VariantKind must be VK_PCREL or VK_GOT_PCREL or "
            "VK_GOT_TLSGD_PCREL or VK_GOT_TLSLD_PCREL or "
            "VK_GOT_TPREL_PCREL.");
@@ -368,8 +368,8 @@ PPCMCCodeEmitter::getDispRI34PCRelEncoding(const MCInst &MI, unsigned OpNo,
            "Value must fit in 34 bits.");
 
     // Currently these are the only valid PCRelative Relocations.
-    assert((getSpecifier(SRE) == PPCMCExpr::VK_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_PCREL) &&
+    assert((getSpecifier(SRE) == PPC::S_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_PCREL) &&
            "VariantKind must be VK_PCREL or VK_GOT_PCREL");
     // Generate the fixup for the relocation.
     Fixups.push_back(
@@ -433,7 +433,7 @@ unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
   // if using PC relative memops.
   const MCExpr *Expr = MO.getExpr();
   const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(Expr);
-  bool IsPCRel = getSpecifier(SRE) == PPCMCExpr::VK_TLS_PCREL;
+  bool IsPCRel = getSpecifier(SRE) == PPC::S_TLS_PCREL;
   Fixups.push_back(MCFixup::create(IsPCRel ? 1 : 0, Expr,
                                    (MCFixupKind)PPC::fixup_ppc_nofixup));
   const Triple &TT = STI.getTargetTriple();
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 728b6799f94d..49ae6bb5fa45 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -21,6 +21,11 @@ const PPCMCExpr *PPCMCExpr::create(Specifier S, const MCExpr *Expr,
   return new (Ctx) PPCMCExpr(S, Expr);
 }
 
+const PPCMCExpr *PPCMCExpr::create(const MCExpr *Expr, Specifier S,
+                                   MCContext &Ctx) {
+  return new (Ctx) PPCMCExpr(S, Expr);
+}
+
 void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   getSubExpr()->print(OS, MAI);
   OS << '@' << MAI->getSpecifierName(specifier);
@@ -44,23 +49,23 @@ PPCMCExpr::evaluateAsConstant(int64_t &Res) const {
 
 std::optional<int64_t> PPCMCExpr::evaluateAsInt64(int64_t Value) const {
   switch (specifier) {
-  case VK_LO:
+  case PPC::S_LO:
     return Value & 0xffff;
-  case VK_HI:
+  case PPC::S_HI:
     return (Value >> 16) & 0xffff;
-  case VK_HA:
+  case PPC::S_HA:
     return ((Value + 0x8000) >> 16) & 0xffff;
-  case VK_HIGH:
+  case PPC::S_HIGH:
     return (Value >> 16) & 0xffff;
-  case VK_HIGHA:
+  case PPC::S_HIGHA:
     return ((Value + 0x8000) >> 16) & 0xffff;
-  case VK_HIGHER:
+  case PPC::S_HIGHER:
     return (Value >> 32) & 0xffff;
-  case VK_HIGHERA:
+  case PPC::S_HIGHERA:
     return ((Value + 0x8000) >> 32) & 0xffff;
-  case VK_HIGHEST:
+  case PPC::S_HIGHEST:
     return (Value >> 48) & 0xffff;
-  case VK_HIGHESTA:
+  case PPC::S_HIGHESTA:
     return ((Value + 0x8000) >> 48) & 0xffff;
   default:
     return {};
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index 3d0511da2749..814217ea060e 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
 #define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
 
+#include "PPCMCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCValue.h"
 #include <optional>
@@ -18,87 +19,6 @@ namespace llvm {
 class PPCMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = uint16_t;
-  enum {
-    VK_None,
-
-    VK_LO = MCSymbolRefExpr::FirstTargetSpecifier,
-    VK_HI,
-    VK_HA,
-    VK_HIGH,
-    VK_HIGHA,
-    VK_HIGHER,
-    VK_HIGHERA,
-    VK_HIGHEST,
-    VK_HIGHESTA,
-
-    VK_AIX_TLSGD,       // symbol@gd
-    VK_AIX_TLSGDM,      // symbol@m
-    VK_AIX_TLSIE,       // symbol@ie
-    VK_AIX_TLSLD,       // symbol@ld
-    VK_AIX_TLSLE,       // symbol@le
-    VK_AIX_TLSML,       // symbol@ml
-    VK_DTPMOD,          // symbol@dtpmod
-    VK_DTPREL,          // symbol@dprel
-    VK_DTPREL_HA,       // symbol@dtprel@ha
-    VK_DTPREL_HI,       // symbol@dtprel@h
-    VK_DTPREL_HIGH,     // symbol@dtprel@high
-    VK_DTPREL_HIGHA,    // symbol@dtprel@higha
-    VK_DTPREL_HIGHER,   // symbol@dtprel@higher
-    VK_DTPREL_HIGHERA,  // symbol@dtprel@highera
-    VK_DTPREL_HIGHEST,  // symbol@dtprel@highest
-    VK_DTPREL_HIGHESTA, // symbol@dtprel@highesta
-    VK_DTPREL_LO,       // symbol@dtprel@l
-    VK_GOT,             // symbol@got
-    VK_GOT_DTPREL,      // symbol@got@dtprel
-    VK_GOT_DTPREL_HA,   // symbol@got@dtprel@ha
-    VK_GOT_DTPREL_HI,   // symbol@got@dtprel@h
-    VK_GOT_DTPREL_LO,   // symbol@got@dtprel@l
-    VK_GOT_HA,          // symbol@got@ha
-    VK_GOT_HI,          // symbol@got@h
-    VK_GOT_LO,          // symbol@got@l
-    VK_GOT_PCREL,       // symbol@got@pcrel
-    VK_GOT_TLSGD,       // symbol@got@tlsgd
-    VK_GOT_TLSGD_HA,    // symbol@got@tlsgd@ha
-    VK_GOT_TLSGD_HI,    // symbol@got@tlsgd@h
-    VK_GOT_TLSGD_LO,    // symbol@got@tlsgd@l
-    VK_GOT_TLSGD_PCREL, // symbol@got@tlsgd@pcrel
-    VK_GOT_TLSLD,       // symbol@got@tlsld
-    VK_GOT_TLSLD_HA,    // symbol@got@tlsld@ha
-    VK_GOT_TLSLD_HI,    // symbol@got@tlsld@h
-    VK_GOT_TLSLD_LO,    // symbol@got@tlsld@l
-    VK_GOT_TLSLD_PCREL, // symbol@got@tlsld@pcrel
-    VK_GOT_TPREL,       // symbol@got@tprel
-    VK_GOT_TPREL_HA,    // symbol@got@tprel@ha
-    VK_GOT_TPREL_HI,    // symbol@got@tprel@h
-    VK_GOT_TPREL_LO,    // symbol@got@tprel@l
-    VK_GOT_TPREL_PCREL, // symbol@got@tprel@pcrel
-    VK_L,               // symbol@l
-    VK_LOCAL,           // symbol@local
-    VK_NOTOC,           // symbol@notoc
-    VK_PCREL,
-    VK_PCREL_OPT,      // .reloc expr, R_PPC64_PCREL_OPT, expr
-    VK_PLT,            // symbol@plt
-    VK_TLS,            // symbol@tls
-    VK_TLSGD,          // symbol@tlsgd
-    VK_TLSLD,          // symbol@tlsld
-    VK_TLS_PCREL,      // symbol@tls@pcrel
-    VK_TOC,            // symbol@toc
-    VK_TOCBASE,        // symbol@tocbase
-    VK_TOC_HA,         // symbol@toc@ha
-    VK_TOC_HI,         // symbol@toc@h
-    VK_TOC_LO,         // symbol@toc@l
-    VK_TPREL,          // symbol@tprel
-    VK_TPREL_HA,       // symbol@tprel@ha
-    VK_TPREL_HI,       // symbol@tprel@h
-    VK_TPREL_HIGH,     // symbol@tprel@high
-    VK_TPREL_HIGHA,    // symbol@tprel@higha
-    VK_TPREL_HIGHER,   // symbol@tprel@higher
-    VK_TPREL_HIGHERA,  // symbol@tprel@highera
-    VK_TPREL_HIGHEST,  // symbol@tprel@highest
-    VK_TPREL_HIGHESTA, // symbol@tprel@highesta
-    VK_TPREL_LO,       // symbol@tprel@l
-    VK_U,              // symbol@u
-  };
 
 private:
   std::optional<int64_t> evaluateAsInt64(int64_t Value) const;
@@ -109,18 +29,8 @@ private:
 public:
   static const PPCMCExpr *create(Specifier S, const MCExpr *Expr,
                                  MCContext &Ctx);
-
-  static const PPCMCExpr *createLo(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_LO, Expr, Ctx);
-  }
-
-  static const PPCMCExpr *createHi(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HI, Expr, Ctx);
-  }
-
-  static const PPCMCExpr *createHa(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HA, Expr, Ctx);
-  }
+  static const PPCMCExpr *create(const MCExpr *Expr, Specifier S,
+                                 MCContext &Ctx);
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
   bool evaluateAsRelocatableImpl(MCValue &Res,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 03a034182ae1..7f80c101bcc9 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -221,9 +221,9 @@ public:
       // variables. Finally for local-exec and initial-exec, we have a thread
       // pointer, in r13 for 64-bit mode and returned by .__get_tpointer for
       // 32-bit mode.
-      if (Kind == PPCMCExpr::VK_AIX_TLSGD || Kind == PPCMCExpr::VK_AIX_TLSGDM ||
-          Kind == PPCMCExpr::VK_AIX_TLSIE || Kind == PPCMCExpr::VK_AIX_TLSLE ||
-          Kind == PPCMCExpr::VK_AIX_TLSLD || Kind == PPCMCExpr::VK_AIX_TLSML)
+      if (Kind == PPC::S_AIX_TLSGD || Kind == PPC::S_AIX_TLSGDM ||
+          Kind == PPC::S_AIX_TLSIE || Kind == PPC::S_AIX_TLSLE ||
+          Kind == PPC::S_AIX_TLSLD || Kind == PPC::S_AIX_TLSML)
         OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << "@"
            << getContext().getAsmInfo()->getSpecifierName(Kind) << '\n';
       else
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index 1b5fe08bea49..8532f537e2d6 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -61,15 +61,15 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
     switch (Specifier) {
     default:
       report_fatal_error("Unsupported modifier for half16 fixup.");
-    case PPCMCExpr::VK_None:
+    case PPC::S_None:
       return {XCOFF::RelocationType::R_TOC, SignAndSizeForHalf16};
-    case PPCMCExpr::VK_U:
+    case PPC::S_U:
       return {XCOFF::RelocationType::R_TOCU, SignAndSizeForHalf16};
-    case PPCMCExpr::VK_L:
+    case PPC::S_L:
       return {XCOFF::RelocationType::R_TOCL, SignAndSizeForHalf16};
-    case PPCMCExpr::VK_AIX_TLSLE:
+    case PPC::S_AIX_TLSLE:
       return {XCOFF::RelocationType::R_TLS_LE, SignAndSizeForHalf16};
-    case PPCMCExpr::VK_AIX_TLSLD:
+    case PPC::S_AIX_TLSLD:
       return {XCOFF::RelocationType::R_TLS_LD, SignAndSizeForHalf16};
     }
   } break;
@@ -80,13 +80,13 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
     switch (Specifier) {
     default:
       llvm_unreachable("Unsupported Modifier");
-    case PPCMCExpr::VK_None:
+    case PPC::S_None:
       return {XCOFF::RelocationType::R_TOC, 15};
-    case PPCMCExpr::VK_L:
+    case PPC::S_L:
       return {XCOFF::RelocationType::R_TOCL, 15};
-    case PPCMCExpr::VK_AIX_TLSLE:
+    case PPC::S_AIX_TLSLE:
       return {XCOFF::RelocationType::R_TLS_LE, 15};
-    case PPCMCExpr::VK_AIX_TLSLD:
+    case PPC::S_AIX_TLSLD:
       return {XCOFF::RelocationType::R_TLS_LD, 15};
     }
   } break;
@@ -97,7 +97,7 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
   case PPC::fixup_ppc_br24abs:
     return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25};
   case PPC::fixup_ppc_nofixup: {
-    if (Specifier == PPCMCExpr::VK_None)
+    if (Specifier == PPC::S_None)
       return {XCOFF::RelocationType::R_REF, 0};
     else
       llvm_unreachable("Unsupported Modifier");
@@ -110,19 +110,19 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
     switch (Specifier) {
     default:
       report_fatal_error("Unsupported modifier");
-    case PPCMCExpr::VK_AIX_TLSGD:
+    case PPC::S_AIX_TLSGD:
       return {XCOFF::RelocationType::R_TLS, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSGDM:
+    case PPC::S_AIX_TLSGDM:
       return {XCOFF::RelocationType::R_TLSM, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSIE:
+    case PPC::S_AIX_TLSIE:
       return {XCOFF::RelocationType::R_TLS_IE, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSLE:
+    case PPC::S_AIX_TLSLE:
       return {XCOFF::RelocationType::R_TLS_LE, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSLD:
+    case PPC::S_AIX_TLSLD:
       return {XCOFF::RelocationType::R_TLS_LD, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSML:
+    case PPC::S_AIX_TLSML:
       return {XCOFF::RelocationType::R_TLSML, SignAndSizeForFKData};
-    case PPCMCExpr::VK_None:
+    case PPC::S_None:
       return {XCOFF::RelocationType::R_POS, SignAndSizeForFKData};
     }
   }
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 0fe615a95894..8a1357c5fd55 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -108,9 +108,9 @@ template <>
 struct DenseMapInfo<std::pair<const MCSymbol *, PPCMCExpr::Specifier>> {
   using TOCKey = std::pair<const MCSymbol *, PPCMCExpr::Specifier>;
 
-  static inline TOCKey getEmptyKey() { return {nullptr, PPCMCExpr::VK_None}; }
+  static inline TOCKey getEmptyKey() { return {nullptr, PPC::S_None}; }
   static inline TOCKey getTombstoneKey() {
-    return {(const MCSymbol *)1, PPCMCExpr::VK_None};
+    return {(const MCSymbol *)1, PPC::S_None};
   }
   static unsigned getHashValue(const TOCKey &PairVal) {
     return detail::combineHashValue(
@@ -174,9 +174,8 @@ public:
     TOCType_EHBlock
   };
 
-  MCSymbol *
-  lookUpOrCreateTOCEntry(const MCSymbol *Sym, TOCEntryType Type,
-                         PPCMCExpr::Specifier Kind = PPCMCExpr::VK_None);
+  MCSymbol *lookUpOrCreateTOCEntry(const MCSymbol *Sym, TOCEntryType Type,
+                                   PPCMCExpr::Specifier Kind = PPC::S_None);
 
   bool doInitialization(Module &M) override {
     if (!TOC.empty())
@@ -691,13 +690,13 @@ void PPCAsmPrinter::EmitAIXTlsCallHelper(const MachineInstr *MI) {
 /// the current output stream.
 void PPCAsmPrinter::emitTlsCall(const MachineInstr *MI,
                                 PPCMCExpr::Specifier VK) {
-  PPCMCExpr::Specifier Kind = PPCMCExpr::VK_None;
+  PPCMCExpr::Specifier Kind = PPC::S_None;
   unsigned Opcode = PPC::BL8_NOP_TLS;
 
   assert(MI->getNumOperands() >= 3 && "Expecting at least 3 operands from MI");
   if (MI->getOperand(2).getTargetFlags() == PPCII::MO_GOT_TLSGD_PCREL_FLAG ||
       MI->getOperand(2).getTargetFlags() == PPCII::MO_GOT_TLSLD_PCREL_FLAG) {
-    Kind = PPCMCExpr::VK_NOTOC;
+    Kind = PPC::S_NOTOC;
     Opcode = PPC::BL8_NOTOC_TLS;
   }
   const Module *M = MF->getFunction().getParent();
@@ -730,13 +729,13 @@ void PPCAsmPrinter::emitTlsCall(const MachineInstr *MI,
   MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol("__tls_get_addr");
 
   if (Subtarget->is32BitELFABI() && isPositionIndependent())
-    Kind = PPCMCExpr::VK_PLT;
+    Kind = PPC::S_PLT;
 
   const MCExpr *TlsRef = MCSymbolRefExpr::create(
       TlsGetAddr, MCSymbolRefExpr::VariantKind(Kind), OutContext);
 
   // Add 32768 offset to the symbol so we follow up the latest GOT/PLT ABI.
-  if (Kind == PPCMCExpr::VK_PLT && Subtarget->isSecurePlt() &&
+  if (Kind == PPC::S_PLT && Subtarget->isSecurePlt() &&
       M->getPICLevel() == PICLevel::BigPIC)
     TlsRef = MCBinaryExpr::createAdd(
         TlsRef, MCConstantExpr::create(32768, OutContext), OutContext);
@@ -861,7 +860,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   auto getTOCEntryLoadingExprForXCOFF =
       [IsPPC64, getTOCRelocAdjustedExprForXCOFF,
        this](const MCSymbol *MOSymbol, const MCExpr *Expr,
-             PPCMCExpr::Specifier VK = PPCMCExpr::VK_None) -> const MCExpr * {
+             PPCMCExpr::Specifier VK = PPC::S_None) -> const MCExpr * {
     const unsigned EntryByteSize = IsPPC64 ? 8 : 4;
     const auto TOCEntryIter = TOC.find({MOSymbol, VK});
     assert(TOCEntryIter != TOC.end() &&
@@ -886,9 +885,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       assert(MO.isGlobal() && "Only expecting a global MachineOperand here!\n");
       TLSModel::Model Model = TM.getTLSModel(MO.getGlobal());
       if (Model == TLSModel::LocalExec)
-        return PPCMCExpr::VK_AIX_TLSLE;
+        return PPC::S_AIX_TLSLE;
       if (Model == TLSModel::InitialExec)
-        return PPCMCExpr::VK_AIX_TLSIE;
+        return PPC::S_AIX_TLSIE;
       // On AIX, TLS model opt may have turned local-dynamic accesses into
       // initial-exec accesses.
       PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
@@ -896,7 +895,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
           FuncInfo->isAIXFuncUseTLSIEForLD()) {
         LLVM_DEBUG(
             dbgs() << "Current function uses IE access for default LD vars.\n");
-        return PPCMCExpr::VK_AIX_TLSIE;
+        return PPC::S_AIX_TLSIE;
       }
       llvm_unreachable("Only expecting local-exec or initial-exec accesses!");
     }
@@ -904,17 +903,17 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // the variable offset and the other for the region handle). They are
     // differentiated by MO_TLSGD_FLAG and MO_TLSGDM_FLAG.
     if (Flag == PPCII::MO_TLSGDM_FLAG)
-      return PPCMCExpr::VK_AIX_TLSGDM;
+      return PPC::S_AIX_TLSGDM;
     if (Flag == PPCII::MO_TLSGD_FLAG || Flag == PPCII::MO_GOT_TLSGD_PCREL_FLAG)
-      return PPCMCExpr::VK_AIX_TLSGD;
+      return PPC::S_AIX_TLSGD;
     // For local-dynamic TLS access on AIX, we have one TOC entry for the symbol
     // (the variable offset) and one shared TOC entry for the module handle.
     // They are differentiated by MO_TLSLD_FLAG and MO_TLSLDM_FLAG.
     if (Flag == PPCII::MO_TLSLD_FLAG && IsAIX)
-      return PPCMCExpr::VK_AIX_TLSLD;
+      return PPC::S_AIX_TLSLD;
     if (Flag == PPCII::MO_TLSLDM_FLAG && IsAIX)
-      return PPCMCExpr::VK_AIX_TLSML;
-    return PPCMCExpr::VK_None;
+      return PPC::S_AIX_TLSML;
+    return PPC::S_None;
   };
 
   // Lower multi-instruction pseudo operations.
@@ -955,8 +954,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
     const MCExpr *OffsExpr = MCBinaryExpr::createSub(
         MCSymbolRefExpr::create(
-            GOTSymbol, MCSymbolRefExpr::VariantKind(PPCMCExpr::VK_LOCAL),
-            OutContext),
+            GOTSymbol, MCSymbolRefExpr::VariantKind(PPC::S_LOCAL), OutContext),
         MCConstantExpr::create(4, OutContext), OutContext);
 
     // Emit the 'bl'.
@@ -1002,12 +1000,14 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       const MCExpr *DeltaExpr = MCBinaryExpr::createSub(
           MCSymbolRefExpr::create(BaseSymbol, OutContext), PB, OutContext);
 
-      const MCExpr *DeltaHi = PPCMCExpr::createHa(DeltaExpr, OutContext);
+      const MCExpr *DeltaHi =
+          PPCMCExpr::create(DeltaExpr, PPC::S_HA, OutContext);
       EmitToStreamer(
           *OutStreamer,
           MCInstBuilder(PPC::ADDIS).addReg(PICR).addReg(PICR).addExpr(DeltaHi));
 
-      const MCExpr *DeltaLo = PPCMCExpr::createLo(DeltaExpr, OutContext);
+      const MCExpr *DeltaLo =
+          PPCMCExpr::create(DeltaExpr, PPC::S_LO, OutContext);
       EmitToStreamer(
           *OutStreamer,
           MCInstBuilder(PPC::ADDI).addReg(PICR).addReg(PICR).addExpr(DeltaLo));
@@ -1055,7 +1055,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // Create a reference to the GOT entry for the symbol. The GOT entry will be
     // synthesized later.
     if (PL == PICLevel::SmallPIC && !IsAIX) {
-      const MCExpr *Exp = symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_GOT);
+      const MCExpr *Exp = symbolWithSpecifier(MOSymbol, PPC::S_GOT);
       TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
       EmitToStreamer(*OutStreamer, TmpInst);
       return;
@@ -1144,8 +1144,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     MCSymbol *TOCEntry =
         lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
-    PPCMCExpr::Specifier VKExpr =
-        IsAIX ? PPCMCExpr::VK_None : PPCMCExpr::VK_TOC;
+    PPCMCExpr::Specifier VKExpr = IsAIX ? PPC::S_None : PPC::S_TOC;
     const MCExpr *Exp = symbolWithSpecifier(TOCEntry, VKExpr);
     TmpInst.getOperand(1) = MCOperand::createExpr(
         IsAIX ? getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp, VK) : Exp);
@@ -1195,7 +1194,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
     }
 
-    const MCExpr *Exp = symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_U);
+    const MCExpr *Exp = symbolWithSpecifier(MOSymbol, PPC::S_U);
     TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
@@ -1227,7 +1226,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // 'MOSymbol'.
     MCSymbol *TOCEntry =
         lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
-    const MCExpr *Exp = symbolWithSpecifier(TOCEntry, PPCMCExpr::VK_L);
+    const MCExpr *Exp = symbolWithSpecifier(TOCEntry, PPC::S_L);
     TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
@@ -1260,7 +1259,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
         (MO.isCPI() && CM == CodeModel::Large))
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
-    VK = IsAIX ? PPCMCExpr::VK_U : PPCMCExpr::VK_TOC_HA;
+    VK = IsAIX ? PPC::S_U : PPC::S_TOC_HA;
 
     const MCExpr *Exp = symbolWithSpecifier(MOSymbol, VK);
 
@@ -1302,7 +1301,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     if (!MO.isCPI() || CM == CodeModel::Large)
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
-    VK = IsAIX ? PPCMCExpr::VK_L : PPCMCExpr::VK_TOC_LO;
+    VK = IsAIX ? PPC::S_L : PPC::S_TOC_LO;
     const MCExpr *Exp = symbolWithSpecifier(MOSymbol, VK);
     TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
@@ -1332,8 +1331,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
     const MCExpr *Exp = MCSymbolRefExpr::create(
         MOSymbol,
-        MCSymbolRefExpr::VariantKind(IsAIX ? PPCMCExpr::VK_L
-                                           : PPCMCExpr::VK_TOC_LO),
+        MCSymbolRefExpr::VariantKind(IsAIX ? PPC::S_L : PPC::S_TOC_LO),
         OutContext);
 
     TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
@@ -1348,7 +1346,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTprel =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_GOT_TPREL_HA);
+        symbolWithSpecifier(MOSymbol, PPC::S_GOT_TPREL_HA);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addReg(MI->getOperand(1).getReg())
@@ -1365,9 +1363,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(1);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *Exp =
-        symbolWithSpecifier(MOSymbol, IsPPC64 ? PPCMCExpr::VK_GOT_TPREL_LO
-                                              : PPCMCExpr::VK_GOT_TPREL);
+    const MCExpr *Exp = symbolWithSpecifier(
+        MOSymbol, IsPPC64 ? PPC::S_GOT_TPREL_LO : PPC::S_GOT_TPREL);
     TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
@@ -1405,11 +1402,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     MCSymbol *GOTSymbol =
         OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
     const MCExpr *SymGotTlsL = PPCMCExpr::create(
-        PPCMCExpr::VK_LO, MCSymbolRefExpr::create(GOTSymbol, OutContext),
-        OutContext);
+        PPC::S_LO, MCSymbolRefExpr::create(GOTSymbol, OutContext), OutContext);
     const MCExpr *SymGotTlsHA = PPCMCExpr::create(
-        PPCMCExpr::VK_HA, MCSymbolRefExpr::create(GOTSymbol, OutContext),
-        OutContext);
+        PPC::S_HA, MCSymbolRefExpr::create(GOTSymbol, OutContext), OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addExpr(SymGotTlsL));
@@ -1427,7 +1422,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsGD =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_GOT_TLSGD_HA);
+        symbolWithSpecifier(MOSymbol, PPC::S_GOT_TLSGD_HA);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addReg(MI->getOperand(1).getReg())
@@ -1443,9 +1438,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymGotTlsGD =
-        symbolWithSpecifier(MOSymbol, IsPPC64 ? PPCMCExpr::VK_GOT_TLSGD_LO
-                                              : PPCMCExpr::VK_GOT_TLSGD);
+    const MCExpr *SymGotTlsGD = symbolWithSpecifier(
+        MOSymbol, IsPPC64 ? PPC::S_GOT_TLSGD_LO : PPC::S_GOT_TLSGD);
     EmitToStreamer(*OutStreamer,
                    MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI)
                    .addReg(MI->getOperand(0).getReg())
@@ -1470,7 +1464,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case PPC::GETtlsADDR32: {
     // Transform: %r3 = GETtlsADDR32 %r3, @sym
     // Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT
-    emitTlsCall(MI, PPCMCExpr::VK_TLSGD);
+    emitTlsCall(MI, PPC::S_TLSGD);
     return;
   }
   case PPC::GETtlsTpointer32AIX: {
@@ -1487,7 +1481,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsLD =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_GOT_TLSLD_HA);
+        symbolWithSpecifier(MOSymbol, PPC::S_GOT_TLSLD_HA);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addReg(MI->getOperand(1).getReg())
@@ -1503,9 +1497,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymGotTlsLD =
-        symbolWithSpecifier(MOSymbol, IsPPC64 ? PPCMCExpr::VK_GOT_TLSLD_LO
-                                              : PPCMCExpr::VK_GOT_TLSLD);
+    const MCExpr *SymGotTlsLD = symbolWithSpecifier(
+        MOSymbol, IsPPC64 ? PPC::S_GOT_TLSLD_LO : PPC::S_GOT_TLSLD);
     EmitToStreamer(*OutStreamer,
                    MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI)
                        .addReg(MI->getOperand(0).getReg())
@@ -1520,7 +1513,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case PPC::GETtlsldADDR32: {
     // Transform: %r3 = GETtlsldADDR32 %r3, @sym
     // Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT
-    emitTlsCall(MI, PPCMCExpr::VK_TLSLD);
+    emitTlsCall(MI, PPC::S_TLSLD);
     return;
   }
   case PPC::ADDISdtprelHA:
@@ -1532,8 +1525,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymDtprel =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_DTPREL_HA);
+    const MCExpr *SymDtprel = symbolWithSpecifier(MOSymbol, PPC::S_DTPREL_HA);
     EmitToStreamer(
         *OutStreamer,
         MCInstBuilder(IsPPC64 ? PPC::ADDIS8 : PPC::ADDIS)
@@ -1548,8 +1540,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymDtprel =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_DTPREL);
+    const MCExpr *SymDtprel = symbolWithSpecifier(MOSymbol, PPC::S_DTPREL);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::PADDI8)
                                      .addReg(MI->getOperand(0).getReg())
                                      .addReg(MI->getOperand(1).getReg())
@@ -1566,8 +1557,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymDtprel =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_DTPREL_LO);
+    const MCExpr *SymDtprel = symbolWithSpecifier(MOSymbol, PPC::S_DTPREL_LO);
     EmitToStreamer(*OutStreamer,
                    MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI)
                        .addReg(MI->getOperand(0).getReg())
@@ -1737,9 +1727,8 @@ PPCAsmPrinter::getAdjustedFasterLocalExpr(const MachineOperand &MO,
   // assume that the address of extern TLS variables are zero.
   const MCExpr *Expr = MCSymbolRefExpr::create(
       getSymbol(GValue),
-      MCSymbolRefExpr::VariantKind(Model == TLSModel::LocalExec
-                                       ? PPCMCExpr::VK_AIX_TLSLE
-                                       : PPCMCExpr::VK_AIX_TLSLD),
+      MCSymbolRefExpr::VariantKind(
+          Model == TLSModel::LocalExec ? PPC::S_AIX_TLSLE : PPC::S_AIX_TLSLD),
       OutContext);
   Expr = MCBinaryExpr::createAdd(
       Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
@@ -2028,8 +2017,7 @@ void PPCLinuxAsmPrinter::emitFunctionEntryLabel() {
   // Generates a R_PPC64_TOC relocation for TOC base insertion.
   OutStreamer->emitValue(
       MCSymbolRefExpr::create(
-          Symbol2, MCSymbolRefExpr::VariantKind(PPCMCExpr::VK_TOCBASE),
-          OutContext),
+          Symbol2, MCSymbolRefExpr::VariantKind(PPC::S_TOCBASE), OutContext),
       8 /*size*/);
   // Emit a null environment pointer.
   OutStreamer->emitIntValue(0, 8 /* size */);
@@ -2136,13 +2124,15 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() {
         MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext),
                                 GlobalEntryLabelExp, OutContext);
 
-      const MCExpr *TOCDeltaHi = PPCMCExpr::createHa(TOCDeltaExpr, OutContext);
+      const MCExpr *TOCDeltaHi =
+          PPCMCExpr::create(TOCDeltaExpr, PPC::S_HA, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
                                    .addReg(PPC::X2)
                                    .addReg(PPC::X12)
                                    .addExpr(TOCDeltaHi));
 
-      const MCExpr *TOCDeltaLo = PPCMCExpr::createLo(TOCDeltaExpr, OutContext);
+      const MCExpr *TOCDeltaLo =
+          PPCMCExpr::create(TOCDeltaExpr, PPC::S_LO, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
                                    .addReg(PPC::X2)
                                    .addReg(PPC::X2)
@@ -3007,9 +2997,9 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
     // new symbol to prefix the name with a dot.
     // If TLS model opt is turned on, create a new symbol to prefix the name
     // with a dot.
-    if (I.first.second == PPCMCExpr::VK_AIX_TLSGDM ||
+    if (I.first.second == PPC::S_AIX_TLSGDM ||
         (Subtarget->hasAIXShLibTLSModelOpt() &&
-         I.first.second == PPCMCExpr::VK_AIX_TLSLD)) {
+         I.first.second == PPC::S_AIX_TLSLD)) {
       SmallString<128> Name;
       StringRef Prefix = ".";
       Name += Prefix;
diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index 0a04b7fb8d16..f6624ec989ee 100644
--- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -54,31 +54,31 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO,
 static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
                               AsmPrinter &Printer) {
   MCContext &Ctx = Printer.OutContext;
-  PPCMCExpr::Specifier RefKind = PPCMCExpr::VK_None;
+  PPCMCExpr::Specifier RefKind = PPC::S_None;
 
   unsigned access = MO.getTargetFlags();
 
   switch (access) {
     case PPCII::MO_TPREL_LO:
-      RefKind = PPCMCExpr::VK_TPREL_LO;
+      RefKind = PPC::S_TPREL_LO;
       break;
     case PPCII::MO_TPREL_HA:
-      RefKind = PPCMCExpr::VK_TPREL_HA;
+      RefKind = PPC::S_TPREL_HA;
       break;
     case PPCII::MO_DTPREL_LO:
-      RefKind = PPCMCExpr::VK_DTPREL_LO;
+      RefKind = PPC::S_DTPREL_LO;
       break;
     case PPCII::MO_TLSLD_LO:
-      RefKind = PPCMCExpr::VK_GOT_TLSLD_LO;
+      RefKind = PPC::S_GOT_TLSLD_LO;
       break;
     case PPCII::MO_TOC_LO:
-      RefKind = PPCMCExpr::VK_TOC_LO;
+      RefKind = PPC::S_TOC_LO;
       break;
     case PPCII::MO_TLS:
-      RefKind = PPCMCExpr::VK_TLS;
+      RefKind = PPC::S_TLS;
       break;
     case PPCII::MO_TLS_PCREL_FLAG:
-      RefKind = PPCMCExpr::VK_TLS_PCREL;
+      RefKind = PPC::S_TLS_PCREL;
       break;
   }
 
@@ -87,19 +87,19 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
   const MachineFunction *MF = MI->getMF();
 
   if (MO.getTargetFlags() == PPCII::MO_PLT)
-    RefKind = PPCMCExpr::VK_PLT;
+    RefKind = PPC::S_PLT;
   else if (MO.getTargetFlags() == PPCII::MO_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_PCREL;
+    RefKind = PPC::S_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_GOT_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_GOT_PCREL;
+    RefKind = PPC::S_GOT_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_TPREL_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_TPREL;
+    RefKind = PPC::S_TPREL;
   else if (MO.getTargetFlags() == PPCII::MO_GOT_TLSGD_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_GOT_TLSGD_PCREL;
+    RefKind = PPC::S_GOT_TLSGD_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_GOT_TLSLD_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_GOT_TLSLD_PCREL;
+    RefKind = PPC::S_GOT_TLSLD_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_GOT_TPREL_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_GOT_TPREL_PCREL;
+    RefKind = PPC::S_GOT_TPREL_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_TPREL_FLAG ||
            MO.getTargetFlags() == PPCII::MO_TLSLD_FLAG) {
     assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
@@ -110,14 +110,14 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
     // the relocation type in case the result is used for purposes other than a
     // TOC reference. In TOC reference cases, this result is discarded.
     if (Model == TLSModel::LocalExec)
-      RefKind = PPCMCExpr::VK_AIX_TLSLE;
+      RefKind = PPC::S_AIX_TLSLE;
     else if (Model == TLSModel::LocalDynamic &&
              FuncInfo->isAIXFuncUseTLSIEForLD())
       // On AIX, TLS model opt may have turned local-dynamic accesses into
       // initial-exec accesses.
-      RefKind = PPCMCExpr::VK_AIX_TLSIE;
+      RefKind = PPC::S_AIX_TLSIE;
     else if (Model == TLSModel::LocalDynamic)
-      RefKind = PPCMCExpr::VK_AIX_TLSLD;
+      RefKind = PPC::S_AIX_TLSLD;
   }
 
   const Module *M = MF->getFunction().getParent();
@@ -130,10 +130,10 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
     if (MIOpcode == PPC::TAILB || MIOpcode == PPC::TAILB8 ||
         MIOpcode == PPC::TCRETURNdi || MIOpcode == PPC::TCRETURNdi8 ||
         MIOpcode == PPC::BL8_NOTOC || MIOpcode == PPC::BL8_NOTOC_RM) {
-      RefKind = PPCMCExpr::VK_NOTOC;
+      RefKind = PPC::S_NOTOC;
     }
     if (MO.getTargetFlags() == PPCII::MO_PCREL_OPT_FLAG)
-      RefKind = PPCMCExpr::VK_PCREL_OPT;
+      RefKind = PPC::S_PCREL_OPT;
   }
 
   const MCExpr *Expr = MCSymbolRefExpr::create(
@@ -164,11 +164,11 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
   switch (access) {
     case PPCII::MO_LO:
     case PPCII::MO_PIC_LO_FLAG:
-      Expr = PPCMCExpr::createLo(Expr, Ctx);
+      Expr = PPCMCExpr::create(Expr, PPC::S_LO, Ctx);
       break;
     case PPCII::MO_HA:
     case PPCII::MO_PIC_HA_FLAG:
-      Expr = PPCMCExpr::createHa(Expr, Ctx);
+      Expr = PPCMCExpr::create(Expr, PPC::S_HA, Ctx);
       break;
   }
 
diff --git a/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
index 078f4b1effbb..29e4286cf4ad 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPCTargetObjectFile.h"
-#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCMCAsmInfo.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
@@ -49,8 +49,8 @@ MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
 
 const MCExpr *PPC64LinuxTargetObjectFile::
 getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
-  const MCExpr *Expr = MCSymbolRefExpr::create(
-      Sym, MCSymbolRefExpr::VariantKind(PPCMCExpr::VK_DTPREL), getContext());
+  const MCExpr *Expr =
+      MCSymbolRefExpr::create(Sym, PPC::S_DTPREL, getContext());
   return MCBinaryExpr::createAdd(Expr,
                                  MCConstantExpr::create(0x8000, getContext()),
                                  getContext());

From a8d76acdd88b25a98e50ac2da9e6f311fc2c2cb8 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 13:22:52 -0700
Subject: [PATCH 0419/1322] PowerPC: Replace MCExpr::print with
 MCAsmInfo::printExpr

Follow-up to 18b67a7a102c0052e5ae0e76ef1297902ffeb22d
---
 llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp  | 4 ++--
 llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index bd01767f41bd..d587e7d339e8 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -590,7 +590,7 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
   if (Rhs) {
     SmallString<0> Buf;
     raw_svector_ostream Tmp(Buf);
-    Rhs->print(Tmp, &MAI);
+    MAI.printExpr(Tmp, *Rhs);
     if (isdigit(Buf[0]))
       O << '+';
     O << Buf;
@@ -671,5 +671,5 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(Op.isExpr() && "unknown operand kind in printOperand");
-  Op.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *Op.getExpr());
 }
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 7f80c101bcc9..44b5732be6e3 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -255,7 +255,7 @@ public:
     OS << "\t.localentry\t";
     S->print(OS, MAI);
     OS << ", ";
-    LocalOffset->print(OS, MAI);
+    MAI->printExpr(OS, *LocalOffset);
     OS << '\n';
   }
 };

From 087a6ac420ad99c523b9dd517351e0c6d1f1a980 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 15 Jun 2025 21:22:29 +0100
Subject: [PATCH 0420/1322] [LV] Add users to some first-order recurrence
 tests.

Add extra users to ensure the recurrence cannot be DCE'd.

Also re-generates some checks.
---
 .../partial-reduce-dot-product-neon.ll        |  89 +-
 .../first-order-recurrence-scalable-vf1.ll    |  25 +-
 .../first-order-recurrence-chains.ll          | 927 +++++++++++++-----
 .../LoopVectorize/first-order-recurrence.ll   | 263 +++--
 .../scalable-first-order-recurrence.ll        | 851 ++++++++++++++--
 5 files changed, 1697 insertions(+), 458 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index e6687fe767c0..0fc324f720e6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -589,88 +589,100 @@ for.exit:                        ; preds = %for.body
   ret i32 %add
 }
 
-define i32 @not_dotp_not_phi(ptr %a, ptr %b) {
+define i32 @not_dotp_not_phi(ptr %a, ptr noalias %b, ptr noalias %c) {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = mul <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7]] = add <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    store <16 x i32> [[TMP8]], ptr [[TMP10]], align 4
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
 ; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVE1:       scalar.ph:
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
-; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP7]] = add <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVED-NEXT:    store <16 x i32> [[TMP8]], ptr [[TMP10]], align 4
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
 ; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVED:       scalar.ph:
 ;
 ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-MAXBW-NEXT:    [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-MAXBW-NEXT:    [[TMP7]] = add <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; CHECK-MAXBW-NEXT:    store <16 x i32> [[TMP8]], ptr [[TMP10]], align 4
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
-; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
-; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
 ; CHECK-MAXBW-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW:       scalar.ph:
 ;
@@ -688,6 +700,8 @@ for.body:                                         ; preds = %for.body, %entry
   %ext.b = zext i8 %load.b to i32
   %mul = mul i32 %ext.b, %ext.a
   %add = add i32 %mul, %ext.b
+  %gep.c = getelementptr i32, ptr %c, i64 %iv
+  store i32 %accum, ptr %gep.c
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
   br i1 %exitcond.not, label %for.exit, label %for.body
@@ -946,6 +960,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW:       scalar.ph:
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index 1d12f11b20e1..d34098545716 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -6,9 +6,9 @@ target triple = "riscv64-unknown-linux-gnu"
 
 ; Make sure we do not pick <vscale x 1 x i64> as VF for a loop with a
 ; first-order recurrence.
-define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
+define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
 ; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for(
-; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
@@ -17,8 +17,17 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[WIDE_LOAD1]] = load <4 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 4
+; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -31,15 +40,17 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT:    [[L]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[FOR]], ptr [[GEP_DST]], align 8
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 22
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[SCALAR_RECUR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[RES]]
 ;
 entry:
@@ -49,8 +60,10 @@ loop:
   %for = phi i64 [ 0, %entry ], [ %l, %loop ]
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   %iv.next = add i64 %iv, 1
-  %gep = getelementptr inbounds i64, ptr %src, i64 %iv
-  %l = load i64, ptr %gep, align 8
+  %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+  %l = load i64, ptr %gep.src, align 8
+  %gep.dst = getelementptr inbounds i64, ptr %dst, i64 %iv
+  store i64 %for, ptr %gep.dst
   %ec = icmp eq i64 %iv, 22
   br i1 %ec, label %exit, label %loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index 89268ac25c34..0c5784b32fc9 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -1,26 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
 
 define i16 @test_chained_first_order_recurrences_1(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_1
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_1(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    store <4 x i16> [[TMP4]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI2:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI2]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES]]
 ;
 entry:
   br label %loop
@@ -43,26 +71,53 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_2(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_2
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_2(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    store <4 x i16> [[TMP4]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:      middle.block:
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1]] = phi i16 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI2]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES]]
 ;
 entry:
   br label %loop
@@ -85,31 +140,63 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
+; CHECK-NEXT:    store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -135,8 +222,23 @@ exit:
 }
 
 define void @test_cyclic_phis(ptr %ptr) {
-; CHECK-LABEL: @test_cyclic_phis
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define void @test_cyclic_phis(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ 22, %[[ENTRY]] ], [ [[FOR_2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2]] = phi i16 [ 33, %[[ENTRY]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT:%.*]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -158,19 +260,47 @@ exit:
 }
 
 define void @test_first_order_recurrences_incoming_cycle_preheader(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_incoming_cycle_preheader
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP4]], splat (i16 10)
-; CHECK-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define void @test_first_order_recurrences_incoming_cycle_preheader(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[LOOP_1:.*]]
+; CHECK:       [[LOOP_1]]:
+; CHECK-NEXT:    br i1 true, label %[[LOOP_PREHEADER:.*]], label %[[LOOP_1]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i16> [[TMP2]], splat (i16 10)
+; CHECK-NEXT:    store <4 x i16> [[TMP3]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label %middle.block, label %vector.body
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], 10
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop.1
@@ -195,31 +325,63 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3_reordered_1
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_reordered_1(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
+; CHECK-NEXT:    store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -245,31 +407,63 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3_reordered_2
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_reordered_2(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
+; CHECK-NEXT:    store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -295,31 +489,63 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3_for2_no_other_uses
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP4]], splat (i16 10)
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP2]], splat (i16 10)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], 10
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
+; CHECK-NEXT:    store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -345,30 +571,61 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3_for1_for2_no_other_uses
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP6]], splat (i16 10)
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP4]], splat (i16 10)
+; CHECK-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_3]], 10
+; CHECK-NEXT:    store i16 [[ADD_1]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -393,28 +650,56 @@ exit:
 }
 
 define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrence_sink_users_1
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 1.000000e+01>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 2.000000e+01>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
+; CHECK-LABEL: define double @test_chained_first_order_recurrence_sink_users_1(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 1.000000e+01>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 2.000000e+01>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[PTR:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x double> splat (double 1.000000e+01), [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    store <4 x double> [[TMP7]], ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds double, ptr [[PTR]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> splat (double 1.000000e+01), [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; CHECK-NEXT:    br i1 [[TMP9]], label %middle.block, label %vector.body, !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI2:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 1.000000e+01, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 2.000000e+01, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi double [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd double 1.000000e+01, [[FOR_2]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = fadd double [[ADD_1]], [[FOR_1]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds double, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load double, ptr [[GEP_PTR]], align 8
+; CHECK-NEXT:    store double [[ADD_2]], ptr [[GEP_PTR]], align 8
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi double [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI2]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = fadd double [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    ret double [[RES]]
 ;
 entry:
   br label %loop
@@ -438,8 +723,25 @@ exit:
 }
 
 define void @test_first_order_recurrences_and_reduction(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_reduction(
-; CHECK-NOT:   vector.body:
+; CHECK-LABEL: define void @test_first_order_recurrences_and_reduction(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ 22, %[[ENTRY]] ], [ [[RED:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED]] = phi i16 [ 33, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[LV:%.*]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[FOR_1_NEXT:%.*]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], 10
+; CHECK-NEXT:    [[RED_NEXT]] = add i16 [[RED]], [[LV]]
+; CHECK-NEXT:    store i16 [[ADD_1]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -463,24 +765,46 @@ exit:
 }
 
 define i64 @test_first_order_recurrences_and_induction(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_induction(
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 22>, %vector.ph ], [ [[VEC_IND:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_IND]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-LABEL: define i64 @test_first_order_recurrences_and_induction(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 22>, %[[VECTOR_PH]] ], [ [[VEC_IND:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[TMP1]], splat (i64 10)
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
-; CHECK-NEXT:    br i1 true
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[IV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i64 [[FOR_1]], 10
+; CHECK-NEXT:    store i64 [[ADD_1]], ptr [[GEP_PTR]], align 4
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i64 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[FOR_1_LCSSA]]
+;
 
 entry:
   br label %loop
@@ -502,24 +826,45 @@ exit:
 ; Same as @test_first_order_recurrences_and_induction but with order of phis
 ; flipped.
 define i64 @test_first_order_recurrences_and_induction2(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_induction2(
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 22>, %vector.ph ], [ [[VEC_IND]], %vector.body ]
+; CHECK-LABEL: define i64 @test_first_order_recurrences_and_induction2(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 22>, %[[VECTOR_PH]] ], [ [[VEC_IND]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[TMP1]], splat (i64 10)
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
-; CHECK-NEXT:    br i1 true
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i64 [[FOR_1]], 10
+; CHECK-NEXT:    store i64 [[ADD_1]], ptr [[GEP_PTR]], align 4
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i64 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[FOR_1_LCSSA]]
 ;
 entry:
   br label %loop
@@ -539,26 +884,50 @@ exit:
 }
 
 define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_pointer_induction1(
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 4000
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %vector.ph ], [ [[TMP0:%.*]], %vector.body ]
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
+; CHECK-LABEL: define ptr @test_first_order_recurrences_and_pointer_induction1(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 4000
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %[[VECTOR_PH]] ], [ [[TMP0:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0
 ; CHECK-NEXT:    store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    store <4 x ptr> [[TMP1]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 true
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ null, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[PTR]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[PTR_IV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[PTR_IV]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    store ptr [[PTR_IV]], ptr [[GEP_PTR]], align 8
+; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i32, ptr [[PTR_IV]], i64 1
+; CHECK-NEXT:    store ptr [[FOR_1]], ptr [[GEP_PTR]], align 8
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi ptr [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret ptr [[FOR_1_LCSSA]]
 ;
 entry:
   br label %loop
@@ -571,6 +940,7 @@ loop:
   %gep.ptr = getelementptr inbounds ptr, ptr %ptr, i64 %iv
   store ptr %ptr.iv, ptr %gep.ptr
   %ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1
+  store ptr %for.1, ptr %gep.ptr
   %exitcond.not = icmp eq i64 %iv.next, 1000
   br i1 %exitcond.not, label %exit, label %loop
 
@@ -581,26 +951,50 @@ exit:
 ; same as @test_first_order_recurrences_and_pointer_induction1 but with order
 ; of phis flipped.
 define ptr @test_first_order_recurrences_and_pointer_induction2(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_pointer_induction2(
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 4000
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %vector.ph ], [ [[TMP0:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP0]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-LABEL: define ptr @test_first_order_recurrences_and_pointer_induction2(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 4000
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %[[VECTOR_PH]] ], [ [[VECTOR_GEP:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_GEP]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[VECTOR_GEP]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    store <4 x ptr> [[TMP1]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 4
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1000
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[PTR]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ null, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[PTR_IV]], %[[LOOP]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    store ptr [[PTR_IV]], ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i32, ptr [[PTR_IV]], i64 1
+; CHECK-NEXT:    store ptr [[FOR_1]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 true
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi ptr [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret ptr [[FOR_1_LCSSA]]
 ;
 entry:
   br label %loop
@@ -613,6 +1007,7 @@ loop:
   %gep.ptr = getelementptr inbounds ptr, ptr %ptr, i64 %iv
   store ptr %ptr.iv, ptr %gep.ptr
   %ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1
+  store ptr %for.1, ptr %gep.ptr
   %exitcond.not = icmp eq i64 %iv.next, 1000
   br i1 %exitcond.not, label %exit, label %loop
 
@@ -623,39 +1018,64 @@ exit:
 ; In this test case, %USE_2_FORS uses 2 different fixed-order recurrences and
 ; it needs to be sunk past the previous value for both recurrences.
 define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
-; CHECK-LABEL: @test_resinking_required(
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[BROADCAST_SPLAT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[BROADCAST_SPLAT4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr %a, align 8
+; CHECK-LABEL: define double @test_resinking_required(
+; CHECK-SAME: ptr [[P:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %[[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %[[VECTOR_PH]] ], [ [[BROADCAST_SPLAT4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[A]], align 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr %b, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[B]], align 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x double> poison, double [[TMP3]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT3]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[BROADCAST_SPLAT4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
-; CHECK-NEXT:    store double [[TMP6]], ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR2]], <4 x double> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP8]], i32 3
+; CHECK-NEXT:    store double [[TMP6]], ptr [[P]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
-; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI10:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
-; CHECK-NEXT:    br i1 true, label %End, label %scalar.ph
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    phi double [ [[TMP0]], %middle.block ], [ 0.000000e+00, %Entry ]
-; CHECK-NEXT:    phi double [ [[TMP3]], %middle.block ], [ 0.000000e+00, %Entry ]
-; CHECK-NEXT:    phi double [ [[VECTOR_RECUR_EXTRACT9]], %middle.block ], [ 0.000000e+00, %Entry ]
-; CHECK-NEXT:    %bc.resume.val = phi i64 [ 1000, %middle.block ], [ 0, %Entry ]
-; CHECK:      End:
-; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP0]], %middle.block ]
-; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP3]], %middle.block ]
-; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI10]], %middle.block ]
+; CHECK-NEXT:    br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT5:%.*]] = phi double [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT6:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi double [ [[L1:%.*]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi double [ [[L2:%.*]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT5]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT6]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[USE_2_FORS:%.*]] = fdiv double [[FOR_3]], [[FOR_1]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double 0.000000e+00, [[FOR_1]]
+; CHECK-NEXT:    [[L1]] = load double, ptr [[A]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[L2]] = load double, ptr [[B]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[DIV]], [[FOR_3]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[P]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi double [ [[FOR_1]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi double [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = fadd double [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = fadd double [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret double [[RES_2]]
 ;
 Entry:
   br label %Loop
@@ -670,7 +1090,8 @@ Loop:
   %l1 = load double, ptr %a, align 8
   %iv.next= add nuw nsw i64 %iv, 1
   %l2 = load double, ptr %b, align 8
-  store double %div, ptr %p, align 8
+  %add = fadd double %div, %for.3
+  store double %add, ptr %p, align 8
   %cond = icmp eq i64 %iv.next, 1000
   br i1 %cond, label %End, label %Loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 594b8ff70feb..d28db1c77efa 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -870,7 +870,7 @@ for.end:
 ; }
 ;
 ;
-define i32 @PR27246() {
+define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC-LABEL: @PR27246(
 ; UNROLL-NO-IC-NEXT:  entry:
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
@@ -882,21 +882,25 @@ define i32 @PR27246() {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
-; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[FOR_COND1]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND1]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 -4)
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
@@ -904,19 +908,21 @@ define i32 @PR27246() {
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND2:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup:
 ; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; UNROLL-NO-IC-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; UNROLL-NO-IC:       for.cond1:
-; UNROLL-NO-IC-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
 ; UNROLL-NO-IC-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; UNROLL-NO-IC-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; UNROLL-NO-IC-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
+; UNROLL-NO-IC-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL-NO-IC:       for.cond.cleanup3:
-; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; UNROLL-NO-IC-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -932,34 +938,39 @@ define i32 @PR27246() {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[FOR_COND1]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]]
-; UNROLL-NO-VF-NEXT:    [[TMP1]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
+; UNROLL-NO-VF-NEXT:    [[TMP3]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP2]], ptr [[TMP1]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND2:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup:
 ; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; UNROLL-NO-VF:       for.cond1:
-; UNROLL-NO-VF-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; UNROLL-NO-VF-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; UNROLL-NO-VF-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; UNROLL-NO-VF-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
 ; UNROLL-NO-VF-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; UNROLL-NO-VF-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL-NO-VF:       for.cond.cleanup3:
-; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; UNROLL-NO-VF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; UNROLL-NO-VF-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -975,20 +986,24 @@ define i32 @PR27246() {
 ; SINK-AFTER:       vector.ph:
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; SINK-AFTER-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3
 ; SINK-AFTER-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; SINK-AFTER-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
-; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
 ; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[FOR_COND1]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND1]] ]
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; SINK-AFTER-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
-; SINK-AFTER-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SINK-AFTER-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SINK-AFTER-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
@@ -996,19 +1011,21 @@ define i32 @PR27246() {
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; SINK-AFTER-NEXT:    br label [[FOR_COND2:%.*]]
 ; SINK-AFTER:       for.cond.cleanup:
 ; SINK-AFTER-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; SINK-AFTER-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; SINK-AFTER:       for.cond1:
-; SINK-AFTER-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; SINK-AFTER-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; SINK-AFTER-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; SINK-AFTER-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
 ; SINK-AFTER-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; SINK-AFTER-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; SINK-AFTER-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; SINK-AFTER-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
+; SINK-AFTER-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; SINK-AFTER:       for.cond.cleanup3:
-; SINK-AFTER-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
 ; SINK-AFTER-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; SINK-AFTER-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; SINK-AFTER-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -1028,8 +1045,10 @@ for.cond.cleanup:
 for.cond1:
   %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
   %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
+  %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %i.016
   %cmp2 = icmp sgt i32 %k.0, 1
   %dec = add nsw i32 %k.0, -1
+  store i32 %e.1, ptr %gep.dst
   br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
 
 for.cond.cleanup3:
@@ -1056,7 +1075,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10
@@ -1074,10 +1093,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP17]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP22]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP18]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP16]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP17]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP24]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP18]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1160,7 +1179,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; SINK-AFTER-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2
@@ -1178,10 +1197,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
 ; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
-; SINK-AFTER-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
-; SINK-AFTER-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 1
-; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP17]], i32 2
-; SINK-AFTER-NEXT:    [[TMP22]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP18]], i32 3
+; SINK-AFTER-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
+; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP16]], i32 1
+; SINK-AFTER-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP17]], i32 2
+; SINK-AFTER-NEXT:    [[TMP24]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP18]], i32 3
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1371,19 +1390,19 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT:    [[TMP3]] = add i32 [[TMP1]], [[X]]
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i32 [[VECTOR_RECUR]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP4]] = add i32 [[TMP0]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[TMP3]] = add nuw i32 [[VECTOR_RECUR]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 96
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
@@ -2650,7 +2669,7 @@ for.end:
   ret void
 }
 
-define i32 @sink_into_replication_region(i32 %y) {
+define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-IC-LABEL: @sink_into_replication_region(
 ; UNROLL-NO-IC-NEXT:  bb:
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i32 [[Y:%.*]], 1
@@ -2741,18 +2760,74 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC:       pred.udiv.continue18:
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP37]], [[PRED_UDIV_IF17]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE21:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if19:
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE21]]
 ; UNROLL-NO-IC:       pred.udiv.continue20:
 ; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP42]], [[PRED_UDIV_IF19]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
 ; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI1]], [[TMP45]]
+; UNROLL-NO-IC-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP64]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NO-IC:       pred.store.if:
+; UNROLL-NO-IC-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[TMP44]], i32 0
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP65]], ptr [[DST:%.*]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; UNROLL-NO-IC:       pred.store.continue:
+; UNROLL-NO-IC-NEXT:    [[TMP66:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; UNROLL-NO-IC:       pred.store.if21:
+; UNROLL-NO-IC-NEXT:    [[TMP67:%.*]] = extractelement <4 x i32> [[TMP44]], i32 1
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP67]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; UNROLL-NO-IC:       pred.store.continue22:
+; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP52]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; UNROLL-NO-IC:       pred.store.if23:
+; UNROLL-NO-IC-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP44]], i32 2
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP53]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE24]]
+; UNROLL-NO-IC:       pred.store.continue24:
+; UNROLL-NO-IC-NEXT:    [[TMP54:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; UNROLL-NO-IC:       pred.store.if25:
+; UNROLL-NO-IC-NEXT:    [[TMP55:%.*]] = extractelement <4 x i32> [[TMP44]], i32 3
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP55]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE26]]
+; UNROLL-NO-IC:       pred.store.continue26:
+; UNROLL-NO-IC-NEXT:    [[TMP56:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP56]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; UNROLL-NO-IC:       pred.store.if27:
+; UNROLL-NO-IC-NEXT:    [[TMP57:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP57]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE28]]
+; UNROLL-NO-IC:       pred.store.continue28:
+; UNROLL-NO-IC-NEXT:    [[TMP58:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP58]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; UNROLL-NO-IC:       pred.store.if29:
+; UNROLL-NO-IC-NEXT:    [[TMP59:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP59]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE30]]
+; UNROLL-NO-IC:       pred.store.continue30:
+; UNROLL-NO-IC-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
+; UNROLL-NO-IC:       pred.store.if31:
+; UNROLL-NO-IC-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP61]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE32]]
+; UNROLL-NO-IC:       pred.store.continue32:
+; UNROLL-NO-IC-NEXT:    [[TMP62:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP62]], label [[PRED_STORE_IF33:%.*]], label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC:       pred.store.if33:
+; UNROLL-NO-IC-NEXT:    [[TMP63:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP63]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC:       pred.store.continue34:
 ; UNROLL-NO-IC-NEXT:    [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -2777,6 +2852,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; UNROLL-NO-IC-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-IC-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
+; UNROLL-NO-IC-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; UNROLL-NO-IC-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2808,15 +2884,25 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.udiv.continue:
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
 ; UNROLL-NO-VF:       pred.udiv.if3:
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = udiv i32 219220132, [[TMP7]]
-; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
 ; UNROLL-NO-VF:       pred.udiv.continue4:
 ; UNROLL-NO-VF-NEXT:    [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]]
 ; UNROLL-NO-VF-NEXT:    [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP6]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NO-VF:       pred.store.if:
+; UNROLL-NO-VF-NEXT:    store i32 [[VECTOR_RECUR]], ptr [[DST:%.*]], align 4
+; UNROLL-NO-VF-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; UNROLL-NO-VF:       pred.store.continue:
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF5:%.*]], label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF:       pred.store.if5:
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP6]], ptr [[DST]], align 4
+; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF:       pred.store.continue6:
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP10]], i32 [[VEC_PHI]]
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2840,6 +2926,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; UNROLL-NO-VF-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-VF-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
+; UNROLL-NO-VF-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; UNROLL-NO-VF-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2894,16 +2981,44 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; SINK-AFTER:       pred.udiv.continue6:
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP16]], [[PRED_UDIV_IF5]] ]
 ; SINK-AFTER-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8]]
+; SINK-AFTER-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; SINK-AFTER:       pred.udiv.if7:
 ; SINK-AFTER-NEXT:    [[TMP19:%.*]] = add i32 [[OFFSET_IDX]], -3
 ; SINK-AFTER-NEXT:    [[TMP20:%.*]] = udiv i32 219220132, [[TMP19]]
 ; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i32 3
-; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; SINK-AFTER:       pred.udiv.continue8:
 ; SINK-AFTER-NEXT:    [[TMP22]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ]
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]]
+; SINK-AFTER-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; SINK-AFTER-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; SINK-AFTER:       pred.store.if:
+; SINK-AFTER-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
+; SINK-AFTER-NEXT:    store i32 [[TMP34]], ptr [[DST:%.*]], align 4
+; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; SINK-AFTER:       pred.store.continue:
+; SINK-AFTER-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; SINK-AFTER-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; SINK-AFTER:       pred.store.if9:
+; SINK-AFTER-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
+; SINK-AFTER-NEXT:    store i32 [[TMP28]], ptr [[DST]], align 4
+; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; SINK-AFTER:       pred.store.continue10:
+; SINK-AFTER-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; SINK-AFTER-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; SINK-AFTER:       pred.store.if11:
+; SINK-AFTER-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
+; SINK-AFTER-NEXT:    store i32 [[TMP30]], ptr [[DST]], align 4
+; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; SINK-AFTER:       pred.store.continue12:
+; SINK-AFTER-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; SINK-AFTER-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF13:%.*]], label [[PRED_UDIV_CONTINUE8]]
+; SINK-AFTER:       pred.store.if13:
+; SINK-AFTER-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
+; SINK-AFTER-NEXT:    store i32 [[TMP32]], ptr [[DST]], align 4
+; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; SINK-AFTER:       pred.store.continue14:
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2926,6 +3041,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; SINK-AFTER-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; SINK-AFTER-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; SINK-AFTER-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
+; SINK-AFTER-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; SINK-AFTER-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; SINK-AFTER-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2943,6 +3059,7 @@ bb:
   %var6 = add i32 %var5, %var4
   %var7 = udiv i32 219220132, %var3
   %var8 = add nsw i32 %var3, -1
+  store i32 %var4, ptr %dst
   %var9 = icmp slt i32 %var3, 2
   br i1 %var9, label %bb1, label %bb2, !prof !2
 }
@@ -3430,28 +3547,28 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]]
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]]
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP7]] = zext i16 [[TMP5]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP10]] = zext i16 [[TMP5]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP8]], align 4
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP9]], align 4
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP7]] = add nuw i32 [[VECTOR_RECUR]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP7]], 16
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-VF:       loop:
 ; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 59727aeb8249..e1b264620261 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "(for.body:|scalar.body:)" --filter-out-after "for.body:" --version 5
 ; RUN: opt -passes=loop-vectorize -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -force-target-supports-scalable-vectors=true -S < %s | FileCheck %s --check-prefix=CHECK-VF4UF1
 ; RUN: opt -passes=loop-vectorize -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=2 -force-target-supports-scalable-vectors=true -S < %s | FileCheck %s --check-prefix=CHECK-VF4UF2
 
@@ -7,30 +8,150 @@
 ; }
 ;
 define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
-; CHECK-VF4UF1-LABEL: @recurrence_1
-; CHECK-VF4UF1: for.preheader
-; CHECK-VF4UF1: %[[SUB_1:.*]] = add i32 %n, -1
-; CHECK-VF4UF1: %[[ZEXT:.*]] = zext i32 %[[SUB_1]] to i64
-; CHECK-VF4UF1: %[[ADD:.*]] = add nuw nsw i64 %[[ZEXT]], 1
-; CHECK-VF4UF1: vector.ph:
-; CHECK-VF4UF1: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], 4
-; CHECK-VF4UF1: %[[SUB1:.*]] = sub i32 %[[MUL1]], 1
-; CHECK-VF4UF1: %[[VEC_RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %pre_load, i32 %[[SUB1]]
-; CHECK-VF4UF1: vector.body:
-; CHECK-VF4UF1: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %[[NEXT_IDX:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
-; CHECK-VF4UF1: middle.block:
-; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF1: %[[SUB3:.*]] = sub i32 %[[MUL2]], 2
-; CHECK-VF4UF1: %[[VEC_RECUR_FOR_PHI:.*]] =  extractelement <vscale x 4 x i32> %[[LOAD]], i32 %[[SUB3]]
-; CHECK-VF4UF1: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4
-; CHECK-VF4UF1: %[[SUB3:.*]] = sub i32 %[[MUL3]], 1
-; CHECK-VF4UF1: %[[VEC_RECUR_EXT:.*]] = extractelement <vscale x 4 x i32> %[[LOAD]], i32 %[[SUB3]]
+; CHECK-VF4UF1-LABEL: define i32 @recurrence_1(
+; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF1-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-VF4UF1-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_PREHEADER:.*]]
+; CHECK-VF4UF1:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF1-NEXT:    [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF1:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = add i64 [[B1]], -4
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
+; CHECK-VF4UF1-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP9]], [[TMP7]]
+; CHECK-VF4UF1-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP11]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[PRE_LOAD]], i32 [[TMP16]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; CHECK-VF4UF1-NEXT:    [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP20]]
+; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
+; CHECK-VF4UF1-NEXT:    store <vscale x 4 x i32> [[TMP22]], ptr [[TMP23]], align 4
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-VF4UF1-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP26]], 2
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP27]]
+; CHECK-VF4UF1-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP29]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP30]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], %[[FOR_PREHEADER]] ], [ [[PRE_LOAD]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF1:       [[SCALAR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define i32 @recurrence_1(
+; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF2-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-VF4UF2-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_PREHEADER:.*]]
+; CHECK-VF4UF2:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF2-NEXT:    [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF2:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = add i64 [[B1]], -4
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
+; CHECK-VF4UF2-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP9]], [[TMP7]]
+; CHECK-VF4UF2-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP11]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[PRE_LOAD]], i32 [[TMP16]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD3]] = load <vscale x 4 x i32>, ptr [[TMP22]], align 4
+; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD3]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4UF2-NEXT:    [[TMP26:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP23]]
+; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD3]], [[TMP24]]
+; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP30]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP26]], ptr [[TMP28]], align 4
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP27]], ptr [[TMP31]], align 4
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-VF4UF2-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP33]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], 2
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD3]], i32 [[TMP35]]
+; CHECK-VF4UF2-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul i32 [[TMP36]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD3]], i32 [[TMP38]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], %[[FOR_PREHEADER]] ], [ [[PRE_LOAD]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF2:       [[SCALAR_BODY]]:
+;
 entry:
   br label %for.preheader
 
@@ -63,21 +184,142 @@ for.exit:
 ; }
 ;
 define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
-; CHECK-VF4UF1-LABEL: @recurrence_2
-; CHECK-VF4UF1: vector.ph:
-; CHECK-VF4UF1: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], 4
-; CHECK-VF4UF1: %[[SUB1:.*]] = sub i32 %[[MUL1]], 1
-; CHECK-VF4UF1: %[[VEC_RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %.pre, i32 %[[SUB1]]
-; CHECK-VF4UF1: vector.body:
-; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[REVERSE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
-; CHECK-VF4UF1: middle.block:
-; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF1: %[[SUB2:.*]] = sub i32 %[[MUL2]], 1
-; CHECK-VF4UF1: %[[VEC_RECUR_EXT:.*]] = extractelement <vscale x 4 x i32> %[[LOAD]], i32 %[[SUB2]]
+; CHECK-VF4UF1-LABEL: define i32 @recurrence_2(
+; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF1-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK-VF4UF1:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF1-NEXT:    [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
+; CHECK-VF4UF1-NEXT:    [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DOTPRE]], i32 [[TMP9]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-VF4UF1-NEXT:    [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP12]]
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = icmp sgt <vscale x 4 x i32> [[TMP13]], zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i32> [[VEC_PHI]], [[TMP15]]
+; CHECK-VF4UF1-NEXT:    [[TMP17]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP15]]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP17]])
+; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP22]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF1:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-VF4UF1-NEXT:    [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK-VF4UF1:       [[FOR_COND_CLEANUP]]:
+; CHECK-VF4UF1-NEXT:    [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-VF4UF1-NEXT:    ret i32 [[MINMAX_0_LCSSA]]
+; CHECK-VF4UF1:       [[SCALAR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define i32 @recurrence_2(
+; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF2-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK-VF4UF2:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF2-NEXT:    [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
+; CHECK-VF4UF2-NEXT:    [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DOTPRE]], i32 [[TMP9]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD2]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD2]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = sub nsw <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP15]]
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = sub nsw <vscale x 4 x i32> [[WIDE_LOAD2]], [[TMP16]]
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[TMP17]], zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = icmp sgt <vscale x 4 x i32> [[TMP18]], zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = select <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = icmp slt <vscale x 4 x i32> [[VEC_PHI]], [[TMP21]]
+; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = icmp slt <vscale x 4 x i32> [[VEC_PHI1]], [[TMP22]]
+; CHECK-VF4UF2-NEXT:    [[TMP25]] = select <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP21]]
+; CHECK-VF4UF2-NEXT:    [[TMP26]] = select <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> [[TMP22]]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[RDX_MINMAX:%.*]] = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> [[TMP25]], <vscale x 4 x i32> [[TMP26]])
+; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[RDX_MINMAX]])
+; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD2]], i32 [[TMP31]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF2:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-VF4UF2-NEXT:    [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK-VF4UF2:       [[FOR_COND_CLEANUP]]:
+; CHECK-VF4UF2-NEXT:    [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-VF4UF2-NEXT:    ret i32 [[MINMAX_0_LCSSA]]
+; CHECK-VF4UF2:       [[SCALAR_BODY]]:
+;
 entry:
   %cmp27 = icmp sgt i32 %n, 0
   br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
@@ -113,23 +355,180 @@ scalar.body:
 }
 
 define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, float %f, i16 %p) {
-; CHECK-VF4UF1: vector.ph:
-; CHECK-VF4UF1: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], 4
-; CHECK-VF4UF1: %[[SUB1:.*]] = sub i32 %[[MUL1]], 1
-; CHECK-VF4UF1: %vector.recur.init = insertelement <vscale x 4 x i16> poison, i16 %0, i32 %[[SUB1]]
-; CHECK-VF4UF1: vector.body:
-; CHECK-VF4UF1: %vector.recur = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[L1:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[L1]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %vector.recur, <vscale x 4 x i16> %[[L1]], i32 -1)
+; CHECK-VF4UF1-LABEL: define void @recurrence_3(
+; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]], float [[F:%.*]], i16 [[P:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-VF4UF1-NEXT:    [[CONV:%.*]] = sitofp i16 [[TMP0]] to double
+; CHECK-VF4UF1-NEXT:    [[CONV1:%.*]] = fpext float [[F]] to double
+; CHECK-VF4UF1-NEXT:    [[CONV2:%.*]] = sitofp i16 [[P]] to double
+; CHECK-VF4UF1-NEXT:    [[MUL:%.*]] = fmul fast double [[CONV2]], [[CONV1]]
+; CHECK-VF4UF1-NEXT:    [[SUB:%.*]] = fsub fast double [[CONV]], [[MUL]]
+; CHECK-VF4UF1-NEXT:    store double [[SUB]], ptr [[B]], align 8
+; CHECK-VF4UF1-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[N]], 1
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP25]], label %[[FOR_PREHEADER:.*]], [[FOR_END:label %.*]]
+; CHECK-VF4UF1:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF1:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = add i32 [[N]], -2
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 16
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 2
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP6]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 4
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-VF4UF1-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]]
+; CHECK-VF4UF1-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]]
+; CHECK-VF4UF1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP12]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = add i64 1, [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x double> poison, double [[CONV1]], i64 0
+; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x double> [[BROADCAST_SPLATINSERT]], <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP17]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP0]], i32 [[TMP18]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0
+; CHECK-VF4UF1-NEXT:    [[WIDE_LOAD]] = load <vscale x 4 x i16>, ptr [[TMP20]], align 2, !alias.scope [[META6:![0-9]+]]
+; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = sitofp <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x double>
+; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = sitofp <vscale x 4 x i16> [[TMP21]] to <vscale x 4 x double>
+; CHECK-VF4UF1-NEXT:    [[TMP24:%.*]] = fmul fast <vscale x 4 x double> [[TMP23]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF1-NEXT:    [[TMP25:%.*]] = fsub fast <vscale x 4 x double> [[TMP22]], [[TMP24]]
+; CHECK-VF4UF1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4UF1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i32 0
+; CHECK-VF4UF1-NEXT:    store <vscale x 4 x double> [[TMP25]], ptr [[TMP27]], align 8, !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-VF4UF1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP31]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ 1, %[[FOR_PREHEADER]] ], [ 1, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF1:       [[SCALAR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define void @recurrence_3(
+; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]], float [[F:%.*]], i16 [[P:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-VF4UF2-NEXT:    [[CONV:%.*]] = sitofp i16 [[TMP0]] to double
+; CHECK-VF4UF2-NEXT:    [[CONV1:%.*]] = fpext float [[F]] to double
+; CHECK-VF4UF2-NEXT:    [[CONV2:%.*]] = sitofp i16 [[P]] to double
+; CHECK-VF4UF2-NEXT:    [[MUL:%.*]] = fmul fast double [[CONV2]], [[CONV1]]
+; CHECK-VF4UF2-NEXT:    [[SUB:%.*]] = fsub fast double [[CONV]], [[MUL]]
+; CHECK-VF4UF2-NEXT:    store double [[SUB]], ptr [[B]], align 8
+; CHECK-VF4UF2-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[N]], 1
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP25]], label %[[FOR_PREHEADER:.*]], [[FOR_END:label %.*]]
+; CHECK-VF4UF2:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF2:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = add i32 [[N]], -2
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 16
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 2
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP6]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 4
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-VF4UF2-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]]
+; CHECK-VF4UF2-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]]
+; CHECK-VF4UF2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP12]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = add i64 1, [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x double> poison, double [[CONV1]], i64 0
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x double> [[BROADCAST_SPLATINSERT]], <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP17]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP0]], i32 [[TMP18]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i64 [[TMP22]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP20]], align 2, !alias.scope [[META6:![0-9]+]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD4]] = load <vscale x 4 x i16>, ptr [[TMP23]], align 2, !alias.scope [[META6]]
+; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP25:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16> [[WIDE_LOAD4]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP26:%.*]] = sitofp <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x double>
+; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = sitofp <vscale x 4 x i16> [[WIDE_LOAD4]] to <vscale x 4 x double>
+; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = sitofp <vscale x 4 x i16> [[TMP24]] to <vscale x 4 x double>
+; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = sitofp <vscale x 4 x i16> [[TMP25]] to <vscale x 4 x double>
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = fmul fast <vscale x 4 x double> [[TMP28]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = fmul fast <vscale x 4 x double> [[TMP29]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF2-NEXT:    [[TMP32:%.*]] = fsub fast <vscale x 4 x double> [[TMP26]], [[TMP30]]
+; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = fsub fast <vscale x 4 x double> [[TMP27]], [[TMP31]]
+; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4UF2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul i64 [[TMP36]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i64 [[TMP37]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x double> [[TMP32]], ptr [[TMP35]], align 8, !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x double> [[TMP33]], ptr [[TMP38]], align 8, !alias.scope [[META9]], !noalias [[META6]]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-VF4UF2-NEXT:    [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP39]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP40:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP41:%.*]] = mul i32 [[TMP40]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP42:%.*]] = sub i32 [[TMP41]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD4]], i32 [[TMP42]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ 1, %[[FOR_PREHEADER]] ], [ 1, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF2:       [[SCALAR_BODY]]:
+;
 ; Check also that the casts were not moved needlessly.
-; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[L1]] to <vscale x 4 x double>
-; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x double>
-; CHECK-VF4UF1: middle.block:
-; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF1: %[[SUB2:.*]] = sub i32 %[[MUL2]], 1
-; CHECK-VF4UF1: %vector.recur.extract = extractelement <vscale x 4 x i16> %[[L1]], i32 %[[SUB2]]
 entry:
   %0 = load i16, ptr %a, align 2
   %conv = sitofp i16 %0 to double
@@ -168,10 +567,72 @@ for.end:
 }
 
 define i64 @constant_folded_previous_value() {
-; CHECK-VF4UF2-LABEL: @constant_folded_previous_value
-; CHECK-VF4UF2: vector.body
-; CHECK-VF4UF2: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i64> [ %vector.recur.init, %vector.ph ], [ splat (i64 1), %vector.body ]
-; CHECK-VF4UF2: br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK-VF4UF1-LABEL: define i64 @constant_folded_previous_value() {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 0, i32 [[TMP8]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ splat (i64 1), %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF1:       [[SCALAR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define i64 @constant_folded_previous_value() {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 0, i32 [[TMP8]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ splat (i64 1), %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF2:       [[SCALAR_BODY]]:
+;
 entry:
   br label %scalar.body
 
@@ -180,7 +641,7 @@ scalar.body:
   %tmp2 = phi i64 [ 0, %entry ], [ %tmp3, %scalar.body ]
   %tmp3 = add i64 0, 1
   %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, undef
+  %cond = icmp eq i64 %i.next, 1000
   br i1 %cond, label %for.end, label %scalar.body, !llvm.loop !0
 
 for.end:
@@ -193,28 +654,113 @@ for.end:
 ; the first order recurrence phi is used outside the loop, so we require the phi
 ; itself and not its update (addx).
 define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
-; CHECK-VF4UF2-LABEL: @extract_second_last_iteration
-; CHECK-VF4UF2: vector.ph
-; CHECK-VF4UF2: call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: %[[SPLAT_INS1:.*]] = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
-; CHECK-VF4UF2: %[[SPLAT1:.*]] = shufflevector <vscale x 4 x i32> %[[SPLAT_INS1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], 4
-; CHECK-VF4UF2: %[[SUB1:.*]] = sub i32 %[[MUL1]], 1
-; CHECK-VF4UF2: %[[VEC_RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 %[[SUB1]]
-; ; CHECK-VF4UF2: vector.body
-; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[ADD2:.*]], %vector.body ]
-; CHECK-VF4UF2: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[SPLAT1]]
-; CHECK-VF4UF2: middle.block
-; CHECK-VF4UF2: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF2: %[[SUB3:.*]] = sub i32 %[[MUL2]], 2
-; CHECK-VF4UF2: %vector.recur.extract.for.phi = extractelement <vscale x 4 x i32> %[[ADD2]], i32 %[[SUB3]]
-; CHECK-VF4UF2: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4
-; CHECK-VF4UF2: %[[SUB2:.*]] = sub i32 %[[MUL3]], 1
-; CHECK-VF4UF2: %vector.recur.extract = extractelement <vscale x 4 x i32> %[[ADD2]], i32 %[[SUB2]]
+; CHECK-VF4UF1-LABEL: define i32 @extract_second_last_iteration(
+; CHECK-VF4UF1-SAME: ptr [[CVAL:%.*]], i32 [[X:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 96, [[TMP1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i32 96, [[TMP3]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i32 96, [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul <vscale x 4 x i32> [[TMP6]], splat (i32 1)
+; CHECK-VF4UF1-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP7]]
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i32 1, [[TMP5]]
+; CHECK-VF4UF1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP8]], i64 0
+; CHECK-VF4UF1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP19]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
+; CHECK-VF4UF1-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 2
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP13]]
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP16]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i32 96, [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4UF1:       [[FOR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define i32 @extract_second_last_iteration(
+; CHECK-VF4UF2-SAME: ptr [[CVAL:%.*]], i32 [[X:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 96, [[TMP1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i32 96, [[TMP3]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i32 96, [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i32> [[TMP7]], splat (i32 1)
+; CHECK-VF4UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP5]], i64 0
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP19]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-VF4UF2-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
+; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 2
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP13]]
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP16]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i32 96, [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4UF2:       [[FOR_BODY]]:
+;
 entry:
   br label %for.body
 
@@ -238,13 +784,140 @@ for.end:
 
 ; Check that the sext sank after the load in the vector loop.
 define void @sink_after(ptr %a, ptr %b, i64 %n) {
-; CHECK-VF4UF1-LABEL: @sink_after
-; CHECK-VF4UF1: vector.body
-; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[LOAD]], i32 -1)
-; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
-; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[LOAD]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-LABEL: define void @sink_after(
+; CHECK-VF4UF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF1-NEXT:    [[DOTPRE:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF1:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = shl i64 [[N]], 2
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A]], i64 2
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = shl i64 [[N]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 2
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-VF4UF1-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-VF4UF1-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
+; CHECK-VF4UF1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP11]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP12]]
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
+; CHECK-VF4UF1-NEXT:    [[WIDE_LOAD]] = load <vscale x 4 x i16>, ptr [[TMP14]], align 2, !alias.scope [[META17:![0-9]+]]
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = sext <vscale x 4 x i16> [[TMP15]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP17]], [[TMP16]]
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-VF4UF1-NEXT:    store <vscale x 4 x i32> [[TMP18]], ptr [[TMP20]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP23]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP24]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[ENTRY]] ], [ [[DOTPRE]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4UF1:       [[FOR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define void @sink_after(
+; CHECK-VF4UF2-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF2-NEXT:    [[DOTPRE:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF2:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = shl i64 [[N]], 2
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A]], i64 2
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = shl i64 [[N]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 2
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-VF4UF2-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-VF4UF2-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
+; CHECK-VF4UF2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP11]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP12]]
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i64 [[TMP16]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP14]], align 2, !alias.scope [[META17:![0-9]+]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD3]] = load <vscale x 4 x i16>, ptr [[TMP17]], align 2, !alias.scope [[META17]]
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16> [[WIDE_LOAD3]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = sext <vscale x 4 x i16> [[TMP18]] to <vscale x 4 x i32>
+; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = sext <vscale x 4 x i16> [[TMP19]] to <vscale x 4 x i32>
+; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = mul nsw <vscale x 4 x i32> [[TMP22]], [[TMP20]]
+; CHECK-VF4UF2-NEXT:    [[TMP25:%.*]] = mul nsw <vscale x 4 x i32> [[TMP23]], [[TMP21]]
+; CHECK-VF4UF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP29]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP24]], ptr [[TMP27]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP25]], ptr [[TMP30]], align 4, !alias.scope [[META20]], !noalias [[META17]]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP32]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP33]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD3]], i32 [[TMP34]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[ENTRY]] ], [ [[DOTPRE]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4UF2:       [[FOR_BODY]]:
+;
 entry:
   %.pre = load i16, ptr %a
   br label %for.body

From 790df93298b3ad6c57dafb55fc6d18bddff16c4a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 15 Jun 2025 21:59:30 +0100
Subject: [PATCH 0421/1322] [VPlan] Mark VPFirstOrderRecurrencePHI as not
 reading/writing memory.

First-order recurrence phis don't have side-effects and don't read or
write memory. Mark them as such.
---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   9 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   7 +
 .../LoopVectorize/AArch64/induction-costs.ll  |   9 +-
 .../AArch64/loop-vectorization-factors.ll     |   6 +-
 .../AArch64/partial-reduce-dot-product.ll     |  28 +---
 .../RISCV/blocks-with-dead-instructions.ll    |  40 -----
 .../X86/fixed-order-recurrence.ll             |  13 +-
 .../LoopVectorize/X86/induction-costs.ll      |   3 +-
 .../X86/pr131359-dead-for-splice.ll           |   6 +-
 .../Transforms/LoopVectorize/X86/pr72969.ll   |   3 +-
 .../first-order-recurrence-interleave-only.ll |   7 +-
 .../LoopVectorize/first-order-recurrence.ll   | 145 +++++++-----------
 .../interleave-and-scalarize-only.ll          |   3 +-
 llvm/test/Transforms/LoopVectorize/optsize.ll |   3 +-
 .../scalable-first-order-recurrence.ll        |  30 +---
 15 files changed, 99 insertions(+), 213 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 10906d9a30df..cca3d32c0783 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1071,7 +1071,14 @@ void VPlan::execute(VPTransformState *State) {
 InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
   // For now only return the cost of the vector loop region, ignoring any other
   // blocks, like the preheader or middle blocks.
-  return getVectorLoopRegion()->cost(VF, Ctx);
+  InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
+
+  // If any instructions in the middle block are invalid return invalid.
+  // TODO: Remove once no VPlans with VF == vscale x 1 and first-order recurrences are created.
+  if (!getMiddleBlock()->cost(VF, Ctx).isValid())
+    return InstructionCost::getInvalid();
+
+  return Cost;
 }
 
 VPRegionBlock *VPlan::getVectorLoopRegion() {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3bdfa6724f69..048286d7a97b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -66,6 +66,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
   case VPBranchOnMaskSC:
+  case VPFirstOrderRecurrencePHISC:
   case VPScalarIVStepsSC:
   case VPPredInstPHISC:
     return false;
@@ -113,6 +114,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
   case VPBranchOnMaskSC:
+  case VPFirstOrderRecurrencePHISC:
   case VPPredInstPHISC:
   case VPScalarIVStepsSC:
   case VPWidenStoreEVLSC:
@@ -146,6 +148,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
 bool VPRecipeBase::mayHaveSideEffects() const {
   switch (getVPDefID()) {
   case VPDerivedIVSC:
+  case VPFirstOrderRecurrencePHISC:
   case VPPredInstPHISC:
   case VPVectorEndPointerSC:
     return false;
@@ -837,6 +840,10 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
                                   I32Ty, {Arg0Ty, I32Ty, I1Ty});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
   }
+  case VPInstruction::ExtractPenultimateElement:
+    if (VF == ElementCount::getScalable(1))
+      return InstructionCost::getInvalid();
+  LLVM_FALLTHROUGH;
   default:
     // TODO: Compute cost other VPInstructions once the legacy cost model has
     // been retired.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 6ed9c856f50c..4af4929fad52 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -105,12 +105,11 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i64> [ <i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 2
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP9]] = zext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -162,12 +161,11 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -306,7 +304,6 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
@@ -316,7 +313,7 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[WIDE_LOAD]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i32> [[TMP7]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
-; CHECK-NEXT:    [[TMP10]] = shl <4 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shl <4 x i64> [[TMP8]], [[TMP9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index e97bb857fdba..31be8862a887 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -750,11 +750,10 @@ define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocaptur
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <16 x i32> [[TMP3]], splat (i32 2)
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc <16 x i32> [[TMP4]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
@@ -832,11 +831,10 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw <16 x i32> [[TMP5]], splat (i32 2)
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 52dcba69d036..b091452e28b4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -890,14 +890,9 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
@@ -917,7 +912,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -1020,14 +1015,9 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
@@ -1037,7 +1027,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1067,14 +1057,9 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
@@ -1088,7 +1073,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP21]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1118,14 +1103,9 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
@@ -1135,7 +1115,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
 ; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP14]]
-; CHECK-MAXBW-NEXT:    [[TMP21]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
+; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index ea8df1669288..d41caca97e1f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -31,15 +31,10 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 3, [[TMP10]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 8
-; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 1, i32 [[TMP19]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -121,15 +116,10 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 3, [[TMP7]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 0, i32 [[TMP16]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x ptr> [[TMP17]], i32 2, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
@@ -211,15 +201,10 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 3, [[TMP7]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 0, i32 [[TMP16]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x ptr> [[TMP17]], i32 2, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
@@ -311,15 +296,10 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 3, [[TMP10]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 8
-; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 1, i32 [[TMP19]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -413,15 +393,10 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 3, [[TMP7]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 1, i32 [[TMP16]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x ptr> [[TMP17]], i32 2, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
@@ -523,15 +498,10 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 3, [[TMP10]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 8
-; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 1, i32 [[TMP19]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -621,14 +591,9 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 1, i32 [[TMP8]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP11]], align 2
@@ -705,14 +670,9 @@ define void @empty_block_with_phi_2(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 1, i32 [[TMP8]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP11]], align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 83e9d6146755..3361068c9922 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -217,14 +217,12 @@ define i64 @test_pr62954_scalar_epilogue_required(ptr %A, ptr noalias %B, ptr %C
 ; CHECK-NEXT:    [[REC_START:%.*]] = load i64, ptr [[GEP]], align 8
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i64> poison, i64 [[REC_START]], i32 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i64> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT:    [[TMP1]] = sub nsw <2 x i64> zeroinitializer, [[STEP_ADD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <2 x i64> zeroinitializer, [[STEP_ADD]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
 ; CHECK-NEXT:    store i64 [[TMP2]], ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -299,13 +297,12 @@ define void @for_iv_trunc_optimized(ptr %dst) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 1>, [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1, i32 2, i32 3, i32 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP0]], splat (i32 3)
-; CHECK-NEXT:    [[TMP3]] = or <4 x i32> [[TMP1]], splat (i32 3)
+; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], splat (i32 3)
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr [[DST:%.*]], align 4
@@ -364,11 +361,9 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 8, i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP4]]
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x ptr> poison, ptr [[A:%.*]], i32 3
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 2
@@ -377,7 +372,7 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr nusw [3 x float], ptr [[A:%.*]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]]
@@ -392,7 +387,7 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP21]], i32 0
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x ptr> [[TMP25]], ptr [[TMP22]], i32 1
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2
-; CHECK-NEXT:    [[TMP28]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3
 ; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP20]], <4 x ptr> [[TMP28]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 19424e44a902..7aeb32afe43b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -407,11 +407,10 @@ define i16 @iv_and_step_trunc() {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i16> [ <i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i16>
-; CHECK-NEXT:    [[TMP2]] = mul <2 x i16> [[VEC_IND1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i16> [[VEC_IND1]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll b/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll
index bcfa212cf364..c02ec91c4a0c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll
@@ -15,9 +15,8 @@ define void @no_use() {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, %[[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
@@ -60,9 +59,8 @@ define void @dead_use() {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, %[[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
index 0cd746590e0f..368842634c37 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
@@ -56,7 +56,6 @@ define void @test(ptr %p) {
 ; VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC:       vector.body:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VEC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 1>, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
 ; VEC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 1, i16 2, i16 3, i16 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VEC-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 0
 ; VEC-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 1
@@ -75,7 +74,7 @@ define void @test(ptr %p) {
 ; VEC-NEXT:    store i64 0, ptr [[TMP25]], align 8
 ; VEC-NEXT:    store i64 0, ptr [[TMP26]], align 8
 ; VEC-NEXT:    [[TMP27:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; VEC-NEXT:    [[TMP28]] = zext <4 x i16> [[TMP27]] to <4 x i64>
+; VEC-NEXT:    [[TMP28:%.*]] = zext <4 x i16> [[TMP27]] to <4 x i64>
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VEC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; VEC-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
index 6b7736fa9f61..53113b2bdf49 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
@@ -11,13 +11,12 @@ define float @for_load_interleave_only(ptr %src) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi float [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[NEXT_GEP]], align 4
-; CHECK-NEXT:    [[TMP3]] = load float, ptr [[NEXT_GEP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[NEXT_GEP2]], align 4
 ; CHECK-NEXT:    store float 0.000000e+00, ptr [[NEXT_GEP]], align 4
 ; CHECK-NEXT:    store float 0.000000e+00, ptr [[NEXT_GEP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
@@ -27,12 +26,12 @@ define float @for_load_interleave_only(ptr %src) {
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1001, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[FOR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 16
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index d28db1c77efa..13dc53559d28 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -884,23 +884,21 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
+; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[FOR_COND1]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND1]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 -4)
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
@@ -943,10 +941,9 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[FOR_COND1]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
-; UNROLL-NO-VF-NEXT:    [[TMP3]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-VF-NEXT:    store i32 [[TMP2]], ptr [[TMP1]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -1071,32 +1068,26 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[PRE_LOAD:%.*]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 8
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 12
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 14
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP16]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP17]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP24]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP18]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1105,7 +1096,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD:%.*]], [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -1133,7 +1124,6 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ [[PRE_LOAD:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 2
@@ -1141,7 +1131,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]]
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
-; UNROLL-NO-VF-NEXT:    [[TMP10]] = load i32, ptr [[TMP8]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1150,7 +1140,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD:%.*]], [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -1175,32 +1165,26 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; SINK-AFTER-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[PRE_LOAD:%.*]], i32 3
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; SINK-AFTER-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 6
-; SINK-AFTER-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 2
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 4
+; SINK-AFTER-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; SINK-AFTER-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; SINK-AFTER-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; SINK-AFTER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
-; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; SINK-AFTER-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2
+; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; SINK-AFTER-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
+; SINK-AFTER-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
-; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
-; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
-; SINK-AFTER-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
-; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP16]], i32 1
-; SINK-AFTER-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP17]], i32 2
-; SINK-AFTER-NEXT:    [[TMP24]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP18]], i32 3
+; SINK-AFTER-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4
+; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4
+; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1209,7 +1193,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
 ; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD:%.*]], [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -1248,7 +1232,6 @@ define i64 @constant_folded_previous_value() {
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ splat (i64 1), [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1276,7 +1259,6 @@ define i64 @constant_folded_previous_value() {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 1, [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1304,7 +1286,6 @@ define i64 @constant_folded_previous_value() {
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ splat (i64 1), [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; SINK-AFTER-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1357,9 +1338,8 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; UNROLL-NO-IC-NEXT:    [[TMP0]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
@@ -1391,29 +1371,28 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i32 [[VECTOR_RECUR]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT:    [[TMP4]] = add i32 [[TMP0]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], [[X]]
 ; UNROLL-NO-VF-NEXT:    [[TMP3]] = add nuw i32 [[VECTOR_RECUR]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 96
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 96
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT1:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
-; UNROLL-NO-VF-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADDX:%.*]], [[FOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[INC]] = add i32 [[INC_PHI]], 1
-; UNROLL-NO-VF-NEXT:    [[BC:%.*]] = zext i32 [[INC_PHI]] to i64
-; UNROLL-NO-VF-NEXT:    [[ADDX]] = add i32 [[INC_PHI]], [[X]]
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95
+; UNROLL-NO-VF-NEXT:    [[VAL_PHI1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT1]], [[SCALAR_PH]] ], [ [[ADDX1:%.*]], [[FOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[ADDX]] = add i32 [[VAL_PHI]], 1
+; UNROLL-NO-VF-NEXT:    [[BC:%.*]] = zext i32 [[VAL_PHI]] to i64
+; UNROLL-NO-VF-NEXT:    [[ADDX1]] = add i32 [[VAL_PHI]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[VAL_PHI]], 95
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI1]], [[FOR_BODY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[VAL_PHI_LCSSA]]
 ;
 ; SINK-AFTER-LABEL: @extract_second_last_iteration(
@@ -1426,8 +1405,7 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[TMP0]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; SINK-AFTER-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
@@ -2507,13 +2485,12 @@ define void @sink_dead_inst(ptr %a) {
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 -27>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP3]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP1]], splat (i16 5)
 ; UNROLL-NO-IC-NEXT:    [[TMP5]] = add <4 x i16> [[TMP2]], splat (i16 5)
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -2561,13 +2538,12 @@ define void @sink_dead_inst(ptr %a) {
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ -27, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP4]] = zext i16 [[TMP3]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[TMP2]], 5
 ; UNROLL-NO-VF-NEXT:    [[TMP6]] = add i16 [[TMP3]], 5
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sub i16 [[VECTOR_RECUR]], 10
@@ -2610,11 +2586,10 @@ define void @sink_dead_inst(ptr %a) {
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 -27>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
 ; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP2]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
 ; SINK-AFTER-NEXT:    [[TMP3]] = add <4 x i16> [[TMP1]], splat (i16 5)
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP5:%.*]] = sub <4 x i16> [[TMP4]], splat (i16 10)
@@ -3500,12 +3475,11 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP3]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
 ; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 4
@@ -3548,15 +3522,14 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = or i16 [[TMP5]], [[TMP5]]
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]]
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]]
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP10]] = zext i16 [[TMP5]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP3]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP4]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP8]], align 4
@@ -3596,11 +3569,10 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
 ; SINK-AFTER-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; SINK-AFTER-NEXT:    [[TMP3]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
 ; SINK-AFTER-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
 ; SINK-AFTER-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
@@ -3668,10 +3640,9 @@ define void @unused_recurrence(ptr %a) {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP1]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
@@ -3701,12 +3672,11 @@ define void @unused_recurrence(ptr %a) {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2]] = add i16 [[TMP1]], 5
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 5
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
@@ -3735,9 +3705,8 @@ define void @unused_recurrence(ptr %a) {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP1]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; SINK-AFTER-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index f54d3bad9512..db25e7bede5c 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -243,10 +243,9 @@ define void @first_order_recurrence_using_induction(i32 %n, ptr %dst) {
 ; CHECK-LABEL: @first_order_recurrence_using_induction(
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDUCTION1:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDEX]] to i32
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add i32 [[TMP3]], 0
-; CHECK-NEXT:    [[INDUCTION1]] = add i32 [[TMP3]], 1
+; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i32 [[TMP3]], 1
 ; CHECK-NEXT:    store i32 [[INDUCTION]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], %n.vec
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index c7149b084598..ebddca2294d9 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -621,8 +621,7 @@ define i32 @pr45526_pgso() !prof !14 {
 ; NPGSO:       [[VECTOR_BODY]]:
 ; NPGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NPGSO-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP0:%.*]], %[[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[TMP0]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; NPGSO-NEXT:    [[TMP0:%.*]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
 ; NPGSO-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; NPGSO-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; NPGSO-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index e1b264620261..a70d8f72c8a3 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -580,14 +580,9 @@ define i64 @constant_folded_previous_value() {
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP9]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 0, i32 [[TMP8]]
 ; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ splat (i64 1), %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4UF1-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -613,14 +608,9 @@ define i64 @constant_folded_previous_value() {
 ; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
 ; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP9]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 0, i32 [[TMP8]]
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ splat (i64 1), %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -676,16 +666,11 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i32 1, [[TMP5]]
 ; CHECK-VF4UF1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP8]], i64 0
 ; CHECK-VF4UF1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP19]]
 ; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF1-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
 ; CHECK-VF4UF1-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -729,19 +714,14 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP19]]
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; CHECK-VF4UF2-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF2-NEXT:    [[STEP_ADD:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT:%.*]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i32> [[VEC_IND_NEXT]], [[BROADCAST_SPLAT]]
 ; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
-; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT1]] = add <vscale x 4 x i32> [[VEC_IND_NEXT]], [[BROADCAST_SPLAT2]]
 ; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:

From f4a63523b88631e224496435bea0940ac05897bf Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 14:51:20 -0700
Subject: [PATCH 0422/1322] PowerPC: Migrate to newer relocation specifier
 representation

* Use MCAsmInfo::printSpecifierExpr instead of MCExpr::print.
* Replace PPCMCExpr with MCSpecifierExpr.
---
 .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 10 ++--
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp     | 24 ++++++++
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.h       |  8 +++
 .../Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp | 60 +++++++------------
 .../Target/PowerPC/MCTargetDesc/PPCMCExpr.h   | 31 +++-------
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     | 16 ++---
 llvm/lib/Target/PowerPC/PPCMCInstLower.cpp    |  4 +-
 7 files changed, 78 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 7e79d85d6017..bb4c2fd3e5cf 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -749,9 +749,9 @@ public:
           getSpecifier(SRE) == PPC::S_TLS_PCREL)
         return CreateTLSReg(SRE, S, E, IsPPC64);
 
-    if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
+    if (const auto *SE = dyn_cast<MCSpecifierExpr>(Val)) {
       int64_t Res;
-      if (TE->evaluateAsConstant(Res))
+      if (PPC::evaluateAsConstant(*SE, Res))
         return CreateContextImm(Res, S, E, IsPPC64);
     }
 
@@ -1375,7 +1375,7 @@ const MCExpr *PPCAsmParser::extractSpecifier(const MCExpr *E,
     break;
   case MCExpr::Specifier: {
     // Detect error but do not return a modified expression.
-    auto *TE = cast<PPCMCExpr>(E);
+    auto *TE = cast<MCSpecifierExpr>(E);
     Spec = TE->getSpecifier();
     (void)extractSpecifier(TE->getSubExpr(), Spec);
     Spec = PPC::S_None;
@@ -1439,7 +1439,7 @@ bool PPCAsmParser::parseExpression(const MCExpr *&EVal) {
   uint16_t Spec = PPC::S_None;
   const MCExpr *E = extractSpecifier(EVal, Spec);
   if (Spec != PPC::S_None)
-    EVal = PPCMCExpr::create(Spec, E, getParser().getContext());
+    EVal = MCSpecifierExpr::create(E, Spec, getParser().getContext());
 
   return false;
 }
@@ -1841,5 +1841,5 @@ const MCExpr *PPCAsmParser::applySpecifier(const MCExpr *E, uint32_t Spec,
     }
   }
 
-  return PPCMCExpr::create(PPCMCExpr::Specifier(Spec), E, Ctx);
+  return MCSpecifierExpr::create(E, Spec, Ctx);
 }
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index bb1f21d8f032..971b592643dc 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -13,6 +13,7 @@
 #include "PPCMCAsmInfo.h"
 #include "PPCMCExpr.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -136,6 +137,18 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
   initializeVariantKinds(variantKindDescs);
 }
 
+void PPCELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                         const MCSpecifierExpr &Expr) const {
+  printExpr(OS, *Expr.getSubExpr());
+  OS << '@' << getSpecifierName(Expr.getSpecifier());
+}
+
+bool PPCELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                                MCValue &Res,
+                                                const MCAssembler *Asm) const {
+  return PPC::evaluateAsRelocatableImpl(Expr, Res, Asm);
+}
+
 void PPCXCOFFMCAsmInfo::anchor() {}
 
 PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
@@ -159,3 +172,14 @@ PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
 
   initializeVariantKinds(variantKindDescs);
 }
+
+void PPCXCOFFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                           const MCSpecifierExpr &Expr) const {
+  printExpr(OS, *Expr.getSubExpr());
+  OS << '@' << getSpecifierName(Expr.getSpecifier());
+}
+
+bool PPCXCOFFMCAsmInfo::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return PPC::evaluateAsRelocatableImpl(Expr, Res, Asm);
+}
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 9fbb73c2e318..172fe81c2bce 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -24,6 +24,10 @@ class PPCELFMCAsmInfo : public MCAsmInfoELF {
 
 public:
   explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
@@ -31,6 +35,10 @@ class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
 
 public:
   explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 namespace PPC {
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 49ae6bb5fa45..8d9c0892ae16 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -16,38 +16,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "ppcmcexpr"
 
-const PPCMCExpr *PPCMCExpr::create(Specifier S, const MCExpr *Expr,
-                                   MCContext &Ctx) {
-  return new (Ctx) PPCMCExpr(S, Expr);
-}
-
-const PPCMCExpr *PPCMCExpr::create(const MCExpr *Expr, Specifier S,
-                                   MCContext &Ctx) {
-  return new (Ctx) PPCMCExpr(S, Expr);
-}
-
-void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  getSubExpr()->print(OS, MAI);
-  OS << '@' << MAI->getSpecifierName(specifier);
-}
-
-bool
-PPCMCExpr::evaluateAsConstant(int64_t &Res) const {
-  MCValue Value;
-
-  if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr))
-    return false;
-
-  if (!Value.isAbsolute())
-    return false;
-  auto Tmp = evaluateAsInt64(Value.getConstant());
-  if (!Tmp)
-    return false;
-  Res = *Tmp;
-  return true;
-}
-
-std::optional<int64_t> PPCMCExpr::evaluateAsInt64(int64_t Value) const {
+static std::optional<int64_t> evaluateAsInt64(uint16_t specifier,
+                                              int64_t Value) {
   switch (specifier) {
   case PPC::S_LO:
     return Value & 0xffff;
@@ -72,21 +42,35 @@ std::optional<int64_t> PPCMCExpr::evaluateAsInt64(int64_t Value) const {
   }
 }
 
-bool PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                          const MCAssembler *Asm) const {
-  if (!Asm)
+bool PPC::evaluateAsConstant(const MCSpecifierExpr &Expr, int64_t &Res) {
+  MCValue Value;
+
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Value, nullptr))
     return false;
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
+
+  if (!Value.isAbsolute())
+    return false;
+  auto Tmp = evaluateAsInt64(Expr.getSpecifier(), Value.getConstant());
+  if (!Tmp)
+    return false;
+  Res = *Tmp;
+  return true;
+}
+
+bool PPC::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                    const MCAssembler *Asm) {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
     return false;
 
   // The signedness of the result is dependent on the instruction operand. E.g.
   // in addis 3,3,65535@l, 65535@l is signed. In the absence of information at
   // parse time (!Asm), disable the folding.
-  std::optional<int64_t> MaybeInt = evaluateAsInt64(Res.getConstant());
+  std::optional<int64_t> MaybeInt =
+      evaluateAsInt64(Expr.getSpecifier(), Res.getConstant());
   if (Res.isAbsolute() && MaybeInt) {
     Res = MCValue::get(*MaybeInt);
   } else {
-    Res.setSpecifier(specifier);
+    Res.setSpecifier(Expr.getSpecifier());
   }
 
   return true;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index 814217ea060e..d97a1204efbc 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -16,33 +16,20 @@
 
 namespace llvm {
 
-class PPCMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint16_t;
-
-private:
-  std::optional<int64_t> evaluateAsInt64(int64_t Value) const;
-
-  explicit PPCMCExpr(Specifier S, const MCExpr *Expr)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const PPCMCExpr *create(Specifier S, const MCExpr *Expr,
-                                 MCContext &Ctx);
-  static const PPCMCExpr *create(const MCExpr *Expr, Specifier S,
-                                 MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-
-  bool evaluateAsConstant(int64_t &Res) const;
-};
+namespace PPCMCExpr {
+using Specifier = uint16_t;
+}
 
 static inline PPCMCExpr::Specifier getSpecifier(const MCSymbolRefExpr *SRE) {
   return PPCMCExpr::Specifier(SRE->getKind());
 }
 
+namespace PPC {
+bool evaluateAsConstant(const MCSpecifierExpr &Expr, int64_t &Res);
+bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                               const MCAssembler *Asm);
+} // namespace PPC
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 8a1357c5fd55..d5d51e3ca638 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1001,13 +1001,13 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
           MCSymbolRefExpr::create(BaseSymbol, OutContext), PB, OutContext);
 
       const MCExpr *DeltaHi =
-          PPCMCExpr::create(DeltaExpr, PPC::S_HA, OutContext);
+          MCSpecifierExpr::create(DeltaExpr, PPC::S_HA, OutContext);
       EmitToStreamer(
           *OutStreamer,
           MCInstBuilder(PPC::ADDIS).addReg(PICR).addReg(PICR).addExpr(DeltaHi));
 
       const MCExpr *DeltaLo =
-          PPCMCExpr::create(DeltaExpr, PPC::S_LO, OutContext);
+          MCSpecifierExpr::create(DeltaExpr, PPC::S_LO, OutContext);
       EmitToStreamer(
           *OutStreamer,
           MCInstBuilder(PPC::ADDI).addReg(PICR).addReg(PICR).addExpr(DeltaLo));
@@ -1401,10 +1401,10 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case PPC::PPC32GOT: {
     MCSymbol *GOTSymbol =
         OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
-    const MCExpr *SymGotTlsL = PPCMCExpr::create(
-        PPC::S_LO, MCSymbolRefExpr::create(GOTSymbol, OutContext), OutContext);
-    const MCExpr *SymGotTlsHA = PPCMCExpr::create(
-        PPC::S_HA, MCSymbolRefExpr::create(GOTSymbol, OutContext), OutContext);
+    const MCExpr *SymGotTlsL =
+        MCSpecifierExpr::create(GOTSymbol, PPC::S_LO, OutContext);
+    const MCExpr *SymGotTlsHA =
+        MCSpecifierExpr::create(GOTSymbol, PPC::S_HA, OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addExpr(SymGotTlsL));
@@ -2125,14 +2125,14 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() {
                                 GlobalEntryLabelExp, OutContext);
 
       const MCExpr *TOCDeltaHi =
-          PPCMCExpr::create(TOCDeltaExpr, PPC::S_HA, OutContext);
+          MCSpecifierExpr::create(TOCDeltaExpr, PPC::S_HA, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
                                    .addReg(PPC::X2)
                                    .addReg(PPC::X12)
                                    .addExpr(TOCDeltaHi));
 
       const MCExpr *TOCDeltaLo =
-          PPCMCExpr::create(TOCDeltaExpr, PPC::S_LO, OutContext);
+          MCSpecifierExpr::create(TOCDeltaExpr, PPC::S_LO, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
                                    .addReg(PPC::X2)
                                    .addReg(PPC::X2)
diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index f6624ec989ee..cbd53651bbbf 100644
--- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -164,11 +164,11 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
   switch (access) {
     case PPCII::MO_LO:
     case PPCII::MO_PIC_LO_FLAG:
-      Expr = PPCMCExpr::create(Expr, PPC::S_LO, Ctx);
+      Expr = MCSpecifierExpr::create(Expr, PPC::S_LO, Ctx);
       break;
     case PPCII::MO_HA:
     case PPCII::MO_PIC_HA_FLAG:
-      Expr = PPCMCExpr::create(Expr, PPC::S_HA, Ctx);
+      Expr = MCSpecifierExpr::create(Expr, PPC::S_HA, Ctx);
       break;
   }
 

From 34c85ed2bc1adfa375745db6de7f62d350a8f768 Mon Sep 17 00:00:00 2001
From: Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com>
Date: Mon, 16 Jun 2025 00:07:51 +0200
Subject: [PATCH 0423/1322] [clang-reorder-fields] Use expanded location for
 macros (#142147)

Fixes macros being replaced instead of their expansion.

Closes #52632
---
 .../ReorderFieldsAction.cpp                   |  4 ++++
 .../MacroExpansionField.cpp                   | 24 +++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 clang-tools-extra/test/clang-reorder-fields/MacroExpansionField.cpp

diff --git a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
index ea0207619fb2..3b1cd18d8034 100644
--- a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
+++ b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
@@ -86,6 +86,10 @@ getNewFieldsOrder(const RecordDecl *Definition,
 static void
 addReplacement(SourceRange Old, SourceRange New, const ASTContext &Context,
                std::map<std::string, tooling::Replacements> &Replacements) {
+  if (Old.getBegin().isMacroID())
+    Old = Context.getSourceManager().getExpansionRange(Old).getAsRange();
+  if (New.getBegin().isMacroID())
+    New = Context.getSourceManager().getExpansionRange(New).getAsRange();
   StringRef NewText =
       Lexer::getSourceText(CharSourceRange::getTokenRange(New),
                            Context.getSourceManager(), Context.getLangOpts());
diff --git a/clang-tools-extra/test/clang-reorder-fields/MacroExpansionField.cpp b/clang-tools-extra/test/clang-reorder-fields/MacroExpansionField.cpp
new file mode 100644
index 000000000000..a4c3cbc1e12f
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/MacroExpansionField.cpp
@@ -0,0 +1,24 @@
+// RUN: clang-reorder-fields -record-name ::bar::Foo -fields-order z,y,x %s -- | FileCheck %s
+
+namespace bar {
+
+#define INT_DECL(NAME) int NAME // CHECK:      {{^#define INT_DECL\(NAME\) int NAME}}
+#define MACRO_DECL int x;       // CHECK-NEXT: {{^#define MACRO_DECL int x;}}
+
+struct Foo {
+  MACRO_DECL   // CHECK:      {{^ INT_DECL\(z\);}}
+  int y;       // CHECK-NEXT: {{^ int y;}}
+  INT_DECL(z); // CHECK-NEXT: {{^ MACRO_DECL}}
+};
+
+#define FOO 0 // CHECK:      {{^#define FOO 0}}
+#define BAR 1 // CHECK-NEXT: {{^#define BAR 1}}
+#define BAZ 2 // CHECK-NEXT: {{^#define BAZ 2}}
+
+struct Foo foo = {
+  FOO, // CHECK:      {{^ BAZ,}}
+  BAR, // CHECK-NEXT: {{^ BAR,}}
+  BAZ, // CHECK-NEXT: {{^ FOO,}}
+};
+
+} // end namespace bar

From e448c3e5fc2ab4244356e29c9c9135b6ccf5f6ff Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 15:36:45 -0700
Subject: [PATCH 0424/1322] LoongArch: Migrate to MCAsmInfo::printExpr

---
 .../LoongArch/AsmParser/LoongArchAsmParser.cpp |  3 ++-
 .../Target/LoongArch/LoongArchAsmPrinter.cpp   |  8 +++++---
 .../MCTargetDesc/LoongArchInstPrinter.cpp      |  2 +-
 .../MCTargetDesc/LoongArchMCAsmInfo.cpp        | 13 +++++++++++++
 .../MCTargetDesc/LoongArchMCAsmInfo.h          |  7 +++++++
 .../LoongArch/MCTargetDesc/LoongArchMCExpr.cpp | 18 ++++--------------
 .../LoongArch/MCTargetDesc/LoongArchMCExpr.h   |  5 -----
 7 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index f5c540728852..7d5827008957 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -8,6 +8,7 @@
 
 #include "MCTargetDesc/LoongArchBaseInfo.h"
 #include "MCTargetDesc/LoongArchInstPrinter.h"
+#include "MCTargetDesc/LoongArchMCAsmInfo.h"
 #include "MCTargetDesc/LoongArchMCExpr.h"
 #include "MCTargetDesc/LoongArchMCTargetDesc.h"
 #include "MCTargetDesc/LoongArchMatInt.h"
@@ -755,7 +756,7 @@ LoongArchAsmParser::parseOperandWithModifier(OperandVector &Operands) {
   if (getLexer().getKind() != AsmToken::Identifier)
     return Error(getLoc(), "expected valid identifier for operand modifier");
   StringRef Identifier = getParser().getTok().getIdentifier();
-  LoongArchMCExpr::Specifier VK = LoongArchMCExpr::parseSpecifier(Identifier);
+  auto VK = LoongArch::parseSpecifier(Identifier);
   if (VK == LoongArchMCExpr::VK_None)
     return Error(getLoc(), "invalid relocation specifier");
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
index 9181e539f75c..64ac7c03c041 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCSectionELF.h"
@@ -160,9 +161,10 @@ bool LoongArchAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   else if (OffsetMO.isImm())
     OS << ", " << OffsetMO.getImm();
   else if (OffsetMO.isGlobal() || OffsetMO.isBlockAddress() ||
-           OffsetMO.isMCSymbol())
-    OS << ", " << *MCO.getExpr();
-  else
+           OffsetMO.isMCSymbol()) {
+    OS << ", ";
+    MAI->printExpr(OS, *MCO.getExpr());
+  } else
     return true;
 
   return false;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp
index e59cac7726a6..f912af330e34 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp
@@ -84,7 +84,7 @@ void LoongArchInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void LoongArchInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp
index 9b7fccd0078e..dc55ceab2dd3 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp
@@ -11,7 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "LoongArchMCAsmInfo.h"
+#include "LoongArchMCExpr.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/TargetParser/Triple.h"
 
@@ -32,3 +34,14 @@ LoongArchMCAsmInfo::LoongArchMCAsmInfo(const Triple &TT) {
   DwarfRegNumForCFI = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
 }
+
+void LoongArchMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                            const MCSpecifierExpr &Expr) const {
+  auto S = Expr.getSpecifier();
+  bool HasSpecifier = S != 0 && S != ELF::R_LARCH_B26;
+  if (HasSpecifier)
+    OS << '%' << LoongArch::getSpecifierName(S) << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  if (HasSpecifier)
+    OS << ')';
+}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h
index ed1abbf46153..58ffb723d62c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h
@@ -23,8 +23,15 @@ class LoongArchMCAsmInfo : public MCAsmInfoELF {
 
 public:
   explicit LoongArchMCAsmInfo(const Triple &TargetTriple);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
+namespace LoongArch {
+StringRef getSpecifierName(uint16_t S);
+uint16_t parseSpecifier(StringRef name);
+} // namespace LoongArch
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHMCASMINFO_H
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 7eec23647500..c763aaa7276f 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "LoongArchMCExpr.h"
+#include "LoongArchMCAsmInfo.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
@@ -27,18 +28,7 @@ const LoongArchMCExpr *LoongArchMCExpr::create(const MCExpr *Expr, uint16_t S,
   return new (Ctx) LoongArchMCExpr(Expr, Specifier(S), Hint);
 }
 
-void LoongArchMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  Specifier S = getSpecifier();
-  bool HasVariant = S != VK_None && S != ELF::R_LARCH_B26;
-
-  if (HasVariant)
-    OS << '%' << getSpecifierName(specifier) << '(';
-  Expr->print(OS, MAI);
-  if (HasVariant)
-    OS << ')';
-}
-
-StringRef LoongArchMCExpr::getSpecifierName(uint16_t S) {
+StringRef LoongArch::getSpecifierName(uint16_t S) {
   switch (S) {
   default:
     llvm_unreachable("Invalid ELF symbol kind");
@@ -149,7 +139,7 @@ StringRef LoongArchMCExpr::getSpecifierName(uint16_t S) {
   }
 }
 
-LoongArchMCExpr::Specifier LoongArchMCExpr::parseSpecifier(StringRef name) {
+LoongArchMCExpr::Specifier LoongArch::parseSpecifier(StringRef name) {
   return StringSwitch<LoongArchMCExpr::Specifier>(name)
       .Case("plt", ELF::R_LARCH_B26)
       .Case("b16", ELF::R_LARCH_B16)
@@ -205,5 +195,5 @@ LoongArchMCExpr::Specifier LoongArchMCExpr::parseSpecifier(StringRef name) {
       .Case("ld_pcrel_20", ELF::R_LARCH_TLS_LD_PCREL20_S2)
       .Case("gd_pcrel_20", ELF::R_LARCH_TLS_GD_PCREL20_S2)
       .Case("desc_pcrel_20", ELF::R_LARCH_TLS_DESC_PCREL20_S2)
-      .Default(VK_None);
+      .Default(0);
 }
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 06370001fa41..36563d8a6b60 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -36,11 +36,6 @@ public:
                                        MCContext &Ctx, bool Hint = false);
 
   bool getRelaxHint() const { return RelaxHint; }
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-
-  static StringRef getSpecifierName(uint16_t S);
-  static Specifier parseSpecifier(StringRef name);
 };
 
 } // end namespace llvm

From e3025c95090f74b26e36106d2aa394b213f713a1 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 15:51:10 -0700
Subject: [PATCH 0425/1322] RISCV: Rename RISCVMCExpr::VK_ to RISCV::S_

Prepare for removing RISCVMCExpr. Adopt the newer naming convention (S_)
used by AMDGPU/WebAssembly/VE/M68k/PowerPC.
---
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 43 +++++++++----------
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp |  8 ++--
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 20 ++++-----
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.h   | 27 ++++++------
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp     | 10 ++---
 5 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 1f434beca538..040900064b90 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -544,9 +544,9 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isShiftedInt<N - 1, 1>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           VK == RISCVMCExpr::VK_None;
+           VK == RISCV::S_None;
   }
 
   // True if operand is a symbol with no modifiers, or a constant with no
@@ -559,9 +559,9 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<N>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           VK == RISCVMCExpr::VK_None;
+           VK == RISCV::S_None;
   }
 
   // Predicate methods for AsmOperands defined in RISCVInstrInfo.td
@@ -572,9 +572,9 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           VK == RISCVMCExpr::VK_None;
+           VK == RISCV::S_None;
   }
 
   bool isCallSymbol() const {
@@ -583,7 +583,7 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_CALL_PLT;
   }
@@ -594,7 +594,7 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_CALL_PLT;
   }
@@ -605,7 +605,7 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_TPREL_ADD;
   }
@@ -616,7 +616,7 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_TLSDESC_CALL;
   }
@@ -870,11 +870,10 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<12>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           (VK == RISCVMCExpr::VK_LO || VK == RISCVMCExpr::VK_PCREL_LO ||
-            VK == RISCVMCExpr::VK_TPREL_LO ||
-            VK == ELF::R_RISCV_TLSDESC_LOAD_LO12 ||
+           (VK == RISCV::S_LO || VK == RISCV::S_PCREL_LO ||
+            VK == RISCV::S_TPREL_LO || VK == ELF::R_RISCV_TLSDESC_LOAD_LO12 ||
             VK == ELF::R_RISCV_TLSDESC_ADD_LO12);
   }
 
@@ -903,9 +902,9 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<20>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           VK == RISCVMCExpr::VK_QC_ABS20;
+           VK == RISCV::S_QC_ABS20;
   }
 
   bool isUImm20LUI() const {
@@ -916,7 +915,7 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isUInt<20>(Imm);
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == ELF::R_RISCV_HI20 || VK == ELF::R_RISCV_TPREL_HI20);
   }
@@ -929,7 +928,7 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isUInt<20>(Imm);
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == ELF::R_RISCV_PCREL_HI20 || VK == ELF::R_RISCV_GOT_HI20 ||
             VK == ELF::R_RISCV_TLS_GOT_HI20 || VK == ELF::R_RISCV_TLS_GD_HI20 ||
@@ -2920,7 +2919,7 @@ bool RISCVAsmParser::parseInstruction(ParseInstructionInfo &Info,
 
 bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
                                        RISCVMCExpr::Specifier &Kind) {
-  Kind = RISCVMCExpr::VK_None;
+  Kind = RISCV::S_None;
 
   if (const RISCVMCExpr *RE = dyn_cast<RISCVMCExpr>(Expr)) {
     Kind = RE->getSpecifier();
@@ -2929,14 +2928,14 @@ bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
 
   MCValue Res;
   if (Expr->evaluateAsRelocatable(Res, nullptr))
-    return Res.getSpecifier() == RISCVMCExpr::VK_None;
+    return Res.getSpecifier() == RISCV::S_None;
   return false;
 }
 
 bool RISCVAsmParser::isSymbolDiff(const MCExpr *Expr) {
   MCValue Res;
   if (Expr->evaluateAsRelocatable(Res, nullptr)) {
-    return Res.getSpecifier() == RISCVMCExpr::VK_None && Res.getAddSym() &&
+    return Res.getSpecifier() == RISCV::S_None && Res.getAddSym() &&
            Res.getSubSym();
   }
   return false;
@@ -3451,7 +3450,7 @@ void RISCVAsmParser::emitAuipcInstPair(MCRegister DestReg, MCRegister TmpReg,
                  MCInstBuilder(RISCV::AUIPC).addReg(TmpReg).addExpr(SymbolHi));
 
   const MCExpr *RefToLinkTmpLabel = RISCVMCExpr::create(
-      MCSymbolRefExpr::create(TmpLabel, Ctx), RISCVMCExpr::VK_PCREL_LO, Ctx);
+      MCSymbolRefExpr::create(TmpLabel, Ctx), RISCV::S_PCREL_LO, Ctx);
 
   emitToStreamer(Out, MCInstBuilder(SecondOpcode)
                           .addReg(DestReg)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index dd5540038c43..20014611499c 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -580,7 +580,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       // encounter it here is an error.
       llvm_unreachable(
           "ELF::R_RISCV_TPREL_ADD should not represent an instruction operand");
-    case RISCVMCExpr::VK_LO:
+    case RISCV::S_LO:
       if (MIFrm == RISCVII::InstFormatI)
         FixupKind = RISCV::fixup_riscv_lo12_i;
       else if (MIFrm == RISCVII::InstFormatS)
@@ -593,7 +593,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       FixupKind = RISCV::fixup_riscv_hi20;
       RelaxCandidate = true;
       break;
-    case RISCVMCExpr::VK_PCREL_LO:
+    case RISCV::S_PCREL_LO:
       if (MIFrm == RISCVII::InstFormatI)
         FixupKind = RISCV::fixup_riscv_pcrel_lo12_i;
       else if (MIFrm == RISCVII::InstFormatS)
@@ -606,7 +606,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       FixupKind = RISCV::fixup_riscv_pcrel_hi20;
       RelaxCandidate = true;
       break;
-    case RISCVMCExpr::VK_TPREL_LO:
+    case RISCV::S_TPREL_LO:
       if (MIFrm == RISCVII::InstFormatI)
         FixupKind = ELF::R_RISCV_TPREL_LO12_I;
       else if (MIFrm == RISCVII::InstFormatS)
@@ -622,7 +622,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       FixupKind = RISCV::fixup_riscv_call_plt;
       RelaxCandidate = true;
       break;
-    case RISCVMCExpr::VK_QC_ABS20:
+    case RISCV::S_QC_ABS20:
       FixupKind = RISCV::fixup_riscv_qc_abs20_u;
       RelaxCandidate = true;
       break;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index f5f40ad44ac1..ce0ac067cb27 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -33,7 +33,7 @@ const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, Specifier S,
 
 void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   Specifier S = getSpecifier();
-  bool HasVariant = S != VK_None && S != ELF::R_RISCV_CALL_PLT;
+  bool HasVariant = S != RISCV::S_None && S != ELF::R_RISCV_CALL_PLT;
 
   if (HasVariant)
     OS << '%' << getSpecifierName(S) << '(';
@@ -90,12 +90,12 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const {
 std::optional<RISCVMCExpr::Specifier>
 RISCVMCExpr::getSpecifierForName(StringRef name) {
   return StringSwitch<std::optional<RISCVMCExpr::Specifier>>(name)
-      .Case("lo", VK_LO)
+      .Case("lo", RISCV::S_LO)
       .Case("hi", ELF::R_RISCV_HI20)
-      .Case("pcrel_lo", VK_PCREL_LO)
+      .Case("pcrel_lo", RISCV::S_PCREL_LO)
       .Case("pcrel_hi", ELF::R_RISCV_PCREL_HI20)
       .Case("got_pcrel_hi", ELF::R_RISCV_GOT_HI20)
-      .Case("tprel_lo", VK_TPREL_LO)
+      .Case("tprel_lo", RISCV::S_TPREL_LO)
       .Case("tprel_hi", ELF::R_RISCV_TPREL_HI20)
       .Case("tprel_add", ELF::R_RISCV_TPREL_ADD)
       .Case("tls_ie_pcrel_hi", ELF::R_RISCV_TLS_GOT_HI20)
@@ -104,7 +104,7 @@ RISCVMCExpr::getSpecifierForName(StringRef name) {
       .Case("tlsdesc_load_lo", ELF::R_RISCV_TLSDESC_LOAD_LO12)
       .Case("tlsdesc_add_lo", ELF::R_RISCV_TLSDESC_ADD_LO12)
       .Case("tlsdesc_call", ELF::R_RISCV_TLSDESC_CALL)
-      .Case("qc.abs20", VK_QC_ABS20)
+      .Case("qc.abs20", RISCV::S_QC_ABS20)
       // Used in data directives
       .Case("pltpcrel", ELF::R_RISCV_PLT32)
       .Case("gotpcrel", ELF::R_RISCV_GOT32_PCREL)
@@ -113,19 +113,19 @@ RISCVMCExpr::getSpecifierForName(StringRef name) {
 
 StringRef RISCVMCExpr::getSpecifierName(Specifier S) {
   switch (S) {
-  case VK_None:
+  case RISCV::S_None:
     llvm_unreachable("not used as %specifier()");
-  case VK_LO:
+  case RISCV::S_LO:
     return "lo";
   case ELF::R_RISCV_HI20:
     return "hi";
-  case VK_PCREL_LO:
+  case RISCV::S_PCREL_LO:
     return "pcrel_lo";
   case ELF::R_RISCV_PCREL_HI20:
     return "pcrel_hi";
   case ELF::R_RISCV_GOT_HI20:
     return "got_pcrel_hi";
-  case VK_TPREL_LO:
+  case RISCV::S_TPREL_LO:
     return "tprel_lo";
   case ELF::R_RISCV_TPREL_HI20:
     return "tprel_hi";
@@ -151,7 +151,7 @@ StringRef RISCVMCExpr::getSpecifierName(Specifier S) {
     return "gotpcrel";
   case ELF::R_RISCV_PLT32:
     return "pltpcrel";
-  case VK_QC_ABS20:
+  case RISCV::S_QC_ABS20:
     return "qc.abs20";
   }
   llvm_unreachable("Invalid ELF symbol kind");
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index d3b4a94f2f28..7e3acdfcb87b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -24,18 +24,6 @@ class StringRef;
 class RISCVMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = uint16_t;
-  // Specifiers mapping to relocation types below FirstTargetFixupKind are
-  // encoded literally, with these exceptions:
-  enum {
-    VK_None,
-    // Specifiers mapping to distinct relocation types.
-    VK_LO = FirstTargetFixupKind,
-    VK_PCREL_LO,
-    VK_TPREL_LO,
-    // Vendor-specific relocation types might conflict across vendors.
-    // Refer to them using Specifier constants.
-    VK_QC_ABS20,
-  };
 
 private:
   explicit RISCVMCExpr(const MCExpr *Expr, Specifier S)
@@ -57,6 +45,21 @@ public:
   static std::optional<Specifier> getSpecifierForName(StringRef name);
   static StringRef getSpecifierName(Specifier Kind);
 };
+
+namespace RISCV {
+// Specifiers mapping to relocation types below FirstTargetFixupKind are
+// encoded literally, with these exceptions:
+enum Specifier {
+  S_None,
+  // Specifiers mapping to distinct relocation types.
+  S_LO = FirstTargetFixupKind,
+  S_PCREL_LO,
+  S_TPREL_LO,
+  // Vendor-specific relocation types might conflict across vendors.
+  // Refer to them using Specifier constants.
+  S_QC_ABS20,
+};
+} // namespace RISCV
 } // end namespace llvm.
 
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 4fb71a3ed000..4a75a559a927 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -963,19 +963,19 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
   default:
     llvm_unreachable("Unknown target flag on GV operand");
   case RISCVII::MO_None:
-    Kind = RISCVMCExpr::VK_None;
+    Kind = RISCV::S_None;
     break;
   case RISCVII::MO_CALL:
     Kind = ELF::R_RISCV_CALL_PLT;
     break;
   case RISCVII::MO_LO:
-    Kind = RISCVMCExpr::VK_LO;
+    Kind = RISCV::S_LO;
     break;
   case RISCVII::MO_HI:
     Kind = ELF::R_RISCV_HI20;
     break;
   case RISCVII::MO_PCREL_LO:
-    Kind = RISCVMCExpr::VK_PCREL_LO;
+    Kind = RISCV::S_PCREL_LO;
     break;
   case RISCVII::MO_PCREL_HI:
     Kind = ELF::R_RISCV_PCREL_HI20;
@@ -984,7 +984,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
     Kind = ELF::R_RISCV_GOT_HI20;
     break;
   case RISCVII::MO_TPREL_LO:
-    Kind = RISCVMCExpr::VK_TPREL_LO;
+    Kind = RISCV::S_TPREL_LO;
     break;
   case RISCVII::MO_TPREL_HI:
     Kind = ELF::R_RISCV_TPREL_HI20;
@@ -1018,7 +1018,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
     ME = MCBinaryExpr::createAdd(
         ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
 
-  if (Kind != RISCVMCExpr::VK_None)
+  if (Kind != RISCV::S_None)
     ME = RISCVMCExpr::create(ME, Kind, Ctx);
   return MCOperand::createExpr(ME);
 }

From 4635b6076dc1933b7ebd9fcca9f22ec93e2f9c0c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 16:01:28 -0700
Subject: [PATCH 0426/1322] RISCV: Rename RISCVMCExpr::VK_ to RISCV::S_

---
 bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index 18be125d53ae..ee6f067ff3a3 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -438,12 +438,12 @@ public:
       return RISCVMCExpr::create(Expr, ELF::R_RISCV_PCREL_HI20, Ctx);
     case ELF::R_RISCV_PCREL_LO12_I:
     case ELF::R_RISCV_PCREL_LO12_S:
-      return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_PCREL_LO, Ctx);
+      return RISCVMCExpr::create(Expr, RISCV::S_PCREL_LO, Ctx);
     case ELF::R_RISCV_HI20:
       return RISCVMCExpr::create(Expr, ELF::R_RISCV_HI20, Ctx);
     case ELF::R_RISCV_LO12_I:
     case ELF::R_RISCV_LO12_S:
-      return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_LO, Ctx);
+      return RISCVMCExpr::create(Expr, RISCV::S_LO, Ctx);
     case ELF::R_RISCV_CALL:
       return RISCVMCExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
     case ELF::R_RISCV_CALL_PLT:

From fedf6c68ddfb43730578837aad394afcd97fe65a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 15:43:41 -0700
Subject: [PATCH 0427/1322] RISCV: Move RISCVMCExpr functions to RISCVMCAsmInfo
 or RISCVMCAsmBackend

* Move getPCRelHiFixup closer to the only caller RISCVAsmBackend::evaluateTargetFixup.
* Declare getSpecifierForName in RISCVMCAsmInfo, in align with other
  targets that have migrated to the new relocation specifier representation.
---
 bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp  |  2 +-
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp |  6 +-
 .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp    | 55 +++++++++++++++-
 .../MCTargetDesc/RISCVELFObjectWriter.cpp     |  6 +-
 .../RISCV/MCTargetDesc/RISCVInstPrinter.cpp   |  3 +-
 .../RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp     | 12 +++-
 .../RISCV/MCTargetDesc/RISCVMCAsmInfo.h       | 23 +++++++
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp |  2 +-
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 66 ++-----------------
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.h   | 27 --------
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp     |  2 +-
 11 files changed, 102 insertions(+), 102 deletions(-)

diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index ee6f067ff3a3..cf30ad272d1c 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "bolt/Core/MCPlusBuilder.h"
 #include "llvm/BinaryFormat/ELF.h"
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 040900064b90..612ac428dd1b 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -9,7 +9,7 @@
 #include "MCTargetDesc/RISCVAsmBackend.h"
 #include "MCTargetDesc/RISCVBaseInfo.h"
 #include "MCTargetDesc/RISCVInstPrinter.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "MCTargetDesc/RISCVMatInt.h"
 #include "MCTargetDesc/RISCVTargetStreamer.h"
@@ -2087,7 +2087,7 @@ bool RISCVAsmParser::parseExprWithSpecifier(const MCExpr *&Res, SMLoc &E) {
   if (getLexer().getKind() != AsmToken::Identifier)
     return Error(getLoc(), "expected '%' relocation specifier");
   StringRef Identifier = getParser().getTok().getIdentifier();
-  auto Spec = RISCVMCExpr::getSpecifierForName(Identifier);
+  auto Spec = RISCV::parseSpecifierName(Identifier);
   if (!Spec)
     return Error(getLoc(), "invalid relocation specifier");
 
@@ -2099,7 +2099,7 @@ bool RISCVAsmParser::parseExprWithSpecifier(const MCExpr *&Res, SMLoc &E) {
   if (getParser().parseParenExpression(SubExpr, E))
     return true;
 
-  Res = RISCVMCExpr::create(SubExpr, *Spec, getContext());
+  Res = RISCVMCExpr::create(SubExpr, Spec, getContext());
   return false;
 }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 338e5a477283..2f37c351baf9 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -8,7 +8,6 @@
 
 #include "RISCVAsmBackend.h"
 #include "RISCVFixupKinds.h"
-#include "RISCVMCExpr.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
@@ -591,6 +590,57 @@ bool RISCVAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA,
   return !Res.getSubSym();
 }
 
+// Get the corresponding PC-relative HI fixup that a S_PCREL_LO points to, and
+// optionally the fragment containing it.
+//
+// \returns nullptr if this isn't a S_PCREL_LO pointing to a known PC-relative
+// HI fixup.
+static const MCFixup *getPCRelHiFixup(const MCSpecifierExpr &Expr,
+                                      const MCFragment **DFOut) {
+  MCValue AUIPCLoc;
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(AUIPCLoc, nullptr))
+    return nullptr;
+
+  const MCSymbol *AUIPCSymbol = AUIPCLoc.getAddSym();
+  if (!AUIPCSymbol)
+    return nullptr;
+  const auto *DF = dyn_cast_or_null<MCDataFragment>(AUIPCSymbol->getFragment());
+
+  if (!DF)
+    return nullptr;
+
+  uint64_t Offset = AUIPCSymbol->getOffset();
+  if (DF->getContents().size() == Offset) {
+    DF = dyn_cast_or_null<MCDataFragment>(DF->getNext());
+    if (!DF)
+      return nullptr;
+    Offset = 0;
+  }
+
+  for (const MCFixup &F : DF->getFixups()) {
+    if (F.getOffset() != Offset)
+      continue;
+    auto Kind = F.getTargetKind();
+    if (!mc::isRelocation(F.getKind())) {
+      if (Kind == RISCV::fixup_riscv_pcrel_hi20) {
+        *DFOut = DF;
+        return &F;
+      }
+      break;
+    }
+    switch (Kind) {
+    case ELF::R_RISCV_GOT_HI20:
+    case ELF::R_RISCV_TLS_GOT_HI20:
+    case ELF::R_RISCV_TLS_GD_HI20:
+    case ELF::R_RISCV_TLSDESC_HI20:
+      *DFOut = DF;
+      return &F;
+    }
+  }
+
+  return nullptr;
+}
+
 bool RISCVAsmBackend::evaluateTargetFixup(const MCFixup &Fixup,
                                           const MCValue &Target,
                                           uint64_t &Value) {
@@ -602,7 +652,8 @@ bool RISCVAsmBackend::evaluateTargetFixup(const MCFixup &Fixup,
     llvm_unreachable("Unexpected fixup kind!");
   case RISCV::fixup_riscv_pcrel_lo12_i:
   case RISCV::fixup_riscv_pcrel_lo12_s: {
-    AUIPCFixup = cast<RISCVMCExpr>(Fixup.getValue())->getPCRelHiFixup(&AUIPCDF);
+    AUIPCFixup =
+        getPCRelHiFixup(cast<MCSpecifierExpr>(*Fixup.getValue()), &AUIPCDF);
     if (!AUIPCFixup) {
       getContext().reportError(Fixup.getLoc(),
                                "could not find corresponding %pcrel_hi");
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index a0bf378f3c76..1d81096d6b60 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/RISCVFixupKinds.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -49,7 +49,7 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                             const MCValue &Target,
                                             bool IsPCRel) const {
   unsigned Kind = Fixup.getTargetKind();
-  auto Spec = RISCVMCExpr::Specifier(Target.getSpecifier());
+  auto Spec = Target.getSpecifier();
   switch (Spec) {
   case ELF::R_RISCV_TPREL_HI20:
   case ELF::R_RISCV_TLS_GOT_HI20:
@@ -62,7 +62,7 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
   case ELF::R_RISCV_GOT32_PCREL:
     if (Kind == FK_Data_4)
       break;
-    reportError(Fixup.getLoc(), "%" + RISCVMCExpr::getSpecifierName(Spec) +
+    reportError(Fixup.getLoc(), "%" + RISCV::getSpecifierName(Spec) +
                                     " can only be used in a .word directive");
     return ELF::R_RISCV_NONE;
   default:
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 1f4a77414db6..8c9ab8effa71 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -12,7 +12,6 @@
 
 #include "RISCVInstPrinter.h"
 #include "RISCVBaseInfo.h"
-#include "RISCVMCExpr.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
@@ -102,7 +101,7 @@ void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void RISCVInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
index e75bc521d47c..88b1d21f86c5 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "RISCVMCAsmInfo.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCExpr.h"
@@ -47,3 +46,14 @@ const MCExpr *RISCVMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
   assert(Encoding & dwarf::DW_EH_PE_sdata4 && "Unexpected encoding");
   return RISCVMCExpr::create(ME, ELF::R_RISCV_32_PCREL, Ctx);
 }
+
+void RISCVMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                        const MCSpecifierExpr &Expr) const {
+  auto S = Expr.getSpecifier();
+  bool HasSpecifier = S != 0 && S != ELF::R_RISCV_CALL_PLT;
+  if (HasSpecifier)
+    OS << '%' << RISCV::getSpecifierName(S) << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  if (HasSpecifier)
+    OS << ')';
+}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
index bceeb1256471..05f04a618560 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
@@ -13,7 +13,9 @@
 #ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
 #define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
 
+#include "RISCVMCExpr.h"
 #include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCFixup.h"
 
 namespace llvm {
 class Triple;
@@ -26,8 +28,29 @@ public:
 
   const MCExpr *getExprForFDESymbol(const MCSymbol *Sym, unsigned Encoding,
                                     MCStreamer &Streamer) const override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
+namespace RISCV {
+using Specifier = uint16_t;
+// Specifiers mapping to relocation types below FirstTargetFixupKind are
+// encoded literally, with these exceptions:
+enum {
+  S_None,
+  // Specifiers mapping to distinct relocation types.
+  S_LO = FirstTargetFixupKind,
+  S_PCREL_LO,
+  S_TPREL_LO,
+  // Vendor-specific relocation types might conflict across vendors.
+  // Refer to them using Specifier constants.
+  S_QC_ABS20,
+};
+
+Specifier parseSpecifierName(StringRef name);
+StringRef getSpecifierName(Specifier Kind);
+} // namespace RISCV
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 20014611499c..03c6701a1795 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -12,7 +12,7 @@
 
 #include "MCTargetDesc/RISCVBaseInfo.h"
 #include "MCTargetDesc/RISCVFixupKinds.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/MC/MCAsmInfo.h"
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index ce0ac067cb27..1f6f940cac6f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -13,6 +13,7 @@
 
 #include "RISCVMCExpr.h"
 #include "MCTargetDesc/RISCVAsmBackend.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "RISCVFixupKinds.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
@@ -31,65 +32,8 @@ const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, Specifier S,
   return new (Ctx) RISCVMCExpr(Expr, S);
 }
 
-void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  Specifier S = getSpecifier();
-  bool HasVariant = S != RISCV::S_None && S != ELF::R_RISCV_CALL_PLT;
-
-  if (HasVariant)
-    OS << '%' << getSpecifierName(S) << '(';
-  Expr->print(OS, MAI);
-  if (HasVariant)
-    OS << ')';
-}
-
-const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const {
-  MCValue AUIPCLoc;
-  if (!getSubExpr()->evaluateAsRelocatable(AUIPCLoc, nullptr))
-    return nullptr;
-
-  const MCSymbol *AUIPCSymbol = AUIPCLoc.getAddSym();
-  if (!AUIPCSymbol)
-    return nullptr;
-  const auto *DF = dyn_cast_or_null<MCDataFragment>(AUIPCSymbol->getFragment());
-
-  if (!DF)
-    return nullptr;
-
-  uint64_t Offset = AUIPCSymbol->getOffset();
-  if (DF->getContents().size() == Offset) {
-    DF = dyn_cast_or_null<MCDataFragment>(DF->getNext());
-    if (!DF)
-      return nullptr;
-    Offset = 0;
-  }
-
-  for (const MCFixup &F : DF->getFixups()) {
-    if (F.getOffset() != Offset)
-      continue;
-    auto Kind = F.getTargetKind();
-    if (!mc::isRelocation(F.getKind())) {
-      if (Kind == RISCV::fixup_riscv_pcrel_hi20) {
-        *DFOut = DF;
-        return &F;
-      }
-      break;
-    }
-    switch (Kind) {
-    case ELF::R_RISCV_GOT_HI20:
-    case ELF::R_RISCV_TLS_GOT_HI20:
-    case ELF::R_RISCV_TLS_GD_HI20:
-    case ELF::R_RISCV_TLSDESC_HI20:
-      *DFOut = DF;
-      return &F;
-    }
-  }
-
-  return nullptr;
-}
-
-std::optional<RISCVMCExpr::Specifier>
-RISCVMCExpr::getSpecifierForName(StringRef name) {
-  return StringSwitch<std::optional<RISCVMCExpr::Specifier>>(name)
+RISCV::Specifier RISCV::parseSpecifierName(StringRef name) {
+  return StringSwitch<RISCV::Specifier>(name)
       .Case("lo", RISCV::S_LO)
       .Case("hi", ELF::R_RISCV_HI20)
       .Case("pcrel_lo", RISCV::S_PCREL_LO)
@@ -108,10 +52,10 @@ RISCVMCExpr::getSpecifierForName(StringRef name) {
       // Used in data directives
       .Case("pltpcrel", ELF::R_RISCV_PLT32)
       .Case("gotpcrel", ELF::R_RISCV_GOT32_PCREL)
-      .Default(std::nullopt);
+      .Default(0);
 }
 
-StringRef RISCVMCExpr::getSpecifierName(Specifier S) {
+StringRef RISCV::getSpecifierName(Specifier S) {
   switch (S) {
   case RISCV::S_None:
     llvm_unreachable("not used as %specifier()");
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index 7e3acdfcb87b..3e842abc8da7 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -32,34 +32,7 @@ private:
 public:
   static const RISCVMCExpr *create(const MCExpr *Expr, Specifier S,
                                    MCContext &Ctx);
-
-  /// Get the corresponding PC-relative HI fixup that a VK_PCREL_LO
-  /// points to, and optionally the fragment containing it.
-  ///
-  /// \returns nullptr if this isn't a VK_PCREL_LO pointing to a
-  /// known PC-relative HI fixup.
-  const MCFixup *getPCRelHiFixup(const MCFragment **DFOut) const;
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-
-  static std::optional<Specifier> getSpecifierForName(StringRef name);
-  static StringRef getSpecifierName(Specifier Kind);
 };
-
-namespace RISCV {
-// Specifiers mapping to relocation types below FirstTargetFixupKind are
-// encoded literally, with these exceptions:
-enum Specifier {
-  S_None,
-  // Specifiers mapping to distinct relocation types.
-  S_LO = FirstTargetFixupKind,
-  S_PCREL_LO,
-  S_TPREL_LO,
-  // Vendor-specific relocation types might conflict across vendors.
-  // Refer to them using Specifier constants.
-  S_QC_ABS20,
-};
-} // namespace RISCV
 } // end namespace llvm.
 
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 4a75a559a927..33dbed5f7861 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -13,7 +13,7 @@
 
 #include "MCTargetDesc/RISCVBaseInfo.h"
 #include "MCTargetDesc/RISCVInstPrinter.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMatInt.h"
 #include "MCTargetDesc/RISCVTargetStreamer.h"
 #include "RISCV.h"

From 51b63bbee56c2253643f41c53bc3592af261b82d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 16:27:20 -0700
Subject: [PATCH 0428/1322] RISCV: Replace MCExpr::print with
 MCAsmInfo::printExpr

Follow-up to 18b67a7a102c0052e5ae0e76ef1297902ffeb22d
---
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 33dbed5f7861..2c636c4efadc 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -432,7 +432,7 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   if (Offset.isImm())
     OS << MCO.getImm();
   else if (Offset.isGlobal() || Offset.isBlockAddress() || Offset.isMCSymbol())
-    OS << *MCO.getExpr();
+    MAI->printExpr(OS, *MCO.getExpr());
 
   if (Offset.isMCSymbol())
     MMI->getContext().registerInlineAsmLabel(Offset.getMCSymbol());

From f11dd116e0aa8cf35bdb82dba0a3a926538c05c2 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 16:51:08 -0700
Subject: [PATCH 0429/1322] RISCV: Replace RISCVMCExpr with MCSpecifierExpr

---
 bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp  | 24 +++++-----
 .../CodeGen/TargetLoweringObjectFileImpl.h    |  4 --
 .../CodeGen/TargetLoweringObjectFileImpl.cpp  |  2 +-
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 46 +++++++++----------
 .../RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp     |  2 +-
 .../RISCV/MCTargetDesc/RISCVMCAsmInfo.h       |  1 -
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp |  6 +--
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp |  6 ---
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.h   | 38 ---------------
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp     | 10 ++--
 .../Target/RISCV/RISCVTargetObjectFile.cpp    | 10 +---
 llvm/lib/Target/RISCV/RISCVTargetObjectFile.h |  3 --
 12 files changed, 45 insertions(+), 107 deletions(-)
 delete mode 100644 llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h

diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index cf30ad272d1c..c7d664ab09d4 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -33,8 +33,8 @@ public:
 
   bool equals(const MCSpecifierExpr &A, const MCSpecifierExpr &B,
               CompFuncTy Comp) const override {
-    const auto &RISCVExprA = cast<RISCVMCExpr>(A);
-    const auto &RISCVExprB = cast<RISCVMCExpr>(B);
+    const auto &RISCVExprA = cast<MCSpecifierExpr>(A);
+    const auto &RISCVExprB = cast<MCSpecifierExpr>(B);
     if (RISCVExprA.getSpecifier() != RISCVExprB.getSpecifier())
       return false;
 
@@ -245,7 +245,7 @@ public:
                   MCContext *Ctx) {
     Inst.setOpcode(Opcode);
     Inst.clear();
-    Inst.addOperand(MCOperand::createExpr(RISCVMCExpr::create(
+    Inst.addOperand(MCOperand::createExpr(MCSpecifierExpr::create(
         MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
         ELF::R_RISCV_CALL_PLT, *Ctx)));
   }
@@ -342,7 +342,7 @@ public:
   }
 
   const MCSymbol *getTargetSymbol(const MCExpr *Expr) const override {
-    auto *RISCVExpr = dyn_cast<RISCVMCExpr>(Expr);
+    auto *RISCVExpr = dyn_cast<MCSpecifierExpr>(Expr);
     if (RISCVExpr && RISCVExpr->getSubExpr())
       return getTargetSymbol(RISCVExpr->getSubExpr());
 
@@ -435,19 +435,19 @@ public:
     case ELF::R_RISCV_TLS_GD_HI20:
       // The GOT is reused so no need to create GOT relocations
     case ELF::R_RISCV_PCREL_HI20:
-      return RISCVMCExpr::create(Expr, ELF::R_RISCV_PCREL_HI20, Ctx);
+      return MCSpecifierExpr::create(Expr, ELF::R_RISCV_PCREL_HI20, Ctx);
     case ELF::R_RISCV_PCREL_LO12_I:
     case ELF::R_RISCV_PCREL_LO12_S:
-      return RISCVMCExpr::create(Expr, RISCV::S_PCREL_LO, Ctx);
+      return MCSpecifierExpr::create(Expr, RISCV::S_PCREL_LO, Ctx);
     case ELF::R_RISCV_HI20:
-      return RISCVMCExpr::create(Expr, ELF::R_RISCV_HI20, Ctx);
+      return MCSpecifierExpr::create(Expr, ELF::R_RISCV_HI20, Ctx);
     case ELF::R_RISCV_LO12_I:
     case ELF::R_RISCV_LO12_S:
-      return RISCVMCExpr::create(Expr, RISCV::S_LO, Ctx);
+      return MCSpecifierExpr::create(Expr, RISCV::S_LO, Ctx);
     case ELF::R_RISCV_CALL:
-      return RISCVMCExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
+      return MCSpecifierExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
     case ELF::R_RISCV_CALL_PLT:
-      return RISCVMCExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
+      return MCSpecifierExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
     }
   }
 
@@ -466,10 +466,10 @@ public:
       return false;
 
     const auto *ImmExpr = ImmOp.getExpr();
-    if (!isa<RISCVMCExpr>(ImmExpr))
+    if (!isa<MCSpecifierExpr>(ImmExpr))
       return false;
 
-    switch (cast<RISCVMCExpr>(ImmExpr)->getSpecifier()) {
+    switch (cast<MCSpecifierExpr>(ImmExpr)->getSpecifier()) {
     default:
       return false;
     case ELF::R_RISCV_CALL_PLT:
diff --git a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index fa6cb75338d4..00e681e6bf53 100644
--- a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -117,10 +117,6 @@ public:
   MCSection *getStaticDtorSection(unsigned Priority,
                                   const MCSymbol *KeySym) const override;
 
-  virtual const MCExpr *createTargetMCExpr(const MCExpr *Expr,
-                                           uint8_t Specifier) const {
-    return nullptr;
-  }
   const MCExpr *
   lowerSymbolDifference(const MCSymbol *LHS, const MCSymbol *RHS,
                         int64_t Addend,
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index b8c632d11f2e..c804a179d886 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1219,7 +1219,7 @@ const MCExpr *TargetLoweringObjectFileELF::lowerSymbolDifference(
     if (Addend)
       Res = MCBinaryExpr::createAdd(Res, MCConstantExpr::create(Addend, Ctx),
                                     Ctx);
-    return createTargetMCExpr(Res, PLTPCRelativeSpecifier);
+    return MCSpecifierExpr::create(Res, PLTPCRelativeSpecifier, getContext());
   }
 
   if (!PLTRelativeSpecifier)
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 612ac428dd1b..510ca5f8c0d9 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -137,7 +137,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
   // Helper to emit a combination of AUIPC and SecondOpcode. Used to implement
   // helpers such as emitLoadLocalAddress and emitLoadAddress.
   void emitAuipcInstPair(MCRegister DestReg, MCRegister TmpReg,
-                         const MCExpr *Symbol, RISCVMCExpr::Specifier VKHi,
+                         const MCExpr *Symbol, RISCV::Specifier VKHi,
                          unsigned SecondOpcode, SMLoc IDLoc, MCStreamer &Out);
 
   // Helper to emit pseudo instruction "lla" used in PC-rel addressing.
@@ -295,8 +295,7 @@ public:
 #undef GET_OPERAND_DIAGNOSTIC_TYPES
   };
 
-  static bool classifySymbolRef(const MCExpr *Expr,
-                                RISCVMCExpr::Specifier &Kind);
+  static bool classifySymbolRef(const MCExpr *Expr, RISCV::Specifier &Kind);
   static bool isSymbolDiff(const MCExpr *Expr);
 
   RISCVAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
@@ -544,7 +543,7 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isShiftedInt<N - 1, 1>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == RISCV::S_None;
   }
@@ -559,7 +558,7 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<N>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == RISCV::S_None;
   }
@@ -572,7 +571,7 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == RISCV::S_None;
   }
@@ -583,7 +582,7 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_CALL_PLT;
   }
@@ -594,7 +593,7 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_CALL_PLT;
   }
@@ -605,7 +604,7 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_TPREL_ADD;
   }
@@ -616,7 +615,7 @@ public:
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_TLSDESC_CALL;
   }
@@ -870,7 +869,7 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<12>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == RISCV::S_LO || VK == RISCV::S_PCREL_LO ||
             VK == RISCV::S_TPREL_LO || VK == ELF::R_RISCV_TLSDESC_LOAD_LO12 ||
@@ -902,7 +901,7 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<20>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == RISCV::S_QC_ABS20;
   }
@@ -915,7 +914,7 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isUInt<20>(Imm);
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == ELF::R_RISCV_HI20 || VK == ELF::R_RISCV_TPREL_HI20);
   }
@@ -928,7 +927,7 @@ public:
     if (evaluateConstantImm(getImm(), Imm))
       return isUInt<20>(Imm);
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == ELF::R_RISCV_PCREL_HI20 || VK == ELF::R_RISCV_GOT_HI20 ||
             VK == ELF::R_RISCV_TLS_GOT_HI20 || VK == ELF::R_RISCV_TLS_GD_HI20 ||
@@ -2099,7 +2098,7 @@ bool RISCVAsmParser::parseExprWithSpecifier(const MCExpr *&Res, SMLoc &E) {
   if (getParser().parseParenExpression(SubExpr, E))
     return true;
 
-  Res = RISCVMCExpr::create(SubExpr, Spec, getContext());
+  Res = MCSpecifierExpr::create(SubExpr, Spec, getContext());
   return false;
 }
 
@@ -2183,11 +2182,11 @@ ParseStatus RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {
   }
 
   SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size());
-  RISCVMCExpr::Specifier Kind = ELF::R_RISCV_CALL_PLT;
+  RISCV::Specifier Kind = ELF::R_RISCV_CALL_PLT;
 
   MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
   Res = MCSymbolRefExpr::create(Sym, getContext());
-  Res = RISCVMCExpr::create(Res, Kind, getContext());
+  Res = MCSpecifierExpr::create(Res, Kind, getContext());
   Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
   return ParseStatus::Success;
 }
@@ -2203,7 +2202,7 @@ ParseStatus RISCVAsmParser::parsePseudoJumpSymbol(OperandVector &Operands) {
   if (Res->getKind() != MCExpr::ExprKind::SymbolRef)
     return Error(S, "operand must be a valid jump target");
 
-  Res = RISCVMCExpr::create(Res, ELF::R_RISCV_CALL_PLT, getContext());
+  Res = MCSpecifierExpr::create(Res, ELF::R_RISCV_CALL_PLT, getContext());
   Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
   return ParseStatus::Success;
 }
@@ -2918,10 +2917,9 @@ bool RISCVAsmParser::parseInstruction(ParseInstructionInfo &Info,
 }
 
 bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
-                                       RISCVMCExpr::Specifier &Kind) {
+                                       RISCV::Specifier &Kind) {
   Kind = RISCV::S_None;
-
-  if (const RISCVMCExpr *RE = dyn_cast<RISCVMCExpr>(Expr)) {
+  if (const auto *RE = dyn_cast<MCSpecifierExpr>(Expr)) {
     Kind = RE->getSpecifier();
     Expr = RE->getSubExpr();
   }
@@ -3434,7 +3432,7 @@ void RISCVAsmParser::emitLoadImm(MCRegister DestReg, int64_t Value,
 
 void RISCVAsmParser::emitAuipcInstPair(MCRegister DestReg, MCRegister TmpReg,
                                        const MCExpr *Symbol,
-                                       RISCVMCExpr::Specifier VKHi,
+                                       RISCV::Specifier VKHi,
                                        unsigned SecondOpcode, SMLoc IDLoc,
                                        MCStreamer &Out) {
   // A pair of instructions for PC-relative addressing; expands to
@@ -3445,11 +3443,11 @@ void RISCVAsmParser::emitAuipcInstPair(MCRegister DestReg, MCRegister TmpReg,
   MCSymbol *TmpLabel = Ctx.createNamedTempSymbol("pcrel_hi");
   Out.emitLabel(TmpLabel);
 
-  const RISCVMCExpr *SymbolHi = RISCVMCExpr::create(Symbol, VKHi, Ctx);
+  const auto *SymbolHi = MCSpecifierExpr::create(Symbol, VKHi, Ctx);
   emitToStreamer(Out,
                  MCInstBuilder(RISCV::AUIPC).addReg(TmpReg).addExpr(SymbolHi));
 
-  const MCExpr *RefToLinkTmpLabel = RISCVMCExpr::create(
+  const MCExpr *RefToLinkTmpLabel = MCSpecifierExpr::create(
       MCSymbolRefExpr::create(TmpLabel, Ctx), RISCV::S_PCREL_LO, Ctx);
 
   emitToStreamer(Out, MCInstBuilder(SecondOpcode)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
index 88b1d21f86c5..090d331d99ca 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -44,7 +44,7 @@ const MCExpr *RISCVMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
   MCContext &Ctx = Streamer.getContext();
   const MCExpr *ME = MCSymbolRefExpr::create(Sym, Ctx);
   assert(Encoding & dwarf::DW_EH_PE_sdata4 && "Unexpected encoding");
-  return RISCVMCExpr::create(ME, ELF::R_RISCV_32_PCREL, Ctx);
+  return MCSpecifierExpr::create(ME, ELF::R_RISCV_32_PCREL, Ctx);
 }
 
 void RISCVMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
index 05f04a618560..097e94b6117c 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
 #define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
 
-#include "RISCVMCExpr.h"
 #include "llvm/MC/MCAsmInfoELF.h"
 #include "llvm/MC/MCFixup.h"
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 03c6701a1795..1185e3558b00 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -177,7 +177,7 @@ void RISCVMCCodeEmitter::expandTLSDESCCall(const MCInst &MI,
   MCOperand SrcSymbol = MI.getOperand(3);
   assert(SrcSymbol.isExpr() &&
          "Expected expression as first input to TLSDESCCALL");
-  const RISCVMCExpr *Expr = dyn_cast<RISCVMCExpr>(SrcSymbol.getExpr());
+  const auto *Expr = dyn_cast<MCSpecifierExpr>(SrcSymbol.getExpr());
   MCRegister Link = MI.getOperand(0).getReg();
   MCRegister Dest = MI.getOperand(1).getReg();
   int64_t Imm = MI.getOperand(2).getImm();
@@ -205,7 +205,7 @@ void RISCVMCCodeEmitter::expandAddTPRel(const MCInst &MI,
   assert(SrcSymbol.isExpr() &&
          "Expected expression as third input to TP-relative add");
 
-  const RISCVMCExpr *Expr = dyn_cast<RISCVMCExpr>(SrcSymbol.getExpr());
+  const auto *Expr = dyn_cast<MCSpecifierExpr>(SrcSymbol.getExpr());
   assert(Expr && Expr->getSpecifier() == ELF::R_RISCV_TPREL_ADD &&
          "Expected tprel_add relocation on TP-relative symbol");
 
@@ -566,7 +566,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
   unsigned FixupKind = RISCV::fixup_riscv_invalid;
   bool RelaxCandidate = false;
   if (Kind == MCExpr::Specifier) {
-    const RISCVMCExpr *RVExpr = cast<RISCVMCExpr>(Expr);
+    const auto *RVExpr = cast<MCSpecifierExpr>(Expr);
     FixupKind = RVExpr->getSpecifier();
     switch (RVExpr->getSpecifier()) {
     default:
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index 1f6f940cac6f..baa508ad3a68 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "RISCVMCExpr.h"
 #include "MCTargetDesc/RISCVAsmBackend.h"
 #include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "RISCVFixupKinds.h"
@@ -27,11 +26,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "riscvmcexpr"
 
-const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, Specifier S,
-                                       MCContext &Ctx) {
-  return new (Ctx) RISCVMCExpr(Expr, S);
-}
-
 RISCV::Specifier RISCV::parseSpecifierName(StringRef name) {
   return StringSwitch<RISCV::Specifier>(name)
       .Case("lo", RISCV::S_LO)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
deleted file mode 100644
index 3e842abc8da7..000000000000
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- RISCVMCExpr.h - RISC-V specific MC expression classes----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes RISC-V specific MCExprs, used for modifiers like
-// "%hi" or "%lo" etc.,
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCEXPR_H
-#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixup.h"
-
-namespace llvm {
-
-class StringRef;
-
-class RISCVMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint16_t;
-
-private:
-  explicit RISCVMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const RISCVMCExpr *create(const MCExpr *Expr, Specifier S,
-                                   MCContext &Ctx);
-};
-} // end namespace llvm.
-
-#endif
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 2c636c4efadc..83e9b4b4d7c5 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -630,7 +630,7 @@ void RISCVAsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
     Sym = OutContext.getOrCreateSymbol(SymName);
   }
   auto Res = MCSymbolRefExpr::create(Sym, OutContext);
-  auto Expr = RISCVMCExpr::create(Res, ELF::R_RISCV_CALL_PLT, OutContext);
+  auto Expr = MCSpecifierExpr::create(Res, ELF::R_RISCV_CALL_PLT, OutContext);
 
   EmitToStreamer(*OutStreamer, MCInstBuilder(RISCV::PseudoCALL).addExpr(Expr));
 }
@@ -741,8 +741,8 @@ void RISCVAsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
 
   const MCSymbolRefExpr *HwasanTagMismatchV2Ref =
       MCSymbolRefExpr::create(HwasanTagMismatchV2Sym, OutContext);
-  auto Expr = RISCVMCExpr::create(HwasanTagMismatchV2Ref, ELF::R_RISCV_CALL_PLT,
-                                  OutContext);
+  auto Expr = MCSpecifierExpr::create(HwasanTagMismatchV2Ref,
+                                      ELF::R_RISCV_CALL_PLT, OutContext);
 
   for (auto &P : HwasanMemaccessSymbols) {
     unsigned Reg = std::get<0>(P.first);
@@ -957,7 +957,7 @@ void RISCVAsmPrinter::emitNoteGnuProperty(const Module &M) {
 static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
                                     const AsmPrinter &AP) {
   MCContext &Ctx = AP.OutContext;
-  RISCVMCExpr::Specifier Kind;
+  RISCV::Specifier Kind;
 
   switch (MO.getTargetFlags()) {
   default:
@@ -1019,7 +1019,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
         ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
 
   if (Kind != RISCV::S_None)
-    ME = RISCVMCExpr::create(ME, Kind, Ctx);
+    ME = MCSpecifierExpr::create(ME, Kind, Ctx);
   return MCOperand::createExpr(ME);
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
index 3cb5c7e13dd7..bc90cf8f53ab 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "RISCVTargetObjectFile.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
 #include "MCTargetDesc/RISCVMCObjectFileInfo.h"
 #include "RISCVTargetMachine.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -53,7 +52,7 @@ const MCExpr *RISCVELFTargetObjectFile::getIndirectSymViaGOTPCRel(
   const MCExpr *Res = MCSymbolRefExpr::create(Sym, Ctx);
   Res = MCBinaryExpr::createAdd(
       Res, MCConstantExpr::create(Offset + MV.getConstant(), Ctx), Ctx);
-  return RISCVMCExpr::create(Res, ELF::R_RISCV_GOT32_PCREL, Ctx);
+  return MCSpecifierExpr::create(Res, ELF::R_RISCV_GOT32_PCREL, Ctx);
 }
 
 // A address must be loaded from a small section if its size is less than the
@@ -180,10 +179,3 @@ MCSection *RISCVELFTargetObjectFile::getSectionForConstant(
   return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C,
                                                             Alignment);
 }
-
-const MCExpr *
-RISCVELFTargetObjectFile::createTargetMCExpr(const MCExpr *Expr,
-                                             uint8_t Specifier) const {
-  return RISCVMCExpr::create(Expr, RISCVMCExpr::Specifier(Specifier),
-                             getContext());
-}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
index b6da3f4721f4..ff7e3e4c752c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
@@ -48,9 +48,6 @@ public:
 
   bool isInSmallSection(uint64_t Size) const;
 
-  const MCExpr *createTargetMCExpr(const MCExpr *Expr,
-                                   uint8_t Specifier) const override;
-
   const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV,
                                           const MCSymbol *Sym,
                                           const MCValue &MV, int64_t Offset,

From 489dcc9e5233b52152272e6e5377784a56a12f1d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 16:55:45 -0700
Subject: [PATCH 0430/1322] AArch64: Replace MCExpr::print with
 MCAsmInfo::printExpr

Follow-up to 18b67a7a102c0052e5ae0e76ef1297902ffeb22d
---
 .../AArch64/MCTargetDesc/AArch64InstPrinter.cpp  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 9d9e23e99ab3..bbe83821eca8 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -278,7 +278,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
     {
       WithMarkup M = markup(O, Markup::Immediate);
       O << "#";
-      MI->getOperand(1).getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *MI->getOperand(1).getExpr());
     }
     return;
   }
@@ -291,7 +291,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
     {
       WithMarkup M = markup(O, Markup::Immediate);
       O << "#";
-      MI->getOperand(2).getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *MI->getOperand(2).getExpr());
     }
     return;
   }
@@ -1163,7 +1163,7 @@ void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     printImm(MI, OpNo, STI, O);
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -1240,7 +1240,7 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
     }
   } else {
     assert(MO.isExpr() && "Unexpected operand type!");
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
     printShifter(MI, OpNum + 1, STI, O);
   }
 }
@@ -1431,7 +1431,7 @@ void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
     markup(O, Markup::Immediate) << '#' << formatImm(MO.getImm() * Scale);
   } else {
     assert(MO.isExpr() && "Unexpected operand type!");
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
   }
 }
 
@@ -1446,7 +1446,7 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
   } else {
     assert(MO1.isExpr() && "Unexpected operand type!");
     O << ", ";
-    MO1.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO1.getExpr());
   }
   O << ']';
 }
@@ -1805,7 +1805,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address,
     markup(O, Markup::Target) << formatHex((uint64_t)TargetAddress);
   } else {
     // Otherwise, just print the expression.
-    MI->getOperand(OpNum).getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MI->getOperand(OpNum).getExpr());
   }
 }
 
@@ -1832,7 +1832,7 @@ void AArch64InstPrinter::printAdrAdrpLabel(const MCInst *MI, uint64_t Address,
   }
 
   // Otherwise, just print the expression.
-  MI->getOperand(OpNum).getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MI->getOperand(OpNum).getExpr());
 }
 
 void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,

From 9a87c94622863cf712c6ab432931dfdb704fae3e Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:00:53 -0700
Subject: [PATCH 0431/1322] MIPS: Replace MCExpr::print with
 MCAsmInfo::printExpr

---
 .../lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp |  3 ++-
 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp     |  2 +-
 .../Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp  | 12 ++++++------
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
index d743f00da273..f67356c105a4 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
@@ -13,6 +13,7 @@
 #include "MipsInstPrinter.h"
 #include "Mips.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -138,7 +139,7 @@ void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(Op.isExpr() && "unknown operand kind in printOperand");
-  Op.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *Op.getExpr());
 }
 
 void MipsInstPrinter::printJumpOperand(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index d5d64ae8a0cd..704ee0375f7a 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -47,7 +47,7 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   case MEK_DTPREL:
     // MEK_DTPREL is used for marking TLS DIEExpr only
     // and contains a regular sub-expression.
-    getSubExpr()->print(OS, MAI);
+    MAI->printExpr(OS, *getSubExpr());
     return;
   case MEK_CALL_HI16:
     OS << "%call_hi";
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 4919d4f10856..49aea9c69162 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -398,42 +398,42 @@ MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
 void MipsTargetAsmStreamer::emitDTPRel32Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.dtprelword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitDTPRel64Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.dtpreldword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitTPRel32Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.tprelword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitTPRel64Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.tpreldword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitGPRel32Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.gpword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitGPRel64Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.gpdword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 

From 81d8c89da056a7751f6c7714fccb30c071dbc31a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:02:48 -0700
Subject: [PATCH 0432/1322] M68k: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

Follow-up to 18b67a7a102c0052e5ae0e76ef1297902ffeb22d
---
 llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
index 68ac15b57508..778d31280adc 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
@@ -27,6 +27,7 @@
 #include "M68kBaseInfo.h"
 
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -68,7 +69,7 @@ void M68kInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void M68kInstPrinter::printImmediate(const MCInst *MI, unsigned opNum,
@@ -78,7 +79,7 @@ void M68kInstPrinter::printImmediate(const MCInst *MI, unsigned opNum,
     O << '#' << MO.getImm();
   else if (MO.isExpr()) {
     O << '#';
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
   } else
     llvm_unreachable("Unknown immediate kind");
 }
@@ -144,7 +145,7 @@ void M68kInstPrinter::printDisp(const MCInst *MI, unsigned opNum,
     return;
   }
   assert(Op.isExpr() && "Unknown operand kind in printOperand");
-  Op.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *Op.getExpr());
 }
 
 // NOTE forcing (W,L) size available since M68020 only
@@ -153,7 +154,7 @@ void M68kInstPrinter::printAbsMem(const MCInst *MI, unsigned opNum,
   const MCOperand &MO = MI->getOperand(opNum);
 
   if (MO.isExpr()) {
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
     return;
   }
 

From 95acd6199f3799da00e45b62fd1045ece7142cad Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:11:20 -0700
Subject: [PATCH 0433/1322] AMDGPU: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp   | 5 +++--
 llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp        | 4 ++--
 .../lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 5 +++--
 llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp     | 6 ++++--
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a56bca514aff..a6ce42dca92b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -12,6 +12,7 @@
 #include "SIDefines.h"
 #include "Utils/AMDGPUAsmUtils.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -51,7 +52,7 @@ void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                                            raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isExpr()) {
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
     return;
   }
 
@@ -787,7 +788,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
     }
   } else if (Op.isExpr()) {
     const MCExpr *Exp = Op.getExpr();
-    Exp->print(O, &MAI);
+    MAI.printExpr(O, *Exp);
   } else {
     O << "/*INV_OP*/";
   }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 7b04fb576f43..dc1445621c7a 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -77,7 +77,7 @@ void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
     break;
   }
   for (const auto *It = Args.begin(); It != Args.end(); ++It) {
-    (*It)->print(OS, MAI);
+    MAI->printExpr(OS, **It);
     if ((It + 1) != Args.end())
       OS << ", ";
   }
@@ -709,5 +709,5 @@ void llvm::AMDGPU::printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS,
     return;
   }
 
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
 }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index a6c97a02cb95..6d69bb75f293 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -18,6 +18,7 @@
 #include "Utils/AMDKernelCodeTUtils.h"
 #include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
 #include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -282,7 +283,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
   OS << "\t.set ";                                                             \
   ARG->print(OS, getContext().getAsmInfo());                                   \
   OS << ", ";                                                                  \
-  ARG->getVariableValue()->print(OS, getContext().getAsmInfo());               \
+  getContext().getAsmInfo()->printExpr(OS, *ARG->getVariableValue());          \
   Streamer.addBlankLine();
 
   PRINT_RES_INFO(NumVGPR);
@@ -304,7 +305,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
   OS << "\t.set ";                                                             \
   ARG->print(OS, getContext().getAsmInfo());                                   \
   OS << ", ";                                                                  \
-  ARG->getVariableValue()->print(OS, getContext().getAsmInfo());               \
+  getContext().getAsmInfo()->printExpr(OS, *ARG->getVariableValue());          \
   Streamer.addBlankLine();
 
   PRINT_RES_INFO(MaxVGPR);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
index 71e06dbbd151..46728e59a644 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
@@ -10,6 +10,7 @@
 #include "R600InstPrinter.h"
 #include "AMDGPUInstPrinter.h"
 #include "R600MCTargetDesc.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -98,7 +99,8 @@ void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
     O << Imm << '(' << llvm::bit_cast<float>(static_cast<uint32_t>(Imm)) << ')';
   }
   if (Op.isExpr()) {
-    Op.getExpr()->print(O << '@', &MAI);
+    O << '@';
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -160,7 +162,7 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     }
   } else if (Op.isExpr()) {
     const MCExpr *Exp = Op.getExpr();
-    Exp->print(O, &MAI);
+    MAI.printExpr(O, *Exp);
   } else {
     O << "/*INV_OP*/";
   }

From 0894094efdfb1ff4f93f818cef9f2aec9c1ea1a8 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:14:23 -0700
Subject: [PATCH 0434/1322] X86: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp   | 7 ++++---
 .../lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp | 2 +-
 llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp | 9 +++++----
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index cd89b88f4619..6614eea3901b 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -14,6 +14,7 @@
 #include "X86ATTInstPrinter.h"
 #include "X86BaseInfo.h"
 #include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrAnalysis.h"
@@ -414,7 +415,7 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     WithMarkup M = markup(O, Markup::Immediate);
     O << '$';
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -445,7 +446,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
       O << formatImm(DispVal);
   } else {
     assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
-    DispSpec.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *DispSpec.getExpr());
   }
 
   if (IndexReg.getReg() || BaseReg.getReg()) {
@@ -500,7 +501,7 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
     O << formatImm(DispSpec.getImm());
   } else {
     assert(DispSpec.isExpr() && "non-immediate displacement?");
-    DispSpec.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *DispSpec.getExpr());
   }
 }
 
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 8470d26011cd..7523d2aedcce 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -374,7 +374,7 @@ void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address,
       markup(O, Markup::Immediate) << formatHex((uint64_t)Address);
     } else {
       // Otherwise, just print the expression.
-      Op.getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *Op.getExpr());
     }
   }
 }
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index 0d92609b3a63..b8e117be465e 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -14,6 +14,7 @@
 #include "X86IntelInstPrinter.h"
 #include "X86BaseInfo.h"
 #include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrAnalysis.h"
@@ -372,7 +373,7 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     O << "offset ";
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -415,7 +416,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
   if (!DispSpec.isImm()) {
     if (NeedPlus) O << " + ";
     assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
-    DispSpec.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *DispSpec.getExpr());
   } else {
     int64_t DispVal = DispSpec.getImm();
     if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
@@ -470,7 +471,7 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
     markup(O, Markup::Immediate) << formatImm(DispSpec.getImm());
   } else {
     assert(DispSpec.isExpr() && "non-immediate displacement?");
-    DispSpec.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *DispSpec.getExpr());
   }
 
   O << ']';
@@ -479,7 +480,7 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
 void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
                                      raw_ostream &O) {
   if (MI->getOperand(Op).isExpr())
-    return MI->getOperand(Op).getExpr()->print(O, &MAI);
+    return MAI.printExpr(O, *MI->getOperand(Op).getExpr());
 
   markup(O, Markup::Immediate) << formatImm(MI->getOperand(Op).getImm() & 0xff);
 }

From dcb8cd8ecdd74eb2ceca2365e0fb4c9545e3cd97 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:19:14 -0700
Subject: [PATCH 0435/1322] ARM: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp |  2 +-
 llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp | 10 +++++-----
 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp      |  3 ++-
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp      |  3 ++-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 49b89cad6d47..2b959768d213 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -288,7 +288,7 @@ void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
   OS << "\t.thumb_set\t";
   Symbol->print(OS, MAI);
   OS << ", ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index ac90095a20be..ad00b171aaf6 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -350,7 +350,7 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     switch (Expr->getKind()) {
     case MCExpr::Binary:
       O << '#';
-      Expr->print(O, &MAI);
+      MAI.printExpr(O, *Expr);
       break;
     case MCExpr::Constant: {
       // If a symbolic branch target was added as a constant expression then
@@ -360,7 +360,7 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
       int64_t TargetAddress;
       if (!Constant->evaluateAsAbsolute(TargetAddress)) {
         O << '#';
-        Expr->print(O, &MAI);
+        MAI.printExpr(O, *Expr);
       } else {
         O << "0x";
         O.write_hex(static_cast<uint32_t>(TargetAddress));
@@ -370,7 +370,7 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     default:
       // FIXME: Should we always treat this as if it is a constant literal and
       // prefix it with '#'?
-      Expr->print(O, &MAI);
+      MAI.printExpr(O, *Expr);
       break;
     }
   }
@@ -395,7 +395,7 @@ void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
                                                raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
   if (MO1.isExpr()) {
-    MO1.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO1.getExpr());
     return;
   }
 
@@ -1081,7 +1081,7 @@ void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
   const MCOperand &MO = MI->getOperand(OpNum);
 
   if (MO.isExpr()) {
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
     return;
   }
 
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index b5e17e3c2da0..1035a9e131c4 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 using namespace llvm;
@@ -44,7 +45,7 @@ void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   const MCExpr *Expr = getSubExpr();
   if (Expr->getKind() != MCExpr::SymbolRef)
     OS << '(';
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
   if (Expr->getKind() != MCExpr::SymbolRef)
     OS << ')';
 }
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 49784e806b88..5be799093d2c 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -8,6 +8,7 @@
 
 #include "AVRMCExpr.h"
 
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
@@ -44,7 +45,7 @@ void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   OS << getName() << '(';
   if (isNegated())
     OS << '-' << '(';
-  getSubExpr()->print(OS, MAI);
+  MAI->printExpr(OS, *getSubExpr());
   if (isNegated())
     OS << ')';
   OS << ')';

From a7e5de472314a891604abee390beb8af5493b29a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:23:13 -0700
Subject: [PATCH 0436/1322] SystemZ: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 .../Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp | 6 +++---
 .../SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp       | 4 ++--
 llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp      | 3 ++-
 .../Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp   | 3 ++-
 llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp               | 2 +-
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
index 165feec7a7d4..ec8c81080930 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
@@ -209,7 +209,7 @@ void SystemZHLASMAsmStreamer::emitHLASMValueImpl(const MCExpr *Value,
   switch (Value->getKind()) {
   case MCExpr::Constant: {
     OS << "XL" << Size << '\'';
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
     OS << '\'';
     return;
   }
@@ -258,12 +258,12 @@ void SystemZHLASMAsmStreamer::emitHLASMValueImpl(const MCExpr *Value,
     return;
   }
   case MCExpr::Target:
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
     return;
   default:
     if (Parens)
       OS << "A(";
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
     if (Parens)
       OS << ')';
     return;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
index 5ba55e27a613..7fd1a1c2d801 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
@@ -53,7 +53,7 @@ void SystemZInstPrinterCommon::printOperand(const MCOperand &MO,
   } else if (MO.isImm())
     markup(O, Markup::Immediate) << MO.getImm();
   else if (MO.isExpr())
-    MO.getExpr()->print(O, MAI);
+    MAI->printExpr(O, *MO.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }
@@ -171,7 +171,7 @@ void SystemZInstPrinterCommon::printPCRelOperand(const MCInst *MI,
     markup(O, Markup::Target) << formatHex((uint64_t)TargetAddress);
   } else {
     // Otherwise, just print the expression.
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
   }
 }
 
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
index 0167eae60452..6dcca60dcedd 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 using namespace llvm;
 
@@ -32,7 +33,7 @@ StringRef SystemZMCExpr::getVariantKindName() const {
 
 void SystemZMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   OS << getVariantKindName() << '(';
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
   OS << ')';
 }
 
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp
index a4506eddaa69..772067809744 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZTargetStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 
 using namespace llvm;
@@ -44,7 +45,7 @@ const MCExpr *SystemZTargetHLASMStreamer::createWordDiffExpr(
   OS << Temp->getName() << " EQU ";
   const MCBinaryExpr *TempExpr = MCBinaryExpr::createSub(
       MCSymbolRefExpr::create(Hi, Ctx), MCSymbolRefExpr::create(Lo, Ctx), Ctx);
-  TempExpr->print(OS, Ctx.getAsmInfo());
+  Ctx.getAsmInfo()->printExpr(OS, *TempExpr);
   OS << "\n";
   return MCBinaryExpr::createLShr(MCSymbolRefExpr::create(Temp, Ctx),
                                   MCConstantExpr::create(1, Ctx), Ctx);
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 57911ac1ec2f..eb4b4c1647a1 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -1043,7 +1043,7 @@ static void printOperand(const MCOperand &MCOp, const MCAsmInfo *MAI,
   else if (MCOp.isImm())
     OS << MCOp.getImm();
   else if (MCOp.isExpr())
-    MCOp.getExpr()->print(OS, MAI);
+    MAI->printExpr(OS, *MCOp.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }

From dca2b261d77a9b758587b660e5b88b6a312d057c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:25:13 -0700
Subject: [PATCH 0437/1322] Lanai: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 .../Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp   | 12 ++++++------
 llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp   |  5 +++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
index 837d8fea1c89..add4096ef936 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
@@ -152,7 +152,7 @@ void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     OS << formatHex(Op.getImm());
   else {
     assert(Op.isExpr() && "Expected an expression");
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
   }
 }
 
@@ -165,7 +165,7 @@ void LanaiInstPrinter::printMemImmOperand(const MCInst *MI, unsigned OpNo,
     // Symbolic operand will be lowered to immediate value by linker
     assert(Op.isExpr() && "Expected an expression");
     OS << '[';
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
     OS << ']';
   }
 }
@@ -178,7 +178,7 @@ void LanaiInstPrinter::printHi16ImmOperand(const MCInst *MI, unsigned OpNo,
   } else {
     // Symbolic operand will be lowered to immediate value by linker
     assert(Op.isExpr() && "Expected an expression");
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
   }
 }
 
@@ -190,7 +190,7 @@ void LanaiInstPrinter::printHi16AndImmOperand(const MCInst *MI, unsigned OpNo,
   } else {
     // Symbolic operand will be lowered to immediate value by linker
     assert(Op.isExpr() && "Expected an expression");
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
   }
 }
 
@@ -202,7 +202,7 @@ void LanaiInstPrinter::printLo16AndImmOperand(const MCInst *MI, unsigned OpNo,
   } else {
     // Symbolic operand will be lowered to immediate value by linker
     assert(Op.isExpr() && "Expected an expression");
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
   }
 }
 
@@ -227,7 +227,7 @@ static void printMemoryImmediateOffset(const MCAsmInfo &MAI,
     assert(isInt<SizeInBits>(OffsetOp.getImm()) && "Constant value truncated");
     OS << OffsetOp.getImm();
   } else
-    OffsetOp.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *OffsetOp.getExpr());
 }
 
 void LanaiInstPrinter::printMemRiOperand(const MCInst *MI, int OpNo,
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
index eec1b7f482f1..b75a09915660 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "LanaiMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
@@ -21,7 +22,7 @@ const LanaiMCExpr *LanaiMCExpr::create(Spec S, const MCExpr *Expr,
 
 void LanaiMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   if (specifier == VK_Lanai_None) {
-    Expr->print(OS, MAI);
+    MAI->printExpr(OS, *Expr);
     return;
   }
 
@@ -38,6 +39,6 @@ void LanaiMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
 
   OS << '(';
   const MCExpr *Expr = getSubExpr();
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
   OS << ')';
 }

From 178fac3d61aa7fc4eb9e4a3d385ae02e660c0d3a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:31:08 -0700
Subject: [PATCH 0438/1322] Hexagon: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp      | 3 ++-
 llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp | 4 ++--
 llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp      | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index ae0305d570dc..f9b4bc0d14fd 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/HexagonMCAsmInfo.h"
 #include "MCTargetDesc/HexagonMCChecker.h"
 #include "MCTargetDesc/HexagonMCELFStreamer.h"
 #include "MCTargetDesc/HexagonMCExpr.h"
@@ -457,7 +458,7 @@ public:
 void HexagonOperand::print(raw_ostream &OS) const {
   switch (Kind) {
   case Immediate:
-    getImm()->print(OS, nullptr);
+    HexagonMCAsmInfo(Triple()).printExpr(OS, *getImm());
     break;
   case Register:
     OS << "<register R";
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 16b8cec54199..9030e43b7149 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -72,7 +72,7 @@ void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo,
     if (MO.getExpr()->evaluateAsAbsolute(Value))
       O << formatImm(Value);
     else
-      MO.getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *MO.getExpr());
   } else {
     llvm_unreachable("Unknown operand");
   }
@@ -90,6 +90,6 @@ void HexagonInstPrinter::printBrtarget(MCInst const *MI, unsigned OpNo,
     if (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI))
       if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo)
         O << "##";
-    Expr.print(O, &MAI);
+    MAI.printExpr(O, Expr);
   }
 }
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
index 92a8be359d73..d96e9601bf9e 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "HexagonMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCValue.h"
@@ -58,7 +59,7 @@ HexagonMCExpr::HexagonMCExpr(MCExpr const *Expr)
       SignMismatch(false) {}
 
 void HexagonMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
 }
 
 void HexagonMCExpr::setSignMismatch(bool Val) {

From 22ad0359f9006f47a1707170896f359abbd6e10d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:34:31 -0700
Subject: [PATCH 0439/1322] NVPTX: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 3 ++-
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp               | 2 +-
 llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp                   | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 732950deca9f..cc79257fb9c8 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -15,6 +15,7 @@
 #include "NVPTXUtilities.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/NVVMIntrinsicUtils.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -90,7 +91,7 @@ void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     markup(O, Markup::Immediate) << formatImm(Op.getImm());
   } else {
     assert(Op.isExpr() && "Unknown operand kind in printOperand");
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index a1f528c4379e..b4e2c46b9444 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1850,7 +1850,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV,
 }
 
 void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) const {
-  Expr.print(OS, OutContext.getAsmInfo());
+  OutContext.getAsmInfo()->printExpr(OS, Expr);
 }
 
 /// PrintAsmOperand - Print out an operand for an inline asm expression.
diff --git a/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp b/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
index 95125eb41bc0..8cde0873d4d2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -8,6 +8,7 @@
 
 #include "NVPTXMCExpr.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/Support/Format.h"
@@ -64,6 +65,6 @@ NVPTXGenericMCSymbolRefExpr::create(const MCSymbolRefExpr *SymExpr,
 void NVPTXGenericMCSymbolRefExpr::printImpl(raw_ostream &OS,
                                             const MCAsmInfo *MAI) const {
   OS << "generic(";
-  SymExpr->print(OS, MAI);
+  MAI->printExpr(OS, *SymExpr);
   OS << ")";
 }

From c9d511bc642fbf612014eee4749ad7ee2646af32 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:41:17 -0700
Subject: [PATCH 0440/1322] Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 bolt/lib/Passes/RetpolineInsertion.cpp                      | 2 +-
 llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp   | 6 +++---
 .../WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp     | 3 ++-
 llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp   | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/bolt/lib/Passes/RetpolineInsertion.cpp b/bolt/lib/Passes/RetpolineInsertion.cpp
index 98e5a8fba645..bda26206e16c 100644
--- a/bolt/lib/Passes/RetpolineInsertion.cpp
+++ b/bolt/lib/Passes/RetpolineInsertion.cpp
@@ -195,7 +195,7 @@ std::string createRetpolineFunctionTag(BinaryContext &BC,
 
   TagOS << "+";
   if (MemRef.DispExpr)
-    MemRef.DispExpr->print(TagOS, BC.AsmInfo.get());
+    BC.AsmInfo->printExpr(TagOS, *MemRef.DispExpr);
   else
     TagOS << MemRef.DispImm;
 
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
index f925a1efc88f..80a1e85e4a5d 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
@@ -48,7 +48,7 @@ void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
     O << Imm;
   } else {
     assert(Op.isExpr() && "unknown pcrel immediate operand");
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -62,7 +62,7 @@ void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     O << '#';
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -83,7 +83,7 @@ void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
     O << '&';
 
   if (Disp.isExpr())
-    Disp.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Disp.getExpr());
   else {
     assert(Disp.isImm() && "Expected immediate in displacement field");
     O << Disp.getImm();
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
index 344ccec58aff..321aee472082 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -344,7 +345,7 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
       auto &Sym = static_cast<const MCSymbolWasm &>(SRE->getSymbol());
       O << WebAssembly::signatureToString(Sym.getSignature());
     } else {
-      Op.getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *Op.getExpr());
     }
   }
 }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
index 408a6ac01de9..6f9f29765452 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
@@ -135,7 +135,7 @@ void XtensaInstPrinter::printCallOperand(const MCInst *MI, uint64_t Address,
       O << Val;
     }
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }

From d793168e3b1a0343debfdfe143d7fb4127f9038c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 18:04:22 -0700
Subject: [PATCH 0441/1322] MIPS: Rename MipsMCExpr::MEK_ to Mips::S_

Prepare for removing MipsMCExpr. Adopt the newer naming convention (S_)
used by AMDGPU/WebAssembly/VE/M68k/PowerPC/LoongArch/RISCV.
---
 .../Target/Mips/AsmParser/MipsAsmParser.cpp   | 113 ++++++++-------
 .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp |  18 +--
 .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp       | 132 +++++++++++++++++
 .../Target/Mips/MCTargetDesc/MipsMCAsmInfo.h  |  34 +++++
 .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp   |  56 ++++----
 .../Target/Mips/MCTargetDesc/MipsMCExpr.cpp   | 134 +-----------------
 .../lib/Target/Mips/MCTargetDesc/MipsMCExpr.h |  29 ----
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp  |   9 +-
 llvm/lib/Target/Mips/MipsAsmPrinter.cpp       |   3 +-
 llvm/lib/Target/Mips/MipsMCInstLower.cpp      |  68 ++++-----
 llvm/lib/Target/Mips/MipsTargetObjectFile.cpp |   4 +-
 11 files changed, 305 insertions(+), 295 deletions(-)

diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 76bbdef7ae22..8d9c3a96b32a 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -9,7 +9,7 @@
 #include "MCTargetDesc/MipsABIFlagsSection.h"
 #include "MCTargetDesc/MipsABIInfo.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MCTargetDesc/MipsTargetStreamer.h"
 #include "TargetInfo/MipsTargetInfo.h"
@@ -2964,10 +2964,10 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
         Res.getConstant() == 0 && !IsLocalSym) {
       if (UseXGOT) {
-        const MCExpr *CallHiExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_HI16,
-                                                      SymExpr, getContext());
-        const MCExpr *CallLoExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_LO16,
-                                                      SymExpr, getContext());
+        const MCExpr *CallHiExpr =
+            MipsMCExpr::create(Mips::S_CALL_HI16, SymExpr, getContext());
+        const MCExpr *CallLoExpr =
+            MipsMCExpr::create(Mips::S_CALL_LO16, SymExpr, getContext());
         TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(CallHiExpr), IDLoc,
                     STI);
         TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, DstReg, GPReg,
@@ -2976,7 +2976,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
                      MCOperand::createExpr(CallLoExpr), IDLoc, STI);
       } else {
         const MCExpr *CallExpr =
-            MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
+            MipsMCExpr::create(Mips::S_GOT_CALL, SymExpr, getContext());
         TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, GPReg,
                      MCOperand::createExpr(CallExpr), IDLoc, STI);
       }
@@ -3009,9 +3009,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // this happens then the last instruction must use $rd as the result
       // register.
       const MCExpr *CallHiExpr =
-          MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, SymExpr, getContext());
-      const MCExpr *CallLoExpr = MipsMCExpr::create(
-          Res.getAddSym(), MipsMCExpr::MEK_GOT_LO16, getContext());
+          MipsMCExpr::create(Mips::S_GOT_HI16, SymExpr, getContext());
+      const MCExpr *CallLoExpr =
+          MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_LO16, getContext());
 
       TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(CallHiExpr), IDLoc,
                   STI);
@@ -3042,8 +3042,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // The daddiu's marked with a '>' may be omitted if they are redundant. If
       // this happens then the last instruction must use $rd as the result
       // register.
-      GotExpr = MipsMCExpr::create(Res.getAddSym(), MipsMCExpr::MEK_GOT_DISP,
-                                   getContext());
+      GotExpr =
+          MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_DISP, getContext());
       if (Res.getConstant() != 0) {
         // Symbols fully resolve with just the %got_disp(symbol) but we
         // must still account for any offset to the symbol for
@@ -3070,15 +3070,14 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // this happens then the last instruction must use $rd as the result
       // register.
       if (IsLocalSym) {
-        GotExpr =
-            MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext());
-        LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+        GotExpr = MipsMCExpr::create(Mips::S_GOT, SymExpr, getContext());
+        LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
       } else {
         // External symbols fully resolve the symbol with just the %got(symbol)
         // but we must still account for any offset to the symbol for
         // expressions like symbol+8.
-        GotExpr = MipsMCExpr::create(Res.getAddSym(), MipsMCExpr::MEK_GOT,
-                                     getContext());
+        GotExpr =
+            MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT, getContext());
         if (Res.getConstant() != 0)
           LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
       }
@@ -3099,9 +3098,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
   }
 
   const MipsMCExpr *HiExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext());
+      MipsMCExpr::create(Mips::S_HI, SymExpr, getContext());
   const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+      MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
 
   // This is the 64-bit symbol address expansion.
   if (ABI.ArePtrs64bit() && isGP64bit()) {
@@ -3113,9 +3112,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     // source register.
 
     const MipsMCExpr *HighestExpr =
-        MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext());
+        MipsMCExpr::create(Mips::S_HIGHEST, SymExpr, getContext());
     const MipsMCExpr *HigherExpr =
-        MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext());
+        MipsMCExpr::create(Mips::S_HIGHER, SymExpr, getContext());
 
     bool RdRegIsRsReg =
         UseSrcReg &&
@@ -3314,7 +3313,7 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
   if(IsPicEnabled) {
     const MCExpr *GotSym = MCSymbolRefExpr::create(Sym, getContext());
     const MipsMCExpr *GotExpr =
-        MipsMCExpr::create(MipsMCExpr::MEK_GOT, GotSym, getContext());
+        MipsMCExpr::create(Mips::S_GOT, GotSym, getContext());
 
     if(isABI_O32() || isABI_N32()) {
       TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr),
@@ -3326,7 +3325,7 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
   } else { //!IsPicEnabled
     const MCExpr *HiSym = MCSymbolRefExpr::create(Sym, getContext());
     const MipsMCExpr *HiExpr =
-        MipsMCExpr::create(MipsMCExpr::MEK_HI, HiSym, getContext());
+        MipsMCExpr::create(Mips::S_HI, HiSym, getContext());
 
     // FIXME: This is technically correct but gives a different result to gas,
     // but gas is incomplete there (it has a fixme noting it doesn't work with
@@ -3339,10 +3338,10 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
     } else { //isABI_N64()
       const MCExpr *HighestSym = MCSymbolRefExpr::create(Sym, getContext());
       const MipsMCExpr *HighestExpr =
-          MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, HighestSym, getContext());
+          MipsMCExpr::create(Mips::S_HIGHEST, HighestSym, getContext());
       const MCExpr *HigherSym = MCSymbolRefExpr::create(Sym, getContext());
       const MipsMCExpr *HigherExpr =
-          MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, HigherSym, getContext());
+          MipsMCExpr::create(Mips::S_HIGHER, HigherSym, getContext());
 
       TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
                   STI);
@@ -3430,7 +3429,7 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
   const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3481,7 +3480,7 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
   const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3562,7 +3561,7 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
   const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3785,15 +3784,15 @@ void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
       //                  sw  $8,  %lo(sym)($at)
       const MCExpr *OffExpr = OffsetOp.getExpr();
       MCOperand LoOperand = MCOperand::createExpr(
-          MipsMCExpr::create(MipsMCExpr::MEK_LO, OffExpr, getContext()));
+          MipsMCExpr::create(Mips::S_LO, OffExpr, getContext()));
       MCOperand HiOperand = MCOperand::createExpr(
-          MipsMCExpr::create(MipsMCExpr::MEK_HI, OffExpr, getContext()));
+          MipsMCExpr::create(Mips::S_HI, OffExpr, getContext()));
 
       if (ABI.IsN64()) {
         MCOperand HighestOperand = MCOperand::createExpr(
-            MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, OffExpr, getContext()));
+            MipsMCExpr::create(Mips::S_HIGHEST, OffExpr, getContext()));
         MCOperand HigherOperand = MCOperand::createExpr(
-            MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, OffExpr, getContext()));
+            MipsMCExpr::create(Mips::S_HIGHER, OffExpr, getContext()));
 
         TOut.emitRX(Mips::LUi, TmpReg, HighestOperand, IDLoc, STI);
         TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, HigherOperand, IDLoc, STI);
@@ -6355,31 +6354,31 @@ MCRegister MipsAsmParser::getReg(int RC, int RegNo) {
 const MCExpr *MipsAsmParser::parseRelocExpr() {
   auto getOp = [](StringRef Op) {
     return StringSwitch<MipsMCExpr::Specifier>(Op)
-        .Case("call16", MipsMCExpr::MEK_GOT_CALL)
-        .Case("call_hi", MipsMCExpr::MEK_CALL_HI16)
-        .Case("call_lo", MipsMCExpr::MEK_CALL_LO16)
-        .Case("dtprel_hi", MipsMCExpr::MEK_DTPREL_HI)
-        .Case("dtprel_lo", MipsMCExpr::MEK_DTPREL_LO)
-        .Case("got", MipsMCExpr::MEK_GOT)
-        .Case("got_disp", MipsMCExpr::MEK_GOT_DISP)
-        .Case("got_hi", MipsMCExpr::MEK_GOT_HI16)
-        .Case("got_lo", MipsMCExpr::MEK_GOT_LO16)
-        .Case("got_ofst", MipsMCExpr::MEK_GOT_OFST)
-        .Case("got_page", MipsMCExpr::MEK_GOT_PAGE)
-        .Case("gottprel", MipsMCExpr::MEK_GOTTPREL)
-        .Case("gp_rel", MipsMCExpr::MEK_GPREL)
-        .Case("hi", MipsMCExpr::MEK_HI)
-        .Case("higher", MipsMCExpr::MEK_HIGHER)
-        .Case("highest", MipsMCExpr::MEK_HIGHEST)
-        .Case("lo", MipsMCExpr::MEK_LO)
-        .Case("neg", MipsMCExpr::MEK_NEG)
-        .Case("pcrel_hi", MipsMCExpr::MEK_PCREL_HI16)
-        .Case("pcrel_lo", MipsMCExpr::MEK_PCREL_LO16)
-        .Case("tlsgd", MipsMCExpr::MEK_TLSGD)
-        .Case("tlsldm", MipsMCExpr::MEK_TLSLDM)
-        .Case("tprel_hi", MipsMCExpr::MEK_TPREL_HI)
-        .Case("tprel_lo", MipsMCExpr::MEK_TPREL_LO)
-        .Default(MipsMCExpr::MEK_None);
+        .Case("call16", Mips::S_GOT_CALL)
+        .Case("call_hi", Mips::S_CALL_HI16)
+        .Case("call_lo", Mips::S_CALL_LO16)
+        .Case("dtprel_hi", Mips::S_DTPREL_HI)
+        .Case("dtprel_lo", Mips::S_DTPREL_LO)
+        .Case("got", Mips::S_GOT)
+        .Case("got_disp", Mips::S_GOT_DISP)
+        .Case("got_hi", Mips::S_GOT_HI16)
+        .Case("got_lo", Mips::S_GOT_LO16)
+        .Case("got_ofst", Mips::S_GOT_OFST)
+        .Case("got_page", Mips::S_GOT_PAGE)
+        .Case("gottprel", Mips::S_GOTTPREL)
+        .Case("gp_rel", Mips::S_GPREL)
+        .Case("hi", Mips::S_HI)
+        .Case("higher", Mips::S_HIGHER)
+        .Case("highest", Mips::S_HIGHEST)
+        .Case("lo", Mips::S_LO)
+        .Case("neg", Mips::S_NEG)
+        .Case("pcrel_hi", Mips::S_PCREL_HI16)
+        .Case("pcrel_lo", Mips::S_PCREL_LO16)
+        .Case("tlsgd", Mips::S_TLSGD)
+        .Case("tlsldm", Mips::S_TLSLDM)
+        .Case("tprel_hi", Mips::S_TPREL_HI)
+        .Case("tprel_lo", Mips::S_TPREL_LO)
+        .Default(Mips::S_None);
   };
 
   MCAsmParser &Parser = getParser();
@@ -6391,7 +6390,7 @@ const MCExpr *MipsAsmParser::parseRelocExpr() {
         Parser.parseToken(AsmToken::LParen, "expected '('"))
       return nullptr;
     auto Op = getOp(Name);
-    if (Op == MipsMCExpr::MEK_None) {
+    if (Op == Mips::S_None) {
       Error(Parser.getTok().getLoc(), "invalid relocation operator");
       return nullptr;
     }
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index d5e19ccaa168..58aa374e5302 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/MipsFixupKinds.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -157,14 +157,14 @@ unsigned MipsELFObjectWriter::getRelocType(const MCFixup &Fixup,
   // Determine the type of the relocation.
   unsigned Kind = Fixup.getTargetKind();
   switch (Target.getSpecifier()) {
-  case MipsMCExpr::MEK_DTPREL:
-  case MipsMCExpr::MEK_DTPREL_HI:
-  case MipsMCExpr::MEK_DTPREL_LO:
-  case MipsMCExpr::MEK_TLSLDM:
-  case MipsMCExpr::MEK_TLSGD:
-  case MipsMCExpr::MEK_GOTTPREL:
-  case MipsMCExpr::MEK_TPREL_HI:
-  case MipsMCExpr::MEK_TPREL_LO:
+  case Mips::S_DTPREL:
+  case Mips::S_DTPREL_HI:
+  case Mips::S_DTPREL_LO:
+  case Mips::S_TLSLDM:
+  case Mips::S_TLSGD:
+  case Mips::S_GOTTPREL:
+  case Mips::S_TPREL_HI:
+  case Mips::S_TPREL_LO:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 11df6fecaf37..97c173618167 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "MipsMCAsmInfo.h"
 #include "MipsABIInfo.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -57,3 +58,134 @@ MipsCOFFMCAsmInfo::MipsCOFFMCAsmInfo() {
   PrivateLabelPrefix = ".L";
   AllowAtInName = true;
 }
+
+void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  int64_t AbsVal;
+
+  switch (specifier) {
+  case Mips::S_None:
+  case Mips::S_Special:
+    llvm_unreachable("Mips::S_None and MEK_Special are invalid");
+    break;
+  case Mips::S_DTPREL:
+    // Mips::S_DTPREL is used for marking TLS DIEExpr only
+    // and contains a regular sub-expression.
+    MAI->printExpr(OS, *getSubExpr());
+    return;
+  case Mips::S_CALL_HI16:
+    OS << "%call_hi";
+    break;
+  case Mips::S_CALL_LO16:
+    OS << "%call_lo";
+    break;
+  case Mips::S_DTPREL_HI:
+    OS << "%dtprel_hi";
+    break;
+  case Mips::S_DTPREL_LO:
+    OS << "%dtprel_lo";
+    break;
+  case Mips::S_GOT:
+    OS << "%got";
+    break;
+  case Mips::S_GOTTPREL:
+    OS << "%gottprel";
+    break;
+  case Mips::S_GOT_CALL:
+    OS << "%call16";
+    break;
+  case Mips::S_GOT_DISP:
+    OS << "%got_disp";
+    break;
+  case Mips::S_GOT_HI16:
+    OS << "%got_hi";
+    break;
+  case Mips::S_GOT_LO16:
+    OS << "%got_lo";
+    break;
+  case Mips::S_GOT_PAGE:
+    OS << "%got_page";
+    break;
+  case Mips::S_GOT_OFST:
+    OS << "%got_ofst";
+    break;
+  case Mips::S_GPREL:
+    OS << "%gp_rel";
+    break;
+  case Mips::S_HI:
+    OS << "%hi";
+    break;
+  case Mips::S_HIGHER:
+    OS << "%higher";
+    break;
+  case Mips::S_HIGHEST:
+    OS << "%highest";
+    break;
+  case Mips::S_LO:
+    OS << "%lo";
+    break;
+  case Mips::S_NEG:
+    OS << "%neg";
+    break;
+  case Mips::S_PCREL_HI16:
+    OS << "%pcrel_hi";
+    break;
+  case Mips::S_PCREL_LO16:
+    OS << "%pcrel_lo";
+    break;
+  case Mips::S_TLSGD:
+    OS << "%tlsgd";
+    break;
+  case Mips::S_TLSLDM:
+    OS << "%tlsldm";
+    break;
+  case Mips::S_TPREL_HI:
+    OS << "%tprel_hi";
+    break;
+  case Mips::S_TPREL_LO:
+    OS << "%tprel_lo";
+    break;
+  }
+
+  OS << '(';
+  if (Expr->evaluateAsAbsolute(AbsVal))
+    OS << AbsVal;
+  else
+    Expr->print(OS, MAI);
+  OS << ')';
+}
+
+bool MipsMCExpr::isGpOff(Specifier &S) const {
+  if (getSpecifier() == Mips::S_HI || getSpecifier() == Mips::S_LO) {
+    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(getSubExpr())) {
+      if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
+        if (S1->getSpecifier() == Mips::S_NEG &&
+            S2->getSpecifier() == Mips::S_GPREL) {
+          S = getSpecifier();
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                           const MCAssembler *Asm) const {
+  // Look for the %hi(%neg(%gp_rel(X))) and %lo(%neg(%gp_rel(X)))
+  // special cases.
+  if (isGpOff()) {
+    const MCExpr *SubExpr =
+        cast<MipsMCExpr>(cast<MipsMCExpr>(getSubExpr())->getSubExpr())
+            ->getSubExpr();
+    if (!SubExpr->evaluateAsRelocatable(Res, Asm))
+      return false;
+
+    Res.setSpecifier(Mips::S_Special);
+    return true;
+  }
+
+  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(specifier);
+  return !Res.getSubSym();
+}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 3a2895a79f9c..d8b96f8b568c 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -13,8 +13,10 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
 #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
 
+#include "MCTargetDesc/MipsMCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCFixup.h"
 
 namespace llvm {
 class Triple;
@@ -34,6 +36,38 @@ public:
   explicit MipsCOFFMCAsmInfo();
 };
 
+namespace Mips {
+enum {
+  S_None,
+  S_CALL_HI16 = FirstTargetFixupKind,
+  S_CALL_LO16,
+  S_DTPREL,
+  S_DTPREL_HI,
+  S_DTPREL_LO,
+  S_GOT,
+  S_GOTTPREL,
+  S_GOT_CALL,
+  S_GOT_DISP,
+  S_GOT_HI16,
+  S_GOT_LO16,
+  S_GOT_OFST,
+  S_GOT_PAGE,
+  S_GPREL,
+  S_HI,
+  S_HIGHER,
+  S_HIGHEST,
+  S_LO,
+  S_NEG,
+  S_PCREL_HI16,
+  S_PCREL_LO16,
+  S_TLSGD,
+  S_TLSLDM,
+  S_TPREL_HI,
+  S_TPREL_LO,
+  S_Special,
+};
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index a426ca7360ce..4035618e0252 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -13,7 +13,7 @@
 #include "MipsMCCodeEmitter.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MCTargetDesc/MipsFixupKinds.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
@@ -585,62 +585,62 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
 
     Mips::Fixups FixupKind = Mips::Fixups(0);
     switch (MipsExpr->getSpecifier()) {
-    case MipsMCExpr::MEK_None:
-    case MipsMCExpr::MEK_Special:
+    case Mips::S_None:
+    case Mips::S_Special:
       llvm_unreachable("Unhandled fixup kind!");
       break;
-    case MipsMCExpr::MEK_DTPREL:
+    case Mips::S_DTPREL:
       // MEK_DTPREL is used for marking TLS DIEExpr only
       // and contains a regular sub-expression.
       return getExprOpValue(MipsExpr->getSubExpr(), Fixups, STI);
-    case MipsMCExpr::MEK_CALL_HI16:
+    case Mips::S_CALL_HI16:
       FixupKind = Mips::fixup_Mips_CALL_HI16;
       break;
-    case MipsMCExpr::MEK_CALL_LO16:
+    case Mips::S_CALL_LO16:
       FixupKind = Mips::fixup_Mips_CALL_LO16;
       break;
-    case MipsMCExpr::MEK_DTPREL_HI:
+    case Mips::S_DTPREL_HI:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
                                    : Mips::fixup_Mips_DTPREL_HI;
       break;
-    case MipsMCExpr::MEK_DTPREL_LO:
+    case Mips::S_DTPREL_LO:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
                                    : Mips::fixup_Mips_DTPREL_LO;
       break;
-    case MipsMCExpr::MEK_GOTTPREL:
+    case Mips::S_GOTTPREL:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOTTPREL
                                    : Mips::fixup_Mips_GOTTPREL;
       break;
-    case MipsMCExpr::MEK_GOT:
+    case Mips::S_GOT:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
                                    : Mips::fixup_Mips_GOT;
       break;
-    case MipsMCExpr::MEK_GOT_CALL:
+    case Mips::S_GOT_CALL:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_CALL16
                                    : Mips::fixup_Mips_CALL16;
       break;
-    case MipsMCExpr::MEK_GOT_DISP:
+    case Mips::S_GOT_DISP:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_DISP
                                    : Mips::fixup_Mips_GOT_DISP;
       break;
-    case MipsMCExpr::MEK_GOT_HI16:
+    case Mips::S_GOT_HI16:
       FixupKind = Mips::fixup_Mips_GOT_HI16;
       break;
-    case MipsMCExpr::MEK_GOT_LO16:
+    case Mips::S_GOT_LO16:
       FixupKind = Mips::fixup_Mips_GOT_LO16;
       break;
-    case MipsMCExpr::MEK_GOT_PAGE:
+    case Mips::S_GOT_PAGE:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_PAGE
                                    : Mips::fixup_Mips_GOT_PAGE;
       break;
-    case MipsMCExpr::MEK_GOT_OFST:
+    case Mips::S_GOT_OFST:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_OFST
                                    : Mips::fixup_Mips_GOT_OFST;
       break;
-    case MipsMCExpr::MEK_GPREL:
+    case Mips::S_GPREL:
       FixupKind = Mips::fixup_Mips_GPREL16;
       break;
-    case MipsMCExpr::MEK_LO:
+    case Mips::S_LO:
       // Check for %lo(%neg(%gp_rel(X)))
       if (MipsExpr->isGpOff())
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_LO
@@ -649,15 +649,15 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
                                      : Mips::fixup_Mips_LO16;
       break;
-    case MipsMCExpr::MEK_HIGHEST:
+    case Mips::S_HIGHEST:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HIGHEST
                                    : Mips::fixup_Mips_HIGHEST;
       break;
-    case MipsMCExpr::MEK_HIGHER:
+    case Mips::S_HIGHER:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HIGHER
                                    : Mips::fixup_Mips_HIGHER;
       break;
-    case MipsMCExpr::MEK_HI:
+    case Mips::S_HI:
       // Check for %hi(%neg(%gp_rel(X)))
       if (MipsExpr->isGpOff())
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_HI
@@ -666,29 +666,29 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
                                      : Mips::fixup_Mips_HI16;
       break;
-    case MipsMCExpr::MEK_PCREL_HI16:
+    case Mips::S_PCREL_HI16:
       FixupKind = Mips::fixup_MIPS_PCHI16;
       break;
-    case MipsMCExpr::MEK_PCREL_LO16:
+    case Mips::S_PCREL_LO16:
       FixupKind = Mips::fixup_MIPS_PCLO16;
       break;
-    case MipsMCExpr::MEK_TLSGD:
+    case Mips::S_TLSGD:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_GD
                                    : Mips::fixup_Mips_TLSGD;
       break;
-    case MipsMCExpr::MEK_TLSLDM:
+    case Mips::S_TLSLDM:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_LDM
                                    : Mips::fixup_Mips_TLSLDM;
       break;
-    case MipsMCExpr::MEK_TPREL_HI:
+    case Mips::S_TPREL_HI:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
                                    : Mips::fixup_Mips_TPREL_HI;
       break;
-    case MipsMCExpr::MEK_TPREL_LO:
+    case Mips::S_TPREL_LO:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
                                    : Mips::fixup_Mips_TPREL_LO;
       break;
-    case MipsMCExpr::MEK_NEG:
+    case Mips::S_NEG:
       FixupKind =
           isMicroMips(STI) ? Mips::fixup_MICROMIPS_SUB : Mips::fixup_Mips_SUB;
       break;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 704ee0375f7a..821f662f0cbf 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
@@ -33,135 +34,6 @@ const MipsMCExpr *MipsMCExpr::create(const MCSymbol *Sym, Specifier S,
 
 const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S,
                                           const MCExpr *Expr, MCContext &Ctx) {
-  return create(S, create(MEK_NEG, create(MEK_GPREL, Expr, Ctx), Ctx), Ctx);
-}
-
-void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  int64_t AbsVal;
-
-  switch (specifier) {
-  case MEK_None:
-  case MEK_Special:
-    llvm_unreachable("MEK_None and MEK_Special are invalid");
-    break;
-  case MEK_DTPREL:
-    // MEK_DTPREL is used for marking TLS DIEExpr only
-    // and contains a regular sub-expression.
-    MAI->printExpr(OS, *getSubExpr());
-    return;
-  case MEK_CALL_HI16:
-    OS << "%call_hi";
-    break;
-  case MEK_CALL_LO16:
-    OS << "%call_lo";
-    break;
-  case MEK_DTPREL_HI:
-    OS << "%dtprel_hi";
-    break;
-  case MEK_DTPREL_LO:
-    OS << "%dtprel_lo";
-    break;
-  case MEK_GOT:
-    OS << "%got";
-    break;
-  case MEK_GOTTPREL:
-    OS << "%gottprel";
-    break;
-  case MEK_GOT_CALL:
-    OS << "%call16";
-    break;
-  case MEK_GOT_DISP:
-    OS << "%got_disp";
-    break;
-  case MEK_GOT_HI16:
-    OS << "%got_hi";
-    break;
-  case MEK_GOT_LO16:
-    OS << "%got_lo";
-    break;
-  case MEK_GOT_PAGE:
-    OS << "%got_page";
-    break;
-  case MEK_GOT_OFST:
-    OS << "%got_ofst";
-    break;
-  case MEK_GPREL:
-    OS << "%gp_rel";
-    break;
-  case MEK_HI:
-    OS << "%hi";
-    break;
-  case MEK_HIGHER:
-    OS << "%higher";
-    break;
-  case MEK_HIGHEST:
-    OS << "%highest";
-    break;
-  case MEK_LO:
-    OS << "%lo";
-    break;
-  case MEK_NEG:
-    OS << "%neg";
-    break;
-  case MEK_PCREL_HI16:
-    OS << "%pcrel_hi";
-    break;
-  case MEK_PCREL_LO16:
-    OS << "%pcrel_lo";
-    break;
-  case MEK_TLSGD:
-    OS << "%tlsgd";
-    break;
-  case MEK_TLSLDM:
-    OS << "%tlsldm";
-    break;
-  case MEK_TPREL_HI:
-    OS << "%tprel_hi";
-    break;
-  case MEK_TPREL_LO:
-    OS << "%tprel_lo";
-    break;
-  }
-
-  OS << '(';
-  if (Expr->evaluateAsAbsolute(AbsVal))
-    OS << AbsVal;
-  else
-    Expr->print(OS, MAI);
-  OS << ')';
-}
-
-bool MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                           const MCAssembler *Asm) const {
-  // Look for the %hi(%neg(%gp_rel(X))) and %lo(%neg(%gp_rel(X)))
-  // special cases.
-  if (isGpOff()) {
-    const MCExpr *SubExpr =
-        cast<MipsMCExpr>(cast<MipsMCExpr>(getSubExpr())->getSubExpr())
-            ->getSubExpr();
-    if (!SubExpr->evaluateAsRelocatable(Res, Asm))
-      return false;
-
-    Res.setSpecifier(MEK_Special);
-    return true;
-  }
-
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(specifier);
-  return !Res.getSubSym();
-}
-
-bool MipsMCExpr::isGpOff(Specifier &S) const {
-  if (getSpecifier() == MEK_HI || getSpecifier() == MEK_LO) {
-    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(getSubExpr())) {
-      if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
-        if (S1->getSpecifier() == MEK_NEG && S2->getSpecifier() == MEK_GPREL) {
-          S = getSpecifier();
-          return true;
-        }
-      }
-    }
-  }
-  return false;
+  return create(S, create(Mips::S_NEG, create(Mips::S_GPREL, Expr, Ctx), Ctx),
+                Ctx);
 }
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 5aad02c38d6e..216077a1aa48 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -17,35 +17,6 @@ namespace llvm {
 class MipsMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = Spec;
-  enum {
-    MEK_None,
-    MEK_CALL_HI16,
-    MEK_CALL_LO16,
-    MEK_DTPREL,
-    MEK_DTPREL_HI,
-    MEK_DTPREL_LO,
-    MEK_GOT,
-    MEK_GOTTPREL,
-    MEK_GOT_CALL,
-    MEK_GOT_DISP,
-    MEK_GOT_HI16,
-    MEK_GOT_LO16,
-    MEK_GOT_OFST,
-    MEK_GOT_PAGE,
-    MEK_GPREL,
-    MEK_HI,
-    MEK_HIGHER,
-    MEK_HIGHEST,
-    MEK_LO,
-    MEK_NEG,
-    MEK_PCREL_HI16,
-    MEK_PCREL_LO16,
-    MEK_TLSGD,
-    MEK_TLSLDM,
-    MEK_TPREL_HI,
-    MEK_TPREL_LO,
-    MEK_Special,
-  };
 
 private:
   explicit MipsMCExpr(const MCExpr *Expr, Specifier S)
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 49aea9c69162..80a854c79901 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -12,6 +12,7 @@
 
 #include "MipsTargetStreamer.h"
 #include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MipsBaseInfo.h"
 #include "MipsELFStreamer.h"
 #include "MipsInstPrinter.h"
@@ -1266,7 +1267,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   TmpInst.setOpcode(Mips::LUi);
   TmpInst.addOperand(MCOperand::createReg(GPReg));
   const MCExpr *HiSym = MipsMCExpr::create(
-      MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
+      Mips::S_HI, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
       MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(HiSym));
   getStreamer().emitInstruction(TmpInst, STI);
@@ -1277,7 +1278,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   TmpInst.addOperand(MCOperand::createReg(GPReg));
   TmpInst.addOperand(MCOperand::createReg(GPReg));
   const MCExpr *LoSym = MipsMCExpr::create(
-      MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
+      Mips::S_LO, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
       MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(LoSym));
   getStreamer().emitInstruction(TmpInst, STI);
@@ -1342,10 +1343,10 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
   }
 
   const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
-      MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+      Mips::S_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
       MCA.getContext());
   const MipsMCExpr *LoExpr = MipsMCExpr::createGpOff(
-      MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+      Mips::S_LO, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
       MCA.getContext());
 
   // lui $gp, %hi(%neg(%gp_rel(funcSym)))
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 1a3e99ec7f68..da3f7cb55b30 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -15,6 +15,7 @@
 #include "MCTargetDesc/MipsABIInfo.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MCTargetDesc/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MCTargetDesc/MipsMCNaCl.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MCTargetDesc/MipsTargetStreamer.h"
@@ -1244,7 +1245,7 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
 // and value for debug thread local expression.
 void MipsAsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const {
   if (auto *MipsExpr = dyn_cast<MipsMCExpr>(Value)) {
-    if (MipsExpr && MipsExpr->getSpecifier() == MipsMCExpr::MEK_DTPREL) {
+    if (MipsExpr && MipsExpr->getSpecifier() == Mips::S_DTPREL) {
       switch (Size) {
       case 4:
         getTargetStreamer().emitDTPRel32Value(MipsExpr->getSubExpr());
diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
index d1eef1775aa6..3c3690a7f983 100644
--- a/llvm/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -13,7 +13,7 @@
 
 #include "MipsMCInstLower.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MipsAsmPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -35,7 +35,7 @@ void MipsMCInstLower::Initialize(MCContext *C) {
 MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                               MachineOperandType MOTy,
                                               int64_t Offset) const {
-  MipsMCExpr::Specifier TargetKind = MipsMCExpr::MEK_None;
+  MipsMCExpr::Specifier TargetKind = Mips::S_None;
   bool IsGpOff = false;
   const MCSymbol *Symbol;
   SmallString<128> Name;
@@ -53,75 +53,75 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   case MipsII::MO_NO_FLAG:
     break;
   case MipsII::MO_GPREL:
-    TargetKind = MipsMCExpr::MEK_GPREL;
+    TargetKind = Mips::S_GPREL;
     break;
   case MipsII::MO_GOT_CALL:
-    TargetKind = MipsMCExpr::MEK_GOT_CALL;
+    TargetKind = Mips::S_GOT_CALL;
     break;
   case MipsII::MO_GOT:
-    TargetKind = MipsMCExpr::MEK_GOT;
+    TargetKind = Mips::S_GOT;
     break;
   case MipsII::MO_ABS_HI:
-    TargetKind = MipsMCExpr::MEK_HI;
+    TargetKind = Mips::S_HI;
     break;
   case MipsII::MO_ABS_LO:
-    TargetKind = MipsMCExpr::MEK_LO;
+    TargetKind = Mips::S_LO;
     break;
   case MipsII::MO_TLSGD:
-    TargetKind = MipsMCExpr::MEK_TLSGD;
+    TargetKind = Mips::S_TLSGD;
     break;
   case MipsII::MO_TLSLDM:
-    TargetKind = MipsMCExpr::MEK_TLSLDM;
+    TargetKind = Mips::S_TLSLDM;
     break;
   case MipsII::MO_DTPREL_HI:
-    TargetKind = MipsMCExpr::MEK_DTPREL_HI;
+    TargetKind = Mips::S_DTPREL_HI;
     break;
   case MipsII::MO_DTPREL_LO:
-    TargetKind = MipsMCExpr::MEK_DTPREL_LO;
+    TargetKind = Mips::S_DTPREL_LO;
     break;
   case MipsII::MO_GOTTPREL:
-    TargetKind = MipsMCExpr::MEK_GOTTPREL;
+    TargetKind = Mips::S_GOTTPREL;
     break;
   case MipsII::MO_TPREL_HI:
-    TargetKind = MipsMCExpr::MEK_TPREL_HI;
+    TargetKind = Mips::S_TPREL_HI;
     break;
   case MipsII::MO_TPREL_LO:
-    TargetKind = MipsMCExpr::MEK_TPREL_LO;
+    TargetKind = Mips::S_TPREL_LO;
     break;
   case MipsII::MO_GPOFF_HI:
-    TargetKind = MipsMCExpr::MEK_HI;
+    TargetKind = Mips::S_HI;
     IsGpOff = true;
     break;
   case MipsII::MO_GPOFF_LO:
-    TargetKind = MipsMCExpr::MEK_LO;
+    TargetKind = Mips::S_LO;
     IsGpOff = true;
     break;
   case MipsII::MO_GOT_DISP:
-    TargetKind = MipsMCExpr::MEK_GOT_DISP;
+    TargetKind = Mips::S_GOT_DISP;
     break;
   case MipsII::MO_GOT_HI16:
-    TargetKind = MipsMCExpr::MEK_GOT_HI16;
+    TargetKind = Mips::S_GOT_HI16;
     break;
   case MipsII::MO_GOT_LO16:
-    TargetKind = MipsMCExpr::MEK_GOT_LO16;
+    TargetKind = Mips::S_GOT_LO16;
     break;
   case MipsII::MO_GOT_PAGE:
-    TargetKind = MipsMCExpr::MEK_GOT_PAGE;
+    TargetKind = Mips::S_GOT_PAGE;
     break;
   case MipsII::MO_GOT_OFST:
-    TargetKind = MipsMCExpr::MEK_GOT_OFST;
+    TargetKind = Mips::S_GOT_OFST;
     break;
   case MipsII::MO_HIGHER:
-    TargetKind = MipsMCExpr::MEK_HIGHER;
+    TargetKind = Mips::S_HIGHER;
     break;
   case MipsII::MO_HIGHEST:
-    TargetKind = MipsMCExpr::MEK_HIGHEST;
+    TargetKind = Mips::S_HIGHEST;
     break;
   case MipsII::MO_CALL_HI16:
-    TargetKind = MipsMCExpr::MEK_CALL_HI16;
+    TargetKind = Mips::S_CALL_HI16;
     break;
   case MipsII::MO_CALL_LO16:
-    TargetKind = MipsMCExpr::MEK_CALL_LO16;
+    TargetKind = Mips::S_CALL_LO16;
     break;
   case MipsII::MO_JALR:
     return MCOperand();
@@ -176,7 +176,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
 
   if (IsGpOff)
     Expr = MipsMCExpr::createGpOff(TargetKind, Expr, *Ctx);
-  else if (TargetKind != MipsMCExpr::MEK_None)
+  else if (TargetKind != Mips::S_None)
     Expr = MipsMCExpr::create(TargetKind, Expr, *Ctx);
 
   return MCOperand::createExpr(Expr);
@@ -230,16 +230,16 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
   unsigned TargetFlags = MI->getOperand(1).getTargetFlags();
   switch (TargetFlags) {
   case MipsII::MO_HIGHEST:
-    Spec = MipsMCExpr::MEK_HIGHEST;
+    Spec = Mips::S_HIGHEST;
     break;
   case MipsII::MO_HIGHER:
-    Spec = MipsMCExpr::MEK_HIGHER;
+    Spec = Mips::S_HIGHER;
     break;
   case MipsII::MO_ABS_HI:
-    Spec = MipsMCExpr::MEK_HI;
+    Spec = Mips::S_HI;
     break;
   case MipsII::MO_ABS_LO:
-    Spec = MipsMCExpr::MEK_LO;
+    Spec = Mips::S_LO;
     break;
   default:
     report_fatal_error("Unexpected flags for lowerLongBranchLUi");
@@ -265,16 +265,16 @@ void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
   unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
   switch (TargetFlags) {
   case MipsII::MO_HIGHEST:
-    Spec = MipsMCExpr::MEK_HIGHEST;
+    Spec = Mips::S_HIGHEST;
     break;
   case MipsII::MO_HIGHER:
-    Spec = MipsMCExpr::MEK_HIGHER;
+    Spec = Mips::S_HIGHER;
     break;
   case MipsII::MO_ABS_HI:
-    Spec = MipsMCExpr::MEK_HI;
+    Spec = Mips::S_HI;
     break;
   case MipsII::MO_ABS_LO:
-    Spec = MipsMCExpr::MEK_LO;
+    Spec = Mips::S_LO;
     break;
   default:
     report_fatal_error("Unexpected flags for lowerLongBranchADDiu");
diff --git a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
index e74b3ad5ebca..23aa699318a2 100644
--- a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MipsTargetObjectFile.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MipsSubtarget.h"
 #include "MipsTargetMachine.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -189,5 +189,5 @@ MipsTargetObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
   const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
   Expr = MCBinaryExpr::createAdd(
       Expr, MCConstantExpr::create(0x8000, getContext()), getContext());
-  return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL, Expr, getContext());
+  return MipsMCExpr::create(Mips::S_DTPREL, Expr, getContext());
 }

From ad94f77a6a0c421e1f5815d1b8e5aa86d8dd2e92 Mon Sep 17 00:00:00 2001
From: Tom Vijlbrief <tvijlbrief@gmail.com>
Date: Mon, 16 Jun 2025 03:25:40 +0200
Subject: [PATCH 0442/1322] [AVR] Add many new AVR MCU model definitions
 (#144229)

1. Added the missing XMEGA2 definition. The avr64 devices use xmega2 which has SPM(X) defined.

2. The avr16/avr32 devices do have SPM and SPMX features, but the current xmega3 definition has not.
   Xmega3 is also used for modern attiny series which do not have SPM(X), so that is correct.
   Leave the avr16/avr32 devices unchanged (using xmega3 to be in sync with gcc definitions).

Fixes https://github.com/llvm/llvm-project/issues/116116
---
 clang/lib/Basic/Targets/AVR.cpp               | 69 +++++++++++++++
 clang/lib/Driver/ToolChains/AVR.cpp           | 70 ++++++++++++++++
 clang/test/Misc/target-invalid-cpu-note/avr.c | 65 ++++++++++++++
 llvm/lib/Target/AVR/AVRDevices.td             | 84 +++++++++++++++++++
 4 files changed, 288 insertions(+)

diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
index 85ca4bc30c46..bbe7b01ca036 100644
--- a/clang/lib/Basic/Targets/AVR.cpp
+++ b/clang/lib/Basic/Targets/AVR.cpp
@@ -336,6 +336,9 @@ static MCUInfo AVRMcus[] = {
     {"attiny1624", "__AVR_ATtiny1624__", "103", 1},
     {"attiny1626", "__AVR_ATtiny1626__", "103", 1},
     {"attiny1627", "__AVR_ATtiny1627__", "103", 1},
+    {"attiny3224", "__AVR_ATtiny3224__", "103", 1},
+    {"attiny3226", "__AVR_ATtiny3226__", "103", 1},
+    {"attiny3227", "__AVR_ATtiny3227__", "103", 1},
     {"atmega808", "__AVR_ATmega808__", "103", 1},
     {"atmega809", "__AVR_ATmega809__", "103", 1},
     {"atmega1608", "__AVR_ATmega1608__", "103", 1},
@@ -344,6 +347,72 @@ static MCUInfo AVRMcus[] = {
     {"atmega3209", "__AVR_ATmega3209__", "103", 1},
     {"atmega4808", "__AVR_ATmega4808__", "103", 1},
     {"atmega4809", "__AVR_ATmega4809__", "103", 1},
+
+    // gcc 14 additions:
+
+    {"avr64da28", "__AVR_AVR64DA28__", "102", 1},
+    {"avr64da32", "__AVR_AVR64DA32__", "102", 1},
+    {"avr64da48", "__AVR_AVR64DA48__", "102", 1},
+    {"avr64da64", "__AVR_AVR64DA64__", "102", 1},
+    {"avr64db28", "__AVR_AVR64DB28__", "102", 1},
+    {"avr64db32", "__AVR_AVR64DB32__", "102", 1},
+    {"avr64db48", "__AVR_AVR64DB48__", "102", 1},
+    {"avr64db64", "__AVR_AVR64DB64__", "102", 1},
+    {"avr64dd14", "__AVR_AVR64DD14__", "102", 1},
+    {"avr64dd20", "__AVR_AVR64DD20__", "102", 1},
+    {"avr64dd28", "__AVR_AVR64DD28__", "102", 1},
+    {"avr64dd32", "__AVR_AVR64DD32__", "102", 1},
+    {"avr64du28", "__AVR_AVR64DU28__", "102", 1},
+    {"avr64du32", "__AVR_AVR64DU32__", "102", 1},
+    {"avr64ea28", "__AVR_AVR64EA28__", "102", 1},
+    {"avr64ea32", "__AVR_AVR64EA32__", "102", 1},
+    {"avr64ea48", "__AVR_AVR64EA48__", "102", 1},
+    {"avr64sd28", "__AVR_AVR64SD28__", "102", 1},
+    {"avr64sd32", "__AVR_AVR64SD32__", "102", 1},
+    {"avr64sd48", "__AVR_AVR64SD48__", "102", 1},
+
+    {"avr16dd20", "__AVR_AVR16DD20__", "103", 1},
+    {"avr16dd28", "__AVR_AVR16DD28__", "103", 1},
+    {"avr16dd32", "__AVR_AVR16DD32__", "103", 1},
+    {"avr16du14", "__AVR_AVR16DU14__", "103", 1},
+    {"avr16du20", "__AVR_AVR16DU20__", "103", 1},
+    {"avr16du28", "__AVR_AVR16DU28__", "103", 1},
+    {"avr16du32", "__AVR_AVR16DU32__", "103", 1},
+    {"avr32da28", "__AVR_AVR32DA28__", "103", 1},
+    {"avr32da32", "__AVR_AVR32DA32__", "103", 1},
+    {"avr32da48", "__AVR_AVR32DA48__", "103", 1},
+    {"avr32db28", "__AVR_AVR32DB28__", "103", 1},
+    {"avr32db32", "__AVR_AVR32DB32__", "103", 1},
+    {"avr32db48", "__AVR_AVR32DB48__", "103", 1},
+    {"avr32dd14", "__AVR_AVR32DD14__", "103", 1},
+    {"avr32dd20", "__AVR_AVR32DD20__", "103", 1},
+    {"avr32dd28", "__AVR_AVR32DD28__", "103", 1},
+    {"avr32dd32", "__AVR_AVR32DD32__", "103", 1},
+    {"avr32du14", "__AVR_AVR32DU14__", "103", 1},
+    {"avr32du20", "__AVR_AVR32DU20__", "103", 1},
+    {"avr32du28", "__AVR_AVR32DU28__", "103", 1},
+    {"avr32du32", "__AVR_AVR32DU32__", "103", 1},
+    {"avr16eb14", "__AVR_AVR16EB14__", "103", 1},
+    {"avr16eb20", "__AVR_AVR16EB20__", "103", 1},
+    {"avr16eb28", "__AVR_AVR16EB28__", "103", 1},
+    {"avr16eb32", "__AVR_AVR16EB32__", "103", 1},
+    {"avr16ea28", "__AVR_AVR16EA28__", "103", 1},
+    {"avr16ea32", "__AVR_AVR16EA32__", "103", 1},
+    {"avr16ea48", "__AVR_AVR16EA48__", "103", 1},
+    {"avr32ea28", "__AVR_AVR32EA28__", "103", 1},
+    {"avr32ea32", "__AVR_AVR32EA32__", "103", 1},
+    {"avr32ea48", "__AVR_AVR32EA48__", "103", 1},
+    {"avr32sd20", "__AVR_AVR32SD20__", "103", 1},
+    {"avr32sd28", "__AVR_AVR32SD28__", "103", 1},
+    {"avr32sd32", "__AVR_AVR32SD32__", "103", 1},
+    {"avr128da28", "__AVR_AVR128DA28__", "104", 2},
+    {"avr128da32", "__AVR_AVR128DA32__", "104", 2},
+    {"avr128da48", "__AVR_AVR128DA48__", "104", 2},
+    {"avr128da64", "__AVR_AVR128DA64__", "104", 2},
+    {"avr128db28", "__AVR_AVR128DB28__", "104", 2},
+    {"avr128db32", "__AVR_AVR128DB32__", "104", 2},
+    {"avr128db48", "__AVR_AVR128DB48__", "104", 2},
+    {"avr128db64", "__AVR_AVR128DB64__", "104", 2},
 };
 
 } // namespace targets
diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
index b0523a7f4e40..731076d9754a 100644
--- a/clang/lib/Driver/ToolChains/AVR.cpp
+++ b/clang/lib/Driver/ToolChains/AVR.cpp
@@ -326,8 +326,78 @@ constexpr struct {
     {"attiny1624", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1626", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1627", "avrxmega3", "avrxmega3", 0x803800},
+    {"attiny3224", "avrxmega3", "avrxmega3", 0x803400},
+    {"attiny3226", "avrxmega3", "avrxmega3", 0x803400},
+    {"attiny3227", "avrxmega3", "avrxmega3", 0x803400},
     {"attiny3216", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny3217", "avrxmega3", "avrxmega3", 0x803800},
+
+    // gcc 14 additions:
+
+    {"avr64da28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da48", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da64", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db48", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db64", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd14", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd20", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64du28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64du32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64ea28", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64ea32", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64ea48", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64sd28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64sd32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64sd48", "avrxmega2", "avrxmega2", 0x806000},
+
+    {"avr16dd20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16dd28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16dd32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du14", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr32da28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32da32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32da48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd14", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du14", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr16eb14", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea48", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr32ea28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32ea32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32ea48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr128da28", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da32", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da48", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da64", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db28", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db32", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db48", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db64", "avrxmega4", "avrxmega4", 0x804000},
+
 };
 
 std::string GetMCUSubPath(StringRef MCUName) {
diff --git a/clang/test/Misc/target-invalid-cpu-note/avr.c b/clang/test/Misc/target-invalid-cpu-note/avr.c
index 86ffbb683858..49d68bcc2edf 100644
--- a/clang/test/Misc/target-invalid-cpu-note/avr.c
+++ b/clang/test/Misc/target-invalid-cpu-note/avr.c
@@ -311,6 +311,9 @@
 // CHECK-SAME: {{^}}, attiny1624
 // CHECK-SAME: {{^}}, attiny1626
 // CHECK-SAME: {{^}}, attiny1627
+// CHECK-SAME: {{^}}, attiny3224
+// CHECK-SAME: {{^}}, attiny3226
+// CHECK-SAME: {{^}}, attiny3227
 // CHECK-SAME: {{^}}, atmega808
 // CHECK-SAME: {{^}}, atmega809
 // CHECK-SAME: {{^}}, atmega1608
@@ -319,4 +322,66 @@
 // CHECK-SAME: {{^}}, atmega3209
 // CHECK-SAME: {{^}}, atmega4808
 // CHECK-SAME: {{^}}, atmega4809
+// CHECK-SAME: {{^}}, avr64da28
+// CHECK-SAME: {{^}}, avr64da32
+// CHECK-SAME: {{^}}, avr64da48
+// CHECK-SAME: {{^}}, avr64da64
+// CHECK-SAME: {{^}}, avr64db28
+// CHECK-SAME: {{^}}, avr64db32
+// CHECK-SAME: {{^}}, avr64db48
+// CHECK-SAME: {{^}}, avr64db64
+// CHECK-SAME: {{^}}, avr64dd14
+// CHECK-SAME: {{^}}, avr64dd20
+// CHECK-SAME: {{^}}, avr64dd28
+// CHECK-SAME: {{^}}, avr64dd32
+// CHECK-SAME: {{^}}, avr64du28
+// CHECK-SAME: {{^}}, avr64du32
+// CHECK-SAME: {{^}}, avr64ea28
+// CHECK-SAME: {{^}}, avr64ea32
+// CHECK-SAME: {{^}}, avr64ea48
+// CHECK-SAME: {{^}}, avr64sd28
+// CHECK-SAME: {{^}}, avr64sd32
+// CHECK-SAME: {{^}}, avr64sd48
+// CHECK-SAME: {{^}}, avr16dd20
+// CHECK-SAME: {{^}}, avr16dd28
+// CHECK-SAME: {{^}}, avr16dd32
+// CHECK-SAME: {{^}}, avr16du14
+// CHECK-SAME: {{^}}, avr16du20
+// CHECK-SAME: {{^}}, avr16du28
+// CHECK-SAME: {{^}}, avr16du32
+// CHECK-SAME: {{^}}, avr32da28
+// CHECK-SAME: {{^}}, avr32da32
+// CHECK-SAME: {{^}}, avr32da48
+// CHECK-SAME: {{^}}, avr32db28
+// CHECK-SAME: {{^}}, avr32db32
+// CHECK-SAME: {{^}}, avr32db48
+// CHECK-SAME: {{^}}, avr32dd14
+// CHECK-SAME: {{^}}, avr32dd20
+// CHECK-SAME: {{^}}, avr32dd28
+// CHECK-SAME: {{^}}, avr32dd32
+// CHECK-SAME: {{^}}, avr32du14
+// CHECK-SAME: {{^}}, avr32du20
+// CHECK-SAME: {{^}}, avr32du28
+// CHECK-SAME: {{^}}, avr32du32
+// CHECK-SAME: {{^}}, avr16eb14
+// CHECK-SAME: {{^}}, avr16eb20
+// CHECK-SAME: {{^}}, avr16eb28
+// CHECK-SAME: {{^}}, avr16eb32
+// CHECK-SAME: {{^}}, avr16ea28
+// CHECK-SAME: {{^}}, avr16ea32
+// CHECK-SAME: {{^}}, avr16ea48
+// CHECK-SAME: {{^}}, avr32ea28
+// CHECK-SAME: {{^}}, avr32ea32
+// CHECK-SAME: {{^}}, avr32ea48
+// CHECK-SAME: {{^}}, avr32sd20
+// CHECK-SAME: {{^}}, avr32sd28
+// CHECK-SAME: {{^}}, avr32sd32
+// CHECK-SAME: {{^}}, avr128da28
+// CHECK-SAME: {{^}}, avr128da32
+// CHECK-SAME: {{^}}, avr128da48
+// CHECK-SAME: {{^}}, avr128da64
+// CHECK-SAME: {{^}}, avr128db28
+// CHECK-SAME: {{^}}, avr128db32
+// CHECK-SAME: {{^}}, avr128db48
+// CHECK-SAME: {{^}}, avr128db64
 // CHECK-SAME: {{$}}
diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td
index 56147bb473bc..ad760d740357 100644
--- a/llvm/lib/Target/AVR/AVRDevices.td
+++ b/llvm/lib/Target/AVR/AVRDevices.td
@@ -209,12 +209,27 @@ def FamilyTiny
              [FamilyAVR0, FeatureBREAK, FeatureSRAM, FeatureTinyEncoding,
               FeatureSmallStack]>;
 
+def FamilyXMEGA2 : Family<"xmega2",
+                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
+                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
+                           FeatureMultiplication, FeatureMOVW, FeatureLPMX,
+                           FeatureSPM, FeatureSPMX,
+                           FeatureBREAK, FeatureLowByteFirst]>;
+
 def FamilyXMEGA3 : Family<"xmega3",
                           [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
                            FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
                            FeatureMultiplication, FeatureMOVW, FeatureLPMX,
                            FeatureBREAK, FeatureLowByteFirst]>;
 
+def FamilyXMEGA4 : Family<"xmega4",
+                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
+                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
+                           FeatureMultiplication, FeatureMOVW, FeatureLPMX,
+                           FeatureELPM, FeatureELPMX,
+                           FeatureSPM, FeatureSPMX,
+                           FeatureBREAK, FeatureLowByteFirst]>;
+
 def FamilyXMEGA : Family<"xmega",
                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
@@ -567,6 +582,9 @@ def : Device<"attiny3217", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1624", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1626", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1627", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3224", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3226", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3227", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega809", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega1608", FamilyXMEGA3, ELFArchXMEGA3>;
@@ -575,3 +593,69 @@ def : Device<"atmega3208", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega3209", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4809", FamilyXMEGA3, ELFArchXMEGA3>;
+
+// Additions from gcc 14:
+
+def : Device<"avr64da28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da64", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db64", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd14", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd20", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64du28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64du32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd48", FamilyXMEGA2, ELFArchXMEGA2>;
+
+def : Device<"avr16dd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16dd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16dd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr128da28", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da32", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da48", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da64", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db28", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db32", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db48", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db64", FamilyXMEGA4, ELFArchXMEGA4>;

From 1506ba95d7c3dca1abff0190550945f6cc263a99 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Sun, 15 Jun 2025 18:28:06 -0700
Subject: [PATCH 0443/1322] [clang-format][NFC] Clean up DisallowLineBreaks
 lambda (#144255)

See also
https://github.com/llvm/llvm-project/pull/141576/files#r2141808121
---
 clang/lib/Format/ContinuationIndenter.cpp | 78 ++++++++++++-----------
 1 file changed, 42 insertions(+), 36 deletions(-)

diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 424b6dbc0da7..b4745477b96e 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -329,9 +329,9 @@ bool ContinuationIndenter::canBreak(const LineState &State) {
   // statement and we are aligning lambda blocks to their signatures.
   if (Previous.is(tok::l_brace) && State.Stack.size() > 1 &&
       State.Stack[State.Stack.size() - 2].NestedBlockInlined &&
-      State.Stack[State.Stack.size() - 2].HasMultipleNestedBlocks &&
-      Style.LambdaBodyIndentation == FormatStyle::LBI_Signature) {
-    return false;
+      State.Stack[State.Stack.size() - 2].HasMultipleNestedBlocks) {
+    return Style.isCpp() &&
+           Style.LambdaBodyIndentation == FormatStyle::LBI_OuterScope;
   }
 
   // Don't break after very short return types (e.g. "void") as that is often
@@ -706,42 +706,48 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
   const FormatToken &Previous = *State.NextToken->Previous;
   auto &CurrentState = State.Stack.back();
 
-  bool DisallowLineBreaksOnThisLine =
-      Style.LambdaBodyIndentation == FormatStyle::LBI_Signature &&
-      // Deal with lambda arguments in C++. The aim here is to ensure that we
-      // don't over-indent lambda function bodies when lambdas are passed as
-      // arguments to function calls. We do this by ensuring that either all
-      // arguments (including any lambdas) go on the same line as the function
-      // call, or we break before the first argument.
-      Style.isCpp() && [&] {
-        // For example, `/*Newline=*/false`.
-        if (Previous.is(TT_BlockComment) && Current.SpacesRequiredBefore == 0)
-          return false;
-        const auto *PrevNonComment = Current.getPreviousNonComment();
-        if (!PrevNonComment || PrevNonComment->isNot(tok::l_paren))
-          return false;
-        if (Current.isOneOf(tok::comment, tok::l_paren, TT_LambdaLSquare))
-          return false;
-        auto BlockParameterCount = PrevNonComment->BlockParameterCount;
-        if (BlockParameterCount == 0)
-          return false;
+  // Deal with lambda arguments in C++. The aim here is to ensure that we don't
+  // over-indent lambda function bodies when lambdas are passed as arguments to
+  // function calls. We do this by ensuring that either all arguments (including
+  // any lambdas) go on the same line as the function call, or we break before
+  // the first argument.
+  auto DisallowLineBreaks = [&] {
+    if (!Style.isCpp() ||
+        Style.LambdaBodyIndentation == FormatStyle::LBI_OuterScope) {
+      return false;
+    }
 
-        // Multiple lambdas in the same function call.
-        if (BlockParameterCount > 1)
-          return true;
+    // For example, `/*Newline=*/false`.
+    if (Previous.is(TT_BlockComment) && Current.SpacesRequiredBefore == 0)
+      return false;
 
-        // A lambda followed by another arg.
-        if (!PrevNonComment->Role)
-          return false;
-        auto Comma = PrevNonComment->Role->lastComma();
-        if (!Comma)
-          return false;
-        auto Next = Comma->getNextNonComment();
-        return Next &&
-               !Next->isOneOf(TT_LambdaLSquare, tok::l_brace, tok::caret);
-      }();
+    if (Current.isOneOf(tok::comment, tok::l_paren, TT_LambdaLSquare))
+      return false;
 
-  if (DisallowLineBreaksOnThisLine)
+    const auto *Prev = Current.getPreviousNonComment();
+    if (!Prev || Prev->isNot(tok::l_paren))
+      return false;
+
+    if (Prev->BlockParameterCount == 0)
+      return false;
+
+    // Multiple lambdas in the same function call.
+    if (Prev->BlockParameterCount > 1)
+      return true;
+
+    // A lambda followed by another arg.
+    if (!Prev->Role)
+      return false;
+
+    const auto *Comma = Prev->Role->lastComma();
+    if (!Comma)
+      return false;
+
+    const auto *Next = Comma->getNextNonComment();
+    return Next && !Next->isOneOf(TT_LambdaLSquare, tok::l_brace, tok::caret);
+  };
+
+  if (DisallowLineBreaks())
     State.NoLineBreak = true;
 
   if (Current.is(tok::equal) &&

From f23b841f0fa7576b90fe226e66192b861a8cf1cf Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 18:35:07 -0700
Subject: [PATCH 0444/1322] MIPS: Move MipsMCExpr functions to MipsMCAsmInfo

---
 .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp       | 52 +++++++++++++------
 .../Target/Mips/MCTargetDesc/MipsMCAsmInfo.h  | 10 ++++
 .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp   |  4 +-
 .../Target/Mips/MCTargetDesc/MipsMCExpr.cpp   |  7 +++
 .../lib/Target/Mips/MCTargetDesc/MipsMCExpr.h |  8 ---
 5 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 97c173618167..9b2b25c60c94 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -59,10 +59,11 @@ MipsCOFFMCAsmInfo::MipsCOFFMCAsmInfo() {
   AllowAtInName = true;
 }
 
-void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+static void printImpl(const MCAsmInfo &MAI, raw_ostream &OS,
+                      const MCSpecifierExpr &Expr) {
   int64_t AbsVal;
 
-  switch (specifier) {
+  switch (Expr.getSpecifier()) {
   case Mips::S_None:
   case Mips::S_Special:
     llvm_unreachable("Mips::S_None and MEK_Special are invalid");
@@ -70,7 +71,7 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   case Mips::S_DTPREL:
     // Mips::S_DTPREL is used for marking TLS DIEExpr only
     // and contains a regular sub-expression.
-    MAI->printExpr(OS, *getSubExpr());
+    MAI.printExpr(OS, *Expr.getSubExpr());
     return;
   case Mips::S_CALL_HI16:
     OS << "%call_hi";
@@ -147,20 +148,20 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   }
 
   OS << '(';
-  if (Expr->evaluateAsAbsolute(AbsVal))
+  if (Expr.evaluateAsAbsolute(AbsVal))
     OS << AbsVal;
   else
-    Expr->print(OS, MAI);
+    MAI.printExpr(OS, *Expr.getSubExpr());
   OS << ')';
 }
 
-bool MipsMCExpr::isGpOff(Specifier &S) const {
-  if (getSpecifier() == Mips::S_HI || getSpecifier() == Mips::S_LO) {
-    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(getSubExpr())) {
+bool Mips::isGpOff(const MCSpecifierExpr &E) {
+  if (E.getSpecifier() == Mips::S_HI || E.getSpecifier() == Mips::S_LO) {
+    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(E.getSubExpr())) {
       if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
         if (S1->getSpecifier() == Mips::S_NEG &&
             S2->getSpecifier() == Mips::S_GPREL) {
-          S = getSpecifier();
+          // S = E.getSpecifier();
           return true;
         }
       }
@@ -169,13 +170,13 @@ bool MipsMCExpr::isGpOff(Specifier &S) const {
   return false;
 }
 
-bool MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                           const MCAssembler *Asm) const {
+static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
+                     const MCAssembler *Asm) {
   // Look for the %hi(%neg(%gp_rel(X))) and %lo(%neg(%gp_rel(X)))
   // special cases.
-  if (isGpOff()) {
+  if (Mips::isGpOff(Expr)) {
     const MCExpr *SubExpr =
-        cast<MipsMCExpr>(cast<MipsMCExpr>(getSubExpr())->getSubExpr())
+        cast<MipsMCExpr>(cast<MipsMCExpr>(Expr.getSubExpr())->getSubExpr())
             ->getSubExpr();
     if (!SubExpr->evaluateAsRelocatable(Res, Asm))
       return false;
@@ -184,8 +185,29 @@ bool MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
     return true;
   }
 
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
     return false;
-  Res.setSpecifier(specifier);
+  Res.setSpecifier(Expr.getSpecifier());
   return !Res.getSubSym();
 }
+
+void MipsELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                          const MCSpecifierExpr &Expr) const {
+  printImpl(*this, OS, Expr);
+}
+
+bool MipsELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                                 MCValue &Res,
+                                                 const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
+void MipsCOFFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                           const MCSpecifierExpr &Expr) const {
+  printImpl(*this, OS, Expr);
+}
+
+bool MipsCOFFMCAsmInfo::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index d8b96f8b568c..39699fdb9827 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -27,6 +27,10 @@ class MipsELFMCAsmInfo : public MCAsmInfoELF {
 public:
   explicit MipsELFMCAsmInfo(const Triple &TheTriple,
                             const MCTargetOptions &Options);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 class MipsCOFFMCAsmInfo : public MCAsmInfoGNUCOFF {
@@ -34,6 +38,10 @@ class MipsCOFFMCAsmInfo : public MCAsmInfoGNUCOFF {
 
 public:
   explicit MipsCOFFMCAsmInfo();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 namespace Mips {
@@ -66,6 +74,8 @@ enum {
   S_TPREL_LO,
   S_Special,
 };
+
+bool isGpOff(const MCSpecifierExpr &E);
 }
 
 } // namespace llvm
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 4035618e0252..d2981c4ad4d2 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -642,7 +642,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
       break;
     case Mips::S_LO:
       // Check for %lo(%neg(%gp_rel(X)))
-      if (MipsExpr->isGpOff())
+      if (Mips::isGpOff(*MipsExpr))
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_LO
                                      : Mips::fixup_Mips_GPOFF_LO;
       else
@@ -659,7 +659,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
       break;
     case Mips::S_HI:
       // Check for %hi(%neg(%gp_rel(X)))
-      if (MipsExpr->isGpOff())
+      if (Mips::isGpOff(*MipsExpr))
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_HI
                                      : Mips::fixup_Mips_GPOFF_HI;
       else
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 821f662f0cbf..280d944f2fbb 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -37,3 +37,10 @@ const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S,
   return create(S, create(Mips::S_NEG, create(Mips::S_GPREL, Expr, Ctx), Ctx),
                 Ctx);
 }
+
+void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  if (MAI)
+    MAI->printExpr(OS, *this);
+  else // llc -asm-show-inst
+    MipsELFMCAsmInfo(Triple(), MCTargetOptions()).printExpr(OS, *this);
+}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 216077a1aa48..91ec09482185 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -31,14 +31,6 @@ public:
                                        MCContext &Ctx);
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-
-  bool isGpOff(Specifier &S) const;
-  bool isGpOff() const {
-    Specifier S;
-    return isGpOff(S);
-  }
 };
 
 } // end namespace llvm

From ba7369c49c6f638a4ce6f6be3acbdab5e0b5f418 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 16 Jun 2025 10:46:05 +0900
Subject: [PATCH 0445/1322] WebAssembly: Move runtime libcall setting out of
 TargetLowering (#142624)

RuntimeLibcallInfo needs to be correct outside of codegen contexts.
---
 .../wasm/lto/Inputs/libcall-return-addr.ll     |  6 ------
 lld/test/wasm/lto/libcall-return-addr.ll       | 18 ------------------
 llvm/lib/IR/RuntimeLibcalls.cpp                |  5 +++++
 .../WebAssembly/WebAssemblyISelLowering.cpp    |  5 -----
 4 files changed, 5 insertions(+), 29 deletions(-)
 delete mode 100644 lld/test/wasm/lto/Inputs/libcall-return-addr.ll
 delete mode 100644 lld/test/wasm/lto/libcall-return-addr.ll

diff --git a/lld/test/wasm/lto/Inputs/libcall-return-addr.ll b/lld/test/wasm/lto/Inputs/libcall-return-addr.ll
deleted file mode 100644
index 271bdae11e49..000000000000
--- a/lld/test/wasm/lto/Inputs/libcall-return-addr.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20"
-target triple = "wasm32-unknown-emscripten"
-
-define ptr @emscripten_return_address() {
-  ret ptr null
-}
diff --git a/lld/test/wasm/lto/libcall-return-addr.ll b/lld/test/wasm/lto/libcall-return-addr.ll
deleted file mode 100644
index 74eba74f9701..000000000000
--- a/lld/test/wasm/lto/libcall-return-addr.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llvm-as %s -o %t.o
-; RUN: llvm-as %p/Inputs/libcall-return-addr.ll -o %t.return-addr.o
-; RUN: rm -f %t.a
-; RUN: llvm-ar rcs %t.a %t.return-addr.o
-; RUN: not wasm-ld --export-all %t.o %t.a -o %t.wasm 2>&1 | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20"
-target triple = "wasm32-unknown-emscripten"
-
-@g_ptr = global ptr null
-
-define void @_start() {
-  %addr = call ptr @llvm.returnaddress(i32 1)
-  store ptr %addr, ptr @g_ptr
-  ret void
-}
-
-; CHECK: wasm-ld: error: {{.*}}return-addr.o): attempt to add bitcode file after LTO (emscripten_return_address)
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index d84c56f0af5c..d655f84b37c5 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -531,6 +531,11 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
       setLibcallName(RTLIB::MULO_I64, nullptr);
     }
     setLibcallName(RTLIB::MULO_I128, nullptr);
+  } else {
+    // Define the emscripten name for return address helper.
+    // TODO: when implementing other Wasm backends, make this generic or only do
+    // this on emscripten depending on what they end up doing.
+    setLibcallName(RTLIB::RETURN_ADDRESS, "emscripten_return_address");
   }
 
   if (TT.isSystemZ() && TT.isOSzOS()) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index aac347331119..3cd923c0ba05 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -385,11 +385,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
 
   setMaxAtomicSizeInBitsSupported(64);
 
-  // Define the emscripten name for return address helper.
-  // TODO: when implementing other Wasm backends, make this generic or only do
-  // this on emscripten depending on what they end up doing.
-  setLibcallName(RTLIB::RETURN_ADDRESS, "emscripten_return_address");
-
   // Always convert switches to br_tables unless there is only one case, which
   // is equivalent to a simple branch. This reduces code size for wasm, and we
   // defer possible jump table optimizations to the VM.

From 993c158a30b9ddc881e55efcd33e33abc10f3a5c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 18:46:50 -0700
Subject: [PATCH 0446/1322] MIPS: Reduce MipsMCExpr uses

---
 .../Target/Mips/AsmParser/MipsAsmParser.cpp   | 35 ++++++++-----------
 .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp       |  7 ++--
 .../Target/Mips/MCTargetDesc/MipsMCAsmInfo.h  |  1 +
 llvm/lib/Target/Mips/MipsMCInstLower.cpp      | 12 +++----
 llvm/lib/Target/Mips/MipsMCInstLower.h        |  4 +--
 5 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 8d9c3a96b32a..7ea7c58f1a51 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -3032,7 +3032,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       return false;
     }
 
-    const MipsMCExpr *GotExpr = nullptr;
+    const MCSpecifierExpr *GotExpr = nullptr;
     const MCExpr *LoExpr = nullptr;
     if (ABI.IsN32() || ABI.IsN64()) {
       // The remaining cases are:
@@ -3097,10 +3097,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     return false;
   }
 
-  const MipsMCExpr *HiExpr =
-      MipsMCExpr::create(Mips::S_HI, SymExpr, getContext());
-  const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
+  const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, SymExpr, getContext());
+  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
 
   // This is the 64-bit symbol address expansion.
   if (ABI.ArePtrs64bit() && isGP64bit()) {
@@ -3111,9 +3109,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     // If it is not available we exit if the destination is the same as the
     // source register.
 
-    const MipsMCExpr *HighestExpr =
+    const auto *HighestExpr =
         MipsMCExpr::create(Mips::S_HIGHEST, SymExpr, getContext());
-    const MipsMCExpr *HigherExpr =
+    const auto *HigherExpr =
         MipsMCExpr::create(Mips::S_HIGHER, SymExpr, getContext());
 
     bool RdRegIsRsReg =
@@ -3312,8 +3310,7 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
 
   if(IsPicEnabled) {
     const MCExpr *GotSym = MCSymbolRefExpr::create(Sym, getContext());
-    const MipsMCExpr *GotExpr =
-        MipsMCExpr::create(Mips::S_GOT, GotSym, getContext());
+    const auto *GotExpr = MipsMCExpr::create(Mips::S_GOT, GotSym, getContext());
 
     if(isABI_O32() || isABI_N32()) {
       TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr),
@@ -3324,8 +3321,7 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
     }
   } else { //!IsPicEnabled
     const MCExpr *HiSym = MCSymbolRefExpr::create(Sym, getContext());
-    const MipsMCExpr *HiExpr =
-        MipsMCExpr::create(Mips::S_HI, HiSym, getContext());
+    const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, HiSym, getContext());
 
     // FIXME: This is technically correct but gives a different result to gas,
     // but gas is incomplete there (it has a fixme noting it doesn't work with
@@ -3337,10 +3333,10 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
       TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
     } else { //isABI_N64()
       const MCExpr *HighestSym = MCSymbolRefExpr::create(Sym, getContext());
-      const MipsMCExpr *HighestExpr =
+      const auto *HighestExpr =
           MipsMCExpr::create(Mips::S_HIGHEST, HighestSym, getContext());
       const MCExpr *HigherSym = MCSymbolRefExpr::create(Sym, getContext());
-      const MipsMCExpr *HigherExpr =
+      const auto *HigherExpr =
           MipsMCExpr::create(Mips::S_HIGHER, HigherSym, getContext());
 
       TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
@@ -3428,8 +3424,7 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3479,8 +3474,7 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3560,8 +3554,7 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -6353,7 +6346,7 @@ MCRegister MipsAsmParser::getReg(int RC, int RegNo) {
 // e.g. "%lo foo", "(%lo(foo))", "%lo(foo)+1".
 const MCExpr *MipsAsmParser::parseRelocExpr() {
   auto getOp = [](StringRef Op) {
-    return StringSwitch<MipsMCExpr::Specifier>(Op)
+    return StringSwitch<Mips::Specifier>(Op)
         .Case("call16", Mips::S_GOT_CALL)
         .Case("call_hi", Mips::S_CALL_HI16)
         .Case("call_lo", Mips::S_CALL_LO16)
@@ -6384,7 +6377,7 @@ const MCExpr *MipsAsmParser::parseRelocExpr() {
   MCAsmParser &Parser = getParser();
   StringRef Name;
   const MCExpr *Res = nullptr;
-  SmallVector<MipsMCExpr::Specifier, 0> Ops;
+  SmallVector<Mips::Specifier, 0> Ops;
   while (parseOptionalToken(AsmToken::Percent)) {
     if (Parser.parseIdentifier(Name) ||
         Parser.parseToken(AsmToken::LParen, "expected '('"))
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 9b2b25c60c94..b64f86f38297 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -157,8 +157,8 @@ static void printImpl(const MCAsmInfo &MAI, raw_ostream &OS,
 
 bool Mips::isGpOff(const MCSpecifierExpr &E) {
   if (E.getSpecifier() == Mips::S_HI || E.getSpecifier() == Mips::S_LO) {
-    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(E.getSubExpr())) {
-      if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
+    if (const auto *S1 = dyn_cast<const MCSpecifierExpr>(E.getSubExpr())) {
+      if (const auto *S2 = dyn_cast<const MCSpecifierExpr>(S1->getSubExpr())) {
         if (S1->getSpecifier() == Mips::S_NEG &&
             S2->getSpecifier() == Mips::S_GPREL) {
           // S = E.getSpecifier();
@@ -176,7 +176,8 @@ static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
   // special cases.
   if (Mips::isGpOff(Expr)) {
     const MCExpr *SubExpr =
-        cast<MipsMCExpr>(cast<MipsMCExpr>(Expr.getSubExpr())->getSubExpr())
+        cast<MCSpecifierExpr>(
+            cast<MCSpecifierExpr>(Expr.getSubExpr())->getSubExpr())
             ->getSubExpr();
     if (!SubExpr->evaluateAsRelocatable(Res, Asm))
       return false;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 39699fdb9827..0975116328fc 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -45,6 +45,7 @@ public:
 };
 
 namespace Mips {
+using Specifier = uint16_t;
 enum {
   S_None,
   S_CALL_HI16 = FirstTargetFixupKind,
diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
index 3c3690a7f983..935fcd8fa715 100644
--- a/llvm/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -35,7 +35,7 @@ void MipsMCInstLower::Initialize(MCContext *C) {
 MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                               MachineOperandType MOTy,
                                               int64_t Offset) const {
-  MipsMCExpr::Specifier TargetKind = Mips::S_None;
+  Mips::Specifier TargetKind = Mips::S_None;
   bool IsGpOff = false;
   const MCSymbol *Symbol;
   SmallString<128> Name;
@@ -211,7 +211,7 @@ MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
 
 MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
                                      MachineBasicBlock *BB2,
-                                     MipsMCExpr::Specifier Kind) const {
+                                     Mips::Specifier Kind) const {
   const MCSymbolRefExpr *Sym1 = MCSymbolRefExpr::create(BB1->getSymbol(), *Ctx);
   const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::create(BB2->getSymbol(), *Ctx);
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Sym1, Sym2, *Ctx);
@@ -226,7 +226,7 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
   // Lower register operand.
   OutMI.addOperand(LowerOperand(MI->getOperand(0)));
 
-  MipsMCExpr::Specifier Spec;
+  Mips::Specifier Spec;
   unsigned TargetFlags = MI->getOperand(1).getTargetFlags();
   switch (TargetFlags) {
   case MipsII::MO_HIGHEST:
@@ -248,7 +248,7 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
   if (MI->getNumOperands() == 2) {
     const MCExpr *Expr =
         MCSymbolRefExpr::create(MI->getOperand(1).getMBB()->getSymbol(), *Ctx);
-    const MipsMCExpr *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
+    const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
     OutMI.addOperand(MCOperand::createExpr(MipsExpr));
   } else if (MI->getNumOperands() == 3) {
     // Create %hi($tgt-$baltgt).
@@ -261,7 +261,7 @@ void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
                                            MCInst &OutMI, int Opcode) const {
   OutMI.setOpcode(Opcode);
 
-  MipsMCExpr::Specifier Spec;
+  Mips::Specifier Spec;
   unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
   switch (TargetFlags) {
   case MipsII::MO_HIGHEST:
@@ -290,7 +290,7 @@ void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
     // Lower register operand.
     const MCExpr *Expr =
         MCSymbolRefExpr::create(MI->getOperand(2).getMBB()->getSymbol(), *Ctx);
-    const MipsMCExpr *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
+    const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
     OutMI.addOperand(MCOperand::createExpr(MipsExpr));
   } else if (MI->getNumOperands() == 4) {
     // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.h b/llvm/lib/Target/Mips/MipsMCInstLower.h
index b6ddbe98955d..a618c6fb7bfa 100644
--- a/llvm/lib/Target/Mips/MipsMCInstLower.h
+++ b/llvm/lib/Target/Mips/MipsMCInstLower.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
 #define LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
 
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/Support/Compiler.h"
 
@@ -41,7 +41,7 @@ private:
   MCOperand LowerSymbolOperand(const MachineOperand &MO,
                                MachineOperandType MOTy, int64_t Offset) const;
   MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
-                      MipsMCExpr::Specifier Kind) const;
+                      Mips::Specifier Kind) const;
   void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const;
   void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI,
                             int Opcode) const;

From cf679e66fade71220535775cca895628bf7692af Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 18:59:16 -0700
Subject: [PATCH 0447/1322] SystemZ: Rename SystemZMCExpr::VK_ to SystemZ::S_

Prepare for removing SystemZMCExpr. Adopt the newer naming convention
used by most other targets.
---
 .../SystemZ/AsmParser/SystemZAsmParser.cpp    |  6 +-
 .../MCTargetDesc/SystemZELFObjectWriter.cpp   | 38 ++++-----
 .../MCTargetDesc/SystemZInstPrinterCommon.cpp |  6 +-
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 14 ++--
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.h   | 22 +++++
 .../SystemZ/MCTargetDesc/SystemZMCExpr.cpp    |  7 +-
 .../SystemZ/MCTargetDesc/SystemZMCExpr.h      | 18 ----
 llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 82 +++++++++----------
 .../lib/Target/SystemZ/SystemZMCInstLower.cpp |  8 +-
 .../SystemZ/SystemZTargetObjectFile.cpp       |  4 +-
 10 files changed, 100 insertions(+), 105 deletions(-)

diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 83c74c8a976d..74a8822a12ac 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -1707,12 +1707,12 @@ ParseStatus SystemZAsmParser::parsePCRel(OperandVector &Operands,
     if (Parser.getTok().isNot(AsmToken::Identifier))
       return Error(Parser.getTok().getLoc(), "unexpected token");
 
-    SystemZMCExpr::Specifier Kind = SystemZMCExpr::VK_None;
+    SystemZMCExpr::Specifier Kind = SystemZ::S_None;
     StringRef Name = Parser.getTok().getString();
     if (Name == "tls_gdcall")
-      Kind = SystemZMCExpr::VK_TLSGD;
+      Kind = SystemZ::S_TLSGD;
     else if (Name == "tls_ldcall")
-      Kind = SystemZMCExpr::VK_TLSLDM;
+      Kind = SystemZ::S_TLSLDM;
     else
       return Error(Parser.getTok().getLoc(), "unknown TLS tag");
     Parser.Lex();
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp
index b44859d75df0..8b5587ab7125 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "MCTargetDesc/SystemZMCFixups.h"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -103,14 +103,14 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                               bool IsPCRel) const {
   SMLoc Loc = Fixup.getLoc();
   unsigned Kind = Fixup.getKind();
-  auto Specifier = SystemZMCExpr::Specifier(Target.getSpecifier());
+  auto Specifier = SystemZ::Specifier(Target.getSpecifier());
   switch (Specifier) {
-  case SystemZMCExpr::VK_INDNTPOFF:
-  case SystemZMCExpr::VK_NTPOFF:
-  case SystemZMCExpr::VK_TLSGD:
-  case SystemZMCExpr::VK_TLSLD:
-  case SystemZMCExpr::VK_TLSLDM:
-  case SystemZMCExpr::VK_DTPOFF:
+  case SystemZ::S_INDNTPOFF:
+  case SystemZ::S_NTPOFF:
+  case SystemZ::S_TLSGD:
+  case SystemZ::S_TLSLD:
+  case SystemZ::S_TLSLDM:
+  case SystemZ::S_DTPOFF:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -119,12 +119,12 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
   }
 
   switch (Specifier) {
-  case SystemZMCExpr::VK_None:
+  case SystemZ::S_None:
     if (IsPCRel)
       return getPCRelReloc(Loc, Kind);
     return getAbsoluteReloc(Loc, Kind);
 
-  case SystemZMCExpr::VK_NTPOFF:
+  case SystemZ::S_NTPOFF:
     assert(!IsPCRel && "NTPOFF shouldn't be PC-relative");
     switch (Kind) {
     case FK_Data_4:
@@ -135,14 +135,14 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
     reportError(Loc, "Unsupported thread-local address (local-exec)");
     return 0;
 
-  case SystemZMCExpr::VK_INDNTPOFF:
+  case SystemZ::S_INDNTPOFF:
     if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
       return ELF::R_390_TLS_IEENT;
     reportError(Loc,
                 "Only PC-relative INDNTPOFF accesses are supported for now");
     return 0;
 
-  case SystemZMCExpr::VK_DTPOFF:
+  case SystemZ::S_DTPOFF:
     assert(!IsPCRel && "DTPOFF shouldn't be PC-relative");
     switch (Kind) {
     case FK_Data_4:
@@ -153,7 +153,7 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
     reportError(Loc, "Unsupported thread-local address (local-dynamic)");
     return 0;
 
-  case SystemZMCExpr::VK_TLSLDM:
+  case SystemZ::S_TLSLDM:
     assert(!IsPCRel && "TLSLDM shouldn't be PC-relative");
     switch (Kind) {
     case FK_Data_4:
@@ -166,7 +166,7 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
     reportError(Loc, "Unsupported thread-local address (local-dynamic)");
     return 0;
 
-  case SystemZMCExpr::VK_TLSGD:
+  case SystemZ::S_TLSGD:
     assert(!IsPCRel && "TLSGD shouldn't be PC-relative");
     switch (Kind) {
     case FK_Data_4:
@@ -179,14 +179,14 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
     reportError(Loc, "Unsupported thread-local address (general-dynamic)");
     return 0;
 
-  case SystemZMCExpr::VK_GOT:
-  case SystemZMCExpr::VK_GOTENT:
+  case SystemZ::S_GOT:
+  case SystemZ::S_GOTENT:
     if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
       return ELF::R_390_GOTENT;
     reportError(Loc, "Only PC-relative GOT accesses are supported for now");
     return 0;
 
-  case SystemZMCExpr::VK_PLT:
+  case SystemZ::S_PLT:
     assert(IsPCRel && "@PLT shouldn't be PC-relative");
     switch (Kind) {
     case SystemZ::FK_390_PC12DBL:
@@ -209,8 +209,8 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
 bool SystemZELFObjectWriter::needsRelocateWithSymbol(const MCValue &V,
                                                      unsigned Type) const {
   switch (V.getSpecifier()) {
-  case SystemZMCExpr::VK_GOT:
-  case SystemZMCExpr::VK_PLT:
+  case SystemZ::S_GOT:
+  case SystemZ::S_PLT:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
index 7fd1a1c2d801..297fdc832592 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZInstPrinterCommon.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegister.h"
@@ -186,10 +186,10 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI,
     const MCOperand &MO = MI->getOperand(OpNum + 1);
     const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr());
     switch (refExp.getSpecifier()) {
-    case SystemZMCExpr::VK_TLSGD:
+    case SystemZ::S_TLSGD:
       O << ":tls_gdcall:";
       break;
-    case SystemZMCExpr::VK_TLSLDM:
+    case SystemZ::S_TLSLDM:
       O << ":tls_ldcall:";
       break;
     default:
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index e9d387399bf3..0f7341e6d03b 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -14,15 +14,11 @@
 using namespace llvm;
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {SystemZMCExpr::VK_DTPOFF, "DTPOFF"},
-    {SystemZMCExpr::VK_GOT, "GOT"},
-    {SystemZMCExpr::VK_GOTENT, "GOTENT"},
-    {SystemZMCExpr::VK_INDNTPOFF, "INDNTPOFF"},
-    {SystemZMCExpr::VK_NTPOFF, "NTPOFF"},
-    {SystemZMCExpr::VK_PLT, "PLT"},
-    {SystemZMCExpr::VK_TLSGD, "TLSGD"},
-    {SystemZMCExpr::VK_TLSLD, "TLSLD"},
-    {SystemZMCExpr::VK_TLSLDM, "TLSLDM"},
+    {SystemZ::S_DTPOFF, "DTPOFF"}, {SystemZ::S_GOT, "GOT"},
+    {SystemZ::S_GOTENT, "GOTENT"}, {SystemZ::S_INDNTPOFF, "INDNTPOFF"},
+    {SystemZ::S_NTPOFF, "NTPOFF"}, {SystemZ::S_PLT, "PLT"},
+    {SystemZ::S_TLSGD, "TLSGD"},   {SystemZ::S_TLSLD, "TLSLD"},
+    {SystemZ::S_TLSLDM, "TLSLDM"},
 };
 
 SystemZMCAsmInfoELF::SystemZMCAsmInfoELF(const Triple &TT) {
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index 58b9a3dd652e..6d7d669fa8e1 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -28,6 +28,28 @@ public:
   bool isAcceptableChar(char C) const override;
 };
 
+namespace SystemZ {
+using Specifier = uint16_t;
+enum {
+  S_None,
+
+  S_DTPOFF,
+  S_GOT,
+  S_GOTENT,
+  S_INDNTPOFF,
+  S_NTPOFF,
+  S_PLT,
+  S_TLSGD,
+  S_TLSLD,
+  S_TLSLDM,
+
+  // HLASM docs for address constants:
+  // https://www.ibm.com/docs/en/hla-and-tf/1.6?topic=value-address-constants
+  S_RCon, // Address of ADA of symbol.
+  S_VCon, // Address of external function symbol.
+};
+} // namespace SystemZ
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
index 6dcca60dcedd..7b82c0cb6609 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZMCExpr.h"
+#include "SystemZMCAsmInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 using namespace llvm;
@@ -20,11 +21,11 @@ const SystemZMCExpr *SystemZMCExpr::create(MCSpecifierExpr::Spec S,
 
 StringRef SystemZMCExpr::getVariantKindName() const {
   switch (getSpecifier()) {
-  case VK_None:
+  case SystemZ::S_None:
     return "A";
-  case VK_SystemZ_RCon:
+  case SystemZ::S_RCon:
     return "R";
-  case VK_SystemZ_VCon:
+  case SystemZ::S_VCon:
     return "V";
   default:
     llvm_unreachable("Invalid kind");
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
index ac1de97ecf0a..8e730e50ae9d 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
@@ -18,24 +18,6 @@ namespace llvm {
 class SystemZMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = Spec;
-  enum {
-    VK_None,
-
-    VK_DTPOFF = MCSymbolRefExpr::FirstTargetSpecifier,
-    VK_GOT,
-    VK_GOTENT,
-    VK_INDNTPOFF,
-    VK_NTPOFF,
-    VK_PLT,
-    VK_TLSGD,
-    VK_TLSLD,
-    VK_TLSLDM,
-
-    // HLASM docs for address constants:
-    // https://www.ibm.com/docs/en/hla-and-tf/1.6?topic=value-address-constants
-    VK_SystemZ_RCon, // Address of ADA of symbol.
-    VK_SystemZ_VCon, // Address of external function symbol.
-  };
 
 private:
   explicit SystemZMCExpr(const MCExpr *Expr, Spec S)
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index eb4b4c1647a1..d5e034b5a009 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -14,7 +14,7 @@
 #include "SystemZAsmPrinter.h"
 #include "MCTargetDesc/SystemZGNUInstPrinter.h"
 #include "MCTargetDesc/SystemZHLASMInstPrinter.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "SystemZConstantPoolValue.h"
 #include "SystemZMCInstLower.h"
@@ -79,7 +79,7 @@ static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
 static const MCSymbolRefExpr *getTLSGetOffset(MCContext &Context) {
   StringRef Name = "__tls_get_offset";
   return MCSymbolRefExpr::create(Context.getOrCreateSymbol(Name),
-                                 SystemZMCExpr::VK_PLT, Context);
+                                 SystemZ::S_PLT, Context);
 }
 
 static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
@@ -319,11 +319,10 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
     break;
 
   case SystemZ::CallBRASL_XPLINK64:
-    EmitToStreamer(
-        *OutStreamer,
-        MCInstBuilder(SystemZ::BRASL)
-            .addReg(SystemZ::R7D)
-            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZMCExpr::VK_PLT)));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BRASL)
+                                     .addReg(SystemZ::R7D)
+                                     .addExpr(Lower.getExpr(MI->getOperand(0),
+                                                            SystemZ::S_PLT)));
     emitCallInformation(CallType::BRASL7);
     return;
 
@@ -380,10 +379,9 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
   case SystemZ::CallBRASL:
-    LoweredMI =
-        MCInstBuilder(SystemZ::BRASL)
-            .addReg(SystemZ::R14D)
-            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZMCExpr::VK_PLT));
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+                    .addReg(SystemZ::R14D)
+                    .addExpr(Lower.getExpr(MI->getOperand(0), SystemZ::S_PLT));
     break;
 
   case SystemZ::CallBASR:
@@ -393,17 +391,15 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
     break;
 
   case SystemZ::CallJG:
-    LoweredMI =
-        MCInstBuilder(SystemZ::JG)
-            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZMCExpr::VK_PLT));
+    LoweredMI = MCInstBuilder(SystemZ::JG)
+                    .addExpr(Lower.getExpr(MI->getOperand(0), SystemZ::S_PLT));
     break;
 
   case SystemZ::CallBRCL:
-    LoweredMI =
-        MCInstBuilder(SystemZ::BRCL)
-            .addImm(MI->getOperand(0).getImm())
-            .addImm(MI->getOperand(1).getImm())
-            .addExpr(Lower.getExpr(MI->getOperand(2), SystemZMCExpr::VK_PLT));
+    LoweredMI = MCInstBuilder(SystemZ::BRCL)
+                    .addImm(MI->getOperand(0).getImm())
+                    .addImm(MI->getOperand(1).getImm())
+                    .addExpr(Lower.getExpr(MI->getOperand(2), SystemZ::S_PLT));
     break;
 
   case SystemZ::CallBR:
@@ -495,15 +491,15 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
         MCInstBuilder(SystemZ::BRASL)
             .addReg(SystemZ::R14D)
             .addExpr(getTLSGetOffset(MF->getContext()))
-            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZMCExpr::VK_TLSGD));
+            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZ::S_TLSGD));
     break;
 
   case SystemZ::TLS_LDCALL:
-    LoweredMI = MCInstBuilder(SystemZ::BRASL)
-                    .addReg(SystemZ::R14D)
-                    .addExpr(getTLSGetOffset(MF->getContext()))
-                    .addExpr(Lower.getExpr(MI->getOperand(0),
-                                           SystemZMCExpr::VK_TLSLDM));
+    LoweredMI =
+        MCInstBuilder(SystemZ::BRASL)
+            .addReg(SystemZ::R14D)
+            .addExpr(getTLSGetOffset(MF->getContext()))
+            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZ::S_TLSLDM));
     break;
 
   case SystemZ::GOT:
@@ -798,7 +794,7 @@ void SystemZAsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
 
   MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
   const MCSymbolRefExpr *Op =
-      MCSymbolRefExpr::create(fentry, SystemZMCExpr::VK_PLT, Ctx);
+      MCSymbolRefExpr::create(fentry, SystemZ::S_PLT, Ctx);
   OutStreamer->emitInstruction(
       MCInstBuilder(SystemZ::BRASL).addReg(SystemZ::R0D).addExpr(Op),
       getSubtargetInfo());
@@ -880,7 +876,7 @@ void SystemZAsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
       EncodedBytes += 2;
     }
   } else if (CalleeMO.isGlobal()) {
-    const MCExpr *Expr = Lower.getExpr(CalleeMO, SystemZMCExpr::VK_PLT);
+    const MCExpr *Expr = Lower.getExpr(CalleeMO, SystemZ::S_PLT);
     EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BRASL)
                                    .addReg(SystemZ::R14D)
                                    .addExpr(Expr));
@@ -923,11 +919,10 @@ void SystemZAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(
   EmitNop(OutContext, *OutStreamer, 2, getSubtargetInfo());
   EmitToStreamer(*OutStreamer,
                  MCInstBuilder(SystemZ::LLILF).addReg(SystemZ::R2D).addImm(0));
-  EmitToStreamer(*OutStreamer,
-                 MCInstBuilder(SystemZ::BRASL)
-                     .addReg(SystemZ::R14D)
-                     .addExpr(MCSymbolRefExpr::create(
-                         FuncEntry, SystemZMCExpr::VK_PLT, OutContext)));
+  EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BRASL)
+                                   .addReg(SystemZ::R14D)
+                                   .addExpr(MCSymbolRefExpr::create(
+                                       FuncEntry, SystemZ::S_PLT, OutContext)));
   OutStreamer->emitLabel(EndOfSled);
   recordSled(BeginOfSled, MI, SledKind::FUNCTION_ENTER, 2);
 }
@@ -967,10 +962,9 @@ void SystemZAsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
   EmitNop(OutContext, *OutStreamer, 4, getSubtargetInfo());
   EmitToStreamer(*OutStreamer,
                  MCInstBuilder(SystemZ::LLILF).addReg(SystemZ::R2D).addImm(0));
-  EmitToStreamer(*OutStreamer,
-                 MCInstBuilder(SystemZ::J)
-                     .addExpr(MCSymbolRefExpr::create(
-                         FuncExit, SystemZMCExpr::VK_PLT, OutContext)));
+  EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::J)
+                                   .addExpr(MCSymbolRefExpr::create(
+                                       FuncExit, SystemZ::S_PLT, OutContext)));
   if (FallthroughLabel)
     OutStreamer->emitLabel(FallthroughLabel);
   recordSled(BeginOfSled, MI, SledKind::FUNCTION_EXIT, 2);
@@ -992,13 +986,13 @@ void SystemZAsmPrinter::emitAttributes(Module &M) {
 static uint8_t getSpecifierFromModifier(SystemZCP::SystemZCPModifier Modifier) {
   switch (Modifier) {
   case SystemZCP::TLSGD:
-    return SystemZMCExpr::VK_TLSGD;
+    return SystemZ::S_TLSGD;
   case SystemZCP::TLSLDM:
-    return SystemZMCExpr::VK_TLSLDM;
+    return SystemZ::S_TLSLDM;
   case SystemZCP::DTPOFF:
-    return SystemZMCExpr::VK_DTPOFF;
+    return SystemZ::S_DTPOFF;
   case SystemZCP::NTPOFF:
-    return SystemZMCExpr::VK_NTPOFF;
+    return SystemZ::S_NTPOFF;
   }
   llvm_unreachable("Invalid SystemCPModifier!");
 }
@@ -1145,12 +1139,12 @@ void SystemZAsmPrinter::emitADASection() {
       // imported functions, that are placed in the ADA to be 8 byte aligned.
       EMIT_COMMENT("function descriptor of");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZMCExpr::VK_SystemZ_RCon,
+          SystemZMCExpr::create(SystemZ::S_RCon,
                                 MCSymbolRefExpr::create(Sym, OutContext),
                                 OutContext),
           PointerSize);
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZMCExpr::VK_SystemZ_VCon,
+          SystemZMCExpr::create(SystemZ::S_VCon,
                                 MCSymbolRefExpr::create(Sym, OutContext),
                                 OutContext),
           PointerSize);
@@ -1159,7 +1153,7 @@ void SystemZAsmPrinter::emitADASection() {
     case SystemZII::MO_ADA_DATA_SYMBOL_ADDR:
       EMIT_COMMENT("pointer to data symbol");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZMCExpr::VK_None,
+          SystemZMCExpr::create(SystemZ::S_None,
                                 MCSymbolRefExpr::create(Sym, OutContext),
                                 OutContext),
           PointerSize);
@@ -1174,7 +1168,7 @@ void SystemZAsmPrinter::emitADASection() {
 
       EMIT_COMMENT("pointer to function descriptor");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZMCExpr::VK_SystemZ_VCon,
+          SystemZMCExpr::create(SystemZ::S_VCon,
                                 MCSymbolRefExpr::create(Alias, OutContext),
                                 OutContext),
           PointerSize);
diff --git a/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp b/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
index 1aa71618082f..4a68c5d6462d 100644
--- a/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZMCInstLower.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "SystemZAsmPrinter.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/MC/MCExpr.h"
@@ -20,11 +20,11 @@ using namespace llvm;
 static SystemZMCExpr::Specifier getSpecifierForTFlags(unsigned Flags) {
   switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
     case 0:
-      return SystemZMCExpr::VK_None;
+      return SystemZ::S_None;
     case SystemZII::MO_GOT:
-      return SystemZMCExpr::VK_GOT;
+      return SystemZ::S_GOT;
     case SystemZII::MO_INDNTPOFF:
-      return SystemZMCExpr::VK_INDNTPOFF;
+      return SystemZ::S_INDNTPOFF;
   }
   llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.cpp b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.cpp
index 7d22c26ff9a8..ae90c51432fe 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZTargetObjectFile.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/Target/TargetMachine.h"
 
@@ -15,5 +15,5 @@ using namespace llvm;
 
 const MCExpr *SystemZELFTargetObjectFile::getDebugThreadLocalSymbol(
     const MCSymbol *Sym) const {
-  return MCSymbolRefExpr::create(Sym, SystemZMCExpr::VK_DTPOFF, getContext());
+  return MCSymbolRefExpr::create(Sym, SystemZ::S_DTPOFF, getContext());
 }

From d64ee2cd4fe488b6dc21e7a8173fbb9cf3610ba0 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Mon, 16 Jun 2025 10:12:51 +0800
Subject: [PATCH 0448/1322] [RISCV] Add GetVTypeMinimalPredicates for the
 operation supported by zvfhmin. NFC. (#143847)

This patch adds a new `GetVTypeMinimalPredicates` for `f16` operation
supported by `Zvfhmin`. Split the type predicates for minimal support
and full compute support. This is a refactor patch for implementing
vector compute support for bf16 (Zvfbfa), that we can check `bf16` type
whether with `Zvfbfa` extension in `GetVTypePredicates`.
---
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    | 68 ++++++++++---------
 .../Target/RISCV/RISCVInstrInfoVSDPatterns.td |  8 +--
 .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 13 ++--
 3 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index f9fc6f0be380..22b5b52541d6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -783,6 +783,15 @@ class GetVTypePredicates<VTypeInfo vti> {
                                      true : [HasVInstructions]);
 }
 
+class GetVTypeMinimalPredicates<VTypeInfo vti> {
+  list<Predicate> Predicates = !cond(!eq(vti.Scalar, f16) : [HasVInstructionsF16Minimal],
+                                     !eq(vti.Scalar, bf16) : [HasVInstructionsBF16Minimal],
+                                     !eq(vti.Scalar, f32) : [HasVInstructionsAnyF],
+                                     !eq(vti.Scalar, f64) : [HasVInstructionsF64],
+                                     !eq(vti.SEW, 64) : [HasVInstructionsI64],
+                                     true : [HasVInstructions]);
+}
+
 class VPseudoUSLoadNoMask<VReg RetClass,
                           int EEW,
                           DAGOperand sewop = sew> :
@@ -4568,7 +4577,7 @@ multiclass VPatUnaryS_M<string intrinsic_name,
 multiclass VPatUnaryV_V_AnyMask<string intrinsic, string instruction,
                                 list<VTypeInfo> vtilist> {
   foreach vti = vtilist in {
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     def : VPatUnaryAnyMask<intrinsic, instruction, "VM",
                            vti.Vector, vti.Vector, vti.Mask,
                            vti.Log2SEW, vti.LMul, vti.RegClass, vti.RegClass>;
@@ -4887,7 +4896,7 @@ multiclass VPatBinaryV_VV_INT<string intrinsic, string instruction,
                               list<VTypeInfo> vtilist> {
   foreach vti = vtilist in {
     defvar ivti = GetIntVTypeInfo<vti>.Vti;
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatBinary<intrinsic,
                       instruction # "_VV_" # vti.LMul.MX # "_E" # vti.SEW,
                       vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
@@ -4950,7 +4959,7 @@ multiclass VPatBinaryV_VX_RM<string intrinsic, string instruction,
 multiclass VPatBinaryV_VX_INT<string intrinsic, string instruction,
                           list<VTypeInfo> vtilist> {
   foreach vti = vtilist in
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatBinary<intrinsic, instruction # "_VX_" # vti.LMul.MX,
                       vti.Vector, vti.Vector, XLenVT, vti.Mask,
                       vti.Log2SEW, vti.RegClass,
@@ -4979,6 +4988,16 @@ multiclass VPatBinaryV_VI_RM<string intrinsic, string instruction,
                                   vti.RegClass, imm_type>;
 }
 
+multiclass VPatBinaryV_VI_INT<string intrinsic, string instruction,
+                              list<VTypeInfo> vtilist, Operand imm_type> {
+  foreach vti = vtilist in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
+    defm : VPatBinary<intrinsic, instruction # "_VI_" # vti.LMul.MX,
+                      vti.Vector, vti.Vector, XLenVT, vti.Mask,
+                      vti.Log2SEW, vti.RegClass,
+                      vti.RegClass, imm_type>;
+}
+
 multiclass VPatBinaryM_MM<string intrinsic, string instruction> {
   foreach mti = AllMasks in
     let Predicates = [HasVInstructions] in
@@ -5709,7 +5728,7 @@ multiclass VPatBinaryV_VV_VX_VI_INT<string intrinsic, string instruction,
                                     list<VTypeInfo> vtilist, Operand ImmType>
     : VPatBinaryV_VV_INT<intrinsic#"_vv", instruction, vtilist>,
       VPatBinaryV_VX_INT<intrinsic#"_vx", instruction, vtilist>,
-      VPatBinaryV_VI<intrinsic#"_vx", instruction, vtilist, ImmType>;
+      VPatBinaryV_VI_INT<intrinsic#"_vx", instruction, vtilist, ImmType>;
 
 multiclass VPatReductionV_VS<string intrinsic, string instruction, bit IsFloat = 0> {
   foreach vti = !if(IsFloat, NoGroupFloatVectors, NoGroupIntegerVectors) in {
@@ -5887,12 +5906,11 @@ multiclass VPatConversionWF_VF<string intrinsic, string instruction,
     defvar fvti = fvtiToFWti.Vti;
     defvar fwti = fvtiToFWti.Wti;
     // Define vfwcvt.f.f.v for f16 when Zvfhmin is enable.
-    let Predicates = !if(!eq(fvti.Scalar, f16), [HasVInstructionsF16Minimal],
-                         !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                     GetVTypePredicates<fwti>.Predicates)) in
-      defm : VPatConversion<intrinsic, instruction, "V",
-                            fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
-                            fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
+    let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                                 GetVTypeMinimalPredicates<fwti>.Predicates) in
+    defm : VPatConversion<intrinsic, instruction, "V",
+                          fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
+                          fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
   }
 }
 
@@ -5979,8 +5997,9 @@ multiclass VPatConversionVF_WF_RM<string intrinsic, string instruction,
   foreach fvtiToFWti = wlist in {
     defvar fvti = fvtiToFWti.Vti;
     defvar fwti = fvtiToFWti.Wti;
-    let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                 GetVTypePredicates<fwti>.Predicates) in
+    // Define vfncvt.f.f.w for f16 when Zvfhmin is enable.
+    let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                                 GetVTypeMinimalPredicates<fwti>.Predicates) in
     defm : VPatConversionRoundingMode<intrinsic, instruction, "W",
                                       fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
                                       fvti.LMul, fvti.RegClass, fwti.RegClass,
@@ -6999,8 +7018,7 @@ defm : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">;
 // 11.16. Vector Integer Move Instructions
 //===----------------------------------------------------------------------===//
 foreach vti = AllVectors in {
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in {
+  let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in {
     def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$passthru),
                                              (vti.Vector vti.RegClass:$rs1),
                                              VLOpFrag)),
@@ -7195,8 +7213,7 @@ defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">;
 // NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses
 // int_riscv_vmerge. Support both for compatibility.
 foreach vti = AllFloatVectors in {
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in
+  let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM",
                                  vti.Vector,
                                  vti.Vector, vti.Vector, vti.Mask,
@@ -7275,16 +7292,8 @@ defm : VPatConversionVF_WI_RM<"int_riscv_vfncvt_f_xu_w", "PseudoVFNCVT_F_XU",
                               isSEWAware=1>;
 defm : VPatConversionVF_WI_RM<"int_riscv_vfncvt_f_x_w", "PseudoVFNCVT_F_X",
                               isSEWAware=1>;
-defvar WidenableFloatVectorsExceptF16 = !filter(fvtiToFWti, AllWidenableFloatVectors,
-                                                !ne(fvtiToFWti.Vti.Scalar, f16));
 defm : VPatConversionVF_WF_RM<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F",
-                           WidenableFloatVectorsExceptF16, isSEWAware=1>;
-// Define vfncvt.f.f.w for f16 when Zvfhmin is enable.
-defvar F16WidenableFloatVectors = !filter(fvtiToFWti, AllWidenableFloatVectors,
-                                          !eq(fvtiToFWti.Vti.Scalar, f16));
-let Predicates = [HasVInstructionsF16Minimal] in
-defm : VPatConversionVF_WF_RM<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F",
-                           F16WidenableFloatVectors, isSEWAware=1>;
+                              AllWidenableFloatVectors, isSEWAware=1>;
 defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w", 
                                  "PseudoVFNCVTBF16_F_F", isSEWAware=1>;
 defm : VPatConversionVF_WF<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_F_F",
@@ -7419,10 +7428,7 @@ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
                               eew=16, vtilist=AllIntegerVectors>;
 
 defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
-                                AllFloatVectorsExceptFP16, uimm5>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
-                                  AllFP16Vectors, uimm5>;
+                                AllFloatVectors, uimm5>;
 defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
                                 AllBFloatVectors, uimm5>;
 defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
@@ -7431,9 +7437,7 @@ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
 // 16.5. Vector Compress Instruction
 //===----------------------------------------------------------------------===//
 defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllIntegerVectors>;
-defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectorsExceptFP16>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFP16Vectors>;
+defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectors>;
 defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllBFloatVectors>;
 
 // Include the non-intrinsic ISel patterns
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index e318a78285a2..520959b0896f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -864,8 +864,7 @@ multiclass VPatAVGADD_VV_VX_RM<SDNode vop, int vxrm, string suffix = ""> {
 
 // 7.4. Vector Unit-Stride Instructions
 foreach vti = AllVectors in
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in 
+  let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
   defm : VPatUSLoadStoreSDNode<vti.Vector, vti.RegClass, vti.Log2SEW, vti.LMul,
                                vti.AVL, vti.RegClass>;
 foreach mti = AllMasks in
@@ -1449,9 +1448,8 @@ defm : VPatNConvertI2FPSDNode_W_RM<any_uint_to_fp, "PseudoVFNCVT_F_XU_W">;
 foreach fvtiToFWti = AllWidenableFloatVectors in {
   defvar fvti = fvtiToFWti.Vti;
   defvar fwti = fvtiToFWti.Wti;
-  let Predicates = !if(!eq(fvti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                   GetVTypePredicates<fwti>.Predicates)) in
+  let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                               GetVTypeMinimalPredicates<fwti>.Predicates) in
   def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
             (!cast<Instruction>("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
                 (fvti.Vector (IMPLICIT_DEF)),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index b54c2b042b4d..6328e6c860f7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -2683,9 +2683,9 @@ defm : VPatWConvertI2FPVL_V<any_riscv_sint_to_fp_vl, "PseudoVFWCVT_F_X_V">;
 foreach fvtiToFWti = AllWidenableFloatVectors in {
   defvar fvti = fvtiToFWti.Vti;
   defvar fwti = fvtiToFWti.Wti;
-  let Predicates = !if(!eq(fvti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                   GetVTypePredicates<fwti>.Predicates)) in
+  // Define vfwcvt.f.f.v for f16 when Zvfhmin is enable.
+  let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                               GetVTypeMinimalPredicates<fwti>.Predicates) in
   def : Pat<(fwti.Vector (any_riscv_fpextend_vl
                              (fvti.Vector fvti.RegClass:$rs1),
                              (fvti.Mask VMV0:$vm),
@@ -2726,10 +2726,9 @@ defm : VPatNConvertI2FP_RM_VL_W<riscv_vfcvt_rm_f_x_vl, "PseudoVFNCVT_F_X_W">;
 foreach fvtiToFWti = AllWidenableFloatVectors in {
   defvar fvti = fvtiToFWti.Vti;
   defvar fwti = fvtiToFWti.Wti;
-  // Define vfwcvt.f.f.v for f16 when Zvfhmin is enable.
-  let Predicates = !if(!eq(fvti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                   GetVTypePredicates<fwti>.Predicates)) in {
+  // Define vfncvt.f.f.w for f16 when Zvfhmin is enable.
+  let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                               GetVTypeMinimalPredicates<fwti>.Predicates) in {
     def : Pat<(fvti.Vector (any_riscv_fpround_vl
                                (fwti.Vector fwti.RegClass:$rs1),
                                (fwti.Mask VMV0:$vm), VLOpFrag)),

From b591f6dad4079401fadc4a516b32d3900b7946de Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 19:26:40 -0700
Subject: [PATCH 0449/1322] SystemZ: Migrate to newer relocation specifier
 representation

z/OS creates SystemZMCExpr objects (https://reviews.llvm.org/D153788)
while ELF doesn't. Define the SystemZMCAsmInfoGOFF hooks
instead of the legacy MCSpecifierExpr:: hooks.
---
 .../SystemZ/AsmParser/SystemZAsmParser.cpp    |  3 +-
 .../SystemZ/MCTargetDesc/CMakeLists.txt       |  1 -
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 30 +++++++++++-
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.h   |  4 ++
 .../SystemZ/MCTargetDesc/SystemZMCExpr.cpp    | 47 -------------------
 .../SystemZ/MCTargetDesc/SystemZMCExpr.h      | 38 ---------------
 llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 20 ++++----
 .../lib/Target/SystemZ/SystemZMCInstLower.cpp |  8 ++--
 llvm/lib/Target/SystemZ/SystemZMCInstLower.h  |  5 +-
 9 files changed, 48 insertions(+), 108 deletions(-)
 delete mode 100644 llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
 delete mode 100644 llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h

diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 74a8822a12ac..6ee2a87565ba 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -8,7 +8,6 @@
 
 #include "MCTargetDesc/SystemZGNUInstPrinter.h"
 #include "MCTargetDesc/SystemZMCAsmInfo.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "MCTargetDesc/SystemZTargetStreamer.h"
 #include "TargetInfo/SystemZTargetInfo.h"
@@ -1707,7 +1706,7 @@ ParseStatus SystemZAsmParser::parsePCRel(OperandVector &Operands,
     if (Parser.getTok().isNot(AsmToken::Identifier))
       return Error(Parser.getTok().getLoc(), "unexpected token");
 
-    SystemZMCExpr::Specifier Kind = SystemZ::S_None;
+    auto Kind = SystemZ::S_None;
     StringRef Name = Parser.getTok().getString();
     if (Name == "tls_gdcall")
       Kind = SystemZ::S_TLSGD;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
index c95445637d0b..28f7ced8d7ce 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
@@ -8,7 +8,6 @@ add_llvm_component_library(LLVMSystemZDesc
   SystemZMCAsmBackend.cpp
   SystemZMCAsmInfo.cpp
   SystemZMCCodeEmitter.cpp
-  SystemZMCExpr.cpp
   SystemZMCTargetDesc.cpp
   SystemZTargetStreamer.cpp
 
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 0f7341e6d03b..052875bf0d3f 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZMCAsmInfo.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
 
 using namespace llvm;
 
@@ -58,3 +58,31 @@ SystemZMCAsmInfoGOFF::SystemZMCAsmInfoGOFF(const Triple &TT) {
 bool SystemZMCAsmInfoGOFF::isAcceptableChar(char C) const {
   return MCAsmInfo::isAcceptableChar(C) || C == '#';
 }
+
+void SystemZMCAsmInfoGOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  switch (Expr.getSpecifier()) {
+  case SystemZ::S_None:
+    OS << "A";
+    break;
+  case SystemZ::S_RCon:
+    OS << "R";
+    break;
+  case SystemZ::S_VCon:
+    OS << "V";
+    break;
+  default:
+    llvm_unreachable("Invalid kind");
+  }
+  OS << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  OS << ')';
+}
+
+bool SystemZMCAsmInfoGOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(Expr.getSpecifier());
+  return true;
+}
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index 6d7d669fa8e1..11c2833b8ada 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -26,6 +26,10 @@ class SystemZMCAsmInfoGOFF : public MCAsmInfoGOFF {
 public:
   explicit SystemZMCAsmInfoGOFF(const Triple &TT);
   bool isAcceptableChar(char C) const override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 namespace SystemZ {
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
deleted file mode 100644
index 7b82c0cb6609..000000000000
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===-- SystemZMCExpr.cpp - SystemZ specific MC expression classes --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SystemZMCExpr.h"
-#include "SystemZMCAsmInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "systemzmcexpr"
-
-const SystemZMCExpr *SystemZMCExpr::create(MCSpecifierExpr::Spec S,
-                                           const MCExpr *Expr, MCContext &Ctx) {
-  return new (Ctx) SystemZMCExpr(Expr, S);
-}
-
-StringRef SystemZMCExpr::getVariantKindName() const {
-  switch (getSpecifier()) {
-  case SystemZ::S_None:
-    return "A";
-  case SystemZ::S_RCon:
-    return "R";
-  case SystemZ::S_VCon:
-    return "V";
-  default:
-    llvm_unreachable("Invalid kind");
-  }
-}
-
-void SystemZMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  OS << getVariantKindName() << '(';
-  MAI->printExpr(OS, *Expr);
-  OS << ')';
-}
-
-bool SystemZMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                              const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(specifier);
-  return true;
-}
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
deleted file mode 100644
index 8e730e50ae9d..000000000000
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- SystemZMCExpr.h - SystemZ specific MC expression classes -*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCEXPR_H
-#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-
-namespace llvm {
-
-class SystemZMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = Spec;
-
-private:
-  explicit SystemZMCExpr(const MCExpr *Expr, Spec S)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const SystemZMCExpr *create(Spec Kind, const MCExpr *Expr,
-                                     MCContext &Ctx);
-
-  StringRef getVariantKindName() const;
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-};
-} // end namespace llvm
-
-#endif
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index d5e034b5a009..aaf12b88de13 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -1139,23 +1139,20 @@ void SystemZAsmPrinter::emitADASection() {
       // imported functions, that are placed in the ADA to be 8 byte aligned.
       EMIT_COMMENT("function descriptor of");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZ::S_RCon,
-                                MCSymbolRefExpr::create(Sym, OutContext),
-                                OutContext),
+          MCSpecifierExpr::create(MCSymbolRefExpr::create(Sym, OutContext),
+                                  SystemZ::S_RCon, OutContext),
           PointerSize);
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZ::S_VCon,
-                                MCSymbolRefExpr::create(Sym, OutContext),
-                                OutContext),
+          MCSpecifierExpr::create(MCSymbolRefExpr::create(Sym, OutContext),
+                                  SystemZ::S_VCon, OutContext),
           PointerSize);
       EmittedBytes += PointerSize * 2;
       break;
     case SystemZII::MO_ADA_DATA_SYMBOL_ADDR:
       EMIT_COMMENT("pointer to data symbol");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZ::S_None,
-                                MCSymbolRefExpr::create(Sym, OutContext),
-                                OutContext),
+          MCSpecifierExpr::create(MCSymbolRefExpr::create(Sym, OutContext),
+                                  SystemZ::S_None, OutContext),
           PointerSize);
       EmittedBytes += PointerSize;
       break;
@@ -1168,9 +1165,8 @@ void SystemZAsmPrinter::emitADASection() {
 
       EMIT_COMMENT("pointer to function descriptor");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZ::S_VCon,
-                                MCSymbolRefExpr::create(Alias, OutContext),
-                                OutContext),
+          MCSpecifierExpr::create(MCSymbolRefExpr::create(Alias, OutContext),
+                                  SystemZ::S_VCon, OutContext),
           PointerSize);
       EmittedBytes += PointerSize;
       break;
diff --git a/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp b/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
index 4a68c5d6462d..c1d0994a9e17 100644
--- a/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -16,8 +16,8 @@
 
 using namespace llvm;
 
-// Return the VK_* enumeration for MachineOperand target flags Flags.
-static SystemZMCExpr::Specifier getSpecifierForTFlags(unsigned Flags) {
+// Return the S_* enumeration for MachineOperand target flags Flags.
+static SystemZ::Specifier getSpecifierForTFlags(unsigned Flags) {
   switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
     case 0:
       return SystemZ::S_None;
@@ -34,7 +34,7 @@ SystemZMCInstLower::SystemZMCInstLower(MCContext &ctx,
   : Ctx(ctx), AsmPrinter(asmprinter) {}
 
 const MCExpr *SystemZMCInstLower::getExpr(const MachineOperand &MO,
-                                          SystemZMCExpr::Specifier Spec) const {
+                                          SystemZ::Specifier Spec) const {
   const MCSymbol *Symbol;
   bool HasOffset = true;
   switch (MO.getType()) {
@@ -85,7 +85,7 @@ MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const {
     return MCOperand::createImm(MO.getImm());
 
   default: {
-    SystemZMCExpr::Specifier Kind = getSpecifierForTFlags(MO.getTargetFlags());
+    auto Kind = getSpecifierForTFlags(MO.getTargetFlags());
     return MCOperand::createExpr(getExpr(MO, Kind));
   }
   }
diff --git a/llvm/lib/Target/SystemZ/SystemZMCInstLower.h b/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
index 90526882c853..3187d7726c31 100644
--- a/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
+++ b/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
 
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
@@ -35,8 +35,7 @@ public:
   MCOperand lowerOperand(const MachineOperand& MO) const;
 
   // Return an MCExpr for symbolic operand MO with variant kind Kind.
-  const MCExpr *getExpr(const MachineOperand &MO,
-                        SystemZMCExpr::Specifier) const;
+  const MCExpr *getExpr(const MachineOperand &MO, SystemZ::Specifier) const;
 };
 } // end namespace llvm
 

From 167223f8c2c2350a3de9478355885c63b35ca6a9 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Mon, 16 Jun 2025 02:26:58 +0000
Subject: [PATCH 0450/1322] [gn build] Port b591f6dad407

---
 .../gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn
index 4182b4911538..360cdc5f10e6 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn
@@ -70,7 +70,6 @@ static_library("MCTargetDesc") {
     "SystemZMCAsmBackend.cpp",
     "SystemZMCAsmInfo.cpp",
     "SystemZMCCodeEmitter.cpp",
-    "SystemZMCExpr.cpp",
     "SystemZMCTargetDesc.cpp",
     "SystemZTargetStreamer.cpp",
   ]

From 9adde28df784f5c0cc960bdabd413ac131a5852e Mon Sep 17 00:00:00 2001
From: Ming-Yi Lai <ming-yi.lai@mediatek.com>
Date: Mon, 16 Jun 2025 11:18:41 +0800
Subject: [PATCH 0451/1322] [LLD][ELF][RISCV][Zicfilp][Zicfiss] Support `-z
 zicfilp=` and `-z zicfiss=` to force enable/disable features (#143114)

+ If `-z zicfilp=implicit` or option not specified, the output would
have the ZICFILP feature enabled/disabled based on input objects
+ If `-z zicfilp=<never|unlabeled|func-sig>`, the output would have
ZICFILP feature forced <off|on to the "unlabeled" scheme|on to the
"func-sig" scheme>
+ If `-z zicfiss=implicit` or option not specified, the output would
have the ZICFISS feature enabled/disabled based on input objects
+ If `-z zicfiss=<never|always>`, the output would have the ZICFISS
feature forced <off|on>
---
 lld/ELF/Config.h                              |  8 ++
 lld/ELF/Driver.cpp                            | 77 +++++++++++++++++++
 lld/test/ELF/riscv-feature-zicfilp-func-sig.s | 47 ++++++++++-
 .../ELF/riscv-feature-zicfilp-unlabeled.s     | 48 +++++++++++-
 lld/test/ELF/riscv-feature-zicfiss.s          | 20 ++++-
 5 files changed, 191 insertions(+), 9 deletions(-)

diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index a2f7759fb7d3..2b72d54ba410 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -136,6 +136,12 @@ enum LtoKind : uint8_t {UnifiedThin, UnifiedRegular, Default};
 // For -z gcs=
 enum class GcsPolicy { Implicit, Never, Always };
 
+// For -z zicfilp=
+enum class ZicfilpPolicy { Implicit, Never, Unlabeled, FuncSig };
+
+// For -z zicfiss=
+enum class ZicfissPolicy { Implicit, Never, Always };
+
 // For some options that resemble -z bti-report={none,warning,error}
 enum class ReportPolicy { None, Warning, Error };
 
@@ -411,6 +417,8 @@ struct Config {
   bool zText;
   bool zRetpolineplt;
   bool zWxneeded;
+  ZicfilpPolicy zZicfilp;
+  ZicfissPolicy zZicfiss;
   DiscardPolicy discard;
   GnuStackKind zGnustack;
   ICFLevel icf;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index c9ac71f7236f..7e132a387a04 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -430,6 +430,10 @@ static void checkOptions(Ctx &ctx) {
                         "RISC-V targets";
     if (ctx.arg.zZicfissReport != ReportPolicy::None)
       ErrAlways(ctx) << "-z zicfiss-report is only supported on RISC-V targets";
+    if (ctx.arg.zZicfilp != ZicfilpPolicy::Implicit)
+      ErrAlways(ctx) << "-z zicfilp is only supported on RISC-V targets";
+    if (ctx.arg.zZicfiss != ZicfissPolicy::Implicit)
+      ErrAlways(ctx) << "-z zicfiss is only supported on RISC-V targets";
   }
 
   if (ctx.arg.emachine != EM_386 && ctx.arg.emachine != EM_X86_64 &&
@@ -584,6 +588,46 @@ static GcsPolicy getZGcs(Ctx &ctx, opt::InputArgList &args) {
   return ret;
 }
 
+static ZicfilpPolicy getZZicfilp(Ctx &ctx, opt::InputArgList &args) {
+  auto ret = ZicfilpPolicy::Implicit;
+  for (auto *arg : args.filtered(OPT_z)) {
+    std::pair<StringRef, StringRef> kv = StringRef(arg->getValue()).split('=');
+    if (kv.first == "zicfilp") {
+      arg->claim();
+      if (kv.second == "unlabeled")
+        ret = ZicfilpPolicy::Unlabeled;
+      else if (kv.second == "func-sig")
+        ret = ZicfilpPolicy::FuncSig;
+      else if (kv.second == "never")
+        ret = ZicfilpPolicy::Never;
+      else if (kv.second == "implicit")
+        ret = ZicfilpPolicy::Implicit;
+      else
+        ErrAlways(ctx) << "unknown -z zicfilp= value: " << kv.second;
+    }
+  }
+  return ret;
+}
+
+static ZicfissPolicy getZZicfiss(Ctx &ctx, opt::InputArgList &args) {
+  auto ret = ZicfissPolicy::Implicit;
+  for (auto *arg : args.filtered(OPT_z)) {
+    std::pair<StringRef, StringRef> kv = StringRef(arg->getValue()).split('=');
+    if (kv.first == "zicfiss") {
+      arg->claim();
+      if (kv.second == "always")
+        ret = ZicfissPolicy::Always;
+      else if (kv.second == "never")
+        ret = ZicfissPolicy::Never;
+      else if (kv.second == "implicit")
+        ret = ZicfissPolicy::Implicit;
+      else
+        ErrAlways(ctx) << "unknown -z zicfiss= value: " << kv.second;
+    }
+  }
+  return ret;
+}
+
 // Report a warning for an unknown -z option.
 static void checkZOptions(Ctx &ctx, opt::InputArgList &args) {
   // This function is called before getTarget(), when certain options are not
@@ -1567,6 +1611,8 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) {
   ctx.arg.zCopyreloc = getZFlag(args, "copyreloc", "nocopyreloc", true);
   ctx.arg.zForceBti = hasZOption(args, "force-bti");
   ctx.arg.zForceIbt = hasZOption(args, "force-ibt");
+  ctx.arg.zZicfilp = getZZicfilp(ctx, args);
+  ctx.arg.zZicfiss = getZZicfiss(ctx, args);
   ctx.arg.zGcs = getZGcs(ctx, args);
   ctx.arg.zGlobal = hasZOption(args, "global");
   ctx.arg.zGnustack = getZGnuStack(args);
@@ -2926,6 +2972,18 @@ static void readSecurityNotes(Ctx &ctx) {
           << f
           << ": -z zicfiss-report: file does not have "
              "GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS property";
+
+      if (ctx.arg.zZicfilp == ZicfilpPolicy::Unlabeled &&
+          (features & GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG))
+        Warn(ctx) << f
+                  << ": -z zicfilp=unlabeled: file has conflicting property: "
+                     "GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG";
+
+      if (ctx.arg.zZicfilp == ZicfilpPolicy::FuncSig &&
+          (features & GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED))
+        Warn(ctx) << f
+                  << ": -z zicfilp=func-sig: file has conflicting property: "
+                     "GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED";
     }
 
     if (ctx.arg.zForceBti && !(features & GNU_PROPERTY_AARCH64_FEATURE_1_BTI)) {
@@ -2989,6 +3047,25 @@ static void readSecurityNotes(Ctx &ctx) {
   else if (ctx.arg.zGcs == GcsPolicy::Never)
     ctx.arg.andFeatures &= ~GNU_PROPERTY_AARCH64_FEATURE_1_GCS;
 
+  if (ctx.arg.emachine == EM_RISCV) {
+    // Force enable/disable Zicfilp.
+    if (ctx.arg.zZicfilp == ZicfilpPolicy::Unlabeled) {
+      ctx.arg.andFeatures |= GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED;
+      ctx.arg.andFeatures &= ~GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG;
+    } else if (ctx.arg.zZicfilp == ZicfilpPolicy::FuncSig) {
+      ctx.arg.andFeatures |= GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG;
+      ctx.arg.andFeatures &= ~GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED;
+    } else if (ctx.arg.zZicfilp == ZicfilpPolicy::Never)
+      ctx.arg.andFeatures &= ~(GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED |
+                               GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG);
+
+    // Force enable/disable Zicfiss.
+    if (ctx.arg.zZicfiss == ZicfissPolicy::Always)
+      ctx.arg.andFeatures |= GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS;
+    else if (ctx.arg.zZicfiss == ZicfissPolicy::Never)
+      ctx.arg.andFeatures &= ~GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS;
+  }
+
   // If we are utilising GCS at any stage, the sharedFiles should be checked to
   // ensure they also support this feature. The gcs-report-dynamic option is
   // used to indicate if the user wants information relating to this, and will
diff --git a/lld/test/ELF/riscv-feature-zicfilp-func-sig.s b/lld/test/ELF/riscv-feature-zicfilp-func-sig.s
index f68fbddfa602..c5818dd33978 100644
--- a/lld/test/ELF/riscv-feature-zicfilp-func-sig.s
+++ b/lld/test/ELF/riscv-feature-zicfilp-func-sig.s
@@ -2,6 +2,7 @@
 ## Test the ZICFILP func-sig feature.
 ## To lift maintenance burden, most tests are conducted only with 64-bit RISC-V
 ## Naming convention: *-s.s files enables ZICFILP func-sig.
+## Naming convention: *-u.s files enables ZICFILP unlabeled.
 # RUN: rm -rf %t && split-file %s %t && cd %t
 # RUN: llvm-mc --filetype=obj --triple=riscv32 rv32-f1-s.s -o rv32-f1-s.o
 # RUN: llvm-mc --filetype=obj --triple=riscv32 rv32-f2-s.s -o rv32-f2-s.o
@@ -12,14 +13,20 @@
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f2-s.s -o f2-s.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3.s   -o f3.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3-s.s -o f3-s.o
+# RUN: llvm-mc --filetype=obj --triple=riscv64 f3-u.s -o f3-u.o
 
-## ZICFILP-func-sig should be enabled when it's enabled in all inputs
+## ZICFILP-func-sig should be enabled when it's enabled in all inputs or when
+## it's forced on.
 # RUN: ld.lld rv32-f1-s.o rv32-f2-s.o rv32-f3-s.o -o out.rv32 --fatal-warnings
 # RUN: llvm-readelf -n out.rv32 | FileCheck --check-prefix=ZICFILP %s
 # RUN: ld.lld f1-s.o f2-s.o f3-s.o -o out --fatal-warnings
 # RUN: llvm-readelf -n out | FileCheck --check-prefix=ZICFILP %s
 # RUN: ld.lld f1-s.o f3-s.o --shared -o out.so --fatal-warnings
 # RUN: llvm-readelf -n out.so | FileCheck --check-prefix=ZICFILP %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -o out.force -z zicfilp=func-sig --fatal-warnings
+# RUN: llvm-readelf -n out.force | FileCheck --check-prefix=ZICFILP %s
+# RUN: ld.lld f2-s.o f3.o --shared -o out.force.so -z zicfilp=never -z zicfilp=func-sig --fatal-warnings
+# RUN: llvm-readelf -n out.force.so | FileCheck --check-prefix=ZICFILP %s
 # ZICFILP: Properties: RISC-V feature: ZICFILP-func-sig
 
 ## ZICFILP-func-sig should not be enabled if it's not enabled in at least one
@@ -29,11 +36,18 @@
 # RUN: ld.lld f2-s.o f3.o --shared -o out.no.so --fatal-warnings
 # RUN: llvm-readelf -n out.no.so | count 0
 
+## ZICFILP-func-sig should be disabled with zicfilp=never, even if
+## ZICFILP-func-sig is present in all inputs.
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp=func-sig -z zicfilp=never -o out.never --fatal-warnings
+# RUN: llvm-readelf -n out.never | count 0
+
 ## zicfilp-func-sig-report should report any input files that don't have the
 ## ZICFILP-func-sig property
 # RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-func-sig-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-func-sig-report=warning -z zicfilp=func-sig 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-func-sig-report=warning -z zicfilp=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
 # RUN: not ld.lld f2-s.o f3.o --shared -z zicfilp-func-sig-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
-# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp-func-sig-report=warning 2>&1 | count 0
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp-func-sig-report=warning -z zicfilp=func-sig 2>&1 | count 0
 # REPORT-WARN: warning: f2.o: -z zicfilp-func-sig-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG property
 # REPORT-ERROR: error: f3.o: -z zicfilp-func-sig-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG property
 
@@ -41,6 +55,14 @@
 # RUN: not ld.lld f2-s.o -z zicfilp-func-sig-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
 # INVALID: error: unknown -z zicfilp-func-sig-report= value: x
 
+## ZICFILP-unlabeled and ZICFILP-func-sig should conflict with each other.
+# RUN: ld.lld f3-u.o -o out.override -z zicfilp=func-sig 2>&1 | FileCheck --check-prefix=FORCE-CONFLICT %s
+# FORCE-CONFLICT: warning: f3-u.o: -z zicfilp=func-sig: file has conflicting property: GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED
+
+## -z zicfilp=func-sig should override and disable ZICFILP-unlabeled.
+# RUN: llvm-readelf -n out.override | FileCheck --check-prefixes=ZICFILP,OVERRIDE %s
+# OVERRIDE-NOT: ZICFILP-unlabeled
+
 #--- rv32-f1-s.s
 .section ".note.gnu.property", "a"
 .balign 4
@@ -191,3 +213,24 @@ ndesc_end:
 .type f3,@function
 f3:
   ret
+
+#--- f3-u.s
+.section ".note.gnu.property", "a"
+.balign 8
+.4byte 4
+.4byte (ndesc_end - ndesc_begin)
+.4byte 0x5        // NT_GNU_PROPERTY_TYPE_0
+.asciz "GNU"
+ndesc_begin:
+.balign 8
+.4byte 0xc0000000 // GNU_PROPERTY_RISCV_FEATURE_1_AND
+.4byte 4
+.4byte 1          // GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED
+.balign 8
+ndesc_end:
+
+.text
+.globl f3
+.type f3,@function
+f3:
+  ret
diff --git a/lld/test/ELF/riscv-feature-zicfilp-unlabeled.s b/lld/test/ELF/riscv-feature-zicfilp-unlabeled.s
index 0fcd8538d24a..20491f057c8e 100644
--- a/lld/test/ELF/riscv-feature-zicfilp-unlabeled.s
+++ b/lld/test/ELF/riscv-feature-zicfilp-unlabeled.s
@@ -2,6 +2,7 @@
 ## Test the ZICFILP unlabeled feature.
 ## To lift maintenance burden, most tests are conducted only with 64-bit RISC-V
 ## Naming convention: *-s.s files enables ZICFILP unlabeled.
+## Naming convention: *-f.s files enables ZICFILP func-sig.
 ## Naming convention: *-c.s files enables both of the conflicting ZICFILP unlabeled and ZICFILP func-sig features.
 # RUN: rm -rf %t && split-file %s %t && cd %t
 # RUN: llvm-mc --filetype=obj --triple=riscv32 rv32-f1-s.s -o rv32-f1-s.o
@@ -14,14 +15,20 @@
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f2-s.s -o f2-s.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3.s   -o f3.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3-s.s -o f3-s.o
+# RUN: llvm-mc --filetype=obj --triple=riscv64 f3-f.s -o f3-f.o
 
-## ZICFILP-unlabeled should be enabled when it's enabled in all inputs
+## ZICFILP-unlabeled should be enabled when it's enabled in all inputs or when
+## it's forced on.
 # RUN: ld.lld rv32-f1-s.o rv32-f2-s.o rv32-f3-s.o -o out.rv32 --fatal-warnings
 # RUN: llvm-readelf -n out.rv32 | FileCheck --check-prefix=ZICFILP %s
 # RUN: ld.lld f1-s.o f2-s.o f3-s.o -o out --fatal-warnings
 # RUN: llvm-readelf -n out | FileCheck --check-prefix=ZICFILP %s
 # RUN: ld.lld f1-s.o f3-s.o --shared -o out.so --fatal-warnings
 # RUN: llvm-readelf -n out.so | FileCheck --check-prefix=ZICFILP %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -o out.force -z zicfilp=unlabeled --fatal-warnings
+# RUN: llvm-readelf -n out.force | FileCheck --check-prefix=ZICFILP %s
+# RUN: ld.lld f2-s.o f3.o --shared -o out.force.so -z zicfilp=never -z zicfilp=unlabeled --fatal-warnings
+# RUN: llvm-readelf -n out.force.so | FileCheck --check-prefix=ZICFILP %s
 # ZICFILP: Properties: RISC-V feature: ZICFILP-unlabeled
 
 ## ZICFILP-unlabeled should not be enabled if it's not enabled in at least one
@@ -31,21 +38,35 @@
 # RUN: ld.lld f2-s.o f3.o --shared -o out.no.so --fatal-warnings
 # RUN: llvm-readelf -n out.no.so | count 0
 
+## ZICFILP-unlabeled should be disabled with zicfilp=never, even if
+## ZICFILP-unlabeled is present in all inputs.
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp=unlabeled -z zicfilp=never -o out.never --fatal-warnings
+# RUN: llvm-readelf -n out.never | count 0
+
 ## zicfilp-unlabeled-report should report any input files that don't have the
 ## ZICFILP-unlabeled property
 # RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-unlabeled-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-unlabeled-report=warning -z zicfilp=unlabeled 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-unlabeled-report=warning -z zicfilp=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
 # RUN: not ld.lld f2-s.o f3.o --shared -z zicfilp-unlabeled-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
-# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp-unlabeled-report=warning 2>&1 | count 0
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp-unlabeled-report=warning -z zicfilp=never 2>&1 | count 0
 # REPORT-WARN: warning: f2.o: -z zicfilp-unlabeled-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED property
 # REPORT-ERROR: error: f3.o: -z zicfilp-unlabeled-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED property
 
 ## An invalid -z zicfilp-unlabeled-report option should give an error
-# RUN: not ld.lld f2-s.o -z zicfilp-unlabeled-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
+# RUN: not ld.lld f2-s.o -z zicfilp=x -z zicfilp-unlabeled-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
+# INVALID: error: unknown -z zicfilp= value: x
 # INVALID: error: unknown -z zicfilp-unlabeled-report= value: x
 
 ## ZICFILP-unlabeled and ZICFILP-func-sig should conflict with each other
 # RUN: not ld.lld f1-c.o 2>&1 | FileCheck --check-prefix=CONFLICT %s
+# RUN: ld.lld f3-f.o -o out.override -z zicfilp=unlabeled 2>&1 | FileCheck --check-prefix=FORCE-CONFLICT %s
 # CONFLICT: error: f1-c.o: file has conflicting properties: GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED and GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG
+# FORCE-CONFLICT: warning: f3-f.o: -z zicfilp=unlabeled: file has conflicting property: GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG
+
+## -z zicfilp=unlabeled should override and disable ZICFILP-func-sig.
+# RUN: llvm-readelf -n out.override | FileCheck --check-prefixes=ZICFILP,OVERRIDE %s
+# OVERRIDE-NOT: ZICFILP-func-sig
 
 #--- rv32-f1-s.s
 .section ".note.gnu.property", "a"
@@ -219,3 +240,24 @@ ndesc_end:
 .type f3,@function
 f3:
   ret
+
+#--- f3-f.s
+.section ".note.gnu.property", "a"
+.balign 8
+.4byte 4
+.4byte (ndesc_end - ndesc_begin)
+.4byte 0x5        // NT_GNU_PROPERTY_TYPE_0
+.asciz "GNU"
+ndesc_begin:
+.balign 8
+.4byte 0xc0000000 // GNU_PROPERTY_RISCV_FEATURE_1_AND
+.4byte 4
+.4byte 4          // GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG
+.balign 8
+ndesc_end:
+
+.text
+.globl f3
+.type f3,@function
+f3:
+  ret
diff --git a/lld/test/ELF/riscv-feature-zicfiss.s b/lld/test/ELF/riscv-feature-zicfiss.s
index 4623522f5ed7..7b208ddd9b8e 100644
--- a/lld/test/ELF/riscv-feature-zicfiss.s
+++ b/lld/test/ELF/riscv-feature-zicfiss.s
@@ -13,13 +13,17 @@
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3.s   -o f3.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3-s.s -o f3-s.o
 
-## ZICFISS should be enabled when it's enabled in all inputs
+## ZICFISS should be enabled when it's enabled in all inputs or when it's forced on.
 # RUN: ld.lld rv32-f1-s.o rv32-f2-s.o rv32-f3-s.o -o out.rv32 --fatal-warnings
 # RUN: llvm-readelf -n out.rv32 | FileCheck --check-prefix=ZICFISS %s
 # RUN: ld.lld f1-s.o f2-s.o f3-s.o -o out --fatal-warnings
 # RUN: llvm-readelf -n out | FileCheck --check-prefix=ZICFISS %s
 # RUN: ld.lld f1-s.o f3-s.o --shared -o out.so --fatal-warnings
 # RUN: llvm-readelf -n out.so | FileCheck --check-prefix=ZICFISS %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -o out.force -z zicfiss=always --fatal-warnings
+# RUN: llvm-readelf -n out.force | FileCheck --check-prefix=ZICFISS %s
+# RUN: ld.lld f2-s.o f3.o --shared -o out.force.so -z zicfiss=never -z zicfiss=always --fatal-warnings
+# RUN: llvm-readelf -n out.force.so | FileCheck --check-prefix=ZICFISS %s
 # ZICFISS: Properties: RISC-V feature: ZICFISS
 
 ## ZICFISS should not be enabled if it's not enabled in at least one input
@@ -28,17 +32,25 @@
 # RUN: ld.lld f2-s.o f3.o --shared -o out.no.so --fatal-warnings
 # RUN: llvm-readelf -n out.no.so | count 0
 
+## ZICFISS should be disabled with zicfiss=never, even if ZICFISS is present in
+## all inputs.
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss=always -z zicfiss=never -o out.never --fatal-warnings
+# RUN: llvm-readelf -n out.never | count 0
+
 ## zicfiss-report should report any input files that don't have the zicfiss
 ## property
 # RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfiss-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfiss-report=warning -z zicfiss=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfiss-report=warning -z zicfiss=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
 # RUN: not ld.lld f2-s.o f3.o --shared -z zicfiss-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
-# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss-report=warning 2>&1 | count 0
-# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss-report=error 2>&1 | count 0
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss-report=warning -z zicfiss=always 2>&1 | count 0
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss-report=error -z zicfiss=always 2>&1 | count 0
 # REPORT-WARN: warning: f2.o: -z zicfiss-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS property
 # REPORT-ERROR: error: f3.o: -z zicfiss-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS property
 
 ## An invalid -z zicfiss-report option should give an error
-# RUN: not ld.lld f2-s.o f3-s.o -z zicfiss-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
+# RUN: not ld.lld f2-s.o f3-s.o -z zicfiss=x -z zicfiss-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
+# INVALID: error: unknown -z zicfiss= value: x
 # INVALID: error: unknown -z zicfiss-report= value: x
 
 #--- rv32-f1-s.s

From f71fb2dc01e117481f56e040c25391883d43c1c5 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 21:00:22 -0700
Subject: [PATCH 0452/1322] [clang] Use StringRef in range-based for loops
 (NFC) (#144242)

When we iterate over std::vector<std::string>, we can directly assign
each element to StringRef.  We do not need to go through separate
statements.
---
 clang/lib/Basic/TargetInfo.cpp           | 3 +--
 clang/lib/Sema/SemaDeclAttr.cpp          | 6 ++----
 clang/lib/Tooling/ArgumentsAdjusters.cpp | 3 +--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 7b577632fdf5..9429a316a919 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -555,8 +555,7 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) {
 bool TargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeatureVec) const {
-  for (const auto &F : FeatureVec) {
-    StringRef Name = F;
+  for (StringRef Name : FeatureVec) {
     if (Name.empty())
       continue;
     // Apply the feature via the target.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 9c985e6bd5e0..2e826adf9229 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3232,8 +3232,7 @@ bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) {
     if (ParsedAttrs.Duplicate != "")
       return Diag(LiteralLoc, diag::err_duplicate_target_attribute)
              << Duplicate << None << ParsedAttrs.Duplicate << Target;
-    for (const auto &Feature : ParsedAttrs.Features) {
-      StringRef CurFeature = Feature;
+    for (StringRef CurFeature : ParsedAttrs.Features) {
       if (!CurFeature.starts_with('+') && !CurFeature.starts_with('-'))
         return Diag(LiteralLoc, diag::warn_unsupported_target_attribute)
                << Unsupported << None << AttrStr << Target;
@@ -3241,8 +3240,7 @@ bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) {
   }
 
   if (Context.getTargetInfo().getTriple().isLoongArch()) {
-    for (const auto &Feature : ParsedAttrs.Features) {
-      StringRef CurFeature = Feature;
+    for (StringRef CurFeature : ParsedAttrs.Features) {
       if (CurFeature.starts_with("!arch=")) {
         StringRef ArchValue = CurFeature.split("=").second.trim();
         return Diag(LiteralLoc, diag::err_attribute_unsupported)
diff --git a/clang/lib/Tooling/ArgumentsAdjusters.cpp b/clang/lib/Tooling/ArgumentsAdjusters.cpp
index d01c57ee69c0..999fa790124c 100644
--- a/clang/lib/Tooling/ArgumentsAdjusters.cpp
+++ b/clang/lib/Tooling/ArgumentsAdjusters.cpp
@@ -22,8 +22,7 @@ namespace clang {
 namespace tooling {
 
 static StringRef getDriverMode(const CommandLineArguments &Args) {
-  for (const auto &Arg : Args) {
-    StringRef ArgRef = Arg;
+  for (StringRef ArgRef : Args) {
     if (ArgRef.consume_front("--driver-mode=")) {
       return ArgRef;
     }

From 7a4a83b551eaf159ce10b612def3be62d80706d4 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 21:00:29 -0700
Subject: [PATCH 0453/1322] [TableGen] Use range-based for loops (NFC)
 (#144283)

---
 .../TableGen/Common/CodeGenDAGPatterns.cpp    |  4 +--
 .../TableGen/Common/CodeGenDAGPatterns.h      | 14 +++++-----
 .../TableGen/Common/CodeGenInstruction.h      |  3 +--
 .../utils/TableGen/Common/CodeGenSchedule.cpp |  7 ++---
 llvm/utils/TableGen/Common/DAGISelMatcher.cpp |  8 +++---
 llvm/utils/TableGen/DAGISelMatcherGen.cpp     | 27 +++++++++----------
 llvm/utils/TableGen/RegisterInfoEmitter.cpp   | 12 ++++-----
 llvm/utils/TableGen/X86DisassemblerTables.cpp |  6 ++---
 8 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index d33c0dba91fd..810b35e65b31 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -1824,8 +1824,8 @@ bool TreePatternNode::UpdateNodeTypeFromInst(unsigned ResNo,
 }
 
 bool TreePatternNode::ContainsUnresolvedType(TreePattern &TP) const {
-  for (unsigned i = 0, e = Types.size(); i != e; ++i)
-    if (!TP.getInfer().isConcrete(Types[i], true))
+  for (const TypeSetByHwMode &Type : Types)
+    if (!TP.getInfer().isConcrete(Type, true))
       return true;
   for (const TreePatternNode &Child : children())
     if (Child.ContainsUnresolvedType(TP))
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
index a5aadf2ee113..64fec275faa6 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
@@ -747,8 +747,8 @@ public:
 
   /// hasChild - Return true if N is any of our children.
   bool hasChild(const TreePatternNode *N) const {
-    for (unsigned i = 0, e = Children.size(); i != e; ++i)
-      if (Children[i].get() == N)
+    for (const TreePatternNodePtr &Child : Children)
+      if (Child.get() == N)
         return true;
     return false;
   }
@@ -1171,9 +1171,9 @@ public:
   }
 
   const CodeGenIntrinsic &getIntrinsic(const Record *R) const {
-    for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i)
-      if (Intrinsics[i].TheDef == R)
-        return Intrinsics[i];
+    for (const CodeGenIntrinsic &Intrinsic : Intrinsics)
+      if (Intrinsic.TheDef == R)
+        return Intrinsic;
     llvm_unreachable("Unknown intrinsic!");
   }
 
@@ -1280,8 +1280,8 @@ private:
 inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode &N,
                                              TreePattern &TP) const {
   bool MadeChange = false;
-  for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i)
-    MadeChange |= TypeConstraints[i].ApplyTypeConstraint(N, *this, TP);
+  for (const SDTypeConstraint &TypeConstraint : TypeConstraints)
+    MadeChange |= TypeConstraint.ApplyTypeConstraint(N, *this, TP);
   return MadeChange;
 }
 
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h
index e38979af3909..3a5abc55319b 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h
@@ -127,8 +127,7 @@ public:
     /// getTiedOperand - If this operand is tied to another one, return the
     /// other operand number.  Otherwise, return -1.
     int getTiedRegister() const {
-      for (unsigned j = 0, e = Constraints.size(); j != e; ++j) {
-        const CGIOperandList::ConstraintInfo &CI = Constraints[j];
+      for (const CGIOperandList::ConstraintInfo &CI : Constraints) {
         if (CI.isTied())
           return CI.getTiedOperand();
       }
diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
index 72954804b66f..af7e43929bcf 100644
--- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
@@ -2232,13 +2232,10 @@ void PredTransitions::dump() const {
       dbgs() << LS << SchedModels.getSchedRW(PC.RWIdx, PC.IsRead).Name << ":"
              << PC.Predicate->getName();
     dbgs() << "},\n  => {";
-    for (SmallVectorImpl<SmallVector<unsigned, 4>>::const_iterator
-             WSI = TI.WriteSequences.begin(),
-             WSE = TI.WriteSequences.end();
-         WSI != WSE; ++WSI) {
+    for (const auto &WS : TI.WriteSequences) {
       dbgs() << "(";
       ListSeparator LS;
-      for (unsigned N : *WSI)
+      for (unsigned N : WS)
         dbgs() << LS << SchedModels.getSchedWrite(N).Name;
       dbgs() << "),";
     }
diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.cpp b/llvm/utils/TableGen/Common/DAGISelMatcher.cpp
index 8780c4f5b61c..3543bb5a55c6 100644
--- a/llvm/utils/TableGen/Common/DAGISelMatcher.cpp
+++ b/llvm/utils/TableGen/Common/DAGISelMatcher.cpp
@@ -286,11 +286,11 @@ void EmitNodeMatcherCommon::printImpl(raw_ostream &OS, indent Indent) const {
   OS << (isa<MorphNodeToMatcher>(this) ? "MorphNodeTo: " : "EmitNode: ")
      << CGI.Namespace << "::" << CGI.TheDef->getName() << ": <todo flags> ";
 
-  for (unsigned i = 0, e = VTs.size(); i != e; ++i)
-    OS << ' ' << getEnumName(VTs[i]);
+  for (MVT::SimpleValueType VT : VTs)
+    OS << ' ' << getEnumName(VT);
   OS << '(';
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i)
-    OS << Operands[i] << ' ';
+  for (unsigned Operand : Operands)
+    OS << Operand << ' ';
   OS << ")\n";
 }
 
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index afdb6879eede..0039ff4f3e2d 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -526,23 +526,20 @@ void MatcherGen::EmitMatchCode(const TreePatternNode &N,
     EmitOperatorMatchCode(N, NodeNoTypes);
 
   // If there are node predicates for this node, generate their checks.
-  for (unsigned i = 0, e = N.getPredicateCalls().size(); i != e; ++i) {
-    const TreePredicateCall &Pred = N.getPredicateCalls()[i];
+  for (const TreePredicateCall &Pred : N.getPredicateCalls()) {
     SmallVector<unsigned, 4> Operands;
     if (Pred.Fn.usesOperands()) {
       TreePattern *TP = Pred.Fn.getOrigPatFragRecord();
-      for (unsigned i = 0; i < TP->getNumArgs(); ++i) {
-        std::string Name =
-            ("pred:" + Twine(Pred.Scope) + ":" + TP->getArgName(i)).str();
+      for (const std::string &Arg : TP->getArgList()) {
+        std::string Name = ("pred:" + Twine(Pred.Scope) + ":" + Arg).str();
         Operands.push_back(getNamedArgumentSlot(Name));
       }
     }
     AddMatcher(new CheckPredicateMatcher(Pred.Fn, Operands));
   }
 
-  for (unsigned i = 0, e = ResultsToTypeCheck.size(); i != e; ++i)
-    AddMatcher(new CheckTypeMatcher(N.getSimpleType(ResultsToTypeCheck[i]),
-                                    ResultsToTypeCheck[i]));
+  for (unsigned I : ResultsToTypeCheck)
+    AddMatcher(new CheckTypeMatcher(N.getSimpleType(I), I));
 }
 
 /// EmitMatcherCode - Generate the code that matches the predicate of this
@@ -836,8 +833,8 @@ void MatcherGen::EmitResultInstructionAsOperand(
       // overridden, or which we aren't letting it override; emit the 'default
       // ops' operands.
       const DAGDefaultOperand &DefaultOp = CGP.getDefaultOperand(OperandNode);
-      for (unsigned i = 0, e = DefaultOp.DefaultOps.size(); i != e; ++i)
-        EmitResultOperand(*DefaultOp.DefaultOps[i], InstOps);
+      for (const TreePatternNodePtr &Op : DefaultOp.DefaultOps)
+        EmitResultOperand(*Op, InstOps);
       continue;
     }
 
@@ -886,10 +883,10 @@ void MatcherGen::EmitResultInstructionAsOperand(
   if (isRoot && !PhysRegInputs.empty()) {
     // Emit all of the CopyToReg nodes for the input physical registers.  These
     // occur in patterns like (mul:i8 AL:i8, GR8:i8:$src).
-    for (unsigned i = 0, e = PhysRegInputs.size(); i != e; ++i) {
+    for (const auto &PhysRegInput : PhysRegInputs) {
       const CodeGenRegister *Reg =
-          CGP.getTargetInfo().getRegBank().getReg(PhysRegInputs[i].first);
-      AddMatcher(new EmitCopyToRegMatcher(PhysRegInputs[i].second, Reg));
+          CGP.getTargetInfo().getRegBank().getReg(PhysRegInput.first);
+      AddMatcher(new EmitCopyToRegMatcher(PhysRegInput.second, Reg));
     }
 
     // Even if the node has no other glue inputs, the resultant node must be
@@ -977,8 +974,8 @@ void MatcherGen::EmitResultInstructionAsOperand(
                                  NumFixedArityOperands, NextRecordedOperandNo));
 
   // The non-chain and non-glue results of the newly emitted node get recorded.
-  for (unsigned i = 0, e = ResultVTs.size(); i != e; ++i) {
-    if (ResultVTs[i] == MVT::Other || ResultVTs[i] == MVT::Glue)
+  for (MVT::SimpleValueType ResultVT : ResultVTs) {
+    if (ResultVT == MVT::Other || ResultVT == MVT::Glue)
       break;
     OutputOps.push_back(NextRecordedOperandNo++);
   }
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index bc1650a4acf0..7d24c0f80cdd 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -726,11 +726,12 @@ void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
   // Output the rows.
   OS << "  static const " << getMinimalTypeForRange(SubRegIndicesSize + 1, 32)
      << " Rows[" << Rows.size() << "][" << SubRegIndicesSize << "] = {\n";
-  for (unsigned r = 0, re = Rows.size(); r != re; ++r) {
+  for (const auto &Row : Rows) {
     OS << "    { ";
-    for (unsigned i = 0, e = SubRegIndicesSize; i != e; ++i)
-      if (Rows[r][i])
-        OS << Rows[r][i]->getQualifiedName() << ", ";
+    for (const llvm::CodeGenSubRegIndex *Elem :
+         ArrayRef(&Row[0], SubRegIndicesSize))
+      if (Elem)
+        OS << Elem->getQualifiedName() << ", ";
       else
         OS << "0, ";
     OS << "},\n";
@@ -830,8 +831,7 @@ void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
   for (size_t s = 0, se = Sequences.size(); s != se; ++s) {
     OS << "    ";
     const SmallVectorImpl<MaskRolPair> &Sequence = Sequences[s];
-    for (size_t p = 0, pe = Sequence.size(); p != pe; ++p) {
-      const MaskRolPair &P = Sequence[p];
+    for (const MaskRolPair &P : Sequence) {
       printMask(OS << "{ ", P.Mask);
       OS << format(", %2u }, ", P.RotateLeft);
     }
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index 36f752a1ebe6..3c422a32dcaf 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -882,9 +882,9 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o,
     N = ++OperandSetNum;
 
     o << "  { /* " << (OperandSetNum - 1) << " */\n";
-    for (unsigned i = 0, e = OperandList.size(); i != e; ++i) {
-      const char *Encoding = stringForOperandEncoding(OperandList[i].first);
-      const char *Type = stringForOperandType(OperandList[i].second);
+    for (const auto &[Enc, Ty] : OperandList) {
+      const char *Encoding = stringForOperandEncoding(Enc);
+      const char *Type = stringForOperandType(Ty);
       o << "    { " << Encoding << ", " << Type << " },\n";
     }
     o << "  },\n";

From c01532177ff61a768d5dc1ea541f9a8d986497fa Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 21:00:36 -0700
Subject: [PATCH 0454/1322] [clang] Remove unused includes (NFC) (#144285)

These are identified by misc-include-cleaner.  I've filtered out those
that break builds.  Also, I'm staying away from llvm-config.h,
config.h, and Compiler.h, which likely cause platform- or
compiler-specific build failures.
---
 clang/lib/Analysis/UnsafeBufferUsage.cpp                       | 1 -
 clang/lib/Basic/LangOptions.cpp                                | 1 -
 clang/lib/CodeGen/CGBuiltin.cpp                                | 1 -
 clang/lib/CodeGen/CGHLSLRuntime.cpp                            | 1 -
 clang/lib/Edit/EditedSource.cpp                                | 2 --
 clang/lib/ExtractAPI/ExtractAPIConsumer.cpp                    | 1 -
 clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp   | 2 --
 clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp             | 1 -
 clang/lib/InstallAPI/DirectoryScanner.cpp                      | 1 -
 clang/lib/InstallAPI/FileList.cpp                              | 2 --
 clang/lib/InstallAPI/Frontend.cpp                              | 1 -
 clang/lib/InstallAPI/Visitor.cpp                               | 1 -
 clang/lib/Interpreter/InterpreterValuePrinter.cpp              | 2 --
 clang/lib/Interpreter/Value.cpp                                | 3 ---
 clang/lib/Lex/HeaderMap.cpp                                    | 1 -
 clang/lib/Rewrite/HTMLRewrite.cpp                              | 2 --
 clang/lib/Sema/SemaDeclAttr.cpp                                | 1 -
 clang/lib/Sema/SemaExprCXX.cpp                                 | 1 -
 .../Checkers/RetainCountChecker/RetainCountChecker.cpp         | 1 -
 clang/lib/Support/RISCVVIntrinsicUtils.cpp                     | 3 ---
 20 files changed, 29 deletions(-)

diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index 8b1ca6b80971..631a546b45ff 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -30,7 +30,6 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Casting.h"
 #include <cstddef>
 #include <optional>
 #include <queue>
diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp
index 7e696620993f..912b890569cf 100644
--- a/clang/lib/Basic/LangOptions.cpp
+++ b/clang/lib/Basic/LangOptions.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/Path.h"
 
 using namespace clang;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c0b02a104d95..1f6927435167 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17,7 +17,6 @@
 #include "CGDebugInfo.h"
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
-#include "CGPointerAuthInfo.h"
 #include "CGRecordLayout.h"
 #include "CGValue.h"
 #include "CodeGenFunction.h"
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 977ff792bae2..571ff53b7d64 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -35,7 +35,6 @@
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
-#include <utility>
 
 using namespace clang;
 using namespace CodeGen;
diff --git a/clang/lib/Edit/EditedSource.cpp b/clang/lib/Edit/EditedSource.cpp
index a3386b2489b0..398cce71d5e2 100644
--- a/clang/lib/Edit/EditedSource.cpp
+++ b/clang/lib/Edit/EditedSource.cpp
@@ -16,10 +16,8 @@
 #include "clang/Edit/FileOffset.h"
 #include "clang/Lex/Lexer.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
-#include <algorithm>
 #include <cassert>
 #include <tuple>
 #include <utility>
diff --git a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp
index 764c345a9db9..1087eb300185 100644
--- a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp
+++ b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp
@@ -43,7 +43,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Regex.h"
diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
index e881d56258e5..139023f32e8d 100644
--- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
+++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
@@ -19,14 +19,12 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VersionTuple.h"
 #include "llvm/Support/raw_ostream.h"
 #include <iterator>
 #include <optional>
-#include <type_traits>
 
 using namespace clang;
 using namespace clang::extractapi;
diff --git a/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp b/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp
index fd9db8113a41..37b428216c91 100644
--- a/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp
+++ b/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp
@@ -8,7 +8,6 @@
 
 #include "DiagnosticBuilderWrappers.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TextAPI/Platform.h"
 
diff --git a/clang/lib/InstallAPI/DirectoryScanner.cpp b/clang/lib/InstallAPI/DirectoryScanner.cpp
index be43a96f3d97..f8f708fda4ca 100644
--- a/clang/lib/InstallAPI/DirectoryScanner.cpp
+++ b/clang/lib/InstallAPI/DirectoryScanner.cpp
@@ -9,7 +9,6 @@
 #include "clang/InstallAPI/DirectoryScanner.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/TextAPI/DylibReader.h"
 
 using namespace llvm;
 using namespace llvm::MachO;
diff --git a/clang/lib/InstallAPI/FileList.cpp b/clang/lib/InstallAPI/FileList.cpp
index 65610903840a..8f8ed6e8a5db 100644
--- a/clang/lib/InstallAPI/FileList.cpp
+++ b/clang/lib/InstallAPI/FileList.cpp
@@ -6,8 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/InstallAPI/FileList.h"
-#include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/InstallAPI/FileList.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Error.h"
diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp
index 9e8c60fbda3d..cce0b19b5061 100644
--- a/clang/lib/InstallAPI/Frontend.cpp
+++ b/clang/lib/InstallAPI/Frontend.cpp
@@ -9,7 +9,6 @@
 #include "clang/InstallAPI/Frontend.h"
 #include "clang/AST/Availability.h"
 #include "clang/InstallAPI/FrontendRecords.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 
 using namespace llvm;
diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp
index a73ea0b0d124..487be2c30088 100644
--- a/clang/lib/InstallAPI/Visitor.cpp
+++ b/clang/lib/InstallAPI/Visitor.cpp
@@ -13,7 +13,6 @@
 #include "clang/Basic/Linkage.h"
 #include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/FrontendRecords.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Mangler.h"
diff --git a/clang/lib/Interpreter/InterpreterValuePrinter.cpp b/clang/lib/Interpreter/InterpreterValuePrinter.cpp
index 3e3fbfd172ca..3e7e32b2e855 100644
--- a/clang/lib/Interpreter/InterpreterValuePrinter.cpp
+++ b/clang/lib/Interpreter/InterpreterValuePrinter.cpp
@@ -18,7 +18,6 @@
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Interpreter/Interpreter.h"
 #include "clang/Interpreter/Value.h"
-#include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Sema.h"
 
@@ -26,7 +25,6 @@
 #include "llvm/Support/raw_ostream.h"
 
 #include <cassert>
-#include <string>
 
 #include <cstdarg>
 
diff --git a/clang/lib/Interpreter/Value.cpp b/clang/lib/Interpreter/Value.cpp
index eb2ce9c9fd33..afdf406b3725 100644
--- a/clang/lib/Interpreter/Value.cpp
+++ b/clang/lib/Interpreter/Value.cpp
@@ -16,10 +16,7 @@
 #include "clang/AST/Type.h"
 #include "clang/Interpreter/Interpreter.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_os_ostream.h"
 #include <cassert>
-#include <cstdint>
 #include <utility>
 
 namespace {
diff --git a/clang/lib/Lex/HeaderMap.cpp b/clang/lib/Lex/HeaderMap.cpp
index 588b32ee9ca8..a7b670f00ac6 100644
--- a/clang/lib/Lex/HeaderMap.cpp
+++ b/clang/lib/Lex/HeaderMap.cpp
@@ -18,7 +18,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/SystemZ/zOSSupport.h"
 #include <cstring>
 #include <memory>
 #include <optional>
diff --git a/clang/lib/Rewrite/HTMLRewrite.cpp b/clang/lib/Rewrite/HTMLRewrite.cpp
index 1829a4ff3504..109cdf990543 100644
--- a/clang/lib/Rewrite/HTMLRewrite.cpp
+++ b/clang/lib/Rewrite/HTMLRewrite.cpp
@@ -17,9 +17,7 @@
 #include "clang/Lex/TokenConcatenation.h"
 #include "clang/Rewrite/Core/Rewriter.h"
 #include "llvm/ADT/RewriteBuffer.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 2e826adf9229..1c2fa80e782d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -53,7 +53,6 @@
 #include "clang/Sema/SemaOpenCL.h"
 #include "clang/Sema/SemaOpenMP.h"
 #include "clang/Sema/SemaRISCV.h"
-#include "clang/Sema/SemaSPIRV.h"
 #include "clang/Sema/SemaSYCL.h"
 #include "clang/Sema/SemaSwift.h"
 #include "clang/Sema/SemaWasm.h"
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index ba52e8f8932d..4a86cbd0633b 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -40,7 +40,6 @@
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaHLSL.h"
-#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/SemaLambda.h"
 #include "clang/Sema/SemaObjC.h"
 #include "clang/Sema/SemaPPC.h"
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
index a00a09f60fd5..62bc3218d9ce 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "RetainCountChecker.h"
-#include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include <optional>
 
diff --git a/clang/lib/Support/RISCVVIntrinsicUtils.cpp b/clang/lib/Support/RISCVVIntrinsicUtils.cpp
index daf09ac66f21..5a4e805d4a9d 100644
--- a/clang/lib/Support/RISCVVIntrinsicUtils.cpp
+++ b/clang/lib/Support/RISCVVIntrinsicUtils.cpp
@@ -8,13 +8,10 @@
 
 #include "clang/Support/RISCVVIntrinsicUtils.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include <numeric>
 #include <optional>
 
 using namespace llvm;

From cab09e76e0c4c95f44cf90bf2bf7a6eaa15b14b2 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Mon, 16 Jun 2025 12:07:47 +0800
Subject: [PATCH 0455/1322] [InstCombine] Propagate FMF from fptrunc when
 folding `fptrunc fabs(X) -> fabs(fptrunc X)` (#143352)

Alive2: https://alive2.llvm.org/ce/z/DWV3G3
fptrunc yields infinity when the input cannot fit in the target type. So
ninf should be propagated from fptrunc. For other intrinsics, the
previous check ensures that the result is never an infinity:

https://github.com/llvm/llvm-project/blob/5d3899d293e902124c3602b466031b6b799fb123/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp#L1910-L1917

Closes https://github.com/llvm/llvm-project/issues/143122.
---
 .../InstCombine/InstCombineCasts.cpp          |  4 ++-
 .../InstCombine/double-float-shrink-2.ll      |  4 +--
 llvm/test/Transforms/InstCombine/fabs.ll      |  2 +-
 llvm/test/Transforms/InstCombine/fpcast.ll    | 35 ++++++++++++++++++-
 4 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 2db79228bf0e..d4a2fe5e37ef 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1917,7 +1917,9 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
       II->getOperandBundlesAsDefs(OpBundles);
       CallInst *NewCI =
           CallInst::Create(Overload, {InnerTrunc}, OpBundles, II->getName());
-      NewCI->copyFastMathFlags(II);
+      // A normal value may be converted to an infinity. It means that we cannot
+      // propagate ninf from the intrinsic. So we propagate FMF from fptrunc.
+      NewCI->copyFastMathFlags(&FPT);
       return NewCI;
     }
     }
diff --git a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
index f2049e2813eb..f884d2bd1ab5 100644
--- a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
+++ b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
@@ -449,7 +449,7 @@ define float @test_shrink_intrin_fabs_fast_double_src(double %D) {
 ; CHECK-NEXT:    ret float [[F]]
 ;
   %E = call fast double @llvm.fabs.f64(double %D)
-  %F = fptrunc double %E to float
+  %F = fptrunc fast double %E to float
   ret float %F
 }
 
@@ -611,7 +611,7 @@ define half @test_mismatched_type_intrin_fabs_fast_double_src(double %D) {
 ; CHECK-NEXT:    ret half [[F]]
 ;
   %E = call fast double @llvm.fabs.f64(double %D)
-  %F = fptrunc double %E to half
+  %F = fptrunc fast double %E to half
   ret half %F
 }
 
diff --git a/llvm/test/Transforms/InstCombine/fabs.ll b/llvm/test/Transforms/InstCombine/fabs.ll
index 0a22d1431b5f..0d9374410a1d 100644
--- a/llvm/test/Transforms/InstCombine/fabs.ll
+++ b/llvm/test/Transforms/InstCombine/fabs.ll
@@ -1522,7 +1522,7 @@ define float @test_fabs_nsz_used_by_frem(float %x) {
 define half @test_fabs_nsz_used_by_fptrunc(float %x) {
 ; CHECK-LABEL: @test_fabs_nsz_used_by_fptrunc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[X:%.*]] to half
-; CHECK-NEXT:    [[OP:%.*]] = call nsz half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    [[OP:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
 ; CHECK-NEXT:    ret half [[OP]]
 ;
   %cmp = fcmp oge float %x, 0.000000e+00
diff --git a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll
index d5290b572aef..1a3faceebf24 100644
--- a/llvm/test/Transforms/InstCombine/fpcast.ll
+++ b/llvm/test/Transforms/InstCombine/fpcast.ll
@@ -32,7 +32,7 @@ define half @test3(float %a) {
 define half @test3_fast(float %a) {
 ; CHECK-LABEL: @test3_fast(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
-; CHECK-NEXT:    [[C:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    [[C:%.*]] = call fast half @llvm.fabs.f16(half [[TMP1]])
 ; CHECK-NEXT:    ret half [[C]]
 ;
   %b = call float @llvm.fabs.f32(float %a)
@@ -40,6 +40,39 @@ define half @test3_fast(float %a) {
   ret half %c
 }
 
+define half @test3_both_ninf(float %a) {
+; CHECK-LABEL: @test3_both_ninf(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT:    [[C:%.*]] = call ninf half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    ret half [[C]]
+;
+  %b = call ninf float @llvm.fabs.f32(float %a)
+  %c = fptrunc ninf float %b to half
+  ret half %c
+}
+
+define half @test3_fabs_ninf(float %a) {
+; CHECK-LABEL: @test3_fabs_ninf(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT:    [[C:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    ret half [[C]]
+;
+  %b = call ninf float @llvm.fabs.f32(float %a)
+  %c = fptrunc float %b to half
+  ret half %c
+}
+
+define half @test3_fptrunc_ninf(float %a) {
+; CHECK-LABEL: @test3_fptrunc_ninf(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT:    [[C:%.*]] = call ninf half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    ret half [[C]]
+;
+  %b = call float @llvm.fabs.f32(float %a)
+  %c = fptrunc ninf float %b to half
+  ret half %c
+}
+
 define half @fneg_fptrunc(float %a) {
 ; CHECK-LABEL: @fneg_fptrunc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half

From e2afda6fc95ef63b54d449fc1a9eb13cd0ff3639 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Mon, 16 Jun 2025 12:15:30 +0800
Subject: [PATCH 0456/1322] [RISCV] Fix incorrect predicates for fp16
 permutation intrinsics (#144063)

vrgatherei16, vslideup and vslidedown should be supported with fp16 type
for Zvfhmin.

Fixes https://github.com/llvm/llvm-project/issues/143975.
---
 clang/include/clang/Basic/riscv_vector.td           |  2 +-
 clang/include/clang/Basic/riscv_vector_common.td    |  4 ++--
 .../non-policy/non-overloaded/vrgatherei16.c        |  2 +-
 .../non-policy/non-overloaded/vslidedown.c          |  2 +-
 .../non-policy/non-overloaded/vslideup.c            |  2 +-
 .../non-policy/overloaded/vrgatherei16.c            |  2 +-
 .../non-policy/overloaded/vslidedown.c              |  2 +-
 .../non-policy/overloaded/vslideup.c                |  2 +-
 .../policy/non-overloaded/vrgatherei16.c            |  2 +-
 .../policy/non-overloaded/vslidedown.c              |  2 +-
 .../policy/non-overloaded/vslideup.c                |  2 +-
 .../policy/overloaded/vrgatherei16.c                |  2 +-
 .../policy/overloaded/vslidedown.c                  |  2 +-
 .../policy/overloaded/vslideup.c                    |  2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td     | 13 ++++++++++---
 llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll         |  4 ++--
 llvm/test/CodeGen/RISCV/rvv/vslidedown.ll           |  4 ++--
 llvm/test/CodeGen/RISCV/rvv/vslideup.ll             |  4 ++--
 18 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td
index 3e22bfb330af..c6fd8a1a45fd 100644
--- a/clang/include/clang/Basic/riscv_vector.td
+++ b/clang/include/clang/Basic/riscv_vector.td
@@ -2397,7 +2397,7 @@ let RequiredFeatures = ["zvfbfmin"] in {
 }
 defm vrgatherei16 : RVVOutBuiltinSet<"vrgatherei16_vv", "csilfd",
                                      [["vv", "v", "vv(Log2EEW:4)Uv"]]>;
-let RequiredFeatures = ["zvfh"] in
+let RequiredFeatures = ["zvfhmin"] in
 defm vrgatherei16 : RVVOutBuiltinSet<"vrgatherei16_vv", "x",
                                      [["vv", "v", "vv(Log2EEW:4)Uv"]]>;
 // unsigned type
diff --git a/clang/include/clang/Basic/riscv_vector_common.td b/clang/include/clang/Basic/riscv_vector_common.td
index c6753978274a..e3d589699645 100644
--- a/clang/include/clang/Basic/riscv_vector_common.td
+++ b/clang/include/clang/Basic/riscv_vector_common.td
@@ -593,7 +593,7 @@ let UnMaskedPolicyScheme = HasPolicyOperand,
   multiclass RVVSlideUpBuiltinSet {
     defm "" : RVVOutBuiltinSet<NAME, "csilfd",
                                [["vx","v", "vvvz"]]>;
-    let RequiredFeatures = ["zvfh"] in
+    let RequiredFeatures = ["zvfhmin"] in
       defm "" : RVVOutBuiltinSet<NAME, "x",
                                  [["vx","v", "vvvz"]]>;
     defm "" : RVVOutBuiltinSet<NAME, "csil",
@@ -618,7 +618,7 @@ let UnMaskedPolicyScheme = HasPassthruOperand,
   multiclass RVVSlideDownBuiltinSet {
     defm "" : RVVOutBuiltinSet<NAME, "csilfd",
                                [["vx","v", "vvz"]]>;
-    let RequiredFeatures = ["zvfh"] in
+    let RequiredFeatures = ["zvfhmin"] in
       defm "" : RVVOutBuiltinSet<NAME, "x",
                                  [["vx","v", "vvz"]]>;
     defm "" : RVVOutBuiltinSet<NAME, "csil",
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vrgatherei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vrgatherei16.c
index 32469731d114..41214f7cdce2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vrgatherei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vrgatherei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslidedown.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslidedown.c
index c4e7d86e7d53..8b97ce8f760c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslidedown.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslidedown.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslideup.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslideup.c
index 0ab387525f6a..c302b2940bc6 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslideup.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslideup.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vrgatherei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vrgatherei16.c
index f69613c4777f..a63f0a59a34e 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vrgatherei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vrgatherei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslidedown.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslidedown.c
index 14b3a99a6f0f..fb99a750a670 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslidedown.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslidedown.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslideup.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslideup.c
index 1b3c3f6c0f85..77e8122890ab 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslideup.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslideup.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vrgatherei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vrgatherei16.c
index e22da32dbfa8..cf98549c41af 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vrgatherei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vrgatherei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslidedown.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslidedown.c
index 205866db3566..4f1c00bef076 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslidedown.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslidedown.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslideup.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslideup.c
index b32264fd88e7..c9fa994e51b3 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslideup.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslideup.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vrgatherei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vrgatherei16.c
index 3d53e46b4885..c50f1f731ffb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vrgatherei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vrgatherei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslidedown.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslidedown.c
index c275ee9bb2f6..476b9b59dc19 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslidedown.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslidedown.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslideup.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslideup.c
index 9bd602fa5d76..1e0228e17caf 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslideup.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslideup.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 22b5b52541d6..5e554d2d0391 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -7414,8 +7414,12 @@ defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllInteger
 defm : VPatBinaryV_VX<"int_riscv_vslide1up", "PseudoVSLIDE1UP", AllIntegerVectors>;
 defm : VPatBinaryV_VX<"int_riscv_vslide1down", "PseudoVSLIDE1DOWN", AllIntegerVectors>;
 
-defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectors, uimm5>;
-defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectors, uimm5>;
+defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectorsExceptFP16, uimm5>;
+let Predicates = [HasVInstructionsF16Minimal] in
+  defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFP16Vectors, uimm5>;
+defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectorsExceptFP16, uimm5>;
+let Predicates = [HasVInstructionsF16Minimal] in
+  defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFP16Vectors, uimm5>;
 defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP", AllFloatVectors>;
 defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVectors>;
 
@@ -7432,7 +7436,10 @@ defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
 defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
                                 AllBFloatVectors, uimm5>;
 defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
-                              eew=16, vtilist=AllFloatVectors>;
+                              eew=16, vtilist=AllFloatVectorsExceptFP16>;
+let Predicates = [HasVInstructionsF16Minimal] in
+  defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
+                                eew=16, vtilist=AllFP16Vectors>;
 //===----------------------------------------------------------------------===//
 // 16.5. Vector Compress Instruction
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
index f386fd9cd3ae..75c00e406b4f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.nxv1i8(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll b/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll
index 2be187c50af2..f0f78c211c4a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vslidedown.nxv1i8(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslideup.ll b/llvm/test/CodeGen/RISCV/rvv/vslideup.ll
index 1e3ede7fee9c..8e3c05611bc7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslideup.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslideup.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vslideup.nxv1i8(

From 29fcad000ca63078d28dd231e0727b7811df43b0 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 21:27:12 -0700
Subject: [PATCH 0457/1322] AVR: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
index bf512dd07c0a..1e4b2e27a183 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
@@ -14,6 +14,7 @@
 
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -133,7 +134,7 @@ void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     O << formatImm(Op.getImm());
   } else {
     assert(Op.isExpr() && "Unknown operand kind in printOperand");
-    O << *Op.getExpr();
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 

From f8e0518120cd2850a7f674322bf428bc7d7d3326 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 21:46:08 -0700
Subject: [PATCH 0458/1322] MC: Adjust -show-inst output for MCExpr

This dump feature does not pass MCAsmInfo to the printer function.
When we remove MCSpecifierExpr subclasses (and the printImpl overrides),
we will not be able to print target-specific specifier strings.
Just print a textual representation.
---
 llvm/lib/MC/MCExpr.cpp                   |   13 +-
 llvm/lib/MC/MCInst.cpp                   |    2 +-
 llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll |  496 ++---
 llvm/test/CodeGen/Mips/llvm-ir/load.ll   | 2316 +++++++++++-----------
 llvm/test/CodeGen/Mips/llvm-ir/store.ll  | 1308 ++++++------
 llvm/test/MC/Lanai/conditional_inst.s    |    6 +-
 llvm/test/MC/Lanai/memory.s              |    4 +-
 7 files changed, 2075 insertions(+), 2070 deletions(-)

diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 5ccad6d48797..89191294f3ed 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -173,10 +173,15 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI,
     return;
   }
 
-  case MCExpr::Specifier:
-    // TODO: Remove after all targets that use MCSpecifierExpr migrate to
-    // MCAsmInfo::printSpecifierExpr.
-    return cast<MCSpecifierExpr>(this)->printImpl(OS, MAI);
+  case MCExpr::Specifier: {
+    auto &SE = cast<MCSpecifierExpr>(*this);
+    if (MAI)
+      return MAI->printSpecifierExpr(OS, SE);
+    // Used by dump features like -show-inst. Regular MCAsmStreamer output must
+    // set MAI.
+    OS << "specifier(" << SE.getSpecifier() << ',' << *SE.getSubExpr() << ')';
+    return;
+  }
   }
 
   llvm_unreachable("Invalid expression kind!");
diff --git a/llvm/lib/MC/MCInst.cpp b/llvm/lib/MC/MCInst.cpp
index 639619fe4e99..832d25060f88 100644
--- a/llvm/lib/MC/MCInst.cpp
+++ b/llvm/lib/MC/MCInst.cpp
@@ -35,7 +35,7 @@ void MCOperand::print(raw_ostream &OS, const MCRegisterInfo *RegInfo) const {
   else if (isDFPImm())
     OS << "DFPImm:" << bit_cast<double>(getDFPImm());
   else if (isExpr()) {
-    OS << "Expr:(" << *getExpr() << ")";
+    OS << "Expr:" << *getExpr();
   } else if (isInst()) {
     OS << "Inst:(";
     if (const auto *Inst = getInst())
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll b/llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll
index 3bf17abc7965..79fe2fd26a6e 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll
@@ -38,189 +38,189 @@
 define i32 @test1(float %t) {
 ; M32-LABEL: test1:
 ; M32:       # %bb.0: # %entry
-; M32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M32-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; M32-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R2-FP64-LABEL: test1:
 ; M32R2-FP64:       # %bb.0: # %entry
-; M32R2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M32R2-FP64-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R2-SF-LABEL: test1:
 ; M32R2-SF:       # %bb.0: # %entry
-; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #[[#MCINST4:]] ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:-24>>
 ; M32R2-SF-NEXT:    .cfi_def_cfa_offset 24
 ; M32R2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    # <MCInst #[[#MCINST5:]] SW
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
 ; M32R2-SF-NEXT:    .cfi_offset 31, -4
-; M32R2-SF-NEXT:    jal __fixsfsi # <MCInst #{{[0-9]+}} JAL
-; M32R2-SF-NEXT:    # <MCOperand Expr:(__fixsfsi)>>
-; M32R2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    jal __fixsfsi # <MCInst #[[#MCINST6:]] JAL
+; M32R2-SF-NEXT:    # <MCOperand Expr:__fixsfsi>>
+; M32R2-SF-NEXT:    nop # <MCInst #[[#MCINST7:]] SLL
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:0>>
 ; M32R2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    # <MCInst #[[#MCINST8:]] LW
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
-; M32R2-SF-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #[[#MCINST4]] ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:24>>
 ;
 ; M32R3R5-LABEL: test1:
 ; M32R3R5:       # %bb.0: # %entry
-; M32R3R5-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R3R5-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R3R5-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M32R3R5-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R6-LABEL: test1:
 ; M32R6:       # %bb.0: # %entry
-; M32R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M32R6-NEXT:    jr $ra # <MCInst #[[#MCINST9:]] JALR
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M64-LABEL: test1:
 ; M64:       # %bb.0: # %entry
-; M64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M64-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>>
+; M64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M64R6-LABEL: test1:
 ; M64R6:       # %bb.0: # %entry
-; M64R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M64R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M64R6-NEXT:    jr $ra # <MCInst #[[#MCINST10:]] JALR64
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>>
+; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-FP32-LABEL: test1:
 ; MMR2-FP32:       # %bb.0: # %entry
-; MMR2-FP32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-FP32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST11:]] TRUNC_W_S_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR2-FP32-NEXT:    jr $ra # <MCInst #[[#MCINST12:]] JR_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13:]] MFC1_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-FP64-LABEL: test1:
 ; MMR2-FP64:       # %bb.0: # %entry
-; MMR2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST11:]] TRUNC_W_S_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR2-FP64-NEXT:    jr $ra # <MCInst #[[#MCINST12:]] JR_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13:]] MFC1_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-SF-LABEL: test1:
 ; MMR2-SF:       # %bb.0: # %entry
-; MMR2-SF-NEXT:    addiusp -24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    addiusp -24 # <MCInst #[[#MCINST14:]] ADDIUSP_MM
 ; MMR2-SF-NEXT:    # <MCOperand Imm:-24>>
 ; MMR2-SF-NEXT:    .cfi_def_cfa_offset 24
 ; MMR2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} SWSP_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    # <MCInst #[[#MCINST15:]] SWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
 ; MMR2-SF-NEXT:    .cfi_offset 31, -4
-; MMR2-SF-NEXT:    jal __fixsfsi # <MCInst #{{[0-9]+}} JAL_MM
-; MMR2-SF-NEXT:    # <MCOperand Expr:(__fixsfsi)>>
-; MMR2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    jal __fixsfsi # <MCInst #[[#MCINST16:]] JAL_MM
+; MMR2-SF-NEXT:    # <MCOperand Expr:__fixsfsi>>
+; MMR2-SF-NEXT:    nop # <MCInst #[[#MCINST17:]] SLL_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:0>>
 ; MMR2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} LWSP_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    # <MCInst #[[#MCINST18:]] LWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
-; MMR2-SF-NEXT:    addiusp 24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    addiusp 24 # <MCInst #[[#MCINST14]] ADDIUSP_MM
 ; MMR2-SF-NEXT:    # <MCOperand Imm:24>>
-; MMR2-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-SF-NEXT:    jrc $ra # <MCInst #[[#MCINST19:]] JRC16_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 ;
 ; MMR6-LABEL: test1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S_MMR6
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST20:]] TRUNC_W_S_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13:]] MFC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST19:]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
 ;
 ; MMR6-SF-LABEL: test1:
 ; MMR6-SF:       # %bb.0: # %entry
-; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #[[#MCINST4:]] ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:-24>>
 ; MMR6-SF-NEXT:    .cfi_def_cfa_offset 24
 ; MMR6-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    # <MCInst #[[#MCINST5:]] SW
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
 ; MMR6-SF-NEXT:    .cfi_offset 31, -4
-; MMR6-SF-NEXT:    balc __fixsfsi # <MCInst #{{[0-9]+}} BALC_MMR6
-; MMR6-SF-NEXT:    # <MCOperand Expr:(__fixsfsi)>>
+; MMR6-SF-NEXT:    balc __fixsfsi # <MCInst #[[#MCINST21:]] BALC_MMR6
+; MMR6-SF-NEXT:    # <MCOperand Expr:__fixsfsi>>
 ; MMR6-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    # <MCInst #[[#MCINST8:]] LW
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
-; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #[[#MCINST4]] ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:24>>
-; MMR6-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-SF-NEXT:    jrc $ra # <MCInst #[[#MCINST19:]] JRC16_MM
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 entry:
   %conv = fptosi float %t to i32
   ret i32 %conv
@@ -229,189 +229,189 @@ entry:
 define i32 @test2(double %t) {
 ; M32-LABEL: test2:
 ; M32:       # %bb.0: # %entry
-; M32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D32
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST22:]] TRUNC_W_D32
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>>
+; M32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R2-FP64-LABEL: test2:
 ; M32R2-FP64:       # %bb.0: # %entry
-; M32R2-FP64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R2-FP64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST23:]] TRUNC_W_D64
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; M32R2-FP64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R2-SF-LABEL: test2:
 ; M32R2-SF:       # %bb.0: # %entry
-; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #[[#MCINST4]] ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:-24>>
 ; M32R2-SF-NEXT:    .cfi_def_cfa_offset 24
 ; M32R2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    # <MCInst #[[#MCINST5]] SW
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
 ; M32R2-SF-NEXT:    .cfi_offset 31, -4
-; M32R2-SF-NEXT:    jal __fixdfsi # <MCInst #{{[0-9]+}} JAL
-; M32R2-SF-NEXT:    # <MCOperand Expr:(__fixdfsi)>>
-; M32R2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    jal __fixdfsi # <MCInst #[[#MCINST6]] JAL
+; M32R2-SF-NEXT:    # <MCOperand Expr:__fixdfsi>>
+; M32R2-SF-NEXT:    nop # <MCInst #[[#MCINST7]] SLL
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:0>>
 ; M32R2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    # <MCInst #[[#MCINST8]] LW
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
-; M32R2-SF-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #[[#MCINST4]] ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:24>>
 ;
 ; M32R3R5-LABEL: test2:
 ; M32R3R5:       # %bb.0: # %entry
-; M32R3R5-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D32
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R3R5-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R3R5-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST22:]] TRUNC_W_D32
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>>
+; M32R3R5-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R6-LABEL: test2:
 ; M32R6:       # %bb.0: # %entry
-; M32R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST23:]] TRUNC_W_D64
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; M32R6-NEXT:    jr $ra # <MCInst #[[#MCINST9]] JALR
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M64-LABEL: test2:
 ; M64:       # %bb.0: # %entry
-; M64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST23:]] TRUNC_W_D64
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; M64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG7]]>>
+; M64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M64R6-LABEL: test2:
 ; M64R6:       # %bb.0: # %entry
-; M64R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M64R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST23:]] TRUNC_W_D64
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; M64R6-NEXT:    jr $ra # <MCInst #[[#MCINST10]] JALR64
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>>
+; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-FP32-LABEL: test2:
 ; MMR2-FP32:       # %bb.0: # %entry
-; MMR2-FP32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-FP32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST24:]] TRUNC_W_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>>
+; MMR2-FP32-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JR_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13]] MFC1_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-FP64-LABEL: test2:
 ; MMR2-FP64:       # %bb.0: # %entry
-; MMR2-FP64-NEXT:    cvt.w.d $f0, $f12 # <MCInst #{{[0-9]+}} CVT_W_D64_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-FP64-NEXT:    cvt.w.d $f0, $f12 # <MCInst #[[#MCINST25:]] CVT_W_D64_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; MMR2-FP64-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JR_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13]] MFC1_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-SF-LABEL: test2:
 ; MMR2-SF:       # %bb.0: # %entry
-; MMR2-SF-NEXT:    addiusp -24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    addiusp -24 # <MCInst #[[#MCINST14]] ADDIUSP_MM
 ; MMR2-SF-NEXT:    # <MCOperand Imm:-24>>
 ; MMR2-SF-NEXT:    .cfi_def_cfa_offset 24
 ; MMR2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} SWSP_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    # <MCInst #[[#MCINST15]] SWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
 ; MMR2-SF-NEXT:    .cfi_offset 31, -4
-; MMR2-SF-NEXT:    jal __fixdfsi # <MCInst #{{[0-9]+}} JAL_MM
-; MMR2-SF-NEXT:    # <MCOperand Expr:(__fixdfsi)>>
-; MMR2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    jal __fixdfsi # <MCInst #[[#MCINST16]] JAL_MM
+; MMR2-SF-NEXT:    # <MCOperand Expr:__fixdfsi>>
+; MMR2-SF-NEXT:    nop # <MCInst #[[#MCINST17]] SLL_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:0>>
 ; MMR2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} LWSP_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    # <MCInst #[[#MCINST18]] LWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
-; MMR2-SF-NEXT:    addiusp 24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    addiusp 24 # <MCInst #[[#MCINST14]] ADDIUSP_MM
 ; MMR2-SF-NEXT:    # <MCOperand Imm:24>>
-; MMR2-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-SF-NEXT:    jrc $ra # <MCInst #[[#MCINST19]] JRC16_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 ;
 ; MMR6-LABEL: test2:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D_MMR6
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST26:]] TRUNC_W_D_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13]] MFC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST19]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 ;
 ; MMR6-SF-LABEL: test2:
 ; MMR6-SF:       # %bb.0: # %entry
-; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #[[#MCINST4]] ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:-24>>
 ; MMR6-SF-NEXT:    .cfi_def_cfa_offset 24
 ; MMR6-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    # <MCInst #[[#MCINST5]] SW
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
 ; MMR6-SF-NEXT:    .cfi_offset 31, -4
-; MMR6-SF-NEXT:    balc __fixdfsi # <MCInst #{{[0-9]+}} BALC_MMR6
-; MMR6-SF-NEXT:    # <MCOperand Expr:(__fixdfsi)>>
+; MMR6-SF-NEXT:    balc __fixdfsi # <MCInst #[[#MCINST21]] BALC_MMR6
+; MMR6-SF-NEXT:    # <MCOperand Expr:__fixdfsi>>
 ; MMR6-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    # <MCInst #[[#MCINST8]] LW
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
-; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #[[#MCINST4]] ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:24>>
-; MMR6-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-SF-NEXT:    jrc $ra # <MCInst #[[#MCINST19]] JRC16_MM
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 entry:
   %conv = fptosi double %t to i32
   ret i32 %conv
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/load.ll b/llvm/test/CodeGen/Mips/llvm-ir/load.ll
index b96bdff227ca..ee858ac94aed 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/load.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/load.ll
@@ -25,161 +25,161 @@
 define i8 @f1() {
 ; MIPS32-LABEL: f1:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR3-LABEL: f1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5:]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR3-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST6:]] LBu_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R6-LABEL: f1:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7:]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32R6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR6-LABEL: f1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(a))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST6:]] LBu_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8:]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
 ;
 ; MIPS3-LABEL: f1:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS3-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64-LABEL: f1:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS64-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64R6-LABEL: f1:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12:]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS64R6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR5FP64-LABEL: f1:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5:]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR5FP64-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST6:]] LBu_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R5FP643-LABEL: f1:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32R5FP643-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 entry:
   %0 = load i8, ptr @a
   ret i8 %0
@@ -188,161 +188,161 @@ entry:
 define i32 @f2() {
 ; MIPS32-LABEL: f2:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR3-LABEL: f2:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST14:]] LB_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R6-LABEL: f2:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR6-LABEL: f2:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR6-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(a))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR6-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST14:]] LB_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f2:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64-LABEL: f2:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64R6-LABEL: f2:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR5FP64-LABEL: f2:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST14:]] LB_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R5FP643-LABEL: f2:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 entry:
   %0 = load i8, ptr @a
   %1 = sext i8 %0 to i32
@@ -352,161 +352,161 @@ entry:
 define i16 @f3() {
 ; MIPS32-LABEL: f3:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR3-LABEL: f3:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST16:]] LHu_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R6-LABEL: f3:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR6-LABEL: f3:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(b))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST16:]] LHu_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f3:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64-LABEL: f3:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64R6-LABEL: f3:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR5FP64-LABEL: f3:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST16:]] LHu_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R5FP643-LABEL: f3:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 entry:
   %0 = load i16, ptr @b
   ret i16 %0
@@ -515,161 +515,161 @@ entry:
 define i32 @f4() {
 ; MIPS32-LABEL: f4:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR3-LABEL: f4:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST18:]] LH_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R6-LABEL: f4:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR6-LABEL: f4:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR6-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(b))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR6-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST18:]] LH_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f4:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64-LABEL: f4:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64R6-LABEL: f4:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR5FP64-LABEL: f4:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST18:]] LH_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R5FP643-LABEL: f4:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 entry:
   %0 = load i16, ptr @b
   %1 = sext i16 %0 to i32
@@ -679,161 +679,161 @@ entry:
 define i32 @f5() {
 ; MIPS32-LABEL: f5:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR3-LABEL: f5:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST20:]] LW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R6-LABEL: f5:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR6-LABEL: f5:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR6-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR6-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST20:]] LW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f5:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64-LABEL: f5:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64R6-LABEL: f5:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR5FP64-LABEL: f5:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST20:]] LW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R5FP643-LABEL: f5:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 entry:
   %0 = load i32, ptr @c
   ret i32 %0
@@ -842,181 +842,181 @@ entry:
 define i64 @f6() {
 ; MIPS32-LABEL: f6:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    addiu $2, $zero, 0 # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    addiu $2, $zero, 0 # <MCInst #[[#MCINST21:]] ADDiu
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
 ; MIPS32-NEXT:    # <MCOperand Imm:0>>
 ;
 ; MMR3-LABEL: f6:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR3-NEXT:    li16 $2, 0 # <MCInst #{{[0-9]+}} LI16_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR3-NEXT:    li16 $2, 0 # <MCInst #[[#MCINST22:]] LI16_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
 ; MMR3-NEXT:    # <MCOperand Imm:0>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R6-LABEL: f6:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R6-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    addiu $2, $zero, 0 # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R6-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    addiu $2, $zero, 0 # <MCInst #[[#MCINST21:]] ADDiu
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
 ; MIPS32R6-NEXT:    # <MCOperand Imm:0>>
 ;
 ; MMR6-LABEL: f6:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR6-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR6-NEXT:    li16 $2, 0 # <MCInst #{{[0-9]+}} LI16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR6-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR6-NEXT:    li16 $2, 0 # <MCInst #[[#MCINST22:]] LI16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
 ; MMR6-NEXT:    # <MCOperand Imm:0>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f6:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lwu $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LWu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lwu $2, %lo(c)($1) # <MCInst #[[#MCINST23:]] LWu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64-LABEL: f6:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lwu $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LWu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lwu $2, %lo(c)($1) # <MCInst #[[#MCINST23:]] LWu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64R6-LABEL: f6:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lwu $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LWu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lwu $2, %lo(c)($1) # <MCInst #[[#MCINST23:]] LWu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR5FP64-LABEL: f6:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR5FP64-NEXT:    li16 $2, 0 # <MCInst #{{[0-9]+}} LI16_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR5FP64-NEXT:    li16 $2, 0 # <MCInst #[[#MCINST22:]] LI16_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
 ; MMR5FP64-NEXT:    # <MCOperand Imm:0>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R5FP643-LABEL: f6:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R5FP643-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    addiu $2, $zero, 0 # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R5FP643-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    addiu $2, $zero, 0 # <MCInst #[[#MCINST21:]] ADDiu
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
 ; MIPS32R5FP643-NEXT:    # <MCOperand Imm:0>>
 entry:
   %0 = load i32, ptr @c
@@ -1027,184 +1027,184 @@ entry:
 define i64 @f7() {
 ; MIPS32-LABEL: f7:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST24:]] SRA
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MIPS32-NEXT:    # <MCOperand Imm:31>>
 ;
 ; MMR3-LABEL: f7:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR3-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR3-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST25:]] SRA_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MMR3-NEXT:    # <MCOperand Imm:31>>
 ;
 ; MIPS32R6-LABEL: f7:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R6-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R6-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST24:]] SRA
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MIPS32R6-NEXT:    # <MCOperand Imm:31>>
 ;
 ; MMR6-LABEL: f7:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR6-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR6-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR6-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR6-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST25:]] SRA_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MMR6-NEXT:    # <MCOperand Imm:31>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f7:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST26:]] LW64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG9]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64-LABEL: f7:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST26:]] LW64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG9]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64R6-LABEL: f7:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST26:]] LW64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG9]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR5FP64-LABEL: f7:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR5FP64-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR5FP64-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST25:]] SRA_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MMR5FP64-NEXT:    # <MCOperand Imm:31>>
 ;
 ; MIPS32R5FP643-LABEL: f7:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R5FP643-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R5FP643-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST24:]] SRA
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MIPS32R5FP643-NEXT:    # <MCOperand Imm:31>>
 entry:
   %0 = load i32, ptr @c
@@ -1215,161 +1215,161 @@ entry:
 define float @f8() {
 ; MIPS32-LABEL: f8:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR3-LABEL: f8:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MMR3-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST28:]] LWC1_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS32R6-LABEL: f8:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32R6-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR6-LABEL: f8:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(e))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST28:]] LWC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f8:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS64-LABEL: f8:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS64R6-LABEL: f8:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR5FP64-LABEL: f8:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MMR5FP64-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST28:]] LWC1_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS32R5FP643-LABEL: f8:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 entry:
   %0 = load float, ptr @e
   ret float %0
@@ -1378,161 +1378,161 @@ entry:
 define double @f9() {
 ; MIPS32-LABEL: f9:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC1
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST29:]] LDC1
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR3-LABEL: f9:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC1_MM_D32
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MMR3-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST30:]] LDC1_MM_D32
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS32R6-LABEL: f9:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32R6-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR6-LABEL: f9:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC1_D64_MMR6
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(f))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST32:]] LDC1_D64_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f9:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS64-LABEL: f9:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS64R6-LABEL: f9:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR5FP64-LABEL: f9:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC1_MM_D64
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MMR5FP64-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST33:]] LDC1_MM_D64
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS32R5FP643-LABEL: f9:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 entry:
   %0 = load double, ptr @f
   ret double %0
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/store.ll b/llvm/test/CodeGen/Mips/llvm-ir/store.ll
index 3922db72f2a7..880a0f522574 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/store.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/store.ll
@@ -24,133 +24,133 @@
 define void @f1(i8 %a) {
 ; MIPS32-LABEL: f1:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST3:]] SB
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR3-LABEL: f1:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5:]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR3-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST6:]] SB_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R6-LABEL: f1:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7:]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32R6-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST3:]] SB
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR6-LABEL: f1:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR6-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(a))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR6-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST6:]] SB_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8:]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
 ;
 ; MIPS4-LABEL: f1:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS4-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST12:]] SB64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64R6-LABEL: f1:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13:]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS64R6-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST12:]] SB64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR5FP64-LABEL: f1:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5:]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR5FP64-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST6:]] SB_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R5FP643-LABEL: f1:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32R5FP643-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST3:]] SB
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
   store i8 %a, ptr @a
   ret void
 }
@@ -158,133 +158,133 @@ define void @f1(i8 %a) {
 define void @f2(i16 %a) {
 ; MIPS32-LABEL: f2:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST14:]] SH
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR3-LABEL: f2:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST15:]] SH_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R6-LABEL: f2:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST14:]] SH
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR6-LABEL: f2:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR6-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(b))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR6-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST15:]] SH_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f2:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST16:]] SH64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64R6-LABEL: f2:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST16:]] SH64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR5FP64-LABEL: f2:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST15:]] SH_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R5FP643-LABEL: f2:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST14:]] SH
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
   store i16 %a, ptr @b
   ret void
 }
@@ -292,133 +292,133 @@ define void @f2(i16 %a) {
 define void @f3(i32 %a) {
 ; MIPS32-LABEL: f3:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST17:]] SW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR3-LABEL: f3:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST18:]] SW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R6-LABEL: f3:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST17:]] SW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR6-LABEL: f3:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR6-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR6-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST18:]] SW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f3:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST19:]] SW64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64R6-LABEL: f3:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST19:]] SW64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR5FP64-LABEL: f3:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST18:]] SW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R5FP643-LABEL: f3:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST17:]] SW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
   store i32 %a, ptr @c
   ret void
 }
@@ -426,180 +426,180 @@ define void @f3(i32 %a) {
 define void @f4(i64 %a) {
 ; MIPS32-LABEL: f4:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS32-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32-NEXT:    addiu $1, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sw $5, 4($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS32-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32-NEXT:    addiu $1, $1, %lo(d) # <MCInst #[[#MCINST20:]] ADDiu
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sw $5, 4($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
 ; MIPS32-NEXT:    # <MCOperand Imm:4>>
 ;
 ; MMR3-LABEL: f4:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MMR3-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR3-NEXT:    addiu $2, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR3-NEXT:    sw16 $5, 4($2) # <MCInst #{{[0-9]+}} SW16_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR3-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MMR3-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST18]] SW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR3-NEXT:    addiu $2, $1, %lo(d) # <MCInst #[[#MCINST21:]] ADDiu_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR3-NEXT:    sw16 $5, 4($2) # <MCInst #[[#MCINST22:]] SW16_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG10]]>
 ; MMR3-NEXT:    # <MCOperand Imm:4>>
-; MMR3-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR3-NEXT:    jrc $ra # <MCInst #[[#MCINST8:]] JRC16_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS32R6-LABEL: f4:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS32R6-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32R6-NEXT:    addiu $1, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sw $5, 4($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R6-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS32R6-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32R6-NEXT:    addiu $1, $1, %lo(d) # <MCInst #[[#MCINST20:]] ADDiu
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sw $5, 4($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
 ; MIPS32R6-NEXT:    # <MCOperand Imm:4>>
 ;
 ; MMR6-LABEL: f4:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MMR6-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR6-NEXT:    addiu $2, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR6-NEXT:    sw16 $5, 4($2) # <MCInst #{{[0-9]+}} SW16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MMR6-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST18]] SW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR6-NEXT:    addiu $2, $1, %lo(d) # <MCInst #[[#MCINST21:]] ADDiu_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR6-NEXT:    sw16 $5, 4($2) # <MCInst #[[#MCINST22:]] SW16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG10]]>
 ; MMR6-NEXT:    # <MCOperand Imm:4>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f4:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(d) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(d))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(d) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(d))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(d) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,d)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(d) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,d)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(d) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(d) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sd $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SD
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(d))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    sd $4, %lo(d)($1) # <MCInst #[[#MCINST23:]] SD
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
 ;
 ; MIPS64R6-LABEL: f4:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(d) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(d))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(d) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(d))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(d) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,d)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(d) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,d)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(d) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(d) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sd $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SD
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(d))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    sd $4, %lo(d)($1) # <MCInst #[[#MCINST23:]] SD
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
 ;
 ; MMR5FP64-LABEL: f4:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MMR5FP64-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR5FP64-NEXT:    addiu $2, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR5FP64-NEXT:    sw16 $5, 4($2) # <MCInst #{{[0-9]+}} SW16_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR5FP64-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MMR5FP64-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST18]] SW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR5FP64-NEXT:    addiu $2, $1, %lo(d) # <MCInst #[[#MCINST21:]] ADDiu_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR5FP64-NEXT:    sw16 $5, 4($2) # <MCInst #[[#MCINST22:]] SW16_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG10]]>
 ; MMR5FP64-NEXT:    # <MCOperand Imm:4>>
-; MMR5FP64-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR5FP64-NEXT:    jrc $ra # <MCInst #[[#MCINST8:]] JRC16_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS32R5FP643-LABEL: f4:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS32R5FP643-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32R5FP643-NEXT:    addiu $1, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sw $5, 4($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS32R5FP643-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32R5FP643-NEXT:    addiu $1, $1, %lo(d) # <MCInst #[[#MCINST20:]] ADDiu
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sw $5, 4($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
 ; MIPS32R5FP643-NEXT:    # <MCOperand Imm:4>>
   store i64 %a, ptr @d
   ret void
@@ -608,133 +608,133 @@ define void @f4(i64 %a) {
 define void @f5(float %e) {
 ; MIPS32-LABEL: f5:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR3-LABEL: f5:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MMR3-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST25:]] SWC1_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS32R6-LABEL: f5:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32R6-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR6-LABEL: f5:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(e))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST25:]] SWC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f5:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS64R6-LABEL: f5:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR5FP64-LABEL: f5:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MMR5FP64-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST25:]] SWC1_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS32R5FP643-LABEL: f5:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
   store float %e, ptr @e
   ret void
 }
@@ -742,133 +742,133 @@ define void @f5(float %e) {
 define void @f6(double %f) {
 ; MIPS32-LABEL: f6:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC1
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST26:]] SDC1
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR3-LABEL: f6:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC1_MM_D32
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MMR3-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST27:]] SDC1_MM_D32
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS32R6-LABEL: f6:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC164
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32R6-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST28:]] SDC164
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR6-LABEL: f6:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC1_D64_MMR6
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(f))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST29:]] SDC1_D64_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f6:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC164
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST28:]] SDC164
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS64R6-LABEL: f6:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC164
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST28:]] SDC164
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR5FP64-LABEL: f6:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC1_MM_D64
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MMR5FP64-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST30:]] SDC1_MM_D64
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS32R5FP643-LABEL: f6:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC164
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST28:]] SDC164
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
   store double %f, ptr @f
   ret void
 }
diff --git a/llvm/test/MC/Lanai/conditional_inst.s b/llvm/test/MC/Lanai/conditional_inst.s
index d167d1af00eb..a0a8caf269fe 100644
--- a/llvm/test/MC/Lanai/conditional_inst.s
+++ b/llvm/test/MC/Lanai/conditional_inst.s
@@ -27,14 +27,14 @@ jump2:
 ! CHECK: encoding: [0b1110110A,A,A,0x01'A']
 ! CHECK-NEXT: fixup A - offset: 0, value: jump1, kind: FIXUP_LANAI_25
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} BRCC{{$}}
-! CHECK-NEXT: <MCOperand Expr:(jump1)>
+! CHECK-NEXT: <MCOperand Expr:specifier(0,jump1)>
 ! CHECK-NEXT: <MCOperand Imm:13>
 
     bpl jump2
 ! CHECK: encoding: [0b1110101A,A,A,A]
 ! CHECK-NEXT: fixup A - offset: 0, value: jump2, kind: FIXUP_LANAI_25
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} BRCC{{$}}
-! CHECK-NEXT: <MCOperand Expr:(jump2)>
+! CHECK-NEXT: <MCOperand Expr:specifier(0,jump2)>
 ! CHECK-NEXT: <MCOperand Imm:10>
 
     bt .
@@ -43,7 +43,7 @@ jump2:
 ! CHECK:      encoding: [0b1110000A,A,A,A]
 ! CHECK-NEXT:   fixup A - offset: 0, value: .Ltmp0, kind: FIXUP_LANAI_25
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} BT{{$}}
-! CHECK-NEXT:   <MCOperand Expr:(.Ltmp0)>
+! CHECK-NEXT:   <MCOperand Expr:.Ltmp0>
 
 ! SCC
     spl %r19
diff --git a/llvm/test/MC/Lanai/memory.s b/llvm/test/MC/Lanai/memory.s
index 41dc8fba7bf2..0e6234645a80 100644
--- a/llvm/test/MC/Lanai/memory.s
+++ b/llvm/test/MC/Lanai/memory.s
@@ -235,7 +235,7 @@
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} ADD_I_HI
 ! CHECK-NEXT: <MCOperand Reg:11>
 ! CHECK-NEXT: <MCOperand Reg:7>
-! CHECK-NEXT: <MCOperand Expr:(hi(x))>
+! CHECK-NEXT: <MCOperand Expr:specifier(1,x)>
 
     mov hi(l+4), %r7
 ! CHECK: encoding: [0x03,0x81,A,A]
@@ -243,5 +243,5 @@
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} ADD_I_HI
 ! CHECK-NEXT: <MCOperand Reg:14>
 ! CHECK-NEXT: <MCOperand Reg:7>
-! CHECK-NEXT: <MCOperand Expr:(hi(l)+4)>
+! CHECK-NEXT: <MCOperand Expr:specifier(1,l)+4>
 

From 05a9ad977624c4f6def7c0f4cf7103e28d6c6541 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 21:50:48 -0700
Subject: [PATCH 0459/1322] Lanai: Migrate to the new relocation specifier
 representation

Use MCSpecifierExpr directly and remove the LanaiMCExpr subclass. Define
MCSpecifierExpr::printImpl to print the relocation specifier in decimal
for llvm-mc -show-inst. The output is not guaranteed to be stable.

Depends on f8e0518120cd2850a7f674322bf428bc7d7d3326
("MC: Adjust -show-inst output for MCExpr")
---
 .../Target/Lanai/AsmParser/LanaiAsmParser.cpp | 104 +++++++++---------
 llvm/lib/Target/Lanai/LanaiMCInstLower.cpp    |  13 +--
 .../Target/Lanai/MCTargetDesc/CMakeLists.txt  |   1 -
 .../Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp     |  26 ++++-
 .../Lanai/MCTargetDesc/LanaiMCAsmInfo.h       |   7 ++
 .../Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp |  14 +--
 .../Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp |  44 --------
 .../Target/Lanai/MCTargetDesc/LanaiMCExpr.h   |  33 ------
 8 files changed, 95 insertions(+), 147 deletions(-)
 delete mode 100644 llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
 delete mode 100644 llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h

diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index b2fcd7af0733..9cb7f71945d1 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -9,7 +9,7 @@
 #include "LanaiAluCode.h"
 #include "LanaiCondCode.h"
 #include "LanaiInstrInfo.h"
-#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCAsmInfo.h"
 #include "TargetInfo/LanaiTargetInfo.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -231,14 +231,14 @@ public:
     }
 
     // Symbolic reference expression
-    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
-      return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_HI;
+    if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Imm.Value))
+      return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_HI;
 
     // Binary expression
     if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
-      if (const LanaiMCExpr *SymbolRefExpr =
-              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
-        return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_HI;
+      if (const auto *SymbolRefExpr =
+              dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_HI;
 
     return false;
   }
@@ -268,14 +268,14 @@ public:
     }
 
     // Symbolic reference expression
-    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
-      return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO;
+    if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Imm.Value))
+      return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO;
 
     // Binary expression
     if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
-      if (const LanaiMCExpr *SymbolRefExpr =
-              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
-        return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO;
+      if (const auto *SymbolRefExpr =
+              dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO;
 
     return false;
   }
@@ -292,14 +292,14 @@ public:
     }
 
     // Symbolic reference expression
-    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
-      return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO;
+    if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Imm.Value))
+      return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO;
 
     // Binary expression
     if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
-      if (const LanaiMCExpr *SymbolRefExpr =
-              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
-        return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO;
+      if (const auto *SymbolRefExpr =
+              dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO;
 
     return false;
   }
@@ -339,8 +339,8 @@ public:
     }
 
     // Symbolic reference expression
-    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
-      return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None;
+    if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Imm.Value))
+      return SymbolRefExpr->getSpecifier() == Lanai::S_None;
     if (const MCSymbolRefExpr *SymbolRefExpr =
             dyn_cast<MCSymbolRefExpr>(Imm.Value)) {
       return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
@@ -348,9 +348,9 @@ public:
 
     // Binary expression
     if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value)) {
-      if (const LanaiMCExpr *SymbolRefExpr =
-              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
-        return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None;
+      if (const auto *SymbolRefExpr =
+              dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getSpecifier() == Lanai::S_None;
       if (const MCSymbolRefExpr *SymbolRefExpr =
               dyn_cast<MCSymbolRefExpr>(BinaryExpr->getLHS()))
         return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
@@ -464,19 +464,18 @@ public:
     if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
       Inst.addOperand(
           MCOperand::createImm(static_cast<int32_t>(ConstExpr->getValue())));
-    else if (isa<LanaiMCExpr>(getImm())) {
+    else if (isa<MCSpecifierExpr>(getImm())) {
 #ifndef NDEBUG
-      const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
-      assert(SymbolRefExpr &&
-             SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO);
+      const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(getImm());
+      assert(SymbolRefExpr && SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else if (isa<MCBinaryExpr>(getImm())) {
 #ifndef NDEBUG
       const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
-      assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
-             cast<LanaiMCExpr>(BinaryExpr->getLHS())->getSpecifier() ==
-                 LanaiMCExpr::VK_Lanai_ABS_LO);
+      assert(BinaryExpr && isa<MCSpecifierExpr>(BinaryExpr->getLHS()) &&
+             cast<MCSpecifierExpr>(BinaryExpr->getLHS())->getSpecifier() ==
+                 Lanai::S_ABS_LO);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else
@@ -495,19 +494,18 @@ public:
     assert(N == 1 && "Invalid number of operands!");
     if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
       Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() >> 16));
-    else if (isa<LanaiMCExpr>(getImm())) {
+    else if (isa<MCSpecifierExpr>(getImm())) {
 #ifndef NDEBUG
-      const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
-      assert(SymbolRefExpr &&
-             SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_HI);
+      const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(getImm());
+      assert(SymbolRefExpr && SymbolRefExpr->getSpecifier() == Lanai::S_ABS_HI);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else if (isa<MCBinaryExpr>(getImm())) {
 #ifndef NDEBUG
       const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
-      assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
-             cast<LanaiMCExpr>(BinaryExpr->getLHS())->getSpecifier() ==
-                 LanaiMCExpr::VK_Lanai_ABS_HI);
+      assert(BinaryExpr && isa<MCSpecifierExpr>(BinaryExpr->getLHS()) &&
+             cast<MCSpecifierExpr>(BinaryExpr->getLHS())->getSpecifier() ==
+                 Lanai::S_ABS_HI);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else
@@ -526,11 +524,10 @@ public:
     assert(N == 1 && "Invalid number of operands!");
     if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
       Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() & 0x1fffff));
-    else if (isa<LanaiMCExpr>(getImm())) {
+    else if (isa<MCSpecifierExpr>(getImm())) {
 #ifndef NDEBUG
-      const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
-      assert(SymbolRefExpr &&
-             SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None);
+      const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(getImm());
+      assert(SymbolRefExpr && SymbolRefExpr->getSpecifier() == Lanai::S_None);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else if (isa<MCSymbolRefExpr>(getImm())) {
@@ -544,9 +541,9 @@ public:
     } else if (isa<MCBinaryExpr>(getImm())) {
 #ifndef NDEBUG
       const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
-      assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
-             cast<LanaiMCExpr>(BinaryExpr->getLHS())->getSpecifier() ==
-                 LanaiMCExpr::VK_Lanai_None);
+      assert(BinaryExpr && isa<MCSpecifierExpr>(BinaryExpr->getLHS()) &&
+             cast<MCSpecifierExpr>(BinaryExpr->getLHS())->getSpecifier() ==
+                 Lanai::S_None);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else
@@ -737,7 +734,7 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
   SMLoc Start = Parser.getTok().getLoc();
   SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
   const MCExpr *Res, *RHS = nullptr;
-  LanaiMCExpr::Spec Kind = LanaiMCExpr::VK_Lanai_None;
+  auto Kind = Lanai::S_None;
 
   if (Lexer.getKind() != AsmToken::Identifier)
     return nullptr;
@@ -748,13 +745,13 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
 
   // Check if identifier has a modifier
   if (Identifier.equals_insensitive("hi"))
-    Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+    Kind = Lanai::S_ABS_HI;
   else if (Identifier.equals_insensitive("lo"))
-    Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+    Kind = Lanai::S_ABS_LO;
 
   // If the identifier corresponds to a variant then extract the real
   // identifier.
-  if (Kind != LanaiMCExpr::VK_Lanai_None) {
+  if (Kind != Lanai::S_None) {
     if (Lexer.getKind() != AsmToken::LParen) {
       Error(Lexer.getLoc(), "Expected '('");
       return nullptr;
@@ -771,7 +768,7 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
     return nullptr;
 
   // For variants parse the final ')'
-  if (Kind != LanaiMCExpr::VK_Lanai_None) {
+  if (Kind != Lanai::S_None) {
     if (Lexer.getKind() != AsmToken::RParen) {
       Error(Lexer.getLoc(), "Expected ')'");
       return nullptr;
@@ -781,8 +778,7 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
 
   End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
   MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
-  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
-  Res = LanaiMCExpr::create(Kind, Expr, getContext());
+  Res = MCSpecifierExpr::create(Sym, Kind, getContext());
 
   // Nest if this was an addition
   if (RHS)
@@ -865,16 +861,16 @@ bool shouldBeSls(const LanaiOperand &Op) {
   }
   // The instruction should be encoded as an SLS if the operand is a symbolic
   // reference with no variant.
-  if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Op.getImm()))
-    return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None;
+  if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Op.getImm()))
+    return SymbolRefExpr->getSpecifier() == Lanai::S_None;
   // The instruction should be encoded as an SLS if the operand is a binary
   // expression with the left-hand side being a symbolic reference with no
   // variant.
   if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Op.getImm())) {
-    const LanaiMCExpr *LHSSymbolRefExpr =
-        dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS());
+    const auto *LHSSymbolRefExpr =
+        dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS());
     return (LHSSymbolRefExpr &&
-            LHSSymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None);
+            LHSSymbolRefExpr->getSpecifier() == Lanai::S_None);
   }
   return false;
 }
diff --git a/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp b/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
index 21b327fd8f7c..b0db8d088768 100644
--- a/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
+++ b/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
@@ -14,7 +14,7 @@
 #include "LanaiMCInstLower.h"
 
 #include "MCTargetDesc/LanaiBaseInfo.h"
-#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCAsmInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -64,17 +64,16 @@ LanaiMCInstLower::GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
 
 MCOperand LanaiMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                                MCSymbol *Sym) const {
-  LanaiMCExpr::Spec Kind;
-
+  Lanai::Specifier Kind;
   switch (MO.getTargetFlags()) {
   case LanaiII::MO_NO_FLAG:
-    Kind = LanaiMCExpr::VK_Lanai_None;
+    Kind = Lanai::S_None;
     break;
   case LanaiII::MO_ABS_HI:
-    Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+    Kind = Lanai::S_ABS_HI;
     break;
   case LanaiII::MO_ABS_LO:
-    Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+    Kind = Lanai::S_ABS_LO;
     break;
   default:
     llvm_unreachable("Unknown target flag on GV operand");
@@ -84,7 +83,7 @@ MCOperand LanaiMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   if (!MO.isJTI() && MO.getOffset())
     Expr = MCBinaryExpr::createAdd(
         Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
-  Expr = LanaiMCExpr::create(Kind, Expr, Ctx);
+  Expr = MCSpecifierExpr::create(Expr, Kind, Ctx);
   return MCOperand::createExpr(Expr);
 }
 
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt
index 04fca878ca5a..ff3b6abc70ec 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt
@@ -4,7 +4,6 @@ add_llvm_component_library(LLVMLanaiDesc
   LanaiInstPrinter.cpp
   LanaiMCAsmInfo.cpp
   LanaiMCCodeEmitter.cpp
-  LanaiMCExpr.cpp
   LanaiMCTargetDesc.cpp
 
   LINK_COMPONENTS
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
index 7ae693130da5..6ad018c12a28 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
@@ -11,7 +11,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "LanaiMCAsmInfo.h"
-
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -38,3 +39,26 @@ LanaiMCAsmInfo::LanaiMCAsmInfo(const Triple & /*TheTriple*/,
   // in dwarf generation.
   MinInstAlignment = 4;
 }
+
+void LanaiMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                        const MCSpecifierExpr &Expr) const {
+  if (Expr.getSpecifier() == 0) {
+    printExpr(OS, *Expr.getSubExpr());
+    return;
+  }
+
+  switch (Expr.getSpecifier()) {
+  default:
+    llvm_unreachable("Invalid kind!");
+  case Lanai::S_ABS_HI:
+    OS << "hi";
+    break;
+  case Lanai::S_ABS_LO:
+    OS << "lo";
+    break;
+  }
+
+  OS << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  OS << ')';
+}
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
index f0352d021291..2696975e71c0 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
@@ -24,8 +24,15 @@ class LanaiMCAsmInfo : public MCAsmInfoELF {
 public:
   explicit LanaiMCAsmInfo(const Triple &TheTriple,
                           const MCTargetOptions &Options);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
+namespace Lanai {
+using Specifier = uint8_t;
+enum { S_None, S_ABS_HI, S_ABS_LO };
+} // namespace Lanai
+
 } // namespace llvm
 
 #endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
index 779c83e5b3f2..d1b2da40446a 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -13,7 +13,7 @@
 #include "LanaiAluCode.h"
 #include "MCTargetDesc/LanaiBaseInfo.h"
 #include "MCTargetDesc/LanaiFixupKinds.h"
-#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCAsmInfo.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -89,14 +89,14 @@ public:
 static Lanai::Fixups FixupKind(const MCExpr *Expr) {
   if (isa<MCSymbolRefExpr>(Expr))
     return Lanai::FIXUP_LANAI_21;
-  if (const LanaiMCExpr *McExpr = dyn_cast<LanaiMCExpr>(Expr)) {
-    LanaiMCExpr::Spec ExprKind = McExpr->getSpecifier();
+  if (const MCSpecifierExpr *McExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
+    Lanai::Specifier ExprKind = McExpr->getSpecifier();
     switch (ExprKind) {
-    case LanaiMCExpr::VK_Lanai_None:
+    case Lanai::S_None:
       return Lanai::FIXUP_LANAI_21;
-    case LanaiMCExpr::VK_Lanai_ABS_HI:
+    case Lanai::S_ABS_HI:
       return Lanai::FIXUP_LANAI_HI16;
-    case LanaiMCExpr::VK_Lanai_ABS_LO:
+    case Lanai::S_ABS_LO:
       return Lanai::FIXUP_LANAI_LO16;
     }
   }
@@ -123,7 +123,7 @@ unsigned LanaiMCCodeEmitter::getMachineOpValue(
     Expr = BinaryExpr->getLHS();
   }
 
-  assert(isa<LanaiMCExpr>(Expr) || Expr->getKind() == MCExpr::SymbolRef);
+  assert(isa<MCSpecifierExpr>(Expr) || Expr->getKind() == MCExpr::SymbolRef);
   // Push fixup (all info is contained within)
   Fixups.push_back(
       MCFixup::create(0, MCOp.getExpr(), MCFixupKind(FixupKind(Expr))));
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
deleted file mode 100644
index b75a09915660..000000000000
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===-- LanaiMCExpr.cpp - Lanai specific MC expression classes ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "LanaiMCExpr.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "lanaimcexpr"
-
-const LanaiMCExpr *LanaiMCExpr::create(Spec S, const MCExpr *Expr,
-                                       MCContext &Ctx) {
-  return new (Ctx) LanaiMCExpr(Expr, S);
-}
-
-void LanaiMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  if (specifier == VK_Lanai_None) {
-    MAI->printExpr(OS, *Expr);
-    return;
-  }
-
-  switch (specifier) {
-  default:
-    llvm_unreachable("Invalid kind!");
-  case VK_Lanai_ABS_HI:
-    OS << "hi";
-    break;
-  case VK_Lanai_ABS_LO:
-    OS << "lo";
-    break;
-  }
-
-  OS << '(';
-  const MCExpr *Expr = getSubExpr();
-  MAI->printExpr(OS, *Expr);
-  OS << ')';
-}
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
deleted file mode 100644
index 90f8a3e5bbd5..000000000000
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===-- LanaiMCExpr.h - Lanai specific MC expression classes ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
-#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCValue.h"
-
-namespace llvm {
-
-class LanaiMCExpr : public MCSpecifierExpr {
-public:
-  using Spec = MCSpecifierExpr::Spec;
-  enum { VK_Lanai_None, VK_Lanai_ABS_HI, VK_Lanai_ABS_LO };
-
-private:
-  explicit LanaiMCExpr(const MCExpr *Expr, Spec S) : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const LanaiMCExpr *create(Spec Kind, const MCExpr *Expr,
-                                   MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-};
-} // end namespace llvm
-
-#endif

From 945b12f6c823c49336a878e7afe2a96e4d3382ea Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Mon, 16 Jun 2025 04:53:40 +0000
Subject: [PATCH 0460/1322] [gn build] Port 05a9ad977624

---
 .../gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn
index a52132e69cc3..874cdc1b7839 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn
@@ -56,7 +56,6 @@ static_library("MCTargetDesc") {
     "LanaiInstPrinter.cpp",
     "LanaiMCAsmInfo.cpp",
     "LanaiMCCodeEmitter.cpp",
-    "LanaiMCExpr.cpp",
     "LanaiMCTargetDesc.cpp",
   ]
 }

From 4ea616d072d126a31149174ca2efdbdace9ce568 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 22:41:36 -0700
Subject: [PATCH 0461/1322] AArch64: Move AArch64MCExpr functions to
 AArch64MCAsmInfo

To migrate away from the legacy
XXXMCExpr::printImpl/evaluateAsRelocatableImpl overrides and align with
other targets.

While the AArch64MCAsmInfoXXX hooks introduce some duplication, they
enable better separation for object file formats.
---
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 138 ++++++++++++++++++
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.h   |  23 +++
 .../AArch64/MCTargetDesc/AArch64MCExpr.cpp    |  97 +-----------
 .../AArch64/MCTargetDesc/AArch64MCExpr.h      |  14 +-
 .../AArch64WinCOFFObjectWriter.cpp            |   6 +-
 5 files changed, 166 insertions(+), 112 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 31965d85d9eb..a82896dbe0d6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -15,6 +15,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/TargetParser/Triple.h"
 using namespace llvm;
@@ -53,6 +54,80 @@ const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
     {AArch64MCExpr::M_TLVPPAGEOFF, "TLVPPAGEOFF"},
 };
 
+StringRef AArch64::getSpecifierName(const MCSpecifierExpr &Expr) {
+  // clang-format off
+  switch (static_cast<uint32_t>(Expr.getSpecifier())) {
+  case AArch64MCExpr::VK_CALL:                return "";
+  case AArch64MCExpr::VK_LO12:                return ":lo12:";
+  case AArch64MCExpr::VK_ABS_G3:              return ":abs_g3:";
+  case AArch64MCExpr::VK_ABS_G2:              return ":abs_g2:";
+  case AArch64MCExpr::VK_ABS_G2_S:            return ":abs_g2_s:";
+  case AArch64MCExpr::VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case AArch64MCExpr::VK_ABS_G1:              return ":abs_g1:";
+  case AArch64MCExpr::VK_ABS_G1_S:            return ":abs_g1_s:";
+  case AArch64MCExpr::VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case AArch64MCExpr::VK_ABS_G0:              return ":abs_g0:";
+  case AArch64MCExpr::VK_ABS_G0_S:            return ":abs_g0_s:";
+  case AArch64MCExpr::VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case AArch64MCExpr::VK_PREL_G3:             return ":prel_g3:";
+  case AArch64MCExpr::VK_PREL_G2:             return ":prel_g2:";
+  case AArch64MCExpr::VK_PREL_G2_NC:          return ":prel_g2_nc:";
+  case AArch64MCExpr::VK_PREL_G1:             return ":prel_g1:";
+  case AArch64MCExpr::VK_PREL_G1_NC:          return ":prel_g1_nc:";
+  case AArch64MCExpr::VK_PREL_G0:             return ":prel_g0:";
+  case AArch64MCExpr::VK_PREL_G0_NC:          return ":prel_g0_nc:";
+  case AArch64MCExpr::VK_DTPREL_G2:           return ":dtprel_g2:";
+  case AArch64MCExpr::VK_DTPREL_G1:           return ":dtprel_g1:";
+  case AArch64MCExpr::VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case AArch64MCExpr::VK_DTPREL_G0:           return ":dtprel_g0:";
+  case AArch64MCExpr::VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case AArch64MCExpr::VK_DTPREL_HI12:         return ":dtprel_hi12:";
+  case AArch64MCExpr::VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case AArch64MCExpr::VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case AArch64MCExpr::VK_TPREL_G2:            return ":tprel_g2:";
+  case AArch64MCExpr::VK_TPREL_G1:            return ":tprel_g1:";
+  case AArch64MCExpr::VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case AArch64MCExpr::VK_TPREL_G0:            return ":tprel_g0:";
+  case AArch64MCExpr::VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case AArch64MCExpr::VK_TPREL_HI12:          return ":tprel_hi12:";
+  case AArch64MCExpr::VK_TPREL_LO12:          return ":tprel_lo12:";
+  case AArch64MCExpr::VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case AArch64MCExpr::VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case AArch64MCExpr::VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
+  case AArch64MCExpr::VK_ABS_PAGE:            return "";
+  case AArch64MCExpr::VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
+  case AArch64MCExpr::VK_GOT:                 return ":got:";
+  case AArch64MCExpr::VK_GOT_PAGE:            return ":got:";
+  case AArch64MCExpr::VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
+  case AArch64MCExpr::VK_GOT_LO12:            return ":got_lo12:";
+  case AArch64MCExpr::VK_GOTTPREL:            return ":gottprel:";
+  case AArch64MCExpr::VK_GOTTPREL_PAGE:       return ":gottprel:";
+  case AArch64MCExpr::VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case AArch64MCExpr::VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case AArch64MCExpr::VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case AArch64MCExpr::VK_TLSDESC:             return "";
+  case AArch64MCExpr::VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  case AArch64MCExpr::VK_TLSDESC_AUTH:        return "";
+  case AArch64MCExpr::VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
+  case AArch64MCExpr::VK_SECREL_LO12:         return ":secrel_lo12:";
+  case AArch64MCExpr::VK_SECREL_HI12:         return ":secrel_hi12:";
+  case AArch64MCExpr::VK_GOT_AUTH:            return ":got_auth:";
+  case AArch64MCExpr::VK_GOT_AUTH_PAGE:       return ":got_auth:";
+  case AArch64MCExpr::VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
+  default:
+    llvm_unreachable("Invalid relocation specifier");
+  }
+  // clang-format on
+}
+
+static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
+                     const MCAssembler *Asm) {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(Expr.getSpecifier());
+  return true;
+}
+
 AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
   // We prefer NEON instructions to be printed in the short, Apple-specific
   // form when targeting Darwin.
@@ -91,6 +166,34 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
   return MCBinaryExpr::createSub(Res, PC, Context);
 }
 
+void AArch64AuthMCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
+  if (WrapSubExprInParens)
+    OS << '(';
+  getSubExpr()->print(OS, MAI);
+  if (WrapSubExprInParens)
+    OS << ')';
+
+  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
+  if (hasAddressDiversity())
+    OS << ",addr";
+  OS << ')';
+}
+
+void AArch64MCAsmInfoDarwin::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
+    return AE->print(OS, this);
+  // FIXME: tryParseAdrLabel should not use VK_ABS for Mach-O
+  assert(Expr.getSpecifier() == AArch64MCExpr::VK_ABS);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoDarwin::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   if (T.getArch() == Triple::aarch64_be)
     IsLittleEndian = false;
@@ -127,6 +230,19 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   initializeVariantKinds(ELFAtSpecifiers);
 }
 
+void AArch64MCAsmInfoELF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
+    return AE->print(OS, this);
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoELF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -146,6 +262,17 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   initializeVariantKinds(COFFAtSpecifiers);
 }
 
+void AArch64MCAsmInfoMicrosoftCOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoMicrosoftCOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -164,3 +291,14 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
 
   initializeVariantKinds(COFFAtSpecifiers);
 }
+
+void AArch64MCAsmInfoGNUCOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoGNUCOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 225e0c8e55fc..bc02586d7388 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 
+#include "MCTargetDesc/AArch64MCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
@@ -26,20 +27,42 @@ struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
   const MCExpr *
   getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
                               MCStreamer &Streamer) const override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
   explicit AArch64MCAsmInfoELF(const Triple &T);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoMicrosoftCOFF : public MCAsmInfoMicrosoft {
   explicit AArch64MCAsmInfoMicrosoftCOFF();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
   explicit AArch64MCAsmInfoGNUCOFF();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
+namespace AArch64 {
+/// Return the string representation of the ELF relocation specifier
+/// (e.g. ":got:", ":lo12:").
+StringRef getSpecifierName(const MCSpecifierExpr &Expr);
+} // namespace AArch64
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index d934af91b9ff..7a7c6f7effd9 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -12,100 +12,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MCExpr.h"
+#include "AArch64MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
-#define DEBUG_TYPE "aarch64symbolrefexpr"
-
 const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, Specifier S,
                                            MCContext &Ctx) {
   return new (Ctx) AArch64MCExpr(Expr, S);
 }
 
-StringRef AArch64MCExpr::getSpecifierName() const {
-  // clang-format off
-  switch (static_cast<uint32_t>(getSpecifier())) {
-  case VK_CALL:                return "";
-  case VK_LO12:                return ":lo12:";
-  case VK_ABS_G3:              return ":abs_g3:";
-  case VK_ABS_G2:              return ":abs_g2:";
-  case VK_ABS_G2_S:            return ":abs_g2_s:";
-  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
-  case VK_ABS_G1:              return ":abs_g1:";
-  case VK_ABS_G1_S:            return ":abs_g1_s:";
-  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
-  case VK_ABS_G0:              return ":abs_g0:";
-  case VK_ABS_G0_S:            return ":abs_g0_s:";
-  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
-  case VK_PREL_G3:             return ":prel_g3:";
-  case VK_PREL_G2:             return ":prel_g2:";
-  case VK_PREL_G2_NC:          return ":prel_g2_nc:";
-  case VK_PREL_G1:             return ":prel_g1:";
-  case VK_PREL_G1_NC:          return ":prel_g1_nc:";
-  case VK_PREL_G0:             return ":prel_g0:";
-  case VK_PREL_G0_NC:          return ":prel_g0_nc:";
-  case VK_DTPREL_G2:           return ":dtprel_g2:";
-  case VK_DTPREL_G1:           return ":dtprel_g1:";
-  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
-  case VK_DTPREL_G0:           return ":dtprel_g0:";
-  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
-  case VK_DTPREL_HI12:         return ":dtprel_hi12:";
-  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
-  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
-  case VK_TPREL_G2:            return ":tprel_g2:";
-  case VK_TPREL_G1:            return ":tprel_g1:";
-  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
-  case VK_TPREL_G0:            return ":tprel_g0:";
-  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
-  case VK_TPREL_HI12:          return ":tprel_hi12:";
-  case VK_TPREL_LO12:          return ":tprel_lo12:";
-  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
-  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
-  case VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
-  case VK_ABS_PAGE:            return "";
-  case VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
-  case VK_GOT:                 return ":got:";
-  case VK_GOT_PAGE:            return ":got:";
-  case VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
-  case VK_GOT_LO12:            return ":got_lo12:";
-  case VK_GOTTPREL:            return ":gottprel:";
-  case VK_GOTTPREL_PAGE:       return ":gottprel:";
-  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
-  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
-  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
-  case VK_TLSDESC:             return "";
-  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
-  case VK_TLSDESC_AUTH:        return "";
-  case VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
-  case VK_SECREL_LO12:         return ":secrel_lo12:";
-  case VK_SECREL_HI12:         return ":secrel_hi12:";
-  case VK_GOT_AUTH:            return ":got_auth:";
-  case VK_GOT_AUTH_PAGE:       return ":got_auth:";
-  case VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
-  default:
-    llvm_unreachable("Invalid relocation specifier");
-  }
-  // clang-format on
-}
-
-void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  OS << getSpecifierName();
-  Expr->print(OS, MAI);
-}
-
-bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                              const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(getSpecifier());
-  return true;
-}
-
 const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
                                                    uint16_t Discriminator,
                                                    AArch64PACKey::ID Key,
@@ -114,17 +33,3 @@ const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
   return new (Ctx)
       AArch64AuthMCExpr(Expr, Discriminator, Key, HasAddressDiversity);
 }
-
-void AArch64AuthMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
-  if (WrapSubExprInParens)
-    OS << '(';
-  getSubExpr()->print(OS, MAI);
-  if (WrapSubExprInParens)
-    OS << ')';
-
-  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
-  if (hasAddressDiversity())
-    OS << ",addr";
-  OS << ')';
-}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 9c383894c7f5..541f24c943a1 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -147,8 +147,6 @@ protected:
 public:
   static const AArch64MCExpr *create(const MCExpr *Expr, Specifier,
                                      MCContext &Ctx);
-  /// @name VariantKind information extractors.
-  /// @{
 
   static Specifier getSymbolLoc(Specifier S) {
     return static_cast<Specifier>(S & VK_SymLocBits);
@@ -159,16 +157,6 @@ public:
   }
 
   static bool isNotChecked(Specifier S) { return S & VK_NC; }
-
-  /// @}
-
-  /// Return the string representation of the ELF relocation specifier
-  /// (e.g. ":got:", ":lo12:").
-  StringRef getSpecifierName() const;
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 class AArch64AuthMCExpr final : public AArch64MCExpr {
@@ -189,7 +177,7 @@ public:
   uint16_t getDiscriminator() const { return Discriminator; }
   bool hasAddressDiversity() const { return getSpecifier() == VK_AUTHADDR; }
 
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
 
   static bool classof(const MCExpr *E) {
     return isa<AArch64MCExpr>(E) && classof(cast<AArch64MCExpr>(E));
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 3009bd2ca275..2e997631655e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -8,7 +8,7 @@
 
 #include "AArch64MCTargetDesc.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCAsmInfo.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/MC/MCAsmBackend.h"
@@ -73,7 +73,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
       break;
     default:
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          A64E->getSpecifierName() +
+                                          AArch64::getSpecifierName(*A64E) +
                                           " unsupported on COFF targets");
       return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value
     }
@@ -83,7 +83,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
   default: {
     if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          A64E->getSpecifierName() +
+                                          AArch64::getSpecifierName(*A64E) +
                                           " unsupported on COFF targets");
     } else {
       MCFixupKindInfo Info = MAB.getFixupKindInfo(Fixup.getKind());

From 4f9e6bad8438f4440bfd68be2f0ebdca0d588d47 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Mon, 16 Jun 2025 08:28:52 +0200
Subject: [PATCH 0462/1322] [clang][bytecode] Fix calling operator new with
 nothrow/align parameter (#144271)

Discard all the parameters we don't care about.
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 20 +++++++++++++++++++-
 clang/test/AST/ByteCode/new-delete.cpp   | 22 ++++++++++++++++------
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5fc503456959..d01e3d042a8b 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -1423,7 +1423,6 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC,
   // Walk up the call stack to find the appropriate caller and get the
   // element type from it.
   auto [NewCall, ElemType] = S.getStdAllocatorCaller("allocate");
-  APSInt Bytes = popToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(0)));
 
   if (ElemType.isNull()) {
     S.FFDiag(Call, S.getLangOpts().CPlusPlus20
@@ -1439,6 +1438,25 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC,
     return false;
   }
 
+  // We only care about the first parameter (the size), so discard all the
+  // others.
+  {
+    unsigned NumArgs = Call->getNumArgs();
+    assert(NumArgs >= 1);
+
+    // The std::nothrow_t arg never gets put on the stack.
+    if (Call->getArg(NumArgs - 1)->getType()->isNothrowT())
+      --NumArgs;
+    auto Args = llvm::ArrayRef(Call->getArgs(), Call->getNumArgs());
+    // First arg is needed.
+    Args = Args.drop_front();
+
+    // Discard the rest.
+    for (const Expr *Arg : Args)
+      discard(S.Stk, *S.getContext().classify(Arg));
+  }
+
+  APSInt Bytes = popToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(0)));
   CharUnits ElemSize = S.getASTContext().getTypeSizeInChars(ElemType);
   assert(!ElemSize.isZero());
   // Divide the number of bytes by sizeof(ElemType), so we get the number of
diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp
index 1ee41a98e13b..9c293e5d15fc 100644
--- a/clang/test/AST/ByteCode/new-delete.cpp
+++ b/clang/test/AST/ByteCode/new-delete.cpp
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
-// RUN: %clang_cc1 -std=c++20 -fexperimental-new-constant-interpreter -verify=expected,both %s
-// RUN: %clang_cc1 -triple=i686-linux-gnu -std=c++20 -fexperimental-new-constant-interpreter -verify=expected,both %s
-// RUN: %clang_cc1 -verify=ref,both %s
-// RUN: %clang_cc1 -std=c++20 -verify=ref,both %s
-// RUN: %clang_cc1 -triple=i686-linux-gnu -std=c++20 -verify=ref,both %s
+// RUN: %clang_cc1            -verify=expected,both                        -fexperimental-new-constant-interpreter %s
+// RUN: %clang_cc1 -std=c++20 -verify=expected,both                        -fexperimental-new-constant-interpreter %s
+// RUN: %clang_cc1 -std=c++20 -verify=expected,both -triple=i686-linux-gnu -fexperimental-new-constant-interpreter %s
+// RUN: %clang_cc1            -verify=ref,both                                                                     %s
+// RUN: %clang_cc1 -std=c++20 -verify=ref,both                                                                     %s
+// RUN: %clang_cc1 -std=c++20 -verify=ref,both      -triple=i686-linux-gnu                                         %s
 
 #if __cplusplus >= 202002L
 
@@ -1012,6 +1012,16 @@ constexpr int no_deallocate_nonalloc = (std::allocator<int>().deallocate((int*)&
                                                                                                              // both-note {{in call}} \
                                                                                                              // both-note {{declared here}}
 
+namespace OpNewNothrow {
+  constexpr int f() {
+      int *v = (int*)operator new(sizeof(int), std::align_val_t(2), std::nothrow); // both-note {{cannot allocate untyped memory in a constant expression; use 'std::allocator<T>::allocate' to allocate memory of type 'T'}}
+      operator delete(v, std::align_val_t(2), std::nothrow);
+      return 1;
+  }
+  static_assert(f()); // both-error {{not an integral constant expression}} \
+                      // both-note {{in call to}}
+}
+
 #else
 /// Make sure we reject this prior to C++20
 constexpr int a() { // both-error {{never produces a constant expression}}

From f3021e79fd5a4cab5537f37df2e6010a325d0a7c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 23:29:06 -0700
Subject: [PATCH 0463/1322] ARM: Rename ARMMCExpr::VK_ to ARM::S_

Prepare for removing ARMMCExpr. Adopt the new naming convention (S_
instead of VK_; the relocation specifier was previously named
`VariantKind`)) used by most other targets.

Make ARMMCAsmInfo.h include ARMMCExpr.h and change .cpp files to include
ARMMCAsmInfo.h. We will eventually remove ARMMCExpr.h.
---
 llvm/lib/Target/ARM/ARMAsmPrinter.cpp         |  17 ++-
 llvm/lib/Target/ARM/ARMMCInstLower.cpp        |   6 +-
 llvm/lib/Target/ARM/ARMTargetObjectFile.cpp   |  12 +--
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp |  34 +++---
 .../Target/ARM/MCTargetDesc/ARMAsmBackend.cpp |   6 +-
 .../ARM/MCTargetDesc/ARMELFObjectWriter.cpp   | 102 +++++++++---------
 .../ARM/MCTargetDesc/ARMELFStreamer.cpp       |  14 +--
 .../Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp  |  44 ++++----
 .../Target/ARM/MCTargetDesc/ARMMCAsmInfo.h    |  45 ++++++++
 .../ARM/MCTargetDesc/ARMMCCodeEmitter.cpp     |  26 ++---
 .../lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp |  39 +++++--
 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h  |  69 ++----------
 12 files changed, 214 insertions(+), 200 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 13efd70c0f22..fef7a17ae0b6 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -18,7 +18,7 @@
 #include "ARMTargetMachine.h"
 #include "ARMTargetObjectFile.h"
 #include "MCTargetDesc/ARMInstPrinter.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "TargetInfo/ARMTargetInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/BinaryFormat/COFF.h"
@@ -92,8 +92,7 @@ void ARMAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) {
 
   const MCExpr *E = MCSymbolRefExpr::create(
       GetARMGVSymbol(GV, ARMII::MO_NO_FLAG),
-      (Subtarget->isTargetELF() ? ARMMCExpr::VK_TARGET1 : ARMMCExpr::VK_None),
-      OutContext);
+      (Subtarget->isTargetELF() ? ARM::S_TARGET1 : ARM::S_None), OutContext);
 
   OutStreamer->emitValue(E, Size);
 }
@@ -843,17 +842,17 @@ static MCSymbol *getPICLabel(StringRef Prefix, unsigned FunctionNumber,
 static uint8_t getModifierSpecifier(ARMCP::ARMCPModifier Modifier) {
   switch (Modifier) {
   case ARMCP::no_modifier:
-    return ARMMCExpr::VK_None;
+    return ARM::S_None;
   case ARMCP::TLSGD:
-    return ARMMCExpr::VK_TLSGD;
+    return ARM::S_TLSGD;
   case ARMCP::TPOFF:
-    return ARMMCExpr::VK_TPOFF;
+    return ARM::S_TPOFF;
   case ARMCP::GOTTPOFF:
-    return ARMMCExpr::VK_GOTTPOFF;
+    return ARM::S_GOTTPOFF;
   case ARMCP::SBREL:
-    return ARMMCExpr::VK_SBREL;
+    return ARM::S_SBREL;
   case ARMCP::GOT_PREL:
-    return ARMMCExpr::VK_GOT_PREL;
+    return ARM::S_GOT_PREL;
   case ARMCP::SECREL:
     return MCSymbolRefExpr::VK_SECREL;
   }
diff --git a/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index 6892db6eb52c..b32de6b66058 100644
--- a/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -18,7 +18,7 @@
 #include "ARMSubtarget.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -37,9 +37,9 @@ using namespace llvm;
 
 MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
                                       const MCSymbol *Symbol) {
-  auto Specifier = ARMMCExpr::VK_None;
+  auto Specifier = ARM::S_None;
   if (MO.getTargetFlags() & ARMII::MO_SBREL)
-    Specifier = ARMMCExpr::VK_SBREL;
+    Specifier = ARM::S_SBREL;
 
   const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
   switch (MO.getTargetFlags() & ARMII::MO_OPTION_MASK) {
diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index 131703ec082b..a0a400f93848 100644
--- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -9,7 +9,7 @@
 #include "ARMTargetObjectFile.h"
 #include "ARMSubtarget.h"
 #include "ARMTargetMachine.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -30,7 +30,7 @@ using namespace dwarf;
 //===----------------------------------------------------------------------===//
 
 ARMElfTargetObjectFile::ARMElfTargetObjectFile() {
-  PLTRelativeSpecifier = ARMMCExpr::VK_PREL31;
+  PLTRelativeSpecifier = ARM::S_PREL31;
   SupportIndirectSymViaGOTPCRel = true;
 }
 
@@ -68,14 +68,14 @@ const MCExpr *ARMElfTargetObjectFile::getIndirectSymViaGOTPCRel(
     int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
   int64_t FinalOffset = Offset + MV.getConstant();
   const MCExpr *Res =
-      MCSymbolRefExpr::create(Sym, ARMMCExpr::VK_GOT_PREL, getContext());
+      MCSymbolRefExpr::create(Sym, ARM::S_GOT_PREL, getContext());
   const MCExpr *Off = MCConstantExpr::create(FinalOffset, getContext());
   return MCBinaryExpr::createAdd(Res, Off, getContext());
 }
 
 const MCExpr *ARMElfTargetObjectFile::
 getIndirectSymViaRWPI(const MCSymbol *Sym) const {
-  return MCSymbolRefExpr::create(Sym, ARMMCExpr::VK_SBREL, getContext());
+  return MCSymbolRefExpr::create(Sym, ARM::S_SBREL, getContext());
 }
 
 const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
@@ -87,13 +87,13 @@ const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
 
   assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
 
-  return MCSymbolRefExpr::create(TM.getSymbol(GV), ARMMCExpr::VK_TARGET2,
+  return MCSymbolRefExpr::create(TM.getSymbol(GV), ARM::S_TARGET2,
                                  getContext());
 }
 
 const MCExpr *ARMElfTargetObjectFile::
 getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
-  return MCSymbolRefExpr::create(Sym, ARMMCExpr::VK_TLSLDO, getContext());
+  return MCSymbolRefExpr::create(Sym, ARM::S_TLSLDO, getContext());
 }
 
 static bool isExecuteOnlyFunction(const GlobalObject *GO, SectionKind SK,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 19c417b2c6e9..6e9efe40dc54 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -11,7 +11,7 @@
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "MCTargetDesc/ARMInstPrinter.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "TargetInfo/ARMTargetInfo.h"
 #include "Utils/ARMBaseInfo.h"
@@ -1327,8 +1327,8 @@ public:
       // We want to avoid matching :upper16: and :lower16: as we want these
       // expressions to match in isImm0_65535Expr()
       const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(getImm());
-      return (!ARM16Expr || (ARM16Expr->getSpecifier() != ARMMCExpr::VK_HI16 &&
-                             ARM16Expr->getSpecifier() != ARMMCExpr::VK_LO16));
+      return (!ARM16Expr || (ARM16Expr->getSpecifier() != ARM::S_HI16 &&
+                             ARM16Expr->getSpecifier() != ARM::S_LO16));
     }
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -6473,7 +6473,7 @@ bool ARMAsmParser::parseImmExpr(int64_t &Out) {
 // :upper8_15:, :upper0_7:, :lower8_15: and :lower0_7:
 bool ARMAsmParser::parsePrefix(ARMMCExpr::Specifier &Spec) {
   MCAsmParser &Parser = getParser();
-  Spec = ARMMCExpr::VK_None;
+  Spec = ARM::S_None;
 
   // consume an optional '#' (GNU compatibility)
   if (getLexer().is(AsmToken::Hash))
@@ -6498,12 +6498,12 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::Specifier &Spec) {
     ARMMCExpr::Specifier Spec;
     uint8_t SupportedFormats;
   } PrefixEntries[] = {
-      {"upper16", ARMMCExpr::VK_HI16, COFF | ELF | MACHO},
-      {"lower16", ARMMCExpr::VK_LO16, COFF | ELF | MACHO},
-      {"upper8_15", ARMMCExpr::VK_HI_8_15, ELF},
-      {"upper0_7", ARMMCExpr::VK_HI_0_7, ELF},
-      {"lower8_15", ARMMCExpr::VK_LO_8_15, ELF},
-      {"lower0_7", ARMMCExpr::VK_LO_0_7, ELF},
+      {"upper16", ARM::S_HI16, COFF | ELF | MACHO},
+      {"lower16", ARM::S_LO16, COFF | ELF | MACHO},
+      {"upper8_15", ARM::S_HI_8_15, ELF},
+      {"upper0_7", ARM::S_HI_0_7, ELF},
+      {"lower8_15", ARM::S_LO_8_15, ELF},
+      {"lower0_7", ARM::S_LO_0_7, ELF},
   };
 
   StringRef IDVal = Parser.getTok().getIdentifier();
@@ -6880,10 +6880,10 @@ static bool isThumbI8Relocation(MCParsedAsmOperand &MCOp) {
   if (!E)
     return false;
   const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
-  if (ARM16Expr && (ARM16Expr->getSpecifier() == ARMMCExpr::VK_HI_8_15 ||
-                    ARM16Expr->getSpecifier() == ARMMCExpr::VK_HI_0_7 ||
-                    ARM16Expr->getSpecifier() == ARMMCExpr::VK_LO_8_15 ||
-                    ARM16Expr->getSpecifier() == ARMMCExpr::VK_LO_0_7))
+  if (ARM16Expr && (ARM16Expr->getSpecifier() == ARM::S_HI_8_15 ||
+                    ARM16Expr->getSpecifier() == ARM::S_HI_0_7 ||
+                    ARM16Expr->getSpecifier() == ARM::S_LO_8_15 ||
+                    ARM16Expr->getSpecifier() == ARM::S_LO_0_7))
     return true;
   return false;
 }
@@ -8287,8 +8287,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
     if (!E) break;
     const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
-    if (!ARM16Expr || (ARM16Expr->getSpecifier() != ARMMCExpr::VK_HI16 &&
-                       ARM16Expr->getSpecifier() != ARMMCExpr::VK_LO16))
+    if (!ARM16Expr || (ARM16Expr->getSpecifier() != ARM::S_HI16 &&
+                       ARM16Expr->getSpecifier() != ARM::S_LO16))
       return Error(
           Op.getStartLoc(),
           "immediate expression for mov requires :lower16: or :upper16");
@@ -12437,7 +12437,7 @@ bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) {
 
   auto *Sym = getContext().getOrCreateSymbol(Parser.getTok().getIdentifier());
   const auto *SRE =
-      MCSymbolRefExpr::create(Sym, ARMMCExpr::VK_TLSDESCSEQ, getContext());
+      MCSymbolRefExpr::create(Sym, ARM::S_TLSDESCSEQ, getContext());
   Lex();
 
   if (parseEOL())
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index a7320eea80b0..f43fdae554b8 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -12,7 +12,7 @@
 #include "MCTargetDesc/ARMAsmBackendELF.h"
 #include "MCTargetDesc/ARMAsmBackendWinCOFF.h"
 #include "MCTargetDesc/ARMFixupKinds.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -619,7 +619,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
     // Offset by 8 just as above.
     if (const MCSymbolRefExpr *SRE =
             dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
-      if (SRE->getSpecifier() == ARMMCExpr::VK_TLSCALL)
+      if (SRE->getSpecifier() == ARM::S_TLSCALL)
         return 0;
     return 0xffffff & (Value >> 2);
   case ARM::fixup_t2_uncondbranch: {
@@ -746,7 +746,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
     uint32_t offset = (Value - 4) >> 2;
     if (const MCSymbolRefExpr *SRE =
             dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
-      if (SRE->getSpecifier() == ARMMCExpr::VK_TLSCALL)
+      if (SRE->getSpecifier() == ARM::S_TLSCALL)
         offset = 0;
     uint32_t signBit = (offset & 0x400000) >> 22;
     uint32_t I1Bit = (offset & 0x200000) >> 21;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index f5a6ee5c5a2e..b0ebb74424c7 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/ARMFixupKinds.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
@@ -87,16 +87,16 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
   };
 
   switch (Specifier) {
-  case ARMMCExpr::VK_GOTTPOFF:
-  case ARMMCExpr::VK_GOTTPOFF_FDPIC:
-  case ARMMCExpr::VK_TLSCALL:
-  case ARMMCExpr::VK_TLSDESC:
-  case ARMMCExpr::VK_TLSGD:
-  case ARMMCExpr::VK_TLSGD_FDPIC:
-  case ARMMCExpr::VK_TLSLDM:
-  case ARMMCExpr::VK_TLSLDM_FDPIC:
-  case ARMMCExpr::VK_TLSLDO:
-  case ARMMCExpr::VK_TPOFF:
+  case ARM::S_GOTTPOFF:
+  case ARM::S_GOTTPOFF_FDPIC:
+  case ARM::S_TLSCALL:
+  case ARM::S_TLSDESC:
+  case ARM::S_TLSGD:
+  case ARM::S_TLSGD_FDPIC:
+  case ARM::S_TLSLDM:
+  case ARM::S_TLSLDM_FDPIC:
+  case ARM::S_TLSLDO:
+  case ARM::S_TPOFF:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -115,7 +115,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
         reportError(Fixup.getLoc(),
                     "invalid fixup for 4-byte pc-relative data relocation");
         return ELF::R_ARM_NONE;
-      case ARMMCExpr::VK_None: {
+      case ARM::S_None: {
         if (const auto *SA = Target.getAddSym()) {
           // For GNU AS compatibility expressions such as
           // _GLOBAL_OFFSET_TABLE_ - label emit a R_ARM_BASE_PREL relocation.
@@ -124,19 +124,19 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
         }
         return ELF::R_ARM_REL32;
       }
-      case ARMMCExpr::VK_GOTTPOFF:
+      case ARM::S_GOTTPOFF:
         return ELF::R_ARM_TLS_IE32;
-      case ARMMCExpr::VK_GOT_PREL:
+      case ARM::S_GOT_PREL:
         return ELF::R_ARM_GOT_PREL;
-      case ARMMCExpr::VK_PREL31:
+      case ARM::S_PREL31:
         return ELF::R_ARM_PREL31;
       }
     case ARM::fixup_arm_blx:
     case ARM::fixup_arm_uncondbl:
       switch (Specifier) {
-      case ARMMCExpr::VK_PLT:
+      case ARM::S_PLT:
         return ELF::R_ARM_CALL;
-      case ARMMCExpr::VK_TLSCALL:
+      case ARM::S_TLSCALL:
         return ELF::R_ARM_TLS_CALL;
       default:
         return ELF::R_ARM_CALL;
@@ -172,7 +172,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     case ARM::fixup_arm_thumb_bl:
     case ARM::fixup_arm_thumb_blx:
       switch (Specifier) {
-      case ARMMCExpr::VK_TLSCALL:
+      case ARM::S_TLSCALL:
         return ELF::R_ARM_THM_TLS_CALL;
       default:
         return ELF::R_ARM_THM_CALL;
@@ -206,7 +206,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for 1-byte data relocation");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_ABS8;
     }
   case FK_Data_2:
@@ -214,7 +214,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for 2-byte data relocation");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_ABS16;
     }
   case FK_Data_4:
@@ -222,51 +222,51 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for 4-byte data relocation");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_ARM_NONE:
+    case ARM::S_ARM_NONE:
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_GOT:
+    case ARM::S_GOT:
       return ELF::R_ARM_GOT_BREL;
-    case ARMMCExpr::VK_TLSGD:
+    case ARM::S_TLSGD:
       return ELF::R_ARM_TLS_GD32;
-    case ARMMCExpr::VK_TPOFF:
+    case ARM::S_TPOFF:
       return ELF::R_ARM_TLS_LE32;
-    case ARMMCExpr::VK_GOTTPOFF:
+    case ARM::S_GOTTPOFF:
       return ELF::R_ARM_TLS_IE32;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_ABS32;
-    case ARMMCExpr::VK_GOTOFF:
+    case ARM::S_GOTOFF:
       return ELF::R_ARM_GOTOFF32;
-    case ARMMCExpr::VK_GOT_PREL:
+    case ARM::S_GOT_PREL:
       return ELF::R_ARM_GOT_PREL;
-    case ARMMCExpr::VK_TARGET1:
+    case ARM::S_TARGET1:
       return ELF::R_ARM_TARGET1;
-    case ARMMCExpr::VK_TARGET2:
+    case ARM::S_TARGET2:
       return ELF::R_ARM_TARGET2;
-    case ARMMCExpr::VK_PREL31:
+    case ARM::S_PREL31:
       return ELF::R_ARM_PREL31;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_SBREL32;
-    case ARMMCExpr::VK_TLSLDO:
+    case ARM::S_TLSLDO:
       return ELF::R_ARM_TLS_LDO32;
-    case ARMMCExpr::VK_TLSCALL:
+    case ARM::S_TLSCALL:
       return ELF::R_ARM_TLS_CALL;
-    case ARMMCExpr::VK_TLSDESC:
+    case ARM::S_TLSDESC:
       return ELF::R_ARM_TLS_GOTDESC;
-    case ARMMCExpr::VK_TLSLDM:
+    case ARM::S_TLSLDM:
       return ELF::R_ARM_TLS_LDM32;
-    case ARMMCExpr::VK_TLSDESCSEQ:
+    case ARM::S_TLSDESCSEQ:
       return ELF::R_ARM_TLS_DESCSEQ;
-    case ARMMCExpr::VK_FUNCDESC:
+    case ARM::S_FUNCDESC:
       return CheckFDPIC(ELF::R_ARM_FUNCDESC);
-    case ARMMCExpr::VK_GOTFUNCDESC:
+    case ARM::S_GOTFUNCDESC:
       return CheckFDPIC(ELF::R_ARM_GOTFUNCDESC);
-    case ARMMCExpr::VK_GOTOFFFUNCDESC:
+    case ARM::S_GOTOFFFUNCDESC:
       return CheckFDPIC(ELF::R_ARM_GOTOFFFUNCDESC);
-    case ARMMCExpr::VK_TLSGD_FDPIC:
+    case ARM::S_TLSGD_FDPIC:
       return CheckFDPIC(ELF::R_ARM_TLS_GD32_FDPIC);
-    case ARMMCExpr::VK_TLSLDM_FDPIC:
+    case ARM::S_TLSLDM_FDPIC:
       return CheckFDPIC(ELF::R_ARM_TLS_LDM32_FDPIC);
-    case ARMMCExpr::VK_GOTTPOFF_FDPIC:
+    case ARM::S_GOTTPOFF_FDPIC:
       return CheckFDPIC(ELF::R_ARM_TLS_IE32_FDPIC);
     }
   case ARM::fixup_arm_condbranch:
@@ -277,9 +277,9 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for ARM MOVT instruction");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_MOVT_ABS;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_MOVT_BREL;
     }
   case ARM::fixup_arm_movw_lo16:
@@ -287,9 +287,9 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for ARM MOVW instruction");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_MOVW_ABS_NC;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_MOVW_BREL_NC;
     }
   case ARM::fixup_t2_movt_hi16:
@@ -297,9 +297,9 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for Thumb MOVT instruction");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_THM_MOVT_ABS;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_THM_MOVT_BREL;
     }
   case ARM::fixup_t2_movw_lo16:
@@ -307,9 +307,9 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for Thumb MOVW instruction");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_THM_MOVW_ABS_NC;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_THM_MOVW_BREL_NC;
     }
 
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 2b959768d213..73ad62ed7953 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -14,7 +14,7 @@
 
 #include "ARMMCTargetDesc.h"
 #include "ARMUnwindOpAsm.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "Utils/ARMBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallString.h"
@@ -590,7 +590,7 @@ public:
   /// necessary.
   void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
     if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) {
-      if (SRE->getSpecifier() == ARMMCExpr::VK_SBREL && !(Size == 4)) {
+      if (SRE->getSpecifier() == ARM::S_SBREL && !(Size == 4)) {
         getContext().reportError(Loc, "relocated expression must be 32-bit");
         return;
       }
@@ -1255,7 +1255,7 @@ void ARMELFStreamer::emitFnEnd() {
     EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
 
   const MCSymbolRefExpr *FnStartRef =
-      MCSymbolRefExpr::create(FnStart, ARMMCExpr::VK_PREL31, getContext());
+      MCSymbolRefExpr::create(FnStart, ARM::S_PREL31, getContext());
 
   emitValue(FnStartRef, 4);
 
@@ -1264,7 +1264,7 @@ void ARMELFStreamer::emitFnEnd() {
   } else if (ExTab) {
     // Emit a reference to the unwind opcodes in the ".ARM.extab" section.
     const MCSymbolRefExpr *ExTabEntryRef =
-        MCSymbolRefExpr::create(ExTab, ARMMCExpr::VK_PREL31, getContext());
+        MCSymbolRefExpr::create(ExTab, ARM::S_PREL31, getContext());
     emitValue(ExTabEntryRef, 4);
   } else {
     // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
@@ -1294,8 +1294,8 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
 void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
   const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name);
 
-  const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(
-      PersonalitySym, ARMMCExpr::VK_ARM_NONE, getContext());
+  const MCSymbolRefExpr *PersonalityRef =
+      MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext());
 
   visitUsedExpr(*PersonalityRef);
   MCDataFragment *DF = getOrCreateDataFragment();
@@ -1341,7 +1341,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
   // Emit personality
   if (Personality) {
     const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(
-        Personality, uint16_t(ARMMCExpr::VK_PREL31), getContext());
+        Personality, uint16_t(ARM::S_PREL31), getContext());
 
     emitValue(PersonalityRef, 4);
   }
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 92121dd5704d..f8ec0237dcb5 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -18,30 +18,30 @@
 using namespace llvm;
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {ARMMCExpr::VK_GOT_PREL, "GOT_PREL"},
-    {ARMMCExpr::VK_ARM_NONE, "none"},
-    {ARMMCExpr::VK_PREL31, "prel31"},
-    {ARMMCExpr::VK_SBREL, "sbrel"},
-    {ARMMCExpr::VK_TARGET1, "target1"},
-    {ARMMCExpr::VK_TARGET2, "target2"},
-    {ARMMCExpr::VK_TLSLDO, "TLSLDO"},
+    {ARM::S_GOT_PREL, "GOT_PREL"},
+    {ARM::S_ARM_NONE, "none"},
+    {ARM::S_PREL31, "prel31"},
+    {ARM::S_SBREL, "sbrel"},
+    {ARM::S_TARGET1, "target1"},
+    {ARM::S_TARGET2, "target2"},
+    {ARM::S_TLSLDO, "TLSLDO"},
     {MCSymbolRefExpr::VK_COFF_IMGREL32, "imgrel"},
-    {ARMMCExpr::VK_FUNCDESC, "FUNCDESC"},
-    {ARMMCExpr::VK_GOT, "GOT"},
-    {ARMMCExpr::VK_GOTFUNCDESC, "GOTFUNCDESC"},
-    {ARMMCExpr::VK_GOTOFF, "GOTOFF"},
-    {ARMMCExpr::VK_GOTOFFFUNCDESC, "GOTOFFFUNCDESC"},
-    {ARMMCExpr::VK_GOTTPOFF, "GOTTPOFF"},
-    {ARMMCExpr::VK_GOTTPOFF_FDPIC, "gottpoff_fdpic"},
-    {ARMMCExpr::VK_PLT, "PLT"},
+    {ARM::S_FUNCDESC, "FUNCDESC"},
+    {ARM::S_GOT, "GOT"},
+    {ARM::S_GOTFUNCDESC, "GOTFUNCDESC"},
+    {ARM::S_GOTOFF, "GOTOFF"},
+    {ARM::S_GOTOFFFUNCDESC, "GOTOFFFUNCDESC"},
+    {ARM::S_GOTTPOFF, "GOTTPOFF"},
+    {ARM::S_GOTTPOFF_FDPIC, "gottpoff_fdpic"},
+    {ARM::S_PLT, "PLT"},
     {MCSymbolRefExpr::VK_SECREL, "SECREL32"},
-    {ARMMCExpr::VK_TLSCALL, "tlscall"},
-    {ARMMCExpr::VK_TLSDESC, "tlsdesc"},
-    {ARMMCExpr::VK_TLSGD, "TLSGD"},
-    {ARMMCExpr::VK_TLSGD_FDPIC, "tlsgd_fdpic"},
-    {ARMMCExpr::VK_TLSLDM, "TLSLDM"},
-    {ARMMCExpr::VK_TLSLDM_FDPIC, "tlsldm_fdpic"},
-    {ARMMCExpr::VK_TPOFF, "TPOFF"},
+    {ARM::S_TLSCALL, "tlscall"},
+    {ARM::S_TLSDESC, "tlsdesc"},
+    {ARM::S_TLSGD, "TLSGD"},
+    {ARM::S_TLSGD_FDPIC, "tlsgd_fdpic"},
+    {ARM::S_TLSLDM, "TLSLDM"},
+    {ARM::S_TLSLDM_FDPIC, "tlsldm_fdpic"},
+    {ARM::S_TPOFF, "TPOFF"},
 };
 
 void ARMMCAsmInfoDarwin::anchor() { }
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index 55d7b299674d..baadf74e0d5a 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -13,9 +13,11 @@
 #ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
 #define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
 
+#include "MCTargetDesc/ARMMCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCExpr.h"
 
 namespace llvm {
 class Triple;
@@ -50,6 +52,49 @@ public:
   explicit ARMCOFFMCAsmInfoGNU();
 };
 
+namespace ARM {
+enum {
+  S_None,
+  S_HI16 =
+      MCSymbolRefExpr::FirstTargetSpecifier, // The R_ARM_MOVT_ABS relocation
+                                             // (:upper16: in the .s file)
+  S_LO16, // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
+
+  S_HI_8_15, // The R_ARM_THM_ALU_ABS_G3    relocation (:upper8_15: in
+             // the .s file)
+  S_HI_0_7,  // The R_ARM_THM_ALU_ABS_G2_NC relocation (:upper0_8: in the
+             // .s file)
+  S_LO_8_15, // The R_ARM_THM_ALU_ABS_G1_NC relocation (:lower8_15: in
+             // the .s file)
+  S_LO_0_7,  // The R_ARM_THM_ALU_ABS_G0_NC relocation (:lower0_7: in the
+             // .s file)
+
+  S_ARM_NONE,
+  S_FUNCDESC,
+  S_GOT,
+  S_GOTFUNCDESC,
+  S_GOTOFF,
+  S_GOTOFFFUNCDESC,
+  S_GOTTPOFF,
+  S_GOTTPOFF_FDPIC,
+  S_GOT_PREL,
+  S_PLT,
+  S_PREL31,
+  S_SBREL,
+  S_TARGET1,
+  S_TARGET2,
+  S_TLSCALL,
+  S_TLSDESC,
+  S_TLSDESCSEQ,
+  S_TLSGD,
+  S_TLSGD_FDPIC,
+  S_TLSLDM,
+  S_TLSLDM_FDPIC,
+  S_TLSLDO,
+  S_TPOFF,
+};
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index e79cdbde62ca..f006e00ada32 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -13,7 +13,7 @@
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "MCTargetDesc/ARMFixupKinds.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
@@ -1201,18 +1201,18 @@ uint32_t ARMMCCodeEmitter::getHiLoImmOpValue(const MCInst &MI, unsigned OpIdx,
         report_fatal_error("constant value truncated (limited to 32-bit)");
 
       switch (ARM16Expr->getSpecifier()) {
-      case ARMMCExpr::VK_HI16:
+      case ARM::S_HI16:
         return (int32_t(Value) & 0xffff0000) >> 16;
-      case ARMMCExpr::VK_LO16:
+      case ARM::S_LO16:
         return (int32_t(Value) & 0x0000ffff);
 
-      case ARMMCExpr::VK_HI_8_15:
+      case ARM::S_HI_8_15:
         return (int32_t(Value) & 0xff000000) >> 24;
-      case ARMMCExpr::VK_HI_0_7:
+      case ARM::S_HI_0_7:
         return (int32_t(Value) & 0x00ff0000) >> 16;
-      case ARMMCExpr::VK_LO_8_15:
+      case ARM::S_LO_8_15:
         return (int32_t(Value) & 0x0000ff00) >> 8;
-      case ARMMCExpr::VK_LO_0_7:
+      case ARM::S_LO_0_7:
         return (int32_t(Value) & 0x000000ff);
 
       default: llvm_unreachable("Unsupported ARMFixup");
@@ -1221,30 +1221,30 @@ uint32_t ARMMCCodeEmitter::getHiLoImmOpValue(const MCInst &MI, unsigned OpIdx,
 
     switch (ARM16Expr->getSpecifier()) {
     default: llvm_unreachable("Unsupported ARMFixup");
-    case ARMMCExpr::VK_HI16:
+    case ARM::S_HI16:
       Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movt_hi16
                                       : ARM::fixup_arm_movt_hi16);
       break;
-    case ARMMCExpr::VK_LO16:
+    case ARM::S_LO16:
       Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movw_lo16
                                       : ARM::fixup_arm_movw_lo16);
       break;
-    case ARMMCExpr::VK_HI_8_15:
+    case ARM::S_HI_8_15:
       if (!isThumb(STI))
         llvm_unreachable(":upper_8_15: not supported in Arm state");
       Kind = MCFixupKind(ARM::fixup_arm_thumb_upper_8_15);
       break;
-    case ARMMCExpr::VK_HI_0_7:
+    case ARM::S_HI_0_7:
       if (!isThumb(STI))
         llvm_unreachable(":upper_0_7: not supported in Arm state");
       Kind = MCFixupKind(ARM::fixup_arm_thumb_upper_0_7);
       break;
-    case ARMMCExpr::VK_LO_8_15:
+    case ARM::S_LO_8_15:
       if (!isThumb(STI))
         llvm_unreachable(":lower_8_15: not supported in Arm state");
       Kind = MCFixupKind(ARM::fixup_arm_thumb_lower_8_15);
       break;
-    case ARMMCExpr::VK_LO_0_7:
+    case ARM::S_LO_0_7:
       if (!isThumb(STI))
         llvm_unreachable(":lower_0_7: not supported in Arm state");
       Kind = MCFixupKind(ARM::fixup_arm_thumb_lower_0_7);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index 1035a9e131c4..1e6760a57608 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMMCExpr.h"
+#include "ARMMCAsmInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
@@ -22,22 +23,22 @@ const ARMMCExpr *ARMMCExpr::create(Specifier S, const MCExpr *Expr,
 void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   switch (specifier) {
   default: llvm_unreachable("Invalid kind!");
-  case VK_HI16:
+  case ARM::S_HI16:
     OS << ":upper16:";
     break;
-  case VK_LO16:
+  case ARM::S_LO16:
     OS << ":lower16:";
     break;
-  case VK_HI_8_15:
+  case ARM::S_HI_8_15:
     OS << ":upper8_15:";
     break;
-  case VK_HI_0_7:
+  case ARM::S_HI_0_7:
     OS << ":upper0_7:";
     break;
-  case VK_LO_8_15:
+  case ARM::S_LO_8_15:
     OS << ":lower8_15:";
     break;
-  case VK_LO_0_7:
+  case ARM::S_LO_0_7:
     OS << ":lower0_7:";
     break;
   }
@@ -49,3 +50,29 @@ void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   if (Expr->getKind() != MCExpr::SymbolRef)
     OS << ')';
 }
+
+const ARMMCExpr *ARMMCExpr::createUpper16(const MCExpr *Expr, MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_HI16, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createLower16(const MCExpr *Expr, MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_LO16, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createUpper8_15(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_HI_8_15, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createUpper0_7(const MCExpr *Expr, MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_HI_0_7, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createLower8_15(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_LO_8_15, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createLower0_7(const MCExpr *Expr, MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_LO_0_7, Expr, Ctx);
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index bcd92de3434a..f29d05ba2a88 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -16,46 +16,6 @@ namespace llvm {
 class ARMMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = uint16_t;
-  enum {
-    VK_None,
-    VK_HI16 =
-        MCSymbolRefExpr::FirstTargetSpecifier, // The R_ARM_MOVT_ABS relocation
-                                               // (:upper16: in the .s file)
-    VK_LO16, // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
-
-    VK_HI_8_15, // The R_ARM_THM_ALU_ABS_G3    relocation (:upper8_15: in
-                // the .s file)
-    VK_HI_0_7,  // The R_ARM_THM_ALU_ABS_G2_NC relocation (:upper0_8: in the
-                // .s file)
-    VK_LO_8_15, // The R_ARM_THM_ALU_ABS_G1_NC relocation (:lower8_15: in
-                // the .s file)
-    VK_LO_0_7,  // The R_ARM_THM_ALU_ABS_G0_NC relocation (:lower0_7: in the
-                // .s file)
-
-    VK_ARM_NONE,
-    VK_FUNCDESC,
-    VK_GOT,
-    VK_GOTFUNCDESC,
-    VK_GOTOFF,
-    VK_GOTOFFFUNCDESC,
-    VK_GOTTPOFF,
-    VK_GOTTPOFF_FDPIC,
-    VK_GOT_PREL,
-    VK_PLT,
-    VK_PREL31,
-    VK_SBREL,
-    VK_TARGET1,
-    VK_TARGET2,
-    VK_TLSCALL,
-    VK_TLSDESC,
-    VK_TLSDESCSEQ,
-    VK_TLSGD,
-    VK_TLSGD_FDPIC,
-    VK_TLSLDM,
-    VK_TLSLDM_FDPIC,
-    VK_TLSLDO,
-    VK_TPOFF,
-  };
 
 private:
   explicit ARMMCExpr(Specifier S, const MCExpr *Expr)
@@ -65,29 +25,12 @@ public:
   static const ARMMCExpr *create(Specifier S, const MCExpr *Expr,
                                  MCContext &Ctx);
 
-  static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HI16, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_LO16, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createUpper8_15(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HI_8_15, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createUpper0_7(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HI_0_7, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createLower8_15(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_LO_8_15, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createLower0_7(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_LO_0_7, Expr, Ctx);
-  }
+  static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createUpper8_15(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createUpper0_7(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createLower8_15(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createLower0_7(const MCExpr *Expr, MCContext &Ctx);
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
   bool evaluateAsRelocatableImpl(MCValue &Res,

From 7efc861ec45e05be9dae59fc7483a98510066160 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 16 Jun 2025 07:29:18 +0100
Subject: [PATCH 0464/1322] [AArch64][GlobalISel] Add test coverage for
 fdiv-combine.ll. NFC

---
 llvm/test/CodeGen/AArch64/fdiv-combine.ll | 156 +++++++++++++++-------
 1 file changed, 105 insertions(+), 51 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll
index 0627250d0779..d8f7f0a30668 100644
--- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll
@@ -1,19 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:       warning: Instruction selection used fallback path for splat_fdiv_nxv4f32
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for splat_three_fdiv_nxv4f32
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for splat_fdiv_nxv2f64
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for splat_two_fdiv_nxv2f64
 
 ; Following test cases check:
 ;   a / D; b / D; c / D;
 ;                =>
 ;   recip = 1.0 / D; a * recip; b * recip; c * recip;
 define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
-; CHECK-LABEL: three_fdiv_float:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s4, #1.00000000
-; CHECK-NEXT:    fdiv s4, s4, s0
-; CHECK-NEXT:    fmul s0, s1, s4
-; CHECK-NEXT:    fmul s1, s2, s4
-; CHECK-NEXT:    fmul s2, s3, s4
-; CHECK-NEXT:    b foo_3f
+; CHECK-SD-LABEL: three_fdiv_float:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov s4, #1.00000000
+; CHECK-SD-NEXT:    fdiv s4, s4, s0
+; CHECK-SD-NEXT:    fmul s0, s1, s4
+; CHECK-SD-NEXT:    fmul s1, s2, s4
+; CHECK-SD-NEXT:    fmul s2, s3, s4
+; CHECK-SD-NEXT:    b foo_3f
+;
+; CHECK-GI-LABEL: three_fdiv_float:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fdiv s4, s1, s0
+; CHECK-GI-NEXT:    fdiv s1, s2, s0
+; CHECK-GI-NEXT:    fdiv s2, s3, s0
+; CHECK-GI-NEXT:    fmov s0, s4
+; CHECK-GI-NEXT:    b foo_3f
   %div = fdiv float %a, %D
   %div1 = fdiv float %b, %D
   %div2 = fdiv float %c, %D
@@ -22,14 +36,22 @@ define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
 }
 
 define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
-; CHECK-LABEL: three_fdiv_double:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d4, #1.00000000
-; CHECK-NEXT:    fdiv d4, d4, d0
-; CHECK-NEXT:    fmul d0, d1, d4
-; CHECK-NEXT:    fmul d1, d2, d4
-; CHECK-NEXT:    fmul d2, d3, d4
-; CHECK-NEXT:    b foo_3d
+; CHECK-SD-LABEL: three_fdiv_double:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d4, #1.00000000
+; CHECK-SD-NEXT:    fdiv d4, d4, d0
+; CHECK-SD-NEXT:    fmul d0, d1, d4
+; CHECK-SD-NEXT:    fmul d1, d2, d4
+; CHECK-SD-NEXT:    fmul d2, d3, d4
+; CHECK-SD-NEXT:    b foo_3d
+;
+; CHECK-GI-LABEL: three_fdiv_double:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fdiv d4, d1, d0
+; CHECK-GI-NEXT:    fdiv d1, d2, d0
+; CHECK-GI-NEXT:    fdiv d2, d3, d0
+; CHECK-GI-NEXT:    fmov d0, d4
+; CHECK-GI-NEXT:    b foo_3d
   %div = fdiv double %a, %D
   %div1 = fdiv double %b, %D
   %div2 = fdiv double %c, %D
@@ -38,14 +60,22 @@ define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
 }
 
 define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
-; CHECK-LABEL: three_fdiv_4xfloat:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov v4.4s, #1.00000000
-; CHECK-NEXT:    fdiv v4.4s, v4.4s, v0.4s
-; CHECK-NEXT:    fmul v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    fmul v1.4s, v2.4s, v4.4s
-; CHECK-NEXT:    fmul v2.4s, v3.4s, v4.4s
-; CHECK-NEXT:    b foo_3_4xf
+; CHECK-SD-LABEL: three_fdiv_4xfloat:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov v4.4s, #1.00000000
+; CHECK-SD-NEXT:    fdiv v4.4s, v4.4s, v0.4s
+; CHECK-SD-NEXT:    fmul v0.4s, v1.4s, v4.4s
+; CHECK-SD-NEXT:    fmul v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT:    fmul v2.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT:    b foo_3_4xf
+;
+; CHECK-GI-LABEL: three_fdiv_4xfloat:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fdiv v4.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    fdiv v1.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    fdiv v2.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    b foo_3_4xf
   %div = fdiv <4 x float> %a, %D
   %div1 = fdiv <4 x float> %b, %D
   %div2 = fdiv <4 x float> %c, %D
@@ -54,14 +84,22 @@ define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b,
 }
 
 define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
-; CHECK-LABEL: three_fdiv_2xdouble:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov v4.2d, #1.00000000
-; CHECK-NEXT:    fdiv v4.2d, v4.2d, v0.2d
-; CHECK-NEXT:    fmul v0.2d, v1.2d, v4.2d
-; CHECK-NEXT:    fmul v1.2d, v2.2d, v4.2d
-; CHECK-NEXT:    fmul v2.2d, v3.2d, v4.2d
-; CHECK-NEXT:    b foo_3_2xd
+; CHECK-SD-LABEL: three_fdiv_2xdouble:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov v4.2d, #1.00000000
+; CHECK-SD-NEXT:    fdiv v4.2d, v4.2d, v0.2d
+; CHECK-SD-NEXT:    fmul v0.2d, v1.2d, v4.2d
+; CHECK-SD-NEXT:    fmul v1.2d, v2.2d, v4.2d
+; CHECK-SD-NEXT:    fmul v2.2d, v3.2d, v4.2d
+; CHECK-SD-NEXT:    b foo_3_2xd
+;
+; CHECK-GI-LABEL: three_fdiv_2xdouble:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fdiv v4.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    fdiv v1.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    fdiv v2.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    b foo_3_2xd
   %div = fdiv <2 x double> %a, %D
   %div1 = fdiv <2 x double> %b, %D
   %div2 = fdiv <2 x double> %c, %D
@@ -98,16 +136,25 @@ define void @two_fdiv_double(double %D, double %a, double %b) #0 {
 }
 
 define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
-; CHECK-LABEL: splat_three_fdiv_4xfloat:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    fmov v4.4s, #1.00000000
-; CHECK-NEXT:    dup v0.4s, v0.s[0]
-; CHECK-NEXT:    fdiv v4.4s, v4.4s, v0.4s
-; CHECK-NEXT:    fmul v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    fmul v1.4s, v2.4s, v4.4s
-; CHECK-NEXT:    fmul v2.4s, v3.4s, v4.4s
-; CHECK-NEXT:    b foo_3_4xf
+; CHECK-SD-LABEL: splat_three_fdiv_4xfloat:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    fmov v4.4s, #1.00000000
+; CHECK-SD-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-SD-NEXT:    fdiv v4.4s, v4.4s, v0.4s
+; CHECK-SD-NEXT:    fmul v0.4s, v1.4s, v4.4s
+; CHECK-SD-NEXT:    fmul v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT:    fmul v2.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT:    b foo_3_4xf
+;
+; CHECK-GI-LABEL: splat_three_fdiv_4xfloat:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    dup v4.4s, v0.s[0]
+; CHECK-GI-NEXT:    fdiv v0.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT:    fdiv v1.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    fdiv v2.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    b foo_3_4xf
   %D.ins = insertelement <4 x float> poison, float %D, i64 0
   %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer
   %div = fdiv <4 x float> %a, %splat
@@ -118,14 +165,21 @@ define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b,
 }
 
 define <4 x float> @splat_fdiv_v4f32(float %D, <4 x float> %a) #1 {
-; CHECK-LABEL: splat_fdiv_v4f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    fmov v2.4s, #1.00000000
-; CHECK-NEXT:    dup v0.4s, v0.s[0]
-; CHECK-NEXT:    fdiv v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    fmul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: splat_fdiv_v4f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    fmov v2.4s, #1.00000000
+; CHECK-SD-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-SD-NEXT:    fdiv v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT:    fmul v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: splat_fdiv_v4f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-GI-NEXT:    fdiv v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %D.ins = insertelement <4 x float> poison, float %D, i64 0
   %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer

From f875efe1d82d920790e368f9ab2b31f173a523e1 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Mon, 16 Jun 2025 13:32:44 +0800
Subject: [PATCH 0465/1322] [RISCV] Use `GetVTypeMinimalPredicates` instead of
 `GetVTypePredicates` for vrgatherei16/vslideup/vslidedown. NFC.

---
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 5e554d2d0391..9c03c7c83af0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -4916,8 +4916,8 @@ multiclass VPatBinaryV_VV_INT_EEW<string intrinsic, string instruction,
       defvar emul_str = octuple_to_str<octuple_emul>.ret;
       defvar ivti = !cast<VTypeInfo>("VI" # eew # emul_str);
       defvar inst = instruction # "_VV_" # vti.LMul.MX # "_E" # vti.SEW # "_" # emul_str;
-      let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
-                                   GetVTypePredicates<ivti>.Predicates) in
+      let Predicates = !listconcat(GetVTypeMinimalPredicates<vti>.Predicates,
+                                   GetVTypeMinimalPredicates<ivti>.Predicates) in
       defm : VPatBinary<intrinsic, inst,
                         vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
                         vti.Log2SEW, vti.RegClass,
@@ -5584,7 +5584,7 @@ multiclass VPatTernaryV_VV_AAXA_RM<string intrinsic, string instruction,
 multiclass VPatTernaryV_VX<string intrinsic, string instruction,
                            list<VTypeInfo> vtilist> {
   foreach vti = vtilist in
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatTernaryWithPolicy<intrinsic, instruction, "VX",
                                  vti.Vector, vti.Vector, XLenVT, vti.Mask,
                                  vti.Log2SEW, vti.LMul, vti.RegClass,
@@ -5616,7 +5616,7 @@ multiclass VPatTernaryV_VX_AAXA_RM<string intrinsic, string instruction,
 multiclass VPatTernaryV_VI<string intrinsic, string instruction,
                            list<VTypeInfo> vtilist, Operand Imm_type> {
   foreach vti = vtilist in
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatTernaryWithPolicy<intrinsic, instruction, "VI",
                                  vti.Vector, vti.Vector, XLenVT, vti.Mask,
                                  vti.Log2SEW, vti.LMul, vti.RegClass,
@@ -7414,12 +7414,8 @@ defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllInteger
 defm : VPatBinaryV_VX<"int_riscv_vslide1up", "PseudoVSLIDE1UP", AllIntegerVectors>;
 defm : VPatBinaryV_VX<"int_riscv_vslide1down", "PseudoVSLIDE1DOWN", AllIntegerVectors>;
 
-defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectorsExceptFP16, uimm5>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFP16Vectors, uimm5>;
-defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectorsExceptFP16, uimm5>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFP16Vectors, uimm5>;
+defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectors, uimm5>;
+defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectors, uimm5>;
 defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP", AllFloatVectors>;
 defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVectors>;
 
@@ -7436,10 +7432,7 @@ defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
 defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
                                 AllBFloatVectors, uimm5>;
 defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
-                              eew=16, vtilist=AllFloatVectorsExceptFP16>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
-                                eew=16, vtilist=AllFP16Vectors>;
+                              eew=16, vtilist=AllFloatVectors>;
 //===----------------------------------------------------------------------===//
 // 16.5. Vector Compress Instruction
 //===----------------------------------------------------------------------===//

From 7d9a451d875368baece310ca7226e3adbc00e1bf Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Mon, 16 Jun 2025 12:28:12 +0530
Subject: [PATCH 0466/1322] [RISCV] Change input register type for QC_SWM and
 QC_SWMI (#144294)

Version 0.13 of the `Xqci` spec changes the register type of input
operand `rs3` from `GPR` to `GPRNoX0` for these two instructions.

The spec can be found at
https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0
---
 llvm/docs/RISCVUsage.rst                    |  2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 17 +++++++++--------
 llvm/test/MC/RISCV/xqcilsm-invalid.s        |  8 ++++++++
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 7d0d0cc21a27..64f17f59575e 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -487,7 +487,7 @@ The current vendor extensions supported are:
   LLVM implements `version 0.3 of the Qualcomm uC Large Offset Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilsm``
-  LLVM implements `version 0.5 of the Qualcomm uC Load Store Multiple extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.6 of the Qualcomm uC Load Store Multiple extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcisim``
   LLVM implements `version 0.2 of the Qualcomm uC Simulation Hint extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 9f96a3ed8056..b94fee3c6e57 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -574,9 +574,10 @@ class QCILoadMultiple<bits<2> funct2, DAGOperand InTyRs2, string opcodestr>
 
 // rd corresponds to the source for the store 'rs3' described in the spec.
 let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
-class QCIStoreMultiple<bits<2> funct2, DAGOperand InTyRs2, string opcodestr>
+class QCIStoreMultiple<bits<2> funct2, DAGOperand InTyRd, DAGOperand InTyRs2,
+                       string opcodestr>
     : RVInstRBase<0b111, OPC_CUSTOM_1, (outs),
-                  (ins GPR:$rd, GPR:$rs1, InTyRs2:$rs2, uimm7_lsb00:$imm),
+                  (ins InTyRd:$rd, GPR:$rs1, InTyRs2:$rs2, uimm7_lsb00:$imm),
                   opcodestr, "$rd, $rs2, ${imm}(${rs1})"> {
   bits<7> imm;
   let Inst{31-25} = {funct2, imm{6-2}};
@@ -967,10 +968,10 @@ let Predicates = [HasVendorXqcics, IsRV32] in {
 } // Predicates = [HasVendorXqcics, IsRV32]
 
 let Predicates = [HasVendorXqcilsm, IsRV32] in {
-    def QC_SWM : QCIStoreMultiple<0b00, GPRNoX0, "qc.swm">;
-    def QC_SWMI : QCIStoreMultiple<0b01, uimm5nonzero, "qc.swmi">;
-    def QC_SETWM : QCIStoreMultiple<0b10, GPRNoX0, "qc.setwm">;
-    def QC_SETWMI : QCIStoreMultiple<0b11, uimm5nonzero, "qc.setwmi">;
+    def QC_SWM : QCIStoreMultiple<0b00, GPRNoX0, GPRNoX0, "qc.swm">;
+    def QC_SWMI : QCIStoreMultiple<0b01, GPRNoX0, uimm5nonzero, "qc.swmi">;
+    def QC_SETWM : QCIStoreMultiple<0b10, GPR, GPRNoX0, "qc.setwm">;
+    def QC_SETWMI : QCIStoreMultiple<0b11, GPR, uimm5nonzero, "qc.setwmi">;
 
     def QC_LWM : QCILoadMultiple<0b00, GPRNoX0, "qc.lwm">;
     def QC_LWMI : QCILoadMultiple<0b01, uimm5nonzero, "qc.lwmi">;
@@ -1211,9 +1212,9 @@ let EmitPriority = 0 in {
 let Predicates = [HasVendorXqcilsm, IsRV32] in {
 let EmitPriority = 0 in {
   def : InstAlias<"qc.swm $rs3, $rs2, (${rs1})",
-                  (QC_SWM GPR:$rs3, GPR:$rs1, GPRNoX0:$rs2, 0)>;
+                  (QC_SWM GPRNoX0:$rs3, GPR:$rs1, GPRNoX0:$rs2, 0)>;
   def : InstAlias<"qc.swmi $rs3, $length, (${rs1})",
-                  (QC_SWMI GPR:$rs3, GPR:$rs1, uimm5nonzero:$length, 0)>;
+                  (QC_SWMI GPRNoX0:$rs3, GPR:$rs1, uimm5nonzero:$length, 0)>;
   def : InstAlias<"qc.setwm $rs3, $rs2, (${rs1})",
                   (QC_SETWM GPR:$rs3, GPR:$rs1, GPRNoX0:$rs2, 0)>;
   def : InstAlias<"qc.setwmi $rs3, $length, (${rs1})",
diff --git a/llvm/test/MC/RISCV/xqcilsm-invalid.s b/llvm/test/MC/RISCV/xqcilsm-invalid.s
index 15d55021d64e..a3421db0eff4 100644
--- a/llvm/test/MC/RISCV/xqcilsm-invalid.s
+++ b/llvm/test/MC/RISCV/xqcilsm-invalid.s
@@ -7,6 +7,10 @@
 # CHECK: :[[@LINE+1]]:20: error: expected register
 qc.swm x5, x20, 12(20)
 
+# CHECK-PLUS: :[[@LINE+2]]:8: error: register must be a GPR excluding zero (x0)
+# CHECK-MINUS: :[[@LINE+1]]:8: error: invalid operand for instruction
+qc.swm x0, x20, 12(x3)
+
 # CHECK-PLUS: :[[@LINE+2]]:12: error: register must be a GPR excluding zero (x0)
 # CHECK-MINUS: :[[@LINE+1]]:12: error: invalid operand for instruction
 qc.swm x5, x0, 12(x3)
@@ -24,6 +28,10 @@ qc.swm x5, x20, 12(x3)
 # CHECK: :[[@LINE+1]]:20: error: expected register
 qc.swmi x10, 4, 20(4)
 
+# CHECK-PLUS: :[[@LINE+2]]:9: error: register must be a GPR excluding zero (x0)
+# CHECK-MINUS: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.swmi x0, 4, 20(x4)
+
 # CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
 qc.swmi x10, 4, 20
 

From 222ab28a9240e03479341cba2f487b8350635fce Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@quicinc.com>
Date: Mon, 16 Jun 2025 00:15:06 -0700
Subject: [PATCH 0467/1322] [aarch64] Fix Arm64EC libcall lowering after recent
 refactoring. (#143977)

The refactored code accidentally tokenized a string instead of just
concatenating it.

Add a regression test and some assertions to ensure consistency.

Fixes #143890 .
---
 llvm/include/llvm/IR/RuntimeLibcalls.def      | 48 +++++++++----------
 llvm/lib/IR/RuntimeLibcalls.cpp               | 15 ++++--
 .../WebAssemblyRuntimeLibcallSignatures.cpp   |  2 +
 llvm/test/CodeGen/AArch64/arm64ec-builtins.ll | 45 +++++++++++++++++
 4 files changed, 82 insertions(+), 28 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64ec-builtins.ll

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index 4ddae8e48193..247643525ff4 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -70,16 +70,16 @@ HANDLE_LIBCALL(UREM_I16, "__umodhi3")
 HANDLE_LIBCALL(UREM_I32, "__umodsi3")
 HANDLE_LIBCALL(UREM_I64, "__umoddi3")
 HANDLE_LIBCALL(UREM_I128, "__umodti3")
-HANDLE_LIBCALL(SDIVREM_I8, nullptr)
-HANDLE_LIBCALL(SDIVREM_I16, nullptr)
-HANDLE_LIBCALL(SDIVREM_I32, nullptr)
-HANDLE_LIBCALL(SDIVREM_I64, nullptr)
-HANDLE_LIBCALL(SDIVREM_I128, nullptr)
-HANDLE_LIBCALL(UDIVREM_I8, nullptr)
-HANDLE_LIBCALL(UDIVREM_I16, nullptr)
-HANDLE_LIBCALL(UDIVREM_I32, nullptr)
-HANDLE_LIBCALL(UDIVREM_I64, nullptr)
-HANDLE_LIBCALL(UDIVREM_I128, nullptr)
+HANDLE_LIBCALL(SDIVREM_I8, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SDIVREM_I16, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SDIVREM_I32, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SDIVREM_I64, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SDIVREM_I128, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I8, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I16, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I32, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I64, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I128, LIBCALL_NO_NAME)
 HANDLE_LIBCALL(NEG_I32, "__negsi2")
 HANDLE_LIBCALL(NEG_I64, "__negdi2")
 HANDLE_LIBCALL(CTLZ_I32, "__clzsi2")
@@ -240,13 +240,13 @@ HANDLE_LIBCALL(ATAN2_F64, "atan2")
 HANDLE_LIBCALL(ATAN2_F80, "atan2l")
 HANDLE_LIBCALL(ATAN2_F128,"atan2l")
 HANDLE_LIBCALL(ATAN2_PPCF128, "atan2l")
-HANDLE_LIBCALL(SINCOS_F32, nullptr)
-HANDLE_LIBCALL(SINCOS_F64, nullptr)
-HANDLE_LIBCALL(SINCOS_F80, nullptr)
-HANDLE_LIBCALL(SINCOS_F128, nullptr)
-HANDLE_LIBCALL(SINCOS_PPCF128, nullptr)
-HANDLE_LIBCALL(SINCOS_STRET_F32, nullptr)
-HANDLE_LIBCALL(SINCOS_STRET_F64, nullptr)
+HANDLE_LIBCALL(SINCOS_F32, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_F64, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_F80, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_F128, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_PPCF128, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_STRET_F32, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_STRET_F64, LIBCALL_NO_NAME)
 HANDLE_LIBCALL(POW_F32, "powf")
 HANDLE_LIBCALL(POW_F64, "pow")
 HANDLE_LIBCALL(POW_F80, "powl")
@@ -518,7 +518,7 @@ HANDLE_LIBCALL(MEMMOVE, "memmove")
 HANDLE_LIBCALL(MEMSET, "memset")
 // DSEPass can emit calloc if it finds a pair of malloc/memset
 HANDLE_LIBCALL(CALLOC, "calloc")
-HANDLE_LIBCALL(BZERO, nullptr)
+HANDLE_LIBCALL(BZERO, LIBCALL_NO_NAME)
 
 // Element-wise unordered-atomic memory of different sizes
 HANDLE_LIBCALL(MEMCPY_ELEMENT_UNORDERED_ATOMIC_1, "__llvm_memcpy_element_unordered_atomic_1")
@@ -669,10 +669,10 @@ HANDLE_LIBCALL(ATOMIC_FETCH_NAND_16, "__atomic_fetch_nand_16")
 
 // Out-of-line atomics libcalls
 #define HLCALLS(A, N)                                                          \
-  HANDLE_LIBCALL(A##N##_RELAX, nullptr)                                        \
-  HANDLE_LIBCALL(A##N##_ACQ, nullptr)                                          \
-  HANDLE_LIBCALL(A##N##_REL, nullptr)                                          \
-  HANDLE_LIBCALL(A##N##_ACQ_REL, nullptr)
+  HANDLE_LIBCALL(A##N##_RELAX, LIBCALL_NO_NAME)                                \
+  HANDLE_LIBCALL(A##N##_ACQ, LIBCALL_NO_NAME)                                  \
+  HANDLE_LIBCALL(A##N##_REL, LIBCALL_NO_NAME)                                  \
+  HANDLE_LIBCALL(A##N##_ACQ_REL, LIBCALL_NO_NAME)
 #define HLCALL5(A)                                                             \
   HLCALLS(A, 1) HLCALLS(A, 2) HLCALLS(A, 4) HLCALLS(A, 8) HLCALLS(A, 16)
 HLCALL5(OUTLINE_ATOMIC_CAS)
@@ -691,11 +691,11 @@ HANDLE_LIBCALL(STACKPROTECTOR_CHECK_FAIL, "__stack_chk_fail")
 HANDLE_LIBCALL(DEOPTIMIZE, "__llvm_deoptimize")
 
 // Return address
-HANDLE_LIBCALL(RETURN_ADDRESS, nullptr)
+HANDLE_LIBCALL(RETURN_ADDRESS, LIBCALL_NO_NAME)
 
 // Clear cache
 HANDLE_LIBCALL(CLEAR_CACHE, "__clear_cache")
 HANDLE_LIBCALL(RISCV_FLUSH_ICACHE, "__riscv_flush_icache")
 
-HANDLE_LIBCALL(UNKNOWN_LIBCALL, nullptr)
+HANDLE_LIBCALL(UNKNOWN_LIBCALL, LIBCALL_NO_NAME)
 
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index d655f84b37c5..d63d398e243f 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -21,13 +21,17 @@ static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
   if (TT.isWindowsArm64EC()) {
     // FIXME: are there calls we need to exclude from this?
 #define HANDLE_LIBCALL(code, name)                                             \
-  {                                                                            \
+  if (sizeof(name) != 1) {                                                     \
     const char *libcallName = Info.getLibcallName(RTLIB::code);                \
-    if (libcallName && libcallName[0] != '#')                                  \
-      Info.setLibcallName(RTLIB::code, "#" #name);                             \
+    if (libcallName && libcallName[0] != '#') {                                \
+      assert(strcmp(libcallName, name) == 0 && "Unexpected name");             \
+      Info.setLibcallName(RTLIB::code, "#" name);                              \
+    }                                                                          \
   }
+#define LIBCALL_NO_NAME ""
 #include "llvm/IR/RuntimeLibcalls.def"
 #undef HANDLE_LIBCALL
+#undef LIBCALL_NO_NAME
   }
 }
 
@@ -223,8 +227,10 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
             nullptr);
 
 #define HANDLE_LIBCALL(code, name) setLibcallName(RTLIB::code, name);
+#define LIBCALL_NO_NAME nullptr
 #include "llvm/IR/RuntimeLibcalls.def"
 #undef HANDLE_LIBCALL
+#undef LIBCALL_NO_NAME
 
   // Initialize calling conventions to their default.
   for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
@@ -462,7 +468,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
   }
 
   // Setup Windows compiler runtime calls.
-  if (TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment()) {
+  if (TT.getArch() == Triple::x86 &&
+      (TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment())) {
     static const struct {
       const RTLIB::Libcall Op;
       const char *const Name;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index ce795d3dedc6..d5c4532824c0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -531,8 +531,10 @@ struct StaticLibcallNameMap {
   StaticLibcallNameMap() {
     static const std::pair<const char *, RTLIB::Libcall> NameLibcalls[] = {
 #define HANDLE_LIBCALL(code, name) {(const char *)name, RTLIB::code},
+#define LIBCALL_NO_NAME nullptr
 #include "llvm/IR/RuntimeLibcalls.def"
 #undef HANDLE_LIBCALL
+#undef LIBCALL_NO_NAME
     };
     for (const auto &NameLibcall : NameLibcalls) {
       if (NameLibcall.first != nullptr &&
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
new file mode 100644
index 000000000000..92b95a90d89a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s
+
+define void @f1(ptr %p, i64 %n) {
+; CHECK-LABEL: "#f1":
+; CHECK: bl "#memset"
+  call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 %n, i1 false)
+  ret void
+}
+
+define void @f2(ptr %p1, ptr %p2, i64 %n) {
+; CHECK-LABEL: "#f2":
+; CHECK: bl "#memcpy"
+  call void @llvm.memcpy.p0.i64(ptr %p1, ptr %p2, i64 %n, i1 false)
+  ret void
+}
+
+define double @f3(double %x, double %y) {
+; CHECK-LABEL: "#f3":
+; CHECK: b "#fmod"
+  %r = frem double %x, %y
+  ret double %r
+}
+
+define i128 @f4(i128 %x, i128 %y) {
+; CHECK-LABEL: "#f4":
+; CHECK: bl "#__divti3"
+  %r = sdiv i128 %x, %y
+  ret i128 %r
+}
+
+; FIXME: This is wrong; should be "#__aarch64_cas1_relax"
+define i8 @f5(i8 %expected, i8 %new, ptr %ptr) "target-features"="+outline-atomics" {
+; CHECK-LABEL: "#f5":
+; CHECK: bl __aarch64_cas1_relax
+    %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1
+   %r = extractvalue { i8, i1 } %pair, 0
+    ret i8 %r
+}
+
+define float @f6(float %val, i32 %a) {
+; CHECK-LABEL: "#f6":
+; CHECK: bl "#ldexp"
+  %call = tail call fast float @llvm.ldexp.f32(float %val, i32 %a)
+  ret float %call
+}

From 9fcd14d9b013d0c4b8ec245772b3be3d5c31b885 Mon Sep 17 00:00:00 2001
From: Henrich Lauko <xlauko@mail.muni.cz>
Date: Mon, 16 Jun 2025 09:21:05 +0200
Subject: [PATCH 0468/1322] [MLIR][ODS] Optionally generate public C++
 functions for attribute constraints (#144275)

Add `gen-attr-constraint-decls` and `gen-attr-constraint-defs`, which
generate public C++ functions for attribute constraints. The name of the C++
function is specified in the `cppFunctionName` field.

This generalize `cppFunctionName` from `TypeConstraint` introduced in
 https://github.com/llvm/llvm-project/pull/104577 to be usable also in `AttrConstraint`.
---
 mlir/docs/DefiningDialects/Constraints.md   | 23 ++---
 mlir/include/mlir/IR/Constraints.td         | 19 +++--
 mlir/test/mlir-tblgen/attr-constraints.td   | 14 ++++
 mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 93 +++++++++++++++++----
 4 files changed, 115 insertions(+), 34 deletions(-)
 create mode 100644 mlir/test/mlir-tblgen/attr-constraints.td

diff --git a/mlir/docs/DefiningDialects/Constraints.md b/mlir/docs/DefiningDialects/Constraints.md
index 52a4283d6084..40863e7aecf4 100644
--- a/mlir/docs/DefiningDialects/Constraints.md
+++ b/mlir/docs/DefiningDialects/Constraints.md
@@ -24,8 +24,8 @@ code is generated for type/attribute constraints. Type constraints can not only
 be used when defining operation arguments, but also when defining type
 parameters.
 
-Optionally, C++ functions can be generated, so that type constraints can be
-checked from C++. The name of the C++ function must be specified in the
+Optionally, C++ functions can be generated, so that type/attribute constraints
+can be checked from C++. The name of the C++ function must be specified in the
 `cppFunctionName` field. If no function name is specified, no C++ function is
 emitted.
 
@@ -43,17 +43,20 @@ bool isValidVectorTypeElementType(::mlir::Type type) {
 }
 ```
 
-An extra TableGen rule is needed to emit C++ code for type constraints. This
-will generate only the declarations/definitions of the type constaraints that
-are defined in the specified `.td` file, but not those that are in included
-`.td` files.
+An extra TableGen rule is needed to emit C++ code for type/attribute
+constraints. This will generate only the declarations/definitions of the
+type/attribute constaraints that are defined in the specified `.td` file, but
+not those that are in included `.td` files.
 
 ```cmake
 mlir_tablegen(<Your Dialect>TypeConstraints.h.inc -gen-type-constraint-decls)
 mlir_tablegen(<Your Dialect>TypeConstraints.cpp.inc -gen-type-constraint-defs)
+mlir_tablegen(<Your Dialect>AttrConstraints.h.inc -gen-attr-constraint-decls)
+mlir_tablegen(<Your Dialect>AttrConstraints.cpp.inc -gen-attr-constraint-defs)
 ```
 
-The generated `<Your Dialect>TypeConstraints.h.inc` will need to be included
-whereever you are referencing the type constraint in C++. Note that no C++
-namespace will be emitted by the code generator. The `#include` statements of
-the `.h.inc`/`.cpp.inc` files should be wrapped in C++ namespaces by the user.
+The generated `<Your Dialect>TypeConstraints.h.inc` respectivelly
+`<Your Dialect>AttrConstraints.h.inc` will need to be included whereever you are
+referencing the type/attributes constraint in C++. Note that no C++ namespace
+will be emitted by the code generator. The `#include` statements of the
+`.h.inc`/`.cpp.inc` files should be wrapped in C++ namespaces by the user.
diff --git a/mlir/include/mlir/IR/Constraints.td b/mlir/include/mlir/IR/Constraints.td
index 33e8581ecd35..0d59fffce9df 100644
--- a/mlir/include/mlir/IR/Constraints.td
+++ b/mlir/include/mlir/IR/Constraints.td
@@ -148,6 +148,15 @@ class Constraint<Pred pred, string desc = ""> {
   string summary = desc;
 }
 
+// Base class for constraints on types and attributes.
+class AttrTypeConstraint<Pred pred, string summary = "",
+                         string cppFunctionNameParam = ""> :
+    Constraint<pred, summary> {
+  // The name of the C++ function that is generated for this constraint.
+  // If empty, no C++ function is generated.
+  string cppFunctionName = cppFunctionNameParam;
+}
+
 // Subclasses used to differentiate different constraint kinds. These are used
 // as markers for the TableGen backend to handle different constraint kinds
 // differently if needed. Constraints not deriving from the following subclasses
@@ -157,17 +166,15 @@ class Constraint<Pred pred, string desc = ""> {
 class TypeConstraint<Pred predicate, string summary = "",
                      string cppTypeParam = "::mlir::Type",
                      string cppFunctionNameParam = ""> :
-    Constraint<predicate, summary> {
+    AttrTypeConstraint<predicate, summary, cppFunctionNameParam> {
   // The name of the C++ Type class if known, or Type if not.
   string cppType = cppTypeParam;
-  // The name of the C++ function that is generated for this type constraint.
-  // If empty, no C++ function is generated.
-  string cppFunctionName = cppFunctionNameParam;
 }
 
 // Subclass for constraints on an attribute.
-class AttrConstraint<Pred predicate, string summary = ""> :
-    Constraint<predicate, summary>;
+class AttrConstraint<Pred predicate, string summary = "",
+                     string cppFunctionNameParam = ""> :
+    AttrTypeConstraint<predicate, summary, cppFunctionNameParam>;
 
 // Subclass for constraints on a property.
 class PropConstraint<Pred predicate, string summary = "", string interfaceTypeParam = ""> :
diff --git a/mlir/test/mlir-tblgen/attr-constraints.td b/mlir/test/mlir-tblgen/attr-constraints.td
new file mode 100644
index 000000000000..59bc5f252660
--- /dev/null
+++ b/mlir/test/mlir-tblgen/attr-constraints.td
@@ -0,0 +1,14 @@
+// RUN: mlir-tblgen -gen-attr-constraint-decls -I %S/../../include %s | FileCheck %s --check-prefix=DECL
+// RUN: mlir-tblgen -gen-attr-constraint-defs -I %S/../../include %s | FileCheck %s --check-prefix=DEF
+
+include "mlir/IR/CommonAttrConstraints.td"
+
+def DummyConstraint : AnyAttrOf<[APIntAttr, ArrayAttr, UnitAttr]> {
+  let cppFunctionName = "isValidDummy";
+}
+
+// DECL: bool isValidDummy(::mlir::Attribute attr);
+
+// DEF: bool isValidDummy(::mlir::Attribute attr) {
+// DEF:   return (((::llvm::isa<::mlir::IntegerAttr>(attr))) || ((::llvm::isa<::mlir::ArrayAttr>(attr))) || ((::llvm::isa<::mlir::UnitAttr>(attr))));
+// DEF: }
diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
index 2a6071602fa4..defd1fa12ca1 100644
--- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
+++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
@@ -1083,15 +1083,15 @@ bool DefGenerator::emitDefs(StringRef selectedDialect) {
 }
 
 //===----------------------------------------------------------------------===//
-// Type Constraints
+// Constraints
 //===----------------------------------------------------------------------===//
 
 /// Find all type constraints for which a C++ function should be generated.
-static std::vector<Constraint>
-getAllTypeConstraints(const RecordKeeper &records) {
+static std::vector<Constraint> getAllCppConstraints(const RecordKeeper &records,
+                                                    StringRef constraintKind) {
   std::vector<Constraint> result;
   for (const Record *def :
-       records.getAllDerivedDefinitionsIfDefined("TypeConstraint")) {
+       records.getAllDerivedDefinitionsIfDefined(constraintKind)) {
     // Ignore constraints defined outside of the top-level file.
     if (llvm::SrcMgr.FindBufferContainingLoc(def->getLoc()[0]) !=
         llvm::SrcMgr.getMainFileID())
@@ -1105,30 +1105,72 @@ getAllTypeConstraints(const RecordKeeper &records) {
   return result;
 }
 
+static std::vector<Constraint>
+getAllCppTypeConstraints(const RecordKeeper &records) {
+  return getAllCppConstraints(records, "TypeConstraint");
+}
+
+static std::vector<Constraint>
+getAllCppAttrConstraints(const RecordKeeper &records) {
+  return getAllCppConstraints(records, "AttrConstraint");
+}
+
+/// Emit the declarations for the given constraints, of the form:
+/// `bool <constraintCppFunctionName>(<parameterTypeName> <parameterName>);`
+static void emitConstraintDecls(const std::vector<Constraint> &constraints,
+                                raw_ostream &os, StringRef parameterTypeName,
+                                StringRef parameterName) {
+  static const char *const constraintDecl = "bool {0}({1} {2});\n";
+  for (Constraint constr : constraints)
+    os << strfmt(constraintDecl, *constr.getCppFunctionName(),
+                 parameterTypeName, parameterName);
+}
+
 static void emitTypeConstraintDecls(const RecordKeeper &records,
                                     raw_ostream &os) {
-  static const char *const typeConstraintDecl = R"(
-bool {0}(::mlir::Type type);
+  emitConstraintDecls(getAllCppTypeConstraints(records), os, "::mlir::Type",
+                      "type");
+}
+
+static void emitAttrConstraintDecls(const RecordKeeper &records,
+                                    raw_ostream &os) {
+  emitConstraintDecls(getAllCppAttrConstraints(records), os,
+                      "::mlir::Attribute", "attr");
+}
+
+/// Emit the definitions for the given constraints, of the form:
+/// `bool <constraintCppFunctionName>(<parameterTypeName> <parameterName>) {
+///   return (<condition>); }`
+/// where `<condition>` is the condition template with the `self` variable
+/// replaced with the `selfName` parameter.
+static void emitConstraintDefs(const std::vector<Constraint> &constraints,
+                               raw_ostream &os, StringRef parameterTypeName,
+                               StringRef selfName) {
+  static const char *const constraintDef = R"(
+bool {0}({1} {2}) {
+return ({3});
+}
 )";
 
-  for (Constraint constr : getAllTypeConstraints(records))
-    os << strfmt(typeConstraintDecl, *constr.getCppFunctionName());
+  for (Constraint constr : constraints) {
+    FmtContext ctx;
+    ctx.withSelf(selfName);
+    std::string condition = tgfmt(constr.getConditionTemplate(), &ctx);
+    os << strfmt(constraintDef, *constr.getCppFunctionName(), parameterTypeName,
+                 selfName, condition);
+  }
 }
 
 static void emitTypeConstraintDefs(const RecordKeeper &records,
                                    raw_ostream &os) {
-  static const char *const typeConstraintDef = R"(
-bool {0}(::mlir::Type type) {
-  return ({1});
+  emitConstraintDefs(getAllCppTypeConstraints(records), os, "::mlir::Type",
+                     "type");
 }
-)";
 
-  for (Constraint constr : getAllTypeConstraints(records)) {
-    FmtContext ctx;
-    ctx.withSelf("type");
-    std::string condition = tgfmt(constr.getConditionTemplate(), &ctx);
-    os << strfmt(typeConstraintDef, *constr.getCppFunctionName(), condition);
-  }
+static void emitAttrConstraintDefs(const RecordKeeper &records,
+                                   raw_ostream &os) {
+  emitConstraintDefs(getAllCppAttrConstraints(records), os, "::mlir::Attribute",
+                     "attr");
 }
 
 //===----------------------------------------------------------------------===//
@@ -1158,6 +1200,21 @@ static mlir::GenRegistration
                    return generator.emitDecls(attrDialect);
                  });
 
+static mlir::GenRegistration
+    genAttrConstrDefs("gen-attr-constraint-defs",
+                      "Generate attribute constraint definitions",
+                      [](const RecordKeeper &records, raw_ostream &os) {
+                        emitAttrConstraintDefs(records, os);
+                        return false;
+                      });
+static mlir::GenRegistration
+    genAttrConstrDecls("gen-attr-constraint-decls",
+                       "Generate attribute constraint declarations",
+                       [](const RecordKeeper &records, raw_ostream &os) {
+                         emitAttrConstraintDecls(records, os);
+                         return false;
+                       });
+
 //===----------------------------------------------------------------------===//
 // TypeDef
 //===----------------------------------------------------------------------===//

From 0bb4d9c30207c4a69731e6848ba7cb6ef52b5906 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 00:21:13 -0700
Subject: [PATCH 0469/1322] ARM: Migrate to the new relocation specifier
 representation

Use MCSpecifierExpr directly and remove the ARMMCExpr subclass. Define
printImpl and evaluateAsRelocationImpl within ARM*MCAsmInfo classes.
While there is some duplication, it enables better separation for
object file formats.
---
 llvm/lib/Target/ARM/ARMAsmPrinter.cpp         | 30 ++++---
 llvm/lib/Target/ARM/ARMMCInstLower.cpp        | 12 +--
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 17 ++--
 .../Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp  | 61 ++++++++++++++-
 .../Target/ARM/MCTargetDesc/ARMMCAsmInfo.h    | 46 ++++++++++-
 .../ARM/MCTargetDesc/ARMMCCodeEmitter.cpp     |  2 +-
 .../lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp | 78 -------------------
 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h  | 43 ----------
 .../MCTargetDesc/ARMMachORelocationInfo.cpp   |  6 +-
 .../Target/ARM/MCTargetDesc/CMakeLists.txt    |  1 -
 .../llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn |  1 -
 11 files changed, 142 insertions(+), 155 deletions(-)
 delete mode 100644 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
 delete mode 100644 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h

diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index fef7a17ae0b6..fa1437002551 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1619,12 +1619,15 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
                     MI->getOperand(2).getImm(), OutContext);
     const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
     unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
-    const MCExpr *PCRelExpr =
-      ARMMCExpr::createLower16(MCBinaryExpr::createSub(GVSymExpr,
-                                      MCBinaryExpr::createAdd(LabelSymExpr,
-                                      MCConstantExpr::create(PCAdj, OutContext),
-                                      OutContext), OutContext), OutContext);
-      TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
+    const MCExpr *PCRelExpr = ARM::createLower16(
+        MCBinaryExpr::createSub(
+            GVSymExpr,
+            MCBinaryExpr::createAdd(LabelSymExpr,
+                                    MCConstantExpr::create(PCAdj, OutContext),
+                                    OutContext),
+            OutContext),
+        OutContext);
+    TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
 
     // Add predicate operands.
     TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
@@ -1652,12 +1655,15 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
                     MI->getOperand(3).getImm(), OutContext);
     const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
     unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
-    const MCExpr *PCRelExpr =
-        ARMMCExpr::createUpper16(MCBinaryExpr::createSub(GVSymExpr,
-                                   MCBinaryExpr::createAdd(LabelSymExpr,
-                                      MCConstantExpr::create(PCAdj, OutContext),
-                                          OutContext), OutContext), OutContext);
-      TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
+    const MCExpr *PCRelExpr = ARM::createUpper16(
+        MCBinaryExpr::createSub(
+            GVSymExpr,
+            MCBinaryExpr::createAdd(LabelSymExpr,
+                                    MCConstantExpr::create(PCAdj, OutContext),
+                                    OutContext),
+            OutContext),
+        OutContext);
+    TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
     // Add predicate operands.
     TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
     TmpInst.addOperand(MCOperand::createReg(0));
diff --git a/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index b32de6b66058..f5d6597f214d 100644
--- a/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -49,27 +49,27 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
     break;
   case ARMII::MO_LO16:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createLower16(Expr, OutContext);
+    Expr = ARM::createLower16(Expr, OutContext);
     break;
   case ARMII::MO_HI16:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createUpper16(Expr, OutContext);
+    Expr = ARM::createUpper16(Expr, OutContext);
     break;
   case ARMII::MO_LO_0_7:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createLower0_7(Expr, OutContext);
+    Expr = ARM::createLower0_7(Expr, OutContext);
     break;
   case ARMII::MO_LO_8_15:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createLower8_15(Expr, OutContext);
+    Expr = ARM::createLower8_15(Expr, OutContext);
     break;
   case ARMII::MO_HI_0_7:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createUpper0_7(Expr, OutContext);
+    Expr = ARM::createUpper0_7(Expr, OutContext);
     break;
   case ARMII::MO_HI_8_15:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createUpper8_15(Expr, OutContext);
+    Expr = ARM::createUpper8_15(Expr, OutContext);
     break;
   }
 
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 6e9efe40dc54..f3bdcd64805d 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -454,7 +454,7 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool parseMemory(OperandVector &);
   bool parseOperand(OperandVector &, StringRef Mnemonic);
   bool parseImmExpr(int64_t &Out);
-  bool parsePrefix(ARMMCExpr::Specifier &);
+  bool parsePrefix(ARM::Specifier &);
   bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
                               unsigned &ShiftAmount);
   bool parseLiteralValues(unsigned Size, SMLoc L);
@@ -1326,7 +1326,7 @@ public:
     if (isImm() && !isa<MCConstantExpr>(getImm())) {
       // We want to avoid matching :upper16: and :lower16: as we want these
       // expressions to match in isImm0_65535Expr()
-      const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(getImm());
+      auto *ARM16Expr = dyn_cast<MCSpecifierExpr>(getImm());
       return (!ARM16Expr || (ARM16Expr->getSpecifier() != ARM::S_HI16 &&
                              ARM16Expr->getSpecifier() != ARM::S_LO16));
     }
@@ -6424,7 +6424,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
     // ":upper8_15:", expression prefixes
     // FIXME: Check it's an expression prefix,
     // e.g. (FOO - :lower16:BAR) isn't legal.
-    ARMMCExpr::Specifier Spec;
+    ARM::Specifier Spec;
     if (parsePrefix(Spec))
       return true;
 
@@ -6432,7 +6432,8 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
     if (getParser().parseExpression(SubExprVal))
       return true;
 
-    const MCExpr *ExprVal = ARMMCExpr::create(Spec, SubExprVal, getContext());
+    const auto *ExprVal =
+        MCSpecifierExpr::create(SubExprVal, Spec, getContext());
     E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
     Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E, *this));
     return false;
@@ -6471,7 +6472,7 @@ bool ARMAsmParser::parseImmExpr(int64_t &Out) {
 // parsePrefix - Parse ARM 16-bit relocations expression prefixes, i.e.
 // :lower16: and :upper16: and Thumb 8-bit relocation expression prefixes, i.e.
 // :upper8_15:, :upper0_7:, :lower8_15: and :lower0_7:
-bool ARMAsmParser::parsePrefix(ARMMCExpr::Specifier &Spec) {
+bool ARMAsmParser::parsePrefix(ARM::Specifier &Spec) {
   MCAsmParser &Parser = getParser();
   Spec = ARM::S_None;
 
@@ -6495,7 +6496,7 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::Specifier &Spec) {
   };
   static const struct PrefixEntry {
     const char *Spelling;
-    ARMMCExpr::Specifier Spec;
+    ARM::Specifier Spec;
     uint8_t SupportedFormats;
   } PrefixEntries[] = {
       {"upper16", ARM::S_HI16, COFF | ELF | MACHO},
@@ -6879,7 +6880,7 @@ static bool isThumbI8Relocation(MCParsedAsmOperand &MCOp) {
   const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
   if (!E)
     return false;
-  const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
+  auto *ARM16Expr = dyn_cast<MCSpecifierExpr>(E);
   if (ARM16Expr && (ARM16Expr->getSpecifier() == ARM::S_HI_8_15 ||
                     ARM16Expr->getSpecifier() == ARM::S_HI_0_7 ||
                     ARM16Expr->getSpecifier() == ARM::S_LO_8_15 ||
@@ -8286,7 +8287,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     if (CE) break;
     const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
     if (!E) break;
-    const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
+    auto *ARM16Expr = dyn_cast<MCSpecifierExpr>(E);
     if (!ARM16Expr || (ARM16Expr->getSpecifier() != ARM::S_HI16 &&
                        ARM16Expr->getSpecifier() != ARM::S_LO16))
       return Error(
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index f8ec0237dcb5..a3d86f690e4a 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -11,8 +11,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMMCAsmInfo.h"
-#include "MCTargetDesc/ARMMCExpr.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -153,3 +153,62 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
 
   initializeVariantKinds(variantKindDescs);
 }
+
+void ARM::printSpecifierExpr(const MCAsmInfo &MAI, raw_ostream &OS,
+                             const MCSpecifierExpr &Expr) {
+  switch (Expr.getSpecifier()) {
+  default:
+    llvm_unreachable("Invalid kind!");
+  case ARM::S_HI16:
+    OS << ":upper16:";
+    break;
+  case ARM::S_LO16:
+    OS << ":lower16:";
+    break;
+  case ARM::S_HI_8_15:
+    OS << ":upper8_15:";
+    break;
+  case ARM::S_HI_0_7:
+    OS << ":upper0_7:";
+    break;
+  case ARM::S_LO_8_15:
+    OS << ":lower8_15:";
+    break;
+  case ARM::S_LO_0_7:
+    OS << ":lower0_7:";
+    break;
+  }
+
+  const MCExpr *Sub = Expr.getSubExpr();
+  if (Sub->getKind() != MCExpr::SymbolRef)
+    OS << '(';
+  MAI.printExpr(OS, *Sub);
+  if (Sub->getKind() != MCExpr::SymbolRef)
+    OS << ')';
+}
+
+const MCSpecifierExpr *ARM::createUpper16(const MCExpr *Expr, MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_HI16, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createLower16(const MCExpr *Expr, MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_LO16, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createUpper8_15(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_HI_8_15, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createUpper0_7(const MCExpr *Expr, MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_HI_0_7, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createLower8_15(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_LO_8_15, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createLower0_7(const MCExpr *Expr, MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_LO_0_7, Ctx);
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index baadf74e0d5a..f3f075e99d96 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
 #define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
 
-#include "MCTargetDesc/ARMMCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
@@ -22,11 +21,24 @@
 namespace llvm {
 class Triple;
 
+namespace ARM {
+void printSpecifierExpr(const MCAsmInfo &MAI, raw_ostream &OS,
+                        const MCSpecifierExpr &Expr);
+}
+
 class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
   virtual void anchor();
 
 public:
   explicit ARMMCAsmInfoDarwin(const Triple &TheTriple);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override {
+    ARM::printSpecifierExpr(*this, OS, Expr);
+  }
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &,
+                                 const MCAssembler *) const override {
+    return false;
+  }
 };
 
 class ARMELFMCAsmInfo : public MCAsmInfoELF {
@@ -36,6 +48,14 @@ public:
   explicit ARMELFMCAsmInfo(const Triple &TT);
 
   void setUseIntegratedAssembler(bool Value) override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override {
+    ARM::printSpecifierExpr(*this, OS, Expr);
+  }
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &,
+                                 const MCAssembler *) const override {
+    return false;
+  }
 };
 
 class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
@@ -43,6 +63,14 @@ class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
 
 public:
   explicit ARMCOFFMCAsmInfoMicrosoft();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override {
+    ARM::printSpecifierExpr(*this, OS, Expr);
+  }
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &,
+                                 const MCAssembler *) const override {
+    return false;
+  }
 };
 
 class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
@@ -50,9 +78,18 @@ class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
 
 public:
   explicit ARMCOFFMCAsmInfoGNU();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override {
+    ARM::printSpecifierExpr(*this, OS, Expr);
+  }
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &,
+                                 const MCAssembler *) const override {
+    return false;
+  }
 };
 
 namespace ARM {
+using Specifier = uint16_t;
 enum {
   S_None,
   S_HI16 =
@@ -93,6 +130,13 @@ enum {
   S_TLSLDO,
   S_TPOFF,
 };
+
+const MCSpecifierExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createLower16(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createUpper8_15(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createUpper0_7(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createLower8_15(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createLower0_7(const MCExpr *Expr, MCContext &Ctx);
 }
 
 } // namespace llvm
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index f006e00ada32..fba32eae4dfa 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1192,7 +1192,7 @@ uint32_t ARMMCCodeEmitter::getHiLoImmOpValue(const MCInst &MI, unsigned OpIdx,
   const MCExpr *E = MO.getExpr();
   MCFixupKind Kind;
   if (E->getKind() == MCExpr::Specifier) {
-    const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
+    auto *ARM16Expr = cast<MCSpecifierExpr>(E);
     E = ARM16Expr->getSubExpr();
 
     if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(E)) {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
deleted file mode 100644
index 1e6760a57608..000000000000
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARMMCExpr.h"
-#include "ARMMCAsmInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "armmcexpr"
-
-const ARMMCExpr *ARMMCExpr::create(Specifier S, const MCExpr *Expr,
-                                   MCContext &Ctx) {
-  return new (Ctx) ARMMCExpr(S, Expr);
-}
-
-void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  switch (specifier) {
-  default: llvm_unreachable("Invalid kind!");
-  case ARM::S_HI16:
-    OS << ":upper16:";
-    break;
-  case ARM::S_LO16:
-    OS << ":lower16:";
-    break;
-  case ARM::S_HI_8_15:
-    OS << ":upper8_15:";
-    break;
-  case ARM::S_HI_0_7:
-    OS << ":upper0_7:";
-    break;
-  case ARM::S_LO_8_15:
-    OS << ":lower8_15:";
-    break;
-  case ARM::S_LO_0_7:
-    OS << ":lower0_7:";
-    break;
-  }
-
-  const MCExpr *Expr = getSubExpr();
-  if (Expr->getKind() != MCExpr::SymbolRef)
-    OS << '(';
-  MAI->printExpr(OS, *Expr);
-  if (Expr->getKind() != MCExpr::SymbolRef)
-    OS << ')';
-}
-
-const ARMMCExpr *ARMMCExpr::createUpper16(const MCExpr *Expr, MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_HI16, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createLower16(const MCExpr *Expr, MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_LO16, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createUpper8_15(const MCExpr *Expr,
-                                            MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_HI_8_15, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createUpper0_7(const MCExpr *Expr, MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_HI_0_7, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createLower8_15(const MCExpr *Expr,
-                                            MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_LO_8_15, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createLower0_7(const MCExpr *Expr, MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_LO_0_7, Expr, Ctx);
-}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
deleted file mode 100644
index f29d05ba2a88..000000000000
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//===-- ARMMCExpr.h - ARM specific MC expression classes --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
-#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-
-namespace llvm {
-
-class ARMMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint16_t;
-
-private:
-  explicit ARMMCExpr(Specifier S, const MCExpr *Expr)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const ARMMCExpr *create(Specifier S, const MCExpr *Expr,
-                                 MCContext &Ctx);
-
-  static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createUpper8_15(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createUpper0_7(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createLower8_15(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createLower0_7(const MCExpr *Expr, MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override {
-    return false;
-  }
-};
-} // end namespace llvm
-
-#endif
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index 886b7e7bc84e..72d9379f5038 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ARMMCExpr.h"
+#include "ARMMCAsmInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm-c/Disassembler.h"
 #include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
@@ -24,9 +24,9 @@ public:
                                              unsigned VariantKind) override {
     switch(VariantKind) {
     case LLVMDisassembler_VariantKind_ARM_HI16:
-      return ARMMCExpr::createUpper16(SubExpr, Ctx);
+      return ARM::createUpper16(SubExpr, Ctx);
     case LLVMDisassembler_VariantKind_ARM_LO16:
-      return ARMMCExpr::createLower16(SubExpr, Ctx);
+      return ARM::createLower16(SubExpr, Ctx);
     default:
       return MCRelocationInfo::createExprForCAPIVariantKind(SubExpr,
                                                             VariantKind);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
index 8b3ef0ee651e..977f8bf5548f 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
@@ -7,7 +7,6 @@ add_llvm_component_library(LLVMARMDesc
   ARMMachORelocationInfo.cpp
   ARMMCAsmInfo.cpp
   ARMMCCodeEmitter.cpp
-  ARMMCExpr.cpp
   ARMMCTargetDesc.cpp
   ARMTargetStreamer.cpp
   ARMUnwindOpAsm.cpp
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn
index 981639faf71d..698607f3a226 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn
@@ -67,7 +67,6 @@ static_library("MCTargetDesc") {
     "ARMInstPrinter.cpp",
     "ARMMCAsmInfo.cpp",
     "ARMMCCodeEmitter.cpp",
-    "ARMMCExpr.cpp",
     "ARMMCTargetDesc.cpp",
     "ARMMachORelocationInfo.cpp",
     "ARMMachObjectWriter.cpp",

From ee2d7a6975f37c11bffbf3207879696aca7fcc65 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 00:35:18 -0700
Subject: [PATCH 0470/1322] MIPS: Remove unneeded printImpl

Follow-up to 05a9ad977624c4f6def7c0f4cf7103e28d6c6541
---
 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 7 -------
 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h   | 2 --
 2 files changed, 9 deletions(-)

diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 280d944f2fbb..821f662f0cbf 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -37,10 +37,3 @@ const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S,
   return create(S, create(Mips::S_NEG, create(Mips::S_GPREL, Expr, Ctx), Ctx),
                 Ctx);
 }
-
-void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  if (MAI)
-    MAI->printExpr(OS, *this);
-  else // llc -asm-show-inst
-    MipsELFMCAsmInfo(Triple(), MCTargetOptions()).printExpr(OS, *this);
-}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 91ec09482185..b78aeabb5799 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -29,8 +29,6 @@ public:
                                   MCContext &Ctx);
   static const MipsMCExpr *createGpOff(Specifier S, const MCExpr *Expr,
                                        MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
 };
 
 } // end namespace llvm

From 4e0dd007ac6a7b7e0a284062b61c6d22250337df Mon Sep 17 00:00:00 2001
From: mayanksolanki393 <mayanksolanki393@gmail.com>
Date: Mon, 16 Jun 2025 13:16:52 +0530
Subject: [PATCH 0471/1322] [InstCombine] Combine trunc (lshr X, BW-1) to i1
 --> icmp slt X, 0 (#142593) (#143846)

Fixes #142593, the issue was fixed using the suggestion on the ticket
itself.

Godbolt: https://godbolt.org/z/oW5b74jc4
alive2 proof: https://alive2.llvm.org/ce/z/QHnD7e
---
 .../InstCombine/InstCombineCasts.cpp          |  6 ++
 .../Transforms/InstCombine/logical-select.ll  |  4 +-
 .../test/Transforms/InstCombine/trunc-lshr.ll | 95 +++++++++++++++++++
 3 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/trunc-lshr.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index d4a2fe5e37ef..033ef8be700e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -815,6 +815,12 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
       return new ICmpInst(ICmpInst::ICMP_EQ, X, CmpC);
     }
 
+    if (match(Src, m_Shr(m_Value(X), m_SpecificInt(SrcWidth - 1)))) {
+      // trunc (ashr X, BW-1) to i1 --> icmp slt X, 0
+      // trunc (lshr X, BW-1) to i1 --> icmp slt X, 0
+      return new ICmpInst(ICmpInst::ICMP_SLT, X, Zero);
+    }
+
     Constant *C;
     if (match(Src, m_OneUse(m_LShr(m_Value(X), m_ImmConstant(C))))) {
       // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
diff --git a/llvm/test/Transforms/InstCombine/logical-select.ll b/llvm/test/Transforms/InstCombine/logical-select.ll
index 050a53406a9c..87e05002665c 100644
--- a/llvm/test/Transforms/InstCombine/logical-select.ll
+++ b/llvm/test/Transforms/InstCombine/logical-select.ll
@@ -807,9 +807,9 @@ define <2 x i16> @bitcast_vec_cond_commute3(<4 x i8> %cond, <2 x i16> %pc, <2 x
 ; CHECK-LABEL: @bitcast_vec_cond_commute3(
 ; CHECK-NEXT:    [[C:%.*]] = mul <2 x i16> [[PC:%.*]], [[PC]]
 ; CHECK-NEXT:    [[D:%.*]] = mul <2 x i16> [[PD:%.*]], [[PD]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = icmp slt <4 x i8> [[COND:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[D]] to <4 x i8>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[C]] to <4 x i8>
-; CHECK-NEXT:    [[DOTNOT2:%.*]] = icmp slt <4 x i8> [[COND:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[DOTNOT2]], <4 x i8> [[TMP1]], <4 x i8> [[TMP2]]
 ; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i8> [[TMP3]] to <2 x i16>
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
@@ -1069,8 +1069,8 @@ define <2 x i1> @not_d_bools_vector_poison(<2 x i1> %c, <2 x i1> %x, <2 x i1> %y
 
 define i32 @not_d_allSignBits(i32 %cond, i32 %tval, i32 %fval) {
 ; CHECK-LABEL: @not_d_allSignBits(
-; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[FVAL:%.*]], -1
 ; CHECK-NEXT:    [[DOTNOT2:%.*]] = icmp slt i32 [[COND:%.*]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[FVAL:%.*]], -1
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[DOTNOT2]], i32 [[TVAL:%.*]], i32 [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/trunc-lshr.ll b/llvm/test/Transforms/InstCombine/trunc-lshr.ll
new file mode 100644
index 000000000000..4364b09cfa70
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/trunc-lshr.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define i1 @test1(i32 %i, ptr %p) {
+; CHECK-LABEL: define i1 @test1(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = lshr i32 [[I]], 31
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    ret i1 false
+;
+  %lobit = lshr i32 %i, 31
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  ret i1 %op
+}
+
+define i1 @test2(i32 %i, ptr %p) {
+; CHECK-LABEL: define i1 @test2(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = ashr i32 [[I]], 31
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    ret i1 false
+;
+  %lobit = ashr i32 %i, 31
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  ret i1 %op
+}
+
+define i1 @test3(i32 %i, ptr %p, ptr %q) {
+; CHECK-LABEL: define i1 @test3(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = lshr i32 [[I]], 31
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[Q]], align 1
+; CHECK-NEXT:    ret i1 false
+;
+  %lobit = lshr i32 %i, 31
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  store i32 %lobit, ptr %q, align 1
+  ret i1 %op
+}
+
+; Negative Test
+define i1 @test4(i32 %i, ptr %p) {
+; CHECK-LABEL: define i1 @test4(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = lshr i32 [[I]], 30
+; CHECK-NEXT:    [[T:%.*]] = trunc nuw i32 [[DOTLOBIT]] to i1
+; CHECK-NEXT:    [[B:%.*]] = icmp slt i32 [[I]], 0
+; CHECK-NEXT:    [[NOT_:%.*]] = xor i1 [[T]], true
+; CHECK-NEXT:    [[COMMON_RET1_OP:%.*]] = select i1 [[NOT_]], i1 [[B]], i1 false
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    ret i1 [[COMMON_RET1_OP]]
+;
+  %lobit = lshr i32 %i, 30 ; should not fold as no. of bits shifted < BitWidth - 1
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  ret i1 %op
+}
+
+; Negative Test
+define i1 @test5(i32 %i, ptr %p) {
+; CHECK-LABEL: define i1 @test5(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = ashr i32 [[I]], 30
+; CHECK-NEXT:    [[T:%.*]] = trunc nuw i32 [[DOTLOBIT]] to i1
+; CHECK-NEXT:    [[B:%.*]] = icmp slt i32 [[I]], 0
+; CHECK-NEXT:    [[NOT_:%.*]] = xor i1 [[T]], true
+; CHECK-NEXT:    [[COMMON_RET1_OP:%.*]] = select i1 [[NOT_]], i1 [[B]], i1 false
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    ret i1 [[COMMON_RET1_OP]]
+;
+  %lobit = ashr i32 %i, 30 ; should not fold as no. of bits shifted < BitWidth - 1
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  ret i1 %op
+}
+

From e61405033bbaec3604c79a0b323a3e21efc720bc Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>
Date: Mon, 16 Jun 2025 09:55:22 +0200
Subject: [PATCH 0472/1322] [clang] Fix -fclang-abi-compat for clang 20
 (#144109)

The value was known already, but it was parsed as latest which is
incorrect because we are already doing clang 21.
---
 clang/lib/Frontend/CompilerInvocation.cpp | 2 ++
 clang/test/CodeGen/X86/avx-cxx-record.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index dd021ad2e441..5c52dc33ddf6 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -4475,6 +4475,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
         Opts.setClangABICompat(LangOptions::ClangABI::Ver18);
       else if (Major <= 19)
         Opts.setClangABICompat(LangOptions::ClangABI::Ver19);
+      else if (Major <= 20)
+        Opts.setClangABICompat(LangOptions::ClangABI::Ver20);
     } else if (Ver != "latest") {
       Diags.Report(diag::err_drv_invalid_value)
           << A->getAsString(Args) << A->getValue();
diff --git a/clang/test/CodeGen/X86/avx-cxx-record.cpp b/clang/test/CodeGen/X86/avx-cxx-record.cpp
index bcd9c361fda9..6ce6815a521a 100644
--- a/clang/test/CodeGen/X86/avx-cxx-record.cpp
+++ b/clang/test/CodeGen/X86/avx-cxx-record.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -O2 -target-cpu x86-64-v3 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -O2 -target-cpu x86-64-v3 -fclang-abi-compat=20 -o - | FileCheck --check-prefix CLANG-20 %s
 
 using UInt64x2 = unsigned long long __attribute__((__vector_size__(16), may_alias));
 
@@ -11,6 +12,7 @@ struct XMM2 : XMM1<0>, XMM1<1> {
 };
 
 // CHECK: define{{.*}} @_Z3foov({{.*}} [[ARG:%.*]]){{.*}}
+// CLANG-20: define{{.*}} <4 x double> @_Z3foov()
 // CHECK: entry:
 // CHECK-NEXT: store {{.*}}, ptr [[ARG]]{{.*}}
 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr {{.*}}, ptr [[ARG]]{{.*}}

From fbade95ebf2bc959fada5206e47f792a2090d72e Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Mon, 16 Jun 2025 08:55:46 +0100
Subject: [PATCH 0473/1322] [LV] Strip unnecessary make_{pair,optional} (NFC)
 (#141924)

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 62 ++++++++-----------
 1 file changed, 27 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index eb04e2d5ca7b..34f49a7721a3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1080,7 +1080,7 @@ public:
   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                            InstructionCost Cost) {
     assert(VF.isVector() && "Expected VF >=2");
-    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+    WideningDecisions[{I, VF}] = {W, Cost};
   }
 
   /// Save vectorization decision \p W and \p Cost taken by the cost model for
@@ -1102,11 +1102,9 @@ public:
     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
       if (auto *I = Grp->getMember(Idx)) {
         if (Grp->getInsertPos() == I)
-          WideningDecisions[std::make_pair(I, VF)] =
-              std::make_pair(W, InsertPosCost);
+          WideningDecisions[{I, VF}] = {W, InsertPosCost};
         else
-          WideningDecisions[std::make_pair(I, VF)] =
-              std::make_pair(W, OtherMemberCost);
+          WideningDecisions[{I, VF}] = {W, OtherMemberCost};
       }
     }
   }
@@ -1120,7 +1118,7 @@ public:
         TheLoop->isInnermost() &&
         "cost-model should not be used for outer loops (in VPlan-native path)");
 
-    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
+    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
     auto Itr = WideningDecisions.find(InstOnVF);
     if (Itr == WideningDecisions.end())
       return CM_Unknown;
@@ -1131,7 +1129,7 @@ public:
   /// width \p VF.
   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
     assert(VF.isVector() && "Expected VF >=2");
-    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
+    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
     assert(WideningDecisions.contains(InstOnVF) &&
            "The cost is not calculated");
     return WideningDecisions[InstOnVF].second;
@@ -1150,8 +1148,7 @@ public:
                                std::optional<unsigned> MaskPos,
                                InstructionCost Cost) {
     assert(!VF.isScalar() && "Expected vector VF");
-    CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
-                                                     MaskPos, Cost};
+    CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
   }
 
   CallWideningDecision getCallWideningDecision(CallInst *CI,
@@ -1348,21 +1345,20 @@ public:
   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
     if (!Legal->canFoldTailByMasking()) {
-      ChosenTailFoldingStyle =
-          std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+      ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
       return;
     }
 
     if (!ForceTailFoldingStyle.getNumOccurrences()) {
-      ChosenTailFoldingStyle = std::make_pair(
+      ChosenTailFoldingStyle = {
           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
-          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
+          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
       return;
     }
 
     // Set styles when forced.
-    ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
-                                            ForceTailFoldingStyle.getValue());
+    ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
+                              ForceTailFoldingStyle.getValue()};
     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
       return;
     // Override forced styles if needed.
@@ -1375,9 +1371,8 @@ public:
       // If for some reason EVL mode is unsupported, fallback to
       // DataWithoutLaneMask to try to vectorize the loop with folded tail
       // in a generic way.
-      ChosenTailFoldingStyle =
-          std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
-                         TailFoldingStyle::DataWithoutLaneMask);
+      ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
+                                TailFoldingStyle::DataWithoutLaneMask};
       LLVM_DEBUG(
           dbgs()
           << "LV: Preference for VP intrinsics indicated. Will "
@@ -8138,7 +8133,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
     PartialReductionChain Chain = Pair.first;
     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
-      ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
+      ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
   }
 }
 
@@ -8210,12 +8205,11 @@ bool VPRecipeBuilder::getScaledReductions(
           [&](ElementCount VF) {
             InstructionCost Cost = TTI->getPartialReductionCost(
                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
-                VF, OpAExtend, OpBExtend,
-                std::make_optional(BinOp->getOpcode()));
+                VF, OpAExtend, OpBExtend, BinOp->getOpcode());
             return Cost.isValid();
           },
           Range)) {
-    Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
+    Chains.emplace_back(Chain, TargetScaleFactor);
     return true;
   }
 
@@ -10108,9 +10102,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   bool VectorizeLoop = true, InterleaveLoop = true;
   if (VF.Width.isScalar()) {
     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-    VecDiagMsg = std::make_pair(
+    VecDiagMsg = {
         "VectorizationNotBeneficial",
-        "the cost-model indicates that vectorization is not beneficial");
+        "the cost-model indicates that vectorization is not beneficial"};
     VectorizeLoop = false;
   }
 
@@ -10119,16 +10113,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // requested.
     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                          "interleaving should be avoided up front\n");
-    IntDiagMsg = std::make_pair(
-        "InterleavingAvoided",
-        "Ignoring UserIC, because interleaving was avoided up front");
+    IntDiagMsg = {"InterleavingAvoided",
+                  "Ignoring UserIC, because interleaving was avoided up front"};
     InterleaveLoop = false;
   } else if (IC == 1 && UserIC <= 1) {
     // Tell the user interleaving is not beneficial.
     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
-    IntDiagMsg = std::make_pair(
+    IntDiagMsg = {
         "InterleavingNotBeneficial",
-        "the cost-model indicates that interleaving is not beneficial");
+        "the cost-model indicates that interleaving is not beneficial"};
     InterleaveLoop = false;
     if (UserIC == 1) {
       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
@@ -10139,10 +10132,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // Tell the user interleaving is beneficial, but it explicitly disabled.
     LLVM_DEBUG(
         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
-    IntDiagMsg = std::make_pair(
-        "InterleavingBeneficialButDisabled",
-        "the cost-model indicates that interleaving is beneficial "
-        "but is explicitly disabled or interleave count is set to 1");
+    IntDiagMsg = {"InterleavingBeneficialButDisabled",
+                  "the cost-model indicates that interleaving is beneficial "
+                  "but is explicitly disabled or interleave count is set to 1"};
     InterleaveLoop = false;
   }
 
@@ -10152,10 +10144,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
                       << "to histogram operations.\n");
-    IntDiagMsg = std::make_pair(
+    IntDiagMsg = {
         "HistogramPreventsScalarInterleaving",
         "Unable to interleave without vectorization due to constraints on "
-        "the order of histogram operations");
+        "the order of histogram operations"};
     InterleaveLoop = false;
   }
 

From cca454b54c7d58930e261c7fa72f44a1a8976997 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj@nvidia.com>
Date: Mon, 16 Jun 2025 09:12:42 +0100
Subject: [PATCH 0474/1322] [ValueTracking] Remove opcode whitelist from
 matchSimpleRecurrence. (#144031)

This also patches HashRecognize to avoid it mishandling some opcodes.
---
 llvm/lib/Analysis/HashRecognize.cpp           | 11 ++++--
 llvm/lib/Analysis/ValueTracking.cpp           | 38 +++++--------------
 .../HashRecognize/cyclic-redundancy-check.ll  | 30 ++++++++++++++-
 3 files changed, 46 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp
index b245548dea6d..1edb8b3bdc9a 100644
--- a/llvm/lib/Analysis/HashRecognize.cpp
+++ b/llvm/lib/Analysis/HashRecognize.cpp
@@ -542,7 +542,11 @@ static bool arePHIsIntertwined(
 // doing this, we're immune to whether the IR expression is mul/udiv or
 // equivalently shl/lshr. Return false when it is a UDiv, true when it is a Mul,
 // and std::nullopt otherwise.
-static std::optional<bool> isBigEndianBitShift(const SCEV *E) {
+static std::optional<bool> isBigEndianBitShift(Value *V, ScalarEvolution &SE) {
+  if (!V->getType()->isIntegerTy())
+    return {};
+
+  const SCEV *E = SE.getSCEV(V);
   if (match(E, m_scev_UDiv(m_SCEV(), m_scev_SpecificInt(2))))
     return false;
   if (match(E, m_scev_Mul(m_scev_SpecificInt(2), m_SCEV())))
@@ -576,12 +580,11 @@ HashRecognize::recognizeCRC() const {
   // Make sure that all recurrences are either all SCEVMul with two or SCEVDiv
   // with two, or in other words, that they're single bit-shifts.
   std::optional<bool> ByteOrderSwapped =
-      isBigEndianBitShift(SE.getSCEV(ConditionalRecurrence.BO));
+      isBigEndianBitShift(ConditionalRecurrence.BO, SE);
   if (!ByteOrderSwapped)
     return "Loop with non-unit bitshifts";
   if (SimpleRecurrence) {
-    if (isBigEndianBitShift(SE.getSCEV(SimpleRecurrence.BO)) !=
-        ByteOrderSwapped)
+    if (isBigEndianBitShift(SimpleRecurrence.BO, SE) != ByteOrderSwapped)
       return "Loop with non-unit bitshifts";
     if (!arePHIsIntertwined(SimpleRecurrence.Phi, ConditionalRecurrence.Phi, L,
                             Instruction::BinaryOps::Xor))
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index e7a1f07c0270..d39efb285974 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9071,6 +9071,7 @@ bool llvm::matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO,
   // Handle the case of a simple two-predecessor recurrence PHI.
   // There's a lot more that could theoretically be done here, but
   // this is sufficient to catch some interesting cases.
+  // TODO: Expand list -- gep, uadd.sat etc.
   if (P->getNumIncomingValues() != 2)
     return false;
 
@@ -9081,35 +9082,16 @@ bool llvm::matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO,
     if (!LU)
       continue;
     unsigned Opcode = LU->getOpcode();
+    Value *LL = LU->getOperand(0);
+    Value *LR = LU->getOperand(1);
 
-    switch (Opcode) {
-    default:
-      continue;
-    // TODO: Expand list -- xor, gep, uadd.sat etc.
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::Shl:
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::UDiv:
-    case Instruction::URem:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Mul:
-    case Instruction::FMul: {
-      Value *LL = LU->getOperand(0);
-      Value *LR = LU->getOperand(1);
-      // Find a recurrence.
-      if (LL == P)
-        L = LR;
-      else if (LR == P)
-        L = LL;
-      else
-        continue; // Check for recurrence with L and R flipped.
-
-      break; // Match!
-    }
-    };
+    // Find a recurrence.
+    if (LL == P)
+      L = LR;
+    else if (LR == P)
+      L = LL;
+    else
+      continue; // Check for recurrence with L and R flipped.
 
     // We have matched a recurrence of the form:
     //   %iv = [R, %entry], [%iv.next, %backedge]
diff --git a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
index 3e05a9b5c849..7a3082056ad2 100644
--- a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
+++ b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
@@ -873,7 +873,7 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.float.simple.recurrence(float %msg, i16 %checksum) {
 ; CHECK-LABEL: 'not.crc.float.simple.recurrence'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Found stray PHI
+; CHECK-NEXT:  Reason: Loop with non-unit bitshifts
 ;
 entry:
   br label %loop
@@ -897,3 +897,31 @@ loop:                                              ; preds = %loop, %entry
 exit:                                              ; preds = %loop
   ret i16 %crc.next
 }
+
+define i16 @not.crc.stray.phi(i8 %msg, i16 %checksum, i1 %c) {
+; CHECK-LABEL: 'not.crc.stray.phi'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Found stray PHI
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %checksum, %entry ], [ %crc.next, %loop ]
+  %data = phi i8 [ %msg, %entry ], [ %data.next, %loop ]
+  %crc.trunc = trunc i16 %crc to i8
+  %xor.data.crc = xor i8 %data, %crc.trunc
+  %and.data.crc = and i8 %xor.data.crc, 1
+  %data.next = select i1 %c, i8 %data, i8 1
+  %check.sb = icmp eq i8 %and.data.crc, 0
+  %crc.lshr = lshr i16 %crc, 1
+  %xor = xor i16 %crc.lshr, -24575
+  %crc.next = select i1 %check.sb, i16 %crc.lshr, i16 %xor
+  %iv.next = add nuw nsw i8 %iv, 1
+  %exit.cond = icmp samesign ult i8 %iv, 7
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}

From 0952992ac6e1470d9f776a99c5793745a6b58d98 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mpeis@arm.com>
Date: Mon, 16 Jun 2025 09:42:59 +0100
Subject: [PATCH 0475/1322] [BOLT] Fix LLVM_APPEND_VC_REV support (#142410)

The CMake flag LLVM_APPEND_VC_REV can be passed when building BOLT a
BOLT to prevent including a VC Revision. This patch enables this
functionality.

Usage: `-DLLVM_APPEND_VC_REV=OFF` when running CMake.
---
 bolt/lib/Utils/CMakeLists.txt | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/bolt/lib/Utils/CMakeLists.txt b/bolt/lib/Utils/CMakeLists.txt
index efba6d54449d..94933644ef5e 100644
--- a/bolt/lib/Utils/CMakeLists.txt
+++ b/bolt/lib/Utils/CMakeLists.txt
@@ -6,12 +6,25 @@ set(version_inc "${CMAKE_CURRENT_BINARY_DIR}/VCSVersion.inc")
 
 set(generate_vcs_version_script "${LLVM_CMAKE_DIR}/GenerateVersionFromVCS.cmake")
 
+if(llvm_vc AND LLVM_APPEND_VC_REV)
+  set(llvm_source_dir ${LLVM_MAIN_SRC_DIR})
+endif()
+if(LLVM_VC_REPOSITORY AND LLVM_VC_REVISION)
+  set(llvm_source_dir ${LLVM_SOURCE_DIR})
+  set(llvm_vc_repository ${LLVM_VC_REPOSITORY})
+  set(llvm_vc_revision ${LLVM_VC_REVISION})
+endif()
+if(bolt_vc AND LLVM_APPEND_VC_REV)
+  set(bolt_source_dir ${BOLT_SOURCE_DIR})
+endif()
+
 # Create custom target to generate the VC revision include.
 add_custom_command(OUTPUT "${version_inc}"
   DEPENDS "${llvm_vc}" "${bolt_vc}" "${generate_vcs_version_script}"
   COMMAND ${CMAKE_COMMAND} "-DNAMES=BOLT"
+                           "-DLLVM_SOURCE_DIR=${llvm_source_dir}"
+                           "-DBOLT_SOURCE_DIR=${bolt_source_dir}"
                            "-DHEADER_FILE=${version_inc}"
-                           "-DBOLT_SOURCE_DIR=${BOLT_SOURCE_DIR}"
                            "-DLLVM_VC_REPOSITORY=${llvm_vc_repository}"
                            "-DLLVM_VC_REVISION=${llvm_vc_revision}"
                            "-DLLVM_FORCE_VC_REVISION=${LLVM_FORCE_VC_REVISION}"

From 383b3268794da1ca763deb91cec777742e6e54a8 Mon Sep 17 00:00:00 2001
From: Javier Lopez-Gomez <javier.lopez.gomez@proton.me>
Date: Mon, 16 Jun 2025 10:47:00 +0200
Subject: [PATCH 0476/1322] [llvm-debuginfo-analyzer] Fix ODR violation in
 llvm::logicalview::LVObject (#140265)

Some data members are only part of a class definition in a Debug build,
e.g. `LVObject::ID`. If `debuginfologicalview` is used as a library,
`NDEBUG` cannot be used for this purpose, as this PP macro may have a
different definition in a downstream project, which in turn triggers an
ODR violation. Fix it by
- Making `LVObject::ID` an unconditional data member.
- Making `LVObject::dump()` non-virtual. Rationale: `virtual` is not
needed (and it calls `print()`, which is virtual anyway).

Fixes #139098.
---
 .../CommandGuide/llvm-debuginfo-analyzer.rst  |  3 +-
 .../llvm/DebugInfo/LogicalView/Core/LVLine.h  |  4 --
 .../DebugInfo/LogicalView/Core/LVLocation.h   |  6 +--
 .../DebugInfo/LogicalView/Core/LVObject.h     | 39 ++++++-------------
 .../llvm/DebugInfo/LogicalView/Core/LVRange.h |  4 --
 .../llvm/DebugInfo/LogicalView/Core/LVScope.h |  4 --
 .../DebugInfo/LogicalView/Core/LVSymbol.h     |  4 --
 .../llvm/DebugInfo/LogicalView/Core/LVType.h  |  4 --
 .../DebugInfo/LogicalView/Core/LVObject.cpp   |  6 +--
 .../DebugInfo/LogicalView/Core/LVOptions.cpp  |  2 -
 10 files changed, 15 insertions(+), 61 deletions(-)

diff --git a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
index 453af0751e2a..1264f8020661 100644
--- a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
+++ b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
@@ -676,8 +676,7 @@ INTERNAL
  Typically these kind of options are available only in *debug* builds.
 
  :program:`llvm-debuginfo-analyzer` supports these advanced options in
- both *release* and *debug* builds, with the exception of the unique ID
- that is generated only in *debug* builds.
+ both *release* and *debug* builds.
 
 .. option:: --internal=<value[,value,...]>
 
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLine.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLine.h
index c979dc4a6be2..3618ce7b0ecd 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLine.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLine.h
@@ -105,10 +105,6 @@ public:
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 // Class to represent a DWARF line record object.
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLocation.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLocation.h
index 7b466ae206e4..0718e33f5645 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLocation.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLocation.h
@@ -51,7 +51,7 @@ public:
   LLVM_ABI void print(raw_ostream &OS, bool Full = true) const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() { print(dbgs()); }
+  void dump() const { print(dbgs()); }
 #endif
 };
 
@@ -159,10 +159,6 @@ public:
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 class LLVM_ABI LVLocationSymbol final : public LVLocation {
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h
index ec02120e69b7..be64cdaea3d7 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h
@@ -36,7 +36,7 @@ namespace logicalview {
 using LVSectionIndex = uint64_t;
 using LVAddress = uint64_t;
 using LVHalf = uint16_t;
-using LVLevel = uint32_t;
+using LVLevel = uint16_t;
 using LVOffset = uint64_t;
 using LVSigned = int64_t;
 using LVUnsigned = uint64_t;
@@ -129,8 +129,6 @@ class LLVM_ABI LVObject {
     HasCodeViewLocation, // CodeView object with debug location.
     LastEntry
   };
-  // Typed bitvector with properties for this object.
-  LVProperties<Property> Properties;
 
   LVOffset Offset = 0;
   uint32_t LineNumber = 0;
@@ -140,6 +138,14 @@ class LLVM_ABI LVObject {
     dwarf::Attribute Attr;
     LVSmall Opcode;
   } TagAttrOpcode = {dwarf::DW_TAG_null};
+  // Typed bitvector with properties for this object.
+  LVProperties<Property> Properties;
+
+  // This is an internal ID used for debugging logical elements. It is used
+  // for cases where an unique offset within the binary input file is not
+  // available.
+  static uint32_t GID;
+  uint32_t ID = 0;
 
   // The parent of this object (nullptr if the root scope). For locations,
   // the parent is a symbol object; otherwise it is a scope object.
@@ -155,9 +161,7 @@ class LLVM_ABI LVObject {
   // copy constructor to create that object; it is used to print a reference
   // to another object and in the case of templates, to print its encoded args.
   LVObject(const LVObject &Object) {
-#ifndef NDEBUG
     incID();
-#endif
     Properties = Object.Properties;
     Offset = Object.Offset;
     LineNumber = Object.LineNumber;
@@ -166,18 +170,10 @@ class LLVM_ABI LVObject {
     Parent = Object.Parent;
   }
 
-#ifndef NDEBUG
-  // This is an internal ID used for debugging logical elements. It is used
-  // for cases where an unique offset within the binary input file is not
-  // available.
-  static uint64_t GID;
-  uint64_t ID = 0;
-
   void incID() {
     ++GID;
     ID = GID;
   }
-#endif
 
 protected:
   // Get a string representation for the given number and discriminator.
@@ -193,11 +189,7 @@ protected:
   virtual void printFileIndex(raw_ostream &OS, bool Full = true) const {}
 
 public:
-  LVObject() {
-#ifndef NDEBUG
-    incID();
-#endif
-  };
+  LVObject() { incID(); };
   LVObject &operator=(const LVObject &) = delete;
   virtual ~LVObject() = default;
 
@@ -313,17 +305,10 @@ public:
   virtual void printExtra(raw_ostream &OS, bool Full = true) const {}
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  virtual void dump() const { print(dbgs()); }
+  void dump() const { print(dbgs()); }
 #endif
 
-  uint64_t getID() const {
-    return
-#ifndef NDEBUG
-        ID;
-#else
-        0;
-#endif
-  }
+  uint32_t getID() const { return ID; }
 };
 
 } // end namespace logicalview
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVRange.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVRange.h
index 07d5813e5b19..b5c833330e59 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVRange.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVRange.h
@@ -87,10 +87,6 @@ public:
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 } // end namespace logicalview
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h
index 0f536b5c16b9..5715a37185b2 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h
@@ -325,10 +325,6 @@ public:
   void printExtra(raw_ostream &OS, bool Full = true) const override;
   virtual void printWarnings(raw_ostream &OS, bool Full = true) const {}
   virtual void printMatchedElements(raw_ostream &OS, bool UseMatchedElements) {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 // Class to represent a DWARF Union/Structure/Class.
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSymbol.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSymbol.h
index 93ca2a73d64d..ec9017e16b65 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSymbol.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSymbol.h
@@ -183,10 +183,6 @@ public:
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 } // end namespace logicalview
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVType.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVType.h
index cbce9cb65c92..59e6a92be8ce 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVType.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVType.h
@@ -146,10 +146,6 @@ public:
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 // Class to represent DW_TAG_typedef_type.
diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVObject.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVObject.cpp
index 75acbf3225e0..5ccbcbfa4f0a 100644
--- a/llvm/lib/DebugInfo/LogicalView/Core/LVObject.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Core/LVObject.cpp
@@ -21,9 +21,7 @@ using namespace llvm::logicalview;
 
 #define DEBUG_TYPE "Object"
 
-#ifndef NDEBUG
-uint64_t LVObject::GID = 0;
-#endif
+uint32_t LVObject::GID = 0;
 
 StringRef llvm::logicalview::typeNone() { return StringRef(); }
 StringRef llvm::logicalview::typeVoid() { return "void"; }
@@ -137,10 +135,8 @@ void LVObject::printAttributes(raw_ostream &OS, bool Full, StringRef Name,
 }
 
 void LVObject::printAttributes(raw_ostream &OS, bool Full) const {
-#ifndef NDEBUG
   if (options().getInternalID())
     OS << hexSquareString(getID());
-#endif
   if (options().getCompareExecute() &&
       (options().getAttributeAdded() || options().getAttributeMissing()))
     OS << (getIsAdded() ? '+' : getIsMissing() ? '-' : ' ');
diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
index 467bb98670b4..af35e58ac0dd 100644
--- a/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
@@ -259,12 +259,10 @@ void LVOptions::resolveDependencies() {
 }
 
 void LVOptions::calculateIndentationSize() {
-#ifndef NDEBUG
   if (getInternalID()) {
     std::string String = hexSquareString(0);
     IndentationSize += String.length();
   }
-#endif
   if (getCompareExecute() && (getAttributeAdded() || getAttributeMissing()))
     ++IndentationSize;
   if (getAttributeOffset()) {

From f12dd8f86a2911f69349807359d3bc792e6b773d Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj@nvidia.com>
Date: Mon, 16 Jun 2025 09:57:21 +0100
Subject: [PATCH 0477/1322] [ValueTracking] Remove unused variable in
 matchSimpleRecurrence (NFC). (#144316)

---
 llvm/lib/Analysis/ValueTracking.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index d39efb285974..9df667926faf 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9081,7 +9081,6 @@ bool llvm::matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO,
     auto *LU = dyn_cast<BinaryOperator>(L);
     if (!LU)
       continue;
-    unsigned Opcode = LU->getOpcode();
     Value *LL = LU->getOperand(0);
     Value *LR = LU->getOperand(1);
 

From 3dd61c1876446fb9db7c87b89006ad6d81f72f0d Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs@arm.com>
Date: Mon, 16 Jun 2025 09:58:03 +0100
Subject: [PATCH 0478/1322] [LV] Fix MVE regression from #132190 (#141736)

Register pressure was only considered if the vector bandwidth was being
maximised (chosen either by the target or user options), but #132190
inadvertently caused high pressure VFs to be pruned even when max
bandwidth wasn't enabled. This PR returns to the previous behaviour.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  51 +++-
 .../ARM/mve-reg-pressure-vmla.ll              | 136 +++++++++
 .../RISCV/interleaved-masked-access.ll        | 284 +++++++++---------
 .../LoopVectorize/RISCV/reg-usage-bf16.ll     |   3 +-
 .../LoopVectorize/RISCV/reg-usage-f16.ll      |   6 +-
 .../LoopVectorize/RISCV/reg-usage.ll          |  40 +--
 6 files changed, 338 insertions(+), 182 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 34f49a7721a3..bdbfecd96244 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -953,6 +953,14 @@ public:
     return expectedCost(UserVF).isValid();
   }
 
+  /// \return True if maximizing vector bandwidth is enabled by the target or
+  /// user options, for the given register kind.
+  bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
+
+  /// \return True if maximizing vector bandwidth is enabled by the target or
+  /// user options, for the given vector factor.
+  bool useMaxBandwidth(ElementCount VF);
+
   /// \return The size (in bits) of the smallest and widest types in the code
   /// that needs to be vectorized. We ignore values that remain scalar such as
   /// 64 bit loop indices.
@@ -3921,6 +3929,20 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   return FixedScalableVFPair::getNone();
 }
 
+bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) {
+  return useMaxBandwidth(VF.isScalable()
+                             ? TargetTransformInfo::RGK_ScalableVector
+                             : TargetTransformInfo::RGK_FixedWidthVector);
+}
+
+bool LoopVectorizationCostModel::useMaxBandwidth(
+    TargetTransformInfo::RegisterKind RegKind) {
+  return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
+                               (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+                                (UseWiderVFIfCallVariantsPresent &&
+                                 Legal->hasVectorCallVariants())));
+}
+
 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
     ElementCount MaxSafeVF, bool FoldTailByMasking) {
@@ -3986,10 +4008,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                            : TargetTransformInfo::RGK_FixedWidthVector;
   ElementCount MaxVF = MaxVectorElementCount;
-  if (MaximizeBandwidth ||
-      (MaximizeBandwidth.getNumOccurrences() == 0 &&
-       (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
-        (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
+  if (useMaxBandwidth(RegKind)) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);
@@ -4344,15 +4363,21 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
   for (auto &P : VPlans) {
     ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                                P->vectorFactors().end());
-    auto RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
-    for (auto [VF, RU] : zip_equal(VFs, RUs)) {
+
+    SmallVector<VPRegisterUsage, 8> RUs;
+    if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
+        CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
+
+    for (unsigned I = 0; I < VFs.size(); I++) {
+      ElementCount VF = VFs[I];
       // The cost for scalar VF=1 is already calculated, so ignore it.
       if (VF.isScalar())
         continue;
 
       /// Don't consider the VF if it exceeds the number of registers for the
       /// target.
-      if (RU.exceedsMaxNumRegs(TTI))
+      if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI))
         continue;
 
       InstructionCost C = CM.expectedCost(VF);
@@ -7106,8 +7131,14 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   for (auto &P : VPlans) {
     ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                                P->vectorFactors().end());
-    auto RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
-    for (auto [VF, RU] : zip_equal(VFs, RUs)) {
+
+    SmallVector<VPRegisterUsage, 8> RUs;
+    if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
+        CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
+
+    for (unsigned I = 0; I < VFs.size(); I++) {
+      ElementCount VF = VFs[I];
       if (VF.isScalar())
         continue;
       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -7129,7 +7160,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
       InstructionCost Cost = cost(*P, VF);
       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
 
-      if (RU.exceedsMaxNumRegs(TTI)) {
+      if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) {
         LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
                           << VF << " because it uses too many registers\n");
         continue;
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll
new file mode 100644
index 000000000000..4c29a3a0d1d0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 5
+; RUN: opt -mattr=+mve -passes=loop-vectorize < %s -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-unknown-none-eabihf"
+
+; Even though it has high register pressure, this example should still vectorise since the mul+add chains become VMLAs.
+
+define void @fn(i32 noundef %n, ptr %in, ptr %out) #0 {
+; CHECK-LABEL: define void @fn(
+; CHECK-SAME: i32 noundef [[N:%.*]], ptr [[IN:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP46_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP46_NOT]], [[EXIT:label %.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[N]], 3
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT]], i32 [[TMP0]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[IN]], i32 [[TMP0]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[IN]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI2:%.*]] = phi ptr [ [[OUT]], %[[VECTOR_PH]] ], [ [[PTR_IND3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[VECTOR_GEP4:%.*]] = getelementptr i8, ptr [[POINTER_PHI2]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 1
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[VECTOR_GEP]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 2
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[WIDE_MASKED_GATHER]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP3]], splat (i32 19595)
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[WIDE_MASKED_GATHER5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], splat (i32 38470)
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i8> [[WIDE_MASKED_GATHER6]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw nsw <4 x i32> [[TMP7]], splat (i32 7471)
+; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw <4 x i32> [[TMP4]], splat (i32 32768)
+; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw <4 x i32> [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw <4 x i32> [[TMP10]], [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr <4 x i32> [[TMP11]], splat (i32 16)
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP3]], splat (i32 32767)
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw <4 x i32> [[TMP5]], splat (i32 16762097)
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw <4 x i32> [[TMP7]], splat (i32 16759568)
+; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw <4 x i32> [[TMP14]], splat (i32 32768)
+; CHECK-NEXT:    [[TMP18:%.*]] = add nuw <4 x i32> [[TMP17]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add <4 x i32> [[TMP18]], [[TMP16]]
+; CHECK-NEXT:    [[TMP20:%.*]] = lshr <4 x i32> [[TMP19]], splat (i32 16)
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i8>
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw nsw <4 x i32> [[TMP3]], splat (i32 13282)
+; CHECK-NEXT:    [[TMP23:%.*]] = mul nuw <4 x i32> [[TMP5]], splat (i32 16744449)
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw nsw <4 x i32> [[TMP7]], splat (i32 19485)
+; CHECK-NEXT:    [[TMP25:%.*]] = add nuw nsw <4 x i32> [[TMP22]], splat (i32 32768)
+; CHECK-NEXT:    [[TMP26:%.*]] = add nuw <4 x i32> [[TMP25]], [[TMP23]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw <4 x i32> [[TMP26]], [[TMP24]]
+; CHECK-NEXT:    [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], splat (i32 16)
+; CHECK-NEXT:    [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 1
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP13]], <4 x ptr> [[VECTOR_GEP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 2
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP21]], <4 x ptr> [[TMP30]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias [[META0]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP29]], <4 x ptr> [[TMP31]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias [[META0]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 12
+; CHECK-NEXT:    [[PTR_IND3]] = getelementptr i8, ptr [[POINTER_PHI2]], i32 12
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br [[EXIT_LOOPEXIT:label %.*]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  %cmp46.not = icmp eq i32 %n, 0
+  br i1 %cmp46.not, label %exit, label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %ptr.iv.1 = phi ptr [ %in, %entry ], [ %ptr.iv.1.next, %for.body ]
+  %ptr.iv.2 = phi ptr [ %out, %entry ], [ %ptr.iv.2.next, %for.body ]
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr %ptr.iv.1, i32 1
+  %0 = load i8, ptr %ptr.iv.1, align 1
+  %incdec.ptr1 = getelementptr inbounds nuw i8, ptr %ptr.iv.1, i32 2
+  %1 = load i8, ptr %incdec.ptr, align 1
+  %ptr.iv.1.next = getelementptr inbounds nuw i8, ptr %ptr.iv.1, i32 3
+  %2 = load i8, ptr %incdec.ptr1, align 1
+  %conv = zext i8 %0 to i32
+  %mul = mul nuw nsw i32 %conv, 19595
+  %conv3 = zext i8 %1 to i32
+  %mul4 = mul nuw nsw i32 %conv3, 38470
+  %conv5 = zext i8 %2 to i32
+  %mul6 = mul nuw nsw i32 %conv5, 7471
+  %add = add nuw nsw i32 %mul, 32768
+  %add7 = add nuw nsw i32 %add, %mul4
+  %add8 = add nuw nsw i32 %add7, %mul6
+  %shr = lshr i32 %add8, 16
+  %conv9 = trunc nuw i32 %shr to i8
+  %mul11 = mul nuw nsw i32 %conv, 32767
+  %mul13 = mul nuw i32 %conv3, 16762097
+  %mul16 = mul nuw i32 %conv5, 16759568
+  %add14 = add nuw nsw i32 %mul11, 32768
+  %add17 = add nuw i32 %add14, %mul13
+  %add18 = add i32 %add17, %mul16
+  %shr19 = lshr i32 %add18, 16
+  %conv20 = trunc i32 %shr19 to i8
+  %mul22 = mul nuw nsw i32 %conv, 13282
+  %mul24 = mul nuw i32 %conv3, 16744449
+  %mul27 = mul nuw nsw i32 %conv5, 19485
+  %add25 = add nuw nsw i32 %mul22, 32768
+  %add28 = add nuw i32 %add25, %mul24
+  %add29 = add nuw i32 %add28, %mul27
+  %shr30 = lshr i32 %add29, 16
+  %conv31 = trunc i32 %shr30 to i8
+  %incdec.ptr32 = getelementptr inbounds nuw i8, ptr %ptr.iv.2, i32 1
+  store i8 %conv9, ptr %ptr.iv.2, align 1
+  %incdec.ptr33 = getelementptr inbounds nuw i8, ptr %ptr.iv.2, i32 2
+  store i8 %conv20, ptr %incdec.ptr32, align 1
+  %ptr.iv.2.next = getelementptr inbounds nuw i8, ptr %ptr.iv.2, i32 3
+  store i8 %conv31, ptr %incdec.ptr33, align 1
+  %iv.next = add nuw i32 %iv, 1
+  %exitcond.not = icmp eq i32 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 1b0feef3e666..b7c9612e57ae 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -11,44 +11,44 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:  entry:
 ; SCALAR_EPILOGUE-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; SCALAR_EPILOGUE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_EPILOGUE:       vector.ph:
 ; SCALAR_EPILOGUE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 3
-; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_EPILOGUE:       vector.body:
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP9]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP10]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP12]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP13]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP15]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP14]], <vscale x 8 x ptr> [[TMP16]], i32 1, <vscale x 8 x i1> [[TMP7]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP14]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP18]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP17]], <vscale x 8 x ptr> [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP9]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1)
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP12]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP18]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
@@ -63,42 +63,42 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING:       vector.ph:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 3
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 [[INDEX]], i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i1> [[TMP5]], <vscale x 8 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP9]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP12]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP13]], <vscale x 8 x ptr> [[TMP15]], i32 1, <vscale x 8 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP16]], <vscale x 8 x ptr> [[TMP18]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -180,60 +180,60 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:  entry:
 ; SCALAR_EPILOGUE-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; SCALAR_EPILOGUE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_EPILOGUE:       vector.ph:
 ; SCALAR_EPILOGUE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 3
-; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_EPILOGUE:       vector.body:
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 3)
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP12]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP13]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP14]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP15]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP16]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP17]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP18]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP21:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP20]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER3]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP23:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP22]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP24:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP24]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP20]], <vscale x 8 x ptr> [[TMP25]], i32 1, <vscale x 8 x i1> [[TMP7]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP26:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP26]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP21]], <vscale x 8 x ptr> [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP7]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP28:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP28]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP22]], <vscale x 8 x ptr> [[TMP29]], i32 1, <vscale x 8 x i1> [[TMP7]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP30:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP30]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP23]], <vscale x 8 x ptr> [[TMP31]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1)
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 2)
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 3)
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP12]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP14]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP16]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP18]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP21:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP20]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP23:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP22]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP24:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP24]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP25]], i32 1, <vscale x 16 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP26:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP26]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP27]], i32 1, <vscale x 16 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP28:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP28]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP29]], i32 1, <vscale x 16 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP30:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP30]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP23]], <vscale x 16 x ptr> [[TMP31]], i32 1, <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
@@ -248,58 +248,58 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING:       vector.ph:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 3
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 [[INDEX]], i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i1> [[TMP5]], <vscale x 8 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 2)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 3)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP12]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP14]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP16]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP18]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP21]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP23]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP19]], <vscale x 8 x ptr> [[TMP24]], i32 1, <vscale x 8 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP25]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP20]], <vscale x 8 x ptr> [[TMP26]], i32 1, <vscale x 8 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP27]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP21]], <vscale x 8 x ptr> [[TMP28]], i32 1, <vscale x 8 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP29]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP22]], <vscale x 8 x ptr> [[TMP30]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
index cb071f989daf..5a67b54c7a3d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
@@ -3,8 +3,7 @@
 
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
-; CHECK:       LV(REG): VF = 8
-; CHECK-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK:  LV(REG): Found max usage: 2 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
index 15facfc48137..d4909fa61b4f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
@@ -4,14 +4,12 @@
 
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
-; ZVFH:       LV(REG): VF = 8
-; ZVFH-NEXT:  LV(REG): Found max usage: 2 item
+; ZVFH:  LV(REG): Found max usage: 2 item
 ; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; ZVFH-NEXT:  LV(REG): Found invariant usage: 1 item
 ; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; ZVFHMIN:       LV(REG): VF = 8
-; ZVFHMIN-NEXT:  LV(REG): Found max usage: 2 item
+; ZVFHMIN:  LV(REG): Found max usage: 2 item
 ; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; ZVFHMIN-NEXT:  LV(REG): Found invariant usage: 1 item
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
index 870f52876c5a..cee0b1222b6b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
@@ -28,28 +28,24 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture rea
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::FPRRC, 2 registers
 ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL1:       LV(REG): VF = 2
-; CHECK-LMUL1-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK-LMUL1:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-LMUL1-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL2:       LV(REG): VF = 4
-; CHECK-LMUL2-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK-LMUL2:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
+; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; CHECK-LMUL2-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL4:       LV(REG): VF = 8
-; CHECK-LMUL4-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK-LMUL4:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
+; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 ; CHECK-LMUL4-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL8:       LV(REG): VF = 16
-; CHECK-LMUL8-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK-LMUL8:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
+; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 16 registers
 ; CHECK-LMUL8-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 
@@ -80,21 +76,17 @@ define void @goo(ptr nocapture noundef %a, i32 noundef signext %n) {
 ; CHECK-SCALAR:      LV(REG): VF = 1
 ; CHECK-SCALAR-NEXT: LV(REG): Found max usage: 1 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL1:       LV(REG): VF = 2
-; CHECK-LMUL1-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
-; CHECK-LMUL2:       LV(REG): VF = 4
-; CHECK-LMUL2-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-LMUL1:       LV(REG): Found max usage: 2 item
+; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 1 registers
+; CHECK-LMUL2:       LV(REG): Found max usage: 2 item
+; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
-; CHECK-LMUL4:       LV(REG): VF = 8
-; CHECK-LMUL4-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-LMUL4:       LV(REG): Found max usage: 2 item
+; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
-; CHECK-LMUL8:       LV(REG): VF = 16
-; CHECK-LMUL8-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-LMUL8:       LV(REG): Found max usage: 2 item
+; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 entry:
   %cmp3 = icmp sgt i32 %n, 0

From a75e0627f97ccc36ec222a53c6a1106157a380ac Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Mon, 16 Jun 2025 10:02:38 +0100
Subject: [PATCH 0479/1322] [LV] Use vscale for tuning when updating profile
 information (#143690)

In fixVectorizedLoop we call setProfileInfoAfterUnrolling to update the
profile information after vectorising, however for scalable VFs we
pessimistically assume vscale=1. We can improve upon this by using the
value of vscale used for tuning, i.e. when targeting neoverse-v1 the
expected value is 2.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  35 +++---
 .../LoopVectorize/AArch64/check-prof-info.ll  | 119 ++++++++++++++++++
 .../LoopVectorize/check-prof-info.ll          |  32 +++++
 3 files changed, 169 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bdbfecd96244..bd0a2ec3986d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2691,6 +2691,20 @@ static void cse(BasicBlock *BB) {
   }
 }
 
+/// This function attempts to return a value that represents the vectorization
+/// factor at runtime. For fixed-width VFs we know this precisely at compile
+/// time, but for scalable VFs we calculate it based on an estimate of the
+/// vscale value.
+static unsigned getEstimatedRuntimeVF(ElementCount VF,
+                                      std::optional<unsigned> VScale) {
+  unsigned EstimatedVF = VF.getKnownMinValue();
+  if (VF.isScalable())
+    if (VScale)
+      EstimatedVF *= *VScale;
+  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
+  return EstimatedVF;
+}
+
 InstructionCost
 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                               ElementCount VF) const {
@@ -2790,10 +2804,11 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   //
   // For scalable vectorization we can't know at compile time how many
   // iterations of the loop are handled in one vector iteration, so instead
-  // assume a pessimistic vscale of '1'.
+  // use the value of vscale used for tuning.
   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
-  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
-                               VF.getKnownMinValue() * UF);
+  unsigned EstimatedVFxUF =
+      getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
+  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
 }
 
 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -4031,20 +4046,6 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
   return MaxVF;
 }
 
-/// This function attempts to return a value that represents the vectorization
-/// factor at runtime. For fixed-width VFs we know this precisely at compile
-/// time, but for scalable VFs we calculate it based on an estimate of the
-/// vscale value.
-static unsigned getEstimatedRuntimeVF(ElementCount VF,
-                                      std::optional<unsigned> VScale) {
-  unsigned EstimatedVF = VF.getKnownMinValue();
-  if (VF.isScalable())
-    if (VScale)
-      EstimatedVF *= *VScale;
-  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
-  return EstimatedVF;
-}
-
 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
                                                 const VectorizationFactor &B,
                                                 const unsigned MaxTripCount,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
new file mode 100644
index 000000000000..9435c544fc81
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:" --version 5
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v1 -force-vector-interleave=1 -S < %s |  FileCheck %s -check-prefix=CHECK-V1-IC1
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=1 -S < %s |  FileCheck %s -check-prefix=CHECK-V2-IC1
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-V2-IC4
+
+target triple = "aarch64-unknown-linux-gnu"
+
+@a = global [1024 x i32] zeroinitializer, align 16
+@b = global [1024 x i32] zeroinitializer, align 16
+
+; We expect the branch weight computations after vectorisation to use
+; vscale=2 for neoverse-v1 and vscale=1 for neoverse-v2.
+define void @_Z3foov() {
+; CHECK-V1-IC1-LABEL: define void @_Z3foov(
+; CHECK-V1-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V1-IC1:  [[ENTRY:.*:]]
+; CHECK-V1-IC1:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V1-IC1:  [[VECTOR_PH]]:
+; CHECK-V1-IC1:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V1-IC1:  [[VECTOR_BODY]]:
+; CHECK-V1-IC1:    br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-V1-IC1:  [[MIDDLE_BLOCK]]:
+; CHECK-V1-IC1:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF4:![0-9]+]]
+; CHECK-V1-IC1:  [[SCALAR_PH]]:
+; CHECK-V1-IC1:    br label %[[FOR_BODY:.*]]
+; CHECK-V1-IC1:  [[FOR_BODY]]:
+; CHECK-V1-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-V1-IC1:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC1-LABEL: define void @_Z3foov(
+; CHECK-V2-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V2-IC1:  [[ENTRY:.*:]]
+; CHECK-V2-IC1:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V2-IC1:  [[VECTOR_PH]]:
+; CHECK-V2-IC1:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC1:  [[VECTOR_BODY]]:
+; CHECK-V2-IC1:    br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC1:  [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC1:    br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-V2-IC1:  [[SCALAR_PH]]:
+; CHECK-V2-IC1:    br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC1:  [[FOR_BODY]]:
+; CHECK-V2-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC1:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC4-LABEL: define void @_Z3foov(
+; CHECK-V2-IC4-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V2-IC4:  [[VEC_EPILOG_VECTOR_BODY1:.*:]]
+; CHECK-V2-IC4:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V2-IC4:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V2-IC4:    br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V2-IC4:  [[VECTOR_PH]]:
+; CHECK-V2-IC4:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC4:  [[VECTOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[TMP12:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC4:  [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC4:    br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V2-IC4:    br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_PH]]:
+; CHECK-V2-IC4:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[TMP23:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V2-IC4:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V2-IC4:    br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC4:  [[FOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-V2-IC4:  [[FOR_COND_CLEANUP]]:
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %iv
+  %load = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
+  store i32 %load, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !0
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 1023}
+;.
+; CHECK-V1-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V1-IC1: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK-V1-IC1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V1-IC1: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V1-IC1: [[PROF4]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V1-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V1-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META2]]}
+;.
+; CHECK-V2-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V2-IC1: [[PROF1]] = !{!"branch_weights", i32 1, i32 255}
+; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
+;.
+; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-V2-IC4: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15}
+; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 2, i32 0}
+; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]}
+; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 1}
+; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
index 87c1ccb70227..40741941d4b0 100644
--- a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:"
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s |  FileCheck %s
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-MASKED
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 \
+; RUN:   -scalable-vectorization=on -force-target-supports-scalable-vectors -S < %s |  FileCheck %s -check-prefix=CHECK-SCALABLE
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -39,6 +41,21 @@ define void @_Z3foov() {
 ; CHECK-MASKED:  for.body:
 ; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
+; CHECK-SCALABLE-LABEL: @_Z3foov(
+; CHECK-SCALABLE:  entry:
+; CHECK-SCALABLE:    br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-SCALABLE:  vector.ph:
+; CHECK-SCALABLE:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SCALABLE:  vector.body:
+; CHECK-SCALABLE:    br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-SCALABLE:  middle.block:
+; CHECK-SCALABLE:    br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-SCALABLE:  scalar.ph:
+; CHECK-SCALABLE:    br label [[FOR_BODY:%.*]]
+; CHECK-SCALABLE:  for.cond.cleanup:
+; CHECK-SCALABLE:  for.body:
+; CHECK-SCALABLE:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+;
 entry:
   br label %for.body
 
@@ -92,6 +109,21 @@ define void @_Z3foo2v() {
 ; CHECK-MASKED:  for.body:
 ; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
 ;
+; CHECK-SCALABLE-LABEL: @_Z3foo2v(
+; CHECK-SCALABLE:  entry:
+; CHECK-SCALABLE:    br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0]]
+; CHECK-SCALABLE:  vector.ph:
+; CHECK-SCALABLE:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SCALABLE:  vector.body:
+; CHECK-SCALABLE:    br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-SCALABLE:  middle.block:
+; CHECK-SCALABLE:    br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
+; CHECK-SCALABLE:  scalar.ph:
+; CHECK-SCALABLE:    br label [[FOR_BODY:%.*]]
+; CHECK-SCALABLE:  for.cond.cleanup:
+; CHECK-SCALABLE:  for.body:
+; CHECK-SCALABLE:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+;
 entry:
   br label %for.body
 

From 79a2b15a4c2d63784fe2a92a72828a14b72412df Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Mon, 16 Jun 2025 11:12:15 +0200
Subject: [PATCH 0480/1322] [libc++] Remove a few workarounds for old Clang
 versions (#143858)

---
 libcxx/include/__config         |  9 +--------
 libcxx/include/__utility/pair.h | 15 +++------------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/libcxx/include/__config b/libcxx/include/__config
index 38c47e8d45c8..af8a297fdf3f 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -505,13 +505,6 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_HIDE_FROM_ABI_AFTER_V1 _LIBCPP_HIDE_FROM_ABI
 #  endif
 
-// TODO: Remove this workaround once we drop support for Clang 16
-#  if __has_warning("-Wc++23-extensions")
-#    define _LIBCPP_CLANG_DIAGNOSTIC_IGNORED_CXX23_EXTENSION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++23-extensions")
-#  else
-#    define _LIBCPP_CLANG_DIAGNOSTIC_IGNORED_CXX23_EXTENSION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++2b-extensions")
-#  endif
-
 // Clang modules take a significant compile time hit when pushing and popping diagnostics.
 // Since all the headers are marked as system headers unless _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER is defined, we can
 // simply disable this pushing and popping when _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER isn't defined.
@@ -522,7 +515,7 @@ typedef __char32_t char32_t;
       _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++14-extensions")                                                           \
       _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++17-extensions")                                                           \
       _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++20-extensions")                                                           \
-      _LIBCPP_CLANG_DIAGNOSTIC_IGNORED_CXX23_EXTENSION                                                                 \
+      _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++23-extensions")                                                           \
       _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wc++14-extensions")                                                             \
       _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wc++17-extensions")                                                             \
       _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wc++20-extensions")                                                             \
diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h
index ab390aafa0d9..dbacbce04476 100644
--- a/libcxx/include/__utility/pair.h
+++ b/libcxx/include/__utility/pair.h
@@ -209,21 +209,12 @@ struct pair
 #  endif
 
 #  if _LIBCPP_STD_VER >= 23
-  // TODO: Remove this workaround in LLVM 20. The bug got fixed in Clang 18.
-  // This is a workaround for http://llvm.org/PR60710. We should be able to remove it once Clang is fixed.
-  template <class _PairLike>
-  _LIBCPP_HIDE_FROM_ABI static constexpr bool __pair_like_explicit_wknd() {
-    if constexpr (__pair_like_no_subrange<_PairLike>) {
-      return !is_convertible_v<decltype(std::get<0>(std::declval<_PairLike&&>())), first_type> ||
-             !is_convertible_v<decltype(std::get<1>(std::declval<_PairLike&&>())), second_type>;
-    }
-    return false;
-  }
-
   template <__pair_like_no_subrange _PairLike>
     requires(is_constructible_v<first_type, decltype(std::get<0>(std::declval<_PairLike &&>()))> &&
              is_constructible_v<second_type, decltype(std::get<1>(std::declval<_PairLike &&>()))>)
-  _LIBCPP_HIDE_FROM_ABI constexpr explicit(__pair_like_explicit_wknd<_PairLike>()) pair(_PairLike&& __p)
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit(
+      !is_convertible_v<decltype(std::get<0>(std::declval<_PairLike&&>())), first_type> ||
+      !is_convertible_v<decltype(std::get<1>(std::declval<_PairLike&&>())), second_type>) pair(_PairLike&& __p)
       : first(std::get<0>(std::forward<_PairLike>(__p))), second(std::get<1>(std::forward<_PairLike>(__p))) {}
 #  endif
 

From eddab9b757722da7b908723a5a61d280540b48cf Mon Sep 17 00:00:00 2001
From: Oliver Hunt <oliver@apple.com>
Date: Mon, 16 Jun 2025 12:12:22 +0300
Subject: [PATCH 0481/1322] [clang] Fix PointerAuth semantics of
 cpp_trivially_relocatable (#143969)

This adds a number of functions to ASTContext to query whether a
type contains data protected with address discriminated pointer
authentication, and whether the protected values are just vtable
pointers, or if there are other address discriminated types included.

For the standardized version, __builtin_is_cpp_trivially_relocatable
this means accepting types where the only address discriminated
values are vtable pointers. Other address discriminated types are
not considered relocatable. In addition to that any union containing
any address discriminated data, including vtable pointers, is not
relocatable.

For the old deprecated __builtin_is_trivially_relocatable we reject
any type containing any address discriminated value, as it is
semantically intended as being a "is this memcopyable" which is
not true for anything with address discrimination.

This PR does not update the codegen for __builtin_trivially_relocate,
that will be in a follow on PR that is much more complex.
---
 clang/include/clang/AST/ASTContext.h          |  39 +++++++
 clang/lib/AST/ASTContext.cpp                  |  67 +++++++++++
 clang/lib/Sema/SemaTypeTraits.cpp             |  17 ++-
 .../SemaCXX/cxx2c-trivially-relocatable.cpp   |   1 +
 clang/test/SemaCXX/ptrauth-triviality.cpp     |  44 ++++++-
 .../SemaCXX/trivially-relocatable-ptrauth.cpp | 109 ++++++++++++++++++
 6 files changed, 268 insertions(+), 9 deletions(-)
 create mode 100644 clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 3abb49312255..e01361e2466b 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -629,10 +629,48 @@ public:
   void setRelocationInfoForCXXRecord(const CXXRecordDecl *,
                                      CXXRecordDeclRelocationInfo);
 
+  /// Examines a given type, and returns whether the type itself
+  /// is address discriminated, or any transitively embedded types
+  /// contain data that is address discriminated. This includes
+  /// implicitly authenticated values like vtable pointers, as well as
+  /// explicitly qualified fields.
+  bool containsAddressDiscriminatedPointerAuth(QualType T) {
+    if (!isPointerAuthenticationAvailable())
+      return false;
+    return findPointerAuthContent(T) != PointerAuthContent::None;
+  }
+
+  /// Examines a given type, and returns whether the type itself
+  /// or any data it transitively contains has a pointer authentication
+  /// schema that is not safely relocatable. e.g. any data or fields
+  /// with address discrimination other than any otherwise similar
+  /// vtable pointers.
+  bool containsNonRelocatablePointerAuth(QualType T) {
+    if (!isPointerAuthenticationAvailable())
+      return false;
+    return findPointerAuthContent(T) ==
+           PointerAuthContent::AddressDiscriminatedData;
+  }
+
 private:
   llvm::DenseMap<const CXXRecordDecl *, CXXRecordDeclRelocationInfo>
       RelocatableClasses;
 
+  // FIXME: store in RecordDeclBitfields in future?
+  enum class PointerAuthContent : uint8_t {
+    None,
+    AddressDiscriminatedVTable,
+    AddressDiscriminatedData
+  };
+
+  // A simple helper function to short circuit pointer auth checks.
+  bool isPointerAuthenticationAvailable() const {
+    return LangOpts.PointerAuthCalls || LangOpts.PointerAuthIntrinsics;
+  }
+  PointerAuthContent findPointerAuthContent(QualType T);
+  llvm::DenseMap<const RecordDecl *, PointerAuthContent>
+      RecordContainsAddressDiscriminatedPointerAuth;
+
   ImportDecl *FirstLocalImport = nullptr;
   ImportDecl *LastLocalImport = nullptr;
 
@@ -3668,6 +3706,7 @@ public:
   /// authentication policy for the specified record.
   const CXXRecordDecl *
   baseForVTableAuthentication(const CXXRecordDecl *ThisClass);
+
   bool useAbbreviatedThunkName(GlobalDecl VirtualMethodDecl,
                                StringRef MangledName);
 
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 4d44f23c0f50..189e67e4eed0 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1705,6 +1705,73 @@ void ASTContext::setRelocationInfoForCXXRecord(
   RelocatableClasses.insert({D, Info});
 }
 
+static bool primaryBaseHaseAddressDiscriminatedVTableAuthentication(
+    ASTContext &Context, const CXXRecordDecl *Class) {
+  if (!Class->isPolymorphic())
+    return false;
+  const CXXRecordDecl *BaseType = Context.baseForVTableAuthentication(Class);
+  using AuthAttr = VTablePointerAuthenticationAttr;
+  const AuthAttr *ExplicitAuth = BaseType->getAttr<AuthAttr>();
+  if (!ExplicitAuth)
+    return Context.getLangOpts().PointerAuthVTPtrAddressDiscrimination;
+  AuthAttr::AddressDiscriminationMode AddressDiscrimination =
+      ExplicitAuth->getAddressDiscrimination();
+  if (AddressDiscrimination == AuthAttr::DefaultAddressDiscrimination)
+    return Context.getLangOpts().PointerAuthVTPtrAddressDiscrimination;
+  return AddressDiscrimination == AuthAttr::AddressDiscrimination;
+}
+
+ASTContext::PointerAuthContent ASTContext::findPointerAuthContent(QualType T) {
+  assert(isPointerAuthenticationAvailable());
+
+  T = T.getCanonicalType();
+  if (T.hasAddressDiscriminatedPointerAuth())
+    return PointerAuthContent::AddressDiscriminatedData;
+  const RecordDecl *RD = T->getAsRecordDecl();
+  if (!RD)
+    return PointerAuthContent::None;
+
+  if (auto Existing = RecordContainsAddressDiscriminatedPointerAuth.find(RD);
+      Existing != RecordContainsAddressDiscriminatedPointerAuth.end())
+    return Existing->second;
+
+  PointerAuthContent Result = PointerAuthContent::None;
+
+  auto SaveResultAndReturn = [&]() -> PointerAuthContent {
+    auto [ResultIter, DidAdd] =
+        RecordContainsAddressDiscriminatedPointerAuth.try_emplace(RD, Result);
+    (void)ResultIter;
+    (void)DidAdd;
+    assert(DidAdd);
+    return Result;
+  };
+  auto ShouldContinueAfterUpdate = [&](PointerAuthContent NewResult) {
+    static_assert(PointerAuthContent::None <
+                  PointerAuthContent::AddressDiscriminatedVTable);
+    static_assert(PointerAuthContent::AddressDiscriminatedVTable <
+                  PointerAuthContent::AddressDiscriminatedData);
+    if (NewResult > Result)
+      Result = NewResult;
+    return Result != PointerAuthContent::AddressDiscriminatedData;
+  };
+  if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+    if (primaryBaseHaseAddressDiscriminatedVTableAuthentication(*this, CXXRD) &&
+        !ShouldContinueAfterUpdate(
+            PointerAuthContent::AddressDiscriminatedVTable))
+      return SaveResultAndReturn();
+    for (auto Base : CXXRD->bases()) {
+      if (!ShouldContinueAfterUpdate(findPointerAuthContent(Base.getType())))
+        return SaveResultAndReturn();
+    }
+  }
+  for (auto *FieldDecl : RD->fields()) {
+    if (!ShouldContinueAfterUpdate(
+            findPointerAuthContent(FieldDecl->getType())))
+      return SaveResultAndReturn();
+  }
+  return SaveResultAndReturn();
+}
+
 void ASTContext::addedLocalImportDecl(ImportDecl *Import) {
   assert(!Import->getNextLocalImport() &&
          "Import declaration already in the chain");
diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index 1738ab446600..4dbb2450857e 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -188,6 +188,7 @@ static bool IsEligibleForTrivialRelocation(Sema &SemaRef,
       return false;
   }
 
+  bool IsUnion = D->isUnion();
   for (const FieldDecl *Field : D->fields()) {
     if (Field->getType()->isDependentType())
       continue;
@@ -197,6 +198,12 @@ static bool IsEligibleForTrivialRelocation(Sema &SemaRef,
     // of a trivially relocatable type
     if (!SemaRef.IsCXXTriviallyRelocatableType(Field->getType()))
       return false;
+
+    // A union contains values with address discriminated pointer auth
+    // cannot be relocated.
+    if (IsUnion && SemaRef.Context.containsAddressDiscriminatedPointerAuth(
+                       Field->getType()))
+      return false;
   }
   return !D->hasDeletedDestructor();
 }
@@ -313,7 +320,6 @@ bool Sema::IsCXXTriviallyRelocatableType(const CXXRecordDecl &RD) {
 }
 
 bool Sema::IsCXXTriviallyRelocatableType(QualType Type) {
-
   QualType BaseElementType = getASTContext().getBaseElementType(Type);
 
   if (Type->isVariableArrayType())
@@ -322,10 +328,10 @@ bool Sema::IsCXXTriviallyRelocatableType(QualType Type) {
   if (BaseElementType.hasNonTrivialObjCLifetime())
     return false;
 
-  if (BaseElementType.hasAddressDiscriminatedPointerAuth())
+  if (BaseElementType->isIncompleteType())
     return false;
 
-  if (BaseElementType->isIncompleteType())
+  if (Context.containsNonRelocatablePointerAuth(Type))
     return false;
 
   if (BaseElementType->isScalarType() || BaseElementType->isVectorType())
@@ -670,7 +676,10 @@ static bool IsTriviallyRelocatableType(Sema &SemaRef, QualType T) {
   if (!BaseElementType->isObjectType())
     return false;
 
-  if (T.hasAddressDiscriminatedPointerAuth())
+  // The deprecated __builtin_is_trivially_relocatable does not have
+  // an equivalent to __builtin_trivially_relocate, so there is no
+  // safe way to use it if there are any address discriminated values.
+  if (SemaRef.getASTContext().containsAddressDiscriminatedPointerAuth(T))
     return false;
 
   if (const auto *RD = BaseElementType->getAsCXXRecordDecl();
diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
index 9d43994ee766..7152a5937d9b 100644
--- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
+++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c++2c -verify %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-intrinsics -fptrauth-calls -std=c++2c -verify %s
 
 class Trivial {};
 static_assert(__builtin_is_cpp_trivially_relocatable(Trivial));
diff --git a/clang/test/SemaCXX/ptrauth-triviality.cpp b/clang/test/SemaCXX/ptrauth-triviality.cpp
index 60d1b57230f1..ba8a8273d5c0 100644
--- a/clang/test/SemaCXX/ptrauth-triviality.cpp
+++ b/clang/test/SemaCXX/ptrauth-triviality.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
-// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++26 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++26 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
 
 #define AQ __ptrauth(1,1,50)
 #define IQ __ptrauth(1,0,50)
@@ -83,7 +83,7 @@ static_assert(!__is_trivially_constructible(Holder<S3>, const Holder<S3>&));
 static_assert(!__is_trivially_assignable(Holder<S3>, const Holder<S3>&));
 static_assert(__is_trivially_destructible(Holder<S3>));
 static_assert(!__is_trivially_copyable(Holder<S3>));
-static_assert(__is_trivially_relocatable(Holder<S3>)); // expected-warning{{deprecated}}
+static_assert(!__is_trivially_relocatable(Holder<S3>)); // expected-warning{{deprecated}}
 static_assert(__builtin_is_cpp_trivially_relocatable(Holder<S3>));
 static_assert(!__is_trivially_equality_comparable(Holder<S3>));
 
@@ -99,7 +99,6 @@ static_assert(!__is_trivially_assignable(S4, const S4&));
 static_assert(__is_trivially_destructible(S4));
 static_assert(!__is_trivially_copyable(S4));
 static_assert(!__is_trivially_relocatable(S4)); // expected-warning{{deprecated}}
-//FIXME
 static_assert(__builtin_is_cpp_trivially_relocatable(S4));
 static_assert(!__is_trivially_equality_comparable(S4));
 
@@ -124,7 +123,6 @@ static_assert(!__is_trivially_assignable(S5, const S5&));
 static_assert(__is_trivially_destructible(S5));
 static_assert(!__is_trivially_copyable(S5));
 static_assert(!__is_trivially_relocatable(S5)); // expected-warning{{deprecated}}
-//FIXME
 static_assert(__builtin_is_cpp_trivially_relocatable(S5));
 static_assert(!__is_trivially_equality_comparable(S5));
 
@@ -182,3 +180,39 @@ static_assert(__is_trivially_copyable(Holder<S7>));
 static_assert(__is_trivially_relocatable(Holder<S7>)); // expected-warning{{deprecated}}
 static_assert(__builtin_is_cpp_trivially_relocatable(Holder<S7>));
 static_assert(__is_trivially_equality_comparable(Holder<S7>));
+
+template <class... Bases> struct MultipleInheriter : Bases... {
+};
+
+template <class T> static const bool test_is_trivially_relocatable_v = __builtin_is_cpp_trivially_relocatable(T);
+template <class... Types> static const bool multiple_inheritance_is_relocatable = test_is_trivially_relocatable_v<MultipleInheriter<Types...>>;
+template <class... Types> static const bool inheritance_relocatability_matches_bases_v =
+  (test_is_trivially_relocatable_v<Types> && ...) == multiple_inheritance_is_relocatable<Types...>;
+
+static_assert(multiple_inheritance_is_relocatable<S4, S5> == multiple_inheritance_is_relocatable<S5, S4>);
+static_assert(inheritance_relocatability_matches_bases_v<S4, S5>);
+static_assert(inheritance_relocatability_matches_bases_v<S5, S4>);
+
+struct AA AddressDiscriminatedPolymorphicBase trivially_relocatable_if_eligible {
+  virtual void foo();
+};
+
+struct IA NoAddressDiscriminatedPolymorphicBase trivially_relocatable_if_eligible {
+  virtual void bar();
+};
+
+template <class T> struct UnionWrapper trivially_relocatable_if_eligible {
+  union U {
+    T field1;
+  } u;
+};
+
+static_assert(test_is_trivially_relocatable_v<AddressDiscriminatedPolymorphicBase>);
+static_assert(test_is_trivially_relocatable_v<NoAddressDiscriminatedPolymorphicBase>);
+static_assert(inheritance_relocatability_matches_bases_v<AddressDiscriminatedPolymorphicBase, NoAddressDiscriminatedPolymorphicBase>);
+static_assert(inheritance_relocatability_matches_bases_v<NoAddressDiscriminatedPolymorphicBase, AddressDiscriminatedPolymorphicBase>);
+
+static_assert(!test_is_trivially_relocatable_v<UnionWrapper<AddressDiscriminatedPolymorphicBase>>);
+static_assert(test_is_trivially_relocatable_v<UnionWrapper<NoAddressDiscriminatedPolymorphicBase>>);
+static_assert(!test_is_trivially_relocatable_v<UnionWrapper<MultipleInheriter<NoAddressDiscriminatedPolymorphicBase, AddressDiscriminatedPolymorphicBase>>>);
+static_assert(!test_is_trivially_relocatable_v<UnionWrapper<MultipleInheriter<AddressDiscriminatedPolymorphicBase, NoAddressDiscriminatedPolymorphicBase>>>);
diff --git a/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp b/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp
new file mode 100644
index 000000000000..b38499a634fc
--- /dev/null
+++ b/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp
@@ -0,0 +1,109 @@
+// RUN: %clang_cc1 -triple arm64 -fptrauth-calls -fptrauth-intrinsics -std=c++26 -verify %s
+
+// This test intentionally does not enable the global address discrimination
+// of vtable pointers. This lets us configure them with different schemas
+// and verify that we're correctly tracking the existence of address discrimination
+
+// expected-no-diagnostics
+
+struct NonAddressDiscPtrauth {
+  void * __ptrauth(1, 0, 1234) p;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(NonAddressDiscPtrauth));
+
+struct AddressDiscPtrauth {
+  void * __ptrauth(1, 1, 1234) p;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(AddressDiscPtrauth));
+
+struct MultipleBaseClasses : NonAddressDiscPtrauth, AddressDiscPtrauth {
+
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(MultipleBaseClasses));
+
+struct MultipleMembers1 {
+   NonAddressDiscPtrauth field0;
+   AddressDiscPtrauth field1;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(MultipleMembers1));
+
+struct MultipleMembers2 {
+   NonAddressDiscPtrauth field0;
+   NonAddressDiscPtrauth field1;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(MultipleMembers2));
+
+struct UnionOfPtrauth {
+    union {
+        NonAddressDiscPtrauth field0;
+        AddressDiscPtrauth field1;
+    } u;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfPtrauth));
+
+struct [[clang::ptrauth_vtable_pointer(process_independent,address_discrimination,no_extra_discrimination)]] Polymorphic trivially_relocatable_if_eligible {
+  virtual ~Polymorphic();
+};
+
+struct Foo : Polymorphic {
+  Foo(const Foo&);
+  ~Foo();
+};
+
+
+static_assert(__builtin_is_cpp_trivially_relocatable(Polymorphic));
+
+struct [[clang::ptrauth_vtable_pointer(process_independent,no_address_discrimination,no_extra_discrimination)]] NonAddressDiscriminatedPolymorphic trivially_relocatable_if_eligible {
+  virtual ~NonAddressDiscriminatedPolymorphic();
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(NonAddressDiscriminatedPolymorphic));
+
+
+struct PolymorphicMembers {
+    Polymorphic field;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(PolymorphicMembers));
+
+struct UnionOfPolymorphic {
+  union trivially_relocatable_if_eligible {
+    Polymorphic p;
+    int i;
+  } u;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfPolymorphic));
+
+
+struct UnionOfNonAddressDiscriminatedPolymorphic {
+  union trivially_relocatable_if_eligible {
+    NonAddressDiscriminatedPolymorphic p;
+    int i;
+  } u;
+};
+static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfNonAddressDiscriminatedPolymorphic));
+
+struct UnionOfNonAddressDiscriminatedPtrauth {
+  union {
+    NonAddressDiscPtrauth p;
+    int i;
+  } u;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(UnionOfNonAddressDiscriminatedPtrauth));
+
+struct UnionOfAddressDisriminatedPtrauth {
+  union {
+    AddressDiscPtrauth p;
+    int i;
+  } u;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfAddressDisriminatedPtrauth));

From b2bf017acd0369fff89b933cf7c653f62b49f8d3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 16 Jun 2025 10:31:24 +0100
Subject: [PATCH 0482/1322] [X86] X86FixupInstTuning - prefer VPBLENDD to
 VPBLENDW shuffles on AVX2+ targets (#144269)

On many Intel AVX2 targets (Haswell+), VPBLENDD has notably better throughput than VPBLENDW - and the remaining Intel/AMD targets have no preference.

This patch replaces VPBLENDW shuffles if the shuffle mask can be safely widened from vXi16 to vXi32 and that the scheduler model doesn't consider it a regression (I haven't found any target where this is true, but we should retain the model check).

Noticed while working on #142972 where VMOVSS nodes were regressing to VPBLENDW nodes during domain switching.
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp    | 26 ++++++++++++
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   |  2 +-
 llvm/test/CodeGen/X86/dpbusd.ll               | 12 +++---
 llvm/test/CodeGen/X86/dpbusd_const.ll         | 16 +++----
 llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll |  6 +--
 .../CodeGen/X86/vector-reduce-add-mask.ll     |  2 +-
 .../CodeGen/X86/vector-reduce-add-zext.ll     |  4 +-
 llvm/test/CodeGen/X86/vector-reduce-add.ll    | 27 ++++++++----
 .../CodeGen/X86/zero_extend_vector_inreg.ll   | 42 +++++++++----------
 .../zero_extend_vector_inreg_of_broadcast.ll  |  6 +--
 10 files changed, 89 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 89093b2e1a3f..33dc0a232815 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -242,6 +242,26 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessUNPCKToIntDomain(NewOpc);
   };
 
+  auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool {
+    if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc))
+      return false;
+    // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits.
+    APInt MaskW =
+        APInt(8, MI.getOperand(NumOperands - 1).getImm(), /*IsSigned=*/false);
+    APInt MaskD = APIntOps::ScaleBitMask(MaskW, 4, /*MatchAllBits=*/true);
+    if (MaskW != APIntOps::ScaleBitMask(MaskD, 8, /*MatchAllBits=*/true))
+      return false;
+    APInt NewMaskD = APInt::getSplat(NumElts, MaskD);
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(MovOpc));
+      MI.removeOperand(NumOperands - 1);
+      MI.addOperand(MachineOperand::CreateImm(NewMaskD.getZExtValue()));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
+    return true;
+  };
+
   auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
                                unsigned MovImm) -> bool {
     if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
@@ -270,6 +290,12 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
            ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);
 
+  case X86::VPBLENDWrri:
+    // TODO: Add X86::VPBLENDWrmi handling
+    // TODO: Add X86::VPBLENDWYrri handling
+    // TODO: Add X86::VPBLENDWYrmi handling
+    return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
+
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
   case X86::VPERMILPDYri:
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 14e3767f6556..38ea796c0fcb 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -424,7 +424,7 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index 04d7a9691b64..3aa77c3955c6 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -317,8 +317,8 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
 ; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
 ; AVXVNNI-NEXT:    vmovd %xmm2, %eax
 ; AVXVNNI-NEXT:    addl %edx, %eax
@@ -328,9 +328,9 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
 ; AVX512VNNI:       # %bb.0: # %entry
 ; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
 ; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
@@ -343,8 +343,8 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
 ; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
 ; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index dfae853f9961..456e6e8f263a 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -27,7 +27,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
 ; AVXVNNI-LABEL: mul_4xi8_zc:
 ; AVXVNNI:       # %bb.0: # %entry
 ; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVXVNNI-NEXT:    {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVXVNNI-NEXT:    vmovd %xmm1, %eax
 ; AVXVNNI-NEXT:    addl %edi, %eax
@@ -36,7 +36,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
 ; AVX512VNNI-LABEL: mul_4xi8_zc:
 ; AVX512VNNI:       # %bb.0: # %entry
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VNNI-NEXT:    vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
 ; AVX512VNNI-NEXT:    vmovd %xmm1, %eax
@@ -47,7 +47,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
 ; AVX512VLVNNI-LABEL: mul_4xi8_zc:
 ; AVX512VLVNNI:       # %bb.0: # %entry
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VLVNNI-NEXT:    vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVX512VLVNNI-NEXT:    vmovd %xmm1, %eax
@@ -67,7 +67,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
 ; AVXVNNI-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVXVNNI-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVXVNNI-NEXT:    {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVXVNNI-NEXT:    vmovd %xmm1, %eax
 ; AVXVNNI-NEXT:    addl %edi, %eax
@@ -78,7 +78,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
 ; AVX512VNNI-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512VNNI-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VNNI-NEXT:    vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
 ; AVX512VNNI-NEXT:    vmovd %xmm1, %eax
@@ -107,7 +107,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
 ; AVXVNNI-LABEL: mul_4xi8_cs:
 ; AVXVNNI:       # %bb.0: # %entry
 ; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVXVNNI-NEXT:    vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm0, %xmm2, %xmm1
 ; AVXVNNI-NEXT:    vmovd %xmm1, %eax
@@ -117,7 +117,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
 ; AVX512VNNI-LABEL: mul_4xi8_cs:
 ; AVX512VNNI:       # %bb.0: # %entry
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
@@ -129,7 +129,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
 ; AVX512VLVNNI-LABEL: mul_4xi8_cs:
 ; AVX512VLVNNI:       # %bb.0: # %entry
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VLVNNI-NEXT:    vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 84ae818d9183..05c855ed90b3 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -1014,7 +1014,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -1023,7 +1023,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1038,7 +1038,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 4898ae98faea..983ae594e3ab 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -112,7 +112,7 @@ define i64 @test_v4i64_v4i16(<4 x i64> %a0) {
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
index 937ac3d2db88..d99b20038558 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
@@ -231,7 +231,7 @@ define i32 @test_v4i32(<4 x i8> %a0) {
 ; AVX2-LABEL: test_v4i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    retq
@@ -239,7 +239,7 @@ define i32 @test_v4i32(<4 x i8> %a0) {
 ; AVX512-LABEL: test_v4i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll
index 6cc0e1e73fcd..aed4e023e340 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -1025,19 +1025,28 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index ddd7f1016893..cacc43e96b6e 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -1329,7 +1329,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-NEXT:    vzeroupper
@@ -1340,7 +1340,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1351,7 +1351,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -2428,7 +2428,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-NEXT:    vzeroupper
@@ -2439,7 +2439,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -2450,7 +2450,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -4996,7 +4996,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX2-SLOW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
 ; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
@@ -5063,7 +5063,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX512BW-SLOW-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -5282,7 +5282,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
@@ -5295,7 +5295,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
@@ -5308,7 +5308,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -7347,9 +7347,9 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -7362,7 +7362,7 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -7376,7 +7376,7 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -7405,9 +7405,9 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -7419,7 +7419,7 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
@@ -7491,7 +7491,7 @@ define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
@@ -7504,7 +7504,7 @@ define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
@@ -7517,7 +7517,7 @@ define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index ed53c3693c9d..572ed314ab31 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -4875,7 +4875,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
 ; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
@@ -5068,7 +5068,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -6847,7 +6847,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)

From 3824a2dbcefe266849b9f8b3eaa1dd23354b15de Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 16 Jun 2025 11:48:55 +0200
Subject: [PATCH 0483/1322] [MemoryBuiltins] Support allocas in
 getInitialValueOfAllocation (NFC)

---
 llvm/lib/Analysis/MemoryBuiltins.cpp   | 3 +++
 llvm/lib/Transforms/IPO/Attributor.cpp | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index 6b7a3e1ffe34..e0b7f65d18a3 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -428,6 +428,9 @@ llvm::getAllocSize(const CallBase *CB, const TargetLibraryInfo *TLI,
 Constant *llvm::getInitialValueOfAllocation(const Value *V,
                                             const TargetLibraryInfo *TLI,
                                             Type *Ty) {
+  if (isa<AllocaInst>(V))
+    return UndefValue::get(Ty);
+
   auto *Alloc = dyn_cast<CallBase>(V);
   if (!Alloc)
     return nullptr;
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 050eed376ed3..dac1f7a30c37 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -242,8 +242,6 @@ Constant *
 AA::getInitialValueForObj(Attributor &A, const AbstractAttribute &QueryingAA,
                           Value &Obj, Type &Ty, const TargetLibraryInfo *TLI,
                           const DataLayout &DL, AA::RangeTy *RangePtr) {
-  if (isa<AllocaInst>(Obj))
-    return UndefValue::get(&Ty);
   if (Constant *Init = getInitialValueOfAllocation(&Obj, TLI, &Ty))
     return Init;
   auto *GV = dyn_cast<GlobalVariable>(&Obj);

From 299a55a88fae4fc423c440436b2632d2a6bd800a Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Mon, 16 Jun 2025 18:07:27 +0800
Subject: [PATCH 0484/1322] [InstCombine][Docs] Update InstCombine contributor
 guide (#144228)

Update the guideline to reduce the chance of miscompilation/performance
regression.

---------

Co-authored-by: Nikita Popov <github@npopov.com>
Co-authored-by: Antonio Frighetto <me@antoniofrighetto.com>
---
 llvm/docs/InstCombineContributorGuide.md | 39 ++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/llvm/docs/InstCombineContributorGuide.md b/llvm/docs/InstCombineContributorGuide.md
index b4041f8a5b93..cee0a7ce446a 100644
--- a/llvm/docs/InstCombineContributorGuide.md
+++ b/llvm/docs/InstCombineContributorGuide.md
@@ -404,11 +404,32 @@ The use of TargetTransformInfo is only allowed for hooks for target-specific
 intrinsics, such as `TargetTransformInfo::instCombineIntrinsic()`. These are
 already inherently target-dependent anyway.
 
+If some canonicalization narrow/widen the integer width of expressions, please
+check `shouldChangeType()` first. Otherwise, we may evaluate the expression 
+in illegal/inefficient types.
+
 For vector-specific transforms that require cost-modelling, the VectorCombine
 pass can be used instead. In very rare circumstances, if there are no other
 alternatives, target-dependent transforms may be accepted into
 AggressiveInstCombine.
 
+Generally, we prefer unsigned operations over signed operations in the middle-end, even
+if signed operations are more efficient on some targets. The following is an incomplete
+list of canonicalizations that are implemented in InstCombine:
+
+| Original Pattern             | Canonical Form             | Condition                     |
+|------------------------------|----------------------------|-------------------------------|
+| `icmp spred X, Y`            | `icmp samesign upred X, Y` | `sign(X) == sign(Y)`          |
+| `smin/smax X, Y`             | `umin/umax X, Y`           | `sign(X) == sign(Y)`          |
+| `sext X`                     | `zext nneg X`              | `X >=s 0`                     |
+| `sitofp X`                   | `uitofp nneg X`            | `X >=s 0`                     |
+| `ashr X, Y`                  | `lshr X, Y`                | `X >=s 0`                     |
+| `sdiv/srem X, Y`             | `udiv/urem X, Y`           | `X >=s 0 && Y >=s 0`          |
+| `add X, Y`                   | `or disjoint X, Y`         | `(X & Y) != 0`                |
+| `mul X, C`                   | `shl X, Log2(C)`           | `isPowerOf2(C)`               |
+| `select Cond1, Cond2, false` | `and Cond1, Cond2`         | `impliesPoison(Cond2, Cond1)` |
+| `select Cond1, true, Cond2`  | `or Cond1, Cond2`          | `impliesPoison(Cond2, Cond1)` |
+
 ### PatternMatch
 
 Many transforms make use of the matching infrastructure defined in
@@ -531,6 +552,19 @@ need to add a one-use check for the inner instruction.
 One-use checks can be performed using the `m_OneUse()` matcher, or the
 `V->hasOneUse()` method.
 
+### Flag handling
+
+When possible, favour propagation of poison-generating flags like `nuw` and `nsw` since they may be
+hard to salvage later. Avoid doing so if it introduces additional complexity (e.g. requires querying `willNotOverflow`
+or KnownBits).
+
+Be careful with in-place operand/predicate changes, as poison-generating flags may not be valid for new
+operands. It is recommended to create a new instruction with careful handling of flags. If not
+applicable, call `Instruction::dropPoisonGeneratingFlags()` to clear flags in a conservative manner.
+
+Do not rely on fcmp's `nsz` flag to perform optimizations. It is meaningless for fcmp so it should not affect
+the optimization.
+
 ### Generalization
 
 Transforms can both be too specific (only handling some odd subset of patterns,
@@ -558,6 +592,11 @@ guidelines.
    use of ValueTracking queries. Whether this makes sense depends on the case,
    but it's usually a good idea to only handle the constant pattern first, and
    then generalize later if it seems useful.
+ * When possible, handle more canonical patterns as well. It is encouraged to avoid
+   potential phase-ordering issues. For example, if the motivating transform holds for
+   `add`, it also holds for `or disjoint`. See the canonicalization list above for details.
+   In most cases, it can be easily implemented with matchers like
+   `m_AddLike/m_SExtLike/m_LogicalAnd/m_LogicalOr`.
 
 ## Guidelines for reviewers
 

From e00853859e89114d8db24aa0b863b618175f79c7 Mon Sep 17 00:00:00 2001
From: Rolf Morel <rolf.morel@intel.com>
Date: Mon, 16 Jun 2025 13:40:50 +0200
Subject: [PATCH 0485/1322] [MLIR][Transform] apply_registered_pass: support
 ListOptions (#144026)

Interpret an option value with multiple values, either in the form of an
`ArrayAttr` (either static or passed through a param) or as the multiple
attrs associated to a param, as a comma-separated list, i.e. as a
ListOption on a pass.
---
 .../mlir/Dialect/Transform/IR/TransformOps.td |   5 +-
 .../lib/Dialect/Transform/IR/TransformOps.cpp | 168 ++++++++++++------
 .../mlir/dialects/transform/__init__.py       |  59 +++---
 .../Transform/test-pass-application.mlir      | 145 ++++++++++++---
 mlir/test/python/dialects/transform.py        |  58 ++++--
 5 files changed, 311 insertions(+), 124 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index 0aa750e62543..62e66b3dabee 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -418,11 +418,14 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass",
         with options = { "top-down" = false,
                          "max-iterations" = %max_iter,
                          "test-convergence" = true,
-                         "max-num-rewrites" =  %max_rewrites }
+                         "max-num-rewrites" = %max_rewrites }
         to %module
     : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
     ```
 
+    Options' values which are `ArrayAttr`s are converted to comma-separated
+    lists of options. Likewise for params which associate multiple values.
+
     This op first looks for a pass pipeline with the specified name. If no such
     pipeline exists, it looks for a pass with the specified name. If no such
     pass exists either, this op fails definitely.
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index 582d082153be..bb9bdd70625e 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -788,46 +788,47 @@ transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter,
   // Obtain a single options-string to pass to the pass(-pipeline) from options
   // passed in as a dictionary of keys mapping to values which are either
   // attributes or param-operands pointing to attributes.
+  OperandRange dynamicOptions = getDynamicOptions();
 
   std::string options;
   llvm::raw_string_ostream optionsStream(options); // For "printing" attrs.
 
-  OperandRange dynamicOptions = getDynamicOptions();
-  for (auto [idx, namedAttribute] : llvm::enumerate(getOptions())) {
-    if (idx > 0)
-      optionsStream << " "; // Interleave options separator.
-    optionsStream << namedAttribute.getName().str(); // Append the key.
-    optionsStream << "="; // And the key-value separator.
-
-    Attribute valueAttrToAppend;
-    if (auto paramOperandIndex =
-            dyn_cast<transform::ParamOperandAttr>(namedAttribute.getValue())) {
-      // The corresponding value attribute is passed in via a param.
+  // A helper to convert an option's attribute value into a corresponding
+  // string representation, with the ability to obtain the attr(s) from a param.
+  std::function<void(Attribute)> appendValueAttr = [&](Attribute valueAttr) {
+    if (auto paramOperand = dyn_cast<transform::ParamOperandAttr>(valueAttr)) {
+      // The corresponding value attribute(s) is/are passed in via a param.
       // Obtain the param-operand via its specified index.
-      size_t dynamicOptionIdx = paramOperandIndex.getIndex().getInt();
+      size_t dynamicOptionIdx = paramOperand.getIndex().getInt();
       assert(dynamicOptionIdx < dynamicOptions.size() &&
-             "number of dynamic option markers (UnitAttr) in options ArrayAttr "
+             "the number of ParamOperandAttrs in the options DictionaryAttr"
              "should be the same as the number of options passed as params");
-      ArrayRef<Attribute> dynamicOption =
+      ArrayRef<Attribute> attrsAssociatedToParam =
           state.getParams(dynamicOptions[dynamicOptionIdx]);
-      if (dynamicOption.size() != 1)
-        return emitSilenceableError()
-               << "options passed as a param must have "
-                  "a single value associated, param "
-               << dynamicOptionIdx << " associates " << dynamicOption.size();
-      valueAttrToAppend = dynamicOption[0];
-    } else {
-      // Value is a static attribute.
-      valueAttrToAppend = namedAttribute.getValue();
-    }
-
-    // Append string representation of value attribute.
-    if (auto strAttr = dyn_cast<StringAttr>(valueAttrToAppend)) {
+      // Recursive so as to append all attrs associated to the param.
+      llvm::interleave(attrsAssociatedToParam, optionsStream, appendValueAttr,
+                       ",");
+    } else if (auto arrayAttr = dyn_cast<ArrayAttr>(valueAttr)) {
+      // Recursive so as to append all nested attrs of the array.
+      llvm::interleave(arrayAttr, optionsStream, appendValueAttr, ",");
+    } else if (auto strAttr = dyn_cast<StringAttr>(valueAttr)) {
+      // Convert to unquoted string.
       optionsStream << strAttr.getValue().str();
     } else {
-      valueAttrToAppend.print(optionsStream, /*elideType=*/true);
+      // For all other attributes, ask the attr to print itself (without type).
+      valueAttr.print(optionsStream, /*elideType=*/true);
     }
-  }
+  };
+
+  // Convert the options DictionaryAttr into a single string.
+  llvm::interleave(
+      getOptions(), optionsStream,
+      [&](auto namedAttribute) {
+        optionsStream << namedAttribute.getName().str(); // Append the key.
+        optionsStream << "="; // And the key-value separator.
+        appendValueAttr(namedAttribute.getValue()); // And the attr's str repr.
+      },
+      " ");
   optionsStream.flush();
 
   // Get pass or pass pipeline from registry.
@@ -878,23 +879,30 @@ static ParseResult parseApplyRegisteredPassOptions(
     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &dynamicOptions) {
   // Construct the options DictionaryAttr per a `{ key = value, ... }` syntax.
   SmallVector<NamedAttribute> keyValuePairs;
-
   size_t dynamicOptionsIdx = 0;
-  auto parseKeyValuePair = [&]() -> ParseResult {
-    // Parse items of the form `key = value` where `key` is a bare identifier or
-    // a string and `value` is either an attribute or an operand.
 
-    std::string key;
-    Attribute valueAttr;
-    if (parser.parseOptionalKeywordOrString(&key))
-      return parser.emitError(parser.getCurrentLocation())
-             << "expected key to either be an identifier or a string";
-    if (key.empty())
-      return failure();
+  // Helper for allowing parsing of option values which can be of the form:
+  // - a normal attribute
+  // - an operand (which would be converted to an attr referring to the operand)
+  // - ArrayAttrs containing the foregoing (in correspondence with ListOptions)
+  std::function<ParseResult(Attribute &)> parseValue =
+      [&](Attribute &valueAttr) -> ParseResult {
+    // Allow for array syntax, e.g. `[0 : i64, %param, true, %other_param]`:
+    if (succeeded(parser.parseOptionalLSquare())) {
+      SmallVector<Attribute> attrs;
 
-    if (parser.parseEqual())
-      return parser.emitError(parser.getCurrentLocation())
-             << "expected '=' after key in key-value pair";
+      // Recursively parse the array's elements, which might be operands.
+      if (parser.parseCommaSeparatedList(
+              AsmParser::Delimiter::None,
+              [&]() -> ParseResult { return parseValue(attrs.emplace_back()); },
+              " in options dictionary") ||
+          parser.parseRSquare())
+        return failure(); // NB: Attempted parse should've output error message.
+
+      valueAttr = ArrayAttr::get(parser.getContext(), attrs);
+
+      return success();
+    }
 
     // Parse the value, which can be either an attribute or an operand.
     OptionalParseResult parsedValueAttr =
@@ -903,9 +911,7 @@ static ParseResult parseApplyRegisteredPassOptions(
       OpAsmParser::UnresolvedOperand operand;
       ParseResult parsedOperand = parser.parseOperand(operand);
       if (failed(parsedOperand))
-        return parser.emitError(parser.getCurrentLocation())
-               << "expected a valid attribute or operand as value associated "
-               << "to key '" << key << "'";
+        return failure(); // NB: Attempted parse should've output error message.
       // To make use of the operand, we need to store it in the options dict.
       // As SSA-values cannot occur in attributes, what we do instead is store
       // an attribute in its place that contains the index of the param-operand,
@@ -924,7 +930,30 @@ static ParseResult parseApplyRegisteredPassOptions(
              << "in the generic print format";
     }
 
+    return success();
+  };
+
+  // Helper for `key = value`-pair parsing where `key` is a bare identifier or a
+  // string and `value` looks like either an attribute or an operand-in-an-attr.
+  std::function<ParseResult()> parseKeyValuePair = [&]() -> ParseResult {
+    std::string key;
+    Attribute valueAttr;
+
+    if (failed(parser.parseOptionalKeywordOrString(&key)) || key.empty())
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected key to either be an identifier or a string";
+
+    if (failed(parser.parseEqual()))
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected '=' after key in key-value pair";
+
+    if (failed(parseValue(valueAttr)))
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected a valid attribute or operand as value associated "
+             << "to key '" << key << "'";
+
     keyValuePairs.push_back(NamedAttribute(key, valueAttr));
+
     return success();
   };
 
@@ -951,16 +980,27 @@ static void printApplyRegisteredPassOptions(OpAsmPrinter &printer,
   if (options.empty())
     return;
 
+  std::function<void(Attribute)> printOptionValue = [&](Attribute valueAttr) {
+    if (auto paramOperandAttr =
+            dyn_cast<transform::ParamOperandAttr>(valueAttr)) {
+      // Resolve index of param-operand to its actual SSA-value and print that.
+      printer.printOperand(
+          dynamicOptions[paramOperandAttr.getIndex().getInt()]);
+    } else if (auto arrayAttr = dyn_cast<ArrayAttr>(valueAttr)) {
+      // This case is so that ArrayAttr-contained operands are pretty-printed.
+      printer << "[";
+      llvm::interleaveComma(arrayAttr, printer, printOptionValue);
+      printer << "]";
+    } else {
+      printer.printAttribute(valueAttr);
+    }
+  };
+
   printer << "{";
   llvm::interleaveComma(options, printer, [&](NamedAttribute namedAttribute) {
-    printer << namedAttribute.getName() << " = ";
-    Attribute value = namedAttribute.getValue();
-    if (auto indexAttr = dyn_cast<transform::ParamOperandAttr>(value)) {
-      // Resolve index of param-operand to its actual SSA-value and print that.
-      printer.printOperand(dynamicOptions[indexAttr.getIndex().getInt()]);
-    } else {
-      printer.printAttribute(value);
-    }
+    printer << namedAttribute.getName();
+    printer << " = ";
+    printOptionValue(namedAttribute.getValue());
   });
   printer << "}";
 }
@@ -970,9 +1010,11 @@ LogicalResult transform::ApplyRegisteredPassOp::verify() {
   // and references to dynamic options in the options dictionary.
 
   auto dynamicOptions = SmallVector<Value>(getDynamicOptions());
-  for (NamedAttribute namedAttr : getOptions())
-    if (auto paramOperand =
-            dyn_cast<transform::ParamOperandAttr>(namedAttr.getValue())) {
+
+  // Helper for option values to mark seen operands as having been seen (once).
+  std::function<LogicalResult(Attribute)> checkOptionValue =
+      [&](Attribute valueAttr) -> LogicalResult {
+    if (auto paramOperand = dyn_cast<transform::ParamOperandAttr>(valueAttr)) {
       size_t dynamicOptionIdx = paramOperand.getIndex().getInt();
       if (dynamicOptionIdx < 0 || dynamicOptionIdx >= dynamicOptions.size())
         return emitOpError()
@@ -983,8 +1025,20 @@ LogicalResult transform::ApplyRegisteredPassOp::verify() {
         return emitOpError() << "dynamic option index " << dynamicOptionIdx
                              << " is already used in options";
       dynamicOptions[dynamicOptionIdx] = nullptr; // Mark this option as used.
+    } else if (auto arrayAttr = dyn_cast<ArrayAttr>(valueAttr)) {
+      // Recurse into ArrayAttrs as they may contain references to operands.
+      for (auto eltAttr : arrayAttr)
+        if (failed(checkOptionValue(eltAttr)))
+          return failure();
     }
+    return success();
+  };
 
+  for (NamedAttribute namedAttr : getOptions())
+    if (failed(checkOptionValue(namedAttr.getValue())))
+      return failure();
+
+  // All dynamicOptions-params seen in the dict will have been set to null.
   for (Value dynamicOption : dynamicOptions)
     if (dynamicOption)
       return emitOpError() << "a param operand does not have a corresponding "
diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py
index bfe96b1b3e5d..b075919d1ef0 100644
--- a/mlir/python/mlir/dialects/transform/__init__.py
+++ b/mlir/python/mlir/dialects/transform/__init__.py
@@ -219,6 +219,11 @@ class YieldOp(YieldOp):
         super().__init__(_get_op_results_or_values(operands), loc=loc, ip=ip)
 
 
+OptionValueTypes = Union[
+    Sequence["OptionValueTypes"], Attribute, Value, Operation, OpView, str, int, bool
+]
+
+
 @_ods_cext.register_operation(_Dialect, replace=True)
 class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
     def __init__(
@@ -227,12 +232,7 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
         target: Union[Operation, Value, OpView],
         pass_name: Union[str, StringAttr],
         *,
-        options: Optional[
-            Dict[
-                Union[str, StringAttr],
-                Union[Attribute, Value, Operation, OpView, str, int, bool],
-            ]
-        ] = None,
+        options: Optional[Dict[Union[str, StringAttr], OptionValueTypes]] = None,
         loc=None,
         ip=None,
     ):
@@ -243,26 +243,32 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
         context = (loc and loc.context) or Context.current
 
         cur_param_operand_idx = 0
+
+        def option_value_to_attr(value):
+            nonlocal cur_param_operand_idx
+            if isinstance(value, (Value, Operation, OpView)):
+                dynamic_options.append(_get_op_result_or_value(value))
+                cur_param_operand_idx += 1
+                return ParamOperandAttr(cur_param_operand_idx - 1, context)
+            elif isinstance(value, Attribute):
+                return value
+            # The following cases auto-convert Python values to attributes.
+            elif isinstance(value, bool):
+                return BoolAttr.get(value)
+            elif isinstance(value, int):
+                default_int_type = IntegerType.get_signless(64, context)
+                return IntegerAttr.get(default_int_type, value)
+            elif isinstance(value, str):
+                return StringAttr.get(value)
+            elif isinstance(value, Sequence):
+                return ArrayAttr.get([option_value_to_attr(elt) for elt in value])
+            else:
+                raise TypeError(f"Unsupported option type: {type(value)}")
+
         for key, value in options.items() if options is not None else {}:
             if isinstance(key, StringAttr):
                 key = key.value
-
-            if isinstance(value, (Value, Operation, OpView)):
-                dynamic_options.append(_get_op_result_or_value(value))
-                options_dict[key] = ParamOperandAttr(cur_param_operand_idx, context)
-                cur_param_operand_idx += 1
-            elif isinstance(value, Attribute):
-                options_dict[key] = value
-            # The following cases auto-convert Python values to attributes.
-            elif isinstance(value, bool):
-                options_dict[key] = BoolAttr.get(value)
-            elif isinstance(value, int):
-                default_int_type = IntegerType.get_signless(64, context)
-                options_dict[key] = IntegerAttr.get(default_int_type, value)
-            elif isinstance(value, str):
-                options_dict[key] = StringAttr.get(value)
-            else:
-                raise TypeError(f"Unsupported option type: {type(value)}")
+            options_dict[key] = option_value_to_attr(value)
         super().__init__(
             result,
             _get_op_result_or_value(target),
@@ -279,12 +285,7 @@ def apply_registered_pass(
     target: Union[Operation, Value, OpView],
     pass_name: Union[str, StringAttr],
     *,
-    options: Optional[
-        Dict[
-            Union[str, StringAttr],
-            Union[Attribute, Value, Operation, OpView, str, int, bool],
-        ]
-    ] = None,
+    options: Optional[Dict[Union[str, StringAttr], OptionValueTypes]] = None,
     loc=None,
     ip=None,
 ) -> Value:
diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir
index 1d1be9eda349..ce8f69c58701 100644
--- a/mlir/test/Dialect/Transform/test-pass-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pass-application.mlir
@@ -164,6 +164,128 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// CHECK-LABEL: func private @valid_multiple_values_as_list_option_single_param()
+module {
+  func.func @valid_multiple_values_as_list_option_single_param() {
+    return
+  }
+
+  // CHECK: func @a()
+  func.func @a() {
+    return
+  }
+  // CHECK: func @b()
+  func.func @b() {
+    return
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op
+    %symbol_a = transform.param.constant "a" -> !transform.any_param
+    %symbol_b = transform.param.constant "b" -> !transform.any_param
+    %multiple_symbol_names = transform.merge_handles %symbol_a, %symbol_b : !transform.any_param
+    transform.apply_registered_pass "symbol-privatize"
+        with options = { exclude = %multiple_symbol_names } to %2
+        : (!transform.any_op, !transform.any_param) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func private @valid_array_attr_as_list_option()
+module {
+  func.func @valid_array_attr_as_list_option() {
+    return
+  }
+
+  // CHECK: func @a()
+  func.func @a() {
+    return
+  }
+  // CHECK: func @b()
+  func.func @b() {
+    return
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "symbol-privatize"
+        with options = { exclude = ["a", "b"] } to %2
+        : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func private @valid_array_attr_param_as_list_option()
+module {
+  func.func @valid_array_attr_param_as_list_option() {
+    return
+  }
+
+  // CHECK: func @a()
+  func.func @a() {
+    return
+  }
+  // CHECK: func @b()
+  func.func @b() {
+    return
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op
+    %multiple_symbol_names = transform.param.constant ["a","b"] -> !transform.any_param
+    transform.apply_registered_pass "symbol-privatize"
+        with options = { exclude = %multiple_symbol_names } to %2
+        : (!transform.any_op, !transform.any_param) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func private @valid_multiple_params_as_single_list_option()
+module {
+  func.func @valid_multiple_params_as_single_list_option() {
+    return
+  }
+
+  // CHECK: func @a()
+  func.func @a() {
+    return
+  }
+  // CHECK: func @b()
+  func.func @b() {
+    return
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op
+    %symbol_a = transform.param.constant "a" -> !transform.any_param
+    %symbol_b = transform.param.constant "b" -> !transform.any_param
+    transform.apply_registered_pass "symbol-privatize"
+        with options = { exclude = [%symbol_a, %symbol_b] } to %2
+        : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
 func.func @invalid_options_as_str() {
   return
 }
@@ -203,7 +325,8 @@ func.func @invalid_options_due_to_reserved_attr() {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    // expected-error @+2 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}}
+    // expected-error @+3 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}}
+    // expected-error @+2 {{expected a valid attribute or operand as value associated to key 'top-down'}}
     %2 = transform.apply_registered_pass "canonicalize"
         with options = { "top-down" = #transform.param_operand<index=0> } to %1 : (!transform.any_op) -> !transform.any_op
     transform.yield
@@ -262,26 +385,6 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @too_many_pass_option_params() {
-  return
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
-    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %x = transform.param.constant true -> !transform.any_param
-    %y = transform.param.constant false -> !transform.any_param
-    %topdown_options = transform.merge_handles %x, %y : !transform.any_param
-    // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}}
-    transform.apply_registered_pass "canonicalize"
-        with options = { "top-down" = %topdown_options } to %1
-        : (!transform.any_op, !transform.any_param) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
 module attributes {transform.with_named_sequence} {
   // expected-error @below {{trying to schedule a pass on an unsupported operation}}
   // expected-note @below {{target op}}
diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py
index eeb95605d7a9..6c5e4e5505b1 100644
--- a/mlir/test/python/dialects/transform.py
+++ b/mlir/test/python/dialects/transform.py
@@ -256,30 +256,45 @@ def testReplicateOp(module: Module):
     # CHECK: %{{.*}} = replicate num(%[[FIRST]]) %[[SECOND]]
 
 
+# CHECK-LABEL: TEST: testApplyRegisteredPassOp
 @run
 def testApplyRegisteredPassOp(module: Module):
+    # CHECK: transform.sequence
     sequence = transform.SequenceOp(
         transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get()
     )
     with InsertionPoint(sequence.body):
+        # CHECK:   %{{.*}} = apply_registered_pass "canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op
         mod = transform.ApplyRegisteredPassOp(
             transform.AnyOpType.get(), sequence.bodyTarget, "canonicalize"
         )
+        # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
+        # CHECK-SAME:    with options = {"top-down" = false}
+        # CHECK-SAME:    to {{.*}} : (!transform.any_op) -> !transform.any_op
         mod = transform.ApplyRegisteredPassOp(
             transform.AnyOpType.get(),
             mod.result,
             "canonicalize",
             options={"top-down": BoolAttr.get(False)},
         )
+        # CHECK:   %[[MAX_ITER:.+]] = transform.param.constant
         max_iter = transform.param_constant(
             transform.AnyParamType.get(),
             IntegerAttr.get(IntegerType.get_signless(64), 10),
         )
+        # CHECK:   %[[MAX_REWRITE:.+]] = transform.param.constant
         max_rewrites = transform.param_constant(
             transform.AnyParamType.get(),
             IntegerAttr.get(IntegerType.get_signless(64), 1),
         )
-        transform.apply_registered_pass(
+        # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
+        # NB: MLIR has sorted the dict lexicographically by key:
+        # CHECK-SAME:    with options = {"max-iterations" = %[[MAX_ITER]],
+        # CHECK-SAME:                    "max-rewrites" =  %[[MAX_REWRITE]],
+        # CHECK-SAME:                    "test-convergence" = true,
+        # CHECK-SAME:                    "top-down" = false}
+        # CHECK-SAME:    to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
+        mod = transform.apply_registered_pass(
             transform.AnyOpType.get(),
             mod,
             "canonicalize",
@@ -290,19 +305,30 @@ def testApplyRegisteredPassOp(module: Module):
                 "max-rewrites": max_rewrites,
             },
         )
+        # CHECK:   %{{.*}} = apply_registered_pass "symbol-privatize"
+        # CHECK-SAME:    with options = {"exclude" = ["a", "b"]}
+        # CHECK-SAME:    to %{{.*}} : (!transform.any_op) -> !transform.any_op
+        mod = transform.apply_registered_pass(
+            transform.AnyOpType.get(),
+            mod,
+            "symbol-privatize",
+            options={"exclude": ("a", "b")},
+        )
+        # CHECK:   %[[SYMBOL_A:.+]] = transform.param.constant
+        symbol_a = transform.param_constant(
+            transform.AnyParamType.get(), StringAttr.get("a")
+        )
+        # CHECK:   %[[SYMBOL_B:.+]] = transform.param.constant
+        symbol_b = transform.param_constant(
+            transform.AnyParamType.get(), StringAttr.get("b")
+        )
+        # CHECK:   %{{.*}} = apply_registered_pass "symbol-privatize"
+        # CHECK-SAME:    with options = {"exclude" = [%[[SYMBOL_A]], %[[SYMBOL_B]]]}
+        # CHECK-SAME:    to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
+        mod = transform.apply_registered_pass(
+            transform.AnyOpType.get(),
+            mod,
+            "symbol-privatize",
+            options={"exclude": (symbol_a, symbol_b)},
+        )
         transform.YieldOp()
-    # CHECK-LABEL: TEST: testApplyRegisteredPassOp
-    # CHECK: transform.sequence
-    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op
-    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
-    # CHECK-SAME:    with options = {"top-down" = false}
-    # CHECK-SAME:    to {{.*}} : (!transform.any_op) -> !transform.any_op
-    # CHECK:   %[[MAX_ITER:.+]] = transform.param.constant
-    # CHECK:   %[[MAX_REWRITE:.+]] = transform.param.constant
-    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
-    # NB: MLIR has sorted the dict lexicographically by key:
-    # CHECK-SAME:    with options = {"max-iterations" = %[[MAX_ITER]],
-    # CHECK-SAME:                    "max-rewrites" =  %[[MAX_REWRITE]],
-    # CHECK-SAME:                    "test-convergence" = true,
-    # CHECK-SAME:                    "top-down" = false}
-    # CHECK-SAME:    to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op

From ddea4fe85a01f645a1c5e2c4a8ea607a85cf986f Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 07:48:50 -0400
Subject: [PATCH 0486/1322] Fix some "not all control paths return" warnings;
 NFC

---
 clang-tools-extra/clang-doc/Representation.cpp | 1 +
 clang-tools-extra/clang-doc/Serialize.cpp      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp
index 820d644ef8b8..71a926f1c73e 100644
--- a/clang-tools-extra/clang-doc/Representation.cpp
+++ b/clang-tools-extra/clang-doc/Representation.cpp
@@ -147,6 +147,7 @@ mergeInfos(std::vector<std::unique_ptr<Info>> &Values) {
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "unexpected info type");
   }
+  llvm_unreachable("unhandled enumerator");
 }
 
 bool CommentInfo::operator==(const CommentInfo &Other) const {
diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp
index e8f1a9cee267..820e8bfd8e64 100644
--- a/clang-tools-extra/clang-doc/Serialize.cpp
+++ b/clang-tools-extra/clang-doc/Serialize.cpp
@@ -392,6 +392,7 @@ std::string serialize(std::unique_ptr<Info> &I) {
   case InfoType::IT_default:
     return "";
   }
+  llvm_unreachable("unhandled enumerator");
 }
 
 static void parseFullComment(const FullComment *C, CommentInfo &CI) {

From 4f7b5e6d8327f8cea41ba31fdbbb0ee9c1f754c3 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 07:49:20 -0400
Subject: [PATCH 0487/1322] Fix a tablegen pattern that results in a warning;
 NFC

We were generating `1 || 1` which caused some issues for -Werror builds
---
 clang/utils/TableGen/ClangAttrEmitter.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 42627f02cf35..f892626a447e 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -3739,7 +3739,8 @@ static void GenerateHasAttrSpellingStringSwitch(
                       : '(' + itostr(Version) + ')';
 
     if (Scope.empty() || Scope == Spelling.nameSpace()) {
-      if (TestStringMap.contains(Spelling.name()))
+      if (TestStringMap.contains(Spelling.name()) &&
+          TestStringMap[Spelling.name()] != TestStr)
         TestStringMap[Spelling.name()] += " || " + TestStr;
       else
         TestStringMap[Spelling.name()] = TestStr;

From 01f9dff61fb028f69493a44616014256dee5fb2a Mon Sep 17 00:00:00 2001
From: Kiran Chandramohan <kiran.chandramohan@arm.com>
Date: Mon, 16 Jun 2025 13:10:45 +0100
Subject: [PATCH 0488/1322] [Flang] Add llvm-profdata to list of tools to be
 built (#144325)

Fixes #144179
---
 flang/test/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt
index a658f6f984fa..8520bec64697 100644
--- a/flang/test/CMakeLists.txt
+++ b/flang/test/CMakeLists.txt
@@ -73,6 +73,7 @@ if (NOT FLANG_STANDALONE_BUILD)
     not
     llvm-dis
     llvm-objdump
+    llvm-profdata
     llvm-readobj
     split-file
   )

From 329dfa16564da74451d26b601cab2d8af0e5f4d3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 16 Jun 2025 14:37:34 +0100
Subject: [PATCH 0489/1322] [X86] fixup-blend.ll - add commuted load test
 coverage

---
 llvm/test/CodeGen/X86/fixup-blend.ll | 208 ++++++++++++++++++++++++---
 1 file changed, 187 insertions(+), 21 deletions(-)

diff --git a/llvm/test/CodeGen/X86/fixup-blend.ll b/llvm/test/CodeGen/X86/fixup-blend.ll
index 3126e4823bee..d64dd6d3114a 100644
--- a/llvm/test/CodeGen/X86/fixup-blend.ll
+++ b/llvm/test/CodeGen/X86/fixup-blend.ll
@@ -59,21 +59,45 @@ define <2 x double> @test_v2f64_blend_movsd_optsize(<2 x double> %a0, <2 x doubl
   ret <2 x double> %r
 }
 
-define <2 x double> @test_v2f64_blend_movsd_load(<2 x double> %a0, ptr %p1, <2 x double> %a2) {
+define <2 x double> @test_v2f64_blend_movsd_load(ptr %p0, <2 x double> %a1, <2 x double> %a2) {
 ; SSE2-LABEL: test_v2f64_blend_movsd_load:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE2-NEXT:    addpd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v2f64_blend_movsd_load:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE4-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE4-NEXT:    addpd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_blend_movsd_load:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a0 = load <2 x double>, ptr %p0
+  %s = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_v2f64_blend_movsd_load_commute(<2 x double> %a0, ptr %p1, <2 x double> %a2) {
+; SSE2-LABEL: test_v2f64_blend_movsd_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2f64_blend_movsd_load_commute:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE4-NEXT:    addpd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v2f64_blend_movsd_load_commute:
+; AVX:       # %bb.0:
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
@@ -178,27 +202,57 @@ define <2 x i64> @test_v2i64_blend_movsd_optsize(<2 x i64> %a0, <2 x i64> %a1, <
   ret <2 x i64> %r
 }
 
-define <2 x i64> @test_v2i64_blend_movsd_load(<2 x i64> %a0, ptr %p1, <2 x i64> %a2) {
+define <2 x i64> @test_v2i64_blend_movsd_load(ptr %p0, <2 x i64> %a1, <2 x i64> %a2) {
 ; SSE2-LABEL: test_v2i64_blend_movsd_load:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v2i64_blend_movsd_load:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
 ; SSE4-NEXT:    paddq %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v2i64_blend_movsd_load:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_v2i64_blend_movsd_load:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a0 = load <2 x i64>, ptr %p0
+  %s = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @test_v2i64_blend_movsd_load_commute(<2 x i64> %a0, ptr %p1, <2 x i64> %a2) {
+; SSE2-LABEL: test_v2i64_blend_movsd_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd_load_commute:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd_load_commute:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd_load_commute:
+; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
@@ -338,21 +392,47 @@ define <4 x float> @test_v4f32_blend_movsd_optsize(<4 x float> %a0, <4 x float>
   ret <4 x float> %r
 }
 
-define <4 x float> @test_v4f32_blend_movss_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+define <4 x float> @test_v4f32_blend_movss_load(ptr %p0, <4 x float> %a1, <4 x float> %a2) {
 ; SSE2-LABEL: test_v4f32_blend_movss_load:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps (%rdi), %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4f32_blend_movss_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; SSE4-NEXT:    addps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movss_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a0 = load <4 x float>, ptr %p0
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movss_load_commute(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movss_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps (%rdi), %xmm2
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
-; SSE4-LABEL: test_v4f32_blend_movss_load:
+; SSE4-LABEL: test_v4f32_blend_movss_load_commute:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; SSE4-NEXT:    addps %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f32_blend_movss_load:
+; AVX-LABEL: test_v4f32_blend_movss_load_commute:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
@@ -363,21 +443,45 @@ define <4 x float> @test_v4f32_blend_movss_load(<4 x float> %a0, ptr %p1, <4 x f
   ret <4 x float> %r
 }
 
-define <4 x float> @test_v4f32_blend_movsd_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+define <4 x float> @test_v4f32_blend_movsd_load(ptr %p0, <4 x float> %a1, <4 x float> %a2) {
 ; SSE2-LABEL: test_v4f32_blend_movsd_load:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v4f32_blend_movsd_load:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
 ; SSE4-NEXT:    addps %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f32_blend_movsd_load:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a0 = load <4 x float>, ptr %p0
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd_load_commute(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movsd_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4f32_blend_movsd_load_commute:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE4-NEXT:    addps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movsd_load_commute:
+; AVX:       # %bb.0:
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
@@ -580,27 +684,59 @@ define <4 x i32> @test_v4i32_blend_movsd_optsize(<4 x i32> %a0, <4 x i32> %a1, <
   ret <4 x i32> %r
 }
 
-define <4 x i32> @test_v4i32_blend_movss_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+define <4 x i32> @test_v4i32_blend_movss_load(ptr %p0, <4 x i32> %a1, <4 x i32> %a2) {
 ; SSE2-LABEL: test_v4i32_blend_movss_load:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps (%rdi), %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss_load:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a0 = load <4 x i32>, ptr %p0
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movss_load_commute(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movss_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps (%rdi), %xmm2
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
-; SSE4-LABEL: test_v4i32_blend_movss_load:
+; SSE4-LABEL: test_v4i32_blend_movss_load_commute:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
 ; SSE4-NEXT:    paddd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
-; AVX1-LABEL: test_v4i32_blend_movss_load:
+; AVX1-LABEL: test_v4i32_blend_movss_load_commute:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v4i32_blend_movss_load:
+; AVX2-LABEL: test_v4i32_blend_movss_load_commute:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -611,27 +747,57 @@ define <4 x i32> @test_v4i32_blend_movss_load(<4 x i32> %a0, ptr %p1, <4 x i32>
   ret <4 x i32> %r
 }
 
-define <4 x i32> @test_v4i32_blend_movsd_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+define <4 x i32> @test_v4i32_blend_movsd_load(ptr %p0, <4 x i32> %a1, <4 x i32> %a2) {
 ; SSE2-LABEL: test_v4i32_blend_movsd_load:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v4i32_blend_movsd_load:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
 ; SSE4-NEXT:    paddd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v4i32_blend_movsd_load:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_v4i32_blend_movsd_load:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a0 = load <4 x i32>, ptr %p0
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd_load_commute(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movsd_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd_load_commute:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd_load_commute:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd_load_commute:
+; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq

From d57b86701a7b5bf7d98fea032f33e726b2abb424 Mon Sep 17 00:00:00 2001
From: Douglas Yung <douglas.yung@sony.com>
Date: Mon, 16 Jun 2025 14:14:56 +0000
Subject: [PATCH 0490/1322] Revert "AArch64: Move AArch64MCExpr functions to
 AArch64MCAsmInfo"

This reverts commit 4ea616d072d126a31149174ca2efdbdace9ce568.

This change is causing buildbot failures on MacOS:
 - https://lab.llvm.org/buildbot/#/builders/190/builds/21510
 - http://45.33.8.238/macm1/108620/step_10.txt
---
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 138 ------------------
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.h   |  23 ---
 .../AArch64/MCTargetDesc/AArch64MCExpr.cpp    |  97 +++++++++++-
 .../AArch64/MCTargetDesc/AArch64MCExpr.h      |  14 +-
 .../AArch64WinCOFFObjectWriter.cpp            |   6 +-
 5 files changed, 112 insertions(+), 166 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index a82896dbe0d6..31965d85d9eb 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -15,7 +15,6 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/TargetParser/Triple.h"
 using namespace llvm;
@@ -54,80 +53,6 @@ const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
     {AArch64MCExpr::M_TLVPPAGEOFF, "TLVPPAGEOFF"},
 };
 
-StringRef AArch64::getSpecifierName(const MCSpecifierExpr &Expr) {
-  // clang-format off
-  switch (static_cast<uint32_t>(Expr.getSpecifier())) {
-  case AArch64MCExpr::VK_CALL:                return "";
-  case AArch64MCExpr::VK_LO12:                return ":lo12:";
-  case AArch64MCExpr::VK_ABS_G3:              return ":abs_g3:";
-  case AArch64MCExpr::VK_ABS_G2:              return ":abs_g2:";
-  case AArch64MCExpr::VK_ABS_G2_S:            return ":abs_g2_s:";
-  case AArch64MCExpr::VK_ABS_G2_NC:           return ":abs_g2_nc:";
-  case AArch64MCExpr::VK_ABS_G1:              return ":abs_g1:";
-  case AArch64MCExpr::VK_ABS_G1_S:            return ":abs_g1_s:";
-  case AArch64MCExpr::VK_ABS_G1_NC:           return ":abs_g1_nc:";
-  case AArch64MCExpr::VK_ABS_G0:              return ":abs_g0:";
-  case AArch64MCExpr::VK_ABS_G0_S:            return ":abs_g0_s:";
-  case AArch64MCExpr::VK_ABS_G0_NC:           return ":abs_g0_nc:";
-  case AArch64MCExpr::VK_PREL_G3:             return ":prel_g3:";
-  case AArch64MCExpr::VK_PREL_G2:             return ":prel_g2:";
-  case AArch64MCExpr::VK_PREL_G2_NC:          return ":prel_g2_nc:";
-  case AArch64MCExpr::VK_PREL_G1:             return ":prel_g1:";
-  case AArch64MCExpr::VK_PREL_G1_NC:          return ":prel_g1_nc:";
-  case AArch64MCExpr::VK_PREL_G0:             return ":prel_g0:";
-  case AArch64MCExpr::VK_PREL_G0_NC:          return ":prel_g0_nc:";
-  case AArch64MCExpr::VK_DTPREL_G2:           return ":dtprel_g2:";
-  case AArch64MCExpr::VK_DTPREL_G1:           return ":dtprel_g1:";
-  case AArch64MCExpr::VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
-  case AArch64MCExpr::VK_DTPREL_G0:           return ":dtprel_g0:";
-  case AArch64MCExpr::VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
-  case AArch64MCExpr::VK_DTPREL_HI12:         return ":dtprel_hi12:";
-  case AArch64MCExpr::VK_DTPREL_LO12:         return ":dtprel_lo12:";
-  case AArch64MCExpr::VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
-  case AArch64MCExpr::VK_TPREL_G2:            return ":tprel_g2:";
-  case AArch64MCExpr::VK_TPREL_G1:            return ":tprel_g1:";
-  case AArch64MCExpr::VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
-  case AArch64MCExpr::VK_TPREL_G0:            return ":tprel_g0:";
-  case AArch64MCExpr::VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
-  case AArch64MCExpr::VK_TPREL_HI12:          return ":tprel_hi12:";
-  case AArch64MCExpr::VK_TPREL_LO12:          return ":tprel_lo12:";
-  case AArch64MCExpr::VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
-  case AArch64MCExpr::VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
-  case AArch64MCExpr::VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
-  case AArch64MCExpr::VK_ABS_PAGE:            return "";
-  case AArch64MCExpr::VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
-  case AArch64MCExpr::VK_GOT:                 return ":got:";
-  case AArch64MCExpr::VK_GOT_PAGE:            return ":got:";
-  case AArch64MCExpr::VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
-  case AArch64MCExpr::VK_GOT_LO12:            return ":got_lo12:";
-  case AArch64MCExpr::VK_GOTTPREL:            return ":gottprel:";
-  case AArch64MCExpr::VK_GOTTPREL_PAGE:       return ":gottprel:";
-  case AArch64MCExpr::VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
-  case AArch64MCExpr::VK_GOTTPREL_G1:         return ":gottprel_g1:";
-  case AArch64MCExpr::VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
-  case AArch64MCExpr::VK_TLSDESC:             return "";
-  case AArch64MCExpr::VK_TLSDESC_PAGE:        return ":tlsdesc:";
-  case AArch64MCExpr::VK_TLSDESC_AUTH:        return "";
-  case AArch64MCExpr::VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
-  case AArch64MCExpr::VK_SECREL_LO12:         return ":secrel_lo12:";
-  case AArch64MCExpr::VK_SECREL_HI12:         return ":secrel_hi12:";
-  case AArch64MCExpr::VK_GOT_AUTH:            return ":got_auth:";
-  case AArch64MCExpr::VK_GOT_AUTH_PAGE:       return ":got_auth:";
-  case AArch64MCExpr::VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
-  default:
-    llvm_unreachable("Invalid relocation specifier");
-  }
-  // clang-format on
-}
-
-static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
-                     const MCAssembler *Asm) {
-  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(Expr.getSpecifier());
-  return true;
-}
-
 AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
   // We prefer NEON instructions to be printed in the short, Apple-specific
   // form when targeting Darwin.
@@ -166,34 +91,6 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
   return MCBinaryExpr::createSub(Res, PC, Context);
 }
 
-void AArch64AuthMCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
-  if (WrapSubExprInParens)
-    OS << '(';
-  getSubExpr()->print(OS, MAI);
-  if (WrapSubExprInParens)
-    OS << ')';
-
-  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
-  if (hasAddressDiversity())
-    OS << ",addr";
-  OS << ')';
-}
-
-void AArch64MCAsmInfoDarwin::printSpecifierExpr(
-    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
-    return AE->print(OS, this);
-  // FIXME: tryParseAdrLabel should not use VK_ABS for Mach-O
-  assert(Expr.getSpecifier() == AArch64MCExpr::VK_ABS);
-  printExpr(OS, *Expr.getSubExpr());
-}
-
-bool AArch64MCAsmInfoDarwin::evaluateAsRelocatableImpl(
-    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
-  return evaluate(Expr, Res, Asm);
-}
-
 AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   if (T.getArch() == Triple::aarch64_be)
     IsLittleEndian = false;
@@ -230,19 +127,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   initializeVariantKinds(ELFAtSpecifiers);
 }
 
-void AArch64MCAsmInfoELF::printSpecifierExpr(
-    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
-    return AE->print(OS, this);
-  OS << AArch64::getSpecifierName(Expr);
-  printExpr(OS, *Expr.getSubExpr());
-}
-
-bool AArch64MCAsmInfoELF::evaluateAsRelocatableImpl(
-    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
-  return evaluate(Expr, Res, Asm);
-}
-
 AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -262,17 +146,6 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   initializeVariantKinds(COFFAtSpecifiers);
 }
 
-void AArch64MCAsmInfoMicrosoftCOFF::printSpecifierExpr(
-    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  OS << AArch64::getSpecifierName(Expr);
-  printExpr(OS, *Expr.getSubExpr());
-}
-
-bool AArch64MCAsmInfoMicrosoftCOFF::evaluateAsRelocatableImpl(
-    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
-  return evaluate(Expr, Res, Asm);
-}
-
 AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -291,14 +164,3 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
 
   initializeVariantKinds(COFFAtSpecifiers);
 }
-
-void AArch64MCAsmInfoGNUCOFF::printSpecifierExpr(
-    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  OS << AArch64::getSpecifierName(Expr);
-  printExpr(OS, *Expr.getSubExpr());
-}
-
-bool AArch64MCAsmInfoGNUCOFF::evaluateAsRelocatableImpl(
-    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
-  return evaluate(Expr, Res, Asm);
-}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index bc02586d7388..225e0c8e55fc 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 
-#include "MCTargetDesc/AArch64MCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
@@ -27,42 +26,20 @@ struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
   const MCExpr *
   getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
                               MCStreamer &Streamer) const override;
-  void printSpecifierExpr(raw_ostream &OS,
-                          const MCSpecifierExpr &Expr) const override;
-  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
   explicit AArch64MCAsmInfoELF(const Triple &T);
-  void printSpecifierExpr(raw_ostream &OS,
-                          const MCSpecifierExpr &Expr) const override;
-  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoMicrosoftCOFF : public MCAsmInfoMicrosoft {
   explicit AArch64MCAsmInfoMicrosoftCOFF();
-  void printSpecifierExpr(raw_ostream &OS,
-                          const MCSpecifierExpr &Expr) const override;
-  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
   explicit AArch64MCAsmInfoGNUCOFF();
-  void printSpecifierExpr(raw_ostream &OS,
-                          const MCSpecifierExpr &Expr) const override;
-  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
-namespace AArch64 {
-/// Return the string representation of the ELF relocation specifier
-/// (e.g. ":got:", ":lo12:").
-StringRef getSpecifierName(const MCSpecifierExpr &Expr);
-} // namespace AArch64
-
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 7a7c6f7effd9..d934af91b9ff 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -12,19 +12,100 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MCExpr.h"
-#include "AArch64MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64symbolrefexpr"
+
 const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, Specifier S,
                                            MCContext &Ctx) {
   return new (Ctx) AArch64MCExpr(Expr, S);
 }
 
+StringRef AArch64MCExpr::getSpecifierName() const {
+  // clang-format off
+  switch (static_cast<uint32_t>(getSpecifier())) {
+  case VK_CALL:                return "";
+  case VK_LO12:                return ":lo12:";
+  case VK_ABS_G3:              return ":abs_g3:";
+  case VK_ABS_G2:              return ":abs_g2:";
+  case VK_ABS_G2_S:            return ":abs_g2_s:";
+  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case VK_ABS_G1:              return ":abs_g1:";
+  case VK_ABS_G1_S:            return ":abs_g1_s:";
+  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case VK_ABS_G0:              return ":abs_g0:";
+  case VK_ABS_G0_S:            return ":abs_g0_s:";
+  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case VK_PREL_G3:             return ":prel_g3:";
+  case VK_PREL_G2:             return ":prel_g2:";
+  case VK_PREL_G2_NC:          return ":prel_g2_nc:";
+  case VK_PREL_G1:             return ":prel_g1:";
+  case VK_PREL_G1_NC:          return ":prel_g1_nc:";
+  case VK_PREL_G0:             return ":prel_g0:";
+  case VK_PREL_G0_NC:          return ":prel_g0_nc:";
+  case VK_DTPREL_G2:           return ":dtprel_g2:";
+  case VK_DTPREL_G1:           return ":dtprel_g1:";
+  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case VK_DTPREL_G0:           return ":dtprel_g0:";
+  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case VK_DTPREL_HI12:         return ":dtprel_hi12:";
+  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case VK_TPREL_G2:            return ":tprel_g2:";
+  case VK_TPREL_G1:            return ":tprel_g1:";
+  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case VK_TPREL_G0:            return ":tprel_g0:";
+  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case VK_TPREL_HI12:          return ":tprel_hi12:";
+  case VK_TPREL_LO12:          return ":tprel_lo12:";
+  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
+  case VK_ABS_PAGE:            return "";
+  case VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
+  case VK_GOT:                 return ":got:";
+  case VK_GOT_PAGE:            return ":got:";
+  case VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
+  case VK_GOT_LO12:            return ":got_lo12:";
+  case VK_GOTTPREL:            return ":gottprel:";
+  case VK_GOTTPREL_PAGE:       return ":gottprel:";
+  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case VK_TLSDESC:             return "";
+  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  case VK_TLSDESC_AUTH:        return "";
+  case VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
+  case VK_SECREL_LO12:         return ":secrel_lo12:";
+  case VK_SECREL_HI12:         return ":secrel_hi12:";
+  case VK_GOT_AUTH:            return ":got_auth:";
+  case VK_GOT_AUTH_PAGE:       return ":got_auth:";
+  case VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
+  default:
+    llvm_unreachable("Invalid relocation specifier");
+  }
+  // clang-format on
+}
+
+void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  OS << getSpecifierName();
+  Expr->print(OS, MAI);
+}
+
+bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                              const MCAssembler *Asm) const {
+  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(getSpecifier());
+  return true;
+}
+
 const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
                                                    uint16_t Discriminator,
                                                    AArch64PACKey::ID Key,
@@ -33,3 +114,17 @@ const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
   return new (Ctx)
       AArch64AuthMCExpr(Expr, Discriminator, Key, HasAddressDiversity);
 }
+
+void AArch64AuthMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
+  if (WrapSubExprInParens)
+    OS << '(';
+  getSubExpr()->print(OS, MAI);
+  if (WrapSubExprInParens)
+    OS << ')';
+
+  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
+  if (hasAddressDiversity())
+    OS << ",addr";
+  OS << ')';
+}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 541f24c943a1..9c383894c7f5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -147,6 +147,8 @@ protected:
 public:
   static const AArch64MCExpr *create(const MCExpr *Expr, Specifier,
                                      MCContext &Ctx);
+  /// @name VariantKind information extractors.
+  /// @{
 
   static Specifier getSymbolLoc(Specifier S) {
     return static_cast<Specifier>(S & VK_SymLocBits);
@@ -157,6 +159,16 @@ public:
   }
 
   static bool isNotChecked(Specifier S) { return S & VK_NC; }
+
+  /// @}
+
+  /// Return the string representation of the ELF relocation specifier
+  /// (e.g. ":got:", ":lo12:").
+  StringRef getSpecifierName() const;
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 class AArch64AuthMCExpr final : public AArch64MCExpr {
@@ -177,7 +189,7 @@ public:
   uint16_t getDiscriminator() const { return Discriminator; }
   bool hasAddressDiversity() const { return getSpecifier() == VK_AUTHADDR; }
 
-  void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
 
   static bool classof(const MCExpr *E) {
     return isa<AArch64MCExpr>(E) && classof(cast<AArch64MCExpr>(E));
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 2e997631655e..3009bd2ca275 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -8,7 +8,7 @@
 
 #include "AArch64MCTargetDesc.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCAsmInfo.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/MC/MCAsmBackend.h"
@@ -73,7 +73,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
       break;
     default:
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          AArch64::getSpecifierName(*A64E) +
+                                          A64E->getSpecifierName() +
                                           " unsupported on COFF targets");
       return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value
     }
@@ -83,7 +83,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
   default: {
     if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          AArch64::getSpecifierName(*A64E) +
+                                          A64E->getSpecifierName() +
                                           " unsupported on COFF targets");
     } else {
       MCFixupKindInfo Info = MAB.getFixupKindInfo(Fixup.getKind());

From a54712c8ec25a94ab55a4783bfd9d5467d2ec968 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn@arm.com>
Date: Mon, 16 Jun 2025 15:23:40 +0100
Subject: [PATCH 0491/1322] [LSR] Make canHoistIVInc allow non-integer types
 (#143707)

canHoistIVInc was made to only allow integer types to avoid a crash in
isIndexedLoadLegal/isIndexedStoreLegal due to them failing an assertion
in getValueType (or rather in MVT::getVT which gets called from that)
when passed a struct type. Adjusting these functions to pass
AllowUnknown=true to getValueType means we don't get an assertion
failure (MVT::Other is returned which TLI->isIndexedLoadLegal should
then return false for), meaning we can remove this check for integer
type.
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   4 +-
 .../Transforms/Scalar/LoopStrengthReduce.cpp  |   5 +-
 .../AArch64/postidx-load.ll                   | 189 ++++++++++++++++++
 3 files changed, 193 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f1..3b87978fe3fa 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -478,12 +478,12 @@ public:
   }
 
   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty) const override {
-    EVT VT = getTLI()->getValueType(DL, Ty);
+    EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
     return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
   }
 
   bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty) const override {
-    EVT VT = getTLI()->getValueType(DL, Ty);
+    EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
     return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
   }
 
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 242e571c072a..e4f35e4b2108 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -6008,9 +6008,8 @@ static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
 
   Instruction *I = Fixup.UserInst;
   Type *Ty = I->getType();
-  return Ty->isIntegerTy() &&
-         ((isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
-          (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)));
+  return (isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
+         (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty));
 }
 
 /// Rewrite all the fixup locations with new values, following the chosen
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
new file mode 100644
index 000000000000..5976658ccdf8
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-none-elf | FileCheck %s
+
+; Check that the load in the loop has postindex addressing, regardless of the
+; type or whether the input uses postindex or offset addressing.
+
+define i32 @i32_initially_postidx(ptr %p, i64 %n) {
+; CHECK-LABEL: i32_initially_postidx:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp x1, #1
+; CHECK-NEXT:    b.lt .LBB0_5
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:  .LBB0_2: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr w9, [x0], #4
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    b.lo .LBB0_5
+; CHECK-NEXT:  // %bb.3: // %for.inc
+; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    subs x1, x1, #1
+; CHECK-NEXT:    b.ne .LBB0_2
+; CHECK-NEXT:  // %bb.4: // %cleanup
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_5:
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  br i1 %cmp1, label %for.body, label %cleanup
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %accum = phi i32 [ %add, %for.inc ], [ 0, %entry ]
+  %ptr = phi ptr [ %ptr.next, %for.inc ], [ %p, %entry ]
+  %val = load i32, ptr %ptr, align 4
+  %ptr.next = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %add = add i32 %accum, %val
+  %cmp2 = icmp ult i32 %add, 0
+  br i1 %cmp2, label %cleanup, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %cleanup, label %for.body
+
+cleanup:
+  %ret = phi i32 [ 0, %entry ], [ 0, %for.body ], [ %add, %for.inc ]
+  ret i32 %ret
+}
+
+define i32 @i32_initially_offset(ptr %p, i64 %n) {
+; CHECK-LABEL: i32_initially_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp x1, #1
+; CHECK-NEXT:    b.lt .LBB1_5
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:  .LBB1_2: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr w9, [x0], #4
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    b.lo .LBB1_5
+; CHECK-NEXT:  // %bb.3: // %for.cond
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    subs x1, x1, #1
+; CHECK-NEXT:    b.ne .LBB1_2
+; CHECK-NEXT:  // %bb.4: // %cleanup
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_5:
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  br i1 %cmp1, label %for.body, label %cleanup
+
+for.cond:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %cleanup, label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.cond ], [ 0, %entry ]
+  %accum = phi i32 [ %add, %for.cond ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+  %val = load i32, ptr %arrayidx, align 4
+  %add = add i32 %accum, %val
+  %cmp2 = icmp ult i32 %add, 0
+  br i1 %cmp2, label %cleanup, label %for.cond
+
+cleanup:
+  %ret = phi i32 [ 0, %entry ], [ 0, %for.body ], [ %add, %for.cond ]
+  ret i32 %ret
+}
+
+define float @float_initially_postidx(ptr %p, i64 %n) {
+; CHECK-LABEL: float_initially_postidx:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    cmp x1, #1
+; CHECK-NEXT:    b.lt .LBB2_3
+; CHECK-NEXT:  .LBB2_1: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr s1, [x0], #4
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    fcmp s0, #0.0
+; CHECK-NEXT:    b.mi .LBB2_4
+; CHECK-NEXT:  // %bb.2: // %for.inc
+; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    subs x1, x1, #1
+; CHECK-NEXT:    b.ne .LBB2_1
+; CHECK-NEXT:  .LBB2_3: // %cleanup
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_4:
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    ret
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  br i1 %cmp1, label %for.body, label %cleanup
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %accum = phi float [ %add, %for.inc ], [ 0.000000e+00, %entry ]
+  %ptr = phi ptr [ %ptr.next, %for.inc ], [ %p, %entry ]
+  %val = load float, ptr %ptr, align 4
+  %ptr.next = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %add = fadd float %accum, %val
+  %cmp2 = fcmp olt float %add, 0.000000e+00
+  br i1 %cmp2, label %cleanup, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %cleanup, label %for.body
+
+cleanup:
+  %ret = phi float [ 0.000000e+00, %entry ], [ 0.000000e+00, %for.body ], [ %add, %for.inc ]
+  ret float %ret
+}
+
+define float @float_initially_offset(ptr %p, i64 %n) {
+; CHECK-LABEL: float_initially_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    cmp x1, #1
+; CHECK-NEXT:    b.lt .LBB3_3
+; CHECK-NEXT:  .LBB3_1: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr s1, [x0], #4
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    fcmp s0, #0.0
+; CHECK-NEXT:    b.mi .LBB3_4
+; CHECK-NEXT:  // %bb.2: // %for.cond
+; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    subs x1, x1, #1
+; CHECK-NEXT:    b.ne .LBB3_1
+; CHECK-NEXT:  .LBB3_3: // %cleanup
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB3_4:
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    ret
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  br i1 %cmp1, label %for.body, label %cleanup
+
+for.cond:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %cleanup, label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.cond ], [ 0, %entry ]
+  %accum = phi float [ %add, %for.cond ], [ 0.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds nuw float, ptr %p, i64 %iv
+  %val = load float, ptr %arrayidx, align 4
+  %add = fadd float %accum, %val
+  %cmp2 = fcmp olt float %add, 0.000000e+00
+  br i1 %cmp2, label %cleanup, label %for.cond
+
+cleanup:
+  %ret = phi float [ 0.000000e+00, %entry ], [ 0.000000e+00, %for.body ], [ %add, %for.cond ]
+  ret float %ret
+}

From 39ad3151e073e9f721d1e2e2849fb4bdc9443ae3 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Mon, 16 Jun 2025 15:26:47 +0100
Subject: [PATCH 0492/1322] [TableGen] Use default member initializers. NFC.
 (#144349)

Automated with clang-tidy -fix -checks=-*,modernize-use-default-member-init
---
 llvm/utils/TableGen/AsmMatcherEmitter.cpp      |  9 ++++-----
 .../TableGen/Common/CodeGenDAGPatterns.cpp     | 16 +++++++---------
 .../utils/TableGen/Common/CodeGenRegisters.cpp |  4 ++--
 llvm/utils/TableGen/Common/CodeGenRegisters.h  | 10 ++++------
 llvm/utils/TableGen/Common/CodeGenSchedule.h   | 14 +++++++-------
 .../Common/GlobalISel/GlobalISelMatchTable.h   | 18 ++++++++----------
 llvm/utils/TableGen/Common/PredicateExpander.h |  9 ++++-----
 llvm/utils/TableGen/DAGISelMatcherGen.cpp      |  9 ++++-----
 llvm/utils/TableGen/DecoderEmitter.cpp         |  5 ++---
 llvm/utils/TableGen/FastISelEmitter.cpp        |  4 ++--
 10 files changed, 44 insertions(+), 54 deletions(-)

diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 32098e96ce72..b6d9c9f3a158 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -388,7 +388,7 @@ struct MatchableInfo {
     StringRef Token;
 
     /// The unique class instance this operand should match.
-    ClassInfo *Class;
+    ClassInfo *Class = nullptr;
 
     /// The operand name this is, if anything.
     StringRef SrcOpName;
@@ -397,18 +397,17 @@ struct MatchableInfo {
     StringRef OrigSrcOpName;
 
     /// The suboperand index within SrcOpName, or -1 for the entire operand.
-    int SubOpIdx;
+    int SubOpIdx = -1;
 
     /// Whether the token is "isolated", i.e., it is preceded and followed
     /// by separators.
     bool IsIsolatedToken;
 
     /// Register record if this token is singleton register.
-    const Record *SingletonReg;
+    const Record *SingletonReg = nullptr;
 
     explicit AsmOperand(bool IsIsolatedToken, StringRef T)
-        : Token(T), Class(nullptr), SubOpIdx(-1),
-          IsIsolatedToken(IsIsolatedToken), SingletonReg(nullptr) {}
+        : Token(T), IsIsolatedToken(IsIsolatedToken) {}
   };
 
   /// ResOperand - This represents a single operand in the result instruction
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index 810b35e65b31..3a4ca1b45156 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -3604,16 +3604,14 @@ class InstAnalyzer {
   const CodeGenDAGPatterns &CDP;
 
 public:
-  bool hasSideEffects;
-  bool mayStore;
-  bool mayLoad;
-  bool isBitcast;
-  bool isVariadic;
-  bool hasChain;
+  bool hasSideEffects = false;
+  bool mayStore = false;
+  bool mayLoad = false;
+  bool isBitcast = false;
+  bool isVariadic = false;
+  bool hasChain = false;
 
-  InstAnalyzer(const CodeGenDAGPatterns &cdp)
-      : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false),
-        isBitcast(false), isVariadic(false), hasChain(false) {}
+  InstAnalyzer(const CodeGenDAGPatterns &cdp) : CDP(cdp) {}
 
   void Analyze(const PatternToMatch &Pat) {
     const TreePatternNode &N = Pat.getSrcPattern();
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index f52c21e97f9c..57a243158692 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -164,8 +164,8 @@ CodeGenRegister::CodeGenRegister(const Record *R, unsigned Enum)
     : TheDef(R), EnumValue(Enum),
       CostPerUse(R->getValueAsListOfInts("CostPerUse")),
       CoveredBySubRegs(R->getValueAsBit("CoveredBySubRegs")),
-      HasDisjunctSubRegs(false), Constant(R->getValueAsBit("isConstant")),
-      SubRegsComplete(false), SuperRegsComplete(false), TopoSig(~0u) {
+      Constant(R->getValueAsBit("isConstant")), SubRegsComplete(false),
+      SuperRegsComplete(false), TopoSig(~0u) {
   Artificial = R->getValueAsBit("isArtificial");
 }
 
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h
index 3f4c157fab69..bbcd44ce2cc5 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.h
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h
@@ -564,7 +564,7 @@ struct RegUnit {
   // Weight assigned to this RegUnit for estimating register pressure.
   // This is useful when equalizing weights in register classes with mixed
   // register topologies.
-  unsigned Weight;
+  unsigned Weight = 0;
 
   // Each native RegUnit corresponds to one or two root registers. The full
   // set of registers containing this unit can be computed as the union of
@@ -573,14 +573,12 @@ struct RegUnit {
 
   // Index into RegClassUnitSets where we can find the list of UnitSets that
   // contain this unit.
-  unsigned RegClassUnitSetsIdx;
+  unsigned RegClassUnitSetsIdx = 0;
   // A register unit is artificial if at least one of its roots is
   // artificial.
-  bool Artificial;
+  bool Artificial = false;
 
-  RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) {
-    Roots[0] = Roots[1] = nullptr;
-  }
+  RegUnit() { Roots[0] = Roots[1] = nullptr; }
 
   ArrayRef<const CodeGenRegister *> getRoots() const {
     assert(!(Roots[1] && !Roots[0]) && "Invalid roots array");
diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.h b/llvm/utils/TableGen/Common/CodeGenSchedule.h
index 697a1ce8f75a..1d5e953cf70c 100644
--- a/llvm/utils/TableGen/Common/CodeGenSchedule.h
+++ b/llvm/utils/TableGen/Common/CodeGenSchedule.h
@@ -193,7 +193,7 @@ struct CodeGenRegisterFile {
   unsigned MaxMovesEliminatedPerCycle;
   bool AllowZeroMoveEliminationOnly;
 
-  unsigned NumPhysRegs;
+  unsigned NumPhysRegs = 0;
   std::vector<CodeGenRegisterCost> Costs;
 
   CodeGenRegisterFile(StringRef name, const Record *def,
@@ -201,7 +201,7 @@ struct CodeGenRegisterFile {
                       bool AllowZeroMoveElimOnly = false)
       : Name(name), RegisterFileDef(def),
         MaxMovesEliminatedPerCycle(MaxMoveElimPerCy),
-        AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), NumPhysRegs(0) {}
+        AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly) {}
 
   bool hasDefaultCosts() const { return Costs.empty(); }
 };
@@ -261,16 +261,16 @@ struct CodeGenProcModel {
   std::vector<CodeGenRegisterFile> RegisterFiles;
 
   // Optional Retire Control Unit definition.
-  const Record *RetireControlUnit;
+  const Record *RetireControlUnit = nullptr;
 
   // Load/Store queue descriptors.
-  const Record *LoadQueue;
-  const Record *StoreQueue;
+  const Record *LoadQueue = nullptr;
+  const Record *StoreQueue = nullptr;
 
   CodeGenProcModel(unsigned Idx, std::string Name, const Record *MDef,
                    const Record *IDef)
-      : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
-        RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {}
+      : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef) {
+  }
 
   bool hasItineraries() const {
     return !ItinsDef->getValueAsListOfDefs("IID").empty();
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index 66472576eea8..620f88db6610 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -501,13 +501,13 @@ protected:
 
   /// ID for the next instruction variable defined with
   /// implicitlyDefineInsnVar()
-  unsigned NextInsnVarID;
+  unsigned NextInsnVarID = 0;
 
   /// ID for the next output instruction allocated with allocateOutputInsnID()
-  unsigned NextOutputInsnID;
+  unsigned NextOutputInsnID = 0;
 
   /// ID for the next temporary register ID allocated with allocateTempRegID()
-  unsigned NextTempRegID;
+  unsigned NextTempRegID = 0;
 
   /// ID for the next recorded type. Starts at -1 and counts down.
   TempTypeIdx NextTempTypeIdx = -1;
@@ -545,9 +545,7 @@ protected:
                              StringRef FlagName, GISelFlags FlagBit);
 
 public:
-  RuleMatcher(ArrayRef<SMLoc> SrcLoc)
-      : NextInsnVarID(0), NextOutputInsnID(0), NextTempRegID(0), SrcLoc(SrcLoc),
-        RuleID(NextRuleID++) {}
+  RuleMatcher(ArrayRef<SMLoc> SrcLoc) : SrcLoc(SrcLoc), RuleID(NextRuleID++) {}
   RuleMatcher(RuleMatcher &&Other) = default;
   RuleMatcher &operator=(RuleMatcher &&Other) = default;
 
@@ -2039,12 +2037,12 @@ protected:
   unsigned NewInsnID;
   /// The name of the operand.
   const std::string SymbolicName;
-  bool Signed;
+  bool Signed = true;
 
 public:
   CopyConstantAsImmRenderer(unsigned NewInsnID, StringRef SymbolicName)
       : OperandRenderer(OR_CopyConstantAsImm), NewInsnID(NewInsnID),
-        SymbolicName(SymbolicName), Signed(true) {}
+        SymbolicName(SymbolicName) {}
 
   static bool classof(const OperandRenderer *R) {
     return R->getKind() == OR_CopyConstantAsImm;
@@ -2359,7 +2357,7 @@ class BuildMIAction : public MatchAction {
 private:
   unsigned InsnID;
   const CodeGenInstruction *I;
-  InstructionMatcher *Matched;
+  InstructionMatcher *Matched = nullptr;
   std::vector<std::unique_ptr<OperandRenderer>> OperandRenderers;
   SmallPtrSet<const Record *, 4> DeadImplicitDefs;
 
@@ -2372,7 +2370,7 @@ private:
 
 public:
   BuildMIAction(unsigned InsnID, const CodeGenInstruction *I)
-      : MatchAction(AK_BuildMI), InsnID(InsnID), I(I), Matched(nullptr) {}
+      : MatchAction(AK_BuildMI), InsnID(InsnID), I(I) {}
 
   static bool classof(const MatchAction *A) {
     return A->getKind() == AK_BuildMI;
diff --git a/llvm/utils/TableGen/Common/PredicateExpander.h b/llvm/utils/TableGen/Common/PredicateExpander.h
index 0c3a8718a473..4439327af2b0 100644
--- a/llvm/utils/TableGen/Common/PredicateExpander.h
+++ b/llvm/utils/TableGen/Common/PredicateExpander.h
@@ -25,9 +25,9 @@ namespace llvm {
 class Record;
 
 class PredicateExpander {
-  bool EmitCallsByRef;
-  bool NegatePredicate;
-  bool ExpandForMC;
+  bool EmitCallsByRef = true;
+  bool NegatePredicate = false;
+  bool ExpandForMC = false;
   StringRef TargetName;
 
   PredicateExpander(const PredicateExpander &) = delete;
@@ -38,8 +38,7 @@ protected:
 
 public:
   explicit PredicateExpander(StringRef Target, unsigned Indent = 1)
-      : EmitCallsByRef(true), NegatePredicate(false), ExpandForMC(false),
-        TargetName(Target), Indent(Indent, 2) {}
+      : TargetName(Target), Indent(Indent, 2) {}
   bool isByRef() const { return EmitCallsByRef; }
   bool shouldNegate() const { return NegatePredicate; }
   bool shouldExpandForMC() const { return ExpandForMC; }
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index 0039ff4f3e2d..227311b0a3bc 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -76,7 +76,7 @@ class MatcherGen {
   /// NextRecordedOperandNo - As we emit opcodes to record matched values in
   /// the RecordedNodes array, this keeps track of which slot will be next to
   /// record into.
-  unsigned NextRecordedOperandNo;
+  unsigned NextRecordedOperandNo = 0;
 
   /// MatchedChainNodes - This maintains the position in the recorded nodes
   /// array of all of the recorded input nodes that have chains.
@@ -94,11 +94,11 @@ class MatcherGen {
   SmallVector<std::pair<const Record *, unsigned>, 2> PhysRegInputs;
 
   /// Matcher - This is the top level of the generated matcher, the result.
-  Matcher *TheMatcher;
+  Matcher *TheMatcher = nullptr;
 
   /// CurPredicate - As we emit matcher nodes, this points to the latest check
   /// which should have future checks stuck into its Next position.
-  Matcher *CurPredicate;
+  Matcher *CurPredicate = nullptr;
 
 public:
   MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp);
@@ -147,8 +147,7 @@ private:
 
 MatcherGen::MatcherGen(const PatternToMatch &pattern,
                        const CodeGenDAGPatterns &cgp)
-    : Pattern(pattern), CGP(cgp), NextRecordedOperandNo(0), TheMatcher(nullptr),
-      CurPredicate(nullptr) {
+    : Pattern(pattern), CGP(cgp) {
   // We need to produce the matcher tree for the patterns source pattern.  To
   // do this we need to match the structure as well as the types.  To do the
   // type matching, we want to figure out the fewest number of type checks we
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 56c3644c134f..7489d369c993 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -104,10 +104,9 @@ struct OperandInfo {
   std::vector<EncodingField> Fields;
   std::string Decoder;
   bool HasCompleteDecoder;
-  uint64_t InitValue;
+  uint64_t InitValue = 0;
 
-  OperandInfo(std::string D, bool HCD)
-      : Decoder(D), HasCompleteDecoder(HCD), InitValue(0) {}
+  OperandInfo(std::string D, bool HCD) : Decoder(D), HasCompleteDecoder(HCD) {}
 
   void addField(unsigned Base, unsigned Width, unsigned Offset) {
     Fields.push_back(EncodingField(Base, Width, Offset));
diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp
index a8b6f79c176a..694d89a5ada3 100644
--- a/llvm/utils/TableGen/FastISelEmitter.cpp
+++ b/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -86,10 +86,10 @@ namespace {
 struct OperandsSignature {
   class OpKind {
     enum { OK_Reg, OK_FP, OK_Imm, OK_Invalid = -1 };
-    char Repr;
+    char Repr = OK_Invalid;
 
   public:
-    OpKind() : Repr(OK_Invalid) {}
+    OpKind() {}
 
     bool operator<(OpKind RHS) const { return Repr < RHS.Repr; }
     bool operator==(OpKind RHS) const { return Repr == RHS.Repr; }

From 595a273d9232a7378c583fb109212370d6d2f4e4 Mon Sep 17 00:00:00 2001
From: Andrey Timonin <timonina1909@gmail.com>
Date: Mon, 16 Jun 2025 19:37:39 +0500
Subject: [PATCH 0493/1322] [mlir][emitc] Support 'emitc::LValueType' in
 'emitc::VerbatimOp' (#144151)

This PR introduces support for `emitc::LvalueType` in
`emitc::VerbatimOp`, providing a mechanism to reduce the number of
operations required when working with verbatim operations whose
arguments are of type `emitc::LvalueType`.

Before:
```mlir
emitc.func @foo() {
  %a = "emitc.variable"() <{value = #emitc.opaque<"1">}> : () -> !emitc.lvalue<i32>
  %loaded_a = load %a : !emitc.lvalue<i32>
  emitc.verbatim "{} + {};" args %loaded_a, %loaded_a : i32, i32

  return
}
```

After:
```mlir
emitc.func @bar() {
  %a = "emitc.variable"() <{value = #emitc.opaque<"1">}> : () -> !emitc.lvalue<i32>
  emitc.verbatim "{} + {};" args %a, %a : !emitc.lvalue<i32>, !emitc.lvalue<i32>

  return
}
```

You can now write something like this:
```mlir
emitc.func @baz() {
  %a = "emitc.variable"() <{value = #emitc.opaque<"1">}> : () -> !emitc.lvalue<i32>
  emitc.verbatim "++{};" args %a : !emitc.lvalue<i32>

  return
}
```
---
 mlir/include/mlir/Dialect/EmitC/IR/EmitC.td |  2 +-
 mlir/test/Dialect/EmitC/ops.mlir            | 16 ++++++++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index d4aea52a0d48..e53d3e45875d 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -1304,7 +1304,7 @@ def EmitC_VerbatimOp : EmitC_Op<"verbatim"> {
     FailureOr<SmallVector<::mlir::emitc::ReplacementItem>> parseFormatString();
   }];
 
-  let arguments = (ins StrAttr:$value, Variadic<EmitCType>:$fmtArgs);
+  let arguments = (ins StrAttr:$value, Variadic<AnyTypeOf<[EmitCType, EmitC_LValueType]>>:$fmtArgs);
 
   let builders = [OpBuilder<(ins "::mlir::StringAttr":$value),
                             [{ build($_builder, $_state, value, {}); }]>];
diff --git a/mlir/test/Dialect/EmitC/ops.mlir b/mlir/test/Dialect/EmitC/ops.mlir
index 36d12e763afc..ad40313f95df 100644
--- a/mlir/test/Dialect/EmitC/ops.mlir
+++ b/mlir/test/Dialect/EmitC/ops.mlir
@@ -246,12 +246,20 @@ emitc.verbatim "typedef float f32;"
 // The value is not interpreted as format string if there are no operands.
 emitc.verbatim "{} {  }"
 
-func.func @test_verbatim(%arg0 : !emitc.ptr<i32>, %arg1 : i32) {
+func.func @test_verbatim(%arg0 : !emitc.ptr<i32>, %arg1 : i32, %arg2: !emitc.array<3x!emitc.ptr<i32>>) {
+  %a = "emitc.variable"() <{value = #emitc.opaque<"1">}> : () -> !emitc.lvalue<i32>
+
+  // Check that the lvalue type can be used by verbatim.
+  emitc.verbatim "++{};" args %a : !emitc.lvalue<i32>
+
+  // Check that the array type can be used by verbatim.
+  emitc.verbatim "*{}[0] = 1;" args %arg2 : !emitc.array<3x!emitc.ptr<i32>>
+
   emitc.verbatim "{} + {};" args %arg0, %arg1 : !emitc.ptr<i32>, i32
 
-  // Check there is no ambiguity whether %a is the argument to the emitc.verbatim op.
-  emitc.verbatim "a"
-  %a = "emitc.constant"(){value = 42 : i32} : () -> i32
+  // Check there is no ambiguity whether %b is the argument to the emitc.verbatim op.
+  emitc.verbatim "b"
+  %b = "emitc.constant"(){value = 42 : i32} : () -> i32
 
   return
 }

From 8e333e3cedec69f9e538ed79ed9f577956215edb Mon Sep 17 00:00:00 2001
From: Max191 <44243577+Max191@users.noreply.github.com>
Date: Mon, 16 Jun 2025 10:50:13 -0400
Subject: [PATCH 0494/1322] [mlir] Expose linearize/delinearize lowering
 transforms (#144156)

Moves the transformation logic from the AffineLinearizeOp and
AffineDelinearizeOp lowerings into separate transform functions that can
now be called separately. This provides a more controlled way to apply
the op lowerings.

---------

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
---
 .../Dialect/Affine/Transforms/Transforms.h    |  14 ++
 .../Transforms/AffineExpandIndexOps.cpp       | 218 +++++++++---------
 2 files changed, 125 insertions(+), 107 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h
index bf830a29613f..5c538d28c183 100644
--- a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h
@@ -32,6 +32,20 @@ enum class BoundType;
 
 namespace affine {
 class AffineApplyOp;
+class AffineDelinearizeIndexOp;
+class AffineLinearizeIndexOp;
+
+/// Lowers `affine.delinearize_index` into a sequence of division and remainder
+/// operations.
+LogicalResult lowerAffineDelinearizeIndexOp(RewriterBase &rewriter,
+                                            AffineDelinearizeIndexOp op);
+
+/// Lowers `affine.linearize_index` into a sequence of multiplications and
+/// additions. Make a best effort to sort the input indices so that
+/// the most loop-invariant terms are at the left of the additions
+/// to enable loop-invariant code motion.
+LogicalResult lowerAffineLinearizeIndexOp(RewriterBase &rewriter,
+                                          AffineLinearizeIndexOp op);
 
 /// Populate patterns that expand affine index operations into more fundamental
 /// operations (not necessarily restricted to Affine dialect).
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
index 35205a6ca2ee..c0ef28c648ac 100644
--- a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
@@ -84,126 +84,130 @@ static SmallVector<Value> computeStrides(Location loc, RewriterBase &rewriter,
   return result;
 }
 
+LogicalResult
+affine::lowerAffineDelinearizeIndexOp(RewriterBase &rewriter,
+                                      AffineDelinearizeIndexOp op) {
+  Location loc = op.getLoc();
+  Value linearIdx = op.getLinearIndex();
+  unsigned numResults = op.getNumResults();
+  ArrayRef<int64_t> staticBasis = op.getStaticBasis();
+  if (numResults == staticBasis.size())
+    staticBasis = staticBasis.drop_front();
+
+  if (numResults == 1) {
+    rewriter.replaceOp(op, linearIdx);
+    return success();
+  }
+
+  SmallVector<Value> results;
+  results.reserve(numResults);
+  SmallVector<Value> strides =
+      computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis,
+                     /*knownNonNegative=*/true);
+
+  Value zero = rewriter.createOrFold<arith::ConstantIndexOp>(loc, 0);
+
+  Value initialPart =
+      rewriter.create<arith::FloorDivSIOp>(loc, linearIdx, strides.front());
+  results.push_back(initialPart);
+
+  auto emitModTerm = [&](Value stride) -> Value {
+    Value remainder = rewriter.create<arith::RemSIOp>(loc, linearIdx, stride);
+    Value remainderNegative = rewriter.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::slt, remainder, zero);
+    // If the correction is relevant, this term is <= stride, which is known
+    // to be positive in `index`. Otherwise, while 2 * stride might overflow,
+    // this branch won't be taken, so the risk of `poison` is fine.
+    Value corrected = rewriter.create<arith::AddIOp>(
+        loc, remainder, stride, arith::IntegerOverflowFlags::nsw);
+    Value mod = rewriter.create<arith::SelectOp>(loc, remainderNegative,
+                                                 corrected, remainder);
+    return mod;
+  };
+
+  // Generate all the intermediate parts
+  for (size_t i = 0, e = strides.size() - 1; i < e; ++i) {
+    Value thisStride = strides[i];
+    Value nextStride = strides[i + 1];
+    Value modulus = emitModTerm(thisStride);
+    // We know both inputs are positive, so floorDiv == div.
+    // This could potentially be a divui, but it's not clear if that would
+    // cause issues.
+    Value divided = rewriter.create<arith::DivSIOp>(loc, modulus, nextStride);
+    results.push_back(divided);
+  }
+
+  results.push_back(emitModTerm(strides.back()));
+
+  rewriter.replaceOp(op, results);
+  return success();
+}
+
+LogicalResult affine::lowerAffineLinearizeIndexOp(RewriterBase &rewriter,
+                                                  AffineLinearizeIndexOp op) {
+  // Should be folded away, included here for safety.
+  if (op.getMultiIndex().empty()) {
+    rewriter.replaceOpWithNewOp<arith::ConstantIndexOp>(op, 0);
+    return success();
+  }
+
+  Location loc = op.getLoc();
+  ValueRange multiIndex = op.getMultiIndex();
+  size_t numIndexes = multiIndex.size();
+  ArrayRef<int64_t> staticBasis = op.getStaticBasis();
+  if (numIndexes == staticBasis.size())
+    staticBasis = staticBasis.drop_front();
+
+  SmallVector<Value> strides =
+      computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis,
+                     /*knownNonNegative=*/op.getDisjoint());
+  SmallVector<std::pair<Value, int64_t>> scaledValues;
+  scaledValues.reserve(numIndexes);
+
+  // Note: strides doesn't contain a value for the final element (stride 1)
+  // and everything else lines up. We use the "mutable" accessor so we can get
+  // our hands on an `OpOperand&` for the loop invariant counting function.
+  for (auto [stride, idxOp] :
+       llvm::zip_equal(strides, llvm::drop_end(op.getMultiIndexMutable()))) {
+    Value scaledIdx = rewriter.create<arith::MulIOp>(
+        loc, idxOp.get(), stride, arith::IntegerOverflowFlags::nsw);
+    int64_t numHoistableLoops = numEnclosingInvariantLoops(idxOp);
+    scaledValues.emplace_back(scaledIdx, numHoistableLoops);
+  }
+  scaledValues.emplace_back(
+      multiIndex.back(),
+      numEnclosingInvariantLoops(op.getMultiIndexMutable()[numIndexes - 1]));
+
+  // Sort by how many enclosing loops there are, ties implicitly broken by
+  // size of the stride.
+  llvm::stable_sort(scaledValues,
+                    [&](auto l, auto r) { return l.second > r.second; });
+
+  Value result = scaledValues.front().first;
+  for (auto [scaledValue, numHoistableLoops] : llvm::drop_begin(scaledValues)) {
+    std::ignore = numHoistableLoops;
+    result = rewriter.create<arith::AddIOp>(loc, result, scaledValue,
+                                            arith::IntegerOverflowFlags::nsw);
+  }
+  rewriter.replaceOp(op, result);
+  return success();
+}
+
 namespace {
-/// Lowers `affine.delinearize_index` into a sequence of division and remainder
-/// operations.
 struct LowerDelinearizeIndexOps
     : public OpRewritePattern<AffineDelinearizeIndexOp> {
   using OpRewritePattern<AffineDelinearizeIndexOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(AffineDelinearizeIndexOp op,
                                 PatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    Value linearIdx = op.getLinearIndex();
-    unsigned numResults = op.getNumResults();
-    ArrayRef<int64_t> staticBasis = op.getStaticBasis();
-    if (numResults == staticBasis.size())
-      staticBasis = staticBasis.drop_front();
-
-    if (numResults == 1) {
-      rewriter.replaceOp(op, linearIdx);
-      return success();
-    }
-
-    SmallVector<Value> results;
-    results.reserve(numResults);
-    SmallVector<Value> strides =
-        computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis,
-                       /*knownNonNegative=*/true);
-
-    Value zero = rewriter.createOrFold<arith::ConstantIndexOp>(loc, 0);
-
-    Value initialPart =
-        rewriter.create<arith::FloorDivSIOp>(loc, linearIdx, strides.front());
-    results.push_back(initialPart);
-
-    auto emitModTerm = [&](Value stride) -> Value {
-      Value remainder = rewriter.create<arith::RemSIOp>(loc, linearIdx, stride);
-      Value remainderNegative = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::slt, remainder, zero);
-      // If the correction is relevant, this term is <= stride, which is known
-      // to be positive in `index`. Otherwise, while 2 * stride might overflow,
-      // this branch won't be taken, so the risk of `poison` is fine.
-      Value corrected = rewriter.create<arith::AddIOp>(
-          loc, remainder, stride, arith::IntegerOverflowFlags::nsw);
-      Value mod = rewriter.create<arith::SelectOp>(loc, remainderNegative,
-                                                   corrected, remainder);
-      return mod;
-    };
-
-    // Generate all the intermediate parts
-    for (size_t i = 0, e = strides.size() - 1; i < e; ++i) {
-      Value thisStride = strides[i];
-      Value nextStride = strides[i + 1];
-      Value modulus = emitModTerm(thisStride);
-      // We know both inputs are positive, so floorDiv == div.
-      // This could potentially be a divui, but it's not clear if that would
-      // cause issues.
-      Value divided = rewriter.create<arith::DivSIOp>(loc, modulus, nextStride);
-      results.push_back(divided);
-    }
-
-    results.push_back(emitModTerm(strides.back()));
-
-    rewriter.replaceOp(op, results);
-    return success();
+    return affine::lowerAffineDelinearizeIndexOp(rewriter, op);
   }
 };
 
-/// Lowers `affine.linearize_index` into a sequence of multiplications and
-/// additions. Make a best effort to sort the input indices so that
-/// the most loop-invariant terms are at the left of the additions
-/// to enable loop-invariant code motion.
 struct LowerLinearizeIndexOps final : OpRewritePattern<AffineLinearizeIndexOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(AffineLinearizeIndexOp op,
                                 PatternRewriter &rewriter) const override {
-    // Should be folded away, included here for safety.
-    if (op.getMultiIndex().empty()) {
-      rewriter.replaceOpWithNewOp<arith::ConstantIndexOp>(op, 0);
-      return success();
-    }
-
-    Location loc = op.getLoc();
-    ValueRange multiIndex = op.getMultiIndex();
-    size_t numIndexes = multiIndex.size();
-    ArrayRef<int64_t> staticBasis = op.getStaticBasis();
-    if (numIndexes == staticBasis.size())
-      staticBasis = staticBasis.drop_front();
-
-    SmallVector<Value> strides =
-        computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis,
-                       /*knownNonNegative=*/op.getDisjoint());
-    SmallVector<std::pair<Value, int64_t>> scaledValues;
-    scaledValues.reserve(numIndexes);
-
-    // Note: strides doesn't contain a value for the final element (stride 1)
-    // and everything else lines up. We use the "mutable" accessor so we can get
-    // our hands on an `OpOperand&` for the loop invariant counting function.
-    for (auto [stride, idxOp] :
-         llvm::zip_equal(strides, llvm::drop_end(op.getMultiIndexMutable()))) {
-      Value scaledIdx = rewriter.create<arith::MulIOp>(
-          loc, idxOp.get(), stride, arith::IntegerOverflowFlags::nsw);
-      int64_t numHoistableLoops = numEnclosingInvariantLoops(idxOp);
-      scaledValues.emplace_back(scaledIdx, numHoistableLoops);
-    }
-    scaledValues.emplace_back(
-        multiIndex.back(),
-        numEnclosingInvariantLoops(op.getMultiIndexMutable()[numIndexes - 1]));
-
-    // Sort by how many enclosing loops there are, ties implicitly broken by
-    // size of the stride.
-    llvm::stable_sort(scaledValues,
-                      [&](auto l, auto r) { return l.second > r.second; });
-
-    Value result = scaledValues.front().first;
-    for (auto [scaledValue, numHoistableLoops] :
-         llvm::drop_begin(scaledValues)) {
-      std::ignore = numHoistableLoops;
-      result = rewriter.create<arith::AddIOp>(loc, result, scaledValue,
-                                              arith::IntegerOverflowFlags::nsw);
-    }
-    rewriter.replaceOp(op, result);
-    return success();
+    return affine::lowerAffineLinearizeIndexOp(rewriter, op);
   }
 };
 

From 7c25db3fbfc63f76b270940e341f267e497e95d9 Mon Sep 17 00:00:00 2001
From: Xu Zhang <simonzgx@gmail.com>
Date: Mon, 16 Jun 2025 22:55:26 +0800
Subject: [PATCH 0495/1322] [DAG] Fold (and X, (add (not Y), Z)) -> (and X,
 (not (sub Y, Z))). (#141476)

Fixes #140639

---------

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 +++++
 .../AArch64/aarch64-bitwisenot-fold.ll        | 98 +++++++++++++++++++
 .../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll | 10 +-
 3 files changed, 126 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5d62ded171f4..f6d811ddba8a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -396,6 +396,8 @@ namespace {
     bool PromoteLoad(SDValue Op);
 
     SDValue foldShiftToAvg(SDNode *N);
+    // Fold `a bitwiseop (~b +/- c)` -> `a bitwiseop ~(b -/+ c)`
+    SDValue foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT);
 
     SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
                                 SDValue RHS, SDValue True, SDValue False,
@@ -7541,6 +7543,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       return DAG.getNode(ISD::AND, DL, VT, X,
                          DAG.getNOT(DL, DAG.getNode(Opc, DL, VT, Y, Z), VT));
 
+  // Fold (and X, (add (not Y), Z)) -> (and X, (not (sub Y, Z)))
+  // Fold (and X, (sub (not Y), Z)) -> (and X, (not (add Y, Z)))
+  if (TLI.hasAndNot(SDValue(N, 0)))
+    if (SDValue Folded = foldBitwiseOpWithNeg(N, DL, VT))
+      return Folded;
+
   // Fold (and (srl X, C), 1) -> (srl X, BW-1) for signbit extraction
   // If we are shifting down an extended sign bit, see if we can simplify
   // this to shifting the MSB directly to expose further simplifications.
@@ -11652,6 +11660,22 @@ SDValue DAGCombiner::foldShiftToAvg(SDNode *N) {
   return DAG.getNode(FloorISD, SDLoc(N), N->getValueType(0), {A, B});
 }
 
+SDValue DAGCombiner::foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT) {
+  unsigned Opc = N->getOpcode();
+  SDValue X, Y, Z;
+  if (sd_match(
+          N, m_BitwiseLogic(m_Value(X), m_Add(m_Not(m_Value(Y)), m_Value(Z)))))
+    return DAG.getNode(Opc, DL, VT, X,
+                       DAG.getNOT(DL, DAG.getNode(ISD::SUB, DL, VT, Y, Z), VT));
+
+  if (sd_match(N, m_BitwiseLogic(m_Value(X), m_Sub(m_OneUse(m_Not(m_Value(Y))),
+                                                   m_Value(Z)))))
+    return DAG.getNode(Opc, DL, VT, X,
+                       DAG.getNOT(DL, DAG.getNode(ISD::ADD, DL, VT, Y, Z), VT));
+
+  return SDValue();
+}
+
 /// Generate Min/Max node
 SDValue DAGCombiner::combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
                                          SDValue RHS, SDValue True,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll b/llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll
new file mode 100644
index 000000000000..5fbf38b2560d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-linux | FileCheck %s
+
+define i8 @andnot_add_with_neg_i8(i8 %a0, i8 %a1) {
+; CHECK-LABEL: andnot_add_with_neg_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i8 %a0, -1
+  %sum = add i8 %not, %a1
+  %and = and i8 %sum, %a0
+  ret i8 %and
+}
+
+define i8 @andnot_sub_with_neg_i8(i8 %a0, i8 %a1) {
+; CHECK-LABEL: andnot_sub_with_neg_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i8 %a0, -1
+  %diff = sub i8 %not, %a1
+  %and = and i8 %diff, %a0
+  ret i8 %and
+}
+
+define i16 @andnot_add_with_neg_i16(i16 %a0, i16 %a1) {
+; CHECK-LABEL: andnot_add_with_neg_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i16 %a0, -1
+  %sum = add i16 %not, %a1
+  %and = and i16 %sum, %a0
+  ret i16 %and
+}
+
+define i16 @andnot_sub_with_neg_i16(i16 %a0, i16 %a1) {
+; CHECK-LABEL: andnot_sub_with_neg_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i16 %a0, -1
+  %diff = sub i16 %not, %a1
+  %and = and i16 %diff, %a0
+  ret i16 %and
+}
+
+define i32 @andnot_add_with_neg_i32(i32 %a0, i32 %a1) {
+; CHECK-LABEL: andnot_add_with_neg_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i32 %a0, -1
+  %sum = add i32 %not, %a1
+  %and = and i32 %sum, %a0
+  ret i32 %and
+}
+
+define i32 @andnot_sub_with_neg_i32(i32 %a0, i32 %a1) {
+; CHECK-LABEL: andnot_sub_with_neg_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i32 %a0, -1
+  %diff = sub i32 %not, %a1
+  %and = and i32 %diff, %a0
+  ret i32 %and
+}
+
+define i64 @andnot_add_with_neg_i64(i64 %a0, i64 %a1) {
+; CHECK-LABEL: andnot_add_with_neg_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, x1
+; CHECK-NEXT:    bic x0, x0, x8
+; CHECK-NEXT:    ret
+  %not = xor i64 %a0, -1
+  %sum = add i64 %not, %a1
+  %and = and i64 %sum, %a0
+  ret i64 %and
+}
+
+define i64 @andnot_sub_with_neg_i64(i64 %a0, i64 %a1) {
+; CHECK-LABEL: andnot_sub_with_neg_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, x1
+; CHECK-NEXT:    bic x0, x0, x8
+; CHECK-NEXT:    ret
+  %not = xor i64 %a0, -1
+  %diff = sub i64 %not, %a1
+  %and = and i64 %diff, %a0
+  ret i64 %and
+}
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index e564d7bddea6..27be02c50f1c 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -885,9 +885,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 define i8 @test_not_cttz_i8(i8 %a) nounwind {
 ; LA32R-LABEL: test_not_cttz_i8:
 ; LA32R:       # %bb.0:
-; LA32R-NEXT:    nor $a1, $a0, $zero
-; LA32R-NEXT:    addi.w $a1, $a1, -1
-; LA32R-NEXT:    and $a0, $a0, $a1
+; LA32R-NEXT:    addi.w $a1, $a0, 1
+; LA32R-NEXT:    andn $a0, $a0, $a1
 ; LA32R-NEXT:    srli.w $a1, $a0, 1
 ; LA32R-NEXT:    andi $a1, $a1, 85
 ; LA32R-NEXT:    sub.w $a0, $a0, $a1
@@ -921,9 +920,8 @@ define i8 @test_not_cttz_i8(i8 %a) nounwind {
 define i16 @test_not_cttz_i16(i16 %a) nounwind {
 ; LA32R-LABEL: test_not_cttz_i16:
 ; LA32R:       # %bb.0:
-; LA32R-NEXT:    nor $a1, $a0, $zero
-; LA32R-NEXT:    addi.w $a1, $a1, -1
-; LA32R-NEXT:    and $a0, $a0, $a1
+; LA32R-NEXT:    addi.w $a1, $a0, 1
+; LA32R-NEXT:    andn $a0, $a0, $a1
 ; LA32R-NEXT:    srli.w $a1, $a0, 1
 ; LA32R-NEXT:    lu12i.w $a2, 5
 ; LA32R-NEXT:    ori $a2, $a2, 1365

From f2734aa25e808e8c1967f7125fdea6c8b2dab9e1 Mon Sep 17 00:00:00 2001
From: Acthinks Yang <yangzhh@mail.ustc.edu.cn>
Date: Mon, 16 Jun 2025 23:05:30 +0800
Subject: [PATCH 0496/1322] [InstCombine] fold icmp with add/sub instructions
 having the same operands (#143241)

Closes #143211.
---
 .../InstCombine/InstCombineCompares.cpp       |  24 ++++
 .../Transforms/InstCombine/icmp-subadd.ll     | 111 ++++++++++++++++++
 2 files changed, 135 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/icmp-subadd.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c112fae35181..084e7fbaa268 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -7728,6 +7728,30 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
     }
   }
 
+  // icmp slt (sub nsw x, y), (add nsw x, y)  -->  icmp sgt y, 0
+  // icmp ult (sub nuw x, y), (add nuw x, y)  -->  icmp ugt y, 0
+  // icmp eq (sub nsw/nuw x, y), (add nsw/nuw x, y)   -->  icmp eq y, 0
+  {
+    Value *A, *B;
+    CmpPredicate CmpPred;
+    if (match(&I, m_c_ICmp(CmpPred, m_Sub(m_Value(A), m_Value(B)),
+                           m_c_Add(m_Deferred(A), m_Deferred(B))))) {
+      auto *I0 = cast<OverflowingBinaryOperator>(Op0);
+      auto *I1 = cast<OverflowingBinaryOperator>(Op1);
+      bool I0NUW = I0->hasNoUnsignedWrap();
+      bool I1NUW = I1->hasNoUnsignedWrap();
+      bool I0NSW = I0->hasNoSignedWrap();
+      bool I1NSW = I1->hasNoSignedWrap();
+      if ((ICmpInst::isUnsigned(Pred) && I0NUW && I1NUW) ||
+          (ICmpInst::isSigned(Pred) && I0NSW && I1NSW) ||
+          (ICmpInst::isEquality(Pred) &&
+           ((I0NUW || I0NSW) && (I1NUW || I1NSW)))) {
+        return new ICmpInst(CmpPredicate::getSwapped(CmpPred), B,
+                            ConstantInt::get(Op0->getType(), 0));
+      }
+    }
+  }
+
   // Try to optimize equality comparisons against alloca-based pointers.
   if (Op0->getType()->isPointerTy() && I.isEquality()) {
     assert(Op1->getType()->isPointerTy() &&
diff --git a/llvm/test/Transforms/InstCombine/icmp-subadd.ll b/llvm/test/Transforms/InstCombine/icmp-subadd.ll
new file mode 100644
index 000000000000..fd7e1250d893
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/icmp-subadd.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i1 @test-same-operands-sub-add-nsw-icmp-sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-sgt(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp sgt i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-same-operands-sub-add-nsw-icmp-slt(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-slt(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp slt i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-same-operands-sub-add-nsw-icmp-sle(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-sle(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[B]], -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp sle i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-same-operands-sub-add-nsw-nuw-icmp-eq(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-nuw-icmp-eq(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nuw i8 %a, %b
+  %cmp = icmp eq i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-same-operands-sub-add-nsw-icmp-eq(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-eq(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp eq i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-add-sub-nsw-icmp-sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-add-sub-nsw-icmp-sgt(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp sgt i8 %add, %sub
+  ret i1 %cmp
+}
+
+define i1 @test-add-sub-nuw-icmp-uge(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-add-sub-nuw-icmp-uge(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    ret i1 true
+;
+  %sub = sub nuw i8 %a, %b
+  %add = add nuw i8 %a, %b
+  %cmp = icmp uge i8 %add, %sub
+  ret i1 %cmp
+}
+
+; Check not folded
+define i1 @test-add-sub-nuw-icmp-sge(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-add-sub-nuw-icmp-sge(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw i8 [[A]], [[B]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i8 [[A]], [[B]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i8 [[ADD]], [[SUB]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nuw i8 %a, %b
+  %add = add nuw i8 %a, %b
+  %cmp = icmp sge i8 %add, %sub
+  ret i1 %cmp
+}
+
+define i1 @test-add-swap-sub-nuw-icmp-uge(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-add-swap-sub-nuw-icmp-uge(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    ret i1 true
+;
+  %sub = sub nuw i8 %a, %b
+  %add = add nuw i8 %b, %a
+  %cmp = icmp uge i8 %add, %sub
+  ret i1 %cmp
+}

From 4692f0d3448e32381a2b21c7359c7daed07a8850 Mon Sep 17 00:00:00 2001
From: zGoldthorpe <Zach.Goldthorpe@amd.com>
Date: Mon, 16 Jun 2025 09:06:18 -0600
Subject: [PATCH 0497/1322] Revert "[AMDGPU] Extended vector promotion to
 aggregate types." (#144366)

Reverts llvm/llvm-project#143784

Patch fails some internal tests. Will investigate more thoroughly before
attempting to remerge.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 108 +++----
 .../CodeGen/AMDGPU/promote-alloca-structs.ll  | 286 ------------------
 2 files changed, 42 insertions(+), 352 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e90a3a275f67..700dc87d2f82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -818,39 +818,6 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
   return I;
 }
 
-/// Get the underlying type of a homogeneous aggregate type, or nullptr if the
-/// type is non-homogeneous.
-static Type *getHomogeneousType(Type *Ty) {
-  Type *ElemTy = nullptr;
-  SmallVector<Type *> WorkList;
-  WorkList.push_back(Ty);
-  while (!WorkList.empty()) {
-    Type *CurTy = WorkList.pop_back_val();
-
-    // Check if the current type is an aggregate type.
-    if (auto *VectorTy = dyn_cast<FixedVectorType>(CurTy)) {
-      WorkList.push_back(VectorTy->getElementType());
-      continue;
-    }
-    if (auto *ArrayTy = dyn_cast<ArrayType>(CurTy)) {
-      WorkList.push_back(ArrayTy->getElementType());
-      continue;
-    }
-    if (auto *StructTy = dyn_cast<StructType>(CurTy)) {
-      WorkList.append(StructTy->element_begin(), StructTy->element_end());
-      continue;
-    }
-
-    // If not, it must be the same as all other non-aggregate types.
-    if (!ElemTy)
-      ElemTy = CurTy;
-    else if (ElemTy != CurTy)
-      return nullptr;
-  }
-
-  return ElemTy;
-}
-
 // FIXME: Should try to pick the most likely to be profitable allocas first.
 bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -861,43 +828,43 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   }
 
   Type *AllocaTy = Alloca.getAllocatedType();
-  Type *ElemTy = getHomogeneousType(AllocaTy);
+  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
+  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
+    uint64_t NumElems = 1;
+    Type *ElemTy;
+    do {
+      NumElems *= ArrayTy->getNumElements();
+      ElemTy = ArrayTy->getElementType();
+    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
 
-  if (!ElemTy || !VectorType::isValidElementType(ElemTy)) {
+    // Check for array of vectors
+    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
+    if (InnerVectorTy) {
+      NumElems *= InnerVectorTy->getNumElements();
+      ElemTy = InnerVectorTy->getElementType();
+    }
+
+    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
+      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
+      if (ElementSize > 0) {
+        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+        // Expand vector if required to match padding of inner type,
+        // i.e. odd size subvectors.
+        // Storage size of new vector must match that of alloca for correct
+        // behaviour of byte offsets and GEP computation.
+        if (NumElems * ElementSize != AllocaSize)
+          NumElems = AllocaSize / ElementSize;
+        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+          VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      }
+    }
+  }
+
+  if (!VectorTy) {
     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
     return false;
   }
 
-  unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy);
-  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) {
-    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
-                         "does not match the type's size\n");
-    return false;
-  }
-  unsigned ElementSize = ElementSizeInBits / 8;
-  if (ElementSize == 0) {
-    LLVM_DEBUG(dbgs() << "  Cannot create vector of zero-sized elements\n");
-    return false;
-  }
-
-  // Calculate the size of the corresponding vector, accounting for padding of
-  // inner types, e.g., odd-sized subvectors. Storage size of new vector must
-  // match that of alloca for correct behaviour of byte offsets and GEP
-  // computation.
-  unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
-  unsigned NumElems = AllocaSize / ElementSize;
-  if (NumElems == 0) {
-    LLVM_DEBUG(dbgs() << "  Cannot vectorize an empty aggregate type\n");
-    return false;
-  }
-  if (NumElems * ElementSize != AllocaSize) {
-    LLVM_DEBUG(
-        dbgs() << "  Cannot convert type into vector of the same size\n");
-    return false;
-  }
-  auto *VectorTy = FixedVectorType::get(ElemTy, NumElems);
-  assert(VectorTy && "Failed to create vector type.");
-
   const unsigned MaxElements =
       (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
 
@@ -928,6 +895,15 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
+  Type *VecEltTy = VectorTy->getElementType();
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return false;
+  }
+  unsigned ElementSize = ElementSizeInBits / 8;
+  assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
@@ -967,7 +943,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts);
+      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
deleted file mode 100644
index 1cdd027fef89..000000000000
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
+++ /dev/null
@@ -1,286 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 %s | FileCheck %s
-
-define i8 @test_v4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_v4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca <4 x i8>, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [4 x i8], align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a2v4i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a2v4i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a2v3i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a2v3i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [2 x <3 x i8>], align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a2a4i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a2a4i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [2 x [4 x i8]], align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a2a3i8(i48 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a2a3i8(
-; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <6 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [2 x [3 x i8]], align 4, addrspace(5)
-  store i48 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s1v4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s1v4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {<4 x i8>}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s1a4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s1a4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {[4 x i8]}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2v4i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2v4i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2v2i8v4i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2v2i8v4i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2v2i8v3i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2v2i8v3i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2s2i8s4i8(i48 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2s2i8s4i8(
-; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <6 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5)
-  store i48 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2s2i8s3i8(i40 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2s2i8s3i8(
-; CHECK-SAME: i40 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <5 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i40 [[BITS]] to <5 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <5 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5)
-  store i40 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s3i8s1i8v2i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s3i8s1i8v2i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s3i8i8s0(i16 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s3i8i8s0(
-; CHECK-SAME: i16 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <2 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[BITS]] to <2 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {i8, i8, {}}, align 4, addrspace(5)
-  store i16 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-; heterogeneous element types are not supported
-define i8 @test_heterogeneous(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_heterogeneous(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5)
-; CHECK-NEXT:    store i32 [[BITS]], ptr addrspace(5) [[STACK]], align 4
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
-; CHECK-NEXT:    ret i8 [[VAL]]
-;
-  %stack = alloca {i8, i8, i16}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-; empty types are not supported
-define void @test_empty() {
-; CHECK-LABEL: define void @test_empty() {
-; CHECK-NEXT:    [[STACK:%.*]] = alloca {}, align 4, addrspace(5)
-; CHECK-NEXT:    ret void
-;
-  %stack = alloca {}, align 4, addrspace(5)
-  ret void
-}
-
-; singleton types are not supported
-define i8 @test_singleton(i8 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_singleton(
-; CHECK-SAME: i8 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5)
-; CHECK-NEXT:    store i8 [[BITS]], ptr addrspace(5) [[STACK]], align 1
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
-; CHECK-NEXT:    ret i8 [[VAL]]
-;
-  %stack = alloca {i8, {}}, align 4, addrspace(5)
-  store i8 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}

From 38fa7533fbac525198206200cf2caf04071fcdb1 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 11:12:59 -0400
Subject: [PATCH 0498/1322] Fix diagnostic documentation build errors

---
 clang/include/clang/Basic/DiagnosticGroups.td | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 38b4f581fa5c..36fa3227fd6a 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -815,19 +815,22 @@ changes to one object won't affect the others, the object's initializer will run
 once per copy, etc.
 
 Specifically, this warning fires when it detects an object which:
-1. Is defined as ``inline`` in a header file (so it might get compiled into multiple libaries), and
-2. Has external linkage (otherwise it's supposed to be duplicated), and
-3. Has hidden visibility (posix) or lacks a dllimport/dllexport attribute (windows).
+
+#. Is defined as ``inline`` in a header file (so it might get compiled into multiple libaries), and
+#. Has external linkage (otherwise it's supposed to be duplicated), and
+#. Has hidden visibility (posix) or lacks a dllimport/dllexport attribute (windows).
 
 As well as one of the following:
-1. The object is mutable, or
-2. The object's initializer definitely has side effects.
+
+#. The object is mutable, or
+#. The object's initializer definitely has side effects.
 
 The warning can be resolved by removing one of the conditions above. In rough
 order of preference, this may be done by:
-1. Marking the object ``const`` (if possible)
-2. Moving the object's definition to a source file
-3. Making the object visible using ``__attribute((visibility("default")))``,
+
+#. Marking the object ``const`` (if possible)
+#. Moving the object's definition to a source file
+#. Making the object visible using ``__attribute((visibility("default")))``,
    ``__declspec(dllimport)``, or ``__declspec(dllexport)``.
 
 When annotating an object with ``__declspec(dllimport)`` or ``__declspec(dllexport)``,

From f0373295e82315f95a97ce1b34c78ff46f475863 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0@owo.li>
Date: Mon, 16 Jun 2025 23:17:47 +0800
Subject: [PATCH 0499/1322] [clang][Parser] Fix crash on malformed using
 declaration in constexpr function (#144286)

---
 clang/docs/ReleaseNotes.rst                               | 1 +
 clang/lib/Parse/ParseDeclCXX.cpp                          | 4 ++++
 .../Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp  | 8 ++++++++
 3 files changed, 13 insertions(+)
 create mode 100644 clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 33ee8a53b5f3..59d9612268d3 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -704,6 +704,7 @@ Bug Fixes in This Version
 - Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. (#GH143168)
 - Fixed incorrect token location when emitting diagnostics for tokens expanded from macros. (#GH143216)
 - Fixed an infinite recursion when checking constexpr destructors. (#GH141789)
+- Fixed a crash when a malformed using declaration appears in a ``constexpr`` function. (#GH144264)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index f31c9265a007..a5c76501c7c1 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -760,6 +760,10 @@ Parser::DeclGroupPtrTy Parser::ParseUsingDeclaration(
 
     Decl *AD = ParseAliasDeclarationAfterDeclarator(
         TemplateInfo, UsingLoc, D, DeclEnd, AS, Attrs, &DeclFromDeclSpec);
+
+    if (!AD)
+      return nullptr;
+
     return Actions.ConvertDeclToDeclGroup(AD, DeclFromDeclSpec);
   }
 
diff --git a/clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp b/clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp
new file mode 100644
index 000000000000..94fa8c8c820a
--- /dev/null
+++ b/clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// issue144264
+constexpr void test() 
+{ 
+    using TT = struct T[; 
+    // expected-error@-1 {{expected expression}}
+}

From 6f1b5ed7e127b7806ae36783c6b9406434416c95 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 16 Jun 2025 16:00:17 +0100
Subject: [PATCH 0500/1322] [X86] LowerCONCAT_VECTORS - pull out repeated
 SDLoc(). NFC.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 26 +++++++++++--------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b4670e270141..290fad07be4f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9614,13 +9614,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 // 256-bit AVX can use the vinsertf128 instruction
 // to create 256-bit vectors from two other 128-bit ones.
 // TODO: Detect subvector broadcast here instead of DAG combine?
-static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
+                                      SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
-  SDLoc dl(Op);
   MVT ResVT = Op.getSimpleValueType();
-
-  assert((ResVT.is256BitVector() ||
-          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
+  assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
+         "Value type must be 256-/512-bit wide");
 
   unsigned NumOperands = Op.getNumOperands();
   unsigned NumFreezeUndef = 0;
@@ -9688,13 +9687,11 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
 // zeros) of the result of a node that already zeros all upper bits of
 // k-register.
 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
-static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG & DAG) {
-  SDLoc dl(Op);
   MVT ResVT = Op.getSimpleValueType();
   unsigned NumOperands = Op.getNumOperands();
-
   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
          "Unexpected number of operands in CONCAT_VECTORS");
 
@@ -9766,19 +9763,18 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
 static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
+  SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
   if (VT.getVectorElementType() == MVT::i1)
-    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
-
-  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
-         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
-          Op.getNumOperands() == 4)));
+    return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
 
   // AVX can use the vinsertf128 instruction to create 256-bit vectors
   // from two other 128-bit ones.
-
   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
-  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
+  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
+         (VT.is512BitVector() &&
+          (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
+  return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
 }
 
 //===----------------------------------------------------------------------===//

From 404597061f974470e8bd1198e44d024fac8319a1 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar@amd.com>
Date: Mon, 16 Jun 2025 10:27:48 -0500
Subject: [PATCH 0501/1322] [OMPIRBuilder] - Make offloading input data persist
 for deferred target tasks (#133499)

When we offload to the target, the pointers to data used by the kernel
are passed in arrays created by `OMPIRBuilder`. These arrays of pointers
are allocated on the stack on the host. This is fine for the most part
because absent the `nowait` clause, the default behavior is that target
tasks are included tasks. That is, the host is blocked until the
offloaded target kernel is done. In turn, this means that the host's
stack frame is intact and accessing the array of pointers when
offloading is safe. However, when `nowait` is used on the `!$ omp
target` instance, then the target task is a deferred task meaning, the
generating task on the host does not have to wait for the target task
to finish. In such cases, it is very likely that the stack frame of the
function invoking the target call is wound up thereby leading to memory
access errors as shown below.
```
AMDGPU error: Error in hsa_amd_memory_pool_allocate: HSA_STATUS_ERROR_INVALID_ALLOCATION: The requested allocation is not valid.
AMDGPU error: Error in hsa_amd_memory_pool_allocate: HSA_STATUS_ERROR_INVALID_ALLOCATION: The requested allocation is not valid. "PluginInterface" error: Failure to allocate device memory: Failed to allocate from memory manager
fort.cod.out: /llvm/llvm-project/offload/plugins-nextgen/common/src/PluginInterface.cpp:1434: Error llvm::omp::target::plugin::PinnedAllocationMapTy::lockMappedHostBuffer(void *, size_t): Assertion `HstPtr && "Invalid pointer"' failed.
Aborted (core dumped)
```
This PR implements support in `OMPIRBuilder` to store these arrays of
pointers in the task structure that is passed to the target task thereby
ensuring it is available to the target task when the target task is
eventually scheduled.

---------

Co-authored-by: Sergio Afonso <safonsof@amd.com>
---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   2 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 335 ++++++++++++++----
 mlir/test/Target/LLVMIR/omptarget-depend.mlir |   3 +-
 .../Target/LLVMIR/omptarget-nowait-llvm.mlir  |  45 ++-
 mlir/test/Target/LLVMIR/omptarget-nowait.mlir |  70 ++++
 .../LLVMIR/omptargetdata-nowait-llvm.mlir     |  45 +--
 6 files changed, 387 insertions(+), 113 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-nowait.mlir

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index e4b1241151e9..93fb0d8e8d07 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2507,7 +2507,7 @@ public:
       TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
       OpenMPIRBuilder::InsertPointTy AllocaIP,
       const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
-      bool HasNoWait);
+      const TargetDataRTArgs &RTArgs, bool HasNoWait);
 
   /// Emit the arguments to be passed to the runtime library based on the
   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ca3d8438654d..c1f02b2b240d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6703,7 +6703,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
                             /*TargetTaskAllocaIP=*/{}));
       else
         cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
-                                /*Dependencies=*/{}, Info.HasNoWait));
+                                /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
     } else {
       Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
           omp::OMPRTL___tgt_target_data_begin_mapper);
@@ -7150,15 +7150,55 @@ static Expected<Function *> createOutlinedFunction(
                                     ValueReplacementMap);
   return Func;
 }
+/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
+/// of pointers containing shared data between the parent task and the created
+/// task.
+static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
+                                                  IRBuilderBase &Builder,
+                                                  Value *TaskWithPrivates,
+                                                  Type *TaskWithPrivatesTy) {
 
+  Type *TaskTy = OMPIRBuilder.Task;
+  LLVMContext &Ctx = Builder.getContext();
+  Value *TaskT =
+      Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
+  Value *Shareds = TaskT;
+  // TaskWithPrivatesTy can be one of the following
+  // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
+  //                                        %struct.privates }
+  // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
+  //
+  // In the former case, that is when  TaskWithPrivatesTy != TaskTy,
+  // its first member has to be the task descriptor. TaskTy is the type of the
+  // task descriptor. TaskT is the pointer to the task descriptor. Loading the
+  // first member of TaskT, gives us the pointer to shared data.
+  if (TaskWithPrivatesTy != TaskTy)
+    Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
+  return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
+}
 /// Create an entry point for a target task with the following.
 /// It'll have the following signature
 /// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
 /// This function is called from emitTargetTask once the
 /// code to launch the target kernel has been outlined already.
-static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
-                                             IRBuilderBase &Builder,
-                                             CallInst *StaleCI) {
+/// NumOffloadingArrays is the number of offloading arrays that we need to copy
+/// into the task structure so that the deferred target task can access this
+/// data even after the stack frame of the generating task has been rolled
+/// back. Offloading arrays contain base pointers, pointers, sizes etc
+/// of the data that the target kernel will access. These in effect are the
+/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
+static Function *emitTargetTaskProxyFunction(
+    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
+    StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
+    const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
+
+  // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
+  // This is because PrivatesTy is the type of the structure in which
+  // we pass the offloading arrays to the deferred target task.
+  assert((!NumOffloadingArrays || PrivatesTy) &&
+         "PrivatesTy cannot be nullptr when there are offloadingArrays"
+         "to privatize");
+
   Module &M = OMPBuilder.M;
   // KernelLaunchFunction is the target launch function, i.e.
   // the function that sets up kernel arguments and calls
@@ -7185,34 +7225,48 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
   // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
   OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
                                     StaleCI->getIterator());
+
   LLVMContext &Ctx = StaleCI->getParent()->getContext();
+
   Type *ThreadIDTy = Type::getInt32Ty(Ctx);
   Type *TaskPtrTy = OMPBuilder.TaskPtr;
   Type *TaskTy = OMPBuilder.Task;
+
   auto ProxyFnTy =
       FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
                         /* isVarArg */ false);
   auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
                                   ".omp_target_task_proxy_func",
                                   Builder.GetInsertBlock()->getModule());
-  ProxyFn->getArg(0)->setName("thread.id");
-  ProxyFn->getArg(1)->setName("task");
+  Value *ThreadId = ProxyFn->getArg(0);
+  Value *TaskWithPrivates = ProxyFn->getArg(1);
+  ThreadId->setName("thread.id");
+  TaskWithPrivates->setName("task");
 
+  bool HasShareds = SharedArgsOperandNo > 0;
+  bool HasOffloadingArrays = NumOffloadingArrays > 0;
   BasicBlock *EntryBB =
       BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
   Builder.SetInsertPoint(EntryBB);
 
-  bool HasShareds = StaleCI->arg_size() > 1;
-  // TODO: This is a temporary assert to prove to ourselves that
-  // the outlined target launch function is always going to have
-  // atmost two arguments if there is any data shared between
-  // host and device.
-  assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
-         "StaleCI with shareds should have exactly two arguments.");
+  SmallVector<Value *> KernelLaunchArgs;
+  KernelLaunchArgs.reserve(StaleCI->arg_size());
+  KernelLaunchArgs.push_back(ThreadId);
+
+  if (HasOffloadingArrays) {
+    assert(TaskTy != TaskWithPrivatesTy &&
+           "If there are offloading arrays to pass to the target"
+           "TaskTy cannot be the same as TaskWithPrivatesTy");
+    Value *Privates =
+        Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
+    for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
+      KernelLaunchArgs.push_back(
+          Builder.CreateStructGEP(PrivatesTy, Privates, i));
+  }
 
-  Value *ThreadId = ProxyFn->getArg(0);
   if (HasShareds) {
-    auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+    auto *ArgStructAlloca =
+        dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
     assert(ArgStructAlloca &&
            "Unable to find the alloca instruction corresponding to arguments "
            "for extracted function");
@@ -7220,27 +7274,67 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
 
     AllocaInst *NewArgStructAlloca =
         Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
-    Value *TaskT = ProxyFn->getArg(1);
+
     Value *SharedsSize =
         Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
 
-    Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
-    LoadInst *LoadShared =
-        Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
+    LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
+        OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
 
     Builder.CreateMemCpy(
         NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
         LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
-
-    Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
-  } else {
-    Builder.CreateCall(KernelLaunchFunction, {ThreadId});
+    KernelLaunchArgs.push_back(NewArgStructAlloca);
   }
-
+  Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
   Builder.CreateRetVoid();
   return ProxyFn;
 }
+static Type *getOffloadingArrayType(Value *V) {
 
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
+    return GEP->getSourceElementType();
+  if (auto *Alloca = dyn_cast<AllocaInst>(V))
+    return Alloca->getAllocatedType();
+
+  llvm_unreachable("Unhandled Instruction type");
+  return nullptr;
+}
+// This function returns a struct that has at most two members.
+// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
+// descriptor. The second member, if needed, is a struct containing arrays
+// that need to be passed to the offloaded target kernel. For example,
+// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
+// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
+// respectively, then the types created  by this function are
+//
+// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
+// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
+//                                     %struct.privates }
+// %struct.task_with_privates is returned by this function.
+// If there aren't any offloading arrays to pass to the target kernel,
+// %struct.kmp_task_ompbuilder_t is returned.
+static StructType *
+createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
+                         ArrayRef<Value *> OffloadingArraysToPrivatize) {
+
+  if (OffloadingArraysToPrivatize.empty())
+    return OMPIRBuilder.Task;
+
+  SmallVector<Type *, 4> StructFieldTypes;
+  for (Value *V : OffloadingArraysToPrivatize) {
+    assert(V->getType()->isPointerTy() &&
+           "Expected pointer to array to privatize. Got a non-pointer value "
+           "instead");
+    Type *ArrayTy = getOffloadingArrayType(V);
+    assert(ArrayTy && "ArrayType cannot be nullptr");
+    StructFieldTypes.push_back(ArrayTy);
+  }
+  StructType *PrivatesStructTy =
+      StructType::create(StructFieldTypes, "struct.privates");
+  return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
+                            "struct.task_with_privates");
+}
 static Error emitTargetOutlinedFunction(
     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
     TargetRegionEntryInfo &EntryInfo,
@@ -7266,7 +7360,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
     TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
     OpenMPIRBuilder::InsertPointTy AllocaIP,
     const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
-    bool HasNoWait) {
+    const TargetDataRTArgs &RTArgs, bool HasNoWait) {
 
   // The following explains the code-gen scenario for the `target` directive. A
   // similar scneario is followed for other device-related directives (e.g.
@@ -7276,27 +7370,30 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   // When we arrive at this function, the target region itself has been
   // outlined into the function OutlinedFn.
   // So at ths point, for
-  // --------------------------------------------------
+  // --------------------------------------------------------------
   //   void user_code_that_offloads(...) {
-  //     omp target depend(..) map(from:a) map(to:b, c)
-  //        a = b + c
+  //     omp target depend(..) map(from:a) map(to:b) private(i)
+  //     do i = 1, 10
+  //        a(i) = b(i) + n
   //   }
   //
-  // --------------------------------------------------
+  // --------------------------------------------------------------
   //
   // we have
   //
-  // --------------------------------------------------
+  // --------------------------------------------------------------
   //
   //   void user_code_that_offloads(...) {
-  //     %.offload_baseptrs = alloca [3 x ptr], align 8
-  //     %.offload_ptrs = alloca [3 x ptr], align 8
-  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //     %.offload_baseptrs = alloca [2 x ptr], align 8
+  //     %.offload_ptrs = alloca [2 x ptr], align 8
+  //     %.offload_mappers = alloca [2 x ptr], align 8
   //     ;; target region has been outlined and now we need to
   //     ;; offload to it via a target task.
   //   }
-  //   void outlined_device_function(ptr a, ptr b, ptr c) {
-  //     *a = *b + *c
+  //   void outlined_device_function(ptr a, ptr b, ptr n) {
+  //     n = *n_ptr;
+  //     do i = 1, 10
+  //       a(i) = b(i) +  n
   //   }
   //
   // We have to now do the following
@@ -7309,33 +7406,59 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   // (iii) Create a task with the task entry point created in (ii)
   //
   // That is we create the following
-  //
+  //   struct task_with_privates {
+  //      struct kmp_task_ompbuilder_t task_struct;
+  //      struct privates {
+  //         [2 x ptr] ; baseptrs
+  //         [2 x ptr] ; ptrs
+  //         [2 x i64] ; sizes
+  //      }
+  //   }
   //   void user_code_that_offloads(...) {
-  //     %.offload_baseptrs = alloca [3 x ptr], align 8
-  //     %.offload_ptrs = alloca [3 x ptr], align 8
-  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //     %.offload_baseptrs = alloca [2 x ptr], align 8
+  //     %.offload_ptrs = alloca [2 x ptr], align 8
+  //     %.offload_sizes = alloca [2 x i64], align 8
   //
   //     %structArg = alloca { ptr, ptr, ptr }, align 8
-  //     %strucArg[0] = %.offload_baseptrs
-  //     %strucArg[1] = %.offload_ptrs
-  //     %strucArg[2] = %.offload_mappers
-  //     proxy_target_task = @__kmpc_omp_task_alloc(...,
-  //                                               @.omp_target_task_proxy_func)
-  //     memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
+  //     %strucArg[0] = a
+  //     %strucArg[1] = b
+  //     %strucArg[2] = &n
+  //
+  //     target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
+  //                                               sizeof(kmp_task_ompbuilder_t),
+  //                                               sizeof(structArg),
+  //                                               @.omp_target_task_proxy_func,
+  //                                               ...)
+  //     memcpy(target_task_with_privates->task_struct->shareds, %structArg,
+  //            sizeof(structArg))
+  //     memcpy(target_task_with_privates->privates->baseptrs,
+  //            offload_baseptrs, sizeof(offload_baseptrs)
+  //     memcpy(target_task_with_privates->privates->ptrs,
+  //            offload_ptrs, sizeof(offload_ptrs)
+  //     memcpy(target_task_with_privates->privates->sizes,
+  //            offload_sizes, sizeof(offload_sizes)
   //     dependencies_array = ...
   //     ;; if nowait not present
   //     call @__kmpc_omp_wait_deps(..., dependencies_array)
   //     call @__kmpc_omp_task_begin_if0(...)
   //     call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
-  //     %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
+  //     %target_task_with_privates)
+  //     call @__kmpc_omp_task_complete_if0(...)
   //   }
   //
   //   define internal void @.omp_target_task_proxy_func(i32 %thread.id,
   //                                                     ptr %task) {
   //       %structArg = alloca {ptr, ptr, ptr}
-  //       %shared_data = load (getelementptr %task, 0, 0)
-  //       mempcy(%structArg, %shared_data, sizeof(structArg))
-  //       kernel_launch_function(%thread.id, %structArg)
+  //       %task_ptr = getelementptr(%task, 0, 0)
+  //       %shared_data = load (getelementptr %task_ptr, 0, 0)
+  //       mempcy(%structArg, %shared_data, sizeof(%structArg))
+  //
+  //       %offloading_arrays = getelementptr(%task, 0, 1)
+  //       %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
+  //       %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
+  //       %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
+  //       kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
+  //                              %offload_sizes, %structArg)
   //   }
   //
   //   We need the proxy function because the signature of the task entry point
@@ -7343,21 +7466,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   //   that of the kernel_launch function.
   //
   //   kernel_launch_function is generated by emitKernelLaunch and has the
-  //   always_inline attribute.
-  //   void kernel_launch_function(thread_id,
-  //                               structArg) alwaysinline {
+  //   always_inline attribute. For this example, it'll look like so:
+  //   void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
+  //                               %offload_sizes,  %structArg) alwaysinline {
   //       %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
-  //       offload_baseptrs = load(getelementptr structArg, 0, 0)
-  //       offload_ptrs = load(getelementptr structArg, 0, 1)
-  //       offload_mappers = load(getelementptr structArg, 0, 2)
+  //       ; load aggregated data from %structArg
   //       ; setup kernel_args using offload_baseptrs, offload_ptrs and
-  //       ; offload_mappers
+  //       ; offload_sizes
   //       call i32 @__tgt_target_kernel(...,
   //                                     outlined_device_function,
   //                                     ptr %kernel_args)
   //   }
-  //   void outlined_device_function(ptr a, ptr b, ptr c) {
-  //      *a = *b + *c
+  //   void outlined_device_function(ptr a, ptr b, ptr n) {
+  //     n = *n_ptr;
+  //     do i = 1, 10
+  //       a(i) = b(i) +  n
   //   }
   //
   BasicBlock *TargetTaskBodyBB =
@@ -7378,6 +7501,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
       Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
 
+  // Generate the task body which will subsequently be outlined.
   Builder.restoreIP(TargetTaskBodyIP);
   if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
     return Err;
@@ -7396,15 +7520,57 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
             /*IsFinished=*/true);
 
-  OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
-                      DeviceID](Function &OutlinedFn) mutable {
+  SmallVector<Value *, 2> OffloadingArraysToPrivatize;
+  bool NeedsTargetTask = HasNoWait && DeviceID;
+  if (NeedsTargetTask) {
+    for (auto *V :
+         {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
+          RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
+          RTArgs.SizesArray}) {
+      if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
+        OffloadingArraysToPrivatize.push_back(V);
+        OI.ExcludeArgsFromAggregate.push_back(V);
+      }
+    }
+  }
+  OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
+                      DeviceID, OffloadingArraysToPrivatize](
+                         Function &OutlinedFn) mutable {
     assert(OutlinedFn.hasOneUse() &&
            "there must be a single user for the outlined function");
 
     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-    bool HasShareds = StaleCI->arg_size() > 1;
 
-    Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
+    // The first argument of StaleCI is always the thread id.
+    // The next few arguments are the pointers to offloading arrays
+    // if any. (see OffloadingArraysToPrivatize)
+    // Finally, all other local values that are live-in into the outlined region
+    // end up in a structure whose pointer is passed as the last argument. This
+    // piece of data is passed in the "shared" field of the task structure. So,
+    // we know we have to pass shareds to the task if the number of arguments is
+    // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
+    // thread id. Further, for safety, we assert that the number of arguments of
+    // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
+    const unsigned int NumStaleCIArgs = StaleCI->arg_size();
+    bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
+    assert(
+        !HasShareds ||
+        NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2) &&
+            "Wrong number of arguments for StaleCI when shareds are present");
+    int SharedArgOperandNo =
+        HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
+
+    StructType *TaskWithPrivatesTy =
+        createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
+    StructType *PrivatesTy = nullptr;
+
+    if (!OffloadingArraysToPrivatize.empty())
+      PrivatesTy =
+          static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
+
+    Function *ProxyFn = emitTargetTaskProxyFunction(
+        *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
+        OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
 
     LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
                       << "\n");
@@ -7422,7 +7588,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
     // If `HasNoWait == true`, we call  @__kmpc_omp_target_task_alloc to provide
     // the DeviceID to the deferred task and also since
     // @__kmpc_omp_target_task_alloc creates an untied/async task.
-    bool NeedsTargetTask = HasNoWait && DeviceID;
     Function *TaskAllocFn =
         !NeedsTargetTask
             ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
@@ -7435,17 +7600,19 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
 
     // Argument - `sizeof_kmp_task_t` (TaskSize)
     // Tasksize refers to the size in bytes of kmp_task_t data structure
-    // including private vars accessed in task.
-    // TODO: add kmp_task_t_with_privates (privates)
-    Value *TaskSize =
-        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
+    // plus any other data to be passed to the target task, if any, which
+    // is packed into a struct. kmp_task_t and the struct so created are
+    // packed into a wrapper struct whose type is TaskWithPrivatesTy.
+    Value *TaskSize = Builder.getInt64(
+        M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
 
     // Argument - `sizeof_shareds` (SharedsSize)
     // SharedsSize refers to the shareds array size in the kmp_task_t data
     // structure.
     Value *SharedsSize = Builder.getInt64(0);
     if (HasShareds) {
-      auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+      auto *ArgStructAlloca =
+          dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
       assert(ArgStructAlloca &&
              "Unable to find the alloca instruction corresponding to arguments "
              "for extracted function");
@@ -7483,13 +7650,32 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
 
     TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
 
+    Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
     if (HasShareds) {
-      Value *Shareds = StaleCI->getArgOperand(1);
-      Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
-      Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+      Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
+      Value *TaskShareds = loadSharedDataFromTaskDescriptor(
+          *this, Builder, TaskData, TaskWithPrivatesTy);
       Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
                            SharedsSize);
     }
+    if (!OffloadingArraysToPrivatize.empty()) {
+      Value *Privates =
+          Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
+      for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
+        Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
+        Type *ArrayType = getOffloadingArrayType(PtrToPrivatize);
+        assert(ArrayType && "ArrayType cannot be nullptr");
+
+        Type *ElementType = PrivatesTy->getElementType(i);
+        assert(ElementType == ArrayType &&
+               "ElementType should match ArrayType");
+
+        Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
+        Builder.CreateMemCpy(
+            Dst, Alignment, PtrToPrivatize, Alignment,
+            Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
+      }
+    }
 
     Value *DepArray = emitTaskDependencies(*this, Dependencies);
 
@@ -7635,9 +7821,10 @@ static void emitTargetCall(
         // Arguments that are intended to be directly forwarded to an
         // emitKernelLaunch call are pased as nullptr, since
         // OutlinedFnID=nullptr results in that call not being done.
+        OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
         return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
                                          /*RTLoc=*/nullptr, AllocaIP,
-                                         Dependencies, HasNoWait);
+                                         Dependencies, EmptyRTArgs, HasNoWait);
       }
       return EmitTargetCallFallbackCB(Builder.saveIP());
     }());
@@ -7649,6 +7836,7 @@ static void emitTargetCall(
   auto &&EmitTargetCallThen =
       [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
           OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
+    Info.HasNoWait = HasNoWait;
     OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
     OpenMPIRBuilder::TargetDataRTArgs RTArgs;
     if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
@@ -7726,7 +7914,8 @@ static void emitTargetCall(
       // explicit generation of the target task.
       if (RequiresOuterTargetTask)
         return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
-                                         Dependencies, HasNoWait);
+                                         Dependencies, KArgs.RTArgs,
+                                         Info.HasNoWait);
 
       return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
                                          EmitTargetCallFallbackCB, KArgs,
diff --git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
index f2948c651013..0f2437639319 100644
--- a/mlir/test/Target/LLVMIR/omptarget-depend.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
@@ -126,7 +126,8 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a
 // CHECK-DAG:  %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 
 // CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func)
-// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8
+// CHECK: %[[SHARED_PTR:.+]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASKDATA]], i32 0, i32 0
+// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[SHARED_PTR]], align 8
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false)
 
 // CHECK: %[[DEP_INFO:.+]]  = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0
diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
index b487b31d5447..5eee7b7d7d97 100644
--- a/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
@@ -13,19 +13,48 @@ module attributes {omp.target_triples = ["dummy-target-triple"]} {
     }
     llvm.return
   }
+}
 
+// CHECK: %struct.[[TSK_WTH_PRVTS:.*]] = type { %struct.kmp_task_ompbuilder_t, %struct.[[PRVTS:.*]] }
+// CHECK: %struct.kmp_task_ompbuilder_t = type { ptr, ptr, i32, ptr, ptr }
+// CHECK: %struct.[[PRVTS]] = type { [1 x ptr], [1 x ptr] }
 
 // CHECK: define void @_QPfoo() {
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[BASEPTRS:.*]] = alloca [1 x ptr], align 8
+// CHECK: %[[PTRS:.*]] = alloca [1 x ptr], align 8
+// CHECK: %[[MAPPERS:.*]] = alloca [1 x ptr], align 8
 
-// CHECK:   %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
-// CHECK-SAME:     (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME:     @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK: getelementptr inbounds [1 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: getelementptr inbounds [1 x ptr], ptr %[[PTRS]], i32 0, i32 0
+// CHECK: %[[BASEPTRS_GEP:.*]] = getelementptr inbounds [1 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: %[[PTRS_GEP:.*]] = getelementptr inbounds [1 x ptr], ptr %[[PTRS]], i32 0, i32 0
 
-// CHECK:   call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
+
+// CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
+// CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
+// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK: %[[TSK_PTR:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TSK_PTR]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHAREDS_PTR]], ptr align 1 %[[STRUCTARG]], i64 8, i1 false)
+// CHECK: %[[VAL_50:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK]], i32 0, i32 1
+// CHECK: %[[VAL_51:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 0
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_51]], ptr align 1 %[[BASEPTRS_GEP]], i64 8, i1 false)
+// CHECK: %[[VAL_53:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 1
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_53]], ptr align 1 %[[PTRS_GEP]], i64 8, i1 false)
+// CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
 // CHECK: }
 
+// CHECK: define internal void @[[WORKER:.*]](i32 {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}) {
 
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK:   call void @_QPfoo..omp_par(i32 %{{.*}}, ptr %{{.*}})
-// CHECK: }
-}
+// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %[[THREAD_ID_PARAM:.*]], ptr %[[TASK_DESC_PARAM:.*]]) {
+// CHECK: %[[PRIVATE_DATA:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 1
+// CHECK: %[[BASEPTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 0
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 1
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[TASK:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASK]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[STRUCTARG]], ptr align 1 %[[SHAREDS_PTR]], i64 8, i1 false)
+// CHECK:   call void @[[WORKER]](i32 %{{.*}}, ptr %{{.*}})
diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait.mlir
new file mode 100644
index 000000000000..19333c44322f
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-nowait.mlir
@@ -0,0 +1,70 @@
+// RUN: mlir-translate -mlir-to-llvmir %s 2>&1 | FileCheck %s
+
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @launch_(%arg0: !llvm.ptr {fir.bindc_name = "a", llvm.nocapture}) {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x f64 {bindc_name = "n"} : (i64) -> !llvm.ptr
+    %2 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, f64) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%2 : !llvm.ptr)  -> !llvm.ptr {name = ""}
+    %4 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(to) capture(ByRef) members(%3 : [0] : !llvm.ptr) -> !llvm.ptr {name = "a"}
+    %5 = omp.map.info var_ptr(%1 : !llvm.ptr, f64) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"}
+    omp.target nowait map_entries(%4 -> %arg1, %5 -> %arg2, %3 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+      %two_f = llvm.mlir.constant(2.000000e+00 : f64) : f64
+      %one_i = llvm.mlir.constant(1 : index) : i64
+      %6 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+      %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr
+      %8 = llvm.getelementptr %7[%one_i] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+      %9 = llvm.load %8 : !llvm.ptr -> f64
+      %10 = llvm.fmul %9, %two_f {fastmathFlags = #llvm.fastmath<contract>} : f64
+      llvm.store %10, %8 : f64, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK: %struct.[[TSK_WTH_PRVTS:.*]] = type { %struct.kmp_task_ompbuilder_t, %struct.[[PRVTS:.*]] }
+// CHECK: %struct.kmp_task_ompbuilder_t = type { ptr, ptr, i32, ptr, ptr }
+// CHECK: %struct.[[PRVTS]] = type { [5 x ptr], [5 x ptr], [5 x i64] }
+
+// CHECK: define void @launch_(ptr captures(none) %0)
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[BASEPTRS:.*]] = alloca [5 x ptr], align 8
+// CHECK: %[[PTRS:.*]] = alloca [5 x ptr], align 8
+// CHECK: %[[MAPPERS:.*]] = alloca [5 x ptr], align 8
+// CHECK: %[[SIZES:.*]] = alloca [5 x i64], align 4
+
+
+// CHECK: %[[VAL_20:.*]] = getelementptr inbounds [5 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: %[[BASEPTRS_GEP:.*]] = getelementptr inbounds [5 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: %[[PTRS_GEP:.*]] = getelementptr inbounds [5 x ptr], ptr %[[PTRS]], i32 0, i32 0
+// CHECK: %[[SIZES_GEP:.*]] = getelementptr inbounds [5 x i64], ptr %[[SIZES]], i32 0, i32 0
+
+// CHECK: %[[GL_THRD_NUM:.*]] = call i32 @__kmpc_global_thread_num
+// CHECK: %[[TASK_DESC:.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @4, i32 {{.*}}, i32 0, i64 160, i64 16, ptr [[TGT_TSK_PRXY_FNC:.*]], i64 -1)
+// CHECK: %[[TSK_PTR:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TSK_PTR]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHAREDS_PTR]], ptr align 1 %[[STRUCTARG]], i64 16, i1 false)
+// CHECK: %[[VAL_50:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC]], i32 0, i32 1
+// CHECK: %[[VAL_51:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 0
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_51]], ptr align 1 %[[BASEPTRS_GEP]], i64 40, i1 false)
+// CHECK: %[[VAL_53:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 1
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_53]], ptr align 1 %[[PTRS_GEP]], i64 40, i1 false)
+// CHECK: %[[VAL_54:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 2
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_54]], ptr align 1 %[[SIZES_GEP]], i64 40, i1 false)
+// CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_omp_task(ptr @4, i32 %[[GL_THRD_NUM]], ptr %[[TASK_DESC]])
+
+// CHECK: define internal void @[[WORKER:.*]](i32 {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}) {
+
+// CHECK: define internal void [[TGT_TSK_PRXY_FNC]](i32 %[[THREAD_ID_PARAM:.*]], ptr %[[TASK_DESC_PARAM:.*]]) {
+// CHECK: %[[PRIVATE_DATA:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 1
+// CHECK: %[[BASEPTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 0
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 1
+// CHECK: %[[SIZES:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 2
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[TASK:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASK]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[STRUCTARG]], ptr align 1 %[[SHAREDS_PTR]], i64 16, i1 false)
+// CHECK: call void @[[WORKER]](i32 %[[THREAD_ID_PARAM]], ptr %[[BASEPTRS]], ptr %[[PTRS]], ptr %[[SIZES]], ptr %[[STRUCTARG]])
diff --git a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
index 8124d02ef217..dba8c553aaca 100644
--- a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
@@ -14,25 +14,20 @@ llvm.func @_QPopenmp_target_data_enter() {
 
 // CHECK:   %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
 // CHECK-SAME:     (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME:     @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK-SAME:     @[[TASK_PROXY_FUNC_ENTER:.*]], i64 {{.*}})
 
 // CHECK:   call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) {
-// CHECK:   %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0
-// CHECK:   %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8
-// CHECK:   %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1
-// CHECK:   %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8
-
+// CHECK: define internal void @[[TASK_BODY_FUNC_ENTER:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) {
 // CHECK:  call void @__tgt_target_data_begin_nowait_mapper(
 // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1,
-// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]],
+// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]],
 // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null)
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK:   call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}})
+// CHECK: define internal void @[[TASK_PROXY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}) {
+// CHECK:   call void @[[TASK_BODY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}, ptr %{{.*}})
 // CHECK: }
 
 // -----
@@ -51,25 +46,20 @@ llvm.func @_QPopenmp_target_data_update() {
 
 // CHECK:   %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
 // CHECK-SAME:     (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME:     @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK-SAME:     @[[TASK_PROXY_FUNC_UPDATE:.*]], i64 {{.*}})
 
 // CHECK:   call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) {
-// CHECK:   %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0
-// CHECK:   %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8
-// CHECK:   %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1
-// CHECK:   %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8
-
+// CHECK: define internal void @[[TASK_BODY_FUNC_UPDATE:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) {
 // CHECK:  call void @__tgt_target_data_update_nowait_mapper(
 // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1,
-// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]],
+// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]],
 // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null)
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK:   call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}})
+// CHECK: define internal void @[[TASK_PROXY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}}) {
+// CHECK:   call void @[[TASK_BODY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}})
 // CHECK: }
 
 // -----
@@ -88,23 +78,18 @@ llvm.func @_QPopenmp_target_data_exit() {
 
 // CHECK:   %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
 // CHECK-SAME:     (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME:     @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK-SAME:     @[[TASK_PROXY_FUNC_EXIT:.*]], i64 {{.*}})
 
 // CHECK:   call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) {
-// CHECK:   %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0
-// CHECK:   %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8
-// CHECK:   %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1
-// CHECK:   %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8
-
+// CHECK: define internal void @[[TASK_BODY_FUNC_EXIT:.*]](i32 %{{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) {
 // CHECK:  call void @__tgt_target_data_end_nowait_mapper(
 // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1,
-// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]],
+// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]],
 // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null)
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK:   call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}})
+// CHECK: define internal void @[[TASK_PROXY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}}) {
+// CHECK:   call void @[[TASK_BODY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}})
 // CHECK: }

From c7d85813fda88329979ae6c091d59a60833a9765 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 16 Jun 2025 16:31:20 +0100
Subject: [PATCH 0502/1322] [IndVars] Add tests showing missed simplifications.

---
 .../simplify-icmp-operands-order.ll           | 193 ++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll

diff --git a/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
new file mode 100644
index 000000000000..b0dbbd5eaedf
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p indvars -S %s | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+
+declare void @use(ptr)
+declare void @use.i64(i64)
+
+define i64 @test_simplifycompare_rhs_constant(i64 %num_bytes, ptr %src) {
+; CHECK-LABEL: define i64 @test_simplifycompare_rhs_constant(
+; CHECK-SAME: i64 [[NUM_BYTES:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ne i64 [[NUM_BYTES]], 0
+; CHECK-NEXT:    [[COND_I:%.*]] = zext i1 [[CMP_NOT_I]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[C_0:%.*]] = icmp ule i64 [[IV]], [[COND_I]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[C_0]])
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i32 [[L]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    call void @use(ptr [[SRC]])
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    br label %[[LOOP]]
+;
+entry:
+  %cmp.not.i = icmp ne i64 %num_bytes, 0
+  %cond.i = zext i1 %cmp.not.i to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ]
+  %c.0 = icmp ule i64 %iv, %cond.i
+  tail call void @llvm.assume(i1 %c.0)
+  %gep.src = getelementptr i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src, align 4
+  %c.1 = icmp eq i32 %l, 0
+  br i1 %c.1, label %then, label %loop.latch
+
+then:
+  call void @use(ptr %src)
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  br label %loop
+}
+
+define void @test_simplifycompare_rhs_not_constant1() {
+; CHECK-LABEL: define void @test_simplifycompare_rhs_not_constant1() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[P:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[P]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 -8
+; CHECK-NEXT:    call void @use(ptr [[PTR_IV]])
+; CHECK-NEXT:    [[EC:%.*]] = icmp ult ptr [[PTR_IV_NEXT]], [[P]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p = alloca i64, align 8
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %p, %entry ], [ %ptr.iv.next, %loop ]
+  %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 -8
+  call void @use(ptr %ptr.iv)
+  %ec = icmp ult ptr %ptr.iv.next, %p
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_simplifycompare_rhs_not_constant2(i32 %x) {
+; CHECK-LABEL: define void @test_simplifycompare_rhs_not_constant2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw i64 [[INDVARS_IV:%.*]], 2
+; CHECK-NEXT:    br label %[[OUTER_HEADER]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV]] = phi i64 [ [[INDVARS_IV_NEXT]], %[[OUTER_HEADER_LOOPEXIT]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[OUTER_HEADER_LOOPEXIT]] ]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp sgt i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[EXIT_LOOP_PREHEADER:.*]], label %[[OUTER_LATCH_PREHEADER:.*]]
+; CHECK:       [[EXIT_LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV_LCSSA:%.*]] = phi i64 [ [[INDVARS_IV]], %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    br label %[[EXIT_LOOP:.*]]
+; CHECK:       [[OUTER_LATCH_PREHEADER]]:
+; CHECK-NEXT:    [[IV_1_NEXT]] = add nuw nsw i32 [[IV_1]], 2
+; CHECK-NEXT:    br label %[[OUTER_LATCH:.*]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ 0, %[[OUTER_LATCH_PREHEADER]] ], [ [[X]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i32 [[P]], [[IV_1_NEXT]]
+; CHECK-NEXT:    br i1 [[C_2]], label %[[OUTER_LATCH]], label %[[OUTER_HEADER_LOOPEXIT]]
+; CHECK:       [[EXIT_LOOP]]:
+; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_LCSSA]], %[[EXIT_LOOP_PREHEADER]] ], [ [[INDVARS_IV_NEXT2:%.*]], %[[EXIT_LOOP]] ]
+; CHECK-NEXT:    call void @use.i64(i64 [[INDVARS_IV1]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], 1
+; CHECK-NEXT:    br label %[[EXIT_LOOP]]
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %outer.latch ]
+  %c.1 = icmp sgt i32 %x, 0
+  br i1 %c.1, label %exit.loop, label %outer.latch.preheader
+
+outer.latch.preheader:
+  %iv.1.next = add nsw i32 %iv.1, 2
+  br label %outer.latch
+
+outer.latch:
+  %p = phi i32 [ 0, %outer.latch.preheader ], [ %x, %outer.latch ]
+  %c.2 = icmp ult i32 %p, %iv.1.next
+  br i1 %c.2, label %outer.latch, label %outer.header
+
+exit.loop:
+  %iv.2 = phi i32 [ %iv.1, %outer.header ], [ %iv.2.next, %exit.loop ]
+  %iv.2.ext = zext i32 %iv.2 to i64
+  call void @use.i64(i64 %iv.2.ext)
+  %iv.2.next = add nsw i32 %iv.2, 1
+  br label %exit.loop
+}
+
+define void @test_simplifycompare_rhs_addrec(i32 %x) {
+; CHECK-LABEL: define void @test_simplifycompare_rhs_addrec(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV:%.*]], 2
+; CHECK-NEXT:    br label %[[OUTER_HEADER]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV]] = phi i64 [ [[INDVARS_IV_NEXT]], %[[OUTER_HEADER_LOOPEXIT]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ 2, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[OUTER_HEADER_LOOPEXIT]] ]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp sgt i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[OUTER_EXIT:.*]], label %[[OUTER_LATCH_PREHEADER:.*]]
+; CHECK:       [[OUTER_LATCH_PREHEADER]]:
+; CHECK-NEXT:    [[IV_1_NEXT]] = add nuw nsw i32 [[IV_1]], 2
+; CHECK-NEXT:    br label %[[OUTER_LATCH:.*]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ [[X]], %[[OUTER_LATCH]] ], [ 0, %[[OUTER_LATCH_PREHEADER]] ]
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i32 [[P]], [[IV_1_NEXT]]
+; CHECK-NEXT:    br i1 [[C_2]], label %[[OUTER_LATCH]], label %[[OUTER_HEADER_LOOPEXIT]]
+; CHECK:       [[OUTER_EXIT]]:
+; CHECK-NEXT:    [[INDVARS_IV_LCSSA:%.*]] = phi i64 [ [[INDVARS_IV]], %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    br label %[[EXIT_LOOP:.*]]
+; CHECK:       [[EXIT_LOOP]]:
+; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], %[[EXIT_LOOP]] ], [ [[INDVARS_IV_LCSSA]], %[[OUTER_EXIT]] ]
+; CHECK-NEXT:    call void @use.i64(i64 [[INDVARS_IV1]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], 1
+; CHECK-NEXT:    br label %[[EXIT_LOOP]]
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %iv.1 = phi i32 [ 2, %entry ], [ %iv.1.next, %outer.latch ]
+  %c.1 = icmp sgt i32 %x, 0
+  br i1 %c.1, label %outer.exit, label %outer.latch.preheader
+
+outer.latch.preheader:
+  %iv.1.next = add nuw nsw i32 %iv.1, 2
+  br label %outer.latch
+
+outer.latch:
+  %p = phi i32 [ %x, %outer.latch ], [ 0, %outer.latch.preheader ]
+  %c.2 = icmp ult i32 %p, %iv.1.next
+  br i1 %c.2, label %outer.latch, label %outer.header
+
+outer.exit:
+  %sub = add nsw i32 %iv.1, -2
+  br label %exit.loop
+
+exit.loop:
+  %iv.2 = phi i32 [ %sub, %outer.exit ], [ %iv.2.next, %exit.loop ]
+  %iv.2.ext = sext i32 %iv.2 to i64
+  call void @use.i64(i64 %iv.2.ext)
+  %iv.2.next = add nsw i32 %iv.2, 1
+  br label %exit.loop
+}

From a5f0525d4b3edba50706cb0e4b9a48f0691e2b4c Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Mon, 16 Jun 2025 16:47:55 +0100
Subject: [PATCH 0503/1322] [AArch64][SelectionDAG] Enable new partial
 reduction lowering by default (#143565)

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   96 +-
 .../neon-partial-reduce-dot-product.ll        | 1303 ++++++-----
 .../sve-fixed-length-partial-reduce.ll        |    6 +-
 .../AArch64/sve-partial-reduce-dot-product.ll | 1926 +++++++++--------
 .../AArch64/sve-partial-reduce-wide-add.ll    |  290 +--
 5 files changed, 1804 insertions(+), 1817 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7519ac5260a6..c86aed7b38c8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -153,13 +153,6 @@ cl::opt<bool> EnableSVEGISel(
     cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
     cl::init(false));
 
-// FIXME : This is a temporary flag, and is used to help transition to
-// performing lowering the proper way using the new PARTIAL_REDUCE_MLA ISD
-// nodes.
-static cl::opt<bool> EnablePartialReduceNodes(
-    "aarch64-enable-partial-reduce-nodes", cl::init(false), cl::ReallyHidden,
-    cl::desc("Use the new method of lowering partial reductions."));
-
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -1457,7 +1450,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
       setOperationAction(ISD::FADD, VT, Custom);
 
-    if (EnablePartialReduceNodes && Subtarget->hasDotProd()) {
+    if (Subtarget->hasDotProd()) {
       static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
                                         ISD::PARTIAL_REDUCE_UMLA};
 
@@ -1895,7 +1888,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   }
 
   // Handle partial reduction operations
-  if (EnablePartialReduceNodes && Subtarget->isSVEorStreamingSVEAvailable()) {
+  if (Subtarget->isSVEorStreamingSVEAvailable()) {
     // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
     // Other pairs will default to 'Expand'.
     static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
@@ -1957,17 +1950,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
                          Custom);
 
-      if (EnablePartialReduceNodes) {
-        static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
-                                          ISD::PARTIAL_REDUCE_UMLA};
-        // Must be lowered to SVE instructions.
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
-      }
+      static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+                                        ISD::PARTIAL_REDUCE_UMLA};
+      // Must be lowered to SVE instructions.
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
     }
   }
 
@@ -2165,16 +2156,6 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
   assert(I->getIntrinsicID() ==
              Intrinsic::experimental_vector_partial_reduce_add &&
          "Unexpected intrinsic!");
-  if (EnablePartialReduceNodes)
-    return true;
-
-  EVT VT = EVT::getEVT(I->getType());
-  auto Op1 = I->getOperand(1);
-  EVT Op1VT = EVT::getEVT(Op1->getType());
-  if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
-      (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
-       VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
-    return false;
   return true;
 }
 
@@ -2252,37 +2233,32 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
   bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
 
-  if (EnablePartialReduceNodes) {
-    static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
-                                      ISD::PARTIAL_REDUCE_UMLA};
-    unsigned NumElts = VT.getVectorNumElements();
-    if (VT.getVectorElementType() == MVT::i64) {
-      setPartialReduceMLAAction(MLAOps, VT,
-                                MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
-    } else if (VT.getVectorElementType() == MVT::i32) {
-      setPartialReduceMLAAction(MLAOps, VT,
+  static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+                                    ISD::PARTIAL_REDUCE_UMLA};
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.getVectorElementType() == MVT::i64) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
+  } else if (VT.getVectorElementType() == MVT::i32) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
+  } else if (VT.getVectorElementType() == MVT::i16) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
+  }
+  if (Subtarget->hasMatMulInt8()) {
+    if (VT.getVectorElementType() == MVT::i32)
+      setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
                                 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
-    } else if (VT.getVectorElementType() == MVT::i16) {
-      setPartialReduceMLAAction(MLAOps, VT,
-                                MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
-    }
-
-    if (Subtarget->hasMatMulInt8()) {
-      if (VT.getVectorElementType() == MVT::i32)
-        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
-                                  MVT::getVectorVT(MVT::i8, NumElts * 4),
-                                  Custom);
-      else if (VT.getVectorElementType() == MVT::i64)
-        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
-                                  MVT::getVectorVT(MVT::i8, NumElts * 8),
-                                  Custom);
-    }
+    else if (VT.getVectorElementType() == MVT::i64)
+      setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
+                                MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
   }
 
   // Lower fixed length vector operations to scalable equivalents.
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 0c7b3c7d3c13..0ea80a075fae 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -1,15 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NEWLOWERING-I8MM
+; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NODOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT-I8MM
 
 define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: udot:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    umull v3.8h, v2.8b, v1.8b
@@ -19,6 +13,16 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = zext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -27,22 +31,6 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
-; CHECK-DOT-LABEL: udot_in_loop:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-DOT-NEXT:    mov x8, xzr
-; CHECK-DOT-NEXT:  .LBB1_1: // %vector.body
-; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
-; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
-; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
-; CHECK-DOT-NEXT:    add x8, x8, #16
-; CHECK-DOT-NEXT:    udot v1.4s, v2.16b, v3.16b
-; CHECK-DOT-NEXT:    cmp x8, #16
-; CHECK-DOT-NEXT:    b.ne .LBB1_1
-; CHECK-DOT-NEXT:  // %bb.2: // %end
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_in_loop:
 ; CHECK-NODOT:       // %bb.0: // %entry
 ; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
@@ -63,6 +51,38 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
 ; CHECK-NODOT-NEXT:    b.ne .LBB1_1
 ; CHECK-NODOT-NEXT:  // %bb.2: // %end
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB1_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    b.ne .LBB1_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB1_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB1_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -86,11 +106,6 @@ end:
 }
 
 define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: udot_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    umull v1.8h, v2.8b, v1.8b
@@ -105,6 +120,16 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -113,11 +138,6 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 }
 
 define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: sdot:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    smull v3.8h, v2.8b, v1.8b
@@ -127,6 +147,16 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -135,11 +165,6 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: sdot_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    smull v1.8h, v2.8b, v1.8b
@@ -154,6 +179,16 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <8 x i8> %u to <8 x i32>
   %s.wide = sext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -162,27 +197,34 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 }
 
 define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-NOI8MM-LABEL: usdot:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -191,60 +233,67 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){
-; CHECK-NOI8MM-LABEL: usdot_in_loop:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #16
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v4.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v2.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB6_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_in_loop:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB6_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #16
+; CHECK-NODOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB6_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    b.ne .LBB6_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB6_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-DOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-DOT-NEXT:    b.ne .LBB6_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB6_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB6_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB6_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -268,32 +317,44 @@ end:
 }
 
 define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
-; CHECK-NOI8MM-LABEL: usdot_narrow:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
-; CHECK-NOI8MM-NEXT:    smull2 v1.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_narrow:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-DOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-DOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-DOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-DOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-DOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_narrow:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = sext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -302,27 +363,34 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 }
 
 define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
-; CHECK-NOI8MM-LABEL: sudot:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %s.wide = sext <16 x i8> %u to <16 x i32>
   %u.wide = zext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %u.wide, %s.wide
@@ -331,60 +399,67 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
 }
 
 define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){
-; CHECK-NOI8MM-LABEL: sudot_in_loop:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB9_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #16
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v4.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v2.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB9_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot_in_loop:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB9_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #16
+; CHECK-NODOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB9_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB9_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v1.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    b.ne .LBB9_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB9_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-DOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-DOT-NEXT:    b.ne .LBB9_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB9_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v1.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB9_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB9_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    usdot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB9_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -408,32 +483,44 @@ end:
 }
 
 define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
-; CHECK-NOI8MM-LABEL: sudot_narrow:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
-; CHECK-NOI8MM-NEXT:    smull2 v1.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot_narrow:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-DOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-DOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-DOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-DOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-DOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot_narrow:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -460,21 +547,21 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
 ; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: udot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: udot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: udot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %b.wide = zext <16 x i8> %b to <16 x i64>
@@ -503,21 +590,21 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
 ; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sdot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sdot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sdot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %b.wide = sext <16 x i8> %b to <16 x i64>
@@ -528,45 +615,61 @@ entry:
 }
 
 define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
-; CHECK-NOI8MM-LABEL: usdot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v7.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v16.4s, v5.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v17.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v4.4s, v4.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v16.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v7.2s, v17.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_8to64:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    ushll v6.4s, v4.4h, #0
+; CHECK-NODOT-NEXT:    ushll v7.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    sshll v16.4s, v5.4h, #0
+; CHECK-NODOT-NEXT:    sshll v17.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    ushll2 v4.4s, v4.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v5.4s, v5.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NODOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    ushll v6.4s, v4.4h, #0
+; CHECK-DOT-NEXT:    ushll v7.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    sshll v16.4s, v5.4h, #0
+; CHECK-DOT-NEXT:    sshll v17.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v4.4s, v4.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v5.4s, v5.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-DOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %b.wide = sext <16 x i8> %b to <16 x i64>
@@ -577,45 +680,61 @@ entry:
 }
 
 define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
-; CHECK-NOI8MM-LABEL: sudot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v7.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v16.4s, v5.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v17.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v4.4s, v4.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v16.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v7.2s, v17.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot_8to64:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    sshll v6.4s, v4.4h, #0
+; CHECK-NODOT-NEXT:    sshll v7.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    ushll v16.4s, v5.4h, #0
+; CHECK-NODOT-NEXT:    ushll v17.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v5.4s, v5.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NODOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    sshll v6.4s, v4.4h, #0
+; CHECK-DOT-NEXT:    sshll v7.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    ushll v16.4s, v5.4h, #0
+; CHECK-DOT-NEXT:    ushll v17.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v5.4s, v5.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-DOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %b.wide = zext <16 x i8> %b to <16 x i64>
@@ -626,12 +745,6 @@ entry:
 }
 
 define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
-; CHECK-DOT-LABEL: udot_no_bin_op:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.16b, #1
-; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_no_bin_op:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    ushll v2.8h, v1.8b, #0
@@ -641,77 +754,53 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.16b, #1
+; CHECK-DOT-I8MM-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = zext <16 x i8> %a to <16 x i32>
   %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
   ret <4 x i32> %partial.reduce
 }
 
 define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
-; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop:
-; CHECK-NODOT:       // %bb.0: // %entry
-; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NODOT-NEXT:    mov x8, xzr
-; CHECK-NODOT-NEXT:  .LBB16_1: // %vector.body
-; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
-; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
-; CHECK-NODOT-NEXT:    add x8, x8, #16
-; CHECK-NODOT-NEXT:    cmp x8, #16
-; CHECK-NODOT-NEXT:    ushll v3.8h, v2.8b, #0
-; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v3.4h
-; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v3.8h
-; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v2.4h
-; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
-; CHECK-NODOT-NEXT:    b.ne .LBB16_1
-; CHECK-NODOT-NEXT:  // %bb.2: // %end
-; CHECK-NODOT-NEXT:    ret
-;
-; CHECK-I8MM-LABEL: udot_no_bin_op_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    movi v2.16b, #1
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB16_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q3, [x0, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    udot v1.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    b.ne .LBB16_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
-;
-; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x8, .LCPI16_0
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x9, .LCPI16_2
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x8, .LCPI16_1
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x10, .LCPI16_3
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x8, :lo12:.LCPI16_1]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q4, [x9, :lo12:.LCPI16_2]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q5, [x10, :lo12:.LCPI16_3]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB16_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q6, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v7.16b, { v6.16b }, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v16.16b, { v6.16b }, v4.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v17.16b, { v6.16b }, v5.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v6.16b, { v6.16b }, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v2.4s, v2.4s, v17.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v7.4s, v16.4s, v7.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v2.4s, v2.4s, v7.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v2.4s, v2.4s, v6.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB16_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-COMMON-LABEL: udot_no_bin_op_in_loop:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    adrp x8, .LCPI16_0
+; CHECK-COMMON-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-COMMON-NEXT:    adrp x9, .LCPI16_2
+; CHECK-COMMON-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-COMMON-NEXT:    adrp x8, .LCPI16_1
+; CHECK-COMMON-NEXT:    adrp x10, .LCPI16_3
+; CHECK-COMMON-NEXT:    ldr q3, [x8, :lo12:.LCPI16_1]
+; CHECK-COMMON-NEXT:    ldr q4, [x9, :lo12:.LCPI16_2]
+; CHECK-COMMON-NEXT:    ldr q5, [x10, :lo12:.LCPI16_3]
+; CHECK-COMMON-NEXT:    mov x8, xzr
+; CHECK-COMMON-NEXT:  .LBB16_1: // %vector.body
+; CHECK-COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-COMMON-NEXT:    ldr q6, [x0, x8]
+; CHECK-COMMON-NEXT:    mov v0.16b, v2.16b
+; CHECK-COMMON-NEXT:    add x8, x8, #16
+; CHECK-COMMON-NEXT:    cmp x8, #16
+; CHECK-COMMON-NEXT:    tbl v7.16b, { v6.16b }, v3.16b
+; CHECK-COMMON-NEXT:    tbl v16.16b, { v6.16b }, v4.16b
+; CHECK-COMMON-NEXT:    tbl v17.16b, { v6.16b }, v5.16b
+; CHECK-COMMON-NEXT:    tbl v6.16b, { v6.16b }, v1.16b
+; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v17.4s
+; CHECK-COMMON-NEXT:    add v7.4s, v16.4s, v7.4s
+; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v7.4s
+; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v6.4s
+; CHECK-COMMON-NEXT:    b.ne .LBB16_1
+; CHECK-COMMON-NEXT:  // %bb.2: // %end
+; CHECK-COMMON-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -731,12 +820,6 @@ end:
 }
 
 define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
-; CHECK-DOT-LABEL: sdot_no_bin_op:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.16b, #1
-; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_no_bin_op:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    sshll v2.8h, v1.8b, #0
@@ -746,18 +829,24 @@ define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.16b, #1
+; CHECK-DOT-I8MM-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = sext <16 x i8> %a to <16 x i32>
   %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
   ret <4 x i32> %partial.reduce
 }
 
 define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
-; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
@@ -772,18 +861,24 @@ define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.8b, #1
+; CHECK-DOT-I8MM-NEXT:    udot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = zext <8 x i8> %a to <8 x i32>
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
   ret <2 x i32> %partial.reduce
 }
 
 define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
-; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
@@ -798,6 +893,18 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.8b, #1
+; CHECK-DOT-I8MM-NEXT:    sdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = sext <8 x i8> %a to <8 x i32>
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
   ret <2 x i32> %partial.reduce
@@ -822,23 +929,23 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v3.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: udot_no_bin_op_8to64:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_8to64:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v3.16b, #1
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
   ret <4 x i64> %partial.reduce
@@ -863,35 +970,35 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v3.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sdot_no_bin_op_8to64:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_8to64:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v3.16b, #1
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
   ret <4 x i64> %partial.reduce
 }
 
 define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
-; CHECK-LABEL: not_udot:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    umull v1.8h, v2.8b, v1.8b
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: not_udot:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    umull v1.8h, v2.8b, v1.8b
+; CHECK-COMMON-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-COMMON-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-COMMON-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -900,16 +1007,16 @@ define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 }
 
 define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
-; CHECK-LABEL: not_udot_narrow:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    bic v2.4h, #255, lsl #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umull v3.4s, v2.4h, v1.4h
-; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.4h
-; CHECK-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    add v0.2s, v1.2s, v0.2s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: not_udot_narrow:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-COMMON-NEXT:    bic v2.4h, #255, lsl #8
+; CHECK-COMMON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-COMMON-NEXT:    umull v3.4s, v2.4h, v1.4h
+; CHECK-COMMON-NEXT:    umlal v0.4s, v2.4h, v1.4h
+; CHECK-COMMON-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
+; CHECK-COMMON-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-COMMON-NEXT:    ret
   %u.wide = zext <4 x i8> %u to <4 x i32>
   %s.wide = zext <4 x i8> %s to <4 x i32>
   %mult = mul nuw nsw <4 x i32> %s.wide, %u.wide
@@ -918,18 +1025,18 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
 }
 
 define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: udot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    umlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    umlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: udot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    umlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    umlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    umlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = zext <8 x i16> %a to <8 x i64>
   %b.wide = zext <8 x i8> %b to <8 x i64>
@@ -939,18 +1046,18 @@ entry:
 }
 
 define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: sdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: sdot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = sext <8 x i16> %a to <8 x i64>
   %b.wide = sext <8 x i8> %b to <8 x i64>
@@ -960,18 +1067,18 @@ entry:
 }
 
 define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: usdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: usdot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = zext <8 x i16> %a to <8 x i64>
   %b.wide = sext <8 x i8> %b to <8 x i64>
@@ -981,18 +1088,18 @@ entry:
 }
 
 define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: sudot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: sudot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = sext <8 x i16> %a to <8 x i64>
   %b.wide = zext <8 x i8> %b to <8 x i64>
@@ -1002,74 +1109,86 @@ entry:
 }
 
 define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) {
-; CHECK-NOI8MM-LABEL: usdot_multiple_zext_users:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB28_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    ldr q4, [x2, x8]
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v6.8h, v4.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v7.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v4.8h, v4.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #1024
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v6.4h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v7.4h, v6.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v3.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB28_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_multiple_zext_users:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB28_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    ldr q4, [x2, x8]
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    sshll v5.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll v6.8h, v4.8b, #0
+; CHECK-NODOT-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #1024
+; CHECK-NODOT-NEXT:    smlal v0.4s, v5.4h, v6.4h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v7.4h, v6.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v4.4h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v3.4h, v4.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB28_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_multiple_zext_users:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB28_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    ldr q4, [x2, x8]
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v0.4s, v4.16b, v2.16b
-; CHECK-I8MM-NEXT:    usdot v1.4s, v4.16b, v3.16b
-; CHECK-I8MM-NEXT:    cmp x8, #1024
-; CHECK-I8MM-NEXT:    b.ne .LBB28_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_multiple_zext_users:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB28_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    ldr q4, [x2, x8]
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    sshll v5.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll v6.8h, v4.8b, #0
+; CHECK-DOT-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #1024
+; CHECK-DOT-NEXT:    smlal v0.4s, v5.4h, v6.4h
+; CHECK-DOT-NEXT:    smlal v1.4s, v7.4h, v6.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v4.4h
+; CHECK-DOT-NEXT:    smlal v1.4s, v3.4h, v4.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
+; CHECK-DOT-NEXT:    b.ne .LBB28_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_multiple_zext_users:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB28_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q4, [x2, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v4.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v1.4s, v4.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #1024
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB28_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_multiple_zext_users:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB28_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q4, [x2, x8]
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v4.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    usdot v1.4s, v4.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #1024
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB28_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -1100,15 +1219,15 @@ end:
 }
 
 define <2 x i64> @udot_16to64(<2 x i64> %acc, <8 x i16> %input){
-; CHECK-LABEL: udot_16to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    uaddw v0.2d, v0.2d, v2.2s
-; CHECK-NEXT:    uaddw2 v0.2d, v0.2d, v2.4s
-; CHECK-NEXT:    uaddw v0.2d, v0.2d, v1.2s
-; CHECK-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: udot_16to64:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    uaddw v0.2d, v0.2d, v2.2s
+; CHECK-COMMON-NEXT:    uaddw2 v0.2d, v0.2d, v2.4s
+; CHECK-COMMON-NEXT:    uaddw v0.2d, v0.2d, v1.2s
+; CHECK-COMMON-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
     %input.wide = zext <8 x i16> %input to <8 x i64>
     %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %input.wide)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
index af813ff16a20..33d5ac4cd299 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,NEON
-; RUN: llc -mattr=+sve,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,SVE
-; RUN: llc -mattr=+sme,+i8mm -aarch64-enable-partial-reduce-nodes=true -force-streaming < %s | FileCheck %s --check-prefix=SME
+; RUN: llc -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,NEON
+; RUN: llc -mattr=+sve,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,SVE
+; RUN: llc -mattr=+sme,+i8mm -force-streaming < %s | FileCheck %s --check-prefix=SME
 
 target triple = "aarch64"
 
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 221a15e5c8fe..b2cde51e9961 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -1,20 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM
-; RUN: llc -mtriple=aarch64 -mattr=+sve,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SVE
-; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SVE2
-; RUN: llc -mtriple=aarch64 -mattr=+sve,+sme,+i8mm -force-streaming -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SME
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefix=CHECK-SVE2-I8MM
+; RUN: llc -mtriple=aarch64 -mattr=+sve2,+sme,+i8mm -force-streaming %s -o - | FileCheck %s --check-prefix=CHECK-SME
 
 define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: udot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -24,15 +27,20 @@ entry:
 }
 
 define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: udot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -42,15 +50,20 @@ entry:
 }
 
 define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %accc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: sdot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -60,15 +73,20 @@ entry:
 }
 
 define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: sdot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -78,36 +96,36 @@ entry:
 }
 
 define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-I8MM-LABEL: usdot:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    usdot z0.s, z1.b, z2.b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: usdot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpklo z3.h, z1.b
+; CHECK-SVE2-NEXT:    sunpklo z4.h, z2.b
+; CHECK-SVE2-NEXT:    ptrue p0.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    sunpkhi z2.h, z2.b
+; CHECK-SVE2-NEXT:    uunpklo z5.s, z3.h
+; CHECK-SVE2-NEXT:    sunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    uunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    sunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    uunpklo z5.s, z1.h
+; CHECK-SVE2-NEXT:    sunpklo z6.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z3.s, z4.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z1.s, z2.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: usdot:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NOI8MM-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z1.h
-; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: usdot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    usdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: usdot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    usdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: usdot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    usdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -117,36 +135,36 @@ entry:
 }
 
 define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-I8MM-LABEL: sudot:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    usdot z0.s, z2.b, z1.b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: sudot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpklo z3.h, z1.b
+; CHECK-SVE2-NEXT:    uunpklo z4.h, z2.b
+; CHECK-SVE2-NEXT:    ptrue p0.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    uunpkhi z2.h, z2.b
+; CHECK-SVE2-NEXT:    sunpklo z5.s, z3.h
+; CHECK-SVE2-NEXT:    uunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    sunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    uunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    sunpklo z5.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z6.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z3.s, z4.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z1.s, z2.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: sudot:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NOI8MM-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z1.h
-; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sudot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    usdot z0.s, z2.b, z1.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sudot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    usdot z0.s, z2.b, z1.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sudot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    usdot z0.s, z2.b, z1.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -156,41 +174,29 @@ entry:
 }
 
 define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: udot_8to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SVE2-I8MM-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: udot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -201,41 +207,29 @@ entry:
 }
 
 define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
-; CHECK-LABEL: sdot_8to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: sdot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -246,82 +240,62 @@ entry:
 }
 
 define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
-; CHECK-I8MM-LABEL: usdot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
-; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: usdot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpkhi z4.h, z2.b
+; CHECK-SVE2-NEXT:    uunpklo z2.h, z2.b
+; CHECK-SVE2-NEXT:    sunpkhi z5.h, z3.b
+; CHECK-SVE2-NEXT:    sunpklo z3.h, z3.b
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    uunpklo z7.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z24.s, z5.h
+; CHECK-SVE2-NEXT:    sunpklo z25.s, z3.h
+; CHECK-SVE2-NEXT:    uunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z5.s, z5.h
+; CHECK-SVE2-NEXT:    sunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    uunpklo z26.d, z6.s
+; CHECK-SVE2-NEXT:    uunpklo z27.d, z7.s
+; CHECK-SVE2-NEXT:    sunpklo z28.d, z24.s
+; CHECK-SVE2-NEXT:    sunpklo z29.d, z25.s
+; CHECK-SVE2-NEXT:    uunpkhi z6.d, z6.s
+; CHECK-SVE2-NEXT:    uunpkhi z7.d, z7.s
+; CHECK-SVE2-NEXT:    sunpkhi z24.d, z24.s
+; CHECK-SVE2-NEXT:    sunpkhi z25.d, z25.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    uunpklo z26.d, z4.s
+; CHECK-SVE2-NEXT:    sunpklo z28.d, z5.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    uunpklo z27.d, z2.s
+; CHECK-SVE2-NEXT:    sunpklo z29.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z5.d, z5.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z2.d, z3.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: usdot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.h, z3.b
-; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z3.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.d
-; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z5.h
-; CHECK-NOI8MM-NEXT:    sunpklo z25.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpklo z26.d, z6.s
-; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z7.s
-; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z24.s
-; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z25.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z24.d, z24.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z25.d, z25.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    uunpklo z26.d, z4.s
-; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z5.s
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z2.s
-; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z3.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z3.d
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: usdot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    usdot z4.s, z2.b, z3.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: usdot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    usdot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -332,82 +306,62 @@ entry:
 }
 
 define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-I8MM-LABEL: sudot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
-; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: sudot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpkhi z4.h, z2.b
+; CHECK-SVE2-NEXT:    sunpklo z2.h, z2.b
+; CHECK-SVE2-NEXT:    uunpkhi z5.h, z3.b
+; CHECK-SVE2-NEXT:    uunpklo z3.h, z3.b
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    sunpklo z7.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z24.s, z5.h
+; CHECK-SVE2-NEXT:    uunpklo z25.s, z3.h
+; CHECK-SVE2-NEXT:    sunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z5.s, z5.h
+; CHECK-SVE2-NEXT:    uunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    sunpklo z26.d, z6.s
+; CHECK-SVE2-NEXT:    sunpklo z27.d, z7.s
+; CHECK-SVE2-NEXT:    uunpklo z28.d, z24.s
+; CHECK-SVE2-NEXT:    uunpklo z29.d, z25.s
+; CHECK-SVE2-NEXT:    sunpkhi z6.d, z6.s
+; CHECK-SVE2-NEXT:    sunpkhi z7.d, z7.s
+; CHECK-SVE2-NEXT:    uunpkhi z24.d, z24.s
+; CHECK-SVE2-NEXT:    uunpkhi z25.d, z25.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    sunpklo z26.d, z4.s
+; CHECK-SVE2-NEXT:    uunpklo z28.d, z5.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    sunpklo z27.d, z2.s
+; CHECK-SVE2-NEXT:    uunpklo z29.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z5.d, z5.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z2.d, z3.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: sudot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpklo z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.h, z3.b
-; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z3.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.d
-; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z5.h
-; CHECK-NOI8MM-NEXT:    uunpklo z25.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpklo z26.d, z6.s
-; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z7.s
-; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z24.s
-; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z25.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z24.d, z24.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z25.d, z25.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    sunpklo z26.d, z4.s
-; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z5.s
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z2.s
-; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z3.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z3.d
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sudot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    usdot z4.s, z3.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: sudot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    usdot z4.s, z3.b, z2.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -418,51 +372,69 @@ entry:
 }
 
 define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: udot_no_bin_op:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_no_bin_op:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_no_bin_op:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_no_bin_op:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SME-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
   %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
   ret <vscale x 4 x i32> %partial.reduce
 }
 
 define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: sdot_no_bin_op:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_no_bin_op:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot_no_bin_op:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SME-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
   %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
   ret <vscale x 4 x i32> %partial.reduce
 }
 
 define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
-; CHECK-LABEL: udot_no_bin_op_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_no_bin_op_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_no_bin_op_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SME-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
@@ -470,17 +442,23 @@ entry:
 }
 
 define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
-; CHECK-LABEL: sdot_no_bin_op_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_no_bin_op_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot_no_bin_op_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SME-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
@@ -488,137 +466,93 @@ entry:
 }
 
 define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: udot_no_bin_op_8to64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEXT:    udot z3.s, z2.b, z4.b
-; CHECK-NEXT:    sunpklo z2.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_no_bin_op_8to64:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    udot z3.s, z2.b, z4.b
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0:
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE-NEXT:    udot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    udot z3.s, z2.b, z4.b
+; CHECK-SVE2-I8MM-NEXT:    uaddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uaddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0:
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE2-NEXT:    udot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0:
-; CHECK-NEWLOWERING-SME-NEXT:    mov z3.b, #1 // =0x1
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: udot_no_bin_op_8to64:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
   %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
   ret <vscale x 4 x i64> %partial.reduce
 }
 
 define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: sdot_no_bin_op_8to64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEXT:    sdot z3.s, z2.b, z4.b
-; CHECK-NEXT:    sunpklo z2.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_no_bin_op_8to64:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    sdot z3.s, z2.b, z4.b
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0:
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE-NEXT:    sdot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    sdot z3.s, z2.b, z4.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0:
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE2-NEXT:    sdot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0:
-; CHECK-NEWLOWERING-SME-NEXT:    mov z3.b, #1 // =0x1
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: sdot_no_bin_op_8to64:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
   %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
   ret <vscale x 4 x i64> %partial.reduce
 }
 
 define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
-; CHECK-LABEL: not_udot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_udot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-NEXT:    umlalb z0.s, z1.h, z2.h
+; CHECK-SVE2-NEXT:    umlalt z0.s, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: not_udot:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-SVE-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-SVE-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_udot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    umlalb z0.s, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    umlalt z0.s, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: not_udot:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalb z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalt z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: not_udot:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-SME-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-SME-NEXT:    umlalb z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SME-NEXT:    umlalt z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: not_udot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SME-NEXT:    umlalb z0.s, z1.h, z2.h
+; CHECK-SME-NEXT:    umlalt z0.s, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
@@ -628,47 +562,29 @@ entry:
 }
 
 define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
-; CHECK-LABEL: not_udot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_udot_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.s, z2.s, #0xffff
+; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SVE2-NEXT:    umlalb z0.d, z1.s, z2.s
+; CHECK-SVE2-NEXT:    umlalt z0.d, z1.s, z2.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: not_udot_wide:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEWLOWERING-SVE-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_udot_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.s, z2.s, #0xffff
+; CHECK-SVE2-I8MM-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SVE2-I8MM-NEXT:    umlalb z0.d, z1.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    umlalt z0.d, z1.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: not_udot_wide:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalb z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalt z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: not_udot_wide:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEWLOWERING-SME-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SME-NEXT:    umlalb z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SME-NEXT:    umlalt z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: not_udot_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.s, z2.s, #0xffff
+; CHECK-SME-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SME-NEXT:    umlalb z0.d, z1.s, z2.s
+; CHECK-SME-NEXT:    umlalt z0.d, z1.s, z2.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
   %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
@@ -678,47 +594,68 @@ entry:
 }
 
 define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: not_usdot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_usdot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: not_usdot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_usdot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: not_usdot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -728,47 +665,68 @@ entry:
 }
 
 define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: not_sudot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_sudot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: not_sudot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_sudot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: not_sudot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -778,49 +736,71 @@ entry:
 }
 
 define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: udot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -830,51 +810,74 @@ entry:
 }
 
 define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: sdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.h
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    ptrue p0.h
+; CHECK-SME-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -884,51 +887,74 @@ entry:
 }
 
 define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: usdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: usdot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.h
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: usdot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: usdot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: usdot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    ptrue p0.h
+; CHECK-SME-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -938,49 +964,71 @@ entry:
 }
 
 define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: sudot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sudot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sudot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sudot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sudot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -990,29 +1038,26 @@ entry:
 }
 
 define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: udot_nxv8i8_promote:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEXT:    add z2.d, z2.d, z4.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_nxv8i8_promote:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_nxv8i8_promote:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_nxv8i8_promote:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SME-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -1022,31 +1067,29 @@ entry:
 }
 
 define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: sdot_nxv8i8_promote:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sxtb z1.h, p0/m, z1.h
-; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEXT:    add z2.d, z2.d, z4.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_nxv8i8_promote:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.h
+; CHECK-SVE2-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-SVE2-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z1.h, p0/m, z1.h
-; CHECK-NEWLOWERING-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_nxv8i8_promote:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot_nxv8i8_promote:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    ptrue p0.h
+; CHECK-SME-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SME-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-SME-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -1056,35 +1099,26 @@ entry:
 }
 
 define <vscale x 4 x i64> @partial_reduce_only_split_acc(<vscale x 4 x i64> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
-; CHECK-LABEL: partial_reduce_only_split_acc:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    and z3.h, z3.h, #0xff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z4.s, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpkhi z5.s, z3.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpklo z7.d, z2.s
-; CHECK-NEXT:    uunpklo z24.d, z5.s
-; CHECK-NEXT:    uunpklo z25.d, z3.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NEXT:    mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NEXT:    mla z1.d, p0/m, z4.d, z5.d
-; CHECK-NEXT:    mla z0.d, p0/m, z2.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: partial_reduce_only_split_acc:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: partial_reduce_only_split_acc:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z3.h, z3.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z2.h, z3.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: partial_reduce_only_split_acc:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: partial_reduce_only_split_acc:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -1095,25 +1129,23 @@ entry:
 }
 
 define <vscale x 4 x i32> @sdot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: sdot_imm:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sub z0.s, z0.s, z3.s
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    sub z0.s, z0.s, z2.s
-; CHECK-NEXT:    sub z0.s, z0.s, z3.s
-; CHECK-NEXT:    sub z0.s, z0.s, z1.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_imm:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_imm:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
-; CHECK-NEWLOWERING-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_imm:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot_imm:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SME-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 -1)
@@ -1122,41 +1154,59 @@ entry:
 }
 
 define <vscale x 4 x i32> @sdot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: sdot_imm_does_not_fit:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z4.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_imm_does_not_fit:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpklo z2.h, z1.b
+; CHECK-SVE2-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_imm_does_not_fit:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_imm_does_not_fit:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z2.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-I8MM-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot_imm_does_not_fit:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sunpklo z2.h, z1.b
+; CHECK-SME-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SME-NEXT:    sunpklo z3.s, z2.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z4.s, z1.h
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SME-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SME-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SME-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SME-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SME-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256)
@@ -1165,27 +1215,23 @@ entry:
 }
 
 define <vscale x 4 x i32> @udot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: udot_imm:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    mov z2.s, #255 // =0xff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpklo z4.s, z3.h
-; CHECK-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NEXT:    mla z0.s, p0/m, z4.s, z2.s
-; CHECK-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z2.s
-; CHECK-NEXT:    mla z0.s, p0/m, z4.s, z2.s
-; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_imm:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_imm:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
-; CHECK-NEWLOWERING-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_imm:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-I8MM-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_imm:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SME-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 255)
@@ -1194,41 +1240,59 @@ entry:
 }
 
 define <vscale x 4 x i32> @udot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: udot_imm_does_not_fit:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpklo z3.s, z2.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_imm_does_not_fit:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpklo z2.h, z1.b
+; CHECK-SVE2-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_imm_does_not_fit:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_imm_does_not_fit:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z2.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-I8MM-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_imm_does_not_fit:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    uunpklo z2.h, z1.b
+; CHECK-SME-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SME-NEXT:    uunpklo z3.s, z2.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z1.h
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SME-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SME-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SME-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SME-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SME-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256)
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
index 428dd4c3a015..e62979d077fd 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
@@ -1,16 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK-SVE2
-; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK-SVE
-; RUN: llc -mtriple=aarch64 -mattr=+sve -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING-SVE
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING-SVE2
+; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2
 
 define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv4i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z2.d, z1.s
@@ -19,19 +11,11 @@ define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z1.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64>
     %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
@@ -39,12 +23,6 @@ entry:
 }
 
 define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z2.d, z1.s
@@ -53,19 +31,11 @@ define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z1.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 4 x i32> %input to <vscale x 4 x i64>
     %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
@@ -73,12 +43,6 @@ entry:
 }
 
 define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv8i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    saddwb z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    saddwt z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv8i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z2.s, z1.h
@@ -87,19 +51,11 @@ define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv8i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    saddwt z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32>
     %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
@@ -107,12 +63,6 @@ entry:
 }
 
 define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uaddwb z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    uaddwt z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z2.s, z1.h
@@ -121,19 +71,11 @@ define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    uaddwt z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 8 x i16> %input to <vscale x 8 x i32>
     %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
@@ -141,12 +83,6 @@ entry:
 }
 
 define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv16i8:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    saddwb z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    saddwt z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv16i8:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z2.h, z1.b
@@ -155,19 +91,11 @@ define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv16i8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    saddwt z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 16 x i8> %input to <vscale x 16 x i16>
     %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
@@ -175,12 +103,6 @@ entry:
 }
 
 define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uaddwb z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    uaddwt z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv16i8:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z2.h, z1.b
@@ -189,19 +111,11 @@ define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv16i8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    uaddwt z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 16 x i8> %input to <vscale x 16 x i16>
     %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
@@ -209,16 +123,6 @@ entry:
 }
 
 define <vscale x 2 x i32> @signed_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv4i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    ptrue p0.s
-; CHECK-SVE2-NEXT:    sxth z1.s, p0/m, z1.s
-; CHECK-SVE2-NEXT:    uunpklo z2.d, z1.s
-; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv4i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    ptrue p0.s
@@ -229,23 +133,13 @@ define <vscale x 2 x i32> @signed_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sxth z1.s, p0/m, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    sxth z1.s, p0/m, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv4i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.s
+; CHECK-SVE2-NEXT:    sxth z1.s, p0/m, z1.s
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 4 x i16> %input to <vscale x 4 x i32>
     %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
@@ -253,15 +147,6 @@ entry:
 }
 
 define <vscale x 2 x i32> @unsigned_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-SVE2-NEXT:    uunpklo z2.d, z1.s
-; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    and z1.s, z1.s, #0xffff
@@ -271,21 +156,12 @@ define <vscale x 2 x i32> @unsigned_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 4 x i16> %input to <vscale x 4 x i32>
     %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
@@ -293,18 +169,6 @@ entry:
 }
 
 define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    sunpklo z4.d, z3.s
-; CHECK-SVE2-NEXT:    sunpklo z5.d, z2.s
-; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv8i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z4.d, z3.s
@@ -317,25 +181,13 @@ define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z1.d, z1.d, z3.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z4.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z5.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    saddwt z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 8 x i32> %input to <vscale x 8 x i64>
     %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
@@ -343,18 +195,6 @@ entry:
 }
 
 define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uunpklo z4.d, z3.s
-; CHECK-SVE2-NEXT:    uunpklo z5.d, z2.s
-; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z4.d, z3.s
@@ -367,25 +207,13 @@ define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <v
 ; CHECK-SVE-NEXT:    add z1.d, z1.d, z3.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z4.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z5.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    uaddwt z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 8 x i32> %input to <vscale x 8 x i64>
     %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)

From 58d23476f0ce76c847497a880f975550a645c796 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li@intel.com>
Date: Mon, 16 Jun 2025 08:48:41 -0700
Subject: [PATCH 0504/1322] [MLIR][XeGPU] Add unroll patterns for scatter ops 
 (#143602)

Add unrolling support for create_tdesc, load, store, prefetch, and update_offset.

---------

Co-authored-by: Adam Siemieniuk <adam.siemieniuk@intel.com>
Co-authored-by: Chao Chen <chao.chen@intel.com>
---
 .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp  | 207 +++++++++++++++++-
 .../Dialect/XeGPU/xegpu-unroll-patterns.mlir  | 141 ++++++++++++
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp |  23 ++
 3 files changed, 369 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 885477fe4cbd..9c234c1e866b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -396,11 +396,214 @@ struct UnrollDpasOp : public UnrollPattern<xegpu::DpasOp> {
   }
 };
 
+struct UnrollCreateDescOp : public UnrollPattern<xegpu::CreateDescOp> {
+  using UnrollPattern<xegpu::CreateDescOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::CreateDescOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    xegpu::TensorDescType tdescTy = op.getType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
+
+    TypedValue<::mlir::VectorType> indiceVec = op.getOffsets();
+    VectorType indiceVecTy = indiceVec.getType();
+
+    SmallVector<Type> convertedIndiceTypes =
+        getUnrolledTypes(indiceVecTy, *targetShape);
+    SmallVector<Value> convertedIndiceVec =
+        pack(indiceVec, convertedIndiceTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Value> newOps;
+    for (auto indice : convertedIndiceVec) {
+      auto newOp = rewriter.create<xegpu::CreateDescOp>(loc, newTdescTy,
+                                                        op.getSource(), indice);
+      newOps.push_back(newOp);
+    }
+
+    Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter);
+    rewriter.replaceOp(op, castOp);
+
+    return success();
+  }
+};
+
+struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
+  using UnrollPattern<xegpu::LoadGatherOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::LoadGatherOp op,
+                                PatternRewriter &rewriter) const override {
+
+    Location loc = op.getLoc();
+    VectorType valueTy = llvm::dyn_cast<VectorType>(op.getValue().getType());
+    xegpu::TensorDescType tdescTy = op.getTensorDescType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    Type elemTy = tdescTy.getElementType();
+    VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
+
+    SmallVector<Type> convertedTdescTypes =
+        getUnrolledTypes(tdescTy, *targetShape);
+    SmallVector<Value> convertedTdescs = pack(
+        op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Type> convertedMaskTypes =
+        getUnrolledTypes(maskTy, *targetShape);
+    SmallVector<Value> convertedMasks =
+        pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Value> newOps;
+    for (auto [t, m] : llvm::zip(convertedTdescs, convertedMasks)) {
+      auto newOp = rewriter.create<xegpu::LoadGatherOp>(
+          loc, newValueTy, t, m, op.getTransposeAttr(), op.getL1HintAttr(),
+          op.getL2HintAttr(), op.getL3HintAttr());
+      newOps.push_back(newOp);
+    }
+
+    Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
+
+    rewriter.replaceOp(op, castOp);
+    return success();
+  }
+};
+
+struct UnrollPrefetchOp : public UnrollPattern<xegpu::PrefetchOp> {
+  using UnrollPattern<xegpu::PrefetchOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::PrefetchOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    xegpu::TensorDescType tdescTy = op.getTensorDescType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    SmallVector<Type> convertedTdescTypes =
+        getUnrolledTypes(tdescTy, *targetShape);
+    SmallVector<Value> convertedTdesc = pack(
+        op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
+
+    for (auto t : convertedTdesc)
+      rewriter.create<xegpu::PrefetchOp>(loc, TypeRange(), t, op->getAttrs());
+
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+struct UnrollStoreScatterOp : public UnrollPattern<xegpu::StoreScatterOp> {
+  using UnrollPattern<xegpu::StoreScatterOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::StoreScatterOp op,
+                                PatternRewriter &rewriter) const override {
+
+    Location loc = op.getLoc();
+    VectorType valueTy = llvm::dyn_cast<VectorType>(op.getValue().getType());
+    xegpu::TensorDescType tdescTy = op.getTensorDescType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    SmallVector<Type> convertedValTypes =
+        getUnrolledTypes(valueTy, *targetShape);
+    SmallVector<Type> convertedTdescTypes =
+        getUnrolledTypes(tdescTy, *targetShape);
+
+    SmallVector<Value> convertedValues =
+        pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
+    SmallVector<Value> convertedTdescs = pack(
+        op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Type> convertedMaskTypes =
+        getUnrolledTypes(maskTy, *targetShape);
+    SmallVector<Value> convertedMasks =
+        pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+
+    for (size_t i = 0; i < convertedValues.size(); ++i) {
+      Value v = convertedValues[i];
+      Value t = convertedTdescs[i];
+      Value m = op.getMask() ? convertedMasks[i] : nullptr;
+      rewriter.create<xegpu::StoreScatterOp>(
+          loc, v, t, m, op.getTransposeAttr(), op.getL1HintAttr(),
+          op.getL2HintAttr(), op.getL3HintAttr());
+    }
+
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
+  using UnrollPattern<xegpu::UpdateOffsetOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::UpdateOffsetOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    xegpu::TensorDescType tdescTy = op.getTensorDescType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    SmallVector<Type> convertedTdescTypes =
+        getUnrolledTypes(tdescTy, *targetShape);
+    SmallVector<Value> convertedTdesc = pack(
+        op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
+
+    TypedValue<::mlir::VectorType> offsetVec = op.getOffsets();
+    VectorType offsetVecTy = offsetVec.getType();
+    SmallVector<Type> convertedOffsetTypes =
+        getUnrolledTypes(offsetVecTy, *targetShape);
+    SmallVector<Value> convertedOffsetVec =
+        pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Value> newOps;
+    for (auto [t, o] : llvm::zip(convertedTdesc, convertedOffsetVec)) {
+      auto newOp =
+          rewriter.create<xegpu::UpdateOffsetOp>(loc, t.getType(), t, o);
+      newOps.push_back(newOp);
+    }
+    Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
+    rewriter.replaceOp(op, castOp);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::xegpu::populateXeGPUUnrollPatterns(
     RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
   patterns.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
-               UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp>(
-      patterns.getContext(), options);
+               UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp,
+               UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp,
+               UnrollPrefetchOp, UnrollUpdateOffsetOp>(patterns.getContext(),
+                                                       options);
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index b911bb3bbdc1..52ec3b856da4 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -158,4 +158,145 @@ gpu.module @test {
     %c = xegpu.dpas %a, %b : vector<32x32xf16>, vector<32x32xf16> -> vector<32x32xf32>
     gpu.return %c : vector<32x32xf32>
   }
+
+//-----
+
+  // CHECK-LABEL: test_create_tdesc_vec
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  gpu.func @test_create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>,  #xegpu.layout<inst_data = [16]>>
+  }
+
+//-----
+
+  // CHECK-LABEL: test_create_tdesc_step
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  gpu.func @test_create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
+    %step = arith.constant dense<8> : vector<32xindex>
+    %seq = vector.step  : vector<32xindex>
+    %cst = arith.muli %seq, %step : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+  }
+
+//-----
+
+  // CHECK-LABEL: test_load
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  // CHECK-COUNT-2: xegpu.load  {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
+  gpu.func @test_load(%src: ui64) -> vector<32xf32> {
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+      
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    %ld = xegpu.load %tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
+      
+    gpu.return %ld : vector<32xf32> 
+  }
+
+//-----
+
+  // CHECK-LABEL: test_prefetch
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  gpu.func @test_prefetch(%src: ui64)  {
+
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+
+    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    gpu.return
+  }
+
+//-----
+
+  // CHECK-LABEL: test_store
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  // CHECK-COUNT-2: xegpu.store  {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+  gpu.func @test_store(%src: ui64) {
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+    
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+
+    %st_vec = arith.constant dense<1023.0>: vector<32xf32>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    xegpu.store %st_vec, %tdesc, %mask: vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1>
+    
+    gpu.return
+  }
+
+//-----
+
+  // CHECK-LABEL: test_prefetch_load_store_update
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+   // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex>
+   // CHECK-COUNT-2: xegpu.load  {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
+  // CHECK-COUNT-2: xegpu.store  {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+
+  gpu.func @test_prefetch_load_store_update(%src: ui64)  {
+
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+   
+    %delta = arith.constant dense<[
+    32,   32,  32,  32,  32,  32,  32,  32,
+    32,   32,  32,  32,  32,  32,  32,  64,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 256 
+    ]> : vector<32xindex>
+    %new_tdesc = xegpu.update_offset %tdesc, %delta
+              : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xindex>     
+ 
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+
+    %ld_vec = xegpu.load %new_tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
+
+    %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32>
+    xegpu.store %st_vec, %tdesc, %mask: 
+                 vector<32xf32>, 
+                 !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, 
+                 vector<32xi1>
+  
+    gpu.return
+  }
 }
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 3f3461e92bc0..57aaecbd7962 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -71,6 +71,29 @@ struct TestXeGPUUnrollingPatterns
             }
           }
 
+          if (isa<xegpu::CreateDescOp, xegpu::UpdateOffsetOp, xegpu::PrefetchOp,
+                  xegpu::LoadGatherOp, xegpu::StoreScatterOp>(op)) {
+            xegpu::TensorDescType tdescTy;
+            if (auto createOp = dyn_cast<xegpu::CreateDescOp>(op)) {
+              tdescTy = createOp.getType();
+            } else if (auto updateOp = dyn_cast<xegpu::UpdateOffsetOp>(op)) {
+              tdescTy = updateOp.getTensorDescType();
+            } else if (auto prefetchOp = dyn_cast<xegpu::PrefetchOp>(op)) {
+              tdescTy = prefetchOp.getTensorDescType();
+            } else if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(op)) {
+              tdescTy = loadOp.getTensorDescType();
+            } else if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(op)) {
+              tdescTy = storeOp.getTensorDescType();
+            }
+
+            if (auto layout = tdescTy.getLayoutAttr()) {
+              auto inst_data = layout.getInstData();
+              if (inst_data && layout.isSgLayout())
+                return SmallVector<int64_t>(inst_data.asArrayRef().begin(),
+                                            inst_data.asArrayRef().end());
+            }
+          }
+
           if (isa<xegpu::DpasOp>(op))
             return SmallVector<int64_t>{8, 16, 16};
 

From fc6aac72cc2c9a7a9dab443bca52f813a18461ef Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Mon, 16 Jun 2025 11:53:55 -0400
Subject: [PATCH 0505/1322] [DirectX] Fix bug where Flatten arrays was only
 using last index (#144146)

fixes #142836

We added a function called `collectIndicesAndDimsFromGEP` which builds
the Indicies and Dims up for the recursive case and the base case.
really to solve #142836 we didn't need to add it to the recursive case.
The recursive cases exists for gep chains which are ussually two
indicies per gep ie ptr index and array index. adding
collectIndicesAndDimsFromGEP to the recursive cases means we can now do
some mixed mode indexing say we get a case where its not the ussual 2
indicies but instead 3 we can now treat those last two indicies as part
of the computation for the flat array index.
---
 llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 44 +++++++++---
 llvm/test/CodeGen/DirectX/flatten-array.ll    | 70 +++++++++++++++++++
 .../DirectX/llc-vector-load-scalarize.ll      |  8 +--
 3 files changed, 109 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index b1f3f41a28e8..0b7cf2f97017 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -86,6 +86,13 @@ private:
   Value *genInstructionFlattenIndices(ArrayRef<Value *> Indices,
                                       ArrayRef<uint64_t> Dims,
                                       IRBuilder<> &Builder);
+
+  // Helper function to collect indices and dimensions from a GEP instruction
+  void collectIndicesAndDimsFromGEP(GetElementPtrInst &GEP,
+                                    SmallVectorImpl<Value *> &Indices,
+                                    SmallVectorImpl<uint64_t> &Dims,
+                                    bool &AllIndicesAreConstInt);
+
   void
   recursivelyCollectGEPs(GetElementPtrInst &CurrGEP,
                          ArrayType *FlattenedArrayType, Value *PtrOperand,
@@ -218,6 +225,26 @@ bool DXILFlattenArraysVisitor::visitAllocaInst(AllocaInst &AI) {
   return true;
 }
 
+void DXILFlattenArraysVisitor::collectIndicesAndDimsFromGEP(
+    GetElementPtrInst &GEP, SmallVectorImpl<Value *> &Indices,
+    SmallVectorImpl<uint64_t> &Dims, bool &AllIndicesAreConstInt) {
+
+  Type *CurrentType = GEP.getSourceElementType();
+
+  // Note index 0 is the ptr index.
+  for (Value *Index : llvm::drop_begin(GEP.indices(), 1)) {
+    Indices.push_back(Index);
+    AllIndicesAreConstInt &= isa<ConstantInt>(Index);
+
+    if (auto *ArrayTy = dyn_cast<ArrayType>(CurrentType)) {
+      Dims.push_back(ArrayTy->getNumElements());
+      CurrentType = ArrayTy->getElementType();
+    } else {
+      assert(false && "Expected array type in GEP chain");
+    }
+  }
+}
+
 void DXILFlattenArraysVisitor::recursivelyCollectGEPs(
     GetElementPtrInst &CurrGEP, ArrayType *FlattenedArrayType,
     Value *PtrOperand, unsigned &GEPChainUseCount, SmallVector<Value *> Indices,
@@ -226,12 +253,8 @@ void DXILFlattenArraysVisitor::recursivelyCollectGEPs(
   if (GEPChainMap.count(&CurrGEP) > 0)
     return;
 
-  Value *LastIndex = CurrGEP.getOperand(CurrGEP.getNumOperands() - 1);
-  AllIndicesAreConstInt &= isa<ConstantInt>(LastIndex);
-  Indices.push_back(LastIndex);
-  assert(isa<ArrayType>(CurrGEP.getSourceElementType()));
-  Dims.push_back(
-      cast<ArrayType>(CurrGEP.getSourceElementType())->getNumElements());
+  // Collect indices and dimensions from the current GEP
+  collectIndicesAndDimsFromGEP(CurrGEP, Indices, Dims, AllIndicesAreConstInt);
   bool IsMultiDimArr = isMultiDimensionalArray(CurrGEP.getSourceElementType());
   if (!IsMultiDimArr) {
     assert(GEPChainUseCount < FlattenedArrayType->getNumElements());
@@ -316,9 +339,12 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
   // Handle zero uses here because there won't be an update via
   // a child in the chain later.
   if (GEPChainUseCount == 0) {
-    SmallVector<Value *> Indices({GEP.getOperand(GEP.getNumOperands() - 1)});
-    SmallVector<uint64_t> Dims({ArrType->getNumElements()});
-    bool AllIndicesAreConstInt = isa<ConstantInt>(Indices[0]);
+    SmallVector<Value *> Indices;
+    SmallVector<uint64_t> Dims;
+    bool AllIndicesAreConstInt = true;
+
+    // Collect indices and dimensions from the GEP
+    collectIndicesAndDimsFromGEP(GEP, Indices, Dims, AllIndicesAreConstInt);
     GEPData GEPInfo{std::move(FlattenedArrayType), PtrOperand,
                     std::move(Indices), std::move(Dims), AllIndicesAreConstInt};
     return visitGetElementPtrInstInGEPChainBase(GEPInfo, GEP);
diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll
index 5c761014d471..dc8c5f8421bf 100644
--- a/llvm/test/CodeGen/DirectX/flatten-array.ll
+++ b/llvm/test/CodeGen/DirectX/flatten-array.ll
@@ -187,5 +187,75 @@ define void @global_gep_store() {
   ret void
 }
 
+@g = local_unnamed_addr addrspace(3) global [2 x [2 x float]] zeroinitializer, align 4
+define void @two_index_gep() {
+  ; CHECK-LABEL: define void @two_index_gep(
+  ; CHECK: [[THREAD_ID:%.*]] =  tail call i32 @llvm.dx.thread.id(i32 0)
+  ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[THREAD_ID]], 2
+  ; CHECK-NEXT: [[ADD:%.*]] = add i32 1, [[MUL]]
+  ; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 [[ADD]]
+  ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP_PTR]], align 4
+  ; CHECK-NEXT: ret void
+  %1 = tail call i32 @llvm.dx.thread.id(i32 0)
+  %2 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 %1, i32 1
+  %3 = load float, ptr addrspace(3) %2, align 4
+  ret void
+}
+
+define void @two_index_gep_const() {
+  ; CHECK-LABEL: define void @two_index_gep_const(
+  ; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 3
+  ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP_PTR]], align 4
+  ; CHECK-NEXT: ret void
+  %1 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 1, i32 1
+  %3 = load float, ptr addrspace(3) %1, align 4
+  ret void
+}
+
+define void @gep_4d_index_test()  {
+    ; CHECK-LABEL: gep_4d_index_test
+    ; CHECK: [[a:%.*]] = alloca [16 x i32], align 4
+    ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 1
+    ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 3
+    ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 7
+    ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 15
+    ; CHECK-NEXT:    ret void
+    %1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4
+    %2 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 0, i32 0, i32 1
+    %3 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 0, i32 1, i32 1
+    %4 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 1, i32 1, i32 1
+    %5 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 1, i32 1, i32 1, i32 1
+    ret void
+}
+
+define void @gep_4d_index_and_gep_chain_mixed() {
+  ; CHECK-LABEL: gep_4d_index_and_gep_chain_mixed
+  ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4
+  ; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[ALLOCA]], i32 0, i32 {{[0-9]|1[0-5]}}
+  ; CHECK-NEXT: ret void
+  %1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4
+  %a4d0_0 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x[2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 0
+  %a2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 0, i32 0
+  %a2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 0, i32 1
+  %a2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 1, i32 0
+  %a2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 1, i32 1
+  %b4d0_1 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 1
+  %b2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 0, i32 0
+  %b2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 0, i32 1
+  %b2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 1, i32 0
+  %b2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 1, i32 1
+  %c4d1_0 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 1, i32 0
+  %c2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 0, i32 0
+  %c2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 0, i32 1
+  %c2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 1, i32 0
+  %c2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 1, i32 1
+  %g4d1_1 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 1, i32 1
+  %g2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 0, i32 0
+  %g2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 0, i32 1
+  %g2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 1, i32 0
+  %g2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 1, i32 1
+  ret void
+}
+
 ; Make sure we don't try to walk the body of a function declaration.
 declare void @opaque_function()
diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
index c960aad3d262..778113bd3160 100644
--- a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
+++ b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
@@ -111,13 +111,13 @@ define <4 x i32> @multid_load_test() #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 1) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 2) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 2) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 3) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 3) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4
 ; CHECK-NEXT:    [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]]
 ; CHECK-NEXT:    [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]]

From 2dd50bf79edefa28beffdbba4edfc2c753adae61 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 16 Jun 2025 08:54:04 -0700
Subject: [PATCH 0506/1322] [OpenMP] Fix warnings

This patch fixes:

  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp:7233:9: error: unused
  variable 'TaskTy' [-Werror,-Wunused-variable]

  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp:7666:15: error: unused
  variable 'ArrayType' [-Werror,-Wunused-variable]
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c1f02b2b240d..828205776f3f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -7257,6 +7257,7 @@ static Function *emitTargetTaskProxyFunction(
     assert(TaskTy != TaskWithPrivatesTy &&
            "If there are offloading arrays to pass to the target"
            "TaskTy cannot be the same as TaskWithPrivatesTy");
+    (void)TaskTy;
     Value *Privates =
         Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
     for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
@@ -7669,6 +7670,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
         Type *ElementType = PrivatesTy->getElementType(i);
         assert(ElementType == ArrayType &&
                "ElementType should match ArrayType");
+        (void)ArrayType;
 
         Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
         Builder.CreateMemCpy(

From dfb14b65bc0a277f920c797b4557e79685055b4f Mon Sep 17 00:00:00 2001
From: Kiran Chandramohan <kiran.chandramohan@arm.com>
Date: Mon, 16 Jun 2025 16:55:40 +0100
Subject: [PATCH 0507/1322] [Flang] NFC: Update test to work on Mac (#144253)

`%flang` expands to `flang -isysroot <SDK location>` in Mac and probably
other OS as well. `fc1` is only accepted as the first argument and hence
in this case it fails.

Use the `%flang_fc1` option to correctly expand to `flang -fc1 -isysroot
<SDK location>`.
---
 flang/test/Preprocessing/bug518.F | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Preprocessing/bug518.F b/flang/test/Preprocessing/bug518.F
index 346e04cc56d3..0b680dd5751b 100644
--- a/flang/test/Preprocessing/bug518.F
+++ b/flang/test/Preprocessing/bug518.F
@@ -1,4 +1,4 @@
-! RUN: %flang -fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
 ! CHECK: k=1_4
                         k=                                            1_99999999
      &4

From 711f6a8603717a6dc7e6202c614433ea2f9c0967 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Mon, 16 Jun 2025 16:58:00 +0100
Subject: [PATCH 0508/1322] [llvm][DebugInfo] Encode DW_AT_object_pointer on
 method declarations with DW_FORM_implicit_const (#124790)

We started attaching `DW_AT_object_pointer`s on method declarations in
https://github.com/llvm/llvm-project/pull/122742. However, that caused
the `.debug_info` section size to increase significantly (by around ~10%
on some projects). This was mainly due to the large number of new
`DW_FORM_ref4` values. This patch tries to address that regression by
changing the `DW_FORM_ref4` to a `DW_FORM_implicit_const` for
declarations. The value of `DW_FORM_implicit_const` will be the *index*
of the object parameter in the list of formal parameters of the
subprogram (i.e., if the first `DW_TAG_formal_parameter` is the object
pointer, the `DW_FORM_implicit_const` would be `0`). The DWARFv5 spec
only mentions the use of the `reference` attribute class to for
`DW_AT_object_pointer`. So using a `DW_FORM_impilicit_const` would be an
extension to (and not something mandated/specified by) the standard.
Though it'd make sense to extend the wording in the spec to allow for
this optimization.

That way we don't pay for the 4 byte references on every attribute
occurrence. In a local build of clang this barely affected the
`.debug_info` section size (but did increase `.debug_abbrev` by up to
10%, which doesn't impact the total debug-info size much however).

We guarded this on LLDB tuning (since using `DW_FORM_implicit_const` for
this purpose may surprise consumers) and DWARFv5 (since that's where
`DW_FORM_implicit_const` was first standardized).
---
 llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp     | 28 ++++++-
 llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h       |  6 +-
 ...DW_AT_object_pointer-non-standard-index.ll | 79 +++++++++++++++++++
 .../DebugInfo/X86/DW_AT_object_pointer.ll     | 24 +++++-
 .../tools/llvm-dwarfdump/X86/statistics.ll    |  4 +-
 5 files changed, 132 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll

diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 2481a9bd3ce7..bfe6e7d6a802 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -895,7 +895,10 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
   }
 }
 
-void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
+std::optional<unsigned>
+DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
+  // Args[0] is the return type.
+  std::optional<unsigned> ObjectPointerIndex;
   for (unsigned i = 1, N = Args.size(); i < N; ++i) {
     const DIType *Ty = Args[i];
     if (!Ty) {
@@ -906,8 +909,16 @@ void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
       addType(Arg, Ty);
       if (Ty->isArtificial())
         addFlag(Arg, dwarf::DW_AT_artificial);
+
+      if (Ty->isObjectPointer()) {
+        assert(!ObjectPointerIndex &&
+               "Can't have more than one object pointer");
+        ObjectPointerIndex = i;
+      }
     }
   }
+
+  return ObjectPointerIndex;
 }
 
 void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
@@ -1458,7 +1469,20 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
 
     // Add arguments. Do not add arguments for subprogram definition. They will
     // be handled while processing variables.
-    constructSubprogramArguments(SPDie, Args);
+    //
+    // Encode the object pointer as an index instead of a DIE reference in order
+    // to minimize the affect on the .debug_info size.
+    if (std::optional<unsigned> ObjectPointerIndex =
+            constructSubprogramArguments(SPDie, Args)) {
+      if (getDwarfDebug().tuneForLLDB() &&
+          getDwarfDebug().getDwarfVersion() >= 5) {
+        // 0th index in Args is the return type, hence adjust by 1. In DWARF
+        // we want the first parameter to be at index 0.
+        assert(*ObjectPointerIndex > 0);
+        addSInt(SPDie, dwarf::DW_AT_object_pointer,
+                dwarf::DW_FORM_implicit_const, *ObjectPointerIndex - 1);
+      }
+    }
   }
 
   addThrownTypes(SPDie, SP->getThrownTypes());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index e1156bccfb1a..43bf19756386 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -273,7 +273,11 @@ public:
   void constructContainingTypeDIEs();
 
   /// Construct function argument DIEs.
-  void constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args);
+  ///
+  /// \returns The index of the object parameter in \c Args if one exists.
+  /// Returns std::nullopt otherwise.
+  std::optional<unsigned> constructSubprogramArguments(DIE &Buffer,
+                                                       DITypeRefArray Args);
 
   /// Create a DIE with the given Tag, add the DIE to its parent, and
   /// call insertDIE if MD is not null.
diff --git a/llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll b/llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll
new file mode 100644
index 000000000000..40b791fd27e3
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll
@@ -0,0 +1,79 @@
+; Similar to DW_AT_object_pointer.ll but tests that we correctly
+; encode the object pointer index even if it's not the first argument
+; of the subprogram (which isn't something the major compilers do,
+; but is not mandated by DWARF).
+
+; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=lldb -dwarf-version=5 -filetype=obj < %s | \
+; RUN:      llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK
+
+; CHECK: DW_TAG_class_type
+; CHECK: [[DECL:0x[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK:                         DW_AT_name {{.*}} "A"
+; CHECK: DW_AT_object_pointer [DW_FORM_implicit_const] (2)
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:   DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]})
+; CHECK:   DW_AT_specification [DW_FORM_ref4] (cu + {{.*}} => {[[DECL]]}
+; CHECK:   DW_TAG_formal_parameter
+; CHECK:   DW_TAG_formal_parameter
+; CHECK-NOT: "this"
+; CHECK: [[PARAM]]: DW_TAG_formal_parameter
+; CHECK: DW_AT_name
+; CHECK-SAME: = "this")
+; CHECK:   DW_TAG_formal_parameter
+
+%class.A = type { i8 }
+
+define linkonce_odr noundef ptr @_ZN1AC1Eii(ptr noundef nonnull returned align 1 dereferenceable(1) %this, i32 noundef %x, i32 noundef %y, i32 noundef %z) !dbg !24 {
+entry:
+  %this.addr = alloca ptr, align 8
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  %z.addr = alloca i32, align 4
+  store ptr %this, ptr %this.addr, align 8
+    #dbg_declare(ptr %this.addr, !26, !DIExpression(), !28)
+  store i32 %x, ptr %x.addr, align 4
+    #dbg_declare(ptr %x.addr, !29, !DIExpression(), !30)
+  store i32 %y, ptr %y.addr, align 4
+    #dbg_declare(ptr %y.addr, !31, !DIExpression(), !32)
+  store i32 %z, ptr %y.addr, align 4
+    #dbg_declare(ptr %z.addr, !36, !DIExpression(), !37)
+  %this1 = load ptr, ptr %this.addr, align 8
+  %0 = load i32, ptr %x.addr, align 4, !dbg !33
+  %1 = load i32, ptr %y.addr, align 4, !dbg !33
+  %2 = load i32, ptr %z.addr, align 4, !dbg !33
+  ret ptr %this1, !dbg !34
+}
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!12, !13}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 3, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 20.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/")
+!3 = !DIFile(filename: "object_ptr.cpp", directory: "/tmp")
+!4 = !{!0}
+!5 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !3, line: 1, size: 8, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: !6, identifier: "_ZTS1A")
+!6 = !{!7}
+!7 = !DISubprogram(name: "A", scope: !5, file: !3, line: 2, type: !8, scopeLine: 2, flags: DIFlagPublic | DIFlagPrototyped, spFlags: 0)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !11, !11, !10, !35}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{i32 7, !"Dwarf Version", i32 5}
+!13 = !{i32 2, !"Debug Info Version", i32 3}
+!18 = !{!"clang version 20.0.0git"}
+!24 = distinct !DISubprogram(name: "A", linkageName: "_ZN1AC1Eii", scope: !5, file: !3, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, declaration: !7, retainedNodes: !25)
+!25 = !{}
+!26 = !DILocalVariable(name: "this", arg: 3, scope: !24, type: !27, flags: DIFlagArtificial | DIFlagObjectPointer)
+!27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64)
+!28 = !DILocation(line: 0, scope: !24)
+!29 = !DILocalVariable(name: "x", arg: 2, scope: !24, file: !3, line: 2, type: !11)
+!30 = !DILocation(line: 2, column: 19, scope: !24)
+!31 = !DILocalVariable(name: "y", arg: 1, scope: !24, file: !3, line: 2, type: !11)
+!32 = !DILocation(line: 2, column: 26, scope: !24)
+!33 = !DILocation(line: 2, column: 29, scope: !24)
+!34 = !DILocation(line: 2, column: 30, scope: !24)
+!35 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+!36 = !DILocalVariable(name: "z", arg: 4, scope: !24, file: !3, line: 2, type: !35)
+!37 = !DILocation(line: 2, column: 35, scope: !24)
diff --git a/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll b/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll
index d9988ac31451..596727dce043 100644
--- a/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll
+++ b/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll
@@ -1,14 +1,30 @@
-; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
-; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=gdb -dwarf-version=5 -filetype=obj < %s | \
+; RUN:      llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK,CHECK-GDB
+
+; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=lldb -dwarf-version=4 -filetype=obj < %s | \
+; RUN:      llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK,CHECK-LLDB-DWARF4
+
+; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=lldb -dwarf-version=5 -filetype=obj < %s | \
+; RUN:      llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK,CHECK-LLDB-DWARF5
 
 ; CHECK: DW_TAG_formal_parameter [
 ; CHECK-NOT: ""
 ; CHECK: DW_TAG
 ; CHECK: DW_TAG_class_type
-; CHECK: DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]})
+; CHECK: [[DECL:0x[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK:                         DW_AT_name {{.*}} "A"
+; CHECK-LLDB-DWARF5:             DW_AT_object_pointer [DW_FORM_implicit_const] (0)
+; CHECK-GDB-NOT:                 DW_AT_object_pointer
+; CHECK-LLDB-DWARF4-NOT:         DW_AT_object_pointer
+; CHECK: DW_TAG_formal_parameter
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:   DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]})
+; CHECK:   DW_AT_specification [DW_FORM_ref4] (cu + {{.*}} => {[[DECL]]}
 ; CHECK: [[PARAM]]:     DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_name [DW_FORM_strp]     ( .debug_str[0x{{[0-9a-f]*}}] = "this")
+; CHECK: DW_AT_name
+; CHECK-SAME        = "this")
 
 %class.A = type { i32 }
 
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll
index a454bf14c335..77de0241daea 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll
+++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 %s -o - -filetype=obj \
+; RUN: llc -O0 %s -o - -filetype=obj -debugger-tune=gdb -accel-tables=Apple \
 ; RUN:   | llvm-dwarfdump -statistics - | FileCheck %s
 ; CHECK: "version": 9,
 
@@ -55,7 +55,7 @@
 ; CHECK:      "#bytes within functions": [[FUNCSIZE:[0-9]+]]
 ; CHECK:      "#bytes within inlined functions": [[INLINESIZE:[0-9]+]]
 ; CHECK:      "#bytes in __debug_loc": 35,
-; CHECK-NEXT: "#bytes in __debug_abbrev": 384,
+; CHECK-NEXT: "#bytes in __debug_abbrev": 375,
 ; CHECK-NEXT: "#bytes in __debug_info": 459,
 ; CHECK-NEXT: "#bytes in __debug_str": 231,
 ; CHECK-NEXT: "#bytes in __apple_names": 348,

From c9ac1679b5d3a3839640486dd4bd931a19f4725a Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 16 Jun 2025 08:59:10 -0700
Subject: [PATCH 0509/1322] [lldb] Remove a redundant control flow statement
 (NFC) (#144284)

---
 lldb/tools/debugserver/source/RNBRemote.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp
index 391d1c50168e..8be384c6d24a 100644
--- a/lldb/tools/debugserver/source/RNBRemote.cpp
+++ b/lldb/tools/debugserver/source/RNBRemote.cpp
@@ -1476,7 +1476,6 @@ bool RNBRemote::InitializeRegisters(bool force) {
 
 void RNBRemote::NotifyThatProcessStopped(void) {
   RNBRemote::HandlePacket_last_signal(NULL);
-  return;
 }
 
 /* 'A arglen,argnum,arg,...'

From 05cd32adb7ce2354563814ab6e0b818f2ed6fa26 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 16 Jun 2025 08:59:18 -0700
Subject: [PATCH 0510/1322] [llvm] Remove unused includes (NFC) (#144293)

These are identified by misc-include-cleaner.  I've filtered out those
that break builds.  Also, I'm staying away from llvm-config.h,
config.h, and Compiler.h, which likely cause platform- or
compiler-specific build failures.
---
 llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp                  | 1 -
 llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp                   | 1 -
 llvm/lib/CodeGen/MachineBasicBlock.cpp                         | 1 -
 llvm/lib/IR/BasicBlock.cpp                                     | 1 -
 llvm/lib/IR/IRBuilder.cpp                                      | 1 -
 llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 1 -
 llvm/lib/Transforms/Scalar/ConstantHoisting.cpp                | 1 -
 llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp                 | 1 -
 llvm/lib/Transforms/Utils/BasicBlockUtils.cpp                  | 3 +--
 llvm/lib/Transforms/Utils/Local.cpp                            | 1 -
 10 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index e3e6c72165eb..0f2c580c759c 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -16,7 +16,6 @@
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 
 using namespace llvm;
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
index f68420ed66e4..1c4150127a90 100644
--- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -30,7 +30,6 @@
 #include "llvm/CodeGen/Register.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Casting.h"
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 48b406e016c0..c3c5a0f5102d 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -30,7 +30,6 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/ModuleSlotTracker.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 8b3e91750f86..3642e935397c 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -21,7 +21,6 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Type.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 
 #include "LLVMContextImpl.h"
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index a33ef9c7d4a1..0a8b26b5f3d8 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -15,7 +15,6 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 1d208de75db3..a9751ab03e20 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -18,7 +18,6 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/PatternMatch.h"
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 839f5933e09b..db594e033e21 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -44,7 +44,6 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstrTypes.h"
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index d20378ece4ee..a09303bb4469 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -31,7 +31,6 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/ValueHandle.h"
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 1feed14b4fed..98c65ae11b1c 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -28,11 +28,10 @@
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index a3252a69874d..33143700f560 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -49,7 +49,6 @@
 #include "llvm/IR/EHPersonalities.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalObject.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"

From ec32d8858559e4e6b5e520dfd36bfb64056fbdbb Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Mon, 16 Jun 2025 17:02:24 +0100
Subject: [PATCH 0511/1322] Annotate potentially unused variables introduced in
 #133499 (#144379)

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 828205776f3f..cf17a84242c7 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -7230,7 +7230,7 @@ static Function *emitTargetTaskProxyFunction(
 
   Type *ThreadIDTy = Type::getInt32Ty(Ctx);
   Type *TaskPtrTy = OMPBuilder.TaskPtr;
-  Type *TaskTy = OMPBuilder.Task;
+  [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
 
   auto ProxyFnTy =
       FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
@@ -7664,7 +7664,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
           Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
       for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
         Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
-        Type *ArrayType = getOffloadingArrayType(PtrToPrivatize);
+        [[maybe_unused]] Type *ArrayType =
+            getOffloadingArrayType(PtrToPrivatize);
         assert(ArrayType && "ArrayType cannot be nullptr");
 
         Type *ElementType = PrivatesTy->getElementType(i);

From 5acdd8d0cf785595b06c1a28326b560f720b4f16 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 09:15:59 -0700
Subject: [PATCH 0512/1322] AVR: Rename AVRMCExpr::VK_ to AVR::S_

Prepare for removing AVRMCExpr. Adopt the new naming convention (S_
instead of VK_; the relocation specifier was previously named
`VariantKind`)) used by most other targets.

Make AVRMCAsmInfo.h include AVRMCExpr.h and change .cpp files to include
AVRMCAsmInfo.h. We will eventually remove AVRMCExpr.h.
---
 llvm/lib/Target/AVR/AVRAsmPrinter.cpp         |   4 +-
 llvm/lib/Target/AVR/AVRMCInstLower.cpp        |  14 +-
 .../lib/Target/AVR/AsmParser/AVRAsmParser.cpp |  14 +-
 .../AVR/MCTargetDesc/AVRELFObjectWriter.cpp   |  32 +--
 .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp  | 179 +++++++++++++++++
 .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.h    |  29 +++
 .../AVR/MCTargetDesc/AVRMCCodeEmitter.cpp     |   2 +-
 .../AVR/MCTargetDesc/AVRMCELFStreamer.cpp     |  22 +--
 .../AVR/MCTargetDesc/AVRMCELFStreamer.h       |   9 +-
 .../lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 185 +-----------------
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h  |  23 ---
 11 files changed, 260 insertions(+), 253 deletions(-)

diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index ed537f8cc717..1a1e5155979e 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -16,7 +16,7 @@
 #include "AVRSubtarget.h"
 #include "AVRTargetMachine.h"
 #include "MCTargetDesc/AVRInstPrinter.h"
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "TargetInfo/AVRTargetInfo.h"
 
 #include "llvm/CodeGen/AsmPrinter.h"
@@ -215,7 +215,7 @@ const MCExpr *AVRAsmPrinter::lowerConstant(const Constant *CV,
     bool IsProgMem = GV->getAddressSpace() == AVR::ProgramMemory;
     if (IsProgMem) {
       const MCExpr *Expr = MCSymbolRefExpr::create(getSymbol(GV), Ctx);
-      return AVRMCExpr::create(AVRMCExpr::VK_PM, Expr, false, Ctx);
+      return AVRMCExpr::create(AVR::S_PM, Expr, false, Ctx);
     }
   }
 
diff --git a/llvm/lib/Target/AVR/AVRMCInstLower.cpp b/llvm/lib/Target/AVR/AVRMCInstLower.cpp
index 47d9073f6eb8..f4bddfdac346 100644
--- a/llvm/lib/Target/AVR/AVRMCInstLower.cpp
+++ b/llvm/lib/Target/AVR/AVRMCInstLower.cpp
@@ -13,7 +13,7 @@
 
 #include "AVRMCInstLower.h"
 #include "AVRInstrInfo.h"
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/IR/Mangler.h"
@@ -42,19 +42,19 @@ AVRMCInstLower::lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
 
   if (TF & AVRII::MO_LO) {
     if (IsFunction) {
-      Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVRMCExpr::VK_LO8_GS
-                                                        : AVRMCExpr::VK_PM_LO8,
+      Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVR::S_LO8_GS
+                                                        : AVR::S_PM_LO8,
                                Expr, IsNegated, Ctx);
     } else {
-      Expr = AVRMCExpr::create(AVRMCExpr::VK_LO8, Expr, IsNegated, Ctx);
+      Expr = AVRMCExpr::create(AVR::S_LO8, Expr, IsNegated, Ctx);
     }
   } else if (TF & AVRII::MO_HI) {
     if (IsFunction) {
-      Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVRMCExpr::VK_HI8_GS
-                                                        : AVRMCExpr::VK_PM_HI8,
+      Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVR::S_HI8_GS
+                                                        : AVR::S_PM_HI8,
                                Expr, IsNegated, Ctx);
     } else {
-      Expr = AVRMCExpr::create(AVRMCExpr::VK_HI8, Expr, IsNegated, Ctx);
+      Expr = AVRMCExpr::create(AVR::S_HI8, Expr, IsNegated, Ctx);
     }
   } else if (TF != 0) {
     llvm_unreachable("Unknown target flag on symbol operand");
diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index cab5caffdcba..e82bd761eeb3 100644
--- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "AVRRegisterInfo.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "MCTargetDesc/AVRMCELFStreamer.h"
-#include "MCTargetDesc/AVRMCExpr.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 #include "TargetInfo/AVRTargetInfo.h"
 
@@ -447,7 +447,7 @@ bool AVRAsmParser::tryParseExpression(OperandVector &Operands, int64_t offset) {
 
 bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
   bool isNegated = false;
-  AVRMCExpr::Specifier ModifierKind = AVRMCExpr::VK_AVR_NONE;
+  AVR::Specifier ModifierKind = AVR::S_AVR_NONE;
 
   SMLoc S = Parser.getTok().getLoc();
 
@@ -473,14 +473,14 @@ bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
   StringRef ModifierName = Parser.getTok().getString();
   ModifierKind = AVRMCExpr::parseSpecifier(ModifierName);
 
-  if (ModifierKind != AVRMCExpr::VK_AVR_NONE) {
+  if (ModifierKind != AVR::S_AVR_NONE) {
     Parser.Lex();
     Parser.Lex(); // Eat modifier name and parenthesis
     if (Parser.getTok().getString() == GENERATE_STUBS &&
         Parser.getTok().getKind() == AsmToken::Identifier) {
       std::string GSModName = ModifierName.str() + "_" + GENERATE_STUBS;
       ModifierKind = AVRMCExpr::parseSpecifier(GSModName);
-      if (ModifierKind != AVRMCExpr::VK_AVR_NONE)
+      if (ModifierKind != AVR::S_AVR_NONE)
         Parser.Lex(); // Eat gs modifier name
     }
   } else {
@@ -698,15 +698,15 @@ ParseStatus AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) {
       Tokens[1].getKind() == AsmToken::Identifier) {
     MCSymbol *Symbol = getContext().getOrCreateSymbol(".text");
     AVRStreamer.emitValueForModiferKind(Symbol, SizeInBytes, L,
-                                        AVRMCExpr::VK_AVR_NONE);
+                                        AVR::S_AVR_NONE);
     return ParseStatus::NoMatch;
   }
 
   if (Parser.getTok().getKind() == AsmToken::Identifier &&
       Parser.getLexer().peekTok().getKind() == AsmToken::LParen) {
     StringRef ModifierName = Parser.getTok().getString();
-    AVRMCExpr::Specifier Spec = AVRMCExpr::parseSpecifier(ModifierName);
-    if (Spec != AVRMCExpr::VK_AVR_NONE) {
+    AVR::Specifier Spec = AVRMCExpr::parseSpecifier(ModifierName);
+    if (Spec != AVR::S_AVR_NONE) {
       Parser.Lex();
       Parser.Lex(); // Eat the modifier and parenthesis
     } else {
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index e79ba29e0cbe..619efb376c61 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/AVRFixupKinds.h"
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 
 #include "llvm/MC/MCAssembler.h"
@@ -36,42 +36,42 @@ AVRELFObjectWriter::AVRELFObjectWriter(uint8_t OSABI)
 unsigned AVRELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                           const MCValue &Target,
                                           bool IsPCRel) const {
-  auto Modifier = AVRMCExpr::Specifier(Target.getSpecifier());
+  auto Spec = Target.getSpecifier();
   switch ((unsigned)Fixup.getKind()) {
   case FK_Data_1:
-    switch (Modifier) {
+    switch (Spec) {
     default:
       llvm_unreachable("Unsupported Modifier");
-    case AVRMCExpr::VK_None:
+    case AVR::S_None:
       return ELF::R_AVR_8;
-    case AVRMCExpr::VK_DIFF8:
+    case AVR::S_DIFF8:
       return ELF::R_AVR_DIFF8;
-    case AVRMCExpr::VK_LO8:
+    case AVR::S_LO8:
       return ELF::R_AVR_8_LO8;
-    case AVRMCExpr::VK_HI8:
+    case AVR::S_HI8:
       return ELF::R_AVR_8_HI8;
-    case AVRMCExpr::VK_HH8:
+    case AVR::S_HH8:
       return ELF::R_AVR_8_HLO8;
     }
   case FK_Data_4:
-    switch (Modifier) {
+    switch (Spec) {
     default:
       llvm_unreachable("Unsupported Modifier");
-    case AVRMCExpr::VK_None:
+    case AVR::S_None:
       return ELF::R_AVR_32;
-    case AVRMCExpr::VK_DIFF32:
+    case AVR::S_DIFF32:
       return ELF::R_AVR_DIFF32;
     }
   case FK_Data_2:
-    switch (Modifier) {
+    switch (Spec) {
     default:
       llvm_unreachable("Unsupported Modifier");
-    case AVRMCExpr::VK_None:
+    case AVR::S_None:
       return ELF::R_AVR_16;
-    case AVRMCExpr::VK_AVR_NONE:
-    case AVRMCExpr::VK_PM:
+    case AVR::S_AVR_NONE:
+    case AVR::S_PM:
       return ELF::R_AVR_16_PM;
-    case AVRMCExpr::VK_DIFF16:
+    case AVR::S_DIFF16:
       return ELF::R_AVR_DIFF16;
     }
   case AVR::fixup_32:
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index d37e39c51e15..68db5227d073 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -11,7 +11,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AVRMCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -26,3 +29,179 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) {
   UsesELFSectionDirectiveForBSS = true;
   SupportsDebugInformation = true;
 }
+
+namespace {
+const struct ModifierEntry {
+  const char *const Spelling;
+  AVRMCExpr::Specifier specifier;
+} ModifierNames[] = {
+    {"lo8", AVR::S_LO8},       {"hi8", AVR::S_HI8},
+    {"hh8", AVR::S_HH8}, // synonym with hlo8
+    {"hlo8", AVR::S_HH8},      {"hhi8", AVR::S_HHI8},
+
+    {"pm", AVR::S_PM},         {"pm_lo8", AVR::S_PM_LO8},
+    {"pm_hi8", AVR::S_PM_HI8}, {"pm_hh8", AVR::S_PM_HH8},
+
+    {"lo8_gs", AVR::S_LO8_GS}, {"hi8_gs", AVR::S_HI8_GS},
+    {"gs", AVR::S_GS},
+};
+
+} // end of anonymous namespace
+
+AVRMCExpr::Specifier AVRMCExpr::parseSpecifier(StringRef Name) {
+  const auto &Modifier =
+      llvm::find_if(ModifierNames, [&Name](ModifierEntry const &Mod) {
+        return Mod.Spelling == Name;
+      });
+
+  if (Modifier != std::end(ModifierNames)) {
+    return Modifier->specifier;
+  }
+  return AVR::S_AVR_NONE;
+}
+
+const char *AVRMCExpr::getName() const {
+  const auto &Modifier =
+      llvm::find_if(ModifierNames, [this](ModifierEntry const &Mod) {
+        return Mod.specifier == specifier;
+      });
+
+  if (Modifier != std::end(ModifierNames)) {
+    return Modifier->Spelling;
+  }
+  return nullptr;
+}
+
+AVR::Fixups AVRMCExpr::getFixupKind() const {
+  AVR::Fixups Kind = AVR::Fixups::LastTargetFixupKind;
+
+  switch (specifier) {
+  case AVR::S_LO8:
+    Kind = isNegated() ? AVR::fixup_lo8_ldi_neg : AVR::fixup_lo8_ldi;
+    break;
+  case AVR::S_HI8:
+    Kind = isNegated() ? AVR::fixup_hi8_ldi_neg : AVR::fixup_hi8_ldi;
+    break;
+  case AVR::S_HH8:
+    Kind = isNegated() ? AVR::fixup_hh8_ldi_neg : AVR::fixup_hh8_ldi;
+    break;
+  case AVR::S_HHI8:
+    Kind = isNegated() ? AVR::fixup_ms8_ldi_neg : AVR::fixup_ms8_ldi;
+    break;
+
+  case AVR::S_PM_LO8:
+    Kind = isNegated() ? AVR::fixup_lo8_ldi_pm_neg : AVR::fixup_lo8_ldi_pm;
+    break;
+  case AVR::S_PM_HI8:
+    Kind = isNegated() ? AVR::fixup_hi8_ldi_pm_neg : AVR::fixup_hi8_ldi_pm;
+    break;
+  case AVR::S_PM_HH8:
+    Kind = isNegated() ? AVR::fixup_hh8_ldi_pm_neg : AVR::fixup_hh8_ldi_pm;
+    break;
+  case AVR::S_PM:
+  case AVR::S_GS:
+    Kind = AVR::fixup_16_pm;
+    break;
+  case AVR::S_LO8_GS:
+    Kind = AVR::fixup_lo8_ldi_gs;
+    break;
+  case AVR::S_HI8_GS:
+    Kind = AVR::fixup_hi8_ldi_gs;
+    break;
+
+  default:
+    llvm_unreachable("Uninitialized expression");
+  }
+
+  return Kind;
+}
+
+int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
+  if (Negated)
+    Value *= -1;
+
+  switch (specifier) {
+  case AVR::S_LO8:
+    Value &= 0xff;
+    break;
+  case AVR::S_HI8:
+    Value &= 0xff00;
+    Value >>= 8;
+    break;
+  case AVR::S_HH8:
+    Value &= 0xff0000;
+    Value >>= 16;
+    break;
+  case AVR::S_HHI8:
+    Value &= 0xff000000;
+    Value >>= 24;
+    break;
+  case AVR::S_PM_LO8:
+  case AVR::S_LO8_GS:
+    Value >>= 1; // Program memory addresses must always be shifted by one.
+    Value &= 0xff;
+    break;
+  case AVR::S_PM_HI8:
+  case AVR::S_HI8_GS:
+    Value >>= 1; // Program memory addresses must always be shifted by one.
+    Value &= 0xff00;
+    Value >>= 8;
+    break;
+  case AVR::S_PM_HH8:
+    Value >>= 1; // Program memory addresses must always be shifted by one.
+    Value &= 0xff0000;
+    Value >>= 16;
+    break;
+  case AVR::S_PM:
+  case AVR::S_GS:
+    Value >>= 1; // Program memory addresses must always be shifted by one.
+    break;
+
+  case AVR::S_AVR_NONE:
+  default:
+    llvm_unreachable("Uninitialized expression.");
+  }
+  return static_cast<uint64_t>(Value) & 0xff;
+}
+
+bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
+                                          const MCAssembler *Asm) const {
+  MCValue Value;
+  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, Asm);
+  if (!isRelocatable)
+    return false;
+
+  if (Value.isAbsolute()) {
+    Result = MCValue::get(evaluateAsInt64(Value.getConstant()));
+  } else {
+    if (!Asm || !Asm->hasLayout())
+      return false;
+
+    auto Spec = AVR::S_None;
+    if (Value.getSpecifier() != MCSymbolRefExpr::VK_None)
+      return false;
+    assert(!Value.getSubSym());
+    if (specifier == AVR::S_PM)
+      Spec = AVR::S_PM;
+
+    // TODO: don't attach specifier to MCSymbolRefExpr.
+    Result =
+        MCValue::get(Value.getAddSym(), nullptr, Value.getConstant(), Spec);
+  }
+
+  return true;
+}
+
+bool AVRMCExpr::evaluateAsConstant(int64_t &Result) const {
+  MCValue Value;
+  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, nullptr);
+  if (!isRelocatable)
+    return false;
+
+  if (Value.isAbsolute()) {
+    Result = evaluateAsInt64(Value.getConstant());
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
index 17dd77f6266a..649e247adab0 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -13,7 +13,9 @@
 #ifndef LLVM_AVR_ASM_INFO_H
 #define LLVM_AVR_ASM_INFO_H
 
+#include "MCTargetDesc/AVRMCExpr.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
 
 namespace llvm {
 
@@ -25,6 +27,33 @@ public:
   explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options);
 };
 
+namespace AVR {
+using Specifier = uint16_t;
+enum {
+  S_None,
+
+  S_AVR_NONE = MCSymbolRefExpr::FirstTargetSpecifier,
+
+  S_HI8,  ///< Corresponds to `hi8()`.
+  S_LO8,  ///< Corresponds to `lo8()`.
+  S_HH8,  ///< Corresponds to `hlo8() and hh8()`.
+  S_HHI8, ///< Corresponds to `hhi8()`.
+
+  S_PM,     ///< Corresponds to `pm()`, reference to program memory.
+  S_PM_LO8, ///< Corresponds to `pm_lo8()`.
+  S_PM_HI8, ///< Corresponds to `pm_hi8()`.
+  S_PM_HH8, ///< Corresponds to `pm_hh8()`.
+
+  S_LO8_GS, ///< Corresponds to `lo8(gs())`.
+  S_HI8_GS, ///< Corresponds to `hi8(gs())`.
+  S_GS,     ///< Corresponds to `gs()`.
+
+  S_DIFF8,
+  S_DIFF16,
+  S_DIFF32,
+};
+} // namespace AVR
+
 } // end namespace llvm
 
 #endif // LLVM_AVR_ASM_INFO_H
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
index fa01dad5ec12..4934e1c71bc0 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -12,7 +12,7 @@
 
 #include "AVRMCCodeEmitter.h"
 
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 
 #include "llvm/ADT/APFloat.h"
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
index 88393fb9928a..0644f422b328 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
@@ -23,19 +23,19 @@ using namespace llvm;
 void AVRMCELFStreamer::emitValueForModiferKind(
     const MCSymbol *Sym, unsigned SizeInBytes, SMLoc Loc,
     AVRMCExpr::Specifier ModifierKind) {
-  AVRMCExpr::Specifier Kind = AVRMCExpr::VK_AVR_NONE;
-  if (ModifierKind == AVRMCExpr::VK_AVR_NONE) {
-    Kind = AVRMCExpr::VK_DIFF8;
+  AVRMCExpr::Specifier Kind = AVR::S_AVR_NONE;
+  if (ModifierKind == AVR::S_AVR_NONE) {
+    Kind = AVR::S_DIFF8;
     if (SizeInBytes == SIZE_LONG)
-      Kind = AVRMCExpr::VK_DIFF32;
+      Kind = AVR::S_DIFF32;
     else if (SizeInBytes == SIZE_WORD)
-      Kind = AVRMCExpr::VK_DIFF16;
-  } else if (ModifierKind == AVRMCExpr::VK_LO8)
-    Kind = AVRMCExpr::VK_LO8;
-  else if (ModifierKind == AVRMCExpr::VK_HI8)
-    Kind = AVRMCExpr::VK_HI8;
-  else if (ModifierKind == AVRMCExpr::VK_HH8)
-    Kind = AVRMCExpr::VK_HH8;
+      Kind = AVR::S_DIFF16;
+  } else if (ModifierKind == AVR::S_LO8)
+    Kind = AVR::S_LO8;
+  else if (ModifierKind == AVR::S_HI8)
+    Kind = AVR::S_HI8;
+  else if (ModifierKind == AVR::S_HH8)
+    Kind = AVR::S_HH8;
   MCELFStreamer::emitValue(
       MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VariantKind(Kind),
                               getContext()),
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
index 2d45de083583..88352337524a 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIB_TARGET_AVR_MCTARGETDESC_AVRMCELFSTREAMER_H
 #define LLVM_LIB_TARGET_AVR_MCTARGETDESC_AVRMCELFSTREAMER_H
 
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -41,9 +41,10 @@ public:
                       std::move(Emitter)),
         MCII(createAVRMCInstrInfo()) {}
 
-  void emitValueForModiferKind(
-      const MCSymbol *Sym, unsigned SizeInBytes, SMLoc Loc = SMLoc(),
-      AVRMCExpr::Specifier ModifierKind = AVRMCExpr::VK_AVR_NONE);
+  void
+  emitValueForModiferKind(const MCSymbol *Sym, unsigned SizeInBytes,
+                          SMLoc Loc = SMLoc(),
+                          AVRMCExpr::Specifier ModifierKind = AVR::S_AVR_NONE);
 };
 
 MCStreamer *createAVRELFStreamer(Triple const &TT, MCContext &Context,
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 5be799093d2c..3067e854d8dc 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -7,41 +7,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCValue.h"
 
 namespace llvm {
 
-namespace {
-
-const struct ModifierEntry {
-  const char *const Spelling;
-  AVRMCExpr::Specifier specifier;
-} ModifierNames[] = {
-    {"lo8", AVRMCExpr::VK_LO8},       {"hi8", AVRMCExpr::VK_HI8},
-    {"hh8", AVRMCExpr::VK_HH8}, // synonym with hlo8
-    {"hlo8", AVRMCExpr::VK_HH8},      {"hhi8", AVRMCExpr::VK_HHI8},
-
-    {"pm", AVRMCExpr::VK_PM},         {"pm_lo8", AVRMCExpr::VK_PM_LO8},
-    {"pm_hi8", AVRMCExpr::VK_PM_HI8}, {"pm_hh8", AVRMCExpr::VK_PM_HH8},
-
-    {"lo8_gs", AVRMCExpr::VK_LO8_GS}, {"hi8_gs", AVRMCExpr::VK_HI8_GS},
-    {"gs", AVRMCExpr::VK_GS},
-};
-
-} // end of anonymous namespace
-
 const AVRMCExpr *AVRMCExpr::create(Specifier Kind, const MCExpr *Expr,
                                    bool Negated, MCContext &Ctx) {
   return new (Ctx) AVRMCExpr(Kind, Expr, Negated);
 }
 
 void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  assert(specifier != VK_AVR_NONE);
+  assert(specifier != AVR::S_AVR_NONE);
   OS << getName() << '(';
   if (isNegated())
     OS << '-' << '(';
@@ -51,164 +32,4 @@ void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   OS << ')';
 }
 
-bool AVRMCExpr::evaluateAsConstant(int64_t &Result) const {
-  MCValue Value;
-
-  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, nullptr);
-
-  if (!isRelocatable)
-    return false;
-
-  if (Value.isAbsolute()) {
-    Result = evaluateAsInt64(Value.getConstant());
-    return true;
-  }
-
-  return false;
-}
-
-bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
-                                          const MCAssembler *Asm) const {
-  MCValue Value;
-  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, Asm);
-  if (!isRelocatable)
-    return false;
-
-  if (Value.isAbsolute()) {
-    Result = MCValue::get(evaluateAsInt64(Value.getConstant()));
-  } else {
-    if (!Asm || !Asm->hasLayout())
-      return false;
-
-    auto Spec = AVRMCExpr::VK_None;
-    if (Value.getSpecifier() != MCSymbolRefExpr::VK_None)
-      return false;
-    assert(!Value.getSubSym());
-    if (specifier == VK_PM)
-      Spec = AVRMCExpr::VK_PM;
-
-    // TODO: don't attach specifier to MCSymbolRefExpr.
-    Result =
-        MCValue::get(Value.getAddSym(), nullptr, Value.getConstant(), Spec);
-  }
-
-  return true;
-}
-
-int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
-  if (Negated)
-    Value *= -1;
-
-  switch (specifier) {
-  case AVRMCExpr::VK_LO8:
-    Value &= 0xff;
-    break;
-  case AVRMCExpr::VK_HI8:
-    Value &= 0xff00;
-    Value >>= 8;
-    break;
-  case AVRMCExpr::VK_HH8:
-    Value &= 0xff0000;
-    Value >>= 16;
-    break;
-  case AVRMCExpr::VK_HHI8:
-    Value &= 0xff000000;
-    Value >>= 24;
-    break;
-  case AVRMCExpr::VK_PM_LO8:
-  case AVRMCExpr::VK_LO8_GS:
-    Value >>= 1; // Program memory addresses must always be shifted by one.
-    Value &= 0xff;
-    break;
-  case AVRMCExpr::VK_PM_HI8:
-  case AVRMCExpr::VK_HI8_GS:
-    Value >>= 1; // Program memory addresses must always be shifted by one.
-    Value &= 0xff00;
-    Value >>= 8;
-    break;
-  case AVRMCExpr::VK_PM_HH8:
-    Value >>= 1; // Program memory addresses must always be shifted by one.
-    Value &= 0xff0000;
-    Value >>= 16;
-    break;
-  case AVRMCExpr::VK_PM:
-  case AVRMCExpr::VK_GS:
-    Value >>= 1; // Program memory addresses must always be shifted by one.
-    break;
-
-  case AVRMCExpr::VK_AVR_NONE:
-  default:
-    llvm_unreachable("Uninitialized expression.");
-  }
-  return static_cast<uint64_t>(Value) & 0xff;
-}
-
-AVR::Fixups AVRMCExpr::getFixupKind() const {
-  AVR::Fixups Kind = AVR::Fixups::LastTargetFixupKind;
-
-  switch (specifier) {
-  case VK_LO8:
-    Kind = isNegated() ? AVR::fixup_lo8_ldi_neg : AVR::fixup_lo8_ldi;
-    break;
-  case VK_HI8:
-    Kind = isNegated() ? AVR::fixup_hi8_ldi_neg : AVR::fixup_hi8_ldi;
-    break;
-  case VK_HH8:
-    Kind = isNegated() ? AVR::fixup_hh8_ldi_neg : AVR::fixup_hh8_ldi;
-    break;
-  case VK_HHI8:
-    Kind = isNegated() ? AVR::fixup_ms8_ldi_neg : AVR::fixup_ms8_ldi;
-    break;
-
-  case VK_PM_LO8:
-    Kind = isNegated() ? AVR::fixup_lo8_ldi_pm_neg : AVR::fixup_lo8_ldi_pm;
-    break;
-  case VK_PM_HI8:
-    Kind = isNegated() ? AVR::fixup_hi8_ldi_pm_neg : AVR::fixup_hi8_ldi_pm;
-    break;
-  case VK_PM_HH8:
-    Kind = isNegated() ? AVR::fixup_hh8_ldi_pm_neg : AVR::fixup_hh8_ldi_pm;
-    break;
-  case VK_PM:
-  case VK_GS:
-    Kind = AVR::fixup_16_pm;
-    break;
-  case VK_LO8_GS:
-    Kind = AVR::fixup_lo8_ldi_gs;
-    break;
-  case VK_HI8_GS:
-    Kind = AVR::fixup_hi8_ldi_gs;
-    break;
-
-  default:
-    llvm_unreachable("Uninitialized expression");
-  }
-
-  return Kind;
-}
-
-const char *AVRMCExpr::getName() const {
-  const auto &Modifier =
-      llvm::find_if(ModifierNames, [this](ModifierEntry const &Mod) {
-        return Mod.specifier == specifier;
-      });
-
-  if (Modifier != std::end(ModifierNames)) {
-    return Modifier->Spelling;
-  }
-  return nullptr;
-}
-
-AVRMCExpr::Specifier AVRMCExpr::parseSpecifier(StringRef Name) {
-  const auto &Modifier =
-      llvm::find_if(ModifierNames, [&Name](ModifierEntry const &Mod) {
-        return Mod.Spelling == Name;
-      });
-
-  if (Modifier != std::end(ModifierNames)) {
-    return Modifier->specifier;
-  }
-  return VK_AVR_NONE;
-}
-
-} // end of namespace llvm
+} // namespace llvm
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
index 69c60cde1f74..d72d36f10858 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
@@ -20,29 +20,6 @@ class AVRMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = Spec;
   /// Specifies the type of an expression.
-  enum {
-    VK_None,
-
-    VK_AVR_NONE = MCSymbolRefExpr::FirstTargetSpecifier,
-
-    VK_HI8,  ///< Corresponds to `hi8()`.
-    VK_LO8,  ///< Corresponds to `lo8()`.
-    VK_HH8,  ///< Corresponds to `hlo8() and hh8()`.
-    VK_HHI8, ///< Corresponds to `hhi8()`.
-
-    VK_PM,     ///< Corresponds to `pm()`, reference to program memory.
-    VK_PM_LO8, ///< Corresponds to `pm_lo8()`.
-    VK_PM_HI8, ///< Corresponds to `pm_hi8()`.
-    VK_PM_HH8, ///< Corresponds to `pm_hh8()`.
-
-    VK_LO8_GS, ///< Corresponds to `lo8(gs())`.
-    VK_HI8_GS, ///< Corresponds to `hi8(gs())`.
-    VK_GS,     ///< Corresponds to `gs()`.
-
-    VK_DIFF8,
-    VK_DIFF16,
-    VK_DIFF32,
-  };
 
 public:
   /// Creates an AVR machine code expression.

From 25dcd231bfee1120c21b102e074542c54fb7c5c2 Mon Sep 17 00:00:00 2001
From: zGoldthorpe <Zach.Goldthorpe@amd.com>
Date: Mon, 16 Jun 2025 10:16:47 -0600
Subject: [PATCH 0513/1322] [IPO] Added attributor for identifying invariant
 loads (#141800)

The attributor conservatively marks pointers whose loads are eligible to
be marked as `!invariant.load`.
It does so by identifying:
1. Pointers marked `noalias` and `readonly`
2. Pointers whose underlying objects are all eligible for invariant
loads.

The attributor then manifests this attribute at non-atomic non-volatile
load instructions.
---
 llvm/include/llvm/Transforms/IPO/Attributor.h |  41 ++
 llvm/lib/Transforms/IPO/Attributor.cpp        |   2 +
 .../Transforms/IPO/AttributorAttributes.cpp   | 339 ++++++++++++++++
 .../Attributor/AMDGPU/tag-invariant-loads.ll  | 382 ++++++++++++++++++
 .../Attributor/dereferenceable-1.ll           |   1 -
 .../Attributor/value-simplify-local-remote.ll |  22 +-
 6 files changed, 772 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll

diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index e6eb756df987..f19f3292c479 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -6335,6 +6335,47 @@ struct AAUnderlyingObjects : AbstractAttribute {
                           AA::ValueScope Scope = AA::Interprocedural) const = 0;
 };
 
+/// An abstract interface for identifying pointers from which loads can be
+/// marked invariant.
+struct AAInvariantLoadPointer : public AbstractAttribute {
+  AAInvariantLoadPointer(const IRPosition &IRP) : AbstractAttribute(IRP) {}
+
+  /// See AbstractAttribute::isValidIRPositionForInit
+  static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) {
+    if (!IRP.getAssociatedType()->isPointerTy())
+      return false;
+
+    return AbstractAttribute::isValidIRPositionForInit(A, IRP);
+  }
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAInvariantLoadPointer &createForPosition(const IRPosition &IRP,
+                                                   Attributor &A);
+
+  /// Return true if the pointer's contents are known to remain invariant.
+  virtual bool isKnownInvariant() const = 0;
+  virtual bool isKnownLocallyInvariant() const = 0;
+
+  /// Return true if the pointer's contents are assumed to remain invariant.
+  virtual bool isAssumedInvariant() const = 0;
+  virtual bool isAssumedLocallyInvariant() const = 0;
+
+  /// See AbstractAttribute::getName().
+  StringRef getName() const override { return "AAInvariantLoadPointer"; }
+
+  /// See AbstractAttribute::getIdAddr().
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAInvariantLoadPointer
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address).
+  static const char ID;
+};
+
 /// An abstract interface for address space information.
 struct AAAddressSpace : public StateWrapper<BooleanState, AbstractAttribute> {
   AAAddressSpace(const IRPosition &IRP, Attributor &A)
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index dac1f7a30c37..a2548258ddaf 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -3612,6 +3612,8 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
       if (SimplifyAllLoads)
         getAssumedSimplified(IRPosition::value(I), nullptr,
                              UsedAssumedInformation, AA::Intraprocedural);
+      getOrCreateAAFor<AAInvariantLoadPointer>(
+          IRPosition::value(*LI->getPointerOperand()));
       getOrCreateAAFor<AAAddressSpace>(
           IRPosition::value(*LI->getPointerOperand()));
     } else {
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 3799a696f67a..5cb8f888354b 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -191,6 +191,7 @@ PIPE_OPERATOR(AAInterFnReachability)
 PIPE_OPERATOR(AAPointerInfo)
 PIPE_OPERATOR(AAAssumptionInfo)
 PIPE_OPERATOR(AAUnderlyingObjects)
+PIPE_OPERATOR(AAInvariantLoadPointer)
 PIPE_OPERATOR(AAAddressSpace)
 PIPE_OPERATOR(AAAllocationInfo)
 PIPE_OPERATOR(AAIndirectCallInfo)
@@ -12533,6 +12534,342 @@ private:
 };
 } // namespace
 
+/// --------------------- Invariant Load Pointer -------------------------------
+namespace {
+
+struct AAInvariantLoadPointerImpl
+    : public StateWrapper<BitIntegerState<uint8_t, 15>,
+                          AAInvariantLoadPointer> {
+
+  enum {
+    // pointer does not alias within the bounds of the function
+    IS_NOALIAS = 1 << 0,
+    // pointer is not involved in any effectful instructions within the bounds
+    // of the function
+    IS_NOEFFECT = 1 << 1,
+    // loads are invariant within the bounds of the function
+    IS_LOCALLY_INVARIANT = 1 << 2,
+    // memory lifetime is constrained within the bounds of the function
+    IS_LOCALLY_CONSTRAINED = 1 << 3,
+
+    IS_BEST_STATE = IS_NOALIAS | IS_NOEFFECT | IS_LOCALLY_INVARIANT |
+                    IS_LOCALLY_CONSTRAINED,
+  };
+  static_assert(getBestState() == IS_BEST_STATE, "Unexpected best state");
+
+  using Base =
+      StateWrapper<BitIntegerState<uint8_t, 15>, AAInvariantLoadPointer>;
+
+  // the BitIntegerState is optimistic about IS_NOALIAS and IS_NOEFFECT, but
+  // pessimistic about IS_KNOWN_INVARIANT
+  AAInvariantLoadPointerImpl(const IRPosition &IRP, Attributor &A)
+      : Base(IRP) {}
+
+  bool isKnownInvariant() const final {
+    return isKnownLocallyInvariant() && isKnown(IS_LOCALLY_CONSTRAINED);
+  }
+
+  bool isKnownLocallyInvariant() const final {
+    if (isKnown(IS_LOCALLY_INVARIANT))
+      return true;
+    return isKnown(IS_NOALIAS | IS_NOEFFECT);
+  }
+
+  bool isAssumedInvariant() const final {
+    return isAssumedLocallyInvariant() && isAssumed(IS_LOCALLY_CONSTRAINED);
+  }
+
+  bool isAssumedLocallyInvariant() const final {
+    if (isAssumed(IS_LOCALLY_INVARIANT))
+      return true;
+    return isAssumed(IS_NOALIAS | IS_NOEFFECT);
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+    Changed |= updateNoAlias(A);
+    if (requiresNoAlias() && !isAssumed(IS_NOALIAS))
+      return indicatePessimisticFixpoint();
+
+    Changed |= updateNoEffect(A);
+
+    Changed |= updateLocalInvariance(A);
+
+    return Changed;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    if (!isKnownInvariant())
+      return ChangeStatus::UNCHANGED;
+
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
+    const Value *Ptr = &getAssociatedValue();
+    const auto TagInvariantLoads = [&](const Use &U, bool &) {
+      if (U.get() != Ptr)
+        return true;
+      auto *I = dyn_cast<Instruction>(U.getUser());
+      if (!I)
+        return true;
+
+      // Ensure that we are only changing uses from the corresponding callgraph
+      // SSC in the case that the AA isn't run on the entire module
+      if (!A.isRunOn(I->getFunction()))
+        return true;
+
+      if (I->hasMetadata(LLVMContext::MD_invariant_load))
+        return true;
+
+      if (auto *LI = dyn_cast<LoadInst>(I)) {
+        LI->setMetadata(LLVMContext::MD_invariant_load,
+                        MDNode::get(LI->getContext(), {}));
+        Changed = ChangeStatus::CHANGED;
+      }
+      return true;
+    };
+
+    (void)A.checkForAllUses(TagInvariantLoads, *this, *Ptr);
+    return Changed;
+  }
+
+  /// See AbstractAttribute::getAsStr().
+  const std::string getAsStr(Attributor *) const override {
+    if (isKnownInvariant())
+      return "load-invariant pointer";
+    return "non-invariant pointer";
+  }
+
+  /// See AbstractAttribute::trackStatistics().
+  void trackStatistics() const override {}
+
+private:
+  /// Indicate that noalias is required for the pointer to be invariant.
+  bool requiresNoAlias() const {
+    switch (getPositionKind()) {
+    default:
+      // Conservatively default to require noalias.
+      return true;
+    case IRP_FLOAT:
+    case IRP_RETURNED:
+    case IRP_CALL_SITE:
+      return false;
+    case IRP_CALL_SITE_RETURNED: {
+      const auto &CB = cast<CallBase>(getAnchorValue());
+      return !isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+          &CB, /*MustPreserveNullness=*/false);
+    }
+    case IRP_ARGUMENT: {
+      const Function *F = getAssociatedFunction();
+      assert(F && "no associated function for argument");
+      return !isCallableCC(F->getCallingConv());
+    }
+    }
+  }
+
+  bool isExternal() const {
+    const Function *F = getAssociatedFunction();
+    if (!F)
+      return true;
+    return isCallableCC(F->getCallingConv()) &&
+           getPositionKind() != IRP_CALL_SITE_RETURNED;
+  }
+
+  ChangeStatus updateNoAlias(Attributor &A) {
+    if (isKnown(IS_NOALIAS) || !isAssumed(IS_NOALIAS))
+      return ChangeStatus::UNCHANGED;
+
+    // Try to use AANoAlias.
+    if (const auto *ANoAlias = A.getOrCreateAAFor<AANoAlias>(
+            getIRPosition(), this, DepClassTy::REQUIRED)) {
+      if (ANoAlias->isKnownNoAlias()) {
+        addKnownBits(IS_NOALIAS);
+        return ChangeStatus::CHANGED;
+      }
+
+      if (!ANoAlias->isAssumedNoAlias()) {
+        removeAssumedBits(IS_NOALIAS);
+        return ChangeStatus::CHANGED;
+      }
+
+      return ChangeStatus::UNCHANGED;
+    }
+
+    // Try to infer noalias from argument attribute, since it is applicable for
+    // the duration of the function.
+    if (const Argument *Arg = getAssociatedArgument()) {
+      if (Arg->hasNoAliasAttr()) {
+        addKnownBits(IS_NOALIAS);
+        return ChangeStatus::UNCHANGED;
+      }
+
+      // Noalias information is not provided, and cannot be inferred,
+      // so we conservatively assume the pointer aliases.
+      removeAssumedBits(IS_NOALIAS);
+      return ChangeStatus::CHANGED;
+    }
+
+    return ChangeStatus::UNCHANGED;
+  }
+
+  ChangeStatus updateNoEffect(Attributor &A) {
+    if (isKnown(IS_NOEFFECT) || !isAssumed(IS_NOEFFECT))
+      return ChangeStatus::UNCHANGED;
+
+    if (!getAssociatedFunction())
+      return indicatePessimisticFixpoint();
+
+    const auto HasNoEffectLoads = [&](const Use &U, bool &) {
+      const auto *LI = dyn_cast<LoadInst>(U.getUser());
+      return !LI || !LI->mayHaveSideEffects();
+    };
+    if (!A.checkForAllUses(HasNoEffectLoads, *this, getAssociatedValue()))
+      return indicatePessimisticFixpoint();
+
+    // Try to use AAMemoryBehavior to infer readonly attribute.
+    if (const auto *AMemoryBehavior = A.getOrCreateAAFor<AAMemoryBehavior>(
+            getIRPosition(), this, DepClassTy::REQUIRED)) {
+      if (!AMemoryBehavior->isAssumedReadOnly())
+        return indicatePessimisticFixpoint();
+
+      if (AMemoryBehavior->isKnownReadOnly()) {
+        addKnownBits(IS_NOEFFECT);
+        return ChangeStatus::UNCHANGED;
+      }
+
+      return ChangeStatus::UNCHANGED;
+    }
+
+    if (const Argument *Arg = getAssociatedArgument()) {
+      if (Arg->onlyReadsMemory()) {
+        addKnownBits(IS_NOEFFECT);
+        return ChangeStatus::UNCHANGED;
+      }
+
+      // Readonly information is not provided, and cannot be inferred from
+      // AAMemoryBehavior.
+      return indicatePessimisticFixpoint();
+    }
+
+    return ChangeStatus::UNCHANGED;
+  }
+
+  ChangeStatus updateLocalInvariance(Attributor &A) {
+    if (isKnown(IS_LOCALLY_INVARIANT) || !isAssumed(IS_LOCALLY_INVARIANT))
+      return ChangeStatus::UNCHANGED;
+
+    // try to infer invariance from underlying objects
+    const auto *AUO = A.getOrCreateAAFor<AAUnderlyingObjects>(
+        getIRPosition(), this, DepClassTy::REQUIRED);
+    if (!AUO)
+      return ChangeStatus::UNCHANGED;
+
+    bool UsedAssumedInformation = false;
+    const auto IsLocallyInvariantLoadIfPointer = [&](const Value &V) {
+      if (!V.getType()->isPointerTy())
+        return true;
+      const auto *IsInvariantLoadPointer =
+          A.getOrCreateAAFor<AAInvariantLoadPointer>(IRPosition::value(V), this,
+                                                     DepClassTy::REQUIRED);
+      // Conservatively fail if invariance cannot be inferred.
+      if (!IsInvariantLoadPointer)
+        return false;
+
+      if (IsInvariantLoadPointer->isKnownLocallyInvariant())
+        return true;
+      if (!IsInvariantLoadPointer->isAssumedLocallyInvariant())
+        return false;
+
+      UsedAssumedInformation = true;
+      return true;
+    };
+    if (!AUO->forallUnderlyingObjects(IsLocallyInvariantLoadIfPointer))
+      return indicatePessimisticFixpoint();
+
+    if (const auto *CB = dyn_cast<CallBase>(&getAnchorValue())) {
+      if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+              CB, /*MustPreserveNullness=*/false)) {
+        for (const Value *Arg : CB->args()) {
+          if (!IsLocallyInvariantLoadIfPointer(*Arg))
+            return indicatePessimisticFixpoint();
+        }
+      }
+    }
+
+    if (!UsedAssumedInformation) {
+      // Pointer is known and not just assumed to be locally invariant.
+      addKnownBits(IS_LOCALLY_INVARIANT);
+      return ChangeStatus::CHANGED;
+    }
+
+    return ChangeStatus::UNCHANGED;
+  }
+};
+
+struct AAInvariantLoadPointerFloating final : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerFloating(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+};
+
+struct AAInvariantLoadPointerReturned final : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerReturned(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+
+  void initialize(Attributor &) override {
+    removeAssumedBits(IS_LOCALLY_CONSTRAINED);
+  }
+};
+
+struct AAInvariantLoadPointerCallSiteReturned final
+    : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerCallSiteReturned(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+
+  void initialize(Attributor &A) override {
+    const Function *F = getAssociatedFunction();
+    assert(F && "no associated function for return from call");
+
+    if (!F->isDeclaration() && !F->isIntrinsic())
+      return AAInvariantLoadPointerImpl::initialize(A);
+
+    const auto &CB = cast<CallBase>(getAnchorValue());
+    if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+            &CB, /*MustPreserveNullness=*/false))
+      return AAInvariantLoadPointerImpl::initialize(A);
+
+    if (F->onlyReadsMemory() && F->hasNoSync())
+      return AAInvariantLoadPointerImpl::initialize(A);
+
+    // At this point, the function is opaque, so we conservatively assume
+    // non-invariance.
+    indicatePessimisticFixpoint();
+  }
+};
+
+struct AAInvariantLoadPointerArgument final : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerArgument(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+
+  void initialize(Attributor &) override {
+    const Function *F = getAssociatedFunction();
+    assert(F && "no associated function for argument");
+
+    if (!isCallableCC(F->getCallingConv())) {
+      addKnownBits(IS_LOCALLY_CONSTRAINED);
+      return;
+    }
+
+    if (!F->hasLocalLinkage())
+      removeAssumedBits(IS_LOCALLY_CONSTRAINED);
+  }
+};
+
+struct AAInvariantLoadPointerCallSiteArgument final
+    : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerCallSiteArgument(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+};
+} // namespace
+
 /// ------------------------ Address Space  ------------------------------------
 namespace {
 
@@ -13038,6 +13375,7 @@ const char AAInterFnReachability::ID = 0;
 const char AAPointerInfo::ID = 0;
 const char AAAssumptionInfo::ID = 0;
 const char AAUnderlyingObjects::ID = 0;
+const char AAInvariantLoadPointer::ID = 0;
 const char AAAddressSpace::ID = 0;
 const char AAAllocationInfo::ID = 0;
 const char AAIndirectCallInfo::ID = 0;
@@ -13172,6 +13510,7 @@ CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialValues)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUndef)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFPClass)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPointerInfo)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAInvariantLoadPointer)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAddressSpace)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAllocationInfo)
 
diff --git a/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll
new file mode 100644
index 000000000000..ace68a19bf41
--- /dev/null
+++ b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll
@@ -0,0 +1,382 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=attributor %s -S | FileCheck %s --check-prefix=AMDGCN
+
+@G = addrspace(1) global i32 zeroinitializer, align 4
+declare void @clobber(i32) #0
+declare ptr addrspace(1) @get_ptr() #0
+declare noalias ptr addrspace(1) @get_noalias_ptr() #0
+declare noalias ptr addrspace(1) @get_untouched_ptr() #1
+
+define void @test_nonkernel(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define void @test_nonkernel(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6:[0-9]+]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be !invariant.load, as the caller may modify %ptr
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_plain(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_plain(
+; AMDGCN-SAME: ptr addrspace(1) nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be !invariant.load, as %ptr may alias a pointer in @clobber
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_noalias_ptr(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_noalias_ptr(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0:![0-9]+]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_gep(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_gep(
+; AMDGCN-SAME: ptr addrspace(1) nofree readonly align 4 captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+  %val = load i32, ptr addrspace(1) %gep, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_noalias_gep(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_noalias_gep(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly align 4 captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+  %val = load i32, ptr addrspace(1) %gep, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_swap(ptr addrspace(1) noalias %ptr, i32 inreg %swap) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_swap(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]], i32 inreg [[SWAP:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    store i32 [[SWAP]], ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; cannot be !invariant.load due to the write to %ptr
+  store i32 %swap, ptr addrspace(1) %ptr, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_volatile(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_volatile(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef align 4 [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load volatile i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load volatile i32, ptr addrspace(1) %ptr, align 4
+  ;; volatiles loads cannot be !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_unordered(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_unordered(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] unordered, align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load atomic i32, ptr addrspace(1) %ptr unordered, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_monotonic(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_monotonic(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] monotonic, align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load atomic i32, ptr addrspace(1) %ptr monotonic, align 4
+  ;; atomic loads with ordering guarantees may have side effects
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_global() {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_global(
+; AMDGCN-SAME: ) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) @G, align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) @G, align 4
+  ;; is not an !invariant.load as global variables may change
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define internal i32 @test_internal_noalias_load(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define internal i32 @test_internal_noalias_load(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    ret i32 [[VAL]]
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; is an !invariant.load due to its only caller @test_call_internal_noalias
+  ret i32 %val
+}
+
+define amdgpu_kernel void @test_call_internal_noalias(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal_noalias(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = call i32 @test_internal_noalias_load(ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7:[0-9]+]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = call i32 @test_internal_noalias_load(ptr addrspace(1) %ptr)
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define internal i32 @test_internal_load(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define internal i32 @test_internal_load(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    ret i32 [[VAL]]
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be an !invariant.load since the pointer in @test_call_internal may alias
+  ret i32 %val
+}
+
+define amdgpu_kernel void @test_call_internal(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal(
+; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = call i32 @test_internal_load(ptr addrspace(1) nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = call i32 @test_internal_load(ptr addrspace(1) %ptr)
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define internal i32 @test_internal_written(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define internal i32 @test_internal_written(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    ret i32 [[VAL]]
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; cannot be an !invariant.load because of the write in caller @test_call_internal_written
+  ret i32 %val
+}
+
+define amdgpu_kernel void @test_call_internal_written(ptr addrspace(1) noalias %ptr, i32 inreg %x) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal_written(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree captures(none) [[PTR:%.*]], i32 inreg [[X:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = call i32 @test_internal_written(ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7]]
+; AMDGCN-NEXT:    store i32 [[X]], ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = call i32 @test_internal_written(ptr addrspace(1) %ptr)
+  store i32 %x, ptr addrspace(1) %ptr
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_call_ptr() {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_ptr(
+; AMDGCN-SAME: ) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = call align 4 ptr addrspace(1) @get_ptr() #[[ATTR6]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(1) @get_ptr()
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be an !invariant.load since %ptr may alias
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_call_noalias_ptr() {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_noalias_ptr(
+; AMDGCN-SAME: ) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = call align 4 ptr addrspace(1) @get_noalias_ptr() #[[ATTR6]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(1) @get_noalias_ptr()
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be an !invariant.load since %ptr may have been written to before returning
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_call_untouched_ptr() {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_untouched_ptr(
+; AMDGCN-SAME: ) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = call noalias align 4 ptr addrspace(1) @get_untouched_ptr() #[[ATTR8:[0-9]+]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(1) @get_untouched_ptr()
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_make_buffer(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer(
+; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 noundef 0) #[[ATTR9:[0-9]+]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0)
+  %val = load i32, ptr addrspace(7) %rsrc, align 4
+  ;; original %ptr may alias
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_make_buffer_noalias(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer_noalias(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 noundef 0) #[[ATTR9]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0)
+  %val = load i32, ptr addrspace(7) %rsrc, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_selected_load(i1 inreg %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) noalias %ptr.false) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_selected_load(
+; AMDGCN-SAME: i1 inreg [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PTR_TRUE]], ptr addrspace(1) [[PTR_FALSE]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = select i1 %cond, ptr addrspace(1) %ptr.true, ptr addrspace(1) %ptr.false
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; either pointer yields an !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_selected_load_partial_noalias(i1 inreg %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) %ptr.false) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_selected_load_partial_noalias(
+; AMDGCN-SAME: i1 inreg [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PTR_TRUE]], ptr addrspace(1) [[PTR_FALSE]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = select i1 %cond, ptr addrspace(1) %ptr.true, ptr addrspace(1) %ptr.false
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; %ptr.false may alias, so no !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_branch_load(i1 %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) noalias %ptr.false) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_branch_load(
+; AMDGCN-SAME: i1 noundef [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:  [[ENTRY:.*:]]
+; AMDGCN-NEXT:    br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; AMDGCN:       [[TRUE]]:
+; AMDGCN-NEXT:    call void @clobber(i32 noundef 1) #[[ATTR6]]
+; AMDGCN-NEXT:    br label %[[FINISH:.*]]
+; AMDGCN:       [[FALSE]]:
+; AMDGCN-NEXT:    br label %[[FINISH]]
+; AMDGCN:       [[FINISH]]:
+; AMDGCN-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ [[PTR_TRUE]], %[[TRUE]] ], [ [[PTR_FALSE]], %[[FALSE]] ]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+entry:
+  br i1 %cond, label %true, label %false
+true:
+  call void @clobber(i32 1)
+  br label %finish
+false:
+  br label %finish
+finish:
+  %ptr = phi ptr addrspace(1) [ %ptr.true, %true ], [ %ptr.false, %false ]
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; either pointer yields an !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_branch_load_partial_noalias(i1 %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) %ptr.false) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_branch_load_partial_noalias(
+; AMDGCN-SAME: i1 noundef [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:  [[ENTRY:.*:]]
+; AMDGCN-NEXT:    br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; AMDGCN:       [[TRUE]]:
+; AMDGCN-NEXT:    call void @clobber(i32 noundef 1) #[[ATTR6]]
+; AMDGCN-NEXT:    br label %[[FINISH:.*]]
+; AMDGCN:       [[FALSE]]:
+; AMDGCN-NEXT:    br label %[[FINISH]]
+; AMDGCN:       [[FINISH]]:
+; AMDGCN-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ [[PTR_TRUE]], %[[TRUE]] ], [ [[PTR_FALSE]], %[[FALSE]] ]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+entry:
+  br i1 %cond, label %true, label %false
+true:
+  call void @clobber(i32 1)
+  br label %finish
+false:
+  br label %finish
+finish:
+  %ptr = phi ptr addrspace(1) [ %ptr.true, %true ], [ %ptr.false, %false ]
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; ptr.false may alias, so no !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+attributes #0 = { nofree norecurse nosync nounwind willreturn }
+attributes #1 = { nofree norecurse nosync nounwind willreturn readonly }
+;.
+; AMDGCN: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
index 07e2d5ea1575..5bff2a2e6b20 100644
--- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll
+++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
@@ -207,7 +207,6 @@ define void @f7_1(ptr %ptr, i1 %cnd) {
 ; CHECK-LABEL: define {{[^@]+}}@f7_1
 ; CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[PTR:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[A:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
-; CHECK-NEXT:    [[PTR_0:%.*]] = load i32, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[B:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
 ; CHECK-NEXT:    br i1 [[CND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if.true:
diff --git a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll
index 374d5ba7ff52..4767244800d2 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll
@@ -135,7 +135,7 @@ define internal %S @foo.1(ptr %foo.this) {
 ; TUNIT-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; TUNIT-NEXT:    store ptr [[FOO_THIS]], ptr [[FOO_THIS]], align 8
 ; TUNIT-NEXT:    call void @bar.2(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[FOO_THIS]]) #[[ATTR5:[0-9]+]]
-; TUNIT-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; TUNIT-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8:![0-9]+]]
 ; TUNIT-NEXT:    ret [[S]] [[FOO_RET]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite)
@@ -145,7 +145,7 @@ define internal %S @foo.1(ptr %foo.this) {
 ; CGSCC-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; CGSCC-NEXT:    store ptr [[FOO_THIS]], ptr [[FOO_THIS]], align 8
 ; CGSCC-NEXT:    call void @bar.2(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[FOO_THIS]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; CGSCC-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8:![0-9]+]]
 ; CGSCC-NEXT:    ret [[S]] [[FOO_RET]]
 ;
 entry:
@@ -234,7 +234,7 @@ define internal %S @bar.5(ptr %this) {
 ; TUNIT-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; TUNIT-NEXT:    store ptr [[THIS]], ptr [[THIS]], align 8
 ; TUNIT-NEXT:    call void @baz.6(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) [[RETVAL]], ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4]]
-; TUNIT-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; TUNIT-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]]
 ; TUNIT-NEXT:    ret [[S]] [[BAR_RET]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite)
@@ -244,7 +244,7 @@ define internal %S @bar.5(ptr %this) {
 ; CGSCC-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; CGSCC-NEXT:    store ptr [[THIS]], ptr [[THIS]], align 8
 ; CGSCC-NEXT:    call void @baz.6(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR9:[0-9]+]]
-; CGSCC-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; CGSCC-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]]
 ; CGSCC-NEXT:    ret [[S]] [[BAR_RET]]
 ;
 entry:
@@ -286,7 +286,7 @@ define internal void @boom(ptr %this, ptr %data) {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    [[DATA_ADDR:%.*]] = alloca ptr, i32 0, align 8
 ; TUNIT-NEXT:    store ptr [[DATA]], ptr [[DATA_ADDR]], align 8
-; TUNIT-NEXT:    [[V:%.*]] = load ptr, ptr [[DATA_ADDR]], align 8
+; TUNIT-NEXT:    [[V:%.*]] = load ptr, ptr [[DATA_ADDR]], align 8, !invariant.load [[META8]]
 ; TUNIT-NEXT:    store ptr [[V]], ptr [[THIS]], align 8
 ; TUNIT-NEXT:    ret void
 ;
@@ -342,14 +342,6 @@ define %S.2 @t3.helper() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[S_2:%.*]], align 8
 ; CHECK-NEXT:    call void @ext1(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]])
-; CHECK-NEXT:    [[DOTFCA_0_LOAD:%.*]] = load ptr, ptr [[RETVAL]], align 8
-; CHECK-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [[S_2]] poison, ptr [[DOTFCA_0_LOAD]], 0
-; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [[S_2]], ptr [[RETVAL]], i32 0, i32 1
-; CHECK-NEXT:    [[DOTFCA_1_LOAD:%.*]] = load i64, ptr [[DOTFCA_1_GEP]], align 8
-; CHECK-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [[S_2]] [[DOTFCA_0_INSERT]], i64 [[DOTFCA_1_LOAD]], 1
-; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [[S_2]], ptr [[RETVAL]], i32 0, i32 2
-; CHECK-NEXT:    [[DOTFCA_2_LOAD:%.*]] = load i64, ptr [[DOTFCA_2_GEP]], align 8
-; CHECK-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [[S_2]] [[DOTFCA_1_INSERT]], i64 [[DOTFCA_2_LOAD]], 2
 ; CHECK-NEXT:    ret [[S_2]] zeroinitializer
 ;
 entry:
@@ -508,7 +500,7 @@ define internal %S @t4a(ptr %this) {
 ; CGSCC-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, i32 0, align 8
 ; CGSCC-NEXT:    store ptr [[THIS]], ptr [[THIS]], align 8
 ; CGSCC-NEXT:    call void @t4b(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[THIS]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TMP0:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; CGSCC-NEXT:    [[TMP0:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]]
 ; CGSCC-NEXT:    ret [[S]] [[TMP0]]
 ;
 entry:
@@ -623,6 +615,7 @@ entry:
 ; TUNIT: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
 ; TUNIT: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2}
 ; TUNIT: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; TUNIT: [[META8]] = !{}
 ;.
 ; CGSCC: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]}
 ; CGSCC: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
@@ -632,4 +625,5 @@ entry:
 ; CGSCC: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
 ; CGSCC: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2}
 ; CGSCC: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; CGSCC: [[META8]] = !{}
 ;.

From 1bd4f9719faac77f368a7bdfdb47ead56a808375 Mon Sep 17 00:00:00 2001
From: Leandro Lupori <leandro.lupori@linaro.org>
Date: Mon, 16 Jun 2025 13:20:30 -0300
Subject: [PATCH 0514/1322] [flang][OpenMP] Put taskgroup in a new scope
 (#144122)

Although taskgroup is a privatizing construct, because of
task_reduction clause, a new scope was not being created for it.
This could cause an extra privatization of variables when
taskgroup was lowered, because its scope would be the same as of
the parent privatizing construct.

This fixes regressions in tests 1052_0201 and 1052_0205, from
Fujitsu testsuite.

This issue didn't happen before because implicit symbols were
being created in a different way before #142154.
---
 flang/lib/Semantics/resolve-names.cpp        |  1 -
 flang/test/Lower/OpenMP/implicit-dsa.f90     | 23 ++++++++------
 flang/test/Lower/OpenMP/taskgroup02.f90      | 32 ++++++++++++++++++++
 flang/test/Semantics/OpenMP/implicit-dsa.f90 |  6 ++--
 4 files changed, 49 insertions(+), 13 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/taskgroup02.f90

diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index e23e91b674a7..f66918e5c140 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1729,7 +1729,6 @@ bool OmpVisitor::NeedsScope(const parser::OpenMPBlockConstruct &x) {
   switch (beginDir.v) {
   case llvm::omp::Directive::OMPD_master:
   case llvm::omp::Directive::OMPD_ordered:
-  case llvm::omp::Directive::OMPD_taskgroup:
     return false;
   default:
     return true;
diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90
index f0f149bb415b..0d2db63edfe7 100644
--- a/flang/test/Lower/OpenMP/implicit-dsa.f90
+++ b/flang/test/Lower/OpenMP/implicit-dsa.f90
@@ -5,6 +5,14 @@
 
 ! Privatizers
 
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = firstprivate} @[[TEST7_Y_FIRSTPRIV:.*]] : i32
+! CHECK-SAME:  copy {
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = firstprivate} @[[TEST7_X_FIRSTPRIV:.*]] : i32
+! CHECK-SAME:  copy {
+
 ! CHECK-LABEL: omp.private
 ! CHECK-SAME:      {type = private} @[[TEST6_Y_PRIV:.*]] : i32
 ! CHECK-NOT:   copy {
@@ -277,22 +285,19 @@ subroutine implicit_dsa_test6
   !$omp end task
 end subroutine
 
-! Test taskgroup - it uses the same scope as task.
+! Test taskgroup.
 !CHECK-LABEL: func @_QPimplicit_dsa_test7
 !CHECK:       %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test7Ex"}
 !CHECK:       %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:       %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_test7Ey"}
 !CHECK:       %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:       omp.task {
+!CHECK:       omp.task private(@[[TEST7_X_FIRSTPRIV]] %[[X_DECL]]#0 -> %[[PRIV_X:[^,]*]],
+!CHECK-SAME:      @[[TEST7_Y_FIRSTPRIV]] %[[Y_DECL]]#0 -> %[[PRIV_Y:.*]] : !fir.ref<i32>, !fir.ref<i32>) {
+!CHECK:         %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test7Ex"}
+!CHECK:         %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"}
 !CHECK:         omp.taskgroup {
-!CHECK-NEXT:      %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test7Ex"}
-!CHECK-NEXT:      %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK-NEXT:      %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      %[[TEMP:.*]] = fir.load %[[PRIV_Y_DECL]]#0 : !fir.ref<i32>
 !CHECK-NEXT:      hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref<i32>
-!CHECK-NEXT:      %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test7Ey"}
-!CHECK-NEXT:      %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK-NEXT:      %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
-!CHECK-NEXT:      hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 : i32, !fir.ref<i32>
 !CHECK:         }
 !CHECK:       }
 subroutine implicit_dsa_test7
diff --git a/flang/test/Lower/OpenMP/taskgroup02.f90 b/flang/test/Lower/OpenMP/taskgroup02.f90
new file mode 100644
index 000000000000..1e996a030c23
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskgroup02.f90
@@ -0,0 +1,32 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! Check that variables are not privatized twice when TASKGROUP is used.
+
+!CHECK-LABEL: func.func @_QPsub() {
+!CHECK:         omp.parallel {
+!CHECK:           %[[PAR_I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsubEi"}
+!CHECK:           omp.master {
+!CHECK:             omp.taskgroup {
+!CHECK-NEXT:          omp.task private(@_QFsubEi_firstprivate_i32 %[[PAR_I]]#0 -> %[[TASK_I:.*]] : !fir.ref<i32>) {
+!CHECK:                 %[[TASK_I_DECL:.*]]:2 = hlfir.declare %[[TASK_I]] {uniq_name = "_QFsubEi"}
+!CHECK:               }
+!CHECK:             }
+!CHECK:           }
+!CHECK:         }
+
+subroutine sub()
+  integer, dimension(10) :: a
+  integer :: i
+
+  !$omp parallel
+    !$omp master
+      do i=1,10
+       !$omp taskgroup
+         !$omp task shared(a)
+           a(i) = 1
+         !$omp end task
+       !$omp end taskgroup
+      end do
+    !$omp end master
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90
index 4a07e256e2bb..1ee777d6b972 100644
--- a/flang/test/Semantics/OpenMP/implicit-dsa.f90
+++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90
@@ -141,7 +141,7 @@ subroutine implicit_dsa_test6
   !$omp end task
 end subroutine
 
-! Test taskgroup - it uses the same scope as task.
+! Test taskgroup.
 !DEF: /implicit_dsa_test7 (Subroutine) Subprogram
 subroutine implicit_dsa_test7
   !DEF: /implicit_dsa_test7/x ObjectEntity INTEGER(4)
@@ -150,8 +150,8 @@ subroutine implicit_dsa_test7
 
   !$omp task
     !$omp taskgroup
-      !DEF: /implicit_dsa_test7/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
-      !DEF: /implicit_dsa_test7/OtherConstruct1/y (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      !DEF: /implicit_dsa_test7/OtherConstruct1/OtherConstruct1/x HostAssoc INTEGER(4)
+      !DEF: /implicit_dsa_test7/OtherConstruct1/OtherConstruct1/y HostAssoc INTEGER(4)
       x = y
     !$omp end taskgroup
   !$omp end task

From 22d9ea1b636d2c72a24fb0a8ce5216d609164635 Mon Sep 17 00:00:00 2001
From: Igor Wodiany <igor.wodiany@imgtec.com>
Date: Mon, 16 Jun 2025 17:41:52 +0100
Subject: [PATCH 0515/1322] [mlir][spirv] Add definition for GL Length
 (#144041)

A canonicalization pattern from `spirv.GL.Length` to `spirv.GL.FAbs` for scalar operands is also added.
---
 .../mlir/Dialect/SPIRV/IR/SPIRVGLOps.td       | 40 +++++++++++
 .../Dialect/SPIRV/IR/SPIRVCanonicalization.td |  8 +++
 .../SPIRV/IR/SPIRVGLCanonicalization.cpp      |  4 +-
 mlir/test/Dialect/SPIRV/IR/gl-ops.mlir        | 66 +++++++++++++++++++
 .../SPIRV/Transforms/gl-canonicalize.mlir     | 22 +++++++
 mlir/test/Target/SPIRV/gl-ops.mlir            |  4 ++
 6 files changed, 142 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
index 2ec61758ba8e..8c4da9b2dce1 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
@@ -1160,6 +1160,46 @@ def SPIRV_GLFMixOp :
 
 // -----
 
+def SPIRV_GLLengthOp : SPIRV_GLOp<"Length", 66, [
+    Pure,
+    TypesMatchWith<"result type must match operand element type",
+                  "operand", "result",
+                  "::mlir::getElementTypeOrSelf($_self)">
+  ]> {
+  let summary = "Return the length of a vector x";
+
+  let description = [{
+    Result is the length of vector x, i.e., sqrt(x[0]**2 + x[1]**2 + ...).
+
+    The operand x must be a scalar or vector whose component type is floating-point.
+
+    Result Type must be a scalar of the same type as the component type of x.
+
+    #### Example:
+
+    ```mlir
+    %2 = spirv.GL.Length %0 : vector<3xf32> -> f32
+    %3 = spirv.GL.Length %1 : f32 -> f32
+    ```
+  }];
+
+  let arguments = (ins
+    SPIRV_ScalarOrVectorOf<SPIRV_Float>:$operand
+  );
+
+  let results = (outs
+    SPIRV_Float:$result
+  );
+
+  let assemblyFormat = [{
+    $operand attr-dict `:` type($operand) `->` type($result)
+  }];
+
+  let hasVerifier = 0;
+}
+
+// -----
+
 def SPIRV_GLDistanceOp : SPIRV_GLOp<"Distance", 67, [
     Pure,
     AllTypesMatch<["p0", "p1"]>,
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td
index e8d2274d29aa..39fbab8f37a2 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td
@@ -75,3 +75,11 @@ def ConvertComparisonIntoClamp2_#CmpClampPair[0] : Pat<
         )),
     (CmpClampPair[1] $input, $min, $max)>;
 }
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.Length -> spirv.GL.FAbs
+//===----------------------------------------------------------------------===//
+
+def ConvertGLLengthToGLFAbs : Pat<
+    (SPIRV_GLLengthOp SPIRV_Float:$operand),
+    (SPIRV_GLFAbsOp $operand)>;
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp
index 3ad8057a58dc..46acb8c156fc 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp
@@ -34,8 +34,8 @@ void populateSPIRVGLCanonicalizationPatterns(RewritePatternSet &results) {
               ConvertComparisonIntoClamp2_SPIRV_SLessThanOp,
               ConvertComparisonIntoClamp2_SPIRV_SLessThanEqualOp,
               ConvertComparisonIntoClamp2_SPIRV_ULessThanOp,
-              ConvertComparisonIntoClamp2_SPIRV_ULessThanEqualOp>(
-      results.getContext());
+              ConvertComparisonIntoClamp2_SPIRV_ULessThanEqualOp,
+              ConvertGLLengthToGLFAbs>(results.getContext());
 }
 } // namespace spirv
 } // namespace mlir
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index 642346cc40b0..5c5d94c40e57 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -1000,3 +1000,69 @@ func.func @unpack_half_2x16_scalar_out(%arg0 : i32) -> () {
   %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> f32
   return
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.Length
+//===----------------------------------------------------------------------===//
+
+func.func @length(%arg0 : f32) -> () {
+  // CHECK: spirv.GL.Length {{%.*}} : f32 -> f32
+  %0 = spirv.GL.Length %arg0 : f32 -> f32
+  return
+}
+
+func.func @lengthvec(%arg0 : vector<3xf32>) -> () {
+  // CHECK: spirv.GL.Length {{%.*}} : vector<3xf32> -> f32
+  %0 = spirv.GL.Length %arg0 : vector<3xf32> -> f32
+  return
+}
+
+// -----
+
+func.func @length_i32_in(%arg0 : i32) -> () {
+  // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'i32'}}
+  %0 = spirv.GL.Length %arg0 : i32 -> f32
+  return
+}
+
+// -----
+
+func.func @length_f16_in(%arg0 : f16) -> () {
+  // expected-error @+1 {{op failed to verify that result type must match operand element type}}
+  %0 = spirv.GL.Length %arg0 : f16 -> f32
+  return
+}
+
+// -----
+
+func.func @length_i32vec_in(%arg0 : vector<3xi32>) -> () {
+  // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'vector<3xi32>'}}
+  %0 = spirv.GL.Length %arg0 : vector<3xi32> -> f32
+  return
+}
+
+// -----
+
+func.func @length_f16vec_in(%arg0 : vector<3xf16>) -> () {
+  // expected-error @+1 {{op failed to verify that result type must match operand element type}}
+  %0 = spirv.GL.Length %arg0 : vector<3xf16> -> f32
+  return
+}
+
+// -----
+
+func.func @length_i32_out(%arg0 : vector<3xf32>) -> () {
+  // expected-error @+1 {{op result #0 must be 16/32/64-bit float, but got 'i32'}}
+  %0 = spirv.GL.Length %arg0 : vector<3xf32> -> i32
+  return
+}
+
+// -----
+
+func.func @length_vec_out(%arg0 : vector<3xf32>) -> () {
+  // expected-error @+1 {{op result #0 must be 16/32/64-bit float, but got 'vector<3xf32>'}}
+  %0 = spirv.GL.Length %arg0 : vector<3xf32> -> vector<3xf32>
+  return
+}
diff --git a/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir
index c1447b38f0a4..33b877667512 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir
@@ -177,3 +177,25 @@ func.func @clamp_ulessthanequal(%input: i32, %min: i32, %max: i32) -> i32 {
   // CHECK-NEXT: spirv.ReturnValue [[RES]]
   spirv.ReturnValue %2 : i32
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.Length
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: @convert_length_into_fabs_scalar
+func.func @convert_length_into_fabs_scalar(%arg0 : f32) -> f32 {
+  //CHECK: spirv.GL.FAbs {{%.*}} : f32
+  //CHECK-NOT: spirv.GL.Length
+  %0 = spirv.GL.Length %arg0 : f32 -> f32
+  spirv.ReturnValue %0 : f32
+}
+
+// CHECK-LABEL: @dont_convert_length_into_fabs_vec
+func.func @dont_convert_length_into_fabs_vec(%arg0 : vector<3xf32>) -> f32 {
+  //CHECK: spirv.GL.Length {{%.*}} : vector<3xf32> -> f32
+  //CHECK-NOT: spirv.GL.FAbs
+  %0 = spirv.GL.Length %arg0 : vector<3xf32> -> f32
+  spirv.ReturnValue %0 : f32
+}
diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir
index eacf36bfba9c..832f7ea2fe31 100644
--- a/mlir/test/Target/SPIRV/gl-ops.mlir
+++ b/mlir/test/Target/SPIRV/gl-ops.mlir
@@ -128,6 +128,10 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %8 = spirv.GL.FindSMsb %arg3 : vector<3xi32>
     // CHECK: {{%.*}} = spirv.GL.FindUMsb {{%.*}} : vector<3xi32>
     %9 = spirv.GL.FindUMsb %arg3 : vector<3xi32>
+    // CHECK: {{%.*}} = spirv.GL.Length {{%.*}} : f32 -> f32
+    %10 = spirv.GL.Length %arg0 : f32 -> f32
+    // CHECK: {{%.*}} = spirv.GL.Length {{%.*}} : vector<3xf32> -> f32
+    %11 = spirv.GL.Length %arg1 : vector<3xf32> -> f32
     spirv.Return
   }
 

From 8bbef3d1c9115b3c64365e9b8e4ee84275a4d001 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <95053726+gandhi56@users.noreply.github.com>
Date: Mon, 16 Jun 2025 12:46:54 -0400
Subject: [PATCH 0516/1322] [InstCombine] Iterative replacement in PtrReplacer
 (#137215)

This patch enhances the PtrReplacer as follows:
1. Users are now collected iteratively to be generous on the stack. In
the case of PHIs with incoming values which have not yet been visited,
they are pushed back into the stack for reconsideration.
2. Replace users of the pointer root in a reverse-postorder traversal,
instead of a simple traversal over the collected users. This reordering
ensures that the operands of an instruction are replaced before
replacing the instruction itself.
3. During the replacement of PHI, use the same incoming value if it does
not have a replacement.

This patch specifically fixes the case when an incoming value of a PHI
is addrspacecasted.
---
 .../InstCombineLoadStoreAlloca.cpp            | 163 ++++++++++--------
 .../InstCombine/AMDGPU/ptr-replace-alloca.ll  |  79 +++++++++
 2 files changed, 174 insertions(+), 68 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index a9751ab03e20..9aec90120d8b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -243,11 +243,10 @@ public:
   void replacePointer(Value *V);
 
 private:
-  bool collectUsersRecursive(Instruction &I);
   void replace(Instruction *I);
-  Value *getReplacement(Value *I);
+  Value *getReplacement(Value *V) const { return WorkMap.lookup(V); }
   bool isAvailable(Instruction *I) const {
-    return I == &Root || Worklist.contains(I);
+    return I == &Root || UsersToReplace.contains(I);
   }
 
   bool isEqualOrValidAddrSpaceCast(const Instruction *I,
@@ -259,8 +258,7 @@ private:
     return (FromAS == ToAS) || IC.isValidAddrSpaceCast(FromAS, ToAS);
   }
 
-  SmallPtrSet<Instruction *, 32> ValuesToRevisit;
-  SmallSetVector<Instruction *, 4> Worklist;
+  SmallSetVector<Instruction *, 32> UsersToReplace;
   MapVector<Value *, Value *> WorkMap;
   InstCombinerImpl &IC;
   Instruction &Root;
@@ -269,72 +267,79 @@ private:
 } // end anonymous namespace
 
 bool PointerReplacer::collectUsers() {
-  if (!collectUsersRecursive(Root))
-    return false;
+  SmallVector<Instruction *> Worklist;
+  SmallSetVector<Instruction *, 32> ValuesToRevisit;
 
-  // Ensure that all outstanding (indirect) users of I
-  // are inserted into the Worklist. Return false
-  // otherwise.
-  return llvm::set_is_subset(ValuesToRevisit, Worklist);
-}
+  auto PushUsersToWorklist = [&](Instruction *Inst) {
+    for (auto *U : Inst->users())
+      if (auto *I = dyn_cast<Instruction>(U))
+        if (!isAvailable(I) && !ValuesToRevisit.contains(I))
+          Worklist.emplace_back(I);
+  };
 
-bool PointerReplacer::collectUsersRecursive(Instruction &I) {
-  for (auto *U : I.users()) {
-    auto *Inst = cast<Instruction>(&*U);
+  PushUsersToWorklist(&Root);
+  while (!Worklist.empty()) {
+    Instruction *Inst = Worklist.pop_back_val();
     if (auto *Load = dyn_cast<LoadInst>(Inst)) {
       if (Load->isVolatile())
         return false;
-      Worklist.insert(Load);
+      UsersToReplace.insert(Load);
     } else if (auto *PHI = dyn_cast<PHINode>(Inst)) {
-      // All incoming values must be instructions for replacability
-      if (any_of(PHI->incoming_values(),
-                 [](Value *V) { return !isa<Instruction>(V); }))
-        return false;
-
-      // If at least one incoming value of the PHI is not in Worklist,
-      // store the PHI for revisiting and skip this iteration of the
-      // loop.
-      if (any_of(PHI->incoming_values(), [this](Value *V) {
-            return !isAvailable(cast<Instruction>(V));
+      /// TODO: Handle poison and null pointers for PHI and select.
+      // If all incoming values are available, mark this PHI as
+      // replacable and push it's users into the worklist.
+      bool IsReplacable = true;
+      if (all_of(PHI->incoming_values(), [&](Value *V) {
+            if (!isa<Instruction>(V))
+              return IsReplacable = false;
+            return isAvailable(cast<Instruction>(V));
           })) {
-        ValuesToRevisit.insert(Inst);
+        UsersToReplace.insert(PHI);
+        PushUsersToWorklist(PHI);
         continue;
       }
 
-      Worklist.insert(PHI);
-      if (!collectUsersRecursive(*PHI))
+      // Either an incoming value is not an instruction or not all
+      // incoming values are available. If this PHI was already
+      // visited prior to this iteration, return false.
+      if (!IsReplacable || !ValuesToRevisit.insert(PHI))
         return false;
+
+      // Push PHI back into the stack, followed by unavailable
+      // incoming values.
+      Worklist.emplace_back(PHI);
+      for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); ++Idx) {
+        auto *IncomingValue = cast<Instruction>(PHI->getIncomingValue(Idx));
+        if (UsersToReplace.contains(IncomingValue))
+          continue;
+        if (!ValuesToRevisit.insert(IncomingValue))
+          return false;
+        Worklist.emplace_back(IncomingValue);
+      }
     } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
-      if (!isa<Instruction>(SI->getTrueValue()) ||
-          !isa<Instruction>(SI->getFalseValue()))
+      auto *TrueInst = dyn_cast<Instruction>(SI->getTrueValue());
+      auto *FalseInst = dyn_cast<Instruction>(SI->getFalseValue());
+      if (!TrueInst || !FalseInst)
         return false;
 
-      if (!isAvailable(cast<Instruction>(SI->getTrueValue())) ||
-          !isAvailable(cast<Instruction>(SI->getFalseValue()))) {
-        ValuesToRevisit.insert(Inst);
-        continue;
-      }
-      Worklist.insert(SI);
-      if (!collectUsersRecursive(*SI))
-        return false;
-    } else if (isa<GetElementPtrInst>(Inst)) {
-      Worklist.insert(Inst);
-      if (!collectUsersRecursive(*Inst))
-        return false;
+      UsersToReplace.insert(SI);
+      PushUsersToWorklist(SI);
+    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+      UsersToReplace.insert(GEP);
+      PushUsersToWorklist(GEP);
     } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
       if (MI->isVolatile())
         return false;
-      Worklist.insert(Inst);
+      UsersToReplace.insert(Inst);
     } else if (isEqualOrValidAddrSpaceCast(Inst, FromAS)) {
-      Worklist.insert(Inst);
-      if (!collectUsersRecursive(*Inst))
-        return false;
+      UsersToReplace.insert(Inst);
+      PushUsersToWorklist(Inst);
     } else if (Inst->isLifetimeStartOrEnd()) {
       continue;
     } else {
       // TODO: For arbitrary uses with address space mismatches, should we check
       // if we can introduce a valid addrspacecast?
-      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
+      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *Inst << '\n');
       return false;
     }
   }
@@ -342,7 +347,39 @@ bool PointerReplacer::collectUsersRecursive(Instruction &I) {
   return true;
 }
 
-Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
+void PointerReplacer::replacePointer(Value *V) {
+  assert(cast<PointerType>(Root.getType()) != cast<PointerType>(V->getType()) &&
+         "Invalid usage");
+  WorkMap[&Root] = V;
+  SmallVector<Instruction *> Worklist;
+  SetVector<Instruction *> PostOrderWorklist;
+  SmallPtrSet<Instruction *, 32> Visited;
+
+  // Perform a postorder traversal of the users of Root.
+  Worklist.push_back(&Root);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+
+    // If I has not been processed before, push each of its
+    // replacable users into the worklist.
+    if (Visited.insert(I).second) {
+      for (auto *U : I->users()) {
+        auto *UserInst = cast<Instruction>(U);
+        if (UsersToReplace.contains(UserInst))
+          Worklist.push_back(UserInst);
+      }
+      // Otherwise, users of I have already been pushed into
+      // the PostOrderWorklist. Push I as well.
+    } else {
+      PostOrderWorklist.insert(I);
+      Worklist.pop_back();
+    }
+  }
+
+  // Replace pointers in reverse-postorder.
+  for (Instruction *I : reverse(PostOrderWorklist))
+    replace(I);
+}
 
 void PointerReplacer::replace(Instruction *I) {
   if (getReplacement(I))
@@ -364,13 +401,15 @@ void PointerReplacer::replace(Instruction *I) {
     // replacement (new value).
     WorkMap[NewI] = NewI;
   } else if (auto *PHI = dyn_cast<PHINode>(I)) {
-    Type *NewTy = getReplacement(PHI->getIncomingValue(0))->getType();
-    auto *NewPHI = PHINode::Create(NewTy, PHI->getNumIncomingValues(),
-                                   PHI->getName(), PHI->getIterator());
-    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I)
-      NewPHI->addIncoming(getReplacement(PHI->getIncomingValue(I)),
-                          PHI->getIncomingBlock(I));
-    WorkMap[PHI] = NewPHI;
+    // Create a new PHI by replacing any incoming value that is a user of the
+    // root pointer and has a replacement.
+    Value *V = WorkMap.lookup(PHI->getIncomingValue(0));
+    PHI->mutateType(V ? V->getType() : PHI->getIncomingValue(0)->getType());
+    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I) {
+      Value *V = WorkMap.lookup(PHI->getIncomingValue(I));
+      PHI->setIncomingValue(I, V ? V : PHI->getIncomingValue(I));
+    }
+    WorkMap[PHI] = PHI;
   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
     auto *V = getReplacement(GEP->getPointerOperand());
     assert(V && "Operand not replaced");
@@ -434,18 +473,6 @@ void PointerReplacer::replace(Instruction *I) {
   }
 }
 
-void PointerReplacer::replacePointer(Value *V) {
-#ifndef NDEBUG
-  auto *PT = cast<PointerType>(Root.getType());
-  auto *NT = cast<PointerType>(V->getType());
-  assert(PT != NT && "Invalid usage");
-#endif
-  WorkMap[&Root] = V;
-
-  for (Instruction *Workitem : Worklist)
-    replace(Workitem);
-}
-
 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
   if (auto *I = simplifyAllocaArraySize(*this, AI, DT))
     return I;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
new file mode 100644
index 000000000000..538cc19f9722
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S < %s | FileCheck %s
+
+%struct.type = type { [256 x <2 x i64>] }
+@g1 = external hidden addrspace(3) global %struct.type, align 16
+
+; This test requires the PtrReplacer to replace users in an RPO traversal.
+; Furthermore, %ptr.else need not to be replaced so it must be retained in
+; %ptr.sink.
+define <2 x i64> @func(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
+; CHECK-LABEL: define <2 x i64> @func(
+; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[CMP_0]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[VAL_THEN:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+; CHECK-NEXT:    br label %[[SINK:.*]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    [[PTR_ELSE:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+; CHECK-NEXT:    br label %[[SINK]]
+; CHECK:       [[SINK]]:
+; CHECK-NEXT:    [[PTR_SINK:%.*]] = phi ptr [ [[PTR_ELSE]], %[[IF_ELSE]] ], [ [[VAL_THEN]], %[[IF_THEN]] ]
+; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_SINK]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
+;
+entry:
+  %coerce = alloca %struct.type, align 16, addrspace(5)
+  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
+  br i1 %cmp.0, label %if.then, label %if.else
+
+if.then:                                    ; preds = %entry
+  %ptr.then = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
+  %val.then = addrspacecast ptr addrspace(5) %ptr.then to ptr
+  br label %sink
+
+if.else:                                      ; preds = %entry
+  %ptr.else = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+  %val.else = getelementptr inbounds nuw i8, ptr %ptr.else, i64 0
+  br label %sink
+
+sink:
+  %ptr.sink = phi ptr [ %val.else, %if.else ], [ %val.then, %if.then ]
+  %val.sink = load <2 x i64>, ptr %ptr.sink, align 16
+  ret <2 x i64> %val.sink
+}
+
+define <2 x i64> @func_phi_loop(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
+; CHECK-LABEL: define <2 x i64> @func_phi_loop(
+; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[VAL_0:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[PTR_PHI_R:%.*]] = phi ptr [ [[PTR_1:%.*]], %[[LOOP]] ], [ [[VAL_0]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[PTR_1]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+; CHECK-NEXT:    br i1 [[CMP_0]], label %[[LOOP]], label %[[SINK:.*]]
+; CHECK:       [[SINK]]:
+; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_PHI_R]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
+;
+entry:
+  %coerce = alloca %struct.type, align 16, addrspace(5)
+  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
+  %ptr.0 = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
+  %val.0 = addrspacecast ptr addrspace(5) %ptr.0 to ptr
+  br label %loop
+
+loop:
+  %ptr.phi = phi ptr [ %val.1, %loop ], [ %val.0, %entry ]
+  %ptr.1 = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+  %val.1 = getelementptr inbounds nuw i8, ptr %ptr.1, i64 0
+  br i1 %cmp.0, label %loop, label %sink
+
+sink:
+  %val.sink = load <2 x i64>, ptr %ptr.phi, align 16
+  ret <2 x i64> %val.sink
+}
+
+declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0

From d1dc080a858ca47c314334fb14f1ecb605fb4371 Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Mon, 16 Jun 2025 17:53:34 +0100
Subject: [PATCH 0517/1322] [lldb-dap] show function name in the instruction
 comment. (#144070)

putting the function name is the dissassembly instruction messes up the
alignment making it less readable. put it instead with the comment.

This also aligns the opcodes and instruction to the left matching the
cli
---
 .../Handler/DisassembleRequestHandler.cpp     | 34 ++++++++-----------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp
index d5878d18289d..85214b84b5c9 100644
--- a/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp
@@ -100,7 +100,7 @@ static DisassembledInstruction ConvertSBInstructionToDisassembledInstruction(
 
   const char *m = inst.GetMnemonic(target);
   const char *o = inst.GetOperands(target);
-  const char *c = inst.GetComment(target);
+  std::string c = inst.GetComment(target);
   auto d = inst.GetData(target);
 
   std::string bytes;
@@ -114,34 +114,30 @@ static DisassembledInstruction ConvertSBInstructionToDisassembledInstruction(
 
   DisassembledInstruction disassembled_inst;
   disassembled_inst.address = inst_addr;
-  disassembled_inst.instructionBytes =
-      bytes.size() > 0 ? bytes.substr(0, bytes.size() - 1) : "";
 
-  std::string instruction;
-  llvm::raw_string_ostream si(instruction);
+  if (!bytes.empty()) // remove last whitespace
+    bytes.pop_back();
+  disassembled_inst.instructionBytes = std::move(bytes);
+
+  llvm::raw_string_ostream si(disassembled_inst.instruction);
+  si << llvm::formatv("{0,-7} {1,-25}", m, o);
 
-  lldb::SBSymbol symbol = addr.GetSymbol();
   // Only add the symbol on the first line of the function.
-  if (symbol.IsValid() && symbol.GetStartAddress() == addr) {
-    // If we have a valid symbol, append it as a label prefix for the first
-    // instruction. This is so you can see the start of a function/callsite
-    // in the assembly, at the moment VS Code (1.80) does not visualize the
-    // symbol associated with the assembly instruction.
-    si << (symbol.GetMangledName() != nullptr ? symbol.GetMangledName()
-                                              : symbol.GetName())
-       << ": ";
+  // in the comment section
+  if (lldb::SBSymbol symbol = addr.GetSymbol();
+      symbol.GetStartAddress() == addr) {
+    const llvm::StringRef sym_display_name = symbol.GetDisplayName();
+    c.append(" ");
+    c.append(sym_display_name);
 
     if (resolve_symbols)
-      disassembled_inst.symbol = symbol.GetDisplayName();
+      disassembled_inst.symbol = sym_display_name;
   }
 
-  si << llvm::formatv("{0,7} {1,12}", m, o);
-  if (c && c[0]) {
+  if (!c.empty()) {
     si << " ; " << c;
   }
 
-  disassembled_inst.instruction = std::move(instruction);
-
   protocol::Source source = CreateSource(addr, target);
   lldb::SBLineEntry line_entry = GetLineEntryForAddress(target, addr);
 

From 20a1b357c0ff3c3f71de45bae42cb2dead7b66c9 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 09:57:10 -0700
Subject: [PATCH 0518/1322] AArch64: Move AArch64MCExpr functions to
 AArch64MCAsmInfo

To migrate away from the legacy
XXXMCExpr::printImpl/evaluateAsRelocatableImpl overrides and align with
other targets.

While the AArch64MCAsmInfoXXX hooks introduce some duplication, they
enable better separation for object file formats.

Note: While AArch64MCAsmInfoDarwin uses the `@specifier` notation, it
might use AArch64MCExpr with specifier VK_ABS.
test/tools/llvm-mca/AArch64/Exynos/zero-latency-move.s abuses a parser
behavior that :lo12: is also parsed for Mach-O (though it will fail for
-filetype=obj).
---
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 137 ++++++++++++++++++
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.h   |  23 +++
 .../AArch64/MCTargetDesc/AArch64MCExpr.cpp    |  97 +------------
 .../AArch64/MCTargetDesc/AArch64MCExpr.h      |  14 +-
 .../AArch64WinCOFFObjectWriter.cpp            |   6 +-
 5 files changed, 165 insertions(+), 112 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 31965d85d9eb..b2cd1d0f4156 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -15,6 +15,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/TargetParser/Triple.h"
 using namespace llvm;
@@ -53,6 +54,80 @@ const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
     {AArch64MCExpr::M_TLVPPAGEOFF, "TLVPPAGEOFF"},
 };
 
+StringRef AArch64::getSpecifierName(const MCSpecifierExpr &Expr) {
+  // clang-format off
+  switch (static_cast<uint32_t>(Expr.getSpecifier())) {
+  case AArch64MCExpr::VK_CALL:                return "";
+  case AArch64MCExpr::VK_LO12:                return ":lo12:";
+  case AArch64MCExpr::VK_ABS_G3:              return ":abs_g3:";
+  case AArch64MCExpr::VK_ABS_G2:              return ":abs_g2:";
+  case AArch64MCExpr::VK_ABS_G2_S:            return ":abs_g2_s:";
+  case AArch64MCExpr::VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case AArch64MCExpr::VK_ABS_G1:              return ":abs_g1:";
+  case AArch64MCExpr::VK_ABS_G1_S:            return ":abs_g1_s:";
+  case AArch64MCExpr::VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case AArch64MCExpr::VK_ABS_G0:              return ":abs_g0:";
+  case AArch64MCExpr::VK_ABS_G0_S:            return ":abs_g0_s:";
+  case AArch64MCExpr::VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case AArch64MCExpr::VK_PREL_G3:             return ":prel_g3:";
+  case AArch64MCExpr::VK_PREL_G2:             return ":prel_g2:";
+  case AArch64MCExpr::VK_PREL_G2_NC:          return ":prel_g2_nc:";
+  case AArch64MCExpr::VK_PREL_G1:             return ":prel_g1:";
+  case AArch64MCExpr::VK_PREL_G1_NC:          return ":prel_g1_nc:";
+  case AArch64MCExpr::VK_PREL_G0:             return ":prel_g0:";
+  case AArch64MCExpr::VK_PREL_G0_NC:          return ":prel_g0_nc:";
+  case AArch64MCExpr::VK_DTPREL_G2:           return ":dtprel_g2:";
+  case AArch64MCExpr::VK_DTPREL_G1:           return ":dtprel_g1:";
+  case AArch64MCExpr::VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case AArch64MCExpr::VK_DTPREL_G0:           return ":dtprel_g0:";
+  case AArch64MCExpr::VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case AArch64MCExpr::VK_DTPREL_HI12:         return ":dtprel_hi12:";
+  case AArch64MCExpr::VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case AArch64MCExpr::VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case AArch64MCExpr::VK_TPREL_G2:            return ":tprel_g2:";
+  case AArch64MCExpr::VK_TPREL_G1:            return ":tprel_g1:";
+  case AArch64MCExpr::VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case AArch64MCExpr::VK_TPREL_G0:            return ":tprel_g0:";
+  case AArch64MCExpr::VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case AArch64MCExpr::VK_TPREL_HI12:          return ":tprel_hi12:";
+  case AArch64MCExpr::VK_TPREL_LO12:          return ":tprel_lo12:";
+  case AArch64MCExpr::VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case AArch64MCExpr::VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case AArch64MCExpr::VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
+  case AArch64MCExpr::VK_ABS_PAGE:            return "";
+  case AArch64MCExpr::VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
+  case AArch64MCExpr::VK_GOT:                 return ":got:";
+  case AArch64MCExpr::VK_GOT_PAGE:            return ":got:";
+  case AArch64MCExpr::VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
+  case AArch64MCExpr::VK_GOT_LO12:            return ":got_lo12:";
+  case AArch64MCExpr::VK_GOTTPREL:            return ":gottprel:";
+  case AArch64MCExpr::VK_GOTTPREL_PAGE:       return ":gottprel:";
+  case AArch64MCExpr::VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case AArch64MCExpr::VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case AArch64MCExpr::VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case AArch64MCExpr::VK_TLSDESC:             return "";
+  case AArch64MCExpr::VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  case AArch64MCExpr::VK_TLSDESC_AUTH:        return "";
+  case AArch64MCExpr::VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
+  case AArch64MCExpr::VK_SECREL_LO12:         return ":secrel_lo12:";
+  case AArch64MCExpr::VK_SECREL_HI12:         return ":secrel_hi12:";
+  case AArch64MCExpr::VK_GOT_AUTH:            return ":got_auth:";
+  case AArch64MCExpr::VK_GOT_AUTH_PAGE:       return ":got_auth:";
+  case AArch64MCExpr::VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
+  default:
+    llvm_unreachable("Invalid relocation specifier");
+  }
+  // clang-format on
+}
+
+static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
+                     const MCAssembler *Asm) {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(Expr.getSpecifier());
+  return true;
+}
+
 AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
   // We prefer NEON instructions to be printed in the short, Apple-specific
   // form when targeting Darwin.
@@ -91,6 +166,33 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
   return MCBinaryExpr::createSub(Res, PC, Context);
 }
 
+void AArch64AuthMCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
+  if (WrapSubExprInParens)
+    OS << '(';
+  getSubExpr()->print(OS, MAI);
+  if (WrapSubExprInParens)
+    OS << ')';
+
+  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
+  if (hasAddressDiversity())
+    OS << ",addr";
+  OS << ')';
+}
+
+void AArch64MCAsmInfoDarwin::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
+    return AE->print(OS, this);
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoDarwin::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   if (T.getArch() == Triple::aarch64_be)
     IsLittleEndian = false;
@@ -127,6 +229,19 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   initializeVariantKinds(ELFAtSpecifiers);
 }
 
+void AArch64MCAsmInfoELF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
+    return AE->print(OS, this);
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoELF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -146,6 +261,17 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   initializeVariantKinds(COFFAtSpecifiers);
 }
 
+void AArch64MCAsmInfoMicrosoftCOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoMicrosoftCOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -164,3 +290,14 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
 
   initializeVariantKinds(COFFAtSpecifiers);
 }
+
+void AArch64MCAsmInfoGNUCOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoGNUCOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 225e0c8e55fc..bc02586d7388 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 
+#include "MCTargetDesc/AArch64MCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
@@ -26,20 +27,42 @@ struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
   const MCExpr *
   getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
                               MCStreamer &Streamer) const override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
   explicit AArch64MCAsmInfoELF(const Triple &T);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoMicrosoftCOFF : public MCAsmInfoMicrosoft {
   explicit AArch64MCAsmInfoMicrosoftCOFF();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
   explicit AArch64MCAsmInfoGNUCOFF();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
+namespace AArch64 {
+/// Return the string representation of the ELF relocation specifier
+/// (e.g. ":got:", ":lo12:").
+StringRef getSpecifierName(const MCSpecifierExpr &Expr);
+} // namespace AArch64
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index d934af91b9ff..7a7c6f7effd9 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -12,100 +12,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MCExpr.h"
+#include "AArch64MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
-#define DEBUG_TYPE "aarch64symbolrefexpr"
-
 const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, Specifier S,
                                            MCContext &Ctx) {
   return new (Ctx) AArch64MCExpr(Expr, S);
 }
 
-StringRef AArch64MCExpr::getSpecifierName() const {
-  // clang-format off
-  switch (static_cast<uint32_t>(getSpecifier())) {
-  case VK_CALL:                return "";
-  case VK_LO12:                return ":lo12:";
-  case VK_ABS_G3:              return ":abs_g3:";
-  case VK_ABS_G2:              return ":abs_g2:";
-  case VK_ABS_G2_S:            return ":abs_g2_s:";
-  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
-  case VK_ABS_G1:              return ":abs_g1:";
-  case VK_ABS_G1_S:            return ":abs_g1_s:";
-  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
-  case VK_ABS_G0:              return ":abs_g0:";
-  case VK_ABS_G0_S:            return ":abs_g0_s:";
-  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
-  case VK_PREL_G3:             return ":prel_g3:";
-  case VK_PREL_G2:             return ":prel_g2:";
-  case VK_PREL_G2_NC:          return ":prel_g2_nc:";
-  case VK_PREL_G1:             return ":prel_g1:";
-  case VK_PREL_G1_NC:          return ":prel_g1_nc:";
-  case VK_PREL_G0:             return ":prel_g0:";
-  case VK_PREL_G0_NC:          return ":prel_g0_nc:";
-  case VK_DTPREL_G2:           return ":dtprel_g2:";
-  case VK_DTPREL_G1:           return ":dtprel_g1:";
-  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
-  case VK_DTPREL_G0:           return ":dtprel_g0:";
-  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
-  case VK_DTPREL_HI12:         return ":dtprel_hi12:";
-  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
-  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
-  case VK_TPREL_G2:            return ":tprel_g2:";
-  case VK_TPREL_G1:            return ":tprel_g1:";
-  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
-  case VK_TPREL_G0:            return ":tprel_g0:";
-  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
-  case VK_TPREL_HI12:          return ":tprel_hi12:";
-  case VK_TPREL_LO12:          return ":tprel_lo12:";
-  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
-  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
-  case VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
-  case VK_ABS_PAGE:            return "";
-  case VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
-  case VK_GOT:                 return ":got:";
-  case VK_GOT_PAGE:            return ":got:";
-  case VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
-  case VK_GOT_LO12:            return ":got_lo12:";
-  case VK_GOTTPREL:            return ":gottprel:";
-  case VK_GOTTPREL_PAGE:       return ":gottprel:";
-  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
-  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
-  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
-  case VK_TLSDESC:             return "";
-  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
-  case VK_TLSDESC_AUTH:        return "";
-  case VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
-  case VK_SECREL_LO12:         return ":secrel_lo12:";
-  case VK_SECREL_HI12:         return ":secrel_hi12:";
-  case VK_GOT_AUTH:            return ":got_auth:";
-  case VK_GOT_AUTH_PAGE:       return ":got_auth:";
-  case VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
-  default:
-    llvm_unreachable("Invalid relocation specifier");
-  }
-  // clang-format on
-}
-
-void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  OS << getSpecifierName();
-  Expr->print(OS, MAI);
-}
-
-bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                              const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(getSpecifier());
-  return true;
-}
-
 const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
                                                    uint16_t Discriminator,
                                                    AArch64PACKey::ID Key,
@@ -114,17 +33,3 @@ const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
   return new (Ctx)
       AArch64AuthMCExpr(Expr, Discriminator, Key, HasAddressDiversity);
 }
-
-void AArch64AuthMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
-  if (WrapSubExprInParens)
-    OS << '(';
-  getSubExpr()->print(OS, MAI);
-  if (WrapSubExprInParens)
-    OS << ')';
-
-  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
-  if (hasAddressDiversity())
-    OS << ",addr";
-  OS << ')';
-}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 9c383894c7f5..541f24c943a1 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -147,8 +147,6 @@ protected:
 public:
   static const AArch64MCExpr *create(const MCExpr *Expr, Specifier,
                                      MCContext &Ctx);
-  /// @name VariantKind information extractors.
-  /// @{
 
   static Specifier getSymbolLoc(Specifier S) {
     return static_cast<Specifier>(S & VK_SymLocBits);
@@ -159,16 +157,6 @@ public:
   }
 
   static bool isNotChecked(Specifier S) { return S & VK_NC; }
-
-  /// @}
-
-  /// Return the string representation of the ELF relocation specifier
-  /// (e.g. ":got:", ":lo12:").
-  StringRef getSpecifierName() const;
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 class AArch64AuthMCExpr final : public AArch64MCExpr {
@@ -189,7 +177,7 @@ public:
   uint16_t getDiscriminator() const { return Discriminator; }
   bool hasAddressDiversity() const { return getSpecifier() == VK_AUTHADDR; }
 
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
 
   static bool classof(const MCExpr *E) {
     return isa<AArch64MCExpr>(E) && classof(cast<AArch64MCExpr>(E));
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 3009bd2ca275..2e997631655e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -8,7 +8,7 @@
 
 #include "AArch64MCTargetDesc.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCAsmInfo.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/MC/MCAsmBackend.h"
@@ -73,7 +73,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
       break;
     default:
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          A64E->getSpecifierName() +
+                                          AArch64::getSpecifierName(*A64E) +
                                           " unsupported on COFF targets");
       return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value
     }
@@ -83,7 +83,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
   default: {
     if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          A64E->getSpecifierName() +
+                                          AArch64::getSpecifierName(*A64E) +
                                           " unsupported on COFF targets");
     } else {
       MCFixupKindInfo Info = MAB.getFixupKindInfo(Fixup.getKind());

From a733c6c7bb1c533ec28c96c49d3c5de7babd8b7f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 16 Jun 2025 10:04:28 -0700
Subject: [PATCH 0519/1322] [TargetLowering][RISCV] Allow scalable non-simple
 EVTs to be split even if the element type isn't a legal scalar type.
 (#144007)

This fixes an inconsistency in i64 vector handling between RV32 and
RV64. Even if i64 isn't legal as a scalar, we should still be able
to split a large i64 vector to get down to a legal vector type. We only
need to give up if we need to split a vscale x 1 vector.
---
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |    2 +-
 .../Analysis/CostModel/RISCV/cast-half.ll     |   16 +-
 llvm/test/Analysis/CostModel/RISCV/cast.ll    | 6262 ++++++-----------
 llvm/test/Analysis/CostModel/RISCV/cmp.ll     |  490 +-
 llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll    |  798 +++
 5 files changed, 3059 insertions(+), 4509 deletions(-)

diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 935afaf9dd55..b1afdc2a3ac3 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1030,7 +1030,7 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
     // If type is to be expanded, split the vector.
     //  <4 x i140> -> <2 x i140>
     if (LK.first == TypeExpandInteger) {
-      if (VT.getVectorElementCount().isScalable())
+      if (NumElts.isScalable() && NumElts.getKnownMinValue() == 1)
         return LegalizeKind(TypeScalarizeScalableVector, EltVT);
       return LegalizeKind(TypeSplitVector,
                           VT.getHalfNumVectorElementsVT(Context));
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll
index 244c42cc94ba..971b14467c0f 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll
@@ -74,7 +74,7 @@ define void @fptosi() {
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
-; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -147,7 +147,7 @@ define void @fptosi() {
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
-; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -440,7 +440,7 @@ define void @fptoui() {
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
-; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -513,7 +513,7 @@ define void @fptoui() {
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
-; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -806,7 +806,7 @@ define void @sitofp() {
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
-; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -879,7 +879,7 @@ define void @sitofp() {
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
-; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -1172,7 +1172,7 @@ define void @uitofp() {
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64f16 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64f16 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
-; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64f16 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -1245,7 +1245,7 @@ define void @uitofp() {
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %nxv64i8_nxv64f16 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %nxv64i32_nxv64f16 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
-; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %nxv64i1_nxv64f16 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll
index e498ccc73304..bdd8540a2c47 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll
@@ -3,651 +3,328 @@
 ; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64
 
 define void @sext() {
-; RV32-LABEL: 'sext'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = sext <2 x i8> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = sext <2 x i16> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = sext <2 x i16> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = sext <2 x i32> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = sext <2 x i1> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = sext <vscale x 1 x i16> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = sext <vscale x 1 x i16> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = sext <vscale x 1 x i32> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = sext <vscale x 1 x i1> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext <vscale x 1 x i1> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext <vscale x 1 x i1> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'sext'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = sext <2 x i8> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = sext <2 x i16> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = sext <2 x i16> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = sext <2 x i32> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = sext <2 x i1> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = sext <vscale x 1 x i16> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = sext <vscale x 1 x i16> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = sext <vscale x 1 x i32> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = sext <vscale x 1 x i1> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext <vscale x 1 x i1> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext <vscale x 1 x i1> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'sext'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = sext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = sext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = sext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = sext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = sext <2 x i1> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = sext <vscale x 1 x i16> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = sext <vscale x 1 x i16> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = sext <vscale x 1 x i32> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = sext <vscale x 1 x i1> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext <vscale x 1 x i1> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext <vscale x 1 x i1> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16>
   %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32>
@@ -1005,651 +682,328 @@ define void @sext() {
 }
 
 define void @zext() {
-; RV32-LABEL: 'zext'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = zext <2 x i8> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = zext <2 x i16> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = zext <2 x i16> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = zext <2 x i32> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = zext <2 x i1> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = zext <vscale x 1 x i16> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = zext <vscale x 1 x i16> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = zext <vscale x 1 x i32> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = zext <vscale x 1 x i1> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext <vscale x 1 x i1> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext <vscale x 1 x i1> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.zext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.zext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'zext'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = zext <2 x i8> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = zext <2 x i16> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = zext <2 x i16> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = zext <2 x i32> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = zext <2 x i1> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = zext <vscale x 1 x i16> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = zext <vscale x 1 x i16> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = zext <vscale x 1 x i32> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = zext <vscale x 1 x i1> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext <vscale x 1 x i1> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext <vscale x 1 x i1> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.zext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.zext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'zext'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = zext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = zext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = zext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = zext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = zext <2 x i1> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = zext <vscale x 1 x i16> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = zext <vscale x 1 x i16> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = zext <vscale x 1 x i32> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = zext <vscale x 1 x i1> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext <vscale x 1 x i1> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext <vscale x 1 x i1> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.zext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.zext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16>
   %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32>
@@ -2007,631 +1361,318 @@ define void @zext() {
 }
 
 define void @trunc() {
-; RV32-LABEL: 'trunc'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i16 = trunc <2 x i32> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i16 = trunc <2 x i64> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2i32 = trunc <2 x i64> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2i1 = trunc <2 x i8> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i16 = trunc <4 x i32> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4i16 = trunc <4 x i64> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4i32 = trunc <4 x i64> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i1 = trunc <4 x i8> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i16 = trunc <8 x i32> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_v8i16 = trunc <8 x i64> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8i32 = trunc <8 x i64> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i1 = trunc <8 x i8> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i16 = trunc <2 x i32> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i16 = trunc <2 x i64> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16i32 = trunc <2 x i64> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i1 = trunc <2 x i8> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i32_v32i16 = trunc <16 x i32> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i64_v32i16 = trunc <16 x i64> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i64_v32i32 = trunc <16 x i64> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32i1 = trunc <16 x i8> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i32_v64i16 = trunc <64 x i32> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64i16 = trunc <64 x i64> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128i32_v128i16 = trunc <128 x i32> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128i16 = trunc <128 x i64> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v256i32_v256i16 = trunc <256 x i32> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v256i64_v256i16 = trunc <256 x i64> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i16 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i16 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1i32 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1i1 = trunc <vscale x 1 x i8> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2i16 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i1 = trunc <vscale x 2 x i8> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_nxv4i16 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i1 = trunc <vscale x 4 x i8> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.trunc.nxv4i32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_nxv8i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i64_nxv8i16 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i1 = trunc <vscale x 8 x i8> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.trunc.nxv8i32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i32_nxv16i16 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16i64_nxv16i16 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16i32 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i1 = trunc <vscale x 16 x i8> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.trunc.nxv16i32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i32_nxv32i16 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32i64_nxv32i16 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32i32 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i1 = trunc <vscale x 32 x i8> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.trunc.nxv32i32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i8 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64i16 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64i16 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64i32 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i8_nxv64i1 = trunc <vscale x 64 x i8> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i1 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %vp_nxv64i64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64i64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.trunc.nxv64i32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'trunc'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i16 = trunc <2 x i32> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i16 = trunc <2 x i64> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2i32 = trunc <2 x i64> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2i1 = trunc <2 x i8> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i16 = trunc <4 x i32> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4i16 = trunc <4 x i64> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4i32 = trunc <4 x i64> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i1 = trunc <4 x i8> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i16 = trunc <8 x i32> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_v8i16 = trunc <8 x i64> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8i32 = trunc <8 x i64> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i1 = trunc <8 x i8> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i16 = trunc <2 x i32> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i16 = trunc <2 x i64> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16i32 = trunc <2 x i64> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i1 = trunc <2 x i8> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i32_v32i16 = trunc <16 x i32> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i64_v32i16 = trunc <16 x i64> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i64_v32i32 = trunc <16 x i64> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32i1 = trunc <16 x i8> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i32_v64i16 = trunc <64 x i32> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64i16 = trunc <64 x i64> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128i32_v128i16 = trunc <128 x i32> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128i16 = trunc <128 x i64> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v256i32_v256i16 = trunc <256 x i32> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v256i64_v256i16 = trunc <256 x i64> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i16 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i16 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1i32 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1i1 = trunc <vscale x 1 x i8> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2i16 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i1 = trunc <vscale x 2 x i8> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_nxv4i16 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i1 = trunc <vscale x 4 x i8> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.trunc.nxv4i32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_nxv8i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i64_nxv8i16 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i1 = trunc <vscale x 8 x i8> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.trunc.nxv8i32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i32_nxv16i16 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16i64_nxv16i16 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16i32 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i1 = trunc <vscale x 16 x i8> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.trunc.nxv16i32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i32_nxv32i16 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32i64_nxv32i16 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32i32 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i1 = trunc <vscale x 32 x i8> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.trunc.nxv32i32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64i64_nxv64i8 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64i16 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64i16 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64i32 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i8_nxv64i1 = trunc <vscale x 64 x i8> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i64_nxv64i1 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64i64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64i64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64i64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.trunc.nxv64i32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'trunc'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i16 = trunc <2 x i32> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i16 = trunc <2 x i64> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2i32 = trunc <2 x i64> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2i1 = trunc <2 x i8> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i16 = trunc <4 x i32> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4i16 = trunc <4 x i64> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4i32 = trunc <4 x i64> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i1 = trunc <4 x i8> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i16 = trunc <8 x i32> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_v8i16 = trunc <8 x i64> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8i32 = trunc <8 x i64> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i1 = trunc <8 x i8> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i16 = trunc <2 x i32> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i16 = trunc <2 x i64> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16i32 = trunc <2 x i64> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i1 = trunc <2 x i8> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i32_v32i16 = trunc <16 x i32> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i64_v32i16 = trunc <16 x i64> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i64_v32i32 = trunc <16 x i64> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32i1 = trunc <16 x i8> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i32_v64i16 = trunc <64 x i32> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64i16 = trunc <64 x i64> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128i32_v128i16 = trunc <128 x i32> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128i16 = trunc <128 x i64> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v256i32_v256i16 = trunc <256 x i32> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v256i64_v256i16 = trunc <256 x i64> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i16 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i16 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1i32 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1i1 = trunc <vscale x 1 x i8> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2i16 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i1 = trunc <vscale x 2 x i8> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_nxv4i16 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i1 = trunc <vscale x 4 x i8> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.trunc.nxv4i32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_nxv8i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i64_nxv8i16 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i1 = trunc <vscale x 8 x i8> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.trunc.nxv8i32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i32_nxv16i16 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16i64_nxv16i16 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16i32 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i1 = trunc <vscale x 16 x i8> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.trunc.nxv16i32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i32_nxv32i16 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32i64_nxv32i16 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32i32 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i1 = trunc <vscale x 32 x i8> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.trunc.nxv32i32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64i64_nxv64i8 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64i16 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64i16 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64i32 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i8_nxv64i1 = trunc <vscale x 64 x i8> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i64_nxv64i1 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64i64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64i64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64i64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.trunc.nxv64i32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
   %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2>
@@ -3386,571 +2427,288 @@ define void @fptrunc() {
 }
 
 define void @fptosi() {
-; RV32-LABEL: 'fptosi'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'fptosi'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'fptosi'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
   %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
@@ -4264,571 +3022,288 @@ define void @fptosi() {
 }
 
 define void @fptoui() {
-; RV32-LABEL: 'fptoui'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'fptoui'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'fptoui'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
   %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
@@ -5142,571 +3617,288 @@ define void @fptoui() {
 }
 
 define void @sitofp() {
-; RV32-LABEL: 'sitofp'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'sitofp'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'sitofp'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
   %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
@@ -6020,571 +4212,288 @@ define void @sitofp() {
 }
 
 define void @uitofp() {
-; RV32-LABEL: 'uitofp'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'uitofp'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'uitofp'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
   %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
@@ -6985,3 +4894,6 @@ define void @legalization_crash() {
   fptoui <192 x float> undef to <192 x i1>
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/RISCV/cmp.ll b/llvm/test/Analysis/CostModel/RISCV/cmp.ll
index 69d4f27ac41b..793f0dd2fe04 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cmp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cmp.ll
@@ -3,331 +3,168 @@
 ; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+f -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64
 
 define void @icmp() {
-; RV32-LABEL: 'icmp'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt <vscale x 1 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt <vscale x 1 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt <vscale x 1 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt <vscale x 1 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt <vscale x 1 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %86 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %87 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %88 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %89 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %90 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt <vscale x 2 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt <vscale x 2 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt <vscale x 2 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt <vscale x 2 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt <vscale x 2 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %96 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %97 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %98 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %99 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %100 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt <vscale x 4 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt <vscale x 4 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt <vscale x 4 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt <vscale x 4 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt <vscale x 4 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %106 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %107 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %108 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %109 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %110 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt <vscale x 8 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt <vscale x 8 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt <vscale x 8 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt <vscale x 8 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt <vscale x 8 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %116 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %117 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %118 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %119 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %120 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt <vscale x 16 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt <vscale x 16 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt <vscale x 16 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt <vscale x 16 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt <vscale x 16 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %126 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %127 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %128 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %129 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %130 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt <vscale x 32 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt <vscale x 32 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt <vscale x 32 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt <vscale x 32 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt <vscale x 32 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %136 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %137 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %138 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %139 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i32> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %140 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i64> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt <vscale x 64 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt <vscale x 64 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt <vscale x 64 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt <vscale x 64 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %145 = icmp slt <vscale x 64 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %146 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %147 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %148 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i16> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %149 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i32> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %150 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i64> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt <vscale x 128 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt <vscale x 128 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt <vscale x 128 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt <vscale x 128 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %155 = icmp slt <vscale x 128 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %156 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %157 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i8> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %158 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i16> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %159 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i32> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %160 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i64(<vscale x 128 x i64> undef, <vscale x 128 x i64> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'icmp'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt <vscale x 1 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt <vscale x 1 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt <vscale x 1 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt <vscale x 1 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt <vscale x 1 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %86 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %87 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %88 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %89 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %90 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt <vscale x 2 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt <vscale x 2 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt <vscale x 2 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt <vscale x 2 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt <vscale x 2 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %96 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %97 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %98 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %99 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %100 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt <vscale x 4 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt <vscale x 4 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt <vscale x 4 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt <vscale x 4 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt <vscale x 4 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %106 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %107 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %108 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %109 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %110 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt <vscale x 8 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt <vscale x 8 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt <vscale x 8 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt <vscale x 8 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt <vscale x 8 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %116 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %117 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %118 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %119 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %120 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt <vscale x 16 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt <vscale x 16 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt <vscale x 16 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt <vscale x 16 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt <vscale x 16 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %126 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %127 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %128 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %129 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %130 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt <vscale x 32 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt <vscale x 32 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt <vscale x 32 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt <vscale x 32 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt <vscale x 32 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %136 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %137 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %138 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %139 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i32> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %140 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i64> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt <vscale x 64 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt <vscale x 64 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt <vscale x 64 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt <vscale x 64 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %145 = icmp slt <vscale x 64 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %146 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %147 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %148 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i16> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %149 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i32> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %150 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i64> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt <vscale x 128 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt <vscale x 128 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt <vscale x 128 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt <vscale x 128 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %155 = icmp slt <vscale x 128 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %156 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %157 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i8> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %158 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i16> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %159 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i32> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %160 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i64(<vscale x 128 x i64> undef, <vscale x 128 x i64> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'icmp'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt <vscale x 1 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt <vscale x 1 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt <vscale x 1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt <vscale x 1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt <vscale x 1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %86 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %87 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %88 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %89 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %90 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt <vscale x 2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt <vscale x 2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt <vscale x 2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt <vscale x 2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %96 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %97 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %98 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %99 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %100 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt <vscale x 4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt <vscale x 4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt <vscale x 4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %106 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %107 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %108 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %109 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %110 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt <vscale x 8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt <vscale x 8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt <vscale x 8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %116 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %117 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %118 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %119 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %120 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt <vscale x 16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt <vscale x 16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt <vscale x 16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt <vscale x 16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt <vscale x 16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %126 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %127 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %128 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %129 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %130 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt <vscale x 32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt <vscale x 32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt <vscale x 32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt <vscale x 32 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt <vscale x 32 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %136 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %137 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %138 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %139 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i32> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %140 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i64> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt <vscale x 64 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt <vscale x 64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt <vscale x 64 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt <vscale x 64 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %145 = icmp slt <vscale x 64 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %146 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %147 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %148 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i16> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %149 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i32> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %150 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i64> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt <vscale x 128 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt <vscale x 128 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt <vscale x 128 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt <vscale x 128 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %155 = icmp slt <vscale x 128 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %156 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %157 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i8> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %158 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i16> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %159 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i32> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %160 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i64(<vscale x 128 x i64> undef, <vscale x 128 x i64> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   icmp slt <2 x i1> undef, undef
   icmp slt <2 x i8> undef, undef
@@ -658,3 +495,6 @@ define void @fcmp() {
 
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll
index 7442be92fffc..83192930f5cd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll
@@ -921,3 +921,801 @@ define <vscale x 8 x i32> @vsub_vi_mask_nxv8i32(<vscale x 8 x i32> %va, <vscale
   %vc = sub <vscale x 8 x i32> %va, %vs
   ret <vscale x 8 x i32> %vc
 }
+
+; Make sure we are able to split a type that isn't an MVT even if the scalar
+; element type isn't legal on RV32. This used to crash.
+define <vscale x 64 x i64> @vsub_vv_nxv64i64(<vscale x 64 x i64> %va, <vscale x 64 x i64> %vb) {
+; RV32-LABEL: vsub_vv_nxv64i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    .cfi_def_cfa_offset 80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 64(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s1, -12
+; RV32-NEXT:    .cfi_offset s2, -16
+; RV32-NEXT:    .cfi_offset s3, -20
+; RV32-NEXT:    .cfi_offset s4, -24
+; RV32-NEXT:    .cfi_offset s5, -28
+; RV32-NEXT:    .cfi_offset s6, -32
+; RV32-NEXT:    .cfi_offset s7, -36
+; RV32-NEXT:    .cfi_offset s8, -40
+; RV32-NEXT:    .cfi_offset s9, -44
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    mv a3, a2
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    add a3, a3, a2
+; RV32-NEXT:    slli a2, a2, 2
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sub sp, sp, a2
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0f, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 88 * vlenb
+; RV32-NEXT:    mv s2, a7
+; RV32-NEXT:    mv s3, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr s6, vlenb
+; RV32-NEXT:    slli s7, s6, 4
+; RV32-NEXT:    slli s8, s6, 3
+; RV32-NEXT:    add a1, a7, s7
+; RV32-NEXT:    vl8re64.v v8, (a1)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    add a1, a7, s8
+; RV32-NEXT:    vl8re64.v v8, (a1)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    mv s0, a0
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    call __mulsi3
+; RV32-NEXT:    mv s1, a0
+; RV32-NEXT:    add a0, s2, a0
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    call __mulsi3
+; RV32-NEXT:    mv s4, a0
+; RV32-NEXT:    add a0, s3, a0
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    call __mulsi3
+; RV32-NEXT:    mv s5, a0
+; RV32-NEXT:    add a0, s2, a0
+; RV32-NEXT:    slli s9, s6, 5
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    add a0, s3, s9
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    add a0, s2, s4
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a1, a1, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    call __mulsi3
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v16, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v16, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (s3)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    add a1, s3, a0
+; RV32-NEXT:    add a2, s2, s9
+; RV32-NEXT:    add a3, s3, s7
+; RV32-NEXT:    add a4, s2, a0
+; RV32-NEXT:    add s3, s3, s8
+; RV32-NEXT:    vl8re64.v v8, (s2)
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 3
+; RV32-NEXT:    mv a6, a5
+; RV32-NEXT:    slli a5, a5, 1
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 32
+; RV32-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (a2)
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    mv a5, a2
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 32
+; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (a4)
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 32
+; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (s3)
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 32
+; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (a3)
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v24, (a1)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v0, v0, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v16, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v16, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a2, a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v16, v24, v16
+; RV32-NEXT:    vs8r.v v0, (s0)
+; RV32-NEXT:    add s1, s0, s1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v24, (s1)
+; RV32-NEXT:    add s5, s0, s5
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v24, (s5)
+; RV32-NEXT:    add s4, s0, s4
+; RV32-NEXT:    vs8r.v v16, (s4)
+; RV32-NEXT:    add s9, s0, s9
+; RV32-NEXT:    add a0, s0, a0
+; RV32-NEXT:    add s7, s0, s7
+; RV32-NEXT:    add s0, s0, s8
+; RV32-NEXT:    vs8r.v v8, (s9)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v8, (s7)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 6
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v8, (s0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a1, a1, a0
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 64(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    .cfi_restore s0
+; RV32-NEXT:    .cfi_restore s1
+; RV32-NEXT:    .cfi_restore s2
+; RV32-NEXT:    .cfi_restore s3
+; RV32-NEXT:    .cfi_restore s4
+; RV32-NEXT:    .cfi_restore s5
+; RV32-NEXT:    .cfi_restore s6
+; RV32-NEXT:    .cfi_restore s7
+; RV32-NEXT:    .cfi_restore s8
+; RV32-NEXT:    .cfi_restore s9
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsub_vv_nxv64i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -128
+; RV64-NEXT:    .cfi_def_cfa_offset 128
+; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 96(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s9, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s1, -24
+; RV64-NEXT:    .cfi_offset s2, -32
+; RV64-NEXT:    .cfi_offset s3, -40
+; RV64-NEXT:    .cfi_offset s4, -48
+; RV64-NEXT:    .cfi_offset s5, -56
+; RV64-NEXT:    .cfi_offset s6, -64
+; RV64-NEXT:    .cfi_offset s7, -72
+; RV64-NEXT:    .cfi_offset s8, -80
+; RV64-NEXT:    .cfi_offset s9, -88
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    mv a3, a2
+; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    add a3, a3, a2
+; RV64-NEXT:    slli a2, a2, 2
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    sub sp, sp, a2
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0f, 0x72, 0x00, 0x11, 0x80, 0x01, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 128 + 88 * vlenb
+; RV64-NEXT:    mv s2, a7
+; RV64-NEXT:    mv s3, a1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 6
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr s6, vlenb
+; RV64-NEXT:    slli s7, s6, 4
+; RV64-NEXT:    slli s8, s6, 3
+; RV64-NEXT:    add a1, a7, s7
+; RV64-NEXT:    vl8re64.v v8, (a1)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    add a1, a7, s8
+; RV64-NEXT:    vl8re64.v v8, (a1)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    mv s0, a0
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    call __muldi3
+; RV64-NEXT:    mv s1, a0
+; RV64-NEXT:    add a0, s2, a0
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    li a1, 40
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    call __muldi3
+; RV64-NEXT:    mv s4, a0
+; RV64-NEXT:    add a0, s3, a0
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    call __muldi3
+; RV64-NEXT:    mv s5, a0
+; RV64-NEXT:    add a0, s2, a0
+; RV64-NEXT:    slli s9, s6, 5
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 2
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    add a0, s3, s9
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    add a0, s2, s4
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a1, a1, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    call __muldi3
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 6
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 6
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (s3)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    add a1, s3, a0
+; RV64-NEXT:    add a2, s2, s9
+; RV64-NEXT:    add a3, s3, s7
+; RV64-NEXT:    add a4, s2, a0
+; RV64-NEXT:    add s3, s3, s8
+; RV64-NEXT:    vl8re64.v v8, (s2)
+; RV64-NEXT:    csrr a5, vlenb
+; RV64-NEXT:    slli a5, a5, 3
+; RV64-NEXT:    mv a6, a5
+; RV64-NEXT:    slli a5, a5, 1
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a5, sp, a5
+; RV64-NEXT:    addi a5, a5, 32
+; RV64-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (a2)
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    mv a5, a2
+; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 32
+; RV64-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (a4)
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 32
+; RV64-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (s3)
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 32
+; RV64-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (a3)
+; RV64-NEXT:    addi a2, sp, 32
+; RV64-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v24, (a1)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v0, v0, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    addi a1, sp, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a2, a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v16, v24, v16
+; RV64-NEXT:    vs8r.v v0, (s0)
+; RV64-NEXT:    add s1, s0, s1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v24, (s1)
+; RV64-NEXT:    add s5, s0, s5
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v24, (s5)
+; RV64-NEXT:    add s4, s0, s4
+; RV64-NEXT:    vs8r.v v16, (s4)
+; RV64-NEXT:    add s9, s0, s9
+; RV64-NEXT:    add a0, s0, a0
+; RV64-NEXT:    add s7, s0, s7
+; RV64-NEXT:    add s0, s0, s8
+; RV64-NEXT:    vs8r.v v8, (s9)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v8, (s7)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 6
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v8, (s0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a1, a1, a0
+; RV64-NEXT:    slli a0, a0, 2
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 128
+; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 96(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s9, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    .cfi_restore s0
+; RV64-NEXT:    .cfi_restore s1
+; RV64-NEXT:    .cfi_restore s2
+; RV64-NEXT:    .cfi_restore s3
+; RV64-NEXT:    .cfi_restore s4
+; RV64-NEXT:    .cfi_restore s5
+; RV64-NEXT:    .cfi_restore s6
+; RV64-NEXT:    .cfi_restore s7
+; RV64-NEXT:    .cfi_restore s8
+; RV64-NEXT:    .cfi_restore s9
+; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
+  %vc = sub <vscale x 64 x i64> %va, %vb
+  ret <vscale x 64 x i64> %vc
+}

From c62a6138d9d02bcc0fb6660bbed78b4e979fc3dc Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <95053726+gandhi56@users.noreply.github.com>
Date: Mon, 16 Jun 2025 13:05:31 -0400
Subject: [PATCH 0520/1322] Revert "[InstCombine] Iterative replacement in
 PtrReplacer" (#144394)

Reverts llvm/llvm-project#137215

This commit caused a failure in the LLVM CI:
https://lab.llvm.org/buildbot/#/builders/10/builds/7442
---
 .../InstCombineLoadStoreAlloca.cpp            | 163 ++++++++----------
 .../InstCombine/AMDGPU/ptr-replace-alloca.ll  |  79 ---------
 2 files changed, 68 insertions(+), 174 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 9aec90120d8b..a9751ab03e20 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -243,10 +243,11 @@ public:
   void replacePointer(Value *V);
 
 private:
+  bool collectUsersRecursive(Instruction &I);
   void replace(Instruction *I);
-  Value *getReplacement(Value *V) const { return WorkMap.lookup(V); }
+  Value *getReplacement(Value *I);
   bool isAvailable(Instruction *I) const {
-    return I == &Root || UsersToReplace.contains(I);
+    return I == &Root || Worklist.contains(I);
   }
 
   bool isEqualOrValidAddrSpaceCast(const Instruction *I,
@@ -258,7 +259,8 @@ private:
     return (FromAS == ToAS) || IC.isValidAddrSpaceCast(FromAS, ToAS);
   }
 
-  SmallSetVector<Instruction *, 32> UsersToReplace;
+  SmallPtrSet<Instruction *, 32> ValuesToRevisit;
+  SmallSetVector<Instruction *, 4> Worklist;
   MapVector<Value *, Value *> WorkMap;
   InstCombinerImpl &IC;
   Instruction &Root;
@@ -267,79 +269,72 @@ private:
 } // end anonymous namespace
 
 bool PointerReplacer::collectUsers() {
-  SmallVector<Instruction *> Worklist;
-  SmallSetVector<Instruction *, 32> ValuesToRevisit;
+  if (!collectUsersRecursive(Root))
+    return false;
 
-  auto PushUsersToWorklist = [&](Instruction *Inst) {
-    for (auto *U : Inst->users())
-      if (auto *I = dyn_cast<Instruction>(U))
-        if (!isAvailable(I) && !ValuesToRevisit.contains(I))
-          Worklist.emplace_back(I);
-  };
+  // Ensure that all outstanding (indirect) users of I
+  // are inserted into the Worklist. Return false
+  // otherwise.
+  return llvm::set_is_subset(ValuesToRevisit, Worklist);
+}
 
-  PushUsersToWorklist(&Root);
-  while (!Worklist.empty()) {
-    Instruction *Inst = Worklist.pop_back_val();
+bool PointerReplacer::collectUsersRecursive(Instruction &I) {
+  for (auto *U : I.users()) {
+    auto *Inst = cast<Instruction>(&*U);
     if (auto *Load = dyn_cast<LoadInst>(Inst)) {
       if (Load->isVolatile())
         return false;
-      UsersToReplace.insert(Load);
+      Worklist.insert(Load);
     } else if (auto *PHI = dyn_cast<PHINode>(Inst)) {
-      /// TODO: Handle poison and null pointers for PHI and select.
-      // If all incoming values are available, mark this PHI as
-      // replacable and push it's users into the worklist.
-      bool IsReplacable = true;
-      if (all_of(PHI->incoming_values(), [&](Value *V) {
-            if (!isa<Instruction>(V))
-              return IsReplacable = false;
-            return isAvailable(cast<Instruction>(V));
+      // All incoming values must be instructions for replacability
+      if (any_of(PHI->incoming_values(),
+                 [](Value *V) { return !isa<Instruction>(V); }))
+        return false;
+
+      // If at least one incoming value of the PHI is not in Worklist,
+      // store the PHI for revisiting and skip this iteration of the
+      // loop.
+      if (any_of(PHI->incoming_values(), [this](Value *V) {
+            return !isAvailable(cast<Instruction>(V));
           })) {
-        UsersToReplace.insert(PHI);
-        PushUsersToWorklist(PHI);
+        ValuesToRevisit.insert(Inst);
         continue;
       }
 
-      // Either an incoming value is not an instruction or not all
-      // incoming values are available. If this PHI was already
-      // visited prior to this iteration, return false.
-      if (!IsReplacable || !ValuesToRevisit.insert(PHI))
+      Worklist.insert(PHI);
+      if (!collectUsersRecursive(*PHI))
         return false;
-
-      // Push PHI back into the stack, followed by unavailable
-      // incoming values.
-      Worklist.emplace_back(PHI);
-      for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); ++Idx) {
-        auto *IncomingValue = cast<Instruction>(PHI->getIncomingValue(Idx));
-        if (UsersToReplace.contains(IncomingValue))
-          continue;
-        if (!ValuesToRevisit.insert(IncomingValue))
-          return false;
-        Worklist.emplace_back(IncomingValue);
-      }
     } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
-      auto *TrueInst = dyn_cast<Instruction>(SI->getTrueValue());
-      auto *FalseInst = dyn_cast<Instruction>(SI->getFalseValue());
-      if (!TrueInst || !FalseInst)
+      if (!isa<Instruction>(SI->getTrueValue()) ||
+          !isa<Instruction>(SI->getFalseValue()))
         return false;
 
-      UsersToReplace.insert(SI);
-      PushUsersToWorklist(SI);
-    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
-      UsersToReplace.insert(GEP);
-      PushUsersToWorklist(GEP);
+      if (!isAvailable(cast<Instruction>(SI->getTrueValue())) ||
+          !isAvailable(cast<Instruction>(SI->getFalseValue()))) {
+        ValuesToRevisit.insert(Inst);
+        continue;
+      }
+      Worklist.insert(SI);
+      if (!collectUsersRecursive(*SI))
+        return false;
+    } else if (isa<GetElementPtrInst>(Inst)) {
+      Worklist.insert(Inst);
+      if (!collectUsersRecursive(*Inst))
+        return false;
     } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
       if (MI->isVolatile())
         return false;
-      UsersToReplace.insert(Inst);
+      Worklist.insert(Inst);
     } else if (isEqualOrValidAddrSpaceCast(Inst, FromAS)) {
-      UsersToReplace.insert(Inst);
-      PushUsersToWorklist(Inst);
+      Worklist.insert(Inst);
+      if (!collectUsersRecursive(*Inst))
+        return false;
     } else if (Inst->isLifetimeStartOrEnd()) {
       continue;
     } else {
       // TODO: For arbitrary uses with address space mismatches, should we check
       // if we can introduce a valid addrspacecast?
-      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *Inst << '\n');
+      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
       return false;
     }
   }
@@ -347,39 +342,7 @@ bool PointerReplacer::collectUsers() {
   return true;
 }
 
-void PointerReplacer::replacePointer(Value *V) {
-  assert(cast<PointerType>(Root.getType()) != cast<PointerType>(V->getType()) &&
-         "Invalid usage");
-  WorkMap[&Root] = V;
-  SmallVector<Instruction *> Worklist;
-  SetVector<Instruction *> PostOrderWorklist;
-  SmallPtrSet<Instruction *, 32> Visited;
-
-  // Perform a postorder traversal of the users of Root.
-  Worklist.push_back(&Root);
-  while (!Worklist.empty()) {
-    Instruction *I = Worklist.back();
-
-    // If I has not been processed before, push each of its
-    // replacable users into the worklist.
-    if (Visited.insert(I).second) {
-      for (auto *U : I->users()) {
-        auto *UserInst = cast<Instruction>(U);
-        if (UsersToReplace.contains(UserInst))
-          Worklist.push_back(UserInst);
-      }
-      // Otherwise, users of I have already been pushed into
-      // the PostOrderWorklist. Push I as well.
-    } else {
-      PostOrderWorklist.insert(I);
-      Worklist.pop_back();
-    }
-  }
-
-  // Replace pointers in reverse-postorder.
-  for (Instruction *I : reverse(PostOrderWorklist))
-    replace(I);
-}
+Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
 
 void PointerReplacer::replace(Instruction *I) {
   if (getReplacement(I))
@@ -401,15 +364,13 @@ void PointerReplacer::replace(Instruction *I) {
     // replacement (new value).
     WorkMap[NewI] = NewI;
   } else if (auto *PHI = dyn_cast<PHINode>(I)) {
-    // Create a new PHI by replacing any incoming value that is a user of the
-    // root pointer and has a replacement.
-    Value *V = WorkMap.lookup(PHI->getIncomingValue(0));
-    PHI->mutateType(V ? V->getType() : PHI->getIncomingValue(0)->getType());
-    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I) {
-      Value *V = WorkMap.lookup(PHI->getIncomingValue(I));
-      PHI->setIncomingValue(I, V ? V : PHI->getIncomingValue(I));
-    }
-    WorkMap[PHI] = PHI;
+    Type *NewTy = getReplacement(PHI->getIncomingValue(0))->getType();
+    auto *NewPHI = PHINode::Create(NewTy, PHI->getNumIncomingValues(),
+                                   PHI->getName(), PHI->getIterator());
+    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I)
+      NewPHI->addIncoming(getReplacement(PHI->getIncomingValue(I)),
+                          PHI->getIncomingBlock(I));
+    WorkMap[PHI] = NewPHI;
   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
     auto *V = getReplacement(GEP->getPointerOperand());
     assert(V && "Operand not replaced");
@@ -473,6 +434,18 @@ void PointerReplacer::replace(Instruction *I) {
   }
 }
 
+void PointerReplacer::replacePointer(Value *V) {
+#ifndef NDEBUG
+  auto *PT = cast<PointerType>(Root.getType());
+  auto *NT = cast<PointerType>(V->getType());
+  assert(PT != NT && "Invalid usage");
+#endif
+  WorkMap[&Root] = V;
+
+  for (Instruction *Workitem : Worklist)
+    replace(Workitem);
+}
+
 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
   if (auto *I = simplifyAllocaArraySize(*this, AI, DT))
     return I;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
deleted file mode 100644
index 538cc19f9722..000000000000
--- a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
+++ /dev/null
@@ -1,79 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S < %s | FileCheck %s
-
-%struct.type = type { [256 x <2 x i64>] }
-@g1 = external hidden addrspace(3) global %struct.type, align 16
-
-; This test requires the PtrReplacer to replace users in an RPO traversal.
-; Furthermore, %ptr.else need not to be replaced so it must be retained in
-; %ptr.sink.
-define <2 x i64> @func(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
-; CHECK-LABEL: define <2 x i64> @func(
-; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br i1 [[CMP_0]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
-; CHECK:       [[IF_THEN]]:
-; CHECK-NEXT:    [[VAL_THEN:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
-; CHECK-NEXT:    br label %[[SINK:.*]]
-; CHECK:       [[IF_ELSE]]:
-; CHECK-NEXT:    [[PTR_ELSE:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-; CHECK-NEXT:    br label %[[SINK]]
-; CHECK:       [[SINK]]:
-; CHECK-NEXT:    [[PTR_SINK:%.*]] = phi ptr [ [[PTR_ELSE]], %[[IF_ELSE]] ], [ [[VAL_THEN]], %[[IF_THEN]] ]
-; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_SINK]], align 16
-; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
-;
-entry:
-  %coerce = alloca %struct.type, align 16, addrspace(5)
-  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
-  br i1 %cmp.0, label %if.then, label %if.else
-
-if.then:                                    ; preds = %entry
-  %ptr.then = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
-  %val.then = addrspacecast ptr addrspace(5) %ptr.then to ptr
-  br label %sink
-
-if.else:                                      ; preds = %entry
-  %ptr.else = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-  %val.else = getelementptr inbounds nuw i8, ptr %ptr.else, i64 0
-  br label %sink
-
-sink:
-  %ptr.sink = phi ptr [ %val.else, %if.else ], [ %val.then, %if.then ]
-  %val.sink = load <2 x i64>, ptr %ptr.sink, align 16
-  ret <2 x i64> %val.sink
-}
-
-define <2 x i64> @func_phi_loop(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
-; CHECK-LABEL: define <2 x i64> @func_phi_loop(
-; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[VAL_0:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[PTR_PHI_R:%.*]] = phi ptr [ [[PTR_1:%.*]], %[[LOOP]] ], [ [[VAL_0]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[PTR_1]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-; CHECK-NEXT:    br i1 [[CMP_0]], label %[[LOOP]], label %[[SINK:.*]]
-; CHECK:       [[SINK]]:
-; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_PHI_R]], align 16
-; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
-;
-entry:
-  %coerce = alloca %struct.type, align 16, addrspace(5)
-  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
-  %ptr.0 = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
-  %val.0 = addrspacecast ptr addrspace(5) %ptr.0 to ptr
-  br label %loop
-
-loop:
-  %ptr.phi = phi ptr [ %val.1, %loop ], [ %val.0, %entry ]
-  %ptr.1 = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-  %val.1 = getelementptr inbounds nuw i8, ptr %ptr.1, i64 0
-  br i1 %cmp.0, label %loop, label %sink
-
-sink:
-  %val.sink = load <2 x i64>, ptr %ptr.phi, align 16
-  ret <2 x i64> %val.sink
-}
-
-declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0

From 6f9cd79fa2f43b8128be3e4386ee182ad5a843cc Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Mon, 16 Jun 2025 10:07:56 -0700
Subject: [PATCH 0521/1322] [InstSimplify] Add basic simplifications for
 vp.reverse (#144112)

Directly modeled after what we do for vector.reverse, but with
restrictions on EVL and mask added.
---
 llvm/lib/Analysis/InstructionSimplify.cpp     | 17 +++++++++++++++++
 .../Transforms/InstSimplify/vp-reverse.ll     | 19 ++++++-------------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index e397a228afee..d1ac8d9fbdfd 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6969,6 +6969,23 @@ static Value *simplifyIntrinsic(CallBase *Call, Value *Callee,
     }
     return nullptr;
   }
+  case Intrinsic::experimental_vp_reverse: {
+    Value *Vec = Call->getArgOperand(0);
+    Value *Mask = Call->getArgOperand(1);
+    Value *EVL = Call->getArgOperand(2);
+
+    Value *X;
+    // vp.reverse(vp.reverse(X)) == X (with all ones mask and matching EVL)
+    if (match(Mask, m_AllOnes()) &&
+        match(Vec, m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                       m_Value(X), m_AllOnes(), m_Specific(EVL))))
+      return X;
+
+    // vp.reverse(splat(X)) -> splat(X) (regardless of mask and EVL)
+    if (isSplatValue(Vec))
+      return Vec;
+    return nullptr;
+  }
   default:
     return nullptr;
   }
diff --git a/llvm/test/Transforms/InstSimplify/vp-reverse.ll b/llvm/test/Transforms/InstSimplify/vp-reverse.ll
index 3c3bb871dc61..f19a2ac8ca9e 100644
--- a/llvm/test/Transforms/InstSimplify/vp-reverse.ll
+++ b/llvm/test/Transforms/InstSimplify/vp-reverse.ll
@@ -3,9 +3,7 @@
 
 define <vscale x 4 x i32> @rev_of_rev(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @rev_of_rev(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_REV]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A:%.*]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %res = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.rev, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -25,8 +23,7 @@ define <vscale x 4 x i32> @rev_of_rev_diffevl(<vscale x 4 x i32> %a, i32 %evl) {
 
 define <vscale x 4 x i32> @rev_of_poison(i32 %evl) {
 ; CHECK-LABEL: @rev_of_poison(
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
 ;
   %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> poison, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x i32> %rev
@@ -34,8 +31,7 @@ define <vscale x 4 x i32> @rev_of_poison(i32 %evl) {
 
 define <vscale x 4 x i32> @rev_of_undef(i32 %evl) {
 ; CHECK-LABEL: @rev_of_undef(
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x i32> %rev
@@ -43,8 +39,7 @@ define <vscale x 4 x i32> @rev_of_undef(i32 %evl) {
 
 define <vscale x 4 x i32> @rev_of_zero(i32 %evl) {
 ; CHECK-LABEL: @rev_of_zero(
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
 ;
   %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x i32> %rev
@@ -54,8 +49,7 @@ define <vscale x 4 x i32> @rev_of_splat(i32 %a, i32 %evl) {
 ; CHECK-LABEL: @rev_of_splat(
 ; CHECK-NEXT:    [[A_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[A_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[A_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_VEC]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A_VEC]]
 ;
   %a.ins = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
   %a.vec = shufflevector <vscale x 4 x i32> %a.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -67,8 +61,7 @@ define <vscale x 4 x i32> @rev_of_splat2(i32 %a, <vscale x 4 x i1> %m, i32 %evl)
 ; CHECK-LABEL: @rev_of_splat2(
 ; CHECK-NEXT:    [[A_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[A_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[A_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_VEC]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A_VEC]]
 ;
   %a.ins = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
   %a.vec = shufflevector <vscale x 4 x i32> %a.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer

From 90d62e0ae352e67d808f94ffb6d215d033f4ec22 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Mon, 16 Jun 2025 10:20:09 -0700
Subject: [PATCH 0522/1322] [RISCV][TTI] Refine reverse shuffle costing for
 high LMUL (#144155)

This contains two closely related changes:
1) Explicitly recurse on the i1 case - "3" happens to be the right
   magic constant at m1, but is not otherwise correct, and we're
   better off deferring this to existing logic.
2) Match the lowering for high LMUL shuffles - we've switched to using
   a linear number of m1 vrgather instead of a single big vrgather.
   This results in substantially faster (but also larger) code for
   reverse shuffles larger than m1.  Note that fixed vectors need
   a slide at the end, but scalable ones don't.

This will have the effect of biasing the vectorizer towards larger
(particularly scalable larger) vector factors. This increases VF for the
s112 and s1112 loops from TSVC_2 (in all configurations).

We could refine the high LMUL estimates a bit more, but I think getting
the linear scaling right is probably close enough for the moment.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 84 ++++++++++++++-----
 .../Analysis/CostModel/RISCV/rvv-shuffle.ll   | 68 +++++++--------
 .../CostModel/RISCV/shuffle-reverse.ll        | 52 ++++++------
 .../RISCV/riscv-vector-reverse.ll             | 20 ++---
 4 files changed, 132 insertions(+), 92 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bee47527cf42..fcc9d3977e5c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -602,6 +602,15 @@ InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
   return FirstSlideCost + SecondSlideCost + MaskCost;
 }
 
+// Consolidate!
+static MVT getLMUL1VT(MVT VT) {
+  assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
+         "Unexpected vector MVT");
+  return MVT::getScalableVectorVT(
+      VT.getVectorElementType(),
+      RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
+}
+
 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                              VectorType *Tp, ArrayRef<int> Mask,
                                              TTI::TargetCostKind CostKind,
@@ -840,33 +849,64 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
   }
   case TTI::SK_Reverse: {
+
+    if (!LT.second.isVector())
+      return InstructionCost::getInvalid();
+
     // TODO: Cases to improve here:
     // * Illegal vector types
     // * i64 on RV32
-    // * i1 vector
-    // At low LMUL, most of the cost is producing the vrgather index register.
-    // At high LMUL, the cost of the vrgather itself will dominate.
-    // Example sequence:
-    //   csrr a0, vlenb
-    //   srli a0, a0, 3
-    //   addi a0, a0, -1
-    //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
-    //   vid.v v9
-    //   vrsub.vx v10, v9, a0
-    //   vrgather.vv v9, v8, v10
-    InstructionCost LenCost = 3;
+    if (Tp->getElementType()->isIntegerTy(1)) {
+      VectorType *WideTy =
+          VectorType::get(IntegerType::get(Tp->getContext(), 8),
+                          cast<VectorType>(Tp)->getElementCount());
+      return getCastInstrCost(Instruction::ZExt, WideTy, Tp,
+                              TTI::CastContextHint::None, CostKind) +
+             getShuffleCost(TTI::SK_Reverse, WideTy, {}, CostKind, 0, nullptr) +
+             getCastInstrCost(Instruction::Trunc, Tp, WideTy,
+                              TTI::CastContextHint::None, CostKind);
+    }
+
+    MVT ContainerVT = LT.second;
     if (LT.second.isFixedLengthVector())
-      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
-      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
-    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
-    if (LT.second.isFixedLengthVector() &&
-        isInt<5>(LT.second.getVectorNumElements() - 1))
-      Opcodes[1] = RISCV::VRSUB_VI;
+      ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
+    MVT M1VT = getLMUL1VT(ContainerVT);
+    if (ContainerVT.bitsLE(M1VT)) {
+      // Example sequence:
+      //   csrr a0, vlenb
+      //   srli a0, a0, 3
+      //   addi a0, a0, -1
+      //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
+      //   vid.v v9
+      //   vrsub.vx v10, v9, a0
+      //   vrgather.vv v9, v8, v10
+      InstructionCost LenCost = 3;
+      if (LT.second.isFixedLengthVector())
+        // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
+        LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
+      unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
+      if (LT.second.isFixedLengthVector() &&
+          isInt<5>(LT.second.getVectorNumElements() - 1))
+        Opcodes[1] = RISCV::VRSUB_VI;
+      InstructionCost GatherCost =
+          getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+      return LT.first * (LenCost + GatherCost);
+    }
+
+    // At high LMUL, we split into a series of M1 reverses (see
+    // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
+    // the resulting gap at the bottom (for fixed vectors only).  The important
+    // bit is that the cost scales linearly, not quadratically with LMUL.
+    unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
+    InstructionCost FixedCost =
+        getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
+    unsigned Ratio =
+        ContainerVT.getVectorMinNumElements() / M1VT.getVectorMinNumElements();
     InstructionCost GatherCost =
-        getRISCVInstructionCost(Opcodes, LT.second, CostKind);
-    // Mask operation additionally required extend and truncate
-    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
-    return LT.first * (LenCost + GatherCost + ExtendCost);
+        getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
+    InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
+      getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
+    return FixedCost + LT.first * (GatherCost + SlideCost);
   }
   }
   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
index e1bca7161412..437a9af8fcc8 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -78,47 +78,47 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
 
 define void @vector_reverse() {
 ; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'vector_reverse'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
index 8f3219861f2f..d97d70e99ccb 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
@@ -11,10 +11,10 @@
 define void @reverse() {
 ;
 ; CHECK-LABEL: 'reverse'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -22,31 +22,31 @@ define void @reverse() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reverse'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -54,24 +54,24 @@ define void @reverse() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index b026e6868581..ba4c4b6d58ad 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -38,10 +38,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -147,10 +147,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -178,7 +178,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 32
+; CHECK-NEXT:  LV: Loop cost is 24
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.
@@ -447,10 +447,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -556,10 +556,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -587,7 +587,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 34
+; CHECK-NEXT:  LV: Loop cost is 26
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.

From 267b859fc60acda510027bd6139c54d660c6fb21 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Mon, 16 Jun 2025 19:35:34 +0200
Subject: [PATCH 0523/1322] [CIR] Implement folder for VecCmpOp (#143322)

This change adds a folder for the VecCmpOp

Issue https://github.com/llvm/llvm-project/issues/136487
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |   2 +
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |  98 ++++++++
 .../Dialect/Transforms/CIRCanonicalize.cpp    |   2 +-
 clang/test/CIR/Transforms/vector-cmp-fold.cir | 227 ++++++++++++++++++
 4 files changed, 328 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CIR/Transforms/vector-cmp-fold.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index bd36d228578b..8dd1f0ce361d 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2155,6 +2155,8 @@ def VecCmpOp : CIR_Op<"vec.cmp", [Pure, SameTypeOperands]> {
     `(` $kind `,` $lhs `,` $rhs `)` `:` qualified(type($lhs)) `,`
     qualified(type($result)) attr-dict
   }];
+
+  let hasFolder = 1;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 5578d4f5825a..3fcb0213b219 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1589,6 +1589,104 @@ OpFoldResult cir::VecExtractOp::fold(FoldAdaptor adaptor) {
   return elements[index];
 }
 
+//===----------------------------------------------------------------------===//
+// VecCmpOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult cir::VecCmpOp::fold(FoldAdaptor adaptor) {
+  auto lhsVecAttr =
+      mlir::dyn_cast_if_present<cir::ConstVectorAttr>(adaptor.getLhs());
+  auto rhsVecAttr =
+      mlir::dyn_cast_if_present<cir::ConstVectorAttr>(adaptor.getRhs());
+  if (!lhsVecAttr || !rhsVecAttr)
+    return {};
+
+  mlir::Type inputElemTy =
+      mlir::cast<cir::VectorType>(lhsVecAttr.getType()).getElementType();
+  if (!isAnyIntegerOrFloatingPointType(inputElemTy))
+    return {};
+
+  cir::CmpOpKind opKind = adaptor.getKind();
+  mlir::ArrayAttr lhsVecElhs = lhsVecAttr.getElts();
+  mlir::ArrayAttr rhsVecElhs = rhsVecAttr.getElts();
+  uint64_t vecSize = lhsVecElhs.size();
+
+  SmallVector<mlir::Attribute, 16> elements(vecSize);
+  bool isIntAttr = vecSize && mlir::isa<cir::IntAttr>(lhsVecElhs[0]);
+  for (uint64_t i = 0; i < vecSize; i++) {
+    mlir::Attribute lhsAttr = lhsVecElhs[i];
+    mlir::Attribute rhsAttr = rhsVecElhs[i];
+    int cmpResult = 0;
+    switch (opKind) {
+    case cir::CmpOpKind::lt: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() <
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() <
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::le: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() <=
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() <=
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::gt: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() >
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() >
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::ge: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() >=
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() >=
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::eq: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() ==
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() ==
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::ne: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() !=
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() !=
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    }
+
+    elements[i] = cir::IntAttr::get(getType().getElementType(), cmpResult);
+  }
+
+  return cir::ConstVectorAttr::get(
+      getType(), mlir::ArrayAttr::get(getContext(), elements));
+}
+
 //===----------------------------------------------------------------------===//
 // VecShuffleOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
index 20c634d6c66f..f07e234e5e84 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
@@ -141,7 +141,7 @@ void CIRCanonicalizePass::runOnOperation() {
     // Many operations are here to perform a manual `fold` in
     // applyOpPatternsGreedily.
     if (isa<BrOp, BrCondOp, CastOp, ScopeOp, SwitchOp, SelectOp, UnaryOp,
-            ComplexCreateOp, VecCreateOp, VecExtractOp, VecShuffleOp,
+            ComplexCreateOp, VecCmpOp, VecCreateOp, VecExtractOp, VecShuffleOp,
             VecShuffleDynamicOp, VecTernaryOp>(op))
       ops.push_back(op);
   });
diff --git a/clang/test/CIR/Transforms/vector-cmp-fold.cir b/clang/test/CIR/Transforms/vector-cmp-fold.cir
new file mode 100644
index 000000000000..b207fc08748e
--- /dev/null
+++ b/clang/test/CIR/Transforms/vector-cmp-fold.cir
@@ -0,0 +1,227 @@
+// RUN: cir-opt %s -cir-canonicalize -o - -split-input-file | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(eq, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(ne, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(lt, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(le, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(gt, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(gt, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(eq, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(ne, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(lt, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(le, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(gt, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(ge, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}

From 4cd3e41bce449a10f431a3112b6cb8d7bc1b09cf Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Mon, 16 Jun 2025 11:03:21 -0700
Subject: [PATCH 0524/1322] [libc] Removed public function calls in table.h
 (#144168)

Removed strcmp, strlen, and memset calls from table.h and replaced them
with internal functions.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 libc/src/__support/HashTable/CMakeLists.txt      |  5 ++---
 libc/src/__support/HashTable/table.h             | 15 ++++++++-------
 libc/test/src/__support/HashTable/table_test.cpp |  4 +++-
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/libc/src/__support/HashTable/CMakeLists.txt b/libc/src/__support/HashTable/CMakeLists.txt
index 3c487e4f2926..a1de0680cc7d 100644
--- a/libc/src/__support/HashTable/CMakeLists.txt
+++ b/libc/src/__support/HashTable/CMakeLists.txt
@@ -32,9 +32,8 @@ add_header_library(
     libc.src.__support.macros.attributes
     libc.src.__support.macros.optimization
     libc.src.__support.memory_size
-    libc.src.string.memset
-    libc.src.string.strcmp
-    libc.src.string.strlen
+    libc.src.string.memory_utils.inline_strcmp
+    libc.src.string.string_utils
 )
 
 add_header_library(
diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h
index 13badb90dbfd..10dd9711afbf 100644
--- a/libc/src/__support/HashTable/table.h
+++ b/libc/src/__support/HashTable/table.h
@@ -18,9 +18,8 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/memory_size.h"
-#include "src/string/memset.h"
-#include "src/string/strcmp.h"
-#include "src/string/strlen.h"
+#include "src/string/memory_utils/inline_strcmp.h"
+#include "src/string/string_utils.h"
 #include <stddef.h>
 #include <stdint.h>
 
@@ -158,7 +157,9 @@ private:
       for (size_t i : masks) {
         size_t index = (pos + i) & entries_mask;
         ENTRY &entry = this->entry(index);
-        if (LIBC_LIKELY(entry.key != nullptr && strcmp(entry.key, key) == 0))
+        auto comp = [](char l, char r) -> int { return l - r; };
+        if (LIBC_LIKELY(entry.key != nullptr &&
+                        inline_strcmp(entry.key, key, comp) == 0))
           return index;
       }
       BitMask available = ctrls.mask_available();
@@ -176,7 +177,7 @@ private:
 
   LIBC_INLINE uint64_t oneshot_hash(const char *key) const {
     LIBC_NAMESPACE::internal::HashState hasher = state;
-    hasher.update(key, strlen(key));
+    hasher.update(key, internal::string_length(key));
     return hasher.finish();
   }
 
@@ -282,8 +283,8 @@ public:
       table->entries_mask = entries - 1u;
       table->available_slots = entries / 8 * 7;
       table->state = HashState{randomness};
-      memset(&table->control(0), 0x80, ctrl_sizes);
-      memset(mem, 0, table->offset_from_entries());
+      __builtin_memset(&table->control(0), 0x80, ctrl_sizes);
+      __builtin_memset(mem, 0, table->offset_from_entries());
     }
     return table;
   }
diff --git a/libc/test/src/__support/HashTable/table_test.cpp b/libc/test/src/__support/HashTable/table_test.cpp
index a579bfabb2d7..ba9849b6b5af 100644
--- a/libc/test/src/__support/HashTable/table_test.cpp
+++ b/libc/test/src/__support/HashTable/table_test.cpp
@@ -108,7 +108,9 @@ TEST(LlvmLibcTableTest, Insertion) {
             static_cast<void *>(keys[CAP].bytes));
 
   for (size_t i = 0; i <= CAP; ++i) {
-    ASSERT_EQ(strcmp(table->find(keys[i].bytes)->key, keys[i].bytes), 0);
+    auto comp = [](char l, char r) -> int { return l - r; };
+    ASSERT_EQ(
+        inline_strcmp(table->find(keys[i].bytes)->key, keys[i].bytes, comp), 0);
   }
   for (size_t i = CAP + 1; i < 256; ++i) {
     ASSERT_EQ(table->find(keys[i].bytes), static_cast<ENTRY *>(nullptr));

From ffc4d87f9b2b57f7020fa5fd0f1d3003370c2d80 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Mon, 16 Jun 2025 11:03:48 -0700
Subject: [PATCH 0525/1322] [llvm] annotate interfaces in Passes for DLL export
 (#143794)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/Passes` library and
other pass-related headers. These annotations currently have no
meaningful impact on the LLVM build; however, they are a prerequisite to
support an LLVM Windows DLL (shared library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

The bulk of these changes were generated automatically using the
[Interface Definition Scanner (IDS)](https://github.com/compnerd/ids)
tool, followed formatting with `git clang-format`.

The following manual adjustments were also applied after running IDS on
Linux:
- Remove the redundant declaration of the `initializeKCFIPass` function
from llvm/include/llvm/InitializePasses.h because IDS only
auto-annotates the first declaration it encounters, and the second
un-annotated declaration results in an MSVC warning
- Add `LLVM_ABI` to a number of private `AnalysisKey` fields in classes
that extend the `AnalysisInfoMixin` template class.
- Add `LLVM_ABI` to the `ChangeReporter` and `TextChangeReporter`
template class definitions in
llvm/include/llvm/Passes/StandardInstrumentations.h and remove the
extern template instantiations. This is the only way I've found to get
everything compiling warning-free when building a DLL because both
template classes have methods implemented out-of-line.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/InitializePasses.h          | 589 +++++++++---------
 llvm/include/llvm/Pass.h                      |  13 +-
 llvm/include/llvm/PassAnalysisSupport.h       |  16 +-
 llvm/include/llvm/PassRegistry.h              |  17 +-
 llvm/include/llvm/PassSupport.h               |   3 +-
 llvm/include/llvm/Passes/OptimizationLevel.h  |  13 +-
 llvm/include/llvm/Passes/PassBuilder.h        | 191 +++---
 llvm/include/llvm/Passes/PassPlugin.h         |   2 +-
 .../llvm/Passes/StandardInstrumentations.h    |  68 +-
 9 files changed, 469 insertions(+), 443 deletions(-)

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 42610d505c2b..1b5b1d588882 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -14,318 +14,331 @@
 #ifndef LLVM_INITIALIZEPASSES_H
 #define LLVM_INITIALIZEPASSES_H
 
+#include "llvm/Support/Compiler.h"
+
 namespace llvm {
 
 class PassRegistry;
 
 /// Initialize all passes linked into the Core library.
-void initializeCore(PassRegistry &);
+LLVM_ABI void initializeCore(PassRegistry &);
 
 /// Initialize all passes linked into the TransformUtils library.
-void initializeTransformUtils(PassRegistry &);
+LLVM_ABI void initializeTransformUtils(PassRegistry &);
 
 /// Initialize all passes linked into the ScalarOpts library.
-void initializeScalarOpts(PassRegistry &);
+LLVM_ABI void initializeScalarOpts(PassRegistry &);
 
 /// Initialize all passes linked into the Vectorize library.
-void initializeVectorization(PassRegistry &);
+LLVM_ABI void initializeVectorization(PassRegistry &);
 
 /// Initialize all passes linked into the InstCombine library.
-void initializeInstCombine(PassRegistry &);
+LLVM_ABI void initializeInstCombine(PassRegistry &);
 
 /// Initialize all passes linked into the IPO library.
-void initializeIPO(PassRegistry &);
+LLVM_ABI void initializeIPO(PassRegistry &);
 
 /// Initialize all passes linked into the Analysis library.
-void initializeAnalysis(PassRegistry &);
+LLVM_ABI void initializeAnalysis(PassRegistry &);
 
 /// Initialize all passes linked into the CodeGen library.
-void initializeCodeGen(PassRegistry &);
+LLVM_ABI void initializeCodeGen(PassRegistry &);
 
 /// Initialize all passes linked into the GlobalISel library.
-void initializeGlobalISel(PassRegistry &);
+LLVM_ABI void initializeGlobalISel(PassRegistry &);
 
 /// Initialize all passes linked into the CodeGen library.
-void initializeTarget(PassRegistry &);
+LLVM_ABI void initializeTarget(PassRegistry &);
 
-void initializeAAResultsWrapperPassPass(PassRegistry &);
-void initializeAlwaysInlinerLegacyPassPass(PassRegistry &);
-void initializeAssignmentTrackingAnalysisPass(PassRegistry &);
-void initializeAssumptionCacheTrackerPass(PassRegistry &);
-void initializeAtomicExpandLegacyPass(PassRegistry &);
-void initializeBasicBlockPathCloningPass(PassRegistry &);
-void initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &);
-void initializeBasicBlockSectionsPass(PassRegistry &);
-void initializeBarrierNoopPass(PassRegistry &);
-void initializeBasicAAWrapperPassPass(PassRegistry &);
-void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry &);
-void initializeBranchFolderLegacyPass(PassRegistry &);
-void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry &);
-void initializeBranchRelaxationLegacyPass(PassRegistry &);
-void initializeBreakCriticalEdgesPass(PassRegistry &);
-void initializeBreakFalseDepsPass(PassRegistry &);
-void initializeCanonicalizeFreezeInLoopsPass(PassRegistry &);
-void initializeCFGSimplifyPassPass(PassRegistry &);
-void initializeCFGuardPass(PassRegistry &);
-void initializeCFGuardLongjmpPass(PassRegistry &);
-void initializeCFIFixupPass(PassRegistry &);
-void initializeCFIInstrInserterPass(PassRegistry &);
-void initializeCallBrPreparePass(PassRegistry &);
-void initializeCallGraphDOTPrinterPass(PassRegistry &);
-void initializeCallGraphViewerPass(PassRegistry &);
-void initializeCallGraphWrapperPassPass(PassRegistry &);
-void initializeCheckDebugMachineModulePass(PassRegistry &);
-void initializeCodeGenPrepareLegacyPassPass(PassRegistry &);
-void initializeComplexDeinterleavingLegacyPassPass(PassRegistry &);
-void initializeConstantHoistingLegacyPassPass(PassRegistry &);
-void initializeCycleInfoWrapperPassPass(PassRegistry &);
-void initializeDAEPass(PassRegistry &);
-void initializeDAHPass(PassRegistry &);
-void initializeDCELegacyPassPass(PassRegistry &);
-void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &);
-void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &);
-void initializeDXILResourceBindingWrapperPassPass(PassRegistry &);
-void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &);
-void initializeDXILResourceTypeWrapperPassPass(PassRegistry &);
-void initializeDXILResourceWrapperPassPass(PassRegistry &);
-void initializeDeadMachineInstructionElimPass(PassRegistry &);
-void initializeDebugifyMachineModulePass(PassRegistry &);
-void initializeDependenceAnalysisWrapperPassPass(PassRegistry &);
-void initializeDetectDeadLanesLegacyPass(PassRegistry &);
-void initializeDomOnlyPrinterWrapperPassPass(PassRegistry &);
-void initializeDomOnlyViewerWrapperPassPass(PassRegistry &);
-void initializeDomPrinterWrapperPassPass(PassRegistry &);
-void initializeDomViewerWrapperPassPass(PassRegistry &);
-void initializeDominanceFrontierWrapperPassPass(PassRegistry &);
-void initializeDominatorTreeWrapperPassPass(PassRegistry &);
-void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &);
-void initializeEarlyCSELegacyPassPass(PassRegistry &);
-void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &);
-void initializeEarlyIfConverterLegacyPass(PassRegistry &);
-void initializeEarlyIfPredicatorPass(PassRegistry &);
-void initializeEarlyMachineLICMPass(PassRegistry &);
-void initializeEarlyTailDuplicateLegacyPass(PassRegistry &);
-void initializeEdgeBundlesWrapperLegacyPass(PassRegistry &);
-void initializeEHContGuardTargetsPass(PassRegistry &);
-void initializeExpandFpLegacyPassPass(PassRegistry &);
-void initializeExpandLargeDivRemLegacyPassPass(PassRegistry &);
-void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
-void initializeExpandPostRALegacyPass(PassRegistry &);
-void initializeExpandReductionsPass(PassRegistry &);
-void initializeExpandVariadicsPass(PassRegistry &);
-void initializeExternalAAWrapperPassPass(PassRegistry &);
-void initializeFEntryInserterLegacyPass(PassRegistry &);
-void initializeFinalizeISelPass(PassRegistry &);
-void initializeFinalizeMachineBundlesPass(PassRegistry &);
-void initializeFixIrreduciblePass(PassRegistry &);
-void initializeFixupStatepointCallerSavedLegacyPass(PassRegistry &);
-void initializeFlattenCFGLegacyPassPass(PassRegistry &);
-void initializeFuncletLayoutPass(PassRegistry &);
-void initializeGCEmptyBasicBlocksPass(PassRegistry &);
-void initializeGCMachineCodeAnalysisPass(PassRegistry &);
-void initializeGCModuleInfoPass(PassRegistry &);
-void initializeGVNLegacyPassPass(PassRegistry &);
-void initializeGlobalMergeFuncPassWrapperPass(PassRegistry &);
-void initializeGlobalMergePass(PassRegistry &);
-void initializeGlobalsAAWrapperPassPass(PassRegistry &);
-void initializeHardwareLoopsLegacyPass(PassRegistry &);
-void initializeMIRProfileLoaderPassPass(PassRegistry &);
-void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry &);
-void initializeIRTranslatorPass(PassRegistry &);
-void initializeIVUsersWrapperPassPass(PassRegistry &);
-void initializeIfConverterPass(PassRegistry &);
-void initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &);
-void initializeImplicitNullChecksPass(PassRegistry &);
-void initializeIndirectBrExpandLegacyPassPass(PassRegistry &);
-void initializeInferAddressSpacesPass(PassRegistry &);
-void initializeInstSimplifyLegacyPassPass(PassRegistry &);
-void initializeInstructionCombiningPassPass(PassRegistry &);
-void initializeInstructionSelectPass(PassRegistry &);
-void initializeInterleavedAccessPass(PassRegistry &);
-void initializeInterleavedLoadCombinePass(PassRegistry &);
-void initializeJMCInstrumenterPass(PassRegistry &);
-void initializeKCFIPass(PassRegistry &);
-void initializeLCSSAVerificationPassPass(PassRegistry &);
-void initializeLCSSAWrapperPassPass(PassRegistry &);
-void initializeLazyBFIPassPass(PassRegistry &);
-void initializeLazyBlockFrequencyInfoPassPass(PassRegistry &);
-void initializeLazyBranchProbabilityInfoPassPass(PassRegistry &);
-void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry &);
-void initializeLazyValueInfoWrapperPassPass(PassRegistry &);
-void initializeLegacyLICMPassPass(PassRegistry &);
-void initializeLegalizerPass(PassRegistry &);
-void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &);
-void initializeGISelValueTrackingAnalysisLegacyPass(PassRegistry &);
-void initializeLiveDebugValuesLegacyPass(PassRegistry &);
-void initializeLiveDebugVariablesWrapperLegacyPass(PassRegistry &);
-void initializeLiveIntervalsWrapperPassPass(PassRegistry &);
-void initializeLiveRangeShrinkPass(PassRegistry &);
-void initializeLiveRegMatrixWrapperLegacyPass(PassRegistry &);
-void initializeLiveStacksWrapperLegacyPass(PassRegistry &);
-void initializeLiveVariablesWrapperPassPass(PassRegistry &);
-void initializeLoadStoreOptPass(PassRegistry &);
-void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &);
-void initializeLocalStackSlotPassPass(PassRegistry &);
-void initializeLocalizerPass(PassRegistry &);
-void initializeLoopDataPrefetchLegacyPassPass(PassRegistry &);
-void initializeLoopExtractorLegacyPassPass(PassRegistry &);
-void initializeLoopInfoWrapperPassPass(PassRegistry &);
-void initializeLoopPassPass(PassRegistry &);
-void initializeLoopSimplifyPass(PassRegistry &);
-void initializeLoopStrengthReducePass(PassRegistry &);
-void initializeLoopTermFoldPass(PassRegistry &);
-void initializeLoopUnrollPass(PassRegistry &);
-void initializeLowerAtomicLegacyPassPass(PassRegistry &);
-void initializeLowerEmuTLSPass(PassRegistry &);
-void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &);
-void initializeLowerIntrinsicsPass(PassRegistry &);
-void initializeLowerInvokeLegacyPassPass(PassRegistry &);
-void initializeLowerSwitchLegacyPassPass(PassRegistry &);
-void initializeKCFIPass(PassRegistry &);
-void initializeMIRAddFSDiscriminatorsPass(PassRegistry &);
-void initializeMIRCanonicalizerPass(PassRegistry &);
-void initializeMIRNamerPass(PassRegistry &);
-void initializeMIRPrintingPassPass(PassRegistry &);
-void initializeMachineBlockFrequencyInfoWrapperPassPass(PassRegistry &);
-void initializeMachineBlockPlacementLegacyPass(PassRegistry &);
-void initializeMachineBlockPlacementStatsLegacyPass(PassRegistry &);
-void initializeMachineBranchProbabilityInfoWrapperPassPass(PassRegistry &);
-void initializeMachineCFGPrinterPass(PassRegistry &);
-void initializeMachineCSELegacyPass(PassRegistry &);
-void initializeMachineCombinerPass(PassRegistry &);
-void initializeMachineCopyPropagationLegacyPass(PassRegistry &);
-void initializeMachineCycleInfoPrinterLegacyPass(PassRegistry &);
-void initializeMachineCycleInfoWrapperPassPass(PassRegistry &);
-void initializeMachineDominanceFrontierPass(PassRegistry &);
-void initializeMachineDominatorTreeWrapperPassPass(PassRegistry &);
-void initializeMachineFunctionPrinterPassPass(PassRegistry &);
-void initializeMachineFunctionSplitterPass(PassRegistry &);
-void initializeMachineLateInstrsCleanupLegacyPass(PassRegistry &);
-void initializeMachineLICMPass(PassRegistry &);
-void initializeMachineLoopInfoWrapperPassPass(PassRegistry &);
-void initializeMachineModuleInfoWrapperPassPass(PassRegistry &);
-void initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry &);
-void initializeMachineOutlinerPass(PassRegistry &);
-void initializeStaticDataProfileInfoWrapperPassPass(PassRegistry &);
-void initializeStaticDataAnnotatorPass(PassRegistry &);
-void initializeMachinePipelinerPass(PassRegistry &);
-void initializeMachinePostDominatorTreeWrapperPassPass(PassRegistry &);
-void initializeMachineRegionInfoPassPass(PassRegistry &);
-void initializeMachineSanitizerBinaryMetadataLegacyPass(PassRegistry &);
-void initializeMachineSchedulerLegacyPass(PassRegistry &);
-void initializeMachineSinkingLegacyPass(PassRegistry &);
-void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &);
-void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &);
-void initializeMachineUniformityAnalysisPassPass(PassRegistry &);
-void initializeMachineVerifierLegacyPassPass(PassRegistry &);
-void initializeMemoryDependenceWrapperPassPass(PassRegistry &);
-void initializeMemorySSAWrapperPassPass(PassRegistry &);
-void initializeMergeICmpsLegacyPassPass(PassRegistry &);
-void initializeModuleSummaryIndexWrapperPassPass(PassRegistry &);
-void initializeModuloScheduleTestPass(PassRegistry &);
-void initializeNaryReassociateLegacyPassPass(PassRegistry &);
-void initializeObjCARCContractLegacyPassPass(PassRegistry &);
-void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry &);
-void initializeOptimizePHIsLegacyPass(PassRegistry &);
-void initializePEILegacyPass(PassRegistry &);
-void initializePHIEliminationPass(PassRegistry &);
-void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &);
-void initializePatchableFunctionLegacyPass(PassRegistry &);
-void initializePeepholeOptimizerLegacyPass(PassRegistry &);
-void initializePhiValuesWrapperPassPass(PassRegistry &);
-void initializePhysicalRegisterUsageInfoWrapperLegacyPass(PassRegistry &);
-void initializePlaceBackedgeSafepointsLegacyPassPass(PassRegistry &);
-void initializePostDomOnlyPrinterWrapperPassPass(PassRegistry &);
-void initializePostDomOnlyViewerWrapperPassPass(PassRegistry &);
-void initializePostDomPrinterWrapperPassPass(PassRegistry &);
-void initializePostDomViewerWrapperPassPass(PassRegistry &);
-void initializePostDominatorTreeWrapperPassPass(PassRegistry &);
-void initializePostInlineEntryExitInstrumenterPass(PassRegistry &);
-void initializePostMachineSchedulerLegacyPass(PassRegistry &);
-void initializePostRAHazardRecognizerLegacyPass(PassRegistry &);
-void initializePostRAMachineSinkingPass(PassRegistry &);
-void initializePostRASchedulerLegacyPass(PassRegistry &);
-void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry &);
-void initializePrintFunctionPassWrapperPass(PassRegistry &);
-void initializePrintModulePassWrapperPass(PassRegistry &);
-void initializeProcessImplicitDefsPass(PassRegistry &);
-void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &);
-void initializePromoteLegacyPassPass(PassRegistry &);
-void initializeRABasicPass(PassRegistry &);
-void initializePseudoProbeInserterPass(PassRegistry &);
-void initializeRAGreedyLegacyPass(PassRegistry &);
-void initializeReachingDefAnalysisPass(PassRegistry &);
-void initializeReassociateLegacyPassPass(PassRegistry &);
-void initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &);
-void initializeRegAllocFastPass(PassRegistry &);
-void initializeRegAllocPriorityAdvisorAnalysisLegacyPass(PassRegistry &);
-void initializeRegAllocScoringPass(PassRegistry &);
-void initializeRegBankSelectPass(PassRegistry &);
-void initializeRegToMemWrapperPassPass(PassRegistry &);
-void initializeRegUsageInfoCollectorLegacyPass(PassRegistry &);
-void initializeRegUsageInfoPropagationLegacyPass(PassRegistry &);
-void initializeRegionInfoPassPass(PassRegistry &);
-void initializeRegionOnlyPrinterPass(PassRegistry &);
-void initializeRegionOnlyViewerPass(PassRegistry &);
-void initializeRegionPrinterPass(PassRegistry &);
-void initializeRegionViewerPass(PassRegistry &);
-void initializeRegisterCoalescerLegacyPass(PassRegistry &);
-void initializeRemoveLoadsIntoFakeUsesLegacyPass(PassRegistry &);
-void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &);
-void initializeRenameIndependentSubregsLegacyPass(PassRegistry &);
-void initializeReplaceWithVeclibLegacyPass(PassRegistry &);
-void initializeResetMachineFunctionPass(PassRegistry &);
-void initializeSCEVAAWrapperPassPass(PassRegistry &);
-void initializeSROALegacyPassPass(PassRegistry &);
-void initializeSafeStackLegacyPassPass(PassRegistry &);
-void initializeSafepointIRVerifierPass(PassRegistry &);
-void initializeSelectOptimizePass(PassRegistry &);
-void initializeScalarEvolutionWrapperPassPass(PassRegistry &);
-void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &);
-void initializeScalarizerLegacyPassPass(PassRegistry &);
-void initializeScavengerTestPass(PassRegistry &);
-void initializeScopedNoAliasAAWrapperPassPass(PassRegistry &);
-void initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &);
-void initializeShadowStackGCLoweringPass(PassRegistry &);
-void initializeShrinkWrapLegacyPass(PassRegistry &);
-void initializeSingleLoopExtractorPass(PassRegistry &);
-void initializeSinkingLegacyPassPass(PassRegistry &);
-void initializeSjLjEHPreparePass(PassRegistry &);
-void initializeSlotIndexesWrapperPassPass(PassRegistry &);
-void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &);
-void initializeSpillPlacementWrapperLegacyPass(PassRegistry &);
-void initializeStackColoringLegacyPass(PassRegistry &);
-void initializeStackFrameLayoutAnalysisLegacyPass(PassRegistry &);
-void initializeStaticDataSplitterPass(PassRegistry &);
-void initializeStackMapLivenessPass(PassRegistry &);
-void initializeStackProtectorPass(PassRegistry &);
-void initializeStackSafetyGlobalInfoWrapperPassPass(PassRegistry &);
-void initializeStackSafetyInfoWrapperPassPass(PassRegistry &);
-void initializeStackSlotColoringLegacyPass(PassRegistry &);
-void initializeStraightLineStrengthReduceLegacyPassPass(PassRegistry &);
-void initializeStripDebugMachineModulePass(PassRegistry &);
-void initializeStructurizeCFGLegacyPassPass(PassRegistry &);
-void initializeTailCallElimPass(PassRegistry &);
-void initializeTailDuplicateLegacyPass(PassRegistry &);
-void initializeTargetLibraryInfoWrapperPassPass(PassRegistry &);
-void initializeTargetPassConfigPass(PassRegistry &);
-void initializeTargetTransformInfoWrapperPassPass(PassRegistry &);
-void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &);
-void initializeTypeBasedAAWrapperPassPass(PassRegistry &);
-void initializeTypePromotionLegacyPass(PassRegistry &);
-void initializeInitUndefPass(PassRegistry &);
-void initializeUniformityInfoWrapperPassPass(PassRegistry &);
-void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &);
-void initializeUnpackMachineBundlesPass(PassRegistry &);
-void initializeUnreachableBlockElimLegacyPassPass(PassRegistry &);
-void initializeUnreachableMachineBlockElimLegacyPass(PassRegistry &);
-void initializeVerifierLegacyPassPass(PassRegistry &);
-void initializeVirtRegMapWrapperLegacyPass(PassRegistry &);
-void initializeVirtRegRewriterLegacyPass(PassRegistry &);
-void initializeWasmEHPreparePass(PassRegistry &);
-void initializeWinEHPreparePass(PassRegistry &);
-void initializeWriteBitcodePassPass(PassRegistry &);
-void initializeXRayInstrumentationLegacyPass(PassRegistry &);
+LLVM_ABI void initializeAAResultsWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeAlwaysInlinerLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeAssignmentTrackingAnalysisPass(PassRegistry &);
+LLVM_ABI void initializeAssumptionCacheTrackerPass(PassRegistry &);
+LLVM_ABI void initializeAtomicExpandLegacyPass(PassRegistry &);
+LLVM_ABI void initializeBasicBlockPathCloningPass(PassRegistry &);
+LLVM_ABI void
+initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeBasicBlockSectionsPass(PassRegistry &);
+LLVM_ABI void initializeBarrierNoopPass(PassRegistry &);
+LLVM_ABI void initializeBasicAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeBranchFolderLegacyPass(PassRegistry &);
+LLVM_ABI void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeBranchRelaxationLegacyPass(PassRegistry &);
+LLVM_ABI void initializeBreakCriticalEdgesPass(PassRegistry &);
+LLVM_ABI void initializeBreakFalseDepsPass(PassRegistry &);
+LLVM_ABI void initializeCanonicalizeFreezeInLoopsPass(PassRegistry &);
+LLVM_ABI void initializeCFGSimplifyPassPass(PassRegistry &);
+LLVM_ABI void initializeCFGuardPass(PassRegistry &);
+LLVM_ABI void initializeCFGuardLongjmpPass(PassRegistry &);
+LLVM_ABI void initializeCFIFixupPass(PassRegistry &);
+LLVM_ABI void initializeCFIInstrInserterPass(PassRegistry &);
+LLVM_ABI void initializeCallBrPreparePass(PassRegistry &);
+LLVM_ABI void initializeCallGraphDOTPrinterPass(PassRegistry &);
+LLVM_ABI void initializeCallGraphViewerPass(PassRegistry &);
+LLVM_ABI void initializeCallGraphWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeCheckDebugMachineModulePass(PassRegistry &);
+LLVM_ABI void initializeCodeGenPrepareLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeComplexDeinterleavingLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeConstantHoistingLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeCycleInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDAEPass(PassRegistry &);
+LLVM_ABI void initializeDAHPass(PassRegistry &);
+LLVM_ABI void initializeDCELegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &);
+LLVM_ABI void initializeDXILResourceBindingWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &);
+LLVM_ABI void initializeDXILResourceTypeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDXILResourceWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDeadMachineInstructionElimPass(PassRegistry &);
+LLVM_ABI void initializeDebugifyMachineModulePass(PassRegistry &);
+LLVM_ABI void initializeDependenceAnalysisWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDetectDeadLanesLegacyPass(PassRegistry &);
+LLVM_ABI void initializeDomOnlyPrinterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDomOnlyViewerWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDomPrinterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDomViewerWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDominanceFrontierWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDominatorTreeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeEarlyCSELegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeEarlyIfConverterLegacyPass(PassRegistry &);
+LLVM_ABI void initializeEarlyIfPredicatorPass(PassRegistry &);
+LLVM_ABI void initializeEarlyMachineLICMPass(PassRegistry &);
+LLVM_ABI void initializeEarlyTailDuplicateLegacyPass(PassRegistry &);
+LLVM_ABI void initializeEdgeBundlesWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeEHContGuardTargetsPass(PassRegistry &);
+LLVM_ABI void initializeExpandFpLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeExpandLargeDivRemLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeExpandPostRALegacyPass(PassRegistry &);
+LLVM_ABI void initializeExpandReductionsPass(PassRegistry &);
+LLVM_ABI void initializeExpandVariadicsPass(PassRegistry &);
+LLVM_ABI void initializeExternalAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeFEntryInserterLegacyPass(PassRegistry &);
+LLVM_ABI void initializeFinalizeISelPass(PassRegistry &);
+LLVM_ABI void initializeFinalizeMachineBundlesPass(PassRegistry &);
+LLVM_ABI void initializeFixIrreduciblePass(PassRegistry &);
+LLVM_ABI void initializeFixupStatepointCallerSavedLegacyPass(PassRegistry &);
+LLVM_ABI void initializeFlattenCFGLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeFuncletLayoutPass(PassRegistry &);
+LLVM_ABI void initializeGCEmptyBasicBlocksPass(PassRegistry &);
+LLVM_ABI void initializeGCMachineCodeAnalysisPass(PassRegistry &);
+LLVM_ABI void initializeGCModuleInfoPass(PassRegistry &);
+LLVM_ABI void initializeGVNLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeGlobalMergeFuncPassWrapperPass(PassRegistry &);
+LLVM_ABI void initializeGlobalMergePass(PassRegistry &);
+LLVM_ABI void initializeGlobalsAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeHardwareLoopsLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMIRProfileLoaderPassPass(PassRegistry &);
+LLVM_ABI void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeIRTranslatorPass(PassRegistry &);
+LLVM_ABI void initializeIVUsersWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeIfConverterPass(PassRegistry &);
+LLVM_ABI void
+initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeImplicitNullChecksPass(PassRegistry &);
+LLVM_ABI void initializeIndirectBrExpandLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeInferAddressSpacesPass(PassRegistry &);
+LLVM_ABI void initializeInstSimplifyLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeInstructionCombiningPassPass(PassRegistry &);
+LLVM_ABI void initializeInstructionSelectPass(PassRegistry &);
+LLVM_ABI void initializeInterleavedAccessPass(PassRegistry &);
+LLVM_ABI void initializeInterleavedLoadCombinePass(PassRegistry &);
+LLVM_ABI void initializeJMCInstrumenterPass(PassRegistry &);
+LLVM_ABI void initializeKCFIPass(PassRegistry &);
+LLVM_ABI void initializeLCSSAVerificationPassPass(PassRegistry &);
+LLVM_ABI void initializeLCSSAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyBFIPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyBlockFrequencyInfoPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyBranchProbabilityInfoPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyValueInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLegacyLICMPassPass(PassRegistry &);
+LLVM_ABI void initializeLegalizerPass(PassRegistry &);
+LLVM_ABI void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeGISelValueTrackingAnalysisLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveDebugValuesLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveDebugVariablesWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveIntervalsWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLiveRangeShrinkPass(PassRegistry &);
+LLVM_ABI void initializeLiveRegMatrixWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveStacksWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveVariablesWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLoadStoreOptPass(PassRegistry &);
+LLVM_ABI void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLocalStackSlotPassPass(PassRegistry &);
+LLVM_ABI void initializeLocalizerPass(PassRegistry &);
+LLVM_ABI void initializeLoopDataPrefetchLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLoopExtractorLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLoopInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLoopPassPass(PassRegistry &);
+LLVM_ABI void initializeLoopSimplifyPass(PassRegistry &);
+LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &);
+LLVM_ABI void initializeLoopTermFoldPass(PassRegistry &);
+LLVM_ABI void initializeLoopUnrollPass(PassRegistry &);
+LLVM_ABI void initializeLowerAtomicLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLowerEmuTLSPass(PassRegistry &);
+LLVM_ABI void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLowerIntrinsicsPass(PassRegistry &);
+LLVM_ABI void initializeLowerInvokeLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLowerSwitchLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeMIRAddFSDiscriminatorsPass(PassRegistry &);
+LLVM_ABI void initializeMIRCanonicalizerPass(PassRegistry &);
+LLVM_ABI void initializeMIRNamerPass(PassRegistry &);
+LLVM_ABI void initializeMIRPrintingPassPass(PassRegistry &);
+LLVM_ABI void
+initializeMachineBlockFrequencyInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineBlockPlacementLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineBlockPlacementStatsLegacyPass(PassRegistry &);
+LLVM_ABI void
+initializeMachineBranchProbabilityInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineCFGPrinterPass(PassRegistry &);
+LLVM_ABI void initializeMachineCSELegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineCombinerPass(PassRegistry &);
+LLVM_ABI void initializeMachineCopyPropagationLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineCycleInfoPrinterLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineCycleInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineDominanceFrontierPass(PassRegistry &);
+LLVM_ABI void initializeMachineDominatorTreeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineFunctionPrinterPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineFunctionSplitterPass(PassRegistry &);
+LLVM_ABI void initializeMachineLateInstrsCleanupLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineLICMPass(PassRegistry &);
+LLVM_ABI void initializeMachineLoopInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineModuleInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void
+initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineOutlinerPass(PassRegistry &);
+LLVM_ABI void initializeStaticDataProfileInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeStaticDataAnnotatorPass(PassRegistry &);
+LLVM_ABI void initializeMachinePipelinerPass(PassRegistry &);
+LLVM_ABI void initializeMachinePostDominatorTreeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineRegionInfoPassPass(PassRegistry &);
+LLVM_ABI void
+initializeMachineSanitizerBinaryMetadataLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineSchedulerLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineSinkingLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineUniformityAnalysisPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineVerifierLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeMemoryDependenceWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMemorySSAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMergeICmpsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeModuleSummaryIndexWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeModuloScheduleTestPass(PassRegistry &);
+LLVM_ABI void initializeNaryReassociateLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeObjCARCContractLegacyPassPass(PassRegistry &);
+LLVM_ABI void
+initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeOptimizePHIsLegacyPass(PassRegistry &);
+LLVM_ABI void initializePEILegacyPass(PassRegistry &);
+LLVM_ABI void initializePHIEliminationPass(PassRegistry &);
+LLVM_ABI void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializePatchableFunctionLegacyPass(PassRegistry &);
+LLVM_ABI void initializePeepholeOptimizerLegacyPass(PassRegistry &);
+LLVM_ABI void initializePhiValuesWrapperPassPass(PassRegistry &);
+LLVM_ABI void
+initializePhysicalRegisterUsageInfoWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializePlaceBackedgeSafepointsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializePostDomOnlyPrinterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostDomOnlyViewerWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostDomPrinterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostDomViewerWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostDominatorTreeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostInlineEntryExitInstrumenterPass(PassRegistry &);
+LLVM_ABI void initializePostMachineSchedulerLegacyPass(PassRegistry &);
+LLVM_ABI void initializePostRAHazardRecognizerLegacyPass(PassRegistry &);
+LLVM_ABI void initializePostRAMachineSinkingPass(PassRegistry &);
+LLVM_ABI void initializePostRASchedulerLegacyPass(PassRegistry &);
+LLVM_ABI void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializePrintFunctionPassWrapperPass(PassRegistry &);
+LLVM_ABI void initializePrintModulePassWrapperPass(PassRegistry &);
+LLVM_ABI void initializeProcessImplicitDefsPass(PassRegistry &);
+LLVM_ABI void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePromoteLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeRABasicPass(PassRegistry &);
+LLVM_ABI void initializePseudoProbeInserterPass(PassRegistry &);
+LLVM_ABI void initializeRAGreedyLegacyPass(PassRegistry &);
+LLVM_ABI void initializeReachingDefAnalysisPass(PassRegistry &);
+LLVM_ABI void initializeReassociateLegacyPassPass(PassRegistry &);
+LLVM_ABI void
+initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRegAllocFastPass(PassRegistry &);
+LLVM_ABI void
+initializeRegAllocPriorityAdvisorAnalysisLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRegAllocScoringPass(PassRegistry &);
+LLVM_ABI void initializeRegBankSelectPass(PassRegistry &);
+LLVM_ABI void initializeRegToMemWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeRegUsageInfoCollectorLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRegUsageInfoPropagationLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRegionInfoPassPass(PassRegistry &);
+LLVM_ABI void initializeRegionOnlyPrinterPass(PassRegistry &);
+LLVM_ABI void initializeRegionOnlyViewerPass(PassRegistry &);
+LLVM_ABI void initializeRegionPrinterPass(PassRegistry &);
+LLVM_ABI void initializeRegionViewerPass(PassRegistry &);
+LLVM_ABI void initializeRegisterCoalescerLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRemoveLoadsIntoFakeUsesLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRenameIndependentSubregsLegacyPass(PassRegistry &);
+LLVM_ABI void initializeReplaceWithVeclibLegacyPass(PassRegistry &);
+LLVM_ABI void initializeResetMachineFunctionPass(PassRegistry &);
+LLVM_ABI void initializeSCEVAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeSROALegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeSafeStackLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeSafepointIRVerifierPass(PassRegistry &);
+LLVM_ABI void initializeSelectOptimizePass(PassRegistry &);
+LLVM_ABI void initializeScalarEvolutionWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeScalarizerLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeScavengerTestPass(PassRegistry &);
+LLVM_ABI void initializeScopedNoAliasAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void
+initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeShadowStackGCLoweringPass(PassRegistry &);
+LLVM_ABI void initializeShrinkWrapLegacyPass(PassRegistry &);
+LLVM_ABI void initializeSingleLoopExtractorPass(PassRegistry &);
+LLVM_ABI void initializeSinkingLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeSjLjEHPreparePass(PassRegistry &);
+LLVM_ABI void initializeSlotIndexesWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeSpillPlacementWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeStackColoringLegacyPass(PassRegistry &);
+LLVM_ABI void initializeStackFrameLayoutAnalysisLegacyPass(PassRegistry &);
+LLVM_ABI void initializeStaticDataSplitterPass(PassRegistry &);
+LLVM_ABI void initializeStackMapLivenessPass(PassRegistry &);
+LLVM_ABI void initializeStackProtectorPass(PassRegistry &);
+LLVM_ABI void initializeStackSafetyGlobalInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeStackSafetyInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeStackSlotColoringLegacyPass(PassRegistry &);
+LLVM_ABI void
+initializeStraightLineStrengthReduceLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeStripDebugMachineModulePass(PassRegistry &);
+LLVM_ABI void initializeStructurizeCFGLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeTailCallElimPass(PassRegistry &);
+LLVM_ABI void initializeTailDuplicateLegacyPass(PassRegistry &);
+LLVM_ABI void initializeTargetLibraryInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeTargetPassConfigPass(PassRegistry &);
+LLVM_ABI void initializeTargetTransformInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeTypeBasedAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeTypePromotionLegacyPass(PassRegistry &);
+LLVM_ABI void initializeInitUndefPass(PassRegistry &);
+LLVM_ABI void initializeUniformityInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeUnpackMachineBundlesPass(PassRegistry &);
+LLVM_ABI void initializeUnreachableBlockElimLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeUnreachableMachineBlockElimLegacyPass(PassRegistry &);
+LLVM_ABI void initializeVerifierLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeVirtRegMapWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeVirtRegRewriterLegacyPass(PassRegistry &);
+LLVM_ABI void initializeWasmEHPreparePass(PassRegistry &);
+LLVM_ABI void initializeWinEHPreparePass(PassRegistry &);
+LLVM_ABI void initializeWriteBitcodePassPass(PassRegistry &);
+LLVM_ABI void initializeXRayInstrumentationLegacyPass(PassRegistry &);
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/Pass.h b/llvm/include/llvm/Pass.h
index 921db0b5f7ae..58c45e75b3f0 100644
--- a/llvm/include/llvm/Pass.h
+++ b/llvm/include/llvm/Pass.h
@@ -31,6 +31,7 @@
 #ifdef EXPENSIVE_CHECKS
 #include <cstdint>
 #endif
+#include "llvm/Support/Compiler.h"
 #include <string>
 
 namespace llvm {
@@ -95,7 +96,7 @@ const char *to_string(ThinOrFullLTOPhase Phase);
 /// interprocedural optimization or you do not fit into any of the more
 /// constrained passes described below.
 ///
-class Pass {
+class LLVM_ABI Pass {
   AnalysisResolver *Resolver = nullptr;  // Used to resolve analysis
   const void *PassID;
   PassKind Kind;
@@ -252,7 +253,7 @@ public:
 /// interprocedural optimizations and analyses.  ModulePasses may do anything
 /// they want to the program.
 ///
-class ModulePass : public Pass {
+class LLVM_ABI ModulePass : public Pass {
 public:
   explicit ModulePass(char &pid) : Pass(PT_Module, pid) {}
 
@@ -282,7 +283,7 @@ protected:
 /// ImmutablePass class - This class is used to provide information that does
 /// not need to be run.  This is useful for things like target information.
 ///
-class ImmutablePass : public ModulePass {
+class LLVM_ABI ImmutablePass : public ModulePass {
 public:
   explicit ImmutablePass(char &pid) : ModulePass(pid) {}
 
@@ -311,7 +312,7 @@ public:
 ///  2. Optimizing a function does not cause the addition or removal of any
 ///     functions in the module
 ///
-class FunctionPass : public Pass {
+class LLVM_ABI FunctionPass : public Pass {
 public:
   explicit FunctionPass(char &pid) : Pass(PT_Function, pid) {}
 
@@ -338,13 +339,13 @@ protected:
 /// If the user specifies the -time-passes argument on an LLVM tool command line
 /// then the value of this boolean will be true, otherwise false.
 /// This is the storage for the -time-passes option.
-extern bool TimePassesIsEnabled;
+LLVM_ABI extern bool TimePassesIsEnabled;
 /// If TimePassesPerRun is true, there would be one line of report for
 /// each pass invocation.
 /// If TimePassesPerRun is false, there would be only one line of
 /// report for each pass (even there are more than one pass objects).
 /// (For new pass manager only)
-extern bool TimePassesPerRun;
+LLVM_ABI extern bool TimePassesPerRun;
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/PassAnalysisSupport.h b/llvm/include/llvm/PassAnalysisSupport.h
index 4bed3cb55a90..02abb00b66b5 100644
--- a/llvm/include/llvm/PassAnalysisSupport.h
+++ b/llvm/include/llvm/PassAnalysisSupport.h
@@ -24,6 +24,7 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
 #include <cassert>
 #include <tuple>
 #include <utility>
@@ -69,14 +70,14 @@ public:
 
   ///@{
   /// Add the specified ID to the required set of the usage info for a pass.
-  AnalysisUsage &addRequiredID(const void *ID);
-  AnalysisUsage &addRequiredID(char &ID);
+  LLVM_ABI AnalysisUsage &addRequiredID(const void *ID);
+  LLVM_ABI AnalysisUsage &addRequiredID(char &ID);
   template<class PassClass>
   AnalysisUsage &addRequired() {
     return addRequiredID(PassClass::ID);
   }
 
-  AnalysisUsage &addRequiredTransitiveID(char &ID);
+  LLVM_ABI AnalysisUsage &addRequiredTransitiveID(char &ID);
   template<class PassClass>
   AnalysisUsage &addRequiredTransitive() {
     return addRequiredTransitiveID(PassClass::ID);
@@ -124,7 +125,7 @@ public:
   /// preserved by this pass. If no such Pass exists, do nothing. This can be
   /// useful when a pass is trivially preserved, but may not be linked in. Be
   /// careful about spelling!
-  AnalysisUsage &addPreserved(StringRef Arg);
+  LLVM_ABI AnalysisUsage &addPreserved(StringRef Arg);
 
   /// Set by analyses that do not transform their input at all
   void setPreservesAll() { PreservesAll = true; }
@@ -139,7 +140,7 @@ public:
   ///
   /// This function annotates the AnalysisUsage info object to say that analyses
   /// that only depend on the CFG are preserved by this pass.
-  void setPreservesCFG();
+  LLVM_ABI void setPreservesCFG();
 
   const VectorType &getRequiredSet() const { return Required; }
   const VectorType &getRequiredTransitiveSet() const {
@@ -174,7 +175,8 @@ public:
   }
 
   /// Find pass that is implementing PI. Initialize pass for Function F.
-  std::tuple<Pass *, bool> findImplPass(Pass *P, AnalysisID PI, Function &F);
+  LLVM_ABI std::tuple<Pass *, bool> findImplPass(Pass *P, AnalysisID PI,
+                                                 Function &F);
 
   void addAnalysisImplsPair(AnalysisID PI, Pass *P) {
     if (findImplPass(PI) == P)
@@ -189,7 +191,7 @@ public:
   }
 
   /// Return analysis result or null if it doesn't exist.
-  Pass *getAnalysisIfAvailable(AnalysisID ID) const;
+  LLVM_ABI Pass *getAnalysisIfAvailable(AnalysisID ID) const;
 
 private:
   /// This keeps track of which passes implements the interfaces that are
diff --git a/llvm/include/llvm/PassRegistry.h b/llvm/include/llvm/PassRegistry.h
index 003c0ac4c374..f3dada0c0ba6 100644
--- a/llvm/include/llvm/PassRegistry.h
+++ b/llvm/include/llvm/PassRegistry.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/RWMutex.h"
 #include <memory>
 #include <vector>
@@ -49,36 +50,36 @@ class PassRegistry {
 
 public:
   PassRegistry() = default;
-  ~PassRegistry();
+  LLVM_ABI ~PassRegistry();
 
   /// getPassRegistry - Access the global registry object, which is
   /// automatically initialized at application launch and destroyed by
   /// llvm_shutdown.
-  static PassRegistry *getPassRegistry();
+  LLVM_ABI static PassRegistry *getPassRegistry();
 
   /// getPassInfo - Look up a pass' corresponding PassInfo, indexed by the pass'
   /// type identifier (&MyPass::ID).
-  const PassInfo *getPassInfo(const void *TI) const;
+  LLVM_ABI const PassInfo *getPassInfo(const void *TI) const;
 
   /// getPassInfo - Look up a pass' corresponding PassInfo, indexed by the pass'
   /// argument string.
-  const PassInfo *getPassInfo(StringRef Arg) const;
+  LLVM_ABI const PassInfo *getPassInfo(StringRef Arg) const;
 
   /// registerPass - Register a pass (by means of its PassInfo) with the
   /// registry.  Required in order to use the pass with a PassManager.
-  void registerPass(const PassInfo &PI, bool ShouldFree = false);
+  LLVM_ABI void registerPass(const PassInfo &PI, bool ShouldFree = false);
 
   /// enumerateWith - Enumerate the registered passes, calling the provided
   /// PassRegistrationListener's passEnumerate() callback on each of them.
-  void enumerateWith(PassRegistrationListener *L);
+  LLVM_ABI void enumerateWith(PassRegistrationListener *L);
 
   /// addRegistrationListener - Register the given PassRegistrationListener
   /// to receive passRegistered() callbacks whenever a new pass is registered.
-  void addRegistrationListener(PassRegistrationListener *L);
+  LLVM_ABI void addRegistrationListener(PassRegistrationListener *L);
 
   /// removeRegistrationListener - Unregister a PassRegistrationListener so that
   /// it no longer receives passRegistered() callbacks.
-  void removeRegistrationListener(PassRegistrationListener *L);
+  LLVM_ABI void removeRegistrationListener(PassRegistrationListener *L);
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/PassSupport.h b/llvm/include/llvm/PassSupport.h
index b0897a6be37d..7f0306e33e83 100644
--- a/llvm/include/llvm/PassSupport.h
+++ b/llvm/include/llvm/PassSupport.h
@@ -27,6 +27,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/PassInfo.h"
 #include "llvm/PassRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/Threading.h"
 #include <functional>
@@ -112,7 +113,7 @@ struct PassRegistrationListener {
 
   /// enumeratePasses - Iterate over the registered passes, calling the
   /// passEnumerate callback on each PassInfo object.
-  void enumeratePasses();
+  LLVM_ABI void enumeratePasses();
 
   /// passEnumerate - Callback function invoked when someone calls
   /// enumeratePasses on this PassRegistrationListener object.
diff --git a/llvm/include/llvm/Passes/OptimizationLevel.h b/llvm/include/llvm/Passes/OptimizationLevel.h
index d2c3fde4935f..1cf258f1ffd0 100644
--- a/llvm/include/llvm/Passes/OptimizationLevel.h
+++ b/llvm/include/llvm/Passes/OptimizationLevel.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_PASSES_OPTIMIZATIONLEVEL_H
 #define LLVM_PASSES_OPTIMIZATIONLEVEL_H
 
+#include "llvm/Support/Compiler.h"
 #include <assert.h>
 
 namespace llvm {
@@ -38,7 +39,7 @@ public:
   /// Disable as many optimizations as possible. This doesn't completely
   /// disable the optimizer in all cases, for example always_inline functions
   /// can be required to be inlined for correctness.
-  static const OptimizationLevel O0;
+  LLVM_ABI static const OptimizationLevel O0;
 
   /// Optimize quickly without destroying debuggability.
   ///
@@ -54,7 +55,7 @@ public:
   /// vectorization, or fusion don't make sense here due to the degree to
   /// which the executed code differs from the source code, and the compile
   /// time cost.
-  static const OptimizationLevel O1;
+  LLVM_ABI static const OptimizationLevel O1;
   /// Optimize for fast execution as much as possible without triggering
   /// significant incremental compile time or code size growth.
   ///
@@ -71,7 +72,7 @@ public:
   ///
   /// This is expected to be a good default optimization level for the vast
   /// majority of users.
-  static const OptimizationLevel O2;
+  LLVM_ABI static const OptimizationLevel O2;
   /// Optimize for fast execution as much as possible.
   ///
   /// This mode is significantly more aggressive in trading off compile time
@@ -86,7 +87,7 @@ public:
   /// order to make even significantly slower compile times at least scale
   /// reasonably. This does not preclude very substantial constant factor
   /// costs though.
-  static const OptimizationLevel O3;
+  LLVM_ABI static const OptimizationLevel O3;
   /// Similar to \c O2 but tries to optimize for small code size instead of
   /// fast execution without triggering significant incremental execution
   /// time slowdowns.
@@ -97,7 +98,7 @@ public:
   /// A consequence of the different core goal is that this should in general
   /// produce substantially smaller executables that still run in
   /// a reasonable amount of time.
-  static const OptimizationLevel Os;
+  LLVM_ABI static const OptimizationLevel Os;
   /// A very specialized mode that will optimize for code size at any and all
   /// costs.
   ///
@@ -105,7 +106,7 @@ public:
   /// any effort taken to reduce the size is worth it regardless of the
   /// execution time impact. You should expect this level to produce rather
   /// slow, but very small, code.
-  static const OptimizationLevel Oz;
+  LLVM_ABI static const OptimizationLevel Oz;
 
   bool isOptimizingForSpeed() const { return SizeLevel == 0 && SpeedLevel > 0; }
 
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 51ccaa53447d..f13b5c678a89 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/RegAllocCommon.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/PGOOptions.h"
 #include "llvm/Support/raw_ostream.h"
@@ -44,7 +45,7 @@ class PipelineTuningOptions {
 public:
   /// Constructor sets pipeline tuning defaults based on cl::opts. Each option
   /// can be set in the PassBuilder when using a LLVM as a library.
-  PipelineTuningOptions();
+  LLVM_ABI PipelineTuningOptions();
 
   /// Tuning option to set loop interleaving on/off, set based on opt level.
   bool LoopInterleaving;
@@ -126,20 +127,20 @@ public:
     std::vector<PipelineElement> InnerPipeline;
   };
 
-  explicit PassBuilder(TargetMachine *TM = nullptr,
-                       PipelineTuningOptions PTO = PipelineTuningOptions(),
-                       std::optional<PGOOptions> PGOOpt = std::nullopt,
-                       PassInstrumentationCallbacks *PIC = nullptr);
+  LLVM_ABI explicit PassBuilder(
+      TargetMachine *TM = nullptr,
+      PipelineTuningOptions PTO = PipelineTuningOptions(),
+      std::optional<PGOOptions> PGOOpt = std::nullopt,
+      PassInstrumentationCallbacks *PIC = nullptr);
 
   /// Cross register the analysis managers through their proxies.
   ///
   /// This is an interface that can be used to cross register each
   /// AnalysisManager with all the others analysis managers.
-  void crossRegisterProxies(LoopAnalysisManager &LAM,
-                            FunctionAnalysisManager &FAM,
-                            CGSCCAnalysisManager &CGAM,
-                            ModuleAnalysisManager &MAM,
-                            MachineFunctionAnalysisManager *MFAM = nullptr);
+  LLVM_ABI void
+  crossRegisterProxies(LoopAnalysisManager &LAM, FunctionAnalysisManager &FAM,
+                       CGSCCAnalysisManager &CGAM, ModuleAnalysisManager &MAM,
+                       MachineFunctionAnalysisManager *MFAM = nullptr);
 
   /// Registers all available module analysis passes.
   ///
@@ -147,7 +148,7 @@ public:
   /// ModuleAnalysisManager with all registered module analyses. Callers can
   /// still manually register any additional analyses. Callers can also
   /// pre-register analyses and this will not override those.
-  void registerModuleAnalyses(ModuleAnalysisManager &MAM);
+  LLVM_ABI void registerModuleAnalyses(ModuleAnalysisManager &MAM);
 
   /// Registers all available CGSCC analysis passes.
   ///
@@ -155,7 +156,7 @@ public:
   /// with all registered CGSCC analyses. Callers can still manually register any
   /// additional analyses. Callers can also pre-register analyses and this will
   /// not override those.
-  void registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM);
+  LLVM_ABI void registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM);
 
   /// Registers all available function analysis passes.
   ///
@@ -163,14 +164,14 @@ public:
   /// FunctionAnalysisManager with all registered function analyses. Callers can
   /// still manually register any additional analyses. Callers can also
   /// pre-register analyses and this will not override those.
-  void registerFunctionAnalyses(FunctionAnalysisManager &FAM);
+  LLVM_ABI void registerFunctionAnalyses(FunctionAnalysisManager &FAM);
 
   /// Registers all available loop analysis passes.
   ///
   /// This is an interface that can be used to populate a \c LoopAnalysisManager
   /// with all registered loop analyses. Callers can still manually register any
   /// additional analyses.
-  void registerLoopAnalyses(LoopAnalysisManager &LAM);
+  LLVM_ABI void registerLoopAnalyses(LoopAnalysisManager &LAM);
 
   /// Registers all available machine function analysis passes.
   ///
@@ -178,7 +179,8 @@ public:
   /// MachineFunctionAnalysisManager with all registered function analyses.
   /// Callers can still manually register any additional analyses. Callers can
   /// also pre-register analyses and this will not override those.
-  void registerMachineFunctionAnalyses(MachineFunctionAnalysisManager &MFAM);
+  LLVM_ABI void
+  registerMachineFunctionAnalyses(MachineFunctionAnalysisManager &MFAM);
 
   /// Construct the core LLVM function canonicalization and simplification
   /// pipeline.
@@ -194,9 +196,8 @@ public:
   /// build them.
   ///
   /// \p Phase indicates the current ThinLTO phase.
-  FunctionPassManager
-  buildFunctionSimplificationPipeline(OptimizationLevel Level,
-                                      ThinOrFullLTOPhase Phase);
+  LLVM_ABI FunctionPassManager buildFunctionSimplificationPipeline(
+      OptimizationLevel Level, ThinOrFullLTOPhase Phase);
 
   /// Construct the core LLVM module canonicalization and simplification
   /// pipeline.
@@ -213,18 +214,18 @@ public:
   /// build them.
   ///
   /// \p Phase indicates the current ThinLTO phase.
-  ModulePassManager buildModuleSimplificationPipeline(OptimizationLevel Level,
-                                                      ThinOrFullLTOPhase Phase);
+  LLVM_ABI ModulePassManager buildModuleSimplificationPipeline(
+      OptimizationLevel Level, ThinOrFullLTOPhase Phase);
 
   /// Construct the module pipeline that performs inlining as well as
   /// the inlining-driven cleanups.
-  ModuleInlinerWrapperPass buildInlinerPipeline(OptimizationLevel Level,
-                                                ThinOrFullLTOPhase Phase);
+  LLVM_ABI ModuleInlinerWrapperPass
+  buildInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase);
 
   /// Construct the module pipeline that performs inlining with
   /// module inliner pass.
-  ModulePassManager buildModuleInlinerPipeline(OptimizationLevel Level,
-                                               ThinOrFullLTOPhase Phase);
+  LLVM_ABI ModulePassManager
+  buildModuleInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase);
 
   /// Construct the core LLVM module optimization pipeline.
   ///
@@ -239,9 +240,8 @@ public:
   /// only intended for use when attempting to optimize code. If frontends
   /// require some transformations for semantic reasons, they should explicitly
   /// build them.
-  ModulePassManager
-  buildModuleOptimizationPipeline(OptimizationLevel Level,
-                                  ThinOrFullLTOPhase LTOPhase);
+  LLVM_ABI ModulePassManager buildModuleOptimizationPipeline(
+      OptimizationLevel Level, ThinOrFullLTOPhase LTOPhase);
 
   /// Build a per-module default optimization pipeline.
   ///
@@ -249,7 +249,7 @@ public:
   /// optimization and code generation without any link-time optimization. It
   /// typically correspond to frontend "-O[123]" options for optimization
   /// levels \c O1, \c O2 and \c O3 resp.
-  ModulePassManager buildPerModuleDefaultPipeline(
+  LLVM_ABI ModulePassManager buildPerModuleDefaultPipeline(
       OptimizationLevel Level,
       ThinOrFullLTOPhase Phase = ThinOrFullLTOPhase::None);
 
@@ -258,8 +258,9 @@ public:
   /// This builds a pipeline that runs the LTO/ThinLTO  pre-link pipeline, and
   /// emits a section containing the pre-link bitcode along side the object code
   /// generated in non-LTO compilation.
-  ModulePassManager buildFatLTODefaultPipeline(OptimizationLevel Level,
-                                               bool ThinLTO, bool EmitSummary);
+  LLVM_ABI ModulePassManager buildFatLTODefaultPipeline(OptimizationLevel Level,
+                                                        bool ThinLTO,
+                                                        bool EmitSummary);
 
   /// Build a pre-link, ThinLTO-targeting default optimization pipeline to
   /// a pass manager.
@@ -268,7 +269,8 @@ public:
   /// a ThinLTO run. It works to minimize the IR which needs to be analyzed
   /// without making irreversible decisions which could be made better during
   /// the LTO run.
-  ModulePassManager buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level);
+  LLVM_ABI ModulePassManager
+  buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level);
 
   /// Build a ThinLTO default optimization pipeline to a pass manager.
   ///
@@ -276,9 +278,8 @@ public:
   /// optimization and code generation. It is particularly tuned to fit well
   /// when IR coming into the LTO phase was first run through \c
   /// buildThinLTOPreLinkDefaultPipeline, and the two coordinate closely.
-  ModulePassManager
-  buildThinLTODefaultPipeline(OptimizationLevel Level,
-                              const ModuleSummaryIndex *ImportSummary);
+  LLVM_ABI ModulePassManager buildThinLTODefaultPipeline(
+      OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary);
 
   /// Build a pre-link, LTO-targeting default optimization pipeline to a pass
   /// manager.
@@ -287,7 +288,8 @@ public:
   /// run. It works to minimize the IR which needs to be analyzed without
   /// making irreversible decisions which could be made better during the LTO
   /// run.
-  ModulePassManager buildLTOPreLinkDefaultPipeline(OptimizationLevel Level);
+  LLVM_ABI ModulePassManager
+  buildLTOPreLinkDefaultPipeline(OptimizationLevel Level);
 
   /// Build an LTO default optimization pipeline to a pass manager.
   ///
@@ -295,13 +297,13 @@ public:
   /// optimization and code generation. It is particularly tuned to fit well
   /// when IR coming into the LTO phase was first run through \c
   /// buildLTOPreLinkDefaultPipeline, and the two coordinate closely.
-  ModulePassManager buildLTODefaultPipeline(OptimizationLevel Level,
-                                            ModuleSummaryIndex *ExportSummary);
+  LLVM_ABI ModulePassManager buildLTODefaultPipeline(
+      OptimizationLevel Level, ModuleSummaryIndex *ExportSummary);
 
   /// Build an O0 pipeline with the minimal semantically required passes.
   ///
   /// This should only be used for non-LTO and LTO pre-link pipelines.
-  ModulePassManager
+  LLVM_ABI ModulePassManager
   buildO0DefaultPipeline(OptimizationLevel Level,
                          ThinOrFullLTOPhase Phase = ThinOrFullLTOPhase::None);
 
@@ -310,7 +312,7 @@ public:
   ///
   /// This also adds target-specific alias analyses registered via
   /// TargetMachine::registerDefaultAliasAnalyses().
-  AAManager buildDefaultAAPipeline();
+  LLVM_ABI AAManager buildDefaultAAPipeline();
 
   /// Parse a textual pass pipeline description into a \c
   /// ModulePassManager.
@@ -352,7 +354,8 @@ public:
   /// specifically want the pass to run under a adaptor directly. This is
   /// preferred when a pipeline is largely of one type, but one or just a few
   /// passes are of different types(See PassBuilder.cpp for examples).
-  Error parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(ModulePassManager &MPM,
+                                   StringRef PipelineText);
 
   /// {{@ Parse a textual pass pipeline description into a specific PassManager
   ///
@@ -361,9 +364,12 @@ public:
   /// this is the valid pipeline text:
   ///
   ///   function(lpass)
-  Error parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText);
-  Error parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText);
-  Error parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(CGSCCPassManager &CGPM,
+                                   StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(FunctionPassManager &FPM,
+                                   StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(LoopPassManager &LPM,
+                                   StringRef PipelineText);
   /// @}}
 
   /// Parse a textual MIR pipeline into the provided \c MachineFunctionPass
@@ -375,8 +381,8 @@ public:
   ///
   /// There is no need to specify the pass nesting, and this function
   /// currently cannot handle the pass nesting.
-  Error parsePassPipeline(MachineFunctionPassManager &MFPM,
-                          StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(MachineFunctionPassManager &MFPM,
+                                   StringRef PipelineText);
 
   /// Parse a textual alias analysis pipeline into the provided AA manager.
   ///
@@ -393,14 +399,14 @@ public:
   /// Returns false if the text cannot be parsed cleanly. The specific state of
   /// the \p AA manager is unspecified if such an error is encountered and this
   /// returns false.
-  Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
+  LLVM_ABI Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
 
   /// Parse RegAllocFilterName to get RegAllocFilterFunc.
-  std::optional<RegAllocFilterFunc>
+  LLVM_ABI std::optional<RegAllocFilterFunc>
   parseRegAllocFilter(StringRef RegAllocFilterName);
 
   /// Print pass names.
-  void printPassNames(raw_ostream &OS);
+  LLVM_ABI void printPassNames(raw_ostream &OS);
 
   /// Register a callback for a default optimizer pipeline extension
   /// point
@@ -614,16 +620,17 @@ public:
   /// If the PassManager type is not given at the top level of the pipeline
   /// text, this Callback should be used to determine the appropriate stack of
   /// PassManagers and populate the passed ModulePassManager.
-  void registerParseTopLevelPipelineCallback(
+  LLVM_ABI void registerParseTopLevelPipelineCallback(
       const std::function<bool(ModulePassManager &, ArrayRef<PipelineElement>)>
           &C);
 
   /// Add PGOInstrumenation passes for O0 only.
-  void addPGOInstrPassesForO0(ModulePassManager &MPM, bool RunProfileGen,
-                              bool IsCS, bool AtomicCounterUpdate,
-                              std::string ProfileFile,
-                              std::string ProfileRemappingFile,
-                              IntrusiveRefCntPtr<vfs::FileSystem> FS);
+  LLVM_ABI void addPGOInstrPassesForO0(ModulePassManager &MPM,
+                                       bool RunProfileGen, bool IsCS,
+                                       bool AtomicCounterUpdate,
+                                       std::string ProfileFile,
+                                       std::string ProfileRemappingFile,
+                                       IntrusiveRefCntPtr<vfs::FileSystem> FS);
 
   /// Returns PIC. External libraries can use this to register pass
   /// instrumentation callbacks.
@@ -634,35 +641,38 @@ public:
   // Invoke the callbacks registered for the various extension points.
   // Custom pipelines should use these to invoke the callbacks registered
   // by TargetMachines and other clients.
-  void invokePeepholeEPCallbacks(FunctionPassManager &FPM,
-                                 OptimizationLevel Level);
-  void invokeLateLoopOptimizationsEPCallbacks(LoopPassManager &LPM,
-                                              OptimizationLevel Level);
-  void invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM,
-                                         OptimizationLevel Level);
-  void invokeScalarOptimizerLateEPCallbacks(FunctionPassManager &FPM,
-                                            OptimizationLevel Level);
-  void invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM,
-                                           OptimizationLevel Level);
-  void invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM,
-                                        OptimizationLevel Level);
-  void invokeVectorizerEndEPCallbacks(FunctionPassManager &FPM,
-                                      OptimizationLevel Level);
-  void invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM,
-                                       OptimizationLevel Level,
-                                       ThinOrFullLTOPhase Phase);
-  void invokeOptimizerLastEPCallbacks(ModulePassManager &MPM,
-                                      OptimizationLevel Level,
-                                      ThinOrFullLTOPhase Phase);
-  void invokeFullLinkTimeOptimizationEarlyEPCallbacks(ModulePassManager &MPM,
-                                                      OptimizationLevel Level);
-  void invokeFullLinkTimeOptimizationLastEPCallbacks(ModulePassManager &MPM,
+  LLVM_ABI void invokePeepholeEPCallbacks(FunctionPassManager &FPM,
+                                          OptimizationLevel Level);
+  LLVM_ABI void invokeLateLoopOptimizationsEPCallbacks(LoopPassManager &LPM,
+                                                       OptimizationLevel Level);
+  LLVM_ABI void invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM,
+                                                  OptimizationLevel Level);
+  LLVM_ABI void invokeScalarOptimizerLateEPCallbacks(FunctionPassManager &FPM,
                                                      OptimizationLevel Level);
-  void invokePipelineStartEPCallbacks(ModulePassManager &MPM,
-                                      OptimizationLevel Level);
-  void invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM,
-                                                    OptimizationLevel Level,
-                                                    ThinOrFullLTOPhase Phase);
+  LLVM_ABI void invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM,
+                                                    OptimizationLevel Level);
+  LLVM_ABI void invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM,
+                                                 OptimizationLevel Level);
+  LLVM_ABI void invokeVectorizerEndEPCallbacks(FunctionPassManager &FPM,
+                                               OptimizationLevel Level);
+  LLVM_ABI void invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM,
+                                                OptimizationLevel Level,
+                                                ThinOrFullLTOPhase Phase);
+  LLVM_ABI void invokeOptimizerLastEPCallbacks(ModulePassManager &MPM,
+                                               OptimizationLevel Level,
+                                               ThinOrFullLTOPhase Phase);
+  LLVM_ABI void
+  invokeFullLinkTimeOptimizationEarlyEPCallbacks(ModulePassManager &MPM,
+                                                 OptimizationLevel Level);
+  LLVM_ABI void
+  invokeFullLinkTimeOptimizationLastEPCallbacks(ModulePassManager &MPM,
+                                                OptimizationLevel Level);
+  LLVM_ABI void invokePipelineStartEPCallbacks(ModulePassManager &MPM,
+                                               OptimizationLevel Level);
+  LLVM_ABI void
+  invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM,
+                                               OptimizationLevel Level,
+                                               ThinOrFullLTOPhase Phase);
 
   static bool checkParametrizedPassName(StringRef Name, StringRef PassName) {
     if (!Name.consume_front(PassName))
@@ -713,9 +723,9 @@ public:
   /// Handle passes only accept one bool-valued parameter.
   ///
   /// \return false when Params is empty.
-  static Expected<bool> parseSinglePassOption(StringRef Params,
-                                              StringRef OptionName,
-                                              StringRef PassName);
+  LLVM_ABI static Expected<bool> parseSinglePassOption(StringRef Params,
+                                                       StringRef OptionName,
+                                                       StringRef PassName);
 
 private:
   // O1 pass pipeline
@@ -898,7 +908,7 @@ struct NoOpModulePass : PassInfoMixin<NoOpModulePass> {
 /// No-op module analysis.
 class NoOpModuleAnalysis : public AnalysisInfoMixin<NoOpModuleAnalysis> {
   friend AnalysisInfoMixin<NoOpModuleAnalysis>;
-  static AnalysisKey Key;
+  LLVM_ABI static AnalysisKey Key;
 
 public:
   struct Result {};
@@ -916,7 +926,7 @@ struct NoOpCGSCCPass : PassInfoMixin<NoOpCGSCCPass> {
 /// No-op CGSCC analysis.
 class NoOpCGSCCAnalysis : public AnalysisInfoMixin<NoOpCGSCCAnalysis> {
   friend AnalysisInfoMixin<NoOpCGSCCAnalysis>;
-  static AnalysisKey Key;
+  LLVM_ABI static AnalysisKey Key;
 
 public:
   struct Result {};
@@ -935,7 +945,7 @@ struct NoOpFunctionPass : PassInfoMixin<NoOpFunctionPass> {
 /// No-op function analysis.
 class NoOpFunctionAnalysis : public AnalysisInfoMixin<NoOpFunctionAnalysis> {
   friend AnalysisInfoMixin<NoOpFunctionAnalysis>;
-  static AnalysisKey Key;
+  LLVM_ABI static AnalysisKey Key;
 
 public:
   struct Result {};
@@ -968,7 +978,7 @@ struct NoOpMachineFunctionPass : public PassInfoMixin<NoOpMachineFunctionPass> {
 /// No-op loop analysis.
 class NoOpLoopAnalysis : public AnalysisInfoMixin<NoOpLoopAnalysis> {
   friend AnalysisInfoMixin<NoOpLoopAnalysis>;
-  static AnalysisKey Key;
+  LLVM_ABI static AnalysisKey Key;
 
 public:
   struct Result {};
@@ -978,8 +988,7 @@ public:
 };
 
 /// Common option used by multiple tools to print pipeline passes
-extern cl::opt<bool> PrintPipelinePasses;
-
+LLVM_ABI extern cl::opt<bool> PrintPipelinePasses;
 }
 
 #endif
diff --git a/llvm/include/llvm/Passes/PassPlugin.h b/llvm/include/llvm/Passes/PassPlugin.h
index 013b7a827c47..947504bc207a 100644
--- a/llvm/include/llvm/Passes/PassPlugin.h
+++ b/llvm/include/llvm/Passes/PassPlugin.h
@@ -64,7 +64,7 @@ public:
   /// \returns Returns an error if either the library cannot be found or loaded,
   /// there is no public entry point, or the plugin implements the wrong API
   /// version.
-  static Expected<PassPlugin> Load(const std::string &Filename);
+  LLVM_ABI static Expected<PassPlugin> Load(const std::string &Filename);
 
   /// Get the filename of the loaded plugin.
   StringRef getFilename() const { return Filename; }
diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h
index f7a65a88ecf5..4ee5ab255486 100644
--- a/llvm/include/llvm/Passes/StandardInstrumentations.h
+++ b/llvm/include/llvm/Passes/StandardInstrumentations.h
@@ -27,6 +27,7 @@
 #include "llvm/IR/PassTimingInfo.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Transforms/IPO/SampleProfileProbe.h"
 
@@ -46,9 +47,9 @@ class PassInstrumentationCallbacks;
 /// (typically Loop or SCC).
 class PrintIRInstrumentation {
 public:
-  ~PrintIRInstrumentation();
+  LLVM_ABI ~PrintIRInstrumentation();
 
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 
 private:
   struct PassRunDescriptor {
@@ -104,7 +105,7 @@ private:
 class OptNoneInstrumentation {
 public:
   OptNoneInstrumentation(bool DebugLogging) : DebugLogging(DebugLogging) {}
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 
 private:
   bool DebugLogging;
@@ -116,8 +117,8 @@ class OptPassGateInstrumentation {
   bool HasWrittenIR = false;
 public:
   OptPassGateInstrumentation(LLVMContext &Context) : Context(Context) {}
-  bool shouldRun(StringRef PassName, Any IR);
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI bool shouldRun(StringRef PassName, Any IR);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 };
 
 struct PrintPassOptions {
@@ -136,7 +137,7 @@ class PrintPassInstrumentation {
 public:
   PrintPassInstrumentation(bool Enabled, PrintPassOptions Opts)
       : Enabled(Enabled), Opts(Opts) {}
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 
 private:
   bool Enabled;
@@ -167,7 +168,7 @@ public:
     std::optional<DenseMap<intptr_t, BBGuard>> BBGuards;
     DenseMap<const BasicBlock *, DenseMap<const BasicBlock *, unsigned>> Graph;
 
-    CFG(const Function *F, bool TrackBBLifetime);
+    LLVM_ABI CFG(const Function *F, bool TrackBBLifetime);
 
     bool operator==(const CFG &G) const {
       return !isPoisoned() && !G.isPoisoned() && Graph == G.Graph;
@@ -179,18 +180,18 @@ public:
              });
     }
 
-    static void printDiff(raw_ostream &out, const CFG &Before,
-                          const CFG &After);
-    bool invalidate(Function &F, const PreservedAnalyses &PA,
-                    FunctionAnalysisManager::Invalidator &);
+    LLVM_ABI static void printDiff(raw_ostream &out, const CFG &Before,
+                                   const CFG &After);
+    LLVM_ABI bool invalidate(Function &F, const PreservedAnalyses &PA,
+                             FunctionAnalysisManager::Invalidator &);
   };
 
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
   SmallVector<StringRef, 8> PassStack;
 #endif
 
-  void registerCallbacks(PassInstrumentationCallbacks &PIC,
-                         ModuleAnalysisManager &MAM);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC,
+                                  ModuleAnalysisManager &MAM);
 };
 
 // Base class for classes that report changes to the IR.
@@ -208,7 +209,7 @@ public:
 // 6.  When a pass is run on an IR that is not interesting (based on options).
 // 7.  When a pass is ignored (pass manager or adapter pass).
 // 8.  To compare two IR representations (of type \p T).
-template <typename IRUnitT> class ChangeReporter {
+template <typename IRUnitT> class LLVM_ABI ChangeReporter {
 protected:
   ChangeReporter(bool RunInVerboseMode) : VerboseMode(RunInVerboseMode) {}
 
@@ -257,7 +258,7 @@ protected:
 // An abstract template base class that handles printing banners and
 // reporting when things have not changed or are filtered out.
 template <typename IRUnitT>
-class TextChangeReporter : public ChangeReporter<IRUnitT> {
+class LLVM_ABI TextChangeReporter : public ChangeReporter<IRUnitT> {
 protected:
   TextChangeReporter(bool Verbose);
 
@@ -281,7 +282,7 @@ protected:
 // by unwrapAndPrint.  The string representation is stored in a std::string
 // to preserve it as the IR changes in each pass.  Note that the banner is
 // included in this representation but it is massaged before reporting.
-class IRChangedPrinter : public TextChangeReporter<std::string> {
+class LLVM_ABI IRChangedPrinter : public TextChangeReporter<std::string> {
 public:
   IRChangedPrinter(bool VerboseMode)
       : TextChangeReporter<std::string>(VerboseMode) {}
@@ -298,7 +299,7 @@ protected:
                    Any) override;
 };
 
-class IRChangedTester : public IRChangedPrinter {
+class LLVM_ABI IRChangedTester : public IRChangedPrinter {
 public:
   IRChangedTester() : IRChangedPrinter(true) {}
   ~IRChangedTester() override;
@@ -444,7 +445,8 @@ protected:
 // and added, respectively.  Changes to the IR that do not affect basic
 // blocks are not reported as having changed the IR.  The option
 // -print-module-scope does not affect this change reporter.
-class InLineChangePrinter : public TextChangeReporter<IRDataT<EmptyData>> {
+class LLVM_ABI InLineChangePrinter
+    : public TextChangeReporter<IRDataT<EmptyData>> {
 public:
   InLineChangePrinter(bool VerboseMode, bool ColourMode)
       : TextChangeReporter<IRDataT<EmptyData>>(VerboseMode),
@@ -475,8 +477,8 @@ class VerifyInstrumentation {
 
 public:
   VerifyInstrumentation(bool DebugLogging) : DebugLogging(DebugLogging) {}
-  void registerCallbacks(PassInstrumentationCallbacks &PIC,
-                         ModuleAnalysisManager *MAM);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC,
+                                  ModuleAnalysisManager *MAM);
 };
 
 /// This class implements --time-trace functionality for new pass manager.
@@ -484,12 +486,12 @@ public:
 /// execution time. They collect time tracing info by TimeProfiler.
 class TimeProfilingPassesHandler {
 public:
-  TimeProfilingPassesHandler();
+  LLVM_ABI TimeProfilingPassesHandler();
   // We intend this to be unique per-compilation, thus no copies.
   TimeProfilingPassesHandler(const TimeProfilingPassesHandler &) = delete;
   void operator=(const TimeProfilingPassesHandler &) = delete;
 
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 
 private:
   // Implementation of pass instrumentation callbacks.
@@ -502,8 +504,8 @@ private:
 class DCData {
 public:
   // Fill the map with the transitions from basic block \p B.
-  DCData(const BasicBlock &B);
-  DCData(const MachineBasicBlock &B);
+  LLVM_ABI DCData(const BasicBlock &B);
+  LLVM_ABI DCData(const MachineBasicBlock &B);
 
   // Return an iterator to the names of the successor blocks.
   StringMap<std::string>::const_iterator begin() const {
@@ -531,7 +533,7 @@ protected:
 
 // A change reporter that builds a website with links to pdf files showing
 // dot control flow graphs with changed instructions shown in colour.
-class DotCfgChangeReporter : public ChangeReporter<IRDataT<DCData>> {
+class LLVM_ABI DotCfgChangeReporter : public ChangeReporter<IRDataT<DCData>> {
 public:
   DotCfgChangeReporter(bool Verbose);
   ~DotCfgChangeReporter() override;
@@ -578,9 +580,9 @@ class PrintCrashIRInstrumentation {
 public:
   PrintCrashIRInstrumentation()
       : SavedIR("*** Dump of IR Before Last Pass Unknown ***") {}
-  ~PrintCrashIRInstrumentation();
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
-  void reportCrashIR();
+  LLVM_ABI ~PrintCrashIRInstrumentation();
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void reportCrashIR();
 
 protected:
   std::string SavedIR;
@@ -614,26 +616,22 @@ class StandardInstrumentations {
   bool VerifyEach;
 
 public:
+  LLVM_ABI
   StandardInstrumentations(LLVMContext &Context, bool DebugLogging,
                            bool VerifyEach = false,
                            PrintPassOptions PrintPassOpts = PrintPassOptions());
 
   // Register all the standard instrumentation callbacks. If \p FAM is nullptr
   // then PreservedCFGChecker is not enabled.
-  void registerCallbacks(PassInstrumentationCallbacks &PIC,
-                         ModuleAnalysisManager *MAM = nullptr);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC,
+                                  ModuleAnalysisManager *MAM = nullptr);
 
   TimePassesHandler &getTimePasses() { return TimePasses; }
 };
 
-extern template class ChangeReporter<std::string>;
-extern template class TextChangeReporter<std::string>;
-
 extern template class BlockDataT<EmptyData>;
 extern template class FuncDataT<EmptyData>;
 extern template class IRDataT<EmptyData>;
-extern template class ChangeReporter<IRDataT<EmptyData>>;
-extern template class TextChangeReporter<IRDataT<EmptyData>>;
 extern template class IRComparer<EmptyData>;
 
 } // namespace llvm

From febb7e8443c4e8ff55e6b21bec4e2233b62d832b Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Mon, 16 Jun 2025 11:04:17 -0700
Subject: [PATCH 0526/1322] [llvm] annotate interfaces in XRay for DLL export
 (#143765)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/XRay` library.
These annotations currently have no meaningful impact on the LLVM build;
however, they are a prerequisite to support an LLVM Windows DLL (shared
library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

The bulk of these changes were generated automatically using the
[Interface Definition Scanner (IDS)](https://github.com/compnerd/ids)
tool, followed formatting with `git clang-format`.

Additionally, I manually added `LLVM_ABI_FRIEND` to friend member
functions declared with `LLVM_ABI`.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/XRay/BlockIndexer.h       |  3 ++-
 llvm/include/llvm/XRay/BlockPrinter.h       |  3 ++-
 llvm/include/llvm/XRay/BlockVerifier.h      |  3 ++-
 llvm/include/llvm/XRay/FDRRecordConsumer.h  |  5 ++--
 llvm/include/llvm/XRay/FDRRecordProducer.h  |  3 ++-
 llvm/include/llvm/XRay/FDRRecords.h         | 29 +++++++++++----------
 llvm/include/llvm/XRay/FDRTraceWriter.h     |  5 ++--
 llvm/include/llvm/XRay/FileHeaderReader.h   |  5 ++--
 llvm/include/llvm/XRay/InstrumentationMap.h | 11 +++++---
 llvm/include/llvm/XRay/Profile.h            | 19 +++++++-------
 llvm/include/llvm/XRay/RecordPrinter.h      |  3 ++-
 llvm/include/llvm/XRay/Trace.h              |  8 +++---
 12 files changed, 56 insertions(+), 41 deletions(-)

diff --git a/llvm/include/llvm/XRay/BlockIndexer.h b/llvm/include/llvm/XRay/BlockIndexer.h
index 77af77e5ec26..e9782dafed61 100644
--- a/llvm/include/llvm/XRay/BlockIndexer.h
+++ b/llvm/include/llvm/XRay/BlockIndexer.h
@@ -14,6 +14,7 @@
 #define LLVM_XRAY_BLOCKINDEXER_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/XRay/FDRRecords.h"
 #include <cstdint>
 #include <vector>
@@ -23,7 +24,7 @@ namespace xray {
 
 // The BlockIndexer will gather all related records associated with a
 // process+thread and group them by 'Block'.
-class BlockIndexer : public RecordVisitor {
+class LLVM_ABI BlockIndexer : public RecordVisitor {
 public:
   struct Block {
     uint64_t ProcessID;
diff --git a/llvm/include/llvm/XRay/BlockPrinter.h b/llvm/include/llvm/XRay/BlockPrinter.h
index 2f9fed668069..caf78c5c4a5a 100644
--- a/llvm/include/llvm/XRay/BlockPrinter.h
+++ b/llvm/include/llvm/XRay/BlockPrinter.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_XRAY_BLOCKPRINTER_H
 #define LLVM_XRAY_BLOCKPRINTER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/XRay/FDRRecords.h"
 #include "llvm/XRay/RecordPrinter.h"
@@ -20,7 +21,7 @@
 namespace llvm {
 namespace xray {
 
-class BlockPrinter : public RecordVisitor {
+class LLVM_ABI BlockPrinter : public RecordVisitor {
   enum class State {
     Start,
     Preamble,
diff --git a/llvm/include/llvm/XRay/BlockVerifier.h b/llvm/include/llvm/XRay/BlockVerifier.h
index 2450ad89ffe3..b88785c393e3 100644
--- a/llvm/include/llvm/XRay/BlockVerifier.h
+++ b/llvm/include/llvm/XRay/BlockVerifier.h
@@ -13,12 +13,13 @@
 #ifndef LLVM_XRAY_BLOCKVERIFIER_H
 #define LLVM_XRAY_BLOCKVERIFIER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/XRay/FDRRecords.h"
 
 namespace llvm {
 namespace xray {
 
-class BlockVerifier : public RecordVisitor {
+class LLVM_ABI BlockVerifier : public RecordVisitor {
 public:
   // We force State elements to be size_t, to be used as indices for containers.
   enum class State : std::size_t {
diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h
index 8fff9fb86158..473777f0e04f 100644
--- a/llvm/include/llvm/XRay/FDRRecordConsumer.h
+++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h
@@ -8,6 +8,7 @@
 #ifndef LLVM_XRAY_FDRRECORDCONSUMER_H
 #define LLVM_XRAY_FDRRECORDCONSUMER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/FDRRecords.h"
 #include <algorithm>
@@ -25,7 +26,7 @@ public:
 
 // This consumer will collect all the records into a vector of records, in
 // arrival order.
-class LogBuilderConsumer : public RecordConsumer {
+class LLVM_ABI LogBuilderConsumer : public RecordConsumer {
   std::vector<std::unique_ptr<Record>> &Records;
 
 public:
@@ -38,7 +39,7 @@ public:
 // A PipelineConsumer applies a set of visitors to every consumed Record, in the
 // order by which the visitors are added to the pipeline in the order of
 // appearance.
-class PipelineConsumer : public RecordConsumer {
+class LLVM_ABI PipelineConsumer : public RecordConsumer {
   std::vector<RecordVisitor *> Visitors;
 
 public:
diff --git a/llvm/include/llvm/XRay/FDRRecordProducer.h b/llvm/include/llvm/XRay/FDRRecordProducer.h
index 25c123aec1b2..083b57139d39 100644
--- a/llvm/include/llvm/XRay/FDRRecordProducer.h
+++ b/llvm/include/llvm/XRay/FDRRecordProducer.h
@@ -8,6 +8,7 @@
 #ifndef LLVM_XRAY_FDRRECORDPRODUCER_H
 #define LLVM_XRAY_FDRRECORDPRODUCER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/FDRRecords.h"
 #include "llvm/XRay/XRayRecord.h"
@@ -24,7 +25,7 @@ public:
   virtual ~RecordProducer() = default;
 };
 
-class FileBasedRecordProducer : public RecordProducer {
+class LLVM_ABI FileBasedRecordProducer : public RecordProducer {
   const XRayFileHeader &Header;
   DataExtractor &E;
   uint64_t &OffsetPtr;
diff --git a/llvm/include/llvm/XRay/FDRRecords.h b/llvm/include/llvm/XRay/FDRRecords.h
index 8af88f5b0e13..7ee8db61b210 100644
--- a/llvm/include/llvm/XRay/FDRRecords.h
+++ b/llvm/include/llvm/XRay/FDRRecords.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_XRAY_FDRRECORDS_H
 #define LLVM_XRAY_FDRRECORDS_H
 
+#include "llvm/Support/Compiler.h"
 #include <cstdint>
 #include <string>
 
@@ -47,7 +48,7 @@ public:
     RK_Function,
   };
 
-  static StringRef kindToString(RecordKind K);
+  LLVM_ABI static StringRef kindToString(RecordKind K);
 
 private:
   const RecordKind T;
@@ -107,7 +108,7 @@ public:
 // What follows are specific Metadata record types which encapsulate the
 // information associated with specific metadata record types in an FDR mode
 // log.
-class BufferExtents : public MetadataRecord {
+class LLVM_ABI BufferExtents : public MetadataRecord {
   uint64_t Size = 0;
   friend class RecordInitializer;
 
@@ -130,7 +131,7 @@ public:
   }
 };
 
-class WallclockRecord : public MetadataRecord {
+class LLVM_ABI WallclockRecord : public MetadataRecord {
   uint64_t Seconds = 0;
   uint32_t Nanos = 0;
   friend class RecordInitializer;
@@ -155,7 +156,7 @@ public:
   }
 };
 
-class NewCPUIDRecord : public MetadataRecord {
+class LLVM_ABI NewCPUIDRecord : public MetadataRecord {
   uint16_t CPUId = 0;
   uint64_t TSC = 0;
   friend class RecordInitializer;
@@ -181,7 +182,7 @@ public:
   }
 };
 
-class TSCWrapRecord : public MetadataRecord {
+class LLVM_ABI TSCWrapRecord : public MetadataRecord {
   uint64_t BaseTSC = 0;
   friend class RecordInitializer;
 
@@ -203,7 +204,7 @@ public:
   }
 };
 
-class CustomEventRecord : public MetadataRecord {
+class LLVM_ABI CustomEventRecord : public MetadataRecord {
   int32_t Size = 0;
   uint64_t TSC = 0;
   uint16_t CPU = 0;
@@ -232,7 +233,7 @@ public:
   }
 };
 
-class CustomEventRecordV5 : public MetadataRecord {
+class LLVM_ABI CustomEventRecordV5 : public MetadataRecord {
   int32_t Size = 0;
   int32_t Delta = 0;
   std::string Data{};
@@ -259,7 +260,7 @@ public:
   }
 };
 
-class TypedEventRecord : public MetadataRecord {
+class LLVM_ABI TypedEventRecord : public MetadataRecord {
   int32_t Size = 0;
   int32_t Delta = 0;
   uint16_t EventType = 0;
@@ -288,7 +289,7 @@ public:
   }
 };
 
-class CallArgRecord : public MetadataRecord {
+class LLVM_ABI CallArgRecord : public MetadataRecord {
   uint64_t Arg = 0;
   friend class RecordInitializer;
 
@@ -310,7 +311,7 @@ public:
   }
 };
 
-class PIDRecord : public MetadataRecord {
+class LLVM_ABI PIDRecord : public MetadataRecord {
   int32_t PID = 0;
   friend class RecordInitializer;
 
@@ -333,7 +334,7 @@ public:
   }
 };
 
-class NewBufferRecord : public MetadataRecord {
+class LLVM_ABI NewBufferRecord : public MetadataRecord {
   int32_t TID = 0;
   friend class RecordInitializer;
 
@@ -356,7 +357,7 @@ public:
   }
 };
 
-class EndBufferRecord : public MetadataRecord {
+class LLVM_ABI EndBufferRecord : public MetadataRecord {
 public:
   EndBufferRecord()
       : MetadataRecord(RecordKind::RK_Metadata_EndOfBuffer,
@@ -369,7 +370,7 @@ public:
   }
 };
 
-class FunctionRecord : public Record {
+class LLVM_ABI FunctionRecord : public Record {
   RecordTypes Kind;
   int32_t FuncId = 0;
   uint32_t Delta = 0;
@@ -415,7 +416,7 @@ public:
   virtual Error visit(TypedEventRecord &) = 0;
 };
 
-class RecordInitializer : public RecordVisitor {
+class LLVM_ABI RecordInitializer : public RecordVisitor {
   DataExtractor &E;
   uint64_t &OffsetPtr;
   uint16_t Version;
diff --git a/llvm/include/llvm/XRay/FDRTraceWriter.h b/llvm/include/llvm/XRay/FDRTraceWriter.h
index 40d5f5af91c9..a3dc58e03333 100644
--- a/llvm/include/llvm/XRay/FDRTraceWriter.h
+++ b/llvm/include/llvm/XRay/FDRTraceWriter.h
@@ -12,8 +12,9 @@
 #ifndef LLVM_XRAY_FDRTRACEWRITER_H
 #define LLVM_XRAY_FDRTRACEWRITER_H
 
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/XRay/FDRRecords.h"
 #include "llvm/XRay/XRayRecord.h"
 
@@ -26,7 +27,7 @@ namespace xray {
 /// generate various kinds of execution traces without using the XRay runtime.
 /// Note that this writer does not do any validation, but uses the types of
 /// records defined in the FDRRecords.h file.
-class FDRTraceWriter : public RecordVisitor {
+class LLVM_ABI FDRTraceWriter : public RecordVisitor {
 public:
   // Construct an FDRTraceWriter associated with an output stream.
   explicit FDRTraceWriter(raw_ostream &O, const XRayFileHeader &H);
diff --git a/llvm/include/llvm/XRay/FileHeaderReader.h b/llvm/include/llvm/XRay/FileHeaderReader.h
index 485d26d71456..ecdb975a3066 100644
--- a/llvm/include/llvm/XRay/FileHeaderReader.h
+++ b/llvm/include/llvm/XRay/FileHeaderReader.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_XRAY_FILEHEADERREADER_H
 #define LLVM_XRAY_FILEHEADERREADER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/XRayRecord.h"
@@ -23,8 +24,8 @@ namespace xray {
 
 /// Convenience function for loading the file header given a data extractor at a
 /// specified offset.
-Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
-                                                uint64_t &OffsetPtr);
+LLVM_ABI Expected<XRayFileHeader>
+readBinaryFormatHeader(DataExtractor &HeaderExtractor, uint64_t &OffsetPtr);
 
 } // namespace xray
 } // namespace llvm
diff --git a/llvm/include/llvm/XRay/InstrumentationMap.h b/llvm/include/llvm/XRay/InstrumentationMap.h
index 1979108ff413..54737e226df8 100644
--- a/llvm/include/llvm/XRay/InstrumentationMap.h
+++ b/llvm/include/llvm/XRay/InstrumentationMap.h
@@ -15,6 +15,7 @@
 #define LLVM_XRAY_INSTRUMENTATIONMAP_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
@@ -31,7 +32,8 @@ class InstrumentationMap;
 
 /// Loads the instrumentation map from |Filename|. This auto-deduces the type of
 /// the instrumentation map.
-Expected<InstrumentationMap> loadInstrumentationMap(StringRef Filename);
+LLVM_ABI Expected<InstrumentationMap>
+loadInstrumentationMap(StringRef Filename);
 
 /// Represents an XRay instrumentation sled entry from an object file.
 struct SledEntry {
@@ -83,17 +85,18 @@ private:
   FunctionAddressMap FunctionAddresses;
   FunctionAddressReverseMap FunctionIds;
 
-  friend Expected<InstrumentationMap> loadInstrumentationMap(StringRef);
+  LLVM_ABI_FRIEND friend Expected<InstrumentationMap>
+      loadInstrumentationMap(StringRef);
 
 public:
   /// Provides a raw accessor to the unordered map of function addresses.
   const FunctionAddressMap &getFunctionAddresses() { return FunctionAddresses; }
 
   /// Returns an XRay computed function id, provided a function address.
-  std::optional<int32_t> getFunctionId(uint64_t Addr) const;
+  LLVM_ABI std::optional<int32_t> getFunctionId(uint64_t Addr) const;
 
   /// Returns the function address for a function id.
-  std::optional<uint64_t> getFunctionAddr(int32_t FuncId) const;
+  LLVM_ABI std::optional<uint64_t> getFunctionAddr(int32_t FuncId) const;
 
   /// Provide read-only access to the entries of the instrumentation map.
   const SledContainer &sleds() const { return Sleds; };
diff --git a/llvm/include/llvm/XRay/Profile.h b/llvm/include/llvm/XRay/Profile.h
index 79d9b53387f3..e30c01e489d3 100644
--- a/llvm/include/llvm/XRay/Profile.h
+++ b/llvm/include/llvm/XRay/Profile.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include <list>
 #include <utility>
@@ -34,18 +35,18 @@ class Trace;
 ///
 /// For any errors encountered in the loading of the profile data from
 /// |Filename|, this function will return an Error condition appropriately.
-Expected<Profile> loadProfile(StringRef Filename);
+LLVM_ABI Expected<Profile> loadProfile(StringRef Filename);
 
 /// This algorithm will merge two Profile instances into a single Profile
 /// instance, aggregating blocks by Thread ID.
-Profile mergeProfilesByThread(const Profile &L, const Profile &R);
+LLVM_ABI Profile mergeProfilesByThread(const Profile &L, const Profile &R);
 
 /// This algorithm will merge two Profile instances into a single Profile
 /// instance, aggregating blocks by function call stack.
-Profile mergeProfilesByStack(const Profile &L, const Profile &R);
+LLVM_ABI Profile mergeProfilesByStack(const Profile &L, const Profile &R);
 
 /// This function takes a Trace and creates a Profile instance from it.
-Expected<Profile> profileFromTrace(const Trace &T);
+LLVM_ABI Expected<Profile> profileFromTrace(const Trace &T);
 
 /// Profile instances are thread-compatible.
 class Profile {
@@ -68,11 +69,11 @@ public:
   ///
   /// Returns an error if |P| had not been interned before into the Profile.
   ///
-  Expected<std::vector<FuncID>> expandPath(PathID P) const;
+  LLVM_ABI Expected<std::vector<FuncID>> expandPath(PathID P) const;
 
   /// The stack represented in |P| must be in stack order (leaf to root). This
   /// will always return the same PathID for |P| that has the same sequence.
-  PathID internPath(ArrayRef<FuncID> P);
+  LLVM_ABI PathID internPath(ArrayRef<FuncID> P);
 
   /// Appends a fully-formed Block instance into the Profile.
   ///
@@ -80,7 +81,7 @@ public:
   ///
   ///    - The PathData component of the Block is empty
   ///
-  Error addBlock(Block &&B);
+  LLVM_ABI Error addBlock(Block &&B);
 
   Profile() = default;
   ~Profile() = default;
@@ -99,8 +100,8 @@ public:
     return *this;
   }
 
-  Profile(const Profile &);
-  Profile &operator=(const Profile &);
+  LLVM_ABI Profile(const Profile &);
+  LLVM_ABI Profile &operator=(const Profile &);
 
   friend void swap(Profile &L, Profile &R) {
     using std::swap;
diff --git a/llvm/include/llvm/XRay/RecordPrinter.h b/llvm/include/llvm/XRay/RecordPrinter.h
index 8ca4794dce5e..5d2c27757255 100644
--- a/llvm/include/llvm/XRay/RecordPrinter.h
+++ b/llvm/include/llvm/XRay/RecordPrinter.h
@@ -13,13 +13,14 @@
 #ifndef LLVM_XRAY_RECORDPRINTER_H
 #define LLVM_XRAY_RECORDPRINTER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/XRay/FDRRecords.h"
 
 namespace llvm {
 namespace xray {
 
-class RecordPrinter : public RecordVisitor {
+class LLVM_ABI RecordPrinter : public RecordVisitor {
   raw_ostream &OS;
   std::string Delim;
 
diff --git a/llvm/include/llvm/XRay/Trace.h b/llvm/include/llvm/XRay/Trace.h
index eb1f03b2a0d4..af1d35c67817 100644
--- a/llvm/include/llvm/XRay/Trace.h
+++ b/llvm/include/llvm/XRay/Trace.h
@@ -16,6 +16,7 @@
 #include <vector>
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/XRayRecord.h"
@@ -50,7 +51,7 @@ class Trace {
 
   typedef std::vector<XRayRecord>::const_iterator citerator;
 
-  friend Expected<Trace> loadTrace(const DataExtractor &, bool);
+  LLVM_ABI_FRIEND friend Expected<Trace> loadTrace(const DataExtractor &, bool);
 
 public:
   using size_type = RecordVector::size_type;
@@ -68,11 +69,12 @@ public:
 
 /// This function will attempt to load XRay trace records from the provided
 /// |Filename|.
-Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
+LLVM_ABI Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
 
 /// This function will attempt to load XRay trace records from the provided
 /// DataExtractor.
-Expected<Trace> loadTrace(const DataExtractor &Extractor, bool Sort = false);
+LLVM_ABI Expected<Trace> loadTrace(const DataExtractor &Extractor,
+                                   bool Sort = false);
 
 } // namespace xray
 } // namespace llvm

From 695c4f2309718c441bc2e5b7dd3e3267737a12e6 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev@gmail.com>
Date: Mon, 16 Jun 2025 14:04:30 -0400
Subject: [PATCH 0527/1322] [NFC][mlir][tensor] Use `ValueRange` instead of
 `SmallVector` in `tensor::createPadHighOp` (#144397)

Use `ValueRange` instead of `SmallVector` in `tensor::createPadHighOp`
for the `dynOutDims` arg.
---
 mlir/include/mlir/Dialect/Tensor/Utils/Utils.h | 2 +-
 mlir/lib/Dialect/Tensor/Utils/Utils.cpp        | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
index 1a4733df3f18..a1ce4e252c2f 100644
--- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
@@ -30,7 +30,7 @@ namespace tensor {
 // for _static_ dimensions.
 PadOp createPadHighOp(RankedTensorType resType, Value source, Value pad,
                       bool nofold, Location loc, OpBuilder &builder,
-                      SmallVector<Value> dynOutDims = {});
+                      ValueRange dynOutDims = std::nullopt);
 
 // Creates dim ops for each dynamic dimension of the ranked tensor argument and
 // returns these as values.
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
index 11ae0108594d..289296a07d9d 100644
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -24,8 +24,7 @@ using namespace mlir::tensor;
 
 PadOp mlir::tensor::createPadHighOp(RankedTensorType resType, Value source,
                                     Value pad, bool nofold, Location loc,
-                                    OpBuilder &b,
-                                    SmallVector<Value> dynOutDims) {
+                                    OpBuilder &b, ValueRange dynOutDims) {
 
   // This assumption simplifies the following logic without limiting what's
   // required _today_. If needed, we can relax it in the future.

From 492d25bbe12af7702a392fa7ad41eb9e09a48cf2 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Mon, 16 Jun 2025 11:04:56 -0700
Subject: [PATCH 0528/1322] [llvm] annotate interfaces in llvm/ObjectYAML for
 DLL export (#143763)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/ObjectYAML`
library. These annotations currently have no meaningful impact on the
LLVM build; however, they are a prerequisite to support an LLVM Windows
DLL (shared library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

These were generated automatically using the [Interface Definition
Scanner (IDS)](https://github.com/compnerd/ids) tool, followed
formatting with `git clang-format`.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 .../ObjectYAML/CodeViewYAMLDebugSections.h    | 12 +--
 .../llvm/ObjectYAML/CodeViewYAMLSymbols.h     |  6 +-
 .../llvm/ObjectYAML/CodeViewYAMLTypes.h       | 14 ++--
 llvm/include/llvm/ObjectYAML/DWARFEmitter.h   | 35 ++++----
 llvm/include/llvm/ObjectYAML/DWARFYAML.h      | 64 ++++++++-------
 .../include/llvm/ObjectYAML/DXContainerYAML.h | 81 ++++++++++---------
 llvm/include/llvm/ObjectYAML/YAML.h           |  9 ++-
 llvm/include/llvm/ObjectYAML/yaml2obj.h       | 40 +++++----
 8 files changed, 145 insertions(+), 116 deletions(-)

diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h
index 6c712956dfb5..4e7984c54a72 100644
--- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h
+++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h
@@ -19,6 +19,7 @@
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
@@ -108,23 +109,24 @@ struct InlineeInfo {
 };
 
 struct YAMLDebugSubsection {
-  static Expected<YAMLDebugSubsection>
+  LLVM_ABI static Expected<YAMLDebugSubsection>
   fromCodeViewSubection(const codeview::StringsAndChecksumsRef &SC,
                         const codeview::DebugSubsectionRecord &SS);
 
   std::shared_ptr<detail::YAMLSubsectionBase> Subsection;
 };
 
-Expected<std::vector<std::shared_ptr<codeview::DebugSubsection>>>
+LLVM_ABI Expected<std::vector<std::shared_ptr<codeview::DebugSubsection>>>
 toCodeViewSubsectionList(BumpPtrAllocator &Allocator,
                          ArrayRef<YAMLDebugSubsection> Subsections,
                          const codeview::StringsAndChecksums &SC);
 
-std::vector<YAMLDebugSubsection>
+LLVM_ABI std::vector<YAMLDebugSubsection>
 fromDebugS(ArrayRef<uint8_t> Data, const codeview::StringsAndChecksumsRef &SC);
 
-void initializeStringsAndChecksums(ArrayRef<YAMLDebugSubsection> Sections,
-                                   codeview::StringsAndChecksums &SC);
+LLVM_ABI void
+initializeStringsAndChecksums(ArrayRef<YAMLDebugSubsection> Sections,
+                              codeview::StringsAndChecksums &SC);
 
 } // end namespace CodeViewYAML
 
diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h
index 7c05c9eea05e..dccc77dc1a0c 100644
--- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h
+++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h
@@ -16,6 +16,7 @@
 
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <memory>
@@ -32,11 +33,12 @@ struct SymbolRecordBase;
 struct SymbolRecord {
   std::shared_ptr<detail::SymbolRecordBase> Symbol;
 
-  codeview::CVSymbol
+  LLVM_ABI codeview::CVSymbol
   toCodeViewSymbol(BumpPtrAllocator &Allocator,
                    codeview::CodeViewContainer Container) const;
 
-  static Expected<SymbolRecord> fromCodeViewSymbol(codeview::CVSymbol Symbol);
+  LLVM_ABI static Expected<SymbolRecord>
+  fromCodeViewSymbol(codeview::CVSymbol Symbol);
 };
 
 } // end namespace CodeViewYAML
diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h
index 04b5e0ba3aa1..3c239ce507df 100644
--- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h
+++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
@@ -45,15 +46,16 @@ struct MemberRecord {
 struct LeafRecord {
   std::shared_ptr<detail::LeafRecordBase> Leaf;
 
-  codeview::CVType
+  LLVM_ABI codeview::CVType
   toCodeViewRecord(codeview::AppendingTypeTableBuilder &Serializer) const;
-  static Expected<LeafRecord> fromCodeViewRecord(codeview::CVType Type);
+  LLVM_ABI static Expected<LeafRecord>
+  fromCodeViewRecord(codeview::CVType Type);
 };
 
-std::vector<LeafRecord> fromDebugT(ArrayRef<uint8_t> DebugTorP,
-                                   StringRef SectionName);
-ArrayRef<uint8_t> toDebugT(ArrayRef<LeafRecord>, BumpPtrAllocator &Alloc,
-                           StringRef SectionName);
+LLVM_ABI std::vector<LeafRecord> fromDebugT(ArrayRef<uint8_t> DebugTorP,
+                                            StringRef SectionName);
+LLVM_ABI ArrayRef<uint8_t>
+toDebugT(ArrayRef<LeafRecord>, BumpPtrAllocator &Alloc, StringRef SectionName);
 
 } // end namespace CodeViewYAML
 
diff --git a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
index 5e1b88f4fef6..050ff60bcd40 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
@@ -14,6 +14,7 @@
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/TargetParser/Host.h"
@@ -27,26 +28,26 @@ namespace DWARFYAML {
 
 struct Data;
 
-Error emitDebugAbbrev(raw_ostream &OS, const Data &DI);
-Error emitDebugStr(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugAbbrev(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugStr(raw_ostream &OS, const Data &DI);
 
-Error emitDebugAranges(raw_ostream &OS, const Data &DI);
-Error emitDebugRanges(raw_ostream &OS, const Data &DI);
-Error emitDebugPubnames(raw_ostream &OS, const Data &DI);
-Error emitDebugPubtypes(raw_ostream &OS, const Data &DI);
-Error emitDebugGNUPubnames(raw_ostream &OS, const Data &DI);
-Error emitDebugGNUPubtypes(raw_ostream &OS, const Data &DI);
-Error emitDebugInfo(raw_ostream &OS, const Data &DI);
-Error emitDebugLine(raw_ostream &OS, const Data &DI);
-Error emitDebugAddr(raw_ostream &OS, const Data &DI);
-Error emitDebugStrOffsets(raw_ostream &OS, const Data &DI);
-Error emitDebugRnglists(raw_ostream &OS, const Data &DI);
-Error emitDebugLoclists(raw_ostream &OS, const Data &DI);
-Error emitDebugNames(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugAranges(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugRanges(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugPubnames(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugPubtypes(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugGNUPubnames(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugGNUPubtypes(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugInfo(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugLine(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugAddr(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugStrOffsets(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugRnglists(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugLoclists(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugNames(raw_ostream &OS, const Data &DI);
 
-std::function<Error(raw_ostream &, const Data &)>
+LLVM_ABI std::function<Error(raw_ostream &, const Data &)>
 getDWARFEmitterByName(StringRef SecName);
-Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
+LLVM_ABI Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
 emitDebugSections(StringRef YAMLString,
                   bool IsLittleEndian = sys::IsLittleEndianHost,
                   bool Is64BitAddrSize = true);
diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
index 69f8c4f27d7a..c8528686592a 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
 #include <optional>
@@ -255,16 +256,16 @@ struct Data {
   std::optional<std::vector<ListTable<LoclistEntry>>> DebugLoclists;
   std::optional<DebugNamesSection> DebugNames;
 
-  bool isEmpty() const;
+  LLVM_ABI bool isEmpty() const;
 
-  SetVector<StringRef> getNonEmptySectionNames() const;
+  LLVM_ABI SetVector<StringRef> getNonEmptySectionNames() const;
 
   struct AbbrevTableInfo {
     uint64_t Index;
     uint64_t Offset;
   };
-  Expected<AbbrevTableInfo> getAbbrevTableInfoByID(uint64_t ID) const;
-  StringRef getAbbrevTableContentByIndex(uint64_t Index) const;
+  LLVM_ABI Expected<AbbrevTableInfo> getAbbrevTableInfoByID(uint64_t ID) const;
+  LLVM_ABI StringRef getAbbrevTableContentByIndex(uint64_t Index) const;
 
 private:
   mutable std::unordered_map<uint64_t, AbbrevTableInfo> AbbrevTableInfoMap;
@@ -310,88 +311,90 @@ namespace llvm {
 namespace yaml {
 
 template <> struct MappingTraits<DWARFYAML::Data> {
-  static void mapping(IO &IO, DWARFYAML::Data &DWARF);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Data &DWARF);
 };
 
 template <> struct MappingTraits<DWARFYAML::AbbrevTable> {
-  static void mapping(IO &IO, DWARFYAML::AbbrevTable &AbbrevTable);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::AbbrevTable &AbbrevTable);
 };
 
 template <> struct MappingTraits<DWARFYAML::Abbrev> {
-  static void mapping(IO &IO, DWARFYAML::Abbrev &Abbrev);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Abbrev &Abbrev);
 };
 
 template <> struct MappingTraits<DWARFYAML::AttributeAbbrev> {
-  static void mapping(IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev);
 };
 
 template <> struct MappingTraits<DWARFYAML::ARangeDescriptor> {
-  static void mapping(IO &IO, DWARFYAML::ARangeDescriptor &Descriptor);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::ARangeDescriptor &Descriptor);
 };
 
 template <> struct MappingTraits<DWARFYAML::ARange> {
-  static void mapping(IO &IO, DWARFYAML::ARange &ARange);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::ARange &ARange);
 };
 
 template <> struct MappingTraits<DWARFYAML::RangeEntry> {
-  static void mapping(IO &IO, DWARFYAML::RangeEntry &Entry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::RangeEntry &Entry);
 };
 
 template <> struct MappingTraits<DWARFYAML::Ranges> {
-  static void mapping(IO &IO, DWARFYAML::Ranges &Ranges);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Ranges &Ranges);
 };
 
 template <> struct MappingTraits<DWARFYAML::PubEntry> {
-  static void mapping(IO &IO, DWARFYAML::PubEntry &Entry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::PubEntry &Entry);
 };
 
 template <> struct MappingTraits<DWARFYAML::PubSection> {
-  static void mapping(IO &IO, DWARFYAML::PubSection &Section);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::PubSection &Section);
 };
 
 template <> struct MappingTraits<DWARFYAML::Unit> {
-  static void mapping(IO &IO, DWARFYAML::Unit &Unit);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Unit &Unit);
 };
 
 template <> struct MappingTraits<DWARFYAML::DebugNamesSection> {
-  static void mapping(IO &IO, DWARFYAML::DebugNamesSection &);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::DebugNamesSection &);
 };
 template <> struct MappingTraits<DWARFYAML::DebugNameEntry> {
-  static void mapping(IO &IO, DWARFYAML::DebugNameEntry &);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::DebugNameEntry &);
 };
 template <> struct MappingTraits<DWARFYAML::DebugNameAbbreviation> {
-  static void mapping(IO &IO, DWARFYAML::DebugNameAbbreviation &);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::DebugNameAbbreviation &);
 };
 template <> struct MappingTraits<DWARFYAML::IdxForm> {
-  static void mapping(IO &IO, DWARFYAML::IdxForm &);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::IdxForm &);
 };
 
 template <> struct MappingTraits<DWARFYAML::Entry> {
-  static void mapping(IO &IO, DWARFYAML::Entry &Entry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Entry &Entry);
 };
 
 template <> struct MappingTraits<DWARFYAML::FormValue> {
-  static void mapping(IO &IO, DWARFYAML::FormValue &FormValue);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::FormValue &FormValue);
 };
 
 template <> struct MappingTraits<DWARFYAML::File> {
-  static void mapping(IO &IO, DWARFYAML::File &File);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::File &File);
 };
 
 template <> struct MappingTraits<DWARFYAML::LineTableOpcode> {
-  static void mapping(IO &IO, DWARFYAML::LineTableOpcode &LineTableOpcode);
+  LLVM_ABI static void mapping(IO &IO,
+                               DWARFYAML::LineTableOpcode &LineTableOpcode);
 };
 
 template <> struct MappingTraits<DWARFYAML::LineTable> {
-  static void mapping(IO &IO, DWARFYAML::LineTable &LineTable);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::LineTable &LineTable);
 };
 
 template <> struct MappingTraits<DWARFYAML::SegAddrPair> {
-  static void mapping(IO &IO, DWARFYAML::SegAddrPair &SegAddrPair);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::SegAddrPair &SegAddrPair);
 };
 
 template <> struct MappingTraits<DWARFYAML::DWARFOperation> {
-  static void mapping(IO &IO, DWARFYAML::DWARFOperation &DWARFOperation);
+  LLVM_ABI static void mapping(IO &IO,
+                               DWARFYAML::DWARFOperation &DWARFOperation);
 };
 
 template <typename EntryType>
@@ -407,19 +410,20 @@ struct MappingTraits<DWARFYAML::ListEntries<EntryType>> {
 };
 
 template <> struct MappingTraits<DWARFYAML::RnglistEntry> {
-  static void mapping(IO &IO, DWARFYAML::RnglistEntry &RnglistEntry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::RnglistEntry &RnglistEntry);
 };
 
 template <> struct MappingTraits<DWARFYAML::LoclistEntry> {
-  static void mapping(IO &IO, DWARFYAML::LoclistEntry &LoclistEntry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::LoclistEntry &LoclistEntry);
 };
 
 template <> struct MappingTraits<DWARFYAML::AddrTableEntry> {
-  static void mapping(IO &IO, DWARFYAML::AddrTableEntry &AddrTable);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::AddrTableEntry &AddrTable);
 };
 
 template <> struct MappingTraits<DWARFYAML::StringOffsetsTable> {
-  static void mapping(IO &IO, DWARFYAML::StringOffsetsTable &StrOffsetsTable);
+  LLVM_ABI static void mapping(IO &IO,
+                               DWARFYAML::StringOffsetsTable &StrOffsetsTable);
 };
 
 template <> struct ScalarEnumerationTraits<dwarf::DwarfFormat> {
diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
index 8a0dfd871879..c235112dacf7 100644
--- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
@@ -19,6 +19,7 @@
 #include "llvm/BinaryFormat/DXContainer.h"
 #include "llvm/Object/DXContainer.h"
 #include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <array>
 #include <optional>
@@ -59,14 +60,14 @@ struct DXILProgram {
 #define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str) bool Val = false;
 struct ShaderFeatureFlags {
   ShaderFeatureFlags() = default;
-  ShaderFeatureFlags(uint64_t FlagData);
-  uint64_t getEncodedFlags();
+  LLVM_ABI ShaderFeatureFlags(uint64_t FlagData);
+  LLVM_ABI uint64_t getEncodedFlags();
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 };
 
 struct ShaderHash {
   ShaderHash() = default;
-  ShaderHash(const dxbc::ShaderHash &Data);
+  LLVM_ABI ShaderHash(const dxbc::ShaderHash &Data);
 
   bool IncludesSource;
   std::vector<llvm::yaml::Hex8> Digest;
@@ -84,7 +85,7 @@ struct RootDescriptorYaml {
   uint32_t ShaderRegister;
   uint32_t RegisterSpace;
 
-  uint32_t getEncodedFlags() const;
+  LLVM_ABI uint32_t getEncodedFlags() const;
 
 #define ROOT_DESCRIPTOR_FLAG(Num, Val) bool Val = false;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
@@ -97,7 +98,7 @@ struct DescriptorRangeYaml {
   uint32_t RegisterSpace;
   uint32_t OffsetInDescriptorsFromTableStart;
 
-  uint32_t getEncodedFlags() const;
+  LLVM_ABI uint32_t getEncodedFlags() const;
 
 #define DESCRIPTOR_RANGE_FLAG(Num, Val) bool Val = false;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
@@ -193,13 +194,13 @@ struct RootSignatureYamlDesc {
   RootParameterYamlDesc Parameters;
   SmallVector<StaticSamplerYamlDesc> StaticSamplers;
 
-  uint32_t getEncodedFlags();
+  LLVM_ABI uint32_t getEncodedFlags();
 
   iterator_range<StaticSamplerYamlDesc *> samplers() {
     return make_range(StaticSamplers.begin(), StaticSamplers.end());
   }
 
-  static llvm::Expected<DXContainerYAML::RootSignatureYamlDesc>
+  LLVM_ABI static llvm::Expected<DXContainerYAML::RootSignatureYamlDesc>
   create(const object::DirectX::RootSignature &Data);
 
 #define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false;
@@ -258,13 +259,13 @@ struct PSVInfo {
 
   StringRef EntryName;
 
-  void mapInfoForVersion(yaml::IO &IO);
+  LLVM_ABI void mapInfoForVersion(yaml::IO &IO);
 
-  PSVInfo();
-  PSVInfo(const dxbc::PSV::v0::RuntimeInfo *P, uint16_t Stage);
-  PSVInfo(const dxbc::PSV::v1::RuntimeInfo *P);
-  PSVInfo(const dxbc::PSV::v2::RuntimeInfo *P);
-  PSVInfo(const dxbc::PSV::v3::RuntimeInfo *P, StringRef StringTable);
+  LLVM_ABI PSVInfo();
+  LLVM_ABI PSVInfo(const dxbc::PSV::v0::RuntimeInfo *P, uint16_t Stage);
+  LLVM_ABI PSVInfo(const dxbc::PSV::v1::RuntimeInfo *P);
+  LLVM_ABI PSVInfo(const dxbc::PSV::v2::RuntimeInfo *P);
+  LLVM_ABI PSVInfo(const dxbc::PSV::v3::RuntimeInfo *P, StringRef StringTable);
 };
 
 struct SignatureParameter {
@@ -328,88 +329,96 @@ class raw_ostream;
 namespace yaml {
 
 template <> struct MappingTraits<DXContainerYAML::VersionTuple> {
-  static void mapping(IO &IO, DXContainerYAML::VersionTuple &Version);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::VersionTuple &Version);
 };
 
 template <> struct MappingTraits<DXContainerYAML::FileHeader> {
-  static void mapping(IO &IO, DXContainerYAML::FileHeader &Header);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::FileHeader &Header);
 };
 
 template <> struct MappingTraits<DXContainerYAML::DXILProgram> {
-  static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program);
 };
 
 template <> struct MappingTraits<DXContainerYAML::ShaderFeatureFlags> {
-  static void mapping(IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags);
+  LLVM_ABI static void mapping(IO &IO,
+                               DXContainerYAML::ShaderFeatureFlags &Flags);
 };
 
 template <> struct MappingTraits<DXContainerYAML::ShaderHash> {
-  static void mapping(IO &IO, DXContainerYAML::ShaderHash &Hash);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::ShaderHash &Hash);
 };
 
 template <> struct MappingTraits<DXContainerYAML::PSVInfo> {
-  static void mapping(IO &IO, DXContainerYAML::PSVInfo &PSV);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::PSVInfo &PSV);
 };
 
 template <> struct MappingTraits<DXContainerYAML::Part> {
-  static void mapping(IO &IO, DXContainerYAML::Part &Version);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::Part &Version);
 };
 
 template <> struct MappingTraits<DXContainerYAML::Object> {
-  static void mapping(IO &IO, DXContainerYAML::Object &Obj);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::Object &Obj);
 };
 
 template <> struct MappingTraits<DXContainerYAML::ResourceFlags> {
-  static void mapping(IO &IO, DXContainerYAML::ResourceFlags &Flags);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::ResourceFlags &Flags);
 };
 
 template <> struct MappingTraits<DXContainerYAML::ResourceBindInfo> {
-  static void mapping(IO &IO, DXContainerYAML::ResourceBindInfo &Res);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::ResourceBindInfo &Res);
 };
 
 template <> struct MappingTraits<DXContainerYAML::SignatureElement> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::SignatureElement &El);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::SignatureElement &El);
 };
 
 template <> struct MappingTraits<DXContainerYAML::SignatureParameter> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::SignatureParameter &El);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::SignatureParameter &El);
 };
 
 template <> struct MappingTraits<DXContainerYAML::Signature> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El);
+  LLVM_ABI static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El);
 };
 
 template <> struct MappingTraits<DXContainerYAML::RootSignatureYamlDesc> {
-  static void mapping(IO &IO,
-                      DXContainerYAML::RootSignatureYamlDesc &RootSignature);
+  LLVM_ABI static void
+  mapping(IO &IO, DXContainerYAML::RootSignatureYamlDesc &RootSignature);
 };
 
 template <>
 struct MappingContextTraits<DXContainerYAML::RootParameterLocationYaml,
                             DXContainerYAML::RootSignatureYamlDesc> {
-  static void mapping(IO &IO,
-                      llvm::DXContainerYAML::RootParameterLocationYaml &L,
-                      DXContainerYAML::RootSignatureYamlDesc &S);
+  LLVM_ABI static void
+  mapping(IO &IO, llvm::DXContainerYAML::RootParameterLocationYaml &L,
+          DXContainerYAML::RootSignatureYamlDesc &S);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::RootConstantsYaml> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::RootConstantsYaml &C);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::RootConstantsYaml &C);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::RootDescriptorYaml> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::RootDescriptorYaml &D);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::RootDescriptorYaml &D);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::DescriptorTableYaml> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::DescriptorTableYaml &D);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::DescriptorTableYaml &D);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::DescriptorRangeYaml> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::DescriptorRangeYaml &D);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::DescriptorRangeYaml &D);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::StaticSamplerYamlDesc> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::StaticSamplerYamlDesc &S);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::StaticSamplerYamlDesc &S);
 };
 
 } // namespace yaml
diff --git a/llvm/include/llvm/ObjectYAML/YAML.h b/llvm/include/llvm/ObjectYAML/YAML.h
index 3bf6527a7e2d..709520c934d7 100644
--- a/llvm/include/llvm/ObjectYAML/YAML.h
+++ b/llvm/include/llvm/ObjectYAML/YAML.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
 
@@ -86,13 +87,13 @@ public:
   /// Write the contents (regardless of whether it is binary or a
   /// hex string) as binary to the given raw_ostream.
   /// N can be used to specify the maximum number of bytes.
-  void writeAsBinary(raw_ostream &OS, uint64_t N = UINT64_MAX) const;
+  LLVM_ABI void writeAsBinary(raw_ostream &OS, uint64_t N = UINT64_MAX) const;
 
   /// Write the contents (regardless of whether it is binary or a
   /// hex string) as hex to the given raw_ostream.
   ///
   /// For example, a possible output could be `DEADBEEFCAFEBABE`.
-  void writeAsHex(raw_ostream &OS) const;
+  LLVM_ABI void writeAsHex(raw_ostream &OS) const;
 };
 
 inline bool operator==(const BinaryRef &LHS, const BinaryRef &RHS) {
@@ -104,8 +105,8 @@ inline bool operator==(const BinaryRef &LHS, const BinaryRef &RHS) {
 }
 
 template <> struct ScalarTraits<BinaryRef> {
-  static void output(const BinaryRef &, void *, raw_ostream &);
-  static StringRef input(StringRef, void *, BinaryRef &);
+  LLVM_ABI static void output(const BinaryRef &, void *, raw_ostream &);
+  LLVM_ABI static StringRef input(StringRef, void *, BinaryRef &);
   static QuotingType mustQuote(StringRef S) { return needsQuotes(S); }
 };
 
diff --git a/llvm/include/llvm/ObjectYAML/yaml2obj.h b/llvm/include/llvm/ObjectYAML/yaml2obj.h
index 3b458c3cd890..4c9084b79050 100644
--- a/llvm/include/llvm/ObjectYAML/yaml2obj.h
+++ b/llvm/include/llvm/ObjectYAML/yaml2obj.h
@@ -12,6 +12,7 @@
 #define LLVM_OBJECTYAML_YAML2OBJ_H
 
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
 #include <memory>
 
 namespace llvm {
@@ -66,25 +67,32 @@ struct YamlObjectFile;
 
 using ErrorHandler = llvm::function_ref<void(const Twine &Msg)>;
 
-bool yaml2archive(ArchYAML::Archive &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2coff(COFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2goff(GOFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2elf(ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH,
-              uint64_t MaxSize);
-bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2minidump(MinidumpYAML::Object &Doc, raw_ostream &Out,
-                   ErrorHandler EH);
-bool yaml2offload(OffloadYAML::Binary &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2wasm(WasmYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2xcoff(XCOFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2dxcontainer(DXContainerYAML::Object &Doc, raw_ostream &Out,
-                      ErrorHandler EH);
+LLVM_ABI bool yaml2archive(ArchYAML::Archive &Doc, raw_ostream &Out,
+                           ErrorHandler EH);
+LLVM_ABI bool yaml2coff(COFFYAML::Object &Doc, raw_ostream &Out,
+                        ErrorHandler EH);
+LLVM_ABI bool yaml2goff(GOFFYAML::Object &Doc, raw_ostream &Out,
+                        ErrorHandler EH);
+LLVM_ABI bool yaml2elf(ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH,
+                       uint64_t MaxSize);
+LLVM_ABI bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out,
+                         ErrorHandler EH);
+LLVM_ABI bool yaml2minidump(MinidumpYAML::Object &Doc, raw_ostream &Out,
+                            ErrorHandler EH);
+LLVM_ABI bool yaml2offload(OffloadYAML::Binary &Doc, raw_ostream &Out,
+                           ErrorHandler EH);
+LLVM_ABI bool yaml2wasm(WasmYAML::Object &Doc, raw_ostream &Out,
+                        ErrorHandler EH);
+LLVM_ABI bool yaml2xcoff(XCOFFYAML::Object &Doc, raw_ostream &Out,
+                         ErrorHandler EH);
+LLVM_ABI bool yaml2dxcontainer(DXContainerYAML::Object &Doc, raw_ostream &Out,
+                               ErrorHandler EH);
 
-bool convertYAML(Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler,
-                 unsigned DocNum = 1, uint64_t MaxSize = UINT64_MAX);
+LLVM_ABI bool convertYAML(Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler,
+                          unsigned DocNum = 1, uint64_t MaxSize = UINT64_MAX);
 
 /// Convenience function for tests.
-std::unique_ptr<object::ObjectFile>
+LLVM_ABI std::unique_ptr<object::ObjectFile>
 yaml2ObjectFile(SmallVectorImpl<char> &Storage, StringRef Yaml,
                 ErrorHandler ErrHandler);
 

From fccab5d757778204666d70e2f1592952fc8b336d Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Mon, 16 Jun 2025 20:10:40 +0200
Subject: [PATCH 0529/1322] [CIR] Upstream ComplexType ImaginaryLiteral
 (#144223)

This change adds support for ComplexType ImaginaryLiteral

https://github.com/llvm/llvm-project/issues/141365
---
 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 30 +++++++++++++++++++++
 clang/test/CIR/CodeGen/complex.cpp          | 29 ++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
index 2ffe75a388e9..26070a6ca307 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -21,6 +21,8 @@ public:
                           bool isInit);
 
   mlir::Value VisitInitListExpr(InitListExpr *e);
+
+  mlir::Value VisitImaginaryLiteral(const ImaginaryLiteral *il);
 };
 
 } // namespace
@@ -66,6 +68,34 @@ mlir::Value ComplexExprEmitter::VisitInitListExpr(InitListExpr *e) {
   return builder.create<cir::ConstantOp>(loc, complexAttr);
 }
 
+mlir::Value
+ComplexExprEmitter::VisitImaginaryLiteral(const ImaginaryLiteral *il) {
+  auto ty = mlir::cast<cir::ComplexType>(cgf.convertType(il->getType()));
+  mlir::Type elementTy = ty.getElementType();
+  mlir::Location loc = cgf.getLoc(il->getExprLoc());
+
+  mlir::TypedAttr realValueAttr;
+  mlir::TypedAttr imagValueAttr;
+
+  if (mlir::isa<cir::IntType>(elementTy)) {
+    llvm::APInt imagValue = cast<IntegerLiteral>(il->getSubExpr())->getValue();
+    realValueAttr = cir::IntAttr::get(elementTy, 0);
+    imagValueAttr = cir::IntAttr::get(elementTy, imagValue);
+  } else {
+    assert(mlir::isa<cir::CIRFPTypeInterface>(elementTy) &&
+           "Expected complex element type to be floating-point");
+
+    llvm::APFloat imagValue =
+        cast<FloatingLiteral>(il->getSubExpr())->getValue();
+    realValueAttr = cir::FPAttr::get(
+        elementTy, llvm::APFloat::getZero(imagValue.getSemantics()));
+    imagValueAttr = cir::FPAttr::get(elementTy, imagValue);
+  }
+
+  auto complexAttr = cir::ConstComplexAttr::get(realValueAttr, imagValueAttr);
+  return builder.create<cir::ConstantOp>(loc, complexAttr);
+}
+
 mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) {
   assert(e && getComplexType(e->getType()) &&
          "Invalid complex expression to emit");
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index d193b9f32efb..db0b9111ab4f 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -176,3 +176,32 @@ void foo7() {
 // OGCG: store float %[[TMP_A]], ptr %[[C_REAL_PTR]], align 4
 // OGCG: store float 2.000000e+00, ptr %[[C_IMAG_PTR]], align 4
 
+void foo8() {
+  double _Complex c = 2.00i;
+}
+
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.double, #cir.fp<2.000000e+00> : !cir.double> : !cir.complex<!cir.double>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { double, double }, i64 1, align 8
+// LLVM: store { double, double } { double 0.000000e+00, double 2.000000e+00 }, ptr %[[COMPLEX]], align 8
+
+// OGCG: %[[COMPLEX:.*]] = alloca { double, double }, align 8
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store double 0.000000e+00, ptr %[[C_REAL_PTR]], align 8
+// OGCG: store double 2.000000e+00, ptr %[[C_IMAG_PTR]], align 8
+
+void foo14() {
+  int _Complex c = 2i;
+}
+
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<2> : !s32i> : !cir.complex<!s32i>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: store { i32, i32 } { i32 0, i32 2 }, ptr %[[COMPLEX]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 2, ptr %[[C_IMAG_PTR]], align 4

From 3f794759f4f2c0ba248a21fb3ec9eb4ff7e35724 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Mon, 16 Jun 2025 11:24:22 -0700
Subject: [PATCH 0530/1322] [build] Fixed
 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING handling. (#144391)

Change in #107278 modified the CMake CACHE variable with values
that are not supported for it as documented. This patch renames the
derived vars so that they do not conflict with the CACHE variable.
---
 llvm/cmake/modules/HandleLLVMOptions.cmake         | 10 +++++-----
 llvm/include/llvm/Config/llvm-config.h.cmake       |  4 ++--
 llvm/include/llvm/IR/DebugLoc.h                    | 14 +++++++-------
 llvm/lib/IR/DebugLoc.cpp                           |  4 ++--
 llvm/lib/Transforms/Utils/Debugify.cpp             |  2 +-
 .../gn/secondary/llvm/include/llvm/Config/BUILD.gn |  4 ++--
 6 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 743eb6f5529f..8004d3571fc8 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -199,17 +199,17 @@ endif()
 string(TOUPPER "${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}" uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING)
 
 if( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE" )
-  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
+  set( LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 1 )
 elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE_AND_ORIGIN" )
-  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
-  set( LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 1 )
+  set( LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 1 )
+  set( LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN 1 )
 elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "DISABLED" OR NOT DEFINED LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING )
   # The DISABLED setting is default.
-  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0 )
+  set( LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 0 )
 else()
   message(FATAL_ERROR "Unknown value for LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING: \"${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}\"!")
 endif()
-# LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING (non-cached) is expected to be
+# LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE (non-cached) is expected to be
 # 1 or 0 here, assuming referenced in #cmakedefine01.
 
 if(LLVM_EXPERIMENTAL_KEY_INSTRUCTIONS)
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 6d3c37cc8b19..a0ad517a6ecf 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -131,10 +131,10 @@
 
 /* Define to 1 to enable expensive checks for debug location coverage checking,
    and to 0 otherwise. */
-#cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
 /* Define to 1 to enable expensive tracking of the origin of debug location
    coverage bugs, and to 0 otherwise. */
-#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN
 
 #endif
diff --git a/llvm/include/llvm/IR/DebugLoc.h b/llvm/include/llvm/IR/DebugLoc.h
index 2fabae9bfc66..999e03b6374a 100644
--- a/llvm/include/llvm/IR/DebugLoc.h
+++ b/llvm/include/llvm/IR/DebugLoc.h
@@ -26,7 +26,7 @@ namespace llvm {
   class DILocation;
   class Function;
 
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
   // Used to represent different "kinds" of DebugLoc, expressing that the
   // instruction it is part of is either normal and should contain a valid
   // DILocation, or otherwise describing the reason why the instruction does
@@ -90,7 +90,7 @@ namespace llvm {
   using DebugLocTrackingRef = DILocAndCoverageTracking;
 #else
   using DebugLocTrackingRef = TrackingMDNodeRef;
-#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
   /// A debug info location.
   ///
@@ -117,12 +117,12 @@ namespace llvm {
     /// IR.
     LLVM_ABI explicit DebugLoc(const MDNode *N);
 
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
     DebugLoc(DebugLocKind Kind) : Loc(Kind) {}
     DebugLocKind getKind() const { return Loc.Kind; }
 #endif
 
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
     static inline DebugLoc getTemporary() {
       return DebugLoc(DebugLocKind::Temporary);
     }
@@ -140,7 +140,7 @@ namespace llvm {
     static inline DebugLoc getUnknown() { return DebugLoc(); }
     static inline DebugLoc getCompilerGenerated() { return DebugLoc(); }
     static inline DebugLoc getDropped() { return DebugLoc(); }
-#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
     /// When two instructions are combined into a single instruction we also
     /// need to combine the original locations into a single location.
@@ -174,7 +174,7 @@ namespace llvm {
     DebugLoc orElse(DebugLoc Other) const {
       if (*this)
         return *this;
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
       if (Other)
         return Other;
       if (getKind() != DebugLocKind::Normal)
@@ -184,7 +184,7 @@ namespace llvm {
       return *this;
 #else
       return Other;
-#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
     }
 
     /// Get the underlying \a DILocation.
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 0be6d55d724e..ffeeeb6f1e4b 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -11,11 +11,11 @@
 #include "llvm/IR/DebugInfo.h"
 using namespace llvm;
 
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L)
     : TrackingMDNodeRef(const_cast<DILocation *>(L)),
       Kind(DebugLocKind::Normal) {}
-#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
 //===----------------------------------------------------------------------===//
 // DebugLoc Implementation
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 729813a92f51..ff8a91bc7e7d 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -299,7 +299,7 @@ bool llvm::stripDebugifyMetadata(Module &M) {
 
 bool hasLoc(const Instruction &I) {
   const DILocation *Loc = I.getDebugLoc().get();
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
   DebugLocKind Kind = I.getDebugLoc().getKind();
   return Loc || Kind != DebugLocKind::Normal;
 #else
diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index ca05ac1b2464..c1d107eefdf9 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -298,8 +298,8 @@ write_cmake_config("llvm-config") {
     "LLVM_BUILD_SHARED_LIBS=",
     "LLVM_ENABLE_TELEMETRY=",
     "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple",
-    "LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING=",
-    "LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING=",
+    "LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE=",
+    "LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN=",
     "LLVM_ENABLE_DUMP=",
     "LLVM_ENABLE_HTTPLIB=",
     "LLVM_FORCE_USE_OLD_TOOLCHAIN=",

From a3d35b87eacece8cdbb4615ff6c65003773f5cbf Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 16 Jun 2025 11:24:33 -0700
Subject: [PATCH 0531/1322] [RISCV] Use RISCV::RVVBitsPerBlock instead of 64 in
 getLMUL1VT. NFC (#144401)

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp        | 2 +-
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7cfada6c0601..779786fa400f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3499,7 +3499,7 @@ getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL,
 }
 
 static MVT getLMUL1VT(MVT VT) {
-  assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
+  assert(VT.getVectorElementType().getSizeInBits() <= RISCV::RVVBitsPerBlock &&
          "Unexpected vector MVT");
   return MVT::getScalableVectorVT(
       VT.getVectorElementType(),
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fcc9d3977e5c..0093c92ea5ef 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -604,7 +604,7 @@ InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
 
 // Consolidate!
 static MVT getLMUL1VT(MVT VT) {
-  assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
+  assert(VT.getVectorElementType().getSizeInBits() <= RISCV::RVVBitsPerBlock &&
          "Unexpected vector MVT");
   return MVT::getScalableVectorVT(
       VT.getVectorElementType(),

From 539cf824259cbb23ccc68b83ef3cde575ca50842 Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Mon, 16 Jun 2025 19:24:59 +0100
Subject: [PATCH 0532/1322] [lldb-dap] Use structured types for stepInTargets
 request (#144072)

uses the `SendTargetCapabilities` from #142831
---
 .../test/tools/lldb-dap/dap_server.py         |   2 +-
 .../stepInTargets/TestDAP_stepInTargets.py    |  44 ++++
 lldb/tools/lldb-dap/EventHelper.cpp           |   5 +
 lldb/tools/lldb-dap/Handler/RequestHandler.h  |  13 +-
 .../Handler/StepInTargetsRequestHandler.cpp   | 200 +++++++-----------
 .../lldb-dap/Protocol/ProtocolRequests.cpp    |  10 +
 .../lldb-dap/Protocol/ProtocolRequests.h      |  15 ++
 .../tools/lldb-dap/Protocol/ProtocolTypes.cpp |  24 +++
 lldb/tools/lldb-dap/Protocol/ProtocolTypes.h  |  28 +++
 lldb/unittests/DAP/ProtocolTypesTest.cpp      |  20 ++
 10 files changed, 225 insertions(+), 136 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 9786678aa53f..baf2d4ae542b 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -494,7 +494,7 @@ class DebugCommunication(object):
             raise ValueError("didn't get terminated event")
         return event_dict
 
-    def get_capability(self, key):
+    def get_capability(self, key: str):
         """Get a value for the given key if it there is a key/value pair in
         the capabilities reported by the adapter.
         """
diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
index 07acfe07c9ff..51ccf2ccbdca 100644
--- a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
+++ b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
@@ -78,3 +78,47 @@ class TestDAP_stepInTargets(lldbdap_testcase.DAPTestCaseBase):
         leaf_frame = self.dap_server.get_stackFrame()
         self.assertIsNotNone(leaf_frame, "expect a leaf frame")
         self.assertEqual(step_in_targets[1]["label"], leaf_frame["name"])
+
+    @skipIf(archs=no_match(["x86", "x86_64"]))
+    def test_supported_capability_x86_arch(self):
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program)
+        source = "main.cpp"
+        bp_lines = [line_number(source, "// set breakpoint here")]
+        breakpoint_ids = self.set_source_breakpoints(source, bp_lines)
+        self.assertEqual(
+            len(breakpoint_ids), len(bp_lines), "expect correct number of breakpoints"
+        )
+        self.continue_to_breakpoints(breakpoint_ids)
+        is_supported = self.dap_server.get_capability("supportsStepInTargetsRequest")
+
+        self.assertEqual(
+            is_supported,
+            True,
+            f"expect capability `stepInTarget` is supported with architecture {self.getArchitecture()}",
+        )
+        # clear breakpoints.
+        self.set_source_breakpoints(source, [])
+        self.continue_to_exit()
+
+    @skipIf(archs=["x86", "x86_64"])
+    def test_supported_capability_other_archs(self):
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program)
+        source = "main.cpp"
+        bp_lines = [line_number(source, "// set breakpoint here")]
+        breakpoint_ids = self.set_source_breakpoints(source, bp_lines)
+        self.assertEqual(
+            len(breakpoint_ids), len(bp_lines), "expect correct number of breakpoints"
+        )
+        self.continue_to_breakpoints(breakpoint_ids)
+        is_supported = self.dap_server.get_capability("supportsStepInTargetsRequest")
+
+        self.assertEqual(
+            is_supported,
+            False,
+            f"expect capability `stepInTarget` is not supported with architecture {self.getArchitecture()}",
+        )
+        # clear breakpoints.
+        self.set_source_breakpoints(source, [])
+        self.continue_to_exit()
diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp
index 9641f29698b1..364cc7ab4ef8 100644
--- a/lldb/tools/lldb-dap/EventHelper.cpp
+++ b/lldb/tools/lldb-dap/EventHelper.cpp
@@ -44,6 +44,11 @@ void SendTargetBasedCapabilities(DAP &dap) {
 
   protocol::CapabilitiesEventBody body;
 
+  const llvm::StringRef target_triple = dap.target.GetTriple();
+  if (target_triple.starts_with("x86"))
+    body.capabilities.supportedFeatures.insert(
+        protocol::eAdapterFeatureStepInTargetsRequest);
+
   // We only support restarting launch requests not attach requests.
   if (dap.last_launch_request)
     body.capabilities.supportedFeatures.insert(
diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h
index d3f231589b54..0ac8ca7c9a49 100644
--- a/lldb/tools/lldb-dap/Handler/RequestHandler.h
+++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h
@@ -353,14 +353,15 @@ public:
   llvm::Error Run(const protocol::StepInArguments &args) const override;
 };
 
-class StepInTargetsRequestHandler : public LegacyRequestHandler {
+class StepInTargetsRequestHandler
+    : public RequestHandler<
+          protocol::StepInTargetsArguments,
+          llvm::Expected<protocol::StepInTargetsResponseBody>> {
 public:
-  using LegacyRequestHandler::LegacyRequestHandler;
+  using RequestHandler::RequestHandler;
   static llvm::StringLiteral GetCommand() { return "stepInTargets"; }
-  FeatureSet GetSupportedFeatures() const override {
-    return {protocol::eAdapterFeatureStepInTargetsRequest};
-  }
-  void operator()(const llvm::json::Object &request) const override;
+  llvm::Expected<protocol::StepInTargetsResponseBody>
+  Run(const protocol::StepInTargetsArguments &args) const override;
 };
 
 class StepOutRequestHandler : public RequestHandler<protocol::StepOutArguments,
diff --git a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
index 9b99791599f8..1a76371be2d5 100644
--- a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
@@ -7,143 +7,85 @@
 //===----------------------------------------------------------------------===//
 
 #include "DAP.h"
-#include "EventHelper.h"
-#include "JSONUtils.h"
+#include "Protocol/ProtocolRequests.h"
 #include "RequestHandler.h"
 #include "lldb/API/SBInstruction.h"
+#include "lldb/lldb-defines.h"
 
+using namespace lldb_dap::protocol;
 namespace lldb_dap {
 
-// "StepInTargetsRequest": {
-//   "allOf": [ { "$ref": "#/definitions/Request" }, {
-//     "type": "object",
-//     "description": "This request retrieves the possible step-in targets for
-//     the specified stack frame.\nThese targets can be used in the `stepIn`
-//     request.\nClients should only call this request if the corresponding
-//     capability `supportsStepInTargetsRequest` is true.", "properties": {
-//       "command": {
-//         "type": "string",
-//         "enum": [ "stepInTargets" ]
-//       },
-//       "arguments": {
-//         "$ref": "#/definitions/StepInTargetsArguments"
-//       }
-//     },
-//     "required": [ "command", "arguments"  ]
-//   }]
-// },
-// "StepInTargetsArguments": {
-//   "type": "object",
-//   "description": "Arguments for `stepInTargets` request.",
-//   "properties": {
-//     "frameId": {
-//       "type": "integer",
-//       "description": "The stack frame for which to retrieve the possible
-//       step-in targets."
-//     }
-//   },
-//   "required": [ "frameId" ]
-// },
-// "StepInTargetsResponse": {
-//   "allOf": [ { "$ref": "#/definitions/Response" }, {
-//     "type": "object",
-//     "description": "Response to `stepInTargets` request.",
-//     "properties": {
-//       "body": {
-//         "type": "object",
-//         "properties": {
-//           "targets": {
-//             "type": "array",
-//             "items": {
-//               "$ref": "#/definitions/StepInTarget"
-//             },
-//             "description": "The possible step-in targets of the specified
-//             source location."
-//           }
-//         },
-//         "required": [ "targets" ]
-//       }
-//     },
-//     "required": [ "body" ]
-//   }]
-// }
-void StepInTargetsRequestHandler::operator()(
-    const llvm::json::Object &request) const {
-  llvm::json::Object response;
-  FillResponse(request, response);
-  const auto *arguments = request.getObject("arguments");
-
+// This request retrieves the possible step-in targets for the specified stack
+// frame.
+// These targets can be used in the `stepIn` request.
+// Clients should only call this request if the corresponding capability
+// `supportsStepInTargetsRequest` is true.
+llvm::Expected<StepInTargetsResponseBody>
+StepInTargetsRequestHandler::Run(const StepInTargetsArguments &args) const {
   dap.step_in_targets.clear();
-  lldb::SBFrame frame = dap.GetLLDBFrame(*arguments);
-  if (frame.IsValid()) {
-    lldb::SBAddress pc_addr = frame.GetPCAddress();
-    lldb::SBAddress line_end_addr =
-        pc_addr.GetLineEntry().GetSameLineContiguousAddressRangeEnd(true);
-    lldb::SBInstructionList insts = dap.target.ReadInstructions(
-        pc_addr, line_end_addr, /*flavor_string=*/nullptr);
+  const lldb::SBFrame frame = dap.GetLLDBFrame(args.frameId);
+  if (!frame.IsValid())
+    return llvm::make_error<DAPError>("Failed to get frame for input frameId.");
 
-    if (!insts.IsValid()) {
-      response["success"] = false;
-      response["message"] = "Failed to get instructions for frame.";
-      dap.SendJSON(llvm::json::Value(std::move(response)));
-      return;
+  lldb::SBAddress pc_addr = frame.GetPCAddress();
+  lldb::SBAddress line_end_addr =
+      pc_addr.GetLineEntry().GetSameLineContiguousAddressRangeEnd(true);
+  lldb::SBInstructionList insts = dap.target.ReadInstructions(
+      pc_addr, line_end_addr, /*flavor_string=*/nullptr);
+
+  if (!insts.IsValid())
+    return llvm::make_error<DAPError>("Failed to get instructions for frame.");
+
+  StepInTargetsResponseBody body;
+  const size_t num_insts = insts.GetSize();
+  for (size_t i = 0; i < num_insts; ++i) {
+    lldb::SBInstruction inst = insts.GetInstructionAtIndex(i);
+    if (!inst.IsValid())
+      break;
+
+    const lldb::addr_t inst_addr = inst.GetAddress().GetLoadAddress(dap.target);
+    if (inst_addr == LLDB_INVALID_ADDRESS)
+      break;
+
+    // Note: currently only x86/x64 supports flow kind.
+    const lldb::InstructionControlFlowKind flow_kind =
+        inst.GetControlFlowKind(dap.target);
+
+    if (flow_kind == lldb::eInstructionControlFlowKindCall) {
+
+      const llvm::StringRef call_operand_name = inst.GetOperands(dap.target);
+      lldb::addr_t call_target_addr = LLDB_INVALID_ADDRESS;
+      if (call_operand_name.getAsInteger(0, call_target_addr))
+        continue;
+
+      const lldb::SBAddress call_target_load_addr =
+          dap.target.ResolveLoadAddress(call_target_addr);
+      if (!call_target_load_addr.IsValid())
+        continue;
+
+      // The existing ThreadPlanStepInRange only accept step in target
+      // function with debug info.
+      lldb::SBSymbolContext sc = dap.target.ResolveSymbolContextForAddress(
+          call_target_load_addr, lldb::eSymbolContextFunction);
+
+      // The existing ThreadPlanStepInRange only accept step in target
+      // function with debug info.
+      llvm::StringRef step_in_target_name;
+      if (sc.IsValid() && sc.GetFunction().IsValid())
+        step_in_target_name = sc.GetFunction().GetDisplayName();
+
+      // Skip call sites if we fail to resolve its symbol name.
+      if (step_in_target_name.empty())
+        continue;
+
+      StepInTarget target;
+      target.id = inst_addr;
+      target.label = step_in_target_name;
+      dap.step_in_targets.try_emplace(inst_addr, step_in_target_name);
+      body.targets.emplace_back(std::move(target));
     }
-
-    llvm::json::Array step_in_targets;
-    const auto num_insts = insts.GetSize();
-    for (size_t i = 0; i < num_insts; ++i) {
-      lldb::SBInstruction inst = insts.GetInstructionAtIndex(i);
-      if (!inst.IsValid())
-        break;
-
-      lldb::addr_t inst_addr = inst.GetAddress().GetLoadAddress(dap.target);
-
-      // Note: currently only x86/x64 supports flow kind.
-      lldb::InstructionControlFlowKind flow_kind =
-          inst.GetControlFlowKind(dap.target);
-      if (flow_kind == lldb::eInstructionControlFlowKindCall) {
-        // Use call site instruction address as id which is easy to debug.
-        llvm::json::Object step_in_target;
-        step_in_target["id"] = inst_addr;
-
-        llvm::StringRef call_operand_name = inst.GetOperands(dap.target);
-        lldb::addr_t call_target_addr;
-        if (call_operand_name.getAsInteger(0, call_target_addr))
-          continue;
-
-        lldb::SBAddress call_target_load_addr =
-            dap.target.ResolveLoadAddress(call_target_addr);
-        if (!call_target_load_addr.IsValid())
-          continue;
-
-        // The existing ThreadPlanStepInRange only accept step in target
-        // function with debug info.
-        lldb::SBSymbolContext sc = dap.target.ResolveSymbolContextForAddress(
-            call_target_load_addr, lldb::eSymbolContextFunction);
-
-        // The existing ThreadPlanStepInRange only accept step in target
-        // function with debug info.
-        std::string step_in_target_name;
-        if (sc.IsValid() && sc.GetFunction().IsValid())
-          step_in_target_name = sc.GetFunction().GetDisplayName();
-
-        // Skip call sites if we fail to resolve its symbol name.
-        if (step_in_target_name.empty())
-          continue;
-
-        dap.step_in_targets.try_emplace(inst_addr, step_in_target_name);
-        step_in_target.try_emplace("label", step_in_target_name);
-        step_in_targets.emplace_back(std::move(step_in_target));
-      }
-    }
-    llvm::json::Object body;
-    body.try_emplace("targets", std::move(step_in_targets));
-    response.try_emplace("body", std::move(body));
-  } else {
-    response["success"] = llvm::json::Value(false);
-    response["message"] = "Failed to get frame for input frameId.";
   }
-  dap.SendJSON(llvm::json::Value(std::move(response)));
-}
+  return body;
+};
 
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
index 2cb7c47d6020..1b1891ba59e6 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
@@ -368,6 +368,16 @@ bool fromJSON(const json::Value &Params, StepInArguments &SIA, json::Path P) {
          OM.mapOptional("granularity", SIA.granularity);
 }
 
+bool fromJSON(const llvm::json::Value &Params, StepInTargetsArguments &SITA,
+              llvm::json::Path P) {
+  json::ObjectMapper OM(Params, P);
+  return OM && OM.map("frameId", SITA.frameId);
+}
+
+llvm::json::Value toJSON(const StepInTargetsResponseBody &SITR) {
+  return llvm::json::Object{{"targets", SITR.targets}};
+}
+
 bool fromJSON(const json::Value &Params, StepOutArguments &SOA, json::Path P) {
   json::ObjectMapper OM(Params, P);
   return OM && OM.map("threadId", SOA.threadId) &&
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index d199cc886b11..583c203be8e1 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -533,6 +533,21 @@ bool fromJSON(const llvm::json::Value &, StepInArguments &, llvm::json::Path);
 /// body field is required.
 using StepInResponse = VoidResponse;
 
+/// Arguments for `stepInTargets` request.
+struct StepInTargetsArguments {
+  /// The stack frame for which to retrieve the possible step-in targets.
+  uint64_t frameId = LLDB_INVALID_FRAME_ID;
+};
+bool fromJSON(const llvm::json::Value &, StepInTargetsArguments &,
+              llvm::json::Path);
+
+/// Response to `stepInTargets` request.
+struct StepInTargetsResponseBody {
+  /// The possible step-in targets of the specified source location.
+  std::vector<StepInTarget> targets;
+};
+llvm::json::Value toJSON(const StepInTargetsResponseBody &);
+
 /// Arguments for `stepOut` request.
 struct StepOutArguments {
   /// Specifies the thread for which to resume execution for one step-out (of
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
index 085d53bb006e..c21f8382320a 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
@@ -582,6 +582,30 @@ llvm::json::Value toJSON(const SteppingGranularity &SG) {
   llvm_unreachable("unhandled stepping granularity.");
 }
 
+bool fromJSON(const json::Value &Params, StepInTarget &SIT, json::Path P) {
+  json::ObjectMapper O(Params, P);
+  return O && O.map("id", SIT.id) && O.map("label", SIT.label) &&
+         O.mapOptional("line", SIT.line) &&
+         O.mapOptional("column", SIT.column) &&
+         O.mapOptional("endLine", SIT.endLine) &&
+         O.mapOptional("endColumn", SIT.endColumn);
+}
+
+llvm::json::Value toJSON(const StepInTarget &SIT) {
+  json::Object target{{"id", SIT.id}, {"label", SIT.label}};
+
+  if (SIT.line != LLDB_INVALID_LINE_NUMBER)
+    target.insert({"line", SIT.line});
+  if (SIT.column != LLDB_INVALID_COLUMN_NUMBER)
+    target.insert({"column", SIT.column});
+  if (SIT.endLine != LLDB_INVALID_LINE_NUMBER)
+    target.insert({"endLine", SIT.endLine});
+  if (SIT.endLine != LLDB_INVALID_COLUMN_NUMBER)
+    target.insert({"endColumn", SIT.endColumn});
+
+  return target;
+}
+
 bool fromJSON(const json::Value &Params, Thread &T, json::Path P) {
   json::ObjectMapper O(Params, P);
   return O && O.map("id", T.id) && O.map("name", T.name);
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
index c7acfc482987..d7094fbab9e5 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
@@ -414,6 +414,34 @@ bool fromJSON(const llvm::json::Value &, SteppingGranularity &,
               llvm::json::Path);
 llvm::json::Value toJSON(const SteppingGranularity &);
 
+/// A `StepInTarget` can be used in the `stepIn` request and determines into
+/// which single target the `stepIn` request should step.
+struct StepInTarget {
+  /// Unique identifier for a step-in target.
+  lldb::addr_t id = LLDB_INVALID_ADDRESS;
+
+  /// The name of the step-in target (shown in the UI).
+  std::string label;
+
+  /// The line of the step-in target.
+  uint32_t line = LLDB_INVALID_LINE_NUMBER;
+
+  /// Start position of the range covered by the step in target. It is measured
+  /// in UTF-16 code units and the client capability `columnsStartAt1`
+  /// determines whether it is 0- or 1-based.
+  uint32_t column = LLDB_INVALID_COLUMN_NUMBER;
+
+  /// The end line of the range covered by the step-in target.
+  uint32_t endLine = LLDB_INVALID_LINE_NUMBER;
+
+  /// End position of the range covered by the step in target. It is measured in
+  /// UTF-16 code units and the client capability `columnsStartAt1` determines
+  /// whether it is 0- or 1-based.
+  uint32_t endColumn = LLDB_INVALID_COLUMN_NUMBER;
+};
+bool fromJSON(const llvm::json::Value &, StepInTarget &, llvm::json::Path);
+llvm::json::Value toJSON(const StepInTarget &);
+
 /// A Thread.
 struct Thread {
   /// Unique identifier for the thread.
diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp
index adf43c9ac204..f2a23db34656 100644
--- a/lldb/unittests/DAP/ProtocolTypesTest.cpp
+++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp
@@ -686,3 +686,23 @@ TEST(ProtocolTypesTest, CapabilitiesEventBody) {
   // Validate toJSON
   EXPECT_EQ(json, pp(body));
 }
+
+TEST(ProtocolTypesTest, StepInTarget) {
+  StepInTarget target;
+  target.id = 230;
+  target.label = "the_function_name";
+  target.line = 2;
+  target.column = 320;
+  target.endLine = 32;
+  target.endColumn = 23;
+
+  llvm::Expected<StepInTarget> deserialized_target = roundtrip(target);
+  ASSERT_THAT_EXPECTED(deserialized_target, llvm::Succeeded());
+
+  EXPECT_EQ(target.id, deserialized_target->id);
+  EXPECT_EQ(target.label, deserialized_target->label);
+  EXPECT_EQ(target.line, deserialized_target->line);
+  EXPECT_EQ(target.column, deserialized_target->column);
+  EXPECT_EQ(target.endLine, deserialized_target->endLine);
+  EXPECT_EQ(target.endColumn, deserialized_target->endColumn);
+}
\ No newline at end of file

From b5c6245cb46354923940b95a89213fa0924e5c5f Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Mon, 16 Jun 2025 11:26:23 -0700
Subject: [PATCH 0533/1322] [CIR][NFC] Refactor constant pointer l-value
 handling (#144165)

This change introduces a ConstantLValueEmitter class, which will be
needed for emitting CIR for non-trivial constant pointers. This change
introduces the class with most branches reaching an NYI diagnostic. The
only path that is currently implemented is the case where an absolute
pointer (usually a null pointer) is emitted. This corresponds to the
existing handler for emitting l-value constants.
---
 clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 235 +++++++++++++++++--
 1 file changed, 218 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index c41ab54be09c..1976742d4039 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -329,6 +329,222 @@ emitArrayConstant(CIRGenModule &cgm, mlir::Type desiredType,
   return {};
 }
 
+//===----------------------------------------------------------------------===//
+//                          ConstantLValueEmitter
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A struct which can be used to peephole certain kinds of finalization
+/// that normally happen during l-value emission.
+struct ConstantLValue {
+  llvm::PointerUnion<mlir::Value, mlir::Attribute> value;
+  bool hasOffsetApplied;
+
+  ConstantLValue(std::nullptr_t) : value(nullptr), hasOffsetApplied(false) {}
+  ConstantLValue() : value(nullptr), hasOffsetApplied(false) {}
+};
+
+/// A helper class for emitting constant l-values.
+class ConstantLValueEmitter
+    : public ConstStmtVisitor<ConstantLValueEmitter, ConstantLValue> {
+  CIRGenModule &cgm;
+  ConstantEmitter &emitter;
+  const APValue &value;
+  QualType destType;
+
+  // Befriend StmtVisitorBase so that we don't have to expose Visit*.
+  friend StmtVisitorBase;
+
+public:
+  ConstantLValueEmitter(ConstantEmitter &emitter, const APValue &value,
+                        QualType destType)
+      : cgm(emitter.cgm), emitter(emitter), value(value), destType(destType) {}
+
+  mlir::Attribute tryEmit();
+
+private:
+  mlir::Attribute tryEmitAbsolute(mlir::Type destTy);
+  ConstantLValue tryEmitBase(const APValue::LValueBase &base);
+
+  ConstantLValue VisitStmt(const Stmt *s) { return nullptr; }
+  ConstantLValue VisitConstantExpr(const ConstantExpr *e);
+  ConstantLValue VisitCompoundLiteralExpr(const CompoundLiteralExpr *e);
+  ConstantLValue VisitStringLiteral(const StringLiteral *e);
+  ConstantLValue VisitObjCBoxedExpr(const ObjCBoxedExpr *e);
+  ConstantLValue VisitObjCEncodeExpr(const ObjCEncodeExpr *e);
+  ConstantLValue VisitObjCStringLiteral(const ObjCStringLiteral *e);
+  ConstantLValue VisitPredefinedExpr(const PredefinedExpr *e);
+  ConstantLValue VisitAddrLabelExpr(const AddrLabelExpr *e);
+  ConstantLValue VisitCallExpr(const CallExpr *e);
+  ConstantLValue VisitBlockExpr(const BlockExpr *e);
+  ConstantLValue VisitCXXTypeidExpr(const CXXTypeidExpr *e);
+  ConstantLValue
+  VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *e);
+};
+
+} // namespace
+
+mlir::Attribute ConstantLValueEmitter::tryEmit() {
+  const APValue::LValueBase &base = value.getLValueBase();
+
+  // The destination type should be a pointer or reference
+  // type, but it might also be a cast thereof.
+  //
+  // FIXME: the chain of casts required should be reflected in the APValue.
+  // We need this in order to correctly handle things like a ptrtoint of a
+  // non-zero null pointer and addrspace casts that aren't trivially
+  // represented in LLVM IR.
+  mlir::Type destTy = cgm.getTypes().convertTypeForMem(destType);
+  assert(mlir::isa<cir::PointerType>(destTy));
+
+  // If there's no base at all, this is a null or absolute pointer,
+  // possibly cast back to an integer type.
+  if (!base)
+    return tryEmitAbsolute(destTy);
+
+  // Otherwise, try to emit the base.
+  ConstantLValue result = tryEmitBase(base);
+
+  // If that failed, we're done.
+  llvm::PointerUnion<mlir::Value, mlir::Attribute> &value = result.value;
+  if (!value)
+    return {};
+
+  // Apply the offset if necessary and not already done.
+  if (!result.hasOffsetApplied) {
+    cgm.errorNYI("ConstantLValueEmitter: apply offset");
+    return {};
+  }
+
+  // Convert to the appropriate type; this could be an lvalue for
+  // an integer. FIXME: performAddrSpaceCast
+  if (mlir::isa<cir::PointerType>(destTy)) {
+    if (auto attr = mlir::dyn_cast<mlir::Attribute>(value))
+      return attr;
+    cgm.errorNYI("ConstantLValueEmitter: non-attribute pointer");
+    return {};
+  }
+
+  cgm.errorNYI("ConstantLValueEmitter: other?");
+  return {};
+}
+
+/// Try to emit an absolute l-value, such as a null pointer or an integer
+/// bitcast to pointer type.
+mlir::Attribute ConstantLValueEmitter::tryEmitAbsolute(mlir::Type destTy) {
+  // If we're producing a pointer, this is easy.
+  auto destPtrTy = mlir::cast<cir::PointerType>(destTy);
+  return cgm.getBuilder().getConstPtrAttr(
+      destPtrTy, value.getLValueOffset().getQuantity());
+}
+
+ConstantLValue
+ConstantLValueEmitter::tryEmitBase(const APValue::LValueBase &base) {
+  // Handle values.
+  if (const ValueDecl *d = base.dyn_cast<const ValueDecl *>()) {
+    // The constant always points to the canonical declaration. We want to look
+    // at properties of the most recent declaration at the point of emission.
+    d = cast<ValueDecl>(d->getMostRecentDecl());
+
+    if (d->hasAttr<WeakRefAttr>()) {
+      cgm.errorNYI(d->getSourceRange(),
+                   "ConstantLValueEmitter: emit pointer base for weakref");
+      return {};
+    }
+
+    if (auto *fd = dyn_cast<FunctionDecl>(d)) {
+      cgm.errorNYI(fd->getSourceRange(),
+                   "ConstantLValueEmitter: function decl");
+      return {};
+    }
+
+    if (auto *vd = dyn_cast<VarDecl>(d)) {
+      cgm.errorNYI(vd->getSourceRange(), "ConstantLValueEmitter: var decl");
+      return {};
+    }
+  }
+
+  // Handle typeid(T).
+  if (base.dyn_cast<TypeInfoLValue>()) {
+    cgm.errorNYI("ConstantLValueEmitter: typeid");
+    return {};
+  }
+
+  // Otherwise, it must be an expression.
+  return Visit(base.get<const Expr *>());
+}
+
+ConstantLValue ConstantLValueEmitter::VisitConstantExpr(const ConstantExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: constant expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitCompoundLiteralExpr(const CompoundLiteralExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: compound literal");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitStringLiteral(const StringLiteral *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: string literal");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitObjCEncodeExpr(const ObjCEncodeExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: objc encode expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitObjCStringLiteral(const ObjCStringLiteral *e) {
+  cgm.errorNYI(e->getSourceRange(),
+               "ConstantLValueEmitter: objc string literal");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitObjCBoxedExpr(const ObjCBoxedExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: objc boxed expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitPredefinedExpr(const PredefinedExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: predefined expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitAddrLabelExpr(const AddrLabelExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: addr label expr");
+  return {};
+}
+
+ConstantLValue ConstantLValueEmitter::VisitCallExpr(const CallExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: call expr");
+  return {};
+}
+
+ConstantLValue ConstantLValueEmitter::VisitBlockExpr(const BlockExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: block expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitCXXTypeidExpr(const CXXTypeidExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: cxx typeid expr");
+  return {};
+}
+
+ConstantLValue ConstantLValueEmitter::VisitMaterializeTemporaryExpr(
+    const MaterializeTemporaryExpr *e) {
+  cgm.errorNYI(e->getSourceRange(),
+               "ConstantLValueEmitter: materialize temporary expr");
+  return {};
+}
+
 //===----------------------------------------------------------------------===//
 //                             ConstantEmitter
 //===----------------------------------------------------------------------===//
@@ -556,23 +772,8 @@ mlir::Attribute ConstantEmitter::tryEmitPrivate(const APValue &value,
     cgm.errorNYI("ConstExprEmitter::tryEmitPrivate member pointer");
     return {};
   }
-  case APValue::LValue: {
-
-    if (value.getLValueBase()) {
-      cgm.errorNYI("non-null pointer initialization");
-    } else {
-
-      mlir::Type desiredType = cgm.convertType(destType);
-      if (const cir::PointerType ptrType =
-              mlir::dyn_cast<cir::PointerType>(desiredType)) {
-        return builder.getConstPtrAttr(ptrType,
-                                       value.getLValueOffset().getQuantity());
-      } else {
-        llvm_unreachable("non-pointer variable initialized with a pointer");
-      }
-    }
-    return {};
-  }
+  case APValue::LValue:
+    return ConstantLValueEmitter(*this, value, destType).tryEmit();
   case APValue::Struct:
   case APValue::Union:
     cgm.errorNYI("ConstExprEmitter::tryEmitPrivate struct or union");

From 00582728767599bb0e88beb96e8264dbe676da53 Mon Sep 17 00:00:00 2001
From: Rahman Lavaee <rahmanl@google.com>
Date: Mon, 16 Jun 2025 11:27:25 -0700
Subject: [PATCH 0534/1322] [NFC] Remove unused test code from
 ELFObjectFileTest.cpp

---
 llvm/unittests/Object/ELFObjectFileTest.cpp | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
index 493e673d6a07..1073df95c379 100644
--- a/llvm/unittests/Object/ELFObjectFileTest.cpp
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -896,17 +896,6 @@ Sections:
             "are enabled: version = 1 feature = 4");
   }
 
-  SmallString<128> CommonVersionedYamlString(CommonYamlString);
-  CommonVersionedYamlString += R"(
-      - Version: 2
-        BBRanges:
-          - BBEntries:
-              - ID:            1
-                AddressOffset: 0x0
-                Size:          0x1
-                Metadata:      0x2
-)";
-
   // Check that we fail when function entry count is enabled but not provided.
   SmallString<128> MissingFuncEntryCount(CommonYamlString);
   MissingFuncEntryCount += R"(

From 8ed43c47dec36bc38bbae4c6f024cdb824555a76 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Mon, 16 Jun 2025 14:38:27 -0400
Subject: [PATCH 0535/1322] [Matrix] Hoist IRBuilder<> out of Visit* functions.
 NFC (#144369)

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 48 ++++++++-----------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 1e37f40fa9d5..ece0bb56fff0 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1146,24 +1146,24 @@ public:
       Value *Op1;
       Value *Op2;
       MatrixTy Result;
+      IRBuilder<> Builder(Inst);
       if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
-        Result = VisitBinaryOperator(BinOp, SI);
+        Result = VisitBinaryOperator(BinOp, SI, Builder);
       else if (auto *Cast = dyn_cast<CastInst>(Inst))
-        Result = VisitCastInstruction(Cast, SI);
+        Result = VisitCastInstruction(Cast, SI, Builder);
       else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
-        Result = VisitUnaryOperator(UnOp, SI);
+        Result = VisitUnaryOperator(UnOp, SI, Builder);
       else if (auto *Intr = dyn_cast<IntrinsicInst>(Inst))
-        Result = VisitIntrinsicInst(Intr, SI);
+        Result = VisitIntrinsicInst(Intr, SI, Builder);
       else if (auto *Select = dyn_cast<SelectInst>(Inst))
-        Result = VisitSelectInst(Select, SI);
+        Result = VisitSelectInst(Select, SI, Builder);
       else if (match(Inst, m_Load(m_Value(Op1))))
-        Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1);
+        Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1, Builder);
       else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
-        Result = VisitStore(cast<StoreInst>(Inst), SI, Op1, Op2);
+        Result = VisitStore(cast<StoreInst>(Inst), SI, Op1, Op2, Builder);
       else
         continue;
 
-      IRBuilder<> Builder(Inst);
       finalizeLowering(Inst, Result, Builder);
       Changed = true;
     }
@@ -1204,7 +1204,8 @@ public:
   }
 
   /// Replace intrinsic calls.
-  MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI) {
+  MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI,
+                              IRBuilder<> &Builder) {
     assert(Inst->getCalledFunction() &&
            Inst->getCalledFunction()->isIntrinsic());
 
@@ -1219,7 +1220,6 @@ public:
       return LowerColumnMajorStore(Inst);
     case Intrinsic::abs:
     case Intrinsic::fabs: {
-      IRBuilder<> Builder(Inst);
       MatrixTy Result;
       MatrixTy M = getMatrix(Inst->getOperand(0), SI, Builder);
       Builder.setFastMathFlags(getFastMathFlags(Inst));
@@ -1298,7 +1298,6 @@ public:
                       ShapeInfo MatrixShape, Value *I, Value *J,
                       ShapeInfo ResultShape, Type *EltTy,
                       IRBuilder<> &Builder) {
-
     Value *Offset = Builder.CreateAdd(
         Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
 
@@ -2228,26 +2227,24 @@ public:
   }
 
   /// Lower load instructions.
-  MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr) {
-    IRBuilder<> Builder(Inst);
+  MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
+                     IRBuilder<> &Builder) {
     return LowerLoad(Inst, Ptr, Inst->getAlign(),
                      Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
   }
 
   MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
-                      Value *Ptr) {
-    IRBuilder<> Builder(Inst);
+                      Value *Ptr, IRBuilder<> &Builder) {
     return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
                       Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
   }
 
   /// Lower binary operators.
-  MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI) {
+  MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI,
+                               IRBuilder<> &Builder) {
     Value *Lhs = Inst->getOperand(0);
     Value *Rhs = Inst->getOperand(1);
 
-    IRBuilder<> Builder(Inst);
-
     MatrixTy Result;
     MatrixTy A = getMatrix(Lhs, SI, Builder);
     MatrixTy B = getMatrix(Rhs, SI, Builder);
@@ -2265,11 +2262,10 @@ public:
   }
 
   /// Lower unary operators.
-  MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI) {
+  MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI,
+                              IRBuilder<> &Builder) {
     Value *Op = Inst->getOperand(0);
 
-    IRBuilder<> Builder(Inst);
-
     MatrixTy Result;
     MatrixTy M = getMatrix(Op, SI, Builder);
 
@@ -2293,11 +2289,10 @@ public:
   }
 
   /// Lower cast instructions.
-  MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape) {
+  MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape,
+                                IRBuilder<> &Builder) {
     Value *Op = Inst->getOperand(0);
 
-    IRBuilder<> Builder(Inst);
-
     MatrixTy Result;
     MatrixTy M = getMatrix(Op, Shape, Builder);
 
@@ -2315,13 +2310,12 @@ public:
   }
 
   /// Lower selects.
-  MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape) {
+  MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape,
+                           IRBuilder<> &Builder) {
     Value *Cond = Inst->getOperand(0);
     Value *OpA = Inst->getOperand(1);
     Value *OpB = Inst->getOperand(2);
 
-    IRBuilder<> Builder(Inst);
-
     MatrixTy Result;
     MatrixTy A = getMatrix(OpA, Shape, Builder);
     MatrixTy B = getMatrix(OpB, Shape, Builder);

From 63b80dd01dafc92104ee43e4f0f5296d644c25ec Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Mon, 16 Jun 2025 11:45:19 -0700
Subject: [PATCH 0536/1322] [NFC][RootSignature] Use `llvm::EnumEntry` for
 serialization of Root Signature Elements (#144106)

It has pointed out
[here](https://github.com/llvm/llvm-project/pull/143198#discussion_r2132877388)
that we may be able to use `llvm::EnumEntry` so that we can re-use the
printing logic across enumerations.

- Enables re-use of `printEnum` and `printFlags` methods via templates
- Allows easy definition of `getEnumName` function for enum-to-string
conversion, eliminating the need to use a string stream for constructing
the Name SmallString

- Also, does a small fix-up of the operands for descriptor table clause
to be consistent with other `Build*` methods

For reference, the
[test-cases](https://github.com/llvm/llvm-project/blob/main/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp)
that must not change expected output.
---
 .../Frontend/HLSL/HLSLRootSignatureUtils.cpp  | 213 +++++++++---------
 1 file changed, 106 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
index 765a3bcbed7e..7d744781da04 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
@@ -15,111 +15,46 @@
 #include "llvm/ADT/bit.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/Support/ScopedPrinter.h"
 
 namespace llvm {
 namespace hlsl {
 namespace rootsig {
 
-static raw_ostream &operator<<(raw_ostream &OS, const Register &Reg) {
-  switch (Reg.ViewType) {
-  case RegisterType::BReg:
-    OS << "b";
-    break;
-  case RegisterType::TReg:
-    OS << "t";
-    break;
-  case RegisterType::UReg:
-    OS << "u";
-    break;
-  case RegisterType::SReg:
-    OS << "s";
-    break;
-  }
-  OS << Reg.Number;
+template <typename T>
+static std::optional<StringRef> getEnumName(const T Value,
+                                            ArrayRef<EnumEntry<T>> Enums) {
+  for (const auto &EnumItem : Enums)
+    if (EnumItem.Value == Value)
+      return EnumItem.Name;
+  return std::nullopt;
+}
+
+template <typename T>
+static raw_ostream &printEnum(raw_ostream &OS, const T Value,
+                              ArrayRef<EnumEntry<T>> Enums) {
+  auto MaybeName = getEnumName(Value, Enums);
+  if (MaybeName)
+    OS << *MaybeName;
   return OS;
 }
 
-static raw_ostream &operator<<(raw_ostream &OS,
-                               const ShaderVisibility &Visibility) {
-  switch (Visibility) {
-  case ShaderVisibility::All:
-    OS << "All";
-    break;
-  case ShaderVisibility::Vertex:
-    OS << "Vertex";
-    break;
-  case ShaderVisibility::Hull:
-    OS << "Hull";
-    break;
-  case ShaderVisibility::Domain:
-    OS << "Domain";
-    break;
-  case ShaderVisibility::Geometry:
-    OS << "Geometry";
-    break;
-  case ShaderVisibility::Pixel:
-    OS << "Pixel";
-    break;
-  case ShaderVisibility::Amplification:
-    OS << "Amplification";
-    break;
-  case ShaderVisibility::Mesh:
-    OS << "Mesh";
-    break;
-  }
-
-  return OS;
-}
-
-static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
-  switch (Type) {
-  case ClauseType::CBuffer:
-    OS << "CBV";
-    break;
-  case ClauseType::SRV:
-    OS << "SRV";
-    break;
-  case ClauseType::UAV:
-    OS << "UAV";
-    break;
-  case ClauseType::Sampler:
-    OS << "Sampler";
-    break;
-  }
-
-  return OS;
-}
-
-static raw_ostream &operator<<(raw_ostream &OS,
-                               const DescriptorRangeFlags &Flags) {
+template <typename T>
+static raw_ostream &printFlags(raw_ostream &OS, const T Value,
+                               ArrayRef<EnumEntry<T>> Flags) {
   bool FlagSet = false;
-  unsigned Remaining = llvm::to_underlying(Flags);
+  unsigned Remaining = llvm::to_underlying(Value);
   while (Remaining) {
     unsigned Bit = 1u << llvm::countr_zero(Remaining);
     if (Remaining & Bit) {
       if (FlagSet)
         OS << " | ";
 
-      switch (static_cast<DescriptorRangeFlags>(Bit)) {
-      case DescriptorRangeFlags::DescriptorsVolatile:
-        OS << "DescriptorsVolatile";
-        break;
-      case DescriptorRangeFlags::DataVolatile:
-        OS << "DataVolatile";
-        break;
-      case DescriptorRangeFlags::DataStaticWhileSetAtExecute:
-        OS << "DataStaticWhileSetAtExecute";
-        break;
-      case DescriptorRangeFlags::DataStatic:
-        OS << "DataStatic";
-        break;
-      case DescriptorRangeFlags::DescriptorsStaticKeepingBufferBoundsChecks:
-        OS << "DescriptorsStaticKeepingBufferBoundsChecks";
-        break;
-      default:
+      auto MaybeFlag = getEnumName(T(Bit), Flags);
+      if (MaybeFlag)
+        OS << *MaybeFlag;
+      else
         OS << "invalid: " << Bit;
-        break;
-      }
 
       FlagSet = true;
     }
@@ -128,6 +63,68 @@ static raw_ostream &operator<<(raw_ostream &OS,
 
   if (!FlagSet)
     OS << "None";
+  return OS;
+}
+
+static const EnumEntry<RegisterType> RegisterNames[] = {
+    {"b", RegisterType::BReg},
+    {"t", RegisterType::TReg},
+    {"u", RegisterType::UReg},
+    {"s", RegisterType::SReg},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const Register &Reg) {
+  printEnum(OS, Reg.ViewType, ArrayRef(RegisterNames));
+  OS << Reg.Number;
+
+  return OS;
+}
+
+static const EnumEntry<ShaderVisibility> VisibilityNames[] = {
+    {"All", ShaderVisibility::All},
+    {"Vertex", ShaderVisibility::Vertex},
+    {"Hull", ShaderVisibility::Hull},
+    {"Domain", ShaderVisibility::Domain},
+    {"Geometry", ShaderVisibility::Geometry},
+    {"Pixel", ShaderVisibility::Pixel},
+    {"Amplification", ShaderVisibility::Amplification},
+    {"Mesh", ShaderVisibility::Mesh},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const ShaderVisibility &Visibility) {
+  printEnum(OS, Visibility, ArrayRef(VisibilityNames));
+
+  return OS;
+}
+
+static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
+    {"CBV", dxil::ResourceClass::CBuffer},
+    {"SRV", dxil::ResourceClass::SRV},
+    {"UAV", dxil::ResourceClass::UAV},
+    {"Sampler", dxil::ResourceClass::Sampler},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
+  printEnum(OS, dxil::ResourceClass(llvm::to_underlying(Type)),
+            ArrayRef(ResourceClassNames));
+
+  return OS;
+}
+
+static const EnumEntry<DescriptorRangeFlags> DescriptorRangeFlagNames[] = {
+    {"DescriptorsVolatile", DescriptorRangeFlags::DescriptorsVolatile},
+    {"DataVolatile", DescriptorRangeFlags::DataVolatile},
+    {"DataStaticWhileSetAtExecute",
+     DescriptorRangeFlags::DataStaticWhileSetAtExecute},
+    {"DataStatic", DescriptorRangeFlags::DataStatic},
+    {"DescriptorsStaticKeepingBufferBoundsChecks",
+     DescriptorRangeFlags::DescriptorsStaticKeepingBufferBoundsChecks},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const DescriptorRangeFlags &Flags) {
+  printFlags(OS, Flags, ArrayRef(DescriptorRangeFlagNames));
 
   return OS;
 }
@@ -236,12 +233,13 @@ MDNode *MetadataBuilder::BuildRootConstants(const RootConstants &Constants) {
 
 MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) {
   IRBuilder<> Builder(Ctx);
-  llvm::SmallString<7> Name;
-  llvm::raw_svector_ostream OS(Name);
-  OS << "Root" << ClauseType(llvm::to_underlying(Descriptor.Type));
-
+  std::optional<StringRef> TypeName =
+      getEnumName(dxil::ResourceClass(llvm::to_underlying(Descriptor.Type)),
+                  ArrayRef(ResourceClassNames));
+  assert(TypeName && "Provided an invalid Resource Class");
+  llvm::SmallString<7> Name({"Root", *TypeName});
   Metadata *Operands[] = {
-      MDString::get(Ctx, OS.str()),
+      MDString::get(Ctx, Name),
       ConstantAsMetadata::get(
           Builder.getInt32(llvm::to_underlying(Descriptor.Visibility))),
       ConstantAsMetadata::get(Builder.getInt32(Descriptor.Reg.Number)),
@@ -277,19 +275,20 @@ MDNode *MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) {
 MDNode *MetadataBuilder::BuildDescriptorTableClause(
     const DescriptorTableClause &Clause) {
   IRBuilder<> Builder(Ctx);
-  std::string Name;
-  llvm::raw_string_ostream OS(Name);
-  OS << Clause.Type;
-  return MDNode::get(
-      Ctx, {
-               MDString::get(Ctx, OS.str()),
-               ConstantAsMetadata::get(Builder.getInt32(Clause.NumDescriptors)),
-               ConstantAsMetadata::get(Builder.getInt32(Clause.Reg.Number)),
-               ConstantAsMetadata::get(Builder.getInt32(Clause.Space)),
-               ConstantAsMetadata::get(Builder.getInt32(Clause.Offset)),
-               ConstantAsMetadata::get(
-                   Builder.getInt32(llvm::to_underlying(Clause.Flags))),
-           });
+  std::optional<StringRef> Name =
+      getEnumName(dxil::ResourceClass(llvm::to_underlying(Clause.Type)),
+                  ArrayRef(ResourceClassNames));
+  assert(Name && "Provided an invalid Resource Class");
+  Metadata *Operands[] = {
+      MDString::get(Ctx, *Name),
+      ConstantAsMetadata::get(Builder.getInt32(Clause.NumDescriptors)),
+      ConstantAsMetadata::get(Builder.getInt32(Clause.Reg.Number)),
+      ConstantAsMetadata::get(Builder.getInt32(Clause.Space)),
+      ConstantAsMetadata::get(Builder.getInt32(Clause.Offset)),
+      ConstantAsMetadata::get(
+          Builder.getInt32(llvm::to_underlying(Clause.Flags))),
+  };
+  return MDNode::get(Ctx, Operands);
 }
 
 MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) {

From fcc10e55cabb90f3097a8da4c114e827a1d746eb Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 14:51:48 -0400
Subject: [PATCH 0537/1322] Remove unnecessary BOM from file; NFC

Fixes #144373
---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 715df7ab9a7a..c2460c497b40 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -1,4 +1,4 @@
-﻿//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
+//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From f83d09a1f60aee28a8ed9020cd72971ec2885f24 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 14:53:15 -0400
Subject: [PATCH 0538/1322] Revert "[RISCV] Remove B and Zbc extension from
 Andes series cpus." (#144402)

Reverts llvm/llvm-project#144022

This has been failing postcommit CI for two days:
https://lab.llvm.org/buildbot/#/builders/63
---
 .../Driver/print-enabled-extensions/riscv-andes-a25.c     | 7 ++++++-
 .../Driver/print-enabled-extensions/riscv-andes-a45.c     | 6 +++++-
 .../Driver/print-enabled-extensions/riscv-andes-ax25.c    | 7 ++++++-
 .../Driver/print-enabled-extensions/riscv-andes-ax45.c    | 6 +++++-
 .../Driver/print-enabled-extensions/riscv-andes-n45.c     | 6 +++++-
 .../Driver/print-enabled-extensions/riscv-andes-nx45.c    | 6 +++++-
 llvm/lib/Target/RISCV/RISCVProcessors.td                  | 8 ++++++++
 llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s              | 2 +-
 8 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
index cfb4d0ed58d1..d8b3848d8452 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,8 +19,12 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
index 3c3c554dffc5..a0a1c3591140 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,8 +19,11 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
index 70100a0a8df1..3f933ecd8ac8 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -17,8 +18,12 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
index d2b1a32e321e..6460d701411b 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -17,8 +18,11 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
index 1a2c30bfc7a2..4d9c514b756e 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,8 +19,11 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
index 50c38da3bd03..5eaada3f9e16 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -17,8 +18,11 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index d7e6c71ea062..32f4ab607a34 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -703,6 +703,8 @@ def ANDES_A25 : RISCVProcessorModel<"andes-a25",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
+                                     FeatureStdExtB,
+                                     FeatureStdExtZbc,
                                      FeatureVendorXAndesPerf]>;
 
 def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
@@ -716,6 +718,8 @@ def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
+                                      FeatureStdExtB,
+                                      FeatureStdExtZbc,
                                       FeatureVendorXAndesPerf]>;
 
 defvar Andes45TuneFeatures = [TuneAndes45,
@@ -737,6 +741,7 @@ def ANDES_N45 : RISCVProcessorModel<"andes-n45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
+                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -751,6 +756,7 @@ def ANDES_NX45 : RISCVProcessorModel<"andes-nx45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
+                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
 
@@ -765,6 +771,7 @@ def ANDES_A45 : RISCVProcessorModel<"andes-a45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
+                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -779,5 +786,6 @@ def ANDES_AX45 : RISCVProcessorModel<"andes-ax45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
+                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
diff --git a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
index d90dce8c5c3f..f6dc6eef3f0f 100644
--- a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
+++ b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+b,+zbc -timeline -iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+zbc -timeline -iterations=1 < %s | FileCheck %s
 
 # Two ALUs without dependency can be dispatched in the same cycle.
 add a0, a0, a0

From e8362234f60612a250d832cc8d0f68fe7fa9ea17 Mon Sep 17 00:00:00 2001
From: Scott Linder <scott.linder@amd.com>
Date: Mon, 16 Jun 2025 15:03:02 -0400
Subject: [PATCH 0539/1322] [Object][AMDGPU] Support REL relocations (#143966)

Shaders compiled with DXC/LLPC generate these relocations, and even if
that changes in the future we want to handle existing binaries. The
friction to support this and the maintenance cost long term both seem
incredibly low, considering other targets like ARM support both REL/RELA
static relocations behind the same interface.
---
 llvm/docs/AMDGPUUsage.rst                     |  3 +-
 llvm/lib/Object/RelocationResolver.cpp        |  6 +-
 llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml | 86 +++++++++++++++++++
 3 files changed, 92 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 39f04f8e01b8..c052b076c21c 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -2709,7 +2709,8 @@ The following relocation types are supported:
 the ``mesa3d`` OS, which does not support ``R_AMDGPU_ABS64``.
 
 There is no current OS loader support for 32-bit programs and so
-``R_AMDGPU_ABS32`` is not used.
+``R_AMDGPU_ABS32`` is only generated for static relocations, for example to
+implement some DWARF32 forms.
 
 .. _amdgpu-loaded-code-object-path-uniform-resource-identifier:
 
diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp
index 8cf748aa5681..b6318bbe3ab7 100644
--- a/llvm/lib/Object/RelocationResolver.cpp
+++ b/llvm/lib/Object/RelocationResolver.cpp
@@ -274,11 +274,13 @@ static bool supportsAmdgpu(uint64_t Type) {
 }
 
 static uint64_t resolveAmdgpu(uint64_t Type, uint64_t Offset, uint64_t S,
-                              uint64_t /*LocData*/, int64_t Addend) {
+                              uint64_t LocData, int64_t Addend) {
+  assert((LocData == 0 || Addend == 0) &&
+         "one of LocData and Addend must be 0");
   switch (Type) {
   case ELF::R_AMDGPU_ABS32:
   case ELF::R_AMDGPU_ABS64:
-    return S + Addend;
+    return S + LocData + Addend;
   default:
     llvm_unreachable("Invalid relocation type");
   }
diff --git a/llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml b/llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml
new file mode 100644
index 000000000000..23b7f087e957
--- /dev/null
+++ b/llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml
@@ -0,0 +1,86 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-dwarfdump -i %t | FileCheck %s
+
+# Test REL relocation handling for AMDGPU
+
+# CHECK: DW_TAG_compile_unit
+# CHECK: DW_AT_producer ("dxc")
+# CHECK: DW_AT_name (".\\example.hlsl")
+# CHECK: DW_AT_str_offsets_base (0x00000008)
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  OSABI:           ELFOSABI_AMDGPU_PAL
+  Type:            ET_REL
+  Machine:         EM_AMDGPU
+  Flags:           [ EF_AMDGPU_MACH_AMDGCN_GFX1201 ]
+  SectionHeaderStringTable: .strtab
+Sections:
+  - Name:            .debug_abbrev
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         01110125251305032572171017110B120673178C0117000000
+  - Name:            .debug_info
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         23000000050001080000000001000400010800000000000000005C000000080000000C00000000
+  - Name:            .debug_str_offsets
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         0C000000050000000000000004000000
+  - Name:            .rel.debug_info
+    Type:            SHT_REL
+    Flags:           [ SHF_INFO_LINK ]
+    Link:            .symtab
+    AddressAlign:    0x8
+    Info:            .debug_info
+    Relocations:
+      - Offset:          0x8
+        Symbol:          .debug_abbrev
+        Type:            R_AMDGPU_ABS32
+      - Offset:          0x11
+        Symbol:          .debug_str_offsets
+        Type:            R_AMDGPU_ABS32
+  - Name:            .rel.debug_str_offsets
+    Type:            SHT_REL
+    Flags:           [ SHF_INFO_LINK ]
+    Link:            .symtab
+    AddressAlign:    0x8
+    Info:            .debug_str_offsets
+    Relocations:
+      - Offset:          0x8
+        Symbol:          .debug_str
+        Type:            R_AMDGPU_ABS32
+      - Offset:          0xC
+        Symbol:          .debug_str
+        Type:            R_AMDGPU_ABS32
+  - Type:            SectionHeaderTable
+    Sections:
+      - Name:            .strtab
+      - Name:            .debug_abbrev
+      - Name:            .debug_info
+      - Name:            .rel.debug_info
+      - Name:            .debug_str_offsets
+      - Name:            .rel.debug_str_offsets
+      - Name:            .debug_str
+      - Name:            .symtab
+Symbols:
+  - Name:            .debug_abbrev
+    Type:            STT_SECTION
+    Section:         .debug_abbrev
+  - Name:            .debug_info
+    Type:            STT_SECTION
+    Section:         .debug_info
+  - Name:            .debug_str_offsets
+    Type:            STT_SECTION
+    Section:         .debug_str_offsets
+  - Name:            .debug_str
+    Type:            STT_SECTION
+    Section:         .debug_str
+DWARF:
+  debug_str:
+    - 'dxc'
+    - '.\example.hlsl'
+...

From a00b736a797d252d9e26cc13fb45993d7b02ede2 Mon Sep 17 00:00:00 2001
From: Diego Caballero <dieg0ca6aller0@gmail.com>
Date: Mon, 16 Jun 2025 12:05:20 -0700
Subject: [PATCH 0540/1322] [mlir][Vector] Support `vector.extract(xfer_read)`
 folding with dynamic indices (#143269)

This PR is part of the last step to remove `vector.extractelement` and `vector.insertelement` ops.
RFC: https://discourse.llvm.org/t/rfc-psa-remove-vector-extractelement-and-vector-insertelement-ops-in-favor-of-vector-extract-and-vector-insert-ops

It adds support for folding `vector.transfer_read(vector.extract) ->
memref.load` with dynamic indices, which is currently supported by
`vector.extractelement`.
---
 .../Transforms/VectorTransferOpTransforms.cpp | 28 ++++++++++++-----
 .../scalar-vector-transfer-to-memref.mlir     | 30 +++++++++++++++++++
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
index 7dbb7a334fe6..384717aeca66 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
@@ -886,17 +886,31 @@ class RewriteScalarExtractOfTransferRead
     SmallVector<Value> newIndices(xferOp.getIndices().begin(),
                                   xferOp.getIndices().end());
     for (auto [i, pos] : llvm::enumerate(extractOp.getMixedPosition())) {
-      assert(isa<Attribute>(pos) && "Unexpected non-constant index");
-      int64_t offset = cast<IntegerAttr>(cast<Attribute>(pos)).getInt();
       int64_t idx = newIndices.size() - extractOp.getNumIndices() + i;
-      OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
-          rewriter, extractOp.getLoc(),
-          rewriter.getAffineSymbolExpr(0) + offset, {newIndices[idx]});
-      if (auto value = dyn_cast<Value>(ofr)) {
+
+      // Compute affine expression `newIndices[idx] + pos` where `pos` can be
+      // either a constant or a value.
+      OpFoldResult composedIdx;
+      if (auto attr = dyn_cast<Attribute>(pos)) {
+        int64_t offset = cast<IntegerAttr>(attr).getInt();
+        composedIdx = affine::makeComposedFoldedAffineApply(
+            rewriter, extractOp.getLoc(),
+            rewriter.getAffineSymbolExpr(0) + offset, {newIndices[idx]});
+      } else {
+        Value dynamicOffset = cast<Value>(pos);
+        AffineExpr sym0, sym1;
+        bindSymbols(rewriter.getContext(), sym0, sym1);
+        composedIdx = affine::makeComposedFoldedAffineApply(
+            rewriter, extractOp.getLoc(), sym0 + sym1,
+            {newIndices[idx], dynamicOffset});
+      }
+
+      // Update the corresponding index with the folded result.
+      if (auto value = dyn_cast<Value>(composedIdx)) {
         newIndices[idx] = value;
       } else {
         newIndices[idx] = rewriter.create<arith::ConstantIndexOp>(
-            extractOp.getLoc(), *getConstantIntValue(ofr));
+            extractOp.getLoc(), *getConstantIntValue(composedIdx));
       }
     }
     if (isa<MemRefType>(xferOp.getBase().getType())) {
diff --git a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir
index 52b0fdee184f..7a1d6b3a8344 100644
--- a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir
+++ b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir
@@ -148,3 +148,33 @@ func.func @subvector_extract(%m: memref<?x?xf32>, %idx: index) -> vector<16xf32>
   return %1 : vector<16xf32>
 }
 
+// -----
+
+//       CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK-LABEL: func @transfer_read_1d_extract_dynamic(
+//  CHECK-SAME:     %[[MEMREF:.*]]: memref<?xf32>, %[[M_IDX:.*]]: index, %[[E_IDX:.*]]: index
+//       CHECK:   %[[APPLY:.*]] = affine.apply #[[$MAP]]()[%[[M_IDX]], %[[E_IDX]]]
+//       CHECK:   %[[RES:.*]] = memref.load %[[MEMREF]][%[[APPLY]]]
+func.func @transfer_read_1d_extract_dynamic(%m: memref<?xf32>, %idx: index,
+                                            %offset: index) -> f32 {
+  %cst = arith.constant 0.0 : f32
+  %vec = vector.transfer_read %m[%idx], %cst {in_bounds = [true]} : memref<?xf32>, vector<5xf32>
+  %elem = vector.extract %vec[%offset] : f32 from vector<5xf32>
+  return %elem : f32
+}
+
+// -----
+
+//       CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK-LABEL: func @transfer_read_2d_extract_dynamic(
+//  CHECK-SAME:     %[[MEMREF:.*]]: memref<?x?xf32>, %[[ROW_IDX:.*]]: index, %[[COL_IDX:.*]]: index, %[[ROW_OFFSET:.*]]: index, %[[COL_OFFSET:.*]]: index
+//       CHECK:   %[[ROW_APPLY:.*]] = affine.apply #[[$MAP]]()[%[[ROW_IDX]], %[[ROW_OFFSET]]]
+//       CHECK:   %[[COL_APPLY:.*]] = affine.apply #[[$MAP]]()[%[[COL_IDX]], %[[COL_OFFSET]]]
+//       CHECK:   %[[RES:.*]] = memref.load %[[MEMREF]][%[[ROW_APPLY]], %[[COL_APPLY]]]
+func.func @transfer_read_2d_extract_dynamic(%m: memref<?x?xf32>, %row_idx: index, %col_idx: index,
+                                            %row_offset: index, %col_offset: index) -> f32 {
+  %cst = arith.constant 0.0 : f32
+  %vec = vector.transfer_read %m[%row_idx, %col_idx], %cst {in_bounds = [true, true]} : memref<?x?xf32>, vector<10x5xf32>
+  %elem = vector.extract %vec[%row_offset, %col_offset] : f32 from vector<10x5xf32>
+  return %elem : f32
+}

From a0662ceba83cf8782da4047b8ee6d175591f168f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Rodr=C3=ADguez=20Troiti=C3=B1o?=
 <danielrodriguez@meta.com>
Date: Mon, 16 Jun 2025 12:06:25 -0700
Subject: [PATCH 0541/1322] [objcopy][MachO] Revert special handling of
 encryptable binaries (#144058)

Code originally added in #120995 and later corrected in #130517 but
apparently still not correct according to #141494 and
rust-lang/rust#141913.

Revert the special handling because the test written in #120995 and
#130517 still passes without those changes. Kept the test and improved
it with a `__DATA` section to keep the current behaviour checked in case
other changes modify the behaviour and break this edge case.
---
 llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp |   8 -
 llvm/lib/ObjCopy/MachO/MachOObject.cpp        |   4 -
 llvm/lib/ObjCopy/MachO/MachOObject.h          |   3 -
 llvm/lib/ObjCopy/MachO/MachOReader.cpp        |   4 -
 .../MachO/strip-with-encryption-info.test     | 160 ++++++++++++------
 5 files changed, 108 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
index 8ecd669e6717..93bc6631e64c 100644
--- a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
@@ -116,10 +116,6 @@ uint64_t MachOLayoutBuilder::layoutSegments() {
   const bool IsObjectFile =
       O.Header.FileType == MachO::HeaderFileType::MH_OBJECT;
   uint64_t Offset = IsObjectFile ? (HeaderSize + O.Header.SizeOfCmds) : 0;
-  // If we are emitting an encryptable binary, our load commands must have a
-  // separate (non-encrypted) page to themselves.
-  bool RequiresFirstSectionOutsideFirstPage =
-      O.EncryptionInfoCommandIndex.has_value();
   for (LoadCommand &LC : O.LoadCommands) {
     auto &MLC = LC.MachOLoadCommand;
     StringRef Segname;
@@ -173,10 +169,6 @@ uint64_t MachOLayoutBuilder::layoutSegments() {
         if (!Sec->hasValidOffset()) {
           Sec->Offset = 0;
         } else {
-          if (RequiresFirstSectionOutsideFirstPage) {
-            SectOffset = alignToPowerOf2(SectOffset, PageSize);
-            RequiresFirstSectionOutsideFirstPage = false;
-          }
           Sec->Offset = SegOffset + SectOffset;
           Sec->Size = Sec->Content.size();
           SegFileSize = std::max(SegFileSize, SectOffset + Sec->Size);
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
index e0819d89d24f..8d2c02dc37c9 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
@@ -98,10 +98,6 @@ void Object::updateLoadCommandIndexes() {
     case MachO::LC_DYLD_EXPORTS_TRIE:
       ExportsTrieCommandIndex = Index;
       break;
-    case MachO::LC_ENCRYPTION_INFO:
-    case MachO::LC_ENCRYPTION_INFO_64:
-      EncryptionInfoCommandIndex = Index;
-      break;
     }
   }
 }
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h
index 13ac87ed3ed0..8f9444f5fb02 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.h
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.h
@@ -341,9 +341,6 @@ struct Object {
   /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command
   /// corresponding to the __TEXT segment.
   std::optional<size_t> TextSegmentCommandIndex;
-  /// The index of the LC_ENCRYPTION_INFO or LC_ENCRYPTION_INFO_64 load command
-  /// if present.
-  std::optional<size_t> EncryptionInfoCommandIndex;
 
   BumpPtrAllocator Alloc;
   StringSaver NewSectionsContents;
diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
index ef0e0262f939..2b344f36d8e7 100644
--- a/llvm/lib/ObjCopy/MachO/MachOReader.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
@@ -184,10 +184,6 @@ Error MachOReader::readLoadCommands(Object &O) const {
     case MachO::LC_DYLD_CHAINED_FIXUPS:
       O.ChainedFixupsCommandIndex = O.LoadCommands.size();
       break;
-    case MachO::LC_ENCRYPTION_INFO:
-    case MachO::LC_ENCRYPTION_INFO_64:
-      O.EncryptionInfoCommandIndex = O.LoadCommands.size();
-      break;
     }
 #define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct)                         \
   case MachO::LCName:                                                          \
diff --git a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
index 2b2bd670613d..d6f6fe10d88c 100644
--- a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
+++ b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
@@ -16,7 +16,11 @@
 # CHECK:       fileoff: 0
 
 # The YAML below is the following code
+# ```
+# static int foo = 12345;
+# int bar = 4567;
 # int main(int argc, char **argv) { return 0; }
+# ```
 # Compiled on macOS against the macOS SDK and passing `-Wl,-encryptable`
 # Contents are removed, since they are not important for the test. We need a
 # small text segment (smaller than a page).
@@ -26,8 +30,8 @@ FileHeader:
   cputype:         0x100000C
   cpusubtype:      0x0
   filetype:        0x2
-  ncmds:           15
-  sizeofcmds:      696
+  ncmds:           18
+  sizeofcmds:      920
   flags:           0x200085
   reserved:        0x0
 LoadCommands:
@@ -69,7 +73,7 @@ LoadCommands:
       - sectname:        __unwind_info
         segname:         __TEXT
         addr:            0x100004020
-        size:            4152
+        size:            88
         offset:          0x4020
         align:           2
         reloff:          0x0
@@ -78,38 +82,62 @@ LoadCommands:
         reserved1:       0x0
         reserved2:       0x0
         reserved3:       0x0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA
+    vmaddr:          4295000064
+    vmsize:          16384
+    fileoff:         32768
+    filesize:        16384
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __data
+        segname:         __DATA
+        addr:            0x100008000
+        size:            4
+        offset:          0x8000
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
   - cmd:             LC_SEGMENT_64
     cmdsize:         72
     segname:         __LINKEDIT
-    vmaddr:          4295000064
-    vmsize:          592
-    fileoff:         32768
-    filesize:        592
+    vmaddr:          4295016448
+    vmsize:          16384
+    fileoff:         49152
+    filesize:        768
     maxprot:         1
     initprot:        1
     nsects:          0
     flags:           0
   - cmd:             LC_DYLD_CHAINED_FIXUPS
     cmdsize:         16
-    dataoff:         32768
-    datasize:        48
+    dataoff:         49152
+    datasize:        56
   - cmd:             LC_DYLD_EXPORTS_TRIE
     cmdsize:         16
-    dataoff:         32816
-    datasize:        48
+    dataoff:         49208
+    datasize:        64
   - cmd:             LC_SYMTAB
     cmdsize:         24
-    symoff:          32872
-    nsyms:           2
-    stroff:          32904
-    strsize:         32
+    symoff:          49280
+    nsyms:           3
+    stroff:          49328
+    strsize:         40
   - cmd:             LC_DYSYMTAB
     cmdsize:         80
     ilocalsym:       0
     nlocalsym:       0
     iextdefsym:      0
-    nextdefsym:      2
-    iundefsym:       2
+    nextdefsym:      3
+    iundefsym:       3
     nundefsym:       0
     tocoff:          0
     ntoc:            0
@@ -123,12 +151,6 @@ LoadCommands:
     nextrel:         0
     locreloff:       0
     nlocrel:         0
-  - cmd:             LC_ENCRYPTION_INFO_64
-    cmdsize:         24
-    cryptoff:        16384
-    cryptsize:       16384
-    cryptid:         0
-    pad:             0
   - cmd:             LC_LOAD_DYLINKER
     cmdsize:         32
     name:            12
@@ -136,32 +158,50 @@ LoadCommands:
     ZeroPadBytes:    7
   - cmd:             LC_UUID
     cmdsize:         24
-    uuid:            4C4C4447-5555-3144-A18A-01E9EB7E7D92
+    uuid:            ADDA943C-657A-3A49-9580-168E17A40FFB
   - cmd:             LC_BUILD_VERSION
     cmdsize:         32
     platform:        1
     minos:           983040
-    sdk:             983552
+    sdk:             984320
     ntools:          1
     Tools:
-      - tool:            4
-        version:         1310720
+      - tool:            3
+        version:         76481537
+  - cmd:             LC_SOURCE_VERSION
+    cmdsize:         16
+    version:         0
   - cmd:             LC_MAIN
     cmdsize:         24
     entryoff:        16384
     stacksize:       0
+  - cmd:             LC_ENCRYPTION_INFO_64
+    cmdsize:         24
+    cryptoff:        16384
+    cryptsize:       16384
+    cryptid:         0
+    pad:             0
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       2
+      current_version: 88539136
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
   - cmd:             LC_FUNCTION_STARTS
     cmdsize:         16
-    dataoff:         32864
+    dataoff:         49272
     datasize:        8
   - cmd:             LC_DATA_IN_CODE
     cmdsize:         16
-    dataoff:         32872
+    dataoff:         49280
     datasize:        0
   - cmd:             LC_CODE_SIGNATURE
     cmdsize:         16
-    dataoff:         32944
-    datasize:        416
+    dataoff:         49376
+    datasize:        544
 LinkEditData:
   ExportTrie:
     TerminalSize:    0
@@ -173,51 +213,67 @@ LinkEditData:
     ImportName:      ''
     Children:
       - TerminalSize:    0
-        NodeOffset:      5
+        NodeOffset:      25
         Name:            _
         Flags:           0x0
         Address:         0x0
         Other:           0x0
         ImportName:      ''
         Children:
-          - TerminalSize:    4
-            NodeOffset:      33
-            Name:            main
-            Flags:           0x0
-            Address:         0x4000
-            Other:           0x0
-            ImportName:      ''
           - TerminalSize:    2
-            NodeOffset:      39
+            NodeOffset:      9
             Name:            _mh_execute_header
             Flags:           0x0
             Address:         0x0
             Other:           0x0
             ImportName:      ''
+          - TerminalSize:    4
+            NodeOffset:      13
+            Name:            bar
+            Flags:           0x0
+            Address:         0x8000
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    4
+            NodeOffset:      19
+            Name:            main
+            Flags:           0x0
+            Address:         0x4000
+            Other:           0x0
+            ImportName:      ''
   NameList:
     - n_strx:          2
-      n_type:          0xF
-      n_sect:          1
-      n_desc:          0
-      n_value:         4294983680
-    - n_strx:          8
       n_type:          0xF
       n_sect:          1
       n_desc:          16
       n_value:         4294967296
+    - n_strx:          22
+      n_type:          0xF
+      n_sect:          3
+      n_desc:          0
+      n_value:         4295000064
+    - n_strx:          27
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294983680
   StringTable:
     - ' '
-    - _main
     - __mh_execute_header
+    - _bar
+    - _main
+    - ''
+    - ''
+    - ''
     - ''
     - ''
     - ''
     - ''
   FunctionStarts:  [ 0x4000 ]
-  ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x30, 0x0, 
-                     0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
-                     0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
-                     0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
-                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+  ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x34, 0x0,
+                     0x0, 0x0, 0x34, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
 ...
-

From 402c376daa659c0c3a477ad038a415079ffa0a48 Mon Sep 17 00:00:00 2001
From: William Huynh <William.Huynh@arm.com>
Date: Mon, 16 Jun 2025 20:22:58 +0100
Subject: [PATCH 0542/1322] [libc] Change default behaviour of baremetal/printf
 to use stdout (#143703)

In #94078, `write_to_stdout` had not been fully implemented. However,
now that it has been implemented, to conform with the C standard
(7.23.6.3. The printf function, specifically point 2), we use `stdout`.
This issue is tracked in #94685.

- Also prefer `static constexpr`
- Made it explicit that we are writing to `stdout`
---
 libc/src/stdio/baremetal/printf.cpp  | 8 ++++----
 libc/src/stdio/baremetal/putchar.cpp | 2 +-
 libc/src/stdio/baremetal/puts.cpp    | 4 ++--
 libc/src/stdio/baremetal/vprintf.cpp | 8 ++++----
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp
index c94698ec0295..7253c6549a4e 100644
--- a/libc/src/stdio/baremetal/printf.cpp
+++ b/libc/src/stdio/baremetal/printf.cpp
@@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL {
 
 namespace {
 
-LIBC_INLINE int raw_write_hook(cpp::string_view new_str, void *) {
-  write_to_stderr(new_str);
+LIBC_INLINE int stdout_write_hook(cpp::string_view new_str, void *) {
+  write_to_stdout(new_str);
   return printf_core::WRITE_OK;
 }
 
@@ -35,11 +35,11 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) {
                                  // and pointer semantics, as well as handling
                                  // destruction automatically.
   va_end(vlist);
-  constexpr size_t BUFF_SIZE = 1024;
+  static constexpr size_t BUFF_SIZE = 1024;
   char buffer[BUFF_SIZE];
 
   printf_core::WriteBuffer<printf_core::WriteMode::FLUSH_TO_STREAM> wb(
-      buffer, BUFF_SIZE, &raw_write_hook, nullptr);
+      buffer, BUFF_SIZE, &stdout_write_hook, nullptr);
   printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb);
 
   int retval = printf_core::printf_main(&writer, format, args);
diff --git a/libc/src/stdio/baremetal/putchar.cpp b/libc/src/stdio/baremetal/putchar.cpp
index 0ba46a5ade6c..ac21e6e783b0 100644
--- a/libc/src/stdio/baremetal/putchar.cpp
+++ b/libc/src/stdio/baremetal/putchar.cpp
@@ -16,7 +16,7 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, putchar, (int c)) {
   char uc = static_cast<char>(c);
 
-  write_to_stderr(cpp::string_view(&uc, 1));
+  write_to_stdout(cpp::string_view(&uc, 1));
 
   return 0;
 }
diff --git a/libc/src/stdio/baremetal/puts.cpp b/libc/src/stdio/baremetal/puts.cpp
index 5062efda1c0d..fcd3aa086b2b 100644
--- a/libc/src/stdio/baremetal/puts.cpp
+++ b/libc/src/stdio/baremetal/puts.cpp
@@ -17,8 +17,8 @@ LLVM_LIBC_FUNCTION(int, puts, (const char *__restrict str)) {
   cpp::string_view str_view(str);
 
   // TODO: Can we combine these to avoid needing two writes?
-  write_to_stderr(str_view);
-  write_to_stderr("\n");
+  write_to_stdout(str_view);
+  write_to_stdout("\n");
 
   return 0;
 }
diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp
index 3e8631abd90d..ab02533f1491 100644
--- a/libc/src/stdio/baremetal/vprintf.cpp
+++ b/libc/src/stdio/baremetal/vprintf.cpp
@@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL {
 
 namespace {
 
-LIBC_INLINE int raw_write_hook(cpp::string_view new_str, void *) {
-  write_to_stderr(new_str);
+LIBC_INLINE int stdout_write_hook(cpp::string_view new_str, void *) {
+  write_to_stdout(new_str);
   return printf_core::WRITE_OK;
 }
 
@@ -33,11 +33,11 @@ LLVM_LIBC_FUNCTION(int, vprintf,
   internal::ArgList args(vlist); // This holder class allows for easier copying
                                  // and pointer semantics, as well as handling
                                  // destruction automatically.
-  constexpr size_t BUFF_SIZE = 1024;
+  static constexpr size_t BUFF_SIZE = 1024;
   char buffer[BUFF_SIZE];
 
   printf_core::WriteBuffer<printf_core::WriteMode::FLUSH_TO_STREAM> wb(
-      buffer, BUFF_SIZE, &raw_write_hook, nullptr);
+      buffer, BUFF_SIZE, &stdout_write_hook, nullptr);
   printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb);
 
   int retval = printf_core::printf_main(&writer, format, args);

From 25781221d68a700eae679a19f701d4ad67e91dc9 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Mon, 16 Jun 2025 12:43:13 -0700
Subject: [PATCH 0543/1322] [instcombine] Delete dead transform for reverse of
 binop (#143967)

We canonicalize reverse to after a binop in foldVectorBinop, and
simplify reverse pairs in InstSimplify, so these elimination transforms
are redundant.
---
 .../InstCombine/InstCombineCalls.cpp          | 29 ++++---------------
 1 file changed, 5 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8c8cc0859e4a..03897117861f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3555,32 +3555,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     break;
   }
   case Intrinsic::vector_reverse: {
-    Value *BO0, *BO1, *X, *Y;
     Value *Vec = II->getArgOperand(0);
-    if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) {
-      auto *OldBinOp = cast<BinaryOperator>(Vec);
-      if (match(BO0, m_VecReverse(m_Value(X)))) {
-        // rev(binop rev(X), rev(Y)) --> binop X, Y
-        if (match(BO1, m_VecReverse(m_Value(Y))))
-          return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags(
-                                             OldBinOp->getOpcode(), X, Y,
-                                             OldBinOp, OldBinOp->getName(),
-                                             II->getIterator()));
-        // rev(binop rev(X), BO1Splat) --> binop X, BO1Splat
-        if (isSplatValue(BO1))
-          return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags(
-                                             OldBinOp->getOpcode(), X, BO1,
-                                             OldBinOp, OldBinOp->getName(),
-                                             II->getIterator()));
-      }
-      // rev(binop BO0Splat, rev(Y)) --> binop BO0Splat, Y
-      if (match(BO1, m_VecReverse(m_Value(Y))) && isSplatValue(BO0))
-        return replaceInstUsesWith(CI,
-                                   BinaryOperator::CreateWithCopiedFlags(
-                                       OldBinOp->getOpcode(), BO0, Y, OldBinOp,
-                                       OldBinOp->getName(), II->getIterator()));
-    }
+    // Note: We canonicalize reverse after binops, so we don't need a
+    // corresponding binop case here. TODO: Consider canonicalizing
+    // reverse after fneg?
+
     // rev(unop rev(X)) --> unop X
+    Value *X;
     if (match(Vec, m_OneUse(m_UnOp(m_VecReverse(m_Value(X)))))) {
       auto *OldUnOp = cast<UnaryOperator>(Vec);
       auto *NewUnOp = UnaryOperator::CreateWithCopiedFlags(

From b0378e7ca953c2390168f352c5a88fd325cde894 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Mon, 16 Jun 2025 12:55:12 -0700
Subject: [PATCH 0544/1322] [AArch64TargetParser]Fix
 reconstructFromParsedFeatures ignoring negative features (#142236)

The `targetFeatureToExtension` function used by
reconstructFromParsedFeatures only found positive `+FEATURE` strings,
but not negative `-FEATURE` strings. Extend the function to handle both
to fix `reconstructFromParsedFeatures`.
---
 .../CodeGen/aarch64-always-inline-feature-bug.c  |  8 ++++++++
 llvm/lib/TargetParser/AArch64TargetParser.cpp    |  5 +++--
 llvm/unittests/TargetParser/TargetParserTest.cpp | 16 ++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-always-inline-feature-bug.c

diff --git a/clang/test/CodeGen/aarch64-always-inline-feature-bug.c b/clang/test/CodeGen/aarch64-always-inline-feature-bug.c
new file mode 100644
index 000000000000..27c3983c66d2
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-always-inline-feature-bug.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple aarch64-- -target-feature +neon -target-feature +sve\
+// RUN:   -target-feature -sve -emit-llvm %s -o - | FileCheck %s
+
+// Reproducer for bug where clang would reject always_inline for unrelated
+// target features if they were disable with `-feature` on the command line.
+// CHECK: @bar
+__attribute__((always_inline)) __attribute__((target("neon"))) void foo() {}
+void bar() { foo(); }
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index e13c6e6d28c2..4a2523440f0f 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -60,7 +60,7 @@ uint64_t AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
   ExtensionSet FeatureBits;
   for (const StringRef Feature : Features) {
     std::optional<FMVInfo> FMV = parseFMVExtension(Feature);
-    if (!FMV) {
+    if (!FMV && Feature.starts_with('+')) {
       if (std::optional<ExtensionInfo> Info = targetFeatureToExtension(Feature))
         FMV = lookupFMVByID(Info->ID);
     }
@@ -181,7 +181,8 @@ std::optional<AArch64::FMVInfo> AArch64::parseFMVExtension(StringRef FMVExt) {
 std::optional<AArch64::ExtensionInfo>
 AArch64::targetFeatureToExtension(StringRef TargetFeature) {
   for (const auto &E : Extensions)
-    if (TargetFeature == E.PosTargetFeature)
+    if (TargetFeature == E.PosTargetFeature ||
+        TargetFeature == E.NegTargetFeature)
       return E;
   return {};
 }
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index f4c93334ac68..c4efb991ab6f 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1831,6 +1831,22 @@ TEST_P(AArch64ExtensionDependenciesBaseCPUTestFixture,
   }
 }
 
+TEST(TargetParserTest, testAArch64ReconstructFromParsedFeatures) {
+  AArch64::ExtensionSet Extensions;
+  std::vector<std::string> FeatureOptions = {
+      "-sve2", "-Baz", "+sve", "+FooBar", "+sve2", "+neon", "-sve",
+  };
+  std::vector<std::string> NonExtensions;
+  Extensions.reconstructFromParsedFeatures(FeatureOptions, NonExtensions);
+
+  std::vector<std::string> NonExtensionsExpected = {"-Baz", "+FooBar"};
+  ASSERT_THAT(NonExtensions, testing::ContainerEq(NonExtensionsExpected));
+  std::vector<StringRef> Features;
+  Extensions.toLLVMFeatureList(Features);
+  std::vector<StringRef> FeaturesExpected = {"+neon", "-sve", "+sve2"};
+  ASSERT_THAT(Features, testing::ContainerEq(FeaturesExpected));
+}
+
 AArch64ExtensionDependenciesBaseArchTestParams
     AArch64ExtensionDependenciesArchData[] = {
         // Base architecture features

From a637584fadb1f0b9a4fc526a2952345b14147634 Mon Sep 17 00:00:00 2001
From: DrSergei <serzhdruzhok@gmail.com>
Date: Mon, 16 Jun 2025 22:56:02 +0300
Subject: [PATCH 0545/1322] [lldb-dap] Add supported languages in package.json
 (#144414)

This patch fixes the [problem]. It was caused by missing supported
languages list in `package.json`. VSCode uses `guessDebugger` [function]
to find supported debuggers based on supported languages in case of
opened file. It uses `interestedInLanguage` [function][1] to do that, so
we should provide list of supported languages. Also, fixed typo in
`fortran`.

[problem]: https://github.com/llvm/llvm-project/issues/144239
[function]: https://github.com/microsoft/vscode/blob/main/src/vs/workbench/contrib/debug/browser/debugAdapterManager.ts#L344
[1]: https://github.com/microsoft/vscode/blob/main/src/vs/workbench/contrib/debug/common/debugger.ts#L171
---
 lldb/tools/lldb-dap/package.json | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index 0f51c4f935e3..b150dee792c3 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -290,7 +290,7 @@
         "language": "d"
       },
       {
-        "language": "fortan"
+        "language": "fortran"
       },
       {
         "language": "fortran-modern"
@@ -318,6 +318,22 @@
       {
         "type": "lldb-dap",
         "label": "LLDB DAP Debugger",
+        "languages": [
+          "ada",
+          "arm",
+          "c",
+          "cpp",
+          "crystal",
+          "d",
+          "fortran",
+          "fortran-modern",
+          "nim",
+          "objective-c",
+          "objectpascal",
+          "pascal",
+          "rust",
+          "swift"
+        ],
         "configurationAttributes": {
           "launch": {
             "required": [

From 8adccaee2a9e2d967ac54a783ffb71ac6ff79e85 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Mon, 16 Jun 2025 20:06:46 +0000
Subject: [PATCH 0546/1322] [libc] Implemented CharacterConverter push/pop for
 utf32->utf8 conversions (#143971)

Implemented CharacterConverter methods for conversion between utf32 ->
utf8
Added tests

---------

Co-authored-by: Michael Jones <michaelrj@google.com>
---
 libc/src/__support/wchar/CMakeLists.txt       |   9 +-
 .../__support/wchar/character_converter.cpp   |  70 ++++++-
 .../src/__support/wchar/character_converter.h |   8 +-
 libc/src/__support/wchar/mbstate.h            |   9 +
 libc/src/__support/wchar/utf_ret.h            |  24 ---
 libc/test/src/__support/CMakeLists.txt        |   6 +
 libc/test/src/__support/wchar/CMakeLists.txt  |  11 ++
 .../src/__support/wchar/utf32_to_8_test.cpp   | 180 ++++++++++++++++++
 8 files changed, 278 insertions(+), 39 deletions(-)
 delete mode 100644 libc/src/__support/wchar/utf_ret.h
 create mode 100644 libc/test/src/__support/wchar/CMakeLists.txt
 create mode 100644 libc/test/src/__support/wchar/utf32_to_8_test.cpp

diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 5cca58400ff4..6715e354e23e 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -15,12 +15,7 @@ add_object_library(
   DEPENDS
     libc.hdr.types.char8_t
     libc.hdr.types.char32_t
+    libc.src.__support.error_or
+    libc.src.__support.math_extras
     .mbstate
-    .utf_ret
-)
-
-add_header_library(
-  utf_ret
-  HDRS
-    utf_ret.h
 )
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index f09c7815a6cc..bac2f6d827e1 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,8 +8,10 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/math_extras.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
 
 #include "character_converter.h"
 
@@ -18,17 +20,75 @@ namespace internal {
 
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
+void CharacterConverter::clear() {
+  state->partial = 0;
+  state->bytes_processed = 0;
+  state->total_bytes = 0;
+}
+
 bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char32_t utf32) {
+  // we can't be partially through a conversion when pushing a utf32 value
+  if (!isComplete())
+    return -1;
 
-int CharacterConverter::push(char32_t utf32) {}
+  state->partial = utf32;
+  state->bytes_processed = 0;
 
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+  // determine number of utf-8 bytes needed to represent this utf32 value
+  constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
+  constexpr int NUM_RANGES = 4;
+  for (uint8_t i = 0; i < NUM_RANGES; i++) {
+    if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
+      state->total_bytes = i + 1;
+      return 0;
+    }
+  }
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+  // `utf32` contains a value that is too large to actually represent a valid
+  // unicode character
+  clear();
+  return -1;
+}
+
+ErrorOr<char8_t> CharacterConverter::pop_utf8() {
+  if (isComplete())
+    return Error(-1);
+
+  constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
+  constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
+
+  // the number of bits per utf-8 byte that actually encode character
+  // information not metadata (# of bits excluding the byte headers)
+  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+  constexpr int MASK_ENCODED_BITS =
+      mask_trailing_ones<unsigned int, ENCODED_BITS_PER_UTF8>();
+
+  char32_t output;
+
+  // Shift to get the next 6 bits from the utf32 encoding
+  const char32_t shift_amount =
+      (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
+  if (state->bytes_processed == 0) {
+    /*
+      Choose the correct set of most significant bits to encode the length
+      of the utf8 sequence. The remaining bits contain the most significant
+      bits of the unicode value of the character.
+    */
+    output = FIRST_BYTE_HEADERS[state->total_bytes - 1] |
+             (state->partial >> shift_amount);
+  } else {
+    // Get the next 6 bits and format it like so: 10xxxxxx
+    output = CONTINUING_BYTE_HEADER |
+             ((state->partial >> shift_amount) & MASK_ENCODED_BITS);
+  }
+
+  state->bytes_processed++;
+  return static_cast<char8_t>(output);
+}
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index d0602d2defe2..c4ba7cf6b689 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -11,8 +11,9 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
@@ -24,13 +25,14 @@ private:
 public:
   CharacterConverter(mbstate *mbstate);
 
+  void clear();
   bool isComplete();
 
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
 
-  utf_ret<char8_t> pop_utf8();
-  utf_ret<char32_t> pop_utf32();
+  ErrorOr<char8_t> pop_utf8();
+  ErrorOr<char32_t> pop_utf32();
 };
 
 } // namespace internal
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index d33ee354a544..fb08fb4eaa18 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -17,8 +17,17 @@ namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
 struct mbstate {
+  // store a partial codepoint (in UTF-32)
   char32_t partial;
+
+  /*
+  Progress towards a conversion
+    For utf8  -> utf32, increases with each CharacterConverter::push(utf8_byte)
+    For utf32 ->  utf8, increases with each CharacterConverter::pop_utf8()
+  */
   uint8_t bytes_processed;
+
+  // Total number of bytes that will be needed to represent this character
   uint8_t total_bytes;
 };
 
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
deleted file mode 100644
index fa99b76159bd..000000000000
--- a/libc/src/__support/wchar/utf_ret.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-
-#include "src/__support/common.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace internal {
-template <typename T> struct utf_ret {
-  T out;
-  int error;
-};
-
-} // namespace internal
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5c..76218a16e0cf 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,9 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
+
+# Requires access to uchar header which is not on macos
+# Therefore, cannot currently build this on macos in overlay mode
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+  add_subdirectory(wchar)
+endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 000000000000..5dff6e9115f7
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+  utf32_to_8_test
+  SUITE
+    libc-support-tests
+  SRCS
+    utf32_to_8_test.cpp
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
new file mode 100644
index 000000000000..f4c5cb863ff3
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -0,0 +1,180 @@
+//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // utf8 1-byte encodings are identical to their utf32 representations
+  char32_t utf32_A = 0x41; // 'A'
+  cr.push(utf32_A);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<char>(popped.value()), 'A');
+  ASSERT_TRUE(cr.isComplete());
+
+  char32_t utf32_B = 0x42; // 'B'
+  cr.push(utf32_B);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<char>(popped.value()), 'B');
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0xff -> utf8: 0xc3 0xbf
+  char32_t utf32 = 0xff;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
+  ASSERT_TRUE(cr.isComplete());
+
+  // testing utf32: 0x58e -> utf8: 0xd6 0x8e
+  utf32 = 0x58e;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
+  char32_t utf32 = 0xac15;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
+  ASSERT_TRUE(cr.isComplete());
+
+  // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
+  utf32 = 0x267b;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+  char32_t utf32 = 0x1f921;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
+  ASSERT_TRUE(cr.isComplete());
+
+  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+  utf32 = 0x12121;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+  char32_t utf32 = 0x12121;
+  ASSERT_EQ(cr.push(utf32), 0);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+
+  // can't push a utf32 without finishing popping the utf8 bytes out
+  int err = cr.push(utf32);
+  ASSERT_EQ(err, -1);
+}

From 1e60dd4f236dcca0215decc0e4885fb2dcdc1528 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 16 Jun 2025 13:09:57 -0700
Subject: [PATCH 0547/1322] [lldb] Fix a warning

This patch fixes:

  lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp:89:2:
  error: extra ';' outside of a function is incompatible with C++98
  [-Werror,-Wc++98-compat-extra-semi]
---
 lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
index 1a76371be2d5..9295b6ceae36 100644
--- a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
@@ -86,6 +86,6 @@ StepInTargetsRequestHandler::Run(const StepInTargetsArguments &args) const {
     }
   }
   return body;
-};
+}
 
 } // namespace lldb_dap

From d3bc834ece48cb993fcabcf20311bdcc9e591a21 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 16 Jun 2025 21:10:11 +0100
Subject: [PATCH 0548/1322] [LV] Update check to find epilogue resume value to
 check all incoming.

This fixes a crash where all incoming values for the epilogue resume
value are zero, because there are no remaining iterations to execute for
the epilogue loop.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   5 +-
 ...sve-epilog-vect-no-remaining-iterations.ll | 146 ++++++++++++++++++
 2 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bd0a2ec3986d..f1470fd1f731 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9765,7 +9765,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
                 match(
                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
                     m_SpecificInt(0)) &&
-                is_contained(P.incoming_values(), EPI.VectorTripCount))
+                all_of(P.incoming_values(), [&EPI](Value *Inc) {
+                  return Inc == EPI.VectorTripCount ||
+                         match(Inc, m_SpecificInt(0));
+                }))
               return &P;
             return nullptr;
           });
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll
new file mode 100644
index 000000000000..f8551d774de4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 {
+; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ITER_CHECK:.*]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:    br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <64 x i8> [[WIDE_VEC2]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[STRIDED_VEC3]] to <16 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP0]], <16 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <16 x i32> [[TMP10]] to <16 x i64>
+; CHECK-NEXT:    [[TMP17]] = or <16 x i64> [[VEC_PHI1]], [[TMP15]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP17]])
+; CHECK-NEXT:    br label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP14]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK:       [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP31]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 16, [[TMP16]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP32]], i64 [[TMP16]], i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 16, [[TMP36]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
+; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
+; CHECK-NEXT:    [[TMP24:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = mul <vscale x 2 x i64> [[TMP24]], splat (i64 1)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP25]]
+; CHECK-NEXT:    [[TMP37:%.*]] = mul i64 1, [[TMP20]]
+; CHECK-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP37]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 2 x i64> [ [[TMP21]], %[[VEC_EPILOG_PH]] ], [ [[TMP34:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP38]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP22]], <vscale x 2 x i32> [[TMP28]])
+; CHECK-NEXT:    [[TMP39:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX6]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-NEXT:    store <vscale x 2 x i8> zeroinitializer, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = zext <vscale x 2 x i32> [[TMP39]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP34]] = or <vscale x 2 x i64> [[VEC_PHI6]], [[TMP33]]
+; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], [[TMP20]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT5]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP34]])
+; CHECK-NEXT:    br label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL8:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i64 [ [[TMP30]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL8]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3
+; CHECK-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1
+; CHECK-NEXT:    [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; CHECK-NEXT:    [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
+; CHECK-NEXT:    [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]])
+; CHECK-NEXT:    [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
+; CHECK-NEXT:    [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]])
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; CHECK-NEXT:    [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64
+; CHECK-NEXT:    [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i64 [[RED_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+  %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3
+  %l = load i8, ptr %gep.src.i.i, align 1
+  %l.ext = zext i8 %l to i32
+  %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false)
+  %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext)
+  %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false)
+  %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0)
+  %gep.dst = getelementptr inbounds i8, ptr  %dst, i64 %iv
+  store i8 0, ptr %gep.dst, align 1
+  %min.ext = zext i32 %min.1 to i64
+  %red.next = or i64 %red, %min.ext
+  %iv.next = add i64 %iv, 1
+  %exitcond.not.i.i = icmp eq i64 %iv.next, 16
+  br i1 %exitcond.not.i.i, label %exit, label %loop
+
+exit:
+  ret i64 %red.next
+}
+
+declare i32 @llvm.umin.i32(i32, i32)
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+attributes #0 = { "target-cpu"="neoverse-512tvb" }

From 34be09ad731d631d7b950a334cfe25673ebe5519 Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Mon, 16 Jun 2025 21:18:21 +0100
Subject: [PATCH 0549/1322] [lldb-dap][test] fix not supported error. (#144419)

Fixes #144072

buildbot error.
---
 .../tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
index 51ccf2ccbdca..03b79a805d34 100644
--- a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
+++ b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
@@ -112,7 +112,13 @@ class TestDAP_stepInTargets(lldbdap_testcase.DAPTestCaseBase):
             len(breakpoint_ids), len(bp_lines), "expect correct number of breakpoints"
         )
         self.continue_to_breakpoints(breakpoint_ids)
-        is_supported = self.dap_server.get_capability("supportsStepInTargetsRequest")
+
+        try:
+            is_supported = self.dap_server.get_capability(
+                "supportsStepInTargetsRequest"
+            )
+        except dap_server.NotSupportedError:
+            is_supported = False
 
         self.assertEqual(
             is_supported,

From a027eb4472ee8fa504c98bef655cac8c8bfe333a Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Mon, 16 Jun 2025 16:44:55 -0400
Subject: [PATCH 0550/1322] [HLSL] Use hidden visibility for external linkage.
 (#140292)

Implements

https://github.com/llvm/wg-hlsl/blob/main/proposals/0026-symbol-visibility.md.

The change is to stop using the `hlsl.export` attribute. Instead,
symbols with "program linkage" in HLSL will have export linkage with
default visibility, and symbols with "external linkage" in HLSL will
have export linkage with hidden visibility.
---
 clang/lib/CodeGen/CGHLSLRuntime.cpp           |  8 --
 clang/lib/CodeGen/CodeGenFunction.cpp         |  1 -
 clang/lib/CodeGen/CodeGenModule.cpp           |  5 +
 clang/test/CodeGenHLSL/ArrayAssignable.hlsl   | 30 +++---
 clang/test/CodeGenHLSL/ArrayTemporary.hlsl    | 12 +--
 .../BasicFeatures/ArrayOutputArguments.hlsl   | 14 +--
 .../CodeGenHLSL/BasicFeatures/InitLists.hlsl  | 36 ++++----
 .../BasicFeatures/OutputArguments.hlsl        | 16 ++--
 clang/test/CodeGenHLSL/Bool.hlsl              |  2 +-
 clang/test/CodeGenHLSL/BoolVector.hlsl        | 14 +--
 .../CodeGenHLSL/GlobalConstructorLib.hlsl     |  2 +-
 clang/test/CodeGenHLSL/basic_types.hlsl       | 64 ++++++-------
 .../test/CodeGenHLSL/builtins/AddUint64.hlsl  |  4 +-
 .../ByteAddressBuffers-constructors.hlsl      | 14 +--
 .../GroupMemoryBarrierWithGroupSync.hlsl      |  8 +-
 .../builtins/RWBuffer-constructor.hlsl        | 12 +--
 .../CodeGenHLSL/builtins/ScalarSwizzles.hlsl  |  2 +-
 .../StructuredBuffers-constructors.hlsl       | 12 +--
 clang/test/CodeGenHLSL/builtins/abs.hlsl      | 56 +++++------
 clang/test/CodeGenHLSL/builtins/all.hlsl      |  8 +-
 clang/test/CodeGenHLSL/builtins/and.hlsl      | 12 +--
 clang/test/CodeGenHLSL/builtins/any.hlsl      |  8 +-
 .../CodeGenHLSL/builtins/ceil-overloads.hlsl  | 40 ++++----
 clang/test/CodeGenHLSL/builtins/ceil.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/clamp-overloads.hlsl |  8 +-
 clang/test/CodeGenHLSL/builtins/clamp.hlsl    |  8 +-
 .../CodeGenHLSL/builtins/clip-builtin.hlsl    |  2 +-
 clang/test/CodeGenHLSL/builtins/clip.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/cos-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/cos.hlsl      | 24 ++---
 clang/test/CodeGenHLSL/builtins/cross.hlsl    |  8 +-
 .../builtins/degrees-overloads.hlsl           |  4 +-
 clang/test/CodeGenHLSL/builtins/degrees.hlsl  |  8 +-
 clang/test/CodeGenHLSL/builtins/distance.hlsl | 32 +++----
 .../CodeGenHLSL/builtins/exp-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/exp.hlsl      | 24 ++---
 .../CodeGenHLSL/builtins/exp2-overloads.hlsl  | 40 ++++----
 clang/test/CodeGenHLSL/builtins/exp2.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/floor-overloads.hlsl | 40 ++++----
 clang/test/CodeGenHLSL/builtins/floor.hlsl    | 24 ++---
 clang/test/CodeGenHLSL/builtins/fmod.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/frac-overloads.hlsl  |  4 +-
 clang/test/CodeGenHLSL/builtins/frac.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/hlsl_resource_t.hlsl | 14 +--
 .../CodeGenHLSL/builtins/isinf-overloads.hlsl |  8 +-
 clang/test/CodeGenHLSL/builtins/isinf.hlsl    | 16 ++--
 clang/test/CodeGenHLSL/builtins/ldexp.hlsl    | 16 ++--
 clang/test/CodeGenHLSL/builtins/length.hlsl   | 45 ++++-----
 .../CodeGenHLSL/builtins/lerp-overloads.hlsl  |  8 +-
 .../CodeGenHLSL/builtins/log-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/log.hlsl      | 24 ++---
 .../CodeGenHLSL/builtins/log10-overloads.hlsl | 40 ++++----
 clang/test/CodeGenHLSL/builtins/log10.hlsl    | 24 ++---
 .../CodeGenHLSL/builtins/log2-overloads.hlsl  | 40 ++++----
 clang/test/CodeGenHLSL/builtins/log2.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/max-overloads.hlsl   | 22 ++---
 clang/test/CodeGenHLSL/builtins/max.hlsl      | 80 ++++++++--------
 .../CodeGenHLSL/builtins/min-overloads.hlsl   | 22 ++---
 clang/test/CodeGenHLSL/builtins/min.hlsl      | 82 ++++++++---------
 .../builtins/normalize-overloads.hlsl         |  4 +-
 .../test/CodeGenHLSL/builtins/normalize.hlsl  |  8 +-
 clang/test/CodeGenHLSL/builtins/or.hlsl       | 14 +--
 .../CodeGenHLSL/builtins/pow-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/pow.hlsl      | 24 ++---
 .../builtins/radians-overloads.hlsl           |  4 +-
 clang/test/CodeGenHLSL/builtins/radians.hlsl  |  8 +-
 clang/test/CodeGenHLSL/builtins/rcp.hlsl      | 64 ++++++-------
 clang/test/CodeGenHLSL/builtins/reflect.hlsl  | 32 +++----
 .../CodeGenHLSL/builtins/reversebits.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/round-overloads.hlsl | 40 ++++----
 clang/test/CodeGenHLSL/builtins/round.hlsl    | 24 ++---
 .../CodeGenHLSL/builtins/rsqrt-overloads.hlsl |  4 +-
 clang/test/CodeGenHLSL/builtins/rsqrt.hlsl    |  8 +-
 clang/test/CodeGenHLSL/builtins/sign.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/sin-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/sin.hlsl      | 24 ++---
 .../test/CodeGenHLSL/builtins/smoothstep.hlsl | 32 +++----
 .../CodeGenHLSL/builtins/splitdouble.hlsl     | 10 +-
 .../CodeGenHLSL/builtins/sqrt-overloads.hlsl  | 40 ++++----
 clang/test/CodeGenHLSL/builtins/sqrt.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/step-overloads.hlsl  |  4 +-
 clang/test/CodeGenHLSL/builtins/step.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/trunc-overloads.hlsl | 40 ++++----
 clang/test/CodeGenHLSL/builtins/trunc.hlsl    | 24 ++---
 .../wave_get_lane_index_do_while.hlsl         |  2 +-
 .../builtins/wave_get_lane_index_simple.hlsl  |  4 +-
 .../builtins/wave_get_lane_index_subcall.hlsl |  4 +-
 clang/test/CodeGenHLSL/cbuffer.hlsl           | 92 +++++++++----------
 .../CodeGenHLSL/cbuffer_and_namespaces.hlsl   |  8 +-
 .../CodeGenHLSL/cbuffer_with_packoffset.hlsl  | 10 +-
 ...uffer_with_static_global_and_function.hlsl |  2 +-
 .../CodeGenHLSL/convergence/do.while.hlsl     | 10 +-
 clang/test/CodeGenHLSL/convergence/for.hlsl   | 14 +--
 clang/test/CodeGenHLSL/convergence/while.hlsl | 12 +--
 clang/test/CodeGenHLSL/default_cbuffer.hlsl   | 12 +--
 .../default_cbuffer_with_layout.hlsl          | 12 +--
 clang/test/CodeGenHLSL/export.hlsl            | 10 +-
 clang/test/CodeGenHLSL/group_shared.hlsl      |  2 +-
 .../implicit-norecurse-attrib.hlsl            | 11 +--
 clang/test/CodeGenHLSL/inline-functions.hlsl  | 17 ++--
 .../CodeGenHLSL/inline-spirv/SpirvType.hlsl   |  4 +-
 clang/test/CodeGenHLSL/no_int_promotion.hlsl  | 14 +--
 .../test/CodeGenHLSL/out-of-line-static.hlsl  |  4 +-
 clang/test/CodeGenHLSL/shift-mask.hlsl        | 16 ++--
 .../CodeGenHLSL/this-assignment-overload.hlsl |  4 +-
 clang/test/CodeGenHLSL/vk-input-builtin.hlsl  |  2 +-
 .../enable_16bit_types_validation_spirv.hlsl  |  2 +-
 .../Target/DirectX/DXILFinalizeLinkage.cpp    |  4 +-
 llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp   |  3 +-
 .../Target/SPIRV/SPIRVInstructionSelector.cpp |  3 +-
 .../finalize-linkage-remove-dead-lib.ll       | 77 ++++++----------
 .../DirectX/finalize-linkage-remove-dead.ll   | 46 +++++-----
 llvm/test/CodeGen/DirectX/finalize_linkage.ll | 25 +++--
 113 files changed, 1101 insertions(+), 1140 deletions(-)

diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 571ff53b7d64..585411bc59e1 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -471,14 +471,6 @@ void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD,
   }
 }
 
-void CGHLSLRuntime::setHLSLFunctionAttributes(const FunctionDecl *FD,
-                                              llvm::Function *Fn) {
-  if (FD->isInExportDeclContext()) {
-    const StringRef ExportAttrKindStr = "hlsl.export";
-    Fn->addFnAttr(ExportAttrKindStr);
-  }
-}
-
 static void gatherFunctions(SmallVectorImpl<Function *> &Fns, llvm::Module &M,
                             bool CtorOrDtor) {
   const auto *GV =
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 13d0633e9b1c..70a09795d02f 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1267,7 +1267,6 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
     if (FD->hasAttr<HLSLShaderAttr>()) {
       CGM.getHLSLRuntime().emitEntryFunction(FD, Fn);
     }
-    CGM.getHLSLRuntime().setHLSLFunctionAttributes(FD, Fn);
   }
 
   EmitFunctionProlog(*CurFnInfo, CurFn, Args);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c036902b0b13..06c0e1f8afe1 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1666,6 +1666,11 @@ void CodeGenModule::setGlobalVisibility(llvm::GlobalValue *GV,
     return;
   }
 
+  if (Context.getLangOpts().HLSL && !D->isInExportDeclContext()) {
+    GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
+    return;
+  }
+
   if (GV->hasDLLExportStorageClass() || GV->hasDLLImportStorageClass()) {
     // Reject incompatible dlllstorage and visibility annotations.
     if (!LV.isVisibilityExplicit())
diff --git a/clang/test/CodeGenHLSL/ArrayAssignable.hlsl b/clang/test/CodeGenHLSL/ArrayAssignable.hlsl
index c3204570d6ef..aaa486eff10b 100644
--- a/clang/test/CodeGenHLSL/ArrayAssignable.hlsl
+++ b/clang/test/CodeGenHLSL/ArrayAssignable.hlsl
@@ -7,10 +7,10 @@ struct S {
 
 // CHECK: [[CBLayout:%.*]] = type <{ [2 x float], [2 x <4 x i32>], [2 x [2 x i32]], [1 x target("dx.Layout", %S, 8, 0, 4)] }>
 // CHECK: @CBArrays.cb = global target("dx.CBuffer", target("dx.Layout", [[CBLayout]], 136, 0, 32, 64, 128))
-// CHECK: @c1 = external addrspace(2) global [2 x float], align 4
-// CHECK: @c2 = external addrspace(2) global [2 x <4 x i32>], align 16
-// CHECK: @c3 = external addrspace(2) global [2 x [2 x i32]], align 4
-// CHECK: @c4 = external addrspace(2) global [1 x target("dx.Layout", %S, 8, 0, 4)], align 1
+// CHECK: @c1 = external hidden addrspace(2) global [2 x float], align 4
+// CHECK: @c2 = external hidden addrspace(2) global [2 x <4 x i32>], align 16
+// CHECK: @c3 = external hidden addrspace(2) global [2 x [2 x i32]], align 4
+// CHECK: @c4 = external hidden addrspace(2) global [1 x target("dx.Layout", %S, 8, 0, 4)], align 1
 
 cbuffer CBArrays : register(b0) {
   float c1[2];
@@ -19,7 +19,7 @@ cbuffer CBArrays : register(b0) {
   S c4[1];
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign1
+// CHECK-LABEL: define hidden void {{.*}}arr_assign1
 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4
 // CHECK-NOT: alloca
@@ -33,7 +33,7 @@ void arr_assign1() {
   Arr = Arr2;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign2
+// CHECK-LABEL: define hidden void {{.*}}arr_assign2
 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr3:%.*]] = alloca [2 x i32], align 4
@@ -51,7 +51,7 @@ void arr_assign2() {
   Arr = Arr2 = Arr3;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign3
+// CHECK-LABEL: define hidden void {{.*}}arr_assign3
 // CHECK: [[Arr3:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NEXT: [[Arr4:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NOT: alloca
@@ -65,7 +65,7 @@ void arr_assign3() {
   Arr2 = Arr3;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign4
+// CHECK-LABEL: define hidden void {{.*}}arr_assign4
 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4
 // CHECK-NOT: alloca
@@ -81,7 +81,7 @@ void arr_assign4() {
   (Arr = Arr2)[0] = 6;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign5
+// CHECK-LABEL: define hidden void {{.*}}arr_assign5
 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr3:%.*]] = alloca [2 x i32], align 4
@@ -101,7 +101,7 @@ void arr_assign5() {
   (Arr = Arr2 = Arr3)[0] = 6;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign6
+// CHECK-LABEL: define hidden void {{.*}}arr_assign6
 // CHECK: [[Arr3:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NEXT: [[Arr4:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NOT: alloca
@@ -118,7 +118,7 @@ void arr_assign6() {
   (Arr = Arr2)[0][0] = 6;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign7
+// CHECK-LABEL: define hidden void {{.*}}arr_assign7
 // CHECK: [[Arr:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NOT: alloca
@@ -138,7 +138,7 @@ void arr_assign7() {
 
 // Verify you can assign from a cbuffer array
 
-// CHECK-LABEL: define void {{.*}}arr_assign8
+// CHECK-LABEL: define hidden void {{.*}}arr_assign8
 // CHECK: [[C:%.*]] = alloca [2 x float], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[C]], ptr align 4 {{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p2.i32(ptr align 4 [[C]], ptr addrspace(2) align 4 @c1, i32 8, i1 false)
@@ -148,7 +148,7 @@ void arr_assign8() {
   C = c1;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign9
+// CHECK-LABEL: define hidden void {{.*}}arr_assign9
 // CHECK: [[C:%.*]] = alloca [2 x <4 x i32>], align 16
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[C]], ptr align 16 {{.*}}, i32 32, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p2.i32(ptr align 16 [[C]], ptr addrspace(2) align 16 @c2, i32 32, i1 false)
@@ -158,7 +158,7 @@ void arr_assign9() {
   C = c2;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign10
+// CHECK-LABEL: define hidden void {{.*}}arr_assign10
 // CHECK: [[C:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[C]], ptr align 4 {{.*}}, i32 16, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p2.i32(ptr align 4 [[C]], ptr addrspace(2) align 4 @c3, i32 16, i1 false)
@@ -168,7 +168,7 @@ void arr_assign10() {
   C = c3;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign11
+// CHECK-LABEL: define hidden void {{.*}}arr_assign11
 // CHECK: [[C:%.*]] = alloca [1 x %struct.S], align 1
 // CHECK: call void @llvm.memcpy.p0.p2.i32(ptr align 1 [[C]], ptr addrspace(2) align 1 @c4, i32 8, i1 false)
 // CHECK-NEXT: ret void
diff --git a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl
index 29ea896045bb..42a469ae8795 100644
--- a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl
+++ b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl
@@ -3,7 +3,7 @@
 
 void fn(float x[2]) { }
 
-// CHECK-LABEL: define void {{.*}}call{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}call{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [2 x float]
 // CHECK: [[Tmp:%.*]] = alloca [2 x float]
 // CHECK: call void @llvm.memset.p0.i32(ptr align 4 [[Arr]], i8 0, i32 8, i1 false)
@@ -21,7 +21,7 @@ struct Obj {
 
 void fn2(Obj O[4]) { }
 
-// CHECK-LABEL: define void {{.*}}call2{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}call2{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [4 x %struct.Obj]
 // CHECK: [[Tmp:%.*]] = alloca [4 x %struct.Obj]
 // CHECK: call void @llvm.memset.p0.i32(ptr align 1 [[Arr]], i8 0, i32 32, i1 false)
@@ -35,7 +35,7 @@ void call2() {
 
 void fn3(float x[2][2]) { }
 
-// CHECK-LABEL: define void {{.*}}call3{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}call3{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [2 x [2 x float]]
 // CHECK: [[Tmp:%.*]] = alloca [2 x [2 x float]]
 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Arr]], ptr align 4 {{.*}}, i32 16, i1 false)
@@ -46,7 +46,7 @@ void call3() {
   fn3(Arr);
 }
 
-// CHECK-LABEL: define void {{.*}}call4{{.*}}(ptr
+// CHECK-LABEL: define hidden void {{.*}}call4{{.*}}(ptr
 // CHECK-SAME: noundef byval([2 x [2 x float]]) align 4 [[Arr:%.*]])
 // CHECK: [[Tmp:%.*]] = alloca [2 x [2 x float]]
 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[Arr]], i32 16, i1 false)
@@ -59,7 +59,7 @@ void call4(float Arr[2][2]) {
 // Verify that each template instantiation codegens to a unique and correctly
 // mangled function name.
 
-// CHECK-LABEL: define void {{.*}}template_call{{.*}}(ptr
+// CHECK-LABEL: define hidden void {{.*}}template_call{{.*}}(ptr
 
 // CHECK-SAME: noundef byval([2 x float]) align 4 [[FA2:%[0-9A-Z]+]],
 // CHECK-SAME: ptr noundef byval([4 x float]) align 4 [[FA4:%[0-9A-Z]+]],
@@ -86,7 +86,7 @@ void template_call(float FA2[2], float FA4[4], int IA3[3]) {
 
 
 // Verify that Array parameter element access correctly codegens.
-// CHECK-LABEL: define void {{.*}}element_access{{.*}}(ptr
+// CHECK-LABEL: define hidden void {{.*}}element_access{{.*}}(ptr
 // CHECK-SAME: noundef byval([2 x float]) align 4 [[FA2:%[0-9A-Z]+]]
 
 // CHECK: [[Addr:%.*]] = getelementptr inbounds [2 x float], ptr [[FA2]], i32 0, i32 0
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl
index eb7d755bca61..bccfaf597f0e 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl
@@ -11,7 +11,7 @@ void increment(inout int Arr[2]) {
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @{{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
@@ -32,7 +32,7 @@ void fn2(out int Arr[2]) {
 // CHECK: [[A:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}fn2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @{{.*}}fn2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
@@ -56,7 +56,7 @@ void nestedCall(inout int Arr[2], uint index) {
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}nestedCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]], i32 noundef 0) #3
+// CHECK-NEXT: call void @{{.*}}nestedCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]], i32 noundef 0)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 1
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
@@ -70,7 +70,7 @@ export int arrayCall3() {
 // CHECK-LABEL: outerCall
 // CHECK: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 %{{.*}}, i32 8, i1 false)
-// CHECK-NEXT: call void {{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void {{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 {{.*}}, ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: ret void
 void outerCall(inout int Arr[2]) {
@@ -82,7 +82,7 @@ void outerCall(inout int Arr[2]) {
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}outerCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @{{.*}}outerCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
@@ -99,7 +99,7 @@ void fn3(int Arr[2]) {}
 // CHECK-LABEL: outerCall2
 // CHECK: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 {{.*}}, i32 8, i1 false)
-// CHECK-NEXT: call void {{.*}}fn3{{.*}}(ptr noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void {{.*}}fn3{{.*}}(ptr noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: ret void
 void outerCall2(inout int Arr[2]) {
   fn3(Arr);
@@ -110,7 +110,7 @@ void outerCall2(inout int Arr[2]) {
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}outerCall2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @{{.*}}outerCall2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl
index 371f31c9e4af..c30c640519cd 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl
@@ -46,7 +46,7 @@ struct SlicyBits {
 };
 
 // Case 1: Extraneous braces get ignored in literal instantiation.
-// CHECK-LABEL: define void @_Z5case1v(
+// CHECK-LABEL: define hidden void @_Z5case1v(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_RESULT]], ptr align 1 @__const._Z5case1v.TF1, i32 8, i1 false)
@@ -58,7 +58,7 @@ TwoFloats case1() {
 }
 
 // Case 2: Valid C/C++ initializer is handled appropriately.
-// CHECK-LABEL: define void @_Z5case2v(
+// CHECK-LABEL: define hidden void @_Z5case2v(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_RESULT]], ptr align 1 @__const._Z5case2v.TF2, i32 8, i1 false)
@@ -70,7 +70,7 @@ TwoFloats case2() {
 }
 
 // Case 3: Simple initialization with conversion of an argument.
-// CHECK-LABEL: define void @_Z5case3i(
+// CHECK-LABEL: define hidden void @_Z5case3i(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], i32 noundef [[VAL:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca i32, align 4
@@ -90,7 +90,7 @@ TwoFloats case3(int Val) {
 
 // Case 4: Initialization from a scalarized vector into a structure with element
 // conversions.
-// CHECK-LABEL: define void @_Z5case4Dv2_i(
+// CHECK-LABEL: define hidden void @_Z5case4Dv2_i(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], <2 x i32> noundef [[TWOVALS:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TWOVALS_ADDR:%.*]] = alloca <2 x i32>, align 8
@@ -113,7 +113,7 @@ TwoFloats case4(int2 TwoVals) {
 }
 
 // Case 5: Initialization from a scalarized vector of matching type.
-// CHECK-LABEL: define void @_Z5case5Dv2_i(
+// CHECK-LABEL: define hidden void @_Z5case5Dv2_i(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 1 [[AGG_RESULT:%.*]], <2 x i32> noundef [[TWOVALS:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TWOVALS_ADDR:%.*]] = alloca <2 x i32>, align 8
@@ -135,7 +135,7 @@ TwoInts case5(int2 TwoVals) {
 
 // Case 6: Initialization from a scalarized structure of different type with
 // different element types.
-// CHECK-LABEL: define void @_Z5case69TwoFloats(
+// CHECK-LABEL: define hidden void @_Z5case69TwoFloats(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 1 [[TF4:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -157,7 +157,7 @@ TwoInts case6(TwoFloats TF4) {
 
 // Case 7: Initialization of a complex structure, with bogus braces and element
 // conversions from a collection of scalar values, and structures.
-// CHECK-LABEL: define void @_Z5case77TwoIntsS_i9TwoFloatsS0_S0_S0_(
+// CHECK-LABEL: define hidden void @_Z5case77TwoIntsS_i9TwoFloatsS0_S0_S0_(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_DOGGO:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOINTS:%.*]]) align 1 [[TI1:%.*]], ptr noundef byval([[STRUCT_TWOINTS]]) align 1 [[TI2:%.*]], i32 noundef [[VAL:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 1 [[TF1:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF2:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF3:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF4:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca i32, align 4
@@ -221,7 +221,7 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2,
 
 // Case 8: Initialization of a structure from a different structure with
 // significantly different element types and grouping.
-// CHECK-LABEL: define void @_Z5case85Doggo(
+// CHECK-LABEL: define hidden void @_Z5case85Doggo(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_ANIMALBITS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_DOGGO:%.*]]) align 1 [[D1:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LEGS:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -307,7 +307,7 @@ AnimalBits case8(Doggo D1) {
 // Case 9: Everything everywhere all at once... Initializing mismatched
 // structures from different layouts, different component groupings, with no
 // top-level bracing separation.
-// CHECK-LABEL: define void @_Z5case95Doggo10AnimalBits(
+// CHECK-LABEL: define hidden void @_Z5case95Doggo10AnimalBits(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_ZOO:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_DOGGO:%.*]]) align 1 [[D1:%.*]], ptr noundef byval([[STRUCT_ANIMALBITS:%.*]]) align 1 [[A1:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[DOGS:%.*]] = getelementptr inbounds nuw [[STRUCT_ZOO]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -723,7 +723,7 @@ Zoo case9(Doggo D1, AnimalBits A1) {
 }
 
 // Case 10: Initialize an object with a base class from two objects.
-// CHECK-LABEL: define void @_Z6case109TwoFloatsS_(
+// CHECK-LABEL: define hidden void @_Z6case109TwoFloatsS_(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 1 [[TF1:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF2:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -750,7 +750,7 @@ FourFloats case10(TwoFloats TF1, TwoFloats TF2) {
 }
 
 // Case 11: Initialize an object with a base class from a vector splat.
-// CHECK-LABEL: define void @_Z6case11f(
+// CHECK-LABEL: define hidden void @_Z6case11f(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], float noundef nofpclass(nan inf) [[F:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[F_ADDR:%.*]] = alloca float, align 4
@@ -799,7 +799,7 @@ FourFloats case11(float F) {
 }
 
 // Case 12: Initialize bitfield from two integers.
-// CHECK-LABEL: define void @_Z6case12ii(
+// CHECK-LABEL: define hidden void @_Z6case12ii(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_SLICYBITS:%.*]]) align 1 [[AGG_RESULT:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4
@@ -821,7 +821,7 @@ SlicyBits case12(int I, int J) {
 }
 
 // Case 13: Initialize bitfield from a struct of two ints.
-// CHECK-LABEL: define void @_Z6case137TwoInts(
+// CHECK-LABEL: define hidden void @_Z6case137TwoInts(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_SLICYBITS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOINTS:%.*]]) align 1 [[TI:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[TI]], i32 0, i32 0
@@ -841,7 +841,7 @@ SlicyBits case13(TwoInts TI) {
 }
 
 // Case 14: Initialize struct of ints from struct with bitfields.
-// CHECK-LABEL: define void @_Z6case149SlicyBits(
+// CHECK-LABEL: define hidden void @_Z6case149SlicyBits(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_SLICYBITS:%.*]]) align 1 [[SB:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -861,7 +861,7 @@ TwoInts case14(SlicyBits SB) {
 }
 
 // Case 15: Initialize struct of floats from struct with bitfields.
-// CHECK-LABEL: define void @_Z6case159SlicyBits(
+// CHECK-LABEL: define hidden void @_Z6case159SlicyBits(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_SLICYBITS:%.*]]) align 1 [[SB:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -884,7 +884,7 @@ TwoFloats case15(SlicyBits SB) {
 
 // Case 16: Side-effecting initialization list arguments. The important thing
 // here is that case16 only has _one_ call to makeTwo.
-// CHECK-LABEL: define void @_Z7makeTwoRf(
+// CHECK-LABEL: define hidden void @_Z7makeTwoRf(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noalias noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 4
@@ -910,7 +910,7 @@ TwoFloats makeTwo(inout float X) {
     return TF;
 }
 
-// CHECK-LABEL: define void @_Z6case16v(
+// CHECK-LABEL: define hidden void @_Z6case16v(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X:%.*]] = alloca float, align 4
@@ -948,7 +948,7 @@ int case17Helper(int x) {
 }
 
 // InitList with OpaqueValueExpr
-// CHECK-LABEL: define void {{.*}}case17
+// CHECK-LABEL: define hidden void {{.*}}case17
 // CHECK: [[X:%.*]] = alloca <2 x i32>, align 8
 // CHECK-NEXT: [[C:%.*]] = call noundef i32 {{.*}}case17Helper{{.*}}(i32 noundef 0)
 // CHECK-NEXT: [[C1:%.*]] = call noundef i32 {{.*}}case17Helper{{.*}}(i32 noundef 1)
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl
index 1f45a7f9b46d..d0ba8f447b73 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl
@@ -6,7 +6,7 @@
 // integer. It is converted to an integer on call and converted back after the
 // function.
 
-// CHECK: define void {{.*}}trunc_Param{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
+// CHECK: define hidden void {{.*}}trunc_Param{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
 void trunc_Param(inout int X) {}
 
 // ALL-LABEL: define noundef nofpclass(nan inf) float {{.*}}case1
@@ -32,7 +32,7 @@ export float case1(float F) {
 // uninitialized in the function. If they are not initialized before the
 // function returns the value is undefined.
 
-// CHECK: define void {{.*}}undef{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
+// CHECK: define hidden void {{.*}}undef{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
 void undef(out int Z) { }
 
 // ALL-LABEL: define noundef i32 {{.*}}case2
@@ -54,7 +54,7 @@ export int case2() {
 // This test should verify that an out parameter value is written to as
 // expected.
 
-// CHECK: define void {{.*}}zero{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
+// CHECK: define hidden void {{.*}}zero{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
 void zero(out int Z) { Z = 0; }
 
 // ALL-LABEL: define noundef i32 {{.*}}case3
@@ -76,7 +76,7 @@ export int case3() {
 // Vector swizzles in HLSL produce lvalues, so they can be used as arguments to
 // inout parameters and the swizzle is reversed on writeback.
 
-// CHECK: define void {{.*}}funky{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}})
+// CHECK: define hidden void {{.*}}funky{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}})
 void funky(inout int3 X) {
   X.x += 1;
   X.y += 2;
@@ -116,7 +116,7 @@ export int3 case4() {
 
 // Case 5: Straightforward inout of a scalar value.
 
-// CHECK: define void {{.*}}increment{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
+// CHECK: define hidden void {{.*}}increment{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
 void increment(inout int I) {
   I += 1;
 }
@@ -144,7 +144,7 @@ struct S {
   float Y;
 };
 
-// CHECK: define void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}})
+// CHECK: define hidden void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}})
 void init(out S s) {
   s.X = 3;
   s.Y = 4;
@@ -170,7 +170,7 @@ struct R {
   float Y;
 };
 
-// CHECK: define void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}})
+// CHECK: define hidden void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}})
 void init(inout R s) {
   s.X = 3;
   s.Y = 4;
@@ -194,7 +194,7 @@ export int case7() {
 
 // Case 8: Non-scalars with a cast expression.
 
-// CHECK: define void {{.*}}trunc_vec{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}})
+// CHECK: define hidden void {{.*}}trunc_vec{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}})
 void trunc_vec(inout int3 V) {}
 
 // ALL-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}case8
diff --git a/clang/test/CodeGenHLSL/Bool.hlsl b/clang/test/CodeGenHLSL/Bool.hlsl
index fb0f32b11241..21328c1f9d4d 100644
--- a/clang/test/CodeGenHLSL/Bool.hlsl
+++ b/clang/test/CodeGenHLSL/Bool.hlsl
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 
-// CHECK-LABEL: define noundef i1 {{.*}}fn{{.*}}(i1 noundef %x)
+// CHECK-LABEL: define hidden noundef i1 {{.*}}fn{{.*}}(i1 noundef %x)
 // CHECK: [[X:%.*]] = alloca i32, align 4
 // CHECK-NEXT: [[Y:%.*]] = zext i1 {{%.*}} to i32
 // CHECK-NEXT: store i32 [[Y]], ptr [[X]], align 4
diff --git a/clang/test/CodeGenHLSL/BoolVector.hlsl b/clang/test/CodeGenHLSL/BoolVector.hlsl
index 35d8b9dac801..d5054a5a92b5 100644
--- a/clang/test/CodeGenHLSL/BoolVector.hlsl
+++ b/clang/test/CodeGenHLSL/BoolVector.hlsl
@@ -9,7 +9,7 @@ struct S {
     float f;
 };
 
-// CHECK-LABEL: define noundef i1 {{.*}}fn1{{.*}}
+// CHECK-LABEL: define hidden noundef i1 {{.*}}fn1{{.*}}
 // CHECK: [[B:%.*]] = alloca <2 x i32>, align 8
 // CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[B]], align 8
 // CHECK-NEXT: [[BoolVec:%.*]] = load <2 x i32>, ptr [[B]], align 8
@@ -21,7 +21,7 @@ bool fn1() {
   return B[0];
 }
 
-// CHECK-LABEL: define noundef <2 x i1> {{.*}}fn2{{.*}}
+// CHECK-LABEL: define hidden noundef <2 x i1> {{.*}}fn2{{.*}}
 // CHECK: [[VAddr:%.*]] = alloca i32, align 4
 // CHECK-NEXT: [[A:%.*]] = alloca <2 x i32>, align 8
 // CHECK-NEXT: [[StoreV:%.*]] = zext i1 {{.*}} to i32
@@ -40,7 +40,7 @@ bool2 fn2(bool V) {
   return A;
 }
 
-// CHECK-LABEL: define noundef i1 {{.*}}fn3{{.*}}
+// CHECK-LABEL: define hidden noundef i1 {{.*}}fn3{{.*}}
 // CHECK: [[s:%.*]] = alloca %struct.S, align 1
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[s]], ptr align 1 [[ConstS]], i32 12, i1 false)
 // CHECK-NEXT: [[BV:%.*]] = getelementptr inbounds nuw %struct.S, ptr [[s]], i32 0, i32 0
@@ -53,7 +53,7 @@ bool fn3() {
   return s.bv[0];
 }
 
-// CHECK-LABEL: define noundef i1 {{.*}}fn4{{.*}}
+// CHECK-LABEL: define hidden noundef i1 {{.*}}fn4{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [2 x <2 x i32>], align 8
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[Arr]], ptr align 8 [[ConstArr]], i32 16, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[Arr]], i32 0, i32 0
@@ -66,7 +66,7 @@ bool fn4() {
   return Arr[0][1];
 }
 
-// CHECK-LABEL: define void {{.*}}fn5{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}fn5{{.*}}
 // CHECK: [[Arr:%.*]] = alloca <2 x i32>, align 8
 // CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[Arr]], align 8
 // CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[Arr]], align 8
@@ -78,7 +78,7 @@ void fn5() {
   Arr[1] = false;
 }
 
-// CHECK-LABEL: define void {{.*}}fn6{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}fn6{{.*}}
 // CHECK: [[V:%.*]] = alloca i32, align 4
 // CHECK-NEXT: [[S:%.*]] = alloca %struct.S, align 1
 // CHECK-NEXT: store i32 0, ptr [[V]], align 4
@@ -97,7 +97,7 @@ void fn6() {
   s.bv[1] = V;
 }
 
-// CHECK-LABEL: define void {{.*}}fn7{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}fn7{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [2 x <2 x i32>], align 8
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[Arr]], ptr align 8 {{.*}}, i32 16, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[Arr]], i32 0, i32 0
diff --git a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl
index 9090e9e85ed9..afda714106fa 100644
--- a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl
+++ b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl
@@ -33,7 +33,7 @@ void SecondEntry() {}
 
 // Verify the constructor is alwaysinline
 // NOINLINE: ; Function Attrs: {{.*}}alwaysinline
-// NOINLINE-NEXT: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2EjijjPKc({{.*}} [[CtorAttr:\#[0-9]+]]
+// NOINLINE-NEXT: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC2EjijjPKc({{.*}} [[CtorAttr:\#[0-9]+]]
 
 // NOINLINE: ; Function Attrs: {{.*}}alwaysinline
 // NOINLINE-NEXT: define internal void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() [[InitAttr:\#[0-9]+]]
diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl
index 362042654ea8..37fb5195e976 100644
--- a/clang/test/CodeGenHLSL/basic_types.hlsl
+++ b/clang/test/CodeGenHLSL/basic_types.hlsl
@@ -6,38 +6,38 @@
 // RUN:   -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s
 
 
-// CHECK: @uint16_t_Val = external addrspace(2) global i16, align 2
-// CHECK: @int16_t_Val = external addrspace(2) global i16, align 2
-// CHECK: @uint_Val = external addrspace(2) global i32, align 4
-// CHECK: @uint64_t_Val = external addrspace(2) global i64, align 8
-// CHECK: @int64_t_Val = external addrspace(2) global i64, align 8
-// CHECK: @int16_t2_Val = external addrspace(2) global <2 x i16>, align 4
-// CHECK: @int16_t3_Val = external addrspace(2) global <3 x i16>, align 8
-// CHECK: @int16_t4_Val = external addrspace(2) global <4 x i16>, align 8
-// CHECK: @uint16_t2_Val = external addrspace(2) global <2 x i16>, align 4
-// CHECK: @uint16_t3_Val = external addrspace(2) global <3 x i16>, align 8
-// CHECK: @uint16_t4_Val = external addrspace(2) global <4 x i16>, align 8
-// CHECK: @int2_Val = external addrspace(2) global <2 x i32>, align 8
-// CHECK: @int3_Val = external addrspace(2) global <3 x i32>, align 16
-// CHECK: @int4_Val = external addrspace(2) global <4 x i32>, align 16
-// CHECK: @uint2_Val = external addrspace(2) global <2 x i32>, align 8
-// CHECK: @uint3_Val = external addrspace(2) global <3 x i32>, align 16
-// CHECK: @uint4_Val = external addrspace(2) global <4 x i32>, align 16
-// CHECK: @int64_t2_Val = external addrspace(2) global <2 x i64>, align 16
-// CHECK: @int64_t3_Val = external addrspace(2) global <3 x i64>, align 32
-// CHECK: @int64_t4_Val = external addrspace(2) global <4 x i64>, align 32
-// CHECK: @uint64_t2_Val = external addrspace(2) global <2 x i64>, align 16
-// CHECK: @uint64_t3_Val = external addrspace(2) global <3 x i64>, align 32
-// CHECK: @uint64_t4_Val = external addrspace(2) global <4 x i64>, align 32
-// CHECK: @half2_Val = external addrspace(2) global <2 x half>, align 4
-// CHECK: @half3_Val = external addrspace(2) global <3 x half>, align 8
-// CHECK: @half4_Val = external addrspace(2) global <4 x half>, align 8
-// CHECK: @float2_Val = external addrspace(2) global <2 x float>, align 8
-// CHECK: @float3_Val = external addrspace(2) global <3 x float>, align 16
-// CHECK: @float4_Val = external addrspace(2) global <4 x float>, align 16
-// CHECK: @double2_Val = external addrspace(2) global <2 x double>, align 16
-// CHECK: @double3_Val = external addrspace(2) global <3 x double>, align 32
-// CHECK: @double4_Val = external addrspace(2) global <4 x double>, align 32
+// CHECK: @uint16_t_Val = external hidden addrspace(2) global i16, align 2
+// CHECK: @int16_t_Val = external hidden addrspace(2) global i16, align 2
+// CHECK: @uint_Val = external hidden addrspace(2) global i32, align 4
+// CHECK: @uint64_t_Val = external hidden addrspace(2) global i64, align 8
+// CHECK: @int64_t_Val = external hidden addrspace(2) global i64, align 8
+// CHECK: @int16_t2_Val = external hidden addrspace(2) global <2 x i16>, align 4
+// CHECK: @int16_t3_Val = external hidden addrspace(2) global <3 x i16>, align 8
+// CHECK: @int16_t4_Val = external hidden addrspace(2) global <4 x i16>, align 8
+// CHECK: @uint16_t2_Val = external hidden addrspace(2) global <2 x i16>, align 4
+// CHECK: @uint16_t3_Val = external hidden addrspace(2) global <3 x i16>, align 8
+// CHECK: @uint16_t4_Val = external hidden addrspace(2) global <4 x i16>, align 8
+// CHECK: @int2_Val = external hidden addrspace(2) global <2 x i32>, align 8
+// CHECK: @int3_Val = external hidden addrspace(2) global <3 x i32>, align 16
+// CHECK: @int4_Val = external hidden addrspace(2) global <4 x i32>, align 16
+// CHECK: @uint2_Val = external hidden addrspace(2) global <2 x i32>, align 8
+// CHECK: @uint3_Val = external hidden addrspace(2) global <3 x i32>, align 16
+// CHECK: @uint4_Val = external hidden addrspace(2) global <4 x i32>, align 16
+// CHECK: @int64_t2_Val = external hidden addrspace(2) global <2 x i64>, align 16
+// CHECK: @int64_t3_Val = external hidden addrspace(2) global <3 x i64>, align 32
+// CHECK: @int64_t4_Val = external hidden addrspace(2) global <4 x i64>, align 32
+// CHECK: @uint64_t2_Val = external hidden addrspace(2) global <2 x i64>, align 16
+// CHECK: @uint64_t3_Val = external hidden addrspace(2) global <3 x i64>, align 32
+// CHECK: @uint64_t4_Val = external hidden addrspace(2) global <4 x i64>, align 32
+// CHECK: @half2_Val = external hidden addrspace(2) global <2 x half>, align 4
+// CHECK: @half3_Val = external hidden addrspace(2) global <3 x half>, align 8
+// CHECK: @half4_Val = external hidden addrspace(2) global <4 x half>, align 8
+// CHECK: @float2_Val = external hidden addrspace(2) global <2 x float>, align 8
+// CHECK: @float3_Val = external hidden addrspace(2) global <3 x float>, align 16
+// CHECK: @float4_Val = external hidden addrspace(2) global <4 x float>, align 16
+// CHECK: @double2_Val = external hidden addrspace(2) global <2 x double>, align 16
+// CHECK: @double3_Val = external hidden addrspace(2) global <3 x double>, align 32
+// CHECK: @double4_Val = external hidden addrspace(2) global <4 x double>, align 32
 
 #ifdef NAMESPACED
 #define TYPE_DECL(T)  hlsl::T T##_Val
diff --git a/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl b/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl
index e1832bdbbf33..8457ad6da293 100644
--- a/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl
@@ -4,7 +4,7 @@
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
 
-// CHECK-LABEL: define noundef <2 x i32> @_Z20test_AddUint64_uint2Dv2_jS_(
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z20test_AddUint64_uint2Dv2_jS_(
 // CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i32>, align 8
@@ -31,7 +31,7 @@ uint2 test_AddUint64_uint2(uint2 a, uint2 b) {
   return AddUint64(a, b);
 }
 
-// CHECK-LABEL: define noundef <4 x i32> @_Z20test_AddUint64_uint4Dv4_jS_(
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z20test_AddUint64_uint4Dv4_jS_(
 // CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i32>, align 16
diff --git a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl
index 403d473ce968..3a8d2c03e173 100644
--- a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl
@@ -35,7 +35,7 @@ export void foo() {
 // CHECK-SAME: i32 noundef 1, i32 noundef 2, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]])
 
 // Buf1 initialization part 2 - body of ByteAddressBuffer C1 constructor with explicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl17ByteAddressBufferC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl17ByteAddressBufferC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME:  %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -47,27 +47,27 @@ export void foo() {
 // CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]])
 
 // Buf2 initialization part 2 - body of RWByteAddressBuffer C1 constructor with implicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl19RWByteAddressBufferC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl19RWByteAddressBufferC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this1,
 // CHECK-SAME: i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
 
 // Buf3 initialization part 1 - local variable declared in function foo() is initialized by 
 // RasterizerOrderedByteAddressBuffer C1 default constructor
-// CHECK: define void @_Z3foov() #2 {
+// CHECK: define void @_Z3foov()
 // CHECK-NEXT: entry:
 // CHECK-NEXT: %Buf3 = alloca %"class.hlsl::RasterizerOrderedByteAddressBuffer", align 4
 // CHECK-NEXT: call void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %Buf3)
 
 // Buf3 initialization part 2 - body of RasterizerOrderedByteAddressBuffer default C1 constructor that
 // calls the default C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: call void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}})
 // CHECK-NEXT: ret void
 
 // Buf1 initialization part 3 - ByteAddressBuffer C2 constructor with explicit binding that initializes
 // handle with @llvm.dx.resource.handlefrombinding
-// CHECK: define linkonce_odr void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t(
 // CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -76,7 +76,7 @@ export void foo() {
 
 // Buf2 initialization part 3 - body of RWByteAddressBuffer C2 constructor with implicit binding that initializes
 // handle with @llvm.dx.resource.handlefromimplicitbinding
-// CHECK: define linkonce_odr void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_i8_1_0t
 // CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -85,7 +85,7 @@ export void foo() {
 
 // Buf3 initialization part 3 - body of RasterizerOrderedByteAddressBuffer default C2 constructor that
 // initializes handle to poison
-// CHECK: define linkonce_odr void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedByteAddressBuffer", ptr %{{.*}}, i32 0, i32 0
 // CHECK: store target("dx.RawBuffer", i8, 1, 1) poison, ptr %__handle, align 4
 
diff --git a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl
index 9d95d54852c0..114230d38ba5 100644
--- a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl
@@ -1,14 +1,14 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN:   -DTARGET=dx -DFNATTRS=noundef -check-prefixes=CHECK,CHECK-DXIL
+// RUN:   -DTARGET=dx -check-prefixes=CHECK,CHECK-DXIL
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef" -check-prefixes=CHECK,CHECK-SPIRV
+// RUN:   -DTARGET=spv -check-prefixes=CHECK,CHECK-SPIRV
 
-// CHECK-DXIL: define void @
-// CHECK-SPIRV: define spir_func void @
+// CHECK-DXIL: define hidden void @
+// CHECK-SPIRV: define hidden spir_func void @
 void test_GroupMemoryBarrierWithGroupSync() {
 // CHECK-DXIL: call void @llvm.[[TARGET]].group.memory.barrier.with.group.sync()
 // CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].group.memory.barrier.with.group.sync()
diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
index e74a7ed270b0..114468914e2e 100644
--- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
@@ -35,7 +35,7 @@ export void foo() {
 // CHECK-SAME: i32 noundef 5, i32 noundef 3, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]])
 
 // Buf1 initialization part 2 - body of RWBuffer<float> C1 constructor with explicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -47,7 +47,7 @@ export void foo() {
 // CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]])
 
 // Buf2 initialization part 2 - body of RWBuffer<float> C1 constructor with implicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -59,12 +59,12 @@ export void foo() {
 // CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %Buf3)
 
 // Buf3 initialization part 2 - body of RWBuffer<int> default C1 constructor that calls the default C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: call void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}})
 
 // Buf1 initialization part 3 - body of RWBuffer<float> C2 constructor with explicit binding that initializes
 // handle with @llvm.dx.resource.handlefrombinding
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t(
 // CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -73,7 +73,7 @@ export void foo() {
 
 // Buf2 initialization part 3 - body of RWBuffer<float> C2 constructor with implicit binding that initializes
 // handle with @llvm.dx.resource.handlefromimplicitbinding
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: %[[HANDLE:.*]] = call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.TypedBuffer_f64_1_0_0t
 // CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -81,7 +81,7 @@ export void foo() {
 // CHECK-NEXT: store target("dx.TypedBuffer", double, 1, 0, 0) %[[HANDLE]], ptr %__handle, align 4
 
 // Buf3 initialization part 3 - body of RWBuffer<int> default C2 constructor that initializes handle to poison
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.1", ptr %{{.*}}, i32 0, i32 0
 // CHECK-NEXT: store target("dx.TypedBuffer", i32, 1, 0, 1) poison, ptr %__handle, align 4
 
diff --git a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl
index 8a3958ad8fd0..7804239edcca 100644
--- a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl
@@ -304,7 +304,7 @@ bool2 AccessBools() {
   return X.zw;
 }
 
-// CHECK-LABEL: define void {{.*}}BoolSizeMismatch{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}BoolSizeMismatch{{.*}}
 // CHECK: [[B:%.*]] = alloca <4 x i32>, align 16
 // CHECK-NEXT: [[Tmp:%.*]] = alloca <1 x i32>, align 4
 // CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[B]], align 16
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl
index fc7b6be5c900..28841732df99 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl
@@ -36,7 +36,7 @@ export void foo() {
 
 // Buf1 initialization part 2 - body of StructuredBuffer<float> C1 constructor with explicit binding 
 // that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, 
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl16StructuredBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, 
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -49,7 +49,7 @@ export void foo() {
 // CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]])
 
 // Buf2 initialization part 2 - body of RWStructuredBuffer<float> C1 constructor with implicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl18RWStructuredBufferIfEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -63,12 +63,12 @@ export void foo() {
 
 // Buf3 initialization part 2 - body of AppendStructuredBuffer<float> default C1 constructor that calls
 // the default C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl22AppendStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: call void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}})
 
 // Buf1 initialization part 3 - body of AppendStructuredBuffer<float> C2 constructor with explicit binding 
 // that initializes handle with @llvm.dx.resource.handlefrombinding
-// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t(
 // CHECK-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -77,7 +77,7 @@ export void foo() {
 
 // Buf2 initialization part 3 - body of RWStructuredBuffer<float> C2 constructor with implicit binding that initializes
 // handle with @llvm.dx.resource.handlefromimplicitbinding
-// CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_f32_1_0t
 // CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -86,7 +86,7 @@ export void foo() {
 
 // Buf3 initialization part 3 - body of AppendStructuredBuffer<float> default C2 constructor that
 // initializes handle to poison
-// CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %{{.*}}, i32 0, i32 0
 // CHECK: store target("dx.RawBuffer", float, 1, 0) poison, ptr %__handle, align 4
 
diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl
index e8a6ee044957..6abe2f816c84 100644
--- a/clang/test/CodeGenHLSL/builtins/abs.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl
@@ -8,16 +8,16 @@
 using hlsl::abs;
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef i16 @_Z16test_abs_int16_t
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z16test_abs_int16_t
 // NATIVE_HALF: call i16 @llvm.abs.i16(
 int16_t test_abs_int16_t(int16_t p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z17test_abs_int16_t2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z17test_abs_int16_t2
 // NATIVE_HALF: call <2 x i16> @llvm.abs.v2i16(
 int16_t2 test_abs_int16_t2(int16_t2 p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z17test_abs_int16_t3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z17test_abs_int16_t3
 // NATIVE_HALF: call <3 x i16> @llvm.abs.v3i16(
 int16_t3 test_abs_int16_t3(int16_t3 p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z17test_abs_int16_t4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z17test_abs_int16_t4
 // NATIVE_HALF: call <4 x i16> @llvm.abs.v4i16(
 int16_t4 test_abs_int16_t4(int16_t4 p0) { return abs(p0); }
 
@@ -50,76 +50,76 @@ uint16_t3 test_abs_uint64_t3(uint16_t3 p0) { return abs(p0); }
 uint16_t4 test_abs_uint64_t4(uint16_t4 p0) { return abs(p0); }
 #endif // __HLSL_ENABLE_16_BIT
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_abs_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_abs_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.fabs.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_abs_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_abs_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.fabs.f32(float %0)
 half test_abs_half(half p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_abs_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_abs_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.fabs.v2f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_abs_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_abs_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.fabs.v2f32(
 half2 test_abs_half2(half2 p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_abs_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_abs_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.fabs.v3f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_abs_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_abs_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.fabs.v3f32(
 half3 test_abs_half3(half3 p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_abs_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_abs_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.fabs.v4f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_abs_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_abs_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.fabs.v4f32(
 half4 test_abs_half4(half4 p0) { return abs(p0); }
 
-// CHECK-LABEL: define noundef i32 @_Z12test_abs_int
+// CHECK-LABEL: define hidden noundef i32 @_Z12test_abs_int
 // CHECK: call i32 @llvm.abs.i32(
 int test_abs_int(int p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z13test_abs_int2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z13test_abs_int2
 // CHECK: call <2 x i32> @llvm.abs.v2i32(
 int2 test_abs_int2(int2 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z13test_abs_int3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z13test_abs_int3
 // CHECK: call <3 x i32> @llvm.abs.v3i32(
 int3 test_abs_int3(int3 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z13test_abs_int4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z13test_abs_int4
 // CHECK: call <4 x i32> @llvm.abs.v4i32(
 int4 test_abs_int4(int4 p0) { return abs(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_abs_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_abs_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.fabs.f32(
 float test_abs_float(float p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_abs_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_abs_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.fabs.v2f32(
 float2 test_abs_float2(float2 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_abs_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_abs_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.fabs.v3f32(
 float3 test_abs_float3(float3 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_abs_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_abs_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.fabs.v4f32(
 float4 test_abs_float4(float4 p0) { return abs(p0); }
 
-// CHECK-LABEL: define noundef i64 @_Z16test_abs_int64_t
+// CHECK-LABEL: define hidden noundef i64 @_Z16test_abs_int64_t
 // CHECK: call i64 @llvm.abs.i64(
 int64_t test_abs_int64_t(int64_t p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z17test_abs_int64_t2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z17test_abs_int64_t2
 // CHECK: call <2 x i64> @llvm.abs.v2i64(
 int64_t2 test_abs_int64_t2(int64_t2 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z17test_abs_int64_t3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z17test_abs_int64_t3
 // CHECK: call <3 x i64> @llvm.abs.v3i64(
 int64_t3 test_abs_int64_t3(int64_t3 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z17test_abs_int64_t4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z17test_abs_int64_t4
 // CHECK: call <4 x i64> @llvm.abs.v4i64(
 int64_t4 test_abs_int64_t4(int64_t4 p0) { return abs(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) double @_Z15test_abs_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) double @_Z15test_abs_double
 // CHECK: call reassoc nnan ninf nsz arcp afn double @llvm.fabs.f64(
 double test_abs_double(double p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x double> @_Z16test_abs_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x double> @_Z16test_abs_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.fabs.v2f64(
 double2 test_abs_double2(double2 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x double> @_Z16test_abs_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x double> @_Z16test_abs_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.fabs.v3f64(
 double3 test_abs_double3(double3 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> @_Z16test_abs_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> @_Z16test_abs_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.fabs.v4f64(
 double4 test_abs_double4(double4 p0) { return abs(p0); }
 
diff --git a/clang/test/CodeGenHLSL/builtins/all.hlsl b/clang/test/CodeGenHLSL/builtins/all.hlsl
index 39f364c5953d..391fad0ef33f 100644
--- a/clang/test/CodeGenHLSL/builtins/all.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/all.hlsl
@@ -2,20 +2,20 @@
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 
 #ifdef __HLSL_ENABLE_16_BIT
 // NATIVE_HALF: define [[FNATTRS]] i1 @
diff --git a/clang/test/CodeGenHLSL/builtins/and.hlsl b/clang/test/CodeGenHLSL/builtins/and.hlsl
index b77889cd9ae7..d2ca7cf4163e 100644
--- a/clang/test/CodeGenHLSL/builtins/and.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/and.hlsl
@@ -3,7 +3,7 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s
 
-// CHECK-LABEL: define noundef i1 @_Z15test_and_scalarbb(
+// CHECK-LABEL: define hidden noundef i1 @_Z15test_and_scalarbb(
 // CHECK-SAME: i1 noundef [[X:%.*]], i1 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_AND:%.*]] = and i1 [[X]], [[Y]]
@@ -13,7 +13,7 @@ bool test_and_scalar(bool x, bool y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <2 x i1> @_Z14test_and_bool2Dv2_bS_(
+// CHECK-LABEL: define hidden noundef <2 x i1> @_Z14test_and_bool2Dv2_bS_(
 // CHECK-SAME: <2 x i1> noundef [[X:%.*]], <2 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_AND:%.*]] = and <2 x i1> [[X]], [[Y]]
@@ -23,7 +23,7 @@ bool2 test_and_bool2(bool2 x, bool2 y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <3 x i1> @_Z14test_and_bool3Dv3_bS_(
+// CHECK-LABEL: define hidden noundef <3 x i1> @_Z14test_and_bool3Dv3_bS_(
 // CHECK-SAME: <3 x i1> noundef [[X:%.*]], <3 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_AND:%.*]] = and <3 x i1> [[X]], [[Y]]
@@ -33,7 +33,7 @@ bool3 test_and_bool3(bool3 x, bool3 y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <4 x i1> @_Z14test_and_bool4Dv4_bS_(
+// CHECK-LABEL: define hidden noundef <4 x i1> @_Z14test_and_bool4Dv4_bS_(
 // CHECK-SAME: <4 x i1> noundef [[X:%.*]], <4 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_AND:%.*]] = and <4 x i1> [[X]], [[Y]]
@@ -43,7 +43,7 @@ bool4 test_and_bool4(bool4 x, bool4 y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <4 x i1> @_Z13test_and_int4Dv4_iS_(
+// CHECK-LABEL: define hidden noundef <4 x i1> @_Z13test_and_int4Dv4_iS_(
 // CHECK-SAME: <4 x i32> noundef [[X:%.*]], <4 x i32> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne <4 x i32> [[X]], zeroinitializer
@@ -55,7 +55,7 @@ bool4 test_and_int4(int4 x, int4 y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <4 x i1> @_Z15test_and_float4Dv4_fS_(
+// CHECK-LABEL: define hidden noundef <4 x i1> @_Z15test_and_float4Dv4_fS_(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[X]], zeroinitializer
diff --git a/clang/test/CodeGenHLSL/builtins/any.hlsl b/clang/test/CodeGenHLSL/builtins/any.hlsl
index 3d9d8e9e689e..e4837876e269 100644
--- a/clang/test/CodeGenHLSL/builtins/any.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/any.hlsl
@@ -2,20 +2,20 @@
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 
 #ifdef __HLSL_ENABLE_16_BIT
 // NATIVE_HALF: define [[FNATTRS]] i1 @
diff --git a/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl
index b313c99e89a5..bdefe46b802e 100644
--- a/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl
@@ -4,67 +4,67 @@
 
 using hlsl::ceil;
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_double(double p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_double2(double2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_double3(double3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_double4(double4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_int(int p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_int2(int2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_int3(int3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_int4(int4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_uint(uint p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_uint2(uint2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_uint3(uint3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_uint4(uint4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_int64_t(int64_t p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_int64_t2(int64_t2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_int64_t3(int64_t3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_int64_t4(int64_t4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_uint64_t(uint64_t p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_uint64_t2(uint64_t2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_uint64_t3(uint64_t3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_uint64_t4(uint64_t4 p0) { return ceil(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
index fe0b8f898383..1a9c630b60e5 100644
--- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
@@ -7,36 +7,36 @@
 
 using hlsl::ceil;
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_ceil_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_ceil_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.ceil.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_ceil_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_ceil_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(float %0)
 half test_ceil_half(half p0) { return ceil(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_ceil_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_ceil_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.ceil.v2f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_ceil_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_ceil_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 half2 test_ceil_half2(half2 p0) { return ceil(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_ceil_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_ceil_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.ceil.v3f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_ceil_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_ceil_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 half3 test_ceil_half3(half3 p0) { return ceil(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_ceil_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_ceil_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.ceil.v4f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_ceil_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_ceil_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 half4 test_ceil_half4(half4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_ceil_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_ceil_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_float(float p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_ceil_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_ceil_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_float2(float2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_ceil_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_ceil_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_float3(float3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_ceil_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_ceil_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_float4(float4 p0) { return ceil(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
index c0e1e914831a..eaedfb419c19 100644
--- a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
@@ -1,18 +1,18 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:  -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:  -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
 // RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:  -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:  -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 #ifdef __HLSL_ENABLE_16_BIT
 // NATIVE_HALF: define [[FNATTRS]] <4 x i16> {{.*}}test_clamp_short4_mismatch
diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
index d01c2a45c43c..58db4423799b 100644
--- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
@@ -1,19 +1,19 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:  -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:  -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
 // RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:  -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:  -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 #ifdef __HLSL_ENABLE_16_BIT
 // NATIVE_HALF: define [[FNATTRS]] i16 @_Z16test_clamp_short
diff --git a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
index c864f93af472..aaeb2f026449 100644
--- a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-// CHECK:      define void @{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]])
+// CHECK:      define hidden void @{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]])
 // CHECK:      [[LOAD:%.*]] = load float, ptr [[P0]].addr, align 4
 // CHECK-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float [[LOAD]], 0.000000e+00
 // CHECK-NO:   call i1 @llvm.dx.any
diff --git a/clang/test/CodeGenHLSL/builtins/clip.hlsl b/clang/test/CodeGenHLSL/builtins/clip.hlsl
index 5a1753766a8a..e067828c38bf 100644
--- a/clang/test/CodeGenHLSL/builtins/clip.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clip.hlsl
@@ -3,13 +3,13 @@
 
 
 void test_scalar(float Buf) {
-  // CHECK:      define void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]])
+  // CHECK:      define hidden void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]])
   // CHECK:      [[LOAD:%.*]] = load float, ptr [[VALP]].addr, align 4
   // CHECK-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float [[LOAD]], 0.000000e+00
   // CHECK-NO:   call i1 @llvm.dx.any
   // CHECK-NEXT: call void @llvm.dx.discard(i1 [[FCMP]])
   //
-  // SPIRV:      define spir_func void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]])
+  // SPIRV:      define hidden spir_func void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]])
   // SPIRV:      [[LOAD:%.*]] = load float, ptr [[VALP]].addr, align 4
   // SPIRV-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float [[LOAD]], 0.000000e+00
   // SPIRV-NO:   call i1 @llvm.spv.any
@@ -21,13 +21,13 @@ void test_scalar(float Buf) {
 }
 
 void test_vector4(float4 Buf) {
-  // CHECK:      define void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]])
+  // CHECK:      define hidden void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]])
   // CHECK:      [[LOAD:%.*]] = load <4 x float>, ptr [[VALP]].addr, align 16
   // CHECK-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x float> [[LOAD]], zeroinitializer
   // CHECK-NEXT: [[ANYC:%.*]] = call i1 @llvm.dx.any.v4i1(<4 x i1> [[FCMP]])
   // CHECK-NEXT: call void @llvm.dx.discard(i1 [[ANYC]])
   //
-  // SPIRV:      define spir_func void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]])
+  // SPIRV:      define hidden spir_func void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]])
   // SPIRV:      [[LOAD:%.*]] = load <4 x float>, ptr [[VALP]].addr, align 16
   // SPIRV-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x float> [[LOAD]], zeroinitializer
   // SPIRV-NEXT: [[ANYC:%.*]] = call i1 @llvm.spv.any.v4i1(<4 x i1> [[FCMP]]) 
diff --git a/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl
index b7b11b1c3bd6..70926cc8ba74 100644
--- a/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_double(double p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_double2(double2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_double3(double3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_double4(double4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_int(int p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_int2(int2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_int3(int3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_int4(int4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_uint(uint p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_uint2(uint2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_uint3(uint3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_uint4(uint4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_int64_t(int64_t p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_int64_t2(int64_t2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_int64_t3(int64_t3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_int64_t4(int64_t4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_uint64_t(uint64_t p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_uint64_t2(uint64_t2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_uint64_t3(uint64_t3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_uint64_t4(uint64_t4 p0) { return cos(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl
index 5f993d50498b..79f9e1e6fbec 100644
--- a/clang/test/CodeGenHLSL/builtins/cos.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_cos_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_cos_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.cos.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_cos_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_cos_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 half test_cos_half(half p0) { return cos(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_cos_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_cos_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.cos.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_cos_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_cos_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32(
 half2 test_cos_half2(half2 p0) { return cos(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_cos_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_cos_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.cos.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_cos_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_cos_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32(
 half3 test_cos_half3(half3 p0) { return cos(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_cos_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_cos_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.cos.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_cos_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_cos_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32(
 half4 test_cos_half4(half4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_cos_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_cos_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_float(float p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_cos_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_cos_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_float2(float2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_cos_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_cos_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_float3(float3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_cos_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_cos_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_float4(float4 p0) { return cos(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl
index b2a1d6316787..89ac383e2517 100644
--- a/clang/test/CodeGenHLSL/builtins/cross.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] <3 x half> @
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.[[TARGET]].cross.v3f16(<3 x half>
diff --git a/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl
index bafd2368c996..a1abf435ea10 100644
--- a/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: %hlsl.degrees = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].degrees.f32(
diff --git a/clang/test/CodeGenHLSL/builtins/degrees.hlsl b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
index 64531dd2785e..f0fb12855e5f 100644
--- a/clang/test/CodeGenHLSL/builtins/degrees.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: %hlsl.degrees = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].degrees.f16(
diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl
index ac38cf185379..0c24fbb9f185 100644
--- a/clang/test/CodeGenHLSL/builtins/distance.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl
@@ -6,14 +6,14 @@
 // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
 // CHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]]
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[SUB_I]])
 // CHECK-NEXT:    ret half [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
 // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]]
@@ -22,7 +22,7 @@
 //
 half test_distance_half(half X, half Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_(
 // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]]
@@ -30,7 +30,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_(
 // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]]
@@ -39,7 +39,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); }
 //
 half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_(
 // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]]
@@ -47,7 +47,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_(
 // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]]
@@ -56,7 +56,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); }
 //
 half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_(
 // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]]
@@ -64,7 +64,7 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_(
 // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]]
@@ -73,14 +73,14 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); }
 //
 half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z19test_distance_floatff(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z19test_distance_floatff(
 // CHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]]
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[SUB_I]])
 // CHECK-NEXT:    ret float [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z19test_distance_floatff(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z19test_distance_floatff(
 // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]]
@@ -89,7 +89,7 @@ half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); }
 //
 float test_distance_float(float X, float Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_(
 // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]]
@@ -97,7 +97,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_(
 // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]]
@@ -106,7 +106,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); }
 //
 float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_(
 // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]]
@@ -114,7 +114,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_(
 // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]]
@@ -123,7 +123,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); }
 //
 float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]]
@@ -131,7 +131,7 @@ float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_(
 // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]]
diff --git a/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl
index 858a1210169d..df34beeba7a8 100644
--- a/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl
@@ -2,87 +2,87 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_double
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_double(double p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_double2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_double2(double2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_double3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_double3(double3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_double4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_double4(double4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_int
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_int(int p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_int2(int2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_int3(int3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_int4(int4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_uint
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_uint(uint p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_uint2(uint2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_uint3(uint3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_uint4(uint4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_int64_t
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_int64_t(int64_t p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int64_t2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_int64_t2(int64_t2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int64_t3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_int64_t3(int64_t3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int64_t4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_int64_t4(int64_t4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_uint64_t
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_uint64_t(uint64_t p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint64_t2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_uint64_t2(uint64_t2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint64_t3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_uint64_t3(uint64_t3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint64_t4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_uint64_t4(uint64_t4 p0) { return exp(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl
index 6ed40ed8f433..5a8f60528a84 100644
--- a/clang/test/CodeGenHLSL/builtins/exp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl
@@ -5,48 +5,48 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_exp_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_exp_half
 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn half @llvm.exp.f16(
 // NATIVE_HALF: ret half %elt.exp
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_exp_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_exp_half
 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // NO_HALF: ret float %elt.exp
 half test_exp_half(half p0) { return exp(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_exp_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_exp_half2
 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp.v2f16
 // NATIVE_HALF: ret <2 x half> %elt.exp
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_exp_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_exp_half2
 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32(
 // NO_HALF: ret <2 x float> %elt.exp
 half2 test_exp_half2(half2 p0) { return exp(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_exp_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_exp_half3
 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp.v3f16
 // NATIVE_HALF: ret <3 x half> %elt.exp
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_exp_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_exp_half3
 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32(
 // NO_HALF: ret <3 x float> %elt.exp
 half3 test_exp_half3(half3 p0) { return exp(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_exp_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_exp_half4
 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp.v4f16
 // NATIVE_HALF: ret <4 x half> %elt.exp
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_exp_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_exp_half4
 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32(
 // NO_HALF: ret <4 x float> %elt.exp
 half4 test_exp_half4(half4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_exp_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_exp_float
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_float(float p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_exp_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_exp_float2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_float2(float2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_exp_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_exp_float3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_float3(float3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_exp_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_exp_float4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_float4(float4 p0) { return exp(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl
index ef522afc244a..20482777a18d 100644
--- a/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl
@@ -2,87 +2,87 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_double
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_double(double p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_double2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_double2(double2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_double3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_double3(double3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_double4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_double4(double4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_int
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_int(int p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_int2(int2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_int3(int3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_int4(int4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_uint
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_uint(uint p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_uint2(uint2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_uint3(uint3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_uint4(uint4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_int64_t
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_int64_t(int64_t p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int64_t2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_int64_t2(int64_t2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int64_t3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_int64_t3(int64_t3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int64_t4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_int64_t4(int64_t4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_uint64_t
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_uint64_t(uint64_t p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint64_t2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_uint64_t2(uint64_t2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint64_t3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_uint64_t3(uint64_t3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint64_t4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_uint64_t4(uint64_t4 p0) { return exp2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
index b067427e4636..a9bbcb0d9bff 100644
--- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
@@ -5,48 +5,48 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_exp2_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_exp2_half
 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(
 // NATIVE_HALF: ret half %elt.exp2
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_exp2_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_exp2_half
 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // NO_HALF: ret float %elt.exp2
 half test_exp2_half(half p0) { return exp2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_exp2_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_exp2_half2
 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16
 // NATIVE_HALF: ret <2 x half> %elt.exp2
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_exp2_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_exp2_half2
 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32(
 // NO_HALF: ret <2 x float> %elt.exp2
 half2 test_exp2_half2(half2 p0) { return exp2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_exp2_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_exp2_half3
 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16
 // NATIVE_HALF: ret <3 x half> %elt.exp2
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_exp2_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_exp2_half3
 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32(
 // NO_HALF: ret <3 x float> %elt.exp2
 half3 test_exp2_half3(half3 p0) { return exp2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_exp2_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_exp2_half4
 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16
 // NATIVE_HALF: ret <4 x half> %elt.exp2
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_exp2_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_exp2_half4
 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32(
 // NO_HALF: ret <4 x float> %elt.exp2
 half4 test_exp2_half4(half4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_exp2_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_exp2_float
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_float(float p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_exp2_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_exp2_float2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_float2(float2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_exp2_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_exp2_float3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_float3(float3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_exp2_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_exp2_float4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_float4(float4 p0) { return exp2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl
index 26d83443ea48..1e413e53f333 100644
--- a/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl
@@ -4,67 +4,67 @@
 
 using hlsl::floor;
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_double(double p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_double2(double2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_double3(double3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_double4(double4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_int(int p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_int2(int2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_int3(int3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_int4(int4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_uint(uint p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_uint2(uint2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_uint3(uint3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_uint4(uint4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_int64_t(int64_t p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_int64_t2(int64_t2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_int64_t3(int64_t3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_int64_t4(int64_t4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_uint64_t(uint64_t p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_uint64_t2(uint64_t2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_uint64_t3(uint64_t3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_uint64_t4(uint64_t4 p0) { return floor(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl
index f610baeeefd4..b3ff58317981 100644
--- a/clang/test/CodeGenHLSL/builtins/floor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl
@@ -7,36 +7,36 @@
 
 using hlsl::floor;
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_floor_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_floor_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.floor.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_floor_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_floor_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(float %0)
 half test_floor_half(half p0) { return floor(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_floor_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_floor_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.floor.v2f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_floor_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_floor_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 half2 test_floor_half2(half2 p0) { return floor(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_floor_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_floor_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.floor.v3f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_floor_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_floor_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 half3 test_floor_half3(half3 p0) { return floor(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_floor_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_floor_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.floor.v4f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_floor_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_floor_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 half4 test_floor_half4(half4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_floor_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_floor_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_float(float p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_floor_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_floor_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_float2(float2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_floor_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_floor_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_float3(float3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_floor_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_floor_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_float4(float4 p0) { return floor(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/fmod.hlsl b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
index 7ecc5854b398..cc91c0b67f6c 100644
--- a/clang/test/CodeGenHLSL/builtins/fmod.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
@@ -4,7 +4,7 @@
 //
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -o - | FileCheck %s -DFNATTRS="noundef nofpclass(nan inf)" \
+// RUN:   -emit-llvm -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \
 // RUN:   -DTYPE=half -DINT_TYPE=f16 --check-prefixes=DXCHECK
 
 //
@@ -12,7 +12,7 @@
 //
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm \
-// RUN:   -o - | FileCheck %s -DFNATTRS="noundef nofpclass(nan inf)" \
+// RUN:   -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \
 // RUN:   -DTYPE=float -DINT_TYPE=f32 --check-prefixes=DXCHECK
 
 
@@ -23,7 +23,7 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -o - | FileCheck %s \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTYPE=half
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=half
 
 //
 // ---------- No Native Half support test -----------
@@ -31,7 +31,7 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm \
 // RUN:   -o - | FileCheck %s \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTYPE=float
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=float
 
 
diff --git a/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl
index b0e844bd8a8d..7a3f7b006948 100644
--- a/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: %hlsl.frac = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].frac.f32(
diff --git a/clang/test/CodeGenHLSL/builtins/frac.hlsl b/clang/test/CodeGenHLSL/builtins/frac.hlsl
index 7b105ce84359..d8397407cd01 100644
--- a/clang/test/CodeGenHLSL/builtins/frac.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/frac.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: %hlsl.frac = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].frac.f16(
diff --git a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl
index 6d2ae6535ecb..24114b11c760 100644
--- a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl
@@ -6,9 +6,9 @@ using handle_float_t = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::c
 // CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", %struct.MyStruct, 0, 0)
 // CHECK: %struct.MyStruct = type <{ <4 x float>, <2 x i32> }>
 
-// CHECK: define void @_Z2faU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
+// CHECK: define hidden void @_Z2faU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
 // CHECK: call void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %0)
-// CHECK: declare void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0))
+// CHECK: declare hidden void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0))
 
 void foo1(handle_float_t res);
 
@@ -16,14 +16,14 @@ void fa(handle_float_t a) {
     foo1(a);
 }
 
-// CHECK: define void @_Z2fbU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
+// CHECK: define hidden void @_Z2fbU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
 void fb(handle_float_t a) {
     handle_float_t b = a;
 }
 
-// CHECK: define void @_Z2fcN4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %a)
+// CHECK: define hidden void @_Z2fcN4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %a)
 // CHECK: call void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %agg.tmp)
-// CHECK: declare void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4)
+// CHECK: declare hidden void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4)
 void foo2(RWBuffer<float4> buf);
 
 void fc(RWBuffer<float4> a) {
@@ -39,9 +39,9 @@ struct MyStruct {
   int2 i;
 };
 
-// CHECK: define void @_Z2feN4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4 %a)
+// CHECK: define hidden void @_Z2feN4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4 %a)
 // CHECK: call void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4 %agg.tmp)
-// CHECK: declare void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4)
+// CHECK: declare hidden void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4)
 void foo3(StructuredBuffer<MyStruct> buf);
 
 void fe(StructuredBuffer<MyStruct> a) {
diff --git a/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl
index ace209003ce4..f39cba9ace6e 100644
--- a/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl
@@ -2,19 +2,19 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s
 
-// CHECK: define noundef i1 @
+// CHECK: define hidden noundef i1 @
 // CHECK: %dx.isinf = call i1 @llvm.dx.isinf.f32(
 // CHECK: ret i1 %dx.isinf
 bool test_isinf_double(double p0) { return isinf(p0); }
-// CHECK: define noundef <2 x i1> @
+// CHECK: define hidden noundef <2 x i1> @
 // CHECK: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32
 // CHECK: ret <2 x i1> %dx.isinf
 bool2 test_isinf_double2(double2 p0) { return isinf(p0); }
-// CHECK: define noundef <3 x i1> @
+// CHECK: define hidden noundef <3 x i1> @
 // CHECK: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32
 // CHECK: ret <3 x i1> %dx.isinf
 bool3 test_isinf_double3(double3 p0) { return isinf(p0); }
-// CHECK: define noundef <4 x i1> @
+// CHECK: define hidden noundef <4 x i1> @
 // CHECK: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32
 // CHECK: ret <4 x i1> %dx.isinf
 bool4 test_isinf_double4(double4 p0) { return isinf(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/isinf.hlsl b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
index df44fc4a91df..4d53daaafb69 100644
--- a/clang/test/CodeGenHLSL/builtins/isinf.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
@@ -6,40 +6,40 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK: define noundef i1 @
+// CHECK: define hidden noundef i1 @
 // NATIVE_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f16(
 // NO_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f32(
 // CHECK: ret i1 %dx.isinf
 bool test_isinf_half(half p0) { return isinf(p0); }
-// CHECK: define noundef <2 x i1> @
+// CHECK: define hidden noundef <2 x i1> @
 // NATIVE_HALF: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f16
 // NO_HALF: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32(
 // CHECK: ret <2 x i1> %dx.isinf
 bool2 test_isinf_half2(half2 p0) { return isinf(p0); }
-// NATIVE_HALF: define noundef <3 x i1> @
+// NATIVE_HALF: define hidden noundef <3 x i1> @
 // NATIVE_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f16
 // NO_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32(
 // CHECK: ret <3 x i1> %dx.isinf
 bool3 test_isinf_half3(half3 p0) { return isinf(p0); }
-// NATIVE_HALF: define noundef <4 x i1> @
+// NATIVE_HALF: define hidden noundef <4 x i1> @
 // NATIVE_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f16
 // NO_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32(
 // CHECK: ret <4 x i1> %dx.isinf
 bool4 test_isinf_half4(half4 p0) { return isinf(p0); }
 
-// CHECK: define noundef i1 @
+// CHECK: define hidden noundef i1 @
 // CHECK: %dx.isinf = call i1 @llvm.dx.isinf.f32(
 // CHECK: ret i1 %dx.isinf
 bool test_isinf_float(float p0) { return isinf(p0); }
-// CHECK: define noundef <2 x i1> @
+// CHECK: define hidden noundef <2 x i1> @
 // CHECK: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32
 // CHECK: ret <2 x i1> %dx.isinf
 bool2 test_isinf_float2(float2 p0) { return isinf(p0); }
-// CHECK: define noundef <3 x i1> @
+// CHECK: define hidden noundef <3 x i1> @
 // CHECK: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32
 // CHECK: ret <3 x i1> %dx.isinf
 bool3 test_isinf_float3(float3 p0) { return isinf(p0); }
-// CHECK: define noundef <4 x i1> @
+// CHECK: define hidden noundef <4 x i1> @
 // CHECK: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32
 // CHECK: ret <4 x i1> %dx.isinf
 bool4 test_isinf_float4(float4 p0) { return isinf(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
index ea0d1348c6e4..f8fa06c39f2a 100644
--- a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
@@ -1,48 +1,48 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(half %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn half %elt.exp2, %{{.*}}
 // CHECK: ret half %mul
 half test_ldexp_half(half X, half Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <2 x half> @_ZN4hlsl8__detail10ldexp_implIDv2_DhEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <2 x half> @_ZN4hlsl8__detail10ldexp_implIDv2_DhEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16(<2 x half> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <2 x half> %elt.exp2, %{{.*}}
 // CHECK: ret <2 x half> %mul
 half2 test_ldexp_half2(half2 X, half2 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <3 x half> @_ZN4hlsl8__detail10ldexp_implIDv3_DhEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <3 x half> @_ZN4hlsl8__detail10ldexp_implIDv3_DhEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16(<3 x half> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x half> %elt.exp2, %{{.*}}
 // CHECK: ret <3 x half> %mul
 half3 test_ldexp_half3(half3 X, half3 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <4 x half> @_ZN4hlsl8__detail10ldexp_implIDv4_DhEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <4 x half> @_ZN4hlsl8__detail10ldexp_implIDv4_DhEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16(<4 x half> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <4 x half> %elt.exp2, %{{.*}}
 // CHECK: ret <4 x half> %mul
 half4 test_ldexp_half4(half4 X, half4 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) float @_ZN4hlsl8__detail10ldexp_implIfEET_S2_S2_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) float @_ZN4hlsl8__detail10ldexp_implIfEET_S2_S2_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(float %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn float %elt.exp2, %{{.*}}
 // CHECK: ret float %mul
 float test_ldexp_float(float X, float Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <2 x float> @_ZN4hlsl8__detail10ldexp_implIDv2_fEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <2 x float> @_ZN4hlsl8__detail10ldexp_implIDv2_fEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32(<2 x float> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <2 x float> %elt.exp2, %{{.*}}
 // CHECK: ret <2 x float> %mul
 float2 test_ldexp_float2(float2 X, float2 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <3 x float> @_ZN4hlsl8__detail10ldexp_implIDv3_fEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <3 x float> @_ZN4hlsl8__detail10ldexp_implIDv3_fEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32(<3 x float> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x float> %elt.exp2, %{{.*}}
 // CHECK: ret <3 x float> %mul
 float3 test_ldexp_float3(float3 X, float3 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <4 x float> @_ZN4hlsl8__detail10ldexp_implIDv4_fEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <4 x float> @_ZN4hlsl8__detail10ldexp_implIDv4_fEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32(<4 x float> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <4 x float> %elt.exp2, %{{.*}}
 // CHECK: ret <4 x float> %mul
diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl
index 0b17d03d7097..9297c35abfd1 100644
--- a/clang/test/CodeGenHLSL/builtins/length.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/length.hlsl
@@ -8,16 +8,13 @@
 // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
-//
-
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
 // CHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    ret half [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
 // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[P0]])
@@ -28,18 +25,14 @@ half test_length_half(half p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
-//
-
-
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
 // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[P0]], <2 x half> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
 // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> nofpclass(nan inf) [[P0]])
@@ -50,15 +43,14 @@ half test_length_half2(half2 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
 // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[P0]], <3 x half> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
 // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> nofpclass(nan inf) [[P0]])
@@ -69,15 +61,14 @@ half test_length_half3(half3 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
 // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[P0]], <4 x half> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
 // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> nofpclass(nan inf) [[P0]])
@@ -88,14 +79,13 @@ half test_length_half4(half4 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf(
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z17test_length_floatf(
 // CHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    ret float [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf(
 // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]])
@@ -106,15 +96,14 @@ float test_length_float(float p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
 // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[P0]], <2 x float> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
 // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> nofpclass(nan inf) [[P0]])
@@ -125,15 +114,14 @@ float test_length_float2(float2 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
 // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[P0]], <3 x float> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
 // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> nofpclass(nan inf) [[P0]])
@@ -144,15 +132,14 @@ float test_length_float3(float3 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[P0]], <4 x float> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
 // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> nofpclass(nan inf) [[P0]])
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
index 3cb14f8555ca..3b13e43873c7 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @_Z16test_lerp_doubled(
 // CHECK:    [[CONV0:%.*]] = fptrunc {{.*}} double %{{.*}} to float
diff --git a/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl
index 5c63d630c3f3..d7aacdc486ac 100644
--- a/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_double(double p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_double2(double2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_double3(double3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_double4(double4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_int(int p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_int2(int2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_int3(int3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_int4(int4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_uint(uint p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_uint2(uint2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_uint3(uint3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_uint4(uint4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_int64_t(int64_t p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_int64_t2(int64_t2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_int64_t3(int64_t3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_int64_t4(int64_t4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_uint64_t(uint64_t p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_uint64_t2(uint64_t2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_uint64_t3(uint64_t3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_uint64_t4(uint64_t4 p0) { return log(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl
index e489939594a5..0136c1a052ed 100644
--- a/clang/test/CodeGenHLSL/builtins/log.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_log_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_log_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.log.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_log_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_log_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 half test_log_half(half p0) { return log(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_log_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_log_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.log.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_log_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_log_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32(
 half2 test_log_half2(half2 p0) { return log(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_log_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_log_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.log.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_log_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_log_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32(
 half3 test_log_half3(half3 p0) { return log(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_log_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_log_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.log.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_log_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_log_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32(
 half4 test_log_half4(half4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_log_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_log_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_float(float p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_log_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_log_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_float2(float2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_log_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_log_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_float3(float3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_log_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_log_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_float4(float4 p0) { return log(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl
index 1a0539c3517d..e408f4a5d45c 100644
--- a/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_double(double p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_double2(double2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_double3(double3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_double4(double4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_int(int p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_int2(int2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_int3(int3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_int4(int4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_uint(uint p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_uint2(uint2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_uint3(uint3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_uint4(uint4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_int64_t(int64_t p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_int64_t2(int64_t2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_int64_t3(int64_t3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_int64_t4(int64_t4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_uint64_t(uint64_t p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_uint64_t2(uint64_t2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_uint64_t3(uint64_t3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_uint64_t4(uint64_t4 p0) { return log10(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl
index 37c8e837c45a..6a75444143b1 100644
--- a/clang/test/CodeGenHLSL/builtins/log10.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log10.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_log10_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_log10_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.log10.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_log10_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_log10_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 half test_log10_half(half p0) { return log10(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_log10_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_log10_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.log10.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_log10_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_log10_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32(
 half2 test_log10_half2(half2 p0) { return log10(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_log10_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_log10_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.log10.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_log10_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_log10_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32(
 half3 test_log10_half3(half3 p0) { return log10(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_log10_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_log10_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.log10.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_log10_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_log10_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32(
 half4 test_log10_half4(half4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_log10_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_log10_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_float(float p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_log10_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_log10_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_float2(float2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_log10_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_log10_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_float3(float3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_log10_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_log10_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_float4(float4 p0) { return log10(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl
index c35b50d8e490..f88d5ab84921 100644
--- a/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_double(double p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_double2(double2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_double3(double3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_double4(double4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_int(int p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_int2(int2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_int3(int3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_int4(int4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_uint(uint p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_uint2(uint2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_uint3(uint3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_uint4(uint4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_int64_t(int64_t p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_int64_t2(int64_t2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_int64_t3(int64_t3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_int64_t4(int64_t4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_uint64_t(uint64_t p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_uint64_t2(uint64_t2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_uint64_t3(uint64_t3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_uint64_t4(uint64_t4 p0) { return log2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl
index 5159d5bb0fa4..84d73c181089 100644
--- a/clang/test/CodeGenHLSL/builtins/log2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_log2_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_log2_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.log2.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_log2_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_log2_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 half test_log2_half(half p0) { return log2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_log2_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_log2_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.log2.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_log2_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_log2_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32(
 half2 test_log2_half2(half2 p0) { return log2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_log2_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_log2_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.log2.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_log2_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_log2_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32(
 half3 test_log2_half3(half3 p0) { return log2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_log2_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_log2_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.log2.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_log2_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_log2_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32(
 half4 test_log2_half4(half4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_log2_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_log2_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_float(float p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_log2_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_log2_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_float2(float2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_log2_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_log2_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_float3(float3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_log2_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_log2_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_float4(float4 p0) { return log2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
index d952398a6a59..cd7013ba7582 100644
--- a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
@@ -4,14 +4,14 @@
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_max_short4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_max_short4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MAX:%.*]] = call noundef <4 x i16> @llvm.smax.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]])
 // NATIVE_HALF: ret <4 x i16> [[MAX]]
 int16_t4 test_max_short4_mismatch(int16_t4 p0, int16_t p1) { return max(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_max_ushort4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_max_ushort4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MAX:%.*]] = call noundef <4 x i16> @llvm.umax.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]])
@@ -19,61 +19,61 @@ int16_t4 test_max_short4_mismatch(int16_t4 p0, int16_t p1) { return max(p0, p1);
 uint16_t4 test_max_ushort4_mismatch(uint16_t4 p0, uint16_t p1) { return max(p0, p1); }
 #endif
 
-// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_max_int4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_max_int4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call noundef <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]])
 // CHECK: ret <4 x i32> [[MAX]]
 int4 test_max_int4_mismatch(int4 p0, int p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_max_uint4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_max_uint4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call noundef <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]])
 // CHECK: ret <4 x i32> [[MAX]]
 uint4 test_max_uint4_mismatch(uint4 p0, uint p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_max_long4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_max_long4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call noundef <4 x i64> @llvm.smax.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]])
 // CHECK: ret <4 x i64> [[MAX]]
 int64_t4 test_max_long4_mismatch(int64_t4 p0, int64_t p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_max_ulong4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_max_ulong4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call noundef <4 x i64> @llvm.umax.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]])
 // CHECK: ret <4 x i64> [[MAX]]
 uint64_t4 test_max_ulong4_mismatch(uint64_t4 p0, uint64_t p1) { return max(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> {{.*}}test_max_half4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> {{.*}}test_max_half4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x half> poison, half %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x half> [[CONV0]], <4 x half> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.maxnum.v4f16(<4 x half> %{{.*}}, <4 x half> [[CONV1]])
 // NATIVE_HALF: ret <4 x half> [[MAX]]
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_half4_mismatch
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_half4_mismatch
 // NO_HALF: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0
 // NO_HALF: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer
 // NO_HALF: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.maxnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]])
 // NO_HALF: ret <4 x float> [[MAX]]
 half4 test_max_half4_mismatch(half4 p0, half p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_float4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_float4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.maxnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[MAX]]
 float4 test_max_float4_mismatch(float4 p0, float p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.maxnum.v4f64(<4 x double> %{{.*}}, <4 x double> [[CONV1]])
 // CHECK: ret <4 x double> [[MAX]]
 double4 test_max_double4_mismatch(double4 p0, double p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch2
 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.maxnum.v4f64(<4 x double> [[CONV1]], <4 x double> %{{.*}})
diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl
index 0b767335556e..fab53a160c85 100644
--- a/clang/test/CodeGenHLSL/builtins/max.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/max.hlsl
@@ -6,128 +6,128 @@
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_max_short
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z14test_max_short
 // NATIVE_HALF: call i16 @llvm.smax.i16(
 int16_t test_max_short(int16_t p0, int16_t p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_max_short2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z15test_max_short2
 // NATIVE_HALF: call <2 x i16> @llvm.smax.v2i16(
 int16_t2 test_max_short2(int16_t2 p0, int16_t2 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_max_short3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z15test_max_short3
 // NATIVE_HALF: call <3 x i16> @llvm.smax.v3i16
 int16_t3 test_max_short3(int16_t3 p0, int16_t3 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_max_short4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z15test_max_short4
 // NATIVE_HALF: call <4 x i16> @llvm.smax.v4i16
 int16_t4 test_max_short4(int16_t4 p0, int16_t4 p1) { return max(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_max_ushort
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z15test_max_ushort
 // NATIVE_HALF: call i16 @llvm.umax.i16(
 uint16_t test_max_ushort(uint16_t p0, uint16_t p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_max_ushort2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z16test_max_ushort2
 // NATIVE_HALF: call <2 x i16> @llvm.umax.v2i16
 uint16_t2 test_max_ushort2(uint16_t2 p0, uint16_t2 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_max_ushort3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z16test_max_ushort3
 // NATIVE_HALF: call <3 x i16> @llvm.umax.v3i16
 uint16_t3 test_max_ushort3(uint16_t3 p0, uint16_t3 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_max_ushort4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z16test_max_ushort4
 // NATIVE_HALF: call <4 x i16> @llvm.umax.v4i16
 uint16_t4 test_max_ushort4(uint16_t4 p0, uint16_t4 p1) { return max(p0, p1); }
 #endif
 
-// CHECK-LABEL: define noundef i32 @_Z12test_max_int
+// CHECK-LABEL: define hidden noundef i32 @_Z12test_max_int
 // CHECK: call i32 @llvm.smax.i32(
 int test_max_int(int p0, int p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z13test_max_int2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z13test_max_int2
 // CHECK: call <2 x i32> @llvm.smax.v2i32
 int2 test_max_int2(int2 p0, int2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z13test_max_int3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z13test_max_int3
 // CHECK: call <3 x i32> @llvm.smax.v3i32
 int3 test_max_int3(int3 p0, int3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z13test_max_int4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z13test_max_int4
 // CHECK: call <4 x i32> @llvm.smax.v4i32
 int4 test_max_int4(int4 p0, int4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef i32 @_Z13test_max_uint
+// CHECK-LABEL: define hidden noundef i32 @_Z13test_max_uint
 // CHECK: call i32 @llvm.umax.i32(
 int test_max_uint(uint p0, uint p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z14test_max_uint2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z14test_max_uint2
 // CHECK: call <2 x i32> @llvm.umax.v2i32
 uint2 test_max_uint2(uint2 p0, uint2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z14test_max_uint3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z14test_max_uint3
 // CHECK: call <3 x i32> @llvm.umax.v3i32
 uint3 test_max_uint3(uint3 p0, uint3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z14test_max_uint4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z14test_max_uint4
 // CHECK: call <4 x i32> @llvm.umax.v4i32
 uint4 test_max_uint4(uint4 p0, uint4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef i64 @_Z13test_max_long
+// CHECK-LABEL: define hidden noundef i64 @_Z13test_max_long
 // CHECK: call i64 @llvm.smax.i64(
 int64_t test_max_long(int64_t p0, int64_t p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z14test_max_long2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z14test_max_long2
 // CHECK: call <2 x i64> @llvm.smax.v2i64
 int64_t2 test_max_long2(int64_t2 p0, int64_t2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z14test_max_long3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z14test_max_long3
 // CHECK: call <3 x i64> @llvm.smax.v3i64
 int64_t3 test_max_long3(int64_t3 p0, int64_t3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z14test_max_long4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z14test_max_long4
 // CHECK: call <4 x i64> @llvm.smax.v4i64
 int64_t4 test_max_long4(int64_t4 p0, int64_t4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef i64 @_Z14test_max_ulong
+// CHECK-LABEL: define hidden noundef i64 @_Z14test_max_ulong
 // CHECK: call i64 @llvm.umax.i64(
 uint64_t test_max_ulong(uint64_t p0, uint64_t p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z15test_max_ulong2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z15test_max_ulong2
 // CHECK: call <2 x i64> @llvm.umax.v2i64
 uint64_t2 test_max_ulong2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z15test_max_ulong3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z15test_max_ulong3
 // CHECK: call <3 x i64> @llvm.umax.v3i64
 uint64_t3 test_max_ulong3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z15test_max_ulong4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z15test_max_ulong4
 // CHECK: call <4 x i64> @llvm.umax.v4i64
 uint64_t4 test_max_ulong4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_max_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_max_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.maxnum.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_max_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_max_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.maxnum.f32(
 half test_max_half(half p0, half p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_max_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_max_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.maxnum.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_max_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_max_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.maxnum.v2f32(
 half2 test_max_half2(half2 p0, half2 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_max_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_max_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.maxnum.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_max_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_max_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.maxnum.v3f32(
 half3 test_max_half3(half3 p0, half3 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_max_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_max_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.maxnum.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_max_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_max_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.maxnum.v4f32(
 half4 test_max_half4(half4 p0, half4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_max_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_max_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.maxnum.f32(
 float test_max_float(float p0, float p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_max_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_max_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.maxnum.v2f32
 float2 test_max_float2(float2 p0, float2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_max_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_max_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.maxnum.v3f32
 float3 test_max_float3(float3 p0, float3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_max_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_max_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.maxnum.v4f32
 float4 test_max_float4(float4 p0, float4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) double @_Z15test_max_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) double @_Z15test_max_double
 // CHECK: call reassoc nnan ninf nsz arcp afn double @llvm.maxnum.f64(
 double test_max_double(double p0, double p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x double> @_Z16test_max_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x double> @_Z16test_max_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.maxnum.v2f64
 double2 test_max_double2(double2 p0, double2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x double> @_Z16test_max_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x double> @_Z16test_max_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.maxnum.v3f64
 double3 test_max_double3(double3 p0, double3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> @_Z16test_max_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> @_Z16test_max_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.maxnum.v4f64
 double4 test_max_double4(double4 p0, double4 p1) { return max(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
index 5c200f488c24..f81fa128ce9c 100644
--- a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
@@ -4,14 +4,14 @@
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_min_short4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_min_short4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MIN:%.*]] = call noundef <4 x i16> @llvm.smin.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]])
 // NATIVE_HALF: ret <4 x i16> [[MIN]]
 int16_t4 test_min_short4_mismatch(int16_t4 p0, int16_t p1) { return min(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_min_ushort4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_min_ushort4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MIN:%.*]] = call noundef <4 x i16> @llvm.umin.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]])
@@ -19,61 +19,61 @@ int16_t4 test_min_short4_mismatch(int16_t4 p0, int16_t p1) { return min(p0, p1);
 uint16_t4 test_min_ushort4_mismatch(uint16_t4 p0, uint16_t p1) { return min(p0, p1); }
 #endif
 
-// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_min_int4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_min_int4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call noundef <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]])
 // CHECK: ret <4 x i32> [[MIN]]
 int4 test_min_int4_mismatch(int4 p0, int p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_min_uint4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_min_uint4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call noundef <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]])
 // CHECK: ret <4 x i32> [[MIN]]
 uint4 test_min_uint4_mismatch(uint4 p0, uint p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_min_long4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_min_long4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call noundef <4 x i64> @llvm.smin.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]])
 // CHECK: ret <4 x i64> [[MIN]]
 int64_t4 test_min_long4_mismatch(int64_t4 p0, int64_t p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_min_ulong4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_min_ulong4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call noundef <4 x i64> @llvm.umin.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]])
 // CHECK: ret <4 x i64> [[MIN]]
 uint64_t4 test_min_ulong4_mismatch(uint64_t4 p0, uint64_t p1) { return min(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> {{.*}}test_min_half4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> {{.*}}test_min_half4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x half> poison, half %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x half> [[CONV0]], <4 x half> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.minnum.v4f16(<4 x half> %{{.*}}, <4 x half> [[CONV1]])
 // NATIVE_HALF: ret <4 x half> [[MIN]]
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_half4_mismatch
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_half4_mismatch
 // NO_HALF: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0
 // NO_HALF: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer
 // NO_HALF: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.minnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]])
 // NO_HALF: ret <4 x float> [[MIN]]
 half4 test_min_half4_mismatch(half4 p0, half p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_float4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_float4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.minnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[MIN]]
 float4 test_min_float4_mismatch(float4 p0, float p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.minnum.v4f64(<4 x double> %{{.*}}, <4 x double> [[CONV1]])
 // CHECK: ret <4 x double> [[MIN]]
 double4 test_min_double4_mismatch(double4 p0, double p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch2
 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.minnum.v4f64(<4 x double> [[CONV1]], <4 x double> %{{.*}})
diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl
index 508d8b68ea45..b3e8fedff9b1 100644
--- a/clang/test/CodeGenHLSL/builtins/min.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/min.hlsl
@@ -6,131 +6,131 @@
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_min_short
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z14test_min_short
 // NATIVE_HALF: call i16 @llvm.smin.i16(
 int16_t test_min_short(int16_t p0, int16_t p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_min_short2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z15test_min_short2
 // NATIVE_HALF: call <2 x i16> @llvm.smin.v2i16(
 int16_t2 test_min_short2(int16_t2 p0, int16_t2 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_min_short3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z15test_min_short3
 // NATIVE_HALF: call <3 x i16> @llvm.smin.v3i16
 int16_t3 test_min_short3(int16_t3 p0, int16_t3 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_min_short4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z15test_min_short4
 // NATIVE_HALF: call <4 x i16> @llvm.smin.v4i16
 int16_t4 test_min_short4(int16_t4 p0, int16_t4 p1) { return min(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_min_ushort
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z15test_min_ushort
 // NATIVE_HALF: call i16 @llvm.umin.i16(
 uint16_t test_min_ushort(uint16_t p0, uint16_t p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_min_ushort2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z16test_min_ushort2
 // NATIVE_HALF: call <2 x i16> @llvm.umin.v2i16
 uint16_t2 test_min_ushort2(uint16_t2 p0, uint16_t2 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_min_ushort3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z16test_min_ushort3
 // NATIVE_HALF: call <3 x i16> @llvm.umin.v3i16
 uint16_t3 test_min_ushort3(uint16_t3 p0, uint16_t3 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_min_ushort4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z16test_min_ushort4
 // NATIVE_HALF: call <4 x i16> @llvm.umin.v4i16
 uint16_t4 test_min_ushort4(uint16_t4 p0, uint16_t4 p1) { return min(p0, p1); }
 #endif
 
-// CHECK-LABEL: define noundef i32 @_Z12test_min_int
+// CHECK-LABEL: define hidden noundef i32 @_Z12test_min_int
 // CHECK: call i32 @llvm.smin.i32(
 int test_min_int(int p0, int p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z13test_min_int2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z13test_min_int2
 // CHECK: call <2 x i32> @llvm.smin.v2i32
 int2 test_min_int2(int2 p0, int2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z13test_min_int3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z13test_min_int3
 // CHECK: call <3 x i32> @llvm.smin.v3i32
 int3 test_min_int3(int3 p0, int3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z13test_min_int4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z13test_min_int4
 // CHECK: call <4 x i32> @llvm.smin.v4i32
 int4 test_min_int4(int4 p0, int4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef i32 @_Z13test_min_uint
+// CHECK-LABEL: define hidden noundef i32 @_Z13test_min_uint
 // CHECK: call i32 @llvm.umin.i32(
 int test_min_uint(uint p0, uint p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z14test_min_uint2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z14test_min_uint2
 // CHECK: call <2 x i32> @llvm.umin.v2i32
 uint2 test_min_uint2(uint2 p0, uint2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z14test_min_uint3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z14test_min_uint3
 // CHECK: call <3 x i32> @llvm.umin.v3i32
 uint3 test_min_uint3(uint3 p0, uint3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z14test_min_uint4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z14test_min_uint4
 // CHECK: call <4 x i32> @llvm.umin.v4i32
 uint4 test_min_uint4(uint4 p0, uint4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef i64 @_Z13test_min_long
+// CHECK-LABEL: define hidden noundef i64 @_Z13test_min_long
 // CHECK: call i64 @llvm.smin.i64(
 int64_t test_min_long(int64_t p0, int64_t p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z14test_min_long2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z14test_min_long2
 // CHECK: call <2 x i64> @llvm.smin.v2i64
 int64_t2 test_min_long2(int64_t2 p0, int64_t2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z14test_min_long3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z14test_min_long3
 // CHECK: call <3 x i64> @llvm.smin.v3i64
 int64_t3 test_min_long3(int64_t3 p0, int64_t3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z14test_min_long4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z14test_min_long4
 // CHECK: call <4 x i64> @llvm.smin.v4i64
 int64_t4 test_min_long4(int64_t4 p0, int64_t4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef i64 @_Z14test_min_ulong
+// CHECK-LABEL: define hidden noundef i64 @_Z14test_min_ulong
 // CHECK: call i64 @llvm.umin.i64(
 uint64_t test_min_ulong(uint64_t p0, uint64_t p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z15test_min_ulong2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z15test_min_ulong2
 // CHECK: call <2 x i64> @llvm.umin.v2i64
 uint64_t2 test_min_ulong2(uint64_t2 p0, uint64_t2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z15test_min_ulong3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z15test_min_ulong3
 // CHECK: call <3 x i64> @llvm.umin.v3i64
 uint64_t3 test_min_ulong3(uint64_t3 p0, uint64_t3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z15test_min_ulong4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z15test_min_ulong4
 // CHECK: call <4 x i64> @llvm.umin.v4i64
 uint64_t4 test_min_ulong4(uint64_t4 p0, uint64_t4 p1) { return min(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_min_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_min_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.minnum.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_min_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_min_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.minnum.f32(
 half test_min_half(half p0, half p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_min_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_min_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.minnum.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_min_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_min_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.minnum.v2f32(
 half2 test_min_half2(half2 p0, half2 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_min_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_min_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.minnum.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_min_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_min_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.minnum.v3f32(
 half3 test_min_half3(half3 p0, half3 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_min_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_min_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.minnum.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_min_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_min_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.minnum.v4f32(
 half4 test_min_half4(half4 p0, half4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_min_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_min_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.minnum.f32(
 float test_min_float(float p0, float p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_min_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_min_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.minnum.v2f32
 float2 test_min_float2(float2 p0, float2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_min_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_min_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.minnum.v3f32
 float3 test_min_float3(float3 p0, float3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_min_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_min_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.minnum.v4f32
 float4 test_min_float4(float4 p0, float4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) double @_Z15test_min_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) double @_Z15test_min_double
 // CHECK: call reassoc nnan ninf nsz arcp afn double @llvm.minnum.f64(
 double test_min_double(double p0, double p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x double> @_Z16test_min_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x double> @_Z16test_min_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.minnum.v2f64
 double2 test_min_double2(double2 p0, double2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x double> @_Z16test_min_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x double> @_Z16test_min_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.minnum.v3f64
 double3 test_min_double3(double3 p0, double3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> @_Z16test_min_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> @_Z16test_min_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.minnum.v4f64
 double4 test_min_double4(double4 p0, double4 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.minnum.v4f64
 double4 test_min_double4_mismatch(double4 p0, double p1) { return min(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl
index e9baa25fc640..52ff7da94c4f 100644
--- a/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].normalize.f32(float
diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
index 830fc26b7acf..cc2378756a50 100644
--- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].normalize.f16(half
diff --git a/clang/test/CodeGenHLSL/builtins/or.hlsl b/clang/test/CodeGenHLSL/builtins/or.hlsl
index 69c57c5455f7..66cc5572a75b 100644
--- a/clang/test/CodeGenHLSL/builtins/or.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/or.hlsl
@@ -2,7 +2,7 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-//CHECK-LABEL: define noundef i1 @_Z14test_or_scalarbb(
+//CHECK-LABEL: define hidden noundef i1 @_Z14test_or_scalarbb(
 //CHECK-SAME: i1 noundef [[X:%.*]], i1 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[HLSL_OR:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
@@ -12,7 +12,7 @@ bool test_or_scalar(bool x, bool y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <2 x i1> @_Z13test_or_bool2Dv2_bS_(
+//CHECK-LABEL: define hidden noundef <2 x i1> @_Z13test_or_bool2Dv2_bS_(
 //CHECK-SAME: <2 x i1> noundef [[X:%.*]], <2 x i1> noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[HLSL_OR:%.*]] = or <2 x i1> [[A:%.*]], [[B:%.*]]
@@ -22,7 +22,7 @@ bool2 test_or_bool2(bool2 x, bool2 y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <3 x i1> @_Z13test_or_bool3Dv3_bS_(
+//CHECK-LABEL: define hidden noundef <3 x i1> @_Z13test_or_bool3Dv3_bS_(
 //CHECK-SAME: <3 x i1> noundef [[X:%.*]], <3 x i1> noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[HLSL_OR:%.*]] = or <3 x i1> [[A:%.*]], [[B:%.*]]
@@ -32,7 +32,7 @@ bool3 test_or_bool3(bool3 x, bool3 y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <4 x i1> @_Z13test_or_bool4Dv4_bS_(
+//CHECK-LABEL: define hidden noundef <4 x i1> @_Z13test_or_bool4Dv4_bS_(
 //CHECK-SAME: <4 x i1> noundef [[X:%.*]], <4 x i1> noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[HLSL_OR:%.*]] = or <4 x i1> [[A:%.*]], [[B:%.*]]
@@ -42,7 +42,7 @@ bool4 test_or_bool4(bool4 x, bool4 y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef i1 @_Z11test_or_intii(
+//CHECK-LABEL: define hidden noundef i1 @_Z11test_or_intii(
 //CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[TOBBOL:%.*]] = icmp ne i32 [[A:%.*]], 0
@@ -54,7 +54,7 @@ bool test_or_int(int x, int y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <4 x i1> @_Z12test_or_int4Dv4_iS_(
+//CHECK-LABEL: define hidden noundef <4 x i1> @_Z12test_or_int4Dv4_iS_(
 //CHECK-SAME: <4 x i32> noundef [[X:%.*]], <4 x i32> noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[TOBOOL:%.*]] = icmp ne <4 x i32> [[A:%.*]], zeroinitializer
@@ -66,7 +66,7 @@ bool4 test_or_int4(int4 x, int4 y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <4 x i1> @_Z14test_or_float4Dv4_fS_(
+//CHECK-LABEL: define hidden noundef <4 x i1> @_Z14test_or_float4Dv4_fS_(
 //CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[TOBOOL:%.*]] =  fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[A:%.*]], zeroinitializer
diff --git a/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl
index 39003aef7b7b..0d1f3d3546a3 100644
--- a/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl
@@ -2,125 +2,125 @@
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK \
 // RUN:  -DFLOATATTRS="reassoc nnan ninf nsz arcp afn"
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_double
 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] double %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] double %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_double(double p0, double p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_double2
 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] <2 x double> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] <2 x double> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_double2(double2 p0, double2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_double3
 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] <3 x double> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] <3 x double> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_double3(double3 p0, double3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_double4
 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] <4 x double> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] <4 x double> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[POW]]
 float4 test_pow_double4(double4 p0, double4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_int
 // CHECK: [[CONV0:%.*]] = sitofp i32 %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = sitofp i32 %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_int(int p0, int p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int2
 // CHECK: [[CONV0:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_int2(int2 p0, int2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int3
 // CHECK: [[CONV0:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_int3(int3 p0, int3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int4
 // CHECK: [[CONV0:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[POW]]
 float4 test_pow_int4(int4 p0, int4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_uint
 // CHECK: [[CONV0:%.*]] = uitofp i32 %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = uitofp i32 %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_uint(uint p0, uint p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint2
 // CHECK: [[CONV0:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_uint2(uint2 p0, uint2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint3
 // CHECK: [[CONV0:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_uint3(uint3 p0, uint3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint4
 // CHECK: [[CONV0:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[POW]]
 float4 test_pow_uint4(uint4 p0, uint4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_int64_t
 // CHECK: [[CONV0:%.*]] = sitofp i64 %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = sitofp i64 %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_int64_t(int64_t p0, int64_t p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int64_t2
 // CHECK: [[CONV0:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_int64_t2(int64_t2 p0, int64_t2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int64_t3
 // CHECK: [[CONV0:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_int64_t3(int64_t3 p0, int64_t3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int64_t4
 // CHECK: [[CONV0:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[POW]]
 float4 test_pow_int64_t4(int64_t4 p0, int64_t4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_uint64_t
 // CHECK: [[CONV0:%.*]] = uitofp i64 %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = uitofp i64 %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_uint64_t(uint64_t p0, uint64_t p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint64_t2
 // CHECK: [[CONV0:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_uint64_t2(uint64_t2 p0, uint64_t2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint64_t3
 // CHECK: [[CONV0:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_uint64_t3(uint64_t3 p0, uint64_t3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint64_t4
 // CHECK: [[CONV0:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl
index fd21f1b94c57..fcde755e15fc 100644
--- a/clang/test/CodeGenHLSL/builtins/pow.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_pow_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_pow_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.pow.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_pow_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_pow_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.pow.f32(
 half test_pow_half(half p0, half p1) { return pow(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_pow_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_pow_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.pow.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_pow_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_pow_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.pow.v2f32(
 half2 test_pow_half2(half2 p0, half2 p1) { return pow(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_pow_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_pow_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.pow.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_pow_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_pow_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.pow.v3f32(
 half3 test_pow_half3(half3 p0, half3 p1) { return pow(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_pow_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_pow_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.pow.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_pow_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_pow_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.pow.v4f32(
 half4 test_pow_half4(half4 p0, half4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_pow_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_pow_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.pow.f32(
 float test_pow_float(float p0, float p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_pow_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_pow_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.pow.v2f32
 float2 test_pow_float2(float2 p0, float2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_pow_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_pow_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.pow.v3f32
 float3 test_pow_float3(float3 p0, float3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_pow_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_pow_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.pow.v4f32
 float4 test_pow_float4(float4 p0, float4 p1) { return pow(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl
index d0cfc7b60265..4b12f590edcd 100644
--- a/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DTARGET=dx -DFNATTRS="noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].radians.f32(
diff --git a/clang/test/CodeGenHLSL/builtins/radians.hlsl b/clang/test/CodeGenHLSL/builtins/radians.hlsl
index efdeb9f6e142..f281747fbf29 100644
--- a/clang/test/CodeGenHLSL/builtins/radians.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/radians.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DTARGET=dx -DFNATTRS="noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DTARGET=dx -DFNATTRS="noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
 
 
 // NATIVE_HALF: define [[FNATTRS]] half @
diff --git a/clang/test/CodeGenHLSL/builtins/rcp.hlsl b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
index 8f07f3a03153..cdfaa3c5f1ee 100644
--- a/clang/test/CodeGenHLSL/builtins/rcp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
@@ -13,90 +13,90 @@
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_NO_HALF,SPIR_CHECK
 
-// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) half @
-// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) half @
+// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) half @
+// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) half @
 // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn half 0xH3C00, %{{.*}} 
 // NATIVE_HALF: ret half %hlsl.rcp
-// DXIL_NO_HALF: define noundef nofpclass(nan inf) float @
-// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) float @
+// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) float @
+// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) float @
 // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn float 1.000000e+00, %{{.*}}
 // NO_HALF: ret float %hlsl.rcp
 half test_rcp_half(half p0) { return rcp(p0); }
 
-// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) <2 x half> @
-// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) <2 x half> @
+// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) <2 x half> @
+// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @
 // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x half> splat (half  0xH3C00), %{{.*}} 
 // NATIVE_HALF: ret <2 x half> %hlsl.rcp
-// DXIL_NO_HALF: define noundef nofpclass(nan inf) <2 x float> @
-// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) <2 x float> @
+// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) <2 x float> @
+// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @
 // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), %{{.*}}
 // NO_HALF: ret <2 x float> %hlsl.rcp
 half2 test_rcp_half2(half2 p0) { return rcp(p0); }
 
-// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) <3 x half> @
-// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) <3 x half> @
+// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) <3 x half> @
+// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @
 // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x half> splat (half  0xH3C00), %{{.*}} 
 // NATIVE_HALF: ret <3 x half> %hlsl.rcp
-// DXIL_NO_HALF: define noundef nofpclass(nan inf) <3 x float> @
-// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) <3 x float> @
+// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) <3 x float> @
+// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @
 // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), %{{.*}}
 // NO_HALF: ret <3 x float> %hlsl.rcp
 half3 test_rcp_half3(half3 p0) { return rcp(p0); }
 
-// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) <4 x half> @
-// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) <4 x half> @
+// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) <4 x half> @
+// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @
 // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x half> splat (half  0xH3C00), %{{.*}} 
 // NATIVE_HALF: ret <4 x half> %hlsl.rcp
-// DXIL_NO_HALF: define noundef nofpclass(nan inf) <4 x float> @
-// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) <4 x float> @
+// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) <4 x float> @
+// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @
 // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), %{{.*}}
 // NO_HALF: ret <4 x float> %hlsl.rcp
 half4 test_rcp_half4(half4 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) float @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) float @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) float @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) float @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn float 1.000000e+00, %{{.*}}
 // CHECK: ret float %hlsl.rcp
 float test_rcp_float(float p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <2 x float> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <2 x float> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <2 x float> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), %{{.*}}
 // CHECK: ret <2 x float> %hlsl.rcp
 float2 test_rcp_float2(float2 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <3 x float> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <3 x float> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), %{{.*}}
 // CHECK: ret <3 x float> %hlsl.rcp
 float3 test_rcp_float3(float3 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <4 x float> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <4 x float> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), %{{.*}}
 // CHECK: ret <4 x float> %hlsl.rcp
 float4 test_rcp_float4(float4 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) double @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) double @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) double @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) double @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn double 1.000000e+00, %{{.*}} 
 // CHECK: ret double %hlsl.rcp
 double test_rcp_double(double p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <2 x double> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <2 x double> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <2 x double> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <2 x double> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x double> splat (double 1.000000e+00), %{{.*}}
 // CHECK: ret <2 x double> %hlsl.rcp
 double2 test_rcp_double2(double2 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <3 x double> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <3 x double> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <3 x double> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <3 x double> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x double> splat (double 1.000000e+00), %{{.*}}
 // CHECK: ret <3 x double> %hlsl.rcp
 double3 test_rcp_double3(double3 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <4 x double> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <4 x double> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <4 x double> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <4 x double> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x double> splat (double 1.000000e+00), %{{.*}}
 // CHECK: ret <4 x double> %hlsl.rcp
 double4 test_rcp_double4(double4 p0) { return rcp(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
index c082e63ac1da..65fefd801ffe 100644
--- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
@@ -6,7 +6,7 @@
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
 // CHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000
@@ -15,7 +15,7 @@
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[I]], [[MUL2_I]]
 // CHECK-NEXT:    ret half [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
 // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000
@@ -28,7 +28,7 @@ half test_reflect_half(half I, half N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_(
 // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) [[N]])
@@ -39,7 +39,7 @@ half test_reflect_half(half I, half N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <2 x half> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_(
 // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.reflect.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) [[N]])
@@ -49,7 +49,7 @@ half2 test_reflect_half2(half2 I, half2 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_(
 // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]])
@@ -60,7 +60,7 @@ half2 test_reflect_half2(half2 I, half2 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <3 x half> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_(
 // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.reflect.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]])
@@ -70,7 +70,7 @@ half3 test_reflect_half3(half3 I, half3 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_(
 // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]])
@@ -81,7 +81,7 @@ half3 test_reflect_half3(half3 I, half3 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <4 x half> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_(
 // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.reflect.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]])
@@ -91,7 +91,7 @@ half4 test_reflect_half4(half4 I, half4 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_reflect_floatff(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_reflect_floatff(
 // CHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00
@@ -100,7 +100,7 @@ half4 test_reflect_half4(half4 I, half4 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[I]], [[MUL2_I]]
 // CHECK-NEXT:    ret float [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_reflect_floatff(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_reflect_floatff(
 // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00
@@ -113,7 +113,7 @@ float test_reflect_float(float I, float N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_(
 // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]])
@@ -124,7 +124,7 @@ float test_reflect_float(float I, float N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <2 x float> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_(
 // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.reflect.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]])
@@ -134,7 +134,7 @@ float2 test_reflect_float2(float2 I, float2 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_(
 // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]])
@@ -145,7 +145,7 @@ float2 test_reflect_float2(float2 I, float2 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <3 x float> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_(
 // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.reflect.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]])
@@ -155,7 +155,7 @@ float3 test_reflect_float3(float3 I, float3 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]])
@@ -166,7 +166,7 @@ float3 test_reflect_float3(float3 I, float3 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <4 x float> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_(
 // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.reflect.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]])
diff --git a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
index fe137b9cae4e..91375c8f4eb8 100644
--- a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
@@ -3,25 +3,25 @@
 // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
 
 #ifdef __HLSL_ENABLE_16_BIT
-// CHECK: define noundef i16 @
+// CHECK: define hidden noundef i16 @
 // CHECK: call i16 @llvm.bitreverse.i16(
 uint16_t test_bitreverse_ushort(uint16_t p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <2 x i16> @
+// CHECK: define hidden noundef <2 x i16> @
 // CHECK: call <2 x i16> @llvm.bitreverse.v2i16
 uint16_t2 test_bitreverse_ushort2(uint16_t2 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <3 x i16> @
+// CHECK: define hidden noundef <3 x i16> @
 // CHECK: call <3 x i16> @llvm.bitreverse.v3i16
 uint16_t3 test_bitreverse_ushort3(uint16_t3 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <4 x i16> @
+// CHECK: define hidden noundef <4 x i16> @
 // CHECK: call <4 x i16> @llvm.bitreverse.v4i16
 uint16_t4 test_bitreverse_ushort4(uint16_t4 p0)
 {
@@ -29,50 +29,50 @@ uint16_t4 test_bitreverse_ushort4(uint16_t4 p0)
 }
 #endif
 
-// CHECK: define noundef i32 @
+// CHECK: define hidden noundef i32 @
 // CHECK: call i32 @llvm.bitreverse.i32(
 int test_bitreverse_uint(uint p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <2 x i32> @
+// CHECK: define hidden noundef <2 x i32> @
 // CHECK: call <2 x i32> @llvm.bitreverse.v2i32
 uint2 test_bitreverse_uint2(uint2 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <3 x i32> @
+// CHECK: define hidden noundef <3 x i32> @
 // CHECK: call <3 x i32> @llvm.bitreverse.v3i32
 uint3 test_bitreverse_uint3(uint3 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <4 x i32> @
+// CHECK: define hidden noundef <4 x i32> @
 // CHECK: call <4 x i32> @llvm.bitreverse.v4i32
 uint4 test_bitreverse_uint4(uint4 p0)
 {
 	return reversebits(p0);
 }
 
-// CHECK: define noundef i64 @
+// CHECK: define hidden noundef i64 @
 // CHECK: call i64 @llvm.bitreverse.i64(
 uint64_t test_bitreverse_long(uint64_t p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <2 x i64> @
+// CHECK: define hidden noundef <2 x i64> @
 // CHECK: call <2 x i64> @llvm.bitreverse.v2i64
 uint64_t2 test_bitreverse_long2(uint64_t2 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <3 x i64> @
+// CHECK: define hidden noundef <3 x i64> @
 // CHECK: call <3 x i64> @llvm.bitreverse.v3i64
 uint64_t3 test_bitreverse_long3(uint64_t3 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <4 x i64> @
+// CHECK: define hidden noundef <4 x i64> @
 // CHECK: call <4 x i64> @llvm.bitreverse.v4i64
 uint64_t4 test_bitreverse_long4(uint64_t4 p0)
 {
diff --git a/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl
index 109633a64d34..3b07fcec064d 100644
--- a/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl
@@ -2,87 +2,87 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_double
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_double(double p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_double2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_double2(double2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_double3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_double3(double3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_double4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_double4(double4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_int
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_int(int p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_int2(int2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_int3(int3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_int4(int4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_uint
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_uint(uint p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_uint2(uint2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_uint3(uint3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_uint4(uint4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_int64_t
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_int64_t(int64_t p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int64_t2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_int64_t2(int64_t2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int64_t3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_int64_t3(int64_t3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int64_t4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_int64_t4(int64_t4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_uint64_t
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_uint64_t(uint64_t p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint64_t2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_uint64_t2(uint64_t2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint64_t3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_uint64_t3(uint64_t3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint64_t4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_uint64_t4(uint64_t4 p0) { return round(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl
index a945a9677abb..755f2e86fb11 100644
--- a/clang/test/CodeGenHLSL/builtins/round.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/round.hlsl
@@ -5,48 +5,48 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_round_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_round_half
 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn half @llvm.roundeven.f16(
 // NATIVE_HALF: ret half %elt.roundeven
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_round_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_round_half
 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // NO_HALF: ret float %elt.roundeven
 half test_round_half(half p0) { return round(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_round_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_round_half2
 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.roundeven.v2f16
 // NATIVE_HALF: ret <2 x half> %elt.roundeven
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_round_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_round_half2
 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32(
 // NO_HALF: ret <2 x float> %elt.roundeven
 half2 test_round_half2(half2 p0) { return round(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_round_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_round_half3
 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.roundeven.v3f16
 // NATIVE_HALF: ret <3 x half> %elt.roundeven
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_round_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_round_half3
 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32(
 // NO_HALF: ret <3 x float> %elt.roundeven
 half3 test_round_half3(half3 p0) { return round(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_round_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_round_half4
 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.roundeven.v4f16
 // NATIVE_HALF: ret <4 x half> %elt.roundeven
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_round_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_round_half4
 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32(
 // NO_HALF: ret <4 x float> %elt.roundeven
 half4 test_round_half4(half4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_round_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_round_float
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_float(float p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_round_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_round_float2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_float2(float2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_round_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_round_float3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_float3(float3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_round_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_round_float4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_float4(float4 p0) { return round(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl
index 09f21f366b9d..262f306b9257 100644
--- a/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: %hlsl.rsqrt = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].rsqrt.f32(
diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
index 6c9b1f643713..9c398fd6f06c 100644
--- a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: %hlsl.rsqrt = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].rsqrt.f16(
diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl
index 8cc910933f46..cbdb92938893 100644
--- a/clang/test/CodeGenHLSL/builtins/sign.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DTARGET=dx -DFNATTRS=noundef
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DTARGET=dx -DFNATTRS=noundef
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef"
 
 // NATIVE_HALF: define [[FNATTRS]] i32 @
 // NATIVE_HALF: %hlsl.sign = call i32 @llvm.[[TARGET]].sign.f16(
diff --git a/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl
index a5522e4f28b7..e471cb3d42c5 100644
--- a/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_double(double p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_double2(double2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_double3(double3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_double4(double4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_int(int p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_int2(int2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_int3(int3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_int4(int4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_uint(uint p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_uint2(uint2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_uint3(uint3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_uint4(uint4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_int64_t(int64_t p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_int64_t2(int64_t2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_int64_t3(int64_t3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_int64_t4(int64_t4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_uint64_t(uint64_t p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_uint64_t2(uint64_t2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_uint64_t3(uint64_t3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_uint64_t4(uint64_t4 p0) { return sin(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl
index 69c657239ef9..9bbe97997aa3 100644
--- a/clang/test/CodeGenHLSL/builtins/sin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_sin_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_sin_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.sin.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_sin_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_sin_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 half test_sin_half(half p0) { return sin(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_sin_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_sin_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.sin.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_sin_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_sin_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32(
 half2 test_sin_half2(half2 p0) { return sin(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_sin_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_sin_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.sin.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_sin_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_sin_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32(
 half3 test_sin_half3(half3 p0) { return sin(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_sin_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_sin_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.sin.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_sin_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_sin_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32(
 half4 test_sin_half4(half4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_sin_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_sin_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_float(float p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_sin_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_sin_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_float2(float2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_sin_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_sin_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_float3(float3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_sin_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_sin_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_float4(float4 p0) { return sin(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
index d3e5c1059029..bef64ce77d47 100644
--- a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
@@ -6,7 +6,7 @@
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
 // CHECK-SAME: half noundef nofpclass(nan inf) [[MIN:%.*]], half noundef nofpclass(nan inf) [[MAX:%.*]], half noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[MIN]]
@@ -19,7 +19,7 @@
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret half [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
 // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[MIN:%.*]], half noundef nofpclass(nan inf) [[MAX:%.*]], half noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.smoothstep.f16(half nofpclass(nan inf) [[MIN]], half nofpclass(nan inf) [[MAX]], half nofpclass(nan inf) [[X]])
@@ -27,7 +27,7 @@
 //
 half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_(
 // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[MIN]]
@@ -40,7 +40,7 @@ half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <2 x half> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_(
 // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.smoothstep.v2f16(<2 x half> nofpclass(nan inf) [[MIN]], <2 x half> nofpclass(nan inf) [[MAX]], <2 x half> nofpclass(nan inf) [[X]])
@@ -48,7 +48,7 @@ half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, M
 //
 half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_(
 // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[MIN]]
@@ -61,7 +61,7 @@ half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <3 x half> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_(
 // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.smoothstep.v3f16(<3 x half> nofpclass(nan inf) [[MIN]], <3 x half> nofpclass(nan inf) [[MAX]], <3 x half> nofpclass(nan inf) [[X]])
@@ -69,7 +69,7 @@ half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(M
 //
 half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_(
 // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[MIN]]
@@ -82,7 +82,7 @@ half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <4 x half> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_(
 // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.smoothstep.v4f16(<4 x half> nofpclass(nan inf) [[MIN]], <4 x half> nofpclass(nan inf) [[MAX]], <4 x half> nofpclass(nan inf) [[X]])
@@ -90,7 +90,7 @@ half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(M
 //
 half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff(
 // CHECK-SAME: float noundef nofpclass(nan inf) [[MIN:%.*]], float noundef nofpclass(nan inf) [[MAX:%.*]], float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[MIN]]
@@ -103,7 +103,7 @@ half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret float [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff(
 // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[MIN:%.*]], float noundef nofpclass(nan inf) [[MAX:%.*]], float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.smoothstep.f32(float nofpclass(nan inf) [[MIN]], float nofpclass(nan inf) [[MAX]], float nofpclass(nan inf) [[X]])
@@ -111,7 +111,7 @@ half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(M
 //
 float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_(
 // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[MIN]]
@@ -124,7 +124,7 @@ float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <2 x float> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_(
 // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.smoothstep.v2f32(<2 x float> nofpclass(nan inf) [[MIN]], <2 x float> nofpclass(nan inf) [[MAX]], <2 x float> nofpclass(nan inf) [[X]])
@@ -132,7 +132,7 @@ float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(M
 //
 float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_(
 // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[MIN]]
@@ -145,7 +145,7 @@ float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smooths
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <3 x float> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_(
 // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.smoothstep.v3f32(<3 x float> nofpclass(nan inf) [[MIN]], <3 x float> nofpclass(nan inf) [[MAX]], <3 x float> nofpclass(nan inf) [[X]])
@@ -153,7 +153,7 @@ float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smooths
 //
 float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[MIN]]
@@ -166,7 +166,7 @@ float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smooths
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <4 x float> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_(
 // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.smoothstep.v4f32(<4 x float> nofpclass(nan inf) [[MIN]], <4 x float> nofpclass(nan inf) [[MAX]], <4 x float> nofpclass(nan inf) [[X]])
diff --git a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
index a883c9d5cc35..aeb2b79e9029 100644
--- a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
@@ -8,7 +8,7 @@
 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} i32 {{.*}}test_scalar{{.*}}(double {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} i32 {{.*}}test_scalar{{.*}}(double {{.*}} [[VALD:%.*]])
 // SPIRV-NOT:  @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load double, ptr [[VALD]].addr, align 8
 // SPIRV-NEXT: [[CAST:%.*]] = bitcast double [[LOAD]] to <2 x i32>
@@ -26,7 +26,7 @@ uint test_scalar(double D) {
 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} <1 x i32> {{.*}}test_double1{{.*}}(<1 x double> {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} <1 x i32> {{.*}}test_double1{{.*}}(<1 x double> {{.*}} [[VALD:%.*]])
 // SPIRV-NOT:  @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load <1 x double>, ptr [[VALD]].addr, align 8
 // SPIRV-NEXT: [[TRUNC:%.*]] = extractelement <1 x double> [[LOAD]], i64 0
@@ -44,7 +44,7 @@ uint1 test_double1(double1 D) {
 // CHECK-NEXT: extractvalue { <2 x i32>, <2 x i32> } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { <2 x i32>, <2 x i32> } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} <2 x i32> {{.*}}test_vector2{{.*}}(<2 x double> {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} <2 x i32> {{.*}}test_vector2{{.*}}(<2 x double> {{.*}} [[VALD:%.*]])
 // SPIRV-NOT:  @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load <2 x double>, ptr [[VALD]].addr, align 16
 // SPIRV-NEXT: [[CAST1:%.*]] = bitcast <2 x double> [[LOAD]] to <4 x i32>
@@ -61,7 +61,7 @@ uint2 test_vector2(double2 D) {
 // CHECK-NEXT: extractvalue { <3 x i32>, <3 x i32> } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { <3 x i32>, <3 x i32> } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} <3 x i32> {{.*}}test_vector3{{.*}}(<3 x double> {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} <3 x i32> {{.*}}test_vector3{{.*}}(<3 x double> {{.*}} [[VALD:%.*]])
 // SPIRV-NOT:  @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load <3 x double>, ptr [[VALD]].addr, align 32
 // SPIRV-NEXT: [[CAST1:%.*]] = bitcast <3 x double> [[LOAD]] to <6 x i32>
@@ -78,7 +78,7 @@ uint3 test_vector3(double3 D) {
 // CHECK-NEXT: extractvalue { <4 x i32>, <4 x i32> } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { <4 x i32>, <4 x i32> } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} <4 x i32> {{.*}}test_vector4{{.*}}(<4 x double> {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} <4 x i32> {{.*}}test_vector4{{.*}}(<4 x double> {{.*}} [[VALD:%.*]])
 // SPIRV-NOT: @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load <4 x double>, ptr [[VALD]].addr, align 32
 // SPIRV-NEXT: [[CAST1:%.*]] = bitcast <4 x double> [[LOAD]] to <8 x i32>
diff --git a/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl
index 48b74c9db5c6..d4de244f38b3 100644
--- a/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl
@@ -2,87 +2,87 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_double
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_double(double p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_double2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_double2(double2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_double3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_double3(double3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_double4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_double4(double4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_int
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_int(int p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_int2(int2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_int3(int3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_int4(int4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_uint(uint p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_uint2(uint2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_uint3(uint3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_uint4(uint4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_int64_t
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_int64_t(int64_t p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int64_t2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_int64_t2(int64_t2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int64_t3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_int64_t3(int64_t3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int64_t4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_int64_t4(int64_t4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint64_t
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_uint64_t(uint64_t p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint64_t2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_uint64_t2(uint64_t2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint64_t3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_uint64_t3(uint64_t3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint64_t4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_uint64_t4(uint64_t4 p0) { return sqrt(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
index 94d966f0bef8..31839f6bc177 100644
--- a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
@@ -5,48 +5,48 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_sqrt_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_sqrt_half
 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn half @llvm.sqrt.f16(
 // NATIVE_HALF: ret half %{{.*}}
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_sqrt_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_sqrt_half
 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // NO_HALF: ret float %{{.*}}
 half test_sqrt_half(half p0) { return sqrt(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_sqrt_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_sqrt_half2
 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.sqrt.v2f16
 // NATIVE_HALF: ret <2 x half> %{{.*}}
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_sqrt_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_sqrt_half2
 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32(
 // NO_HALF: ret <2 x float> %{{.*}}
 half2 test_sqrt_half2(half2 p0) { return sqrt(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_sqrt_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_sqrt_half3
 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.sqrt.v3f16
 // NATIVE_HALF: ret <3 x half> %{{.*}}
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_sqrt_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_sqrt_half3
 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32(
 // NO_HALF: ret <3 x float> %{{.*}}
 half3 test_sqrt_half3(half3 p0) { return sqrt(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_sqrt_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_sqrt_half4
 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.sqrt.v4f16
 // NATIVE_HALF: ret <4 x half> %{{.*}}
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_sqrt_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_sqrt_half4
 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32(
 // NO_HALF: ret <4 x float> %{{.*}}
 half4 test_sqrt_half4(half4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_sqrt_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_sqrt_float
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_float(float p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_sqrt_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_sqrt_float2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_float2(float2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_sqrt_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_sqrt_float3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_float3(float3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_sqrt_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_sqrt_float4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_float4(float4 p0) { return sqrt(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl
index d3b979254391..f55a8f8aff92 100644
--- a/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].step.f32(float
diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl
index 49d09e5c6fe6..be0ffbd79464 100644
--- a/clang/test/CodeGenHLSL/builtins/step.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/step.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].step.f16(half
diff --git a/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl
index d913aabfb406..51eb20c58e40 100644
--- a/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl
@@ -2,82 +2,82 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_double(double p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_double2(double2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_double3(double3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_double4(double4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_int(int p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_int2(int2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_int3(int3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_int4(int4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_uint(uint p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_uint2(uint2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_uint3(uint3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_uint4(uint4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_int64_t(int64_t p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_int64_t2(int64_t2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_int64_t3(int64_t3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_int64_t4(int64_t4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_uint64_t(uint64_t p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_uint64_t2(uint64_t2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_uint64_t3(uint64_t3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_uint64_t4(uint64_t4 p0) { return trunc(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
index 26de5bf94c3c..c1c6ee4119f0 100644
--- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
@@ -5,42 +5,42 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_trunc_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_trunc_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.trunc.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_trunc_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_trunc_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 half test_trunc_half(half p0) { return trunc(p0); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_trunc_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_trunc_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.trunc.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_trunc_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_trunc_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32(
 half2 test_trunc_half2(half2 p0) { return trunc(p0); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_trunc_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_trunc_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.trunc.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_trunc_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_trunc_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32(
 half3 test_trunc_half3(half3 p0) { return trunc(p0); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_trunc_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_trunc_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.trunc.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_trunc_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_trunc_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32(
 half4 test_trunc_half4(half4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_trunc_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_trunc_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_float(float p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_trunc_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_trunc_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_float2(float2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_trunc_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_trunc_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_float3(float3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_trunc_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_trunc_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_float4(float4 p0) { return trunc(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
index 3ab8048146ad..0df3598a3cc3 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-// CHECK: define spir_func void @{{.*main.*}}() [[A0:#[0-9]+]] {
+// CHECK: define hidden spir_func void @{{.*main.*}}() [[A0:#[0-9]+]] {
 void main() {
 // CHECK: entry:
 // CHECK:   %[[CT_ENTRY:[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl
index 8e1f2d69e743..9034cae25403 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl
@@ -6,8 +6,8 @@
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,CHECK-DXIL
 
-// CHECK-SPIRV: define spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
-// CHECK-DXIL: define noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
+// CHECK-SPIRV: define hidden spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
+// CHECK-DXIL: define hidden noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
 // CHECK-SPIRV: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry()
 // CHECK-SPIRV: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ]
 // CHECK-DXIL: call i32 @llvm.dx.wave.getlaneindex()
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl
index 12b120d0c067..a71b988417f0 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-// CHECK: define spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] {
+// CHECK: define hidden spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] {
 // CHECK: %[[C1:[0-9]+]] = call token @llvm.experimental.convergence.entry()
 // CHECK: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[C1]]) ]
 uint test_1() {
@@ -10,7 +10,7 @@ uint test_1() {
 
 // CHECK-DAG: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]]
 
-// CHECK: define spir_func noundef i32 @_Z6test_2v() [[A0]] {
+// CHECK: define hidden spir_func noundef i32 @_Z6test_2v() [[A0]] {
 // CHECK: %[[C2:[0-9]+]] = call token @llvm.experimental.convergence.entry()
 // CHECK: call spir_func noundef i32 @_Z6test_1v() {{#[0-9]+}} [ "convergencectrl"(token %[[C2]]) ]
 uint test_2() {
diff --git a/clang/test/CodeGenHLSL/cbuffer.hlsl b/clang/test/CodeGenHLSL/cbuffer.hlsl
index eebf0f682d3d..b58a49b41eb9 100644
--- a/clang/test/CodeGenHLSL/cbuffer.hlsl
+++ b/clang/test/CodeGenHLSL/cbuffer.hlsl
@@ -46,14 +46,14 @@ cbuffer CBScalars : register(b1, space5) {
 
 // CHECK: @CBScalars.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBScalars,
 // CHECK-SAME: 56, 0, 8, 16, 24, 32, 36, 40, 48))
-// CHECK: @a1 = external addrspace(2) global float, align 4
-// CHECK: @a2 = external addrspace(2) global double, align 8
-// CHECK: @a3 = external addrspace(2) global half, align 2
-// CHECK: @a4 = external addrspace(2) global i64, align 8
-// CHECK: @a5 = external addrspace(2) global i32, align 4
-// CHECK: @a6 = external addrspace(2) global i16, align 2
-// CHECK: @a7 = external addrspace(2) global i32, align 4
-// CHECK: @a8 = external addrspace(2) global i64, align 8
+// CHECK: @a1 = external hidden addrspace(2) global float, align 4
+// CHECK: @a2 = external hidden addrspace(2) global double, align 8
+// CHECK: @a3 = external hidden addrspace(2) global half, align 2
+// CHECK: @a4 = external hidden addrspace(2) global i64, align 8
+// CHECK: @a5 = external hidden addrspace(2) global i32, align 4
+// CHECK: @a6 = external hidden addrspace(2) global i16, align 2
+// CHECK: @a7 = external hidden addrspace(2) global i32, align 4
+// CHECK: @a8 = external hidden addrspace(2) global i64, align 8
 // CHECK: @CBScalars.str = private unnamed_addr constant [10 x i8] c"CBScalars\00", align 1
 
 cbuffer CBVectors {
@@ -69,13 +69,13 @@ cbuffer CBVectors {
 
 // CHECK: @CBVectors.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBVectors,
 // CHECK-SAME: 136, 0, 16, 40, 48, 80, 96, 112))
-// CHECK: @b1 = external addrspace(2) global <3 x float>, align 16
-// CHECK: @b2 = external addrspace(2) global <3 x double>, align 32
-// CHECK: @b3 = external addrspace(2) global <2 x half>, align 4
-// CHECK: @b4 = external addrspace(2) global <3 x i64>, align 32
-// CHECK: @b5 = external addrspace(2) global <4 x i32>, align 16
-// CHECK: @b6 = external addrspace(2) global <3 x i16>, align 8
-// CHECK: @b7 = external addrspace(2) global <3 x i64>, align 32
+// CHECK: @b1 = external hidden addrspace(2) global <3 x float>, align 16
+// CHECK: @b2 = external hidden addrspace(2) global <3 x double>, align 32
+// CHECK: @b3 = external hidden addrspace(2) global <2 x half>, align 4
+// CHECK: @b4 = external hidden addrspace(2) global <3 x i64>, align 32
+// CHECK: @b5 = external hidden addrspace(2) global <4 x i32>, align 16
+// CHECK: @b6 = external hidden addrspace(2) global <3 x i16>, align 8
+// CHECK: @b7 = external hidden addrspace(2) global <3 x i64>, align 32
 // CHECK: @CBVectors.str = private unnamed_addr constant [10 x i8] c"CBVectors\00", align 1
 
 cbuffer CBArrays : register(b2) {
@@ -91,14 +91,14 @@ cbuffer CBArrays : register(b2) {
 
 // CHECK: @CBArrays.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBArrays,
 // CHECK-SAME: 708, 0, 48, 112, 176, 224, 608, 624, 656))
-// CHECK: @c1 = external addrspace(2) global [3 x float], align 4
-// CHECK: @c2 = external addrspace(2) global [2 x <3 x double>], align 32
-// CHECK: @c3 = external addrspace(2) global [2 x [2 x half]], align 2
-// CHECK: @c4 = external addrspace(2) global [3 x i64], align 8
-// CHECK: @c5 = external addrspace(2) global [2 x [3 x [4 x <4 x i32>]]], align 16
-// CHECK: @c6 = external addrspace(2) global [1 x i16], align 2
-// CHECK: @c7 = external addrspace(2) global [2 x i64], align 8
-// CHECK: @c8 = external addrspace(2) global [4 x i32], align 4
+// CHECK: @c1 = external hidden addrspace(2) global [3 x float], align 4
+// CHECK: @c2 = external hidden addrspace(2) global [2 x <3 x double>], align 32
+// CHECK: @c3 = external hidden addrspace(2) global [2 x [2 x half]], align 2
+// CHECK: @c4 = external hidden addrspace(2) global [3 x i64], align 8
+// CHECK: @c5 = external hidden addrspace(2) global [2 x [3 x [4 x <4 x i32>]]], align 16
+// CHECK: @c6 = external hidden addrspace(2) global [1 x i16], align 2
+// CHECK: @c7 = external hidden addrspace(2) global [2 x i64], align 8
+// CHECK: @c8 = external hidden addrspace(2) global [4 x i32], align 4
 // CHECK: @CBArrays.str = private unnamed_addr constant [9 x i8] c"CBArrays\00", align 1
 
 typedef uint32_t4 uint32_t8[2];
@@ -112,8 +112,8 @@ cbuffer CBTypedefArray : register(space2) {
 
 // CHECK: @CBTypedefArray.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBTypedefArray,
 // CHECK-SAME: 128, 0, 64))
-// CHECK: @t1 = external addrspace(2) global [2 x [2 x <4 x i32>]], align 16
-// CHECK: @t2 = external addrspace(2) global [2 x [2 x <4 x i32>]], align 16
+// CHECK: @t1 = external hidden addrspace(2) global [2 x [2 x <4 x i32>]], align 16
+// CHECK: @t2 = external hidden addrspace(2) global [2 x [2 x <4 x i32>]], align 16
 // CHECK: @CBTypedefArray.str = private unnamed_addr constant [15 x i8] c"CBTypedefArray\00", align 1
 struct Empty {};
 
@@ -137,13 +137,13 @@ struct D {
 
 // CHECK: @CBStructs.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBStructs,
 // CHECK-SAME: 246, 0, 16, 32, 64, 144, 238, 240))
-// CHECK: @a = external addrspace(2) global target("dx.Layout", %A, 8, 0), align 1
-// CHECK: @b = external addrspace(2) global target("dx.Layout", %B, 14, 0, 8), align 1
-// CHECK: @c = external addrspace(2) global target("dx.Layout", %C, 24, 0, 16), align 1
-// CHECK: @array_of_A = external addrspace(2) global [5 x target("dx.Layout", %A, 8, 0)], align 1
-// CHECK: @d = external addrspace(2) global target("dx.Layout", %__cblayout_D, 94, 0), align 1
-// CHECK: @e = external addrspace(2) global half, align 2
-// CHECK: @f = external addrspace(2) global <3 x i16>, align 8
+// CHECK: @a = external hidden addrspace(2) global target("dx.Layout", %A, 8, 0), align 1
+// CHECK: @b = external hidden addrspace(2) global target("dx.Layout", %B, 14, 0, 8), align 1
+// CHECK: @c = external hidden addrspace(2) global target("dx.Layout", %C, 24, 0, 16), align 1
+// CHECK: @array_of_A = external hidden addrspace(2) global [5 x target("dx.Layout", %A, 8, 0)], align 1
+// CHECK: @d = external hidden addrspace(2) global target("dx.Layout", %__cblayout_D, 94, 0), align 1
+// CHECK: @e = external hidden addrspace(2) global half, align 2
+// CHECK: @f = external hidden addrspace(2) global <3 x i16>, align 8
 // CHECK: @CBStructs.str = private unnamed_addr constant [10 x i8] c"CBStructs\00", align 1
 
 cbuffer CBStructs {
@@ -178,10 +178,10 @@ cbuffer CBClasses {
 
 // CHECK: @CBClasses.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBClasses,
 // CHECK-SAME: 260, 0, 16, 32, 112))
-// CHECK: @k = external addrspace(2) global target("dx.Layout", %K, 4, 0), align 1
-// CHECK: @l = external addrspace(2) global target("dx.Layout", %L, 8, 0, 4), align 1
-// CHECK: @m = external addrspace(2) global target("dx.Layout", %M, 68, 0), align 1
-// CHECK: @ka = external addrspace(2) global [10 x target("dx.Layout", %K, 4, 0)], align 1
+// CHECK: @k = external hidden addrspace(2) global target("dx.Layout", %K, 4, 0), align 1
+// CHECK: @l = external hidden addrspace(2) global target("dx.Layout", %L, 8, 0, 4), align 1
+// CHECK: @m = external hidden addrspace(2) global target("dx.Layout", %M, 68, 0), align 1
+// CHECK: @ka = external hidden addrspace(2) global [10 x target("dx.Layout", %K, 4, 0)], align 1
 // CHECK: @CBClasses.str = private unnamed_addr constant [10 x i8] c"CBClasses\00", align 1
 
 struct Test {
@@ -190,16 +190,16 @@ struct Test {
 
 // CHECK: @CBMix.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBMix,
 // CHECK-SAME: 170, 0, 24, 32, 120, 128, 136, 144, 152, 160, 168))
-// CHECK: @test = external addrspace(2) global [2 x target("dx.Layout", %Test, 8, 0, 4)], align 1
-// CHECK: @f1 = external addrspace(2) global float, align 4
-// CHECK: @f2 = external addrspace(2) global [3 x [2 x <2 x float>]], align 8
-// CHECK: @f3 = external addrspace(2) global float, align 4
-// CHECK: @f4 = external addrspace(2) global target("dx.Layout", %anon, 4, 0), align 1
-// CHECK: @f5 = external addrspace(2) global double, align 8
-// CHECK: @f6 = external addrspace(2) global target("dx.Layout", %anon.0, 8, 0), align 1
-// CHECK: @f7 = external addrspace(2) global float, align 4
-// CHECK: @f8 = external addrspace(2) global <1 x double>, align 8
-// CHECK: @f9 = external addrspace(2) global i16, align 2
+// CHECK: @test = external hidden addrspace(2) global [2 x target("dx.Layout", %Test, 8, 0, 4)], align 1
+// CHECK: @f1 = external hidden addrspace(2) global float, align 4
+// CHECK: @f2 = external hidden addrspace(2) global [3 x [2 x <2 x float>]], align 8
+// CHECK: @f3 = external hidden addrspace(2) global float, align 4
+// CHECK: @f4 = external hidden addrspace(2) global target("dx.Layout", %anon, 4, 0), align 1
+// CHECK: @f5 = external hidden addrspace(2) global double, align 8
+// CHECK: @f6 = external hidden addrspace(2) global target("dx.Layout", %anon.0, 8, 0), align 1
+// CHECK: @f7 = external hidden addrspace(2) global float, align 4
+// CHECK: @f8 = external hidden addrspace(2) global <1 x double>, align 8
+// CHECK: @f9 = external hidden addrspace(2) global i16, align 2
 // CHECK: @CBMix.str = private unnamed_addr constant [6 x i8] c"CBMix\00", align 1
 
 cbuffer CBMix {
diff --git a/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl
index 4a7e2597dc0f..33f480bf445e 100644
--- a/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl
+++ b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl
@@ -8,14 +8,14 @@
 // CHECK: %"n0::Foo" = type <{ float }>
 
 // CHECK: @A.cb = global target("dx.CBuffer", target("dx.Layout", %"n0::n1::__cblayout_A", 4, 0))
-// CHECK: @_ZN2n02n11aE = external addrspace(2) global float, align 4
+// CHECK: @_ZN2n02n11aE = external hidden addrspace(2) global float, align 4
 
 // CHECK: @B.cb = global target("dx.CBuffer", target("dx.Layout", %"n0::__cblayout_B", 4, 0))
-// CHECK: @_ZN2n01aE = external addrspace(2) global float, align 4
+// CHECK: @_ZN2n01aE = external hidden addrspace(2) global float, align 4
 
 // CHECK: @C.cb = global target("dx.CBuffer", target("dx.Layout", %"n0::n2::__cblayout_C", 20, 0, 16))
-// CHECK: @_ZN2n02n21aE = external addrspace(2) global float, align 4
-// CHECK: external addrspace(2) global target("dx.Layout", %"n0::Foo", 4, 0), align 1
+// CHECK: @_ZN2n02n21aE = external hidden addrspace(2) global float, align 4
+// CHECK: external hidden addrspace(2) global target("dx.Layout", %"n0::Foo", 4, 0), align 1
 
 namespace n0 {
   struct Foo {
diff --git a/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl b/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl
index 0d092f0c36c2..16d22a5b1fdd 100644
--- a/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl
+++ b/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl
@@ -6,9 +6,9 @@
 // CHECK: %__cblayout_CB_1 = type <{ float, <2 x float> }>
 
 // CHECK: @CB.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 176, 16, 168, 88))
-// CHECK: @a = external addrspace(2) global float, align 4
-// CHECK: @b = external addrspace(2) global double, align 8
-// CHECK: @c = external addrspace(2) global <2 x i32>, align 8
+// CHECK: @a = external hidden addrspace(2) global float, align 4
+// CHECK: @b = external hidden addrspace(2) global double, align 8
+// CHECK: @c = external hidden addrspace(2) global <2 x i32>, align 8
 // CHECK: @CB.str = private unnamed_addr constant [3 x i8] c"CB\00", align 1
 
 cbuffer CB : register(b1, space3) {
@@ -18,8 +18,8 @@ cbuffer CB : register(b1, space3) {
 }
 
 // CHECK: @CB.cb.1 = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB_1, 92, 88, 80))
-// CHECK: @x = external addrspace(2) global float, align 4
-// CHECK: @y = external addrspace(2) global <2 x float>, align 8
+// CHECK: @x = external hidden addrspace(2) global float, align 4
+// CHECK: @y = external hidden addrspace(2) global <2 x float>, align 8
 
 // Missing packoffset annotation will produce a warning.
 // Element x will be placed after the element y that has an explicit packoffset.
diff --git a/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl
index a6034386ac45..cda231d8d2eb 100644
--- a/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl
+++ b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl
@@ -3,7 +3,7 @@
 // CHECK: %__cblayout_A = type <{ float }>
 
 // CHECK: @A.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_A, 4, 0))
-// CHECK: @a = external addrspace(2) global float, align 4
+// CHECK: @a = external hidden addrspace(2) global float, align 4
 // CHECK-DAG: @_ZL1b = internal global float 3.000000e+00, align 4
 // CHECK-NOT: @B.cb
 
diff --git a/clang/test/CodeGenHLSL/convergence/do.while.hlsl b/clang/test/CodeGenHLSL/convergence/do.while.hlsl
index 934fe3ea9eb7..9aabbfd54e53 100644
--- a/clang/test/CodeGenHLSL/convergence/do.while.hlsl
+++ b/clang/test/CodeGenHLSL/convergence/do.while.hlsl
@@ -8,7 +8,7 @@ void test1() {
   do {
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test1v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test1v()
 // CHECK-SAME: [[A0:#[0-9]+]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -22,7 +22,7 @@ void test2() {
     foo();
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test2v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test2v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -38,7 +38,7 @@ void test3() {
       foo();
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test3v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test3v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -57,7 +57,7 @@ void test4() {
     }
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test4v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test4v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -78,7 +78,7 @@ void test5() {
     }
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test5v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test5v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/convergence/for.hlsl b/clang/test/CodeGenHLSL/convergence/for.hlsl
index 363c6a48839b..b7b11e9959ea 100644
--- a/clang/test/CodeGenHLSL/convergence/for.hlsl
+++ b/clang/test/CodeGenHLSL/convergence/for.hlsl
@@ -10,7 +10,7 @@ void test1() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test1v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test1v()
 // CHECK-SAME: [[A0:#[0-9]+]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -23,7 +23,7 @@ void test2() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test2v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test2v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -38,7 +38,7 @@ void test3() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test3v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test3v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -52,7 +52,7 @@ void test4() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test4v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test4v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -67,7 +67,7 @@ void test5() {
   for (cond();cond2();foo()) {
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test5v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test5v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -86,7 +86,7 @@ void test6() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test6v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test6v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -112,7 +112,7 @@ void test7() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test7v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test7v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/convergence/while.hlsl b/clang/test/CodeGenHLSL/convergence/while.hlsl
index 570b4b133671..32579e863100 100644
--- a/clang/test/CodeGenHLSL/convergence/while.hlsl
+++ b/clang/test/CodeGenHLSL/convergence/while.hlsl
@@ -8,7 +8,7 @@ void test1() {
   while (cond()) {
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test1v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test1v()
 // CHECK-SAME: [[A0:#[0-9]+]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -21,7 +21,7 @@ void test2() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test2v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test2v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -38,7 +38,7 @@ void test3() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test3v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test3v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -59,7 +59,7 @@ void test4() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test4v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test4v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -82,7 +82,7 @@ void test5() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test5v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test5v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -107,7 +107,7 @@ void test6() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test6v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test6v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/default_cbuffer.hlsl b/clang/test/CodeGenHLSL/default_cbuffer.hlsl
index 557913042e88..ad4d92f8afc0 100644
--- a/clang/test/CodeGenHLSL/default_cbuffer.hlsl
+++ b/clang/test/CodeGenHLSL/default_cbuffer.hlsl
@@ -6,14 +6,14 @@
 // CHECK: %__cblayout_S = type <{ float }>
 
 // DXIL-DAG: @"$Globals.cb" = global target("dx.CBuffer", target("dx.Layout", %"__cblayout_$Globals", 20, 0, 4, 16))
-// DXIL-DAG: @a = external addrspace(2) global float
-// DXIL-DAG: @g = external addrspace(2) global float
-// DXIL-DAG: @h = external addrspace(2) global target("dx.Layout", %__cblayout_S, 4, 0), align 4
+// DXIL-DAG: @a = external hidden addrspace(2) global float
+// DXIL-DAG: @g = external hidden addrspace(2) global float
+// DXIL-DAG: @h = external hidden addrspace(2) global target("dx.Layout", %__cblayout_S, 4, 0), align 4
 
 // SPIRV-DAG: @"$Globals.cb" = global target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 20, 0, 4, 16), 2, 0)
-// SPIRV-DAG: @a = external addrspace(12) global float
-// SPIRV-DAG: @g = external addrspace(12) global float
-// SPIRV-DAG: @h = external addrspace(12) global target("spirv.Layout", %__cblayout_S, 4, 0), align 8
+// SPIRV-DAG: @a = external hidden addrspace(12) global float
+// SPIRV-DAG: @g = external hidden addrspace(12) global float
+// SPIRV-DAG: @h = external hidden addrspace(12) global target("spirv.Layout", %__cblayout_S, 4, 0), align 8
 
 struct EmptyStruct {
 };
diff --git a/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl b/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl
index 40e3196649a5..1b2cb0e99aa8 100644
--- a/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl
+++ b/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl
@@ -4,14 +4,14 @@
 // CHECK-SAME: target("dx.Layout", %S, 8, 0) }>
 // CHECK: %S = type <{ <2 x float> }>
 
-// CHECK-DAG: @b = external addrspace(2) global float, align 4
-// CHECK-DAG: @d = external addrspace(2) global <4 x i32>, align 16
+// CHECK-DAG: @b = external hidden addrspace(2) global float, align 4
+// CHECK-DAG: @d = external hidden addrspace(2) global <4 x i32>, align 16
 // CHECK-DAG: @"$Globals.cb" = global target("dx.CBuffer",
 // CHECK-DAG-SAME: target("dx.Layout", %"__cblayout_$Globals", 144, 120, 16, 32, 64, 128, 112))
-// CHECK-DAG: @a = external addrspace(2) global i32, align 4
-// CHECK-DAG: @c = external addrspace(2) global [4 x double], align 8
-// CHECK-DAG: @e = external addrspace(2) global <4 x float>, align 16
-// CHECK-DAG: @s = external addrspace(2) global target("dx.Layout", %S, 8, 0), align 1
+// CHECK-DAG: @a = external hidden addrspace(2) global i32, align 4
+// CHECK-DAG: @c = external hidden addrspace(2) global [4 x double], align 8
+// CHECK-DAG: @e = external hidden addrspace(2) global <4 x float>, align 16
+// CHECK-DAG: @s = external hidden addrspace(2) global target("dx.Layout", %S, 8, 0), align 1
 
 struct S {
   float2 v;
diff --git a/clang/test/CodeGenHLSL/export.hlsl b/clang/test/CodeGenHLSL/export.hlsl
index 770618ff2e07..e72dbde5188a 100644
--- a/clang/test/CodeGenHLSL/export.hlsl
+++ b/clang/test/CodeGenHLSL/export.hlsl
@@ -5,17 +5,15 @@
 export void f1() {
 }
 
-// CHECK: define void @_ZN11MyNamespace2f2Ev() [[Attr]]
+// CHECK: define void @_ZN11MyNamespace2f2Ev()
 namespace MyNamespace {
   export void f2() {
   }
 }
 
 export {
-// CHECK: define void @_Z2f3v() [[Attr]]
-// CHECK: define void @_Z2f4v() [[Attr]]
+// CHECK: define void @_Z2f3v()
+// CHECK: define void @_Z2f4v()
     void f3() {}
     void f4() {}
-}
-
-// CHECK: attributes [[Attr]] = { {{.*}} "hlsl.export" {{.*}} }
+}
\ No newline at end of file
diff --git a/clang/test/CodeGenHLSL/group_shared.hlsl b/clang/test/CodeGenHLSL/group_shared.hlsl
index a562e75b3488..6498c53752d4 100644
--- a/clang/test/CodeGenHLSL/group_shared.hlsl
+++ b/clang/test/CodeGenHLSL/group_shared.hlsl
@@ -8,7 +8,7 @@
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 // Make sure groupshared translated into address space 3.
-// CHECK:@a = addrspace(3) global [10 x float]
+// CHECK:@a = hidden addrspace(3) global [10 x float]
 
  groupshared float a[10];
 
diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl
index 12d3eeedb590..60238cbf8eff 100644
--- a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl
+++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl
@@ -12,7 +12,7 @@ struct Node {
 };
 
 // CHECK: Function Attrs:{{.*}}norecurse
-// CHECK: define noundef i32 @_Z4FindA100_4Nodej(ptr noundef byval([100 x %struct.Node]) align 1 %SortedTree, i32 noundef %key) [[IntAttr:\#[0-9]+]]
+// CHECK: define hidden noundef i32 @_Z4FindA100_4Nodej(ptr noundef byval([100 x %struct.Node]) align 1 %SortedTree, i32 noundef %key) [[Attr:\#[0-9]+]]
 // CHECK: ret i32
 // Find and return value corresponding to key in the SortedTree
 uint Find(Node SortedTree[MAX], uint key) {
@@ -31,7 +31,7 @@ uint Find(Node SortedTree[MAX], uint key) {
 }
 
 // CHECK: Function Attrs:{{.*}}norecurse
-// CHECK: define noundef i1 @_Z8InitTreeA100_4NodeN4hlsl8RWBufferIDv4_jEEj(ptr noundef byval([100 x %struct.Node]) align 1 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]]
+// CHECK: define noundef i1 @_Z8InitTreeA100_4NodeN4hlsl8RWBufferIDv4_jEEj(ptr noundef byval([100 x %struct.Node]) align 1 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[Attr:\#[0-9]+]]
 // CHECK: ret i1
 // Initialize tree with given buffer
 // Imagine the inout works
@@ -52,7 +52,7 @@ RWBuffer<uint4> gTree;
 
 // Mangled entry points are internal
 // CHECK: Function Attrs:{{.*}}norecurse
-// CHECK: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]]
+// CHECK: define internal void @_Z4mainj(i32 noundef %GI) [[Attr]]
 // CHECK: ret void
 
 // Canonical entry points are external and shader attributed
@@ -71,7 +71,7 @@ void main(uint GI : SV_GroupIndex) {
 
 // Mangled entry points are internal
 // CHECK: Function Attrs:{{.*}}norecurse
-// CHECK: define internal void @_Z11defaultMainv() [[IntAttr]]
+// CHECK: define internal void @_Z11defaultMainv() [[Attr]]
 // CHECK: ret void
 
 // Canonical entry points are external and shader attributed
@@ -88,6 +88,5 @@ void defaultMain() {
     needle = Find(haystack, needle);
 }
 
-// CHECK: attributes [[IntAttr]] = {{.*}} norecurse
-// CHECK: attributes [[ExtAttr]] = {{.*}} norecurse
+// CHECK: attributes [[Attr]] = {{.*}} norecurse
 // CHECK: attributes [[EntryAttr]] = {{.*}} norecurse
diff --git a/clang/test/CodeGenHLSL/inline-functions.hlsl b/clang/test/CodeGenHLSL/inline-functions.hlsl
index 4748eeee7475..0c7467e2f972 100644
--- a/clang/test/CodeGenHLSL/inline-functions.hlsl
+++ b/clang/test/CodeGenHLSL/inline-functions.hlsl
@@ -15,7 +15,7 @@ float nums[MAX];
 
 // Verify that all functions have the alwaysinline attribute
 // NOINLINE: Function Attrs: alwaysinline
-// NOINLINE: define void @_Z4swapA100_jjj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[IntAttr:\#[0-9]+]]
+// NOINLINE: define hidden void @_Z4swapA100_jjj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[Attr:\#[0-9]+]]
 // NOINLINE: ret void
 // Swap the values of Buf at indices ix1 and ix2
 void swap(unsigned Buf[MAX], unsigned ix1, unsigned ix2) {
@@ -25,7 +25,7 @@ void swap(unsigned Buf[MAX], unsigned ix1, unsigned ix2) {
 }
 
 // NOINLINE: Function Attrs: alwaysinline
-// NOINLINE: define void @_Z10BubbleSortA100_jj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[IntAttr]]
+// NOINLINE: define hidden void @_Z10BubbleSortA100_jj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[Attr]]
 // NOINLINE: ret void
 // Inefficiently sort Buf in place
 void BubbleSort(unsigned Buf[MAX], unsigned size) {
@@ -43,7 +43,7 @@ void BubbleSort(unsigned Buf[MAX], unsigned size) {
 
 // Note ExtAttr is the inlined export set of attribs
 // CHECK: Function Attrs: alwaysinline
-// CHECK: define noundef i32 @_Z11RemoveDupesA100_jj(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 {{.*}}%Buf, i32 noundef %size) {{[a-z_ ]*}}[[ExtAttr:\#[0-9]+]]
+// CHECK: define noundef i32 @_Z11RemoveDupesA100_jj(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 {{.*}}%Buf, i32 noundef %size) {{[a-z_ ]*}}[[Attr:\#[0-9]+]]
 // CHECK: ret i32
 // Sort Buf and remove any duplicate values
 // returns the number of values left
@@ -65,9 +65,9 @@ RWBuffer<unsigned> Indices;
 
 // The mangled version of main only remains without inlining
 // because it has internal linkage from the start
-// Note main functions get the norecurse attrib, which IntAttr reflects
+// Note main functions get the alwaysinline attrib, which Attr reflects
 // NOINLINE: Function Attrs: alwaysinline
-// NOINLINE: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]]
+// NOINLINE: define internal void @_Z4mainj(i32 noundef %GI) [[Attr]]
 // NOINLINE: ret void
 
 // The unmangled version is not inlined, EntryAttr reflects that
@@ -93,9 +93,9 @@ void main(unsigned int GI : SV_GroupIndex) {
 
 // The mangled version of main only remains without inlining
 // because it has internal linkage from the start
-// Note main functions get the norecurse attrib, which IntAttr reflects
+// Note main functions get the alwaysinline attrib, which Attr reflects
 // NOINLINE: Function Attrs: alwaysinline
-// NOINLINE: define internal void @_Z6main10v() [[IntAttr]]
+// NOINLINE: define internal void @_Z6main10v() [[Attr]]
 // NOINLINE: ret void
 
 // The unmangled version is not inlined, EntryAttr reflects that
@@ -113,6 +113,5 @@ void main10() {
   main(10);
 }
 
-// NOINLINE: attributes [[IntAttr]] = {{.*}} alwaysinline
-// CHECK: attributes [[ExtAttr]] = {{.*}} alwaysinline
+// CHECK: attributes [[Attr]] = {{.*}} alwaysinline
 // CHECK: attributes [[EntryAttr]] = {{.*}} noinline
diff --git a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl b/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
index 5b58e436bbed..7149be0122f4 100644
--- a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
+++ b/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
@@ -18,12 +18,12 @@ struct S {
     Int i;
 };
 
-// CHECK: define spir_func target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) @_Z14getArrayBufferu17spirv_type_28_0_0U5_TypeN4hlsl8RWBufferIfEEU6_ConstLm4E(target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) %v) #0
+// CHECK: define hidden spir_func target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) @_Z14getArrayBufferu17spirv_type_28_0_0U5_TypeN4hlsl8RWBufferIfEEU6_ConstLm4E(target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) %v) #0
 ArrayBuffer<4> getArrayBuffer(ArrayBuffer<4> v) {
     return v;
 }
 
-// CHECK: define spir_func target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) @_Z6getIntu18spirv_type_21_4_32U4_LitLi32EU4_LitLi0E(target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) %v) #0
+// CHECK: define hidden spir_func target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) @_Z6getIntu18spirv_type_21_4_32U4_LitLi32EU4_LitLi0E(target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) %v) #0
 Int getInt(Int v) {
     return v;
 }
diff --git a/clang/test/CodeGenHLSL/no_int_promotion.hlsl b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
index 78bff3b13810..b4ffcb477f1b 100644
--- a/clang/test/CodeGenHLSL/no_int_promotion.hlsl
+++ b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
@@ -10,37 +10,37 @@
 int16_t add(int16_t a, int16_t b) {
   return a + b;
 }
-// CHECK: define noundef <2 x i16> @
+// CHECK: define hidden noundef <2 x i16> @
 // CHECK: add <2 x i16>
 int16_t2 add(int16_t2 a, int16_t2 b) {
   return a + b;
 }
-// CHECK: define noundef <3 x i16> @
+// CHECK: define hidden noundef <3 x i16> @
 // CHECK: add <3 x i16>
 int16_t3 add(int16_t3 a, int16_t3 b) {
   return a + b;
 }
-// CHECK: define noundef <4 x i16> @
+// CHECK: define hidden noundef <4 x i16> @
 // CHECK: add <4 x i16>
 int16_t4 add(int16_t4 a, int16_t4 b) {
   return a + b;
 }
-// CHECK: define noundef i16 @
+// CHECK: define hidden noundef i16 @
 // CHECK: add i16 %
 uint16_t add(uint16_t a, uint16_t b) {
   return a + b;
 }
-// CHECK: define noundef <2 x i16> @
+// CHECK: define hidden noundef <2 x i16> @
 // CHECK: add <2 x i16>
 uint16_t2 add(uint16_t2 a, uint16_t2 b) {
   return a + b;
 }
-// CHECK: define noundef <3 x i16> @
+// CHECK: define hidden noundef <3 x i16> @
 // CHECK: add <3 x i16>
 uint16_t3 add(uint16_t3 a, uint16_t3 b) {
   return a + b;
 }
-// CHECK: define noundef <4 x i16> @
+// CHECK: define hidden noundef <4 x i16> @
 // CHECK: add <4 x i16>
 uint16_t4 add(uint16_t4 a, uint16_t4 b) {
   return a + b;
diff --git a/clang/test/CodeGenHLSL/out-of-line-static.hlsl b/clang/test/CodeGenHLSL/out-of-line-static.hlsl
index 8127a6c2ec1e..57f6c123e50e 100644
--- a/clang/test/CodeGenHLSL/out-of-line-static.hlsl
+++ b/clang/test/CodeGenHLSL/out-of-line-static.hlsl
@@ -6,8 +6,8 @@ struct S {
 };
 
 int S::Value = 1;
-// DXIL: @_ZN1S5ValueE = global i32 1, align 4
-// SPIRV: @_ZN1S5ValueE = addrspace(10) global i32 1, align 4
+// DXIL: @_ZN1S5ValueE = hidden global i32 1, align 4
+// SPIRV: @_ZN1S5ValueE = hidden addrspace(10) global i32 1, align 4
 
 [shader("compute")]
 [numthreads(1,1,1)]
diff --git a/clang/test/CodeGenHLSL/shift-mask.hlsl b/clang/test/CodeGenHLSL/shift-mask.hlsl
index 7b3890ae560d..41e05330ed1a 100644
--- a/clang/test/CodeGenHLSL/shift-mask.hlsl
+++ b/clang/test/CodeGenHLSL/shift-mask.hlsl
@@ -5,7 +5,7 @@ int shl32(int V, int S) {
   return V << S;
 }
 
-// CHECK-LABEL: define noundef i32 @_Z5shl32ii(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i32 @_Z5shl32ii(i32 noundef %V, i32 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i32 %{{.*}}, 31
 // CHECK-DAG:  %{{.*}} = shl i32 %{{.*}}, %[[Masked]]
 
@@ -13,7 +13,7 @@ int shr32(int V, int S) {
   return V >> S;
 }
 
-// CHECK-LABEL: define noundef i32 @_Z5shr32ii(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i32 @_Z5shr32ii(i32 noundef %V, i32 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i32 %{{.*}}, 31
 // CHECK-DAG:  %{{.*}} = ashr i32 %{{.*}}, %[[Masked]]
 
@@ -21,7 +21,7 @@ int64_t shl64(int64_t V, int64_t S) {
   return V << S;
 }
 
-// CHECK-LABEL: define noundef i64 @_Z5shl64ll(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i64 @_Z5shl64ll(i64 noundef %V, i64 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i64 %{{.*}}, 63
 // CHECK-DAG:  %{{.*}} = shl i64 %{{.*}}, %[[Masked]]
 
@@ -29,7 +29,7 @@ int64_t shr64(int64_t V, int64_t S) {
   return V >> S;
 }
 
-// CHECK-LABEL: define noundef i64 @_Z5shr64ll(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i64 @_Z5shr64ll(i64 noundef %V, i64 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i64 %{{.*}}, 63
 // CHECK-DAG:  %{{.*}} = ashr i64 %{{.*}}, %[[Masked]]
 
@@ -37,7 +37,7 @@ uint shlu32(uint V, uint S) {
   return V << S;
 }
 
-// CHECK-LABEL: define noundef i32 @_Z6shlu32jj(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i32 @_Z6shlu32jj(i32 noundef %V, i32 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i32 %{{.*}}, 31
 // CHECK-DAG:  %{{.*}} = shl i32 %{{.*}}, %[[Masked]]
 
@@ -45,7 +45,7 @@ uint shru32(uint V, uint S) {
   return V >> S;
 }
 
-// CHECK-LABEL: define noundef i32 @_Z6shru32jj(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i32 @_Z6shru32jj(i32 noundef %V, i32 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i32 %{{.*}}, 31
 // CHECK-DAG:  %{{.*}} = lshr i32 %{{.*}}, %[[Masked]]
 
@@ -53,7 +53,7 @@ uint64_t shlu64(uint64_t V, uint64_t S) {
   return V << S;
 }
 
-// CHECK-LABEL: define noundef i64 @_Z6shlu64mm(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i64 @_Z6shlu64mm(i64 noundef %V, i64 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i64 %{{.*}}, 63
 // CHECK-DAG:  %{{.*}} = shl i64 %{{.*}}, %[[Masked]]
 
@@ -61,6 +61,6 @@ uint64_t shru64(uint64_t V, uint64_t S) {
   return V >> S;
 }
 
-// CHECK-LABEL: define noundef i64 @_Z6shru64mm(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i64 @_Z6shru64mm(i64 noundef %V, i64 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i64 %{{.*}}, 63
 // CHECK-DAG:  %{{.*}} = lshr i64 %{{.*}}, %[[Masked]]
diff --git a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl
index a87eb0b38f60..a2df30703877 100644
--- a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl
+++ b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl
@@ -25,7 +25,7 @@ void main() {
 }
 
 // This test makes a probably safe assumption that HLSL 202x includes operator overloading for assignment operators.
-// CHECK:     define linkonce_odr noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 {
+// CHECK:     define linkonce_odr hidden noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 {
 // CHECK-NEXT:entry:
 // CHECK-NEXT:%this.addr = alloca ptr, align 4
 // CHECK-NEXT:%Another = alloca %struct.Pair, align 1
@@ -42,7 +42,7 @@ void main() {
 // CHECK-NEXT:%0 = load i32, ptr %First2, align 1
 // CHECK-NEXT:ret i32 %0
 
-// CHECK:     define linkonce_odr noundef i32 @_ZN4Pair9getSecondEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 {
+// CHECK:     define linkonce_odr hidden noundef i32 @_ZN4Pair9getSecondEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 {
 // CHECK-NEXT:entry:
 // CHECK-NEXT:%this.addr = alloca ptr, align 4
 // CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 1
diff --git a/clang/test/CodeGenHLSL/vk-input-builtin.hlsl b/clang/test/CodeGenHLSL/vk-input-builtin.hlsl
index 1cc7963c0e28..157a1818c82f 100644
--- a/clang/test/CodeGenHLSL/vk-input-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/vk-input-builtin.hlsl
@@ -3,7 +3,7 @@
 
 [[vk::ext_builtin_input(/* WorkgroupId */ 26)]]
 static const uint3 groupid;
-// CHECK: @_ZL7groupid = external local_unnamed_addr addrspace(7) externally_initialized constant <3 x i32>, align 16, !spirv.Decorations [[META0:![0-9]+]]
+// CHECK: @_ZL7groupid = external hidden local_unnamed_addr addrspace(7) externally_initialized constant <3 x i32>, align 16, !spirv.Decorations [[META0:![0-9]+]]
 
 RWStructuredBuffer<int> output : register(u1, space0);
 
diff --git a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
index aad8836db106..f37d00503fe5 100644
--- a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
+++ b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
@@ -4,7 +4,7 @@
 // SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
 
 // valid: "spirv-unknown-vulkan-library"
-// valid: define spir_func void @{{.*main.*}}() #0 {
+// valid: define hidden spir_func void @{{.*main.*}}() #0 {
 
 [numthreads(1,1,1)]
 void main()
diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
index 035899205bf8..94b2dbe78c4f 100644
--- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
+++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
@@ -24,7 +24,9 @@ static bool finalizeLinkage(Module &M) {
   for (Function &EF : M.functions()) {
     if (EF.isIntrinsic())
       continue;
-    if (EF.hasFnAttribute("hlsl.shader") || EF.hasFnAttribute("hlsl.export"))
+    if (EF.hasExternalLinkage() && EF.hasDefaultVisibility())
+      continue;
+    if (EF.hasFnAttribute("hlsl.shader"))
       continue;
     Funcs.push_back(&EF);
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 36cc5cbe655b..a412887e51ad 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -480,7 +480,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                    .addUse(FuncVReg);
     addStringImm(F.getName(), MIB);
   } else if (F.getLinkage() != GlobalValue::InternalLinkage &&
-             F.getLinkage() != GlobalValue::PrivateLinkage) {
+             F.getLinkage() != GlobalValue::PrivateLinkage &&
+             F.getVisibility() != GlobalValue::HiddenVisibility) {
     SPIRV::LinkageType::LinkageType LnkTy =
         F.isDeclaration()
             ? SPIRV::LinkageType::Import
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 911a6966aaef..851e0c6b81fc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3898,7 +3898,8 @@ bool SPIRVInstructionSelector::selectGlobalValue(
   if (hasInitializer(GlobalVar) && !Init)
     return true;
 
-  bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage();
+  bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage() &&
+                  !GV->hasHiddenVisibility();
   SPIRV::LinkageType::LinkageType LnkType =
       GV->isDeclarationForLinker()
           ? SPIRV::LinkageType::Import
diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll
index 202609c8156a..78045ddcd85a 100644
--- a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll
+++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll
@@ -5,25 +5,25 @@ target triple = "dxilv1.5-pc-shadermodel6.5-compute"
 
 ; Confirm that DXILFinalizeLinkage will remove functions that have compatible
 ; linkage and are not called from anywhere. This should be any function that
-; is not explicitly marked export and is not an entry point.
+; is marked hidden or internal.
 
-; Has no specified inlining/linking behavior and is uncalled, this should be removed.
+; Is hidden, and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doNothingUncalled
-define void @"?doNothingUncalled@@YAXXZ"() #2 {
+define hidden void @"?doNothingUncalled@@YAXXZ"() #2 {
 entry:
   ret void
 }
 
-; Alwaysinline and uncalled, this should be removed.
+; Alwaysinline, hidden and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled
-define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 {
+define hidden void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline and uncalled, this should be removed.
+; Noinline, hidden and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doNoinlineUncalled
-define void @"?doNoinlineUncalled@@YAXXZ"() #4 {
+define hidden void @"?doNoinlineUncalled@@YAXXZ"() #4 {
 entry:
   ret void
 }
@@ -49,44 +49,44 @@ entry:
   ret void
 }
 
-; Marked external and uncalled, this should become internal and be removed.
+; Marked external, hidden and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doExternalUncalled
-define external void @"?doExternalUncalled@@YAXXZ"() #2 {
+define external hidden void @"?doExternalUncalled@@YAXXZ"() #2 {
 entry:
   ret void
 }
 
-; Alwaysinline, external and uncalled, this should become internal and be removed.
+; Alwaysinline, external, hidden and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled
-define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 {
+define external hidden void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline, external and uncalled, this should become internal and be removed.
+; Noinline, external, hidden and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled
-define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #4 {
+define external hidden void @"?doNoinlineExternalUncalled@@YAXXZ"() #4 {
 entry:
   ret void
 }
 
-; No inlining attribute and called, this should stay.
+; No inlining attribute, hidden and called, this should stay.
 ; CHECK: define {{.*}}doNothingCalled
-define void @"?doNothingCalled@@YAXXZ"() #2 {
+define hidden void @"?doNothingCalled@@YAXXZ"() #2 {
 entry:
   ret void
 }
 
-; Alwaysinline and called, this should stay.
+; Alwaysinline, hidden and called, this should stay.
 ; CHECK: define {{.*}}doAlwaysInlineCalled
-define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 {
+define hidden void @"?doAlwaysInlineCalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline and called, this should stay.
+; Noinline, hidden and called, this should stay.
 ; CHECK: define {{.*}}doNoinlineCalled
-define void @"?doNoinlineCalled@@YAXXZ"() #4 {
+define hidden void @"?doNoinlineCalled@@YAXXZ"() #4 {
 entry:
   ret void
 }
@@ -112,23 +112,23 @@ entry:
   ret void
 }
 
-; Marked external and called, this should become internal and stay.
+; Marked external, hidden and called, this should become internal and stay.
 ; CHECK: define {{.*}}doExternalCalled
-define external void @"?doExternalCalled@@YAXXZ"() #2 {
+define external hidden void @"?doExternalCalled@@YAXXZ"() #2 {
 entry:
   ret void
 }
 
-; Always inlined, external and called, this should become internal and stay.
+; Always inlined, external, hidden and called, this should become internal and stay.
 ; CHECK: define {{.*}}doAlwaysInlineExternalCalled
-define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 {
+define external hidden void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline, external and called, this should become internal and stay.
+; Noinline, external, hidden and called, this should become internal and stay.
 ; CHECK: define {{.*}}doNoinlineExternalCalled
-define external void @"?doNoinlineExternalCalled@@YAXXZ"() #4 {
+define external hidden void @"?doNoinlineExternalCalled@@YAXXZ"() #4 {
 entry:
   ret void
 }
@@ -154,27 +154,6 @@ entry:
   ret void
 }
 
-; No inlining attribute, internal, and exported; this should stay.
-; CHECK: define {{.*}}doInternalExported
-define internal void @"?doInternalExported@@YAXXZ"() #3 {
-entry:
-  ret void
-}
-
-; Alwaysinline, internal, and exported; this should stay.
-; CHECK: define {{.*}}doAlwaysInlineInternalExported
-define internal void @"?doAlwaysInlineInternalExported@@YAXXZ"() #1 {
-entry:
-  ret void
-}
-
-; Noinline, internal, and exported; this should stay.
-; CHECK: define {{.*}}doNoinlineInternalExported
-define internal void @"?doNoinlineInternalExported@@YAXXZ"() #5 {
-entry:
-  ret void
-}
-
 ; Marked external and exported, this should stay.
 ; CHECK: define {{.*}}doExternalExported
 define external void @"?doExternalExported@@YAXXZ"() #3 {
@@ -213,10 +192,10 @@ entry:
 }
 
 attributes #0 = { alwaysinline convergent norecurse nounwind }
-attributes #1 = { alwaysinline convergent norecurse nounwind "hlsl.export"}
+attributes #1 = { alwaysinline convergent norecurse nounwind }
 attributes #2 = { convergent norecurse nounwind }
-attributes #3 = { convergent norecurse nounwind "hlsl.export"}
+attributes #3 = { convergent norecurse nounwind }
 attributes #4 = { convergent noinline norecurse nounwind }
-attributes #5 = { convergent noinline norecurse nounwind "hlsl.export"}
+attributes #5 = { convergent noinline norecurse nounwind }
 attributes #6 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 attributes #7 = { convergent }
diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll
index 49c3bda621d7..971451f981c9 100644
--- a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll
+++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll
@@ -7,23 +7,23 @@ target triple = "dxilv1.5-pc-shadermodel6.5-compute"
 ; linkage and are not called from anywhere. This should be any function that
 ; is not an entry point.
 
-; Has no specified inlining/linking behavior and is uncalled, this should be removed.
+; Is hidden and is uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doNothingUncalled
-define void @"?doNothingUncalled@@YAXXZ"() #1 {
+define hidden void @"?doNothingUncalled@@YAXXZ"() #1 {
 entry:
   ret void
 }
 
-; Alwaysinline and uncalled, this should be removed.
+; Alwaysinline, hidden and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled
-define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 {
+define hidden void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline and uncalled, this should be removed.
+; Noinline, hidden and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doNoinlineUncalled
-define void @"?doNoinlineUncalled@@YAXXZ"() #3 {
+define hidden void @"?doNoinlineUncalled@@YAXXZ"() #3 {
 entry:
   ret void
 }
@@ -49,44 +49,44 @@ entry:
   ret void
 }
 
-; Marked external and uncalled, this should become internal and be removed.
+; Marked external, hidden, and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doExternalUncalled
-define external void @"?doExternalUncalled@@YAXXZ"() #1 {
+define external hidden void @"?doExternalUncalled@@YAXXZ"() #1 {
 entry:
   ret void
 }
 
-; Alwaysinline, external and uncalled, this should become internal and be removed.
+; Alwaysinline, external, hidden, and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled
-define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 {
+define external hidden void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline, external and uncalled, this should become internal and be removed.
+; Noinline, external, hidden, and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled
-define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #3 {
+define external hidden void @"?doNoinlineExternalUncalled@@YAXXZ"() #3 {
 entry:
   ret void
 }
 
 ; No inlining attribute and called, this should stay.
 ; CHECK: define {{.*}}doNothingCalled
-define void @"?doNothingCalled@@YAXXZ"() #1 {
+define hidden void @"?doNothingCalled@@YAXXZ"() #1 {
 entry:
   ret void
 }
 
-; Alwaysinline and called, this should stay.
+; Alwaysinline, hidden, and called, this should stay.
 ; CHECK: define {{.*}}doAlwaysInlineCalled
-define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 {
+define hidden void @"?doAlwaysInlineCalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline and called, this should stay.
+; Noinline, hidden, and called, this should stay.
 ; CHECK: define {{.*}}doNoinlineCalled
-define void @"?doNoinlineCalled@@YAXXZ"() #3 {
+define hidden void @"?doNoinlineCalled@@YAXXZ"() #3 {
 entry:
   ret void
 }
@@ -112,23 +112,23 @@ entry:
   ret void
 }
 
-; Marked external and called, this should become internal and stay.
+; Marked external, hidden, and called, this should become internal and stay.
 ; CHECK: define {{.*}}doExternalCalled
-define external void @"?doExternalCalled@@YAXXZ"() #1 {
+define external hidden void @"?doExternalCalled@@YAXXZ"() #1 {
 entry:
   ret void
 }
 
-; Always inlined, external and called, this should become internal and stay.
+; Always inlined, external, hidden, and called, this should become internal and stay.
 ; CHECK: define {{.*}}doAlwaysInlineExternalCalled
-define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 {
+define external hidden void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline, external and called, this should become internal and stay.
+; Noinline, external, hidden, and called, this should become internal and stay.
 ; CHECK: define {{.*}}doNoinlineExternalCalled
-define external void @"?doNoinlineExternalCalled@@YAXXZ"() #3 {
+define external hidden void @"?doNoinlineExternalCalled@@YAXXZ"() #3 {
 entry:
   ret void
 }
diff --git a/llvm/test/CodeGen/DirectX/finalize_linkage.ll b/llvm/test/CodeGen/DirectX/finalize_linkage.ll
index c761a79a5c28..df691db5cff3 100644
--- a/llvm/test/CodeGen/DirectX/finalize_linkage.ll
+++ b/llvm/test/CodeGen/DirectX/finalize_linkage.ll
@@ -3,8 +3,8 @@
 
 target triple = "dxilv1.5-pc-shadermodel6.5-compute"
 
-; DXILFinalizeLinkage changes linkage of all functions that are not
-; entry points or exported function to internal.
+; DXILFinalizeLinkage changes linkage of all functions that are hidden to
+; internal.
 
 ; CHECK-NOT: define internal void @"?f1@@YAXXZ"()
 define void @"?f1@@YAXXZ"() #0 {
@@ -13,19 +13,19 @@ entry:
 }
 
 ; CHECK: define internal void @"?f2@@YAXXZ"()
-define void @"?f2@@YAXXZ"() #0 {
+define hidden void @"?f2@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
 ; CHECK: define internal void @"?f3@@YAXXZ"()
-define void @"?f3@@YAXXZ"() #0 {
+define hidden void @"?f3@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
 ; CHECK: define internal void @"?foo@@YAXXZ"()
-define void @"?foo@@YAXXZ"() #0 {
+define hidden void @"?foo@@YAXXZ"() #0 {
 entry:
   call void @"?f2@@YAXXZ"() #3
   ret void
@@ -33,7 +33,7 @@ entry:
 
 ; Exported function - do not change linkage
 ; CHECK: define void @"?bar@@YAXXZ"()
-define void @"?bar@@YAXXZ"() #1 {
+define void @"?bar@@YAXXZ"() #0 {
 entry:
   call void @"?f3@@YAXXZ"() #3
   ret void
@@ -42,23 +42,22 @@ entry:
 ; CHECK: define internal void @"?main@@YAXXZ"() #0
 define internal void @"?main@@YAXXZ"() #0 {
 entry:
-  call void @"?foo@@YAXXZ"() #3
-  call void @"?bar@@YAXXZ"() #3
+  call void @"?foo@@YAXXZ"() #2
+  call void @"?bar@@YAXXZ"() #2
   ret void
 }
 
 ; Entry point function - do not change linkage
-; CHECK: define void @main() #2
-define void @main() #2 {
+; CHECK: define void @main() #1
+define void @main() #1 {
 entry:
   call void @"?main@@YAXXZ"()
   ret void
 }
 
 attributes #0 = { convergent noinline nounwind optnone}
-attributes #1 = { convergent noinline nounwind optnone "hlsl.export"}
-attributes #2 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"}
-attributes #3 = { convergent }
+attributes #1 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"}
+attributes #2 = { convergent }
 
 ; Make sure "hlsl.export" attribute is stripped by llc
 ; CHECK-LLC-NOT: "hlsl.export"

From 60a59e350bfa909d3caf5b5b0dba8b473746ea0f Mon Sep 17 00:00:00 2001
From: Yuta Saito <kateinoigakukun@gmail.com>
Date: Tue, 17 Jun 2025 06:23:50 +0900
Subject: [PATCH 0551/1322] [ASan] Recognize WASI platform in
 sanitizer_platform.h (#139017)

---
 compiler-rt/lib/sanitizer_common/sanitizer_platform.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
index 9f5f41cd8551..4c8d9a9b86be 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
@@ -14,7 +14,8 @@
 
 #if !defined(__linux__) && !defined(__FreeBSD__) && !defined(__NetBSD__) && \
     !defined(__APPLE__) && !defined(_WIN32) && !defined(__Fuchsia__) &&     \
-    !(defined(__sun__) && defined(__svr4__)) && !defined(__HAIKU__)
+    !(defined(__sun__) && defined(__svr4__)) && !defined(__HAIKU__) &&      \
+    !defined(__wasi__)
 #  error "This operating system is not supported"
 #endif
 
@@ -61,6 +62,12 @@
 #  define SANITIZER_HAIKU 0
 #endif
 
+#if defined(__wasi__)
+#  define SANITIZER_WASI 1
+#else
+#  define SANITIZER_WASI 0
+#endif
+
 // - SANITIZER_APPLE: all Apple code
 //   - TARGET_OS_OSX: macOS
 //   - SANITIZER_IOS: devices (iOS and iOS-like)

From 38daa6d4ef1f3386cc50198199c5ec61dcb012af Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Mon, 16 Jun 2025 21:28:51 +0000
Subject: [PATCH 0552/1322] [libc] build fix: always use our char8_t headers
 even in overlay mode (#144433)

Build fix caused by certain platforms not providing char8_t when
expected
Temporary fix to just always use our own definition, even in overlay
mode.
---
 libc/hdr/types/char8_t.h                         | 8 --------
 libc/src/__support/wchar/character_converter.cpp | 2 +-
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/libc/hdr/types/char8_t.h b/libc/hdr/types/char8_t.h
index 31de764658f9..4d71e3dd8909 100644
--- a/libc/hdr/types/char8_t.h
+++ b/libc/hdr/types/char8_t.h
@@ -9,14 +9,6 @@
 #ifndef LLVM_LIBC_HDR_TYPES_CHAR8_T_H
 #define LLVM_LIBC_HDR_TYPES_CHAR8_T_H
 
-#ifdef LIBC_FULL_BUILD
-
 #include "include/llvm-libc-types/char8_t.h"
 
-#else // overlay mode
-
-#include "hdr/uchar_overlay.h"
-
-#endif // LLVM_LIBC_FULL_BUILD
-
 #endif // LLVM_LIBC_HDR_TYPES_CHAR8_T_H
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index bac2f6d827e1..ca709769616c 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -70,7 +70,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   char32_t output;
 
   // Shift to get the next 6 bits from the utf32 encoding
-  const char32_t shift_amount =
+  const size_t shift_amount =
       (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
   if (state->bytes_processed == 0) {
     /*

From 95418bc8a8fd765d5e60e0c8ac7f8b77d2c15ef2 Mon Sep 17 00:00:00 2001
From: Justin King <jcking@wulver.com>
Date: Mon, 16 Jun 2025 14:29:08 -0700
Subject: [PATCH 0553/1322] lsan: Support free_sized and free_aligned_sized
 from C23 (#144415)

Adds support to LSan for `free_sized` and `free_aligned_sized` from C23.

Other sanitizers will be handled with their own separate PRs.

For #144435

Signed-off-by: Justin King <jcking@google.com>
---
 compiler-rt/lib/lsan/lsan_allocator.cpp       |  4 ++++
 compiler-rt/lib/lsan/lsan_allocator.h         |  2 ++
 compiler-rt/lib/lsan/lsan_interceptors.cpp    | 18 +++++++++++++++
 compiler-rt/lib/lsan/lsan_malloc_mac.cpp      | 23 +++++++++++--------
 .../sanitizer_common/sanitizer_malloc_mac.inc | 15 ++++++++++++
 5 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/compiler-rt/lib/lsan/lsan_allocator.cpp b/compiler-rt/lib/lsan/lsan_allocator.cpp
index 493bf5f9efc5..a436d9c07ac6 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.cpp
+++ b/compiler-rt/lib/lsan/lsan_allocator.cpp
@@ -220,6 +220,10 @@ void lsan_free(void *p) {
   Deallocate(p);
 }
 
+void lsan_free_sized(void *p, uptr) { Deallocate(p); }
+
+void lsan_free_aligned_sized(void *p, uptr, uptr) { Deallocate(p); }
+
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack) {
   return SetErrnoOnNull(Reallocate(stack, p, size, 1));
 }
diff --git a/compiler-rt/lib/lsan/lsan_allocator.h b/compiler-rt/lib/lsan/lsan_allocator.h
index 5eed0cbdb309..2342f11fb5d0 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.h
+++ b/compiler-rt/lib/lsan/lsan_allocator.h
@@ -127,6 +127,8 @@ void *lsan_aligned_alloc(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_memalign(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_malloc(uptr size, const StackTrace &stack);
 void lsan_free(void *p);
+void lsan_free_sized(void *p, uptr size);
+void lsan_free_aligned_sized(void *p, uptr alignment, uptr size);
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack);
 void *lsan_reallocarray(void *p, uptr nmemb, uptr size,
                         const StackTrace &stack);
diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp
index a8252cddacf2..8e33130840e9 100644
--- a/compiler-rt/lib/lsan/lsan_interceptors.cpp
+++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp
@@ -84,6 +84,24 @@ INTERCEPTOR(void, free, void *p) {
   lsan_free(p);
 }
 
+INTERCEPTOR(void, free_sized, void *p, uptr size) {
+  if (UNLIKELY(!p))
+    return;
+  if (DlsymAlloc::PointerIsMine(p))
+    return DlsymAlloc::Free(p);
+  ENSURE_LSAN_INITED;
+  lsan_free_sized(p, size);
+}
+
+INTERCEPTOR(void, free_aligned_sized, void *p, uptr alignment, uptr size) {
+  if (UNLIKELY(!p))
+    return;
+  if (DlsymAlloc::PointerIsMine(p))
+    return DlsymAlloc::Free(p);
+  ENSURE_LSAN_INITED;
+  lsan_free_aligned_sized(p, alignment, size);
+}
+
 INTERCEPTOR(void*, calloc, uptr nmemb, uptr size) {
   if (DlsymAlloc::Use())
     return DlsymAlloc::Callocate(nmemb, size);
diff --git a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
index 525c30272ccc..8a16c053da23 100644
--- a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
+++ b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
@@ -44,16 +44,19 @@ using namespace __lsan;
   void *p = lsan_valloc(size, stack)
 #define COMMON_MALLOC_FREE(ptr) \
   lsan_free(ptr)
-#define COMMON_MALLOC_SIZE(ptr) \
-  uptr size = lsan_mz_size(ptr)
-#define COMMON_MALLOC_FILL_STATS(zone, stats)
-#define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name) \
-  (void)zone_name; \
-  Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", ptr);
-#define COMMON_MALLOC_NAMESPACE __lsan
-#define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
-#define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
+#  define COMMON_MALLOC_FREE_SIZED(ptr, size) lsan_free_sized(ptr, size)
+#  define COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size) \
+    lsan_free_aligned_sized(ptr, alignment, size)
+#  define COMMON_MALLOC_SIZE(ptr) uptr size = lsan_mz_size(ptr)
+#  define COMMON_MALLOC_FILL_STATS(zone, stats)
+#  define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name)    \
+    (void)zone_name;                                                        \
+    Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", \
+           ptr);
+#  define COMMON_MALLOC_NAMESPACE __lsan
+#  define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
+#  define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
 
-#include "sanitizer_common/sanitizer_malloc_mac.inc"
+#  include "sanitizer_common/sanitizer_malloc_mac.inc"
 
 #endif // SANITIZER_APPLE
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
index 6343eb284afb..72ad22999b5a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
@@ -144,6 +144,21 @@ INTERCEPTOR(void, free, void *ptr) {
   COMMON_MALLOC_FREE(ptr);
 }
 
+#ifdef COMMON_MALLOC_FREE_SIZED
+INTERCEPTOR(void, free_sized, void *ptr, size_t size) {
+  COMMON_MALLOC_ENTER();
+  COMMON_MALLOC_FREE_SIZED(ptr, size);
+}
+#endif
+
+#ifdef COMMON_MALLOC_FREE_ALIGNED_SIZED
+INTERCEPTOR(void, free_aligned_sized, void *ptr, size_t alignment,
+            size_t size) {
+  COMMON_MALLOC_ENTER();
+  COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size);
+}
+#endif
+
 INTERCEPTOR(void *, realloc, void *ptr, size_t size) {
   COMMON_MALLOC_ENTER();
   COMMON_MALLOC_REALLOC(ptr, size);

From 9c25ca78f9bdfe74e5dbaa60a864411bdbae4943 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Mon, 16 Jun 2025 14:36:13 -0700
Subject: [PATCH 0554/1322] [flang] Don't generate module file for hermetic
 USE'd dependency (#144143)

It's possible for the module file generation code to think that it needs
to (re)generate a module file for a dependent module read from a
hermetic module file, if it defines contains a procedure imported via
renaming due to a name clash. Adjust the logic that determines whether a
module file should be written to include a check for having originated
in a module file.
---
 flang/lib/Semantics/mod-file.cpp   | 44 ++++++++++++++++--------------
 flang/test/Semantics/modfile79.F90 | 33 ++++++++++++++++++++++
 2 files changed, 56 insertions(+), 21 deletions(-)
 create mode 100644 flang/test/Semantics/modfile79.F90

diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index 9f9e9f584045..82c8536902eb 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -109,15 +109,14 @@ bool ModFileWriter::WriteAll() {
 }
 
 void ModFileWriter::WriteAll(const Scope &scope) {
-  for (const auto &child : scope.children()) {
+  for (const Scope &child : scope.children()) {
     WriteOne(child);
   }
 }
 
 void ModFileWriter::WriteOne(const Scope &scope) {
   if (scope.kind() == Scope::Kind::Module) {
-    auto *symbol{scope.symbol()};
-    if (!symbol->test(Symbol::Flag::ModFile)) {
+    if (const auto *symbol{scope.symbol()}) {
       Write(*symbol);
     }
     WriteAll(scope); // write out submodules
@@ -134,7 +133,7 @@ static std::string ModFileName(const SourceName &name,
 // Write the module file for symbol, which must be a module or submodule.
 void ModFileWriter::Write(const Symbol &symbol) {
   const auto &module{symbol.get<ModuleDetails>()};
-  if (module.moduleFileHash()) {
+  if (symbol.test(Symbol::Flag::ModFile) || module.moduleFileHash()) {
     return; // already written
   }
   const auto *ancestor{module.ancestor()};
@@ -372,16 +371,19 @@ void ModFileWriter::PutSymbols(
   CollectSymbols(scope, sorted, uses, modules);
   // Write module files for dependencies first so that their
   // hashes are known.
-  for (auto ref : modules) {
+  for (const Symbol &mod : modules) {
     if (hermeticModules) {
-      hermeticModules->insert(*ref);
+      hermeticModules->insert(mod);
     } else {
-      Write(*ref);
-      needs_ << ModHeader::need
-             << CheckSumString(
-                    ref->get<ModuleDetails>().moduleFileHash().value())
-             << (ref->owner().IsIntrinsicModules() ? " i " : " n ")
-             << ref->name().ToString() << '\n';
+      Write(mod);
+      // It's possible that the module's file already existed and
+      // without its own hash due to being embedded in a hermetic
+      // module file.
+      if (auto hash{mod.get<ModuleDetails>().moduleFileHash()}) {
+        needs_ << ModHeader::need << CheckSumString(*hash)
+               << (mod.owner().IsIntrinsicModules() ? " i " : " n ")
+               << mod.name().ToString() << '\n';
+      }
     }
   }
   std::string buf; // stuff after CONTAINS in derived type
@@ -855,25 +857,25 @@ void CollectSymbols(const Scope &scope, SymbolVector &sorted,
   auto symbols{scope.GetSymbols()};
   std::size_t commonSize{scope.commonBlocks().size()};
   sorted.reserve(symbols.size() + commonSize);
-  for (SymbolRef symbol : symbols) {
-    const auto *generic{symbol->detailsIf<GenericDetails>()};
+  for (const Symbol &symbol : symbols) {
+    const auto *generic{symbol.detailsIf<GenericDetails>()};
     if (generic) {
       uses.insert(uses.end(), generic->uses().begin(), generic->uses().end());
-      for (auto ref : generic->uses()) {
-        modules.insert(GetUsedModule(ref->get<UseDetails>()));
+      for (const Symbol &used : generic->uses()) {
+        modules.insert(GetUsedModule(used.get<UseDetails>()));
       }
-    } else if (const auto *use{symbol->detailsIf<UseDetails>()}) {
+    } else if (const auto *use{symbol.detailsIf<UseDetails>()}) {
       modules.insert(GetUsedModule(*use));
     }
-    if (symbol->test(Symbol::Flag::ParentComp)) {
-    } else if (symbol->has<NamelistDetails>()) {
+    if (symbol.test(Symbol::Flag::ParentComp)) {
+    } else if (symbol.has<NamelistDetails>()) {
       namelist.push_back(symbol);
     } else if (generic) {
       if (generic->specific() &&
-          &generic->specific()->owner() == &symbol->owner()) {
+          &generic->specific()->owner() == &symbol.owner()) {
         sorted.push_back(*generic->specific());
       } else if (generic->derivedType() &&
-          &generic->derivedType()->owner() == &symbol->owner()) {
+          &generic->derivedType()->owner() == &symbol.owner()) {
         sorted.push_back(*generic->derivedType());
       }
       generics.push_back(symbol);
diff --git a/flang/test/Semantics/modfile79.F90 b/flang/test/Semantics/modfile79.F90
new file mode 100644
index 000000000000..7d3b42166654
--- /dev/null
+++ b/flang/test/Semantics/modfile79.F90
@@ -0,0 +1,33 @@
+!RUN: %flang -c -DWHICH=1 %s && FileCheck %s <modfile79a.mod && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c %s && FileCheck %s <modfile79a.mod
+
+!Ensure that writing modfile79c.mod doesn't cause a spurious
+!regeneration of modfile79a.mod from its copy in the hermetic
+!module file modfile79b.mod.
+!CHECK: !mod$ v1 sum:93ec75fe672c5b6c
+!CHECK-NEXT: module modfile79a
+
+#if WHICH == 1
+module modfile79a
+  interface foo
+    module procedure foo
+  end interface
+ contains
+  subroutine foo
+  end
+end
+#elif WHICH == 2
+module modfile79b
+  use modfile79a
+  interface bar
+    procedure foo
+  end interface
+end
+#else
+module modfile79c
+  use modfile79b
+ contains
+  subroutine test
+    call bar
+  end
+end
+#endif

From 65b06cd983e59c25f30b680167559a4db2b44609 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Mon, 16 Jun 2025 14:36:35 -0700
Subject: [PATCH 0555/1322] [flang][runtime] Check SOURCE= conformability on
 ALLOCATE (#144113)

The SOURCE= expression of an ALLOCATE statement, when present and not
scalar, must conform to the shape of the allocated objects. Check this
at runtime, and return a recoverable error, or crash, when appropriate.

Fixes https://github.com/llvm/llvm-project/issues/143900.
---
 flang-rt/lib/runtime/allocatable.cpp   | 20 ++++++++++
 flang/lib/Semantics/check-allocate.cpp | 51 ++++++++++++++++++++++++++
 flang/test/Semantics/allocate11.f90    |  1 +
 3 files changed, 72 insertions(+)

diff --git a/flang-rt/lib/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp
index ef18da6ea078..f724f0a20884 100644
--- a/flang-rt/lib/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@@ -165,6 +165,26 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
       alloc, /*asyncObject=*/nullptr, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
+    if (alloc.rank() != source.rank() && source.rank() != 0) {
+      terminator.Crash("ALLOCATE object has rank %d while SOURCE= has rank %d",
+          alloc.rank(), source.rank());
+    }
+    if (int rank{source.rank()}; rank > 0) {
+      SubscriptValue allocExtent[maxRank], sourceExtent[maxRank];
+      alloc.GetShape(allocExtent);
+      source.GetShape(sourceExtent);
+      for (int j{0}; j < rank; ++j) {
+        if (allocExtent[j] != sourceExtent[j]) {
+          if (!hasStat) {
+            terminator.Crash("ALLOCATE object has extent %jd on dimension %d, "
+                             "but SOURCE= has extent %jd",
+                static_cast<std::intmax_t>(allocExtent[j]), j + 1,
+                static_cast<std::intmax_t>(sourceExtent[j]));
+          }
+          return StatInvalidExtent;
+        }
+      }
+    }
     DoFromSourceAssign(alloc, source, terminator);
   }
   return stat;
diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp
index 2c215f45bf51..08053594c12e 100644
--- a/flang/lib/Semantics/check-allocate.cpp
+++ b/flang/lib/Semantics/check-allocate.cpp
@@ -10,6 +10,7 @@
 #include "assignment.h"
 #include "definable.h"
 #include "flang/Evaluate/fold.h"
+#include "flang/Evaluate/shape.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Parser/tools.h"
@@ -33,6 +34,7 @@ struct AllocateCheckerInfo {
   bool gotMold{false};
   bool gotStream{false};
   bool gotPinned{false};
+  std::optional<evaluate::ConstantSubscripts> sourceExprShape;
 };
 
 class AllocationCheckerHelper {
@@ -259,6 +261,9 @@ static std::optional<AllocateCheckerInfo> CheckAllocateOptions(
           CheckCopyabilityInPureScope(messages, *expr, scope);
         }
       }
+      auto maybeShape{evaluate::GetShape(context.foldingContext(), *expr)};
+      info.sourceExprShape =
+          evaluate::AsConstantExtents(context.foldingContext(), maybeShape);
     } else {
       // Error already reported on source expression.
       // Do not continue allocate checks.
@@ -581,6 +586,52 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) {
             .Attach(
                 ultimate_->name(), "Declared here with rank %d"_en_US, rank_);
         return false;
+      } else if (allocateInfo_.gotSource && allocateInfo_.sourceExprShape &&
+          allocateInfo_.sourceExprShape->size() ==
+              static_cast<std::size_t>(allocateShapeSpecRank_)) {
+        std::size_t j{0};
+        for (const auto &shapeSpec :
+            std::get<std::list<parser::AllocateShapeSpec>>(allocation_.t)) {
+          if (j >= allocateInfo_.sourceExprShape->size()) {
+            break;
+          }
+          std::optional<evaluate::ConstantSubscript> lbound;
+          if (const auto &lb{std::get<0>(shapeSpec.t)}) {
+            lbound.reset();
+            const auto &lbExpr{lb->thing.thing.value()};
+            if (const auto *expr{GetExpr(context, lbExpr)}) {
+              auto folded{
+                  evaluate::Fold(context.foldingContext(), SomeExpr(*expr))};
+              lbound = evaluate::ToInt64(folded);
+              evaluate::SetExpr(lbExpr, std::move(folded));
+            }
+          } else {
+            lbound = 1;
+          }
+          if (lbound) {
+            const auto &ubExpr{std::get<1>(shapeSpec.t).thing.thing.value()};
+            if (const auto *expr{GetExpr(context, ubExpr)}) {
+              auto folded{
+                  evaluate::Fold(context.foldingContext(), SomeExpr(*expr))};
+              auto ubound{evaluate::ToInt64(folded)};
+              evaluate::SetExpr(ubExpr, std::move(folded));
+              if (ubound) {
+                auto extent{*ubound - *lbound + 1};
+                if (extent < 0) {
+                  extent = 0;
+                }
+                if (extent != allocateInfo_.sourceExprShape->at(j)) {
+                  context.Say(name_.source,
+                      "Allocation has extent %jd on dimension %d, but SOURCE= has extent %jd"_err_en_US,
+                      static_cast<std::intmax_t>(extent), j + 1,
+                      static_cast<std::intmax_t>(
+                          allocateInfo_.sourceExprShape->at(j)));
+                }
+              }
+            }
+          }
+          ++j;
+        }
       }
     }
   } else { // allocating a scalar object
diff --git a/flang/test/Semantics/allocate11.f90 b/flang/test/Semantics/allocate11.f90
index 1b7495e9fc07..8aeb069df09f 100644
--- a/flang/test/Semantics/allocate11.f90
+++ b/flang/test/Semantics/allocate11.f90
@@ -163,6 +163,7 @@ subroutine C938_C947(var2, ptr, ptr2, fptr, my_team, srca)
   allocate(var2(2)[5:*], MOLD=my_team)
   !ERROR: SOURCE or MOLD expression type must not be C_PTR or C_FUNPTR from ISO_C_BINDING when an allocatable object is a coarray
   allocate(var2(2)[5:*], MOLD=ptr)
+  !ERROR: Allocation has extent 2 on dimension 1, but SOURCE= has extent 9
   !ERROR: SOURCE or MOLD expression type must not be C_PTR or C_FUNPTR from ISO_C_BINDING when an allocatable object is a coarray
   allocate(var2(2)[5:*], SOURCE=ptr2)
   !ERROR: SOURCE or MOLD expression type must not be C_PTR or C_FUNPTR from ISO_C_BINDING when an allocatable object is a coarray

From 2bf3ccabfa37ee1b2d74da7b370cdb16a5cc8ac0 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Mon, 16 Jun 2025 14:37:01 -0700
Subject: [PATCH 0556/1322] [flang] Restructure runtime to avoid recursion
 (relanding) (#143993)

Recursion, both direct and indirect, prevents accurate stack size
calculation at link time for GPU device code. Restructure these
recursive (often mutually so) routines in the Fortran runtime with new
implementations based on an iterative work queue with
suspendable/resumable work tickets: Assign, Initialize, initializeClone,
Finalize, and Destroy.

Default derived type I/O is also recursive, but already disabled. It can
be added to this new framework later if the overall approach succeeds.

Note that derived type FINAL subroutine calls, defined assignments, and
defined I/O procedures all perform callbacks into user code, which may
well reenter the runtime library. This kind of recursion is not handled
by this change, although it may be possible to do so in the future using
thread-local work queues.

(Relanding this patch after reverting initial attempt due to some test
failures that needed some time to analyze and fix.)

Fixes https://github.com/llvm/llvm-project/issues/142481.
---
 .../include/flang-rt/runtime/environment.h    |   3 +
 flang-rt/include/flang-rt/runtime/stat.h      |  10 +-
 flang-rt/include/flang-rt/runtime/type-info.h |  15 +-
 .../include/flang-rt/runtime/work-queue.h     | 555 ++++++++++++++
 flang-rt/lib/runtime/CMakeLists.txt           |   2 +
 flang-rt/lib/runtime/assign.cpp               | 693 +++++++++++-------
 flang-rt/lib/runtime/derived.cpp              | 536 +++++++-------
 flang-rt/lib/runtime/descriptor-io.cpp        | 668 ++++++++++++++++-
 flang-rt/lib/runtime/descriptor-io.h          | 620 +---------------
 flang-rt/lib/runtime/environment.cpp          |   4 +
 flang-rt/lib/runtime/namelist.cpp             |   1 +
 flang-rt/lib/runtime/tools.cpp                |   4 +-
 flang-rt/lib/runtime/type-info.cpp            |  12 +-
 flang-rt/lib/runtime/work-queue.cpp           | 161 ++++
 flang-rt/unittests/Runtime/ExternalIOTest.cpp |   2 +-
 flang/docs/Extensions.md                      |  10 +
 flang/include/flang/Runtime/assign.h          |   3 +-
 flang/include/flang/Semantics/tools.h         |   7 +-
 flang/lib/Semantics/runtime-type-info.cpp     |  88 ++-
 flang/lib/Semantics/tools.cpp                 |  32 +
 flang/module/__fortran_type_info.f90          |   5 +-
 flang/test/Lower/volatile-openmp.f90          |   8 +-
 flang/test/Semantics/typeinfo01.f90           |  34 +-
 flang/test/Semantics/typeinfo03.f90           |   2 +-
 flang/test/Semantics/typeinfo04.f90           |   8 +-
 flang/test/Semantics/typeinfo05.f90           |   4 +-
 flang/test/Semantics/typeinfo06.f90           |   4 +-
 flang/test/Semantics/typeinfo07.f90           |   8 +-
 flang/test/Semantics/typeinfo08.f90           |   2 +-
 flang/test/Semantics/typeinfo11.f90           |   2 +-
 flang/test/Semantics/typeinfo12.f90           |  67 ++
 flang/test/Semantics/typeinfo13.f90           |   2 +-
 32 files changed, 2375 insertions(+), 1197 deletions(-)
 create mode 100644 flang-rt/include/flang-rt/runtime/work-queue.h
 create mode 100644 flang-rt/lib/runtime/work-queue.cpp
 create mode 100644 flang/test/Semantics/typeinfo12.f90

diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h
index 16258b3bbba9..e579f6012ce8 100644
--- a/flang-rt/include/flang-rt/runtime/environment.h
+++ b/flang-rt/include/flang-rt/runtime/environment.h
@@ -64,6 +64,9 @@ struct ExecutionEnvironment {
   bool defaultUTF8{false}; // DEFAULT_UTF8
   bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION
 
+  enum InternalDebugging { WorkQueue = 1 };
+  int internalDebugging{0}; // FLANG_RT_DEBUG
+
   // CUDA related variables
   std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE
   bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED
diff --git a/flang-rt/include/flang-rt/runtime/stat.h b/flang-rt/include/flang-rt/runtime/stat.h
index 070d0bf8673f..dc372de53506 100644
--- a/flang-rt/include/flang-rt/runtime/stat.h
+++ b/flang-rt/include/flang-rt/runtime/stat.h
@@ -24,7 +24,7 @@ class Terminator;
 enum Stat {
   StatOk = 0, // required to be zero by Fortran
 
-  // Interoperable STAT= codes
+  // Interoperable STAT= codes (>= 11)
   StatBaseNull = CFI_ERROR_BASE_ADDR_NULL,
   StatBaseNotNull = CFI_ERROR_BASE_ADDR_NOT_NULL,
   StatInvalidElemLen = CFI_INVALID_ELEM_LEN,
@@ -36,7 +36,7 @@ enum Stat {
   StatMemAllocation = CFI_ERROR_MEM_ALLOCATION,
   StatOutOfBounds = CFI_ERROR_OUT_OF_BOUNDS,
 
-  // Standard STAT= values
+  // Standard STAT= values (>= 101)
   StatFailedImage = FORTRAN_RUNTIME_STAT_FAILED_IMAGE,
   StatLocked = FORTRAN_RUNTIME_STAT_LOCKED,
   StatLockedOtherImage = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE,
@@ -49,10 +49,14 @@ enum Stat {
   // Additional "processor-defined" STAT= values
   StatInvalidArgumentNumber = FORTRAN_RUNTIME_STAT_INVALID_ARG_NUMBER,
   StatMissingArgument = FORTRAN_RUNTIME_STAT_MISSING_ARG,
-  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT,
+  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, // -1
   StatMoveAllocSameAllocatable =
       FORTRAN_RUNTIME_STAT_MOVE_ALLOC_SAME_ALLOCATABLE,
   StatBadPointerDeallocation = FORTRAN_RUNTIME_STAT_BAD_POINTER_DEALLOCATION,
+
+  // Dummy status for work queue continuation, declared here to perhaps
+  // avoid collisions
+  StatContinue = 201
 };
 
 RT_API_ATTRS const char *StatErrorString(int);
diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h
index 5e79efde164f..80301a313282 100644
--- a/flang-rt/include/flang-rt/runtime/type-info.h
+++ b/flang-rt/include/flang-rt/runtime/type-info.h
@@ -154,12 +154,17 @@ public:
   RT_API_ATTRS bool IsArgDescriptor(int zeroBasedArg) const {
     return (isArgDescriptorSet_ >> zeroBasedArg) & 1;
   }
-  RT_API_ATTRS bool isTypeBound() const { return isTypeBound_; }
+  RT_API_ATTRS bool IsTypeBound() const { return isTypeBound_ != 0; }
   RT_API_ATTRS bool IsArgContiguous(int zeroBasedArg) const {
     return (isArgContiguousSet_ >> zeroBasedArg) & 1;
   }
-  template <typename PROC> RT_API_ATTRS PROC GetProc() const {
-    return reinterpret_cast<PROC>(proc_);
+  template <typename PROC>
+  RT_API_ATTRS PROC GetProc(const Binding *bindings = nullptr) const {
+    if (bindings && isTypeBound_ > 0) {
+      return reinterpret_cast<PROC>(bindings[isTypeBound_ - 1].proc);
+    } else {
+      return reinterpret_cast<PROC>(proc_);
+    }
   }
 
   FILE *Dump(FILE *) const;
@@ -193,6 +198,8 @@ private:
   //     When false, the defined I/O subroutine must have been
   //     called via a generic interface, not a generic TBP.
   std::uint8_t isArgDescriptorSet_{0};
+  // When a special binding is type-bound, this is its binding's index (plus 1,
+  // so that 0 signifies that it's not type-bound).
   std::uint8_t isTypeBound_{0};
   // True when a FINAL subroutine has a dummy argument that is an array that
   // is CONTIGUOUS or neither assumed-rank nor assumed-shape.
@@ -240,6 +247,7 @@ public:
   RT_API_ATTRS bool noFinalizationNeeded() const {
     return noFinalizationNeeded_;
   }
+  RT_API_ATTRS bool noDefinedAssignment() const { return noDefinedAssignment_; }
 
   RT_API_ATTRS std::size_t LenParameters() const {
     return lenParameterKind().Elements();
@@ -322,6 +330,7 @@ private:
   bool noInitializationNeeded_{false};
   bool noDestructionNeeded_{false};
   bool noFinalizationNeeded_{false};
+  bool noDefinedAssignment_{false};
 };
 
 } // namespace Fortran::runtime::typeInfo
diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
new file mode 100644
index 000000000000..0daa7bc4d338
--- /dev/null
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -0,0 +1,555 @@
+//===-- include/flang-rt/runtime/work-queue.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Internal runtime utilities for work queues that replace the use of recursion
+// for better GPU device support.
+//
+// A work queue comprises a list of tickets.  Each ticket class has a Begin()
+// member function, which is called once, and a Continue() member function
+// that can be called zero or more times.  A ticket's execution terminates
+// when either of these member functions returns a status other than
+// StatContinue.  When that status is not StatOk, then the whole queue
+// is shut down.
+//
+// By returning StatContinue from its Continue() member function,
+// a ticket suspends its execution so that any nested tickets that it
+// may have created can be run to completion.  It is the reponsibility
+// of each ticket class to maintain resumption information in its state
+// and manage its own progress.  Most ticket classes inherit from
+// class ComponentsOverElements, which implements an outer loop over all
+// components of a derived type, and an inner loop over all elements
+// of a descriptor, possibly with multiple phases of execution per element.
+//
+// Tickets are created by WorkQueue::Begin...() member functions.
+// There is one of these for each "top level" recursive function in the
+// Fortran runtime support library that has been restructured into this
+// ticket framework.
+//
+// When the work queue is running tickets, it always selects the last ticket
+// on the list for execution -- "work stack" might have been a more accurate
+// name for this framework.  This ticket may, while doing its job, create
+// new tickets, and since those are pushed after the active one, the first
+// such nested ticket will be the next one executed to completion -- i.e.,
+// the order of nested WorkQueue::Begin...() calls is respected.
+// Note that a ticket's Continue() member function won't be called again
+// until all nested tickets have run to completion and it is once again
+// the last ticket on the queue.
+//
+// Example for an assignment to a derived type:
+// 1. Assign() is called, and its work queue is created.  It calls
+//    WorkQueue::BeginAssign() and then WorkQueue::Run().
+// 2. Run calls AssignTicket::Begin(), which pushes a tickets via
+//    BeginFinalize() and returns StatContinue.
+// 3. FinalizeTicket::Begin() and FinalizeTicket::Continue() are called
+//    until one of them returns StatOk, which ends the finalization ticket.
+// 4. AssignTicket::Continue() is then called; it creates a DerivedAssignTicket
+//    and then returns StatOk, which ends the ticket.
+// 5. At this point, only one ticket remains.  DerivedAssignTicket::Begin()
+//    and ::Continue() are called until they are done (not StatContinue).
+//    Along the way, it may create nested AssignTickets for components,
+//    and suspend itself so that they may each run to completion.
+
+#ifndef FLANG_RT_RUNTIME_WORK_QUEUE_H_
+#define FLANG_RT_RUNTIME_WORK_QUEUE_H_
+
+#include "flang-rt/runtime/connection.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang/Common/api-attrs.h"
+#include "flang/Runtime/freestanding-tools.h"
+#include <flang/Common/variant.h>
+
+namespace Fortran::runtime::io {
+class IoStatementState;
+struct NonTbpDefinedIoTable;
+} // namespace Fortran::runtime::io
+
+namespace Fortran::runtime {
+class Terminator;
+class WorkQueue;
+
+// Ticket worker base classes
+
+template <typename TICKET> class ImmediateTicketRunner {
+public:
+  RT_API_ATTRS explicit ImmediateTicketRunner(TICKET &ticket)
+      : ticket_{ticket} {}
+  RT_API_ATTRS int Run(WorkQueue &workQueue) {
+    int status{ticket_.Begin(workQueue)};
+    while (status == StatContinue) {
+      status = ticket_.Continue(workQueue);
+    }
+    return status;
+  }
+
+private:
+  TICKET &ticket_;
+};
+
+// Base class for ticket workers that operate elementwise over descriptors
+class Elementwise {
+public:
+  RT_API_ATTRS Elementwise(
+      const Descriptor &instance, const Descriptor *from = nullptr)
+      : instance_{instance}, from_{from} {
+    instance_.GetLowerBounds(subscripts_);
+    if (from_) {
+      from_->GetLowerBounds(fromSubscripts_);
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return elementAt_ >= elements_; }
+  RT_API_ATTRS void Advance() {
+    ++elementAt_;
+    instance_.IncrementSubscripts(subscripts_);
+    if (from_) {
+      from_->IncrementSubscripts(fromSubscripts_);
+    }
+  }
+  RT_API_ATTRS void SkipToEnd() { elementAt_ = elements_; }
+  RT_API_ATTRS void Reset() {
+    elementAt_ = 0;
+    instance_.GetLowerBounds(subscripts_);
+    if (from_) {
+      from_->GetLowerBounds(fromSubscripts_);
+    }
+  }
+
+protected:
+  const Descriptor &instance_, *from_{nullptr};
+  std::size_t elements_{instance_.Elements()};
+  std::size_t elementAt_{0};
+  SubscriptValue subscripts_[common::maxRank];
+  SubscriptValue fromSubscripts_[common::maxRank];
+};
+
+// Base class for ticket workers that operate over derived type components.
+class Componentwise {
+public:
+  RT_API_ATTRS Componentwise(const typeInfo::DerivedType &);
+  RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; }
+  RT_API_ATTRS void Advance() {
+    ++componentAt_;
+    GetComponent();
+  }
+  RT_API_ATTRS void SkipToEnd() {
+    component_ = nullptr;
+    componentAt_ = components_;
+  }
+  RT_API_ATTRS void Reset() {
+    component_ = nullptr;
+    componentAt_ = 0;
+    GetComponent();
+  }
+  RT_API_ATTRS void GetComponent();
+
+protected:
+  const typeInfo::DerivedType &derived_;
+  std::size_t components_{0}, componentAt_{0};
+  const typeInfo::Component *component_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> componentDescriptor_;
+};
+
+// Base class for ticket workers that operate over derived type components
+// in an outer loop, and elements in an inner loop.
+class ComponentsOverElements : public Componentwise, public Elementwise {
+public:
+  RT_API_ATTRS ComponentsOverElements(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
+      : Componentwise{derived}, Elementwise{instance, from} {
+    if (Elementwise::IsComplete()) {
+      Componentwise::SkipToEnd();
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return Componentwise::IsComplete(); }
+  RT_API_ATTRS void Advance() {
+    SkipToNextElement();
+    if (Elementwise::IsComplete()) {
+      Elementwise::Reset();
+      Componentwise::Advance();
+    }
+  }
+  RT_API_ATTRS void SkipToNextElement() {
+    phase_ = 0;
+    Elementwise::Advance();
+  }
+  RT_API_ATTRS void SkipToNextComponent() {
+    phase_ = 0;
+    Elementwise::Reset();
+    Componentwise::Advance();
+  }
+  RT_API_ATTRS void Reset() {
+    phase_ = 0;
+    Elementwise::Reset();
+    Componentwise::Reset();
+  }
+
+protected:
+  int phase_{0};
+};
+
+// Base class for ticket workers that operate over elements in an outer loop,
+// type components in an inner loop.
+class ElementsOverComponents : public Elementwise, public Componentwise {
+public:
+  RT_API_ATTRS ElementsOverComponents(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
+      : Elementwise{instance, from}, Componentwise{derived} {
+    if (Componentwise::IsComplete()) {
+      Elementwise::SkipToEnd();
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return Elementwise::IsComplete(); }
+  RT_API_ATTRS void Advance() {
+    SkipToNextComponent();
+    if (Componentwise::IsComplete()) {
+      Componentwise::Reset();
+      Elementwise::Advance();
+    }
+  }
+  RT_API_ATTRS void SkipToNextComponent() {
+    phase_ = 0;
+    Componentwise::Advance();
+  }
+  RT_API_ATTRS void SkipToNextElement() {
+    phase_ = 0;
+    Componentwise::Reset();
+    Elementwise::Advance();
+  }
+
+protected:
+  int phase_{0};
+};
+
+// Ticket worker classes
+
+// Implements derived type instance initialization
+class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
+                         private ComponentsOverElements {
+public:
+  RT_API_ATTRS InitializeTicket(
+      const Descriptor &instance, const typeInfo::DerivedType &derived)
+      : ImmediateTicketRunner<InitializeTicket>{*this},
+        ComponentsOverElements{instance, derived} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+};
+
+// Initializes one derived type instance from the value of another
+class InitializeCloneTicket
+    : public ImmediateTicketRunner<InitializeCloneTicket>,
+      private ComponentsOverElements {
+public:
+  RT_API_ATTRS InitializeCloneTicket(const Descriptor &clone,
+      const Descriptor &original, const typeInfo::DerivedType &derived,
+      bool hasStat, const Descriptor *errMsg)
+      : ImmediateTicketRunner<InitializeCloneTicket>{*this},
+        ComponentsOverElements{original, derived}, clone_{clone},
+        hasStat_{hasStat}, errMsg_{errMsg} {}
+  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  const Descriptor &clone_;
+  bool hasStat_{false};
+  const Descriptor *errMsg_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> cloneComponentDescriptor_;
+};
+
+// Implements derived type instance finalization
+class FinalizeTicket : public ImmediateTicketRunner<FinalizeTicket>,
+                       private ComponentsOverElements {
+public:
+  RT_API_ATTRS FinalizeTicket(
+      const Descriptor &instance, const typeInfo::DerivedType &derived)
+      : ImmediateTicketRunner<FinalizeTicket>{*this},
+        ComponentsOverElements{instance, derived} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  const typeInfo::DerivedType *finalizableParentType_{nullptr};
+};
+
+// Implements derived type instance destruction
+class DestroyTicket : public ImmediateTicketRunner<DestroyTicket>,
+                      private ComponentsOverElements {
+public:
+  RT_API_ATTRS DestroyTicket(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, bool finalize)
+      : ImmediateTicketRunner<DestroyTicket>{*this},
+        ComponentsOverElements{instance, derived}, finalize_{finalize} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  bool finalize_{false};
+};
+
+// Implements general intrinsic assignment
+class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
+public:
+  RT_API_ATTRS AssignTicket(Descriptor &to, const Descriptor &from, int flags,
+      MemmoveFct memmoveFct, const typeInfo::DerivedType *declaredType)
+      : ImmediateTicketRunner<AssignTicket>{*this}, to_{to}, from_{&from},
+        flags_{flags}, memmoveFct_{memmoveFct}, declaredType_{declaredType} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  RT_API_ATTRS bool IsSimpleMemmove() const {
+    return !toDerived_ && to_.rank() == from_->rank() && to_.IsContiguous() &&
+        from_->IsContiguous() && to_.ElementBytes() == from_->ElementBytes();
+  }
+  RT_API_ATTRS Descriptor &GetTempDescriptor();
+
+  Descriptor &to_;
+  const Descriptor *from_{nullptr};
+  int flags_{0}; // enum AssignFlags
+  MemmoveFct memmoveFct_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> tempDescriptor_;
+  const typeInfo::DerivedType *declaredType_{nullptr};
+  const typeInfo::DerivedType *toDerived_{nullptr};
+  Descriptor *toDeallocate_{nullptr};
+  bool persist_{false};
+  bool done_{false};
+};
+
+// Implements derived type intrinsic assignment.
+template <bool IS_COMPONENTWISE>
+class DerivedAssignTicket
+    : public ImmediateTicketRunner<DerivedAssignTicket<IS_COMPONENTWISE>>,
+      private std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+          ElementsOverComponents> {
+public:
+  using Base = std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+      ElementsOverComponents>;
+  RT_API_ATTRS DerivedAssignTicket(const Descriptor &to, const Descriptor &from,
+      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
+      Descriptor *deallocateAfter)
+      : ImmediateTicketRunner<DerivedAssignTicket>{*this},
+        Base{to, derived, &from}, flags_{flags}, memmoveFct_{memmoveFct},
+        deallocateAfter_{deallocateAfter} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  static constexpr bool isComponentwise_{IS_COMPONENTWISE};
+  bool toIsContiguous_{this->instance_.IsContiguous()};
+  bool fromIsContiguous_{this->from_->IsContiguous()};
+  int flags_{0};
+  MemmoveFct memmoveFct_{nullptr};
+  Descriptor *deallocateAfter_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> fromComponentDescriptor_;
+};
+
+namespace io::descr {
+
+template <io::Direction DIR>
+class DescriptorIoTicket
+    : public ImmediateTicketRunner<DescriptorIoTicket<DIR>>,
+      private Elementwise {
+public:
+  RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io,
+      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
+      bool &anyIoTookPlace)
+      : ImmediateTicketRunner<DescriptorIoTicket>(*this),
+        Elementwise{descriptor}, io_{io}, table_{table},
+        anyIoTookPlace_{anyIoTookPlace} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+  RT_API_ATTRS bool &anyIoTookPlace() { return anyIoTookPlace_; }
+
+private:
+  io::IoStatementState &io_;
+  const io::NonTbpDefinedIoTable *table_{nullptr};
+  bool &anyIoTookPlace_;
+  common::optional<typeInfo::SpecialBinding> nonTbpSpecial_;
+  const typeInfo::DerivedType *derived_{nullptr};
+  const typeInfo::SpecialBinding *special_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> elementDescriptor_;
+};
+
+template <io::Direction DIR>
+class DerivedIoTicket : public ImmediateTicketRunner<DerivedIoTicket<DIR>>,
+                        private ElementsOverComponents {
+public:
+  RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io,
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace)
+      : ImmediateTicketRunner<DerivedIoTicket>(*this),
+        ElementsOverComponents{descriptor, derived}, io_{io}, table_{table},
+        anyIoTookPlace_{anyIoTookPlace} {}
+  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  io::IoStatementState &io_;
+  const io::NonTbpDefinedIoTable *table_{nullptr};
+  bool &anyIoTookPlace_;
+};
+
+} // namespace io::descr
+
+struct NullTicket {
+  RT_API_ATTRS int Begin(WorkQueue &) const { return StatOk; }
+  RT_API_ATTRS int Continue(WorkQueue &) const { return StatOk; }
+};
+
+struct Ticket {
+  RT_API_ATTRS int Continue(WorkQueue &);
+  bool begun{false};
+  std::variant<NullTicket, InitializeTicket, InitializeCloneTicket,
+      FinalizeTicket, DestroyTicket, AssignTicket, DerivedAssignTicket<false>,
+      DerivedAssignTicket<true>,
+      io::descr::DescriptorIoTicket<io::Direction::Output>,
+      io::descr::DescriptorIoTicket<io::Direction::Input>,
+      io::descr::DerivedIoTicket<io::Direction::Output>,
+      io::descr::DerivedIoTicket<io::Direction::Input>>
+      u;
+};
+
+class WorkQueue {
+public:
+  RT_API_ATTRS explicit WorkQueue(Terminator &terminator)
+      : terminator_{terminator} {
+    for (int j{1}; j < numStatic_; ++j) {
+      static_[j].previous = &static_[j - 1];
+      static_[j - 1].next = &static_[j];
+    }
+  }
+  RT_API_ATTRS ~WorkQueue();
+  RT_API_ATTRS Terminator &terminator() { return terminator_; };
+
+  // APIs for particular tasks.  These can return StatOk if the work is
+  // completed immediately.
+  RT_API_ATTRS int BeginInitialize(
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+    if (runTicketsImmediately_) {
+      return InitializeTicket{descriptor, derived}.Run(*this);
+    } else {
+      StartTicket().u.emplace<InitializeTicket>(descriptor, derived);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginInitializeClone(const Descriptor &clone,
+      const Descriptor &original, const typeInfo::DerivedType &derived,
+      bool hasStat, const Descriptor *errMsg) {
+    if (runTicketsImmediately_) {
+      return InitializeCloneTicket{clone, original, derived, hasStat, errMsg}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<InitializeCloneTicket>(
+          clone, original, derived, hasStat, errMsg);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginFinalize(
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+    if (runTicketsImmediately_) {
+      return FinalizeTicket{descriptor, derived}.Run(*this);
+    } else {
+      StartTicket().u.emplace<FinalizeTicket>(descriptor, derived);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginDestroy(const Descriptor &descriptor,
+      const typeInfo::DerivedType &derived, bool finalize) {
+    if (runTicketsImmediately_) {
+      return DestroyTicket{descriptor, derived, finalize}.Run(*this);
+    } else {
+      StartTicket().u.emplace<DestroyTicket>(descriptor, derived, finalize);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginAssign(Descriptor &to, const Descriptor &from,
+      int flags, MemmoveFct memmoveFct,
+      const typeInfo::DerivedType *declaredType) {
+    if (runTicketsImmediately_) {
+      return AssignTicket{to, from, flags, memmoveFct, declaredType}.Run(*this);
+    } else {
+      StartTicket().u.emplace<AssignTicket>(
+          to, from, flags, memmoveFct, declaredType);
+      return StatContinue;
+    }
+  }
+  template <bool IS_COMPONENTWISE>
+  RT_API_ATTRS int BeginDerivedAssign(Descriptor &to, const Descriptor &from,
+      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
+      Descriptor *deallocateAfter) {
+    if (runTicketsImmediately_) {
+      return DerivedAssignTicket<IS_COMPONENTWISE>{
+          to, from, derived, flags, memmoveFct, deallocateAfter}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<DerivedAssignTicket<IS_COMPONENTWISE>>(
+          to, from, derived, flags, memmoveFct, deallocateAfter);
+      return StatContinue;
+    }
+  }
+  template <io::Direction DIR>
+  RT_API_ATTRS int BeginDescriptorIo(io::IoStatementState &io,
+      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
+      bool &anyIoTookPlace) {
+    if (runTicketsImmediately_) {
+      return io::descr::DescriptorIoTicket<DIR>{
+          io, descriptor, table, anyIoTookPlace}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<io::descr::DescriptorIoTicket<DIR>>(
+          io, descriptor, table, anyIoTookPlace);
+      return StatContinue;
+    }
+  }
+  template <io::Direction DIR>
+  RT_API_ATTRS int BeginDerivedIo(io::IoStatementState &io,
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) {
+    if (runTicketsImmediately_) {
+      return io::descr::DerivedIoTicket<DIR>{
+          io, descriptor, derived, table, anyIoTookPlace}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<io::descr::DerivedIoTicket<DIR>>(
+          io, descriptor, derived, table, anyIoTookPlace);
+      return StatContinue;
+    }
+  }
+
+  RT_API_ATTRS int Run();
+
+private:
+#if RT_DEVICE_COMPILATION
+  // Always use the work queue on a GPU device to avoid recursion.
+  static constexpr bool runTicketsImmediately_{false};
+#else
+  // Avoid the work queue overhead on the host, unless it needs
+  // debugging, which is so much easier there.
+  static constexpr bool runTicketsImmediately_{true};
+#endif
+
+  // Most uses of the work queue won't go very deep.
+  static constexpr int numStatic_{2};
+
+  struct TicketList {
+    bool isStatic{true};
+    Ticket ticket;
+    TicketList *previous{nullptr}, *next{nullptr};
+  };
+
+  RT_API_ATTRS Ticket &StartTicket();
+  RT_API_ATTRS void Stop();
+
+  Terminator &terminator_;
+  TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr};
+  TicketList static_[numStatic_];
+  TicketList *firstFree_{static_};
+};
+
+} // namespace Fortran::runtime
+#endif // FLANG_RT_RUNTIME_WORK_QUEUE_H_
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index a3f63b431564..332c0872e065 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -68,6 +68,7 @@ set(supported_sources
   type-info.cpp
   unit.cpp
   utf.cpp
+  work-queue.cpp
 )
 
 # List of source not used for GPU offloading.
@@ -131,6 +132,7 @@ set(gpu_sources
   type-code.cpp
   type-info.cpp
   utf.cpp
+  work-queue.cpp
   complex-powi.cpp
   reduce.cpp
   reduction.cpp
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index bf67b5dc8b64..f936a4192a33 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -14,6 +14,7 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -62,9 +63,22 @@ static inline RT_API_ATTRS bool MustDeallocateLHS(
     // Distinct shape? Deallocate
     int rank{to.rank()};
     for (int j{0}; j < rank; ++j) {
-      if (to.GetDimension(j).Extent() != from.GetDimension(j).Extent()) {
+      const auto &toDim{to.GetDimension(j)};
+      const auto &fromDim{from.GetDimension(j)};
+      if (toDim.Extent() != fromDim.Extent()) {
         return true;
       }
+      if ((flags & UpdateLHSBounds) &&
+          toDim.LowerBound() != fromDim.LowerBound()) {
+        return true;
+      }
+    }
+  }
+  // Not reallocating; may have to update bounds
+  if (flags & UpdateLHSBounds) {
+    int rank{to.rank()};
+    for (int j{0}; j < rank; ++j) {
+      to.GetDimension(j).SetLowerBound(from.GetDimension(j).LowerBound());
     }
   }
   return false;
@@ -102,11 +116,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS(
     toDim.SetByteStride(stride);
     stride *= toDim.Extent();
   }
-  int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))};
-  if (result == StatOk && derived && !derived->noInitializationNeeded()) {
-    result = ReturnError(terminator, Initialize(to, *derived, terminator));
-  }
-  return result;
+  return ReturnError(terminator, to.Allocate(kNoAsyncObject));
 }
 
 // least <= 0, most >= 0
@@ -169,24 +179,27 @@ static RT_API_ATTRS bool MayAlias(const Descriptor &x, const Descriptor &y) {
 }
 
 static RT_API_ATTRS void DoScalarDefinedAssignment(const Descriptor &to,
-    const Descriptor &from, const typeInfo::SpecialBinding &special) {
+    const Descriptor &from, const typeInfo::DerivedType &derived,
+    const typeInfo::SpecialBinding &special) {
   bool toIsDesc{special.IsArgDescriptor(0)};
   bool fromIsDesc{special.IsArgDescriptor(1)};
+  const auto *bindings{
+      derived.binding().OffsetElement<const typeInfo::Binding>()};
   if (toIsDesc) {
     if (fromIsDesc) {
-      auto *p{
-          special.GetProc<void (*)(const Descriptor &, const Descriptor &)>()};
+      auto *p{special.GetProc<void (*)(const Descriptor &, const Descriptor &)>(
+          bindings)};
       p(to, from);
     } else {
-      auto *p{special.GetProc<void (*)(const Descriptor &, void *)>()};
+      auto *p{special.GetProc<void (*)(const Descriptor &, void *)>(bindings)};
       p(to, from.raw().base_addr);
     }
   } else {
     if (fromIsDesc) {
-      auto *p{special.GetProc<void (*)(void *, const Descriptor &)>()};
+      auto *p{special.GetProc<void (*)(void *, const Descriptor &)>(bindings)};
       p(to.raw().base_addr, from);
     } else {
-      auto *p{special.GetProc<void (*)(void *, void *)>()};
+      auto *p{special.GetProc<void (*)(void *, void *)>(bindings)};
       p(to.raw().base_addr, from.raw().base_addr);
     }
   }
@@ -208,7 +221,7 @@ static RT_API_ATTRS void DoElementalDefinedAssignment(const Descriptor &to,
        to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
     toElementDesc.set_base_addr(to.Element<char>(toAt));
     fromElementDesc.set_base_addr(from.Element<char>(fromAt));
-    DoScalarDefinedAssignment(toElementDesc, fromElementDesc, special);
+    DoScalarDefinedAssignment(toElementDesc, fromElementDesc, derived, special);
   }
 }
 
@@ -231,6 +244,8 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
   }
 }
 
+RT_OFFLOAD_API_GROUP_BEGIN
+
 // Common implementation of assignments, both intrinsic assignments and
 // those cases of polymorphic user-defined ASSIGNMENT(=) TBPs that could not
 // be resolved in semantics.  Most assignment statements do not need any
@@ -244,275 +259,461 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
 // dealing with array constructors.
 RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
     Terminator &terminator, int flags, MemmoveFct memmoveFct) {
-  bool mustDeallocateLHS{(flags & DeallocateLHS) ||
-      MustDeallocateLHS(to, from, terminator, flags)};
-  DescriptorAddendum *toAddendum{to.Addendum()};
-  const typeInfo::DerivedType *toDerived{
-      toAddendum ? toAddendum->derivedType() : nullptr};
-  if (toDerived && (flags & NeedFinalization) &&
-      toDerived->noFinalizationNeeded()) {
-    flags &= ~NeedFinalization;
+  WorkQueue workQueue{terminator};
+  if (workQueue.BeginAssign(to, from, flags, memmoveFct, nullptr) ==
+      StatContinue) {
+    workQueue.Run();
   }
-  std::size_t toElementBytes{to.ElementBytes()};
-  std::size_t fromElementBytes{from.ElementBytes()};
-  // The following lambda definition violates the conding style,
-  // but cuda-11.8 nvcc hits an internal error with the brace initialization.
-  auto isSimpleMemmove = [&]() {
-    return !toDerived && to.rank() == from.rank() && to.IsContiguous() &&
-        from.IsContiguous() && toElementBytes == fromElementBytes;
-  };
-  StaticDescriptor<maxRank, true, 10 /*?*/> deferredDeallocStatDesc;
-  Descriptor *deferDeallocation{nullptr};
-  if (MayAlias(to, from)) {
+}
+
+RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) {
+  bool mustDeallocateLHS{(flags_ & DeallocateLHS) ||
+      MustDeallocateLHS(to_, *from_, workQueue.terminator(), flags_)};
+  DescriptorAddendum *toAddendum{to_.Addendum()};
+  toDerived_ = toAddendum ? toAddendum->derivedType() : nullptr;
+  if (toDerived_ && (flags_ & NeedFinalization) &&
+      toDerived_->noFinalizationNeeded()) {
+    flags_ &= ~NeedFinalization;
+  }
+  if (MayAlias(to_, *from_)) {
     if (mustDeallocateLHS) {
-      deferDeallocation = &deferredDeallocStatDesc.descriptor();
+      // Convert the LHS into a temporary, then make it look deallocated.
+      toDeallocate_ = &tempDescriptor_.descriptor();
+      persist_ = true; // tempDescriptor_ state must outlive child tickets
       std::memcpy(
-          reinterpret_cast<void *>(deferDeallocation), &to, to.SizeInBytes());
-      to.set_base_addr(nullptr);
-    } else if (!isSimpleMemmove()) {
+          reinterpret_cast<void *>(toDeallocate_), &to_, to_.SizeInBytes());
+      to_.set_base_addr(nullptr);
+      if (toDerived_ && (flags_ & NeedFinalization)) {
+        if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)};
+            status != StatOk && status != StatContinue) {
+          return status;
+        }
+        flags_ &= ~NeedFinalization;
+      }
+    } else if (!IsSimpleMemmove()) {
       // Handle LHS/RHS aliasing by copying RHS into a temp, then
       // recursively assigning from that temp.
-      auto descBytes{from.SizeInBytes()};
-      StaticDescriptor<maxRank, true, 16> staticDesc;
-      Descriptor &newFrom{staticDesc.descriptor()};
-      std::memcpy(reinterpret_cast<void *>(&newFrom), &from, descBytes);
+      auto descBytes{from_->SizeInBytes()};
+      Descriptor &newFrom{tempDescriptor_.descriptor()};
+      persist_ = true; // tempDescriptor_ state must outlive child tickets
+      std::memcpy(reinterpret_cast<void *>(&newFrom), from_, descBytes);
       // Pretend the temporary descriptor is for an ALLOCATABLE
       // entity, otherwise, the Deallocate() below will not
       // free the descriptor memory.
       newFrom.raw().attribute = CFI_attribute_allocatable;
-      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))};
-      if (stat == StatOk) {
-        if (HasDynamicComponent(from)) {
-          // If 'from' has allocatable/automatic component, we cannot
-          // just make a shallow copy of the descriptor member.
-          // This will still leave data overlap in 'to' and 'newFrom'.
-          // For example:
-          //   type t
-          //     character, allocatable :: c(:)
-          //   end type t
-          //   type(t) :: x(3)
-          //   x(2:3) = x(1:2)
-          // We have to make a deep copy into 'newFrom' in this case.
-          RTNAME(AssignTemporary)
-          (newFrom, from, terminator.sourceFileName(), terminator.sourceLine());
-        } else {
-          ShallowCopy(newFrom, from, true, from.IsContiguous());
-        }
-        Assign(to, newFrom, terminator,
-            flags &
-                (NeedFinalization | ComponentCanBeDefinedAssignment |
-                    ExplicitLengthCharacterLHS | CanBeDefinedAssignment));
-        newFrom.Deallocate();
+      if (int stat{ReturnError(
+              workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))};
+          stat != StatOk) {
+        return stat;
       }
-      return;
-    }
-  }
-  if (to.IsAllocatable()) {
-    if (mustDeallocateLHS) {
-      if (deferDeallocation) {
-        if ((flags & NeedFinalization) && toDerived) {
-          Finalize(*deferDeallocation, *toDerived, &terminator);
-          flags &= ~NeedFinalization;
-        }
-      } else {
-        to.Destroy((flags & NeedFinalization) != 0, /*destroyPointers=*/false,
-            &terminator);
-        flags &= ~NeedFinalization;
-      }
-    } else if (to.rank() != from.rank() && !to.IsAllocated()) {
-      terminator.Crash("Assign: mismatched ranks (%d != %d) in assignment to "
-                       "unallocated allocatable",
-          to.rank(), from.rank());
-    }
-    if (!to.IsAllocated()) {
-      if (AllocateAssignmentLHS(to, from, terminator, flags) != StatOk) {
-        return;
-      }
-      flags &= ~NeedFinalization;
-      toElementBytes = to.ElementBytes(); // may have changed
-      toDerived = toAddendum ? toAddendum->derivedType() : nullptr;
-    }
-  }
-  if (toDerived && (flags & CanBeDefinedAssignment)) {
-    // Check for a user-defined assignment type-bound procedure;
-    // see 10.2.1.4-5.  A user-defined assignment TBP defines all of
-    // the semantics, including allocatable (re)allocation and any
-    // finalization.
-    //
-    // Note that the aliasing and LHS (re)allocation handling above
-    // needs to run even with CanBeDefinedAssignment flag, when
-    // the Assign() is invoked recursively for component-per-component
-    // assignments.
-    if (to.rank() == 0) {
-      if (const auto *special{toDerived->FindSpecialBinding(
-              typeInfo::SpecialBinding::Which::ScalarAssignment)}) {
-        return DoScalarDefinedAssignment(to, from, *special);
-      }
-    }
-    if (const auto *special{toDerived->FindSpecialBinding(
-            typeInfo::SpecialBinding::Which::ElementalAssignment)}) {
-      return DoElementalDefinedAssignment(to, from, *toDerived, *special);
-    }
-  }
-  SubscriptValue toAt[maxRank];
-  to.GetLowerBounds(toAt);
-  // Scalar expansion of the RHS is implied by using the same empty
-  // subscript values on each (seemingly) elemental reference into
-  // "from".
-  SubscriptValue fromAt[maxRank];
-  from.GetLowerBounds(fromAt);
-  std::size_t toElements{to.Elements()};
-  if (from.rank() > 0 && toElements != from.Elements()) {
-    terminator.Crash("Assign: mismatching element counts in array assignment "
-                     "(to %zd, from %zd)",
-        toElements, from.Elements());
-  }
-  if (to.type() != from.type()) {
-    terminator.Crash("Assign: mismatching types (to code %d != from code %d)",
-        to.type().raw(), from.type().raw());
-  }
-  if (toElementBytes > fromElementBytes && !to.type().IsCharacter()) {
-    terminator.Crash("Assign: mismatching non-character element sizes (to %zd "
-                     "bytes != from %zd bytes)",
-        toElementBytes, fromElementBytes);
-  }
-  if (const typeInfo::DerivedType *
-      updatedToDerived{toAddendum ? toAddendum->derivedType() : nullptr}) {
-    // Derived type intrinsic assignment, which is componentwise and elementwise
-    // for all components, including parent components (10.2.1.2-3).
-    // The target is first finalized if still necessary (7.5.6.3(1))
-    if (flags & NeedFinalization) {
-      Finalize(to, *updatedToDerived, &terminator);
-    } else if (updatedToDerived && !updatedToDerived->noDestructionNeeded()) {
-      Destroy(to, /*finalize=*/false, *updatedToDerived, &terminator);
-    }
-    // Copy the data components (incl. the parent) first.
-    const Descriptor &componentDesc{updatedToDerived->component()};
-    std::size_t numComponents{componentDesc.Elements()};
-    for (std::size_t j{0}; j < toElements;
-         ++j, to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-      for (std::size_t k{0}; k < numComponents; ++k) {
-        const auto &comp{
-            *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
-                k)}; // TODO: exploit contiguity here
-        // Use PolymorphicLHS for components so that the right things happen
-        // when the components are polymorphic; when they're not, they're both
-        // not, and their declared types will match.
-        int nestedFlags{MaybeReallocate | PolymorphicLHS};
-        if (flags & ComponentCanBeDefinedAssignment) {
-          nestedFlags |=
-              CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
-        }
-        switch (comp.genre()) {
-        case typeInfo::Component::Genre::Data:
-          if (comp.category() == TypeCategory::Derived) {
-            StaticDescriptor<maxRank, true, 10 /*?*/> statDesc[2];
-            Descriptor &toCompDesc{statDesc[0].descriptor()};
-            Descriptor &fromCompDesc{statDesc[1].descriptor()};
-            comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt);
-            comp.CreatePointerDescriptor(
-                fromCompDesc, from, terminator, fromAt);
-            Assign(toCompDesc, fromCompDesc, terminator, nestedFlags);
-          } else { // Component has intrinsic type; simply copy raw bytes
-            std::size_t componentByteSize{comp.SizeInBytes(to)};
-            memmoveFct(to.Element<char>(toAt) + comp.offset(),
-                from.Element<const char>(fromAt) + comp.offset(),
-                componentByteSize);
-          }
-          break;
-        case typeInfo::Component::Genre::Pointer: {
-          std::size_t componentByteSize{comp.SizeInBytes(to)};
-          memmoveFct(to.Element<char>(toAt) + comp.offset(),
-              from.Element<const char>(fromAt) + comp.offset(),
-              componentByteSize);
-        } break;
-        case typeInfo::Component::Genre::Allocatable:
-        case typeInfo::Component::Genre::Automatic: {
-          auto *toDesc{reinterpret_cast<Descriptor *>(
-              to.Element<char>(toAt) + comp.offset())};
-          const auto *fromDesc{reinterpret_cast<const Descriptor *>(
-              from.Element<char>(fromAt) + comp.offset())};
-          // Allocatable components of the LHS are unconditionally
-          // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
-          // unlike a "top-level" assignment to a variable, where
-          // deallocation is optional.
-          //
-          // Be careful not to destroy/reallocate the LHS, if there is
-          // overlap between LHS and RHS (it seems that partial overlap
-          // is not possible, though).
-          // Invoke Assign() recursively to deal with potential aliasing.
-          if (toDesc->IsAllocatable()) {
-            if (!fromDesc->IsAllocated()) {
-              // No aliasing.
-              //
-              // If to is not allocated, the Destroy() call is a no-op.
-              // This is just a shortcut, because the recursive Assign()
-              // below would initiate the destruction for to.
-              // No finalization is required.
-              toDesc->Destroy(
-                  /*finalize=*/false, /*destroyPointers=*/false, &terminator);
-              continue; // F'2018 10.2.1.3(13)(2)
+      if (HasDynamicComponent(*from_)) {
+        // If 'from' has allocatable/automatic component, we cannot
+        // just make a shallow copy of the descriptor member.
+        // This will still leave data overlap in 'to' and 'newFrom'.
+        // For example:
+        //   type t
+        //     character, allocatable :: c(:)
+        //   end type t
+        //   type(t) :: x(3)
+        //   x(2:3) = x(1:2)
+        // We have to make a deep copy into 'newFrom' in this case.
+        if (const DescriptorAddendum *addendum{newFrom.Addendum()}) {
+          if (const auto *derived{addendum->derivedType()}) {
+            if (!derived->noInitializationNeeded()) {
+              if (int status{workQueue.BeginInitialize(newFrom, *derived)};
+                  status != StatOk && status != StatContinue) {
+                return status;
+              }
             }
           }
-          // Force LHS deallocation with DeallocateLHS flag.
-          // The actual deallocation may be avoided, if the existing
-          // location can be reoccupied.
-          Assign(*toDesc, *fromDesc, terminator, nestedFlags | DeallocateLHS);
-        } break;
         }
+        static constexpr int nestedFlags{MaybeReallocate | PolymorphicLHS};
+        if (int status{workQueue.BeginAssign(
+                newFrom, *from_, nestedFlags, memmoveFct_, nullptr)};
+            status != StatOk && status != StatContinue) {
+          return status;
+        }
+      } else {
+        ShallowCopy(newFrom, *from_, true, from_->IsContiguous());
       }
-      // Copy procedure pointer components
-      const Descriptor &procPtrDesc{updatedToDerived->procPtr()};
-      std::size_t numProcPtrs{procPtrDesc.Elements()};
-      for (std::size_t k{0}; k < numProcPtrs; ++k) {
-        const auto &procPtr{
-            *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(
-                k)};
-        memmoveFct(to.Element<char>(toAt) + procPtr.offset,
-            from.Element<const char>(fromAt) + procPtr.offset,
-            sizeof(typeInfo::ProcedurePointer));
+      from_ = &newFrom; // this is why from_ has to be a pointer
+      flags_ &= NeedFinalization | ComponentCanBeDefinedAssignment |
+          ExplicitLengthCharacterLHS | CanBeDefinedAssignment;
+      toDeallocate_ = &newFrom;
+    }
+  }
+  if (to_.IsAllocatable()) {
+    if (mustDeallocateLHS) {
+      if (!toDeallocate_ && to_.IsAllocated()) {
+        toDeallocate_ = &to_;
+      }
+    } else if (to_.rank() != from_->rank() && !to_.IsAllocated()) {
+      workQueue.terminator().Crash("Assign: mismatched ranks (%d != %d) in "
+                                   "assignment to unallocated allocatable",
+          to_.rank(), from_->rank());
+    }
+  } else if (!to_.IsAllocated()) {
+    workQueue.terminator().Crash(
+        "Assign: left-hand side variable is neither allocated nor allocatable");
+  }
+  if (toDerived_ && to_.IsAllocated()) {
+    // Schedule finalization or destruction of the LHS.
+    if (flags_ & NeedFinalization) {
+      if (int status{workQueue.BeginFinalize(to_, *toDerived_)};
+          status != StatOk && status != StatContinue) {
+        return status;
+      }
+    } else if (!toDerived_->noDestructionNeeded()) {
+      if (int status{
+              workQueue.BeginDestroy(to_, *toDerived_, /*finalize=*/false)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
     }
-  } else { // intrinsic type, intrinsic assignment
-    if (isSimpleMemmove()) {
-      memmoveFct(to.raw().base_addr, from.raw().base_addr,
-          toElements * toElementBytes);
-    } else if (toElementBytes > fromElementBytes) { // blank padding
-      switch (to.type().raw()) {
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) {
+  if (done_) {
+    // All child tickets are complete; can release this ticket's state.
+    if (toDeallocate_) {
+      toDeallocate_->Deallocate();
+    }
+    return StatOk;
+  }
+  // All necessary finalization or destruction that was initiated by Begin()
+  // has been completed.  Deallocation may be pending, and if it's for the LHS,
+  // do it now so that the LHS gets reallocated.
+  if (toDeallocate_ == &to_) {
+    toDeallocate_ = nullptr;
+    to_.Deallocate();
+  }
+  // Allocate the LHS if needed
+  if (!to_.IsAllocated()) {
+    if (int stat{
+            AllocateAssignmentLHS(to_, *from_, workQueue.terminator(), flags_)};
+        stat != StatOk) {
+      return stat;
+    }
+    const auto *addendum{to_.Addendum()};
+    toDerived_ = addendum ? addendum->derivedType() : nullptr;
+    if (toDerived_) {
+      if (!toDerived_->noInitializationNeeded()) {
+        if (int status{workQueue.BeginInitialize(to_, *toDerived_)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    }
+  }
+  // Check for a user-defined assignment type-bound procedure;
+  // see 10.2.1.4-5.
+  // Note that the aliasing and LHS (re)allocation handling above
+  // needs to run even with CanBeDefinedAssignment flag, since
+  // Assign() can be invoked recursively for component-wise assignments.
+  // The declared type (if known) must be used for generic resolution
+  // of ASSIGNMENT(=) to a binding, but that binding can be overridden.
+  if (declaredType_ && (flags_ & CanBeDefinedAssignment)) {
+    if (to_.rank() == 0) {
+      if (const auto *special{declaredType_->FindSpecialBinding(
+              typeInfo::SpecialBinding::Which::ScalarAssignment)}) {
+        DoScalarDefinedAssignment(to_, *from_, *toDerived_, *special);
+        done_ = true;
+        return StatContinue;
+      }
+    }
+    if (const auto *special{declaredType_->FindSpecialBinding(
+            typeInfo::SpecialBinding::Which::ElementalAssignment)}) {
+      DoElementalDefinedAssignment(to_, *from_, *toDerived_, *special);
+      done_ = true;
+      return StatContinue;
+    }
+  }
+  // Intrinsic assignment
+  std::size_t toElements{to_.Elements()};
+  if (from_->rank() > 0 && toElements != from_->Elements()) {
+    workQueue.terminator().Crash("Assign: mismatching element counts in array "
+                                 "assignment (to %zd, from %zd)",
+        toElements, from_->Elements());
+  }
+  if (to_.type() != from_->type()) {
+    workQueue.terminator().Crash(
+        "Assign: mismatching types (to code %d != from code %d)",
+        to_.type().raw(), from_->type().raw());
+  }
+  std::size_t toElementBytes{to_.ElementBytes()};
+  std::size_t fromElementBytes{from_->ElementBytes()};
+  if (toElementBytes > fromElementBytes && !to_.type().IsCharacter()) {
+    workQueue.terminator().Crash("Assign: mismatching non-character element "
+                                 "sizes (to %zd bytes != from %zd bytes)",
+        toElementBytes, fromElementBytes);
+  }
+  if (toDerived_) {
+    if (toDerived_->noDefinedAssignment()) { // componentwise
+      if (int status{workQueue.BeginDerivedAssign<true>(
+              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
+          status != StatOk && status != StatContinue) {
+        return status;
+      }
+    } else { // elementwise
+      if (int status{workQueue.BeginDerivedAssign<false>(
+              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
+          status != StatOk && status != StatContinue) {
+        return status;
+      }
+    }
+    toDeallocate_ = nullptr;
+  } else if (IsSimpleMemmove()) {
+    memmoveFct_(to_.raw().base_addr, from_->raw().base_addr,
+        toElements * toElementBytes);
+  } else {
+    // Scalar expansion of the RHS is implied by using the same empty
+    // subscript values on each (seemingly) elemental reference into
+    // "from".
+    SubscriptValue toAt[maxRank];
+    to_.GetLowerBounds(toAt);
+    SubscriptValue fromAt[maxRank];
+    from_->GetLowerBounds(fromAt);
+    if (toElementBytes > fromElementBytes) { // blank padding
+      switch (to_.type().raw()) {
       case CFI_type_signed_char:
       case CFI_type_char:
-        BlankPadCharacterAssignment<char>(to, from, toAt, fromAt, toElements,
+        BlankPadCharacterAssignment<char>(to_, *from_, toAt, fromAt, toElements,
             toElementBytes, fromElementBytes);
         break;
       case CFI_type_char16_t:
-        BlankPadCharacterAssignment<char16_t>(to, from, toAt, fromAt,
+        BlankPadCharacterAssignment<char16_t>(to_, *from_, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       case CFI_type_char32_t:
-        BlankPadCharacterAssignment<char32_t>(to, from, toAt, fromAt,
+        BlankPadCharacterAssignment<char32_t>(to_, *from_, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       default:
-        terminator.Crash("unexpected type code %d in blank padded Assign()",
-            to.type().raw());
+        workQueue.terminator().Crash(
+            "unexpected type code %d in blank padded Assign()",
+            to_.type().raw());
       }
     } else { // elemental copies, possibly with character truncation
       for (std::size_t n{toElements}; n-- > 0;
-          to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-        memmoveFct(to.Element<char>(toAt), from.Element<const char>(fromAt),
+          to_.IncrementSubscripts(toAt), from_->IncrementSubscripts(fromAt)) {
+        memmoveFct_(to_.Element<char>(toAt), from_->Element<const char>(fromAt),
             toElementBytes);
       }
     }
   }
-  if (deferDeallocation) {
-    // deferDeallocation is used only when LHS is an allocatable.
-    // The finalization has already been run for it.
-    deferDeallocation->Destroy(
-        /*finalize=*/false, /*destroyPointers=*/false, &terminator);
+  if (persist_) {
+    done_ = true;
+    return StatContinue;
+  } else {
+    if (toDeallocate_) {
+      toDeallocate_->Deallocate();
+      toDeallocate_ = nullptr;
+    }
+    return StatOk;
   }
 }
 
-RT_OFFLOAD_API_GROUP_BEGIN
+template <bool IS_COMPONENTWISE>
+RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Begin(
+    WorkQueue &workQueue) {
+  if (toIsContiguous_ && fromIsContiguous_ &&
+      this->derived_.noDestructionNeeded() &&
+      this->derived_.noDefinedAssignment() &&
+      this->instance_.rank() == this->from_->rank()) {
+    if (std::size_t elementBytes{this->instance_.ElementBytes()};
+        elementBytes == this->from_->ElementBytes()) {
+      // Fastest path.  Both LHS and RHS are contiguous, RHS is not a scalar
+      // to be expanded, the types have the same size, and there are no
+      // allocatable components or defined ASSIGNMENT(=) at any level.
+      memmoveFct_(this->instance_.template OffsetElement<char>(),
+          this->from_->template OffsetElement<const char *>(),
+          this->instance_.Elements() * elementBytes);
+      return StatOk;
+    }
+  }
+  // Use PolymorphicLHS for components so that the right things happen
+  // when the components are polymorphic; when they're not, they're both
+  // not, and their declared types will match.
+  int nestedFlags{MaybeReallocate | PolymorphicLHS};
+  if (flags_ & ComponentCanBeDefinedAssignment) {
+    nestedFlags |= CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
+  }
+  flags_ = nestedFlags;
+  // Copy procedure pointer components
+  const Descriptor &procPtrDesc{this->derived_.procPtr()};
+  bool noDataComponents{this->IsComplete()};
+  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
+    for (std::size_t k{0}; k < numProcPtrs; ++k) {
+      const auto &procPtr{
+          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
+      // Loop only over elements
+      if (k > 0) {
+        Elementwise::Reset();
+      }
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        memmoveFct_(this->instance_.template ElementComponent<char>(
+                        this->subscripts_, procPtr.offset),
+            this->from_->template ElementComponent<const char>(
+                this->fromSubscripts_, procPtr.offset),
+            sizeof(typeInfo::ProcedurePointer));
+      }
+    }
+    if (noDataComponents) {
+      return StatOk;
+    }
+    Elementwise::Reset();
+  }
+  if (noDataComponents) {
+    return StatOk;
+  }
+  return StatContinue;
+}
+template RT_API_ATTRS int DerivedAssignTicket<false>::Begin(WorkQueue &);
+template RT_API_ATTRS int DerivedAssignTicket<true>::Begin(WorkQueue &);
+
+template <bool IS_COMPONENTWISE>
+RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
+    WorkQueue &workQueue) {
+  while (!this->IsComplete()) {
+    // Copy the data components (incl. the parent) first.
+    switch (this->component_->genre()) {
+    case typeInfo::Component::Genre::Data:
+      if (this->component_->category() == TypeCategory::Derived) {
+        Descriptor &toCompDesc{this->componentDescriptor_.descriptor()};
+        Descriptor &fromCompDesc{this->fromComponentDescriptor_.descriptor()};
+        this->component_->CreatePointerDescriptor(toCompDesc, this->instance_,
+            workQueue.terminator(), this->subscripts_);
+        this->component_->CreatePointerDescriptor(fromCompDesc, *this->from_,
+            workQueue.terminator(), this->fromSubscripts_);
+        const auto *componentDerived{this->component_->derivedType()};
+        this->Advance();
+        if (int status{workQueue.BeginAssign(toCompDesc, fromCompDesc, flags_,
+                memmoveFct_, componentDerived)};
+            status != StatOk) {
+          return status;
+        }
+      } else { // Component has intrinsic type; simply copy raw bytes
+        std::size_t componentByteSize{
+            this->component_->SizeInBytes(this->instance_)};
+        if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
+          std::size_t offset{this->component_->offset()};
+          char *to{this->instance_.template OffsetElement<char>(offset)};
+          const char *from{
+              this->from_->template OffsetElement<const char>(offset)};
+          std::size_t toElementStride{this->instance_.ElementBytes()};
+          std::size_t fromElementStride{
+              this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
+          if (toElementStride == fromElementStride &&
+              toElementStride == componentByteSize) {
+            memmoveFct_(to, from, this->elements_ * componentByteSize);
+          } else {
+            for (std::size_t n{this->elements_}; n--;
+                to += toElementStride, from += fromElementStride) {
+              memmoveFct_(to, from, componentByteSize);
+            }
+          }
+          this->Componentwise::Advance();
+        } else {
+          memmoveFct_(
+              this->instance_.template Element<char>(this->subscripts_) +
+                  this->component_->offset(),
+              this->from_->template Element<const char>(this->fromSubscripts_) +
+                  this->component_->offset(),
+              componentByteSize);
+          this->Advance();
+        }
+      }
+      break;
+    case typeInfo::Component::Genre::Pointer: {
+      std::size_t componentByteSize{
+          this->component_->SizeInBytes(this->instance_)};
+      if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
+        std::size_t offset{this->component_->offset()};
+        char *to{this->instance_.template OffsetElement<char>(offset)};
+        const char *from{
+            this->from_->template OffsetElement<const char>(offset)};
+        std::size_t toElementStride{this->instance_.ElementBytes()};
+        std::size_t fromElementStride{
+            this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
+        if (toElementStride == fromElementStride &&
+            toElementStride == componentByteSize) {
+          memmoveFct_(to, from, this->elements_ * componentByteSize);
+        } else {
+          for (std::size_t n{this->elements_}; n--;
+              to += toElementStride, from += fromElementStride) {
+            memmoveFct_(to, from, componentByteSize);
+          }
+        }
+        this->Componentwise::Advance();
+      } else {
+        memmoveFct_(this->instance_.template Element<char>(this->subscripts_) +
+                this->component_->offset(),
+            this->from_->template Element<const char>(this->fromSubscripts_) +
+                this->component_->offset(),
+            componentByteSize);
+        this->Advance();
+      }
+    } break;
+    case typeInfo::Component::Genre::Allocatable:
+    case typeInfo::Component::Genre::Automatic: {
+      auto *toDesc{reinterpret_cast<Descriptor *>(
+          this->instance_.template Element<char>(this->subscripts_) +
+          this->component_->offset())};
+      const auto *fromDesc{reinterpret_cast<const Descriptor *>(
+          this->from_->template Element<char>(this->fromSubscripts_) +
+          this->component_->offset())};
+      const auto *componentDerived{this->component_->derivedType()};
+      if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) {
+        if (toDesc->IsAllocated()) {
+          if (this->phase_ == 0) {
+            this->phase_++;
+            if (componentDerived && !componentDerived->noDestructionNeeded()) {
+              if (int status{workQueue.BeginDestroy(
+                      *toDesc, *componentDerived, /*finalize=*/false)};
+                  status != StatOk) {
+                return status;
+              }
+            }
+          }
+          toDesc->Deallocate();
+        }
+        this->Advance();
+      } else {
+        // Allocatable components of the LHS are unconditionally
+        // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
+        // unlike a "top-level" assignment to a variable, where
+        // deallocation is optional.
+        int nestedFlags{flags_};
+        if (!componentDerived ||
+            (componentDerived->noFinalizationNeeded() &&
+                componentDerived->noInitializationNeeded() &&
+                componentDerived->noDestructionNeeded())) {
+          // The actual deallocation might be avoidable when the existing
+          // location can be reoccupied.
+          nestedFlags |= MaybeReallocate | UpdateLHSBounds;
+        } else {
+          // Force LHS deallocation with DeallocateLHS flag.
+          nestedFlags |= DeallocateLHS;
+        }
+        this->Advance();
+        if (int status{workQueue.BeginAssign(*toDesc, *fromDesc, nestedFlags,
+                memmoveFct_, componentDerived)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    } break;
+    }
+  }
+  if (deallocateAfter_) {
+    deallocateAfter_->Deallocate();
+  }
+  return StatOk;
+}
+template RT_API_ATTRS int DerivedAssignTicket<false>::Continue(WorkQueue &);
+template RT_API_ATTRS int DerivedAssignTicket<true>::Continue(WorkQueue &);
 
 RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc,
     const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) {
@@ -582,7 +783,6 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from,
       }
     }
   }
-
   Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS);
 }
 
@@ -599,7 +799,6 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var,
 void RTDEF(CopyOutAssign)(
     Descriptor *var, Descriptor &temp, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
-
   // Copyout from the temporary must not cause any finalizations
   // for LHS. The variable must be properly initialized already.
   if (var) {
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index 35037036f63e..4e36b1e2edfc 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -12,6 +12,7 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -30,180 +31,192 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
 }
 
 RT_API_ATTRS int Initialize(const Descriptor &instance,
-    const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat,
-    const Descriptor *errMsg) {
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t elements{instance.Elements()};
-  int stat{StatOk};
-  // Initialize data components in each element; the per-element iterations
-  // constitute the inner loops, not the outer ones
-  std::size_t myComponents{componentDesc.Elements()};
-  for (std::size_t k{0}; k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    SubscriptValue at[maxRank];
-    instance.GetLowerBounds(at);
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        Descriptor &allocDesc{
-            *instance.ElementComponent<Descriptor>(at, comp.offset())};
-        comp.EstablishDescriptor(allocDesc, instance, terminator);
-        allocDesc.raw().attribute = CFI_attribute_allocatable;
-        if (comp.genre() == typeInfo::Component::Genre::Automatic) {
-          stat = ReturnError(
-              terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
-          if (stat == StatOk) {
-            if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) {
-              if (const auto *derived{addendum->derivedType()}) {
-                if (!derived->noInitializationNeeded()) {
-                  stat = Initialize(
-                      allocDesc, *derived, terminator, hasStat, errMsg);
-                }
-              }
-            }
-          }
-          if (stat != StatOk) {
-            break;
-          }
-        }
+    const typeInfo::DerivedType &derived, Terminator &terminator, bool,
+    const Descriptor *) {
+  WorkQueue workQueue{terminator};
+  int status{workQueue.BeginInitialize(instance, derived)};
+  return status == StatContinue ? workQueue.Run() : status;
+}
+
+RT_API_ATTRS int InitializeTicket::Begin(WorkQueue &) {
+  // Initialize procedure pointer components in each element
+  const Descriptor &procPtrDesc{derived_.procPtr()};
+  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
+    for (std::size_t k{0}; k < numProcPtrs; ++k) {
+      const auto &comp{
+          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
+      // Loop only over elements
+      if (k > 0) {
+        Elementwise::Reset();
       }
-    } else if (const void *init{comp.initialization()}) {
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        auto &pptr{*instance_.ElementComponent<typeInfo::ProcedurePointer>(
+            subscripts_, comp.offset)};
+        pptr = comp.procInitialization;
+      }
+    }
+    if (IsComplete()) {
+      return StatOk;
+    }
+    Elementwise::Reset();
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
+      // Establish allocatable descriptors
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        Descriptor &allocDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        component_->EstablishDescriptor(
+            allocDesc, instance_, workQueue.terminator());
+        allocDesc.raw().attribute = CFI_attribute_allocatable;
+      }
+      SkipToNextComponent();
+    } else if (const void *init{component_->initialization()}) {
       // Explicit initialization of data pointers and
       // non-allocatable non-automatic components
-      std::size_t bytes{comp.SizeInBytes(instance)};
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        char *ptr{instance.ElementComponent<char>(at, comp.offset())};
+      std::size_t bytes{component_->SizeInBytes(instance_)};
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        char *ptr{instance_.ElementComponent<char>(
+            subscripts_, component_->offset())};
         std::memcpy(ptr, init, bytes);
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Pointer) {
+      SkipToNextComponent();
+    } else if (component_->genre() == typeInfo::Component::Genre::Pointer) {
       // Data pointers without explicit initialization are established
       // so that they are valid right-hand side targets of pointer
       // assignment statements.
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        Descriptor &ptrDesc{
-            *instance.ElementComponent<Descriptor>(at, comp.offset())};
-        comp.EstablishDescriptor(ptrDesc, instance, terminator);
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        Descriptor &ptrDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        component_->EstablishDescriptor(
+            ptrDesc, instance_, workQueue.terminator());
         ptrDesc.raw().attribute = CFI_attribute_pointer;
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType() && !comp.derivedType()->noInitializationNeeded()) {
+      SkipToNextComponent();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
+        component_->derivedType() &&
+        !component_->derivedType()->noInitializationNeeded()) {
       // Default initialization of non-pointer non-allocatable/automatic
-      // data component.  Handles parent component's elements.  Recursive.
+      // data component.  Handles parent component's elements.
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, instance);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        compDesc.Establish(compType,
-            instance.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        stat = Initialize(compDesc, compType, terminator, hasStat, errMsg);
-        if (stat != StatOk) {
-          break;
-        }
+      GetComponentExtents(extents, *component_, instance_);
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      const typeInfo::DerivedType &compType{*component_->derivedType()};
+      compDesc.Establish(compType,
+          instance_.ElementComponent<char>(subscripts_, component_->offset()),
+          component_->rank(), extents);
+      Advance();
+      if (int status{workQueue.BeginInitialize(compDesc, compType)};
+          status != StatOk) {
+        return status;
       }
+    } else {
+      SkipToNextComponent();
     }
   }
-  // Initialize procedure pointer components in each element
-  const Descriptor &procPtrDesc{derived.procPtr()};
-  std::size_t myProcPtrs{procPtrDesc.Elements()};
-  for (std::size_t k{0}; k < myProcPtrs; ++k) {
-    const auto &comp{
-        *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
-    SubscriptValue at[maxRank];
-    instance.GetLowerBounds(at);
-    for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-      auto &pptr{*instance.ElementComponent<typeInfo::ProcedurePointer>(
-          at, comp.offset)};
-      pptr = comp.procInitialization;
-    }
-  }
-  return stat;
+  return StatOk;
 }
 
 RT_API_ATTRS int InitializeClone(const Descriptor &clone,
-    const Descriptor &orig, const typeInfo::DerivedType &derived,
+    const Descriptor &original, const typeInfo::DerivedType &derived,
     Terminator &terminator, bool hasStat, const Descriptor *errMsg) {
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t elements{orig.Elements()};
-  int stat{StatOk};
-
-  // Skip pointers and unallocated variables.
-  if (orig.IsPointer() || !orig.IsAllocated()) {
-    return stat;
+  if (original.IsPointer() || !original.IsAllocated()) {
+    return StatOk; // nothing to do
+  } else {
+    WorkQueue workQueue{terminator};
+    int status{workQueue.BeginInitializeClone(
+        clone, original, derived, hasStat, errMsg)};
+    return status == StatContinue ? workQueue.Run() : status;
   }
-  // Initialize each data component.
-  std::size_t components{componentDesc.Elements()};
-  for (std::size_t i{0}; i < components; ++i) {
-    const typeInfo::Component &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(i)};
-    SubscriptValue at[maxRank];
-    orig.GetLowerBounds(at);
-    // Allocate allocatable components that are also allocated in the original
-    // object.
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable) {
-      // Initialize each element.
-      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
-        Descriptor &origDesc{
-            *orig.ElementComponent<Descriptor>(at, comp.offset())};
-        Descriptor &cloneDesc{
-            *clone.ElementComponent<Descriptor>(at, comp.offset())};
-        if (origDesc.IsAllocated()) {
+}
+
+RT_API_ATTRS int InitializeCloneTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
+      Descriptor &origDesc{*instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      if (origDesc.IsAllocated()) {
+        Descriptor &cloneDesc{*clone_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        if (phase_ == 0) {
+          ++phase_;
           cloneDesc.ApplyMold(origDesc, origDesc.rank());
-          stat = ReturnError(
-              terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
-          if (stat == StatOk) {
-            if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) {
-              if (const typeInfo::DerivedType *
-                  derived{addendum->derivedType()}) {
-                if (!derived->noInitializationNeeded()) {
-                  // Perform default initialization for the allocated element.
-                  stat = Initialize(
-                      cloneDesc, *derived, terminator, hasStat, errMsg);
-                }
-                // Initialize derived type's allocatables.
-                if (stat == StatOk) {
-                  stat = InitializeClone(cloneDesc, origDesc, *derived,
-                      terminator, hasStat, errMsg);
+          if (int stat{ReturnError(workQueue.terminator(),
+                  cloneDesc.Allocate(kNoAsyncObject), errMsg_, hasStat_)};
+              stat != StatOk) {
+            return stat;
+          }
+          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
+            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
+              if (!derived->noInitializationNeeded()) {
+                // Perform default initialization for the allocated element.
+                if (int status{workQueue.BeginInitialize(cloneDesc, *derived)};
+                    status != StatOk) {
+                  return status;
                 }
               }
             }
           }
         }
-        if (stat != StatOk) {
-          break;
+        if (phase_ == 1) {
+          ++phase_;
+          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
+            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
+              // Initialize derived type's allocatables.
+              if (int status{workQueue.BeginInitializeClone(
+                      cloneDesc, origDesc, *derived, hasStat_, errMsg_)};
+                  status != StatOk) {
+                return status;
+              }
+            }
+          }
         }
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType()) {
-      // Handle nested derived types.
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, orig);
-      // Data components don't have descriptors, allocate them.
-      StaticDescriptor<maxRank, true, 0> origStaticDesc;
-      StaticDescriptor<maxRank, true, 0> cloneStaticDesc;
-      Descriptor &origDesc{origStaticDesc.descriptor()};
-      Descriptor &cloneDesc{cloneStaticDesc.descriptor()};
-      // Initialize each element.
-      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
+      Advance();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
+      if (component_->derivedType()) {
+        // Handle nested derived types.
+        const typeInfo::DerivedType &compType{*component_->derivedType()};
+        SubscriptValue extents[maxRank];
+        GetComponentExtents(extents, *component_, instance_);
+        Descriptor &origDesc{componentDescriptor_.descriptor()};
+        Descriptor &cloneDesc{cloneComponentDescriptor_.descriptor()};
         origDesc.Establish(compType,
-            orig.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
+            instance_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
         cloneDesc.Establish(compType,
-            clone.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        stat = InitializeClone(
-            cloneDesc, origDesc, compType, terminator, hasStat, errMsg);
-        if (stat != StatOk) {
-          break;
+            clone_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
+        Advance();
+        if (int status{workQueue.BeginInitializeClone(
+                cloneDesc, origDesc, compType, hasStat_, errMsg_)};
+            status != StatOk) {
+          return status;
         }
+      } else {
+        SkipToNextComponent();
       }
+    } else {
+      SkipToNextComponent();
+    }
+  }
+  return StatOk;
+}
+
+// Fortran 2018 subclause 7.5.6.2
+RT_API_ATTRS void Finalize(const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived, Terminator *terminator) {
+  if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) {
+    Terminator stubTerminator{"Finalize() in Fortran runtime", 0};
+    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
+    if (workQueue.BeginFinalize(descriptor, derived) == StatContinue) {
+      workQueue.Run();
     }
   }
-  return stat;
 }
 
 static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
@@ -221,7 +234,7 @@ static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
 }
 
 static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator *terminator) {
+    const typeInfo::DerivedType &derived, Terminator &terminator) {
   if (const auto *special{FindFinal(derived, descriptor.rank())}) {
     if (special->which() == typeInfo::SpecialBinding::Which::ElementalFinal) {
       std::size_t elements{descriptor.Elements()};
@@ -258,9 +271,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
         copy = descriptor;
         copy.set_base_addr(nullptr);
         copy.raw().attribute = CFI_attribute_allocatable;
-        Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0};
-        RUNTIME_CHECK(terminator ? *terminator : stubTerminator,
-            copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
+        RUNTIME_CHECK(terminator, copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
         ShallowCopyDiscontiguousToContiguous(copy, descriptor);
         argDescriptor = &copy;
       }
@@ -284,87 +295,94 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
   }
 }
 
-// Fortran 2018 subclause 7.5.6.2
-RT_API_ATTRS void Finalize(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (derived.noFinalizationNeeded() || !descriptor.IsAllocated()) {
-    return;
-  }
-  CallFinalSubroutine(descriptor, derived, terminator);
-  const auto *parentType{derived.GetParentType()};
-  bool recurse{parentType && !parentType->noFinalizationNeeded()};
+RT_API_ATTRS int FinalizeTicket::Begin(WorkQueue &workQueue) {
+  CallFinalSubroutine(instance_, derived_, workQueue.terminator());
   // If there's a finalizable parent component, handle it last, as required
   // by the Fortran standard (7.5.6.2), and do so recursively with the same
   // descriptor so that the rank is preserved.
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t myComponents{componentDesc.Elements()};
-  std::size_t elements{descriptor.Elements()};
-  for (auto k{recurse ? std::size_t{1}
-                      /* skip first component, it's the parent */
-                      : 0};
-       k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    SubscriptValue at[maxRank];
-    descriptor.GetLowerBounds(at);
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable &&
-        comp.category() == TypeCategory::Derived) {
+  finalizableParentType_ = derived_.GetParentType();
+  if (finalizableParentType_) {
+    if (finalizableParentType_->noFinalizationNeeded()) {
+      finalizableParentType_ = nullptr;
+    } else {
+      SkipToNextComponent();
+    }
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable &&
+        component_->category() == TypeCategory::Derived) {
       // Component may be polymorphic or unlimited polymorphic. Need to use the
       // dynamic type to check whether finalization is needed.
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
-        const Descriptor &compDesc{
-            *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-        if (compDesc.IsAllocated()) {
-          if (const DescriptorAddendum * addendum{compDesc.Addendum()}) {
-            if (const typeInfo::DerivedType *
-                compDynamicType{addendum->derivedType()}) {
-              if (!compDynamicType->noFinalizationNeeded()) {
-                Finalize(compDesc, *compDynamicType, terminator);
+      const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      Advance();
+      if (compDesc.IsAllocated()) {
+        if (const DescriptorAddendum *addendum{compDesc.Addendum()}) {
+          if (const typeInfo::DerivedType *compDynamicType{
+                  addendum->derivedType()}) {
+            if (!compDynamicType->noFinalizationNeeded()) {
+              if (int status{
+                      workQueue.BeginFinalize(compDesc, *compDynamicType)};
+                  status != StatOk) {
+                return status;
               }
             }
           }
         }
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      if (const typeInfo::DerivedType * compType{comp.derivedType()}) {
-        if (!compType->noFinalizationNeeded()) {
-          for (std::size_t j{0}; j++ < elements;
-               descriptor.IncrementSubscripts(at)) {
-            const Descriptor &compDesc{
-                *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-            if (compDesc.IsAllocated()) {
-              Finalize(compDesc, *compType, terminator);
-            }
+    } else if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
+        component_->genre() == typeInfo::Component::Genre::Automatic) {
+      if (const typeInfo::DerivedType *compType{component_->derivedType()};
+          compType && !compType->noFinalizationNeeded()) {
+        const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        Advance();
+        if (compDesc.IsAllocated()) {
+          if (int status{workQueue.BeginFinalize(compDesc, *compType)};
+              status != StatOk) {
+            return status;
           }
         }
+      } else {
+        SkipToNextComponent();
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType() && !comp.derivedType()->noFinalizationNeeded()) {
+    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
+        component_->derivedType() &&
+        !component_->derivedType()->noFinalizationNeeded()) {
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, descriptor);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
-        compDesc.Establish(compType,
-            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        Finalize(compDesc, compType, terminator);
+      GetComponentExtents(extents, *component_, instance_);
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      const typeInfo::DerivedType &compType{*component_->derivedType()};
+      compDesc.Establish(compType,
+          instance_.ElementComponent<char>(subscripts_, component_->offset()),
+          component_->rank(), extents);
+      Advance();
+      if (int status{workQueue.BeginFinalize(compDesc, compType)};
+          status != StatOk) {
+        return status;
       }
+    } else {
+      SkipToNextComponent();
     }
   }
-  if (recurse) {
-    StaticDescriptor<maxRank, true, 8 /*?*/> statDesc;
-    Descriptor &tmpDesc{statDesc.descriptor()};
-    tmpDesc = descriptor;
+  // Last, do the parent component, if any and finalizable.
+  if (finalizableParentType_) {
+    Descriptor &tmpDesc{componentDescriptor_.descriptor()};
+    tmpDesc = instance_;
     tmpDesc.raw().attribute = CFI_attribute_pointer;
-    tmpDesc.Addendum()->set_derivedType(parentType);
-    tmpDesc.raw().elem_len = parentType->sizeInBytes();
-    Finalize(tmpDesc, *parentType, terminator);
+    tmpDesc.Addendum()->set_derivedType(finalizableParentType_);
+    tmpDesc.raw().elem_len = finalizableParentType_->sizeInBytes();
+    const auto &parentType{*finalizableParentType_};
+    finalizableParentType_ = nullptr;
+    // Don't return StatOk here if the nested FInalize is still running;
+    // it needs this->componentDescriptor_.
+    return workQueue.BeginFinalize(tmpDesc, parentType);
   }
+  return StatOk;
 }
 
 // The order of finalization follows Fortran 2018 7.5.6.2, with
@@ -373,53 +391,73 @@ RT_API_ATTRS void Finalize(const Descriptor &descriptor,
 // preceding any deallocation.
 RT_API_ATTRS void Destroy(const Descriptor &descriptor, bool finalize,
     const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (derived.noDestructionNeeded() || !descriptor.IsAllocated()) {
-    return;
-  }
-  if (finalize && !derived.noFinalizationNeeded()) {
-    Finalize(descriptor, derived, terminator);
-  }
-  // Deallocate all direct and indirect allocatable and automatic components.
-  // Contrary to finalization, the order of deallocation does not matter.
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t myComponents{componentDesc.Elements()};
-  std::size_t elements{descriptor.Elements()};
-  SubscriptValue at[maxRank];
-  descriptor.GetLowerBounds(at);
-  for (std::size_t k{0}; k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    const bool destroyComp{
-        comp.derivedType() && !comp.derivedType()->noDestructionNeeded()};
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      for (std::size_t j{0}; j < elements; ++j) {
-        Descriptor *d{
-            descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-        if (destroyComp) {
-          Destroy(*d, /*finalize=*/false, *comp.derivedType(), terminator);
-        }
-        d->Deallocate();
-        descriptor.IncrementSubscripts(at);
-      }
-    } else if (destroyComp &&
-        comp.genre() == typeInfo::Component::Genre::Data) {
-      SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, descriptor);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
-        compDesc.Establish(compType,
-            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        Destroy(compDesc, /*finalize=*/false, *comp.derivedType(), terminator);
-      }
+  if (descriptor.IsAllocated() && !derived.noDestructionNeeded()) {
+    Terminator stubTerminator{"Destroy() in Fortran runtime", 0};
+    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
+    if (workQueue.BeginDestroy(descriptor, derived, finalize) == StatContinue) {
+      workQueue.Run();
     }
   }
 }
 
+RT_API_ATTRS int DestroyTicket::Begin(WorkQueue &workQueue) {
+  if (finalize_ && !derived_.noFinalizationNeeded()) {
+    if (int status{workQueue.BeginFinalize(instance_, derived_)};
+        status != StatOk && status != StatContinue) {
+      return status;
+    }
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int DestroyTicket::Continue(WorkQueue &workQueue) {
+  // Deallocate all direct and indirect allocatable and automatic components.
+  // Contrary to finalization, the order of deallocation does not matter.
+  while (!IsComplete()) {
+    const auto *componentDerived{component_->derivedType()};
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
+        component_->genre() == typeInfo::Component::Genre::Automatic) {
+      Descriptor *d{instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      if (d->IsAllocated()) {
+        if (phase_ == 0) {
+          ++phase_;
+          if (componentDerived && !componentDerived->noDestructionNeeded()) {
+            if (int status{workQueue.BeginDestroy(
+                    *d, *componentDerived, /*finalize=*/false)};
+                status != StatOk) {
+              return status;
+            }
+          }
+        }
+        d->Deallocate();
+      }
+      Advance();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
+      if (!componentDerived || componentDerived->noDestructionNeeded()) {
+        SkipToNextComponent();
+      } else {
+        SubscriptValue extents[maxRank];
+        GetComponentExtents(extents, *component_, instance_);
+        Descriptor &compDesc{componentDescriptor_.descriptor()};
+        const typeInfo::DerivedType &compType{*componentDerived};
+        compDesc.Establish(compType,
+            instance_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
+        Advance();
+        if (int status{workQueue.BeginDestroy(
+                compDesc, *componentDerived, /*finalize=*/false)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    } else {
+      SkipToNextComponent();
+    }
+  }
+  return StatOk;
+}
+
 RT_API_ATTRS bool HasDynamicComponent(const Descriptor &descriptor) {
   if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
     if (const auto *derived = addendum->derivedType()) {
diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp
index 3db1455af52f..e7b99e6fc3a2 100644
--- a/flang-rt/lib/runtime/descriptor-io.cpp
+++ b/flang-rt/lib/runtime/descriptor-io.cpp
@@ -7,15 +7,44 @@
 //===----------------------------------------------------------------------===//
 
 #include "descriptor-io.h"
+#include "edit-input.h"
+#include "edit-output.h"
+#include "unit.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/namelist.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
+#include "flang/Common/optional.h"
 #include "flang/Common/restorer.h"
+#include "flang/Common/uint128.h"
+#include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/freestanding-tools.h"
 
+// Implementation of I/O data list item transfers based on descriptors.
+// (All I/O items come through here so that the code is exercised for test;
+// some scalar I/O data transfer APIs could be changed to bypass their use
+// of descriptors in the future for better efficiency.)
+
 namespace Fortran::runtime::io::descr {
 RT_OFFLOAD_API_GROUP_BEGIN
 
+template <typename A>
+inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
+    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
+  A *p{descriptor.Element<A>(subscripts)};
+  if (!p) {
+    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
+                                 "address or subscripts out of range");
+  }
+  return *p;
+}
+
 // Defined formatted I/O (maybe)
-Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+static RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
+    IoStatementState &io, const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special,
     const SubscriptValue subscripts[]) {
   Fortran::common::optional<DataEdit> peek{
@@ -65,10 +94,13 @@ Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
       // I/O subroutine reads counts towards READ(SIZE=).
       startPos = io.InquirePos();
     }
+    const auto *bindings{
+        derived.binding().OffsetElement<const typeInfo::Binding>()};
     if (special.IsArgDescriptor(0)) {
       // "dtv" argument is "class(t)", pass a descriptor
       auto *p{special.GetProc<void (*)(const Descriptor &, int &, char *,
-          const Descriptor &, int &, char *, std::size_t, std::size_t)>()};
+          const Descriptor &, int &, char *, std::size_t, std::size_t)>(
+          bindings)};
       StaticDescriptor<1, true, 10 /*?*/> elementStatDesc;
       Descriptor &elementDesc{elementStatDesc.descriptor()};
       elementDesc.Establish(
@@ -79,7 +111,8 @@ Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
     } else {
       // "dtv" argument is "type(t)", pass a raw pointer
       auto *p{special.GetProc<void (*)(const void *, int &, char *,
-          const Descriptor &, int &, char *, std::size_t, std::size_t)>()};
+          const Descriptor &, int &, char *, std::size_t, std::size_t)>(
+          bindings)};
       p(descriptor.Element<char>(subscripts), unit, ioType, vListDesc, ioStat,
           ioMsg, ioTypeLen, sizeof ioMsg);
     }
@@ -104,8 +137,8 @@ Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
 }
 
 // Defined unformatted I/O
-bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived,
+static RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io,
+    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special) {
   // Unformatted I/O must have an external unit (or child thereof).
   IoErrorHandler &handler{io.GetIoErrorHandler()};
@@ -121,10 +154,12 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
   std::size_t numElements{descriptor.Elements()};
   SubscriptValue subscripts[maxRank];
   descriptor.GetLowerBounds(subscripts);
+  const auto *bindings{
+      derived.binding().OffsetElement<const typeInfo::Binding>()};
   if (special.IsArgDescriptor(0)) {
     // "dtv" argument is "class(t)", pass a descriptor
     auto *p{special.GetProc<void (*)(
-        const Descriptor &, int &, int &, char *, std::size_t)>()};
+        const Descriptor &, int &, int &, char *, std::size_t)>(bindings)};
     StaticDescriptor<1, true, 10 /*?*/> elementStatDesc;
     Descriptor &elementDesc{elementStatDesc.descriptor()};
     elementDesc.Establish(derived, nullptr, 0, nullptr, CFI_attribute_pointer);
@@ -137,8 +172,9 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
     }
   } else {
     // "dtv" argument is "type(t)", pass a raw pointer
-    auto *p{special.GetProc<void (*)(
-        const void *, int &, int &, char *, std::size_t)>()};
+    auto *p{special
+            .GetProc<void (*)(const void *, int &, int &, char *, std::size_t)>(
+                bindings)};
     for (; numElements-- > 0; descriptor.IncrementSubscripts(subscripts)) {
       p(descriptor.Element<char>(subscripts), unit, ioStat, ioMsg,
           sizeof ioMsg);
@@ -152,5 +188,619 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
   return handler.GetIoStat() == IostatOk;
 }
 
+// Per-category descriptor-based I/O templates
+
+// TODO (perhaps as a nontrivial but small starter project): implement
+// automatic repetition counts, like "10*3.14159", for list-directed and
+// NAMELIST array output.
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
+    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditIntegerInput(
+                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedIntegerIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedRealIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedRealIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedComplexIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  bool isListOutput{
+      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
+    if (isListOutput) {
+      DataEdit rEdit, iEdit;
+      rEdit.descriptor = DataEdit::ListDirectedRealPart;
+      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
+      rEdit.modes = iEdit.modes = io.mutableModes();
+      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
+          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
+        return false;
+      }
+    } else {
+      for (int k{0}; k < 2; ++k, ++x) {
+        auto edit{io.GetNextDataEdit()};
+        if (!edit) {
+          return false;
+        } else if constexpr (DIR == Direction::Output) {
+          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
+            return false;
+          }
+        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
+          break;
+        } else if (EditRealInput<KIND>(
+                       io, *edit, reinterpret_cast<void *>(x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedComplexIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <typename A, Direction DIR>
+inline RT_API_ATTRS bool FormattedCharacterIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditCharacterOutput(io, *edit, x, length)) {
+          return false;
+        }
+      } else { // input
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          if (EditCharacterInput(io, *edit, x, length)) {
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedCharacterIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedLogicalIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditLogicalOutput(io, *edit, x != 0)) {
+          return false;
+        }
+      } else {
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          bool truth{};
+          if (EditLogicalInput(io, *edit, truth)) {
+            x = truth;
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedLogicalIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <Direction DIR>
+RT_API_ATTRS int DerivedIoTicket<DIR>::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Data) {
+      // Create a descriptor for the component
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      component_->CreatePointerDescriptor(
+          compDesc, instance_, io_.GetIoErrorHandler(), subscripts_);
+      Advance();
+      if (int status{workQueue.BeginDescriptorIo<DIR>(
+              io_, compDesc, table_, anyIoTookPlace_)};
+          status != StatOk) {
+        return status;
+      }
+    } else {
+      // Component is itself a descriptor
+      char *pointer{
+          instance_.Element<char>(subscripts_) + component_->offset()};
+      const Descriptor &compDesc{
+          *reinterpret_cast<const Descriptor *>(pointer)};
+      Advance();
+      if (compDesc.IsAllocated()) {
+        if (int status{workQueue.BeginDescriptorIo<DIR>(
+                io_, compDesc, table_, anyIoTookPlace_)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    }
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DerivedIoTicket<Direction::Output>::Continue(
+    WorkQueue &);
+template RT_API_ATTRS int DerivedIoTicket<Direction::Input>::Continue(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS int DescriptorIoTicket<DIR>::Begin(WorkQueue &workQueue) {
+  IoErrorHandler &handler{io_.GetIoErrorHandler()};
+  if (handler.InError()) {
+    return handler.GetIoStat();
+  }
+  if (!io_.get_if<IoDirectionState<DIR>>()) {
+    handler.Crash("DescriptorIO() called for wrong I/O direction");
+    return handler.GetIoStat();
+  }
+  if constexpr (DIR == Direction::Input) {
+    if (!io_.BeginReadingRecord()) {
+      return StatOk;
+    }
+  }
+  if (!io_.get_if<FormattedIoStatementState<DIR>>()) {
+    // Unformatted I/O
+    IoErrorHandler &handler{io_.GetIoErrorHandler()};
+    const DescriptorAddendum *addendum{instance_.Addendum()};
+    if (const typeInfo::DerivedType *type{
+            addendum ? addendum->derivedType() : nullptr}) {
+      // derived type unformatted I/O
+      if (table_) {
+        if (const auto *definedIo{table_->Find(*type,
+                DIR == Direction::Input
+                    ? common::DefinedIo::ReadUnformatted
+                    : common::DefinedIo::WriteUnformatted)}) {
+          if (definedIo->subroutine) {
+            typeInfo::SpecialBinding special{DIR == Direction::Input
+                    ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                    : typeInfo::SpecialBinding::Which::WriteUnformatted,
+                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+                false};
+            if (DefinedUnformattedIo(io_, instance_, *type, special)) {
+              anyIoTookPlace_ = true;
+              return StatOk;
+            }
+          } else {
+            int status{workQueue.BeginDerivedIo<DIR>(
+                io_, instance_, *type, table_, anyIoTookPlace_)};
+            return status == StatContinue ? StatOk : status; // done here
+          }
+        }
+      }
+      if (const typeInfo::SpecialBinding *special{
+              type->FindSpecialBinding(DIR == Direction::Input
+                      ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                      : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
+        if (!table_ || !table_->ignoreNonTbpEntries || special->IsTypeBound()) {
+          // defined derived type unformatted I/O
+          if (DefinedUnformattedIo(io_, instance_, *type, *special)) {
+            anyIoTookPlace_ = true;
+            return StatOk;
+          } else {
+            return IostatEnd;
+          }
+        }
+      }
+      // Default derived type unformatted I/O
+      // TODO: If no component at any level has defined READ or WRITE
+      // (as appropriate), the elements are contiguous, and no byte swapping
+      // is active, do a block transfer via the code below.
+      int status{workQueue.BeginDerivedIo<DIR>(
+          io_, instance_, *type, table_, anyIoTookPlace_)};
+      return status == StatContinue ? StatOk : status; // done here
+    } else {
+      // intrinsic type unformatted I/O
+      auto *externalUnf{io_.get_if<ExternalUnformattedIoStatementState<DIR>>()};
+      ChildUnformattedIoStatementState<DIR> *childUnf{nullptr};
+      InquireIOLengthState *inq{nullptr};
+      bool swapEndianness{false};
+      if (externalUnf) {
+        swapEndianness = externalUnf->unit().swapEndianness();
+      } else {
+        childUnf = io_.get_if<ChildUnformattedIoStatementState<DIR>>();
+        if (!childUnf) {
+          inq = DIR == Direction::Output ? io_.get_if<InquireIOLengthState>()
+                                         : nullptr;
+          RUNTIME_CHECK(handler, inq != nullptr);
+        }
+      }
+      std::size_t elementBytes{instance_.ElementBytes()};
+      std::size_t swappingBytes{elementBytes};
+      if (auto maybeCatAndKind{instance_.type().GetCategoryAndKind()}) {
+        // Byte swapping units can be smaller than elements, namely
+        // for COMPLEX and CHARACTER.
+        if (maybeCatAndKind->first == TypeCategory::Character) {
+          // swap each character position independently
+          swappingBytes = maybeCatAndKind->second; // kind
+        } else if (maybeCatAndKind->first == TypeCategory::Complex) {
+          // swap real and imaginary components independently
+          swappingBytes /= 2;
+        }
+      }
+      using CharType =
+          std::conditional_t<DIR == Direction::Output, const char, char>;
+      auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
+        if constexpr (DIR == Direction::Output) {
+          return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
+              : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
+                             : inq->Emit(&x, totalBytes, swappingBytes);
+        } else {
+          return externalUnf
+              ? externalUnf->Receive(&x, totalBytes, swappingBytes)
+              : childUnf->Receive(&x, totalBytes, swappingBytes);
+        }
+      }};
+      if (!swapEndianness &&
+          instance_.IsContiguous()) { // contiguous unformatted I/O
+        char &x{ExtractElement<char>(io_, instance_, subscripts_)};
+        if (Transfer(x, elements_ * elementBytes)) {
+          anyIoTookPlace_ = true;
+        } else {
+          return IostatEnd;
+        }
+      } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
+        for (; !IsComplete(); Advance()) {
+          char &x{ExtractElement<char>(io_, instance_, subscripts_)};
+          if (Transfer(x, elementBytes)) {
+            anyIoTookPlace_ = true;
+          } else {
+            return IostatEnd;
+          }
+        }
+      }
+    }
+    // Unformatted I/O never needs to call Continue().
+    return StatOk;
+  }
+  // Formatted I/O
+  if (auto catAndKind{instance_.type().GetCategoryAndKind()}) {
+    TypeCategory cat{catAndKind->first};
+    int kind{catAndKind->second};
+    bool any{false};
+    switch (cat) {
+    case TypeCategory::Integer:
+      switch (kind) {
+      case 1:
+        any = FormattedIntegerIO<1, DIR>(io_, instance_, true);
+        break;
+      case 2:
+        any = FormattedIntegerIO<2, DIR>(io_, instance_, true);
+        break;
+      case 4:
+        any = FormattedIntegerIO<4, DIR>(io_, instance_, true);
+        break;
+      case 8:
+        any = FormattedIntegerIO<8, DIR>(io_, instance_, true);
+        break;
+      case 16:
+        any = FormattedIntegerIO<16, DIR>(io_, instance_, true);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Unsigned:
+      switch (kind) {
+      case 1:
+        any = FormattedIntegerIO<1, DIR>(io_, instance_, false);
+        break;
+      case 2:
+        any = FormattedIntegerIO<2, DIR>(io_, instance_, false);
+        break;
+      case 4:
+        any = FormattedIntegerIO<4, DIR>(io_, instance_, false);
+        break;
+      case 8:
+        any = FormattedIntegerIO<8, DIR>(io_, instance_, false);
+        break;
+      case 16:
+        any = FormattedIntegerIO<16, DIR>(io_, instance_, false);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Real:
+      switch (kind) {
+      case 2:
+        any = FormattedRealIO<2, DIR>(io_, instance_);
+        break;
+      case 3:
+        any = FormattedRealIO<3, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedRealIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedRealIO<8, DIR>(io_, instance_);
+        break;
+      case 10:
+        any = FormattedRealIO<10, DIR>(io_, instance_);
+        break;
+      // TODO: case double/double
+      case 16:
+        any = FormattedRealIO<16, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Complex:
+      switch (kind) {
+      case 2:
+        any = FormattedComplexIO<2, DIR>(io_, instance_);
+        break;
+      case 3:
+        any = FormattedComplexIO<3, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedComplexIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedComplexIO<8, DIR>(io_, instance_);
+        break;
+      case 10:
+        any = FormattedComplexIO<10, DIR>(io_, instance_);
+        break;
+      // TODO: case double/double
+      case 16:
+        any = FormattedComplexIO<16, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Character:
+      switch (kind) {
+      case 1:
+        any = FormattedCharacterIO<char, DIR>(io_, instance_);
+        break;
+      case 2:
+        any = FormattedCharacterIO<char16_t, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedCharacterIO<char32_t, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Logical:
+      switch (kind) {
+      case 1:
+        any = FormattedLogicalIO<1, DIR>(io_, instance_);
+        break;
+      case 2:
+        any = FormattedLogicalIO<2, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedLogicalIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedLogicalIO<8, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Derived: {
+      // Derived type information must be present for formatted I/O.
+      IoErrorHandler &handler{io_.GetIoErrorHandler()};
+      const DescriptorAddendum *addendum{instance_.Addendum()};
+      RUNTIME_CHECK(handler, addendum != nullptr);
+      derived_ = addendum->derivedType();
+      RUNTIME_CHECK(handler, derived_ != nullptr);
+      if (table_) {
+        if (const auto *definedIo{table_->Find(*derived_,
+                DIR == Direction::Input ? common::DefinedIo::ReadFormatted
+                                        : common::DefinedIo::WriteFormatted)}) {
+          if (definedIo->subroutine) {
+            nonTbpSpecial_.emplace(DIR == Direction::Input
+                    ? typeInfo::SpecialBinding::Which::ReadFormatted
+                    : typeInfo::SpecialBinding::Which::WriteFormatted,
+                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+                false);
+            special_ = &*nonTbpSpecial_;
+          }
+        }
+      }
+      if (!special_) {
+        if (const typeInfo::SpecialBinding *binding{
+                derived_->FindSpecialBinding(DIR == Direction::Input
+                        ? typeInfo::SpecialBinding::Which::ReadFormatted
+                        : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
+          if (!table_ || !table_->ignoreNonTbpEntries ||
+              binding->IsTypeBound()) {
+            special_ = binding;
+          }
+        }
+      }
+      return StatContinue;
+    }
+    }
+    if (any) {
+      anyIoTookPlace_ = true;
+    } else {
+      return IostatEnd;
+    }
+  } else {
+    handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
+        static_cast<int>(instance_.type().raw()));
+    return handler.GetIoStat();
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Begin(
+    WorkQueue &);
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Begin(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS int DescriptorIoTicket<DIR>::Continue(WorkQueue &workQueue) {
+  // Only derived type formatted I/O gets here.
+  while (!IsComplete()) {
+    if (special_) {
+      if (auto defined{DefinedFormattedIo(
+              io_, instance_, *derived_, *special_, subscripts_)}) {
+        anyIoTookPlace_ |= *defined;
+        Advance();
+        continue;
+      }
+    }
+    Descriptor &elementDesc{elementDescriptor_.descriptor()};
+    elementDesc.Establish(
+        *derived_, nullptr, 0, nullptr, CFI_attribute_pointer);
+    elementDesc.set_base_addr(instance_.Element<char>(subscripts_));
+    Advance();
+    if (int status{workQueue.BeginDerivedIo<DIR>(
+            io_, elementDesc, *derived_, table_, anyIoTookPlace_)};
+        status != StatOk) {
+      return status;
+    }
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Continue(
+    WorkQueue &);
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Continue(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
+    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
+  bool anyIoTookPlace{false};
+  WorkQueue workQueue{io.GetIoErrorHandler()};
+  if (workQueue.BeginDescriptorIo<DIR>(io, descriptor, table, anyIoTookPlace) ==
+      StatContinue) {
+    workQueue.Run();
+  }
+  return anyIoTookPlace;
+}
+
+template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+
 RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io::descr
diff --git a/flang-rt/lib/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h
index eb60f106c920..88ad59bd24b5 100644
--- a/flang-rt/lib/runtime/descriptor-io.h
+++ b/flang-rt/lib/runtime/descriptor-io.h
@@ -9,619 +9,27 @@
 #ifndef FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 #define FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 
-// Implementation of I/O data list item transfers based on descriptors.
-// (All I/O items come through here so that the code is exercised for test;
-// some scalar I/O data transfer APIs could be changed to bypass their use
-// of descriptors in the future for better efficiency.)
+#include "flang-rt/runtime/connection.h"
 
-#include "edit-input.h"
-#include "edit-output.h"
-#include "unit.h"
-#include "flang-rt/runtime/descriptor.h"
-#include "flang-rt/runtime/io-stmt.h"
-#include "flang-rt/runtime/namelist.h"
-#include "flang-rt/runtime/terminator.h"
-#include "flang-rt/runtime/type-info.h"
-#include "flang/Common/optional.h"
-#include "flang/Common/uint128.h"
-#include "flang/Runtime/cpp-type.h"
+namespace Fortran::runtime {
+class Descriptor;
+} // namespace Fortran::runtime
+
+namespace Fortran::runtime::io {
+class IoStatementState;
+struct NonTbpDefinedIoTable;
+} // namespace Fortran::runtime::io
 
 namespace Fortran::runtime::io::descr {
-template <typename A>
-inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
-    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
-  A *p{descriptor.Element<A>(subscripts)};
-  if (!p) {
-    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
-                                 "address or subscripts out of range");
-  }
-  return *p;
-}
-
-// Per-category descriptor-based I/O templates
-
-// TODO (perhaps as a nontrivial but small starter project): implement
-// automatic repetition counts, like "10*3.14159", for list-directed and
-// NAMELIST array output.
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
-    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditIntegerInput(
-                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedIntegerIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedRealIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedRealIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedComplexIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  bool isListOutput{
-      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
-    if (isListOutput) {
-      DataEdit rEdit, iEdit;
-      rEdit.descriptor = DataEdit::ListDirectedRealPart;
-      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
-      rEdit.modes = iEdit.modes = io.mutableModes();
-      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
-          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
-        return false;
-      }
-    } else {
-      for (int k{0}; k < 2; ++k, ++x) {
-        auto edit{io.GetNextDataEdit()};
-        if (!edit) {
-          return false;
-        } else if constexpr (DIR == Direction::Output) {
-          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
-            return false;
-          }
-        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
-          break;
-        } else if (EditRealInput<KIND>(
-                       io, *edit, reinterpret_cast<void *>(x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedComplexIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <typename A, Direction DIR>
-inline RT_API_ATTRS bool FormattedCharacterIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditCharacterOutput(io, *edit, x, length)) {
-          return false;
-        }
-      } else { // input
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          if (EditCharacterInput(io, *edit, x, length)) {
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedCharacterIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedLogicalIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditLogicalOutput(io, *edit, x != 0)) {
-          return false;
-        }
-      } else {
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          bool truth{};
-          if (EditLogicalInput(io, *edit, truth)) {
-            x = truth;
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedLogicalIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
 
 template <Direction DIR>
-static RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
+RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
     const NonTbpDefinedIoTable * = nullptr);
 
-// For intrinsic (not defined) derived type I/O, formatted & unformatted
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io,
-    const typeInfo::Component &component, const Descriptor &origDescriptor,
-    const SubscriptValue origSubscripts[], Terminator &terminator,
-    const NonTbpDefinedIoTable *table) {
-#if !defined(RT_DEVICE_AVOID_RECURSION)
-  if (component.genre() == typeInfo::Component::Genre::Data) {
-    // Create a descriptor for the component
-    StaticDescriptor<maxRank, true, 16 /*?*/> statDesc;
-    Descriptor &desc{statDesc.descriptor()};
-    component.CreatePointerDescriptor(
-        desc, origDescriptor, terminator, origSubscripts);
-    return DescriptorIO<DIR>(io, desc, table);
-  } else {
-    // Component is itself a descriptor
-    char *pointer{
-        origDescriptor.Element<char>(origSubscripts) + component.offset()};
-    const Descriptor &compDesc{*reinterpret_cast<const Descriptor *>(pointer)};
-    return compDesc.IsAllocated() && DescriptorIO<DIR>(io, compDesc, table);
-  }
-#else
-  terminator.Crash("not yet implemented: component IO");
-#endif
-}
+extern template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+extern template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
 
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentwiseFormattedIO(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &type,
-    const NonTbpDefinedIoTable *table, const SubscriptValue subscripts[]) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const Descriptor &compArray{type.component()};
-  RUNTIME_CHECK(handler, compArray.rank() == 1);
-  std::size_t numComponents{compArray.Elements()};
-  SubscriptValue at[maxRank];
-  compArray.GetLowerBounds(at);
-  for (std::size_t k{0}; k < numComponents;
-       ++k, compArray.IncrementSubscripts(at)) {
-    const typeInfo::Component &component{
-        *compArray.Element<typeInfo::Component>(at)};
-    if (!DefaultComponentIO<DIR>(
-            io, component, descriptor, subscripts, handler, table)) {
-      // Return true for NAMELIST input if any component appeared.
-      auto *listInput{
-          io.get_if<ListDirectedStatementState<Direction::Input>>()};
-      return DIR == Direction::Input && k > 0 && listInput &&
-          listInput->inNamelistSequence();
-    }
-  }
-  return true;
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &type,
-    const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const Descriptor &compArray{type.component()};
-  RUNTIME_CHECK(handler, compArray.rank() == 1);
-  std::size_t numComponents{compArray.Elements()};
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  for (std::size_t j{0}; j < numElements;
-       ++j, descriptor.IncrementSubscripts(subscripts)) {
-    SubscriptValue at[maxRank];
-    compArray.GetLowerBounds(at);
-    for (std::size_t k{0}; k < numComponents;
-         ++k, compArray.IncrementSubscripts(at)) {
-      const typeInfo::Component &component{
-          *compArray.Element<typeInfo::Component>(at)};
-      if (!DefaultComponentIO<DIR>(
-              io, component, descriptor, subscripts, handler, table)) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
-    IoStatementState &, const Descriptor &, const typeInfo::DerivedType &,
-    const typeInfo::SpecialBinding &, const SubscriptValue[]);
-
-template <Direction DIR>
-static RT_API_ATTRS bool FormattedDerivedTypeIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  // Derived type information must be present for formatted I/O.
-  const DescriptorAddendum *addendum{descriptor.Addendum()};
-  RUNTIME_CHECK(handler, addendum != nullptr);
-  const typeInfo::DerivedType *type{addendum->derivedType()};
-  RUNTIME_CHECK(handler, type != nullptr);
-  Fortran::common::optional<typeInfo::SpecialBinding> nonTbpSpecial;
-  const typeInfo::SpecialBinding *special{nullptr};
-  if (table) {
-    if (const auto *definedIo{table->Find(*type,
-            DIR == Direction::Input ? common::DefinedIo::ReadFormatted
-                                    : common::DefinedIo::WriteFormatted)}) {
-      if (definedIo->subroutine) {
-        nonTbpSpecial.emplace(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadFormatted
-                : typeInfo::SpecialBinding::Which::WriteFormatted,
-            definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-            false);
-        special = &*nonTbpSpecial;
-      }
-    }
-  }
-  if (!special) {
-    if (const typeInfo::SpecialBinding *
-        binding{type->FindSpecialBinding(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadFormatted
-                : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
-      if (!table || !table->ignoreNonTbpEntries || binding->isTypeBound()) {
-        special = binding;
-      }
-    }
-  }
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  std::size_t numElements{descriptor.Elements()};
-  for (std::size_t j{0}; j < numElements;
-       ++j, descriptor.IncrementSubscripts(subscripts)) {
-    Fortran::common::optional<bool> result;
-    if (special) {
-      result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts);
-    }
-    if (!result) {
-      result = DefaultComponentwiseFormattedIO<DIR>(
-          io, descriptor, *type, table, subscripts);
-    }
-    if (!result.value()) {
-      // Return true for NAMELIST input if we got anything.
-      auto *listInput{
-          io.get_if<ListDirectedStatementState<Direction::Input>>()};
-      return DIR == Direction::Input && j > 0 && listInput &&
-          listInput->inNamelistSequence();
-    }
-  }
-  return true;
-}
-
-RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &, const Descriptor &,
-    const typeInfo::DerivedType &, const typeInfo::SpecialBinding &);
-
-// Unformatted I/O
-template <Direction DIR>
-static RT_API_ATTRS bool UnformattedDescriptorIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table = nullptr) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const DescriptorAddendum *addendum{descriptor.Addendum()};
-  if (const typeInfo::DerivedType *
-      type{addendum ? addendum->derivedType() : nullptr}) {
-    // derived type unformatted I/O
-    if (table) {
-      if (const auto *definedIo{table->Find(*type,
-              DIR == Direction::Input ? common::DefinedIo::ReadUnformatted
-                                      : common::DefinedIo::WriteUnformatted)}) {
-        if (definedIo->subroutine) {
-          typeInfo::SpecialBinding special{DIR == Direction::Input
-                  ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                  : typeInfo::SpecialBinding::Which::WriteUnformatted,
-              definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-              false};
-          if (Fortran::common::optional<bool> wasDefined{
-                  DefinedUnformattedIo(io, descriptor, *type, special)}) {
-            return *wasDefined;
-          }
-        } else {
-          return DefaultComponentwiseUnformattedIO<DIR>(
-              io, descriptor, *type, table);
-        }
-      }
-    }
-    if (const typeInfo::SpecialBinding *
-        special{type->FindSpecialBinding(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
-      if (!table || !table->ignoreNonTbpEntries || special->isTypeBound()) {
-        // defined derived type unformatted I/O
-        return DefinedUnformattedIo(io, descriptor, *type, *special);
-      }
-    }
-    // Default derived type unformatted I/O
-    // TODO: If no component at any level has defined READ or WRITE
-    // (as appropriate), the elements are contiguous, and no byte swapping
-    // is active, do a block transfer via the code below.
-    return DefaultComponentwiseUnformattedIO<DIR>(io, descriptor, *type, table);
-  } else {
-    // intrinsic type unformatted I/O
-    auto *externalUnf{io.get_if<ExternalUnformattedIoStatementState<DIR>>()};
-    auto *childUnf{io.get_if<ChildUnformattedIoStatementState<DIR>>()};
-    auto *inq{
-        DIR == Direction::Output ? io.get_if<InquireIOLengthState>() : nullptr};
-    RUNTIME_CHECK(handler, externalUnf || childUnf || inq);
-    std::size_t elementBytes{descriptor.ElementBytes()};
-    std::size_t numElements{descriptor.Elements()};
-    std::size_t swappingBytes{elementBytes};
-    if (auto maybeCatAndKind{descriptor.type().GetCategoryAndKind()}) {
-      // Byte swapping units can be smaller than elements, namely
-      // for COMPLEX and CHARACTER.
-      if (maybeCatAndKind->first == TypeCategory::Character) {
-        // swap each character position independently
-        swappingBytes = maybeCatAndKind->second; // kind
-      } else if (maybeCatAndKind->first == TypeCategory::Complex) {
-        // swap real and imaginary components independently
-        swappingBytes /= 2;
-      }
-    }
-    SubscriptValue subscripts[maxRank];
-    descriptor.GetLowerBounds(subscripts);
-    using CharType =
-        std::conditional_t<DIR == Direction::Output, const char, char>;
-    auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
-      if constexpr (DIR == Direction::Output) {
-        return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
-            : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
-                           : inq->Emit(&x, totalBytes, swappingBytes);
-      } else {
-        return externalUnf ? externalUnf->Receive(&x, totalBytes, swappingBytes)
-                           : childUnf->Receive(&x, totalBytes, swappingBytes);
-      }
-    }};
-    bool swapEndianness{externalUnf && externalUnf->unit().swapEndianness()};
-    if (!swapEndianness &&
-        descriptor.IsContiguous()) { // contiguous unformatted I/O
-      char &x{ExtractElement<char>(io, descriptor, subscripts)};
-      return Transfer(x, numElements * elementBytes);
-    } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
-      for (std::size_t j{0}; j < numElements; ++j) {
-        char &x{ExtractElement<char>(io, descriptor, subscripts)};
-        if (!Transfer(x, elementBytes)) {
-          return false;
-        }
-        if (!descriptor.IncrementSubscripts(subscripts) &&
-            j + 1 < numElements) {
-          handler.Crash("DescriptorIO: subscripts out of bounds");
-        }
-      }
-      return true;
-    }
-  }
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  if (handler.InError()) {
-    return false;
-  }
-  if (!io.get_if<IoDirectionState<DIR>>()) {
-    handler.Crash("DescriptorIO() called for wrong I/O direction");
-    return false;
-  }
-  if constexpr (DIR == Direction::Input) {
-    if (!io.BeginReadingRecord()) {
-      return false;
-    }
-  }
-  if (!io.get_if<FormattedIoStatementState<DIR>>()) {
-    return UnformattedDescriptorIO<DIR>(io, descriptor, table);
-  }
-  if (auto catAndKind{descriptor.type().GetCategoryAndKind()}) {
-    TypeCategory cat{catAndKind->first};
-    int kind{catAndKind->second};
-    switch (cat) {
-    case TypeCategory::Integer:
-      switch (kind) {
-      case 1:
-        return FormattedIntegerIO<1, DIR>(io, descriptor, true);
-      case 2:
-        return FormattedIntegerIO<2, DIR>(io, descriptor, true);
-      case 4:
-        return FormattedIntegerIO<4, DIR>(io, descriptor, true);
-      case 8:
-        return FormattedIntegerIO<8, DIR>(io, descriptor, true);
-      case 16:
-        return FormattedIntegerIO<16, DIR>(io, descriptor, true);
-      default:
-        handler.Crash(
-            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Unsigned:
-      switch (kind) {
-      case 1:
-        return FormattedIntegerIO<1, DIR>(io, descriptor, false);
-      case 2:
-        return FormattedIntegerIO<2, DIR>(io, descriptor, false);
-      case 4:
-        return FormattedIntegerIO<4, DIR>(io, descriptor, false);
-      case 8:
-        return FormattedIntegerIO<8, DIR>(io, descriptor, false);
-      case 16:
-        return FormattedIntegerIO<16, DIR>(io, descriptor, false);
-      default:
-        handler.Crash(
-            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Real:
-      switch (kind) {
-      case 2:
-        return FormattedRealIO<2, DIR>(io, descriptor);
-      case 3:
-        return FormattedRealIO<3, DIR>(io, descriptor);
-      case 4:
-        return FormattedRealIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedRealIO<8, DIR>(io, descriptor);
-      case 10:
-        return FormattedRealIO<10, DIR>(io, descriptor);
-      // TODO: case double/double
-      case 16:
-        return FormattedRealIO<16, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Complex:
-      switch (kind) {
-      case 2:
-        return FormattedComplexIO<2, DIR>(io, descriptor);
-      case 3:
-        return FormattedComplexIO<3, DIR>(io, descriptor);
-      case 4:
-        return FormattedComplexIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedComplexIO<8, DIR>(io, descriptor);
-      case 10:
-        return FormattedComplexIO<10, DIR>(io, descriptor);
-      // TODO: case double/double
-      case 16:
-        return FormattedComplexIO<16, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Character:
-      switch (kind) {
-      case 1:
-        return FormattedCharacterIO<char, DIR>(io, descriptor);
-      case 2:
-        return FormattedCharacterIO<char16_t, DIR>(io, descriptor);
-      case 4:
-        return FormattedCharacterIO<char32_t, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Logical:
-      switch (kind) {
-      case 1:
-        return FormattedLogicalIO<1, DIR>(io, descriptor);
-      case 2:
-        return FormattedLogicalIO<2, DIR>(io, descriptor);
-      case 4:
-        return FormattedLogicalIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedLogicalIO<8, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Derived:
-      return FormattedDerivedTypeIO<DIR>(io, descriptor, table);
-    }
-  }
-  handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
-      static_cast<int>(descriptor.type().raw()));
-  return false;
-}
 } // namespace Fortran::runtime::io::descr
 #endif // FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
index 1d5304254ed0..0f0564403c0e 100644
--- a/flang-rt/lib/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -143,6 +143,10 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
+  if (auto *x{std::getenv("FLANG_RT_DEBUG")}) {
+    internalDebugging = std::strtol(x, nullptr, 10);
+  }
+
   if (auto *x{std::getenv("ACC_OFFLOAD_STACK_SIZE")}) {
     char *end;
     auto n{std::strtoul(x, &end, 10)};
diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp
index b0cf2180fc6d..1bef387a9771 100644
--- a/flang-rt/lib/runtime/namelist.cpp
+++ b/flang-rt/lib/runtime/namelist.cpp
@@ -10,6 +10,7 @@
 #include "descriptor-io.h"
 #include "flang-rt/runtime/emit-encoded.h"
 #include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/type-info.h"
 #include "flang/Runtime/io-api.h"
 #include <algorithm>
 #include <cstring>
diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp
index b08195cd31e0..24d05f369fcb 100644
--- a/flang-rt/lib/runtime/tools.cpp
+++ b/flang-rt/lib/runtime/tools.cpp
@@ -205,7 +205,7 @@ RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
 // Doing the recursion upwards instead of downwards puts the more common
 // cases earlier in the if-chain and has a tangible impact on performance.
 template <typename P, int RANK> struct ShallowCopyRankSpecialize {
-  static bool execute(const Descriptor &to, const Descriptor &from,
+  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     if (to.rank() == RANK && from.rank() == RANK) {
       ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
@@ -217,7 +217,7 @@ template <typename P, int RANK> struct ShallowCopyRankSpecialize {
 };
 
 template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
-  static bool execute(const Descriptor &to, const Descriptor &from,
+  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     return false;
   }
diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp
index 82182696d70c..d023c3392d55 100644
--- a/flang-rt/lib/runtime/type-info.cpp
+++ b/flang-rt/lib/runtime/type-info.cpp
@@ -140,11 +140,11 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor,
     const SubscriptValue *subscripts) const {
   RUNTIME_CHECK(terminator, genre_ == Genre::Data);
   EstablishDescriptor(descriptor, container, terminator);
+  std::size_t offset{offset_};
   if (subscripts) {
-    descriptor.set_base_addr(container.Element<char>(subscripts) + offset_);
-  } else {
-    descriptor.set_base_addr(container.OffsetElement<char>() + offset_);
+    offset += container.SubscriptsToByteOffset(subscripts);
   }
+  descriptor.set_base_addr(container.OffsetElement<char>() + offset);
   descriptor.raw().attribute = CFI_attribute_pointer;
 }
 
@@ -279,6 +279,10 @@ FILE *Component::Dump(FILE *f) const {
   }
   std::fprintf(f, " category %d  kind %d  rank %d  offset 0x%zx\n", category_,
       kind_, rank_, static_cast<std::size_t>(offset_));
+  const auto &dtDesc{derivedType_.descriptor()};
+  if (dtDesc.raw().base_addr) {
+    std::fprintf(f, " derivedType_ %p\n", dtDesc.raw().base_addr);
+  }
   if (initialization_) {
     std::fprintf(f, " initialization @ %p:\n",
         reinterpret_cast<const void *>(initialization_));
@@ -325,7 +329,7 @@ FILE *SpecialBinding::Dump(FILE *f) const {
     break;
   }
   std::fprintf(f, "    isArgDescriptorSet: 0x%x\n", isArgDescriptorSet_);
-  std::fprintf(f, "    isTypeBound: 0x%x\n", isTypeBound_);
+  std::fprintf(f, "    isTypeBound: %d\n", isTypeBound_);
   std::fprintf(f, "    isArgContiguousSet: 0x%x\n", isArgContiguousSet_);
   std::fprintf(f, "    proc: %p\n", reinterpret_cast<void *>(proc_));
   return f;
diff --git a/flang-rt/lib/runtime/work-queue.cpp b/flang-rt/lib/runtime/work-queue.cpp
new file mode 100644
index 000000000000..a508ecb63710
--- /dev/null
+++ b/flang-rt/lib/runtime/work-queue.cpp
@@ -0,0 +1,161 @@
+//===-- lib/runtime/work-queue.cpp ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang-rt/runtime/work-queue.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang/Common/visit.h"
+
+namespace Fortran::runtime {
+
+#if !defined(RT_DEVICE_COMPILATION)
+// FLANG_RT_DEBUG code is disabled when false.
+static constexpr bool enableDebugOutput{false};
+#endif
+
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS Componentwise::Componentwise(const typeInfo::DerivedType &derived)
+    : derived_{derived}, components_{derived_.component().Elements()} {
+  GetComponent();
+}
+
+RT_API_ATTRS void Componentwise::GetComponent() {
+  if (IsComplete()) {
+    component_ = nullptr;
+  } else {
+    const Descriptor &componentDesc{derived_.component()};
+    component_ = componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
+        componentAt_);
+  }
+}
+
+RT_API_ATTRS int Ticket::Continue(WorkQueue &workQueue) {
+  if (!begun) {
+    begun = true;
+    return common::visit(
+        [&workQueue](
+            auto &specificTicket) { return specificTicket.Begin(workQueue); },
+        u);
+  } else {
+    return common::visit(
+        [&workQueue](auto &specificTicket) {
+          return specificTicket.Continue(workQueue);
+        },
+        u);
+  }
+}
+
+RT_API_ATTRS WorkQueue::~WorkQueue() {
+  if (last_) {
+    if ((last_->next = firstFree_)) {
+      last_->next->previous = last_;
+    }
+    firstFree_ = first_;
+    first_ = last_ = nullptr;
+  }
+  while (firstFree_) {
+    TicketList *next{firstFree_->next};
+    if (!firstFree_->isStatic) {
+      FreeMemory(firstFree_);
+    }
+    firstFree_ = next;
+  }
+}
+
+RT_API_ATTRS Ticket &WorkQueue::StartTicket() {
+  if (!firstFree_) {
+    void *p{AllocateMemoryOrCrash(terminator_, sizeof(TicketList))};
+    firstFree_ = new (p) TicketList;
+    firstFree_->isStatic = false;
+  }
+  TicketList *newTicket{firstFree_};
+  if ((firstFree_ = newTicket->next)) {
+    firstFree_->previous = nullptr;
+  }
+  TicketList *after{insertAfter_ ? insertAfter_->next : nullptr};
+  if ((newTicket->previous = insertAfter_ ? insertAfter_ : last_)) {
+    newTicket->previous->next = newTicket;
+  } else {
+    first_ = newTicket;
+  }
+  if ((newTicket->next = after)) {
+    after->previous = newTicket;
+  } else {
+    last_ = newTicket;
+  }
+  newTicket->ticket.begun = false;
+#if !defined(RT_DEVICE_COMPILATION)
+  if (enableDebugOutput &&
+      (executionEnvironment.internalDebugging &
+          ExecutionEnvironment::WorkQueue)) {
+    std::fprintf(stderr, "WQ: new ticket\n");
+  }
+#endif
+  return newTicket->ticket;
+}
+
+RT_API_ATTRS int WorkQueue::Run() {
+  while (last_) {
+    TicketList *at{last_};
+    insertAfter_ = last_;
+#if !defined(RT_DEVICE_COMPILATION)
+    if (enableDebugOutput &&
+        (executionEnvironment.internalDebugging &
+            ExecutionEnvironment::WorkQueue)) {
+      std::fprintf(stderr, "WQ: %zd %s\n", at->ticket.u.index(),
+          at->ticket.begun ? "Continue" : "Begin");
+    }
+#endif
+    int stat{at->ticket.Continue(*this)};
+#if !defined(RT_DEVICE_COMPILATION)
+    if (enableDebugOutput &&
+        (executionEnvironment.internalDebugging &
+            ExecutionEnvironment::WorkQueue)) {
+      std::fprintf(stderr, "WQ: ... stat %d\n", stat);
+    }
+#endif
+    insertAfter_ = nullptr;
+    if (stat == StatOk) {
+      if (at->previous) {
+        at->previous->next = at->next;
+      } else {
+        first_ = at->next;
+      }
+      if (at->next) {
+        at->next->previous = at->previous;
+      } else {
+        last_ = at->previous;
+      }
+      if ((at->next = firstFree_)) {
+        at->next->previous = at;
+      }
+      at->previous = nullptr;
+      firstFree_ = at;
+    } else if (stat != StatContinue) {
+      Stop();
+      return stat;
+    }
+  }
+  return StatOk;
+}
+
+RT_API_ATTRS void WorkQueue::Stop() {
+  if (last_) {
+    if ((last_->next = firstFree_)) {
+      last_->next->previous = last_;
+    }
+    firstFree_ = first_;
+    first_ = last_ = nullptr;
+  }
+}
+
+RT_OFFLOAD_API_GROUP_END
+
+} // namespace Fortran::runtime
diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
index 3833e48be3dd..6c148b1de6f8 100644
--- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp
+++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
@@ -184,7 +184,7 @@ TEST(ExternalIOTests, TestSequentialFixedUnformatted) {
   io = IONAME(BeginInquireIoLength)(__FILE__, __LINE__);
   for (int j{1}; j <= 3; ++j) {
     ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc))
-        << "OutputDescriptor() for InquireIoLength";
+        << "OutputDescriptor() for InquireIoLength " << j;
   }
   ASSERT_EQ(IONAME(GetIoLength)(io), 3 * recl) << "GetIoLength";
   ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 78d871c593e1..871749934810 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -858,6 +858,16 @@ print *, [(j,j=1,10)]
   warning since such values may have become defined by the time the nested
   expression's value is required.
 
+* Intrinsic assignment of arrays is defined elementally, and intrinsic
+  assignment of derived type components is defined componentwise.
+  However, when intrinsic assignment takes place for an array of derived
+  type, the order of the loop nesting is not defined.
+  Some compilers will loop over the elements, assigning all of the components
+  of each element before proceeding to the next element.
+  This compiler loops over all of the components, and assigns all of
+  the elements for each component before proceeding to the next component.
+  A program using defined assignment might be able to detect the difference.
+
 ## De Facto Standard Features
 
 * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the
diff --git a/flang/include/flang/Runtime/assign.h b/flang/include/flang/Runtime/assign.h
index bc80997a1bec..7d198bdcc9e8 100644
--- a/flang/include/flang/Runtime/assign.h
+++ b/flang/include/flang/Runtime/assign.h
@@ -38,7 +38,8 @@ enum AssignFlags {
   ComponentCanBeDefinedAssignment = 1 << 3,
   ExplicitLengthCharacterLHS = 1 << 4,
   PolymorphicLHS = 1 << 5,
-  DeallocateLHS = 1 << 6
+  DeallocateLHS = 1 << 6,
+  UpdateLHSBounds = 1 << 7,
 };
 
 #ifdef RT_DEVICE_COMPILATION
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index b13370512e5c..69375a83dec2 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -182,9 +182,12 @@ const Symbol *HasImpureFinal(
     const Symbol &, std::optional<int> rank = std::nullopt);
 // Is this type finalizable or does it contain any polymorphic allocatable
 // ultimate components?
-bool MayRequireFinalization(const DerivedTypeSpec &derived);
+bool MayRequireFinalization(const DerivedTypeSpec &);
 // Does this type have an allocatable direct component?
-bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived);
+bool HasAllocatableDirectComponent(const DerivedTypeSpec &);
+// Does this type have any defined assignment at any level (or any polymorphic
+// allocatable)?
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &);
 
 bool IsInBlankCommon(const Symbol &);
 bool IsAssumedLengthCharacter(const Symbol &);
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 26ae81f97895..51ba21a9e5ed 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -82,17 +82,17 @@ private:
       const SomeExpr &genre, std::int64_t = 0) const;
   SomeExpr PackageIntValueExpr(const SomeExpr &genre, std::int64_t = 0) const;
   std::vector<evaluate::StructureConstructor> DescribeBindings(
-      const Scope &dtScope, Scope &);
+      const Scope &dtScope, Scope &, const SymbolVector &bindings);
   std::map<int, evaluate::StructureConstructor> DescribeSpecialGenerics(
-      const Scope &dtScope, const Scope &thisScope,
-      const DerivedTypeSpec *) const;
+      const Scope &dtScope, const Scope &thisScope, const DerivedTypeSpec *,
+      const SymbolVector &bindings) const;
   void DescribeSpecialGeneric(const GenericDetails &,
       std::map<int, evaluate::StructureConstructor> &, const Scope &,
-      const DerivedTypeSpec *) const;
+      const DerivedTypeSpec *, const SymbolVector &bindings) const;
   void DescribeSpecialProc(std::map<int, evaluate::StructureConstructor> &,
       const Symbol &specificOrBinding, bool isAssignment, bool isFinal,
       std::optional<common::DefinedIo>, const Scope *, const DerivedTypeSpec *,
-      bool isTypeBound) const;
+      const SymbolVector *bindings) const;
   void IncorporateDefinedIoGenericInterfaces(
       std::map<int, evaluate::StructureConstructor> &, common::DefinedIo,
       const Scope *, const DerivedTypeSpec *);
@@ -595,8 +595,9 @@ const Symbol *RuntimeTableBuilder::DescribeType(
     // Compile the "vtable" of type-bound procedure bindings
     std::uint32_t specialBitSet{0};
     if (!dtSymbol->attrs().test(Attr::ABSTRACT)) {
+      SymbolVector boundProcedures{CollectBindings(dtScope)};
       std::vector<evaluate::StructureConstructor> bindings{
-          DescribeBindings(dtScope, scope)};
+          DescribeBindings(dtScope, scope, boundProcedures)};
       AddValue(dtValues, derivedTypeSchema_, bindingDescCompName,
           SaveDerivedPointerTarget(scope,
               SaveObjectName(
@@ -609,12 +610,14 @@ const Symbol *RuntimeTableBuilder::DescribeType(
       // subroutines override any parent bindings, but FINAL subroutines do not
       // (the runtime will call all of them).
       std::map<int, evaluate::StructureConstructor> specials{
-          DescribeSpecialGenerics(dtScope, dtScope, derivedTypeSpec)};
+          DescribeSpecialGenerics(
+              dtScope, dtScope, derivedTypeSpec, boundProcedures)};
       if (derivedTypeSpec) {
-        for (auto &ref : FinalsForDerivedTypeInstantiation(*derivedTypeSpec)) {
-          DescribeSpecialProc(specials, *ref, /*isAssignment-*/ false,
+        for (const Symbol &symbol :
+            FinalsForDerivedTypeInstantiation(*derivedTypeSpec)) {
+          DescribeSpecialProc(specials, symbol, /*isAssignment-*/ false,
               /*isFinal=*/true, std::nullopt, nullptr, derivedTypeSpec,
-              /*isTypeBound=*/true);
+              &boundProcedures);
         }
         IncorporateDefinedIoGenericInterfaces(specials,
             common::DefinedIo::ReadFormatted, &scope, derivedTypeSpec);
@@ -661,6 +664,10 @@ const Symbol *RuntimeTableBuilder::DescribeType(
     AddValue(dtValues, derivedTypeSchema_, "nofinalizationneeded"s,
         IntExpr<1>(
             derivedTypeSpec && !MayRequireFinalization(*derivedTypeSpec)));
+    // Similarly, a flag to enable optimized runtime assignment.
+    AddValue(dtValues, derivedTypeSchema_, "nodefinedassignment"s,
+        IntExpr<1>(
+            derivedTypeSpec && !MayHaveDefinedAssignment(*derivedTypeSpec)));
   }
   dtObject.get<ObjectEntityDetails>().set_init(MaybeExpr{
       StructureExpr(Structure(derivedTypeSchema_, std::move(dtValues)))});
@@ -1041,15 +1048,16 @@ SymbolVector CollectBindings(const Scope &dtScope) {
 }
 
 std::vector<evaluate::StructureConstructor>
-RuntimeTableBuilder::DescribeBindings(const Scope &dtScope, Scope &scope) {
+RuntimeTableBuilder::DescribeBindings(
+    const Scope &dtScope, Scope &scope, const SymbolVector &bindings) {
   std::vector<evaluate::StructureConstructor> result;
-  for (const SymbolRef &ref : CollectBindings(dtScope)) {
+  for (const Symbol &symbol : bindings) {
     evaluate::StructureConstructorValues values;
     AddValue(values, bindingSchema_, procCompName,
         SomeExpr{evaluate::ProcedureDesignator{
-            ref.get().get<ProcBindingDetails>().symbol()}});
+            symbol.get<ProcBindingDetails>().symbol()}});
     AddValue(values, bindingSchema_, "name"s,
-        SaveNameAsPointerTarget(scope, ref.get().name().ToString()));
+        SaveNameAsPointerTarget(scope, symbol.name().ToString()));
     result.emplace_back(DEREF(bindingSchema_.AsDerived()), std::move(values));
   }
   return result;
@@ -1057,16 +1065,18 @@ RuntimeTableBuilder::DescribeBindings(const Scope &dtScope, Scope &scope) {
 
 std::map<int, evaluate::StructureConstructor>
 RuntimeTableBuilder::DescribeSpecialGenerics(const Scope &dtScope,
-    const Scope &thisScope, const DerivedTypeSpec *derivedTypeSpec) const {
+    const Scope &thisScope, const DerivedTypeSpec *derivedTypeSpec,
+    const SymbolVector &bindings) const {
   std::map<int, evaluate::StructureConstructor> specials;
   if (const Scope * parentScope{dtScope.GetDerivedTypeParent()}) {
-    specials =
-        DescribeSpecialGenerics(*parentScope, thisScope, derivedTypeSpec);
+    specials = DescribeSpecialGenerics(
+        *parentScope, thisScope, derivedTypeSpec, bindings);
   }
   for (const auto &pair : dtScope) {
     const Symbol &symbol{*pair.second};
     if (const auto *generic{symbol.detailsIf<GenericDetails>()}) {
-      DescribeSpecialGeneric(*generic, specials, thisScope, derivedTypeSpec);
+      DescribeSpecialGeneric(
+          *generic, specials, thisScope, derivedTypeSpec, bindings);
     }
   }
   return specials;
@@ -1074,15 +1084,16 @@ RuntimeTableBuilder::DescribeSpecialGenerics(const Scope &dtScope,
 
 void RuntimeTableBuilder::DescribeSpecialGeneric(const GenericDetails &generic,
     std::map<int, evaluate::StructureConstructor> &specials,
-    const Scope &dtScope, const DerivedTypeSpec *derivedTypeSpec) const {
+    const Scope &dtScope, const DerivedTypeSpec *derivedTypeSpec,
+    const SymbolVector &bindings) const {
   common::visit(
       common::visitors{
           [&](const GenericKind::OtherKind &k) {
             if (k == GenericKind::OtherKind::Assignment) {
-              for (auto ref : generic.specificProcs()) {
-                DescribeSpecialProc(specials, *ref, /*isAssignment=*/true,
+              for (const Symbol &specific : generic.specificProcs()) {
+                DescribeSpecialProc(specials, specific, /*isAssignment=*/true,
                     /*isFinal=*/false, std::nullopt, &dtScope, derivedTypeSpec,
-                    /*isTypeBound=*/true);
+                    &bindings);
               }
             }
           },
@@ -1092,10 +1103,10 @@ void RuntimeTableBuilder::DescribeSpecialGeneric(const GenericDetails &generic,
             case common::DefinedIo::ReadUnformatted:
             case common::DefinedIo::WriteFormatted:
             case common::DefinedIo::WriteUnformatted:
-              for (auto ref : generic.specificProcs()) {
-                DescribeSpecialProc(specials, *ref, /*isAssignment=*/false,
+              for (const Symbol &specific : generic.specificProcs()) {
+                DescribeSpecialProc(specials, specific, /*isAssignment=*/false,
                     /*isFinal=*/false, io, &dtScope, derivedTypeSpec,
-                    /*isTypeBound=*/true);
+                    &bindings);
               }
               break;
             }
@@ -1109,7 +1120,8 @@ void RuntimeTableBuilder::DescribeSpecialProc(
     std::map<int, evaluate::StructureConstructor> &specials,
     const Symbol &specificOrBinding, bool isAssignment, bool isFinal,
     std::optional<common::DefinedIo> io, const Scope *dtScope,
-    const DerivedTypeSpec *derivedTypeSpec, bool isTypeBound) const {
+    const DerivedTypeSpec *derivedTypeSpec,
+    const SymbolVector *bindings) const {
   const auto *binding{specificOrBinding.detailsIf<ProcBindingDetails>()};
   if (binding && dtScope) { // use most recent override
     binding = &DEREF(dtScope->FindComponent(specificOrBinding.name()))
@@ -1128,6 +1140,9 @@ void RuntimeTableBuilder::DescribeSpecialProc(
       // component assignment as part of intrinsic assignment.
       // Non-type-bound generic INTERFACEs and assignments from incompatible
       // types must not be used for component intrinsic assignment.
+      if (!binding) {
+        return;
+      }
       CHECK(proc->dummyArguments.size() == 2);
       const auto t1{
           DEREF(std::get_if<evaluate::characteristics::DummyDataObject>(
@@ -1137,7 +1152,7 @@ void RuntimeTableBuilder::DescribeSpecialProc(
           DEREF(std::get_if<evaluate::characteristics::DummyDataObject>(
                     &proc->dummyArguments[1].u))
               .type.type()};
-      if (!binding || t1.category() != TypeCategory::Derived ||
+      if (t1.category() != TypeCategory::Derived ||
           t2.category() != TypeCategory::Derived ||
           t1.IsUnlimitedPolymorphic() || t2.IsUnlimitedPolymorphic()) {
         return;
@@ -1149,7 +1164,7 @@ void RuntimeTableBuilder::DescribeSpecialProc(
       }
       which = proc->IsElemental() ? elementalAssignmentEnum_
                                   : scalarAssignmentEnum_;
-      if (binding && binding->passName() &&
+      if (binding->passName() &&
           *binding->passName() == proc->dummyArguments[1].name) {
         argThatMightBeDescriptor = 1;
         isArgDescriptorSet |= 2;
@@ -1234,8 +1249,19 @@ void RuntimeTableBuilder::DescribeSpecialProc(
         values, specialSchema_, "which"s, SomeExpr{std::move(which.value())});
     AddValue(values, specialSchema_, "isargdescriptorset"s,
         IntExpr<1>(isArgDescriptorSet));
-    AddValue(values, specialSchema_, "istypebound"s,
-        IntExpr<1>(isTypeBound ? 1 : 0));
+    int bindingIndex{0};
+    if (bindings) {
+      int j{0};
+      for (const Symbol &bind : DEREF(bindings)) {
+        ++j;
+        if (&bind.get<ProcBindingDetails>().symbol() == &specific) {
+          bindingIndex = j; // index offset by 1
+          break;
+        }
+      }
+    }
+    CHECK(bindingIndex <= 255);
+    AddValue(values, specialSchema_, "istypebound"s, IntExpr<1>(bindingIndex));
     AddValue(values, specialSchema_, "isargcontiguousset"s,
         IntExpr<1>(isArgContiguousSet));
     AddValue(values, specialSchema_, procCompName,
@@ -1260,7 +1286,7 @@ void RuntimeTableBuilder::IncorporateDefinedIoGenericInterfaces(
       CHECK(std::get<common::DefinedIo>(genericDetails.kind().u) == definedIo);
       for (auto ref : genericDetails.specificProcs()) {
         DescribeSpecialProc(specials, *ref, false, false, definedIo, nullptr,
-            derivedTypeSpec, false);
+            derivedTypeSpec, /*bindings=*/nullptr);
       }
     }
   }
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index a1445187b1e9..bf520d04a50c 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -814,6 +814,38 @@ bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) {
   return std::any_of(directs.begin(), directs.end(), IsAllocatable);
 }
 
+static bool MayHaveDefinedAssignment(
+    const DerivedTypeSpec &derived, std::set<const Scope *> &checked) {
+  if (const Scope *scope{derived.GetScope()};
+      scope && checked.find(scope) == checked.end()) {
+    checked.insert(scope);
+    for (const auto &[_, symbolRef] : *scope) {
+      if (const auto *generic{symbolRef->detailsIf<GenericDetails>()}) {
+        if (generic->kind().IsAssignment()) {
+          return true;
+        }
+      } else if (symbolRef->has<ObjectEntityDetails>() &&
+          !IsPointer(*symbolRef)) {
+        if (const DeclTypeSpec *type{symbolRef->GetType()}) {
+          if (type->IsPolymorphic()) {
+            return true;
+          } else if (const DerivedTypeSpec *derived{type->AsDerived()}) {
+            if (MayHaveDefinedAssignment(*derived, checked)) {
+              return true;
+            }
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &derived) {
+  std::set<const Scope *> checked;
+  return MayHaveDefinedAssignment(derived, checked);
+}
+
 bool IsAssumedLengthCharacter(const Symbol &symbol) {
   if (const DeclTypeSpec * type{symbol.GetType()}) {
     return type->category() == DeclTypeSpec::Character &&
diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90
index b30a6bf69756..8dd27d6e4c01 100644
--- a/flang/module/__fortran_type_info.f90
+++ b/flang/module/__fortran_type_info.f90
@@ -52,7 +52,8 @@ module __fortran_type_info
     integer(1) :: noInitializationNeeded ! 1 if no component w/ init
     integer(1) :: noDestructionNeeded ! 1 if no component w/ dealloc/final
     integer(1) :: noFinalizationNeeded ! 1 if nothing finalizeable
-    integer(1) :: __padding0(4)
+    integer(1) :: noDefinedAssignment ! 1 if no defined ASSIGNMENT(=)
+    integer(1) :: __padding0(3)
   end type
 
   type :: Binding
@@ -116,7 +117,7 @@ module __fortran_type_info
   type, bind(c) :: SpecialBinding
     integer(1) :: which ! SpecialBinding::Which
     integer(1) :: isArgDescriptorSet
-    integer(1) :: isTypeBound
+    integer(1) :: isTypeBound ! binding index + 1, if any
     integer(1) :: isArgContiguousSet
     integer(1) :: __padding0(4)
     type(__builtin_c_funptr) :: proc
diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90
index 28f0bf78f33c..2e05b652822b 100644
--- a/flang/test/Lower/volatile-openmp.f90
+++ b/flang/test/Lower/volatile-openmp.f90
@@ -23,11 +23,11 @@ end
 ! CHECK:           %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>
 ! CHECK:           %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>
 ! CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFEcontainer"} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>, !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>)
-! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
+! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
 ! CHECK:           %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1>
-! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
-! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>
-! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>)
+! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
+! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>
+! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>)
 ! CHECK:           %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"}   {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90
index d228cd2a84ca..bb20c546e026 100644
--- a/flang/test/Semantics/typeinfo01.f90
+++ b/flang/test/Semantics/typeinfo01.f90
@@ -8,7 +8,7 @@ module m01
   end type
 !CHECK: Module scope: m01
 !CHECK: .c.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.n,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .n.n, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"n"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
 !CHECK: DerivedType scope: t1
@@ -23,8 +23,8 @@ module m02
   end type
 !CHECK: .c.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:1_8 init:[component::component(name=.n.parent,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.parent,lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.cn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=4_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .c.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.pn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end module
 
 module m03
@@ -35,7 +35,7 @@ module m03
   type(kpdt(4)) :: x
 !CHECK: .c.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.a,genre=1_1,category=2_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .dt.kpdt, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.kpdt,uninstantiated=NULL(),kindparameter=.kp.kpdt,lenparameterkind=NULL())
-!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .kp.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(8) shape: 0_8:0_8 init:[INTEGER(8)::4_8]
 end module
 
@@ -49,7 +49,7 @@ module m04
   subroutine s1(x)
     class(tbps), intent(in) :: x
   end subroutine
-!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .v.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=s1,name=.n.b1),binding(proc=s1,name=.n.b2)]
 end module
 
@@ -61,7 +61,7 @@ module m05
   subroutine s1(x)
     class(t), intent(in) :: x
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .p.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(procptrcomponent) shape: 0_8:0_8 init:[procptrcomponent::procptrcomponent(name=.n.p1,offset=0_8,initialization=s1)]
 end module
 
@@ -85,8 +85,8 @@ module m06
     class(t), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -113,8 +113,8 @@ module m06a
     class(t2), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -132,7 +132,7 @@ module m07
     class(t), intent(out) :: x
     class(t), intent(in) :: y
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
 end module
@@ -155,8 +155,8 @@ module m08
   subroutine s4(x)
     type(t), contiguous :: x(:,:,:)
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)]
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
+!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=1_1,proc=s4)]
 end module
 
 module m09
@@ -197,8 +197,8 @@ module m09
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)]
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=3_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=4_1,isargcontiguousset=0_1,proc=wu)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)]
 end module
 
@@ -246,7 +246,7 @@ module m10
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)]
 end module
 
@@ -263,7 +263,7 @@ module m11
 !CHECK: .c.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:3_8 init:[component::component(name=.n.allocatable,genre=3_1,category=2_1,kind=4_1,rank=1_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.pointer,genre=2_1,category=2_1,kind=4_1,rank=0_1,offset=48_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=.di.t.pointer),component(name=.n.chauto,genre=4_1,category=4_1,kind=1_1,rank=0_1,offset=72_8,characterlen=value(genre=3_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.automatic,genre=4_1,category=2_1,kind=4_1,rank=1_1,offset=96_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=.b.t.automatic,initialization=NULL())]
 !CHECK: .di.t.pointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(.dp.t.pointer) init:.dp.t.pointer(pointer=target)
 !CHECK: .dp.t.pointer (CompilerCreated): DerivedType components: pointer
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .lpk.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::8_1]
 !CHECK: DerivedType scope: .dp.t.pointer size=24 alignment=8 instantiation of .dp.t.pointer
 !CHECK: pointer, POINTER size=24 offset=0: ObjectEntity type: REAL(4)
diff --git a/flang/test/Semantics/typeinfo03.f90 b/flang/test/Semantics/typeinfo03.f90
index f0c0a817da4a..e2552d0a21d6 100644
--- a/flang/test/Semantics/typeinfo03.f90
+++ b/flang/test/Semantics/typeinfo03.f90
@@ -6,4 +6,4 @@ module m
     class(*), pointer :: sp, ap(:)
   end type
 end module
-!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90
index de8464321a40..94dd2199db35 100644
--- a/flang/test/Semantics/typeinfo04.f90
+++ b/flang/test/Semantics/typeinfo04.f90
@@ -7,18 +7,18 @@ module m
    contains
     final :: final
   end type
-!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
   type, abstract :: t1
   end type
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type, abstract :: t2
     real, allocatable :: a(:)
   end type
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type, abstract :: t3
     type(finalizable) :: x
   end type
-!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
  contains
   impure elemental subroutine final(x)
     type(finalizable), intent(in out) :: x
diff --git a/flang/test/Semantics/typeinfo05.f90 b/flang/test/Semantics/typeinfo05.f90
index 2a7f12a153eb..df1aecf3821d 100644
--- a/flang/test/Semantics/typeinfo05.f90
+++ b/flang/test/Semantics/typeinfo05.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), pointer :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo06.f90 b/flang/test/Semantics/typeinfo06.f90
index 2385709a8eb4..22f37b1a4369 100644
--- a/flang/test/Semantics/typeinfo06.f90
+++ b/flang/test/Semantics/typeinfo06.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), allocatable :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo07.f90 b/flang/test/Semantics/typeinfo07.f90
index e8766d9811db..ab20d6f60110 100644
--- a/flang/test/Semantics/typeinfo07.f90
+++ b/flang/test/Semantics/typeinfo07.f90
@@ -16,7 +16,7 @@
     type(t_container_extension) :: wrapper
   end type
 end
-! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
-! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
diff --git a/flang/test/Semantics/typeinfo08.f90 b/flang/test/Semantics/typeinfo08.f90
index 689cf469dee3..391a66f3d666 100644
--- a/flang/test/Semantics/typeinfo08.f90
+++ b/flang/test/Semantics/typeinfo08.f90
@@ -13,7 +13,7 @@ end module
 
 !CHECK: Module scope: m size=0 alignment=1 sourceRange=113 bytes
 !CHECK: .c.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t1,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .lpk.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::4_1]
 !CHECK: .n.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"s"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90
index 92efc8f9ea54..08e0b95abb76 100644
--- a/flang/test/Semantics/typeinfo11.f90
+++ b/flang/test/Semantics/typeinfo11.f90
@@ -14,4 +14,4 @@ end type
 type(t2) x
 end
 
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90
new file mode 100644
index 000000000000..6b23b63d28b1
--- /dev/null
+++ b/flang/test/Semantics/typeinfo12.f90
@@ -0,0 +1,67 @@
+!RUN: bbc --dump-symbols %s | FileCheck %s
+!Check "nodefinedassignment" settings.
+
+module m01
+
+  type hasAsst1
+   contains
+    procedure asst1
+    generic :: assignment(=) => asst1
+  end type
+!CHECK: .dt.hasasst1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.hasasst1,name=.n.hasasst1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.hasasst1,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type hasAsst2 ! no defined assignment relevant to the runtime
+  end type
+  interface assignment(=)
+    procedure asst2
+  end interface
+!CHECK: .dt.hasasst2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.hasasst2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test1
+    type(hasAsst1) c
+  end type
+!CHECK: .dt.test1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type test2
+    type(hasAsst2) c
+  end type
+!CHECK: .dt.test2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test3
+    type(hasAsst1), pointer :: p
+  end type
+!CHECK: .dt.test3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test3,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test3,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test4
+    type(hasAsst2), pointer :: p
+  end type
+!CHECK: .dt.test4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test4,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type, extends(hasAsst1) :: test5
+  end type
+!CHECK: .dt.test5, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.test5,name=.n.test5,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test5,procptr=NULL(),special=.s.test5,specialbitset=4_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type, extends(hasAsst2) :: test6
+  end type
+!CHECK: .dt.test6, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test6,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test6,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test7
+    type(test7), allocatable :: c
+  end type
+!CHECK: .dt.test7, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test7,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test7,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test8
+    class(test8), allocatable :: c
+  end type
+!CHECK: .dt.test8, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test8,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test8,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+
+ contains
+  impure elemental subroutine asst1(left, right)
+    class(hasAsst1), intent(out) :: left
+    class(hasAsst1), intent(in) :: right
+  end
+  impure elemental subroutine asst2(left, right)
+    class(hasAsst2), intent(out) :: left
+    class(hasAsst2), intent(in) :: right
+  end
+end
diff --git a/flang/test/Semantics/typeinfo13.f90 b/flang/test/Semantics/typeinfo13.f90
index cf4abf9e3818..ad824ad3590a 100644
--- a/flang/test/Semantics/typeinfo13.f90
+++ b/flang/test/Semantics/typeinfo13.f90
@@ -22,5 +22,5 @@ module m
   end
 end
 
-!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=override)]
+!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=override)]
 !CHECK: .v.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=baseassign,name=.n.baseassign),binding(proc=override,name=.n.override)]

From 6cbb67f84c53d88e67b0d5a9f0ad2cf4782e6f66 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <lazar_2004@list.ru>
Date: Tue, 17 Jun 2025 00:51:49 +0300
Subject: [PATCH 0557/1322] [mlir][emitc] Fix the emitc::ExpressionOp (#143894)

Fix the lack of verification that the definingOp of the return value
belongs to emitc::ExpressionOp.
---
 mlir/lib/Dialect/EmitC/IR/EmitC.cpp      | 12 +++++++++---
 mlir/test/Dialect/EmitC/invalid_ops.mlir | 22 ++++++++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
index 1709654b9013..f82b20712b8c 100644
--- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
@@ -386,9 +386,7 @@ OpFoldResult emitc::ConstantOp::fold(FoldAdaptor adaptor) { return getValue(); }
 Operation *ExpressionOp::getRootOp() {
   auto yieldOp = cast<YieldOp>(getBody()->getTerminator());
   Value yieldedValue = yieldOp.getResult();
-  Operation *rootOp = yieldedValue.getDefiningOp();
-  assert(rootOp && "Yielded value not defined within expression");
-  return rootOp;
+  return yieldedValue.getDefiningOp();
 }
 
 LogicalResult ExpressionOp::verify() {
@@ -406,6 +404,14 @@ LogicalResult ExpressionOp::verify() {
   if (!yieldResult)
     return emitOpError("must yield a value at termination");
 
+  Operation *rootOp = yieldResult.getDefiningOp();
+
+  if (!rootOp)
+    return emitOpError("yielded value has no defining op");
+
+  if (rootOp->getParentOp() != getOperation())
+    return emitOpError("yielded value not defined within expression");
+
   Type yieldType = yieldResult.getType();
 
   if (resultType != yieldType)
diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir
index 3793dfe3f173..3946a36a83c6 100644
--- a/mlir/test/Dialect/EmitC/invalid_ops.mlir
+++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir
@@ -346,6 +346,28 @@ func.func @test_expression_multiple_results(%arg0: i32) -> i32 {
 
 // -----
 
+emitc.func @test_expression_no_defining_op(%a : i32) {
+  // expected-error @+1 {{'emitc.expression' op yielded value has no defining op}}
+  %res = emitc.expression : i32 {
+    emitc.yield %a : i32
+  }
+
+  return
+}
+
+// -----
+
+emitc.func @test_expression_op_outside_expression() {
+  %cond = literal "true" : i1
+  // expected-error @+1 {{'emitc.expression' op yielded value not defined within expression}}
+  %res = emitc.expression : i1 {
+    emitc.yield %cond : i1
+  }
+  return
+}
+
+// -----
+
 // expected-error @+1 {{'emitc.func' op requires zero or exactly one result, but has 2}}
 emitc.func @multiple_results(%0: i32) -> (i32, i32) {
   emitc.return %0 : i32

From a383b1a95b63cf120b3dea554c2d66ccfaee066b Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Mon, 16 Jun 2025 14:52:59 -0700
Subject: [PATCH 0558/1322] Reland "[HLSL][RootSignature] Implement
 serialization of RootConstants and RootFlags" (#143019)

This relands #141130.

The initial commit uncovered that we are missing the correct linking of
FrontendHLSL into clang/lib/Parse and clang/lib/unittests/Parse.

This change addreses this by linking them accordingly.

It was also checked and ensured that the LexHLSLRootSignature libraries
do not depend on FrontendHLSL and so we are not required to link there.

Resolves: #138190 and #138192
---
 clang/lib/Parse/CMakeLists.txt                |  1 +
 clang/unittests/Parse/CMakeLists.txt          |  1 +
 .../Frontend/HLSL/HLSLRootSignatureUtils.h    |  5 ++
 .../Frontend/HLSL/HLSLRootSignatureUtils.cpp  | 33 +++++++++
 .../Frontend/HLSLRootSignatureDumpTest.cpp    | 69 +++++++++++++++++++
 5 files changed, 109 insertions(+)

diff --git a/clang/lib/Parse/CMakeLists.txt b/clang/lib/Parse/CMakeLists.txt
index 00fde537bb9c..e6cbf3b868b7 100644
--- a/clang/lib/Parse/CMakeLists.txt
+++ b/clang/lib/Parse/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  FrontendHLSL
   FrontendOpenMP
   MC
   MCParser
diff --git a/clang/unittests/Parse/CMakeLists.txt b/clang/unittests/Parse/CMakeLists.txt
index 6859efed294c..2ed43a83b878 100644
--- a/clang/unittests/Parse/CMakeLists.txt
+++ b/clang/unittests/Parse/CMakeLists.txt
@@ -11,5 +11,6 @@ add_clang_unittest(ParseTests
   LLVMTestingSupport
   clangTesting
   LLVM_COMPONENTS
+  FrontendHLSL
   Support
   )
diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
index 6d959ad5bdc7..ca20e6719f3a 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
@@ -27,6 +27,11 @@ class Metadata;
 namespace hlsl {
 namespace rootsig {
 
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const RootFlags &Flags);
+
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
+                                 const RootConstants &Constants);
+
 LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
                                  const DescriptorTableClause &Clause);
 
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
index 7d744781da04..5bae72a3986f 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
@@ -129,6 +129,39 @@ static raw_ostream &operator<<(raw_ostream &OS,
   return OS;
 }
 
+static const EnumEntry<RootFlags> RootFlagNames[] = {
+    {"AllowInputAssemblerInputLayout",
+     RootFlags::AllowInputAssemblerInputLayout},
+    {"DenyVertexShaderRootAccess", RootFlags::DenyVertexShaderRootAccess},
+    {"DenyHullShaderRootAccess", RootFlags::DenyHullShaderRootAccess},
+    {"DenyDomainShaderRootAccess", RootFlags::DenyDomainShaderRootAccess},
+    {"DenyGeometryShaderRootAccess", RootFlags::DenyGeometryShaderRootAccess},
+    {"DenyPixelShaderRootAccess", RootFlags::DenyPixelShaderRootAccess},
+    {"AllowStreamOutput", RootFlags::AllowStreamOutput},
+    {"LocalRootSignature", RootFlags::LocalRootSignature},
+    {"DenyAmplificationShaderRootAccess",
+     RootFlags::DenyAmplificationShaderRootAccess},
+    {"DenyMeshShaderRootAccess", RootFlags::DenyMeshShaderRootAccess},
+    {"CBVSRVUAVHeapDirectlyIndexed", RootFlags::CBVSRVUAVHeapDirectlyIndexed},
+    {"SamplerHeapDirectlyIndexed", RootFlags::SamplerHeapDirectlyIndexed},
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const RootFlags &Flags) {
+  OS << "RootFlags(";
+  printFlags(OS, Flags, ArrayRef(RootFlagNames));
+  OS << ")";
+
+  return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const RootConstants &Constants) {
+  OS << "RootConstants(num32BitConstants = " << Constants.Num32BitConstants
+     << ", " << Constants.Reg << ", space = " << Constants.Space
+     << ", visibility = " << Constants.Visibility << ")";
+
+  return OS;
+}
+
 raw_ostream &operator<<(raw_ostream &OS, const DescriptorTable &Table) {
   OS << "DescriptorTable(numClauses = " << Table.NumClauses
      << ", visibility = " << Table.Visibility << ")";
diff --git a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
index 90e6cd0a80d6..1a0c8e2a1639 100644
--- a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
+++ b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
@@ -108,4 +108,73 @@ TEST(HLSLRootSignatureTest, DescriptorTableDump) {
   EXPECT_EQ(Out, Expected);
 }
 
+TEST(HLSLRootSignatureTest, DefaultRootConstantsDump) {
+  RootConstants Constants;
+  Constants.Num32BitConstants = 1;
+  Constants.Reg = {RegisterType::BReg, 3};
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Constants;
+  OS.flush();
+
+  std::string Expected = "RootConstants(num32BitConstants = 1, b3, space = 0, "
+                         "visibility = All)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, SetRootConstantsDump) {
+  RootConstants Constants;
+  Constants.Num32BitConstants = 983;
+  Constants.Reg = {RegisterType::BReg, 34593};
+  Constants.Space = 7;
+  Constants.Visibility = ShaderVisibility::Pixel;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Constants;
+  OS.flush();
+
+  std::string Expected = "RootConstants(num32BitConstants = 983, b34593, "
+                         "space = 7, visibility = Pixel)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, NoneRootFlagsDump) {
+  RootFlags Flags = RootFlags::None;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Flags;
+  OS.flush();
+
+  std::string Expected = "RootFlags(None)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, AllRootFlagsDump) {
+  RootFlags Flags = RootFlags::ValidFlags;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Flags;
+  OS.flush();
+
+  std::string Expected = "RootFlags("
+                         "AllowInputAssemblerInputLayout | "
+                         "DenyVertexShaderRootAccess | "
+                         "DenyHullShaderRootAccess | "
+                         "DenyDomainShaderRootAccess | "
+                         "DenyGeometryShaderRootAccess | "
+                         "DenyPixelShaderRootAccess | "
+                         "AllowStreamOutput | "
+                         "LocalRootSignature | "
+                         "DenyAmplificationShaderRootAccess | "
+                         "DenyMeshShaderRootAccess | "
+                         "CBVSRVUAVHeapDirectlyIndexed | "
+                         "SamplerHeapDirectlyIndexed)";
+
+  EXPECT_EQ(Out, Expected);
+}
+
 } // namespace

From 30b16ec3415e7ddb597d096f818d011b1b4e6a63 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 16 Jun 2025 22:54:25 +0100
Subject: [PATCH 0559/1322] [VPlan] Simplify trivial VPFirstOrderRecurrencePHI
 recipes.

VPFirstOrderRecurrencePHIRecipes where the incoming values are the same
can be simplified and removed.

Fixes https://github.com/llvm/llvm-project/issues/144212.

The new test is added together with other related tests from
first-order-recurrence.ll
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   6 +
 ...irst-order-recurrence-dead-instructions.ll | 270 +++++++++++
 .../LoopVectorize/first-order-recurrence.ll   | 452 ------------------
 3 files changed, 276 insertions(+), 452 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 44a72755b9cf..05a0e15f9a19 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1141,6 +1141,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
+    if (Phi->getOperand(0) == Phi->getOperand(1))
+      Def->replaceAllUsesWith(Phi->getOperand(0));
+    return;
+  }
+
   // Some simplifications can only be applied after unrolling. Perform them
   // below.
   if (!Plan->isUnrolled())
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
new file mode 100644
index 000000000000..d98cd45cb634
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
@@ -0,0 +1,270 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s
+
+; Test case for https://github.com/llvm/llvm-project/issues/144212.
+define i8 @recurrence_phi_with_same_incoming_values_after_simplifications(i8 %for.start, ptr %dst) {
+; CHECK-LABEL: define i8 @recurrence_phi_with_same_incoming_values_after_simplifications(
+; CHECK-SAME: i8 [[FOR_START:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    br i1 true, label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[FOR_START]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLAT]], <4 x i8> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
+; CHECK-NEXT:    store <4 x i8> [[TMP0]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    store <4 x i8> [[TMP0]], ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], -8
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ -7, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ], [ 1, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[FOR_START]], %[[ENTRY]] ], [ [[FOR_START]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR:%.*]] = phi i8 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_NEXT]] = and i8 [[FOR_START]], -1
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT:    store i8 [[FOR]], ptr [[GEP_DST]], align 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_NEXT_LCSSA:%.*]] = phi i8 [ [[FOR_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i8 [[FOR_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
+  %for = phi i8 [ %for.start, %entry ], [ %for.next, %loop ]
+  %for.next = and i8 %for.start, -1
+  %iv.next = add i32 %iv, 1
+  %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %iv
+  store i8 %for, ptr %gep.dst
+  %ec = icmp eq i32 %iv.next, 0
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i8 %for.next
+}
+
+; %vec.dead will be marked as dead instruction in the vector loop and no recipe
+; will be created for it. Make sure a valid sink target is used.
+define i32 @sink_after_dead_inst(ptr %A.ptr) {
+; CHECK-LABEL: define i32 @sink_after_dead_inst(
+; CHECK-SAME: ptr [[A_PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i16> [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP3]], i32 4
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP4]], align 4
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
+; CHECK-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
+; CHECK-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
+; CHECK-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
+; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
+; CHECK-NEXT:    store i32 0, ptr [[A_GEP]], align 4
+; CHECK-NEXT:    br i1 [[VEC_DEAD]], label %[[FOR_END]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[FOR_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
+  %for = phi i32 [ 0, %entry ], [ %for.prev, %loop ]
+  %cmp = icmp eq i32 %for, 15
+  %C = icmp eq i1 %cmp, true
+  %vec.dead = and i1 %C, 1
+  %iv.next = add i16 %iv, 1
+  %B1 = or i16 %iv.next, %iv.next
+  %B3 = and i1 %cmp, %C
+  %for.prev = zext i16 %B1 to i32
+
+  %ext = zext i1 %B3 to i32
+  %A.gep = getelementptr i32, ptr %A.ptr, i16 %iv
+  store i32 0, ptr %A.gep
+  br i1 %vec.dead, label %for.end, label %loop
+
+for.end:
+  ret i32 %for
+}
+
+; Dead instructions, like the exit condition are not part of the actual VPlan
+; and do not need to be sunk. PR44634.
+define void @sink_dead_inst(ptr %a) {
+; CHECK-LABEL: define void @sink_dead_inst(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; CHECK-NEXT:    [[TMP4]] = add <4 x i16> [[TMP1]], splat (i16 5)
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i16> [[TMP5]], splat (i16 10)
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <4 x i16> [[TMP6]], splat (i16 10)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[A]], i16 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP9]], i32 4
+; CHECK-NEXT:    store <4 x i16> [[TMP7]], ptr [[TMP10]], align 2
+; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP11]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT2:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT1]], %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_COND:.*]]
+; CHECK:       [[FOR_COND]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT2]], %[[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
+; CHECK-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
+; CHECK-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
+  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
+  %rec.2 = phi i32 [ -27, %entry ], [ %rec.2.prev, %for.cond ]
+  %use.rec.1 = sub i16 %rec.1, 10
+  %cmp = icmp eq i32 %rec.2, 15
+  %iv.next = add i16 %iv, 1
+  %rec.2.prev = zext i16 %iv.next to i32
+  %rec.1.prev = add i16 %iv.next, 5
+  %gep = getelementptr i16, ptr %a, i16 %iv
+  store i16 %use.rec.1, ptr %gep
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.end:
+  ret void
+}
+
+; %rec.1 only has %use.rec.1 as use, which can be removed. This enables %rec.1
+; to be removed also.
+define void @unused_recurrence(ptr %a) {
+; CHECK-LABEL: define void @unused_recurrence(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_COND:.*]]
+; CHECK:       [[FOR_COND]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
+  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
+  %use.rec.1 = sub i16 %rec.1, 10
+  %iv.next= add i16 %iv, 1
+  %rec.1.prev = add i16 %iv.next, 5
+  %cmp = icmp eq i16 %iv, 1000
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.end:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 13dc53559d28..9be26d4247a3 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -2473,177 +2473,6 @@ for.end12.loopexit:                               ; preds = %cond.end
   ret void
 }
 
-; Dead instructions, like the exit condition are not part of the actual VPlan
-; and do not need to be sunk. PR44634.
-define void @sink_dead_inst(ptr %a) {
-; UNROLL-NO-IC-LABEL: @sink_dead_inst(
-; UNROLL-NO-IC-NEXT:  entry:
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-IC:       vector.ph:
-; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP1]], splat (i16 5)
-; UNROLL-NO-IC-NEXT:    [[TMP5]] = add <4 x i16> [[TMP2]], splat (i16 5)
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = sub <4 x i16> [[TMP6]], splat (i16 10)
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = sub <4 x i16> [[TMP7]], splat (i16 10)
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[TMP10]], i32 4
-; UNROLL-NO-IC-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP11]], align 2
-; UNROLL-NO-IC-NEXT:    store <4 x i16> [[TMP9]], ptr [[TMP12]], align 2
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND:%.*]]
-; UNROLL-NO-IC:       for.cond:
-; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
-; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-IC-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
-; UNROLL-NO-IC-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-IC-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
-; UNROLL-NO-IC-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; UNROLL-NO-IC-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
-; UNROLL-NO-IC:       for.end:
-; UNROLL-NO-IC-NEXT:    ret void
-;
-; UNROLL-NO-VF-LABEL: @sink_dead_inst(
-; UNROLL-NO-VF-NEXT:  entry:
-; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-VF:       vector.ph:
-; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[TMP2]], 5
-; UNROLL-NO-VF-NEXT:    [[TMP6]] = add i16 [[TMP3]], 5
-; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sub i16 [[VECTOR_RECUR]], 10
-; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = sub i16 [[TMP5]], 10
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[A]], i16 [[TMP1]]
-; UNROLL-NO-VF-NEXT:    store i16 [[TMP7]], ptr [[TMP9]], align 2
-; UNROLL-NO-VF-NEXT:    store i16 [[TMP8]], ptr [[TMP10]], align 2
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; UNROLL-NO-VF:       middle.block:
-; UNROLL-NO-VF-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 15, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT2:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND:%.*]]
-; UNROLL-NO-VF:       for.cond:
-; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT2]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
-; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-VF-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
-; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-VF-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
-; UNROLL-NO-VF-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
-; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    ret void
-;
-; SINK-AFTER-LABEL: @sink_dead_inst(
-; SINK-AFTER-NEXT:  entry:
-; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SINK-AFTER:       vector.ph:
-; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-; SINK-AFTER-NEXT:    [[TMP3]] = add <4 x i16> [[TMP1]], splat (i16 5)
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = sub <4 x i16> [[TMP4]], splat (i16 10)
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
-; SINK-AFTER-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0
-; SINK-AFTER-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; SINK-AFTER-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
-; SINK-AFTER-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; SINK-AFTER:       middle.block:
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
-; SINK-AFTER-NEXT:    br label [[FOR_COND:%.*]]
-; SINK-AFTER:       for.cond:
-; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
-; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; SINK-AFTER-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
-; SINK-AFTER-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; SINK-AFTER-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
-; SINK-AFTER-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; SINK-AFTER-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
-; SINK-AFTER:       for.end:
-; SINK-AFTER-NEXT:    ret void
-;
-entry:
-  br label %for.cond
-
-for.cond:
-  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
-  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
-  %rec.2 = phi i32 [ -27, %entry ], [ %rec.2.prev, %for.cond ]
-  %use.rec.1 = sub i16 %rec.1, 10
-  %cmp = icmp eq i32 %rec.2, 15
-  %iv.next = add i16 %iv, 1
-  %rec.2.prev = zext i16 %iv.next to i32
-  %rec.1.prev = add i16 %iv.next, 5
-  %gep = getelementptr i16, ptr %a, i16 %iv
-  store i16 %use.rec.1, ptr %gep
-  br i1 %cmp, label %for.end, label %for.cond
-
-for.end:
-  ret void
-}
-
 define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-IC-LABEL: @sink_into_replication_region(
 ; UNROLL-NO-IC-NEXT:  bb:
@@ -3464,287 +3293,6 @@ bb:
   br i1 %var9, label %bb1, label %bb2, !prof !2
 }
 
-; %vec.dead will be marked as dead instruction in the vector loop and no recipe
-; will be created for it. Make sure a valid sink target is used.
-define i32 @sink_after_dead_inst(ptr %A.ptr) {
-; UNROLL-NO-IC-LABEL: @sink_after_dead_inst(
-; UNROLL-NO-IC-NEXT:  entry:
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-IC:       vector.ph:
-; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 4
-; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
-; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP6]], align 4
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
-; UNROLL-NO-IC:       loop:
-; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; UNROLL-NO-IC-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
-; UNROLL-NO-IC-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-IC-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; UNROLL-NO-IC-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; UNROLL-NO-IC-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
-; UNROLL-NO-IC-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
-; UNROLL-NO-IC-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; UNROLL-NO-IC-NEXT:    store i32 0, ptr [[A_GEP]], align 4
-; UNROLL-NO-IC-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
-; UNROLL-NO-IC:       for.end:
-; UNROLL-NO-IC-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT:    ret i32 [[FOR_LCSSA]]
-;
-; UNROLL-NO-VF-LABEL: @sink_after_dead_inst(
-; UNROLL-NO-VF-NEXT:  entry:
-; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-VF:       vector.ph:
-; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = or i16 [[TMP5]], [[TMP5]]
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]]
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP3]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP4]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
-; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP8]], align 4
-; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP9]], align 4
-; UNROLL-NO-VF-NEXT:    [[TMP7]] = add nuw i32 [[VECTOR_RECUR]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP7]], 16
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; UNROLL-NO-VF:       middle.block:
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-VF-NEXT:    br label [[LOOP:%.*]]
-; UNROLL-NO-VF:       loop:
-; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-VF-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; UNROLL-NO-VF-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
-; UNROLL-NO-VF-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-VF-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; UNROLL-NO-VF-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; UNROLL-NO-VF-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
-; UNROLL-NO-VF-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
-; UNROLL-NO-VF-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[A_GEP]], align 4
-; UNROLL-NO-VF-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
-; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-VF-NEXT:    ret i32 [[FOR_LCSSA]]
-;
-; SINK-AFTER-LABEL: @sink_after_dead_inst(
-; SINK-AFTER-NEXT:  entry:
-; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SINK-AFTER:       vector.ph:
-; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; SINK-AFTER-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; SINK-AFTER-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; SINK-AFTER-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; SINK-AFTER:       middle.block:
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SINK-AFTER-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; SINK-AFTER-NEXT:    br label [[LOOP:%.*]]
-; SINK-AFTER:       loop:
-; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; SINK-AFTER-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
-; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; SINK-AFTER-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
-; SINK-AFTER-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; SINK-AFTER-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; SINK-AFTER-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; SINK-AFTER-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
-; SINK-AFTER-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
-; SINK-AFTER-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; SINK-AFTER-NEXT:    store i32 0, ptr [[A_GEP]], align 4
-; SINK-AFTER-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
-; SINK-AFTER:       for.end:
-; SINK-AFTER-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
-; SINK-AFTER-NEXT:    ret i32 [[FOR_LCSSA]]
-;
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
-  %for = phi i32 [ 0, %entry ], [ %for.prev, %loop ]
-  %cmp = icmp eq i32 %for, 15
-  %C = icmp eq i1 %cmp, true
-  %vec.dead = and i1 %C, 1
-  %iv.next = add i16 %iv, 1
-  %B1 = or i16 %iv.next, %iv.next
-  %B3 = and i1 %cmp, %C
-  %for.prev = zext i16 %B1 to i32
-
-  %ext = zext i1 %B3 to i32
-  %A.gep = getelementptr i32, ptr %A.ptr, i16 %iv
-  store i32 0, ptr %A.gep
-  br i1 %vec.dead, label %for.end, label %loop
-
-for.end:
-  ret i32 %for
-}
-
-; %rec.1 only has %use.rec.1 as use, which can be removed. This enables %rec.1
-; to be removed also.
-define void @unused_recurrence(ptr %a) {
-; UNROLL-NO-IC-LABEL: @unused_recurrence(
-; UNROLL-NO-IC-NEXT:  entry:
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-IC:       vector.ph:
-; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
-; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND:%.*]]
-; UNROLL-NO-IC:       for.cond:
-; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-IC-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
-; UNROLL-NO-IC-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
-; UNROLL-NO-IC:       for.end:
-; UNROLL-NO-IC-NEXT:    ret void
-;
-; UNROLL-NO-VF-LABEL: @unused_recurrence(
-; UNROLL-NO-VF-NEXT:  entry:
-; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-VF:       vector.ph:
-; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
-; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 5
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
-; UNROLL-NO-VF:       middle.block:
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND:%.*]]
-; UNROLL-NO-VF:       for.cond:
-; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
-; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    ret void
-;
-; SINK-AFTER-LABEL: @unused_recurrence(
-; SINK-AFTER-NEXT:  entry:
-; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SINK-AFTER:       vector.ph:
-; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
-; SINK-AFTER-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
-; SINK-AFTER:       middle.block:
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-; SINK-AFTER-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; SINK-AFTER-NEXT:    br label [[FOR_COND:%.*]]
-; SINK-AFTER:       for.cond:
-; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; SINK-AFTER-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
-; SINK-AFTER-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
-; SINK-AFTER:       for.end:
-; SINK-AFTER-NEXT:    ret void
-;
-entry:
-  br label %for.cond
-
-for.cond:
-  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
-  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
-  %use.rec.1 = sub i16 %rec.1, 10
-  %iv.next= add i16 %iv, 1
-  %rec.1.prev = add i16 %iv.next, 5
-  %cmp = icmp eq i16 %iv, 1000
-  br i1 %cmp, label %for.end, label %for.cond
-
-for.end:
-  ret void
-}
-
 ; Test case for https://github.com/llvm/llvm-project/issues/95520.
 define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-LABEL: @recurence_uniform_load(

From 4bcf9732c7361b3ea5208ced592245e0302fc7a2 Mon Sep 17 00:00:00 2001
From: Morris Hafner <mmha@users.noreply.github.com>
Date: Mon, 16 Jun 2025 23:03:49 +0100
Subject: [PATCH 0560/1322] [CIR] Add Support For Library Builtins (#143984)

This patch upstreams support for builtins that map to a standard library
function. Examples would be abort() and printf().

It also fixes a minor issue with the errorNYI for all remaining
unimplemented builtins using the mlir::Location instead of the clang AST
SourceLocation.
---
 clang/include/clang/CIR/MissingFeatures.h |  1 +
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp   | 39 +++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenModule.h      |  4 ++
 clang/test/CIR/CodeGen/builtin_call.cpp   | 18 +++++++
 clang/test/CIR/CodeGen/builtin_printf.cpp | 65 +++++++++++++++++++++++
 5 files changed, 125 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/builtin_printf.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 13ddc77835fb..3dc28e6f2e5b 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -236,6 +236,7 @@ struct MissingFeatures {
   static bool runCleanupsScope() { return false; }
   static bool lowerAggregateLoadStore() { return false; }
   static bool dataLayoutTypeAllocSize() { return false; }
+  static bool asmLabelAttr() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index c59ac78210f8..19fac00ab873 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -20,10 +20,18 @@
 #include "mlir/Support/LLVM.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/GlobalDecl.h"
+#include "clang/CIR/MissingFeatures.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace clang;
 using namespace clang::CIRGen;
+using namespace llvm;
+
+static RValue emitLibraryCall(CIRGenFunction &cgf, const FunctionDecl *fd,
+                              const CallExpr *e, mlir::Operation *calleeValue) {
+  CIRGenCallee callee = CIRGenCallee::forDirect(calleeValue, GlobalDecl(fd));
+  return cgf.emitCall(e->getCallee()->getType(), callee, e, ReturnValueSlot());
+}
 
 RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
                                        const CallExpr *e,
@@ -49,7 +57,34 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     }
   }
 
-  mlir::Location loc = getLoc(e->getExprLoc());
-  cgm.errorNYI(loc, "non constant foldable builtin calls");
+  const FunctionDecl *fd = gd.getDecl()->getAsFunction();
+
+  // If this is an alias for a lib function (e.g. __builtin_sin), emit
+  // the call using the normal call path, but using the unmangled
+  // version of the function name.
+  if (getContext().BuiltinInfo.isLibFunction(builtinID))
+    return emitLibraryCall(*this, fd, e,
+                           cgm.getBuiltinLibFunction(fd, builtinID));
+
+  cgm.errorNYI(e->getSourceRange(), "unimplemented builtin call");
   return getUndefRValue(e->getType());
 }
+
+/// Given a builtin id for a function like "__builtin_fabsf", return a Function*
+/// for "fabsf".
+cir::FuncOp CIRGenModule::getBuiltinLibFunction(const FunctionDecl *fd,
+                                                unsigned builtinID) {
+  assert(astContext.BuiltinInfo.isLibFunction(builtinID));
+
+  // Get the name, skip over the __builtin_ prefix (if necessary). We may have
+  // to build this up so provide a small stack buffer to handle the vast
+  // majority of names.
+  llvm::SmallString<64> name;
+
+  assert(!cir::MissingFeatures::asmLabelAttr());
+  name = astContext.BuiltinInfo.getName(builtinID).substr(10);
+
+  GlobalDecl d(fd);
+  mlir::Type type = convertType(fd->getType());
+  return getOrCreateCIRFunction(name, type, d, /*forVTable=*/false);
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 03606dba200f..0ea2d9f9c822 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -301,6 +301,10 @@ public:
                                 cir::FuncType funcType,
                                 const clang::FunctionDecl *funcDecl);
 
+  /// Given a builtin id for a function like "__builtin_fabsf", return a
+  /// Function* for "fabsf".
+  cir::FuncOp getBuiltinLibFunction(const FunctionDecl *fd, unsigned builtinID);
+
   mlir::IntegerAttr getSize(CharUnits size) {
     return builder.getSizeFromCharUnits(size);
   }
diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp
index 2706ea7f8f85..322c13c8f081 100644
--- a/clang/test/CIR/CodeGen/builtin_call.cpp
+++ b/clang/test/CIR/CodeGen/builtin_call.cpp
@@ -76,3 +76,21 @@ float constant_fp_builtin_single() {
 // OGCG: define {{.*}}float @_Z26constant_fp_builtin_singlev()
 // OGCG: ret float 0x3FB99999A0000000
 // OGCG: }
+
+void library_builtins() {
+  __builtin_printf(nullptr);
+  __builtin_abort();
+}
+
+// CIR: cir.func @_Z16library_builtinsv() {
+// CIR: %[[NULL:.+]] = cir.const #cir.ptr<null> : !cir.ptr<!s8i>
+// CIR: cir.call @printf(%[[NULL]]) : (!cir.ptr<!s8i>) -> !s32i
+// CIR: cir.call @abort() : () -> ()
+
+// LLVM: define void @_Z16library_builtinsv()
+// LLVM: call i32 (ptr, ...) @printf(ptr null)
+// LLVM: call void @abort()
+
+// OGCG: define dso_local void @_Z16library_builtinsv()
+// OGCG: call i32 (ptr, ...) @printf(ptr noundef null)
+// OGCG: call void @abort()
diff --git a/clang/test/CIR/CodeGen/builtin_printf.cpp b/clang/test/CIR/CodeGen/builtin_printf.cpp
new file mode 100644
index 000000000000..366e474c2b09
--- /dev/null
+++ b/clang/test/CIR/CodeGen/builtin_printf.cpp
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+// CIR: cir.global "private" cir_private dsolocal @".str" = #cir.const_array<"%s\00" : !cir.array<!s8i x 3>> : !cir.array<!s8i x 3> 
+// CIR: cir.global "private" cir_private dsolocal @".str.1" = #cir.const_array<"%s %d\0A\00" : !cir.array<!s8i x 7>> : !cir.array<!s8i x 7>
+// LLVM: @.str = private global [3 x i8] c"%s\00"
+// LLVM: @.str.1 = private global [7 x i8] c"%s %d\0A\00"
+// OGCG: @.str = private unnamed_addr constant [3 x i8] c"%s\00"
+// OGCG: @.str.1 = private unnamed_addr constant [7 x i8] c"%s %d\0A\00"
+
+void func(char const * const str, int i) {
+  __builtin_printf(nullptr);
+  __builtin_printf("%s", str);
+  __builtin_printf("%s %d\n", str, i);
+}
+
+// CIR: cir.func @printf(!cir.ptr<!s8i>, ...) -> !s32i
+
+// CIR: cir.func @_Z4funcPKci(%[[arg0:.+]]: !cir.ptr<!s8i>{{.*}}, %[[arg1:.+]]: !s32i{{.*}}) {
+// CIR:   %[[str_ptr:.+]] = cir.alloca !cir.ptr<!s8i>, !cir.ptr<!cir.ptr<!s8i>>, ["str", init, const]
+// CIR:   %[[i_ptr:.+]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+// CIR:   cir.store %[[arg0]], %[[str_ptr]] : !cir.ptr<!s8i>, !cir.ptr<!cir.ptr<!s8i>>
+// CIR:   cir.store %[[arg1]], %[[i_ptr]] : !s32i, !cir.ptr<!s32i>
+// CIR:   %[[null_ptr:.+]] = cir.const #cir.ptr<null> : !cir.ptr<!s8i>
+// CIR:   %[[printf_result1:.+]] = cir.call @printf(%[[null_ptr]]) : (!cir.ptr<!s8i>) -> !s32i
+// CIR:   %[[str_fmt_global:.+]] = cir.get_global @".str" : !cir.ptr<!cir.array<!s8i x 3>>
+// CIR:   %[[str_fmt_ptr:.+]] = cir.cast(array_to_ptrdecay, %[[str_fmt_global]] : !cir.ptr<!cir.array<!s8i x 3>>), !cir.ptr<!s8i>
+// CIR:   %[[str_val:.+]] = cir.load{{.*}} %[[str_ptr]] : !cir.ptr<!cir.ptr<!s8i>>, !cir.ptr<!s8i>
+// CIR:   %[[printf_result2:.+]] = cir.call @printf(%[[str_fmt_ptr]], %[[str_val]]) : (!cir.ptr<!s8i>, !cir.ptr<!s8i>) -> !s32i
+// CIR:   %[[full_fmt_global:.+]] = cir.get_global @".str.1" : !cir.ptr<!cir.array<!s8i x 7>>
+// CIR:   %[[full_fmt_ptr:.+]] = cir.cast(array_to_ptrdecay, %[[full_fmt_global]] : !cir.ptr<!cir.array<!s8i x 7>>), !cir.ptr<!s8i>
+// CIR:   %[[str_val2:.+]] = cir.load{{.*}} %[[str_ptr]] : !cir.ptr<!cir.ptr<!s8i>>, !cir.ptr<!s8i>
+// CIR:   %[[i_val:.+]] = cir.load{{.*}} %[[i_ptr]] : !cir.ptr<!s32i>, !s32i
+// CIR:   %[[printf_result3:.+]] = cir.call @printf(%[[full_fmt_ptr]], %[[str_val2]], %[[i_val]]) : (!cir.ptr<!s8i>, !cir.ptr<!s8i>, !s32i) -> !s32i
+// CIR:   cir.return
+
+// LLVM: define void @_Z4funcPKci(ptr %[[arg0:.+]], i32 %[[arg1:.+]])
+// LLVM:   %[[str_ptr:.+]] = alloca ptr
+// LLVM:   %[[i_ptr:.+]] = alloca i32
+// LLVM:   store ptr %[[arg0]], ptr %[[str_ptr]]{{.*}}
+// LLVM:   store i32 %[[arg1]], ptr %[[i_ptr]]{{.*}}
+// LLVM:   %[[printf_result1:.+]] = call i32 (ptr, ...) @printf(ptr null)
+// LLVM:   %[[str_val:.+]] = load ptr, ptr %[[str_ptr]]{{.*}}
+// LLVM:   %[[printf_result2:.+]] = call i32 (ptr, ...) @printf(ptr @.str, ptr %[[str_val]])
+// LLVM:   %[[str_val2:.+]] = load ptr, ptr %[[str_ptr]]{{.*}}
+// LLVM:   %[[i_val:.+]] = load i32, ptr %[[i_ptr]]{{.*}}
+// LLVM:   %[[printf_result3:.+]] = call i32 (ptr, ...) @printf(ptr @.str.1, ptr %[[str_val2]], i32 %[[i_val]])
+// LLVM:   ret void
+
+// OGCG: define dso_local void @_Z4funcPKci(ptr noundef %[[arg0:.+]], i32 noundef %[[arg1:.+]])
+// OGCG:   %[[str_ptr:.+]] = alloca ptr
+// OGCG:   %[[i_ptr:.+]] = alloca i32
+// OGCG:   store ptr %[[arg0]], ptr %[[str_ptr]]{{.*}}
+// OGCG:   store i32 %[[arg1]], ptr %[[i_ptr]]{{.*}}
+// OGCG:   %[[printf_result1:.+]] = call i32 (ptr, ...) @printf(ptr noundef null)
+// OGCG:   %[[str_val:.+]] = load ptr, ptr %[[str_ptr]]{{.*}}
+// OGCG:   %[[printf_result2:.+]] = call i32 (ptr, ...) @printf(ptr noundef @.str, ptr noundef %[[str_val]])
+// OGCG:   %[[str_val2:.+]] = load ptr, ptr %[[str_ptr]]{{.*}}
+// OGCG:   %[[i_val:.+]] = load i32, ptr %[[i_ptr]]{{.*}}
+// OGCG:   %[[printf_result3:.+]] = call i32 (ptr, ...) @printf(ptr noundef @.str.1, ptr noundef %[[str_val2]], i32 noundef %[[i_val]])
+// OGCG:   ret void

From 2488f26d15e7e12aef9ead3fcb2d1b6da51812fb Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao@microsoft.com>
Date: Mon, 16 Jun 2025 15:06:41 -0700
Subject: [PATCH 0561/1322] [win][x64] Unwind v2 3/n: Add support for requiring
 unwind v2 to be used (equivalent to MSVC's /d2epilogunwindrequirev2)
 (#143577)

#129142 added support for emitting Windows x64 unwind v2 information,
but it was "best effort". If any function didn't follow the requirements
for v2 it was silently downgraded to v1.

There are some parts of Windows (specifically kernel-mode code running
on Xbox) that require v2, hence we need the ability to fail the
compilation if v2 can't be used.

This change also adds a heuristic to check if there might be too many
unwind codes, it's currently conservative (i.e., assumes that certain
prolog instructions will use the maximum number of unwind codes).

Future work: attempting to chain unwind info across multiple tables if
there are too many unwind codes due to epilogs and adding a heuristic to
detect if an epilog will be too far from the end of the function.
---
 clang/include/clang/Basic/CodeGenOptions.def  |   6 +-
 clang/include/clang/Driver/Options.td         |  17 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |   6 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |   9 +-
 clang/test/CodeGen/epilog-unwind.c            |  10 +-
 clang/test/Driver/cl-options.c                |   6 +-
 llvm/include/llvm/IR/Module.h                 |   4 +
 llvm/include/llvm/Support/CodeGen.h           |   9 +
 llvm/lib/IR/Module.cpp                        |   7 +
 llvm/lib/Target/X86/X86WinEHUnwindV2.cpp      | 152 +++++++--
 .../CodeGen/X86/win64-eh-unwindv2-errors.mir  | 318 ++++++++++++++++++
 .../win64-eh-unwindv2-too-many-epilogs.mir    |  94 ++++++
 12 files changed, 595 insertions(+), 43 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
 create mode 100644 llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir

diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 2a30ff11464d..e5566a540dc6 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -483,8 +483,10 @@ CODEGENOPT(StaticClosure, 1, 0)
 /// Assume that UAVs/SRVs may alias
 CODEGENOPT(ResMayAlias, 1, 0)
 
-/// Enables unwind v2 (epilog) information for x64 Windows.
-CODEGENOPT(WinX64EHUnwindV2, 1, 0)
+/// Controls how unwind v2 (epilog) information should be generated for x64
+/// Windows.
+ENUM_CODEGENOPT(WinX64EHUnwindV2, llvm::WinX64EHUnwindV2Mode,
+                2, llvm::WinX64EHUnwindV2Mode::Disabled)
 
 /// FIXME: Make DebugOptions its own top-level .def file.
 #include "DebugOptions.def"
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1b07deb4a848..72d564e1ba0b 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2167,11 +2167,14 @@ defm assume_nothrow_exception_dtor: BoolFOption<"assume-nothrow-exception-dtor",
   LangOpts<"AssumeNothrowExceptionDtor">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option], "Assume that exception objects' destructors are non-throwing">,
   NegFlag<SetFalse>>;
-defm winx64_eh_unwindv2 : BoolFOption<"winx64-eh-unwindv2",
-  CodeGenOpts<"WinX64EHUnwindV2">, DefaultFalse,
-  PosFlag<SetTrue, [], [ClangOption, CC1Option], "Enable">,
-  NegFlag<SetFalse, [], [ClangOption], "Disable">,
-  BothFlags<[], [ClangOption], " unwind v2 (epilog) information for x64 Windows">>;
+def winx64_eh_unwindv2
+    : Joined<["-"], "fwinx64-eh-unwindv2=">, Group<f_Group>,
+    Visibility<[ClangOption, CC1Option]>,
+      HelpText<"Generate unwind v2 (epilog) information for x64 Windows">,
+      Values<"disabled,best-effort,required">,
+      NormalizedValues<["Disabled", "BestEffort", "Required"]>,
+      NormalizedValuesScope<"llvm::WinX64EHUnwindV2Mode">,
+      MarshallingInfoEnum<CodeGenOpts<"WinX64EHUnwindV2">, "Disabled">;
 def fexcess_precision_EQ : Joined<["-"], "fexcess-precision=">, Group<f_Group>,
   Visibility<[ClangOption, CLOption]>,
   HelpText<"Allows control over excess precision on targets where native "
@@ -8972,7 +8975,9 @@ def _SLASH_volatile_Group : OptionGroup<"</volatile group>">,
   Group<cl_compile_Group>;
 
 def _SLASH_d2epilogunwind : CLFlag<"d2epilogunwind">,
-  HelpText<"Enable unwind v2 (epilog) information for x64 Windows">;
+  HelpText<"Best effort generate unwind v2 (epilog) information for x64 Windows">;
+def _SLASH_d2epilogunwindrequirev2 : CLFlag<"d2epilogunwindrequirev2">,
+  HelpText<"Require generation of unwind v2 (epilog) information for x64 Windows">;
 def _SLASH_EH : CLJoined<"EH">, HelpText<"Set exception handling model">;
 def _SLASH_EP : CLFlag<"EP">,
   HelpText<"Disable linemarker output and preprocess to stdout">;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 06c0e1f8afe1..c27168e4c4bf 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1319,8 +1319,10 @@ void CodeGenModule::Release() {
                               1);
 
   // Enable unwind v2 (epilog).
-  if (CodeGenOpts.WinX64EHUnwindV2)
-    getModule().addModuleFlag(llvm::Module::Warning, "winx64-eh-unwindv2", 1);
+  if (CodeGenOpts.getWinX64EHUnwindV2() != llvm::WinX64EHUnwindV2Mode::Disabled)
+    getModule().addModuleFlag(
+        llvm::Module::Warning, "winx64-eh-unwindv2",
+        static_cast<unsigned>(CodeGenOpts.getWinX64EHUnwindV2()));
 
   // Indicate whether this Module was compiled with -fopenmp
   if (getLangOpts().OpenMP && !getLangOpts().OpenMPSimd)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 8556bcadf091..7dfed3a3356b 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7360,8 +7360,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   // Unwind v2 (epilog) information for x64 Windows.
-  Args.addOptInFlag(CmdArgs, options::OPT_fwinx64_eh_unwindv2,
-                    options::OPT_fno_winx64_eh_unwindv2);
+  Args.AddLastArg(CmdArgs, options::OPT_winx64_eh_unwindv2);
 
   // C++ "sane" operator new.
   Args.addOptOutFlag(CmdArgs, options::OPT_fassume_sane_operator_new,
@@ -8418,8 +8417,10 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
     CmdArgs.push_back("-fms-kernel");
 
   // Unwind v2 (epilog) information for x64 Windows.
-  if (Args.hasArg(options::OPT__SLASH_d2epilogunwind))
-    CmdArgs.push_back("-fwinx64-eh-unwindv2");
+  if (Args.hasArg(options::OPT__SLASH_d2epilogunwindrequirev2))
+    CmdArgs.push_back("-fwinx64-eh-unwindv2=required");
+  else if (Args.hasArg(options::OPT__SLASH_d2epilogunwind))
+    CmdArgs.push_back("-fwinx64-eh-unwindv2=best-effort");
 
   for (const Arg *A : Args.filtered(options::OPT__SLASH_guard)) {
     StringRef GuardArgs = A->getValue();
diff --git a/clang/test/CodeGen/epilog-unwind.c b/clang/test/CodeGen/epilog-unwind.c
index 991ff09fb37c..b2f7497b455b 100644
--- a/clang/test/CodeGen/epilog-unwind.c
+++ b/clang/test/CodeGen/epilog-unwind.c
@@ -1,9 +1,11 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s -check-prefix=DISABLED
-// RUN: %clang_cc1 -fwinx64-eh-unwindv2 -emit-llvm %s -o - | FileCheck %s -check-prefix=ENABLED
-// RUN: %clang -fwinx64-eh-unwindv2 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=ENABLED
-// RUN: %clang -fno-winx64-eh-unwindv2 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=DISABLED
+// RUN: %clang_cc1 -fwinx64-eh-unwindv2=disabled -emit-llvm %s -o - | FileCheck %s -check-prefix=DISABLED
+// RUN: %clang_cc1 -fwinx64-eh-unwindv2=best-effort -emit-llvm %s -o - | FileCheck %s -check-prefix=BESTEFFORT
+// RUN: %clang_cc1 -fwinx64-eh-unwindv2=required -emit-llvm %s -o - | FileCheck %s -check-prefix=REQUIRED
+// RUN: %clang -fwinx64-eh-unwindv2=best-effort -S -emit-llvm %s -o - | FileCheck %s -check-prefix=BESTEFFORT
 
 void f(void) {}
 
-// ENABLED: !"winx64-eh-unwindv2", i32 1}
+// BESTEFFORT: !"winx64-eh-unwindv2", i32 1}
+// REQUIRED: !"winx64-eh-unwindv2", i32 2}
 // DISABLED-NOT: "winx64-eh-unwindv2"
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 0535285862b9..eb079895a0a8 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -821,7 +821,11 @@
 // ARM64EC_OVERRIDE: warning: /arm64EC has been overridden by specified target: x86_64-pc-windows-msvc; option ignored
 
 // RUN: %clang_cl /d2epilogunwind /c -### -- %s 2>&1 | FileCheck %s --check-prefix=EPILOGUNWIND
-// EPILOGUNWIND: -fwinx64-eh-unwindv2
+// EPILOGUNWIND: -fwinx64-eh-unwindv2=best-effort
+
+// RUN: %clang_cl /d2epilogunwindrequirev2 /c -### -- %s 2>&1 | FileCheck %s --check-prefix=EPILOGUNWINDREQUIREV2
+// RUN: %clang_cl /d2epilogunwindrequirev2 /d2epilogunwind /c -### -- %s 2>&1 | FileCheck %s --check-prefix=EPILOGUNWINDREQUIREV2
+// EPILOGUNWINDREQUIREV2: -fwinx64-eh-unwindv2=require
 
 // RUN: %clang_cl /funcoverride:override_me1 /funcoverride:override_me2 /c -### -- %s 2>&1 | FileCheck %s --check-prefix=FUNCOVERRIDE
 // FUNCOVERRIDE: -loader-replaceable-function=override_me1
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index f4420f460741..a99937a90cbb 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -1041,6 +1041,10 @@ public:
 
   /// Returns target-abi from MDString, null if target-abi is absent.
   StringRef getTargetABIFromMD();
+
+  /// Get how unwind v2 (epilog) information should be generated for x64
+  /// Windows.
+  WinX64EHUnwindV2Mode getWinX64EHUnwindV2Mode() const;
 };
 
 /// Given "llvm.used" or "llvm.compiler.used" as a global name, collect the
diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h
index 0e42789ba932..48745f7f4d2a 100644
--- a/llvm/include/llvm/Support/CodeGen.h
+++ b/llvm/include/llvm/Support/CodeGen.h
@@ -130,6 +130,15 @@ namespace llvm {
     Invalid = 2, ///< Not used.
   };
 
+  enum class WinX64EHUnwindV2Mode {
+    // Don't use unwind v2 (i.e., use v1).
+    Disabled = 0,
+    // Use unwind v2 here possible, otherwise fallback to v1.
+    BestEffort = 1,
+    // Use unwind v2 everywhere, otherwise raise an error.
+    Required = 2,
+  };
+
   } // namespace llvm
 
 #endif
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 37f4a72d8c20..2d31481f62c6 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -917,3 +917,10 @@ StringRef Module::getTargetABIFromMD() {
     TargetABI = TargetABIMD->getString();
   return TargetABI;
 }
+
+WinX64EHUnwindV2Mode Module::getWinX64EHUnwindV2Mode() const {
+  Metadata *MD = getModuleFlag("winx64-eh-unwindv2");
+  if (auto *CI = mdconst::dyn_extract_or_null<ConstantInt>(MD))
+    return static_cast<WinX64EHUnwindV2Mode>(CI->getZExtValue());
+  return WinX64EHUnwindV2Mode::Disabled;
+}
diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
index 2c1f9a5746e3..e9081a4ae4e7 100644
--- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
+++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Module.h"
 
 using namespace llvm;
@@ -31,6 +32,15 @@ STATISTIC(MeetsUnwindV2Criteria,
 STATISTIC(FailsUnwindV2Criteria,
           "Number of functions that fail Unwind v2 criteria");
 
+static cl::opt<unsigned> MaximumUnwindCodes(
+    "x86-wineh-unwindv2-max-unwind-codes", cl::Hidden,
+    cl::desc("Maximum number of unwind codes permitted in each unwind info."),
+    cl::init(UINT8_MAX));
+
+static cl::opt<unsigned>
+    ForceMode("x86-wineh-unwindv2-force-mode", cl::Hidden,
+              cl::desc("Overwrites the Unwind v2 mode for testing purposes."));
+
 namespace {
 
 class X86WinEHUnwindV2 : public MachineFunctionPass {
@@ -44,10 +54,12 @@ public:
   StringRef getPassName() const override { return "WinEH Unwind V2"; }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
-  bool rejectCurrentFunction() const {
-    FailsUnwindV2Criteria++;
-    return false;
-  }
+
+private:
+  /// Rejects the current function due to an internal error within LLVM.
+  static bool rejectCurrentFunctionInternalError(const MachineFunction &MF,
+                                                 WinX64EHUnwindV2Mode Mode,
+                                                 StringRef Reason);
 };
 
 enum class FunctionState {
@@ -69,8 +81,21 @@ FunctionPass *llvm::createX86WinEHUnwindV2Pass() {
   return new X86WinEHUnwindV2();
 }
 
+DebugLoc findDebugLoc(const MachineBasicBlock &MBB) {
+  for (const MachineInstr &MI : MBB)
+    if (MI.getDebugLoc())
+      return MI.getDebugLoc();
+
+  return DebugLoc::getUnknown();
+}
+
 bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
-  if (!MF.getFunction().getParent()->getModuleFlag("winx64-eh-unwindv2"))
+  WinX64EHUnwindV2Mode Mode =
+      ForceMode.getNumOccurrences()
+          ? static_cast<WinX64EHUnwindV2Mode>(ForceMode.getValue())
+          : MF.getFunction().getParent()->getWinX64EHUnwindV2Mode();
+
+  if (Mode == WinX64EHUnwindV2Mode::Disabled)
     return false;
 
   // Current state of processing the function. We'll assume that all functions
@@ -80,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
   // Prolog information.
   SmallVector<int64_t> PushedRegs;
   bool HasStackAlloc = false;
+  unsigned ApproximatePrologCodeCount = 0;
 
   // Requested changes.
   SmallVector<MachineInstr *> UnwindV2StartLocations;
@@ -99,6 +125,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
       case X86::SEH_PushReg:
         if (State != FunctionState::InProlog)
           llvm_unreachable("SEH_PushReg outside of prolog");
+        ApproximatePrologCodeCount++;
         PushedRegs.push_back(MI.getOperand(0).getImm());
         break;
 
@@ -106,9 +133,26 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
       case X86::SEH_SetFrame:
         if (State != FunctionState::InProlog)
           llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog");
+        // Assume a large alloc...
+        ApproximatePrologCodeCount +=
+            (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1;
         HasStackAlloc = true;
         break;
 
+      case X86::SEH_SaveReg:
+      case X86::SEH_SaveXMM:
+        if (State != FunctionState::InProlog)
+          llvm_unreachable("SEH_SaveXMM or SEH_SaveReg outside of prolog");
+        // Assume a big reg...
+        ApproximatePrologCodeCount += 3;
+        break;
+
+      case X86::SEH_PushFrame:
+        if (State != FunctionState::InProlog)
+          llvm_unreachable("SEH_PushFrame outside of prolog");
+        ApproximatePrologCodeCount++;
+        break;
+
       case X86::SEH_EndPrologue:
         if (State != FunctionState::InProlog)
           llvm_unreachable("SEH_EndPrologue outside of prolog");
@@ -127,10 +171,16 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
       case X86::SEH_EndEpilogue:
         if (State != FunctionState::InEpilog)
           llvm_unreachable("SEH_EndEpilogue outside of epilog");
-        if ((HasStackAlloc != HasStackDealloc) ||
-            (PoppedRegCount != PushedRegs.size()))
-          // Non-canonical epilog, reject the function.
-          return rejectCurrentFunction();
+        if (HasStackAlloc != HasStackDealloc)
+          return rejectCurrentFunctionInternalError(
+              MF, Mode,
+              "The prolog made a stack allocation, "
+              "but the epilog did not deallocate it");
+        if (PoppedRegCount != PushedRegs.size())
+          return rejectCurrentFunctionInternalError(
+              MF, Mode,
+              "The prolog pushed more registers than "
+              "the epilog popped");
 
         // If we didn't find the start location, then use the end of the
         // epilog.
@@ -145,13 +195,26 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
         if (State == FunctionState::InEpilog) {
           // If the prolog contains a stack allocation, then the first
           // instruction in the epilog must be to adjust the stack pointer.
-          if (!HasStackAlloc || HasStackDealloc || (PoppedRegCount > 0)) {
-            return rejectCurrentFunction();
-          }
+          if (!HasStackAlloc)
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is deallocating a stack "
+                "allocation, but the prolog did "
+                "not allocate one");
+          if (HasStackDealloc)
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is deallocating the stack "
+                "allocation more than once");
+          if (PoppedRegCount > 0)
+            llvm_unreachable(
+                "Should have raised an error: either popping before "
+                "deallocating or deallocating without an allocation");
+
           HasStackDealloc = true;
         } else if (State == FunctionState::FinishedEpilog)
-          // Unexpected instruction after the epilog.
-          return rejectCurrentFunction();
+          return rejectCurrentFunctionInternalError(
+              MF, Mode, "Unexpected mov or add instruction after the epilog");
         break;
 
       case X86::POP64r:
@@ -159,12 +222,22 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
           // After the stack pointer has been adjusted, the epilog must
           // POP each register in reverse order of the PUSHes in the prolog.
           PoppedRegCount++;
-          if ((HasStackAlloc != HasStackDealloc) ||
-              (PoppedRegCount > PushedRegs.size()) ||
-              (PushedRegs[PushedRegs.size() - PoppedRegCount] !=
-               MI.getOperand(0).getReg())) {
-            return rejectCurrentFunction();
-          }
+          if (HasStackAlloc != HasStackDealloc)
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "Cannot pop registers before the stack "
+                "allocation has been deallocated");
+          if (PoppedRegCount > PushedRegs.size())
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is popping more registers than the prolog pushed");
+          if (PushedRegs[PushedRegs.size() - PoppedRegCount] !=
+              MI.getOperand(0).getReg())
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is popping a registers in "
+                "a different order than the "
+                "prolog pushed them");
 
           // Unwind v2 records the size of the epilog not from where we place
           // SEH_BeginEpilogue (as that contains the instruction to adjust the
@@ -176,7 +249,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
           }
         } else if (State == FunctionState::FinishedEpilog)
           // Unexpected instruction after the epilog.
-          return rejectCurrentFunction();
+          return rejectCurrentFunctionInternalError(
+              MF, Mode, "Registers are being popped after the epilog");
         break;
 
       default:
@@ -191,7 +265,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
           if ((State == FunctionState::FinishedEpilog) ||
               (State == FunctionState::InEpilog))
             // Unknown instruction in or after the epilog.
-            return rejectCurrentFunction();
+            return rejectCurrentFunctionInternalError(
+                MF, Mode, "Unexpected instruction in or after the epilog");
         }
       }
     }
@@ -203,6 +278,25 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
     return false;
   }
 
+  MachineBasicBlock &FirstMBB = MF.front();
+  // Assume +1 for the "header" UOP_Epilog that contains the epilog size, and
+  // that we won't be able to use the "last epilog at the end of function"
+  // optimization.
+  if (ApproximatePrologCodeCount + UnwindV2StartLocations.size() + 1 >
+      static_cast<unsigned>(MaximumUnwindCodes)) {
+    if (Mode == WinX64EHUnwindV2Mode::Required)
+      MF.getFunction().getContext().diagnose(DiagnosticInfoGenericWithLoc(
+          "Windows x64 Unwind v2 is required, but the function '" +
+              MF.getName() +
+              "' has too many unwind codes. Try splitting the function or "
+              "reducing the number of places where it exits early with a tail "
+              "call.",
+          MF.getFunction(), findDebugLoc(FirstMBB)));
+
+    FailsUnwindV2Criteria++;
+    return false;
+  }
+
   MeetsUnwindV2Criteria++;
 
   // Emit the pseudo instruction that marks the start of each epilog.
@@ -212,10 +306,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
             TII->get(X86::SEH_UnwindV2Start));
   }
   // Note that the function is using Unwind v2.
-  MachineBasicBlock &FirstMBB = MF.front();
-  BuildMI(FirstMBB, FirstMBB.front(), FirstMBB.front().getDebugLoc(),
+  BuildMI(FirstMBB, FirstMBB.front(), findDebugLoc(FirstMBB),
           TII->get(X86::SEH_UnwindVersion))
       .addImm(2);
 
   return true;
 }
+
+bool X86WinEHUnwindV2::rejectCurrentFunctionInternalError(
+    const MachineFunction &MF, WinX64EHUnwindV2Mode Mode, StringRef Reason) {
+  if (Mode == WinX64EHUnwindV2Mode::Required)
+    reportFatalInternalError("Windows x64 Unwind v2 is required, but LLVM has "
+                             "generated incompatible code in function '" +
+                             MF.getName() + "': " + Reason);
+
+  FailsUnwindV2Criteria++;
+  return false;
+}
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
new file mode 100644
index 000000000000..f099d4fddcb3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
@@ -0,0 +1,318 @@
+# RUN: split-file %s %t
+
+# If we force "best effort" mode, then we won't see any errors, but we won't use
+# v2.
+# BESTEFFORT-NOT: SEH_UnwindVersion
+# BESTEFFORT-NOT: SEH_UnwindV2Start
+
+;--- alloc_no_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/alloc_no_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=ALLOC-NO-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/alloc_no_dealloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# ALLOC-NO-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'alloc_no_dealloc':
+# ALLOC-NO-DEALLOC-SAME: The prolog made a stack allocation, but the epilog did not deallocate it
+
+--- |
+  define dso_local void @alloc_no_dealloc() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            alloc_no_dealloc
+body:             |
+  bb.0.entry:
+    $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
+    frame-setup SEH_StackAlloc 40
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- missed_push.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/missed_push.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \
+# RUN:    --check-prefix=MISSED-PUSH
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/missed_push.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# MISSED-PUSH: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'missed_push':
+# MISSED-PUSH-SAME: The prolog pushed more registers than the epilog popped
+
+--- |
+  define dso_local void @missed_push() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            missed_push
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rsi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 60
+    frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 55
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- dealloc_no_alloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/dealloc_no_alloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=DEALLOC-NO-ALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/dealloc_no_alloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# DEALLOC-NO-ALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_no_alloc':
+# DEALLOC-NO-ALLOC-SAME: The epilog is deallocating a stack allocation, but the prolog did not allocate one
+
+--- |
+  define dso_local void @dealloc_no_alloc() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            dealloc_no_alloc
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- double_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/double_dealloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \
+# RUN:    --check-prefix=DOUBLE-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/double_dealloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# DOUBLE-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'double_dealloc':
+# DOUBLE-DEALLOC-SAME: The epilog is deallocating the stack allocation more than once
+
+--- |
+  define dso_local void @double_dealloc() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            double_dealloc
+body:             |
+  bb.0.entry:
+    $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
+    frame-setup SEH_StackAlloc 40
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- dealloc_after_epilog.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=DEALLOC-AFTER-EPILOG
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 \
+# RUN:    -x86-wineh-unwindv2-force-mode=1 |  FileCheck %s \
+# RUN:    --check-prefix=BESTEFFORT
+# DEALLOC-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_after_epilog':
+# DEALLOC-AFTER-EPILOG-SAME: Unexpected mov or add instruction after the epilog
+
+--- |
+  define dso_local void @dealloc_after_epilog() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            dealloc_after_epilog
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    RET64
+...
+
+;--- pop_before_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/pop_before_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=POP-BEFORE-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_before_dealloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# POP-BEFORE-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_before_dealloc':
+# POP-BEFORE-DEALLOC-SAME: Cannot pop registers before the stack allocation has been deallocated
+
+--- |
+  define dso_local void @pop_before_dealloc() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            pop_before_dealloc
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 55
+    $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
+    frame-setup SEH_StackAlloc 40
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- too_many_pops.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/too_many_pops.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \
+# RUN:    --check-prefix=TOO-MANY-POPS
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/too_many_pops.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# TOO-MANY-POPS: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'too_many_pops':
+# TOO-MANY-POPS-SAME: The epilog is popping more registers than the prolog pushed
+
+--- |
+  define dso_local void @too_many_pops() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            too_many_pops
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 55
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    $rsi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- pop_in_wrong_order.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/pop_in_wrong_order.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=POP-WRONG-ORDER
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_in_wrong_order.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# POP-WRONG-ORDER: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_in_wrong_order':
+# POP-WRONG-ORDER-SAME: The epilog is popping a registers in a different order than the prolog pushed them
+
+--- |
+  define dso_local void @pop_in_wrong_order() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            pop_in_wrong_order
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 55
+    frame-setup PUSH64r killed $rsi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 60
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    $rsi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- pop_after_epilog.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/pop_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=POP-AFTER-EPILOG
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_after_epilog.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# POP-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_after_epilog':
+# POP-AFTER-EPILOG-SAME: Registers are being popped after the epilog
+
+--- |
+  define dso_local void @pop_after_epilog() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            pop_after_epilog
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    RET64
+...
+
+;--- instr_after_epilog.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/instr_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=INSTR-AFTER-END
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/instr_after_epilog.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# INSTR-AFTER-END: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'instr_after_epilog':
+# INSTR-AFTER-END-SAME: Unexpected instruction in or after the epilog
+
+--- |
+  define dso_local void @instr_after_epilog() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            instr_after_epilog
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    $ecx = MOV32rr killed $eax
+    RET64
+...
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir b/llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir
new file mode 100644
index 000000000000..70c87ad87f79
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir
@@ -0,0 +1,94 @@
+# Require V2 and restrict the number of unwind codes to 8
+# RUN: not llc -mtriple=x86_64-pc-windows-msvc -o - %s \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-max-unwind-codes=8 \
+# RUN:    2>&1 | FileCheck %s -check-prefix=REQUIREV2
+
+# Force best-effort and restrict the number of unwind codes to 8
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-max-unwind-codes=8 \
+# RUN:    -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s -check-prefix=BESTEFFORT
+
+# Require V2, but allow the default number of unwind codes (255)
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s \
+# RUN:    -run-pass=x86-wineh-unwindv2 | FileCheck %s -check-prefix=ALLOWMORE
+
+# Usually 255 unwind codes are permitted, but we passed an arg to llc to limit
+# it to 8.
+# REQUIREV2: error: example.c:2:1: Windows x64 Unwind v2 is required, but the function 'too_many_epilogs' has too many unwind codes.
+# REQUIREV2-SAME: Try splitting the function or reducing the number of places where it exits early with a tail call.
+
+# If we force "best effort" mode, then we won't see any errors, but we won't use
+# v2.
+# BESTEFFORT-NOT: SEH_UnwindVersion
+# BESTEFFORT-NOT: SEH_UnwindV2Start
+
+# If we allow more epilogs then too_many_epilogs will compile with v2.
+# ALLOWMORE-LABEL: too_many_epilogs
+# ALLOWMORE: SEH_UnwindVersion 2
+# ALLOWMORE: SEH_UnwindV2Start
+
+--- |
+  define dso_local void @too_many_epilogs() local_unnamed_addr !dbg !9 {
+  entry:
+    ret void, !dbg !10
+  }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!2, !3, !4, !5}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "/app/example.c", directory: "/app")
+  !2 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"CodeView", i32 1}
+  !5 = !{i32 2, !"Debug Info Version", i32 3}
+  !6 = !DIFile(filename: "example.c", directory: "/app")
+  !7 = !DISubroutineType(types: !8)
+  !8 = !{null}
+  !9 = distinct !DISubprogram(name: "too_many_epilogs", scope: !6, file: !6, line: 1, type: !7, scopeLine: 2, spFlags: DISPFlagDefinition, unit: !0)
+  !10 = !DILocation(line: 2, column: 1, scope: !9)
+  !11 = !DILocation(line: 3, column: 1, scope: !9)
+...
+---
+name:            too_many_epilogs
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !10
+  bb.1:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.2:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.3:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.4:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.5:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.6:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.7:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.8:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+
+...

From 98eee4b554be18f734088455cb4cd9dc634e7602 Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:20:08 -0700
Subject: [PATCH 0562/1322] [libc] utf8 to 32 CharacterConverter (#143973)

Implemented push and pop for utf8 to 32 conversion and tests.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 .../__support/wchar/character_converter.cpp   |  55 +++++
 libc/test/src/__support/CMakeLists.txt        |   5 +-
 libc/test/src/__support/wchar/CMakeLists.txt  |  10 +
 .../src/__support/wchar/utf8_to_32_test.cpp   | 196 ++++++++++++++++++
 4 files changed, 263 insertions(+), 3 deletions(-)
 create mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index ca709769616c..7f147ac26d3d 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,6 +8,7 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/math_extras.h"
@@ -30,6 +31,49 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
+int CharacterConverter::push(char8_t utf8_byte) {
+  uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
+  // Checking the first byte if first push
+  if (state->bytes_processed == 0) {
+    // UTF-8 char has 1 byte total
+    if (num_ones == 0) {
+      state->total_bytes = 1;
+    }
+    // UTF-8 char has 2 through 4 bytes total
+    else if (num_ones >= 2 && num_ones <= 4) {
+      /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
+      we will make the base mask with 7 ones and right shift it as necessary. */
+      constexpr size_t SIGNIFICANT_BITS = 7;
+      uint32_t base_mask = mask_trailing_ones<uint32_t, SIGNIFICANT_BITS>();
+      state->total_bytes = num_ones;
+      utf8_byte &= (base_mask >> num_ones);
+    }
+    // Invalid first byte
+    else {
+      // bytes_processed and total_bytes will always be 0 here
+      state->partial = static_cast<char32_t>(0);
+      return -1;
+    }
+    state->partial = static_cast<char32_t>(utf8_byte);
+    state->bytes_processed++;
+    return 0;
+  }
+  // Any subsequent push
+  // Adding 6 more bits so need to left shift
+  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+  if (num_ones == 1 && !isComplete()) {
+    char32_t byte =
+        utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+    state->partial = state->partial << ENCODED_BITS_PER_UTF8;
+    state->partial |= byte;
+    state->bytes_processed++;
+    return 0;
+  }
+  // Invalid byte -> reset the state
+  clear();
+  return -1;
+}
+
 int CharacterConverter::push(char32_t utf32) {
   // we can't be partially through a conversion when pushing a utf32 value
   if (!isComplete())
@@ -54,6 +98,17 @@ int CharacterConverter::push(char32_t utf32) {
   return -1;
 }
 
+ErrorOr<char32_t> CharacterConverter::pop_utf32() {
+  // If pop is called too early, do not reset the state, use error to determine
+  // whether enough bytes have been pushed
+  if (!isComplete() || state->bytes_processed == 0)
+    return Error(-1);
+  char32_t utf32 = state->partial;
+  // reset if successful pop
+  clear();
+  return utf32;
+}
+
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (isComplete())
     return Error(-1);
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 76218a16e0cf..9f626ed31cc0 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,9 +275,8 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
-
-# Requires access to uchar header which is not on macos
-# Therefore, cannot currently build this on macos in overlay mode
+# Requires access to uchar header which is not on MacOS
+# Cannot currently build this on MacOS in overlay mode
 if(NOT(LIBC_TARGET_OS_IS_DARWIN))
   add_subdirectory(wchar)
 endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5dff6e9115f7..5176bfd4b024 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,5 +1,15 @@
 add_custom_target(libc-support-wchar-tests)
 
+add_libc_test(
+  utf8_to_32_test 
+  SUITE
+    libc-support-tests
+  SRCS
+    utf8_to_32_test.cpp 
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
+
 add_libc_test(
   utf32_to_8_test
   SUITE
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
new file mode 100644
index 000000000000..9cb059faa937
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -0,0 +1,196 @@
+//===-- Unittests for character_converter utf8->utf32 ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  char ch = 'A';
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_EQ(err, 0);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 65);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; //  car symbol
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ sigma symbol
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 8721);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  char_conv.push(static_cast<char8_t>(ch[3]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 129313);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch = static_cast<char>(0x80); // invalid starting bit sequence
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {
+      static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
+      static_cast<char>(0x00)}; // first and third bytes are invalid
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  // Prev byte was single byte so trying to push another should error.
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  // Last byte is invalid since it does not have correct starting sequence.
+  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                      static_cast<char>(0x80), static_cast<char>(0xC0)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  // Should produce an error on 3rd byte
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+
+  // Should produce an error since mbstate was reset
+  auto wch = char_conv.pop_utf32();
+  ASSERT_FALSE(wch.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+
+  // Second two byte character
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 460);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_FALSE(
+      wch.has_value()); // Should fail since we have not read enough bytes
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}

From 6e124423546e5d22b4b6dc64d6cedfe93e627d58 Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:33:32 -0700
Subject: [PATCH 0563/1322] Revert "[libc] utf8 to 32 CharacterConverter"
 (#144446)

Reverts llvm/llvm-project#143973
This merge broke the build and I'm currently looking into the issue to
fix it.
---
 .../__support/wchar/character_converter.cpp   |  55 -----
 libc/test/src/__support/CMakeLists.txt        |   5 +-
 libc/test/src/__support/wchar/CMakeLists.txt  |  10 -
 .../src/__support/wchar/utf8_to_32_test.cpp   | 196 ------------------
 4 files changed, 3 insertions(+), 263 deletions(-)
 delete mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 7f147ac26d3d..ca709769616c 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,7 +8,6 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
-#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/math_extras.h"
@@ -31,49 +30,6 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
-int CharacterConverter::push(char8_t utf8_byte) {
-  uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
-  // Checking the first byte if first push
-  if (state->bytes_processed == 0) {
-    // UTF-8 char has 1 byte total
-    if (num_ones == 0) {
-      state->total_bytes = 1;
-    }
-    // UTF-8 char has 2 through 4 bytes total
-    else if (num_ones >= 2 && num_ones <= 4) {
-      /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
-      we will make the base mask with 7 ones and right shift it as necessary. */
-      constexpr size_t SIGNIFICANT_BITS = 7;
-      uint32_t base_mask = mask_trailing_ones<uint32_t, SIGNIFICANT_BITS>();
-      state->total_bytes = num_ones;
-      utf8_byte &= (base_mask >> num_ones);
-    }
-    // Invalid first byte
-    else {
-      // bytes_processed and total_bytes will always be 0 here
-      state->partial = static_cast<char32_t>(0);
-      return -1;
-    }
-    state->partial = static_cast<char32_t>(utf8_byte);
-    state->bytes_processed++;
-    return 0;
-  }
-  // Any subsequent push
-  // Adding 6 more bits so need to left shift
-  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
-  if (num_ones == 1 && !isComplete()) {
-    char32_t byte =
-        utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
-    state->partial = state->partial << ENCODED_BITS_PER_UTF8;
-    state->partial |= byte;
-    state->bytes_processed++;
-    return 0;
-  }
-  // Invalid byte -> reset the state
-  clear();
-  return -1;
-}
-
 int CharacterConverter::push(char32_t utf32) {
   // we can't be partially through a conversion when pushing a utf32 value
   if (!isComplete())
@@ -98,17 +54,6 @@ int CharacterConverter::push(char32_t utf32) {
   return -1;
 }
 
-ErrorOr<char32_t> CharacterConverter::pop_utf32() {
-  // If pop is called too early, do not reset the state, use error to determine
-  // whether enough bytes have been pushed
-  if (!isComplete() || state->bytes_processed == 0)
-    return Error(-1);
-  char32_t utf32 = state->partial;
-  // reset if successful pop
-  clear();
-  return utf32;
-}
-
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (isComplete())
     return Error(-1);
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 9f626ed31cc0..76218a16e0cf 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,8 +275,9 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
-# Requires access to uchar header which is not on MacOS
-# Cannot currently build this on MacOS in overlay mode
+
+# Requires access to uchar header which is not on macos
+# Therefore, cannot currently build this on macos in overlay mode
 if(NOT(LIBC_TARGET_OS_IS_DARWIN))
   add_subdirectory(wchar)
 endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5176bfd4b024..5dff6e9115f7 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,15 +1,5 @@
 add_custom_target(libc-support-wchar-tests)
 
-add_libc_test(
-  utf8_to_32_test 
-  SUITE
-    libc-support-tests
-  SRCS
-    utf8_to_32_test.cpp 
-  DEPENDS
-    libc.src.__support.wchar.character_converter
-)
-
 add_libc_test(
   utf32_to_8_test
   SUITE
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
deleted file mode 100644
index 9cb059faa937..000000000000
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//===-- Unittests for character_converter utf8->utf32 ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/__support/error_or.h"
-#include "src/__support/wchar/character_converter.h"
-#include "src/__support/wchar/mbstate.h"
-#include "test/UnitTest/Test.h"
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  char ch = 'A';
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch));
-  auto wch = char_conv.pop_utf32();
-
-  ASSERT_EQ(err, 0);
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 65);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[2] = {static_cast<char>(0xC2),
-                      static_cast<char>(0x8E)}; //  car symbol
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  char_conv.push(static_cast<char8_t>(ch[0]));
-  char_conv.push(static_cast<char8_t>(ch[1]));
-  auto wch = char_conv.pop_utf32();
-
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 142);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
-                      static_cast<char>(0x91)}; // ∑ sigma symbol
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  char_conv.push(static_cast<char8_t>(ch[0]));
-  char_conv.push(static_cast<char8_t>(ch[1]));
-  char_conv.push(static_cast<char8_t>(ch[2]));
-  auto wch = char_conv.pop_utf32();
-
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 8721);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
-                      static_cast<char>(0xA4),
-                      static_cast<char>(0xA1)}; // 🤡 clown emoji
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  char_conv.push(static_cast<char8_t>(ch[0]));
-  char_conv.push(static_cast<char8_t>(ch[1]));
-  char_conv.push(static_cast<char8_t>(ch[2]));
-  char_conv.push(static_cast<char8_t>(ch[3]));
-  auto wch = char_conv.pop_utf32();
-
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 129313);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch = static_cast<char>(0x80); // invalid starting bit sequence
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch));
-
-  ASSERT_EQ(err, -1);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[4] = {
-      static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
-      static_cast<char>(0x00)}; // first and third bytes are invalid
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, -1);
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  // Prev byte was single byte so trying to push another should error.
-  err = char_conv.push(static_cast<char8_t>(ch[2]));
-  ASSERT_EQ(err, -1);
-  err = char_conv.push(static_cast<char8_t>(ch[3]));
-  ASSERT_EQ(err, 0);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  // Last byte is invalid since it does not have correct starting sequence.
-  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
-  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
-                      static_cast<char>(0x80), static_cast<char>(0xC0)};
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[2]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[3]));
-  ASSERT_EQ(err, -1);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
-                      static_cast<char>(0x80)};
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  // Should produce an error on 3rd byte
-  err = char_conv.push(static_cast<char8_t>(ch[2]));
-  ASSERT_EQ(err, -1);
-
-  // Should produce an error since mbstate was reset
-  auto wch = char_conv.pop_utf32();
-  ASSERT_FALSE(wch.has_value());
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
-                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  auto wch = char_conv.pop_utf32();
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 142);
-
-  // Second two byte character
-  err = char_conv.push(static_cast<char8_t>(ch[2]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[3]));
-  ASSERT_EQ(err, 0);
-  wch = char_conv.pop_utf32();
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 460);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, 0);
-  auto wch = char_conv.pop_utf32();
-  ASSERT_FALSE(
-      wch.has_value()); // Should fail since we have not read enough bytes
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  wch = char_conv.pop_utf32();
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 142);
-}

From 99e53cb4139eda491f97cb33ee42ea424d352200 Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:47:43 -0700
Subject: [PATCH 0564/1322] [llvm][StackProtector] Add noreturn to
 __stack_chk_fail call (#143976)

It's possible for __stack_chk_fail to be an alias when using CrossDSOCFI
since it will make a jump table entry for this function and replace it
with an alias. StackProtector can crash since it always expects this to
be a regular function. Instead add the noreturn attribute to the call.
---
 llvm/lib/CodeGen/StackProtector.cpp           |  4 +--
 .../cross-dso-cfi-stack-chk-fail.ll           | 33 +++++++++++++++++++
 .../StackProtector/stack-chk-fail-alias.ll    | 21 ++++++++++++
 3 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
 create mode 100644 llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll

diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index 5f866eea7d4e..dda392d38b27 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -725,8 +725,8 @@ BasicBlock *CreateFailBB(Function *F, const Triple &Trip) {
     StackChkFail =
         M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context));
   }
-  cast<Function>(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn);
-  B.CreateCall(StackChkFail, Args);
+  CallInst *Call = B.CreateCall(StackChkFail, Args);
+  Call->addFnAttr(Attribute::NoReturn);
   B.CreateUnreachable();
   return FailBB;
 }
diff --git a/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
new file mode 100644
index 000000000000..af03039813a2
--- /dev/null
+++ b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
@@ -0,0 +1,33 @@
+;; This is a minimal reproducer that caused StackProtector to crash with a bad cast when
+;; CrossDSOCFI is used. This test just needs to not crash.
+; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=lowertypetests,cross-dso-cfi,stack-protector
+
+define hidden void @__stack_chk_fail() !type !1{
+  unreachable
+}
+
+define void @store_captures() sspstrong {
+entry:
+  %a = alloca i32, align 4
+  %j = alloca ptr, align 8
+  store ptr %a, ptr %j, align 8
+  ret void
+}
+
+define void @func(ptr %0) {
+entry:
+  %1 = call i1 @llvm.type.test(ptr %0, metadata !"typeid")
+  br i1 %1, label %cont, label %trap
+
+trap:                                             ; preds = %entry
+  call void @llvm.trap()
+  unreachable
+
+cont:                                             ; preds = %entry
+  call void %0()
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"Cross-DSO CFI", i32 1}
+!1 = !{i64 0, !"typeid"}
diff --git a/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll
new file mode 100644
index 000000000000..ab0a6e3f455e
--- /dev/null
+++ b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll
@@ -0,0 +1,21 @@
+;; __stack_chk_fail should have the noreturn attr even if it is an alias
+; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=stack-protector -S | FileCheck %s
+
+define hidden void @__stack_chk_fail_impl() {
+  unreachable
+}
+
+@__stack_chk_fail = hidden alias void (), ptr @__stack_chk_fail_impl
+
+; CHECK-LABEL: @store_captures(
+; CHECK:       CallStackCheckFailBlk:
+; CHECK-NEXT:      call void @__stack_chk_fail() [[ATTRS:#.*]]
+define void @store_captures() sspstrong {
+entry:
+  %a = alloca i32, align 4
+  %j = alloca ptr, align 8
+  store ptr %a, ptr %j, align 8
+  ret void
+}
+
+; CHECK: attributes [[ATTRS]] = { noreturn }

From 964888d01f0b0f81540f8548370f00c315952042 Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules@users.noreply.github.com>
Date: Mon, 16 Jun 2025 16:24:45 -0700
Subject: [PATCH 0565/1322] [llvm][CFI] Ensure COFF comdat renaming applies for
 imported functions (#143421)

I ran into the same issue as
https://github.com/llvm/llvm-project/pull/139962 regarding the comdat
corresponding to a renamed key function but for thinlto. My last patch
had not considered the thinlto case, so this applies the same fix for
imported functions.
---
 llvm/lib/Transforms/IPO/LowerTypeTests.cpp    | 34 ++++++++++++-------
 .../Inputs/import-thinlto-funcs.yaml          |  5 +++
 .../LowerTypeTests/cfi-coff-comdat-rename.ll  |  2 ++
 3 files changed, 28 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml

diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index ab67a0980e0c..20b54c056cc2 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -561,6 +561,8 @@ class LowerTypeTestsModule {
     return FunctionAnnotations.contains(V);
   }
 
+  void maybeReplaceComdat(Function *F, StringRef OriginalName);
+
 public:
   LowerTypeTestsModule(Module &M, ModuleAnalysisManager &AM,
                        ModuleSummaryIndex *ExportSummary,
@@ -1082,6 +1084,23 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
   }
 }
 
+void LowerTypeTestsModule::maybeReplaceComdat(Function *F,
+                                              StringRef OriginalName) {
+  // For COFF we should also rename the comdat if this function also
+  // happens to be the key function. Even if the comdat name changes, this
+  // should still be fine since comdat and symbol resolution happens
+  // before LTO, so all symbols which would prevail have been selected.
+  if (F->hasComdat() && ObjectFormat == Triple::COFF &&
+      F->getComdat()->getName() == OriginalName) {
+    Comdat *OldComdat = F->getComdat();
+    Comdat *NewComdat = M.getOrInsertComdat(F->getName());
+    for (GlobalObject &GO : M.global_objects()) {
+      if (GO.getComdat() == OldComdat)
+        GO.setComdat(NewComdat);
+    }
+  }
+}
+
 // ThinLTO backend: the function F has a jump table entry; update this module
 // accordingly. isJumpTableCanonical describes the type of the jump table entry.
 void LowerTypeTestsModule::importFunction(
@@ -1115,6 +1134,7 @@ void LowerTypeTestsModule::importFunction(
     FDecl->setVisibility(GlobalValue::HiddenVisibility);
   } else {
     F->setName(Name + ".cfi");
+    maybeReplaceComdat(F, Name);
     F->setLinkage(GlobalValue::ExternalLinkage);
     FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
                              F->getAddressSpace(), Name, &M);
@@ -1734,19 +1754,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
       FAlias->takeName(F);
       if (FAlias->hasName()) {
         F->setName(FAlias->getName() + ".cfi");
-        // For COFF we should also rename the comdat if this function also
-        // happens to be the key function. Even if the comdat name changes, this
-        // should still be fine since comdat and symbol resolution happens
-        // before LTO, so all symbols which would prevail have been selected.
-        if (F->hasComdat() && ObjectFormat == Triple::COFF &&
-            F->getComdat()->getName() == FAlias->getName()) {
-          Comdat *OldComdat = F->getComdat();
-          Comdat *NewComdat = M.getOrInsertComdat(F->getName());
-          for (GlobalObject &GO : M.global_objects()) {
-            if (GO.getComdat() == OldComdat)
-              GO.setComdat(NewComdat);
-          }
-        }
+        maybeReplaceComdat(F, FAlias->getName());
       }
       replaceCfiUses(F, FAlias, IsJumpTableCanonical);
       if (!F->hasLocalLinkage())
diff --git a/llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml b/llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml
new file mode 100644
index 000000000000..459d45032b0c
--- /dev/null
+++ b/llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml
@@ -0,0 +1,5 @@
+---
+CfiFunctionDefs:
+  - f1
+  - f2
+...
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll b/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll
index 7dda7f6df10c..7eede8b7322f 100644
--- a/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: x86-registered-target
 ; RUN: opt -S -passes=lowertypetests %s | FileCheck %s
+; RUN: opt -S -passes=lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%p/Inputs/import-thinlto-funcs.yaml %s | FileCheck %s
 
 ;; This is a check to assert we don't crash with:
 ;;
@@ -7,6 +8,7 @@
 ;;
 ;; So this just needs to exit normally.
 ; RUN: opt -S -passes=lowertypetests %s | llc -asm-verbose=false
+; RUN: opt -S -passes=lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%p/Inputs/import-thinlto-funcs.yaml %s | llc -asm-verbose=false
 
 target datalayout = "e-p:64:64"
 target triple = "x86_64-pc-windows-msvc"

From ac7af53d05b94849fd590b1875db7b85957fb0f6 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Mon, 16 Jun 2025 16:26:03 -0700
Subject: [PATCH 0566/1322] [flang] Fixed LIT tests to create modfiles in a
 temp dir. (#144448)

---
 flang/test/Semantics/modfile71.F90 |  7 ++++---
 flang/test/Semantics/modfile75.F90 |  3 ++-
 flang/test/Semantics/modfile76.F90 | 15 ++++++++-------
 flang/test/Semantics/modfile77.F90 |  3 ++-
 flang/test/Semantics/modfile78.F90 |  3 ++-
 flang/test/Semantics/modfile79.F90 |  3 ++-
 6 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/flang/test/Semantics/modfile71.F90 b/flang/test/Semantics/modfile71.F90
index 7c3c7f5b4895..7f32eb18c6f8 100644
--- a/flang/test/Semantics/modfile71.F90
+++ b/flang/test/Semantics/modfile71.F90
@@ -1,6 +1,7 @@
-!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s
-!RUN: %flang_fc1 -fsyntax-only -DSTEP=2 %s
-!RUN: not %flang_fc1 -fsyntax-only -pedantic %s 2>&1 | FileCheck %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 -J%t %s
+!RUN: %flang_fc1 -fsyntax-only -DSTEP=2 -J%t %s
+!RUN: not %flang_fc1 -fsyntax-only -pedantic -J%t %s 2>&1 | FileCheck %s
 
 ! Tests that a module captured in a hermetic module file is compatible when
 ! USE'd with a module of the same name USE'd directly.
diff --git a/flang/test/Semantics/modfile75.F90 b/flang/test/Semantics/modfile75.F90
index aba00ffac848..8f7adafe7204 100644
--- a/flang/test/Semantics/modfile75.F90
+++ b/flang/test/Semantics/modfile75.F90
@@ -1,4 +1,5 @@
-!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang_fc1 -fdebug-unparse %s | FileCheck %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang_fc1 -fdebug-unparse -J%t %s | FileCheck %s
 
 #if WHICH == 1
 module modfile75a
diff --git a/flang/test/Semantics/modfile76.F90 b/flang/test/Semantics/modfile76.F90
index 50ee9a088e11..c7ae91bd42be 100644
--- a/flang/test/Semantics/modfile76.F90
+++ b/flang/test/Semantics/modfile76.F90
@@ -1,23 +1,24 @@
-!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s
-!RUN: %flang_fc1 -fsyntax-only %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 -J%t %s
+!RUN: %flang_fc1 -fsyntax-only -J%t %s
 
 ! Tests that a BIND(C) variable in a module A captured in a hermetic module
 ! file USE'd in a module B is not creating bogus complaints about BIND(C) name
 ! conflict when both module A and B are later accessed.
 
 #if STEP == 1
-module modfile75a
+module modfile76a
   integer, bind(c) :: x
 end
 
-module modfile75b
-  use modfile75a ! capture hermetically
+module modfile76b
+  use modfile76a ! capture hermetically
 end
 
 #else
 subroutine test
-  use modfile75a
-  use modfile75b
+  use modfile76a
+  use modfile76b
   implicit none
   print *, x
 end subroutine
diff --git a/flang/test/Semantics/modfile77.F90 b/flang/test/Semantics/modfile77.F90
index a82904ebbcc2..9ad615c16c43 100644
--- a/flang/test/Semantics/modfile77.F90
+++ b/flang/test/Semantics/modfile77.F90
@@ -1,4 +1,5 @@
-!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile77c.mod | FileCheck %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang -c -fhermetic-module-files -J%t %s && cat %t/modfile77c.mod | FileCheck %s
 
 #if WHICH == 1
 module modfile77a
diff --git a/flang/test/Semantics/modfile78.F90 b/flang/test/Semantics/modfile78.F90
index cb3eccd9a410..19b9ac39de93 100644
--- a/flang/test/Semantics/modfile78.F90
+++ b/flang/test/Semantics/modfile78.F90
@@ -1,4 +1,5 @@
-!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile78c.mod | FileCheck %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang -c -fhermetic-module-files -J%t %s && cat %t/modfile78c.mod | FileCheck %s
 
 #if WHICH == 1
 module modfile78a
diff --git a/flang/test/Semantics/modfile79.F90 b/flang/test/Semantics/modfile79.F90
index 7d3b42166654..ae156527b3bf 100644
--- a/flang/test/Semantics/modfile79.F90
+++ b/flang/test/Semantics/modfile79.F90
@@ -1,4 +1,5 @@
-!RUN: %flang -c -DWHICH=1 %s && FileCheck %s <modfile79a.mod && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c %s && FileCheck %s <modfile79a.mod
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang -c -DWHICH=1 -J%t %s && FileCheck %s <%t/modfile79a.mod && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang -c -J%t %s && FileCheck %s <%t/modfile79a.mod
 
 !Ensure that writing modfile79c.mod doesn't cause a spurious
 !regeneration of modfile79a.mod from its copy in the hermetic

From d882670d498a29f4e02f357ef9fe07c43de034c8 Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules@users.noreply.github.com>
Date: Mon, 16 Jun 2025 16:34:40 -0700
Subject: [PATCH 0567/1322] Revert "[llvm][StackProtector] Add noreturn to
 __stack_chk_fail call" (#144452)

Reverts llvm/llvm-project#143976

Reverting since this broke a builder:
https://lab.llvm.org/buildbot/#/builders/190/builds/21563
---
 llvm/lib/CodeGen/StackProtector.cpp           |  4 +--
 .../cross-dso-cfi-stack-chk-fail.ll           | 33 -------------------
 .../StackProtector/stack-chk-fail-alias.ll    | 21 ------------
 3 files changed, 2 insertions(+), 56 deletions(-)
 delete mode 100644 llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
 delete mode 100644 llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll

diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index dda392d38b27..5f866eea7d4e 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -725,8 +725,8 @@ BasicBlock *CreateFailBB(Function *F, const Triple &Trip) {
     StackChkFail =
         M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context));
   }
-  CallInst *Call = B.CreateCall(StackChkFail, Args);
-  Call->addFnAttr(Attribute::NoReturn);
+  cast<Function>(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn);
+  B.CreateCall(StackChkFail, Args);
   B.CreateUnreachable();
   return FailBB;
 }
diff --git a/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
deleted file mode 100644
index af03039813a2..000000000000
--- a/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-;; This is a minimal reproducer that caused StackProtector to crash with a bad cast when
-;; CrossDSOCFI is used. This test just needs to not crash.
-; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=lowertypetests,cross-dso-cfi,stack-protector
-
-define hidden void @__stack_chk_fail() !type !1{
-  unreachable
-}
-
-define void @store_captures() sspstrong {
-entry:
-  %a = alloca i32, align 4
-  %j = alloca ptr, align 8
-  store ptr %a, ptr %j, align 8
-  ret void
-}
-
-define void @func(ptr %0) {
-entry:
-  %1 = call i1 @llvm.type.test(ptr %0, metadata !"typeid")
-  br i1 %1, label %cont, label %trap
-
-trap:                                             ; preds = %entry
-  call void @llvm.trap()
-  unreachable
-
-cont:                                             ; preds = %entry
-  call void %0()
-  ret void
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 4, !"Cross-DSO CFI", i32 1}
-!1 = !{i64 0, !"typeid"}
diff --git a/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll
deleted file mode 100644
index ab0a6e3f455e..000000000000
--- a/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-;; __stack_chk_fail should have the noreturn attr even if it is an alias
-; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=stack-protector -S | FileCheck %s
-
-define hidden void @__stack_chk_fail_impl() {
-  unreachable
-}
-
-@__stack_chk_fail = hidden alias void (), ptr @__stack_chk_fail_impl
-
-; CHECK-LABEL: @store_captures(
-; CHECK:       CallStackCheckFailBlk:
-; CHECK-NEXT:      call void @__stack_chk_fail() [[ATTRS:#.*]]
-define void @store_captures() sspstrong {
-entry:
-  %a = alloca i32, align 4
-  %j = alloca ptr, align 8
-  store ptr %a, ptr %j, align 8
-  ret void
-}
-
-; CHECK: attributes [[ATTRS]] = { noreturn }

From 6421bd94eabdb71975c75e2c1621a095b3d8b6ad Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn@google.com>
Date: Mon, 16 Jun 2025 17:24:48 -0700
Subject: [PATCH 0568/1322] [lldb-dap] Creating protocol types for
 setExceptionBreakpoints. (#144153)

This adds new types for setExceptionBreakpoints and adds support for
`supportsExceptionFilterOptions`, which allows exception breakpoints to
set a condition.

While testing this, I noticed that obj-c exception catch breakpoints may
not be working correctly in lldb-dap.
---
 .../test/tools/lldb-dap/dap_server.py         |   6 +-
 .../TestDAP_setExceptionBreakpoints.py        |  11 +-
 .../tools/lldb-dap/exception/objc/Makefile    |   2 +-
 .../exception/objc/TestDAP_exception_objc.py  |  39 ++++-
 .../API/tools/lldb-dap/exception/objc/main.m  |  12 +-
 lldb/tools/lldb-dap/DAP.cpp                   | 160 ++++++++----------
 lldb/tools/lldb-dap/DAP.h                     |   4 +-
 lldb/tools/lldb-dap/ExceptionBreakpoint.cpp   |  26 ++-
 lldb/tools/lldb-dap/ExceptionBreakpoint.h     |  14 +-
 .../Handler/InitializeRequestHandler.cpp      |   1 -
 lldb/tools/lldb-dap/Handler/RequestHandler.h  |  15 +-
 .../SetExceptionBreakpointsRequestHandler.cpp | 107 +++++-------
 lldb/tools/lldb-dap/JSONUtils.cpp             |   9 -
 lldb/tools/lldb-dap/JSONUtils.h               |  12 --
 .../lldb-dap/Protocol/ProtocolRequests.cpp    |  14 ++
 .../lldb-dap/Protocol/ProtocolRequests.h      |  50 ++++++
 .../tools/lldb-dap/Protocol/ProtocolTypes.cpp |  56 +++---
 lldb/tools/lldb-dap/Protocol/ProtocolTypes.h  |  41 +++--
 lldb/tools/lldb-dap/ProtocolUtils.cpp         |  11 ++
 lldb/tools/lldb-dap/ProtocolUtils.h           |  13 ++
 lldb/unittests/DAP/ProtocolTypesTest.cpp      | 111 +++++++++---
 21 files changed, 454 insertions(+), 260 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index baf2d4ae542b..6d32491eaa5e 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -1050,8 +1050,12 @@ class DebugCommunication(object):
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
 
-    def request_setExceptionBreakpoints(self, filters):
+    def request_setExceptionBreakpoints(
+        self, *, filters: list[str] = [], filter_options: list[dict] = []
+    ):
         args_dict = {"filters": filters}
+        if filter_options:
+            args_dict["filterOptions"] = filter_options
         command_dict = {
             "command": "setExceptionBreakpoints",
             "type": "request",
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
index 4dc8c5b3c7de..4ca733a9a59c 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
@@ -1,16 +1,12 @@
 """
-Test lldb-dap setBreakpoints request
+Test lldb-dap setExceptionBreakpoints request
 """
 
-
-import dap_server
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
-from lldbsuite.test import lldbutil
 import lldbdap_testcase
 
 
-@skip("Temporarily disable the breakpoint tests")
 class TestDAP_setExceptionBreakpoints(lldbdap_testcase.DAPTestCaseBase):
     @skipIfWindows
     def test_functionality(self):
@@ -33,8 +29,9 @@ class TestDAP_setExceptionBreakpoints(lldbdap_testcase.DAPTestCaseBase):
         program = self.getBuildArtifact("a.out")
         self.build_and_launch(program)
 
-        filters = ["cpp_throw", "cpp_catch"]
-        response = self.dap_server.request_setExceptionBreakpoints(filters)
+        response = self.dap_server.request_setExceptionBreakpoints(
+            filters=["cpp_throw", "cpp_catch"],
+        )
         if response:
             self.assertTrue(response["success"])
 
diff --git a/lldb/test/API/tools/lldb-dap/exception/objc/Makefile b/lldb/test/API/tools/lldb-dap/exception/objc/Makefile
index 9b6528337cb9..17e6dc76699a 100644
--- a/lldb/test/API/tools/lldb-dap/exception/objc/Makefile
+++ b/lldb/test/API/tools/lldb-dap/exception/objc/Makefile
@@ -1,6 +1,6 @@
 OBJC_SOURCES := main.m
 
-CFLAGS_EXTRAS := -w
+CFLAGS_EXTRAS := -w -fobjc-exceptions
 
 USE_SYSTEM_STDLIB := 1
 
diff --git a/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py b/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py
index 777d55f48e85..ddedf7a6de8c 100644
--- a/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py
+++ b/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py
@@ -2,7 +2,6 @@
 Test exception behavior in DAP with obj-c throw.
 """
 
-
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
 import lldbdap_testcase
@@ -25,3 +24,41 @@ class TestDAP_exception_objc(lldbdap_testcase.DAPTestCaseBase):
         exception_details = exception_info["details"]
         self.assertRegex(exception_details["message"], "SomeReason")
         self.assertRegex(exception_details["stackTrace"], "main.m")
+
+    @skipUnlessDarwin
+    def test_break_on_throw_and_catch(self):
+        """
+        Test that breakpoints on exceptions work as expected.
+        """
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program)
+
+        response = self.dap_server.request_setExceptionBreakpoints(
+            filter_options=[
+                {
+                    "filterId": "objc_throw",
+                    "condition": '[[((NSException *)$arg1) name] isEqual:@"ThrownException"]',
+                },
+            ]
+        )
+        if response:
+            self.assertTrue(response["success"])
+
+        self.continue_to_exception_breakpoint("Objective-C Throw")
+
+        # FIXME: Catching objc exceptions do not appear to be working.
+        # Xcode appears to set a breakpoint on '__cxa_begin_catch' for objc
+        # catch, which is different than
+        # SBTarget::BreakpointCreateForException(eLanguageObjectiveC, /*catch_bp=*/true, /*throw_bp=*/false);
+        # self.continue_to_exception_breakpoint("Objective-C Catch")
+
+        self.do_continue()
+
+        self.assertTrue(self.verify_stop_exception_info("signal SIGABRT"))
+        exception_info = self.get_exceptionInfo()
+        self.assertEqual(exception_info["breakMode"], "always")
+        self.assertEqual(exception_info["description"], "signal SIGABRT")
+        self.assertEqual(exception_info["exceptionId"], "signal")
+        exception_details = exception_info["details"]
+        self.assertRegex(exception_details["message"], "SomeReason")
+        self.assertRegex(exception_details["stackTrace"], "main.m")
diff --git a/lldb/test/API/tools/lldb-dap/exception/objc/main.m b/lldb/test/API/tools/lldb-dap/exception/objc/main.m
index e8db04fb40de..bbfa62199279 100644
--- a/lldb/test/API/tools/lldb-dap/exception/objc/main.m
+++ b/lldb/test/API/tools/lldb-dap/exception/objc/main.m
@@ -1,8 +1,14 @@
 #import <Foundation/Foundation.h>
 
 int main(int argc, char const *argv[]) {
-  @throw [[NSException alloc] initWithName:@"ThrownException"
-                                    reason:@"SomeReason"
-                                  userInfo:nil];
+  @try {
+    NSException *e = [[NSException alloc] initWithName:@"ThrownException"
+                                      reason:@"SomeReason"
+                                    userInfo:nil];
+    @throw e;
+  } @catch (NSException *e) {
+    NSLog(@"Caught %@", e);
+    @throw; // let the process crash...
+  }
   return 0;
 }
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index 9fe8227cd2d6..c171b55951cb 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -9,6 +9,7 @@
 #include "DAP.h"
 #include "DAPLog.h"
 #include "EventHelper.h"
+#include "ExceptionBreakpoint.h"
 #include "Handler/RequestHandler.h"
 #include "Handler/ResponseHandler.h"
 #include "JSONUtils.h"
@@ -17,6 +18,7 @@
 #include "Protocol/ProtocolBase.h"
 #include "Protocol/ProtocolRequests.h"
 #include "Protocol/ProtocolTypes.h"
+#include "ProtocolUtils.h"
 #include "Transport.h"
 #include "lldb/API/SBBreakpoint.h"
 #include "lldb/API/SBCommandInterpreter.h"
@@ -129,93 +131,81 @@ DAP::DAP(Log *log, const ReplMode default_repl_mode,
 DAP::~DAP() = default;
 
 void DAP::PopulateExceptionBreakpoints() {
-  llvm::call_once(init_exception_breakpoints_flag, [this]() {
-    exception_breakpoints = std::vector<ExceptionBreakpoint>{};
+  if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) {
+    exception_breakpoints.emplace_back(*this, "cpp_catch", "C++ Catch",
+                                       lldb::eLanguageTypeC_plus_plus,
+                                       eExceptionKindCatch);
+    exception_breakpoints.emplace_back(*this, "cpp_throw", "C++ Throw",
+                                       lldb::eLanguageTypeC_plus_plus,
+                                       eExceptionKindThrow);
+  }
 
-    if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) {
-      exception_breakpoints->emplace_back(*this, "cpp_catch", "C++ Catch",
-                                          lldb::eLanguageTypeC_plus_plus);
-      exception_breakpoints->emplace_back(*this, "cpp_throw", "C++ Throw",
-                                          lldb::eLanguageTypeC_plus_plus);
+  if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeObjC)) {
+    exception_breakpoints.emplace_back(*this, "objc_catch", "Objective-C Catch",
+                                       lldb::eLanguageTypeObjC,
+                                       eExceptionKindCatch);
+    exception_breakpoints.emplace_back(*this, "objc_throw", "Objective-C Throw",
+                                       lldb::eLanguageTypeObjC,
+                                       eExceptionKindThrow);
+  }
+
+  if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeSwift)) {
+    exception_breakpoints.emplace_back(*this, "swift_catch", "Swift Catch",
+                                       lldb::eLanguageTypeSwift,
+                                       eExceptionKindCatch);
+    exception_breakpoints.emplace_back(*this, "swift_throw", "Swift Throw",
+                                       lldb::eLanguageTypeSwift,
+                                       eExceptionKindThrow);
+  }
+
+  // Besides handling the hardcoded list of languages from above, we try to find
+  // any other languages that support exception breakpoints using the SB API.
+  for (int raw_lang = lldb::eLanguageTypeUnknown;
+       raw_lang < lldb::eNumLanguageTypes; ++raw_lang) {
+    lldb::LanguageType lang = static_cast<lldb::LanguageType>(raw_lang);
+
+    // We first discard any languages already handled above.
+    if (lldb::SBLanguageRuntime::LanguageIsCFamily(lang) ||
+        lang == lldb::eLanguageTypeSwift)
+      continue;
+
+    if (!lldb::SBDebugger::SupportsLanguage(lang))
+      continue;
+
+    const char *name = lldb::SBLanguageRuntime::GetNameForLanguageType(lang);
+    if (!name)
+      continue;
+    std::string raw_lang_name = name;
+    std::string capitalized_lang_name = capitalize(name);
+
+    if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnThrow(lang)) {
+      const char *raw_throw_keyword =
+          lldb::SBLanguageRuntime::GetThrowKeywordForLanguage(lang);
+      std::string throw_keyword =
+          raw_throw_keyword ? raw_throw_keyword : "throw";
+
+      exception_breakpoints.emplace_back(
+          *this, raw_lang_name + "_" + throw_keyword,
+          capitalized_lang_name + " " + capitalize(throw_keyword), lang,
+          eExceptionKindThrow);
     }
-    if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeObjC)) {
-      exception_breakpoints->emplace_back(
-          *this, "objc_catch", "Objective-C Catch", lldb::eLanguageTypeObjC);
-      exception_breakpoints->emplace_back(
-          *this, "objc_throw", "Objective-C Throw", lldb::eLanguageTypeObjC);
+
+    if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnCatch(lang)) {
+      const char *raw_catch_keyword =
+          lldb::SBLanguageRuntime::GetCatchKeywordForLanguage(lang);
+      std::string catch_keyword =
+          raw_catch_keyword ? raw_catch_keyword : "catch";
+
+      exception_breakpoints.emplace_back(
+          *this, raw_lang_name + "_" + catch_keyword,
+          capitalized_lang_name + " " + capitalize(catch_keyword), lang,
+          eExceptionKindCatch);
     }
-    if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeSwift)) {
-      exception_breakpoints->emplace_back(*this, "swift_catch", "Swift Catch",
-                                          lldb::eLanguageTypeSwift);
-      exception_breakpoints->emplace_back(*this, "swift_throw", "Swift Throw",
-                                          lldb::eLanguageTypeSwift);
-    }
-    // Besides handling the hardcoded list of languages from above, we try to
-    // find any other languages that support exception breakpoints using the
-    // SB API.
-    for (int raw_lang = lldb::eLanguageTypeUnknown;
-         raw_lang < lldb::eNumLanguageTypes; ++raw_lang) {
-      lldb::LanguageType lang = static_cast<lldb::LanguageType>(raw_lang);
-
-      // We first discard any languages already handled above.
-      if (lldb::SBLanguageRuntime::LanguageIsCFamily(lang) ||
-          lang == lldb::eLanguageTypeSwift)
-        continue;
-
-      if (!lldb::SBDebugger::SupportsLanguage(lang))
-        continue;
-
-      const char *name = lldb::SBLanguageRuntime::GetNameForLanguageType(lang);
-      if (!name)
-        continue;
-      std::string raw_lang_name = name;
-      std::string capitalized_lang_name = capitalize(name);
-
-      if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnThrow(lang)) {
-        const char *raw_throw_keyword =
-            lldb::SBLanguageRuntime::GetThrowKeywordForLanguage(lang);
-        std::string throw_keyword =
-            raw_throw_keyword ? raw_throw_keyword : "throw";
-
-        exception_breakpoints->emplace_back(
-            *this, raw_lang_name + "_" + throw_keyword,
-            capitalized_lang_name + " " + capitalize(throw_keyword), lang);
-      }
-
-      if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnCatch(lang)) {
-        const char *raw_catch_keyword =
-            lldb::SBLanguageRuntime::GetCatchKeywordForLanguage(lang);
-        std::string catch_keyword =
-            raw_catch_keyword ? raw_catch_keyword : "catch";
-
-        exception_breakpoints->emplace_back(
-            *this, raw_lang_name + "_" + catch_keyword,
-            capitalized_lang_name + " " + capitalize(catch_keyword), lang);
-      }
-    }
-    assert(!exception_breakpoints->empty() && "should not be empty");
-  });
+  }
 }
 
 ExceptionBreakpoint *DAP::GetExceptionBreakpoint(llvm::StringRef filter) {
-  // PopulateExceptionBreakpoints() is called after g_dap.debugger is created
-  // in a request-initialize.
-  //
-  // But this GetExceptionBreakpoint() method may be called before attaching, in
-  // which case, we may not have populated the filter yet.
-  //
-  // We also cannot call PopulateExceptionBreakpoints() in DAP::DAP() because
-  // we need SBDebugger::Initialize() to have been called before this.
-  //
-  // So just calling PopulateExceptionBreakoints(),which does lazy-populating
-  // seems easiest. Two other options include:
-  //  + call g_dap.PopulateExceptionBreakpoints() in lldb-dap.cpp::main()
-  //    right after the call to SBDebugger::Initialize()
-  //  + Just call PopulateExceptionBreakpoints() to get a fresh list  everytime
-  //    we query (a bit overkill since it's not likely to change?)
-  PopulateExceptionBreakpoints();
-
-  for (auto &bp : *exception_breakpoints) {
+  for (auto &bp : exception_breakpoints) {
     if (bp.GetFilter() == filter)
       return &bp;
   }
@@ -223,10 +213,7 @@ ExceptionBreakpoint *DAP::GetExceptionBreakpoint(llvm::StringRef filter) {
 }
 
 ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const lldb::break_id_t bp_id) {
-  // See comment in the other GetExceptionBreakpoint().
-  PopulateExceptionBreakpoints();
-
-  for (auto &bp : *exception_breakpoints) {
+  for (auto &bp : exception_breakpoints) {
     if (bp.GetID() == bp_id)
       return &bp;
   }
@@ -1118,8 +1105,9 @@ protocol::Capabilities DAP::GetCapabilities() {
   }
 
   // Available filters or options for the setExceptionBreakpoints request.
+  PopulateExceptionBreakpoints();
   std::vector<protocol::ExceptionBreakpointsFilter> filters;
-  for (const auto &exc_bp : *exception_breakpoints)
+  for (const auto &exc_bp : exception_breakpoints)
     filters.emplace_back(CreateExceptionBreakpointFilter(exc_bp));
   capabilities.exceptionBreakpointFilters = std::move(filters);
 
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index 89bc827c1141..5ca5822f9bce 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -99,7 +99,7 @@ struct DAP {
   lldb::SBBroadcaster broadcaster;
   FunctionBreakpointMap function_breakpoints;
   InstructionBreakpointMap instruction_breakpoints;
-  std::optional<std::vector<ExceptionBreakpoint>> exception_breakpoints;
+  std::vector<ExceptionBreakpoint> exception_breakpoints;
   llvm::once_flag init_exception_breakpoints_flag;
 
   /// Map step in target id to list of function targets that user can choose.
@@ -320,7 +320,7 @@ struct DAP {
     });
   }
 
-  /// The set of capablities supported by this adapter.
+  /// The set of capabilities supported by this adapter.
   protocol::Capabilities GetCapabilities();
 
   /// Debuggee will continue from stopped state.
diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
index 9772e7344ced..5bf06268a5af 100644
--- a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
+++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
@@ -9,23 +9,33 @@
 #include "ExceptionBreakpoint.h"
 #include "BreakpointBase.h"
 #include "DAP.h"
+#include "Protocol/ProtocolTypes.h"
 #include "lldb/API/SBMutex.h"
 #include "lldb/API/SBTarget.h"
 #include <mutex>
 
+using namespace llvm;
+using namespace lldb_dap::protocol;
+
 namespace lldb_dap {
 
-void ExceptionBreakpoint::SetBreakpoint() {
+protocol::Breakpoint ExceptionBreakpoint::SetBreakpoint(StringRef condition) {
   lldb::SBMutex lock = m_dap.GetAPIMutex();
   std::lock_guard<lldb::SBMutex> guard(lock);
 
-  if (m_bp.IsValid())
-    return;
-  bool catch_value = m_filter.find("_catch") != std::string::npos;
-  bool throw_value = m_filter.find("_throw") != std::string::npos;
-  m_bp = m_dap.target.BreakpointCreateForException(m_language, catch_value,
-                                                   throw_value);
-  m_bp.AddName(BreakpointBase::kDAPBreakpointLabel);
+  if (!m_bp.IsValid()) {
+    m_bp = m_dap.target.BreakpointCreateForException(
+        m_language, m_kind == eExceptionKindCatch,
+        m_kind == eExceptionKindThrow);
+    m_bp.AddName(BreakpointBase::kDAPBreakpointLabel);
+  }
+
+  m_bp.SetCondition(condition.data());
+
+  protocol::Breakpoint breakpoint;
+  breakpoint.id = m_bp.GetID();
+  breakpoint.verified = m_bp.IsValid();
+  return breakpoint;
 }
 
 void ExceptionBreakpoint::ClearBreakpoint() {
diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.h b/lldb/tools/lldb-dap/ExceptionBreakpoint.h
index 319b472a89a3..802ec71ce6ad 100644
--- a/lldb/tools/lldb-dap/ExceptionBreakpoint.h
+++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.h
@@ -10,6 +10,7 @@
 #define LLDB_TOOLS_LLDB_DAP_EXCEPTIONBREAKPOINT_H
 
 #include "DAPForward.h"
+#include "Protocol/ProtocolTypes.h"
 #include "lldb/API/SBBreakpoint.h"
 #include "lldb/lldb-enumerations.h"
 #include "llvm/ADT/StringRef.h"
@@ -18,14 +19,20 @@
 
 namespace lldb_dap {
 
+enum ExceptionKind : unsigned {
+  eExceptionKindCatch,
+  eExceptionKindThrow,
+};
+
 class ExceptionBreakpoint {
 public:
   ExceptionBreakpoint(DAP &d, std::string f, std::string l,
-                      lldb::LanguageType lang)
+                      lldb::LanguageType lang, ExceptionKind kind)
       : m_dap(d), m_filter(std::move(f)), m_label(std::move(l)),
-        m_language(lang), m_bp() {}
+        m_language(lang), m_kind(kind), m_bp() {}
 
-  void SetBreakpoint();
+  protocol::Breakpoint SetBreakpoint() { return SetBreakpoint(""); };
+  protocol::Breakpoint SetBreakpoint(llvm::StringRef condition);
   void ClearBreakpoint();
 
   lldb::break_id_t GetID() const { return m_bp.GetID(); }
@@ -39,6 +46,7 @@ protected:
   std::string m_filter;
   std::string m_label;
   lldb::LanguageType m_language;
+  ExceptionKind m_kind;
   lldb::SBBreakpoint m_bp;
 };
 
diff --git a/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp
index dcd02d61ca4f..b499a69876e2 100644
--- a/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp
@@ -54,7 +54,6 @@ llvm::Expected<InitializeResponse> InitializeRequestHandler::Run(
   if (llvm::Error err = dap.RunPreInitCommands())
     return err;
 
-  dap.PopulateExceptionBreakpoints();
   auto cmd = dap.debugger.GetCommandInterpreter().AddMultiwordCommand(
       "lldb-dap", "Commands for managing lldb-dap.");
   if (arguments.supportedFeatures.contains(
diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h
index 0ac8ca7c9a49..054cc7a32131 100644
--- a/lldb/tools/lldb-dap/Handler/RequestHandler.h
+++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h
@@ -387,14 +387,21 @@ public:
   Run(const protocol::SetBreakpointsArguments &args) const override;
 };
 
-class SetExceptionBreakpointsRequestHandler : public LegacyRequestHandler {
+class SetExceptionBreakpointsRequestHandler
+    : public RequestHandler<
+          protocol::SetExceptionBreakpointsArguments,
+          llvm::Expected<protocol::SetExceptionBreakpointsResponseBody>> {
 public:
-  using LegacyRequestHandler::LegacyRequestHandler;
+  using RequestHandler::RequestHandler;
   static llvm::StringLiteral GetCommand() { return "setExceptionBreakpoints"; }
   FeatureSet GetSupportedFeatures() const override {
-    return {protocol::eAdapterFeatureExceptionOptions};
+    /// Prefer the `filterOptions` feature over the `exceptionOptions`.
+    /// exceptionOptions is not supported in VSCode, while `filterOptions` is
+    /// supported.
+    return {protocol::eAdapterFeatureExceptionFilterOptions};
   }
-  void operator()(const llvm::json::Object &request) const override;
+  llvm::Expected<protocol::SetExceptionBreakpointsResponseBody>
+  Run(const protocol::SetExceptionBreakpointsArguments &args) const override;
 };
 
 class SetFunctionBreakpointsRequestHandler
diff --git a/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp
index 2214833f8a77..6a271fb82513 100644
--- a/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp
@@ -8,86 +8,61 @@
 
 #include "DAP.h"
 #include "EventHelper.h"
-#include "JSONUtils.h"
+#include "Protocol/ProtocolRequests.h"
 #include "RequestHandler.h"
 #include <set>
 
+using namespace llvm;
+using namespace lldb_dap::protocol;
+
 namespace lldb_dap {
 
-// "SetExceptionBreakpointsRequest": {
-//   "allOf": [ { "$ref": "#/definitions/Request" }, {
-//     "type": "object",
-//     "description": "SetExceptionBreakpoints request; value of command field
-//     is 'setExceptionBreakpoints'. The request configures the debuggers
-//     response to thrown exceptions. If an exception is configured to break, a
-//     StoppedEvent is fired (event type 'exception').", "properties": {
-//       "command": {
-//         "type": "string",
-//         "enum": [ "setExceptionBreakpoints" ]
-//       },
-//       "arguments": {
-//         "$ref": "#/definitions/SetExceptionBreakpointsArguments"
-//       }
-//     },
-//     "required": [ "command", "arguments"  ]
-//   }]
-// },
-// "SetExceptionBreakpointsArguments": {
-//   "type": "object",
-//   "description": "Arguments for 'setExceptionBreakpoints' request.",
-//   "properties": {
-//     "filters": {
-//       "type": "array",
-//       "items": {
-//         "type": "string"
-//       },
-//       "description": "IDs of checked exception options. The set of IDs is
-//       returned via the 'exceptionBreakpointFilters' capability."
-//     },
-//     "exceptionOptions": {
-//       "type": "array",
-//       "items": {
-//         "$ref": "#/definitions/ExceptionOptions"
-//       },
-//       "description": "Configuration options for selected exceptions."
-//     }
-//   },
-//   "required": [ "filters" ]
-// },
-// "SetExceptionBreakpointsResponse": {
-//   "allOf": [ { "$ref": "#/definitions/Response" }, {
-//     "type": "object",
-//     "description": "Response to 'setExceptionBreakpoints' request. This is
-//     just an acknowledgement, so no body field is required."
-//   }]
-// }
-void SetExceptionBreakpointsRequestHandler::operator()(
-    const llvm::json::Object &request) const {
-  llvm::json::Object response;
-  lldb::SBError error;
-  FillResponse(request, response);
-  const auto *arguments = request.getObject("arguments");
-  const auto *filters = arguments->getArray("filters");
+/// The request configures the debugger’s response to thrown exceptions. Each of
+/// the `filters`, `filterOptions`, and `exceptionOptions` in the request are
+/// independent configurations to a debug adapter indicating a kind of exception
+/// to catch. An exception thrown in a program should result in a `stopped`
+/// event from the debug adapter (with reason `exception`) if any of the
+/// configured filters match.
+///
+/// Clients should only call this request if the corresponding capability
+/// `exceptionBreakpointFilters` returns one or more filters.
+Expected<SetExceptionBreakpointsResponseBody>
+SetExceptionBreakpointsRequestHandler::Run(
+    const SetExceptionBreakpointsArguments &arguments) const {
   // Keep a list of any exception breakpoint filter names that weren't set
   // so we can clear any exception breakpoints if needed.
-  std::set<llvm::StringRef> unset_filters;
-  for (const auto &bp : *dap.exception_breakpoints)
+  std::set<StringRef> unset_filters;
+  for (const auto &bp : dap.exception_breakpoints)
     unset_filters.insert(bp.GetFilter());
 
-  for (const auto &value : *filters) {
-    const auto filter = GetAsString(value);
+  SetExceptionBreakpointsResponseBody body;
+  for (const auto &filter : arguments.filters) {
     auto *exc_bp = dap.GetExceptionBreakpoint(filter);
-    if (exc_bp) {
-      exc_bp->SetBreakpoint();
-      unset_filters.erase(std::string(filter));
-    }
+    if (!exc_bp)
+      continue;
+
+    body.breakpoints.push_back(exc_bp->SetBreakpoint());
+    unset_filters.erase(filter);
   }
+  for (const auto &filterOptions : arguments.filterOptions) {
+    auto *exc_bp = dap.GetExceptionBreakpoint(filterOptions.filterId);
+    if (!exc_bp)
+      continue;
+
+    body.breakpoints.push_back(exc_bp->SetBreakpoint(filterOptions.condition));
+    unset_filters.erase(filterOptions.filterId);
+  }
+
+  // Clear any unset filters.
   for (const auto &filter : unset_filters) {
     auto *exc_bp = dap.GetExceptionBreakpoint(filter);
-    if (exc_bp)
-      exc_bp->ClearBreakpoint();
+    if (!exc_bp)
+      continue;
+
+    exc_bp->ClearBreakpoint();
   }
-  dap.SendJSON(llvm::json::Value(std::move(response)));
+
+  return body;
 }
 
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index 6cdde63e9796..cf7db41559b8 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -482,15 +482,6 @@ llvm::json::Object CreateEventObject(const llvm::StringRef event_name) {
   return event;
 }
 
-protocol::ExceptionBreakpointsFilter
-CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp) {
-  protocol::ExceptionBreakpointsFilter filter;
-  filter.filter = bp.GetFilter();
-  filter.label = bp.GetLabel();
-  filter.defaultState = ExceptionBreakpoint::kDefaultValue;
-  return filter;
-}
-
 // "StackFrame": {
 //   "type": "object",
 //   "description": "A Stackframe contains the source location.",
diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h
index 10dc46b94184..69da0725bd05 100644
--- a/lldb/tools/lldb-dap/JSONUtils.h
+++ b/lldb/tools/lldb-dap/JSONUtils.h
@@ -224,18 +224,6 @@ llvm::json::Value CreateModule(lldb::SBTarget &target, lldb::SBModule &module,
 ///     definition outlined by Microsoft.
 llvm::json::Object CreateEventObject(const llvm::StringRef event_name);
 
-/// Create a "ExceptionBreakpointsFilter" JSON object as described in
-/// the debug adapter definition.
-///
-/// \param[in] bp
-///     The exception breakpoint object to use
-///
-/// \return
-///     A "ExceptionBreakpointsFilter" JSON object with that follows
-///     the formal JSON definition outlined by Microsoft.
-protocol::ExceptionBreakpointsFilter
-CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp);
-
 /// Create a "StackFrame" object for a LLDB frame object.
 ///
 /// This function will fill in the following keys in the returned
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
index 1b1891ba59e6..e6ba54ed4dcd 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
@@ -448,6 +448,20 @@ json::Value toJSON(const SetDataBreakpointsResponseBody &SDBR) {
   return json::Object{{"breakpoints", SDBR.breakpoints}};
 }
 
+bool fromJSON(const json::Value &Params, SetExceptionBreakpointsArguments &Args,
+              json::Path P) {
+  json::ObjectMapper O(Params, P);
+  return O && O.map("filters", Args.filters) &&
+         O.mapOptional("filterOptions", Args.filterOptions);
+}
+
+json::Value toJSON(const SetExceptionBreakpointsResponseBody &B) {
+  json::Object result;
+  if (!B.breakpoints.empty())
+    result.insert({"breakpoints", B.breakpoints});
+  return result;
+}
+
 json::Value toJSON(const ThreadsResponseBody &TR) {
   return json::Object{{"threads", TR.threads}};
 }
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index 583c203be8e1..01b8f2445c9f 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -751,6 +751,56 @@ struct SetDataBreakpointsResponseBody {
 };
 llvm::json::Value toJSON(const SetDataBreakpointsResponseBody &);
 
+/// Arguments for `setExceptionBreakpoints` request.
+struct SetExceptionBreakpointsArguments {
+  /// Set of exception filters specified by their ID. The set of all possible
+  /// exception filters is defined by the `exceptionBreakpointFilters`
+  /// capability. The `filter` and `filterOptions` sets are additive.
+  std::vector<std::string> filters;
+
+  /// Set of exception filters and their options. The set of all possible
+  /// exception filters is defined by the `exceptionBreakpointFilters`
+  /// capability. This attribute is only honored by a debug adapter if the
+  /// corresponding capability `supportsExceptionFilterOptions` is true. The
+  /// `filter` and `filterOptions` sets are additive.
+  std::vector<ExceptionFilterOptions> filterOptions;
+
+  // unsupported keys: exceptionOptions
+};
+bool fromJSON(const llvm::json::Value &, SetExceptionBreakpointsArguments &,
+              llvm::json::Path);
+
+/// Response to `setExceptionBreakpoints` request.
+///
+/// The response contains an array of `Breakpoint` objects with information
+/// about each exception breakpoint or filter. The `Breakpoint` objects are in
+/// the same order as the elements of the `filters`, `filterOptions`,
+/// `exceptionOptions` arrays given as arguments. If both `filters` and
+/// `filterOptions` are given, the returned array must start with `filters`
+/// information first, followed by `filterOptions` information.
+///
+/// The `verified` property of a `Breakpoint` object signals whether the
+/// exception breakpoint or filter could be successfully created and whether the
+/// condition is valid. In case of an error the `message` property explains the
+/// problem. The `id` property can be used to introduce a unique ID for the
+/// exception breakpoint or filter so that it can be updated subsequently by
+/// sending breakpoint events.
+///
+/// For backward compatibility both the `breakpoints` array and the enclosing
+/// `body` are optional. If these elements are missing a client is not able to
+/// show problems for individual exception breakpoints or filters.
+struct SetExceptionBreakpointsResponseBody {
+  /// Information about the exception breakpoints or filters.
+  ///
+  /// The breakpoints returned are in the same order as the elements of the
+  /// `filters`, `filterOptions`, `exceptionOptions` arrays in the arguments. If
+  /// both `filters` and `filterOptions` are given, the returned array must
+  /// start with `filters` information first, followed by `filterOptions`
+  /// information.
+  std::vector<Breakpoint> breakpoints;
+};
+llvm::json::Value toJSON(const SetExceptionBreakpointsResponseBody &);
+
 /// Arguments to `disassemble` request.
 struct DisassembleArguments {
   /// Memory reference to the base location containing the instructions to
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
index c21f8382320a..7f96c07faae1 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
@@ -86,14 +86,14 @@ bool fromJSON(const llvm::json::Value &Params, ExceptionBreakpointsFilter &EBF,
 json::Value toJSON(const ExceptionBreakpointsFilter &EBF) {
   json::Object result{{"filter", EBF.filter}, {"label", EBF.label}};
 
-  if (EBF.description)
-    result.insert({"description", *EBF.description});
+  if (!EBF.description.empty())
+    result.insert({"description", EBF.description});
   if (EBF.defaultState)
-    result.insert({"default", *EBF.defaultState});
+    result.insert({"default", EBF.defaultState});
   if (EBF.supportsCondition)
-    result.insert({"supportsCondition", *EBF.supportsCondition});
-  if (EBF.conditionDescription)
-    result.insert({"conditionDescription", *EBF.conditionDescription});
+    result.insert({"supportsCondition", EBF.supportsCondition});
+  if (!EBF.conditionDescription.empty())
+    result.insert({"conditionDescription", EBF.conditionDescription});
 
   return result;
 }
@@ -418,23 +418,41 @@ json::Value toJSON(const Capabilities &C) {
   for (const auto &feature : C.supportedFeatures)
     result.insert({ToString(feature), true});
 
-  if (C.exceptionBreakpointFilters && !C.exceptionBreakpointFilters->empty())
+  if (!C.exceptionBreakpointFilters.empty())
+    result.insert({"exceptionBreakpointFilters", C.exceptionBreakpointFilters});
+  if (!C.completionTriggerCharacters.empty())
     result.insert(
-        {"exceptionBreakpointFilters", *C.exceptionBreakpointFilters});
-  if (C.completionTriggerCharacters && !C.completionTriggerCharacters->empty())
+        {"completionTriggerCharacters", C.completionTriggerCharacters});
+  if (!C.additionalModuleColumns.empty())
+    result.insert({"additionalModuleColumns", C.additionalModuleColumns});
+  if (!C.supportedChecksumAlgorithms.empty())
     result.insert(
-        {"completionTriggerCharacters", *C.completionTriggerCharacters});
-  if (C.additionalModuleColumns && !C.additionalModuleColumns->empty())
-    result.insert({"additionalModuleColumns", *C.additionalModuleColumns});
-  if (C.supportedChecksumAlgorithms && !C.supportedChecksumAlgorithms->empty())
-    result.insert(
-        {"supportedChecksumAlgorithms", *C.supportedChecksumAlgorithms});
-  if (C.breakpointModes && !C.breakpointModes->empty())
-    result.insert({"breakpointModes", *C.breakpointModes});
+        {"supportedChecksumAlgorithms", C.supportedChecksumAlgorithms});
+  if (!C.breakpointModes.empty())
+    result.insert({"breakpointModes", C.breakpointModes});
 
   // lldb-dap extensions
-  if (C.lldbExtVersion && !C.lldbExtVersion->empty())
-    result.insert({"$__lldb_version", *C.lldbExtVersion});
+  if (!C.lldbExtVersion.empty())
+    result.insert({"$__lldb_version", C.lldbExtVersion});
+
+  return result;
+}
+
+bool fromJSON(const json::Value &Params, ExceptionFilterOptions &EFO,
+              json::Path P) {
+  json::ObjectMapper O(Params, P);
+  return O && O.map("filterId", EFO.filterId) &&
+         O.mapOptional("condition", EFO.condition) &&
+         O.mapOptional("mode", EFO.mode);
+}
+
+json::Value toJSON(const ExceptionFilterOptions &EFO) {
+  json::Object result{{"filterId", EFO.filterId}};
+
+  if (!EFO.condition.empty())
+    result.insert({"condition", EFO.condition});
+  if (!EFO.mode.empty())
+    result.insert({"mode", EFO.mode});
 
   return result;
 }
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
index d7094fbab9e5..7fe745411399 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
@@ -43,19 +43,19 @@ struct ExceptionBreakpointsFilter {
 
   /// A help text providing additional information about the exception filter.
   /// This string is typically shown as a hover and can be translated.
-  std::optional<std::string> description;
+  std::string description;
 
   /// Initial value of the filter option. If not specified a value false is
   /// assumed.
-  std::optional<bool> defaultState;
+  bool defaultState = false;
 
   /// Controls whether a condition can be specified for this filter option. If
   /// false or missing, a condition can not be set.
-  std::optional<bool> supportsCondition;
+  bool supportsCondition = false;
 
   /// A help text providing information about the condition. This string is
   /// shown as the placeholder text for a text box and can be translated.
-  std::optional<std::string> conditionDescription;
+  std::string conditionDescription;
 };
 bool fromJSON(const llvm::json::Value &, ExceptionBreakpointsFilter &,
               llvm::json::Path);
@@ -253,18 +253,17 @@ struct Capabilities {
 
   /// Available exception filter options for the `setExceptionBreakpoints`
   /// request.
-  std::optional<std::vector<ExceptionBreakpointsFilter>>
-      exceptionBreakpointFilters;
+  std::vector<ExceptionBreakpointsFilter> exceptionBreakpointFilters;
 
   /// The set of characters that should trigger completion in a REPL. If not
   /// specified, the UI should assume the `.` character.
-  std::optional<std::vector<std::string>> completionTriggerCharacters;
+  std::vector<std::string> completionTriggerCharacters;
 
   /// The set of additional module information exposed by the debug adapter.
-  std::optional<std::vector<ColumnDescriptor>> additionalModuleColumns;
+  std::vector<ColumnDescriptor> additionalModuleColumns;
 
   /// Checksum algorithms supported by the debug adapter.
-  std::optional<std::vector<ChecksumAlgorithm>> supportedChecksumAlgorithms;
+  std::vector<ChecksumAlgorithm> supportedChecksumAlgorithms;
 
   /// Modes of breakpoints supported by the debug adapter, such as 'hardware' or
   /// 'software'. If present, the client may allow the user to select a mode and
@@ -272,19 +271,39 @@ struct Capabilities {
   ///
   /// Clients may present the first applicable mode in this array as the
   /// 'default' mode in gestures that set breakpoints.
-  std::optional<std::vector<BreakpointMode>> breakpointModes;
+  std::vector<BreakpointMode> breakpointModes;
 
   /// lldb-dap Extensions
   /// @{
 
   /// The version of the adapter.
-  std::optional<std::string> lldbExtVersion;
+  std::string lldbExtVersion;
 
   /// @}
 };
 bool fromJSON(const llvm::json::Value &, Capabilities &, llvm::json::Path);
 llvm::json::Value toJSON(const Capabilities &);
 
+/// An `ExceptionFilterOptions` is used to specify an exception filter together
+/// with a condition for the `setExceptionBreakpoints` request.
+struct ExceptionFilterOptions {
+  /// ID of an exception filter returned by the `exceptionBreakpointFilters`
+  /// capability.
+  std::string filterId;
+
+  /// An expression for conditional exceptions.
+  /// The exception breaks into the debugger if the result of the condition is
+  /// true.
+  std::string condition;
+
+  /// The mode of this exception breakpoint. If defined, this must be one of the
+  /// `breakpointModes` the debug adapter advertised in its `Capabilities`.
+  std::string mode;
+};
+bool fromJSON(const llvm::json::Value &, ExceptionFilterOptions &,
+              llvm::json::Path);
+llvm::json::Value toJSON(const ExceptionFilterOptions &);
+
 /// A `Source` is a descriptor for source code. It is returned from the debug
 /// adapter as part of a `StackFrame` and it is used by clients when specifying
 /// breakpoints.
diff --git a/lldb/tools/lldb-dap/ProtocolUtils.cpp b/lldb/tools/lldb-dap/ProtocolUtils.cpp
index 6e0adf5bc8b5..cb1ee6a42400 100644
--- a/lldb/tools/lldb-dap/ProtocolUtils.cpp
+++ b/lldb/tools/lldb-dap/ProtocolUtils.cpp
@@ -161,4 +161,15 @@ std::vector<protocol::Thread> GetThreads(lldb::SBProcess process,
   return threads;
 }
 
+protocol::ExceptionBreakpointsFilter
+CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp) {
+  protocol::ExceptionBreakpointsFilter filter;
+  filter.filter = bp.GetFilter();
+  filter.label = bp.GetLabel();
+  filter.description = bp.GetLabel();
+  filter.defaultState = ExceptionBreakpoint::kDefaultValue;
+  filter.supportsCondition = true;
+  return filter;
+}
+
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/ProtocolUtils.h b/lldb/tools/lldb-dap/ProtocolUtils.h
index 2b2ac9e8e35f..788d2fd054e2 100644
--- a/lldb/tools/lldb-dap/ProtocolUtils.h
+++ b/lldb/tools/lldb-dap/ProtocolUtils.h
@@ -13,6 +13,7 @@
 #ifndef LLDB_TOOLS_LLDB_DAP_PROTOCOL_PROTOCOL_UTILS_H
 #define LLDB_TOOLS_LLDB_DAP_PROTOCOL_PROTOCOL_UTILS_H
 
+#include "ExceptionBreakpoint.h"
 #include "Protocol/ProtocolTypes.h"
 
 #include "lldb/API/SBAddress.h"
@@ -74,6 +75,18 @@ protocol::Thread CreateThread(lldb::SBThread &thread, lldb::SBFormat &format);
 std::vector<protocol::Thread> GetThreads(lldb::SBProcess process,
                                          lldb::SBFormat &format);
 
+/// Create a "ExceptionBreakpointsFilter" JSON object as described in
+/// the debug adapter definition.
+///
+/// \param[in] bp
+///     The exception breakpoint object to use
+///
+/// \return
+///     A "ExceptionBreakpointsFilter" JSON object with that follows
+///     the formal JSON definition outlined by Microsoft.
+protocol::ExceptionBreakpointsFilter
+CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp);
+
 } // namespace lldb_dap
 
 #endif
diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp
index f2a23db34656..46a09f090fea 100644
--- a/lldb/unittests/DAP/ProtocolTypesTest.cpp
+++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp
@@ -243,14 +243,12 @@ TEST(ProtocolTypesTest, Capabilities) {
             deserialized_capabilities->supportedFeatures);
 
   // Verify exception breakpoint filters.
-  ASSERT_TRUE(
-      deserialized_capabilities->exceptionBreakpointFilters.has_value());
-  EXPECT_EQ(capabilities.exceptionBreakpointFilters->size(),
-            deserialized_capabilities->exceptionBreakpointFilters->size());
-  for (size_t i = 0; i < capabilities.exceptionBreakpointFilters->size(); ++i) {
-    const auto &original = capabilities.exceptionBreakpointFilters->at(i);
+  EXPECT_EQ(capabilities.exceptionBreakpointFilters.size(),
+            deserialized_capabilities->exceptionBreakpointFilters.size());
+  for (size_t i = 0; i < capabilities.exceptionBreakpointFilters.size(); ++i) {
+    const auto &original = capabilities.exceptionBreakpointFilters.at(i);
     const auto &deserialized =
-        deserialized_capabilities->exceptionBreakpointFilters->at(i);
+        deserialized_capabilities->exceptionBreakpointFilters.at(i);
     EXPECT_EQ(original.filter, deserialized.filter);
     EXPECT_EQ(original.label, deserialized.label);
     EXPECT_EQ(original.description, deserialized.description);
@@ -260,19 +258,16 @@ TEST(ProtocolTypesTest, Capabilities) {
   }
 
   // Verify completion trigger characters.
-  ASSERT_TRUE(
-      deserialized_capabilities->completionTriggerCharacters.has_value());
   EXPECT_EQ(capabilities.completionTriggerCharacters,
             deserialized_capabilities->completionTriggerCharacters);
 
   // Verify additional module columns.
-  ASSERT_TRUE(deserialized_capabilities->additionalModuleColumns.has_value());
-  EXPECT_EQ(capabilities.additionalModuleColumns->size(),
-            deserialized_capabilities->additionalModuleColumns->size());
-  for (size_t i = 0; i < capabilities.additionalModuleColumns->size(); ++i) {
-    const auto &original = capabilities.additionalModuleColumns->at(i);
+  EXPECT_EQ(capabilities.additionalModuleColumns.size(),
+            deserialized_capabilities->additionalModuleColumns.size());
+  for (size_t i = 0; i < capabilities.additionalModuleColumns.size(); ++i) {
+    const auto &original = capabilities.additionalModuleColumns.at(i);
     const auto &deserialized =
-        deserialized_capabilities->additionalModuleColumns->at(i);
+        deserialized_capabilities->additionalModuleColumns.at(i);
     EXPECT_EQ(original.attributeName, deserialized.attributeName);
     EXPECT_EQ(original.label, deserialized.label);
     EXPECT_EQ(original.format, deserialized.format);
@@ -281,19 +276,15 @@ TEST(ProtocolTypesTest, Capabilities) {
   }
 
   // Verify supported checksum algorithms.
-  ASSERT_TRUE(
-      deserialized_capabilities->supportedChecksumAlgorithms.has_value());
   EXPECT_EQ(capabilities.supportedChecksumAlgorithms,
             deserialized_capabilities->supportedChecksumAlgorithms);
 
   // Verify breakpoint modes.
-  ASSERT_TRUE(deserialized_capabilities->breakpointModes.has_value());
-  EXPECT_EQ(capabilities.breakpointModes->size(),
-            deserialized_capabilities->breakpointModes->size());
-  for (size_t i = 0; i < capabilities.breakpointModes->size(); ++i) {
-    const auto &original = capabilities.breakpointModes->at(i);
-    const auto &deserialized =
-        deserialized_capabilities->breakpointModes->at(i);
+  EXPECT_EQ(capabilities.breakpointModes.size(),
+            deserialized_capabilities->breakpointModes.size());
+  for (size_t i = 0; i < capabilities.breakpointModes.size(); ++i) {
+    const auto &original = capabilities.breakpointModes.at(i);
+    const auto &deserialized = deserialized_capabilities->breakpointModes.at(i);
     EXPECT_EQ(original.mode, deserialized.mode);
     EXPECT_EQ(original.label, deserialized.label);
     EXPECT_EQ(original.description, deserialized.description);
@@ -301,7 +292,6 @@ TEST(ProtocolTypesTest, Capabilities) {
   }
 
   // Verify lldb extension version.
-  ASSERT_TRUE(deserialized_capabilities->lldbExtVersion.has_value());
   EXPECT_EQ(capabilities.lldbExtVersion,
             deserialized_capabilities->lldbExtVersion);
 }
@@ -687,6 +677,75 @@ TEST(ProtocolTypesTest, CapabilitiesEventBody) {
   EXPECT_EQ(json, pp(body));
 }
 
+TEST(ProtocolTypesTest, ExceptionFilterOptions) {
+  EXPECT_THAT_EXPECTED(parse<ExceptionFilterOptions>(R"({"filterId":"id"})"),
+                       HasValue(Value(ExceptionFilterOptions{
+                           /*filterId=*/"id", /*condition=*/"", /*mode*/ ""})));
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(R"({"filterId":"id","condition":"1+2"})"),
+      HasValue(Value(ExceptionFilterOptions{
+          /*filterId=*/"id", /*condition=*/"1+2", /*mode*/ ""})));
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(
+          R"({"filterId":"id","condition":"1+2","mode":"m"})"),
+      HasValue(Value(ExceptionFilterOptions{
+          /*filterId=*/"id", /*condition=*/"1+2", /*mode*/ "m"})));
+
+  // Validate parsing errors
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(R"({})", "exceptionFilterOptions"),
+      FailedWithMessage("missing value at exceptionFilterOptions.filterId"));
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(R"({"filterId":"id","condition":42})",
+                                    "exceptionFilterOptions"),
+      FailedWithMessage("expected string at exceptionFilterOptions.condition"));
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(R"({"filterId":"id","mode":42})",
+                                    "exceptionFilterOptions"),
+      FailedWithMessage("expected string at exceptionFilterOptions.mode"));
+}
+
+TEST(ProtocolTypesTest, SetExceptionBreakpointsArguments) {
+  EXPECT_THAT_EXPECTED(
+      parse<SetExceptionBreakpointsArguments>(R"({"filters":[]})"),
+      HasValue(testing::FieldsAre(/*filters=*/testing::IsEmpty(),
+                                  /*filterOptions=*/testing::IsEmpty())));
+  EXPECT_THAT_EXPECTED(
+      parse<SetExceptionBreakpointsArguments>(R"({"filters":["abc"]})"),
+      HasValue(testing::FieldsAre(/*filters=*/std::vector<std::string>{"abc"},
+                                  /*filterOptions=*/testing::IsEmpty())));
+  EXPECT_THAT_EXPECTED(
+      parse<SetExceptionBreakpointsArguments>(
+          R"({"filters":[],"filterOptions":[{"filterId":"abc"}]})"),
+      HasValue(testing::FieldsAre(
+          /*filters=*/testing::IsEmpty(),
+          /*filterOptions=*/testing::Contains(testing::FieldsAre(
+              /*filterId=*/"abc", /*condition=*/"", /*mode=*/"")))));
+
+  // Validate parse errors
+  EXPECT_THAT_EXPECTED(parse<SetExceptionBreakpointsArguments>(R"({})"),
+                       FailedWithMessage("missing value at (root).filters"));
+  EXPECT_THAT_EXPECTED(
+      parse<SetExceptionBreakpointsArguments>(R"({"filters":false})"),
+      FailedWithMessage("expected array at (root).filters"));
+}
+
+TEST(ProtocolTypesTest, SetExceptionBreakpointsResponseBody) {
+  SetExceptionBreakpointsResponseBody body;
+  Breakpoint bp;
+  bp.id = 12, bp.verified = true;
+  body.breakpoints = {bp};
+  EXPECT_EQ(R"({
+  "breakpoints": [
+    {
+      "id": 12,
+      "verified": true
+    }
+  ]
+})",
+            pp(body));
+}
+
 TEST(ProtocolTypesTest, StepInTarget) {
   StepInTarget target;
   target.id = 230;
@@ -705,4 +764,4 @@ TEST(ProtocolTypesTest, StepInTarget) {
   EXPECT_EQ(target.column, deserialized_target->column);
   EXPECT_EQ(target.endLine, deserialized_target->endLine);
   EXPECT_EQ(target.endColumn, deserialized_target->endColumn);
-}
\ No newline at end of file
+}

From 97bfb936af4077e8cb6c75664231f27a9989d563 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:42:53 +0900
Subject: [PATCH 0569/1322] DAG: Move soft float predicate management into
 RuntimeLibcalls (#142905)

Work towards making RuntimeLibcalls the centralized location for
all libcall information. This requires changing the encoding from
tracking the ISD::CondCode to using CmpInst::Predicate.
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  14 +-
 llvm/include/llvm/IR/RuntimeLibcalls.h        |  25 +++
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   5 +-
 llvm/lib/IR/RuntimeLibcalls.cpp               |  33 ++++
 llvm/lib/Target/ARM/ARMISelLowering.cpp       | 182 +++++++++---------
 5 files changed, 160 insertions(+), 99 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4ed81d25e8e2..dd44afd0855a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3574,20 +3574,18 @@ public:
 
   /// Override the default CondCode to be used to test the result of the
   /// comparison libcall against zero.
-  /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD.
-  void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) {
-    CmpLibcallCCs[Call] = CC;
+  /// FIXME: This should be removed
+  void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) {
+    Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred);
   }
 
-
   /// Get the CondCode that's to be used to test the result of the comparison
   /// libcall against zero.
-  /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD.
-  ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
-    return CmpLibcallCCs[Call];
+  CmpInst::Predicate
+  getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const {
+    return Libcalls.getSoftFloatCmpLibcallPredicate(Call);
   }
 
-
   /// Set the CallingConv that should be used for the specified libcall.
   void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
     Libcalls.setLibcallCallingConv(Call, CC);
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 051bcc147cb7..45826fcd19f3 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
@@ -86,6 +87,20 @@ struct RuntimeLibcallsInfo {
     return ArrayRef(LibcallRoutineNames).drop_back();
   }
 
+  /// Get the comparison predicate that's to be used to test the result of the
+  /// comparison libcall against zero. This should only be used with
+  /// floating-point compare libcalls.
+  CmpInst::Predicate
+  getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const {
+    return SoftFloatCompareLibcallPredicates[Call];
+  }
+
+  // FIXME: This should be removed. This should be private constant.
+  void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call,
+                                       CmpInst::Predicate Pred) {
+    SoftFloatCompareLibcallPredicates[Call] = Pred;
+  }
+
 private:
   /// Stores the name each libcall.
   const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
@@ -93,6 +108,14 @@ private:
   /// Stores the CallingConv that should be used for each libcall.
   CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
 
+  /// The condition type that should be used to test the result of each of the
+  /// soft floating-point comparison libcall against integer zero.
+  ///
+  // FIXME: This is only relevant for the handful of floating-point comparison
+  // runtime calls; it's excessive to have a table entry for every single
+  // opcode.
+  CmpInst::Predicate SoftFloatCompareLibcallPredicates[RTLIB::UNKNOWN_LIBCALL];
+
   static bool darwinHasSinCos(const Triple &TT) {
     assert(TT.isOSDarwin() && "should be called with darwin triple");
     // Don't bother with 32 bit x86.
@@ -108,6 +131,8 @@ private:
     return true;
   }
 
+  void initSoftFloatCmpLibcallPredicates();
+
   /// Set default libcall names. If a target wants to opt-out of a libcall it
   /// should be placed here.
   LLVM_ABI void initLibcalls(const Triple &TT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a0ffb4b6d5a4..52f19cc6e1ab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -14,6 +14,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/CodeGenCommonISel.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -419,7 +420,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
   NewLHS = Call.first;
   NewRHS = DAG.getConstant(0, dl, RetVT);
 
-  CCCode = getCmpLibcallCC(LC1);
+  CCCode = getICmpCondCode(getSoftFloatCmpLibcallPredicate(LC1));
   if (ShouldInvertCC) {
     assert(RetVT.isInteger());
     CCCode = getSetCCInverse(CCCode, RetVT);
@@ -441,7 +442,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
 
     SDValue Tmp = DAG.getSetCC(dl, SetCCVT, NewLHS, NewRHS, CCCode);
     auto Call2 = makeLibCall(DAG, LC2, RetVT, Ops, CallOptions, dl, Chain);
-    CCCode = getCmpLibcallCC(LC2);
+    CCCode = getICmpCondCode(getSoftFloatCmpLibcallPredicate(LC2));
     if (ShouldInvertCC)
       CCCode = getSetCCInverse(CCCode, RetVT);
     NewLHS = DAG.getSetCC(dl, SetCCVT, Call2.first, NewRHS, CCCode);
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index d63d398e243f..8506a0c03d33 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -220,12 +220,45 @@ static void setMSP430Libcalls(RuntimeLibcallsInfo &Info, const Triple &TT) {
   // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
 }
 
+void RuntimeLibcallsInfo::initSoftFloatCmpLibcallPredicates() {
+  SoftFloatCompareLibcallPredicates[RTLIB::OEQ_F32] = CmpInst::ICMP_EQ;
+  SoftFloatCompareLibcallPredicates[RTLIB::OEQ_F64] = CmpInst::ICMP_EQ;
+  SoftFloatCompareLibcallPredicates[RTLIB::OEQ_F128] = CmpInst::ICMP_EQ;
+  SoftFloatCompareLibcallPredicates[RTLIB::OEQ_PPCF128] = CmpInst::ICMP_EQ;
+  SoftFloatCompareLibcallPredicates[RTLIB::UNE_F32] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UNE_F64] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UNE_F128] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UNE_PPCF128] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGE_F32] = CmpInst::ICMP_SGE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGE_F64] = CmpInst::ICMP_SGE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGE_F128] = CmpInst::ICMP_SGE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGE_PPCF128] = CmpInst::ICMP_SGE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLT_F32] = CmpInst::ICMP_SLT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLT_F64] = CmpInst::ICMP_SLT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLT_F128] = CmpInst::ICMP_SLT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLT_PPCF128] = CmpInst::ICMP_SLT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLE_F32] = CmpInst::ICMP_SLE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLE_F64] = CmpInst::ICMP_SLE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLE_F128] = CmpInst::ICMP_SLE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLE_PPCF128] = CmpInst::ICMP_SLE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGT_F32] = CmpInst::ICMP_SGT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGT_F64] = CmpInst::ICMP_SGT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGT_F128] = CmpInst::ICMP_SGT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGT_PPCF128] = CmpInst::ICMP_SGT;
+  SoftFloatCompareLibcallPredicates[RTLIB::UO_F32] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UO_F64] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UO_F128] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UO_PPCF128] = CmpInst::ICMP_NE;
+}
+
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
   std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames),
             nullptr);
 
+  initSoftFloatCmpLibcallPredicates();
+
 #define HANDLE_LIBCALL(code, name) setLibcallName(RTLIB::code, name);
 #define LIBCALL_NO_NAME nullptr
 #include "llvm/IR/RuntimeLibcalls.def"
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 5b3664c4e961..05d8a1190ada 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -522,67 +522,69 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     // Uses VFP for Thumb libfuncs if available.
     if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
+      // clang-format off
       static const struct {
         const RTLIB::Libcall Op;
         const char * const Name;
-        const ISD::CondCode Cond;
+        const CmpInst::Predicate Cond;
       } LibraryCalls[] = {
         // Single-precision floating-point arithmetic.
-        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::ADD_F32, "__addsf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::SUB_F32, "__subsf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::MUL_F32, "__mulsf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::DIV_F32, "__divsf3vfp", CmpInst::BAD_ICMP_PREDICATE },
 
         // Double-precision floating-point arithmetic.
-        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::ADD_F64, "__adddf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::SUB_F64, "__subdf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::MUL_F64, "__muldf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::DIV_F64, "__divdf3vfp", CmpInst::BAD_ICMP_PREDICATE },
 
         // Single-precision comparisons.
-        { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
-        { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
-        { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
-        { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
-        { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
-        { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
-        { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
+        { RTLIB::OEQ_F32, "__eqsf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::UNE_F32, "__nesf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OLT_F32, "__ltsf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OLE_F32, "__lesf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OGE_F32, "__gesf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OGT_F32, "__gtsf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::UO_F32,  "__unordsf2vfp", CmpInst::ICMP_NE },
 
         // Double-precision comparisons.
-        { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
-        { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
-        { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
-        { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
-        { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
-        { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
-        { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
+        { RTLIB::OEQ_F64, "__eqdf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::UNE_F64, "__nedf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OLT_F64, "__ltdf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OLE_F64, "__ledf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OGE_F64, "__gedf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OGT_F64, "__gtdf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::UO_F64,  "__unorddf2vfp", CmpInst::ICMP_NE },
 
         // Floating-point to integer conversions.
         // i64 conversions are done via library routines even when generating VFP
         // instructions, so use the same ones.
-        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
-        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
-        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
-        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
+        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", CmpInst::BAD_ICMP_PREDICATE },
 
         // Conversions between floating types.
-        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
-        { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
+        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", CmpInst::BAD_ICMP_PREDICATE },
 
         // Integer to floating-point conversions.
         // i64 conversions are done via library routines even when generating VFP
         // instructions, so use the same ones.
         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
         // e.g., __floatunsidf vs. __floatunssidfvfp.
-        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
-        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
-        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
-        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
+        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", CmpInst::BAD_ICMP_PREDICATE },
       };
+      // clang-format on
 
       for (const auto &LC : LibraryCalls) {
         setLibcallName(LC.Op, LC.Name);
-        if (LC.Cond != ISD::SETCC_INVALID)
+        if (LC.Cond != CmpInst::BAD_ICMP_PREDICATE)
           setCmpLibcallCC(LC.Op, LC.Cond);
       }
     }
@@ -592,97 +594,99 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   if (Subtarget->isAAPCS_ABI() &&
       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
+    // clang-format off
     static const struct {
       const RTLIB::Libcall Op;
       const char * const Name;
       const CallingConv::ID CC;
-      const ISD::CondCode Cond;
+      const CmpInst::Predicate Cond;
     } LibraryCalls[] = {
       // Double-precision floating-point arithmetic helper functions
       // RTABI chapter 4.1.2, Table 2
-      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Double-precision floating-point comparison helper functions
       // RTABI chapter 4.1.2, Table 3
-      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
-      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_EQ },
+      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Single-precision floating-point arithmetic helper functions
       // RTABI chapter 4.1.2, Table 4
-      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Single-precision floating-point comparison helper functions
       // RTABI chapter 4.1.2, Table 5
-      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
-      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_EQ },
+      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Floating-point to integer conversions.
       // RTABI chapter 4.1.2, Table 6
-      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Conversions between floating types.
       // RTABI chapter 4.1.2, Table 7
-      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Integer to floating-point conversions.
       // RTABI chapter 4.1.2, Table 8
-      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Long long helper functions
       // RTABI chapter 4.2, Table 9
-      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Integer division functions
       // RTABI chapter 4.3.1
-      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
     };
+    // clang-format on
 
     for (const auto &LC : LibraryCalls) {
       setLibcallName(LC.Op, LC.Name);
       setLibcallCallingConv(LC.Op, LC.CC);
-      if (LC.Cond != ISD::SETCC_INVALID)
+      if (LC.Cond != CmpInst::BAD_ICMP_PREDICATE)
         setCmpLibcallCC(LC.Op, LC.Cond);
     }
 

From 1ffd9f553ccba27c0def5f38e7928af8f3976bac Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:46:17 +0900
Subject: [PATCH 0570/1322] RuntimeLibcalls: Cleanup sincos predicate functions
 (#143081)

The darwinHasSinCos wasn't actually used for sincos, only the stret
variant. Rename this to reflect that, and introduce a new one for
enabling sincos.
---
 llvm/include/llvm/IR/RuntimeLibcalls.h | 8 +++++++-
 llvm/lib/IR/RuntimeLibcalls.cpp        | 5 ++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 45826fcd19f3..d8f467e30fa6 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -116,7 +116,7 @@ private:
   // opcode.
   CmpInst::Predicate SoftFloatCompareLibcallPredicates[RTLIB::UNKNOWN_LIBCALL];
 
-  static bool darwinHasSinCos(const Triple &TT) {
+  static bool darwinHasSinCosStret(const Triple &TT) {
     assert(TT.isOSDarwin() && "should be called with darwin triple");
     // Don't bother with 32 bit x86.
     if (TT.getArch() == Triple::x86)
@@ -131,6 +131,12 @@ private:
     return true;
   }
 
+  /// Return true if the target has sincosf/sincos/sincosl functions
+  static bool hasSinCos(const Triple &TT) {
+    return TT.isGNUEnvironment() || TT.isOSFuchsia() ||
+           (TT.isAndroid() && !TT.isAndroidVersionLT(9));
+  }
+
   void initSoftFloatCmpLibcallPredicates();
 
   /// Set default libcall names. If a target wants to opt-out of a libcall it
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 8506a0c03d33..882f0db193b5 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -412,7 +412,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
       break;
     }
 
-    if (darwinHasSinCos(TT)) {
+    if (darwinHasSinCosStret(TT)) {
       setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret");
       setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret");
       if (TT.isWatchABI()) {
@@ -456,8 +456,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     setLibcallName(RTLIB::EXP10_F64, "__exp10");
   }
 
-  if (TT.isGNUEnvironment() || TT.isOSFuchsia() ||
-      (TT.isAndroid() && !TT.isAndroidVersionLT(9))) {
+  if (hasSinCos(TT)) {
     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
     setLibcallName(RTLIB::SINCOS_F64, "sincos");
     setLibcallName(RTLIB::SINCOS_F80, "sincosl");

From 9bd234a4330c6882f23ebf1f7861c5ec97e74d95 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:49:55 +0900
Subject: [PATCH 0571/1322] AArch64: Move outline atomic libcalls configuration
 (#144374)

This de-conditionalizes the setting of the libcall names
on outlineAtomics() && !hasLSE(). The existence of the
libcall is a module level property, which cannot depend on the
subtarget so this is fine. It's better if the initial list of
calls has more entries than will be used than to have missing
ones. There aren't any alternative names set, so this is also
fine.

Currently RuntimeLibcallsInfo conflates the existence of the calls
with the lowering usage decision, so this suboptimally will report
the libcall name on subtargets that should not use the calls. This
doesn't matter in this case though, as the atomic lowering actions
are already separately controlled and aren't based on decisions on
libcall availability. We could be paranoid and clear the names in
TargetLowering.

Also fixes not catching all aarch64 triples in the RuntimeLibcallsInfo
construction; the previous check missed aarch64_be.
---
 llvm/lib/IR/RuntimeLibcalls.cpp               | 24 ++++++++++++++++++-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 21 ----------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 882f0db193b5..cb64426a111d 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -18,6 +18,28 @@ static cl::opt<bool>
 
 static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
                                    const Triple &TT) {
+#define LCALLNAMES(A, B, N)                                                    \
+  Info.setLibcallName(A##N##_RELAX, #B #N "_relax");                           \
+  Info.setLibcallName(A##N##_ACQ, #B #N "_acq");                               \
+  Info.setLibcallName(A##N##_REL, #B #N "_rel");                               \
+  Info.setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
+#define LCALLNAME4(A, B)                                                       \
+  LCALLNAMES(A, B, 1)                                                          \
+  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
+#define LCALLNAME5(A, B)                                                       \
+  LCALLNAMES(A, B, 1)                                                          \
+  LCALLNAMES(A, B, 2)                                                          \
+  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
+  LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
+#undef LCALLNAMES
+#undef LCALLNAME4
+#undef LCALLNAME5
+
   if (TT.isWindowsArm64EC()) {
     // FIXME: are there calls we need to exclude from this?
 #define HANDLE_LIBCALL(code, name)                                             \
@@ -520,7 +542,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     }
   }
 
-  if (TT.getArch() == Triple::ArchType::aarch64)
+  if (TT.isAArch64())
     setAArch64LibcallNames(*this, TT);
   else if (TT.isARM() || TT.isThumb())
     setARMLibcallNames(*this, TT);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c86aed7b38c8..c7ffc39b5b16 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -959,27 +959,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
-#define LCALLNAMES(A, B, N)                                                    \
-  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
-  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
-  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
-  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
-#define LCALLNAME4(A, B)                                                       \
-  LCALLNAMES(A, B, 1)                                                          \
-  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
-#define LCALLNAME5(A, B)                                                       \
-  LCALLNAMES(A, B, 1)                                                          \
-  LCALLNAMES(A, B, 2)                                                          \
-  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
-    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
-#undef LCALLNAMES
-#undef LCALLNAME4
-#undef LCALLNAME5
   }
 
   if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {

From 24631e5440eed3093dfb52e7a631504b71845923 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:52:50 +0900
Subject: [PATCH 0572/1322] AArch64: Fix outline atomic libcall names for
 arm64ec (#144378)

Add a missing # prefix to each libcall name
---
 llvm/lib/IR/RuntimeLibcalls.cpp               | 14 +++++++++++---
 llvm/test/CodeGen/AArch64/arm64ec-builtins.ll |  3 +--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index cb64426a111d..5d22b41e28aa 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -36,9 +36,6 @@ static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
   LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
   LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
   LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
-#undef LCALLNAMES
-#undef LCALLNAME4
-#undef LCALLNAME5
 
   if (TT.isWindowsArm64EC()) {
     // FIXME: are there calls we need to exclude from this?
@@ -54,7 +51,18 @@ static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
 #include "llvm/IR/RuntimeLibcalls.def"
 #undef HANDLE_LIBCALL
 #undef LIBCALL_NO_NAME
+
+    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, #__aarch64_cas)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, #__aarch64_swp)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, #__aarch64_ldadd)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, #__aarch64_ldset)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, #__aarch64_ldclr)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, #__aarch64_ldeor)
   }
+
+#undef LCALLNAMES
+#undef LCALLNAME4
+#undef LCALLNAME5
 }
 
 static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
index 92b95a90d89a..cc4ec9c2eebd 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
@@ -28,10 +28,9 @@ define i128 @f4(i128 %x, i128 %y) {
   ret i128 %r
 }
 
-; FIXME: This is wrong; should be "#__aarch64_cas1_relax"
 define i8 @f5(i8 %expected, i8 %new, ptr %ptr) "target-features"="+outline-atomics" {
 ; CHECK-LABEL: "#f5":
-; CHECK: bl __aarch64_cas1_relax
+; CHECK: bl "#__aarch64_cas1_relax"
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1
    %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r

From 6e8cf9c63f643768a1d54a9ce2a73a570429c4bc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:55:41 +0900
Subject: [PATCH 0573/1322] AArch64: Add arm64ec libcall tests for __arm_sc_*
 functions (#144356)

---
 llvm/test/CodeGen/AArch64/arm64ec-builtins.ll | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
index cc4ec9c2eebd..38416310b353 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
@@ -42,3 +42,33 @@ define float @f6(float %val, i32 %a) {
   %call = tail call fast float @llvm.ldexp.f32(float %val, i32 %a)
   ret float %call
 }
+
+@dst = global [512 x i8] zeroinitializer, align 1
+@src = global [512 x i8] zeroinitializer, align 1
+
+; FIXME: Wrong and probably needs a # prefix
+define void @call__arm_sc_memcpy(i64 noundef %n) #0 {
+; CHECK-LABEL: "#call__arm_sc_memcpy":
+; CHECK: bl __arm_sc_memcpy
+
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+; FIXME: Wrong and probably needs a # prefix
+define void @call__arm_sc_memmove(i64 noundef %n) #0 {
+; CHECK-LABEL: "#call__arm_sc_memmove":
+; CHECK: bl __arm_sc_memmove
+  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+; FIXME: Wrong and probably needs a # prefix
+define void @call__arm_sc_memset(i64 noundef %n) #0 {
+; CHECK-LABEL: "#call__arm_sc_memset":
+; CHECK: bl __arm_sc_memset
+  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
+  ret void
+}
+
+attributes #0 = { nounwind "aarch64_pstate_sm_enabled" "target-features"="+sme2" }

From d4e2c0b359ea90236fd1b62791a04fb845f5d9f3 Mon Sep 17 00:00:00 2001
From: Bryan Chan <bryan.chan@huawei.com>
Date: Mon, 16 Jun 2025 20:59:18 -0400
Subject: [PATCH 0574/1322] [Driver] Add options to control workaround for
 Cortex-A53 Erratum 843419 (#143915)

Implement the -mfix-cortex-a53-843419 and -mno-fix-cortex-a53-843419 options,
which have been introduced to GCC to allow the user to control the workaround
for the erratum. If the option is enabled (which is the default, unchanged by
this patch), Clang passes --fix-cortex-a53-843419 to the linker when it cannot
ensure that the target is not a Cortex A53, otherwise it doesn't.

See https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html#index-mfix-cortex-a53-843419
for information on the GCC options.
---
 clang/include/clang/Driver/Options.td   | 10 ++++++++--
 clang/lib/Driver/ToolChains/Fuchsia.cpp |  4 +++-
 clang/lib/Driver/ToolChains/Gnu.cpp     |  4 +++-
 clang/test/Driver/android-link.cpp      | 12 ++++++++++++
 4 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 72d564e1ba0b..8b7708e530b1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5150,10 +5150,16 @@ def mno_fix_cortex_a72_aes_1655431 : Flag<["-"], "mno-fix-cortex-a72-aes-1655431
   Alias<mno_fix_cortex_a57_aes_1742098>;
 def mfix_cortex_a53_835769 : Flag<["-"], "mfix-cortex-a53-835769">,
   Group<m_aarch64_Features_Group>,
-  HelpText<"Workaround Cortex-A53 erratum 835769 (AArch64 only)">;
+  HelpText<"Work around Cortex-A53 erratum 835769 (AArch64 only)">;
 def mno_fix_cortex_a53_835769 : Flag<["-"], "mno-fix-cortex-a53-835769">,
   Group<m_aarch64_Features_Group>,
-  HelpText<"Don't workaround Cortex-A53 erratum 835769 (AArch64 only)">;
+  HelpText<"Don't work around Cortex-A53 erratum 835769 (AArch64 only)">;
+def mfix_cortex_a53_843419 : Flag<["-"], "mfix-cortex-a53-843419">,
+  Group<m_aarch64_Features_Group>,
+  HelpText<"Work around Cortex-A53 erratum 843419 (AArch64 only)">;
+def mno_fix_cortex_a53_843419 : Flag<["-"], "mno-fix-cortex-a53-843419">,
+  Group<m_aarch64_Features_Group>,
+  HelpText<"Don't work around Cortex-A53 erratum 843419 (AArch64 only)">;
 def mmark_bti_property : Flag<["-"], "mmark-bti-property">,
   Group<m_aarch64_Features_Group>,
   HelpText<"Add .note.gnu.property with BTI to assembly files (AArch64 only)">;
diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp
index 1c165bbfe84f..146dc8bbd531 100644
--- a/clang/lib/Driver/ToolChains/Fuchsia.cpp
+++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp
@@ -91,7 +91,9 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("--execute-only");
 
     std::string CPU = getCPUName(D, Args, Triple);
-    if (CPU.empty() || CPU == "generic" || CPU == "cortex-a53")
+    if (Args.hasFlag(options::OPT_mfix_cortex_a53_843419,
+                     options::OPT_mno_fix_cortex_a53_843419, true) &&
+        (CPU.empty() || CPU == "generic" || CPU == "cortex-a53"))
       CmdArgs.push_back("--fix-cortex-a53-843419");
   }
 
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 9c68c5c6de2b..9203bbc91b0b 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -402,7 +402,9 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 
   // Most Android ARM64 targets should enable the linker fix for erratum
   // 843419. Only non-Cortex-A53 devices are allowed to skip this flag.
-  if (Arch == llvm::Triple::aarch64 && (isAndroid || isOHOSFamily)) {
+  if (Arch == llvm::Triple::aarch64 && (isAndroid || isOHOSFamily) &&
+      Args.hasFlag(options::OPT_mfix_cortex_a53_843419,
+                   options::OPT_mno_fix_cortex_a53_843419, true)) {
     std::string CPU = getCPUName(D, Args, Triple);
     if (CPU.empty() || CPU == "generic" || CPU == "cortex-a53")
       CmdArgs.push_back("--fix-cortex-a53-843419");
diff --git a/clang/test/Driver/android-link.cpp b/clang/test/Driver/android-link.cpp
index ab7dae540558..b103263cdd3f 100644
--- a/clang/test/Driver/android-link.cpp
+++ b/clang/test/Driver/android-link.cpp
@@ -16,6 +16,16 @@
 // RUN: FileCheck -check-prefix=CORTEX-A57 < %t %s
 
 // RUN: %clang --target=aarch64-none-linux-android \
+// RUN:   -mno-fix-cortex-a53-843419 \
+// RUN:   -### -v %s 2> %t
+// RUN: FileCheck -check-prefix=OVERRIDDEN < %t %s
+//
+// RUN: %clang -target aarch64-none-linux-android \
+// RUN:   -mno-fix-cortex-a53-843419 -mfix-cortex-a53-843419 \
+// RUN:   -### -v %s 2> %t
+// RUN: FileCheck -check-prefix=OVERRIDDEN2 < %t %s
+//
+// RUN: %clang -target aarch64-none-linux-android \
 // RUN:   -### -v %s 2> %t
 // RUN: FileCheck -check-prefix=MAX-PAGE-SIZE-16KB < %t %s
 
@@ -31,6 +41,8 @@
 // GENERIC-ARM: --fix-cortex-a53-843419
 // CORTEX-A53: --fix-cortex-a53-843419
 // CORTEX-A57-NOT: --fix-cortex-a53-843419
+// OVERRIDDEN-NOT: --fix-cortex-a53-843419
+// OVERRIDDEN2: --fix-cortex-a53-843419
 // MAX-PAGE-SIZE-4KB: "-z" "max-page-size=4096"
 // MAX-PAGE-SIZE-16KB: "-z" "max-page-size=16384"
 // NO-MAX-PAGE-SIZE-16KB-NOT: "-z" "max-page-size=16384"

From 8b1528fad99a18d2e094968f1341efb3048a23da Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 10:00:41 +0900
Subject: [PATCH 0575/1322] RuntimeLibcalls: Use array initializers for default
 values (#143082)

---
 llvm/include/llvm/IR/RuntimeLibcalls.h | 7 +++++--
 llvm/lib/IR/RuntimeLibcalls.cpp        | 7 +------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index d8f467e30fa6..3e1531ebfd9d 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -103,10 +103,13 @@ struct RuntimeLibcallsInfo {
 
 private:
   /// Stores the name each libcall.
-  const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
+  const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1] = {nullptr};
+
+  static_assert(static_cast<int>(CallingConv::C) == 0,
+                "default calling conv should be encoded as 0");
 
   /// Stores the CallingConv that should be used for each libcall.
-  CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
+  CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL] = {};
 
   /// The condition type that should be used to test the result of each of the
   /// soft floating-point comparison libcall against integer zero.
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 5d22b41e28aa..7396626a03d4 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -284,8 +284,7 @@ void RuntimeLibcallsInfo::initSoftFloatCmpLibcallPredicates() {
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
-  std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames),
-            nullptr);
+  initSoftFloatCmpLibcallPredicates();
 
   initSoftFloatCmpLibcallPredicates();
 
@@ -295,10 +294,6 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
 #undef HANDLE_LIBCALL
 #undef LIBCALL_NO_NAME
 
-  // Initialize calling conventions to their default.
-  for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
-    setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C);
-
   // Use the f128 variants of math functions on x86
   if (TT.isX86() && TT.isGNUEnvironment()) {
     setLibcallName(RTLIB::REM_F128, "fmodf128");

From f626620e33ba2c76ba226ecaeb09c320b60aa4d9 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Mon, 16 Jun 2025 18:19:17 -0700
Subject: [PATCH 0576/1322] [LLVM][TableGen] Use `StringRef` for
 CodeGenInstruction::AsmString (#144440)

---
 llvm/utils/TableGen/Common/CodeGenInstruction.cpp | 2 +-
 llvm/utils/TableGen/Common/CodeGenInstruction.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
index 0dfcf200d7e4..2ec3683e116e 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
@@ -435,7 +435,7 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) {
 CodeGenInstruction::CodeGenInstruction(const Record *R)
     : TheDef(R), Operands(R), InferredFrom(nullptr) {
   Namespace = R->getValueAsString("Namespace");
-  AsmString = R->getValueAsString("AsmString").str();
+  AsmString = R->getValueAsString("AsmString");
 
   isPreISelOpcode = R->getValueAsBit("isPreISelOpcode");
   isReturn = R->getValueAsBit("isReturn");
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h
index 3a5abc55319b..0db12b551b43 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h
@@ -226,7 +226,7 @@ public:
 
   /// AsmString - The format string used to emit a .s file for the
   /// instruction.
-  std::string AsmString;
+  StringRef AsmString;
 
   /// Operands - This is information about the (ins) and (outs) list specified
   /// to the instruction.

From 2e3d212e40bc6fca9fbe53978a87c901eb19a01d Mon Sep 17 00:00:00 2001
From: Jinyang He <hejinyang@loongson.cn>
Date: Tue, 17 Jun 2025 10:41:08 +0800
Subject: [PATCH 0577/1322] [LoongArch] Allow difference across sections
 (#141722)

For SecA != SecB but SecB is current section, fallback for pcrel{64,32}
relocations. For linker relaxation being disabled and SecA == SecB,
return directly for avoid record relocations. In other cases, record
relocations which also allows across sections.
---
 .../MCTargetDesc/LoongArchAsmBackend.cpp      | 45 +++++++--
 .../MCTargetDesc/LoongArchAsmBackend.h        |  4 +
 llvm/test/MC/LoongArch/Misc/cfi-advance.s     | 12 +++
 .../test/MC/LoongArch/Relocations/fde-reloc.s |  7 +-
 llvm/test/MC/LoongArch/Relocations/sub-expr.s | 91 ++++++++++++++++---
 5 files changed, 136 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index d7569ab0ea59..b1491b75ac5b 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -429,6 +429,26 @@ bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
   return true;
 }
 
+bool LoongArchAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA,
+                                               const MCFragment &F) {
+  // If the section does not contain linker-relaxable fragments, PC-relative
+  // fixups can be resolved.
+  if (!F.getParent()->isLinkerRelaxable())
+    return true;
+
+  // Otherwise, check if the offset between the symbol and fragment is fully
+  // resolved, unaffected by linker-relaxable fragments (e.g. instructions or
+  // offset-affected MCAlignFragment). Complements the generic
+  // isSymbolRefDifferenceFullyResolvedImpl.
+  if (!PCRelTemp)
+    PCRelTemp = getContext().createTempSymbol();
+  PCRelTemp->setFragment(const_cast<MCFragment *>(&F));
+  MCValue Res;
+  MCExpr::evaluateSymbolicAdd(Asm, false, MCValue::get(SymA),
+                              MCValue::get(nullptr, PCRelTemp), Res);
+  return !Res.getSubSym();
+}
+
 bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
                                    const MCValue &Target, uint64_t &FixedValue,
                                    bool IsResolved) {
@@ -447,19 +467,24 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
     if (!force) {
       const MCSection &SecA = SA.getSection();
       const MCSection &SecB = SB.getSection();
+      const MCSection &SecCur = *F.getParent();
 
-      // We need record relocation if SecA != SecB. Usually SecB is same as the
-      // section of Fixup, which will be record the relocation as PCRel. If SecB
-      // is not same as the section of Fixup, it will report error. Just return
-      // false and then this work can be finished by handleFixup.
-      if (&SecA != &SecB)
+      // To handle the case of A - B which B is same section with the current,
+      // generate PCRel relocations is better than ADD/SUB relocation pair.
+      // We can resolve it as A - PC + PC - B. The A - PC will be resolved
+      // as a PCRel relocation, while PC - B will serve as the addend.
+      // If the linker relaxation is disabled, it can be done directly since
+      // PC - B is constant. Otherwise, we should evaluate whether PC - B
+      // is constant. If it can be resolved as PCRel, use Fallback which
+      // generates R_LARCH_{32,64}_PCREL relocation later.
+      if (&SecA != &SecB && &SecB == &SecCur &&
+          isPCRelFixupResolved(Target.getSubSym(), F))
         return Fallback();
 
-      // In SecA == SecB case. If the linker relaxation is enabled, we need
-      // record the ADD, SUB relocations. Otherwise the FixedValue has already
-      // been calc- ulated out in evaluateFixup, return true and avoid record
-      // relocations.
-      if (!STI.hasFeature(LoongArch::FeatureRelax))
+      // In SecA == SecB case. If the linker relaxation is disabled, the
+      // FixedValue has already been calculated out in evaluateFixup,
+      // return true and avoid record relocations.
+      if (&SecA == &SecB && !STI.hasFeature(LoongArch::FeatureRelax))
         return true;
     }
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
index aeedafe2b44b..56554c5c664e 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -30,6 +30,10 @@ class LoongArchAsmBackend : public MCAsmBackend {
   bool Is64Bit;
   const MCTargetOptions &TargetOptions;
   DenseMap<MCSection *, const MCSymbolRefExpr *> SecToAlignSym;
+  // Temporary symbol used to check whether a PC-relative fixup is resolved.
+  MCSymbol *PCRelTemp = nullptr;
+
+  bool isPCRelFixupResolved(const MCSymbol *SymA, const MCFragment &F);
 
 public:
   LoongArchAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit,
diff --git a/llvm/test/MC/LoongArch/Misc/cfi-advance.s b/llvm/test/MC/LoongArch/Misc/cfi-advance.s
index 662c43e6bcea..38eba7caf610 100644
--- a/llvm/test/MC/LoongArch/Misc/cfi-advance.s
+++ b/llvm/test/MC/LoongArch/Misc/cfi-advance.s
@@ -1,6 +1,8 @@
 # RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=-relax %s -o %t.o
 # RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=RELOC %s
 # RUN: llvm-dwarfdump --debug-frame %t.o | FileCheck --check-prefix=DWARFDUMP %s
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax %s \
+# RUN:     | llvm-readobj -r - | FileCheck --check-prefix=RELAX %s
 
 # RELOC:       Relocations [
 # RELOC-NEXT:    .rela.eh_frame {
@@ -12,6 +14,16 @@
 # DWARFDUMP-NEXT:  DW_CFA_advance_loc: 8
 # DWARFDUMP-NEXT:  DW_CFA_def_cfa_offset: +8
 
+# RELAX:       Relocations [
+# RELAX:         .rela.eh_frame {
+# RELAX-NEXT:       0x1C R_LARCH_32_PCREL .L{{.*}} 0x0
+# RELAX-NEXT:       0x20 R_LARCH_ADD32 .L{{.*}} 0x0
+# RELAX-NEXT:       0x20 R_LARCH_SUB32 .L{{.*}} 0x0
+# RELAX-NEXT:       0x28 R_LARCH_ADD6 .L{{.*}} 0x0
+# RELAX-NEXT:       0x28 R_LARCH_SUB6 .L{{.*}} 0x0
+# RELAX-NEXT:    }
+# RELAX-NEXT:  ]
+
         .text
         .globl test
         .p2align 2
diff --git a/llvm/test/MC/LoongArch/Relocations/fde-reloc.s b/llvm/test/MC/LoongArch/Relocations/fde-reloc.s
index 990e07c7f00b..ab911d1853a8 100644
--- a/llvm/test/MC/LoongArch/Relocations/fde-reloc.s
+++ b/llvm/test/MC/LoongArch/Relocations/fde-reloc.s
@@ -1,5 +1,7 @@
-# RUN: llvm-mc --filetype=obj --triple=loongarch64 < %s \
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax < %s \
 # RUN:     | llvm-readobj -r - | FileCheck %s
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax < %s \
+# RUN:     | llvm-readobj -r - | FileCheck %s --check-prefix=RELAX
 
 ## Ensure that the eh_frame records the symbolic difference with
 ## the R_LARCH_32_PCREL relocation.
@@ -12,3 +14,6 @@ func:
 # CHECK:   Section (4) .rela.eh_frame {
 # CHECK-NEXT:   0x1C R_LARCH_32_PCREL .text 0x0
 # CHECK-NEXT: }
+# RELAX:   Section ({{.*}}) .rela.eh_frame {
+# RELAX-NEXT:   0x1C R_LARCH_32_PCREL .L{{.*}} 0x0
+# RELAX-NEXT: }
diff --git a/llvm/test/MC/LoongArch/Relocations/sub-expr.s b/llvm/test/MC/LoongArch/Relocations/sub-expr.s
index 0179e1027af8..8bf046acc697 100644
--- a/llvm/test/MC/LoongArch/Relocations/sub-expr.s
+++ b/llvm/test/MC/LoongArch/Relocations/sub-expr.s
@@ -1,28 +1,95 @@
-# RUN: llvm-mc --filetype=obj --triple=loongarch64 %s -o %t
-# RUN: llvm-readobj -r %t | FileCheck %s
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s \
+# RUN:     | llvm-readobj -r - | FileCheck %s
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s \
+# RUN:     | llvm-readobj -r - | FileCheck %s --check-prefix=RELAX
 
 ## Check that subtraction expressions emit R_LARCH_32_PCREL and R_LARCH_64_PCREL relocations.
 
 ## TODO: 1- or 2-byte data relocations are not supported for now.
 
 # CHECK:      Relocations [
-# CHECK-NEXT:   Section ({{.*}}) .rela.data {
-# CHECK-NEXT:     0x0 R_LARCH_64_PCREL sx 0x0
-# CHECK-NEXT:     0x8 R_LARCH_64_PCREL sy 0x0
-# CHECK-NEXT:     0x10 R_LARCH_32_PCREL sx 0x0
-# CHECK-NEXT:     0x14 R_LARCH_32_PCREL sy 0x0
-# CHECK-NEXT:   }
+# CHECK-NEXT:     Section ({{.*}}) .rela.sx {
+# CHECK-NEXT:       0x4 R_LARCH_PCALA_HI20 z 0x0
+# CHECK-NEXT:       0x8 R_LARCH_PCALA_LO12 z 0x0
+# CHECK-NEXT:       0xC R_LARCH_32_PCREL .sy 0xC
+# CHECK-NEXT:     }
+# CHECK-NEXT:     Section ({{.*}}) .rela.data {
+# CHECK-NEXT:       0x0 R_LARCH_64_PCREL .sx 0x4
+# CHECK-NEXT:       0x8 R_LARCH_64_PCREL .sy 0x4
+# CHECK-NEXT:       0x10 R_LARCH_32_PCREL .sx 0x4
+# CHECK-NEXT:       0x14 R_LARCH_32_PCREL .sy 0x4
+# CHECK-NEXT:       0x18 R_LARCH_ADD64 .sx 0x4
+# CHECK-NEXT:       0x18 R_LARCH_SUB64 .sy 0x4
+# CHECK-NEXT:       0x20 R_LARCH_ADD64 .sy 0x4
+# CHECK-NEXT:       0x20 R_LARCH_SUB64 .sx 0x4
+# CHECK-NEXT:       0x28 R_LARCH_ADD32 .sx 0x4
+# CHECK-NEXT:       0x28 R_LARCH_SUB32 .sy 0x4
+# CHECK-NEXT:       0x2C R_LARCH_ADD32 .sy 0x4
+# CHECK-NEXT:       0x2C R_LARCH_SUB32 .sx 0x4
+# CHECK-NEXT:       0x30 R_LARCH_ADD64 .data 0x30
+# CHECK-NEXT:       0x30 R_LARCH_SUB64 .sx 0x4
+# CHECK-NEXT:       0x38 R_LARCH_ADD32 .data 0x38
+# CHECK-NEXT:       0x38 R_LARCH_SUB32 .sy 0x4
+# CHECK-NEXT:     }
+# CHECK-NEXT:     Section ({{.*}}) .rela.sy {
+# CHECK-NEXT:       0x10 R_LARCH_32_PCREL .sx 0x10
+# CHECK-NEXT:     }
+# CHECK-NEXT:   ]
 
-.section sx,"a"
-x:
+# RELAX:      Relocations [
+# RELAX-NEXT:   Section ({{.*}}) .rela.sx {
+# RELAX-NEXT:     0x4 R_LARCH_PCALA_HI20 z 0x0
+# RELAX-NEXT:     0x4 R_LARCH_RELAX - 0x0
+# RELAX-NEXT:     0x8 R_LARCH_PCALA_LO12 z 0x0
+# RELAX-NEXT:     0x8 R_LARCH_RELAX - 0x0
+# RELAX-NEXT:     0xC R_LARCH_ADD32 y 0x0
+# RELAX-NEXT:     0xC R_LARCH_SUB32 x 0x0
+# RELAX-NEXT:   }
+# RELAX-NEXT:   Section ({{.*}}) .rela.data {
+# RELAX-NEXT:     0x0 R_LARCH_64_PCREL x 0x0
+# RELAX-NEXT:     0x8 R_LARCH_64_PCREL y 0x0
+# RELAX-NEXT:     0x10 R_LARCH_32_PCREL x 0x0
+# RELAX-NEXT:     0x14 R_LARCH_32_PCREL y 0x0
+# RELAX-NEXT:     0x18 R_LARCH_ADD64 x 0x0
+# RELAX-NEXT:     0x18 R_LARCH_SUB64 y 0x0
+# RELAX-NEXT:     0x20 R_LARCH_ADD64 y 0x0
+# RELAX-NEXT:     0x20 R_LARCH_SUB64 x 0x0
+# RELAX-NEXT:     0x28 R_LARCH_ADD32 x 0x0
+# RELAX-NEXT:     0x28 R_LARCH_SUB32 y 0x0
+# RELAX-NEXT:     0x2C R_LARCH_ADD32 y 0x0
+# RELAX-NEXT:     0x2C R_LARCH_SUB32 x 0x0
+# RELAX-NEXT:     0x30 R_LARCH_ADD64 {{.*}} 0x0
+# RELAX-NEXT:     0x30 R_LARCH_SUB64 x 0x0
+# RELAX-NEXT:     0x38 R_LARCH_ADD32 {{.*}} 0x0
+# RELAX-NEXT:     0x38 R_LARCH_SUB32 y 0x0
+# RELAX-NEXT:   }
+# RELAX-NEXT:   Section ({{.*}}) .rela.sy {
+# RELAX-NEXT:     0x4 R_LARCH_ALIGN - 0xC
+# RELAX-NEXT:     0x10 R_LARCH_ADD32 x 0x0
+# RELAX-NEXT:     0x10 R_LARCH_SUB32 y 0x0
+# RELAX-NEXT:   }
+# RELAX-NEXT: ]
+
+.section .sx,"ax"
 nop
+x:
+la.pcrel $a0, z
+.4byte y-x
 
 .data
 .8byte x-.
 .8byte y-.
 .4byte x-.
 .4byte y-.
+.8byte x-y
+.8byte y-x
+.4byte x-y
+.4byte y-x
+.8byte .-x
+.4byte .-y
 
-.section sy,"a"
-y:
+.section .sy,"ax"
 nop
+y:
+.p2align 4
+.4byte x-y

From ab7aaaca93a0670e96a454136bb9cf13bb1ae372 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Mon, 16 Jun 2025 19:50:43 -0700
Subject: [PATCH 0578/1322] [flang][tests] Remove stale module files to fix
 buildbots.

---
 flang/test/Semantics/modfile75.F90 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/flang/test/Semantics/modfile75.F90 b/flang/test/Semantics/modfile75.F90
index 8f7adafe7204..a61c59bbb31b 100644
--- a/flang/test/Semantics/modfile75.F90
+++ b/flang/test/Semantics/modfile75.F90
@@ -1,4 +1,6 @@
 !RUN: rm -rf %t && mkdir -p %t
+! The next line is a temporary clean-up for the buildbots to pass.
+!RUN: rm -f modfile75a.mod modfile75b.mod
 !RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang_fc1 -fdebug-unparse -J%t %s | FileCheck %s
 
 #if WHICH == 1

From 9093bc7eff33b002d7f16d4d62ff1af2a5a993f8 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Tue, 17 Jun 2025 10:57:36 +0800
Subject: [PATCH 0579/1322] [llvm-exegesis] Ignore the instructions for which
 InstrDesc.getSchedClass() == 0 (#143840)

This allows llvm-exegesis to skip instructions that lack scheduling
information, avoiding invalid benchmarking. e.g. `InstB` in RISC-V.
---
 llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test | 3 +++
 llvm/tools/llvm-exegesis/lib/Target.cpp                     | 2 ++
 2 files changed, 5 insertions(+)
 create mode 100644 llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test

diff --git a/llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test b/llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test
new file mode 100644
index 000000000000..fcf3b8f5463d
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test
@@ -0,0 +1,3 @@
+# RUN: llvm-exegesis -mtriple=riscv64-unknown-linux-gnu -mcpu=generic --benchmark-phase=assemble-measured-code -mode=inverse_throughput -opcode-name=InsnB 2>&1 | FileCheck %s
+
+CHECK: Unsupported opcode: No Sched Class
diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp
index 68d19514bedb..fc5f82f288ae 100644
--- a/llvm/tools/llvm-exegesis/lib/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Target.cpp
@@ -45,6 +45,8 @@ ExegesisTarget::getIgnoredOpcodeReasonOrNull(const LLVMState &State,
     return "Unsupported opcode: isBranch/isIndirectBranch";
   if (InstrDesc.isCall() || InstrDesc.isReturn())
     return "Unsupported opcode: isCall/isReturn";
+  if (InstrDesc.getSchedClass() == 0)
+    return "Unsupported opcode: No Sched Class";
   return nullptr;
 }
 

From 602c3089f749ec3b61b93652ea9eb5947a61bcf2 Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp@bytedance.com>
Date: Tue, 17 Jun 2025 11:17:37 +0800
Subject: [PATCH 0580/1322] [TargetParser] Increase MAX_SUBTARGET_FEATURES to
 384 (#144326)

There are 314 features in RISC-V backend, which is about to exceed
the maxinum 320 as there are some ongoing new extensions.

We increase the `MAX_SUBTARGET_FEATURES` to 384 so that we won't
surprise anyone.
---
 llvm/include/llvm/TargetParser/SubtargetFeature.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/TargetParser/SubtargetFeature.h b/llvm/include/llvm/TargetParser/SubtargetFeature.h
index 6f1723dec5d0..cdcfcdd0e802 100644
--- a/llvm/include/llvm/TargetParser/SubtargetFeature.h
+++ b/llvm/include/llvm/TargetParser/SubtargetFeature.h
@@ -32,7 +32,7 @@ namespace llvm {
 class raw_ostream;
 class Triple;
 
-const unsigned MAX_SUBTARGET_WORDS = 5;
+const unsigned MAX_SUBTARGET_WORDS = 6;
 const unsigned MAX_SUBTARGET_FEATURES = MAX_SUBTARGET_WORDS * 64;
 
 /// Container class for subtarget features.

From a02afb0def589ec28f8240ff15760e5f241b833c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 20:40:17 -0700
Subject: [PATCH 0581/1322] AVR: Migrate to the new relocation specifier
 representation

Define printImpl and evaluateAsRelocationImpl within AVRMCAsmInfo.
---
 .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp  | 27 +++++++++++++++----
 .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.h    |  4 +++
 .../lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 11 --------
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h  |  5 +---
 4 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index 68db5227d073..cfd7dc582262 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -116,6 +116,19 @@ AVR::Fixups AVRMCExpr::getFixupKind() const {
   return Kind;
 }
 
+void AVRMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                      const MCSpecifierExpr &Expr) const {
+  auto &E = static_cast<const AVRMCExpr &>(Expr);
+  assert(E.getSpecifier() != AVR::S_AVR_NONE);
+  OS << E.getName() << '(';
+  if (E.isNegated())
+    OS << '-' << '(';
+  printExpr(OS, *E.getSubExpr());
+  if (E.isNegated())
+    OS << ')';
+  OS << ')';
+}
+
 int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
   if (Negated)
     Value *= -1;
@@ -164,15 +177,19 @@ int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
   return static_cast<uint64_t>(Value) & 0xff;
 }
 
-bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
-                                          const MCAssembler *Asm) const {
+// bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
+//                                           const MCAssembler *Asm) const {
+bool AVRMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                             MCValue &Result,
+                                             const MCAssembler *Asm) const {
+  auto &E = static_cast<const AVRMCExpr &>(Expr);
   MCValue Value;
-  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, Asm);
+  bool isRelocatable = E.getSubExpr()->evaluateAsRelocatable(Value, Asm);
   if (!isRelocatable)
     return false;
 
   if (Value.isAbsolute()) {
-    Result = MCValue::get(evaluateAsInt64(Value.getConstant()));
+    Result = MCValue::get(E.evaluateAsInt64(Value.getConstant()));
   } else {
     if (!Asm || !Asm->hasLayout())
       return false;
@@ -181,7 +198,7 @@ bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
     if (Value.getSpecifier() != MCSymbolRefExpr::VK_None)
       return false;
     assert(!Value.getSubSym());
-    if (specifier == AVR::S_PM)
+    if (E.getSpecifier() == AVR::S_PM)
       Spec = AVR::S_PM;
 
     // TODO: don't attach specifier to MCSymbolRefExpr.
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
index 649e247adab0..fab271304e27 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -25,6 +25,10 @@ class Triple;
 class AVRMCAsmInfo : public MCAsmInfo {
 public:
   explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 namespace AVR {
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 3067e854d8dc..5963976d0dc7 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -21,15 +21,4 @@ const AVRMCExpr *AVRMCExpr::create(Specifier Kind, const MCExpr *Expr,
   return new (Ctx) AVRMCExpr(Kind, Expr, Negated);
 }
 
-void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  assert(specifier != AVR::S_AVR_NONE);
-  OS << getName() << '(';
-  if (isNegated())
-    OS << '-' << '(';
-  MAI->printExpr(OS, *getSubExpr());
-  if (isNegated())
-    OS << ')';
-  OS << ')';
-}
-
 } // namespace llvm
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
index d72d36f10858..5592e24be537 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
@@ -18,6 +18,7 @@ namespace llvm {
 /// A expression in AVR machine code.
 class AVRMCExpr : public MCSpecifierExpr {
 public:
+  friend class AVRMCAsmInfo;
   using Specifier = Spec;
   /// Specifies the type of an expression.
 
@@ -36,10 +37,6 @@ public:
   bool isNegated() const { return Negated; }
   void setNegated(bool negated = true) { Negated = negated; }
 
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-
 public:
   static Specifier parseSpecifier(StringRef Name);
 

From 199428e0472c80d9b742d0a3e492ab902005fb6a Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Mon, 16 Jun 2025 20:41:40 -0700
Subject: [PATCH 0582/1322] [bazel][lld] Remove unneeded dependencies.
 (#144455)

As far as I can tell these are not used in any includes in their
respective targets, and building all of LLD with
```
bazel build --config=generic_clang @llvm-project//lld/...
```
still works.
---
 utils/bazel/llvm-project-overlay/lld/BUILD.bazel | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
index 2c9f3e56e311..450157758d75 100644
--- a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
@@ -49,7 +49,6 @@ cc_library(
         "//llvm:CodeGen",
         "//llvm:Core",
         "//llvm:DebugInfoDWARF",
-        "//llvm:IRPrinter",
         "//llvm:Option",
         "//llvm:ProfileData",
         "//llvm:Support",
@@ -153,7 +152,6 @@ cc_library(
         "//llvm:Option",
         "//llvm:Support",
         "//llvm:Symbolize",
-        "//llvm:Target",
         "//llvm:TargetParser",
         "//llvm:TransformUtils",
         "//llvm:WindowsDriver",
@@ -210,15 +208,11 @@ cc_library(
         "//llvm:BitReader",
         "//llvm:BitWriter",
         "//llvm:CGData",
-        "//llvm:Core",
         "//llvm:DebugInfoDWARF",
         "//llvm:Demangle",
         "//llvm:LTO",
-        "//llvm:MC",
-        "//llvm:ObjCARC",
         "//llvm:Object",
         "//llvm:Option",
-        "//llvm:ProfileData",
         "//llvm:Support",
         "//llvm:TargetParser",
         "//llvm:TextAPI",

From 30350afd023c4e9583d5a8bbfd56af7c354923fa Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 20:58:10 -0700
Subject: [PATCH 0583/1322] MCSpecifierExpr: Remove unused virtual functions

... now that all targets using MCSpecifierExpr have migrated to
XXXMCAsmInfo::printExpr/evaluateAsRelocatableImpl.
---
 llvm/include/llvm/MC/MCAsmInfo.h |  5 ++++-
 llvm/include/llvm/MC/MCExpr.h    |  7 -------
 llvm/lib/MC/MCAsmInfo.cpp        | 18 +++++++-----------
 llvm/lib/MC/MCExpr.cpp           |  9 ---------
 4 files changed, 11 insertions(+), 28 deletions(-)

diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index a7bf1b965bf2..93ce3cc44421 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -715,7 +715,10 @@ public:
   std::optional<uint32_t> getSpecifierForName(StringRef Name) const;
 
   void printExpr(raw_ostream &, const MCExpr &) const;
-  virtual void printSpecifierExpr(raw_ostream &, const MCSpecifierExpr &) const;
+  virtual void printSpecifierExpr(raw_ostream &,
+                                  const MCSpecifierExpr &) const {
+    llvm_unreachable("Need to implement hook if target uses MCSpecifierExpr");
+  }
   virtual bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &Res,
                                          const MCAssembler *Asm) const;
 };
diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index cd57fafc50b5..4ec780d8ff94 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -512,7 +512,6 @@ protected:
 
   explicit MCSpecifierExpr(const MCExpr *Expr, Spec S, SMLoc Loc = SMLoc())
       : MCExpr(Specifier, Loc), Expr(Expr), specifier(S) {}
-  virtual ~MCSpecifierExpr() = default;
 
 public:
   LLVM_ABI static const MCSpecifierExpr *
@@ -523,12 +522,6 @@ public:
   Spec getSpecifier() const { return specifier; }
   const MCExpr *getSubExpr() const { return Expr; }
 
-  virtual void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-    llvm_unreachable("Replace MCExpr::print calls with MCAsmInfo::printExpr");
-  }
-  virtual bool evaluateAsRelocatableImpl(MCValue &Res,
-                                         const MCAssembler *Asm) const;
-
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Specifier;
   }
diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp
index e8eaf4619df5..ba672d2fc2ec 100644
--- a/llvm/lib/MC/MCAsmInfo.cpp
+++ b/llvm/lib/MC/MCAsmInfo.cpp
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 
@@ -157,17 +158,12 @@ void MCAsmInfo::printExpr(raw_ostream &OS, const MCExpr &Expr) const {
     Expr.print(OS, this);
 }
 
-void MCAsmInfo::printSpecifierExpr(raw_ostream &OS,
-                                   const MCSpecifierExpr &Expr) const {
-  // TODO: Switch to unreachable after all targets that use MCSpecifierExpr
-  // migrate to MCAsmInfo::printSpecifierExpr.
-  Expr.printImpl(OS, this);
-}
-
-bool MCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+bool MCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &E,
                                           MCValue &Res,
                                           const MCAssembler *Asm) const {
-  // TODO: Remove after all targets that use MCSpecifierExpr migrate to
-  // MCAsmInfo::evaluateAsRelocatableImpl.
-  return Expr.evaluateAsRelocatableImpl(Res, Asm);
+  if (!E.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+
+  Res.setSpecifier(E.getSpecifier());
+  return !Res.getSubSym();
 }
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 89191294f3ed..8919a2627cf6 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -754,12 +754,3 @@ const MCSpecifierExpr *MCSpecifierExpr::create(const MCSymbol *Sym, Spec S,
                                                MCContext &Ctx, SMLoc Loc) {
   return new (Ctx) MCSpecifierExpr(MCSymbolRefExpr::create(Sym, Ctx), S, Loc);
 }
-
-bool MCSpecifierExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                                const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-
-  Res.setSpecifier(specifier);
-  return !Res.getSubSym();
-}

From 7caeec599998bd8aa01d498574e148e4e9c982db Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Tue, 17 Jun 2025 06:08:15 +0200
Subject: [PATCH 0584/1322] [NFC][flang][OpenMP] Unify `genSectionsOp`'s
 prototype to match other `genXXXOp` functions (#144013)

Unifies the prototype of `genSectionsOp` to match other ops generators.
Doing so, we are able to call `genSectionsOp` directtly from
`genOMPDispatch` instead of the special handling needed now to pass the
section blocks. This is useful because now we can handle symbol mapping
scopes easier for nested OpenMP directives. See

https://github.com/llvm/llvm-project/pull/143706#issuecomment-2965344723
and the following discussion for more info.
---
 flang/lib/Lower/OpenMP/OpenMP.cpp | 35 +++++++++++--------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 82673f0948a5..060eba1b906e 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -201,6 +201,8 @@ private:
 /// structures, but it will probably still require some further work to support
 /// reverse offloading.
 static llvm::SmallVector<HostEvalInfo, 0> hostEvalInfo;
+static llvm::SmallVector<const parser::OpenMPSectionsConstruct *, 0>
+    sectionsStack;
 
 /// Bind symbols to their corresponding entry block arguments.
 ///
@@ -2220,8 +2222,12 @@ static mlir::omp::SectionsOp
 genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
               semantics::SemanticsContext &semaCtx,
               lower::pft::Evaluation &eval, mlir::Location loc,
-              const ConstructQueue &queue, ConstructQueue::const_iterator item,
-              const parser::OmpSectionBlocks &sectionBlocks) {
+              const ConstructQueue &queue,
+              ConstructQueue::const_iterator item) {
+  assert(!sectionsStack.empty());
+  const auto &sectionBlocks =
+      std::get<parser::OmpSectionBlocks>(sectionsStack.back()->t);
+  sectionsStack.pop_back();
   mlir::omp::SectionsOperands clauseOps;
   llvm::SmallVector<const semantics::Symbol *> reductionSyms;
   genSectionsClauses(converter, semaCtx, item->clauses, loc, clauseOps,
@@ -3458,10 +3464,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
     // Lowered in the enclosing genSectionsOp.
     break;
   case llvm::omp::Directive::OMPD_sections:
-    // Called directly from genOMP([...], OpenMPSectionsConstruct) because it
-    // has a different prototype.
-    // This code path is still taken when iterating through the construct queue
-    // in genBodyOfOp
+    genSectionsOp(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_simd:
     newOp =
@@ -4137,8 +4140,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
       std::get<parser::OmpClauseList>(beginSectionsDirective.t), semaCtx);
   const auto &endSectionsDirective =
       std::get<parser::OmpEndSectionsDirective>(sectionsConstruct.t);
-  const auto &sectionBlocks =
-      std::get<parser::OmpSectionBlocks>(sectionsConstruct.t);
   clauses.append(makeClauses(
       std::get<parser::OmpClauseList>(endSectionsDirective.t), semaCtx));
   mlir::Location currentLocation = converter.getCurrentLocation();
@@ -4150,22 +4151,10 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
   ConstructQueue queue{
       buildConstructQueue(converter.getFirOpBuilder().getModule(), semaCtx,
                           eval, source, directive, clauses)};
-  ConstructQueue::iterator next = queue.begin();
-  // Generate constructs that come first e.g. Parallel
-  while (next != queue.end() &&
-         next->id != llvm::omp::Directive::OMPD_sections) {
-    genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
-                   next);
-    next = std::next(next);
-  }
 
-  // call genSectionsOp directly (not via genOMPDispatch) so that we can add the
-  // sectionBlocks argument
-  assert(next != queue.end());
-  assert(next->id == llvm::omp::Directive::OMPD_sections);
-  genSectionsOp(converter, symTable, semaCtx, eval, currentLocation, queue,
-                next, sectionBlocks);
-  assert(std::next(next) == queue.end());
+  sectionsStack.push_back(&sectionsConstruct);
+  genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
+                 queue.begin());
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,

From b5dbf8210a57b986b9802304745f4c5c108cf37b Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Tue, 17 Jun 2025 06:08:38 +0200
Subject: [PATCH 0585/1322] [flang] Enable delayed localization by default for
 `do concurrent` (#144074)

Reintroduces changes from
https://github.com/llvm/llvm-project/issues/143897. A fix for the
reported problem in https://github.com/llvm/llvm-project/issues/143897
is hopefully resolved in
https://github.com/llvm/llvm-project/pull/144027.

This PR aims to make it easier and more self-contained to revert the
switch/flag if we discover any problems with enabling it by default.
---
 flang/lib/Lower/Bridge.cpp                            | 6 +-----
 flang/test/Lower/do_concurrent_delayed_locality.f90   | 2 +-
 flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +-
 flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +-
 flang/test/Lower/loops.f90                            | 2 +-
 flang/test/Lower/loops3.f90                           | 2 +-
 6 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 64b16b3abe99..5ff8101dba09 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2033,11 +2033,7 @@ private:
     fir::LocalitySpecifierOperands privateClauseOps;
     auto doConcurrentLoopOp =
         mlir::dyn_cast_if_present<fir::DoConcurrentLoopOp>(info.loopOp);
-    // TODO Promote to using `enableDelayedPrivatization` (which is enabled by
-    // default unlike the staging flag) once the implementation of this is more
-    // complete.
-    bool useDelayedPriv =
-        enableDelayedPrivatizationStaging && doConcurrentLoopOp;
+    bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp;
     llvm::SetVector<const Fortran::semantics::Symbol *> allPrivatizedSymbols;
     llvm::SmallSet<const Fortran::semantics::Symbol *, 16> mightHaveReadHostSym;
 
diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90
index 6cae0eb46db1..039b17808d19 100644
--- a/flang/test/Lower/do_concurrent_delayed_locality.f90
+++ b/flang/test/Lower/do_concurrent_delayed_locality.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
 
 subroutine do_concurrent_with_locality_specs
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
index a3d0c34ed856..67f080eb2c1c 100644
--- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90
+++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
 
 subroutine local_assoc
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index d64321385474..798cbb335c8c 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of DO CONCURRENT LOCAL() entities.
-! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
 
 subroutine test_ptr(p)
   interface
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 60df27a591dc..64f14ff97227 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 84db1972cca1..34d7bcfb7d7a 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -1,5 +1,5 @@
 ! Test do concurrent reduction
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test

From 2dc58e02cbce83784a38b4cc33f83529ad1a7c7e Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Tue, 17 Jun 2025 07:01:53 +0200
Subject: [PATCH 0586/1322] [flang][OpenMP] Add symbol table scopes for `teams`
 and `parallel` (#144015)

Adds symbol map scopes for standalone `teams` and `parallel` constructs.
This is required to properly bind the privatized symbols in both
constructs so that nested constructs can find them.

Resolves https://github.com/llvm/llvm-project/issues/116428.
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  9 ++------
 .../OpenMP/Todo/target-parallel-private.f90   | 13 ------------
 .../OpenMP/Todo/target-teams-private.f90      | 13 ------------
 .../Lower/OpenMP/target-parallel-private.f90  | 21 +++++++++++++++++++
 .../Lower/OpenMP/target-teams-private.f90     | 20 ++++++++++++++++++
 5 files changed, 43 insertions(+), 33 deletions(-)
 delete mode 100644 flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
 delete mode 100644 flang/test/Lower/OpenMP/Todo/target-teams-private.f90
 create mode 100644 flang/test/Lower/OpenMP/target-parallel-private.f90
 create mode 100644 flang/test/Lower/OpenMP/target-teams-private.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 060eba1b906e..3e865a1ee718 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2674,6 +2674,7 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
            semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
            mlir::Location loc, const ConstructQueue &queue,
            ConstructQueue::const_iterator item) {
+  lower::SymMapScope scope(symTable);
   mlir::omp::TeamsOperands clauseOps;
   llvm::SmallVector<const semantics::Symbol *> reductionSyms;
   genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps,
@@ -2981,6 +2982,7 @@ static mlir::omp::ParallelOp genStandaloneParallel(
     lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx,
     lower::pft::Evaluation &eval, mlir::Location loc,
     const ConstructQueue &queue, ConstructQueue::const_iterator item) {
+  lower::SymMapScope scope(symTable);
   mlir::omp::ParallelOperands parallelClauseOps;
   llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
   genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
@@ -4027,13 +4029,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
           parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(clause.id));
       TODO(clauseLocation, name + " clause is not implemented yet");
     }
-
-    if (std::holds_alternative<clause::Private>(clause.u) &&
-        origDirective == llvm::omp::Directive::OMPD_target_teams)
-      TODO(clauseLocation, "TARGET TEAMS PRIVATE is not implemented yet");
-    if (std::holds_alternative<clause::Private>(clause.u) &&
-        origDirective == llvm::omp::Directive::OMPD_target_parallel)
-      TODO(clauseLocation, "TARGET PARALLEL PRIVATE is not implemented yet");
   }
 
   llvm::omp::Directive directive =
diff --git a/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90 b/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
deleted file mode 100644
index e820143021f9..000000000000
--- a/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
+++ /dev/null
@@ -1,13 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-
-!===============================================================================
-! `private` clause on `target parallel`
-!===============================================================================
-
-! CHECK: not yet implemented: TARGET PARALLEL PRIVATE is not implemented yet
-subroutine target_teams_private()
-integer, dimension(3) :: i
-!$omp target parallel private(i)
-!$omp end target parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/Todo/target-teams-private.f90 b/flang/test/Lower/OpenMP/Todo/target-teams-private.f90
deleted file mode 100644
index c8d998a5cbf9..000000000000
--- a/flang/test/Lower/OpenMP/Todo/target-teams-private.f90
+++ /dev/null
@@ -1,13 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-
-!===============================================================================
-! `private` clause on `target teams`
-!===============================================================================
-
-! CHECK: not yet implemented: TARGET TEAMS PRIVATE is not implemented yet
-subroutine target_teams_private()
-integer, dimension(3) :: i
-!$omp target teams private(i)
-!$omp end target teams
-end subroutine
diff --git a/flang/test/Lower/OpenMP/target-parallel-private.f90 b/flang/test/Lower/OpenMP/target-parallel-private.f90
new file mode 100644
index 000000000000..cc04b77e4a52
--- /dev/null
+++ b/flang/test/Lower/OpenMP/target-parallel-private.f90
@@ -0,0 +1,21 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+!===============================================================================
+! `private` clause on `target parallel`
+!===============================================================================
+
+subroutine target_parallel_private()
+integer, dimension(3) :: i
+!$omp target parallel private(i)
+!$omp end target parallel
+end subroutine
+
+! CHECK: omp.private {type = private} @[[PRIVATIZER:.*]] : {{.*}}
+
+! CHECK: omp.target {{.*}} {
+! CHECK:   omp.parallel private(@[[PRIVATIZER]] %{{.*}} -> %{{.*}} : {{.*}}) {
+! CHECK:   }
+! CHECK: }
diff --git a/flang/test/Lower/OpenMP/target-teams-private.f90 b/flang/test/Lower/OpenMP/target-teams-private.f90
new file mode 100644
index 000000000000..65d97649b5cf
--- /dev/null
+++ b/flang/test/Lower/OpenMP/target-teams-private.f90
@@ -0,0 +1,20 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+!===============================================================================
+! `private` clause on `target teams`
+!===============================================================================
+
+subroutine target_teams_private()
+integer, dimension(3) :: i
+!$omp target teams private(i)
+!$omp end target teams
+end subroutine
+
+! CHECK: omp.target {{.*}} {
+! CHECK:   omp.teams {
+! CHECK:     %{{.*}} = fir.alloca !fir.array<3xi32> {bindc_name = "i", {{.*}}}
+! CHECK:   }
+! CHECK: }

From 84d879d6999b61cea3f9f200df57653f5a51ee41 Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Mon, 16 Jun 2025 22:11:16 -0700
Subject: [PATCH 0587/1322] [RISCV] Rename Relocation QC_E_JUMP_PLT to
 QC_E_CALL_PLT (#143998)

The semantics and definition of this relocation are unchanged. The new
name reflects that instructions with the relocation should be assumed to
clobber non-callee-saved registers, as with the R_RISCV_CALL_PLT
relocation.

The name was changed in v0.2 of the ABI extensions:
https://github.com/quic/riscv-elf-psabi-quic-extensions/releases/tag/v0.2
---
 .../llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def       | 2 +-
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp      | 6 +++---
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp | 4 ++--
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h        | 2 +-
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp   | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
index 7ae3d3f20577..b02462ca89fd 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
@@ -25,4 +25,4 @@
 ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_ABS20_U,    192)
 ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_BRANCH,   193)
 ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_32,       194)
-ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_JUMP_PLT, 195)
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_CALL_PLT, 195)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 2f37c351baf9..9161f23c8a95 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -85,7 +85,7 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
       {"fixup_riscv_qc_e_branch", 0, 48, MCFixupKindInfo::FKF_IsPCRel},
       {"fixup_riscv_qc_e_32", 16, 32, 0},
       {"fixup_riscv_qc_abs20_u", 12, 20, 0},
-      {"fixup_riscv_qc_e_jump_plt", 0, 48, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_riscv_qc_e_call_plt", 0, 48, MCFixupKindInfo::FKF_IsPCRel},
   };
   static_assert((std::size(Infos)) == RISCV::NumTargetFixupKinds,
                 "Not all fixup kinds added to Infos array");
@@ -552,7 +552,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     Value = (Bit19 << 31) | (Bit14_0 << 16) | (Bit18_15 << 12);
     return Value;
   }
-  case RISCV::fixup_riscv_qc_e_jump_plt: {
+  case RISCV::fixup_riscv_qc_e_call_plt: {
     if (!isInt<32>(Value))
       Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     if (Value & 0x1)
@@ -699,7 +699,7 @@ void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F,
   case RISCV::fixup_riscv_qc_e_branch:
   case RISCV::fixup_riscv_qc_abs20_u:
   case RISCV::fixup_riscv_qc_e_32:
-  case RISCV::fixup_riscv_qc_e_jump_plt:
+  case RISCV::fixup_riscv_qc_e_call_plt:
     VendorIdentifier = "QUALCOMM";
     break;
   }
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index 1d81096d6b60..3c1f9450a099 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -101,8 +101,8 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_RISCV_CALL_PLT;
     case RISCV::fixup_riscv_qc_e_branch:
       return ELF::R_RISCV_QC_E_BRANCH;
-    case RISCV::fixup_riscv_qc_e_jump_plt:
-      return ELF::R_RISCV_QC_E_JUMP_PLT;
+    case RISCV::fixup_riscv_qc_e_call_plt:
+      return ELF::R_RISCV_QC_E_CALL_PLT;
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index 80fbed8d10f9..8d869a64cde4 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -54,7 +54,7 @@ enum Fixups {
   // 20-bit fixup for symbol references in the 32-bit qc.li instruction
   fixup_riscv_qc_abs20_u,
   // 32-bit fixup for symbol references in the 48-bit qc.j/qc.jal instructions
-  fixup_riscv_qc_e_jump_plt,
+  fixup_riscv_qc_e_call_plt,
 
   // Used as a sentinel, must be the last
   fixup_riscv_invalid,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 1185e3558b00..2a90552037f9 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -645,7 +645,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       FixupKind = RISCV::fixup_riscv_qc_e_32;
       RelaxCandidate = true;
     } else if (MIFrm == RISCVII::InstFormatQC_EJ) {
-      FixupKind = RISCV::fixup_riscv_qc_e_jump_plt;
+      FixupKind = RISCV::fixup_riscv_qc_e_call_plt;
       RelaxCandidate = true;
     }
   }

From c0ac95181eededc85027d63fe9f97bc742b7a552 Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Mon, 16 Jun 2025 22:12:12 -0700
Subject: [PATCH 0588/1322] [RISCV] Update Xqci to v0.13.0 (#144398)

---
 clang/include/clang/Basic/AttrDocs.td         |  2 +-
 .../Driver/print-supported-extensions-riscv.c |  6 ++--
 llvm/docs/RISCVUsage.rst                      | 34 +++++++++----------
 llvm/lib/Target/RISCV/RISCVFeatures.td        |  6 ++--
 llvm/test/CodeGen/RISCV/attributes.ll         |  6 ++--
 .../TargetParser/RISCVISAInfoTest.cpp         | 10 +++---
 6 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 047f51ffa59e..6051e1fc4511 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2934,7 +2934,7 @@ https://gcc.gnu.org/onlinedocs/gcc/RISC-V-Function-Attributes.html
 https://riscv.org/specifications/privileged-isa/
 The RISC-V Instruction Set Manual Volume II: Privileged Architecture
 Version 1.10.
-https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.7
+https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0
 https://sifive.cdn.prismic.io/sifive/d1984d2b-c9b9-4c91-8de0-d68a5e64fa0f_sifive-interrupt-cookbook-v1p2.pdf
   }];
 }
diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index 95464f06378e..33d8738d5a9b 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -221,14 +221,14 @@
 // CHECK-NEXT:     xqcicli              0.3       'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
 // CHECK-NEXT:     xqcicm               0.2       'Xqcicm' (Qualcomm uC Conditional Move Extension)
 // CHECK-NEXT:     xqcics               0.2       'Xqcics' (Qualcomm uC Conditional Select Extension)
-// CHECK-NEXT:     xqcicsr              0.3       'Xqcicsr' (Qualcomm uC CSR Extension)
-// CHECK-NEXT:     xqciint              0.7       'Xqciint' (Qualcomm uC Interrupts Extension)
+// CHECK-NEXT:     xqcicsr              0.4       'Xqcicsr' (Qualcomm uC CSR Extension)
+// CHECK-NEXT:     xqciint              0.10      'Xqciint' (Qualcomm uC Interrupts Extension)
 // CHECK-NEXT:     xqciio               0.1       'Xqciio' (Qualcomm uC External Input Output Extension)
 // CHECK-NEXT:     xqcilb               0.2       'Xqcilb' (Qualcomm uC Long Branch Extension)
 // CHECK-NEXT:     xqcili               0.2       'Xqcili' (Qualcomm uC Load Large Immediate Extension)
 // CHECK-NEXT:     xqcilia              0.2       'Xqcilia' (Qualcomm uC Large Immediate Arithmetic Extension)
 // CHECK-NEXT:     xqcilo               0.3       'Xqcilo' (Qualcomm uC Large Offset Load Store Extension)
-// CHECK-NEXT:     xqcilsm              0.5       'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)
+// CHECK-NEXT:     xqcilsm              0.6       'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)
 // CHECK-NEXT:     xqcisim              0.2       'Xqcisim' (Qualcomm uC Simulation Hint Extension)
 // CHECK-NEXT:     xqcisls              0.2       'Xqcisls' (Qualcomm uC Scaled Load Store Extension)
 // CHECK-NEXT:     xqcisync             0.3       'Xqcisync' (Qualcomm uC Sync Delay Extension)
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 64f17f59575e..78890b605d83 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -445,58 +445,58 @@ The current vendor extensions supported are:
   LLVM implements `version 0.1 of the 16-bit Push/Pop instructions and double-moves extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqccmp_extension-0.1.0>`__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification.
 
 ``experimental-Xqcia``
-  LLVM implements `version 0.7 of the Qualcomm uC Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.7 of the Qualcomm uC Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqciac``
-  LLVM implements `version 0.3 of the Qualcomm uC Load-Store Address Calculation extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.3 of the Qualcomm uC Load-Store Address Calculation extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcibi``
-  LLVM implements `version 0.2 of the Qualcomm uC Branch Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Branch Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcibm``
-  LLVM implements `version 0.8 of the Qualcomm uC Bit Manipulation extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.8 of the Qualcomm uC Bit Manipulation extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcicli``
-  LLVM implements `version 0.3 of the Qualcomm uC Conditional Load Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.3 of the Qualcomm uC Conditional Load Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcicm``
-  LLVM implements `version 0.2 of the Qualcomm uC Conditional Move extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Conditional Move extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcics``
-  LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcicsr``
-  LLVM implements `version 0.3 of the Qualcomm uC CSR extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.4 of the Qualcomm uC CSR extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqciint``
-  LLVM implements `version 0.7 of the Qualcomm uC Interrupts extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.10 of the Qualcomm uC Interrupts extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqciio``
-  LLVM implements `version 0.1 of the Qualcomm uC External Input Output extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.1 of the Qualcomm uC External Input Output extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilb``
-  LLVM implements `version 0.2 of the Qualcomm uC Long Branch extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Long Branch extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcili``
-  LLVM implements `version 0.2 of the Qualcomm uC Load Large Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Load Large Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilia``
-  LLVM implements `version 0.2 of the Qualcomm uC Large Immediate Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Large Immediate Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilo``
-  LLVM implements `version 0.3 of the Qualcomm uC Large Offset Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.3 of the Qualcomm uC Large Offset Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilsm``
   LLVM implements `version 0.6 of the Qualcomm uC Load Store Multiple extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcisim``
-  LLVM implements `version 0.2 of the Qualcomm uC Simulation Hint extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Simulation Hint extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcisls``
-  LLVM implements `version 0.2 of the Qualcomm uC Scaled Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Scaled Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcisync``
-  LLVM implements `version 0.3 of the Qualcomm uC Sync Delay extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.3 of the Qualcomm uC Sync Delay extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``Xmipscmov``
   LLVM implements conditional move for the `p8700 processor <https://mips.com/products/hardware/p8700/>`__ by MIPS.
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 940caa4f4044..0f26c6f1e0a5 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1488,14 +1488,14 @@ def HasVendorXqcics
                          "'Xqcics' (Qualcomm uC Conditional Select Extension)">;
 
 def FeatureVendorXqcicsr
-    : RISCVExperimentalExtension<0, 3, "Qualcomm uC CSR Extension">;
+    : RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">;
 def HasVendorXqcicsr
     : Predicate<"Subtarget->hasVendorXqcicsr()">,
       AssemblerPredicate<(all_of FeatureVendorXqcicsr),
                          "'Xqcicsr' (Qualcomm uC CSR Extension)">;
 
 def FeatureVendorXqciint
-    : RISCVExperimentalExtension<0, 7, "Qualcomm uC Interrupts Extension",
+    : RISCVExperimentalExtension<0, 10, "Qualcomm uC Interrupts Extension",
                                  [FeatureStdExtZca]>;
 def HasVendorXqciint
     : Predicate<"Subtarget->hasVendorXqciint()">,
@@ -1542,7 +1542,7 @@ def HasVendorXqcilo
                          "'Xqcilo' (Qualcomm uC Large Offset Load Store Extension)">;
 
 def FeatureVendorXqcilsm
-    : RISCVExperimentalExtension<0, 5,
+    : RISCVExperimentalExtension<0, 6,
                                  "Qualcomm uC Load Store Multiple Extension">;
 def HasVendorXqcilsm
     : Predicate<"Subtarget->hasVendorXqcilsm()">,
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index ba8969b5a538..c9cfb2fb20b1 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -445,14 +445,14 @@
 ; RV32XQCICLI: .attribute 5, "rv32i2p1_xqcicli0p3"
 ; RV32XQCICM: .attribute 5, "rv32i2p1_zca1p0_xqcicm0p2"
 ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2"
-; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p3"
-; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p7"
+; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p4"
+; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p10"
 ; RV32XQCIIO: .attribute 5, "rv32i2p1_xqciio0p1"
 ; RV32XQCILB: .attribute 5, "rv32i2p1_zca1p0_xqcilb0p2"
 ; RV32XQCILI: .attribute 5, "rv32i2p1_zca1p0_xqcili0p2"
 ; RV32XQCILIA: .attribute 5, "rv32i2p1_zca1p0_xqcilia0p2"
 ; RV32XQCILO: .attribute 5, "rv32i2p1_zca1p0_xqcilo0p3"
-; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p5"
+; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p6"
 ; RV32XQCISIM: attribute 5, "rv32i2p1_zca1p0_xqcisim0p2"
 ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2"
 ; RV32XQCISYNC: attribute 5, "rv32i2p1_zca1p0_xqcisync0p3"
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 29bfa30848ec..0316e6470422 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -684,9 +684,9 @@ TEST(ParseArchString, RejectsConflictingExtensions) {
   for (StringRef Input :
        {"rv64i_xqcia0p7", "rv64i_xqciac0p3", "rv64i_xqcibi0p2",
         "rv64i_xqcibm0p8", "rv64i_xqcicli0p3", "rv64i_xqcicm0p2",
-        "rv64i_xqcics0p2", "rv64i_xqcicsr0p3", "rv64i_xqciint0p7",
+        "rv64i_xqcics0p2", "rv64i_xqcicsr0p4", "rv64i_xqciint0p10",
         "rv64i_xqciio0p1", "rv64i_xqcilb0p2", "rv64i_xqcili0p2",
-        "rv64i_xqcilia0p2", "rv64i_xqcilo0p3", "rv64i_xqcilsm0p5",
+        "rv64i_xqcilia0p2", "rv64i_xqcilo0p3", "rv64i_xqcilsm0p6",
         "rv64i_xqcisim0p2", "rv64i_xqcisls0p2", "rv64i_xqcisync0p3"}) {
     EXPECT_THAT(
         toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
@@ -1192,14 +1192,14 @@ Experimental extensions
     xqcicli              0.3
     xqcicm               0.2
     xqcics               0.2
-    xqcicsr              0.3
-    xqciint              0.7
+    xqcicsr              0.4
+    xqciint              0.10
     xqciio               0.1
     xqcilb               0.2
     xqcili               0.2
     xqcilia              0.2
     xqcilo               0.3
-    xqcilsm              0.5
+    xqcilsm              0.6
     xqcisim              0.2
     xqcisls              0.2
     xqcisync             0.3

From 98c6c371d6dc09454d541474ef65a0e47c4baae6 Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Mon, 16 Jun 2025 22:13:45 -0700
Subject: [PATCH 0589/1322] [RISCV] Xqccmp v0.3 (#137854)

All the changes for v0.2 and v0.3 are either already implemented, or
irrelevant to the compiler implementation.
---
 clang/test/Driver/print-supported-extensions-riscv.c | 2 +-
 llvm/docs/RISCVUsage.rst                             | 2 +-
 llvm/lib/Target/RISCV/RISCVFeatures.td               | 3 ++-
 llvm/test/CodeGen/RISCV/attributes.ll                | 4 ++--
 llvm/unittests/TargetParser/RISCVISAInfoTest.cpp     | 6 +++---
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index 33d8738d5a9b..e1f5a7a0105d 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -213,7 +213,7 @@
 // CHECK-NEXT:     smctr                1.0       'Smctr' (Control Transfer Records Machine Level)
 // CHECK-NEXT:     ssctr                1.0       'Ssctr' (Control Transfer Records Supervisor Level)
 // CHECK-NEXT:     svukte               0.3       'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses)
-// CHECK-NEXT:     xqccmp               0.1       'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves)
+// CHECK-NEXT:     xqccmp               0.3       'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves)
 // CHECK-NEXT:     xqcia                0.7       'Xqcia' (Qualcomm uC Arithmetic Extension)
 // CHECK-NEXT:     xqciac               0.3       'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
 // CHECK-NEXT:     xqcibi               0.2       'Xqcibi' (Qualcomm uC Branch Immediate Extension)
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 78890b605d83..aadda309feab 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -442,7 +442,7 @@ The current vendor extensions supported are:
   LLVM implements `the custom compressed opcodes present in some QingKe cores` by WCH / Nanjing Qinheng Microelectronics. The vendor refers to these opcodes by the name "XW".
 
 ``experimental-Xqccmp``
-  LLVM implements `version 0.1 of the 16-bit Push/Pop instructions and double-moves extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqccmp_extension-0.1.0>`__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification.
+  LLVM implements `version 0.3 of the 16-bit Push/Pop instructions and double-moves extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqccmp_extension-0.3.0>`__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification.
 
 ``experimental-Xqcia``
   LLVM implements `version 0.7 of the Qualcomm uC Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 0f26c6f1e0a5..0b3508426732 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1426,7 +1426,8 @@ def HasVendorXwchc
 // Qualcomm Extensions
 
 def FeatureVendorXqccmp
-    : RISCVExperimentalExtension<0, 1, "Qualcomm 16-bit Push/Pop and Double Moves",
+    : RISCVExperimentalExtension<0, 3,
+                                 "Qualcomm 16-bit Push/Pop and Double Moves",
                                  [FeatureStdExtZca]>;
 def HasVendorXqccmp
     : Predicate<"Subtarget->hasVendorXqccmp()">,
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index c9cfb2fb20b1..cdbf1caff5d8 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -437,7 +437,7 @@
 ; RV32XTHEADMEMPAIR: .attribute 5, "rv32i2p1_xtheadmempair1p0"
 ; RV32XTHEADSYNC: .attribute 5, "rv32i2p1_xtheadsync1p0"
 ; RV32XWCHC: .attribute 5, "rv32i2p1_zca1p0_xwchc2p2"
-; RV32XQCCMP: .attribute 5, "rv32i2p1_zca1p0_xqccmp0p1"
+; RV32XQCCMP: .attribute 5, "rv32i2p1_zca1p0_xqccmp0p3"
 ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p7"
 ; RV32XQCIAC: .attribute 5, "rv32i2p1_zca1p0_xqciac0p3"
 ; RV32XQCIBI: .attribute 5, "rv32i2p1_zca1p0_xqcibi0p2"
@@ -683,7 +683,7 @@
 ; RV64SSCTR: .attribute 5, "rv64i2p1_sscsrind1p0_ssctr1p0"
 ; RV64SDEXT: .attribute 5, "rv64i2p1_sdext1p0"
 ; RV64SDTRIG: .attribute 5, "rv64i2p1_sdtrig1p0"
-; RV64XQCCMP: .attribute 5, "rv64i2p1_zca1p0_xqccmp0p1"
+; RV64XQCCMP: .attribute 5, "rv64i2p1_zca1p0_xqccmp0p3"
 
 ; RVI20U32: .attribute 5, "rv32i2p1"
 ; RVI20U64: .attribute 5, "rv64i2p1"
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 0316e6470422..a0910a164ea0 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -695,13 +695,13 @@ TEST(ParseArchString, RejectsConflictingExtensions) {
 
   for (StringRef Input :
        {"rv32idc_xqciac0p3", "rv32i_zcd_xqciac0p3", "rv32idc_xqcicm0p2",
-        "rv32i_zcd_xqcicm0p2", "rv32idc_xqccmp0p1", "rv32i_zcd_xqccmp0p1"}) {
+        "rv32i_zcd_xqcicm0p2", "rv32idc_xqccmp0p3", "rv32i_zcd_xqccmp0p3"}) {
     EXPECT_THAT(
         toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
         ::testing::EndsWith("extension when 'd' extension is enabled"));
   }
 
-  for (StringRef Input : {"rv32i_zcmp_xqccmp0p1", "rv64i_zcmp_xqccmp0p1"}) {
+  for (StringRef Input : {"rv32i_zcmp_xqccmp0p3", "rv64i_zcmp_xqccmp0p3"}) {
     EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
               "'zcmp' and 'xqccmp' extensions are incompatible");
   }
@@ -1184,7 +1184,7 @@ Experimental extensions
     smctr                1.0
     ssctr                1.0
     svukte               0.3
-    xqccmp               0.1
+    xqccmp               0.3
     xqcia                0.7
     xqciac               0.3
     xqcibi               0.2

From e86740e6003739a41139d94e1643a3207f8fd8f8 Mon Sep 17 00:00:00 2001
From: no92 <no92@users.noreply.github.com>
Date: Tue, 17 Jun 2025 07:51:46 +0200
Subject: [PATCH 0590/1322] [clang] Add managarm support (#139271)

This PR is part of a series to upstream managarm support, as laid out in
the
[RFC](https://discourse.llvm.org/t/rfc-new-proposed-managarm-support-for-llvm-and-clang-87845/85884/1).
This PR is a follow-up to #87845 and #138854.
---
 clang/lib/Basic/Targets.cpp                   |   9 +
 clang/lib/Basic/Targets/OSTargets.h           |  30 ++
 clang/lib/Driver/CMakeLists.txt               |   1 +
 clang/lib/Driver/Driver.cpp                   |   4 +
 clang/lib/Driver/ToolChains/Gnu.cpp           |   2 +
 clang/lib/Driver/ToolChains/Managarm.cpp      | 218 ++++++++++++++
 clang/lib/Driver/ToolChains/Managarm.h        |  55 ++++
 clang/lib/Lex/InitHeaderSearch.cpp            |   1 +
 .../lib/aarch64-managarm-mlibc/.keep          |   0
 .../lib/riscv64-managarm-mlibc/.keep          |   0
 .../lib/x86_64-managarm-mlibc/.keep           |   0
 .../lib64/aarch64-managarm-mlibc/.keep        |   0
 .../lib64/riscv64-managarm-mlibc/.keep        |   0
 .../lib64/x86_64-managarm-mlibc/.keep         |   0
 .../aarch64-managarm-mlibc/c++/10/.keep       |   0
 .../usr/include/c++/10/.keep                  |   0
 .../usr/include/c++/v1/.keep                  |   0
 .../riscv64-managarm-mlibc/c++/10/.keep       |   0
 .../x86_64-managarm-mlibc/c++/10/.keep        |   0
 .../usr/lib/aarch64-managarm-mlibc/.keep      |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbegin.o   |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginS.o  |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginT.o  |   0
 .../usr/lib/riscv64-managarm-mlibc/.keep      |   0
 .../usr/lib/x86_64-managarm-mlibc/.keep       |   0
 .../basic_managarm_tree/usr/lib64/.keep       |   0
 clang/test/Driver/managarm.cpp                | 267 ++++++++++++++++++
 clang/test/Preprocessor/init.c                |   5 +
 .../predefined-macros-no-warnings.c           |   3 +
 35 files changed, 595 insertions(+)
 create mode 100644 clang/lib/Driver/ToolChains/Managarm.cpp
 create mode 100644 clang/lib/Driver/ToolChains/Managarm.h
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
 create mode 100644 clang/test/Driver/managarm.cpp

diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index 9889141ad208..af1111a86330 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -164,6 +164,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
       }
+    case llvm::Triple::Managarm:
+      return std::make_unique<ManagarmTargetInfo<AArch64leTargetInfo>>(Triple,
+                                                                       Opts);
     case llvm::Triple::NetBSD:
       return std::make_unique<NetBSDTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
@@ -466,6 +469,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<RISCV64TargetInfo>>(Triple,
                                                                    Opts);
       }
+    case llvm::Triple::Managarm:
+      return std::make_unique<ManagarmTargetInfo<RISCV64TargetInfo>>(Triple,
+                                                                     Opts);
     default:
       return std::make_unique<RISCV64TargetInfo>(Triple, Opts);
     }
@@ -654,6 +660,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
       return std::make_unique<PS5OSTargetInfo<X86_64TargetInfo>>(Triple, Opts);
     case llvm::Triple::Hurd:
       return std::make_unique<HurdTargetInfo<X86_64TargetInfo>>(Triple, Opts);
+    case llvm::Triple::Managarm:
+      return std::make_unique<ManagarmTargetInfo<X86_64TargetInfo>>(Triple,
+                                                                    Opts);
     default:
       return std::make_unique<X86_64TargetInfo>(Triple, Opts);
     }
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
index d148b38d03c7..5dac699c2bb4 100644
--- a/clang/lib/Basic/Targets/OSTargets.h
+++ b/clang/lib/Basic/Targets/OSTargets.h
@@ -395,6 +395,36 @@ public:
   }
 };
 
+// Managarm Target
+template <typename Target>
+class LLVM_LIBRARY_VISIBILITY ManagarmTargetInfo : public OSTargetInfo<Target> {
+protected:
+  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
+                    MacroBuilder &Builder) const override {
+    DefineStd(Builder, "unix", Opts);
+    Builder.defineMacro("__managarm__");
+    if (Opts.POSIXThreads)
+      Builder.defineMacro("_REENTRANT");
+    if (Opts.CPlusPlus)
+      Builder.defineMacro("_GNU_SOURCE");
+    if (this->HasFloat128)
+      Builder.defineMacro("__FLOAT128__");
+  }
+
+public:
+  ManagarmTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
+    switch (Triple.getArch()) {
+    default:
+      break;
+    case llvm::Triple::x86:
+    case llvm::Triple::x86_64:
+      this->HasFloat128 = true;
+      break;
+    }
+  }
+};
+
 // NetBSD Target
 template <typename Target>
 class LLVM_LIBRARY_VISIBILITY NetBSDTargetInfo : public OSTargetInfo<Target> {
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 44e16edfb1cc..3cfd671e9d8f 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -66,6 +66,7 @@ add_clang_library(clangDriver
   ToolChains/HLSL.cpp
   ToolChains/Hurd.cpp
   ToolChains/Linux.cpp
+  ToolChains/Managarm.cpp
   ToolChains/MipsLinux.cpp
   ToolChains/MinGW.cpp
   ToolChains/MSP430.cpp
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 060f76fb653c..6c27d8c67072 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -33,6 +33,7 @@
 #include "ToolChains/Linux.h"
 #include "ToolChains/MSP430.h"
 #include "ToolChains/MSVC.h"
+#include "ToolChains/Managarm.h"
 #include "ToolChains/MinGW.h"
 #include "ToolChains/MipsLinux.h"
 #include "ToolChains/NaCl.h"
@@ -6842,6 +6843,9 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
     case llvm::Triple::Fuchsia:
       TC = std::make_unique<toolchains::Fuchsia>(*this, Target, Args);
       break;
+    case llvm::Triple::Managarm:
+      TC = std::make_unique<toolchains::Managarm>(*this, Target, Args);
+      break;
     case llvm::Triple::Solaris:
       TC = std::make_unique<toolchains::Solaris>(*this, Target, Args);
       break;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 9203bbc91b0b..afce4fffe1d5 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -226,6 +226,8 @@ static const char *getLDMOption(const llvm::Triple &T, const ArgList &Args) {
       return "elf_iamcu";
     return "elf_i386";
   case llvm::Triple::aarch64:
+    if (T.isOSManagarm())
+      return "aarch64managarm";
     return "aarch64linux";
   case llvm::Triple::aarch64_be:
     return "aarch64linuxb";
diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp
new file mode 100644
index 000000000000..ff455f2c6ec7
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/Managarm.cpp
@@ -0,0 +1,218 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Managarm.h"
+#include "Arch/ARM.h"
+#include "Arch/RISCV.h"
+#include "clang/Config/config.h"
+#include "clang/Driver/CommonArgs.h"
+#include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
+#include "clang/Driver/SanitizerArgs.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/Path.h"
+
+using namespace clang::driver;
+using namespace clang::driver::toolchains;
+using namespace clang;
+using namespace llvm::opt;
+
+using tools::addPathIfExists;
+
+std::string Managarm::getMultiarchTriple(const Driver &D,
+                                         const llvm::Triple &TargetTriple,
+                                         StringRef SysRoot) const {
+  switch (TargetTriple.getArch()) {
+  default:
+    return TargetTriple.str();
+  case llvm::Triple::x86_64:
+    return "x86_64-managarm-" + TargetTriple.getEnvironmentName().str();
+  case llvm::Triple::aarch64:
+    return "aarch64-managarm-" + TargetTriple.getEnvironmentName().str();
+  case llvm::Triple::riscv64:
+    return "riscv64-managarm-" + TargetTriple.getEnvironmentName().str();
+  }
+}
+
+static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) {
+  // It happens that only x86, PPC and SPARC use the 'lib32' variant of
+  // oslibdir, and using that variant while targeting other architectures causes
+  // problems because the libraries are laid out in shared system roots that
+  // can't cope with a 'lib32' library search path being considered. So we only
+  // enable them when we know we may need it.
+  //
+  // FIXME: This is a bit of a hack. We should really unify this code for
+  // reasoning about oslibdir spellings with the lib dir spellings in the
+  // GCCInstallationDetector, but that is a more significant refactoring.
+  if (Triple.getArch() == llvm::Triple::x86 || Triple.isPPC32() ||
+      Triple.getArch() == llvm::Triple::sparc)
+    return "lib32";
+
+  if (Triple.getArch() == llvm::Triple::x86_64 && Triple.isX32())
+    return "libx32";
+
+  if (Triple.getArch() == llvm::Triple::riscv32)
+    return "lib32";
+
+  return Triple.isArch32Bit() ? "lib" : "lib64";
+}
+
+Managarm::Managarm(const Driver &D, const llvm::Triple &Triple,
+                   const ArgList &Args)
+    : Generic_ELF(D, Triple, Args) {
+  GCCInstallation.init(Triple, Args);
+  Multilibs = GCCInstallation.getMultilibs();
+  SelectedMultilibs.assign({GCCInstallation.getMultilib()});
+  std::string SysRoot = computeSysRoot();
+
+  ToolChain::path_list &PPaths = getProgramPaths();
+
+  Generic_GCC::PushPPaths(PPaths);
+
+#ifdef ENABLE_LINKER_BUILD_ID
+  ExtraOpts.push_back("--build-id");
+#endif
+
+  // The selection of paths to try here is designed to match the patterns which
+  // the GCC driver itself uses, as this is part of the GCC-compatible driver.
+  // This was determined by running GCC in a fake filesystem, creating all
+  // possible permutations of these directories, and seeing which ones it added
+  // to the link paths.
+  path_list &Paths = getFilePaths();
+
+  const std::string OSLibDir = std::string(getOSLibDir(Triple, Args));
+  const std::string MultiarchTriple = getMultiarchTriple(D, Triple, SysRoot);
+
+  Generic_GCC::AddMultilibPaths(D, SysRoot, OSLibDir, MultiarchTriple, Paths);
+
+  addPathIfExists(D, concat(SysRoot, "/lib", MultiarchTriple), Paths);
+  addPathIfExists(D, concat(SysRoot, "/lib/..", OSLibDir), Paths);
+  addPathIfExists(D, concat(SysRoot, "/usr/lib", MultiarchTriple), Paths);
+  addPathIfExists(D, concat(SysRoot, "/usr", OSLibDir), Paths);
+
+  Generic_GCC::AddMultiarchPaths(D, SysRoot, OSLibDir, Paths);
+
+  addPathIfExists(D, concat(SysRoot, "/lib"), Paths);
+  addPathIfExists(D, concat(SysRoot, "/usr/lib"), Paths);
+}
+
+bool Managarm::HasNativeLLVMSupport() const { return true; }
+
+Tool *Managarm::buildLinker() const {
+  return new tools::gnutools::Linker(*this);
+}
+
+Tool *Managarm::buildAssembler() const {
+  return new tools::gnutools::Assembler(*this);
+}
+
+std::string Managarm::computeSysRoot() const {
+  if (!getDriver().SysRoot.empty())
+    return getDriver().SysRoot;
+  return std::string();
+}
+
+std::string Managarm::getDynamicLinker(const ArgList &Args) const {
+  switch (getTriple().getArch()) {
+  case llvm::Triple::aarch64:
+    return "/lib/aarch64-managarm/ld.so";
+  case llvm::Triple::riscv64: {
+    StringRef ABIName = tools::riscv::getRISCVABI(Args, getTriple());
+    return ("/lib/riscv64-managarm/ld-riscv64-" + ABIName + ".so").str();
+  }
+  case llvm::Triple::x86_64:
+    return "/lib/x86_64-managarm/ld.so";
+  default:
+    llvm_unreachable("unsupported architecture");
+  }
+}
+
+void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
+                                         ArgStringList &CC1Args) const {
+  const Driver &D = getDriver();
+  std::string SysRoot = computeSysRoot();
+
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+    return;
+
+  if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
+    addSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/local/include");
+
+  // Add 'include' in the resource directory, which is similar to
+  // GCC_INCLUDE_DIR (private headers) in GCC.
+  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
+    SmallString<128> ResourceDirInclude(D.ResourceDir);
+    llvm::sys::path::append(ResourceDirInclude, "include");
+    addSystemInclude(DriverArgs, CC1Args, ResourceDirInclude);
+  }
+
+  if (DriverArgs.hasArg(options::OPT_nostdlibinc))
+    return;
+
+  // TOOL_INCLUDE_DIR
+  AddMultilibIncludeArgs(DriverArgs, CC1Args);
+
+  // Check for configure-time C include directories.
+  StringRef CIncludeDirs(C_INCLUDE_DIRS);
+  if (CIncludeDirs != "") {
+    SmallVector<StringRef, 5> dirs;
+    CIncludeDirs.split(dirs, ":");
+    for (StringRef dir : dirs) {
+      StringRef Prefix =
+          llvm::sys::path::is_absolute(dir) ? StringRef(SysRoot) : "";
+      addExternCSystemInclude(DriverArgs, CC1Args, Prefix + dir);
+    }
+    return;
+  }
+
+  // On systems using multiarch, add /usr/include/$triple before
+  // /usr/include.
+  std::string MultiarchIncludeDir = getMultiarchTriple(D, getTriple(), SysRoot);
+  if (!MultiarchIncludeDir.empty())
+    addExternCSystemInclude(
+        DriverArgs, CC1Args,
+        concat(SysRoot, "/usr/include", MultiarchIncludeDir));
+
+  // Add an include of '/include' directly. This isn't provided by default by
+  // system GCCs, but is often used with cross-compiling GCCs, and harmless to
+  // add even when Clang is acting as-if it were a system compiler.
+  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/include"));
+
+  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/usr/include"));
+}
+
+void Managarm::addLibStdCxxIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  // We need a detected GCC installation on Managarm to provide libstdc++'s
+  // headers.
+  if (!GCCInstallation.isValid())
+    return;
+
+  StringRef TripleStr = GCCInstallation.getTriple().str();
+
+  // Try generic GCC detection.
+  Generic_GCC::addGCCLibStdCxxIncludePaths(DriverArgs, CC1Args, TripleStr);
+}
+
+SanitizerMask Managarm::getSupportedSanitizers() const {
+  const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64;
+  SanitizerMask Res = ToolChain::getSupportedSanitizers();
+  Res |= SanitizerKind::PointerCompare;
+  Res |= SanitizerKind::PointerSubtract;
+  Res |= SanitizerKind::KernelAddress;
+  Res |= SanitizerKind::Vptr;
+  if (IsX86_64)
+    Res |= SanitizerKind::KernelMemory;
+  return Res;
+}
+
+void Managarm::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
+  for (const auto &Opt : ExtraOpts)
+    CmdArgs.push_back(Opt.c_str());
+}
diff --git a/clang/lib/Driver/ToolChains/Managarm.h b/clang/lib/Driver/ToolChains/Managarm.h
new file mode 100644
index 000000000000..2082e2c615f2
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/Managarm.h
@@ -0,0 +1,55 @@
+//===--- Managarm.h - Managarm ToolChain Implementations --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
+
+#include "Gnu.h"
+#include "clang/Driver/ToolChain.h"
+
+namespace clang {
+namespace driver {
+namespace toolchains {
+
+class LLVM_LIBRARY_VISIBILITY Managarm : public Generic_ELF {
+public:
+  Managarm(const Driver &D, const llvm::Triple &Triple,
+           const llvm::opt::ArgList &Args);
+
+  bool HasNativeLLVMSupport() const override;
+
+  std::string getMultiarchTriple(const Driver &D,
+                                 const llvm::Triple &TargetTriple,
+                                 StringRef SysRoot) const override;
+
+  void
+  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &CC1Args) const override;
+  void
+  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                           llvm::opt::ArgStringList &CC1Args) const override;
+  SanitizerMask getSupportedSanitizers() const override;
+  std::string computeSysRoot() const override;
+
+  std::string getDynamicLinker(const llvm::opt::ArgList &Args) const override;
+
+  void addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const override;
+
+  std::vector<std::string> ExtraOpts;
+
+protected:
+  Tool *buildAssembler() const override;
+  Tool *buildLinker() const override;
+};
+
+} // end namespace toolchains
+} // end namespace driver
+} // end namespace clang
+
+#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp
index 641e3beebc08..3e22b4001bde 100644
--- a/clang/lib/Lex/InitHeaderSearch.cpp
+++ b/clang/lib/Lex/InitHeaderSearch.cpp
@@ -221,6 +221,7 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths(
   case llvm::Triple::Hurd:
   case llvm::Triple::Linux:
   case llvm::Triple::LiteOS:
+  case llvm::Triple::Managarm:
   case llvm::Triple::NaCl:
   case llvm::Triple::NetBSD:
   case llvm::Triple::OpenBSD:
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/managarm.cpp b/clang/test/Driver/managarm.cpp
new file mode 100644
index 000000000000..5afa17aadb6d
--- /dev/null
+++ b/clang/test/Driver/managarm.cpp
@@ -0,0 +1,267 @@
+// UNSUPPORTED: system-windows
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-X86-64 %s
+// CHECK-X86-64:      "-cc1"
+// CHECK-X86-64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-X86-64-SAME: "-internal-externc-isystem"
+// CHECK-X86-64-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
+// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-X86-64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
+// CHECK-X86-64-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
+// CHECK-X86-64-SAME: "-L
+// CHECK-X86-64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-X86-64-LIBS %s
+// CHECK-X86-64-LIBS:      "-cc1"
+// CHECK-X86-64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-X86-64-LIBS-SAME: "-internal-externc-isystem"
+// CHECK-X86-64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-X86-64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-LIBS-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
+// CHECK-X86-64-LIBS-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
+// CHECK-X86-64-LIBS-SAME: "-L
+// CHECK-X86-64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-X86-64-STATIC %s
+// CHECK-X86-64-STATIC:      "-cc1"
+// CHECK-X86-64-STATIC-SAME: "-static-define"
+// CHECK-X86-64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-X86-64-STATIC-SAME: "-internal-externc-isystem"
+// CHECK-X86-64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-X86-64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-STATIC-SAME: "-static"
+// CHECK-X86-64-STATIC-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o"
+// CHECK-X86-64-STATIC-SAME: "-L
+// CHECK-X86-64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-X86-64-SHARED %s
+// CHECK-X86-64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-SHARED-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o"
+// CHECK-X86-64-SHARED-SAME: "-L
+// CHECK-X86-64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-AARCH64 %s
+// CHECK-AARCH64:      "-cc1"
+// CHECK-AARCH64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-AARCH64-SAME: "-internal-externc-isystem"
+// CHECK-AARCH64-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-AARCH64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
+// CHECK-AARCH64-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
+// CHECK-AARCH64-SAME: {{^}} "-L
+// CHECK-AARCH64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-LIBS %s
+// CHECK-AARCH64-LIBS:      "-cc1"
+// CHECK-AARCH64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-AARCH64-LIBS-SAME: "-internal-externc-isystem"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-AARCH64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-LIBS-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
+// CHECK-AARCH64-LIBS-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L
+// CHECK-AARCH64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-STATIC %s
+// CHECK-AARCH64-STATIC:      "-cc1"
+// CHECK-AARCH64-STATIC-SAME: "-static-define"
+// CHECK-AARCH64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-AARCH64-STATIC-SAME: "-internal-externc-isystem"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-AARCH64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-STATIC-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-static"
+// CHECK-AARCH64-STATIC-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L
+// CHECK-AARCH64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-SHARED %s
+// CHECK-AARCH64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-SHARED-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L
+// CHECK-AARCH64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-RISCV64 %s
+// CHECK-RISCV64:      "-cc1"
+// CHECK-RISCV64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-RISCV64-SAME: "-internal-externc-isystem"
+// CHECK-RISCV64-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-RISCV64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
+// CHECK-RISCV64-SAME: "-L
+// CHECK-RISCV64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-LIBS %s
+// CHECK-RISCV64-LIBS:      "-cc1"
+// CHECK-RISCV64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-RISCV64-LIBS-SAME: "-internal-externc-isystem"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-RISCV64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-LIBS-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
+// CHECK-RISCV64-LIBS-SAME: "-L
+// CHECK-RISCV64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-STATIC %s
+// CHECK-RISCV64-STATIC:      "-cc1"
+// CHECK-RISCV64-STATIC-SAME: "-static-define"
+// CHECK-RISCV64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-RISCV64-STATIC-SAME: "-internal-externc-isystem"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-RISCV64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-STATIC-SAME: "-static"
+// CHECK-RISCV64-STATIC-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o"
+// CHECK-RISCV64-STATIC-SAME: "-L
+// CHECK-RISCV64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-SHARED %s
+// CHECK-RISCV64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-SHARED-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o"
+// CHECK-RISCV64-SHARED-SAME: "-L
+// CHECK-RISCV64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 031a6c1a755b..bed39dc3e34d 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1622,6 +1622,11 @@
 // RUN: %clang_cc1 -x c -std=c99 -E -dM -ffreestanding -triple=amd64-unknown-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD-STDC-N %s
 // OPENBSD-STDC-N-NOT:#define __STDC_NO_THREADS__ 1
 //
+// RUN: %clang_cc1 -triple=aarch64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
+// RUN: %clang_cc1 -triple=riscv64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
+// RUN: %clang_cc1 -triple=x86_64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
+// MANAGARM: #define __managarm__ 1
+
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=xcore-none-none < /dev/null | FileCheck -match-full-lines -check-prefix XCORE %s
 // XCORE:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
 // XCORE:#define __LITTLE_ENDIAN__ 1
diff --git a/clang/test/Preprocessor/predefined-macros-no-warnings.c b/clang/test/Preprocessor/predefined-macros-no-warnings.c
index 4e3e29ccfa8a..fe27ed8814ee 100644
--- a/clang/test/Preprocessor/predefined-macros-no-warnings.c
+++ b/clang/test/Preprocessor/predefined-macros-no-warnings.c
@@ -14,6 +14,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux-openhos
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-netbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-openbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-win32-gnu
@@ -108,6 +109,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux-openhos
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-solaris
@@ -167,6 +169,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-nacl
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps4
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps5
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir64
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv32

From 41b9d28327bf20befe63a683b2a2f90670837b2f Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mpeis@arm.com>
Date: Tue, 17 Jun 2025 07:42:57 +0100
Subject: [PATCH 0591/1322] [BOLT][NFC] Using target_triple in lit config
 (#144078)

---
 bolt/test/lit.local.cfg | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/bolt/test/lit.local.cfg b/bolt/test/lit.local.cfg
index d5a6849b27a7..8a61d11f5825 100644
--- a/bolt/test/lit.local.cfg
+++ b/bolt/test/lit.local.cfg
@@ -1,6 +1,11 @@
-host_linux_triple = config.target_triple.split("-")[0] + "-unknown-linux-gnu"
+host_triple = config.target_triple
+
+# Force triple on non-linux hosts to get ELF binaries on all platforms.
+if not "linux" in host_triple:
+  host_triple = host_triple.split("-")[0] + "-unknown-linux-gnu"
+
 common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -Wl,--build-id=none -pie"
-flags = f"--target={host_linux_triple} -fPIE {common_linker_flags}"
+flags = f"--target={host_triple} -fPIE {common_linker_flags}"
 
 config.substitutions.insert(0, ("%cflags", f"%cflags {flags}"))
 config.substitutions.insert(0, ("%cxxflags", f"%cxxflags {flags}"))

From 7e6c1bd3edf4fc19be70587a4ac33a76bab78c02 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Mon, 16 Jun 2025 23:54:40 -0700
Subject: [PATCH 0592/1322] [BOLT][NFCI] Simplify DataAggregator using traces
 (#143289)

Consistently apply traces as defined in #127125 for branch profile
aggregation. This combines branches and fall-through records into one.

With large input binaries/profiles, the speed up in aggregation time
(`-time-aggr`, wall time):
- perf.data, pre-BOLT input: 154.5528s -> 144.0767s
- pre-aggregated data, pre-BOLT input: 15.1026s -> 9.0711s
- pre-aggregated data, BOLTed input: 15.4871s -> 10.0077s

Test Plan: NFC
---
 bolt/include/bolt/Profile/DataAggregator.h |  54 ++++--
 bolt/lib/Profile/DataAggregator.cpp        | 182 ++++++++-------------
 2 files changed, 104 insertions(+), 132 deletions(-)

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 3f07a6dc03a4..10d96fbeca3e 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -99,24 +99,28 @@ private:
     uint64_t Addr;
   };
 
+  /// Container for the unit of branch data.
+  /// Backwards compatible with legacy use for branches and fall-throughs:
+  /// - if \p Branch is FT_ONLY or FT_EXTERNAL_ORIGIN, the trace only
+  ///   contains fall-through data,
+  /// - if \p To is BR_ONLY, the trace only contains branch data.
   struct Trace {
+    static constexpr const uint64_t EXTERNAL = 0ULL;
+    static constexpr const uint64_t BR_ONLY = -1ULL;
+    static constexpr const uint64_t FT_ONLY = -1ULL;
+    static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL;
+
+    uint64_t Branch;
     uint64_t From;
     uint64_t To;
-    Trace(uint64_t From, uint64_t To) : From(From), To(To) {}
-    bool operator==(const Trace &Other) const {
-      return From == Other.From && To == Other.To;
-    }
+    auto tie() const { return std::tie(Branch, From, To); }
+    bool operator==(const Trace &Other) const { return tie() == Other.tie(); }
+    bool operator<(const Trace &Other) const { return tie() < Other.tie(); }
   };
+  friend raw_ostream &operator<<(raw_ostream &OS, const Trace &);
 
   struct TraceHash {
-    size_t operator()(const Trace &L) const {
-      return std::hash<uint64_t>()(L.From << 32 | L.To);
-    }
-  };
-
-  struct FTInfo {
-    uint64_t InternCount{0};
-    uint64_t ExternCount{0};
+    size_t operator()(const Trace &L) const { return hash_combine(L.tie()); }
   };
 
   struct TakenBranchInfo {
@@ -126,8 +130,8 @@ private:
 
   /// Intermediate storage for profile data. We save the results of parsing
   /// and use them later for processing and assigning profile.
-  std::unordered_map<Trace, TakenBranchInfo, TraceHash> BranchLBRs;
-  std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs;
+  std::unordered_map<Trace, TakenBranchInfo, TraceHash> TraceMap;
+  std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
   std::unordered_map<uint64_t, uint64_t> BasicSamples;
   std::vector<PerfMemSample> MemSamples;
 
@@ -200,8 +204,8 @@ private:
   /// Return a vector of offsets corresponding to a trace in a function
   /// if the trace is valid, std::nullopt otherwise.
   std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
-  getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First,
-                         const LBREntry &Second, uint64_t Count = 1) const;
+  getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
+                         uint64_t Count) const;
 
   /// Record external entry into the function \p BF.
   ///
@@ -265,8 +269,7 @@ private:
   bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
 
   /// Register a trace between two LBR entries supplied in execution order.
-  bool doTrace(const LBREntry &First, const LBREntry &Second,
-               uint64_t Count = 1);
+  bool doTrace(const Trace &Trace, uint64_t Count);
 
   /// Parser helpers
   /// Return false if we exhausted our parser buffer and finished parsing
@@ -516,6 +519,21 @@ inline raw_ostream &operator<<(raw_ostream &OS,
   OS << formatv("{0:x} -> {1:x}/{2}", L.From, L.To, L.Mispred ? 'M' : 'P');
   return OS;
 }
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const DataAggregator::Trace &T) {
+  switch (T.Branch) {
+  case DataAggregator::Trace::FT_ONLY:
+  case DataAggregator::Trace::FT_EXTERNAL_ORIGIN:
+    break;
+  default:
+    OS << Twine::utohexstr(T.Branch) << " -> ";
+  }
+  OS << Twine::utohexstr(T.From);
+  if (T.To != DataAggregator::Trace::BR_ONLY)
+    OS << " ... " << Twine::utohexstr(T.To);
+  return OS;
+}
 } // namespace bolt
 } // namespace llvm
 
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index ade8478f556e..118629b04f6f 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -523,6 +523,10 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
   deleteTempFiles();
 
 heatmap:
+  // Sort parsed traces for faster processing.
+  if (!opts::BasicAggregation)
+    llvm::sort(Traces, llvm::less_first());
+
   if (!opts::HeatmapMode)
     return Error::success();
 
@@ -598,8 +602,7 @@ void DataAggregator::processProfile(BinaryContext &BC) {
     llvm::stable_sort(MemEvents.second.Data);
 
   // Release intermediate storage.
-  clear(BranchLBRs);
-  clear(FallthroughLBRs);
+  clear(Traces);
   clear(BasicSamples);
   clear(MemSamples);
 }
@@ -780,37 +783,19 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
   return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds);
 }
 
-bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
-                             uint64_t Count) {
-  BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(First.To);
-  BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(Second.From);
+bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
+  const uint64_t From = Trace.From, To = Trace.To;
+  BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From);
+  BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To);
+  NumTraces += Count;
   if (!FromFunc || !ToFunc) {
-    LLVM_DEBUG({
-      dbgs() << "Out of range trace starting in ";
-      if (FromFunc)
-        dbgs() << formatv("{0} @ {1:x}", *FromFunc,
-                          First.To - FromFunc->getAddress());
-      else
-        dbgs() << Twine::utohexstr(First.To);
-      dbgs() << " and ending in ";
-      if (ToFunc)
-        dbgs() << formatv("{0} @ {1:x}", *ToFunc,
-                          Second.From - ToFunc->getAddress());
-      else
-        dbgs() << Twine::utohexstr(Second.From);
-      dbgs() << '\n';
-    });
+    LLVM_DEBUG(dbgs() << "Out of range trace " << Trace << '\n');
     NumLongRangeTraces += Count;
     return false;
   }
   if (FromFunc != ToFunc) {
+    LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n');
     NumInvalidTraces += Count;
-    LLVM_DEBUG({
-      dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
-             << formatv(" @ {0:x}", First.To - FromFunc->getAddress())
-             << " and ending in " << ToFunc->getPrintName()
-             << formatv(" @ {0:x}\n", Second.From - ToFunc->getAddress());
-    });
     return false;
   }
 
@@ -818,28 +803,21 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
   BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
   if (!ParentFunc)
     ParentFunc = FromFunc;
-  ParentFunc->SampleCountInBytes += Count * (Second.From - First.To);
+  ParentFunc->SampleCountInBytes += Count * (To - From);
 
   const uint64_t FuncAddress = FromFunc->getAddress();
   std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
       BAT && BAT->isBATFunction(FuncAddress)
-          ? BAT->getFallthroughsInTrace(FuncAddress, First.To, Second.From)
-          : getFallthroughsInTrace(*FromFunc, First, Second, Count);
+          ? BAT->getFallthroughsInTrace(FuncAddress, From, To)
+          : getFallthroughsInTrace(*FromFunc, Trace, Count);
   if (!FTs) {
-    LLVM_DEBUG(
-        dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
-               << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
-               << " and ending in " << ToFunc->getPrintName() << " @ "
-               << ToFunc->getPrintName() << " @ "
-               << Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n');
+    LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n');
     NumInvalidTraces += Count;
     return false;
   }
 
   LLVM_DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for "
-                    << FromFunc->getPrintName() << ":"
-                    << Twine::utohexstr(First.To) << " to "
-                    << Twine::utohexstr(Second.From) << ".\n");
+                    << FromFunc->getPrintName() << ":" << Trace << '\n');
   for (auto [From, To] : *FTs) {
     if (BAT) {
       From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true);
@@ -852,17 +830,15 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
 }
 
 std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
-DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
-                                       const LBREntry &FirstLBR,
-                                       const LBREntry &SecondLBR,
+DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
                                        uint64_t Count) const {
   SmallVector<std::pair<uint64_t, uint64_t>, 16> Branches;
 
   BinaryContext &BC = BF.getBinaryContext();
 
   // Offsets of the trace within this function.
-  const uint64_t From = FirstLBR.To - BF.getAddress();
-  const uint64_t To = SecondLBR.From - BF.getAddress();
+  const uint64_t From = Trace.From - BF.getAddress();
+  const uint64_t To = Trace.To - BF.getAddress();
 
   if (From > To)
     return std::nullopt;
@@ -889,8 +865,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
 
   // Adjust FromBB if the first LBR is a return from the last instruction in
   // the previous block (that instruction should be a call).
-  if (From == FromBB->getOffset() && !BF.containsAddress(FirstLBR.From) &&
-      !FromBB->isEntryPoint() && !FromBB->isLandingPad()) {
+  if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) &&
+      From == FromBB->getOffset() && !FromBB->isEntryPoint() &&
+      !FromBB->isLandingPad()) {
     const BinaryBasicBlock *PrevBB =
         BF.getLayout().getBlock(FromBB->getIndex() - 1);
     if (PrevBB->getSuccessor(FromBB->getLabel())) {
@@ -898,10 +875,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
       if (Instr && BC.MIB->isCall(*Instr))
         FromBB = PrevBB;
       else
-        LLVM_DEBUG(dbgs() << "invalid incoming LBR (no call): " << FirstLBR
-                          << '\n');
+        LLVM_DEBUG(dbgs() << "invalid trace (no call): " << Trace << '\n');
     } else {
-      LLVM_DEBUG(dbgs() << "invalid incoming LBR: " << FirstLBR << '\n');
+      LLVM_DEBUG(dbgs() << "invalid trace: " << Trace << '\n');
     }
   }
 
@@ -920,9 +896,7 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
 
     // Check for bad LBRs.
     if (!BB->getSuccessor(NextBB->getLabel())) {
-      LLVM_DEBUG(dbgs() << "no fall-through for the trace:\n"
-                        << "  " << FirstLBR << '\n'
-                        << "  " << SecondLBR << '\n');
+      LLVM_DEBUG(dbgs() << "no fall-through for the trace: " << Trace << '\n');
       return std::nullopt;
     }
 
@@ -1227,14 +1201,15 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     FT_EXTERNAL_ORIGIN // f
   } Type = INVALID;
 
-  // The number of fields to parse, set based on Type.
+  /// The number of fields to parse, set based on \p Type.
   int AddrNum = 0;
   int CounterNum = 0;
-  // Storage for parsed fields.
+  /// Storage for parsed fields.
   StringRef EventName;
   std::optional<Location> Addr[3];
   int64_t Counters[2] = {0};
 
+  /// Parse strings: record type and optionally an event name.
   while (Type == INVALID || Type == EVENT_NAME) {
     while (checkAndConsumeFS()) {
     }
@@ -1268,6 +1243,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1);
   }
 
+  /// Parse locations depending on entry type, recording them in \p Addr array.
   for (int I = 0; I < AddrNum; ++I) {
     while (checkAndConsumeFS()) {
     }
@@ -1277,6 +1253,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     Addr[I] = AddrOrErr.get();
   }
 
+  /// Parse counters depending on entry type.
   for (int I = 0; I < CounterNum; ++I) {
     while (checkAndConsumeFS()) {
     }
@@ -1287,11 +1264,13 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     Counters[I] = CountOrErr.get();
   }
 
+  /// Expect end of line here.
   if (!checkAndConsumeNewLine()) {
     reportError("expected end of line");
     return make_error_code(llvm::errc::io_error);
   }
 
+  /// Record event name into \p EventNames and return.
   if (Type == EVENT_NAME) {
     EventNames.insert(EventName);
     return std::error_code();
@@ -1305,6 +1284,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
   int64_t Count = Counters[0];
   int64_t Mispreds = Counters[1];
 
+  /// Record basic IP sample into \p BasicSamples and return.
   if (Type == SAMPLE) {
     BasicSamples[FromOffset] += Count;
     NumTotalSamples += Count;
@@ -1316,29 +1296,25 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
   if (ToFunc)
     ToFunc->setHasProfileAvailable();
 
-  Trace Trace(FromOffset, ToOffset);
-  // Taken trace
-  if (Type == TRACE || Type == BRANCH) {
-    TakenBranchInfo &Info = BranchLBRs[Trace];
-    Info.TakenCount += Count;
-    Info.MispredCount += Mispreds;
+  /// For legacy fall-through types, adjust locations to match Trace container.
+  if (Type == FT || Type == FT_EXTERNAL_ORIGIN) {
+    Addr[2] = Location(Addr[1]->Offset); // Trace To
+    Addr[1] = Location(Addr[0]->Offset); // Trace From
+    // Put a magic value into Trace Branch to differentiate from a full trace.
+    Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN);
+  }
 
-    NumTotalSamples += Count;
+  /// For legacy branch type, mark Trace To to differentite from a full trace.
+  if (Type == BRANCH) {
+    Addr[2] = Location(Trace::BR_ONLY);
   }
-  // Construct fallthrough part of the trace
-  if (Type == TRACE) {
-    const uint64_t TraceFtEndOffset = Addr[2]->Offset;
-    Trace.From = ToOffset;
-    Trace.To = TraceFtEndOffset;
-    Type = FromFunc == ToFunc ? FT : FT_EXTERNAL_ORIGIN;
-  }
-  // Add fallthrough trace
-  if (Type != BRANCH) {
-    FTInfo &Info = FallthroughLBRs[Trace];
-    (Type == FT ? Info.InternCount : Info.ExternCount) += Count;
 
-    NumTraces += Count;
-  }
+  /// Record a trace.
+  Trace T{Addr[0]->Offset, Addr[1]->Offset, Addr[2]->Offset};
+  TakenBranchInfo TI{(uint64_t)Count, (uint64_t)Mispreds};
+  Traces.emplace_back(T, TI);
+
+  NumTotalSamples += Count;
 
   return std::error_code();
 }
@@ -1350,7 +1326,7 @@ bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const {
 
 std::error_code DataAggregator::printLBRHeatMap() {
   outs() << "PERF2BOLT: parse branch events...\n";
-  NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
+  NamedRegionTimer T("buildHeatmap", "Building heatmap", TimerGroupName,
                      TimerGroupDesc, opts::TimeAggregator);
 
   if (BC->IsLinuxKernel) {
@@ -1386,12 +1362,9 @@ std::error_code DataAggregator::printLBRHeatMap() {
   // Register basic samples and perf LBR addresses not covered by fallthroughs.
   for (const auto &[PC, Hits] : BasicSamples)
     HM.registerAddress(PC, Hits);
-  for (const auto &LBR : FallthroughLBRs) {
-    const Trace &Trace = LBR.first;
-    const FTInfo &Info = LBR.second;
-    HM.registerAddressRange(Trace.From, Trace.To,
-                            Info.InternCount + Info.ExternCount);
-  }
+  for (const auto &[Trace, Info] : Traces)
+    if (Trace.To != Trace::BR_ONLY)
+      HM.registerAddressRange(Trace.From, Trace.To, Info.TakenCount);
 
   if (HM.getNumInvalidRanges())
     outs() << "HEATMAP: invalid traces: " << HM.getNumInvalidRanges() << '\n';
@@ -1437,22 +1410,10 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample,
     // chronological order)
     if (NeedsSkylakeFix && NumEntry <= 2)
       continue;
-    if (NextLBR) {
-      // Record fall-through trace.
-      const uint64_t TraceFrom = LBR.To;
-      const uint64_t TraceTo = NextLBR->From;
-      const BinaryFunction *TraceBF =
-          getBinaryFunctionContainingAddress(TraceFrom);
-      FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
-      if (TraceBF && TraceBF->containsAddress(LBR.From))
-        ++Info.InternCount;
-      else
-        ++Info.ExternCount;
-      ++NumTraces;
-    }
+    uint64_t TraceTo = NextLBR ? NextLBR->From : Trace::BR_ONLY;
     NextLBR = &LBR;
 
-    TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)];
+    TakenBranchInfo &Info = TraceMap[Trace{LBR.From, LBR.To, TraceTo}];
     ++Info.TakenCount;
     Info.MispredCount += LBR.Mispred;
   }
@@ -1563,10 +1524,14 @@ std::error_code DataAggregator::parseBranchEvents() {
     parseLBRSample(Sample, NeedsSkylakeFix);
   }
 
-  for (const Trace &Trace : llvm::make_first_range(BranchLBRs))
-    for (const uint64_t Addr : {Trace.From, Trace.To})
+  Traces.reserve(TraceMap.size());
+  for (const auto &[Trace, Info] : TraceMap) {
+    Traces.emplace_back(Trace, Info);
+    for (const uint64_t Addr : {Trace.Branch, Trace.From})
       if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr))
         BF->setHasProfileAvailable();
+  }
+  clear(TraceMap);
 
   outs() << "PERF2BOLT: read " << NumSamples << " samples and " << NumEntries
          << " LBR entries\n";
@@ -1591,23 +1556,12 @@ void DataAggregator::processBranchEvents() {
   NamedRegionTimer T("processBranch", "Processing branch events",
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
 
-  for (const auto &AggrLBR : FallthroughLBRs) {
-    const Trace &Loc = AggrLBR.first;
-    const FTInfo &Info = AggrLBR.second;
-    LBREntry First{Loc.From, Loc.From, false};
-    LBREntry Second{Loc.To, Loc.To, false};
-    if (Info.InternCount)
-      doTrace(First, Second, Info.InternCount);
-    if (Info.ExternCount) {
-      First.From = 0;
-      doTrace(First, Second, Info.ExternCount);
-    }
-  }
-
-  for (const auto &AggrLBR : BranchLBRs) {
-    const Trace &Loc = AggrLBR.first;
-    const TakenBranchInfo &Info = AggrLBR.second;
-    doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
+  for (const auto &[Trace, Info] : Traces) {
+    if (Trace.Branch != Trace::FT_ONLY &&
+        Trace.Branch != Trace::FT_EXTERNAL_ORIGIN)
+      doBranch(Trace.Branch, Trace.From, Info.TakenCount, Info.MispredCount);
+    if (Trace.To != Trace::BR_ONLY)
+      doTrace(Trace, Info.TakenCount);
   }
   printBranchSamplesDiagnostics();
 }

From 80b79ce432bbe12701fd9fe495ff9feeb5e4b9ca Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 09:30:28 +0200
Subject: [PATCH 0593/1322] [ConstantFolding] Handle reading from type padding
 (#144330)

ReadDataFromGlobal() did not handle reads from the padding of types (in
the sense of type store size != type alloc size, rather than struct
padding).

Return zero in that case.

Fixes https://github.com/llvm/llvm-project/issues/144279.
---
 llvm/lib/Analysis/ConstantFolding.cpp         |  4 +++
 .../InstSimplify/ConstProp/loads.ll           | 36 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 2b7a438a9ef0..b58f9b26a865 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -432,6 +432,10 @@ bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, unsigned char *CurPtr,
   assert(ByteOffset <= DL.getTypeAllocSize(C->getType()) &&
          "Out of range access");
 
+  // Reading type padding, return zero.
+  if (ByteOffset >= DL.getTypeStoreSize(C->getType()))
+    return true;
+
   // If this element is zero or undefined, we can just return since *CurPtr is
   // zero initialized.
   if (isa<ConstantAggregateZero>(C) || isa<UndefValue>(C))
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
index dd75560e25ce..061c6834eb97 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
@@ -441,3 +441,39 @@ define i128 @load-128bit(){
   %1 = load i128, ptr @global128, align 4
   ret i128 %1
 }
+
+
+@i40_struct = constant { i40, i8 } { i40 0, i8 1 }
+@i40_array = constant [2 x i40] [i40 0, i40 1]
+
+define i8 @load_i40_struct_padding() {
+; CHECK-LABEL: @load_i40_struct_padding(
+; CHECK-NEXT:    ret i8 0
+;
+  %v = load i8, ptr getelementptr (i8, ptr @i40_struct, i64 6)
+  ret i8 %v
+}
+
+define i16 @load_i40_struct_partial_padding() {
+; CHECK-LABEL: @load_i40_struct_partial_padding(
+; CHECK-NEXT:    ret i16 0
+;
+  %v = load i16, ptr getelementptr (i8, ptr @i40_struct, i64 4)
+  ret i16 %v
+}
+
+define i8 @load_i40_array_padding() {
+; CHECK-LABEL: @load_i40_array_padding(
+; CHECK-NEXT:    ret i8 0
+;
+  %v = load i8, ptr getelementptr (i8, ptr @i40_array, i64 6)
+  ret i8 %v
+}
+
+define i16 @load_i40_array_partial_padding() {
+; CHECK-LABEL: @load_i40_array_partial_padding(
+; CHECK-NEXT:    ret i16 0
+;
+  %v = load i16, ptr getelementptr (i8, ptr @i40_array, i64 4)
+  ret i16 %v
+}

From bb70023cbfecf7880e4cc89966947ef475e070e9 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 09:49:18 +0200
Subject: [PATCH 0594/1322] [MemoryLocation][DSE] Allow other read effects in
 MemoryLocation::getForDest() (#144343)

MemoryLocation::getForDest() returns a (potentially) written location,
while still allowing other reads. Currently, this is limited to
argmemonly functions. However, we can ignore other (non-argmem) read
effects here for the same reason we can ignore argument reads.

Fixes https://github.com/llvm/llvm-project/issues/144300.

Proof: https://alive2.llvm.org/ce/z/LKq_dc
---
 llvm/lib/Analysis/MemoryLocation.cpp          |  4 ++-
 .../DeadStoreElimination/trivial-dse-calls.ll | 32 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 3b42bb412b9b..c8daab7abde1 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -111,7 +111,9 @@ MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) {
 
 std::optional<MemoryLocation>
 MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) {
-  if (!CB->onlyAccessesArgMemory())
+  // Check that the only possible writes are to arguments.
+  MemoryEffects WriteME = CB->getMemoryEffects() & MemoryEffects::writeOnly();
+  if (!WriteME.onlyAccessesArgPointees())
     return std::nullopt;
 
   if (CB->hasOperandBundles())
diff --git a/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll b/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll
index 030d315bfd92..df2feb087e39 100644
--- a/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll
@@ -286,3 +286,35 @@ define void @test_dse_non_alloca() {
   ret void
 }
 
+define void @test_other_read_effects() {
+; CHECK-LABEL: @test_other_read_effects(
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  call void @f(ptr %a) memory(read, argmem: readwrite) nounwind willreturn
+  ret void
+}
+
+define i32 @test_other_read_effects_read_after() {
+; CHECK-LABEL: @test_other_read_effects_read_after(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @f(ptr [[A]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %a = alloca i32, align 4
+  call void @f(ptr %a) memory(read, argmem: readwrite) nounwind willreturn
+  %v = load i32, ptr %a
+  ret i32 %v
+}
+
+define void @test_other_write_effects() {
+; CHECK-LABEL: @test_other_write_effects(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @f(ptr [[A]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  call void @f(ptr %a) memory(write, argmem: readwrite) nounwind willreturn
+  ret void
+}

From 632151fbeea972f4aa3c14921eca1e45c07646f3 Mon Sep 17 00:00:00 2001
From: gaynor-anthropic <gaynor@anthropic.com>
Date: Tue, 17 Jun 2025 00:52:18 -0700
Subject: [PATCH 0595/1322] InstCombine: improve optimizations for ceiling
 division with no overflow (#142869)

Fixes #142497.

Alive2: https://alive2.llvm.org/ce/z/CeaHaH

The contents of this pull request were substantially written using
claude-code. I've reviewed to the best of my ability (it's been years
since I did any compilers work).

---------

Co-authored-by: Yingwei Zheng <dtcxzyw@qq.com>
Co-authored-by: Nikita Popov <github@npopov.com>
---
 .../InstCombine/InstCombineAddSub.cpp         |  28 ++
 llvm/test/Transforms/InstCombine/add.ll       | 261 ++++++++++++++++++
 2 files changed, 289 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index c1ce364eb179..0a3837f2c0ce 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1787,6 +1787,34 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   if (Instruction *Ashr = foldAddToAshr(I))
     return Ashr;
 
+  // Ceiling division by power-of-2:
+  // (X >> log2(N)) + zext(X & (N-1) != 0) --> (X + (N-1)) >> log2(N)
+  // This is valid when adding (N-1) to X doesn't overflow.
+  {
+    Value *X;
+    const APInt *ShiftAmt, *Mask;
+    CmpPredicate Pred;
+
+    // Match: (X >> C) + zext((X & Mask) != 0)
+    // or:    zext((X & Mask) != 0) + (X >> C)
+    if (match(&I, m_c_Add(m_OneUse(m_LShr(m_Value(X), m_APInt(ShiftAmt))),
+                          m_ZExt(m_SpecificICmp(
+                              ICmpInst::ICMP_NE,
+                              m_And(m_Deferred(X), m_LowBitMask(Mask)),
+                              m_ZeroInt())))) &&
+        Mask->popcount() == *ShiftAmt) {
+
+      // Check if X + Mask doesn't overflow
+      Constant *MaskC = ConstantInt::get(X->getType(), *Mask);
+      if (willNotOverflowUnsignedAdd(X, MaskC, I)) {
+        // (X + Mask) >> ShiftAmt
+        Value *Add = Builder.CreateNUWAdd(X, MaskC);
+        return BinaryOperator::CreateLShr(
+            Add, ConstantInt::get(X->getType(), *ShiftAmt));
+      }
+    }
+  }
+
   // (~X) + (~Y) --> -2 - (X + Y)
   {
     // To ensure we can save instructions we need to ensure that we consume both
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 495f99824652..a16e30bb4945 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -4273,4 +4273,265 @@ define i32 @fold_zext_nneg_add_const_fail2(i8 %x) {
 }
 
 declare void @llvm.assume(i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+; Ceiling division by power-of-2: (x >> log2(N)) + ((x & (N-1)) != 0) -> (x + (N-1)) >> log2(N)
+; This is only valid when x + (N-1) doesn't overflow
+
+; Test with known range that prevents overflow
+define i32 @ceil_div_by_8_known_range(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_by_8_known_range(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Test with the exact IR from the original testcase
+define i32 @ceil_div_from_clz(i32 %v) {
+; CHECK-LABEL: @ceil_div_from_clz(
+; CHECK-NEXT:    [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw nsw i32 39, [[CTLZ]]
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ctlz = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %v, i1 false)
+  %sub = sub nuw nsw i32 32, %ctlz
+  %shr = lshr i32 %sub, 3
+  %and = and i32 %sub, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add nuw nsw i32 %shr, %ext
+  ret i32 %r
+}
+
+; Vector version with known range
+define <2 x i32> @ceil_div_by_8_vec_range(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_8_vec_range(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <2 x i32> [[X:%.*]], splat (i32 7)
+; CHECK-NEXT:    [[R:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3)
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+  %and = and <2 x i32> %x, <i32 7, i32 7>
+  %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+  %ext = zext <2 x i1> %cmp to <2 x i32>
+  %r = add <2 x i32> %shr, %ext
+  ret <2 x i32> %r
+}
+
+; Ceiling division by 16 with known range
+define i16 @ceil_div_by_16_i16(i16 range(i16 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_16_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i16 [[X:%.*]], 15
+; CHECK-NEXT:    [[R:%.*]] = lshr i16 [[TMP1]], 4
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %shr = lshr i16 %x, 4
+  %and = and i16 %x, 15
+  %cmp = icmp ne i16 %and, 0
+  %ext = zext i1 %cmp to i16
+  %r = add i16 %shr, %ext
+  ret i16 %r
+}
+
+; Negative test: no overflow guarantee - should NOT optimize
+define i32 @ceil_div_by_8_no_overflow_info(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_no_overflow_info(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: nuw on final add doesn't help
+define i32 @ceil_div_by_8_only_nuw_on_add(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_only_nuw_on_add(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add nuw i32 %shr, %ext  ; nuw here doesn't prove x+7 won't overflow
+  ret i32 %r
+}
+
+; Negative test: wrong mask
+define i32 @ceil_div_wrong_mask(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_mask(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 6
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 6  ; Wrong mask: should be 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: wrong shift amount
+define i32 @ceil_div_wrong_shift(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_shift(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 4
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 4  ; Shift by 4, but mask is 7 (should be 15)
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: wrong comparison
+define i32 @ceil_div_wrong_cmp(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_cmp(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp eq i32 %and, 0  ; Wrong: should be ne
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Multi-use test: all intermediate values have uses
+define i32 @ceil_div_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    call void @use_i32(i32 [[AND]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  call void @use_i32(i32 %shr)
+  %and = and i32 %x, 7
+  call void @use_i32(i32 %and)
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Commuted test: add operands are swapped  
+define i32 @ceil_div_commuted(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %ext, %shr  ; Operands swapped
+  ret i32 %r
+}
+
+; Commuted with multi-use
+define i32 @ceil_div_commuted_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  call void @use_i32(i32 %shr)
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %ext, %shr  ; Operands swapped
+  ret i32 %r
+}
+
+; Multi-use test where only zext has multiple uses - should still optimize
+define i32 @ceil_div_zext_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_zext_multi_use(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Multi-use with vector type
+define <2 x i32> @ceil_div_vec_multi_use(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_vec_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> [[X:%.*]], splat (i32 3)
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[X]], splat (i32 7)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[EXT:%.*]] = zext <2 x i1> [[CMP]] to <2 x i32>
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw <2 x i32> [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+  call void @use_vec(<2 x i32> %shr)
+  %and = and <2 x i32> %x, <i32 7, i32 7>
+  %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+  %ext = zext <2 x i1> %cmp to <2 x i32>
+  %r = add <2 x i32> %shr, %ext
+  ret <2 x i32> %r
+}
+
+declare void @use_i32(i32)
+declare void @use_vec(<2 x i32>)
 declare void @fake_func(i32)

From c564ebba22ae9af315e08789c628810a3bbcf3df Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 16 Jun 2025 18:09:02 +0100
Subject: [PATCH 0596/1322] [X86] combineEXTRACT_SUBVECTOR - move AVX1 ANDNP
 comment and fold back together. NFC.

These appear to have been split by a merge at some point.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 290fad07be4f..820b9c53a508 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59683,16 +59683,6 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget &Subtarget) {
-  // For AVX1 only, if we are extracting from a 256-bit and+not (which will
-  // eventually get combined/lowered into ANDNP) with a concatenated operand,
-  // split the 'and' into 128-bit ops to avoid the concatenate and extract.
-  // We let generic combining take over from there to simplify the
-  // insert/extract and 'not'.
-  // This pattern emerges during AVX1 legalization. We handle it before lowering
-  // to avoid complications like splitting constant vector loads.
-
-  // Capture the original wide type in the likely case that we need to bitcast
-  // back to this type.
   if (!N->getValueType(0).isSimple())
     return SDValue();
 
@@ -59708,8 +59698,14 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc DL(N);
 
-  if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
-      TLI.isTypeLegal(InVecVT) &&
+  // For AVX1 only, if we are extracting from a 256-bit and+not (which will
+  // eventually get combined/lowered into ANDNP) with a concatenated operand,
+  // split the 'and' into 128-bit ops to avoid the concatenate and extract.
+  // We let generic combining take over from there to simplify the
+  // insert/extract and 'not'.
+  // This pattern emerges during AVX1 legalization. We handle it before lowering
+  // to avoid complications like splitting constant vector loads.
+  if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
       InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
     auto isConcatenatedNot = [](SDValue V) {
       V = peekThroughBitcasts(V);

From cb355def9561e2d1d4b363f44dcedf5522f0f8a1 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles@arm.com>
Date: Tue, 17 Jun 2025 09:05:36 +0100
Subject: [PATCH 0597/1322] [Flang][OpenMP] Add Parsing support for Indirect
 Clause (#143505)

As part of OpenMP Version 5.1, support for the `indirect` clause was
added for the `declare target` directive. This clause should follow an
`enter` clause, and allows procedure calls to be done indirectly through
OpenMP.

This adds Parsing support for the clause, along with semantics checks.
Currently, lowering for the clause is not supported so a TODO message
will be outputted to the user. It also performs version checking as
`indirect` is only support in OpenMP 5.1 or greater.

See also: #110008
---
 flang/include/flang/Parser/dump-parse-tree.h  |  1 +
 flang/include/flang/Parser/parse-tree.h       |  6 +++
 flang/lib/Lower/OpenMP/Clauses.cpp            |  4 +-
 flang/lib/Parser/openmp-parsers.cpp           |  2 +
 flang/lib/Semantics/check-omp-structure.cpp   |  9 ++++
 .../Lower/OpenMP/Todo/omp-clause-indirect.f90 | 34 ++++++++++++
 .../OpenMP/declare-target-indirect-tree.f90   | 53 +++++++++++++++++++
 flang/test/Semantics/indirect01.f90           | 34 ++++++++++++
 flang/test/Semantics/indirect02.f90           | 36 +++++++++++++
 llvm/include/llvm/Frontend/OpenMP/ClauseT.h   |  2 +-
 llvm/include/llvm/Frontend/OpenMP/OMP.td      |  5 +-
 11 files changed, 181 insertions(+), 5 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90
 create mode 100644 flang/test/Parser/OpenMP/declare-target-indirect-tree.f90
 create mode 100644 flang/test/Semantics/indirect01.f90
 create mode 100644 flang/test/Semantics/indirect02.f90

diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index c6a5150a85a4..e3eed6aed807 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -565,6 +565,7 @@ public:
   NODE_ENUM(OmpDependenceType, Value)
   NODE(parser, OmpTaskDependenceType)
   NODE_ENUM(OmpTaskDependenceType, Value)
+  NODE(parser, OmpIndirectClause)
   NODE(parser, OmpIterationOffset)
   NODE(parser, OmpIteration)
   NODE(parser, OmpIterationVector)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 67405f88e09f..61f97b855b0e 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4300,6 +4300,12 @@ struct OmpHoldsClause {
   WRAPPER_CLASS_BOILERPLATE(OmpHoldsClause, common::Indirection<Expr>);
 };
 
+// Ref: [5.2: 209]
+struct OmpIndirectClause {
+  WRAPPER_CLASS_BOILERPLATE(
+      OmpIndirectClause, std::optional<ScalarLogicalExpr>);
+};
+
 // Ref: [5.2:72-73], in 4.5-5.1 it's scattered over individual directives
 // that allow the IF clause.
 //
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 4d0f5c3a127e..c0c57d1832d4 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -905,8 +905,8 @@ Inclusive make(const parser::OmpClause::Inclusive &inp,
 
 Indirect make(const parser::OmpClause::Indirect &inp,
               semantics::SemanticsContext &semaCtx) {
-  // inp -> empty
-  llvm_unreachable("Empty: indirect");
+  // inp.v.v -> std::optional<parser::ScalarLogicalExpr>
+  return Indirect{maybeApply(makeExprFn(semaCtx), inp.v.v)};
 }
 
 Init make(const parser::OmpClause::Init &inp,
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 9b112a213391..c55642d96950 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -1004,6 +1004,8 @@ TYPE_PARSER( //
     "IF" >> construct<OmpClause>(construct<OmpClause::If>(
                 parenthesized(Parser<OmpIfClause>{}))) ||
     "INBRANCH" >> construct<OmpClause>(construct<OmpClause::Inbranch>()) ||
+    "INDIRECT" >> construct<OmpClause>(construct<OmpClause::Indirect>(
+                      maybe(parenthesized(scalarLogicalExpr)))) ||
     "INIT" >> construct<OmpClause>(construct<OmpClause::Init>(
                   parenthesized(Parser<OmpInitClause>{}))) ||
     "INCLUSIVE" >> construct<OmpClause>(construct<OmpClause::Inclusive>(
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 58d28dce7094..83f4d1edf3c4 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1820,15 +1820,24 @@ void OmpStructureChecker::Leave(const parser::OmpDeclareTargetWithClause &x) {
     const parser::OmpClause *toClause = FindClause(llvm::omp::Clause::OMPC_to);
     const parser::OmpClause *linkClause =
         FindClause(llvm::omp::Clause::OMPC_link);
+    const parser::OmpClause *indirectClause =
+        FindClause(llvm::omp::Clause::OMPC_indirect);
     if (!enterClause && !toClause && !linkClause) {
       context_.Say(x.source,
           "If the DECLARE TARGET directive has a clause, it must contain at least one ENTER clause or LINK clause"_err_en_US);
     }
+    if (indirectClause && !enterClause) {
+      context_.Say(x.source,
+          "The INDIRECT clause cannot be used without the ENTER clause with the DECLARE TARGET directive."_err_en_US);
+    }
     unsigned version{context_.langOptions().OpenMPVersion};
     if (toClause && version >= 52) {
       context_.Warn(common::UsageWarning::OpenMPUsage, toClause->source,
           "The usage of TO clause on DECLARE TARGET directive has been deprecated. Use ENTER clause instead."_warn_en_US);
     }
+    if (indirectClause) {
+      CheckAllowedClause(llvm::omp::Clause::OMPC_indirect);
+    }
   }
 }
 
diff --git a/flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90 b/flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90
new file mode 100644
index 000000000000..d441cac47f5d
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90
@@ -0,0 +1,34 @@
+! This test checks the lowering of OpenMP Indirect Clause when used with the Declare Target directive
+
+! RUN: not flang -fc1 -emit-fir -fopenmp -fopenmp-version=52 %s 2>&1 | FileCheck %s
+
+module functions
+  implicit none
+
+  interface
+    function func() result(i)
+      character(1) :: i
+    end function
+  end interface
+
+contains
+  function func1() result(i)
+    !CHECK: not yet implemented: Unhandled clause INDIRECT in DECLARE TARGET construct
+    !$omp declare target enter(func1) indirect(.true.)
+    character(1) :: i
+    i = 'a'
+    return
+  end function
+end module
+
+program main
+  use functions
+  implicit none
+  procedure (func), pointer :: ptr1=>func1
+  character(1) :: val1
+
+  !$omp target map(from: val1)
+  val1 = ptr1()
+  !$omp end target
+
+end program
diff --git a/flang/test/Parser/OpenMP/declare-target-indirect-tree.f90 b/flang/test/Parser/OpenMP/declare-target-indirect-tree.f90
new file mode 100644
index 000000000000..df85942ec15a
--- /dev/null
+++ b/flang/test/Parser/OpenMP/declare-target-indirect-tree.f90
@@ -0,0 +1,53 @@
+! REQUIRES: openmp_runtime
+
+! RUN: %flang_fc1 %openmp_flags -fopenmp-version=52 -fdebug-dump-parse-tree %s | FileCheck %s
+! RUN: %flang_fc1 %openmp_flags -fdebug-unparse -fopenmp-version=52 %s | FileCheck %s --check-prefix="UNPARSE"
+
+module functions
+  implicit none
+
+  interface
+  function func() result(i)
+    character(1) :: i
+  end function
+  end interface
+
+contains
+  function func1() result(i)
+    !$omp declare target enter(func1) indirect(.true.)
+    !CHECK: | | | | | OmpDeclareTargetSpecifier -> OmpDeclareTargetWithClause -> OmpClauseList -> OmpClause -> Enter -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'func1'
+    !CHECK-NEXT: | | | | | OmpClause -> Indirect -> OmpIndirectClause -> Scalar -> Logical -> Expr = '.true._4'
+    !CHECK-NEXT: | | | | | | LiteralConstant -> LogicalLiteralConstant
+    !CHECK-NEXT: | | | | | | | bool = 'true'
+    character(1) :: i
+    i = 'a'
+    return
+  end function
+
+  function func2() result(i)
+    !$omp declare target enter(func2) indirect
+    !CHECK: | | | | | OmpDeclareTargetSpecifier -> OmpDeclareTargetWithClause -> OmpClauseList -> OmpClause -> Enter -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'func2'
+    !CHECK-NEXT: | | | | | OmpClause -> Indirect -> OmpIndirectClause ->
+    character(1) :: i
+    i = 'b'
+    return
+  end function
+end module
+
+program main
+  use functions
+  implicit none
+  procedure (func), pointer :: ptr1=>func1, ptr2=>func2
+  character(1) :: val1, val2
+
+  !$omp target map(from: val1)
+  val1 = ptr1()
+  !$omp end target
+  !$omp target map(from: val2)
+  val2 = ptr2()
+  !$omp end target
+
+end program
+
+!UNPARSE: !$OMP DECLARE TARGET  ENTER(func1) INDIRECT(.true._4)
+!UNPARSE: !$OMP DECLARE TARGET  ENTER(func2) INDIRECT()
diff --git a/flang/test/Semantics/indirect01.f90 b/flang/test/Semantics/indirect01.f90
new file mode 100644
index 000000000000..59850662275d
--- /dev/null
+++ b/flang/test/Semantics/indirect01.f90
@@ -0,0 +1,34 @@
+! This test checks the lowering of OpenMP Indirect Clause when used with the Declare Target directive
+
+! RUN: not flang -fopenmp -fopenmp-version=52 %s 2>&1 | FileCheck %s
+
+module functions
+  implicit none
+
+  interface
+    function func() result(i)
+      character(1) :: i
+    end function
+  end interface
+
+contains
+  function func1() result(i)
+    !CHECK: The INDIRECT clause cannot be used without the ENTER clause with the DECLARE TARGET directive.
+    !$omp declare target indirect(.true.)
+    character(1) :: i
+    i = 'a'
+    return
+  end function
+end module
+
+program main
+  use functions
+  implicit none
+  procedure (func), pointer :: ptr1=>func1
+  character(1) :: val1
+
+  !$omp target map(from: val1)
+  val1 = ptr1()
+  !$omp end target
+
+end program
diff --git a/flang/test/Semantics/indirect02.f90 b/flang/test/Semantics/indirect02.f90
new file mode 100644
index 000000000000..273f8856626b
--- /dev/null
+++ b/flang/test/Semantics/indirect02.f90
@@ -0,0 +1,36 @@
+! This test checks the lowering of OpenMP Indirect Clause when used with the Declare Target directive
+
+! RUN: not flang -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s --check-prefix="CHECK-50"
+! RUN: not flang -fopenmp -fopenmp-version=52 %s 2>&1 | FileCheck %s --check-prefix="CHECK-52"
+
+module functions
+  implicit none
+
+  interface
+    function func() result(i)
+      character(1) :: i
+    end function
+  end interface
+
+contains
+  function func1() result(i)
+    !CHECK-50: INDIRECT clause is not allowed on directive DECLARE TARGET in OpenMP v5.0, try -fopenmp-version=51
+    !CHECK-52: not yet implemented: Unhandled clause INDIRECT in DECLARE TARGET construct
+    !$omp declare target enter(func1) indirect(.true.)
+    character(1) :: i
+    i = 'a'
+    return
+  end function
+end module
+
+program main
+  use functions
+  implicit none
+  procedure (func), pointer :: ptr1=>func1
+  character(1) :: val1
+
+  !$omp target map(from: val1)
+  val1 = ptr1()
+  !$omp end target
+
+end program
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index e0714e812e5c..de888ff86fe9 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -701,7 +701,7 @@ template <typename T, typename I, typename E> //
 struct IndirectT {
   using InvokedByFptr = E;
   using WrapperTrait = std::true_type;
-  InvokedByFptr v;
+  OPT(InvokedByFptr) v;
 };
 
 // V5.2: [14.1.2] `init` clause
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 027692275b63..a87111cb5a11 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -246,6 +246,7 @@ def OMPC_Inclusive : Clause<[Spelling<"inclusive">]> {
   let flangClass = "OmpObjectList";
 }
 def OMPC_Indirect : Clause<[Spelling<"indirect">]> {
+  let flangClass = "OmpIndirectClause";
 }
 def OMPC_Init : Clause<[Spelling<"init">]> {
   let clangClass = "OMPInitClause";
@@ -646,7 +647,7 @@ def OMP_EndAssumes : Directive<[Spelling<"end assumes">]> {
 def OMP_BeginDeclareTarget : Directive<[Spelling<"begin declare target">]> {
   let allowedClauses = [
     VersionedClause<OMPC_DeviceType>,
-    VersionedClause<OMPC_Indirect>,
+    VersionedClause<OMPC_Indirect, 51>,
     VersionedClause<OMPC_Link>,
     VersionedClause<OMPC_To>,
   ];
@@ -724,7 +725,7 @@ def OMP_DeclareSimd : Directive<[Spelling<"declare simd">]> {
 def OMP_DeclareTarget : Directive<[Spelling<"declare target">]> {
   let allowedClauses = [
     VersionedClause<OMPC_Enter, 52>,
-    VersionedClause<OMPC_Indirect>,
+    VersionedClause<OMPC_Indirect, 51>,
     VersionedClause<OMPC_Link>,
     VersionedClause<OMPC_To>,
   ];

From 90905a638e483dd9040c153785148fcea7c3e412 Mon Sep 17 00:00:00 2001
From: Dhruv Srivastava <dhruv.srivastava@ibm.com>
Date: Tue, 17 Jun 2025 13:49:48 +0530
Subject: [PATCH 0598/1322] [lldb][AIX] Added XCOFF ParseSymtab handling
 (#141577)

This PR is in reference to porting LLDB on AIX.

Link to discussions on llvm discourse and github:

1. https://discourse.llvm.org/t/port-lldb-to-ibm-aix/80640
2. https://github.com/llvm/llvm-project/issues/101657
The complete changes for porting are present in this draft PR:
https://github.com/llvm/llvm-project/pull/102601

**Description:**
Adding ParseSymtab logic after creating sections. It is able to handle
both 32 and 64 bit symbols,
without the need to add template logic.

This is an incremental PR on top of my previous couple of XCOFF support
commits.
---
 .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp      | 102 +++++++++++++-
 .../Shell/ObjectFile/XCOFF/symbol-info.yaml   | 121 +++++++++++++++++
 .../Shell/ObjectFile/XCOFF/symbol-info32.yaml | 124 ++++++++++++++++++
 3 files changed, 346 insertions(+), 1 deletion(-)
 create mode 100644 lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml
 create mode 100644 lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml

diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp
index 84d05e173f83..d2c46edaf28c 100644
--- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp
+++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp
@@ -188,7 +188,107 @@ AddressClass ObjectFileXCOFF::GetAddressClass(addr_t file_addr) {
   return AddressClass::eUnknown;
 }
 
-void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) {}
+static lldb::SymbolType MapSymbolType(llvm::object::SymbolRef::Type sym_type) {
+  switch (sym_type) {
+  case llvm::object::SymbolRef::ST_Function:
+    return lldb::eSymbolTypeCode;
+  case llvm::object::SymbolRef::ST_Data:
+    return lldb::eSymbolTypeData;
+  case llvm::object::SymbolRef::ST_File:
+    return lldb::eSymbolTypeSourceFile;
+  default:
+    return lldb::eSymbolTypeInvalid;
+  }
+}
+
+void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) {
+  Log *log = GetLog(LLDBLog::Object);
+  SectionList *sectionList = GetSectionList();
+
+  for (const auto &symbol_ref : m_binary->symbols()) {
+    llvm::object::XCOFFSymbolRef xcoff_sym_ref(symbol_ref);
+
+    llvm::Expected<llvm::StringRef> name_or_err = xcoff_sym_ref.getName();
+    if (!name_or_err) {
+      LLDB_LOG_ERROR(log, name_or_err.takeError(),
+                     "Unable to extract name from the xcoff symbol ref object");
+      continue;
+    }
+
+    llvm::StringRef symbolName = name_or_err.get();
+    // Remove the . prefix added during compilation. This prefix is usually
+    // added to differentiate between reference to the code and function
+    // descriptor. For instance, Adding .func will only allow user to put bp on
+    // .func, which is not known to the user, instead of func.
+    llvm::StringRef name_no_dot =
+        symbolName.starts_with(".") ? symbolName.drop_front() : symbolName;
+    auto storageClass = xcoff_sym_ref.getStorageClass();
+    // C_HIDEXT symbols are not needed to be exposed, with the exception of TOC
+    // which is responsible for storing references to global data
+    if (storageClass == XCOFF::C_HIDEXT && symbolName != "TOC") {
+
+      // Zero or muliple aux entries may suggest ambiguous data
+      if (xcoff_sym_ref.getNumberOfAuxEntries() != 1)
+        continue;
+
+      auto aux_csect_or_err = xcoff_sym_ref.getXCOFFCsectAuxRef();
+      if (!aux_csect_or_err) {
+        LLDB_LOG_ERROR(log, aux_csect_or_err.takeError(),
+                       "Unable to access xcoff csect aux ref object");
+        continue;
+      }
+
+      const llvm::object::XCOFFCsectAuxRef csect_aux = aux_csect_or_err.get();
+
+      // Only add hidden ext entries which come under Program Code, skip others
+      // as they are not useful as debugging data.
+      if (csect_aux.getStorageMappingClass() != XCOFF::XMC_PR)
+        continue;
+
+      // This does not apply to 32-bit,
+      // Only add csect symbols identified by the aux entry, as they are
+      // needed to reference section information. Skip others
+      if (m_binary->is64Bit())
+        if (csect_aux.getAuxType64() != XCOFF::AUX_CSECT)
+          continue;
+    }
+
+    Symbol symbol;
+    symbol.GetMangled().SetValue(ConstString(name_no_dot));
+
+    int16_t sectionNumber = xcoff_sym_ref.getSectionNumber();
+    // Note that XCOFF section headers are numbered from 1 and not 0.
+    size_t sectionIndex = static_cast<size_t>(sectionNumber - 1);
+    if (sectionNumber > 0) {
+      if (sectionIndex < sectionList->GetSize()) {
+
+        lldb::SectionSP section_sp =
+            sectionList->GetSectionAtIndex(sectionIndex);
+        if (!section_sp || section_sp->GetFileAddress() == LLDB_INVALID_ADDRESS)
+          continue;
+
+        lldb::addr_t file_addr = section_sp->GetFileAddress();
+        lldb::addr_t symbolValue = xcoff_sym_ref.getValue();
+        if (symbolValue < file_addr)
+          continue;
+
+        symbol.GetAddressRef() = Address(section_sp, symbolValue - file_addr);
+      }
+    }
+
+    Expected<llvm::object::SymbolRef::Type> sym_type_or_err =
+        symbol_ref.getType();
+    if (!sym_type_or_err) {
+      LLDB_LOG_ERROR(log, sym_type_or_err.takeError(),
+                     "Unable to access xcoff symbol type");
+      continue;
+    }
+
+    symbol.SetType(MapSymbolType(sym_type_or_err.get()));
+
+    lldb_symtab.AddSymbol(symbol);
+  }
+}
 
 bool ObjectFileXCOFF::IsStripped() { return false; }
 
diff --git a/lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml
new file mode 100644
index 000000000000..6b1a40a28344
--- /dev/null
+++ b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml
@@ -0,0 +1,121 @@
+# RUN: yaml2obj %s -o %t
+# RUN: %lldb %t -o "image dump symtab" -o exit | FileCheck %s
+# CHECK: Index   UserID     DSX Type            File Address/Value Load Address       Size               Flags      Name
+# CHECK: [    0] 4294967295     Invalid         0xffffffffffffffff                    0x0000000000000000 0x00000000 errno
+# CHECK: [    1] 4294967295     Code            0x0000000100000500                    0x0000000000000398 0x00000000 __threads_init
+# CHECK: [    2] 4294967295     Data            0x0000000110000a70                    0x0000000000000060 0x00000000 __threads_init
+# CHECK: [    3] 4294967295     Invalid         0x0000000110000ad0                    0x00000000000000b0 0x00000000 TOC
+# CHECK: [    4] 4294967295     Invalid         0x0000000100000898                    0x00000000100001d8 0x00000000 text
+# CHECK: [    5] 4294967295     Code            0x0000000100000898                    0x00000000100001d8 0x00000000 main
+
+--- !XCOFF
+FileHeader:
+  MagicNumber:     0x1F7
+  NumberOfSections: 2
+  CreationTime:    000000000
+  Flags:           0x0002
+Sections:
+  - Name:            .text
+    Address:         0x100000438
+    Size:            0x38
+    FileOffsetToData: 0x0
+    FileOffsetToLineNumbers: 0x0
+    NumberOfLineNumbers: 0x0
+    Flags:           [ STYP_TEXT ]
+    SectionData:     E8C20000
+  - Name:            .data
+    Address:         0x1100008D2
+    Size:            0x2AE
+    FileOffsetToData: 0x8D2
+    FileOffsetToRelocations: 0x132E
+    FileOffsetToLineNumbers: 0x0
+    NumberOfRelocations: 0x22
+    NumberOfLineNumbers: 0x0
+    Flags:           [ STYP_DATA ]
+    SectionData:     '' 
+Symbols:
+  - Name:            errno
+    Value:           0x0
+    Section:         N_UNDEF
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 0
+        StorageMappingClass: XMC_RW
+        SectionOrLengthLo: 0
+        SectionOrLengthHi: 0
+  - Name:            .__threads_init
+    Value:           0x100000500
+    Section:         .text
+    Type:            0x20
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 2
+        StorageMappingClass: XMC_PR
+        SectionOrLengthLo: 80
+        SectionOrLengthHi: 0
+  - Name:            __threads_init
+    Value:           0x110000A70
+    Section:         .data
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 25
+        StorageMappingClass: XMC_DS
+        SectionOrLengthLo: 24
+        SectionOrLengthHi: 0
+  - Name:            TOC
+    Value:           0x110000AD0
+    Section:         .data
+    Type:            0x0
+    StorageClass:    C_HIDEXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 25
+        StorageMappingClass: XMC_TC0
+        SectionOrLengthLo: 0
+        SectionOrLengthHi: 0
+  - Name:            .text
+    Value:           0x100000898
+    Section:         .text
+    Type:            0x0
+    StorageClass:    C_HIDEXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 17
+        StorageMappingClass: XMC_PR
+        SectionOrLengthLo: 58
+        SectionOrLengthHi: 0
+  - Name:            .main
+    Value:           0x100000898
+    Section:         .text
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 2
+        StorageMappingClass: XMC_PR
+        SectionOrLengthLo: 135
+        SectionOrLengthHi: 0
+...
diff --git a/lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml
new file mode 100644
index 000000000000..59c018ba0e42
--- /dev/null
+++ b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml
@@ -0,0 +1,124 @@
+# RUN: yaml2obj %s -o %t
+# RUN: %lldb %t -o "image dump symtab" -o exit | FileCheck %s
+# CHECK: Index   UserID     DSX Type            File Address/Value Load Address       Size               Flags      Name
+# CHECK: [    0] 4294967295     Invalid         0xffffffffffffffff                    0x0000000000000000 0x00000000 errno
+# CHECK: [    1] 4294967295     Code            0x0000000010000320                    0x0000000000000420 0x00000000 __threads_init
+# CHECK: [    2] 4294967295     Data            0x0000000020000920                    0x000000000000003c 0x00000000 __threads_init
+# CHECK: [    3] 4294967295     Invalid         0x000000002000095c                    0x0000000000000060 0x00000000 TOC
+# CHECK: [    4] 4294967295     Invalid         0x0000000010000740                    0x000000000000003a 0x00000000 text
+# CHECK: [    5] 4294967295     Invalid         0x0000000010000740                    0x000000000000003a 0x00000000 main
+
+--- !XCOFF
+FileHeader:
+  MagicNumber:     0x1DF
+  NumberOfSections: 2
+  CreationTime:    000000000
+  Flags:           0x1002
+Sections:
+  - Name:            .text
+    Address:         0x10000268
+    Size:            0x512
+    FileOffsetToData: 0x268
+    FileOffsetToRelocations: 0xECC
+    FileOffsetToLineNumbers: 0x0
+    NumberOfRelocations: 0x24
+    NumberOfLineNumbers: 0x0
+    Flags:           [ STYP_TEXT ]
+    SectionData:     80C20000
+  - Name:            .data
+    Address:         0x2000077A
+    Size:            0x242
+    FileOffsetToData: 0x77A
+    FileOffsetToRelocations: 0x1034
+    FileOffsetToLineNumbers: 0x0
+    NumberOfRelocations: 0x25
+    NumberOfLineNumbers: 0x0
+    Flags:           [ STYP_DATA ]
+    SectionData:     ''
+Symbols:
+  - Name:            errno
+    Value:           0x0
+    Section:         N_UNDEF
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_RW
+        SectionOrLength: 0
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            .__threads_init
+    Value:           0x10000320
+    Section:         .text
+    Type:            0x20
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_PR
+        SectionOrLength: 84
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            __threads_init
+    Value:           0x20000920
+    Section:         .data
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_DS
+        SectionOrLength: 12
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            TOC
+    Value:           0x2000095C
+    Section:         .data
+    Type:            0x0
+    StorageClass:    C_HIDEXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_TC0
+        SectionOrLength: 0
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            .text
+    Value:           0x10000740
+    Section:         .text
+    Type:            0x0
+    StorageClass:    C_HIDEXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_PR
+        SectionOrLength: 58
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            .main
+    Value:           0x10000740
+    Section:         .text
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_PR
+        SectionOrLength: 137
+        StabInfoIndex:   0
+        StabSectNum:     0
+
+...

From 437945b28838c71fb32a76f6433cef8807967f71 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Tue, 17 Jun 2025 09:26:52 +0100
Subject: [PATCH 0599/1322] [AArch64][SVE] Move incorrectly placed assert
 (#144318)

This assert is only valid if FPAfterSVECalleeSaves is true, for the
default layout resolving CSR works correctly.
---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 2650c621e19f..7ffe779f2408 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2896,8 +2896,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
       isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize();
 
   if (isSVE) {
-    assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() &&
-           "Math isn't correct for CSRs with FPAfterSVECalleeSaves");
     StackOffset FPOffset =
         StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
     StackOffset SPOffset =
@@ -2905,6 +2903,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
         StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
                          ObjectOffset);
     if (FPAfterSVECalleeSaves) {
+      assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() &&
+             "Math isn't correct for CSRs with FPAfterSVECalleeSaves");
       FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize());
     }
     // Always use the FP for SVE spills if available and beneficial.

From 85b110e0419af4b1b9a238b6978029e20010e794 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Tue, 17 Jun 2025 09:30:35 +0100
Subject: [PATCH 0600/1322] [mlir][vector] Add documentation note on adding new
 ops (#144308)

This adds a note requesting that additions of new ops to the Vector
dialect go through an RFC process. The goal is to clarify expectations
for contributors.

Note: this documents an existing (though previously unwritten)
convention. See, e.g.:
* https://discourse.llvm.org/t/rfc-adding-vector-to-elements-op-to-the-vector-dialect
* https://discourse.llvm.org/t/rfc-improving-gather-codegen-for-vector-dialect
---
 mlir/docs/Dialects/Vector.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlir/docs/Dialects/Vector.md b/mlir/docs/Dialects/Vector.md
index ade0068c56fb..ebeb0a2de0ff 100644
--- a/mlir/docs/Dialects/Vector.md
+++ b/mlir/docs/Dialects/Vector.md
@@ -1,5 +1,8 @@
 # 'vector' Dialect
 
+**Please post an RFC on the [forum](https://llvm.discourse.group/c/mlir/31)
+before adding  any operation in this dialect.**
+
 [TOC]
 
 MLIR supports multi-dimensional `vector` types and custom operations on those

From e2551c14d0d9180ccaef9d33c524d83e7813a361 Mon Sep 17 00:00:00 2001
From: Arseniy Zaostrovnykh <necto.ne@gmail.com>
Date: Tue, 17 Jun 2025 10:31:38 +0200
Subject: [PATCH 0601/1322] [analyzer] Fix a false memory leak reports
 involving placement new (#144341)

Placement new does not allocate memory, so it should not be reported as
a memory leak. A recent MallocChecker refactor changed inlining of
placement-new calls with manual evaluation by MallocChecker.
https://github.com/llvm/llvm-project/commit/339282d49f5310a2837da45c0ccc19da15675554

This change avoids marking the value returned by placement new as
allocated and hence avoids the false leak reports.

Note that the there are two syntaxes to invoke placement new:
`new (p) int` and an explicit operator call `operator new(sizeof(int), p)`.
The first syntax was already properly handled by the engine.
This change corrects handling of the second syntax.

CPP-6375
---
 .../StaticAnalyzer/Checkers/MallocChecker.cpp | 22 +++++++++++
 .../test/Analysis/NewDelete-checker-test.cpp  | 38 ++++++++++++++++++-
 2 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
index fef33509c0b6..35e98a5e2719 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
@@ -1371,6 +1371,20 @@ void MallocChecker::checkIfFreeNameIndex(ProgramStateRef State,
   C.addTransition(State);
 }
 
+const Expr *getPlacementNewBufferArg(const CallExpr *CE,
+                                     const FunctionDecl *FD) {
+  // Checking for signature:
+  // void* operator new  ( std::size_t count, void* ptr );
+  // void* operator new[]( std::size_t count, void* ptr );
+  if (CE->getNumArgs() != 2 || (FD->getOverloadedOperator() != OO_New &&
+                                FD->getOverloadedOperator() != OO_Array_New))
+    return nullptr;
+  auto BuffType = FD->getParamDecl(1)->getType();
+  if (BuffType.isNull() || !BuffType->isVoidPointerType())
+    return nullptr;
+  return CE->getArg(1);
+}
+
 void MallocChecker::checkCXXNewOrCXXDelete(ProgramStateRef State,
                                            const CallEvent &Call,
                                            CheckerContext &C) const {
@@ -1386,6 +1400,14 @@ void MallocChecker::checkCXXNewOrCXXDelete(ProgramStateRef State,
   // processed by the checkPostStmt callbacks for CXXNewExpr and
   // CXXDeleteExpr.
   const FunctionDecl *FD = C.getCalleeDecl(CE);
+  if (const auto *BufArg = getPlacementNewBufferArg(CE, FD)) {
+    // Placement new does not allocate memory
+    auto RetVal = State->getSVal(BufArg, Call.getLocationContext());
+    State = State->BindExpr(CE, C.getLocationContext(), RetVal);
+    C.addTransition(State);
+    return;
+  }
+
   switch (FD->getOverloadedOperator()) {
   case OO_New:
     State = MallocMemAux(C, Call, CE->getArg(0), UndefinedVal(), State,
diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp
index 06754f669b1e..da0eef7c52bd 100644
--- a/clang/test/Analysis/NewDelete-checker-test.cpp
+++ b/clang/test/Analysis/NewDelete-checker-test.cpp
@@ -26,9 +26,10 @@
 // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
 //
 // RUN: %clang_analyze_cc1 -std=c++17 -fblocks -verify %s \
-// RUN:   -verify=expected,leak \
+// RUN:   -verify=expected,leak,inspection \
 // RUN:   -analyzer-checker=core \
-// RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
+// RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks \
+// RUN:   -analyzer-checker=debug.ExprInspection
 
 #include "Inputs/system-header-simulator-cxx.h"
 
@@ -63,6 +64,39 @@ void testGlobalNoThrowPlacementExprNewBeforeOverload() {
   int *p = new(std::nothrow) int;
 } // leak-warning{{Potential leak of memory pointed to by 'p'}}
 
+//----- Standard pointer placement operators
+void testGlobalPointerPlacementNew() {
+  int i;
+  void *p1 = operator new(0, &i); // no leak: placement new never allocates
+  void *p2 = operator new[](0, &i); // no leak
+  int *p3 = new(&i) int; // no leak
+  int *p4 = new(&i) int[0]; // no leak
+}
+
+template<typename T>
+void clang_analyzer_dump(T x);
+
+void testPlacementNewBufValue() {
+  int i = 10;
+  int *p = new(&i) int;
+  clang_analyzer_dump(p); // inspection-warning{{&i}}
+  clang_analyzer_dump(*p); // inspection-warning{{10}}
+}
+
+void testPlacementNewBufValueExplicitOp() {
+  int i = 10;
+  int *p = (int*)operator new(sizeof(int), &i);
+  clang_analyzer_dump(p); // inspection-warning{{&i}}
+  clang_analyzer_dump(*p); // inspection-warning{{10}}
+}
+
+void testPlacementArrNewBufValueExplicitArrOp() {
+  int i = 10;
+  int *p = (int*)operator new[](sizeof(int), &i);
+  clang_analyzer_dump(p); // inspection-warning{{&i}}
+  clang_analyzer_dump(*p); // inspection-warning{{10}}
+}
+
 //----- Other cases
 void testNewMemoryIsInHeap() {
   int *p = new int;

From 308b97a5d48583680f56b888165295c62744b9e5 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 10:33:04 +0200
Subject: [PATCH 0602/1322] [LICM] Regenerate test checks (NFC)

---
 llvm/test/Transforms/LICM/funclet.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/LICM/funclet.ll b/llvm/test/Transforms/LICM/funclet.ll
index cacb0c90d370..1cdd12ddc98e 100644
--- a/llvm/test/Transforms/LICM/funclet.ll
+++ b/llvm/test/Transforms/LICM/funclet.ll
@@ -14,7 +14,7 @@ define void @test1(ptr %s, i1 %b) personality ptr @__CxxFrameHandler3 {
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[TRY_CONT_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    invoke void @may_throw()
-; CHECK-NEXT:    to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK-NEXT:            to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       catch.dispatch:
 ; CHECK-NEXT:    [[DOTLCSSA1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY]] ]
 ; CHECK-NEXT:    [[CS:%.*]] = catchswitch within none [label %catch] unwind to caller
@@ -59,7 +59,7 @@ define void @test2(ptr %s, i1 %b) personality ptr @__CxxFrameHandler3 {
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[TRY_CONT:%.*]], label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    invoke void @may_throw()
-; CHECK-NEXT:    to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK-NEXT:            to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       catch.dispatch:
 ; CHECK-NEXT:    [[CP:%.*]] = cleanuppad within none []
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @pure_computation() [ "funclet"(token [[CP]]) ]
@@ -114,10 +114,10 @@ define void @test3(i1 %a, i1 %b, i1 %c) personality ptr @__CxxFrameHandler3 {
 ; CHECK-NEXT:    [[CS]] = catchswitch within none [label %catch.object.Throwable] unwind to caller
 ; CHECK:       forbody:
 ; CHECK-NEXT:    invoke void @may_throw()
-; CHECK-NEXT:    to label [[POSTINVOKE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK-NEXT:            to label [[POSTINVOKE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       else:
 ; CHECK-NEXT:    invoke void @may_throw()
-; CHECK-NEXT:    to label [[FORCOND_BACKEDGE]] unwind label [[CATCH_DISPATCH]]
+; CHECK-NEXT:            to label [[FORCOND_BACKEDGE]] unwind label [[CATCH_DISPATCH]]
 ;
 entry:
   %.frame = alloca i8, align 4

From 2c90ebf3a79e25db3e6bcd9b3a66590b5996de4d Mon Sep 17 00:00:00 2001
From: Abid Qadeer <haqadeer@amd.com>
Date: Tue, 17 Jun 2025 09:34:47 +0100
Subject: [PATCH 0603/1322] [OMPIRBuilder][debug] Don't drop debug info for
 loop constructs. (#144393)

In OMPIRBuilder, we have many cases where we don't handle the debug
location correctly while changing the location or insertion point. This
is one of those cases.

Please see the following test program.
```
program main
  implicit none
  integer i, j
  integer array(16384)

!$omp target teams distribute
  DO i=1,16384
    !$omp parallel do
      DO j=1,16384
        array(j) = i
      ENDDO
    !$omp end parallel do
  ENDDO
!$omp end target teams distribute

print *, array
end program main
```

When tried to compile with the follownig command
`flang -g -O2 -fopenmp  test.f90 -o test  --offload-arch=gfx90a`

will fail in the verification with the following errors: `!dbg
attachment points at wrong subprogram for function`

This happens because we were dropping the debug location in the
createCanonicalLoop and the call to the functions like
`__kmpc_distribute_static_4u` get generated without a debug location.
When it gets inlined, the locations inside it are not adjusted as the
call instruction does not have the debug locations
(`llvm/lib/Transforms/Utils/InlineFunction.cpp:fixupLineNumbers`). Later
Verifier finds that the caller have instructions with debug locations
that point to another function and fails.

The fix is simple to not drop the debug location.
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  6 +-
 .../LLVMIR/omptarget-debug-loop-loc.mlir      | 66 +++++++++++++++++++
 2 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index cf17a84242c7..7cbbbff511c8 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4184,7 +4184,11 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
     Value *IndVar = Builder.CreateAdd(Span, Start);
     return BodyGenCB(Builder.saveIP(), IndVar);
   };
-  LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
+  LocationDescription LoopLoc =
+      ComputeIP.isSet()
+          ? Loc
+          : LocationDescription(Builder.saveIP(),
+                                Builder.getCurrentDebugLocation());
   return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
 }
 
diff --git a/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir b/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir
new file mode 100644
index 000000000000..a755cef98d7c
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir
@@ -0,0 +1,66 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
+  omp.private {type = private} @_QFEj_private_i32 : i32 loc(#loc1)
+  omp.private {type = private} @_QFEi_private_i32 : i32 loc(#loc1)
+  llvm.func @test() {
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %3 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr<5> loc(#loc4)
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr loc(#loc4)
+    %6 = llvm.mlir.constant(1 : i64) : i64
+    %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5> loc(#loc4)
+    %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr
+    %9 = llvm.mlir.constant(16383 : index) : i64
+    %10 = llvm.mlir.constant(0 : index) : i64
+    %11 = llvm.mlir.constant(1 : index) : i64
+    %12 = llvm.mlir.constant(16384 : i32) : i32
+    %14 = llvm.mlir.addressof @_QFEarray : !llvm.ptr
+    %18 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc3)
+    %20 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "j"} loc(#loc3)
+    %22 = omp.map.bounds lower_bound(%10 : i64) upper_bound(%9 : i64) extent(%9 : i64) stride(%11 : i64) start_idx(%11 : i64) loc(#loc3)
+    %23 = omp.map.info var_ptr(%14 : !llvm.ptr, !llvm.array<16384 x i32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%22) -> !llvm.ptr {name = "array"} loc(#loc3)
+    %24 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc3)
+    omp.target map_entries(%18 -> %arg0, %20 -> %arg2, %23 -> %arg4, %24 -> %arg5 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+      %25 = llvm.mlir.constant(1 : i32) : i32
+      %27 = llvm.mlir.constant(16384 : i32) : i32
+      omp.teams {
+        omp.distribute private(@_QFEi_private_i32 %arg5 -> %arg6 : !llvm.ptr) {
+          omp.loop_nest (%arg7) : i32 = (%25) to (%27) inclusive step (%25) {
+            omp.parallel {
+              omp.wsloop private(@_QFEj_private_i32 %arg2 -> %arg8 : !llvm.ptr) {
+                omp.loop_nest (%arg9) : i32 = (%25) to (%27) inclusive step (%25) {
+                  llvm.store %arg9, %arg8 : i32, !llvm.ptr loc(#loc9)
+                  omp.yield
+                } loc(#loc9)
+              } loc(#loc9)
+              omp.terminator loc(#loc9)
+            } loc(#loc9)
+            omp.yield loc(#loc9)
+          } loc(#loc9)
+        } loc(#loc9)
+        omp.terminator loc(#loc9)
+      } loc(#loc9)
+      omp.terminator loc(#loc9)
+    } loc(#loc9)
+    llvm.return loc(#loc9)
+  } loc(#loc14)
+  llvm.mlir.global internal @_QFEarray() {addr_space = 0 : i32} : !llvm.array<16384 x i32> {
+    %0 = llvm.mlir.zero : !llvm.array<16384 x i32>
+    llvm.return %0 : !llvm.array<16384 x i32>
+  } loc(#loc2)
+}
+#di_file = #llvm.di_file<"test.f90" in "">
+#di_null_type = #llvm.di_null_type
+#loc1 = loc("test.f90":4:23)
+#loc2 = loc("test.f90":4:15)
+#loc3 = loc("test.f90":1:7)
+#loc4 = loc("test.f90":4:18)
+#loc9 = loc("test.f90":13:11)
+#di_compile_unit = #llvm.di_compile_unit<id = distinct[0]<>, sourceLanguage = DW_LANG_Fortran95, file = #di_file, producer = "flang", isOptimized = true, emissionKind = LineTablesOnly>
+#di_subroutine_type = #llvm.di_subroutine_type<callingConvention = DW_CC_program, types = #di_null_type>
+#di_subprogram = #llvm.di_subprogram<id = distinct[1]<>, compileUnit = #di_compile_unit, scope = #di_file, name = "main", file = #di_file, subprogramFlags = "Definition|Optimized|MainSubprogram", type = #di_subroutine_type>
+#loc14 = loc(fused<#di_subprogram>[#loc3])
+
+
+// CHECK: call void @__kmpc_distribute_static{{.*}}!dbg
+

From 0f8c72160ec001599ecb29f0fa182c5550f5dd0a Mon Sep 17 00:00:00 2001
From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com>
Date: Tue, 17 Jun 2025 09:45:18 +0100
Subject: [PATCH 0604/1322] [C++20][Modules] Disable preferred_name when
 writing a C++20 header unit (#144377)

https://reviews.llvm.org/D130331 added workaround for named modules
only. But the same issue happens for headees units. Link issue #56490
---
 clang/include/clang/Serialization/ASTWriter.h |  4 ++
 clang/lib/Serialization/ASTWriter.cpp         |  5 +-
 .../Modules/preferred_name_header_unit.cpp    | 64 +++++++++++++++++++
 3 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Modules/preferred_name_header_unit.cpp

diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index cf4ae610ea51..0f49646f3f02 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -899,6 +899,10 @@ public:
     return WritingModule && WritingModule->isNamedModule();
   }
 
+  bool isWritingStdCXXHeaderUnit() const {
+    return WritingModule && WritingModule->isHeaderUnit();
+  }
+
   bool isGeneratingReducedBMI() const { return GeneratingReducedBMI; }
 
   bool getDoneWritingDeclsAndTypes() const { return DoneWritingDeclsAndTypes; }
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index ab1b5b333e06..be22ee522191 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5167,8 +5167,9 @@ void ASTRecordWriter::AddAttr(const Attr *A) {
   // FIXME: Clang can't handle the serialization/deserialization of
   // preferred_name properly now. See
   // https://github.com/llvm/llvm-project/issues/56490 for example.
-  if (!A || (isa<PreferredNameAttr>(A) &&
-             Writer->isWritingStdCXXNamedModules()))
+  if (!A ||
+      (isa<PreferredNameAttr>(A) && (Writer->isWritingStdCXXNamedModules() ||
+                                     Writer->isWritingStdCXXHeaderUnit())))
     return Record.push_back(0);
 
   Record.push_back(A->getKind() + 1); // FIXME: stable encoding, target attrs
diff --git a/clang/test/Modules/preferred_name_header_unit.cpp b/clang/test/Modules/preferred_name_header_unit.cpp
new file mode 100644
index 000000000000..b1f1e3579f31
--- /dev/null
+++ b/clang/test/Modules/preferred_name_header_unit.cpp
@@ -0,0 +1,64 @@
+// RUN: rm -fR %t
+// RUN: split-file %s %t
+// RUN: cd %t
+// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-name=h1.h -emit-header-unit -xc++-user-header h1.h -o h1.pcm
+// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-map-file=module.modulemap -fmodule-file=h1.h=h1.pcm main.cpp -o main.o
+
+//--- module.modulemap
+module "h1.h" {
+  header "h1.h"
+  export *
+}
+
+//--- h0.h
+// expected-no-diagnostics
+#pragma once
+namespace std {
+
+template <class _CharT, class = _CharT, class = _CharT> class basic_string;
+
+namespace pmr {
+using string = basic_string<char>;
+}
+
+template <class, class, class>
+class __attribute__((__preferred_name__(pmr::string))) basic_string;
+
+template <class> class basic_string_view {};
+
+template <class _CharT, class _Traits, class _Allocator> class basic_string {
+  typedef _CharT value_type;
+  typedef _Allocator allocator_type;
+  struct __rep;
+public:
+  template <class _Tp>
+  basic_string(_Tp) {}
+  basic_string operator+=(value_type);
+};
+
+namespace filesystem {
+class path {
+  typedef char value_type;
+  value_type preferred_separator;
+  typedef basic_string<value_type> string_type;
+  typedef basic_string_view<value_type> __string_view;
+  template <class _Source> void append(_Source) {
+    __pn_ += preferred_separator;
+  }
+  void __root_directory() { append(string_type(__string_view{})); }
+  string_type __pn_;
+};
+} // namespace filesystem
+} // namespace std
+
+//--- h1.h
+// expected-no-diagnostics
+#pragma once
+
+#include "h0.h"
+
+//--- main.cpp
+// expected-no-diagnostics
+#include "h0.h"
+
+import "h1.h";

From 26d082d330e4d8d1fc3194b4b87ede9332a297f5 Mon Sep 17 00:00:00 2001
From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com>
Date: Tue, 17 Jun 2025 09:47:15 +0100
Subject: [PATCH 0605/1322] [clang-tidy][performance-unnecessary-value-param]
 Avoid in coroutines (#140912)

Summary:
Replacing by-value parameters with passing by-reference is not safe for
coroutines because the caller may be executed in parallel with the
callee, which increases the chances of resulting in dangling references
and hard-to-find crashes. See for the reference
[cppcoreguidelines-avoid-reference-coroutine-parameters](https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-reference-coroutine-parameters.html).

Test Plan: check-clang-tools
---
 .../UnnecessaryValueParamCheck.cpp            | 18 +++--
 .../performance/UnnecessaryValueParamCheck.h  |  1 +
 clang-tools-extra/docs/ReleaseNotes.rst       |  2 +
 .../performance/unnecessary-value-param.rst   |  9 ++-
 .../unnecessary-value-param-coroutine.cpp     | 65 +++++++++++++++++++
 5 files changed, 87 insertions(+), 8 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp

diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
index a877f9a7ee91..d89c3a69fc84 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
@@ -50,7 +50,8 @@ UnnecessaryValueParamCheck::UnnecessaryValueParamCheck(
                                         utils::IncludeSorter::IS_LLVM),
                areDiagsSelfContained()),
       AllowedTypes(
-          utils::options::parseStringList(Options.get("AllowedTypes", ""))) {}
+          utils::options::parseStringList(Options.get("AllowedTypes", ""))),
+      IgnoreCoroutines(Options.get("IgnoreCoroutines", true)) {}
 
 void UnnecessaryValueParamCheck::registerMatchers(MatchFinder *Finder) {
   const auto ExpensiveValueParamDecl = parmVarDecl(
@@ -61,12 +62,14 @@ void UnnecessaryValueParamCheck::registerMatchers(MatchFinder *Finder) {
                            matchers::matchesAnyListedName(AllowedTypes))))))),
       decl().bind("param"));
   Finder->addMatcher(
-      traverse(
-          TK_AsIs,
-          functionDecl(hasBody(stmt()), isDefinition(), unless(isImplicit()),
-                       unless(cxxMethodDecl(anyOf(isOverride(), isFinal()))),
-                       has(typeLoc(forEach(ExpensiveValueParamDecl))),
-                       decl().bind("functionDecl"))),
+      traverse(TK_AsIs,
+               functionDecl(
+                   hasBody(IgnoreCoroutines ? stmt(unless(coroutineBodyStmt()))
+                                            : stmt()),
+                   isDefinition(), unless(isImplicit()),
+                   unless(cxxMethodDecl(anyOf(isOverride(), isFinal()))),
+                   has(typeLoc(forEach(ExpensiveValueParamDecl))),
+                   decl().bind("functionDecl"))),
       this);
 }
 
@@ -123,6 +126,7 @@ void UnnecessaryValueParamCheck::storeOptions(
   Options.store(Opts, "IncludeStyle", Inserter.getStyle());
   Options.store(Opts, "AllowedTypes",
                 utils::options::serializeStringList(AllowedTypes));
+  Options.store(Opts, "IgnoreCoroutines", IgnoreCoroutines);
 }
 
 void UnnecessaryValueParamCheck::onEndOfTranslationUnit() {
diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
index 8bfd814d1635..b52043416e76 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
@@ -46,6 +46,7 @@ private:
   ExprMutationAnalyzer::Memoized MutationAnalyzerCache;
   utils::IncludeInserter Inserter;
   const std::vector<StringRef> AllowedTypes;
+  bool IgnoreCoroutines;
 };
 
 } // namespace clang::tidy::performance
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 19ccd1790e75..3c1ca2f92904 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -265,6 +265,8 @@ Changes in existing checks
   <clang-tidy/checks/performance/unnecessary-value-param>` check performance by
   tolerating fix-it breaking compilation when functions is used as pointers
   to avoid matching usage of functions within the current compilation unit.
+  Added an option `IgnoreCoroutines` with the default value `true` to
+  suppress this check for coroutines where passing by reference may be unsafe.
 
 - Improved :doc:`readability-convert-member-functions-to-static
   <clang-tidy/checks/readability/convert-member-functions-to-static>` check by
diff --git a/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst b/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst
index dc86530b95f1..cd25d7d94d99 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst
@@ -56,7 +56,7 @@ Will become:
 
 Because the fix-it needs to change the signature of the function, it may break
 builds if the function is used in multiple translation units or some codes
-depends on funcion signatures.
+depends on function signatures.
 
 Options
 -------
@@ -74,3 +74,10 @@ Options
    default is empty. If a name in the list contains the sequence `::`, it is
    matched against the qualified type name (i.e. ``namespace::Type``),
    otherwise it is matched against only the type name (i.e. ``Type``).
+
+.. option:: IgnoreCoroutines
+
+   A boolean specifying whether the check should suggest passing parameters by
+   reference in coroutines. Passing parameters by reference in coroutines may
+   not be safe, please see :doc:`cppcoreguidelines-avoid-reference-coroutine-parameters <../cppcoreguidelines/avoid-reference-coroutine-parameters>`
+   for more information. Default is `true`.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp
new file mode 100644
index 000000000000..0a84dc467647
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp
@@ -0,0 +1,65 @@
+// RUN: %check_clang_tidy -std=c++20-or-later %s performance-unnecessary-value-param %t -- -fix-errors
+// RUN: %check_clang_tidy -std=c++20-or-later %s performance-unnecessary-value-param %t -- \
+// RUN:   -config='{CheckOptions: {performance-unnecessary-value-param.IgnoreCoroutines: true}}' -fix-errors
+// RUN: %check_clang_tidy -check-suffix=ALLOWED -std=c++20-or-later %s performance-unnecessary-value-param %t -- \
+// RUN:   -config='{CheckOptions: {performance-unnecessary-value-param.IgnoreCoroutines: false}}' -fix-errors
+
+namespace std {
+
+template <class Ret, typename... T> struct coroutine_traits {
+  using promise_type = typename Ret::promise_type;
+};
+
+template <class Promise = void> struct coroutine_handle {
+  static coroutine_handle from_address(void *) noexcept;
+  static coroutine_handle from_promise(Promise &promise);
+  constexpr void *address() const noexcept;
+};
+
+template <> struct coroutine_handle<void> {
+  template <class PromiseType>
+  coroutine_handle(coroutine_handle<PromiseType>) noexcept;
+  static coroutine_handle from_address(void *);
+  constexpr void *address() const noexcept;
+};
+
+struct suspend_always {
+  bool await_ready() noexcept { return false; }
+  void await_suspend(coroutine_handle<>) noexcept {}
+  void await_resume() noexcept {}
+};
+
+struct suspend_never {
+  bool await_ready() noexcept { return true; }
+  void await_suspend(coroutine_handle<>) noexcept {}
+  void await_resume() noexcept {}
+};
+
+} // namespace std
+
+struct ReturnObject {
+    struct promise_type {
+        ReturnObject get_return_object() { return {}; }
+        ReturnObject return_void() { return {}; }
+        std::suspend_always initial_suspend() { return {}; }
+        std::suspend_always final_suspend() noexcept { return {}; }
+        void unhandled_exception() {}
+        std::suspend_always yield_value(int value) { return {}; }
+    };
+};
+
+struct A {
+  A(const A&);
+};
+
+ReturnObject foo_coroutine(const A a) {
+// CHECK-MESSAGES-ALLOWED: [[@LINE-1]]:36: warning: the const qualified parameter 'a'
+// CHECK-FIXES: ReturnObject foo_coroutine(const A a) {
+  co_return;
+}
+
+ReturnObject foo_not_coroutine(const A a) {
+// CHECK-MESSAGES: [[@LINE-1]]:40: warning: the const qualified parameter 'a'
+// CHECK-MESSAGES-ALLOWED: [[@LINE-2]]:40: warning: the const qualified parameter 'a'
+  return ReturnObject{};
+}

From 5dc632dd56c61fb768424cc8027760490683d00d Mon Sep 17 00:00:00 2001
From: Rolf Morel <rolf.morel@intel.com>
Date: Tue, 17 Jun 2025 10:53:11 +0200
Subject: [PATCH 0606/1322] [MLIR][VSCode] update packages to fix
 CVE-2022-25883 and CVE-2022-3517 (#144479)

Fixes issue #140869.
---
 mlir/utils/vscode/package-lock.json | 127 +++++++++++++---------------
 mlir/utils/vscode/package.json      |   2 +
 2 files changed, 62 insertions(+), 67 deletions(-)

diff --git a/mlir/utils/vscode/package-lock.json b/mlir/utils/vscode/package-lock.json
index 1efd5779f5cb..28454c680177 100644
--- a/mlir/utils/vscode/package-lock.json
+++ b/mlir/utils/vscode/package-lock.json
@@ -10,6 +10,8 @@
       "dependencies": {
         "base64-js": "^1.5.1",
         "chokidar": "3.5.2",
+        "minimatch": "^3.0.5",
+        "semver": "^7.5.2",
         "vscode-languageclient": "^8.0.2-next.5"
       },
       "devDependencies": {
@@ -89,6 +91,16 @@
         "keytar": "^7.7.0"
       }
     },
+    "node_modules/@vscode/vsce/node_modules/semver": {
+      "version": "5.7.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+      "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
+      "dev": true,
+      "license": "ISC",
+      "bin": {
+        "semver": "bin/semver"
+      }
+    },
     "node_modules/@vscode/vsce/node_modules/xml2js": {
       "version": "0.5.0",
       "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
@@ -1195,9 +1207,10 @@
       }
     },
     "node_modules/minimatch": {
-      "version": "3.0.4",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
-      "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==",
+      "version": "3.0.5",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.5.tgz",
+      "integrity": "sha512-tUpxzX0VAzJHjLu0xUfFv1gwVp9ba3IOuRAVH2EGuRW8a5emA2FlACLqiT/lDVtS1W+TGNwqz3sWaNyLgDJWuw==",
+      "license": "ISC",
       "dependencies": {
         "brace-expansion": "^1.1.7"
       },
@@ -1262,22 +1275,6 @@
         "node": ">=10"
       }
     },
-    "node_modules/node-abi/node_modules/semver": {
-      "version": "7.3.7",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz",
-      "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==",
-      "dev": true,
-      "optional": true,
-      "dependencies": {
-        "lru-cache": "^6.0.0"
-      },
-      "bin": {
-        "semver": "bin/semver.js"
-      },
-      "engines": {
-        "node": ">=10"
-      }
-    },
     "node_modules/node-addon-api": {
       "version": "4.3.0",
       "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-4.3.0.tgz",
@@ -1365,6 +1362,16 @@
         "semver": "^5.1.0"
       }
     },
+    "node_modules/parse-semver/node_modules/semver": {
+      "version": "5.7.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+      "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
+      "dev": true,
+      "license": "ISC",
+      "bin": {
+        "semver": "bin/semver"
+      }
+    },
     "node_modules/parse5": {
       "version": "6.0.1",
       "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz",
@@ -1567,12 +1574,18 @@
       "dev": true
     },
     "node_modules/semver": {
-      "version": "5.7.1",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-      "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==",
-      "dev": true,
+      "version": "7.5.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.2.tgz",
+      "integrity": "sha512-SoftuTROv/cRjCze/scjGyiDtcUyxw1rgYQSZY7XTmtR5hX+dm76iDbTH8TkLPHCQmlbQVSSbNZCPM2hb0knnQ==",
+      "license": "ISC",
+      "dependencies": {
+        "lru-cache": "^6.0.0"
+      },
       "bin": {
-        "semver": "bin/semver"
+        "semver": "bin/semver.js"
+      },
+      "engines": {
+        "node": ">=10"
       }
     },
     "node_modules/set-blocking": {
@@ -1901,20 +1914,6 @@
         "vscode": "^1.67.0"
       }
     },
-    "node_modules/vscode-languageclient/node_modules/semver": {
-      "version": "7.3.7",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz",
-      "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==",
-      "dependencies": {
-        "lru-cache": "^6.0.0"
-      },
-      "bin": {
-        "semver": "bin/semver.js"
-      },
-      "engines": {
-        "node": ">=10"
-      }
-    },
     "node_modules/vscode-languageserver-protocol": {
       "version": "3.17.2-next.6",
       "resolved": "https://registry.npmjs.org/vscode-languageserver-protocol/-/vscode-languageserver-protocol-3.17.2-next.6.tgz",
@@ -2049,6 +2048,12 @@
         "yazl": "^2.2.2"
       },
       "dependencies": {
+        "semver": {
+          "version": "5.7.2",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+          "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
+          "dev": true
+        },
         "xml2js": {
           "version": "0.5.0",
           "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
@@ -2895,9 +2900,9 @@
       "optional": true
     },
     "minimatch": {
-      "version": "3.0.4",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
-      "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==",
+      "version": "3.0.5",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.5.tgz",
+      "integrity": "sha512-tUpxzX0VAzJHjLu0xUfFv1gwVp9ba3IOuRAVH2EGuRW8a5emA2FlACLqiT/lDVtS1W+TGNwqz3sWaNyLgDJWuw==",
       "requires": {
         "brace-expansion": "^1.1.7"
       }
@@ -2951,18 +2956,6 @@
       "optional": true,
       "requires": {
         "semver": "^7.3.5"
-      },
-      "dependencies": {
-        "semver": {
-          "version": "7.3.7",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz",
-          "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==",
-          "dev": true,
-          "optional": true,
-          "requires": {
-            "lru-cache": "^6.0.0"
-          }
-        }
       }
     },
     "node-addon-api": {
@@ -3035,6 +3028,14 @@
       "dev": true,
       "requires": {
         "semver": "^5.1.0"
+      },
+      "dependencies": {
+        "semver": {
+          "version": "5.7.2",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+          "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
+          "dev": true
+        }
       }
     },
     "parse5": {
@@ -3200,10 +3201,12 @@
       "dev": true
     },
     "semver": {
-      "version": "5.7.1",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-      "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==",
-      "dev": true
+      "version": "7.5.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.2.tgz",
+      "integrity": "sha512-SoftuTROv/cRjCze/scjGyiDtcUyxw1rgYQSZY7XTmtR5hX+dm76iDbTH8TkLPHCQmlbQVSSbNZCPM2hb0knnQ==",
+      "requires": {
+        "lru-cache": "^6.0.0"
+      }
     },
     "set-blocking": {
       "version": "2.0.0",
@@ -3454,16 +3457,6 @@
         "minimatch": "^3.0.4",
         "semver": "^7.3.5",
         "vscode-languageserver-protocol": "3.17.2-next.6"
-      },
-      "dependencies": {
-        "semver": {
-          "version": "7.3.7",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz",
-          "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==",
-          "requires": {
-            "lru-cache": "^6.0.0"
-          }
-        }
       }
     },
     "vscode-languageserver-protocol": {
diff --git a/mlir/utils/vscode/package.json b/mlir/utils/vscode/package.json
index 6d0f6f5c88ad..74f9ba37c7f1 100644
--- a/mlir/utils/vscode/package.json
+++ b/mlir/utils/vscode/package.json
@@ -39,6 +39,8 @@
   "dependencies": {
     "base64-js": "^1.5.1",
     "chokidar": "3.5.2",
+    "minimatch": "^3.0.5",
+    "semver": "^7.5.2",
     "vscode-languageclient": "^8.0.2-next.5"
   },
   "devDependencies": {

From 64bd4858dc2d64311622e793b66094b07ca7bdc5 Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Tue, 17 Jun 2025 10:09:10 +0100
Subject: [PATCH 0607/1322] Amend enviroment variables in bazel - change from
 #144391 (#144484)

---
 .../llvm/include/llvm/Config/llvm-config.h                    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
index 5dd53cffb7bd..8a9c74d67b12 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
@@ -132,10 +132,10 @@
 
 /* Define to 1 to enable expensive checks for debug location coverage checking,
    and to 0 otherwise. */
-#define LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0
+#define LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 0
 
 /* Define to 1 to enable expensive tracking of the origin of debug location
    coverage bugs, and to 0 otherwise. */
-#define LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 0
+#define LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN 0
 
 #endif

From e5ad7f4556ba4f31380153f70a8c6186926764e2 Mon Sep 17 00:00:00 2001
From: Jesse Huang <jesse.huang@sifive.com>
Date: Tue, 17 Jun 2025 17:21:24 +0800
Subject: [PATCH 0608/1322] [RISCV] Move RISCVIndirectBranchTracking before
 Branch Relaxation (#139993)

The `RISCVIndirectBranchTracking` pass inserts `lpad` instruction and
could change the basic block alignment, so this should not happen after
the branch relaxation as the adjusted offset is possible to exceed the
branch range.
---
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 5 ++++-
 llvm/test/CodeGen/RISCV/O0-pipeline.ll       | 2 +-
 llvm/test/CodeGen/RISCV/O3-pipeline.ll       | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 8a47453cedcd..0bea3bc432b6 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -570,6 +570,10 @@ void RISCVPassConfig::addPreEmitPass() {
     addPass(createMachineCopyPropagationPass(true));
   if (TM->getOptLevel() >= CodeGenOptLevel::Default)
     addPass(createRISCVLateBranchOptPass());
+  // The IndirectBranchTrackingPass inserts lpad and could have changed the
+  // basic block alignment. It must be done before Branch Relaxation to
+  // prevent the adjusted offset exceeding the branch range.
+  addPass(createRISCVIndirectBranchTrackingPass());
   addPass(&BranchRelaxationPassID);
   addPass(createRISCVMakeCompressibleOptPass());
 }
@@ -581,7 +585,6 @@ void RISCVPassConfig::addPreEmitPass2() {
     // ensuring return instruction is detected correctly.
     addPass(createRISCVPushPopOptimizationPass());
   }
-  addPass(createRISCVIndirectBranchTrackingPass());
   addPass(createRISCVExpandPseudoPass());
 
   // Schedule the expansion of AMOs at the last possible moment, avoiding the
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index 694662eab168..8714b286374a 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -62,6 +62,7 @@
 ; CHECK-NEXT:       Insert fentry calls
 ; CHECK-NEXT:       Insert XRay ops
 ; CHECK-NEXT:       Implement the 'patchable-function' attribute
+; CHECK-NEXT:       RISC-V Indirect Branch Tracking
 ; CHECK-NEXT:       Branch relaxation pass
 ; CHECK-NEXT:       RISC-V Make Compressible
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
@@ -73,7 +74,6 @@
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Stack Frame Layout Analysis
-; CHECK-NEXT:       RISC-V Indirect Branch Tracking
 ; CHECK-NEXT:       RISC-V pseudo instruction expansion pass
 ; CHECK-NEXT:       RISC-V atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Unpack machine instruction bundles
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 19de864422bc..c7f70a9d266c 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -195,6 +195,7 @@
 ; CHECK-NEXT:       Implement the 'patchable-function' attribute
 ; CHECK-NEXT:       Machine Copy Propagation Pass
 ; CHECK-NEXT:       RISC-V Late Branch Optimisation Pass
+; CHECK-NEXT:       RISC-V Indirect Branch Tracking
 ; CHECK-NEXT:       Branch relaxation pass
 ; CHECK-NEXT:       RISC-V Make Compressible
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
@@ -210,7 +211,6 @@
 ; CHECK-NEXT:       Stack Frame Layout Analysis
 ; CHECK-NEXT:       RISC-V Zcmp move merging pass
 ; CHECK-NEXT:       RISC-V Zcmp Push/Pop optimization pass
-; CHECK-NEXT:       RISC-V Indirect Branch Tracking
 ; CHECK-NEXT:       RISC-V pseudo instruction expansion pass
 ; CHECK-NEXT:       RISC-V atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Unpack machine instruction bundles

From 97e17e15957bf6f03923ca46301b32cad507f34b Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Tue, 17 Jun 2025 11:34:05 +0200
Subject: [PATCH 0609/1322] Revert "[flang] Enable delayed localization by
 default for `do concurrent` (#144074)" (#144476)

This reverts commit b5dbf8210a57b986b9802304745f4c5c108cf37b.

Reverting again due to gfortran failure:
https://lab.llvm.org/buildbot/#/builders/17/builds/8868
---
 flang/lib/Lower/Bridge.cpp                            | 6 +++++-
 flang/test/Lower/do_concurrent_delayed_locality.f90   | 2 +-
 flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +-
 flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +-
 flang/test/Lower/loops.f90                            | 2 +-
 flang/test/Lower/loops3.f90                           | 2 +-
 6 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 5ff8101dba09..64b16b3abe99 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2033,7 +2033,11 @@ private:
     fir::LocalitySpecifierOperands privateClauseOps;
     auto doConcurrentLoopOp =
         mlir::dyn_cast_if_present<fir::DoConcurrentLoopOp>(info.loopOp);
-    bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp;
+    // TODO Promote to using `enableDelayedPrivatization` (which is enabled by
+    // default unlike the staging flag) once the implementation of this is more
+    // complete.
+    bool useDelayedPriv =
+        enableDelayedPrivatizationStaging && doConcurrentLoopOp;
     llvm::SetVector<const Fortran::semantics::Symbol *> allPrivatizedSymbols;
     llvm::SmallSet<const Fortran::semantics::Symbol *, 16> mightHaveReadHostSym;
 
diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90
index 039b17808d19..6cae0eb46db1 100644
--- a/flang/test/Lower/do_concurrent_delayed_locality.f90
+++ b/flang/test/Lower/do_concurrent_delayed_locality.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
 
 subroutine do_concurrent_with_locality_specs
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
index 67f080eb2c1c..a3d0c34ed856 100644
--- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90
+++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
 
 subroutine local_assoc
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index 798cbb335c8c..d64321385474 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of DO CONCURRENT LOCAL() entities.
-! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s
 
 subroutine test_ptr(p)
   interface
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 64f14ff97227..60df27a591dc 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 34d7bcfb7d7a..84db1972cca1 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -1,5 +1,5 @@
 ! Test do concurrent reduction
-! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test

From dfd00edbabef8094bec663cca9314a950ec56e0d Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Tue, 17 Jun 2025 10:37:18 +0100
Subject: [PATCH 0610/1322] Fix for #144391 not fully addressed by #144484
 (#144488)

---
 utils/bazel/llvm_configs/llvm-config.h.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake
index 6d3c37cc8b19..a0ad517a6ecf 100644
--- a/utils/bazel/llvm_configs/llvm-config.h.cmake
+++ b/utils/bazel/llvm_configs/llvm-config.h.cmake
@@ -131,10 +131,10 @@
 
 /* Define to 1 to enable expensive checks for debug location coverage checking,
    and to 0 otherwise. */
-#cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
 /* Define to 1 to enable expensive tracking of the origin of debug location
    coverage bugs, and to 0 otherwise. */
-#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN
 
 #endif

From 277b2b6da70b488e08b0f0eecba2a4cd1dd01129 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 10:39:54 +0100
Subject: [PATCH 0611/1322] [X86] combineCastedMaskArithmetic - convert to
 SDPatternMatch matching. NFC. (#144472)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 28 +++++++++----------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 820b9c53a508..2eadcc5416c2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45513,6 +45513,7 @@ static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                            TargetLowering::DAGCombinerInfo &DCI,
                                            const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
 
   if (!DCI.isBeforeLegalizeOps())
@@ -45526,15 +45527,6 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
   SDValue Op = N->getOperand(0);
   EVT SrcVT = Op.getValueType();
 
-  if (!Op.hasOneUse())
-    return SDValue();
-
-  // Look for logic ops.
-  if (Op.getOpcode() != ISD::AND &&
-      Op.getOpcode() != ISD::OR &&
-      Op.getOpcode() != ISD::XOR)
-    return SDValue();
-
   // Make sure we have a bitcast between mask registers and a scalar type.
   if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
         DstVT.isScalarInteger()) &&
@@ -45542,18 +45534,18 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
         SrcVT.isScalarInteger()))
     return SDValue();
 
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
+  SDValue LHS, RHS;
 
-  if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
-      LHS.getOperand(0).getValueType() == DstVT)
-    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
-                       DAG.getBitcast(DstVT, RHS));
+  // Look for logic ops.
+  if (!sd_match(Op, m_OneUse(m_BitwiseLogic(m_Value(LHS), m_Value(RHS)))))
+    return SDValue();
 
-  if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
-      RHS.getOperand(0).getValueType() == DstVT)
+  // If either operand was bitcast from DstVT, then perform logic with DstVT (at
+  // least one of the getBitcast() will fold away).
+  if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
+      sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
-                       DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
+                       DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
 
   // If the RHS is a vXi1 build vector, this is a good reason to flip too.
   // Most of these have to move a constant from the scalar domain anyway.

From aa01e8e9cff9e754b47be57b2f85b962cf1ec9fb Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Tue, 17 Jun 2025 10:42:42 +0100
Subject: [PATCH 0612/1322] [mlir][OpenMP] Fix broken insertion point for
 charbox with omp task (#143112)

Fixes #142365
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  3 +-
 .../Target/LLVMIR/openmp-task-charbox.mlir    | 87 +++++++++++++++++++
 2 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-task-charbox.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6bccc1d6f5d3..90ce06a0345c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2294,8 +2294,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
     if (!privateVarOrErr)
       return handleError(privateVarOrErr, *taskOp.getOperation());
 
-    llvm::IRBuilderBase::InsertPointGuard guard(builder);
-    builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+    setInsertPointForPossiblyEmptyBlock(builder);
 
     // TODO: this is a bit of a hack for Fortran character boxes.
     // Character boxes are passed by value into the init region and then the
diff --git a/mlir/test/Target/LLVMIR/openmp-task-charbox.mlir b/mlir/test/Target/LLVMIR/openmp-task-charbox.mlir
new file mode 100644
index 000000000000..7a448f74ed64
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-task-charbox.mlir
@@ -0,0 +1,87 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// Regression test for a compiler crash. Ensure that the insertion point is set
+// correctly when triggering the charbox hack multiple times.
+// Nonsense test code to minimally reproduce the issue.
+
+module {
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  omp.private {type = private} @_QFEc2_private_box_heap_c8xU : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> init {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(24 : i32) : i32
+    %1 = llvm.mlir.constant(0 : i64) : i64
+    %2 = llvm.mlir.constant(1 : i32) : i32
+    %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    "llvm.intr.memcpy"(%3, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    %6 = llvm.ptrtoint %arg0 : !llvm.ptr to i64
+    %7 = llvm.icmp "eq" %6, %1 : i64
+    llvm.cond_br %7, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    llvm.br ^bb3
+  ^bb2:  // pred: ^bb0
+    llvm.br ^bb3
+  ^bb3:  // 2 preds: ^bb1, ^bb2
+    omp.yield(%arg1 : !llvm.ptr)
+  } dealloc {
+  ^bb0(%arg0: !llvm.ptr):
+    omp.yield
+  }
+  omp.private {type = private} @_QFEc1_private_box_ptr_c8xU : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> init {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(24 : i32) : i32
+    %1 = llvm.mlir.constant(1 : i32) : i32
+    %2 = llvm.alloca %1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    "llvm.intr.memcpy"(%2, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    omp.yield(%arg1 : !llvm.ptr)
+  }
+  llvm.func @_QQmain() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "c2"} : (i64) -> !llvm.ptr
+    %2 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "c1"} : (i64) -> !llvm.ptr
+    omp.task private(@_QFEc1_private_box_ptr_c8xU %2 -> %arg0, @_QFEc2_private_box_heap_c8xU %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK-LABEL: @_QQmain() {
+// CHECK:         %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK:         %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
+// CHECK:         br label %[[VAL_2:.*]]
+// CHECK:       entry:                                            ; preds = %[[VAL_3:.*]]
+// CHECK:         br label %[[VAL_4:.*]]
+// CHECK:       omp.private.init:                                 ; preds = %[[VAL_2]]
+// CHECK:         %[[VAL_5:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ { ptr, i64, i32, i8, i8, i8, i8 }, { ptr, i64, i32, i8, i8, i8, i8 } }, ptr null, i32 1) to i64))
+// CHECK:         %[[VAL_6:.*]] = getelementptr { { ptr, i64, i32, i8, i8, i8, i8 }, { ptr, i64, i32, i8, i8, i8, i8 } }, ptr %[[VAL_5]], i32 0, i32 0
+// CHECK:         %[[VAL_7:.*]] = getelementptr { { ptr, i64, i32, i8, i8, i8, i8 }, { ptr, i64, i32, i8, i8, i8, i8 } }, ptr %[[VAL_5]], i32 0, i32 1
+// ...
+// CHECK:         br label %[[VAL_9:.*]]
+// CHECK:       omp.private.init4:                                ; preds = %[[VAL_10:.*]], %[[VAL_11:.*]]
+// CHECK:         br label %[[VAL_12:.*]]
+// CHECK:       omp.private.init3:                                ; preds = %[[VAL_9]]
+// CHECK:         br label %[[VAL_13:.*]]
+// CHECK:       omp.private.init2:                                ; preds = %[[VAL_9]]
+// CHECK:         br label %[[VAL_13]]
+// CHECK:       omp.private.init1:                                ; preds = %[[VAL_4]]
+// CHECK:         %[[VAL_14:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         call void @llvm.memcpy.p0.p0.i32(ptr %[[VAL_14]], ptr %[[VAL_0]], i32 24, i1 false)
+// CHECK:         %[[VAL_15:.*]] = ptrtoint ptr %[[VAL_0]] to i64
+// CHECK:         %[[VAL_16:.*]] = icmp eq i64 %[[VAL_15]], 0
+// CHECK:         br i1 %[[VAL_16]], label %[[VAL_10]], label %[[VAL_11]]
+// CHECK:       omp.region.cont:                                  ; preds = %[[VAL_13]]
+// CHECK:         %[[VAL_17:.*]] = phi ptr [ %[[VAL_7]], %[[VAL_13]] ]
+// CHECK:         br label %[[VAL_18:.*]]
+// CHECK:       omp.private.copy:                                 ; preds = %[[VAL_12]]
+// CHECK:         br label %[[VAL_19:.*]]
+// CHECK:       omp.task.start:                                   ; preds = %[[VAL_18]]
+// CHECK:         br label %[[VAL_20:.*]]
+// CHECK:       codeRepl:                                         ; preds = %[[VAL_19]]
+// CHECK:         %[[VAL_21:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0
+// CHECK:         store ptr %[[VAL_5]], ptr %[[VAL_21]], align 8
+// CHECK:         %[[VAL_22:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK:         %[[VAL_23:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[VAL_22]], i32 1, i64 40, i64 8, ptr @_QQmain..omp_par)
+// CHECK:         %[[VAL_24:.*]] = load ptr, ptr %[[VAL_23]], align 8
+// CHECK:         call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_24]], ptr align 1 %[[STRUCTARG]], i64 8, i1 false)
+// CHECK:         %[[VAL_25:.*]] = call i32 @__kmpc_omp_task(ptr @1, i32 %[[VAL_22]], ptr %[[VAL_23]])

From 00709c306d0a0f60d169ab25f612ed6715e16743 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 18:44:32 +0900
Subject: [PATCH 0613/1322] AArch64: Fix hardcoding calling convention of
 sincos_stret (NFC) (#144336)

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c7ffc39b5b16..1169efce3123 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5061,9 +5061,10 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
 
   StructType *RetTy = StructType::get(ArgTy, ArgTy);
   TargetLowering::CallLoweringInfo CLI(DAG);
+  CallingConv::ID CC = getLibcallCallingConv(LC);
   CLI.setDebugLoc(dl)
       .setChain(DAG.getEntryNode())
-      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
+      .setLibCallee(CC, RetTy, Callee, std::move(Args));
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.first;

From 4c8f43440955c93a54b9547421513867bc81788a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy@ericsson.com>
Date: Tue, 17 Jun 2025 11:51:09 +0200
Subject: [PATCH 0614/1322] [analyzer] Conversion to CheckerFamily:
 NullabilityChecker (#143735)

This commit converts NullabilityChecker to the new checker family
framework that was introduced in the recent commit
6833076a5d9f5719539a24e900037da5a3979289

This commit removes the dummy checker `nullability.NullabilityBase`
because it was hidden from the users and didn't have any useful role
except for helping the registration of the checker parts in the old
ad-hoc system (which is replaced by the new standardized framework).

Except for the removal of this dummy checker, no functional changes
intended.
---
 .../clang/StaticAnalyzer/Checkers/Checkers.td |  53 +++---
 .../Checkers/NullabilityChecker.cpp           | 173 +++++++++---------
 .../test/Analysis/analyzer-enabled-checkers.c |   1 -
 clang/test/Analysis/bugfix-124477.m           |   2 +-
 ...c-library-functions-arg-enabled-checkers.c |   1 -
 5 files changed, 107 insertions(+), 123 deletions(-)

diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 2a96df80d100..211ce585fbac 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -326,39 +326,34 @@ def StdVariantChecker : Checker<"StdVariant">,
 
 let ParentPackage = Nullability in {
 
-def NullabilityBase : Checker<"NullabilityBase">,
-  HelpText<"Stores information during the analysis about nullability.">,
-  Documentation<NotDocumented>,
-  Hidden;
+  def NullPassedToNonnullChecker
+      : Checker<"NullPassedToNonnull">,
+        HelpText<"Warns when a null pointer is passed to a pointer which has a "
+                 "_Nonnull type.">,
+        Documentation<HasDocumentation>;
 
-def NullPassedToNonnullChecker : Checker<"NullPassedToNonnull">,
-  HelpText<"Warns when a null pointer is passed to a pointer which has a "
-           "_Nonnull type.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<HasDocumentation>;
+  def NullReturnedFromNonnullChecker
+      : Checker<"NullReturnedFromNonnull">,
+        HelpText<"Warns when a null pointer is returned from a function that "
+                 "has _Nonnull return type.">,
+        Documentation<HasDocumentation>;
 
-def NullReturnedFromNonnullChecker : Checker<"NullReturnedFromNonnull">,
-  HelpText<"Warns when a null pointer is returned from a function that has "
-           "_Nonnull return type.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<HasDocumentation>;
+  def NullableDereferencedChecker
+      : Checker<"NullableDereferenced">,
+        HelpText<"Warns when a nullable pointer is dereferenced.">,
+        Documentation<HasDocumentation>;
 
-def NullableDereferencedChecker : Checker<"NullableDereferenced">,
-  HelpText<"Warns when a nullable pointer is dereferenced.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<HasDocumentation>;
+  def NullablePassedToNonnullChecker
+      : Checker<"NullablePassedToNonnull">,
+        HelpText<"Warns when a nullable pointer is passed to a pointer which "
+                 "has a _Nonnull type.">,
+        Documentation<HasDocumentation>;
 
-def NullablePassedToNonnullChecker : Checker<"NullablePassedToNonnull">,
-  HelpText<"Warns when a nullable pointer is passed to a pointer which has a "
-           "_Nonnull type.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<HasDocumentation>;
-
-def NullableReturnedFromNonnullChecker : Checker<"NullableReturnedFromNonnull">,
-  HelpText<"Warns when a nullable pointer is returned from a function that has "
-           "_Nonnull return type.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<NotDocumented>;
+  def NullableReturnedFromNonnullChecker
+      : Checker<"NullableReturnedFromNonnull">,
+        HelpText<"Warns when a nullable pointer is returned from a function "
+                 "that has _Nonnull return type.">,
+        Documentation<NotDocumented>;
 
 } // end "nullability"
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
index 461d01b452fd..9744d1abf779 100644
--- a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
@@ -81,11 +81,12 @@ enum class ErrorKind : int {
 };
 
 class NullabilityChecker
-    : public Checker<check::Bind, check::PreCall, check::PreStmt<ReturnStmt>,
-                     check::PostCall, check::PostStmt<ExplicitCastExpr>,
-                     check::PostObjCMessage, check::DeadSymbols, eval::Assume,
-                     check::Location, check::Event<ImplicitNullDerefEvent>,
-                     check::BeginFunction> {
+    : public CheckerFamily<
+          check::Bind, check::PreCall, check::PreStmt<ReturnStmt>,
+          check::PostCall, check::PostStmt<ExplicitCastExpr>,
+          check::PostObjCMessage, check::DeadSymbols, eval::Assume,
+          check::Location, check::Event<ImplicitNullDerefEvent>,
+          check::BeginFunction> {
 
 public:
   // If true, the checker will not diagnose nullabilility issues for calls
@@ -113,25 +114,21 @@ public:
   void printState(raw_ostream &Out, ProgramStateRef State, const char *NL,
                   const char *Sep) const override;
 
-  enum CheckKind {
-    CK_NullPassedToNonnull,
-    CK_NullReturnedFromNonnull,
-    CK_NullableDereferenced,
-    CK_NullablePassedToNonnull,
-    CK_NullableReturnedFromNonnull,
-    CK_NumCheckKinds
-  };
+  StringRef getDebugTag() const override { return "NullabilityChecker"; }
 
-  bool ChecksEnabled[CK_NumCheckKinds] = {false};
-  CheckerNameRef CheckNames[CK_NumCheckKinds];
-  mutable std::unique_ptr<BugType> BTs[CK_NumCheckKinds];
-
-  const std::unique_ptr<BugType> &getBugType(CheckKind Kind) const {
-    if (!BTs[Kind])
-      BTs[Kind].reset(new BugType(CheckNames[Kind], "Nullability",
-                                  categories::MemoryError));
-    return BTs[Kind];
-  }
+  // FIXME: All bug types share the same Description ("Nullability") since the
+  // creation of this checker. We should write more descriptive descriptions...
+  // or just eliminate the Description field if it is meaningless?
+  CheckerFrontendWithBugType NullPassedToNonnull{"Nullability",
+                                                 categories::MemoryError};
+  CheckerFrontendWithBugType NullReturnedFromNonnull{"Nullability",
+                                                     categories::MemoryError};
+  CheckerFrontendWithBugType NullableDereferenced{"Nullability",
+                                                  categories::MemoryError};
+  CheckerFrontendWithBugType NullablePassedToNonnull{"Nullability",
+                                                     categories::MemoryError};
+  CheckerFrontendWithBugType NullableReturnedFromNonnull{
+      "Nullability", categories::MemoryError};
 
   // When set to false no nullability information will be tracked in
   // NullabilityMap. It is possible to catch errors like passing a null pointer
@@ -164,17 +161,16 @@ private:
   ///
   /// When \p SuppressPath is set to true, no more bugs will be reported on this
   /// path by this checker.
-  void reportBugIfInvariantHolds(StringRef Msg, ErrorKind Error, CheckKind CK,
-                                 ExplodedNode *N, const MemRegion *Region,
-                                 CheckerContext &C,
+  void reportBugIfInvariantHolds(StringRef Msg, ErrorKind Error,
+                                 const BugType &BT, ExplodedNode *N,
+                                 const MemRegion *Region, CheckerContext &C,
                                  const Stmt *ValueExpr = nullptr,
                                  bool SuppressPath = false) const;
 
-  void reportBug(StringRef Msg, ErrorKind Error, CheckKind CK, ExplodedNode *N,
-                 const MemRegion *Region, BugReporter &BR,
+  void reportBug(StringRef Msg, ErrorKind Error, const BugType &BT,
+                 ExplodedNode *N, const MemRegion *Region, BugReporter &BR,
                  const Stmt *ValueExpr = nullptr) const {
-    const std::unique_ptr<BugType> &BT = getBugType(CK);
-    auto R = std::make_unique<PathSensitiveBugReport>(*BT, Msg, N);
+    auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
     if (Region) {
       R->markInteresting(Region);
       R->addVisitor<NullabilityBugVisitor>(Region);
@@ -480,7 +476,7 @@ static bool checkInvariantViolation(ProgramStateRef State, ExplodedNode *N,
 }
 
 void NullabilityChecker::reportBugIfInvariantHolds(
-    StringRef Msg, ErrorKind Error, CheckKind CK, ExplodedNode *N,
+    StringRef Msg, ErrorKind Error, const BugType &BT, ExplodedNode *N,
     const MemRegion *Region, CheckerContext &C, const Stmt *ValueExpr,
     bool SuppressPath) const {
   ProgramStateRef OriginalState = N->getState();
@@ -492,7 +488,7 @@ void NullabilityChecker::reportBugIfInvariantHolds(
     N = C.addTransition(OriginalState, N);
   }
 
-  reportBug(Msg, Error, CK, N, Region, C.getBugReporter(), ValueExpr);
+  reportBug(Msg, Error, BT, N, Region, C.getBugReporter(), ValueExpr);
 }
 
 /// Cleaning up the program state.
@@ -546,19 +542,19 @@ void NullabilityChecker::checkEvent(ImplicitNullDerefEvent Event) const {
   if (!TrackedNullability)
     return;
 
-  if (ChecksEnabled[CK_NullableDereferenced] &&
+  if (NullableDereferenced.isEnabled() &&
       TrackedNullability->getValue() == Nullability::Nullable) {
     BugReporter &BR = *Event.BR;
     // Do not suppress errors on defensive code paths, because dereferencing
     // a nullable pointer is always an error.
     if (Event.IsDirectDereference)
       reportBug("Nullable pointer is dereferenced",
-                ErrorKind::NullableDereferenced, CK_NullableDereferenced,
+                ErrorKind::NullableDereferenced, NullableDereferenced,
                 Event.SinkNode, Region, BR);
     else {
       reportBug("Nullable pointer is passed to a callee that requires a "
                 "non-null",
-                ErrorKind::NullablePassedToNonnull, CK_NullableDereferenced,
+                ErrorKind::NullablePassedToNonnull, NullableDereferenced,
                 Event.SinkNode, Region, BR);
     }
   }
@@ -710,29 +706,28 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S,
   Nullability RetExprTypeLevelNullability =
         getNullabilityAnnotation(lookThroughImplicitCasts(RetExpr)->getType());
 
-  bool NullReturnedFromNonNull = (RequiredNullability == Nullability::Nonnull &&
-                                  Nullness == NullConstraint::IsNull);
-  if (ChecksEnabled[CK_NullReturnedFromNonnull] && NullReturnedFromNonNull &&
-      RetExprTypeLevelNullability != Nullability::Nonnull &&
-      !InSuppressedMethodFamily) {
-    ExplodedNode *N = C.generateErrorNode(State);
-    if (!N)
+  if (RequiredNullability == Nullability::Nonnull &&
+      Nullness == NullConstraint::IsNull) {
+    if (NullReturnedFromNonnull.isEnabled() &&
+        RetExprTypeLevelNullability != Nullability::Nonnull &&
+        !InSuppressedMethodFamily) {
+      ExplodedNode *N = C.generateErrorNode(State);
+      if (!N)
+        return;
+
+      SmallString<256> SBuf;
+      llvm::raw_svector_ostream OS(SBuf);
+      OS << (RetExpr->getType()->isObjCObjectPointerType() ? "nil" : "Null");
+      OS << " returned from a " << C.getDeclDescription(D)
+         << " that is expected to return a non-null value";
+      reportBugIfInvariantHolds(OS.str(), ErrorKind::NilReturnedToNonnull,
+                                NullReturnedFromNonnull, N, nullptr, C,
+                                RetExpr);
       return;
+    }
 
-    SmallString<256> SBuf;
-    llvm::raw_svector_ostream OS(SBuf);
-    OS << (RetExpr->getType()->isObjCObjectPointerType() ? "nil" : "Null");
-    OS << " returned from a " << C.getDeclDescription(D) <<
-          " that is expected to return a non-null value";
-    reportBugIfInvariantHolds(OS.str(), ErrorKind::NilReturnedToNonnull,
-                              CK_NullReturnedFromNonnull, N, nullptr, C,
-                              RetExpr);
-    return;
-  }
-
-  // If null was returned from a non-null function, mark the nullability
-  // invariant as violated even if the diagnostic was suppressed.
-  if (NullReturnedFromNonNull) {
+    // If null was returned from a non-null function, mark the nullability
+    // invariant as violated even if the diagnostic was suppressed.
     State = State->set<InvariantViolated>(true);
     C.addTransition(State);
     return;
@@ -746,7 +741,7 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S,
       State->get<NullabilityMap>(Region);
   if (TrackedNullability) {
     Nullability TrackedNullabValue = TrackedNullability->getValue();
-    if (ChecksEnabled[CK_NullableReturnedFromNonnull] &&
+    if (NullableReturnedFromNonnull.isEnabled() &&
         Nullness != NullConstraint::IsNotNull &&
         TrackedNullabValue == Nullability::Nullable &&
         RequiredNullability == Nullability::Nonnull) {
@@ -758,7 +753,7 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S,
             " that is expected to return a non-null value";
 
       reportBugIfInvariantHolds(OS.str(), ErrorKind::NullableReturnedToNonnull,
-                                CK_NullableReturnedFromNonnull, N, Region, C);
+                                NullableReturnedFromNonnull, N, Region, C);
     }
     return;
   }
@@ -809,8 +804,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call,
 
     unsigned ParamIdx = Param->getFunctionScopeIndex() + 1;
 
-    if (ChecksEnabled[CK_NullPassedToNonnull] &&
-        Nullness == NullConstraint::IsNull &&
+    if (NullPassedToNonnull.isEnabled() && Nullness == NullConstraint::IsNull &&
         ArgExprTypeLevelNullability != Nullability::Nonnull &&
         RequiredNullability == Nullability::Nonnull &&
         isDiagnosableCall(Call)) {
@@ -824,7 +818,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call,
       OS << " passed to a callee that requires a non-null " << ParamIdx
          << llvm::getOrdinalSuffix(ParamIdx) << " parameter";
       reportBugIfInvariantHolds(OS.str(), ErrorKind::NilPassedToNonnull,
-                                CK_NullPassedToNonnull, N, nullptr, C, ArgExpr,
+                                NullPassedToNonnull, N, nullptr, C, ArgExpr,
                                 /*SuppressPath=*/false);
       return;
     }
@@ -841,7 +835,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call,
           TrackedNullability->getValue() != Nullability::Nullable)
         continue;
 
-      if (ChecksEnabled[CK_NullablePassedToNonnull] &&
+      if (NullablePassedToNonnull.isEnabled() &&
           RequiredNullability == Nullability::Nonnull &&
           isDiagnosableCall(Call)) {
         ExplodedNode *N = C.addTransition(State);
@@ -850,17 +844,16 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call,
         OS << "Nullable pointer is passed to a callee that requires a non-null "
            << ParamIdx << llvm::getOrdinalSuffix(ParamIdx) << " parameter";
         reportBugIfInvariantHolds(OS.str(), ErrorKind::NullablePassedToNonnull,
-                                  CK_NullablePassedToNonnull, N, Region, C,
+                                  NullablePassedToNonnull, N, Region, C,
                                   ArgExpr, /*SuppressPath=*/true);
         return;
       }
-      if (ChecksEnabled[CK_NullableDereferenced] &&
+      if (NullableDereferenced.isEnabled() &&
           Param->getType()->isReferenceType()) {
         ExplodedNode *N = C.addTransition(State);
-        reportBugIfInvariantHolds("Nullable pointer is dereferenced",
-                                  ErrorKind::NullableDereferenced,
-                                  CK_NullableDereferenced, N, Region, C,
-                                  ArgExpr, /*SuppressPath=*/true);
+        reportBugIfInvariantHolds(
+            "Nullable pointer is dereferenced", ErrorKind::NullableDereferenced,
+            NullableDereferenced, N, Region, C, ArgExpr, /*SuppressPath=*/true);
         return;
       }
       continue;
@@ -1294,7 +1287,7 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S,
 
   bool NullAssignedToNonNull = (LocNullability == Nullability::Nonnull &&
                                 RhsNullness == NullConstraint::IsNull);
-  if (ChecksEnabled[CK_NullPassedToNonnull] && NullAssignedToNonNull &&
+  if (NullPassedToNonnull.isEnabled() && NullAssignedToNonNull &&
       ValNullability != Nullability::Nonnull &&
       ValueExprTypeLevelNullability != Nullability::Nonnull &&
       !isARCNilInitializedLocal(C, S)) {
@@ -1312,7 +1305,7 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S,
     OS << (LocType->isObjCObjectPointerType() ? "nil" : "Null");
     OS << " assigned to a pointer which is expected to have non-null value";
     reportBugIfInvariantHolds(OS.str(), ErrorKind::NilAssignedToNonnull,
-                              CK_NullPassedToNonnull, N, nullptr, C, ValueStmt);
+                              NullPassedToNonnull, N, nullptr, C, ValueStmt);
     return;
   }
 
@@ -1338,13 +1331,13 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S,
     if (RhsNullness == NullConstraint::IsNotNull ||
         TrackedNullability->getValue() != Nullability::Nullable)
       return;
-    if (ChecksEnabled[CK_NullablePassedToNonnull] &&
+    if (NullablePassedToNonnull.isEnabled() &&
         LocNullability == Nullability::Nonnull) {
       ExplodedNode *N = C.addTransition(State, C.getPredecessor());
       reportBugIfInvariantHolds("Nullable pointer is assigned to a pointer "
                                 "which is expected to have non-null value",
                                 ErrorKind::NullableAssignedToNonnull,
-                                CK_NullablePassedToNonnull, N, ValueRegion, C);
+                                NullablePassedToNonnull, N, ValueRegion, C);
     }
     return;
   }
@@ -1391,28 +1384,26 @@ void NullabilityChecker::printState(raw_ostream &Out, ProgramStateRef State,
   }
 }
 
-void ento::registerNullabilityBase(CheckerManager &mgr) {
-  mgr.registerChecker<NullabilityChecker>();
-}
+// The checker group "nullability" (which consists of the checkers that are
+// implemented in this file) has a group-level configuration option which
+// affects all the checkers in the group. As this is a completely unique
+// remnant of old design (this is the only group option in the analyzer), there
+// is no machinery to inject the group name from `Checkers.td`, so it is simply
+// hardcoded here:
+constexpr llvm::StringLiteral GroupName = "nullability";
+constexpr llvm::StringLiteral GroupOptName = "NoDiagnoseCallsToSystemHeaders";
 
-bool ento::shouldRegisterNullabilityBase(const CheckerManager &mgr) {
-  return true;
-}
-
-#define REGISTER_CHECKER(name, trackingRequired)                               \
-  void ento::register##name##Checker(CheckerManager &mgr) {                    \
-    NullabilityChecker *checker = mgr.getChecker<NullabilityChecker>();        \
-    checker->ChecksEnabled[NullabilityChecker::CK_##name] = true;              \
-    checker->CheckNames[NullabilityChecker::CK_##name] =                       \
-        mgr.getCurrentCheckerName();                                           \
-    checker->NeedTracking = checker->NeedTracking || trackingRequired;         \
-    checker->NoDiagnoseCallsToSystemHeaders =                                  \
-        checker->NoDiagnoseCallsToSystemHeaders ||                             \
-        mgr.getAnalyzerOptions().getCheckerBooleanOption(                      \
-            checker, "NoDiagnoseCallsToSystemHeaders", true);                  \
+#define REGISTER_CHECKER(NAME, TRACKING_REQUIRED)                              \
+  void ento::register##NAME##Checker(CheckerManager &Mgr) {                    \
+    NullabilityChecker *Chk = Mgr.getChecker<NullabilityChecker>();            \
+    Chk->NAME.enable(Mgr);                                                     \
+    Chk->NeedTracking = Chk->NeedTracking || TRACKING_REQUIRED;                \
+    Chk->NoDiagnoseCallsToSystemHeaders =                                      \
+        Mgr.getAnalyzerOptions().getCheckerBooleanOption(GroupName,            \
+                                                         GroupOptName, true);  \
   }                                                                            \
                                                                                \
-  bool ento::shouldRegister##name##Checker(const CheckerManager &mgr) {        \
+  bool ento::shouldRegister##NAME##Checker(const CheckerManager &) {           \
     return true;                                                               \
   }
 
diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c
index 66b9be9795f1..78ee00deea18 100644
--- a/clang/test/Analysis/analyzer-enabled-checkers.c
+++ b/clang/test/Analysis/analyzer-enabled-checkers.c
@@ -34,7 +34,6 @@
 // CHECK-NEXT: core.uninitialized.CapturedBlockVariable
 // CHECK-NEXT: core.uninitialized.UndefReturn
 // CHECK-NEXT: deadcode.DeadStores
-// CHECK-NEXT: nullability.NullabilityBase
 // CHECK-NEXT: nullability.NullPassedToNonnull
 // CHECK-NEXT: nullability.NullReturnedFromNonnull
 // CHECK-NEXT: security.insecureAPI.SecuritySyntaxChecker
diff --git a/clang/test/Analysis/bugfix-124477.m b/clang/test/Analysis/bugfix-124477.m
index 80820f4c9344..8bb0196b2f9b 100644
--- a/clang/test/Analysis/bugfix-124477.m
+++ b/clang/test/Analysis/bugfix-124477.m
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,apiModeling,nullability.NullableDereferenced,nullability.NullabilityBase -x objective-c %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,apiModeling,nullability.NullableDereferenced -x objective-c %s
 /*
   This test is reduced from a static analyzer crash. The bug causing
   the crash is explained in #124477.  It can only be triggered in some
diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
index 8c6078a49c23..7f9c9ff4c9fd 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
@@ -42,7 +42,6 @@
 // CHECK-NEXT: core.uninitialized.CapturedBlockVariable
 // CHECK-NEXT: core.uninitialized.UndefReturn
 // CHECK-NEXT: deadcode.DeadStores
-// CHECK-NEXT: nullability.NullabilityBase
 // CHECK-NEXT: nullability.NullPassedToNonnull
 // CHECK-NEXT: nullability.NullReturnedFromNonnull
 // CHECK-NEXT: security.insecureAPI.SecuritySyntaxChecker

From 6f2983765983b9403ae40430da8034d2d1b6e8a4 Mon Sep 17 00:00:00 2001
From: Ying Yi <ying.yi@sony.com>
Date: Tue, 17 Jun 2025 10:54:22 +0100
Subject: [PATCH 0615/1322] Reland: "[Frontend][PCH]-Add support for ignoring
 PCH options (-ignore-pch). (#142409)" (#143614)

Visual Studio has an argument to ignore all PCH related switches.
clang-cl has also support option /Y-. Having the same option in clang
would be helpful. This commit is to add support for ignoring PCH options
(-ignore-pch).

The commit includes:
  1. Implement -ignore-pch as a Driver option.
  2. Add a Driver test and a PCH test.
  3. Add a section of -ignore-pch to user manual.
  4. Add a release note for the new option '-ignore-pch'.

The change since the original landing:
  1. preprocessing-only mode doesn't imply that -include-pch is disabled.

Co-authored-by: Matheus Izvekov <mizvekov@gmail.com>
---
 clang/docs/ReleaseNotes.rst           |   2 +
 clang/docs/UsersManual.rst            |  13 +++
 clang/include/clang/Driver/Options.td |   3 +
 clang/lib/Driver/Driver.cpp           |   8 ++
 clang/lib/Driver/ToolChains/Clang.cpp |   5 +-
 clang/test/Driver/ignored-pch.cpp     |  19 +++++
 clang/test/PCH/Inputs/ignored-pch.h   |   6 ++
 clang/test/PCH/ignored-pch.c          | 113 ++++++++++++++++++++++++++
 8 files changed, 167 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Driver/ignored-pch.cpp
 create mode 100644 clang/test/PCH/Inputs/ignored-pch.h
 create mode 100644 clang/test/PCH/ignored-pch.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 59d9612268d3..d32d3921b74f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -339,6 +339,8 @@ New Compiler Flags
 
 - New option ``-Wnrvo`` added and disabled by default to warn about missed NRVO opportunities.
 
+- New option ``-ignore-pch`` added to disable precompiled headers. It overrides ``-emit-pch`` and ``-include-pch``. (#GH142409, `PCHDocs <https://clang.llvm.org/docs/UsersManual.html#ignoring-a-pch-file>`_).
+
 Deprecated Compiler Flags
 -------------------------
 
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index 62844f7e6a2f..284a404026df 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -1458,6 +1458,19 @@ will be processed from the PCH file. Otherwise, Clang will report an error.
   ``test.h`` since ``test.h`` was included directly in the source file and not
   specified on the command line using ``-include-pch``.
 
+Ignoring a PCH File
+^^^^^^^^^^^^^^^^^^^
+
+To ignore PCH options, a `-ignore-pch` option is passed to ``clang``:
+
+.. code-block:: console
+
+  $ clang -x c-header test.h -Xclang -ignore-pch -o test.h.pch
+  $ clang -include-pch test.h.pch -Xclang -ignore-pch test.c -o test
+
+This option disables precompiled headers, overrides -emit-pch and -include-pch.
+test.h.pch is not generated and not used as a prefix header.
+
 Relocatable PCH Files
 ^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 8b7708e530b1..1ba52d50056e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3351,6 +3351,9 @@ defm pch_codegen: OptInCC1FFlag<"pch-codegen", "Generate ", "Do not generate ",
   "code for uses of this PCH that assumes an explicit object file will be built for the PCH">;
 defm pch_debuginfo: OptInCC1FFlag<"pch-debuginfo", "Generate ", "Do not generate ",
   "debug info for types in an object file built from this PCH and do not generate them elsewhere">;
+def ignore_pch : Flag<["-"], "ignore-pch">, Group<f_Group>,
+  Visibility<[ClangOption]>,
+  HelpText<"Disable precompiled headers, overrides -emit-pch and -include-pch">;
 
 def fimplicit_module_maps : Flag <["-"], "fimplicit-module-maps">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option, CLOption]>,
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 6c27d8c67072..780bfc83dc62 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4331,6 +4331,14 @@ void Driver::handleArguments(Compilation &C, DerivedArgList &Args,
     YcArg = YuArg = nullptr;
   }
 
+  if (Args.hasArg(options::OPT_include_pch) &&
+      Args.hasArg(options::OPT_ignore_pch)) {
+    // If -ignore-pch is used, -include-pch is disabled. Since -emit-pch is
+    // CC1option, it will not be added to command argments if -ignore-pch is
+    // used.
+    Args.eraseArg(options::OPT_include_pch);
+  }
+
   bool LinkOnly = phases::Link == FinalPhase && Inputs.size() > 0;
   for (auto &I : Inputs) {
     types::ID InputType = I.first;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 7dfed3a3356b..bb7e5f424337 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5202,7 +5202,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       CmdArgs.push_back("-emit-module-interface");
     else if (JA.getType() == types::TY_HeaderUnit)
       CmdArgs.push_back("-emit-header-unit");
-    else
+    else if (!Args.hasArg(options::OPT_ignore_pch))
       CmdArgs.push_back("-emit-pch");
   } else if (isa<VerifyPCHJobAction>(JA)) {
     CmdArgs.push_back("-verify-pch");
@@ -5259,7 +5259,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     } else if (JA.getType() == types::TY_PP_Asm) {
       CmdArgs.push_back("-S");
     } else if (JA.getType() == types::TY_AST) {
-      CmdArgs.push_back("-emit-pch");
+      if (!Args.hasArg(options::OPT_ignore_pch))
+        CmdArgs.push_back("-emit-pch");
     } else if (JA.getType() == types::TY_ModuleFile) {
       CmdArgs.push_back("-module-file-info");
     } else if (JA.getType() == types::TY_RewrittenObjC) {
diff --git a/clang/test/Driver/ignored-pch.cpp b/clang/test/Driver/ignored-pch.cpp
new file mode 100644
index 000000000000..a3597dc0fe0d
--- /dev/null
+++ b/clang/test/Driver/ignored-pch.cpp
@@ -0,0 +1,19 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+
+// Create PCH without -ignore-pch.
+// RUN: %clang -x c++-header %S/Inputs/pchfile.h -### 2>&1 | FileCheck %s -check-prefix=CHECK-EMIT-PCH
+// RUN: %clang -x c++-header %S/Inputs/pchfile.h -o %t/pchfile.h.pch
+// RUN: %clang %s -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-INCLUDE-PCH
+// RUN: %clang %s -emit-ast -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefixes=CHECK-EMIT-PCH,CHECK-INCLUDE-PCH
+
+
+// Create PCH with -ignore-pch.
+// RUN: %clang -x c++-header -ignore-pch %S/Inputs/pchfile.h -### 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-PCH
+// RUN: %clang %s -ignore-pch -include-pch  %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-PCH
+// RUN: %clang %s -ignore-pch -emit-ast -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-PCH
+
+// CHECK-EMIT-PCH: -emit-pch
+// CHECK-INCLUDE-PCH: -include-pch
+// CHECK-IGNORE-PCH-NOT: -emit-pch
+// CHECK-IGNORE-PCH-NOT: -include-pch
diff --git a/clang/test/PCH/Inputs/ignored-pch.h b/clang/test/PCH/Inputs/ignored-pch.h
new file mode 100644
index 000000000000..56047037c331
--- /dev/null
+++ b/clang/test/PCH/Inputs/ignored-pch.h
@@ -0,0 +1,6 @@
+#ifndef IGNORED_PCH_H
+#define IGNORED_PCH_H
+inline int f() {
+  return 42;
+}
+#endif // IGNORED_PCH_H
diff --git a/clang/test/PCH/ignored-pch.c b/clang/test/PCH/ignored-pch.c
new file mode 100644
index 000000000000..5b64582cba61
--- /dev/null
+++ b/clang/test/PCH/ignored-pch.c
@@ -0,0 +1,113 @@
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -o %t.ll
+// RUN: ls %t.pch | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch
+// RUN: %clang %s -emit-ast -include-pch %t.pch -o %t.ll
+// RUN: ls %t.pch | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Check that -ignore-pch causes -emit-pch and -include-pch options to be ignored.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll 2>&1 | FileCheck --check-prefix=CHECK-OBJ %s
+
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -emit-ast %s -include-pch %t.pch -ignore-pch -o %t.ll
+// RUN: not ls %t.ll 2>&1 | FileCheck --check-prefix=CHECK-OBJ-ERROR %s
+
+// Check that -ignore-pch works for multiple PCH related options.
+// Test with -building-pch-with-obj.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -building-pch-with-obj -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -building-pch-with-obj -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fallow-pch-with-compiler-errors.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fallow-pch-with-compiler-errors -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fallow-pch-with-compiler-errors -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fallow-pch-with-different-modules-cache-path.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fallow-pch-with-different-modules-cache-path -o %t.pch
+// RUN: %clang -S -emit-llvm %s -ignore-pch -include-pch %t.pch -Xclang -fallow-pch-with-different-modules-cache-path -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fpch-codegen.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-codegen -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-codegen -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fpch-debuginfo.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-debuginfo -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-debuginfo -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fpch-instantiate-templates.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-instantiate-templates -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-instantiate-templates -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fno-pch-timestamp.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fno-pch-timestamp -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fno-pch-timestamp -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fno-validate-pch.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fno-validate-pch -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fno-validate-pch -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -relocatable-pch.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -relocatable-pch -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -relocatable-pch -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -pch-through-hdrstop-create/-pch-through-hdrstop-use
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -pch-through-hdrstop-create -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -pch-through-hdrstop-use -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+
+// Test with AST dump output:
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch
+// RUN: %clang %s -include-pch %t.pch -Xclang -ast-dump-all -c | FileCheck --check-prefix=CHECK-AST-PCH %s
+// RUN: %clang %s -include-pch %t.pch -ignore-pch -Xclang -ast-dump-all -c | FileCheck --check-prefix=CHECK-AST %s
+
+// CHECK-PCH: ignored-pch.c.{{.*}}.pch
+// CHECK-OBJ: ignored-pch.c.{{.*}}.ll
+// CHECK-PCH-ERROR: ignored-pch.c.{{.*}}.pch{{'?}}: No such file or directory
+// CHECK-OBJ-ERROR: ignored-pch.c.{{.*}}.ll{{'?}}: No such file or directory
+// CHECK-AST-PCH: <undeserialized declarations>
+// CHECK-AST-NOT: <undeserialized declarations>
+
+#pragma hdrstop
+#include "Inputs/ignored-pch.h"
+int main() {
+  return f();
+}

From 7eda8274fed9a87f25a54616f5009bb68e511b77 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Tue, 17 Jun 2025 11:03:14 +0100
Subject: [PATCH 0616/1322] [MLIR] Integration tests for lowering
 vector.contract to SVE FEAT_I8MM (#140573)

---
 .../CPU/ArmSVE/vector-contract-i8mm.mlir      | 463 ++++++++++++++++++
 1 file changed, 463 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir

diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir
new file mode 100644
index 000000000000..5f6e8e4c3089
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir
@@ -0,0 +1,463 @@
+// REQUIRES: arm-emulator
+
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE:   --convert-vector-to-scf --convert-scf-to-cf  --convert-vector-to-llvm='enable-arm-sve enable-arm-i8mm' \
+// DEFINE:   --expand-strided-metadata --convert-to-llvm --finalize-memref-to-llvm  \
+// DEFINE:   --lower-affine --convert-arith-to-llvm --reconcile-unrealized-casts \
+// DEFINE: -o %t
+
+// DEFINE: %{entry_point} = main
+
+// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void  --march=aarch64 --mattr="+sve,+i8mm" \
+// DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils
+
+// RUN: rm -f %t && %{compile} && FileCheck %s --input-file=%t -check-prefix CHECK-IR && %{run} | FileCheck %s
+
+#packed_maps = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (n, k)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+
+//
+// Test the lowering of `vector.contract` using the `LowerContractionToSVEI8MMPattern`
+//
+// The operation that the `vector.contract` in this test performs is matrix
+// multiplication with accumulate
+//     OUT = ACC + LHS * RHS
+// of two 8-bit integer matrices LHS and RHS, and a 32-bit integer matrix ACC
+// into a 32-bit integer matrix OUT. The LHS and RHS can be sign- or zero- extended,
+// this test covers all the possible variants.
+//
+// Tested are calculations as well as that the relevant `ArmSVE` dialect
+// operations ('arm_sve.smmla`, arm_sve.ummla`, etc) are emitted.
+//
+// That pattern above handles (therefore this test prepares) input/output vectors with
+// specific shapes:
+//   * LHS:      vector<Mx8xi8>
+//   * RHS:      vector<[N]x8xi8>
+//   * ACC, OUT: vector<Mx[N]xi32>
+// Note that the RHS is transposed.
+// This data layout makes it efficient to load data into SVE
+// registers in the layout expected by FEAT_I8MM instructions.
+// Such a `vector.contract` is representative of the code we aim to generate
+// by scalable vectorisation of `linalg.mmt4d`.
+// See mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp
+// for more information and rationale about these shapes.
+//
+// In this specific test we use M == 4 and N == 4
+//
+
+// Allocate and initialise a memref containing test data for use as the ACC
+// operand. The memref has one dynamic dimension whose extent depends on the
+// runtime value of VSCALE.
+//
+// The input parameter `%in` is a vector that is replicated VSCALE times
+// across the columns of the memref.
+func.func private @prepareAccTestData(%in: vector<4x4xi32>) -> memref<4x?xi32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0_i32 = arith.constant 0 : i32
+
+  %vs = vector.vscale
+  %d = arith.muli %c4, %vs : index
+  %mem = memref.alloc(%d) : memref<4x?xi32>
+
+  scf.for %j = %c0 to %d step %c4 {
+    vector.transfer_write %in, %mem[%c0, %j] {in_bounds = [true, true]} : vector<4x4xi32>, memref<4x?xi32>
+  }
+
+  return %mem : memref<4x?xi32>
+}
+
+// Allocate and initialise a memref containing test data for use as the LHS
+// operand. This function just writes the parameter `%in` into the memref.
+// The size of the LHS does not depends on VSCALE.
+func.func private @prepareLHSTestData(%in: vector<4x8xi8>) -> memref<4x8xi8> {
+  %c0 = arith.constant 0 : index
+  %c0_i8 = arith.constant 0 : i8
+
+  %mem = memref.alloc() : memref<4x8xi8>
+  vector.transfer_write %in, %mem[%c0, %c0] {in_bounds = [true, true]} : vector<4x8xi8>, memref<4x8xi8>
+
+  return %mem : memref<4x8xi8>
+}
+
+// Allocate and initialise a memref containing test data for use as the RHS
+// operand. The memref has one dynamic dimension whose extent depends on the
+// runtime value of VSCALE.
+//
+// The input parameter `%in` is a vector that is replicated VSCALE times
+// across the rows of the memref.
+//
+// For convenience, flatten the memref, since the RHS vector is read first as a
+// single-dimensional scalable vector and then cast into [N]x8 shape.
+func.func private @prepareRHSTestData(%in: vector<4x8xi8>) -> memref<?xi8> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0_i8 = arith.constant 0 : i8
+
+  %vs = vector.vscale
+  %d = arith.muli %c4, %vs : index
+  %mem = memref.alloc(%d) : memref<?x8xi8>
+
+  scf.for %i = %c0 to %d step %c4 {
+    vector.transfer_write %in, %mem[%i, %c0] {in_bounds = [true, true]} : vector<4x8xi8>, memref<?x8xi8>
+  }
+
+  %mem_out = memref.collapse_shape %mem [[0, 1]] : memref<?x8xi8> into memref<?xi8>
+  return %mem_out : memref<?xi8>
+}
+
+// Test the operation where both LHS and RHS are interpreted as signed, hence
+// we ultimately emit and execute the `smmla` instruction.
+
+// CHECK-IR-LABEL: llvm.func @test_smmla
+// CHECK-IR-COUNT-4: arm_sve.intr.smmla
+func.func @test_smmla() {
+
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i8 = arith.constant 0 : i8
+
+  // Accumulator test data
+  %acc_cst = arith.constant dense<[[-44,  20,  44, -46],
+                                   [ -8,  25, -34,  26],
+                                   [-20, -36,  -3,  39],
+                                   [-48, -31, -25, -21]]> : vector<4x4xi32>
+
+  %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32>
+  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32>
+
+  // FIXME: Workaround for a crash, see https://github.com/llvm/llvm-project/issues/143670
+  %acc_cast = memref.cast %acc_mem : memref<4x?xi32> to memref<*xi32>
+  call @printMemrefI32(%acc_cast) : (memref<*xi32>) -> ()
+
+  // LHS test data
+  %lhs_cst = arith.constant dense<[[-35, -27, -36, -31,  23, -34,  -8, -33],
+                                   [-20,  17, -32, -47,  37,  22,  -7, -21],
+                                   [ -7, -35,  20,  -4,  39,  46, -23,  40],
+                                   [ 40,  27,  37,  43,  38,  -6,  37,  49]]> : vector<4x8xi8>
+
+  %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8>
+  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8>
+
+  // RHS test data
+  %rhs_cst = arith.constant dense<[[-17, -50,  -1,  48, -13,  22,  39,  33],
+                                   [-35, -24,  37, -32,  33,  30, -11, -17],
+                                   [-28,  31,   3, -44, -15, -27,  22,  35],
+                                   [-23,  39,  48,  26, -23,  32, -39, -38]]> : vector<4x8xi8>
+
+  %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref<?xi8>
+  %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} :  memref<?xi8>, vector<[32]xi8>
+  %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
+
+  // Matrix multiplication and accumulate with transposed RHS.
+  %0 = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %1 = arith.extsi %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
+  %2 = vector.contract {indexing_maps = #packed_maps,
+                        iterator_types = ["parallel", "parallel", "reduction"],
+                        kind = #vector.kind<add>} %0, %1, %acc
+    : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32>
+
+  // Display the result of the multiplication
+  vector.print str "Result(SMMLA):\n"
+  %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32>
+  vector.print %u0 : vector<[4]xi32>
+  vector.print %u1 : vector<[4]xi32>
+  vector.print %u2 : vector<[4]xi32>
+  vector.print %u3 : vector<[4]xi32>
+
+  // Deallocate the buffers.
+  memref.dealloc %acc_mem : memref<4x?xi32>
+  memref.dealloc %lhs_mem : memref<4x8xi8>
+  memref.dealloc %rhs_mem : memref<?xi8>
+
+  return
+}
+
+// Test the operation where both LHS and RHS are interpreted as unsigned, hence
+// we ultimately emit and execute the `ummla` instruction.
+
+// CHECK-IR-LABEL: llvm.func @test_ummla
+// CHECK-IR-COUNT-4: arm_sve.intr.ummla
+func.func @test_ummla() {
+
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i8 = arith.constant 0 : i8
+
+  // Accumulator test data
+  %acc_cst = arith.constant dense<[[16, 16, 48, 40],
+                                   [40, 24, 35, 12],
+                                   [33, 24, 29, 19],
+                                   [28, 13, 33, 18]]> : vector<4x4xi32>
+
+  %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32>
+  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32>
+
+  // LHS test data
+  %lhs_cst = arith.constant dense<[[35, 42, 37, 49, 36, 36, 23, 33],
+                                   [39, 34, 33, 45, 43, 10, 44, 47],
+                                   [18, 35, 29, 25, 36, 33, 28, 29],
+                                   [26, 49, 43, 32, 27, 16, 45, 33]]> : vector<4x8xi8>
+
+  %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8>
+  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8>
+
+  // RHS test data
+  %rhs_cst = arith.constant dense<[[18, 31, 37, 35, 44, 22, 37, 28],
+                                   [21, 22, 49, 39, 30, 28, 35, 37],
+                                   [21, 47, 39, 35, 23, 43, 24, 49],
+                                   [49, 49, 40, 32, 37, 20, 47, 40]]> : vector<4x8xi8>
+
+  %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref<?xi8>
+  %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} :  memref<?xi8>, vector<[32]xi8>
+  %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
+
+  // Matrix multiplication and accumulate with transposed RHS.
+  %0 = arith.extui %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %1 = arith.extui %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
+  %2 = vector.contract {indexing_maps = #packed_maps,
+                        iterator_types = ["parallel", "parallel", "reduction"],
+                        kind = #vector.kind<add>} %0, %1, %acc
+    : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32>
+
+  // Display the result of the multiplication
+  vector.print str "Result(UMMLA):\n"
+  %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32>
+  vector.print %u0 : vector<[4]xi32>
+  vector.print %u1 : vector<[4]xi32>
+  vector.print %u2 : vector<[4]xi32>
+  vector.print %u3 : vector<[4]xi32>
+
+  // Deallocate the buffers.
+  memref.dealloc %acc_mem : memref<4x?xi32>
+  memref.dealloc %lhs_mem : memref<4x8xi8>
+  memref.dealloc %rhs_mem : memref<?xi8>
+
+  return
+}
+
+// Test the operation where LHS is interpreted as unsigned and RHS is
+// interpreted as signed, hence we ultimately emit and execute the `usmmla`
+// instruction.
+
+// CHECK-IR-LABEL: llvm.func @test_usmmla
+// CHECK-IR-COUNT-4: arm_sve.intr.usmmla
+func.func @test_usmmla() {
+
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i8 = arith.constant 0 : i8
+
+  // Accumulator test data
+  %acc_cst = arith.constant dense<[[-44,  20,  44, -46],
+                                   [ -8,  25, -34,  26],
+                                   [-20, -36,  -3,  39],
+                                   [-48, -31, -25, -21]]> : vector<4x4xi32>
+
+  %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32>
+  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32>
+
+  // LHS test data
+  %lhs_cst = arith.constant dense<[[153, 161,  24, 157, 211, 154,  52,  27],
+                                   [168,  77, 136, 124, 249,  28,  13, 122],
+                                   [ 97,  82, 181,  39,  53,  25,  80, 240],
+                                   [184, 227, 106, 165, 126, 113, 121, 228]]> : vector<4x8xi8>
+
+  %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8>
+  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8>
+
+  // RHS test data
+  %rhs_cst = arith.constant dense<[[ 40,  27,  37,  43,  38,  -6,  37,  49],
+                                   [-17, -50,  -1,  48, -13,  22,  39,  33],
+                                   [-35, -24,  37, -32,  33,  30, -11, -17],
+                                   [-28,  31,   3, -44, -15, -27,  22,  35]]> : vector<4x8xi8>
+
+  %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref<?xi8>
+  %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} :  memref<?xi8>, vector<[32]xi8>
+  %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
+
+  // Matrix multiplication and accumulate with transposed RHS.
+  %0 = arith.extui %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %1 = arith.extsi %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
+  %2 = vector.contract {indexing_maps = #packed_maps,
+                        iterator_types = ["parallel", "parallel", "reduction"],
+                        kind = #vector.kind<add>} %0, %1, %acc
+    : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32>
+
+  // Display the result of the multiplication
+  vector.print str "Result(USMMLA):\n"
+  %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32>
+  vector.print %u0 : vector<[4]xi32>
+  vector.print %u1 : vector<[4]xi32>
+  vector.print %u2 : vector<[4]xi32>
+  vector.print %u3 : vector<[4]xi32>
+
+  // Deallocate the buffers.
+  memref.dealloc %acc_mem : memref<4x?xi32>
+  memref.dealloc %lhs_mem : memref<4x8xi8>
+  memref.dealloc %rhs_mem : memref<?xi8>
+
+  return
+}
+
+// Test the operation where LHS is interpreted as signed and RHS is interpreted
+// as unsigned. In this test we ultimately emit end execute the `usmmla`
+// instruction with reversed operands, see `LowerContractionToSVEI8MMPattern.cpp`
+// for more details.
+
+// CHECK-IR-LABEL: llvm.func @test_summla
+// CHECK-IR-COUNT-4: arm_sve.intr.usmmla
+func.func @test_summla() {
+
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i8 = arith.constant 0 : i8
+
+  // Accumulator test data
+  %acc_cst = arith.constant dense<[[-44,  20,  44, -46],
+                                   [ -8,  25, -34,  26],
+                                   [-20, -36,  -3,  39],
+                                   [-48, -31, -25, -21]]> : vector<4x4xi32>
+
+  %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32>
+  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32>
+
+  // LHS test data
+  %lhs_cst = arith.constant dense<[[-35, -27, -36, -31,  23, -34,  -8, -33],
+                                   [-20,  17, -32, -47,  37,  22,  -7, -21],
+                                   [ -7, -35,  20,  -4,  39,  46, -23,  40],
+                                   [ 40,  27,  37,  43,  38,  -6,  37,  49]]> : vector<4x8xi8>
+
+  %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8>
+  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8>
+
+  // RHS test data
+  %rhs_cst = arith.constant dense<[[125, 171, 138, 187, 108, 175,  82,  99],
+                                   [221,  25, 164,  97, 156, 221, 218, 177],
+                                   [171, 160, 219, 191, 144,  45, 161, 210],
+                                   [223, 165, 123,  99, 108,  86,  37,  92]]> : vector<4x8xi8>
+
+  %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref<?xi8>
+  %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} :  memref<?xi8>, vector<[32]xi8>
+  %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
+
+  // Matrix multiplication and accumulate with transposed RHS.
+  %0 = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %1 = arith.extui %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
+  %2 = vector.contract {indexing_maps = #packed_maps,
+                        iterator_types = ["parallel", "parallel", "reduction"],
+                        kind = #vector.kind<add>} %0, %1, %acc
+    : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32>
+
+  // Display the result of the multiplication
+  vector.print str "Result(SUMMLA (i.e. USMMLA transposed)):\n"
+  %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32>
+  vector.print %u0 : vector<[4]xi32>
+  vector.print %u1 : vector<[4]xi32>
+  vector.print %u2 : vector<[4]xi32>
+  vector.print %u3 : vector<[4]xi32>
+
+  // Deallocate the buffers.
+  memref.dealloc %acc_mem : memref<4x?xi32>
+  memref.dealloc %lhs_mem : memref<4x8xi8>
+  memref.dealloc %rhs_mem : memref<?xi8>
+
+  return
+}
+
+// Perform each test with SVE vector lengths 128 bits and 256 bits (i.e. VSCALEs
+// 1 and 2, respectively). The vector length is set via the `setArmVLBits`
+// function. The effect of setting a different vector length is that the tests
+// allocate and operate on different sized buffers (see `prepare<X>TestData`
+// functions).
+
+func.func @main() {
+  %c128 = arith.constant 128 : i32
+  %c256 = arith.constant 256 : i32
+
+// CHECK-LABEL: Result(SMMLA):
+// CHECK: ( -1999,  1941,   685, -2879 )
+// CHECK: ( -3705,  2952,   987,  -685 )
+// CHECK: (  2565,  4157, -1589,  -357 )
+// CHECK: (  2383, -2252,    32, -1365 )
+  func.call @setArmVLBits(%c128) : (i32) -> ()
+  func.call @test_smmla() : () -> ()
+
+// CHECK: Result(SMMLA):
+// CHECK: ( -1999,  1941,   685, -2879, -1999,  1941,   685, -2879 )
+// CHECK: ( -3705,  2952,   987,  -685, -3705,  2952,   987,  -685 )
+// CHECK: (  2565,  4157, -1589,  -357,  2565,  4157, -1589,  -357 )
+// CHECK: (  2383, -2252,    32, -1365,  2383, -2252,    32, -1365 )
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @test_smmla() : () -> ()
+
+// CHECK-LABEL: Result(UMMLA):
+// CHECK: ( 9183, 9513, 10460, 11314 )
+// CHECK: ( 9648, 9812, 10092, 12088 )
+// CHECK: ( 7548, 7625,  8398,  9044 )
+// CHECK: ( 8855, 9046,  9685, 11191 )
+  func.call @setArmVLBits(%c128) : (i32) -> ()
+  func.call @test_ummla() : () -> ()
+
+// CHECK: Result(UMMLA):
+// CHECK: ( 9183, 9513, 10460, 11314, 9183, 9513, 10460, 11314 )
+// CHECK: ( 9648, 9812, 10092, 12088, 9648, 9812, 10092, 12088 )
+// CHECK: ( 7548, 7625,  8398,  9044, 7548, 7625,  8398,  9044 )
+// CHECK: ( 8855, 9046,  9685, 11191, 8855, 9046,  9685, 11191 )
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @test_ummla() : () -> ()
+
+// CHECK-LABEL: Result(USMMLA):
+// CHECK: ( 28403,  445,  -2759, -11409 )
+// CHECK: ( 34908, 1047,    142,  -7274 )
+// CHECK: ( 31032, 6807,  -2378,   7382 )
+// CHECK: ( 44217, 6396, -10930,    623 )
+  func.call @setArmVLBits(%c128) : (i32) -> ()
+  func.call @test_usmmla() : () -> ()
+
+// CHECK: Result(USMMLA):
+// CHECK: ( 28403,  445,  -2759, -11409, 28403,  445,  -2759, -11409 )
+// CHECK: ( 34908, 1047,    142,  -7274, 34908, 1047,    142,  -7274 )
+// CHECK: ( 31032, 6807,  -2378,   7382, 31032, 6807,  -2378,   7382 )
+// CHECK: ( 44217, 6396, -10930,    623, 44217, 6396, -10930,    623 )
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @test_usmmla() : () -> ()
+
+// CHECK-LABEL: Result(SUMMLA (i.e. USMMLA transposed)):
+// CHECK: ( -27190, -28812, -30502, -23575 )
+// CHECK: (  -7613,  -8386, -15938,  -6521 )
+// CHECK: (   9468,  18750,   9199,   5764 )
+// CHECK: (  33655,  41064,  48900,  31627 )
+  func.call @setArmVLBits(%c128) : (i32) -> ()
+  func.call @test_summla() : () -> ()
+
+// CHECK: Result(SUMMLA (i.e. USMMLA transposed)):
+// CHECK: ( -27190, -28812, -30502, -23575, -27190, -28812, -30502, -23575 )
+// CHECK: (  -7613,  -8386, -15938,  -6521,  -7613,  -8386, -15938,  -6521 )
+// CHECK: (   9468,  18750,   9199,   5764,   9468,  18750,   9199,   5764 )
+// CHECK: (  33655,  41064,  48900,  31627,  33655,  41064,  48900,  31627 )
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @test_summla() : () -> ()
+
+  return
+}
+
+func.func private @setArmVLBits(%bits : i32)
+func.func private @printMemrefI32(%ptr : memref<*xi32>)

From c377ce1216a8ce73c940d2366a7bf223790f43b4 Mon Sep 17 00:00:00 2001
From: Mary Kassayova <mary.kassayova@arm.com>
Date: Tue, 17 Jun 2025 11:07:43 +0100
Subject: [PATCH 0617/1322] [AArch64][VecLib] Add libmvec support for AArch64
 targets (#143696)

This patch adds support for the `libmvec` vector library on AArch64
targets. Currently, all `libmvec` functions in GLIBC version 2.40 are
supported. The full list of math functions enabled can be found
[here](https://github.com/bminor/glibc/blob/96abd59bf2a11ddd4e7ccaac840ec13c0b62d3ba/sysdeps/aarch64/fpu/Versions)
(up to GLIBC 2.40).

Previously, `libmvec` was only supported on x86_64 targets. Attempts to
use it on AArch64 resulted in the following error from Clang:
`unsupported option 'libmvec' for target 'aarch64'`.
---
 clang/docs/ReleaseNotes.rst                   |    2 +
 clang/include/clang/Driver/Options.td         |    5 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |    9 +-
 clang/test/Driver/fveclib.c                   |   10 +-
 llvm/include/llvm/Analysis/VecFuncs.def       |  260 +++++
 llvm/lib/Analysis/TargetLibraryInfo.cpp       |   12 +
 .../replace-with-veclib-libmvec-scalable.ll   |  579 +++++++++
 .../AArch64/replace-with-veclib-libmvec.ll    |  577 +++++++++
 .../AArch64/veclib-function-calls.ll          | 1035 +++++++++++++++++
 .../AArch64/veclib-intrinsic-calls.ll         |  735 ++++++++++++
 llvm/test/Transforms/Util/add-TLI-mappings.ll |   28 +-
 11 files changed, 3243 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll
 create mode 100644 llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d32d3921b74f..03641f5d0ea0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -359,6 +359,8 @@ Modified Compiler Flags
 
 - The ``-fchar8_t`` flag is no longer considered in non-C++ languages modes. (#GH55373)
 
+- The ``-fveclib=libmvec`` option now supports AArch64 targets (requires GLIBC 2.40 or newer).
+
 Removed Compiler Flags
 -------------------------
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1ba52d50056e..0ffd8c40da7d 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3479,8 +3479,9 @@ def fveclib : Joined<["-"], "fveclib=">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
     HelpText<"Use the given vector functions library">,
     HelpTextForVariants<[ClangOption, CC1Option],
-      "Use the given vector functions library. "
-      "Note: -fveclib={ArmPL,SLEEF} implies -fno-math-errno">,
+      "Use the given vector functions library.\n"
+      "  Note: -fveclib={ArmPL,SLEEF,libmvec} implies -fno-math-errno.\n"
+      "  Note: -fveclib=libmvec on AArch64 requires GLIBC 2.40 or newer.">,
     Values<"Accelerate,libmvec,MASSV,SVML,SLEEF,Darwin_libsystem_m,ArmPL,AMDLIBM,none">,
     NormalizedValuesScope<"llvm::driver::VectorLibrary">,
     NormalizedValues<["Accelerate", "LIBMVEC", "MASSV", "SVML", "SLEEF",
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index bb7e5f424337..a78a1c897818 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5702,11 +5702,18 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
           Triple.getArch() != llvm::Triple::x86_64)
         D.Diag(diag::err_drv_unsupported_opt_for_target)
             << Name << Triple.getArchName();
-    } else if (Name == "libmvec" || Name == "AMDLIBM") {
+    } else if (Name == "AMDLIBM") {
       if (Triple.getArch() != llvm::Triple::x86 &&
           Triple.getArch() != llvm::Triple::x86_64)
         D.Diag(diag::err_drv_unsupported_opt_for_target)
             << Name << Triple.getArchName();
+    } else if (Name == "libmvec") {
+      if (Triple.getArch() != llvm::Triple::x86 &&
+          Triple.getArch() != llvm::Triple::x86_64 &&
+          Triple.getArch() != llvm::Triple::aarch64 &&
+          Triple.getArch() != llvm::Triple::aarch64_be)
+        D.Diag(diag::err_drv_unsupported_opt_for_target)
+            << Name << Triple.getArchName();
     } else if (Name == "SLEEF" || Name == "ArmPL") {
       if (Triple.getArch() != llvm::Triple::aarch64 &&
           Triple.getArch() != llvm::Triple::aarch64_be &&
diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c
index 5420555c36a2..c57e9aa7a3cc 100644
--- a/clang/test/Driver/fveclib.c
+++ b/clang/test/Driver/fveclib.c
@@ -1,6 +1,7 @@
 // RUN: %clang -### -c -fveclib=none %s 2>&1 | FileCheck --check-prefix=CHECK-NOLIB %s
 // RUN: %clang -### -c -fveclib=Accelerate %s 2>&1 | FileCheck --check-prefix=CHECK-ACCELERATE %s
 // RUN: %clang -### -c --target=x86_64-unknown-linux-gnu -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-libmvec %s
+// RUN: %clang -### -c --target=aarch64-linux-gnu -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-LIBMVEC-AARCH64 %s
 // RUN: %clang -### -c --target=x86_64-unknown-linux-gnu -fveclib=AMDLIBM %s 2>&1 | FileCheck --check-prefix=CHECK-AMDLIBM %s
 // RUN: %clang -### -c -fveclib=MASSV %s 2>&1 | FileCheck --check-prefix=CHECK-MASSV %s
 // RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN_LIBSYSTEM_M %s
@@ -12,6 +13,7 @@
 // CHECK-NOLIB: "-fveclib=none"
 // CHECK-ACCELERATE: "-fveclib=Accelerate"
 // CHECK-libmvec: "-fveclib=libmvec"
+// CHECK-LIBMVEC-AARCH64: "-fveclib=libmvec"
 // CHECK-AMDLIBM: "-fveclib=AMDLIBM"
 // CHECK-MASSV: "-fveclib=MASSV"
 // CHECK-DARWIN_LIBSYSTEM_M: "-fveclib=Darwin_libsystem_m"
@@ -23,7 +25,6 @@
 
 // RUN: not %clang --target=x86 -c -fveclib=SLEEF %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 // RUN: not %clang --target=x86 -c -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
-// RUN: not %clang --target=aarch64 -c -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 // RUN: not %clang --target=aarch64 -c -fveclib=SVML %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 // RUN: not %clang --target=aarch64 -c -fveclib=AMDLIBM %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 // CHECK-ERROR: unsupported option {{.*}} for target
@@ -43,6 +44,9 @@
 // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=libmvec -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC %s
 // CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC"
 
+// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=libmvec -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC-AARCH64 %s
+// CHECK-LTO-LIBMVEC-AARCH64: "-plugin-opt=-vector-library=LIBMVEC"
+
 // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=AMDLIBM -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-AMDLIBM %s
 // CHECK-LTO-AMDLIBM: "-plugin-opt=-vector-library=AMDLIBM"
 
@@ -68,6 +72,10 @@
 // CHECK-ERRNO-LIBMVEC: "-fveclib=libmvec"
 // CHECK-ERRNO-LIBMVEC-SAME: "-fmath-errno"
 
+// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-ERRNO-LIBMVEC-AARCH64 %s
+// CHECK-ERRNO-LIBMVEC-AARCH64: "-fveclib=libmvec"
+// CHECK-ERRNO-LIBMVEC-AARCH64-SAME: "-fmath-errno"
+
 // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=AMDLIBM %s 2>&1 | FileCheck --check-prefix=CHECK-ERRNO-AMDLIBM %s
 // CHECK-ERRNO-AMDLIBM: "-fveclib=AMDLIBM"
 // CHECK-ERRNO-AMDLIBM-SAME: "-fmath-errno"
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 68753a2497db..4015df990729 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -237,6 +237,266 @@ TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVdN4v_log", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVbN4v_logf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVdN8v_logf", FIXED(8), "_ZGV_LLVM_N8v")
 
+#elif defined(TLI_DEFINE_LIBMVEC_AARCH64_VECFUNCS)
+
+TLI_DEFINE_VECFUNC("acos", "_ZGVnN2v_acos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acosf", "_ZGVnN2v_acosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acosf", "_ZGVnN4v_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acos", "_ZGVsMxv_acos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("acosf", "_ZGVsMxv_acosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.acos.f64", "_ZGVnN2v_acos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVnN2v_acosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVnN4v_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.acos.f64", "_ZGVsMxv_acos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVsMxv_acosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("acosh", "_ZGVnN2v_acosh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acoshf", "_ZGVnN2v_acoshf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acoshf", "_ZGVnN4v_acoshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acosh", "_ZGVsMxv_acosh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("acoshf", "_ZGVsMxv_acoshf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("asin", "_ZGVnN2v_asin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinf", "_ZGVnN2v_asinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinf", "_ZGVnN4v_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asin", "_ZGVsMxv_asin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("asinf", "_ZGVsMxv_asinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "_ZGVnN2v_asin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVnN2v_asinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVnN4v_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "_ZGVsMxv_asin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVsMxv_asinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("asinh", "_ZGVnN2v_asinh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinhf", "_ZGVnN2v_asinhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinhf", "_ZGVnN4v_asinhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinh", "_ZGVsMxv_asinh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("asinhf", "_ZGVsMxv_asinhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("atan", "_ZGVnN2v_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanf", "_ZGVnN2v_atanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanf", "_ZGVnN4v_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atan", "_ZGVsMxv_atan", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("atanf", "_ZGVsMxv_atanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "_ZGVnN2v_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVnN2v_atanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVnN4v_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "_ZGVsMxv_atan", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVsMxv_atanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("atan2", "_ZGVnN2vv_atan2", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atan2f", "_ZGVnN2vv_atan2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atan2f", "_ZGVnN4vv_atan2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atan2", "_ZGVsMxvv_atan2", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("atan2f", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.atan2.f64", "_ZGVnN2vv_atan2", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan2.f32", "_ZGVnN2vv_atan2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan2.f32", "_ZGVnN4vv_atan2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan2.f64", "_ZGVsMxvv_atan2", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.atan2.f32", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("atanh", "_ZGVnN2v_atanh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanhf", "_ZGVnN2v_atanhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanhf", "_ZGVnN4v_atanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanh", "_ZGVsMxv_atanh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("atanhf", "_ZGVsMxv_atanhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("cbrt", "_ZGVnN2v_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cbrtf", "_ZGVnN2v_cbrtf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cbrtf", "_ZGVnN4v_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cbrt", "_ZGVsMxv_cbrt",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("cbrtf", "_ZGVsMxv_cbrtf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("cos", "_ZGVnN2v_cos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cosf", "_ZGVnN2v_cosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cosf", "_ZGVnN4v_cosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cos", "_ZGVsMxv_cos",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("cosf", "_ZGVsMxv_cosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVnN2v_cos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVnN2v_cosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVnN4v_cosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVsMxv_cos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVsMxv_cosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("cosh", "_ZGVnN2v_cosh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("coshf", "_ZGVnN2v_coshf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("coshf", "_ZGVnN4v_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cosh", "_ZGVsMxv_cosh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("coshf", "_ZGVsMxv_coshf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.cosh.f64", "_ZGVnN2v_cosh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVnN2v_coshf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVnN4v_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cosh.f64", "_ZGVsMxv_cosh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVsMxv_coshf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("erf", "_ZGVnN2v_erf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erff", "_ZGVnN2v_erff", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erff", "_ZGVnN4v_erff", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erf", "_ZGVsMxv_erf",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("erff", "_ZGVsMxv_erff", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("erfc", "_ZGVnN2v_erfc", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erfcf", "_ZGVnN2v_erfcf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erfcf", "_ZGVnN4v_erfcf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erfc", "_ZGVsMxv_erfc",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("erfcf", "_ZGVsMxv_erfcf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("exp", "_ZGVnN2v_exp", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expf", "_ZGVnN2v_expf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expf", "_ZGVnN4v_expf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp", "_ZGVsMxv_exp",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("expf", "_ZGVsMxv_expf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVnN2v_expf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVsMxv_exp", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVsMxv_expf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("exp10", "_ZGVnN2v_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp10f", "_ZGVnN2v_exp10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp10f", "_ZGVnN4v_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp10", "_ZGVsMxv_exp10",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("exp10f", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.exp10.f64", "_ZGVnN2v_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "_ZGVnN2v_exp10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "_ZGVnN4v_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp10.f64", "_ZGVsMxv_exp10", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("exp2", "_ZGVnN2v_exp2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp2f", "_ZGVnN2v_exp2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp2f", "_ZGVnN4v_exp2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp2", "_ZGVsMxv_exp2",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("exp2f", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVnN2v_exp2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVnN2v_exp2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVnN4v_exp2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVsMxv_exp2", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("expm1", "_ZGVnN2v_expm1", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expm1f", "_ZGVnN2v_expm1f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expm1f", "_ZGVnN4v_expm1f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expm1", "_ZGVsMxv_expm1",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("expm1f", "_ZGVsMxv_expm1f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("hypot", "_ZGVnN2vv_hypot", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("hypotf", "_ZGVnN2vv_hypotf", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("hypotf", "_ZGVnN4vv_hypotf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("hypot", "_ZGVsMxvv_hypot", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("hypotf", "_ZGVsMxvv_hypotf", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("log", "_ZGVnN2v_log", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("logf", "_ZGVnN2v_logf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("logf", "_ZGVnN4v_logf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log", "_ZGVsMxv_log",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("logf", "_ZGVsMxv_logf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVnN2v_log", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVnN2v_logf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVnN4v_logf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVsMxv_log", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVsMxv_logf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("log10", "_ZGVnN2v_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log10f", "_ZGVnN2v_log10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log10f", "_ZGVnN4v_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log10", "_ZGVsMxv_log10",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("log10f", "_ZGVsMxv_log10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVnN2v_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVnN2v_log10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVnN4v_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVsMxv_log10", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVsMxv_log10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("log1p", "_ZGVnN2v_log1p", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log1pf", "_ZGVnN2v_log1pf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log1pf", "_ZGVnN4v_log1pf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log1p", "_ZGVsMxv_log1p",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("log1pf", "_ZGVsMxv_log1pf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("log2", "_ZGVnN2v_log2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log2f", "_ZGVnN2v_log2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log2f", "_ZGVnN4v_log2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log2", "_ZGVsMxv_log2",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("log2f", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "_ZGVnN2v_log2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVnN2v_log2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVnN4v_log2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "_ZGVsMxv_log2", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("pow", "_ZGVnN2vv_pow", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("powf", "_ZGVnN2vv_powf", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("powf", "_ZGVnN4vv_powf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("pow", "_ZGVsMxvv_pow", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("powf", "_ZGVsMxvv_powf", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVnN2vv_powf", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVsMxvv_pow", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVsMxvv_powf", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("sin", "_ZGVnN2v_sin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinf", "_ZGVnN2v_sinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinf", "_ZGVnN4v_sinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sin", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVnN2v_sin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVnN2v_sinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVnN4v_sinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("sinh", "_ZGVnN2v_sinh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinhf", "_ZGVnN2v_sinhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinhf", "_ZGVnN4v_sinhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.sinh.f64", "_ZGVnN2v_sinh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVnN2v_sinhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVnN4v_sinhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sinh.f64", "_ZGVsMxv_sinh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("tan", "_ZGVnN2v_tan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanf", "_ZGVnN2v_tanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanf", "_ZGVnN4v_tanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tan", "_ZGVsMxv_tan",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("tanf", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVnN2v_tan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVnN2v_tanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVnN4v_tanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVsMxv_tan", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("tanh", "_ZGVnN2v_tanh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanhf", "_ZGVnN2v_tanhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanhf", "_ZGVnN4v_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanh", "_ZGVsMxv_tanh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.tanh.f64", "_ZGVnN2v_tanh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVnN2v_tanhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVnN4v_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tanh.f64", "_ZGVsMxv_tanh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
 #elif defined(TLI_DEFINE_MASSV_VECFUNCS)
 // IBM MASS library's vector Functions
 
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index c8b568354965..a3ed09313439 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -1299,6 +1299,14 @@ static const VecDesc VecFuncs_LIBMVEC_X86[] = {
 #undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS
 };
 
+static const VecDesc VecFuncs_LIBMVEC_AARCH64[] = {
+#define TLI_DEFINE_LIBMVEC_AARCH64_VECFUNCS
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX, CC)               \
+  {SCAL, VEC, VF, MASK, VABI_PREFIX, CC},
+#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_LIBMVEC_AARCH64_VECFUNCS
+};
+
 static const VecDesc VecFuncs_MASSV[] = {
 #define TLI_DEFINE_MASSV_VECFUNCS
 #include "llvm/Analysis/VecFuncs.def"
@@ -1376,6 +1384,10 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
     case llvm::Triple::x86_64:
       addVectorizableFunctions(VecFuncs_LIBMVEC_X86);
       break;
+    case llvm::Triple::aarch64:
+    case llvm::Triple::aarch64_be:
+      addVectorizableFunctions(VecFuncs_LIBMVEC_AARCH64);
+      break;
     }
     break;
   }
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll
new file mode 100644
index 000000000000..1b541d1330aa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll
@@ -0,0 +1,579 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -mattr=+sve -vector-library=LIBMVEC -replace-with-veclib -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;.
+; CHECK: @llvm.compiler.used = appending global [34 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxvv_pow, ptr @_ZGVsMxvv_powf, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxv_tan, ptr @_ZGVsMxv_tanf, ptr @_ZGVsMxv_acos, ptr @_ZGVsMxv_acosf, ptr @_ZGVsMxv_asin, ptr @_ZGVsMxv_asinf, ptr @_ZGVsMxv_atan, ptr @_ZGVsMxv_atanf, ptr @_ZGVsMxvv_atan2, ptr @_ZGVsMxvv_atan2f, ptr @_ZGVsMxv_cosh, ptr @_ZGVsMxv_coshf, ptr @_ZGVsMxv_sinh, ptr @_ZGVsMxv_sinhf, ptr @_ZGVsMxv_tanh, ptr @_ZGVsMxv_tanhf], section "llvm.metadata"
+;.
+define <vscale x 2 x double> @llvm_ceil_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_ceil_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_ceil_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_ceil_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_copysign_vscale_f64(<vscale x 2 x double> %mag, <vscale x 2 x double> %sgn) {
+; CHECK-LABEL: @llvm_copysign_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[MAG:%.*]], <vscale x 2 x double> [[SGN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> %mag, <vscale x 2 x double> %sgn)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_copysign_vscale_f32(<vscale x 4 x float> %mag, <vscale x 4 x float> %sgn) {
+; CHECK-LABEL: @llvm_copysign_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> [[MAG:%.*]], <vscale x 4 x float> [[SGN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> %mag, <vscale x 4 x float> %sgn)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_cos_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_cos_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_exp_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_exp_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_exp10_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_exp10_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_exp2_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_exp2_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_fabs_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_fabs_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_fabs_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_fabs_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_floor_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_floor_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_floor_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_floor_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_fma_vscale_f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c ) {
+; CHECK-LABEL: @llvm_fma_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]], <vscale x 2 x double> [[C:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_fma_vscale_f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) {
+; CHECK-LABEL: @llvm_fma_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_log_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_log_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_log10_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_log10_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_log2_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_log2_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_maxnum_vscale_f64(<vscale x 2 x double> %in0, <vscale x 2 x double> %in1) {
+; CHECK-LABEL: @llvm_maxnum_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[IN0:%.*]], <vscale x 2 x double> [[IN1:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> %in0, <vscale x 2 x double> %in1)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_maxnum_vscale_f32(<vscale x 4 x float> %in0, <vscale x 4 x float> %in1) {
+; CHECK-LABEL: @llvm_maxnum_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[IN0:%.*]], <vscale x 4 x float> [[IN1:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %in0, <vscale x 4 x float> %in1)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_minnum_vscale_f64(<vscale x 2 x double> %in0, <vscale x 2 x double> %in1) {
+; CHECK-LABEL: @llvm_minnum_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[IN0:%.*]], <vscale x 2 x double> [[IN1:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> %in0, <vscale x 2 x double> %in1)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_minnum_vscale_f32(<vscale x 4 x float> %in0, <vscale x 4 x float> %in1) {
+; CHECK-LABEL: @llvm_minnum_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[IN0:%.*]], <vscale x 4 x float> [[IN1:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %in0, <vscale x 4 x float> %in1)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_nearbyint_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_nearbyint_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_nearbyint_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_nearbyint_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_pow_vscale_f64(<vscale x 2 x double> %in, <vscale x 2 x double> %pow) {
+; CHECK-LABEL: @llvm_pow_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x double> [[POW:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double> %in, <vscale x 2 x double> %pow)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_pow_vscale_f32(<vscale x 4 x float> %in, <vscale x 4 x float> %pow) {
+; CHECK-LABEL: @llvm_pow_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x float> [[POW:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %in, <vscale x 4 x float> %pow)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_rint_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_rint_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_rint_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_rint_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_round_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_round_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_round_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_round_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_sin_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_sin_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_sqrt_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_sqrt_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_sqrt_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_sqrt_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_tan_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_tan_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.tan.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_tan_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_tan_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.tan.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_acos_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_acos_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.acos.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_acos_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_acos_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.acos.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_asin_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_asin_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.asin.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_asin_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_asin_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.asin.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_atan_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_atan_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.atan.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_atan_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_atan_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.atan.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_atan2_vscale_f64(<vscale x 2 x double> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: @llvm_atan2_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.atan2.nxv2f64(<vscale x 2 x double> %x, <vscale x 2 x double> %y)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_atan2_vscale_f32(<vscale x 4 x float> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: @llvm_atan2_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.atan2.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x float> %y)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_cosh_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_cosh_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.cosh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_cosh_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_cosh_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.cosh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_sinh_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_sinh_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.sinh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_sinh_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_sinh_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.sinh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_tanh_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_tanh_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.tanh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_tanh_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.tanh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+
+define <vscale x 2 x double> @llvm_trunc_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_trunc_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_trunc_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_trunc_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.tan.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.tan.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>)
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { "target-features"="+sve" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+sve" }
+;.
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll
new file mode 100644
index 000000000000..6323d942a08e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll
@@ -0,0 +1,577 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -vector-library=LIBMVEC -replace-with-veclib -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;.
+; CHECK: @llvm.compiler.used = appending global [34 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2vv_pow, ptr @_ZGVnN4vv_powf, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2v_tan, ptr @_ZGVnN4v_tanf, ptr @_ZGVnN2v_acos, ptr @_ZGVnN4v_acosf, ptr @_ZGVnN2v_asin, ptr @_ZGVnN4v_asinf, ptr @_ZGVnN2v_atan, ptr @_ZGVnN4v_atanf, ptr @_ZGVnN2vv_atan2, ptr @_ZGVnN4vv_atan2f, ptr @_ZGVnN2v_cosh, ptr @_ZGVnN4v_coshf, ptr @_ZGVnN2v_sinh, ptr @_ZGVnN4v_sinhf, ptr @_ZGVnN2v_tanh, ptr @_ZGVnN4v_tanhf], section "llvm.metadata"
+;.
+define <2 x double> @llvm_ceil_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_ceil_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.ceil.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.ceil.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_ceil_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_ceil_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_copysign_f64(<2 x double> %mag, <2 x double> %sgn) {
+; CHECK-LABEL: @llvm_copysign_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> [[MAG:%.*]], <2 x double> [[SGN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sgn)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_copysign_f32(<4 x float> %mag, <4 x float> %sgn) {
+; CHECK-LABEL: @llvm_copysign_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> [[MAG:%.*]], <4 x float> [[SGN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sgn)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_cos_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_cos_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_cos(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.cos.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_cos_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_cos_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_cosf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.cos.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_exp_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_exp_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.exp.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_exp_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_exp_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_expf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.exp.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_exp10_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_exp10_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp10(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.exp10.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_exp10_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_exp10_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.exp10.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_exp2_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_exp2_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp2(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.exp2.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_exp2_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_exp2_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_fabs_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_fabs_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.fabs.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_fabs_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_fabs_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_floor_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_floor_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.floor.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_floor_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_floor_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_fma_f64(<2 x double> %a, <2 x double> %b, <2 x double> %c ) {
+; CHECK-LABEL: @llvm_fma_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_fma_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @llvm_fma_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_log_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_log_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_log(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.log.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_log_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_log_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_logf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.log.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_log10_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_log10_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_log10(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.log10.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_log10_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_log10_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_log10f(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.log10.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_log2_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_log2_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_log2(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.log2.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_log2_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_log2_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_log2f(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.log2.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_maxnum_f64(<2 x double> %in0, <2 x double> %in1) {
+; CHECK-LABEL: @llvm_maxnum_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> [[IN0:%.*]], <2 x double> [[IN1:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> %in0, <2 x double> %in1)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_maxnum_f32(<4 x float> %in0, <4 x float> %in1) {
+; CHECK-LABEL: @llvm_maxnum_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> [[IN0:%.*]], <4 x float> [[IN1:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %in0, <4 x float> %in1)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_minnum_f64(<2 x double> %in0, <2 x double> %in1) {
+; CHECK-LABEL: @llvm_minnum_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.minnum.v2f64(<2 x double> [[IN0:%.*]], <2 x double> [[IN1:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.minnum.v2f64(<2 x double> %in0, <2 x double> %in1)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_minnum_f32(<4 x float> %in0, <4 x float> %in1) {
+; CHECK-LABEL: @llvm_minnum_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.minnum.v4f32(<4 x float> [[IN0:%.*]], <4 x float> [[IN1:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %in0, <4 x float> %in1)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_nearbyint_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_nearbyint_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_nearbyint_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_nearbyint_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_pow_f64(<2 x double> %in, <2 x double> %pow) {
+; CHECK-LABEL: @llvm_pow_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2vv_pow(<2 x double> [[IN:%.*]], <2 x double> [[POW:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %in, <2 x double> %pow)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_pow_f32(<4 x float> %in, <4 x float> %pow) {
+; CHECK-LABEL: @llvm_pow_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4vv_powf(<4 x float> [[IN:%.*]], <4 x float> [[POW:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.pow.v4f32(<4 x float> %in, <4 x float> %pow)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_rint_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_rint_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.rint.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.rint.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_rint_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_rint_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.rint.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_round_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_round_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.round.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.round.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_round_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_round_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.round.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_sin_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_sin_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_sin(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.sin.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_sin_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_sin_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_sinf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.sin.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_sqrt_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_sqrt_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_sqrt_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_sqrt_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_tan_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_tan_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_tan(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.tan.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_tan_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_tan_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_tanf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.tan.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_acos_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_acos_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_acos(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.acos.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_acos_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_acos_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_acosf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.acos.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_asin_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_asin_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_asin(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.asin.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_asin_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_asin_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_asinf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.asin.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_atan_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_atan_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_atan(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.atan.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_atan_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_atan_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_atanf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.atan.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_atan2_f64(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @llvm_atan2_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[INX:%.*]], <2 x double> [[INY:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.atan2.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_atan2_f32(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @llvm_atan2_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[INX:%.*]], <4 x float> [[INY:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.atan2.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_cosh_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_cosh_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_cosh(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.cosh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_cosh_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_cosh_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_coshf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.cosh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_sinh_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_sinh_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_sinh(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.sinh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_sinh_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_sinh_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.sinh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_tanh_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_tanh_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_tanh(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.tanh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_tanh_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_tanh_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.tanh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_trunc_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_trunc_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.trunc.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.trunc.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_trunc_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_trunc_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
+declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.cos.v2f64(<2 x double>)
+declare <4 x float> @llvm.cos.v4f32(<4 x float>)
+declare <2 x double> @llvm.exp.v2f64(<2 x double>)
+declare <4 x float> @llvm.exp.v4f32(<4 x float>)
+declare <2 x double> @llvm.exp2.v2f64(<2 x double>)
+declare <4 x float> @llvm.exp2.v4f32(<4 x float>)
+declare <2 x double> @llvm.exp10.v2f64(<2 x double>)
+declare <4 x float> @llvm.exp10.v4f32(<4 x float>)
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare <2 x double> @llvm.floor.v2f64(<2 x double>)
+declare <4 x float> @llvm.floor.v4f32(<4 x float>)
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.log.v2f64(<2 x double>)
+declare <4 x float> @llvm.log.v4f32(<4 x float>)
+declare <2 x double> @llvm.log10.v2f64(<2 x double>)
+declare <4 x float> @llvm.log10.v4f32(<4 x float>)
+declare <2 x double> @llvm.log2.v2f64(<2 x double>)
+declare <4 x float> @llvm.log2.v4f32(<4 x float>)
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>)
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
+declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.rint.v2f64(<2 x double>)
+declare <4 x float> @llvm.rint.v4f32(<4 x float>)
+declare <2 x double> @llvm.round.v2f64(<2 x double>)
+declare <4 x float> @llvm.round.v4f32(<4 x float>)
+declare <2 x double> @llvm.sin.v2f64(<2 x double>)
+declare <4 x float> @llvm.sin.v4f32(<4 x float>)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+declare <2 x double> @llvm.tan.v2f64(<2 x double>)
+declare <4 x float> @llvm.tan.v4f32(<4 x float>)
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
index c6ea44bb85f1..670b08987c81 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
@@ -1,4 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --filter "call.*(cos|sin|tan|cbrt|erf|exp[^e]|gamma|log|sqrt|copysign|dim|min|mod|hypot|nextafter|pow|fma)" --version 2
+; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=LIBMVEC-NEON
+; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s -check-prefix=LIBMVEC-NEON-WIDTH-2
+; RUN: opt -mattr=+sve -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -mcpu=neoverse-v1 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=LIBMVEC-SVE
 ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=SLEEF-NEON
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -mcpu=neoverse-v1 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=SLEEF-SVE
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -mcpu=neoverse-v1 -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefixes=SLEEF-SVE-NOPRED
@@ -19,6 +22,18 @@ declare double @acos(double)
 declare float @acosf(float)
 
 define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
@@ -64,6 +79,18 @@ define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @acos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_acosf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -112,6 +139,18 @@ declare double @acosh(double)
 declare float @acoshf(float)
 
 define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acosh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acosh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acosh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acosh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acosh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acosh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -157,6 +196,18 @@ define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @acosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acosh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_acoshf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acosh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_acoshf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acosh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acoshf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acosh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acoshf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -205,6 +256,18 @@ declare double @asin(double)
 declare float @asinf(float)
 
 define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
@@ -250,6 +313,18 @@ define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @asin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_asinf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -298,6 +373,18 @@ declare double @asinh(double)
 declare float @asinhf(float)
 
 define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asinh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asinh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asinh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asinh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asinh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asinh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -343,6 +430,18 @@ define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @asinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asinh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_asinhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asinh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_asinhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asinh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asinh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -391,6 +490,18 @@ declare double @atan(double)
 declare float @atanf(float)
 
 define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
@@ -436,6 +547,18 @@ define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @atan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_atanf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -484,6 +607,18 @@ declare double @atan2(double, double)
 declare float @atan2f(float, float)
 
 define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -529,6 +664,18 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @atan2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_atan2f(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -577,6 +724,18 @@ declare double @atanh(double)
 declare float @atanhf(float)
 
 define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atanh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atanh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atanh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atanh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atanh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -622,6 +781,18 @@ define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @atanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atanh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atanh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_atanhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atanh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atanh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -670,6 +841,18 @@ declare double @cbrt(double)
 declare float @cbrtf(float)
 
 define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cbrt_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cbrt(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cbrt_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cbrt(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cbrt_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cbrt(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cbrt_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cbrt(<2 x double> [[WIDE_LOAD:%.*]])
@@ -715,6 +898,18 @@ define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @cbrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cbrt_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_cbrtf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cbrt_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_cbrtf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cbrt_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cbrtf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cbrt_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cbrtf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -763,6 +958,18 @@ declare double @copysign(double, double)
 declare float @copysignf(float, float)
 
 define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @copysign_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @copysign_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @copysign_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_copysign(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -808,6 +1015,18 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @copysign_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @copysign_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @copysign_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @copysign_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_copysignf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -856,6 +1075,18 @@ declare double @cos(double)
 declare float @cosf(float)
 
 define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
@@ -901,6 +1132,18 @@ define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @cos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_cosf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -949,6 +1192,18 @@ declare double @cosh(double)
 declare float @coshf(float)
 
 define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cosh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cosh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cosh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -994,6 +1249,18 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @cosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cosh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_coshf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cosh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cosh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1042,6 +1309,18 @@ declare double @cospi(double)
 declare float @cospif(float)
 
 define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cospi_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cospi_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cospi_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cospi_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cospi(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1087,6 +1366,18 @@ define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @cospi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cospi_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cospi_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cospi_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cospi_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cospif(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1135,6 +1426,18 @@ declare double @erf(double)
 declare float @erff(float)
 
 define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @erf_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erf(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erf_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erf(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @erf_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_erf(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @erf_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_erf(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1180,6 +1483,18 @@ define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @erf_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @erf_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_erff(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erf_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_erff(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @erf_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_erff(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @erf_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_erff(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1228,6 +1543,18 @@ declare double @erfc(double)
 declare float @erfcf(float)
 
 define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @erfc_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erfc(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erfc_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erfc(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @erfc_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_erfc(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @erfc_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_erfc(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1273,6 +1600,18 @@ define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @erfc_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @erfc_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_erfcf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erfc_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_erfcf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @erfc_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_erfcf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @erfc_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_erfcf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1321,6 +1660,18 @@ declare double @exp(double)
 declare float @expf(float)
 
 define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1366,6 +1717,18 @@ define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @exp_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_expf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1414,6 +1777,18 @@ declare double @exp10(double)
 declare float @exp10f(float)
 
 define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp10_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp10_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp10_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1459,6 +1834,18 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @exp10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp10_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp10f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp10_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp10_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1507,6 +1894,18 @@ declare double @exp2(double)
 declare float @exp2f(float)
 
 define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1552,6 +1951,18 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @exp2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp2f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1600,6 +2011,18 @@ declare double @expm1(double)
 declare float @expm1f(float)
 
 define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @expm1_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_expm1(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @expm1_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_expm1(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @expm1_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_expm1(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @expm1_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_expm1(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1645,6 +2068,18 @@ define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @expm1_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @expm1_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_expm1f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @expm1_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_expm1f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @expm1_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expm1f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @expm1_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expm1f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1693,6 +2128,18 @@ declare double @fdim(double, double)
 declare float @fdimf(float, float)
 
 define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fdim_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fdim_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fdim_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fdim_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fdim(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1738,6 +2185,18 @@ define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fdim_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fdim_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fdim_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fdim_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fdim_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fdimf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1786,6 +2245,18 @@ declare double @fma(double, double, double)
 declare float @fmaf(float, float, float)
 
 define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fma_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fma_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fma_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vvv_fma(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
@@ -1831,6 +2302,18 @@ define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fma_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fma_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fma_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vvv_fmaf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
@@ -1879,6 +2362,18 @@ declare double @fmax(double, double)
 declare float @fmaxf(float, float)
 
 define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmax_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmax_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmax_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmax_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmax(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1924,6 +2419,18 @@ define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fmax_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmax_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmax_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmax_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmax_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fmaxf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1972,6 +2479,18 @@ declare double @fmin(double, double)
 declare float @fminf(float, float)
 
 define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmin(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -2017,6 +2536,18 @@ define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fmin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fminf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -2065,6 +2596,18 @@ declare double @fmod(double, double)
 declare float @fmodf(float, float)
 
 define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmod_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmod_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmod_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmod_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmod(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -2110,6 +2653,18 @@ define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fmod_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmod_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmod_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmod_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmod_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fmodf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -2158,6 +2713,18 @@ declare double @hypot(double, double)
 declare float @hypotf(float, float)
 
 define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @hypot_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_hypot(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @hypot_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_hypot(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @hypot_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_hypot(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @hypot_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_hypot(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -2203,6 +2770,18 @@ define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @hypot_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @hypot_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_hypotf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @hypot_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_hypotf(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @hypot_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_hypotf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @hypot_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_hypotf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -2251,6 +2830,18 @@ declare i32 @ilogb(double)
 declare i32 @ilogbf(float)
 
 define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ilogb_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ilogb_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ilogb_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ilogb_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x i32> @_ZGVnN2v_ilogb(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2296,6 +2887,18 @@ define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @ilogb_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ilogb_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ilogb_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ilogb_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ilogb_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x i32> @_ZGVnN4v_ilogbf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2344,6 +2947,18 @@ declare double @ldexp(double, i32)
 declare float @ldexpf(float, i32)
 
 define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ldexp_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ldexp_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ldexp_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ldexp_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP4:%.*]] = call <2 x double> @_ZGVnN2vv_ldexp(<2 x double> [[WIDE_LOAD:%.*]], <2 x i32> [[WIDE_LOAD1:%.*]])
@@ -2391,6 +3006,18 @@ define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias %
 }
 
 define void @ldexp_f32(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ldexp_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ldexp_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ldexp_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ldexp_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP4:%.*]] = call <4 x float> @_ZGVnN4vv_ldexpf(<4 x float> [[WIDE_LOAD:%.*]], <4 x i32> [[WIDE_LOAD1:%.*]])
@@ -2441,6 +3068,18 @@ declare double @lgamma(double)
 declare float @lgammaf(float)
 
 define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @lgamma_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @lgamma_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @lgamma_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @lgamma_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2486,6 +3125,18 @@ define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @lgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @lgamma_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @lgamma_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @lgamma_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @lgamma_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2534,6 +3185,18 @@ declare double @log(double)
 declare float @logf(float)
 
 define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2579,6 +3242,18 @@ define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @log_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_logf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2627,6 +3302,18 @@ declare double @log10(double)
 declare float @log10f(float)
 
 define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log10_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log10_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log10_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2672,6 +3359,18 @@ define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @log10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log10_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log10f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log10_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log10_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2720,6 +3419,18 @@ declare double @log1p(double)
 declare float @log1pf(float)
 
 define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log1p_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log1p(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log1p_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log1p(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log1p_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log1p(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log1p_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log1p(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2765,6 +3476,18 @@ define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @log1p_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log1p_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log1pf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log1p_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log1pf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log1p_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log1pf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log1p_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log1pf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2813,6 +3536,18 @@ declare double @log2(double)
 declare float @log2f(float)
 
 define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2858,6 +3593,18 @@ define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log2f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2909,6 +3656,18 @@ declare double @modf(double, ptr)
 declare float @modff(float, ptr)
 
 define void @modf_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @modf_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @modf_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @modf_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @modf_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]])
@@ -2953,6 +3712,18 @@ for.cond.cleanup:
 }
 
 define void @modf_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @modf_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @modf_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @modf_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @modf_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]])
@@ -3000,6 +3771,18 @@ declare double @nextafter(double, double)
 declare float @nextafterf(float, float)
 
 define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @nextafter_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nextafter_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @nextafter_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @nextafter_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_nextafter(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -3045,6 +3828,18 @@ define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @nextafter_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @nextafter_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nextafter_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @nextafter_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @nextafter_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_nextafterf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -3093,6 +3888,18 @@ declare double @pow(double, double)
 declare float @powf(float, float)
 
 define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @pow_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @pow_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @pow_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -3138,6 +3945,18 @@ define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @pow_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @pow_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_powf(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @pow_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @pow_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -3186,6 +4005,18 @@ declare double @sin(double)
 declare float @sinf(float)
 
 define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3231,6 +4062,18 @@ define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @sin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3282,6 +4125,18 @@ declare void @sincos(double, ptr, ptr)
 declare void @sincosf(float, ptr, ptr)
 
 define void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @sincos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sincos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sincos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
@@ -3325,6 +4180,18 @@ for.cond.cleanup:
 }
 
 define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @sincos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sincos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sincos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
@@ -3374,6 +4241,18 @@ declare void @sincospi(double, ptr, ptr)
 declare void @sincospif(float, ptr, ptr)
 
 define void @sincospi_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @sincospi_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincospi_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sincospi_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sincospi_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
@@ -3417,6 +4296,18 @@ for.cond.cleanup:
 }
 
 define void @sincospi_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @sincospi_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincospi_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sincospi_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sincospi_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
@@ -3463,6 +4354,18 @@ declare double @sinh(double)
 declare float @sinhf(float)
 
 define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3508,6 +4411,18 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @sinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3556,6 +4471,18 @@ declare double @sinpi(double)
 declare float @sinpif(float)
 
 define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinpi_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinpi_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinpi_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinpi_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinpi(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3601,6 +4528,18 @@ define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @sinpi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinpi_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinpi_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinpi_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinpi_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinpif(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3649,6 +4588,18 @@ declare double @sqrt(double)
 declare float @sqrtf(float)
 
 define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sqrt_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sqrt_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sqrt_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3694,6 +4645,18 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @sqrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sqrt_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sqrt_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sqrt_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3742,6 +4705,18 @@ declare double @tan(double)
 declare float @tanf(float)
 
 define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tan_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tan_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tan_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3787,6 +4762,18 @@ define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @tan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tan_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tan_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tan_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3835,6 +4822,18 @@ declare double @tanh(double)
 declare float @tanhf(float)
 
 define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tanh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tanh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tanh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3880,6 +4879,18 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @tanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tanh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tanh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tanh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3928,6 +4939,18 @@ declare double @tgamma(double)
 declare float @tgammaf(float)
 
 define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tgamma_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tgamma_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tgamma_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tgamma_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3973,6 +4996,18 @@ define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tgamma_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tgamma_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tgamma_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tgamma_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[WIDE_LOAD:%.*]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
index f753df32d9eb..f6f2e39594dd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
@@ -1,5 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --filter "call.*(acos|asin|atan|atan2|cos|cosh|exp|log|sin|sinh|pow|ceil|copysign|fabs|floor|fma|m..num|nearbyint|rint|round|sqrt|tan|tanh|trunc)" --version 2
 
+; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=LIBMVEC-NEON
+; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=LIBMVEC-NEON-WIDTH-2
+; RUN: opt -mattr=+sve -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=LIBMVEC-SVE
 ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE
 ; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=ARMPL-NEON
@@ -16,6 +19,19 @@ declare double @llvm.acos.f64(double)
 declare float @llvm.acos.f32(float)
 
 define void @acos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.acos.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
@@ -51,6 +67,19 @@ define void @acos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @acos_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_acosf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.acos.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -89,6 +118,19 @@ declare double @llvm.asin.f64(double)
 declare float @llvm.asin.f32(float)
 
 define void @asin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.asin.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
@@ -124,6 +166,19 @@ define void @asin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @asin_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_asinf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.asin.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -162,6 +217,19 @@ declare double @llvm.atan.f64(double)
 declare float @llvm.atan.f32(float)
 
 define void @atan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.atan.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
@@ -197,6 +265,19 @@ define void @atan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @atan_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_atanf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.atan.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -235,6 +316,19 @@ declare double @llvm.atan2.f64(double, double)
 declare float @llvm.atan2.f32(float, float)
 
 define void @atan2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.atan2.f64(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @atan2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -270,6 +364,19 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @atan2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_atan2f(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.atan2.f32(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @atan2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -308,6 +415,18 @@ declare double @llvm.ceil.f64(double)
 declare float @llvm.ceil.f32(float)
 
 define void @ceil_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ceil_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ceil_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ceil_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ceil_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -343,6 +462,18 @@ define void @ceil_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @ceil_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ceil_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ceil_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ceil_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ceil_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -381,6 +512,19 @@ declare double @llvm.copysign.f64(double, double)
 declare float @llvm.copysign.f32(float, float)
 
 define void @copysign_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @copysign_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @copysign_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x double> [[WIDE_LOAD]])
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.copysign.f64(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @copysign_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -416,6 +560,19 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @copysign_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @copysign_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @copysign_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x float> [[WIDE_LOAD]])
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.copysign.f32(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @copysign_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -454,6 +611,19 @@ declare double @llvm.cos.f64(double)
 declare float @llvm.cos.f32(float)
 
 define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
@@ -489,6 +659,19 @@ define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @cos_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_cosf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -527,6 +710,19 @@ declare double @llvm.cosh.f64(double)
 declare float @llvm.cosh.f32(float)
 
 define void @cosh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cosh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cosh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.cosh.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cosh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -562,6 +758,19 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @cosh_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cosh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_coshf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cosh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.cosh.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cosh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -600,6 +809,19 @@ declare double @llvm.exp.f64(double)
 declare float @llvm.exp.f32(float)
 
 define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
@@ -635,6 +857,19 @@ define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @exp_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_expf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -673,6 +908,19 @@ declare double @llvm.exp10.f64(double)
 declare float @llvm.exp10.f32(float)
 
 define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp10_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp10_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.exp10.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp10_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
@@ -708,6 +956,19 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @exp10_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp10_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp10f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp10_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.exp10.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp10_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -746,6 +1007,19 @@ declare double @llvm.exp2.f64(double)
 declare float @llvm.exp2.f32(float)
 
 define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
@@ -781,6 +1055,19 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @exp2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp2f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -819,6 +1106,18 @@ declare double @llvm.fabs.f64(double)
 declare float @llvm.fabs.f32(float)
 
 define void @fabs_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fabs_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fabs_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @fabs_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @fabs_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -854,6 +1153,18 @@ define void @fabs_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @fabs_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fabs_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fabs_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @fabs_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @fabs_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -892,6 +1203,18 @@ declare double @llvm.floor.f64(double)
 declare float @llvm.floor.f32(float)
 
 define void @floor_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @floor_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @floor_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @floor_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @floor_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -927,6 +1250,18 @@ define void @floor_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @floor_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @floor_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @floor_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @floor_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @floor_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -965,6 +1300,18 @@ declare double @llvm.fma.f64(double, double, double)
 declare float @llvm.fma.f32(float, float, float)
 
 define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fma_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @fma_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @fma_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
@@ -1000,6 +1347,18 @@ define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @fma_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fma_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @fma_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @fma_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
@@ -1038,6 +1397,19 @@ declare double @llvm.log.f64(double)
 declare float @llvm.log.f32(float)
 
 define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.log.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1073,6 +1445,19 @@ define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @log_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_logf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.log.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1111,6 +1496,19 @@ declare double @llvm.log10.f64(double)
 declare float @llvm.log10.f32(float)
 
 define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log10_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log10_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log10_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1146,6 +1544,19 @@ define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @log10_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log10_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log10f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log10_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.log10.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log10_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1184,6 +1595,19 @@ declare double @llvm.log2.f64(double)
 declare float @llvm.log2.f32(float)
 
 define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.log2.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1219,6 +1643,19 @@ define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @log2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log2f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1257,6 +1694,18 @@ declare double @llvm.maxnum.f64(double, double)
 declare float @llvm.maxnum.f32(float, float)
 
 define void @maxnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @maxnum_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @maxnum_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @maxnum_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @maxnum_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1292,6 +1741,18 @@ define void @maxnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @maxnum_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @maxnum_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @maxnum_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @maxnum_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @maxnum_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1330,6 +1791,18 @@ declare double @llvm.minnum.f64(double, double)
 declare float @llvm.minnum.f32(float, float)
 
 define void @minnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @minnum_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @minnum_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @minnum_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @minnum_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1365,6 +1838,18 @@ define void @minnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @minnum_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @minnum_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @minnum_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @minnum_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @minnum_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1403,6 +1888,18 @@ declare double @llvm.nearbyint.f64(double)
 declare float @llvm.nearbyint.f32(float)
 
 define void @nearbyint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @nearbyint_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nearbyint_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @nearbyint_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @nearbyint_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1438,6 +1935,18 @@ define void @nearbyint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @nearbyint_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @nearbyint_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nearbyint_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @nearbyint_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @nearbyint_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1476,6 +1985,19 @@ declare double @llvm.pow.f64(double, double)
 declare float @llvm.pow.f32(float, float)
 
 define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @pow_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @pow_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.pow.f64(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @pow_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1511,6 +2033,19 @@ define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @pow_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @pow_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_powf(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @pow_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.pow.f32(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @pow_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1549,6 +2084,18 @@ declare double @llvm.rint.f64(double)
 declare float @llvm.rint.f32(float)
 
 define void @rint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @rint_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @rint_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @rint_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @rint_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1584,6 +2131,18 @@ define void @rint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @rint_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @rint_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @rint_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @rint_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @rint_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1622,6 +2181,18 @@ declare double @llvm.round.f64(double)
 declare float @llvm.round.f32(float)
 
 define void @round_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @round_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @round_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @round_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @round_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1657,6 +2228,18 @@ define void @round_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @round_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @round_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @round_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @round_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @round_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1695,6 +2278,19 @@ declare double @llvm.sin.f64(double)
 declare float @llvm.sin.f32(float)
 
 define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1730,6 +2326,19 @@ define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @sin_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1768,6 +2377,19 @@ declare double @llvm.sinh.f64(double)
 declare float @llvm.sinh.f32(float)
 
 define void @sinh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.sinh.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1803,6 +2425,19 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @sinh_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.sinh.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1841,6 +2476,18 @@ declare double @llvm.sqrt.f64(double)
 declare float @llvm.sqrt.f32(float)
 
 define void @sqrt_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sqrt_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sqrt_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sqrt_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1876,6 +2523,18 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @sqrt_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sqrt_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sqrt_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sqrt_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1914,6 +2573,19 @@ declare double @llvm.tan.f64(double)
 declare float @llvm.tan.f32(float)
 
 define void @tan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tan_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tan_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.tan.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tan_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1949,6 +2621,19 @@ define void @tan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @tan_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tan_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tan_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.tan.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tan_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1987,6 +2672,19 @@ declare double @llvm.tanh.f64(double)
 declare float @llvm.tanh.f32(float)
 
 define void @tanh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tanh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tanh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.tanh.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tanh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2022,6 +2720,19 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @tanh_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tanh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tanh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.tanh.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tanh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2060,6 +2771,18 @@ declare double @llvm.trunc.f64(double)
 declare float @llvm.trunc.f32(float)
 
 define void @trunc_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @trunc_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @trunc_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @trunc_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @trunc_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2095,6 +2818,18 @@ define void @trunc_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @trunc_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @trunc_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @trunc_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @trunc_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @trunc_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index a1f660d31668..5459512239b6 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -1,15 +1,13 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=SVML -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,SVML
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=AMDLIBM -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,AMDLIBM
 ; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -vector-library=MASSV -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,MASSV
-; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=LIBMVEC-AARCH64
+; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,LIBMVEC-AARCH64
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,LIBMVEC-X86
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=Accelerate -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,ACCELERATE
 ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,SLEEFGNUABI
 ; RUN: opt -mtriple=riscv64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,SLEEFGNUABI_RISCV
 ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=ArmPL -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,ARMPL
 
-; LIBMVEC-AARCH64-NOT: llvm.compiler.used
-
 ; COMMON-LABEL: @llvm.compiler.used = appending global
 ; SVML-SAME:        [6 x ptr] [
 ; SVML-SAME:          ptr @__svml_sin2,
@@ -35,6 +33,12 @@
 ; MASSV-SAME:         ptr @__log10f4
 ; ACCELERATE-SAME:  [1 x ptr] [
 ; ACCELERATE-SAME:    ptr @vlog10f
+; LIBMVEC-AARCH64-SAME: [5 x ptr] [
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVnN2v_sin,
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVsMxv_sin,
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVnN2v_log10f,
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVnN4v_log10f,
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVsMxv_log10f
 ; LIBMVEC-X86-SAME: [2 x ptr] [
 ; LIBMVEC-X86-SAME:   ptr @_ZGVbN2v_sin,
 ; LIBMVEC-X86-SAME:   ptr @_ZGVdN4v_sin
@@ -100,6 +104,7 @@ define double @sin_f64(double %in) {
 ; AMDLIBM:            call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; MASSV:              call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; ACCELERATE:         call double @sin(double %{{.*}})
+; LIBMVEC-AARCH64:    call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; LIBMVEC-X86:        call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; SLEEFGNUABI:        call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; SLEEFGNUABI_RISCV:  call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
@@ -158,6 +163,7 @@ define float @call_llvm.log10.f32(float %in) {
 ; COMMON-LABEL:       @call_llvm.log10.f32(
 ; SVML:               call float @llvm.log10.f32(float %{{.*}})
 ; AMDLIBM:            call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
+; LIBMVEC-AARCH64:    call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
 ; LIBMVEC-X86:        call float @llvm.log10.f32(float %{{.*}})
 ; MASSV:              call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
 ; ACCELERATE:         call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
@@ -167,6 +173,7 @@ define float @call_llvm.log10.f32(float %in) {
 ; No mapping of "llvm.log10.f32" to a vector function for SVML.
 ; SVML-NOT:        _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
 ; AMDLIBM-NOT:        _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
+; LIBMVEC-AARCH64-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
 ; LIBMVEC-X86-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
   %call = tail call float @llvm.log10.f32(float %in)
   ret float %call
@@ -196,8 +203,11 @@ declare float @llvm.log10.f32(float) #0
 ; MASSV: declare <2 x double> @__sind2(<2 x double>)
 ; MASSV: declare <4 x float> @__log10f4(<4 x float>)
 
-; LIBMVEC-AARCH64-NOT: declare <2 x double> @_ZGVbN2v_sin(<2 x double>)
-; LIBMVEC-AARCH64-NOT: declare <4 x double> @_ZGVdN4v_sin(<4 x double>)
+; LIBMVEC-AARCH64: declare aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double>)
+; LIBMVEC-AARCH64: declare <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double>, <vscale x 2 x i1>)
+; LIBMVEC-AARCH64: declare aarch64_vector_pcs <2 x float> @_ZGVnN2v_log10f(<2 x float>)
+; LIBMVEC-AARCH64: declare aarch64_vector_pcs <4 x float> @_ZGVnN4v_log10f(<4 x float>)
+; LIBMVEC-AARCH64: declare <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float>, <vscale x 4 x i1>)
 
 ; LIBMVEC-X86: declare <2 x double> @_ZGVbN2v_sin(<2 x double>)
 ; LIBMVEC-X86: declare <4 x double> @_ZGVdN4v_sin(<4 x double>)
@@ -272,6 +282,14 @@ attributes #0 = { nounwind readnone }
 ; ACCELERATE:      attributes #[[LOG10]] = { "vector-function-abi-variant"=
 ; ACCELERATE-SAME:   "_ZGV_LLVM_N4v_llvm.log10.f32(vlog10f)" }
 
+; LIBMVEC-AARCH64:      attributes #[[SIN]] = { "vector-function-abi-variant"=
+; LIBMVEC-AARCH64-SAME:   "_ZGV_LLVM_N2v_sin(_ZGVnN2v_sin),
+; LIBMVEC-AARCH64-SAME:   _ZGVsMxv_sin(_ZGVsMxv_sin)" }
+; LIBMVEC-AARCH64:      attributes #[[LOG10]] = { "vector-function-abi-variant"=
+; LIBMVEC-AARCH64-SAME:   "_ZGV_LLVM_N2v_llvm.log10.f32(_ZGVnN2v_log10f),
+; LIBMVEC-AARCH64-SAME:   _ZGV_LLVM_N4v_llvm.log10.f32(_ZGVnN4v_log10f),
+; LIBMVEC-AARCH64-SAME:   _ZGVsMxv_llvm.log10.f32(_ZGVsMxv_log10f)" }
+
 ; LIBMVEC-X86:      attributes #[[SIN]] = { "vector-function-abi-variant"=
 ; LIBMVEC-X86-SAME:   "_ZGV_LLVM_N2v_sin(_ZGVbN2v_sin),
 ; LIBMVEC-X86-SAME:   _ZGV_LLVM_N4v_sin(_ZGVdN4v_sin)" }

From 465e3ce9f10019db071dc7794ae9ab22f9fc76f7 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Tue, 17 Jun 2025 11:09:22 +0100
Subject: [PATCH 0618/1322] [LLVM][CodeGen] Lower ConstantInt vectors like
 shufflevector base splats. (#144395)

ConstantInt vectors utilise DAG.getConstant() when constructing the
initial DAG. This can have the effect of legalising the constant before
the DAG combiner is run, significant altering the generated code. To
mitigate this (hopefully as a temporary measure) we instead try to
construct the DAG in the same way as shufflevector based splats.
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 22 +++++++++++++++++--
 llvm/test/CodeGen/AArch64/sve-expand-div.ll   |  1 +
 .../AArch64/sve-fixed-length-sdiv-pow2.ll     |  1 +
 llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll    |  1 +
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c63eb7fc6b37..4f548cbad5c3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1791,8 +1791,26 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
   if (const Constant *C = dyn_cast<Constant>(V)) {
     EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);
 
-    if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
-      return DAG.getConstant(*CI, getCurSDLoc(), VT);
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+      SDLoc DL = getCurSDLoc();
+
+      // DAG.getConstant() may attempt to legalise the vector constant which can
+      // significantly change the combines applied to the DAG. To reduce the
+      // divergence when enabling ConstantInt based vectors we try to construct
+      // the DAG in the same way as shufflevector based splats. TODO: The
+      // divergence sometimes leads to better optimisations. Ideally we should
+      // prevent DAG.getConstant() from legalising too early but there are some
+      // degradations preventing this.
+      if (VT.isScalableVector())
+        return DAG.getNode(
+            ISD::SPLAT_VECTOR, DL, VT,
+            DAG.getConstant(CI->getValue(), DL, VT.getVectorElementType()));
+      if (VT.isFixedLengthVector())
+        return DAG.getSplatBuildVector(
+            VT, DL,
+            DAG.getConstant(CI->getValue(), DL, VT.getVectorElementType()));
+      return DAG.getConstant(*CI, DL, VT);
+    }
 
     if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
       return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
index 180c64e0a7de..bd6c72a3946c 100644
--- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -use-constant-int-for-scalable-splat < %s | FileCheck %s
 
 ; Check that expensive divides are expanded into a more performant sequence
 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
index 8b4386e2c221..45781fa47c6d 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=128 -use-constant-int-for-fixed-length-splat < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
 
 target triple = "aarch64-unknown-linux-gnu"
 
diff --git a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
index 4607f225f81e..a799b51f15cb 100644
--- a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s | FileCheck %s
+; RUN: llc -use-constant-int-for-scalable-splat < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 

From 71f72f4d5d1b820a3e6147289547821332eaf115 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 11:21:56 +0100
Subject: [PATCH 0619/1322] [DAG] Move foldMaskedMerge before visitAND. NFC.

Reduces diff in #144342
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 52 +++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f6d811ddba8a..d14615dcbc5e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7206,6 +7206,32 @@ static SDValue foldLogicTreeOfShifts(SDNode *N, SDValue LeftHand,
   return DAG.getNode(LogicOpcode, DL, VT, CombinedShifts, W);
 }
 
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
+/// equivalent `((x ^ y) & m) ^ y)` pattern.
+/// This is typically a better representation for targets without a fused
+/// "and-not" operation.
+static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
+                               const TargetLowering &TLI, const SDLoc &DL) {
+  // Note that masked-merge variants using XOR or ADD expressions are
+  // normalized to OR by InstCombine so we only check for OR.
+  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+
+  // If the target supports and-not, don't fold this.
+  if (TLI.hasAndNot(SDValue(Node, 0)))
+    return SDValue();
+
+  SDValue M, X, Y;
+  if (sd_match(Node,
+               m_Or(m_OneUse(m_And(m_OneUse(m_Not(m_Value(M))), m_Value(Y))),
+                    m_OneUse(m_And(m_Deferred(M), m_Value(X)))))) {
+    EVT VT = M.getValueType();
+    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, Y);
+    SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor, M);
+    return DAG.getNode(ISD::XOR, DL, VT, And, Y);
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8136,32 +8162,6 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
   return SDValue();
 }
 
-/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
-/// equivalent `((x ^ y) & m) ^ y)` pattern.
-/// This is typically a better representation for targets without a fused
-/// "and-not" operation.
-static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
-                               const TargetLowering &TLI, const SDLoc &DL) {
-  // Note that masked-merge variants using XOR or ADD expressions are
-  // normalized to OR by InstCombine so we only check for OR.
-  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
-
-  // If the target supports and-not, don't fold this.
-  if (TLI.hasAndNot(SDValue(Node, 0)))
-    return SDValue();
-
-  SDValue M, X, Y;
-  if (sd_match(Node,
-               m_Or(m_OneUse(m_And(m_OneUse(m_Not(m_Value(M))), m_Value(Y))),
-                    m_OneUse(m_And(m_Deferred(M), m_Value(X)))))) {
-    EVT VT = M.getValueType();
-    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, Y);
-    SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor, M);
-    return DAG.getNode(ISD::XOR, DL, VT, And, Y);
-  }
-  return SDValue();
-}
-
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);

From d3f13a0732c2d937a4c12cb8b1a61992ee5b0d9c Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me@antoniofrighetto.com>
Date: Tue, 17 Jun 2025 12:30:47 +0200
Subject: [PATCH 0620/1322] [GVN] MemorySSA for GVN: embed the memory state in
 symbolic expressions (#123218)

While migrating towards MemorySSA, account for the memory state modeled
by MemorySSA by hashing it, when computing the symbolic expressions for
the memory operations. Likewise, when phi-translating while walking the
CFG for PRE possibilities, see if the value number of an operand may be
refined with one of the value from the incoming edges of the MemoryPhi
associated to the current phi.

Co-authored-by: Momchil Velikov <momchil.velikov@arm.com>
---
 llvm/include/llvm/Transforms/Scalar/GVN.h | 21 +++++-
 llvm/lib/Transforms/Scalar/GVN.cpp        | 89 +++++++++++++++++++++--
 2 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index e156ec469a14..245414935bc0 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -46,7 +46,9 @@ class ImplicitControlFlowTracking;
 class LoadInst;
 class LoopInfo;
 class MemDepResult;
+class MemoryAccess;
 class MemoryDependenceResults;
+class MemoryLocation;
 class MemorySSA;
 class MemorySSAUpdater;
 class NonLocalDepResult;
@@ -170,6 +172,10 @@ public:
     // Value number to PHINode mapping. Used for phi-translate in scalarpre.
     DenseMap<uint32_t, PHINode *> NumberingPhi;
 
+    // Value number to BasicBlock mapping. Used for phi-translate across
+    // MemoryPhis.
+    DenseMap<uint32_t, BasicBlock *> NumberingBB;
+
     // Cache for phi-translate in scalarpre.
     using PhiTranslateMap =
         DenseMap<std::pair<uint32_t, const BasicBlock *>, uint32_t>;
@@ -177,6 +183,9 @@ public:
 
     AAResults *AA = nullptr;
     MemoryDependenceResults *MD = nullptr;
+    bool IsMDEnabled = false;
+    MemorySSA *MSSA = nullptr;
+    bool IsMSSAEnabled = false;
     DominatorTree *DT = nullptr;
 
     uint32_t NextValueNumber = 1;
@@ -187,12 +196,14 @@ public:
     Expression createExtractvalueExpr(ExtractValueInst *EI);
     Expression createGEPExpr(GetElementPtrInst *GEP);
     uint32_t lookupOrAddCall(CallInst *C);
+    uint32_t computeLoadStoreVN(Instruction *I);
     uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock,
                               uint32_t Num, GVNPass &GVN);
     bool areCallValsEqual(uint32_t Num, uint32_t NewNum, const BasicBlock *Pred,
                           const BasicBlock *PhiBlock, GVNPass &GVN);
     std::pair<uint32_t, bool> assignExpNewValueNum(Expression &Exp);
     bool areAllValsInBB(uint32_t Num, const BasicBlock *BB, GVNPass &GVN);
+    void addMemoryStateToExp(Instruction *I, Expression &Exp);
 
   public:
     LLVM_ABI ValueTable();
@@ -201,6 +212,7 @@ public:
     LLVM_ABI ~ValueTable();
     LLVM_ABI ValueTable &operator=(const ValueTable &Arg);
 
+    LLVM_ABI uint32_t lookupOrAdd(MemoryAccess *MA);
     LLVM_ABI uint32_t lookupOrAdd(Value *V);
     LLVM_ABI uint32_t lookup(Value *V, bool Verify = true) const;
     LLVM_ABI uint32_t lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Pred,
@@ -216,7 +228,14 @@ public:
     LLVM_ABI void erase(Value *V);
     void setAliasAnalysis(AAResults *A) { AA = A; }
     AAResults *getAliasAnalysis() const { return AA; }
-    void setMemDep(MemoryDependenceResults *M) { MD = M; }
+    void setMemDep(MemoryDependenceResults *M, bool MDEnabled = true) {
+      MD = M;
+      IsMDEnabled = MDEnabled;
+    }
+    void setMemorySSA(MemorySSA *M, bool MSSAEnabled = false) {
+      MSSA = M;
+      IsMSSAEnabled = MSSAEnabled;
+    }
     void setDomTree(DominatorTree *D) { DT = D; }
     uint32_t getNextUnusedValueNumber() { return NextValueNumber; }
     LLVM_ABI void verifyRemoved(const Value *) const;
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index c8a0479358ea..c580dd4ff230 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -474,6 +474,19 @@ void GVNPass::ValueTable::add(Value *V, uint32_t Num) {
     NumberingPhi[Num] = PN;
 }
 
+/// Include the incoming memory state into the hash of the expression for the
+/// given instruction. If the incoming memory state is:
+/// * LiveOnEntry, add the value number of the entry block,
+/// * a MemoryPhi, add the value number of the basic block corresponding to that
+/// MemoryPhi,
+/// * a MemoryDef, add the value number of the memory setting instruction.
+void GVNPass::ValueTable::addMemoryStateToExp(Instruction *I, Expression &Exp) {
+  assert(MSSA && "addMemoryStateToExp should not be called without MemorySSA");
+  assert(MSSA->getMemoryAccess(I) && "Instruction does not access memory");
+  MemoryAccess *MA = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(I);
+  Exp.VarArgs.push_back(lookupOrAdd(MA));
+}
+
 uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) {
   // FIXME: Currently the calls which may access the thread id may
   // be considered as not accessing the memory. But this is
@@ -594,15 +607,48 @@ uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) {
     return V;
   }
 
+  if (MSSA && IsMSSAEnabled && AA->onlyReadsMemory(C)) {
+    Expression Exp = createExpr(C);
+    addMemoryStateToExp(C, Exp);
+    auto [V, _] = assignExpNewValueNum(Exp);
+    ValueNumbering[C] = V;
+    return V;
+  }
+
   ValueNumbering[C] = NextValueNumber;
   return NextValueNumber++;
 }
 
+/// Returns the value number for the specified load or store instruction.
+uint32_t GVNPass::ValueTable::computeLoadStoreVN(Instruction *I) {
+  if (!MSSA || !IsMSSAEnabled) {
+    ValueNumbering[I] = NextValueNumber;
+    return NextValueNumber++;
+  }
+
+  Expression Exp;
+  Exp.Ty = I->getType();
+  Exp.Opcode = I->getOpcode();
+  for (Use &Op : I->operands())
+    Exp.VarArgs.push_back(lookupOrAdd(Op));
+  addMemoryStateToExp(I, Exp);
+
+  auto [V, _] = assignExpNewValueNum(Exp);
+  ValueNumbering[I] = V;
+  return V;
+}
+
 /// Returns true if a value number exists for the specified value.
 bool GVNPass::ValueTable::exists(Value *V) const {
   return ValueNumbering.contains(V);
 }
 
+uint32_t GVNPass::ValueTable::lookupOrAdd(MemoryAccess *MA) {
+  return MSSA->isLiveOnEntryDef(MA) || isa<MemoryPhi>(MA)
+             ? lookupOrAdd(MA->getBlock())
+             : lookupOrAdd(cast<MemoryUseOrDef>(MA)->getMemoryInst());
+}
+
 /// lookupOrAdd - Returns the value number for the specified value, assigning
 /// it a new number if it did not have one before.
 uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
@@ -613,6 +659,8 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
   auto *I = dyn_cast<Instruction>(V);
   if (!I) {
     ValueNumbering[V] = NextValueNumber;
+    if (isa<BasicBlock>(V))
+      NumberingBB[NextValueNumber] = cast<BasicBlock>(V);
     return NextValueNumber++;
   }
 
@@ -672,6 +720,9 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
       ValueNumbering[V] = NextValueNumber;
       NumberingPhi[NextValueNumber] = cast<PHINode>(V);
       return NextValueNumber++;
+    case Instruction::Load:
+    case Instruction::Store:
+      return computeLoadStoreVN(I);
     default:
       ValueNumbering[V] = NextValueNumber;
       return NextValueNumber++;
@@ -709,6 +760,7 @@ void GVNPass::ValueTable::clear() {
   ValueNumbering.clear();
   ExpressionNumbering.clear();
   NumberingPhi.clear();
+  NumberingBB.clear();
   PhiTranslateTable.clear();
   NextValueNumber = 1;
   Expressions.clear();
@@ -723,6 +775,8 @@ void GVNPass::ValueTable::erase(Value *V) {
   // If V is PHINode, V <--> value number is an one-to-one mapping.
   if (isa<PHINode>(V))
     NumberingPhi.erase(Num);
+  else if (isa<BasicBlock>(V))
+    NumberingBB.erase(Num);
 }
 
 /// verifyRemoved - Verify that the value is removed from all internal data
@@ -2310,15 +2364,39 @@ bool GVNPass::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
 uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
                                                const BasicBlock *PhiBlock,
                                                uint32_t Num, GVNPass &GVN) {
+  // See if we can refine the value number by looking at the PN incoming value
+  // for the given predecessor.
   if (PHINode *PN = NumberingPhi[Num]) {
-    for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) {
-      if (PN->getParent() == PhiBlock && PN->getIncomingBlock(I) == Pred)
-        if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false))
-          return TransVal;
-    }
+    if (PN->getParent() == PhiBlock)
+      for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I)
+        if (PN->getIncomingBlock(I) == Pred)
+          if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false))
+            return TransVal;
     return Num;
   }
 
+  if (BasicBlock *BB = NumberingBB[Num]) {
+    assert(MSSA && "NumberingBB is non-empty only when using MemorySSA");
+    // Value numbers of basic blocks are used to represent memory state in
+    // load/store instructions and read-only function calls when said state is
+    // set by a MemoryPhi.
+    if (BB != PhiBlock)
+      return Num;
+    MemoryPhi *MPhi = MSSA->getMemoryAccess(BB);
+    for (unsigned i = 0, N = MPhi->getNumIncomingValues(); i != N; ++i) {
+      if (MPhi->getIncomingBlock(i) != Pred)
+        continue;
+      MemoryAccess *MA = MPhi->getIncomingValue(i);
+      if (auto *PredPhi = dyn_cast<MemoryPhi>(MA))
+        return lookupOrAdd(PredPhi->getBlock());
+      if (MSSA->isLiveOnEntryDef(MA))
+        return lookupOrAdd(&BB->getParent()->getEntryBlock());
+      return lookupOrAdd(cast<MemoryUseOrDef>(MA)->getMemoryInst());
+    }
+    llvm_unreachable(
+        "CFG/MemorySSA mismatch: predecessor not found among incoming blocks");
+  }
+
   // If there is any value related with Num is defined in a BB other than
   // PhiBlock, it cannot depend on a phi in PhiBlock without going through
   // a backedge. We can do an early exit in that case to save compile time.
@@ -2761,6 +2839,7 @@ bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
   ICF = &ImplicitCFT;
   this->LI = &LI;
   VN.setMemDep(MD);
+  VN.setMemorySSA(MSSA);
   ORE = RunORE;
   InvalidBlockRPONumbers = true;
   MemorySSAUpdater Updater(MSSA);

From ce96fdde54c379fa3893f3f07d8233df9e16b9e2 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Tue, 17 Jun 2025 12:38:02 +0200
Subject: [PATCH 0621/1322] [clang][bytecode] Keep the last chunk in
 InterpStack::clear() (#144487)

We call clear when checking for potential constant expressions, but that
used to free all the chunks. Keep the last one so we don't have to
re-allocate it.
---
 clang/lib/AST/ByteCode/InterpStack.cpp | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpStack.cpp b/clang/lib/AST/ByteCode/InterpStack.cpp
index b183335dd588..6b748d62b83b 100644
--- a/clang/lib/AST/ByteCode/InterpStack.cpp
+++ b/clang/lib/AST/ByteCode/InterpStack.cpp
@@ -19,9 +19,7 @@
 using namespace clang;
 using namespace clang::interp;
 
-InterpStack::~InterpStack() { clear(); }
-
-void InterpStack::clear() {
+InterpStack::~InterpStack() {
   if (Chunk && Chunk->Next)
     std::free(Chunk->Next);
   if (Chunk)
@@ -33,6 +31,21 @@ void InterpStack::clear() {
 #endif
 }
 
+// We keep the last chunk around to reuse.
+void InterpStack::clear() {
+  if (!Chunk)
+    return;
+
+  if (Chunk->Next)
+    std::free(Chunk->Next);
+
+  assert(Chunk);
+  StackSize = 0;
+#ifndef NDEBUG
+  ItemTypes.clear();
+#endif
+}
+
 void InterpStack::clearTo(size_t NewSize) {
   assert(NewSize <= size());
   size_t ToShrink = size() - NewSize;

From 576ced56d78b48e658b0a170603388e4802f6311 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Tue, 17 Jun 2025 12:43:39 +0200
Subject: [PATCH 0622/1322] [clang][bytecode] Simplify Block::replacePointer()
 (#144490)

Try to do less work here instead of a full remove + add.
---
 clang/lib/AST/ByteCode/InterpBlock.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBlock.cpp b/clang/lib/AST/ByteCode/InterpBlock.cpp
index 9ef44cd29ff8..f60307870ffc 100644
--- a/clang/lib/AST/ByteCode/InterpBlock.cpp
+++ b/clang/lib/AST/ByteCode/InterpBlock.cpp
@@ -69,20 +69,26 @@ void Block::cleanup() {
 void Block::replacePointer(Pointer *Old, Pointer *New) {
   assert(Old);
   assert(New);
+  assert(Old != New);
   if (IsStatic) {
     assert(!Pointers);
     return;
   }
-
 #ifndef NDEBUG
   assert(hasPointer(Old));
 #endif
 
-  removePointer(Old);
-  addPointer(New);
+  if (Old->Prev)
+    Old->Prev->Next = New;
+  if (Old->Next)
+    Old->Next->Prev = New;
+  New->Prev = Old->Prev;
+  New->Next = Old->Next;
+  if (Pointers == Old)
+    Pointers = New;
 
   Old->PointeeStorage.BS.Pointee = nullptr;
-
+  New->PointeeStorage.BS.Pointee = this;
 #ifndef NDEBUG
   assert(!hasPointer(Old));
   assert(hasPointer(New));

From 49c6235d1fb3bcecfe37a8e41bec69d6c7dc86ff Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 12:51:37 +0200
Subject: [PATCH 0623/1322] [PowerPC] Regenerate MIR test checks (NFC)

---
 .../PowerPC/aix-vector-vararg-fixed-caller.ll | 137 +++++++++---------
 1 file changed, 69 insertions(+), 68 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll
index f3e58b789794..fad275f58cd0 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll
@@ -12,76 +12,77 @@ define void @caller() {
 
   ; 32BIT-LABEL: name: caller
   ; 32BIT: bb.0.entry:
-  ; 32BIT:   ADJCALLSTACKDOWN 88, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT:   [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI:%[0-9]+]]:gprc = LI 64
-  ; 32BIT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128))
-  ; 32BIT:   [[LIS:%[0-9]+]]:gprc = LIS 38314
-  ; 32BIT:   [[ORI:%[0-9]+]]:gprc = ORI killed [[LIS]], 63376
-  ; 32BIT:   STW killed [[ORI]], 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-  ; 32BIT:   [[LIS1:%[0-9]+]]:gprc = LIS 16389
-  ; 32BIT:   [[ORI1:%[0-9]+]]:gprc = ORI killed [[LIS1]], 48905
-  ; 32BIT:   STW killed [[ORI1]], 80, $r1 :: (store (s32), align 8)
-  ; 32BIT:   [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LWZtoc2:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.2, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc2]] :: (load (s64) from constant-pool)
-  ; 32BIT:   [[LIS2:%[0-9]+]]:gprc = LIS 16393
-  ; 32BIT:   [[ORI2:%[0-9]+]]:gprc = ORI killed [[LIS2]], 8697
-  ; 32BIT:   [[LIS3:%[0-9]+]]:gprc = LIS 61467
-  ; 32BIT:   [[ORI3:%[0-9]+]]:gprc = ORI killed [[LIS3]], 34414
-  ; 32BIT:   [[LWZtoc3:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.3, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc3]] :: (load (s64) from constant-pool)
-  ; 32BIT:   [[LI1:%[0-9]+]]:gprc = LI 55
-  ; 32BIT:   $r3 = COPY [[LI1]]
-  ; 32BIT:   $v2 = COPY [[LXVW4X1]]
-  ; 32BIT:   $f1 = COPY [[LFD]]
-  ; 32BIT:   $r9 = COPY [[ORI2]]
-  ; 32BIT:   $r10 = COPY [[ORI3]]
-  ; 32BIT:   $f2 = COPY [[LFD1]]
-  ; 32BIT:   BL_NOP <mcsymbol .callee[PR]>, csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $v2, implicit $f1, implicit $r9, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1, implicit-def $v2
-  ; 32BIT:   ADJCALLSTACKUP 88, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
-  ; 32BIT:   BLR implicit $lr, implicit $rm
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 88, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI:%[0-9]+]]:gprc = LI 64
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LIS:%[0-9]+]]:gprc = LIS 38314
+  ; 32BIT-NEXT:   [[ORI:%[0-9]+]]:gprc = ORI killed [[LIS]], 63376
+  ; 32BIT-NEXT:   STW killed [[ORI]], 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   [[LIS1:%[0-9]+]]:gprc = LIS 16389
+  ; 32BIT-NEXT:   [[ORI1:%[0-9]+]]:gprc = ORI killed [[LIS1]], 48905
+  ; 32BIT-NEXT:   STW killed [[ORI1]], 80, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LWZtoc2:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc2]] :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   [[LIS2:%[0-9]+]]:gprc = LIS 16393
+  ; 32BIT-NEXT:   [[ORI2:%[0-9]+]]:gprc = ORI killed [[LIS2]], 8697
+  ; 32BIT-NEXT:   [[LIS3:%[0-9]+]]:gprc = LIS 61467
+  ; 32BIT-NEXT:   [[ORI3:%[0-9]+]]:gprc = ORI killed [[LIS3]], 34414
+  ; 32BIT-NEXT:   [[LWZtoc3:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc3]] :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   [[LI1:%[0-9]+]]:gprc = LI 55
+  ; 32BIT-NEXT:   $r3 = COPY [[LI1]]
+  ; 32BIT-NEXT:   $v2 = COPY [[LXVW4X1]]
+  ; 32BIT-NEXT:   $f1 = COPY [[LFD]]
+  ; 32BIT-NEXT:   $r9 = COPY [[ORI2]]
+  ; 32BIT-NEXT:   $r10 = COPY [[ORI3]]
+  ; 32BIT-NEXT:   $f2 = COPY [[LFD1]]
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .callee[PR]>, csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $v2, implicit $f1, implicit $r9, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1, implicit-def $v2
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 88, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
   ; 64BIT-LABEL: name: caller
   ; 64BIT: bb.0.entry:
-  ; 64BIT:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT:   [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_:%[0-9]+]]:g8rc = LI8 96
-  ; 64BIT:   STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128))
-  ; 64BIT:   [[LIS8_:%[0-9]+]]:g8rc = LIS8 16389
-  ; 64BIT:   [[ORI8_:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_]], 48905
-  ; 64BIT:   [[RLDIC:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_]], 32, 1
-  ; 64BIT:   [[ORIS8_:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC]], 38314
-  ; 64BIT:   [[ORI8_1:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_]], 63376
-  ; 64BIT:   STD killed [[ORI8_1]], 112, $x1 :: (store (s64))
-  ; 64BIT:   [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))
-  ; 64BIT:   [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64))
-  ; 64BIT:   [[LDtocCPT2:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.2, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT2]] :: (load (s64) from constant-pool)
-  ; 64BIT:   [[LDtocCPT3:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.3, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT3]] :: (load (s64) from constant-pool)
-  ; 64BIT:   [[LIS8_1:%[0-9]+]]:g8rc = LIS8 16393
-  ; 64BIT:   [[ORI8_2:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_1]], 8697
-  ; 64BIT:   [[RLDIC1:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_2]], 32, 1
-  ; 64BIT:   [[ORIS8_1:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC1]], 61467
-  ; 64BIT:   [[ORI8_3:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_1]], 34414
-  ; 64BIT:   [[LI8_1:%[0-9]+]]:g8rc = LI8 55
-  ; 64BIT:   $x3 = COPY [[LI8_1]]
-  ; 64BIT:   $v2 = COPY [[LXVW4X1]]
-  ; 64BIT:   $f1 = COPY [[LFD]]
-  ; 64BIT:   $x7 = COPY [[ORI8_3]]
-  ; 64BIT:   $x9 = COPY [[LD1]]
-  ; 64BIT:   $x10 = COPY [[LD]]
-  ; 64BIT:   $f2 = COPY [[LFD1]]
-  ; 64BIT:   BL8_NOP <mcsymbol .callee[PR]>, csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $v2, implicit $f1, implicit $x7, implicit $x9, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1, implicit-def $v2
-  ; 64BIT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
-  ; 64BIT:   BLR8 implicit $lr8, implicit $rm
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_:%[0-9]+]]:g8rc = LI8 96
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LIS8_:%[0-9]+]]:g8rc = LIS8 16389
+  ; 64BIT-NEXT:   [[ORI8_:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_]], 48905
+  ; 64BIT-NEXT:   [[RLDIC:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_]], 32, 1
+  ; 64BIT-NEXT:   [[ORIS8_:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC]], 38314
+  ; 64BIT-NEXT:   [[ORI8_1:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_]], 63376
+  ; 64BIT-NEXT:   STD killed [[ORI8_1]], 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LDtocCPT2:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT2]] :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   [[LDtocCPT3:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.3, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT3]] :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   [[LIS8_1:%[0-9]+]]:g8rc = LIS8 16393
+  ; 64BIT-NEXT:   [[ORI8_2:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_1]], 8697
+  ; 64BIT-NEXT:   [[RLDIC1:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_2]], 32, 1
+  ; 64BIT-NEXT:   [[ORIS8_1:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC1]], 61467
+  ; 64BIT-NEXT:   [[ORI8_3:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_1]], 34414
+  ; 64BIT-NEXT:   [[LI8_1:%[0-9]+]]:g8rc = LI8 55
+  ; 64BIT-NEXT:   $x3 = COPY [[LI8_1]]
+  ; 64BIT-NEXT:   $v2 = COPY [[LXVW4X1]]
+  ; 64BIT-NEXT:   $f1 = COPY [[LFD]]
+  ; 64BIT-NEXT:   $x7 = COPY [[ORI8_3]]
+  ; 64BIT-NEXT:   $x9 = COPY [[LD1]]
+  ; 64BIT-NEXT:   $x10 = COPY [[LD]]
+  ; 64BIT-NEXT:   $f2 = COPY [[LFD1]]
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .callee[PR]>, csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $v2, implicit $f1, implicit $x7, implicit $x9, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1, implicit-def $v2
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
 entry:
   %call = tail call <4 x i32> (i32, <4 x i32>, double, ...) @callee(i32 signext 55, <4 x i32> <i32 170, i32 187, i32 204, i32 221>, double 3.141590e+00, <4 x i32> <i32 10, i32 20, i32 30, i32 40>, double 2.718280e+00)
   ret void

From 2d336e7c5e821383816a9dca080f713747cc9e1e Mon Sep 17 00:00:00 2001
From: Arseniy Zaostrovnykh <necto.ne@gmail.com>
Date: Tue, 17 Jun 2025 13:07:44 +0200
Subject: [PATCH 0624/1322] [analyzer] Avoid contradicting assumption in
 tainted div-by-0 error node (#144491)

This patch corrects the state of the error node generated by the
core.DivideZero checker when it detects potential division by zero
involving a tainted denominator.

The checker split in

https://github.com/llvm/llvm-project/pull/106389/commits/91ac5ed10a154410c246d985752c1bbfcf23b105
started to introduce a conflicting assumption about the denominator into
the error node:
Node with the Bug Report "Division by a tainted value, possibly zero"
has an assumption "denominator != 0".

This has been done as a shortcut to continue analysis with the correct
assumption *after* the division - if we proceed, we can only assume the
denominator was not zero. However, this assumption is introduced
one-node too soon, leading to a self-contradictory error node.

In this patch, I make the error node with assumption of zero denominator
fatal, but allow analysis to continue on the second half of the state
split with the assumption of non-zero denominator.

---

CPP-6376
---
 .../lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp  |  8 ++++----
 clang/test/Analysis/taint-generic.c                 | 13 +++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
index 15d73fb9ca39..ab90615f6318 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
@@ -69,7 +69,7 @@ void DivZeroChecker::reportTaintBug(
     llvm::ArrayRef<SymbolRef> TaintedSyms) const {
   if (!TaintedDivChecker.isEnabled())
     return;
-  if (ExplodedNode *N = C.generateNonFatalErrorNode(StateZero)) {
+  if (ExplodedNode *N = C.generateErrorNode(StateZero)) {
     auto R =
         std::make_unique<PathSensitiveBugReport>(TaintedDivChecker, Msg, N);
     bugreporter::trackExpressionValue(N, getDenomExpr(N), *R);
@@ -113,9 +113,9 @@ void DivZeroChecker::checkPreStmt(const BinaryOperator *B,
   if ((stateNotZero && stateZero)) {
     std::vector<SymbolRef> taintedSyms = getTaintedSymbols(C.getState(), *DV);
     if (!taintedSyms.empty()) {
-      reportTaintBug("Division by a tainted value, possibly zero", stateNotZero,
-                     C, taintedSyms);
-      return;
+      reportTaintBug("Division by a tainted value, possibly zero", stateZero, C,
+                     taintedSyms);
+      // Fallthrough to continue analysis in case of non-zero denominator.
     }
   }
 
diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c
index 3c520612c5d9..9d6d2942df4a 100644
--- a/clang/test/Analysis/taint-generic.c
+++ b/clang/test/Analysis/taint-generic.c
@@ -412,6 +412,19 @@ int testTaintedDivFP(void) {
   return 5/x; // x cannot be 0, so no tainted warning either
 }
 
+void clang_analyzer_warnIfReached();
+
+int testTaintDivZeroNonfatal() {
+  int x;
+  scanf("%d", &x);
+  int y = 5/x; // expected-warning {{Division by a tainted value, possibly zero}}
+  if (x == 0)
+    clang_analyzer_warnIfReached();
+  else
+    clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+  return y;
+}
+
 // Zero-sized VLAs.
 void testTaintedVLASize(void) {
   int x;

From 990d2540bf0545cc4024c3718069f6d0b42c461b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 12:12:46 +0100
Subject: [PATCH 0625/1322] [X86] isAddSubOrSubAdd - convert to SDPatternMatch
 matching. NFC. (#144486)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2eadcc5416c2..a2e3873fe31a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8268,6 +8268,7 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
                              SDValue &Opnd0, SDValue &Opnd1,
                              unsigned &NumExtracts,
                              bool &IsSubAdd) {
+  using namespace SDPatternMatch;
 
   MVT VT = BV->getSimpleValueType(0);
   if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
@@ -8302,14 +8303,8 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
     // Try to match the following pattern:
     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
     // Early exit if we cannot match that sequence.
-    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
-        Op0.getOperand(1) != Op1.getOperand(1))
-      return false;
-
-    unsigned I0 = Op0.getConstantOperandVal(1);
-    if (I0 != i)
+    if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
+        !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
       return false;
 
     // We found a valid add/sub node, make sure its the same opcode as previous
@@ -8319,16 +8314,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
     Opc[i % 2] = Opcode;
 
     // Update InVec0 and InVec1.
-    if (InVec0.isUndef()) {
+    if (InVec0.isUndef())
       InVec0 = Op0.getOperand(0);
-      if (InVec0.getSimpleValueType() != VT)
-        return false;
-    }
-    if (InVec1.isUndef()) {
+    if (InVec1.isUndef())
       InVec1 = Op1.getOperand(0);
-      if (InVec1.getSimpleValueType() != VT)
-        return false;
-    }
 
     // Make sure that operands in input to each add/sub node always
     // come from a same pair of vectors.

From 875b36a8742437b95f623bab1e0332562c7b4b3f Mon Sep 17 00:00:00 2001
From: "Oleksandr \"Alex\" Zinenko" <git@ozinenko.com>
Date: Tue, 17 Jun 2025 13:40:57 +0200
Subject: [PATCH 0626/1322] [mlir] fix MemRefToLLVM lowering of atomic
 operations (#139045)

We have been confusingly, and arguably incorrectly, lowering `m**imumf`
atomic RMW operations in the MemRef dialect to `fm**` atomic RMW
operations in the LLVM dialect, which have different NaN-propagation
semantics: `m**imumf` propagates NaNs from either operand whereas
`fm**`, which lowers to the `fm**num` intrinsic returns the non-NaN
operand. This also contradicts the lowering of `arith.m**imumf` and
`arith.m**numf` operations.

Change the lowering to match the terminology in arith.

Add tests for these lowerings.

Keep a debug message in case of surprising behavior downstream (the code
may be producing more NaNs now).
---
 mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp   | 13 +++++++++++++
 .../Conversion/MemRefToLLVM/memref-to-llvm.mlir     | 10 +++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
index ade4e4d3de8e..8ccf1bfc292d 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
+++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
@@ -28,6 +28,9 @@
 #include "llvm/Support/MathExtras.h"
 #include <optional>
 
+#define DEBUG_TYPE "memref-to-llvm"
+#define DBGS() llvm::dbgs() << "[" DEBUG_TYPE "] "
+
 namespace mlir {
 #define GEN_PASS_DEF_FINALIZEMEMREFTOLLVMCONVERSIONPASS
 #include "mlir/Conversion/Passes.h.inc"
@@ -1782,12 +1785,22 @@ matchSimpleAtomicOp(memref::AtomicRMWOp atomicOp) {
   case arith::AtomicRMWKind::assign:
     return LLVM::AtomicBinOp::xchg;
   case arith::AtomicRMWKind::maximumf:
+    // TODO: remove this by end of 2025.
+    LLVM_DEBUG(DBGS() << "the lowering of memref.atomicrmw maximumf changed "
+                         "from fmax to fmaximum, expect more NaNs");
+    return LLVM::AtomicBinOp::fmaximum;
+  case arith::AtomicRMWKind::maxnumf:
     return LLVM::AtomicBinOp::fmax;
   case arith::AtomicRMWKind::maxs:
     return LLVM::AtomicBinOp::max;
   case arith::AtomicRMWKind::maxu:
     return LLVM::AtomicBinOp::umax;
   case arith::AtomicRMWKind::minimumf:
+    // TODO: remove this by end of 2025.
+    LLVM_DEBUG(DBGS() << "the lowering of memref.atomicrmw minimum changed "
+                         "from fmin to fminimum, expect more NaNs");
+    return LLVM::AtomicBinOp::fminimum;
+  case arith::AtomicRMWKind::minnumf:
     return LLVM::AtomicBinOp::fmin;
   case arith::AtomicRMWKind::mins:
     return LLVM::AtomicBinOp::min;
diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
index acfc18857425..51d56389dac9 100644
--- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
@@ -452,11 +452,19 @@ func.func @atomic_rmw(%I : memref<10xi32>, %ival : i32, %F : memref<10xf32>, %fv
   // CHECK: llvm.atomicrmw umin %{{.*}}, %{{.*}} acq_rel
   memref.atomic_rmw addf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
   // CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} acq_rel
+  memref.atomic_rmw maximumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
+  // CHECK: llvm.atomicrmw fmaximum %{{.*}}, %{{.*}} acq_rel
+  memref.atomic_rmw maxnumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
+  // CHECK: llvm.atomicrmw fmax %{{.*}}, %{{.*}} acq_rel
+  memref.atomic_rmw minimumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
+  // CHECK: llvm.atomicrmw fminimum %{{.*}}, %{{.*}} acq_rel
+  memref.atomic_rmw minnumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
+  // CHECK: llvm.atomicrmw fmin %{{.*}}, %{{.*}} acq_rel
   memref.atomic_rmw ori %ival, %I[%i] : (i32, memref<10xi32>) -> i32
   // CHECK: llvm.atomicrmw _or %{{.*}}, %{{.*}} acq_rel
   memref.atomic_rmw andi %ival, %I[%i] : (i32, memref<10xi32>) -> i32
   // CHECK: llvm.atomicrmw _and %{{.*}}, %{{.*}} acq_rel
-  // CHECK-INTERFACE-COUNT-9: llvm.atomicrmw
+  // CHECK-INTERFACE-COUNT-13: llvm.atomicrmw
   return
 }
 

From 9700930bd90a099f702332cf86dd898f00840f99 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 13:06:10 +0100
Subject: [PATCH 0627/1322] [X86] detectZextAbsDiff - convert to SDPatternMatch
 matching. NFC. (#144498)

Match the entire ABS(SUB(ZEXT(vXi8),ZEXT(vXi8))) pattern and simplify the logic in combineBasicSADPattern accordingly
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 37 ++++++++++---------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a2e3873fe31a..cd02d275d6b5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46058,22 +46058,18 @@ static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
 // Given a ABS node, detect the following pattern:
 // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
 // This is useful as it is the input into a SAD pattern.
-static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
-  SDValue AbsOp1 = Abs->getOperand(0);
-  if (AbsOp1.getOpcode() != ISD::SUB)
-    return false;
-
-  Op0 = AbsOp1.getOperand(0);
-  Op1 = AbsOp1.getOperand(1);
+static bool detectZextAbsDiff(SDValue Abs, SDValue &Op0, SDValue &Op1) {
+  using namespace SDPatternMatch;
 
   // Check if the operands of the sub are zero-extended from vectors of i8.
-  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
-      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
-      Op1.getOpcode() != ISD::ZERO_EXTEND ||
-      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
-    return false;
-
-  return true;
+  EVT SrcVT0, SrcVT1;
+  return sd_match(
+             Abs,
+             m_UnaryOp(ISD::ABS,
+                       m_Sub(m_AllOf(m_Value(Op0), m_ZExt(m_VT(SrcVT0))),
+                             m_AllOf(m_Value(Op1), m_ZExt(m_VT(SrcVT1)))))) &&
+         SrcVT0.getVectorElementType() == MVT::i8 &&
+         SrcVT1.getVectorElementType() == MVT::i8;
 }
 
 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
@@ -46455,6 +46451,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   // Match shuffle + add pyramid.
   ISD::NodeType BinOp;
   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
+  if (!Root)
+    return SDValue();
 
   // The operand is expected to be zero extended from i8
   // (verified in detectZextAbsDiff).
@@ -46464,16 +46462,11 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   // Also the sign extend is basically zero extend
   // (extends the sign bit which is zero).
   // So it is correct to skip the sign/zero extend instruction.
-  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
-               Root.getOpcode() == ISD::ZERO_EXTEND ||
-               Root.getOpcode() == ISD::ANY_EXTEND))
+  if (Root.getOpcode() == ISD::SIGN_EXTEND ||
+      Root.getOpcode() == ISD::ZERO_EXTEND ||
+      Root.getOpcode() == ISD::ANY_EXTEND)
     Root = Root.getOperand(0);
 
-  // If there was a match, we want Root to be a select that is the root of an
-  // abs-diff pattern.
-  if (!Root || Root.getOpcode() != ISD::ABS)
-    return SDValue();
-
   // Check whether we have an abs-diff pattern feeding into the select.
   SDValue Zext0, Zext1;
   if (!detectZextAbsDiff(Root, Zext0, Zext1))

From 12611a7fc71376e88aa01e3f0bbc74517f1a1703 Mon Sep 17 00:00:00 2001
From: Denzel-Brian Budii <73462654+chios202@users.noreply.github.com>
Date: Tue, 17 Jun 2025 15:07:20 +0300
Subject: [PATCH 0628/1322] [mlir] Improve mlir-query by adding matcher
 combinators (#141423)

Whereas backward-slice matching provides support to limit traversal by
specifying the desired depth level, this pull request introduces support
for limiting traversal with a nested matcher (adding forward-slice
also). It also adds support for variadic operators, including `anyOf`
and `allOf`. Rather than simply stopping traversal when an operation
named foo is encountered, one can now define a matcher that specifies
different exit conditions. Variadic support implementation within
mlir-query is very similar to clang-query.
---
 mlir/include/mlir/Query/Matcher/Marshallers.h |  61 +++++++++
 mlir/include/mlir/Query/Matcher/MatchFinder.h |   4 +-
 .../mlir/Query/Matcher/MatchersInternal.h     | 116 +++++++++++++++++-
 .../mlir/Query/Matcher/SliceMatchers.h        | 104 +++++++++++++++-
 .../include/mlir/Query/Matcher/VariantValue.h |  11 +-
 mlir/lib/Query/Matcher/CMakeLists.txt         |   1 +
 mlir/lib/Query/Matcher/MatchersInternal.cpp   |  33 +++++
 mlir/lib/Query/Matcher/RegistryManager.cpp    |   7 +-
 mlir/lib/Query/Matcher/VariantValue.cpp       |  52 ++++++++
 mlir/lib/Query/Query.cpp                      |   5 +
 ...ex-test.mlir => backward-slice-union.mlir} |  13 +-
 .../forward-slice-by-predicate.mlir           |  27 ++++
 .../mlir-query/logical-operator-test.mlir     |  11 ++
 .../mlir-query/slice-function-extraction.mlir |  29 +++++
 mlir/tools/mlir-query/mlir-query.cpp          |  14 ++-
 15 files changed, 471 insertions(+), 17 deletions(-)
 create mode 100644 mlir/lib/Query/Matcher/MatchersInternal.cpp
 rename mlir/test/mlir-query/{complex-test.mlir => backward-slice-union.mlir} (71%)
 create mode 100644 mlir/test/mlir-query/forward-slice-by-predicate.mlir
 create mode 100644 mlir/test/mlir-query/logical-operator-test.mlir
 create mode 100644 mlir/test/mlir-query/slice-function-extraction.mlir

diff --git a/mlir/include/mlir/Query/Matcher/Marshallers.h b/mlir/include/mlir/Query/Matcher/Marshallers.h
index 012bf7b9ec4a..5fe6965f32ef 100644
--- a/mlir/include/mlir/Query/Matcher/Marshallers.h
+++ b/mlir/include/mlir/Query/Matcher/Marshallers.h
@@ -108,6 +108,9 @@ public:
                                 const llvm::ArrayRef<ParserValue> args,
                                 Diagnostics *error) const = 0;
 
+  // If the matcher is variadic, it can take any number of arguments.
+  virtual bool isVariadic() const = 0;
+
   // Returns the number of arguments accepted by the matcher.
   virtual unsigned getNumArgs() const = 0;
 
@@ -140,6 +143,8 @@ public:
     return marshaller(matcherFunc, matcherName, nameRange, args, error);
   }
 
+  bool isVariadic() const override { return false; }
+
   unsigned getNumArgs() const override { return argKinds.size(); }
 
   void getArgKinds(unsigned argNo, std::vector<ArgKind> &kinds) const override {
@@ -153,6 +158,54 @@ private:
   const std::vector<ArgKind> argKinds;
 };
 
+class VariadicOperatorMatcherDescriptor : public MatcherDescriptor {
+public:
+  using VarOp = DynMatcher::VariadicOperator;
+  VariadicOperatorMatcherDescriptor(unsigned minCount, unsigned maxCount,
+                                    VarOp varOp, StringRef matcherName)
+      : minCount(minCount), maxCount(maxCount), varOp(varOp),
+        matcherName(matcherName) {}
+
+  VariantMatcher create(SourceRange nameRange, ArrayRef<ParserValue> args,
+                        Diagnostics *error) const override {
+    if (args.size() < minCount || maxCount < args.size()) {
+      addError(error, nameRange, ErrorType::RegistryWrongArgCount,
+               {llvm::Twine("requires between "), llvm::Twine(minCount),
+                llvm::Twine(" and "), llvm::Twine(maxCount),
+                llvm::Twine(" args, got "), llvm::Twine(args.size())});
+      return VariantMatcher();
+    }
+
+    std::vector<VariantMatcher> innerArgs;
+    for (int64_t i = 0, e = args.size(); i != e; ++i) {
+      const ParserValue &arg = args[i];
+      const VariantValue &value = arg.value;
+      if (!value.isMatcher()) {
+        addError(error, arg.range, ErrorType::RegistryWrongArgType,
+                 {llvm::Twine(i + 1), llvm::Twine("matcher: "),
+                  llvm::Twine(value.getTypeAsString())});
+        return VariantMatcher();
+      }
+      innerArgs.push_back(value.getMatcher());
+    }
+    return VariantMatcher::VariadicOperatorMatcher(varOp, std::move(innerArgs));
+  }
+
+  bool isVariadic() const override { return true; }
+
+  unsigned getNumArgs() const override { return 0; }
+
+  void getArgKinds(unsigned argNo, std::vector<ArgKind> &kinds) const override {
+    kinds.push_back(ArgKind(ArgKind::Matcher));
+  }
+
+private:
+  const unsigned minCount;
+  const unsigned maxCount;
+  const VarOp varOp;
+  const StringRef matcherName;
+};
+
 // Helper function to check if argument count matches expected count
 inline bool checkArgCount(SourceRange nameRange, size_t expectedArgCount,
                           llvm::ArrayRef<ParserValue> args,
@@ -224,6 +277,14 @@ makeMatcherAutoMarshall(ReturnType (*matcherFunc)(ArgTypes...),
       reinterpret_cast<void (*)()>(matcherFunc), matcherName, argKinds);
 }
 
+// Variadic operator overload.
+template <unsigned MinCount, unsigned MaxCount>
+std::unique_ptr<MatcherDescriptor>
+makeMatcherAutoMarshall(VariadicOperatorMatcherFunc<MinCount, MaxCount> func,
+                        StringRef matcherName) {
+  return std::make_unique<VariadicOperatorMatcherDescriptor>(
+      MinCount, MaxCount, func.varOp, matcherName);
+}
 } // namespace mlir::query::matcher::internal
 
 #endif // MLIR_TOOLS_MLIRQUERY_MATCHER_MARSHALLERS_H
diff --git a/mlir/include/mlir/Query/Matcher/MatchFinder.h b/mlir/include/mlir/Query/Matcher/MatchFinder.h
index f8abf20ef60b..6d06ca13d134 100644
--- a/mlir/include/mlir/Query/Matcher/MatchFinder.h
+++ b/mlir/include/mlir/Query/Matcher/MatchFinder.h
@@ -21,7 +21,9 @@
 
 namespace mlir::query::matcher {
 
-/// A class that provides utilities to find operations in the IR.
+/// Finds and collects matches from the IR. After construction
+/// `collectMatches` can be used to traverse the IR and apply
+/// matchers.
 class MatchFinder {
 
 public:
diff --git a/mlir/include/mlir/Query/Matcher/MatchersInternal.h b/mlir/include/mlir/Query/Matcher/MatchersInternal.h
index 183b2514e109..88109430b6fe 100644
--- a/mlir/include/mlir/Query/Matcher/MatchersInternal.h
+++ b/mlir/include/mlir/Query/Matcher/MatchersInternal.h
@@ -8,11 +8,11 @@
 //
 // Implements the base layer of the matcher framework.
 //
-// Matchers are methods that return a Matcher which provides a method one of the
-// following methods: match(Operation *op), match(Operation *op,
-// SetVector<Operation *> &matchedOps)
+// Matchers are methods that return a Matcher which provides a
+// `match(...)` method whose parameters define the context of the match.
+// Support includes simple (unary) matchers as well as matcher combinators
+// (anyOf, allOf, etc.)
 //
-// The matcher functions are defined in include/mlir/IR/Matchers.h.
 // This file contains the wrapper classes needed to construct matchers for
 // mlir-query.
 //
@@ -25,6 +25,15 @@
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 
 namespace mlir::query::matcher {
+class DynMatcher;
+namespace internal {
+
+bool allOfVariadicOperator(Operation *op, SetVector<Operation *> *matchedOps,
+                           ArrayRef<DynMatcher> innerMatchers);
+bool anyOfVariadicOperator(Operation *op, SetVector<Operation *> *matchedOps,
+                           ArrayRef<DynMatcher> innerMatchers);
+
+} // namespace internal
 
 // Defaults to false if T has no match() method with the signature:
 // match(Operation* op).
@@ -84,6 +93,27 @@ private:
   MatcherFn matcherFn;
 };
 
+// VariadicMatcher takes a vector of Matchers and returns true if any Matchers
+// match the given operation.
+using VariadicOperatorFunction = bool (*)(Operation *op,
+                                          SetVector<Operation *> *matchedOps,
+                                          ArrayRef<DynMatcher> innerMatchers);
+
+template <VariadicOperatorFunction Func>
+class VariadicMatcher : public MatcherInterface {
+public:
+  VariadicMatcher(std::vector<DynMatcher> matchers)
+      : matchers(std::move(matchers)) {}
+
+  bool match(Operation *op) override { return Func(op, nullptr, matchers); }
+  bool match(Operation *op, SetVector<Operation *> &matchedOps) override {
+    return Func(op, &matchedOps, matchers);
+  }
+
+private:
+  std::vector<DynMatcher> matchers;
+};
+
 // Matcher wraps a MatcherInterface implementation and provides match()
 // methods that redirect calls to the underlying implementation.
 class DynMatcher {
@@ -92,6 +122,31 @@ public:
   DynMatcher(MatcherInterface *implementation)
       : implementation(implementation) {}
 
+  // Construct from a variadic function.
+  enum VariadicOperator {
+    // Matches operations for which all provided matchers match.
+    AllOf,
+    // Matches operations for which at least one of the provided matchers
+    // matches.
+    AnyOf
+  };
+
+  static std::unique_ptr<DynMatcher>
+  constructVariadic(VariadicOperator Op,
+                    std::vector<DynMatcher> innerMatchers) {
+    switch (Op) {
+    case AllOf:
+      return std::make_unique<DynMatcher>(
+          new VariadicMatcher<internal::allOfVariadicOperator>(
+              std::move(innerMatchers)));
+    case AnyOf:
+      return std::make_unique<DynMatcher>(
+          new VariadicMatcher<internal::anyOfVariadicOperator>(
+              std::move(innerMatchers)));
+    }
+    llvm_unreachable("Invalid Op value.");
+  }
+
   template <typename MatcherFn>
   static std::unique_ptr<DynMatcher>
   constructDynMatcherFromMatcherFn(MatcherFn &matcherFn) {
@@ -113,6 +168,59 @@ private:
   std::string functionName;
 };
 
+// VariadicOperatorMatcher related types.
+template <typename... Ps>
+class VariadicOperatorMatcher {
+public:
+  VariadicOperatorMatcher(DynMatcher::VariadicOperator varOp, Ps &&...params)
+      : varOp(varOp), params(std::forward<Ps>(params)...) {}
+
+  operator std::unique_ptr<DynMatcher>() const & {
+    return DynMatcher::constructVariadic(
+        varOp, getMatchers(std::index_sequence_for<Ps...>()));
+  }
+
+  operator std::unique_ptr<DynMatcher>() && {
+    return DynMatcher::constructVariadic(
+        varOp, std::move(*this).getMatchers(std::index_sequence_for<Ps...>()));
+  }
+
+private:
+  // Helper method to unpack the tuple into a vector.
+  template <std::size_t... Is>
+  std::vector<DynMatcher> getMatchers(std::index_sequence<Is...>) const & {
+    return {DynMatcher(std::get<Is>(params))...};
+  }
+
+  template <std::size_t... Is>
+  std::vector<DynMatcher> getMatchers(std::index_sequence<Is...>) && {
+    return {DynMatcher(std::get<Is>(std::move(params)))...};
+  }
+
+  const DynMatcher::VariadicOperator varOp;
+  std::tuple<Ps...> params;
+};
+
+// Overloaded function object to generate VariadicOperatorMatcher objects from
+// arbitrary matchers.
+template <unsigned MinCount, unsigned MaxCount>
+struct VariadicOperatorMatcherFunc {
+  DynMatcher::VariadicOperator varOp;
+
+  template <typename... Ms>
+  VariadicOperatorMatcher<Ms...> operator()(Ms &&...Ps) const {
+    static_assert(MinCount <= sizeof...(Ms) && sizeof...(Ms) <= MaxCount,
+                  "invalid number of parameters for variadic matcher");
+    return VariadicOperatorMatcher<Ms...>(varOp, std::forward<Ms>(Ps)...);
+  }
+};
+
+namespace internal {
+const VariadicOperatorMatcherFunc<1, std::numeric_limits<unsigned>::max()>
+    anyOf = {DynMatcher::AnyOf};
+const VariadicOperatorMatcherFunc<1, std::numeric_limits<unsigned>::max()>
+    allOf = {DynMatcher::AllOf};
+} // namespace internal
 } // namespace mlir::query::matcher
 
 #endif // MLIR_TOOLS_MLIRQUERY_MATCHER_MATCHERSINTERNAL_H
diff --git a/mlir/include/mlir/Query/Matcher/SliceMatchers.h b/mlir/include/mlir/Query/Matcher/SliceMatchers.h
index 441205b3a961..7181648f06f8 100644
--- a/mlir/include/mlir/Query/Matcher/SliceMatchers.h
+++ b/mlir/include/mlir/Query/Matcher/SliceMatchers.h
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file provides matchers for MLIRQuery that peform slicing analysis
+// This file defines slicing-analysis matchers that extend and abstract the
+// core implementations from `SliceAnalysis.h`.
 //
 //===----------------------------------------------------------------------===//
 
@@ -16,9 +17,9 @@
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/IR/Operation.h"
 
-/// A matcher encapsulating `getBackwardSlice` method from SliceAnalysis.h.
-/// Additionally, it limits the slice computation to a certain depth level using
-/// a custom filter.
+/// Computes the backward-slice of all transitive defs reachable from `rootOp`,
+/// if `innerMatcher` matches. The traversal stops once the desired depth level
+/// is reached.
 ///
 /// Example: starting from node 9, assuming the matcher
 /// computes the slice for the first two depth levels:
@@ -119,6 +120,77 @@ bool BackwardSliceMatcher<Matcher>::matches(
                            : backwardSlice.size() >= 1;
 }
 
+/// Computes the backward-slice of all transitive defs reachable from `rootOp`,
+/// if `innerMatcher` matches. Traversal stops where `filterMatcher` matches.
+template <typename BaseMatcher, typename Filter>
+class PredicateBackwardSliceMatcher {
+public:
+  PredicateBackwardSliceMatcher(BaseMatcher innerMatcher, Filter filterMatcher,
+                                bool inclusive, bool omitBlockArguments,
+                                bool omitUsesFromAbove)
+      : innerMatcher(std::move(innerMatcher)),
+        filterMatcher(std::move(filterMatcher)), inclusive(inclusive),
+        omitBlockArguments(omitBlockArguments),
+        omitUsesFromAbove(omitUsesFromAbove) {}
+
+  bool match(Operation *rootOp, SetVector<Operation *> &backwardSlice) {
+    backwardSlice.clear();
+    BackwardSliceOptions options;
+    options.inclusive = inclusive;
+    options.omitUsesFromAbove = omitUsesFromAbove;
+    options.omitBlockArguments = omitBlockArguments;
+    if (innerMatcher.match(rootOp)) {
+      options.filter = [&](Operation *subOp) {
+        return !filterMatcher.match(subOp);
+      };
+      LogicalResult result = getBackwardSlice(rootOp, &backwardSlice, options);
+      assert(result.succeeded() && "expected backward slice to succeed");
+      (void)result;
+      return options.inclusive ? backwardSlice.size() > 1
+                               : backwardSlice.size() >= 1;
+    }
+    return false;
+  }
+
+private:
+  BaseMatcher innerMatcher;
+  Filter filterMatcher;
+  bool inclusive;
+  bool omitBlockArguments;
+  bool omitUsesFromAbove;
+};
+
+/// Computes the forward-slice of all users reachable from `rootOp`,
+/// if `innerMatcher` matches. Traversal stops where `filterMatcher` matches.
+template <typename BaseMatcher, typename Filter>
+class PredicateForwardSliceMatcher {
+public:
+  PredicateForwardSliceMatcher(BaseMatcher innerMatcher, Filter filterMatcher,
+                               bool inclusive)
+      : innerMatcher(std::move(innerMatcher)),
+        filterMatcher(std::move(filterMatcher)), inclusive(inclusive) {}
+
+  bool match(Operation *rootOp, SetVector<Operation *> &forwardSlice) {
+    forwardSlice.clear();
+    ForwardSliceOptions options;
+    options.inclusive = inclusive;
+    if (innerMatcher.match(rootOp)) {
+      options.filter = [&](Operation *subOp) {
+        return !filterMatcher.match(subOp);
+      };
+      getForwardSlice(rootOp, &forwardSlice, options);
+      return options.inclusive ? forwardSlice.size() > 1
+                               : forwardSlice.size() >= 1;
+    }
+    return false;
+  }
+
+private:
+  BaseMatcher innerMatcher;
+  Filter filterMatcher;
+  bool inclusive;
+};
+
 /// Matches transitive defs of a top-level operation up to N levels.
 template <typename Matcher>
 inline BackwardSliceMatcher<Matcher>
@@ -130,7 +202,7 @@ m_GetDefinitions(Matcher innerMatcher, int64_t maxDepth, bool inclusive,
                                        omitUsesFromAbove);
 }
 
-/// Matches all transitive defs of a top-level operation up to N levels
+/// Matches all transitive defs of a top-level operation up to N levels.
 template <typename Matcher>
 inline BackwardSliceMatcher<Matcher> m_GetAllDefinitions(Matcher innerMatcher,
                                                          int64_t maxDepth) {
@@ -139,6 +211,28 @@ inline BackwardSliceMatcher<Matcher> m_GetAllDefinitions(Matcher innerMatcher,
                                        false, false);
 }
 
+/// Matches all transitive defs of a top-level operation and stops where
+/// `filterMatcher` rejects.
+template <typename BaseMatcher, typename Filter>
+inline PredicateBackwardSliceMatcher<BaseMatcher, Filter>
+m_GetDefinitionsByPredicate(BaseMatcher innerMatcher, Filter filterMatcher,
+                            bool inclusive, bool omitBlockArguments,
+                            bool omitUsesFromAbove) {
+  return PredicateBackwardSliceMatcher<BaseMatcher, Filter>(
+      std::move(innerMatcher), std::move(filterMatcher), inclusive,
+      omitBlockArguments, omitUsesFromAbove);
+}
+
+/// Matches all users of a top-level operation and stops where
+/// `filterMatcher` rejects.
+template <typename BaseMatcher, typename Filter>
+inline PredicateForwardSliceMatcher<BaseMatcher, Filter>
+m_GetUsersByPredicate(BaseMatcher innerMatcher, Filter filterMatcher,
+                      bool inclusive) {
+  return PredicateForwardSliceMatcher<BaseMatcher, Filter>(
+      std::move(innerMatcher), std::move(filterMatcher), inclusive);
+}
+
 } // namespace mlir::query::matcher
 
 #endif // MLIR_TOOLS_MLIRQUERY_MATCHERS_SLICEMATCHERS_H
diff --git a/mlir/include/mlir/Query/Matcher/VariantValue.h b/mlir/include/mlir/Query/Matcher/VariantValue.h
index 98c0a18e2510..1a47576de184 100644
--- a/mlir/include/mlir/Query/Matcher/VariantValue.h
+++ b/mlir/include/mlir/Query/Matcher/VariantValue.h
@@ -26,7 +26,12 @@ enum class ArgKind { Boolean, Matcher, Signed, String };
 // A variant matcher object to abstract simple and complex matchers into a
 // single object type.
 class VariantMatcher {
-  class MatcherOps;
+  class MatcherOps {
+  public:
+    std::optional<DynMatcher>
+    constructVariadicOperator(DynMatcher::VariadicOperator varOp,
+                              ArrayRef<VariantMatcher> innerMatchers) const;
+  };
 
   // Payload interface to be specialized by each matcher type. It follows a
   // similar interface as VariantMatcher itself.
@@ -43,6 +48,9 @@ public:
 
   // Clones the provided matcher.
   static VariantMatcher SingleMatcher(DynMatcher matcher);
+  static VariantMatcher
+  VariadicOperatorMatcher(DynMatcher::VariadicOperator varOp,
+                          ArrayRef<VariantMatcher> args);
 
   // Makes the matcher the "null" matcher.
   void reset();
@@ -61,6 +69,7 @@ private:
       : value(std::move(value)) {}
 
   class SinglePayload;
+  class VariadicOpPayload;
 
   std::shared_ptr<const Payload> value;
 };
diff --git a/mlir/lib/Query/Matcher/CMakeLists.txt b/mlir/lib/Query/Matcher/CMakeLists.txt
index 629479bf7adc..ba202762fdfb 100644
--- a/mlir/lib/Query/Matcher/CMakeLists.txt
+++ b/mlir/lib/Query/Matcher/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_library(MLIRQueryMatcher
   MatchFinder.cpp
+  MatchersInternal.cpp
   Parser.cpp
   RegistryManager.cpp
   VariantValue.cpp
diff --git a/mlir/lib/Query/Matcher/MatchersInternal.cpp b/mlir/lib/Query/Matcher/MatchersInternal.cpp
new file mode 100644
index 000000000000..01f412ade846
--- /dev/null
+++ b/mlir/lib/Query/Matcher/MatchersInternal.cpp
@@ -0,0 +1,33 @@
+//===--- MatchersInternal.cpp----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Query/Matcher/MatchersInternal.h"
+#include "llvm/ADT/SetVector.h"
+
+namespace mlir::query::matcher {
+
+namespace internal {
+
+bool allOfVariadicOperator(Operation *op, SetVector<Operation *> *matchedOps,
+                           ArrayRef<DynMatcher> innerMatchers) {
+  return llvm::all_of(innerMatchers, [&](const DynMatcher &matcher) {
+    if (matchedOps)
+      return matcher.match(op, *matchedOps);
+    return matcher.match(op);
+  });
+}
+bool anyOfVariadicOperator(Operation *op, SetVector<Operation *> *matchedOps,
+                           ArrayRef<DynMatcher> innerMatchers) {
+  return llvm::any_of(innerMatchers, [&](const DynMatcher &matcher) {
+    if (matchedOps)
+      return matcher.match(op, *matchedOps);
+    return matcher.match(op);
+  });
+}
+} // namespace internal
+} // namespace mlir::query::matcher
diff --git a/mlir/lib/Query/Matcher/RegistryManager.cpp b/mlir/lib/Query/Matcher/RegistryManager.cpp
index 4b511c5f009e..08b610453b11 100644
--- a/mlir/lib/Query/Matcher/RegistryManager.cpp
+++ b/mlir/lib/Query/Matcher/RegistryManager.cpp
@@ -64,7 +64,7 @@ std::vector<ArgKind> RegistryManager::getAcceptedCompletionTypes(
     unsigned argNumber = ctxEntry.second;
     std::vector<ArgKind> nextTypeSet;
 
-    if (argNumber < ctor->getNumArgs())
+    if (ctor->isVariadic() || argNumber < ctor->getNumArgs())
       ctor->getArgKinds(argNumber, nextTypeSet);
 
     typeSet.insert(nextTypeSet.begin(), nextTypeSet.end());
@@ -83,7 +83,7 @@ RegistryManager::getMatcherCompletions(llvm::ArrayRef<ArgKind> acceptedTypes,
     const internal::MatcherDescriptor &matcher = *m.getValue();
     llvm::StringRef name = m.getKey();
 
-    unsigned numArgs = matcher.getNumArgs();
+    unsigned numArgs = matcher.isVariadic() ? 1 : matcher.getNumArgs();
     std::vector<std::vector<ArgKind>> argKinds(numArgs);
 
     for (const ArgKind &kind : acceptedTypes) {
@@ -115,6 +115,9 @@ RegistryManager::getMatcherCompletions(llvm::ArrayRef<ArgKind> acceptedTypes,
       }
     }
 
+    if (matcher.isVariadic())
+      os << ",...";
+
     os << ")";
     typedText += "(";
 
diff --git a/mlir/lib/Query/Matcher/VariantValue.cpp b/mlir/lib/Query/Matcher/VariantValue.cpp
index 1cb2d48f9d56..7bf4774dba83 100644
--- a/mlir/lib/Query/Matcher/VariantValue.cpp
+++ b/mlir/lib/Query/Matcher/VariantValue.cpp
@@ -27,12 +27,64 @@ private:
   DynMatcher matcher;
 };
 
+class VariantMatcher::VariadicOpPayload : public VariantMatcher::Payload {
+public:
+  VariadicOpPayload(DynMatcher::VariadicOperator varOp,
+                    std::vector<VariantMatcher> args)
+      : varOp(varOp), args(std::move(args)) {}
+
+  std::optional<DynMatcher> getDynMatcher() const override {
+    std::vector<DynMatcher> dynMatchers;
+    for (auto variantMatcher : args) {
+      std::optional<DynMatcher> dynMatcher = variantMatcher.getDynMatcher();
+      if (dynMatcher)
+        dynMatchers.push_back(dynMatcher.value());
+    }
+    auto result = DynMatcher::constructVariadic(varOp, dynMatchers);
+    return *result;
+  }
+
+  std::string getTypeAsString() const override {
+    std::string inner;
+    llvm::interleave(
+        args, [&](auto const &arg) { inner += arg.getTypeAsString(); },
+        [&] { inner += " & "; });
+    return inner;
+  }
+
+private:
+  const DynMatcher::VariadicOperator varOp;
+  const std::vector<VariantMatcher> args;
+};
+
 VariantMatcher::VariantMatcher() = default;
 
 VariantMatcher VariantMatcher::SingleMatcher(DynMatcher matcher) {
   return VariantMatcher(std::make_shared<SinglePayload>(std::move(matcher)));
 }
 
+VariantMatcher
+VariantMatcher::VariadicOperatorMatcher(DynMatcher::VariadicOperator varOp,
+                                        ArrayRef<VariantMatcher> args) {
+  return VariantMatcher(
+      std::make_shared<VariadicOpPayload>(varOp, std::move(args)));
+}
+
+std::optional<DynMatcher> VariantMatcher::MatcherOps::constructVariadicOperator(
+    DynMatcher::VariadicOperator varOp,
+    ArrayRef<VariantMatcher> innerMatchers) const {
+  std::vector<DynMatcher> dynMatchers;
+  for (const auto &innerMatcher : innerMatchers) {
+    if (!innerMatcher.value)
+      return std::nullopt;
+    std::optional<DynMatcher> inner = innerMatcher.value->getDynMatcher();
+    if (!inner)
+      return std::nullopt;
+    dynMatchers.push_back(*inner);
+  }
+  return *DynMatcher::constructVariadic(varOp, dynMatchers);
+}
+
 std::optional<DynMatcher> VariantMatcher::getDynMatcher() const {
   return value ? value->getDynMatcher() : std::nullopt;
 }
diff --git a/mlir/lib/Query/Query.cpp b/mlir/lib/Query/Query.cpp
index 803284d6df86..637e1f3cdef8 100644
--- a/mlir/lib/Query/Query.cpp
+++ b/mlir/lib/Query/Query.cpp
@@ -10,6 +10,7 @@
 #include "QueryParser.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Verifier.h"
 #include "mlir/Query/Matcher/MatchFinder.h"
 #include "mlir/Query/QuerySession.h"
 #include "llvm/ADT/SetVector.h"
@@ -68,6 +69,8 @@ static Operation *extractFunction(std::vector<Operation *> &ops,
   // Clone operations and build function body
   std::vector<Operation *> clonedOps;
   std::vector<Value> clonedVals;
+  // TODO: Handle extraction of operations with compute payloads defined via
+  // regions.
   for (Operation *slicedOp : slice) {
     Operation *clonedOp =
         clonedOps.emplace_back(builder.clone(*slicedOp, mapper));
@@ -129,6 +132,8 @@ LogicalResult MatchQuery::run(llvm::raw_ostream &os, QuerySession &qs) const {
         finder.flattenMatchedOps(matches);
     Operation *function =
         extractFunction(flattenedMatches, rootOp->getContext(), functionName);
+    if (failed(verify(function)))
+      return mlir::failure();
     os << "\n" << *function << "\n\n";
     function->erase();
     return mlir::success();
diff --git a/mlir/test/mlir-query/complex-test.mlir b/mlir/test/mlir-query/backward-slice-union.mlir
similarity index 71%
rename from mlir/test/mlir-query/complex-test.mlir
rename to mlir/test/mlir-query/backward-slice-union.mlir
index ad96f03747a4..f8f88c204374 100644
--- a/mlir/test/mlir-query/complex-test.mlir
+++ b/mlir/test/mlir-query/backward-slice-union.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-query %s -c "m getAllDefinitions(hasOpName(\"arith.addf\"),2)" | FileCheck %s
+// RUN: mlir-query %s -c "m anyOf(getAllDefinitions(hasOpName(\"arith.addf\"),2),getAllDefinitions(hasOpName(\"tensor.extract\"),1))" | FileCheck %s
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
 func.func @slice_use_from_above(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>) {
@@ -19,14 +19,23 @@ func.func @slice_use_from_above(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>)
 }
 
 // CHECK: Match #1:
-
 // CHECK: %[[LINALG:.*]] = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} 
 // CHECK-SAME: ins(%arg0 : tensor<5x5xf32>) outs(%arg1 : tensor<5x5xf32>)
+
+// CHECK: {{.*}}.mlir:7:10: note: "root" binds here
 // CHECK: %[[ADDF1:.*]] = arith.addf %in, %in : f32
 
 // CHECK: Match #2:
+// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[LINALG]] {{\[\[.*\]\]}} : tensor<5x5xf32> into tensor<25xf32>
+// CHECK: %[[C2:.*]] = arith.constant {{.*}} : index
 
+// CHECK: {{.*}}.mlir:14:18: note: "root" binds here
+// CHECK: %[[EXTRACTED:.*]] = tensor.extract %[[COLLAPSED]][%[[C2]]] : tensor<25xf32>
+
+// CHECK: Match #3:
 // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[LINALG]] {{\[\[.*\]\]}} : tensor<5x5xf32> into tensor<25xf32>
 // CHECK: %[[C2:.*]] = arith.constant {{.*}} : index
 // CHECK: %[[EXTRACTED:.*]] = tensor.extract %[[COLLAPSED]][%[[C2]]] : tensor<25xf32>
+
+// CHECK: {{.*}}.mlir:15:10: note: "root" binds here
 // CHECK: %[[ADDF2:.*]] = arith.addf %[[EXTRACTED]], %[[EXTRACTED]] : f32  
diff --git a/mlir/test/mlir-query/forward-slice-by-predicate.mlir b/mlir/test/mlir-query/forward-slice-by-predicate.mlir
new file mode 100644
index 000000000000..e11378da89d9
--- /dev/null
+++ b/mlir/test/mlir-query/forward-slice-by-predicate.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-query %s -c "m getUsersByPredicate(anyOf(hasOpName(\"memref.alloc\"),isConstantOp()),anyOf(hasOpName(\"affine.load\"), hasOpName(\"memref.dealloc\")),true)" | FileCheck %s
+
+func.func @slice_depth1_loop_nest_with_offsets() {
+  %0 = memref.alloc() : memref<100xf32>
+  %cst = arith.constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    %a0 = affine.apply affine_map<(d0) -> (d0 + 2)>(%i0)
+    affine.store %cst, %0[%a0] : memref<100xf32>
+  }
+  affine.for %i1 = 4 to 8 {
+    %a1 = affine.apply affine_map<(d0) -> (d0 - 1)>(%i1)
+    %1 = affine.load %0[%a1] : memref<100xf32>
+  }
+  return
+}
+
+// CHECK: Match #1:
+// CHECK: {{.*}}.mlir:4:8: note: "root" binds here
+// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<100xf32>
+
+// CHECK: affine.store %cst, %0[%a0] : memref<100xf32>
+
+// CHECK: Match #2:
+// CHECK: {{.*}}.mlir:5:10: note: "root" binds here
+// CHECK: %[[CST:.*]] = arith.constant 7.000000e+00 : f32
+
+// CHECK: affine.store %[[CST]], %0[%a0] : memref<100xf32>
diff --git a/mlir/test/mlir-query/logical-operator-test.mlir b/mlir/test/mlir-query/logical-operator-test.mlir
new file mode 100644
index 000000000000..ac05428287ab
--- /dev/null
+++ b/mlir/test/mlir-query/logical-operator-test.mlir
@@ -0,0 +1,11 @@
+// RUN: mlir-query %s -c "m allOf(hasOpName(\"memref.alloca\"), hasOpAttrName(\"alignment\"))" | FileCheck %s
+
+func.func @dynamic_alloca(%arg0: index, %arg1: index) -> memref<?x?xf32> {
+  %0 = memref.alloca(%arg0, %arg1) : memref<?x?xf32>
+  memref.alloca(%arg0, %arg1) {alignment = 32} : memref<?x?xf32>
+  return %0 : memref<?x?xf32>
+}
+
+// CHECK: Match #1:
+// CHECK: {{.*}}.mlir:5:3: note: "root" binds here
+// CHECK: memref.alloca(%arg0, %arg1) {alignment = 32} : memref<?x?xf32>
diff --git a/mlir/test/mlir-query/slice-function-extraction.mlir b/mlir/test/mlir-query/slice-function-extraction.mlir
new file mode 100644
index 000000000000..e55d5e77c573
--- /dev/null
+++ b/mlir/test/mlir-query/slice-function-extraction.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-query %s -c "m getDefinitionsByPredicate(hasOpName(\"memref.store\"),hasOpName(\"memref.alloc\"),true,false,false).extract(\"backward_slice\")" | FileCheck %s
+
+// CHECK:       func.func @backward_slice(%{{.*}}: memref<10xf32>) -> (f32, index, index, f32, index, index, f32) {
+// CHECK:         %[[CST0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-NEXT:    %[[C0:.*]] = arith.constant 0 : index
+// CHECK-NEXT:    %[[I0:.*]] = affine.apply affine_map<()[s0] -> (s0)>()[%[[C0]]]
+// CHECK-NEXT:    memref.store %[[CST0]], %{{.*}}[%[[I0]]] : memref<10xf32>
+// CHECK-NEXT:    %[[CST2:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-NEXT:    %[[I1:.*]] = affine.apply affine_map<() -> (0)>()
+// CHECK-NEXT:    memref.store %[[CST2]], %{{.*}}[%[[I1]]] : memref<10xf32>
+// CHECK-NEXT:    %[[C1:.*]] = arith.constant 0 : index
+// CHECK-NEXT:    %[[LOAD:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<10xf32>
+// CHECK-NEXT:    memref.store %[[LOAD]], %{{.*}}[%[[C1]]] : memref<10xf32>
+// CHECK-NEXT:    return %[[CST0]], %[[C0]], %[[I0]], %[[CST2]], %[[I1]], %[[C1]], %[[LOAD]] : f32, index, index, f32, index, index, f32
+
+func.func @slicing_memref_store_trivial() {
+  %0 = memref.alloc() : memref<10xf32>
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  affine.for %i1 = 0 to 10 {
+    %1 = affine.apply affine_map<()[s0] -> (s0)>()[%c0]
+    memref.store %cst, %0[%1] : memref<10xf32>
+    %2 = memref.load %0[%c0] : memref<10xf32>
+    %3 = affine.apply affine_map<()[] -> (0)>()[]
+    memref.store %cst, %0[%3] : memref<10xf32>
+    memref.store %2, %0[%c0] : memref<10xf32>
+  }
+  return
+}
diff --git a/mlir/tools/mlir-query/mlir-query.cpp b/mlir/tools/mlir-query/mlir-query.cpp
index 78c0ec97c0cd..8a17a33c6183 100644
--- a/mlir/tools/mlir-query/mlir-query.cpp
+++ b/mlir/tools/mlir-query/mlir-query.cpp
@@ -40,12 +40,22 @@ int main(int argc, char **argv) {
   query::matcher::Registry matcherRegistry;
 
   // Matchers registered in alphabetical order for consistency:
+  matcherRegistry.registerMatcher("allOf", query::matcher::internal::allOf);
+  matcherRegistry.registerMatcher("anyOf", query::matcher::internal::anyOf);
+  matcherRegistry.registerMatcher(
+      "getAllDefinitions",
+      query::matcher::m_GetAllDefinitions<query::matcher::DynMatcher>);
   matcherRegistry.registerMatcher(
       "getDefinitions",
       query::matcher::m_GetDefinitions<query::matcher::DynMatcher>);
   matcherRegistry.registerMatcher(
-      "getAllDefinitions",
-      query::matcher::m_GetAllDefinitions<query::matcher::DynMatcher>);
+      "getDefinitionsByPredicate",
+      query::matcher::m_GetDefinitionsByPredicate<query::matcher::DynMatcher,
+                                                  query::matcher::DynMatcher>);
+  matcherRegistry.registerMatcher(
+      "getUsersByPredicate",
+      query::matcher::m_GetUsersByPredicate<query::matcher::DynMatcher,
+                                            query::matcher::DynMatcher>);
   matcherRegistry.registerMatcher("hasOpAttrName",
                                   static_cast<HasOpAttrName *>(m_Attr));
   matcherRegistry.registerMatcher("hasOpName", static_cast<HasOpName *>(m_Op));

From 087d83e0c6d94c1ad6a68b089950d05185d0e043 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu@arm.com>
Date: Tue, 17 Jun 2025 13:20:52 +0100
Subject: [PATCH 0629/1322] [SLP] vectorizeStores: Name things a bit more
 clearly (NFC) (#144511)

I believe the new variable names better convey their purpose. However, I
also believe that function is more complex than it needs to be, and this
tiny patch should be seen as a first step towards (maybe) further
refactoring.

The previous names were very generic (Size, Sz, Cnt, StartIdx). This
made it easy to get confused given that the vecotrizeStores() function
is already complex enough.

My hope would be to eventually have a function concise enough to clearly
see what are the different strategies being attempted to vectorise a
group of related store instructions.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 113 ++++++++++--------
 1 file changed, 63 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c3ca22dce0cc..9a7e9b75da51 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21191,25 +21191,30 @@ bool SLPVectorizerPass::vectorizeStores(
         ++Repeat;
         bool RepeatChanged = false;
         bool AnyProfitableGraph = false;
-        for (unsigned Size : CandidateVFs) {
+        for (unsigned VF : CandidateVFs) {
           AnyProfitableGraph = false;
-          unsigned StartIdx = std::distance(
-              RangeSizes.begin(),
-              find_if(RangeSizes,
-                      std::bind(IsNotVectorized, Size >= MaxRegVF, _1)));
-          while (StartIdx < End) {
-            unsigned EndIdx = std::distance(
+          unsigned FirstUnvecStore =
+              std::distance(RangeSizes.begin(),
+                            find_if(RangeSizes, std::bind(IsNotVectorized,
+                                                          VF >= MaxRegVF, _1)));
+
+          // Form slices of size VF starting from FirstUnvecStore and try to
+          // vectorize them.
+          while (FirstUnvecStore < End) {
+            unsigned FirstVecStore = std::distance(
                 RangeSizes.begin(),
-                find_if(RangeSizes.drop_front(StartIdx),
-                        std::bind(IsVectorized, Size >= MaxRegVF, _1)));
-            unsigned Sz = EndIdx >= End ? End : EndIdx;
-            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
-              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
-                                  Size >= MaxRegVF)) {
-                ++Cnt;
+                find_if(RangeSizes.drop_front(FirstUnvecStore),
+                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
+            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
+            for (unsigned SliceStartIdx = FirstUnvecStore;
+                 SliceStartIdx + VF <= MaxSliceEnd;) {
+              if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
+                                  VF >= MaxRegVF)) {
+                ++SliceStartIdx;
                 continue;
               }
-              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+              ArrayRef<Value *> Slice =
+                  ArrayRef(Operands).slice(SliceStartIdx, VF);
               assert(all_of(Slice,
                             [&](Value *V) {
                               return cast<StoreInst>(V)
@@ -21223,19 +21228,23 @@ bool SLPVectorizerPass::vectorizeStores(
               if (!NonSchedulable.empty()) {
                 auto [NonSchedSizeMax, NonSchedSizeMin] =
                     NonSchedulable.lookup(Slice.front());
-                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
-                  Cnt += NonSchedSizeMax;
+                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
+                  // VF is too ambitious. Try to vectorize another slice before
+                  // trying a smaller VF.
+                  SliceStartIdx += NonSchedSizeMax;
                   continue;
                 }
               }
               unsigned TreeSize;
               std::optional<bool> Res =
-                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
+                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
               if (!Res) {
+                // Update the range of non schedulable VFs for slices starting
+                // at SliceStartIdx.
                 NonSchedulable
-                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
+                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
                     .first->getSecond()
-                    .second = Size;
+                    .second = VF;
               } else if (*Res) {
                 // Mark the vectorized stores so that we don't vectorize them
                 // again.
@@ -21246,63 +21255,67 @@ bool SLPVectorizerPass::vectorizeStores(
                 // If we vectorized initial block, no need to try to vectorize
                 // it again.
                 for (std::pair<unsigned, unsigned> &P :
-                     RangeSizes.slice(Cnt, Size))
+                     RangeSizes.slice(SliceStartIdx, VF))
                   P.first = P.second = 0;
-                if (Cnt < StartIdx + MinVF) {
-                  for (std::pair<unsigned, unsigned> &P :
-                       RangeSizes.slice(StartIdx, Cnt - StartIdx))
+                if (SliceStartIdx < FirstUnvecStore + MinVF) {
+                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
+                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                     P.first = P.second = 0;
-                  StartIdx = Cnt + Size;
+                  FirstUnvecStore = SliceStartIdx + VF;
                 }
-                if (Cnt > Sz - Size - MinVF) {
+                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                   for (std::pair<unsigned, unsigned> &P :
-                       RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)))
+                       RangeSizes.slice(SliceStartIdx + VF,
+                                        MaxSliceEnd - (SliceStartIdx + VF)))
                     P.first = P.second = 0;
-                  if (Sz == End)
-                    End = Cnt;
-                  Sz = Cnt;
+                  if (MaxSliceEnd == End)
+                    End = SliceStartIdx;
+                  MaxSliceEnd = SliceStartIdx;
                 }
-                Cnt += Size;
+                SliceStartIdx += VF;
                 continue;
               }
-              if (Size > 2 && Res &&
-                  !all_of(RangeSizes.slice(Cnt, Size),
-                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
+              if (VF > 2 && Res &&
+                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
+                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                     _1))) {
-                Cnt += Size;
+                SliceStartIdx += VF;
                 continue;
               }
               // Check for the very big VFs that we're not rebuilding same
               // trees, just with larger number of elements.
-              if (Size > MaxRegVF && TreeSize > 1 &&
-                  all_of(RangeSizes.slice(Cnt, Size),
+              if (VF > MaxRegVF && TreeSize > 1 &&
+                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(FirstSizeSame, TreeSize, _1))) {
-                Cnt += Size;
-                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
-                  ++Cnt;
+                SliceStartIdx += VF;
+                while (SliceStartIdx != MaxSliceEnd &&
+                       RangeSizes[SliceStartIdx].first == TreeSize)
+                  ++SliceStartIdx;
                 continue;
               }
-              if (TreeSize > 1)
+              if (TreeSize > 1) {
                 for (std::pair<unsigned, unsigned> &P :
-                     RangeSizes.slice(Cnt, Size)) {
-                  if (Size >= MaxRegVF)
+                     RangeSizes.slice(SliceStartIdx, VF)) {
+                  if (VF >= MaxRegVF)
                     P.second = std::max(P.second, TreeSize);
                   else
                     P.first = std::max(P.first, TreeSize);
                 }
-              ++Cnt;
+              }
+              ++SliceStartIdx;
               AnyProfitableGraph = true;
             }
-            if (StartIdx >= End)
+            if (FirstUnvecStore >= End)
               break;
-            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
+            if (MaxSliceEnd - FirstUnvecStore < VF &&
+                MaxSliceEnd - FirstUnvecStore >= MinVF)
               AnyProfitableGraph = true;
-            StartIdx = std::distance(
+            FirstUnvecStore = std::distance(
                 RangeSizes.begin(),
-                find_if(RangeSizes.drop_front(Sz),
-                        std::bind(IsNotVectorized, Size >= MaxRegVF, _1)));
+                find_if(RangeSizes.drop_front(MaxSliceEnd),
+                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
           }
-          if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
+          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
             break;
         }
         // All values vectorized - exit.

From cb011d3199e1160ad2706cb5b1d43692fa4784d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?=
 <jmartinezcaamao@gmail.com>
Date: Tue, 17 Jun 2025 14:32:05 +0200
Subject: [PATCH 0630/1322] [CUDA][HIP] Add a __device__ version of
 std::__glibcxx_assert_fail() (#136133)

libstdc++ 15 uses the non-constexpr function
std::__glibcxx_assert_fail() to trigger compilation errors when the
__glibcxx_assert(cond) macro is used in a constantly evaluated context.

Compilation fails when using code from the libstdc++ (such as
std::array) on device code, since these assertions invoke a
non-constexpr host function from device code.

This patch proposes a cuda wrapper header "bits/c++config.h" which adds
a __device__ version of std::__glibcxx_assert_fail().

Solves SWDEV-518041
---
 clang/lib/Headers/CMakeLists.txt              |  1 +
 .../Headers/cuda_wrappers/bits/c++config.h    | 51 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 clang/lib/Headers/cuda_wrappers/bits/c++config.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index c1c9d2e8c7b7..c96d209c1fc0 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -341,6 +341,7 @@ set(cuda_wrapper_files
 )
 
 set(cuda_wrapper_bits_files
+  cuda_wrappers/bits/c++config.h
   cuda_wrappers/bits/shared_ptr_base.h
   cuda_wrappers/bits/basic_string.h
   cuda_wrappers/bits/basic_string.tcc
diff --git a/clang/lib/Headers/cuda_wrappers/bits/c++config.h b/clang/lib/Headers/cuda_wrappers/bits/c++config.h
new file mode 100644
index 000000000000..eafa13a9cc64
--- /dev/null
+++ b/clang/lib/Headers/cuda_wrappers/bits/c++config.h
@@ -0,0 +1,51 @@
+// libstdc++ uses the non-constexpr function std::__glibcxx_assert_fail()
+// to trigger compilation errors when the __glibcxx_assert(cond) macro
+// is used in a constexpr context.
+// Compilation fails when using code from the libstdc++ (such as std::array) on
+// device code, since these assertions invoke a non-constexpr host function from
+// device code.
+//
+// To work around this issue, we declare our own device version of the function
+
+#ifndef __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG
+#define __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG
+
+#include_next <bits/c++config.h>
+
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+
+#ifdef _GLIBCXX_VERBOSE_ASSERT
+__attribute__((device, noreturn)) inline void
+__glibcxx_assert_fail(const char *file, int line, const char *function,
+                      const char *condition) noexcept {
+  if (file && function && condition)
+    __builtin_printf("%s:%d: %s: Assertion '%s' failed.\n", file, line,
+                     function, condition);
+  else if (function)
+    __builtin_printf("%s: Undefined behavior detected.\n", function);
+  __builtin_abort();
+}
+#endif
+
+#endif
+__attribute__((device, noreturn, __always_inline__,
+               __visibility__("default"))) inline void
+__glibcxx_assert_fail(...) noexcept {
+  __builtin_abort();
+}
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#endif

From 3377b56338d93760507e1707ebde48536e28ee1c Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Tue, 17 Jun 2025 08:39:15 -0400
Subject: [PATCH 0631/1322] Revert "[clang] Add managarm support" (#144514)

Reverts llvm/llvm-project#139271

There are multiple failing build bots:
https://lab.llvm.org/buildbot/#/builders/10/builds/7482
https://lab.llvm.org/buildbot/#/builders/11/builds/17473
---
 clang/lib/Basic/Targets.cpp                   |   9 -
 clang/lib/Basic/Targets/OSTargets.h           |  30 --
 clang/lib/Driver/CMakeLists.txt               |   1 -
 clang/lib/Driver/Driver.cpp                   |   4 -
 clang/lib/Driver/ToolChains/Gnu.cpp           |   2 -
 clang/lib/Driver/ToolChains/Managarm.cpp      | 218 --------------
 clang/lib/Driver/ToolChains/Managarm.h        |  55 ----
 clang/lib/Lex/InitHeaderSearch.cpp            |   1 -
 .../lib/aarch64-managarm-mlibc/.keep          |   0
 .../lib/riscv64-managarm-mlibc/.keep          |   0
 .../lib/x86_64-managarm-mlibc/.keep           |   0
 .../lib64/aarch64-managarm-mlibc/.keep        |   0
 .../lib64/riscv64-managarm-mlibc/.keep        |   0
 .../lib64/x86_64-managarm-mlibc/.keep         |   0
 .../aarch64-managarm-mlibc/c++/10/.keep       |   0
 .../usr/include/c++/10/.keep                  |   0
 .../usr/include/c++/v1/.keep                  |   0
 .../riscv64-managarm-mlibc/c++/10/.keep       |   0
 .../x86_64-managarm-mlibc/c++/10/.keep        |   0
 .../usr/lib/aarch64-managarm-mlibc/.keep      |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbegin.o   |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginS.o  |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginT.o  |   0
 .../usr/lib/riscv64-managarm-mlibc/.keep      |   0
 .../usr/lib/x86_64-managarm-mlibc/.keep       |   0
 .../basic_managarm_tree/usr/lib64/.keep       |   0
 clang/test/Driver/managarm.cpp                | 267 ------------------
 clang/test/Preprocessor/init.c                |   5 -
 .../predefined-macros-no-warnings.c           |   3 -
 35 files changed, 595 deletions(-)
 delete mode 100644 clang/lib/Driver/ToolChains/Managarm.cpp
 delete mode 100644 clang/lib/Driver/ToolChains/Managarm.h
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
 delete mode 100644 clang/test/Driver/managarm.cpp

diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index af1111a86330..9889141ad208 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -164,9 +164,6 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
       }
-    case llvm::Triple::Managarm:
-      return std::make_unique<ManagarmTargetInfo<AArch64leTargetInfo>>(Triple,
-                                                                       Opts);
     case llvm::Triple::NetBSD:
       return std::make_unique<NetBSDTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
@@ -469,9 +466,6 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<RISCV64TargetInfo>>(Triple,
                                                                    Opts);
       }
-    case llvm::Triple::Managarm:
-      return std::make_unique<ManagarmTargetInfo<RISCV64TargetInfo>>(Triple,
-                                                                     Opts);
     default:
       return std::make_unique<RISCV64TargetInfo>(Triple, Opts);
     }
@@ -660,9 +654,6 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
       return std::make_unique<PS5OSTargetInfo<X86_64TargetInfo>>(Triple, Opts);
     case llvm::Triple::Hurd:
       return std::make_unique<HurdTargetInfo<X86_64TargetInfo>>(Triple, Opts);
-    case llvm::Triple::Managarm:
-      return std::make_unique<ManagarmTargetInfo<X86_64TargetInfo>>(Triple,
-                                                                    Opts);
     default:
       return std::make_unique<X86_64TargetInfo>(Triple, Opts);
     }
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
index 5dac699c2bb4..d148b38d03c7 100644
--- a/clang/lib/Basic/Targets/OSTargets.h
+++ b/clang/lib/Basic/Targets/OSTargets.h
@@ -395,36 +395,6 @@ public:
   }
 };
 
-// Managarm Target
-template <typename Target>
-class LLVM_LIBRARY_VISIBILITY ManagarmTargetInfo : public OSTargetInfo<Target> {
-protected:
-  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
-                    MacroBuilder &Builder) const override {
-    DefineStd(Builder, "unix", Opts);
-    Builder.defineMacro("__managarm__");
-    if (Opts.POSIXThreads)
-      Builder.defineMacro("_REENTRANT");
-    if (Opts.CPlusPlus)
-      Builder.defineMacro("_GNU_SOURCE");
-    if (this->HasFloat128)
-      Builder.defineMacro("__FLOAT128__");
-  }
-
-public:
-  ManagarmTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
-      : OSTargetInfo<Target>(Triple, Opts) {
-    switch (Triple.getArch()) {
-    default:
-      break;
-    case llvm::Triple::x86:
-    case llvm::Triple::x86_64:
-      this->HasFloat128 = true;
-      break;
-    }
-  }
-};
-
 // NetBSD Target
 template <typename Target>
 class LLVM_LIBRARY_VISIBILITY NetBSDTargetInfo : public OSTargetInfo<Target> {
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 3cfd671e9d8f..44e16edfb1cc 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -66,7 +66,6 @@ add_clang_library(clangDriver
   ToolChains/HLSL.cpp
   ToolChains/Hurd.cpp
   ToolChains/Linux.cpp
-  ToolChains/Managarm.cpp
   ToolChains/MipsLinux.cpp
   ToolChains/MinGW.cpp
   ToolChains/MSP430.cpp
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 780bfc83dc62..2f86b6633df1 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -33,7 +33,6 @@
 #include "ToolChains/Linux.h"
 #include "ToolChains/MSP430.h"
 #include "ToolChains/MSVC.h"
-#include "ToolChains/Managarm.h"
 #include "ToolChains/MinGW.h"
 #include "ToolChains/MipsLinux.h"
 #include "ToolChains/NaCl.h"
@@ -6851,9 +6850,6 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
     case llvm::Triple::Fuchsia:
       TC = std::make_unique<toolchains::Fuchsia>(*this, Target, Args);
       break;
-    case llvm::Triple::Managarm:
-      TC = std::make_unique<toolchains::Managarm>(*this, Target, Args);
-      break;
     case llvm::Triple::Solaris:
       TC = std::make_unique<toolchains::Solaris>(*this, Target, Args);
       break;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index afce4fffe1d5..9203bbc91b0b 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -226,8 +226,6 @@ static const char *getLDMOption(const llvm::Triple &T, const ArgList &Args) {
       return "elf_iamcu";
     return "elf_i386";
   case llvm::Triple::aarch64:
-    if (T.isOSManagarm())
-      return "aarch64managarm";
     return "aarch64linux";
   case llvm::Triple::aarch64_be:
     return "aarch64linuxb";
diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp
deleted file mode 100644
index ff455f2c6ec7..000000000000
--- a/clang/lib/Driver/ToolChains/Managarm.cpp
+++ /dev/null
@@ -1,218 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Managarm.h"
-#include "Arch/ARM.h"
-#include "Arch/RISCV.h"
-#include "clang/Config/config.h"
-#include "clang/Driver/CommonArgs.h"
-#include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
-#include "clang/Driver/SanitizerArgs.h"
-#include "llvm/Option/ArgList.h"
-#include "llvm/Support/Path.h"
-
-using namespace clang::driver;
-using namespace clang::driver::toolchains;
-using namespace clang;
-using namespace llvm::opt;
-
-using tools::addPathIfExists;
-
-std::string Managarm::getMultiarchTriple(const Driver &D,
-                                         const llvm::Triple &TargetTriple,
-                                         StringRef SysRoot) const {
-  switch (TargetTriple.getArch()) {
-  default:
-    return TargetTriple.str();
-  case llvm::Triple::x86_64:
-    return "x86_64-managarm-" + TargetTriple.getEnvironmentName().str();
-  case llvm::Triple::aarch64:
-    return "aarch64-managarm-" + TargetTriple.getEnvironmentName().str();
-  case llvm::Triple::riscv64:
-    return "riscv64-managarm-" + TargetTriple.getEnvironmentName().str();
-  }
-}
-
-static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) {
-  // It happens that only x86, PPC and SPARC use the 'lib32' variant of
-  // oslibdir, and using that variant while targeting other architectures causes
-  // problems because the libraries are laid out in shared system roots that
-  // can't cope with a 'lib32' library search path being considered. So we only
-  // enable them when we know we may need it.
-  //
-  // FIXME: This is a bit of a hack. We should really unify this code for
-  // reasoning about oslibdir spellings with the lib dir spellings in the
-  // GCCInstallationDetector, but that is a more significant refactoring.
-  if (Triple.getArch() == llvm::Triple::x86 || Triple.isPPC32() ||
-      Triple.getArch() == llvm::Triple::sparc)
-    return "lib32";
-
-  if (Triple.getArch() == llvm::Triple::x86_64 && Triple.isX32())
-    return "libx32";
-
-  if (Triple.getArch() == llvm::Triple::riscv32)
-    return "lib32";
-
-  return Triple.isArch32Bit() ? "lib" : "lib64";
-}
-
-Managarm::Managarm(const Driver &D, const llvm::Triple &Triple,
-                   const ArgList &Args)
-    : Generic_ELF(D, Triple, Args) {
-  GCCInstallation.init(Triple, Args);
-  Multilibs = GCCInstallation.getMultilibs();
-  SelectedMultilibs.assign({GCCInstallation.getMultilib()});
-  std::string SysRoot = computeSysRoot();
-
-  ToolChain::path_list &PPaths = getProgramPaths();
-
-  Generic_GCC::PushPPaths(PPaths);
-
-#ifdef ENABLE_LINKER_BUILD_ID
-  ExtraOpts.push_back("--build-id");
-#endif
-
-  // The selection of paths to try here is designed to match the patterns which
-  // the GCC driver itself uses, as this is part of the GCC-compatible driver.
-  // This was determined by running GCC in a fake filesystem, creating all
-  // possible permutations of these directories, and seeing which ones it added
-  // to the link paths.
-  path_list &Paths = getFilePaths();
-
-  const std::string OSLibDir = std::string(getOSLibDir(Triple, Args));
-  const std::string MultiarchTriple = getMultiarchTriple(D, Triple, SysRoot);
-
-  Generic_GCC::AddMultilibPaths(D, SysRoot, OSLibDir, MultiarchTriple, Paths);
-
-  addPathIfExists(D, concat(SysRoot, "/lib", MultiarchTriple), Paths);
-  addPathIfExists(D, concat(SysRoot, "/lib/..", OSLibDir), Paths);
-  addPathIfExists(D, concat(SysRoot, "/usr/lib", MultiarchTriple), Paths);
-  addPathIfExists(D, concat(SysRoot, "/usr", OSLibDir), Paths);
-
-  Generic_GCC::AddMultiarchPaths(D, SysRoot, OSLibDir, Paths);
-
-  addPathIfExists(D, concat(SysRoot, "/lib"), Paths);
-  addPathIfExists(D, concat(SysRoot, "/usr/lib"), Paths);
-}
-
-bool Managarm::HasNativeLLVMSupport() const { return true; }
-
-Tool *Managarm::buildLinker() const {
-  return new tools::gnutools::Linker(*this);
-}
-
-Tool *Managarm::buildAssembler() const {
-  return new tools::gnutools::Assembler(*this);
-}
-
-std::string Managarm::computeSysRoot() const {
-  if (!getDriver().SysRoot.empty())
-    return getDriver().SysRoot;
-  return std::string();
-}
-
-std::string Managarm::getDynamicLinker(const ArgList &Args) const {
-  switch (getTriple().getArch()) {
-  case llvm::Triple::aarch64:
-    return "/lib/aarch64-managarm/ld.so";
-  case llvm::Triple::riscv64: {
-    StringRef ABIName = tools::riscv::getRISCVABI(Args, getTriple());
-    return ("/lib/riscv64-managarm/ld-riscv64-" + ABIName + ".so").str();
-  }
-  case llvm::Triple::x86_64:
-    return "/lib/x86_64-managarm/ld.so";
-  default:
-    llvm_unreachable("unsupported architecture");
-  }
-}
-
-void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
-                                         ArgStringList &CC1Args) const {
-  const Driver &D = getDriver();
-  std::string SysRoot = computeSysRoot();
-
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
-    return;
-
-  if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
-    addSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/local/include");
-
-  // Add 'include' in the resource directory, which is similar to
-  // GCC_INCLUDE_DIR (private headers) in GCC.
-  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
-    SmallString<128> ResourceDirInclude(D.ResourceDir);
-    llvm::sys::path::append(ResourceDirInclude, "include");
-    addSystemInclude(DriverArgs, CC1Args, ResourceDirInclude);
-  }
-
-  if (DriverArgs.hasArg(options::OPT_nostdlibinc))
-    return;
-
-  // TOOL_INCLUDE_DIR
-  AddMultilibIncludeArgs(DriverArgs, CC1Args);
-
-  // Check for configure-time C include directories.
-  StringRef CIncludeDirs(C_INCLUDE_DIRS);
-  if (CIncludeDirs != "") {
-    SmallVector<StringRef, 5> dirs;
-    CIncludeDirs.split(dirs, ":");
-    for (StringRef dir : dirs) {
-      StringRef Prefix =
-          llvm::sys::path::is_absolute(dir) ? StringRef(SysRoot) : "";
-      addExternCSystemInclude(DriverArgs, CC1Args, Prefix + dir);
-    }
-    return;
-  }
-
-  // On systems using multiarch, add /usr/include/$triple before
-  // /usr/include.
-  std::string MultiarchIncludeDir = getMultiarchTriple(D, getTriple(), SysRoot);
-  if (!MultiarchIncludeDir.empty())
-    addExternCSystemInclude(
-        DriverArgs, CC1Args,
-        concat(SysRoot, "/usr/include", MultiarchIncludeDir));
-
-  // Add an include of '/include' directly. This isn't provided by default by
-  // system GCCs, but is often used with cross-compiling GCCs, and harmless to
-  // add even when Clang is acting as-if it were a system compiler.
-  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/include"));
-
-  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/usr/include"));
-}
-
-void Managarm::addLibStdCxxIncludePaths(
-    const llvm::opt::ArgList &DriverArgs,
-    llvm::opt::ArgStringList &CC1Args) const {
-  // We need a detected GCC installation on Managarm to provide libstdc++'s
-  // headers.
-  if (!GCCInstallation.isValid())
-    return;
-
-  StringRef TripleStr = GCCInstallation.getTriple().str();
-
-  // Try generic GCC detection.
-  Generic_GCC::addGCCLibStdCxxIncludePaths(DriverArgs, CC1Args, TripleStr);
-}
-
-SanitizerMask Managarm::getSupportedSanitizers() const {
-  const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64;
-  SanitizerMask Res = ToolChain::getSupportedSanitizers();
-  Res |= SanitizerKind::PointerCompare;
-  Res |= SanitizerKind::PointerSubtract;
-  Res |= SanitizerKind::KernelAddress;
-  Res |= SanitizerKind::Vptr;
-  if (IsX86_64)
-    Res |= SanitizerKind::KernelMemory;
-  return Res;
-}
-
-void Managarm::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
-  for (const auto &Opt : ExtraOpts)
-    CmdArgs.push_back(Opt.c_str());
-}
diff --git a/clang/lib/Driver/ToolChains/Managarm.h b/clang/lib/Driver/ToolChains/Managarm.h
deleted file mode 100644
index 2082e2c615f2..000000000000
--- a/clang/lib/Driver/ToolChains/Managarm.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//===--- Managarm.h - Managarm ToolChain Implementations --------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
-#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
-
-#include "Gnu.h"
-#include "clang/Driver/ToolChain.h"
-
-namespace clang {
-namespace driver {
-namespace toolchains {
-
-class LLVM_LIBRARY_VISIBILITY Managarm : public Generic_ELF {
-public:
-  Managarm(const Driver &D, const llvm::Triple &Triple,
-           const llvm::opt::ArgList &Args);
-
-  bool HasNativeLLVMSupport() const override;
-
-  std::string getMultiarchTriple(const Driver &D,
-                                 const llvm::Triple &TargetTriple,
-                                 StringRef SysRoot) const override;
-
-  void
-  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                            llvm::opt::ArgStringList &CC1Args) const override;
-  void
-  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
-                           llvm::opt::ArgStringList &CC1Args) const override;
-  SanitizerMask getSupportedSanitizers() const override;
-  std::string computeSysRoot() const override;
-
-  std::string getDynamicLinker(const llvm::opt::ArgList &Args) const override;
-
-  void addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const override;
-
-  std::vector<std::string> ExtraOpts;
-
-protected:
-  Tool *buildAssembler() const override;
-  Tool *buildLinker() const override;
-};
-
-} // end namespace toolchains
-} // end namespace driver
-} // end namespace clang
-
-#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp
index 3e22b4001bde..641e3beebc08 100644
--- a/clang/lib/Lex/InitHeaderSearch.cpp
+++ b/clang/lib/Lex/InitHeaderSearch.cpp
@@ -221,7 +221,6 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths(
   case llvm::Triple::Hurd:
   case llvm::Triple::Linux:
   case llvm::Triple::LiteOS:
-  case llvm::Triple::Managarm:
   case llvm::Triple::NaCl:
   case llvm::Triple::NetBSD:
   case llvm::Triple::OpenBSD:
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/managarm.cpp b/clang/test/Driver/managarm.cpp
deleted file mode 100644
index 5afa17aadb6d..000000000000
--- a/clang/test/Driver/managarm.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-// UNSUPPORTED: system-windows
-
-// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-X86-64 %s
-// CHECK-X86-64:      "-cc1"
-// CHECK-X86-64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
-// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-X86-64-SAME: "-internal-externc-isystem"
-// CHECK-X86-64-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
-// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-X86-64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
-// CHECK-X86-64-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
-// CHECK-X86-64-SAME: "-L
-// CHECK-X86-64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-X86-64-LIBS %s
-// CHECK-X86-64-LIBS:      "-cc1"
-// CHECK-X86-64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-X86-64-LIBS-SAME: "-internal-externc-isystem"
-// CHECK-X86-64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-X86-64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-LIBS-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
-// CHECK-X86-64-LIBS-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
-// CHECK-X86-64-LIBS-SAME: "-L
-// CHECK-X86-64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-X86-64-STATIC %s
-// CHECK-X86-64-STATIC:      "-cc1"
-// CHECK-X86-64-STATIC-SAME: "-static-define"
-// CHECK-X86-64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-X86-64-STATIC-SAME: "-internal-externc-isystem"
-// CHECK-X86-64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-X86-64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-STATIC-SAME: "-static"
-// CHECK-X86-64-STATIC-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o"
-// CHECK-X86-64-STATIC-SAME: "-L
-// CHECK-X86-64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-X86-64-SHARED %s
-// CHECK-X86-64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-SHARED-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o"
-// CHECK-X86-64-SHARED-SAME: "-L
-// CHECK-X86-64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-AARCH64 %s
-// CHECK-AARCH64:      "-cc1"
-// CHECK-AARCH64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
-// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-AARCH64-SAME: "-internal-externc-isystem"
-// CHECK-AARCH64-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-AARCH64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-SAME: "-m" "aarch64managarm"
-// CHECK-AARCH64-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
-// CHECK-AARCH64-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
-// CHECK-AARCH64-SAME: {{^}} "-L
-// CHECK-AARCH64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-LIBS %s
-// CHECK-AARCH64-LIBS:      "-cc1"
-// CHECK-AARCH64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-AARCH64-LIBS-SAME: "-internal-externc-isystem"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-AARCH64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-LIBS-SAME: "-m" "aarch64managarm"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
-// CHECK-AARCH64-LIBS-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L
-// CHECK-AARCH64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-STATIC %s
-// CHECK-AARCH64-STATIC:      "-cc1"
-// CHECK-AARCH64-STATIC-SAME: "-static-define"
-// CHECK-AARCH64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-AARCH64-STATIC-SAME: "-internal-externc-isystem"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-AARCH64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-STATIC-SAME: "-m" "aarch64managarm"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-static"
-// CHECK-AARCH64-STATIC-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L
-// CHECK-AARCH64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-SHARED %s
-// CHECK-AARCH64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-SHARED-SAME: "-m" "aarch64managarm"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L
-// CHECK-AARCH64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-RISCV64 %s
-// CHECK-RISCV64:      "-cc1"
-// CHECK-RISCV64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
-// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-RISCV64-SAME: "-internal-externc-isystem"
-// CHECK-RISCV64-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-RISCV64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
-// CHECK-RISCV64-SAME: "-L
-// CHECK-RISCV64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-LIBS %s
-// CHECK-RISCV64-LIBS:      "-cc1"
-// CHECK-RISCV64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-RISCV64-LIBS-SAME: "-internal-externc-isystem"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-RISCV64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-LIBS-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
-// CHECK-RISCV64-LIBS-SAME: "-L
-// CHECK-RISCV64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-STATIC %s
-// CHECK-RISCV64-STATIC:      "-cc1"
-// CHECK-RISCV64-STATIC-SAME: "-static-define"
-// CHECK-RISCV64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-RISCV64-STATIC-SAME: "-internal-externc-isystem"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-RISCV64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-STATIC-SAME: "-static"
-// CHECK-RISCV64-STATIC-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o"
-// CHECK-RISCV64-STATIC-SAME: "-L
-// CHECK-RISCV64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-SHARED %s
-// CHECK-RISCV64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-SHARED-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o"
-// CHECK-RISCV64-SHARED-SAME: "-L
-// CHECK-RISCV64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index bed39dc3e34d..031a6c1a755b 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1622,11 +1622,6 @@
 // RUN: %clang_cc1 -x c -std=c99 -E -dM -ffreestanding -triple=amd64-unknown-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD-STDC-N %s
 // OPENBSD-STDC-N-NOT:#define __STDC_NO_THREADS__ 1
 //
-// RUN: %clang_cc1 -triple=aarch64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
-// RUN: %clang_cc1 -triple=riscv64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
-// RUN: %clang_cc1 -triple=x86_64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
-// MANAGARM: #define __managarm__ 1
-
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=xcore-none-none < /dev/null | FileCheck -match-full-lines -check-prefix XCORE %s
 // XCORE:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
 // XCORE:#define __LITTLE_ENDIAN__ 1
diff --git a/clang/test/Preprocessor/predefined-macros-no-warnings.c b/clang/test/Preprocessor/predefined-macros-no-warnings.c
index fe27ed8814ee..4e3e29ccfa8a 100644
--- a/clang/test/Preprocessor/predefined-macros-no-warnings.c
+++ b/clang/test/Preprocessor/predefined-macros-no-warnings.c
@@ -14,7 +14,6 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux-openhos
-// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-netbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-openbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-win32-gnu
@@ -109,7 +108,6 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux-openhos
-// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-solaris
@@ -169,7 +167,6 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-nacl
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps4
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps5
-// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir64
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv32

From 5f841a6284900026929edcbe8d2b98ce813e0bbc Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Tue, 17 Jun 2025 07:41:20 -0500
Subject: [PATCH 0632/1322] [flang][OpenMP] Set _OPENMP macro for version 6.0
 (#144410)

---
 flang/include/flang/Support/OpenMP-features.h    | 3 +++
 flang/test/Driver/flang-openmp-version-macro.f90 | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/flang/include/flang/Support/OpenMP-features.h b/flang/include/flang/Support/OpenMP-features.h
index 1dd7ea560cc9..349cd19c1224 100644
--- a/flang/include/flang/Support/OpenMP-features.h
+++ b/flang/include/flang/Support/OpenMP-features.h
@@ -42,6 +42,9 @@ void setOpenMPMacro(int version, FortranPredefinitions &predefinitions) {
   case 52:
     predefinitions.emplace_back("_OPENMP", "202111");
     break;
+  case 60:
+    predefinitions.emplace_back("_OPENMP", "202411");
+    break;
   case 11:
   default:
     predefinitions.emplace_back("_OPENMP", "199911");
diff --git a/flang/test/Driver/flang-openmp-version-macro.f90 b/flang/test/Driver/flang-openmp-version-macro.f90
index 95b3071544d0..f690ab381948 100644
--- a/flang/test/Driver/flang-openmp-version-macro.f90
+++ b/flang/test/Driver/flang-openmp-version-macro.f90
@@ -2,7 +2,6 @@
 
 ! RUN: %flang_fc1 -fopenmp -cpp -E %s | FileCheck %s --check-prefix=DEFAULT-OPENMP-VERSION
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=11 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-11
-! RUN: %flang_fc1 -fopenmp -fopenmp-version=11 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-11
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=20 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-20
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=25 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-25
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=30 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-30
@@ -12,6 +11,7 @@
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=50 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-50
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=51 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-51
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-52
+! RUN: %flang_fc1 -fopenmp -fopenmp-version=60 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-60
 
 ! DEFAULT-OPENMP-VERSION: integer :: var1 = 201107
 ! OPENMP-VERSION-11: integer :: var1 = 199911
@@ -24,6 +24,7 @@
 ! OPENMP-VERSION-50: integer :: var1 = 201811
 ! OPENMP-VERSION-51: integer :: var1 = 202011
 ! OPENMP-VERSION-52: integer :: var1 = 202111
+! OPENMP-VERSION-60: integer :: var1 = 202411
 
 #if _OPENMP
   integer :: var1 = _OPENMP

From b91936aeffb798b7deb67aff7bc5c84acea5452e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 21:55:57 +0900
Subject: [PATCH 0633/1322] AMDGPU: Combine nnan fminimum/fmaximum to
 fminnum_ieee/fmaxnum_ieee (#142217)

This improves codegen for gfx950, where fminimum/fmaximum are
legal through fminimum3/fmaximum3, so may have an additional
encoding cost.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |  11 ++
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 141 +++++-----------
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 165 ++++++-------------
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 141 +++++-----------
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 165 ++++++-------------
 5 files changed, 203 insertions(+), 420 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 30535ae88f7b..0ced3a6ba9bc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13880,6 +13880,17 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
       return Res;
   }
 
+  // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
+  // for some types, but at a higher cost since it's implemented with a 3
+  // operand form.
+  const SDNodeFlags Flags = N->getFlags();
+  if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
+      !Subtarget->hasIEEEMinMax() && Flags.hasNoNaNs()) {
+    unsigned NewOpc =
+        Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+    return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index a56c92785d48..92a2f54841ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -113,17 +113,11 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -270,17 +264,11 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -771,17 +759,11 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v2f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -939,17 +921,11 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v2f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1296,19 +1272,12 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v3f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -1501,19 +1470,12 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v3f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1741,19 +1703,12 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v4f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -1981,19 +1936,12 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v4f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -2788,4 +2736,3 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
-; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index 826bf427503a..6c4f13a4eab8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -85,17 +85,11 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -199,17 +193,11 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -568,19 +556,12 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v2f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -704,19 +685,12 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v2f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -971,21 +945,13 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v3f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX900-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
-; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1131,21 +1097,13 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v3f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX900-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
-; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1310,23 +1268,14 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v4f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX900-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX900-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v4, v4
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v5, v5
-; GFX950-NEXT:    v_maximum3_f32 v2, v2, v6, v6
-; GFX950-NEXT:    v_maximum3_f32 v3, v3, v7, v7
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1493,23 +1442,14 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v4f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX900-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX900-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v4, v4
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v5, v5
-; GFX950-NEXT:    v_maximum3_f32 v2, v2, v6, v6
-; GFX950-NEXT:    v_maximum3_f32 v3, v3, v7, v7
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -2051,4 +1991,3 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
-; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 3dcc70b0ea3b..9e82b41bb958 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -91,17 +91,11 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -225,17 +219,11 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -646,17 +634,11 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v2f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v2f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v2f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -779,17 +761,11 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v2f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v2f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v2f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1062,19 +1038,12 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v3f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v3f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v3f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -1220,19 +1189,12 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v3f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v3f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v3f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1401,19 +1363,12 @@ define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v4f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v4f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v4f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -1582,19 +1537,12 @@ define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v4f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v4f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v4f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -2207,4 +2155,3 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
-; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 021579546732..8adbe861fe6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -85,17 +85,11 @@ define float @v_minimum_f32__nnan(float %src0, float %src1) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -199,17 +193,11 @@ define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -568,19 +556,12 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v2f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v2f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v2f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -704,19 +685,12 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v2f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v2f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -971,21 +945,13 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v3f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX900-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v3f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
-; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v3f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1131,21 +1097,13 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v3f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX900-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v3f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
-; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1310,23 +1268,14 @@ define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v4f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX900-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX900-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v4f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v4, v4
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v5, v5
-; GFX950-NEXT:    v_minimum3_f32 v2, v2, v6, v6
-; GFX950-NEXT:    v_minimum3_f32 v3, v3, v7, v7
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v4f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1493,23 +1442,14 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v4f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX900-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX900-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v4f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v4, v4
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v5, v5
-; GFX950-NEXT:    v_minimum3_f32 v2, v2, v6, v6
-; GFX950-NEXT:    v_minimum3_f32 v3, v3, v7, v7
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -2051,4 +1991,3 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
-; GFX9: {{.*}}

From b4e39e4ff923334a8a1fdcc6d92b01d3885a01f2 Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail@gmail.com>
Date: Tue, 17 Jun 2025 15:03:37 +0200
Subject: [PATCH 0634/1322] [LLVM] [Support] Query the terminal width using
 `ioctl()` (#143514)

On unix systems, we were trying to determine the terminal width using
the `COULMNS` environment variable. Unfortunately, `COLUMNS` is not
exported by all shells and thus not available on some systems.

We were previously using `ioctl()` for this; fall back to doing so if `COLUMNS`
does not exist or does not store a positive integer.

This essentially reverts a3eb3d3d92d037fe3c9deaad87f6fc42fe9ea766 and
parts of https://reviews.llvm.org/D61326.

For more information, see #139499.

Fixes #139499.
---
 llvm/cmake/config-ix.cmake              |  5 +++++
 llvm/include/llvm/Config/config.h.cmake |  3 +++
 llvm/lib/Support/Unix/Process.inc       | 24 ++++++++++++++++++------
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 9895469973e4..0fcd73e75231 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -19,6 +19,7 @@ if (ANDROID OR CYGWIN OR CMAKE_SYSTEM_NAME MATCHES "AIX|DragonFly|FreeBSD|Haiku|
   set(HAVE_SYS_MMAN_H 1)
   set(HAVE_SYSEXITS_H 1)
   set(HAVE_UNISTD_H 1)
+  set(HAVE_SYS_IOCTL_H 1)
 elseif (APPLE)
   set(HAVE_MACH_MACH_H 1)
   set(HAVE_MALLOC_MALLOC_H 1)
@@ -26,6 +27,7 @@ elseif (APPLE)
   set(HAVE_SYS_MMAN_H 1)
   set(HAVE_SYSEXITS_H 1)
   set(HAVE_UNISTD_H 1)
+  set(HAVE_SYS_IOCTL_H 1)
 elseif (WIN32)
   set(HAVE_MACH_MACH_H 0)
   set(HAVE_MALLOC_MALLOC_H 0)
@@ -33,6 +35,7 @@ elseif (WIN32)
   set(HAVE_SYS_MMAN_H 0)
   set(HAVE_SYSEXITS_H 0)
   set(HAVE_UNISTD_H 0)
+  set(HAVE_SYS_IOCTL_H 0)
 elseif (ZOS)
   # Confirmed in
   # https://github.com/llvm/llvm-project/pull/104706#issuecomment-2297109613
@@ -42,6 +45,7 @@ elseif (ZOS)
   set(HAVE_SYS_MMAN_H 1)
   set(HAVE_SYSEXITS_H 0)
   set(HAVE_UNISTD_H 1)
+  set(HAVE_SYS_IOCTL_H 1)
 else()
   # Other platforms that we don't promise support for.
   check_include_file(mach/mach.h HAVE_MACH_MACH_H)
@@ -50,6 +54,7 @@ else()
   check_include_file(sys/mman.h HAVE_SYS_MMAN_H)
   check_include_file(sysexits.h HAVE_SYSEXITS_H)
   check_include_file(unistd.h HAVE_UNISTD_H)
+  check_include_file(sys/ioctl.h HAVE_SYS_IOCTL_H)
 endif()
 
 if( UNIX AND NOT (APPLE OR BEOS OR HAIKU) )
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 06d475639791..ce83de8e4cba 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -164,6 +164,9 @@
 /* Define to 1 if you have the <sys/mman.h> header file. */
 #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H}
 
+/* Define to 1 if you have the <sys/ioctl.h> header file. */
+#cmakedefine HAVE_SYS_IOCTL_H ${HAVE_SYS_IOCTL_H}
+
 /* Define to 1 if stat struct has st_mtimespec member .*/
 #cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC}
 
diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index b5c3719f5796..db735b7484ad 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -34,6 +34,9 @@
 #ifdef HAVE_GETAUXVAL
 #include <sys/auxv.h>
 #endif
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
 
 //===----------------------------------------------------------------------===//
 //=== WARNING: Implementation here must contain only generic UNIX code that
@@ -304,31 +307,40 @@ bool Process::FileDescriptorIsDisplayed(int fd) {
 #endif
 }
 
-static unsigned getColumns() {
+static unsigned getColumns(int FileID) {
   // If COLUMNS is defined in the environment, wrap to that many columns.
+  // This matches GCC.
   if (const char *ColumnsStr = std::getenv("COLUMNS")) {
     int Columns = std::atoi(ColumnsStr);
     if (Columns > 0)
       return Columns;
   }
 
-  // We used to call ioctl TIOCGWINSZ to determine the width. It is considered
-  // unuseful.
-  return 0;
+  // Some shells do not export COLUMNS; query the column count via ioctl()
+  // instead if it isn't available.
+  unsigned Columns = 0;
+
+#ifdef HAVE_SYS_IOCTL_H
+  struct winsize ws;
+  if (ioctl(FileID, TIOCGWINSZ, &ws) == 0)
+    Columns = ws.ws_col;
+#endif
+
+  return Columns;
 }
 
 unsigned Process::StandardOutColumns() {
   if (!StandardOutIsDisplayed())
     return 0;
 
-  return getColumns();
+  return getColumns(0);
 }
 
 unsigned Process::StandardErrColumns() {
   if (!StandardErrIsDisplayed())
     return 0;
 
-  return getColumns();
+  return getColumns(1);
 }
 
 static bool terminalHasColors() {

From 3451cd5d206f29df5b6ab5c200b7b8b17f3f2e3f Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 15:03:48 +0200
Subject: [PATCH 0635/1322] [PowerPC] Regenerate MIR test checks (NFC)

---
 .../PowerPC/aix-vector-vararg-caller.ll       | 227 +++++++++---------
 1 file changed, 114 insertions(+), 113 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll
index 472be4fa6364..4697a093e5d6 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll
@@ -8,123 +8,124 @@
 ; RUN: FileCheck --check-prefix=64BIT %s
 
 define <4 x i32> @caller() {
+
   ; 32BIT-LABEL: name: caller
   ; 32BIT: bb.0.entry:
-  ; 32BIT:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT:   [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI:%[0-9]+]]:gprc = LI 48
-  ; 32BIT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI1:%[0-9]+]]:gprc = LI 32
-  ; 32BIT:   STXVW4X killed [[LXVW4X1]], $r1, killed [[LI1]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc2:%[0-9]+]]:gprc = LWZtoc %const.2, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc2]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI2:%[0-9]+]]:gprc = LI 160
-  ; 32BIT:   STXVW4X killed [[LXVW4X2]], $r1, killed [[LI2]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc3:%[0-9]+]]:gprc = LWZtoc %const.3, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc3]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI3:%[0-9]+]]:gprc = LI 144
-  ; 32BIT:   STXVW4X killed [[LXVW4X3]], $r1, killed [[LI3]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc4:%[0-9]+]]:gprc = LWZtoc %const.4, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc4]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI4:%[0-9]+]]:gprc = LI 128
-  ; 32BIT:   STXVW4X killed [[LXVW4X4]], $r1, killed [[LI4]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc5:%[0-9]+]]:gprc = LWZtoc %const.5, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc5]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI5:%[0-9]+]]:gprc = LI 112
-  ; 32BIT:   STXVW4X killed [[LXVW4X5]], $r1, killed [[LI5]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc6:%[0-9]+]]:gprc = LWZtoc %const.6, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc6]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI6:%[0-9]+]]:gprc = LI 96
-  ; 32BIT:   STXVW4X killed [[LXVW4X6]], $r1, killed [[LI6]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc7:%[0-9]+]]:gprc = LWZtoc %const.7, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc7]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI7:%[0-9]+]]:gprc = LI 80
-  ; 32BIT:   STXVW4X killed [[LXVW4X7]], $r1, killed [[LI7]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc8:%[0-9]+]]:gprc = LWZtoc %const.8, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc8]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI8:%[0-9]+]]:gprc = LI 64
-  ; 32BIT:   STXVW4X killed [[LXVW4X8]], $r1, killed [[LI8]] :: (store (s128))
-  ; 32BIT:   [[LWZ:%[0-9]+]]:gprc = LWZ 52, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ1:%[0-9]+]]:gprc = LWZ 48, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ2:%[0-9]+]]:gprc = LWZ 44, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ3:%[0-9]+]]:gprc = LWZ 40, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ4:%[0-9]+]]:gprc = LWZ 36, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ5:%[0-9]+]]:gprc = LWZ 32, $r1 :: (load (s32))
-  ; 32BIT:   [[LI9:%[0-9]+]]:gprc = LI 9
-  ; 32BIT:   $r3 = COPY [[LI9]]
-  ; 32BIT:   $r5 = COPY [[LWZ5]]
-  ; 32BIT:   $r6 = COPY [[LWZ4]]
-  ; 32BIT:   $r7 = COPY [[LWZ3]]
-  ; 32BIT:   $r8 = COPY [[LWZ2]]
-  ; 32BIT:   $r9 = COPY [[LWZ1]]
-  ; 32BIT:   $r10 = COPY [[LWZ]]
-  ; 32BIT:   BL_NOP <mcsymbol .callee[PR]>, csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def $v2
-  ; 32BIT:   ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
-  ; 32BIT:   $v2 = COPY [[COPY]]
-  ; 32BIT:   BLR implicit $lr, implicit $rm, implicit $v2
-
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI:%[0-9]+]]:gprc = LI 48
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI1:%[0-9]+]]:gprc = LI 32
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X1]], $r1, killed [[LI1]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc2:%[0-9]+]]:gprc = LWZtoc %const.2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc2]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI2:%[0-9]+]]:gprc = LI 160
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X2]], $r1, killed [[LI2]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc3:%[0-9]+]]:gprc = LWZtoc %const.3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc3]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI3:%[0-9]+]]:gprc = LI 144
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X3]], $r1, killed [[LI3]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc4:%[0-9]+]]:gprc = LWZtoc %const.4, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc4]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI4:%[0-9]+]]:gprc = LI 128
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X4]], $r1, killed [[LI4]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc5:%[0-9]+]]:gprc = LWZtoc %const.5, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc5]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI5:%[0-9]+]]:gprc = LI 112
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X5]], $r1, killed [[LI5]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc6:%[0-9]+]]:gprc = LWZtoc %const.6, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc6]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI6:%[0-9]+]]:gprc = LI 96
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X6]], $r1, killed [[LI6]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc7:%[0-9]+]]:gprc = LWZtoc %const.7, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc7]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI7:%[0-9]+]]:gprc = LI 80
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X7]], $r1, killed [[LI7]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc8:%[0-9]+]]:gprc = LWZtoc %const.8, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc8]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI8:%[0-9]+]]:gprc = LI 64
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X8]], $r1, killed [[LI8]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZ:%[0-9]+]]:gprc = LWZ 52, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ1:%[0-9]+]]:gprc = LWZ 48, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ2:%[0-9]+]]:gprc = LWZ 44, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ3:%[0-9]+]]:gprc = LWZ 40, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ4:%[0-9]+]]:gprc = LWZ 36, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ5:%[0-9]+]]:gprc = LWZ 32, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LI9:%[0-9]+]]:gprc = LI 9
+  ; 32BIT-NEXT:   $r3 = COPY [[LI9]]
+  ; 32BIT-NEXT:   $r5 = COPY [[LWZ5]]
+  ; 32BIT-NEXT:   $r6 = COPY [[LWZ4]]
+  ; 32BIT-NEXT:   $r7 = COPY [[LWZ3]]
+  ; 32BIT-NEXT:   $r8 = COPY [[LWZ2]]
+  ; 32BIT-NEXT:   $r9 = COPY [[LWZ1]]
+  ; 32BIT-NEXT:   $r10 = COPY [[LWZ]]
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .callee[PR]>, csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def $v2
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
+  ; 32BIT-NEXT:   $v2 = COPY [[COPY]]
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $v2
+  ;
   ; 64BIT-LABEL: name: caller
   ; 64BIT: bb.0.entry:
-  ; 64BIT:   ADJCALLSTACKDOWN 208, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT:   [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_:%[0-9]+]]:g8rc = LI8 96
-  ; 64BIT:   STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_1:%[0-9]+]]:g8rc = LI8 80
-  ; 64BIT:   STXVW4X killed [[LXVW4X1]], $x1, killed [[LI8_1]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT2:%[0-9]+]]:g8rc = LDtocCPT %const.2, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT2]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_2:%[0-9]+]]:g8rc = LI8 64
-  ; 64BIT:   STXVW4X killed [[LXVW4X2]], $x1, killed [[LI8_2]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT3:%[0-9]+]]:g8rc = LDtocCPT %const.3, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT3]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_3:%[0-9]+]]:g8rc = LI8 192
-  ; 64BIT:   STXVW4X killed [[LXVW4X3]], $x1, killed [[LI8_3]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT4:%[0-9]+]]:g8rc = LDtocCPT %const.4, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT4]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_4:%[0-9]+]]:g8rc = LI8 176
-  ; 64BIT:   STXVW4X killed [[LXVW4X4]], $x1, killed [[LI8_4]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT5:%[0-9]+]]:g8rc = LDtocCPT %const.5, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT5]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_5:%[0-9]+]]:g8rc = LI8 160
-  ; 64BIT:   STXVW4X killed [[LXVW4X5]], $x1, killed [[LI8_5]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT6:%[0-9]+]]:g8rc = LDtocCPT %const.6, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT6]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_6:%[0-9]+]]:g8rc = LI8 144
-  ; 64BIT:   STXVW4X killed [[LXVW4X6]], $x1, killed [[LI8_6]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT7:%[0-9]+]]:g8rc = LDtocCPT %const.7, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT7]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_7:%[0-9]+]]:g8rc = LI8 128
-  ; 64BIT:   STXVW4X killed [[LXVW4X7]], $x1, killed [[LI8_7]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT8:%[0-9]+]]:g8rc = LDtocCPT %const.8, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT8]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_8:%[0-9]+]]:g8rc = LI8 112
-  ; 64BIT:   STXVW4X killed [[LXVW4X8]], $x1, killed [[LI8_8]] :: (store (s128))
-  ; 64BIT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))
-  ; 64BIT:   [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64))
-  ; 64BIT:   [[LD2:%[0-9]+]]:g8rc = LD 88, $x1 :: (load (s64))
-  ; 64BIT:   [[LD3:%[0-9]+]]:g8rc = LD 80, $x1 :: (load (s64))
-  ; 64BIT:   [[LD4:%[0-9]+]]:g8rc = LD 72, $x1 :: (load (s64))
-  ; 64BIT:   [[LD5:%[0-9]+]]:g8rc = LD 64, $x1 :: (load (s64))
-  ; 64BIT:   [[LI8_9:%[0-9]+]]:g8rc = LI8 9
-  ; 64BIT:   $x3 = COPY [[LI8_9]]
-  ; 64BIT:   $x5 = COPY [[LD5]]
-  ; 64BIT:   $x6 = COPY [[LD4]]
-  ; 64BIT:   $x7 = COPY [[LD3]]
-  ; 64BIT:   $x8 = COPY [[LD2]]
-  ; 64BIT:   $x9 = COPY [[LD1]]
-  ; 64BIT:   $x10 = COPY [[LD]]
-  ; 64BIT:   BL8_NOP <mcsymbol .callee[PR]>, csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def $v2
-  ; 64BIT:   ADJCALLSTACKUP 208, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
-  ; 64BIT:   $v2 = COPY [[COPY]]
-  ; 64BIT:   BLR8 implicit $lr8, implicit $rm, implicit $v2
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 208, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_:%[0-9]+]]:g8rc = LI8 96
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_1:%[0-9]+]]:g8rc = LI8 80
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X1]], $x1, killed [[LI8_1]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT2:%[0-9]+]]:g8rc = LDtocCPT %const.2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT2]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_2:%[0-9]+]]:g8rc = LI8 64
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X2]], $x1, killed [[LI8_2]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT3:%[0-9]+]]:g8rc = LDtocCPT %const.3, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT3]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_3:%[0-9]+]]:g8rc = LI8 192
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X3]], $x1, killed [[LI8_3]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT4:%[0-9]+]]:g8rc = LDtocCPT %const.4, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT4]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_4:%[0-9]+]]:g8rc = LI8 176
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X4]], $x1, killed [[LI8_4]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT5:%[0-9]+]]:g8rc = LDtocCPT %const.5, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT5]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_5:%[0-9]+]]:g8rc = LI8 160
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X5]], $x1, killed [[LI8_5]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT6:%[0-9]+]]:g8rc = LDtocCPT %const.6, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT6]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_6:%[0-9]+]]:g8rc = LI8 144
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X6]], $x1, killed [[LI8_6]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT7:%[0-9]+]]:g8rc = LDtocCPT %const.7, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT7]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_7:%[0-9]+]]:g8rc = LI8 128
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X7]], $x1, killed [[LI8_7]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT8:%[0-9]+]]:g8rc = LDtocCPT %const.8, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT8]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_8:%[0-9]+]]:g8rc = LI8 112
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X8]], $x1, killed [[LI8_8]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD2:%[0-9]+]]:g8rc = LD 88, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD3:%[0-9]+]]:g8rc = LD 80, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD4:%[0-9]+]]:g8rc = LD 72, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD5:%[0-9]+]]:g8rc = LD 64, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LI8_9:%[0-9]+]]:g8rc = LI8 9
+  ; 64BIT-NEXT:   $x3 = COPY [[LI8_9]]
+  ; 64BIT-NEXT:   $x5 = COPY [[LD5]]
+  ; 64BIT-NEXT:   $x6 = COPY [[LD4]]
+  ; 64BIT-NEXT:   $x7 = COPY [[LD3]]
+  ; 64BIT-NEXT:   $x8 = COPY [[LD2]]
+  ; 64BIT-NEXT:   $x9 = COPY [[LD1]]
+  ; 64BIT-NEXT:   $x10 = COPY [[LD]]
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .callee[PR]>, csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def $v2
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 208, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
+  ; 64BIT-NEXT:   $v2 = COPY [[COPY]]
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $v2
   entry:
     %call = tail call <4 x i32> (i32, ...) @callee(i32 9, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>, <4 x i32> <i32 12, i32 13, i32 14, i32 15>, <4 x i32> <i32 16, i32 17, i32 18, i32 19>, <4 x i32> <i32 20, i32 21, i32 22, i32 23>, <4 x i32> <i32 24, i32 25, i32 26, i32 27>, <4 x i32> <i32 28, i32 29, i32 30, i32 31>, <4 x i32> <i32 32, i32 33, i32 34, i32 35>)
       ret <4 x i32> %call

From 76ea1db1746db254716aafbc992b637cd10c6ea3 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 15:16:24 +0200
Subject: [PATCH 0636/1322] [PowerPC] Split test into assembly and MIR variants
 (NFC)

So that both can be generated.
---
 llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll |  588 ++++++++
 llvm/test/CodeGen/PowerPC/aix-cc-byval.ll     | 1301 +++++++----------
 2 files changed, 1080 insertions(+), 809 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll

diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll
new file mode 100644
index 000000000000..67800df6ed4b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll
@@ -0,0 +1,588 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \
+; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefixes=32BIT %s
+
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \
+; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefixes=64BIT %s
+
+%struct.S0 = type {}
+
+%struct.S1 = type { [1 x i8] }
+@gS1 = external global %struct.S1, align 1
+
+define void @call_test_byval_1Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_1Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = LBZ 0, killed renamable $r3 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r3 = RLWINM killed renamable $r3, 24, 0, 7
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_1Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_1Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x3 = LBZ8 0, killed renamable $x3 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x3 = RLDICR killed renamable $x3, 56, 7
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_1Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %s0 = alloca %struct.S0, align 8
+  %call = call zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 @gS1)
+  ret void
+}
+
+define zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 %s) {
+  ; 32BIT-LABEL: name: test_byval_1Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r4 = COPY $r3
+  ; 32BIT-NEXT:   renamable $r3 = RLWINM $r3, 8, 24, 31
+  ; 32BIT-NEXT:   STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_1Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $x4 = COPY $x3
+  ; 64BIT-NEXT:   renamable $x3 = RLDICL $x3, 8, 56
+  ; 64BIT-NEXT:   STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %0 = load i8, ptr %s, align 1
+  ret i8 %0
+}
+
+@f = common global float 0.000000e+00, align 4
+
+%struct.S2 = type { [2 x i8] }
+
+@gS2 = external global %struct.S2, align 1
+
+define void @call_test_byval_2Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_2Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = LHZ 0, killed renamable $r3 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r5 = RLWINM killed renamable $r3, 16, 0, 15
+  ; 32BIT-NEXT:   $r3 = LI 42
+  ; 32BIT-NEXT:   $f2 = COPY renamable $f1
+  ; 32BIT-NEXT:   $r7 = LI 43
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_2Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r5, implicit killed $f2, implicit killed $r7, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_2Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x3 = LHZ8 0, killed renamable $x3 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x5 = RLDICR killed renamable $x3, 48, 15
+  ; 64BIT-NEXT:   $x3 = LI8 42
+  ; 64BIT-NEXT:   $f2 = COPY renamable $f1
+  ; 64BIT-NEXT:   $x7 = LI8 43
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_2Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x5, implicit killed $f2, implicit killed $x7, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f, align 4
+  %call = call zeroext i8 @test_byval_2Byte(i32 signext 42, float %0, ptr byval(%struct.S2) align 1 @gS2, float %0, i32 signext 43)
+  ret void
+}
+
+define zeroext i8 @test_byval_2Byte(i32, float, ptr byval(%struct.S2) align 1 %s, float, i32) {
+  ; 32BIT-LABEL: name: test_byval_2Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r5
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
+  ; 32BIT-NEXT:   renamable $r3 = LBZ 1, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_2Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x5
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x5, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+  ; 64BIT-NEXT:   renamable $x3 = LBZ8 1, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %arrayidx = getelementptr inbounds %struct.S2, ptr %s, i32 0, i32 0, i32 1
+  %4 = load i8, ptr %arrayidx, align 1
+  ret i8 %4
+}
+
+%struct.S3 = type <{ i8, i16 }>
+@gS3 = external global %struct.S3, align 1
+
+define void @call_test_byval_3Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_3Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 60, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LI 42
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @gS3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LBZ 2, renamable $r4 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r4 = LHZ 0, killed renamable $r4 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r10 = RLWINM killed renamable $r3, 8, 16, 23
+  ; 32BIT-NEXT:   renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r4, 16, 0, 15
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 2
+  ; 32BIT-NEXT:   $r5 = LI 3
+  ; 32BIT-NEXT:   $r6 = LI 4
+  ; 32BIT-NEXT:   $r7 = LI 5
+  ; 32BIT-NEXT:   $r8 = LI 6
+  ; 32BIT-NEXT:   $r9 = LI 7
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_3Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 60, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_3Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LI8 42
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @gS3, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   STD killed renamable $x3, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   renamable $x3 = LBZ8 2, renamable $x4 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x4 = LHZ8 0, killed renamable $x4 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x10 = RLDIC killed renamable $x3, 40, 16
+  ; 64BIT-NEXT:   renamable $x10 = RLDIMI killed renamable $x10, killed renamable $x4, 48, 0
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   $x7 = LI8 5
+  ; 64BIT-NEXT:   $x8 = LI8 6
+  ; 64BIT-NEXT:   $x9 = LI8 7
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_3Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i16 @test_byval_3Byte(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, ptr byval(%struct.S3) align 1 @gS3, i32 42)
+  ret void
+}
+
+define zeroext i16 @test_byval_3Byte(i32, i32, i32, i32, i32, i32, i32, ptr byval(%struct.S3) align 1 %s, i32) {
+  ; 32BIT-LABEL: name: test_byval_3Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW killed renamable $r10, 0, %fixed-stack.1 :: (store (s32) into %fixed-stack.1)
+  ; 32BIT-NEXT:   renamable $r3 = LHZ 1, %fixed-stack.1 :: (dereferenceable load (s16) from %ir.gep, align 1)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_3Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x10
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x10, 0, %fixed-stack.1 :: (store (s64) into %fixed-stack.1)
+  ; 64BIT-NEXT:   renamable $x3 = LHZ8 1, %fixed-stack.1 :: (dereferenceable load (s16) from %ir.gep, align 1)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %gep = getelementptr inbounds %struct.S3, ptr %s, i32 0, i32 1
+  %8 = load i16, ptr %gep, align 1
+  ret i16 %8
+}
+
+%struct.S4 = type { [4 x i8] }
+%struct.S4A = type { i32 }
+
+@gS4 = external global %struct.S4, align 1
+
+define void @call_test_byval_4Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_4Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS4, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8)
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_4Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_4Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS4, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x3 = LWZ8 0, killed renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x4 = LWZ8 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8)
+  ; 64BIT-NEXT:   renamable $x3 = RLDICR killed renamable $x3, 32, 31
+  ; 64BIT-NEXT:   renamable $x4 = RLDICR killed renamable $x4, 32, 31
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_4Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %s0 = alloca %struct.S0, align 8
+  %s4a = alloca %struct.S4A, align 8
+  %call = call signext i32 @test_byval_4Byte(ptr byval(%struct.S4) align 1 @gS4, ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S4A) align 4 %s4a)
+  ret void
+}
+
+define signext i32 @test_byval_4Byte(ptr byval(%struct.S4) align 1 %s, ptr byval(%struct.S0) align 1, ptr byval(%struct.S4A) align 4 %s4a) {
+  ; 32BIT-LABEL: name: test_byval_4Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW renamable $r3, 0, %fixed-stack.2 :: (store (s32) into %fixed-stack.2, align 8)
+  ; 32BIT-NEXT:   renamable $r3 = RLWINM killed renamable $r3, 0, 24, 31
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r3
+  ; 32BIT-NEXT:   STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_4Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %fixed-stack.2 :: (store (s64) into %fixed-stack.2, align 16)
+  ; 64BIT-NEXT:   STD renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0)
+  ; 64BIT-NEXT:   renamable $r3 = LBZ 3, %fixed-stack.2 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 64BIT-NEXT:   renamable $x4 = RLDICL killed renamable $x4, 32, 32
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r3, implicit killed $x4
+  ; 64BIT-NEXT:   renamable $x3 = EXTSW_32_64 killed renamable $r3
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %arrayidx = getelementptr inbounds %struct.S4, ptr %s, i32 0, i32 0, i32 3
+  %1 = load i8, ptr %arrayidx, align 1
+  %2 = load i32, ptr %s4a, align 4
+  %conv = zext i8 %1 to i32
+  %add = add nsw i32 %2, %conv
+  ret i32 %add
+}
+
+%struct.S5 = type { [5 x i8] }
+
+@gS5 = external global %struct.S5, align 1
+
+define void @call_test_byval_5Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_5Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS5, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LBZ 4, renamable $r3 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 24, 0, 7
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_5Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_5Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS5, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LBZ8 4, renamable $x3 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x5 = LWZ8 0, killed renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x3 = RLWINM8 killed renamable $x4, 24, 0, 7
+  ; 64BIT-NEXT:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x5, 32, 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_5Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1 @gS5)
+  ret void
+}
+
+declare zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1)
+
+%struct.S6 = type { [6 x i8] }
+
+@gS6 = external global %struct.S6, align 1
+
+define void @call_test_byval_6Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_6Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS6, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LHZ 4, renamable $r3 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 16, 0, 15
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_6Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_6Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS6, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LHZ8 4, renamable $x3 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x5 = LWZ8 0, killed renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x3 = RLWINM8 killed renamable $x4, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x5, 32, 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_6Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1 @gS6)
+  ret void
+}
+
+declare zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1)
+
+%struct.S7 = type { [7 x i8] }
+
+@gS7 = external global %struct.S7, align 1
+
+define void @call_test_byval_7Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_7Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS7, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r5 = LHZ 4, renamable $r3 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r4 = LBZ 6, renamable $r3 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 8, 16, 23
+  ; 32BIT-NEXT:   renamable $r4 = RLWIMI killed renamable $r4, killed renamable $r5, 16, 0, 15
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_7Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_7Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS7, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LHZ8 4, renamable $x3 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x5 = LBZ8 6, renamable $x3 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x6 = LWZ8 0, killed renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x3 = RLWINM8 killed renamable $x5, 8, 16, 23
+  ; 64BIT-NEXT:   renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x4, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x6, 32, 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_7Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1 @gS7)
+  ret void
+}
+
+declare zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1)
+
+%struct.S8 = type { [8 x i8] }
+
+@gS8 = external global %struct.S8, align 1
+
+define void @call_test_byval_8Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_8Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS8, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 4, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_8Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_8Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS8, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x3 = LD 0, killed renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_8Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1 @gS8)
+  ret void
+}
+
+declare zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1)
+
+%struct.S32 = type { [32 x i8] }
+
+@gS32 = external global %struct.S32, align 1
+
+define void @call_test_byval_32Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_32Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS32, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r10 = LWZ 28, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r9 = LWZ 24, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r8 = LWZ 20, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r7 = LWZ 16, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r6 = LWZ 12, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 8, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 4, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_32Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_32Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS32, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x6 = LD 24, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x5 = LD 16, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x4 = LD 8, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x3 = LD 0, killed renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_32Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 @gS32)
+  ret void
+}
+
+define zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 %s) {
+  ; 32BIT-LABEL: name: test_byval_32Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW killed renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+  ; 32BIT-NEXT:   STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12)
+  ; 32BIT-NEXT:   STW killed renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 8)
+  ; 32BIT-NEXT:   renamable $r3 = LBZ 21, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 32BIT-NEXT:   STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_32Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+  ; 64BIT-NEXT:   renamable $x3 = LBZ8 21, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 64BIT-NEXT:   STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
+  ; 64BIT-NEXT:   STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %arrayidx = getelementptr inbounds %struct.S32, ptr %s, i32 0, i32 0, i32 21
+  %0 = load i8, ptr %arrayidx, align 1
+  ret i8 %0
+}
+
+; The ByVal handling produces dead stores. See `LowerFormalArguments_AIX` for
+; details on why.
+
+%struct.S31 = type <{ float, i32, i64, double, i32, i16, i8 }>
+
+@gS31 = external global %struct.S31, align 1
+
+define void @call_test_byval_31Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_31Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS31, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r11 = LHZ 28, renamable $r3 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r10 = LBZ 30, renamable $r3 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r9 = LWZ 24, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r8 = LWZ 20, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r7 = LWZ 16, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r6 = LWZ 12, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 8, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 4, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r10 = RLWINM killed renamable $r10, 8, 16, 23
+  ; 32BIT-NEXT:   renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r11, 16, 0, 15
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_31Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $f1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_31Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS31, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x7 = LHZ8 28, renamable $x3 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x6 = LBZ8 30, renamable $x3 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x8 = LWZ8 24, renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x5 = LD 16, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x4 = LD 8, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x6 = RLWINM8 killed renamable $x6, 8, 16, 23
+  ; 64BIT-NEXT:   renamable $x3 = LD 0, killed renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x6 = RLWIMI8 killed renamable $x6, killed renamable $x7, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x6 = RLDIMI killed renamable $x6, killed renamable $x8, 32, 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_31Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1, implicit-def dead $f1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call double @test_byval_31Byte(ptr byval(%struct.S31) align 1 @gS31)
+  ret void
+}
+
+define double @test_byval_31Byte(ptr byval(%struct.S31) align 1 %s) {
+  ; 32BIT-LABEL: name: test_byval_31Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW killed renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20)
+  ; 32BIT-NEXT:   STW killed renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+  ; 32BIT-NEXT:   STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12)
+  ; 32BIT-NEXT:   renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64) from %ir.gep)
+  ; 32BIT-NEXT:   STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $f1
+  ;
+  ; 64BIT-LABEL: name: test_byval_31Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+  ; 64BIT-NEXT:   renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64) from %ir.gep, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
+  ; 64BIT-NEXT:   STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $f1
+entry:
+  %gep = getelementptr inbounds %struct.S31, ptr %s, i32 0, i32 3
+  %load = load double, ptr %gep, align 1
+  ret double %load
+}
+
+%struct.F = type { float, float, float }
+
+define i32 @call_test_byval_homogeneous_float_struct() {
+  ; 32BIT-LABEL: name: call_test_byval_homogeneous_float_struct
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LI 0
+  ; 32BIT-NEXT:   STW renamable $r3, 8, %stack.0.s :: (store (s32) into %ir.s + 8, align 8)
+  ; 32BIT-NEXT:   STW renamable $r3, 4, %stack.0.s :: (store (s32) into %ir.s + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 0, %stack.0.s :: (store (s32) into %ir.s, align 8)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 4, %stack.0.s :: (load (s32) from %stack.0.s + 4)
+  ; 32BIT-NEXT:   $r3 = LI 0
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_homogeneous_float_struct
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LI8 0
+  ; 64BIT-NEXT:   STW8 renamable $x3, 8, %stack.0.s :: (store (s32) into %ir.s + 8, align 8)
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %stack.0.s :: (store (s64) into %ir.s)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
+  ; 64BIT-NEXT:   renamable $x4 = RLDICR killed renamable $x3, 32, 31
+  ; 64BIT-NEXT:   $x3 = LI8 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %s = alloca %struct.F, align 8
+  call void @llvm.memset.p0.i32(ptr align 4 %s, i8 0, i32 12, i1 false)
+  %call = call i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4 %s)
+  ret i32 %call
+}
+
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg)
+
+declare i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4)
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll
index 5e7a1bc81916..a06b61fc4533 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll
@@ -1,18 +1,11 @@
-; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \
-; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
-; RUN: FileCheck --check-prefixes=CHECK,32BIT %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \
 ; RUN:  -mtriple powerpc-ibm-aix-xcoff < %s | \
-; RUN: FileCheck --check-prefixes=CHECKASM,ASM32 %s
-
-; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \
-; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
-; RUN: FileCheck --check-prefixes=CHECK,64BIT %s
+; RUN: FileCheck --check-prefixes=32BIT %s
 
 ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \
 ; RUN:  -mtriple powerpc64-ibm-aix-xcoff < %s | \
-; RUN: FileCheck --check-prefixes=CHECKASM,ASM64 %s
+; RUN: FileCheck --check-prefixes=64BIT %s
 
 %struct.S0 = type {}
 
@@ -20,96 +13,60 @@
 @gS1 = external global %struct.S1, align 1
 
 define void @call_test_byval_1Byte() {
+; 32BIT-LABEL: call_test_byval_1Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C0(2) # @gS1
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lbz 3, 0(3)
+; 32BIT-NEXT:    slwi 3, 3, 24
+; 32BIT-NEXT:    bl .test_byval_1Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_1Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -128(1)
+; 64BIT-NEXT:    ld 3, L..C0(2) # @gS1
+; 64BIT-NEXT:    std 0, 144(1)
+; 64BIT-NEXT:    lbz 3, 0(3)
+; 64BIT-NEXT:    sldi 3, 3, 56
+; 64BIT-NEXT:    bl .test_byval_1Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 128
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %s0 = alloca %struct.S0, align 8
   %call = call zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 @gS1)
   ret void
 }
 
-
-; CHECK-LABEL: name: call_test_byval_1Byte{{.*}}
-
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REG:[0-9]+]] = LWZtoc @gS1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT:  renamable $r3 = LBZ 0, killed renamable $r[[REG]] :: (load (s8))
-; 32BIT-NEXT:  renamable $r3 = RLWINM killed renamable $r3, 24, 0, 7
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_1Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_1Byte:
-
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-NEXT:  stw 0, 72(1)
-; ASM32-NEXT:  lbz 3, 0([[REG]])
-; ASM32-NEXT:  slwi 3, 3, 24
-; ASM32-NEXT:  bl .test_byval_1Byte
-; ASM32-NEXT:  nop
-; ASM32-NEXT:  addi 1, 1, 64
-
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REG:[0-9]+]] = LDtoc @gS1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT:  renamable $x3 = LBZ8 0, killed renamable $x[[REG]] :: (load (s8))
-; 64BIT-NEXT:  renamable $x3 = RLDICR killed renamable $x3, 56, 7
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_1Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -128(1)
-; ASM64-NEXT:  ld [[REG:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-NEXT:  std 0, 144(1)
-; ASM64-NEXT:  lbz 3, 0([[REG]])
-; ASM64-NEXT:  sldi 3, 3, 56
-; ASM64-NEXT:  bl .test_byval_1Byte
-; ASM64-NEXT:  nop
-; ASM64-NEXT:  addi 1, 1, 128
-
-
 define zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 %s) {
+; 32BIT-LABEL: test_byval_1Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mr 4, 3
+; 32BIT-NEXT:    srwi 3, 3, 24
+; 32BIT-NEXT:    stw 4, 24(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_1Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mr 4, 3
+; 64BIT-NEXT:    rldicl 3, 3, 8, 56
+; 64BIT-NEXT:    std 4, 48(1)
+; 64BIT-NEXT:    blr
 entry:
   %0 = load i8, ptr %s, align 1
   ret i8 %0
 }
 
-; CHECK-LABEL: name:            test_byval_1Byte
-
-; 32BIT:       fixedStack:
-; 32BIT-NEXT:    - { id: 0, type: default, offset: 24, size: 4, alignment: 8, stack-id: default,
-; 32BIT-NEXT:        isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 32BIT:         - { id: 1, type: default, offset: 24, size: 4, alignment: 8, stack-id: default,
-; 32BIT-NEXT:        isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 32BIT:       bb.0.entry:
-; 32BIT-NEXT:    liveins: $r3
-; 32BIT:         renamable $r4 = COPY $r3
-; 32BIT:         renamable $r3 = RLWINM $r3, 8, 24, 31
-; 32BIT:         STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8)
-; 32BIT-NEXT:    BLR
-
-; 64BIT:       fixedStack:
-; 64BIT-NEXT:    - { id: 0, type: default, offset: 48, size: 8, alignment: 16, stack-id: default,
-; 64BIT-NEXT:        isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 64BIT:         - { id: 1, type: default, offset: 48, size: 8, alignment: 16, stack-id: default,
-; 64BIT-NEXT:        isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x3
-; 64BIT:        renamable $x4 = COPY $x3
-; 64BIT:        renamable $x3 = RLDICL $x3, 8, 56
-; 64BIT:        STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
-
-; CHECKASM-LABEL: .test_byval_1Byte:
-
-; ASM32:      mr 4, 3
-; ASM32-NEXT: srwi 3, 3, 24
-; ASM32-NEXT: stw 4, 24(1)
-; ASM32-NEXT: blr
-
-; ASM64:      mr 4, 3
-; ASM64-NEXT: rldicl 3, 3, 8, 56
-; ASM64-NEXT: std 4, 48(1)
-; ASM64-NEXT: blr
-
-
 @f = common global float 0.000000e+00, align 4
 
 %struct.S2 = type { [2 x i8] }
@@ -117,240 +74,184 @@ entry:
 @gS2 = external global %struct.S2, align 1
 
 define void @call_test_byval_2Byte() {
+; 32BIT-LABEL: call_test_byval_2Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C1(2) # @f
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    li 7, 43
+; 32BIT-NEXT:    lfs 1, 0(3)
+; 32BIT-NEXT:    lwz 3, L..C2(2) # @gS2
+; 32BIT-NEXT:    lhz 3, 0(3)
+; 32BIT-NEXT:    fmr 2, 1
+; 32BIT-NEXT:    slwi 5, 3, 16
+; 32BIT-NEXT:    li 3, 42
+; 32BIT-NEXT:    bl .test_byval_2Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_2Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C1(2) # @f
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    li 7, 43
+; 64BIT-NEXT:    lfs 1, 0(3)
+; 64BIT-NEXT:    ld 3, L..C2(2) # @gS2
+; 64BIT-NEXT:    lhz 3, 0(3)
+; 64BIT-NEXT:    fmr 2, 1
+; 64BIT-NEXT:    sldi 5, 3, 48
+; 64BIT-NEXT:    li 3, 42
+; 64BIT-NEXT:    bl .test_byval_2Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %0 = load float, ptr @f, align 4
   %call = call zeroext i8 @test_byval_2Byte(i32 signext 42, float %0, ptr byval(%struct.S2) align 1 @gS2, float %0, i32 signext 43)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_byval_2Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       renamable $r[[REG1:[0-9]+]] = LWZtoc @f, $r2 :: (load (s32) from got)
-; 32BIT-NEXT:  renamable $f1 = LFS 0, killed renamable $r[[REG1]] :: (dereferenceable load (s32) from @f)
-; 32BIT-NEXT:  ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   $r3 = LI 42
-; 32BIT-DAG:   renamable $r[[REG2:[0-9]+]] = LWZtoc @gS2, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REG3:[0-9]+]] = LHZ 0, killed renamable $r[[REG2]] :: (load (s16))
-; 32BIT-DAG:   renamable $r5 = RLWINM killed renamable $r[[REG3]], 16, 0, 15
-; 32BIT-DAG:   $f2 = COPY renamable $f1
-; 32BIT-DAG:   $r7 = LI 43
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_2Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r5, implicit killed $f2, implicit killed $r7, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_2Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-DAG:   li 3, 42
-; ASM32-DAG:   lwz [[REG1:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lfs 1, 0([[REG1]])
-; ASM32-DAG:   lwz [[REG2:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lhz [[REG3:[0-9]+]], 0([[REG2]])
-; ASM32-DAG:   slwi 5, [[REG3]], 16
-; ASM32-DAG:   fmr 2, 1
-; ASM32-DAG:   li 7, 43
-; ASM32-NEXT:  bl .test_byval_2Byte
-; ASM32-NEXT:  nop
-; ASM32-NEXT:  addi 1, 1, 64
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       renamable $x[[REG1:[0-9]+]] = LDtoc @f, $x2 :: (load (s64) from got)
-; 64BIT-NEXT:  renamable $f1 = LFS 0, killed renamable $x[[REG1]] :: (dereferenceable load (s32) from @f)
-; 64BIT-NEXT:  ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   $x3 = LI8 42
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LDtoc @gS2, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG3:[0-9]+]] = LHZ8 0, killed renamable $x[[REG2]] :: (load (s16))
-; 64BIT-DAG:   renamable $x5 = RLDICR killed renamable $x[[REG3]], 48, 15
-; 64BIT-DAG:   $f2 = COPY renamable $f1
-; 64BIT-DAG:   $x7 = LI8 43
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_2Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x5, implicit killed $f2, implicit killed $x7, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -112(1)
-; ASM64-DAG:   std 0, 128(1)
-; ASM64-DAG:   li 3, 42
-; ASM64-DAG:   ld [[REG1:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lfs 1, 0([[REG1]])
-; ASM64-DAG:   ld [[REG2:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lhz [[REG3:[0-9]+]], 0([[REG2]])
-; ASM64-DAG:   sldi 5, [[REG3]], 48
-; ASM64-DAG:   fmr 2, 1
-; ASM64-DAG:   li 7, 43
-; ASM64-NEXT:  bl .test_byval_2Byte
-; ASM64-NEXT:  nop
-; ASM64-NEXT:  addi 1, 1, 112
-
 define zeroext i8 @test_byval_2Byte(i32, float, ptr byval(%struct.S2) align 1 %s, float, i32) {
+; 32BIT-LABEL: test_byval_2Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 5, 32(1)
+; 32BIT-NEXT:    lbz 3, 33(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_2Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 5, 64(1)
+; 64BIT-NEXT:    lbz 3, 65(1)
+; 64BIT-NEXT:    blr
 entry:
   %arrayidx = getelementptr inbounds %struct.S2, ptr %s, i32 0, i32 0, i32 1
   %4 = load i8, ptr %arrayidx, align 1
   ret i8 %4
 }
 
-; CHECK-LABEL: name:            test_byval_2Byte
-; 32BIT:      fixedStack:
-; 32BIT-NEXT:   - { id: 0, type: default, offset: 32, size: 4, alignment: 16, stack-id: default,
-
-; 32BIT:      bb.0.entry:
-; 32BIT-NEXT:   liveins: $r5
-; 32BIT:        STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
-; 32BIT-NEXT:   renamable $r3 = LBZ 1, %fixed-stack.0 :: (dereferenceable load (s8)
-
-; 64BIT:      fixedStack:
-; 64BIT-NEXT:   - { id: 0, type: default, offset: 64, size: 8, alignment: 16, stack-id: default,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x5
-; 64BIT:        STD killed renamable $x5, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
-; 64BIT-NEXT:   renamable $x3 = LBZ8 1, %fixed-stack.0 :: (dereferenceable load (s8)
-
-; CHECKASM-LABEL: .test_byval_2Byte:
-
-; ASM32:        stw 5, 32(1)
-; ASM32-NEXT:   lbz 3, 33(1)
-; ASM32-NEXT:   blr
-
-; ASM64:        std 5, 64(1)
-; ASM64-NEXT:   lbz 3, 65(1)
-; ASM64-NEXT:   blr
-
-
 %struct.S3 = type <{ i8, i16 }>
 @gS3 = external global %struct.S3, align 1
 
 define void @call_test_byval_3Byte() {
+; 32BIT-LABEL: call_test_byval_3Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 4, L..C3(2) # @gS3
+; 32BIT-NEXT:    li 3, 42
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    li 5, 3
+; 32BIT-NEXT:    li 6, 4
+; 32BIT-NEXT:    li 7, 5
+; 32BIT-NEXT:    stw 3, 56(1)
+; 32BIT-NEXT:    li 8, 6
+; 32BIT-NEXT:    li 9, 7
+; 32BIT-NEXT:    lbz 3, 2(4)
+; 32BIT-NEXT:    lhz 4, 0(4)
+; 32BIT-NEXT:    rlwinm 10, 3, 8, 16, 23
+; 32BIT-NEXT:    li 3, 1
+; 32BIT-NEXT:    rlwimi 10, 4, 16, 0, 15
+; 32BIT-NEXT:    li 4, 2
+; 32BIT-NEXT:    bl .test_byval_3Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_3Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -128(1)
+; 64BIT-NEXT:    ld 4, L..C3(2) # @gS3
+; 64BIT-NEXT:    li 3, 42
+; 64BIT-NEXT:    std 0, 144(1)
+; 64BIT-NEXT:    li 5, 3
+; 64BIT-NEXT:    li 6, 4
+; 64BIT-NEXT:    li 7, 5
+; 64BIT-NEXT:    std 3, 112(1)
+; 64BIT-NEXT:    li 8, 6
+; 64BIT-NEXT:    li 9, 7
+; 64BIT-NEXT:    lbz 3, 2(4)
+; 64BIT-NEXT:    lhz 4, 0(4)
+; 64BIT-NEXT:    rldic 10, 3, 40, 16
+; 64BIT-NEXT:    li 3, 1
+; 64BIT-NEXT:    rldimi 10, 4, 48, 0
+; 64BIT-NEXT:    li 4, 2
+; 64BIT-NEXT:    bl .test_byval_3Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 128
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i16 @test_byval_3Byte(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, ptr byval(%struct.S3) align 1 @gS3, i32 42)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_byval_3Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 60, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   $r3 = LI 1
-; 32BIT-DAG:   $r4 = LI 2
-; 32BIT-DAG:   $r5 = LI 3
-; 32BIT-DAG:   $r6 = LI 4
-; 32BIT-DAG:   $r7 = LI 5
-; 32BIT-DAG:   $r8 = LI 6
-; 32BIT-DAG:   $r9 = LI 7
-; 32BIT-DAG:   renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS3, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REG1:[0-9]+]] = LHZ 0, killed renamable $r[[REGADDR]] :: (load (s16))
-; 32BIT-DAG:   renamable $r[[REG2:[0-9]+]] = LBZ 2, renamable $r[[REGADDR]] :: (load (s8))
-; 32BIT-DAG:   renamable $r10 = RLWINM killed renamable $r[[REG2]], 8, 16, 23
-; 32BIT-DAG:   renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r[[REG1]], 16, 0, 15
-; 32BIT-DAG:   renamable $r[[REG3:[0-9]+]] = LI 42
-; 32BIT-DAG:   STW killed renamable $r[[REG3]], 56, $r1 :: (store (s32))
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_3Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $r10, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 60, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_3Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-DAG:   li 3, 1
-; ASM32-DAG:   li 4, 2
-; ASM32-DAG:   li 5, 3
-; ASM32-DAG:   li 6, 4
-; ASM32-DAG:   li 7, 5
-; ASM32-DAG:   li 8, 6
-; ASM32-DAG:   li 9, 7
-; ASM32-DAG:   lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lhz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM32-DAG:   lbz [[REG2:[0-9]+]], 2([[REGADDR]])
-; ASM32-DAG:   rlwinm 10, [[REG2]], 8, 16, 23
-; ASM32-DAG:   rlwimi 10, [[REG1]], 16, 0, 15
-; ASM32-DAG:   li [[REG3:[0-9]+]], 42
-; ASM32-DAG:   stw [[REG3]], 56(1)
-; ASM32-NEXT:  bl .test_byval_3Byte
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   $x3 = LI8 1
-; 64BIT-DAG:   $x4 = LI8 2
-; 64BIT-DAG:   $x5 = LI8 3
-; 64BIT-DAG:   $x6 = LI8 4
-; 64BIT-DAG:   $x7 = LI8 5
-; 64BIT-DAG:   $x8 = LI8 6
-; 64BIT-DAG:   $x9 = LI8 7
-; 64BIT-DAG:   renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS3, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LHZ8 0, killed renamable $x[[REGADDR]] :: (load (s16))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LBZ8 2, renamable $x[[REGADDR]] :: (load (s8))
-; 64BIT-DAG:   renamable $x10 = RLDIC killed renamable $x[[REG2]], 40, 16
-; 64BIT-DAG:   renamable $x10 = RLDIMI killed renamable $x10, killed renamable $x[[REG1]], 48, 0
-; 64BIT-DAG:   $x[[REG3:[0-9]+]] = LI8 42
-; 64BIT-DAG:   STD killed renamable $x[[REG3]], 112, $x1 :: (store (s64))
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_3Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $x10, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -128(1)
-; ASM64-DAG:   li 3, 1
-; ASM64-DAG:   li 4, 2
-; ASM64-DAG:   li 5, 3
-; ASM64-DAG:   li 6, 4
-; ASM64-DAG:   li 7, 5
-; ASM64-DAG:   li 8, 6
-; ASM64-DAG:   li 9, 7
-; ASM64-DAG:   ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lhz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lbz [[REG2:[0-9]+]], 2([[REGADDR]])
-; ASM64-DAG:   rldic 10, [[REG2]], 40, 16
-; ASM64-DAG:   rldimi 10, [[REG1]], 48, 0
-; ASM64-DAG:   li [[REG3:[0-9]+]], 42
-; ASM64-DAG:   std [[REG3]], 112(1)
-; ASM64-NEXT:  bl .test_byval_3Byte
-; ASM64-NEXT:  nop
-
-
 define zeroext i16 @test_byval_3Byte(i32, i32, i32, i32, i32, i32, i32, ptr byval(%struct.S3) align 1 %s, i32) {
+; 32BIT-LABEL: test_byval_3Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 10, 52(1)
+; 32BIT-NEXT:    lhz 3, 53(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_3Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 10, 104(1)
+; 64BIT-NEXT:    lhz 3, 105(1)
+; 64BIT-NEXT:    blr
 entry:
   %gep = getelementptr inbounds %struct.S3, ptr %s, i32 0, i32 1
   %8 = load i16, ptr %gep, align 1
   ret i16 %8
 }
 
-; CHECK-LABEL: name:            test_byval_3Byte
-
-; 32BIT:       fixedStack:
-; 32BIT-NEXT:    - { id: 0, type: default, offset: 56, size: 4, alignment: 8, stack-id: default,
-; 32BIT:         - { id: 1, type: default, offset: 52, size: 4, alignment: 4, stack-id: default,
-
-; 32BIT-LABEL: bb.0.entry:
-; 32BIT-NEXT:    liveins: $r10
-; 32BIT:         STW killed renamable $r10, 0, %fixed-stack.1 :: (store (s32) into %fixed-stack.1)
-; 32BIT-NEXT:    renamable $r3 = LHZ 1, %fixed-stack.1 :: (dereferenceable load (s16)
-
-; 64BIT:       fixedStack:
-; 64BIT-NEXT:     - { id: 0, type: default, offset: 116, size: 4, alignment: 4, stack-id: default,
-; 64BIT:          - { id: 1, type: default, offset: 104, size: 8, alignment: 8, stack-id: default,
-
-; 64BIT-LABEL: bb.0.entry:
-; 64BIT-NEXT:    liveins: $x10
-; 64BIT:         STD killed renamable $x10, 0, %fixed-stack.1 :: (store (s64) into %fixed-stack.1)
-; 64BIT-NEXT:    renamable $x3 = LHZ8 1, %fixed-stack.1 :: (dereferenceable load (s16)
-
-; CHECKASM-LABEL: .test_byval_3Byte:
-
-; ASM32:        stw 10, 52(1)
-; ASM32-NEXT:   lhz 3, 53(1)
-; ASM32-NEXT:   blr
-
-; ASM64:        std 10, 104(1)
-; ASM64-NEXT:   lhz 3, 105(1)
-; ASM64-NEXT:   blr
-
-
 %struct.S4 = type { [4 x i8] }
 %struct.S4A = type { i32 }
 
 @gS4 = external global %struct.S4, align 1
 
 define void @call_test_byval_4Byte() {
+; 32BIT-LABEL: call_test_byval_4Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -80(1)
+; 32BIT-NEXT:    lwz 3, L..C4(2) # @gS4
+; 32BIT-NEXT:    stw 0, 88(1)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    lwz 4, 64(1)
+; 32BIT-NEXT:    bl .test_byval_4Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 80
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_4Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -128(1)
+; 64BIT-NEXT:    ld 3, L..C4(2) # @gS4
+; 64BIT-NEXT:    std 0, 144(1)
+; 64BIT-NEXT:    lwz 3, 0(3)
+; 64BIT-NEXT:    lwz 4, 112(1)
+; 64BIT-NEXT:    sldi 3, 3, 32
+; 64BIT-NEXT:    sldi 4, 4, 32
+; 64BIT-NEXT:    bl .test_byval_4Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 128
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %s0 = alloca %struct.S0, align 8
   %s4a = alloca %struct.S4A, align 8
@@ -358,46 +259,24 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: name: call_test_byval_4Byte{{.*}}
-
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REG:[0-9]+]] = LWZtoc @gS4, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REG]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = LWZ 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8)
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_4Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3,  implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_4Byte:
-
-; ASM32:       stwu 1, -80(1)
-; ASM32-NEXT:  lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REG]])
-; ASM32-DAG:   lwz 4, 64(1)
-; ASM32-NEXT:  bl .test_byval_4Byte
-; ASM32-NEXT:  nop
-; ASM32-NEXT:  addi 1, 1, 80
-
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS4, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[LD1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[LD2:[0-9]+]] = LWZ8 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8)
-; 64BIT-DAG:   renamable $x3 = RLDICR killed renamable $x[[LD1]], 32, 31
-; 64BIT-DAG:   renamable $x4 = RLDICR killed renamable $x[[LD2]], 32, 31
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_4Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3,  implicit $x4, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -128(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lwz [[LD1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lwz [[LD2:[0-9]+]], 112(1)
-; ASM64-DAG:   sldi 3, [[LD1]], 32
-; ASM64-DAG:   sldi 4, [[LD2]], 32
-; ASM64-NEXT:  bl .test_byval_4Byte
-; ASM64-NEXT:  nop
-; ASM64-NEXT:  addi 1, 1, 128
-
-
 define signext i32 @test_byval_4Byte(ptr byval(%struct.S4) align 1 %s, ptr byval(%struct.S0) align 1, ptr byval(%struct.S4A) align 4 %s4a) {
+; 32BIT-LABEL: test_byval_4Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 3, 24(1)
+; 32BIT-NEXT:    clrlwi 3, 3, 24
+; 32BIT-NEXT:    add 3, 4, 3
+; 32BIT-NEXT:    stw 4, 28(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_4Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 3, 48(1)
+; 64BIT-NEXT:    lbz 3, 51(1)
+; 64BIT-NEXT:    std 4, 56(1)
+; 64BIT-NEXT:    rldicl 4, 4, 32, 32
+; 64BIT-NEXT:    add 3, 4, 3
+; 64BIT-NEXT:    extsw 3, 3
+; 64BIT-NEXT:    blr
 entry:
   %arrayidx = getelementptr inbounds %struct.S4, ptr %s, i32 0, i32 0, i32 3
   %1 = load i8, ptr %arrayidx, align 1
@@ -407,64 +286,43 @@ entry:
   ret i32 %add
 }
 
-; CHECK-LABEL: name:            test_byval_4Byte
-
-; 32BIT:      fixedStack:
-; 32BIT-NEXT:   - { id: 0, type: default, offset: 28, size: 4, alignment: 4, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 32BIT:        - { id: 1, type: default, offset: 28, size: 4, alignment: 4, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 32BIT:        - { id: 2, type: default, offset: 24, size: 4, alignment: 8, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 32BIT:      bb.0.entry:
-; 32BIT-NEXT:   liveins: $r3
-; 32BIT:        STW renamable $r3, 0, %fixed-stack.2 :: (store (s32) into %fixed-stack.2, align 8)
-; 32BIT-DAG:    STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0)
-; 32BIT-DAG:    renamable $r[[SCRATCH:[0-9]+]] = RLWINM killed renamable $r3, 0, 24, 31
-; 32BIT-DAG:    renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r[[SCRATCH]]
-; 32BIT:        BLR
-
-; 64BIT:      fixedStack:
-; 64BIT-NEXT: - { id: 0, type: default, offset: 56, size: 8, alignment: 8, stack-id: default,
-; 64BIT-NEXT:     isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 64BIT:      - { id: 1, type: default, offset: 56, size: 8, alignment: 8, stack-id: default,
-; 64BIT-NEXT:     isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 64BIT:      - { id: 2, type: default, offset: 48, size: 8, alignment: 16, stack-id: default,
-; 64BIT-NEXT:     isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x3
-; 64BIT:        STD killed renamable $x3, 0, %fixed-stack.2 :: (store (s64) into %fixed-stack.2, align 16)
-; 64BIT:        STD renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0)
-; 64BIT-DAG:    renamable $r[[SCRATCH1:[0-9]+]] = LBZ 3, %fixed-stack.2 :: (dereferenceable load (s8)
-; 64BIT-DAG:    renamable $x[[SCRATCH2:[0-9]+]] = RLDICL killed renamable $x4, 32, 32
-; 64BIT-NEXT:   renamable $r[[SCRATCH3:[0-9]+]] = nsw ADD4 renamable $r[[SCRATCH2]], killed renamable $r[[SCRATCH1]], implicit killed $x[[SCRATCH2]]
-; 64BIT-NEXT:   renamable $x3 = EXTSW_32_64 killed renamable $r[[SCRATCH3]]
-; 64BIT-NEXT:   BLR8
-
-; CHECKASM-LABEL: .test_byval_4Byte:
-
-; ASM32:        stw 3, 24(1)
-; ASM32-DAG:    stw 4, 28(1)
-; ASM32-DAG:    clrlwi  [[SCRATCH:[0-9]+]], 3, 24
-; ASM32-DAG:    add 3, 4, [[SCRATCH]]
-; ASM32-NEXT:   blr
-
-; ASM64:        std 3, 48(1)
-; ASM64-NEXT:   lbz [[SCRATCH1:[0-9]+]], 51(1)
-; ASM64-NEXT:   std 4, 56(1)
-; ASM64-NEXT:   rldicl [[SCRATCH2:[0-9]+]], 4, 32, 32
-; ASM64-NEXT:   add [[SCRATCH3:[0-9]+]], [[SCRATCH2]], [[SCRATCH1]]
-; ASM64-NEXT:   extsw 3, [[SCRATCH3]]
-; ASM64-NEXT:   blr
-
-
 %struct.S5 = type { [5 x i8] }
 
 @gS5 = external global %struct.S5, align 1
 
 define void @call_test_byval_5Byte() {
+; 32BIT-LABEL: call_test_byval_5Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C5(2) # @gS5
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lbz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    slwi 4, 4, 24
+; 32BIT-NEXT:    bl .test_byval_5Byte[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_5Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C5(2) # @gS5
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    lbz 4, 4(3)
+; 64BIT-NEXT:    lwz 5, 0(3)
+; 64BIT-NEXT:    rlwinm 3, 4, 24, 0, 7
+; 64BIT-NEXT:    rldimi 3, 5, 32, 0
+; 64BIT-NEXT:    bl .test_byval_5Byte[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1 @gS5)
   ret void
@@ -472,54 +330,43 @@ entry:
 
 declare zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1)
 
-; CHECK-LABEL: name: call_test_byval_5Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS5, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REG1:[0-9]+]] = LBZ 4, renamable $r[[REGADDR]] :: (load (s8))
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = RLWINM killed renamable $r[[REG1]], 24, 0, 7
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_5Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_5Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lbz [[REG1:[0-9]+]], 4([[REGADDR]])
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   slwi 4, [[REG1]], 24
-; ASM32-NEXT:  bl .test_byval_5Byte[PR]
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS5, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LBZ8 4, renamable $x[[REGADDR]] :: (load (s8))
-; 64BIT-DAG:   renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 24, 0, 7
-; 64BIT-DAG:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_5Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lwz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lbz [[REG2:[0-9]+]], 4([[REGADDR]])
-; ASM64-DAG:   rlwinm 3, [[REG2]], 24, 0, 7
-; ASM64-DAG:   rldimi 3, [[REG1]], 32, 0
-; ASM64-NEXT:  bl .test_byval_5Byte[PR]
-; ASM64-NEXT:  nop
-
-
 %struct.S6 = type { [6 x i8] }
 
 @gS6 = external global %struct.S6, align 1
 
 define void @call_test_byval_6Byte() {
+; 32BIT-LABEL: call_test_byval_6Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C6(2) # @gS6
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lhz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    slwi 4, 4, 16
+; 32BIT-NEXT:    bl .test_byval_6Byte[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_6Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C6(2) # @gS6
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    lhz 4, 4(3)
+; 64BIT-NEXT:    lwz 5, 0(3)
+; 64BIT-NEXT:    rlwinm 3, 4, 16, 0, 15
+; 64BIT-NEXT:    rldimi 3, 5, 32, 0
+; 64BIT-NEXT:    bl .test_byval_6Byte[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1 @gS6)
   ret void
@@ -527,54 +374,47 @@ entry:
 
 declare zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1)
 
-; CHECK-LABEL: name: call_test_byval_6Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS6, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REG1:[0-9]+]] = LHZ 4, renamable $r[[REGADDR]] :: (load (s16))
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = RLWINM killed renamable $r[[REG1]], 16, 0, 15
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_6Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_6Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lhz [[REG1:[0-9]+]], 4([[REGADDR]])
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   slwi 4, [[REG1]], 16
-; ASM32-NEXT:  bl .test_byval_6Byte[PR]
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS6, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load (s16))
-; 64BIT-DAG:   renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 16, 0, 15
-; 64BIT-DAG:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_6Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lwz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lhz [[REG2:[0-9]+]], 4([[REGADDR]])
-; ASM64-DAG:   rlwinm 3, [[REG2]], 16, 0, 15
-; ASM64-DAG:   rldimi 3, [[REG1]], 32, 0
-; ASM64-NEXT:  bl .test_byval_6Byte[PR]
-; ASM64-NEXT:  nop
-
-
 %struct.S7 = type { [7 x i8] }
 
 @gS7 = external global %struct.S7, align 1
 
 define void @call_test_byval_7Byte() {
+; 32BIT-LABEL: call_test_byval_7Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C7(2) # @gS7
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lbz 4, 6(3)
+; 32BIT-NEXT:    lhz 5, 4(3)
+; 32BIT-NEXT:    rlwinm 4, 4, 8, 16, 23
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    rlwimi 4, 5, 16, 0, 15
+; 32BIT-NEXT:    bl .test_byval_7Byte[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_7Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C7(2) # @gS7
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    lbz 5, 6(3)
+; 64BIT-NEXT:    lhz 4, 4(3)
+; 64BIT-NEXT:    lwz 6, 0(3)
+; 64BIT-NEXT:    rlwinm 3, 5, 8, 16, 23
+; 64BIT-NEXT:    rlwimi 3, 4, 16, 0, 15
+; 64BIT-NEXT:    rldimi 3, 6, 32, 0
+; 64BIT-NEXT:    bl .test_byval_7Byte[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1 @gS7)
   ret void
@@ -582,62 +422,39 @@ entry:
 
 declare zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1)
 
-; CHECK-LABEL: name: call_test_byval_7Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS7, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r[[REG1:[0-9]+]] = LHZ 4, renamable $r[[REGADDR]] :: (load (s16))
-; 32BIT-DAG:   renamable $r[[REG2:[0-9]+]] = LBZ 6, renamable $r[[REGADDR]] :: (load (s8))
-; 32BIT-DAG:   renamable $r4 = RLWINM killed renamable $r[[REG2]], 8, 16, 23
-; 32BIT-DAG:   renamable $r4 = RLWIMI killed renamable $r4, killed renamable $r[[REG1]], 16, 0, 15
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_7Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_7Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   lhz [[REG1:[0-9]+]], 4([[REGADDR]])
-; ASM32-DAG:   lbz [[REG2:[0-9]+]], 6([[REGADDR]])
-; ASM32-DAG:   rlwinm 4, [[REG2]], 8, 16, 23
-; ASM32-DAG:   rlwimi 4, [[REG1]], 16, 0, 15
-; ASM32-NEXT:  bl .test_byval_7Byte[PR]
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS7, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load (s16))
-; 64BIT-DAG:   renamable $x[[REG3:[0-9]+]] = LBZ8 6, renamable $x[[REGADDR]] :: (load (s8))
-; 64BIT-DAG:   renamable $x3 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23
-; 64BIT-DAG:   renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x[[REG2]], 16, 0, 15
-; 64BIT-DAG:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_7Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lwz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lhz [[REG2:[0-9]+]], 4([[REGADDR]])
-; ASM64-DAG:   lbz [[REG3:[0-9]+]], 6([[REGADDR]])
-; ASM64-DAG:   rlwinm 3, [[REG3]], 8, 16, 23
-; ASM64-DAG:   rlwimi 3, [[REG2]], 16, 0, 15
-; ASM64-DAG:   rldimi 3, [[REG1]], 32, 0
-; ASM64-NEXT:  bl .test_byval_7Byte[PR]
-; ASM64-NEXT:  nop
-
-
 %struct.S8 = type { [8 x i8] }
 
 @gS8 = external global %struct.S8, align 1
 
 define void @call_test_byval_8Byte() {
+; 32BIT-LABEL: call_test_byval_8Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C8(2) # @gS8
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lwz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    bl .test_byval_8Byte[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_8Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C8(2) # @gS8
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    ld 3, 0(3)
+; 64BIT-NEXT:    bl .test_byval_8Byte[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1 @gS8)
   ret void
@@ -645,102 +462,75 @@ entry:
 
 declare zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1)
 
-; CHECK-LABEL: name: call_test_byval_8Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS8, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = LWZ 4, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_8Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_8Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   lwz 4, 4([[REGADDR]])
-; ASM32-NEXT:  bl .test_byval_8Byte[PR]
-; ASM32-NEXT:  nop
-
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS8, $x2 :: (load (s64) from got)
-; 64BIT-NEXT:  renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_8Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-NEXT:  std 0, 128(1)
-; ASM64-NEXT:  ld 3, 0([[REGADDR]])
-; ASM64-NEXT:  bl .test_byval_8Byte[PR]
-; ASM64-NEXT:  nop
-
-
 %struct.S32 = type { [32 x i8] }
 
 @gS32 = external global %struct.S32, align 1
 
 define void @call_test_byval_32Byte() {
+; 32BIT-LABEL: call_test_byval_32Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C9(2) # @gS32
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lwz 10, 28(3)
+; 32BIT-NEXT:    lwz 9, 24(3)
+; 32BIT-NEXT:    lwz 8, 20(3)
+; 32BIT-NEXT:    lwz 7, 16(3)
+; 32BIT-NEXT:    lwz 6, 12(3)
+; 32BIT-NEXT:    lwz 5, 8(3)
+; 32BIT-NEXT:    lwz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    bl .test_byval_32Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_32Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C9(2) # @gS32
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    ld 6, 24(3)
+; 64BIT-NEXT:    ld 5, 16(3)
+; 64BIT-NEXT:    ld 4, 8(3)
+; 64BIT-NEXT:    ld 3, 0(3)
+; 64BIT-NEXT:    bl .test_byval_32Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 @gS32)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_byval_32Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS32, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = LWZ 4, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r5 = LWZ 8, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r6 = LWZ 12, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r7 = LWZ 16, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r8 = LWZ 20, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r9 = LWZ 24, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r10 = LWZ 28, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_32Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_32Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   lwz 4, 4([[REGADDR]])
-; ASM32-DAG:   lwz 5, 8([[REGADDR]])
-; ASM32-DAG:   lwz 6, 12([[REGADDR]])
-; ASM32-DAG:   lwz 7, 16([[REGADDR]])
-; ASM32-DAG:   lwz 8, 20([[REGADDR]])
-; ASM32-DAG:   lwz 9, 24([[REGADDR]])
-; ASM32-DAG:   lwz 10, 28([[REGADDR]])
-; ASM32-NEXT:  bl .test_byval_32Byte
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS32, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x4 = LD 8, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x5 = LD 16, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x6 = LD 24, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_32Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   ld 3, 0([[REGADDR]])
-; ASM64-DAG:   ld 4, 8([[REGADDR]])
-; ASM64-DAG:   ld 5, 16([[REGADDR]])
-; ASM64-DAG:   ld 6, 24([[REGADDR]])
-; ASM64-NEXT:  bl .test_byval_32Byte
-; ASM64-NEXT:  nop
-
 define zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 %s) {
+; 32BIT-LABEL: test_byval_32Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 8, 44(1)
+; 32BIT-NEXT:    stw 3, 24(1)
+; 32BIT-NEXT:    lbz 3, 45(1)
+; 32BIT-NEXT:    stw 4, 28(1)
+; 32BIT-NEXT:    stw 5, 32(1)
+; 32BIT-NEXT:    stw 6, 36(1)
+; 32BIT-NEXT:    stw 7, 40(1)
+; 32BIT-NEXT:    stw 9, 48(1)
+; 32BIT-NEXT:    stw 10, 52(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_32Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 5, 64(1)
+; 64BIT-NEXT:    std 3, 48(1)
+; 64BIT-NEXT:    lbz 3, 69(1)
+; 64BIT-NEXT:    std 4, 56(1)
+; 64BIT-NEXT:    std 6, 72(1)
+; 64BIT-NEXT:    blr
 entry:
   %arrayidx = getelementptr inbounds %struct.S32, ptr %s, i32 0, i32 0, i32 21
   %0 = load i8, ptr %arrayidx, align 1
@@ -750,200 +540,127 @@ entry:
 ; The ByVal handling produces dead stores. See `LowerFormalArguments_AIX` for
 ; details on why.
 
-; CHECK-LABEL: name:            test_byval_32Byte
-
-; 32BIT:      fixedStack:
-; 32BIT-NEXT:   - { id: 0, type: default, offset: 24, size: 32, alignment: 8, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 32BIT:      bb.0.entry:
-; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-; 32BIT:        STW killed renamable $r8,  20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20
-; 32BIT-DAG:    STW killed renamable $r3,   0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0
-; 32BIT-DAG:    STW killed renamable $r4,   4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4
-; 32BIT-DAG:    STW killed renamable $r5,   8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8
-; 32BIT-DAG:    STW killed renamable $r6,  12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12
-; 32BIT-DAG:    STW killed renamable $r7,  16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16
-; 32BIT:        renamable $r3 = LBZ 21, %fixed-stack.0 :: (dereferenceable load (s8)
-; 32BIT-DAG:    STW killed renamable $r9,  24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24
-; 32BIT-DAG:    STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28
-; 32BIT:        BLR
-
-; 64BIT:      fixedStack:
-; 64BIT-NEXT:   - { id: 0, type: default, offset: 48, size: 32, alignment: 16, stack-id: default,
-; 64BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
-; 64BIT:        STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16
-; 64BIT-DAG:    STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0
-; 64BIT-NEXT:   renamable $x3 = LBZ8 21, %fixed-stack.0 :: (dereferenceable load (s8)
-; 64BIT-DAG:    STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8
-; 64BIT-DAG:    STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24
-; 64BIT-NEXT:   BLR8
-
-; CHECKASM-LABEL: .test_byval_32Byte:
-
-; ASM32:       stw 8, 44(1)
-; ASM32:       stw 3, 24(1)
-; ASM32-DAG:   lbz 3, 45(1)
-; ASM32-DAG:   stw 4, 28(1)
-; ASM32-DAG:   stw 5, 32(1)
-; ASM32-DAG:   stw 6, 36(1)
-; ASM32-DAG:   stw 7, 40(1)
-; ASM32-DAG:   stw 9, 48(1)
-; ASM32-DAG:   stw 10, 52(1)
-; ASM32-NEXT:  blr
-
-; ASM64:       std 5, 64(1)
-; ASM64:       std 3, 48(1)
-; ASM64-DAG:   lbz 3, 69(1)
-; ASM64-DAG:   std 4, 56(1)
-; ASM64-DAG:   std 6, 72(1)
-; ASM64-NEXT:  blr
-
 %struct.S31 = type <{ float, i32, i64, double, i32, i16, i8 }>
 
 @gS31 = external global %struct.S31, align 1
 
 define void @call_test_byval_31Byte() {
+; 32BIT-LABEL: call_test_byval_31Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C10(2) # @gS31
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lbz 10, 30(3)
+; 32BIT-NEXT:    lhz 11, 28(3)
+; 32BIT-NEXT:    rlwinm 10, 10, 8, 16, 23
+; 32BIT-NEXT:    lwz 9, 24(3)
+; 32BIT-NEXT:    rlwimi 10, 11, 16, 0, 15
+; 32BIT-NEXT:    lwz 8, 20(3)
+; 32BIT-NEXT:    lwz 7, 16(3)
+; 32BIT-NEXT:    lwz 6, 12(3)
+; 32BIT-NEXT:    lwz 5, 8(3)
+; 32BIT-NEXT:    lwz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    bl .test_byval_31Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_31Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C10(2) # @gS31
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    lbz 6, 30(3)
+; 64BIT-NEXT:    lhz 7, 28(3)
+; 64BIT-NEXT:    rlwinm 6, 6, 8, 16, 23
+; 64BIT-NEXT:    lwz 8, 24(3)
+; 64BIT-NEXT:    rlwimi 6, 7, 16, 0, 15
+; 64BIT-NEXT:    ld 5, 16(3)
+; 64BIT-NEXT:    rldimi 6, 8, 32, 0
+; 64BIT-NEXT:    ld 4, 8(3)
+; 64BIT-NEXT:    ld 3, 0(3)
+; 64BIT-NEXT:    bl .test_byval_31Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call double @test_byval_31Byte(ptr byval(%struct.S31) align 1 @gS31)
   ret void
 }
 
-
-; CHECK-LABEL: name: call_test_byval_31Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS31, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = LWZ 4, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r5 = LWZ 8, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r6 = LWZ 12, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r7 = LWZ 16, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r8 = LWZ 20, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r9 = LWZ 24, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r[[REG:[0-9]+]] = LHZ 28, renamable $r[[REGADDR]] :: (load (s16))
-; 32BIT-DAG:   renamable $r10 = LBZ 30, renamable $r[[REGADDR]] :: (load (s8))
-; 32BIT-DAG:   renamable $r10 = RLWINM killed renamable $r10, 8, 16, 23
-; 32BIT-DAG:   renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r[[REG]], 16, 0, 15
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_31Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_31Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   lwz 4, 4([[REGADDR]])
-; ASM32-DAG:   lwz 5, 8([[REGADDR]])
-; ASM32-DAG:   lwz 6, 12([[REGADDR]])
-; ASM32-DAG:   lwz 7, 16([[REGADDR]])
-; ASM32-DAG:   lwz 8, 20([[REGADDR]])
-; ASM32-DAG:   lwz 9, 24([[REGADDR]])
-; ASM32-DAG:   lbz 10, 30([[REGADDR]])
-; ASM32-DAG:   lhz [[REG:[0-9]+]], 28([[REGADDR]])
-; ASM32-DAG:   rlwinm 10, 10, 8, 16, 23
-; ASM32-DAG:   rlwimi 10, [[REG]], 16, 0, 15
-; ASM32-NEXT:  bl .test_byval_31Byte
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS31, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x4 = LD 8, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x5 = LD 16, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LWZ8 24, renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LHZ8 28, renamable $x[[REGADDR]] :: (load (s16))
-; 64BIT-DAG:   renamable $x[[REG3:[0-9]+]] = LBZ8 30, renamable $x[[REGADDR]] :: (load (s8))
-; 64BIT-DAG:   renamable $x6 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23
-; 64BIT-DAG:   renamable $x6 = RLWIMI8 killed renamable $x6, killed renamable $x[[REG2]], 16, 0, 15
-; 64BIT-DAG:   renamable $x6 = RLDIMI killed renamable $x6, killed renamable $x[[REG1]], 32, 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_31Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   ld 3, 0([[REGADDR]])
-; ASM64-DAG:   ld 4, 8([[REGADDR]])
-; ASM64-DAG:   ld 5, 16([[REGADDR]])
-; ASM64-DAG:   lwz [[REG1:[0-9]+]], 24([[REGADDR]])
-; ASM64-DAG:   lhz [[REG2:[0-9]+]], 28([[REGADDR]])
-; ASM64-DAG:   lbz [[REG3:[0-9]+]], 30([[REGADDR]])
-; ASM64-DAG:   rlwinm 6, [[REG3]], 8, 16, 23
-; ASM64-DAG:   rlwimi 6, [[REG2]], 16, 0, 15
-; ASM64-DAG:   rldimi 6, [[REG1]], 32, 0
-; ASM64-NEXT:  bl .test_byval_31Byte
-; ASM64-NEXT:  nop
-
-
-
 define double @test_byval_31Byte(ptr byval(%struct.S31) align 1 %s) {
+; 32BIT-LABEL: test_byval_31Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 8, 44(1)
+; 32BIT-NEXT:    stw 7, 40(1)
+; 32BIT-NEXT:    lfd 1, 40(1)
+; 32BIT-NEXT:    stw 3, 24(1)
+; 32BIT-NEXT:    stw 4, 28(1)
+; 32BIT-NEXT:    stw 5, 32(1)
+; 32BIT-NEXT:    stw 6, 36(1)
+; 32BIT-NEXT:    stw 9, 48(1)
+; 32BIT-NEXT:    stw 10, 52(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_31Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 5, 64(1)
+; 64BIT-NEXT:    lfd 1, 64(1)
+; 64BIT-NEXT:    std 3, 48(1)
+; 64BIT-NEXT:    std 4, 56(1)
+; 64BIT-NEXT:    std 6, 72(1)
+; 64BIT-NEXT:    blr
 entry:
   %gep = getelementptr inbounds %struct.S31, ptr %s, i32 0, i32 3
   %load = load double, ptr %gep, align 1
   ret double %load
 }
 
-; CHECK-LABEL: name:            test_byval_31Byte
-
-; 32BIT:      fixedStack:
-; 32BIT-NEXT:   - { id: 0, type: default, offset: 24, size: 32, alignment: 8, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 32BIT:      bb.0.entry:
-; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-; 32BIT-DAG:    STW killed renamable $r3,   0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0
-; 32BIT-DAG:    STW killed renamable $r4,   4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4
-; 32BIT-DAG:    STW killed renamable $r5,   8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8
-; 32BIT-DAG:    STW killed renamable $r6,  12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12
-; 32BIT-DAG:    STW killed renamable $r7,  16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16
-; 32BIT-DAG:    STW killed renamable $r8,  20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20
-; 32BIT-NEXT:   renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64)
-; 32BIT-DAG:    STW killed renamable $r9,  24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24
-; 32BIT-DAG:    STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28
-; 32BIT-NEXT:   BLR
-
-; 64BIT:      fixedStack:
-; 64BIT-NEXT:   - { id: 0, type: default, offset: 48, size: 32, alignment: 16, stack-id: default,
-; 64BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
-; 64BIT-DAG:    STD killed renamable $x3,  0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0
-; 64BIT-DAG:    STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16
-; 64BIT-NEXT:   renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64)
-; 64BIT-DAG:    STD killed renamable $x4,  8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8
-; 64BIT-DAG:    STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24
-; 64BIT-NEXT:   BLR8
-
-; ASM32-LABEL: .test_byval_31Byte:
-
-; ASM32-DAG:      stw 8, 44(1)
-; ASM32:          stw 7, 40(1)
-; ASM32-DAG:      lfd 1, 40(1)
-; ASM32-DAG:      stw 3, 24(1)
-; ASM32-DAG:      stw 4, 28(1)
-; ASM32-DAG:      stw 5, 32(1)
-; ASM32-DAG:      stw 6, 36(1)
-; ASM32-DAG:      stw 9, 48(1)
-; ASM32-DAG:      stw 10, 52(1)
-; ASM32-NEXT:     blr
-
-; ASM64:          std 5, 64(1)
-; ASM64:          lfd 1, 64(1)
-; ASM64-DAG:      std 3, 48(1)
-; ASM64-DAG:      std 4, 56(1)
-; ASM64-DAG:      std 6, 72(1)
-; ASM64-NEXT:     blr
-
 %struct.F = type { float, float, float }
 
 define i32 @call_test_byval_homogeneous_float_struct() {
+; 32BIT-LABEL: call_test_byval_homogeneous_float_struct:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -80(1)
+; 32BIT-NEXT:    li 3, 0
+; 32BIT-NEXT:    stw 0, 88(1)
+; 32BIT-NEXT:    stw 3, 72(1)
+; 32BIT-NEXT:    stw 3, 68(1)
+; 32BIT-NEXT:    lwz 5, 72(1)
+; 32BIT-NEXT:    lwz 4, 68(1)
+; 32BIT-NEXT:    stw 3, 64(1)
+; 32BIT-NEXT:    bl .test_byval_homogeneous_float_struct[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 80
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_homogeneous_float_struct:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -128(1)
+; 64BIT-NEXT:    li 3, 0
+; 64BIT-NEXT:    std 0, 144(1)
+; 64BIT-NEXT:    stw 3, 120(1)
+; 64BIT-NEXT:    std 3, 112(1)
+; 64BIT-NEXT:    lwz 3, 120(1)
+; 64BIT-NEXT:    sldi 4, 3, 32
+; 64BIT-NEXT:    li 3, 0
+; 64BIT-NEXT:    bl .test_byval_homogeneous_float_struct[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 128
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %s = alloca %struct.F, align 8
   call void @llvm.memset.p0.i32(ptr align 4 %s, i8 0, i32 12, i1 false)
@@ -954,37 +671,3 @@ entry:
 declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg)
 
 declare i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4)
-
-; CHECK-LABEL: name: call_test_byval_homogeneous_float_struct{{.*}}
-
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   renamable $r4 = LWZ 4, %stack.0.s :: (load (s32) from %stack.0.s + 4)
-; 32BIT-DAG:   renamable $r5 = LWZ 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
-; 32BIT-DAG:   $r3 = LI 0
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_homogeneous_float_struct:
-
-; ASM32:       stwu 1, -80(1)
-; ASM32-DAG:   lwz 4, 68(1)
-; ASM32-DAG:   lwz 5, 72(1)
-; ASM32-DAG:   stw 3, 64(1)
-; ASM32-NEXT:  bl .test_byval_homogeneous_float_struct[PR]
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT:       renamable $x3 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
-; 64BIT-NEXT:  renamable $x4 = RLDICR killed renamable $x3, 32, 31
-; 64BIT-NEXT:  $x3 = LI8 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -128(1)
-; ASM64:       lwz 3, 120(1)
-; ASM64-NEXT:  sldi 4, 3, 32
-; ASM64-NEXT:  li 3, 0
-; ASM64-NEXT:  bl .test_byval_homogeneous_float_struct[PR]
-; ASM64-NEXT:  nop

From 977d8a4bcd83797217433709201922b9deb97ae2 Mon Sep 17 00:00:00 2001
From: Vincent <llvm@viceroygroup.ca>
Date: Tue, 17 Jun 2025 06:20:41 -0700
Subject: [PATCH 0637/1322] [clang][Sema] Fixed Compound Literal is not
 Constant Expression (#143852)

Added a check for a compound literal hiding inside a function.

fixes #87867
---
 clang/docs/ReleaseNotes.rst      |  2 ++
 clang/include/clang/Sema/Scope.h | 11 +++++++++++
 clang/lib/Sema/SemaExpr.cpp      |  1 +
 clang/test/Sema/gh87867.c        | 33 ++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+)
 create mode 100644 clang/test/Sema/gh87867.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 03641f5d0ea0..6f28dbd03ca2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -692,6 +692,8 @@ Bug Fixes in This Version
   ``#include`` directive. (#GH138094)
 - Fixed a crash during constant evaluation involving invalid lambda captures
   (#GH138832)
+- Fixed compound literal is not constant expression inside initializer list
+  (#GH87867)
 - Fixed a crash when instantiating an invalid dependent friend template specialization.
   (#GH139052)
 - Fixed a crash with an invalid member function parameter list with a default
diff --git a/clang/include/clang/Sema/Scope.h b/clang/include/clang/Sema/Scope.h
index ad12a3d73413..07b9e1bc10f5 100644
--- a/clang/include/clang/Sema/Scope.h
+++ b/clang/include/clang/Sema/Scope.h
@@ -427,6 +427,17 @@ public:
     return false;
   }
 
+  /// isInObjcMethodScope - Return true if this scope is, or is contained, in an
+  /// C function body.
+  bool isInCFunctionScope() const {
+    for (const Scope *S = this; S; S = S->getParent()) {
+      if (S->isFunctionScope())
+        return true;
+    }
+
+    return false;
+  }
+
   /// isInObjcMethodScope - Return true if this scope is, or is contained in, an
   /// Objective-C method body.  Note that this method is not constant time.
   bool isInObjcMethodScope() const {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 413eff4aa294..ebc43157d4c2 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -7176,6 +7176,7 @@ Sema::BuildCompoundLiteralExpr(SourceLocation LParenLoc, TypeSourceInfo *TInfo,
   //   void func(char *para[(int [1]){ 0 }[0]);
   const Scope *S = getCurScope();
   bool IsFileScope = !CurContext->isFunctionOrMethod() &&
+                     !S->isInCFunctionScope() &&
                      (!S || !S->isFunctionPrototypeScope());
 
   // In C, compound literals are l-values for some reason.
diff --git a/clang/test/Sema/gh87867.c b/clang/test/Sema/gh87867.c
new file mode 100644
index 000000000000..0568c734424c
--- /dev/null
+++ b/clang/test/Sema/gh87867.c
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c23 %s
+
+// Compound literal doesn't need a constant expression inside a initializer-list if it is already inside a function 
+// see: https://github.com/llvm/llvm-project/issues/87867
+int foo(int *a, int b) {
+    return 0;
+}
+
+int x;
+struct{int t;} a = (struct {
+    typeof(foo(&(struct { int t; }){.t = x}.t, 0)) t; // expected-error {{initializer element is not a compile-time constant}}
+}){0};
+
+void inside_a_func(){
+    int x;
+    (void)(struct {
+        typeof(foo(&(struct { int t; }){.t = x}.t, 0)) t;
+    }){0};
+}
+
+// see: https://github.com/llvm/llvm-project/issues/143613
+#define bitcast(type, value) \
+    (((union{ typeof(value) src; type dst; }){ (value) }).dst)
+
+double placeholder = 10.0;
+double bar = bitcast(double, placeholder);  // expected-error {{initializer element is not a compile-time constant}}
+
+int main(void)
+{
+    int foo = 4;
+    foo = bitcast(int, bitcast(double, foo));
+    return 0;
+}

From 816ab1af0da1dc833f487933e7d6fb470d844001 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Tue, 17 Jun 2025 06:21:21 -0700
Subject: [PATCH 0638/1322] [NFCI][TableGen][DecoderEmitter] Cull Op handling
 when possible (#142974)

TryDecode/CheckPredicate/SoftFail MCD ops are not used by many targets.
Track the set of opcodes that were emitted and emit code for handling
TryDecode/CheckPredicate/SoftFail ops when decoding only if there were
emitted. This is purely eliminating dead code in the generated
`decodeInstruction` function.

This results in the following reduction in the size of the Disassembler
.so files with a release x86_64 release build on Linux:

```
Target                                                   Old Size        New Size  %  reduction
build/lib/libLLVMAArch64Disassembler.so.21.0git             256656          256656          0.00
build/lib/libLLVMAMDGPUDisassembler.so.21.0git              813000          808168          0.59
build/lib/libLLVMARCDisassembler.so.21.0git                  44816           43536          2.86
build/lib/libLLVMARMDisassembler.so.21.0git                 281744          278808          1.04
build/lib/libLLVMAVRDisassembler.so.21.0git                  36040           34496          4.28
build/lib/libLLVMBPFDisassembler.so.21.0git                  26248           23168         11.73
build/lib/libLLVMCSKYDisassembler.so.21.0git                 55960           53632          4.16
build/lib/libLLVMHexagonDisassembler.so.21.0git             115952          113416          2.19
build/lib/libLLVMLanaiDisassembler.so.21.0git                24360           21008         13.76
build/lib/libLLVMLoongArchDisassembler.so.21.0git            58584           56168          4.12
build/lib/libLLVMM68kDisassembler.so.21.0git                 57264           53880          5.91
build/lib/libLLVMMSP430Disassembler.so.21.0git               28896           28440          1.58
build/lib/libLLVMMipsDisassembler.so.21.0git                123128          120568          2.08
build/lib/libLLVMPowerPCDisassembler.so.21.0git              80656           78096          3.17
build/lib/libLLVMRISCVDisassembler.so.21.0git               154080          150200          2.52
build/lib/libLLVMSparcDisassembler.so.21.0git                42040           39568          5.88
build/lib/libLLVMSystemZDisassembler.so.21.0git              97056           94552          2.58
build/lib/libLLVMVEDisassembler.so.21.0git                   83944           81352          3.09
build/lib/libLLVMWebAssemblyDisassembler.so.21.0git          25280           25280          0.00
build/lib/libLLVMX86Disassembler.so.21.0git                2920624         2920624          0.00
build/lib/libLLVMXCoreDisassembler.so.21.0git                48320           44288          8.34
build/lib/libLLVMXtensaDisassembler.so.21.0git               42248           35840         15.17
```
---
 llvm/utils/TableGen/DecoderEmitter.cpp | 114 +++++++++++++++----------
 1 file changed, 71 insertions(+), 43 deletions(-)

diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 7489d369c993..37814113b467 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -222,10 +222,11 @@ public:
   DecoderEmitter(const RecordKeeper &R, StringRef PredicateNamespace)
       : RK(R), Target(R), PredicateNamespace(PredicateNamespace) {}
 
-  // Emit the decoder state machine table.
-  void emitTable(formatted_raw_ostream &OS, DecoderTable &Table, indent Indent,
-                 unsigned BitWidth, StringRef Namespace,
-                 const EncodingIDsVec &EncodingIDs) const;
+  // Emit the decoder state machine table. Returns a mask of MCD decoder ops
+  // that were emitted.
+  unsigned emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
+                     indent Indent, unsigned BitWidth, StringRef Namespace,
+                     const EncodingIDsVec &EncodingIDs) const;
   void emitInstrLenTable(formatted_raw_ostream &OS,
                          ArrayRef<unsigned> InstrLen) const;
   void emitPredicateFunction(formatted_raw_ostream &OS,
@@ -826,11 +827,12 @@ unsigned Filter::usefulness() const {
 //                              //
 //////////////////////////////////
 
-// Emit the decoder state machine table.
-void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
-                               indent Indent, unsigned BitWidth,
-                               StringRef Namespace,
-                               const EncodingIDsVec &EncodingIDs) const {
+// Emit the decoder state machine table. Returns a mask of MCD decoder ops
+// that were emitted.
+unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS,
+                                   DecoderTable &Table, indent Indent,
+                                   unsigned BitWidth, StringRef Namespace,
+                                   const EncodingIDsVec &EncodingIDs) const {
   // We'll need to be able to map from a decoded opcode into the corresponding
   // EncodingID for this specific combination of BitWidth and Namespace. This
   // is used below to index into NumberedEncodings.
@@ -884,6 +886,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
       OS << " (Fail)";
   };
 
+  unsigned OpcodeMask = 0;
+
   while (I != E) {
     assert(I < E && "incomplete decode table entry!");
 
@@ -892,6 +896,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
     OS.PadToColumn(12);
 
     const uint8_t DecoderOp = *I++;
+    OpcodeMask |= (1 << DecoderOp);
     switch (DecoderOp) {
     default:
       PrintFatalError("Invalid decode table opcode: " + Twine((int)DecoderOp) +
@@ -1027,6 +1032,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
   Indent -= 2;
 
   OS << Indent << "};\n\n";
+
+  return OpcodeMask;
 }
 
 void DecoderEmitter::emitInstrLenTable(formatted_raw_ostream &OS,
@@ -1045,19 +1052,13 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS,
   OS << Indent << "static bool checkDecoderPredicate(unsigned Idx, "
      << "const FeatureBitset &Bits) {\n";
   Indent += 2;
-  if (!Predicates.empty()) {
-    OS << Indent << "switch (Idx) {\n";
-    OS << Indent << "default: llvm_unreachable(\"Invalid index!\");\n";
-    unsigned Index = 0;
-    for (const auto &Predicate : Predicates) {
-      OS << Indent << "case " << Index++ << ":\n";
-      OS << Indent + 2 << "return (" << Predicate << ");\n";
-    }
-    OS << Indent << "}\n";
-  } else {
-    // No case statement to emit
-    OS << Indent << "llvm_unreachable(\"Invalid index!\");\n";
+  OS << Indent << "switch (Idx) {\n";
+  OS << Indent << "default: llvm_unreachable(\"Invalid index!\");\n";
+  for (const auto &[Index, Predicate] : enumerate(Predicates)) {
+    OS << Indent << "case " << Index << ":\n";
+    OS << Indent + 2 << "return (" << Predicate << ");\n";
   }
+  OS << Indent << "}\n";
   Indent -= 2;
   OS << Indent << "}\n\n";
 }
@@ -2217,8 +2218,15 @@ static void insertBits(InsnType &field, InsnType bits, unsigned startBit,
 
 // emitDecodeInstruction - Emit the templated helper function
 // decodeInstruction().
-static void emitDecodeInstruction(formatted_raw_ostream &OS,
-                                  bool IsVarLenInst) {
+static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst,
+                                  unsigned OpcodeMask) {
+  const bool HasTryDecode = OpcodeMask & ((1 << MCD::OPC_TryDecode) |
+                                          (1 << MCD::OPC_TryDecodeOrFail));
+  const bool HasCheckPredicate =
+      OpcodeMask &
+      ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail));
+  const bool HasSoftFail = OpcodeMask & (1 << MCD::OPC_SoftFail);
+
   OS << R"(
 static unsigned decodeNumToSkip(const uint8_t *&Ptr) {
   unsigned NumToSkip = *Ptr++;
@@ -2238,9 +2246,11 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     OS << ",\n                                      "
           "llvm::function_ref<void(APInt &, uint64_t)> makeUp";
   }
-  OS << R"() {
-  const FeatureBitset &Bits = STI.getFeatureBits();
+  OS << ") {\n";
+  if (HasCheckPredicate)
+    OS << "  const FeatureBitset &Bits = STI.getFeatureBits();\n";
 
+  OS << R"(
   const uint8_t *Ptr = DecodeTable;
   uint64_t CurFieldValue = 0;
   DecodeStatus S = MCDisassembler::Success;
@@ -2321,7 +2331,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
         Ptr += NumToSkip;
       }
       break;
-    }
+    })";
+  if (HasCheckPredicate) {
+    OS << R"(
     case MCD::OPC_CheckPredicate:
     case MCD::OPC_CheckPredicateOrFail: {
       bool IsFail = DecoderOp == MCD::OPC_CheckPredicateOrFail;
@@ -2343,7 +2355,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
         Ptr += NumToSkip;
       }
       break;
-    }
+    })";
+  }
+  OS << R"(
     case MCD::OPC_Decode: {
       // Decode the Opcode value.
       unsigned Opc = decodeULEB128AndIncUnsafe(Ptr);
@@ -2364,7 +2378,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
                    << ", using decoder " << DecodeIdx << ": "
                    << (S != MCDisassembler::Fail ? "PASS\n" : "FAIL\n"));
       return S;
-    }
+    })";
+  if (HasTryDecode) {
+    OS << R"(
     case MCD::OPC_TryDecode:
     case MCD::OPC_TryDecodeOrFail: {
       bool IsFail = DecoderOp == MCD::OPC_TryDecodeOrFail;
@@ -2399,17 +2415,22 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       // set before the decode attempt.
       S = MCDisassembler::Success;
       break;
-    }
-    case MCD::OPC_SoftFail: {
-      // Decode the mask values.
-      uint64_t PositiveMask = decodeULEB128AndIncUnsafe(Ptr);
-      uint64_t NegativeMask = decodeULEB128AndIncUnsafe(Ptr);
-      bool Failed = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0;
-      if (Failed)
-        S = MCDisassembler::SoftFail;
-      LLVM_DEBUG(dbgs() << Loc << ": OPC_SoftFail: " << (Failed ? "FAIL\n" : "PASS\n"));
-      break;
-    }
+    })";
+  }
+  if (HasSoftFail) {
+    OS << R"(
+      case MCD::OPC_SoftFail: {
+        // Decode the mask values.
+        uint64_t PositiveMask = decodeULEB128AndIncUnsafe(Ptr);
+        uint64_t NegativeMask = decodeULEB128AndIncUnsafe(Ptr);
+        bool Failed = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0;
+        if (Failed)
+          S = MCDisassembler::SoftFail;
+        LLVM_DEBUG(dbgs() << Loc << ": OPC_SoftFail: " << (Failed ? "FAIL\n" : "PASS\n"));
+        break;
+  })";
+  }
+  OS << R"(
     case MCD::OPC_Fail: {
       LLVM_DEBUG(dbgs() << Loc << ": OPC_Fail\n");
       return MCDisassembler::Fail;
@@ -2609,6 +2630,7 @@ namespace {
   }
 
   DecoderTableInfo TableInfo;
+  unsigned OpcodeMask = 0;
   for (const auto &Opc : OpcMap) {
     // Emit the decoder for this namespace+width combination.
     ArrayRef<EncodingAndInst> NumberedEncodingsRef(NumberedEncodings.data(),
@@ -2634,8 +2656,8 @@ namespace {
     TableInfo.Table.push_back(MCD::OPC_Fail);
 
     // Print the table to the output stream.
-    emitTable(OS, TableInfo.Table, indent(0), FC.getBitWidth(), Opc.first.first,
-              Opc.second);
+    OpcodeMask |= emitTable(OS, TableInfo.Table, indent(0), FC.getBitWidth(),
+                            Opc.first.first, Opc.second);
   }
 
   // For variable instruction, we emit a instruction length table
@@ -2643,14 +2665,20 @@ namespace {
   // You can see example usage in M68k's disassembler.
   if (IsVarLenInst)
     emitInstrLenTable(OS, InstrLen);
+
+  const bool HasCheckPredicate =
+      OpcodeMask &
+      ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail));
+
   // Emit the predicate function.
-  emitPredicateFunction(OS, TableInfo.Predicates, indent(0));
+  if (HasCheckPredicate)
+    emitPredicateFunction(OS, TableInfo.Predicates, indent(0));
 
   // Emit the decoder function.
   emitDecoderFunction(OS, TableInfo.Decoders, indent(0));
 
   // Emit the main entry point for the decoder, decodeInstruction().
-  emitDecodeInstruction(OS, IsVarLenInst);
+  emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask);
 
   OS << "\n} // namespace\n";
 }

From 9fed480f183d9cfa784228cd77b2c0a642fca697 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Tue, 17 Jun 2025 06:28:27 -0700
Subject: [PATCH 0639/1322] [BOLT] Explicitly check for returns when extending
 call continuation profile (#143295)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Call continuation logic relies on assumptions about fall-through origin:
- the branch is external to the function,
- fall-through start is at the beginning of the block,
- the block is not an entry point or a landing pad.

Leverage trace information to explicitly check whether the origin is a
return instruction, and defer to checks above only in case of
DSO-external branch source.

This covers both regular and BAT cases, addressing call continuation
fall-through undercounting in the latter mode, which improves BAT
profile quality metrics. For example, for one large binary:
- CFG discontinuity 21.83% -> 0.00%,
- CFG flow imbalance 10.77%/100.00% -> 3.40%/13.82% (weighted/worst)
- CG flow imbalance 8.49% —> 8.49%.

Depends on #143289.

Test Plan: updated callcont-fallthru.s
---
 bolt/include/bolt/Profile/DataAggregator.h | 12 +++-
 bolt/lib/Profile/DataAggregator.cpp        | 71 +++++++++++++---------
 bolt/test/X86/callcont-fallthru.s          | 70 +++++++++++----------
 3 files changed, 89 insertions(+), 64 deletions(-)

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 10d96fbeca3e..96969cf53bac 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -132,6 +132,9 @@ private:
   /// and use them later for processing and assigning profile.
   std::unordered_map<Trace, TakenBranchInfo, TraceHash> TraceMap;
   std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
+  /// Pre-populated addresses of returns, coming from pre-aggregated data or
+  /// disassembly. Used to disambiguate call-continuation fall-throughs.
+  std::unordered_set<uint64_t> Returns;
   std::unordered_map<uint64_t, uint64_t> BasicSamples;
   std::vector<PerfMemSample> MemSamples;
 
@@ -204,8 +207,8 @@ private:
   /// Return a vector of offsets corresponding to a trace in a function
   /// if the trace is valid, std::nullopt otherwise.
   std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
-  getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
-                         uint64_t Count) const;
+  getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, uint64_t Count,
+                         bool IsReturn) const;
 
   /// Record external entry into the function \p BF.
   ///
@@ -265,11 +268,14 @@ private:
                      uint64_t From, uint64_t To, uint64_t Count,
                      uint64_t Mispreds);
 
+  /// Checks if \p Addr corresponds to a return instruction.
+  bool checkReturn(uint64_t Addr);
+
   /// Register a \p Branch.
   bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
 
   /// Register a trace between two LBR entries supplied in execution order.
-  bool doTrace(const Trace &Trace, uint64_t Count);
+  bool doTrace(const Trace &Trace, uint64_t Count, bool IsReturn);
 
   /// Parser helpers
   /// Return false if we exhausted our parser buffer and finished parsing
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 118629b04f6f..178c9d3a6373 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -730,50 +730,54 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
   return true;
 }
 
+bool DataAggregator::checkReturn(uint64_t Addr) {
+  auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
+  if (llvm::is_contained(Returns, Addr))
+    return true;
+
+  BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
+  if (!Func)
+    return false;
+
+  const uint64_t Offset = Addr - Func->getAddress();
+  if (Func->hasInstructions()
+          ? isReturn(Func->getInstructionAtOffset(Offset))
+          : isReturn(Func->disassembleInstructionAtOffset(Offset))) {
+    Returns.emplace(Addr);
+    return true;
+  }
+  return false;
+}
+
 bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
                               uint64_t Mispreds) {
-  // Returns whether \p Offset in \p Func contains a return instruction.
-  auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
-    auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
-    return Func.hasInstructions()
-               ? isReturn(Func.getInstructionAtOffset(Offset))
-               : isReturn(Func.disassembleInstructionAtOffset(Offset));
-  };
-
   // Mutates \p Addr to an offset into the containing function, performing BAT
   // offset translation and parent lookup.
   //
-  // Returns the containing function (or BAT parent) and whether the address
-  // corresponds to a return (if \p IsFrom) or a call continuation (otherwise).
+  // Returns the containing function (or BAT parent).
   auto handleAddress = [&](uint64_t &Addr, bool IsFrom) {
     BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
     if (!Func) {
       Addr = 0;
-      return std::pair{Func, false};
+      return Func;
     }
 
     Addr -= Func->getAddress();
 
-    bool IsRet = IsFrom && checkReturn(*Func, Addr);
-
     if (BAT)
       Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
 
     if (BinaryFunction *ParentFunc = getBATParentFunction(*Func))
-      Func = ParentFunc;
+      return ParentFunc;
 
-    return std::pair{Func, IsRet};
+    return Func;
   };
 
-  auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true);
-  auto [ToFunc, _] = handleAddress(To, /*IsFrom*/ false);
+  BinaryFunction *FromFunc = handleAddress(From, /*IsFrom*/ true);
+  BinaryFunction *ToFunc = handleAddress(To, /*IsFrom*/ false);
   if (!FromFunc && !ToFunc)
     return false;
 
-  // Ignore returns.
-  if (IsReturn)
-    return true;
-
   // Treat recursive control transfers as inter-branches.
   if (FromFunc == ToFunc && To != 0) {
     recordBranch(*FromFunc, From, To, Count, Mispreds);
@@ -783,7 +787,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
   return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds);
 }
 
-bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
+bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count,
+                             bool IsReturn) {
   const uint64_t From = Trace.From, To = Trace.To;
   BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From);
   BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To);
@@ -808,8 +813,8 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
   const uint64_t FuncAddress = FromFunc->getAddress();
   std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
       BAT && BAT->isBATFunction(FuncAddress)
-          ? BAT->getFallthroughsInTrace(FuncAddress, From, To)
-          : getFallthroughsInTrace(*FromFunc, Trace, Count);
+          ? BAT->getFallthroughsInTrace(FuncAddress, From - IsReturn, To)
+          : getFallthroughsInTrace(*FromFunc, Trace, Count, IsReturn);
   if (!FTs) {
     LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n');
     NumInvalidTraces += Count;
@@ -831,7 +836,7 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
 
 std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
 DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
-                                       uint64_t Count) const {
+                                       uint64_t Count, bool IsReturn) const {
   SmallVector<std::pair<uint64_t, uint64_t>, 16> Branches;
 
   BinaryContext &BC = BF.getBinaryContext();
@@ -865,9 +870,13 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
 
   // Adjust FromBB if the first LBR is a return from the last instruction in
   // the previous block (that instruction should be a call).
-  if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) &&
-      From == FromBB->getOffset() && !FromBB->isEntryPoint() &&
-      !FromBB->isLandingPad()) {
+  if (IsReturn) {
+    if (From)
+      FromBB = BF.getBasicBlockContainingOffset(From - 1);
+    else
+      LLVM_DEBUG(dbgs() << "return to the function start: " << Trace << '\n');
+  } else if (Trace.Branch == Trace::EXTERNAL && From == FromBB->getOffset() &&
+             !FromBB->isEntryPoint() && !FromBB->isLandingPad()) {
     const BinaryBasicBlock *PrevBB =
         BF.getLayout().getBlock(FromBB->getIndex() - 1);
     if (PrevBB->getSuccessor(FromBB->getLabel())) {
@@ -1557,11 +1566,13 @@ void DataAggregator::processBranchEvents() {
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
 
   for (const auto &[Trace, Info] : Traces) {
-    if (Trace.Branch != Trace::FT_ONLY &&
+    bool IsReturn = checkReturn(Trace.Branch);
+    // Ignore returns.
+    if (!IsReturn && Trace.Branch != Trace::FT_ONLY &&
         Trace.Branch != Trace::FT_EXTERNAL_ORIGIN)
       doBranch(Trace.Branch, Trace.From, Info.TakenCount, Info.MispredCount);
     if (Trace.To != Trace::BR_ONLY)
-      doTrace(Trace, Info.TakenCount);
+      doTrace(Trace, Info.TakenCount, IsReturn);
   }
   printBranchSamplesDiagnostics();
 }
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index 4994cfb541ee..c2ef024db947 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -4,29 +4,43 @@
 # RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
 ## Link against a DSO to ensure PLT entries.
 # RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
-# RUN: link_fdata %s %t %t.pat PREAGGT1
-# RUN: link_fdata %s %t %t.pat2 PREAGGT2
-# RUN-DISABLED: link_fdata %s %t %t.patplt PREAGGPLT
+# Trace to a call continuation, not a landing pad/entry point
+# RUN: link_fdata %s %t %t.pa-base PREAGG-BASE
+# Trace from a return to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-ret PREAGG-RET
+# Trace from an external location to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT
+# RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT
 
 # RUN: llvm-strip --strip-unneeded %t -o %t.strip
 # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
 
 ## Check pre-aggregated traces attach call continuation fallthrough count
-# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s
+## in the basic case (not an entry point, not a landing pad).
+# RUN: llvm-bolt %t.noeh --pa -p %t.pa-base -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-BASE
 
-## Check pre-aggregated traces don't attach call continuation fallthrough count
-## to secondary entry point (unstripped)
-# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
-## Check pre-aggregated traces don't attach call continuation fallthrough count
-## to landing pad (stripped, LP)
-# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
+## Check pre-aggregated traces from a return attach call continuation
+## fallthrough count to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pa-ret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+## Check pre-aggregated traces from a return attach call continuation
+## fallthrough count to landing pad (stripped, landing pad)
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-ret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+
+## Check pre-aggregated traces from external location don't attach call
+## continuation fallthrough count to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pa-ext -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
+## Check pre-aggregated traces from external location don't attach call
+## continuation fallthrough count to landing pad (stripped, landing pad)
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
 
 ## Check pre-aggregated traces don't report zero-sized PLT fall-through as
 ## invalid trace
-# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.patplt -o %t.out | FileCheck %s \
+# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \
 # RUN-DISABLED:   --check-prefix=CHECK-PLT
 # CHECK-PLT: traces mismatching disassembled function contents: 0
 
@@ -56,11 +70,11 @@ main:
 Ltmp0_br:
 	callq	puts@PLT
 ## Check PLT traces are accepted
-# PREAGGPLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
+# PREAGG-PLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
 ## Target is an external-origin call continuation
-# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
-# CHECK:      callq puts@PLT
-# CHECK-NEXT: count: 2
+# PREAGG-BASE: T X:0 #Ltmp1# #Ltmp4_br# 2
+# CHECK-BASE:      callq puts@PLT
+# CHECK-BASE-NEXT: count: 2
 
 Ltmp1:
 	movq	-0x10(%rbp), %rax
@@ -71,24 +85,18 @@ Ltmp4:
 	cmpl	$0x0, -0x14(%rbp)
 Ltmp4_br:
 	je	Ltmp0
-# CHECK2:      je .Ltmp0
-# CHECK2-NEXT: count: 3
 
 	movl	$0xa, -0x18(%rbp)
 	callq	foo
 ## Target is a binary-local call continuation
-# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
-# CHECK:      callq foo
-# CHECK-NEXT: count: 1
-
-## PLT call continuation fallthrough spanning the call
-# CHECK2:      callq foo
-# CHECK2-NEXT: count: 3
-
+# PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
 ## Target is a secondary entry point (unstripped) or a landing pad (stripped)
-# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
-# CHECK3:      callq foo
-# CHECK3-NEXT: count: 0
+# PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1
+
+# CHECK-ATTACH:      callq foo
+# CHECK-ATTACH-NEXT: count: 1
+# CHECK-SKIP:        callq foo
+# CHECK-SKIP-NEXT:   count: 0
 
 Ltmp3:
 	cmpl	$0x0, -0x18(%rbp)

From 917bc909673a491fe070fe41c4ad112bcffd4c06 Mon Sep 17 00:00:00 2001
From: William Moses <gh@wsmoses.com>
Date: Tue, 17 Jun 2025 06:41:15 -0700
Subject: [PATCH 0640/1322] [MLIR][LLVMIR] Mark Funcop as affinescope (#144456)

All functions are conceptually an affine scope.
---
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index eda1d544cd81..68fa620d239b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -1829,7 +1829,7 @@ def LLVM_ComdatOp : LLVM_Op<"comdat", [NoTerminator, NoRegionArguments, SymbolTa
 }
 
 def LLVM_LLVMFuncOp : LLVM_Op<"func", [
-    AutomaticAllocationScope, IsolatedFromAbove, FunctionOpInterface
+    AffineScope, AutomaticAllocationScope, IsolatedFromAbove, FunctionOpInterface
   ]> {
   let summary = "LLVM dialect function.";
 

From de3339063ae5a926ab2ed17651a0e628b9c34fb0 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Tue, 17 Jun 2025 15:44:16 +0200
Subject: [PATCH 0641/1322] [bazel] Port
 b4e39e4ff923334a8a1fdcc6d92b01d3885a01f2

---
 utils/bazel/llvm-project-overlay/llvm/config.bzl               | 1 +
 .../llvm-project-overlay/llvm/include/llvm/Config/config.h     | 3 +++
 utils/bazel/llvm_configs/config.h.cmake                        | 3 +++
 3 files changed, 7 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl
index d9d3666a3ecc..7cb4b7e9ffe7 100644
--- a/utils/bazel/llvm-project-overlay/llvm/config.bzl
+++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl
@@ -43,6 +43,7 @@ posix_defines = [
     "HAVE_SETENV_R=1",
     "HAVE_STRERROR_R=1",
     "HAVE_SYSEXITS_H=1",
+    "HAVE_SYS_IOCTL_H=1",
     "HAVE_UNISTD_H=1",
 ]
 
diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
index 3ef1d0c4b165..feac6a9d3308 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
@@ -171,6 +171,9 @@
 /* Define to 1 if you have the <sys/mman.h> header file. */
 #define HAVE_SYS_MMAN_H 1
 
+/* Define to 1 if you have the <sys/ioctl.h> header file. */
+/* HAVE_SYS_IOCTL_H defined in Bazel */
+
 /* Define to 1 if stat struct has st_mtimespec member .*/
 /* #undef HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC */
 
diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake
index 06d475639791..ce83de8e4cba 100644
--- a/utils/bazel/llvm_configs/config.h.cmake
+++ b/utils/bazel/llvm_configs/config.h.cmake
@@ -164,6 +164,9 @@
 /* Define to 1 if you have the <sys/mman.h> header file. */
 #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H}
 
+/* Define to 1 if you have the <sys/ioctl.h> header file. */
+#cmakedefine HAVE_SYS_IOCTL_H ${HAVE_SYS_IOCTL_H}
+
 /* Define to 1 if stat struct has st_mtimespec member .*/
 #cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC}
 

From dc72b91ffedf791a44a1af19b00064a2a3c59ab9 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234@gmail.com>
Date: Tue, 17 Jun 2025 09:59:38 -0400
Subject: [PATCH 0642/1322] [AArch64] Report icmp as free if it can be folded
 into ands (#143286)

Since changing the backend to fold x >= 1 / x < 1 -> x > 0 / x <= 0 and
x <= -1 / x > -1 -> x > 0 / x <= 0, this should be reflected in the
cost.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 25 +++++++++++++------
 llvm/test/Analysis/CostModel/AArch64/cmp.ll   | 21 +++++++++++++++-
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aed..ed051f295752 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4353,15 +4353,26 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
     }
   }
 
-  // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
-  // FIXME: This can apply to more conditions and add/sub if it can be shown to
-  // be profitable.
+  // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
+  // icmp(and, 0) as free, as we can make use of ands, but only if the
+  // comparison is not unsigned.
   if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
-      ICmpInst::isEquality(VecPred) &&
+      !CmpInst::isUnsigned(VecPred) &&
       TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
-      match(I->getOperand(1), m_Zero()) &&
-      match(I->getOperand(0), m_And(m_Value(), m_Value())))
-    return 0;
+      match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
+    if (match(I->getOperand(1), m_Zero()))
+      return 0;
+
+    // x >= 1 / x < 1 -> x > 0 / x <= 0
+    if (match(I->getOperand(1), m_One()) &&
+        (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
+      return 0;
+
+    // x <= -1 / x > -1 -> x > 0 / x <= 0
+    if (match(I->getOperand(1), m_AllOnes()) &&
+        (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
+      return 0;
+  }
 
   // The base case handles scalable vectors fine for now, since it treats the
   // cost as 1 * legalization cost.
diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index aba113865af1..16b3913f5202 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -53,6 +53,14 @@ define void @andcmp() {
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32 = icmp eq i32 %a32, 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %a64 = and i64 undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64 = icmp ne i64 %a64, 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32ge = icmp sge i32 %a32, 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32le = icmp slt i32 %a32, 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32leneg = icmp sle i32 %a32, -1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32gtneg = icmp sgt i32 %a32, -1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64ge = icmp sge i64 %a64, 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64le = icmp slt i64 %a64, 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64leneg = icmp sle i64 %a64, -1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64gtneg = icmp sgt i64 %a64, -1
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a128 = and i128 undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %c128 = icmp eq i128 %a128, 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %av16i8 = and <16 x i8> undef, undef
@@ -62,7 +70,7 @@ define void @andcmp() {
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %av4i32 = and <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %cv4i32 = icmp ne <4 x i32> %av4i32, zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %c32not0 = icmp eq i32 %a32, 1
-; CHECK-NEXT:  Cost Model: Found costs of 1 for: %c64sle = icmp sle i64 %a64, 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64sle = icmp sle i64 %a64, 0
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a8 = and i8 undef, undef
@@ -73,6 +81,17 @@ define void @andcmp() {
   %c32 = icmp eq i32 %a32, 0
   %a64 = and i64 undef, undef
   %c64 = icmp ne i64 %a64, 0
+
+  %c32ge = icmp sge i32 %a32, 1
+  %c32le = icmp slt i32 %a32, 1
+  %c32leneg = icmp sle i32 %a32, -1
+  %c32gtneg  = icmp sgt i32 %a32, -1
+
+  %c64ge = icmp sge i64 %a64, 1
+  %c64le = icmp slt i64 %a64, 1
+  %c64leneg = icmp sle i64 %a64, -1
+  %c64gtneg  = icmp sgt i64 %a64, -1
+
   %a128 = and i128 undef, undef
   %c128 = icmp eq i128 %a128, zeroinitializer
   %av16i8 = and <16 x i8> undef, undef

From 414710c753d87d314529857e15d1ad01a76c6605 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen@sifive.com>
Date: Tue, 17 Jun 2025 23:03:14 +0900
Subject: [PATCH 0643/1322] [SLP] Fix isCommutative to check uses of the
 original instruction instead of the converted instruction. (#143094)

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 36 +++++++++++++++----
 .../Transforms/SLPVectorizer/isCommutative.ll | 34 ++++++++++++++++++
 2 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/isCommutative.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9a7e9b75da51..8bff3c018714 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -511,15 +511,25 @@ static bool isSplat(ArrayRef<Value *> VL) {
 }
 
 /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
-static bool isCommutative(Instruction *I) {
+/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
+/// patterns that make it effectively commutative (like equality comparisons
+/// with zero).
+/// In most cases, users should not call this function directly (since \p I and
+/// \p InstWithUses are the same). However, when analyzing interchangeable
+/// instructions, we need to use the converted opcode along with the original
+/// uses.
+/// \param I The instruction to check for commutativity
+/// \param InstWithUses The instruction whose uses are analyzed for special
+/// patterns
+static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
   if (auto *Cmp = dyn_cast<CmpInst>(I))
     return Cmp->isCommutative();
   if (auto *BO = dyn_cast<BinaryOperator>(I))
     return BO->isCommutative() ||
            (BO->getOpcode() == Instruction::Sub &&
-            !BO->hasNUsesOrMore(UsesLimit) &&
+            !InstWithUses->hasNUsesOrMore(UsesLimit) &&
             all_of(
-                BO->uses(),
+                InstWithUses->uses(),
                 [](const Use &U) {
                   // Commutative, if icmp eq/ne sub, 0
                   CmpPredicate Pred;
@@ -536,14 +546,24 @@ static bool isCommutative(Instruction *I) {
                           Flag->isOne());
                 })) ||
            (BO->getOpcode() == Instruction::FSub &&
-            !BO->hasNUsesOrMore(UsesLimit) &&
-            all_of(BO->uses(), [](const Use &U) {
+            !InstWithUses->hasNUsesOrMore(UsesLimit) &&
+            all_of(InstWithUses->uses(), [](const Use &U) {
               return match(U.getUser(),
                            m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
             }));
   return I->isCommutative();
 }
 
+/// This is a helper function to check whether \p I is commutative.
+/// This is a convenience wrapper that calls the two-parameter version of
+/// isCommutative with the same instruction for both parameters. This is
+/// the common case where the instruction being checked for commutativity
+/// is the same as the instruction whose uses are analyzed for special
+/// patterns (see the two-parameter version above for details).
+/// \param I The instruction to check for commutativity
+/// \returns true if the instruction is commutative, false otherwise
+static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
+
 template <typename T>
 static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                      unsigned Offset) {
@@ -2898,7 +2918,11 @@ public:
           continue;
         }
         auto [SelectedOp, Ops] = convertTo(cast<Instruction>(V), S);
-        bool IsInverseOperation = !isCommutative(SelectedOp);
+        // We cannot check commutativity by the converted instruction
+        // (SelectedOp) because isCommutative also examines def-use
+        // relationships.
+        bool IsInverseOperation =
+            !isCommutative(SelectedOp, cast<Instruction>(V));
         for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
           OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
diff --git a/llvm/test/Transforms/SLPVectorizer/isCommutative.ll b/llvm/test/Transforms/SLPVectorizer/isCommutative.ll
new file mode 100644
index 000000000000..704ac8295f55
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/isCommutative.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
+
+define i16 @check_isCommutative_with_the_original_source() {
+; CHECK-LABEL: @check_isCommutative_with_the_original_source(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND3:%.*]] = select i1 true, i16 1, i16 0
+; CHECK-NEXT:    ret i16 [[COND3]]
+;
+entry:
+  %sub = sub i16 0, -1
+  %cmp = icmp eq i16 %sub, 1
+
+  %sub1 = sub i16 0, -1
+  %cmp2 = icmp eq i16 %sub1, 1
+  %cond3 = select i1 %cmp2, i16 1, i16 0
+
+  %sub5 = sub nsw i16 0, 0
+  %cmp6 = icmp eq i16 %sub5, 0
+  %cmp9 = icmp eq i16 %sub5, 0
+
+  %sub12 = sub nsw i16 0, 0
+  %cmp13 = icmp eq i16 %sub12, 0
+
+  %sub16 = sub nsw i16 0, 0
+  %cmp17 = icmp eq i16 %sub16, 0
+
+  %sub20 = sub nsw i16 0, 0
+  %cmp21 = icmp eq i16 %sub20, 0
+  %cmp24 = icmp eq i16 %sub20, 0
+
+  ret i16 %cond3
+}
+

From 35f6d917206d79ab0e3d382a36ca05ccc13983d5 Mon Sep 17 00:00:00 2001
From: Richard Howell <rmaz@users.noreply.github.com>
Date: Tue, 17 Jun 2025 07:18:50 -0700
Subject: [PATCH 0644/1322] [lld] check cache in loadDylib before real_path
 (#143595)

---
 lld/MachO/DriverUtils.cpp              | 40 +++++++++++---
 lld/test/MachO/reexport-with-symlink.s | 74 ++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 6 deletions(-)
 create mode 100644 lld/test/MachO/reexport-with-symlink.s

diff --git a/lld/MachO/DriverUtils.cpp b/lld/MachO/DriverUtils.cpp
index f7f6be049f0e..a3b722f13dac 100644
--- a/lld/MachO/DriverUtils.cpp
+++ b/lld/MachO/DriverUtils.cpp
@@ -225,14 +225,21 @@ std::optional<StringRef> macho::resolveDylibPath(StringRef dylibPath) {
 // especially if it's a commonly re-exported core library.
 static DenseMap<CachedHashStringRef, DylibFile *> loadedDylibs;
 
+static StringRef realPathIfDifferent(StringRef path) {
+  SmallString<128> realPathBuf;
+  if (fs::real_path(path, realPathBuf))
+    return StringRef();
+
+  SmallString<128> absPathBuf = path;
+  if (!fs::make_absolute(absPathBuf) && realPathBuf == absPathBuf)
+    return StringRef();
+
+  return uniqueSaver().save(StringRef(realPathBuf));
+}
+
 DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella,
                             bool isBundleLoader, bool explicitlyLinked) {
-  // Frameworks can be found from different symlink paths, so resolve
-  // symlinks before looking up in the dylib cache.
-  SmallString<128> realPath;
-  std::error_code err = fs::real_path(mbref.getBufferIdentifier(), realPath);
-  CachedHashStringRef path(!err ? uniqueSaver().save(StringRef(realPath))
-                                : mbref.getBufferIdentifier());
+  CachedHashStringRef path(mbref.getBufferIdentifier());
   DylibFile *&file = loadedDylibs[path];
   if (file) {
     if (explicitlyLinked)
@@ -240,6 +247,22 @@ DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella,
     return file;
   }
 
+  // Frameworks can be found from different symlink paths, so resolve
+  // symlinks and look up in the dylib cache.
+  CachedHashStringRef realPath(
+      realPathIfDifferent(mbref.getBufferIdentifier()));
+  if (!realPath.val().empty()) {
+    // Avoid map insertions here so that we do not invalidate the "file"
+    // reference.
+    auto it = loadedDylibs.find(realPath);
+    if (it != loadedDylibs.end()) {
+      DylibFile *realfile = it->second;
+      if (explicitlyLinked)
+        realfile->setExplicitlyLinked();
+      return realfile;
+    }
+  }
+
   DylibFile *newFile;
   file_magic magic = identify_magic(mbref.getBuffer());
   if (magic == file_magic::tapi_file) {
@@ -292,6 +315,11 @@ DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella,
             sys::path::filename(newFile->installName) + "' because " +
             config->clientName + " is not an allowed client");
   }
+
+  // If the load path was a symlink, cache the real path too.
+  if (!realPath.val().empty())
+    loadedDylibs[realPath] = newFile;
+
   return newFile;
 }
 
diff --git a/lld/test/MachO/reexport-with-symlink.s b/lld/test/MachO/reexport-with-symlink.s
new file mode 100644
index 000000000000..a6b5992713f3
--- /dev/null
+++ b/lld/test/MachO/reexport-with-symlink.s
@@ -0,0 +1,74 @@
+# REQUIRES: aarch64, shell
+# RUN: rm -rf %t; split-file %s %t
+# RUN: ln -s Versions/A/Developer %t/Developer/Library/Frameworks/Developer.framework/
+# RUN: llvm-mc -filetype obj -triple arm64-apple-macos11.0 %t/test.s -o %t/test.o
+# RUN: %lld -arch arm64 -platform_version macos 11.0 11.0 -o %t/test -framework Developer -F %t/Developer/Library/Frameworks -L %t/Developer/usr/lib %t/test.o -t | FileCheck %s
+
+# CHECK: {{.*}}/Developer/Library/Frameworks/Developer.framework/Developer
+# CHECK: {{.*}}/Developer/usr/lib/libDeveloperSupport.tbd(@rpath/libDeveloperSupport.dylib)
+# CHECK-NOT: {{.*}}/Developer/Library/Frameworks/Developer.framework/Versions/A/Developer
+
+#--- Developer/Library/Frameworks/Developer.framework/Versions/A/Developer
+{
+  "tapi_tbd_version": 5,
+  "main_library": {
+    "target_info": [
+      {
+        "target": "arm64-macos"
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/Developer.framework/Developer"
+      }
+    ],
+    "exported_symbols": [
+      {
+        "text": {
+          "global": ["_funcPublic"]
+        }
+      }
+    ]
+  }
+}
+#--- Developer/usr/lib/libDeveloperSupport.tbd
+{
+  "tapi_tbd_version": 5,
+  "main_library": {
+    "target_info": [
+      {
+        "target": "arm64-macos"
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/libDeveloperSupport.dylib"
+      }
+    ],
+    "reexported_libraries": [
+      {
+        "names": [
+          "@rpath/Developer.framework/Versions/A/Developer"
+        ]
+      }
+    ],
+    "exported_symbols": [
+      {
+        "text": {
+          "global": ["_funcSupport"]
+        }
+      }
+    ]
+  }
+}
+#--- test.s
+.text
+.globl _main
+.linker_option "-lDeveloperSupport"
+
+_main:
+  ret
+
+.data
+  .quad _funcPublic
+  .quad _funcSupport

From 0a7b0c844c59189ad4f5072b73d7dfdfd78e76b7 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Tue, 17 Jun 2025 15:24:26 +0100
Subject: [PATCH 0645/1322] [lldb][Expression] Remove IR pointer checker
 (#144483)

Currently when jitting expressions, LLDB scans the IR instructions of
the `$__lldb_expr` and will insert a call to a utility function for each
load/store instruction. The purpose of the utility funciton is to
dereference the load/store operand. If that operand was an invalid
pointer the utility function would trap and LLDB asks the IR checker
whether it was responsible for the trap, in which case it prints out an
error message saying the expression dereferenced an invalid pointer.

This is a lot of setup for not much gain. In fact, creating/running this
utility expression shows up as ~2% of the expression evaluation time
(though we cache them for subsequent expressions). And the error message
we get out of it is arguably less useful than if we hadn't instrumented
the IR. It was also untested.

Before:
```
(lldb) expr int a = *returns_invalid_ptr()

error: Execution was interrupted, reason: Attempted to dereference an invalid pointer..
The process has been returned to the state before expression evaluation.
```

After:
```
(lldb) expr int a = *returns_invalid_ptr()

error: Expression execution was interrupted: EXC_BAD_ACCESS (code=1, address=0x5).
The process has been returned to the state before expression evaluation.
```

This patch removes this IR checker.
---
 .../Clang/IRDynamicChecks.cpp                 | 107 +-----------------
 .../ExpressionParser/Clang/IRDynamicChecks.h  |   1 -
 .../lldb-dap/save-core/TestDAP_save_core.py   |   6 -
 3 files changed, 4 insertions(+), 110 deletions(-)

diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
index c7c292a8a7e4..6ef5d3f5be6d 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
@@ -32,31 +32,16 @@ using namespace lldb_private;
 
 static char ID;
 
-#define VALID_POINTER_CHECK_NAME "_$__lldb_valid_pointer_check"
 #define VALID_OBJC_OBJECT_CHECK_NAME "$__lldb_objc_object_check"
 
-static const char g_valid_pointer_check_text[] =
-    "extern \"C\" void\n"
-    "_$__lldb_valid_pointer_check (unsigned char *$__lldb_arg_ptr)\n"
-    "{\n"
-    "    unsigned char $__lldb_local_val = *$__lldb_arg_ptr;\n"
-    "}";
-
 ClangDynamicCheckerFunctions::ClangDynamicCheckerFunctions()
     : DynamicCheckerFunctions(DCF_Clang) {}
 
 ClangDynamicCheckerFunctions::~ClangDynamicCheckerFunctions() = default;
 
-llvm::Error ClangDynamicCheckerFunctions::Install(
-    DiagnosticManager &diagnostic_manager, ExecutionContext &exe_ctx) {
-  Expected<std::unique_ptr<UtilityFunction>> utility_fn =
-      exe_ctx.GetTargetRef().CreateUtilityFunction(
-          g_valid_pointer_check_text, VALID_POINTER_CHECK_NAME,
-          lldb::eLanguageTypeC, exe_ctx);
-  if (!utility_fn)
-    return utility_fn.takeError();
-  m_valid_pointer_check = std::move(*utility_fn);
-
+llvm::Error
+ClangDynamicCheckerFunctions::Install(DiagnosticManager &diagnostic_manager,
+                                      ExecutionContext &exe_ctx) {
   if (Process *process = exe_ctx.GetProcessPtr()) {
     ObjCLanguageRuntime *objc_language_runtime =
         ObjCLanguageRuntime::Get(*process);
@@ -78,11 +63,7 @@ bool ClangDynamicCheckerFunctions::DoCheckersExplainStop(lldb::addr_t addr,
   // FIXME: We have to get the checkers to know why they scotched the call in
   // more detail,
   // so we can print a better message here.
-  if (m_valid_pointer_check && m_valid_pointer_check->ContainsAddress(addr)) {
-    message.Printf("Attempted to dereference an invalid pointer.");
-    return true;
-  } else if (m_objc_object_check &&
-             m_objc_object_check->ContainsAddress(addr)) {
+  if (m_objc_object_check && m_objc_object_check->ContainsAddress(addr)) {
     message.Printf("Attempted to dereference an invalid ObjC Object or send it "
                    "an unrecognized selector");
     return true;
@@ -223,29 +204,6 @@ protected:
     return true;
   }
 
-  /// Build a function pointer for a function with signature void
-  /// (*)(uint8_t*) with a given address
-  ///
-  /// \param[in] start_address
-  ///     The address of the function.
-  ///
-  /// \return
-  ///     The function pointer, for use in a CallInst.
-  llvm::FunctionCallee BuildPointerValidatorFunc(lldb::addr_t start_address) {
-    llvm::Type *param_array[1];
-
-    param_array[0] = const_cast<llvm::PointerType *>(GetI8PtrTy());
-
-    ArrayRef<llvm::Type *> params(param_array, 1);
-
-    FunctionType *fun_ty = FunctionType::get(
-        llvm::Type::getVoidTy(m_module.getContext()), params, true);
-    PointerType *fun_ptr_ty = PointerType::getUnqual(m_module.getContext());
-    Constant *fun_addr_int =
-        ConstantInt::get(GetIntptrTy(), start_address, false);
-    return {fun_ty, ConstantExpr::getIntToPtr(fun_addr_int, fun_ptr_ty)};
-  }
-
   /// Build a function pointer for a function with signature void
   /// (*)(uint8_t*, uint8_t*) with a given address
   ///
@@ -300,53 +258,6 @@ private:
   IntegerType *m_intptr_ty = nullptr;
 };
 
-class ValidPointerChecker : public Instrumenter {
-public:
-  ValidPointerChecker(llvm::Module &module,
-                      std::shared_ptr<UtilityFunction> checker_function)
-      : Instrumenter(module, checker_function),
-        m_valid_pointer_check_func(nullptr) {}
-
-  ~ValidPointerChecker() override = default;
-
-protected:
-  bool InstrumentInstruction(llvm::Instruction *inst) override {
-    Log *log = GetLog(LLDBLog::Expressions);
-
-    LLDB_LOGF(log, "Instrumenting load/store instruction: %s\n",
-              PrintValue(inst).c_str());
-
-    if (!m_valid_pointer_check_func)
-      m_valid_pointer_check_func =
-          BuildPointerValidatorFunc(m_checker_function->StartAddress());
-
-    llvm::Value *dereferenced_ptr = nullptr;
-
-    if (llvm::LoadInst *li = dyn_cast<llvm::LoadInst>(inst))
-      dereferenced_ptr = li->getPointerOperand();
-    else if (llvm::StoreInst *si = dyn_cast<llvm::StoreInst>(inst))
-      dereferenced_ptr = si->getPointerOperand();
-    else
-      return false;
-
-    // Insert an instruction to call the helper with the result
-    CallInst::Create(m_valid_pointer_check_func, dereferenced_ptr, "",
-                     inst->getIterator());
-
-    return true;
-  }
-
-  bool InspectInstruction(llvm::Instruction &i) override {
-    if (isa<llvm::LoadInst>(&i) || isa<llvm::StoreInst>(&i))
-      RegisterInstruction(i);
-
-    return true;
-  }
-
-private:
-  llvm::FunctionCallee m_valid_pointer_check_func;
-};
-
 class ObjcObjectChecker : public Instrumenter {
 public:
   ObjcObjectChecker(llvm::Module &module,
@@ -527,16 +438,6 @@ bool IRDynamicChecks::runOnModule(llvm::Module &M) {
     return false;
   }
 
-  if (m_checker_functions.m_valid_pointer_check) {
-    ValidPointerChecker vpc(M, m_checker_functions.m_valid_pointer_check);
-
-    if (!vpc.Inspect(*function))
-      return false;
-
-    if (!vpc.Instrument())
-      return false;
-  }
-
   if (m_checker_functions.m_objc_object_check) {
     ObjcObjectChecker ooc(M, m_checker_functions.m_objc_object_check);
 
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h
index ff20c1f08be0..f67229afc215 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h
@@ -53,7 +53,6 @@ public:
 
   bool DoCheckersExplainStop(lldb::addr_t addr, Stream &message) override;
 
-  std::shared_ptr<UtilityFunction> m_valid_pointer_check;
   std::shared_ptr<UtilityFunction> m_objc_object_check;
 };
 
diff --git a/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py b/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py
index 4045dd8fb656..77c1e47914a3 100644
--- a/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py
+++ b/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py
@@ -32,13 +32,7 @@ class TestDAP_save_core(lldbdap_testcase.DAPTestCaseBase):
         # Getting dap stack trace may trigger __lldb_caller_function JIT module to be created.
         self.get_stackFrames(startFrame=0)
 
-        # Evaluating an expression that cause "_$__lldb_valid_pointer_check" JIT module to be created.
-        expression = 'printf("this is a test")'
-        self.dap_server.request_evaluate(expression, context="watch")
-
-        # Verify "_$__lldb_valid_pointer_check" JIT module is created.
         modules = self.dap_server.get_modules()
-        self.assertTrue(modules["_$__lldb_valid_pointer_check"])
         thread_count = len(self.dap_server.get_threads())
 
         core_stack = self.getBuildArtifact("core.stack.dmp")

From 8f797542258f6e682eb251d0851922a1ac08fb44 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 17 Jun 2025 15:30:08 +0100
Subject: [PATCH 0646/1322] [SCEV] Better preserve wrapping info in
 SimplifyICmpOperands for UGE. (#144404)

Update SimplifyICmpOperands to only try subtracting 1 from RHS first, if
RHS is an op we can fold the subtract directly into. Otherwise try
adding to LHS first, as we can preserve NUW flags.

This improves results in a few cases, including the modified test case
from berkeley-abc and new code to be added in
https://github.com/llvm/llvm-project/pull/128061.

Note that there are more cases where the results can be improved by
better ordering here which I'll try to investigate as follow-up.

PR: https://github.com/llvm/llvm-project/pull/144404
---
 llvm/lib/Analysis/ScalarEvolution.cpp                 | 11 ++++++++++-
 .../IndVarSimplify/simplify-icmp-operands-order.ll    |  9 +++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 2dfe625eb0dc..dd309bc2c54a 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10892,7 +10892,12 @@ bool ScalarEvolution::SimplifyICmpOperands(CmpPredicate &Pred, const SCEV *&LHS,
     }
     break;
   case ICmpInst::ICMP_UGE:
-    if (!getUnsignedRangeMin(RHS).isMinValue()) {
+    // If RHS is an op we can fold the -1, try that first.
+    // Otherwise prefer LHS to preserve the nuw flag.
+    if ((isa<SCEVConstant>(RHS) ||
+         (isa<SCEVAddExpr, SCEVAddRecExpr>(RHS) &&
+          isa<SCEVConstant>(cast<SCEVNAryExpr>(RHS)->getOperand(0)))) &&
+        !getUnsignedRangeMin(RHS).isMinValue()) {
       RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS);
       Pred = ICmpInst::ICMP_UGT;
       Changed = true;
@@ -10901,6 +10906,10 @@ bool ScalarEvolution::SimplifyICmpOperands(CmpPredicate &Pred, const SCEV *&LHS,
                        SCEV::FlagNUW);
       Pred = ICmpInst::ICMP_UGT;
       Changed = true;
+    } else if (!getUnsignedRangeMin(RHS).isMinValue()) {
+      RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS);
+      Pred = ICmpInst::ICMP_UGT;
+      Changed = true;
     }
     break;
   default:
diff --git a/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
index b0dbbd5eaedf..fb2fdb116f90 100644
--- a/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
+++ b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
@@ -53,15 +53,12 @@ loop.latch:
 
 define void @test_simplifycompare_rhs_not_constant1() {
 ; CHECK-LABEL: define void @test_simplifycompare_rhs_not_constant1() {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[P:%.*]] = alloca i64, align 8
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[P]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 -8
-; CHECK-NEXT:    call void @use(ptr [[PTR_IV]])
-; CHECK-NEXT:    [[EC:%.*]] = icmp ult ptr [[PTR_IV_NEXT]], [[P]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    call void @use(ptr [[P]])
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;

From 0fb198e132eff36281a20698588d815c3c30f991 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 15:30:49 +0100
Subject: [PATCH 0647/1322] [X86] Remove combineShuffleOfConcatUndef fold
 (#144524)

We can now let a mixture of combineConcatVectorOps and target shuffle combining handle this instead of creating ISD::CONCAT_VECTORS nodes and hoping they will merge properly.

In the horizontal-sum.ll test changes we were creating a ISD::CONCAT_VECTORS node that was being split shortly after, but not before causing issues with HADD folding due to additional uses.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 54 -------------------------
 llvm/test/CodeGen/X86/horizontal-sum.ll | 28 ++++++-------
 2 files changed, 14 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd02d275d6b5..12fcc614ab25 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43301,51 +43301,6 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
-// We are looking for a shuffle where both sources are concatenated with undef
-// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
-// if we can express this as a single-source shuffle, that's preferable.
-static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
-                                           SelectionDAG &DAG,
-                                           const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
-  if (!VT.is128BitVector() && !VT.is256BitVector())
-    return SDValue();
-
-  if (VT.getVectorElementType() != MVT::i32 &&
-      VT.getVectorElementType() != MVT::i64 &&
-      VT.getVectorElementType() != MVT::f32 &&
-      VT.getVectorElementType() != MVT::f64)
-    return SDValue();
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  // Check that both sources are concats with undef.
-  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
-      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
-      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
-      !N1.getOperand(1).isUndef())
-    return SDValue();
-
-  // Construct the new shuffle mask. Elements from the first source retain their
-  // index, but elements from the second source no longer need to skip an undef.
-  SmallVector<int, 8> Mask;
-  int NumElts = VT.getVectorNumElements();
-
-  auto *SVOp = cast<ShuffleVectorSDNode>(N);
-  for (int Elt : SVOp->getMask())
-    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
-
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
-                               N1.getOperand(0));
-  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
-}
-
 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
 /// low half of each source vector and does not set any high half elements in
 /// the destination vector, narrow the shuffle to half its original size.
@@ -43401,15 +43356,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
           VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
     return LD;
 
-  // For AVX2, we sometimes want to combine
-  // (vector_shuffle <mask> (concat_vectors t1, undef)
-  //                        (concat_vectors t2, undef))
-  // Into:
-  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
-  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
-  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
-    return ShufConcat;
-
   if (isTargetShuffle(N->getOpcode())) {
     SDValue Op(N, 0);
     if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 0afc4f784bc5..568150cfa397 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -247,13 +247,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
 ; AVX2-SLOW-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-SLOW-NEXT:    vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -268,13 +268,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-FAST-NEXT:    vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-FAST-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -424,7 +424,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-SLOW-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -447,7 +447,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]

From 4cfe0d7f4c2c39dd90e27258aa448789f2ba4278 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski@arm.com>
Date: Tue, 17 Jun 2025 15:32:23 +0100
Subject: [PATCH 0648/1322] [flang][OpenMP] Support using copyprivate with
 fir.boxchar arguments (#144092)

Implement the lowering for passing a fir.boxchar argument to the
copyprivate clause.

Resolves https://github.com/llvm/llvm-project/issues/142123.

---------

Signed-off-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
---
 flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 50 ++++++++++++++++------
 flang/test/Lower/OpenMP/copyprivate5.f90   | 36 ++++++++++++++++
 2 files changed, 74 insertions(+), 12 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/copyprivate5.f90

diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index b5c8de8c2ce8..bc8fc14bcaeb 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -727,12 +727,15 @@ public:
   // Is the type inside a box?
   bool isBox() const { return inBox; }
 
+  bool isBoxChar() const { return inBoxChar; }
+
 private:
   void typeScan(mlir::Type type);
 
   std::optional<fir::CharacterType::LenType> charLen;
   llvm::SmallVector<int64_t> shape;
   bool inBox = false;
+  bool inBoxChar = false;
 };
 
 void TypeInfo::typeScan(mlir::Type ty) {
@@ -748,6 +751,9 @@ void TypeInfo::typeScan(mlir::Type ty) {
     typeScan(cty.getEleTy());
   } else if (auto cty = mlir::dyn_cast<fir::CharacterType>(ty)) {
     charLen = cty.getLen();
+  } else if (auto cty = mlir::dyn_cast<fir::BoxCharType>(ty)) {
+    inBoxChar = true;
+    typeScan(cty.getEleTy());
   } else if (auto hty = mlir::dyn_cast<fir::HeapType>(ty)) {
     typeScan(hty.getEleTy());
   } else if (auto pty = mlir::dyn_cast<fir::PointerType>(ty)) {
@@ -791,12 +797,6 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter,
   fir::FortranVariableFlagsAttr attrs;
   if (varAttrs != fir::FortranVariableFlagsEnum::None)
     attrs = fir::FortranVariableFlagsAttr::get(builder.getContext(), varAttrs);
-  llvm::SmallVector<mlir::Value> typeparams;
-  if (typeInfo.getCharLength().has_value()) {
-    mlir::Value charLen = builder.createIntegerConstant(
-        loc, builder.getCharacterLengthType(), *typeInfo.getCharLength());
-    typeparams.push_back(charLen);
-  }
   mlir::Value shape;
   if (!typeInfo.isBox() && !typeInfo.getShape().empty()) {
     llvm::SmallVector<mlir::Value> extents;
@@ -805,11 +805,34 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter,
           builder.createIntegerConstant(loc, builder.getIndexType(), extent));
     shape = builder.create<fir::ShapeOp>(loc, extents);
   }
+  mlir::Value dst = funcOp.getArgument(0);
+  mlir::Value src = funcOp.getArgument(1);
+  llvm::SmallVector<mlir::Value> typeparams;
+  if (typeInfo.isBoxChar()) {
+    // fir.boxchar will be passed here as fir.ref<fir.boxchar>
+    auto loadDst = builder.create<fir::LoadOp>(loc, dst);
+    auto loadSrc = builder.create<fir::LoadOp>(loc, src);
+    // get the actual fir.ref<fir.char> type
+    mlir::Type refType =
+        fir::ReferenceType::get(mlir::cast<fir::BoxCharType>(eleTy).getEleTy());
+    auto unboxedDst = builder.create<fir::UnboxCharOp>(
+        loc, refType, builder.getIndexType(), loadDst);
+    auto unboxedSrc = builder.create<fir::UnboxCharOp>(
+        loc, refType, builder.getIndexType(), loadSrc);
+    // Add length to type parameters
+    typeparams.push_back(unboxedDst.getResult(1));
+    dst = unboxedDst.getResult(0);
+    src = unboxedSrc.getResult(0);
+  } else if (typeInfo.getCharLength().has_value()) {
+    mlir::Value charLen = builder.createIntegerConstant(
+        loc, builder.getCharacterLengthType(), *typeInfo.getCharLength());
+    typeparams.push_back(charLen);
+  }
   auto declDst = builder.create<hlfir::DeclareOp>(
-      loc, funcOp.getArgument(0), copyFuncName + "_dst", shape, typeparams,
+      loc, dst, copyFuncName + "_dst", shape, typeparams,
       /*dummy_scope=*/nullptr, attrs);
   auto declSrc = builder.create<hlfir::DeclareOp>(
-      loc, funcOp.getArgument(1), copyFuncName + "_src", shape, typeparams,
+      loc, src, copyFuncName + "_src", shape, typeparams,
       /*dummy_scope=*/nullptr, attrs);
   converter.copyVar(loc, declDst.getBase(), declSrc.getBase(), varAttrs);
   builder.create<mlir::func::ReturnOp>(loc);
@@ -835,10 +858,13 @@ bool ClauseProcessor::processCopyprivate(
 
     // CopyPrivate variables must be passed by reference. However, in the case
     // of assumed shapes/vla the type is not a !fir.ref, but a !fir.box.
-    // In these cases to retrieve the appropriate !fir.ref<!fir.box<...>> to
-    // access the data we need we must perform an alloca and then store to it
-    // and retrieve the data from the new alloca.
-    if (mlir::isa<fir::BaseBoxType>(symType)) {
+    // In the case of character types, the passed in type can also be
+    // !fir.boxchar. In these cases to retrieve the appropriate
+    // !fir.ref<!fir.box<...>> or !fir.ref<!fir.boxchar<..>> to access the data
+    // we need we must perform an alloca and then store to it and retrieve the
+    // data from the new alloca.
+    if (mlir::isa<fir::BaseBoxType>(symType) ||
+        mlir::isa<fir::BoxCharType>(symType)) {
       fir::FirOpBuilder &builder = converter.getFirOpBuilder();
       auto alloca = builder.create<fir::AllocaOp>(currentLocation, symType);
       builder.create<fir::StoreOp>(currentLocation, symVal, alloca);
diff --git a/flang/test/Lower/OpenMP/copyprivate5.f90 b/flang/test/Lower/OpenMP/copyprivate5.f90
new file mode 100644
index 000000000000..c75eb82a45e9
--- /dev/null
+++ b/flang/test/Lower/OpenMP/copyprivate5.f90
@@ -0,0 +1,36 @@
+! Test lowering of COPYPRIVATE with character arguments
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! Testcase from: https://github.com/llvm/llvm-project/issues/142123
+
+! CHECK-LABEL:  func.func private @_copy_boxchar_c8xU(
+! CHECK-SAME:     %arg0: [[TYPE:!fir.ref<!fir.boxchar<1>>]],
+! CHECK-SAME:     %arg1: [[TYPE]]) attributes {llvm.linkage = #llvm.linkage<internal>} {
+! CHECK:    %[[RDST:.*]] = fir.load %arg0 : [[TYPE]]
+! CHECK:    %[[RSRC:.*]] = fir.load %arg1 : [[TYPE]]
+! CHECK:    %[[UDST:.*]]:2 = fir.unboxchar %[[RDST:.*]] : ([[UTYPE:!fir.boxchar<1>]]) -> ([[RTYPE:!fir.ref<!fir.char<1,\?>>]], [[ITYPE:index]])
+! CHECK:    %[[USRC:.*]]:2 = fir.unboxchar %[[RSRC:.*]] : ([[UTYPE]]) -> ([[RTYPE]], [[ITYPE]])
+! CHECK:    %[[DST:.*]]:2 = hlfir.declare %[[UDST:.*]]#0 typeparams %[[UDST:.*]]#1 {uniq_name = "[[NAME1:.*]]"} : ([[RTYPE]], [[ITYPE]]) -> ([[UTYPE]], [[RTYPE]])
+! CHECK:    %[[SRC:.*]]:2 = hlfir.declare %[[USRC:.*]]#0 typeparams %[[UDST:.*]]#1 {uniq_name = "[[NAME2:.*]]"} : ([[RTYPE]], [[ITYPE]]) -> ([[UTYPE]], [[RTYPE]])
+! CHECK:    hlfir.assign %[[SRC:.*]]#0 to %[[DST:.*]]#0 : [[UTYPE]], [[UTYPE]]
+! CHECK:    return
+! CHECK:  }
+
+! CHECK-LABEL: func.func @_QPs(%arg0: !fir.boxchar<1> {fir.bindc_name = "c"}) {
+! CHECK: %[[ALLOC:.*]] = fir.alloca !fir.boxchar<1>
+! CHECK: fir.store %[[SRC:.*]] to %[[ALLOC:.*]] : !fir.ref<!fir.boxchar<1>>
+! CHECK: omp.single copyprivate([[ALLOC:.*]] -> @_copy_boxchar_c8xU : !fir.ref<!fir.boxchar<1>>) {
+! CHECK:   hlfir.assign %[[NEW_VAL:.*]] to %[[SRC:.*]] : !fir.ref<!fir.char<1,3>>, !fir.boxchar<1>
+! CHECK:   omp.terminator
+! CHECK: }
+
+subroutine s(c)
+character(*) :: c
+!$omp single copyprivate(c)
+c = "bar"
+!$omp end single
+end subroutine
+
+character(len=3) :: c
+call s(c)
+end

From 549bc55cc39bb9fb22df464bcf3b7d4d4a5ff507 Mon Sep 17 00:00:00 2001
From: Davide Grohmann <davide.grohmann@arm.com>
Date: Tue, 17 Jun 2025 16:35:14 +0200
Subject: [PATCH 0649/1322] [mlir][spirv] Fix int type declaration duplication
 when serializing (#143108)

At the MLIR level unsigned integer and signless integers are different
types. Indeed when looking up the two types in type definition cache
they do not match.

Hence when translating a SPIR-V module which contains both usign and
signless integers will contain the same type declaration twice
(something like OpTypeInt 32 0) which is not permitted in SPIR-V and
such generated modules fail validation.

This patch solves the problem by mapping unisgned integer types to
singless integer types before looking up in the type definition cache.

---------

Signed-off-by: Davide Grohmann <davide.grohmann@arm.com>
---
 mlir/lib/Target/SPIRV/Serialization/Serializer.cpp | 13 +++++++++++++
 mlir/test/CMakeLists.txt                           |  6 ++++++
 mlir/test/Target/SPIRV/constant.mlir               |  5 ++++-
 mlir/test/lit.cfg.py                               |  1 +
 mlir/test/lit.local.cfg                            |  7 +++++++
 mlir/test/lit.site.cfg.py.in                       |  4 +++-
 6 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/lit.local.cfg

diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
index d258bfd85296..56c64f38fe29 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
@@ -446,6 +446,19 @@ LogicalResult Serializer::processType(Location loc, Type type,
 LogicalResult
 Serializer::processTypeImpl(Location loc, Type type, uint32_t &typeID,
                             SetVector<StringRef> &serializationCtx) {
+
+  // Map unsigned integer types to singless integer types.
+  // This is needed otherwise the generated spirv assembly will contain
+  // twice a type declaration (like OpTypeInt 32 0) which is no permitted and
+  // such module fails validation. Indeed at MLIR level the two types are
+  // different and lookup in the cache below misses.
+  // Note: This conversion needs to happen here before the type is looked up in
+  // the cache.
+  if (type.isUnsignedInteger()) {
+    type = IntegerType::get(loc->getContext(), type.getIntOrFloatBitWidth(),
+                            IntegerType::SignednessSemantics::Signless);
+  }
+
   typeID = getTypeID(type);
   if (typeID)
     return success();
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index ac8b44f53aeb..89568e7766ae 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -68,6 +68,7 @@ endif()
 llvm_canonicalize_cmake_booleans(
   LLVM_BUILD_EXAMPLES
   LLVM_HAS_NVPTX_TARGET
+  LLVM_INCLUDE_SPIRV_TOOLS_TESTS
   MLIR_ENABLE_BINDINGS_PYTHON
   MLIR_ENABLE_CUDA_RUNNER
   MLIR_ENABLE_ROCM_CONVERSIONS
@@ -217,6 +218,11 @@ if(MLIR_ENABLE_BINDINGS_PYTHON)
   )
 endif()
 
+if (LLVM_INCLUDE_SPIRV_TOOLS_TESTS)
+  list(APPEND MLIR_TEST_DEPENDS spirv-as)
+  list(APPEND MLIR_TEST_DEPENDS spirv-val)
+endif()
+
 # This target can be used to just build the dependencies
 # for the check-mlir target without executing the tests.
 # This is useful for bots when splitting the build step
diff --git a/mlir/test/Target/SPIRV/constant.mlir b/mlir/test/Target/SPIRV/constant.mlir
index 8d4e53418b70..50d9b09ee004 100644
--- a/mlir/test/Target/SPIRV/constant.mlir
+++ b/mlir/test/Target/SPIRV/constant.mlir
@@ -1,6 +1,7 @@
 // RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s
+// RUN: %if spirv-tools %{ mlir-translate -no-implicit-module -serialize-spirv %s | spirv-val %}
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical Vulkan requires #spirv.vce<v1.3, [VulkanMemoryModel, Shader, Int64, Int16, Int8, Float64, Float16, CooperativeMatrixKHR], [SPV_KHR_vulkan_memory_model, SPV_KHR_cooperative_matrix]> {
   // CHECK-LABEL: @bool_const
   spirv.func @bool_const() -> () "None" {
     // CHECK: spirv.Constant true
@@ -305,4 +306,6 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %coop = spirv.Constant dense<4> : !spirv.coopmatrix<16x16xi8, Subgroup, MatrixAcc>
     spirv.ReturnValue %coop : !spirv.coopmatrix<16x16xi8, Subgroup, MatrixAcc>
   }
+
+  spirv.EntryPoint "GLCompute" @bool_const
 }
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 9b5cadd62bef..a6f1ac0d568f 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -332,6 +332,7 @@ if config.enable_assertions:
 else:
     config.available_features.add("noasserts")
 
+config.targets = frozenset(config.targets_to_build.split())
 
 def have_host_jit_feature_support(feature_name):
     mlir_runner_exe = lit.util.which("mlir-runner", config.mlir_tools_dir)
diff --git a/mlir/test/lit.local.cfg b/mlir/test/lit.local.cfg
new file mode 100644
index 000000000000..167c454db518
--- /dev/null
+++ b/mlir/test/lit.local.cfg
@@ -0,0 +1,7 @@
+if not "SPIRV" in config.root.targets:
+    config.unsupported = True
+
+if config.spirv_tools_tests:
+    config.available_features.add("spirv-tools")
+    config.substitutions.append(("spirv-as", os.path.join(config.llvm_tools_dir, "spirv-as")))
+    config.substitutions.append(("spirv-val", os.path.join(config.llvm_tools_dir, "spirv-val")))
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index 132aabe13594..77f24e0f29b0 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -5,6 +5,8 @@ import sys
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
+config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@
+config.targets_to_build = "@TARGETS_TO_BUILD@"
 config.llvm_shlib_ext = "@SHLIBEXT@"
 config.llvm_shlib_dir = lit_config.substitute(path(r"@SHLIBDIR@"))
 config.python_executable = "@Python3_EXECUTABLE@"
@@ -41,7 +43,7 @@ config.mlir_run_amx_tests = @MLIR_RUN_AMX_TESTS@
 config.mlir_run_arm_sve_tests = @MLIR_RUN_ARM_SVE_TESTS@
 # This is a workaround for the fact that LIT's:
 #   %if <cond>
-# requires <cond> to be in the set of available features. 
+# requires <cond> to be in the set of available features.
 # TODO: Update LIT's TestRunner so that this is not required.
 if config.mlir_run_arm_sve_tests:
     config.available_features.add("mlir_arm_sve_tests")

From 7ec103a984ff114d24f26d935fe2292379269b53 Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Tue, 17 Jun 2025 15:52:33 +0100
Subject: [PATCH 0650/1322] Port #143108 to bazel (#144538)

---
 utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 53405a0dea24..a2fb5ade7324 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -37,6 +37,7 @@ expand_template(
         # All disabled, but required to substituted because they are not in quotes.
         "@LLVM_BUILD_EXAMPLES@": "0",
         "@LLVM_HAS_NVPTX_TARGET@": "0",
+        "@LLVM_INCLUDE_SPIRV_TOOLS_TESTS@": "0",
         "@MLIR_ENABLE_CUDA_RUNNER@": "0",
         "@MLIR_ENABLE_ROCM_CONVERSIONS@": "0",
         "@MLIR_ENABLE_ROCM_RUNNER@": "0",

From 9eb0020555fc643582b2802abb8c1bc92059c248 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Tue, 17 Jun 2025 15:55:14 +0100
Subject: [PATCH 0651/1322] [DebugInfo][RemoveDIs] Remove a swathe of
 debug-intrinsic code (#144389)

Seeing how we can't generate any debug intrinsics any more: delete a
variety of codepaths where they're handled. For the most part these are
plain deletions, in others I've tweaked comments to remain coherent, or
added a type to (what was) type-generic-lambdas.

This isn't all the DbgInfoIntrinsic call sites but it's most of the
simple scenarios.

Co-authored-by: Nikita Popov <github@npopov.com>
---
 .../llvm/Analysis/IRSimilarityIdentifier.h    |   4 -
 llvm/include/llvm/Analysis/PtrUseVisitor.h    |   1 -
 llvm/include/llvm/IR/InstVisitor.h            |  10 --
 llvm/include/llvm/Transforms/Utils/Local.h    |   7 +-
 llvm/lib/Analysis/AliasSetTracker.cpp         |   3 -
 llvm/lib/Analysis/CallGraph.cpp               |   5 +-
 llvm/lib/Analysis/DemandedBits.cpp            |   3 +-
 llvm/lib/Analysis/Loads.cpp                   |   2 +-
 .../lib/Analysis/MemoryDependenceAnalysis.cpp |   8 -
 llvm/lib/Analysis/ValueTracking.cpp           |   6 -
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |  14 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   5 +-
 .../CodeGen/SelectionDAG/SelectionDAGISel.cpp |   1 -
 llvm/lib/IR/DebugInfo.cpp                     |   5 -
 .../Target/AArch64/AArch64StackTagging.cpp    |   3 +-
 .../Hexagon/HexagonLoopIdiomRecognition.cpp   |   2 +-
 llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp   |   5 +-
 .../AggressiveInstCombine.cpp                 |   4 +-
 llvm/lib/Transforms/IPO/IROutliner.cpp        |  11 --
 .../lib/Transforms/IPO/SampleProfileProbe.cpp |   3 +-
 .../InstCombine/InstructionCombining.cpp      |  12 +-
 .../Instrumentation/GCOVProfiling.cpp         |   8 -
 .../Instrumentation/ThreadSanitizer.cpp       |   3 +-
 llvm/lib/Transforms/Scalar/ADCE.cpp           |  15 +-
 llvm/lib/Transforms/Scalar/GVN.cpp            |   7 +-
 llvm/lib/Transforms/Scalar/GVNHoist.cpp       |   3 +-
 llvm/lib/Transforms/Scalar/LICM.cpp           |   4 -
 .../Transforms/Scalar/LoopStrengthReduce.cpp  |   6 +-
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp |   2 +-
 .../Scalar/SpeculativeExecution.cpp           |   8 +-
 .../Scalar/TailRecursionElimination.cpp       |  17 +-
 llvm/lib/Transforms/Utils/CodeExtractor.cpp   |  51 +-----
 llvm/lib/Transforms/Utils/Debugify.cpp        |  16 +-
 llvm/lib/Transforms/Utils/Evaluator.cpp       |   7 -
 llvm/lib/Transforms/Utils/InlineFunction.cpp  |   9 +-
 llvm/lib/Transforms/Utils/Local.cpp           |  11 +-
 .../Transforms/Utils/LoopRotationUtils.cpp    |  44 ++---
 .../Utils/ScalarEvolutionExpander.cpp         |  15 +-
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 159 ++++++------------
 .../Vectorize/LoopVectorizationLegality.cpp   |   2 -
 .../Transforms/Vectorize/SLPVectorizer.cpp    |   3 -
 41 files changed, 104 insertions(+), 400 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h
index dfda2dcee0db..09a8875e1e28 100644
--- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h
+++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h
@@ -545,10 +545,6 @@ struct IRInstructionMapper {
     // dependent.
     InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
     InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
-    // DebugInfo should be included in the regions, but should not be
-    // analyzed for similarity as it has no bearing on the outcome of the
-    // program.
-    InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
     InstrType visitIntrinsicInst(IntrinsicInst &II) {
       // These are disabled due to complications in the CodeExtractor when
       // outlining these instructions.  For instance, It is unclear what we
diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index c9d3874e7dd9..0858d8aee218 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -285,7 +285,6 @@ protected:
 
   // No-op intrinsics which we know don't escape the pointer to logic in
   // some other function.
-  void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) {}
   void visitMemIntrinsic(MemIntrinsic &I) {}
   void visitIntrinsicInst(IntrinsicInst &II) {
     switch (II.getIntrinsicID()) {
diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h
index b4eb729c7ce3..6d5398bb7a4c 100644
--- a/llvm/include/llvm/IR/InstVisitor.h
+++ b/llvm/include/llvm/IR/InstVisitor.h
@@ -199,13 +199,6 @@ public:
   RetTy visitCatchPadInst(CatchPadInst &I)     { DELEGATE(FuncletPadInst); }
   RetTy visitFreezeInst(FreezeInst &I)         { DELEGATE(Instruction); }
 
-  // Handle the special intrinsic instruction classes.
-  RetTy visitDbgDeclareInst(DbgDeclareInst &I)    { DELEGATE(DbgVariableIntrinsic);}
-  RetTy visitDbgValueInst(DbgValueInst &I)        { DELEGATE(DbgVariableIntrinsic);}
-  RetTy visitDbgVariableIntrinsic(DbgVariableIntrinsic &I)
-                                                  { DELEGATE(DbgInfoIntrinsic);}
-  RetTy visitDbgLabelInst(DbgLabelInst &I)        { DELEGATE(DbgInfoIntrinsic);}
-  RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); }
   RetTy visitMemSetInst(MemSetInst &I)            { DELEGATE(MemIntrinsic); }
   RetTy visitMemSetPatternInst(MemSetPatternInst &I) {
     DELEGATE(IntrinsicInst);
@@ -286,9 +279,6 @@ private:
     if (const Function *F = I.getCalledFunction()) {
       switch (F->getIntrinsicID()) {
       default:                     DELEGATE(IntrinsicInst);
-      case Intrinsic::dbg_declare: DELEGATE(DbgDeclareInst);
-      case Intrinsic::dbg_value:   DELEGATE(DbgValueInst);
-      case Intrinsic::dbg_label:   DELEGATE(DbgLabelInst);
       case Intrinsic::memcpy:
       case Intrinsic::memcpy_inline:
         DELEGATE(MemCpyInst);
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 55e153f28959..df146458b4e6 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -394,12 +394,9 @@ handleUnreachableTerminator(Instruction *I,
                             SmallVectorImpl<Value *> &PoisonedValues);
 
 /// Remove all instructions from a basic block other than its terminator
-/// and any present EH pad instructions. Returns a pair where the first element
-/// is the number of instructions (excluding debug info intrinsics) that have
-/// been removed, and the second element is the number of debug info intrinsics
+/// and any present EH pad instructions. Returns the number of instructions
 /// that have been removed.
-LLVM_ABI std::pair<unsigned, unsigned>
-removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB);
+LLVM_ABI unsigned removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB);
 
 /// Insert an unreachable instruction before the specified
 /// instruction, making it and the rest of the code in the block dead.
diff --git a/llvm/lib/Analysis/AliasSetTracker.cpp b/llvm/lib/Analysis/AliasSetTracker.cpp
index 6d1dafbae60b..1e2f05b60a9a 100644
--- a/llvm/lib/Analysis/AliasSetTracker.cpp
+++ b/llvm/lib/Analysis/AliasSetTracker.cpp
@@ -343,9 +343,6 @@ void AliasSetTracker::add(AnyMemTransferInst *MTI) {
 }
 
 void AliasSetTracker::addUnknown(Instruction *Inst) {
-  if (isa<DbgInfoIntrinsic>(Inst))
-    return; // Ignore DbgInfo Intrinsics.
-
   if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
     // These intrinsics will show up as affecting memory, but they are just
     // markers.
diff --git a/llvm/lib/Analysis/CallGraph.cpp b/llvm/lib/Analysis/CallGraph.cpp
index 5d1af52e8ab5..d7695e5cfc0d 100644
--- a/llvm/lib/Analysis/CallGraph.cpp
+++ b/llvm/lib/Analysis/CallGraph.cpp
@@ -34,8 +34,7 @@ CallGraph::CallGraph(Module &M)
       CallsExternalNode(std::make_unique<CallGraphNode>(this, nullptr)) {
   // Add every interesting function to the call graph.
   for (Function &F : M)
-    if (!isDbgInfoIntrinsic(F.getIntrinsicID()))
-      addToCallGraph(&F);
+    addToCallGraph(&F);
 }
 
 CallGraph::CallGraph(CallGraph &&Arg)
@@ -101,7 +100,7 @@ void CallGraph::populateCallGraphNode(CallGraphNode *Node) {
         const Function *Callee = Call->getCalledFunction();
         if (!Callee)
           Node->addCalledFunction(Call, CallsExternalNode.get());
-        else if (!isDbgInfoIntrinsic(Callee->getIntrinsicID()))
+        else
           Node->addCalledFunction(Call, getOrInsertFunction(Callee));
 
         // Add reference to callback functions.
diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp
index d7e2a3fa4fc5..6694d5cc06c8 100644
--- a/llvm/lib/Analysis/DemandedBits.cpp
+++ b/llvm/lib/Analysis/DemandedBits.cpp
@@ -46,8 +46,7 @@ using namespace llvm::PatternMatch;
 #define DEBUG_TYPE "demanded-bits"
 
 static bool isAlwaysLive(Instruction *I) {
-  return I->isTerminator() || isa<DbgInfoIntrinsic>(I) || I->isEHPad() ||
-         I->mayHaveSideEffects();
+  return I->isTerminator() || I->isEHPad() || I->mayHaveSideEffects();
 }
 
 void DemandedBits::determineLiveOperandBits(
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 425f3682122c..71a75b496455 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -434,7 +434,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &S
     // If we see a free or a call which may write to memory (i.e. which might do
     // a free) the pointer could be marked invalid.
     if (isa<CallInst>(BBI) && BBI->mayWriteToMemory() &&
-        !isa<LifetimeIntrinsic>(BBI) && !isa<DbgInfoIntrinsic>(BBI))
+        !isa<LifetimeIntrinsic>(BBI))
       return false;
 
     Value *AccessedPtr;
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index f062189bac6a..d6f490cb69a5 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -188,9 +188,6 @@ MemDepResult MemoryDependenceResults::getCallDependencyFrom(
   // Walk backwards through the block, looking for dependencies.
   while (ScanIt != BB->begin()) {
     Instruction *Inst = &*--ScanIt;
-    // Debug intrinsics don't cause dependences and should not affect Limit
-    if (isa<DbgInfoIntrinsic>(Inst))
-      continue;
 
     // Limit the amount of scanning we do so we don't end up with quadratic
     // running time on extreme testcases.
@@ -432,11 +429,6 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
   while (ScanIt != BB->begin()) {
     Instruction *Inst = &*--ScanIt;
 
-    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
-      // Debug intrinsics don't (and can't) cause dependencies.
-      if (isa<DbgInfoIntrinsic>(II))
-        continue;
-
     // Limit the amount of scanning we do so we don't end up with quadratic
     // running time on extreme testcases.
     --*Limit;
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 9df667926faf..a17417cb5189 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7846,8 +7846,6 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(
    iterator_range<BasicBlock::const_iterator> Range, unsigned ScanLimit) {
   assert(ScanLimit && "scan limit must be non-zero");
   for (const Instruction &I : Range) {
-    if (isa<DbgInfoIntrinsic>(I))
-        continue;
     if (--ScanLimit == 0)
       return false;
     if (!isGuaranteedToTransferExecutionToSuccessor(&I))
@@ -8050,8 +8048,6 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
     // well-defined operands.
 
     for (const auto &I : make_range(Begin, End)) {
-      if (isa<DbgInfoIntrinsic>(I))
-        continue;
       if (--ScanLimit == 0)
         break;
 
@@ -8076,8 +8072,6 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
 
   while (true) {
     for (const auto &I : make_range(Begin, End)) {
-      if (isa<DbgInfoIntrinsic>(I))
-        continue;
       if (--ScanLimit == 0)
         return false;
       if (mustTriggerUB(&I, YieldsPoison))
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 3792b456c836..43574a54c37d 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -896,12 +896,7 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
   BasicBlock::iterator BBI = BI->getIterator();
   if (BBI != BB->begin()) {
     --BBI;
-    while (isa<DbgInfoIntrinsic>(BBI)) {
-      if (BBI == BB->begin())
-        break;
-      --BBI;
-    }
-    if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
+    if (!isa<PHINode>(BBI))
       return nullptr;
   }
 
@@ -2981,10 +2976,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
   // Make sure there are no instructions between the first instruction
   // and return.
   BasicBlock::const_iterator BI = BB->getFirstNonPHIIt();
-  // Skip over debug and the bitcast.
-  while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI ||
-         isa<PseudoProbeInst>(BI) || isLifetimeEndOrBitCastFor(&*BI) ||
-         isFakeUse(&*BI))
+  // Skip over pseudo-probes and the bitcast.
+  while (&*BI == BCI || &*BI == EVI || isa<PseudoProbeInst>(BI) ||
+         isLifetimeEndOrBitCastFor(&*BI) || isFakeUse(&*BI))
     BI = std::next(BI);
   if (&*BI != RetI)
     return false;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 4f548cbad5c3..ec0c5473b0db 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1320,10 +1320,7 @@ void SelectionDAGBuilder::visit(const Instruction &I) {
     HandlePHINodesInSuccessorBlocks(I.getParent());
   }
 
-  // Increase the SDNodeOrder if dealing with a non-debug instruction.
-  if (!isa<DbgInfoIntrinsic>(I))
-    ++SDNodeOrder;
-
+  ++SDNodeOrder;
   CurInst = &I;
 
   // Set inserted listener only if required.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index b02a03c0b0cb..ac6d25f141ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1507,7 +1507,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I,
                                       const FunctionLoweringInfo &FuncInfo) {
   return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded.
          !I->isTerminator() &&     // Terminators aren't folded.
-         !isa<DbgInfoIntrinsic>(I) && // Debug instructions aren't folded.
          !I->isEHPad() &&             // EH pad instructions aren't folded.
          !FuncInfo.isExportedInst(I); // Exported instrs must be computed.
 }
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index e6b1f76dfacf..196fe294a274 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -586,11 +586,6 @@ bool llvm::stripDebugInfo(Function &F) {
   DenseMap<MDNode *, MDNode *> LoopIDsMap;
   for (BasicBlock &BB : F) {
     for (Instruction &I : llvm::make_early_inc_range(BB)) {
-      if (isa<DbgInfoIntrinsic>(&I)) {
-        I.eraseFromParent();
-        Changed = true;
-        continue;
-      }
       if (I.getDebugLoc()) {
         Changed = true;
         I.setDebugLoc(DebugLoc());
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 0c0b512e3b6c..75c7dd944b46 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -369,8 +369,7 @@ Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst,
 
   unsigned Count = 0;
   for (; Count < ClScanLimit && !BI->isTerminator(); ++BI) {
-    if (!isa<DbgInfoIntrinsic>(*BI))
-      ++Count;
+    ++Count;
 
     if (isNoModRef(AA->getModRefInfo(&*BI, AllocaLoc)))
       continue;
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 9604f252dd3d..c2eb24b482d4 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -2318,7 +2318,7 @@ bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
   // instructions in it that are not involved in the original set Insts.
   for (auto *B : L->blocks()) {
     for (auto &In : *B) {
-      if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
+      if (isa<BranchInst>(In))
         continue;
       if (!Worklist.count(&In) && In.mayHaveSideEffects())
         return false;
diff --git a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index 47bb20f4aa07..d0a5be8b2e23 100644
--- a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -117,7 +117,7 @@ class PPCBoolRetToInt : public FunctionPass {
 
   // A PHINode is Promotable if:
   // 1. Its type is i1 AND
-  // 2. All of its uses are ReturnInt, CallInst, PHINode, or DbgInfoIntrinsic
+  // 2. All of its uses are ReturnInt, CallInst, or PHINode
   // AND
   // 3. All of its operands are Constant or Argument or
   //    CallInst or PHINode AND
@@ -136,8 +136,7 @@ class PPCBoolRetToInt : public FunctionPass {
     for (const PHINode *P : Promotable) {
       // Condition 2 and 3
       auto IsValidUser = [] (const Value *V) -> bool {
-        return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) ||
-        isa<DbgInfoIntrinsic>(V);
+        return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V);
       };
       auto IsValidOperand = [] (const Value *V) -> bool {
         return isa<Constant>(V) || isa<Argument>(V) || isa<CallInst>(V) ||
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index f62361d33470..8c156c93ba8d 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -719,9 +719,7 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
     if (Inst.mayWriteToMemory() && isModSet(AA.getModRefInfo(&Inst, Loc)))
       return false;
 
-    // Ignore debug info so that's not counted against MaxInstrsToScan.
-    // Otherwise debug info could affect codegen.
-    if (!isa<DbgInfoIntrinsic>(Inst) && ++NumScanned > MaxInstrsToScan)
+    if (++NumScanned > MaxInstrsToScan)
       return false;
   }
 
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index cb18b55ae218..2c17863266a9 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -717,8 +717,6 @@ static void moveFunctionData(Function &Old, Function &New,
     if (ReturnInst *RI = dyn_cast<ReturnInst>(I))
       NewEnds.insert(std::make_pair(RI->getReturnValue(), &CurrBB));
 
-    std::vector<Instruction *> DebugInsts;
-
     for (Instruction &Val : CurrBB) {
       // Since debug-info originates from many different locations in the
       // program, it will cause incorrect reporting from a debugger if we keep
@@ -749,21 +747,12 @@ static void moveFunctionData(Function &Old, Function &New,
       // From this point we are only handling call instructions.
       CallInst *CI = cast<CallInst>(&Val);
 
-      // Collect debug intrinsics for later removal.
-      if (isa<DbgInfoIntrinsic>(CI)) {
-        DebugInsts.push_back(&Val);
-        continue;
-      }
-
       // Edit the scope of called functions inside of outlined functions.
       if (DISubprogram *SP = New.getSubprogram()) {
         DILocation *DI = DILocation::get(New.getContext(), 0, 0, SP);
         Val.setDebugLoc(DI);
       }
     }
-
-    for (Instruction *I : DebugInsts)
-      I->eraseFromParent();
   }
 }
 
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index dda3d5a78815..7fd7d4d4f750 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -385,8 +385,7 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) {
     // line number. Real instructions generated by optimizations may not come
     // with a line number either.
     auto HasValidDbgLine = [](Instruction *J) {
-      return !isa<PHINode>(J) && !isa<DbgInfoIntrinsic>(J) &&
-             !J->isLifetimeStartOrEnd() && J->getDebugLoc();
+      return !isa<PHINode>(J) && !J->isLifetimeStartOrEnd() && J->getDebugLoc();
     };
 
     Instruction *J = &*BB->getFirstInsertionPt();
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 4fe900e9421f..e2cd2a59fab9 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -4787,11 +4787,7 @@ bool InstCombinerImpl::freezeOtherUses(FreezeInst &FI) {
     MoveBefore = *MoveBeforeOpt;
   }
 
-  // Don't move to the position of a debug intrinsic.
-  if (isa<DbgInfoIntrinsic>(MoveBefore))
-    MoveBefore = MoveBefore->getNextNonDebugInstruction()->getIterator();
-  // Re-point iterator to come after any debug-info records, if we're
-  // running in "RemoveDIs" mode
+  // Re-point iterator to come after any debug-info records.
   MoveBefore.setHeadBit(false);
 
   bool Changed = false;
@@ -5582,11 +5578,9 @@ bool InstCombinerImpl::prepareWorklist(Function &F) {
       continue;
 
     unsigned NumDeadInstInBB;
-    unsigned NumDeadDbgInstInBB;
-    std::tie(NumDeadInstInBB, NumDeadDbgInstInBB) =
-        removeAllNonTerminatorAndEHPadInstructions(&BB);
+    NumDeadInstInBB = removeAllNonTerminatorAndEHPadInstructions(&BB);
 
-    MadeIRChange |= NumDeadInstInBB + NumDeadDbgInstInBB > 0;
+    MadeIRChange |= NumDeadInstInBB != 0;
     NumDeadInst += NumDeadInstInBB;
   }
 
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 9351a42581ba..3dfb36f4f181 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -583,10 +583,6 @@ static bool functionHasLines(const Function &F, unsigned &EndLine) {
   EndLine = 0;
   for (const auto &BB : F) {
     for (const auto &I : BB) {
-      // Debug intrinsic locations correspond to the location of the
-      // declaration, not necessarily any statements or expressions.
-      if (isa<DbgInfoIntrinsic>(&I)) continue;
-
       const DebugLoc &Loc = I.getDebugLoc();
       if (!Loc)
         continue;
@@ -874,10 +870,6 @@ bool GCOVProfiler::emitProfileNotes(
         }
 
         for (const auto &I : BB) {
-          // Debug intrinsic locations correspond to the location of the
-          // declaration, not necessarily any statements or expressions.
-          if (isa<DbgInfoIntrinsic>(&I)) continue;
-
           const DebugLoc &Loc = I.getDebugLoc();
           if (!Loc)
             continue;
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index ec9f78edfeb1..8ae6f7745a9e 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -527,8 +527,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
         AtomicAccesses.push_back(&Inst);
       else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
         LocalLoadsAndStores.push_back(&Inst);
-      else if ((isa<CallInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) ||
-               isa<InvokeInst>(Inst)) {
+      else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
         if (CallInst *CI = dyn_cast<CallInst>(&Inst))
           maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
         if (isa<MemIntrinsic>(Inst))
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index ea907af96edd..985b9c0e5312 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -562,20 +562,7 @@ ADCEChanged AggressiveDeadCodeElimination::removeDeadInstructions() {
     if (isLive(&I))
       continue;
 
-    if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
-      // Avoid removing a dbg.assign that is linked to instructions because it
-      // holds information about an existing store.
-      if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(DII))
-        if (!at::getAssignmentInsts(DAI).empty())
-          continue;
-      // Check if the scope of this variable location is alive.
-      if (AliveScopes.count(DII->getDebugLoc()->getScope()))
-        continue;
-
-      // Fallthrough and drop the intrinsic.
-    } else {
-      Changed.ChangedNonDebugInstr = true;
-    }
+    Changed.ChangedNonDebugInstr = true;
 
     // Prepare to delete.
     Worklist.push_back(&I);
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index c580dd4ff230..d9d05c3e8cc4 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2684,10 +2684,6 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
 /// When calculating availability, handle an instruction
 /// by inserting it into the appropriate sets.
 bool GVNPass::processInstruction(Instruction *I) {
-  // Ignore dbg info intrinsics.
-  if (isa<DbgInfoIntrinsic>(I))
-    return false;
-
   // If the instruction can be easily simplified then do so now in preference
   // to value numbering it.  Value numbering often exposes redundancies, for
   // example if it determines that %y is equal to %x then the instruction
@@ -2974,8 +2970,7 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
 bool GVNPass::performScalarPRE(Instruction *CurInst) {
   if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
       isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
-      CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
-      isa<DbgInfoIntrinsic>(CurInst))
+      CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects())
     return false;
 
   // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index 1c2e1531e47d..0acbaf58a8f7 100644
--- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -1166,8 +1166,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
         SI.insert(Store, VN);
       else if (auto *Call = dyn_cast<CallInst>(&I1)) {
         if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) {
-          if (isa<DbgInfoIntrinsic>(Intr) ||
-              Intr->getIntrinsicID() == Intrinsic::assume ||
+          if (Intr->getIntrinsicID() == Intrinsic::assume ||
               Intr->getIntrinsicID() == Intrinsic::sideeffect)
             continue;
         }
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index bd59caa6a959..abb6ff1dcfe6 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1204,10 +1204,6 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
 
     return !Invalidated;
   } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
-    // Don't sink or hoist dbg info; it's legal, but not useful.
-    if (isa<DbgInfoIntrinsic>(I))
-      return false;
-
     // Don't sink calls which can throw.
     if (CI->mayThrow())
       return false;
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index e4f35e4b2108..4ba69034d644 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -5613,8 +5613,7 @@ BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
     }
   }
 
-  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
-         && !isa<DbgInfoIntrinsic>(LowestIP) &&
+  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
          "Insertion point must be a normal instruction");
 
   // Then, climb up the immediate dominator tree as far as we can go while
@@ -5627,9 +5626,6 @@ BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
   // Ignore landingpad instructions.
   while (IP->isEHPad()) ++IP;
 
-  // Ignore debug intrinsics.
-  while (isa<DbgInfoIntrinsic>(IP)) ++IP;
-
   // Set IP below instructions recently inserted by SCEVExpander. This keeps the
   // IP consistent across expansions and allows the previously inserted
   // instructions to be reused by subsequent expansion.
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index afa7abfea419..a22d84dcf014 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -551,7 +551,7 @@ static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
       for (Instruction &I : *BB) {
         // These won't get into the final code - don't even try calculating the
         // cost for them.
-        if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
+        if (EphValues.count(&I))
           continue;
 
         // Track this instruction's expected baseline cost when executing the
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index cb202f5f71b9..f053e202655b 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -296,10 +296,6 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
   };
   auto AllPrecedingUsesFromBlockHoisted =
       [&HasNoUnhoistedInstr](const User *U) {
-        // Do not hoist any debug info intrinsics.
-        if (isa<DbgInfoIntrinsic>(U))
-          return false;
-
         return HasNoUnhoistedInstr(U->operand_values());
       };
 
@@ -313,9 +309,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
       if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
         return false;  // too much to hoist
     } else {
-      // Debug info intrinsics should not be counted for threshold.
-      if (!isa<DbgInfoIntrinsic>(I))
-        NotHoistedInstCount++;
+      NotHoistedInstCount++;
       if (NotHoistedInstCount > SpecExecMaxNotHoisted)
         return false; // too much left behind
       NotHoisted.insert(&I);
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index c71c5a70a12f..e7d989a43840 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -239,8 +239,7 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
       // A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is
       // considered accessing memory and will be marked as a tail call if we
       // don't bail out here.
-      if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
-          isa<PseudoProbeInst>(&I))
+      if (!CI || CI->isTailCall() || isa<PseudoProbeInst>(&I))
         continue;
 
       // Bail out for intrinsic stackrestore call because it can modify
@@ -335,9 +334,6 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
 /// instructions between the call and this instruction are movable.
 ///
 static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
-  if (isa<DbgInfoIntrinsic>(I))
-    return true;
-
   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
     if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
         llvm::findAllocaForValue(II->getArgOperand(1)))
@@ -396,12 +392,6 @@ static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
   return true;
 }
 
-static Instruction *firstNonDbg(BasicBlock::iterator I) {
-  while (isa<DbgInfoIntrinsic>(I))
-    ++I;
-  return &*I;
-}
-
 namespace {
 class TailRecursionEliminator {
   Function &F;
@@ -493,9 +483,8 @@ CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) {
   //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
   // and disable this xform in this case, because the code generator will
   // lower the call to fabs into inline code.
-  if (BB == &F.getEntryBlock() &&
-      firstNonDbg(BB->front().getIterator()) == CI &&
-      firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+  if (BB == &F.getEntryBlock() && &BB->front() == CI &&
+      &*std::next(BB->begin()) == TI && CI->getCalledFunction() &&
       !TTI->isLoweredToCall(CI->getCalledFunction())) {
     // A single-block function with just a call and a return. Check that
     // the arguments match.
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 1210bdf4a1c9..9883974c55e3 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -471,10 +471,6 @@ CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC,
         Info.LifeEnd = IntrInst;
         continue;
       }
-      // At this point, permit debug uses outside of the region.
-      // This is fixed in a later call to fixupDebugInfoPostExtraction().
-      if (isa<DbgInfoIntrinsic>(IntrInst))
-        continue;
     }
     // Find untracked uses of the address, bail.
     if (!definedInRegion(Blocks, U))
@@ -1077,10 +1073,6 @@ static void applyFirstDebugLoc(Function *oldFunction,
       return any_of(*BB, [&BranchI](const Instruction &I) {
         if (!I.getDebugLoc())
           return false;
-        // Don't use source locations attached to debug-intrinsics: they could
-        // be from completely unrelated scopes.
-        if (isa<DbgInfoIntrinsic>(I))
-          return false;
         BranchI->setDebugLoc(I.getDebugLoc());
         return true;
       });
@@ -1329,7 +1321,6 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
   //  2) They need to point to fresh metadata, e.g. because they currently
   //     point to a variable in the wrong scope.
   SmallDenseMap<DINode *, DINode *> RemappedMetadata;
-  SmallVector<Instruction *, 4> DebugIntrinsicsToDelete;
   SmallVector<DbgVariableRecord *, 4> DVRsToDelete;
   DenseMap<const MDNode *, MDNode *> Cache;
 
@@ -1370,55 +1361,29 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
       }
 
       DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
-      // Apply the two updates that dbg.values get: invalid operands, and
-      // variable metadata fixup.
+      // If any of the used locations are invalid, delete the record.
       if (any_of(DVR.location_ops(), IsInvalidLocation)) {
         DVRsToDelete.push_back(&DVR);
         continue;
       }
+
+      // DbgAssign intrinsics have an extra Value argument:
       if (DVR.isDbgAssign() && IsInvalidLocation(DVR.getAddress())) {
         DVRsToDelete.push_back(&DVR);
         continue;
       }
+
+      // If the variable was in the scope of the old function, i.e. it was not
+      // inlined, point the intrinsic to a fresh variable within the new
+      // function.
       if (!DVR.getDebugLoc().getInlinedAt())
         DVR.setVariable(GetUpdatedDIVariable(DVR.getVariable()));
     }
   };
 
-  for (Instruction &I : instructions(NewFunc)) {
+  for (Instruction &I : instructions(NewFunc))
     UpdateDbgRecordsOnInst(I);
 
-    auto *DII = dyn_cast<DbgInfoIntrinsic>(&I);
-    if (!DII)
-      continue;
-
-    // Point the intrinsic to a fresh label within the new function if the
-    // intrinsic was not inlined from some other function.
-    if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) {
-      UpdateDbgLabel(DLI);
-      continue;
-    }
-
-    auto *DVI = cast<DbgVariableIntrinsic>(DII);
-    // If any of the used locations are invalid, delete the intrinsic.
-    if (any_of(DVI->location_ops(), IsInvalidLocation)) {
-      DebugIntrinsicsToDelete.push_back(DVI);
-      continue;
-    }
-    // DbgAssign intrinsics have an extra Value argument:
-    if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI);
-        DAI && IsInvalidLocation(DAI->getAddress())) {
-      DebugIntrinsicsToDelete.push_back(DVI);
-      continue;
-    }
-    // If the variable was in the scope of the old function, i.e. it was not
-    // inlined, point the intrinsic to a fresh variable within the new function.
-    if (!DVI->getDebugLoc().getInlinedAt())
-      DVI->setVariable(GetUpdatedDIVariable(DVI->getVariable()));
-  }
-
-  for (auto *DII : DebugIntrinsicsToDelete)
-    DII->eraseFromParent();
   for (auto *DVR : DVRsToDelete)
     DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
   DIB.finalizeSubprogram(NewSP);
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index ff8a91bc7e7d..c2dbdc57eb3b 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -353,7 +353,7 @@ bool llvm::collectDebugInfoMetadata(Module &M,
 
         // Cllect dbg.values and dbg.declare.
         if (DebugifyLevel > Level::Locations) {
-          auto HandleDbgVariable = [&](auto *DbgVar) {
+          auto HandleDbgVariable = [&](DbgVariableRecord *DbgVar) {
             if (!SP)
               return;
             // Skip inlined variables.
@@ -368,14 +368,8 @@ bool llvm::collectDebugInfoMetadata(Module &M,
           };
           for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
             HandleDbgVariable(&DVR);
-          if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
-            HandleDbgVariable(DVI);
         }
 
-        // Skip debug instructions other than dbg.value and dbg.declare.
-        if (isa<DbgInfoIntrinsic>(&I))
-          continue;
-
         LLVM_DEBUG(dbgs() << "  Collecting info for inst: " << I << '\n');
         DebugInfoBeforePass.InstToDelete.insert({&I, &I});
 
@@ -597,7 +591,7 @@ bool llvm::checkDebugInfoMetadata(Module &M,
 
         // Collect dbg.values and dbg.declares.
         if (DebugifyLevel > Level::Locations) {
-          auto HandleDbgVariable = [&](auto *DbgVar) {
+          auto HandleDbgVariable = [&](DbgVariableRecord *DbgVar) {
             if (!SP)
               return;
             // Skip inlined variables.
@@ -612,14 +606,8 @@ bool llvm::checkDebugInfoMetadata(Module &M,
           };
           for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
             HandleDbgVariable(&DVR);
-          if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
-            HandleDbgVariable(DVI);
         }
 
-        // Skip debug instructions other than dbg.value and dbg.declare.
-        if (isa<DbgInfoIntrinsic>(&I))
-          continue;
-
         LLVM_DEBUG(dbgs() << "  Collecting info for inst: " << I << '\n');
 
         DebugInfoAfterPass.DILocations.insert({&I, hasLoc(I)});
diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index d1db2ee29f3a..3a5c7a3b1738 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -353,13 +353,6 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
     } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
       CallBase &CB = *cast<CallBase>(&*CurInst);
 
-      // Debug info can safely be ignored here.
-      if (isa<DbgInfoIntrinsic>(CB)) {
-        LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");
-        ++CurInst;
-        continue;
-      }
-
       // Cannot handle inline asm.
       if (CB.isInlineAsm()) {
         LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index f47c467d1514..7df5e9958182 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1927,16 +1927,11 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
       }
     }
 
-    // Remove debug info intrinsics if we're not keeping inline info.
+    // Remove debug info records if we're not keeping inline info.
     if (NoInlineLineTables) {
       BasicBlock::iterator BI = FI->begin();
       while (BI != FI->end()) {
-        if (isa<DbgInfoIntrinsic>(BI)) {
-          BI = BI->eraseFromParent();
-          continue;
-        } else {
-          BI->dropDbgRecords();
-        }
+        BI->dropDbgRecords();
         ++BI;
       }
     }
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 33143700f560..f5208d50c6aa 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2848,10 +2848,8 @@ bool llvm::handleUnreachableTerminator(
   return Changed;
 }
 
-std::pair<unsigned, unsigned>
-llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
+unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
   unsigned NumDeadInst = 0;
-  unsigned NumDeadDbgInst = 0;
   // Delete the instructions backwards, as it has a reduced likelihood of
   // having to update as many def-use and use-def chains.
   Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
@@ -2870,15 +2868,12 @@ llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
       EndInst = Inst;
       continue;
     }
-    if (isa<DbgInfoIntrinsic>(Inst))
-      ++NumDeadDbgInst;
-    else
-      ++NumDeadInst;
+    ++NumDeadInst;
     // RemoveDIs: erasing debug-info must be done manually.
     Inst->dropDbgRecords();
     Inst->eraseFromParent();
   }
-  return {NumDeadInst, NumDeadDbgInst};
+  return NumDeadInst;
 }
 
 unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA,
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 6b42503b2e01..66d0573e83f6 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -547,36 +547,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     // possible or create a clone in the OldPreHeader if not.
     Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
 
-    // Record all debug intrinsics preceding LoopEntryBranch to avoid
+    // Record all debug records preceding LoopEntryBranch to avoid
     // duplication.
-    using DbgIntrinsicHash =
+    using DbgHash =
         std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>;
-    auto makeHash = [](auto *D) -> DbgIntrinsicHash {
+    auto makeHash = [](const DbgVariableRecord *D) -> DbgHash {
       auto VarLocOps = D->location_ops();
       return {{hash_combine_range(VarLocOps), D->getVariable()},
               D->getExpression()};
     };
 
-    SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
-    for (Instruction &I : llvm::drop_begin(llvm::reverse(*OrigPreheader))) {
-      if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
-        DbgIntrinsics.insert(makeHash(DII));
-        // Until RemoveDIs supports dbg.declares in DbgVariableRecord format,
-        // we'll need to collect DbgVariableRecords attached to any other debug
-        // intrinsics.
-        for (const DbgVariableRecord &DVR :
-             filterDbgVars(DII->getDbgRecordRange()))
-          DbgIntrinsics.insert(makeHash(&DVR));
-      } else {
-        break;
-      }
-    }
-
+    SmallDenseSet<DbgHash, 8> DbgRecords;
     // Build DbgVariableRecord hashes for DbgVariableRecords attached to the
-    // terminator, which isn't considered in the loop above.
+    // terminator.
     for (const DbgVariableRecord &DVR :
          filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))
-      DbgIntrinsics.insert(makeHash(&DVR));
+      DbgRecords.insert(makeHash(&DVR));
 
     // Remember the local noalias scope declarations in the header. After the
     // rotation, they must be duplicated and the scope must be cloned. This
@@ -623,7 +609,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       // memory (without proving that the loop doesn't write).
       if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
           !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
-          !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst) &&
+          !isa<AllocaInst>(Inst) &&
           // It is not safe to hoist the value of these instructions in
           // coroutines, as the addresses of otherwise eligible variables (e.g.
           // thread-local variables and errno) may change if the coroutine is
@@ -642,7 +628,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
           // Erase anything we've seen before.
           for (DbgVariableRecord &DVR :
                make_early_inc_range(filterDbgVars(DbgValueRange)))
-            if (DbgIntrinsics.count(makeHash(&DVR)))
+            if (DbgRecords.count(makeHash(&DVR)))
               DVR.eraseFromParent();
         }
 
@@ -671,7 +657,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
         // Erase anything we've seen before.
         for (DbgVariableRecord &DVR :
              make_early_inc_range(filterDbgVars(Range)))
-          if (DbgIntrinsics.count(makeHash(&DVR)))
+          if (DbgRecords.count(makeHash(&DVR)))
             DVR.eraseFromParent();
       }
 
@@ -679,13 +665,6 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       RemapInstruction(C, ValueMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
 
-      // Avoid inserting the same intrinsic twice.
-      if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
-        if (DbgIntrinsics.count(makeHash(DII))) {
-          C->eraseFromParent();
-          continue;
-        }
-
       // With the operands remapped, see if the instruction constant folds or is
       // otherwise simplifyable.  This commonly occurs because the entry from PHI
       // nodes allows icmps and other instructions to fold.
@@ -806,7 +785,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,
                                     &InsertedPHIs);
 
-    // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+    // Attach debug records to the new phis if that phi uses a value that
     // previously had debug metadata attached. This keeps the debug info
     // up-to-date in the loop body.
     if (!InsertedPHIs.empty())
@@ -952,9 +931,6 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
     if (!isSafeToSpeculativelyExecute(&*I))
       return false;
 
-    if (isa<DbgInfoIntrinsic>(I))
-      continue;
-
     switch (I->getOpcode()) {
     default:
       return false;
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 70afd4133df7..24fe08d6c3e4 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -182,8 +182,7 @@ SCEVExpander::GetOptimalInsertionPointForCastOf(Value *V) const {
     BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin();
     while ((isa<BitCastInst>(IP) &&
             isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) &&
-            cast<BitCastInst>(IP)->getOperand(0) != A) ||
-           isa<DbgInfoIntrinsic>(IP))
+            cast<BitCastInst>(IP)->getOperand(0) != A))
       ++IP;
     return IP;
   }
@@ -278,11 +277,6 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
   if (IP != BlockBegin) {
     --IP;
     for (; ScanLimit; --IP, --ScanLimit) {
-      // Don't count dbg.value against the ScanLimit, to avoid perturbing the
-      // generated code.
-      if (isa<DbgInfoIntrinsic>(IP))
-        ScanLimit++;
-
       auto canGenerateIncompatiblePoison = [&Flags](Instruction *I) {
         // Ensure that no-wrap flags match.
         if (isa<OverflowingBinaryOperator>(I)) {
@@ -382,10 +376,6 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *Offset, Value *V,
   if (IP != BlockBegin) {
     --IP;
     for (; ScanLimit; --IP, --ScanLimit) {
-      // Don't count dbg.value against the ScanLimit, to avoid perturbing the
-      // generated code.
-      if (isa<DbgInfoIntrinsic>(IP))
-        ScanLimit++;
       if (auto *GEP = dyn_cast<GetElementPtrInst>(IP)) {
         if (GEP->getPointerOperand() == V &&
             GEP->getSourceElementType() == Builder.getInt8Ty() &&
@@ -1545,8 +1535,7 @@ Value *SCEVExpander::expand(const SCEV *S) {
           InsertPt = L->getHeader()->getFirstInsertionPt();
 
         while (InsertPt != Builder.GetInsertPoint() &&
-               (isInsertedInstruction(&*InsertPt) ||
-                isa<DbgInfoIntrinsic>(&*InsertPt))) {
+               (isInsertedInstruction(&*InsertPt))) {
           InsertPt = std::next(InsertPt);
         }
         break;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 0980f0e57aa6..eb52c1b7e6fb 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1130,17 +1130,14 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
 
     Instruction *NewBonusInst = BonusInst.clone();
 
-    if (!isa<DbgInfoIntrinsic>(BonusInst)) {
-      if (!NewBonusInst->getDebugLoc().isSameSourceLocation(
-              PTI->getDebugLoc())) {
-        // Unless the instruction has the same !dbg location as the original
-        // branch, drop it. When we fold the bonus instructions we want to make
-        // sure we reset their debug locations in order to avoid stepping on
-        // dead code caused by folding dead branches.
-        NewBonusInst->setDebugLoc(DebugLoc::getDropped());
-      } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) {
-        mapAtomInstance(DL, VMap);
-      }
+    if (!NewBonusInst->getDebugLoc().isSameSourceLocation(PTI->getDebugLoc())) {
+      // Unless the instruction has the same !dbg location as the original
+      // branch, drop it. When we fold the bonus instructions we want to make
+      // sure we reset their debug locations in order to avoid stepping on
+      // dead code caused by folding dead branches.
+      NewBonusInst->setDebugLoc(DebugLoc::getDropped());
+    } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) {
+      mapAtomInstance(DL, VMap);
     }
 
     RemapInstruction(NewBonusInst, VMap,
@@ -1158,9 +1155,6 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
     RemapDbgRecordRange(NewBonusInst->getModule(), Range, VMap,
                         RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
 
-    if (isa<DbgInfoIntrinsic>(BonusInst))
-      continue;
-
     NewBonusInst->takeName(&BonusInst);
     BonusInst.setName(NewBonusInst->getName() + ".old");
     VMap[&BonusInst] = NewBonusInst;
@@ -1903,21 +1897,6 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
 
     Instruction *I1 = &*BB1ItrPair.first;
 
-    // Skip debug info if it is not identical.
-    bool AllDbgInstsAreIdentical = all_of(OtherSuccIterRange, [I1](auto &Iter) {
-      Instruction *I2 = &*Iter;
-      return I1->isIdenticalToWhenDefined(I2);
-    });
-    if (!AllDbgInstsAreIdentical) {
-      while (isa<DbgInfoIntrinsic>(I1))
-        I1 = &*++BB1ItrPair.first;
-      for (auto &SuccIter : OtherSuccIterRange) {
-        Instruction *I2 = &*SuccIter;
-        while (isa<DbgInfoIntrinsic>(I2))
-          I2 = &*++SuccIter;
-      }
-    }
-
     bool AllInstsAreIdentical = true;
     bool HasTerminator = I1->isTerminator();
     for (auto &SuccIter : OtherSuccIterRange) {
@@ -1965,49 +1944,33 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
 
     if (AllInstsAreIdentical) {
       BB1ItrPair.first++;
-      if (isa<DbgInfoIntrinsic>(I1)) {
-        // The debug location is an integral part of a debug info intrinsic
-        // and can't be separated from it or replaced.  Instead of attempting
-        // to merge locations, simply hoist both copies of the intrinsic.
-        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
-        // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
-        // and leave any that were not hoisted behind (by calling moveBefore
-        // rather than moveBeforePreserving).
-        I1->moveBefore(TI->getIterator());
-        for (auto &SuccIter : OtherSuccIterRange) {
-          auto *I2 = &*SuccIter++;
-          assert(isa<DbgInfoIntrinsic>(I2));
-          I2->moveBefore(TI->getIterator());
+      // For a normal instruction, we just move one to right before the
+      // branch, then replace all uses of the other with the first.  Finally,
+      // we remove the now redundant second instruction.
+      hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
+      // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
+      // and leave any that were not hoisted behind (by calling moveBefore
+      // rather than moveBeforePreserving).
+      I1->moveBefore(TI->getIterator());
+      for (auto &SuccIter : OtherSuccIterRange) {
+        Instruction *I2 = &*SuccIter++;
+        assert(I2 != I1);
+        if (!I2->use_empty())
+          I2->replaceAllUsesWith(I1);
+        I1->andIRFlags(I2);
+        if (auto *CB = dyn_cast<CallBase>(I1)) {
+          bool Success = CB->tryIntersectAttributes(cast<CallBase>(I2));
+          assert(Success && "We should not be trying to hoist callbases "
+                            "with non-intersectable attributes");
+          // For NDEBUG Compile.
+          (void)Success;
         }
-      } else {
-        // For a normal instruction, we just move one to right before the
-        // branch, then replace all uses of the other with the first.  Finally,
-        // we remove the now redundant second instruction.
-        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
-        // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
-        // and leave any that were not hoisted behind (by calling moveBefore
-        // rather than moveBeforePreserving).
-        I1->moveBefore(TI->getIterator());
-        for (auto &SuccIter : OtherSuccIterRange) {
-          Instruction *I2 = &*SuccIter++;
-          assert(I2 != I1);
-          if (!I2->use_empty())
-            I2->replaceAllUsesWith(I1);
-          I1->andIRFlags(I2);
-          if (auto *CB = dyn_cast<CallBase>(I1)) {
-            bool Success = CB->tryIntersectAttributes(cast<CallBase>(I2));
-            assert(Success && "We should not be trying to hoist callbases "
-                              "with non-intersectable attributes");
-            // For NDEBUG Compile.
-            (void)Success;
-          }
 
-          combineMetadataForCSE(I1, I2, true);
-          // I1 and I2 are being combined into a single instruction.  Its debug
-          // location is the merged locations of the original instructions.
-          I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-          I2->eraseFromParent();
-        }
+        combineMetadataForCSE(I1, I2, true);
+        // I1 and I2 are being combined into a single instruction.  Its debug
+        // location is the merged locations of the original instructions.
+        I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+        I2->eraseFromParent();
       }
       if (!Changed)
         NumHoistCommonCode += SuccIterPairs.size();
@@ -2297,11 +2260,8 @@ static void sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
   SmallVector<Instruction*,4> Insts;
   for (auto *BB : Blocks) {
     Instruction *I = BB->getTerminator();
-    do {
-      I = I->getPrevNode();
-    } while (isa<DbgInfoIntrinsic>(I) && I != &BB->front());
-    if (!isa<DbgInfoIntrinsic>(I))
-      Insts.push_back(I);
+    I = I->getPrevNode();
+    Insts.push_back(I);
   }
 
   // We don't need to do any more checking here; canSinkInstructions should
@@ -3234,7 +3194,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   // - All of their uses are in ThenBB.
   SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
 
-  SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics;
+  SmallVector<Instruction *, 4> SpeculatedPseudoProbes;
 
   unsigned SpeculatedInstructions = 0;
   bool HoistLoadsStores = Options.HoistLoadsStoresWithCondFaulting;
@@ -3243,12 +3203,6 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   StoreInst *SpeculatedStore = nullptr;
   EphemeralValueTracker EphTracker;
   for (Instruction &I : reverse(drop_end(*ThenBB))) {
-    // Skip debug info.
-    if (isa<DbgInfoIntrinsic>(I)) {
-      SpeculatedDbgIntrinsics.push_back(&I);
-      continue;
-    }
-
     // Skip pseudo probes. The consequence is we lose track of the branch
     // probability for ThenBB, which is fine since the optimization here takes
     // place regardless of the branch probability.
@@ -3257,7 +3211,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
       // the samples collected on the non-conditional path are counted towards
       // the conditional path. We leave it for the counts inference algorithm to
       // figure out a proper count for an unknown probe.
-      SpeculatedDbgIntrinsics.push_back(&I);
+      SpeculatedPseudoProbes.push_back(&I);
       continue;
     }
 
@@ -3388,9 +3342,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   // hoisting above.
   for (auto &I : make_early_inc_range(*ThenBB)) {
     if (!SpeculatedStoreValue || &I != SpeculatedStore) {
-      // Don't update the DILocation of dbg.assign intrinsics.
-      if (!isa<DbgAssignIntrinsic>(&I))
-        I.setDebugLoc(DebugLoc::getDropped());
+      I.setDebugLoc(DebugLoc::getDropped());
     }
     I.dropUBImplyingAttrsAndMetadata();
 
@@ -3402,9 +3354,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   }
 
   // Hoist the instructions.
-  // In "RemoveDIs" non-instr debug-info mode, drop DbgVariableRecords attached
-  // to these instructions, in the same way that dbg.value intrinsics are
-  // dropped at the end of this block.
+  // Drop DbgVariableRecords attached to these instructions.
   for (auto &It : *ThenBB)
     for (DbgRecord &DR : make_early_inc_range(It.getDbgRecordRange()))
       // Drop all records except assign-kind DbgVariableRecords (dbg.assign
@@ -3442,15 +3392,9 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
     PN.setIncomingValue(ThenI, V);
   }
 
-  // Remove speculated dbg intrinsics.
-  // FIXME: Is it possible to do this in a more elegant way? Moving/merging the
-  // dbg value for the different flows and inserting it after the select.
-  for (Instruction *I : SpeculatedDbgIntrinsics) {
-    // We still want to know that an assignment took place so don't remove
-    // dbg.assign intrinsics.
-    if (!isa<DbgAssignIntrinsic>(I))
-      I->eraseFromParent();
-  }
+  // Remove speculated pseudo probes.
+  for (Instruction *I : SpeculatedPseudoProbes)
+    I->eraseFromParent();
 
   ++NumSpeculations;
   return true;
@@ -4162,8 +4106,8 @@ bool llvm::foldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
     // Don't check the branch condition comparison itself.
     if (&I == Cond)
       continue;
-    // Ignore dbg intrinsics, and the terminator.
-    if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
+    // Ignore the terminator.
+    if (isa<BranchInst>(I))
       continue;
     // I must be safe to execute unconditionally.
     if (!isSafeToSpeculativelyExecute(&I))
@@ -7762,8 +7706,7 @@ static bool tryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
     LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
     if (!LPad2 || !LPad2->isIdenticalTo(LPad))
       continue;
-    for (++I; isa<DbgInfoIntrinsic>(I); ++I)
-      ;
+    ++I;
     BranchInst *BI2 = dyn_cast<BranchInst>(I);
     if (!BI2 || !BI2->isIdenticalTo(BI))
       continue;
@@ -7784,12 +7727,6 @@ static bool tryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
       }
     }
 
-    // The debug info in OtherPred doesn't cover the merged control flow that
-    // used to go through BB.  We need to delete it or update it.
-    for (Instruction &Inst : llvm::make_early_inc_range(*OtherPred))
-      if (isa<DbgInfoIntrinsic>(Inst))
-        Inst.eraseFromParent();
-
     SmallSetVector<BasicBlock *, 16> UniqueSuccs(succ_begin(BB), succ_end(BB));
     for (BasicBlock *Succ : UniqueSuccs) {
       Succ->removePredecessor(BB);
@@ -7837,8 +7774,7 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI,
   // constant, try to simplify the block.
   if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
     if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
-      for (++I; isa<DbgInfoIntrinsic>(I); ++I)
-        ;
+      ++I;
       if (I->isTerminator() &&
           tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder))
         return true;
@@ -7847,8 +7783,7 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI,
   // See if we can merge an empty landing pad block with another which is
   // equivalent.
   if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
-    for (++I; isa<DbgInfoIntrinsic>(I); ++I)
-      ;
+    ++I;
     if (I->isTerminator() && tryToMergeLandingPad(LPad, BI, BB, DTU))
       return true;
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 8e09e6f8d493..0c4e5bb3d472 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -896,13 +896,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       } // end of PHI handling
 
       // We handle calls that:
-      //   * Are debug info intrinsics.
       //   * Have a mapping to an IR intrinsic.
       //   * Have a vector version available.
       auto *CI = dyn_cast<CallInst>(&I);
 
       if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
-          !isa<DbgInfoIntrinsic>(CI) &&
           !(CI->getCalledFunction() && TLI &&
             (!VFDatabase::getMappings(*CI).empty() ||
              isTLIScalarize(*TLI, *CI)))) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8bff3c018714..d0bf637b70ab 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24334,9 +24334,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       continue;
     }
 
-    if (isa<DbgInfoIntrinsic>(It))
-      continue;
-
     // Try to vectorize reductions that use PHINodes.
     if (PHINode *P = dyn_cast<PHINode>(It)) {
       // Check that the PHI is a reduction PHI.

From c9a87a50aee3c91f36d33c170d5131bcc370c289 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes@amd.com>
Date: Tue, 17 Jun 2025 08:14:05 -0700
Subject: [PATCH 0652/1322] [SLPVectorizer] Use accurate cost for external
 users of resize shuffles (#137419)

When implementing the vectorization, we potentially need to add shuffles
for external users. In such cases, we may be shuffling a smaller vector
into a larger vector. When this happens `ResizeToVF` will just build a
poison padded identity vector. Then the to build the final shuffle, we
just use the `SK_InsertSubvector` mask.

This is possibly clearer by looking at the included test in
SLPVectorizer/AMDGPU/external-shuffle.ll

In the exit block we have a bunch of shuffles to glue the vectorized
tree match the `InsertElement` users. `TMP25` holds the result of
resizing the v2i16 vectorized sequence to match the `InsertElement` size
v16i16. Then `TMP26` is the final shuffle which replaces the
`InsertElement` sequence. This is just an insertsubvector.

However, when calculating the cost for these shuffles, we aren't
modelling this correctly. `ResizeToVF` will indicate to
`performExtractsShuffleAction` that we cannot use the original mask due
to the resize shuffle. The consequence is that the cost calculation uses
a different shuffle mask than what is ultimately used.

Going back to the included test, we can consider again `TMP26`. Clearly
we can see the shuffle uses a mask {0, 1, 2, 3, 16, 17, poison ..}.
However, we will currently calculate the cost with a mask {0, 1, 2, 3,
20, 21, ...} we have replaced 16 and 17 with 20 and 21 (Index + Vector
Size). Queries like BasicTTImpl::improveShuffleKindFromMask will not
recognize this as an `SK_InsertSubvector` mask, and targets which have
reduced costs for `SK_InsertSubvector` will not accurately calculate the
cost.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  54 +++++---
 .../SLPVectorizer/AMDGPU/external-shuffle.ll  | 128 ++++++------------
 .../extractelement-single-use-many-nodes.ll   |  11 +-
 .../X86/vec_list_bias-inseltpoison.ll         |  25 ++--
 4 files changed, 99 insertions(+), 119 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d0bf637b70ab..d811e9d77d18 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -14910,25 +14910,47 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
 
   Cost += ExtractCost;
   auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
-                                    bool) {
+                                    bool ForSingleMask) {
     InstructionCost C = 0;
     unsigned VF = Mask.size();
     unsigned VecVF = TE->getVectorFactor();
-    if (VF != VecVF &&
-        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
-         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
-      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
-      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
-                OrigMask.begin());
-      C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
-                           getWidenedType(TE->getMainOp()->getType(), VecVF),
-                           OrigMask);
-      LLVM_DEBUG(
-          dbgs() << "SLP: Adding cost " << C
-                 << " for final shuffle of insertelement external users.\n";
-          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
-      Cost += C;
-      return std::make_pair(TE, true);
+    bool HasLargeIndex =
+        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
+    if ((VF != VecVF && HasLargeIndex) ||
+        !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
+
+      if (HasLargeIndex) {
+        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
+        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
+                  OrigMask.begin());
+        C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
+                             getWidenedType(TE->getMainOp()->getType(), VecVF),
+                             OrigMask);
+        LLVM_DEBUG(
+            dbgs() << "SLP: Adding cost " << C
+                   << " for final shuffle of insertelement external users.\n";
+            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
+        Cost += C;
+        return std::make_pair(TE, true);
+      }
+
+      if (!ForSingleMask) {
+        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
+        for (unsigned I = 0; I < VF; ++I) {
+          if (Mask[I] != PoisonMaskElem)
+            ResizeMask[Mask[I]] = Mask[I];
+        }
+        if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
+          C = ::getShuffleCost(
+              *TTI, TTI::SK_PermuteSingleSrc,
+              getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
+        LLVM_DEBUG(
+            dbgs() << "SLP: Adding cost " << C
+                   << " for final shuffle of insertelement external users.\n";
+            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
+
+        Cost += C;
+      }
     }
     return std::make_pair(TE, false);
   };
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
index ce9e47a03dee..f3e89b60b804 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
@@ -10,124 +10,84 @@ define void @phi_4(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out,
 ; GCN-NEXT:  [[ENTRY:.*]]:
 ; GCN-NEXT:    [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8
 ; GCN-NEXT:    [[GEP2:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 2
-; GCN-NEXT:    [[GEP3:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 3
 ; GCN-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
 ; GCN-NEXT:    [[GEP4:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 4
-; GCN-NEXT:    [[GEP5:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 5
 ; GCN-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
 ; GCN-NEXT:    [[GEP6:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 6
-; GCN-NEXT:    [[GEP7:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 7
 ; GCN-NEXT:    [[TMP3:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
 ; GCN-NEXT:    [[GEP8:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 8
-; GCN-NEXT:    [[GEP9:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 9
 ; GCN-NEXT:    [[TMP4:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
 ; GCN-NEXT:    [[GEP10:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 10
-; GCN-NEXT:    [[GEP11:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 11
 ; GCN-NEXT:    [[TMP5:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
 ; GCN-NEXT:    [[GEP12:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 12
-; GCN-NEXT:    [[GEP13:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 13
 ; GCN-NEXT:    [[TMP6:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
 ; GCN-NEXT:    [[GEP14:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 14
 ; GCN-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2
-; GCN-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
-; GCN-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
-; GCN-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
-; GCN-NEXT:    [[TMP11:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
-; GCN-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
-; GCN-NEXT:    [[TMP13:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
-; GCN-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
-; GCN-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; GCN-NEXT:    [[TMP24:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
-; GCN-NEXT:    [[TMP26:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
-; GCN-NEXT:    [[TMP28:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0
-; GCN-NEXT:    [[TMP38:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
 ; GCN-NEXT:    br label %[[DO_BODY:.*]]
 ; GCN:       [[DO_BODY]]:
-; GCN-NEXT:    [[PHI2:%.*]] = phi i16 [ [[TMP8]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI3:%.*]] = phi i16 [ [[TMP9]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI4:%.*]] = phi i16 [ [[TMP10]], %[[ENTRY]] ], [ [[TMP39:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI5:%.*]] = phi i16 [ [[TMP11]], %[[ENTRY]] ], [ [[OTHERELE5:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI6:%.*]] = phi i16 [ [[TMP12]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI7:%.*]] = phi i16 [ [[TMP13]], %[[ENTRY]] ], [ [[OTHERELE7:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI8:%.*]] = phi i16 [ [[TMP14]], %[[ENTRY]] ], [ [[TMP40:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI9:%.*]] = phi i16 [ [[TMP15]], %[[ENTRY]] ], [ [[OTHERELE9:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI10:%.*]] = phi i16 [ [[TMP24]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI11:%.*]] = phi i16 [ [[TMP26]], %[[ENTRY]] ], [ [[OTHERELE11:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI12:%.*]] = phi i16 [ [[TMP28]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI13:%.*]] = phi i16 [ [[TMP38]], %[[ENTRY]] ], [ [[OTHERELE13:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[TMP41:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP17:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP18:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP19:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP20:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP21:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP22:%.*]], %[[DO_BODY]] ]
 ; GCN-NEXT:    [[TMP42:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP23:%.*]], %[[DO_BODY]] ]
 ; GCN-NEXT:    [[TMP16]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8
-; GCN-NEXT:    [[OTHERELE3]] = load i16, ptr addrspace(3) [[GEP3]], align 1
-; GCN-NEXT:    [[TMP17:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
-; GCN-NEXT:    [[OTHERELE5]] = load i16, ptr addrspace(3) [[GEP5]], align 1
-; GCN-NEXT:    [[TMP18:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
-; GCN-NEXT:    [[OTHERELE7]] = load i16, ptr addrspace(3) [[GEP7]], align 1
-; GCN-NEXT:    [[TMP19:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
-; GCN-NEXT:    [[OTHERELE9]] = load i16, ptr addrspace(3) [[GEP9]], align 1
-; GCN-NEXT:    [[TMP20:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
-; GCN-NEXT:    [[OTHERELE11]] = load i16, ptr addrspace(3) [[GEP11]], align 1
-; GCN-NEXT:    [[TMP21:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
-; GCN-NEXT:    [[OTHERELE13]] = load i16, ptr addrspace(3) [[GEP13]], align 1
-; GCN-NEXT:    [[TMP22:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
+; GCN-NEXT:    [[TMP17]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[TMP18]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
+; GCN-NEXT:    [[TMP19]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
+; GCN-NEXT:    [[TMP20]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
+; GCN-NEXT:    [[TMP21]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
+; GCN-NEXT:    [[TMP22]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
 ; GCN-NEXT:    [[TMP23]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2
 ; GCN-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
-; GCN-NEXT:    [[TMP30]] = extractelement <2 x i16> [[TMP17]], i32 0
-; GCN-NEXT:    [[TMP39]] = extractelement <2 x i16> [[TMP18]], i32 0
-; GCN-NEXT:    [[TMP32]] = extractelement <2 x i16> [[TMP19]], i32 0
-; GCN-NEXT:    [[TMP40]] = extractelement <2 x i16> [[TMP20]], i32 0
-; GCN-NEXT:    [[TMP34]] = extractelement <2 x i16> [[TMP21]], i32 0
-; GCN-NEXT:    [[TMP35]] = extractelement <2 x i16> [[TMP22]], i32 0
 ; GCN-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
 ; GCN:       [[EXIT]]:
-; GCN-NEXT:    [[TMP36:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP17]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC038:%.*]] = shufflevector <16 x i16> [[TMP36]], <16 x i16> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC059:%.*]] = shufflevector <16 x i16> [[VEC038]], <16 x i16> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP24]], <16 x i16> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i16> [[TMP19]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC0710:%.*]] = shufflevector <16 x i16> [[VEC059]], <16 x i16> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i16> [[TMP26]], <16 x i16> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP29:%.*]] = shufflevector <2 x i16> [[TMP20]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC0911:%.*]] = shufflevector <16 x i16> [[VEC0710]], <16 x i16> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP28]], <16 x i16> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i16> [[TMP21]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC01112:%.*]] = shufflevector <16 x i16> [[VEC0911]], <16 x i16> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP32:%.*]] = shufflevector <16 x i16> [[TMP30]], <16 x i16> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP33:%.*]] = shufflevector <2 x i16> [[TMP22]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i16> [[VEC01112]], <16 x i16> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; GCN-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i16> [[TMP32]], <16 x i16> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP49:%.*]] = shufflevector <16 x i16> [[TMP47]], <16 x i16> [[TMP48]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; GCN-NEXT:    [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP38:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP40:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP41:%.*]] = shufflevector <16 x i16> [[TMP39]], <16 x i16> [[TMP40]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP43:%.*]] = shufflevector <16 x i16> [[TMP41]], <16 x i16> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP44:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP43]], <16 x i16> [[TMP44]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP46:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i16> [[TMP45]], <16 x i16> [[TMP46]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC2157:%.*]] = shufflevector <16 x i16> [[TMP58]], <16 x i16> [[TMP60]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; GCN-NEXT:    [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC231:%.*]] = shufflevector <16 x i16> [[TMP50]], <16 x i16> [[TMP51]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC252:%.*]] = shufflevector <16 x i16> [[VEC231]], <16 x i16> [[TMP52]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC273:%.*]] = shufflevector <16 x i16> [[VEC252]], <16 x i16> [[TMP53]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC294:%.*]] = shufflevector <16 x i16> [[VEC273]], <16 x i16> [[TMP54]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC2115:%.*]] = shufflevector <16 x i16> [[VEC294]], <16 x i16> [[TMP55]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC2136:%.*]] = shufflevector <16 x i16> [[VEC2115]], <16 x i16> [[TMP56]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; GCN-NEXT:    [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC2151:%.*]] = shufflevector <16 x i16> [[VEC2136]], <16 x i16> [[TMP59]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; GCN-NEXT:    [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP41]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC22:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[PHI2]], i64 2
-; GCN-NEXT:    [[VEC23:%.*]] = insertelement <16 x i16> [[VEC22]], i16 [[PHI3]], i64 3
-; GCN-NEXT:    [[VEC24:%.*]] = insertelement <16 x i16> [[VEC23]], i16 [[PHI4]], i64 4
-; GCN-NEXT:    [[VEC25:%.*]] = insertelement <16 x i16> [[VEC24]], i16 [[PHI5]], i64 5
-; GCN-NEXT:    [[VEC26:%.*]] = insertelement <16 x i16> [[VEC25]], i16 [[PHI6]], i64 6
-; GCN-NEXT:    [[VEC27:%.*]] = insertelement <16 x i16> [[VEC26]], i16 [[PHI7]], i64 7
-; GCN-NEXT:    [[VEC28:%.*]] = insertelement <16 x i16> [[VEC27]], i16 [[PHI8]], i64 8
-; GCN-NEXT:    [[VEC29:%.*]] = insertelement <16 x i16> [[VEC28]], i16 [[PHI9]], i64 9
-; GCN-NEXT:    [[VEC210:%.*]] = insertelement <16 x i16> [[VEC29]], i16 [[PHI10]], i64 10
-; GCN-NEXT:    [[VEC211:%.*]] = insertelement <16 x i16> [[VEC210]], i16 [[PHI11]], i64 11
-; GCN-NEXT:    [[VEC212:%.*]] = insertelement <16 x i16> [[VEC211]], i16 [[PHI12]], i64 12
-; GCN-NEXT:    [[VEC213:%.*]] = insertelement <16 x i16> [[VEC212]], i16 [[PHI13]], i64 13
-; GCN-NEXT:    [[TMP61:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC2152:%.*]] = shufflevector <16 x i16> [[VEC213]], <16 x i16> [[TMP61]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; GCN-NEXT:    store <16 x i16> [[VEC2151]], ptr [[OUT]], align 32
-; GCN-NEXT:    store <16 x i16> [[VEC2157]], ptr [[OUT1]], align 32
-; GCN-NEXT:    store <16 x i16> [[VEC2152]], ptr [[OUT2]], align 32
+; GCN-NEXT:    store <16 x i16> [[VEC2157]], ptr [[OUT]], align 32
+; GCN-NEXT:    store <16 x i16> [[TMP49]], ptr [[OUT1]], align 32
+; GCN-NEXT:    store <16 x i16> [[VEC2151]], ptr [[OUT2]], align 32
 ; GCN-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index 28bab3276c47..6942df532ae2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -7,9 +7,8 @@ define void @foo(double %i) {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00>, double [[I]], i32 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]]
 ; CHECK-NEXT:    [[I82:%.*]] = fsub double 0.000000e+00, poison
+; CHECK-NEXT:    [[I103:%.*]] = fsub double 0.000000e+00, [[I]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 0, i32 poison, i32 1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> <double 0.000000e+00, double poison, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 1, i32 poison, i32 3, i32 12, i32 5, i32 poison, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[I82]], i32 2
@@ -22,13 +21,11 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP16]])
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[BB58:%.*]], label [[BB115:%.*]]
 ; CHECK:       bb115:
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul double 0.000000e+00, [[I103]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]]
 ; CHECK-NEXT:    [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, <4 x double> [[TMP22]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[I82]], i32 3
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer
 ; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
index e3a6020a542f..2cc2f28ccf6d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
@@ -25,7 +25,6 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT:    [[T32:%.*]] = mul nsw i32 [[T27]], 6270
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
 ; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
@@ -34,7 +33,6 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
 ; CHECK-NEXT:    [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT:    [[T9:%.*]] = load i32, ptr [[T8]], align 4
@@ -42,17 +40,20 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
 ; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
-; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
-; CHECK-NEXT:    [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T40]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T27]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T47]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 poison, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T48]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T40]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[T701:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 poison, i32 3>
 ; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
-; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], splat (i32 3)
+; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T71]], splat (i32 3)
 ; CHECK-NEXT:    store <8 x i32> [[T76]], ptr [[T2]], align 4
 ; CHECK-NEXT:    ret void
 ;

From 02b78ff9c639993356ccc72b847128fd1ff7f2ba Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Tue, 17 Jun 2025 08:21:24 -0700
Subject: [PATCH 0653/1322] [llvm] include Compiler.h in a few headers where it
 was missed (#144464)

Add missing `#include "llvm/Support/Compiler.h"` in a few LLVM headers
that use the `LLVM_ABI` macro.
---
 llvm/include/llvm/Option/OptSpecifier.h              | 2 ++
 llvm/include/llvm/Transforms/IPO.h                   | 2 ++
 llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/llvm/include/llvm/Option/OptSpecifier.h b/llvm/include/llvm/Option/OptSpecifier.h
index dc6acae7fc00..cb87fbd17ec1 100644
--- a/llvm/include/llvm/Option/OptSpecifier.h
+++ b/llvm/include/llvm/Option/OptSpecifier.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_OPTION_OPTSPECIFIER_H
 #define LLVM_OPTION_OPTSPECIFIER_H
 
+#include "llvm/Support/Compiler.h"
+
 namespace llvm {
 namespace opt {
 
diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
index 56b30968ffd7..7523ae66429a 100644
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TRANSFORMS_IPO_H
 #define LLVM_TRANSFORMS_IPO_H
 
+#include "llvm/Support/Compiler.h"
+
 namespace llvm {
 
 class ModulePass;
diff --git a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
index ab0bd3a5a996..f20ae1809aa5 100644
--- a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TRANSFORMS_UTILS_PROMOTEMEMTOREG_H
 #define LLVM_TRANSFORMS_UTILS_PROMOTEMEMTOREG_H
 
+#include "llvm/Support/Compiler.h"
+
 namespace llvm {
 
 template <typename T> class ArrayRef;

From 14286244f1dca9300ead8bf83f049df2ffa97180 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Tue, 17 Jun 2025 16:12:14 +0100
Subject: [PATCH 0654/1322] Follow up to 9eb0020555, squelch unused variable
 warning

It turns out that this now-deleted debug-intrinsic code was the only use of
CI.
---
 llvm/lib/Transforms/IPO/IROutliner.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index 2c17863266a9..8d6ff72fa606 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -744,9 +744,6 @@ static void moveFunctionData(Function &Old, Function &New,
         continue;
       }
 
-      // From this point we are only handling call instructions.
-      CallInst *CI = cast<CallInst>(&Val);
-
       // Edit the scope of called functions inside of outlined functions.
       if (DISubprogram *SP = New.getSubprogram()) {
         DILocation *DI = DILocation::get(New.getContext(), 0, 0, SP);

From 1410e69b641182e942470a90d4a0bb5a2910805f Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Tue, 17 Jun 2025 11:26:47 -0400
Subject: [PATCH 0655/1322] [SPIRV] Allow __spirv_SpecConstant in Vulkan
 shaders (#143543)

There is a builtin __spirv_SpecConstant that the SPIR-V backend expands
into a specialization constant. However, it is currently only enable for
OpenCL shaders, and not the graphic shaders.

We want to use it for specialization constants coming from HLSL, so we
are enabling it for graphic shaders as well.

Implements https://github.com/llvm/wg-hlsl/pull/287

Fixes https://github.com/llvm/llvm-project/issues/142991
---
 llvm/lib/Target/SPIRV/SPIRVBuiltins.td        |  2 +
 .../CodeGen/SPIRV/constant/spec-constant.ll   | 73 +++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/constant/spec-constant.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 6842e5ff067c..401a762cd62a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -674,6 +674,8 @@ defm : DemangledNativeBuiltin<"ndrange_3D", OpenCL_std, Enqueue, 1, 3, OpBuildND
 
 // Spec constant builtin records:
 defm : DemangledNativeBuiltin<"__spirv_SpecConstant", OpenCL_std, SpecConstant, 2, 2, OpSpecConstant>;
+defm : DemangledNativeBuiltin<"__spirv_SpecConstant", GLSL_std_450,
+                              SpecConstant, 2, 2, OpSpecConstant>;
 defm : DemangledNativeBuiltin<"__spirv_SpecConstantComposite", OpenCL_std, SpecConstant, 1, 0, OpSpecConstantComposite>;
 
 // Async Copy and Prefetch builtin records:
diff --git a/llvm/test/CodeGen/SPIRV/constant/spec-constant.ll b/llvm/test/CodeGen/SPIRV/constant/spec-constant.ll
new file mode 100644
index 000000000000..299d61d3bffd
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/constant/spec-constant.ll
@@ -0,0 +1,73 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %}
+
+; CHECK-DAG: OpDecorate [[bool_const:%[0-9]+]] SpecId 1
+; CHECK-DAG: OpDecorate [[short_const:%[0-9]+]] SpecId 2
+; CHECK-DAG: OpDecorate [[int_const:%[0-9]+]] SpecId 3
+; CHECK-DAG: OpDecorate [[long_const:%[0-9]+]] SpecId 4
+; CHECK-DAG: OpDecorate [[float_const:%[0-9]+]] SpecId 8
+; CHECK-DAG: OpDecorate [[double_const:%[0-9]+]] SpecId 9
+; CHECK-DAG: OpDecorate [[enum_const:%[0-9]+]] SpecId 10
+
+; CHECK-DAG: [[bool_const]] = OpSpecConstantTrue {{%[0-9]+}}
+; CHECK-DAG: [[short_const]] = OpSpecConstant {{%[0-9]+}} 4
+; CHECK-DAG: [[int_const]] = OpSpecConstant {{%[0-9]+}} 5
+; CHECK-DAG: [[long_const]] = OpSpecConstant {{%[0-9]+}} 8
+; CHECK-DAG: [[float_const]] = OpSpecConstant {{%[0-9]+}} 1112014848
+; CHECK-DAG: [[double_const]] = OpSpecConstant {{%[0-9]+}} 0 1079574528
+; CHECK-DAG: [[enum_const]] = OpSpecConstant {{%[0-9]+}} 30
+
+@_ZL10bool_const = internal addrspace(10) global i32 0, align 4
+@_ZL11short_const = internal addrspace(10) global i16 0, align 2
+@_ZL9int_const = internal addrspace(10) global i32 0, align 4
+@_ZL10long_const = internal addrspace(10) global i64 0, align 8
+@_ZL11float_const = internal addrspace(10) global float 0.000000e+00, align 4
+@_ZL12double_const = internal addrspace(10) global double 0.000000e+00, align 8
+@_ZL10enum_const = internal addrspace(10) global i32 0, align 4
+
+; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @main() local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[b:%[0-9]+]] = OpSelect {{%[0-9]+}} [[bool_const]]
+  ; CHECK: OpStore {{%[0-9]+}} [[b]]
+  %0 = tail call spir_func i1 @_Z20__spirv_SpecConstantib(i32 1, i1 true)
+  %storedv.i.i = zext i1 %0 to i32
+  store i32 %storedv.i.i, ptr addrspace(10) @_ZL10bool_const, align 4
+
+  ; CHECK: OpStore {{%[0-9]+}} [[short_const]]
+  %2 = tail call spir_func i16 @_Z20__spirv_SpecConstantis(i32 2, i16 4)
+  store i16 %2, ptr addrspace(10) @_ZL11short_const, align 2
+
+  ; CHECK: OpStore {{%[0-9]+}} [[int_const]]
+  %4 = tail call spir_func i32 @_Z20__spirv_SpecConstantii(i32 3, i32 5)
+  store i32 %4, ptr addrspace(10) @_ZL9int_const, align 4
+
+
+  ; CHECK: OpStore {{%[0-9]+}} [[long_const]]
+  %6 = tail call spir_func i64 @_Z20__spirv_SpecConstantix(i32 4, i64 8)
+  store i64 %6, ptr addrspace(10) @_ZL10long_const, align 8
+
+  ; CHECK: OpStore {{%[0-9]+}} [[float_const]]
+  %14 = tail call reassoc nnan ninf nsz arcp afn spir_func float @_Z20__spirv_SpecConstantif(i32 8, float 5.000000e+01)
+  store float %14, ptr addrspace(10) @_ZL11float_const, align 4
+
+  ; CHECK: OpStore {{%[0-9]+}} [[double_const]]
+  %16 = tail call reassoc nnan ninf nsz arcp afn spir_func double @_Z20__spirv_SpecConstantid(i32 9, double 1.000000e+02)
+  store double %16, ptr addrspace(10) @_ZL12double_const, align 8
+
+  ; CHECK: OpStore {{%[0-9]+}} [[enum_const]]
+  %18 = tail call spir_func i32 @_Z20__spirv_SpecConstantii(i32 10, i32 30)
+  store i32 %18, ptr addrspace(10) @_ZL10enum_const, align 4
+  ret void
+}
+
+
+declare i1 @_Z20__spirv_SpecConstantib(i32, i1)
+declare i8 @_Z20__spirv_SpecConstantia(i32, i8)
+declare i16 @_Z20__spirv_SpecConstantis(i32, i16)
+declare i32 @_Z20__spirv_SpecConstantii(i32, i32)
+declare i64 @_Z20__spirv_SpecConstantix(i32, i64)
+declare float @_Z20__spirv_SpecConstantif(i32, float)
+declare double @_Z20__spirv_SpecConstantid(i32, double)
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
\ No newline at end of file

From c80282d333d7248c8a34694ce1bec9a40681c1c5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 00:27:41 +0900
Subject: [PATCH 0656/1322] AMDGPU: Directly select minimumnum/maximumnum with
 ieee_mode=0 (#141903)

The hardware min/max follow the IR rules with IEEE mode disabled,
so we can avoid the canonicalizes of the input. We lose the quieting
of a signaling nan if both inputs are nans, but we only require that
with strictfp.
---
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |    2 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   25 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |    2 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   36 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   49 +
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |   42 +-
 llvm/test/CodeGen/AMDGPU/maximumnum.ll        | 1438 +++++++----------
 llvm/test/CodeGen/AMDGPU/minimumnum.ll        | 1438 +++++++----------
 9 files changed, 1364 insertions(+), 1669 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 18a948d68e97..7a50923ffedc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -92,6 +92,8 @@ def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().F
 def NoFP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
 def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals == DenormalMode::getPreserveSign()">;
 def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
+def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
+def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
 def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e8dff8506438..f82e6df9bcbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -957,12 +957,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
   }
 
-  auto &MinNumMaxNum = getActionDefinitionsBuilder({
-      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
-
-  // TODO: These should be custom lowered and are directly legal with IEEE=0
-  auto &MinimumNumMaximumNum =
-      getActionDefinitionsBuilder({G_FMINIMUMNUM, G_FMAXIMUMNUM});
+  auto &MinNumMaxNum = getActionDefinitionsBuilder(
+      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
+       G_FMAXNUM_IEEE});
 
   if (ST.hasVOP3PInsts()) {
     MinNumMaxNum.customFor(FPTypesPK16)
@@ -980,8 +977,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0);
   }
 
-  MinimumNumMaximumNum.lower();
-
   if (ST.hasVOP3PInsts())
     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
 
@@ -2162,6 +2157,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
     return legalizeFPTOI(MI, MRI, B, false);
   case TargetOpcode::G_FMINNUM:
   case TargetOpcode::G_FMAXNUM:
+  case TargetOpcode::G_FMINIMUMNUM:
+  case TargetOpcode::G_FMAXIMUMNUM:
   case TargetOpcode::G_FMINNUM_IEEE:
   case TargetOpcode::G_FMAXNUM_IEEE:
     return legalizeMinNumMaxNum(Helper, MI);
@@ -2741,9 +2738,17 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
 
   // With ieee_mode disabled, the instructions have the correct behavior
-  // already for G_FMINNUM/G_FMAXNUM
-  if (!MFI->getMode().IEEE)
+  // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
+  //
+  // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
+  // enabled.
+  if (!MFI->getMode().IEEE) {
+    if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
+        MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
+      return true;
+
     return !IsIEEEOp;
+  }
 
   if (IsIEEEOp)
     return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c58..4391a48ff2b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4009,6 +4009,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_FMAXNUM:
   case AMDGPU::G_FMINIMUM:
   case AMDGPU::G_FMAXIMUM:
+  case AMDGPU::G_FMINIMUMNUM:
+  case AMDGPU::G_FMAXIMUMNUM:
   case AMDGPU::G_INTRINSIC_TRUNC:
   case AMDGPU::G_STRICT_FADD:
   case AMDGPU::G_STRICT_FSUB:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ced3a6ba9bc..586de433ea28 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -531,8 +531,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
                        Legal);
 
-  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
-                     Custom);
+  setOperationAction(
+      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
+      {MVT::f32, MVT::f64}, Custom);
 
   // These are really only legal for ieee_mode functions. We should be avoiding
   // them for functions that don't have ieee_mode enabled, so just say they are
@@ -771,7 +772,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                         MVT::v32f16, MVT::v32bf16},
                        Custom);
 
-    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
+    setOperationAction(
+        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
+        MVT::f16, Custom);
     setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
 
     setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
@@ -825,8 +828,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                          VT, Custom);
 
-    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
-                       Custom);
+    setOperationAction(
+        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
+        {MVT::v2f16, MVT::v4f16}, Custom);
 
     setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
     setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
@@ -6062,6 +6066,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
     return lowerFMINNUM_FMAXNUM(Op, DAG);
+  case ISD::FMINIMUMNUM:
+  case ISD::FMAXIMUMNUM:
+    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
   case ISD::FMINIMUM:
   case ISD::FMAXIMUM:
     return lowerFMINIMUM_FMAXIMUM(Op, DAG);
@@ -6086,8 +6093,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMUL:
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
-  case ISD::FMINIMUMNUM:
-  case ISD::FMAXIMUMNUM:
   case ISD::UADDSAT:
   case ISD::USUBSAT:
   case ISD::SADDSAT:
@@ -6995,6 +7000,23 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
   return Op;
 }
 
+SDValue
+SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  bool IsIEEEMode = Info->getMode().IEEE;
+
+  if (IsIEEEMode)
+    return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
+
+  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
+      VT == MVT::v16bf16)
+    return splitBinaryVectorOp(Op, DAG);
+  return Op;
+}
+
 SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d71a22722129..89fb12b52c3e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -147,6 +147,7 @@ private:
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitFP_ROUNDVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1419f63202a7..897c30948cf0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1390,6 +1390,55 @@ def : GCNPat<
   (S_ADD_U64_PSEUDO $src0, $src1)>;
 }
 
+//===----------------------------------------------------------------------===//
+// FP min/max patterns
+//===----------------------------------------------------------------------===//
+
+
+class FPBinOpPat <SDPatternOperator node, ValueType vt, Instruction inst>
+  : GCNPat <(vt (node (vt (VOP3Mods vt:$src0, i32:$src0_mods)),
+                      (vt (VOP3Mods vt:$src1, i32:$src1_mods)))),
+    (inst $src0_mods, $src0, $src1_mods, $src1, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+class FPPkBinOpPat <SDPatternOperator node, ValueType vt, Instruction inst>
+  : GCNPat <(vt (node (VOP3PMods v2f16:$src0, i32:$src0_mods),
+                      (VOP3PMods v2f16:$src1, i32:$src1_mods))),
+  (inst $src0_mods, $src0, $src1_mods, $src1, DSTCLAMP.NONE)
+>;
+
+/// With IEEE=0, signalingness is ignored and the non-nan input will
+/// be directly returned.
+let OtherPredicates = [IEEEModeDisabled] in {
+  def : FPBinOpPat<fminimumnum, f32, V_MIN_F32_e64>;
+  def : FPBinOpPat<fmaximumnum, f32, V_MAX_F32_e64>;
+  def : FPBinOpPat<fminimumnum, f64, V_MIN_F64_e64>;
+  def : FPBinOpPat<fmaximumnum, f64, V_MAX_F64_e64>;
+
+  let SubtargetPredicate = Has16BitInsts,
+      True16Predicate = NotHasTrue16BitInsts in {
+    def : FPBinOpPat<fminimumnum, f16, V_MIN_F16_e64>;
+    def : FPBinOpPat<fmaximumnum, f16, V_MAX_F16_e64>;
+  }
+
+  let SubtargetPredicate = Has16BitInsts,
+      True16Predicate = UseRealTrue16Insts in {
+    def : FPBinOpPat<fminimumnum, f16, V_MIN_F16_t16_e64>;
+    def : FPBinOpPat<fmaximumnum, f16, V_MAX_F16_t16_e64>;
+  }
+
+  let SubtargetPredicate = Has16BitInsts,
+      True16Predicate = UseFakeTrue16Insts in {
+    def : FPBinOpPat<fminimumnum, f16, V_MIN_F16_fake16_e64>;
+    def : FPBinOpPat<fmaximumnum, f16, V_MAX_F16_fake16_e64>;
+  }
+
+  let SubtargetPredicate = HasVOP3PInsts in {
+    def : FPPkBinOpPat<fminimumnum, v2f16, V_PK_MIN_F16>;
+    def : FPPkBinOpPat<fmaximumnum, v2f16, V_PK_MAX_F16>;
+  }
+}
+
 /********** ============================================ **********/
 /********** Extraction, Insertion, Building and Casting  **********/
 /********** ============================================ **********/
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 46da9d33639b..86e73ed03f18 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2019,9 +2019,7 @@ define float @v_fneg_minimumnum_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float %a, float %b)
   %fneg = fneg float %min
@@ -2044,8 +2042,7 @@ define float @v_fneg_self_minimumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_self_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v0
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float %a, float %a)
   %min.fneg = fneg float %min
@@ -2068,8 +2065,7 @@ define float @v_fneg_posk_minimumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_posk_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, -4.0, v0
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, -4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float 4.0, float %a)
   %fneg = fneg float %min
@@ -2092,8 +2088,7 @@ define float @v_fneg_negk_minimumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_negk_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float -4.0, float %a)
   %fneg = fneg float %min
@@ -2251,8 +2246,7 @@ define float @v_fneg_neg0_minimumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_neg0_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, 0, v0
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float -0.0, float %a)
   %fneg = fneg float %min
@@ -2299,7 +2293,6 @@ define float @v_fneg_0_minimumnum_foldable_use_f32_no_ieee(float %a, float %b) #
 ; GCN-LABEL: v_fneg_0_minimumnum_foldable_use_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -2330,9 +2323,7 @@ define <2 x float> @v_fneg_minimumnum_multi_use_minimumnum_f32_no_ieee(float %a,
 ; GCN-LABEL: v_fneg_minimumnum_multi_use_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    v_mul_f32_e32 v1, -4.0, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float %a, float %b)
@@ -2364,9 +2355,7 @@ define float @v_fneg_maximumnum_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float %a, float %b)
   %fneg = fneg float %max
@@ -2389,8 +2378,7 @@ define float @v_fneg_self_maximumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_self_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v0
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, -v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float %a, float %a)
   %max.fneg = fneg float %max
@@ -2413,8 +2401,7 @@ define float @v_fneg_posk_maximumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_posk_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, -4.0, v0
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, -4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float 4.0, float %a)
   %fneg = fneg float %max
@@ -2437,8 +2424,7 @@ define float @v_fneg_negk_maximumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_negk_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float -4.0, float %a)
   %fneg = fneg float %max
@@ -2473,8 +2459,7 @@ define float @v_fneg_neg0_maximumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_neg0_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 0, v0
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float -0.0, float %a)
   %fneg = fneg float %max
@@ -2499,7 +2484,6 @@ define float @v_fneg_0_maximumnum_foldable_use_f32_no_ieee(float %a, float %b) #
 ; GCN-LABEL: v_fneg_0_maximumnum_foldable_use_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -2530,9 +2514,7 @@ define <2 x float> @v_fneg_maximumnum_multi_use_maximumnum_f32_no_ieee(float %a,
 ; GCN-LABEL: v_fneg_maximumnum_multi_use_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    v_mul_f32_e32 v1, -4.0, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float %a, float %b)
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index c45d86ce306e..4f73e8e9c188 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -3414,8 +3414,8 @@ define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3652,57 +3652,57 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v5
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_maximumnum_v3f16:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v3, v3
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v2
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_maximumnum_v3f16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v3f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX950-SDAG-LABEL: v_maximumnum_v3f16:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v3, v3
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v2
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_maximumnum_v3f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v3f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
@@ -3712,8 +3712,8 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
@@ -3722,11 +3722,11 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX11-SDAG-LABEL: v_maximumnum_v3f16:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3735,10 +3735,10 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3750,11 +3750,11 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v2
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v3
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3767,10 +3767,10 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3834,12 +3834,19 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v3f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3939,16 +3946,16 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v2, v2
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v6
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v5
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v5, v7
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v2, v5
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -3965,16 +3972,16 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v2
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v4f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v4f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_maximumnum_v4f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -3988,6 +3995,18 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v2
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_maximumnum_v4f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_maximumnum_v4f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4003,8 +4022,8 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
@@ -4026,10 +4045,10 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4058,10 +4077,10 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4268,22 +4287,22 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v4, v4
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v6, v9
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v6, v7
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v7, v10
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v4, v4
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v3, v7
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v8, v11
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v7
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
@@ -4304,19 +4323,19 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v3
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v6f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v5
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v6f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_maximumnum_v6f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4333,6 +4352,21 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v3
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_maximumnum_v6f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_maximumnum_v6f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4351,10 +4385,10 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
@@ -4380,15 +4414,14 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4419,15 +4452,14 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v5
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x half> @llvm.maximumnum.v6f16(<6 x half> %x, <6 x half> %y)
@@ -4554,28 +4586,28 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v4, v4
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v4, v4
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v6, v6
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v15, v7, v7
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v8, v12
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v8, v9
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v9, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v9
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v10, v14
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v6, v6
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v5, v9
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v11, v15
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v7, v7
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v6, v9
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -4600,22 +4632,22 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_max_f16 v3, v3, v4
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v8f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v7
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v8f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v6, v6
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v7, v7
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v4
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_maximumnum_v8f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4635,6 +4667,24 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_max_f16 v3, v3, v4
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_maximumnum_v8f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v5, v5
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v6, v6
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v7, v7
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_maximumnum_v8f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4656,12 +4706,12 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v5
@@ -4691,18 +4741,17 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4736,18 +4785,17 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v6
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %x, <8 x half> %y)
@@ -4978,52 +5026,52 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v16, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v8, v8
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v8, v8
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v8, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v16, v16, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v9, v9
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v9, v9
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v9, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v8, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v10, v10
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v10, v10
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v10, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v16, v16, v19
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v9, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v11, v11
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v11, v11
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v11, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v17, v8
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v4, v4
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v10, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v12, v12
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v4, v4
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v12, v12
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v12, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v18, v9
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v11, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v13, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v13, v13
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v13, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v19, v10
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v6, v6
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v17, v11
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v7, v7
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v18, v12
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v14, v14
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v12, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v15, v15
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v19, v18
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v6, v6
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v14, v14
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v13, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v17, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v7, v7
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v15, v15
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v14, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v16, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v8, v1
@@ -5031,8 +5079,8 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v11, v4
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v12, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v7, v13, v7
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v13, v6
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v7, v14, v7
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_maximumnum_v16f16:
@@ -5064,34 +5112,34 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_max_f16 v7, v7, v8
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v16f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v9
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v10
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v11
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v12
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v13
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v14
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v15
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v16f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v9, v9
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v10, v10
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v12, v12
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v13, v13
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v5, v5, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v14, v14
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v6, v6, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v15, v15
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v7, v7, v8
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_maximumnum_v16f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -5123,6 +5171,36 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_max_f16 v7, v7, v8
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_maximumnum_v16f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v9, v9
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v10, v10
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v12, v12
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v13, v13
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v14, v14
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v15, v15
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v8
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_maximumnum_v16f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5156,29 +5234,29 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v15
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v12, v12
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v13, v13
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v14, v14
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v15, v15
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v8
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v9
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v10
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v11
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v12
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_maximumnum_v16f16:
@@ -5214,29 +5292,29 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v15
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v12, v12
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v13, v13
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v14, v14
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v15, v15
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v8
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v9
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v10
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v11
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v12
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_maximumnum_v16f16:
@@ -5280,29 +5358,29 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v8
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v11, v11
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v15
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v12, v12
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v13, v13
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v14, v14
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v15, v15
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v8
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v9
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v10
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v11
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v12
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> %x, <16 x half> %y)
   ret <16 x half> %result
@@ -6174,34 +6252,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v16
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v17
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v18
@@ -6285,34 +6363,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v16
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v17
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v18
@@ -6396,34 +6474,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v16
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v17
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v18
@@ -6516,34 +6594,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v16, v16, v16
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v17, v17, v17
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v18, v18, v18
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v19, v19, v19
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v20, v20, v20
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v21, v21, v21
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v22, v22, v22
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v23, v23, v23
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v24, v24, v24
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v25, v25, v25
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v26, v26, v26
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v27, v27, v27
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v28, v28, v28
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v29, v29, v29
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v30, v30, v30
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v16
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v17
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v18
@@ -6584,11 +6662,11 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_maximumnum_v2f32:
@@ -6606,11 +6684,11 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_maximumnum_v2f32:
@@ -6624,29 +6702,16 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_maximumnum_v2f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_maximumnum_v2f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_maximumnum_v2f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v2f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6663,8 +6728,8 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
@@ -6784,14 +6849,14 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_maximumnum_v3f32:
@@ -6812,14 +6877,14 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_maximumnum_v3f32:
@@ -6836,40 +6901,19 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_maximumnum_v3f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_maximumnum_v3f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, v3
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, v4
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v5
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v6, v6
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v7, v7
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_maximumnum_v3f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v3f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6889,10 +6933,10 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
@@ -6913,10 +6957,10 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX11-GISEL-LABEL: v_maximumnum_v3f32:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v5, v5, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v3 :: v_dual_max_f32 v1, v1, v4
 ; GFX11-GISEL-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -6943,10 +6987,10 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v5, v5, v5
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v3 :: v_dual_max_num_f32 v1, v1, v4
 ; GFX12-GISEL-NEXT:    v_max_num_f32_e32 v2, v2, v5
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7030,17 +7074,17 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v5
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v7
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_maximumnum_v4f32:
@@ -7064,17 +7108,17 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v5
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v7
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_maximumnum_v4f32:
@@ -7094,43 +7138,22 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_maximumnum_v4f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_maximumnum_v4f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_maximumnum_v4f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v4f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -7153,12 +7176,12 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v5
@@ -7182,10 +7205,10 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v1, v1, v5
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v6 :: v_dual_max_f32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7214,10 +7237,10 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v4 :: v_dual_max_num_f32 v1, v1, v5
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v6 :: v_dual_max_num_f32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7365,11 +7388,11 @@ define <2 x double> @v_maximumnum_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v2f64:
@@ -7606,14 +7629,14 @@ define <3 x double> @v_maximumnum_v3f64(<3 x double> %x, <3 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[8:9], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[10:11], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v3f64:
@@ -7895,17 +7918,17 @@ define <4 x double> @v_maximumnum_v4f64(<4 x double> %x, <4 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX950-GISEL-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX950-GISEL-NEXT:    v_max_f64 v[14:15], v[14:15], v[14:15]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[12:13], v[12:13]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[14:15], v[14:15]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[8:9]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v4f64:
@@ -8091,10 +8114,10 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX7-SDAG-LABEL: v_maximumnum_f16_no_ieee:
 ; GFX7-SDAG:       ; %bb.0:
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8107,89 +8130,35 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_maximumnum_f16_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximumnum_f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_maximumnum_f16_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_no_ieee:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX11-TRUE16-SDAG:       ; %bb.0:
-; GFX11-TRUE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
-; GFX11-TRUE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX11-TRUE16-GISEL:       ; %bb.0:
-; GFX11-TRUE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
-; GFX11-TRUE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX11-FAKE16-SDAG:       ; %bb.0:
-; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-FAKE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX11-FAKE16-GISEL:       ; %bb.0:
-; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-FAKE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-FAKE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_no_ieee:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-SDAG-LABEL: v_maximumnum_f16_no_ieee:
 ; GFX12-TRUE16-SDAG:       ; %bb.0:
@@ -8320,85 +8289,35 @@ define half @v_maximumnum_f16_nan_no_ieee(half %x, half %y) #0 {
 }
 
 define float @v_maximumnum_f32_no_ieee(float %x, float %y) #0 {
-; GFX7-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f32_no_ieee:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_maximumnum_f32_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximumnum_f32_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_maximumnum_f32_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_maximumnum_f32_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_maximumnum_f32_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8472,87 +8391,35 @@ define float @v_maximumnum_f32_nnan_no_ieee(float %x, float %y) #0 {
 }
 
 define double @v_maximumnum_f64_no_ieee(double %x, double %y) #0 {
-; GFX7-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX7-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f64_no_ieee:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX7-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_maximumnum_f64_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximumnum_f64_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_maximumnum_f64_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX10-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX10-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_maximumnum_f64_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_maximumnum_f64_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8631,14 +8498,14 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX7-SDAG:       ; %bb.0:
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7-SDAG-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -8659,11 +8526,7 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX8-SDAG:       ; %bb.0:
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT:    v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -8671,82 +8534,28 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX950-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8909,12 +8718,19 @@ define <3 x half> @v_maximumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 5cb051d2ab85..558006d2b695 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -3239,8 +3239,8 @@ define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3477,57 +3477,57 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v5
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v3, v3
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_minimumnum_v3f16:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v3, v3
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v2
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_minimumnum_v3f16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v3f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX950-SDAG-LABEL: v_minimumnum_v3f16:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v3, v3
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v2
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_minimumnum_v3f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v3f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
@@ -3537,8 +3537,8 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
@@ -3547,11 +3547,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX11-SDAG-LABEL: v_minimumnum_v3f16:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX11-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3560,10 +3560,10 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3575,11 +3575,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v0, v0, v2
 ; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v1, v1, v3
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3592,10 +3592,10 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v2
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3659,12 +3659,19 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v3f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3764,16 +3771,16 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v2, v2
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v6
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v5
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v5, v7
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v2, v5
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -3790,16 +3797,16 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v2
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v4f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v4f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_minimumnum_v4f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -3813,6 +3820,18 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v2
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_minimumnum_v4f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_minimumnum_v4f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3828,8 +3847,8 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
@@ -3851,10 +3870,10 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3883,10 +3902,10 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v2
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4093,22 +4112,22 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v4, v4
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v6, v6, v9
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v6, v6, v7
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v3, v7, v10
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v4, v4
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v3, v3, v7
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v8, v11
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v7
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
@@ -4129,19 +4148,19 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_min_f16 v2, v2, v3
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v6f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v2, v2, v5
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v6f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v2, v2, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_minimumnum_v6f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4158,6 +4177,21 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_min_f16 v2, v2, v3
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_minimumnum_v6f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v2, v2, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_minimumnum_v6f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4176,10 +4210,10 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
@@ -4205,15 +4239,14 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v2, v2, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4244,15 +4277,14 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v2, v2, v5
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x half> @llvm.minimumnum.v6f16(<6 x half> %x, <6 x half> %y)
@@ -4379,28 +4411,28 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v4, v4
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v4, v4
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v6, v6
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v15, v7, v7
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v8, v8, v12
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v8, v8, v9
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v9, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v9
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v5, v10, v14
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v6, v6
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v5, v5, v9
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v6, v11, v15
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v7, v7
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v6, v6, v9
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -4425,22 +4457,22 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_min_f16 v3, v3, v4
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v8f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v2, v2, v6
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v3, v3, v7
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v8f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v6, v6
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v7, v7
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v3, v3, v4
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_minimumnum_v8f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4460,6 +4492,24 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_min_f16 v3, v3, v4
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_minimumnum_v8f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v5, v5
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v6, v6
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v2, v2, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v7, v7
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v3, v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_minimumnum_v8f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4481,12 +4531,12 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v5
@@ -4516,18 +4566,17 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v2, v2, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4561,18 +4610,17 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v2, v2, v6
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %x, <8 x half> %y)
@@ -4803,52 +4851,52 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v16, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v8, v8
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v8, v8
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v8, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v16, v16, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v9, v9
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v9, v9
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v9, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v8, v8, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v10, v10
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v10, v10
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v10, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v16, v16, v19
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v9, v9, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v11, v11
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v11, v11
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v11, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v8, v17, v8
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v4, v4
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v10, v10, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v12, v12
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v4, v4
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v12, v12
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v12, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v9, v18, v9
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v11, v11, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v13, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v13, v13
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v13, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v10, v19, v10
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v6, v6
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v11, v17, v11
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v7, v7
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v12, v18, v12
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v14, v14
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v12, v12, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v15, v15
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v18, v19, v18
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v6, v6
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v14, v14
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v13, v13, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v13, v17, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v7, v7
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v15, v15
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v14, v14, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v16, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v8, v1
@@ -4856,8 +4904,8 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v11, v4
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v12, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v7, v13, v7
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v13, v6
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v7, v14, v7
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_minimumnum_v16f16:
@@ -4889,34 +4937,34 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_min_f16 v7, v7, v8
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v16f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v9
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v2, v2, v10
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v3, v3, v11
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v4, v4, v12
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v5, v5, v13
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v6, v6, v14
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v7, v7, v15
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v16f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v9, v9
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v10, v10
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v2, v2, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v3, v3, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v12, v12
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v4, v4, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v13, v13
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v5, v5, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v14, v14
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v6, v6, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v15, v15
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v7, v7, v8
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_minimumnum_v16f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4948,6 +4996,36 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_min_f16 v7, v7, v8
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_minimumnum_v16f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v9, v9
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v10, v10
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v2, v2, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v3, v3, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v12, v12
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v4, v4, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v13, v13
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v5, v5, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v14, v14
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v6, v6, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v15, v15
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v7, v7, v8
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_minimumnum_v16f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4981,29 +5059,29 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v2, v2, v10
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v3, v3, v11
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v4, v4, v12
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v5, v5, v13
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v6, v6, v14
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v7, v7, v15
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v12, v12
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v13, v13
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v14, v14
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v15, v15
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v3, v3, v8
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v4, v4, v9
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v5, v5, v10
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v6, v6, v11
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v7, v7, v12
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_minimumnum_v16f16:
@@ -5039,29 +5117,29 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v2, v2, v10
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v3, v3, v11
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v4, v4, v12
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v6, v6, v14
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v7, v7, v15
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v12, v12
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v13, v13
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v14, v14
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v15, v15
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v3, v3, v8
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v4, v4, v9
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v5, v5, v10
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v6, v6, v11
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v7, v7, v12
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_minimumnum_v16f16:
@@ -5105,29 +5183,29 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v8
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v11, v11
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v2, v2, v10
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v3, v3, v11
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v4, v4, v12
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v6, v6, v14
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v7, v7, v15
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v12, v12
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v13, v13
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v14, v14
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v15, v15
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v3, v3, v8
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v4, v4, v9
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v5, v5, v10
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v6, v6, v11
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v7, v7, v12
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %x, <16 x half> %y)
   ret <16 x half> %result
@@ -5999,34 +6077,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v16
 ; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v17
 ; GFX950-GISEL-NEXT:    v_pk_min_f16 v2, v2, v18
@@ -6110,34 +6188,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v16
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v17
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v2, v2, v18
@@ -6221,34 +6299,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v16
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v17
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v2, v2, v18
@@ -6341,34 +6419,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v16, v16, v16
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v17, v17, v17
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v18, v18, v18
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v19, v19, v19
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v20, v20, v20
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v21, v21, v21
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v22, v22, v22
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v23, v23, v23
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v24, v24, v24
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v25, v25, v25
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v26, v26, v26
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v27, v27, v27
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v28, v28, v28
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v29, v29, v29
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v30, v30, v30
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v16
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v17
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v2, v2, v18
@@ -6409,11 +6487,11 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_minimumnum_v2f32:
@@ -6431,11 +6509,11 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_minimumnum_v2f32:
@@ -6449,29 +6527,16 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_minimumnum_v2f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_minimumnum_v2f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_minimumnum_v2f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v2f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6488,8 +6553,8 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
@@ -6609,14 +6674,14 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_minimumnum_v3f32:
@@ -6637,14 +6702,14 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_minimumnum_v3f32:
@@ -6661,40 +6726,19 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_minimumnum_v3f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_minimumnum_v3f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, v3
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, v4
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v5
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v6, v6
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v7, v7
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_minimumnum_v3f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v3f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6714,10 +6758,10 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
@@ -6738,10 +6782,10 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX11-GISEL-LABEL: v_minimumnum_v3f32:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v5, v5, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_dual_min_f32 v0, v0, v3 :: v_dual_min_f32 v1, v1, v4
 ; GFX11-GISEL-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -6768,10 +6812,10 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v5, v5, v5
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_dual_min_num_f32 v0, v0, v3 :: v_dual_min_num_f32 v1, v1, v4
 ; GFX12-GISEL-NEXT:    v_min_num_f32_e32 v2, v2, v5
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -6855,17 +6899,17 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v5
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v7
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_minimumnum_v4f32:
@@ -6889,17 +6933,17 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v5
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v7
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_minimumnum_v4f32:
@@ -6919,43 +6963,22 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_minimumnum_v4f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_minimumnum_v4f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_minimumnum_v4f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v4f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6978,12 +7001,12 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v1, v1, v5
@@ -7007,10 +7030,10 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v1, v1, v5
 ; GFX11-GISEL-NEXT:    v_dual_min_f32 v2, v2, v6 :: v_dual_min_f32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7039,10 +7062,10 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_dual_min_num_f32 v0, v0, v4 :: v_dual_min_num_f32 v1, v1, v5
 ; GFX12-GISEL-NEXT:    v_dual_min_num_f32 v2, v2, v6 :: v_dual_min_num_f32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7190,11 +7213,11 @@ define <2 x double> @v_minimumnum_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v2f64:
@@ -7431,14 +7454,14 @@ define <3 x double> @v_minimumnum_v3f64(<3 x double> %x, <3 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[8:9], v[8:9]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[10:11], v[10:11]
+; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v3f64:
@@ -7720,17 +7743,17 @@ define <4 x double> @v_minimumnum_v4f64(<4 x double> %x, <4 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX950-GISEL-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX950-GISEL-NEXT:    v_max_f64 v[14:15], v[14:15], v[14:15]
 ; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX950-GISEL-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[12:13], v[12:13]
+; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[14:15], v[14:15]
+; GFX950-GISEL-NEXT:    v_min_f64 v[6:7], v[6:7], v[8:9]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v4f64:
@@ -7916,10 +7939,10 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX7-SDAG-LABEL: v_minimumnum_f16_no_ieee:
 ; GFX7-SDAG:       ; %bb.0:
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7932,89 +7955,35 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_minimumnum_f16_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimumnum_f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_minimumnum_f16_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_no_ieee:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX11-TRUE16-SDAG:       ; %bb.0:
-; GFX11-TRUE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-SDAG-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
-; GFX11-TRUE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX11-TRUE16-GISEL:       ; %bb.0:
-; GFX11-TRUE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-GISEL-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
-; GFX11-TRUE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX11-FAKE16-SDAG:       ; %bb.0:
-; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-FAKE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX11-FAKE16-GISEL:       ; %bb.0:
-; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-FAKE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-FAKE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_no_ieee:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-SDAG-LABEL: v_minimumnum_f16_no_ieee:
 ; GFX12-TRUE16-SDAG:       ; %bb.0:
@@ -8145,85 +8114,35 @@ define half @v_minimumnum_f16_nan_no_ieee(half %x, half %y) #0 {
 }
 
 define float @v_minimumnum_f32_no_ieee(float %x, float %y) #0 {
-; GFX7-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f32_no_ieee:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_minimumnum_f32_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimumnum_f32_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_minimumnum_f32_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_minimumnum_f32_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_minimumnum_f32_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8297,87 +8216,35 @@ define float @v_minimumnum_f32_nnan_no_ieee(float %x, float %y) #0 {
 }
 
 define double @v_minimumnum_f64_no_ieee(double %x, double %y) #0 {
-; GFX7-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX7-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f64_no_ieee:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX7-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_minimumnum_f64_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimumnum_f64_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_minimumnum_f64_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX10-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX10-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_minimumnum_f64_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_minimumnum_f64_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8456,14 +8323,14 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX7-SDAG:       ; %bb.0:
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -8484,11 +8351,7 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX8-SDAG:       ; %bb.0:
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -8496,82 +8359,28 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX950-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8734,12 +8543,19 @@ define <3 x half> @v_minimumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:

From 72fb8ae541dcb6d4ab24283bd91a1fc64a9b7e3b Mon Sep 17 00:00:00 2001
From: Daniil Kovalev <dkovalev@accesssoftek.com>
Date: Tue, 17 Jun 2025 15:29:37 +0000
Subject: [PATCH 0657/1322] [lld][test][PAC] Do not rely on concrete offsets in
 LTO tests (#143358)

When changing codegen (e.g. in #130809), offsets in binaries produced by
LTO tests might change. We do not need to match concrete offset values,
it's enough to ensure that hex values in particular places are
identical.

---------

Co-authored-by: Anatoly Trosinenko <atrosinenko@accesssoftek.com>
---
 lld/test/ELF/lto/aarch64-pac-got-func.ll | 50 ++++++++++++------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/lld/test/ELF/lto/aarch64-pac-got-func.ll b/lld/test/ELF/lto/aarch64-pac-got-func.ll
index a37c67a2f3ba..0baa3559a6f9 100644
--- a/lld/test/ELF/lto/aarch64-pac-got-func.ll
+++ b/lld/test/ELF/lto/aarch64-pac-got-func.ll
@@ -5,29 +5,29 @@
 ; RUN: llvm-readelf -r -x.got %t | FileCheck %s
 
 ; CHECK:      Relocation section '.rela.dyn' at offset 0x3d0 contains 8 entries:
-; CHECK-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
-; CHECK-NEXT: 00000000000206a0  0000000100000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 func_undef + 0
-; CHECK-NEXT: 00000000000206a8  0000000200000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g1 + 0
-; CHECK-NEXT: 00000000000206b0  0000000300000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g2 + 0
-; CHECK-NEXT: 00000000000206b8  0000000400000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g3 + 0
-; CHECK-NEXT: 00000000000206c0  0000000500000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g4 + 0
-; CHECK-NEXT: 00000000000206c8  0000000600000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 var_undef + 0
-; CHECK-NEXT: 0000000000020690  0000000700000412 R_AARCH64_AUTH_GLOB_DAT 0000000000010490 func + 0
-; CHECK-NEXT: 0000000000020698  0000000a00000412 R_AARCH64_AUTH_GLOB_DAT 00000000000306d0 var + 0
+; CHECK-NEXT:     Offset                Info             Type               Symbol's Value  Symbol's Name + Addend
+; CHECK-NEXT: [[#%x,ADDR:]]        0000000100000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 func_undef + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x8]]  0000000200000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g1 + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x10]] 0000000300000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g2 + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x18]] 0000000400000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g3 + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x20]] 0000000500000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g4 + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x28]] 0000000600000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 var_undef + 0
+; CHECK-NEXT: {{0*}}[[#ADDR-0x10]] 0000000700000412 R_AARCH64_AUTH_GLOB_DAT 0000000000010800 func + 0
+; CHECK-NEXT: {{0*}}[[#ADDR-0x8]]  0000000a00000412 R_AARCH64_AUTH_GLOB_DAT 0000000000031400 var + 0
 
 ; CHECK:      Hex dump of section '.got':
-; CHECK-NEXT: 0x00020690 00000000 00000080 00000000 000000a0
-;;                                      ^^ func: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA
-;;                                                        ^^ var: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-; CHECK-NEXT: 0x000206a0 00000000 00000080 00000000 000000a0
-;;                                      ^^ func_undef: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA
-;;                                                        ^^ g1: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-; CHECK-NEXT: 0x000206b0 00000000 000000a0 00000000 000000a0
-;;                                      ^^ g2: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-;;                                                        ^^ g3: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-; CHECK-NEXT: 0x000206c0 00000000 000000a0 00000000 000000a0
-;;                                      ^^ g4: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-;;                                                        ^^ var_undef: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+; CHECK-NEXT: 0x{{0*}}[[#ADDR-0x10]] 00000000 00000080 00000000 000000a0
+;;                                                  ^^ func: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA
+;;                                                                    ^^ var: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+; CHECK-NEXT: 0x{{0*}}[[#ADDR]]      00000000 00000080 00000000 000000a0
+;;                                                  ^^ func_undef: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA
+;;                                                                    ^^ g1: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+; CHECK-NEXT: 0x{{0*}}[[#ADDR+0x10]] 00000000 000000a0 00000000 000000a0
+;;                                                  ^^ g2: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+;;                                                                    ^^ g3: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+; CHECK-NEXT: 0x{{0*}}[[#ADDR+0x20]] 00000000 000000a0 00000000 000000a0
+;;                                                  ^^ g4: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+;;                                                                    ^^ var_undef: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
@@ -37,16 +37,16 @@ target triple = "aarch64-unknown-linux-gnu"
 @g3 = external global ptr
 @g4 = external global ptr
 
-define void @func() {
+define void @func() align 1024 {
 entry:
   ret void
 }
 declare void @func_undef()
 
-@var = global i32 42
+@var = global i32 42, align 1024
 @var_undef = external global i32
 
-define void @bar() #0 {
+define void @bar() #0 align 1024 {
 entry:
   store ptr ptrauth (ptr @func, i32 0), ptr @g1
   store ptr ptrauth (ptr @func_undef, i32 0), ptr @g2
@@ -55,7 +55,7 @@ entry:
   ret void
 }
 
-define void @_start() {
+define void @_start() align 1024 {
 entry:
   ret void
 }

From 4ced29b8482e3537da7d27d410bf7947b0666b4c Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Tue, 17 Jun 2025 16:33:24 +0100
Subject: [PATCH 0658/1322] [lldb][Expression] Don't create Objective-C IR
 checker for pure-C++ targets/frames (#144503)

There's no need to create this utility function (and run it) for
targets/frames that aren't Objective-C/Objective-C++.
---
 .../Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp   | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
index 6ef5d3f5be6d..be17c5421fc5 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
@@ -18,6 +18,7 @@
 
 #include "lldb/Expression/UtilityFunction.h"
 #include "lldb/Target/ExecutionContext.h"
+#include "lldb/Target/Language.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Target/StackFrame.h"
 #include "lldb/Target/Target.h"
@@ -46,7 +47,13 @@ ClangDynamicCheckerFunctions::Install(DiagnosticManager &diagnostic_manager,
     ObjCLanguageRuntime *objc_language_runtime =
         ObjCLanguageRuntime::Get(*process);
 
-    if (objc_language_runtime) {
+    SourceLanguage lang = process->GetTarget().GetLanguage();
+    if (!lang)
+      if (auto *frame = exe_ctx.GetFramePtr())
+        lang = frame->GetLanguage();
+
+    if (objc_language_runtime &&
+        Language::LanguageIsObjC(lang.AsLanguageType())) {
       Expected<std::unique_ptr<UtilityFunction>> checker_fn =
           objc_language_runtime->CreateObjectChecker(VALID_OBJC_OBJECT_CHECK_NAME, exe_ctx);
       if (!checker_fn)

From a5f5f1209aa122ee295ae0dc0f1ee594ad988ecd Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Tue, 17 Jun 2025 16:46:44 +0100
Subject: [PATCH 0659/1322] [AMDGPU] Use subtarget feature for v_lshl_add_u64
 pattern. NFC. (#144544)

Following on from #133723, use the new subtarget feature for the
selection pattern as well as for the instruction definition.
---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index a005e0245b8f..f372101cb7b7 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -806,7 +806,7 @@ def : GCNPat<
  (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
  (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
 
-let SubtargetPredicate = isGFX940Plus in
+let SubtargetPredicate = HasLshlAddU64Inst in
 def : GCNPat<
   (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
   (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)

From cd4e3843395329538feb1c29cd582471b482caf7 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Tue, 17 Jun 2025 08:37:15 -0700
Subject: [PATCH 0660/1322] [flang][test] Removed temporary workaround for
 buildbots.

---
 flang/test/Semantics/modfile75.F90 | 2 --
 1 file changed, 2 deletions(-)

diff --git a/flang/test/Semantics/modfile75.F90 b/flang/test/Semantics/modfile75.F90
index a61c59bbb31b..8f7adafe7204 100644
--- a/flang/test/Semantics/modfile75.F90
+++ b/flang/test/Semantics/modfile75.F90
@@ -1,6 +1,4 @@
 !RUN: rm -rf %t && mkdir -p %t
-! The next line is a temporary clean-up for the buildbots to pass.
-!RUN: rm -f modfile75a.mod modfile75b.mod
 !RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang_fc1 -fdebug-unparse -J%t %s | FileCheck %s
 
 #if WHICH == 1

From cf637b7e3554976419a0d672ad4c252137dc34f3 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Tue, 17 Jun 2025 16:57:32 +0100
Subject: [PATCH 0661/1322] [flang][OpenMP] Fix goto within SECTION (#144502)

Previously we didn't push any context for SECTION and they are not
modelled with differing scopes and so goto detection couldn't tell that
GOTOs between two SECTIONs were between constructs rather than just
staying inside of the parent SECTIONS construct.

Fixes #143231
---
 flang/lib/Semantics/resolve-directives.cpp     | 18 ++++++++++++++++--
 .../Semantics/OpenMP/parallel-sections01.f90   |  2 ++
 flang/test/Semantics/OpenMP/sections-goto.f90  | 11 +++++++++++
 flang/test/Semantics/OpenMP/sections02.f90     |  2 ++
 4 files changed, 31 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Semantics/OpenMP/sections-goto.f90

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index b5f8667fe36f..282660684e78 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -384,6 +384,9 @@ public:
   bool Pre(const parser::OpenMPSectionsConstruct &);
   void Post(const parser::OpenMPSectionsConstruct &) { PopContext(); }
 
+  bool Pre(const parser::OpenMPSectionConstruct &);
+  void Post(const parser::OpenMPSectionConstruct &) { PopContext(); }
+
   bool Pre(const parser::OpenMPCriticalConstruct &critical);
   void Post(const parser::OpenMPCriticalConstruct &) { PopContext(); }
 
@@ -2003,6 +2006,12 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionsConstruct &x) {
   return true;
 }
 
+bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionConstruct &x) {
+  PushContext(x.source, llvm::omp::Directive::OMPD_section);
+  GetContext().withinConstruct = true;
+  return true;
+}
+
 bool OmpAttributeVisitor::Pre(const parser::OpenMPCriticalConstruct &x) {
   const auto &beginCriticalDir{std::get<parser::OmpCriticalDirective>(x.t)};
   const auto &endCriticalDir{std::get<parser::OmpEndCriticalDirective>(x.t)};
@@ -3024,8 +3033,13 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
     const parser::CharBlock target, std::optional<DirContext> sourceContext,
     std::optional<DirContext> targetContext) {
   auto dirContextsSame = [](DirContext &lhs, DirContext &rhs) -> bool {
-    // Sometimes nested constructs share a scope but are different contexts
-    return (lhs.scope == rhs.scope) && (lhs.directive == rhs.directive);
+    // Sometimes nested constructs share a scope but are different contexts.
+    // The directiveSource comparison is for OmpSection. Sections do not have
+    // their own scopes and two different sections both have the same directive.
+    // Their source however is different. This string comparison is unfortunate
+    // but should only happen for GOTOs inside of SECTION.
+    return (lhs.scope == rhs.scope) && (lhs.directive == rhs.directive) &&
+        (lhs.directiveSource == rhs.directiveSource);
   };
   unsigned version{context_.langOptions().OpenMPVersion};
   if (targetContext &&
diff --git a/flang/test/Semantics/OpenMP/parallel-sections01.f90 b/flang/test/Semantics/OpenMP/parallel-sections01.f90
index 6c5a053bf49c..19448258af76 100644
--- a/flang/test/Semantics/OpenMP/parallel-sections01.f90
+++ b/flang/test/Semantics/OpenMP/parallel-sections01.f90
@@ -35,6 +35,8 @@ program OmpConstructSections01
    !$omp section
    print *, "This is a single statement structured block"
    !$omp section
+   !ERROR: invalid branch into an OpenMP structured block
+   !ERROR: invalid branch leaving an OpenMP structured block
    open (10, file="random-file-name.txt", err=30)
    !ERROR: invalid branch into an OpenMP structured block
    !ERROR: invalid branch leaving an OpenMP structured block
diff --git a/flang/test/Semantics/OpenMP/sections-goto.f90 b/flang/test/Semantics/OpenMP/sections-goto.f90
new file mode 100644
index 000000000000..9fa9df9f50b9
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/sections-goto.f90
@@ -0,0 +1,11 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! Regression test for #143231
+
+!$omp sections
+! ERROR: invalid branch into an OpenMP structured block
+! ERROR: invalid branch leaving an OpenMP structured block
+goto 10
+!$omp section
+10 print *, "Invalid jump"
+!$omp end sections
+end
diff --git a/flang/test/Semantics/OpenMP/sections02.f90 b/flang/test/Semantics/OpenMP/sections02.f90
index ee29922a72c0..8144b491071d 100644
--- a/flang/test/Semantics/OpenMP/sections02.f90
+++ b/flang/test/Semantics/OpenMP/sections02.f90
@@ -19,6 +19,8 @@ program OmpConstructSections01
    !$omp section
    print *, "This is a single statement structured block"
    !$omp section
+   !ERROR: invalid branch into an OpenMP structured block
+   !ERROR: invalid branch leaving an OpenMP structured block
    open (10, file="random-file-name.txt", err=30)
    !ERROR: invalid branch into an OpenMP structured block
    !ERROR: invalid branch leaving an OpenMP structured block

From 0108a5908cab5e418c683ef9b6e1810755344b5e Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Tue, 17 Jun 2025 07:55:52 -0700
Subject: [PATCH 0662/1322] [SLP]Fix a crash on an subvector size calculation
 for non-power-of-2 vector

Patch fixes cost estimation for the extractelements from non-power-of-2
vectors, defined as subvector extracts. In this case the subvector size
might be not adjusted to a whole register size, need to get the minimum
between whole vector size and the actual difference to prevent compiler
crash.

Fixes #143513
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 16 ++--
 .../test/Transforms/PhaseOrdering/X86/hadd.ll | 25 +++---
 .../test/Transforms/PhaseOrdering/X86/hsub.ll | 25 +++---
 .../SystemZ/non-power-2-subvector-extract.ll  | 87 +++++++++++++++++++
 4 files changed, 119 insertions(+), 34 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d811e9d77d18..4551a365a696 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12085,7 +12085,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     // FIXME: this must be moved to TTI for better estimation.
     unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
     auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
-                                        SmallVectorImpl<unsigned> &Indices)
+                                        SmallVectorImpl<unsigned> &Indices,
+                                        SmallVectorImpl<unsigned> &SubVecSizes)
         -> std::optional<TTI::ShuffleKind> {
       if (NumElts <= EltsPerVector)
         return std::nullopt;
@@ -12130,7 +12131,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                       return std::min(S, I);
                     }),
                 EltsPerVector);
-            Indices.push_back(OffsetReg1 % NumElts);
+            unsigned Index = OffsetReg1 % NumElts;
+            Indices.push_back(Index);
+            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
           }
           Idx = I - OffsetReg1;
         }
@@ -12152,8 +12155,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
       copy(MaskSlice, SubMask.begin());
       SmallVector<unsigned, 2> Indices;
+      SmallVector<unsigned, 2> SubVecSizes;
       std::optional<TTI::ShuffleKind> RegShuffleKind =
-          CheckPerRegistersShuffle(SubMask, Indices);
+          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
       if (!RegShuffleKind) {
         if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
             !ShuffleVectorInst::isIdentityMask(
@@ -12171,12 +12175,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       }
       const unsigned BaseVF = getFullVectorNumberOfElements(
           *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
-      for (unsigned Idx : Indices) {
-        assert((Idx + EltsPerVector) <= BaseVF &&
+      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
+        assert((Idx + SubVecSize) <= BaseVF &&
                "SK_ExtractSubvector index out of range");
         Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                  getWidenedType(ScalarTy, BaseVF), {}, CostKind,
-                                 Idx, getWidenedType(ScalarTy, EltsPerVector));
+                                 Idx, getWidenedType(ScalarTy, SubVecSize));
       }
       // Second attempt to check, if just a permute is better estimated than
       // subvector extract.
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 28b48bd3ce6d..9bfd92ef35a4 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -272,24 +272,21 @@ define <16 x i16> @add_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
 
 define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @add_v16i16_0123u56789uBCDEF(
-; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
-; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE2-NEXT:    [[BCD:%.*]] = add i16 [[BC]], [[BD]]
 ; SSE2-NEXT:    [[BEF:%.*]] = add i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HADD8:%.*]] = add <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
-; SSE2-NEXT:    [[HADDD1:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 19, i32 20, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HADDE:%.*]] = insertelement <16 x i16> [[HADDD1]], i16 [[BCD]], i64 14
-; SSE2-NEXT:    [[HADDF:%.*]] = insertelement <16 x i16> [[HADDE]], i16 [[BEF]], i64 15
-; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDF]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:    [[HADD92:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HADDB:%.*]] = insertelement <16 x i16> [[HADD92]], i16 [[BEF]], i64 11
+; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP10]], [[TMP8]]
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDB]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v16i16_0123u56789uBCDEF(
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index 0062527b678c..13b4d7da97c9 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -272,24 +272,21 @@ define <16 x i16> @sub_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
 
 define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @sub_v16i16_0123u56789uBCDEF(
-; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
-; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE2-NEXT:    [[BCD:%.*]] = sub i16 [[BC]], [[BD]]
 ; SSE2-NEXT:    [[BEF:%.*]] = sub i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HSUB8:%.*]] = sub <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
-; SSE2-NEXT:    [[HSUBD1:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 19, i32 20, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HSUBE:%.*]] = insertelement <16 x i16> [[HSUBD1]], i16 [[BCD]], i64 14
-; SSE2-NEXT:    [[HSUBF:%.*]] = insertelement <16 x i16> [[HSUBE]], i16 [[BEF]], i64 15
-; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBF]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:    [[HSUB92:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HSUBB:%.*]] = insertelement <16 x i16> [[HSUB92]], i16 [[BEF]], i64 11
+; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP9:%.*]] = sub <16 x i16> [[TMP10]], [[TMP8]]
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBB]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v16i16_0123u56789uBCDEF(
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll
new file mode 100644
index 000000000000..6006bf9cb262
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-ibm-linux -mcpu=z13 -slp-max-reg-size=256 -slp-vectorize-hor-store -slp-vectorize-non-power-of-2 < %s | FileCheck %s
+
+@c = external global [1 x [10 x i32]]
+@j.0 = external global i32
+
+define void @p() {
+; CHECK-LABEL: define void @p(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> [[TMP0]], splat (i32 1)
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <7 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <7 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[OR_1_5_I_3:%.*]] = or i32 [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    store i32 [[OR_1_5_I_3]], ptr @j.0, align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <7 x i32> [[TMP4]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <7 x i32> [[TMP4]], splat (i32 1)
+; CHECK-NEXT:    store <7 x i32> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP2]]
+; CHECK-NEXT:    [[OR_1_5_I_5:%.*]] = or i32 [[TMP10]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
+; CHECK-NEXT:    [[OR_1_6_I_5:%.*]] = or i32 [[OR_1_5_I_5]], [[TMP11]]
+; CHECK-NEXT:    store i32 [[OR_1_6_I_5]], ptr @j.0, align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i32> [[TMP8]], splat (i32 1)
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx12.promoted.5.i = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+  %conv14.5.i = xor i32 %arrayidx12.promoted.5.i, 1
+  store i32 %conv14.5.i, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+  %arrayidx12.promoted.5.i.1 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 204), align 4
+  %conv14.5.i.1 = xor i32 %arrayidx12.promoted.5.i.1, 1
+  store i32 %conv14.5.i.1, ptr getelementptr inbounds nuw (i8, ptr @c, i64 204), align 4
+  %arrayidx12.promoted.5.i.2 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 208), align 4
+  %conv14.5.i.2 = xor i32 %arrayidx12.promoted.5.i.2, 1
+  store i32 %conv14.5.i.2, ptr getelementptr inbounds nuw (i8, ptr @c, i64 208), align 4
+  %arrayidx12.promoted.1.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+  %conv14.1.i.3 = xor i32 %arrayidx12.promoted.1.i.3, 1
+  store i32 %conv14.1.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+  %arrayidx12.promoted.5.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 212), align 4
+  %conv14.5.i.3 = xor i32 %arrayidx12.promoted.5.i.3, 1
+  store i32 %conv14.5.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 212), align 4
+  %arrayidx12.promoted.6.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+  %conv14.6.i.3 = xor i32 %arrayidx12.promoted.6.i.3, 1
+  %or.1.5.i.3 = or i32 %arrayidx12.promoted.1.i.3, %arrayidx12.promoted.5.i.3
+  store i32 %conv14.6.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+  store i32 %or.1.5.i.3, ptr @j.0, align 4
+  %arrayidx12.promoted.1.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 56), align 4
+  %conv14.1.i.4 = xor i32 %arrayidx12.promoted.1.i.4, 1
+  store i32 %conv14.1.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 56), align 4
+  %arrayidx12.promoted.5.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 216), align 4
+  %conv14.5.i.4 = xor i32 %arrayidx12.promoted.5.i.4, 1
+  store i32 %conv14.5.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 216), align 4
+  %arrayidx12.promoted.6.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 256), align 4
+  %conv14.6.i.4 = xor i32 %arrayidx12.promoted.6.i.4, 1
+  store i32 %conv14.6.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 256), align 4
+  %arrayidx12.promoted.1.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 60), align 4
+  %conv14.1.i.5 = xor i32 %arrayidx12.promoted.1.i.5, 1
+  store i32 %conv14.1.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 60), align 4
+  %arrayidx12.promoted.5.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 220), align 4
+  %conv14.5.i.5 = xor i32 %arrayidx12.promoted.5.i.5, 1
+  store i32 %conv14.5.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 220), align 4
+  %arrayidx12.promoted.6.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 260), align 4
+  %conv14.6.i.5 = xor i32 %arrayidx12.promoted.6.i.5, 1
+  %0 = or i32 %arrayidx12.promoted.6.i.4, %arrayidx12.promoted.1.i.5
+  %or.1.5.i.5 = or i32 %0, %arrayidx12.promoted.5.i.5
+  %or.1.6.i.5 = or i32 %or.1.5.i.5, %arrayidx12.promoted.6.i.5
+  store i32 %conv14.6.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 260), align 4
+  store i32 %or.1.6.i.5, ptr @j.0, align 4
+  %arrayidx12.promoted.1.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 64), align 4
+  %conv14.1.i.6 = xor i32 %arrayidx12.promoted.1.i.6, 1
+  store i32 %conv14.1.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 64), align 4
+  %arrayidx12.promoted.5.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 224), align 4
+  %conv14.5.i.6 = xor i32 %arrayidx12.promoted.5.i.6, 1
+  store i32 %conv14.5.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 224), align 4
+  %arrayidx12.promoted.6.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 264), align 4
+  %conv14.6.i.6 = xor i32 %arrayidx12.promoted.6.i.6, 1
+  store i32 %conv14.6.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 264), align 4
+  ret void
+}

From 00139f10c3cd4118de7148635c820bb42843287a Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Tue, 17 Jun 2025 09:00:18 -0700
Subject: [PATCH 0663/1322] [NVPTX] Cleanup ld/st lowering (#143936)

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp  | 458 +++++++------------
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h    |   3 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td      |   4 -
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td     |  90 ++--
 llvm/test/CodeGen/NVPTX/bug26185-2.ll        |  22 +-
 llvm/test/CodeGen/NVPTX/bug26185.ll          |  73 ++-
 llvm/test/CodeGen/NVPTX/i1-ext-load.ll       |   4 +-
 llvm/test/CodeGen/NVPTX/ldu-ldg.ll           |   8 +-
 llvm/test/CodeGen/NVPTX/variadics-backend.ll |  19 +-
 9 files changed, 308 insertions(+), 373 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 79b1bfbc8072..ff10eea37104 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -136,7 +136,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     break;
   case NVPTXISD::LDUV2:
   case NVPTXISD::LDUV4:
-    if (tryLDGLDU(N))
+    if (tryLDU(N))
       return;
     break;
   case NVPTXISD::StoreV2:
@@ -324,7 +324,7 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
   case Intrinsic::nvvm_ldu_global_f:
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_p:
-    return tryLDGLDU(N);
+    return tryLDU(N);
 
   case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
   case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
@@ -1048,35 +1048,28 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   assert(LD->readMem() && "Expected load");
 
   // do not support pre/post inc/dec
-  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
+  const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
   if (PlainLoad && PlainLoad->isIndexed())
     return false;
 
-  EVT LoadedVT = LD->getMemoryVT();
-  if (!LoadedVT.isSimple())
+  const EVT LoadedEVT = LD->getMemoryVT();
+  if (!LoadedEVT.isSimple())
     return false;
+  const MVT LoadedVT = LoadedEVT.getSimpleVT();
 
   // Address Space Setting
   const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
   if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
-    return tryLDGLDU(N);
+    return tryLDG(LD);
 
-  SDLoc DL(N);
+  SDLoc DL(LD);
   SDValue Chain = N->getOperand(0);
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
 
-  // Type Setting: fromType + fromTypeWidth
-  //
-  // Sign   : ISD::SEXTLOAD
-  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
-  //          type is integer
-  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
-  MVT SimpleVT = LoadedVT.getSimpleVT();
-  // Read at least 8 bits (predicates are stored as 8-bit values)
-  unsigned FromTypeWidth = std::max(8U, (unsigned)SimpleVT.getSizeInBits());
+  const unsigned FromTypeWidth = LoadedVT.getSizeInBits();
 
   // Vector Setting
-  unsigned int FromType =
+  const unsigned FromType =
       (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
           ? NVPTX::PTXLdStInstCode::Signed
           : NVPTX::PTXLdStInstCode::Untyped;
@@ -1102,29 +1095,17 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   if (!Opcode)
     return false;
 
-  SDNode *NVPTXLD =
-      CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops);
+  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
   if (!NVPTXLD)
     return false;
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
+  MachineMemOperand *MemRef = LD->getMemOperand();
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
 
-  ReplaceNode(N, NVPTXLD);
+  ReplaceNode(LD, NVPTXLD);
   return true;
 }
 
-static bool isSubVectorPackedInI32(EVT EltVT) {
-  // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
-  // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
-  // vectorized loads/stores with the actual element type for i8/i16 as that
-  // would require v8/v16 variants that do not exist.
-  // In order to load/store such vectors efficiently, in Type Legalization
-  // we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
-  // lower to PTX as vectors of b32.
-  return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
-}
-
 static unsigned getLoadStoreVectorNumElts(SDNode *N) {
   switch (N->getOpcode()) {
   case NVPTXISD::LoadV2:
@@ -1142,21 +1123,21 @@ static unsigned getLoadStoreVectorNumElts(SDNode *N) {
 }
 
 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
-  MemSDNode *MemSD = cast<MemSDNode>(N);
-  const EVT MemEVT = MemSD->getMemoryVT();
+  MemSDNode *LD = cast<MemSDNode>(N);
+  const EVT MemEVT = LD->getMemoryVT();
   if (!MemEVT.isSimple())
     return false;
   const MVT MemVT = MemEVT.getSimpleVT();
 
   // Address Space Setting
-  const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
-  if (canLowerToLDG(*MemSD, *Subtarget, CodeAddrSpace))
-    return tryLDGLDU(N);
+  const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
+  if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
+    return tryLDG(LD);
 
-  EVT EltVT = N->getValueType(0);
-  SDLoc DL(N);
-  SDValue Chain = N->getOperand(0);
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
+  const MVT EltVT = LD->getSimpleValueType(0);
+  SDLoc DL(LD);
+  SDValue Chain = LD->getChain();
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
 
   // Type Setting: fromType + fromTypeWidth
   //
@@ -1167,18 +1148,15 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   // Read at least 8 bits (predicates are stored as 8-bit values)
   // The last operand holds the original LoadSDNode::getExtensionType() value
   const unsigned TotalWidth = MemVT.getSizeInBits();
-  unsigned ExtensionType = N->getConstantOperandVal(N->getNumOperands() - 1);
-  unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
-                          ? NVPTX::PTXLdStInstCode::Signed
-                          : NVPTX::PTXLdStInstCode::Untyped;
+  const unsigned ExtensionType =
+      N->getConstantOperandVal(N->getNumOperands() - 1);
+  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
+                                ? NVPTX::PTXLdStInstCode::Signed
+                                : NVPTX::PTXLdStInstCode::Untyped;
 
-  unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
-
-  if (isSubVectorPackedInI32(EltVT)) {
-    assert(ExtensionType == ISD::NON_EXTLOAD);
-    EltVT = MVT::i32;
-  }
+  const unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
 
+  assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
   assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
          FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
 
@@ -1196,192 +1174,183 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   std::optional<unsigned> Opcode;
   switch (N->getOpcode()) {
   default:
-    return false;
+    llvm_unreachable("Unexpected opcode");
   case NVPTXISD::LoadV2:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2,
-                             NVPTX::LDV_i16_v2, NVPTX::LDV_i32_v2,
-                             NVPTX::LDV_i64_v2);
+    Opcode =
+        pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v2, NVPTX::LDV_i16_v2,
+                        NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
     break;
   case NVPTXISD::LoadV4:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4,
-                             NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4,
-                             NVPTX::LDV_i64_v4);
+    Opcode =
+        pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v4, NVPTX::LDV_i16_v4,
+                        NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
     break;
   case NVPTXISD::LoadV8:
-    Opcode =
-        pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
-                        {/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */});
+    Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i8 */}, {/* no v8i16 */},
+                             NVPTX::LDV_i32_v8, {/* no v8i64 */});
     break;
   }
   if (!Opcode)
     return false;
 
-  SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
+  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
-  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
+  MachineMemOperand *MemRef = LD->getMemOperand();
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
 
-  ReplaceNode(N, LD);
+  ReplaceNode(LD, NVPTXLD);
   return true;
 }
 
-bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
-  auto *Mem = cast<MemSDNode>(N);
+bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
+  const EVT LoadedEVT = LD->getMemoryVT();
+  if (!LoadedEVT.isSimple())
+    return false;
+  const MVT LoadedVT = LoadedEVT.getSimpleVT();
 
-  // If this is an LDG intrinsic, the address is the third operand. If its an
-  // LDG/LDU SD node (from custom vector handling), then its the second operand
-  SDValue Op1 = N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
+  SDLoc DL(LD);
 
-  const EVT OrigType = N->getValueType(0);
-  EVT EltVT = Mem->getMemoryVT();
-  unsigned NumElts = 1;
-
-  if (EltVT == MVT::i128 || EltVT == MVT::f128) {
-    EltVT = MVT::i64;
-    NumElts = 2;
-  }
-  if (EltVT.isVector()) {
-    NumElts = EltVT.getVectorNumElements();
-    EltVT = EltVT.getVectorElementType();
-    // vectors of 8/16bits type are loaded/stored as multiples of v4i8/v2x16
-    // elements.
-    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
-        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
-        (EltVT == MVT::i16 && OrigType == MVT::v2i16) ||
-        (EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
-      assert(NumElts % OrigType.getVectorNumElements() == 0 &&
-             "NumElts must be divisible by the number of elts in subvectors");
-      EltVT = OrigType;
-      NumElts /= OrigType.getVectorNumElements();
-    }
+  const unsigned TotalWidth = LoadedVT.getSizeInBits();
+  unsigned ExtensionType;
+  unsigned NumElts;
+  if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
+    ExtensionType = Load->getExtensionType();
+    NumElts = 1;
+  } else {
+    ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
+    NumElts = getLoadStoreVectorNumElts(LD);
   }
+  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
+                                ? NVPTX::PTXLdStInstCode::Signed
+                                : NVPTX::PTXLdStInstCode::Untyped;
 
-  // Build the "promoted" result VTList for the load. If we are really loading
-  // i8s, then the return type will be promoted to i16 since we do not expose
-  // 8-bit registers in NVPTX.
-  const EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
-  SmallVector<EVT, 5> InstVTs;
-  InstVTs.append(NumElts, NodeVT);
-  InstVTs.push_back(MVT::Other);
-  SDVTList InstVTList = CurDAG->getVTList(InstVTs);
-  SDValue Chain = N->getOperand(0);
+  const unsigned FromTypeWidth = TotalWidth / NumElts;
+
+  assert(!(LD->getSimpleValueType(0).isVector() &&
+           ExtensionType != ISD::NON_EXTLOAD));
+  assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
+         FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
 
   SDValue Base, Offset;
-  SelectADDR(Op1, Base, Offset);
-  SDValue Ops[] = {Base, Offset, Chain};
+  SelectADDR(LD->getOperand(1), Base, Offset);
+  SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base,
+                   Offset, LD->getChain()};
+
+  const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
+  std::optional<unsigned> Opcode;
+  switch (LD->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case ISD::LOAD:
+    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i8,
+                             NVPTX::LD_GLOBAL_NC_i16, NVPTX::LD_GLOBAL_NC_i32,
+                             NVPTX::LD_GLOBAL_NC_i64);
+    break;
+  case NVPTXISD::LoadV2:
+    Opcode = pickOpcodeForVT(
+        TargetVT, NVPTX::LD_GLOBAL_NC_v2i8, NVPTX::LD_GLOBAL_NC_v2i16,
+        NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
+    break;
+  case NVPTXISD::LoadV4:
+    Opcode = pickOpcodeForVT(
+        TargetVT, NVPTX::LD_GLOBAL_NC_v4i8, NVPTX::LD_GLOBAL_NC_v4i16,
+        NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
+    break;
+  case NVPTXISD::LoadV8:
+    Opcode = pickOpcodeForVT(TargetVT, {/* no v8i8 */}, {/* no v8i16 */},
+                             NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */});
+    break;
+  }
+  if (!Opcode)
+    return false;
+
+  SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
+
+  ReplaceNode(LD, NVPTXLDG);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
+  auto *LD = cast<MemSDNode>(N);
+
+  unsigned NumElts;
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case ISD::INTRINSIC_W_CHAIN:
+    NumElts = 1;
+    break;
+  case NVPTXISD::LDUV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::LDUV4:
+    NumElts = 4;
+    break;
+  }
+
+  const MVT::SimpleValueType SelectVT =
+      MVT::getIntegerVT(LD->getMemoryVT().getSizeInBits() / NumElts).SimpleTy;
+
+  // If this is an LDU intrinsic, the address is the third operand. If its an
+  // LDU SD node (from custom vector handling), then its the second operand
+  SDValue Addr =
+      LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
+
+  SDValue Base, Offset;
+  SelectADDR(Addr, Base, Offset);
+  SDValue Ops[] = {Base, Offset, LD->getChain()};
 
   std::optional<unsigned> Opcode;
   switch (N->getOpcode()) {
   default:
-    return false;
-  case ISD::LOAD:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8,
-        NVPTX::INT_PTX_LDG_GLOBAL_i16, NVPTX::INT_PTX_LDG_GLOBAL_i32,
-        NVPTX::INT_PTX_LDG_GLOBAL_i64);
-    break;
+    llvm_unreachable("Unexpected opcode");
   case ISD::INTRINSIC_W_CHAIN:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8,
-        NVPTX::INT_PTX_LDU_GLOBAL_i16, NVPTX::INT_PTX_LDU_GLOBAL_i32,
-        NVPTX::INT_PTX_LDU_GLOBAL_i64);
-    break;
-  case NVPTXISD::LoadV2:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE,
-        NVPTX::INT_PTX_LDG_G_v2i16_ELE, NVPTX::INT_PTX_LDG_G_v2i32_ELE,
-        NVPTX::INT_PTX_LDG_G_v2i64_ELE);
+    Opcode =
+        pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_i8, NVPTX::LDU_GLOBAL_i16,
+                        NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
     break;
   case NVPTXISD::LDUV2:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE,
-        NVPTX::INT_PTX_LDU_G_v2i16_ELE, NVPTX::INT_PTX_LDU_G_v2i32_ELE,
-        NVPTX::INT_PTX_LDU_G_v2i64_ELE);
-    break;
-  case NVPTXISD::LoadV4:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE,
-        NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE,
-        NVPTX::INT_PTX_LDG_G_v4i64_ELE);
+    Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v2i8,
+                             NVPTX::LDU_GLOBAL_v2i16, NVPTX::LDU_GLOBAL_v2i32,
+                             NVPTX::LDU_GLOBAL_v2i64);
     break;
   case NVPTXISD::LDUV4:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                             NVPTX::INT_PTX_LDU_G_v4i8_ELE,
-                             NVPTX::INT_PTX_LDU_G_v4i16_ELE,
-                             NVPTX::INT_PTX_LDU_G_v4i32_ELE, {/* no v4i64 */});
-    break;
-  case NVPTXISD::LoadV8:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
-                             {/* no v8i16 */}, NVPTX::INT_PTX_LDG_G_v8i32_ELE,
-                             {/* no v8i64 */});
+    Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v4i8,
+                             NVPTX::LDU_GLOBAL_v4i16, NVPTX::LDU_GLOBAL_v4i32,
+                             {/* no v4i64 */});
     break;
   }
   if (!Opcode)
     return false;
 
   SDLoc DL(N);
-  SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
+  SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
 
-  // For automatic generation of LDG (through SelectLoad[Vector], not the
-  // intrinsics), we may have an extending load like:
-  //
-  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
-  //
-  // In this case, the matching logic above will select a load for the original
-  // memory type (in this case, i8) and our types will not match (the node needs
-  // to return an i32 in this case). Our LDG/LDU nodes do not support the
-  // concept of sign-/zero-extension, so emulate it here by adding an explicit
-  // CVT instruction. Ptxas should clean up any redundancies here.
-
-  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
-
-  if (OrigType != EltVT &&
-      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
-    // We have an extending-load. The instruction we selected operates on the
-    // smaller type, but the SDNode we are replacing has the larger type. We
-    // need to emit a CVT to make the types match.
-    unsigned CvtOpc =
-        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
-
-    // For each output value, apply the manual sign/zero-extension and make sure
-    // all users of the load go through that CVT.
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue Res(LD, i);
-      SDValue OrigVal(N, i);
-
-      SDNode *CvtNode =
-        CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
-                               CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
-                                                         DL, MVT::i32));
-      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
-    }
-  }
-
-  ReplaceNode(N, LD);
+  ReplaceNode(LD, NVPTXLDU);
   return true;
 }
 
 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   MemSDNode *ST = cast<MemSDNode>(N);
   assert(ST->writeMem() && "Expected store");
-  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
-  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
+  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
+  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
   assert((PlainStore || AtomicStore) && "Expected store");
 
   // do not support pre/post inc/dec
   if (PlainStore && PlainStore->isIndexed())
     return false;
 
-  EVT StoreVT = ST->getMemoryVT();
+  const EVT StoreVT = ST->getMemoryVT();
   if (!StoreVT.isSimple())
     return false;
 
   // Address Space Setting
-  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
+  const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
 
-  SDLoc DL(N);
+  SDLoc DL(ST);
   SDValue Chain = ST->getChain();
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
   // Vector Setting
   const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
@@ -1417,85 +1386,78 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   if (!NVPTXST)
     return false;
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
+  MachineMemOperand *MemRef = ST->getMemOperand();
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
-  ReplaceNode(N, NVPTXST);
+  ReplaceNode(ST, NVPTXST);
   return true;
 }
 
 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
-  SDValue Op1 = N->getOperand(1);
-  EVT EltVT = Op1.getValueType();
-  MemSDNode *MemSD = cast<MemSDNode>(N);
-  EVT StoreVT = MemSD->getMemoryVT();
+  MemSDNode *ST = cast<MemSDNode>(N);
+  const EVT StoreVT = ST->getMemoryVT();
   assert(StoreVT.isSimple() && "Store value is not simple");
 
   // Address Space Setting
-  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
+  const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
   if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
     report_fatal_error("Cannot store to pointer that points to constant "
                        "memory space");
   }
 
-  SDLoc DL(N);
-  SDValue Chain = N->getOperand(0);
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
+  SDLoc DL(ST);
+  SDValue Chain = ST->getChain();
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
   // Type Setting: toType + toTypeWidth
   // - for integer type, always use 'u'
   const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
 
-  unsigned NumElts = getLoadStoreVectorNumElts(N);
+  const unsigned NumElts = getLoadStoreVectorNumElts(ST);
 
-  SmallVector<SDValue, 16> Ops(N->ops().slice(1, NumElts));
-  SDValue N2 = N->getOperand(NumElts + 1);
-  unsigned ToTypeWidth = TotalWidth / NumElts;
-
-  if (isSubVectorPackedInI32(EltVT)) {
-    EltVT = MVT::i32;
-  }
+  SmallVector<SDValue, 16> Ops(ST->ops().slice(1, NumElts));
+  SDValue Addr = N->getOperand(NumElts + 1);
+  const unsigned ToTypeWidth = TotalWidth / NumElts;
 
   assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
          TotalWidth <= 256 && "Invalid width for store");
 
   SDValue Offset, Base;
-  SelectADDR(N2, Base, Offset);
+  SelectADDR(Addr, Base, Offset);
 
   Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
               getI32Imm(CodeAddrSpace, DL),
               getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
               getI32Imm(ToTypeWidth, DL), Base, Offset, Chain});
 
+  const MVT::SimpleValueType EltVT =
+      ST->getOperand(1).getSimpleValueType().SimpleTy;
   std::optional<unsigned> Opcode;
-  switch (N->getOpcode()) {
+  switch (ST->getOpcode()) {
   default:
     return false;
   case NVPTXISD::StoreV2:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2,
-                             NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
-                             NVPTX::STV_i64_v2);
+    Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v2, NVPTX::STV_i16_v2,
+                             NVPTX::STV_i32_v2, NVPTX::STV_i64_v2);
     break;
   case NVPTXISD::StoreV4:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4,
-                             NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
-                             NVPTX::STV_i64_v4);
+    Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v4, NVPTX::STV_i16_v4,
+                             NVPTX::STV_i32_v4, NVPTX::STV_i64_v4);
     break;
   case NVPTXISD::StoreV8:
-    Opcode =
-        pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
-                        {/* no v8i16 */}, NVPTX::STV_i32_v8, {/* no v8i64 */});
+    Opcode = pickOpcodeForVT(EltVT, {/* no v8i8 */}, {/* no v8i16 */},
+                             NVPTX::STV_i32_v8, {/* no v8i64 */});
     break;
   }
 
   if (!Opcode)
     return false;
 
-  SDNode *ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
+  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
-  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
+  MachineMemOperand *MemRef = ST->getMemOperand();
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
 
-  ReplaceNode(N, ST);
+  ReplaceNode(ST, NVPTXST);
   return true;
 }
 
@@ -2285,70 +2247,6 @@ void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
   ReplaceNode(N, Mov);
 }
 
-/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
-/// conversion from \p SrcTy to \p DestTy.
-unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
-                                             LoadSDNode *LdNode) {
-  bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
-  switch (SrcTy.SimpleTy) {
-  default:
-    llvm_unreachable("Unhandled source type");
-  case MVT::i8:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i16:
-      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
-    case MVT::i32:
-      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
-    case MVT::i64:
-      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
-    }
-  case MVT::i16:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i8:
-      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
-    case MVT::i32:
-      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
-    case MVT::i64:
-      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
-    }
-  case MVT::i32:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i8:
-      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
-    case MVT::i16:
-      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
-    case MVT::i64:
-      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
-    }
-  case MVT::i64:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i8:
-      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
-    case MVT::i16:
-      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
-    case MVT::i32:
-      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
-    }
-  case MVT::f16:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::f32:
-      return NVPTX::CVT_f32_f16;
-    case MVT::f64:
-      return NVPTX::CVT_f64_f16;
-    }
-  }
-}
-
 bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
   SDLoc DL(N);
   assert(N->getOpcode() == ISD::ATOMIC_FENCE);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 473f4781a6c3..ff58e4486a22 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -75,7 +75,8 @@ private:
   void SelectTexSurfHandle(SDNode *N);
   bool tryLoad(SDNode *N);
   bool tryLoadVector(SDNode *N);
-  bool tryLDGLDU(SDNode *N);
+  bool tryLDU(SDNode *N);
+  bool tryLDG(MemSDNode *N);
   bool tryStore(SDNode *N);
   bool tryStoreVector(SDNode *N);
   bool tryLoadParam(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4c3501df57f8..5dbdce52f055 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -135,11 +135,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
 def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
 def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
 def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
-def hasVote : Predicate<"Subtarget->hasVote()">;
-def hasDouble : Predicate<"Subtarget->hasDouble()">;
 def hasClusters : Predicate<"Subtarget->hasClusters()">;
-def hasLDG : Predicate<"Subtarget->hasLDG()">;
-def hasLDU : Predicate<"Subtarget->hasLDU()">;
 def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
 def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
 def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b3c1296cf0ca..5de3dee1fb34 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2143,15 +2143,12 @@ defm INT_PTX_SATOM_XOR  : ATOM2_bitwise_impl<"xor">;
 
 class LDU_G<string TyStr, NVPTXRegClass regclass>
   :  NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
-               "ldu.global." # TyStr # " \t$result, [$src];",
-                      []>, Requires<[hasLDU]>;
+               "ldu.global." # TyStr # " \t$result, [$src];", []>;
 
-def INT_PTX_LDU_GLOBAL_i8  : LDU_G<"b8", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
-def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
-def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
-def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
+def LDU_GLOBAL_i8  : LDU_G<"b8",  Int16Regs>;
+def LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
+def LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
+def LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
 
 // vector
 
@@ -2168,19 +2165,14 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
                "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
 
 
-def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
-def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
-def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
-def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
+def LDU_GLOBAL_v2i8  : VLDU_G_ELE_V2<"b8",  Int16Regs>;
+def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<"b16", Int16Regs>;
+def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<"b32", Int32Regs>;
+def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<"b64", Int64Regs>;
 
-def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
-def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v4i32_ELE  : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f16_ELE   : VLDU_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v4f16x2_ELE  : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f32_ELE  : VLDU_G_ELE_V4<"b32", Float32Regs>;
+def LDU_GLOBAL_v4i8  : VLDU_G_ELE_V4<"b8",  Int16Regs>;
+def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<"b16", Int16Regs>;
+def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", Int32Regs>;
 
 
 //-----------------------------------
@@ -2191,55 +2183,47 @@ def INT_PTX_LDU_G_v4f32_ELE  : VLDU_G_ELE_V4<"b32", Float32Regs>;
 // non-coherent texture cache, and therefore the values read must be read-only
 // during the lifetime of the kernel.
 
-class LDG_G<string TyStr, NVPTXRegClass regclass>
-  : NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
-               "ld.global.nc." # TyStr # " \t$result, [$src];",
-                        []>, Requires<[hasLDG]>;
+class LDG_G<NVPTXRegClass regclass>
+  : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+               "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>;
 
-def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
-def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
-def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
-def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
+def LD_GLOBAL_NC_i8  : LDG_G<Int16Regs>;
+def LD_GLOBAL_NC_i16 : LDG_G<Int16Regs>;
+def LD_GLOBAL_NC_i32 : LDG_G<Int32Regs>;
+def LD_GLOBAL_NC_i64 : LDG_G<Int64Regs>;
 
 // vector
 
 // Elementized vector ldg
-class VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V2<NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
-            (ins ADDR:$src),
-            "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+            (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+            "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;
 
 
-class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V4<NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 
-            (ins ADDR:$src),
-            "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+            (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+            "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
 
-class VLDG_G_ELE_V8<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V8<NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
                   regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
-             (ins ADDR:$src),
-             "ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
+             (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+             "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
 
 // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
-def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
-def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
-def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
-def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
-def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;
+def LD_GLOBAL_NC_v2i8  : VLDG_G_ELE_V2<Int16Regs>;
+def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<Int16Regs>;
+def LD_GLOBAL_NC_v2i32 : VLDG_G_ELE_V2<Int32Regs>;
+def LD_GLOBAL_NC_v2i64 : VLDG_G_ELE_V2<Int64Regs>;
 
-def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
-def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;
+def LD_GLOBAL_NC_v4i8  : VLDG_G_ELE_V4<Int16Regs>;
+def LD_GLOBAL_NC_v4i16 : VLDG_G_ELE_V4<Int16Regs>;
+def LD_GLOBAL_NC_v4i32 : VLDG_G_ELE_V4<Int32Regs>;
 
-def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>;
-def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>;
-def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>;
+def LD_GLOBAL_NC_v4i64 : VLDG_G_ELE_V4<Int64Regs>;
+def LD_GLOBAL_NC_v8i32 : VLDG_G_ELE_V8<Int32Regs>;
 
 multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
   if Supports32 then
diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index c4d1537557ca..4e11f58f85ee 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
@@ -10,14 +11,29 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-; CHECK-LABEL: spam
 define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
+; CHECK-LABEL: spam(
+; CHECK:       .maxntid 1, 1, 1
+; CHECK-NEXT:  {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %bb
+; CHECK-NEXT:    ld.param.b64 %rd1, [spam_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [spam_param_3];
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 1;
+; CHECK-NEXT:    add.s64 %rd4, %rd1, %rd3;
+; CHECK-NEXT:    ld.param.b64 %rd5, [spam_param_1];
+; CHECK-NEXT:    ld.global.nc.s16 %r1, [%rd4+16];
+; CHECK-NEXT:    mul.wide.s32 %rd6, %r1, %r1;
+; CHECK-NEXT:    ld.global.b64 %rd7, [%rd5];
+; CHECK-NEXT:    add.s64 %rd8, %rd6, %rd7;
+; CHECK-NEXT:    st.global.b64 [%rd5], %rd8;
+; CHECK-NEXT:    ret;
 bb:
   %tmp5 = add nsw i64 %arg3, 8
   %tmp6 = getelementptr i16, ptr addrspace(1) %arg, i64 %tmp5
-; CHECK: ld.global.nc.b16
   %tmp7 = load i16, ptr addrspace(1) %tmp6, align 2
-; CHECK: cvt.s32.s16
   %tmp8 = sext i16 %tmp7 to i64
   %tmp9 = mul nsw i64 %tmp8, %tmp8
   %tmp10 = load i64, ptr addrspace(1) %arg1, align 8
diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 3b30ce560edb..6148c0756e39 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
@@ -7,45 +8,93 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
-; CHECK-LABEL: ex_zext
 define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_zext(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b64 %rd1, [ex_zext_param_0];
+; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [ex_zext_param_1];
+; CHECK-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT:    ld.global.nc.b8 %r1, [%rd2];
+; CHECK-NEXT:    st.global.b32 [%rd4], %r1;
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: ld.global.nc.b8
   %val = load i8, ptr %data
-; CHECK: cvt.u32.u8
   %valext = zext i8 %val to i32
   store i32 %valext, ptr %res
   ret void
 }
 
-; CHECK-LABEL: ex_sext
 define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_sext(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b64 %rd1, [ex_sext_param_0];
+; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [ex_sext_param_1];
+; CHECK-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT:    ld.global.nc.s8 %r1, [%rd2];
+; CHECK-NEXT:    st.global.b32 [%rd4], %r1;
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: ld.global.nc.b8
   %val = load i8, ptr %data
-; CHECK: cvt.s32.s8
   %valext = sext i8 %val to i32
   store i32 %valext, ptr %res
   ret void
 }
 
-; CHECK-LABEL: ex_zext_v2
 define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_zext_v2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b64 %rd1, [ex_zext_v2_param_0];
+; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [ex_zext_v2_param_1];
+; CHECK-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT:    ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT:    st.global.v2.b32 [%rd4], {%r2, %r1};
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: ld.global.nc.v2.b8
   %val = load <2 x i8>, ptr %data
-; CHECK: cvt.u32.u16
   %valext = zext <2 x i8> %val to <2 x i32>
   store <2 x i32> %valext, ptr %res
   ret void
 }
 
-; CHECK-LABEL: ex_sext_v2
 define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_sext_v2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b64 %rd1, [ex_sext_v2_param_0];
+; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [ex_sext_v2_param_1];
+; CHECK-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT:    ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT:    cvt.s32.s8 %r2, %r1;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    cvt.s32.s8 %r4, %r3;
+; CHECK-NEXT:    st.global.v2.b32 [%rd4], {%r4, %r2};
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: ld.global.nc.v2.b8
   %val = load <2 x i8>, ptr %data
-; CHECK: cvt.s32.s8
   %valext = sext <2 x i8> %val to <2 x i32>
   store <2 x i32> %valext, ptr %res
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
index bb88d1f2755c..3dceefb93a47 100644
--- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -7,7 +7,6 @@ target triple = "nvptx-nvidia-cuda"
 
 define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK-LABEL: foo(
-; CHECK:    .reg .b16 %rs<2>;
 ; CHECK:    .reg .b32 %r<4>;
 ; CHECK:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
@@ -15,8 +14,7 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK:    cvta.to.global.u64 %rd2, %rd1;
 ; CHECK:    ld.param.b64 %rd3, [foo_param_1];
 ; CHECK:    cvta.to.global.u64 %rd4, %rd3;
-; CHECK:    ld.global.nc.b8 %rs1, [%rd2];
-; CHECK:    cvt.u32.u8 %r1, %rs1;
+; CHECK:    ld.global.nc.b8 %r1, [%rd2];
 ; CHECK:    add.s32 %r2, %r1, 1;
 ; CHECK:    and.b32 %r3, %r2, 1;
 ; CHECK:    st.global.b32 [%rd4], %r3;
diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index 7ac697c4ce20..7f4b049af84f 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -163,14 +163,12 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldg_i8(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_i8_param_0];
-; CHECK-NEXT:    ld.global.nc.b8 %rs1, [%rd1];
-; CHECK-NEXT:    cvt.u32.u8 %r1, %rs1;
+; CHECK-NEXT:    ld.global.nc.b8 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
@@ -180,14 +178,12 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldg_i16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_i16_param_0];
-; CHECK-NEXT:    ld.global.nc.b16 %rs1, [%rd1];
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT:    ld.global.nc.b16 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 3bbdf641ade2..ddaa9fd831af 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -211,7 +211,7 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot3[24];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
-; CHECK-PTX-NEXT:    .reg .b16 %rs<8>;
+; CHECK-PTX-NEXT:    .reg .b16 %rs<5>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX-EMPTY:
@@ -220,18 +220,15 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
 ; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
-; CHECK-PTX-NEXT:    cvt.u16.u8 %rs2, %rs1;
-; CHECK-PTX-NEXT:    st.local.b8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6];
-; CHECK-PTX-NEXT:    cvt.u16.u8 %rs4, %rs3;
-; CHECK-PTX-NEXT:    st.local.b8 [%rd2+1], %rs4;
-; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5];
-; CHECK-PTX-NEXT:    cvt.u16.u8 %rs6, %rs5;
-; CHECK-PTX-NEXT:    st.local.b8 [%rd2], %rs6;
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2+2], %rs1;
+; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2+1], %rs2;
+; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2], %rs3;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
 ; CHECK-PTX-NEXT:    st.b32 [%SP+8], %r1;
-; CHECK-PTX-NEXT:    mov.b16 %rs7, 1;
-; CHECK-PTX-NEXT:    st.b8 [%SP+12], %rs7;
+; CHECK-PTX-NEXT:    mov.b16 %rs4, 1;
+; CHECK-PTX-NEXT:    st.b8 [%SP+12], %rs4;
 ; CHECK-PTX-NEXT:    mov.b64 %rd3, 1;
 ; CHECK-PTX-NEXT:    st.b64 [%SP+16], %rd3;
 ; CHECK-PTX-NEXT:    add.u64 %rd4, %SP, 8;

From eb31c422d0dc816bf285a81bf92690d4d16273ed Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Tue, 17 Jun 2025 21:43:17 +0530
Subject: [PATCH 0664/1322] [Driver] Add support for GCC installation detection
 in Baremetal toolchain (#121829)

This patch introduces enhancements to the Baremetal toolchain to support
GCC toolchain detection.
- If the --gcc-install-dir or --gcc-toolchain options are provided and
point to valid paths, the sysroot is derived from those locations.
- If not, the logic falls back to the existing sysroot inference
mechanism already present in the Baremetal toolchain.
- Support for adding include paths for the libstdc++ library has also
been added.

Additionally, the restriction to always use the integrated assembler has
been removed. With a valid GCC installation, the GNU assembler can now
be used as well.

This patch currently updates and adds tests for the ARM target only.
RISC-V-specific tests will be introduced in a later patch, once the
RISCVToolChain is fully merged into the Baremetal toolchain. At this
stage, there is no way to test the RISC-V target within this PR.

RFC:
https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524
---
 clang/docs/Toolchain.rst                      |   5 +
 .../clang/Basic/DiagnosticDriverKinds.td      |   3 +
 clang/lib/Driver/ToolChains/BareMetal.cpp     | 245 +++++++++++++-----
 clang/lib/Driver/ToolChains/BareMetal.h       |  19 +-
 .../aarch64-none-elf/include/c++/8.2.1/.keep  |   0
 .../aarch64-none-elf/lib/.keep                |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../bin/aarch64-none-elf-ld                   |   1 +
 .../lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o |   0
 .../lib/gcc/aarch64-none-elf/8.2.1/crtend.o   |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../aarch64-none-elf/lib/crtbegin.o           |   0
 .../aarch64-none-elf/lib/crtend.o             |   0
 .../bin/aarch64-none-elf-ld                   |   1 +
 .../armv6m-none-eabi/include/c++/8.2.1/.keep  |   0
 .../armv6m-none-eabi/lib/.keep                |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../bin/armv6m-none-eabi-ld                   |   1 +
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o |   0
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtend.o   |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../armv6m-none-eabi/lib/crtbegin.o           |   0
 .../armv6m-none-eabi/lib/crtend.o             |   0
 .../bin/armv6m-none-eabi-ld                   |   1 +
 clang/test/Driver/aarch64-gnutools.c          |   4 +
 clang/test/Driver/aarch64-toolchain-extra.c   |  28 ++
 clang/test/Driver/aarch64-toolchain.c         |  61 +++++
 clang/test/Driver/arm-gnutools.c              |   6 +
 clang/test/Driver/arm-toolchain-extra.c       |  29 +++
 clang/test/Driver/arm-toolchain.c             |  62 +++++
 clang/test/Driver/baremetal.cpp               |  16 ++
 clang/test/Driver/check-no-multlib-warning.c  |  10 +
 32 files changed, 423 insertions(+), 69 deletions(-)
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
 create mode 100755 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
 create mode 100755 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
 create mode 100755 clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
 create mode 100755 clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
 create mode 100644 clang/test/Driver/aarch64-gnutools.c
 create mode 100644 clang/test/Driver/aarch64-toolchain-extra.c
 create mode 100644 clang/test/Driver/aarch64-toolchain.c
 create mode 100644 clang/test/Driver/arm-gnutools.c
 create mode 100644 clang/test/Driver/arm-toolchain-extra.c
 create mode 100644 clang/test/Driver/arm-toolchain.c
 create mode 100644 clang/test/Driver/check-no-multlib-warning.c

diff --git a/clang/docs/Toolchain.rst b/clang/docs/Toolchain.rst
index 958199eb7a2e..d56b21d74c7e 100644
--- a/clang/docs/Toolchain.rst
+++ b/clang/docs/Toolchain.rst
@@ -347,3 +347,8 @@ workarounds for issues discovered in libstdc++, and these are removed
 as fixed libstdc++ becomes sufficiently old.
 
 You can instruct Clang to use libstdc++ with the ``-stdlib=libstdc++`` flag.
+
+GCC Installation
+=================
+Users can point to their GCC installation by using the ``-gcc-toolchain`` or by
+using ``-gcc-install-dir`` flag.
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 29f6480ba935..94224e103875 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -847,6 +847,9 @@ def note_drv_available_multilibs : Note<
   "available multilibs are:%0">;
 def err_drv_multilib_custom_error : Error<
   "multilib configuration error: %0">;
+def warn_drv_multilib_not_available_for_target: Warning<
+  "no multilib structure encoded for Arm, Aarch64 and PPC targets">,
+  InGroup<DiagGroup<"multilib-not-found">>;
 
 def err_drv_experimental_crel : Error<
   "-Wa,--allow-experimental-crel must be specified to use -Wa,--crel. "
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index d8168ed15feb..0fbfe6c77f34 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -31,6 +31,40 @@ using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
 
+/// Is the triple {aarch64.aarch64_be}-none-elf?
+static bool isAArch64BareMetal(const llvm::Triple &Triple) {
+  if (Triple.getArch() != llvm::Triple::aarch64 &&
+      Triple.getArch() != llvm::Triple::aarch64_be)
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+static bool isRISCVBareMetal(const llvm::Triple &Triple) {
+  if (!Triple.isRISCV())
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+/// Is the triple powerpc[64][le]-*-none-eabi?
+static bool isPPCBareMetal(const llvm::Triple &Triple) {
+  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
+         Triple.getEnvironment() == llvm::Triple::EABI;
+}
+
 static bool findRISCVMultilibs(const Driver &D,
                                const llvm::Triple &TargetTriple,
                                const ArgList &Args, DetectedMultilibs &Result) {
@@ -95,7 +129,8 @@ static bool findRISCVMultilibs(const Driver &D,
   return false;
 }
 
-static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) {
+static std::string computeClangRuntimesSysRoot(const Driver &D,
+                                               bool IncludeTriple) {
   if (!D.SysRoot.empty())
     return D.SysRoot;
 
@@ -108,58 +143,125 @@ static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) {
   return std::string(SysRootDir);
 }
 
+// Only consider the GCC toolchain based on the values provided through the
+// `--gcc-toolchain` and `--gcc-install-dir` flags. The function below returns
+// whether the GCC toolchain was initialized successfully.
+bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
+                                    const llvm::opt::ArgList &Args) {
+  if (Args.getLastArg(options::OPT_gcc_toolchain) ||
+      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
+    GCCInstallation.init(Triple, Args);
+    return GCCInstallation.isValid();
+  }
+  return false;
+}
+
+// This logic is adapted from RISCVToolChain.cpp as part of the ongoing effort
+// to merge RISCVToolChain into the Baremetal toolchain. It infers the presence
+// of a valid GCC toolchain by checking whether the `crt0.o` file exists in the
+// `bin/../<target-triple>/lib` directory.
+static bool detectGCCToolchainAdjacent(const Driver &D) {
+  SmallString<128> GCCDir;
+  llvm::sys::path::append(GCCDir, D.Dir, "..", D.getTargetTriple(),
+                          "lib/crt0.o");
+  return llvm::sys::fs::exists(GCCDir);
+}
+
+// If no sysroot is provided the driver will first attempt to infer it from the
+// values of `--gcc-install-dir` or `--gcc-toolchain`, which specify the
+// location of a GCC toolchain.
+// If neither flag is used, the sysroot defaults to either:
+//    - `bin/../<target-triple>`
+//    - `bin/../lib/clang-runtimes/<target-triple>`
+//
+// To use the `clang-runtimes` path, ensure that `../<target-triple>/lib/crt0.o`
+// does not exist relative to the driver.
+std::string BareMetal::computeSysRoot() const {
+  // Use Baremetal::sysroot if it has already been set.
+  if (!SysRoot.empty())
+    return SysRoot;
+
+  // Use the sysroot specified via the `--sysroot` command-line flag, if
+  // provided.
+  const Driver &D = getDriver();
+  if (!D.SysRoot.empty())
+    return D.SysRoot;
+
+  // Attempt to infer sysroot from a valid GCC installation.
+  // If no valid GCC installation, check for a GCC toolchain alongside Clang.
+  SmallString<128> inferredSysRoot;
+  if (IsGCCInstallationValid) {
+    llvm::sys::path::append(inferredSysRoot, GCCInstallation.getParentLibPath(),
+                            "..", GCCInstallation.getTriple().str());
+  } else if (detectGCCToolchainAdjacent(D)) {
+    // Use the triple as provided to the driver. Unlike the parsed triple
+    // this has not been normalized to always contain every field.
+    llvm::sys::path::append(inferredSysRoot, D.Dir, "..", D.getTargetTriple());
+  }
+  // If a valid sysroot was inferred and exists, use it
+  if (!inferredSysRoot.empty() && llvm::sys::fs::exists(inferredSysRoot))
+    return std::string(inferredSysRoot);
+
+  // Use the clang-runtimes path.
+  return computeClangRuntimesSysRoot(D, /*IncludeTriple*/ true);
+}
+
+static void addMultilibsFilePaths(const Driver &D, const MultilibSet &Multilibs,
+                                  const Multilib &Multilib,
+                                  StringRef InstallPath,
+                                  ToolChain::path_list &Paths) {
+  if (const auto &PathsCallback = Multilibs.filePathsCallback())
+    for (const auto &Path : PathsCallback(Multilib))
+      addPathIfExists(D, InstallPath + Path, Paths);
+}
+
+// GCC mutltilibs will only work for those targets that have their multlib
+// structure encoded into GCCInstallation. Baremetal toolchain supports ARM,
+// AArch64, RISCV and PPC and of these only RISCV have GCC multilibs hardcoded
+// in GCCInstallation.
 BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple,
                      const ArgList &Args)
-    : ToolChain(D, Triple, Args),
-      SysRoot(computeBaseSysRoot(D, /*IncludeTriple=*/true)) {
-  getProgramPaths().push_back(getDriver().Dir);
+    : Generic_ELF(D, Triple, Args) {
+  IsGCCInstallationValid = initGCCInstallation(Triple, Args);
+  std::string ComputedSysRoot = computeSysRoot();
+  if (IsGCCInstallationValid) {
+    if (!isRISCVBareMetal(Triple))
+      D.Diag(clang::diag::warn_drv_multilib_not_available_for_target);
 
-  findMultilibs(D, Triple, Args);
-  SmallString<128> SysRoot(computeSysRoot());
-  if (!SysRoot.empty()) {
-    for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRoot);
-      llvm::sys::path::append(Dir, M.osSuffix(), "lib");
-      getFilePaths().push_back(std::string(Dir));
-      getLibraryPaths().push_back(std::string(Dir));
+    Multilibs = GCCInstallation.getMultilibs();
+    SelectedMultilibs.assign({GCCInstallation.getMultilib()});
+
+    path_list &Paths = getFilePaths();
+    // Add toolchain/multilib specific file paths.
+    addMultilibsFilePaths(D, Multilibs, SelectedMultilibs.back(),
+                          GCCInstallation.getInstallPath(), Paths);
+    // Adding filepath for locating crt{begin,end}.o files.
+    Paths.push_back(GCCInstallation.getInstallPath().str());
+    // Adding filepath for locating crt0.o file.
+    Paths.push_back(ComputedSysRoot + "/lib");
+
+    ToolChain::path_list &PPaths = getProgramPaths();
+    // Multilib cross-compiler GCC installations put ld in a triple-prefixed
+    // directory off of the parent of the GCC installation.
+    PPaths.push_back(Twine(GCCInstallation.getParentLibPath() + "/../" +
+                           GCCInstallation.getTriple().str() + "/bin")
+                         .str());
+    PPaths.push_back((GCCInstallation.getParentLibPath() + "/../bin").str());
+  } else {
+    getProgramPaths().push_back(getDriver().Dir);
+    findMultilibs(D, Triple, Args);
+    const SmallString<128> SysRootDir(computeSysRoot());
+    if (!SysRootDir.empty()) {
+      for (const Multilib &M : getOrderedMultilibs()) {
+        SmallString<128> Dir(SysRootDir);
+        llvm::sys::path::append(Dir, M.osSuffix(), "lib");
+        getFilePaths().push_back(std::string(Dir));
+        getLibraryPaths().push_back(std::string(Dir));
+      }
     }
   }
 }
 
-/// Is the triple {aarch64.aarch64_be}-none-elf?
-static bool isAArch64BareMetal(const llvm::Triple &Triple) {
-  if (Triple.getArch() != llvm::Triple::aarch64 &&
-      Triple.getArch() != llvm::Triple::aarch64_be)
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-static bool isRISCVBareMetal(const llvm::Triple &Triple) {
-  if (!Triple.isRISCV())
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-/// Is the triple powerpc[64][le]-*-none-eabi?
-static bool isPPCBareMetal(const llvm::Triple &Triple) {
-  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
-         Triple.getEnvironment() == llvm::Triple::EABI;
-}
-
 static void
 findMultilibsFromYAML(const ToolChain &TC, const Driver &D,
                       StringRef MultilibPath, const ArgList &Args,
@@ -216,7 +318,7 @@ getMultilibConfigPath(const Driver &D, const llvm::Triple &Triple,
       return {};
     }
   } else {
-    MultilibPath = computeBaseSysRoot(D, /*IncludeTriple=*/false);
+    MultilibPath = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
     llvm::sys::path::append(MultilibPath, MultilibFilename);
   }
   return MultilibPath;
@@ -234,7 +336,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
   if (D.getVFS().exists(*MultilibPath)) {
     // If multilib.yaml is found, update sysroot so it doesn't use a target
     // specific suffix
-    SysRoot = computeBaseSysRoot(D, /*IncludeTriple=*/false);
+    SysRoot = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
     SmallVector<StringRef> CustomFlagMacroDefines;
     findMultilibsFromYAML(*this, D, *MultilibPath, Args, Result,
                           CustomFlagMacroDefines);
@@ -242,7 +344,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
     Multilibs = Result.Multilibs;
     MultilibMacroDefines.append(CustomFlagMacroDefines.begin(),
                                 CustomFlagMacroDefines.end());
-  } else if (isRISCVBareMetal(Triple)) {
+  } else if (isRISCVBareMetal(Triple) && !detectGCCToolchainAdjacent(D)) {
     if (findRISCVMultilibs(D, Triple, Args, Result)) {
       SelectedMultilibs = Result.SelectedMultilibs;
       Multilibs = Result.Multilibs;
@@ -263,8 +365,6 @@ Tool *BareMetal::buildStaticLibTool() const {
   return new tools::baremetal::StaticLibTool(*this);
 }
 
-std::string BareMetal::computeSysRoot() const { return SysRoot; }
-
 BareMetal::OrderedMultilibs BareMetal::getOrderedMultilibs() const {
   // Get multilibs in reverse order because they're ordered most-specific last.
   if (!SelectedMultilibs.empty())
@@ -292,10 +392,10 @@ void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   if (std::optional<std::string> Path = getStdlibIncludePath())
     addSystemInclude(DriverArgs, CC1Args, *Path);
 
-  const SmallString<128> SysRoot(computeSysRoot());
-  if (!SysRoot.empty()) {
+  const SmallString<128> SysRootDir(computeSysRoot());
+  if (!SysRootDir.empty()) {
     for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRoot);
+      SmallString<128> Dir(SysRootDir);
       llvm::sys::path::append(Dir, M.includeSuffix());
       llvm::sys::path::append(Dir, "include");
       addSystemInclude(DriverArgs, CC1Args, Dir.str());
@@ -309,6 +409,19 @@ void BareMetal::addClangTargetOptions(const ArgList &DriverArgs,
   CC1Args.push_back("-nostdsysteminc");
 }
 
+void BareMetal::addLibStdCxxIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  if (!IsGCCInstallationValid)
+    return;
+  const GCCVersion &Version = GCCInstallation.getVersion();
+  StringRef TripleStr = GCCInstallation.getTriple().str();
+  const Multilib &Multilib = GCCInstallation.getMultilib();
+  addLibStdCXXIncludePaths(computeSysRoot() + "/include/c++/" + Version.Text,
+                           TripleStr, Multilib.includeSuffix(), DriverArgs,
+                           CC1Args);
+}
+
 void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                              ArgStringList &CC1Args) const {
   if (DriverArgs.hasArg(options::OPT_nostdinc, options::OPT_nostdlibinc,
@@ -339,23 +452,23 @@ void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
   };
 
   switch (GetCXXStdlibType(DriverArgs)) {
-    case ToolChain::CST_Libcxx: {
-      SmallString<128> P(D.Dir);
-      llvm::sys::path::append(P, "..", "include");
-      AddCXXIncludePath(P);
-      break;
-    }
-    case ToolChain::CST_Libstdcxx:
-      // We only support libc++ toolchain installation.
-      break;
+  case ToolChain::CST_Libcxx: {
+    SmallString<128> P(D.Dir);
+    llvm::sys::path::append(P, "..", "include");
+    AddCXXIncludePath(P);
+    break;
+  }
+  case ToolChain::CST_Libstdcxx:
+    addLibStdCxxIncludePaths(DriverArgs, CC1Args);
+    break;
   }
 
-  std::string SysRoot(computeSysRoot());
-  if (SysRoot.empty())
+  std::string SysRootDir(computeSysRoot());
+  if (SysRootDir.empty())
     return;
 
   for (const Multilib &M : getOrderedMultilibs()) {
-    SmallString<128> Dir(SysRoot);
+    SmallString<128> Dir(SysRootDir);
     llvm::sys::path::append(Dir, M.gccSuffix());
     switch (GetCXXStdlibType(DriverArgs)) {
     case ToolChain::CST_Libcxx: {
diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h
index f6295bda0a6a..930f8584e643 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.h
+++ b/clang/lib/Driver/ToolChains/BareMetal.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 
+#include "ToolChains/Gnu.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 
@@ -19,7 +20,7 @@ namespace driver {
 
 namespace toolchains {
 
-class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
+class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
 public:
   BareMetal(const Driver &D, const llvm::Triple &Triple,
             const llvm::opt::ArgList &Args);
@@ -35,7 +36,8 @@ protected:
   Tool *buildStaticLibTool() const override;
 
 public:
-  bool useIntegratedAs() const override { return true; }
+  bool initGCCInstallation(const llvm::Triple &Triple,
+                           const llvm::opt::ArgList &Args);
   bool isBareMetal() const override { return true; }
   bool isCrossCompiling() const override { return true; }
   bool HasNativeLLVMSupport() const override { return true; }
@@ -48,9 +50,15 @@ public:
 
   StringRef getOSLibName() const override { return "baremetal"; }
 
+  UnwindTableLevel
+  getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override {
+    return UnwindTableLevel::None;
+  }
+
   RuntimeLibType GetDefaultRuntimeLibType() const override {
     return ToolChain::RLT_CompilerRT;
   }
+
   CXXStdlibType GetDefaultCXXStdlibType() const override {
     return ToolChain::CST_Libcxx;
   }
@@ -67,6 +75,9 @@ public:
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
+  void
+  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                           llvm::opt::ArgStringList &CC1Args) const override;
   std::string computeSysRoot() const override;
   SanitizerMask getSupportedSanitizers() const override;
 
@@ -80,6 +91,8 @@ private:
 
   std::string SysRoot;
 
+  bool IsGCCInstallationValid;
+
   SmallVector<std::string> MultilibMacroDefines;
 };
 
@@ -104,7 +117,7 @@ public:
 
 class LLVM_LIBRARY_VISIBILITY Linker final : public Tool {
 public:
-  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "ld.lld", TC) {}
+  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "linker", TC) {}
   bool isLinkJob() const override { return true; }
   bool hasIntegratedCPP() const override { return false; }
   void ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/aarch64-gnutools.c b/clang/test/Driver/aarch64-gnutools.c
new file mode 100644
index 000000000000..0214639ed380
--- /dev/null
+++ b/clang/test/Driver/aarch64-gnutools.c
@@ -0,0 +1,4 @@
+// RUN: %clang --target=aarch64-none-elf  --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -fno-integrated-as %s -### -c \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
new file mode 100644
index 000000000000..2610e962bd69
--- /dev/null
+++ b/clang/test/Driver/aarch64-toolchain-extra.c
@@ -0,0 +1,28 @@
+// A basic clang -cc1 command-line, and simple environment check.
+
+// The tests here are similar to those in aarch64-toolchain.c, however
+// these tests need to create symlinks to test directory trees in order to
+// set up the environment and therefore shell support is required.
+// REQUIRES: shell
+// UNSUPPORTED: system-windows
+
+// If there is no GCC install detected then the driver searches for executables
+// and runtime starting from the directory tree above the driver itself.
+// The test below checks that the driver correctly finds the linker and
+// runtime if and only if they exist.
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/aarch64-nogcc/bin
+// RUN: ln -s %clang %t/aarch64-nogcc/bin/clang
+// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf %t/aarch64-nogcc/aarch64-none-elf
+// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --gcc-toolchain=%t/aarch64-nogcc/invalid \
+// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --sysroot=%t/aarch64-nogcc/bin/../aarch64-none-elf \
+// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
new file mode 100644
index 000000000000..7f2c01d928e4
--- /dev/null
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -0,0 +1,61 @@
+// UNSUPPORTED: system-windows
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL %s
+
+// C-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
+// C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
+// C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOSYSROOT %s
+
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL %s
+
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/backward"
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT %s
+
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/backward"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-LIBCXX %s
+
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX %s
+
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
diff --git a/clang/test/Driver/arm-gnutools.c b/clang/test/Driver/arm-gnutools.c
new file mode 100644
index 000000000000..6e107f19dabc
--- /dev/null
+++ b/clang/test/Driver/arm-gnutools.c
@@ -0,0 +1,6 @@
+// check that gnu assembler is invoked with arm baremetal as well
+
+// RUN: %clang --target=armv6m-none-eabi  --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -fno-integrated-as %s -### -c \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
new file mode 100644
index 000000000000..114de0a8154a
--- /dev/null
+++ b/clang/test/Driver/arm-toolchain-extra.c
@@ -0,0 +1,29 @@
+// A basic clang -cc1 command-line, and simple environment check.
+
+// The tests here are similar to those in arm-toolchain.c, however
+// these tests need to create symlinks to test directory trees in order to
+// set up the environment and therefore shell support is required.
+// REQUIRES: shell
+// UNSUPPORTED: system-windows
+
+// If there is no GCC install detected then the driver searches for executables
+// and runtime starting from the directory tree above the driver itself.
+// The test below checks that the driver correctly finds the linker and
+// runtime if and only if they exist.
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/arm-nogcc/bin
+// RUN: ln -s %clang %t/arm-nogcc/bin/clang
+// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi %t/arm-nogcc/armv6m-none-eabi
+// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --gcc-toolchain=%t/arm-nogcc/invalid \
+// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --sysroot=%t/arm-nogcc/bin/../armv6m-none-eabi \
+// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/include"
+
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
new file mode 100644
index 000000000000..2e38461fb7a3
--- /dev/null
+++ b/clang/test/Driver/arm-toolchain.c
@@ -0,0 +1,62 @@
+// UNSUPPORTED: system-windows
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL %s
+
+// C-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
+// C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL-NOSYSROOT %s
+
+// C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
+// C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL %s
+
+// CXX-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/backward"
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1" 
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT %s
+
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/backward"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-LIBCXX %s
+
+// CXX-ARM-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
+// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX %s
+
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index a80aa9b43711..2ac83402dda3 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -196,6 +196,22 @@
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 
+// RUN: %clang -no-canonical-prefixes %s -### --target=riscv32-unknown-elf 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-RISCV32-NO-HOST-INC %s
+// CHECK-RISCV32-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
+// CHECK-RISCV32-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
+// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
+// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
+// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
+
+// RUN: %clang -no-canonical-prefixes %s -### --target=riscv64-unknown-elf 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-RISCV64-NO-HOST-INC %s
+// CHECK-RISCV64-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
+// CHECK-RISCV64-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
+// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
+// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
+// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
+
 // RUN: %clang %s -### --target=riscv64-unknown-elf -o %t.out -L some/directory/user/asked/for \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64 %s
diff --git a/clang/test/Driver/check-no-multlib-warning.c b/clang/test/Driver/check-no-multlib-warning.c
new file mode 100644
index 000000000000..9a0d7cee450a
--- /dev/null
+++ b/clang/test/Driver/check-no-multlib-warning.c
@@ -0,0 +1,10 @@
+// UNSUPPORTED: system-windows
+
+
+// RUN: %clang --target=armv6m-none-eabi --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64-none-elf --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv32-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv32_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
+// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv64_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
+
+// CHECK: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets
+// NOCHECK-NOT: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets

From e6a41399cb8796e5d18940d49b0151704568321a Mon Sep 17 00:00:00 2001
From: sribee8 <sriya.pratipati@gmail.com>
Date: Tue, 17 Jun 2025 09:24:01 -0700
Subject: [PATCH 0665/1322] Reland "[libc] utf8 to 32 CharacterConverter"
 (#144450)

Reverts llvm/llvm-project#144446
Figured out the issue, so creating a new pull request.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 .../__support/wchar/character_converter.cpp   |  56 +++++
 libc/test/src/__support/CMakeLists.txt        |   5 +-
 libc/test/src/__support/wchar/CMakeLists.txt  |  10 +
 .../src/__support/wchar/utf8_to_32_test.cpp   | 196 ++++++++++++++++++
 4 files changed, 264 insertions(+), 3 deletions(-)
 create mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index ca709769616c..3b9046dfb9a7 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,6 +8,7 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/math_extras.h"
@@ -30,6 +31,50 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
+int CharacterConverter::push(char8_t utf8_byte) {
+  uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
+  // Checking the first byte if first push
+  if (state->bytes_processed == 0) {
+    // UTF-8 char has 1 byte total
+    if (num_ones == 0) {
+      state->total_bytes = 1;
+    }
+    // UTF-8 char has 2 through 4 bytes total
+    else if (num_ones >= 2 && num_ones <= 4) {
+      /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
+      we will make the base mask with 7 ones and right shift it as necessary. */
+      constexpr size_t SIGNIFICANT_BITS = 7;
+      char8_t base_mask =
+          static_cast<char8_t>(mask_trailing_ones<uint8_t, SIGNIFICANT_BITS>());
+      state->total_bytes = num_ones;
+      utf8_byte &= (base_mask >> num_ones);
+    }
+    // Invalid first byte
+    else {
+      // bytes_processed and total_bytes will always be 0 here
+      state->partial = static_cast<char32_t>(0);
+      return -1;
+    }
+    state->partial = static_cast<char32_t>(utf8_byte);
+    state->bytes_processed++;
+    return 0;
+  }
+  // Any subsequent push
+  // Adding 6 more bits so need to left shift
+  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+  if (num_ones == 1 && !isComplete()) {
+    char32_t byte =
+        utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+    state->partial = state->partial << ENCODED_BITS_PER_UTF8;
+    state->partial |= byte;
+    state->bytes_processed++;
+    return 0;
+  }
+  // Invalid byte -> reset the state
+  clear();
+  return -1;
+}
+
 int CharacterConverter::push(char32_t utf32) {
   // we can't be partially through a conversion when pushing a utf32 value
   if (!isComplete())
@@ -54,6 +99,17 @@ int CharacterConverter::push(char32_t utf32) {
   return -1;
 }
 
+ErrorOr<char32_t> CharacterConverter::pop_utf32() {
+  // If pop is called too early, do not reset the state, use error to determine
+  // whether enough bytes have been pushed
+  if (!isComplete() || state->bytes_processed == 0)
+    return Error(-1);
+  char32_t utf32 = state->partial;
+  // reset if successful pop
+  clear();
+  return utf32;
+}
+
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (isComplete())
     return Error(-1);
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 76218a16e0cf..9f626ed31cc0 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,9 +275,8 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
-
-# Requires access to uchar header which is not on macos
-# Therefore, cannot currently build this on macos in overlay mode
+# Requires access to uchar header which is not on MacOS
+# Cannot currently build this on MacOS in overlay mode
 if(NOT(LIBC_TARGET_OS_IS_DARWIN))
   add_subdirectory(wchar)
 endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5dff6e9115f7..5176bfd4b024 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,5 +1,15 @@
 add_custom_target(libc-support-wchar-tests)
 
+add_libc_test(
+  utf8_to_32_test 
+  SUITE
+    libc-support-tests
+  SRCS
+    utf8_to_32_test.cpp 
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
+
 add_libc_test(
   utf32_to_8_test
   SUITE
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
new file mode 100644
index 000000000000..9cb059faa937
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -0,0 +1,196 @@
+//===-- Unittests for character_converter utf8->utf32 ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  char ch = 'A';
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_EQ(err, 0);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 65);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; //  car symbol
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ sigma symbol
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 8721);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  char_conv.push(static_cast<char8_t>(ch[3]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 129313);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch = static_cast<char>(0x80); // invalid starting bit sequence
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {
+      static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
+      static_cast<char>(0x00)}; // first and third bytes are invalid
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  // Prev byte was single byte so trying to push another should error.
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  // Last byte is invalid since it does not have correct starting sequence.
+  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                      static_cast<char>(0x80), static_cast<char>(0xC0)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  // Should produce an error on 3rd byte
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+
+  // Should produce an error since mbstate was reset
+  auto wch = char_conv.pop_utf32();
+  ASSERT_FALSE(wch.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+
+  // Second two byte character
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 460);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_FALSE(
+      wch.has_value()); // Should fail since we have not read enough bytes
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}

From 65d590e8d012df9dabbf8b3ec929fd1543c7398a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 17:25:09 +0100
Subject: [PATCH 0666/1322] [X86] combineLogicBlendIntoConditionalNegate -
 convert to SDPatternMatch matching. NFC. (#144536)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 12fcc614ab25..4cff42c2ac46 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47591,27 +47591,19 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
 static SDValue combineLogicBlendIntoConditionalNegate(
     EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
   EVT MaskVT = Mask.getValueType();
   assert(MaskVT.isInteger() &&
          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
          "Mask must be zero/all-bits");
 
-  if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
+  if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
+      !DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
     return SDValue();
-  if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
-    return SDValue();
-
-  auto IsNegV = [](SDNode *N, SDValue V) {
-    return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
-           ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
-  };
 
   SDValue V;
-  if (IsNegV(Y.getNode(), X))
-    V = X;
-  else if (IsNegV(X.getNode(), Y))
-    V = Y;
-  else
+  if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
+      !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
     return SDValue();
 
   SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);

From c66be289901b3f035187d391e80e3610d7d6232e Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Tue, 17 Jun 2025 18:31:06 +0200
Subject: [PATCH 0667/1322] [clang][bytecode] Allocate IntegralAP and Floating
 types using an allocator (#144246)

Both `APInt` and `APFloat` will heap-allocate memory themselves using
the system allocator when the size of their data exceeds 64 bits.

This is why clang has `APNumericStorage`, which allocates its memory
using an allocator (via `ASTContext`) instead. Calling `getValue()` on
an ast node like that will then create a new `APInt`/`APFloat` , which
will copy the data (in the `APFloat` case, we even copy it twice).
That's sad but whatever.

In the bytecode interpreter, we have a similar problem. Large integers
and floating-point values are placement-new allocated into the
`InterpStack` (or into the bytecode, which is a `vector<std::byte>`).
When we then later interrupt interpretation, we don't run the destructor
for all items on the stack, which means we leak the memory the
`APInt`/`APFloat` (which backs the `IntegralAP`/`Floating` the
interpreter uses).

Fix this by using an approach similar to the one used in the AST. Add an
allocator to `InterpState`, which is used for temporaries and local
values. Those values will be freed at the end of interpretation. For
global variables, we need to promote the values to global lifetime,
which we do via `InitGlobal` and `FinishInitGlobal` ops.

Interestingly, this results in a slight _improvement_ in compile times:
https://llvm-compile-time-tracker.com/compare.php?from=6bfcdda9b1ddf0900f82f7e30cb5e3253a791d50&to=88d1d899127b408f0fb0f385c2c58e6283195049&stat=instructions:u
(but don't ask me why).

Fixes https://github.com/llvm/llvm-project/issues/139012
---
 clang/lib/AST/ByteCode/Compiler.cpp           | 122 ++++---
 clang/lib/AST/ByteCode/Compiler.h             |   1 +
 clang/lib/AST/ByteCode/Descriptor.cpp         |   2 +-
 clang/lib/AST/ByteCode/Disasm.cpp             |  58 ++-
 clang/lib/AST/ByteCode/Floating.h             | 252 ++++++++-----
 clang/lib/AST/ByteCode/Integral.h             |   3 +
 clang/lib/AST/ByteCode/IntegralAP.h           | 233 +++++++-----
 clang/lib/AST/ByteCode/Interp.cpp             | 106 +++++-
 clang/lib/AST/ByteCode/Interp.h               | 341 ++++++++++++++----
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  55 ++-
 .../lib/AST/ByteCode/InterpBuiltinBitCast.cpp |   4 +-
 clang/lib/AST/ByteCode/InterpState.h          |  30 ++
 clang/lib/AST/ByteCode/Opcodes.td             |  14 +-
 clang/lib/AST/ByteCode/PrimType.h             |  17 +
 clang/lib/AST/ByteCode/Program.h              |  24 +-
 .../ByteCode/builtin-bit-cast-long-double.cpp |  10 +-
 clang/test/AST/ByteCode/builtin-functions.cpp |  12 +-
 17 files changed, 936 insertions(+), 348 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 9fe4803ce98e..3f884ed8d094 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -748,7 +748,8 @@ bool Compiler<Emitter>::VisitFloatingLiteral(const FloatingLiteral *E) {
   if (DiscardResult)
     return true;
 
-  return this->emitConstFloat(E->getValue(), E);
+  APFloat F = E->getValue();
+  return this->emitFloat(F, E);
 }
 
 template <class Emitter>
@@ -4185,8 +4186,10 @@ bool Compiler<Emitter>::visitZeroInitializer(PrimType T, QualType QT,
                              nullptr, E);
   case PT_MemberPtr:
     return this->emitNullMemberPtr(0, nullptr, E);
-  case PT_Float:
-    return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E);
+  case PT_Float: {
+    APFloat F = APFloat::getZero(Ctx.getFloatSemantics(QT));
+    return this->emitFloat(F, E);
+  }
   case PT_FixedPoint: {
     auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType());
     return this->emitConstFixedPoint(FixedPoint::zero(Sem), E);
@@ -4674,10 +4677,7 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       if (!visitInitializer(Init))
         return false;
 
-      if (!this->emitFinishInit(Init))
-        return false;
-
-      return this->emitPopPtr(Init);
+      return this->emitFinishInitGlobal(Init);
     };
 
     DeclScope<Emitter> LocalScope(this, VD);
@@ -4698,51 +4698,45 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       return false;
 
     return !Init || (checkDecl() && initGlobal(*GlobalIndex));
-  } else {
-    InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
-
-    if (VarT) {
-      unsigned Offset = this->allocateLocalPrimitive(
-          VD, *VarT, VD->getType().isConstQualified(), nullptr,
-          ScopeKind::Block, IsConstexprUnknown);
-      if (Init) {
-        // If this is a toplevel declaration, create a scope for the
-        // initializer.
-        if (Toplevel) {
-          LocalScope<Emitter> Scope(this);
-          if (!this->visit(Init))
-            return false;
-          return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
-        } else {
-          if (!this->visit(Init))
-            return false;
-          return this->emitSetLocal(*VarT, Offset, VD);
-        }
-      }
-    } else {
-      if (std::optional<unsigned> Offset =
-              this->allocateLocal(VD, VD->getType(), nullptr, ScopeKind::Block,
-                                  IsConstexprUnknown)) {
-        if (!Init)
-          return true;
-
-        if (!this->emitGetPtrLocal(*Offset, Init))
-          return false;
-
-        if (!visitInitializer(Init))
-          return false;
-
-        if (!this->emitFinishInit(Init))
-          return false;
-
-        return this->emitPopPtr(Init);
-      }
-      return false;
-    }
-    return true;
   }
+  // Local variables.
+  InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
 
-  return false;
+  if (VarT) {
+    unsigned Offset = this->allocateLocalPrimitive(
+        VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block,
+        IsConstexprUnknown);
+    if (Init) {
+      // If this is a toplevel declaration, create a scope for the
+      // initializer.
+      if (Toplevel) {
+        LocalScope<Emitter> Scope(this);
+        if (!this->visit(Init))
+          return false;
+        return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
+      } else {
+        if (!this->visit(Init))
+          return false;
+        return this->emitSetLocal(*VarT, Offset, VD);
+      }
+    }
+  } else {
+    if (std::optional<unsigned> Offset = this->allocateLocal(
+            VD, VD->getType(), nullptr, ScopeKind::Block, IsConstexprUnknown)) {
+      if (!Init)
+        return true;
+
+      if (!this->emitGetPtrLocal(*Offset, Init))
+        return false;
+
+      if (!visitInitializer(Init))
+        return false;
+
+      return this->emitFinishInitPop(Init);
+    }
+    return false;
+  }
+  return true;
 }
 
 template <class Emitter>
@@ -4751,8 +4745,10 @@ bool Compiler<Emitter>::visitAPValue(const APValue &Val, PrimType ValType,
   assert(!DiscardResult);
   if (Val.isInt())
     return this->emitConst(Val.getInt(), ValType, E);
-  else if (Val.isFloat())
-    return this->emitConstFloat(Val.getFloat(), E);
+  else if (Val.isFloat()) {
+    APFloat F = Val.getFloat();
+    return this->emitFloat(F, E);
+  }
 
   if (Val.isLValue()) {
     if (Val.isNullPointer())
@@ -6133,8 +6129,10 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
+      APFloat F(TargetSemantics, 1);
+      if (!this->emitFloat(F, E))
         return false;
+
       if (!this->emitAddf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6176,8 +6174,10 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
+      APFloat F(TargetSemantics, 1);
+      if (!this->emitFloat(F, E))
         return false;
+
       if (!this->emitSubf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6953,6 +6953,20 @@ bool Compiler<Emitter>::emitDummyPtr(const DeclTy &D, const Expr *E) {
   return true;
 }
 
+template <class Emitter>
+bool Compiler<Emitter>::emitFloat(const APFloat &F, const Expr *E) {
+  assert(!DiscardResult && "Should've been checked before");
+
+  if (Floating::singleWord(F.getSemantics()))
+    return this->emitConstFloat(Floating(F), E);
+
+  APInt I = F.bitcastToAPInt();
+  return this->emitConstFloat(
+      Floating(const_cast<uint64_t *>(I.getRawData()),
+               llvm::APFloatBase::SemanticsToEnum(F.getSemantics())),
+      E);
+}
+
 //  This function is constexpr if and only if To, From, and the types of
 //  all subobjects of To and From are types T such that...
 //  (3.1) - is_union_v<T> is false;
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index ac3ad84766dc..a1d068cc7e0a 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -391,6 +391,7 @@ private:
   bool emitRecordDestruction(const Record *R, SourceInfo Loc);
   bool emitDestruction(const Descriptor *Desc, SourceInfo Loc);
   bool emitDummyPtr(const DeclTy &D, const Expr *E);
+  bool emitFloat(const APFloat &F, const Expr *E);
   unsigned collectBaseOffset(const QualType BaseType,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp
index 5531295dfa2f..46e4d0d940b3 100644
--- a/clang/lib/AST/ByteCode/Descriptor.cpp
+++ b/clang/lib/AST/ByteCode/Descriptor.cpp
@@ -368,7 +368,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsTemporary, bool IsConst, UnknownSize)
     : Source(D), ElemSize(primSize(Type)), Size(UnknownSizeMark),
       MDSize(MD.value_or(0)),
-      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)),
+      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)), PrimT(Type),
       IsConst(IsConst), IsMutable(false), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
       DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 846dc2fe92a7..7c6b78386b14 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -50,34 +50,56 @@ inline static std::string printArg(Program &P, CodePtr &OpPC) {
 }
 
 template <> inline std::string printArg<Floating>(Program &P, CodePtr &OpPC) {
-  auto F = Floating::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  auto Sem = Floating::deserializeSemantics(*OpPC);
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  unsigned BitWidth = llvm::APFloatBase::semanticsSizeInBits(
+      llvm::APFloatBase::EnumToSemantics(Sem));
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
+  Floating Result(Memory.get(), Sem);
+  Floating::deserialize(*OpPC, &Result);
+
+  OpPC += align(Result.bytesToSerialize());
+
+  std::string S;
+  llvm::raw_string_ostream SS(S);
+  SS << Result;
+  return S;
 }
 
 template <>
 inline std::string printArg<IntegralAP<false>>(Program &P, CodePtr &OpPC) {
-  auto F = IntegralAP<false>::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  using T = IntegralAP<false>;
+  unsigned BitWidth = T::deserializeSize(*OpPC);
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  T Result(Memory.get(), BitWidth);
+  T::deserialize(*OpPC, &Result);
+
+  OpPC += Result.bytesToSerialize();
+  std::string Str;
+  llvm::raw_string_ostream SS(Str);
+  SS << Result;
+  return Str;
 }
+
 template <>
 inline std::string printArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
-  auto F = IntegralAP<true>::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  using T = IntegralAP<true>;
+  unsigned BitWidth = T::deserializeSize(*OpPC);
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  T Result(Memory.get(), BitWidth);
+  T::deserialize(*OpPC, &Result);
+
+  std::string Str;
+  llvm::raw_string_ostream SS(Str);
+  SS << Result;
+
+  OpPC += Result.bytesToSerialize();
+  return Str;
 }
 
 template <> inline std::string printArg<FixedPoint>(Program &P, CodePtr &OpPC) {
diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
index 3750568fc23c..659892e720ab 100644
--- a/clang/lib/AST/ByteCode/Floating.h
+++ b/clang/lib/AST/ByteCode/Floating.h
@@ -17,63 +17,79 @@
 #include "clang/AST/APValue.h"
 #include "llvm/ADT/APFloat.h"
 
+// XXX This is just a debugging help. Setting this to 1 will heap-allocate ALL
+// floating values.
+#define ALLOCATE_ALL 0
+
 namespace clang {
 namespace interp {
 
 using APFloat = llvm::APFloat;
 using APSInt = llvm::APSInt;
+using APInt = llvm::APInt;
 
+/// If a Floating is constructed from Memory, it DOES NOT OWN THAT MEMORY.
+/// It will NOT copy the memory (unless, of course, copy() is called) and it
+/// won't alllocate anything. The allocation should happen via InterpState or
+/// Program.
 class Floating final {
 private:
-  // The underlying value storage.
-  APFloat F;
+  union {
+    uint64_t Val = 0;
+    uint64_t *Memory;
+  };
+  llvm::APFloatBase::Semantics Semantics;
+
+  APFloat getValue() const {
+    unsigned BitWidth = bitWidth();
+    if (singleWord())
+      return APFloat(getSemantics(), APInt(BitWidth, Val));
+    unsigned NumWords = numWords();
+    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
+  }
 
 public:
-  /// Zero-initializes a Floating.
-  Floating() : F(0.0f) {}
-  Floating(const APFloat &F) : F(F) {}
+  Floating() = default;
+  Floating(llvm::APFloatBase::Semantics Semantics)
+      : Val(0), Semantics(Semantics) {}
+  Floating(const APFloat &F) {
 
-  // Static constructors for special floating point values.
-  static Floating getInf(const llvm::fltSemantics &Sem) {
-    return Floating(APFloat::getInf(Sem));
+    Semantics = llvm::APFloatBase::SemanticsToEnum(F.getSemantics());
+    this->copy(F);
   }
-  const APFloat &getAPFloat() const { return F; }
+  Floating(uint64_t *Memory, llvm::APFloatBase::Semantics Semantics)
+      : Memory(Memory), Semantics(Semantics) {}
 
-  bool operator<(Floating RHS) const { return F < RHS.F; }
-  bool operator>(Floating RHS) const { return F > RHS.F; }
-  bool operator<=(Floating RHS) const { return F <= RHS.F; }
-  bool operator>=(Floating RHS) const { return F >= RHS.F; }
-  bool operator==(Floating RHS) const { return F == RHS.F; }
-  bool operator!=(Floating RHS) const { return F != RHS.F; }
-  Floating operator-() const { return Floating(-F); }
+  APFloat getAPFloat() const { return getValue(); }
+
+  bool operator<(Floating RHS) const { return getValue() < RHS.getValue(); }
+  bool operator>(Floating RHS) const { return getValue() > RHS.getValue(); }
+  bool operator<=(Floating RHS) const { return getValue() <= RHS.getValue(); }
+  bool operator>=(Floating RHS) const { return getValue() >= RHS.getValue(); }
 
   APFloat::opStatus convertToInteger(APSInt &Result) const {
     bool IsExact;
-    return F.convertToInteger(Result, llvm::APFloat::rmTowardZero, &IsExact);
+    return getValue().convertToInteger(Result, llvm::APFloat::rmTowardZero,
+                                       &IsExact);
   }
 
-  Floating toSemantics(const llvm::fltSemantics *Sem,
-                       llvm::RoundingMode RM) const {
-    APFloat Copy = F;
+  void toSemantics(const llvm::fltSemantics *Sem, llvm::RoundingMode RM,
+                   Floating *Result) const {
+    APFloat Copy = getValue();
     bool LosesInfo;
     Copy.convert(*Sem, RM, &LosesInfo);
     (void)LosesInfo;
-    return Floating(Copy);
-  }
-
-  /// Convert this Floating to one with the same semantics as \Other.
-  Floating toSemantics(const Floating &Other, llvm::RoundingMode RM) const {
-    return toSemantics(&Other.F.getSemantics(), RM);
+    Result->copy(Copy);
   }
 
   APSInt toAPSInt(unsigned NumBits = 0) const {
-    return APSInt(F.bitcastToAPInt());
+    return APSInt(getValue().bitcastToAPInt());
   }
-  APValue toAPValue(const ASTContext &) const { return APValue(F); }
+  APValue toAPValue(const ASTContext &) const { return APValue(getValue()); }
   void print(llvm::raw_ostream &OS) const {
     // Can't use APFloat::print() since it appends a newline.
     SmallVector<char, 16> Buffer;
-    F.toString(Buffer);
+    getValue().toString(Buffer);
     OS << Buffer;
   }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
@@ -83,25 +99,62 @@ public:
     return NameStr;
   }
 
-  unsigned bitWidth() const { return F.semanticsSizeInBits(F.getSemantics()); }
+  unsigned bitWidth() const {
+    return llvm::APFloatBase::semanticsSizeInBits(getSemantics());
+  }
+  unsigned numWords() const { return llvm::APInt::getNumWords(bitWidth()); }
+  bool singleWord() const {
+#if ALLOCATE_ALL
+    return false;
+#endif
+    return numWords() == 1;
+  }
+  static bool singleWord(const llvm::fltSemantics &Sem) {
+#if ALLOCATE_ALL
+    return false;
+#endif
+    return APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem)) == 1;
+  }
+  const llvm::fltSemantics &getSemantics() const {
+    return llvm::APFloatBase::EnumToSemantics(Semantics);
+  }
+
+  void copy(const APFloat &F) {
+    if (singleWord()) {
+      Val = F.bitcastToAPInt().getZExtValue();
+    } else {
+      assert(Memory);
+      std::memcpy(Memory, F.bitcastToAPInt().getRawData(),
+                  numWords() * sizeof(uint64_t));
+    }
+  }
+
+  void take(uint64_t *NewMemory) {
+    if (singleWord())
+      return;
+
+    if (Memory)
+      std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
+    Memory = NewMemory;
+  }
 
   bool isSigned() const { return true; }
-  bool isNegative() const { return F.isNegative(); }
-  bool isZero() const { return F.isZero(); }
-  bool isNonZero() const { return F.isNonZero(); }
-  bool isMin() const { return F.isSmallest(); }
-  bool isMinusOne() const { return F.isExactlyValue(-1.0); }
-  bool isNan() const { return F.isNaN(); }
-  bool isSignaling() const { return F.isSignaling(); }
-  bool isInf() const { return F.isInfinity(); }
-  bool isFinite() const { return F.isFinite(); }
-  bool isNormal() const { return F.isNormal(); }
-  bool isDenormal() const { return F.isDenormal(); }
-  llvm::FPClassTest classify() const { return F.classify(); }
-  APFloat::fltCategory getCategory() const { return F.getCategory(); }
+  bool isNegative() const { return getValue().isNegative(); }
+  bool isZero() const { return getValue().isZero(); }
+  bool isNonZero() const { return getValue().isNonZero(); }
+  bool isMin() const { return getValue().isSmallest(); }
+  bool isMinusOne() const { return getValue().isExactlyValue(-1.0); }
+  bool isNan() const { return getValue().isNaN(); }
+  bool isSignaling() const { return getValue().isSignaling(); }
+  bool isInf() const { return getValue().isInfinity(); }
+  bool isFinite() const { return getValue().isFinite(); }
+  bool isNormal() const { return getValue().isNormal(); }
+  bool isDenormal() const { return getValue().isDenormal(); }
+  llvm::FPClassTest classify() const { return getValue().classify(); }
+  APFloat::fltCategory getCategory() const { return getValue().getCategory(); }
 
   ComparisonCategoryResult compare(const Floating &RHS) const {
-    llvm::APFloatBase::cmpResult CmpRes = F.compare(RHS.F);
+    llvm::APFloatBase::cmpResult CmpRes = getValue().compare(RHS.getValue());
     switch (CmpRes) {
     case llvm::APFloatBase::cmpLessThan:
       return ComparisonCategoryResult::Less;
@@ -118,97 +171,130 @@ public:
   static APFloat::opStatus fromIntegral(APSInt Val,
                                         const llvm::fltSemantics &Sem,
                                         llvm::RoundingMode RM,
-                                        Floating &Result) {
+                                        Floating *Result) {
     APFloat F = APFloat(Sem);
     APFloat::opStatus Status = F.convertFromAPInt(Val, Val.isSigned(), RM);
-    Result = Floating(F);
+    Result->copy(F);
     return Status;
   }
 
-  static Floating bitcastFromMemory(const std::byte *Buff,
-                                    const llvm::fltSemantics &Sem) {
+  static void bitcastFromMemory(const std::byte *Buff,
+                                const llvm::fltSemantics &Sem,
+                                Floating *Result) {
     size_t Size = APFloat::semanticsSizeInBits(Sem);
     llvm::APInt API(Size, true);
     llvm::LoadIntFromMemory(API, (const uint8_t *)Buff, Size / 8);
-
-    return Floating(APFloat(Sem, API));
+    Result->copy(APFloat(Sem, API));
   }
 
   void bitcastToMemory(std::byte *Buff) const {
-    llvm::APInt API = F.bitcastToAPInt();
+    llvm::APInt API = getValue().bitcastToAPInt();
     llvm::StoreIntToMemory(API, (uint8_t *)Buff, bitWidth() / 8);
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    return sizeof(llvm::fltSemantics *) +
-           (APFloat::semanticsSizeInBits(F.getSemantics()) / 8);
+    return sizeof(Semantics) + (numWords() * sizeof(uint64_t));
   }
 
   void serialize(std::byte *Buff) const {
-    // Semantics followed by an APInt.
-    *reinterpret_cast<const llvm::fltSemantics **>(Buff) = &F.getSemantics();
-
-    llvm::APInt API = F.bitcastToAPInt();
-    llvm::StoreIntToMemory(API, (uint8_t *)(Buff + sizeof(void *)),
-                           bitWidth() / 8);
+    std::memcpy(Buff, &Semantics, sizeof(Semantics));
+    if (singleWord()) {
+      std::memcpy(Buff + sizeof(Semantics), &Val, sizeof(uint64_t));
+    } else {
+      std::memcpy(Buff + sizeof(Semantics), Memory,
+                  numWords() * sizeof(uint64_t));
+    }
   }
 
-  static Floating deserialize(const std::byte *Buff) {
-    const llvm::fltSemantics *Sem;
-    std::memcpy((void *)&Sem, Buff, sizeof(void *));
-    return bitcastFromMemory(Buff + sizeof(void *), *Sem);
+  static llvm::APFloatBase::Semantics
+  deserializeSemantics(const std::byte *Buff) {
+    return *reinterpret_cast<const llvm::APFloatBase::Semantics *>(Buff);
   }
 
-  static Floating abs(const Floating &F) {
-    APFloat V = F.F;
-    if (V.isNegative())
-      V.changeSign();
-    return Floating(V);
+  static void deserialize(const std::byte *Buff, Floating *Result) {
+    llvm::APFloatBase::Semantics Semantics;
+    std::memcpy(&Semantics, Buff, sizeof(Semantics));
+
+    unsigned BitWidth = llvm::APFloat::semanticsSizeInBits(
+        llvm::APFloatBase::EnumToSemantics(Semantics));
+    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
+
+    Result->Semantics = Semantics;
+    if (NumWords == 1 && !ALLOCATE_ALL) {
+      std::memcpy(&Result->Val, Buff + sizeof(Semantics), sizeof(uint64_t));
+    } else {
+      assert(Result->Memory);
+      std::memcpy(Result->Memory, Buff + sizeof(Semantics),
+                  NumWords * sizeof(uint64_t));
+    }
   }
 
   // -------
 
   static APFloat::opStatus add(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.add(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.add(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus increment(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.F.getSemantics(), 1);
-    *R = Floating(A.F);
-    return R->F.add(One, RM);
+    APFloat One(A.getSemantics(), 1);
+    APFloat LHS = A.getValue();
+
+    auto Status = LHS.add(One, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus sub(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.subtract(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.subtract(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus decrement(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.F.getSemantics(), 1);
-    *R = Floating(A.F);
-    return R->F.subtract(One, RM);
+    APFloat One(A.getSemantics(), 1);
+    APFloat LHS = A.getValue();
+
+    auto Status = LHS.subtract(One, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus mul(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.multiply(B.F, RM);
+
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.multiply(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus div(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.divide(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.divide(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static bool neg(const Floating &A, Floating *R) {
-    *R = -A;
+    R->copy(-A.getValue());
     return false;
   }
 };
diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h
index 13fdb5369f2b..af5cd2d13ecc 100644
--- a/clang/lib/AST/ByteCode/Integral.h
+++ b/clang/lib/AST/ByteCode/Integral.h
@@ -99,6 +99,9 @@ public:
   bool operator>=(Integral RHS) const { return V >= RHS.V; }
   bool operator==(Integral RHS) const { return V == RHS.V; }
   bool operator!=(Integral RHS) const { return V != RHS.V; }
+  bool operator>=(unsigned RHS) const {
+    return static_cast<unsigned>(V) >= RHS;
+  }
 
   bool operator>(unsigned RHS) const {
     return V >= 0 && static_cast<unsigned>(V) > RHS;
diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 8ee08dfb5cfe..259262bdc524 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -28,12 +28,19 @@ namespace interp {
 
 using APInt = llvm::APInt;
 using APSInt = llvm::APSInt;
-template <unsigned Bits, bool Signed> class Integral;
 
+/// If an IntegralAP is constructed from Memory, it DOES NOT OWN THAT MEMORY.
+/// It will NOT copy the memory (unless, of course, copy() is called) and it
+/// won't alllocate anything. The allocation should happen via InterpState or
+/// Program.
 template <bool Signed> class IntegralAP final {
-private:
+public:
+  union {
+    uint64_t *Memory = nullptr;
+    uint64_t Val;
+  };
+  unsigned BitWidth = 0;
   friend IntegralAP<!Signed>;
-  APInt V;
 
   template <typename T, bool InputSigned>
   static T truncateCast(const APInt &V) {
@@ -52,106 +59,129 @@ private:
                                : V.trunc(BitSize).getZExtValue();
   }
 
+  APInt getValue() const {
+    if (singleWord())
+      return APInt(BitWidth, Val, Signed);
+    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
+    return llvm::APInt(BitWidth, NumWords, Memory);
+  }
+
 public:
   using AsUnsigned = IntegralAP<false>;
 
-  template <typename T>
-  IntegralAP(T Value, unsigned BitWidth)
-      : V(APInt(BitWidth, static_cast<uint64_t>(Value), Signed)) {}
+  void take(uint64_t *NewMemory) {
+    assert(!singleWord());
+    std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
+    Memory = NewMemory;
+  }
 
-  IntegralAP(APInt V) : V(V) {}
-  /// Arbitrary value for uninitialized variables.
-  IntegralAP() : IntegralAP(Signed ? -1 : 7, 3) {}
+  void copy(const APInt &V) {
+    assert(BitWidth == V.getBitWidth());
+    assert(numWords() == V.getNumWords());
 
-  IntegralAP operator-() const { return IntegralAP(-V); }
+    if (V.isSingleWord()) {
+      if constexpr (Signed)
+        Val = V.getSExtValue();
+      else
+        Val = V.getZExtValue();
+      return;
+    }
+    assert(Memory);
+    std::memcpy(Memory, V.getRawData(), V.getNumWords() * sizeof(uint64_t));
+  }
+
+  // Constructors.
+  IntegralAP() = default;
+  IntegralAP(unsigned BitWidth) : BitWidth(BitWidth) {}
+  IntegralAP(uint64_t *Memory, unsigned BitWidth)
+      : Memory(Memory), BitWidth(BitWidth) {}
+  IntegralAP(const APInt &V)
+      : IntegralAP(const_cast<uint64_t *>((const uint64_t *)V.getRawData()),
+                   V.getBitWidth()) {}
+
+  IntegralAP operator-() const { return IntegralAP(-getValue()); }
   IntegralAP operator-(const IntegralAP &Other) const {
-    return IntegralAP(V - Other.V);
+    return IntegralAP(getValue() - Other.getValue());
   }
   bool operator>(const IntegralAP &RHS) const {
     if constexpr (Signed)
-      return V.ugt(RHS.V);
-    return V.sgt(RHS.V);
+      return getValue().sgt(RHS.getValue());
+    return getValue().ugt(RHS.getValue());
   }
-  bool operator>=(IntegralAP RHS) const {
+  bool operator>=(unsigned RHS) const {
     if constexpr (Signed)
-      return V.uge(RHS.V);
-    return V.sge(RHS.V);
+      return getValue().sge(RHS);
+    return getValue().uge(RHS);
   }
   bool operator<(IntegralAP RHS) const {
     if constexpr (Signed)
-      return V.slt(RHS.V);
-    return V.slt(RHS.V);
-  }
-  bool operator<=(IntegralAP RHS) const {
-    if constexpr (Signed)
-      return V.ult(RHS.V);
-    return V.ult(RHS.V);
+      return getValue().slt(RHS.getValue());
+    return getValue().ult(RHS.getValue());
   }
 
   template <typename Ty, typename = std::enable_if_t<std::is_integral_v<Ty>>>
   explicit operator Ty() const {
-    return truncateCast<Ty, Signed>(V);
+    return truncateCast<Ty, Signed>(getValue());
   }
 
   template <typename T> static IntegralAP from(T Value, unsigned NumBits = 0) {
+    if (NumBits == 0)
+      NumBits = sizeof(T) * 8;
     assert(NumBits > 0);
     APInt Copy = APInt(NumBits, static_cast<uint64_t>(Value), Signed);
-
+    assert(false);
     return IntegralAP<Signed>(Copy);
   }
 
+  static IntegralAP from(const APInt &Value) {
+    return IntegralAP<Signed>(Value);
+  }
+
   template <bool InputSigned>
   static IntegralAP from(IntegralAP<InputSigned> V, unsigned NumBits = 0) {
     if (NumBits == 0)
       NumBits = V.bitWidth();
 
     if constexpr (InputSigned)
-      return IntegralAP<Signed>(V.V.sextOrTrunc(NumBits));
-    return IntegralAP<Signed>(V.V.zextOrTrunc(NumBits));
+      return IntegralAP<Signed>(V.getValue().sextOrTrunc(NumBits));
+    return IntegralAP<Signed>(V.getValue().zextOrTrunc(NumBits));
   }
 
-  template <unsigned Bits, bool InputSigned>
-  static IntegralAP from(Integral<Bits, InputSigned> I, unsigned BitWidth) {
-    return IntegralAP<Signed>(I.toAPInt(BitWidth));
-  }
-
-  static IntegralAP zero(int32_t BitWidth) {
-    APInt V = APInt(BitWidth, 0LL, Signed);
-    return IntegralAP(V);
-  }
-
-  constexpr unsigned bitWidth() const { return V.getBitWidth(); }
+  constexpr unsigned bitWidth() const { return BitWidth; }
+  constexpr unsigned numWords() const { return APInt::getNumWords(BitWidth); }
+  constexpr bool singleWord() const { return numWords() == 1; }
 
   APSInt toAPSInt(unsigned Bits = 0) const {
     if (Bits == 0)
       Bits = bitWidth();
 
+    APInt V = getValue();
     if constexpr (Signed)
-      return APSInt(V.sext(Bits), !Signed);
+      return APSInt(getValue().sext(Bits), !Signed);
     else
-      return APSInt(V.zext(Bits), !Signed);
+      return APSInt(getValue().zext(Bits), !Signed);
   }
   APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
-  bool isZero() const { return V.isZero(); }
+  bool isZero() const { return getValue().isZero(); }
   bool isPositive() const {
     if constexpr (Signed)
-      return V.isNonNegative();
+      return getValue().isNonNegative();
     return true;
   }
   bool isNegative() const {
     if constexpr (Signed)
-      return !V.isNonNegative();
+      return !getValue().isNonNegative();
     return false;
   }
-  bool isMin() const { return V.isMinValue(); }
-  bool isMax() const { return V.isMaxValue(); }
+  bool isMin() const { return getValue().isMinValue(); }
+  bool isMax() const { return getValue().isMaxValue(); }
   static constexpr bool isSigned() { return Signed; }
-  bool isMinusOne() const { return Signed && V == -1; }
+  bool isMinusOne() const { return Signed && getValue().isAllOnes(); }
 
-  unsigned countLeadingZeros() const { return V.countl_zero(); }
+  unsigned countLeadingZeros() const { return getValue().countl_zero(); }
 
-  void print(llvm::raw_ostream &OS) const { V.print(OS, Signed);}
+  void print(llvm::raw_ostream &OS) const { getValue().print(OS, Signed); }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
     std::string NameStr;
     llvm::raw_string_ostream OS(NameStr);
@@ -161,53 +191,64 @@ public:
 
   IntegralAP truncate(unsigned BitWidth) const {
     if constexpr (Signed)
-      return IntegralAP(V.trunc(BitWidth).sextOrTrunc(this->bitWidth()));
+      return IntegralAP(
+          getValue().trunc(BitWidth).sextOrTrunc(this->bitWidth()));
     else
-      return IntegralAP(V.trunc(BitWidth).zextOrTrunc(this->bitWidth()));
+      return IntegralAP(
+          getValue().trunc(BitWidth).zextOrTrunc(this->bitWidth()));
   }
 
   IntegralAP<false> toUnsigned() const {
-    APInt Copy = V;
-    return IntegralAP<false>(Copy);
+    return IntegralAP<false>(Memory, BitWidth);
   }
 
   void bitcastToMemory(std::byte *Dest) const {
-    llvm::StoreIntToMemory(V, (uint8_t *)Dest, bitWidth() / 8);
+    llvm::StoreIntToMemory(getValue(), (uint8_t *)Dest, bitWidth() / 8);
   }
 
   static IntegralAP bitcastFromMemory(const std::byte *Src, unsigned BitWidth) {
+    // FIXME: Remove this.
     APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
     llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
     return IntegralAP(V);
   }
 
+  static void bitcastFromMemory(const std::byte *Src, unsigned BitWidth,
+                                IntegralAP *Result) {
+    APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
+    llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
+    Result->copy(V);
+  }
+
   ComparisonCategoryResult compare(const IntegralAP &RHS) const {
     assert(Signed == RHS.isSigned());
     assert(bitWidth() == RHS.bitWidth());
+    APInt V1 = getValue();
+    APInt V2 = RHS.getValue();
     if constexpr (Signed) {
-      if (V.slt(RHS.V))
+      if (V1.slt(V2))
         return ComparisonCategoryResult::Less;
-      if (V.sgt(RHS.V))
+      if (V1.sgt(V2))
         return ComparisonCategoryResult::Greater;
       return ComparisonCategoryResult::Equal;
     }
 
     assert(!Signed);
-    if (V.ult(RHS.V))
+    if (V1.ult(V2))
       return ComparisonCategoryResult::Less;
-    if (V.ugt(RHS.V))
+    if (V1.ugt(V2))
       return ComparisonCategoryResult::Greater;
     return ComparisonCategoryResult::Equal;
   }
 
   static bool increment(IntegralAP A, IntegralAP *R) {
-    IntegralAP<Signed> One(1, A.bitWidth());
-    return add(A, One, A.bitWidth() + 1, R);
+    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
+    return add(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
   }
 
   static bool decrement(IntegralAP A, IntegralAP *R) {
-    IntegralAP<Signed> One(1, A.bitWidth());
-    return sub(A, One, A.bitWidth() + 1, R);
+    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
+    return sub(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
   }
 
   static bool add(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
@@ -224,87 +265,95 @@ public:
 
   static bool rem(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      *R = IntegralAP(A.V.srem(B.V));
+      R->copy(A.getValue().srem(B.getValue()));
     else
-      *R = IntegralAP(A.V.urem(B.V));
+      R->copy(A.getValue().urem(B.getValue()));
     return false;
   }
 
   static bool div(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      *R = IntegralAP(A.V.sdiv(B.V));
+      R->copy(A.getValue().sdiv(B.getValue()));
     else
-      *R = IntegralAP(A.V.udiv(B.V));
+      R->copy(A.getValue().udiv(B.getValue()));
     return false;
   }
 
   static bool bitAnd(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    *R = IntegralAP(A.V & B.V);
+    R->copy(A.getValue() & B.getValue());
     return false;
   }
 
   static bool bitOr(IntegralAP A, IntegralAP B, unsigned OpBits,
                     IntegralAP *R) {
-    *R = IntegralAP(A.V | B.V);
+    R->copy(A.getValue() | B.getValue());
     return false;
   }
 
   static bool bitXor(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    *R = IntegralAP(A.V ^ B.V);
+    R->copy(A.getValue() ^ B.getValue());
     return false;
   }
 
   static bool neg(const IntegralAP &A, IntegralAP *R) {
-    APInt AI = A.V;
+    APInt AI = A.getValue();
     AI.negate();
-    *R = IntegralAP(AI);
+    R->copy(AI);
     return false;
   }
 
   static bool comp(IntegralAP A, IntegralAP *R) {
-    *R = IntegralAP(~A.V);
+    R->copy(~A.getValue());
     return false;
   }
 
   static void shiftLeft(const IntegralAP A, const IntegralAP B, unsigned OpBits,
                         IntegralAP *R) {
-    *R = IntegralAP(A.V.shl(B.V.getZExtValue()));
+    *R = IntegralAP(A.getValue().shl(B.getValue().getZExtValue()));
   }
 
   static void shiftRight(const IntegralAP A, const IntegralAP B,
                          unsigned OpBits, IntegralAP *R) {
-    unsigned ShiftAmount = B.V.getZExtValue();
+    unsigned ShiftAmount = B.getValue().getZExtValue();
     if constexpr (Signed)
-      *R = IntegralAP(A.V.ashr(ShiftAmount));
+      R->copy(A.getValue().ashr(ShiftAmount));
     else
-      *R = IntegralAP(A.V.lshr(ShiftAmount));
+      R->copy(A.getValue().lshr(ShiftAmount));
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    // 4 bytes for the BitWidth followed by N bytes for the actual APInt.
-    return sizeof(uint32_t) + (V.getBitWidth() / CHAR_BIT);
+    assert(BitWidth != 0);
+    uint32_t NumWords = llvm::APInt::getNumWords(bitWidth());
+    return sizeof(uint64_t) + (NumWords * sizeof(uint64_t));
   }
 
   void serialize(std::byte *Buff) const {
-    assert(V.getBitWidth() < std::numeric_limits<uint8_t>::max());
-    uint32_t BitWidth = V.getBitWidth();
-
-    std::memcpy(Buff, &BitWidth, sizeof(uint32_t));
-    llvm::StoreIntToMemory(V, (uint8_t *)(Buff + sizeof(uint32_t)),
-                           BitWidth / CHAR_BIT);
+    uint64_t NumWords = llvm::APInt::getNumWords(bitWidth());
+    std::memcpy(Buff, &BitWidth, sizeof(uint64_t));
+    if (singleWord())
+      std::memcpy(Buff + sizeof(uint64_t), &Val, NumWords * sizeof(uint64_t));
+    else
+      std::memcpy(Buff + sizeof(uint64_t), Memory, NumWords * sizeof(uint64_t));
   }
 
-  static IntegralAP<Signed> deserialize(const std::byte *Buff) {
-    uint32_t BitWidth;
-    std::memcpy(&BitWidth, Buff, sizeof(uint32_t));
-    IntegralAP<Signed> Val(APInt(BitWidth, 0ull, !Signed));
+  static uint32_t deserializeSize(const std::byte *Buff) {
+    return *reinterpret_cast<const uint64_t *>(Buff);
+  }
 
-    llvm::LoadIntFromMemory(Val.V, (const uint8_t *)Buff + sizeof(uint32_t),
-                            BitWidth / CHAR_BIT);
-    return Val;
+  static void deserialize(const std::byte *Buff, IntegralAP<Signed> *Result) {
+    uint32_t BitWidth = Result->BitWidth;
+    uint32_t NumWords = llvm::APInt::getNumWords(BitWidth);
+    assert(BitWidth == Result->BitWidth);
+    assert(Result->Memory);
+
+    if (NumWords == 1)
+      std::memcpy(&Result->Val, Buff + sizeof(uint64_t), sizeof(uint64_t));
+    else
+      std::memcpy(Result->Memory, Buff + sizeof(uint64_t),
+                  NumWords * sizeof(uint64_t));
   }
 
 private:
@@ -312,7 +361,7 @@ private:
   static bool CheckAddSubMulUB(const IntegralAP &A, const IntegralAP &B,
                                unsigned BitWidth, IntegralAP *R) {
     if constexpr (!Signed) {
-      R->V = Op<APInt>{}(A.V, B.V);
+      R->copy(Op<APInt>{}(A.getValue(), B.getValue()));
       return false;
     }
 
@@ -320,7 +369,7 @@ private:
     const APSInt &RHS = B.toAPSInt();
     APSInt Value = Op<APSInt>{}(LHS.extend(BitWidth), RHS.extend(BitWidth));
     APSInt Result = Value.trunc(LHS.getBitWidth());
-    R->V = Result;
+    R->copy(Result);
 
     return Result.extend(BitWidth) != Value;
   }
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 5c8abffb3a99..1e2032feabb6 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1935,8 +1935,10 @@ bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  S.Stk.push<IntegralAP<false>>(
-      IntegralAP<false>::from(Ptr.getIntegerRepresentation(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
+
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
@@ -1946,8 +1948,10 @@ bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  S.Stk.push<IntegralAP<true>>(
-      IntegralAP<true>::from(Ptr.getIntegerRepresentation(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
+
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2053,6 +2057,100 @@ bool arePotentiallyOverlappingStringLiterals(const Pointer &LHS,
   return Shorter == Longer.take_front(Shorter.size());
 }
 
+static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr,
+                                PrimType T) {
+
+  if (T == PT_IntAPS) {
+    auto &Val = Ptr.deref<IntegralAP<true>>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  } else if (T == PT_IntAP) {
+    auto &Val = Ptr.deref<IntegralAP<false>>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  } else if (T == PT_Float) {
+    auto &Val = Ptr.deref<Floating>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  }
+}
+
+template <typename T>
+static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr) {
+  assert(needsAlloc<T>());
+  auto &Val = Ptr.deref<T>();
+  if (!Val.singleWord()) {
+    uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+    Val.take(NewMemory);
+  }
+}
+
+static void finishGlobalRecurse(InterpState &S, const Pointer &Ptr) {
+  if (const Record *R = Ptr.getRecord()) {
+    for (const Record::Field &Fi : R->fields()) {
+      if (Fi.Desc->isPrimitive()) {
+        TYPE_SWITCH_ALLOC(Fi.Desc->getPrimType(), {
+          copyPrimitiveMemory<T>(S, Ptr.atField(Fi.Offset));
+        });
+        copyPrimitiveMemory(S, Ptr.atField(Fi.Offset), Fi.Desc->getPrimType());
+      } else
+        finishGlobalRecurse(S, Ptr.atField(Fi.Offset));
+    }
+    return;
+  }
+
+  if (const Descriptor *D = Ptr.getFieldDesc(); D && D->isArray()) {
+    unsigned NumElems = D->getNumElems();
+    if (NumElems == 0)
+      return;
+
+    if (D->isPrimitiveArray()) {
+      PrimType PT = D->getPrimType();
+      if (!needsAlloc(PT))
+        return;
+      assert(NumElems >= 1);
+      const Pointer EP = Ptr.atIndex(0);
+      bool AllSingleWord = true;
+      TYPE_SWITCH_ALLOC(PT, {
+        if (!EP.deref<T>().singleWord()) {
+          copyPrimitiveMemory<T>(S, EP);
+          AllSingleWord = false;
+        }
+      });
+      if (AllSingleWord)
+        return;
+      for (unsigned I = 1; I != D->getNumElems(); ++I) {
+        const Pointer EP = Ptr.atIndex(I);
+        copyPrimitiveMemory(S, EP, PT);
+      }
+    } else {
+      assert(D->isCompositeArray());
+      for (unsigned I = 0; I != D->getNumElems(); ++I) {
+        const Pointer EP = Ptr.atIndex(I).narrow();
+        finishGlobalRecurse(S, EP);
+      }
+    }
+  }
+}
+
+bool FinishInitGlobal(InterpState &S, CodePtr OpPC) {
+  const Pointer &Ptr = S.Stk.pop<Pointer>();
+
+  finishGlobalRecurse(S, Ptr);
+  if (Ptr.canBeInitialized()) {
+    Ptr.initialize();
+    Ptr.activate();
+  }
+
+  return true;
+}
+
 // https://github.com/llvm/llvm-project/issues/102513
 #if defined(_MSC_VER) && !defined(__clang__) && !defined(NDEBUG)
 #pragma optimize("", off)
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index ae3d4a441a79..66d3e6d79e8b 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -189,7 +189,7 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
 
   // C++11 [expr.shift]p1: Shift width must be less than the bit width of
   // the shifted type.
-  if (Bits > 1 && RHS >= RT::from(Bits, RHS.bitWidth())) {
+  if (Bits > 1 && RHS >= Bits) {
     const Expr *E = S.Current->getExpr(OpPC);
     const APSInt Val = RHS.toAPSInt();
     QualType Ty = E->getType();
@@ -370,6 +370,9 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS,
                      const T &RHS) {
   // Fast path - add the numbers with fixed width.
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!OpFW(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -408,6 +411,7 @@ bool Add(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
+
   return AddSubMulHelper<T, T::add, std::plus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -423,7 +427,7 @@ inline bool Addf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::add(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -434,6 +438,7 @@ bool Sub(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
+
   return AddSubMulHelper<T, T::sub, std::minus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -442,7 +447,7 @@ inline bool Subf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::sub(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -453,6 +458,7 @@ bool Mul(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() * 2;
+
   return AddSubMulHelper<T, T::mul, std::multiplies>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -461,8 +467,10 @@ inline bool Mulf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
+
   auto Status = Floating::mul(LHS, RHS, getRoundingMode(FPO), &Result);
+
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -484,9 +492,14 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexMul(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    Floating RA = S.allocFloat(A.getSemantics());
+    RA.copy(ResR);
+    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
     Result.atIndex(0).initialize();
-    Result.atIndex(1).deref<Floating>() = Floating(ResI);
+
+    Floating RI = S.allocFloat(A.getSemantics());
+    RI.copy(ResI);
+    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
     Result.atIndex(1).initialize();
     Result.initialize();
   } else {
@@ -539,10 +552,20 @@ inline bool Divc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexDiv(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    // Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    // Result.atIndex(0).initialize();
+    // Result.atIndex(1).deref<Floating>() = Floating(ResI);
+    // Result.atIndex(1).initialize();
+
+    Floating RA = S.allocFloat(A.getSemantics());
+    RA.copy(ResR);
+    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
     Result.atIndex(0).initialize();
-    Result.atIndex(1).deref<Floating>() = Floating(ResI);
-    Result.atIndex(1).initialize();
+
+    Floating RI = S.allocFloat(A.getSemantics());
+    RI.copy(ResI);
+    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
+
     Result.initialize();
   } else {
     // Integer element type.
@@ -608,9 +631,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitAnd(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitAnd(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -625,9 +651,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitOr(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitOr(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -644,7 +673,11 @@ bool BitXor(InterpState &S, CodePtr OpPC) {
   const T &LHS = S.Stk.pop<T>();
 
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitXor(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -659,12 +692,15 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Rem(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
-  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!T::rem(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -679,12 +715,15 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Div(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
-  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!T::div(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -707,8 +746,10 @@ inline bool Divf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
     return false;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::div(LHS, RHS, getRoundingMode(FPO), &Result);
+
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -730,31 +771,44 @@ inline bool Inv(InterpState &S, CodePtr OpPC) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Neg(InterpState &S, CodePtr OpPC) {
   const T &Value = S.Stk.pop<T>();
-  T Result;
 
-  if (!T::neg(Value, &Result)) {
+  if constexpr (std::is_same_v<T, Floating>) {
+    T Result = S.allocFloat(Value.getSemantics());
+
+    if (!T::neg(Value, &Result)) {
+      S.Stk.push<T>(Result);
+      return true;
+    }
+    return false;
+  } else {
+    T Result;
+    if constexpr (needsAlloc<T>())
+      Result = S.allocAP<T>(Value.bitWidth());
+
+    if (!T::neg(Value, &Result)) {
+      S.Stk.push<T>(Result);
+      return true;
+    }
+
+    assert(isIntegralType(Name) &&
+           "don't expect other types to fail at constexpr negation");
     S.Stk.push<T>(Result);
-    return true;
+
+    APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
+    if (S.checkingForUndefinedBehavior()) {
+      const Expr *E = S.Current->getExpr(OpPC);
+      QualType Type = E->getType();
+      SmallString<32> Trunc;
+      NegatedValue.trunc(Result.bitWidth())
+          .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
+                    /*UpperCase=*/true, /*InsertSeparators=*/true);
+      S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
+          << Trunc << Type << E->getSourceRange();
+      return true;
+    }
+
+    return handleOverflow(S, OpPC, NegatedValue);
   }
-
-  assert(isIntegralType(Name) &&
-         "don't expect other types to fail at constexpr negation");
-  S.Stk.push<T>(Result);
-
-  APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
-  if (S.checkingForUndefinedBehavior()) {
-    const Expr *E = S.Current->getExpr(OpPC);
-    QualType Type = E->getType();
-    SmallString<32> Trunc;
-    NegatedValue.trunc(Result.bitWidth())
-        .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
-                  /*UpperCase=*/true, /*InsertSeparators=*/true);
-    S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
-        << Trunc << Type << E->getSourceRange();
-    return true;
-  }
-
-  return handleOverflow(S, OpPC, NegatedValue);
 }
 
 enum class PushVal : bool {
@@ -783,6 +837,8 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 
   const T &Value = Ptr.deref<T>();
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Value.bitWidth());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<T>(Value);
@@ -890,7 +946,6 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
   if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
     return false;
-
   return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow);
 }
 
@@ -898,7 +953,7 @@ template <IncDecOp Op, PushVal DoPush>
 bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                        uint32_t FPOI) {
   Floating Value = Ptr.deref<Floating>();
-  Floating Result;
+  Floating Result = S.allocFloat(Value.getSemantics());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<Floating>(Value);
@@ -952,12 +1007,15 @@ inline bool DecfPop(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Comp(InterpState &S, CodePtr OpPC) {
   const T &Val = S.Stk.pop<T>();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Val.bitWidth());
+
   if (!T::comp(Val, &Result)) {
     S.Stk.push<T>(Result);
     return true;
   }
-
   return false;
 }
 
@@ -1325,10 +1383,23 @@ bool Flip(InterpState &S, CodePtr OpPC) {
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Const(InterpState &S, CodePtr OpPC, const T &Arg) {
+  if constexpr (needsAlloc<T>()) {
+    T Result = S.allocAP<T>(Arg.bitWidth());
+    Result.copy(Arg.toAPSInt());
+    S.Stk.push<T>(Result);
+    return true;
+  }
   S.Stk.push<T>(Arg);
   return true;
 }
 
+inline bool ConstFloat(InterpState &S, CodePtr OpPC, const Floating &F) {
+  Floating Result = S.allocFloat(F.getSemantics());
+  Result.copy(F.getAPFloat());
+  S.Stk.push<Floating>(Result);
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Get/Set Local/Param/Global/This
 //===----------------------------------------------------------------------===//
@@ -1483,7 +1554,24 @@ bool SetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool InitGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
   const Pointer &P = S.P.getGlobal(I);
+
   P.deref<T>() = S.Stk.pop<T>();
+
+  if constexpr (std::is_same_v<T, Floating>) {
+    auto &Val = P.deref<Floating>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+
+  } else if constexpr (needsAlloc<T>()) {
+    auto &Val = P.deref<T>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  }
+
   P.initialize();
   return true;
 }
@@ -1585,7 +1673,22 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) {
   assert(F->isBitField());
   const T &Value = S.Stk.pop<T>();
   const Pointer &Field = S.Stk.peek<Pointer>().atField(F->Offset);
-  Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
+
+  if constexpr (needsAlloc<T>()) {
+    T Result = S.allocAP<T>(Value.bitWidth());
+    if (T::isSigned())
+      Result.copy(Value.toAPSInt()
+                      .trunc(F->Decl->getBitWidthValue())
+                      .sextOrTrunc(Value.bitWidth()));
+    else
+      Result.copy(Value.toAPSInt()
+                      .trunc(F->Decl->getBitWidthValue())
+                      .zextOrTrunc(Value.bitWidth()));
+
+    Field.deref<T>() = Result;
+  } else {
+    Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
+  }
   Field.activate();
   Field.initialize();
   return true;
@@ -1765,6 +1868,8 @@ inline bool FinishInit(InterpState &S, CodePtr OpPC) {
   return true;
 }
 
+bool FinishInitGlobal(InterpState &S, CodePtr OpPC);
+
 inline bool Dump(InterpState &S, CodePtr OpPC) {
   S.Stk.dump();
   return true;
@@ -2271,7 +2376,8 @@ template <PrimType TIn, PrimType TOut> bool Cast(InterpState &S, CodePtr OpPC) {
 inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem,
                    llvm::RoundingMode RM) {
   Floating F = S.Stk.pop<Floating>();
-  Floating Result = F.toSemantics(Sem, RM);
+  Floating Result = S.allocFloat(*Sem);
+  F.toSemantics(Sem, RM, &Result);
   S.Stk.push<Floating>(Result);
   return true;
 }
@@ -2295,15 +2401,25 @@ inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) {
 /// to know what bitwidth the result should be.
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<false>>(
-      IntegralAP<false>::from(S.Stk.pop<T>(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  // Copy data.
+  {
+    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
+    Result.copy(Source);
+  }
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<true>>(
-      IntegralAP<true>::from(S.Stk.pop<T>(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  // Copy data.
+  {
+    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
+    Result.copy(Source);
+  }
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2312,11 +2428,11 @@ bool CastIntegralFloating(InterpState &S, CodePtr OpPC,
                           const llvm::fltSemantics *Sem, uint32_t FPOI) {
   const T &From = S.Stk.pop<T>();
   APSInt FromAP = From.toAPSInt();
-  Floating Result;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
+  Floating Result = S.allocFloat(*Sem);
   auto Status =
-      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), Result);
+      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
 
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -2365,7 +2481,12 @@ static inline bool CastFloatingIntegralAP(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
+
+  auto ResultAP = S.allocAP<IntegralAP<false>>(BitWidth);
+  ResultAP.copy(Result);
+
+  S.Stk.push<IntegralAP<false>>(ResultAP);
+
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2381,7 +2502,12 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
+
+  auto ResultAP = S.allocAP<IntegralAP<true>>(BitWidth);
+  ResultAP.copy(Result);
+
+  S.Stk.push<IntegralAP<true>>(ResultAP);
+
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2441,8 +2567,9 @@ static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC,
 static inline bool CastFixedPointFloating(InterpState &S, CodePtr OpPC,
                                           const llvm::fltSemantics *Sem) {
   const auto &Fixed = S.Stk.pop<FixedPoint>();
-
-  S.Stk.push<Floating>(Fixed.toFloat(Sem));
+  Floating Result = S.allocFloat(*Sem);
+  Result.copy(Fixed.toFloat(Sem));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -2506,12 +2633,18 @@ bool Zero(InterpState &S, CodePtr OpPC) {
 }
 
 static inline bool ZeroIntAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<false>>(IntegralAP<false>::zero(BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  if (!Result.singleWord())
+    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
 static inline bool ZeroIntAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>::zero(BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  if (!Result.singleWord())
+    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2578,7 +2711,9 @@ inline bool RVOPtr(InterpState &S, CodePtr OpPC) {
 //===----------------------------------------------------------------------===//
 
 template <class LT, class RT, ShiftDir Dir>
-inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
+inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
+                    LT *Result) {
+
   const unsigned Bits = LHS.bitWidth();
 
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
@@ -2596,7 +2731,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
     RHS = -RHS;
     return DoShift<LT, RT,
                    Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS);
+        S, OpPC, LHS, RHS, Result);
   }
 
   if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
@@ -2644,6 +2779,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
       // Do the shift on potentially signed LT, then convert to unsigned type.
       LT A;
       LT::shiftRight(LHS, LT::from(RHS, Bits), Bits, &A);
+      // LT::shiftRight(LHS, LT(RHSTemp), Bits, &A);
       R = LT::AsUnsigned::from(A);
     }
   }
@@ -2652,6 +2788,48 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
   return true;
 }
 
+/// A version of DoShift that works on IntegralAP.
+template <class LT, class RT, ShiftDir Dir>
+inline bool DoShiftAP(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
+                      LT *Result) {
+  const unsigned Bits = LHS.bitWidth();
+  const APSInt &LHSAP = LHS.toAPSInt();
+  APSInt RHSAP = RHS.toAPSInt();
+
+  // OpenCL 6.3j: shift values are effectively % word size of LHS.
+  if (S.getLangOpts().OpenCL)
+    RHSAP &= APSInt(llvm::APInt(RHSAP.getBitWidth(),
+                                static_cast<uint64_t>(LHSAP.getBitWidth() - 1)),
+                    RHSAP.isUnsigned());
+
+  if (RHS.isNegative()) {
+    // During constant-folding, a negative shift is an opposite shift. Such a
+    // shift is not a constant expression.
+    const SourceInfo &Loc = S.Current->getSource(OpPC);
+    S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
+    if (!S.noteUndefinedBehavior())
+      return false;
+    RHS = -RHS;
+    return DoShiftAP<LT, RT,
+                     Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
+        S, OpPC, LHS, RHS, Result);
+  }
+
+  if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
+    return false;
+
+  if constexpr (Dir == ShiftDir::Left) {
+    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
+    Result->copy(LHSAP << SA);
+  } else {
+    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
+    Result->copy(LHSAP >> SA);
+  }
+
+  S.Stk.push<LT>(*Result);
+  return true;
+}
+
 template <PrimType NameL, PrimType NameR>
 inline bool Shr(InterpState &S, CodePtr OpPC) {
   using LT = typename PrimConv<NameL>::T;
@@ -2659,7 +2837,13 @@ inline bool Shr(InterpState &S, CodePtr OpPC) {
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
 
-  return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS);
+  if constexpr (needsAlloc<LT>()) {
+    LT Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
+  } else {
+    LT Result;
+    return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
+  }
 }
 
 template <PrimType NameL, PrimType NameR>
@@ -2668,8 +2852,13 @@ inline bool Shl(InterpState &S, CodePtr OpPC) {
   using RT = typename PrimConv<NameR>::T;
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
-
-  return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS);
+  if constexpr (needsAlloc<LT>()) {
+    LT Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
+  } else {
+    LT Result;
+    return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
+  }
 }
 
 static inline bool ShiftFixedPoint(InterpState &S, CodePtr OpPC, bool Left) {
@@ -3252,7 +3441,15 @@ inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte,
 
     if constexpr (std::is_same_v<T, Floating>) {
       assert(Sem);
-      S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
+      Floating Result = S.allocFloat(*Sem);
+      Floating::bitcastFromMemory(Buff.data(), *Sem, &Result);
+      S.Stk.push<Floating>(Result);
+
+      // S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
+    } else if constexpr (needsAlloc<T>()) {
+      T Result = S.allocAP<T>(ResultBitWidth);
+      T::bitcastFromMemory(Buff.data(), ResultBitWidth, &Result);
+      S.Stk.push<T>(Result);
     } else {
       assert(!Sem);
       S.Stk.push<T>(T::bitcastFromMemory(Buff.data(), ResultBitWidth));
@@ -3310,7 +3507,11 @@ template <typename T> inline T ReadArg(InterpState &S, CodePtr &OpPC) {
 }
 
 template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
-  Floating F = Floating::deserialize(*OpPC);
+  auto &Semantics =
+      llvm::APFloatBase::EnumToSemantics(Floating::deserializeSemantics(*OpPC));
+
+  auto F = S.allocFloat(Semantics);
+  Floating::deserialize(*OpPC, &F);
   OpPC += align(F.bytesToSerialize());
   return F;
 }
@@ -3318,17 +3519,25 @@ template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
 template <>
 inline IntegralAP<false> ReadArg<IntegralAP<false>>(InterpState &S,
                                                     CodePtr &OpPC) {
-  IntegralAP<false> I = IntegralAP<false>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+  uint32_t BitWidth = IntegralAP<false>::deserializeSize(*OpPC);
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  assert(Result.bitWidth() == BitWidth);
+
+  IntegralAP<false>::deserialize(*OpPC, &Result);
+  OpPC += align(Result.bytesToSerialize());
+  return Result;
 }
 
 template <>
 inline IntegralAP<true> ReadArg<IntegralAP<true>>(InterpState &S,
                                                   CodePtr &OpPC) {
-  IntegralAP<true> I = IntegralAP<true>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+  uint32_t BitWidth = IntegralAP<true>::deserializeSize(*OpPC);
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  assert(Result.bitWidth() == BitWidth);
+
+  IntegralAP<true>::deserialize(*OpPC, &Result);
+  OpPC += align(Result.bytesToSerialize());
+  return Result;
 }
 
 template <>
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d01e3d042a8b..5304bd77f2c0 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -57,6 +57,21 @@ static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) {
   assert(T);
 
   unsigned BitWidth = S.getASTContext().getTypeSize(QT);
+
+  if (T == PT_IntAPS) {
+    auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+    Result.copy(Val);
+    S.Stk.push<IntegralAP<true>>(Result);
+    return;
+  }
+
+  if (T == PT_IntAP) {
+    auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+    Result.copy(Val);
+    S.Stk.push<IntegralAP<false>>(Result);
+    return;
+  }
+
   if (QT->isSignedIntegerOrEnumerationType()) {
     int64_t V = Val.getSExtValue();
     INT_TYPE_SWITCH(*T, { S.Stk.push<T>(T::from(V, BitWidth)); });
@@ -327,13 +342,13 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result;
+  Floating Result = S.allocFloat(TargetSemantics);
   if (S.getASTContext().getTargetInfo().isNan2008()) {
     if (Signaling)
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
   } else {
     // Prior to IEEE 754-2008, architectures were allowed to choose whether
@@ -342,10 +357,10 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
     // 2008 revisions, MIPS interpreted sNaN-2008 as qNan and qNaN-2008 as
     // sNaN. This is now known as "legacy NaN" encoding.
     if (Signaling)
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
   }
 
@@ -360,7 +375,9 @@ static bool interp__builtin_inf(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  S.Stk.push<Floating>(Floating::getInf(TargetSemantics));
+  Floating Result = S.allocFloat(TargetSemantics);
+  Result.copy(APFloat::getInf(TargetSemantics));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -368,10 +385,12 @@ static bool interp__builtin_copysign(InterpState &S, CodePtr OpPC,
                                      const InterpFrame *Frame) {
   const Floating &Arg2 = S.Stk.pop<Floating>();
   const Floating &Arg1 = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(Arg1.getSemantics());
 
   APFloat Copy = Arg1.getAPFloat();
   Copy.copySign(Arg2.getAPFloat());
-  S.Stk.push<Floating>(Floating(Copy));
+  Result.copy(Copy);
+  S.Stk.push<Floating>(Result);
 
   return true;
 }
@@ -380,11 +399,13 @@ static bool interp__builtin_fmin(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    S.Stk.push<Floating>(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    S.Stk.push<Floating>(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -392,11 +413,13 @@ static bool interp__builtin_fmax(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    S.Stk.push<Floating>(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    S.Stk.push<Floating>(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -571,8 +594,16 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC,
 static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame) {
   const Floating &Val = S.Stk.pop<Floating>();
+  APFloat F = Val.getAPFloat();
+  if (!F.isNegative()) {
+    S.Stk.push<Floating>(Val);
+    return true;
+  }
 
-  S.Stk.push<Floating>(Floating::abs(Val));
+  Floating Result = S.allocFloat(Val.getSemantics());
+  F.changeSign();
+  Result.copy(F);
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index 239b3104e89f..2569cac018b3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -402,7 +402,9 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC,
           if (llvm::sys::IsBigEndianHost)
             swapBytes(M.get(), NumBits.roundToBytes());
 
-          P.deref<Floating>() = Floating::bitcastFromMemory(M.get(), Semantics);
+          Floating R = S.allocFloat(Semantics);
+          Floating::bitcastFromMemory(M.get(), Semantics, &R);
+          P.deref<Floating>() = R;
           P.initialize();
           return true;
         }
diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h
index e8dc6f0483d6..08765561985e 100644
--- a/clang/lib/AST/ByteCode/InterpState.h
+++ b/clang/lib/AST/ByteCode/InterpState.h
@@ -15,6 +15,7 @@
 
 #include "Context.h"
 #include "DynamicAllocator.h"
+#include "Floating.h"
 #include "Function.h"
 #include "InterpFrame.h"
 #include "InterpStack.h"
@@ -126,6 +127,33 @@ public:
 
   StdAllocatorCaller getStdAllocatorCaller(StringRef Name) const;
 
+  void *allocate(size_t Size, unsigned Align = 8) const {
+    return Allocator.Allocate(Size, Align);
+  }
+  template <typename T> T *allocate(size_t Num = 1) const {
+    return static_cast<T *>(allocate(Num * sizeof(T), alignof(T)));
+  }
+
+  template <typename T> T allocAP(unsigned BitWidth) {
+    unsigned NumWords = APInt::getNumWords(BitWidth);
+    if (NumWords == 1)
+      return T(BitWidth);
+    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
+    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
+    return T(Mem, BitWidth);
+  }
+
+  Floating allocFloat(const llvm::fltSemantics &Sem) {
+    if (Floating::singleWord(Sem))
+      return Floating(llvm::APFloatBase::SemanticsToEnum(Sem));
+
+    unsigned NumWords =
+        APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem));
+    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
+    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
+    return Floating(Mem, llvm::APFloatBase::SemanticsToEnum(Sem));
+  }
+
 private:
   friend class EvaluationResult;
   friend class InterpStateCCOverride;
@@ -161,6 +189,8 @@ public:
   llvm::SmallVector<
       std::pair<const Expr *, const LifetimeExtendedTemporaryDecl *>>
       SeenGlobalTemporaries;
+
+  mutable llvm::BumpPtrAllocator Allocator;
 };
 
 class InterpStateCCOverride final {
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index c76ac5f8ae86..57e01f7bd9da 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -48,6 +48,7 @@ def ArgUint64 : ArgType { let Name = "uint64_t"; }
 def ArgIntAP : ArgType { let Name = "IntegralAP<false>"; let AsRef = true; }
 def ArgIntAPS : ArgType { let Name = "IntegralAP<true>"; let AsRef = true; }
 def ArgFloat : ArgType { let Name = "Floating"; let AsRef = true; }
+
 def ArgBool : ArgType { let Name = "bool"; }
 def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
 
@@ -88,6 +89,9 @@ def IntegerAndFixedTypeClass : TypeClass {
                Uint32, Sint64, Uint64, IntAP, IntAPS, FixedPoint];
 }
 
+def IntegralTypeClass : TypeClass {
+  let Types = !listconcat(IntegerTypeClass.Types, [Bool]);
+}
 def FixedSizeIntegralTypeClass : TypeClass {
   let Types = [Sint8, Uint8, Sint16, Uint16, Sint32,
                Uint32, Sint64, Uint64, Bool];
@@ -265,12 +269,13 @@ def ConstSint32 : ConstOpcode<Sint32, ArgSint32>;
 def ConstUint32 : ConstOpcode<Uint32, ArgUint32>;
 def ConstSint64 : ConstOpcode<Sint64, ArgSint64>;
 def ConstUint64 : ConstOpcode<Uint64, ArgUint64>;
-def ConstFloat : ConstOpcode<Float, ArgFloat>;
-def constIntAP : ConstOpcode<IntAP, ArgIntAP>;
-def constIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
+def ConstIntAP : ConstOpcode<IntAP, ArgIntAP>;
+def ConstIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
 def ConstBool : ConstOpcode<Bool, ArgBool>;
 def ConstFixedPoint : ConstOpcode<FixedPoint, ArgFixedPoint>;
 
+def ConstFloat : Opcode { let Args = [ArgFloat]; }
+
 // [] -> [Integer]
 def Zero : Opcode {
   let Types = [FixedSizeIntegralTypeClass];
@@ -328,6 +333,7 @@ def GetMemberPtrBasePop : Opcode {
 
 def FinishInitPop : Opcode;
 def FinishInit    : Opcode;
+def FinishInitGlobal : Opcode;
 
 def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool, ArgTypePtr]; }
 
@@ -389,7 +395,7 @@ class AccessOpcode : Opcode {
 }
 
 class BitFieldOpcode : Opcode {
-  let Types = [AluTypeClass];
+  let Types = [IntegralTypeClass];
   let Args = [ArgRecordField];
   let HasGroup = 1;
 }
diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
index 6152fbfbe3a7..a156cccbb3c1 100644
--- a/clang/lib/AST/ByteCode/PrimType.h
+++ b/clang/lib/AST/ByteCode/PrimType.h
@@ -76,6 +76,13 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
 }
 
 constexpr bool isIntegralType(PrimType T) { return T <= PT_FixedPoint; }
+template <typename T> constexpr bool needsAlloc() {
+  return std::is_same_v<T, IntegralAP<false>> ||
+         std::is_same_v<T, IntegralAP<true>> || std::is_same_v<T, Floating>;
+}
+constexpr bool needsAlloc(PrimType T) {
+  return T == PT_IntAP || T == PT_IntAPS || T == PT_Float;
+}
 
 /// Mapping from primitive types to their representation.
 template <PrimType T> struct PrimConv;
@@ -209,6 +216,16 @@ static inline bool aligned(const void *P) {
     }                                                                          \
   } while (0)
 
+#define TYPE_SWITCH_ALLOC(Expr, B)                                             \
+  do {                                                                         \
+    switch (Expr) {                                                            \
+      TYPE_SWITCH_CASE(PT_Float, B)                                            \
+      TYPE_SWITCH_CASE(PT_IntAP, B)                                            \
+      TYPE_SWITCH_CASE(PT_IntAPS, B)                                           \
+    default:;                                                                  \
+    }                                                                          \
+  } while (0)
+
 #define COMPOSITE_TYPE_SWITCH(Expr, B, D)                                      \
   do {                                                                         \
     switch (Expr) {                                                            \
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
index 23ba1bbd193b..5d9c42244749 100644
--- a/clang/lib/AST/ByteCode/Program.h
+++ b/clang/lib/AST/ByteCode/Program.h
@@ -132,6 +132,14 @@ public:
                                bool IsMutable = false, bool IsVolatile = false,
                                const Expr *Init = nullptr);
 
+  void *Allocate(size_t Size, unsigned Align = 8) const {
+    return Allocator.Allocate(Size, Align);
+  }
+  template <typename T> T *Allocate(size_t Num = 1) const {
+    return static_cast<T *>(Allocate(Num * sizeof(T), alignof(T)));
+  }
+  void Deallocate(void *Ptr) const {}
+
   /// Context to manage declaration lifetimes.
   class DeclScope {
   public:
@@ -204,7 +212,7 @@ private:
   };
 
   /// Allocator for globals.
-  PoolAllocTy Allocator;
+  mutable PoolAllocTy Allocator;
 
   /// Global objects.
   std::vector<Global *> Globals;
@@ -238,4 +246,18 @@ public:
 } // namespace interp
 } // namespace clang
 
+inline void *operator new(size_t Bytes, const clang::interp::Program &C,
+                          size_t Alignment = 8) {
+  return C.Allocate(Bytes, Alignment);
+}
+
+inline void operator delete(void *Ptr, const clang::interp::Program &C,
+                            size_t) {
+  C.Deallocate(Ptr);
+}
+inline void *operator new[](size_t Bytes, const clang::interp::Program &C,
+                            size_t Alignment = 8) {
+  return C.Allocate(Bytes, Alignment);
+}
+
 #endif
diff --git a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
index 710612bef8fd..1013a771d13b 100644
--- a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
+++ b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
@@ -21,6 +21,9 @@ template <class To, class From>
 constexpr To bit_cast(const From &from) {
   static_assert(sizeof(To) == sizeof(From));
   return __builtin_bit_cast(To, from);
+#if __x86_64
+  // both-note@-2 {{indeterminate value can only initialize an object of type}}
+#endif
 }
 
 template <class Intermediate, class Init>
@@ -38,11 +41,8 @@ constexpr Init round_trip(const Init &init) {
 
 namespace test_long_double {
 #if __x86_64
-/// FIXME: We could enable this, but since it aborts, it causes the usual mempory leak.
-#if 0
-constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // expected-error{{must be initialized by a constant expression}}\
-                                                                                 // expected-note{{in call}}
-#endif
+constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // both-error{{must be initialized by a constant expression}}\
+                                                                                 // both-note{{in call}}
 constexpr long double ld = 3.1425926539;
 
 struct bytes {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 21dca15a4577..174c1ffa79a4 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -208,7 +208,7 @@ namespace nan {
 
   constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}}
   constexpr float NaN4 = __builtin_nanf("");
-  //constexpr long double NaN5 = __builtin_nanf128("");
+  constexpr long double NaN5 = __builtin_nanf128("");
 
   /// FIXME: This should be accepted by the current interpreter as well.
   constexpr char f[] = {'0', 'x', 'A', 'E', '\0'};
@@ -655,8 +655,6 @@ void test_noexcept(int *i) {
 } // end namespace test_launder
 
 
-/// FIXME: The commented out tests here use a IntAP value and fail.
-/// This currently means we will leak the IntAP value since nothing cleans it up.
 namespace clz {
   char clz1[__builtin_clz(1) == BITSIZE(int) - 1 ? 1 : -1];
   char clz2[__builtin_clz(7) == BITSIZE(int) - 3 ? 1 : -1];
@@ -709,7 +707,7 @@ namespace clz {
   char clz48[__builtin_clzg(1ULL << (BITSIZE(long long) - 1)) == 0 ? 1 : -1];
   char clz49[__builtin_clzg(1ULL << (BITSIZE(long long) - 1), 42) == 0 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  // int clz50 = __builtin_clzg((unsigned __int128)0);
+  int clz50 = __builtin_clzg((unsigned __int128)0);
   char clz51[__builtin_clzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char clz52[__builtin_clzg((unsigned __int128)0x1) == BITSIZE(__int128) - 1 ? 1 : -1];
   char clz53[__builtin_clzg((unsigned __int128)0x1, 42) == BITSIZE(__int128) - 1 ? 1 : -1];
@@ -717,7 +715,7 @@ namespace clz {
   char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  // int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
+  int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
   char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
   char clz61[__builtin_clzg((unsigned _BitInt(128))0x1, 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
@@ -775,7 +773,7 @@ namespace ctz {
   char ctz46[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1)) == BITSIZE(long long) - 1 ? 1 : -1];
   char ctz47[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1), 42) == BITSIZE(long long) - 1 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  // int ctz48 = __builtin_ctzg((unsigned __int128)0);
+  int ctz48 = __builtin_ctzg((unsigned __int128)0);
   char ctz49[__builtin_ctzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char ctz50[__builtin_ctzg((unsigned __int128)0x1) == 0 ? 1 : -1];
   char ctz51[__builtin_ctzg((unsigned __int128)0x1, 42) == 0 ? 1 : -1];
@@ -785,7 +783,7 @@ namespace ctz {
   char ctz55[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1), 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  // int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
+  int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
   char ctz57[__builtin_ctzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char ctz58[__builtin_ctzg((unsigned _BitInt(128))0x1) == 0 ? 1 : -1];
   char ctz59[__builtin_ctzg((unsigned _BitInt(128))0x1, 42) == 0 ? 1 : -1];

From 9ec75a50bc48c84c68430f113332769d23481ef5 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Tue, 17 Jun 2025 09:36:52 -0700
Subject: [PATCH 0668/1322] MIPS: Replace MipsMCExpr with MCSpecifierExpr

---
 .../Target/Mips/AsmParser/MipsAsmParser.cpp   | 58 ++++++++++---------
 .../Target/Mips/MCTargetDesc/CMakeLists.txt   |  1 -
 .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp |  1 +
 .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp       |  8 +++
 .../Target/Mips/MCTargetDesc/MipsMCAsmInfo.h  |  3 +-
 .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp   |  2 +-
 .../Target/Mips/MCTargetDesc/MipsMCExpr.cpp   | 39 -------------
 .../lib/Target/Mips/MCTargetDesc/MipsMCExpr.h | 36 ------------
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp  | 21 +++----
 llvm/lib/Target/Mips/MipsAsmPrinter.cpp       |  2 +-
 llvm/lib/Target/Mips/MipsMCInstLower.cpp      | 10 ++--
 llvm/lib/Target/Mips/MipsTargetObjectFile.cpp |  2 +-
 12 files changed, 58 insertions(+), 125 deletions(-)
 delete mode 100644 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
 delete mode 100644 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h

diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 7ea7c58f1a51..071c016b92e7 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -2965,9 +2965,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
         Res.getConstant() == 0 && !IsLocalSym) {
       if (UseXGOT) {
         const MCExpr *CallHiExpr =
-            MipsMCExpr::create(Mips::S_CALL_HI16, SymExpr, getContext());
+            MCSpecifierExpr::create(SymExpr, Mips::S_CALL_HI16, getContext());
         const MCExpr *CallLoExpr =
-            MipsMCExpr::create(Mips::S_CALL_LO16, SymExpr, getContext());
+            MCSpecifierExpr::create(SymExpr, Mips::S_CALL_LO16, getContext());
         TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(CallHiExpr), IDLoc,
                     STI);
         TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, DstReg, GPReg,
@@ -2976,7 +2976,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
                      MCOperand::createExpr(CallLoExpr), IDLoc, STI);
       } else {
         const MCExpr *CallExpr =
-            MipsMCExpr::create(Mips::S_GOT_CALL, SymExpr, getContext());
+            MCSpecifierExpr::create(SymExpr, Mips::S_GOT_CALL, getContext());
         TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, GPReg,
                      MCOperand::createExpr(CallExpr), IDLoc, STI);
       }
@@ -3009,9 +3009,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // this happens then the last instruction must use $rd as the result
       // register.
       const MCExpr *CallHiExpr =
-          MipsMCExpr::create(Mips::S_GOT_HI16, SymExpr, getContext());
-      const MCExpr *CallLoExpr =
-          MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_LO16, getContext());
+          MCSpecifierExpr::create(SymExpr, Mips::S_GOT_HI16, getContext());
+      const MCExpr *CallLoExpr = MCSpecifierExpr::create(
+          Res.getAddSym(), Mips::S_GOT_LO16, getContext());
 
       TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(CallHiExpr), IDLoc,
                   STI);
@@ -3042,8 +3042,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // The daddiu's marked with a '>' may be omitted if they are redundant. If
       // this happens then the last instruction must use $rd as the result
       // register.
-      GotExpr =
-          MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_DISP, getContext());
+      GotExpr = MCSpecifierExpr::create(Res.getAddSym(), Mips::S_GOT_DISP,
+                                        getContext());
       if (Res.getConstant() != 0) {
         // Symbols fully resolve with just the %got_disp(symbol) but we
         // must still account for any offset to the symbol for
@@ -3070,14 +3070,14 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // this happens then the last instruction must use $rd as the result
       // register.
       if (IsLocalSym) {
-        GotExpr = MipsMCExpr::create(Mips::S_GOT, SymExpr, getContext());
-        LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
+        GotExpr = MCSpecifierExpr::create(SymExpr, Mips::S_GOT, getContext());
+        LoExpr = MCSpecifierExpr::create(SymExpr, Mips::S_LO, getContext());
       } else {
         // External symbols fully resolve the symbol with just the %got(symbol)
         // but we must still account for any offset to the symbol for
         // expressions like symbol+8.
         GotExpr =
-            MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT, getContext());
+            MCSpecifierExpr::create(Res.getAddSym(), Mips::S_GOT, getContext());
         if (Res.getConstant() != 0)
           LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
       }
@@ -3097,8 +3097,10 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     return false;
   }
 
-  const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, SymExpr, getContext());
-  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
+  const auto *HiExpr =
+      MCSpecifierExpr::create(SymExpr, Mips::S_HI, getContext());
+  const auto *LoExpr =
+      MCSpecifierExpr::create(SymExpr, Mips::S_LO, getContext());
 
   // This is the 64-bit symbol address expansion.
   if (ABI.ArePtrs64bit() && isGP64bit()) {
@@ -3110,9 +3112,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     // source register.
 
     const auto *HighestExpr =
-        MipsMCExpr::create(Mips::S_HIGHEST, SymExpr, getContext());
+        MCSpecifierExpr::create(SymExpr, Mips::S_HIGHEST, getContext());
     const auto *HigherExpr =
-        MipsMCExpr::create(Mips::S_HIGHER, SymExpr, getContext());
+        MCSpecifierExpr::create(SymExpr, Mips::S_HIGHER, getContext());
 
     bool RdRegIsRsReg =
         UseSrcReg &&
@@ -3310,7 +3312,8 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
 
   if(IsPicEnabled) {
     const MCExpr *GotSym = MCSymbolRefExpr::create(Sym, getContext());
-    const auto *GotExpr = MipsMCExpr::create(Mips::S_GOT, GotSym, getContext());
+    const auto *GotExpr =
+        MCSpecifierExpr::create(GotSym, Mips::S_GOT, getContext());
 
     if(isABI_O32() || isABI_N32()) {
       TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr),
@@ -3321,7 +3324,8 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
     }
   } else { //!IsPicEnabled
     const MCExpr *HiSym = MCSymbolRefExpr::create(Sym, getContext());
-    const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, HiSym, getContext());
+    const auto *HiExpr =
+        MCSpecifierExpr::create(HiSym, Mips::S_HI, getContext());
 
     // FIXME: This is technically correct but gives a different result to gas,
     // but gas is incomplete there (it has a fixme noting it doesn't work with
@@ -3334,10 +3338,10 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
     } else { //isABI_N64()
       const MCExpr *HighestSym = MCSymbolRefExpr::create(Sym, getContext());
       const auto *HighestExpr =
-          MipsMCExpr::create(Mips::S_HIGHEST, HighestSym, getContext());
+          MCSpecifierExpr::create(HighestSym, Mips::S_HIGHEST, getContext());
       const MCExpr *HigherSym = MCSymbolRefExpr::create(Sym, getContext());
       const auto *HigherExpr =
-          MipsMCExpr::create(Mips::S_HIGHER, HigherSym, getContext());
+          MCSpecifierExpr::create(HigherSym, Mips::S_HIGHER, getContext());
 
       TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
                   STI);
@@ -3424,7 +3428,7 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MCSpecifierExpr::create(LoSym, Mips::S_LO, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3474,7 +3478,7 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MCSpecifierExpr::create(LoSym, Mips::S_LO, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3554,7 +3558,7 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MCSpecifierExpr::create(LoSym, Mips::S_LO, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3777,15 +3781,15 @@ void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
       //                  sw  $8,  %lo(sym)($at)
       const MCExpr *OffExpr = OffsetOp.getExpr();
       MCOperand LoOperand = MCOperand::createExpr(
-          MipsMCExpr::create(Mips::S_LO, OffExpr, getContext()));
+          MCSpecifierExpr::create(OffExpr, Mips::S_LO, getContext()));
       MCOperand HiOperand = MCOperand::createExpr(
-          MipsMCExpr::create(Mips::S_HI, OffExpr, getContext()));
+          MCSpecifierExpr::create(OffExpr, Mips::S_HI, getContext()));
 
       if (ABI.IsN64()) {
         MCOperand HighestOperand = MCOperand::createExpr(
-            MipsMCExpr::create(Mips::S_HIGHEST, OffExpr, getContext()));
+            MCSpecifierExpr::create(OffExpr, Mips::S_HIGHEST, getContext()));
         MCOperand HigherOperand = MCOperand::createExpr(
-            MipsMCExpr::create(Mips::S_HIGHER, OffExpr, getContext()));
+            MCSpecifierExpr::create(OffExpr, Mips::S_HIGHER, getContext()));
 
         TOut.emitRX(Mips::LUi, TmpReg, HighestOperand, IDLoc, STI);
         TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, HigherOperand, IDLoc, STI);
@@ -6394,7 +6398,7 @@ const MCExpr *MipsAsmParser::parseRelocExpr() {
   while (Ops.size()) {
     if (Parser.parseToken(AsmToken::RParen, "expected ')'"))
       return nullptr;
-    Res = MipsMCExpr::create(Ops.pop_back_val(), Res, getContext());
+    Res = MCSpecifierExpr::create(Res, Ops.pop_back_val(), getContext());
   }
   return Res;
 }
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index d3f16e5042c3..8b73a7bdd4bc 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -7,7 +7,6 @@ add_llvm_component_library(LLVMMipsDesc
   MipsInstPrinter.cpp
   MipsMCAsmInfo.cpp
   MipsMCCodeEmitter.cpp
-  MipsMCExpr.cpp
   MipsMCTargetDesc.cpp
   MipsNaClELFStreamer.cpp
   MipsOptionRecord.cpp
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 58aa374e5302..25e31941bbb4 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -16,6 +16,7 @@
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index b64f86f38297..0941d93fe0eb 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "MipsMCAsmInfo.h"
 #include "MipsABIInfo.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/TargetParser/Triple.h"
 
@@ -59,6 +60,13 @@ MipsCOFFMCAsmInfo::MipsCOFFMCAsmInfo() {
   AllowAtInName = true;
 }
 
+const MCSpecifierExpr *Mips::createGpOff(const MCExpr *Expr, Mips::Specifier S,
+                                         MCContext &Ctx) {
+  Expr = MCSpecifierExpr::create(Expr, Mips::S_GPREL, Ctx);
+  Expr = MCSpecifierExpr::create(Expr, Mips::S_NEG, Ctx);
+  return MCSpecifierExpr::create(Expr, S, Ctx);
+}
+
 static void printImpl(const MCAsmInfo &MAI, raw_ostream &OS,
                       const MCSpecifierExpr &Expr) {
   int64_t AbsVal;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 0975116328fc..6ba90a5c2025 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
 #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
 
-#include "MCTargetDesc/MipsMCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoELF.h"
 #include "llvm/MC/MCFixup.h"
@@ -77,6 +76,8 @@ enum {
 };
 
 bool isGpOff(const MCSpecifierExpr &E);
+const MCSpecifierExpr *createGpOff(const MCExpr *Expr, Specifier S,
+                                   MCContext &Ctx);
 }
 
 } // namespace llvm
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index d2981c4ad4d2..35d4e0db35c3 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -581,7 +581,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
                const MCSubtargetInfo &STI) const {
   MCExpr::ExprKind Kind = Expr->getKind();
   if (Kind == MCExpr::Specifier) {
-    const MipsMCExpr *MipsExpr = cast<MipsMCExpr>(Expr);
+    const auto *MipsExpr = cast<MCSpecifierExpr>(Expr);
 
     Mips::Fixups FixupKind = Mips::Fixups(0);
     switch (MipsExpr->getSpecifier()) {
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
deleted file mode 100644
index 821f662f0cbf..000000000000
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===-- MipsMCExpr.cpp - Mips specific MC expression classes --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "MipsMCExpr.h"
-#include "MCTargetDesc/MipsMCAsmInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mipsmcexpr"
-
-const MipsMCExpr *MipsMCExpr::create(MipsMCExpr::Specifier S,
-                                     const MCExpr *Expr, MCContext &Ctx) {
-  return new (Ctx) MipsMCExpr(Expr, S);
-}
-
-const MipsMCExpr *MipsMCExpr::create(const MCSymbol *Sym, Specifier S,
-                                     MCContext &Ctx) {
-  return new (Ctx) MipsMCExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
-}
-
-const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S,
-                                          const MCExpr *Expr, MCContext &Ctx) {
-  return create(S, create(Mips::S_NEG, create(Mips::S_GPREL, Expr, Ctx), Ctx),
-                Ctx);
-}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
deleted file mode 100644
index b78aeabb5799..000000000000
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===- MipsMCExpr.h - Mips specific MC expression classes -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
-#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCValue.h"
-
-namespace llvm {
-
-class MipsMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = Spec;
-
-private:
-  explicit MipsMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const MipsMCExpr *create(Specifier S, const MCExpr *Expr,
-                                  MCContext &Ctx);
-  static const MipsMCExpr *create(const MCSymbol *Sym, Specifier S,
-                                  MCContext &Ctx);
-  static const MipsMCExpr *createGpOff(Specifier S, const MCExpr *Expr,
-                                       MCContext &Ctx);
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 80a854c79901..6097ad801784 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -16,7 +16,6 @@
 #include "MipsBaseInfo.h"
 #include "MipsELFStreamer.h"
 #include "MipsInstPrinter.h"
-#include "MipsMCExpr.h"
 #include "MipsMCTargetDesc.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -1266,9 +1265,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   MCInst TmpInst;
   TmpInst.setOpcode(Mips::LUi);
   TmpInst.addOperand(MCOperand::createReg(GPReg));
-  const MCExpr *HiSym = MipsMCExpr::create(
-      Mips::S_HI, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
-      MCA.getContext());
+  auto *HiSym = MCSpecifierExpr::create(GP_Disp, Mips::S_HI, MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(HiSym));
   getStreamer().emitInstruction(TmpInst, STI);
 
@@ -1277,9 +1274,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   TmpInst.setOpcode(Mips::ADDiu);
   TmpInst.addOperand(MCOperand::createReg(GPReg));
   TmpInst.addOperand(MCOperand::createReg(GPReg));
-  const MCExpr *LoSym = MipsMCExpr::create(
-      Mips::S_LO, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
-      MCA.getContext());
+  auto *LoSym = MCSpecifierExpr::create(GP_Disp, Mips::S_LO, MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(LoSym));
   getStreamer().emitInstruction(TmpInst, STI);
 
@@ -1342,12 +1337,12 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
     emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI);
   }
 
-  const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
-      Mips::S_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
-      MCA.getContext());
-  const MipsMCExpr *LoExpr = MipsMCExpr::createGpOff(
-      Mips::S_LO, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
-      MCA.getContext());
+  auto *HiExpr =
+      Mips::createGpOff(MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+                        Mips::S_HI, MCA.getContext());
+  auto *LoExpr =
+      Mips::createGpOff(MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+                        Mips::S_LO, MCA.getContext());
 
   // lui $gp, %hi(%neg(%gp_rel(funcSym)))
   emitRX(Mips::LUi, GPReg, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index da3f7cb55b30..a6300a9c11d4 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -1244,7 +1244,7 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
 // Emit .dtprelword or .dtpreldword directive
 // and value for debug thread local expression.
 void MipsAsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const {
-  if (auto *MipsExpr = dyn_cast<MipsMCExpr>(Value)) {
+  if (auto *MipsExpr = dyn_cast<MCSpecifierExpr>(Value)) {
     if (MipsExpr && MipsExpr->getSpecifier() == Mips::S_DTPREL) {
       switch (Size) {
       case 4:
diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
index 935fcd8fa715..cdf58384427f 100644
--- a/llvm/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -175,9 +175,9 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   }
 
   if (IsGpOff)
-    Expr = MipsMCExpr::createGpOff(TargetKind, Expr, *Ctx);
+    Expr = Mips::createGpOff(Expr, TargetKind, *Ctx);
   else if (TargetKind != Mips::S_None)
-    Expr = MipsMCExpr::create(TargetKind, Expr, *Ctx);
+    Expr = MCSpecifierExpr::create(Expr, TargetKind, *Ctx);
 
   return MCOperand::createExpr(Expr);
 }
@@ -216,7 +216,7 @@ MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
   const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::create(BB2->getSymbol(), *Ctx);
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Sym1, Sym2, *Ctx);
 
-  return MCOperand::createExpr(MipsMCExpr::create(Kind, Sub, *Ctx));
+  return MCOperand::createExpr(MCSpecifierExpr::create(Sub, Kind, *Ctx));
 }
 
 void MipsMCInstLower::
@@ -248,7 +248,7 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
   if (MI->getNumOperands() == 2) {
     const MCExpr *Expr =
         MCSymbolRefExpr::create(MI->getOperand(1).getMBB()->getSymbol(), *Ctx);
-    const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
+    const auto *MipsExpr = MCSpecifierExpr::create(Expr, Spec, *Ctx);
     OutMI.addOperand(MCOperand::createExpr(MipsExpr));
   } else if (MI->getNumOperands() == 3) {
     // Create %hi($tgt-$baltgt).
@@ -290,7 +290,7 @@ void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
     // Lower register operand.
     const MCExpr *Expr =
         MCSymbolRefExpr::create(MI->getOperand(2).getMBB()->getSymbol(), *Ctx);
-    const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
+    const auto *MipsExpr = MCSpecifierExpr::create(Expr, Spec, *Ctx);
     OutMI.addOperand(MCOperand::createExpr(MipsExpr));
   } else if (MI->getNumOperands() == 4) {
     // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
diff --git a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
index 23aa699318a2..78a9f3b7cc71 100644
--- a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -189,5 +189,5 @@ MipsTargetObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
   const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
   Expr = MCBinaryExpr::createAdd(
       Expr, MCConstantExpr::create(0x8000, getContext()), getContext());
-  return MipsMCExpr::create(Mips::S_DTPREL, Expr, getContext());
+  return MCSpecifierExpr::create(Expr, Mips::S_DTPREL, getContext());
 }

From 382e3fdbb476a5d5771b315daedcd05a15883fbc Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Tue, 17 Jun 2025 17:37:27 +0100
Subject: [PATCH 0669/1322] [lldb][Formatter] Get element type for
 unordered_maps from __hash_table::value_type (#144517)

https://github.com/llvm/llvm-project/pull/143501 changes usage of
`__hash_value_type` in libcxx to an empty tag type. This type will no
longer have a definition in DWARF. Currently the LLDB unordered_map
formatter deduces the map's `element_type` by looking at the `__cc_`
member of `__hash_value_type`. But that will no longer work because we
only have its forward declaration. Since what we're really after is the
type that `__hash_value_type` is wrapping, we can just look at the
`__hash_table::value_type` typedef. With
https://github.com/llvm/llvm-project/pull/143501 that will now point to
the `std::pair` element type (which used to be what we got from
`__cc_`).

TBD: need to double-check this works for older layouts. Quick glance at
the code makes me suspicious of cases like `unordered_map<std::pair<int,
int>, int>`
---
 .../Language/CPlusPlus/LibCxxUnorderedMap.cpp  | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
index 642723dd9113..ffc33395830b 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
@@ -99,14 +99,20 @@ static bool isUnorderedMap(ConstString type_name) {
 
 CompilerType lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
     GetElementType(CompilerType table_type) {
-  auto element_type = table_type.GetTypedefedType().GetTypeTemplateArgument(0);
+  auto element_type =
+      table_type.GetDirectNestedTypeWithName("value_type").GetTypedefedType();
+
+  // In newer unordered_map layouts, the std::pair element type isn't wrapped
+  // in any helper types. So return it directly.
+  if (isStdTemplate(element_type.GetTypeName(), "pair"))
+    return element_type;
 
   // This synthetic provider is used for both unordered_(multi)map and
-  // unordered_(multi)set. For unordered_map, the element type has an
-  // additional type layer, an internal struct (`__hash_value_type`)
-  // that wraps a std::pair. Peel away the internal wrapper type - whose
-  // structure is of no value to users, to expose the std::pair. This
-  // matches the structure returned by the std::map synthetic provider.
+  // unordered_(multi)set. For older unordered_map layouts, the element type has
+  // an additional type layer, an internal struct (`__hash_value_type`) that
+  // wraps a std::pair. Peel away the internal wrapper type - whose structure is
+  // of no value to users, to expose the std::pair. This matches the structure
+  // returned by the std::map synthetic provider.
   if (isUnorderedMap(
           m_backend.GetCompilerType().GetCanonicalType().GetTypeName())) {
     std::string name;

From 4e884dd993e040f7ccd83ecdc3c4570d23a42ee6 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Tue, 17 Jun 2025 09:42:53 -0700
Subject: [PATCH 0670/1322] SPARC: Remove SparcMCExpr.h

---
 .../Target/Sparc/AsmParser/SparcAsmParser.cpp | 11 ++++---
 .../MCTargetDesc/SparcELFObjectWriter.cpp     |  4 +--
 .../Sparc/MCTargetDesc/SparcMCAsmInfo.cpp     |  1 -
 .../Sparc/MCTargetDesc/SparcMCAsmInfo.h       |  5 +++
 .../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp |  8 ++---
 .../Target/Sparc/MCTargetDesc/SparcMCExpr.cpp |  2 +-
 .../Target/Sparc/MCTargetDesc/SparcMCExpr.h   | 32 -------------------
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp     |  6 ++--
 llvm/lib/Target/Sparc/SparcISelLowering.cpp   |  2 +-
 .../Target/Sparc/SparcTargetObjectFile.cpp    |  2 +-
 10 files changed, 23 insertions(+), 50 deletions(-)
 delete mode 100644 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h

diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 90aacacd8ed2..28ae34903166 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "MCTargetDesc/SparcMCTargetDesc.h"
 #include "TargetInfo/SparcTargetInfo.h"
 #include "llvm/ADT/SmallVector.h"
@@ -109,7 +109,8 @@ class SparcAsmParser : public MCTargetAsmParser {
   ParseStatus parseExpression(int64_t &Val);
 
   // Helper function for dealing with %lo / %hi in PIC mode.
-  const SparcMCExpr *adjustPICRelocation(uint16_t VK, const MCExpr *subExpr);
+  const MCSpecifierExpr *adjustPICRelocation(uint16_t VK,
+                                             const MCExpr *subExpr);
 
   // Helper function to see if current token can start an expression.
   bool isPossibleExpression(const AsmToken &Token);
@@ -1642,7 +1643,7 @@ MCRegister SparcAsmParser::matchRegisterName(const AsmToken &Tok,
 static bool hasGOTReference(const MCExpr *Expr) {
   switch (Expr->getKind()) {
   case MCExpr::Target:
-    if (const SparcMCExpr *SE = dyn_cast<SparcMCExpr>(Expr))
+    if (const MCSpecifierExpr *SE = dyn_cast<MCSpecifierExpr>(Expr))
       return hasGOTReference(SE->getSubExpr());
     break;
 
@@ -1668,8 +1669,8 @@ static bool hasGOTReference(const MCExpr *Expr) {
   return false;
 }
 
-const SparcMCExpr *SparcAsmParser::adjustPICRelocation(uint16_t RelType,
-                                                       const MCExpr *subExpr) {
+const MCSpecifierExpr *
+SparcAsmParser::adjustPICRelocation(uint16_t RelType, const MCExpr *subExpr) {
   // When in PIC mode, "%lo(...)" and "%hi(...)" behave differently.
   // If the expression refers contains _GLOBAL_OFFSET_TABLE, it is
   // actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index bef7f3c02dae..2a581d381d4a 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/SparcFixupKinds.h"
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "MCTargetDesc/SparcMCTargetDesc.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -72,7 +72,7 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup,
   if (mc::isRelocation(Fixup.getKind()))
     return Kind;
 
-  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Fixup.getValue())) {
+  if (const auto *SExpr = dyn_cast<MCSpecifierExpr>(Fixup.getValue())) {
     if (SExpr->getSpecifier() == ELF::R_SPARC_DISP32)
       return ELF::R_SPARC_DISP32;
   }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 800567bf58ff..36365593e246 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "SparcMCAsmInfo.h"
-#include "SparcMCExpr.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCExpr.h"
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index 7ea800f11917..a4a2fa3f9933 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -36,6 +36,11 @@ public:
                           const MCSpecifierExpr &Expr) const override;
 };
 
+namespace Sparc {
+uint16_t parseSpecifier(StringRef name);
+StringRef getSpecifierName(uint16_t S);
+} // namespace Sparc
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 4ce9bea5d795..8ba99719946a 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -11,7 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/SparcFixupKinds.h"
-#include "SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "SparcMCTargetDesc.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -134,7 +134,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
 
   assert(MO.isExpr());
   const MCExpr *Expr = MO.getExpr();
-  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+  if (auto *SExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
     Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
@@ -164,7 +164,7 @@ unsigned SparcMCCodeEmitter::getSImm5OpValue(const MCInst &MI, unsigned OpNo,
   if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
     return CE->getValue();
 
-  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+  if (auto *SExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
     Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
@@ -190,7 +190,7 @@ SparcMCCodeEmitter::getSImm13OpValue(const MCInst &MI, unsigned OpNo,
   if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
     return CE->getValue();
 
-  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+  if (auto *SExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
     Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 6d43b9371390..1ee6e8098560 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
deleted file mode 100644
index 8e7c173c70cc..000000000000
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//====- SparcMCExpr.h - Sparc specific MC expression classes --*- C++ -*-=====//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes Sparc-specific MCExprs, used for modifiers like
-// "%hi" or "%lo" etc.,
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
-#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
-
-#include "SparcFixupKinds.h"
-#include "llvm/MC/MCExpr.h"
-
-namespace llvm {
-
-class StringRef;
-using SparcMCExpr = MCSpecifierExpr;
-
-namespace Sparc {
-uint16_t parseSpecifier(StringRef name);
-StringRef getSpecifierName(uint16_t S);
-} // namespace Sparc
-
-} // end namespace llvm.
-
-#endif
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index f4201f9a8dc1..5366e905d6df 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/SparcInstPrinter.h"
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "MCTargetDesc/SparcMCTargetDesc.h"
 #include "MCTargetDesc/SparcTargetStreamer.h"
 #include "Sparc.h"
@@ -82,7 +82,7 @@ public:
 static MCOperand createSparcMCOperand(uint16_t Kind, MCSymbol *Sym,
                                       MCContext &OutContext) {
   const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext);
-  const SparcMCExpr *expr = MCSpecifierExpr::create(MCSym, Kind, OutContext);
+  auto *expr = MCSpecifierExpr::create(MCSym, Kind, OutContext);
   return MCOperand::createExpr(expr);
 }
 static MCOperand createPCXCallOP(MCSymbol *Label,
@@ -101,7 +101,7 @@ static MCOperand createPCXRelExprOp(uint16_t Spec, MCSymbol *GOTLabel,
 
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext);
   const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext);
-  const SparcMCExpr *expr = MCSpecifierExpr::create(Add, Spec, OutContext);
+  auto *expr = MCSpecifierExpr::create(Add, Spec, OutContext);
   return MCOperand::createExpr(expr);
 }
 
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index a6ea07974609..21ecf3d5ed70 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SparcISelLowering.h"
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "MCTargetDesc/SparcMCTargetDesc.h"
 #include "SparcMachineFunctionInfo.h"
 #include "SparcRegisterInfo.h"
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index a42a67d91d84..711bf9b31a37 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SparcTargetObjectFile.h"
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"

From 0c608175c11cf0ce797be7575a7c8d8ebcdecbd8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 17:43:04 +0100
Subject: [PATCH 0671/1322] [X86] matchLogicBlend - convert to SDPatternMatch
 matching. NFC. (#144546)

Removes a LOT of commutative matching.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 36 +++++--------------------
 1 file changed, 7 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4cff42c2ac46..7f425b3d479d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52084,36 +52084,14 @@ static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
   return DAG.getNode(ISD::OR, DL, VT, X, Y);
 }
 
-// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
+// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
+// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
+// Waiting for ANDNP combine allows other combines to happen that prevent
+// matching.
 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
-  if (N->getOpcode() != ISD::OR)
-    return false;
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  // Canonicalize AND to LHS.
-  if (N1.getOpcode() == ISD::AND)
-    std::swap(N0, N1);
-
-  // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
-  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
-    return false;
-
-  Mask = N1.getOperand(0);
-  X = N1.getOperand(1);
-
-  // Check to see if the mask appeared in both the AND and ANDNP.
-  if (N0.getOperand(0) == Mask)
-    Y = N0.getOperand(1);
-  else if (N0.getOperand(1) == Mask)
-    Y = N0.getOperand(0);
-  else
-    return false;
-
-  // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
-  // ANDNP combine allows other combines to happen that prevent matching.
-  return true;
+  using namespace SDPatternMatch;
+  return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
+                          m_And(m_Deferred(Mask), m_Value(Y))));
 }
 
 // Try to fold:

From b14e03d8555043bc35e9c75fff7f52d28950b3ab Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Tue, 17 Jun 2025 18:44:37 +0200
Subject: [PATCH 0672/1322] [LLDB] Consolidate C++ string buffer summaries
 (#144258)

As part of https://github.com/llvm/llvm-project/pull/143177, I moved the
non-libc++ specific formatting of `std::string`s out to `CxxStringTypes`
as MSVC's STL `std::string` can also be thought of a pointer+size pair.
I named this kind of string "string buffer".

This PR picks that change, so the MSVC PR can be smaller.
Unfortunately, libstdc++'s `std::string` does not fit this (it also uses
a different string printer function).

This resolves two FIXMEs in the libc++ tests, where empty u16 and u32
strings didn't have any prefix (u/U).
---
 .../Language/CPlusPlus/CxxStringTypes.cpp     | 102 +++++++++---
 .../Language/CPlusPlus/CxxStringTypes.h       |  29 ++++
 .../Plugins/Language/CPlusPlus/LibCxx.cpp     | 147 ++++--------------
 .../string/TestDataFormatterLibcxxString.py   |   8 +-
 .../TestDataFormatterLibcxxStringView.py      |   8 +-
 5 files changed, 148 insertions(+), 146 deletions(-)

diff --git a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp
index fc17b76804d9..bf8c39344590 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp
@@ -116,15 +116,7 @@ bool lldb_private::formatters::WCharStringSummaryProvider(
     return false;
 
   // Get a wchar_t basic type from the current type system
-  CompilerType wchar_compiler_type =
-      valobj.GetCompilerType().GetBasicTypeFromAST(lldb::eBasicTypeWChar);
-
-  if (!wchar_compiler_type)
-    return false;
-
-  // Safe to pass nullptr for exe_scope here.
-  std::optional<uint64_t> size =
-      llvm::expectedToOptional(wchar_compiler_type.GetBitSize(nullptr));
+  std::optional<uint64_t> size = GetWCharByteSize(valobj);
   if (!size)
     return false;
   const uint32_t wchar_size = *size;
@@ -136,13 +128,13 @@ bool lldb_private::formatters::WCharStringSummaryProvider(
   options.SetPrefixToken("L");
 
   switch (wchar_size) {
-  case 8:
+  case 1:
     return StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF8>(
         options);
-  case 16:
+  case 2:
     return StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF16>(
         options);
-  case 32:
+  case 4:
     return StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF32>(
         options);
   default:
@@ -177,15 +169,7 @@ bool lldb_private::formatters::WCharSummaryProvider(
     return false;
 
   // Get a wchar_t basic type from the current type system
-  CompilerType wchar_compiler_type =
-      valobj.GetCompilerType().GetBasicTypeFromAST(lldb::eBasicTypeWChar);
-
-  if (!wchar_compiler_type)
-    return false;
-
-    // Safe to pass nullptr for exe_scope here.
-  std::optional<uint64_t> size =
-      llvm::expectedToOptional(wchar_compiler_type.GetBitSize(nullptr));
+  std::optional<uint64_t> size = GetWCharByteSize(valobj);
   if (!size)
     return false;
   const uint32_t wchar_size = *size;
@@ -199,13 +183,13 @@ bool lldb_private::formatters::WCharSummaryProvider(
   options.SetBinaryZeroIsTerminator(false);
 
   switch (wchar_size) {
-  case 8:
+  case 1:
     return StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF8>(
         options);
-  case 16:
+  case 2:
     return StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF16>(
         options);
-  case 32:
+  case 4:
     return StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF32>(
         options);
   default:
@@ -214,3 +198,73 @@ bool lldb_private::formatters::WCharSummaryProvider(
   }
   return true;
 }
+
+std::optional<uint64_t>
+lldb_private::formatters::GetWCharByteSize(ValueObject &valobj) {
+  return llvm::expectedToOptional(
+      valobj.GetCompilerType()
+          .GetBasicTypeFromAST(lldb::eBasicTypeWChar)
+          .GetByteSize(nullptr));
+}
+
+template <StringPrinter::StringElementType element_type>
+bool lldb_private::formatters::StringBufferSummaryProvider(
+    Stream &stream, const TypeSummaryOptions &summary_options,
+    lldb::ValueObjectSP location_sp, uint64_t size, std::string prefix_token) {
+
+  if (size == 0) {
+    stream.PutCString(prefix_token);
+    stream.PutCString("\"\"");
+    return true;
+  }
+
+  if (!location_sp)
+    return false;
+
+  StringPrinter::ReadBufferAndDumpToStreamOptions options(*location_sp);
+
+  if (summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryCapped) {
+    const auto max_size =
+        location_sp->GetTargetSP()->GetMaximumSizeOfStringSummary();
+    if (size > max_size) {
+      size = max_size;
+      options.SetIsTruncated(true);
+    }
+  }
+
+  {
+    DataExtractor extractor;
+    const size_t bytes_read = location_sp->GetPointeeData(extractor, 0, size);
+    if (bytes_read < size)
+      return false;
+
+    options.SetData(std::move(extractor));
+  }
+  options.SetStream(&stream);
+  if (prefix_token.empty())
+    options.SetPrefixToken(nullptr);
+  else
+    options.SetPrefixToken(prefix_token);
+  options.SetQuote('"');
+  options.SetSourceSize(size);
+  options.SetBinaryZeroIsTerminator(false);
+  return StringPrinter::ReadBufferAndDumpToStream<element_type>(options);
+}
+
+// explicit instantiations for all string element types
+template bool
+lldb_private::formatters::StringBufferSummaryProvider<StringElementType::ASCII>(
+    Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t,
+    std::string);
+template bool
+lldb_private::formatters::StringBufferSummaryProvider<StringElementType::UTF8>(
+    Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t,
+    std::string);
+template bool
+lldb_private::formatters::StringBufferSummaryProvider<StringElementType::UTF16>(
+    Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t,
+    std::string);
+template bool
+lldb_private::formatters::StringBufferSummaryProvider<StringElementType::UTF32>(
+    Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t,
+    std::string);
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h
index a2b606d28cac..337dcf2fefdc 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h
@@ -10,6 +10,7 @@
 #ifndef LLDB_SOURCE_PLUGINS_LANGUAGE_CPLUSPLUS_CXXSTRINGTYPES_H
 #define LLDB_SOURCE_PLUGINS_LANGUAGE_CPLUSPLUS_CXXSTRINGTYPES_H
 
+#include "lldb/DataFormatters/StringPrinter.h"
 #include "lldb/DataFormatters/TypeSummary.h"
 #include "lldb/Utility/Stream.h"
 #include "lldb/ValueObject/ValueObject.h"
@@ -43,6 +44,34 @@ bool Char32SummaryProvider(ValueObject &valobj, Stream &stream,
 bool WCharSummaryProvider(ValueObject &valobj, Stream &stream,
                           const TypeSummaryOptions &options); // wchar_t
 
+std::optional<uint64_t> GetWCharByteSize(ValueObject &valobj);
+
+/// Print a summary for a string buffer to \a stream.
+///
+/// \param[in] stream
+///     The output stream to print the summary to.
+///
+/// \param[in] summary_options
+///     Options for printing the string contents. This function respects the
+///     capping.
+///
+/// \param[in] location_sp
+///     ValueObject of a pointer to the string being printed.
+///
+/// \param[in] size
+///     The size of the buffer pointed to by \a location_sp.
+///
+/// \param[in] prefix_token
+///     A prefix before the double quotes (e.g. 'u' results in u"...").
+///
+/// \return
+///     Returns whether the string buffer was successfully printed.
+template <StringPrinter::StringElementType element_type>
+bool StringBufferSummaryProvider(Stream &stream,
+                                 const TypeSummaryOptions &summary_options,
+                                 lldb::ValueObjectSP location_sp, uint64_t size,
+                                 std::string prefix_token);
+
 } // namespace formatters
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
index 358cf7d78fa2..7143089209dd 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
@@ -24,6 +24,7 @@
 #include "lldb/ValueObject/ValueObject.h"
 #include "lldb/ValueObject/ValueObjectConstResult.h"
 
+#include "Plugins/Language/CPlusPlus/CxxStringTypes.h"
 #include "Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h"
 #include "Plugins/TypeSystem/Clang/TypeSystemClang.h"
 #include "lldb/lldb-enumerations.h"
@@ -535,70 +536,6 @@ ExtractLibcxxStringInfo(ValueObject &valobj) {
   return std::make_pair(size, location_sp);
 }
 
-static bool
-LibcxxWStringSummaryProvider(ValueObject &valobj, Stream &stream,
-                             const TypeSummaryOptions &summary_options,
-                             ValueObjectSP location_sp, size_t size) {
-  if (size == 0) {
-    stream.Printf("L\"\"");
-    return true;
-  }
-  if (!location_sp)
-    return false;
-
-  StringPrinter::ReadBufferAndDumpToStreamOptions options(valobj);
-  if (summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryCapped) {
-    const auto max_size = valobj.GetTargetSP()->GetMaximumSizeOfStringSummary();
-    if (size > max_size) {
-      size = max_size;
-      options.SetIsTruncated(true);
-    }
-  }
-
-  DataExtractor extractor;
-  const size_t bytes_read = location_sp->GetPointeeData(extractor, 0, size);
-  if (bytes_read < size)
-    return false;
-
-  // std::wstring::size() is measured in 'characters', not bytes
-  TypeSystemClangSP scratch_ts_sp =
-      ScratchTypeSystemClang::GetForTarget(*valobj.GetTargetSP());
-  if (!scratch_ts_sp)
-    return false;
-
-  auto wchar_t_size =
-      scratch_ts_sp->GetBasicType(lldb::eBasicTypeWChar).GetByteSize(nullptr);
-  if (!wchar_t_size)
-    return false;
-
-  options.SetData(std::move(extractor));
-  options.SetStream(&stream);
-  options.SetPrefixToken("L");
-  options.SetQuote('"');
-  options.SetSourceSize(size);
-  options.SetBinaryZeroIsTerminator(false);
-
-  switch (*wchar_t_size) {
-  case 1:
-    return StringPrinter::ReadBufferAndDumpToStream<
-        lldb_private::formatters::StringPrinter::StringElementType::UTF8>(
-        options);
-    break;
-
-  case 2:
-    return StringPrinter::ReadBufferAndDumpToStream<
-        lldb_private::formatters::StringPrinter::StringElementType::UTF16>(
-        options);
-    break;
-
-  case 4:
-    return StringPrinter::ReadBufferAndDumpToStream<
-        lldb_private::formatters::StringPrinter::StringElementType::UTF32>(
-        options);
-  }
-  return false;
-}
-
 bool lldb_private::formatters::LibcxxWStringSummaryProvider(
     ValueObject &valobj, Stream &stream,
     const TypeSummaryOptions &summary_options) {
@@ -609,52 +546,22 @@ bool lldb_private::formatters::LibcxxWStringSummaryProvider(
   ValueObjectSP location_sp;
   std::tie(size, location_sp) = *string_info;
 
-  return ::LibcxxWStringSummaryProvider(valobj, stream, summary_options,
-                                        location_sp, size);
-}
-
-template <StringPrinter::StringElementType element_type>
-static bool
-LibcxxStringSummaryProvider(ValueObject &valobj, Stream &stream,
-                            const TypeSummaryOptions &summary_options,
-                            std::string prefix_token, ValueObjectSP location_sp,
-                            uint64_t size) {
-
-  if (size == 0) {
-    stream.Printf("\"\"");
-    return true;
-  }
-
-  if (!location_sp)
+  auto wchar_t_size = GetWCharByteSize(valobj);
+  if (!wchar_t_size)
     return false;
 
-  StringPrinter::ReadBufferAndDumpToStreamOptions options(valobj);
-
-  if (summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryCapped) {
-    const auto max_size = valobj.GetTargetSP()->GetMaximumSizeOfStringSummary();
-    if (size > max_size) {
-      size = max_size;
-      options.SetIsTruncated(true);
-    }
+  switch (*wchar_t_size) {
+  case 1:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF8>(
+        stream, summary_options, location_sp, size, "L");
+  case 2:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF16>(
+        stream, summary_options, location_sp, size, "L");
+  case 4:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF32>(
+        stream, summary_options, location_sp, size, "L");
   }
-
-  {
-    DataExtractor extractor;
-    const size_t bytes_read = location_sp->GetPointeeData(extractor, 0, size);
-    if (bytes_read < size)
-      return false;
-
-    options.SetData(std::move(extractor));
-  }
-  options.SetStream(&stream);
-  if (prefix_token.empty())
-    options.SetPrefixToken(nullptr);
-  else
-    options.SetPrefixToken(prefix_token);
-  options.SetQuote('"');
-  options.SetSourceSize(size);
-  options.SetBinaryZeroIsTerminator(false);
-  return StringPrinter::ReadBufferAndDumpToStream<element_type>(options);
+  return false;
 }
 
 template <StringPrinter::StringElementType element_type>
@@ -669,8 +576,8 @@ LibcxxStringSummaryProvider(ValueObject &valobj, Stream &stream,
   ValueObjectSP location_sp;
   std::tie(size, location_sp) = *string_info;
 
-  return LibcxxStringSummaryProvider<element_type>(
-      valobj, stream, summary_options, prefix_token, location_sp, size);
+  return StringBufferSummaryProvider<element_type>(
+      stream, summary_options, location_sp, size, prefix_token);
 }
 template <StringPrinter::StringElementType element_type>
 static bool formatStringImpl(ValueObject &valobj, Stream &stream,
@@ -742,8 +649,8 @@ static bool formatStringViewImpl(ValueObject &valobj, Stream &stream,
     return true;
   }
 
-  return LibcxxStringSummaryProvider<element_type>(
-      valobj, stream, summary_options, prefix_token, dataobj, size);
+  return StringBufferSummaryProvider<element_type>(stream, summary_options,
+                                                   dataobj, size, prefix_token);
 }
 
 bool lldb_private::formatters::LibcxxStringViewSummaryProviderASCII(
@@ -781,8 +688,22 @@ bool lldb_private::formatters::LibcxxWStringViewSummaryProvider(
     return true;
   }
 
-  return ::LibcxxWStringSummaryProvider(valobj, stream, summary_options,
-                                        dataobj, size);
+  auto wchar_t_size = GetWCharByteSize(valobj);
+  if (!wchar_t_size)
+    return false;
+
+  switch (*wchar_t_size) {
+  case 1:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF8>(
+        stream, summary_options, dataobj, size, "L");
+  case 2:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF16>(
+        stream, summary_options, dataobj, size, "L");
+  case 4:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF32>(
+        stream, summary_options, dataobj, size, "L");
+  }
+  return false;
 }
 
 static bool
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py
index 5c5cf4ca16b9..32764629d65a 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py
@@ -65,11 +65,9 @@ class LibcxxStringDataFormatterTestCase(TestBase):
                 '(%s::wstring) IHaveEmbeddedZerosToo = L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"'
                 % ns,
                 '(%s::u16string) u16_string = u"ß水氶"' % ns,
-                # FIXME: This should have a 'u' prefix.
-                '(%s::u16string) u16_empty = ""' % ns,
+                '(%s::u16string) u16_empty = u""' % ns,
                 '(%s::u32string) u32_string = U"🍄🍅🍆🍌"' % ns,
-                # FIXME: This should have a 'U' prefix.
-                '(%s::u32string) u32_empty = ""' % ns,
+                '(%s::u32string) u32_empty = U""' % ns,
                 "(%s::string *) null_str = nullptr" % ns,
             ],
         )
@@ -123,7 +121,7 @@ class LibcxxStringDataFormatterTestCase(TestBase):
                 % ns,
                 '(%s::u16string) u16_string = u"ß水氶"' % ns,
                 '(%s::u32string) u32_string = U"🍄🍅🍆🍌"' % ns,
-                '(%s::u32string) u32_empty = ""' % ns,
+                '(%s::u32string) u32_empty = U""' % ns,
                 "(%s::string *) null_str = nullptr" % ns,
             ],
         )
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py
index f8fc8ae66405..3883395f2392 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py
@@ -81,11 +81,11 @@ class LibcxxStringViewDataFormatterTestCase(TestBase):
             summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"',
         )
         self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"')
-        self.expect_var_path("u16_empty", type="std::u16string_view", summary='""')
+        self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""')
         self.expect_var_path(
             "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"'
         )
-        self.expect_var_path("u32_empty", type="std::u32string_view", summary='""')
+        self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""')
         self.expect_var_path(
             "oops", type="std::string_view", summary='"Hellooo World\\n"'
         )
@@ -145,11 +145,11 @@ class LibcxxStringViewDataFormatterTestCase(TestBase):
             summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"',
         )
         self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"')
-        self.expect_var_path("u16_empty", type="std::u16string_view", summary='""')
+        self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""')
         self.expect_var_path(
             "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"'
         )
-        self.expect_var_path("u32_empty", type="std::u32string_view", summary='""')
+        self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""')
 
         self.runCmd("cont")
         self.expect(

From 5baf351ba819e1e6bae0250492e85a2862ef406b Mon Sep 17 00:00:00 2001
From: yonghong-song <yhs@fb.com>
Date: Tue, 17 Jun 2025 09:51:40 -0700
Subject: [PATCH 0673/1322] [BPF] Do not allow gotol in the middle of asm insn
 (#144545)

Previously I accidentally allowed 'gotol' insn in the middle of asm insn
([1]). But actually 'gotol' is not allowed in the middle of any asm
insn, so remove it from isValidIdInMiddle().

[1] https://github.com/yonghong-song/llvm-project/commit/6c412b6c6faa2dabd8602d35d3f5e796fb1daf80
---
 llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 139ac429dd13..7d1819134d16 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -261,7 +261,6 @@ public:
         .Case("bswap32", true)
         .Case("bswap64", true)
         .Case("goto", true)
-        .Case("gotol", true)
         .Case("ll", true)
         .Case("skb", true)
         .Case("s", true)

From 556e69b7f4328a0d7c36c9d7ca0dd8f52f82ad71 Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Tue, 17 Jun 2025 17:52:03 +0100
Subject: [PATCH 0674/1322] [lldb] make lit use the same Python executable for
 building and testing (#143756)

When testing LLDB, we want to make sure to use the same Python as the
one we used to build it.

This patch uses the CMake variable `Python3_ROOT_DIR` to add the correct
Python to the `PATH` in LLDB lit tests, in order to ensure of this.

Please see https://github.com/swiftlang/swift/pull/82063 for the
original issue.

This is a continuation of https://github.com/swiftlang/swift/pull/82063.
---
 lldb/cmake/modules/FindPythonAndSwig.cmake | 4 +++-
 lldb/test/API/lit.cfg.py                   | 7 +++++++
 lldb/test/API/lit.site.cfg.py.in           | 1 +
 lldb/test/Shell/lit.cfg.py                 | 3 +++
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/lldb/cmake/modules/FindPythonAndSwig.cmake b/lldb/cmake/modules/FindPythonAndSwig.cmake
index 1f6f553e8604..b478038f144d 100644
--- a/lldb/cmake/modules/FindPythonAndSwig.cmake
+++ b/lldb/cmake/modules/FindPythonAndSwig.cmake
@@ -6,7 +6,9 @@
 
 macro(FindPython3)
   # Use PYTHON_HOME as a hint to find Python 3.
-  set(Python3_ROOT_DIR "${PYTHON_HOME}")
+  if(NOT Python3_ROOT_DIR)
+    set(Python3_ROOT_DIR "${PYTHON_HOME}")
+  endif()
   find_package(Python3 COMPONENTS Interpreter Development)
   if(Python3_FOUND AND Python3_Interpreter_FOUND)
 
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 646a446c86fd..83713213ce1f 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -349,3 +349,10 @@ if platform.system() == "Windows":
     for v in ["SystemDrive"]:
         if v in os.environ:
             config.environment[v] = os.environ[v]
+
+# Some steps required to initialize the tests dynamically link with python.dll
+# and need to know the location of the Python libraries. This ensures that we
+# use the same version of Python that was used to build lldb to run our tests.
+config.environment["PATH"] = os.path.pathsep.join(
+    (config.python_root_dir, config.environment.get("PATH", ""))
+)
diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in
index 8552d17d6663..86d58889cc4a 100644
--- a/lldb/test/API/lit.site.cfg.py.in
+++ b/lldb/test/API/lit.site.cfg.py.in
@@ -20,6 +20,7 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.lldb_build_directory = "@LLDB_TEST_BUILD_DIRECTORY@"
 config.python_executable = "@LLDB_PYTHON_API_TEST_EXECUTABLE@"
+config.python_root_dir = "@Python3_ROOT_DIR@"
 config.lua_executable = "@LUA_EXECUTABLE@"
 config.lldb_lua_cpath = "@LLDB_LUA_CPATH@"
 config.lua_test_entry = "TestLuaAPI.py"
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py
index ab6113767187..8c9448b23c56 100644
--- a/lldb/test/Shell/lit.cfg.py
+++ b/lldb/test/Shell/lit.cfg.py
@@ -203,3 +203,6 @@ if platform.system() == "Darwin":
 # location of the Python libraries. This ensures that we use the same
 # version of Python that was used to build lldb to run our tests.
 config.environment["PYTHONHOME"] = config.python_root_dir
+config.environment["PATH"] = os.path.pathsep.join(
+    (config.python_root_dir, config.environment.get("PATH", ""))
+)

From 8063bd153c6aca43869d96aee64aeceb9be98ca5 Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.patel@intel.com>
Date: Tue, 17 Jun 2025 09:55:02 -0700
Subject: [PATCH 0675/1322] [MLIR][XeGPU] Add support for elementwise ops in Wg
 to Sg distribute pass [1/N] (#142797)

This PR adds support for Elementwise operations' (unary & binary)
lowering from Workgroup to Subgroup.
---
 .../Transforms/XeGPUWgToSgDistribute.cpp      |  89 +++++++++-
 .../XeGPU/xegpu-wg-to-sg-elemwise.mlir        | 164 ++++++++++++++++++
 2 files changed, 252 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index a26c6b52f0dd..e3563d10bc6f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -8,10 +8,12 @@
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 
 #include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -19,6 +21,7 @@
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include <optional>
 
 namespace mlir {
 namespace xegpu {
@@ -328,6 +331,65 @@ struct WgToSgPrefetchNdOp : public OpConversionPattern<xegpu::PrefetchNdOp> {
   }
 };
 
+// This pattern transforms elementwise ops to work at subgroup level.
+struct WgToSgElementwiseOp : public ConversionPattern {
+  WgToSgElementwiseOp(MLIRContext *ctx)
+      : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<ValueRange> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Only match ops with elementwise trait and single result.
+    if (!OpTrait::hasElementwiseMappableTraits(op) || op->getNumResults() != 1)
+      return failure();
+
+    auto resultType = dyn_cast<VectorType>(op->getResult(0).getType());
+    assert(resultType && "Expected result to be a VectorType");
+
+    ArrayRef<int64_t> wgShape = resultType.getShape();
+
+    xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op->getResult(0));
+    if (!layout || !layout.getSgLayout())
+      return failure();
+
+    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
+
+    size_t numVariants = operands.empty() ? 0 : operands.front().size();
+
+    if (llvm::any_of(operands, [&](const ValueRange &operandVec) {
+          return operandVec.size() != numVariants;
+        }))
+      return failure();
+
+    SmallVector<Value> newResults;
+    VectorType newResultType =
+        VectorType::get(sgShape, resultType.getElementType());
+
+    for (size_t i = 0; i < numVariants; ++i) {
+      SmallVector<Value> opOperands;
+      for (auto &operandVec : operands)
+        opOperands.push_back(operandVec[i]);
+
+      OperationState state(op->getLoc(), op->getName());
+      state.addOperands(opOperands);
+      state.addTypes(newResultType);
+      // Copy all attributes, but update "layout_result_0" to drop
+      // sgLayout/sgData
+      for (auto attr : op->getAttrs()) {
+        if (auto layout = dyn_cast<xegpu::LayoutAttr>(attr.getValue()))
+          state.addAttribute(attr.getName(), layout.dropSgLayoutAndData());
+        else
+          state.addAttribute(attr.getName(), attr.getValue());
+      }
+      Operation *newOp = rewriter.create(state);
+      newResults.push_back(newOp->getResult(0));
+    }
+
+    rewriter.replaceOpWithMultiple(op, {newResults});
+    return success();
+  }
+};
+
 // Handles UnrealizedConversionCastOp generated during
 // SCFStructuralTypeConversions (step 1). This op may appear as either a
 // target or source materialization for Vector values, e.g.:
@@ -411,7 +473,8 @@ namespace xegpu {
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
   patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
                WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
-               UnrealizedConversionCastOpPattern>(patterns.getContext());
+               UnrealizedConversionCastOpPattern, WgToSgElementwiseOp>(
+      patterns.getContext());
 }
 } // namespace xegpu
 } // namespace mlir
@@ -518,6 +581,30 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
     return isLegal(layout);
   });
 
+  target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
+      [=](Operation *op) -> std::optional<bool> {
+        // Only handle elementwise mappable ops
+        if (!OpTrait::hasElementwiseMappableTraits(op))
+          return true;
+
+        VectorType resultType =
+            dyn_cast<VectorType>(op->getResult(0).getType());
+        if (!resultType)
+          return true;
+
+        // Check if all operands are vectors of the same shape
+        // TODO: Support other types.
+        for (Value operand : op->getOperands()) {
+          VectorType operandType = dyn_cast<VectorType>(operand.getType());
+          if (!operandType || operandType.getShape() != resultType.getShape()) {
+            return true;
+          }
+        }
+
+        xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op->getResult(0));
+        return isLegal(layout);
+      });
+
   target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
       [=](UnrealizedConversionCastOp op) {
         return llvm::is_contained(existingCastOps, op.getOperation());
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
new file mode 100644
index 000000000000..64f01d61d6e8
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -0,0 +1,164 @@
+// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
+
+gpu.module @test_elementwise_ops {
+  // CHECK-LABEL: unary_ops
+  gpu.func @unary_ops(%a: memref<24x32xf32>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    // CHECK: math.exp {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
+    %exp = math.exp %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    // CHECK: arith.negf {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
+    %negf = arith.negf %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    gpu.return
+  }
+
+  // CHECK-LABEL: binary_ops
+  gpu.func @binary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    // CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32>
+    %addf = arith.addf %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    // CHECK: math.powf {{.*}}, {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32>
+    %powf = math.powf %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    gpu.return
+  }
+
+  // CHECK-LABEL: ternary_ops
+  gpu.func @ternary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi1>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi1>
+      -> !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_c = xegpu.load_nd %tdesc_c
+      : !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xi1>
+    // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xi1>, vector<12x8xf32>
+    %select = arith.select %load_c, %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xi1>, vector<24x32xf32>
+    // CHECK: math.fma  {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32>
+    %fma = math.fma %load_a, %load_b, %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    gpu.return
+  }
+
+  // CHECK-LABEL: type_conversion_ops
+  gpu.func @type_conversion_ops(%a: memref<24x32xf32>, %b: memref<24x32xi32>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xi32>
+      -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xi32>
+    // CHECK: arith.truncf {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32> to vector<12x8xf16>
+    %truncf = arith.truncf %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32> to vector<24x32xf16>
+    // CHECK: arith.bitcast {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xi32> to vector<12x8xf32>
+    %bitcast = arith.bitcast %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xi32> to vector<24x32xf32>
+    gpu.return
+  }
+
+  // CHECK-LABEL: comparison_ops
+  gpu.func @comparison_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi32>, %d: memref<24x32xi32>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi32>
+      -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_d = xegpu.create_nd_tdesc %d[0, 0] : memref<24x32xi32>
+      -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_c = xegpu.load_nd %tdesc_c
+      : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xi32>
+    %load_d = xegpu.load_nd %tdesc_d
+      : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xi32>
+    // CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32>
+    %cmpf = arith.cmpf ult, %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    // CHECK: arith.cmpi eq, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xi32>
+    %cmpi = arith.cmpi eq, %load_c, %load_d
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xi32>
+    gpu.return
+  }
+
+  // 1 to N decomposition of elementwise operations
+  // CHECK-LABEL: elementwise_ops_rr_assignment
+  gpu.func @elementwise_ops_rr_assignment(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
+     %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
+    // CHECK-SAME-COUNT-12: : vector<2x2xf32>
+    // CHECK-NOT: arith.negf
+    %negf = arith.negf %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
+    // CHECK-SAME-COUNT-12: : vector<2x2xf32>
+    // CHECK-NOT: math.powf
+    %powf = math.powf %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    gpu.return
+  }
+}

From 01a7a21a4b8070a88e5dcc9753066e38d26faf85 Mon Sep 17 00:00:00 2001
From: Chris B <chris.bieneman@me.com>
Date: Tue, 17 Jun 2025 12:04:04 -0500
Subject: [PATCH 0676/1322] [CMake] Add BINARY_DIR argument for
 add_lit_testsuites (#144431)

We're doing some slightly odd things with LIT in the offload-test-suite.
Specifically we generate multiple binary directories to configure and
run tests with different configurations from the same source root.

In this configuration the subdirectory targets need to instead point to
the correct generated binary directory and use test filtering to get a
subset of tests.
---
 llvm/cmake/modules/AddLLVM.cmake | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 7a7340ff8a45..8d8a94d1cddc 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -2192,7 +2192,7 @@ endfunction()
 
 function(add_lit_testsuites project directory)
   if (NOT LLVM_ENABLE_IDE)
-    cmake_parse_arguments(ARG "EXCLUDE_FROM_CHECK_ALL" "FOLDER" "PARAMS;DEPENDS;ARGS" ${ARGN})
+    cmake_parse_arguments(ARG "EXCLUDE_FROM_CHECK_ALL" "FOLDER;BINARY_DIR" "PARAMS;DEPENDS;ARGS" ${ARGN})
 
     if (NOT ARG_FOLDER)
       get_subproject_title(subproject_title)
@@ -2213,13 +2213,18 @@ function(add_lit_testsuites project directory)
       endif()
 
       # Create a check- target for the directory.
-      string(REPLACE ${directory} "" name_slash ${lit_suite})
+      string(REPLACE "${directory}/" "" name_slash ${lit_suite})
       if (name_slash)
+        set(filter ${name_slash})
         string(REPLACE "/" "-" name_slash ${name_slash})
         string(REPLACE "\\" "-" name_dashes ${name_slash})
-        string(TOLOWER "${project}${name_dashes}" name_var)
+        string(TOLOWER "${project}-${name_dashes}" name_var)
+        set(lit_args ${lit_suite})
+        if (ARG_BINARY_DIR)
+          set(lit_args ${ARG_BINARY_DIR} --filter=${filter})
+        endif()
         add_lit_target("check-${name_var}" "Running lit suite ${lit_suite}"
-          ${lit_suite}
+          ${lit_args}
           ${EXCLUDE_FROM_CHECK_ALL}
           PARAMS ${ARG_PARAMS}
           DEPENDS ${ARG_DEPENDS}

From 526310e916af2073e30b57b678307ce94df803f3 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Tue, 17 Jun 2025 13:10:51 -0400
Subject: [PATCH 0677/1322] [Remarks] Elaborate on called intrinsics (#143985)

---
 llvm/lib/IR/DiagnosticInfo.cpp          |  4 ++++
 llvm/test/Transforms/GVN/opt-remarks.ll | 22 ++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index 0f1291b8bd8b..b94dcace5e3c 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
@@ -211,6 +212,9 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
   else if (isa<Constant>(V)) {
     raw_string_ostream OS(Val);
     V->printAsOperand(OS, /*PrintType=*/false);
+  } else if (auto *II = dyn_cast<IntrinsicInst>(V)) {
+    raw_string_ostream OS(Val);
+    OS << "call " << II->getCalledFunction()->getName();
   } else if (auto *I = dyn_cast<Instruction>(V)) {
     Val = I->getOpcodeName();
   } else if (auto *MD = dyn_cast<MetadataAsValue>(V)) {
diff --git a/llvm/test/Transforms/GVN/opt-remarks.ll b/llvm/test/Transforms/GVN/opt-remarks.ll
index 7c3f16917bc9..8fb2d5756f95 100644
--- a/llvm/test/Transforms/GVN/opt-remarks.ll
+++ b/llvm/test/Transforms/GVN/opt-remarks.ll
@@ -62,6 +62,19 @@
 ; YAML-NEXT:   - ClobberedBy:     store
 ; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 2, Column: 10 }
 ; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass:            gvn
+; YAML-NEXT: Name:            LoadClobbered
+; YAML-NEXT: Function:        lifetime_end
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'load of type '
+; YAML-NEXT:   - Type:            i8
+; YAML-NEXT:   - String:          ' not eliminated'
+; YAML-NEXT:   - String:          ' in favor of '
+; YAML-NEXT:   - OtherAccess:     store
+; YAML-NEXT:   - String:          ' because it is clobbered by '
+; YAML-NEXT:   - ClobberedBy:     call llvm.lifetime.end.p0
+; YAML-NEXT: ...
 
 define i32 @arg(ptr %p, i32 %i) {
 entry:
@@ -93,6 +106,15 @@ entry:
   %add = add i32 %load1, %load
   ret i32 %add
 }
+
+define i8 @lifetime_end(ptr %p, i8 %val) {
+  call void @llvm.lifetime.start.p0(i64 32, ptr %p)
+  store i8 %val, ptr %p
+  call void @llvm.lifetime.end.p0(i64 32, ptr %p)
+  %1 = load i8, ptr %p
+  ret i8 %1
+}
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}

From ec230aa7a7d13c222c0b34b87c3c16937383b4a0 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Tue, 17 Jun 2025 22:49:55 +0530
Subject: [PATCH 0678/1322] [Driver] Add support for crtbegin.o, crtend.o and
 libgloss lib to BareMetal toolchain object (#121830)

This patch conditionalise the addition of crt{begin,end}.o object files
along
with addition of -lgloss lib based on whether libc selected is newlib or
llvm
libc. Since there is no way a user can specify which libc it wants to
link
against, currently passing valid GCCInstallation to driver will select
newlib
otherwise it will default to llvm libc.

Moreover, this patch makes gnuld the default linker for baremetal
toolchain
object. User need to pass `-fuse-ld=lld` explicitly to driver to select
lld

This is the 2nd patch in the series of patches of merging RISCVToolchain
into
BareMetal toolchain object.

RFC:

https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524
---
 clang/lib/Driver/ToolChains/BareMetal.cpp   | 37 +++++++-
 clang/lib/Driver/ToolChains/BareMetal.h     |  3 +-
 clang/test/Driver/aarch64-toolchain-extra.c | 13 ++-
 clang/test/Driver/aarch64-toolchain.c       | 95 ++++++++++++++++++++
 clang/test/Driver/arm-toolchain-extra.c     |  7 ++
 clang/test/Driver/arm-toolchain.c           | 99 ++++++++++++++++++++-
 clang/test/Driver/baremetal.cpp             |  3 +-
 clang/test/Driver/sanitizer-ld.c            |  2 +-
 8 files changed, 246 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index 0fbfe6c77f34..a08bb588dd76 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -584,9 +584,31 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Arch == llvm::Triple::aarch64_be ? "-EB" : "-EL");
   }
 
-  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles,
-                   options::OPT_r)) {
-    CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt0.o")));
+  bool NeedCRTs =
+      !Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles);
+
+  const char *CRTBegin, *CRTEnd;
+  if (NeedCRTs) {
+    if (!Args.hasArg(options::OPT_r))
+      CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt0.o")));
+    if (TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) {
+      auto RuntimeLib = TC.GetRuntimeLibType(Args);
+      switch (RuntimeLib) {
+      case (ToolChain::RLT_Libgcc): {
+        CRTBegin = "crtbegin.o";
+        CRTEnd = "crtend.o";
+        break;
+      }
+      case (ToolChain::RLT_CompilerRT): {
+        CRTBegin =
+            TC.getCompilerRTArgString(Args, "crtbegin", ToolChain::FT_Object);
+        CRTEnd =
+            TC.getCompilerRTArgString(Args, "crtend", ToolChain::FT_Object);
+        break;
+      }
+      }
+      CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTBegin)));
+    }
   }
 
   Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
@@ -609,15 +631,22 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
+    CmdArgs.push_back("--start-group");
     AddRunTimeLibs(TC, D, CmdArgs, Args);
-
     CmdArgs.push_back("-lc");
+    if (TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D))
+      CmdArgs.push_back("-lgloss");
+    CmdArgs.push_back("--end-group");
   }
 
   if (D.isUsingLTO())
     addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
                   D.getLTOMode() == LTOK_Thin);
 
+  if ((TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) &&
+      NeedCRTs)
+    CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd)));
+
   if (TC.getTriple().isRISCV())
     CmdArgs.push_back("-X");
 
diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h
index 930f8584e643..54805530bae8 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.h
+++ b/clang/lib/Driver/ToolChains/BareMetal.h
@@ -38,6 +38,7 @@ protected:
 public:
   bool initGCCInstallation(const llvm::Triple &Triple,
                            const llvm::opt::ArgList &Args);
+  bool hasValidGCCInstallation() const { return IsGCCInstallationValid; }
   bool isBareMetal() const override { return true; }
   bool isCrossCompiling() const override { return true; }
   bool HasNativeLLVMSupport() const override { return true; }
@@ -63,8 +64,6 @@ public:
     return ToolChain::CST_Libcxx;
   }
 
-  const char *getDefaultLinker() const override { return "ld.lld"; }
-
   void
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                             llvm::opt::ArgStringList &CC1Args) const override;
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
index 2610e962bd69..2a930e35acd4 100644
--- a/clang/test/Driver/aarch64-toolchain-extra.c
+++ b/clang/test/Driver/aarch64-toolchain-extra.c
@@ -15,14 +15,21 @@
 // RUN: mkdir -p %t/aarch64-nogcc/bin
 // RUN: ln -s %clang %t/aarch64-nogcc/bin/clang
 // RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf %t/aarch64-nogcc/aarch64-none-elf
+// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld %t/aarch64-nogcc/bin/aarch64-none-elf-ld
 // RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
 // RUN:    --gcc-toolchain=%t/aarch64-nogcc/invalid \
 // RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+// RUN:    | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOGCC %s
 
 // RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
 // RUN:    --sysroot=%t/aarch64-nogcc/bin/../aarch64-none-elf \
 // RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+// RUN:    | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOGCC %s
 
-// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
+// C-AARCH64-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/aarch64-none-elf-ld"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib/crt0.o"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtbegin.o"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib"
+// C-AARCH64-BAREMETAL-NOGCC: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtend.o"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
index 7f2c01d928e4..83cd95136b15 100644
--- a/clang/test/Driver/aarch64-toolchain.c
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -1,5 +1,24 @@
 // UNSUPPORTED: system-windows
 
+// Test interaction with -fuse-ld=lld
+// RUN: %clang -### %s -fuse-ld=lld -B%S/Inputs/lld \
+// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=LLD-AARCH64-BAREMETAL %s
+
+// LLD-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
+// LLD-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
+// LLD-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
+// LLD-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
+// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
+// LLD-AARCH64-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
+
 // RUN: %clang -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf --rtlib=libgcc \
 // RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
@@ -9,6 +28,14 @@
 // C-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
 // C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// C-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
+// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
+// C-AARCH64-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf --rtlib=libgcc \
@@ -18,6 +45,14 @@
 
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
@@ -29,6 +64,14 @@
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/backward"
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// CXX-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
+// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
+// CXX-AARCH64-BAREMETAL: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
@@ -40,6 +83,14 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/backward"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
@@ -50,6 +101,14 @@
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
@@ -59,3 +118,39 @@
 
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=compiler-rt \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-COMPILER-RT %s
+
+// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
+// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
+// AARCH64-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a" "-lc" "-lgloss" "--end-group"
+// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --unwindlib=libunwind \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-UNWINDLIB %s
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=compiler-rt --unwindlib=libunwind \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-UNWINDLIB %s
+
+// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}crt0.o"
+// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtbegin.o"
+// AARCH64-BAREMETAL-UNWINDLIB: "--start-group" "{{.*}}libclang_rt.builtins.a" "--as-needed" "-lunwind" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtend.o"
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
index 114de0a8154a..2adf4ab698ba 100644
--- a/clang/test/Driver/arm-toolchain-extra.c
+++ b/clang/test/Driver/arm-toolchain-extra.c
@@ -15,6 +15,7 @@
 // RUN: mkdir -p %t/arm-nogcc/bin
 // RUN: ln -s %clang %t/arm-nogcc/bin/clang
 // RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi %t/arm-nogcc/armv6m-none-eabi
+// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld %t/arm-nogcc/bin/armv6m-none-eabi-ld
 // RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
 // RUN:    --gcc-toolchain=%t/arm-nogcc/invalid \
 // RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
@@ -26,4 +27,10 @@
 // RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
 
 // C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/include"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/armv6m-none-eabi-ld"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib/crt0.o"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtbegin.o"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib"
+// C-ARM-BAREMETAL-NOGCC: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtend.o"
 
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
index 2e38461fb7a3..66bed1b0c4d8 100644
--- a/clang/test/Driver/arm-toolchain.c
+++ b/clang/test/Driver/arm-toolchain.c
@@ -1,5 +1,23 @@
 // UNSUPPORTED: system-windows
 
+// RUN: %clang -### %s -fuse-ld=lld -B%S/Inputs/lld \
+// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=LLD-ARM-BAREMETAL %s
+
+// LLD-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
+// LLD-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// LLD-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
+// LLD-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
+// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
+// LLD-ARM-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
+
 // RUN: %clang -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
 // RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
@@ -9,6 +27,14 @@
 // C-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// C-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
+// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
+// C-ARM-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
@@ -18,6 +44,14 @@
 
 // C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
+// C-ARM-BAREMETAL-NOSYSROOT: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
@@ -28,8 +62,17 @@
 // CXX-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/backward"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1" 
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// CXX-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
+// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
+// CXX-ARM-BAREMETAL: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
+
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
@@ -41,6 +84,14 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/backward"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
@@ -51,6 +102,14 @@
 // CXX-ARM-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
+// CXX-ARM-BAREMETAL-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
@@ -59,4 +118,40 @@
 // RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX %s
 
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=compiler-rt \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-COMPILER-RT %s
+
+// ARM-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
+// ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
+// ARM-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a" "-lc" "-lgloss" "--end-group"
+// ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --unwindlib=libunwind \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-UNWINDLIB %s
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=compiler-rt --unwindlib=libunwind \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-UNWINDLIB %s
+
+// ARM-BAREMETAL-UNWINDLIB: "{{.*}}crt0.o"
+// ARM-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtbegin.o"
+// ARM-BAREMETAL-UNWINDLIB: "--start-group" "{{.*}}libclang_rt.builtins.a" "--as-needed" "-lunwind" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// ARM-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtend.o"
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index 2ac83402dda3..eff8f775a9c1 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -319,7 +319,8 @@
 // CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV32-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
-// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-X" "-o" "a.out"
+// CHECK-RV32-LIBSTDCXX-SAME: "-lc"
+// CHECK-RV32-LIBSTDCXX-SAME: "-X" "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     -nostdlibinc -nobuiltininc \
diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index befd322d027c..d2e4877e89d7 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -1033,7 +1033,7 @@
 // RUN:     --target=riscv32-unknown-elf -fuse-ld=ld \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32
 // CHECK-SHADOWCALLSTACK-ELF-RISCV32-NOT: error:
-// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
+// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=riscv64-unknown-linux -fuse-ld=ld \

From 8513066f2c49457f5d1f63e275403330f854041c Mon Sep 17 00:00:00 2001
From: someoneinjd <someoneinjd@outlook.com>
Date: Wed, 18 Jun 2025 01:23:14 +0800
Subject: [PATCH 0679/1322] [clangd] Implement LSP 3.17 positionEncoding
 (#142903)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds support for the `positionEncoding` client capability
introduced in LSP 3.17. Clangd can now negotiate the position encoding
with the client during initialization.

Fix https://github.com/clangd/clangd/issues/1746

Co-authored-by: kadir çetinkaya <kadircetinkaya.06.tr@gmail.com>
---
 clang-tools-extra/clangd/ClangdLSPServer.cpp  | 10 ++++--
 clang-tools-extra/clangd/Protocol.cpp         | 20 +++++++++---
 clang-tools-extra/clangd/Protocol.h           |  5 +--
 .../clangd/test/positionencoding.test         | 32 +++++++++++++++++++
 4 files changed, 59 insertions(+), 8 deletions(-)
 create mode 100644 clang-tools-extra/clangd/test/positionencoding.test

diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp
index 29321f7cd3fa..a703009e2b46 100644
--- a/clang-tools-extra/clangd/ClangdLSPServer.cpp
+++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp
@@ -494,9 +494,9 @@ static std::vector<llvm::StringRef> semanticTokenModifiers() {
 void ClangdLSPServer::onInitialize(const InitializeParams &Params,
                                    Callback<llvm::json::Value> Reply) {
   // Determine character encoding first as it affects constructed ClangdServer.
-  if (Params.capabilities.offsetEncoding && !Opts.Encoding) {
+  if (Params.capabilities.PositionEncodings && !Opts.Encoding) {
     Opts.Encoding = OffsetEncoding::UTF16; // fallback
-    for (OffsetEncoding Supported : *Params.capabilities.offsetEncoding)
+    for (OffsetEncoding Supported : *Params.capabilities.PositionEncodings)
       if (Supported != OffsetEncoding::UnsupportedEncoding) {
         Opts.Encoding = Supported;
         break;
@@ -686,6 +686,9 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params,
   ServerCaps["executeCommandProvider"] =
       llvm::json::Object{{"commands", Commands}};
 
+  if (Opts.Encoding)
+    ServerCaps["positionEncoding"] = *Opts.Encoding;
+
   llvm::json::Object Result{
       {{"serverInfo",
         llvm::json::Object{
@@ -693,6 +696,9 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params,
             {"version", llvm::formatv("{0} {1} {2}", versionString(),
                                       featureString(), platformString())}}},
        {"capabilities", std::move(ServerCaps)}}};
+
+  // TODO: offsetEncoding capability is a deprecated clangd extension and should
+  // be deleted.
   if (Opts.Encoding)
     Result["offsetEncoding"] = *Opts.Encoding;
   Reply(std::move(Result));
diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp
index c9e8a175b5d7..2c858e28fa24 100644
--- a/clang-tools-extra/clangd/Protocol.cpp
+++ b/clang-tools-extra/clangd/Protocol.cpp
@@ -497,10 +497,19 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R,
       if (auto Cancel = StaleRequestSupport->getBoolean("cancel"))
         R.CancelsStaleRequests = *Cancel;
     }
+    if (auto *PositionEncodings = General->get("positionEncodings")) {
+      R.PositionEncodings.emplace();
+      if (!fromJSON(*PositionEncodings, *R.PositionEncodings,
+                    P.field("general").field("positionEncodings")))
+        return false;
+    }
   }
   if (auto *OffsetEncoding = O->get("offsetEncoding")) {
-    R.offsetEncoding.emplace();
-    if (!fromJSON(*OffsetEncoding, *R.offsetEncoding,
+    R.PositionEncodings.emplace();
+    elog("offsetEncoding capability is a deprecated clangd extension that'll "
+         "go away with clangd 23. Migrate to standard positionEncodings "
+         "capability introduced by LSP 3.17");
+    if (!fromJSON(*OffsetEncoding, *R.PositionEncodings,
                   P.field("offsetEncoding")))
       return false;
   }
@@ -536,8 +545,11 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R,
       }
     }
     if (auto *OffsetEncoding = Experimental->get("offsetEncoding")) {
-      R.offsetEncoding.emplace();
-      if (!fromJSON(*OffsetEncoding, *R.offsetEncoding,
+      R.PositionEncodings.emplace();
+      elog("offsetEncoding capability is a deprecated clangd extension that'll "
+           "go away with clangd 23. Migrate to standard positionEncodings "
+           "capability introduced by LSP 3.17");
+      if (!fromJSON(*OffsetEncoding, *R.PositionEncodings,
                     P.field("offsetEncoding")))
         return false;
     }
diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h
index 8a7809d6677e..3a6bf155ee15 100644
--- a/clang-tools-extra/clangd/Protocol.h
+++ b/clang-tools-extra/clangd/Protocol.h
@@ -528,8 +528,9 @@ struct ClientCapabilities {
   /// textDocument.semanticHighlightingCapabilities.semanticHighlighting
   bool TheiaSemanticHighlighting = false;
 
-  /// Supported encodings for LSP character offsets. (clangd extension).
-  std::optional<std::vector<OffsetEncoding>> offsetEncoding;
+  /// Supported encodings for LSP character offsets.
+  /// general.positionEncodings
+  std::optional<std::vector<OffsetEncoding>> PositionEncodings;
 
   /// The content format that should be used for Hover requests.
   /// textDocument.hover.contentEncoding
diff --git a/clang-tools-extra/clangd/test/positionencoding.test b/clang-tools-extra/clangd/test/positionencoding.test
new file mode 100644
index 000000000000..eea7a1a596e9
--- /dev/null
+++ b/clang-tools-extra/clangd/test/positionencoding.test
@@ -0,0 +1,32 @@
+# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s
+# This test verifies that we can negotiate UTF-8 offsets via the positionEncodings capability introduced in LSP 3.17.
+{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{"general":{"positionEncodings":["utf-8","utf-16"]}},"trace":"off"}}
+# CHECK: "positionEncoding": "utf-8"
+---
+{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"/*ö*/int x;\nint y=x;"}}}
+---
+{"jsonrpc":"2.0","id":1,"method":"textDocument/definition","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":1,"character":6}}}
+# /*ö*/int x;
+# 01234567890
+# x is character (and utf-16) range [9,10) but byte range [10,11).
+#      CHECK:  "id": 1,
+# CHECK-NEXT:  "jsonrpc": "2.0",
+# CHECK-NEXT:  "result": [
+# CHECK-NEXT:    {
+# CHECK-NEXT:      "range": {
+# CHECK-NEXT:        "end": {
+# CHECK-NEXT:          "character": 11,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        },
+# CHECK-NEXT:        "start": {
+# CHECK-NEXT:          "character": 10,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        }
+# CHECK-NEXT:      },
+# CHECK-NEXT:      "uri": "file://{{.*}}/main.cpp"
+# CHECK-NEXT:    }
+# CHECK-NEXT:  ]
+---
+{"jsonrpc":"2.0","id":10000,"method":"shutdown"}
+---
+{"jsonrpc":"2.0","method":"exit"}

From 9dd1c66e8ffba73fead13aaf359e290f6e1d4899 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Tue, 17 Jun 2025 18:24:07 +0100
Subject: [PATCH 0680/1322] [VPlan] Expand VPWidenIntOrFpInductionRecipe into
 separate recipes (#118638)

The motivation of this PR is to make #115274 easier to implement, and
should allow us to add EVL support by just passing EVL to the VF
operand.

The current difficulty with widening IVs with EVL is that
VPWidenIntOrFpInductionRecipe generates its own backedge value. Since
it's a VPHeaderPHIRecipe the VF operand must be in the preheader, which
means we can't use the EVL since it's defined in the loop body.

The gist in this PR is to take the approach in #114305 and expand
VPWidenIntOrFpInductionRecipe into several recipes for the initial
value, phi and backedge value just before execution. I.e. this example:

```
  vector.ph:
  Successor(s): vector loop

  <x1> vector loop: {
    vector.body:
      WIDEN-INDUCTION %i = phi %start, %step, %vf
      ...
      EMIT branch-on-count ...
    No successors
  }
```

gets expanded to:

```
vector.ph:
  ...
  vp<%induction.start> = ...
  vp<%induction.increment> = ...

Successor(s): vector loop

<x1> vector loop: {
  vector.body:
    ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
    ...
    vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
    EMIT branch-on-count ...
  No successors
}
```

This allows us to a value defined in the loop in the backedge value, and
also means we can just reuse the existing backedge fixups in
VPlan::execute without having to specially handle it ourselves.

After this #115274 should just become a matter of setting the VF operand
to EVL (and building the increment step in the loop body, not the
preheader).
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  20 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  22 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 154 +---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 159 +++-
 .../Transforms/Vectorize/VPlanTransforms.h    |   5 -
 .../AArch64/clamped-trip-count.ll             |   8 +-
 .../AArch64/conditional-branches-cost.ll      |   4 +-
 .../AArch64/divs-with-scalable-vfs.ll         |   4 +-
 .../AArch64/epilog-iv-select-cmp.ll           |  28 +-
 .../LoopVectorize/AArch64/optsize_minsize.ll  |   8 +-
 .../AArch64/simple_early_exit.ll              |   2 +-
 .../AArch64/sve-inductions-unusual-types.ll   |   8 +-
 .../AArch64/sve-interleaved-accesses.ll       |  22 +-
 .../sve-interleaved-masked-accesses.ll        |  80 +-
 ...eave-to-widen-memory-remove-loop-region.ll |   2 +-
 .../LoopVectorize/ARM/optsize_minsize.ll      |   2 +-
 .../RISCV/interleaved-masked-access.ll        |  60 +-
 .../LoopVectorize/RISCV/mask-index-type.ll    |   2 +-
 ...ruction-or-drop-poison-generating-flags.ll |   2 +-
 .../Transforms/LoopVectorize/RISCV/pr88802.ll |   2 +-
 .../LoopVectorize/RISCV/strided-accesses.ll   |   2 +-
 .../LoopVectorize/RISCV/uniform-load-store.ll |  10 +-
 ...ectorize-force-tail-with-evl-interleave.ll |   6 +-
 .../LoopVectorize/X86/constant-fold.ll        |   4 +-
 .../X86/drop-poison-generating-flags.ll       |  24 +-
 .../LoopVectorize/X86/induction-costs.ll      |   2 +-
 .../LoopVectorize/X86/interleave-cost.ll      |   4 +-
 ...outer_loop_test1_no_explicit_vect_width.ll |   2 +-
 .../LoopVectorize/X86/scatter_crash.ll        |  16 +-
 .../epilog-vectorization-any-of-reductions.ll |   4 +-
 .../LoopVectorize/first-order-recurrence.ll   | 719 +++++++++++++-----
 ...eref-pred-poison-ub-ops-feeding-pointer.ll |  10 +-
 .../LoopVectorize/pointer-induction.ll        | 138 ++++
 .../LoopVectorize/reduction-inloop-pred.ll    |  36 +-
 .../scalable-first-order-recurrence.ll        |   8 +-
 .../LoopVectorize/scalable-inductions.ll      |   8 +-
 .../LoopVectorize/scalable-iv-outside-user.ll |   6 +-
 .../single_early_exit_live_outs.ll            |   2 +-
 .../Transforms/LoopVectorize/uniform-blend.ll |   2 +-
 .../LoopVectorize/vplan-iv-transforms.ll      |  65 ++
 41 files changed, 1069 insertions(+), 597 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f1470fd1f731..f887b34e7642 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2764,8 +2764,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
 
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   // Fix widened non-induction PHIs by setting up the PHI operands.
-  if (EnableVPlanNativePath)
-    fixNonInductionPHIs(State);
+  fixNonInductionPHIs(State);
 
   // After vectorization, the exit blocks of the original loop will have
   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
@@ -7324,7 +7323,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
          "Trying to execute plan with unsupported VF");
   assert(BestVPlan.hasUF(BestUF) &&
          "Trying to execute plan with unsupported UF");
-  VPlanTransforms::runPass(VPlanTransforms::materializeStepVectors, BestVPlan);
   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
   // cost model is complete for better cost estimates.
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index cca3d32c0783..4332332ef5cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1029,17 +1029,11 @@ void VPlan::execute(VPTransformState *State) {
     if (isa<VPWidenPHIRecipe>(&R))
       continue;
 
-    if (isa<VPWidenInductionRecipe>(&R)) {
-      PHINode *Phi = nullptr;
-      if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-        Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
-      } else {
-        auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
-        assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
-               "recipe generating only scalars should have been replaced");
-        auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
-        Phi = cast<PHINode>(GEP->getPointerOperand());
-      }
+    if (auto *WidenPhi = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+      assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+             "recipe generating only scalars should have been replaced");
+      auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+      PHINode *Phi = cast<PHINode>(GEP->getPointerOperand());
 
       Phi->setIncomingBlock(1, VectorLatchBB);
 
@@ -1047,10 +1041,6 @@ void VPlan::execute(VPTransformState *State) {
       // consistent placement of all induction updates.
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
       Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator()));
-
-      // Use the steps for the last part as backedge value for the induction.
-      if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
-        Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
       continue;
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5a3c4a514a5d..f3306ad7cb8e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1951,12 +1951,13 @@ public:
 };
 
 /// A recipe for handling phi nodes of integer and floating-point inductions,
-/// producing their vector values.
+/// producing their vector values. This is an abstract recipe and must be
+/// converted to concrete recipes before executing.
 class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
   TruncInst *Trunc;
 
   // If this recipe is unrolled it will have 2 additional operands.
-  bool isUnrolled() const { return getNumOperands() == 6; }
+  bool isUnrolled() const { return getNumOperands() == 5; }
 
 public:
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
@@ -1992,9 +1993,10 @@ public:
 
   VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
 
-  /// Generate the vectorized and scalarized versions of the phi node as
-  /// needed by their users.
-  void execute(VPTransformState &State) override;
+  void execute(VPTransformState &State) override {
+    llvm_unreachable("cannot execute this recipe, should be expanded via "
+                     "expandVPWidenIntOrFpInductionRecipe");
+  }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -2005,16 +2007,6 @@ public:
   VPValue *getVFValue() { return getOperand(2); }
   const VPValue *getVFValue() const { return getOperand(2); }
 
-  // TODO: Remove once VPWidenIntOrFpInduction is fully expanded in
-  // convertToConcreteRecipes.
-  VPInstructionWithType *getStepVector() {
-    auto *StepVector =
-        cast<VPInstructionWithType>(getOperand(3)->getDefiningRecipe());
-    assert(StepVector->getOpcode() == VPInstruction::StepVector &&
-           "step vector operand must be a VPInstruction::StepVector");
-    return StepVector;
-  }
-
   VPValue *getSplatVFValue() {
     // If the recipe has been unrolled return the VPValue for the induction
     // increment.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 048286d7a97b..1ed0b97849a8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -952,6 +952,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::BranchOnCount:
   case VPInstruction::BranchOnCond:
+  case VPInstruction::Broadcast:
   case VPInstruction::ReductionStartVector:
     return true;
   case VPInstruction::PtrAdd:
@@ -1077,15 +1078,14 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 
 void VPInstructionWithType::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
-  switch (getOpcode()) {
-  case Instruction::ZExt:
-  case Instruction::Trunc: {
+  if (isScalarCast()) {
     Value *Op = State.get(getOperand(0), VPLane(0));
     Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
                                            Op, ResultTy);
     State.set(this, Cast, VPLane(0));
-    break;
+    return;
   }
+  switch (getOpcode()) {
   case VPInstruction::StepVector: {
     Value *StepVector =
         State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
@@ -1965,44 +1965,6 @@ InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
   return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 }
 
-/// This function adds
-/// (0 * Step, 1 * Step, 2 * Step, ...)
-/// to each vector element of Val.
-/// \p Opcode is relevant for FP induction variable.
-/// \p InitVec is an integer step vector from 0 with a step of 1.
-static Value *getStepVector(Value *Val, Value *Step, Value *InitVec,
-                            Instruction::BinaryOps BinOp, ElementCount VF,
-                            IRBuilderBase &Builder) {
-  assert(VF.isVector() && "only vector VFs are supported");
-
-  // Create and check the types.
-  auto *ValVTy = cast<VectorType>(Val->getType());
-  ElementCount VLen = ValVTy->getElementCount();
-
-  Type *STy = Val->getType()->getScalarType();
-  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
-         "Induction Step must be an integer or FP");
-  assert(Step->getType() == STy && "Step has wrong type");
-
-  if (STy->isIntegerTy()) {
-    Step = Builder.CreateVectorSplat(VLen, Step);
-    assert(Step->getType() == Val->getType() && "Invalid step vec");
-    // FIXME: The newly created binary instructions should contain nsw/nuw
-    // flags, which can be found from the original scalar operations.
-    Step = Builder.CreateMul(InitVec, Step);
-    return Builder.CreateAdd(Val, Step, "induction");
-  }
-
-  // Floating point induction.
-  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
-         "Binary Opcode should be specified for FP induction");
-  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
-
-  Step = Builder.CreateVectorSplat(VLen, Step);
-  Value *MulOp = Builder.CreateFMul(InitVec, Step);
-  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
-}
-
 /// A helper function that returns an integer or floating-point constant with
 /// value C.
 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
@@ -2010,104 +1972,6 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
                            : ConstantFP::get(Ty, C);
 }
 
-void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
-  assert(!State.Lane && "Int or FP induction being replicated.");
-
-  Value *Start = getStartValue()->getLiveInIRValue();
-  const InductionDescriptor &ID = getInductionDescriptor();
-  TruncInst *Trunc = getTruncInst();
-  IRBuilderBase &Builder = State.Builder;
-  assert(getPHINode()->getType() == ID.getStartValue()->getType() &&
-         "Types must match");
-  assert(State.VF.isVector() && "must have vector VF");
-
-  // The value from the original loop to which we are mapping the new induction
-  // variable.
-  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode();
-
-  // Fast-math-flags propagate from the original induction instruction.
-  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
-    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
-
-  // Now do the actual transformations, and start with fetching the step value.
-  Value *Step = State.get(getStepValue(), VPLane(0));
-
-  assert((isa<PHINode, TruncInst>(EntryVal)) &&
-         "Expected either an induction phi-node or a truncate of it!");
-
-  // Construct the initial value of the vector IV in the vector loop preheader
-  auto CurrIP = Builder.saveIP();
-  BasicBlock *VectorPH =
-      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
-  Builder.SetInsertPoint(VectorPH->getTerminator());
-  if (isa<TruncInst>(EntryVal)) {
-    assert(Start->getType()->isIntegerTy() &&
-           "Truncation requires an integer type");
-    auto *TruncType = cast<IntegerType>(EntryVal->getType());
-    Step = Builder.CreateTrunc(Step, TruncType);
-    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
-  }
-
-  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
-  Value *SteppedStart =
-      ::getStepVector(SplatStart, Step, State.get(getStepVector()),
-                      ID.getInductionOpcode(), State.VF, State.Builder);
-
-  // We create vector phi nodes for both integer and floating-point induction
-  // variables. Here, we determine the kind of arithmetic we will perform.
-  Instruction::BinaryOps AddOp;
-  Instruction::BinaryOps MulOp;
-  if (Step->getType()->isIntegerTy()) {
-    AddOp = Instruction::Add;
-    MulOp = Instruction::Mul;
-  } else {
-    AddOp = ID.getInductionOpcode();
-    MulOp = Instruction::FMul;
-  }
-
-  Value *SplatVF;
-  if (VPValue *SplatVFOperand = getSplatVFValue()) {
-    // The recipe has been unrolled. In that case, fetch the splat value for the
-    // induction increment.
-    SplatVF = State.get(SplatVFOperand);
-  } else {
-    // Multiply the vectorization factor by the step using integer or
-    // floating-point arithmetic as appropriate.
-    Type *StepType = Step->getType();
-    Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
-    if (Step->getType()->isFloatingPointTy())
-      RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
-    else
-      RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
-    Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
-
-    // Create a vector splat to use in the induction update.
-    SplatVF = Builder.CreateVectorSplat(State.VF, Mul);
-  }
-
-  Builder.restoreIP(CurrIP);
-
-  // We may need to add the step a number of times, depending on the unroll
-  // factor. The last of those goes into the PHI.
-  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
-  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
-  VecInd->setDebugLoc(getDebugLoc());
-  State.set(this, VecInd);
-
-  Instruction *LastInduction = cast<Instruction>(
-      Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
-  LastInduction->setDebugLoc(getDebugLoc());
-
-  VecInd->addIncoming(SteppedStart, VectorPH);
-  // Add induction update using an incorrect block temporarily. The phi node
-  // will be fixed after VPlan execution. Note that at this point the latch
-  // block cannot be used, as it does not exist yet.
-  // TODO: Model increment value in VPlan, by turning the recipe into a
-  // multi-def and a subclass of VPHeaderPHIRecipe.
-  VecInd->addIncoming(LastInduction, VectorPH);
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
@@ -3871,12 +3735,14 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 void VPWidenPHIRecipe::execute(VPTransformState &State) {
-  assert(EnableVPlanNativePath &&
-         "Non-native vplans are not expected to have VPWidenPHIRecipes.");
-
   Value *Op0 = State.get(getOperand(0));
   Type *VecTy = Op0->getType();
-  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
+  Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
+  // Manually move it with the other PHIs in case PHI recipes above this one
+  // also inserted non-phi instructions.
+  // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in
+  // convertToConcreteRecipes.
+  VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt());
   State.set(this, VecPhi);
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 05a0e15f9a19..11f0f2a93032 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1358,17 +1358,6 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
     WideIV->setStartValue(NewStart);
     auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
     WideIV->setStepValue(NewStep);
-    // TODO: Remove once VPWidenIntOrFpInductionRecipe is fully expanded.
-    VPInstructionWithType *OldStepVector = WideIV->getStepVector();
-    assert(OldStepVector->getNumUsers() == 1 &&
-           "step vector should only be used by single "
-           "VPWidenIntOrFpInductionRecipe");
-    auto *NewStepVector =
-        new VPInstructionWithType(VPInstruction::StepVector, {}, NewIVTy, {},
-                                  OldStepVector->getDebugLoc());
-    NewStepVector->insertAfter(OldStepVector->getDefiningRecipe());
-    OldStepVector->replaceAllUsesWith(NewStepVector);
-    OldStepVector->eraseFromParent();
 
     auto *NewBTC = new VPWidenCastRecipe(
         Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
@@ -2518,6 +2507,127 @@ void VPlanTransforms::createInterleaveGroups(
   }
 }
 
+/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
+/// value, phi and backedge value. In the following example:
+///
+///  vector.ph:
+///  Successor(s): vector loop
+///
+///  <x1> vector loop: {
+///    vector.body:
+///      WIDEN-INDUCTION %i = phi %start, %step, %vf
+///      ...
+///      EMIT branch-on-count ...
+///    No successors
+///  }
+///
+/// WIDEN-INDUCTION will get expanded to:
+///
+///  vector.ph:
+///    ...
+///    vp<%induction.start> = ...
+///    vp<%induction.increment> = ...
+///
+///  Successor(s): vector loop
+///
+///  <x1> vector loop: {
+///    vector.body:
+///      ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
+///      ...
+///      vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
+///      EMIT branch-on-count ...
+///    No successors
+///  }
+static void
+expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
+                              VPTypeAnalysis &TypeInfo) {
+  VPlan *Plan = WidenIVR->getParent()->getPlan();
+  VPValue *Start = WidenIVR->getStartValue();
+  VPValue *Step = WidenIVR->getStepValue();
+  VPValue *VF = WidenIVR->getVFValue();
+  DebugLoc DL = WidenIVR->getDebugLoc();
+
+  // The value from the original loop to which we are mapping the new induction
+  // variable.
+  Type *Ty = TypeInfo.inferScalarType(WidenIVR);
+
+  const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
+  Instruction::BinaryOps AddOp;
+  Instruction::BinaryOps MulOp;
+  // FIXME: The newly created binary instructions should contain nsw/nuw
+  // flags, which can be found from the original scalar operations.
+  VPIRFlags Flags;
+  if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
+    AddOp = Instruction::Add;
+    MulOp = Instruction::Mul;
+  } else {
+    AddOp = ID.getInductionOpcode();
+    MulOp = Instruction::FMul;
+    Flags = ID.getInductionBinOp()->getFastMathFlags();
+  }
+
+  // If the phi is truncated, truncate the start and step values.
+  VPBuilder Builder(Plan->getVectorPreheader());
+  Type *StepTy = TypeInfo.inferScalarType(Step);
+  if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
+    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
+    Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
+    Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
+    StepTy = Ty;
+  }
+
+  // Construct the initial value of the vector IV in the vector loop preheader.
+  Type *IVIntTy =
+      IntegerType::get(StepTy->getContext(), StepTy->getScalarSizeInBits());
+  VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
+  if (StepTy->isFloatingPointTy())
+    Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
+
+  VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
+  VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
+
+  Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
+  Init =
+      Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");
+
+  // Create the widened phi of the vector IV.
+  auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
+                                       WidenIVR->getDebugLoc(), "vec.ind");
+  WidePHI->addOperand(Init);
+  WidePHI->insertBefore(WidenIVR);
+
+  // Create the backedge value for the vector IV.
+  VPValue *Inc;
+  VPValue *Prev;
+  // If unrolled, use the increment and prev value from the operands.
+  if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
+    Inc = SplatVF;
+    Prev = WidenIVR->getLastUnrolledPartOperand();
+  } else {
+    // Multiply the vectorization factor by the step using integer or
+    // floating-point arithmetic as appropriate.
+    if (StepTy->isFloatingPointTy())
+      VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
+                                    DL);
+    else
+      VF =
+          Builder.createScalarCast(Instruction::CastOps::Trunc, VF, StepTy, DL);
+
+    Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
+    Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
+    Prev = WidePHI;
+  }
+
+  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
+  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
+  auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
+                                    WidenIVR->getDebugLoc(), "vec.ind.next");
+
+  WidePHI->addOperand(Next);
+
+  WidenIVR->replaceAllUsesWith(WidePHI);
+}
+
 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
   // Replace loop regions with explicity CFG.
   SmallVector<VPRegionBlock *> LoopRegions;
@@ -2625,6 +2735,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
         continue;
       }
 
+      if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+        expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
+        ToRemove.push_back(WidenIVR);
+        continue;
+      }
+
       VPValue *VectorStep;
       VPValue *ScalarStep;
       if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
@@ -2935,27 +3051,6 @@ void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
   }
 }
 
-void VPlanTransforms::materializeStepVectors(VPlan &Plan) {
-  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-    auto *IVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
-    if (!IVR)
-      continue;
-
-    Type *Ty = IVR->getPHINode()->getType();
-    if (TruncInst *Trunc = IVR->getTruncInst())
-      Ty = Trunc->getType();
-    if (Ty->isFloatingPointTy())
-      Ty = IntegerType::get(Ty->getContext(), Ty->getScalarSizeInBits());
-
-    VPBuilder Builder(Plan.getVectorPreheader());
-    VPInstruction *StepVector = Builder.createNaryOp(
-        VPInstruction::StepVector, {}, Ty, {}, IVR->getDebugLoc());
-    assert(IVR->getNumOperands() == 3 &&
-           "can only add step vector before unrolling");
-    IVR->addOperand(StepVector);
-  }
-}
-
 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
   if (Plan.hasScalarVFOnly())
     return;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5a03bdb7c688..7e51c05d1b5b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -209,11 +209,6 @@ struct VPlanTransforms {
   optimizeInductionExitUsers(VPlan &Plan,
                              DenseMap<VPValue *, VPValue *> &EndValues);
 
-  /// Materialize VPInstruction::StepVectors for VPWidenIntOrFpInductionRecipes.
-  /// TODO: Remove once all of VPWidenIntOrFpInductionRecipe is expanded in
-  /// convertToConcreteRecipes.
-  static void materializeStepVectors(VPlan &Plan);
-
   /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
   static void materializeBroadcasts(VPlan &Plan);
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index ae7719757dc3..24c703ae42f0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -16,9 +16,9 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 1, [[TMP6]]
@@ -36,8 +36,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP:%.*]]
@@ -100,9 +100,9 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 1, [[TMP6]]
@@ -120,8 +120,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index f36161703dba..976f95ff4f0b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -862,8 +862,8 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; DEFAULT-NEXT:    store i8 [[TMP33]], ptr [[TMP32]], align 1
 ; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
 ; DEFAULT:       [[PRED_STORE_CONTINUE14]]:
-; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; DEFAULT-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[EXIT:.*]]
@@ -964,8 +964,8 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; PRED-NEXT:    store i8 [[TMP33]], ptr [[TMP32]], align 1
 ; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
 ; PRED:       [[PRED_STORE_CONTINUE14]]:
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; PRED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; PRED-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index 4775a6ec3f91..d59607711b5b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -123,9 +123,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[M]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul <vscale x 2 x i64> [[TMP15]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP17]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 1, [[TMP9]]
@@ -246,9 +246,9 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]])
-; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul <vscale x 2 x i64> [[TMP15]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP17]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 1, [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index 5508a65744c6..895781de31f3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -87,12 +87,12 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
 ; CHECK-NEXT:    [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX16:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL17:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX18:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX18]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV1]]
 ; CHECK-NEXT:    [[L:%.*]] = load i8, ptr [[GEP1]], align 8
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[L]], 3
@@ -184,16 +184,16 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT9]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT10]], zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32
-; CHECK-NEXT:    [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT14:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT13]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT14]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
 ; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT14]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND15:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP14]] = select <4 x i1> [[TMP11]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]]
 ; CHECK-NEXT:    [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4
@@ -207,12 +207,12 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
 ; CHECK-NEXT:    [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]]
 ; CHECK-NEXT:    br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL21:%.*]] = phi i64 [ [[N_VEC8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX22:%.*]] = phi i32 [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL23:%.*]] = phi i64 [ [[N_VEC8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX24:%.*]] = phi i32 [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL21]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX22]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL23]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX24]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[START]], 0
 ; CHECK-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
 ; CHECK-NEXT:    [[RED_NEXT]] = select i1 [[C]], i32 [[IV_TRUNC]], i32 [[RED]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
index 4d3afd71921d..e4718dc21635 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
@@ -394,9 +394,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
 ; DEFAULT-NEXT:    store i8 [[TMP71]], ptr [[TMP70]], align 1
 ; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE36]]
 ; DEFAULT:       [[PRED_STORE_CONTINUE36]]:
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
@@ -514,13 +514,13 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
 ; DEFAULT-NEXT:    [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]]
 ; DEFAULT-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; DEFAULT-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15)
-; DEFAULT-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[A]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[B]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; DEFAULT-NEXT:    [[TMP11:%.*]] = mul <vscale x 16 x i8> [[TMP10]], splat (i8 1)
 ; DEFAULT-NEXT:    [[INDUCTION:%.*]] = add <vscale x 16 x i8> zeroinitializer, [[TMP11]]
 ; DEFAULT-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8
@@ -590,13 +590,13 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
 ; OPTSIZE-NEXT:    [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]]
 ; OPTSIZE-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; OPTSIZE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15)
-; OPTSIZE-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[A]], i64 0
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[B]], i64 0
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; OPTSIZE-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; OPTSIZE-NEXT:    [[TMP11:%.*]] = mul <vscale x 16 x i8> [[TMP10]], splat (i8 1)
 ; OPTSIZE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 16 x i8> zeroinitializer, [[TMP11]]
 ; OPTSIZE-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8
@@ -666,13 +666,13 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
 ; MINSIZE-NEXT:    [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]]
 ; MINSIZE-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; MINSIZE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15)
-; MINSIZE-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; MINSIZE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[A]], i64 0
 ; MINSIZE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; MINSIZE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[B]], i64 0
 ; MINSIZE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; MINSIZE-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
 ; MINSIZE-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; MINSIZE-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; MINSIZE-NEXT:    [[TMP11:%.*]] = mul <vscale x 16 x i8> [[TMP10]], splat (i8 1)
 ; MINSIZE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 16 x i8> zeroinitializer, [[TMP11]]
 ; MINSIZE-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index 74b0c2c0e033..d02d03b4b437 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -120,8 +120,8 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 49584bd47353..f44744071ae5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -20,11 +20,11 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i7> 
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i7>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i7> 
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i7> [[TMP7]], splat (i7 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i7> zeroinitializer, [[TMP9]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -85,11 +85,11 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i3>
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i3>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i3> [[TMP7]], splat (i3 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i3> zeroinitializer, [[TMP9]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 77e713256d24..7e4edf739695 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -101,11 +101,11 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[D:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
@@ -185,11 +185,11 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[D:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
@@ -579,9 +579,9 @@ define void @load_gap_reverse(ptr noalias nocapture readonly %P1, ptr noalias no
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = sub <vscale x 4 x i64> splat (i64 1023), [[TMP2]]
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = sub nsw i64 0, [[TMP1]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[DOTNEG]], i64 0
@@ -809,9 +809,9 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -958,9 +958,9 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1113,13 +1113,13 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl <vscale x 4 x i64> [[TMP10]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
@@ -1191,13 +1191,13 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP11]], 3
-; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP21:%.*]] = shl <vscale x 4 x i64> [[TMP10]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[TMP21]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP7]], 3
@@ -1284,14 +1284,14 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i32 [[TMP11]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], -1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = shl <vscale x 4 x i64> [[TMP14]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw nsw i64 [[TMP9]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP17]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw i32 [[TMP33]], 2
+; CHECK-NEXT:    [[TMP34:%.*]] = add nsw i32 [[TMP16]], -1
+; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP34]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 3567aff0ace4..bd2bd5aa2795 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -37,11 +37,11 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -60,10 +60,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP16]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
@@ -83,11 +83,11 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -108,11 +108,11 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP16]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -182,11 +182,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -201,7 +201,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
 ; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP10]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
@@ -221,11 +221,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -243,7 +243,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP10]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -309,13 +309,13 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV3]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -331,7 +331,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
 ; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP11]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
@@ -352,13 +352,13 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV3]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -378,7 +378,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP12]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -456,11 +456,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -483,10 +483,10 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
@@ -506,11 +506,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -535,11 +535,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index 022789ad9de7..fea57fa8b6b6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -107,8 +107,8 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
 ; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP31]], align 8
 ; VF4-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; VF4:       [[PRED_STORE_CONTINUE6]]:
-; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; VF4-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF4:       [[MIDDLE_BLOCK]]:
 ; VF4-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
index a7a7b1af5953..1d898fbaaed3 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
@@ -390,9 +390,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
 ; DEFAULT-NEXT:    store i8 [[TMP71]], ptr [[TMP70]], align 1
 ; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE36]]
 ; DEFAULT:       [[PRED_STORE_CONTINUE36]]:
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index b7c9612e57ae..79425ae3a67e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -21,11 +21,11 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_EPILOGUE:       vector.body:
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -38,8 +38,8 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1)
 ; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP12]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
 ; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
 ; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP7]])
@@ -48,7 +48,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP18]]
 ; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
@@ -69,11 +69,11 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -88,8 +88,8 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
@@ -98,7 +98,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -190,11 +190,11 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_EPILOGUE:       vector.body:
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -209,16 +209,16 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
 ; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP14]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
 ; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP16]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
 ; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP18]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
 ; SCALAR_EPILOGUE-NEXT:    [[TMP21:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP20]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
 ; SCALAR_EPILOGUE-NEXT:    [[TMP23:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP22]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP24:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP24]]
@@ -233,7 +233,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP30]]
 ; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP23]], <vscale x 16 x ptr> [[TMP31]], i32 1, <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
@@ -254,11 +254,11 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -275,16 +275,16 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
@@ -299,7 +299,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
index c64f6df075a0..3e4d337c0706 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
@@ -22,9 +22,9 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {
 ; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; VLENUNK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; VLENUNK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
 ; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; VLENUNK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; VLENUNK-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 1)
 ; VLENUNK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
 ; VLENUNK-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP5]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
index 88d9ed2ce201..2f9ff20bf0f9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
@@ -20,13 +20,13 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1001, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[B]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP6]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP5]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index 3dc17e615048..51a8b451dffd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -137,8 +137,8 @@ define void @test(ptr %p, i64 %a, i8 %b) {
 ; CHECK-NEXT:    store i8 [[TMP40]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue32:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[EXIT1:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index f89a863d1e5f..79590f5060ad 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -553,9 +553,9 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; STRIDED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; STRIDED-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
 ; STRIDED-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; STRIDED-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; STRIDED-NEXT:    [[TMP14:%.*]] = mul <vscale x 4 x i64> [[TMP12]], splat (i64 1)
 ; STRIDED-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
 ; STRIDED-NEXT:    [[TMP17:%.*]] = mul i64 1, [[TMP11]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 70c04ded5cf5..827612cfe36d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -325,9 +325,9 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 1)
 ; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
 ; SCALABLE-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
@@ -432,9 +432,9 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], splat (i64 1)
 ; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP4]]
@@ -996,11 +996,11 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1)
 ; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP4]]
@@ -1127,11 +1127,11 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 1)
 ; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
 ; SCALABLE-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
@@ -1233,11 +1233,11 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1)
 ; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index 427123cfca6d..cd246053bcb3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -29,17 +29,17 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
-; IF-EVL-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 1)
 ; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 7d9ed7d6215c..05a495d51c45 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -110,8 +110,8 @@ define void @redundant_or_1(ptr %dst, i1 %c.0, i1 %c.1) {
 ; CHECK-NEXT:    store i32 0, ptr [[TMP17]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
@@ -214,8 +214,8 @@ define void @redundant_or_2(ptr %dst, i1 %c.0, i1 %c.1) {
 ; CHECK-NEXT:    store i32 0, ptr [[TMP16]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index 38b58fbfd102..53fd2ed43972 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -39,8 +39,8 @@ define void @drop_scalar_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -91,8 +91,8 @@ define void @drop_scalar_gep_nusw(ptr noalias nocapture readonly %input, ptr %ou
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr nusw float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr nusw float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -143,8 +143,8 @@ define void @drop_scalar_gep_nuw(ptr noalias nocapture readonly %input, ptr %out
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr nuw float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr nuw float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -196,8 +196,8 @@ define void @drop_nonpred_scalar_nuw_nsw(ptr noalias nocapture readonly %input,
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -248,8 +248,8 @@ define void @preserve_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -305,8 +305,8 @@ define void @drop_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -356,8 +356,8 @@ define void @preserve_nuw_nsw_no_addr(ptr %output) local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -409,8 +409,8 @@ define void @drop_scalar_exact(ptr noalias nocapture readonly %input, ptr %outpu
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -518,8 +518,8 @@ define void @preserve_vector_exact_no_addr(ptr noalias nocapture readonly %input
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -570,8 +570,8 @@ define void @preserve_exact_no_addr(ptr %output) local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -698,8 +698,8 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
 ; CHECK-NEXT:    store <4 x i8> [[PREDPHI]], ptr [[TMP16]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -778,8 +778,8 @@ define void @recipe_without_underlying_instr_lanes_used(i64 %n, ptr noalias %dst
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    store <4 x i8> [[PREDPHI]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 7aeb32afe43b..0a85548f8750 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -411,9 +411,9 @@ define i16 @iv_and_step_trunc() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i16>
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i16> [[VEC_IND1]], [[TMP1]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2)
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index 4fbee321b6a4..7f2544ddf149 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -519,8 +519,8 @@ define void @interleave_store_double_i64(ptr %dst) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -640,8 +640,8 @@ define void @interleave_store_i64_double_2(ptr %dst) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
index 6480c0ab1099..02d48cbda1aa 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
@@ -71,8 +71,8 @@
 ; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
 
 ; AVX: [[ForInc]]:
-; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], splat (i64 8)
 ; AVX: %[[IndNext]] = add nuw i64 %[[Ind]], 8
+; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], splat (i64 8)
 ; AVX: br i1 true, label %middle.block, label %vector.body
 
 @arr2 = external global [8 x i32], align 16
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 4038ace617c1..99650592d2de 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -131,7 +131,7 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[IND_END43:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY29:%.*]]
-; CHECK:       vector.body28:
+; CHECK:       vector.body30:
 ; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH25]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ]
 ; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH25]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY29]] ]
 ; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH25]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY29]] ]
@@ -153,18 +153,18 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32)
 ; CHECK-NEXT:    [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK35:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block35:
+; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK37:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block37:
 ; CHECK-NEXT:    [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK43:%.*]]
-; CHECK:       vec.epilog.iter.check42:
+; CHECK:       vec.epilog.iter.check44:
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[IND_END55:%.*]] = add i64 8, [[TMP42]]
 ; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH40]], label [[VEC_EPILOG_PH42]]
-; CHECK:       vec.epilog.ph41:
+; CHECK:       vec.epilog.ph43:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
@@ -183,7 +183,7 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY49:%.*]]
-; CHECK:       vec.epilog.vector.body49:
+; CHECK:       vec.epilog.vector.body57:
 ; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
 ; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
 ; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
@@ -206,10 +206,10 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16)
 ; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]]
 ; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY49]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       vec.epilog.middle.block62:
+; CHECK:       vec.epilog.middle.block64:
 ; CHECK-NEXT:    [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
 ; CHECK-NEXT:    br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH40]]
-; CHECK:       vec.epilog.scalar.ph40:
+; CHECK:       vec.epilog.scalar.ph42:
 ; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[ITER_CHECK22]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[ITER_CHECK22]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index 1365e9f73d85..6e62ff842c6d 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -245,8 +245,8 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[IND_END5:%.*]] = trunc i64 [[N_VEC3]] to i32
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT6]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 9be26d4247a3..7684d274a75c 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -870,7 +870,7 @@ for.end:
 ; }
 ;
 ;
-define i32 @PR27246(ptr %dst) {
+define i32 @PR27246() {
 ; UNROLL-NO-IC-LABEL: @PR27246(
 ; UNROLL-NO-IC-NEXT:  entry:
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
@@ -882,8 +882,7 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
@@ -892,13 +891,10 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 -4)
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
@@ -906,21 +902,19 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND2:%.*]]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup:
 ; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; UNROLL-NO-IC-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; UNROLL-NO-IC:       for.cond1:
-; UNROLL-NO-IC-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
+; UNROLL-NO-IC-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; UNROLL-NO-IC-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; UNROLL-NO-IC-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
-; UNROLL-NO-IC-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL-NO-IC:       for.cond.cleanup3:
-; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; UNROLL-NO-IC-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -936,38 +930,33 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]]
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP2]], ptr [[TMP1]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND2:%.*]]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup:
 ; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; UNROLL-NO-VF:       for.cond1:
-; UNROLL-NO-VF-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
+; UNROLL-NO-VF-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; UNROLL-NO-VF-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-VF-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; UNROLL-NO-VF-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; UNROLL-NO-VF-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL-NO-VF:       for.cond.cleanup3:
-; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; UNROLL-NO-VF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; UNROLL-NO-VF-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -983,24 +972,18 @@ define i32 @PR27246(ptr %dst) {
 ; SINK-AFTER:       vector.ph:
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; SINK-AFTER-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
-; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
+; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[FOR_COND1]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND1]] ]
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; SINK-AFTER-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SINK-AFTER-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
+; SINK-AFTER-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SINK-AFTER-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
@@ -1008,21 +991,19 @@ define i32 @PR27246(ptr %dst) {
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; SINK-AFTER-NEXT:    br label [[FOR_COND2:%.*]]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
 ; SINK-AFTER:       for.cond.cleanup:
 ; SINK-AFTER-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; SINK-AFTER-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; SINK-AFTER:       for.cond1:
-; SINK-AFTER-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
+; SINK-AFTER-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; SINK-AFTER-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; SINK-AFTER-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; SINK-AFTER-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; SINK-AFTER-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
-; SINK-AFTER-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; SINK-AFTER-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; SINK-AFTER:       for.cond.cleanup3:
-; SINK-AFTER-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
 ; SINK-AFTER-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; SINK-AFTER-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; SINK-AFTER-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -1042,10 +1023,8 @@ for.cond.cleanup:
 for.cond1:
   %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
   %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
-  %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %i.016
   %cmp2 = icmp sgt i32 %k.0, 1
   %dec = add nsw i32 %k.0, -1
-  store i32 %e.1, ptr %gep.dst
   br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
 
 for.cond.cleanup3:
@@ -1072,22 +1051,22 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 8
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 12
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 14
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1169,22 +1148,22 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 2
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 4
-; SINK-AFTER-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 6
+; SINK-AFTER-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2
+; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 6
+; SINK-AFTER-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
 ; SINK-AFTER-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; SINK-AFTER-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; SINK-AFTER-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2
-; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
+; SINK-AFTER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
+; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; SINK-AFTER-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; SINK-AFTER-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
-; SINK-AFTER-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4
-; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4
-; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4
+; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
+; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1372,27 +1351,27 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i32 [[VECTOR_RECUR]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], [[X]]
 ; UNROLL-NO-VF-NEXT:    [[TMP3]] = add nuw i32 [[VECTOR_RECUR]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 96
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 96
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT1:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
+; UNROLL-NO-VF-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADDX:%.*]], [[FOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VAL_PHI1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT1]], [[SCALAR_PH]] ], [ [[ADDX1:%.*]], [[FOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[ADDX]] = add i32 [[VAL_PHI]], 1
-; UNROLL-NO-VF-NEXT:    [[BC:%.*]] = zext i32 [[VAL_PHI]] to i64
-; UNROLL-NO-VF-NEXT:    [[ADDX1]] = add i32 [[VAL_PHI]], [[X]]
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[VAL_PHI]], 95
+; UNROLL-NO-VF-NEXT:    [[INC]] = add i32 [[INC_PHI]], 1
+; UNROLL-NO-VF-NEXT:    [[BC:%.*]] = zext i32 [[INC_PHI]] to i64
+; UNROLL-NO-VF-NEXT:    [[ADDX]] = add i32 [[INC_PHI]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI1]], [[FOR_BODY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[VAL_PHI_LCSSA]]
 ;
 ; SINK-AFTER-LABEL: @extract_second_last_iteration(
@@ -2473,7 +2452,178 @@ for.end12.loopexit:                               ; preds = %cond.end
   ret void
 }
 
-define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
+; Dead instructions, like the exit condition are not part of the actual VPlan
+; and do not need to be sunk. PR44634.
+define void @sink_dead_inst(ptr %a) {
+; UNROLL-NO-IC-LABEL: @sink_dead_inst(
+; UNROLL-NO-IC-NEXT:  entry:
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC:       vector.ph:
+; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP1]], splat (i16 5)
+; UNROLL-NO-IC-NEXT:    [[TMP5]] = add <4 x i16> [[TMP2]], splat (i16 5)
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = sub <4 x i16> [[TMP6]], splat (i16 10)
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = sub <4 x i16> [[TMP7]], splat (i16 10)
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[TMP10]], i32 4
+; UNROLL-NO-IC-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP11]], align 2
+; UNROLL-NO-IC-NEXT:    store <4 x i16> [[TMP9]], ptr [[TMP12]], align 2
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NO-IC:       middle.block:
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC:       scalar.ph:
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND:%.*]]
+; UNROLL-NO-IC:       for.cond:
+; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
+; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-IC-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
+; UNROLL-NO-IC-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; UNROLL-NO-IC-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
+; UNROLL-NO-IC-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
+; UNROLL-NO-IC-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
+; UNROLL-NO-IC:       for.end:
+; UNROLL-NO-IC-NEXT:    ret void
+;
+; UNROLL-NO-VF-LABEL: @sink_dead_inst(
+; UNROLL-NO-VF-NEXT:  entry:
+; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF:       vector.ph:
+; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-VF:       vector.body:
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[TMP2]], 5
+; UNROLL-NO-VF-NEXT:    [[TMP6]] = add i16 [[TMP3]], 5
+; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sub i16 [[VECTOR_RECUR]], 10
+; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = sub i16 [[TMP5]], 10
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[A]], i16 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    store i16 [[TMP7]], ptr [[TMP9]], align 2
+; UNROLL-NO-VF-NEXT:    store i16 [[TMP8]], ptr [[TMP10]], align 2
+; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NO-VF:       middle.block:
+; UNROLL-NO-VF-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF:       scalar.ph:
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 15, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT2:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND:%.*]]
+; UNROLL-NO-VF:       for.cond:
+; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT2]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
+; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-VF-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
+; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; UNROLL-NO-VF-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
+; UNROLL-NO-VF-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
+; UNROLL-NO-VF:       for.end:
+; UNROLL-NO-VF-NEXT:    ret void
+;
+; SINK-AFTER-LABEL: @sink_dead_inst(
+; SINK-AFTER-NEXT:  entry:
+; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER:       vector.ph:
+; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SINK-AFTER:       vector.body:
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP3]] = add <4 x i16> [[TMP1]], splat (i16 5)
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    [[TMP5:%.*]] = sub <4 x i16> [[TMP4]], splat (i16 10)
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
+; SINK-AFTER-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0
+; SINK-AFTER-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; SINK-AFTER-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; SINK-AFTER-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; SINK-AFTER:       middle.block:
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; SINK-AFTER-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER:       scalar.ph:
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
+; SINK-AFTER-NEXT:    br label [[FOR_COND:%.*]]
+; SINK-AFTER:       for.cond:
+; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
+; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; SINK-AFTER-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
+; SINK-AFTER-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; SINK-AFTER-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
+; SINK-AFTER-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
+; SINK-AFTER-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
+; SINK-AFTER:       for.end:
+; SINK-AFTER-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
+  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
+  %rec.2 = phi i32 [ -27, %entry ], [ %rec.2.prev, %for.cond ]
+  %use.rec.1 = sub i16 %rec.1, 10
+  %cmp = icmp eq i32 %rec.2, 15
+  %iv.next = add i16 %iv, 1
+  %rec.2.prev = zext i16 %iv.next to i32
+  %rec.1.prev = add i16 %iv.next, 5
+  %gep = getelementptr i16, ptr %a, i16 %iv
+  store i16 %use.rec.1, ptr %gep
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.end:
+  ret void
+}
+
+define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-LABEL: @sink_into_replication_region(
 ; UNROLL-NO-IC-NEXT:  bb:
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i32 [[Y:%.*]], 1
@@ -2564,74 +2714,18 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-IC:       pred.udiv.continue18:
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP37]], [[PRED_UDIV_IF17]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE21:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20]]
 ; UNROLL-NO-IC:       pred.udiv.if19:
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE21]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE20]]
 ; UNROLL-NO-IC:       pred.udiv.continue20:
 ; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP42]], [[PRED_UDIV_IF19]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
 ; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI1]], [[TMP45]]
-; UNROLL-NO-IC-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP64]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; UNROLL-NO-IC:       pred.store.if:
-; UNROLL-NO-IC-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[TMP44]], i32 0
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP65]], ptr [[DST:%.*]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; UNROLL-NO-IC:       pred.store.continue:
-; UNROLL-NO-IC-NEXT:    [[TMP66:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
-; UNROLL-NO-IC:       pred.store.if21:
-; UNROLL-NO-IC-NEXT:    [[TMP67:%.*]] = extractelement <4 x i32> [[TMP44]], i32 1
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP67]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE22]]
-; UNROLL-NO-IC:       pred.store.continue22:
-; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP52]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; UNROLL-NO-IC:       pred.store.if23:
-; UNROLL-NO-IC-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP44]], i32 2
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP53]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; UNROLL-NO-IC:       pred.store.continue24:
-; UNROLL-NO-IC-NEXT:    [[TMP54:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; UNROLL-NO-IC:       pred.store.if25:
-; UNROLL-NO-IC-NEXT:    [[TMP55:%.*]] = extractelement <4 x i32> [[TMP44]], i32 3
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP55]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; UNROLL-NO-IC:       pred.store.continue26:
-; UNROLL-NO-IC-NEXT:    [[TMP56:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP56]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
-; UNROLL-NO-IC:       pred.store.if27:
-; UNROLL-NO-IC-NEXT:    [[TMP57:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP57]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE28]]
-; UNROLL-NO-IC:       pred.store.continue28:
-; UNROLL-NO-IC-NEXT:    [[TMP58:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP58]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
-; UNROLL-NO-IC:       pred.store.if29:
-; UNROLL-NO-IC-NEXT:    [[TMP59:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP59]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE30]]
-; UNROLL-NO-IC:       pred.store.continue30:
-; UNROLL-NO-IC-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
-; UNROLL-NO-IC:       pred.store.if31:
-; UNROLL-NO-IC-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP61]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE32]]
-; UNROLL-NO-IC:       pred.store.continue32:
-; UNROLL-NO-IC-NEXT:    [[TMP62:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP62]], label [[PRED_STORE_IF33:%.*]], label [[PRED_UDIV_CONTINUE20]]
-; UNROLL-NO-IC:       pred.store.if33:
-; UNROLL-NO-IC-NEXT:    [[TMP63:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP63]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE20]]
-; UNROLL-NO-IC:       pred.store.continue34:
 ; UNROLL-NO-IC-NEXT:    [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -2656,7 +2750,6 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-IC-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; UNROLL-NO-IC-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-IC-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
-; UNROLL-NO-IC-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; UNROLL-NO-IC-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2688,25 +2781,15 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.udiv.continue:
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]]
 ; UNROLL-NO-VF:       pred.udiv.if3:
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = udiv i32 219220132, [[TMP7]]
-; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
+; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
 ; UNROLL-NO-VF:       pred.udiv.continue4:
 ; UNROLL-NO-VF-NEXT:    [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]]
 ; UNROLL-NO-VF-NEXT:    [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP6]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; UNROLL-NO-VF:       pred.store.if:
-; UNROLL-NO-VF-NEXT:    store i32 [[VECTOR_RECUR]], ptr [[DST:%.*]], align 4
-; UNROLL-NO-VF-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; UNROLL-NO-VF:       pred.store.continue:
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF5:%.*]], label [[PRED_UDIV_CONTINUE4]]
-; UNROLL-NO-VF:       pred.store.if5:
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP6]], ptr [[DST]], align 4
-; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
-; UNROLL-NO-VF:       pred.store.continue6:
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP10]], i32 [[VEC_PHI]]
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2730,7 +2813,6 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-VF-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; UNROLL-NO-VF-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-VF-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
-; UNROLL-NO-VF-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; UNROLL-NO-VF-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2785,44 +2867,16 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; SINK-AFTER:       pred.udiv.continue6:
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP16]], [[PRED_UDIV_IF5]] ]
 ; SINK-AFTER-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
+; SINK-AFTER-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8]]
 ; SINK-AFTER:       pred.udiv.if7:
 ; SINK-AFTER-NEXT:    [[TMP19:%.*]] = add i32 [[OFFSET_IDX]], -3
 ; SINK-AFTER-NEXT:    [[TMP20:%.*]] = udiv i32 219220132, [[TMP19]]
 ; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i32 3
-; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
+; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; SINK-AFTER:       pred.udiv.continue8:
 ; SINK-AFTER-NEXT:    [[TMP22]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ]
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]]
-; SINK-AFTER-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; SINK-AFTER-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; SINK-AFTER:       pred.store.if:
-; SINK-AFTER-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
-; SINK-AFTER-NEXT:    store i32 [[TMP34]], ptr [[DST:%.*]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; SINK-AFTER:       pred.store.continue:
-; SINK-AFTER-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; SINK-AFTER-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; SINK-AFTER:       pred.store.if9:
-; SINK-AFTER-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
-; SINK-AFTER-NEXT:    store i32 [[TMP28]], ptr [[DST]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; SINK-AFTER:       pred.store.continue10:
-; SINK-AFTER-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; SINK-AFTER-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
-; SINK-AFTER:       pred.store.if11:
-; SINK-AFTER-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
-; SINK-AFTER-NEXT:    store i32 [[TMP30]], ptr [[DST]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; SINK-AFTER:       pred.store.continue12:
-; SINK-AFTER-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF13:%.*]], label [[PRED_UDIV_CONTINUE8]]
-; SINK-AFTER:       pred.store.if13:
-; SINK-AFTER-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
-; SINK-AFTER-NEXT:    store i32 [[TMP32]], ptr [[DST]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
-; SINK-AFTER:       pred.store.continue14:
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2845,7 +2899,6 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; SINK-AFTER-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; SINK-AFTER-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; SINK-AFTER-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
-; SINK-AFTER-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; SINK-AFTER-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; SINK-AFTER-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2863,7 +2916,6 @@ bb:
   %var6 = add i32 %var5, %var4
   %var7 = udiv i32 219220132, %var3
   %var8 = add nsw i32 %var3, -1
-  store i32 %var4, ptr %dst
   %var9 = icmp slt i32 %var3, 2
   br i1 %var9, label %bb1, label %bb2, !prof !2
 }
@@ -3293,6 +3345,287 @@ bb:
   br i1 %var9, label %bb1, label %bb2, !prof !2
 }
 
+; %vec.dead will be marked as dead instruction in the vector loop and no recipe
+; will be created for it. Make sure a valid sink target is used.
+define i32 @sink_after_dead_inst(ptr %A.ptr) {
+; UNROLL-NO-IC-LABEL: @sink_after_dead_inst(
+; UNROLL-NO-IC-NEXT:  entry:
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC:       vector.ph:
+; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 4
+; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP6]], align 4
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; UNROLL-NO-IC:       middle.block:
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC:       scalar.ph:
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
+; UNROLL-NO-IC:       loop:
+; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; UNROLL-NO-IC-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
+; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
+; UNROLL-NO-IC-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
+; UNROLL-NO-IC-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
+; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-IC-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
+; UNROLL-NO-IC-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
+; UNROLL-NO-IC-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
+; UNROLL-NO-IC-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
+; UNROLL-NO-IC-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
+; UNROLL-NO-IC-NEXT:    store i32 0, ptr [[A_GEP]], align 4
+; UNROLL-NO-IC-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
+; UNROLL-NO-IC:       for.end:
+; UNROLL-NO-IC-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    ret i32 [[FOR_LCSSA]]
+;
+; UNROLL-NO-VF-LABEL: @sink_after_dead_inst(
+; UNROLL-NO-VF-NEXT:  entry:
+; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF:       vector.ph:
+; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-VF:       vector.body:
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]]
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]]
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP5]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP8]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP9]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP7]] = add nuw i32 [[VECTOR_RECUR]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP7]], 16
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; UNROLL-NO-VF:       middle.block:
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF:       scalar.ph:
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    br label [[LOOP:%.*]]
+; UNROLL-NO-VF:       loop:
+; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; UNROLL-NO-VF-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
+; UNROLL-NO-VF-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
+; UNROLL-NO-VF-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
+; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-VF-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
+; UNROLL-NO-VF-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
+; UNROLL-NO-VF-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
+; UNROLL-NO-VF-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
+; UNROLL-NO-VF-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
+; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[A_GEP]], align 4
+; UNROLL-NO-VF-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
+; UNROLL-NO-VF:       for.end:
+; UNROLL-NO-VF-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    ret i32 [[FOR_LCSSA]]
+;
+; SINK-AFTER-LABEL: @sink_after_dead_inst(
+; SINK-AFTER-NEXT:  entry:
+; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER:       vector.ph:
+; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SINK-AFTER:       vector.body:
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
+; SINK-AFTER-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
+; SINK-AFTER-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; SINK-AFTER-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; SINK-AFTER-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; SINK-AFTER:       middle.block:
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SINK-AFTER-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER:       scalar.ph:
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT:    br label [[LOOP:%.*]]
+; SINK-AFTER:       loop:
+; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; SINK-AFTER-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
+; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
+; SINK-AFTER-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
+; SINK-AFTER-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
+; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; SINK-AFTER-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
+; SINK-AFTER-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
+; SINK-AFTER-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
+; SINK-AFTER-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
+; SINK-AFTER-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
+; SINK-AFTER-NEXT:    store i32 0, ptr [[A_GEP]], align 4
+; SINK-AFTER-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
+; SINK-AFTER:       for.end:
+; SINK-AFTER-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT:    ret i32 [[FOR_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
+  %for = phi i32 [ 0, %entry ], [ %for.prev, %loop ]
+  %cmp = icmp eq i32 %for, 15
+  %C = icmp eq i1 %cmp, true
+  %vec.dead = and i1 %C, 1
+  %iv.next = add i16 %iv, 1
+  %B1 = or i16 %iv.next, %iv.next
+  %B3 = and i1 %cmp, %C
+  %for.prev = zext i16 %B1 to i32
+
+  %ext = zext i1 %B3 to i32
+  %A.gep = getelementptr i32, ptr %A.ptr, i16 %iv
+  store i32 0, ptr %A.gep
+  br i1 %vec.dead, label %for.end, label %loop
+
+for.end:
+  ret i32 %for
+}
+
+; %rec.1 only has %use.rec.1 as use, which can be removed. This enables %rec.1
+; to be removed also.
+define void @unused_recurrence(ptr %a) {
+; UNROLL-NO-IC-LABEL: @unused_recurrence(
+; UNROLL-NO-IC-NEXT:  entry:
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC:       vector.ph:
+; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; UNROLL-NO-IC:       middle.block:
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC:       scalar.ph:
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND:%.*]]
+; UNROLL-NO-IC:       for.cond:
+; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-IC-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; UNROLL-NO-IC-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NO-IC:       for.end:
+; UNROLL-NO-IC-NEXT:    ret void
+;
+; UNROLL-NO-VF-LABEL: @unused_recurrence(
+; UNROLL-NO-VF-NEXT:  entry:
+; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF:       vector.ph:
+; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-VF:       vector.body:
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 5
+; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; UNROLL-NO-VF:       middle.block:
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF:       scalar.ph:
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND:%.*]]
+; UNROLL-NO-VF:       for.cond:
+; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NO-VF:       for.end:
+; UNROLL-NO-VF-NEXT:    ret void
+;
+; SINK-AFTER-LABEL: @unused_recurrence(
+; SINK-AFTER-NEXT:  entry:
+; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER:       vector.ph:
+; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SINK-AFTER:       vector.body:
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
+; SINK-AFTER-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; SINK-AFTER:       middle.block:
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SINK-AFTER-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER:       scalar.ph:
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT:    br label [[FOR_COND:%.*]]
+; SINK-AFTER:       for.cond:
+; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; SINK-AFTER-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; SINK-AFTER-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
+; SINK-AFTER:       for.end:
+; SINK-AFTER-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
+  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
+  %use.rec.1 = sub i16 %rec.1, 10
+  %iv.next= add i16 %iv, 1
+  %rec.1.prev = add i16 %iv.next, 5
+  %cmp = icmp eq i16 %iv, 1000
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.end:
+  ret void
+}
+
 ; Test case for https://github.com/llvm/llvm-project/issues/95520.
 define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-LABEL: @recurence_uniform_load(
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
index a622193290c8..e27734755dfb 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
@@ -59,8 +59,8 @@ define void @ptr_depends_on_sdiv(ptr noalias %dst, i16 noundef %off) {
 ; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK:       [[PRED_STORE_CONTINUE4]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -132,8 +132,8 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) {
 ; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -203,8 +203,8 @@ define void @ptr_doesnt_depend_on_poison_or_ub(ptr noalias %dst, i16 noundef %of
 ; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -276,8 +276,8 @@ define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) {
 ; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP13]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -350,8 +350,8 @@ define void @ptr_depends_on_noundef_load(ptr noalias %dst) {
 ; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP13]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index d973e451d887..a4f2b077cb06 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -282,3 +282,141 @@ for.cond:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.cond
   ret void
 }
+
+; Test that when WidenPointerInductionRecipes are ordered before other
+; WidenIntOrFpInductionRecipes that their PHIs are emitted in the right place.
+define void @outside_lattice(ptr noalias %p, ptr noalias %q, i32 %n) {
+; DEFAULT-LABEL: @outside_lattice(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = zext i32 [[N:%.*]] to i64
+; DEFAULT-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1)
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 4
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT:       vector.scevcheck:
+; DEFAULT-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; DEFAULT-NEXT:    [[TMP1:%.*]] = add i32 [[UMAX]], -1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0
+; DEFAULT-NEXT:    br i1 [[TMP2]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[OFFSET_IDX]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
+; DEFAULT-NEXT:    store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8
+; DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i32 [[OFFSET_IDX]]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; DEFAULT-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; DEFAULT-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
+; DEFAULT-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ], [ [[P]], [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       for.body:
+; DEFAULT-NEXT:    [[IV_PTR:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[IV_INT:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[P_GEP:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[IV_INT]]
+; DEFAULT-NEXT:    store ptr [[IV_PTR]], ptr [[P_GEP]], align 8
+; DEFAULT-NEXT:    [[Q_GEP:%.*]] = getelementptr inbounds i32, ptr [[Q]], i32 [[IV_INT]]
+; DEFAULT-NEXT:    store i32 [[IV_INT]], ptr [[Q_GEP]], align 4
+; DEFAULT-NEXT:    [[IV_INT_NEXT]] = add i32 [[IV_INT]], 1
+; DEFAULT-NEXT:    [[IV_PTR_NEXT]] = getelementptr inbounds i32, ptr [[IV_PTR]], i32 1
+; DEFAULT-NEXT:    [[DONE:%.*]] = icmp ult i32 [[IV_INT_NEXT]], [[N]]
+; DEFAULT-NEXT:    br i1 [[DONE]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT:       for.end:
+; DEFAULT-NEXT:    ret void
+;
+; STRIDED-LABEL: @outside_lattice(
+; STRIDED-NEXT:  entry:
+; STRIDED-NEXT:    [[TMP0:%.*]] = zext i32 [[N:%.*]] to i64
+; STRIDED-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1)
+; STRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 4
+; STRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; STRIDED:       vector.scevcheck:
+; STRIDED-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; STRIDED-NEXT:    [[TMP1:%.*]] = add i32 [[UMAX]], -1
+; STRIDED-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0
+; STRIDED-NEXT:    br i1 [[TMP2]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; STRIDED:       vector.ph:
+; STRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4
+; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; STRIDED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP3]]
+; STRIDED-NEXT:    [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32
+; STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; STRIDED:       vector.body:
+; STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; STRIDED-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; STRIDED-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[OFFSET_IDX]]
+; STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
+; STRIDED-NEXT:    store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8
+; STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i32 [[OFFSET_IDX]]
+; STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; STRIDED-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; STRIDED-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
+; STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; STRIDED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; STRIDED:       middle.block:
+; STRIDED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; STRIDED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; STRIDED:       scalar.ph:
+; STRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ], [ [[P]], [[VECTOR_SCEVCHECK]] ]
+; STRIDED-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; STRIDED-NEXT:    br label [[FOR_BODY:%.*]]
+; STRIDED:       for.body:
+; STRIDED-NEXT:    [[IV_PTR:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[FOR_BODY]] ]
+; STRIDED-NEXT:    [[IV_INT:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[FOR_BODY]] ]
+; STRIDED-NEXT:    [[P_GEP:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[IV_INT]]
+; STRIDED-NEXT:    store ptr [[IV_PTR]], ptr [[P_GEP]], align 8
+; STRIDED-NEXT:    [[Q_GEP:%.*]] = getelementptr inbounds i32, ptr [[Q]], i32 [[IV_INT]]
+; STRIDED-NEXT:    store i32 [[IV_INT]], ptr [[Q_GEP]], align 4
+; STRIDED-NEXT:    [[IV_INT_NEXT]] = add i32 [[IV_INT]], 1
+; STRIDED-NEXT:    [[IV_PTR_NEXT]] = getelementptr inbounds i32, ptr [[IV_PTR]], i32 1
+; STRIDED-NEXT:    [[DONE:%.*]] = icmp ult i32 [[IV_INT_NEXT]], [[N]]
+; STRIDED-NEXT:    br i1 [[DONE]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]]
+; STRIDED:       for.end:
+; STRIDED-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv.ptr = phi ptr [ %p, %entry ], [ %iv.ptr.next, %for.body ]
+  %iv.int = phi i32 [ 0, %entry ], [ %iv.int.next, %for.body ]
+
+  %p.gep = getelementptr inbounds ptr, ptr %p, i32 %iv.int
+  store ptr %iv.ptr, ptr %p.gep
+
+  %q.gep = getelementptr inbounds i32, ptr %q, i32 %iv.int
+  store i32 %iv.int, ptr %q.gep
+
+  %iv.int.next = add i32 %iv.int, 1
+  %iv.ptr.next = getelementptr inbounds i32, ptr %iv.ptr, i32 1
+
+  %done = icmp ult i32 %iv.int.next, %n
+  br i1 %done, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index d497f0c22dbb..fbe3a7a470e8 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -116,7 +116,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
+; CHECK:       pred.load.if2:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
@@ -125,12 +125,12 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
+; CHECK:       pred.load.continue3:
 ; CHECK-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
+; CHECK:       pred.load.if4:
 ; CHECK-NEXT:    [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
@@ -139,12 +139,12 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
+; CHECK:       pred.load.continue5:
 ; CHECK-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.if7:
+; CHECK:       pred.load.if6:
 ; CHECK-NEXT:    [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
@@ -153,7 +153,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
+; CHECK:       pred.load.continue7:
 ; CHECK-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer
@@ -321,7 +321,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
+; CHECK:       pred.load.if2:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
@@ -330,12 +330,12 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
+; CHECK:       pred.load.continue3:
 ; CHECK-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
+; CHECK:       pred.load.if4:
 ; CHECK-NEXT:    [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
@@ -344,12 +344,12 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
+; CHECK:       pred.load.continue5:
 ; CHECK-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.if7:
+; CHECK:       pred.load.if6:
 ; CHECK-NEXT:    [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
@@ -358,7 +358,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
+; CHECK:       pred.load.continue7:
 ; CHECK-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> splat (i32 1)
@@ -436,7 +436,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
+; CHECK:       pred.load.if2:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
@@ -445,12 +445,12 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP22]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
+; CHECK:       pred.load.continue3:
 ; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
+; CHECK:       pred.load.if4:
 ; CHECK-NEXT:    [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -459,12 +459,12 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP28]], align 4
 ; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP32]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
+; CHECK:       pred.load.continue5:
 ; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP34:%.*]] = phi <4 x i32> [ [[TMP24]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP33]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.if7:
+; CHECK:       pred.load.if6:
 ; CHECK-NEXT:    [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP31]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
@@ -473,7 +473,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i32, ptr [[TMP38]], align 4
 ; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP48]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
+; CHECK:       pred.load.continue7:
 ; CHECK-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP50:%.*]] = phi <4 x i32> [ [[TMP34]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP49]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index a70d8f72c8a3..bb84dbf8ed23 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -658,9 +658,9 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i32 96, [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
 ; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul <vscale x 4 x i32> [[TMP6]], splat (i32 1)
 ; CHECK-VF4UF1-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP7]]
 ; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i32 1, [[TMP5]]
@@ -707,17 +707,17 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
-; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i32> [[TMP7]], splat (i32 1)
 ; CHECK-VF4UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
-; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP5]], i64 0
-; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF2-NEXT:    [[STEP_ADD:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP5]], i64 0
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT:%.*]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
 ; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i32> [[VEC_IND_NEXT]], [[BROADCAST_SPLAT]]
 ; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index f136b0e2e0b3..10f96284c018 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -23,12 +23,12 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
@@ -103,12 +103,12 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP4]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
index 2a48e0a5e531..15db687ba64f 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
@@ -19,17 +19,17 @@ define i32 @iv_live_out_wide(ptr %dst) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[STEP_2]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i32> [[TMP7]], splat (i32 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP8]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP5]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP5]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
index 5ff43dcf42bc..ec1e8fa1e1b3 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
@@ -918,8 +918,8 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 130db548ca8c..fe533672f2ca 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -291,8 +291,8 @@ define void @redundant_branch_and_blends_without_mask(ptr %A) {
 ; CHECK-NEXT:    store i32 [[TMP34]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
 ; CHECK:       [[PRED_STORE_CONTINUE12]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 59277186195f..7654bc9a141e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -65,3 +65,68 @@ loop.latch:
 exit:
   ret void
 }
+
+; Check that VPWidenIntOrFPInductionRecipe is expanded into smaller recipes in
+; the final VPlan.
+define void @iv_expand(ptr %p, i64 %n) {
+; CHECK-LABEL: LV: Checking a loop in 'iv_expand'
+; CHECK:      VPlan 'Initial VPlan for VF={8},UF>=1' {
+; CHECK:      <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     ir<%iv> = WIDEN-INDUCTION  ir<0>, ir<1>, vp<%0>
+; CHECK-NEXT:     vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT:     CLONE ir<%q> = getelementptr ir<%p>, vp<%4>
+; CHECK-NEXT:     vp<%5> = vector-pointer ir<%q>
+; CHECK-NEXT:     WIDEN ir<%x> = load vp<%5>
+; CHECK-NEXT:     WIDEN ir<%y> = add ir<%x>, ir<%iv>
+; CHECK-NEXT:     vp<%6> = vector-pointer ir<%q>
+; CHECK-NEXT:     WIDEN store vp<%6>, ir<%y>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT:     EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK:      VPlan 'Final VPlan for VF={8},UF={1}'
+; CHECK:      ir-bb<vector.ph>:
+; CHECK-NEXT:     IR   %n.mod.vf = urem i64 %n, 8
+; CHECK-NEXT:     IR   %n.vec = sub i64 %n, %n.mod.vf
+; CHECK-NEXT:     EMIT vp<[[STEP_VECTOR:%.+]]> = step-vector
+; CHECK-NEXT:     EMIT vp<[[BROADCAST_0:%.+]]> = broadcast ir<0>
+; CHECK-NEXT:     EMIT vp<[[BROADCAST_1:%.+]]> = broadcast ir<1>
+; CHECK-NEXT:     EMIT vp<[[MUL:%.+]]> = mul vp<[[STEP_VECTOR]]>, vp<[[BROADCAST_1]]>
+; CHECK-NEXT:     EMIT vp<[[INDUCTION:%.+]]> = add vp<[[BROADCAST_0]]>, vp<[[MUL]]>
+; CHECK-NEXT:     EMIT vp<[[TRUNC:%.+]]> = trunc ir<8> to i64
+; CHECK-NEXT:     EMIT vp<[[INC:%.+]]> = mul ir<1>, vp<[[TRUNC]]>
+; CHECK-NEXT:     EMIT vp<[[BROADCAST_INC:%.+]]> = broadcast vp<[[INC]]>
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   EMIT-SCALAR vp<[[SCALAR_PHI:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT:   WIDEN-PHI ir<%iv> = phi [ vp<[[INDUCTION]]>, ir-bb<vector.ph> ], [ vp<%vec.ind.next>, vector.body ]
+; CHECK-NEXT:   CLONE ir<%q> = getelementptr ir<%p>, vp<[[SCALAR_PHI]]>
+; CHECK-NEXT:   vp<[[VEC_PTR_1:%.+]]> = vector-pointer ir<%q>
+; CHECK-NEXT:   WIDEN ir<%x> = load vp<[[VEC_PTR_1]]>
+; CHECK-NEXT:   WIDEN ir<%y> = add ir<%x>, ir<%iv>
+; CHECK-NEXT:   vp<[[VEC_PTR_2:%.+]]> = vector-pointer ir<%q>
+; CHECK-NEXT:   WIDEN store vp<[[VEC_PTR_2]]>, ir<%y>
+; CHECK-NEXT:   EMIT vp<%index.next> = add nuw vp<[[SCALAR_PHI]]>, ir<8>
+; CHECK-NEXT:   EMIT vp<%vec.ind.next> = add ir<%iv>, vp<[[BROADCAST_INC]]>
+; CHECK-NEXT:   EMIT branch-on-count vp<%index.next>, ir<%n.vec>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %q = getelementptr i64, ptr %p, i64 %iv
+  %x = load i64, ptr %q
+  %y = add i64 %x, %iv
+  store i64 %y, ptr %q
+  %iv.next = add i64 %iv, 1
+  %done = icmp eq i64 %iv.next, %n
+  br i1 %done, label %exit, label %loop
+
+exit:
+  ret void
+}

From 9e0186d925f0c375a627866c59394f25c22eb3ff Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Tue, 17 Jun 2025 10:24:57 -0700
Subject: [PATCH 0681/1322] [HLSL][RootSignature] Implement `ResourceRange` as
 an `IntervalMap` (#140957)

A resource range consists of a closed interval, `[a;b]`, denoting which
shader registers it is bound to.

For instance:
 - `CBV(b1)`  corresponds to the resource range of `[1;1]`
 - `CBV(b0, numDescriptors = 3)` likewise to `[0;2]`

We want to provide an error diagnostic when there is an overlap in the
required registers (an overlap in the resource ranges).

The goal of this pr is to implement a structure to model a set of
resource ranges and provide an api to detect any overlap over a set of
resource ranges.

`ResourceRange` models this by implementing an `IntervalMap` to denote a
mapping from an interval of registers back to a resource range. It
allows for a new `ResourceRange` to be added to the mapping and it will
report if and what the first overlap is.

For the context of how this will be used in validation of a
`RootSignatureDecl` please see the proceeding pull request here:
https://github.com/llvm/llvm-project/pull/140962.

- Implements `ResourceRange` as an `IntervalMap`
- Adds unit testing of the various `insert` scenarios

Note: it was also considered to implement this as an `IntervalTree`,
this would allow reporting of a diagnostic for each overlap that is
encountered, as opposed to just the first. However, error generation of
just reporting the first error is already rather verbose, and adding the
additional diagnostics only made this worse.

Part 1 of https://github.com/llvm/llvm-project/issues/129942
---
 .../llvm/Frontend/HLSL/HLSLRootSignature.h    |   1 +
 .../Frontend/HLSL/HLSLRootSignatureUtils.h    |  57 ++++++
 .../Frontend/HLSL/HLSLRootSignatureUtils.cpp  |  61 ++++++
 llvm/unittests/Frontend/CMakeLists.txt        |   1 +
 .../Frontend/HLSLRootSignatureRangesTest.cpp  | 177 ++++++++++++++++++
 5 files changed, 297 insertions(+)
 create mode 100644 llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp

diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h
index 2f028817b45b..9dfbd3cb6892 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h
@@ -16,6 +16,7 @@
 
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DXILABI.h"
+#include <limits>
 #include <variant>
 
 namespace llvm {
diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
index ca20e6719f3a..4d2cd183ebcb 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
@@ -15,6 +15,7 @@
 #define LLVM_FRONTEND_HLSL_HLSLROOTSIGNATUREUTILS_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/IntervalMap.h"
 #include "llvm/Frontend/HLSL/HLSLRootSignature.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
@@ -64,6 +65,62 @@ private:
   SmallVector<Metadata *> GeneratedMetadata;
 };
 
+// RangeInfo holds the information to correctly construct a ResourceRange
+// and retains this information to be used for displaying a better diagnostic
+struct RangeInfo {
+  const static uint32_t Unbounded = ~0u;
+
+  uint32_t LowerBound;
+  uint32_t UpperBound;
+};
+
+class ResourceRange {
+public:
+  using MapT = llvm::IntervalMap<uint32_t, const RangeInfo *, 16,
+                                 llvm::IntervalMapInfo<uint32_t>>;
+
+private:
+  MapT Intervals;
+
+public:
+  ResourceRange(MapT::Allocator &Allocator) : Intervals(MapT(Allocator)) {}
+
+  // Returns a reference to the first RangeInfo that overlaps with
+  // [Info.LowerBound;Info.UpperBound], or, std::nullopt if there is no overlap
+  std::optional<const RangeInfo *> getOverlapping(const RangeInfo &Info) const;
+
+  // Return the mapped RangeInfo at X or nullptr if no mapping exists
+  const RangeInfo *lookup(uint32_t X) const;
+
+  // Insert the required (sub-)intervals such that the interval of [a;b] =
+  // [Info.LowerBound, Info.UpperBound] is covered and points to a valid
+  // RangeInfo &.
+  //
+  // For instance consider the following chain of inserting RangeInfos with the
+  // intervals denoting the Lower/Upper-bounds:
+  //
+  // A = [0;2]
+  //   insert(A) -> false
+  //   intervals: [0;2] -> &A
+  // B = [5;7]
+  //   insert(B) -> false
+  //   intervals: [0;2] -> &A, [5;7] -> &B
+  // C = [4;7]
+  //   insert(C) -> true
+  //   intervals: [0;2] -> &A, [4;7] -> &C
+  // D = [1;5]
+  //   insert(D) -> true
+  //   intervals: [0;2] -> &A, [3;3] -> &D, [4;7] -> &C
+  // E = [0;unbounded]
+  //   insert(E) -> true
+  //   intervals: [0;unbounded] -> E
+  //
+  // Returns a reference to the first RangeInfo that overlaps with
+  // [Info.LowerBound;Info.UpperBound], or, std::nullopt if there is no overlap
+  // (equivalent to getOverlapping)
+  std::optional<const RangeInfo *> insert(const RangeInfo &Info);
+};
+
 } // namespace rootsig
 } // namespace hlsl
 } // namespace llvm
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
index 5bae72a3986f..1e198b639cfd 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
@@ -355,6 +355,67 @@ MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) {
   return MDNode::get(Ctx, Operands);
 }
 
+std::optional<const RangeInfo *>
+ResourceRange::getOverlapping(const RangeInfo &Info) const {
+  MapT::const_iterator Interval = Intervals.find(Info.LowerBound);
+  if (!Interval.valid() || Info.UpperBound < Interval.start())
+    return std::nullopt;
+  return Interval.value();
+}
+
+const RangeInfo *ResourceRange::lookup(uint32_t X) const {
+  return Intervals.lookup(X, nullptr);
+}
+
+std::optional<const RangeInfo *> ResourceRange::insert(const RangeInfo &Info) {
+  uint32_t LowerBound = Info.LowerBound;
+  uint32_t UpperBound = Info.UpperBound;
+
+  std::optional<const RangeInfo *> Res = std::nullopt;
+  MapT::iterator Interval = Intervals.begin();
+
+  while (true) {
+    if (UpperBound < LowerBound)
+      break;
+
+    Interval.advanceTo(LowerBound);
+    if (!Interval.valid()) // No interval found
+      break;
+
+    // Let Interval = [x;y] and [LowerBound;UpperBound] = [a;b] and note that
+    // a <= y implicitly from Intervals.find(LowerBound)
+    if (UpperBound < Interval.start())
+      break; // found interval does not overlap with inserted one
+
+    if (!Res.has_value()) // Update to be the first found intersection
+      Res = Interval.value();
+
+    if (Interval.start() <= LowerBound && UpperBound <= Interval.stop()) {
+      // x <= a <= b <= y implies that [a;b] is covered by [x;y]
+      //  -> so we don't need to insert this, report an overlap
+      return Res;
+    } else if (LowerBound <= Interval.start() &&
+               Interval.stop() <= UpperBound) {
+      // a <= x <= y <= b implies that [x;y] is covered by [a;b]
+      //  -> so remove the existing interval that we will cover with the
+      //  overwrite
+      Interval.erase();
+    } else if (LowerBound < Interval.start() && UpperBound <= Interval.stop()) {
+      // a < x <= b <= y implies that [a; x] is not covered but [x;b] is
+      //  -> so set b = x - 1 such that [a;x-1] is now the interval to insert
+      UpperBound = Interval.start() - 1;
+    } else if (Interval.start() <= LowerBound && Interval.stop() < UpperBound) {
+      // a < x <= b <= y implies that [y; b] is not covered but [a;y] is
+      //  -> so set a = y + 1 such that [y+1;b] is now the interval to insert
+      LowerBound = Interval.stop() + 1;
+    }
+  }
+
+  assert(LowerBound <= UpperBound && "Attempting to insert an empty interval");
+  Intervals.insert(LowerBound, UpperBound, &Info);
+  return Res;
+}
+
 } // namespace rootsig
 } // namespace hlsl
 } // namespace llvm
diff --git a/llvm/unittests/Frontend/CMakeLists.txt b/llvm/unittests/Frontend/CMakeLists.txt
index 2119642769e3..4048143b3681 100644
--- a/llvm/unittests/Frontend/CMakeLists.txt
+++ b/llvm/unittests/Frontend/CMakeLists.txt
@@ -12,6 +12,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_unittest(LLVMFrontendTests
   HLSLRootSignatureDumpTest.cpp
+  HLSLRootSignatureRangesTest.cpp
   OpenACCTest.cpp
   OpenMPContextTest.cpp
   OpenMPIRBuilderTest.cpp
diff --git a/llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp
new file mode 100644
index 000000000000..0ef6fe84f0ec
--- /dev/null
+++ b/llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp
@@ -0,0 +1,177 @@
+//===------ HLSLRootSignatureRangeTest.cpp - RootSignature Range tests ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Frontend/HLSL/HLSLRootSignatureUtils.h"
+#include "gtest/gtest.h"
+
+using namespace llvm::hlsl::rootsig;
+
+namespace {
+
+TEST(HLSLRootSignatureTest, NoOverlappingInsertTests) {
+  // Ensures that there is never a reported overlap
+  ResourceRange::MapT::Allocator Allocator;
+  ResourceRange Range(Allocator);
+
+  RangeInfo A;
+  A.LowerBound = 0;
+  A.UpperBound = 3;
+  EXPECT_EQ(Range.insert(A), std::nullopt);
+
+  RangeInfo B;
+  B.LowerBound = 4;
+  B.UpperBound = 7;
+  EXPECT_EQ(Range.insert(B), std::nullopt);
+
+  RangeInfo C;
+  C.LowerBound = 10;
+  C.UpperBound = RangeInfo::Unbounded;
+  EXPECT_EQ(Range.insert(C), std::nullopt);
+
+  // A = [0;3]
+  EXPECT_EQ(Range.lookup(0), &A);
+  EXPECT_EQ(Range.lookup(2), &A);
+  EXPECT_EQ(Range.lookup(3), &A);
+
+  // B = [4;7]
+  EXPECT_EQ(Range.lookup(4), &B);
+  EXPECT_EQ(Range.lookup(5), &B);
+  EXPECT_EQ(Range.lookup(7), &B);
+
+  EXPECT_EQ(Range.lookup(8), nullptr);
+  EXPECT_EQ(Range.lookup(9), nullptr);
+
+  // C = [10;unbounded]
+  EXPECT_EQ(Range.lookup(10), &C);
+  EXPECT_EQ(Range.lookup(42), &C);
+  EXPECT_EQ(Range.lookup(98237423), &C);
+  EXPECT_EQ(Range.lookup(RangeInfo::Unbounded), &C);
+}
+
+TEST(HLSLRootSignatureTest, SingleOverlappingInsertTests) {
+  // Ensures that we correctly report an overlap when we insert a range that
+  // overlaps with one other range but does not cover (replace) it
+  ResourceRange::MapT::Allocator Allocator;
+  ResourceRange Range(Allocator);
+
+  RangeInfo A;
+  A.LowerBound = 1;
+  A.UpperBound = 5;
+  EXPECT_EQ(Range.insert(A), std::nullopt);
+
+  RangeInfo B;
+  B.LowerBound = 0;
+  B.UpperBound = 2;
+  EXPECT_EQ(Range.insert(B).value(), &A);
+
+  RangeInfo C;
+  C.LowerBound = 4;
+  C.UpperBound = RangeInfo::Unbounded;
+  EXPECT_EQ(Range.insert(C).value(), &A);
+
+  // A = [1;5]
+  EXPECT_EQ(Range.lookup(1), &A);
+  EXPECT_EQ(Range.lookup(2), &A);
+  EXPECT_EQ(Range.lookup(3), &A);
+  EXPECT_EQ(Range.lookup(4), &A);
+  EXPECT_EQ(Range.lookup(5), &A);
+
+  // B = [0;0]
+  EXPECT_EQ(Range.lookup(0), &B);
+
+  // C = [6; unbounded]
+  EXPECT_EQ(Range.lookup(6), &C);
+  EXPECT_EQ(Range.lookup(RangeInfo::Unbounded), &C);
+}
+
+TEST(HLSLRootSignatureTest, MultipleOverlappingInsertTests) {
+  // Ensures that we correctly report an overlap when inserted range
+  // overlaps more than one range and it does not cover (replace) either
+  // range. In this case it will just fill in the interval between the two
+  ResourceRange::MapT::Allocator Allocator;
+  ResourceRange Range(Allocator);
+
+  RangeInfo A;
+  A.LowerBound = 0;
+  A.UpperBound = 2;
+  EXPECT_EQ(Range.insert(A), std::nullopt);
+
+  RangeInfo B;
+  B.LowerBound = 4;
+  B.UpperBound = 6;
+  EXPECT_EQ(Range.insert(B), std::nullopt);
+
+  RangeInfo C;
+  C.LowerBound = 1;
+  C.UpperBound = 5;
+  EXPECT_EQ(Range.insert(C).value(), &A);
+
+  // A = [0;2]
+  EXPECT_EQ(Range.lookup(0), &A);
+  EXPECT_EQ(Range.lookup(1), &A);
+  EXPECT_EQ(Range.lookup(2), &A);
+
+  // B = [4;6]
+  EXPECT_EQ(Range.lookup(4), &B);
+  EXPECT_EQ(Range.lookup(5), &B);
+  EXPECT_EQ(Range.lookup(6), &B);
+
+  // C = [3;3]
+  EXPECT_EQ(Range.lookup(3), &C);
+}
+
+TEST(HLSLRootSignatureTest, CoverInsertTests) {
+  // Ensures that we correctly report an overlap when inserted range
+  // covers one or more ranges
+  ResourceRange::MapT::Allocator Allocator;
+  ResourceRange Range(Allocator);
+
+  RangeInfo A;
+  A.LowerBound = 0;
+  A.UpperBound = 2;
+  EXPECT_EQ(Range.insert(A), std::nullopt);
+
+  RangeInfo B;
+  B.LowerBound = 4;
+  B.UpperBound = 5;
+  EXPECT_EQ(Range.insert(B), std::nullopt);
+
+  // Covers B
+  RangeInfo C;
+  C.LowerBound = 4;
+  C.UpperBound = 6;
+  EXPECT_EQ(Range.insert(C).value(), &B);
+
+  // A = [0;2]
+  // C = [4;6] <- covers reference to B
+  EXPECT_EQ(Range.lookup(0), &A);
+  EXPECT_EQ(Range.lookup(1), &A);
+  EXPECT_EQ(Range.lookup(2), &A);
+  EXPECT_EQ(Range.lookup(3), nullptr);
+  EXPECT_EQ(Range.lookup(4), &C);
+  EXPECT_EQ(Range.lookup(5), &C);
+  EXPECT_EQ(Range.lookup(6), &C);
+
+  // Covers all other ranges
+  RangeInfo D;
+  D.LowerBound = 0;
+  D.UpperBound = 7;
+  EXPECT_EQ(Range.insert(D).value(), &A);
+
+  // D = [0;7] <- Covers reference to A and C
+  EXPECT_EQ(Range.lookup(0), &D);
+  EXPECT_EQ(Range.lookup(1), &D);
+  EXPECT_EQ(Range.lookup(2), &D);
+  EXPECT_EQ(Range.lookup(3), &D);
+  EXPECT_EQ(Range.lookup(4), &D);
+  EXPECT_EQ(Range.lookup(5), &D);
+  EXPECT_EQ(Range.lookup(6), &D);
+  EXPECT_EQ(Range.lookup(7), &D);
+}
+
+} // namespace

From ed07b54b38c675235b4ce1bfd49e1fff372f6520 Mon Sep 17 00:00:00 2001
From: Morris Hafner <mmha@users.noreply.github.com>
Date: Tue, 17 Jun 2025 18:35:49 +0100
Subject: [PATCH 0682/1322] [CIR][NFCI] Represent Complex RValues As Single
 Value (#144519)

This patch removes one mlir::Value in the RValue class that has been
used to represent complex values in classic CG. In CIR we plan on
representing complex as a single value. It also removes some now
unnecessary member functions related to complex handling.
---
 clang/lib/CIR/CodeGen/CIRGenCall.cpp       |  2 +-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp       |  6 ++---
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 10 ++++----
 clang/lib/CIR/CodeGen/CIRGenStmt.cpp       |  3 +--
 clang/lib/CIR/CodeGen/CIRGenValue.h        | 27 +++++++---------------
 5 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 0d9064425fa9..af0e6ca822b8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -443,7 +443,7 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
       mlir::Value v;
       if (arg.isAggregate())
         cgm.errorNYI(loc, "emitCall: aggregate call argument");
-      v = arg.getKnownRValue().getScalarVal();
+      v = arg.getKnownRValue().getValue();
 
       // We might have to widen integers, but we should never truncate.
       if (argType != v.getType() && mlir::isa<cir::IntType>(v.getType()))
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 2e43f10be132..4f2046ad26d7 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -219,7 +219,7 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst,
       const mlir::Value vector =
           builder.createLoad(loc, dst.getVectorAddress());
       const mlir::Value newVector = builder.create<cir::VecInsertOp>(
-          loc, vector, src.getScalarVal(), dst.getVectorIdx());
+          loc, vector, src.getValue(), dst.getVectorIdx());
       builder.createStore(loc, newVector, dst.getVectorAddress());
       return;
     }
@@ -232,7 +232,7 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst,
   assert(!cir::MissingFeatures::opLoadStoreObjC());
 
   assert(src.isScalar() && "Can't emit an aggregate store with this method");
-  emitStoreOfScalar(src.getScalarVal(), dst, isInit);
+  emitStoreOfScalar(src.getValue(), dst, isInit);
 }
 
 static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e,
@@ -949,7 +949,7 @@ LValue CIRGenFunction::emitCallExprLValue(const CallExpr *e) {
          "Can't have a scalar return unless the return type is a "
          "reference type!");
 
-  return makeNaturalAlignPointeeAddrLValue(rv.getScalarVal(), e->getType());
+  return makeNaturalAlignPointeeAddrLValue(rv.getValue(), e->getType());
 }
 
 LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 75b4d2a637e6..8d0db5cd0a1e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -131,11 +131,11 @@ public:
   mlir::Value emitLoadOfLValue(const Expr *e) {
     LValue lv = cgf.emitLValue(e);
     // FIXME: add some akin to EmitLValueAlignmentAssumption(E, V);
-    return cgf.emitLoadOfLValue(lv, e->getExprLoc()).getScalarVal();
+    return cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue();
   }
 
   mlir::Value emitLoadOfLValue(LValue lv, SourceLocation loc) {
-    return cgf.emitLoadOfLValue(lv, loc).getScalarVal();
+    return cgf.emitLoadOfLValue(lv, loc).getValue();
   }
 
   // l-values
@@ -400,10 +400,10 @@ public:
       cgf.cgm.errorNYI(e->getSourceRange(), "Atomic inc/dec");
       // TODO(cir): This is not correct, but it will produce reasonable code
       // until atomic operations are implemented.
-      value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getScalarVal();
+      value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue();
       input = value;
     } else {
-      value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getScalarVal();
+      value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue();
       input = value;
     }
 
@@ -1805,7 +1805,7 @@ mlir::Value ScalarExprEmitter::VisitCallExpr(const CallExpr *e) {
   if (e->getCallReturnType(cgf.getContext())->isReferenceType())
     return emitLoadOfLValue(e);
 
-  auto v = cgf.emitCallExpr(e).getScalarVal();
+  auto v = cgf.emitCallExpr(e).getValue();
   assert(!cir::MissingFeatures::emitLValueAlignmentAssumption());
   return v;
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index 019a44636ce3..9193f6f1cd99 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -391,8 +391,7 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) {
     // If this function returns a reference, take the address of the
     // expression rather than the value.
     RValue result = emitReferenceBindingToExpr(rv);
-    builder.CIRBaseBuilderTy::createStore(loc, result.getScalarVal(),
-                                          *fnRetAlloca);
+    builder.CIRBaseBuilderTy::createStore(loc, result.getValue(), *fnRetAlloca);
   } else {
     mlir::Value value = nullptr;
     switch (CIRGenFunction::getEvaluationKind(rv->getType())) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index c1e08ba1e9b6..84972fc7f911 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -33,11 +33,7 @@ class RValue {
   enum Flavor { Scalar, Complex, Aggregate };
 
   union {
-    // Stores first and second value.
-    struct {
-      mlir::Value first;
-      mlir::Value second;
-    } vals;
+    mlir::Value value;
 
     // Stores aggregate address.
     Address aggregateAddr;
@@ -47,7 +43,7 @@ class RValue {
   unsigned flavor : 2;
 
 public:
-  RValue() : vals{nullptr, nullptr}, flavor(Scalar) {}
+  RValue() : value(nullptr), flavor(Scalar) {}
 
   bool isScalar() const { return flavor == Scalar; }
   bool isComplex() const { return flavor == Complex; }
@@ -56,14 +52,9 @@ public:
   bool isVolatileQualified() const { return isVolatile; }
 
   /// Return the value of this scalar value.
-  mlir::Value getScalarVal() const {
+  mlir::Value getValue() const {
     assert(isScalar() && "Not a scalar!");
-    return vals.first;
-  }
-
-  /// Return the real/imag components of this complex value.
-  std::pair<mlir::Value, mlir::Value> getComplexVal() const {
-    return std::make_pair(vals.first, vals.second);
+    return value;
   }
 
   /// Return the value of the address of the aggregate.
@@ -83,22 +74,20 @@ public:
 
   static RValue get(mlir::Value v) {
     RValue er;
-    er.vals.first = v;
+    er.value = v;
     er.flavor = Scalar;
     er.isVolatile = false;
     return er;
   }
 
-  static RValue getComplex(mlir::Value v1, mlir::Value v2) {
+  static RValue getComplex(mlir::Value v) {
     RValue er;
-    er.vals = {v1, v2};
+    er.value = v;
     er.flavor = Complex;
     er.isVolatile = false;
     return er;
   }
-  static RValue getComplex(const std::pair<mlir::Value, mlir::Value> &c) {
-    return getComplex(c.first, c.second);
-  }
+
   // FIXME: Aggregate rvalues need to retain information about whether they are
   // volatile or not.  Remove default to find all places that probably get this
   // wrong.

From 3a06e9a710b7cfdbf1c002acc46fa76617e8baf8 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Tue, 17 Jun 2025 23:09:38 +0530
Subject: [PATCH 0683/1322] Conditionalise the addition of Aarch64 function
 Multi versioning support on aarch64 target (#143749)

Currently, `ENABLE_BAREMETAL_AARCH64_FMV` is added to builtin defines
for all baremetal targets though it is only needed for aarch64. This
patch fixes this by adding it only for aarch64 target.
---
 compiler-rt/lib/builtins/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 075c4647abf6..5e832315f366 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -884,7 +884,11 @@ else ()
   if(COMPILER_RT_DISABLE_AARCH64_FMV)
     list(APPEND BUILTIN_DEFS DISABLE_AARCH64_FMV)
   elseif(COMPILER_RT_BAREMETAL_BUILD)
-    list(APPEND BUILTIN_DEFS ENABLE_BAREMETAL_AARCH64_FMV)
+    foreach (arch ${BUILTIN_SUPPORTED_ARCH})
+      if("${arch}" MATCHES "arm64|aarch64")
+        list(APPEND BUILTIN_DEFS ENABLE_BAREMETAL_AARCH64_FMV)
+      endif()
+    endforeach ()
   endif()
 
   append_list_if(COMPILER_RT_HAS_ASM_LSE HAS_ASM_LSE BUILTIN_DEFS)

From 7ea710fafa5782a274ded2ab6933c63c5c71f2ee Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Tue, 17 Jun 2025 10:44:21 -0700
Subject: [PATCH 0684/1322] Fix/reapply "[libc] Migrate stdio tests to
 ErrnoCheckingTest. (#144134)

This reverts commit 92a116c4ef822950f8c57eaa5164c844c73a1f7e with a fix
for fgets test - convert nullptr to fgets return type (char*), since the
matcher is pedantic.
---
 libc/test/src/stdio/CMakeLists.txt           | 13 ++++++++++++
 libc/test/src/stdio/fdopen_test.cpp          | 14 ++++++-------
 libc/test/src/stdio/fgetc_test.cpp           | 22 +++++++++++---------
 libc/test/src/stdio/fgetc_unlocked_test.cpp  | 22 +++++++++++---------
 libc/test/src/stdio/fgets_test.cpp           | 19 ++++++++++-------
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++-------------
 libc/test/src/stdio/fopencookie_test.cpp     | 15 +++++++------
 libc/test/src/stdio/remove_test.cpp          | 10 ++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 ++++----
 libc/test/src/stdio/setvbuf_test.cpp         |  9 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 +++----
 libc/test/src/stdlib/StrtolTest.h            |  1 -
 libc/test/src/stdlib/strtold_test.cpp        |  1 -
 13 files changed, 85 insertions(+), 77 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index ce2171f19597..4aa8b9588001 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,6 +20,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -68,6 +69,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -88,6 +90,7 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -109,6 +112,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -438,6 +442,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -452,6 +457,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -468,6 +474,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -488,6 +495,8 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -510,6 +519,8 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -527,6 +538,8 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index 104fc478b100..b53184c30be3 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,20 +9,21 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -52,8 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,8 +64,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -83,7 +82,6 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
-  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 56bde5f0099a..be2e50271b51 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,12 +14,15 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -27,29 +30,28 @@ public:
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+                Succeeds(WRITE_SIZE));
     // This is a write-only file so reads should fail.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Fails(EBADF, EOF));
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      int c = func(file);
-      ASSERT_EQ(c, int('1' + i));
+      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Succeeds(EOF));
     ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
   }
 };
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 90429ecf4e82..bef9dafd3d87 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,12 +17,15 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -30,31 +33,30 @@ public:
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+                Succeeds(WRITE_SIZE));
     // This is a write-only file so reads should fail.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Fails(EBADF, EOF));
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     LIBC_NAMESPACE::flockfile(file);
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      int c = func(file);
-      ASSERT_EQ(c, int('1' + i));
+      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Succeeds(EOF));
     ASSERT_NE(LIBC_NAMESPACE::feof_unlocked(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(file), 0);
 
     LIBC_NAMESPACE::funlockfile(file);
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
   }
 };
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index abed3d405293..8fc38b065918 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,11 +12,14 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
 
-TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -29,15 +32,16 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   char buff[8];
   char *output;
 
-  ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+  ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+              Succeeds(WRITE_SIZE));
   // This is a write-only file so reads should fail.
-  ASSERT_TRUE(LIBC_NAMESPACE::fgets(buff, 8, file) == nullptr);
+  ASSERT_THAT(LIBC_NAMESPACE::fgets(buff, 8, file),
+              Fails(EBADF, static_cast<char *>(nullptr)));
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
-  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
   file = LIBC_NAMESPACE::fopen(FILENAME, "r");
   ASSERT_FALSE(file == nullptr);
@@ -55,6 +59,7 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is also implementation defined.
   output = LIBC_NAMESPACE::fgets(buff, 0, file);
   ASSERT_TRUE(output == nullptr);
+  ASSERT_ERRNO_SUCCESS();
 #endif
 
   const char *output_arr[] = {
@@ -86,5 +91,5 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_ERRNO_SUCCESS();
 
-  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 }
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e624181c795b..e097785832d5 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,17 +17,18 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST(LlvmLibcFILETest, SimpleFileOperations) {
+TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -41,7 +42,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,7 +72,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -80,15 +79,12 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -103,10 +99,8 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -121,21 +115,18 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST(LlvmLibcFILETest, FFlush) {
+TEST_F(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -156,7 +147,7 @@ TEST(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -165,7 +156,6 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
-  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index 03e1ac286b64..bcf5e674141a 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,6 +15,7 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -22,6 +23,7 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
+using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -88,7 +90,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -115,7 +117,6 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -124,7 +125,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -149,7 +150,6 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,7 +178,6 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -192,7 +191,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -223,7 +222,7 @@ TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 84984e26398c..296bff1f5dc1 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,16 +11,17 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -36,10 +37,9 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index ac494a4ecaf8..135fb98c07fb 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,18 +8,19 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
+using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -40,7 +41,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRenameTest, RenameNonExistent) {
+TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 5872943c1bb4..a0936ba79ef7 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,12 +11,14 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
+using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -52,7 +54,7 @@ TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -102,6 +104,5 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
-  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index 5d482b70064b..e99b382d1211 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,11 +15,12 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 
-TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -36,7 +37,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,7 +57,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 3eeccc5727e7..03f0a6539c78 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,7 +9,6 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index c2f2b9c9a11c..eb4056dc7ba6 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 4943e746909ddbf8845e7fa397a97b918bf777df Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Tue, 17 Jun 2025 10:51:07 -0700
Subject: [PATCH 0685/1322] fixup! [Remarks] Elaborate on called intrinsics
 (#143985)

---
 llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll
index 2ad068eb7dc3..49276c941623 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll
@@ -9,7 +9,7 @@ declare <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   i64)
 
-; FALLBACK_WITH_REPORT_ERR:  <unknown>:0:0: unable to translate instruction: call:
+; FALLBACK_WITH_REPORT_ERR:  <unknown>:0:0: unable to translate instruction: call
 ; FALLBACK-WITH-REPORT-OUT-LABEL: scalable_arg
 define <vscale x 1 x i8> @scalable_arg(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
 entry:
@@ -22,7 +22,7 @@ entry:
   ret <vscale x 1 x i8> %a
 }
 
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction: call:
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction: call
 ; FALLBACK-WITH-REPORT-OUT-LABEL: scalable_inst
 define <vscale x 1 x i8> @scalable_inst(i64 %0) nounwind {
 entry:

From 030b5519ec139757c13a6d6f337e69750ec24d6e Mon Sep 17 00:00:00 2001
From: Yijia Gu <yijiagu@google.com>
Date: Tue, 17 Jun 2025 10:52:34 -0700
Subject: [PATCH 0686/1322] [mlir][bazel] add missing deps for XeGPUTransforms

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index e7398a696bea..55ee49444dc1 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3512,6 +3512,7 @@ cc_library(
     deps = [
         ":AffineUtils",
         ":Analysis",
+        ":ArithDialect",	
         ":ArithUtils",
         ":DialectUtils",
         ":FunctionInterfaces",
@@ -3521,6 +3522,7 @@ cc_library(
         ":IndexDialect",
         ":InliningUtils",
         ":LoopLikeInterface",
+        ":MathDialect",
         ":MemRefDialect",
         ":Pass",
         ":SCFTransforms",

From b876b3fa98cffd5b8755398f9a8218f667464d76 Mon Sep 17 00:00:00 2001
From: vitor1001 <56533861+vitor1001@users.noreply.github.com>
Date: Tue, 17 Jun 2025 19:52:56 +0200
Subject: [PATCH 0687/1322] Add missing intrinsics to cuda headers (#143664)

LLVM prevents the sm_32_intrinsics.hpp header from being included with a
#define __SM_32_INTRINSICS_HPP__. It also provides drop-in replacements
of the functions defined in the CUDA header.

One issue is that some intrinsics were added after the replacement was
written, and thus have no replacement, breaking code that calls them
(Raft is one example).

This patch adds the missing intrinsics.
---
 clang/lib/Headers/__clang_cuda_intrinsics.h | 284 ++++++++++++++++++++
 1 file changed, 284 insertions(+)

diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 8b230af6f664..5e13f3f78df7 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -479,6 +479,290 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
   return ret;
 }
 
+#pragma push_macro("__INTRINSIC_LOAD")
+#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType,  \
+                         __Clobber)                                            \
+  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
+    __TmpType __ret;                                                           \
+    asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber);        \
+    return (__DeclType)__ret;                                                  \
+  }
+
+#pragma push_macro("__INTRINSIC_LOAD2")
+#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
+                          __Clobber)                                           \
+  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
+    __DeclType __ret;                                                          \
+    __TmpType __tmp;                                                           \
+    asm(__AsmOp " {%0,%1}, [%2];"                                              \
+        : __AsmType(__tmp.x), __AsmType(__tmp.y)                               \
+        : "l"(__ptr)__Clobber);                                                \
+    using __ElementType = decltype(__ret.x);                                   \
+    __ret.x = (__ElementType)(__tmp.x);                                        \
+    __ret.y = (__ElementType)__tmp.y;                                          \
+    return __ret;                                                              \
+  }
+
+#pragma push_macro("__INTRINSIC_LOAD4")
+#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
+                          __Clobber)                                           \
+  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
+    __DeclType __ret;                                                          \
+    __TmpType __tmp;                                                           \
+    asm(__AsmOp " {%0,%1,%2,%3}, [%4];"                                        \
+        : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z),          \
+          __AsmType(__tmp.w)                                                   \
+        : "l"(__ptr)__Clobber);                                                \
+    using __ElementType = decltype(__ret.x);                                   \
+    __ret.x = (__ElementType)__tmp.x;                                          \
+    __ret.y = (__ElementType)__tmp.y;                                          \
+    __ret.z = (__ElementType)__tmp.z;                                          \
+    __ret.w = (__ElementType)__tmp.w;                                          \
+    return __ret;                                                              \
+  }
+
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
+                 "=l", );
+
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l", );
+
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
+                 "=h", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
+                 unsigned long long, "=l", );
+
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
+                  "=l", );
+
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
+
+inline __device__ long __ldcg(const long *__ptr) {
+  unsigned long __ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
+  } else {
+    asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+  }
+  return (long)__ret;
+}
+
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
+                 "=h", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
+                 unsigned long long, "=l", : "memory");
+
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
+                 "=h", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
+                 "=l", : "memory");
+
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
+                  "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
+                  "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
+                  "=h", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
+                  "=h", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
+                  "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
+                  "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
+                  "=l", : "memory");
+
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
+                  "=h", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
+                  "=h", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
+                  "=l", : "memory");
+
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");
+
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
+                  "=f", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
+                  "=f", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
+                  "=d", : "memory");
+
+inline __device__ long __ldcv(const long *__ptr) {
+  unsigned long __ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
+  } else {
+    asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+  }
+  return (long)__ret;
+}
+
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
+                 "=l", );
+
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );
+
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
+                 "=h", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
+                 unsigned long long, "=l", );
+
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
+                  "=l", );
+
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );
+
+#pragma pop_macro("__INTRINSIC_LOAD")
+#pragma pop_macro("__INTRINSIC_LOAD2")
+#pragma pop_macro("__INTRINSIC_LOAD4")
+
+inline __device__ long __ldcs(const long *__ptr) {
+  unsigned long __ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
+  } else {
+    asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+  }
+  return (long)__ret;
+}
+
+#pragma push_macro("__INTRINSIC_STORE")
+#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
+  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
+    __TmpType __tmp = (__TmpType)__value;                                      \
+    asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory");       \
+  }
+
+#pragma push_macro("__INTRINSIC_STORE2")
+#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType,           \
+                           __AsmType)                                          \
+  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
+    __TmpType __tmp;                                                           \
+    using __ElementType = decltype(__tmp.x);                                   \
+    __tmp.x = (__ElementType)(__value.x);                                      \
+    __tmp.y = (__ElementType)(__value.y);                                      \
+    asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x),            \
+        __AsmType(__tmp.y)                                                     \
+        : "memory");                                                           \
+  }
+
+#pragma push_macro("__INTRINSIC_STORE4")
+#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType,           \
+                           __AsmType)                                          \
+  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
+    __TmpType __tmp;                                                           \
+    using __ElementType = decltype(__tmp.x);                                   \
+    __tmp.x = (__ElementType)(__value.x);                                      \
+    __tmp.y = (__ElementType)(__value.y);                                      \
+    __tmp.z = (__ElementType)(__value.z);                                      \
+    __tmp.w = (__ElementType)(__value.w);                                      \
+    asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x),      \
+        __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w)             \
+        : "memory");                                                           \
+  }
+
+__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
+
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
+
+__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
+                  "h");
+__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
+                  unsigned long long, "l");
+
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
+
+__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
+__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
+
+#pragma pop_macro("__INTRINSIC_STORE")
+#pragma pop_macro("__INTRINSIC_STORE2")
+#pragma pop_macro("__INTRINSIC_STORE4")
+
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
 
 #if CUDA_VERSION >= 11000

From 0cfc59ff51720ee60a71dd34077fc161886a3701 Mon Sep 17 00:00:00 2001
From: Yijia Gu <yijiagu@google.com>
Date: Tue, 17 Jun 2025 10:56:31 -0700
Subject: [PATCH 0688/1322] [mlir][bazel] remove extra empty space for
 XeGPUTransforms

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 55ee49444dc1..cb0f9d8c7413 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3512,7 +3512,7 @@ cc_library(
     deps = [
         ":AffineUtils",
         ":Analysis",
-        ":ArithDialect",	
+        ":ArithDialect",
         ":ArithUtils",
         ":DialectUtils",
         ":FunctionInterfaces",

From e29bb9a038245320164c5890d1a75843e4a664ef Mon Sep 17 00:00:00 2001
From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com>
Date: Tue, 17 Jun 2025 10:57:52 -0700
Subject: [PATCH 0689/1322] [IR2Vec] Consider only reachable BBs and non-debug
 instructions (#143476)

Changes to consider BBs that are reachable from the entry block. Similarly we skip debug instruction while computing the embeddings.

(Tracking issue - #141817)
---
 llvm/lib/Analysis/IR2Vec.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 0f7303c1b091..fa38c35796a0 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -13,7 +13,9 @@
 
 #include "llvm/Analysis/IR2Vec.h"
 
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Debug.h"
@@ -190,7 +192,8 @@ Embedding SymbolicEmbedder::getOperandEmbedding(const Value *Op) const {
 void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
   Embedding BBVector(Dimension, 0);
 
-  for (const auto &I : BB) {
+  // We consider only the non-debug and non-pseudo instructions
+  for (const auto &I : BB.instructionsWithoutDebug()) {
     Embedding InstVector(Dimension, 0);
 
     const auto OpcVec = lookupVocab(I.getOpcodeName());
@@ -215,9 +218,11 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
 void SymbolicEmbedder::computeEmbeddings() const {
   if (F.isDeclaration())
     return;
-  for (const auto &BB : F) {
-    computeEmbeddings(BB);
-    FuncVector += BBVecMap[&BB];
+
+  // Consider only the basic blocks that are reachable from entry
+  for (const BasicBlock *BB : depth_first(&F)) {
+    computeEmbeddings(*BB);
+    FuncVector += BBVecMap[BB];
   }
 }
 

From 31523de4b000ca254259ae3167d28922e1302648 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Tue, 17 Jun 2025 23:43:07 +0530
Subject: [PATCH 0690/1322] [Driver] Fix link order of BareMetal toolchain
 object (#132806)

The linker job in BareMetal toolchain object will be used by GNU ld and
lld both.
However, gnuld process the arguments in the order in which they appear
on command
line, whereas there is no such restriction with lld.

The previous order was:
LibraryPaths -> Libraries -> LTOOptions -> LinkerInputs
The new order is:
LibraryPaths -> LTOOptions -> LinkerInputs -> Libraries

LTO options need to be added before adding any linker inputs because
file format
after compile stage during LTO is bitcode which gnuld natively cannot
process.
Hence will need to pass appropriate plugins before adding any bitcode
file on the
command line.

Object files that are getting linked need to be passed before processing
any
libraries so that gnuld can appropriately do symbol resolution for the
symbols
for which no definition is provided through user code.

Similar link order is also followed by other linker jobs for gnuld such
as in
gnutools::Linker in Gnu.cpp

This is the 3rd patch in the series of patches of merging RISCVToolchain
into
BareMetal toolchain object.

RFC:

https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524
---
 clang/lib/Driver/ToolChains/BareMetal.cpp   | 12 ++--
 clang/test/Driver/aarch64-toolchain-extra.c |  2 +-
 clang/test/Driver/aarch64-toolchain.c       | 28 ++++----
 clang/test/Driver/arm-toolchain-extra.c     |  2 +-
 clang/test/Driver/arm-toolchain.c           | 28 ++++----
 clang/test/Driver/baremetal-multilib.yaml   |  3 +-
 clang/test/Driver/baremetal-sysroot.cpp     |  8 ++-
 clang/test/Driver/baremetal.cpp             | 79 +++++++++++++--------
 8 files changed, 92 insertions(+), 70 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index a08bb588dd76..a665040662a3 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -568,8 +568,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   const llvm::Triple::ArchType Arch = TC.getArch();
   const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
 
-  AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA);
-
   CmdArgs.push_back("-Bstatic");
 
   if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax))
@@ -619,6 +617,12 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   for (const auto &LibPath : TC.getLibraryPaths())
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-L", LibPath)));
 
+  if (D.isUsingLTO())
+    addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
+                  D.getLTOMode() == LTOK_Thin);
+
+  AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA);
+
   if (TC.ShouldLinkCXXStdlib(Args)) {
     bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) &&
                                !Args.hasArg(options::OPT_static);
@@ -639,10 +643,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("--end-group");
   }
 
-  if (D.isUsingLTO())
-    addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
-                  D.getLTOMode() == LTOK_Thin);
-
   if ((TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) &&
       NeedCRTs)
     CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd)));
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
index 2a930e35acd4..a0b5f2902962 100644
--- a/clang/test/Driver/aarch64-toolchain-extra.c
+++ b/clang/test/Driver/aarch64-toolchain-extra.c
@@ -31,5 +31,5 @@
 // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtbegin.o"
 // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL-NOGCC: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtend.o"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
index 83cd95136b15..e12107fa2c50 100644
--- a/clang/test/Driver/aarch64-toolchain.c
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -11,12 +11,12 @@
 // LLD-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // LLD-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// LLD-AARCH64-BAREMETAL: "-Bstatic" "-EL"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// LLD-AARCH64-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// LLD-AARCH64-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
@@ -29,12 +29,12 @@
 // C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// C-AARCH64-BAREMETAL: "-Bstatic" "-EL"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
@@ -46,12 +46,12 @@
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -65,12 +65,12 @@
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL: "-Bstatic" "-EL"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -84,12 +84,12 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -102,12 +102,12 @@
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-Bstatic" "-EL"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -119,12 +119,12 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-EL"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
index 2adf4ab698ba..a04b41c13e95 100644
--- a/clang/test/Driver/arm-toolchain-extra.c
+++ b/clang/test/Driver/arm-toolchain-extra.c
@@ -31,6 +31,6 @@
 // C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtbegin.o"
 // C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL-NOGCC: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtend.o"
 
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
index 66bed1b0c4d8..d4f9bf2aaf3d 100644
--- a/clang/test/Driver/arm-toolchain.c
+++ b/clang/test/Driver/arm-toolchain.c
@@ -10,12 +10,12 @@
 // LLD-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // LLD-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// LLD-ARM-BAREMETAL: "-Bstatic" "-EL"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// LLD-ARM-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// LLD-ARM-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
@@ -28,12 +28,12 @@
 // C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// C-ARM-BAREMETAL: "-Bstatic" "-EL"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
@@ -45,12 +45,12 @@
 // C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// C-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL-NOSYSROOT: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -65,12 +65,12 @@
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL: "-Bstatic" "-EL"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 
@@ -85,12 +85,12 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -103,12 +103,12 @@
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-LIBCXX: "-Bstatic" "-EL"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -120,12 +120,12 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-EL"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
diff --git a/clang/test/Driver/baremetal-multilib.yaml b/clang/test/Driver/baremetal-multilib.yaml
index 853a4e9e36e4..1a80c3b4ccfc 100644
--- a/clang/test/Driver/baremetal-multilib.yaml
+++ b/clang/test/Driver/baremetal-multilib.yaml
@@ -8,8 +8,9 @@
 # CHECK-SAME: "-internal-isystem" "[[SYSROOT:[^"]*]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/include/c++/v1"
 # CHECK-SAME: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/include"
 # CHECK-SAME: "-x" "c++" "{{.*}}baremetal-multilib.yaml"
-# CHECK-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+# CHECK-NEXT: ld{{(.exe)?}}" "-Bstatic"
 # CHECK-SAME: "-L[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/lib"
+# CHECK-SAME: "{{.*}}.o"
 # CHECK-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 # CHECK-SAME: "-lc"
 # CHECK-SAME: "-o" "{{.*}}.tmp.out"
diff --git a/clang/test/Driver/baremetal-sysroot.cpp b/clang/test/Driver/baremetal-sysroot.cpp
index 5d5b336a01b0..47f0616df850 100644
--- a/clang/test/Driver/baremetal-sysroot.cpp
+++ b/clang/test/Driver/baremetal-sysroot.cpp
@@ -9,15 +9,17 @@
 // RUN: mkdir -p %T/baremetal_default_sysroot/lib/clang-runtimes/armv6m-none-eabi
 // RUN: ln -s %clang %T/baremetal_default_sysroot/bin/clang
 
-// RUN: %T/baremetal_default_sysroot/bin/clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN: %T/baremetal_default_sysroot/bin/clang -no-canonical-prefixes %s -### -o %t.out 2>&1 \
 // RUN:     -target armv6m-none-eabi --sysroot= \
 // RUN:   | FileCheck --check-prefix=CHECK-V6M-C %s
 // CHECK-V6M-C: "{{.*}}clang{{.*}}" "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // CHECK-V6M-C-SAME: "-internal-isystem" "{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-C-SAME: "-internal-isystem" "{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}include"
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal-sysroot.cpp"
-// CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "-Bstatic"
+// CHECK-V6M-C-SAME: "crt0.o"
 // CHECK-V6M-C-SAME: "-L{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}lib"
+// CHECK-V6M-C-SAME: "{{.*}}.o"
 // CHECK-V6M-C-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-C-SAME: "-lc"
-// CHECK-V6M-C-SAME: "-o" "{{.*}}.o"
+// CHECK-V6M-C-SAME: "-o" "{{.*}}.tmp.out"
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index eff8f775a9c1..b75f1a9280d1 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -15,11 +15,12 @@
 // CHECK-V6M-C-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-C-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-C-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-V6M-C-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "{{[^"]*}}libclang_rt.builtins.a"
+// CHECK-V6M-C-SAME: "{{.*}}.o"
+// CHECK-V6M-C-SAME: {{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-C-SAME: "-lc"
 // CHECK-V6M-C-SAME: "--target2=rel" "-o" "{{.*}}.tmp.out"
 
@@ -39,9 +40,10 @@
 // CHECK-V6M-TREE-SAME: {{^}} "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-TREE-SAME: "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}armv6m-unknown-none-eabi"
 // CHECK-V6M-TREE-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-TREE-SAME: "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi{{[/\\]+}}crt0.o"
 // CHECK-V6M-TREE-SAME: "-L[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi"
+// CHECK-V6M-TREE-SAME "{{.*}}.o"
 // CHECK-V6M-TREE-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-TREE-SAME: "-lc"
 // CHECK-V6M-TREE-SAME: "--target2=rel" "-o" "{{.*}}.tmp.out"
@@ -53,19 +55,21 @@
 // CHECK-ARMV7M-PER-TARGET: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-ARMV7M-PER-TARGET: "-isysroot" "[[SYSROOT:[^"]*]]"
 // CHECK-ARMV7M-PER-TARGET: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-ARMV7M-PER_TARGET: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-ARMV7M-PER-TARGET: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
 // CHECK-ARMV7M-PER-TARGET: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}armv7m-vendor-none-eabi
+// CHECK-ARMV7M-PER-TARGET: "{{.*}}.o"
 // CHECK-ARMV7M-PER-TARGET: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-ARMV7M-PER-TARGET: "-lc"
 
 // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \
 // RUN:     --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-DEFAULTCXX %s
 // CHECK-V6M-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-DEFAULTCXX-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
+// CHECK-V6M-DEFAULTCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-lc++"
 // CHECK-V6M-DEFAULTCXX-SAME: "-lm"
 // CHECK-V6M-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
@@ -77,8 +81,9 @@
 // CHECK-V6M-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-V6M-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
+// CHECK-V6M-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-SAME: "-lc++"
 // CHECK-V6M-LIBCXX-SAME: "-lm"
 // CHECK-V6M-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
@@ -92,8 +97,9 @@
 // CHECK-V6M-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}6.0.0"
-// CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
+// CHECK-V6M-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-V6M-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lc"
@@ -104,7 +110,7 @@
 // RUN:     -nodefaultlibs \
 // RUN:   | FileCheck --check-prefix=CHECK-V6M-NDL %s
 // CHECK-V6M-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-V6M-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-NDL: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 
 // RUN: rm -rf %T/baremetal_cxx_sysroot
@@ -119,6 +125,7 @@
 // CHECK-V6M-LIBCXX-USR-SAME: "-internal-isystem" "{{[^"]+}}baremetal_cxx_sysroot{{[/\\]+}}usr{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBCXX-USR: "{{[^"]*}}-Bstatic"
 // CHECK-V6M-LIBCXX-USR-SAME: "-L{{[^"]*}}{{[/\\]+}}baremetal_cxx_sysroot{{[/\\]+}}lib"
+// CHECK-V6M-LIBCXX-USR: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-USR-SAME: "-lc++" "-lm"
 // CHECK-V6M-LIBCXX-USR-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-LIBCXX-USR-SAME: "-lc"
@@ -149,7 +156,7 @@
 
 // RUN: %clang -### %s --target=armebv7-none-eabi --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
-// CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "--be8" "-EB"
+// CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "--be8" "-EB"
 
 // RUN: %clang -### %s --target=armv7-none-eabi -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
@@ -159,7 +166,7 @@
 
 // RUN: %clang -### %s --target=armv7-none-eabi --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EL %s
-// CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-ARMV7EL-NOT: "--be8"
 
 // RUN: %clang -### %s --target=armebv7-none-eabi -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -170,7 +177,7 @@
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64BE %s
-// CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EB"
+// CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EB"
 // CHECK-AARCH64BE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64-none-elf -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -181,7 +188,7 @@
 
 // RUN: %clang -### %s --target=aarch64-none-elf --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64LE %s
-// CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-AARCH64LE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -221,9 +228,10 @@
 // CHECK-RV64-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-RV64-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV64-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV64-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
+// CHECK-RV64-SAME:"{{.*}}.o"
 // CHECK-RV64-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-SAME: "-lc"
 // CHECK-RV64-SAME: "-X" "-o" "{{.*}}.tmp.out"
@@ -232,8 +240,9 @@
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64-DEFAULTCXX %s
 // CHECK-RV64-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV64-DEFAULTCXX-SAME:"{{.*}}.o"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc"
@@ -246,8 +255,9 @@
 // CHECK-RV64-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-RV64-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV64-LIBCXX-SAME:"{{.*}}.o"
 // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBCXX-SAME: "-lc"
@@ -260,8 +270,9 @@
 // CHECK-RV64-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
-// CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV64-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV64-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lc"
@@ -277,9 +288,10 @@
 // CHECK-RV32-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-RV32-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV32-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
+// CHECK-RV32-SAME: "{{.*}}.o"
 // CHECK-RV32-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-SAME: "-lc"
 // CHECK-RV32-SAME: "-X" "-o" "a.out"
@@ -288,8 +300,9 @@
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
 // RUN:   | FileCheck --check-prefix=CHECK-RV32-DEFAULTCXX %s
 // CHECK-RV32-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV32-DEFAULTCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc"
@@ -302,8 +315,9 @@
 // CHECK-RV32-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV32-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-RV32-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV32-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-LIBCXX-SAME: "-X" "-o" "a.out"
@@ -315,8 +329,9 @@
 // CHECK-RV32-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV32-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
-// CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV32-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV32-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lc"
@@ -337,7 +352,7 @@
 // RUN:     -nodefaultlibs \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64-NDL %s
 // CHECK-RV64-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV64-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-NDL: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
@@ -356,7 +371,7 @@
 // CHECK-RV64FD-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64FD-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}include"
 // CHECK-RV64FD-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64FD-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -375,7 +390,7 @@
 // CHECK-RV32I-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32I-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32I-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32I-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32I-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32I-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -394,7 +409,7 @@
 // CHECK-RV32IM-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IM-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32IM-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32IM-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -408,7 +423,7 @@
 // CHECK-RV32IAC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IAC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32IAC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32IAC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf -march=rv32imafc -mabi=ilp32f \
@@ -429,7 +444,7 @@
 // CHECK-RV32IMAFC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IMAFC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}include"
 // CHECK-RV32IMAFC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32IMAFC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}lib"
 
 // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc-unknown-eabi 2>&1 \
@@ -440,8 +455,9 @@
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-PPCEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
+// CHECK-PPCEABI-SAME:"{{.*}}.o"
 // CHECK-PPCEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPCEABI-SAME: "-lc"
 // CHECK-PPCEABI-SAME: "-o" "a.out"
@@ -454,8 +470,9 @@
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-PPC64EABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
+// CHECK-PPC64EABI-SAME:"{{.*}}.o"
 // CHECK-PPC64EABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPC64EABI-SAME: "-lc"
 // CHECK-PPC64EABI-SAME: "-o" "a.out"
@@ -468,8 +485,9 @@
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-PPCLEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
+// CHECK-PPCLEEABI-SAME:"{{.*}}.o"
 // CHECK-PPCLEEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPCLEEABI-SAME: "-lc"
 // CHECK-PPCLEEABI-SAME: "-o" "a.out"
@@ -482,8 +500,9 @@
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-PPC64LEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
+// CHECK-PPC64LEEABI-SAME:"{{.*}}.o"
 // CHECK-PPC64LEEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPC64LEEABI-SAME: "-lc"
 // CHECK-PPC64LEEABI-SAME: "-o" "a.out"

From 2ab9c35ea93f8557827d4cadcceb05e4eed2d30a Mon Sep 17 00:00:00 2001
From: joaosaffran <126493771+joaosaffran@users.noreply.github.com>
Date: Tue, 17 Jun 2025 11:16:09 -0700
Subject: [PATCH 0691/1322] [DXContainer] Update DXContainer to match D3D12
 spec (#143201)

Update the descriptor range flag values in DXContainerConstants.def to
match
the Direct3D12 specification. This changes two aspects:

1. Modify the DESCRIPTOR_RANGE_FLAG macro to use direct values instead
of
   bit shifts
2. Update the flag values to use hex notation and match D3D12's
   D3D12_DESCRIPTOR_RANGE_FLAGS enumeration:
   - DESCRIPTORS_VOLATILE: 0x1
   - DATA_VOLATILE: 0x2
   - DATA_STATIC_WHILE_SET_AT_EXECUTE: 0x4
   - DATA_STATIC: 0x8
   - DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: 0x10000
3. Removed NONE value from ROOT_DESCRIPTOR_FLAG

This ensures better compatibility with the D3D12 API and makes the
values
more explicit in the code.

Requested here:
https://github.com/llvm/llvm-project/pull/138315#discussion_r2132818269

---------

Co-authored-by: joaosaffran <joao.saffran@microsoft.com>
---
 llvm/include/llvm/BinaryFormat/DXContainer.h  |  6 +--
 .../BinaryFormat/DXContainerConstants.def     | 41 ++++++++++---------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index 08949e39716d..6d625dad5853 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -154,17 +154,17 @@ enum class FeatureFlags : uint64_t {
 static_assert((uint64_t)FeatureFlags::NextUnusedBit <= 1ull << 63,
               "Shader flag bits exceed enum size.");
 
-#define ROOT_ELEMENT_FLAG(Num, Val) Val = 1ull << Num,
+#define ROOT_ELEMENT_FLAG(Num, Val) Val = Num,
 enum class RootElementFlag : uint32_t {
 #include "DXContainerConstants.def"
 };
 
-#define ROOT_DESCRIPTOR_FLAG(Num, Val) Val = 1ull << Num,
+#define ROOT_DESCRIPTOR_FLAG(Num, Val) Val = Num,
 enum class RootDescriptorFlag : uint32_t {
 #include "DXContainerConstants.def"
 };
 
-#define DESCRIPTOR_RANGE_FLAG(Num, Val) Val = 1ull << Num,
+#define DESCRIPTOR_RANGE_FLAG(Num, Val) Val = Num,
 enum class DescriptorRangeFlag : uint32_t {
 #include "DXContainerConstants.def"
 };
diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
index 501ef0c31cdd..18e79e6fa65a 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
+++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
@@ -59,18 +59,19 @@ SHADER_FEATURE_FLAG(33, 39, NextUnusedBit, "Next reserved shader flag bit (not a
 // ROOT_ELEMENT_FLAG(bit offset for the flag, name).
 #ifdef ROOT_ELEMENT_FLAG
 
-ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout)
-ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess)
-ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess)
-ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess)
-ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess)
-ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess)
-ROOT_ELEMENT_FLAG(6, AllowStreamOutput)
-ROOT_ELEMENT_FLAG(7, LocalRootSignature)
-ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess)
-ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess)
-ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed)
-ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed)
+ROOT_ELEMENT_FLAG(0, NONE)
+ROOT_ELEMENT_FLAG(0x1, AllowInputAssemblerInputLayout)
+ROOT_ELEMENT_FLAG(0x2, DenyVertexShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x4, DenyHullShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x8, DenyDomainShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x10, DenyGeometryShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x20, DenyPixelShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x40, AllowStreamOutput)
+ROOT_ELEMENT_FLAG(0x80, LocalRootSignature)
+ROOT_ELEMENT_FLAG(0x100, DenyAmplificationShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x200, DenyMeshShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x400, CBVSRVUAVHeapDirectlyIndexed)
+ROOT_ELEMENT_FLAG(0x800, SamplerHeapDirectlyIndexed)
 #undef ROOT_ELEMENT_FLAG
 #endif // ROOT_ELEMENT_FLAG
 
@@ -79,9 +80,9 @@ ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed)
 #ifdef ROOT_DESCRIPTOR_FLAG
 
 ROOT_DESCRIPTOR_FLAG(0, NONE)
-ROOT_DESCRIPTOR_FLAG(1, DATA_VOLATILE)
-ROOT_DESCRIPTOR_FLAG(2, DATA_STATIC_WHILE_SET_AT_EXECUTE)
-ROOT_DESCRIPTOR_FLAG(3, DATA_STATIC)
+ROOT_DESCRIPTOR_FLAG(0x2, DATA_VOLATILE)
+ROOT_DESCRIPTOR_FLAG(0x4, DATA_STATIC_WHILE_SET_AT_EXECUTE)
+ROOT_DESCRIPTOR_FLAG(0x8, DATA_STATIC)
 #undef ROOT_DESCRIPTOR_FLAG
 #endif // ROOT_DESCRIPTOR_FLAG
 
@@ -90,11 +91,11 @@ ROOT_DESCRIPTOR_FLAG(3, DATA_STATIC)
 #ifdef DESCRIPTOR_RANGE_FLAG
 
 DESCRIPTOR_RANGE_FLAG(0, NONE)
-DESCRIPTOR_RANGE_FLAG(1, DESCRIPTORS_VOLATILE)
-DESCRIPTOR_RANGE_FLAG(2, DATA_VOLATILE)
-DESCRIPTOR_RANGE_FLAG(3, DATA_STATIC_WHILE_SET_AT_EXECUTE)
-DESCRIPTOR_RANGE_FLAG(4, DATA_STATIC)
-DESCRIPTOR_RANGE_FLAG(16, DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS)
+DESCRIPTOR_RANGE_FLAG(0x1, DESCRIPTORS_VOLATILE)
+DESCRIPTOR_RANGE_FLAG(0x2, DATA_VOLATILE)
+DESCRIPTOR_RANGE_FLAG(0x4, DATA_STATIC_WHILE_SET_AT_EXECUTE)
+DESCRIPTOR_RANGE_FLAG(0x8, DATA_STATIC)
+DESCRIPTOR_RANGE_FLAG(0x10000, DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS)
 #undef DESCRIPTOR_RANGE_FLAG
 #endif // DESCRIPTOR_RANGE_FLAG
 

From bb288de4e0e74f235402ff41be60dabcd57e379f Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Tue, 17 Jun 2025 11:22:23 -0700
Subject: [PATCH 0692/1322] [LoopPeel] Support last iteration peeling of
 min/max intrinsics (#143598)

This isn't terribly useful at the moment because of the step=1
restriction but it should be functionally sound. This is mostly just
making sure the codepaths don't diverge as we make other changes.
---
 llvm/lib/Transforms/Utils/LoopPeel.cpp        |  5 +-
 .../LoopUnroll/peel-last-iteration-minmax.ll  | 48 +++++++++++++++----
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index f34396254825..27e70c5ddc0f 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -545,8 +545,11 @@ countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE,
     const SCEV *IterVal = AddRec->evaluateAtIteration(
         SE.getConstant(AddRec->getType(), NewPeelCount), SE);
     if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, BoundSCEV, Step,
-                                   Pred))
+                                   Pred)) {
+      if (shouldPeelLastIteration(L, Pred, AddRec, BoundSCEV, SE, TTI))
+        DesiredPeelCountLast = 1;
       return;
+    }
     DesiredPeelCount = NewPeelCount;
   };
 
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll
index cd098e123b5f..5e8540814fff 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll
@@ -41,16 +41,27 @@ define i32 @smin_unit_step() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @foo(i32 1)
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT1]], 1023
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1024, [[IV]]
 ; CHECK-NEXT:    [[MINMAX:%.*]] = call i32 @llvm.smin.i32(i32 [[SUB]], i32 1)
 ; CHECK-NEXT:    call void @foo(i32 [[MINMAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[EC_PEEL:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
+; CHECK:       [[EXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MINMAX_LCSSA:%.*]] = phi i32 [ [[MINMAX]], %[[LOOP]] ]
-; CHECK-NEXT:    ret i32 [[MINMAX_LCSSA]]
+; CHECK-NEXT:    ret i32 [[MINMAX]]
 ;
 entry:
   br label %loop
@@ -74,16 +85,28 @@ define i32 @smax_unit_step() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB1:%.*]] = sub nuw nsw i32 1024, [[IV1]]
+; CHECK-NEXT:    call void @foo(i32 [[SUB1]])
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT1]], 1023
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1024, [[IV]]
 ; CHECK-NEXT:    [[MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[SUB]], i32 1)
 ; CHECK-NEXT:    call void @foo(i32 [[MINMAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[EC_PEEL:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
+; CHECK:       [[EXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MINMAX_LCSSA:%.*]] = phi i32 [ [[MINMAX]], %[[LOOP]] ]
-; CHECK-NEXT:    ret i32 [[MINMAX_LCSSA]]
+; CHECK-NEXT:    ret i32 [[MINMAX]]
 ;
 entry:
   br label %loop
@@ -135,3 +158,8 @@ exit:
   ret i32 %minmax.lcssa
 }
 
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+;.

From 8cd05b88ec623018ca2c68cf2418d2beed026d27 Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Tue, 17 Jun 2025 11:27:35 -0700
Subject: [PATCH 0693/1322] [NFC][HLSL] Move Sema work from
 `ParseMicrosoftRootSignatureAttributeArgs` (#143184)

This separates semantic analysis from parsing by moving `RootSignatureDecl` creation, scope storage, and lookup logic into
`SemaHLSL`.

For more context see:
https://github.com/llvm/llvm-project/issues/142834.

- Define `ActOnStartRootSignatureDecl` and `ActOnFinishRootSignatureDecl` on `SemaHLSL`
- NFC so no test changes.

Resolves: https://github.com/llvm/llvm-project/issues/142834

---------

Co-authored-by: Aaron Ballman <aaron@aaronballman.com>
---
 clang/include/clang/Parse/Parser.h  |  2 +-
 clang/include/clang/Sema/SemaHLSL.h | 13 ++++++++++++
 clang/lib/Parse/ParseDeclCXX.cpp    | 32 ++++++++++++-----------------
 clang/lib/Sema/SemaDecl.cpp         |  1 +
 clang/lib/Sema/SemaHLSL.cpp         | 25 ++++++++++++++++++++++
 5 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 3243b94c5e5e..a47e23ffbd35 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -3598,7 +3598,7 @@ private:
   /// keyword.
   bool isClassCompatibleKeyword(Token Tok) const;
 
-  void ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs);
+  void ParseHLSLRootSignatureAttributeArgs(ParsedAttributes &Attrs);
 
   ///@}
 
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index ba5f06f93dc3..33c4b8d1568b 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -119,6 +119,19 @@ public:
                                        bool IsCompAssign);
   void emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, BinaryOperatorKind Opc);
 
+  /// Computes the unique Root Signature identifier from the given signature,
+  /// then lookup if there is a previousy created Root Signature decl.
+  ///
+  /// Returns the identifier and if it was found
+  std::pair<IdentifierInfo *, bool>
+  ActOnStartRootSignatureDecl(StringRef Signature);
+
+  /// Creates the Root Signature decl of the parsed Root Signature elements
+  /// onto the AST and push it onto current Scope
+  void ActOnFinishRootSignatureDecl(
+      SourceLocation Loc, IdentifierInfo *DeclIdent,
+      SmallVector<llvm::hlsl::rootsig::RootElement> &Elements);
+
   void handleRootSignatureAttr(Decl *D, const ParsedAttr &AL);
   void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL);
   void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL);
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index a5c76501c7c1..c1493a5bfd3b 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -29,6 +29,7 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/SemaCodeCompletion.h"
+#include "clang/Sema/SemaHLSL.h"
 #include "llvm/Support/TimeProfiler.h"
 #include <optional>
 
@@ -4903,7 +4904,7 @@ void Parser::ParseMicrosoftUuidAttributeArgs(ParsedAttributes &Attrs) {
   }
 }
 
-void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
+void Parser::ParseHLSLRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
   assert(Tok.is(tok::identifier) &&
          "Expected an identifier to denote which MS attribute to consider");
   IdentifierInfo *RootSignatureIdent = Tok.getIdentifierInfo();
@@ -4945,18 +4946,14 @@ void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
 
   // Construct our identifier
   StringRef Signature = StrLiteral.value()->getString();
-  auto Hash = llvm::hash_value(Signature);
-  std::string IdStr = "__hlsl_rootsig_decl_" + std::to_string(Hash);
-  IdentifierInfo *DeclIdent = &(Actions.getASTContext().Idents.get(IdStr));
-
-  LookupResult R(Actions, DeclIdent, SourceLocation(),
-                 Sema::LookupOrdinaryName);
-  // Check if we have already found a decl of the same name, if we haven't
-  // then parse the root signature string and construct the in-memory elements
-  if (!Actions.LookupQualifiedName(R, Actions.CurContext)) {
+  auto [DeclIdent, Found] =
+      Actions.HLSL().ActOnStartRootSignatureDecl(Signature);
+  // If we haven't found an already defined DeclIdent then parse the root
+  // signature string and construct the in-memory elements
+  if (!Found) {
+    // Offset location 1 to account for '"'
     SourceLocation SignatureLoc =
-        StrLiteral.value()->getExprLoc().getLocWithOffset(
-            1); // offset 1 for '"'
+        StrLiteral.value()->getExprLoc().getLocWithOffset(1);
     // Invoke the root signature parser to construct the in-memory constructs
     hlsl::RootSignatureLexer Lexer(Signature, SignatureLoc);
     SmallVector<llvm::hlsl::rootsig::RootElement> RootElements;
@@ -4966,12 +4963,9 @@ void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
       return;
     }
 
-    // Create the Root Signature
-    auto *SignatureDecl = HLSLRootSignatureDecl::Create(
-        Actions.getASTContext(), /*DeclContext=*/Actions.CurContext,
-        RootSignatureLoc, DeclIdent, RootElements);
-    SignatureDecl->setImplicit();
-    Actions.PushOnScopeChains(SignatureDecl, getCurScope());
+    // Construct the declaration.
+    Actions.HLSL().ActOnFinishRootSignatureDecl(RootSignatureLoc, DeclIdent,
+                                                RootElements);
   }
 
   // Create the arg for the ParsedAttr
@@ -5014,7 +5008,7 @@ void Parser::ParseMicrosoftAttributes(ParsedAttributes &Attrs) {
       if (Tok.getIdentifierInfo()->getName() == "uuid")
         ParseMicrosoftUuidAttributeArgs(Attrs);
       else if (Tok.getIdentifierInfo()->getName() == "RootSignature")
-        ParseMicrosoftRootSignatureAttributeArgs(Attrs);
+        ParseHLSLRootSignatureAttributeArgs(Attrs);
       else {
         IdentifierInfo *II = Tok.getIdentifierInfo();
         SourceLocation NameLoc = Tok.getLocation();
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 5cffd82e3372..02ac898a2b70 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -62,6 +62,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Frontend/HLSL/HLSLRootSignature.h"
 #include "llvm/Support/SaveAndRestore.h"
 #include "llvm/TargetParser/Triple.h"
 #include <algorithm>
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index ba491b613429..4a8479a00e0e 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -978,6 +978,31 @@ void SemaHLSL::emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS,
       << NewFnName << FixItHint::CreateReplacement(FullRange, OS.str());
 }
 
+std::pair<IdentifierInfo *, bool>
+SemaHLSL::ActOnStartRootSignatureDecl(StringRef Signature) {
+  llvm::hash_code Hash = llvm::hash_value(Signature);
+  std::string IdStr = "__hlsl_rootsig_decl_" + std::to_string(Hash);
+  IdentifierInfo *DeclIdent = &(getASTContext().Idents.get(IdStr));
+
+  // Check if we have already found a decl of the same name.
+  LookupResult R(SemaRef, DeclIdent, SourceLocation(),
+                 Sema::LookupOrdinaryName);
+  bool Found = SemaRef.LookupQualifiedName(R, SemaRef.CurContext);
+  return {DeclIdent, Found};
+}
+
+void SemaHLSL::ActOnFinishRootSignatureDecl(
+    SourceLocation Loc, IdentifierInfo *DeclIdent,
+    SmallVector<llvm::hlsl::rootsig::RootElement> &Elements) {
+
+  auto *SignatureDecl = HLSLRootSignatureDecl::Create(
+      SemaRef.getASTContext(), /*DeclContext=*/SemaRef.CurContext, Loc,
+      DeclIdent, Elements);
+
+  SignatureDecl->setImplicit();
+  SemaRef.PushOnScopeChains(SignatureDecl, SemaRef.getCurScope());
+}
+
 void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) {
   if (AL.getNumArgs() != 1) {
     Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 1;

From 80f3a28bbe7c2e17fb4b60e974c4157ec7e1eefc Mon Sep 17 00:00:00 2001
From: Justin King <jcking@wulver.com>
Date: Tue, 17 Jun 2025 11:28:14 -0700
Subject: [PATCH 0694/1322] Revert "lsan: Support free_sized and
 free_aligned_sized from C23" (#144575)

Reverts llvm/llvm-project#144415

Need to update approach to handle Apple platforms gracefully.
---
 compiler-rt/lib/lsan/lsan_allocator.cpp       |  4 ----
 compiler-rt/lib/lsan/lsan_allocator.h         |  2 --
 compiler-rt/lib/lsan/lsan_interceptors.cpp    | 18 ---------------
 compiler-rt/lib/lsan/lsan_malloc_mac.cpp      | 23 ++++++++-----------
 .../sanitizer_common/sanitizer_malloc_mac.inc | 15 ------------
 5 files changed, 10 insertions(+), 52 deletions(-)

diff --git a/compiler-rt/lib/lsan/lsan_allocator.cpp b/compiler-rt/lib/lsan/lsan_allocator.cpp
index a436d9c07ac6..493bf5f9efc5 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.cpp
+++ b/compiler-rt/lib/lsan/lsan_allocator.cpp
@@ -220,10 +220,6 @@ void lsan_free(void *p) {
   Deallocate(p);
 }
 
-void lsan_free_sized(void *p, uptr) { Deallocate(p); }
-
-void lsan_free_aligned_sized(void *p, uptr, uptr) { Deallocate(p); }
-
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack) {
   return SetErrnoOnNull(Reallocate(stack, p, size, 1));
 }
diff --git a/compiler-rt/lib/lsan/lsan_allocator.h b/compiler-rt/lib/lsan/lsan_allocator.h
index 2342f11fb5d0..5eed0cbdb309 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.h
+++ b/compiler-rt/lib/lsan/lsan_allocator.h
@@ -127,8 +127,6 @@ void *lsan_aligned_alloc(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_memalign(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_malloc(uptr size, const StackTrace &stack);
 void lsan_free(void *p);
-void lsan_free_sized(void *p, uptr size);
-void lsan_free_aligned_sized(void *p, uptr alignment, uptr size);
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack);
 void *lsan_reallocarray(void *p, uptr nmemb, uptr size,
                         const StackTrace &stack);
diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp
index 8e33130840e9..a8252cddacf2 100644
--- a/compiler-rt/lib/lsan/lsan_interceptors.cpp
+++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp
@@ -84,24 +84,6 @@ INTERCEPTOR(void, free, void *p) {
   lsan_free(p);
 }
 
-INTERCEPTOR(void, free_sized, void *p, uptr size) {
-  if (UNLIKELY(!p))
-    return;
-  if (DlsymAlloc::PointerIsMine(p))
-    return DlsymAlloc::Free(p);
-  ENSURE_LSAN_INITED;
-  lsan_free_sized(p, size);
-}
-
-INTERCEPTOR(void, free_aligned_sized, void *p, uptr alignment, uptr size) {
-  if (UNLIKELY(!p))
-    return;
-  if (DlsymAlloc::PointerIsMine(p))
-    return DlsymAlloc::Free(p);
-  ENSURE_LSAN_INITED;
-  lsan_free_aligned_sized(p, alignment, size);
-}
-
 INTERCEPTOR(void*, calloc, uptr nmemb, uptr size) {
   if (DlsymAlloc::Use())
     return DlsymAlloc::Callocate(nmemb, size);
diff --git a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
index 8a16c053da23..525c30272ccc 100644
--- a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
+++ b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
@@ -44,19 +44,16 @@ using namespace __lsan;
   void *p = lsan_valloc(size, stack)
 #define COMMON_MALLOC_FREE(ptr) \
   lsan_free(ptr)
-#  define COMMON_MALLOC_FREE_SIZED(ptr, size) lsan_free_sized(ptr, size)
-#  define COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size) \
-    lsan_free_aligned_sized(ptr, alignment, size)
-#  define COMMON_MALLOC_SIZE(ptr) uptr size = lsan_mz_size(ptr)
-#  define COMMON_MALLOC_FILL_STATS(zone, stats)
-#  define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name)    \
-    (void)zone_name;                                                        \
-    Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", \
-           ptr);
-#  define COMMON_MALLOC_NAMESPACE __lsan
-#  define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
-#  define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
+#define COMMON_MALLOC_SIZE(ptr) \
+  uptr size = lsan_mz_size(ptr)
+#define COMMON_MALLOC_FILL_STATS(zone, stats)
+#define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name) \
+  (void)zone_name; \
+  Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", ptr);
+#define COMMON_MALLOC_NAMESPACE __lsan
+#define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
+#define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
 
-#  include "sanitizer_common/sanitizer_malloc_mac.inc"
+#include "sanitizer_common/sanitizer_malloc_mac.inc"
 
 #endif // SANITIZER_APPLE
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
index 72ad22999b5a..6343eb284afb 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
@@ -144,21 +144,6 @@ INTERCEPTOR(void, free, void *ptr) {
   COMMON_MALLOC_FREE(ptr);
 }
 
-#ifdef COMMON_MALLOC_FREE_SIZED
-INTERCEPTOR(void, free_sized, void *ptr, size_t size) {
-  COMMON_MALLOC_ENTER();
-  COMMON_MALLOC_FREE_SIZED(ptr, size);
-}
-#endif
-
-#ifdef COMMON_MALLOC_FREE_ALIGNED_SIZED
-INTERCEPTOR(void, free_aligned_sized, void *ptr, size_t alignment,
-            size_t size) {
-  COMMON_MALLOC_ENTER();
-  COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size);
-}
-#endif
-
 INTERCEPTOR(void *, realloc, void *ptr, size_t size) {
   COMMON_MALLOC_ENTER();
   COMMON_MALLOC_REALLOC(ptr, size);

From 391dafd8af9c0309f2ca75621dae1dbae307b428 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Tue, 17 Jun 2025 11:28:43 -0700
Subject: [PATCH 0695/1322] [RISCV] Consolidate both copies of getLMUL1VT [nfc]
 (#144568)

Put one copy on RISCVTargetLowering as a static function so that both
locations can use it, and rename the method to getM1VT for slightly
improved readability.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 64 +++++++++----------
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |  9 +++
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 11 +---
 3 files changed, 39 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 779786fa400f..33aae7ab16cc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3498,14 +3498,6 @@ getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL,
   return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VT, Ops);
 }
 
-static MVT getLMUL1VT(MVT VT) {
-  assert(VT.getVectorElementType().getSizeInBits() <= RISCV::RVVBitsPerBlock &&
-         "Unexpected vector MVT");
-  return MVT::getScalableVectorVT(
-      VT.getVectorElementType(),
-      RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
-}
-
 struct VIDSequence {
   int64_t StepNumerator;
   unsigned StepDenominator;
@@ -4316,7 +4308,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
     MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
     MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
-    assert(M1VT == getLMUL1VT(M1VT));
+    assert(M1VT == RISCVTargetLowering::getM1VT(M1VT));
 
     // The following semantically builds up a fixed length concat_vector
     // of the component build_vectors.  We eagerly lower to scalable and
@@ -4356,7 +4348,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); });
   unsigned NumDefElts = NumElts - NumUndefElts;
   if (NumDefElts >= 8 && NumDefElts > NumElts / 2 &&
-      ContainerVT.bitsLE(getLMUL1VT(ContainerVT))) {
+      ContainerVT.bitsLE(RISCVTargetLowering::getM1VT(ContainerVT))) {
     SmallVector<SDValue> SubVecAOps, SubVecBOps;
     SmallVector<SDValue> MaskVals;
     SDValue UndefElem = DAG.getUNDEF(Op->getOperand(0)->getValueType(0));
@@ -5114,7 +5106,8 @@ static SDValue lowerVZIP(unsigned Opc, SDValue Op0, SDValue Op1,
 
   MVT InnerVT = ContainerVT;
   auto [Mask, VL] = getDefaultVLOps(IntVT, InnerVT, DL, DAG, Subtarget);
-  if (Op1.isUndef() && ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
+  if (Op1.isUndef() &&
+      ContainerVT.bitsGT(RISCVTargetLowering::getM1VT(ContainerVT)) &&
       (RISCVISD::RI_VUNZIP2A_VL == Opc || RISCVISD::RI_VUNZIP2B_VL == Opc)) {
     InnerVT = ContainerVT.getHalfNumVectorElementsVT();
     VL = DAG.getConstant(VT.getVectorNumElements() / 2, DL,
@@ -5382,7 +5375,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
   MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
   MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
-  assert(M1VT == getLMUL1VT(M1VT));
+  assert(M1VT == RISCVTargetLowering::getM1VT(M1VT));
   unsigned NumOpElts = M1VT.getVectorMinNumElements();
   unsigned NumElts = ContainerVT.getVectorMinNumElements();
   unsigned NumOfSrcRegs = NumElts / NumOpElts;
@@ -6152,7 +6145,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
       return convertFromScalableVector(VT, Gather, DAG, Subtarget);
     }
 
-    const MVT M1VT = getLMUL1VT(ContainerVT);
+    const MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
     EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType());
     auto [InnerTrueMask, InnerVL] =
         getDefaultScalableVLOps(M1VT, DL, DAG, Subtarget);
@@ -7801,7 +7794,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     // This reduces the length of the chain of vslideups and allows us to
     // perform the vslideups at a smaller LMUL, limited to MF2.
     if (Op.getNumOperands() > 2 &&
-        ContainerVT.bitsGE(getLMUL1VT(ContainerVT))) {
+        ContainerVT.bitsGE(RISCVTargetLowering::getM1VT(ContainerVT))) {
       MVT HalfVT = VT.getHalfNumVectorElementsVT();
       assert(isPowerOf2_32(Op.getNumOperands()));
       size_t HalfNumOps = Op.getNumOperands() / 2;
@@ -9821,11 +9814,12 @@ getSmallestVTForIndex(MVT VecVT, unsigned MaxIdx, SDLoc DL, SelectionDAG &DAG,
   const unsigned MinVLMAX = VectorBitsMin / EltSize;
   MVT SmallerVT;
   if (MaxIdx < MinVLMAX)
-    SmallerVT = getLMUL1VT(VecVT);
+    SmallerVT = RISCVTargetLowering::getM1VT(VecVT);
   else if (MaxIdx < MinVLMAX * 2)
-    SmallerVT = getLMUL1VT(VecVT).getDoubleNumVectorElementsVT();
+    SmallerVT =
+        RISCVTargetLowering::getM1VT(VecVT).getDoubleNumVectorElementsVT();
   else if (MaxIdx < MinVLMAX * 4)
-    SmallerVT = getLMUL1VT(VecVT)
+    SmallerVT = RISCVTargetLowering::getM1VT(VecVT)
                     .getDoubleNumVectorElementsVT()
                     .getDoubleNumVectorElementsVT();
   if (!SmallerVT.isValid() || !VecVT.bitsGT(SmallerVT))
@@ -9898,9 +9892,8 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
     // If we're compiling for an exact VLEN value, we can always perform
     // the insert in m1 as we can determine the register corresponding to
     // the index in the register group.
-    const MVT M1VT = getLMUL1VT(ContainerVT);
-    if (auto VLEN = Subtarget.getRealVLen();
-        VLEN && ContainerVT.bitsGT(M1VT)) {
+    const MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
+    if (auto VLEN = Subtarget.getRealVLen(); VLEN && ContainerVT.bitsGT(M1VT)) {
       EVT ElemVT = VecVT.getVectorElementType();
       unsigned ElemsPerVReg = *VLEN / ElemVT.getFixedSizeInBits();
       unsigned RemIdx = OrigIdx % ElemsPerVReg;
@@ -10127,7 +10120,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   const auto VLen = Subtarget.getRealVLen();
   if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
       IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) {
-    MVT M1VT = getLMUL1VT(ContainerVT);
+    MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
     unsigned OrigIdx = IdxC->getZExtValue();
     EVT ElemVT = VecVT.getVectorElementType();
     unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
@@ -10175,7 +10168,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   // TODO: We don't have the same code for insert_vector_elt because we
   // have BUILD_VECTOR and handle the degenerate case there.  Should we
   // consider adding an inverse BUILD_VECTOR node?
-  MVT LMUL2VT = getLMUL1VT(ContainerVT).getDoubleNumVectorElementsVT();
+  MVT LMUL2VT =
+      RISCVTargetLowering::getM1VT(ContainerVT).getDoubleNumVectorElementsVT();
   if (ContainerVT.bitsGT(LMUL2VT) && VecVT.isFixedLengthVector())
     return SDValue();
 
@@ -11107,7 +11101,7 @@ static SDValue lowerReductionSeq(unsigned RVVOpcode, MVT ResVT,
                                  SDValue VL, const SDLoc &DL, SelectionDAG &DAG,
                                  const RISCVSubtarget &Subtarget) {
   const MVT VecVT = Vec.getSimpleValueType();
-  const MVT M1VT = getLMUL1VT(VecVT);
+  const MVT M1VT = RISCVTargetLowering::getM1VT(VecVT);
   const MVT XLenVT = Subtarget.getXLenVT();
   const bool NonZeroAVL = isNonZeroAVL(VL);
 
@@ -11485,8 +11479,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     assert(VLen);
     AlignedIdx /= *VLen / RISCV::RVVBitsPerBlock;
   }
-  if (ContainerVecVT.bitsGT(getLMUL1VT(ContainerVecVT))) {
-    InterSubVT = getLMUL1VT(ContainerVecVT);
+  if (ContainerVecVT.bitsGT(RISCVTargetLowering::getM1VT(ContainerVecVT))) {
+    InterSubVT = RISCVTargetLowering::getM1VT(ContainerVecVT);
     // Extract a subvector equal to the nearest full vector register type. This
     // should resolve to a EXTRACT_SUBREG instruction.
     AlignedExtract = DAG.getExtractSubvector(DL, InterSubVT, Vec, AlignedIdx);
@@ -11677,7 +11671,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
   // If the vector type is an LMUL-group type, extract a subvector equal to the
   // nearest full vector register type.
   MVT InterSubVT = VecVT;
-  if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
+  if (VecVT.bitsGT(RISCVTargetLowering::getM1VT(VecVT))) {
     // If VecVT has an LMUL > 1, then SubVecVT should have a smaller LMUL, and
     // we should have successfully decomposed the extract into a subregister.
     // We use an extract_subvector that will resolve to a subreg extract.
@@ -11688,7 +11682,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
       assert(VLen);
       Idx /= *VLen / RISCV::RVVBitsPerBlock;
     }
-    InterSubVT = getLMUL1VT(VecVT);
+    InterSubVT = RISCVTargetLowering::getM1VT(VecVT);
     Vec = DAG.getExtractSubvector(DL, InterSubVT, Vec, Idx);
   }
 
@@ -11805,7 +11799,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
     // For fractional LMUL, check if we can use a higher LMUL
     // instruction to avoid a vslidedown.
     if (SDValue Src = foldConcatVector(V1, V2);
-        Src && getLMUL1VT(VT).bitsGT(VT)) {
+        Src && RISCVTargetLowering::getM1VT(VT).bitsGT(VT)) {
       EVT NewVT = VT.getDoubleNumVectorElementsVT();
       Src = DAG.getExtractSubvector(DL, NewVT, Src, 0);
       // Freeze the source so we can increase its use count.
@@ -12187,7 +12181,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
   // vrgather.vv v14, v9, v16
   // vrgather.vv v13, v10, v16
   // vrgather.vv v12, v11, v16
-  if (ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
+  if (ContainerVT.bitsGT(RISCVTargetLowering::getM1VT(ContainerVT)) &&
       ContainerVT.getVectorElementCount().isKnownMultipleOf(2)) {
     auto [Lo, Hi] = DAG.SplitVector(Vec, DL);
     Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, Lo.getSimpleValueType(), Lo);
@@ -12252,7 +12246,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
   // At LMUL > 1, do the index computation in 16 bits to reduce register
   // pressure.
   if (IntVT.getScalarType().bitsGT(MVT::i16) &&
-      IntVT.bitsGT(getLMUL1VT(IntVT))) {
+      IntVT.bitsGT(RISCVTargetLowering::getM1VT(IntVT))) {
     assert(isUInt<16>(MaxVLMAX - 1)); // Largest VLMAX is 65536 @ zvl65536b
     GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
     IntVT = IntVT.changeVectorElementType(MVT::i16);
@@ -12339,7 +12333,7 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op,
   const auto [MinVLMAX, MaxVLMAX] =
       RISCVTargetLowering::computeVLMAXBounds(ContainerVT, Subtarget);
   if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() &&
-      getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) {
+      RISCVTargetLowering::getM1VT(ContainerVT).bitsLE(ContainerVT)) {
     MachineMemOperand *MMO = Load->getMemOperand();
     SDValue NewLoad =
         DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(),
@@ -12400,7 +12394,7 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
   const auto [MinVLMAX, MaxVLMAX] =
       RISCVTargetLowering::computeVLMAXBounds(ContainerVT, Subtarget);
   if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() &&
-      getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) {
+      RISCVTargetLowering::getM1VT(ContainerVT).bitsLE(ContainerVT)) {
     MachineMemOperand *MMO = Store->getMemOperand();
     return DAG.getStore(Store->getChain(), DL, NewValue, Store->getBasePtr(),
                         MMO->getPointerInfo(), MMO->getBaseAlign(),
@@ -20368,7 +20362,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return Scalar.getOperand(0);
 
     // Use M1 or smaller to avoid over constraining register allocation
-    const MVT M1VT = getLMUL1VT(VT);
+    const MVT M1VT = RISCVTargetLowering::getM1VT(VT);
     if (M1VT.bitsLT(VT)) {
       SDValue M1Passthru = DAG.getExtractSubvector(DL, M1VT, Passthru, 0);
       SDValue Result =
@@ -20382,7 +20376,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     // no purpose.
     if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Scalar);
         Const && !Const->isZero() && isInt<5>(Const->getSExtValue()) &&
-        VT.bitsLE(getLMUL1VT(VT)) && Passthru.isUndef())
+        VT.bitsLE(RISCVTargetLowering::getM1VT(VT)) && Passthru.isUndef())
       return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Scalar, VL);
 
     break;
@@ -20390,7 +20384,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   case RISCVISD::VMV_X_S: {
     SDValue Vec = N->getOperand(0);
     MVT VecVT = N->getOperand(0).getSimpleValueType();
-    const MVT M1VT = getLMUL1VT(VecVT);
+    const MVT M1VT = RISCVTargetLowering::getM1VT(VecVT);
     if (M1VT.bitsLT(VecVT)) {
       Vec = DAG.getExtractSubvector(DL, M1VT, Vec, 0);
       return DAG.getNode(RISCVISD::VMV_X_S, DL, N->getSimpleValueType(0), Vec);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 417d684a6238..f67d7f155c9d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -363,6 +363,15 @@ public:
   static std::pair<unsigned, unsigned>
   computeVLMAXBounds(MVT ContainerVT, const RISCVSubtarget &Subtarget);
 
+  /// Given a vector (either fixed or scalable), return the scalable vector
+  /// corresponding to a vector register (i.e. an m1 register group).
+  static MVT getM1VT(MVT VT) {
+    unsigned EltSizeInBits = VT.getVectorElementType().getSizeInBits();
+    assert(EltSizeInBits <= RISCV::RVVBitsPerBlock && "Unexpected vector MVT");
+    return MVT::getScalableVectorVT(VT.getVectorElementType(),
+                                    RISCV::RVVBitsPerBlock / EltSizeInBits);
+  }
+
   static unsigned getRegClassIDForLMUL(RISCVVType::VLMUL LMul);
   static unsigned getSubregIndexByMVT(MVT VT, unsigned Index);
   static unsigned getRegClassIDForVecVT(MVT VT);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 0093c92ea5ef..aadda2ce8552 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -602,15 +602,6 @@ InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
   return FirstSlideCost + SecondSlideCost + MaskCost;
 }
 
-// Consolidate!
-static MVT getLMUL1VT(MVT VT) {
-  assert(VT.getVectorElementType().getSizeInBits() <= RISCV::RVVBitsPerBlock &&
-         "Unexpected vector MVT");
-  return MVT::getScalableVectorVT(
-      VT.getVectorElementType(),
-      RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
-}
-
 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                              VectorType *Tp, ArrayRef<int> Mask,
                                              TTI::TargetCostKind CostKind,
@@ -870,7 +861,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     MVT ContainerVT = LT.second;
     if (LT.second.isFixedLengthVector())
       ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
-    MVT M1VT = getLMUL1VT(ContainerVT);
+    MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
     if (ContainerVT.bitsLE(M1VT)) {
       // Example sequence:
       //   csrr a0, vlenb

From 1f10c6a277fbc1b1c6ceb7546b001af39feb92ce Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Tue, 17 Jun 2025 10:46:08 -0700
Subject: [PATCH 0696/1322] [Matrix] Hoist more IRBuilder<>'s. NFC

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index ece0bb56fff0..96b156494fd9 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1211,13 +1211,13 @@ public:
 
     switch (Inst->getCalledFunction()->getIntrinsicID()) {
     case Intrinsic::matrix_multiply:
-      return LowerMultiply(Inst);
+      return LowerMultiply(Inst, Builder);
     case Intrinsic::matrix_transpose:
-      return LowerTranspose(Inst);
+      return LowerTranspose(Inst, Builder);
     case Intrinsic::matrix_column_major_load:
-      return LowerColumnMajorLoad(Inst);
+      return LowerColumnMajorLoad(Inst, Builder);
     case Intrinsic::matrix_column_major_store:
-      return LowerColumnMajorStore(Inst);
+      return LowerColumnMajorStore(Inst, Builder);
     case Intrinsic::abs:
     case Intrinsic::fabs: {
       MatrixTy Result;
@@ -1312,8 +1312,8 @@ public:
 
   /// Lower a load instruction with shape information.
   MatrixTy LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align,
-                     Value *Stride, bool IsVolatile, ShapeInfo Shape) {
-    IRBuilder<> Builder(Inst);
+                     Value *Stride, bool IsVolatile, ShapeInfo Shape,
+                     IRBuilder<> &Builder) {
     return loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile, Shape,
                       Builder);
   }
@@ -1321,14 +1321,14 @@ public:
   /// Lowers llvm.matrix.column.major.load.
   ///
   /// The intrinsic loads a matrix from memory using a stride between columns.
-  MatrixTy LowerColumnMajorLoad(CallInst *Inst) {
+  MatrixTy LowerColumnMajorLoad(CallInst *Inst, IRBuilder<> &Builder) {
     assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
            "Intrinsic only supports column-major layout!");
     Value *Ptr = Inst->getArgOperand(0);
     Value *Stride = Inst->getArgOperand(1);
     return LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
                      cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
-                     {Inst->getArgOperand(3), Inst->getArgOperand(4)});
+                     {Inst->getArgOperand(3), Inst->getArgOperand(4)}, Builder);
   }
 
   /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
@@ -1373,8 +1373,7 @@ public:
   /// Lower a store instruction with shape information.
   MatrixTy LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr,
                       MaybeAlign A, Value *Stride, bool IsVolatile,
-                      ShapeInfo Shape) {
-    IRBuilder<> Builder(Inst);
+                      ShapeInfo Shape, IRBuilder<> &Builder) {
     auto StoreVal = getMatrix(Matrix, Shape, Builder);
     return storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride, IsVolatile,
                        Builder);
@@ -1383,7 +1382,7 @@ public:
   /// Lowers llvm.matrix.column.major.store.
   ///
   /// The intrinsic store a matrix back memory using a stride between columns.
-  MatrixTy LowerColumnMajorStore(CallInst *Inst) {
+  MatrixTy LowerColumnMajorStore(CallInst *Inst, IRBuilder<> &Builder) {
     assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
            "Intrinsic only supports column-major layout!");
     Value *Matrix = Inst->getArgOperand(0);
@@ -1391,7 +1390,8 @@ public:
     Value *Stride = Inst->getArgOperand(2);
     return LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
                       cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
-                      {Inst->getArgOperand(4), Inst->getArgOperand(5)});
+                      {Inst->getArgOperand(4), Inst->getArgOperand(5)},
+                      Builder);
   }
 
   // Set elements I..I+NumElts-1 to Block
@@ -2166,8 +2166,7 @@ public:
   }
 
   /// Lowers llvm.matrix.multiply.
-  MatrixTy LowerMultiply(CallInst *MatMul) {
-    IRBuilder<> Builder(MatMul);
+  MatrixTy LowerMultiply(CallInst *MatMul, IRBuilder<> &Builder) {
     auto *EltType = cast<FixedVectorType>(MatMul->getType())->getElementType();
     ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
     ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
@@ -2192,9 +2191,8 @@ public:
   }
 
   /// Lowers llvm.matrix.transpose.
-  MatrixTy LowerTranspose(CallInst *Inst) {
+  MatrixTy LowerTranspose(CallInst *Inst, IRBuilder<> &Builder) {
     MatrixTy Result;
-    IRBuilder<> Builder(Inst);
     Value *InputVal = Inst->getArgOperand(0);
     FixedVectorType *VectorTy = cast<FixedVectorType>(InputVal->getType());
     ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
@@ -2230,13 +2228,15 @@ public:
   MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
                      IRBuilder<> &Builder) {
     return LowerLoad(Inst, Ptr, Inst->getAlign(),
-                     Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
+                     Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
+                     Builder);
   }
 
   MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
                       Value *Ptr, IRBuilder<> &Builder) {
     return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
-                      Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
+                      Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
+                      Builder);
   }
 
   /// Lower binary operators.

From b59d4cf05447fdaf3d3c859e10db0b3c892f6ec6 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Tue, 17 Jun 2025 11:46:12 -0700
Subject: [PATCH 0697/1322]  [Reland] Adjust bit cast instruction filter for
 DXIL Prepare pass (#143783)

Relands https://github.com/llvm/llvm-project/pull/142678, with a new
change to remove an unnecessary gep argument, after a revert was needed
due to unforeseen bugs.
Fixes https://github.com/llvm/llvm-project/issues/139013
---
 llvm/lib/Target/DirectX/DXILPrepare.cpp       | 44 ++++++++++++-
 .../DirectX/llc-vector-load-scalarize.ll      | 64 ++++++++-----------
 .../DirectX/noop_bitcast_global_array_type.ll | 53 +++++++++++++++
 3 files changed, 121 insertions(+), 40 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll

diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index e0068787f5e5..cb58f4833631 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -148,9 +148,49 @@ class DXILPrepareModule : public ModulePass {
                                      Type *Ty) {
     // Omit bitcasts if the incoming value matches the instruction type.
     auto It = PointerTypes.find(Operand);
-    if (It != PointerTypes.end())
-      if (cast<TypedPointerType>(It->second)->getElementType() == Ty)
+    if (It != PointerTypes.end()) {
+      auto *OpTy = cast<TypedPointerType>(It->second)->getElementType();
+      if (OpTy == Ty)
         return nullptr;
+    }
+
+    Type *ValTy = Operand->getType();
+    // Also omit the bitcast for matching global array types
+    if (auto *GlobalVar = dyn_cast<GlobalVariable>(Operand))
+      ValTy = GlobalVar->getValueType();
+
+    if (auto *AI = dyn_cast<AllocaInst>(Operand))
+      ValTy = AI->getAllocatedType();
+
+    if (auto *ArrTy = dyn_cast<ArrayType>(ValTy)) {
+      Type *ElTy = ArrTy->getElementType();
+      if (ElTy == Ty)
+        return nullptr;
+    }
+
+    // finally, drill down GEP instructions until we get the array
+    // that is being accessed, and compare element types
+    if (ConstantExpr *GEPInstr = dyn_cast<ConstantExpr>(Operand)) {
+      while (GEPInstr->getOpcode() == Instruction::GetElementPtr) {
+        Value *OpArg = GEPInstr->getOperand(0);
+        if (ConstantExpr *NewGEPInstr = dyn_cast<ConstantExpr>(OpArg)) {
+          GEPInstr = NewGEPInstr;
+          continue;
+        }
+
+        if (auto *GlobalVar = dyn_cast<GlobalVariable>(OpArg))
+          ValTy = GlobalVar->getValueType();
+        if (auto *AI = dyn_cast<AllocaInst>(Operand))
+          ValTy = AI->getAllocatedType();
+        if (auto *ArrTy = dyn_cast<ArrayType>(ValTy)) {
+          Type *ElTy = ArrTy->getElementType();
+          if (ElTy == Ty)
+            return nullptr;
+        }
+        break;
+      }
+    }
+
     // Insert bitcasts where we are removing the instruction.
     Builder.SetInsertPoint(&Inst);
     // This code only gets hit in opaque-pointer mode, so the type of the
diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
index 778113bd3160..d5797f6b5134 100644
--- a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
+++ b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
@@ -60,19 +60,15 @@ define <4 x i32> @load_array_vec_test() #0 {
 define <4 x i32> @load_vec_test() #0 {
 ; CHECK-LABEL: define <4 x i32> @load_vec_test(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast ptr addrspace(3) @vecData.scalarized to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[DOTUPTO0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[DOTUPTO1:%.*]] = insertelement <4 x i32> [[DOTUPTO0]], i32 [[TMP4]], i32 1
-; CHECK-NEXT:    [[DOTUPTO2:%.*]] = insertelement <4 x i32> [[DOTUPTO1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[DOTUPTO2]], i32 [[TMP8]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) @vecData.scalarized, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 1), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 2), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 3), align 4
+; CHECK-NEXT:    [[DOTUPTO0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[DOTUPTO1:%.*]] = insertelement <4 x i32> [[DOTUPTO0]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[DOTUPTO2:%.*]] = insertelement <4 x i32> [[DOTUPTO1]], i32 [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[DOTUPTO2]], i32 [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %1 = load <4 x i32>, <4 x i32> addrspace(3)* @"vecData", align 4
   ret <4 x i32> %1
@@ -103,31 +99,23 @@ define <4 x i32> @load_static_array_of_vec_test(i32 %index) #0 {
 define <4 x i32> @multid_load_test() #0 {
 ; CHECK-LABEL: define <4 x i32> @multid_load_test(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4
-; CHECK-NEXT:    [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]]
-; CHECK-NEXT:    [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]]
-; CHECK-NEXT:    [[DOTI27:%.*]] = add i32 [[TMP6]], [[TMP14]]
-; CHECK-NEXT:    [[DOTI38:%.*]] = add i32 [[TMP8]], [[TMP16]]
-; CHECK-NEXT:    [[DOTUPTO01215:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI05]], i32 0
-; CHECK-NEXT:    [[DOTUPTO11316:%.*]] = insertelement <4 x i32> [[DOTUPTO01215]], i32 [[DOTI16]], i32 1
-; CHECK-NEXT:    [[DOTUPTO21417:%.*]] = insertelement <4 x i32> [[DOTUPTO11316]], i32 [[DOTI27]], i32 2
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[DOTUPTO21417]], i32 [[DOTI38]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP17]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 2), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), align 4
+; CHECK-NEXT:    [[DOTI13:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 1), align 4
+; CHECK-NEXT:    [[DOTI25:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 2), align 4
+; CHECK-NEXT:    [[DOTI37:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 3), align 4
+; CHECK-NEXT:    [[DOTI08:%.*]] = add i32 [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[DOTI19:%.*]] = add i32 [[TMP2]], [[DOTI13]]
+; CHECK-NEXT:    [[DOTI210:%.*]] = add i32 [[TMP3]], [[DOTI25]]
+; CHECK-NEXT:    [[DOTI311:%.*]] = add i32 [[TMP4]], [[DOTI37]]
+; CHECK-NEXT:    [[DOTUPTO015:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI08]], i32 0
+; CHECK-NEXT:    [[DOTUPTO116:%.*]] = insertelement <4 x i32> [[DOTUPTO015]], i32 [[DOTI19]], i32 1
+; CHECK-NEXT:    [[DOTUPTO217:%.*]] = insertelement <4 x i32> [[DOTUPTO116]], i32 [[DOTI210]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[DOTUPTO217]], i32 [[DOTI311]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
 ;
   %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 0, i32 0), align 4
   %2 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 1, i32 1), align 4
diff --git a/llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll b/llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll
new file mode 100644
index 000000000000..1f33700e014c
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll
@@ -0,0 +1,53 @@
+; RUN: opt -S --dxil-prepare %s | FileCheck %s
+
+; Test that global arrays do not get a bitcast instruction
+; after the dxil-prepare pass.
+
+target triple = "dxilv1.2-unknown-shadermodel6.2-compute"
+
+@inputTile.1dim = local_unnamed_addr addrspace(3) global [3 x float] zeroinitializer, align 2
+
+; CHECK-LABEL: testload
+define float @testload() local_unnamed_addr {
+  ; NOTE: this would be "bitcast ptr addrspace(3)..." before the change that introduced this test,
+  ; after the dxil-prepare pass is run
+  ; CHECK-NEXT: load float, ptr addrspace(3) @inputTile.1dim, align 2
+  %v = load float, ptr addrspace(3) @inputTile.1dim, align 2  
+  
+  ret float %v
+}
+
+; CHECK-LABEL: teststore
+define void @teststore() local_unnamed_addr {  
+  ; CHECK-next: store float 2.000000e+00, ptr addrspace(3) @inputTile.1dim, align 2
+  store float 2.000000e+00, ptr addrspace(3) @inputTile.1dim, align 2  
+  
+  ret void
+}
+
+; CHECK-LABEL: testGEPConst
+define float @testGEPConst() local_unnamed_addr {  
+  ; CHECK-NEXT: load float, ptr addrspace(3) getelementptr (float, ptr addrspace(3) @inputTile.1dim, i32 1), align 4
+  %v = load float, ptr addrspace(3) getelementptr (float, ptr addrspace(3) @inputTile.1dim, i32 1), align 4
+  
+  ret float %v
+}
+
+; CHECK-LABEL: testGEPNonConst
+define float @testGEPNonConst(i32 %i) local_unnamed_addr {  
+  ; CHECK-NEXT: getelementptr float, ptr addrspace(3) @inputTile.1dim, i32 %i
+  %gep = getelementptr float, ptr addrspace(3) @inputTile.1dim, i32 %i
+  %v = load float, ptr addrspace(3) %gep
+  
+  ret float %v
+}
+
+; CHECK-LABEL: testAlloca
+define float @testAlloca(i32 %i) local_unnamed_addr {  
+  ; CHECK-NEXT: alloca [3 x float], align 4
+  %arr = alloca [3 x float], align 4
+  ; CHECK-NEXT: getelementptr [3 x float], ptr %arr, i32 1
+  %gep = getelementptr [3 x float], ptr %arr, i32 1
+  %v = load float, ptr %gep
+  ret float %v
+}

From dd65e6e0608c3390752750a0f19bca4409603db9 Mon Sep 17 00:00:00 2001
From: Jan Patrick Lehr <JanPatrick.Lehr@amd.com>
Date: Tue, 17 Jun 2025 20:51:40 +0200
Subject: [PATCH 0698/1322] [Offload][libc] Add cmake cache AMDGPU buildbot
 (#144500)

An upcoming libc4GPU buildbot will be using this CMake cache file for
its build configuration.
---
 offload/cmake/caches/AMDGPULibcBot.cmake | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 offload/cmake/caches/AMDGPULibcBot.cmake

diff --git a/offload/cmake/caches/AMDGPULibcBot.cmake b/offload/cmake/caches/AMDGPULibcBot.cmake
new file mode 100644
index 000000000000..728dfe3f0a3f
--- /dev/null
+++ b/offload/cmake/caches/AMDGPULibcBot.cmake
@@ -0,0 +1,20 @@
+set(CMAKE_INSTALL_PREFIX /tmp/llvm.install.test CACHE STRING "")
+
+set(CMAKE_BUILD_TYPE Release CACHE STRING "")
+set(BUILD_SHARED_LIBS ON CACHE BOOL "")
+set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
+set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
+
+set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "")
+set(LLVM_ENABLE_RUNTIMES "compiler-rt;libunwind;openmp;offload" CACHE STRING "")
+set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
+
+set(LLVM_TARGETS_TO_BUILD "host;AMDGPU;SPIRV" CACHE STRING "")
+set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
+set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "")
+
+set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "")
+set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "")
+
+set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc" CACHE STRING "")

From 9cb754509608b9d9143fa17f775631bbfcce0848 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Wed, 18 Jun 2025 00:28:23 +0530
Subject: [PATCH 0699/1322] [Driver] Add option to force undefined symbols
 during linking in BareMetal toolchain object. (#132807)

Add support for `-u` option to force defined symbols. This option is
supported by both lld and gnuld.

This is done as a part of the effort to merge RISCVToolchain object into
BareMetal toolchain object.

This is the 4th patch in the series of patches for merging
RISCVToolchain object into BareMetal toolchain object.

RFC:
https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524
---
 clang/lib/Driver/ToolChains/BareMetal.cpp       |  5 +++--
 clang/test/Driver/baremetal-undefined-symbols.c | 14 ++++++++++++++
 clang/test/Driver/riscv-args.c                  |  6 ------
 3 files changed, 17 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/Driver/baremetal-undefined-symbols.c
 delete mode 100644 clang/test/Driver/riscv-args.c

diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index a665040662a3..d4e4e6d04b41 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -609,8 +609,9 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
-  Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
-                            options::OPT_s, options::OPT_t, options::OPT_r});
+  Args.addAllArgs(CmdArgs,
+                  {options::OPT_L, options::OPT_u, options::OPT_T_Group,
+                   options::OPT_s, options::OPT_t, options::OPT_r});
 
   TC.AddFilePathLibArgs(Args, CmdArgs);
 
diff --git a/clang/test/Driver/baremetal-undefined-symbols.c b/clang/test/Driver/baremetal-undefined-symbols.c
new file mode 100644
index 000000000000..bff58c7c54c3
--- /dev/null
+++ b/clang/test/Driver/baremetal-undefined-symbols.c
@@ -0,0 +1,14 @@
+// Check the arguments are correctly passed
+
+// Make sure -T is the last with gcc-toolchain option
+// RUN: %clang -### --target=riscv32 --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-LD %s
+// CHECK-LD: {{.*}} "--defsym=FOO=10" {{.*}} "-u" "foo" {{.*}} "-T" "a.lds"
+
+// TODO: Merge this test with the above in the last patch when finally integrating riscv
+// Make sure -T is the last with gcc-toolchain option
+// RUN: %clang -### --target=aarch64-none-elf --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-ARM-LD %s
+// RUN: %clang -### --target=armv6m-none-eabi --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-ARM-LD %s
+// CHECK-ARM-LD: {{.*}} "-T" "a.lds" "-u" "foo" {{.*}} "--defsym=FOO=10"
diff --git a/clang/test/Driver/riscv-args.c b/clang/test/Driver/riscv-args.c
deleted file mode 100644
index cab08e5b0f81..000000000000
--- a/clang/test/Driver/riscv-args.c
+++ /dev/null
@@ -1,6 +0,0 @@
-// Check the arguments are correctly passed
-
-// Make sure -T is the last with gcc-toolchain option
-// RUN: %clang -### --target=riscv32 --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-LD %s
-// CHECK-LD: {{.*}} "--defsym=FOO=10" {{.*}} "-u" "foo" {{.*}} "-T" "a.lds"

From 57828fec760f086b334ce0cb1c465fc559dcaea4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder@redhat.com>
Date: Tue, 17 Jun 2025 21:08:23 +0200
Subject: [PATCH 0700/1322] Revert "[clang][bytecode] Allocate IntegralAP and
 Floating types using an allocator (#144246)"

This reverts commit c66be289901b3f035187d391e80e3610d7d6232e.

This breaks the armv8-quick builder:
https://lab.llvm.org/buildbot/#/builders/154/builds/17549
---
 clang/lib/AST/ByteCode/Compiler.cpp           | 114 +++---
 clang/lib/AST/ByteCode/Compiler.h             |   1 -
 clang/lib/AST/ByteCode/Descriptor.cpp         |   2 +-
 clang/lib/AST/ByteCode/Disasm.cpp             |  58 +--
 clang/lib/AST/ByteCode/Floating.h             | 252 +++++--------
 clang/lib/AST/ByteCode/Integral.h             |   3 -
 clang/lib/AST/ByteCode/IntegralAP.h           | 233 +++++-------
 clang/lib/AST/ByteCode/Interp.cpp             | 106 +-----
 clang/lib/AST/ByteCode/Interp.h               | 341 ++++--------------
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  55 +--
 .../lib/AST/ByteCode/InterpBuiltinBitCast.cpp |   4 +-
 clang/lib/AST/ByteCode/InterpState.h          |  30 --
 clang/lib/AST/ByteCode/Opcodes.td             |  14 +-
 clang/lib/AST/ByteCode/PrimType.h             |  17 -
 clang/lib/AST/ByteCode/Program.h              |  24 +-
 .../ByteCode/builtin-bit-cast-long-double.cpp |  10 +-
 clang/test/AST/ByteCode/builtin-functions.cpp |  12 +-
 17 files changed, 344 insertions(+), 932 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 3f884ed8d094..9fe4803ce98e 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -748,8 +748,7 @@ bool Compiler<Emitter>::VisitFloatingLiteral(const FloatingLiteral *E) {
   if (DiscardResult)
     return true;
 
-  APFloat F = E->getValue();
-  return this->emitFloat(F, E);
+  return this->emitConstFloat(E->getValue(), E);
 }
 
 template <class Emitter>
@@ -4186,10 +4185,8 @@ bool Compiler<Emitter>::visitZeroInitializer(PrimType T, QualType QT,
                              nullptr, E);
   case PT_MemberPtr:
     return this->emitNullMemberPtr(0, nullptr, E);
-  case PT_Float: {
-    APFloat F = APFloat::getZero(Ctx.getFloatSemantics(QT));
-    return this->emitFloat(F, E);
-  }
+  case PT_Float:
+    return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E);
   case PT_FixedPoint: {
     auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType());
     return this->emitConstFixedPoint(FixedPoint::zero(Sem), E);
@@ -4677,7 +4674,10 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       if (!visitInitializer(Init))
         return false;
 
-      return this->emitFinishInitGlobal(Init);
+      if (!this->emitFinishInit(Init))
+        return false;
+
+      return this->emitPopPtr(Init);
     };
 
     DeclScope<Emitter> LocalScope(this, VD);
@@ -4698,45 +4698,51 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       return false;
 
     return !Init || (checkDecl() && initGlobal(*GlobalIndex));
-  }
-  // Local variables.
-  InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
-
-  if (VarT) {
-    unsigned Offset = this->allocateLocalPrimitive(
-        VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block,
-        IsConstexprUnknown);
-    if (Init) {
-      // If this is a toplevel declaration, create a scope for the
-      // initializer.
-      if (Toplevel) {
-        LocalScope<Emitter> Scope(this);
-        if (!this->visit(Init))
-          return false;
-        return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
-      } else {
-        if (!this->visit(Init))
-          return false;
-        return this->emitSetLocal(*VarT, Offset, VD);
-      }
-    }
   } else {
-    if (std::optional<unsigned> Offset = this->allocateLocal(
-            VD, VD->getType(), nullptr, ScopeKind::Block, IsConstexprUnknown)) {
-      if (!Init)
-        return true;
+    InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
 
-      if (!this->emitGetPtrLocal(*Offset, Init))
-        return false;
+    if (VarT) {
+      unsigned Offset = this->allocateLocalPrimitive(
+          VD, *VarT, VD->getType().isConstQualified(), nullptr,
+          ScopeKind::Block, IsConstexprUnknown);
+      if (Init) {
+        // If this is a toplevel declaration, create a scope for the
+        // initializer.
+        if (Toplevel) {
+          LocalScope<Emitter> Scope(this);
+          if (!this->visit(Init))
+            return false;
+          return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
+        } else {
+          if (!this->visit(Init))
+            return false;
+          return this->emitSetLocal(*VarT, Offset, VD);
+        }
+      }
+    } else {
+      if (std::optional<unsigned> Offset =
+              this->allocateLocal(VD, VD->getType(), nullptr, ScopeKind::Block,
+                                  IsConstexprUnknown)) {
+        if (!Init)
+          return true;
 
-      if (!visitInitializer(Init))
-        return false;
+        if (!this->emitGetPtrLocal(*Offset, Init))
+          return false;
 
-      return this->emitFinishInitPop(Init);
+        if (!visitInitializer(Init))
+          return false;
+
+        if (!this->emitFinishInit(Init))
+          return false;
+
+        return this->emitPopPtr(Init);
+      }
+      return false;
     }
-    return false;
+    return true;
   }
-  return true;
+
+  return false;
 }
 
 template <class Emitter>
@@ -4745,10 +4751,8 @@ bool Compiler<Emitter>::visitAPValue(const APValue &Val, PrimType ValType,
   assert(!DiscardResult);
   if (Val.isInt())
     return this->emitConst(Val.getInt(), ValType, E);
-  else if (Val.isFloat()) {
-    APFloat F = Val.getFloat();
-    return this->emitFloat(F, E);
-  }
+  else if (Val.isFloat())
+    return this->emitConstFloat(Val.getFloat(), E);
 
   if (Val.isLValue()) {
     if (Val.isNullPointer())
@@ -6129,10 +6133,8 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      APFloat F(TargetSemantics, 1);
-      if (!this->emitFloat(F, E))
+      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
         return false;
-
       if (!this->emitAddf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6174,10 +6176,8 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      APFloat F(TargetSemantics, 1);
-      if (!this->emitFloat(F, E))
+      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
         return false;
-
       if (!this->emitSubf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6953,20 +6953,6 @@ bool Compiler<Emitter>::emitDummyPtr(const DeclTy &D, const Expr *E) {
   return true;
 }
 
-template <class Emitter>
-bool Compiler<Emitter>::emitFloat(const APFloat &F, const Expr *E) {
-  assert(!DiscardResult && "Should've been checked before");
-
-  if (Floating::singleWord(F.getSemantics()))
-    return this->emitConstFloat(Floating(F), E);
-
-  APInt I = F.bitcastToAPInt();
-  return this->emitConstFloat(
-      Floating(const_cast<uint64_t *>(I.getRawData()),
-               llvm::APFloatBase::SemanticsToEnum(F.getSemantics())),
-      E);
-}
-
 //  This function is constexpr if and only if To, From, and the types of
 //  all subobjects of To and From are types T such that...
 //  (3.1) - is_union_v<T> is false;
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index a1d068cc7e0a..ac3ad84766dc 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -391,7 +391,6 @@ private:
   bool emitRecordDestruction(const Record *R, SourceInfo Loc);
   bool emitDestruction(const Descriptor *Desc, SourceInfo Loc);
   bool emitDummyPtr(const DeclTy &D, const Expr *E);
-  bool emitFloat(const APFloat &F, const Expr *E);
   unsigned collectBaseOffset(const QualType BaseType,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp
index 46e4d0d940b3..5531295dfa2f 100644
--- a/clang/lib/AST/ByteCode/Descriptor.cpp
+++ b/clang/lib/AST/ByteCode/Descriptor.cpp
@@ -368,7 +368,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsTemporary, bool IsConst, UnknownSize)
     : Source(D), ElemSize(primSize(Type)), Size(UnknownSizeMark),
       MDSize(MD.value_or(0)),
-      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)), PrimT(Type),
+      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)),
       IsConst(IsConst), IsMutable(false), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
       DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 7c6b78386b14..846dc2fe92a7 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -50,56 +50,34 @@ inline static std::string printArg(Program &P, CodePtr &OpPC) {
 }
 
 template <> inline std::string printArg<Floating>(Program &P, CodePtr &OpPC) {
-  auto Sem = Floating::deserializeSemantics(*OpPC);
+  auto F = Floating::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
-  unsigned BitWidth = llvm::APFloatBase::semanticsSizeInBits(
-      llvm::APFloatBase::EnumToSemantics(Sem));
-  auto Memory =
-      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
-  Floating Result(Memory.get(), Sem);
-  Floating::deserialize(*OpPC, &Result);
-
-  OpPC += align(Result.bytesToSerialize());
-
-  std::string S;
-  llvm::raw_string_ostream SS(S);
-  SS << Result;
-  return S;
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
 
 template <>
 inline std::string printArg<IntegralAP<false>>(Program &P, CodePtr &OpPC) {
-  using T = IntegralAP<false>;
-  unsigned BitWidth = T::deserializeSize(*OpPC);
-  auto Memory =
-      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
+  auto F = IntegralAP<false>::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
-  T Result(Memory.get(), BitWidth);
-  T::deserialize(*OpPC, &Result);
-
-  OpPC += Result.bytesToSerialize();
-  std::string Str;
-  llvm::raw_string_ostream SS(Str);
-  SS << Result;
-  return Str;
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
-
 template <>
 inline std::string printArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
-  using T = IntegralAP<true>;
-  unsigned BitWidth = T::deserializeSize(*OpPC);
-  auto Memory =
-      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
+  auto F = IntegralAP<true>::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
-  T Result(Memory.get(), BitWidth);
-  T::deserialize(*OpPC, &Result);
-
-  std::string Str;
-  llvm::raw_string_ostream SS(Str);
-  SS << Result;
-
-  OpPC += Result.bytesToSerialize();
-  return Str;
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
 
 template <> inline std::string printArg<FixedPoint>(Program &P, CodePtr &OpPC) {
diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
index 659892e720ab..3750568fc23c 100644
--- a/clang/lib/AST/ByteCode/Floating.h
+++ b/clang/lib/AST/ByteCode/Floating.h
@@ -17,79 +17,63 @@
 #include "clang/AST/APValue.h"
 #include "llvm/ADT/APFloat.h"
 
-// XXX This is just a debugging help. Setting this to 1 will heap-allocate ALL
-// floating values.
-#define ALLOCATE_ALL 0
-
 namespace clang {
 namespace interp {
 
 using APFloat = llvm::APFloat;
 using APSInt = llvm::APSInt;
-using APInt = llvm::APInt;
 
-/// If a Floating is constructed from Memory, it DOES NOT OWN THAT MEMORY.
-/// It will NOT copy the memory (unless, of course, copy() is called) and it
-/// won't alllocate anything. The allocation should happen via InterpState or
-/// Program.
 class Floating final {
 private:
-  union {
-    uint64_t Val = 0;
-    uint64_t *Memory;
-  };
-  llvm::APFloatBase::Semantics Semantics;
-
-  APFloat getValue() const {
-    unsigned BitWidth = bitWidth();
-    if (singleWord())
-      return APFloat(getSemantics(), APInt(BitWidth, Val));
-    unsigned NumWords = numWords();
-    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
-  }
+  // The underlying value storage.
+  APFloat F;
 
 public:
-  Floating() = default;
-  Floating(llvm::APFloatBase::Semantics Semantics)
-      : Val(0), Semantics(Semantics) {}
-  Floating(const APFloat &F) {
+  /// Zero-initializes a Floating.
+  Floating() : F(0.0f) {}
+  Floating(const APFloat &F) : F(F) {}
 
-    Semantics = llvm::APFloatBase::SemanticsToEnum(F.getSemantics());
-    this->copy(F);
+  // Static constructors for special floating point values.
+  static Floating getInf(const llvm::fltSemantics &Sem) {
+    return Floating(APFloat::getInf(Sem));
   }
-  Floating(uint64_t *Memory, llvm::APFloatBase::Semantics Semantics)
-      : Memory(Memory), Semantics(Semantics) {}
+  const APFloat &getAPFloat() const { return F; }
 
-  APFloat getAPFloat() const { return getValue(); }
-
-  bool operator<(Floating RHS) const { return getValue() < RHS.getValue(); }
-  bool operator>(Floating RHS) const { return getValue() > RHS.getValue(); }
-  bool operator<=(Floating RHS) const { return getValue() <= RHS.getValue(); }
-  bool operator>=(Floating RHS) const { return getValue() >= RHS.getValue(); }
+  bool operator<(Floating RHS) const { return F < RHS.F; }
+  bool operator>(Floating RHS) const { return F > RHS.F; }
+  bool operator<=(Floating RHS) const { return F <= RHS.F; }
+  bool operator>=(Floating RHS) const { return F >= RHS.F; }
+  bool operator==(Floating RHS) const { return F == RHS.F; }
+  bool operator!=(Floating RHS) const { return F != RHS.F; }
+  Floating operator-() const { return Floating(-F); }
 
   APFloat::opStatus convertToInteger(APSInt &Result) const {
     bool IsExact;
-    return getValue().convertToInteger(Result, llvm::APFloat::rmTowardZero,
-                                       &IsExact);
+    return F.convertToInteger(Result, llvm::APFloat::rmTowardZero, &IsExact);
   }
 
-  void toSemantics(const llvm::fltSemantics *Sem, llvm::RoundingMode RM,
-                   Floating *Result) const {
-    APFloat Copy = getValue();
+  Floating toSemantics(const llvm::fltSemantics *Sem,
+                       llvm::RoundingMode RM) const {
+    APFloat Copy = F;
     bool LosesInfo;
     Copy.convert(*Sem, RM, &LosesInfo);
     (void)LosesInfo;
-    Result->copy(Copy);
+    return Floating(Copy);
+  }
+
+  /// Convert this Floating to one with the same semantics as \Other.
+  Floating toSemantics(const Floating &Other, llvm::RoundingMode RM) const {
+    return toSemantics(&Other.F.getSemantics(), RM);
   }
 
   APSInt toAPSInt(unsigned NumBits = 0) const {
-    return APSInt(getValue().bitcastToAPInt());
+    return APSInt(F.bitcastToAPInt());
   }
-  APValue toAPValue(const ASTContext &) const { return APValue(getValue()); }
+  APValue toAPValue(const ASTContext &) const { return APValue(F); }
   void print(llvm::raw_ostream &OS) const {
     // Can't use APFloat::print() since it appends a newline.
     SmallVector<char, 16> Buffer;
-    getValue().toString(Buffer);
+    F.toString(Buffer);
     OS << Buffer;
   }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
@@ -99,62 +83,25 @@ public:
     return NameStr;
   }
 
-  unsigned bitWidth() const {
-    return llvm::APFloatBase::semanticsSizeInBits(getSemantics());
-  }
-  unsigned numWords() const { return llvm::APInt::getNumWords(bitWidth()); }
-  bool singleWord() const {
-#if ALLOCATE_ALL
-    return false;
-#endif
-    return numWords() == 1;
-  }
-  static bool singleWord(const llvm::fltSemantics &Sem) {
-#if ALLOCATE_ALL
-    return false;
-#endif
-    return APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem)) == 1;
-  }
-  const llvm::fltSemantics &getSemantics() const {
-    return llvm::APFloatBase::EnumToSemantics(Semantics);
-  }
-
-  void copy(const APFloat &F) {
-    if (singleWord()) {
-      Val = F.bitcastToAPInt().getZExtValue();
-    } else {
-      assert(Memory);
-      std::memcpy(Memory, F.bitcastToAPInt().getRawData(),
-                  numWords() * sizeof(uint64_t));
-    }
-  }
-
-  void take(uint64_t *NewMemory) {
-    if (singleWord())
-      return;
-
-    if (Memory)
-      std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
-    Memory = NewMemory;
-  }
+  unsigned bitWidth() const { return F.semanticsSizeInBits(F.getSemantics()); }
 
   bool isSigned() const { return true; }
-  bool isNegative() const { return getValue().isNegative(); }
-  bool isZero() const { return getValue().isZero(); }
-  bool isNonZero() const { return getValue().isNonZero(); }
-  bool isMin() const { return getValue().isSmallest(); }
-  bool isMinusOne() const { return getValue().isExactlyValue(-1.0); }
-  bool isNan() const { return getValue().isNaN(); }
-  bool isSignaling() const { return getValue().isSignaling(); }
-  bool isInf() const { return getValue().isInfinity(); }
-  bool isFinite() const { return getValue().isFinite(); }
-  bool isNormal() const { return getValue().isNormal(); }
-  bool isDenormal() const { return getValue().isDenormal(); }
-  llvm::FPClassTest classify() const { return getValue().classify(); }
-  APFloat::fltCategory getCategory() const { return getValue().getCategory(); }
+  bool isNegative() const { return F.isNegative(); }
+  bool isZero() const { return F.isZero(); }
+  bool isNonZero() const { return F.isNonZero(); }
+  bool isMin() const { return F.isSmallest(); }
+  bool isMinusOne() const { return F.isExactlyValue(-1.0); }
+  bool isNan() const { return F.isNaN(); }
+  bool isSignaling() const { return F.isSignaling(); }
+  bool isInf() const { return F.isInfinity(); }
+  bool isFinite() const { return F.isFinite(); }
+  bool isNormal() const { return F.isNormal(); }
+  bool isDenormal() const { return F.isDenormal(); }
+  llvm::FPClassTest classify() const { return F.classify(); }
+  APFloat::fltCategory getCategory() const { return F.getCategory(); }
 
   ComparisonCategoryResult compare(const Floating &RHS) const {
-    llvm::APFloatBase::cmpResult CmpRes = getValue().compare(RHS.getValue());
+    llvm::APFloatBase::cmpResult CmpRes = F.compare(RHS.F);
     switch (CmpRes) {
     case llvm::APFloatBase::cmpLessThan:
       return ComparisonCategoryResult::Less;
@@ -171,130 +118,97 @@ public:
   static APFloat::opStatus fromIntegral(APSInt Val,
                                         const llvm::fltSemantics &Sem,
                                         llvm::RoundingMode RM,
-                                        Floating *Result) {
+                                        Floating &Result) {
     APFloat F = APFloat(Sem);
     APFloat::opStatus Status = F.convertFromAPInt(Val, Val.isSigned(), RM);
-    Result->copy(F);
+    Result = Floating(F);
     return Status;
   }
 
-  static void bitcastFromMemory(const std::byte *Buff,
-                                const llvm::fltSemantics &Sem,
-                                Floating *Result) {
+  static Floating bitcastFromMemory(const std::byte *Buff,
+                                    const llvm::fltSemantics &Sem) {
     size_t Size = APFloat::semanticsSizeInBits(Sem);
     llvm::APInt API(Size, true);
     llvm::LoadIntFromMemory(API, (const uint8_t *)Buff, Size / 8);
-    Result->copy(APFloat(Sem, API));
+
+    return Floating(APFloat(Sem, API));
   }
 
   void bitcastToMemory(std::byte *Buff) const {
-    llvm::APInt API = getValue().bitcastToAPInt();
+    llvm::APInt API = F.bitcastToAPInt();
     llvm::StoreIntToMemory(API, (uint8_t *)Buff, bitWidth() / 8);
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    return sizeof(Semantics) + (numWords() * sizeof(uint64_t));
+    return sizeof(llvm::fltSemantics *) +
+           (APFloat::semanticsSizeInBits(F.getSemantics()) / 8);
   }
 
   void serialize(std::byte *Buff) const {
-    std::memcpy(Buff, &Semantics, sizeof(Semantics));
-    if (singleWord()) {
-      std::memcpy(Buff + sizeof(Semantics), &Val, sizeof(uint64_t));
-    } else {
-      std::memcpy(Buff + sizeof(Semantics), Memory,
-                  numWords() * sizeof(uint64_t));
-    }
+    // Semantics followed by an APInt.
+    *reinterpret_cast<const llvm::fltSemantics **>(Buff) = &F.getSemantics();
+
+    llvm::APInt API = F.bitcastToAPInt();
+    llvm::StoreIntToMemory(API, (uint8_t *)(Buff + sizeof(void *)),
+                           bitWidth() / 8);
   }
 
-  static llvm::APFloatBase::Semantics
-  deserializeSemantics(const std::byte *Buff) {
-    return *reinterpret_cast<const llvm::APFloatBase::Semantics *>(Buff);
+  static Floating deserialize(const std::byte *Buff) {
+    const llvm::fltSemantics *Sem;
+    std::memcpy((void *)&Sem, Buff, sizeof(void *));
+    return bitcastFromMemory(Buff + sizeof(void *), *Sem);
   }
 
-  static void deserialize(const std::byte *Buff, Floating *Result) {
-    llvm::APFloatBase::Semantics Semantics;
-    std::memcpy(&Semantics, Buff, sizeof(Semantics));
-
-    unsigned BitWidth = llvm::APFloat::semanticsSizeInBits(
-        llvm::APFloatBase::EnumToSemantics(Semantics));
-    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
-
-    Result->Semantics = Semantics;
-    if (NumWords == 1 && !ALLOCATE_ALL) {
-      std::memcpy(&Result->Val, Buff + sizeof(Semantics), sizeof(uint64_t));
-    } else {
-      assert(Result->Memory);
-      std::memcpy(Result->Memory, Buff + sizeof(Semantics),
-                  NumWords * sizeof(uint64_t));
-    }
+  static Floating abs(const Floating &F) {
+    APFloat V = F.F;
+    if (V.isNegative())
+      V.changeSign();
+    return Floating(V);
   }
 
   // -------
 
   static APFloat::opStatus add(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.add(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.add(B.F, RM);
   }
 
   static APFloat::opStatus increment(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.getSemantics(), 1);
-    APFloat LHS = A.getValue();
-
-    auto Status = LHS.add(One, RM);
-    R->copy(LHS);
-    return Status;
+    APFloat One(A.F.getSemantics(), 1);
+    *R = Floating(A.F);
+    return R->F.add(One, RM);
   }
 
   static APFloat::opStatus sub(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.subtract(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.subtract(B.F, RM);
   }
 
   static APFloat::opStatus decrement(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.getSemantics(), 1);
-    APFloat LHS = A.getValue();
-
-    auto Status = LHS.subtract(One, RM);
-    R->copy(LHS);
-    return Status;
+    APFloat One(A.F.getSemantics(), 1);
+    *R = Floating(A.F);
+    return R->F.subtract(One, RM);
   }
 
   static APFloat::opStatus mul(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.multiply(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.multiply(B.F, RM);
   }
 
   static APFloat::opStatus div(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.divide(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.divide(B.F, RM);
   }
 
   static bool neg(const Floating &A, Floating *R) {
-    R->copy(-A.getValue());
+    *R = -A;
     return false;
   }
 };
diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h
index af5cd2d13ecc..13fdb5369f2b 100644
--- a/clang/lib/AST/ByteCode/Integral.h
+++ b/clang/lib/AST/ByteCode/Integral.h
@@ -99,9 +99,6 @@ public:
   bool operator>=(Integral RHS) const { return V >= RHS.V; }
   bool operator==(Integral RHS) const { return V == RHS.V; }
   bool operator!=(Integral RHS) const { return V != RHS.V; }
-  bool operator>=(unsigned RHS) const {
-    return static_cast<unsigned>(V) >= RHS;
-  }
 
   bool operator>(unsigned RHS) const {
     return V >= 0 && static_cast<unsigned>(V) > RHS;
diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 259262bdc524..8ee08dfb5cfe 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -28,19 +28,12 @@ namespace interp {
 
 using APInt = llvm::APInt;
 using APSInt = llvm::APSInt;
+template <unsigned Bits, bool Signed> class Integral;
 
-/// If an IntegralAP is constructed from Memory, it DOES NOT OWN THAT MEMORY.
-/// It will NOT copy the memory (unless, of course, copy() is called) and it
-/// won't alllocate anything. The allocation should happen via InterpState or
-/// Program.
 template <bool Signed> class IntegralAP final {
-public:
-  union {
-    uint64_t *Memory = nullptr;
-    uint64_t Val;
-  };
-  unsigned BitWidth = 0;
+private:
   friend IntegralAP<!Signed>;
+  APInt V;
 
   template <typename T, bool InputSigned>
   static T truncateCast(const APInt &V) {
@@ -59,82 +52,52 @@ public:
                                : V.trunc(BitSize).getZExtValue();
   }
 
-  APInt getValue() const {
-    if (singleWord())
-      return APInt(BitWidth, Val, Signed);
-    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
-    return llvm::APInt(BitWidth, NumWords, Memory);
-  }
-
 public:
   using AsUnsigned = IntegralAP<false>;
 
-  void take(uint64_t *NewMemory) {
-    assert(!singleWord());
-    std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
-    Memory = NewMemory;
-  }
+  template <typename T>
+  IntegralAP(T Value, unsigned BitWidth)
+      : V(APInt(BitWidth, static_cast<uint64_t>(Value), Signed)) {}
 
-  void copy(const APInt &V) {
-    assert(BitWidth == V.getBitWidth());
-    assert(numWords() == V.getNumWords());
+  IntegralAP(APInt V) : V(V) {}
+  /// Arbitrary value for uninitialized variables.
+  IntegralAP() : IntegralAP(Signed ? -1 : 7, 3) {}
 
-    if (V.isSingleWord()) {
-      if constexpr (Signed)
-        Val = V.getSExtValue();
-      else
-        Val = V.getZExtValue();
-      return;
-    }
-    assert(Memory);
-    std::memcpy(Memory, V.getRawData(), V.getNumWords() * sizeof(uint64_t));
-  }
-
-  // Constructors.
-  IntegralAP() = default;
-  IntegralAP(unsigned BitWidth) : BitWidth(BitWidth) {}
-  IntegralAP(uint64_t *Memory, unsigned BitWidth)
-      : Memory(Memory), BitWidth(BitWidth) {}
-  IntegralAP(const APInt &V)
-      : IntegralAP(const_cast<uint64_t *>((const uint64_t *)V.getRawData()),
-                   V.getBitWidth()) {}
-
-  IntegralAP operator-() const { return IntegralAP(-getValue()); }
+  IntegralAP operator-() const { return IntegralAP(-V); }
   IntegralAP operator-(const IntegralAP &Other) const {
-    return IntegralAP(getValue() - Other.getValue());
+    return IntegralAP(V - Other.V);
   }
   bool operator>(const IntegralAP &RHS) const {
     if constexpr (Signed)
-      return getValue().sgt(RHS.getValue());
-    return getValue().ugt(RHS.getValue());
+      return V.ugt(RHS.V);
+    return V.sgt(RHS.V);
   }
-  bool operator>=(unsigned RHS) const {
+  bool operator>=(IntegralAP RHS) const {
     if constexpr (Signed)
-      return getValue().sge(RHS);
-    return getValue().uge(RHS);
+      return V.uge(RHS.V);
+    return V.sge(RHS.V);
   }
   bool operator<(IntegralAP RHS) const {
     if constexpr (Signed)
-      return getValue().slt(RHS.getValue());
-    return getValue().ult(RHS.getValue());
+      return V.slt(RHS.V);
+    return V.slt(RHS.V);
+  }
+  bool operator<=(IntegralAP RHS) const {
+    if constexpr (Signed)
+      return V.ult(RHS.V);
+    return V.ult(RHS.V);
   }
 
   template <typename Ty, typename = std::enable_if_t<std::is_integral_v<Ty>>>
   explicit operator Ty() const {
-    return truncateCast<Ty, Signed>(getValue());
+    return truncateCast<Ty, Signed>(V);
   }
 
   template <typename T> static IntegralAP from(T Value, unsigned NumBits = 0) {
-    if (NumBits == 0)
-      NumBits = sizeof(T) * 8;
     assert(NumBits > 0);
     APInt Copy = APInt(NumBits, static_cast<uint64_t>(Value), Signed);
-    assert(false);
-    return IntegralAP<Signed>(Copy);
-  }
 
-  static IntegralAP from(const APInt &Value) {
-    return IntegralAP<Signed>(Value);
+    return IntegralAP<Signed>(Copy);
   }
 
   template <bool InputSigned>
@@ -143,45 +106,52 @@ public:
       NumBits = V.bitWidth();
 
     if constexpr (InputSigned)
-      return IntegralAP<Signed>(V.getValue().sextOrTrunc(NumBits));
-    return IntegralAP<Signed>(V.getValue().zextOrTrunc(NumBits));
+      return IntegralAP<Signed>(V.V.sextOrTrunc(NumBits));
+    return IntegralAP<Signed>(V.V.zextOrTrunc(NumBits));
   }
 
-  constexpr unsigned bitWidth() const { return BitWidth; }
-  constexpr unsigned numWords() const { return APInt::getNumWords(BitWidth); }
-  constexpr bool singleWord() const { return numWords() == 1; }
+  template <unsigned Bits, bool InputSigned>
+  static IntegralAP from(Integral<Bits, InputSigned> I, unsigned BitWidth) {
+    return IntegralAP<Signed>(I.toAPInt(BitWidth));
+  }
+
+  static IntegralAP zero(int32_t BitWidth) {
+    APInt V = APInt(BitWidth, 0LL, Signed);
+    return IntegralAP(V);
+  }
+
+  constexpr unsigned bitWidth() const { return V.getBitWidth(); }
 
   APSInt toAPSInt(unsigned Bits = 0) const {
     if (Bits == 0)
       Bits = bitWidth();
 
-    APInt V = getValue();
     if constexpr (Signed)
-      return APSInt(getValue().sext(Bits), !Signed);
+      return APSInt(V.sext(Bits), !Signed);
     else
-      return APSInt(getValue().zext(Bits), !Signed);
+      return APSInt(V.zext(Bits), !Signed);
   }
   APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
-  bool isZero() const { return getValue().isZero(); }
+  bool isZero() const { return V.isZero(); }
   bool isPositive() const {
     if constexpr (Signed)
-      return getValue().isNonNegative();
+      return V.isNonNegative();
     return true;
   }
   bool isNegative() const {
     if constexpr (Signed)
-      return !getValue().isNonNegative();
+      return !V.isNonNegative();
     return false;
   }
-  bool isMin() const { return getValue().isMinValue(); }
-  bool isMax() const { return getValue().isMaxValue(); }
+  bool isMin() const { return V.isMinValue(); }
+  bool isMax() const { return V.isMaxValue(); }
   static constexpr bool isSigned() { return Signed; }
-  bool isMinusOne() const { return Signed && getValue().isAllOnes(); }
+  bool isMinusOne() const { return Signed && V == -1; }
 
-  unsigned countLeadingZeros() const { return getValue().countl_zero(); }
+  unsigned countLeadingZeros() const { return V.countl_zero(); }
 
-  void print(llvm::raw_ostream &OS) const { getValue().print(OS, Signed); }
+  void print(llvm::raw_ostream &OS) const { V.print(OS, Signed);}
   std::string toDiagnosticString(const ASTContext &Ctx) const {
     std::string NameStr;
     llvm::raw_string_ostream OS(NameStr);
@@ -191,64 +161,53 @@ public:
 
   IntegralAP truncate(unsigned BitWidth) const {
     if constexpr (Signed)
-      return IntegralAP(
-          getValue().trunc(BitWidth).sextOrTrunc(this->bitWidth()));
+      return IntegralAP(V.trunc(BitWidth).sextOrTrunc(this->bitWidth()));
     else
-      return IntegralAP(
-          getValue().trunc(BitWidth).zextOrTrunc(this->bitWidth()));
+      return IntegralAP(V.trunc(BitWidth).zextOrTrunc(this->bitWidth()));
   }
 
   IntegralAP<false> toUnsigned() const {
-    return IntegralAP<false>(Memory, BitWidth);
+    APInt Copy = V;
+    return IntegralAP<false>(Copy);
   }
 
   void bitcastToMemory(std::byte *Dest) const {
-    llvm::StoreIntToMemory(getValue(), (uint8_t *)Dest, bitWidth() / 8);
+    llvm::StoreIntToMemory(V, (uint8_t *)Dest, bitWidth() / 8);
   }
 
   static IntegralAP bitcastFromMemory(const std::byte *Src, unsigned BitWidth) {
-    // FIXME: Remove this.
     APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
     llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
     return IntegralAP(V);
   }
 
-  static void bitcastFromMemory(const std::byte *Src, unsigned BitWidth,
-                                IntegralAP *Result) {
-    APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
-    llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
-    Result->copy(V);
-  }
-
   ComparisonCategoryResult compare(const IntegralAP &RHS) const {
     assert(Signed == RHS.isSigned());
     assert(bitWidth() == RHS.bitWidth());
-    APInt V1 = getValue();
-    APInt V2 = RHS.getValue();
     if constexpr (Signed) {
-      if (V1.slt(V2))
+      if (V.slt(RHS.V))
         return ComparisonCategoryResult::Less;
-      if (V1.sgt(V2))
+      if (V.sgt(RHS.V))
         return ComparisonCategoryResult::Greater;
       return ComparisonCategoryResult::Equal;
     }
 
     assert(!Signed);
-    if (V1.ult(V2))
+    if (V.ult(RHS.V))
       return ComparisonCategoryResult::Less;
-    if (V1.ugt(V2))
+    if (V.ugt(RHS.V))
       return ComparisonCategoryResult::Greater;
     return ComparisonCategoryResult::Equal;
   }
 
   static bool increment(IntegralAP A, IntegralAP *R) {
-    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
-    return add(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
+    IntegralAP<Signed> One(1, A.bitWidth());
+    return add(A, One, A.bitWidth() + 1, R);
   }
 
   static bool decrement(IntegralAP A, IntegralAP *R) {
-    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
-    return sub(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
+    IntegralAP<Signed> One(1, A.bitWidth());
+    return sub(A, One, A.bitWidth() + 1, R);
   }
 
   static bool add(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
@@ -265,95 +224,87 @@ public:
 
   static bool rem(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      R->copy(A.getValue().srem(B.getValue()));
+      *R = IntegralAP(A.V.srem(B.V));
     else
-      R->copy(A.getValue().urem(B.getValue()));
+      *R = IntegralAP(A.V.urem(B.V));
     return false;
   }
 
   static bool div(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      R->copy(A.getValue().sdiv(B.getValue()));
+      *R = IntegralAP(A.V.sdiv(B.V));
     else
-      R->copy(A.getValue().udiv(B.getValue()));
+      *R = IntegralAP(A.V.udiv(B.V));
     return false;
   }
 
   static bool bitAnd(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    R->copy(A.getValue() & B.getValue());
+    *R = IntegralAP(A.V & B.V);
     return false;
   }
 
   static bool bitOr(IntegralAP A, IntegralAP B, unsigned OpBits,
                     IntegralAP *R) {
-    R->copy(A.getValue() | B.getValue());
+    *R = IntegralAP(A.V | B.V);
     return false;
   }
 
   static bool bitXor(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    R->copy(A.getValue() ^ B.getValue());
+    *R = IntegralAP(A.V ^ B.V);
     return false;
   }
 
   static bool neg(const IntegralAP &A, IntegralAP *R) {
-    APInt AI = A.getValue();
+    APInt AI = A.V;
     AI.negate();
-    R->copy(AI);
+    *R = IntegralAP(AI);
     return false;
   }
 
   static bool comp(IntegralAP A, IntegralAP *R) {
-    R->copy(~A.getValue());
+    *R = IntegralAP(~A.V);
     return false;
   }
 
   static void shiftLeft(const IntegralAP A, const IntegralAP B, unsigned OpBits,
                         IntegralAP *R) {
-    *R = IntegralAP(A.getValue().shl(B.getValue().getZExtValue()));
+    *R = IntegralAP(A.V.shl(B.V.getZExtValue()));
   }
 
   static void shiftRight(const IntegralAP A, const IntegralAP B,
                          unsigned OpBits, IntegralAP *R) {
-    unsigned ShiftAmount = B.getValue().getZExtValue();
+    unsigned ShiftAmount = B.V.getZExtValue();
     if constexpr (Signed)
-      R->copy(A.getValue().ashr(ShiftAmount));
+      *R = IntegralAP(A.V.ashr(ShiftAmount));
     else
-      R->copy(A.getValue().lshr(ShiftAmount));
+      *R = IntegralAP(A.V.lshr(ShiftAmount));
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    assert(BitWidth != 0);
-    uint32_t NumWords = llvm::APInt::getNumWords(bitWidth());
-    return sizeof(uint64_t) + (NumWords * sizeof(uint64_t));
+    // 4 bytes for the BitWidth followed by N bytes for the actual APInt.
+    return sizeof(uint32_t) + (V.getBitWidth() / CHAR_BIT);
   }
 
   void serialize(std::byte *Buff) const {
-    uint64_t NumWords = llvm::APInt::getNumWords(bitWidth());
-    std::memcpy(Buff, &BitWidth, sizeof(uint64_t));
-    if (singleWord())
-      std::memcpy(Buff + sizeof(uint64_t), &Val, NumWords * sizeof(uint64_t));
-    else
-      std::memcpy(Buff + sizeof(uint64_t), Memory, NumWords * sizeof(uint64_t));
+    assert(V.getBitWidth() < std::numeric_limits<uint8_t>::max());
+    uint32_t BitWidth = V.getBitWidth();
+
+    std::memcpy(Buff, &BitWidth, sizeof(uint32_t));
+    llvm::StoreIntToMemory(V, (uint8_t *)(Buff + sizeof(uint32_t)),
+                           BitWidth / CHAR_BIT);
   }
 
-  static uint32_t deserializeSize(const std::byte *Buff) {
-    return *reinterpret_cast<const uint64_t *>(Buff);
-  }
+  static IntegralAP<Signed> deserialize(const std::byte *Buff) {
+    uint32_t BitWidth;
+    std::memcpy(&BitWidth, Buff, sizeof(uint32_t));
+    IntegralAP<Signed> Val(APInt(BitWidth, 0ull, !Signed));
 
-  static void deserialize(const std::byte *Buff, IntegralAP<Signed> *Result) {
-    uint32_t BitWidth = Result->BitWidth;
-    uint32_t NumWords = llvm::APInt::getNumWords(BitWidth);
-    assert(BitWidth == Result->BitWidth);
-    assert(Result->Memory);
-
-    if (NumWords == 1)
-      std::memcpy(&Result->Val, Buff + sizeof(uint64_t), sizeof(uint64_t));
-    else
-      std::memcpy(Result->Memory, Buff + sizeof(uint64_t),
-                  NumWords * sizeof(uint64_t));
+    llvm::LoadIntFromMemory(Val.V, (const uint8_t *)Buff + sizeof(uint32_t),
+                            BitWidth / CHAR_BIT);
+    return Val;
   }
 
 private:
@@ -361,7 +312,7 @@ private:
   static bool CheckAddSubMulUB(const IntegralAP &A, const IntegralAP &B,
                                unsigned BitWidth, IntegralAP *R) {
     if constexpr (!Signed) {
-      R->copy(Op<APInt>{}(A.getValue(), B.getValue()));
+      R->V = Op<APInt>{}(A.V, B.V);
       return false;
     }
 
@@ -369,7 +320,7 @@ private:
     const APSInt &RHS = B.toAPSInt();
     APSInt Value = Op<APSInt>{}(LHS.extend(BitWidth), RHS.extend(BitWidth));
     APSInt Result = Value.trunc(LHS.getBitWidth());
-    R->copy(Result);
+    R->V = Result;
 
     return Result.extend(BitWidth) != Value;
   }
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 1e2032feabb6..5c8abffb3a99 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1935,10 +1935,8 @@ bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
-
-  S.Stk.push<IntegralAP<false>>(Result);
+  S.Stk.push<IntegralAP<false>>(
+      IntegralAP<false>::from(Ptr.getIntegerRepresentation(), BitWidth));
   return true;
 }
 
@@ -1948,10 +1946,8 @@ bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
-
-  S.Stk.push<IntegralAP<true>>(Result);
+  S.Stk.push<IntegralAP<true>>(
+      IntegralAP<true>::from(Ptr.getIntegerRepresentation(), BitWidth));
   return true;
 }
 
@@ -2057,100 +2053,6 @@ bool arePotentiallyOverlappingStringLiterals(const Pointer &LHS,
   return Shorter == Longer.take_front(Shorter.size());
 }
 
-static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr,
-                                PrimType T) {
-
-  if (T == PT_IntAPS) {
-    auto &Val = Ptr.deref<IntegralAP<true>>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  } else if (T == PT_IntAP) {
-    auto &Val = Ptr.deref<IntegralAP<false>>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  } else if (T == PT_Float) {
-    auto &Val = Ptr.deref<Floating>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  }
-}
-
-template <typename T>
-static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr) {
-  assert(needsAlloc<T>());
-  auto &Val = Ptr.deref<T>();
-  if (!Val.singleWord()) {
-    uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-    Val.take(NewMemory);
-  }
-}
-
-static void finishGlobalRecurse(InterpState &S, const Pointer &Ptr) {
-  if (const Record *R = Ptr.getRecord()) {
-    for (const Record::Field &Fi : R->fields()) {
-      if (Fi.Desc->isPrimitive()) {
-        TYPE_SWITCH_ALLOC(Fi.Desc->getPrimType(), {
-          copyPrimitiveMemory<T>(S, Ptr.atField(Fi.Offset));
-        });
-        copyPrimitiveMemory(S, Ptr.atField(Fi.Offset), Fi.Desc->getPrimType());
-      } else
-        finishGlobalRecurse(S, Ptr.atField(Fi.Offset));
-    }
-    return;
-  }
-
-  if (const Descriptor *D = Ptr.getFieldDesc(); D && D->isArray()) {
-    unsigned NumElems = D->getNumElems();
-    if (NumElems == 0)
-      return;
-
-    if (D->isPrimitiveArray()) {
-      PrimType PT = D->getPrimType();
-      if (!needsAlloc(PT))
-        return;
-      assert(NumElems >= 1);
-      const Pointer EP = Ptr.atIndex(0);
-      bool AllSingleWord = true;
-      TYPE_SWITCH_ALLOC(PT, {
-        if (!EP.deref<T>().singleWord()) {
-          copyPrimitiveMemory<T>(S, EP);
-          AllSingleWord = false;
-        }
-      });
-      if (AllSingleWord)
-        return;
-      for (unsigned I = 1; I != D->getNumElems(); ++I) {
-        const Pointer EP = Ptr.atIndex(I);
-        copyPrimitiveMemory(S, EP, PT);
-      }
-    } else {
-      assert(D->isCompositeArray());
-      for (unsigned I = 0; I != D->getNumElems(); ++I) {
-        const Pointer EP = Ptr.atIndex(I).narrow();
-        finishGlobalRecurse(S, EP);
-      }
-    }
-  }
-}
-
-bool FinishInitGlobal(InterpState &S, CodePtr OpPC) {
-  const Pointer &Ptr = S.Stk.pop<Pointer>();
-
-  finishGlobalRecurse(S, Ptr);
-  if (Ptr.canBeInitialized()) {
-    Ptr.initialize();
-    Ptr.activate();
-  }
-
-  return true;
-}
-
 // https://github.com/llvm/llvm-project/issues/102513
 #if defined(_MSC_VER) && !defined(__clang__) && !defined(NDEBUG)
 #pragma optimize("", off)
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 66d3e6d79e8b..ae3d4a441a79 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -189,7 +189,7 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
 
   // C++11 [expr.shift]p1: Shift width must be less than the bit width of
   // the shifted type.
-  if (Bits > 1 && RHS >= Bits) {
+  if (Bits > 1 && RHS >= RT::from(Bits, RHS.bitWidth())) {
     const Expr *E = S.Current->getExpr(OpPC);
     const APSInt Val = RHS.toAPSInt();
     QualType Ty = E->getType();
@@ -370,9 +370,6 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS,
                      const T &RHS) {
   // Fast path - add the numbers with fixed width.
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(LHS.bitWidth());
-
   if (!OpFW(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -411,7 +408,6 @@ bool Add(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
-
   return AddSubMulHelper<T, T::add, std::plus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -427,7 +423,7 @@ inline bool Addf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(LHS.getSemantics());
+  Floating Result;
   auto Status = Floating::add(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -438,7 +434,6 @@ bool Sub(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
-
   return AddSubMulHelper<T, T::sub, std::minus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -447,7 +442,7 @@ inline bool Subf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(LHS.getSemantics());
+  Floating Result;
   auto Status = Floating::sub(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -458,7 +453,6 @@ bool Mul(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() * 2;
-
   return AddSubMulHelper<T, T::mul, std::multiplies>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -467,10 +461,8 @@ inline bool Mulf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(LHS.getSemantics());
-
+  Floating Result;
   auto Status = Floating::mul(LHS, RHS, getRoundingMode(FPO), &Result);
-
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -492,14 +484,9 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexMul(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Floating RA = S.allocFloat(A.getSemantics());
-    RA.copy(ResR);
-    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
+    Result.atIndex(0).deref<Floating>() = Floating(ResR);
     Result.atIndex(0).initialize();
-
-    Floating RI = S.allocFloat(A.getSemantics());
-    RI.copy(ResI);
-    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
+    Result.atIndex(1).deref<Floating>() = Floating(ResI);
     Result.atIndex(1).initialize();
     Result.initialize();
   } else {
@@ -552,20 +539,10 @@ inline bool Divc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexDiv(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    // Result.atIndex(0).deref<Floating>() = Floating(ResR);
-    // Result.atIndex(0).initialize();
-    // Result.atIndex(1).deref<Floating>() = Floating(ResI);
-    // Result.atIndex(1).initialize();
-
-    Floating RA = S.allocFloat(A.getSemantics());
-    RA.copy(ResR);
-    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
+    Result.atIndex(0).deref<Floating>() = Floating(ResR);
     Result.atIndex(0).initialize();
-
-    Floating RI = S.allocFloat(A.getSemantics());
-    RI.copy(ResI);
-    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
-
+    Result.atIndex(1).deref<Floating>() = Floating(ResI);
+    Result.atIndex(1).initialize();
     Result.initialize();
   } else {
     // Integer element type.
@@ -631,12 +608,9 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitAnd(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+
   unsigned Bits = RHS.bitWidth();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Bits);
-
   if (!T::bitAnd(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -651,12 +625,9 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitOr(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+
   unsigned Bits = RHS.bitWidth();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Bits);
-
   if (!T::bitOr(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -673,11 +644,7 @@ bool BitXor(InterpState &S, CodePtr OpPC) {
   const T &LHS = S.Stk.pop<T>();
 
   unsigned Bits = RHS.bitWidth();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Bits);
-
   if (!T::bitXor(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -692,15 +659,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Rem(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
+  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(LHS.bitWidth());
-
   if (!T::rem(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -715,15 +679,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Div(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
+  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(LHS.bitWidth());
-
   if (!T::div(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -746,10 +707,8 @@ inline bool Divf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
     return false;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-
-  Floating Result = S.allocFloat(LHS.getSemantics());
+  Floating Result;
   auto Status = Floating::div(LHS, RHS, getRoundingMode(FPO), &Result);
-
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -771,44 +730,31 @@ inline bool Inv(InterpState &S, CodePtr OpPC) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Neg(InterpState &S, CodePtr OpPC) {
   const T &Value = S.Stk.pop<T>();
+  T Result;
 
-  if constexpr (std::is_same_v<T, Floating>) {
-    T Result = S.allocFloat(Value.getSemantics());
-
-    if (!T::neg(Value, &Result)) {
-      S.Stk.push<T>(Result);
-      return true;
-    }
-    return false;
-  } else {
-    T Result;
-    if constexpr (needsAlloc<T>())
-      Result = S.allocAP<T>(Value.bitWidth());
-
-    if (!T::neg(Value, &Result)) {
-      S.Stk.push<T>(Result);
-      return true;
-    }
-
-    assert(isIntegralType(Name) &&
-           "don't expect other types to fail at constexpr negation");
+  if (!T::neg(Value, &Result)) {
     S.Stk.push<T>(Result);
-
-    APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
-    if (S.checkingForUndefinedBehavior()) {
-      const Expr *E = S.Current->getExpr(OpPC);
-      QualType Type = E->getType();
-      SmallString<32> Trunc;
-      NegatedValue.trunc(Result.bitWidth())
-          .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
-                    /*UpperCase=*/true, /*InsertSeparators=*/true);
-      S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
-          << Trunc << Type << E->getSourceRange();
-      return true;
-    }
-
-    return handleOverflow(S, OpPC, NegatedValue);
+    return true;
   }
+
+  assert(isIntegralType(Name) &&
+         "don't expect other types to fail at constexpr negation");
+  S.Stk.push<T>(Result);
+
+  APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
+  if (S.checkingForUndefinedBehavior()) {
+    const Expr *E = S.Current->getExpr(OpPC);
+    QualType Type = E->getType();
+    SmallString<32> Trunc;
+    NegatedValue.trunc(Result.bitWidth())
+        .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
+                  /*UpperCase=*/true, /*InsertSeparators=*/true);
+    S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
+        << Trunc << Type << E->getSourceRange();
+    return true;
+  }
+
+  return handleOverflow(S, OpPC, NegatedValue);
 }
 
 enum class PushVal : bool {
@@ -837,8 +783,6 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 
   const T &Value = Ptr.deref<T>();
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Value.bitWidth());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<T>(Value);
@@ -946,6 +890,7 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
   if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
     return false;
+
   return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow);
 }
 
@@ -953,7 +898,7 @@ template <IncDecOp Op, PushVal DoPush>
 bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                        uint32_t FPOI) {
   Floating Value = Ptr.deref<Floating>();
-  Floating Result = S.allocFloat(Value.getSemantics());
+  Floating Result;
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<Floating>(Value);
@@ -1007,15 +952,12 @@ inline bool DecfPop(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Comp(InterpState &S, CodePtr OpPC) {
   const T &Val = S.Stk.pop<T>();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Val.bitWidth());
-
   if (!T::comp(Val, &Result)) {
     S.Stk.push<T>(Result);
     return true;
   }
+
   return false;
 }
 
@@ -1383,23 +1325,10 @@ bool Flip(InterpState &S, CodePtr OpPC) {
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Const(InterpState &S, CodePtr OpPC, const T &Arg) {
-  if constexpr (needsAlloc<T>()) {
-    T Result = S.allocAP<T>(Arg.bitWidth());
-    Result.copy(Arg.toAPSInt());
-    S.Stk.push<T>(Result);
-    return true;
-  }
   S.Stk.push<T>(Arg);
   return true;
 }
 
-inline bool ConstFloat(InterpState &S, CodePtr OpPC, const Floating &F) {
-  Floating Result = S.allocFloat(F.getSemantics());
-  Result.copy(F.getAPFloat());
-  S.Stk.push<Floating>(Result);
-  return true;
-}
-
 //===----------------------------------------------------------------------===//
 // Get/Set Local/Param/Global/This
 //===----------------------------------------------------------------------===//
@@ -1554,24 +1483,7 @@ bool SetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool InitGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
   const Pointer &P = S.P.getGlobal(I);
-
   P.deref<T>() = S.Stk.pop<T>();
-
-  if constexpr (std::is_same_v<T, Floating>) {
-    auto &Val = P.deref<Floating>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-
-  } else if constexpr (needsAlloc<T>()) {
-    auto &Val = P.deref<T>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  }
-
   P.initialize();
   return true;
 }
@@ -1673,22 +1585,7 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) {
   assert(F->isBitField());
   const T &Value = S.Stk.pop<T>();
   const Pointer &Field = S.Stk.peek<Pointer>().atField(F->Offset);
-
-  if constexpr (needsAlloc<T>()) {
-    T Result = S.allocAP<T>(Value.bitWidth());
-    if (T::isSigned())
-      Result.copy(Value.toAPSInt()
-                      .trunc(F->Decl->getBitWidthValue())
-                      .sextOrTrunc(Value.bitWidth()));
-    else
-      Result.copy(Value.toAPSInt()
-                      .trunc(F->Decl->getBitWidthValue())
-                      .zextOrTrunc(Value.bitWidth()));
-
-    Field.deref<T>() = Result;
-  } else {
-    Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
-  }
+  Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
   Field.activate();
   Field.initialize();
   return true;
@@ -1868,8 +1765,6 @@ inline bool FinishInit(InterpState &S, CodePtr OpPC) {
   return true;
 }
 
-bool FinishInitGlobal(InterpState &S, CodePtr OpPC);
-
 inline bool Dump(InterpState &S, CodePtr OpPC) {
   S.Stk.dump();
   return true;
@@ -2376,8 +2271,7 @@ template <PrimType TIn, PrimType TOut> bool Cast(InterpState &S, CodePtr OpPC) {
 inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem,
                    llvm::RoundingMode RM) {
   Floating F = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(*Sem);
-  F.toSemantics(Sem, RM, &Result);
+  Floating Result = F.toSemantics(Sem, RM);
   S.Stk.push<Floating>(Result);
   return true;
 }
@@ -2401,25 +2295,15 @@ inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) {
 /// to know what bitwidth the result should be.
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  // Copy data.
-  {
-    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
-    Result.copy(Source);
-  }
-  S.Stk.push<IntegralAP<false>>(Result);
+  S.Stk.push<IntegralAP<false>>(
+      IntegralAP<false>::from(S.Stk.pop<T>(), BitWidth));
   return true;
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  // Copy data.
-  {
-    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
-    Result.copy(Source);
-  }
-  S.Stk.push<IntegralAP<true>>(Result);
+  S.Stk.push<IntegralAP<true>>(
+      IntegralAP<true>::from(S.Stk.pop<T>(), BitWidth));
   return true;
 }
 
@@ -2428,11 +2312,11 @@ bool CastIntegralFloating(InterpState &S, CodePtr OpPC,
                           const llvm::fltSemantics *Sem, uint32_t FPOI) {
   const T &From = S.Stk.pop<T>();
   APSInt FromAP = From.toAPSInt();
+  Floating Result;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(*Sem);
   auto Status =
-      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), &Result);
+      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), Result);
   S.Stk.push<Floating>(Result);
 
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -2481,12 +2365,7 @@ static inline bool CastFloatingIntegralAP(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-
-  auto ResultAP = S.allocAP<IntegralAP<false>>(BitWidth);
-  ResultAP.copy(Result);
-
-  S.Stk.push<IntegralAP<false>>(ResultAP);
-
+  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2502,12 +2381,7 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-
-  auto ResultAP = S.allocAP<IntegralAP<true>>(BitWidth);
-  ResultAP.copy(Result);
-
-  S.Stk.push<IntegralAP<true>>(ResultAP);
-
+  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2567,9 +2441,8 @@ static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC,
 static inline bool CastFixedPointFloating(InterpState &S, CodePtr OpPC,
                                           const llvm::fltSemantics *Sem) {
   const auto &Fixed = S.Stk.pop<FixedPoint>();
-  Floating Result = S.allocFloat(*Sem);
-  Result.copy(Fixed.toFloat(Sem));
-  S.Stk.push<Floating>(Result);
+
+  S.Stk.push<Floating>(Fixed.toFloat(Sem));
   return true;
 }
 
@@ -2633,18 +2506,12 @@ bool Zero(InterpState &S, CodePtr OpPC) {
 }
 
 static inline bool ZeroIntAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  if (!Result.singleWord())
-    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
-  S.Stk.push<IntegralAP<false>>(Result);
+  S.Stk.push<IntegralAP<false>>(IntegralAP<false>::zero(BitWidth));
   return true;
 }
 
 static inline bool ZeroIntAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  if (!Result.singleWord())
-    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
-  S.Stk.push<IntegralAP<true>>(Result);
+  S.Stk.push<IntegralAP<true>>(IntegralAP<true>::zero(BitWidth));
   return true;
 }
 
@@ -2711,9 +2578,7 @@ inline bool RVOPtr(InterpState &S, CodePtr OpPC) {
 //===----------------------------------------------------------------------===//
 
 template <class LT, class RT, ShiftDir Dir>
-inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
-                    LT *Result) {
-
+inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
   const unsigned Bits = LHS.bitWidth();
 
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
@@ -2731,7 +2596,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
     RHS = -RHS;
     return DoShift<LT, RT,
                    Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS, Result);
+        S, OpPC, LHS, RHS);
   }
 
   if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
@@ -2779,7 +2644,6 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
       // Do the shift on potentially signed LT, then convert to unsigned type.
       LT A;
       LT::shiftRight(LHS, LT::from(RHS, Bits), Bits, &A);
-      // LT::shiftRight(LHS, LT(RHSTemp), Bits, &A);
       R = LT::AsUnsigned::from(A);
     }
   }
@@ -2788,48 +2652,6 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
   return true;
 }
 
-/// A version of DoShift that works on IntegralAP.
-template <class LT, class RT, ShiftDir Dir>
-inline bool DoShiftAP(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
-                      LT *Result) {
-  const unsigned Bits = LHS.bitWidth();
-  const APSInt &LHSAP = LHS.toAPSInt();
-  APSInt RHSAP = RHS.toAPSInt();
-
-  // OpenCL 6.3j: shift values are effectively % word size of LHS.
-  if (S.getLangOpts().OpenCL)
-    RHSAP &= APSInt(llvm::APInt(RHSAP.getBitWidth(),
-                                static_cast<uint64_t>(LHSAP.getBitWidth() - 1)),
-                    RHSAP.isUnsigned());
-
-  if (RHS.isNegative()) {
-    // During constant-folding, a negative shift is an opposite shift. Such a
-    // shift is not a constant expression.
-    const SourceInfo &Loc = S.Current->getSource(OpPC);
-    S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
-    if (!S.noteUndefinedBehavior())
-      return false;
-    RHS = -RHS;
-    return DoShiftAP<LT, RT,
-                     Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS, Result);
-  }
-
-  if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
-    return false;
-
-  if constexpr (Dir == ShiftDir::Left) {
-    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
-    Result->copy(LHSAP << SA);
-  } else {
-    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
-    Result->copy(LHSAP >> SA);
-  }
-
-  S.Stk.push<LT>(*Result);
-  return true;
-}
-
 template <PrimType NameL, PrimType NameR>
 inline bool Shr(InterpState &S, CodePtr OpPC) {
   using LT = typename PrimConv<NameL>::T;
@@ -2837,13 +2659,7 @@ inline bool Shr(InterpState &S, CodePtr OpPC) {
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
 
-  if constexpr (needsAlloc<LT>()) {
-    LT Result = S.allocAP<LT>(LHS.bitWidth());
-    return DoShiftAP<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
-  } else {
-    LT Result;
-    return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
-  }
+  return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS);
 }
 
 template <PrimType NameL, PrimType NameR>
@@ -2852,13 +2668,8 @@ inline bool Shl(InterpState &S, CodePtr OpPC) {
   using RT = typename PrimConv<NameR>::T;
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
-  if constexpr (needsAlloc<LT>()) {
-    LT Result = S.allocAP<LT>(LHS.bitWidth());
-    return DoShiftAP<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
-  } else {
-    LT Result;
-    return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
-  }
+
+  return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS);
 }
 
 static inline bool ShiftFixedPoint(InterpState &S, CodePtr OpPC, bool Left) {
@@ -3441,15 +3252,7 @@ inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte,
 
     if constexpr (std::is_same_v<T, Floating>) {
       assert(Sem);
-      Floating Result = S.allocFloat(*Sem);
-      Floating::bitcastFromMemory(Buff.data(), *Sem, &Result);
-      S.Stk.push<Floating>(Result);
-
-      // S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
-    } else if constexpr (needsAlloc<T>()) {
-      T Result = S.allocAP<T>(ResultBitWidth);
-      T::bitcastFromMemory(Buff.data(), ResultBitWidth, &Result);
-      S.Stk.push<T>(Result);
+      S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
     } else {
       assert(!Sem);
       S.Stk.push<T>(T::bitcastFromMemory(Buff.data(), ResultBitWidth));
@@ -3507,11 +3310,7 @@ template <typename T> inline T ReadArg(InterpState &S, CodePtr &OpPC) {
 }
 
 template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
-  auto &Semantics =
-      llvm::APFloatBase::EnumToSemantics(Floating::deserializeSemantics(*OpPC));
-
-  auto F = S.allocFloat(Semantics);
-  Floating::deserialize(*OpPC, &F);
+  Floating F = Floating::deserialize(*OpPC);
   OpPC += align(F.bytesToSerialize());
   return F;
 }
@@ -3519,25 +3318,17 @@ template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
 template <>
 inline IntegralAP<false> ReadArg<IntegralAP<false>>(InterpState &S,
                                                     CodePtr &OpPC) {
-  uint32_t BitWidth = IntegralAP<false>::deserializeSize(*OpPC);
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  assert(Result.bitWidth() == BitWidth);
-
-  IntegralAP<false>::deserialize(*OpPC, &Result);
-  OpPC += align(Result.bytesToSerialize());
-  return Result;
+  IntegralAP<false> I = IntegralAP<false>::deserialize(*OpPC);
+  OpPC += align(I.bytesToSerialize());
+  return I;
 }
 
 template <>
 inline IntegralAP<true> ReadArg<IntegralAP<true>>(InterpState &S,
                                                   CodePtr &OpPC) {
-  uint32_t BitWidth = IntegralAP<true>::deserializeSize(*OpPC);
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  assert(Result.bitWidth() == BitWidth);
-
-  IntegralAP<true>::deserialize(*OpPC, &Result);
-  OpPC += align(Result.bytesToSerialize());
-  return Result;
+  IntegralAP<true> I = IntegralAP<true>::deserialize(*OpPC);
+  OpPC += align(I.bytesToSerialize());
+  return I;
 }
 
 template <>
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5304bd77f2c0..d01e3d042a8b 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -57,21 +57,6 @@ static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) {
   assert(T);
 
   unsigned BitWidth = S.getASTContext().getTypeSize(QT);
-
-  if (T == PT_IntAPS) {
-    auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-    Result.copy(Val);
-    S.Stk.push<IntegralAP<true>>(Result);
-    return;
-  }
-
-  if (T == PT_IntAP) {
-    auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-    Result.copy(Val);
-    S.Stk.push<IntegralAP<false>>(Result);
-    return;
-  }
-
   if (QT->isSignedIntegerOrEnumerationType()) {
     int64_t V = Val.getSExtValue();
     INT_TYPE_SWITCH(*T, { S.Stk.push<T>(T::from(V, BitWidth)); });
@@ -342,13 +327,13 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result = S.allocFloat(TargetSemantics);
+  Floating Result;
   if (S.getASTContext().getTargetInfo().isNan2008()) {
     if (Signaling)
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
   } else {
     // Prior to IEEE 754-2008, architectures were allowed to choose whether
@@ -357,10 +342,10 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
     // 2008 revisions, MIPS interpreted sNaN-2008 as qNan and qNaN-2008 as
     // sNaN. This is now known as "legacy NaN" encoding.
     if (Signaling)
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
   }
 
@@ -375,9 +360,7 @@ static bool interp__builtin_inf(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result = S.allocFloat(TargetSemantics);
-  Result.copy(APFloat::getInf(TargetSemantics));
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(Floating::getInf(TargetSemantics));
   return true;
 }
 
@@ -385,12 +368,10 @@ static bool interp__builtin_copysign(InterpState &S, CodePtr OpPC,
                                      const InterpFrame *Frame) {
   const Floating &Arg2 = S.Stk.pop<Floating>();
   const Floating &Arg1 = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(Arg1.getSemantics());
 
   APFloat Copy = Arg1.getAPFloat();
   Copy.copySign(Arg2.getAPFloat());
-  Result.copy(Copy);
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(Floating(Copy));
 
   return true;
 }
@@ -399,13 +380,11 @@ static bool interp__builtin_fmin(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    Result.copy(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    S.Stk.push<Floating>(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    Result.copy(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
-  S.Stk.push<Floating>(Result);
+    S.Stk.push<Floating>(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
   return true;
 }
 
@@ -413,13 +392,11 @@ static bool interp__builtin_fmax(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    Result.copy(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    S.Stk.push<Floating>(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    Result.copy(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
-  S.Stk.push<Floating>(Result);
+    S.Stk.push<Floating>(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
   return true;
 }
 
@@ -594,16 +571,8 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC,
 static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame) {
   const Floating &Val = S.Stk.pop<Floating>();
-  APFloat F = Val.getAPFloat();
-  if (!F.isNegative()) {
-    S.Stk.push<Floating>(Val);
-    return true;
-  }
 
-  Floating Result = S.allocFloat(Val.getSemantics());
-  F.changeSign();
-  Result.copy(F);
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(Floating::abs(Val));
   return true;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index 2569cac018b3..239b3104e89f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -402,9 +402,7 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC,
           if (llvm::sys::IsBigEndianHost)
             swapBytes(M.get(), NumBits.roundToBytes());
 
-          Floating R = S.allocFloat(Semantics);
-          Floating::bitcastFromMemory(M.get(), Semantics, &R);
-          P.deref<Floating>() = R;
+          P.deref<Floating>() = Floating::bitcastFromMemory(M.get(), Semantics);
           P.initialize();
           return true;
         }
diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h
index 08765561985e..e8dc6f0483d6 100644
--- a/clang/lib/AST/ByteCode/InterpState.h
+++ b/clang/lib/AST/ByteCode/InterpState.h
@@ -15,7 +15,6 @@
 
 #include "Context.h"
 #include "DynamicAllocator.h"
-#include "Floating.h"
 #include "Function.h"
 #include "InterpFrame.h"
 #include "InterpStack.h"
@@ -127,33 +126,6 @@ public:
 
   StdAllocatorCaller getStdAllocatorCaller(StringRef Name) const;
 
-  void *allocate(size_t Size, unsigned Align = 8) const {
-    return Allocator.Allocate(Size, Align);
-  }
-  template <typename T> T *allocate(size_t Num = 1) const {
-    return static_cast<T *>(allocate(Num * sizeof(T), alignof(T)));
-  }
-
-  template <typename T> T allocAP(unsigned BitWidth) {
-    unsigned NumWords = APInt::getNumWords(BitWidth);
-    if (NumWords == 1)
-      return T(BitWidth);
-    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
-    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
-    return T(Mem, BitWidth);
-  }
-
-  Floating allocFloat(const llvm::fltSemantics &Sem) {
-    if (Floating::singleWord(Sem))
-      return Floating(llvm::APFloatBase::SemanticsToEnum(Sem));
-
-    unsigned NumWords =
-        APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem));
-    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
-    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
-    return Floating(Mem, llvm::APFloatBase::SemanticsToEnum(Sem));
-  }
-
 private:
   friend class EvaluationResult;
   friend class InterpStateCCOverride;
@@ -189,8 +161,6 @@ public:
   llvm::SmallVector<
       std::pair<const Expr *, const LifetimeExtendedTemporaryDecl *>>
       SeenGlobalTemporaries;
-
-  mutable llvm::BumpPtrAllocator Allocator;
 };
 
 class InterpStateCCOverride final {
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 57e01f7bd9da..c76ac5f8ae86 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -48,7 +48,6 @@ def ArgUint64 : ArgType { let Name = "uint64_t"; }
 def ArgIntAP : ArgType { let Name = "IntegralAP<false>"; let AsRef = true; }
 def ArgIntAPS : ArgType { let Name = "IntegralAP<true>"; let AsRef = true; }
 def ArgFloat : ArgType { let Name = "Floating"; let AsRef = true; }
-
 def ArgBool : ArgType { let Name = "bool"; }
 def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
 
@@ -89,9 +88,6 @@ def IntegerAndFixedTypeClass : TypeClass {
                Uint32, Sint64, Uint64, IntAP, IntAPS, FixedPoint];
 }
 
-def IntegralTypeClass : TypeClass {
-  let Types = !listconcat(IntegerTypeClass.Types, [Bool]);
-}
 def FixedSizeIntegralTypeClass : TypeClass {
   let Types = [Sint8, Uint8, Sint16, Uint16, Sint32,
                Uint32, Sint64, Uint64, Bool];
@@ -269,13 +265,12 @@ def ConstSint32 : ConstOpcode<Sint32, ArgSint32>;
 def ConstUint32 : ConstOpcode<Uint32, ArgUint32>;
 def ConstSint64 : ConstOpcode<Sint64, ArgSint64>;
 def ConstUint64 : ConstOpcode<Uint64, ArgUint64>;
-def ConstIntAP : ConstOpcode<IntAP, ArgIntAP>;
-def ConstIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
+def ConstFloat : ConstOpcode<Float, ArgFloat>;
+def constIntAP : ConstOpcode<IntAP, ArgIntAP>;
+def constIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
 def ConstBool : ConstOpcode<Bool, ArgBool>;
 def ConstFixedPoint : ConstOpcode<FixedPoint, ArgFixedPoint>;
 
-def ConstFloat : Opcode { let Args = [ArgFloat]; }
-
 // [] -> [Integer]
 def Zero : Opcode {
   let Types = [FixedSizeIntegralTypeClass];
@@ -333,7 +328,6 @@ def GetMemberPtrBasePop : Opcode {
 
 def FinishInitPop : Opcode;
 def FinishInit    : Opcode;
-def FinishInitGlobal : Opcode;
 
 def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool, ArgTypePtr]; }
 
@@ -395,7 +389,7 @@ class AccessOpcode : Opcode {
 }
 
 class BitFieldOpcode : Opcode {
-  let Types = [IntegralTypeClass];
+  let Types = [AluTypeClass];
   let Args = [ArgRecordField];
   let HasGroup = 1;
 }
diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
index a156cccbb3c1..6152fbfbe3a7 100644
--- a/clang/lib/AST/ByteCode/PrimType.h
+++ b/clang/lib/AST/ByteCode/PrimType.h
@@ -76,13 +76,6 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
 }
 
 constexpr bool isIntegralType(PrimType T) { return T <= PT_FixedPoint; }
-template <typename T> constexpr bool needsAlloc() {
-  return std::is_same_v<T, IntegralAP<false>> ||
-         std::is_same_v<T, IntegralAP<true>> || std::is_same_v<T, Floating>;
-}
-constexpr bool needsAlloc(PrimType T) {
-  return T == PT_IntAP || T == PT_IntAPS || T == PT_Float;
-}
 
 /// Mapping from primitive types to their representation.
 template <PrimType T> struct PrimConv;
@@ -216,16 +209,6 @@ static inline bool aligned(const void *P) {
     }                                                                          \
   } while (0)
 
-#define TYPE_SWITCH_ALLOC(Expr, B)                                             \
-  do {                                                                         \
-    switch (Expr) {                                                            \
-      TYPE_SWITCH_CASE(PT_Float, B)                                            \
-      TYPE_SWITCH_CASE(PT_IntAP, B)                                            \
-      TYPE_SWITCH_CASE(PT_IntAPS, B)                                           \
-    default:;                                                                  \
-    }                                                                          \
-  } while (0)
-
 #define COMPOSITE_TYPE_SWITCH(Expr, B, D)                                      \
   do {                                                                         \
     switch (Expr) {                                                            \
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
index 5d9c42244749..23ba1bbd193b 100644
--- a/clang/lib/AST/ByteCode/Program.h
+++ b/clang/lib/AST/ByteCode/Program.h
@@ -132,14 +132,6 @@ public:
                                bool IsMutable = false, bool IsVolatile = false,
                                const Expr *Init = nullptr);
 
-  void *Allocate(size_t Size, unsigned Align = 8) const {
-    return Allocator.Allocate(Size, Align);
-  }
-  template <typename T> T *Allocate(size_t Num = 1) const {
-    return static_cast<T *>(Allocate(Num * sizeof(T), alignof(T)));
-  }
-  void Deallocate(void *Ptr) const {}
-
   /// Context to manage declaration lifetimes.
   class DeclScope {
   public:
@@ -212,7 +204,7 @@ private:
   };
 
   /// Allocator for globals.
-  mutable PoolAllocTy Allocator;
+  PoolAllocTy Allocator;
 
   /// Global objects.
   std::vector<Global *> Globals;
@@ -246,18 +238,4 @@ public:
 } // namespace interp
 } // namespace clang
 
-inline void *operator new(size_t Bytes, const clang::interp::Program &C,
-                          size_t Alignment = 8) {
-  return C.Allocate(Bytes, Alignment);
-}
-
-inline void operator delete(void *Ptr, const clang::interp::Program &C,
-                            size_t) {
-  C.Deallocate(Ptr);
-}
-inline void *operator new[](size_t Bytes, const clang::interp::Program &C,
-                            size_t Alignment = 8) {
-  return C.Allocate(Bytes, Alignment);
-}
-
 #endif
diff --git a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
index 1013a771d13b..710612bef8fd 100644
--- a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
+++ b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
@@ -21,9 +21,6 @@ template <class To, class From>
 constexpr To bit_cast(const From &from) {
   static_assert(sizeof(To) == sizeof(From));
   return __builtin_bit_cast(To, from);
-#if __x86_64
-  // both-note@-2 {{indeterminate value can only initialize an object of type}}
-#endif
 }
 
 template <class Intermediate, class Init>
@@ -41,8 +38,11 @@ constexpr Init round_trip(const Init &init) {
 
 namespace test_long_double {
 #if __x86_64
-constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // both-error{{must be initialized by a constant expression}}\
-                                                                                 // both-note{{in call}}
+/// FIXME: We could enable this, but since it aborts, it causes the usual mempory leak.
+#if 0
+constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // expected-error{{must be initialized by a constant expression}}\
+                                                                                 // expected-note{{in call}}
+#endif
 constexpr long double ld = 3.1425926539;
 
 struct bytes {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 174c1ffa79a4..21dca15a4577 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -208,7 +208,7 @@ namespace nan {
 
   constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}}
   constexpr float NaN4 = __builtin_nanf("");
-  constexpr long double NaN5 = __builtin_nanf128("");
+  //constexpr long double NaN5 = __builtin_nanf128("");
 
   /// FIXME: This should be accepted by the current interpreter as well.
   constexpr char f[] = {'0', 'x', 'A', 'E', '\0'};
@@ -655,6 +655,8 @@ void test_noexcept(int *i) {
 } // end namespace test_launder
 
 
+/// FIXME: The commented out tests here use a IntAP value and fail.
+/// This currently means we will leak the IntAP value since nothing cleans it up.
 namespace clz {
   char clz1[__builtin_clz(1) == BITSIZE(int) - 1 ? 1 : -1];
   char clz2[__builtin_clz(7) == BITSIZE(int) - 3 ? 1 : -1];
@@ -707,7 +709,7 @@ namespace clz {
   char clz48[__builtin_clzg(1ULL << (BITSIZE(long long) - 1)) == 0 ? 1 : -1];
   char clz49[__builtin_clzg(1ULL << (BITSIZE(long long) - 1), 42) == 0 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  int clz50 = __builtin_clzg((unsigned __int128)0);
+  // int clz50 = __builtin_clzg((unsigned __int128)0);
   char clz51[__builtin_clzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char clz52[__builtin_clzg((unsigned __int128)0x1) == BITSIZE(__int128) - 1 ? 1 : -1];
   char clz53[__builtin_clzg((unsigned __int128)0x1, 42) == BITSIZE(__int128) - 1 ? 1 : -1];
@@ -715,7 +717,7 @@ namespace clz {
   char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
+  // int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
   char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
   char clz61[__builtin_clzg((unsigned _BitInt(128))0x1, 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
@@ -773,7 +775,7 @@ namespace ctz {
   char ctz46[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1)) == BITSIZE(long long) - 1 ? 1 : -1];
   char ctz47[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1), 42) == BITSIZE(long long) - 1 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  int ctz48 = __builtin_ctzg((unsigned __int128)0);
+  // int ctz48 = __builtin_ctzg((unsigned __int128)0);
   char ctz49[__builtin_ctzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char ctz50[__builtin_ctzg((unsigned __int128)0x1) == 0 ? 1 : -1];
   char ctz51[__builtin_ctzg((unsigned __int128)0x1, 42) == 0 ? 1 : -1];
@@ -783,7 +785,7 @@ namespace ctz {
   char ctz55[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1), 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
+  // int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
   char ctz57[__builtin_ctzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char ctz58[__builtin_ctzg((unsigned _BitInt(128))0x1) == 0 ? 1 : -1];
   char ctz59[__builtin_ctzg((unsigned _BitInt(128))0x1, 42) == 0 ? 1 : -1];

From 667c7860ef5cc67a94c5233ff1be9c0e113ac514 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Tue, 17 Jun 2025 12:25:20 -0700
Subject: [PATCH 0701/1322] [CIR] Handle global string literals as char array
 initializer (#144384)

This change adds the line of code needed to handle a string literal as
an initializer for a character array.
---
 clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp |  4 ++--
 clang/test/CIR/CodeGen/string-literals.c     | 12 ++++++++++
 clang/test/CIR/CodeGen/string-literals.cpp   | 23 ++++++++++++++++++++
 3 files changed, 37 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/string-literals.cpp

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index 1976742d4039..8b817f3f3d8d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -254,8 +254,8 @@ public:
   }
 
   mlir::Attribute VisitStringLiteral(StringLiteral *e, QualType t) {
-    cgm.errorNYI(e->getBeginLoc(), "ConstExprEmitter::VisitStringLiteral");
-    return {};
+    // This is a string literal initializing an array in an initializer.
+    return cgm.getConstantArrayFromStringLiteral(e);
   }
 
   mlir::Attribute VisitObjCEncodeExpr(ObjCEncodeExpr *e, QualType t) {
diff --git a/clang/test/CIR/CodeGen/string-literals.c b/clang/test/CIR/CodeGen/string-literals.c
index 00f59b09400c..90ea21906f36 100644
--- a/clang/test/CIR/CodeGen/string-literals.c
+++ b/clang/test/CIR/CodeGen/string-literals.c
@@ -5,6 +5,18 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll
 // RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
 
+char g_str[] = "1234";
+
+// CIR: cir.global external @g_str = #cir.const_array<"1234\00" : !cir.array<!s8i x 5>> : !cir.array<!s8i x 5>
+
+char g_oversized[100] = "123";
+
+// CIR: cir.global external @g_oversized = #cir.const_array<"123" : !cir.array<!s8i x 3>, trailing_zeros> : !cir.array<!s8i x 100>
+
+char g_exact[4] = "123";
+
+// CIR: cir.global external @g_exact = #cir.const_array<"123\00" : !cir.array<!s8i x 4>> : !cir.array<!s8i x 4>
+
 // CIR: cir.global "private" cir_private dsolocal @[[STR1_GLOBAL:.*]] = #cir.const_array<"1\00" : !cir.array<!s8i x 2>> : !cir.array<!s8i x 2>
 // CIR: cir.global "private" cir_private dsolocal @[[STR2_GLOBAL:.*]] = #cir.zero : !cir.array<!s8i x 1>
 // CIR: cir.global "private" cir_private dsolocal @[[STR3_GLOBAL:.*]] = #cir.zero : !cir.array<!s8i x 2>
diff --git a/clang/test/CIR/CodeGen/string-literals.cpp b/clang/test/CIR/CodeGen/string-literals.cpp
new file mode 100644
index 000000000000..c56eb7438732
--- /dev/null
+++ b/clang/test/CIR/CodeGen/string-literals.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+// CIR: cir.global "private" cir_private dsolocal @[[STR1_GLOBAL:.*]] = #cir.const_array<"abcd\00" : !cir.array<!s8i x 5>> : !cir.array<!s8i x 5>
+
+// LLVM: @[[STR1_GLOBAL:.*]] = private global [5 x i8] c"abcd\00"
+
+// OGCG: @[[STR1_GLOBAL:.*]] = private unnamed_addr constant [5 x i8] c"abcd\00"
+
+decltype(auto) returns_literal() {
+    return "abcd";
+}
+
+// CIR: cir.func{{.*}} @_Z15returns_literalv() -> !cir.ptr<!cir.array<!s8i x 5>>
+// CIR:   %[[RET_ADDR:.*]] = cir.alloca !cir.ptr<!cir.array<!s8i x 5>>, !cir.ptr<!cir.ptr<!cir.array<!s8i x 5>>>, ["__retval"]
+// CIR:   %[[STR_ADDR:.*]] = cir.get_global @[[STR1_GLOBAL]] : !cir.ptr<!cir.array<!s8i x 5>>
+// CIR:   cir.store{{.*}} %[[STR_ADDR]], %[[RET_ADDR]]
+// CIR:   %[[RET:.*]] = cir.load %[[RET_ADDR]]
+// CIR:   cir.return %[[RET]]

From b1aa845595c4dc204dfbe0e48481572e936620fc Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 17 Jun 2025 15:48:58 -0400
Subject: [PATCH 0702/1322] [libc++][NFC] Consistently qualify calls to C
 functions in <fstream> (#144539)

---
 libcxx/include/fstream | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index 71c4957b691a..00aa00ff7e9c 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -696,7 +696,7 @@ basic_filebuf<_CharT, _Traits>* basic_filebuf<_CharT, _Traits>::open(const char*
   if (!__mdstr)
     return nullptr;
 
-  return __do_open(fopen(__s, __mdstr), __mode);
+  return __do_open(std::fopen(__s, __mdstr), __mode);
 }
 
 template <class _CharT, class _Traits>
@@ -761,7 +761,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits>
     std::memmove(this->eback(), this->egptr() - __unget_sz, __unget_sz * sizeof(char_type));
     if (__always_noconv_) {
       size_t __nmemb = static_cast<size_t>(this->egptr() - this->eback() - __unget_sz);
-      __nmemb        = ::fread(this->eback() + __unget_sz, 1, __nmemb, __file_);
+      __nmemb        = std::fread(this->eback() + __unget_sz, 1, __nmemb, __file_);
       if (__nmemb != 0) {
         this->setg(this->eback(), this->eback() + __unget_sz, this->eback() + __unget_sz + __nmemb);
         __c = traits_type::to_int_type(*this->gptr());
@@ -778,7 +778,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits>
           std::min(static_cast<size_t>(__ibs_ - __unget_sz), static_cast<size_t>(__extbufend_ - __extbufnext_));
       codecvt_base::result __r;
       __st_last_  = __st_;
-      size_t __nr = fread((void*)const_cast<char*>(__extbufnext_), 1, __nmemb, __file_);
+      size_t __nr = std::fread((void*)const_cast<char*>(__extbufnext_), 1, __nmemb, __file_);
       if (__nr != 0) {
         if (!__cv_)
           std::__throw_bad_cast();
@@ -855,7 +855,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits>
             return traits_type::eof();
         } else if (__r == codecvt_base::ok || __r == codecvt_base::partial) {
           size_t __nmemb = static_cast<size_t>(__extbe - __extbuf_);
-          if (fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb)
+          if (std::fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb)
             return traits_type::eof();
           if (__r == codecvt_base::partial) {
             this->setp(const_cast<char_type*>(__e), this->pptr());
@@ -990,12 +990,12 @@ int basic_filebuf<_CharT, _Traits>::sync() {
       char* __extbe;
       __r            = __cv_->unshift(__st_, __extbuf_, __extbuf_ + __ebs_, __extbe);
       size_t __nmemb = static_cast<size_t>(__extbe - __extbuf_);
-      if (fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb)
+      if (std::fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb)
         return -1;
     } while (__r == codecvt_base::partial);
     if (__r == codecvt_base::error)
       return -1;
-    if (fflush(__file_))
+    if (std::fflush(__file_))
       return -1;
   } else if (__cm_ & ios_base::in) {
     off_type __c;

From 19658d14749876cf0b6633f210c923be3709323b Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Tue, 17 Jun 2025 13:28:45 -0700
Subject: [PATCH 0703/1322] [llvm] annotate interfaces in llvm/Target for DLL
 export (#143615)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/Target` library.
These annotations currently have no meaningful impact on the LLVM build;
however, they are a prerequisite to support an LLVM Windows DLL (shared
library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

A sub-set of these changes were generated automatically using the
[Interface Definition Scanner (IDS)](https://github.com/compnerd/ids)
tool, followed formatting with `git clang-format`.

The bulk of this change is manual additions of `LLVM_ABI` to
`LLVMInitializeX` functions defined in .cpp files under llvm/lib/Target.
Adding `LLVM_ABI` to the function implementation is required here
because they do not `#include "llvm/Support/TargetSelect.h"`, which
contains the declarations for this functions and was already updated
with `LLVM_ABI` in a previous patch. I considered patching these files
with `#include "llvm/Support/TargetSelect.h"` instead, but since
TargetSelect.h is a large file with a bunch of preprocessor x-macro
stuff in it I was concerned it would unnecessarily impact compile times.

In addition, a number of unit tests under llvm/unittests/Target required
additional dependencies to make them build correctly against the LLVM
DLL on Windows using MSVC.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/Target/CGPassBuilderOption.h        |  3 ++-
 llvm/include/llvm/Target/TargetLoweringObjectFile.h   |  3 ++-
 llvm/include/llvm/Target/TargetMachine.h              |  5 +++--
 llvm/include/llvm/Target/TargetOptions.h              | 11 ++++++-----
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp         |  4 +++-
 llvm/lib/Target/AArch64/AArch64TargetMachine.cpp      |  4 +++-
 .../lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp |  3 ++-
 .../AArch64/Disassembler/AArch64Disassembler.cpp      |  3 ++-
 .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp      |  4 +++-
 .../Target/AArch64/TargetInfo/AArch64TargetInfo.cpp   |  4 +++-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp           |  4 +++-
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp        |  3 ++-
 llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp  |  4 +++-
 .../Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp |  4 +++-
 llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp  |  4 +++-
 .../Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp |  4 +++-
 .../lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp |  4 +++-
 llvm/lib/Target/ARM/ARMAsmPrinter.cpp                 |  4 +++-
 llvm/lib/Target/ARM/ARMTargetMachine.cpp              |  3 ++-
 llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp        |  2 +-
 llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp  |  3 ++-
 llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp  |  3 ++-
 llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp      |  5 ++++-
 llvm/lib/Target/AVR/AVRAsmPrinter.cpp                 |  4 +++-
 llvm/lib/Target/AVR/AVRTargetMachine.cpp              |  3 ++-
 llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp        |  3 ++-
 llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp  |  5 ++++-
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp  |  3 ++-
 llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp      |  4 +++-
 llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp        |  3 ++-
 llvm/lib/Target/BPF/BPFAsmPrinter.cpp                 |  4 +++-
 llvm/lib/Target/BPF/BPFTargetMachine.cpp              |  3 ++-
 llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp  |  5 +++--
 llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp  |  4 ++--
 llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp      |  4 +++-
 .../lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp |  4 +++-
 .../Hexagon/Disassembler/HexagonDisassembler.cpp      |  4 +++-
 llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp         |  4 +++-
 llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp      |  4 +++-
 .../Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp      |  4 +++-
 .../Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp   |  4 +++-
 llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp    |  4 +++-
 .../Target/Lanai/Disassembler/LanaiDisassembler.cpp   |  4 +++-
 llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/Lanai/LanaiTargetMachine.cpp          |  3 ++-
 .../Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp  |  4 +++-
 .../Target/LoongArch/AsmParser/LoongArchAsmParser.cpp |  4 +++-
 .../LoongArch/Disassembler/LoongArchDisassembler.cpp  |  4 +++-
 llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp     |  4 +++-
 llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp  |  4 +++-
 .../LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp  |  3 ++-
 .../LoongArch/TargetInfo/LoongArchTargetInfo.cpp      |  4 +++-
 llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp  |  4 +++-
 .../Target/MSP430/Disassembler/MSP430Disassembler.cpp |  4 +++-
 .../Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp |  4 +++-
 llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp           |  4 +++-
 llvm/lib/Target/MSP430/MSP430TargetMachine.cpp        |  3 ++-
 .../lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp |  4 +++-
 llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp      |  3 ++-
 .../lib/Target/Mips/Disassembler/MipsDisassembler.cpp |  3 ++-
 .../lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp |  3 ++-
 llvm/lib/Target/Mips/MipsAsmPrinter.cpp               |  4 +++-
 llvm/lib/Target/Mips/MipsTargetMachine.cpp            |  3 ++-
 llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp    |  4 +++-
 .../Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp          |  3 ++-
 llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp  |  4 +++-
 llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp    |  4 +++-
 .../Target/PowerPC/Disassembler/PPCDisassembler.cpp   |  4 +++-
 .../Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/PowerPC/PPCTargetMachine.cpp          |  4 +++-
 .../Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp   |  4 +++-
 llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp    |  4 +++-
 .../Target/RISCV/Disassembler/RISCVDisassembler.cpp   |  4 +++-
 llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp    |  4 +++-
 .../Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp          |  3 ++-
 llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp  |  4 +++-
 .../Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp          |  3 ++-
 llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp  |  4 +++-
 llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp    |  4 +++-
 .../Target/Sparc/Disassembler/SparcDisassembler.cpp   |  5 +++--
 .../Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/Sparc/SparcTargetMachine.cpp          |  3 ++-
 llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp  |  4 +++-
 .../lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp |  4 +++-
 .../SystemZ/Disassembler/SystemZDisassembler.cpp      |  4 +++-
 .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp      |  4 +++-
 llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp         |  4 +++-
 llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp      |  4 +++-
 .../Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp   |  4 +++-
 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp          |  3 ++-
 llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp    |  4 +++-
 llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp    |  3 ++-
 llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp        |  3 ++-
 llvm/lib/Target/VE/VEAsmPrinter.cpp                   |  3 ++-
 llvm/lib/Target/VE/VETargetMachine.cpp                |  3 ++-
 .../WebAssembly/AsmParser/WebAssemblyAsmParser.cpp    |  4 +++-
 .../Disassembler/WebAssemblyDisassembler.cpp          |  3 ++-
 .../MCTargetDesc/WebAssemblyMCTargetDesc.cpp          |  4 +++-
 .../WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp  |  4 +++-
 llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp |  4 +++-
 .../Target/WebAssembly/WebAssemblyTargetMachine.cpp   |  4 +++-
 .../Target/XCore/Disassembler/XCoreDisassembler.cpp   |  4 +++-
 .../Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp  |  4 +++-
 llvm/lib/Target/XCore/XCoreAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/XCore/XCoreTargetMachine.cpp          |  3 ++-
 llvm/unittests/Target/AArch64/CMakeLists.txt          |  1 +
 llvm/unittests/Target/LoongArch/CMakeLists.txt        |  1 +
 llvm/unittests/Target/RISCV/CMakeLists.txt            |  1 +
 llvm/unittests/Target/SPIRV/CMakeLists.txt            |  1 +
 llvm/unittests/Target/VE/CMakeLists.txt               |  1 +
 llvm/unittests/Target/WebAssembly/CMakeLists.txt      |  1 +
 121 files changed, 322 insertions(+), 123 deletions(-)

diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h
index 51f25c1360b8..f29cbe78a185 100644
--- a/llvm/include/llvm/Target/CGPassBuilderOption.h
+++ b/llvm/include/llvm/Target/CGPassBuilderOption.h
@@ -15,6 +15,7 @@
 #define LLVM_TARGET_CGPASSBUILDEROPTION_H
 
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include <optional>
 
@@ -82,7 +83,7 @@ struct CGPassBuilderOption {
   std::optional<bool> DebugifyCheckAndStripAll;
 };
 
-CGPassBuilderOption getCGPassBuilderOption();
+LLVM_ABI CGPassBuilderOption getCGPassBuilderOption();
 
 } // namespace llvm
 
diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h
index 47617424a968..27a688bc12ab 100644
--- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h
@@ -16,6 +16,7 @@
 
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCRegister.h"
+#include "llvm/Support/Compiler.h"
 #include <cstdint>
 
 namespace llvm {
@@ -43,7 +44,7 @@ class StringRef;
 class TargetMachine;
 class DSOLocalEquivalent;
 
-class TargetLoweringObjectFile : public MCObjectFileInfo {
+class LLVM_ABI TargetLoweringObjectFile : public MCObjectFileInfo {
   /// Name-mangler for global names.
   Mangler *Mang = nullptr;
 
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 906926729ed7..04c97c1502a1 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/PGOOptions.h"
 #include "llvm/Target/CGPassBuilderOption.h"
@@ -28,7 +29,7 @@
 #include <string>
 #include <utility>
 
-extern llvm::cl::opt<bool> NoKernelInfoEndLTO;
+extern LLVM_ABI llvm::cl::opt<bool> NoKernelInfoEndLTO;
 
 namespace llvm {
 
@@ -78,7 +79,7 @@ struct MachineFunctionInfo;
 /// machine.  All target-specific information should be accessible through this
 /// interface.
 ///
-class TargetMachine {
+class LLVM_ABI TargetMachine {
 protected: // Can only create subclasses.
   TargetMachine(const Target &T, StringRef DataLayoutString,
                 const Triple &TargetTriple, StringRef CPU, StringRef FS,
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index 10638a0ec902..a7c46921255b 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/Compiler.h"
 
 #include <memory>
 
@@ -158,12 +159,12 @@ public:
 
   /// DisableFramePointerElim - This returns true if frame pointer elimination
   /// optimization should be disabled for the given machine function.
-  bool DisableFramePointerElim(const MachineFunction &MF) const;
+  LLVM_ABI bool DisableFramePointerElim(const MachineFunction &MF) const;
 
   /// FramePointerIsReserved - This returns true if the frame pointer must
   /// always either point to a new frame record or be un-modified in the given
   /// function.
-  bool FramePointerIsReserved(const MachineFunction &MF) const;
+  LLVM_ABI bool FramePointerIsReserved(const MachineFunction &MF) const;
 
   /// If greater than 0, override the default value of
   /// MCAsmInfo::BinutilsVersion.
@@ -219,7 +220,7 @@ public:
   /// truncations).  If this is enabled (set to true), the code generator must
   /// assume that the rounding mode may dynamically change.
   unsigned HonorSignDependentRoundingFPMathOption : 1;
-  bool HonorSignDependentRoundingFPMath() const;
+  LLVM_ABI bool HonorSignDependentRoundingFPMath() const;
 
   /// NoZerosInBSS - By default some codegens place zero-initialized data to
   /// .bss section. This flag disables such behaviour (necessary, e.g. for
@@ -346,7 +347,7 @@ public:
   unsigned EnableDebugEntryValues : 1;
   /// NOTE: There are targets that still do not support the debug entry values
   /// production.
-  bool ShouldEmitDebugEntryValues() const;
+  LLVM_ABI bool ShouldEmitDebugEntryValues() const;
 
   // When set to true, use experimental new debug variable location tracking,
   // which seeks to follow the values of variables rather than their location,
@@ -450,7 +451,7 @@ public:
 
   DenormalMode getRawFP32DenormalMode() const { return FP32DenormalMode; }
 
-  DenormalMode getDenormalMode(const fltSemantics &FPType) const;
+  LLVM_ABI DenormalMode getDenormalMode(const fltSemantics &FPType) const;
 
   /// What exception model to use
   ExceptionHandling ExceptionModel = ExceptionHandling::None;
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 3f92c1dbfbf4..4099f40ea07f 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -58,6 +58,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
@@ -3534,7 +3535,8 @@ INITIALIZE_PASS(AArch64AsmPrinter, "aarch64-asm-printer",
                 "AArch64 Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64AsmPrinter() {
   RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
   RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
   RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 50f52cca6c8a..8150e91c8ba5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
@@ -223,7 +224,8 @@ static cl::opt<bool>
                            cl::desc("Enable Machine Pipeliner for AArch64"),
                            cl::init(false), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
   RegisterTargetMachine<AArch64beTargetMachine> Y(getTheAArch64beTarget());
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 2f67ff55f26b..d8bdc01a3454 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -8322,7 +8322,8 @@ bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
 }
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64AsmParser() {
   RegisterMCAsmParser<AArch64AsmParser> X(getTheAArch64leTarget());
   RegisterMCAsmParser<AArch64AsmParser> Y(getTheAArch64beTarget());
   RegisterMCAsmParser<AArch64AsmParser> Z(getTheARM64Target());
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index bab0cbe7788e..ae984be670fc 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -310,7 +310,8 @@ createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
                                        SymbolLookUp, DisInfo);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Disassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64Disassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(),
                                          createAArch64Disassembler);
   TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(),
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index b7959e02ec26..efc13589bab6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
@@ -503,7 +504,8 @@ static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64TargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64TargetMC() {
   for (Target *T : {&getTheAArch64leTarget(), &getTheAArch64beTarget(),
                     &getTheAArch64_32Target(), &getTheARM64Target(),
                     &getTheARM64_32Target()}) {
diff --git a/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index 52c88fd0218d..c9ebd3b4a651 100644
--- a/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/AArch64TargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 Target &llvm::getTheAArch64leTarget() {
@@ -31,7 +32,8 @@ Target &llvm::getTheARM64_32Target() {
   return TheARM64_32Target;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64TargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64TargetInfo() {
   // Now register the "arm64" name for use with "-march". We don't want it to
   // take possession of the Triple::aarch64 tags though.
   TargetRegistry::RegisterTarget(getTheARM64Target(), "arm64",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 491314daf2d8..84b0f9855409 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -43,6 +43,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/TargetParser.h"
@@ -83,7 +84,8 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm,
   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUAsmPrinter() {
   TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
                                      llvm::createR600AsmPrinterPass);
   TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d59087839b0e..f390d39043ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -89,6 +89,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/IPO.h"
@@ -481,7 +482,7 @@ static cl::opt<bool> HasClosedWorldAssumption(
     cl::desc("Whether has closed-world assumption at link time"),
     cl::init(false), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0dc1d1377322..30dcd6d81f16 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -39,6 +39,7 @@
 #include "llvm/Support/AMDGPUMetadata.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/TargetParser/TargetParser.h"
 #include <optional>
@@ -9800,7 +9801,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
 }
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUAsmParser() {
   RegisterMCAsmParser<AMDGPUAsmParser> A(getTheR600Target());
   RegisterMCAsmParser<AMDGPUAsmParser> B(getTheGCNTarget());
 }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ca0093d1f049..349e408b7965 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -34,6 +34,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -2648,7 +2649,8 @@ static MCDisassembler *createAMDGPUDisassembler(const Target &T,
   return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo());
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUDisassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
                                          createAMDGPUDisassembler);
   TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index 2768e0c23cf0..b8f43c4550b7 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -16,6 +16,7 @@
 #include "TargetInfo/AMDGPUTargetInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/WithColor.h"
 
 namespace llvm::mca {
@@ -353,7 +354,8 @@ createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
 
 /// Extern function to initialize the targets for the AMDGPU backend
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUTargetMCA() {
   TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                           createAMDGPUCustomBehaviour);
   TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index c692895d84c0..d66725d3a6c4 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -31,6 +31,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -156,7 +157,8 @@ static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) {
   return new AMDGPUMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUTargetMC() {
 
   TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo);
   TargetRegistry::RegisterMCInstrInfo(getTheR600Target(),
diff --git a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index 98fd16e59bf1..ad547556cf15 100644
--- a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "TargetInfo/AMDGPUTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -28,7 +29,8 @@ Target &llvm::getTheGCNTarget() {
 }
 
 /// Extern function to initialize the targets for the AMDGPU backend
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUTargetInfo() {
   RegisterTarget<Triple::r600, false> R600(getTheR600Target(), "r600",
                                            "AMD GPUs HD2XXX-HD6XXX", "AMDGPU");
   RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn",
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index fa1437002551..1443747709b7 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -40,6 +40,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -2456,7 +2457,8 @@ INITIALIZE_PASS(ARMAsmPrinter, "arm-asm-printer", "ARM Assembly Printer", false,
 //===----------------------------------------------------------------------===//
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeARMAsmPrinter() {
   RegisterAsmPrinter<ARMAsmPrinter> X(getTheARMLETarget());
   RegisterAsmPrinter<ARMAsmPrinter> Y(getTheARMBETarget());
   RegisterAsmPrinter<ARMAsmPrinter> A(getTheThumbLETarget());
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 563e69a65ab3..fee77a44e5e8 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -41,6 +41,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
@@ -82,7 +83,7 @@ namespace llvm {
   void initializeARMExecutionDomainFixPass(PassRegistry&);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
   // Register the target.
   RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
   RegisterTargetMachine<ARMLETargetMachine> A(getTheThumbLETarget());
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index f3bdcd64805d..25f027301337 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -12722,7 +12722,7 @@ bool ARMAsmParser::parseDirectiveSEHCustom(SMLoc L) {
 }
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmParser() {
   RegisterMCAsmParser<ARMAsmParser> X(getTheARMLETarget());
   RegisterMCAsmParser<ARMAsmParser> Y(getTheARMBETarget());
   RegisterMCAsmParser<ARMAsmParser> A(getTheThumbLETarget());
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index ef30b1aafb28..5f930fb0c807 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -1269,7 +1269,8 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
   return MCDisassembler::Fail;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeARMDisassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
                                          createARMDisassembler);
   TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(),
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index c756bff3b501..2d22b27ceb13 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/Triple.h"
 
@@ -770,7 +771,7 @@ bool ARM::isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() {
   for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
                     &getTheThumbLETarget(), &getTheThumbBETarget()}) {
     // Register the MC asm info.
diff --git a/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index 4d514f3ca444..3e3670d4e019 100644
--- a/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -8,6 +8,8 @@
 
 #include "TargetInfo/ARMTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
+
 using namespace llvm;
 
 Target &llvm::getTheARMLETarget() {
@@ -27,7 +29,8 @@ Target &llvm::getTheThumbBETarget() {
   return TheThumbBETarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeARMTargetInfo() {
   RegisterTarget<Triple::arm, /*HasJIT=*/true> X(getTheARMLETarget(), "arm",
                                                  "ARM", "ARM");
   RegisterTarget<Triple::armeb, /*HasJIT=*/true> Y(getTheARMBETarget(), "armeb",
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 1a1e5155979e..ad8aa5717fb4 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -33,6 +33,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -335,6 +336,7 @@ char AVRAsmPrinter::ID = 0;
 INITIALIZE_PASS(AVRAsmPrinter, "avr-asm-printer", "AVR Assembly Printer", false,
                 false)
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAVRAsmPrinter() {
   llvm::RegisterAsmPrinter<AVRAsmPrinter> X(getTheAVRTarget());
 }
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index 72544b0afd8d..b75417a0896a 100644
--- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -15,6 +15,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 #include "AVR.h"
 #include "AVRMachineFunctionInfo.h"
@@ -87,7 +88,7 @@ void AVRPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() {
   // Register the target.
   RegisterTargetMachine<AVRTargetMachine> X(getTheAVRTarget());
 
diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index e82bd761eeb3..012cf2c70e2e 100644
--- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -731,7 +732,7 @@ ParseStatus AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) {
   return (parseMany(parseOne));
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmParser() {
   RegisterMCAsmParser<AVRAsmParser> X(getTheAVRTarget());
 }
 
diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index 70428673fcd8..c7a584868f4e 100644
--- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -23,6 +23,8 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/TargetRegistry.h"
 
+#include "llvm/Support/Compiler.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "avr-disassembler"
@@ -50,7 +52,8 @@ static MCDisassembler *createAVRDisassembler(const Target &T,
   return new AVRDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAVRDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheAVRTarget(),
                                          createAVRDisassembler);
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index f87fb70f97ff..d29a7a56167c 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 #define GET_INSTRINFO_MC_DESC
 #define ENABLE_INSTR_PREDICATE_VERIFIER
@@ -87,7 +88,7 @@ static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
   return new AVRTargetAsmStreamer(S);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfo<AVRMCAsmInfo> X(getTheAVRTarget());
 
diff --git a/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
index dd61add1526c..d81db50650ba 100644
--- a/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
+++ b/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/AVRTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 namespace llvm {
 Target &getTheAVRTarget() {
   static Target TheAVRTarget;
@@ -15,7 +16,8 @@ Target &getTheAVRTarget() {
 }
 } // namespace llvm
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAVRTargetInfo() {
   llvm::RegisterTarget<llvm::Triple::avr> X(llvm::getTheAVRTarget(), "avr",
                                             "Atmel AVR Microcontroller", "AVR");
 }
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 7d1819134d16..b49e8fd96c66 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -532,7 +533,7 @@ bool BPFAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
   return false;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmParser() {
   RegisterMCAsmParser<BPFAsmParser> X(getTheBPFTarget());
   RegisterMCAsmParser<BPFAsmParser> Y(getTheBPFleTarget());
   RegisterMCAsmParser<BPFAsmParser> Z(getTheBPFbeTarget());
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index 5dd71cc91427..e3843e0e112e 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -155,7 +156,8 @@ INITIALIZE_PASS(BPFAsmPrinter, "bpf-asm-printer", "BPF Assembly Printer", false,
                 false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeBPFAsmPrinter() {
   RegisterAsmPrinter<BPFAsmPrinter> X(getTheBPFleTarget());
   RegisterAsmPrinter<BPFAsmPrinter> Y(getTheBPFbeTarget());
   RegisterAsmPrinter<BPFAsmPrinter> Z(getTheBPFTarget());
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 0c3f61fdfedd..527a48035457 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -26,6 +26,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/SimplifyCFG.h"
@@ -41,7 +42,7 @@ static cl::opt<bool>
     DisableCheckUnreachable("bpf-disable-trap-unreachable", cl::Hidden,
                             cl::desc("Disable Trap Unreachable for BPF"));
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() {
   // Register the target.
   RegisterTargetMachine<BPFTargetMachine> X(getTheBPFleTarget());
   RegisterTargetMachine<BPFTargetMachine> Y(getTheBPFbeTarget());
diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 536bee539384..4dfae81e9019 100644
--- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include <cstdint>
@@ -82,8 +83,8 @@ static MCDisassembler *createBPFDisassembler(const Target &T,
   return new BPFDisassembler(STI, Ctx);
 }
 
-
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeBPFDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheBPFTarget(),
                                          createBPFDisassembler);
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index caf84701b999..5f44dd9583af 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Host.h"
 
 #define GET_INSTRINFO_MC_DESC
@@ -104,7 +105,7 @@ static MCInstrAnalysis *createBPFInstrAnalysis(const MCInstrInfo *Info) {
   return new BPFMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetMC() {
   for (Target *T :
        {&getTheBPFleTarget(), &getTheBPFbeTarget(), &getTheBPFTarget()}) {
     // Register the MC asm info.
@@ -153,5 +154,4 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetMC() {
     TargetRegistry::RegisterMCAsmBackend(getTheBPFTarget(),
                                          createBPFbeAsmBackend);
   }
-
 }
diff --git a/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
index d7cdcae916aa..6ea6cd56a6d0 100644
--- a/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
+++ b/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/BPFTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -24,7 +25,8 @@ Target &llvm::getTheBPFTarget() {
   return TheBPFTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeBPFTargetInfo() {
   TargetRegistry::RegisterTarget(getTheBPFTarget(), "bpf", "BPF (host endian)",
                                  "BPF", [](Triple::ArchType) { return false; },
                                  true);
diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index f9b4bc0d14fd..c423dca90a4a 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -41,6 +41,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
@@ -878,7 +879,8 @@ bool HexagonAsmParser::RegisterMatchesArch(MCRegister MatchNum) const {
 // extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmLexer();
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonAsmParser() {
   RegisterMCAsmParser<HexagonAsmParser> X(getTheHexagonTarget());
 }
 
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 98b711f6b014..5bd31707acb6 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
@@ -164,7 +165,8 @@ static MCDisassembler *createHexagonDisassembler(const Target &T,
   return new HexagonDisassembler(STI, Ctx, T.createMCInstrInfo());
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonDisassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheHexagonTarget(),
                                          createHexagonDisassembler);
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index c7580d28618a..f22852d1ef55 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -43,6 +43,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
@@ -858,6 +859,7 @@ char HexagonAsmPrinter::ID = 0;
 INITIALIZE_PASS(HexagonAsmPrinter, "hexagon-asm-printer",
                 "Hexagon Assembly Printer", false, false)
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonAsmPrinter() {
   RegisterAsmPrinter<HexagonAsmPrinter> X(getTheHexagonTarget());
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 19b7c6a315f5..66508fd76779 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -26,6 +26,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Transforms/Scalar.h"
 #include <optional>
 
@@ -174,7 +175,8 @@ static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
   return RM.value_or(Reloc::Static);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonTarget() {
   // Register the target.
   RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
 
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 91051cd4e2d5..980df819b2c2 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -34,6 +34,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/HexagonAttributes.h"
 #include "llvm/Support/raw_ostream.h"
@@ -775,7 +776,8 @@ static MCInstrAnalysis *createHexagonMCInstrAnalysis(const MCInstrInfo *Info) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfoFn X(getTheHexagonTarget(), createHexagonMCAsmInfo);
 
diff --git a/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
index ef9f9fd337fa..34a7b945ca51 100644
--- a/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
+++ b/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/HexagonTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheHexagonTarget() {
@@ -15,7 +16,8 @@ Target &llvm::getTheHexagonTarget() {
   return TheHexagonTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonTargetInfo() {
   RegisterTarget<Triple::hexagon, /*HasJIT=*/true> X(
       getTheHexagonTarget(), "hexagon", "Hexagon", "Hexagon");
 }
diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 9cb7f71945d1..6a74686a239d 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
@@ -1223,6 +1224,7 @@ bool LanaiAsmParser::parseInstruction(ParseInstructionInfo & /*Info*/,
 #define GET_MATCHER_IMPLEMENTATION
 #include "LanaiGenAsmMatcher.inc"
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiAsmParser() {
   RegisterMCAsmParser<LanaiAsmParser> x(getTheLanaiTarget());
 }
diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
index 2720e1d9a6a6..5d87c3c4d72c 100644
--- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
+++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -35,7 +36,8 @@ static MCDisassembler *createLanaiDisassembler(const Target & /*T*/,
   return new LanaiDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiDisassembler() {
   // Register the disassembler
   TargetRegistry::RegisterMCDisassembler(getTheLanaiTarget(),
                                          createLanaiDisassembler);
diff --git a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
index 1c4fc572243c..24e4fc3f53e6 100644
--- a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
+++ b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define DEBUG_TYPE "asm-printer"
@@ -242,6 +243,7 @@ INITIALIZE_PASS(LanaiAsmPrinter, "lanai-asm-printer", "Lanai Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiAsmPrinter() {
   RegisterAsmPrinter<LanaiAsmPrinter> X(getTheLanaiTarget());
 }
diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
index 7f94e778e754..3d6ba9ecc55e 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -21,12 +21,13 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include <optional>
 
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTarget() {
   // Register the target.
   RegisterTargetMachine<LanaiTargetMachine> registered_target(
       getTheLanaiTarget());
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index 4a381c033b38..687386c6962b 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -22,6 +22,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstdint>
@@ -126,7 +127,8 @@ static MCInstrAnalysis *createLanaiInstrAnalysis(const MCInstrInfo *Info) {
   return new LanaiMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfo<LanaiMCAsmInfo> X(getTheLanaiTarget());
 
diff --git a/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
index 5c63df670938..f56591a45f8f 100644
--- a/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
+++ b/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/LanaiTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -16,7 +17,8 @@ Target &llvm::getTheLanaiTarget() {
   return TheLanaiTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiTargetInfo() {
   RegisterTarget<Triple::lanai> X(getTheLanaiTarget(), "lanai", "Lanai",
                                   "Lanai");
 }
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 7d5827008957..a8fed951b0cf 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -1953,7 +1954,8 @@ ParseStatus LoongArchAsmParser::parseDirective(AsmToken DirectiveID) {
   return ParseStatus::NoMatch;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchAsmParser() {
   RegisterMCAsmParser<LoongArchAsmParser> X(getTheLoongArch32Target());
   RegisterMCAsmParser<LoongArchAsmParser> Y(getTheLoongArch64Target());
 }
diff --git a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp
index 761682423fff..8c4668ec70c7 100644
--- a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp
+++ b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -45,7 +46,8 @@ static MCDisassembler *createLoongArchDisassembler(const Target &T,
   return new LoongArchDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchDisassembler() {
   // Register the disassembler for each target.
   TargetRegistry::RegisterMCDisassembler(getTheLoongArch32Target(),
                                          createLoongArchDisassembler);
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
index 64ac7c03c041..b757d123fa0f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -309,7 +310,8 @@ INITIALIZE_PASS(LoongArchAsmPrinter, "loongarch-asm-printer",
                 "LoongArch Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchAsmPrinter() {
   RegisterAsmPrinter<LoongArchAsmPrinter> X(getTheLoongArch32Target());
   RegisterAsmPrinter<LoongArchAsmPrinter> Y(getTheLoongArch64Target());
 }
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index d63e5a2b50e8..c36db9c75dd3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Transforms/Scalar.h"
 #include <optional>
 
@@ -29,7 +30,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loongarch"
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchTarget() {
   // Register the target.
   RegisterTargetMachine<LoongArchTargetMachine> X(getTheLoongArch32Target());
   RegisterTargetMachine<LoongArchTargetMachine> Y(getTheLoongArch64Target());
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
index 3ec070e5cbdd..35277ce094a7 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
@@ -204,7 +204,8 @@ MCStreamer *createLoongArchELFStreamer(const Triple &T, MCContext &Context,
 }
 } // end namespace
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchTargetMC() {
   for (Target *T : {&getTheLoongArch32Target(), &getTheLoongArch64Target()}) {
     TargetRegistry::RegisterMCRegInfo(*T, createLoongArchMCRegisterInfo);
     TargetRegistry::RegisterMCInstrInfo(*T, createLoongArchMCInstrInfo);
diff --git a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp
index 1d6be4069b71..a7a5c25de323 100644
--- a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp
+++ b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/LoongArchTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheLoongArch32Target() {
@@ -20,7 +21,8 @@ Target &llvm::getTheLoongArch64Target() {
   return TheLoongArch64Target;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchTargetInfo() {
   RegisterTarget<Triple::loongarch32, /*HasJIT=*/false> X(
       getTheLoongArch32Target(), "loongarch32", "32-bit LoongArch",
       "LoongArch");
diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index c61b8adf89ab..5a4121f7cafd 100644
--- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "msp430-asm-parser"
@@ -534,7 +535,8 @@ bool MSP430AsmParser::ParseLiteralValues(unsigned Size, SMLoc L) {
   return (parseMany(parseOne));
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430AsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430AsmParser() {
   RegisterMCAsmParser<MSP430AsmParser> X(getTheMSP430Target());
 }
 
diff --git a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
index 519bba763204..4c5b473982f7 100644
--- a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
+++ b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -57,7 +58,8 @@ static MCDisassembler *createMSP430Disassembler(const Target &T,
   return new MSP430Disassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Disassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430Disassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheMSP430Target(),
                                          createMSP430Disassembler);
 }
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index df182a5459ea..2cb515aef11e 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -80,7 +81,8 @@ static MCInstPrinter *createMSP430MCInstPrinter(const Triple &T,
   return nullptr;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430TargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430TargetMC() {
   Target &T = getTheMSP430Target();
 
   TargetRegistry::RegisterMCAsmInfo(T, createMSP430MCAsmInfo);
diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 44e55b6a3c9b..44eea8149c59 100644
--- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -189,6 +190,7 @@ INITIALIZE_PASS(MSP430AsmPrinter, "msp430-asm-printer",
                 "MSP430 Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430AsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430AsmPrinter() {
   RegisterAsmPrinter<MSP430AsmPrinter> X(getTheMSP430Target());
 }
diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
index 763a2db2baca..e6024f4a6218 100644
--- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -18,10 +18,11 @@
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include <optional>
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Target() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Target() {
   // Register the target.
   RegisterTargetMachine<MSP430TargetMachine> X(getTheMSP430Target());
   PassRegistry &PR = *PassRegistry::getPassRegistry();
diff --git a/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
index fc2b38f41c14..a6170b82e1f4 100644
--- a/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
+++ b/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/MSP430TargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheMSP430Target() {
@@ -15,7 +16,8 @@ Target &llvm::getTheMSP430Target() {
   return TheMSP430Target;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430TargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430TargetInfo() {
   RegisterTarget<Triple::msp430> X(getTheMSP430Target(), "msp430",
                                    "MSP430 [experimental]", "MSP430");
 }
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 071c016b92e7..b559a8b896e0 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -8906,7 +8906,8 @@ bool MipsAsmParser::parseInternalDirectiveReallowModule() {
   return false;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsAsmParser() {
   RegisterMCAsmParser<MipsAsmParser> X(getTheMipsTarget());
   RegisterMCAsmParser<MipsAsmParser> Y(getTheMipselTarget());
   RegisterMCAsmParser<MipsAsmParser> A(getTheMips64Target());
diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 2a3a8eac2e9a..b3f6cd1609fb 100644
--- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -503,7 +503,8 @@ static MCDisassembler *createMipselDisassembler(
   return new MipsDisassembler(STI, Ctx, false);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(),
                                          createMipsDisassembler);
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index add36d87b9ef..29f61ed9b2b8 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -29,6 +29,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/TargetParser/Triple.h"
@@ -259,7 +260,7 @@ static MCInstrAnalysis *createMipsMCInstrAnalysis(const MCInstrInfo *Info) {
   return new MipsMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTargetMC() {
   for (Target *T : {&getTheMipsTarget(), &getTheMipselTarget(),
                     &getTheMips64Target(), &getTheMips64elTarget()}) {
     // Register the MC asm info.
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index a6300a9c11d4..87e06a6d3c08 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -55,6 +55,7 @@
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -1299,7 +1300,8 @@ INITIALIZE_PASS(MipsAsmPrinter, "mips-asm-printer", "Mips Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsAsmPrinter() {
   RegisterAsmPrinter<MipsAsmPrinter> X(getTheMipsTarget());
   RegisterAsmPrinter<MipsAsmPrinter> Y(getTheMipselTarget());
   RegisterAsmPrinter<MipsAsmPrinter> A(getTheMips64Target());
diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index 30b4d506c5ca..8c519fa379dd 100644
--- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -37,6 +37,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
@@ -51,7 +52,7 @@ static cl::opt<bool>
     EnableMulMulFix("mfix4300", cl::init(false),
                     cl::desc("Enable the VR4300 mulmul bug fix."), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() {
   // Register the target.
   RegisterTargetMachine<MipsebTargetMachine> X(getTheMipsTarget());
   RegisterTargetMachine<MipselTargetMachine> Y(getTheMipselTarget());
diff --git a/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
index db5f607bbb4f..458032042e15 100644
--- a/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
+++ b/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/MipsTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheMipsTarget() {
@@ -27,7 +28,8 @@ Target &llvm::getTheMips64elTarget() {
   return TheMips64elTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsTargetInfo() {
   RegisterTarget<Triple::mips,
                  /*HasJIT=*/true>
       X(getTheMipsTarget(), "mips", "MIPS (32-bit big endian)", "Mips");
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 1cafd236a292..cb7132b5f304 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -71,7 +72,8 @@ static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeNVPTXTargetMC() {
   for (Target *T : {&getTheNVPTXTarget32(), &getTheNVPTXTarget64()}) {
     // Register the MC asm info.
     RegisterMCAsmInfo<NVPTXMCAsmInfo> X(*T);
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index b4e2c46b9444..9af6fb2cb198 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -77,6 +77,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/NativeFormatting.h"
@@ -1947,7 +1948,8 @@ INITIALIZE_PASS(NVPTXAsmPrinter, "nvptx-asm-printer", "NVPTX Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeNVPTXAsmPrinter() {
   RegisterAsmPrinter<NVPTXAsmPrinter> X(getTheNVPTXTarget32());
   RegisterAsmPrinter<NVPTXAsmPrinter> Y(getTheNVPTXTarget64());
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 85d28a703a4c..ef310e5828f2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -30,6 +30,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
@@ -87,7 +88,7 @@ static cl::opt<bool> EarlyByValArgsCopy(
     cl::desc("Create a copy of byval function arguments early."),
     cl::init(false), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
   // Register the target.
   RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
   RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
diff --git a/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
index e4f0a517599f..24fea037b1c5 100644
--- a/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
+++ b/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/NVPTXTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheNVPTXTarget32() {
@@ -19,7 +20,8 @@ Target &llvm::getTheNVPTXTarget64() {
   return TheNVPTXTarget64;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeNVPTXTargetInfo() {
   RegisterTarget<Triple::nvptx> X(getTheNVPTXTarget32(), "nvptx",
                                   "NVIDIA PTX 32-bit", "NVPTX");
   RegisterTarget<Triple::nvptx64> Y(getTheNVPTXTarget64(), "nvptx64",
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index bb4c2fd3e5cf..2b3727be644d 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -1784,7 +1785,8 @@ bool PPCAsmParser::parseGNUAttribute(SMLoc L) {
 }
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCAsmParser() {
   RegisterMCAsmParser<PPCAsmParser> A(getThePPC32Target());
   RegisterMCAsmParser<PPCAsmParser> B(getThePPC32LETarget());
   RegisterMCAsmParser<PPCAsmParser> C(getThePPC64Target());
diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 0c6c17d5a0b6..71a76142bb38 100644
--- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -13,6 +13,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -50,7 +51,8 @@ static MCDisassembler *createPPCLEDisassembler(const Target &T,
   return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCDisassembler() {
   // Register the disassembler for each target.
   TargetRegistry::RegisterMCDisassembler(getThePPC32Target(),
                                          createPPCDisassembler);
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 44b5732be6e3..dd2756a1a823 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -41,6 +41,7 @@
 #include "llvm/MC/MCXCOFFObjectWriter.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
@@ -473,7 +474,8 @@ static MCInstrAnalysis *createPPCMCInstrAnalysis(const MCInstrInfo *Info) {
   return new PPCMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCTargetMC() {
   for (Target *T : {&getThePPC32Target(), &getThePPC32LETarget(),
                     &getThePPC64Target(), &getThePPC64LETarget()}) {
     // Register the MC asm info.
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index d5d51e3ca638..9e42011c0c74 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -63,6 +63,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -3374,7 +3375,8 @@ INITIALIZE_PASS(PPCAIXAsmPrinter, "ppc-aix-asm-printer",
                 "AIX PPC Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCAsmPrinter() {
   TargetRegistry::RegisterAsmPrinter(getThePPC32Target(),
                                      createPPCAsmPrinterPass);
   TargetRegistry::RegisterAsmPrinter(getThePPC32LETarget(),
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 359a43dd001d..b5c6ac111dff 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
@@ -113,7 +114,8 @@ static cl::opt<unsigned>
                          cl::init(0x7fff),
                          cl::desc("Maximum global merge offset"));
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCTarget() {
   // Register the targets
   RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
   RegisterTargetMachine<PPCTargetMachine> B(getThePPC32LETarget());
diff --git a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index 0bfa0bd5ec0e..982be2746b47 100644
--- a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/PowerPCTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getThePPC32Target() {
@@ -27,7 +28,8 @@ Target &llvm::getThePPC64LETarget() {
   return ThePPC64LETarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCTargetInfo() {
   RegisterTarget<Triple::ppc, /*HasJIT=*/true> W(getThePPC32Target(), "ppc32",
                                                  "PowerPC 32", "PPC");
 
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 510ca5f8c0d9..f1d6f99ba981 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -36,6 +36,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/RISCVAttributes.h"
 #include "llvm/TargetParser/RISCVISAInfo.h"
@@ -4021,7 +4022,8 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
   return false;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVAsmParser() {
   RegisterMCAsmParser<RISCVAsmParser> X(getTheRISCV32Target());
   RegisterMCAsmParser<RISCVAsmParser> Y(getTheRISCV64Target());
 }
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 4363e5c5176c..cbab081a6731 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -64,7 +65,8 @@ static MCDisassembler *createRISCVDisassembler(const Target &T,
   return new RISCVDisassembler(STI, Ctx, T.createMCInstrInfo());
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVDisassembler() {
   // Register the disassembler for each target.
   TargetRegistry::RegisterMCDisassembler(getTheRISCV32Target(),
                                          createRISCVDisassembler);
diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
index e3b89d84a134..ae4430617075 100644
--- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
+++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
@@ -16,6 +16,7 @@
 #include "RISCV.h"
 #include "TargetInfo/RISCVTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "llvm-mca-riscv-custombehaviour"
@@ -344,7 +345,8 @@ createRISCVInstrumentManager(const MCSubtargetInfo &STI,
 }
 
 /// Extern function to initialize the targets for the RISC-V backend
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetMCA() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVTargetMCA() {
   TargetRegistry::RegisterInstrumentManager(getTheRISCV32Target(),
                                             createRISCVInstrumentManager);
   TargetRegistry::RegisterInstrumentManager(getTheRISCV64Target(),
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index f3b93f032588..f66c2d5f99cb 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -28,6 +28,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <bitset>
 
@@ -331,7 +332,8 @@ static MCInstrAnalysis *createRISCVInstrAnalysis(const MCInstrInfo *Info) {
   return new RISCVMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVTargetMC() {
   for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
     TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
     TargetRegistry::RegisterMCObjectFileInfo(*T, createRISCVMCObjectFileInfo);
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 83e9b4b4d7c5..d4d7de289a10 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -38,6 +38,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/RISCVISAInfo.h"
 #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
@@ -610,7 +611,8 @@ void RISCVAsmPrinter::emitFunctionEntryLabel() {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVAsmPrinter() {
   RegisterAsmPrinter<RISCVAsmPrinter> X(getTheRISCV32Target());
   RegisterAsmPrinter<RISCVAsmPrinter> Y(getTheRISCV64Target());
 }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 0bea3bc432b6..b43b915d0ad4 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -34,6 +34,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
@@ -118,7 +119,7 @@ static cl::opt<bool>
                            cl::desc("Enable Machine Pipeliner for RISC-V"),
                            cl::init(false), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
   auto *PR = PassRegistry::getPassRegistry();
diff --git a/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp b/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
index 0a675d684912..fc0965d263a8 100644
--- a/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
+++ b/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/RISCVTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheRISCV32Target() {
@@ -20,7 +21,8 @@ Target &llvm::getTheRISCV64Target() {
   return TheRISCV64Target;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVTargetInfo() {
   RegisterTarget<Triple::riscv32, /*HasJIT=*/true> X(
       getTheRISCV32Target(), "riscv32", "32-bit RISC-V", "RISCV");
   RegisterTarget<Triple::riscv64, /*HasJIT=*/true> Y(
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
index 21a952649ff5..cc77ddd748a9 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 #define GET_INSTRINFO_MC_DESC
 #define ENABLE_INSTR_PREDICATE_VERIFIER
@@ -78,7 +79,8 @@ static MCInstrAnalysis *createSPIRVInstrAnalysis(const MCInstrInfo *Info) {
   return new SPIRVMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSPIRVTargetMC() {
   for (Target *T : {&getTheSPIRV32Target(), &getTheSPIRV64Target(),
                     &getTheSPIRVLogicalTarget()}) {
     RegisterMCAsmInfo<SPIRVMCAsmInfo> X(*T);
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 26b94788b810..1ebfde2a603b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -35,6 +35,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -657,7 +658,8 @@ INITIALIZE_PASS(SPIRVAsmPrinter, "spirv-asm-printer", "SPIRV Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSPIRVAsmPrinter() {
   RegisterAsmPrinter<SPIRVAsmPrinter> X(getTheSPIRV32Target());
   RegisterAsmPrinter<SPIRVAsmPrinter> Y(getTheSPIRV64Target());
   RegisterAsmPrinter<SPIRVAsmPrinter> Z(getTheSPIRVLogicalTarget());
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index 82fe23a22b60..d7cf211ba84d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -28,6 +28,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Pass.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils.h"
@@ -35,7 +36,7 @@
 
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() {
   // Register the target.
   RegisterTargetMachine<SPIRVTargetMachine> X(getTheSPIRV32Target());
   RegisterTargetMachine<SPIRVTargetMachine> Y(getTheSPIRV64Target());
diff --git a/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp
index febefc024920..c4d086d7da5c 100644
--- a/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp
+++ b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/SPIRVTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -24,7 +25,8 @@ Target &llvm::getTheSPIRVLogicalTarget() {
   return TheSPIRVLogicalTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSPIRVTargetInfo() {
   RegisterTarget<Triple::spirv32> X(getTheSPIRV32Target(), "spirv32",
                                     "SPIR-V 32-bit", "SPIRV");
   RegisterTarget<Triple::spirv64> Y(getTheSPIRV64Target(), "spirv64",
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 28ae34903166..f1009999dc1b 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -29,6 +29,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
@@ -1750,7 +1751,8 @@ bool SparcAsmParser::isPossibleExpression(const AsmToken &Token) {
   }
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcAsmParser() {
   RegisterMCAsmParser<SparcAsmParser> A(getTheSparcTarget());
   RegisterMCAsmParser<SparcAsmParser> B(getTheSparcV9Target());
   RegisterMCAsmParser<SparcAsmParser> C(getTheSparcelTarget());
diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index 173fe3df0d95..fab94fb4d40c 100644
--- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -18,6 +18,7 @@
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -46,8 +47,8 @@ static MCDisassembler *createSparcDisassembler(const Target &T,
   return new SparcDisassembler(STI, Ctx);
 }
 
-
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheSparcTarget(),
                                          createSparcDisassembler);
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 72f9b3bcd968..fa07578e512b 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 
 namespace llvm {
@@ -107,7 +108,8 @@ static MCInstPrinter *createSparcMCInstPrinter(const Triple &T,
   return new SparcInstPrinter(MAI, MII, MRI);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfoFn X(getTheSparcTarget(), createSparcMCAsmInfo);
   RegisterMCAsmInfoFn Y(getTheSparcV9Target(), createSparcV9MCAsmInfo);
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 5366e905d6df..8e7e2e5f7370 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -32,6 +32,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -506,7 +507,8 @@ INITIALIZE_PASS(SparcAsmPrinter, "sparc-asm-printer", "Sparc Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcAsmPrinter() {
   RegisterAsmPrinter<SparcAsmPrinter> X(getTheSparcTarget());
   RegisterAsmPrinter<SparcAsmPrinter> Y(getTheSparcV9Target());
   RegisterAsmPrinter<SparcAsmPrinter> Z(getTheSparcelTarget());
diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index d4d8cbb044de..52076a6b4dd2 100644
--- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -18,10 +18,11 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include <optional>
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTarget() {
   // Register the target.
   RegisterTargetMachine<SparcV8TargetMachine> X(getTheSparcTarget());
   RegisterTargetMachine<SparcV9TargetMachine> Y(getTheSparcV9Target());
diff --git a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index 1f8837eb0194..2bfcffbd4fd0 100644
--- a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/SparcTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheSparcTarget() {
@@ -23,7 +24,8 @@ Target &llvm::getTheSparcelTarget() {
   return TheSparcelTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcTargetInfo() {
   RegisterTarget<Triple::sparc, /*HasJIT=*/false> X(getTheSparcTarget(),
                                                     "sparc", "Sparc", "Sparc");
   RegisterTarget<Triple::sparcv9, /*HasJIT=*/false> Y(
diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 6ee2a87565ba..04a4c3610924 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -30,6 +30,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
@@ -1784,6 +1785,7 @@ bool SystemZAsmParser::isLabel(AsmToken &Token) {
 
 // Force static initialization.
 // NOLINTNEXTLINE(readability-identifier-naming)
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZAsmParser() {
   RegisterMCAsmParser<SystemZAsmParser> X(getTheSystemZTarget());
 }
diff --git a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index bd188f5b4b52..6ae529e97418 100644
--- a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -13,6 +13,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include <cassert>
 #include <cstdint>
@@ -45,7 +46,8 @@ static MCDisassembler *createSystemZDisassembler(const Target &T,
 }
 
 // NOLINTNEXTLINE(readability-identifier-naming)
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheSystemZTarget(),
                                          createSystemZDisassembler);
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index d2ed5cac5c57..86e340b7ff1b 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -239,7 +240,8 @@ static MCInstrAnalysis *createSystemZMCInstrAnalysis(const MCInstrInfo *Info) {
   return new MCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZTargetMC() {
   // Register the MCAsmInfo.
   TargetRegistry::RegisterMCAsmInfo(getTheSystemZTarget(),
                                     createSystemZMCAsmInfo);
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index aaf12b88de13..6f9d25c050b7 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -31,6 +31,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Chrono.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ConvertEBCDIC.h"
 #include "llvm/Support/FormatVariadic.h"
 
@@ -1738,6 +1739,7 @@ INITIALIZE_PASS(SystemZAsmPrinter, "systemz-asm-printer",
                 "SystemZ Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZAsmPrinter() {
   RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget());
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index ddb5a730a6fd..ece8928accd0 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -22,6 +22,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Transforms/Scalar.h"
 #include <memory>
@@ -36,7 +37,8 @@ static cl::opt<bool> EnableMachineCombinerPass(
     cl::init(true), cl::Hidden);
 
 // NOLINTNEXTLINE(readability-identifier-naming)
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZTarget() {
   // Register the target.
   RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget());
   auto &PR = *PassRegistry::getPassRegistry();
diff --git a/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
index 91e4c91b00b9..703051f6f2d3 100644
--- a/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
+++ b/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/SystemZTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -17,7 +18,8 @@ Target &llvm::getTheSystemZTarget() {
 }
 
 // NOLINTNEXTLINE(readability-identifier-naming)
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZTargetInfo() {
   RegisterTarget<Triple::systemz, /*HasJIT=*/true> X(
       getTheSystemZTarget(), "systemz", "SystemZ", "SystemZ");
 }
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index c54ce40de45f..7987950a2a0a 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 
@@ -1510,7 +1511,7 @@ ParseStatus VEAsmParser::parseVEAsmOperand(std::unique_ptr<VEOperand> &Op) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmParser() {
   RegisterMCAsmParser<VEAsmParser> A(getTheVETarget());
 }
 
diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
index 00487a1f5bb3..88200c5fc97e 100644
--- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
+++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -47,7 +48,8 @@ static MCDisassembler *createVEDisassembler(const Target &T,
   return new VEDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeVEDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheVETarget(),
                                          createVEDisassembler);
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
index 019748413d32..699ef9808eb8 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
@@ -84,7 +85,7 @@ static MCInstPrinter *createVEMCInstPrinter(const Triple &T,
   return new VEInstPrinter(MAI, MII, MRI);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfoFn X(getTheVETarget(), createVEMCAsmInfo);
 
diff --git a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
index 7c4bf1cfd672..dcc54b4cec01 100644
--- a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
+++ b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/VETargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -16,7 +17,7 @@ Target &llvm::getTheVETarget() {
   return TheVETarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetInfo() {
   RegisterTarget<Triple::ve, /*HasJIT=*/false> X(getTheVETarget(), "ve",
                                                  "VE", "VE");
 }
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index af0dc0404d3c..f7d770c18f88 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -28,6 +28,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -419,6 +420,6 @@ INITIALIZE_PASS(VEAsmPrinter, "ve-asm-printer", "VE Assembly Printer", false,
                 false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmPrinter() {
   RegisterAsmPrinter<VEAsmPrinter> X(getTheVETarget());
 }
diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp
index 664a54cea7c5..14b8e330d87a 100644
--- a/llvm/lib/Target/VE/VETargetMachine.cpp
+++ b/llvm/lib/Target/VE/VETargetMachine.cpp
@@ -19,13 +19,14 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include <optional>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "ve"
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETarget() {
   // Register the target.
   RegisterTargetMachine<VETargetMachine> X(getTheVETarget());
 
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 9649381f07b1..e4140755edf4 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -32,6 +32,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/SourceMgr.h"
 
 using namespace llvm;
@@ -1282,7 +1283,8 @@ public:
 } // end anonymous namespace
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyAsmParser() {
   RegisterMCAsmParser<WebAssemblyAsmParser> X(getTheWebAssemblyTarget32());
   RegisterMCAsmParser<WebAssemblyAsmParser> Y(getTheWebAssemblyTarget64());
 }
diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 0399f9d38e4e..8a29a5902ce2 100644
--- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/LEB128.h"
 
@@ -66,7 +67,7 @@ static MCDisassembler *createWebAssemblyDisassembler(const Target &T,
   return new WebAssemblyDisassembler(STI, Ctx, std::move(MCII));
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
 LLVMInitializeWebAssemblyDisassembler() {
   // Register the disassembler for each target.
   TargetRegistry::RegisterMCDisassembler(getTheWebAssemblyTarget32(),
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index a4162a07ee33..6c0031f429c6 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
@@ -124,7 +125,8 @@ static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyTargetMC() {
   for (Target *T :
        {&getTheWebAssemblyTarget32(), &getTheWebAssemblyTarget64()}) {
     // Register the MC asm info.
diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
index ef2c77ade8cc..e65fa8e60aeb 100644
--- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -13,6 +13,7 @@
 
 #include "TargetInfo/WebAssemblyTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "wasm-target-info"
@@ -26,7 +27,8 @@ Target &llvm::getTheWebAssemblyTarget64() {
   return TheWebAssemblyTarget64;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyTargetInfo() {
   RegisterTarget<Triple::wasm32> X(getTheWebAssemblyTarget32(), "wasm32",
                                    "WebAssembly 32-bit", "WebAssembly");
   RegisterTarget<Triple::wasm64> Y(getTheWebAssemblyTarget64(), "wasm64",
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index c61ed3c7d5d8..b43b7dbfc36b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -46,6 +46,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -759,7 +760,8 @@ INITIALIZE_PASS(WebAssemblyAsmPrinter, "webassembly-asm-printer",
                 "WebAssembly Assmebly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyAsmPrinter() {
   RegisterAsmPrinter<WebAssemblyAsmPrinter> X(getTheWebAssemblyTarget32());
   RegisterAsmPrinter<WebAssemblyAsmPrinter> Y(getTheWebAssemblyTarget64());
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index adb446b20ebf..6e551e5c8ee4 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -28,6 +28,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LowerAtomicPass.h"
@@ -53,7 +54,8 @@ static cl::opt<bool> WasmDisableFixIrreducibleControlFlowPass(
              " irreducible control flow optimization pass"),
     cl::init(false));
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyTarget() {
   // Register the target.
   RegisterTargetMachine<WebAssemblyTargetMachine> X(
       getTheWebAssemblyTarget32());
diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 57801752f170..d36f18238f7a 100644
--- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -750,7 +751,8 @@ static MCDisassembler *createXCoreDisassembler(const Target &T,
   return new XCoreDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeXCoreDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheXCoreTarget(),
                                          createXCoreDisassembler);
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 098d874f2149..0ef2da04171e 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
@@ -125,7 +126,8 @@ static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeXCoreTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfoFn X(getTheXCoreTarget(), createXCoreMCAsmInfo);
 
diff --git a/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
index 8916c6ca7be7..556b31eab8b7 100644
--- a/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
+++ b/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/XCoreTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheXCoreTarget() {
@@ -15,7 +16,8 @@ Target &llvm::getTheXCoreTarget() {
   return TheXCoreTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeXCoreTargetInfo() {
   RegisterTarget<Triple::xcore> X(getTheXCoreTarget(), "xcore", "XCore",
                                   "XCore");
 }
diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
index b10b3056d82b..0426088caf24 100644
--- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -33,6 +33,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -295,6 +296,7 @@ INITIALIZE_PASS(XCoreAsmPrinter, "xcore-asm-printer", "XCore Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeXCoreAsmPrinter() {
   RegisterAsmPrinter<XCoreAsmPrinter> X(getTheXCoreTarget());
 }
diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
index 3627b81a4805..88f46c38b2f9 100644
--- a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include <optional>
 
 using namespace llvm;
@@ -102,7 +103,7 @@ void XCorePassConfig::addPreEmitPass() {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTarget() {
   RegisterTargetMachine<XCoreTargetMachine> X(getTheXCoreTarget());
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeXCoreAsmPrinterPass(PR);
diff --git a/llvm/unittests/Target/AArch64/CMakeLists.txt b/llvm/unittests/Target/AArch64/CMakeLists.txt
index 67eb508e9bab..9387ca90dd31 100644
--- a/llvm/unittests/Target/AArch64/CMakeLists.txt
+++ b/llvm/unittests/Target/AArch64/CMakeLists.txt
@@ -16,6 +16,7 @@ set(LLVM_LINK_COMPONENTS
   GlobalISel
   MC
   MIRParser
+  Passes
   SelectionDAG
   Support
   Target
diff --git a/llvm/unittests/Target/LoongArch/CMakeLists.txt b/llvm/unittests/Target/LoongArch/CMakeLists.txt
index 6e7e49b4cb4e..c3d33418a03a 100644
--- a/llvm/unittests/Target/LoongArch/CMakeLists.txt
+++ b/llvm/unittests/Target/LoongArch/CMakeLists.txt
@@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS
   LoongArchCodeGen
   LoongArchDesc
   LoongArchInfo
+  Instrumentation
   MC
   MIRParser
   SelectionDAG
diff --git a/llvm/unittests/Target/RISCV/CMakeLists.txt b/llvm/unittests/Target/RISCV/CMakeLists.txt
index 10d6412f9b35..8da8c3896faf 100644
--- a/llvm/unittests/Target/RISCV/CMakeLists.txt
+++ b/llvm/unittests/Target/RISCV/CMakeLists.txt
@@ -11,6 +11,7 @@ set(LLVM_LINK_COMPONENTS
   CodeGen
   Core
   MC
+  Passes
   SelectionDAG
   TargetParser
   )
diff --git a/llvm/unittests/Target/SPIRV/CMakeLists.txt b/llvm/unittests/Target/SPIRV/CMakeLists.txt
index d7f0290089c4..29b31b16094a 100644
--- a/llvm/unittests/Target/SPIRV/CMakeLists.txt
+++ b/llvm/unittests/Target/SPIRV/CMakeLists.txt
@@ -8,6 +8,7 @@ set(LLVM_LINK_COMPONENTS
   AsmParser
   BinaryFormat
   Core
+  Passes
   SPIRVCodeGen
   SPIRVAnalysis
   Support
diff --git a/llvm/unittests/Target/VE/CMakeLists.txt b/llvm/unittests/Target/VE/CMakeLists.txt
index 271bf07f5b5d..de823306a9ae 100644
--- a/llvm/unittests/Target/VE/CMakeLists.txt
+++ b/llvm/unittests/Target/VE/CMakeLists.txt
@@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS
   CodeGen
   Core
   GlobalISel
+  Instrumentation
   MC
   SelectionDAG
   Support
diff --git a/llvm/unittests/Target/WebAssembly/CMakeLists.txt b/llvm/unittests/Target/WebAssembly/CMakeLists.txt
index b1e01169e7a0..b1e180d218c1 100644
--- a/llvm/unittests/Target/WebAssembly/CMakeLists.txt
+++ b/llvm/unittests/Target/WebAssembly/CMakeLists.txt
@@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS
   CodeGen
   CodeGenTypes
   Core
+  Instrumentation
   MC
   MIRParser
   TargetParser

From 7b7b5a397da1ecb9f767df5a3a3b6076cec109f9 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Tue, 17 Jun 2025 13:29:45 -0700
Subject: [PATCH 0704/1322] [AMDGPU] Remove AsmVOP3OpSel field completely.
 NFCI. (#144574)

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td      | 1 -
 llvm/lib/Target/AMDGPU/VOP1Instructions.td | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e74ccbee975a..343482604ae5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2652,7 +2652,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
    HasSrc2Mods, DstVT, HasFP8ByteSel, HasBitOp3>.ret;
   field string Asm64 = AsmVOP3Base;
   field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasNeg, HasClamp, HasOpSel>.ret;
-  field string AsmVOP3OpSel = AsmVOP3Base;
   field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3Base>.ret;
   field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3Base>.ret;
   field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3Base>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 7fdd951ecbd3..926df955881e 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -393,7 +393,6 @@ def VOP_PERMLANE_SWAP : VOPProfile<[i32, i32, untyped, untyped]> {
   let Ins64 = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
   let InsVOP3OpSel = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
   let Asm64 = "$vdst, $src0$bound_ctrl$fi";
-  let AsmVOP3OpSel = "$vdst, $src0$bound_ctrl$fi";
 }
 
 // Special case because there are no true output operands.  Hack vdst

From 8dcf4ba6359578c4d944b75b3f96a1fbd4fb9528 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Tue, 17 Jun 2025 13:30:50 -0700
Subject: [PATCH 0705/1322] [AMDGPU] Fix getAsmVOP3Base call agruments.
 (#144572)

https://github.com/llvm/llvm-project/pull/143465 has removed
getAsmVOP3OpSel and uses getAsmVOP3Base instead, but original
call to getAsmVOP3OpSel was using HasSrc*FloatMods and the
call to getAsmVOP3Base uses HasSrc*Mods. This does not play
well with opsel. An opsel instruction has modifiers in dag but
shall not have them in the asm string.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td      |  4 ++--
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 ++++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 343482604ae5..768f57c469d6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2648,8 +2648,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   // the asm operand name via this HasModifiers flag
   field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
   field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
-   HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0Mods, HasSrc1Mods,
-   HasSrc2Mods, DstVT, HasFP8ByteSel, HasBitOp3>.ret;
+   HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0FloatMods, HasSrc1FloatMods,
+   HasSrc2FloatMods, DstVT, HasFP8ByteSel, HasBitOp3>.ret;
   field string Asm64 = AsmVOP3Base;
   field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasNeg, HasClamp, HasOpSel>.ret;
   field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3Base>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index f372101cb7b7..2dbc119f65cd 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1126,6 +1126,9 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
   let HasModifiers = 0;
   let HasSrc0IntMods = 0;
   let HasSrc1IntMods = 0;
+  let HasSrc0FloatMods = 0;
+  let HasSrc1FloatMods = 0;
+  let HasSrc2FloatMods = 0;
   let HasOMod = 0;
   let HasOpSel = 0;
   let HasClamp = 0;
@@ -1562,9 +1565,12 @@ let SubtargetPredicate = HasPseudoScalarTrans in {
   def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
 }
 
+let HasModifiers = 1 in
+def ASHR_PK_I8_Profile : VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>;
+
 let SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 in {
-  defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_i8_i32>;
-  defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_u8_i32>;
+  defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", ASHR_PK_I8_Profile, int_amdgcn_ashr_pk_i8_i32>;
+  defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", ASHR_PK_I8_Profile, int_amdgcn_ashr_pk_u8_i32>;
 } // End SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1
 
 class AshrPkI8Pat<VOP3_Pseudo inst, int lo, int hi>: GCNPat<

From 73f307a5ca308d356c557734765742c26bf7ed03 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Tue, 17 Jun 2025 13:32:11 -0700
Subject: [PATCH 0706/1322] [HLSL] Use ExtVector for firstbit intrinsics
 (#142679)

Fixes https://github.com/llvm/llvm-project/issues/142430

firstbit intrinsics were using the wrong vector type which causes some
conversions to fail. This PR switches them to ExtVector which resolves
the issue
---
 clang/lib/Sema/SemaHLSL.cpp                       | 8 ++++----
 clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl | 8 ++++++++
 clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl  | 8 ++++++++
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 4a8479a00e0e..b55f4fd786b5 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2231,8 +2231,9 @@ static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
                                        QualType ReturnType) {
   auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>();
   if (VecTyA)
-    ReturnType = S->Context.getVectorType(ReturnType, VecTyA->getNumElements(),
-                                          VectorKind::Generic);
+    ReturnType =
+        S->Context.getExtVectorType(ReturnType, VecTyA->getNumElements());
+
   TheCall->setType(ReturnType);
 }
 
@@ -2545,8 +2546,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
 
     if (auto *VecTy = EltTy->getAs<VectorType>()) {
       EltTy = VecTy->getElementType();
-      ResTy = SemaRef.Context.getVectorType(ResTy, VecTy->getNumElements(),
-                                            VecTy->getVectorKind());
+      ResTy = SemaRef.Context.getExtVectorType(ResTy, VecTy->getNumElements());
     }
 
     if (!EltTy->isIntegerType()) {
diff --git a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
index debf6b6d3e3f..a71b1878f8b5 100644
--- a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
@@ -151,3 +151,11 @@ uint3 test_firstbithigh_long3(int64_t3 p0) {
 uint4 test_firstbithigh_long4(int64_t4 p0) {
   return firstbithigh(p0);
 }
+
+// CHECK-LABEL: test_firstbithigh_upcast
+// CHECK: [[FBH:%.*]] = call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32(<4 x i32> %{{.*}})
+// CHECK: [[CONV:%.*]] = zext <4 x i32> [[FBH]] to <4 x i64>
+// CHECK: ret <4 x i64> [[CONV]]
+uint64_t4 test_firstbithigh_upcast(uint4 p0) {
+  return firstbithigh(p0);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
index 5d490fabc5bc..007db0c9c2ad 100644
--- a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
@@ -151,3 +151,11 @@ uint3 test_firstbitlow_long3(int64_t3 p0) {
 uint4 test_firstbitlow_long4(int64_t4 p0) {
   return firstbitlow(p0);
 }
+
+// CHECK-LABEL: test_firstbitlow_upcast
+// CHECK: [[FBL:%.*]] = call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i32(<4 x i32> %{{.*}})
+// CHECK: [[CONV:%.*]] = zext <4 x i32> [[FBL]] to <4 x i64>
+// CHECK: ret <4 x i64> [[CONV]]
+uint64_t4 test_firstbitlow_upcast(uint4 p0) {
+  return firstbitlow(p0);
+}

From a79186c1ea62bbe0579e0b1eed4ad507966cca41 Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Tue, 17 Jun 2025 13:36:15 -0700
Subject: [PATCH 0707/1322] [Driver] Fix Arm/AArch64 Link Argument tests
 (#144582)

The openmp-offload-amdgpu-runtime-2 bot specifies default rtlib of
compiler-rt, but default unwindlib of libgcc. Change the tests to accept
that there may be `"--as-needed" "-lgcc_s" "--no-as-needed"` between
`libclang_rt.builtins.a` and `-lc`.

Relates to #121830
---
 clang/test/Driver/aarch64-toolchain.c | 3 ++-
 clang/test/Driver/arm-toolchain.c     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
index e12107fa2c50..327161b81d9f 100644
--- a/clang/test/Driver/aarch64-toolchain.c
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -135,7 +135,8 @@
 
 // AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
 // AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
-// AARCH64-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a" "-lc" "-lgloss" "--end-group"
+// AARCH64-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a"
+// AARCH64-BAREMETAL-COMPILER-RT: "-lc" "-lgloss" "--end-group"
 // AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
index d4f9bf2aaf3d..5368158cdeed 100644
--- a/clang/test/Driver/arm-toolchain.c
+++ b/clang/test/Driver/arm-toolchain.c
@@ -136,7 +136,8 @@
 
 // ARM-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
 // ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
-// ARM-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a" "-lc" "-lgloss" "--end-group"
+// ARM-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a"
+// ARM-BAREMETAL-COMPILER-RT: "-lc" "-lgloss" "--end-group"
 // ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \

From 7c4b2be983e900663a8d766ea9dc6f03b713e5b0 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 17 Jun 2025 16:38:27 -0400
Subject: [PATCH 0708/1322] [libc++][NFC] Refactor basic_streambuf to use
 public API functions when possible (#144547)

The implementation of std::basic_streambuf used private member variables
to manipulate the get and the put areas. Using public API functions is
equivalent but leads to code that is easier to understand, since the
public API functions are known more widely than our internal member
variables. Using the public API functions removes the need to map the
internal member variables back to get/put area manipulation functions in
one's head.

Finally, it also makes it easier to find subtle issues by instrumenting
accessor functions, which is impossible if the class uses the member
variables directly.
---
 libcxx/include/streambuf | 53 ++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/libcxx/include/streambuf b/libcxx/include/streambuf
index e25647909378..585ae7af65aa 100644
--- a/libcxx/include/streambuf
+++ b/libcxx/include/streambuf
@@ -178,8 +178,8 @@ public:
   // Get and put areas:
   // 27.6.2.2.3 Get area:
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 streamsize in_avail() {
-    if (__ninp_ < __einp_)
-      return static_cast<streamsize>(__einp_ - __ninp_);
+    if (gptr() < egptr())
+      return static_cast<streamsize>(egptr() - gptr());
     return showmanyc();
   }
 
@@ -190,37 +190,42 @@ public:
   }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sbumpc() {
-    if (__ninp_ == __einp_)
+    if (gptr() == egptr())
       return uflow();
-    return traits_type::to_int_type(*__ninp_++);
+    int_type __c = traits_type::to_int_type(*gptr());
+    this->gbump(1);
+    return __c;
   }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sgetc() {
-    if (__ninp_ == __einp_)
+    if (gptr() == egptr())
       return underflow();
-    return traits_type::to_int_type(*__ninp_);
+    return traits_type::to_int_type(*gptr());
   }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 streamsize sgetn(char_type* __s, streamsize __n) { return xsgetn(__s, __n); }
 
   // 27.6.2.2.4 Putback:
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sputbackc(char_type __c) {
-    if (__binp_ == __ninp_ || !traits_type::eq(__c, __ninp_[-1]))
+    if (eback() == gptr() || !traits_type::eq(__c, *(gptr() - 1)))
       return pbackfail(traits_type::to_int_type(__c));
-    return traits_type::to_int_type(*--__ninp_);
+    this->gbump(-1);
+    return traits_type::to_int_type(*gptr());
   }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sungetc() {
-    if (__binp_ == __ninp_)
+    if (eback() == gptr())
       return pbackfail();
-    return traits_type::to_int_type(*--__ninp_);
+    this->gbump(-1);
+    return traits_type::to_int_type(*gptr());
   }
 
   // 27.6.2.2.5 Put area:
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sputc(char_type __c) {
-    if (__nout_ == __eout_)
+    if (pptr() == epptr())
       return overflow(traits_type::to_int_type(__c));
-    *__nout_++ = __c;
+    *pptr() = __c;
+    this->pbump(1);
     return traits_type::to_int_type(__c);
   }
 
@@ -312,17 +317,16 @@ protected:
   virtual streamsize showmanyc() { return 0; }
 
   virtual streamsize xsgetn(char_type* __s, streamsize __n) {
-    const int_type __eof = traits_type::eof();
     int_type __c;
     streamsize __i = 0;
     while (__i < __n) {
-      if (__ninp_ < __einp_) {
-        const streamsize __len = std::min(static_cast<streamsize>(INT_MAX), std::min(__einp_ - __ninp_, __n - __i));
-        traits_type::copy(__s, __ninp_, __len);
+      if (gptr() < egptr()) {
+        const streamsize __len = std::min(static_cast<streamsize>(INT_MAX), std::min(egptr() - gptr(), __n - __i));
+        traits_type::copy(__s, gptr(), __len);
         __s += __len;
         __i += __len;
         this->gbump(__len);
-      } else if ((__c = uflow()) != __eof) {
+      } else if ((__c = uflow()) != traits_type::eof()) {
         *__s = traits_type::to_char_type(__c);
         ++__s;
         ++__i;
@@ -336,7 +340,9 @@ protected:
   virtual int_type uflow() {
     if (underflow() == traits_type::eof())
       return traits_type::eof();
-    return traits_type::to_int_type(*__ninp_++);
+    int_type __c = traits_type::to_int_type(*gptr());
+    this->gbump(1);
+    return __c;
   }
 
   // 27.6.2.4.4 Putback:
@@ -345,17 +351,16 @@ protected:
   // 27.6.2.4.5 Put area:
   virtual streamsize xsputn(const char_type* __s, streamsize __n) {
     streamsize __i = 0;
-    int_type __eof = traits_type::eof();
     while (__i < __n) {
-      if (__nout_ >= __eout_) {
-        if (overflow(traits_type::to_int_type(*__s)) == __eof)
+      if (pptr() >= epptr()) {
+        if (overflow(traits_type::to_int_type(*__s)) == traits_type::eof())
           break;
         ++__s;
         ++__i;
       } else {
-        streamsize __chunk_size = std::min(__eout_ - __nout_, __n - __i);
-        traits_type::copy(__nout_, __s, __chunk_size);
-        __nout_ += __chunk_size;
+        streamsize __chunk_size = std::min(epptr() - pptr(), __n - __i);
+        traits_type::copy(pptr(), __s, __chunk_size);
+        __pbump(__chunk_size);
         __s += __chunk_size;
         __i += __chunk_size;
       }

From 9ae4d2e01331ddeb2543f1940a09ef9c76ff5268 Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail@gmail.com>
Date: Tue, 17 Jun 2025 22:44:02 +0200
Subject: [PATCH 0709/1322] [LLVM] [Support] Disable `ioctl()` terminal size
 check on Solaris (#144600)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#143514 broke the `clang-solaris11-sparcv9` bot; from what I can tell
that’s Solaris and according to `SolarisTargetInfo::getOSDefines`, the
macro `__sun__` should be defined on Solaris, so check for that and
don’t try to query the terminal size if it is defined.

Not sure this is the best solution but hopefully it fixes the bot.
---
 llvm/lib/Support/Unix/Process.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index db735b7484ad..c6e79af44b9b 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -320,7 +320,7 @@ static unsigned getColumns(int FileID) {
   // instead if it isn't available.
   unsigned Columns = 0;
 
-#ifdef HAVE_SYS_IOCTL_H
+#if defined(HAVE_SYS_IOCTL_H) && !defined(__sun__)
   struct winsize ws;
   if (ioctl(FileID, TIOCGWINSZ, &ws) == 0)
     Columns = ws.ws_col;

From c677a11c8d3223480cfe772e63fa0e7c09c76e2e Mon Sep 17 00:00:00 2001
From: David Peixotto <peix@meta.com>
Date: Tue, 17 Jun 2025 13:47:20 -0700
Subject: [PATCH 0710/1322] [lldb] Add support to list/enable/disable remaining
 plugin types. (#143970)

In #134418 we added support to list/enable/disable `SystemRuntime` and
`InstrumentationRuntime` plugins. We limited it to those two plugin
types to flesh out the idea with a smaller change.

This PR adds support for the remaining plugin types. We now support all
the plugins that can be registered directly with the plugin manager.
Plugins that are added by loading shared objects are still not
supported.
---
 lldb/include/lldb/Core/PluginManager.h        | 108 ++++-
 lldb/source/Core/PluginManager.cpp            | 441 ++++++++++++++++--
 lldb/test/API/commands/plugin/TestPlugin.py   |  62 +++
 .../Shell/Commands/command-plugin-list.test   |   8 +-
 4 files changed, 566 insertions(+), 53 deletions(-)
 create mode 100644 lldb/test/API/commands/plugin/TestPlugin.py

diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index e7b169103111..1d7c976f3c38 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -236,12 +236,6 @@ public:
   static SystemRuntimeCreateInstance
   GetSystemRuntimeCreateCallbackAtIndex(uint32_t idx);
 
-  static std::vector<RegisteredPluginInfo> GetSystemRuntimePluginInfo();
-
-  // Modify the enabled state of a SystemRuntime plugin.
-  // Returns false if the plugin name is not found.
-  static bool SetSystemRuntimePluginEnabled(llvm::StringRef name, bool enabled);
-
   // ObjectFile
   static bool
   RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
@@ -549,12 +543,6 @@ public:
   static InstrumentationRuntimeCreateInstance
   GetInstrumentationRuntimeCreateCallbackAtIndex(uint32_t idx);
 
-  static std::vector<RegisteredPluginInfo>
-  GetInstrumentationRuntimePluginInfo();
-
-  static bool SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
-                                                     bool enabled);
-
   // TypeSystem
   static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
                              TypeSystemCreateInstance create_callback,
@@ -690,6 +678,102 @@ public:
   static bool CreateSettingForCPlusPlusLanguagePlugin(
       Debugger &debugger, const lldb::OptionValuePropertiesSP &properties_sp,
       llvm::StringRef description, bool is_global_property);
+
+  //
+  // Plugin Info+Enable Declarations
+  //
+  static std::vector<RegisteredPluginInfo> GetABIPluginInfo();
+  static bool SetABIPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetArchitecturePluginInfo();
+  static bool SetArchitecturePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetDisassemblerPluginInfo();
+  static bool SetDisassemblerPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetDynamicLoaderPluginInfo();
+  static bool SetDynamicLoaderPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetEmulateInstructionPluginInfo();
+  static bool SetEmulateInstructionPluginEnabled(llvm::StringRef name,
+                                                 bool enable);
+
+  static std::vector<RegisteredPluginInfo>
+  GetInstrumentationRuntimePluginInfo();
+  static bool SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
+                                                     bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetJITLoaderPluginInfo();
+  static bool SetJITLoaderPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetLanguagePluginInfo();
+  static bool SetLanguagePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetLanguageRuntimePluginInfo();
+  static bool SetLanguageRuntimePluginEnabled(llvm::StringRef name,
+                                              bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetMemoryHistoryPluginInfo();
+  static bool SetMemoryHistoryPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetObjectContainerPluginInfo();
+  static bool SetObjectContainerPluginEnabled(llvm::StringRef name,
+                                              bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetObjectFilePluginInfo();
+  static bool SetObjectFilePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetOperatingSystemPluginInfo();
+  static bool SetOperatingSystemPluginEnabled(llvm::StringRef name,
+                                              bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetPlatformPluginInfo();
+  static bool SetPlatformPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetProcessPluginInfo();
+  static bool SetProcessPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetREPLPluginInfo();
+  static bool SetREPLPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetRegisterTypeBuilderPluginInfo();
+  static bool SetRegisterTypeBuilderPluginEnabled(llvm::StringRef name,
+                                                  bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetScriptInterpreterPluginInfo();
+  static bool SetScriptInterpreterPluginEnabled(llvm::StringRef name,
+                                                bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetScriptedInterfacePluginInfo();
+  static bool SetScriptedInterfacePluginEnabled(llvm::StringRef name,
+                                                bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetStructuredDataPluginInfo();
+  static bool SetStructuredDataPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetSymbolFilePluginInfo();
+  static bool SetSymbolFilePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetSymbolLocatorPluginInfo();
+  static bool SetSymbolLocatorPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetSymbolVendorPluginInfo();
+  static bool SetSymbolVendorPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetSystemRuntimePluginInfo();
+  static bool SetSystemRuntimePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetTracePluginInfo();
+  static bool SetTracePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetTraceExporterPluginInfo();
+  static bool SetTraceExporterPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetTypeSystemPluginInfo();
+  static bool SetTypeSystemPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetUnwindAssemblyPluginInfo();
+  static bool SetUnwindAssemblyPluginEnabled(llvm::StringRef name, bool enable);
 };
 
 } // namespace lldb_private
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index 5d44434033c5..dfa865929b64 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -182,14 +182,176 @@ void PluginManager::Terminate() {
 }
 
 llvm::ArrayRef<PluginNamespace> PluginManager::GetPluginNamespaces() {
-  // Currently supported set of plugin namespaces. This will be expanded
-  // over time.
   static PluginNamespace PluginNamespaces[] = {
-      {"system-runtime", PluginManager::GetSystemRuntimePluginInfo,
-       PluginManager::SetSystemRuntimePluginEnabled},
-      {"instrumentation-runtime",
-       PluginManager::GetInstrumentationRuntimePluginInfo,
-       PluginManager::SetInstrumentationRuntimePluginEnabled}};
+
+      {
+          "abi",
+          PluginManager::GetABIPluginInfo,
+          PluginManager::SetABIPluginEnabled,
+      },
+
+      {
+          "architecture",
+          PluginManager::GetArchitecturePluginInfo,
+          PluginManager::SetArchitecturePluginEnabled,
+      },
+
+      {
+          "disassembler",
+          PluginManager::GetDisassemblerPluginInfo,
+          PluginManager::SetDisassemblerPluginEnabled,
+      },
+
+      {
+          "dynamic-loader",
+          PluginManager::GetDynamicLoaderPluginInfo,
+          PluginManager::SetDynamicLoaderPluginEnabled,
+      },
+
+      {
+          "emulate-instruction",
+          PluginManager::GetEmulateInstructionPluginInfo,
+          PluginManager::SetEmulateInstructionPluginEnabled,
+      },
+
+      {
+          "instrumentation-runtime",
+          PluginManager::GetInstrumentationRuntimePluginInfo,
+          PluginManager::SetInstrumentationRuntimePluginEnabled,
+      },
+
+      {
+          "jit-loader",
+          PluginManager::GetJITLoaderPluginInfo,
+          PluginManager::SetJITLoaderPluginEnabled,
+      },
+
+      {
+          "language",
+          PluginManager::GetLanguagePluginInfo,
+          PluginManager::SetLanguagePluginEnabled,
+      },
+
+      {
+          "language-runtime",
+          PluginManager::GetLanguageRuntimePluginInfo,
+          PluginManager::SetLanguageRuntimePluginEnabled,
+      },
+
+      {
+          "memory-history",
+          PluginManager::GetMemoryHistoryPluginInfo,
+          PluginManager::SetMemoryHistoryPluginEnabled,
+      },
+
+      {
+          "object-container",
+          PluginManager::GetObjectContainerPluginInfo,
+          PluginManager::SetObjectContainerPluginEnabled,
+      },
+
+      {
+          "object-file",
+          PluginManager::GetObjectFilePluginInfo,
+          PluginManager::SetObjectFilePluginEnabled,
+      },
+
+      {
+          "operating-system",
+          PluginManager::GetOperatingSystemPluginInfo,
+          PluginManager::SetOperatingSystemPluginEnabled,
+      },
+
+      {
+          "platform",
+          PluginManager::GetPlatformPluginInfo,
+          PluginManager::SetPlatformPluginEnabled,
+      },
+
+      {
+          "process",
+          PluginManager::GetProcessPluginInfo,
+          PluginManager::SetProcessPluginEnabled,
+      },
+
+      {
+          "repl",
+          PluginManager::GetREPLPluginInfo,
+          PluginManager::SetREPLPluginEnabled,
+      },
+
+      {
+          "register-type-builder",
+          PluginManager::GetRegisterTypeBuilderPluginInfo,
+          PluginManager::SetRegisterTypeBuilderPluginEnabled,
+      },
+
+      {
+          "script-interpreter",
+          PluginManager::GetScriptInterpreterPluginInfo,
+          PluginManager::SetScriptInterpreterPluginEnabled,
+      },
+
+      {
+          "scripted-interface",
+          PluginManager::GetScriptedInterfacePluginInfo,
+          PluginManager::SetScriptedInterfacePluginEnabled,
+      },
+
+      {
+          "structured-data",
+          PluginManager::GetStructuredDataPluginInfo,
+          PluginManager::SetStructuredDataPluginEnabled,
+      },
+
+      {
+          "symbol-file",
+          PluginManager::GetSymbolFilePluginInfo,
+          PluginManager::SetSymbolFilePluginEnabled,
+      },
+
+      {
+          "symbol-locator",
+          PluginManager::GetSymbolLocatorPluginInfo,
+          PluginManager::SetSymbolLocatorPluginEnabled,
+      },
+
+      {
+          "symbol-vendor",
+          PluginManager::GetSymbolVendorPluginInfo,
+          PluginManager::SetSymbolVendorPluginEnabled,
+      },
+
+      {
+          "system-runtime",
+          PluginManager::GetSystemRuntimePluginInfo,
+          PluginManager::SetSystemRuntimePluginEnabled,
+      },
+
+      {
+          "trace",
+          PluginManager::GetTracePluginInfo,
+          PluginManager::SetTracePluginEnabled,
+      },
+
+      {
+          "trace-exporter",
+          PluginManager::GetTraceExporterPluginInfo,
+          PluginManager::SetTraceExporterPluginEnabled,
+      },
+
+      {
+          "type-system",
+          PluginManager::GetTypeSystemPluginInfo,
+          PluginManager::SetTypeSystemPluginEnabled,
+      },
+
+      {
+          "unwind-assembly",
+          PluginManager::GetUnwindAssemblyPluginInfo,
+          PluginManager::SetUnwindAssemblyPluginEnabled,
+      },
+  };
 
   return PluginNamespaces;
 }
@@ -407,7 +569,7 @@ ABICreateInstance PluginManager::GetABICreateCallbackAtIndex(uint32_t idx) {
 #pragma mark Architecture
 
 typedef PluginInstance<ArchitectureCreateInstance> ArchitectureInstance;
-typedef std::vector<ArchitectureInstance> ArchitectureInstances;
+typedef PluginInstances<ArchitectureInstance> ArchitectureInstances;
 
 static ArchitectureInstances &GetArchitectureInstances() {
   static ArchitectureInstances g_instances;
@@ -417,25 +579,18 @@ static ArchitectureInstances &GetArchitectureInstances() {
 void PluginManager::RegisterPlugin(llvm::StringRef name,
                                    llvm::StringRef description,
                                    ArchitectureCreateInstance create_callback) {
-  GetArchitectureInstances().push_back({name, description, create_callback});
+  GetArchitectureInstances().RegisterPlugin(name, description, create_callback);
 }
 
 void PluginManager::UnregisterPlugin(
     ArchitectureCreateInstance create_callback) {
   auto &instances = GetArchitectureInstances();
-
-  for (auto pos = instances.begin(), end = instances.end(); pos != end; ++pos) {
-    if (pos->create_callback == create_callback) {
-      instances.erase(pos);
-      return;
-    }
-  }
-  llvm_unreachable("Plugin not found");
+  instances.UnregisterPlugin(create_callback);
 }
 
 std::unique_ptr<Architecture>
 PluginManager::CreateArchitectureInstance(const ArchSpec &arch) {
-  for (const auto &instances : GetArchitectureInstances()) {
+  for (const auto &instances : GetArchitectureInstances().GetSnapshot()) {
     if (auto plugin_up = instances.create_callback(arch))
       return plugin_up;
   }
@@ -718,15 +873,6 @@ PluginManager::GetSystemRuntimeCreateCallbackAtIndex(uint32_t idx) {
   return GetSystemRuntimeInstances().GetCallbackAtIndex(idx);
 }
 
-std::vector<RegisteredPluginInfo> PluginManager::GetSystemRuntimePluginInfo() {
-  return GetSystemRuntimeInstances().GetPluginInfoForAllInstances();
-}
-
-bool PluginManager::SetSystemRuntimePluginEnabled(llvm::StringRef name,
-                                                  bool enable) {
-  return GetSystemRuntimeInstances().SetInstanceEnabled(name, enable);
-}
-
 #pragma mark ObjectFile
 
 struct ObjectFileInstance : public PluginInstance<ObjectFileCreateInstance> {
@@ -1563,16 +1709,6 @@ PluginManager::GetInstrumentationRuntimeCreateCallbackAtIndex(uint32_t idx) {
   return GetInstrumentationRuntimeInstances().GetCallbackAtIndex(idx);
 }
 
-std::vector<RegisteredPluginInfo>
-PluginManager::GetInstrumentationRuntimePluginInfo() {
-  return GetInstrumentationRuntimeInstances().GetPluginInfoForAllInstances();
-}
-
-bool PluginManager::SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
-                                                           bool enable) {
-  return GetInstrumentationRuntimeInstances().SetInstanceEnabled(name, enable);
-}
-
 #pragma mark TypeSystem
 
 struct TypeSystemInstance : public PluginInstance<TypeSystemCreateInstance> {
@@ -2057,3 +2193,234 @@ bool PluginManager::CreateSettingForCPlusPlusLanguagePlugin(
                                 "Settings for CPlusPlus language plug-ins",
                                 properties_sp, description, is_global_property);
 }
+
+//
+// Plugin Info+Enable Implementations
+//
+std::vector<RegisteredPluginInfo> PluginManager::GetABIPluginInfo() {
+  return GetABIInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetABIPluginEnabled(llvm::StringRef name, bool enable) {
+  return GetABIInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetArchitecturePluginInfo() {
+  return GetArchitectureInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetArchitecturePluginEnabled(llvm::StringRef name,
+                                                 bool enable) {
+  return GetArchitectureInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetDisassemblerPluginInfo() {
+  return GetDisassemblerInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetDisassemblerPluginEnabled(llvm::StringRef name,
+                                                 bool enable) {
+  return GetDisassemblerInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetDynamicLoaderPluginInfo() {
+  return GetDynamicLoaderInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetDynamicLoaderPluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetDynamicLoaderInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetEmulateInstructionPluginInfo() {
+  return GetEmulateInstructionInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetEmulateInstructionPluginEnabled(llvm::StringRef name,
+                                                       bool enable) {
+  return GetEmulateInstructionInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetInstrumentationRuntimePluginInfo() {
+  return GetInstrumentationRuntimeInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
+                                                           bool enable) {
+  return GetInstrumentationRuntimeInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetJITLoaderPluginInfo() {
+  return GetJITLoaderInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetJITLoaderPluginEnabled(llvm::StringRef name,
+                                              bool enable) {
+  return GetJITLoaderInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetLanguagePluginInfo() {
+  return GetLanguageInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetLanguagePluginEnabled(llvm::StringRef name,
+                                             bool enable) {
+  return GetLanguageInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetLanguageRuntimePluginInfo() {
+  return GetLanguageRuntimeInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetLanguageRuntimePluginEnabled(llvm::StringRef name,
+                                                    bool enable) {
+  return GetLanguageRuntimeInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetMemoryHistoryPluginInfo() {
+  return GetMemoryHistoryInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetMemoryHistoryPluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetMemoryHistoryInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetObjectContainerPluginInfo() {
+  return GetObjectContainerInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetObjectContainerPluginEnabled(llvm::StringRef name,
+                                                    bool enable) {
+  return GetObjectContainerInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetObjectFilePluginInfo() {
+  return GetObjectFileInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetObjectFilePluginEnabled(llvm::StringRef name,
+                                               bool enable) {
+  return GetObjectFileInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetOperatingSystemPluginInfo() {
+  return GetOperatingSystemInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetOperatingSystemPluginEnabled(llvm::StringRef name,
+                                                    bool enable) {
+  return GetOperatingSystemInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetPlatformPluginInfo() {
+  return GetPlatformInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetPlatformPluginEnabled(llvm::StringRef name,
+                                             bool enable) {
+  return GetPlatformInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetProcessPluginInfo() {
+  return GetProcessInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetProcessPluginEnabled(llvm::StringRef name, bool enable) {
+  return GetProcessInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetREPLPluginInfo() {
+  return GetREPLInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetREPLPluginEnabled(llvm::StringRef name, bool enable) {
+  return GetREPLInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetRegisterTypeBuilderPluginInfo() {
+  return GetRegisterTypeBuilderInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetRegisterTypeBuilderPluginEnabled(llvm::StringRef name,
+                                                        bool enable) {
+  return GetRegisterTypeBuilderInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetScriptInterpreterPluginInfo() {
+  return GetScriptInterpreterInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetScriptInterpreterPluginEnabled(llvm::StringRef name,
+                                                      bool enable) {
+  return GetScriptInterpreterInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetScriptedInterfacePluginInfo() {
+  return GetScriptedInterfaceInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetScriptedInterfacePluginEnabled(llvm::StringRef name,
+                                                      bool enable) {
+  return GetScriptedInterfaceInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetStructuredDataPluginInfo() {
+  return GetStructuredDataPluginInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetStructuredDataPluginEnabled(llvm::StringRef name,
+                                                   bool enable) {
+  return GetStructuredDataPluginInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetSymbolFilePluginInfo() {
+  return GetSymbolFileInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetSymbolFilePluginEnabled(llvm::StringRef name,
+                                               bool enable) {
+  return GetSymbolFileInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetSymbolLocatorPluginInfo() {
+  return GetSymbolLocatorInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetSymbolLocatorPluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetSymbolLocatorInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetSymbolVendorPluginInfo() {
+  return GetSymbolVendorInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetSymbolVendorPluginEnabled(llvm::StringRef name,
+                                                 bool enable) {
+  return GetSymbolVendorInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetSystemRuntimePluginInfo() {
+  return GetSystemRuntimeInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetSystemRuntimePluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetSystemRuntimeInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetTracePluginInfo() {
+  return GetTracePluginInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetTracePluginEnabled(llvm::StringRef name, bool enable) {
+  return GetTracePluginInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetTraceExporterPluginInfo() {
+  return GetTraceExporterInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetTraceExporterPluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetTraceExporterInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetTypeSystemPluginInfo() {
+  return GetTypeSystemInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetTypeSystemPluginEnabled(llvm::StringRef name,
+                                               bool enable) {
+  return GetTypeSystemInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetUnwindAssemblyPluginInfo() {
+  return GetUnwindAssemblyInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetUnwindAssemblyPluginEnabled(llvm::StringRef name,
+                                                   bool enable) {
+  return GetUnwindAssemblyInstances().SetInstanceEnabled(name, enable);
+}
diff --git a/lldb/test/API/commands/plugin/TestPlugin.py b/lldb/test/API/commands/plugin/TestPlugin.py
new file mode 100644
index 000000000000..fdfb14bfcc24
--- /dev/null
+++ b/lldb/test/API/commands/plugin/TestPlugin.py
@@ -0,0 +1,62 @@
+"""
+Make sure the plugin list, enable, and disable commands work.
+"""
+
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+
+
+class TestFrameVar(TestBase):
+    # If your test case doesn't stress debug info, then
+    # set this to true.  That way it won't be run once for
+    # each debug info format.
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def test_plugin_list_enable_disable_commands(self):
+        for plugin_namespace in [
+            "abi",
+            "architecture",
+            "disassembler",
+            "dynamic-loader",
+            "emulate-instruction",
+            "instrumentation-runtime",
+            "jit-loader",
+            "language",
+            "language-runtime",
+            "memory-history",
+            "object-container",
+            "object-file",
+            "operating-system",
+            "platform",
+            "process",
+            "repl",
+            "register-type-builder",
+            "script-interpreter",
+            "scripted-interface",
+            "structured-data",
+            "symbol-file",
+            "symbol-locator",
+            "symbol-vendor",
+            "system-runtime",
+            # 'trace', # No trace plugin is registered by default.
+            "trace-exporter",
+            "type-system",
+            "unwind-assembly",
+        ]:
+            self.do_list_disable_enable_test(plugin_namespace)
+
+    def do_list_disable_enable_test(self, plugin_namespace):
+        # Plugins are enabled by default.
+        self.expect(
+            f"plugin list {plugin_namespace}", substrs=[plugin_namespace, "[+]"]
+        )
+
+        # Plugins can be disabled.
+        self.expect(
+            f"plugin disable {plugin_namespace}", substrs=[plugin_namespace, "[-]"]
+        )
+
+        # Plugins can be enabled.
+        self.expect(
+            f"plugin enable {plugin_namespace}", substrs=[plugin_namespace, "[+]"]
+        )
diff --git a/lldb/test/Shell/Commands/command-plugin-list.test b/lldb/test/Shell/Commands/command-plugin-list.test
index 9d3680d48cdd..3f02157665bb 100644
--- a/lldb/test/Shell/Commands/command-plugin-list.test
+++ b/lldb/test/Shell/Commands/command-plugin-list.test
@@ -10,10 +10,10 @@
 # Test plugin list without an argument will list all plugins.
 plugin list
 # CHECK-LABEL: plugin list
-# CHECK: system-runtime
-# CHECK:  [+] systemruntime-macosx           System runtime plugin for Mac OS X native libraries
-# CHECK: instrumentation-runtime
-# CHECK:  [+] AddressSanitizer               AddressSanitizer instrumentation runtime plugin.
+# CHECK-DAG: instrumentation-runtime
+# CHECK-DAG:  [+] AddressSanitizer               AddressSanitizer instrumentation runtime plugin.
+# CHECK-DAG: system-runtime
+# CHECK-DAG:  [+] systemruntime-macosx           System runtime plugin for Mac OS X native libraries
 
 # Test plugin list works with fully qualified name.
 plugin list system-runtime.systemruntime-macosx

From 908f74a25e01cc88d1dee1af5521d8fb1c21bc51 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Tue, 17 Jun 2025 13:49:18 -0700
Subject: [PATCH 0711/1322] [llvm] re-order LLVM_ABI and extern on
 NoKernelInfoEndLTO decl (#144601)

## Overview
Fix compilation error introduced by #143615. Build failure logs
available
[here](https://lab.llvm.org/buildbot/#/builders/195/builds/10573)

## Background
On `extern` variable declarations, `LLVM_ABI` must appear before
`extern` because `LLVM_ABI` currently resolves to
`[[gnu::visibility("default")]]` when building with gcc.
---
 llvm/include/llvm/Target/TargetMachine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 04c97c1502a1..b286efdea3c1 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -29,7 +29,7 @@
 #include <string>
 #include <utility>
 
-extern LLVM_ABI llvm::cl::opt<bool> NoKernelInfoEndLTO;
+LLVM_ABI extern llvm::cl::opt<bool> NoKernelInfoEndLTO;
 
 namespace llvm {
 

From 49bf8d38d80ce43bd700f27833a7b8c8e7082af8 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 20:49:41 +0000
Subject: [PATCH 0712/1322] [gn build] Manually port b4e39e4f

---
 llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index c1d107eefdf9..f4ee2599c01c 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -223,6 +223,7 @@ write_cmake_config("config") {
       "HAVE_SIGALTSTACK=",
       "HAVE_STRERROR_R=",
       "HAVE_SYSCONF=",
+      "HAVE_SYS_IOCTL_H=",
       "HAVE_SYS_MMAN_H=",
       "HAVE_UNISTD_H=",
       "HAVE__CHSIZE_S=1",
@@ -250,6 +251,7 @@ write_cmake_config("config") {
       "HAVE_SIGALTSTACK=1",
       "HAVE_STRERROR_R=1",
       "HAVE_SYSCONF=1",
+      "HAVE_SYS_IOCTL_H=1",
       "HAVE_SYS_MMAN_H=1",
       "HAVE_UNISTD_H=1",
       "HAVE__CHSIZE_S=",

From 8d1610afd0db877460d1b3cd43cc4066478846a0 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 17 Jun 2025 16:50:31 -0400
Subject: [PATCH 0713/1322] [libc++] Mark two assertion tests as unsupported in
 C++03 mode

Our assertion checking facility requires at least C++11, so these
tests were failing when run in C++03 mode.
---
 .../streambuf.protected/streambuf.get.area/setg.assert.pass.cpp | 2 +-
 .../streambuf.protected/streambuf.put.area/setp.assert.pass.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp
index becf89b12fdd..973d744a1da4 100644
--- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: has-unix-headers
-// UNSUPPORTED: libcpp-hardening-mode=none
+// UNSUPPORTED: c++03, libcpp-hardening-mode=none
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
 // <streambuf>
diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp
index abd42272de50..5aaad2738d32 100644
--- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: has-unix-headers
-// UNSUPPORTED: libcpp-hardening-mode=none
+// UNSUPPORTED: c++03, libcpp-hardening-mode=none
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
 // <streambuf>

From 3c7df98c7b2a203e49a74b229bbf535c2ef6274b Mon Sep 17 00:00:00 2001
From: Piotr Idzik <65706193+vil02@users.noreply.github.com>
Date: Tue, 17 Jun 2025 22:59:53 +0200
Subject: [PATCH 0714/1322] [clang-tidy] Add missing colon in the docs of
 performance-enum-size (#144525)

There is a syntax error in the provided code example - this PR fixes it.

I did a quick search - I could not find similar _typos_.
---
 .../docs/clang-tidy/checks/performance/enum-size.rst            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst b/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst
index f72b8c7eabc2..b7631139a013 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst
@@ -34,7 +34,7 @@ dependent).
 .. code-block:: c++
 
     // AFTER
-    enum Color : std:int8_t {
+    enum Color : std::int8_t {
         RED = -1,
         GREEN = 0,
         BLUE = 1

From ecfb8fe5c1870091b095ae6ca1ad4cfc7158e619 Mon Sep 17 00:00:00 2001
From: Daniel Thornburgh <dthorn@google.com>
Date: Tue, 17 Jun 2025 14:07:07 -0700
Subject: [PATCH 0715/1322] =?UTF-8?q?Revert=20stack=20"[Driver]=20Add=20su?=
 =?UTF-8?q?pport=20for=20GCC=20installation=20detection=20in=20=E2=80=A6?=
 =?UTF-8?q?=20(#144603)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…Baremetal toolchain (#121829)"

This reverts the following stack of commits, due to them breaking the
Fuchsia toolchain and corresponding LLVM buildbot.

Revert "[Driver] Fix Arm/AArch64 Link Argument tests (#144582)" This
reverts commit a79186c1ea62bbe0579e0b1eed4ad507966cca41.

Revert "[Driver] Add option to force undefined symbols during linking in
BareMetal toolchain object. (#132807)" This reverts commit
9cb754509608b9d9143fa17f775631bbfcce0848.

Revert "[Driver] Fix link order of BareMetal toolchain object (#132806)"
This reverts commit 31523de4b000ca254259ae3167d28922e1302648.

Revert "[Driver] Add support for crtbegin.o, crtend.o and libgloss lib
to BareMetal toolchain object (#121830)" This reverts commit
ec230aa7a7d13c222c0b34b87c3c16937383b4a0.

Revert "[Driver] Add support for GCC installation detection in Baremetal
toolchain (#121829)" This reverts commit
eb31c422d0dc816bf285a81bf92690d4d16273ed.
---
 clang/docs/Toolchain.rst                      |   5 -
 .../clang/Basic/DiagnosticDriverKinds.td      |   3 -
 clang/lib/Driver/ToolChains/BareMetal.cpp     | 297 +++++-------------
 clang/lib/Driver/ToolChains/BareMetal.h       |  22 +-
 .../aarch64-none-elf/include/c++/8.2.1/.keep  |   0
 .../aarch64-none-elf/lib/.keep                |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../bin/aarch64-none-elf-ld                   |   1 -
 .../lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o |   0
 .../lib/gcc/aarch64-none-elf/8.2.1/crtend.o   |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../aarch64-none-elf/lib/crtbegin.o           |   0
 .../aarch64-none-elf/lib/crtend.o             |   0
 .../bin/aarch64-none-elf-ld                   |   1 -
 .../armv6m-none-eabi/include/c++/8.2.1/.keep  |   0
 .../armv6m-none-eabi/lib/.keep                |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../bin/armv6m-none-eabi-ld                   |   1 -
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o |   0
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtend.o   |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../armv6m-none-eabi/lib/crtbegin.o           |   0
 .../armv6m-none-eabi/lib/crtend.o             |   0
 .../bin/armv6m-none-eabi-ld                   |   1 -
 clang/test/Driver/aarch64-gnutools.c          |   4 -
 clang/test/Driver/aarch64-toolchain-extra.c   |  35 ---
 clang/test/Driver/aarch64-toolchain.c         | 157 ---------
 clang/test/Driver/arm-gnutools.c              |   6 -
 clang/test/Driver/arm-toolchain-extra.c       |  36 ---
 clang/test/Driver/arm-toolchain.c             | 158 ----------
 clang/test/Driver/baremetal-multilib.yaml     |   3 +-
 clang/test/Driver/baremetal-sysroot.cpp       |   8 +-
 .../test/Driver/baremetal-undefined-symbols.c |  14 -
 clang/test/Driver/baremetal.cpp               |  98 ++----
 clang/test/Driver/check-no-multlib-warning.c  |  10 -
 clang/test/Driver/riscv-args.c                |   6 +
 clang/test/Driver/sanitizer-ld.c              |   2 +-
 37 files changed, 124 insertions(+), 744 deletions(-)
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
 delete mode 100755 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
 delete mode 100755 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
 delete mode 100755 clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
 delete mode 100755 clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
 delete mode 100644 clang/test/Driver/aarch64-gnutools.c
 delete mode 100644 clang/test/Driver/aarch64-toolchain-extra.c
 delete mode 100644 clang/test/Driver/aarch64-toolchain.c
 delete mode 100644 clang/test/Driver/arm-gnutools.c
 delete mode 100644 clang/test/Driver/arm-toolchain-extra.c
 delete mode 100644 clang/test/Driver/arm-toolchain.c
 delete mode 100644 clang/test/Driver/baremetal-undefined-symbols.c
 delete mode 100644 clang/test/Driver/check-no-multlib-warning.c
 create mode 100644 clang/test/Driver/riscv-args.c

diff --git a/clang/docs/Toolchain.rst b/clang/docs/Toolchain.rst
index d56b21d74c7e..958199eb7a2e 100644
--- a/clang/docs/Toolchain.rst
+++ b/clang/docs/Toolchain.rst
@@ -347,8 +347,3 @@ workarounds for issues discovered in libstdc++, and these are removed
 as fixed libstdc++ becomes sufficiently old.
 
 You can instruct Clang to use libstdc++ with the ``-stdlib=libstdc++`` flag.
-
-GCC Installation
-=================
-Users can point to their GCC installation by using the ``-gcc-toolchain`` or by
-using ``-gcc-install-dir`` flag.
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 94224e103875..29f6480ba935 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -847,9 +847,6 @@ def note_drv_available_multilibs : Note<
   "available multilibs are:%0">;
 def err_drv_multilib_custom_error : Error<
   "multilib configuration error: %0">;
-def warn_drv_multilib_not_available_for_target: Warning<
-  "no multilib structure encoded for Arm, Aarch64 and PPC targets">,
-  InGroup<DiagGroup<"multilib-not-found">>;
 
 def err_drv_experimental_crel : Error<
   "-Wa,--allow-experimental-crel must be specified to use -Wa,--crel. "
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index d4e4e6d04b41..d8168ed15feb 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -31,40 +31,6 @@ using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
 
-/// Is the triple {aarch64.aarch64_be}-none-elf?
-static bool isAArch64BareMetal(const llvm::Triple &Triple) {
-  if (Triple.getArch() != llvm::Triple::aarch64 &&
-      Triple.getArch() != llvm::Triple::aarch64_be)
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-static bool isRISCVBareMetal(const llvm::Triple &Triple) {
-  if (!Triple.isRISCV())
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-/// Is the triple powerpc[64][le]-*-none-eabi?
-static bool isPPCBareMetal(const llvm::Triple &Triple) {
-  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
-         Triple.getEnvironment() == llvm::Triple::EABI;
-}
-
 static bool findRISCVMultilibs(const Driver &D,
                                const llvm::Triple &TargetTriple,
                                const ArgList &Args, DetectedMultilibs &Result) {
@@ -129,8 +95,7 @@ static bool findRISCVMultilibs(const Driver &D,
   return false;
 }
 
-static std::string computeClangRuntimesSysRoot(const Driver &D,
-                                               bool IncludeTriple) {
+static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) {
   if (!D.SysRoot.empty())
     return D.SysRoot;
 
@@ -143,125 +108,58 @@ static std::string computeClangRuntimesSysRoot(const Driver &D,
   return std::string(SysRootDir);
 }
 
-// Only consider the GCC toolchain based on the values provided through the
-// `--gcc-toolchain` and `--gcc-install-dir` flags. The function below returns
-// whether the GCC toolchain was initialized successfully.
-bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
-                                    const llvm::opt::ArgList &Args) {
-  if (Args.getLastArg(options::OPT_gcc_toolchain) ||
-      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
-    GCCInstallation.init(Triple, Args);
-    return GCCInstallation.isValid();
-  }
-  return false;
-}
-
-// This logic is adapted from RISCVToolChain.cpp as part of the ongoing effort
-// to merge RISCVToolChain into the Baremetal toolchain. It infers the presence
-// of a valid GCC toolchain by checking whether the `crt0.o` file exists in the
-// `bin/../<target-triple>/lib` directory.
-static bool detectGCCToolchainAdjacent(const Driver &D) {
-  SmallString<128> GCCDir;
-  llvm::sys::path::append(GCCDir, D.Dir, "..", D.getTargetTriple(),
-                          "lib/crt0.o");
-  return llvm::sys::fs::exists(GCCDir);
-}
-
-// If no sysroot is provided the driver will first attempt to infer it from the
-// values of `--gcc-install-dir` or `--gcc-toolchain`, which specify the
-// location of a GCC toolchain.
-// If neither flag is used, the sysroot defaults to either:
-//    - `bin/../<target-triple>`
-//    - `bin/../lib/clang-runtimes/<target-triple>`
-//
-// To use the `clang-runtimes` path, ensure that `../<target-triple>/lib/crt0.o`
-// does not exist relative to the driver.
-std::string BareMetal::computeSysRoot() const {
-  // Use Baremetal::sysroot if it has already been set.
-  if (!SysRoot.empty())
-    return SysRoot;
-
-  // Use the sysroot specified via the `--sysroot` command-line flag, if
-  // provided.
-  const Driver &D = getDriver();
-  if (!D.SysRoot.empty())
-    return D.SysRoot;
-
-  // Attempt to infer sysroot from a valid GCC installation.
-  // If no valid GCC installation, check for a GCC toolchain alongside Clang.
-  SmallString<128> inferredSysRoot;
-  if (IsGCCInstallationValid) {
-    llvm::sys::path::append(inferredSysRoot, GCCInstallation.getParentLibPath(),
-                            "..", GCCInstallation.getTriple().str());
-  } else if (detectGCCToolchainAdjacent(D)) {
-    // Use the triple as provided to the driver. Unlike the parsed triple
-    // this has not been normalized to always contain every field.
-    llvm::sys::path::append(inferredSysRoot, D.Dir, "..", D.getTargetTriple());
-  }
-  // If a valid sysroot was inferred and exists, use it
-  if (!inferredSysRoot.empty() && llvm::sys::fs::exists(inferredSysRoot))
-    return std::string(inferredSysRoot);
-
-  // Use the clang-runtimes path.
-  return computeClangRuntimesSysRoot(D, /*IncludeTriple*/ true);
-}
-
-static void addMultilibsFilePaths(const Driver &D, const MultilibSet &Multilibs,
-                                  const Multilib &Multilib,
-                                  StringRef InstallPath,
-                                  ToolChain::path_list &Paths) {
-  if (const auto &PathsCallback = Multilibs.filePathsCallback())
-    for (const auto &Path : PathsCallback(Multilib))
-      addPathIfExists(D, InstallPath + Path, Paths);
-}
-
-// GCC mutltilibs will only work for those targets that have their multlib
-// structure encoded into GCCInstallation. Baremetal toolchain supports ARM,
-// AArch64, RISCV and PPC and of these only RISCV have GCC multilibs hardcoded
-// in GCCInstallation.
 BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple,
                      const ArgList &Args)
-    : Generic_ELF(D, Triple, Args) {
-  IsGCCInstallationValid = initGCCInstallation(Triple, Args);
-  std::string ComputedSysRoot = computeSysRoot();
-  if (IsGCCInstallationValid) {
-    if (!isRISCVBareMetal(Triple))
-      D.Diag(clang::diag::warn_drv_multilib_not_available_for_target);
+    : ToolChain(D, Triple, Args),
+      SysRoot(computeBaseSysRoot(D, /*IncludeTriple=*/true)) {
+  getProgramPaths().push_back(getDriver().Dir);
 
-    Multilibs = GCCInstallation.getMultilibs();
-    SelectedMultilibs.assign({GCCInstallation.getMultilib()});
-
-    path_list &Paths = getFilePaths();
-    // Add toolchain/multilib specific file paths.
-    addMultilibsFilePaths(D, Multilibs, SelectedMultilibs.back(),
-                          GCCInstallation.getInstallPath(), Paths);
-    // Adding filepath for locating crt{begin,end}.o files.
-    Paths.push_back(GCCInstallation.getInstallPath().str());
-    // Adding filepath for locating crt0.o file.
-    Paths.push_back(ComputedSysRoot + "/lib");
-
-    ToolChain::path_list &PPaths = getProgramPaths();
-    // Multilib cross-compiler GCC installations put ld in a triple-prefixed
-    // directory off of the parent of the GCC installation.
-    PPaths.push_back(Twine(GCCInstallation.getParentLibPath() + "/../" +
-                           GCCInstallation.getTriple().str() + "/bin")
-                         .str());
-    PPaths.push_back((GCCInstallation.getParentLibPath() + "/../bin").str());
-  } else {
-    getProgramPaths().push_back(getDriver().Dir);
-    findMultilibs(D, Triple, Args);
-    const SmallString<128> SysRootDir(computeSysRoot());
-    if (!SysRootDir.empty()) {
-      for (const Multilib &M : getOrderedMultilibs()) {
-        SmallString<128> Dir(SysRootDir);
-        llvm::sys::path::append(Dir, M.osSuffix(), "lib");
-        getFilePaths().push_back(std::string(Dir));
-        getLibraryPaths().push_back(std::string(Dir));
-      }
+  findMultilibs(D, Triple, Args);
+  SmallString<128> SysRoot(computeSysRoot());
+  if (!SysRoot.empty()) {
+    for (const Multilib &M : getOrderedMultilibs()) {
+      SmallString<128> Dir(SysRoot);
+      llvm::sys::path::append(Dir, M.osSuffix(), "lib");
+      getFilePaths().push_back(std::string(Dir));
+      getLibraryPaths().push_back(std::string(Dir));
     }
   }
 }
 
+/// Is the triple {aarch64.aarch64_be}-none-elf?
+static bool isAArch64BareMetal(const llvm::Triple &Triple) {
+  if (Triple.getArch() != llvm::Triple::aarch64 &&
+      Triple.getArch() != llvm::Triple::aarch64_be)
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+static bool isRISCVBareMetal(const llvm::Triple &Triple) {
+  if (!Triple.isRISCV())
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+/// Is the triple powerpc[64][le]-*-none-eabi?
+static bool isPPCBareMetal(const llvm::Triple &Triple) {
+  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
+         Triple.getEnvironment() == llvm::Triple::EABI;
+}
+
 static void
 findMultilibsFromYAML(const ToolChain &TC, const Driver &D,
                       StringRef MultilibPath, const ArgList &Args,
@@ -318,7 +216,7 @@ getMultilibConfigPath(const Driver &D, const llvm::Triple &Triple,
       return {};
     }
   } else {
-    MultilibPath = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
+    MultilibPath = computeBaseSysRoot(D, /*IncludeTriple=*/false);
     llvm::sys::path::append(MultilibPath, MultilibFilename);
   }
   return MultilibPath;
@@ -336,7 +234,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
   if (D.getVFS().exists(*MultilibPath)) {
     // If multilib.yaml is found, update sysroot so it doesn't use a target
     // specific suffix
-    SysRoot = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
+    SysRoot = computeBaseSysRoot(D, /*IncludeTriple=*/false);
     SmallVector<StringRef> CustomFlagMacroDefines;
     findMultilibsFromYAML(*this, D, *MultilibPath, Args, Result,
                           CustomFlagMacroDefines);
@@ -344,7 +242,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
     Multilibs = Result.Multilibs;
     MultilibMacroDefines.append(CustomFlagMacroDefines.begin(),
                                 CustomFlagMacroDefines.end());
-  } else if (isRISCVBareMetal(Triple) && !detectGCCToolchainAdjacent(D)) {
+  } else if (isRISCVBareMetal(Triple)) {
     if (findRISCVMultilibs(D, Triple, Args, Result)) {
       SelectedMultilibs = Result.SelectedMultilibs;
       Multilibs = Result.Multilibs;
@@ -365,6 +263,8 @@ Tool *BareMetal::buildStaticLibTool() const {
   return new tools::baremetal::StaticLibTool(*this);
 }
 
+std::string BareMetal::computeSysRoot() const { return SysRoot; }
+
 BareMetal::OrderedMultilibs BareMetal::getOrderedMultilibs() const {
   // Get multilibs in reverse order because they're ordered most-specific last.
   if (!SelectedMultilibs.empty())
@@ -392,10 +292,10 @@ void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   if (std::optional<std::string> Path = getStdlibIncludePath())
     addSystemInclude(DriverArgs, CC1Args, *Path);
 
-  const SmallString<128> SysRootDir(computeSysRoot());
-  if (!SysRootDir.empty()) {
+  const SmallString<128> SysRoot(computeSysRoot());
+  if (!SysRoot.empty()) {
     for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRootDir);
+      SmallString<128> Dir(SysRoot);
       llvm::sys::path::append(Dir, M.includeSuffix());
       llvm::sys::path::append(Dir, "include");
       addSystemInclude(DriverArgs, CC1Args, Dir.str());
@@ -409,19 +309,6 @@ void BareMetal::addClangTargetOptions(const ArgList &DriverArgs,
   CC1Args.push_back("-nostdsysteminc");
 }
 
-void BareMetal::addLibStdCxxIncludePaths(
-    const llvm::opt::ArgList &DriverArgs,
-    llvm::opt::ArgStringList &CC1Args) const {
-  if (!IsGCCInstallationValid)
-    return;
-  const GCCVersion &Version = GCCInstallation.getVersion();
-  StringRef TripleStr = GCCInstallation.getTriple().str();
-  const Multilib &Multilib = GCCInstallation.getMultilib();
-  addLibStdCXXIncludePaths(computeSysRoot() + "/include/c++/" + Version.Text,
-                           TripleStr, Multilib.includeSuffix(), DriverArgs,
-                           CC1Args);
-}
-
 void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                              ArgStringList &CC1Args) const {
   if (DriverArgs.hasArg(options::OPT_nostdinc, options::OPT_nostdlibinc,
@@ -452,23 +339,23 @@ void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
   };
 
   switch (GetCXXStdlibType(DriverArgs)) {
-  case ToolChain::CST_Libcxx: {
-    SmallString<128> P(D.Dir);
-    llvm::sys::path::append(P, "..", "include");
-    AddCXXIncludePath(P);
-    break;
-  }
-  case ToolChain::CST_Libstdcxx:
-    addLibStdCxxIncludePaths(DriverArgs, CC1Args);
-    break;
+    case ToolChain::CST_Libcxx: {
+      SmallString<128> P(D.Dir);
+      llvm::sys::path::append(P, "..", "include");
+      AddCXXIncludePath(P);
+      break;
+    }
+    case ToolChain::CST_Libstdcxx:
+      // We only support libc++ toolchain installation.
+      break;
   }
 
-  std::string SysRootDir(computeSysRoot());
-  if (SysRootDir.empty())
+  std::string SysRoot(computeSysRoot());
+  if (SysRoot.empty())
     return;
 
   for (const Multilib &M : getOrderedMultilibs()) {
-    SmallString<128> Dir(SysRootDir);
+    SmallString<128> Dir(SysRoot);
     llvm::sys::path::append(Dir, M.gccSuffix());
     switch (GetCXXStdlibType(DriverArgs)) {
     case ToolChain::CST_Libcxx: {
@@ -568,6 +455,8 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   const llvm::Triple::ArchType Arch = TC.getArch();
   const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
 
+  AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA);
+
   CmdArgs.push_back("-Bstatic");
 
   if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax))
@@ -582,48 +471,19 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Arch == llvm::Triple::aarch64_be ? "-EB" : "-EL");
   }
 
-  bool NeedCRTs =
-      !Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles);
-
-  const char *CRTBegin, *CRTEnd;
-  if (NeedCRTs) {
-    if (!Args.hasArg(options::OPT_r))
-      CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt0.o")));
-    if (TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) {
-      auto RuntimeLib = TC.GetRuntimeLibType(Args);
-      switch (RuntimeLib) {
-      case (ToolChain::RLT_Libgcc): {
-        CRTBegin = "crtbegin.o";
-        CRTEnd = "crtend.o";
-        break;
-      }
-      case (ToolChain::RLT_CompilerRT): {
-        CRTBegin =
-            TC.getCompilerRTArgString(Args, "crtbegin", ToolChain::FT_Object);
-        CRTEnd =
-            TC.getCompilerRTArgString(Args, "crtend", ToolChain::FT_Object);
-        break;
-      }
-      }
-      CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTBegin)));
-    }
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles,
+                   options::OPT_r)) {
+    CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt0.o")));
   }
 
-  Args.addAllArgs(CmdArgs,
-                  {options::OPT_L, options::OPT_u, options::OPT_T_Group,
-                   options::OPT_s, options::OPT_t, options::OPT_r});
+  Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
+                            options::OPT_s, options::OPT_t, options::OPT_r});
 
   TC.AddFilePathLibArgs(Args, CmdArgs);
 
   for (const auto &LibPath : TC.getLibraryPaths())
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-L", LibPath)));
 
-  if (D.isUsingLTO())
-    addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
-                  D.getLTOMode() == LTOK_Thin);
-
-  AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA);
-
   if (TC.ShouldLinkCXXStdlib(Args)) {
     bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) &&
                                !Args.hasArg(options::OPT_static);
@@ -636,17 +496,14 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
-    CmdArgs.push_back("--start-group");
     AddRunTimeLibs(TC, D, CmdArgs, Args);
+
     CmdArgs.push_back("-lc");
-    if (TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D))
-      CmdArgs.push_back("-lgloss");
-    CmdArgs.push_back("--end-group");
   }
 
-  if ((TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) &&
-      NeedCRTs)
-    CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd)));
+  if (D.isUsingLTO())
+    addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
+                  D.getLTOMode() == LTOK_Thin);
 
   if (TC.getTriple().isRISCV())
     CmdArgs.push_back("-X");
diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h
index 54805530bae8..f6295bda0a6a 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.h
+++ b/clang/lib/Driver/ToolChains/BareMetal.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 
-#include "ToolChains/Gnu.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 
@@ -20,7 +19,7 @@ namespace driver {
 
 namespace toolchains {
 
-class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
+class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
 public:
   BareMetal(const Driver &D, const llvm::Triple &Triple,
             const llvm::opt::ArgList &Args);
@@ -36,9 +35,7 @@ protected:
   Tool *buildStaticLibTool() const override;
 
 public:
-  bool initGCCInstallation(const llvm::Triple &Triple,
-                           const llvm::opt::ArgList &Args);
-  bool hasValidGCCInstallation() const { return IsGCCInstallationValid; }
+  bool useIntegratedAs() const override { return true; }
   bool isBareMetal() const override { return true; }
   bool isCrossCompiling() const override { return true; }
   bool HasNativeLLVMSupport() const override { return true; }
@@ -51,19 +48,15 @@ public:
 
   StringRef getOSLibName() const override { return "baremetal"; }
 
-  UnwindTableLevel
-  getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override {
-    return UnwindTableLevel::None;
-  }
-
   RuntimeLibType GetDefaultRuntimeLibType() const override {
     return ToolChain::RLT_CompilerRT;
   }
-
   CXXStdlibType GetDefaultCXXStdlibType() const override {
     return ToolChain::CST_Libcxx;
   }
 
+  const char *getDefaultLinker() const override { return "ld.lld"; }
+
   void
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                             llvm::opt::ArgStringList &CC1Args) const override;
@@ -74,9 +67,6 @@ public:
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
-  void
-  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
-                           llvm::opt::ArgStringList &CC1Args) const override;
   std::string computeSysRoot() const override;
   SanitizerMask getSupportedSanitizers() const override;
 
@@ -90,8 +80,6 @@ private:
 
   std::string SysRoot;
 
-  bool IsGCCInstallationValid;
-
   SmallVector<std::string> MultilibMacroDefines;
 };
 
@@ -116,7 +104,7 @@ public:
 
 class LLVM_LIBRARY_VISIBILITY Linker final : public Tool {
 public:
-  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "linker", TC) {}
+  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "ld.lld", TC) {}
   bool isLinkJob() const override { return true; }
   bool hasIntegratedCPP() const override { return false; }
   void ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
deleted file mode 100755
index b23e55619b2f..000000000000
--- a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
deleted file mode 100755
index b23e55619b2f..000000000000
--- a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
deleted file mode 100755
index b23e55619b2f..000000000000
--- a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
deleted file mode 100755
index b23e55619b2f..000000000000
--- a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/aarch64-gnutools.c b/clang/test/Driver/aarch64-gnutools.c
deleted file mode 100644
index 0214639ed380..000000000000
--- a/clang/test/Driver/aarch64-gnutools.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: %clang --target=aarch64-none-elf  --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -fno-integrated-as %s -### -c \
-// RUN: 2>&1 | FileCheck %s
-
-// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
deleted file mode 100644
index a0b5f2902962..000000000000
--- a/clang/test/Driver/aarch64-toolchain-extra.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// A basic clang -cc1 command-line, and simple environment check.
-
-// The tests here are similar to those in aarch64-toolchain.c, however
-// these tests need to create symlinks to test directory trees in order to
-// set up the environment and therefore shell support is required.
-// REQUIRES: shell
-// UNSUPPORTED: system-windows
-
-// If there is no GCC install detected then the driver searches for executables
-// and runtime starting from the directory tree above the driver itself.
-// The test below checks that the driver correctly finds the linker and
-// runtime if and only if they exist.
-//
-// RUN: rm -rf %t
-// RUN: mkdir -p %t/aarch64-nogcc/bin
-// RUN: ln -s %clang %t/aarch64-nogcc/bin/clang
-// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf %t/aarch64-nogcc/aarch64-none-elf
-// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld %t/aarch64-nogcc/bin/aarch64-none-elf-ld
-// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --gcc-toolchain=%t/aarch64-nogcc/invalid \
-// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOGCC %s
-
-// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --sysroot=%t/aarch64-nogcc/bin/../aarch64-none-elf \
-// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOGCC %s
-
-// C-AARCH64-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib/crt0.o"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtbegin.o"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtend.o"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
deleted file mode 100644
index 327161b81d9f..000000000000
--- a/clang/test/Driver/aarch64-toolchain.c
+++ /dev/null
@@ -1,157 +0,0 @@
-// UNSUPPORTED: system-windows
-
-// Test interaction with -fuse-ld=lld
-// RUN: %clang -### %s -fuse-ld=lld -B%S/Inputs/lld \
-// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=LLD-AARCH64-BAREMETAL %s
-
-// LLD-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
-// LLD-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// LLD-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-AARCH64-BAREMETAL: "-Bstatic" "-EL"
-// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
-// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// LLD-AARCH64-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL %s
-
-// C-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
-// C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL: "-Bstatic" "-EL"
-// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
-// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOSYSROOT %s
-
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL %s
-
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/backward"
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL: "-Bstatic" "-EL"
-// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
-// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT %s
-
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/backward"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-LIBCXX %s
-
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-Bstatic" "-EL"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX %s
-
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-EL"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=compiler-rt \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-COMPILER-RT %s
-
-// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
-// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
-// AARCH64-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a"
-// AARCH64-BAREMETAL-COMPILER-RT: "-lc" "-lgloss" "--end-group"
-// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --unwindlib=libunwind \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-UNWINDLIB %s
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=compiler-rt --unwindlib=libunwind \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-UNWINDLIB %s
-
-// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}crt0.o"
-// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtbegin.o"
-// AARCH64-BAREMETAL-UNWINDLIB: "--start-group" "{{.*}}libclang_rt.builtins.a" "--as-needed" "-lunwind" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtend.o"
diff --git a/clang/test/Driver/arm-gnutools.c b/clang/test/Driver/arm-gnutools.c
deleted file mode 100644
index 6e107f19dabc..000000000000
--- a/clang/test/Driver/arm-gnutools.c
+++ /dev/null
@@ -1,6 +0,0 @@
-// check that gnu assembler is invoked with arm baremetal as well
-
-// RUN: %clang --target=armv6m-none-eabi  --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -fno-integrated-as %s -### -c \
-// RUN: 2>&1 | FileCheck %s
-
-// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
deleted file mode 100644
index a04b41c13e95..000000000000
--- a/clang/test/Driver/arm-toolchain-extra.c
+++ /dev/null
@@ -1,36 +0,0 @@
-// A basic clang -cc1 command-line, and simple environment check.
-
-// The tests here are similar to those in arm-toolchain.c, however
-// these tests need to create symlinks to test directory trees in order to
-// set up the environment and therefore shell support is required.
-// REQUIRES: shell
-// UNSUPPORTED: system-windows
-
-// If there is no GCC install detected then the driver searches for executables
-// and runtime starting from the directory tree above the driver itself.
-// The test below checks that the driver correctly finds the linker and
-// runtime if and only if they exist.
-//
-// RUN: rm -rf %t
-// RUN: mkdir -p %t/arm-nogcc/bin
-// RUN: ln -s %clang %t/arm-nogcc/bin/clang
-// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi %t/arm-nogcc/armv6m-none-eabi
-// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld %t/arm-nogcc/bin/armv6m-none-eabi-ld
-// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --gcc-toolchain=%t/arm-nogcc/invalid \
-// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
-
-// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --sysroot=%t/arm-nogcc/bin/../armv6m-none-eabi \
-// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
-
-// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/include"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib/crt0.o"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtbegin.o"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtend.o"
-
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
deleted file mode 100644
index 5368158cdeed..000000000000
--- a/clang/test/Driver/arm-toolchain.c
+++ /dev/null
@@ -1,158 +0,0 @@
-// UNSUPPORTED: system-windows
-
-// RUN: %clang -### %s -fuse-ld=lld -B%S/Inputs/lld \
-// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=LLD-ARM-BAREMETAL %s
-
-// LLD-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
-// LLD-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// LLD-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-ARM-BAREMETAL: "-Bstatic" "-EL"
-// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
-// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// LLD-ARM-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL %s
-
-// C-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
-// C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL: "-Bstatic" "-EL"
-// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
-// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL-NOSYSROOT %s
-
-// C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
-// C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL %s
-
-// CXX-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/backward"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL: "-Bstatic" "-EL"
-// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
-// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT %s
-
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/backward"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-LIBCXX %s
-
-// CXX-ARM-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
-// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-LIBCXX: "-Bstatic" "-EL"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX %s
-
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-EL"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=compiler-rt \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-COMPILER-RT %s
-
-// ARM-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
-// ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
-// ARM-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a"
-// ARM-BAREMETAL-COMPILER-RT: "-lc" "-lgloss" "--end-group"
-// ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --unwindlib=libunwind \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-UNWINDLIB %s
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=compiler-rt --unwindlib=libunwind \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-UNWINDLIB %s
-
-// ARM-BAREMETAL-UNWINDLIB: "{{.*}}crt0.o"
-// ARM-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtbegin.o"
-// ARM-BAREMETAL-UNWINDLIB: "--start-group" "{{.*}}libclang_rt.builtins.a" "--as-needed" "-lunwind" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// ARM-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtend.o"
diff --git a/clang/test/Driver/baremetal-multilib.yaml b/clang/test/Driver/baremetal-multilib.yaml
index 1a80c3b4ccfc..853a4e9e36e4 100644
--- a/clang/test/Driver/baremetal-multilib.yaml
+++ b/clang/test/Driver/baremetal-multilib.yaml
@@ -8,9 +8,8 @@
 # CHECK-SAME: "-internal-isystem" "[[SYSROOT:[^"]*]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/include/c++/v1"
 # CHECK-SAME: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/include"
 # CHECK-SAME: "-x" "c++" "{{.*}}baremetal-multilib.yaml"
-# CHECK-NEXT: ld{{(.exe)?}}" "-Bstatic"
+# CHECK-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 # CHECK-SAME: "-L[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/lib"
-# CHECK-SAME: "{{.*}}.o"
 # CHECK-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 # CHECK-SAME: "-lc"
 # CHECK-SAME: "-o" "{{.*}}.tmp.out"
diff --git a/clang/test/Driver/baremetal-sysroot.cpp b/clang/test/Driver/baremetal-sysroot.cpp
index 47f0616df850..5d5b336a01b0 100644
--- a/clang/test/Driver/baremetal-sysroot.cpp
+++ b/clang/test/Driver/baremetal-sysroot.cpp
@@ -9,17 +9,15 @@
 // RUN: mkdir -p %T/baremetal_default_sysroot/lib/clang-runtimes/armv6m-none-eabi
 // RUN: ln -s %clang %T/baremetal_default_sysroot/bin/clang
 
-// RUN: %T/baremetal_default_sysroot/bin/clang -no-canonical-prefixes %s -### -o %t.out 2>&1 \
+// RUN: %T/baremetal_default_sysroot/bin/clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target armv6m-none-eabi --sysroot= \
 // RUN:   | FileCheck --check-prefix=CHECK-V6M-C %s
 // CHECK-V6M-C: "{{.*}}clang{{.*}}" "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // CHECK-V6M-C-SAME: "-internal-isystem" "{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-C-SAME: "-internal-isystem" "{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}include"
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal-sysroot.cpp"
-// CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "-Bstatic"
-// CHECK-V6M-C-SAME: "crt0.o"
+// CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-V6M-C-SAME: "-L{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "{{.*}}.o"
 // CHECK-V6M-C-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-C-SAME: "-lc"
-// CHECK-V6M-C-SAME: "-o" "{{.*}}.tmp.out"
+// CHECK-V6M-C-SAME: "-o" "{{.*}}.o"
diff --git a/clang/test/Driver/baremetal-undefined-symbols.c b/clang/test/Driver/baremetal-undefined-symbols.c
deleted file mode 100644
index bff58c7c54c3..000000000000
--- a/clang/test/Driver/baremetal-undefined-symbols.c
+++ /dev/null
@@ -1,14 +0,0 @@
-// Check the arguments are correctly passed
-
-// Make sure -T is the last with gcc-toolchain option
-// RUN: %clang -### --target=riscv32 --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-LD %s
-// CHECK-LD: {{.*}} "--defsym=FOO=10" {{.*}} "-u" "foo" {{.*}} "-T" "a.lds"
-
-// TODO: Merge this test with the above in the last patch when finally integrating riscv
-// Make sure -T is the last with gcc-toolchain option
-// RUN: %clang -### --target=aarch64-none-elf --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-ARM-LD %s
-// RUN: %clang -### --target=armv6m-none-eabi --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-ARM-LD %s
-// CHECK-ARM-LD: {{.*}} "-T" "a.lds" "-u" "foo" {{.*}} "--defsym=FOO=10"
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index b75f1a9280d1..a80aa9b43711 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -15,12 +15,11 @@
 // CHECK-V6M-C-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-C-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-C-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-V6M-C-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "{{.*}}.o"
-// CHECK-V6M-C-SAME: {{[^"]*}}libclang_rt.builtins.a"
+// CHECK-V6M-C-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-C-SAME: "-lc"
 // CHECK-V6M-C-SAME: "--target2=rel" "-o" "{{.*}}.tmp.out"
 
@@ -40,10 +39,9 @@
 // CHECK-V6M-TREE-SAME: {{^}} "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-TREE-SAME: "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}armv6m-unknown-none-eabi"
 // CHECK-V6M-TREE-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-TREE-SAME: "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi{{[/\\]+}}crt0.o"
 // CHECK-V6M-TREE-SAME: "-L[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi"
-// CHECK-V6M-TREE-SAME "{{.*}}.o"
 // CHECK-V6M-TREE-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-TREE-SAME: "-lc"
 // CHECK-V6M-TREE-SAME: "--target2=rel" "-o" "{{.*}}.tmp.out"
@@ -55,21 +53,19 @@
 // CHECK-ARMV7M-PER-TARGET: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-ARMV7M-PER-TARGET: "-isysroot" "[[SYSROOT:[^"]*]]"
 // CHECK-ARMV7M-PER-TARGET: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-ARMV7M-PER_TARGET: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-ARMV7M-PER-TARGET: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
 // CHECK-ARMV7M-PER-TARGET: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}armv7m-vendor-none-eabi
-// CHECK-ARMV7M-PER-TARGET: "{{.*}}.o"
 // CHECK-ARMV7M-PER-TARGET: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-ARMV7M-PER-TARGET: "-lc"
 
 // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \
 // RUN:     --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-DEFAULTCXX %s
 // CHECK-V6M-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-DEFAULTCXX-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-DEFAULTCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-lc++"
 // CHECK-V6M-DEFAULTCXX-SAME: "-lm"
 // CHECK-V6M-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
@@ -81,9 +77,8 @@
 // CHECK-V6M-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-V6M-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-SAME: "-lc++"
 // CHECK-V6M-LIBCXX-SAME: "-lm"
 // CHECK-V6M-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
@@ -97,9 +92,8 @@
 // CHECK-V6M-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}6.0.0"
-// CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-V6M-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lc"
@@ -110,7 +104,7 @@
 // RUN:     -nodefaultlibs \
 // RUN:   | FileCheck --check-prefix=CHECK-V6M-NDL %s
 // CHECK-V6M-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-V6M-NDL: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 
 // RUN: rm -rf %T/baremetal_cxx_sysroot
@@ -125,7 +119,6 @@
 // CHECK-V6M-LIBCXX-USR-SAME: "-internal-isystem" "{{[^"]+}}baremetal_cxx_sysroot{{[/\\]+}}usr{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBCXX-USR: "{{[^"]*}}-Bstatic"
 // CHECK-V6M-LIBCXX-USR-SAME: "-L{{[^"]*}}{{[/\\]+}}baremetal_cxx_sysroot{{[/\\]+}}lib"
-// CHECK-V6M-LIBCXX-USR: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-USR-SAME: "-lc++" "-lm"
 // CHECK-V6M-LIBCXX-USR-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-LIBCXX-USR-SAME: "-lc"
@@ -156,7 +149,7 @@
 
 // RUN: %clang -### %s --target=armebv7-none-eabi --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
-// CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "--be8" "-EB"
+// CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "--be8" "-EB"
 
 // RUN: %clang -### %s --target=armv7-none-eabi -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
@@ -166,7 +159,7 @@
 
 // RUN: %clang -### %s --target=armv7-none-eabi --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EL %s
-// CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-ARMV7EL-NOT: "--be8"
 
 // RUN: %clang -### %s --target=armebv7-none-eabi -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -177,7 +170,7 @@
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64BE %s
-// CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EB"
+// CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EB"
 // CHECK-AARCH64BE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64-none-elf -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -188,7 +181,7 @@
 
 // RUN: %clang -### %s --target=aarch64-none-elf --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64LE %s
-// CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-AARCH64LE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -203,22 +196,6 @@
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 
-// RUN: %clang -no-canonical-prefixes %s -### --target=riscv32-unknown-elf 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-RISCV32-NO-HOST-INC %s
-// CHECK-RISCV32-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
-// CHECK-RISCV32-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
-// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
-// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-
-// RUN: %clang -no-canonical-prefixes %s -### --target=riscv64-unknown-elf 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-RISCV64-NO-HOST-INC %s
-// CHECK-RISCV64-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
-// CHECK-RISCV64-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
-// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
-// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-
 // RUN: %clang %s -### --target=riscv64-unknown-elf -o %t.out -L some/directory/user/asked/for \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64 %s
@@ -228,10 +205,9 @@
 // CHECK-RV64-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-RV64-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV64-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV64-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-RV64-SAME:"{{.*}}.o"
 // CHECK-RV64-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-SAME: "-lc"
 // CHECK-RV64-SAME: "-X" "-o" "{{.*}}.tmp.out"
@@ -240,9 +216,8 @@
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64-DEFAULTCXX %s
 // CHECK-RV64-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-DEFAULTCXX-SAME:"{{.*}}.o"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc"
@@ -255,9 +230,8 @@
 // CHECK-RV64-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-RV64-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-LIBCXX-SAME:"{{.*}}.o"
 // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBCXX-SAME: "-lc"
@@ -270,9 +244,8 @@
 // CHECK-RV64-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
-// CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV64-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lc"
@@ -288,10 +261,9 @@
 // CHECK-RV32-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-RV32-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV32-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-RV32-SAME: "{{.*}}.o"
 // CHECK-RV32-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-SAME: "-lc"
 // CHECK-RV32-SAME: "-X" "-o" "a.out"
@@ -300,9 +272,8 @@
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
 // RUN:   | FileCheck --check-prefix=CHECK-RV32-DEFAULTCXX %s
 // CHECK-RV32-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV32-DEFAULTCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc"
@@ -315,9 +286,8 @@
 // CHECK-RV32-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV32-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-RV32-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV32-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-LIBCXX-SAME: "-X" "-o" "a.out"
@@ -329,13 +299,11 @@
 // CHECK-RV32-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV32-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
-// CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV32-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV32-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
-// CHECK-RV32-LIBSTDCXX-SAME: "-lc"
-// CHECK-RV32-LIBSTDCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-X" "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     -nostdlibinc -nobuiltininc \
@@ -352,7 +320,7 @@
 // RUN:     -nodefaultlibs \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64-NDL %s
 // CHECK-RV64-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV64-NDL: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
@@ -371,7 +339,7 @@
 // CHECK-RV64FD-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64FD-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}include"
 // CHECK-RV64FD-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64FD-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -390,7 +358,7 @@
 // CHECK-RV32I-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32I-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32I-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32I-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32I-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32I-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -409,7 +377,7 @@
 // CHECK-RV32IM-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IM-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32IM-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32IM-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -423,7 +391,7 @@
 // CHECK-RV32IAC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IAC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32IAC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32IAC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf -march=rv32imafc -mabi=ilp32f \
@@ -444,7 +412,7 @@
 // CHECK-RV32IMAFC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IMAFC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}include"
 // CHECK-RV32IMAFC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32IMAFC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}lib"
 
 // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc-unknown-eabi 2>&1 \
@@ -455,9 +423,8 @@
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPCEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPCEABI-SAME:"{{.*}}.o"
 // CHECK-PPCEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPCEABI-SAME: "-lc"
 // CHECK-PPCEABI-SAME: "-o" "a.out"
@@ -470,9 +437,8 @@
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPC64EABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPC64EABI-SAME:"{{.*}}.o"
 // CHECK-PPC64EABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPC64EABI-SAME: "-lc"
 // CHECK-PPC64EABI-SAME: "-o" "a.out"
@@ -485,9 +451,8 @@
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPCLEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPCLEEABI-SAME:"{{.*}}.o"
 // CHECK-PPCLEEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPCLEEABI-SAME: "-lc"
 // CHECK-PPCLEEABI-SAME: "-o" "a.out"
@@ -500,9 +465,8 @@
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPC64LEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPC64LEEABI-SAME:"{{.*}}.o"
 // CHECK-PPC64LEEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPC64LEEABI-SAME: "-lc"
 // CHECK-PPC64LEEABI-SAME: "-o" "a.out"
diff --git a/clang/test/Driver/check-no-multlib-warning.c b/clang/test/Driver/check-no-multlib-warning.c
deleted file mode 100644
index 9a0d7cee450a..000000000000
--- a/clang/test/Driver/check-no-multlib-warning.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// UNSUPPORTED: system-windows
-
-
-// RUN: %clang --target=armv6m-none-eabi --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -### 2>&1 | FileCheck %s
-// RUN: %clang --target=aarch64-none-elf --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -### 2>&1 | FileCheck %s
-// RUN: %clang --target=riscv32-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv32_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
-// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv64_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
-
-// CHECK: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets
-// NOCHECK-NOT: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets
diff --git a/clang/test/Driver/riscv-args.c b/clang/test/Driver/riscv-args.c
new file mode 100644
index 000000000000..cab08e5b0f81
--- /dev/null
+++ b/clang/test/Driver/riscv-args.c
@@ -0,0 +1,6 @@
+// Check the arguments are correctly passed
+
+// Make sure -T is the last with gcc-toolchain option
+// RUN: %clang -### --target=riscv32 --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-LD %s
+// CHECK-LD: {{.*}} "--defsym=FOO=10" {{.*}} "-u" "foo" {{.*}} "-T" "a.lds"
diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index d2e4877e89d7..befd322d027c 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -1033,7 +1033,7 @@
 // RUN:     --target=riscv32-unknown-elf -fuse-ld=ld \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32
 // CHECK-SHADOWCALLSTACK-ELF-RISCV32-NOT: error:
-// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
+// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=riscv64-unknown-linux -fuse-ld=ld \

From a5a0d880736f5dc6a566374bc3b3ca0d86901510 Mon Sep 17 00:00:00 2001
From: Sam Clegg <sbc@chromium.org>
Date: Tue, 17 Jun 2025 14:07:16 -0700
Subject: [PATCH 0716/1322] [libc++] Remove trailing newline from
 _LIBCPP_ASSERTION_HANDLER calls (#143573)

This newline was originally added in https://reviews.llvm.org/D142184
but I think updating `__libcpp_verbose_abort` to add newline instead is
more consistent, and works for other callers of `_LIBCPP_VERBOSE_ABORT`.

The `_LIBCPP_ASSERTION_HANDLER` calls through to either
`_LIBCPP_VERBOSE_ABORT` macro or the `__builtin_verbose_trap`. From what
I can tell neither of these function expect a trailing newline (at least
none of the usage of `_LIBCPP_VERBOSE_ABORT` or `__builtin_verbose_trap`
that I can find include a trailing newline except `_LIBCPP_ASSERTION_HANDLER`).

I noticed this discrepancy when working on
https://github.com/emscripten-core/emscripten/pull/24543
---
 libcxx/include/__assert               | 4 ++--
 libcxx/src/verbose_abort.cpp          | 3 +++
 libcxx/test/support/check_assertion.h | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/libcxx/include/__assert b/libcxx/include/__assert
index 90eaa6023587..1bfed2890b79 100644
--- a/libcxx/include/__assert
+++ b/libcxx/include/__assert
@@ -20,8 +20,8 @@
 #define _LIBCPP_ASSERT(expression, message)                                                                            \
   (__builtin_expect(static_cast<bool>(expression), 1)                                                                  \
        ? (void)0                                                                                                       \
-       : _LIBCPP_ASSERTION_HANDLER(__FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(            \
-             expression) " failed: " message "\n"))
+       : _LIBCPP_ASSERTION_HANDLER(                                                                                    \
+             __FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(expression) " failed: " message))
 
 // WARNING: __builtin_assume can currently inhibit optimizations. Only add assumptions with a clear
 // optimization intent. See https://discourse.llvm.org/t/llvm-assume-blocks-optimization/71609 for a
diff --git a/libcxx/src/verbose_abort.cpp b/libcxx/src/verbose_abort.cpp
index 94bdb451dee7..efb7b9be6f61 100644
--- a/libcxx/src/verbose_abort.cpp
+++ b/libcxx/src/verbose_abort.cpp
@@ -30,6 +30,9 @@ _LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) noexcept {
     va_list list;
     va_start(list, format);
     std::vfprintf(stderr, format, list);
+    // Callers of `__libcpp_verbose_abort` do not include a newline but when
+    // writing the message to stderr we need to include one.
+    std::fputc('\n', stderr);
     va_end(list);
   }
 
diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index a279400d651b..ea04944ea932 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -340,7 +340,7 @@ void std::__libcpp_verbose_abort(char const* format, ...) noexcept {
 
   std::fprintf(stderr, "%s\n", Marker);
   std::vfprintf(stderr, format, args);
-  std::fprintf(stderr, "%s", Marker);
+  std::fprintf(stderr, "\n%s", Marker);
 
   va_end(args);
 

From 844e41c2acedd5219d9363e38838abd5146f63c0 Mon Sep 17 00:00:00 2001
From: sribee8 <sriya.pratipati@gmail.com>
Date: Tue, 17 Jun 2025 14:12:35 -0700
Subject: [PATCH 0717/1322] [libc] Moved shared constexpr to the top (#144569)

Some conversions shared constexpr so moved to the top.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 .../src/__support/wchar/character_converter.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3b9046dfb9a7..5ab0447bb08b 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -19,6 +19,13 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
+// This is for utf-8 bytes other than the first byte
+constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+// The number of bits per utf-8 byte that actually encode character
+// Information not metadata (# of bits excluding the byte headers)
+constexpr uint32_t MASK_ENCODED_BITS =
+    mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
 void CharacterConverter::clear() {
@@ -61,10 +68,8 @@ int CharacterConverter::push(char8_t utf8_byte) {
   }
   // Any subsequent push
   // Adding 6 more bits so need to left shift
-  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
   if (num_ones == 1 && !isComplete()) {
-    char32_t byte =
-        utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+    char32_t byte = utf8_byte & MASK_ENCODED_BITS;
     state->partial = state->partial << ENCODED_BITS_PER_UTF8;
     state->partial |= byte;
     state->bytes_processed++;
@@ -117,12 +122,6 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
   constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
 
-  // the number of bits per utf-8 byte that actually encode character
-  // information not metadata (# of bits excluding the byte headers)
-  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
-  constexpr int MASK_ENCODED_BITS =
-      mask_trailing_ones<unsigned int, ENCODED_BITS_PER_UTF8>();
-
   char32_t output;
 
   // Shift to get the next 6 bits from the utf32 encoding

From 6fb36db4818abde56e5da47899dcdaacd8293903 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Tue, 17 Jun 2025 16:16:37 -0500
Subject: [PATCH 0718/1322] [LinkerWrapper] Fix 'save-temps' when targeting
 SPIR-V (#144605)

Summary:
The logic here is flawed, it was only intended to apply to the CPU case
where we use the linker passed in on the command line. This was falsely
applying to SPIR-V which caused issues.
---
 clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 7a1007d03737..0f1fa8b329fd 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -268,7 +268,8 @@ Expected<std::string> findProgram(StringRef Name, ArrayRef<StringRef> Paths) {
 bool linkerSupportsLTO(const ArgList &Args) {
   llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
   return Triple.isNVPTX() || Triple.isAMDGPU() ||
-         Args.getLastArgValue(OPT_linker_path_EQ).ends_with("lld");
+         (!Triple.isGPU() &&
+          Args.getLastArgValue(OPT_linker_path_EQ).ends_with("lld"));
 }
 
 /// Returns the hashed value for a constant string.

From 362b9d78b4ee9107da2b5e90b3764b0f0fa610fe Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn@google.com>
Date: Tue, 17 Jun 2025 14:42:06 -0700
Subject: [PATCH 0719/1322] [lldb-dap] Refactoring DebugCommunication to
 improve test consistency. (#143818)

In DebugCommunication, we currently are using 2 thread to drive
lldb-dap. At the moment, they make an attempt at only synchronizing the
`recv_packets` between the reader thread and the main test thread. Other
stateful properties of the debug session are not guarded by a
locks/mutex.

To mitigate this, I am moving any state updates to the main thread
inside the `_recv_packet` method to ensure that between calls to
`_recv_packet` the state does not change out from under us in a test.

This does mean the precise timing of events has changed slightly as a
result and I've updated the existing tests that fail for me locally with
this new behavior.

I think this should result in overall more predictable behavior, even if
the test is slow due to the host workload or architecture differences.

---------

Co-authored-by: Ebuka Ezike <yerimyah1@gmail.com>
---
 .../test/tools/lldb-dap/dap_server.py         | 871 +++++++++++-------
 .../test/tools/lldb-dap/lldbdap_testcase.py   |  79 +-
 .../breakpoint/TestDAP_setBreakpoints.py      |   5 +-
 .../tools/lldb-dap/cancel/TestDAP_cancel.py   |  10 +-
 .../tools/lldb-dap/launch/TestDAP_launch.py   |  12 +-
 .../tools/lldb-dap/module/TestDAP_module.py   |   2 +-
 .../tools/lldb-dap/output/TestDAP_output.py   |   4 +-
 7 files changed, 588 insertions(+), 395 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 6d32491eaa5e..23178a215206 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -10,17 +10,124 @@ import string
 import subprocess
 import signal
 import sys
+from dataclasses import dataclass
 import threading
 import time
-from typing import Any, Optional, Union, BinaryIO, TextIO
+from typing import (
+    IO,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Generic,
+    TypedDict,
+    Union,
+    BinaryIO,
+    TextIO,
+    Literal,
+    cast,
+)
 
 ## DAP type references
-Event = dict[str, Any]
-Request = dict[str, Any]
-Response = dict[str, Any]
+
+T = TypeVar("T")
+Te = TypeVar("Te")  # Generic type for event body
+Ta = TypeVar("Ta")  # Generic type for request arguments
+Tb = TypeVar("Tb")  # Generic type for response body
+
+
+class Event(Generic[Te], TypedDict):
+    type: Literal["event"]
+    seq: int
+    event: str
+    body: Optional[Te]
+
+
+class Request(Generic[Ta], TypedDict, total=False):
+    type: Literal["request"]
+    seq: int
+    command: str
+    arguments: Ta
+
+
+class Response(Generic[Tb], TypedDict):
+    type: Literal["response"]
+    seq: int
+    request_seq: int
+    success: bool
+    command: str
+    message: Optional[str]
+    body: Optional[Tb]
+
+
 ProtocolMessage = Union[Event, Request, Response]
 
 
+class AttachOrLaunchArguments(TypedDict, total=False):
+    stopOnEntry: bool
+    disableASLR: bool
+    disableSTDIO: bool
+    enableAutoVariableSummaries: bool
+    displayExtendedBacktrace: bool
+    enableSyntheticChildDebugging: bool
+    initCommands: List[str]
+    preRunCommands: List[str]
+    postRunCommands: List[str]
+    stopCommands: List[str]
+    exitCommands: List[str]
+    terminateCommands: List[str]
+    sourceMap: Union[List[Tuple[str, str]], Dict[str, str]]
+    sourcePath: str
+    debuggerRoot: str
+    commandEscapePrefix: str
+    customFrameFormat: str
+    customThreadFormat: str
+
+
+class LaunchArguments(AttachOrLaunchArguments, total=False):
+    program: str
+    args: List[str]
+    cwd: str
+    env: Dict[str, str]
+    shellExpandArguments: bool
+    runInTerminal: bool
+    launchCommands: List[str]
+
+
+# Using the function form of TypedDict to allow for hyphenated keys.
+AttachGdbServer = TypedDict(
+    "AttachGdbServer", {"gdb-remote-port": int, "gdb-remote-hostname": str}, total=False
+)
+
+
+class AttachArguments(AttachGdbServer, AttachOrLaunchArguments, total=False):
+    program: str
+    pid: int
+    waitFor: bool
+    attachCommands: List[str]
+    coreFile: str
+
+
+class BreakpointData(TypedDict, total=False):
+    column: int
+    condition: str
+    hitCondition: str
+    logMessage: str
+    mode: str
+
+
+class SourceBreakpoint(BreakpointData):
+    line: int
+
+
+class Breakpoint(TypedDict, total=False):
+    id: int
+    verified: bool
+
+
 def dump_memory(base_addr, data, num_per_line, outfile):
     data_len = len(data)
     hex_string = binascii.hexlify(data)
@@ -58,7 +165,9 @@ def dump_memory(base_addr, data, num_per_line, outfile):
         outfile.write("\n")
 
 
-def read_packet(f, verbose=False, trace_file=None):
+def read_packet(
+    f: IO[bytes], trace_file: Optional[IO[str]] = None
+) -> Optional[ProtocolMessage]:
     """Decode a JSON packet that starts with the content length and is
     followed by the JSON bytes from a file 'f'. Returns None on EOF.
     """
@@ -70,32 +179,20 @@ def read_packet(f, verbose=False, trace_file=None):
     prefix = "Content-Length: "
     if line.startswith(prefix):
         # Decode length of JSON bytes
-        if verbose:
-            print('content: "%s"' % (line))
         length = int(line[len(prefix) :])
-        if verbose:
-            print('length: "%u"' % (length))
         # Skip empty line
-        line = f.readline()
-        if verbose:
-            print('empty: "%s"' % (line))
+        line = f.readline().decode()
         # Read JSON bytes
         json_str = f.read(length)
-        if verbose:
-            print('json: "%s"' % (json_str))
         if trace_file:
-            trace_file.write("from adapter:\n%s\n" % (json_str))
+            trace_file.write(f"from adapter:\n{json_str!r}\n")
         # Decode the JSON bytes into a python dictionary
         return json.loads(json_str)
 
     raise Exception("unexpected malformed message from lldb-dap: " + line)
 
 
-def packet_type_is(packet, packet_type):
-    return "type" in packet and packet["type"] == packet_type
-
-
-def dump_dap_log(log_file):
+def dump_dap_log(log_file: Optional[str]) -> None:
     print("========= DEBUG ADAPTER PROTOCOL LOGS =========", file=sys.stderr)
     if log_file is None:
         print("no log file available", file=sys.stderr)
@@ -105,34 +202,30 @@ def dump_dap_log(log_file):
     print("========= END =========", file=sys.stderr)
 
 
-class Source(object):
+@dataclass
+class Source:
+    path: Optional[str]
+    source_reference: Optional[int]
+
+    @property
+    def name(self) -> Optional[str]:
+        if not self.path:
+            return None
+        return os.path.basename(self.path)
+
     def __init__(
         self, path: Optional[str] = None, source_reference: Optional[int] = None
     ):
-        self._name = None
-        self._path = None
-        self._source_reference = None
-
-        if path is not None:
-            self._name = os.path.basename(path)
-            self._path = path
-        elif source_reference is not None:
-            self._source_reference = source_reference
-        else:
+        if path is None and source_reference is None:
             raise ValueError("Either path or source_reference must be provided")
 
-    def __str__(self):
-        return f"Source(name={self.name}, path={self.path}), source_reference={self.source_reference})"
+        self.path = path
+        self.source_reference = source_reference
 
-    def as_dict(self):
-        source_dict = {}
-        if self._name is not None:
-            source_dict["name"] = self._name
-        if self._path is not None:
-            source_dict["path"] = self._path
-        if self._source_reference is not None:
-            source_dict["sourceReference"] = self._source_reference
-        return source_dict
+    def to_DAP(self) -> dict:
+        if self.path:
+            return {"path": self.path, "name": self.name}
+        return {"sourceReference": self.source_reference}
 
 
 class NotSupportedError(KeyError):
@@ -144,7 +237,7 @@ class DebugCommunication(object):
         self,
         recv: BinaryIO,
         send: BinaryIO,
-        init_commands: list[str],
+        init_commands: List[str],
         log_file: Optional[TextIO] = None,
     ):
         # For debugging test failures, try setting `trace_file = sys.stderr`.
@@ -152,35 +245,50 @@ class DebugCommunication(object):
         self.log_file = log_file
         self.send = send
         self.recv = recv
-        self.recv_packets: list[Optional[ProtocolMessage]] = []
-        self.recv_condition = threading.Condition()
-        self.recv_thread = threading.Thread(target=self._read_packet_thread)
-        self.process_event_body = None
-        self.exit_status: Optional[int] = None
-        self.capabilities: dict[str, Any] = {}
-        self.progress_events: list[Event] = []
-        self.reverse_requests = []
-        self.sequence = 1
-        self.threads = None
-        self.thread_stop_reasons = {}
-        self.recv_thread.start()
-        self.output_condition = threading.Condition()
-        self.output: dict[str, list[str]] = {}
-        self.configuration_done_sent = False
-        self.initialized = False
-        self.frame_scopes = {}
+        # Packets that have been received and processed but have not yet been
+        # requested by a test case.
+        self._pending_packets: List[Optional[ProtocolMessage]] = []
+        # Received packets that have not yet been processed.
+        self._recv_packets: List[Optional[ProtocolMessage]] = []
+        # Used as a mutex for _recv_packets and for notify when _recv_packets
+        # changes.
+        self._recv_condition = threading.Condition()
+        self._recv_thread = threading.Thread(target=self._read_packet_thread)
+
+        # session state
         self.init_commands = init_commands
-        self.resolved_breakpoints = {}
+        self.exit_status: Optional[int] = None
+        self.capabilities: Optional[Dict] = None
+        self.initialized: bool = False
+        self.configuration_done_sent: bool = False
+        self.process_event_body: Optional[Dict] = None
+        self.terminated: bool = False
+        self.events: List[Event] = []
+        self.progress_events: List[Event] = []
+        self.reverse_requests: List[Request] = []
+        self.module_events: List[Dict] = []
+        self.sequence: int = 1
+        self.output: Dict[str, str] = {}
+
+        # debuggee state
+        self.threads: Optional[dict] = None
+        self.thread_stop_reasons: Dict[str, Any] = {}
+        self.frame_scopes: Dict[str, Any] = {}
+        # keyed by breakpoint id
+        self.resolved_breakpoints: Dict[str, bool] = {}
+
+        # trigger enqueue thread
+        self._recv_thread.start()
 
     @classmethod
     def encode_content(cls, s: str) -> bytes:
         return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8")
 
     @classmethod
-    def validate_response(cls, command, response):
-        if command["command"] != response["command"]:
+    def validate_response(cls, request: Request, response: Response) -> None:
+        if request["command"] != response["command"]:
             raise ValueError("command mismatch in response")
-        if command["seq"] != response["request_seq"]:
+        if request["seq"] != response["request_seq"]:
             raise ValueError("seq mismatch in response")
 
     def _read_packet_thread(self):
@@ -189,262 +297,323 @@ class DebugCommunication(object):
             while not done:
                 packet = read_packet(self.recv, trace_file=self.trace_file)
                 # `packet` will be `None` on EOF. We want to pass it down to
-                # handle_recv_packet anyway so the main thread can handle unexpected
-                # termination of lldb-dap and stop waiting for new packets.
+                # handle_recv_packet anyway so the main thread can handle
+                # unexpected termination of lldb-dap and stop waiting for new
+                # packets.
                 done = not self._handle_recv_packet(packet)
         finally:
             dump_dap_log(self.log_file)
 
-    def get_modules(self):
-        module_list = self.request_modules()["body"]["modules"]
-        modules = {}
-        for module in module_list:
-            modules[module["name"]] = module
-        return modules
-
-    def get_output(self, category, timeout=0.0, clear=True):
-        self.output_condition.acquire()
-        output = None
-        if category in self.output:
-            output = self.output[category]
-            if clear:
-                del self.output[category]
-        elif timeout != 0.0:
-            self.output_condition.wait(timeout)
-            if category in self.output:
-                output = self.output[category]
-                if clear:
-                    del self.output[category]
-        self.output_condition.release()
-        return output
-
-    def collect_output(self, category, timeout_secs, pattern, clear=True):
-        end_time = time.time() + timeout_secs
-        collected_output = ""
-        while end_time > time.time():
-            output = self.get_output(category, timeout=0.25, clear=clear)
-            if output:
-                collected_output += output
-                if pattern is not None and pattern in output:
-                    break
-        return collected_output if collected_output else None
-
-    def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]):
-        self.recv_condition.acquire()
-        self.recv_packets.append(packet)
-        self.recv_condition.notify()
-        self.recv_condition.release()
-
     def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool:
-        """Called by the read thread that is waiting for all incoming packets
-        to store the incoming packet in "self.recv_packets" in a thread safe
-        way. This function will then signal the "self.recv_condition" to
-        indicate a new packet is available. Returns True if the caller
-        should keep calling this function for more packets.
+        """Handles an incoming packet.
+
+        Called by the read thread that is waiting for all incoming packets
+        to store the incoming packet in "self._recv_packets" in a thread safe
+        way. This function will then signal the "self._recv_condition" to
+        indicate a new packet is available.
+
+        Args:
+            packet: A new packet to store.
+
+        Returns:
+            True if the caller should keep calling this function for more
+            packets.
         """
-        # If EOF, notify the read thread by enqueuing a None.
-        if not packet:
-            self._enqueue_recv_packet(None)
-            return False
+        with self._recv_condition:
+            self._recv_packets.append(packet)
+            self._recv_condition.notify()
+            # packet is None on EOF
+            return packet is not None and not (
+                packet["type"] == "response" and packet["command"] == "disconnect"
+            )
 
-        # Check the packet to see if is an event packet
-        keepGoing = True
-        packet_type = packet["type"]
-        if packet_type == "event":
-            event = packet["event"]
-            body = None
-            if "body" in packet:
-                body = packet["body"]
-            # Handle the event packet and cache information from these packets
-            # as they come in
-            if event == "output":
-                # Store any output we receive so clients can retrieve it later.
-                category = body["category"]
-                output = body["output"]
-                self.output_condition.acquire()
-                if category in self.output:
-                    self.output[category] += output
-                else:
-                    self.output[category] = output
-                self.output_condition.notify()
-                self.output_condition.release()
-                # no need to add 'output' event packets to our packets list
-                return keepGoing
-            elif event == "initialized":
-                self.initialized = True
-            elif event == "process":
-                # When a new process is attached or launched, remember the
-                # details that are available in the body of the event
-                self.process_event_body = body
-            elif event == "exited":
-                # Process exited, mark the status to indicate the process is not
-                # alive.
-                self.exit_status = body["exitCode"]
-            elif event == "continued":
-                # When the process continues, clear the known threads and
-                # thread_stop_reasons.
-                all_threads_continued = body.get("allThreadsContinued", True)
-                tid = body["threadId"]
-                if tid in self.thread_stop_reasons:
-                    del self.thread_stop_reasons[tid]
-                self._process_continued(all_threads_continued)
-            elif event == "stopped":
-                # Each thread that stops with a reason will send a
-                # 'stopped' event. We need to remember the thread stop
-                # reasons since the 'threads' command doesn't return
-                # that information.
-                self._process_stopped()
-                tid = body["threadId"]
-                self.thread_stop_reasons[tid] = body
-            elif event.startswith("progress"):
-                # Progress events come in as 'progressStart', 'progressUpdate',
-                # and 'progressEnd' events. Keep these around in case test
-                # cases want to verify them.
-                self.progress_events.append(packet)
-            elif event == "breakpoint":
-                # Breakpoint events are sent when a breakpoint is resolved
-                self._update_verified_breakpoints([body["breakpoint"]])
-            elif event == "capabilities":
-                # Update the capabilities with new ones from the event.
-                self.capabilities.update(body["capabilities"])
+    def _recv_packet(
+        self,
+        *,
+        predicate: Optional[Callable[[ProtocolMessage], bool]] = None,
+        timeout: Optional[float] = None,
+    ) -> Optional[ProtocolMessage]:
+        """Processes received packets from the adapter.
 
-        elif packet_type == "response":
-            if packet["command"] == "disconnect":
-                keepGoing = False
-        self._enqueue_recv_packet(packet)
-        return keepGoing
+        Updates the DebugCommunication stateful properties based on the received
+        packets in the order they are received.
+
+        NOTE: The only time the session state properties should be updated is
+        during this call to ensure consistency during tests.
+
+        Args:
+            predicate:
+                Optional, if specified, returns the first packet that matches
+                the given predicate.
+            timeout:
+                Optional, if specified, processes packets until either the
+                timeout occurs or the predicate matches a packet, whichever
+                occurs first.
+
+        Returns:
+            The first matching packet for the given predicate, if specified,
+            otherwise None.
+        """
+        assert (
+            threading.current_thread != self._recv_thread
+        ), "Must not be called from the _recv_thread"
+
+        def process_until_match():
+            self._process_recv_packets()
+            for i, packet in enumerate(self._pending_packets):
+                if packet is None:
+                    # We need to return a truthy value to break out of the
+                    # wait_for, use `EOFError` as an indicator of EOF.
+                    return EOFError()
+                if predicate and predicate(packet):
+                    self._pending_packets.pop(i)
+                    return packet
+
+        with self._recv_condition:
+            packet = self._recv_condition.wait_for(process_until_match, timeout)
+            return None if isinstance(packet, EOFError) else packet
+
+    def _process_recv_packets(self) -> None:
+        """Process received packets, updating the session state."""
+        with self._recv_condition:
+            for packet in self._recv_packets:
+                # Handle events that may modify any stateful properties of
+                # the DAP session.
+                if packet and packet["type"] == "event":
+                    self._handle_event(packet)
+                elif packet and packet["type"] == "request":
+                    # Handle reverse requests and keep processing.
+                    self._handle_reverse_request(packet)
+                # Move the packet to the pending queue.
+                self._pending_packets.append(packet)
+            self._recv_packets.clear()
+
+    def _handle_event(self, packet: Event) -> None:
+        """Handle any events that modify debug session state we track."""
+        event = packet["event"]
+        body: Optional[Dict] = packet.get("body", None)
+
+        if event == "output" and body:
+            # Store any output we receive so clients can retrieve it later.
+            category = body["category"]
+            output = body["output"]
+            if category in self.output:
+                self.output[category] += output
+            else:
+                self.output[category] = output
+        elif event == "initialized":
+            self.initialized = True
+        elif event == "process":
+            # When a new process is attached or launched, remember the
+            # details that are available in the body of the event
+            self.process_event_body = body
+        elif event == "exited" and body:
+            # Process exited, mark the status to indicate the process is not
+            # alive.
+            self.exit_status = body["exitCode"]
+        elif event == "continued" and body:
+            # When the process continues, clear the known threads and
+            # thread_stop_reasons.
+            all_threads_continued = body.get("allThreadsContinued", True)
+            tid = body["threadId"]
+            if tid in self.thread_stop_reasons:
+                del self.thread_stop_reasons[tid]
+            self._process_continued(all_threads_continued)
+        elif event == "stopped" and body:
+            # Each thread that stops with a reason will send a
+            # 'stopped' event. We need to remember the thread stop
+            # reasons since the 'threads' command doesn't return
+            # that information.
+            self._process_stopped()
+            tid = body["threadId"]
+            self.thread_stop_reasons[tid] = body
+        elif event.startswith("progress"):
+            # Progress events come in as 'progressStart', 'progressUpdate',
+            # and 'progressEnd' events. Keep these around in case test
+            # cases want to verify them.
+            self.progress_events.append(packet)
+        elif event == "breakpoint" and body:
+            # Breakpoint events are sent when a breakpoint is resolved
+            self._update_verified_breakpoints([body["breakpoint"]])
+        elif event == "capabilities" and body:
+            if self.capabilities is None:
+                self.capabilities = {}
+            # Update the capabilities with new ones from the event.
+            self.capabilities.update(body["capabilities"])
+
+    def _handle_reverse_request(self, request: Request) -> None:
+        if request in self.reverse_requests:
+            return
+        self.reverse_requests.append(request)
+        arguments = request.get("arguments")
+        if request["command"] == "runInTerminal" and arguments is not None:
+            in_shell = arguments.get("argsCanBeInterpretedByShell", False)
+            proc = subprocess.Popen(
+                arguments["args"],
+                env=arguments.get("env", {}),
+                cwd=arguments["cwd"],
+                stdin=subprocess.DEVNULL,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                shell=in_shell,
+            )
+            body = {}
+            if in_shell:
+                body["shellProcessId"] = proc.pid
+            else:
+                body["processId"] = proc.pid
+            self.send_packet(
+                {
+                    "type": "response",
+                    "seq": 0,
+                    "request_seq": request["seq"],
+                    "success": True,
+                    "command": "runInTerminal",
+                    "message": None,
+                    "body": body,
+                }
+            )
+        elif request["command"] == "startDebugging":
+            self.send_packet(
+                {
+                    "type": "response",
+                    "seq": 0,
+                    "request_seq": request["seq"],
+                    "success": True,
+                    "message": None,
+                    "command": "startDebugging",
+                    "body": {},
+                }
+            )
+        else:
+            desc = 'unknown reverse request "%s"' % (request["command"])
+            raise ValueError(desc)
 
     def _process_continued(self, all_threads_continued: bool):
         self.frame_scopes = {}
         if all_threads_continued:
             self.thread_stop_reasons = {}
 
-    def _update_verified_breakpoints(self, breakpoints: list[Event]):
-        for breakpoint in breakpoints:
-            if "id" in breakpoint:
-                self.resolved_breakpoints[str(breakpoint["id"])] = breakpoint.get(
-                    "verified", False
-                )
+    def _update_verified_breakpoints(self, breakpoints: list[Breakpoint]):
+        for bp in breakpoints:
+            # If no id is set, we cannot correlate the given breakpoint across
+            # requests, ignore it.
+            if "id" not in bp:
+                continue
 
-    def send_packet(self, command_dict: Request, set_sequence=True):
+            self.resolved_breakpoints[str(bp["id"])] = bp.get("verified", False)
+
+    def _send_recv(self, request: Request[Ta]) -> Optional[Response[Tb]]:
+        """Send a command python dictionary as JSON and receive the JSON
+        response. Validates that the response is the correct sequence and
+        command in the reply. Any events that are received are added to the
+        events list in this object"""
+        seq = self.send_packet(request)
+        response = self.receive_response(seq)
+        if response is None:
+            raise ValueError(f"no response for {request!r}")
+        self.validate_response(request, response)
+        return response
+
+    def send_packet(self, packet: ProtocolMessage) -> int:
         """Take the "command_dict" python dictionary and encode it as a JSON
         string and send the contents as a packet to the VSCode debug
-        adapter"""
-        # Set the sequence ID for this command automatically
-        if set_sequence:
-            command_dict["seq"] = self.sequence
+        adapter.
+
+        Returns the seq of the packet."""
+        # Set the seq for requests.
+        if packet["type"] == "request":
+            packet["seq"] = self.sequence
             self.sequence += 1
+        else:
+            packet["seq"] = 0
+
         # Encode our command dictionary as a JSON string
-        json_str = json.dumps(command_dict, separators=(",", ":"))
+        json_str = json.dumps(packet, separators=(",", ":"))
+
         if self.trace_file:
             self.trace_file.write("to adapter:\n%s\n" % (json_str))
+
         length = len(json_str)
         if length > 0:
             # Send the encoded JSON packet and flush the 'send' file
             self.send.write(self.encode_content(json_str))
             self.send.flush()
 
-    def recv_packet(
+        return packet["seq"]
+
+    def receive_response(self, seq: int) -> Optional[Response]:
+        """Waits for the a response with the associated request_sec."""
+
+        def predicate(p: ProtocolMessage):
+            return p["type"] == "response" and p["request_seq"] == seq
+
+        return cast(Optional[Response], self._recv_packet(predicate=predicate))
+
+    def get_modules(self):
+        modules = {}
+        resp = self.request_modules()
+        if resp["success"]:
+            module_list = resp["body"]["modules"]
+            for module in module_list:
+                modules[module["name"]] = module
+        else:
+            raise ValueError(f"request_modules failed: {resp!r}")
+        return modules
+
+    def get_output(self, category: str, clear=True) -> str:
+        output = ""
+        if category in self.output:
+            output = self.output.get(category, "")
+            if clear:
+                del self.output[category]
+        return output
+
+    def collect_output(
         self,
-        filter_type: Optional[str] = None,
-        filter_event: Optional[Union[str, list[str]]] = None,
-        timeout: Optional[float] = None,
-    ) -> Optional[ProtocolMessage]:
-        """Get a JSON packet from the VSCode debug adapter. This function
-        assumes a thread that reads packets is running and will deliver
-        any received packets by calling handle_recv_packet(...). This
-        function will wait for the packet to arrive and return it when
-        it does."""
-        while True:
-            try:
-                self.recv_condition.acquire()
-                packet = None
-                while True:
-                    for i, curr_packet in enumerate(self.recv_packets):
-                        if not curr_packet:
-                            raise EOFError
-                        packet_type = curr_packet["type"]
-                        if filter_type is None or packet_type in filter_type:
-                            if filter_event is None or (
-                                packet_type == "event"
-                                and curr_packet["event"] in filter_event
-                            ):
-                                packet = self.recv_packets.pop(i)
-                                break
-                    if packet:
-                        break
-                    # Sleep until packet is received
-                    len_before = len(self.recv_packets)
-                    self.recv_condition.wait(timeout)
-                    len_after = len(self.recv_packets)
-                    if len_before == len_after:
-                        return None  # Timed out
-                return packet
-            except EOFError:
-                return None
-            finally:
-                self.recv_condition.release()
+        category: str,
+        timeout_secs: float,
+        pattern: Optional[str] = None,
+        clear=True,
+    ) -> str:
+        """Collect output from 'output' events.
 
-    def send_recv(self, command):
-        """Send a command python dictionary as JSON and receive the JSON
-        response. Validates that the response is the correct sequence and
-        command in the reply. Any events that are received are added to the
-        events list in this object"""
-        self.send_packet(command)
-        done = False
-        while not done:
-            response_or_request = self.recv_packet(filter_type=["response", "request"])
-            if response_or_request is None:
-                desc = 'no response for "%s"' % (command["command"])
-                raise ValueError(desc)
-            if response_or_request["type"] == "response":
-                self.validate_response(command, response_or_request)
-                return response_or_request
-            else:
-                self.reverse_requests.append(response_or_request)
-                if response_or_request["command"] == "runInTerminal":
-                    subprocess.Popen(
-                        response_or_request["arguments"]["args"],
-                        env=response_or_request["arguments"]["env"],
-                    )
-                    self.send_packet(
-                        {
-                            "type": "response",
-                            "request_seq": response_or_request["seq"],
-                            "success": True,
-                            "command": "runInTerminal",
-                            "body": {},
-                        },
-                    )
-                elif response_or_request["command"] == "startDebugging":
-                    self.send_packet(
-                        {
-                            "type": "response",
-                            "request_seq": response_or_request["seq"],
-                            "success": True,
-                            "command": "startDebugging",
-                            "body": {},
-                        },
-                    )
-                else:
-                    desc = 'unknown reverse request "%s"' % (
-                        response_or_request["command"]
-                    )
-                    raise ValueError(desc)
+        Args:
+            category: The category to collect.
+            timeout_secs: The max duration for collecting output.
+            pattern:
+                Optional, if set, return once this pattern is detected in the
+                collected output.
 
-        return None
+        Returns:
+            The collected output.
+        """
+        deadline = time.monotonic() + timeout_secs
+        output = self.get_output(category, clear)
+        while deadline >= time.monotonic() and (
+            pattern is None or pattern not in output
+        ):
+            event = self.wait_for_event(["output"], timeout=deadline - time.monotonic())
+            if not event:  # Timeout or EOF
+                break
+            output += self.get_output(category, clear=clear)
+        return output
 
     def wait_for_event(
-        self, filter: Union[str, list[str]], timeout: Optional[float] = None
+        self, filter: List[str] = [], timeout: Optional[float] = None
     ) -> Optional[Event]:
         """Wait for the first event that matches the filter."""
-        return self.recv_packet(
-            filter_type="event", filter_event=filter, timeout=timeout
+
+        def predicate(p: ProtocolMessage):
+            return p["type"] == "event" and p["event"] in filter
+
+        return cast(
+            Optional[Event], self._recv_packet(predicate=predicate, timeout=timeout)
         )
 
     def wait_for_stopped(
         self, timeout: Optional[float] = None
-    ) -> Optional[list[Event]]:
+    ) -> Optional[List[Event]]:
         stopped_events = []
         stopped_event = self.wait_for_event(
             filter=["stopped", "exited"], timeout=timeout
@@ -463,9 +632,9 @@ class DebugCommunication(object):
         return stopped_events
 
     def wait_for_breakpoint_events(self, timeout: Optional[float] = None):
-        breakpoint_events: list[Event] = []
+        breakpoint_events: List[Event] = []
         while True:
-            event = self.wait_for_event("breakpoint", timeout=timeout)
+            event = self.wait_for_event(["breakpoint"], timeout=timeout)
             if not event:
                 break
             breakpoint_events.append(event)
@@ -476,20 +645,26 @@ class DebugCommunication(object):
     ):
         """Wait for all breakpoints to be verified. Return all unverified breakpoints."""
         while any(id not in self.resolved_breakpoints for id in breakpoint_ids):
-            breakpoint_event = self.wait_for_event("breakpoint", timeout=timeout)
+            breakpoint_event = self.wait_for_event(["breakpoint"], timeout=timeout)
             if breakpoint_event is None:
                 break
 
-        return [id for id in breakpoint_ids if id not in self.resolved_breakpoints]
+        return [
+            id
+            for id in breakpoint_ids
+            if id not in self.resolved_breakpoints and not self.resolved_breakpoints[id]
+        ]
 
     def wait_for_exited(self, timeout: Optional[float] = None):
-        event_dict = self.wait_for_event("exited", timeout=timeout)
+        event_dict = self.wait_for_event(["exited"], timeout=timeout)
         if event_dict is None:
             raise ValueError("didn't get exited event")
         return event_dict
 
     def wait_for_terminated(self, timeout: Optional[float] = None):
-        event_dict = self.wait_for_event("terminated", timeout)
+        if self.terminated:
+            raise ValueError("already terminated")
+        event_dict = self.wait_for_event(["terminated"], timeout)
         if event_dict is None:
             raise ValueError("didn't get terminated event")
         return event_dict
@@ -524,12 +699,10 @@ class DebugCommunication(object):
         if threadId is None:
             threadId = self.get_thread_id()
         if threadId is None:
-            print("invalid threadId")
             return None
         response = self.request_stackTrace(threadId, startFrame=frameIndex, levels=1)
         if response:
             return response["body"]["stackFrames"][0]
-        print("invalid response")
         return None
 
     def get_completions(self, text, frameId=None):
@@ -667,7 +840,7 @@ class DebugCommunication(object):
         gdbRemotePort: Optional[int] = None,
         gdbRemoteHostname: Optional[str] = None,
     ):
-        args_dict = {}
+        args_dict: AttachArguments = {}
         if pid is not None:
             args_dict["pid"] = pid
         if program is not None:
@@ -699,8 +872,12 @@ class DebugCommunication(object):
             args_dict["gdb-remote-port"] = gdbRemotePort
         if gdbRemoteHostname is not None:
             args_dict["gdb-remote-hostname"] = gdbRemoteHostname
-        command_dict = {"command": "attach", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        command_dict: Request = {
+            "command": "attach",
+            "type": "request",
+            "arguments": args_dict,
+        }
+        return self._send_recv(command_dict)
 
     def request_breakpointLocations(
         self, file_path, line, end_line=None, column=None, end_column=None
@@ -722,7 +899,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_configurationDone(self):
         command_dict = {
@@ -730,7 +907,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": {},
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if response:
             self.configuration_done_sent = True
             self.request_threads()
@@ -759,7 +936,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if response["success"]:
             self._process_continued(response["body"]["allThreadsContinued"])
         # Caller must still call wait_for_stopped.
@@ -776,7 +953,7 @@ class DebugCommunication(object):
         if restartArguments:
             command_dict["arguments"] = restartArguments
 
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         # Caller must still call wait_for_stopped.
         return response
 
@@ -792,7 +969,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_disassemble(
         self,
@@ -812,7 +989,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)["body"]["instructions"]
+        return self._send_recv(command_dict)["body"]["instructions"]
 
     def request_readMemory(self, memoryReference, offset, count):
         args_dict = {
@@ -825,7 +1002,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_evaluate(self, expression, frameIndex=0, threadId=None, context=None):
         stackFrame = self.get_stackFrame(frameIndex=frameIndex, threadId=threadId)
@@ -841,7 +1018,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_exceptionInfo(self, threadId=None):
         if threadId is None:
@@ -852,7 +1029,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_initialize(self, sourceInitFile=False):
         command_dict = {
@@ -873,7 +1050,7 @@ class DebugCommunication(object):
                 "$__lldb_sourceInitFile": sourceInitFile,
             },
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if response:
             if "body" in response:
                 self.capabilities = response["body"]
@@ -908,7 +1085,7 @@ class DebugCommunication(object):
         customFrameFormat: Optional[str] = None,
         customThreadFormat: Optional[str] = None,
     ):
-        args_dict = {"program": program}
+        args_dict: LaunchArguments = {"program": program}
         if args:
             args_dict["args"] = args
         if cwd:
@@ -955,15 +1132,19 @@ class DebugCommunication(object):
         args_dict["displayExtendedBacktrace"] = displayExtendedBacktrace
         if commandEscapePrefix is not None:
             args_dict["commandEscapePrefix"] = commandEscapePrefix
-        command_dict = {"command": "launch", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        command_dict: Request = {
+            "command": "launch",
+            "type": "request",
+            "arguments": args_dict,
+        }
+        return self._send_recv(command_dict)
 
     def request_next(self, threadId, granularity="statement"):
         if self.exit_status is not None:
             raise ValueError("request_continue called after process exited")
         args_dict = {"threadId": threadId, "granularity": granularity}
         command_dict = {"command": "next", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_stepIn(self, threadId, targetId, granularity="statement"):
         if self.exit_status is not None:
@@ -976,7 +1157,7 @@ class DebugCommunication(object):
             "granularity": granularity,
         }
         command_dict = {"command": "stepIn", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_stepInTargets(self, frameId):
         if self.exit_status is not None:
@@ -988,14 +1169,14 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_stepOut(self, threadId):
         if self.exit_status is not None:
             raise ValueError("request_stepOut called after process exited")
         args_dict = {"threadId": threadId}
         command_dict = {"command": "stepOut", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_pause(self, threadId=None):
         if self.exit_status is not None:
@@ -1004,49 +1185,47 @@ class DebugCommunication(object):
             threadId = self.get_thread_id()
         args_dict = {"threadId": threadId}
         command_dict = {"command": "pause", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_scopes(self, frameId):
         args_dict = {"frameId": frameId}
         command_dict = {"command": "scopes", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
-    def request_setBreakpoints(self, source: Source, line_array, data=None):
+    def request_setBreakpoints(
+        self,
+        source: Union[Source, str],
+        line_array: Optional[List[int]],
+        data: Optional[List[BreakpointData]] = None,
+    ):
         """data is array of parameters for breakpoints in line_array.
         Each parameter object is 1:1 mapping with entries in line_entry.
         It contains optional location/hitCondition/logMessage parameters.
         """
+        if isinstance(source, str):
+            source = Source(path=source)
         args_dict = {
-            "source": source.as_dict(),
+            "source": source.to_DAP(),
             "sourceModified": False,
         }
-        if line_array is not None:
+        if line_array:
             args_dict["lines"] = line_array
             breakpoints = []
             for i, line in enumerate(line_array):
-                breakpoint_data = None
+                breakpoint_data: BreakpointData = {}
                 if data is not None and i < len(data):
                     breakpoint_data = data[i]
-                bp = {"line": line}
-                if breakpoint_data is not None:
-                    if breakpoint_data.get("condition"):
-                        bp["condition"] = breakpoint_data["condition"]
-                    if breakpoint_data.get("hitCondition"):
-                        bp["hitCondition"] = breakpoint_data["hitCondition"]
-                    if breakpoint_data.get("logMessage"):
-                        bp["logMessage"] = breakpoint_data["logMessage"]
-                    if breakpoint_data.get("column"):
-                        bp["column"] = breakpoint_data["column"]
+                bp: SourceBreakpoint = {"line": line, **breakpoint_data}
                 breakpoints.append(bp)
             args_dict["breakpoints"] = breakpoints
 
-        command_dict = {
+        command_dict: Request = {
             "command": "setBreakpoints",
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
-        if response["success"]:
+        response = self._send_recv(command_dict)
+        if response and response["success"] and response["body"]:
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
 
@@ -1061,7 +1240,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_setFunctionBreakpoints(self, names, condition=None, hitCondition=None):
         breakpoints = []
@@ -1078,7 +1257,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if response["success"]:
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
@@ -1099,7 +1278,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_setDataBreakpoint(self, dataBreakpoints):
         """dataBreakpoints is a list of dictionary with following fields:
@@ -1116,7 +1295,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_compileUnits(self, moduleId):
         args_dict = {"moduleId": moduleId}
@@ -1125,7 +1304,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         return response
 
     def request_completions(self, text, frameId=None):
@@ -1137,10 +1316,10 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_modules(self):
-        return self.send_recv({"command": "modules", "type": "request"})
+        return self._send_recv({"command": "modules", "type": "request"})
 
     def request_stackTrace(
         self, threadId=None, startFrame=None, levels=None, format=None, dump=False
@@ -1159,7 +1338,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if dump:
             for idx, frame in enumerate(response["body"]["stackFrames"]):
                 name = frame["name"]
@@ -1185,7 +1364,7 @@ class DebugCommunication(object):
                 "sourceReference": sourceReference,
             },
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_threads(self):
         """Request a list of all threads and combine any information from any
@@ -1193,7 +1372,7 @@ class DebugCommunication(object):
         thread actually stopped. Returns an array of thread dictionaries
         with information about all threads"""
         command_dict = {"command": "threads", "type": "request", "arguments": {}}
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if not response["success"]:
             self.threads = None
             return response
@@ -1233,7 +1412,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_setVariable(self, containingVarRef, name, value, id=None):
         args_dict = {
@@ -1248,7 +1427,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_locations(self, locationReference):
         args_dict = {
@@ -1259,7 +1438,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_testGetTargetBreakpoints(self):
         """A request packet used in the LLDB test suite to get all currently
@@ -1271,12 +1450,12 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": {},
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def terminate(self):
         self.send.close()
-        if self.recv_thread.is_alive():
-            self.recv_thread.join()
+        if self._recv_thread.is_alive():
+            self._recv_thread.join()
 
     def request_setInstructionBreakpoints(self, memory_reference=[]):
         breakpoints = []
@@ -1291,7 +1470,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
 
 class DebugAdapterServer(DebugCommunication):
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index 3b54d598c350..8778b51e7c36 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -1,6 +1,6 @@
 import os
 import time
-from typing import Optional
+from typing import Optional, Callable
 import uuid
 
 import dap_server
@@ -121,11 +121,19 @@ class DAPTestCaseBase(TestBase):
             f"Expected to resolve all breakpoints. Unresolved breakpoint ids: {unresolved_breakpoints}",
         )
 
-    def waitUntil(self, condition_callback):
-        for _ in range(20):
-            if condition_callback():
+    def wait_until(
+        self,
+        predicate: Callable[[], bool],
+        delay: float = 0.5,
+        timeout: float = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """Repeatedly run the predicate until either the predicate returns True
+        or a timeout has occurred."""
+        deadline = time.monotonic() + timeout
+        while deadline > time.monotonic():
+            if predicate():
                 return True
-            time.sleep(0.5)
+            time.sleep(delay)
         return False
 
     def assertCapabilityIsSet(self, key: str, msg: Optional[str] = None) -> None:
@@ -144,6 +152,7 @@ class DAPTestCaseBase(TestBase):
         "breakpoint_ids" should be a list of breakpoint ID strings
         (["1", "2"]). The return value from self.set_source_breakpoints()
         or self.set_function_breakpoints() can be passed to this function"""
+        breakpoint_ids = [str(i) for i in breakpoint_ids]
         stopped_events = self.dap_server.wait_for_stopped(timeout)
         for stopped_event in stopped_events:
             if "body" in stopped_event:
@@ -155,22 +164,16 @@ class DAPTestCaseBase(TestBase):
                     and body["reason"] != "instruction breakpoint"
                 ):
                     continue
-                if "description" not in body:
+                if "hitBreakpointIds" not in body:
                     continue
-                # Descriptions for breakpoints will be in the form
-                # "breakpoint 1.1", so look for any description that matches
-                # ("breakpoint 1.") in the description field as verification
-                # that one of the breakpoint locations was hit. DAP doesn't
-                # allow breakpoints to have multiple locations, but LLDB does.
-                # So when looking at the description we just want to make sure
-                # the right breakpoint matches and not worry about the actual
-                # location.
-                description = body["description"]
-                for breakpoint_id in breakpoint_ids:
-                    match_desc = f"breakpoint {breakpoint_id}."
-                    if match_desc in description:
+                hit_breakpoint_ids = body["hitBreakpointIds"]
+                for bp in hit_breakpoint_ids:
+                    if str(bp) in breakpoint_ids:
                         return
-        self.assertTrue(False, f"breakpoint not hit, stopped_events={stopped_events}")
+        self.assertTrue(
+            False,
+            f"breakpoint not hit, wanted breakpoint_ids={breakpoint_ids} stopped_events={stopped_events}",
+        )
 
     def verify_stop_exception_info(self, expected_description, timeout=DEFAULT_TIMEOUT):
         """Wait for the process we are debugging to stop, and verify the stop
@@ -205,7 +208,9 @@ class DAPTestCaseBase(TestBase):
                     found = True
                     break
             self.assertTrue(
-                found, "verify '%s' found in console output for '%s'" % (cmd, flavor)
+                found,
+                "verify '%s' found in console output for '%s' in %s"
+                % (cmd, flavor, output),
             )
 
     def get_dict_value(self, d, key_path):
@@ -277,26 +282,30 @@ class DAPTestCaseBase(TestBase):
                         return (source["path"], stackFrame["line"])
         return ("", 0)
 
-    def get_stdout(self, timeout=0.0):
-        return self.dap_server.get_output("stdout", timeout=timeout)
+    def get_stdout(self):
+        return self.dap_server.get_output("stdout")
 
-    def get_console(self, timeout=0.0):
-        return self.dap_server.get_output("console", timeout=timeout)
+    def get_console(self):
+        return self.dap_server.get_output("console")
 
-    def get_important(self, timeout=0.0):
-        return self.dap_server.get_output("important", timeout=timeout)
+    def get_important(self):
+        return self.dap_server.get_output("important")
 
-    def collect_stdout(self, timeout_secs, pattern=None):
+    def collect_stdout(self, timeout_secs: float, pattern: Optional[str] = None) -> str:
         return self.dap_server.collect_output(
             "stdout", timeout_secs=timeout_secs, pattern=pattern
         )
 
-    def collect_console(self, timeout_secs, pattern=None):
+    def collect_console(
+        self, timeout_secs: float, pattern: Optional[str] = None
+    ) -> str:
         return self.dap_server.collect_output(
             "console", timeout_secs=timeout_secs, pattern=pattern
         )
 
-    def collect_important(self, timeout_secs, pattern=None):
+    def collect_important(
+        self, timeout_secs: float, pattern: Optional[str] = None
+    ) -> str:
         return self.dap_server.collect_output(
             "important", timeout_secs=timeout_secs, pattern=pattern
         )
@@ -355,7 +364,7 @@ class DAPTestCaseBase(TestBase):
             return self.dap_server.wait_for_stopped(timeout)
         return None
 
-    def do_continue(self):  # `continue` is a keyword.
+    def do_continue(self) -> None:  # `continue` is a keyword.
         resp = self.dap_server.request_continue()
         self.assertTrue(resp["success"], f"continue request failed: {resp}")
 
@@ -363,10 +372,14 @@ class DAPTestCaseBase(TestBase):
         self.do_continue()
         return self.dap_server.wait_for_stopped(timeout)
 
-    def continue_to_breakpoint(self, breakpoint_id: str, timeout=DEFAULT_TIMEOUT):
-        self.continue_to_breakpoints((breakpoint_id), timeout)
+    def continue_to_breakpoint(
+        self, breakpoint_id: int, timeout: Optional[float] = DEFAULT_TIMEOUT
+    ) -> None:
+        self.continue_to_breakpoints([breakpoint_id], timeout)
 
-    def continue_to_breakpoints(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
+    def continue_to_breakpoints(
+        self, breakpoint_ids: list[int], timeout: Optional[float] = DEFAULT_TIMEOUT
+    ) -> None:
         self.do_continue()
         self.verify_breakpoint_hit(breakpoint_ids, timeout)
 
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
index 831edd6494c1..a6eeee3a0254 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
@@ -78,7 +78,7 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
         self.assertFalse(breakpoint["verified"])
         self.assertEqual(other_basename, breakpoint["source"]["name"])
         self.assertEqual(new_other_path, breakpoint["source"]["path"])
-        other_breakpoint_id = breakpoint["id"]
+        other_breakpoint_id = str(breakpoint["id"])
 
         self.dap_server.request_continue()
         self.verify_breakpoint_hit([other_breakpoint_id])
@@ -379,7 +379,8 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
             self.assertEqual(breakpoint["line"], loop_line)
             self.assertEqual(breakpoint["column"], columns[index])
             self.assertTrue(breakpoint["verified"], "expect breakpoint verified")
-            breakpoint_ids.append(breakpoint["id"])
+            self.assertIn("id", breakpoint, "expected breakpoint id")
+            breakpoint_ids.append(str(breakpoint["id"]))
 
         # Continue to the first breakpoint,
         self.continue_to_breakpoints([breakpoint_ids[0]])
diff --git a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
index 824ed8fe3bb9..c750cff071a8 100644
--- a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
+++ b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
@@ -54,18 +54,18 @@ class TestDAP_cancel(lldbdap_testcase.DAPTestCaseBase):
         pending_seq = self.async_blocking_request(duration=self.DEFAULT_TIMEOUT / 2)
         cancel_seq = self.async_cancel(requestId=pending_seq)
 
-        blocking_resp = self.dap_server.recv_packet(filter_type=["response"])
+        blocking_resp = self.dap_server.receive_response(blocking_seq)
         self.assertEqual(blocking_resp["request_seq"], blocking_seq)
         self.assertEqual(blocking_resp["command"], "evaluate")
         self.assertEqual(blocking_resp["success"], True)
 
-        pending_resp = self.dap_server.recv_packet(filter_type=["response"])
+        pending_resp = self.dap_server.receive_response(pending_seq)
         self.assertEqual(pending_resp["request_seq"], pending_seq)
         self.assertEqual(pending_resp["command"], "evaluate")
         self.assertEqual(pending_resp["success"], False)
         self.assertEqual(pending_resp["message"], "cancelled")
 
-        cancel_resp = self.dap_server.recv_packet(filter_type=["response"])
+        cancel_resp = self.dap_server.receive_response(cancel_seq)
         self.assertEqual(cancel_resp["request_seq"], cancel_seq)
         self.assertEqual(cancel_resp["command"], "cancel")
         self.assertEqual(cancel_resp["success"], True)
@@ -86,13 +86,13 @@ class TestDAP_cancel(lldbdap_testcase.DAPTestCaseBase):
         )
         cancel_seq = self.async_cancel(requestId=blocking_seq)
 
-        blocking_resp = self.dap_server.recv_packet(filter_type=["response"])
+        blocking_resp = self.dap_server.receive_response(blocking_seq)
         self.assertEqual(blocking_resp["request_seq"], blocking_seq)
         self.assertEqual(blocking_resp["command"], "evaluate")
         self.assertEqual(blocking_resp["success"], False)
         self.assertEqual(blocking_resp["message"], "cancelled")
 
-        cancel_resp = self.dap_server.recv_packet(filter_type=["response"])
+        cancel_resp = self.dap_server.receive_response(cancel_seq)
         self.assertEqual(cancel_resp["request_seq"], cancel_seq)
         self.assertEqual(cancel_resp["command"], "cancel")
         self.assertEqual(cancel_resp["success"], True)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
index ae8142ae4f48..c29e0d3fa7b8 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
@@ -191,7 +191,7 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
         self.continue_to_exit()
         # Now get the STDOUT and verify our program argument is correct
         output = self.get_stdout()
-        self.assertEqual(output, None, "expect no program output")
+        self.assertEqual(output, "", "expect no program output")
 
     @skipIfWindows
     @skipIfLinux  # shell argument expansion doesn't seem to work on Linux
@@ -392,14 +392,14 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the first breakpoint was hit
         self.continue_to_breakpoints(breakpoint_ids)
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue again and hit the second breakpoint.
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the second breakpoint was hit
         self.continue_to_breakpoints(breakpoint_ids)
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue until the program exits
@@ -461,21 +461,21 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
         self.verify_commands("launchCommands", output, launchCommands)
         # Verify the "stopCommands" here
         self.continue_to_next_stop()
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue and hit the second breakpoint.
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the first breakpoint was hit
         self.continue_to_next_stop()
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue until the program exits
         self.continue_to_exit()
         # Get output from the console. This should contain both the
         # "exitCommands" that were run after the second breakpoint was hit
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("exitCommands", output, exitCommands)
 
     def test_failing_launch_commands(self):
diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
index 4fc221668a8e..b1823e4c8b1c 100644
--- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
+++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
@@ -54,7 +54,7 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
             return symbol_regex.match(program_module["symbolStatus"])
 
         if expect_debug_info_size:
-            self.waitUntil(checkSymbolsLoadedWithSize)
+            self.wait_until(checkSymbolsLoadedWithSize)
         active_modules = self.dap_server.get_modules()
         program_module = active_modules[program_basename]
         self.assertEqual(program_basename, program_module["name"])
diff --git a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
index 0425b55a5e55..4fcde623e382 100644
--- a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
+++ b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
@@ -37,14 +37,14 @@ class TestDAP_output(lldbdap_testcase.DAPTestCaseBase):
         # Disconnecting from the server to ensure any pending IO is flushed.
         self.dap_server.request_disconnect()
 
-        output += self.get_stdout(timeout=self.DEFAULT_TIMEOUT)
+        output += self.get_stdout()
         self.assertTrue(output and len(output) > 0, "expect program stdout")
         self.assertIn(
             "abcdefghi\r\nhello world\r\nfinally\0\0",
             output,
             "full stdout not found in: " + repr(output),
         )
-        console = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        console = self.get_console()
         self.assertTrue(console and len(console) > 0, "expect dap messages")
         self.assertIn(
             "out\0\0\r\nerr\0\0\r\n", console, f"full console message not found"

From 3f33c8482fc0b8dd0d2596262ebd0ed73d41665d Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@quicinc.com>
Date: Tue, 17 Jun 2025 15:27:41 -0700
Subject: [PATCH 0720/1322] [clang] Add release note for int->enum conversion
 change. (#144407)

This seems to be having some practical impact, so we should let people
know.
---
 clang/docs/ReleaseNotes.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6f28dbd03ca2..12816eed2e8b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -65,8 +65,10 @@ C++ Specific Potentially Breaking Changes
   standard library already have their own bespoke builtins.
 - A workaround for libstdc++4.7 has been removed. Note that 4.8.3 remains the oldest
   supported libstdc++ version.
-
 - Added ``!nonnull/!align`` metadata to load of references for better codegen.
+- Checking for int->enum conversions in constant expressions is more strict;
+  in particular, ``const E x = (E)-1;`` is not treated as a constant if it's
+  out of range. This impacts old versions of Boost.  (#GH143034)
 
 ABI Changes in This Version
 ---------------------------

From f25f2f7de4f8264d89ba3c4dc9daddb10a90c13f Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li@intel.com>
Date: Tue, 17 Jun 2025 15:46:35 -0700
Subject: [PATCH 0721/1322] [MLIR][XeGPU] Extend unrolling support for scatter
 ops with chunk_size (#144447)

Add support for load/store with chunk_size, which requires special
consideration for the operand blocking since offests and masks are
 n-D and tensor are n+1-D. Support operations including create_tdesc,
update_tdesc, load, store, and prefetch.

---------

Co-authored-by: Adam Siemieniuk <adam.siemieniuk@intel.com>
---
 .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp  | 176 ++++++++++----
 .../Dialect/XeGPU/xegpu-unroll-patterns.mlir  | 214 ++++++++++++------
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp |  49 ++--
 3 files changed, 315 insertions(+), 124 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 9c234c1e866b..0457f8128b90 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -402,30 +402,58 @@ struct UnrollCreateDescOp : public UnrollPattern<xegpu::CreateDescOp> {
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
     xegpu::TensorDescType tdescTy = op.getType();
+    TypedValue<::mlir::VectorType> indiceVec = op.getOffsets();
+    VectorType indiceVecTy = indiceVec.getType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (!tdescTy.isScattered())
       return failure();
 
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
     if (!targetShape)
       return failure();
 
+    SmallVector<int64_t> targetIndiceShape(*targetShape);
+    int64_t originalChunkSize = tdescTy.getChunkSize();
+    // IndiceVec is 1 dim lower than tdescTy when chunkSize is larger than 1.
+    if (originalChunkSize > 1)
+      targetIndiceShape.pop_back();
+
     auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
-
-    TypedValue<::mlir::VectorType> indiceVec = op.getOffsets();
-    VectorType indiceVecTy = indiceVec.getType();
-
     SmallVector<Type> convertedIndiceTypes =
-        getUnrolledTypes(indiceVecTy, *targetShape);
+        getUnrolledTypes(indiceVecTy, targetIndiceShape);
     SmallVector<Value> convertedIndiceVec =
-        pack(indiceVec, convertedIndiceTypes, *targetShape, loc, rewriter);
+        pack(indiceVec, convertedIndiceTypes, targetIndiceShape, loc, rewriter);
 
     SmallVector<Value> newOps;
-    for (auto indice : convertedIndiceVec) {
-      auto newOp = rewriter.create<xegpu::CreateDescOp>(loc, newTdescTy,
-                                                        op.getSource(), indice);
-      newOps.push_back(newOp);
+
+    // More indices is need when chunkSize > 1. Since a big load from one
+    // address could be break into multiple small loads.
+    if (originalChunkSize > 1) {
+      int64_t blockedChunkSize = targetShape->back();
+      int64_t numNewChunks = originalChunkSize / blockedChunkSize;
+
+      for (auto [indice, indiceType] :
+           llvm::zip(convertedIndiceVec, convertedIndiceTypes)) {
+        for (int64_t i = 0; i < numNewChunks; ++i) {
+          // Compute the offset
+          Value inc = rewriter.create<arith::ConstantIndexOp>(
+              loc, i * blockedChunkSize);
+          Value incVec = rewriter.create<vector::SplatOp>(loc, indiceType, inc);
+          Value offsetIndice =
+              rewriter.create<arith::AddIOp>(loc, indice, incVec);
+
+          auto newOp = rewriter.create<xegpu::CreateDescOp>(
+              loc, newTdescTy, op.getSource(), offsetIndice);
+
+          newOps.push_back(newOp);
+        }
+      }
+    } else {
+      for (auto indice : convertedIndiceVec) {
+        auto newOp = rewriter.create<xegpu::CreateDescOp>(
+            loc, newTdescTy, op.getSource(), indice);
+        newOps.push_back(newOp);
+      }
     }
 
     Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter);
@@ -444,16 +472,18 @@ struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
     VectorType valueTy = llvm::dyn_cast<VectorType>(op.getValue().getType());
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (!tdescTy.isScattered())
       return failure();
 
-    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
-
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
     if (!targetShape)
       return failure();
 
+    SmallVector<int64_t> targetMaskShape(*targetShape);
+    int64_t originalChunkSize = tdescTy.getChunkSize();
+
+    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
+
     Type elemTy = tdescTy.getElementType();
     VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
 
@@ -462,10 +492,29 @@ struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
     SmallVector<Value> convertedTdescs = pack(
         op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
 
-    SmallVector<Type> convertedMaskTypes =
-        getUnrolledTypes(maskTy, *targetShape);
-    SmallVector<Value> convertedMasks =
-        pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+    SmallVector<Type> convertedMaskTypes;
+    SmallVector<Value> convertedMasks;
+
+    if (originalChunkSize > 1) {
+      targetMaskShape.pop_back();
+      convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape);
+      SmallVector<Value> convertedMasks1D = pack(
+          op.getMask(), convertedMaskTypes, targetMaskShape, loc, rewriter);
+      int64_t blockedChunkSize = targetShape->back();
+      int64_t numNewChunks = originalChunkSize / blockedChunkSize;
+
+      for (auto mask : convertedMasks1D) {
+        for (int64_t i = 0; i < numNewChunks; ++i)
+          convertedMasks.push_back(mask);
+      }
+      // This is to handle the transpose effect when chunkSize > 1.
+      std::swap((*targetShape)[0], (*targetShape)[1]);
+      newValueTy = valueTy.cloneWith(*targetShape, elemTy);
+    } else {
+      convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape);
+      convertedMasks = pack(op.getMask(), convertedMaskTypes, targetMaskShape,
+                            loc, rewriter);
+    }
 
     SmallVector<Value> newOps;
     for (auto [t, m] : llvm::zip(convertedTdescs, convertedMasks)) {
@@ -476,7 +525,6 @@ struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
     }
 
     Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
-
     rewriter.replaceOp(op, castOp);
     return success();
   }
@@ -489,8 +537,7 @@ struct UnrollPrefetchOp : public UnrollPattern<xegpu::PrefetchOp> {
     Location loc = op.getLoc();
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (!tdescTy.isScattered())
       return failure();
 
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
@@ -519,30 +566,51 @@ struct UnrollStoreScatterOp : public UnrollPattern<xegpu::StoreScatterOp> {
     VectorType valueTy = llvm::dyn_cast<VectorType>(op.getValue().getType());
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (!tdescTy.isScattered())
       return failure();
 
-    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
-
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
     if (!targetShape)
       return failure();
 
-    SmallVector<Type> convertedValTypes =
-        getUnrolledTypes(valueTy, *targetShape);
+    SmallVector<int64_t> targetIndiceShape(*targetShape);
+    int64_t originalChunkSize = tdescTy.getChunkSize();
+
+    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
+
     SmallVector<Type> convertedTdescTypes =
         getUnrolledTypes(tdescTy, *targetShape);
-
-    SmallVector<Value> convertedValues =
-        pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
     SmallVector<Value> convertedTdescs = pack(
         op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
 
-    SmallVector<Type> convertedMaskTypes =
-        getUnrolledTypes(maskTy, *targetShape);
-    SmallVector<Value> convertedMasks =
-        pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+    SmallVector<Type> convertedMaskTypes;
+    SmallVector<Value> convertedMasks;
+
+    if (originalChunkSize > 1) {
+      int64_t blockedChunkSize = targetShape->back();
+      int64_t numNewChunks = originalChunkSize / blockedChunkSize;
+      convertedMaskTypes = getUnrolledTypes(maskTy, (*targetShape)[0]);
+      SmallVector<Value> convertedMasks1D = pack(
+          op.getMask(), convertedMaskTypes, (*targetShape)[0], loc, rewriter);
+
+      for (auto mask : convertedMasks1D) {
+        for (int64_t i = 0; i < numNewChunks; ++i) {
+          convertedMasks.push_back(mask);
+        }
+      }
+      // This is to handle the transpose effect when chunkSize > 1.
+      std::swap((*targetShape)[0], (*targetShape)[1]);
+
+    } else {
+      convertedMaskTypes = getUnrolledTypes(maskTy, *targetShape);
+      convertedMasks =
+          pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+    }
+
+    SmallVector<Type> convertedValTypes =
+        getUnrolledTypes(valueTy, *targetShape);
+    SmallVector<Value> convertedValues =
+        pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
 
     for (size_t i = 0; i < convertedValues.size(); ++i) {
       Value v = convertedValues[i];
@@ -565,8 +633,10 @@ struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
     Location loc = op.getLoc();
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (tdescTy.getRank() > 2)
+      return failure();
+
+    if (!tdescTy.isScattered())
       return failure();
 
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
@@ -580,12 +650,32 @@ struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
 
     TypedValue<::mlir::VectorType> offsetVec = op.getOffsets();
     VectorType offsetVecTy = offsetVec.getType();
-    SmallVector<Type> convertedOffsetTypes =
-        getUnrolledTypes(offsetVecTy, *targetShape);
-    SmallVector<Value> convertedOffsetVec =
-        pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter);
-
+    SmallVector<Type> convertedOffsetTypes;
+    SmallVector<Value> convertedOffsetVec;
     SmallVector<Value> newOps;
+    int64_t originalChunkSize = tdescTy.getChunkSize();
+    if (originalChunkSize > 1) {
+      SmallVector<int64_t> shape1D(targetShape->begin(),
+                                   targetShape->end() - 1);
+      convertedOffsetTypes = getUnrolledTypes(offsetVecTy, shape1D);
+      SmallVector<Value> convertedOffsetVec1D =
+          pack(offsetVec, convertedOffsetTypes, shape1D, loc, rewriter);
+
+      int64_t blockedChunkSize = targetShape->back();
+      int64_t numNewChunks = originalChunkSize / blockedChunkSize;
+
+      for (auto offset : convertedOffsetVec1D) {
+        for (int64_t i = 0; i < numNewChunks; ++i) {
+          convertedOffsetVec.push_back(offset);
+        }
+      }
+
+    } else {
+      convertedOffsetTypes = getUnrolledTypes(offsetVecTy, *targetShape);
+      convertedOffsetVec =
+          pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter);
+    }
+
     for (auto [t, o] : llvm::zip(convertedTdesc, convertedOffsetVec)) {
       auto newOp =
           rewriter.create<xegpu::UpdateOffsetOp>(loc, t.getType(), t, o);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index 52ec3b856da4..41414d802f21 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -2,7 +2,7 @@
 
 gpu.module @test {
 
-  // CHECK-LABEL: test_create_nd_tdesc
+  // CHECK-LABEL: create_nd_tdesc
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast
@@ -10,31 +10,31 @@ gpu.module @test {
   // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>,
   // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   // CHECK-SAME: to !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {__xegpu_blocking_tile_shape__ = array<i64: 8, 16>, __xegpu_blocking_unpack__}
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
+  gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     gpu.return %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
   }
 
   //-----
 
-  // CHECK-LABEL: test_create_nd_tdesc_1d
+  // CHECK-LABEL: create_nd_tdesc_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast
   // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
   // CHECK-SAME: to !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {__xegpu_blocking_tile_shape__ = array<i64: 16>, __xegpu_blocking_unpack__}
-  gpu.func @test_create_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
+  gpu.func @create_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
     gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
   }
 
   //-----
 
-  // CHECK-LABEL: test_update_nd_tdesc
+  // CHECK-LABEL: update_nd_tdesc
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK-COUNT-6: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf32>
-  gpu.func @test_update_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
+  gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     %update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     gpu.return %update : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
@@ -42,11 +42,11 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_update_nd_tdesc_1d
+  // CHECK-LABEL: update_nd_tdesc_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK-COUNT-2: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16xf32>
-  gpu.func @test_update_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
+  gpu.func @update_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
     %update = xegpu.update_nd_offset %tdesc, [32] : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
     gpu.return %update : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
@@ -54,11 +54,11 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_prefetch_nd_tdesc
+  // CHECK-LABEL: prefetch_nd_tdesc
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK-COUNT-6: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<8x16xf32>
-  gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     gpu.return
@@ -66,23 +66,23 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_prefetch_nd_tdesc_1d
+  // CHECK-LABEL: prefetch_nd_tdesc_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK-COUNT-4: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<16xf32>
-  gpu.func @test_prefetch_nd_tdesc_1d(%src: memref<64xf32>) {
+  gpu.func @prefetch_nd_tdesc_1d(%src: memref<64xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
     xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
     gpu.return
   }
 
   //-----
-  // CHECK-LABEL: test_load_nd
+  // CHECK-LABEL: load_nd
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK-COUNT-6: [[ld:%.+]] = xegpu.load_nd {{.*}}  : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
   // CHECK-COUNT-6: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<24x32xf32>
-  gpu.func @test_load_nd(%src: memref<24x32xf32>) -> vector<24x32xf32> {
+  gpu.func @load_nd(%src: memref<24x32xf32>) -> vector<24x32xf32> {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<24x32xf32>
     gpu.return %ld : vector<24x32xf32>
@@ -90,12 +90,12 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_load_nd_1d
+  // CHECK-LABEL: load_nd_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK-COUNT-4: [[ld:%.+]] = xegpu.load_nd {{.*}}  : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
   // CHECK-COUNT-4: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<16xf32> into vector<64xf32>
-  gpu.func @test_load_nd_1d(%src: memref<64xf32>) -> vector<64xf32> {
+  gpu.func @load_nd_1d(%src: memref<64xf32>) -> vector<64xf32> {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
     %data = xegpu.load_nd %tdesc: !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>> -> vector<64xf32>
     gpu.return %data : vector<64xf32>
@@ -103,11 +103,11 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_store_nd
+  // CHECK-LABEL: store_nd
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK-COUNT-6: xegpu.store_nd {{.*}}  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  gpu.func @test_store_nd(%src: memref<24x32xf32>) {
+  gpu.func @store_nd(%src: memref<24x32xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     %data = arith.constant dense<9.0> : vector<24x32xf32>
     xegpu.store_nd %data, %tdesc: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
@@ -116,11 +116,11 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_store_nd_1d
+  // CHECK-LABEL: store_nd_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK-COUNT-4: xegpu.store_nd {{.*}}  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  gpu.func @test_store_nd_1d(%src: memref<64xf32>) {
+  gpu.func @store_nd_1d(%src: memref<64xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
     %data = arith.constant dense<9.0> : vector<64xf32>
     xegpu.store_nd %data, %tdesc: vector<64xf32>, !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
@@ -129,7 +129,7 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_createNd_loadNd_storeNd
+  // CHECK-LABEL: createNd_loadNd_storeNd
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   //CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   //CHECK-COUNT-6: [[data:%.+]] = xegpu.load_nd {{.*}}  : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
@@ -137,7 +137,7 @@ gpu.module @test {
   //CHECK: [[add:%.+]] = arith.addf {{.*}} : vector<24x32xf32>
   //CHECK-COUNT-6: [[extract:%.+]] = vector.extract_strided_slice {{.*}} : vector<24x32xf32> to vector<8x16xf32>
   //CHECK-COUNT-6: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  gpu.func @test_createNd_loadNd_storeNd(%src: memref<24x32xf32>) {
+  gpu.func @createNd_loadNd_storeNd(%src: memref<24x32xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     %data = arith.constant dense<9.0> : vector<24x32xf32>
     %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<24x32xf32>
@@ -148,23 +148,23 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_dpas
+  // CHECK-LABEL: dpas
   // CHECK-SAME: [[arg0:%.+]]: vector<32x32xf16>, [[arg1:%.+]]: vector<32x32xf16>
   //CHECK-COUNT-8: [[extract1:%.+]] = vector.extract_strided_slice [[arg0]] {{.*}} : vector<32x32xf16> to vector<8x16xf16>
   //CHECK-COUNT-4: [[extract2:%.+]] = vector.extract_strided_slice [[arg1]] {{.*}} : vector<32x32xf16> to vector<16x16xf16>
   //CHECK-COUNT-16: [[dpas:%.+]] = xegpu.dpas {{.*}} -> vector<8x16xf32>
   //CHECK-COUNT-8: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
-  gpu.func @test_dpas(%a: vector<32x32xf16>, %b: vector<32x32xf16>) -> vector<32x32xf32> {
+  gpu.func @dpas(%a: vector<32x32xf16>, %b: vector<32x32xf16>) -> vector<32x32xf32> {
     %c = xegpu.dpas %a, %b : vector<32x32xf16>, vector<32x32xf16> -> vector<32x32xf32>
     gpu.return %c : vector<32x32xf32>
   }
 
 //-----
 
-  // CHECK-LABEL: test_create_tdesc_vec
+  // CHECK-LABEL: create_tdesc_vec
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  gpu.func @test_create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
+  gpu.func @create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
     %cst = arith.constant dense<[
     0,   8,  16,  24,  32,  40,  48,  56,
     64,  72,  80,  88,  96, 104, 112, 120,
@@ -177,10 +177,10 @@ gpu.module @test {
 
 //-----
 
-  // CHECK-LABEL: test_create_tdesc_step
+  // CHECK-LABEL: create_tdesc_step
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  gpu.func @test_create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
+  gpu.func @create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
     %step = arith.constant dense<8> : vector<32xindex>
     %seq = vector.step  : vector<32xindex>
     %cst = arith.muli %seq, %step : vector<32xindex>
@@ -190,11 +190,11 @@ gpu.module @test {
 
 //-----
 
-  // CHECK-LABEL: test_load
+  // CHECK-LABEL: load
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
   // CHECK-COUNT-2: xegpu.load  {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
-  gpu.func @test_load(%src: ui64) -> vector<32xf32> {
+  gpu.func @load(%src: ui64) -> vector<32xf32> {
     %cst = arith.constant dense<[
     0,   8,  16,  24,  32,  40,  48,  56,
     64,  72,  80,  88,  96, 104, 112, 120,
@@ -212,11 +212,11 @@ gpu.module @test {
 
 //-----
 
-  // CHECK-LABEL: test_prefetch
+  // CHECK-LABEL: prefetch
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
   // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  gpu.func @test_prefetch(%src: ui64)  {
+  gpu.func @prefetch(%src: ui64)  {
 
     %cst = arith.constant dense<[
     0,   8,  16,  24,  32,  40,  48,  56,
@@ -233,11 +233,11 @@ gpu.module @test {
 
 //-----
 
-  // CHECK-LABEL: test_store
+  // CHECK-LABEL: store
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
   // CHECK-COUNT-2: xegpu.store  {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
-  gpu.func @test_store(%src: ui64) {
+  gpu.func @store(%src: ui64) {
     %cst = arith.constant dense<[
     0,   8,  16,  24,  32,  40,  48,  56,
     64,  72,  80,  88,  96, 104, 112, 120,
@@ -256,47 +256,129 @@ gpu.module @test {
   }
 
 //-----
-
-  // CHECK-LABEL: test_prefetch_load_store_update
+  // CHECK-LABEL: create_tdesc_step_chunk
   // CHECK-SAME: [[arg0:%.+]]: ui64
-  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-   // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex>
-   // CHECK-COUNT-2: xegpu.load  {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
-  // CHECK-COUNT-2: xegpu.store  {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 4 : i64>>
+  gpu.func @create_tdesc_step_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>> {
+    %step = arith.constant dense<8> : vector<32xindex>
+    %seq = vector.step  : vector<32xindex>
+    %cst = arith.muli %seq, %step : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>>
+  }
 
-  gpu.func @test_prefetch_load_store_update(%src: ui64)  {
+//-----
+  // CHECK-LABEL: create_tdesc_step_chunk2
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  gpu.func @create_tdesc_step_chunk2(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> {
+    %step = arith.constant dense<8> : vector<32xindex>
+    %seq = vector.step  : vector<32xindex>
+    %cst = arith.muli %seq, %step : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+  }
 
+// CHECK-LABEL: create_tdesc_step_chunk3
+  // CHECK-SAME: [[arg0:%.+]]: ui64  
+  // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
+  // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
+  // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
+  // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+    gpu.func @create_tdesc_step_chunk3(%src: ui64) -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>> {
+    %step = arith.constant dense<8> : vector<16xindex>
+    %seq = vector.step  : vector<16xindex>
+    %cst = arith.muli %seq, %step : vector<16xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32,  #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<16x8xf32,  #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>>
+  }
+
+//-----
+  // CHECK-LABEL: load_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK-COUNT-4: xegpu.load  {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<2x16xf32>
+
+  gpu.func @load_chunk(%src: ui64) -> vector<4x32xf32> {
     %cst = arith.constant dense<[
-    0,   8,  16,  24,  32,  40,  48,  56,
-    64,  72,  80,  88,  96, 104, 112, 120,
-    128, 136, 144, 152, 160, 168, 176, 184,
-    192, 200, 208, 216, 224, 232, 240, 248 
+        0,   8,  16,  24,  32,  40,  48,  56,
+        64,  72,  80,  88,  96, 104, 112, 120,
+        128, 136, 144, 152, 160, 168, 176, 184,
+        192, 200, 208, 216, 224, 232, 240, 248 
     ]> : vector<32xindex>
-
-    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
-    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
-   
-    %delta = arith.constant dense<[
-    32,   32,  32,  32,  32,  32,  32,  32,
-    32,   32,  32,  32,  32,  32,  32,  64,
-    128, 128, 128, 128, 128, 128, 128, 128,
-    128, 128, 128, 128, 128, 128, 128, 256 
-    ]> : vector<32xindex>
-    %new_tdesc = xegpu.update_offset %tdesc, %delta
-              : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xindex>     
- 
+    
     %c17 = arith.constant 17: index
     %mask = vector.create_mask %c17: vector<32xi1>
 
-    %ld_vec = xegpu.load %new_tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> 
+    %ld = xegpu.load %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>: !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<4x32xf32>
+    
+    gpu.return %ld : vector<4x32xf32> 
+   }
 
-    %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32>
-    xegpu.store %st_vec, %tdesc, %mask: 
-                 vector<32xf32>, 
-                 !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, 
-                 vector<32xi1>
-  
+//-----
+  // CHECK-LABEL: store_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} :  ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK-COUNT-4: xegpu.store  {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x16xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
+  gpu.func @store_chunk(%src: ui64) {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+    
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+
+    %st_vec = arith.constant dense<1023.>: vector<4x32xf32>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+    xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>: vector<4x32xf32>, !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16,2]>>, vector<32xi1>
+    
     gpu.return
   }
+
+//-----
+  // CHECK-LABEL: prefetch_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  gpu.func @prefetch_chunk(%src: ui64)  {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248 
+      ]> : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+    
+    gpu.return
+  }
+
+//-----
+  // CHECK-LABEL: update_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
+  gpu.func @update_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+    %delta = arith.constant dense<32>: vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+
+    %new_tdesc = xegpu.update_offset %tdesc, %delta
+        : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xindex>
+
+    gpu.return %new_tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+  }  
 }
+
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 57aaecbd7962..4400d6d9625f 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -19,6 +19,10 @@ using namespace mlir::xegpu;
 
 namespace {
 
+#define DEBUG_TYPE "test-xegpu-unroll"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
 struct TestXeGPUUnrollingPatterns
     : public PassWrapper<TestXeGPUUnrollingPatterns,
                          OperationPass<gpu::GPUModuleOp>> {
@@ -48,7 +52,9 @@ struct TestXeGPUUnrollingPatterns
     options.setNativeShapeFn(
         [&](Operation *op) -> std::optional<SmallVector<int64_t>> {
           if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp,
-                  xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::StoreNdOp>(op)) {
+                  xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::StoreNdOp,
+                  xegpu::CreateDescOp, xegpu::UpdateOffsetOp, xegpu::PrefetchOp,
+                  xegpu::LoadGatherOp, xegpu::StoreScatterOp>(op)) {
             xegpu::TensorDescType tdescTy;
             if (auto createNdOp = dyn_cast<xegpu::CreateNdDescOp>(op)) {
               tdescTy = createNdOp.getType();
@@ -61,20 +67,7 @@ struct TestXeGPUUnrollingPatterns
               tdescTy = loadNdOp.getTensorDescType();
             } else if (auto storeNdOp = dyn_cast<xegpu::StoreNdOp>(op)) {
               tdescTy = storeNdOp.getTensorDescType();
-            }
-
-            if (auto layout = tdescTy.getLayoutAttr()) {
-              auto inst_data = layout.getInstData();
-              if (inst_data && layout.isSgLayout())
-                return SmallVector<int64_t>(inst_data.asArrayRef().begin(),
-                                            inst_data.asArrayRef().end());
-            }
-          }
-
-          if (isa<xegpu::CreateDescOp, xegpu::UpdateOffsetOp, xegpu::PrefetchOp,
-                  xegpu::LoadGatherOp, xegpu::StoreScatterOp>(op)) {
-            xegpu::TensorDescType tdescTy;
-            if (auto createOp = dyn_cast<xegpu::CreateDescOp>(op)) {
+            } else if (auto createOp = dyn_cast<xegpu::CreateDescOp>(op)) {
               tdescTy = createOp.getType();
             } else if (auto updateOp = dyn_cast<xegpu::UpdateOffsetOp>(op)) {
               tdescTy = updateOp.getTensorDescType();
@@ -111,14 +104,40 @@ struct TestXeGPUUnrollingPatterns
             Attribute encoding = tdescTy.getEncoding();
             auto layout = llvm::dyn_cast_if_present<xegpu::LayoutAttr>(
                 tdescTy.getLayout());
+
+            // If the encoding is a ScatterTensorDescAttr, we need to
+            // potentially adjust the chunk size based on the inst_data.
+            if (encoding && mlir::isa<xegpu::ScatterTensorDescAttr>(encoding)) {
+              auto scatterAttr =
+                  mlir::dyn_cast<xegpu::ScatterTensorDescAttr>(encoding);
+              int64_t chunkSize = scatterAttr.getChunkSize().getInt();
+
+              if (chunkSize > 1) {
+                int64_t blockedChunkSize = chunkSize;
+                auto instData = layout.getInstData();
+                if (!instData.empty())
+                  blockedChunkSize = instData.asArrayRef().back();
+
+                auto chunkSizeAttr = mlir::IntegerAttr::get(
+                    mlir::IntegerType::get(ctx, 64), blockedChunkSize);
+
+                // To create a new attribute with a different chunk_size:
+                auto newEncoding = xegpu::ScatterTensorDescAttr::get(
+                    ctx, scatterAttr.getMemorySpace(), chunkSizeAttr);
+
+                encoding = newEncoding;
+              }
+            }
             if (layout) {
               if (layout.getLaneLayout() == nullptr)
                 layout = xegpu::LayoutAttr();
               else
                 layout = layout.dropInstData();
             }
+
             newTy = xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
                                                layout);
+
           } else {
             newTy = type.clone(tileShape, elemTy);
           }

From fd7e46b864229a270726bd1026387740b9113094 Mon Sep 17 00:00:00 2001
From: Daniel Thornburgh <dthorn@google.com>
Date: Tue, 17 Jun 2025 15:50:42 -0700
Subject: [PATCH 0722/1322] Revert "[libc++] Remove trailing newline from
 _LIBCPP_ASSERTION_HANDLER calls" (#144615)

Reverts llvm/llvm-project#143573
---
 libcxx/include/__assert               | 4 ++--
 libcxx/src/verbose_abort.cpp          | 3 ---
 libcxx/test/support/check_assertion.h | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/libcxx/include/__assert b/libcxx/include/__assert
index 1bfed2890b79..90eaa6023587 100644
--- a/libcxx/include/__assert
+++ b/libcxx/include/__assert
@@ -20,8 +20,8 @@
 #define _LIBCPP_ASSERT(expression, message)                                                                            \
   (__builtin_expect(static_cast<bool>(expression), 1)                                                                  \
        ? (void)0                                                                                                       \
-       : _LIBCPP_ASSERTION_HANDLER(                                                                                    \
-             __FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(expression) " failed: " message))
+       : _LIBCPP_ASSERTION_HANDLER(__FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(            \
+             expression) " failed: " message "\n"))
 
 // WARNING: __builtin_assume can currently inhibit optimizations. Only add assumptions with a clear
 // optimization intent. See https://discourse.llvm.org/t/llvm-assume-blocks-optimization/71609 for a
diff --git a/libcxx/src/verbose_abort.cpp b/libcxx/src/verbose_abort.cpp
index efb7b9be6f61..94bdb451dee7 100644
--- a/libcxx/src/verbose_abort.cpp
+++ b/libcxx/src/verbose_abort.cpp
@@ -30,9 +30,6 @@ _LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) noexcept {
     va_list list;
     va_start(list, format);
     std::vfprintf(stderr, format, list);
-    // Callers of `__libcpp_verbose_abort` do not include a newline but when
-    // writing the message to stderr we need to include one.
-    std::fputc('\n', stderr);
     va_end(list);
   }
 
diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index ea04944ea932..a279400d651b 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -340,7 +340,7 @@ void std::__libcpp_verbose_abort(char const* format, ...) noexcept {
 
   std::fprintf(stderr, "%s\n", Marker);
   std::vfprintf(stderr, format, args);
-  std::fprintf(stderr, "\n%s", Marker);
+  std::fprintf(stderr, "%s", Marker);
 
   va_end(args);
 

From 1cd18bc894b97b282677c1d140688a27ebbec924 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 07:59:05 +0900
Subject: [PATCH 0723/1322] AMDGPU: Add cost model tests for
 minimumnum/maximumnum (#141904)

The f16 cases in particular look broken since every vector size
has the same reported cost.
---
 .../Analysis/CostModel/AMDGPU/maximumnum.ll   | 452 ++++++++++++++++++
 .../Analysis/CostModel/AMDGPU/minimumnum.ll   | 452 ++++++++++++++++++
 2 files changed, 904 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
 create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
new file mode 100644
index 000000000000..5b158e3d8d67
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
@@ -0,0 +1,452 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=ALL,GFX7 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=ALL,GFX12 %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=SIZE,GFX7-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=SIZE,GFX8-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=SIZE,GFX12-SIZE %s
+
+define void @maximumnum_f16() {
+; GFX7-LABEL: 'maximumnum_f16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'maximumnum_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_f16'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_f16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_f16'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+  %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+  %v3f16 = call <3x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+  %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+  %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+  %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+  ret void
+}
+
+define void @maximumnum_bf16() {
+; GFX7-LABEL: 'maximumnum_bf16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'maximumnum_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_bf16'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_bf16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_bf16'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+  %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+  %v3bf16 = call <3x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+  %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+  %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+  %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+  ret void
+}
+
+define void @maximumnum_f32() {
+; ALL-LABEL: 'maximumnum_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximumnum_f32'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+  %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+  %v3f32 = call <3x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+  %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+  %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+  %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+  ret void
+}
+
+define void @maximumnum_f64() {
+; ALL-LABEL: 'maximumnum_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximumnum_f64'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+  %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+  %v3f64 = call <3x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+  %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+  %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+  %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+  ret void
+}
+
+define void @maximumnum_f16_no_ieee() #0 {
+; GFX7-LABEL: 'maximumnum_f16_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'maximumnum_f16_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_f16_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_f16_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_f16_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_f16_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_f16_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_f16_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+  %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+  %v3f16 = call <3x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+  %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+  %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+  %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+  ret void
+}
+
+define void @maximumnum_bf16_no_ieee() #0 {
+; GFX7-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+  %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+  %v3bf16 = call <3x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+  %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+  %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+  %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+  ret void
+}
+
+define void @maximumnum_f32_no_ieee() #0 {
+; ALL-LABEL: 'maximumnum_f32_no_ieee'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+  %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+  %v3f32 = call <3x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+  %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+  %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+  %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+  ret void
+}
+
+define void @maximumnum_f64_no_ieee() #0 {
+; ALL-LABEL: 'maximumnum_f64_no_ieee'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+  %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+  %v3f64 = call <3x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+  %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+  %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+  %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+  ret void
+}
+
+attributes #0 = { "amdgpu-ieee"="false" }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
new file mode 100644
index 000000000000..97715cbab7d8
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
@@ -0,0 +1,452 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=ALL,GFX7 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=ALL,GFX12 %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=SIZE,GFX7-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=SIZE,GFX8-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=SIZE,GFX12-SIZE %s
+
+define void @minimumnum_f16() {
+; GFX7-LABEL: 'minimumnum_f16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'minimumnum_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_f16'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_f16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_f16'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+  %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+  %v3f16 = call <3x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+  %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+  %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+  %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+  ret void
+}
+
+define void @minimumnum_bf16() {
+; GFX7-LABEL: 'minimumnum_bf16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'minimumnum_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_bf16'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_bf16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_bf16'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+  %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+  %v3bf16 = call <3x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+  %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+  %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+  %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+  ret void
+}
+
+define void @minimumnum_f32() {
+; ALL-LABEL: 'minimumnum_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimumnum_f32'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+  %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+  %v3f32 = call <3x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+  %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+  %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+  %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+  ret void
+}
+
+define void @minimumnum_f64() {
+; ALL-LABEL: 'minimumnum_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimumnum_f64'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+  %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+  %v3f64 = call <3x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+  %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+  %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+  %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+  ret void
+}
+
+define void @minimumnum_f16_no_ieee() #0 {
+; GFX7-LABEL: 'minimumnum_f16_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'minimumnum_f16_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_f16_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_f16_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_f16_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_f16_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_f16_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_f16_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+  %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+  %v3f16 = call <3x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+  %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+  %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+  %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+  ret void
+}
+
+define void @minimumnum_bf16_no_ieee() #0 {
+; GFX7-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+  %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+  %v3bf16 = call <3x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+  %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+  %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+  %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+  ret void
+}
+
+define void @minimumnum_f32_no_ieee() #0 {
+; ALL-LABEL: 'minimumnum_f32_no_ieee'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+  %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+  %v3f32 = call <3x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+  %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+  %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+  %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+  ret void
+}
+
+define void @minimumnum_f64_no_ieee() #0 {
+; ALL-LABEL: 'minimumnum_f64_no_ieee'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+  %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+  %v3f64 = call <3x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+  %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+  %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+  %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+  ret void
+}
+
+attributes #0 = { "amdgpu-ieee"="false" }

From 87b13ada109643bbf5495727b0bf59a46bd533aa Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Tue, 17 Jun 2025 15:59:38 -0700
Subject: [PATCH 0724/1322] [HLSL][RootSignature] Implement serialization of
 remaining Root Elements (#143198)

Implements serialization of the remaining `RootElement`s, namely
`RootDescriptor`s and `StaticSampler`s.

- Adds unit testing for the serialization methods

Resolves https://github.com/llvm/llvm-project/issues/138191
Resolves https://github.com/llvm/llvm-project/issues/138193
---
 .../Frontend/HLSL/HLSLRootSignatureUtils.h    |   6 +
 .../Frontend/HLSL/HLSLRootSignatureUtils.cpp  | 142 ++++++++++++++++++
 .../Frontend/HLSLRootSignatureDumpTest.cpp    | 122 ++++++++++++++-
 3 files changed, 269 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
index 4d2cd183ebcb..25c2a9f0cc80 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
@@ -38,6 +38,12 @@ LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
 
 LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const DescriptorTable &Table);
 
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
+                                 const RootDescriptor &Descriptor);
+
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
+                                 const StaticSampler &StaticSampler);
+
 LLVM_ABI void dumpRootElements(raw_ostream &OS, ArrayRef<RootElement> Elements);
 
 class MetadataBuilder {
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
index 1e198b639cfd..a1ddb318055b 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
@@ -98,6 +98,109 @@ static raw_ostream &operator<<(raw_ostream &OS,
   return OS;
 }
 
+static const EnumEntry<SamplerFilter> SamplerFilterNames[] = {
+    {"MinMagMipPoint", SamplerFilter::MinMagMipPoint},
+    {"MinMagPointMipLinear", SamplerFilter::MinMagPointMipLinear},
+    {"MinPointMagLinearMipPoint", SamplerFilter::MinPointMagLinearMipPoint},
+    {"MinPointMagMipLinear", SamplerFilter::MinPointMagMipLinear},
+    {"MinLinearMagMipPoint", SamplerFilter::MinLinearMagMipPoint},
+    {"MinLinearMagPointMipLinear", SamplerFilter::MinLinearMagPointMipLinear},
+    {"MinMagLinearMipPoint", SamplerFilter::MinMagLinearMipPoint},
+    {"MinMagMipLinear", SamplerFilter::MinMagMipLinear},
+    {"Anisotropic", SamplerFilter::Anisotropic},
+    {"ComparisonMinMagMipPoint", SamplerFilter::ComparisonMinMagMipPoint},
+    {"ComparisonMinMagPointMipLinear",
+     SamplerFilter::ComparisonMinMagPointMipLinear},
+    {"ComparisonMinPointMagLinearMipPoint",
+     SamplerFilter::ComparisonMinPointMagLinearMipPoint},
+    {"ComparisonMinPointMagMipLinear",
+     SamplerFilter::ComparisonMinPointMagMipLinear},
+    {"ComparisonMinLinearMagMipPoint",
+     SamplerFilter::ComparisonMinLinearMagMipPoint},
+    {"ComparisonMinLinearMagPointMipLinear",
+     SamplerFilter::ComparisonMinLinearMagPointMipLinear},
+    {"ComparisonMinMagLinearMipPoint",
+     SamplerFilter::ComparisonMinMagLinearMipPoint},
+    {"ComparisonMinMagMipLinear", SamplerFilter::ComparisonMinMagMipLinear},
+    {"ComparisonAnisotropic", SamplerFilter::ComparisonAnisotropic},
+    {"MinimumMinMagMipPoint", SamplerFilter::MinimumMinMagMipPoint},
+    {"MinimumMinMagPointMipLinear", SamplerFilter::MinimumMinMagPointMipLinear},
+    {"MinimumMinPointMagLinearMipPoint",
+     SamplerFilter::MinimumMinPointMagLinearMipPoint},
+    {"MinimumMinPointMagMipLinear", SamplerFilter::MinimumMinPointMagMipLinear},
+    {"MinimumMinLinearMagMipPoint", SamplerFilter::MinimumMinLinearMagMipPoint},
+    {"MinimumMinLinearMagPointMipLinear",
+     SamplerFilter::MinimumMinLinearMagPointMipLinear},
+    {"MinimumMinMagLinearMipPoint", SamplerFilter::MinimumMinMagLinearMipPoint},
+    {"MinimumMinMagMipLinear", SamplerFilter::MinimumMinMagMipLinear},
+    {"MinimumAnisotropic", SamplerFilter::MinimumAnisotropic},
+    {"MaximumMinMagMipPoint", SamplerFilter::MaximumMinMagMipPoint},
+    {"MaximumMinMagPointMipLinear", SamplerFilter::MaximumMinMagPointMipLinear},
+    {"MaximumMinPointMagLinearMipPoint",
+     SamplerFilter::MaximumMinPointMagLinearMipPoint},
+    {"MaximumMinPointMagMipLinear", SamplerFilter::MaximumMinPointMagMipLinear},
+    {"MaximumMinLinearMagMipPoint", SamplerFilter::MaximumMinLinearMagMipPoint},
+    {"MaximumMinLinearMagPointMipLinear",
+     SamplerFilter::MaximumMinLinearMagPointMipLinear},
+    {"MaximumMinMagLinearMipPoint", SamplerFilter::MaximumMinMagLinearMipPoint},
+    {"MaximumMinMagMipLinear", SamplerFilter::MaximumMinMagMipLinear},
+    {"MaximumAnisotropic", SamplerFilter::MaximumAnisotropic},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const SamplerFilter &Filter) {
+  printEnum(OS, Filter, ArrayRef(SamplerFilterNames));
+
+  return OS;
+}
+
+static const EnumEntry<TextureAddressMode> TextureAddressModeNames[] = {
+    {"Wrap", TextureAddressMode::Wrap},
+    {"Mirror", TextureAddressMode::Mirror},
+    {"Clamp", TextureAddressMode::Clamp},
+    {"Border", TextureAddressMode::Border},
+    {"MirrorOnce", TextureAddressMode::MirrorOnce},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const TextureAddressMode &Address) {
+  printEnum(OS, Address, ArrayRef(TextureAddressModeNames));
+
+  return OS;
+}
+
+static const EnumEntry<ComparisonFunc> ComparisonFuncNames[] = {
+    {"Never", ComparisonFunc::Never},
+    {"Less", ComparisonFunc::Less},
+    {"Equal", ComparisonFunc::Equal},
+    {"LessEqual", ComparisonFunc::LessEqual},
+    {"Greater", ComparisonFunc::Greater},
+    {"NotEqual", ComparisonFunc::NotEqual},
+    {"GreaterEqual", ComparisonFunc::GreaterEqual},
+    {"Always", ComparisonFunc::Always},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const ComparisonFunc &CompFunc) {
+  printEnum(OS, CompFunc, ArrayRef(ComparisonFuncNames));
+
+  return OS;
+}
+
+static const EnumEntry<StaticBorderColor> StaticBorderColorNames[] = {
+    {"TransparentBlack", StaticBorderColor::TransparentBlack},
+    {"OpaqueBlack", StaticBorderColor::OpaqueBlack},
+    {"OpaqueWhite", StaticBorderColor::OpaqueWhite},
+    {"OpaqueBlackUint", StaticBorderColor::OpaqueBlackUint},
+    {"OpaqueWhiteUint", StaticBorderColor::OpaqueWhiteUint},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const StaticBorderColor &BorderColor) {
+  printEnum(OS, BorderColor, ArrayRef(StaticBorderColorNames));
+
+  return OS;
+}
+
 static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
     {"CBV", dxil::ResourceClass::CBuffer},
     {"SRV", dxil::ResourceClass::SRV},
@@ -112,6 +215,20 @@ static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
   return OS;
 }
 
+static const EnumEntry<RootDescriptorFlags> RootDescriptorFlagNames[] = {
+    {"DataVolatile", RootDescriptorFlags::DataVolatile},
+    {"DataStaticWhileSetAtExecute",
+     RootDescriptorFlags::DataStaticWhileSetAtExecute},
+    {"DataStatic", RootDescriptorFlags::DataStatic},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const RootDescriptorFlags &Flags) {
+  printFlags(OS, Flags, ArrayRef(RootDescriptorFlagNames));
+
+  return OS;
+}
+
 static const EnumEntry<DescriptorRangeFlags> DescriptorRangeFlagNames[] = {
     {"DescriptorsVolatile", DescriptorRangeFlags::DescriptorsVolatile},
     {"DataVolatile", DescriptorRangeFlags::DataVolatile},
@@ -182,6 +299,31 @@ raw_ostream &operator<<(raw_ostream &OS, const DescriptorTableClause &Clause) {
   return OS;
 }
 
+raw_ostream &operator<<(raw_ostream &OS, const RootDescriptor &Descriptor) {
+  ClauseType Type = ClauseType(llvm::to_underlying(Descriptor.Type));
+  OS << "Root" << Type << "(" << Descriptor.Reg
+     << ", space = " << Descriptor.Space
+     << ", visibility = " << Descriptor.Visibility
+     << ", flags = " << Descriptor.Flags << ")";
+
+  return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const StaticSampler &Sampler) {
+  OS << "StaticSampler(" << Sampler.Reg << ", filter = " << Sampler.Filter
+     << ", addressU = " << Sampler.AddressU
+     << ", addressV = " << Sampler.AddressV
+     << ", addressW = " << Sampler.AddressW
+     << ", mipLODBias = " << Sampler.MipLODBias
+     << ", maxAnisotropy = " << Sampler.MaxAnisotropy
+     << ", comparisonFunc = " << Sampler.CompFunc
+     << ", borderColor = " << Sampler.BorderColor
+     << ", minLOD = " << Sampler.MinLOD << ", maxLOD = " << Sampler.MaxLOD
+     << ", space = " << Sampler.Space << ", visibility = " << Sampler.Visibility
+     << ")";
+  return OS;
+}
+
 void dumpRootElements(raw_ostream &OS, ArrayRef<RootElement> Elements) {
   OS << "RootElements{";
   bool First = true;
diff --git a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
index 1a0c8e2a1639..1c37ee709e09 100644
--- a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
+++ b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
@@ -108,6 +108,127 @@ TEST(HLSLRootSignatureTest, DescriptorTableDump) {
   EXPECT_EQ(Out, Expected);
 }
 
+TEST(HLSLRootSignatureTest, RootCBVDump) {
+  RootDescriptor Descriptor;
+  Descriptor.Type = DescriptorType::CBuffer;
+  Descriptor.Reg = {RegisterType::BReg, 0};
+  Descriptor.setDefaultFlags();
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Descriptor;
+  OS.flush();
+
+  std::string Expected = "RootCBV(b0, space = 0, "
+                         "visibility = All, "
+                         "flags = DataStaticWhileSetAtExecute)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, RootSRVDump) {
+  RootDescriptor Descriptor;
+  Descriptor.Type = DescriptorType::SRV;
+  Descriptor.Reg = {RegisterType::TReg, 0};
+  Descriptor.Space = 42;
+  Descriptor.Visibility = ShaderVisibility::Geometry;
+  Descriptor.Flags = RootDescriptorFlags::None;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Descriptor;
+  OS.flush();
+
+  std::string Expected =
+      "RootSRV(t0, space = 42, visibility = Geometry, flags = None)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, RootUAVDump) {
+  RootDescriptor Descriptor;
+  Descriptor.Type = DescriptorType::UAV;
+  Descriptor.Reg = {RegisterType::UReg, 92374};
+  Descriptor.Space = 932847;
+  Descriptor.Visibility = ShaderVisibility::Hull;
+  Descriptor.Flags = RootDescriptorFlags::ValidFlags;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Descriptor;
+  OS.flush();
+
+  std::string Expected =
+      "RootUAV(u92374, space = 932847, visibility = Hull, flags = "
+      "DataVolatile | "
+      "DataStaticWhileSetAtExecute | "
+      "DataStatic)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, DefaultStaticSamplerDump) {
+  StaticSampler Sampler;
+  Sampler.Reg = {RegisterType::SReg, 0};
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Sampler;
+  OS.flush();
+
+  std::string Expected = "StaticSampler(s0, "
+                         "filter = Anisotropic, "
+                         "addressU = Wrap, "
+                         "addressV = Wrap, "
+                         "addressW = Wrap, "
+                         "mipLODBias = 0.000000e+00, "
+                         "maxAnisotropy = 16, "
+                         "comparisonFunc = LessEqual, "
+                         "borderColor = OpaqueWhite, "
+                         "minLOD = 0.000000e+00, "
+                         "maxLOD = 3.402823e+38, "
+                         "space = 0, "
+                         "visibility = All"
+                         ")";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, DefinedStaticSamplerDump) {
+  StaticSampler Sampler;
+  Sampler.Reg = {RegisterType::SReg, 0};
+
+  Sampler.Filter = SamplerFilter::ComparisonMinMagLinearMipPoint;
+  Sampler.AddressU = TextureAddressMode::Mirror;
+  Sampler.AddressV = TextureAddressMode::Border;
+  Sampler.AddressW = TextureAddressMode::Clamp;
+  Sampler.MipLODBias = 4.8f;
+  Sampler.MaxAnisotropy = 32;
+  Sampler.CompFunc = ComparisonFunc::NotEqual;
+  Sampler.BorderColor = StaticBorderColor::OpaqueBlack;
+  Sampler.MinLOD = 1.0f;
+  Sampler.MaxLOD = 32.0f;
+  Sampler.Space = 7;
+  Sampler.Visibility = ShaderVisibility::Domain;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Sampler;
+  OS.flush();
+
+  std::string Expected = "StaticSampler(s0, "
+                         "filter = ComparisonMinMagLinearMipPoint, "
+                         "addressU = Mirror, "
+                         "addressV = Border, "
+                         "addressW = Clamp, "
+                         "mipLODBias = 4.800000e+00, "
+                         "maxAnisotropy = 32, "
+                         "comparisonFunc = NotEqual, "
+                         "borderColor = OpaqueBlack, "
+                         "minLOD = 1.000000e+00, "
+                         "maxLOD = 3.200000e+01, "
+                         "space = 7, "
+                         "visibility = Domain"
+                         ")";
+  EXPECT_EQ(Out, Expected);
+}
+
 TEST(HLSLRootSignatureTest, DefaultRootConstantsDump) {
   RootConstants Constants;
   Constants.Num32BitConstants = 1;
@@ -173,7 +294,6 @@ TEST(HLSLRootSignatureTest, AllRootFlagsDump) {
                          "DenyMeshShaderRootAccess | "
                          "CBVSRVUAVHeapDirectlyIndexed | "
                          "SamplerHeapDirectlyIndexed)";
-
   EXPECT_EQ(Out, Expected);
 }
 

From cb63b75e32a415c9bfc298ed7fdcd67e8d9de54c Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn@google.com>
Date: Tue, 17 Jun 2025 16:01:40 -0700
Subject: [PATCH 0725/1322] Revert "[lldb-dap] Refactoring DebugCommunication
 to improve test consistency. (#143818)

This reverts commit 362b9d78b4ee9107da2b5e90b3764b0f0fa610fe.

Buildbots using python3.10 are running into errors from this change.
---
 .../test/tools/lldb-dap/dap_server.py         | 873 +++++++-----------
 .../test/tools/lldb-dap/lldbdap_testcase.py   |  79 +-
 .../breakpoint/TestDAP_setBreakpoints.py      |   5 +-
 .../tools/lldb-dap/cancel/TestDAP_cancel.py   |  10 +-
 .../tools/lldb-dap/launch/TestDAP_launch.py   |  12 +-
 .../tools/lldb-dap/module/TestDAP_module.py   |   2 +-
 .../tools/lldb-dap/output/TestDAP_output.py   |   4 +-
 7 files changed, 396 insertions(+), 589 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 23178a215206..6d32491eaa5e 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -10,124 +10,17 @@ import string
 import subprocess
 import signal
 import sys
-from dataclasses import dataclass
 import threading
 import time
-from typing import (
-    IO,
-    Any,
-    Callable,
-    Dict,
-    List,
-    Optional,
-    Tuple,
-    TypeVar,
-    Generic,
-    TypedDict,
-    Union,
-    BinaryIO,
-    TextIO,
-    Literal,
-    cast,
-)
+from typing import Any, Optional, Union, BinaryIO, TextIO
 
 ## DAP type references
-
-T = TypeVar("T")
-Te = TypeVar("Te")  # Generic type for event body
-Ta = TypeVar("Ta")  # Generic type for request arguments
-Tb = TypeVar("Tb")  # Generic type for response body
-
-
-class Event(Generic[Te], TypedDict):
-    type: Literal["event"]
-    seq: int
-    event: str
-    body: Optional[Te]
-
-
-class Request(Generic[Ta], TypedDict, total=False):
-    type: Literal["request"]
-    seq: int
-    command: str
-    arguments: Ta
-
-
-class Response(Generic[Tb], TypedDict):
-    type: Literal["response"]
-    seq: int
-    request_seq: int
-    success: bool
-    command: str
-    message: Optional[str]
-    body: Optional[Tb]
-
-
+Event = dict[str, Any]
+Request = dict[str, Any]
+Response = dict[str, Any]
 ProtocolMessage = Union[Event, Request, Response]
 
 
-class AttachOrLaunchArguments(TypedDict, total=False):
-    stopOnEntry: bool
-    disableASLR: bool
-    disableSTDIO: bool
-    enableAutoVariableSummaries: bool
-    displayExtendedBacktrace: bool
-    enableSyntheticChildDebugging: bool
-    initCommands: List[str]
-    preRunCommands: List[str]
-    postRunCommands: List[str]
-    stopCommands: List[str]
-    exitCommands: List[str]
-    terminateCommands: List[str]
-    sourceMap: Union[List[Tuple[str, str]], Dict[str, str]]
-    sourcePath: str
-    debuggerRoot: str
-    commandEscapePrefix: str
-    customFrameFormat: str
-    customThreadFormat: str
-
-
-class LaunchArguments(AttachOrLaunchArguments, total=False):
-    program: str
-    args: List[str]
-    cwd: str
-    env: Dict[str, str]
-    shellExpandArguments: bool
-    runInTerminal: bool
-    launchCommands: List[str]
-
-
-# Using the function form of TypedDict to allow for hyphenated keys.
-AttachGdbServer = TypedDict(
-    "AttachGdbServer", {"gdb-remote-port": int, "gdb-remote-hostname": str}, total=False
-)
-
-
-class AttachArguments(AttachGdbServer, AttachOrLaunchArguments, total=False):
-    program: str
-    pid: int
-    waitFor: bool
-    attachCommands: List[str]
-    coreFile: str
-
-
-class BreakpointData(TypedDict, total=False):
-    column: int
-    condition: str
-    hitCondition: str
-    logMessage: str
-    mode: str
-
-
-class SourceBreakpoint(BreakpointData):
-    line: int
-
-
-class Breakpoint(TypedDict, total=False):
-    id: int
-    verified: bool
-
-
 def dump_memory(base_addr, data, num_per_line, outfile):
     data_len = len(data)
     hex_string = binascii.hexlify(data)
@@ -165,9 +58,7 @@ def dump_memory(base_addr, data, num_per_line, outfile):
         outfile.write("\n")
 
 
-def read_packet(
-    f: IO[bytes], trace_file: Optional[IO[str]] = None
-) -> Optional[ProtocolMessage]:
+def read_packet(f, verbose=False, trace_file=None):
     """Decode a JSON packet that starts with the content length and is
     followed by the JSON bytes from a file 'f'. Returns None on EOF.
     """
@@ -179,20 +70,32 @@ def read_packet(
     prefix = "Content-Length: "
     if line.startswith(prefix):
         # Decode length of JSON bytes
+        if verbose:
+            print('content: "%s"' % (line))
         length = int(line[len(prefix) :])
+        if verbose:
+            print('length: "%u"' % (length))
         # Skip empty line
-        line = f.readline().decode()
+        line = f.readline()
+        if verbose:
+            print('empty: "%s"' % (line))
         # Read JSON bytes
         json_str = f.read(length)
+        if verbose:
+            print('json: "%s"' % (json_str))
         if trace_file:
-            trace_file.write(f"from adapter:\n{json_str!r}\n")
+            trace_file.write("from adapter:\n%s\n" % (json_str))
         # Decode the JSON bytes into a python dictionary
         return json.loads(json_str)
 
     raise Exception("unexpected malformed message from lldb-dap: " + line)
 
 
-def dump_dap_log(log_file: Optional[str]) -> None:
+def packet_type_is(packet, packet_type):
+    return "type" in packet and packet["type"] == packet_type
+
+
+def dump_dap_log(log_file):
     print("========= DEBUG ADAPTER PROTOCOL LOGS =========", file=sys.stderr)
     if log_file is None:
         print("no log file available", file=sys.stderr)
@@ -202,30 +105,34 @@ def dump_dap_log(log_file: Optional[str]) -> None:
     print("========= END =========", file=sys.stderr)
 
 
-@dataclass
-class Source:
-    path: Optional[str]
-    source_reference: Optional[int]
-
-    @property
-    def name(self) -> Optional[str]:
-        if not self.path:
-            return None
-        return os.path.basename(self.path)
-
+class Source(object):
     def __init__(
         self, path: Optional[str] = None, source_reference: Optional[int] = None
     ):
-        if path is None and source_reference is None:
+        self._name = None
+        self._path = None
+        self._source_reference = None
+
+        if path is not None:
+            self._name = os.path.basename(path)
+            self._path = path
+        elif source_reference is not None:
+            self._source_reference = source_reference
+        else:
             raise ValueError("Either path or source_reference must be provided")
 
-        self.path = path
-        self.source_reference = source_reference
+    def __str__(self):
+        return f"Source(name={self.name}, path={self.path}), source_reference={self.source_reference})"
 
-    def to_DAP(self) -> dict:
-        if self.path:
-            return {"path": self.path, "name": self.name}
-        return {"sourceReference": self.source_reference}
+    def as_dict(self):
+        source_dict = {}
+        if self._name is not None:
+            source_dict["name"] = self._name
+        if self._path is not None:
+            source_dict["path"] = self._path
+        if self._source_reference is not None:
+            source_dict["sourceReference"] = self._source_reference
+        return source_dict
 
 
 class NotSupportedError(KeyError):
@@ -237,7 +144,7 @@ class DebugCommunication(object):
         self,
         recv: BinaryIO,
         send: BinaryIO,
-        init_commands: List[str],
+        init_commands: list[str],
         log_file: Optional[TextIO] = None,
     ):
         # For debugging test failures, try setting `trace_file = sys.stderr`.
@@ -245,50 +152,35 @@ class DebugCommunication(object):
         self.log_file = log_file
         self.send = send
         self.recv = recv
-        # Packets that have been received and processed but have not yet been
-        # requested by a test case.
-        self._pending_packets: List[Optional[ProtocolMessage]] = []
-        # Received packets that have not yet been processed.
-        self._recv_packets: List[Optional[ProtocolMessage]] = []
-        # Used as a mutex for _recv_packets and for notify when _recv_packets
-        # changes.
-        self._recv_condition = threading.Condition()
-        self._recv_thread = threading.Thread(target=self._read_packet_thread)
-
-        # session state
-        self.init_commands = init_commands
+        self.recv_packets: list[Optional[ProtocolMessage]] = []
+        self.recv_condition = threading.Condition()
+        self.recv_thread = threading.Thread(target=self._read_packet_thread)
+        self.process_event_body = None
         self.exit_status: Optional[int] = None
-        self.capabilities: Optional[Dict] = None
-        self.initialized: bool = False
-        self.configuration_done_sent: bool = False
-        self.process_event_body: Optional[Dict] = None
-        self.terminated: bool = False
-        self.events: List[Event] = []
-        self.progress_events: List[Event] = []
-        self.reverse_requests: List[Request] = []
-        self.module_events: List[Dict] = []
-        self.sequence: int = 1
-        self.output: Dict[str, str] = {}
-
-        # debuggee state
-        self.threads: Optional[dict] = None
-        self.thread_stop_reasons: Dict[str, Any] = {}
-        self.frame_scopes: Dict[str, Any] = {}
-        # keyed by breakpoint id
-        self.resolved_breakpoints: Dict[str, bool] = {}
-
-        # trigger enqueue thread
-        self._recv_thread.start()
+        self.capabilities: dict[str, Any] = {}
+        self.progress_events: list[Event] = []
+        self.reverse_requests = []
+        self.sequence = 1
+        self.threads = None
+        self.thread_stop_reasons = {}
+        self.recv_thread.start()
+        self.output_condition = threading.Condition()
+        self.output: dict[str, list[str]] = {}
+        self.configuration_done_sent = False
+        self.initialized = False
+        self.frame_scopes = {}
+        self.init_commands = init_commands
+        self.resolved_breakpoints = {}
 
     @classmethod
     def encode_content(cls, s: str) -> bytes:
         return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8")
 
     @classmethod
-    def validate_response(cls, request: Request, response: Response) -> None:
-        if request["command"] != response["command"]:
+    def validate_response(cls, command, response):
+        if command["command"] != response["command"]:
             raise ValueError("command mismatch in response")
-        if request["seq"] != response["request_seq"]:
+        if command["seq"] != response["request_seq"]:
             raise ValueError("seq mismatch in response")
 
     def _read_packet_thread(self):
@@ -297,323 +189,262 @@ class DebugCommunication(object):
             while not done:
                 packet = read_packet(self.recv, trace_file=self.trace_file)
                 # `packet` will be `None` on EOF. We want to pass it down to
-                # handle_recv_packet anyway so the main thread can handle
-                # unexpected termination of lldb-dap and stop waiting for new
-                # packets.
+                # handle_recv_packet anyway so the main thread can handle unexpected
+                # termination of lldb-dap and stop waiting for new packets.
                 done = not self._handle_recv_packet(packet)
         finally:
             dump_dap_log(self.log_file)
 
-    def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool:
-        """Handles an incoming packet.
+    def get_modules(self):
+        module_list = self.request_modules()["body"]["modules"]
+        modules = {}
+        for module in module_list:
+            modules[module["name"]] = module
+        return modules
 
-        Called by the read thread that is waiting for all incoming packets
-        to store the incoming packet in "self._recv_packets" in a thread safe
-        way. This function will then signal the "self._recv_condition" to
-        indicate a new packet is available.
-
-        Args:
-            packet: A new packet to store.
-
-        Returns:
-            True if the caller should keep calling this function for more
-            packets.
-        """
-        with self._recv_condition:
-            self._recv_packets.append(packet)
-            self._recv_condition.notify()
-            # packet is None on EOF
-            return packet is not None and not (
-                packet["type"] == "response" and packet["command"] == "disconnect"
-            )
-
-    def _recv_packet(
-        self,
-        *,
-        predicate: Optional[Callable[[ProtocolMessage], bool]] = None,
-        timeout: Optional[float] = None,
-    ) -> Optional[ProtocolMessage]:
-        """Processes received packets from the adapter.
-
-        Updates the DebugCommunication stateful properties based on the received
-        packets in the order they are received.
-
-        NOTE: The only time the session state properties should be updated is
-        during this call to ensure consistency during tests.
-
-        Args:
-            predicate:
-                Optional, if specified, returns the first packet that matches
-                the given predicate.
-            timeout:
-                Optional, if specified, processes packets until either the
-                timeout occurs or the predicate matches a packet, whichever
-                occurs first.
-
-        Returns:
-            The first matching packet for the given predicate, if specified,
-            otherwise None.
-        """
-        assert (
-            threading.current_thread != self._recv_thread
-        ), "Must not be called from the _recv_thread"
-
-        def process_until_match():
-            self._process_recv_packets()
-            for i, packet in enumerate(self._pending_packets):
-                if packet is None:
-                    # We need to return a truthy value to break out of the
-                    # wait_for, use `EOFError` as an indicator of EOF.
-                    return EOFError()
-                if predicate and predicate(packet):
-                    self._pending_packets.pop(i)
-                    return packet
-
-        with self._recv_condition:
-            packet = self._recv_condition.wait_for(process_until_match, timeout)
-            return None if isinstance(packet, EOFError) else packet
-
-    def _process_recv_packets(self) -> None:
-        """Process received packets, updating the session state."""
-        with self._recv_condition:
-            for packet in self._recv_packets:
-                # Handle events that may modify any stateful properties of
-                # the DAP session.
-                if packet and packet["type"] == "event":
-                    self._handle_event(packet)
-                elif packet and packet["type"] == "request":
-                    # Handle reverse requests and keep processing.
-                    self._handle_reverse_request(packet)
-                # Move the packet to the pending queue.
-                self._pending_packets.append(packet)
-            self._recv_packets.clear()
-
-    def _handle_event(self, packet: Event) -> None:
-        """Handle any events that modify debug session state we track."""
-        event = packet["event"]
-        body: Optional[Dict] = packet.get("body", None)
-
-        if event == "output" and body:
-            # Store any output we receive so clients can retrieve it later.
-            category = body["category"]
-            output = body["output"]
+    def get_output(self, category, timeout=0.0, clear=True):
+        self.output_condition.acquire()
+        output = None
+        if category in self.output:
+            output = self.output[category]
+            if clear:
+                del self.output[category]
+        elif timeout != 0.0:
+            self.output_condition.wait(timeout)
             if category in self.output:
-                self.output[category] += output
-            else:
-                self.output[category] = output
-        elif event == "initialized":
-            self.initialized = True
-        elif event == "process":
-            # When a new process is attached or launched, remember the
-            # details that are available in the body of the event
-            self.process_event_body = body
-        elif event == "exited" and body:
-            # Process exited, mark the status to indicate the process is not
-            # alive.
-            self.exit_status = body["exitCode"]
-        elif event == "continued" and body:
-            # When the process continues, clear the known threads and
-            # thread_stop_reasons.
-            all_threads_continued = body.get("allThreadsContinued", True)
-            tid = body["threadId"]
-            if tid in self.thread_stop_reasons:
-                del self.thread_stop_reasons[tid]
-            self._process_continued(all_threads_continued)
-        elif event == "stopped" and body:
-            # Each thread that stops with a reason will send a
-            # 'stopped' event. We need to remember the thread stop
-            # reasons since the 'threads' command doesn't return
-            # that information.
-            self._process_stopped()
-            tid = body["threadId"]
-            self.thread_stop_reasons[tid] = body
-        elif event.startswith("progress"):
-            # Progress events come in as 'progressStart', 'progressUpdate',
-            # and 'progressEnd' events. Keep these around in case test
-            # cases want to verify them.
-            self.progress_events.append(packet)
-        elif event == "breakpoint" and body:
-            # Breakpoint events are sent when a breakpoint is resolved
-            self._update_verified_breakpoints([body["breakpoint"]])
-        elif event == "capabilities" and body:
-            if self.capabilities is None:
-                self.capabilities = {}
-            # Update the capabilities with new ones from the event.
-            self.capabilities.update(body["capabilities"])
+                output = self.output[category]
+                if clear:
+                    del self.output[category]
+        self.output_condition.release()
+        return output
 
-    def _handle_reverse_request(self, request: Request) -> None:
-        if request in self.reverse_requests:
-            return
-        self.reverse_requests.append(request)
-        arguments = request.get("arguments")
-        if request["command"] == "runInTerminal" and arguments is not None:
-            in_shell = arguments.get("argsCanBeInterpretedByShell", False)
-            proc = subprocess.Popen(
-                arguments["args"],
-                env=arguments.get("env", {}),
-                cwd=arguments["cwd"],
-                stdin=subprocess.DEVNULL,
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
-                shell=in_shell,
-            )
-            body = {}
-            if in_shell:
-                body["shellProcessId"] = proc.pid
-            else:
-                body["processId"] = proc.pid
-            self.send_packet(
-                {
-                    "type": "response",
-                    "seq": 0,
-                    "request_seq": request["seq"],
-                    "success": True,
-                    "command": "runInTerminal",
-                    "message": None,
-                    "body": body,
-                }
-            )
-        elif request["command"] == "startDebugging":
-            self.send_packet(
-                {
-                    "type": "response",
-                    "seq": 0,
-                    "request_seq": request["seq"],
-                    "success": True,
-                    "message": None,
-                    "command": "startDebugging",
-                    "body": {},
-                }
-            )
-        else:
-            desc = 'unknown reverse request "%s"' % (request["command"])
-            raise ValueError(desc)
+    def collect_output(self, category, timeout_secs, pattern, clear=True):
+        end_time = time.time() + timeout_secs
+        collected_output = ""
+        while end_time > time.time():
+            output = self.get_output(category, timeout=0.25, clear=clear)
+            if output:
+                collected_output += output
+                if pattern is not None and pattern in output:
+                    break
+        return collected_output if collected_output else None
+
+    def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]):
+        self.recv_condition.acquire()
+        self.recv_packets.append(packet)
+        self.recv_condition.notify()
+        self.recv_condition.release()
+
+    def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool:
+        """Called by the read thread that is waiting for all incoming packets
+        to store the incoming packet in "self.recv_packets" in a thread safe
+        way. This function will then signal the "self.recv_condition" to
+        indicate a new packet is available. Returns True if the caller
+        should keep calling this function for more packets.
+        """
+        # If EOF, notify the read thread by enqueuing a None.
+        if not packet:
+            self._enqueue_recv_packet(None)
+            return False
+
+        # Check the packet to see if is an event packet
+        keepGoing = True
+        packet_type = packet["type"]
+        if packet_type == "event":
+            event = packet["event"]
+            body = None
+            if "body" in packet:
+                body = packet["body"]
+            # Handle the event packet and cache information from these packets
+            # as they come in
+            if event == "output":
+                # Store any output we receive so clients can retrieve it later.
+                category = body["category"]
+                output = body["output"]
+                self.output_condition.acquire()
+                if category in self.output:
+                    self.output[category] += output
+                else:
+                    self.output[category] = output
+                self.output_condition.notify()
+                self.output_condition.release()
+                # no need to add 'output' event packets to our packets list
+                return keepGoing
+            elif event == "initialized":
+                self.initialized = True
+            elif event == "process":
+                # When a new process is attached or launched, remember the
+                # details that are available in the body of the event
+                self.process_event_body = body
+            elif event == "exited":
+                # Process exited, mark the status to indicate the process is not
+                # alive.
+                self.exit_status = body["exitCode"]
+            elif event == "continued":
+                # When the process continues, clear the known threads and
+                # thread_stop_reasons.
+                all_threads_continued = body.get("allThreadsContinued", True)
+                tid = body["threadId"]
+                if tid in self.thread_stop_reasons:
+                    del self.thread_stop_reasons[tid]
+                self._process_continued(all_threads_continued)
+            elif event == "stopped":
+                # Each thread that stops with a reason will send a
+                # 'stopped' event. We need to remember the thread stop
+                # reasons since the 'threads' command doesn't return
+                # that information.
+                self._process_stopped()
+                tid = body["threadId"]
+                self.thread_stop_reasons[tid] = body
+            elif event.startswith("progress"):
+                # Progress events come in as 'progressStart', 'progressUpdate',
+                # and 'progressEnd' events. Keep these around in case test
+                # cases want to verify them.
+                self.progress_events.append(packet)
+            elif event == "breakpoint":
+                # Breakpoint events are sent when a breakpoint is resolved
+                self._update_verified_breakpoints([body["breakpoint"]])
+            elif event == "capabilities":
+                # Update the capabilities with new ones from the event.
+                self.capabilities.update(body["capabilities"])
+
+        elif packet_type == "response":
+            if packet["command"] == "disconnect":
+                keepGoing = False
+        self._enqueue_recv_packet(packet)
+        return keepGoing
 
     def _process_continued(self, all_threads_continued: bool):
         self.frame_scopes = {}
         if all_threads_continued:
             self.thread_stop_reasons = {}
 
-    def _update_verified_breakpoints(self, breakpoints: list[Breakpoint]):
-        for bp in breakpoints:
-            # If no id is set, we cannot correlate the given breakpoint across
-            # requests, ignore it.
-            if "id" not in bp:
-                continue
+    def _update_verified_breakpoints(self, breakpoints: list[Event]):
+        for breakpoint in breakpoints:
+            if "id" in breakpoint:
+                self.resolved_breakpoints[str(breakpoint["id"])] = breakpoint.get(
+                    "verified", False
+                )
 
-            self.resolved_breakpoints[str(bp["id"])] = bp.get("verified", False)
-
-    def _send_recv(self, request: Request[Ta]) -> Optional[Response[Tb]]:
-        """Send a command python dictionary as JSON and receive the JSON
-        response. Validates that the response is the correct sequence and
-        command in the reply. Any events that are received are added to the
-        events list in this object"""
-        seq = self.send_packet(request)
-        response = self.receive_response(seq)
-        if response is None:
-            raise ValueError(f"no response for {request!r}")
-        self.validate_response(request, response)
-        return response
-
-    def send_packet(self, packet: ProtocolMessage) -> int:
+    def send_packet(self, command_dict: Request, set_sequence=True):
         """Take the "command_dict" python dictionary and encode it as a JSON
         string and send the contents as a packet to the VSCode debug
-        adapter.
-
-        Returns the seq of the packet."""
-        # Set the seq for requests.
-        if packet["type"] == "request":
-            packet["seq"] = self.sequence
+        adapter"""
+        # Set the sequence ID for this command automatically
+        if set_sequence:
+            command_dict["seq"] = self.sequence
             self.sequence += 1
-        else:
-            packet["seq"] = 0
-
         # Encode our command dictionary as a JSON string
-        json_str = json.dumps(packet, separators=(",", ":"))
-
+        json_str = json.dumps(command_dict, separators=(",", ":"))
         if self.trace_file:
             self.trace_file.write("to adapter:\n%s\n" % (json_str))
-
         length = len(json_str)
         if length > 0:
             # Send the encoded JSON packet and flush the 'send' file
             self.send.write(self.encode_content(json_str))
             self.send.flush()
 
-        return packet["seq"]
-
-    def receive_response(self, seq: int) -> Optional[Response]:
-        """Waits for the a response with the associated request_sec."""
-
-        def predicate(p: ProtocolMessage):
-            return p["type"] == "response" and p["request_seq"] == seq
-
-        return cast(Optional[Response], self._recv_packet(predicate=predicate))
-
-    def get_modules(self):
-        modules = {}
-        resp = self.request_modules()
-        if resp["success"]:
-            module_list = resp["body"]["modules"]
-            for module in module_list:
-                modules[module["name"]] = module
-        else:
-            raise ValueError(f"request_modules failed: {resp!r}")
-        return modules
-
-    def get_output(self, category: str, clear=True) -> str:
-        output = ""
-        if category in self.output:
-            output = self.output.get(category, "")
-            if clear:
-                del self.output[category]
-        return output
-
-    def collect_output(
+    def recv_packet(
         self,
-        category: str,
-        timeout_secs: float,
-        pattern: Optional[str] = None,
-        clear=True,
-    ) -> str:
-        """Collect output from 'output' events.
+        filter_type: Optional[str] = None,
+        filter_event: Optional[Union[str, list[str]]] = None,
+        timeout: Optional[float] = None,
+    ) -> Optional[ProtocolMessage]:
+        """Get a JSON packet from the VSCode debug adapter. This function
+        assumes a thread that reads packets is running and will deliver
+        any received packets by calling handle_recv_packet(...). This
+        function will wait for the packet to arrive and return it when
+        it does."""
+        while True:
+            try:
+                self.recv_condition.acquire()
+                packet = None
+                while True:
+                    for i, curr_packet in enumerate(self.recv_packets):
+                        if not curr_packet:
+                            raise EOFError
+                        packet_type = curr_packet["type"]
+                        if filter_type is None or packet_type in filter_type:
+                            if filter_event is None or (
+                                packet_type == "event"
+                                and curr_packet["event"] in filter_event
+                            ):
+                                packet = self.recv_packets.pop(i)
+                                break
+                    if packet:
+                        break
+                    # Sleep until packet is received
+                    len_before = len(self.recv_packets)
+                    self.recv_condition.wait(timeout)
+                    len_after = len(self.recv_packets)
+                    if len_before == len_after:
+                        return None  # Timed out
+                return packet
+            except EOFError:
+                return None
+            finally:
+                self.recv_condition.release()
 
-        Args:
-            category: The category to collect.
-            timeout_secs: The max duration for collecting output.
-            pattern:
-                Optional, if set, return once this pattern is detected in the
-                collected output.
+    def send_recv(self, command):
+        """Send a command python dictionary as JSON and receive the JSON
+        response. Validates that the response is the correct sequence and
+        command in the reply. Any events that are received are added to the
+        events list in this object"""
+        self.send_packet(command)
+        done = False
+        while not done:
+            response_or_request = self.recv_packet(filter_type=["response", "request"])
+            if response_or_request is None:
+                desc = 'no response for "%s"' % (command["command"])
+                raise ValueError(desc)
+            if response_or_request["type"] == "response":
+                self.validate_response(command, response_or_request)
+                return response_or_request
+            else:
+                self.reverse_requests.append(response_or_request)
+                if response_or_request["command"] == "runInTerminal":
+                    subprocess.Popen(
+                        response_or_request["arguments"]["args"],
+                        env=response_or_request["arguments"]["env"],
+                    )
+                    self.send_packet(
+                        {
+                            "type": "response",
+                            "request_seq": response_or_request["seq"],
+                            "success": True,
+                            "command": "runInTerminal",
+                            "body": {},
+                        },
+                    )
+                elif response_or_request["command"] == "startDebugging":
+                    self.send_packet(
+                        {
+                            "type": "response",
+                            "request_seq": response_or_request["seq"],
+                            "success": True,
+                            "command": "startDebugging",
+                            "body": {},
+                        },
+                    )
+                else:
+                    desc = 'unknown reverse request "%s"' % (
+                        response_or_request["command"]
+                    )
+                    raise ValueError(desc)
 
-        Returns:
-            The collected output.
-        """
-        deadline = time.monotonic() + timeout_secs
-        output = self.get_output(category, clear)
-        while deadline >= time.monotonic() and (
-            pattern is None or pattern not in output
-        ):
-            event = self.wait_for_event(["output"], timeout=deadline - time.monotonic())
-            if not event:  # Timeout or EOF
-                break
-            output += self.get_output(category, clear=clear)
-        return output
+        return None
 
     def wait_for_event(
-        self, filter: List[str] = [], timeout: Optional[float] = None
+        self, filter: Union[str, list[str]], timeout: Optional[float] = None
     ) -> Optional[Event]:
         """Wait for the first event that matches the filter."""
-
-        def predicate(p: ProtocolMessage):
-            return p["type"] == "event" and p["event"] in filter
-
-        return cast(
-            Optional[Event], self._recv_packet(predicate=predicate, timeout=timeout)
+        return self.recv_packet(
+            filter_type="event", filter_event=filter, timeout=timeout
         )
 
     def wait_for_stopped(
         self, timeout: Optional[float] = None
-    ) -> Optional[List[Event]]:
+    ) -> Optional[list[Event]]:
         stopped_events = []
         stopped_event = self.wait_for_event(
             filter=["stopped", "exited"], timeout=timeout
@@ -632,9 +463,9 @@ class DebugCommunication(object):
         return stopped_events
 
     def wait_for_breakpoint_events(self, timeout: Optional[float] = None):
-        breakpoint_events: List[Event] = []
+        breakpoint_events: list[Event] = []
         while True:
-            event = self.wait_for_event(["breakpoint"], timeout=timeout)
+            event = self.wait_for_event("breakpoint", timeout=timeout)
             if not event:
                 break
             breakpoint_events.append(event)
@@ -645,26 +476,20 @@ class DebugCommunication(object):
     ):
         """Wait for all breakpoints to be verified. Return all unverified breakpoints."""
         while any(id not in self.resolved_breakpoints for id in breakpoint_ids):
-            breakpoint_event = self.wait_for_event(["breakpoint"], timeout=timeout)
+            breakpoint_event = self.wait_for_event("breakpoint", timeout=timeout)
             if breakpoint_event is None:
                 break
 
-        return [
-            id
-            for id in breakpoint_ids
-            if id not in self.resolved_breakpoints and not self.resolved_breakpoints[id]
-        ]
+        return [id for id in breakpoint_ids if id not in self.resolved_breakpoints]
 
     def wait_for_exited(self, timeout: Optional[float] = None):
-        event_dict = self.wait_for_event(["exited"], timeout=timeout)
+        event_dict = self.wait_for_event("exited", timeout=timeout)
         if event_dict is None:
             raise ValueError("didn't get exited event")
         return event_dict
 
     def wait_for_terminated(self, timeout: Optional[float] = None):
-        if self.terminated:
-            raise ValueError("already terminated")
-        event_dict = self.wait_for_event(["terminated"], timeout)
+        event_dict = self.wait_for_event("terminated", timeout)
         if event_dict is None:
             raise ValueError("didn't get terminated event")
         return event_dict
@@ -699,10 +524,12 @@ class DebugCommunication(object):
         if threadId is None:
             threadId = self.get_thread_id()
         if threadId is None:
+            print("invalid threadId")
             return None
         response = self.request_stackTrace(threadId, startFrame=frameIndex, levels=1)
         if response:
             return response["body"]["stackFrames"][0]
+        print("invalid response")
         return None
 
     def get_completions(self, text, frameId=None):
@@ -840,7 +667,7 @@ class DebugCommunication(object):
         gdbRemotePort: Optional[int] = None,
         gdbRemoteHostname: Optional[str] = None,
     ):
-        args_dict: AttachArguments = {}
+        args_dict = {}
         if pid is not None:
             args_dict["pid"] = pid
         if program is not None:
@@ -872,12 +699,8 @@ class DebugCommunication(object):
             args_dict["gdb-remote-port"] = gdbRemotePort
         if gdbRemoteHostname is not None:
             args_dict["gdb-remote-hostname"] = gdbRemoteHostname
-        command_dict: Request = {
-            "command": "attach",
-            "type": "request",
-            "arguments": args_dict,
-        }
-        return self._send_recv(command_dict)
+        command_dict = {"command": "attach", "type": "request", "arguments": args_dict}
+        return self.send_recv(command_dict)
 
     def request_breakpointLocations(
         self, file_path, line, end_line=None, column=None, end_column=None
@@ -899,7 +722,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_configurationDone(self):
         command_dict = {
@@ -907,7 +730,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": {},
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if response:
             self.configuration_done_sent = True
             self.request_threads()
@@ -936,7 +759,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if response["success"]:
             self._process_continued(response["body"]["allThreadsContinued"])
         # Caller must still call wait_for_stopped.
@@ -953,7 +776,7 @@ class DebugCommunication(object):
         if restartArguments:
             command_dict["arguments"] = restartArguments
 
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         # Caller must still call wait_for_stopped.
         return response
 
@@ -969,7 +792,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_disassemble(
         self,
@@ -989,7 +812,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)["body"]["instructions"]
+        return self.send_recv(command_dict)["body"]["instructions"]
 
     def request_readMemory(self, memoryReference, offset, count):
         args_dict = {
@@ -1002,7 +825,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_evaluate(self, expression, frameIndex=0, threadId=None, context=None):
         stackFrame = self.get_stackFrame(frameIndex=frameIndex, threadId=threadId)
@@ -1018,7 +841,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_exceptionInfo(self, threadId=None):
         if threadId is None:
@@ -1029,7 +852,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_initialize(self, sourceInitFile=False):
         command_dict = {
@@ -1050,7 +873,7 @@ class DebugCommunication(object):
                 "$__lldb_sourceInitFile": sourceInitFile,
             },
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if response:
             if "body" in response:
                 self.capabilities = response["body"]
@@ -1085,7 +908,7 @@ class DebugCommunication(object):
         customFrameFormat: Optional[str] = None,
         customThreadFormat: Optional[str] = None,
     ):
-        args_dict: LaunchArguments = {"program": program}
+        args_dict = {"program": program}
         if args:
             args_dict["args"] = args
         if cwd:
@@ -1132,19 +955,15 @@ class DebugCommunication(object):
         args_dict["displayExtendedBacktrace"] = displayExtendedBacktrace
         if commandEscapePrefix is not None:
             args_dict["commandEscapePrefix"] = commandEscapePrefix
-        command_dict: Request = {
-            "command": "launch",
-            "type": "request",
-            "arguments": args_dict,
-        }
-        return self._send_recv(command_dict)
+        command_dict = {"command": "launch", "type": "request", "arguments": args_dict}
+        return self.send_recv(command_dict)
 
     def request_next(self, threadId, granularity="statement"):
         if self.exit_status is not None:
             raise ValueError("request_continue called after process exited")
         args_dict = {"threadId": threadId, "granularity": granularity}
         command_dict = {"command": "next", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_stepIn(self, threadId, targetId, granularity="statement"):
         if self.exit_status is not None:
@@ -1157,7 +976,7 @@ class DebugCommunication(object):
             "granularity": granularity,
         }
         command_dict = {"command": "stepIn", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_stepInTargets(self, frameId):
         if self.exit_status is not None:
@@ -1169,14 +988,14 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_stepOut(self, threadId):
         if self.exit_status is not None:
             raise ValueError("request_stepOut called after process exited")
         args_dict = {"threadId": threadId}
         command_dict = {"command": "stepOut", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_pause(self, threadId=None):
         if self.exit_status is not None:
@@ -1185,47 +1004,49 @@ class DebugCommunication(object):
             threadId = self.get_thread_id()
         args_dict = {"threadId": threadId}
         command_dict = {"command": "pause", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_scopes(self, frameId):
         args_dict = {"frameId": frameId}
         command_dict = {"command": "scopes", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
-    def request_setBreakpoints(
-        self,
-        source: Union[Source, str],
-        line_array: Optional[List[int]],
-        data: Optional[List[BreakpointData]] = None,
-    ):
+    def request_setBreakpoints(self, source: Source, line_array, data=None):
         """data is array of parameters for breakpoints in line_array.
         Each parameter object is 1:1 mapping with entries in line_entry.
         It contains optional location/hitCondition/logMessage parameters.
         """
-        if isinstance(source, str):
-            source = Source(path=source)
         args_dict = {
-            "source": source.to_DAP(),
+            "source": source.as_dict(),
             "sourceModified": False,
         }
-        if line_array:
+        if line_array is not None:
             args_dict["lines"] = line_array
             breakpoints = []
             for i, line in enumerate(line_array):
-                breakpoint_data: BreakpointData = {}
+                breakpoint_data = None
                 if data is not None and i < len(data):
                     breakpoint_data = data[i]
-                bp: SourceBreakpoint = {"line": line, **breakpoint_data}
+                bp = {"line": line}
+                if breakpoint_data is not None:
+                    if breakpoint_data.get("condition"):
+                        bp["condition"] = breakpoint_data["condition"]
+                    if breakpoint_data.get("hitCondition"):
+                        bp["hitCondition"] = breakpoint_data["hitCondition"]
+                    if breakpoint_data.get("logMessage"):
+                        bp["logMessage"] = breakpoint_data["logMessage"]
+                    if breakpoint_data.get("column"):
+                        bp["column"] = breakpoint_data["column"]
                 breakpoints.append(bp)
             args_dict["breakpoints"] = breakpoints
 
-        command_dict: Request = {
+        command_dict = {
             "command": "setBreakpoints",
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
-        if response and response["success"] and response["body"]:
+        response = self.send_recv(command_dict)
+        if response["success"]:
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
 
@@ -1240,7 +1061,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_setFunctionBreakpoints(self, names, condition=None, hitCondition=None):
         breakpoints = []
@@ -1257,7 +1078,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if response["success"]:
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
@@ -1278,7 +1099,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_setDataBreakpoint(self, dataBreakpoints):
         """dataBreakpoints is a list of dictionary with following fields:
@@ -1295,7 +1116,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_compileUnits(self, moduleId):
         args_dict = {"moduleId": moduleId}
@@ -1304,7 +1125,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         return response
 
     def request_completions(self, text, frameId=None):
@@ -1316,10 +1137,10 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_modules(self):
-        return self._send_recv({"command": "modules", "type": "request"})
+        return self.send_recv({"command": "modules", "type": "request"})
 
     def request_stackTrace(
         self, threadId=None, startFrame=None, levels=None, format=None, dump=False
@@ -1338,7 +1159,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if dump:
             for idx, frame in enumerate(response["body"]["stackFrames"]):
                 name = frame["name"]
@@ -1364,7 +1185,7 @@ class DebugCommunication(object):
                 "sourceReference": sourceReference,
             },
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_threads(self):
         """Request a list of all threads and combine any information from any
@@ -1372,7 +1193,7 @@ class DebugCommunication(object):
         thread actually stopped. Returns an array of thread dictionaries
         with information about all threads"""
         command_dict = {"command": "threads", "type": "request", "arguments": {}}
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if not response["success"]:
             self.threads = None
             return response
@@ -1412,7 +1233,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_setVariable(self, containingVarRef, name, value, id=None):
         args_dict = {
@@ -1427,7 +1248,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_locations(self, locationReference):
         args_dict = {
@@ -1438,7 +1259,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_testGetTargetBreakpoints(self):
         """A request packet used in the LLDB test suite to get all currently
@@ -1450,12 +1271,12 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": {},
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def terminate(self):
         self.send.close()
-        if self._recv_thread.is_alive():
-            self._recv_thread.join()
+        if self.recv_thread.is_alive():
+            self.recv_thread.join()
 
     def request_setInstructionBreakpoints(self, memory_reference=[]):
         breakpoints = []
@@ -1470,7 +1291,7 @@ class DebugCommunication(object):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
 
 class DebugAdapterServer(DebugCommunication):
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index 8778b51e7c36..3b54d598c350 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -1,6 +1,6 @@
 import os
 import time
-from typing import Optional, Callable
+from typing import Optional
 import uuid
 
 import dap_server
@@ -121,19 +121,11 @@ class DAPTestCaseBase(TestBase):
             f"Expected to resolve all breakpoints. Unresolved breakpoint ids: {unresolved_breakpoints}",
         )
 
-    def wait_until(
-        self,
-        predicate: Callable[[], bool],
-        delay: float = 0.5,
-        timeout: float = DEFAULT_TIMEOUT,
-    ) -> bool:
-        """Repeatedly run the predicate until either the predicate returns True
-        or a timeout has occurred."""
-        deadline = time.monotonic() + timeout
-        while deadline > time.monotonic():
-            if predicate():
+    def waitUntil(self, condition_callback):
+        for _ in range(20):
+            if condition_callback():
                 return True
-            time.sleep(delay)
+            time.sleep(0.5)
         return False
 
     def assertCapabilityIsSet(self, key: str, msg: Optional[str] = None) -> None:
@@ -152,7 +144,6 @@ class DAPTestCaseBase(TestBase):
         "breakpoint_ids" should be a list of breakpoint ID strings
         (["1", "2"]). The return value from self.set_source_breakpoints()
         or self.set_function_breakpoints() can be passed to this function"""
-        breakpoint_ids = [str(i) for i in breakpoint_ids]
         stopped_events = self.dap_server.wait_for_stopped(timeout)
         for stopped_event in stopped_events:
             if "body" in stopped_event:
@@ -164,16 +155,22 @@ class DAPTestCaseBase(TestBase):
                     and body["reason"] != "instruction breakpoint"
                 ):
                     continue
-                if "hitBreakpointIds" not in body:
+                if "description" not in body:
                     continue
-                hit_breakpoint_ids = body["hitBreakpointIds"]
-                for bp in hit_breakpoint_ids:
-                    if str(bp) in breakpoint_ids:
+                # Descriptions for breakpoints will be in the form
+                # "breakpoint 1.1", so look for any description that matches
+                # ("breakpoint 1.") in the description field as verification
+                # that one of the breakpoint locations was hit. DAP doesn't
+                # allow breakpoints to have multiple locations, but LLDB does.
+                # So when looking at the description we just want to make sure
+                # the right breakpoint matches and not worry about the actual
+                # location.
+                description = body["description"]
+                for breakpoint_id in breakpoint_ids:
+                    match_desc = f"breakpoint {breakpoint_id}."
+                    if match_desc in description:
                         return
-        self.assertTrue(
-            False,
-            f"breakpoint not hit, wanted breakpoint_ids={breakpoint_ids} stopped_events={stopped_events}",
-        )
+        self.assertTrue(False, f"breakpoint not hit, stopped_events={stopped_events}")
 
     def verify_stop_exception_info(self, expected_description, timeout=DEFAULT_TIMEOUT):
         """Wait for the process we are debugging to stop, and verify the stop
@@ -208,9 +205,7 @@ class DAPTestCaseBase(TestBase):
                     found = True
                     break
             self.assertTrue(
-                found,
-                "verify '%s' found in console output for '%s' in %s"
-                % (cmd, flavor, output),
+                found, "verify '%s' found in console output for '%s'" % (cmd, flavor)
             )
 
     def get_dict_value(self, d, key_path):
@@ -282,30 +277,26 @@ class DAPTestCaseBase(TestBase):
                         return (source["path"], stackFrame["line"])
         return ("", 0)
 
-    def get_stdout(self):
-        return self.dap_server.get_output("stdout")
+    def get_stdout(self, timeout=0.0):
+        return self.dap_server.get_output("stdout", timeout=timeout)
 
-    def get_console(self):
-        return self.dap_server.get_output("console")
+    def get_console(self, timeout=0.0):
+        return self.dap_server.get_output("console", timeout=timeout)
 
-    def get_important(self):
-        return self.dap_server.get_output("important")
+    def get_important(self, timeout=0.0):
+        return self.dap_server.get_output("important", timeout=timeout)
 
-    def collect_stdout(self, timeout_secs: float, pattern: Optional[str] = None) -> str:
+    def collect_stdout(self, timeout_secs, pattern=None):
         return self.dap_server.collect_output(
             "stdout", timeout_secs=timeout_secs, pattern=pattern
         )
 
-    def collect_console(
-        self, timeout_secs: float, pattern: Optional[str] = None
-    ) -> str:
+    def collect_console(self, timeout_secs, pattern=None):
         return self.dap_server.collect_output(
             "console", timeout_secs=timeout_secs, pattern=pattern
         )
 
-    def collect_important(
-        self, timeout_secs: float, pattern: Optional[str] = None
-    ) -> str:
+    def collect_important(self, timeout_secs, pattern=None):
         return self.dap_server.collect_output(
             "important", timeout_secs=timeout_secs, pattern=pattern
         )
@@ -364,7 +355,7 @@ class DAPTestCaseBase(TestBase):
             return self.dap_server.wait_for_stopped(timeout)
         return None
 
-    def do_continue(self) -> None:  # `continue` is a keyword.
+    def do_continue(self):  # `continue` is a keyword.
         resp = self.dap_server.request_continue()
         self.assertTrue(resp["success"], f"continue request failed: {resp}")
 
@@ -372,14 +363,10 @@ class DAPTestCaseBase(TestBase):
         self.do_continue()
         return self.dap_server.wait_for_stopped(timeout)
 
-    def continue_to_breakpoint(
-        self, breakpoint_id: int, timeout: Optional[float] = DEFAULT_TIMEOUT
-    ) -> None:
-        self.continue_to_breakpoints([breakpoint_id], timeout)
+    def continue_to_breakpoint(self, breakpoint_id: str, timeout=DEFAULT_TIMEOUT):
+        self.continue_to_breakpoints((breakpoint_id), timeout)
 
-    def continue_to_breakpoints(
-        self, breakpoint_ids: list[int], timeout: Optional[float] = DEFAULT_TIMEOUT
-    ) -> None:
+    def continue_to_breakpoints(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
         self.do_continue()
         self.verify_breakpoint_hit(breakpoint_ids, timeout)
 
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
index a6eeee3a0254..831edd6494c1 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
@@ -78,7 +78,7 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
         self.assertFalse(breakpoint["verified"])
         self.assertEqual(other_basename, breakpoint["source"]["name"])
         self.assertEqual(new_other_path, breakpoint["source"]["path"])
-        other_breakpoint_id = str(breakpoint["id"])
+        other_breakpoint_id = breakpoint["id"]
 
         self.dap_server.request_continue()
         self.verify_breakpoint_hit([other_breakpoint_id])
@@ -379,8 +379,7 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
             self.assertEqual(breakpoint["line"], loop_line)
             self.assertEqual(breakpoint["column"], columns[index])
             self.assertTrue(breakpoint["verified"], "expect breakpoint verified")
-            self.assertIn("id", breakpoint, "expected breakpoint id")
-            breakpoint_ids.append(str(breakpoint["id"]))
+            breakpoint_ids.append(breakpoint["id"])
 
         # Continue to the first breakpoint,
         self.continue_to_breakpoints([breakpoint_ids[0]])
diff --git a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
index c750cff071a8..824ed8fe3bb9 100644
--- a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
+++ b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
@@ -54,18 +54,18 @@ class TestDAP_cancel(lldbdap_testcase.DAPTestCaseBase):
         pending_seq = self.async_blocking_request(duration=self.DEFAULT_TIMEOUT / 2)
         cancel_seq = self.async_cancel(requestId=pending_seq)
 
-        blocking_resp = self.dap_server.receive_response(blocking_seq)
+        blocking_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(blocking_resp["request_seq"], blocking_seq)
         self.assertEqual(blocking_resp["command"], "evaluate")
         self.assertEqual(blocking_resp["success"], True)
 
-        pending_resp = self.dap_server.receive_response(pending_seq)
+        pending_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(pending_resp["request_seq"], pending_seq)
         self.assertEqual(pending_resp["command"], "evaluate")
         self.assertEqual(pending_resp["success"], False)
         self.assertEqual(pending_resp["message"], "cancelled")
 
-        cancel_resp = self.dap_server.receive_response(cancel_seq)
+        cancel_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(cancel_resp["request_seq"], cancel_seq)
         self.assertEqual(cancel_resp["command"], "cancel")
         self.assertEqual(cancel_resp["success"], True)
@@ -86,13 +86,13 @@ class TestDAP_cancel(lldbdap_testcase.DAPTestCaseBase):
         )
         cancel_seq = self.async_cancel(requestId=blocking_seq)
 
-        blocking_resp = self.dap_server.receive_response(blocking_seq)
+        blocking_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(blocking_resp["request_seq"], blocking_seq)
         self.assertEqual(blocking_resp["command"], "evaluate")
         self.assertEqual(blocking_resp["success"], False)
         self.assertEqual(blocking_resp["message"], "cancelled")
 
-        cancel_resp = self.dap_server.receive_response(cancel_seq)
+        cancel_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(cancel_resp["request_seq"], cancel_seq)
         self.assertEqual(cancel_resp["command"], "cancel")
         self.assertEqual(cancel_resp["success"], True)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
index c29e0d3fa7b8..ae8142ae4f48 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
@@ -191,7 +191,7 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
         self.continue_to_exit()
         # Now get the STDOUT and verify our program argument is correct
         output = self.get_stdout()
-        self.assertEqual(output, "", "expect no program output")
+        self.assertEqual(output, None, "expect no program output")
 
     @skipIfWindows
     @skipIfLinux  # shell argument expansion doesn't seem to work on Linux
@@ -392,14 +392,14 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the first breakpoint was hit
         self.continue_to_breakpoints(breakpoint_ids)
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue again and hit the second breakpoint.
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the second breakpoint was hit
         self.continue_to_breakpoints(breakpoint_ids)
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue until the program exits
@@ -461,21 +461,21 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
         self.verify_commands("launchCommands", output, launchCommands)
         # Verify the "stopCommands" here
         self.continue_to_next_stop()
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue and hit the second breakpoint.
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the first breakpoint was hit
         self.continue_to_next_stop()
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue until the program exits
         self.continue_to_exit()
         # Get output from the console. This should contain both the
         # "exitCommands" that were run after the second breakpoint was hit
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("exitCommands", output, exitCommands)
 
     def test_failing_launch_commands(self):
diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
index b1823e4c8b1c..4fc221668a8e 100644
--- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
+++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
@@ -54,7 +54,7 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
             return symbol_regex.match(program_module["symbolStatus"])
 
         if expect_debug_info_size:
-            self.wait_until(checkSymbolsLoadedWithSize)
+            self.waitUntil(checkSymbolsLoadedWithSize)
         active_modules = self.dap_server.get_modules()
         program_module = active_modules[program_basename]
         self.assertEqual(program_basename, program_module["name"])
diff --git a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
index 4fcde623e382..0425b55a5e55 100644
--- a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
+++ b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
@@ -37,14 +37,14 @@ class TestDAP_output(lldbdap_testcase.DAPTestCaseBase):
         # Disconnecting from the server to ensure any pending IO is flushed.
         self.dap_server.request_disconnect()
 
-        output += self.get_stdout()
+        output += self.get_stdout(timeout=self.DEFAULT_TIMEOUT)
         self.assertTrue(output and len(output) > 0, "expect program stdout")
         self.assertIn(
             "abcdefghi\r\nhello world\r\nfinally\0\0",
             output,
             "full stdout not found in: " + repr(output),
         )
-        console = self.get_console()
+        console = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.assertTrue(console and len(console) > 0, "expect dap messages")
         self.assertIn(
             "out\0\0\r\nerr\0\0\r\n", console, f"full console message not found"

From c9b28163888574bcfba0171372ae0dcfb40abbfa Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:07:03 +0900
Subject: [PATCH 0726/1322] AMDGPU: Fix cost model for 16-bit operations on
 gfx8 (#141943)

We should only divide the number of pieces to fit the packed instructions
if we actually have pk instructions. This increases the cost of copysign,
but is closer to the current codegen output. It could be much cheaper
than it is now.
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  2 +-
 .../Analysis/CostModel/AMDGPU/canonicalize.ll | 24 ++++++++--------
 .../Analysis/CostModel/AMDGPU/copysign.ll     | 28 +++++++++----------
 .../SLPVectorizer/AMDGPU/slp-v2f16.ll         | 12 ++++----
 4 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24..b2b25ac66677 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -721,7 +721,7 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   if (SLT == MVT::f64)
     return LT.first * NElts * get64BitInstrCost(CostKind);
 
-  if ((ST->has16BitInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
+  if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
       (ST->hasPackedFP32Ops() && SLT == MVT::f32))
     NElts = (NElts + 1) / 2;
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
index e162edbf611e..7ac4db311921 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
@@ -22,12 +22,12 @@ define void @canonicalize_f16() {
 ;
 ; GFX8-LABEL: 'canonicalize_f16'
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'canonicalize_f16'
@@ -62,12 +62,12 @@ define void @canonicalize_f16() {
 ;
 ; GFX8-SIZE-LABEL: 'canonicalize_f16'
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'canonicalize_f16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
index 06a058ff2e7b..334bb341a3c3 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
@@ -23,13 +23,13 @@ define void @copysign_f16() {
 ;
 ; GFX8-LABEL: 'copysign_f16'
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'copysign_f16'
@@ -67,13 +67,13 @@ define void @copysign_f16() {
 ;
 ; GFX8-SIZE-LABEL: 'copysign_f16'
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'copysign_f16'
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
index c79fa9c84d1c..0c26bcb343bf 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -271,7 +271,9 @@ bb:
 }
 
 ; GCN-LABEL: @copysign_combine_v2f16
-; GCN: call <2 x half> @llvm.copysign.v2f16(
+; GFX8: call half @llvm.copysign.f16(
+; GFX8: call half @llvm.copysign.f16(
+; GFX9: call <2 x half> @llvm.copysign.v2f16(
 define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -290,8 +292,6 @@ bb:
 
 ; FIXME: Should always vectorize
 ; GCN-LABEL: @copysign_combine_v4f16
-; GCN: call <2 x half> @llvm.copysign.v2f16(
-
 ; GFX8: call half @llvm.copysign.f16(
 ; GFX8: call half @llvm.copysign.f16(
 
@@ -327,8 +327,10 @@ bb:
 }
 
 ; GCN-LABEL: @canonicalize_combine_v4f16
-; GCN: call <2 x half> @llvm.canonicalize.v2f16(
-; GCN: call <2 x half> @llvm.canonicalize.v2f16(
+; GFX8: call half @llvm.canonicalize.f16(
+; GFX8: call half @llvm.canonicalize.f16(
+
+; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
 define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

From 3800a83160a42f32947b82700e454cc07c600734 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:10:53 +0900
Subject: [PATCH 0727/1322] AMDGPU: Reduce cost of f64 copysign (#141944)

The real implementation is 1 real instruction plus a constant
materialize. Call that a 1, it's not a real f64 operation.
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 12 ++++---
 .../Analysis/CostModel/AMDGPU/copysign.ll     | 32 +++++++++----------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index b2b25ac66677..b79c9be3eac9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -718,9 +718,6 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
 
   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
 
-  if (SLT == MVT::f64)
-    return LT.first * NElts * get64BitInstrCost(CostKind);
-
   if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
       (ST->hasPackedFP32Ops() && SLT == MVT::f32))
     NElts = (NElts + 1) / 2;
@@ -731,6 +728,11 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   switch (ICA.getID()) {
   case Intrinsic::fma:
   case Intrinsic::fmuladd:
+    if (SLT == MVT::f64) {
+      InstRate = get64BitInstrCost(CostKind);
+      break;
+    }
+
     if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
       InstRate = getFullRateInstrCost();
     else {
@@ -741,8 +743,8 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   case Intrinsic::copysign:
     return NElts * getFullRateInstrCost();
   case Intrinsic::canonicalize: {
-    assert(SLT != MVT::f64);
-    InstRate = getFullRateInstrCost();
+    InstRate =
+        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
     break;
   }
   case Intrinsic::uadd_sat:
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
index 334bb341a3c3..5b042a8a0460 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
@@ -245,25 +245,25 @@ define void @copysign_bf16() {
 
 define void @copysign_f64() {
 ; ALL-LABEL: 'copysign_f64'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'copysign_f64'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.copysign.f64(double undef, double undef)

From bec9ac2dafe1c9fca975721e9951c5f7f6b1b559 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Tue, 17 Jun 2025 16:13:42 -0700
Subject: [PATCH 0728/1322] [llvm] Lower latency bonus threshold in function
 specialization. (#143954)

Related to #143219.

Function specialization does not kick in if flang sets `noalias`
attributes on the function arguments of `digits_2`, because PRE
optimizes several `srem` instructions and other memory accesses
from the inner loops causing the latency bonus to be lower than
the current 40% threshold.

While looking at this, I did not really get why we compute the latency
bonus as a ratio of the latency of the "eliminated" instructions
and the code-size of the whole function. It did not make much sense
to me.

I tried computing the total latency as a sum of latencies
of the instructions that belong to non-dead code (including
the instructions that would be executed had they not been
"eliminated" due to the constant propagation). This total
latency should identify the total cost of executing the function
with the given argument being dynamically equal to the tried
constant value. Then the latency bonus would be computed
as the ratio between the latency of the "eliminated" instructions
and the total latency. Unfortunately, this did not given me a good
heuristics either. The bonus was close to 0% on some targets,
and as big as 3-5% on other targets. This does match very well
with the performance gain achieved by function specialization
for exchange2, so it seemd like another artificial heuristic
not better than the current one.

It seems that GCC uses a set of different heuristics for function
specialization, but I am not an expert here and I cannot say
if we can match them in LLVM.

With all that said, I decided to try to lower the threshold
to avoid the regression and be able to re-enable the generally
good change for `noalias` attribute.

With this patch, I was able to reduce the effect of `noalias`,
so that `-force-no-alias=true` is only ~10% slower than
`-force-no-alias=false` code on neoverse-v1 and neoverse-v2.
On neoverse-n1, `-force-no-alias=true` is >2x faster than
`-force-no-alias=false` regardless of this patch.

This threshold has been changed before also due to improved
alias information:
https://github.com/llvm/llvm-project/commit/2fb51fba8ca904a6d3ddf30ae94228ecf9e6a231#diff-066363256b7b4164e66b28a3028b2cb9e405c9136241baa33db76ebd2edb87cd

Please let me know what testing I should run to make sure this change
is safe. As I understand, it may affect the compilation time
performance,
and I will appreciate it if someone points out which benchmarks
need to be checked before merging this.
---
 llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 1034ce958215..45fa9d57e486 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -71,7 +71,7 @@ static cl::opt<unsigned> MinCodeSizeSavings(
              "much percent of the original function size"));
 
 static cl::opt<unsigned> MinLatencySavings(
-    "funcspec-min-latency-savings", cl::init(40), cl::Hidden,
+    "funcspec-min-latency-savings", cl::init(20), cl::Hidden,
     cl::desc("Reject specializations whose latency savings are less than this "
              "much percent of the original function size"));
 

From af65cb68f553759eac307edda87ff7d8b5fdffa9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:13:57 +0900
Subject: [PATCH 0729/1322] AMDGPU: Move fpenvIEEEMode into TTI (#141945)

---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     | 28 ++-----------------
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 17 +++++++++++
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  7 +++++
 3 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 9be8821d5bf9..d12170a60905 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -60,28 +60,6 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
   return maxnum(Src0, Src1);
 }
 
-enum class KnownIEEEMode { Unknown, On, Off };
-
-/// Return KnownIEEEMode::On if we know if the use context can assume
-/// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
-/// "amdgpu-ieee"="false".
-static KnownIEEEMode fpenvIEEEMode(const Instruction &I,
-                                   const GCNSubtarget &ST) {
-  if (!ST.hasIEEEMode()) // Only mode on gfx12
-    return KnownIEEEMode::On;
-
-  const Function *F = I.getFunction();
-  if (!F)
-    return KnownIEEEMode::Unknown;
-
-  Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
-  if (IEEEAttr.isValid())
-    return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
-
-  return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
-                                               : KnownIEEEMode::On;
-}
-
 // Check if a value can be converted to a 16-bit value without losing
 // precision.
 // The value is expected to be either a float (IsFloat = true) or an unsigned
@@ -1004,7 +982,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     // TODO: Also can fold to 2 operands with infinities.
     if ((match(Src0, m_APFloat(ConstSrc0)) && ConstSrc0->isNaN()) ||
         isa<UndefValue>(Src0)) {
-      switch (fpenvIEEEMode(II, *ST)) {
+      switch (fpenvIEEEMode(II)) {
       case KnownIEEEMode::On:
         // TODO: If Src2 is snan, does it need quieting?
         if (ConstSrc0 && ConstSrc0->isSignaling())
@@ -1019,7 +997,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       }
     } else if ((match(Src1, m_APFloat(ConstSrc1)) && ConstSrc1->isNaN()) ||
                isa<UndefValue>(Src1)) {
-      switch (fpenvIEEEMode(II, *ST)) {
+      switch (fpenvIEEEMode(II)) {
       case KnownIEEEMode::On:
         // TODO: If Src2 is snan, does it need quieting?
         if (ConstSrc1 && ConstSrc1->isSignaling())
@@ -1035,7 +1013,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       }
     } else if ((match(Src2, m_APFloat(ConstSrc2)) && ConstSrc2->isNaN()) ||
                isa<UndefValue>(Src2)) {
-      switch (fpenvIEEEMode(II, *ST)) {
+      switch (fpenvIEEEMode(II)) {
       case KnownIEEEMode::On:
         if (ConstSrc2 && ConstSrc2->isSignaling()) {
           auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index b79c9be3eac9..ce2098a3a19b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1445,3 +1445,20 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+GCNTTIImpl::KnownIEEEMode
+GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
+  if (!ST->hasIEEEMode()) // Only mode on gfx12
+    return KnownIEEEMode::On;
+
+  const Function *F = I.getFunction();
+  if (!F)
+    return KnownIEEEMode::Unknown;
+
+  Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
+  if (IEEEAttr.isValid())
+    return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
+
+  return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
+                                               : KnownIEEEMode::On;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ec298c7e9631..0fae301abf53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -281,6 +281,13 @@ public:
   void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
+
+  enum class KnownIEEEMode { Unknown, On, Off };
+
+  /// Return KnownIEEEMode::On if we know if the use context can assume
+  /// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
+  /// "amdgpu-ieee"="false".
+  KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;
 };
 
 } // end namespace llvm

From 70343c8d44273c187e3f7fa5e2037fbc41307077 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Tue, 17 Jun 2025 16:14:13 -0700
Subject: [PATCH 0730/1322] [mlir][flang] Added
 Weighted[Region]BranchOpInterface's. (#142079)

The new interfaces provide getters and setters for the weight
information about the branches of BranchOpInterface and
RegionBranchOpInterface operations.

These interfaces are done the same way as LLVM dialect's
BranchWeightOpInterface.

The plan is to produce this information in Flang, e.g. mark
most probably "cold" code as such and allow LLVM to order
basic blocks accordingly. An example of such a code is
copy loops generated for arrays repacking - we can mark it
as "cold" assuming that the copy will not happen dynamically.
If the copy actually happens the overhead of the copy is probably high
enough so that we may not care about the little overhead
of jumping to the "cold" code and fetching it.
---
 .../include/flang/Optimizer/Dialect/FIROps.td |  18 ++-
 flang/lib/Optimizer/Dialect/FIROps.cpp        |  21 +++-
 .../Transforms/ControlFlowConverter.cpp       |   5 +-
 flang/test/Fir/cfg-conversion-if.fir          |  46 +++++++
 flang/test/Fir/fir-ops.fir                    |  16 +++
 flang/test/Fir/invalid.fir                    |  28 +++++
 .../Dialect/ControlFlow/IR/ControlFlowOps.td  |  34 +++---
 .../mlir/Dialect/LLVMIR/LLVMInterfaces.td     |  36 ------
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td   |  57 +++++----
 .../mlir/Interfaces/ControlFlowInterfaces.h   |  20 ++++
 .../mlir/Interfaces/ControlFlowInterfaces.td  | 112 ++++++++++++++++++
 .../mlir/Target/LLVMIR/ModuleTranslation.h    |   2 +-
 .../ControlFlowToLLVM/ControlFlowToLLVM.cpp   |   7 +-
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp    |  11 +-
 mlir/lib/Interfaces/ControlFlowInterfaces.cpp |  46 +++++++
 .../LLVMIR/LLVMIRToLLVMTranslation.cpp        |  11 +-
 mlir/lib/Target/LLVMIR/ModuleTranslation.cpp  |  11 +-
 .../Conversion/ControlFlowToLLVM/branch.mlir  |  14 +++
 mlir/test/Dialect/ControlFlow/invalid.mlir    |  36 ++++++
 mlir/test/Dialect/ControlFlow/ops.mlir        |  10 ++
 .../LLVMIR/Import/metadata-profiling.ll       |  13 +-
 mlir/test/Target/LLVMIR/llvmir-invalid.mlir   |  16 +++
 mlir/test/Target/LLVMIR/llvmir.mlir           |  26 ----
 23 files changed, 461 insertions(+), 135 deletions(-)
 create mode 100644 flang/test/Fir/cfg-conversion-if.fir

diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 90e05ce3d5ca..27a6ca4ebdb4 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2323,9 +2323,13 @@ def fir_DoLoopOp : region_Op<"do_loop", [AttrSizedOperandSegments,
   }];
 }
 
-def fir_IfOp : region_Op<"if", [DeclareOpInterfaceMethods<RegionBranchOpInterface, [
-    "getRegionInvocationBounds", "getEntrySuccessorRegions"]>, RecursiveMemoryEffects,
-    NoRegionArguments]> {
+def fir_IfOp
+    : region_Op<
+          "if", [DeclareOpInterfaceMethods<
+                     RegionBranchOpInterface, ["getRegionInvocationBounds",
+                                               "getEntrySuccessorRegions"]>,
+                 RecursiveMemoryEffects, NoRegionArguments,
+                 WeightedRegionBranchOpInterface]> {
   let summary = "if-then-else conditional operation";
   let description = [{
     Used to conditionally execute operations. This operation is the FIR
@@ -2342,7 +2346,8 @@ def fir_IfOp : region_Op<"if", [DeclareOpInterfaceMethods<RegionBranchOpInterfac
     ```
   }];
 
-  let arguments = (ins I1:$condition);
+  let arguments = (ins I1:$condition,
+      OptionalAttr<DenseI32ArrayAttr>:$region_weights);
   let results = (outs Variadic<AnyType>:$results);
 
   let regions = (region
@@ -2371,6 +2376,11 @@ def fir_IfOp : region_Op<"if", [DeclareOpInterfaceMethods<RegionBranchOpInterfac
 
     void resultToSourceOps(llvm::SmallVectorImpl<mlir::Value> &results,
                            unsigned resultNum);
+
+    /// Returns the display name string for the region_weights attribute.
+    static constexpr llvm::StringRef getWeightsAttrAssemblyName() {
+      return "weights";
+    }
   }];
 }
 
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 6181e1fad424..ecfa2939e96a 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -4418,6 +4418,19 @@ mlir::ParseResult fir::IfOp::parse(mlir::OpAsmParser &parser,
       parser.resolveOperand(cond, i1Type, result.operands))
     return mlir::failure();
 
+  if (mlir::succeeded(
+          parser.parseOptionalKeyword(getWeightsAttrAssemblyName()))) {
+    if (parser.parseLParen())
+      return mlir::failure();
+    mlir::DenseI32ArrayAttr weights;
+    if (parser.parseCustomAttributeWithFallback(weights, mlir::Type{}))
+      return mlir::failure();
+    if (weights)
+      result.addAttribute(getRegionWeightsAttrName(result.name), weights);
+    if (parser.parseRParen())
+      return mlir::failure();
+  }
+
   if (parser.parseOptionalArrowTypeList(result.types))
     return mlir::failure();
 
@@ -4449,6 +4462,11 @@ llvm::LogicalResult fir::IfOp::verify() {
 void fir::IfOp::print(mlir::OpAsmPrinter &p) {
   bool printBlockTerminators = false;
   p << ' ' << getCondition();
+  if (auto weights = getRegionWeightsAttr()) {
+    p << ' ' << getWeightsAttrAssemblyName() << '(';
+    p.printStrippedAttrOrType(weights);
+    p << ')';
+  }
   if (!getResults().empty()) {
     p << " -> (" << getResultTypes() << ')';
     printBlockTerminators = true;
@@ -4464,7 +4482,8 @@ void fir::IfOp::print(mlir::OpAsmPrinter &p) {
     p.printRegion(otherReg, /*printEntryBlockArgs=*/false,
                   printBlockTerminators);
   }
-  p.printOptionalAttrDict((*this)->getAttrs());
+  p.printOptionalAttrDict((*this)->getAttrs(),
+                          /*elideAttrs=*/{getRegionWeightsAttrName()});
 }
 
 void fir::IfOp::resultToSourceOps(llvm::SmallVectorImpl<mlir::Value> &results,
diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
index 8a9e9b80134b..3d35803e6a2d 100644
--- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
+++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
@@ -212,9 +212,12 @@ public:
     }
 
     rewriter.setInsertionPointToEnd(condBlock);
-    rewriter.create<mlir::cf::CondBranchOp>(
+    auto branchOp = rewriter.create<mlir::cf::CondBranchOp>(
         loc, ifOp.getCondition(), ifOpBlock, llvm::ArrayRef<mlir::Value>(),
         otherwiseBlock, llvm::ArrayRef<mlir::Value>());
+    llvm::ArrayRef<int32_t> weights = ifOp.getWeights();
+    if (!weights.empty())
+      branchOp.setWeights(weights);
     rewriter.replaceOp(ifOp, continueBlock->getArguments());
     return success();
   }
diff --git a/flang/test/Fir/cfg-conversion-if.fir b/flang/test/Fir/cfg-conversion-if.fir
new file mode 100644
index 000000000000..1e30ee8e64f0
--- /dev/null
+++ b/flang/test/Fir/cfg-conversion-if.fir
@@ -0,0 +1,46 @@
+// RUN: fir-opt --split-input-file --cfg-conversion %s | FileCheck %s
+
+func.func private @callee() -> none
+
+// CHECK-LABEL:   func.func @if_then(
+// CHECK-SAME:      %[[ARG0:.*]]: i1) {
+// CHECK:           cf.cond_br %[[ARG0]] weights([10, 90]), ^bb1, ^bb2
+// CHECK:         ^bb1:
+// CHECK:           %[[VAL_0:.*]] = fir.call @callee() : () -> none
+// CHECK:           cf.br ^bb2
+// CHECK:         ^bb2:
+// CHECK:           return
+// CHECK:         }
+func.func @if_then(%cond: i1) {
+  fir.if %cond weights([10, 90]) {
+    fir.call @callee() : () -> none
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @if_then_else(
+// CHECK-SAME:      %[[ARG0:.*]]: i1) -> i32 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i32
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK:           cf.cond_br %[[ARG0]] weights([90, 10]), ^bb1, ^bb2
+// CHECK:         ^bb1:
+// CHECK:           cf.br ^bb3(%[[VAL_0]] : i32)
+// CHECK:         ^bb2:
+// CHECK:           cf.br ^bb3(%[[VAL_1]] : i32)
+// CHECK:         ^bb3(%[[VAL_2:.*]]: i32):
+// CHECK:           cf.br ^bb4
+// CHECK:         ^bb4:
+// CHECK:           return %[[VAL_2]] : i32
+// CHECK:         }
+func.func @if_then_else(%cond: i1) -> i32 {
+  %c0 = arith.constant 0 : i32
+  %c1 = arith.constant 1 : i32
+  %result = fir.if %cond weights([90, 10]) -> i32 {
+    fir.result %c0 : i32
+  } else {
+    fir.result %c1 : i32
+  }
+  return %result : i32
+}
diff --git a/flang/test/Fir/fir-ops.fir b/flang/test/Fir/fir-ops.fir
index 9c444d2f4e0b..3585bf9efca3 100644
--- a/flang/test/Fir/fir-ops.fir
+++ b/flang/test/Fir/fir-ops.fir
@@ -1015,3 +1015,19 @@ func.func @test_box_total_elements(%arg0: !fir.class<!fir.type<sometype{i:i32}>>
   %6 = arith.addi %2, %5 : index
   return %6 : index
 }
+
+// CHECK-LABEL:   func.func @test_if_weights(
+// CHECK-SAME:      %[[ARG0:.*]]: i1) {
+func.func @test_if_weights(%cond: i1) {
+// CHECK:           fir.if %[[ARG0]] weights([99, 1]) {
+// CHECK:           }
+  fir.if %cond weights([99, 1]) {
+  }
+// CHECK:           fir.if %[[ARG0]] weights([99, 1]) {
+// CHECK:           } else {
+// CHECK:           }
+  fir.if %cond weights ([99,1]) {
+  } else {
+  }
+  return
+}
diff --git a/flang/test/Fir/invalid.fir b/flang/test/Fir/invalid.fir
index 45cae1f82cb8..aca0ecc1abdc 100644
--- a/flang/test/Fir/invalid.fir
+++ b/flang/test/Fir/invalid.fir
@@ -1393,3 +1393,31 @@ fir.local {type = local_init} @x.localizer : f32 init {
 ^bb0(%arg0: f32, %arg1: f32):
   fir.yield(%arg0 : f32)
 }
+
+// -----
+
+func.func @wrong_weights_number_in_if_then(%cond: i1) {
+// expected-error @below {{expects number of region weights to match number of regions: 1 vs 2}}
+  fir.if %cond weights([50]) {
+  }
+  return
+}
+
+// -----
+
+func.func @wrong_weights_number_in_if_then_else(%cond: i1) {
+// expected-error @below {{expects number of region weights to match number of regions: 3 vs 2}}
+  fir.if %cond weights([50, 40, 10]) {
+  } else {
+  }
+  return
+}
+
+// -----
+
+func.func @negative_weight_in_if_then(%cond: i1) {
+// expected-error @below {{weight #0 must be non-negative}}
+  fir.if %cond weights([-1, 101]) {
+  }
+  return
+}
diff --git a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
index 48f12b46a57f..79da81ba049d 100644
--- a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
+++ b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
@@ -112,10 +112,11 @@ def BranchOp : CF_Op<"br", [
 // CondBranchOp
 //===----------------------------------------------------------------------===//
 
-def CondBranchOp : CF_Op<"cond_br",
-    [AttrSizedOperandSegments,
-     DeclareOpInterfaceMethods<BranchOpInterface, ["getSuccessorForOperands"]>,
-     Pure, Terminator]> {
+def CondBranchOp
+    : CF_Op<"cond_br", [AttrSizedOperandSegments,
+                        DeclareOpInterfaceMethods<
+                            BranchOpInterface, ["getSuccessorForOperands"]>,
+                        WeightedBranchOpInterface, Pure, Terminator]> {
   let summary = "Conditional branch operation";
   let description = [{
     The `cf.cond_br` terminator operation represents a conditional branch on a
@@ -144,20 +145,23 @@ def CondBranchOp : CF_Op<"cond_br",
     ```
   }];
 
-  let arguments = (ins I1:$condition,
-                       Variadic<AnyType>:$trueDestOperands,
-                       Variadic<AnyType>:$falseDestOperands);
+  let arguments = (ins I1:$condition, Variadic<AnyType>:$trueDestOperands,
+      Variadic<AnyType>:$falseDestOperands,
+      OptionalAttr<DenseI32ArrayAttr>:$branch_weights);
   let successors = (successor AnySuccessor:$trueDest, AnySuccessor:$falseDest);
 
-  let builders = [
-    OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
-      "ValueRange":$trueOperands, "Block *":$falseDest,
-      "ValueRange":$falseOperands), [{
-      build($_builder, $_state, condition, trueOperands, falseOperands, trueDest,
+  let builders = [OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
+                                "ValueRange":$trueOperands,
+                                "Block *":$falseDest,
+                                "ValueRange":$falseOperands),
+                            [{
+      build($_builder, $_state, condition, trueOperands, falseOperands, /*branch_weights=*/{}, trueDest,
             falseDest);
     }]>,
-    OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
-      "Block *":$falseDest, CArg<"ValueRange", "{}">:$falseOperands), [{
+                  OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
+                                "Block *":$falseDest,
+                                CArg<"ValueRange", "{}">:$falseOperands),
+                            [{
       build($_builder, $_state, condition, trueDest, ValueRange(), falseDest,
             falseOperands);
     }]>];
@@ -216,7 +220,7 @@ def CondBranchOp : CF_Op<"cond_br",
 
   let hasCanonicalizer = 1;
   let assemblyFormat = [{
-    $condition `,`
+    $condition (`weights` `(` $branch_weights^ `)` )? `,`
     $trueDest (`(` $trueDestOperands^ `:` type($trueDestOperands) `)`)? `,`
     $falseDest (`(` $falseDestOperands^ `:` type($falseDestOperands) `)`)?
     attr-dict
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
index 2824f09dab6c..138170f8c876 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
@@ -168,42 +168,6 @@ def NonNegFlagInterface : OpInterface<"NonNegFlagInterface"> {
   ];
 }
 
-def BranchWeightOpInterface : OpInterface<"BranchWeightOpInterface"> {
-  let description = [{
-    An interface for operations that can carry branch weights metadata. It
-    provides setters and getters for the operation's branch weights attribute.
-    The default implementation of the interface methods expect the operation to
-    have an attribute of type DenseI32ArrayAttr named branch_weights.
-  }];
-
-  let cppNamespace = "::mlir::LLVM";
-
-  let methods = [
-    InterfaceMethod<
-      /*desc=*/        "Returns the branch weights attribute or nullptr",
-      /*returnType=*/  "::mlir::DenseI32ArrayAttr",
-      /*methodName=*/  "getBranchWeightsOrNull",
-      /*args=*/        (ins),
-      /*methodBody=*/  [{}],
-      /*defaultImpl=*/ [{
-        auto op = cast<ConcreteOp>(this->getOperation());
-        return op.getBranchWeightsAttr();
-      }]
-      >,
-    InterfaceMethod<
-      /*desc=*/        "Sets the branch weights attribute",
-      /*returnType=*/  "void",
-      /*methodName=*/  "setBranchWeights",
-      /*args=*/        (ins "::mlir::DenseI32ArrayAttr":$attr),
-      /*methodBody=*/  [{}],
-      /*defaultImpl=*/ [{
-        auto op = cast<ConcreteOp>(this->getOperation());
-        op.setBranchWeightsAttr(attr);
-      }]
-      >
-  ];
-}
-
 def AccessGroupOpInterface : OpInterface<"AccessGroupOpInterface"> {
   let description = [{
     An interface for memory operations that can carry access groups metadata.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 68fa620d239b..939e7a09a73a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -660,12 +660,12 @@ def LLVM_FPTruncOp : LLVM_CastOp<"fptrunc", "FPTrunc",
                                  LLVM_ScalarOrVectorOf<LLVM_AnyFloat>>;
 
 // Call-related operations.
-def LLVM_InvokeOp : LLVM_Op<"invoke", [
-                      AttrSizedOperandSegments,
-                      DeclareOpInterfaceMethods<BranchOpInterface>,
-                      DeclareOpInterfaceMethods<CallOpInterface>,
-                      DeclareOpInterfaceMethods<BranchWeightOpInterface>,
-                      Terminator]> {
+def LLVM_InvokeOp
+    : LLVM_Op<"invoke", [AttrSizedOperandSegments,
+                         DeclareOpInterfaceMethods<BranchOpInterface>,
+                         DeclareOpInterfaceMethods<CallOpInterface>,
+                         DeclareOpInterfaceMethods<WeightedBranchOpInterface>,
+                         Terminator]> {
   let arguments = (ins
                    OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$var_callee_type,
                    OptionalAttr<FlatSymbolRefAttr>:$callee,
@@ -734,12 +734,12 @@ def LLVM_VaArgOp : LLVM_Op<"va_arg"> {
 // CallOp
 //===----------------------------------------------------------------------===//
 
-def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
-                    [AttrSizedOperandSegments,
-                     DeclareOpInterfaceMethods<FastmathFlagsInterface>,
-                     DeclareOpInterfaceMethods<CallOpInterface>,
-                     DeclareOpInterfaceMethods<SymbolUserOpInterface>,
-                     DeclareOpInterfaceMethods<BranchWeightOpInterface>]> {
+def LLVM_CallOp
+    : LLVM_MemAccessOpBase<
+          "call", [AttrSizedOperandSegments,
+                   DeclareOpInterfaceMethods<FastmathFlagsInterface>,
+                   DeclareOpInterfaceMethods<CallOpInterface>,
+                   DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
   let summary = "Call to an LLVM function.";
   let description = [{
     In LLVM IR, functions may return either 0 or 1 value. LLVM IR dialect
@@ -788,21 +788,16 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
       OptionalAttr<FlatSymbolRefAttr>:$callee,
       Variadic<LLVM_Type>:$callee_operands,
       DefaultValuedAttr<LLVM_FastmathFlagsAttr, "{}">:$fastmathFlags,
-      OptionalAttr<DenseI32ArrayAttr>:$branch_weights,
       DefaultValuedAttr<CConv, "CConv::C">:$CConv,
       DefaultValuedAttr<TailCallKind, "TailCallKind::None">:$TailCallKind,
       OptionalAttr<LLVM_MemoryEffectsAttr>:$memory_effects,
-      UnitAttr:$convergent,
-      UnitAttr:$no_unwind,
-      UnitAttr:$will_return,
+      UnitAttr:$convergent, UnitAttr:$no_unwind, UnitAttr:$will_return,
       VariadicOfVariadic<LLVM_Type, "op_bundle_sizes">:$op_bundle_operands,
       DenseI32ArrayAttr:$op_bundle_sizes,
       OptionalAttr<ArrayAttr>:$op_bundle_tags,
       OptionalAttr<DictArrayAttr>:$arg_attrs,
-      OptionalAttr<DictArrayAttr>:$res_attrs,
-      UnitAttr:$no_inline,
-      UnitAttr:$always_inline,
-      UnitAttr:$inline_hint);
+      OptionalAttr<DictArrayAttr>:$res_attrs, UnitAttr:$no_inline,
+      UnitAttr:$always_inline, UnitAttr:$inline_hint);
   // Append the aliasing related attributes defined in LLVM_MemAccessOpBase.
   let arguments = !con(args, aliasAttrs);
   let results = (outs Optional<LLVM_Type>:$result);
@@ -1047,11 +1042,12 @@ def LLVM_BrOp : LLVM_TerminatorOp<"br",
     LLVM_TerminatorPassthroughOpBuilder
   ];
 }
-def LLVM_CondBrOp : LLVM_TerminatorOp<"cond_br",
-    [AttrSizedOperandSegments,
-     DeclareOpInterfaceMethods<BranchOpInterface>,
-     DeclareOpInterfaceMethods<BranchWeightOpInterface>,
-     Pure]> {
+def LLVM_CondBrOp
+    : LLVM_TerminatorOp<
+          "cond_br", [AttrSizedOperandSegments,
+                      DeclareOpInterfaceMethods<BranchOpInterface>,
+                      DeclareOpInterfaceMethods<WeightedBranchOpInterface>,
+                      Pure]> {
   let arguments = (ins I1:$condition,
                    Variadic<LLVM_Type>:$trueDestOperands,
                    Variadic<LLVM_Type>:$falseDestOperands,
@@ -1136,11 +1132,12 @@ def LLVM_UnreachableOp : LLVM_TerminatorOp<"unreachable"> {
   }];
 }
 
-def LLVM_SwitchOp : LLVM_TerminatorOp<"switch",
-    [AttrSizedOperandSegments,
-     DeclareOpInterfaceMethods<BranchOpInterface>,
-     DeclareOpInterfaceMethods<BranchWeightOpInterface>,
-     Pure]> {
+def LLVM_SwitchOp
+    : LLVM_TerminatorOp<
+          "switch", [AttrSizedOperandSegments,
+                     DeclareOpInterfaceMethods<BranchOpInterface>,
+                     DeclareOpInterfaceMethods<WeightedBranchOpInterface>,
+                     Pure]> {
   let arguments = (ins
     AnySignlessInteger:$value,
     Variadic<AnyType>:$defaultOperands,
diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h
index 7f6967f11444..d63800c12d13 100644
--- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h
+++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h
@@ -142,6 +142,26 @@ LogicalResult verifyBranchSuccessorOperands(Operation *op, unsigned succNo,
                                             const SuccessorOperands &operands);
 } // namespace detail
 
+//===----------------------------------------------------------------------===//
+// WeightedBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+namespace detail {
+/// Verify that the branch weights attached to an operation
+/// implementing WeightedBranchOpInterface are correct.
+LogicalResult verifyBranchWeights(Operation *op);
+} // namespace detail
+
+//===----------------------------------------------------------------------===//
+// WeightedRegiobBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+namespace detail {
+/// Verify that the region weights attached to an operation
+/// implementing WeightedRegiobBranchOpInterface are correct.
+LogicalResult verifyRegionBranchWeights(Operation *op);
+} // namespace detail
+
 //===----------------------------------------------------------------------===//
 // RegionBranchOpInterface
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
index 69bce78e946c..46ab0b9ebbc6 100644
--- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
+++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
@@ -375,6 +375,118 @@ def SelectLikeOpInterface : OpInterface<"SelectLikeOpInterface"> {
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// WeightedBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+def WeightedBranchOpInterface : OpInterface<"WeightedBranchOpInterface"> {
+  let description = [{
+    This interface provides weight information for branching terminator
+    operations, i.e. terminator operations with successors.
+
+    This interface provides methods for getting/setting integer non-negative
+    weight of each branch. The probability of executing a branch
+    is computed as the ratio between the branch's weight and the total
+    sum of the weights (which cannot be zero).
+    The weights are optional. If they are provided, then their number
+    must match the number of successors of the operation.
+
+    The default implementations of the methods expect the operation
+    to have an attribute of type DenseI32ArrayAttr named branch_weights.
+  }];
+  let cppNamespace = "::mlir";
+
+  let methods = [InterfaceMethod<
+                     /*desc=*/"Returns the branch weights",
+                     /*returnType=*/"::llvm::ArrayRef<int32_t>",
+                     /*methodName=*/"getWeights",
+                     /*args=*/(ins),
+                     /*methodBody=*/[{}],
+                     /*defaultImpl=*/[{
+        auto op = cast<ConcreteOp>(this->getOperation());
+        if (auto attr = op.getBranchWeightsAttr())
+          return attr.asArrayRef();
+        return {};
+      }]>,
+                 InterfaceMethod<
+                     /*desc=*/"Sets the branch weights",
+                     /*returnType=*/"void",
+                     /*methodName=*/"setWeights",
+                     /*args=*/(ins "::llvm::ArrayRef<int32_t>":$weights),
+                     /*methodBody=*/[{}],
+                     /*defaultImpl=*/[{
+        auto op = cast<ConcreteOp>(this->getOperation());
+        op.setBranchWeightsAttr(::mlir::DenseI32ArrayAttr::get(op->getContext(), weights));
+      }]>,
+  ];
+
+  let verify = [{
+    return ::mlir::detail::verifyBranchWeights($_op);
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// WeightedRegionBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+// TODO: the probabilities of entering a particular region seem
+// to correlate with the values returned by
+// RegionBranchOpInterface::invocationBounds(), and we should probably
+// verify that the values are consistent. In that case, should
+// WeightedRegionBranchOpInterface extend RegionBranchOpInterface?
+def WeightedRegionBranchOpInterface
+    : OpInterface<"WeightedRegionBranchOpInterface"> {
+  let description = [{
+    This interface provides weight information for region operations
+    that exhibit branching behavior between held regions.
+
+    This interface provides methods for getting/setting integer non-negative
+    weight of each branch. The probability of executing a region is computed
+    as the ratio between the region branch's weight and the total sum
+    of the weights (which cannot be zero).
+    The weights are optional. If they are provided, then their number
+    must match the number of regions held by the operation
+    (including empty regions).
+
+    The weights specify the probability of branching to a particular
+    region when first executing the operation.
+    For example, for loop-like operations with a single region
+    the weight specifies the probability of entering the loop.
+
+    The default implementations of the methods expect the operation
+    to have an attribute of type DenseI32ArrayAttr named branch_weights.
+  }];
+  let cppNamespace = "::mlir";
+
+  let methods = [InterfaceMethod<
+                     /*desc=*/"Returns the region weights",
+                     /*returnType=*/"::llvm::ArrayRef<int32_t>",
+                     /*methodName=*/"getWeights",
+                     /*args=*/(ins),
+                     /*methodBody=*/[{}],
+                     /*defaultImpl=*/[{
+        auto op = cast<ConcreteOp>(this->getOperation());
+        if (auto attr = op.getRegionWeightsAttr())
+          return attr.asArrayRef();
+        return {};
+      }]>,
+                 InterfaceMethod<
+                     /*desc=*/"Sets the region weights",
+                     /*returnType=*/"void",
+                     /*methodName=*/"setWeights",
+                     /*args=*/(ins "::llvm::ArrayRef<int32_t>":$weights),
+                     /*methodBody=*/[{}],
+                     /*defaultImpl=*/[{
+        auto op = cast<ConcreteOp>(this->getOperation());
+        op.setRegionWeightsAttr(::mlir::DenseI32ArrayAttr::get(op->getContext(), weights));
+      }]>,
+  ];
+
+  let verify = [{
+    return ::mlir::detail::verifyRegionBranchWeights($_op);
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // ControlFlow Traits
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
index 97ae14aa0d6a..0f136c5c46d7 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -189,7 +189,7 @@ public:
                                   llvm::Instruction *inst);
 
   /// Sets LLVM profiling metadata for operations that have branch weights.
-  void setBranchWeightsMetadata(BranchWeightOpInterface op);
+  void setBranchWeightsMetadata(WeightedBranchOpInterface op);
 
   /// Sets LLVM loop metadata for branch operations that have a loop annotation
   /// attribute.
diff --git a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
index debfd003bd5b..d31d7d801e14 100644
--- a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
+++ b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
@@ -166,10 +166,15 @@ struct CondBranchOpLowering : public ConvertOpToLLVMPattern<cf::CondBranchOp> {
                           TypeRange(adaptor.getFalseDestOperands()));
     if (failed(convertedFalseBlock))
       return failure();
-    Operation *newOp = rewriter.replaceOpWithNewOp<LLVM::CondBrOp>(
+    auto newOp = rewriter.replaceOpWithNewOp<LLVM::CondBrOp>(
         op, adaptor.getCondition(), *convertedTrueBlock,
         adaptor.getTrueDestOperands(), *convertedFalseBlock,
         adaptor.getFalseDestOperands());
+    ArrayRef<int32_t> weights = op.getWeights();
+    if (!weights.empty()) {
+      newOp.setWeights(weights);
+      op.removeBranchWeightsAttr();
+    }
     // TODO: We should not just forward all attributes like that. But there are
     // existing Flang tests that depend on this behavior.
     newOp->setAttrs(op->getAttrDictionary());
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index c7528c970a4b..a12aef0dfad3 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -589,10 +589,6 @@ LogicalResult SwitchOp::verify() {
            static_cast<int64_t>(getCaseDestinations().size())))
     return emitOpError("expects number of case values to match number of "
                        "case destinations");
-  if (getBranchWeights() && getBranchWeights()->size() != getNumSuccessors())
-    return emitError("expects number of branch weights to match number of "
-                     "successors: ")
-           << getBranchWeights()->size() << " vs " << getNumSuccessors();
   if (getCaseValues() &&
       getValue().getType() != getCaseValues()->getElementType())
     return emitError("expects case value type to match condition value type");
@@ -962,7 +958,6 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
   assert(callee && "expected non-null callee in direct call builder");
   build(builder, state, results,
         /*var_callee_type=*/nullptr, callee, args, /*fastmathFlags=*/nullptr,
-        /*branch_weights=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
         /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
@@ -992,7 +987,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), callee, args,
         /*fastmathFlags=*/nullptr,
-        /*branch_weights=*/nullptr, /*CConv=*/nullptr,
+        /*CConv=*/nullptr,
         /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr,
         /*no_unwind=*/nullptr, /*will_return=*/nullptr,
@@ -1009,7 +1004,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType),
         /*callee=*/nullptr, args,
-        /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
+        /*fastmathFlags=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
         /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
@@ -1025,7 +1020,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
   auto calleeType = func.getFunctionType();
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), args,
-        /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
+        /*fastmathFlags=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
         /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
index 2ae334b517a3..3a63db35eec0 100644
--- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
+++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
@@ -9,6 +9,7 @@
 #include <utility>
 
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Interfaces/CallInterfaces.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "llvm/ADT/SmallPtrSet.h"
 
@@ -80,6 +81,51 @@ detail::verifyBranchSuccessorOperands(Operation *op, unsigned succNo,
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// WeightedBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+static LogicalResult verifyWeights(Operation *op,
+                                   llvm::ArrayRef<int32_t> weights,
+                                   std::size_t expectedWeightsNum,
+                                   llvm::StringRef weightAnchorName,
+                                   llvm::StringRef weightRefName) {
+  if (weights.empty())
+    return success();
+
+  if (weights.size() != expectedWeightsNum)
+    return op->emitError() << "expects number of " << weightAnchorName
+                           << " weights to match number of " << weightRefName
+                           << ": " << weights.size() << " vs "
+                           << expectedWeightsNum;
+
+  for (auto [index, weight] : llvm::enumerate(weights))
+    if (weight < 0)
+      return op->emitError() << "weight #" << index << " must be non-negative";
+
+  if (llvm::all_of(weights, [](int32_t value) { return value == 0; }))
+    return op->emitError() << "branch weights cannot all be zero";
+
+  return success();
+}
+
+LogicalResult detail::verifyBranchWeights(Operation *op) {
+  llvm::ArrayRef<int32_t> weights =
+      cast<WeightedBranchOpInterface>(op).getWeights();
+  return verifyWeights(op, weights, op->getNumSuccessors(), "branch",
+                       "successors");
+}
+
+//===----------------------------------------------------------------------===//
+// WeightedRegionBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+LogicalResult detail::verifyRegionBranchWeights(Operation *op) {
+  llvm::ArrayRef<int32_t> weights =
+      cast<WeightedRegionBranchOpInterface>(op).getWeights();
+  return verifyWeights(op, weights, op->getNumRegions(), "region", "regions");
+}
+
 //===----------------------------------------------------------------------===//
 // RegionBranchOpInterface
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
index 1b5ce868b5c7..e67aa892afe0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
@@ -146,8 +146,15 @@ static LogicalResult setProfilingAttr(OpBuilder &builder, llvm::MDNode *node,
     branchWeights.push_back(branchWeight->getZExtValue());
   }
 
-  if (auto iface = dyn_cast<BranchWeightOpInterface>(op)) {
-    iface.setBranchWeights(builder.getDenseI32ArrayAttr(branchWeights));
+  if (auto iface = dyn_cast<WeightedBranchOpInterface>(op)) {
+    // LLVM allows attaching a single weight to call instructions.
+    // This is used for carrying the execution count information
+    // in PGO modes. MLIR WeightedBranchOpInterface does not allow this,
+    // so we drop the metadata in this case.
+    // LLVM should probably use the VP form of MD_prof metadata
+    // for such cases.
+    if (op->getNumSuccessors() != 0)
+      iface.setWeights(branchWeights);
     return success();
   }
   return failure();
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index e5ca147ea98f..3eaa24eb5c95 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -1055,7 +1055,7 @@ LogicalResult ModuleTranslation::convertBlockImpl(Block &bb,
       return failure();
 
     // Set the branch weight metadata on the translated instruction.
-    if (auto iface = dyn_cast<BranchWeightOpInterface>(op))
+    if (auto iface = dyn_cast<WeightedBranchOpInterface>(op))
       setBranchWeightsMetadata(iface);
   }
 
@@ -2026,14 +2026,15 @@ void ModuleTranslation::setDereferenceableMetadata(
   inst->setMetadata(kindId, derefSizeNode);
 }
 
-void ModuleTranslation::setBranchWeightsMetadata(BranchWeightOpInterface op) {
-  DenseI32ArrayAttr weightsAttr = op.getBranchWeightsOrNull();
-  if (!weightsAttr)
+void ModuleTranslation::setBranchWeightsMetadata(WeightedBranchOpInterface op) {
+  SmallVector<uint32_t> weights;
+  llvm::transform(op.getWeights(), std::back_inserter(weights),
+                  [](int32_t value) { return static_cast<uint32_t>(value); });
+  if (weights.empty())
     return;
 
   llvm::Instruction *inst = isa<CallOp>(op) ? lookupCall(op) : lookupBranch(op);
   assert(inst && "expected the operation to have a mapping to an instruction");
-  SmallVector<uint32_t> weights(weightsAttr.asArrayRef());
   inst->setMetadata(
       llvm::LLVMContext::MD_prof,
       llvm::MDBuilder(getLLVMContext()).createBranchWeights(weights));
diff --git a/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir b/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir
index 9a0f2b771454..7c78211d5901 100644
--- a/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir
+++ b/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir
@@ -67,3 +67,17 @@ func.func @unreachable_block() {
 ^bb1(%arg0: index):
   cf.br ^bb1(%arg0 : index)
 }
+
+// -----
+
+// Test case for cf.cond_br with weights.
+
+// CHECK-LABEL:   func.func @cf_cond_br_with_weights(
+func.func @cf_cond_br_with_weights(%cond: i1, %a: index, %b: index) -> index {
+// CHECK:           llvm.cond_br %{{.*}} weights([90, 10]), ^bb1(%{{.*}} : i64), ^bb2(%{{.*}} : i64)
+  cf.cond_br %cond, ^bb1(%a : index), ^bb2(%b : index) {branch_weights = array<i32: 90, 10>}
+^bb1(%arg1: index):
+  return %arg1 : index
+^bb2(%arg2: index):
+  return %arg2 : index
+}
diff --git a/mlir/test/Dialect/ControlFlow/invalid.mlir b/mlir/test/Dialect/ControlFlow/invalid.mlir
index b51d8095c997..1b8de22a9ff9 100644
--- a/mlir/test/Dialect/ControlFlow/invalid.mlir
+++ b/mlir/test/Dialect/ControlFlow/invalid.mlir
@@ -67,3 +67,39 @@ func.func @switch_missing_default(%flag : i32, %caseOperand : i32) {
   ^bb3(%bb3arg : i32):
     return
 }
+
+// -----
+
+// CHECK-LABEL: func @wrong_weights_number
+func.func @wrong_weights_number(%cond: i1) {
+  // expected-error@+1 {{expects number of branch weights to match number of successors: 1 vs 2}}
+  cf.cond_br %cond weights([100]), ^bb1, ^bb2
+  ^bb1:
+    return
+  ^bb2:
+    return
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_weight
+func.func @wrong_total_weight(%cond: i1) {
+  // expected-error@+1 {{weight #0 must be non-negative}}
+  cf.cond_br %cond weights([-1, 101]), ^bb1, ^bb2
+  ^bb1:
+    return
+  ^bb2:
+    return
+}
+
+// -----
+
+// CHECK-LABEL: func @zero_weights
+func.func @wrong_total_weight(%cond: i1) {
+  // expected-error@+1 {{branch weights cannot all be zero}}
+  cf.cond_br %cond weights([0, 0]), ^bb1, ^bb2
+  ^bb1:
+    return
+  ^bb2:
+    return
+}
diff --git a/mlir/test/Dialect/ControlFlow/ops.mlir b/mlir/test/Dialect/ControlFlow/ops.mlir
index c9317c761397..160534240e0f 100644
--- a/mlir/test/Dialect/ControlFlow/ops.mlir
+++ b/mlir/test/Dialect/ControlFlow/ops.mlir
@@ -51,3 +51,13 @@ func.func @switch_result_number(%arg0: i32) {
   ^bb2:
     return
 }
+
+// CHECK-LABEL: func @cond_weights
+func.func @cond_weights(%cond: i1) {
+// CHECK: cf.cond_br %{{.*}} weights([60, 40]), ^{{.*}}, ^{{.*}}
+  cf.cond_br %cond weights([60, 40]), ^bb1, ^bb2
+  ^bb1:
+    return
+  ^bb2:
+    return
+}
diff --git a/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll b/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll
index cc3b47a54dfe..c623df0b605b 100644
--- a/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll
+++ b/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll
@@ -36,14 +36,17 @@ bbd:
 
 ; // -----
 
+; Verify that a single weight attached to a call is not translated.
+; The MLIR WeightedBranchOpInterface does not support this case.
+
 ; CHECK: llvm.func @fn()
-declare void @fn()
+declare i32 @fn()
 
 ; CHECK-LABEL: @call_branch_weights
-define void @call_branch_weights() {
-  ; CHECK:  llvm.call @fn() {branch_weights = array<i32: 42>}
-  call void @fn(), !prof !0
-  ret void
+define i32 @call_branch_weights() {
+  ; CHECK:  llvm.call @fn() : () -> i32
+  %1 = call i32 @fn(), !prof !0
+  ret i32 %1
 }
 
 !0 = !{!"branch_weights", i32 42}
diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
index 24a7b4255727..a8ef401fff27 100644
--- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
@@ -448,3 +448,19 @@ llvm.mlir.global external constant @const() {addr_space = 0 : i32, dso_local} :
 }
 
 llvm.func extern_weak @extern_func()
+
+// -----
+
+llvm.func @invoke_branch_weights_callee()
+llvm.func @__gxx_personality_v0(...) -> i32
+
+llvm.func @invoke_branch_weights() -> i32 attributes {personality = @__gxx_personality_v0} {
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  // expected-error @below{{expects number of branch weights to match number of successors: 1 vs 2}}
+  llvm.invoke @invoke_branch_weights_callee() to ^bb2 unwind ^bb1 {branch_weights = array<i32 : 42>} : () -> ()
+^bb1:  // pred: ^bb0
+  %1 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)>
+  llvm.br ^bb2
+^bb2:  // 2 preds: ^bb0, ^bb1
+  llvm.return %0 : i32
+}
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index 7742259e7a47..fc1993b50ba2 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -1906,32 +1906,6 @@ llvm.func @cond_br_weights(%cond : i1, %arg0 : i32,  %arg1 : i32) -> i32 {
 
 // -----
 
-llvm.func @fn()
-
-// CHECK-LABEL: @call_branch_weights
-llvm.func @call_branch_weights() {
-  // CHECK: !prof ![[NODE:[0-9]+]]
-  llvm.call @fn() {branch_weights = array<i32 : 42>} : () -> ()
-  llvm.return
-}
-
-// CHECK: ![[NODE]] = !{!"branch_weights", i32 42}
-
-// -----
-
-llvm.func @fn() -> i32
-
-// CHECK-LABEL: @call_branch_weights
-llvm.func @call_branch_weights() {
-  // CHECK: !prof ![[NODE:[0-9]+]]
-  %res = llvm.call @fn() {branch_weights = array<i32 : 42>} : () -> i32
-  llvm.return
-}
-
-// CHECK: ![[NODE]] = !{!"branch_weights", i32 42}
-
-// -----
-
 llvm.func @foo()
 llvm.func @__gxx_personality_v0(...) -> i32
 

From 54015f36c682aab9024a21a93957312a69c5bc9b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:19:06 +0900
Subject: [PATCH 0731/1322] AMDGPU: Cost model for minimumnum/maximumnum
 (#141946)

---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  19 +
 .../Analysis/CostModel/AMDGPU/maximumnum.ll   | 582 ++++++++++--------
 .../Analysis/CostModel/AMDGPU/minimumnum.ll   | 582 ++++++++++--------
 .../SLPVectorizer/AMDGPU/slp-v2f16.ll         |  40 ++
 4 files changed, 695 insertions(+), 528 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ce2098a3a19b..f3474fcbbfb5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -685,6 +685,8 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
   case Intrinsic::fma:
   case Intrinsic::fmuladd:
   case Intrinsic::copysign:
+  case Intrinsic::minimumnum:
+  case Intrinsic::maximumnum:
   case Intrinsic::canonicalize:
   // There's a small benefit to using vector ops in the legalized code.
   case Intrinsic::round:
@@ -742,6 +744,23 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     break;
   case Intrinsic::copysign:
     return NElts * getFullRateInstrCost();
+  case Intrinsic::minimumnum:
+  case Intrinsic::maximumnum: {
+    // Instruction + 2 canonicalizes. For cases that need type promotion, we the
+    // promotion takes the place of the canonicalize.
+    unsigned NumOps = 3;
+    if (const IntrinsicInst *II = ICA.getInst()) {
+      // Directly legal with ieee=0
+      // TODO: Not directly legal with strictfp
+      if (fpenvIEEEMode(*II) == KnownIEEEMode::Off)
+        NumOps = 1;
+    }
+
+    unsigned BaseRate =
+        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
+    InstRate = BaseRate * NumOps;
+    break;
+  }
   case Intrinsic::canonicalize: {
     InstRate =
         SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
index 5b158e3d8d67..a81cb63f0c51 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
@@ -11,75 +11,75 @@
 
 define void @maximumnum_f16() {
 ; GFX7-LABEL: 'maximumnum_f16'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'maximumnum_f16'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'maximumnum_f16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'maximumnum_f16'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'maximumnum_f16'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'maximumnum_f16'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'maximumnum_f16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'maximumnum_f16'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
@@ -92,77 +92,23 @@ define void @maximumnum_f16() {
 }
 
 define void @maximumnum_bf16() {
-; GFX7-LABEL: 'maximumnum_bf16'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; ALL-LABEL: 'maximumnum_bf16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; GFX8-LABEL: 'maximumnum_bf16'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX9-LABEL: 'maximumnum_bf16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX12-LABEL: 'maximumnum_bf16'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX7-SIZE-LABEL: 'maximumnum_bf16'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX8-SIZE-LABEL: 'maximumnum_bf16'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX9-SIZE-LABEL: 'maximumnum_bf16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX12-SIZE-LABEL: 'maximumnum_bf16'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SIZE-LABEL: 'maximumnum_bf16'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
   %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
@@ -175,21 +121,21 @@ define void @maximumnum_bf16() {
 
 define void @maximumnum_f32() {
 ; ALL-LABEL: 'maximumnum_f32'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'maximumnum_f32'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
@@ -203,21 +149,21 @@ define void @maximumnum_f32() {
 
 define void @maximumnum_f64() {
 ; ALL-LABEL: 'maximumnum_f64'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'maximumnum_f64'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
@@ -231,75 +177,75 @@ define void @maximumnum_f64() {
 
 define void @maximumnum_f16_no_ieee() #0 {
 ; GFX7-LABEL: 'maximumnum_f16_no_ieee'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'maximumnum_f16_no_ieee'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'maximumnum_f16_no_ieee'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'maximumnum_f16_no_ieee'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'maximumnum_f16_no_ieee'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'maximumnum_f16_no_ieee'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'maximumnum_f16_no_ieee'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'maximumnum_f16_no_ieee'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
@@ -313,75 +259,75 @@ define void @maximumnum_f16_no_ieee() #0 {
 
 define void @maximumnum_bf16_no_ieee() #0 {
 ; GFX7-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
@@ -394,23 +340,77 @@ define void @maximumnum_bf16_no_ieee() #0 {
 }
 
 define void @maximumnum_f32_no_ieee() #0 {
-; ALL-LABEL: 'maximumnum_f32_no_ieee'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX7-LABEL: 'maximumnum_f32_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SIZE-LABEL: 'maximumnum_f32_no_ieee'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; GFX8-LABEL: 'maximumnum_f32_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_f32_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_f32_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
   %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
@@ -422,23 +422,77 @@ define void @maximumnum_f32_no_ieee() #0 {
 }
 
 define void @maximumnum_f64_no_ieee() #0 {
-; ALL-LABEL: 'maximumnum_f64_no_ieee'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX7-LABEL: 'maximumnum_f64_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SIZE-LABEL: 'maximumnum_f64_no_ieee'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; GFX8-LABEL: 'maximumnum_f64_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_f64_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_f64_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
   %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
index 97715cbab7d8..b027ccc61266 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
@@ -11,75 +11,75 @@
 
 define void @minimumnum_f16() {
 ; GFX7-LABEL: 'minimumnum_f16'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'minimumnum_f16'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'minimumnum_f16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'minimumnum_f16'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'minimumnum_f16'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'minimumnum_f16'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'minimumnum_f16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'minimumnum_f16'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
@@ -92,77 +92,23 @@ define void @minimumnum_f16() {
 }
 
 define void @minimumnum_bf16() {
-; GFX7-LABEL: 'minimumnum_bf16'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; ALL-LABEL: 'minimumnum_bf16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; GFX8-LABEL: 'minimumnum_bf16'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX9-LABEL: 'minimumnum_bf16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX12-LABEL: 'minimumnum_bf16'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX7-SIZE-LABEL: 'minimumnum_bf16'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX8-SIZE-LABEL: 'minimumnum_bf16'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX9-SIZE-LABEL: 'minimumnum_bf16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX12-SIZE-LABEL: 'minimumnum_bf16'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SIZE-LABEL: 'minimumnum_bf16'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
   %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
@@ -175,21 +121,21 @@ define void @minimumnum_bf16() {
 
 define void @minimumnum_f32() {
 ; ALL-LABEL: 'minimumnum_f32'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'minimumnum_f32'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
@@ -203,21 +149,21 @@ define void @minimumnum_f32() {
 
 define void @minimumnum_f64() {
 ; ALL-LABEL: 'minimumnum_f64'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'minimumnum_f64'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
@@ -231,75 +177,75 @@ define void @minimumnum_f64() {
 
 define void @minimumnum_f16_no_ieee() #0 {
 ; GFX7-LABEL: 'minimumnum_f16_no_ieee'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'minimumnum_f16_no_ieee'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'minimumnum_f16_no_ieee'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'minimumnum_f16_no_ieee'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'minimumnum_f16_no_ieee'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'minimumnum_f16_no_ieee'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'minimumnum_f16_no_ieee'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'minimumnum_f16_no_ieee'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
@@ -313,75 +259,75 @@ define void @minimumnum_f16_no_ieee() #0 {
 
 define void @minimumnum_bf16_no_ieee() #0 {
 ; GFX7-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
@@ -394,23 +340,77 @@ define void @minimumnum_bf16_no_ieee() #0 {
 }
 
 define void @minimumnum_f32_no_ieee() #0 {
-; ALL-LABEL: 'minimumnum_f32_no_ieee'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX7-LABEL: 'minimumnum_f32_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SIZE-LABEL: 'minimumnum_f32_no_ieee'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; GFX8-LABEL: 'minimumnum_f32_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_f32_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_f32_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
   %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
@@ -422,23 +422,77 @@ define void @minimumnum_f32_no_ieee() #0 {
 }
 
 define void @minimumnum_f64_no_ieee() #0 {
-; ALL-LABEL: 'minimumnum_f64_no_ieee'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX7-LABEL: 'minimumnum_f64_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SIZE-LABEL: 'minimumnum_f64_no_ieee'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; GFX8-LABEL: 'minimumnum_f64_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_f64_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_f64_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
   %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
index 0c26bcb343bf..f71fdbdee527 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -360,3 +360,43 @@ bb:
   store half %tmp16, ptr addrspace(1) %tmp14, align 2
   ret void
 }
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @minimumnum_combine_v2f16
+; GFX8: call <2 x half> @llvm.minimumnum.v2f16
+; GFX9: call <2 x half> @llvm.minimumnum.v2f16
+define void @minimumnum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.minimumnum.f16(half %tmp3, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.minimumnum.f16(half %tmp7, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @maximumnum_combine_v2f16
+; GFX8: call <2 x half> @llvm.maximumnum.v2f16
+; GFX9: call <2 x half> @llvm.maximumnum.v2f16
+define void @maximumnum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.maximumnum.f16(half %tmp3, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.maximumnum.f16(half %tmp7, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}

From f08474ab1fa984560565e917453a42bc8562a6f9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:21:55 +0900
Subject: [PATCH 0732/1322] AMDGPU: Add baseline cost model tests for special
 argument intrinsics (#141947)

---
 .../AMDGPU/special-argument-intrinsics.ll     | 202 ++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
new file mode 100644
index 000000000000..ea045e04310b
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL,UNPACKEDID %s
+; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=ALL,PACKEDID %s
+
+; RUN: opt -passes='print<cost-model>' -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZE,SIZE-UNPACKEDID %s
+; RUN: opt -passes='print<cost-model>' -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=SIZE,SIZE-PACKEDID %s
+
+define i32 @workitem_id_x() {
+; ALL-LABEL: 'workitem_id_x'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workitem_id_x'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.x()
+  ret i32 %result
+}
+
+define amdgpu_kernel void @kernel_workitem_id_x(ptr addrspace(1) %ptr) {
+; ALL-LABEL: 'kernel_workitem_id_x'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'kernel_workitem_id_x'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.x()
+  store i32 %result, ptr addrspace(1) %ptr
+  ret void
+}
+
+define i32 @workitem_id_y() {
+; ALL-LABEL: 'workitem_id_y'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workitem_id_y'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.y()
+  ret i32 %result
+}
+
+define amdgpu_kernel void @kernel_workitem_id_y(ptr addrspace(1) %ptr) {
+; ALL-LABEL: 'kernel_workitem_id_y'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'kernel_workitem_id_y'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.y()
+  store i32 %result, ptr addrspace(1) %ptr
+  ret void
+}
+
+define i32 @workitem_id_z() {
+; ALL-LABEL: 'workitem_id_z'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workitem_id_z'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.y()
+  ret i32 %result
+}
+
+define amdgpu_kernel void @kernel_workitem_id_z(ptr addrspace(1) %ptr) {
+; ALL-LABEL: 'kernel_workitem_id_z'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'kernel_workitem_id_z'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.z()
+  store i32 %result, ptr addrspace(1) %ptr
+  ret void
+}
+
+define i32 @workgroup_id_x() {
+; ALL-LABEL: 'workgroup_id_x'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workgroup_id_x'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workgroup.id.x()
+  ret i32 %result
+}
+
+define i32 @workgroup_id_y() {
+; ALL-LABEL: 'workgroup_id_y'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workgroup_id_y'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workgroup.id.y()
+  ret i32 %result
+}
+
+define i32 @workgroup_id_z() {
+; ALL-LABEL: 'workgroup_id_z'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workgroup_id_z'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workgroup.id.y()
+  ret i32 %result
+}
+
+define i32 @lds_kernel_id() {
+; ALL-LABEL: 'lds_kernel_id'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'lds_kernel_id'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.lds.kernel.id()
+  ret i32 %result
+}
+
+define ptr addrspace(4) @dispatch_ptr() {
+; ALL-LABEL: 'dispatch_ptr'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
+;
+; SIZE-LABEL: 'dispatch_ptr'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
+;
+  %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  ret ptr addrspace(4) %result
+}
+
+define i64 @dispatch_id_() {
+; ALL-LABEL: 'dispatch_id_'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i64 %result
+;
+; SIZE-LABEL: 'dispatch_id_'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %result
+;
+  %result = call i64 @llvm.amdgcn.dispatch.id()
+  ret i64 %result
+}
+
+define ptr addrspace(4) @implicitarg_ptr() {
+; ALL-LABEL: 'implicitarg_ptr'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
+;
+; SIZE-LABEL: 'implicitarg_ptr'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
+;
+  %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  ret ptr addrspace(4) %result
+}
+
+define ptr addrspace(4) @queue_ptr() {
+; ALL-LABEL: 'queue_ptr'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
+;
+; SIZE-LABEL: 'queue_ptr'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
+;
+  %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+  ret ptr addrspace(4) %result
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; PACKEDID: {{.*}}
+; SIZE-PACKEDID: {{.*}}
+; SIZE-UNPACKEDID: {{.*}}
+; UNPACKEDID: {{.*}}

From f3af1cd08cd456214961af915c17f858c9eef1a5 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 17 Jun 2025 16:24:50 -0700
Subject: [PATCH 0733/1322] [RISCV] Set the exact flag on the SRL created for
 converting vscale to a read of vlenb. (#144571)

We know that vlenb is a multiple of RVVBytesPerBlock so we aren't
shifting out any non-zero bits.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  13 +-
 .../CodeGen/RISCV/rvv/extract-subvector.ll    |  12 +-
 .../CodeGen/RISCV/rvv/get_vector_length.ll    |  24 +--
 .../CodeGen/RISCV/rvv/insert-subvector.ll     |  24 +--
 .../CodeGen/RISCV/rvv/legalize-load-sdnode.ll |  12 +-
 .../RISCV/rvv/legalize-store-sdnode.ll        |   6 +-
 llvm/test/CodeGen/RISCV/rvv/stepvector.ll     |  20 +--
 llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll   |  48 +++---
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |  22 +--
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  | 155 +++++++++---------
 .../RISCV/rvv/vreductions-fp-sdnode.ll        |  42 ++---
 11 files changed, 179 insertions(+), 199 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 33aae7ab16cc..e670567bd184 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7353,20 +7353,25 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     uint64_t Val = Op.getConstantOperandVal(0);
     if (isPowerOf2_64(Val)) {
       uint64_t Log2 = Log2_64(Val);
-      if (Log2 < 3)
+      if (Log2 < 3) {
+        SDNodeFlags Flags;
+        Flags.setExact(true);
         Res = DAG.getNode(ISD::SRL, DL, XLenVT, Res,
-                          DAG.getConstant(3 - Log2, DL, VT));
-      else if (Log2 > 3)
+                          DAG.getConstant(3 - Log2, DL, XLenVT), Flags);
+      } else if (Log2 > 3) {
         Res = DAG.getNode(ISD::SHL, DL, XLenVT, Res,
                           DAG.getConstant(Log2 - 3, DL, XLenVT));
+      }
     } else if ((Val % 8) == 0) {
       // If the multiplier is a multiple of 8, scale it down to avoid needing
       // to shift the VLENB value.
       Res = DAG.getNode(ISD::MUL, DL, XLenVT, Res,
                         DAG.getConstant(Val / 8, DL, XLenVT));
     } else {
+      SDNodeFlags Flags;
+      Flags.setExact(true);
       SDValue VScale = DAG.getNode(ISD::SRL, DL, XLenVT, Res,
-                                   DAG.getConstant(3, DL, XLenVT));
+                                   DAG.getConstant(3, DL, XLenVT), Flags);
       Res = DAG.getNode(ISD::MUL, DL, XLenVT, VScale,
                         DAG.getConstant(Val, DL, XLenVT));
     }
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index 83637e4a71d4..d42c42c7ce03 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -290,8 +290,7 @@ define <vscale x 2 x i8> @extract_nxv32i8_nxv2i8_6(<vscale x 32 x i8> %vec) {
 ; CHECK-LABEL: extract_nxv32i8_nxv2i8_6:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
@@ -314,8 +313,7 @@ define <vscale x 2 x i8> @extract_nxv32i8_nxv2i8_22(<vscale x 32 x i8> %vec) {
 ; CHECK-LABEL: extract_nxv32i8_nxv2i8_22:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v10, a0
@@ -341,9 +339,9 @@ define <vscale x 1 x i8> @extract_nxv4i8_nxv1i8_3(<vscale x 4 x i8> %vec) {
 ; CHECK-LABEL: extract_nxv4i8_nxv1i8_3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
index bd0fecd28551..aea688f03cf7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
@@ -257,9 +257,9 @@ define i32 @vector_length_vf3_i32(i32 zeroext %tc) {
 ; RV32-LABEL: vector_length_vf3_i32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    srli a2, a1, 3
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    bltu a0, a1, .LBB22_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a1
@@ -270,9 +270,9 @@ define i32 @vector_length_vf3_i32(i32 zeroext %tc) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    sext.w a0, a0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    slli a2, a1, 1
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    srli a2, a1, 3
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    bltu a0, a1, .LBB22_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a0, a1
@@ -286,9 +286,9 @@ define i32 @vector_length_vf3_XLen(iXLen zeroext %tc) {
 ; RV32-LABEL: vector_length_vf3_XLen:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    srli a2, a1, 3
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    bltu a0, a1, .LBB23_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a1
@@ -299,9 +299,9 @@ define i32 @vector_length_vf3_XLen(iXLen zeroext %tc) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    sext.w a0, a0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    slli a2, a1, 1
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    srli a2, a1, 3
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    bltu a0, a1, .LBB23_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index ca9cec921b3c..61cf1f56aee3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -78,12 +78,12 @@ define <vscale x 4 x i8> @insert_nxv1i8_nxv4i8_3(<vscale x 4 x i8> %vec, <vscale
 ; CHECK-LABEL: insert_nxv1i8_nxv4i8_3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v9, a1
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vector.insert.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
   ret <vscale x 4 x i8> %v
@@ -309,12 +309,12 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_3(<vscale x 16 x i8> %vec, <vsc
 ; CHECK-LABEL: insert_nxv16i8_nxv1i8_3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
   ret <vscale x 16 x i8> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
index e9e1303d1076..f847ccafefda 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
@@ -8,9 +8,9 @@ define <vscale x 3 x i8> @load_nxv3i8(ptr %ptr) {
 ; CHECK-LABEL: load_nxv3i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    slli a2, a1, 1
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    srli a2, a1, 3
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -22,9 +22,9 @@ define <vscale x 5 x half> @load_nxv5f16(ptr %ptr) {
 ; CHECK-LABEL: load_nxv5f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    slli a2, a1, 2
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    srli a2, a1, 3
+; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
index 77438ee53b63..03b84ec177ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
@@ -8,9 +8,9 @@ define void @store_nxv3i8(<vscale x 3 x i8> %val, ptr %ptr) {
 ; CHECK-LABEL: store_nxv3i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    slli a2, a1, 1
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    srli a2, a1, 3
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
index d4e2c08d70d3..95c1292e4192 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
@@ -637,21 +637,21 @@ define <vscale x 16 x i64> @mul_bigimm_stepvector_nxv16i64() {
 ; RV32-NEXT:    lui a1, 797989
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    lui a3, 11557
-; RV32-NEXT:    lui a4, 92455
 ; RV32-NEXT:    addi a1, a1, -683
-; RV32-NEXT:    addi a3, a3, -683
+; RV32-NEXT:    srli a4, a2, 2
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    srli a0, a2, 3
-; RV32-NEXT:    addi a1, a4, -1368
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    mulhu a1, a0, a1
-; RV32-NEXT:    slli a3, a0, 1
-; RV32-NEXT:    slli a0, a0, 6
-; RV32-NEXT:    sub a0, a0, a3
+; RV32-NEXT:    slli a0, a2, 3
+; RV32-NEXT:    sub a0, a0, a4
+; RV32-NEXT:    lui a1, 92455
+; RV32-NEXT:    addi a3, a3, -683
+; RV32-NEXT:    mul a3, a2, a3
+; RV32-NEXT:    srli a2, a2, 3
+; RV32-NEXT:    addi a1, a1, -1368
+; RV32-NEXT:    mulhu a1, a2, a1
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    sw a2, 0(sp)
+; RV32-NEXT:    sw a3, 0(sp)
 ; RV32-NEXT:    sw a0, 4(sp)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
index aef46e1f5cf1..66e114c938c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
@@ -2240,20 +2240,19 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-RV32-LABEL: vand_vx_loop_hoisted_not:
 ; CHECK-RV32:       # %bb.0: # %entry
 ; CHECK-RV32-NEXT:    csrr a4, vlenb
-; CHECK-RV32-NEXT:    srli a3, a4, 3
-; CHECK-RV32-NEXT:    li a2, 64
+; CHECK-RV32-NEXT:    srli a2, a4, 3
+; CHECK-RV32-NEXT:    li a3, 64
 ; CHECK-RV32-NEXT:    not a1, a1
-; CHECK-RV32-NEXT:    bgeu a2, a3, .LBB98_2
+; CHECK-RV32-NEXT:    bgeu a3, a2, .LBB98_2
 ; CHECK-RV32-NEXT:  # %bb.1:
 ; CHECK-RV32-NEXT:    li a3, 0
 ; CHECK-RV32-NEXT:    li a2, 0
 ; CHECK-RV32-NEXT:    j .LBB98_5
 ; CHECK-RV32-NEXT:  .LBB98_2: # %vector.ph
 ; CHECK-RV32-NEXT:    li a2, 0
-; CHECK-RV32-NEXT:    slli a3, a3, 2
-; CHECK-RV32-NEXT:    neg a3, a3
-; CHECK-RV32-NEXT:    andi a3, a3, 256
 ; CHECK-RV32-NEXT:    srli a4, a4, 1
+; CHECK-RV32-NEXT:    neg a3, a4
+; CHECK-RV32-NEXT:    andi a3, a3, 256
 ; CHECK-RV32-NEXT:    li a6, 0
 ; CHECK-RV32-NEXT:    li a5, 0
 ; CHECK-RV32-NEXT:    vsetvli a7, zero, e32, m2, ta, ma
@@ -2300,10 +2299,9 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-RV64-NEXT:    li a2, 0
 ; CHECK-RV64-NEXT:    j .LBB98_5
 ; CHECK-RV64-NEXT:  .LBB98_2: # %vector.ph
-; CHECK-RV64-NEXT:    slli a2, a2, 2
-; CHECK-RV64-NEXT:    negw a2, a2
-; CHECK-RV64-NEXT:    andi a2, a2, 256
 ; CHECK-RV64-NEXT:    srli a3, a4, 1
+; CHECK-RV64-NEXT:    negw a2, a3
+; CHECK-RV64-NEXT:    andi a2, a2, 256
 ; CHECK-RV64-NEXT:    slli a4, a4, 1
 ; CHECK-RV64-NEXT:    mv a5, a0
 ; CHECK-RV64-NEXT:    mv a6, a2
@@ -2335,19 +2333,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-NOZBB32-LABEL: vand_vx_loop_hoisted_not:
 ; CHECK-ZVKB-NOZBB32:       # %bb.0: # %entry
 ; CHECK-ZVKB-NOZBB32-NEXT:    csrr a4, vlenb
-; CHECK-ZVKB-NOZBB32-NEXT:    srli a3, a4, 3
-; CHECK-ZVKB-NOZBB32-NEXT:    li a2, 64
-; CHECK-ZVKB-NOZBB32-NEXT:    bgeu a2, a3, .LBB98_2
+; CHECK-ZVKB-NOZBB32-NEXT:    srli a2, a4, 3
+; CHECK-ZVKB-NOZBB32-NEXT:    li a3, 64
+; CHECK-ZVKB-NOZBB32-NEXT:    bgeu a3, a2, .LBB98_2
 ; CHECK-ZVKB-NOZBB32-NEXT:  # %bb.1:
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a3, 0
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a2, 0
 ; CHECK-ZVKB-NOZBB32-NEXT:    j .LBB98_5
 ; CHECK-ZVKB-NOZBB32-NEXT:  .LBB98_2: # %vector.ph
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a2, 0
-; CHECK-ZVKB-NOZBB32-NEXT:    slli a3, a3, 2
-; CHECK-ZVKB-NOZBB32-NEXT:    neg a3, a3
-; CHECK-ZVKB-NOZBB32-NEXT:    andi a3, a3, 256
 ; CHECK-ZVKB-NOZBB32-NEXT:    srli a4, a4, 1
+; CHECK-ZVKB-NOZBB32-NEXT:    neg a3, a4
+; CHECK-ZVKB-NOZBB32-NEXT:    andi a3, a3, 256
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a6, 0
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a5, 0
 ; CHECK-ZVKB-NOZBB32-NEXT:    vsetvli a7, zero, e32, m2, ta, ma
@@ -2395,10 +2392,9 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-NOZBB64-NEXT:    li a2, 0
 ; CHECK-ZVKB-NOZBB64-NEXT:    j .LBB98_5
 ; CHECK-ZVKB-NOZBB64-NEXT:  .LBB98_2: # %vector.ph
-; CHECK-ZVKB-NOZBB64-NEXT:    slli a2, a2, 2
-; CHECK-ZVKB-NOZBB64-NEXT:    negw a2, a2
-; CHECK-ZVKB-NOZBB64-NEXT:    andi a2, a2, 256
 ; CHECK-ZVKB-NOZBB64-NEXT:    srli a3, a4, 1
+; CHECK-ZVKB-NOZBB64-NEXT:    negw a2, a3
+; CHECK-ZVKB-NOZBB64-NEXT:    andi a2, a2, 256
 ; CHECK-ZVKB-NOZBB64-NEXT:    slli a4, a4, 1
 ; CHECK-ZVKB-NOZBB64-NEXT:    mv a5, a0
 ; CHECK-ZVKB-NOZBB64-NEXT:    mv a6, a2
@@ -2431,19 +2427,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-ZBB32-LABEL: vand_vx_loop_hoisted_not:
 ; CHECK-ZVKB-ZBB32:       # %bb.0: # %entry
 ; CHECK-ZVKB-ZBB32-NEXT:    csrr a4, vlenb
-; CHECK-ZVKB-ZBB32-NEXT:    srli a3, a4, 3
-; CHECK-ZVKB-ZBB32-NEXT:    li a2, 64
-; CHECK-ZVKB-ZBB32-NEXT:    bgeu a2, a3, .LBB98_2
+; CHECK-ZVKB-ZBB32-NEXT:    srli a2, a4, 3
+; CHECK-ZVKB-ZBB32-NEXT:    li a3, 64
+; CHECK-ZVKB-ZBB32-NEXT:    bgeu a3, a2, .LBB98_2
 ; CHECK-ZVKB-ZBB32-NEXT:  # %bb.1:
 ; CHECK-ZVKB-ZBB32-NEXT:    li a3, 0
 ; CHECK-ZVKB-ZBB32-NEXT:    li a2, 0
 ; CHECK-ZVKB-ZBB32-NEXT:    j .LBB98_5
 ; CHECK-ZVKB-ZBB32-NEXT:  .LBB98_2: # %vector.ph
 ; CHECK-ZVKB-ZBB32-NEXT:    li a2, 0
-; CHECK-ZVKB-ZBB32-NEXT:    slli a3, a3, 2
-; CHECK-ZVKB-ZBB32-NEXT:    neg a3, a3
-; CHECK-ZVKB-ZBB32-NEXT:    andi a3, a3, 256
 ; CHECK-ZVKB-ZBB32-NEXT:    srli a4, a4, 1
+; CHECK-ZVKB-ZBB32-NEXT:    neg a3, a4
+; CHECK-ZVKB-ZBB32-NEXT:    andi a3, a3, 256
 ; CHECK-ZVKB-ZBB32-NEXT:    li a6, 0
 ; CHECK-ZVKB-ZBB32-NEXT:    li a5, 0
 ; CHECK-ZVKB-ZBB32-NEXT:    vsetvli a7, zero, e32, m2, ta, ma
@@ -2489,10 +2484,9 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-ZBB64-NEXT:    li a2, 0
 ; CHECK-ZVKB-ZBB64-NEXT:    j .LBB98_5
 ; CHECK-ZVKB-ZBB64-NEXT:  .LBB98_2: # %vector.ph
-; CHECK-ZVKB-ZBB64-NEXT:    slli a2, a2, 2
-; CHECK-ZVKB-ZBB64-NEXT:    negw a2, a2
-; CHECK-ZVKB-ZBB64-NEXT:    andi a2, a2, 256
 ; CHECK-ZVKB-ZBB64-NEXT:    srli a3, a4, 1
+; CHECK-ZVKB-ZBB64-NEXT:    negw a2, a3
+; CHECK-ZVKB-ZBB64-NEXT:    andi a2, a2, 256
 ; CHECK-ZVKB-ZBB64-NEXT:    slli a4, a4, 1
 ; CHECK-ZVKB-ZBB64-NEXT:    mv a5, a0
 ; CHECK-ZVKB-ZBB64-NEXT:    mv a6, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index ca7f2563e4fc..baace6d26f14 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -338,16 +338,14 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
 ; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v10, v9, a1
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
-; CHECK-NEXT:    slli a3, a1, 1
+; CHECK-NEXT:    add a3, a0, a0
+; CHECK-NEXT:    add a1, a4, a1
 ; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v10, v11, a2
 ; CHECK-NEXT:    vslideup.vx v8, v13, a2
-; CHECK-NEXT:    add a2, a0, a0
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    add a1, a3, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v14, a3
-; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v14, a4
+; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs1r.v v8, (a0)
@@ -381,20 +379,18 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    add a3, a1, a1
 ; CHECK-NEXT:    add a4, a2, a1
-; CHECK-NEXT:    slli a5, a1, 1
-; CHECK-NEXT:    add a6, a0, a0
+; CHECK-NEXT:    add a5, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v10, v9, a1
-; CHECK-NEXT:    add a5, a5, a1
 ; CHECK-NEXT:    vslideup.vx v8, v13, a1
+; CHECK-NEXT:    add a1, a4, a1
 ; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v10, v11, a2
-; CHECK-NEXT:    add a1, a5, a1
 ; CHECK-NEXT:    vslideup.vx v8, v14, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v10, v12, a5
-; CHECK-NEXT:    vslideup.vx v8, v15, a5
-; CHECK-NEXT:    vsetvli zero, a6, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v12, a4
+; CHECK-NEXT:    vslideup.vx v8, v15, a4
+; CHECK-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs1r.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 6a08f5a28a29..75f92c86ff09 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -477,27 +477,26 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v8, v0
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a1
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    sub a0, a0, a1
-; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a1
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
 ; CHECK-NEXT:    vs8r.v v16, (a0)
 ; CHECK-NEXT:    vlseg4e8.v v8, (a0)
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -606,11 +605,9 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
 ; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    srli a2, a0, 1
-; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a2, a0, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
@@ -836,39 +833,37 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
-; CHECK-NEXT:    srli a2, a0, 2
-; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    srli a3, a0, 1
-; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
+; CHECK-NEXT:    srli a2, a0, 1
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v9, a3
-; CHECK-NEXT:    srli a3, a0, 3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    sub a0, a0, a3
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a2
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v26, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v9, a0
-; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vs8r.v v16, (a2)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v24, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a2
+; CHECK-NEXT:    vslidedown.vx v0, v8, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v28, v10, 1, v0
 ; CHECK-NEXT:    vs8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vlseg6e8.v v16, (a1)
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vlseg6e8.v v16, (a2)
 ; CHECK-NEXT:    vlseg6e8.v v10, (a0)
 ; CHECK-NEXT:    vmv2r.v v8, v16
 ; CHECK-NEXT:    vmv2r.v v22, v18
@@ -1068,36 +1063,35 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    srli a2, a0, 1
-; CHECK-NEXT:    srli a3, a0, 3
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v9, a2
-; CHECK-NEXT:    sub a0, a0, a3
-; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v20, v12, 1, v0
+; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v9, a0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v22, v12, 1, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v20, v12, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a1
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v14, v12, 1, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a1
 ; CHECK-NEXT:    vmv1r.v v10, v15
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v24, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v11, v24
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a2
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v23
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v14
@@ -1339,49 +1333,48 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    srli a2, a0, 1
-; CHECK-NEXT:    srli a3, a0, 3
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v9, a2
-; CHECK-NEXT:    sub a0, a0, a3
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v9, a0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v24, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a1
-; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vslidedown.vx v0, v9, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v26, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a2
-; CHECK-NEXT:    vs8r.v v16, (a1)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v28, v10, 1, v0
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v30, v10, 1, v0
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a0
+; CHECK-NEXT:    vs8r.v v16, (a2)
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v28, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v24, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v30, v10, 1, v0
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v26, v10, 1, v0
 ; CHECK-NEXT:    vs8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vlseg8e8.v v18, (a1)
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vlseg8e8.v v18, (a2)
 ; CHECK-NEXT:    vlseg8e8.v v10, (a0)
 ; CHECK-NEXT:    vmv2r.v v8, v18
 ; CHECK-NEXT:    vmv2r.v v26, v20
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index 3da04eb7e6ab..78aae96242fd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -887,9 +887,9 @@ define half @vreduce_ord_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv3f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -906,8 +906,7 @@ define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv6f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v10, fa0
@@ -925,8 +924,7 @@ define half @vreduce_ord_fadd_nxv10f16(<vscale x 10 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv10f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v12, fa0
@@ -944,9 +942,8 @@ define half @vreduce_ord_fadd_nxv12f16(<vscale x 12 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv12f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v12, fa0
@@ -965,9 +962,9 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    lui a1, 1048568
 ; CHECK-NEXT:    vmv.s.x v10, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -984,8 +981,7 @@ define half @vreduce_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v10, fa0
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    lui a1, 1048568
 ; CHECK-NEXT:    vmv.s.x v11, a1
@@ -1002,13 +998,12 @@ declare half @llvm.vector.reduce.fmin.nxv10f16(<vscale x 10 x half>)
 define half @vreduce_fmin_nxv10f16(<vscale x 10 x half> %v) {
 ; CHECK-LABEL: vreduce_fmin_nxv10f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    lui a1, %hi(.LCPI73_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI73_0)
+; CHECK-NEXT:    lui a0, %hi(.LCPI73_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI73_0)
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v12, (a1)
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfredmin.vs v12, v8, v12
@@ -1024,9 +1019,8 @@ define half @vreduce_fmax_nxv12f16(<vscale x 12 x half> %v) {
 ; CHECK-LABEL: vreduce_fmax_nxv12f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    li a1, -512
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma

From a9811340b75baae8e06fb9ab83015a90d61510ee Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:24:58 +0900
Subject: [PATCH 0734/1322] AMDGPU: Report special input intrinsics as free
 (#141948)

---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 23 +++++++-
 .../AMDGPU/special-argument-intrinsics.ll     | 56 +++++++++----------
 2 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index f3474fcbbfb5..d5a1aaef4ad6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -704,8 +704,29 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
 InstructionCost
 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                   TTI::TargetCostKind CostKind) const {
-  if (ICA.getID() == Intrinsic::fabs)
+  switch (ICA.getID()) {
+  case Intrinsic::fabs:
+    // Free source modifier in the common case.
     return 0;
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::amdgcn_workitem_id_z:
+    // TODO: If hasPackedTID, or if the calling context is not an entry point
+    // there may be a bit instruction.
+    return 0;
+  case Intrinsic::amdgcn_workgroup_id_x:
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::amdgcn_lds_kernel_id:
+  case Intrinsic::amdgcn_dispatch_ptr:
+  case Intrinsic::amdgcn_dispatch_id:
+  case Intrinsic::amdgcn_implicitarg_ptr:
+  case Intrinsic::amdgcn_queue_ptr:
+    // Read from an argument register.
+    return 0;
+  default:
+    break;
+  }
 
   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
index ea045e04310b..00dbcff0a021 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
@@ -7,11 +7,11 @@
 
 define i32 @workitem_id_x() {
 ; ALL-LABEL: 'workitem_id_x'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workitem_id_x'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workitem.id.x()
@@ -20,12 +20,12 @@ define i32 @workitem_id_x() {
 
 define amdgpu_kernel void @kernel_workitem_id_x(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: 'kernel_workitem_id_x'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'kernel_workitem_id_x'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -36,11 +36,11 @@ define amdgpu_kernel void @kernel_workitem_id_x(ptr addrspace(1) %ptr) {
 
 define i32 @workitem_id_y() {
 ; ALL-LABEL: 'workitem_id_y'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workitem_id_y'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workitem.id.y()
@@ -49,12 +49,12 @@ define i32 @workitem_id_y() {
 
 define amdgpu_kernel void @kernel_workitem_id_y(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: 'kernel_workitem_id_y'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'kernel_workitem_id_y'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -65,11 +65,11 @@ define amdgpu_kernel void @kernel_workitem_id_y(ptr addrspace(1) %ptr) {
 
 define i32 @workitem_id_z() {
 ; ALL-LABEL: 'workitem_id_z'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workitem_id_z'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workitem.id.y()
@@ -78,12 +78,12 @@ define i32 @workitem_id_z() {
 
 define amdgpu_kernel void @kernel_workitem_id_z(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: 'kernel_workitem_id_z'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'kernel_workitem_id_z'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -94,11 +94,11 @@ define amdgpu_kernel void @kernel_workitem_id_z(ptr addrspace(1) %ptr) {
 
 define i32 @workgroup_id_x() {
 ; ALL-LABEL: 'workgroup_id_x'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workgroup_id_x'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -107,11 +107,11 @@ define i32 @workgroup_id_x() {
 
 define i32 @workgroup_id_y() {
 ; ALL-LABEL: 'workgroup_id_y'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workgroup_id_y'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -120,11 +120,11 @@ define i32 @workgroup_id_y() {
 
 define i32 @workgroup_id_z() {
 ; ALL-LABEL: 'workgroup_id_z'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workgroup_id_z'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -133,11 +133,11 @@ define i32 @workgroup_id_z() {
 
 define i32 @lds_kernel_id() {
 ; ALL-LABEL: 'lds_kernel_id'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'lds_kernel_id'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.lds.kernel.id()
@@ -146,11 +146,11 @@ define i32 @lds_kernel_id() {
 
 define ptr addrspace(4) @dispatch_ptr() {
 ; ALL-LABEL: 'dispatch_ptr'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
 ;
 ; SIZE-LABEL: 'dispatch_ptr'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
 ;
   %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -159,11 +159,11 @@ define ptr addrspace(4) @dispatch_ptr() {
 
 define i64 @dispatch_id_() {
 ; ALL-LABEL: 'dispatch_id_'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i64 %result
 ;
 ; SIZE-LABEL: 'dispatch_id_'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %result
 ;
   %result = call i64 @llvm.amdgcn.dispatch.id()
@@ -172,11 +172,11 @@ define i64 @dispatch_id_() {
 
 define ptr addrspace(4) @implicitarg_ptr() {
 ; ALL-LABEL: 'implicitarg_ptr'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
 ;
 ; SIZE-LABEL: 'implicitarg_ptr'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
 ;
   %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -185,11 +185,11 @@ define ptr addrspace(4) @implicitarg_ptr() {
 
 define ptr addrspace(4) @queue_ptr() {
 ; ALL-LABEL: 'queue_ptr'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
 ;
 ; SIZE-LABEL: 'queue_ptr'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
 ;
   %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()

From 628274dadf92995f4544d6134cba45d327d9eaaa Mon Sep 17 00:00:00 2001
From: Sterling-Augustine
 <56981066+Sterling-Augustine@users.noreply.github.com>
Date: Tue, 17 Jun 2025 16:35:47 -0700
Subject: [PATCH 0735/1322] [NFC] Extract Printing portions of DWARFCFIProgram
 to new files (#143762)

CFIPrograms' most common uses are within debug frames, but it is not
their only use. For example, some assembly writers encode them by hand
into .cfi_escape directives. This PR extracts printing code for them
into its own files, which avoids the need for the main class to depend
on DWARFUnit, sections, and similar.

One in a series of NFC DebugInfo/DWARF refactoring changes to layer it
more cleanly, so that binary CFI parsing can be used from low-level
code, (such as byte strings created via .cfi_escape) without circular
dependencies. The final goal is to make a more limited dwarf library
usable from lower-level code.

More information can be found at
https://discourse.llvm.org/t/rfc-debuginfo-dwarf-refactor-into-to-lower-and-higher-level-libraries/86665
---
 .../llvm/DebugInfo/DWARF/DWARFCFIPrinter.h    |  28 ++++
 .../llvm/DebugInfo/DWARF/DWARFCFIProgram.h    |  62 ++++-----
 llvm/lib/DebugInfo/DWARF/CMakeLists.txt       |   1 +
 llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp  | 121 ++++++++++++++++++
 llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp  |  94 --------------
 llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp  |   6 +-
 llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h   |   5 +-
 7 files changed, 184 insertions(+), 133 deletions(-)
 create mode 100644 llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h
 create mode 100644 llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp

diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h
new file mode 100644
index 000000000000..32e8247ac4c2
--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h
@@ -0,0 +1,28 @@
+//===- DWARFCFIPrinter.h ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFCFIPRINTER_H
+#define LLVM_DEBUGINFO_DWARF_DWARFCFIPRINTER_H
+
+#include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h"
+
+namespace llvm {
+
+struct DIDumpOptions;
+
+namespace dwarf {
+
+void printCFIProgram(const CFIProgram &P, raw_ostream &OS,
+                     const DIDumpOptions &DumpOpts, unsigned IndentLevel,
+                     std::optional<uint64_t> Address);
+
+} // end namespace dwarf
+
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARF_DWARFCFIPRINTER_H
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h
index 24a0f389470d..ad7358c28f16 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h
@@ -24,6 +24,7 @@
 namespace llvm {
 
 namespace dwarf {
+
 /// Represent a sequence of Call Frame Information instructions that, when read
 /// in order, construct a table mapping PC to frame state. This can also be
 /// referred to as "CFI rules" in DWARF literature to avoid confusion with
@@ -80,15 +81,37 @@ public:
   LLVM_ABI Error parse(DWARFDataExtractor Data, uint64_t *Offset,
                        uint64_t EndOffset);
 
-  LLVM_ABI void dump(raw_ostream &OS, DIDumpOptions DumpOpts,
-                     unsigned IndentLevel,
-                     std::optional<uint64_t> InitialLocation) const;
-
   void addInstruction(const Instruction &I) { Instructions.push_back(I); }
 
   /// Get a DWARF CFI call frame string for the given DW_CFA opcode.
   LLVM_ABI StringRef callFrameString(unsigned Opcode) const;
 
+  /// Types of operands to CFI instructions
+  /// In DWARF, this type is implicitly tied to a CFI instruction opcode and
+  /// thus this type doesn't need to be explicitly written to the file (this is
+  /// not a DWARF encoding). The relationship of instrs to operand types can
+  /// be obtained from getOperandTypes() and is only used to simplify
+  /// instruction printing and error messages.
+  enum OperandType {
+    OT_Unset,
+    OT_None,
+    OT_Address,
+    OT_Offset,
+    OT_FactoredCodeOffset,
+    OT_SignedFactDataOffset,
+    OT_UnsignedFactDataOffset,
+    OT_Register,
+    OT_AddressSpace,
+    OT_Expression
+  };
+
+  /// Get the OperandType as a "const char *".
+  static const char *operandTypeString(OperandType OT);
+
+  /// Retrieve the array describing the types of operands according to the enum
+  /// above. This is indexed by opcode.
+  static ArrayRef<OperandType[MaxOperands]> getOperandTypes();
+
 private:
   std::vector<Instruction> Instructions;
   const uint64_t CodeAlignmentFactor;
@@ -121,37 +144,6 @@ private:
     Instructions.back().Ops.push_back(Operand2);
     Instructions.back().Ops.push_back(Operand3);
   }
-
-  /// Types of operands to CFI instructions
-  /// In DWARF, this type is implicitly tied to a CFI instruction opcode and
-  /// thus this type doesn't need to be explicitly written to the file (this is
-  /// not a DWARF encoding). The relationship of instrs to operand types can
-  /// be obtained from getOperandTypes() and is only used to simplify
-  /// instruction printing.
-  enum OperandType {
-    OT_Unset,
-    OT_None,
-    OT_Address,
-    OT_Offset,
-    OT_FactoredCodeOffset,
-    OT_SignedFactDataOffset,
-    OT_UnsignedFactDataOffset,
-    OT_Register,
-    OT_AddressSpace,
-    OT_Expression
-  };
-
-  /// Get the OperandType as a "const char *".
-  static const char *operandTypeString(OperandType OT);
-
-  /// Retrieve the array describing the types of operands according to the enum
-  /// above. This is indexed by opcode.
-  static ArrayRef<OperandType[MaxOperands]> getOperandTypes();
-
-  /// Print \p Opcode's operand number \p OperandIdx which has value \p Operand.
-  void printOperand(raw_ostream &OS, DIDumpOptions DumpOpts,
-                    const Instruction &Instr, unsigned OperandIdx,
-                    uint64_t Operand, std::optional<uint64_t> &Address) const;
 };
 
 } // end namespace dwarf
diff --git a/llvm/lib/DebugInfo/DWARF/CMakeLists.txt b/llvm/lib/DebugInfo/DWARF/CMakeLists.txt
index cc9734f9f22b..86e74110b15e 100644
--- a/llvm/lib/DebugInfo/DWARF/CMakeLists.txt
+++ b/llvm/lib/DebugInfo/DWARF/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_component_library(LLVMDebugInfoDWARF
   DWARFAbbreviationDeclaration.cpp
   DWARFAddressRange.cpp
   DWARFAcceleratorTable.cpp
+  DWARFCFIPrinter.cpp
   DWARFCFIProgram.cpp
   DWARFCompileUnit.cpp
   DWARFContext.cpp
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
new file mode 100644
index 000000000000..e52f671e4fa1
--- /dev/null
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
@@ -0,0 +1,121 @@
+//===- DWARFCFIPrinter.cpp - Print the cfi-portions of .debug_frame -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <optional>
+
+using namespace llvm;
+using namespace dwarf;
+
+static void printRegister(raw_ostream &OS, const DIDumpOptions &DumpOpts,
+                          unsigned RegNum) {
+  if (DumpOpts.GetNameForDWARFReg) {
+    auto RegName = DumpOpts.GetNameForDWARFReg(RegNum, DumpOpts.IsEH);
+    if (!RegName.empty()) {
+      OS << RegName;
+      return;
+    }
+  }
+  OS << "reg" << RegNum;
+}
+
+/// Print \p Opcode's operand number \p OperandIdx which has value \p Operand.
+static void printOperand(raw_ostream &OS, const DIDumpOptions &DumpOpts,
+                         const CFIProgram &P,
+                         const CFIProgram::Instruction &Instr,
+                         unsigned OperandIdx, uint64_t Operand,
+                         std::optional<uint64_t> &Address) {
+  assert(OperandIdx < CFIProgram::MaxOperands);
+  uint8_t Opcode = Instr.Opcode;
+  CFIProgram::OperandType Type = P.getOperandTypes()[Opcode][OperandIdx];
+
+  switch (Type) {
+  case CFIProgram::OT_Unset: {
+    OS << " Unsupported " << (OperandIdx ? "second" : "first") << " operand to";
+    auto OpcodeName = P.callFrameString(Opcode);
+    if (!OpcodeName.empty())
+      OS << " " << OpcodeName;
+    else
+      OS << format(" Opcode %x", Opcode);
+    break;
+  }
+  case CFIProgram::OT_None:
+    break;
+  case CFIProgram::OT_Address:
+    OS << format(" %" PRIx64, Operand);
+    Address = Operand;
+    break;
+  case CFIProgram::OT_Offset:
+    // The offsets are all encoded in a unsigned form, but in practice
+    // consumers use them signed. It's most certainly legacy due to
+    // the lack of signed variants in the first Dwarf standards.
+    OS << format(" %+" PRId64, int64_t(Operand));
+    break;
+  case CFIProgram::OT_FactoredCodeOffset: // Always Unsigned
+    if (P.codeAlign())
+      OS << format(" %" PRId64, Operand * P.codeAlign());
+    else
+      OS << format(" %" PRId64 "*code_alignment_factor", Operand);
+    if (Address && P.codeAlign()) {
+      *Address += Operand * P.codeAlign();
+      OS << format(" to 0x%" PRIx64, *Address);
+    }
+    break;
+  case CFIProgram::OT_SignedFactDataOffset:
+    if (P.dataAlign())
+      OS << format(" %" PRId64, int64_t(Operand) * P.dataAlign());
+    else
+      OS << format(" %" PRId64 "*data_alignment_factor", int64_t(Operand));
+    break;
+  case CFIProgram::OT_UnsignedFactDataOffset:
+    if (P.dataAlign())
+      OS << format(" %" PRId64, Operand * P.dataAlign());
+    else
+      OS << format(" %" PRId64 "*data_alignment_factor", Operand);
+    break;
+  case CFIProgram::OT_Register:
+    OS << ' ';
+    printRegister(OS, DumpOpts, Operand);
+    break;
+  case CFIProgram::OT_AddressSpace:
+    OS << format(" in addrspace%" PRId64, Operand);
+    break;
+  case CFIProgram::OT_Expression:
+    assert(Instr.Expression && "missing DWARFExpression object");
+    OS << " ";
+    DWARFExpressionPrinter::print(&Instr.Expression.value(), OS, DumpOpts,
+                                  nullptr);
+    break;
+  }
+}
+
+void llvm::dwarf::printCFIProgram(const CFIProgram &P, raw_ostream &OS,
+                                  const DIDumpOptions &DumpOpts,
+                                  unsigned IndentLevel,
+                                  std::optional<uint64_t> Address) {
+  for (const auto &Instr : P) {
+    uint8_t Opcode = Instr.Opcode;
+    OS.indent(2 * IndentLevel);
+    OS << P.callFrameString(Opcode) << ":";
+    for (size_t i = 0; i < Instr.Ops.size(); ++i)
+      printOperand(OS, DumpOpts, P, Instr, i, Instr.Ops[i], Address);
+    OS << '\n';
+  }
+}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp
index 8d25599627c4..365b26b98a1e 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp
@@ -23,18 +23,6 @@
 using namespace llvm;
 using namespace dwarf;
 
-static void printRegister(raw_ostream &OS, DIDumpOptions DumpOpts,
-                          unsigned RegNum) {
-  if (DumpOpts.GetNameForDWARFReg) {
-    auto RegName = DumpOpts.GetNameForDWARFReg(RegNum, DumpOpts.IsEH);
-    if (!RegName.empty()) {
-      OS << RegName;
-      return;
-    }
-  }
-  OS << "reg" << RegNum;
-}
-
 // See DWARF standard v3, section 7.23
 const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0;
 const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f;
@@ -361,85 +349,3 @@ CFIProgram::getOperandTypes() {
 
   return ArrayRef<OperandType[MaxOperands]>(&OpTypes[0], DW_CFA_restore + 1);
 }
-
-/// Print \p Opcode's operand number \p OperandIdx which has value \p Operand.
-void CFIProgram::printOperand(raw_ostream &OS, DIDumpOptions DumpOpts,
-                              const Instruction &Instr, unsigned OperandIdx,
-                              uint64_t Operand,
-                              std::optional<uint64_t> &Address) const {
-  assert(OperandIdx < MaxOperands);
-  uint8_t Opcode = Instr.Opcode;
-  OperandType Type = getOperandTypes()[Opcode][OperandIdx];
-
-  switch (Type) {
-  case OT_Unset: {
-    OS << " Unsupported " << (OperandIdx ? "second" : "first") << " operand to";
-    auto OpcodeName = callFrameString(Opcode);
-    if (!OpcodeName.empty())
-      OS << " " << OpcodeName;
-    else
-      OS << format(" Opcode %x", Opcode);
-    break;
-  }
-  case OT_None:
-    break;
-  case OT_Address:
-    OS << format(" %" PRIx64, Operand);
-    Address = Operand;
-    break;
-  case OT_Offset:
-    // The offsets are all encoded in a unsigned form, but in practice
-    // consumers use them signed. It's most certainly legacy due to
-    // the lack of signed variants in the first Dwarf standards.
-    OS << format(" %+" PRId64, int64_t(Operand));
-    break;
-  case OT_FactoredCodeOffset: // Always Unsigned
-    if (CodeAlignmentFactor)
-      OS << format(" %" PRId64, Operand * CodeAlignmentFactor);
-    else
-      OS << format(" %" PRId64 "*code_alignment_factor", Operand);
-    if (Address && CodeAlignmentFactor) {
-      *Address += Operand * CodeAlignmentFactor;
-      OS << format(" to 0x%" PRIx64, *Address);
-    }
-    break;
-  case OT_SignedFactDataOffset:
-    if (DataAlignmentFactor)
-      OS << format(" %" PRId64, int64_t(Operand) * DataAlignmentFactor);
-    else
-      OS << format(" %" PRId64 "*data_alignment_factor", int64_t(Operand));
-    break;
-  case OT_UnsignedFactDataOffset:
-    if (DataAlignmentFactor)
-      OS << format(" %" PRId64, Operand * DataAlignmentFactor);
-    else
-      OS << format(" %" PRId64 "*data_alignment_factor", Operand);
-    break;
-  case OT_Register:
-    OS << ' ';
-    printRegister(OS, DumpOpts, Operand);
-    break;
-  case OT_AddressSpace:
-    OS << format(" in addrspace%" PRId64, Operand);
-    break;
-  case OT_Expression:
-    assert(Instr.Expression && "missing DWARFExpression object");
-    OS << " ";
-    DWARFExpressionPrinter::print(&Instr.Expression.value(), OS, DumpOpts,
-                                  nullptr);
-    break;
-  }
-}
-
-void CFIProgram::dump(raw_ostream &OS, DIDumpOptions DumpOpts,
-                      unsigned IndentLevel,
-                      std::optional<uint64_t> Address) const {
-  for (const auto &Instr : Instructions) {
-    uint8_t Opcode = Instr.Opcode;
-    OS.indent(2 * IndentLevel);
-    OS << callFrameString(Opcode) << ":";
-    for (unsigned i = 0; i < Instr.Ops.size(); ++i)
-      printOperand(OS, DumpOpts, Instr, i, Instr.Ops[i], Address);
-    OS << '\n';
-  }
-}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index c46b14b4446f..9dff925073db 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h"
 #include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
 #include "llvm/DebugInfo/DWARF/DWARFExpression.h"
@@ -602,7 +603,8 @@ void CIE::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
     OS << "\n";
   }
   OS << "\n";
-  CFIs.dump(OS, DumpOpts, /*IndentLevel=*/1, /*InitialLocation=*/{});
+  printCFIProgram(CFIs, OS, DumpOpts, /*IndentLevel=*/1,
+                  /*InitialLocation=*/{});
   OS << "\n";
 
   if (Expected<UnwindTable> RowsOrErr = UnwindTable::create(this))
@@ -630,7 +632,7 @@ void FDE::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
   OS << "  Format:       " << FormatString(IsDWARF64) << "\n";
   if (LSDAAddress)
     OS << format("  LSDA Address: %016" PRIx64 "\n", *LSDAAddress);
-  CFIs.dump(OS, DumpOpts, /*IndentLevel=*/1, InitialLocation);
+  printCFIProgram(CFIs, OS, DumpOpts, /*IndentLevel=*/1, InitialLocation);
   OS << "\n";
 
   if (Expected<UnwindTable> RowsOrErr = UnwindTable::create(this))
diff --git a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
index 94a44e3afccb..85c4165de4aa 100644
--- a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
+++ b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
@@ -12,6 +12,7 @@
 #include "llvm-readobj.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
@@ -228,8 +229,8 @@ void PrinterContext<ELFT>::printEHFrame(const Elf_Shdr *EHFrameShdr) const {
     W.indent();
     auto DumpOpts = DIDumpOptions();
     DumpOpts.IsEH = true;
-    Entry.cfis().dump(W.getOStream(), DumpOpts, W.getIndentLevel(),
-                      InitialLocation);
+    printCFIProgram(Entry.cfis(), W.getOStream(), DumpOpts, W.getIndentLevel(),
+                    InitialLocation);
     W.unindent();
     W.unindent();
     W.getOStream() << "\n";

From a871b919ed135b3b50db58ed816d6ddb488d9c5e Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 23:27:25 +0000
Subject: [PATCH 0736/1322] [gn build] Port 9e0186d925f0

---
 llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn
index cd7d0671fbe7..7338fb159419 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn
@@ -14,6 +14,7 @@ unittest("LLVMFrontendTests") {
   ]
   sources = [
     "HLSLRootSignatureDumpTest.cpp",
+    "HLSLRootSignatureRangesTest.cpp",
     "OpenACCTest.cpp",
     "OpenMPCompositionTest.cpp",
     "OpenMPContextTest.cpp",

From 535291409cc7e4ae571318a38bd3617d7f608002 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 23:27:26 +0000
Subject: [PATCH 0737/1322] [gn build] Port 9ec75a50bc48

---
 .../gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn      | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn
index a10a0d5637e9..87c02122e0f6 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn
@@ -67,7 +67,6 @@ static_library("MCTargetDesc") {
     "MipsInstPrinter.cpp",
     "MipsMCAsmInfo.cpp",
     "MipsMCCodeEmitter.cpp",
-    "MipsMCExpr.cpp",
     "MipsMCTargetDesc.cpp",
     "MipsNaClELFStreamer.cpp",
     "MipsOptionRecord.cpp",

From 6652961ae5fee4d81871e4310a9e842c61136c10 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 23:34:15 +0000
Subject: [PATCH 0738/1322] [gn build] Manually port 556e69b7

---
 llvm/utils/gn/secondary/lldb/test/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
index 15b57f7d85fc..6dcce2db3796 100644
--- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
@@ -92,6 +92,7 @@ write_lit_cfg("lit_api_site_cfg") {
     "LLDB_FRAMEWORK_DIR=XXX_framework_dir",
     "CMAKE_CXX_COMPILER=c++",  # XXX use bin/clang++ instead?
     "HOST_OS=$host_os",  # XXX
+    "Python3_ROOT_DIR=",  # FIXME
   ]
 
   if (is_debug) {

From b164d3613ad9b86a8b951cfc43fadc0edfc7644e Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 23:42:47 +0000
Subject: [PATCH 0739/1322] [gn build] Port 628274dadf92

---
 llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn
index fc071e5471d0..cb46f7cf55fe 100644
--- a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn
@@ -11,6 +11,7 @@ static_library("DWARF") {
     "DWARFAbbreviationDeclaration.cpp",
     "DWARFAcceleratorTable.cpp",
     "DWARFAddressRange.cpp",
+    "DWARFCFIPrinter.cpp",
     "DWARFCFIProgram.cpp",
     "DWARFCompileUnit.cpp",
     "DWARFContext.cpp",

From f2d2c99866dfd133e7b9c98b1d4983c6bce33d67 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@quicinc.com>
Date: Tue, 17 Jun 2025 16:43:55 -0700
Subject: [PATCH 0740/1322] [clang] Remove separate evaluation step for static
 class member init. (#142713)

We already evaluate the initializers for all global variables, as
required by the standard. Leverage that evaluation instead of trying to
separately validate static class members.

This has a few benefits:

- Improved diagnostics; we now get notes explaining what failed to
evaluate.
- Improved correctness: is_constant_evaluated is handled correctly.

The behavior follows the proposed resolution for CWG1721.

Fixes #88462. Fixes #99680.
---
 clang/lib/Sema/SemaDecl.cpp                   | 39 +++++++++----------
 .../SemaCXX/builtin-is-constant-evaluated.cpp | 14 +++++++
 clang/test/SemaCXX/class.cpp                  | 28 ++++++++-----
 clang/test/SemaCXX/cxx0x-class.cpp            | 11 ++++--
 clang/test/SemaCXX/cxx2a-consteval.cpp        |  8 ++--
 .../SemaTemplate/instantiate-static-var.cpp   | 10 +++--
 6 files changed, 67 insertions(+), 43 deletions(-)

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 02ac898a2b70..1bf72e5bb7b9 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13963,31 +13963,10 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
 
     // We allow integer constant expressions in all cases.
     } else if (DclT->isIntegralOrEnumerationType()) {
-      // Check whether the expression is a constant expression.
-      SourceLocation Loc;
       if (getLangOpts().CPlusPlus11 && DclT.isVolatileQualified())
         // In C++11, a non-constexpr const static data member with an
         // in-class initializer cannot be volatile.
         Diag(VDecl->getLocation(), diag::err_in_class_initializer_volatile);
-      else if (Init->isValueDependent())
-        ; // Nothing to check.
-      else if (Init->isIntegerConstantExpr(Context, &Loc))
-        ; // Ok, it's an ICE!
-      else if (Init->getType()->isScopedEnumeralType() &&
-               Init->isCXX11ConstantExpr(Context))
-        ; // Ok, it is a scoped-enum constant expression.
-      else if (Init->isEvaluatable(Context)) {
-        // If we can constant fold the initializer through heroics, accept it,
-        // but report this as a use of an extension for -pedantic.
-        Diag(Loc, diag::ext_in_class_initializer_non_constant)
-          << Init->getSourceRange();
-      } else {
-        // Otherwise, this is some crazy unknown case.  Report the issue at the
-        // location provided by the isIntegerConstantExpr failed check.
-        Diag(Loc, diag::err_in_class_initializer_non_constant)
-          << Init->getSourceRange();
-        VDecl->setInvalidDecl();
-      }
 
     // We allow foldable floating-point constants as an extension.
     } else if (DclT->isFloatingType()) { // also permits complex, which is ok
@@ -14715,6 +14694,17 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) {
       // Compute and cache the constant value, and remember that we have a
       // constant initializer.
       if (HasConstInit) {
+        if (var->isStaticDataMember() && !var->isInline() &&
+            var->getLexicalDeclContext()->isRecord() &&
+            type->isIntegralOrEnumerationType()) {
+          // In C++98, in-class initialization for a static data member must
+          // be an integer constant expression.
+          SourceLocation Loc;
+          if (!Init->isIntegerConstantExpr(Context, &Loc)) {
+            Diag(Loc, diag::ext_in_class_initializer_non_constant)
+                << Init->getSourceRange();
+          }
+        }
         (void)var->checkForConstantInitialization(Notes);
         Notes.clear();
       } else if (CacheCulprit) {
@@ -14750,6 +14740,13 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) {
           << Attr->getRange() << Attr->isConstinit();
       for (auto &it : Notes)
         Diag(it.first, it.second);
+    } else if (var->isStaticDataMember() && !var->isInline() &&
+               var->getLexicalDeclContext()->isRecord()) {
+      Diag(var->getLocation(), diag::err_in_class_initializer_non_constant)
+          << Init->getSourceRange();
+      for (auto &it : Notes)
+        Diag(it.first, it.second);
+      var->setInvalidDecl();
     } else if (IsGlobal &&
                !getDiagnostics().isIgnored(diag::warn_global_constructor,
                                            var->getLocation())) {
diff --git a/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp b/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp
index c775fe71069d..66981acf87a8 100644
--- a/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp
+++ b/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp
@@ -154,3 +154,17 @@ namespace narrowing {
     // expected-note {{insert an explicit cast to silence this issue}}
   }
 }
+
+struct GH99680 {
+  static const int x1 = 1/(1-__builtin_is_constant_evaluated()); // expected-error {{in-class initializer for static data member is not a constant expression}} \
+    // expected-note {{division by zero}}
+  static const int x2 = __builtin_is_constant_evaluated();
+  static_assert(x2 == 1);
+  static const float x3 = 1/(1-__builtin_is_constant_evaluated());  // expected-error {{in-class initializer for static data member of type 'const float' requires 'constexpr' specifier}} \
+  // expected-note {{add 'constexpr'}} \
+  // expected-error {{in-class initializer for static data member is not a constant expression}} \
+  // expected-note {{division by zero}}
+  static const float x4 = __builtin_is_constant_evaluated(); // expected-error {{in-class initializer for static data member of type 'const float' requires 'constexpr' specifier}} \
+  // expected-note {{add 'constexpr'}}
+  static_assert(fold(x4 == 1));
+};
diff --git a/clang/test/SemaCXX/class.cpp b/clang/test/SemaCXX/class.cpp
index 2f59544e7f36..f1e02d5158aa 100644
--- a/clang/test/SemaCXX/class.cpp
+++ b/clang/test/SemaCXX/class.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 -Wc++11-compat %s
-// RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s -std=c++98
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98 -Wc++11-compat %s -std=c++98
 class C {
 public:
   auto int errx; // expected-error {{storage class specified for a member declaration}}
@@ -32,7 +32,7 @@ public:
   int : 1, : 2;
   typedef int E : 1; // expected-error {{typedef member 'E' cannot be a bit-field}}
   static int sb : 1; // expected-error {{static member 'sb' cannot be a bit-field}}
-  static int vs;
+  static int vs; // cxx11-note {{declared here}}
 
   typedef int func();
   func tm;
@@ -48,20 +48,28 @@ public:
 #endif
   static int si = 0; // expected-error {{non-const static data member must be initialized out of line}}
   static const NestedC ci = 0; // expected-error {{static data member of type 'const NestedC' must be initialized out of line}}
-  static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}}
+  static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}} \
+  // cxx11-note {{read of non-const variable 'vs' is not allowed in a constant expression}} \
+  // cxx98-note {{subexpression not valid in a constant expression}}
   static const int vi = 0;
   static const volatile int cvi = 0; // ok, illegal in C++11
 #if __cplusplus >= 201103L
   // expected-error@-2 {{static const volatile data member must be initialized out of line}}
 #endif
   static const E evi = 0;
-  static const int overflow = 1000000*1000000; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-                                               // expected-warning@-1 {{overflow in expression}}
-  static const int overflow_shift = 1<<32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-  static const int overflow_shift2 = 1>>32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-  static const int overflow_shift3 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-  static const int overflow_shift4 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-  static const int overflow_shift5 = -1<<1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow = 1000000*1000000; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                               // cxx11-note {{value 1000000000000 is outside the range of representable values of type 'int'}} \
+                                               // expected-warning {{overflow in expression}}
+  static const int overflow_shift = 1<<32; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                           // cxx11-note {{shift count 32 >= width of type 'int' (32 bits)}}
+  static const int overflow_shift2 = 1>>32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}\
+                                            // cxx11-note {{shift count 32 >= width of type 'int' (32 bits)}}
+  static const int overflow_shift3 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                            // cxx11-note {{negative shift count -1}}
+  static const int overflow_shift4 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                            // cxx11-note {{negative shift count -1}}
+  static const int overflow_shift5 = -1<<1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                            // cxx11-note {{left shift of negative value -1}}
 
   void m() {
     sx = 0;
diff --git a/clang/test/SemaCXX/cxx0x-class.cpp b/clang/test/SemaCXX/cxx0x-class.cpp
index a612a5c07e6e..4b54221cceff 100644
--- a/clang/test/SemaCXX/cxx0x-class.cpp
+++ b/clang/test/SemaCXX/cxx0x-class.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -Wno-uninitialized -fsyntax-only -verify -std=c++11 -Wno-error=static-float-init %s 
 
-int vs = 0;
+int vs = 0; // expected-note {{declared here}}
 
 class C {
 public:
@@ -11,17 +11,20 @@ public:
   int i = 0;
   static int si = 0; // expected-error {{non-const static data member must be initialized out of line}}
   static const NestedC ci = 0; // expected-error {{static data member of type 'const NestedC' must be initialized out of line}}
-  static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}}
+  static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}} \
+                             // expected-note {{read of non-const variable 'vs' is not allowed in a constant expression}}
   static const int vi = 0;
   static const volatile int cvi = 0; // expected-error {{static const volatile data member must be initialized out of line}}
 };
 
 namespace rdar8367341 {
-  float foo(); // expected-note {{here}}
+  float foo(); // expected-note 2 {{here}}
 
   struct A {
     static const float x = 5.0f; // expected-warning {{requires 'constexpr'}} expected-note {{add 'constexpr'}}
-    static const float y = foo(); // expected-warning {{requires 'constexpr'}} expected-note {{add 'constexpr'}}
+    static const float y = foo(); // expected-warning {{requires 'constexpr'}} expected-note {{add 'constexpr'}} \
+                                  // expected-error {{in-class initializer for static data member is not a constant expression}} \
+                                  // expected-note {{non-constexpr function 'foo' cannot be used in a constant expression}}
     static constexpr float x2 = 5.0f;
     static constexpr float y2 = foo(); // expected-error {{must be initialized by a constant expression}} expected-note {{non-constexpr function 'foo'}}
   };
diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp
index d9932e4dd824..1474c48cda3c 100644
--- a/clang/test/SemaCXX/cxx2a-consteval.cpp
+++ b/clang/test/SemaCXX/cxx2a-consteval.cpp
@@ -1154,20 +1154,20 @@ namespace GH65985 {
 int consteval operator""_foo(unsigned long long V) {
     return 0;
 }
-int consteval operator""_bar(unsigned long long V); // expected-note 3{{here}}
+int consteval operator""_bar(unsigned long long V); // expected-note 4 {{here}}
 
 int consteval f() {
   return 0;
 }
 
-int consteval g();  // expected-note {{here}}
+int consteval g();  // expected-note 2 {{here}}
 
 
 struct C {
     static const int a = 1_foo;
     static constexpr int b = 1_foo;
     static const int c = 1_bar; // expected-error {{call to consteval function 'GH65985::operator""_bar' is not a constant expression}} \
-                                // expected-note {{undefined function 'operator""_bar' cannot be used in a constant expression}} \
+                                // expected-note 2 {{undefined function 'operator""_bar' cannot be used in a constant expression}} \
                                 // expected-error {{in-class initializer for static data member is not a constant expression}}
 
     // FIXME: remove duplicate diagnostics
@@ -1179,7 +1179,7 @@ struct C {
     static const int e = f();
     static const int f = g(); // expected-error {{call to consteval function 'GH65985::g' is not a constant expression}} \
                               // expected-error {{in-class initializer for static data member is not a constant expression}} \
-                              // expected-note  {{undefined function 'g' cannot be used in a constant expression}}
+                              // expected-note 2 {{undefined function 'g' cannot be used in a constant expression}}
 };
 
 }
diff --git a/clang/test/SemaTemplate/instantiate-static-var.cpp b/clang/test/SemaTemplate/instantiate-static-var.cpp
index 63d8366b617c..6602670af901 100644
--- a/clang/test/SemaTemplate/instantiate-static-var.cpp
+++ b/clang/test/SemaTemplate/instantiate-static-var.cpp
@@ -1,11 +1,13 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98 -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 -std=c++11 %s
 
 template<typename T, T Divisor>
 class X {
 public:
-  static const T value = 10 / Divisor; // expected-error{{in-class initializer for static data member is not a constant expression}}
+  static const T value = 10 / Divisor; // expected-error{{in-class initializer for static data member is not a constant expression}} \
+  // cxx11-note {{division by zero}} \
+  // cxx98-note {{subexpression not valid}}
 };
 
 int array1[X<int, 2>::value == 5? 1 : -1];

From c21a4c6c43bb6d68dfe52e07a5a391a6167eedf9 Mon Sep 17 00:00:00 2001
From: Andrei Safronov <andrei.safronov@espressif.com>
Date: Wed, 18 Jun 2025 02:57:47 +0300
Subject: [PATCH 0741/1322] [Xtensa] Implement Xtensa Interrupt/Exception/Debug
 Options. (#143820)

Implement Xtensa Interrupt. HighInterrupts, Exception, Debug Options.
Also implement small Xtensa Options like PRID, Coprocessor and Timers.
---
 .../Xtensa/AsmParser/XtensaAsmParser.cpp      |  29 +-
 .../Disassembler/XtensaDisassembler.cpp       |  56 +++-
 .../MCTargetDesc/XtensaMCTargetDesc.cpp       |  89 +++++-
 .../Xtensa/MCTargetDesc/XtensaMCTargetDesc.h  |   9 +-
 llvm/lib/Target/Xtensa/XtensaFeatures.td      |  40 +++
 llvm/lib/Target/Xtensa/XtensaInstrInfo.td     | 115 +++++++
 llvm/lib/Target/Xtensa/XtensaRegisterInfo.td  | 111 ++++++-
 llvm/lib/Target/Xtensa/XtensaSubtarget.h      |   9 +
 .../MC/Disassembler/Xtensa/coprocessor.txt    |  10 +
 llvm/test/MC/Disassembler/Xtensa/debug.txt    |  62 ++++
 .../test/MC/Disassembler/Xtensa/exception.txt |  42 +++
 .../MC/Disassembler/Xtensa/highinterrupts.txt |  82 +++++
 .../test/MC/Disassembler/Xtensa/interrupt.txt |  26 ++
 llvm/test/MC/Disassembler/Xtensa/prid.txt     |  10 +
 llvm/test/MC/Disassembler/Xtensa/timer.txt    |  22 ++
 llvm/test/MC/Xtensa/Core/processor-control.s  |   5 +
 llvm/test/MC/Xtensa/coprocessor.s             |  20 ++
 llvm/test/MC/Xtensa/debug-invalid.s           |   9 +
 llvm/test/MC/Xtensa/debug.s                   | 190 ++++++++++++
 llvm/test/MC/Xtensa/exception.s               | 100 +++++++
 llvm/test/MC/Xtensa/highinterrupts.s          | 280 ++++++++++++++++++
 llvm/test/MC/Xtensa/interrupt.s               |  60 ++++
 llvm/test/MC/Xtensa/prid.s                    |  20 ++
 llvm/test/MC/Xtensa/timer.s                   |  65 ++++
 24 files changed, 1438 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/coprocessor.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/debug.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/exception.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/interrupt.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/prid.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/timer.txt
 create mode 100644 llvm/test/MC/Xtensa/coprocessor.s
 create mode 100644 llvm/test/MC/Xtensa/debug-invalid.s
 create mode 100644 llvm/test/MC/Xtensa/debug.s
 create mode 100644 llvm/test/MC/Xtensa/exception.s
 create mode 100644 llvm/test/MC/Xtensa/highinterrupts.s
 create mode 100644 llvm/test/MC/Xtensa/interrupt.s
 create mode 100644 llvm/test/MC/Xtensa/prid.s
 create mode 100644 llvm/test/MC/Xtensa/timer.s

diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
index 1f6cfec8edf4..6c4e365451af 100644
--- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
+++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
@@ -62,11 +62,14 @@ class XtensaAsmParser : public MCTargetAsmParser {
 #include "XtensaGenAsmMatcher.inc"
 
   ParseStatus parseImmediate(OperandVector &Operands);
-  ParseStatus parseRegister(OperandVector &Operands, bool AllowParens = false,
-                            bool SR = false);
+  ParseStatus
+  parseRegister(OperandVector &Operands, bool AllowParens = false,
+                bool SR = false,
+                Xtensa::RegisterAccessType RAType = Xtensa::REGISTER_EXCHANGE);
   ParseStatus parseOperandWithModifier(OperandVector &Operands);
-  bool parseOperand(OperandVector &Operands, StringRef Mnemonic,
-                    bool SR = false);
+  bool
+  parseOperand(OperandVector &Operands, StringRef Mnemonic, bool SR = false,
+               Xtensa::RegisterAccessType RAType = Xtensa::REGISTER_EXCHANGE);
   bool ParseInstructionWithSR(ParseInstructionInfo &Info, StringRef Name,
                               SMLoc NameLoc, OperandVector &Operands);
   ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
@@ -580,7 +583,8 @@ bool XtensaAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc,
 }
 
 ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands,
-                                           bool AllowParens, bool SR) {
+                                           bool AllowParens, bool SR,
+                                           Xtensa::RegisterAccessType RAType) {
   SMLoc FirstS = getLoc();
   bool HadParens = false;
   AsmToken Buf[2];
@@ -624,7 +628,7 @@ ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands,
     return ParseStatus::NoMatch;
   }
 
-  if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits()))
+  if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits(), RAType))
     return ParseStatus::NoMatch;
 
   if (HadParens)
@@ -685,7 +689,7 @@ ParseStatus XtensaAsmParser::parseOperandWithModifier(OperandVector &Operands) {
 /// from this information, adding to Operands.
 /// If operand was parsed, returns false, else true.
 bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
-                                   bool SR) {
+                                   bool SR, Xtensa::RegisterAccessType RAType) {
   // Check if the current operand has a custom associated parser, if so, try to
   // custom parse the operand, or fallback to the general approach.
   ParseStatus Res = MatchOperandParserImpl(Operands, Mnemonic);
@@ -699,7 +703,7 @@ bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
     return true;
 
   // Attempt to parse token as register
-  if (parseRegister(Operands, true, SR).isSuccess())
+  if (parseRegister(Operands, true, SR, RAType).isSuccess())
     return false;
 
   // Attempt to parse token as an immediate
@@ -713,6 +717,11 @@ bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
 bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info,
                                              StringRef Name, SMLoc NameLoc,
                                              OperandVector &Operands) {
+  Xtensa::RegisterAccessType RAType =
+      Name[0] == 'w' ? Xtensa::REGISTER_WRITE
+                     : (Name[0] == 'r' ? Xtensa::REGISTER_READ
+                                       : Xtensa::REGISTER_EXCHANGE);
+
   if ((Name.starts_with("wsr.") || Name.starts_with("rsr.") ||
        Name.starts_with("xsr.")) &&
       (Name.size() > 4)) {
@@ -728,7 +737,7 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info,
     if (RegNo == 0)
       RegNo = MatchRegisterAltName(RegName);
 
-    if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits()))
+    if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits(), RAType))
       return Error(NameLoc, "invalid register name");
 
     // Parse operand
@@ -753,7 +762,7 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info,
     }
 
     // Parse second operand
-    if (parseOperand(Operands, Name, true))
+    if (parseOperand(Operands, Name, true, RAType))
       return true;
   }
 
diff --git a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
index dbd34964db07..3b37ac88b9b1 100644
--- a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
+++ b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
@@ -119,13 +119,39 @@ struct DecodeRegister {
 };
 
 const DecodeRegister SRDecoderTable[] = {
-    {Xtensa::LBEG, 0},    {Xtensa::LEND, 1},        {Xtensa::LCOUNT, 2},
-    {Xtensa::SAR, 3},     {Xtensa::BREG, 4},        {Xtensa::SAR, 3},
-    {Xtensa::LITBASE, 5}, {Xtensa::ACCLO, 16},      {Xtensa::ACCHI, 17},
-    {Xtensa::M0, 32},     {Xtensa::M1, 33},         {Xtensa::M2, 34},
-    {Xtensa::M3, 35},     {Xtensa::WINDOWBASE, 72}, {Xtensa::WINDOWSTART, 73},
-    {Xtensa::MEMCTL, 97}, {Xtensa::VECBASE, 231},   {Xtensa::MISC0, 244},
-    {Xtensa::MISC1, 245}, {Xtensa::MISC2, 246},     {Xtensa::MISC3, 247}};
+    {Xtensa::LBEG, 0},          {Xtensa::LEND, 1},
+    {Xtensa::LCOUNT, 2},        {Xtensa::SAR, 3},
+    {Xtensa::BREG, 4},          {Xtensa::LITBASE, 5},
+    {Xtensa::ACCLO, 16},        {Xtensa::ACCHI, 17},
+    {Xtensa::M0, 32},           {Xtensa::M1, 33},
+    {Xtensa::M2, 34},           {Xtensa::M3, 35},
+    {Xtensa::WINDOWBASE, 72},   {Xtensa::WINDOWSTART, 73},
+    {Xtensa::IBREAKENABLE, 96}, {Xtensa::MEMCTL, 97},
+    {Xtensa::DDR, 104},         {Xtensa::IBREAKA0, 128},
+    {Xtensa::IBREAKA1, 129},    {Xtensa::DBREAKA0, 144},
+    {Xtensa::DBREAKA1, 145},    {Xtensa::DBREAKC0, 160},
+    {Xtensa::DBREAKC1, 161},    {Xtensa::CONFIGID0, 176},
+    {Xtensa::EPC1, 177},        {Xtensa::EPC2, 178},
+    {Xtensa::EPC3, 179},        {Xtensa::EPC4, 180},
+    {Xtensa::EPC5, 181},        {Xtensa::EPC6, 182},
+    {Xtensa::EPC7, 183},        {Xtensa::DEPC, 192},
+    {Xtensa::EPS2, 194},        {Xtensa::EPS3, 195},
+    {Xtensa::EPS4, 196},        {Xtensa::EPS5, 197},
+    {Xtensa::EPS6, 198},        {Xtensa::EPS7, 199},
+    {Xtensa::CONFIGID1, 208},   {Xtensa::EXCSAVE1, 209},
+    {Xtensa::EXCSAVE2, 210},    {Xtensa::EXCSAVE3, 211},
+    {Xtensa::EXCSAVE4, 212},    {Xtensa::EXCSAVE5, 213},
+    {Xtensa::EXCSAVE6, 214},    {Xtensa::EXCSAVE7, 215},
+    {Xtensa::CPENABLE, 224},    {Xtensa::INTERRUPT, 226},
+    {Xtensa::INTCLEAR, 227},    {Xtensa::INTENABLE, 228},
+    {Xtensa::PS, 230},          {Xtensa::VECBASE, 231},
+    {Xtensa::EXCCAUSE, 232},    {Xtensa::DEBUGCAUSE, 233},
+    {Xtensa::CCOUNT, 234},      {Xtensa::PRID, 235},
+    {Xtensa::ICOUNT, 236},      {Xtensa::ICOUNTLEVEL, 237},
+    {Xtensa::EXCVADDR, 238},    {Xtensa::CCOMPARE0, 240},
+    {Xtensa::CCOMPARE1, 241},   {Xtensa::CCOMPARE2, 242},
+    {Xtensa::MISC0, 244},       {Xtensa::MISC1, 245},
+    {Xtensa::MISC2, 246},       {Xtensa::MISC3, 247}};
 
 static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo,
                                           uint64_t Address,
@@ -133,12 +159,24 @@ static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo,
   if (RegNo > 255)
     return MCDisassembler::Fail;
 
+  Xtensa::RegisterAccessType RAType =
+      Inst.getOpcode() == Xtensa::WSR
+          ? Xtensa::REGISTER_WRITE
+          : (Inst.getOpcode() == Xtensa::RSR ? Xtensa::REGISTER_READ
+                                             : Xtensa::REGISTER_EXCHANGE);
+
   for (unsigned i = 0; i < std::size(SRDecoderTable); i++) {
     if (SRDecoderTable[i].RegNo == RegNo) {
       MCPhysReg Reg = SRDecoderTable[i].Reg;
 
-      if (!Xtensa::checkRegister(Reg,
-                                 Decoder->getSubtargetInfo().getFeatureBits()))
+      // Handle special case. The INTERRUPT/INTSET registers use the same
+      // encoding, but INTERRUPT used for read and INTSET for write.
+      if (Reg == Xtensa::INTERRUPT && RAType == Xtensa::REGISTER_WRITE) {
+        Reg = Xtensa::INTSET;
+      }
+
+      if (!Xtensa::checkRegister(
+              Reg, Decoder->getSubtargetInfo().getFeatureBits(), RAType))
         return MCDisassembler::Fail;
 
       Inst.addOperand(MCOperand::createReg(Reg));
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 63fed46ac411..f48c6225827b 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -75,10 +75,95 @@ bool Xtensa::isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset) {
 }
 
 // Verify Special Register
-bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits) {
+bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
+                           RegisterAccessType RAType) {
   switch (RegNo) {
   case Xtensa::BREG:
     return FeatureBits[Xtensa::FeatureBoolean];
+  case Xtensa::CCOUNT:
+  case Xtensa::CCOMPARE0:
+    if (FeatureBits[Xtensa::FeatureTimers1])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::CCOMPARE1:
+    if (FeatureBits[Xtensa::FeatureTimers2])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::CCOMPARE2:
+    if (FeatureBits[Xtensa::FeatureTimers3])
+      return true;
+    return false;
+  case Xtensa::CONFIGID0:
+    return RAType != Xtensa::REGISTER_EXCHANGE;
+  case Xtensa::CONFIGID1:
+    return RAType == Xtensa::REGISTER_READ;
+  case Xtensa::CPENABLE:
+    return FeatureBits[Xtensa::FeatureCoprocessor];
+  case Xtensa::DEBUGCAUSE:
+    return RAType == Xtensa::REGISTER_READ && FeatureBits[Xtensa::FeatureDebug];
+  case Xtensa::DEPC:
+  case Xtensa::EPC1:
+  case Xtensa::EXCCAUSE:
+  case Xtensa::EXCSAVE1:
+  case Xtensa::EXCVADDR:
+    return FeatureBits[Xtensa::FeatureException];
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC2:
+  case Xtensa::EPS2:
+  case Xtensa::EXCSAVE2:
+    if (FeatureBits[Xtensa::FeatureHighPriInterrupts])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC3:
+  case Xtensa::EPS3:
+  case Xtensa::EXCSAVE3:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel3])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC4:
+  case Xtensa::EPS4:
+  case Xtensa::EXCSAVE4:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel4])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC5:
+  case Xtensa::EPS5:
+  case Xtensa::EXCSAVE5:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel5])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC6:
+  case Xtensa::EPS6:
+  case Xtensa::EXCSAVE6:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel6])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC7:
+  case Xtensa::EPS7:
+  case Xtensa::EXCSAVE7:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel7])
+      return true;
+    return false;
+  case Xtensa::INTENABLE:
+    return FeatureBits[Xtensa::FeatureInterrupt];
+  case Xtensa::INTERRUPT:
+    return RAType == Xtensa::REGISTER_READ &&
+           FeatureBits[Xtensa::FeatureInterrupt];
+  case Xtensa::INTSET:
+  case Xtensa::INTCLEAR:
+    return RAType == Xtensa::REGISTER_WRITE &&
+           FeatureBits[Xtensa::FeatureInterrupt];
+  case Xtensa::ICOUNT:
+  case Xtensa::ICOUNTLEVEL:
+  case Xtensa::IBREAKENABLE:
+  case Xtensa::DDR:
+  case Xtensa::IBREAKA0:
+  case Xtensa::IBREAKA1:
+  case Xtensa::DBREAKA0:
+  case Xtensa::DBREAKA1:
+  case Xtensa::DBREAKC0:
+  case Xtensa::DBREAKC1:
+    return FeatureBits[Xtensa::FeatureDebug];
   case Xtensa::LBEG:
   case Xtensa::LEND:
   case Xtensa::LCOUNT:
@@ -99,6 +184,8 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits) {
   case Xtensa::MISC2:
   case Xtensa::MISC3:
     return FeatureBits[Xtensa::FeatureMiscSR];
+  case Xtensa::PRID:
+    return RAType == Xtensa::REGISTER_READ && FeatureBits[Xtensa::FeaturePRID];
   case Xtensa::VECBASE:
     return FeatureBits[Xtensa::FeatureRelocatableVector];
   case Xtensa::WINDOWBASE:
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h
index cedc57a14f14..ec91f656bdcb 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h
@@ -55,8 +55,15 @@ bool isValidAddrOffset(int Scale, int64_t OffsetVal);
 // Check address offset for load/store instructions.
 bool isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset);
 
+enum RegisterAccessType {
+  REGISTER_WRITE = 1,
+  REGISTER_READ = 2,
+  REGISTER_EXCHANGE = 3
+};
+
 // Verify if it's correct to use a special register.
-bool checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits);
+bool checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
+                   RegisterAccessType RA);
 } // namespace Xtensa
 } // end namespace llvm
 
diff --git a/llvm/lib/Target/Xtensa/XtensaFeatures.td b/llvm/lib/Target/Xtensa/XtensaFeatures.td
index 55977277daf8..1dd03283e931 100644
--- a/llvm/lib/Target/Xtensa/XtensaFeatures.td
+++ b/llvm/lib/Target/Xtensa/XtensaFeatures.td
@@ -92,3 +92,43 @@ def FeatureDataCache : SubtargetFeature<"dcache", "HasDataCache", "true",
                                         "Enable Xtensa Data Cache option">;
 def HasDataCache : Predicate<"Subtarget->hasDataCache()">,
                    AssemblerPredicate<(all_of FeatureDataCache)>;
+
+// Xtensa Interrupts Options.
+def FeatureHighPriInterrupts : SubtargetFeature<"highpriinterrupts",
+                                                "HasHighPriInterrupts", "true",
+                                                "Enable Xtensa HighPriInterrupts option">;
+def HasHighPriInterrupts : Predicate<"Subtarget->hasHighPriInterrupts()">,
+                                      AssemblerPredicate<(all_of FeatureHighPriInterrupts)>;
+
+foreach i = {3-7} in
+    def FeatureHighPriInterruptsLevel#i : SubtargetFeature<"highpriinterrupts-level"#i,
+         "HasHighPriInterruptsLevel"#i#"", "true", "Enable Xtensa HighPriInterrupts Level"#i, [FeatureHighPriInterrupts]>;
+
+def FeatureInterrupt : SubtargetFeature<"interrupt", "HasInterrupt", "true",
+                                        "Enable Xtensa Interrupt option">;
+def HasInterrupt : Predicate<"Subtarget->hasInterrupt()">,
+                              AssemblerPredicate<(all_of FeatureInterrupt)>;
+
+def FeatureException : SubtargetFeature<"exception", "HasException", "true",
+                                        "Enable Xtensa Exception option">;
+def HasException : Predicate<"Subtarget->hasException()">,
+                              AssemblerPredicate<(all_of FeatureException)>;
+
+def FeatureDebug : SubtargetFeature<"debug", "HasDebug", "true",
+                                    "Enable Xtensa Debug option">;
+def HasDebug : Predicate<"Subtarget->hasDebug()">,
+                          AssemblerPredicate<(all_of FeatureDebug)>;
+
+foreach i = {1-3} in
+    def FeatureTimers#i : SubtargetFeature<"timers"#i,
+         "HasTimers"#i#"", "true", "Enable Xtensa Timers "#i>;
+
+def FeaturePRID : SubtargetFeature<"prid", "HasPRID", "true",
+                                   "Enable Xtensa Processor ID option">;
+def HasPRID : Predicate<"Subtarget->hasPRID()">,
+                         AssemblerPredicate<(all_of FeaturePRID)>;
+
+def FeatureCoprocessor : SubtargetFeature<"coprocessor", "HasCoprocessor", "true",
+                                          "Enable Xtensa Coprocessor option">;
+def HasCoprocessor : Predicate<"Subtarget->hasCoprocessor()">,
+                                AssemblerPredicate<(all_of FeatureCoprocessor)>;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
index 9a9424f91699..7e9fcd7058c2 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
@@ -499,6 +499,18 @@ def EXTW : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
   let hasSideEffects = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// Illegal instructions
+//===----------------------------------------------------------------------===//
+
+def ILL : CALLX_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                    "ill", []> {
+  let m = 0x0;
+  let n = 0x0;
+  let r = 0;
+  let s = 0;
+}
+
 //===----------------------------------------------------------------------===//
 // Processor control instructions
 //===----------------------------------------------------------------------===//
@@ -1044,6 +1056,109 @@ let Predicates = [HasRegionProtection] in {
   }
 }
 
+//===----------------------------------------------------------------------===//
+// Debug instructions
+//===----------------------------------------------------------------------===//
+
+let isBarrier = 1, isTerminator = 1 in {
+  def BREAK : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$s, uimm4:$t),
+                      "break\t$s, $t", []>, Requires<[HasDebug]> {
+    let r = 0x04;
+  }
+
+  def BREAK_N : RRRN_Inst<0x0C, (outs), (ins uimm4:$imm),
+                         "break.n\t$imm", []>, Requires<[HasDensity, HasDebug]> {
+    bits<4> imm;
+
+    let r = 0xf;
+    let s = imm;
+    let t = 0x2;
+  }
+}
+
+def : InstAlias<"_break.n\t$imm", (BREAK_N uimm4:$imm)>;
+
+def : Pat<(trap), (BREAK (i32 1), (i32 15))>;
+
+// Load instruction
+def LDDR32P : RRR_Inst<0x00, 0x00, 0x00, (outs AR:$s), (ins),
+                       "lddr32.p\t$s", []>, Requires<[HasDebug]> {
+  let r = 0x7;
+  let t = 0xe;
+  let mayLoad = 1;
+}
+
+// Store instruction
+def SDDR32P : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins AR:$s),
+                       "sddr32.p\t$s", []>, Requires<[HasDebug]> {
+  let r = 0x7;
+  let t = 0xf;
+  let mayStore = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Exception feature instructions
+//===----------------------------------------------------------------------===//
+
+def EXCW : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                   "excw", []>, Requires<[HasException]> {
+  let r = 0x2;
+  let s = 0x0;
+  let t = 0x8;
+}
+
+def RFDE : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                   "rfde", []>, Requires<[HasException]> {
+  let r = 0x3;
+  let s = 0x2;
+  let t = 0x0;
+}
+
+
+def RFE : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                  "rfe", []>, Requires<[HasException]> {
+  let r = 0x3;
+  let s = 0x0;
+  let t = 0x0;
+}
+
+def SYSCALL : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                      "syscall", []>, Requires<[HasException]> {
+  let r = 0x5;
+  let s = 0x0;
+  let t = 0x0;
+}
+
+//===----------------------------------------------------------------------===//
+// Interrupt feature instructions
+//===----------------------------------------------------------------------===//
+
+def RSIL : RRR_Inst<0x00, 0x00, 0x00, (outs AR:$t), (ins uimm4:$imm),
+                   "rsil\t$t, $imm", []>, Requires<[HasInterrupt]> {
+  bits<4> imm;
+
+  let r = 0x6;
+  let s = imm{3-0};
+}
+
+def WAITI : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$imm),
+                   "waiti\t$imm", []>, Requires<[HasInterrupt]> {
+  bits<4> imm;
+
+  let r = 0x7;
+  let s = imm{3-0};
+  let t = 0;
+}
+
+def RFI : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$imm),
+                  "rfi\t$imm", []>, Requires<[HasHighPriInterrupts]> {
+  bits<4> imm;
+
+  let r = 0x3;
+  let s = imm{3-0};
+  let t = 0x1;
+}
+
 //===----------------------------------------------------------------------===//
 // DSP Instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
index c54e2556ba11..7d4402912434 100644
--- a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
@@ -91,9 +91,111 @@ def LITBASE : SRReg<5, "litbase", ["LITBASE", "5"]>;
 def WINDOWBASE : SRReg<72, "windowbase", ["WINDOWBASE", "72"]>;
 def WINDOWSTART : SRReg<73, "windowstart", ["WINDOWSTART", "73"]>;
 
+// Instuction breakpoint enable register
+def IBREAKENABLE : SRReg<96, "ibreakenable", ["IBREAKENABLE", "96"]>;
+
 // Memory Control Register
 def MEMCTL : SRReg<97, "memctl", ["MEMCTL", "97"]>;
 
+def DDR : SRReg<104, "ddr", ["DDR", "104"]>;
+
+// Instuction break address register 0
+def IBREAKA0 : SRReg<128, "ibreaka0", ["IBREAKA0", "128"]>;
+
+// Instuction break address register 1
+def IBREAKA1 : SRReg<129, "ibreaka1", ["IBREAKA1", "129"]>;
+
+// Data break address register 0
+def DBREAKA0 : SRReg<144, "dbreaka0", ["DBREAKA0", "144"]>;
+
+// Data break address register 1
+def DBREAKA1 : SRReg<145, "dbreaka1", ["DBREAKA1", "145"]>;
+
+// Data breakpoint control register 0
+def DBREAKC0 : SRReg<160, "dbreakc0", ["DBREAKC0", "160"]>;
+
+// Data breakpoint control register 1
+def DBREAKC1 : SRReg<161, "dbreakc1", ["DBREAKC1", "161"]>;
+
+def CONFIGID0 : SRReg<176, "configid0", ["CONFIGID0", "176"]>;
+
+// Exception PC1
+def EPC1 : SRReg<177, "epc1", ["EPC1", "177"]>;
+
+// Exception PC2
+def EPC2 : SRReg<178, "epc2", ["EPC2", "178"]>;
+
+// Exception PC3
+def EPC3 : SRReg<179, "epc3", ["EPC3", "179"]>;
+
+// Exception PC4
+def EPC4 : SRReg<180, "epc4", ["EPC4", "180"]>;
+
+// Exception PC5
+def EPC5 : SRReg<181, "epc5", ["EPC5", "181"]>;
+
+// Exception PC6
+def EPC6 : SRReg<182, "epc6", ["EPC6", "182"]>;
+
+// Exception PC7
+def EPC7 : SRReg<183, "epc7", ["EPC7", "183"]>;
+
+def DEPC : SRReg<192, "depc", ["DEPC", "192"]>;
+def EPS2 : SRReg<194, "eps2", ["EPS2", "194"]>;
+def EPS3 : SRReg<195, "eps3", ["EPS3", "195"]>;
+def EPS4 : SRReg<196, "eps4", ["EPS4", "196"]>;
+def EPS5 : SRReg<197, "eps5", ["EPS5", "197"]>;
+def EPS6 : SRReg<198, "eps6", ["EPS6", "198"]>;
+def EPS7 : SRReg<199, "eps7", ["EPS7", "199"]>;
+
+def CONFIGID1 : SRReg<208, "configid1", ["CONFIGID1", "208"]>;
+
+def EXCSAVE1 : SRReg<209, "excsave1", ["EXCSAVE1", "209"]>;
+def EXCSAVE2 : SRReg<210, "excsave2", ["EXCSAVE2", "210"]>;
+def EXCSAVE3 : SRReg<211, "excsave3", ["EXCSAVE3", "211"]>;
+def EXCSAVE4 : SRReg<212, "excsave4", ["EXCSAVE4", "212"]>;
+def EXCSAVE5 : SRReg<213, "excsave5", ["EXCSAVE5", "213"]>;
+def EXCSAVE6 : SRReg<214, "excsave6", ["EXCSAVE6", "214"]>;
+def EXCSAVE7 : SRReg<215, "excsave7", ["EXCSAVE7", "215"]>;
+
+def CPENABLE : SRReg<224, "cpenable", ["CPENABLE", "224"]>;
+
+// Interrupt enable mask register
+def INTERRUPT : SRReg<226, "interrupt", ["INTERRUPT", "226"]>;
+
+def INTSET : SRReg<226, "intset", ["INTSET"]>;
+
+def INTCLEAR : SRReg<227, "intclear", ["INTCLEAR", "227"]>;
+
+def INTENABLE : SRReg<228, "intenable", ["INTENABLE", "228"]>;
+
+// Processor State
+def PS : SRReg<230, "ps", ["PS", "230"]>;
+
+def EXCCAUSE : SRReg<232, "exccause", ["EXCCAUSE", "232"]>;
+
+// Cause of last debug exception register
+def DEBUGCAUSE : SRReg<233, "debugcause", ["DEBUGCAUSE", "233"]>;
+
+// Processor Clock Count Register
+def CCOUNT : SRReg<234, "ccount", ["CCOUNT", "234"]>;
+
+// Processor ID Register
+def PRID : SRReg<235, "prid", ["PRID", "235"]>;
+
+def ICOUNT : SRReg<236, "icount", ["ICOUNT", "236"]>;
+def ICOUNTLEVEL : SRReg<237, "icountlevel", ["ICOUNTLEVEL", "237"]>;
+def EXCVADDR : SRReg<238, "excvaddr", ["EXCVADDR", "238"]>;
+
+// Cycle number to interrupt register 0
+def CCOMPARE0 : SRReg<240, "ccompare0", ["CCOMPARE0", "240"]>;
+
+// Cycle number to interrupt register 1
+def CCOMPARE1 : SRReg<241, "ccompare1", ["CCOMPARE1", "241"]>;
+
+// Cycle number to interrupt register 2
+def CCOMPARE2 : SRReg<242, "ccompare2", ["CCOMPARE2", "242"]>;
+
 // Vector base register
 def VECBASE : SRReg<231, "vecbase", ["VECBASE", "231"]>;
 
@@ -116,8 +218,13 @@ def MR23 :  RegisterClass<"Xtensa", [i32], 32, (add M2, M3)>;
 def MR   :  RegisterClass<"Xtensa", [i32], 32, (add MR01, MR23)>;
 
 def SR :  RegisterClass<"Xtensa", [i32], 32, (add
-  LBEG, LEND, LCOUNT, SAR, BREG, LITBASE, ACCLO, ACCHI, MR, WINDOWBASE, WINDOWSTART,
-  MEMCTL, VECBASE, MISC0, MISC1, MISC2, MISC3)>;
+  LBEG, LEND, LCOUNT, SAR, BREG, LITBASE, ACCLO, ACCHI, MR,
+  WINDOWBASE, WINDOWSTART, IBREAKENABLE, MEMCTL, DDR, IBREAKA0, IBREAKA1,
+  DBREAKA0, DBREAKA1, DBREAKC0, DBREAKC1, CONFIGID0, EPC1, EPC2, EPC3, EPC4, EPC5,
+  EPC6, EPC7, DEPC, EPS2, EPS3, EPS4, EPS5, EPS6, EPS7, CONFIGID1, EXCSAVE1, EXCSAVE2,
+  EXCSAVE3, EXCSAVE4, EXCSAVE5, EXCSAVE6, EXCSAVE7, CPENABLE, INTERRUPT, INTSET, INTCLEAR, INTENABLE,
+  PS, VECBASE, EXCCAUSE, DEBUGCAUSE, CCOUNT, PRID, ICOUNT, ICOUNTLEVEL, EXCVADDR, CCOMPARE0,
+  CCOMPARE1, CCOMPARE2, MISC0, MISC1, MISC2, MISC3)>;
 
 //===----------------------------------------------------------------------===//
 // Boolean registers
diff --git a/llvm/lib/Target/Xtensa/XtensaSubtarget.h b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
index 9909fb9ff4b3..da4e14a53eef 100644
--- a/llvm/lib/Target/Xtensa/XtensaSubtarget.h
+++ b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
@@ -82,6 +82,15 @@ public:
   bool hasMiscSR() const { return HasMiscSR; }
   bool hasExtendedL32R() const { return HasExtendedL32R; }
   bool hasDataCache() const { return HasDataCache; }
+  bool hasHighPriInterrupts() const { return HasHighPriInterrupts; }
+  bool hasHighPriInterruptsLevel3() const { return HasHighPriInterruptsLevel3; }
+  bool hasHighPriInterruptsLevel4() const { return HasHighPriInterruptsLevel4; }
+  bool hasHighPriInterruptsLevel5() const { return HasHighPriInterruptsLevel5; }
+  bool hasHighPriInterruptsLevel6() const { return HasHighPriInterruptsLevel6; }
+  bool hasHighPriInterruptsLevel7() const { return HasHighPriInterruptsLevel7; }
+  bool hasInterrupt() const { return HasInterrupt; }
+  bool hasException() const { return HasException; }
+
   bool isWindowedABI() const { return hasWindowed(); }
 
   // Automatically generated by tblgen.
diff --git a/llvm/test/MC/Disassembler/Xtensa/coprocessor.txt b/llvm/test/MC/Disassembler/Xtensa/coprocessor.txt
new file mode 100644
index 000000000000..83904dcde938
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/coprocessor.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+coprocessor -disassemble %s | FileCheck -check-prefixes=CHECK-COPROCESSOR %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa coprocessor option enabled. Also verify that dissasembling without
+## Xtensa coprocessor option generates warnings.
+
+[0x20,0xe0,0x61]
+#CHECK-COPROCESSOR: xsr a2, cpenable
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/debug.txt b/llvm/test/MC/Disassembler/Xtensa/debug.txt
new file mode 100644
index 000000000000..1321f09a973c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/debug.txt
@@ -0,0 +1,62 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+debug,+density -disassemble %s | FileCheck -check-prefixes=CHECK-DEBUG %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa debug option enabled. Also verify that dissasembling without
+## Xtensa debug option generates warnings.
+
+[0x10,0x41,0x00]
+# CHECK-DEBUG: break 1, 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x2c,0xf1]
+# CHECK-DEBUG: break.n 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xe0,0x73,0x00]
+# CHECK-DEBUG: lddr32.p a3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xf0,0x73,0x00]
+# CHECK-DEBUG: sddr32.p a3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xec, 0x61]
+#CHECK-DEBUG: xsr a2, icount
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xed, 0x61]
+#CHECK-DEBUG: xsr a2, icountlevel
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x60, 0x61]
+#CHECK-DEBUG: xsr a2, ibreakenable
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x68, 0x61]
+#CHECK-DEBUG: xsr a2, ddr
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x80, 0x61]
+#CHECK-DEBUG: xsr a2, ibreaka0
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x81, 0x61]
+#CHECK-DEBUG: xsr a2, ibreaka1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x90, 0x61]
+#CHECK-DEBUG: xsr a2, dbreaka0
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x91, 0x61]
+#CHECK-DEBUG: xsr a2, dbreaka1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xa0, 0x61]
+#CHECK-DEBUG: xsr a2, dbreakc0
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xa1, 0x61]
+#CHECK-DEBUG: xsr a2, dbreakc1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/exception.txt b/llvm/test/MC/Disassembler/Xtensa/exception.txt
new file mode 100644
index 000000000000..f40cc9e6549b
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/exception.txt
@@ -0,0 +1,42 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+exception -disassemble %s | FileCheck -check-prefixes=CHECK-EXCEPTION %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa exception option enabled. Also verify that dissasembling without
+## Xtensa exception option generates warnings.
+
+[0x80,0x20,0x00]
+# CHECK-EXCEPTION: excw
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x50,0x00]
+# CHECK-EXCEPTION: syscall
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x30,0x00]
+# CHECK-EXCEPTION: rfe
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x32,0x00]
+# CHECK-EXCEPTION: rfde
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xb1, 0x61]
+#CHECK-INST: xsr a2, epc1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xd1, 0x61]
+#CHECK-INST: xsr a2, excsave1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xe8, 0x61]
+#CHECK-INST: xsr a2, exccause
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xee, 0x61]
+#CHECK-INST: xsr a2, excvaddr
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xc0, 0x61]
+#CHECK-INST: xsr a2, depc
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt b/llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt
new file mode 100644
index 000000000000..d5d87918c9d5
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt
@@ -0,0 +1,82 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+highpriinterrupts,+highpriinterrupts-level7 -disassemble %s | FileCheck -check-prefixes=CHECK-HPINTERRUPTS %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa highpriinterrupts option enabled. Also verify that dissasembling without
+## Xtensa highpriinterrupts option generates warnings.
+
+[0x10,0x31,0x00]
+# CHECK-HPINTERRUPTS: rfi 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb2,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc2
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb3,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc3
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb4,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc4
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb5,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc5
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb6,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc6
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb7,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc7
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc2,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps2
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc3,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps3
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc4,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps4
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc5,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps5
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc6,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps6
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc7,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps7
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd2,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave2
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd3,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave3
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd4,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave4
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd5,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave5
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd6,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave6
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd7,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave7
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/interrupt.txt b/llvm/test/MC/Disassembler/Xtensa/interrupt.txt
new file mode 100644
index 000000000000..da8ea3aa5dc4
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/interrupt.txt
@@ -0,0 +1,26 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+interrupt -disassemble %s | FileCheck -check-prefixes=CHECK-EXCEPTION %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa interrupt option enabled. Also verify that dissasembling without
+## Xtensa interrupt option generates warnings.
+
+[0x20,0x61,0x00]
+# CHECK-EXCEPTION: rsil a2, 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x71,0x00]
+# CHECK-EXCEPTION: waiti 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xe4, 0x61]
+#CHECK-INST: xsr a2, intenable
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xe2, 0x03]
+#CHECK-INST: rsr a2, interrupt
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xe3, 0x13]
+#CHECK-INST: wsr a2, intclear
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/prid.txt b/llvm/test/MC/Disassembler/Xtensa/prid.txt
new file mode 100644
index 000000000000..104ad1c31185
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/prid.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+prid -disassemble %s | FileCheck -check-prefixes=CHECK-PRID %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa prid option enabled. Also verify that dissasembling without
+## Xtensa prid option generates warnings.
+
+[0x20,0xeb,0x03]
+#CHECK-PRID: rsr a2, prid
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/timer.txt b/llvm/test/MC/Disassembler/Xtensa/timer.txt
new file mode 100644
index 000000000000..daacf27872da
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/timer.txt
@@ -0,0 +1,22 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+timers3 -disassemble %s | FileCheck -check-prefixes=CHECK-TIMER %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa timer option enabled. Also verify that dissasembling without
+## Xtensa timer option generates warnings.
+
+[0x20,0xea,0x61]
+#CHECK-INST: xsr a2, ccount
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xf0,0x61]
+#CHECK-TIMER: xsr a2, ccompare0
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xf1,0x61]
+#CHECK-TIMER: xsr a2, ccompare1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xf2,0x61]
+#CHECK-TIMER: xsr a2, ccompare2
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Xtensa/Core/processor-control.s b/llvm/test/MC/Xtensa/Core/processor-control.s
index 5b648356fc68..4a37d8346893 100644
--- a/llvm/test/MC/Xtensa/Core/processor-control.s
+++ b/llvm/test/MC/Xtensa/Core/processor-control.s
@@ -20,6 +20,11 @@ esync
 # CHECK: encoding: [0x00,0x20,0x00]
 isync
 
+# Instruction format CALLX
+# CHECK-INST: ill
+# CHECK: encoding: [0x00,0x00,0x00]
+ill
+
 # Instruction format RRR
 # CHECK-INST: nop
 # CHECK: encoding: [0xf0,0x20,0x00]
diff --git a/llvm/test/MC/Xtensa/coprocessor.s b/llvm/test/MC/Xtensa/coprocessor.s
new file mode 100644
index 000000000000..dca8c55fd72c
--- /dev/null
+++ b/llvm/test/MC/Xtensa/coprocessor.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+coprocessor \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, cpenable
+#CHECK: encoding: [0x20,0xe0,0x61]
+xsr a2,cpenable
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, cpenable
+#CHECK: encoding: [0x20,0xe0,0x61]
+xsr.cpenable a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, cpenable
+#CHECK: encoding: [0x20,0xe0,0x61]
+xsr a2, 224
diff --git a/llvm/test/MC/Xtensa/debug-invalid.s b/llvm/test/MC/Xtensa/debug-invalid.s
new file mode 100644
index 000000000000..74f0df9fe814
--- /dev/null
+++ b/llvm/test/MC/Xtensa/debug-invalid.s
@@ -0,0 +1,9 @@
+# RUN: not llvm-mc -triple xtensa --mattr=+debug,+density %s 2>&1 | FileCheck %s
+
+LBL0:
+
+# Out of range immediates
+
+# uimm4
+break 16, 0
+# CHECK: :[[#@LINE-1]]:7: error: expected immediate in range [0, 15]
diff --git a/llvm/test/MC/Xtensa/debug.s b/llvm/test/MC/Xtensa/debug.s
new file mode 100644
index 000000000000..36b1f110d120
--- /dev/null
+++ b/llvm/test/MC/Xtensa/debug.s
@@ -0,0 +1,190 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+debug,+density \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# Instruction format RRR
+# CHECK-INST: break 1, 1
+# CHECK: encoding: [0x10,0x41,0x00]
+break 1, 1
+
+# Instruction format RRRN
+# CHECK-INST: break.n 1
+# CHECK: encoding: [0x2c,0xf1]
+break.n 1
+
+# Instruction format RRR
+# CHECK-INST: lddr32.p a3
+# CHECK: encoding: [0xe0,0x73,0x00]
+lddr32.p a3
+
+# Instruction format RRR
+# CHECK-INST: sddr32.p a3
+# CHECK: encoding: [0xf0,0x73,0x00]
+sddr32.p a3
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icount
+#CHECK: encoding: [0x20,0xec,0x61]
+xsr a2,icount
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icount
+#CHECK: encoding: [0x20,0xec,0x61]
+xsr.icount a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icount
+#CHECK: encoding: [0x20,0xec,0x61]
+xsr a2, 236
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icountlevel
+#CHECK: encoding: [0x20,0xed,0x61]
+xsr a2,icountlevel
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icountlevel
+#CHECK: encoding: [0x20,0xed,0x61]
+xsr.icountlevel a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icountlevel
+#CHECK: encoding: [0x20,0xed,0x61]
+xsr a2, 237
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka0
+#CHECK: encoding: [0x20,0x80,0x61]
+xsr a2,ibreaka0
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka0
+#CHECK: encoding: [0x20,0x80,0x61]
+xsr.ibreaka0 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka0
+#CHECK: encoding: [0x20,0x80,0x61]
+xsr a2, 128
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka1
+#CHECK: encoding: [0x20,0x81,0x61]
+xsr a2,ibreaka1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka1
+#CHECK: encoding: [0x20,0x81,0x61]
+xsr.ibreaka1 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka1
+#CHECK: encoding: [0x20,0x81,0x61]
+xsr a2, 129
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka0
+#CHECK: encoding: [0x20,0x90,0x61]
+xsr a2,dbreaka0
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka0
+#CHECK: encoding: [0x20,0x90,0x61]
+xsr.dbreaka0 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka0
+#CHECK: encoding: [0x20,0x90,0x61]
+xsr a2, 144
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka1
+#CHECK: encoding: [0x20,0x91,0x61]
+xsr a2,dbreaka1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka1
+#CHECK: encoding: [0x20,0x91,0x61]
+xsr.dbreaka1 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka1
+#CHECK: encoding: [0x20,0x91,0x61]
+xsr a2, 145
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc0
+#CHECK: encoding: [0x20,0xa0,0x61]
+xsr a2,dbreakc0
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc0
+#CHECK: encoding: [0x20,0xa0,0x61]
+xsr.dbreakc0 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc0
+#CHECK: encoding: [0x20,0xa0,0x61]
+xsr a2, 160
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc1
+#CHECK: encoding: [0x20,0xa1,0x61]
+xsr a2,dbreakc1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc1
+#CHECK: encoding: [0x20,0xa1,0x61]
+xsr.dbreakc1 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc1
+#CHECK: encoding: [0x20,0xa1,0x61]
+xsr a2, 161
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreakenable
+#CHECK: encoding: [0x20,0x60,0x61]
+xsr a2,ibreakenable
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreakenable
+#CHECK: encoding: [0x20,0x60,0x61]
+xsr.ibreakenable a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreakenable
+#CHECK: encoding: [0x20,0x60,0x61]
+xsr a2, 96
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, debugcause
+#CHECK: encoding: [0x20,0xe9,0x03]
+rsr a2,debugcause
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, debugcause
+#CHECK: encoding: [0x20,0xe9,0x03]
+rsr.debugcause a2
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, debugcause
+#CHECK: encoding: [0x20,0xe9,0x03]
+rsr a2, 233
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ddr
+#CHECK: encoding: [0x20,0x68,0x61]
+xsr a2,ddr
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ddr
+#CHECK: encoding: [0x20,0x68,0x61]
+xsr.ddr a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ddr
+#CHECK: encoding: [0x20,0x68,0x61]
+xsr a2, 104
diff --git a/llvm/test/MC/Xtensa/exception.s b/llvm/test/MC/Xtensa/exception.s
new file mode 100644
index 000000000000..7084ddacf013
--- /dev/null
+++ b/llvm/test/MC/Xtensa/exception.s
@@ -0,0 +1,100 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+exception \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# Instruction format RRR
+# CHECK-INST: excw
+# CHECK: encoding: [0x80,0x20,0x00]
+excw
+
+# Instruction format RRR
+# CHECK-INST: syscall
+# CHECK: encoding: [0x00,0x50,0x00]
+syscall
+
+# Instruction format RRR
+# CHECK-INST: rfe
+# CHECK: encoding: [0x00,0x30,0x00]
+rfe
+
+# Instruction format RRR
+# CHECK-INST: rfde
+# CHECK: encoding: [0x00,0x32,0x00]
+rfde
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, epc1
+# CHECK: encoding: [0x20,0xb1,0x61]
+xsr a2, epc1
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, epc1
+# CHECK: encoding: [0x20,0xb1,0x61]
+xsr.epc1 a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, epc1
+# CHECK: encoding: [0x20,0xb1,0x61]
+xsr a2, 177
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excsave1
+# CHECK: encoding: [0x20,0xd1,0x61]
+xsr a2, excsave1
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excsave1
+# CHECK: encoding: [0x20,0xd1,0x61]
+xsr.excsave1 a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excsave1
+# CHECK: encoding: [0x20,0xd1,0x61]
+xsr a2, 209
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, exccause
+# CHECK: encoding: [0x20,0xe8,0x61]
+xsr a2, exccause
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, exccause
+# CHECK: encoding: [0x20,0xe8,0x61]
+xsr.exccause a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, exccause
+# CHECK: encoding: [0x20,0xe8,0x61]
+xsr a2, 232
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excvaddr
+# CHECK: encoding: [0x20,0xee,0x61]
+xsr a2, excvaddr
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excvaddr
+# CHECK: encoding: [0x20,0xee,0x61]
+xsr.excvaddr a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excvaddr
+# CHECK: encoding: [0x20,0xee,0x61]
+xsr a2, 238
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, depc
+# CHECK: encoding: [0x20,0xc0,0x61]
+xsr a2, depc
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, depc
+# CHECK: encoding: [0x20,0xc0,0x61]
+xsr.depc a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, depc
+# CHECK: encoding: [0x20,0xc0,0x61]
+xsr a2, 192
diff --git a/llvm/test/MC/Xtensa/highinterrupts.s b/llvm/test/MC/Xtensa/highinterrupts.s
new file mode 100644
index 000000000000..4908176b1b03
--- /dev/null
+++ b/llvm/test/MC/Xtensa/highinterrupts.s
@@ -0,0 +1,280 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+highpriinterrupts,+highpriinterrupts-level7 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# Instruction format RRR
+# CHECK-INST: rfi 1
+# CHECK: encoding: [0x10,0x31,0x00]
+rfi 1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc2
+#CHECK: encoding: [0x20,0xb2,0x61]
+xsr a2,epc2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc2
+#CHECK: encoding: [0x20,0xb2,0x61]
+xsr.epc2 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc2
+#CHECK: encoding: [0x20,0xb2,0x61]
+xsr a2, 178
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc3
+#CHECK: encoding: [0x20,0xb3,0x61]
+xsr a2,epc3
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc3
+#CHECK: encoding: [0x20,0xb3,0x61]
+xsr.epc3 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc3
+#CHECK: encoding: [0x20,0xb3,0x61]
+xsr a2, 179
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc4
+#CHECK: encoding: [0x20,0xb4,0x61]
+xsr a2,epc4
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc4
+#CHECK: encoding: [0x20,0xb4,0x61]
+xsr.epc4 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc4
+#CHECK: encoding: [0x20,0xb4,0x61]
+xsr a2, 180
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc5
+#CHECK: encoding: [0x20,0xb5,0x61]
+xsr a2,epc5
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc5
+#CHECK: encoding: [0x20,0xb5,0x61]
+xsr.epc5 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc5
+#CHECK: encoding: [0x20,0xb5,0x61]
+xsr a2, 181
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc6
+#CHECK: encoding: [0x20,0xb6,0x61]
+xsr a2,epc6
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc6
+#CHECK: encoding: [0x20,0xb6,0x61]
+xsr.epc6 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc6
+#CHECK: encoding: [0x20,0xb6,0x61]
+xsr a2, 182
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc7
+#CHECK: encoding: [0x20,0xb7,0x61]
+xsr a2,epc7
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc7
+#CHECK: encoding: [0x20,0xb7,0x61]
+xsr.epc7 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc7
+#CHECK: encoding: [0x20,0xb7,0x61]
+xsr a2, 183
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps2
+#CHECK: encoding: [0x20,0xc2,0x61]
+xsr a2,eps2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps2
+#CHECK: encoding: [0x20,0xc2,0x61]
+xsr.eps2 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps2
+#CHECK: encoding: [0x20,0xc2,0x61]
+xsr a2, 194
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps3
+#CHECK: encoding: [0x20,0xc3,0x61]
+xsr a2,eps3
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps3
+#CHECK: encoding: [0x20,0xc3,0x61]
+xsr.eps3 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps3
+#CHECK: encoding: [0x20,0xc3,0x61]
+xsr a2, 195
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps4
+#CHECK: encoding: [0x20,0xc4,0x61]
+xsr a2,eps4
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps4
+#CHECK: encoding: [0x20,0xc4,0x61]
+xsr.eps4 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps4
+#CHECK: encoding: [0x20,0xc4,0x61]
+xsr a2, 196
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps5
+#CHECK: encoding: [0x20,0xc5,0x61]
+xsr a2,eps5
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps5
+#CHECK: encoding: [0x20,0xc5,0x61]
+xsr.eps5 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps5
+#CHECK: encoding: [0x20,0xc5,0x61]
+xsr a2, 197
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps6
+#CHECK: encoding: [0x20,0xc6,0x61]
+xsr a2,eps6
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps6
+#CHECK: encoding: [0x20,0xc6,0x61]
+xsr.eps6 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps6
+#CHECK: encoding: [0x20,0xc6,0x61]
+xsr a2, 198
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps7
+#CHECK: encoding: [0x20,0xc7,0x61]
+xsr a2,eps7
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps7
+#CHECK: encoding: [0x20,0xc7,0x61]
+xsr.eps7 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps7
+#CHECK: encoding: [0x20,0xc7,0x61]
+xsr a2, 199
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave2
+#CHECK: encoding: [0x20,0xd2,0x61]
+xsr a2,excsave2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave2
+#CHECK: encoding: [0x20,0xd2,0x61]
+xsr.excsave2 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave2
+#CHECK: encoding: [0x20,0xd2,0x61]
+xsr a2, 210
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave3
+#CHECK: encoding: [0x20,0xd3,0x61]
+xsr a2,excsave3
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave3
+#CHECK: encoding: [0x20,0xd3,0x61]
+xsr.excsave3 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave3
+#CHECK: encoding: [0x20,0xd3,0x61]
+xsr a2, 211
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave4
+#CHECK: encoding: [0x20,0xd4,0x61]
+xsr a2,excsave4
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave4
+#CHECK: encoding: [0x20,0xd4,0x61]
+xsr.excsave4 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave4
+#CHECK: encoding: [0x20,0xd4,0x61]
+xsr a2, 212
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave5
+#CHECK: encoding: [0x20,0xd5,0x61]
+xsr a2,excsave5
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave5
+#CHECK: encoding: [0x20,0xd5,0x61]
+xsr.excsave5 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave5
+#CHECK: encoding: [0x20,0xd5,0x61]
+xsr a2, 213
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave6
+#CHECK: encoding: [0x20,0xd6,0x61]
+xsr a2,excsave6
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave6
+#CHECK: encoding: [0x20,0xd6,0x61]
+xsr.excsave6 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave6
+#CHECK: encoding: [0x20,0xd6,0x61]
+xsr a2, 214
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave7
+#CHECK: encoding: [0x20,0xd7,0x61]
+xsr a2,excsave7
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave7
+#CHECK: encoding: [0x20,0xd7,0x61]
+xsr.excsave7 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave7
+#CHECK: encoding: [0x20,0xd7,0x61]
+xsr a2, 215
diff --git a/llvm/test/MC/Xtensa/interrupt.s b/llvm/test/MC/Xtensa/interrupt.s
new file mode 100644
index 000000000000..cb1b82dbfe5a
--- /dev/null
+++ b/llvm/test/MC/Xtensa/interrupt.s
@@ -0,0 +1,60 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+interrupt \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# Instruction format RRR
+# CHECK-INST: rsil a2, 1
+# CHECK: encoding: [0x20,0x61,0x00]
+rsil a2, 1
+
+# Instruction format RRR
+# CHECK-INST: waiti 1
+# CHECK: encoding: [0x00,0x71,0x00]
+waiti 1
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, interrupt
+#CHECK: encoding: [0x20,0xe2,0x03]
+rsr a2, interrupt
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, interrupt
+#CHECK: encoding: [0x20,0xe2,0x03]
+rsr.interrupt a2
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, interrupt
+#CHECK: encoding: [0x20,0xe2,0x03]
+rsr a2, 226
+
+#Instruction format RRR
+#CHECK-INST: wsr a2, intclear
+#CHECK: encoding: [0x20,0xe3,0x13]
+wsr a2, intclear
+
+#Instruction format RRR
+#CHECK-INST: wsr a2, intclear
+#CHECK: encoding: [0x20,0xe3,0x13]
+wsr.intclear a2
+
+#Instruction format RRR
+#CHECK-INST: wsr a2, intclear
+#CHECK: encoding: [0x20,0xe3,0x13]
+wsr a2, 227
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, intenable
+#CHECK: encoding: [0x20,0xe4,0x61]
+xsr a2, intenable
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, intenable
+#CHECK: encoding: [0x20,0xe4,0x61]
+xsr.intenable a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, intenable
+#CHECK: encoding: [0x20,0xe4,0x61]
+xsr a2, 228
diff --git a/llvm/test/MC/Xtensa/prid.s b/llvm/test/MC/Xtensa/prid.s
new file mode 100644
index 000000000000..75fcc151e8ef
--- /dev/null
+++ b/llvm/test/MC/Xtensa/prid.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+prid \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, prid
+#CHECK: encoding: [0x20,0xeb,0x03]
+rsr a2,prid
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, prid
+#CHECK: encoding: [0x20,0xeb,0x03]
+rsr.prid a2
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, prid
+#CHECK: encoding: [0x20,0xeb,0x03]
+rsr a2, 235
diff --git a/llvm/test/MC/Xtensa/timer.s b/llvm/test/MC/Xtensa/timer.s
new file mode 100644
index 000000000000..f1fc9709cdec
--- /dev/null
+++ b/llvm/test/MC/Xtensa/timer.s
@@ -0,0 +1,65 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+timers3 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccount
+#CHECK: encoding: [0x20,0xea,0x61]
+xsr a2,ccount
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccount
+#CHECK: encoding: [0x20,0xea,0x61]
+xsr.ccount a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccount
+#CHECK: encoding: [0x20,0xea,0x61]
+xsr a2, 234
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare0
+#CHECK: encoding: [0x20,0xf0,0x61]
+xsr a2,ccompare0
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare0
+#CHECK: encoding: [0x20,0xf0,0x61]
+xsr.ccompare0 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare0
+#CHECK: encoding: [0x20,0xf0,0x61]
+xsr a2, 240
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare1
+#CHECK: encoding: [0x20,0xf1,0x61]
+xsr a2,ccompare1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare1
+#CHECK: encoding: [0x20,0xf1,0x61]
+xsr.ccompare1 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare1
+#CHECK: encoding: [0x20,0xf1,0x61]
+xsr a2, 241
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare2
+#CHECK: encoding: [0x20,0xf2,0x61]
+xsr a2,ccompare2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare2
+#CHECK: encoding: [0x20,0xf2,0x61]
+xsr.ccompare2 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare2
+#CHECK: encoding: [0x20,0xf2,0x61]
+xsr a2, 242

From 15482c83aa2b05779d7ad947c34835656ab9da1c Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Tue, 17 Jun 2025 19:58:24 -0400
Subject: [PATCH 0742/1322] [ElimAvailExtern] Add an option to allow to convert
 global variables in a specified address space to local (#144287)

Currently, the `EliminateAvailableExternallyPass` only converts certain
available externally functions to local if `avail-extern-to-local` is
set or in
contextual profiling mode. For global variables, it only drops their
initializers.

This PR adds an option to allow the pass to convert global variables in
a
specified address space to local. The motivation for this change is to
correctly
support lowering of LDS variables (`__shared__` variables, in more
generic
terminology) when ThinLTO is enabled for AMDGPU.

A `__shared__` variable is lowered to a hidden global variable in a
particular
address space by the frontend, which is roughly same as a `static` local
variable. To properly lower it in the backend, the compiler needs to
check all
its uses. Enabling ThinLTO currently breaks this when a function
containing a
`__shared__` variable is imported from another module. Even though the
global
variable is imported along with its associated function, and the
function is
privatized by the `EliminateAvailableExternallyPass`, the global
variable itself
is not.

It's safe to privatize such global variables, because they're _local_ to
their
associated functions. If the function itself is privatized, its
associated
global variables should also be privatized accordingly.
---
 llvm/lib/Transforms/IPO/ElimAvailExtern.cpp   | 36 ++++++++++++++++---
 .../convert-global-variables-to-local.ll      | 21 +++++++++++
 2 files changed, 53 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll

diff --git a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
index 718452fc0276..bc98f994f490 100644
--- a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
+++ b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -35,8 +35,15 @@ static cl::opt<bool> ConvertToLocal(
     cl::desc("Convert available_externally into locals, renaming them "
              "to avoid link-time clashes."));
 
+static cl::opt<unsigned> ConvertGlobalVariableInAddrSpace(
+    "avail-extern-gv-in-addrspace-to-local", cl::Hidden,
+    cl::desc(
+        "Convert available_externally global variables into locals if they are "
+        "in specificed addrspace, renaming them to avoid link-time clashes."));
+
 STATISTIC(NumRemovals, "Number of functions removed");
-STATISTIC(NumConversions, "Number of functions converted");
+STATISTIC(NumFunctionsConverted, "Number of functions converted");
+STATISTIC(NumGlobalVariablesConverted, "Number of global variables converted");
 STATISTIC(NumVariables, "Number of global variables removed");
 
 void deleteFunction(Function &F) {
@@ -45,6 +52,10 @@ void deleteFunction(Function &F) {
   ++NumRemovals;
 }
 
+static std::string getNewName(Module &M, const GlobalValue &GV) {
+  return GV.getName().str() + ".__uniq" + getUniqueModuleId(&M);
+}
+
 /// Create a copy of the thinlto import, mark it local, and redirect direct
 /// calls to the copy. Only direct calls are replaced, so that e.g. indirect
 /// call function pointer tests would use the global identity of the function.
@@ -68,7 +79,7 @@ static void convertToLocalCopy(Module &M, Function &F) {
   // functions with the same name, but that just creates more trouble than
   // necessary e.g. distinguishing profiles or debugging. Instead, we append the
   // module identifier.
-  auto NewName = OrigName + ".__uniq" + getUniqueModuleId(&M);
+  std::string NewName = getNewName(M, F);
   F.setName(NewName);
   if (auto *SP = F.getSubprogram())
     SP->replaceLinkageName(MDString::get(F.getParent()->getContext(), NewName));
@@ -85,16 +96,33 @@ static void convertToLocalCopy(Module &M, Function &F) {
                        F.getAddressSpace(), OrigName, F.getParent());
   F.replaceUsesWithIf(Decl,
                       [&](Use &U) { return !isa<CallBase>(U.getUser()); });
-  ++NumConversions;
+  ++NumFunctionsConverted;
+}
+
+/// Similar to the function above, this is to convert an externally available
+/// global variable to local.
+static void convertToLocalCopy(Module &M, GlobalVariable &GV) {
+  assert(GV.hasAvailableExternallyLinkage());
+  GV.setName(getNewName(M, GV));
+  GV.setLinkage(GlobalValue::InternalLinkage);
+  ++NumGlobalVariablesConverted;
 }
 
 static bool eliminateAvailableExternally(Module &M, bool Convert) {
   bool Changed = false;
 
-  // Drop initializers of available externally global variables.
+  // If a global variable is available externally and in the specified address
+  // space, convert it to local linkage; otherwise, drop its initializer.
   for (GlobalVariable &GV : M.globals()) {
     if (!GV.hasAvailableExternallyLinkage())
       continue;
+    if (ConvertGlobalVariableInAddrSpace.getNumOccurrences() &&
+        GV.getAddressSpace() == ConvertGlobalVariableInAddrSpace &&
+        !GV.use_empty()) {
+      convertToLocalCopy(M, GV);
+      Changed = true;
+      continue;
+    }
     if (GV.hasInitializer()) {
       Constant *Init = GV.getInitializer();
       GV.setInitializer(nullptr);
diff --git a/llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll b/llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll
new file mode 100644
index 000000000000..6995b97e7988
--- /dev/null
+++ b/llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -passes=elim-avail-extern -avail-extern-gv-in-addrspace-to-local=3 %s -o - | FileCheck %s
+
+@shared = internal addrspace(3) global i32 undef, align 4
+@shared.imported = available_externally hidden unnamed_addr addrspace(3) global i32 undef, align 4
+
+;.
+; CHECK: @shared = internal addrspace(3) global i32 undef, align 4
+; CHECK: @shared.imported.__uniq.[[UUID:.*]] = internal unnamed_addr addrspace(3) global i32 undef, align 4
+;.
+define void @foo(i32 %v) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i32 [[V:%.*]]) {
+; CHECK-NEXT:    store i32 [[V]], ptr addrspace(3) @shared, align 4
+; CHECK-NEXT:    store i32 [[V]], ptr addrspace(3) @shared.imported.__uniq.[[UUID]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 %v, ptr addrspace(3) @shared, align 4
+  store i32 %v, ptr addrspace(3) @shared.imported, align 4
+  ret void
+}

From 64155a32297f4884875783664ff13bec9ab376f5 Mon Sep 17 00:00:00 2001
From: Minding <77574923+Minding000@users.noreply.github.com>
Date: Wed, 18 Jun 2025 02:09:07 +0200
Subject: [PATCH 0743/1322] Added clarifying comment to 'LLVMLinkInMCJIT' and
 'LLVMLinkInInterpreter' (#92467)

Clarify that these functions are no-ops when linking to LLVM as a shared object.
---
 llvm/include/llvm-c/ExecutionEngine.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llvm/include/llvm-c/ExecutionEngine.h b/llvm/include/llvm-c/ExecutionEngine.h
index c5fc9bdb4d07..2062cbf470d8 100644
--- a/llvm/include/llvm-c/ExecutionEngine.h
+++ b/llvm/include/llvm-c/ExecutionEngine.h
@@ -33,7 +33,15 @@ LLVM_C_EXTERN_C_BEGIN
  * @{
  */
 
+/**
+ * Empty function used to force the linker to link MCJIT.
+ * Has no effect when called on a pre-built library (dylib interface).
+ */
 void LLVMLinkInMCJIT(void);
+/**
+ * Empty function used to force the linker to link the LLVM interpreter.
+ * Has no effect when called on a pre-built library (dylib interface).
+ */
 void LLVMLinkInInterpreter(void);
 
 typedef struct LLVMOpaqueGenericValue *LLVMGenericValueRef;

From abbdd1670d8b12dd72ec353b14e256619ff4694b Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Tue, 17 Jun 2025 17:21:40 -0700
Subject: [PATCH 0744/1322] [llvm] minor fixes for clang-cl Windows DLL build
 (#144386)

## Purpose

This patch makes a minor changes to LLVM and Clang so that LLVM can be
built as a Windows DLL with `clang-cl`. These changes were not required
for building a Windows DLL with MSVC.

## Background

The Windows DLL effort is tracked in #109483. Additional context is
provided in [this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

## Overview
Specific changes made in this patch:
- Remove `constexpr` fields that reference DLL exported symbols. These
symbols cannot be resolved at compile time when building a Windows DLL
using `clang-cl`, so they cannot be `constexpr`. Instead, they are made
`const` and initialized in the implementation file rather than at
declaration in the header.
- Annotate symbols now defined out-of-line with `LLVM_ABI` so they are
exported when building as a shared library.
- Explicitly add default copy assignment operator for `ELFFile` to
resolve a compiler warning.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 .../lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp  |  2 +-
 llvm/include/llvm/BinaryFormat/Dwarf.h               | 12 ++++++------
 llvm/include/llvm/Object/ELF.h                       |  4 +++-
 llvm/lib/BinaryFormat/Dwarf.cpp                      | 12 ++++++++++++
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp b/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp
index 836fc375809a..f965bfb590d8 100644
--- a/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp
+++ b/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp
@@ -92,7 +92,7 @@ void Z3CrosscheckVisitor::finalizeVisitor(BugReporterContext &BRC,
   };
 
   auto AttemptOnce = [&](const llvm::SMTSolverRef &Solver) -> Z3Result {
-    constexpr auto getCurrentTime = llvm::TimeRecord::getCurrentTime;
+    auto getCurrentTime = llvm::TimeRecord::getCurrentTime;
     unsigned InitialRLimit = GetUsedRLimit(Solver);
     double Start = getCurrentTime(/*Start=*/true).getWallTime();
     std::optional<bool> IsSAT = Solver->check();
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h
index 2ead62025efa..231b7ac17d75 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.h
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.h
@@ -1191,32 +1191,32 @@ template <typename Enum> struct EnumTraits : public std::false_type {};
 
 template <> struct EnumTraits<Attribute> : public std::true_type {
   static constexpr char Type[3] = "AT";
-  static constexpr StringRef (*StringFn)(unsigned) = &AttributeString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<Form> : public std::true_type {
   static constexpr char Type[5] = "FORM";
-  static constexpr StringRef (*StringFn)(unsigned) = &FormEncodingString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<Index> : public std::true_type {
   static constexpr char Type[4] = "IDX";
-  static constexpr StringRef (*StringFn)(unsigned) = &IndexString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<Tag> : public std::true_type {
   static constexpr char Type[4] = "TAG";
-  static constexpr StringRef (*StringFn)(unsigned) = &TagString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<LineNumberOps> : public std::true_type {
   static constexpr char Type[4] = "LNS";
-  static constexpr StringRef (*StringFn)(unsigned) = &LNStandardString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<LocationAtom> : public std::true_type {
   static constexpr char Type[3] = "OP";
-  static constexpr StringRef (*StringFn)(unsigned) = &OperationEncodingString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 inline uint64_t computeTombstoneAddress(uint8_t AddressByteSize) {
diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h
index a0dc522e13ca..8d7545144dfd 100644
--- a/llvm/include/llvm/Object/ELF.h
+++ b/llvm/include/llvm/Object/ELF.h
@@ -256,8 +256,10 @@ class ELFFile {
 public:
   LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
 
-  // Default ctor required to instantiate the template for DLL export.
+  // Default ctor and copy assignment operator required to instantiate the
+  // template for DLL export.
   ELFFile(const ELFFile &) = default;
+  ELFFile &operator=(const ELFFile &) = default;
 
   // This is a callback that can be passed to a number of functions.
   // It can be used to ignore non-critical errors (warnings), which is
diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp
index b9b10a541b26..0d17dc175fed 100644
--- a/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -911,6 +911,18 @@ StringRef llvm::dwarf::RLEString(unsigned RLE) {
   }
 }
 
+StringRef (*const llvm::dwarf::EnumTraits<Tag>::StringFn)(unsigned) = TagString;
+StringRef (*const llvm::dwarf::EnumTraits<Attribute>::StringFn)(unsigned) =
+    AttributeString;
+StringRef (*const llvm::dwarf::EnumTraits<Form>::StringFn)(unsigned) =
+    FormEncodingString;
+StringRef (*const llvm::dwarf::EnumTraits<LocationAtom>::StringFn)(unsigned) =
+    OperationEncodingString;
+StringRef (*const llvm::dwarf::EnumTraits<LineNumberOps>::StringFn)(unsigned) =
+    LNStandardString;
+StringRef (*const llvm::dwarf::EnumTraits<Index>::StringFn)(unsigned) =
+    IndexString;
+
 constexpr char llvm::dwarf::EnumTraits<Attribute>::Type[];
 constexpr char llvm::dwarf::EnumTraits<Form>::Type[];
 constexpr char llvm::dwarf::EnumTraits<Index>::Type[];

From 99e263228f4513c166f20469968b2b646edaaa33 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 09:28:24 +0900
Subject: [PATCH 0745/1322] github: Add mips backend to PR autolabeler
 (#140909)

---
 .github/new-prs-labeler.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index 162161ff13fb..2f8d5745668d 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -777,6 +777,10 @@ backend:NVPTX:
   - 'llvm/**/*nvptx*/**'
   - 'llvm/**/*NVPTX*/**'
 
+backend:MIPS:
+  - '**/*mips*'
+  - '**/*Mips*'
+
 backend:RISC-V:
   - clang/**/*riscv*
   - clang/**/*RISCV*

From 4e090b6e84e33e2a442e3951253ca570f8f842f8 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda@apple.com>
Date: Tue, 17 Jun 2025 17:34:09 -0700
Subject: [PATCH 0746/1322] [lldb] Re-insert code to search for a binary by
 filepath if provided

July 14 2024 I landed a change to update progress reporting when
loading kernel/firmware binaries
https://github.com/llvm/llvm-project/pull/98845
In DynamicLoader::LoadBinaryWithUUIDAndAddress I removed code that
was setting the ModuleSpec to the provided name, if the name provided
is that of a file on disk.  With this code missing, if a filepath
name is passed in, this code will fail to find that binary on the local
disk.  There's nothing in the PR / intention that would lead to this
change, it was unintentional.
---
 lldb/source/Core/DynamicLoader.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp
index 291e6b73a2c3..4be9f3eb9abc 100644
--- a/lldb/source/Core/DynamicLoader.cpp
+++ b/lldb/source/Core/DynamicLoader.cpp
@@ -229,6 +229,8 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress(
   ModuleSpec module_spec;
   module_spec.GetUUID() = uuid;
   FileSpec name_filespec(name);
+  if (FileSystem::Instance().Exists(name_filespec))
+    module_spec.GetFileSpec() = name_filespec;
 
   if (uuid.IsValid()) {
     Progress progress("Locating binary", prog_str.GetString().str());

From 86a09f36154fbd264f61ea6462c8cf48b1ff2eb0 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li@intel.com>
Date: Tue, 17 Jun 2025 17:48:09 -0700
Subject: [PATCH 0747/1322] [MLIR][XeGPU] Clean up xegpu op tests  (#144592)

Test cleanup:
1) separate layout.mlir from ops.mlir for layout related test
2) remove lane layout for ops working at work item scope.
3) remove redundant test in create_tdesc/update_tdesc/prefetch.
4) remove "test_" from all test function name.
---
 mlir/test/Dialect/XeGPU/invalid.mlir          |  96 ++---
 mlir/test/Dialect/XeGPU/layout.mlir           |  49 +++
 mlir/test/Dialect/XeGPU/ops.mlir              | 355 +++++-------------
 .../XeGPU/subgroup-map-propagation.mlir       |  72 ++--
 mlir/test/Dialect/XeGPU/xegpu-blocking.mlir   |  12 +-
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir |  32 +-
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   |  40 +-
 7 files changed, 275 insertions(+), 381 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/layout.mlir

diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index b05c317231ad..0a37ae70b5d9 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s -split-input-file -verify-diagnostics
 
 // -----
-func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) {
+func.func @create_nd_tdesc_vc_1(%src: memref<24xf32>) {
   // expected-error@+1 {{Expecting the TensorDesc rank is up to 2 and not greater than the ranks of shape, strides, offsets or the memref source}}
   %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32>
   return
@@ -9,49 +9,49 @@ func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) {
 
 // -----
 
-func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) {
+func.func @create_nd_tdesc_vc_2(%src: memref<24x32xf32>) {
   // expected-error@+1 {{TensorDesc should have the same element type with the source if it is a memref}}
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) {
+func.func @create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) {
   // expected-error@+1 {{SLM is not supported for 2D block tensor}}
   %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) {
+func.func @create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) {
   // expected-error@+1 {{Memory space mismatch}}
   %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
+func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
   // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout<sg_layout = [4, 2], sg_data = [24, 48]>}}
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [24, 48]>>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
+func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
   // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [24, 48]>}}
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [24, 48]>>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
+func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
   // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [64, 32]>}}
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [64, 32]>>
   return
 }
 
 // -----
-func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) {
+func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
   xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<8x16xf16>
@@ -59,7 +59,7 @@ func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) {
+func.func @prefetch_nd_vc_2(%src: memref<24xf16>) {
   %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<24xf16>, vector<8xindex>
                 -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>>
@@ -70,7 +70,7 @@ func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) {
 }
 
 // -----
-func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) {
+func.func @load_nd_vc_1(%src: memref<8x16xf16>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<write_back>}>
@@ -79,7 +79,7 @@ func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) {
 }
 
 // -----
-func.func @test_load_nd_vc_2(%src: memref<16xf16>) {
+func.func @load_nd_vc_2(%src: memref<16xf16>) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<16xf16>, vector<8xindex>
           -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
@@ -90,7 +90,7 @@ func.func @test_load_nd_vc_2(%src: memref<16xf16>) {
 }
 
 // -----
-func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
+func.func @load_nd_vc_3(%src: memref<8x16xf16>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   // expected-warning@+1 {{Invalid Packed Attr.}}
   %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
@@ -99,7 +99,7 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
 }
 
 // -----
-func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) {
+func.func @load_nd_vc_4(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
     !xegpu.tensor_desc<8x16xf32>
   // expected-error@+1 {{Result shape [8, 1] is not consistent with tensor descriptor}}
@@ -110,7 +110,7 @@ func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) {
 }
 
 // -----
-func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
+func.func @load_nd_layout(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
   // expected-error@+1 {{Result shape [3] is not a valid distribution for tensor descriptor}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
@@ -119,7 +119,7 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
 }
 
 // -----
-func.func @test_load_nd_simt(%src: memref<24x32xf32>) {
+func.func @load_nd_simt(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}}
   %2 = xegpu.load_nd %1 : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8xf32>
@@ -127,7 +127,7 @@ func.func @test_load_nd_simt(%src: memref<24x32xf32>) {
 }
 
 // -----
-func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) {
+func.func @store_nd_vc_1(%dst: memref<24x32xf16>) {
   %1 = arith.constant dense<1.0>: vector<24x32xf16>
   %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
   // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint<streaming>}}
@@ -136,7 +136,7 @@ func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
+func.func @store_nd_vc_2(%dst: memref<16xf16>) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   %1 = arith.constant dense<1.0>: vector<8x2xf16>
   %2 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
@@ -148,7 +148,7 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
 }
 
 // -----
-func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) {
+func.func @store_nd_vc_3(%dst: memref<24x32xf16>) {
   %1 = arith.constant dense<1.0>: vector<2x24x32xf16>
   %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // expected-error@+1 {{array length is not supported by store_nd}}
@@ -157,7 +157,7 @@ func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
+func.func @store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
   // expected-error@+1 {{Value shape [3] is not a valid distribution for tensor descriptor}}
   xegpu.store_nd %data, %1 : vector<3xf32>, !xegpu.tensor_desc<16xf32>
@@ -165,7 +165,7 @@ func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
 }
 
 // -----
-func.func @test_store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) {
+func.func @store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}}
   xegpu.store_nd %data, %1 : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -173,7 +173,7 @@ func.func @test_store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) {
 }
 
 // -----
-func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) {
+func.func @store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
     !xegpu.tensor_desc<8x16xf32>
   // expected-error@+1 {{Value shape [8, 1] is not consistent with tensor descriptor}}
@@ -182,7 +182,7 @@ func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) {
 }
 
 // -----
-func.func @test_update_nd_offset_1(%dst: memref<16xf16>) {
+func.func @update_nd_offset_1(%dst: memref<16xf16>) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   %1 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
             -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
@@ -192,7 +192,7 @@ func.func @test_update_nd_offset_1(%dst: memref<16xf16>) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_1(%src: ui64) {
+func.func @create_tdesc_vc_1(%src: ui64) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   // expected-error@+1 {{Expects a scattered TensorDesc}}
   %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> -> !xegpu.tensor_desc<8xf16>
@@ -200,7 +200,7 @@ func.func @test_create_tdesc_vc_1(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_2(%src: ui64) {
+func.func @create_tdesc_vc_2(%src: ui64) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex>
   // expected-error@+1 {{expected chunk blocks for 2D tensor}}
@@ -209,7 +209,7 @@ func.func @test_create_tdesc_vc_2(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_3(%src: memref<?xf32>) {
+func.func @create_tdesc_vc_3(%src: memref<?xf32>) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error@+1 {{Memory space mismatch}}
   %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
@@ -218,7 +218,7 @@ func.func @test_create_tdesc_vc_3(%src: memref<?xf32>) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_4(%src: memref<?xf32>) {
+func.func @create_tdesc_vc_4(%src: memref<?xf32>) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
   // expected-error@+1 {{invalid chunk size}}
@@ -227,7 +227,7 @@ func.func @test_create_tdesc_vc_4(%src: memref<?xf32>) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_5(%src: memref<?xf32>) {
+func.func @create_tdesc_vc_5(%src: memref<?xf32>) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
   // expected-error@+1 {{expected tensor shape[1] to match chunk size}}
@@ -236,7 +236,7 @@ func.func @test_create_tdesc_vc_5(%src: memref<?xf32>) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_6(%src: memref<?xf16>) {
+func.func @create_tdesc_vc_6(%src: memref<?xf16>) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<?xf16>, vector<4xindex>
   // expected-error@+1 {{tensor shape[1] to be a multiple of packing factor 2}}
@@ -246,7 +246,7 @@ func.func @test_create_tdesc_vc_6(%src: memref<?xf16>) {
 
 
 // -----
-func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) {
+func.func @prefetch_vc_1(%src: memref<24x32xf16>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
   // expected-error@+1 {{Expects a scattered TensorDesc}}
   xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<24x32xf16>
@@ -254,7 +254,7 @@ func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_prefetch_vc_2(%src: ui64) {
+func.func @prefetch_vc_2(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex>
           -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
@@ -264,7 +264,7 @@ func.func @test_prefetch_vc_2(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_layout_1(%src: ui64) {
+func.func @create_tdesc_layout_1(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error@+1 {{expected layout rank to match tensor rank}}
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>,   #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
@@ -272,7 +272,7 @@ func.func @test_create_tdesc_layout_1(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_layout_2(%src: ui64) {
+func.func @create_tdesc_layout_2(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error@+1 {{cannot map over non-contiguous scattered row elements}}
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<lane_layout = [1, 4], lane_data = [2, 1]>>
@@ -280,7 +280,7 @@ func.func @test_create_tdesc_layout_2(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_layout_3(%src: ui64) {
+func.func @create_tdesc_layout_3(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error@+1 {{work item data mapping must match the number of contiguous elements}}
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>,   #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
@@ -288,7 +288,7 @@ func.func @test_create_tdesc_layout_3(%src: ui64) {
 }
 
 // -----
-func.func @test_load_gather_simt_1(%src: ui64) {
+func.func @load_gather_simt_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
@@ -298,7 +298,7 @@ func.func @test_load_gather_simt_1(%src: ui64) {
 }
 
 // -----
-func.func @test_store_scatter_simt_1(%src: ui64) {
+func.func @store_scatter_simt_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %val = arith.constant dense<2.9>: vector<6xf32>
@@ -309,7 +309,7 @@ func.func @test_store_scatter_simt_1(%src: ui64) {
 }
 
 // -----
-func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) {
+func.func @load_gather_vc_1(%src: memref<24x32xf16>) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16>
   // expected-error@+1 {{Expects a scattered TensorDesc}}
@@ -319,7 +319,7 @@ func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_load_gather_vc_2(%src: ui64) {
+func.func @load_gather_vc_2(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<1>: vector<4xi1>
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
@@ -332,7 +332,7 @@ func.func @test_load_gather_vc_2(%src: ui64) {
 }
 
 // -----
-func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) {
+func.func @store_scatter_vc_1(%src: memref<24x32xf32>) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %1 = arith.constant dense<2.9>: vector<4x2xf32>
   %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32>
@@ -343,7 +343,7 @@ func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) {
 }
 
 // -----
-func.func @test_store_scatter_vc_2(%src: ui64) {
+func.func @store_scatter_vc_2(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex>
   %0 = arith.constant dense<1>: vector<4xi1>
   %1 = arith.constant dense<2.9>: vector<4x2xf32>
@@ -356,49 +356,49 @@ func.func @test_store_scatter_vc_2(%src: ui64) {
 }
 
 // -----
-func.func @test_dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
+func.func @dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
   // expected-error@+1 {{K-dimension mismatch}}
   %1 = xegpu.dpas %a, %b : vector<8x8xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) {
+func.func @dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) {
   // expected-error@+1 {{expecting lhs and result to be a 2D vector, and rhs to be either 2D or 3D (packed) vector}}
   %1 = xegpu.dpas %a, %b : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_3(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
+func.func @dpas_3(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
   // expected-error@+1 {{K-dimension mismatch}}
   %1 = xegpu.dpas %a, %b : vector<8x8xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) {
+func.func @dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) {
   // expected-error@+1 {{M-dimension mismatch}}
   %1 = xegpu.dpas %a, %b : vector<16x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_5(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
+func.func @dpas_5(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
   // expected-error@+1 {{N-dimension mismatch}}
   %1 = xegpu.dpas %a, %b : vector<8x16xf16>, vector<8x8x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) {
+func.func @dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) {
   // expected-error@+1 {{Expecting B operand to be a multiple of 32 bits}}
   %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<15xf16> -> vector<8xf32>
   return
 }
 
 // -----
-func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
+func.func @atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
   %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
   %1 = xegpu.create_tdesc %src, %0 : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
   // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same shape}}
@@ -512,7 +512,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
 }
 
 // -----
-func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
+func.func @convert_layout_same_map(%a: vector<32x64xf16>) {
   // expected-error@+1 {{expected different srcMap and resMap}}
   %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
                                 resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
@@ -520,7 +520,7 @@ func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
 }
 
 // -----
-func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
+func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
   // expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
   %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
                                 resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
new file mode 100644
index 000000000000..7f3ebec225cd
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -0,0 +1,49 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) {
+gpu.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
+  gpu.return
+}
+
+// CHECK: gpu.func @create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) {
+gpu.func @create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
+  gpu.return
+}
+
+// CHECK: gpu.func @create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) {
+gpu.func @create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
+
+// CHECK: gpu.func @create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
+  gpu.return
+}
+
+gpu.func @convert_layout(%a: vector<32x64xf16>) {
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+                                resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+  gpu.return
+}
+
+gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+                                resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+  gpu.return
+}
+
+}
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 76af59d6aedc..054c4d12fdb2 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -6,23 +6,15 @@
 
 // CHECK-LABEL: gpu.module @test {
 gpu.module @test {
-// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+// CHECK: gpu.func @create_nd_tdesc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
-gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+// CHECK: gpu.func @create_nd_tdesc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
   //CHECK: %[[C:.*]] = arith.constant 1 : index
   %c1 = arith.constant 1 : index
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
@@ -30,94 +22,41 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
-gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
-  //CHECK: %[[C:.*]] = arith.constant 1 : index
-  %c1 = arith.constant 1 : index
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+// CHECK: gpu.func @create_nd_tdesc_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_vc_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
+// CHECK: gpu.func @create_nd_tdesc_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
+gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
-gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_vc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
-gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
+// CHECK: gpu.func @create_nd_tdesc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
+gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
   %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
-gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_vc_6(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
+// CHECK: gpu.func @create_nd_tdesc_6(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @create_nd_tdesc_6(%src: memref<24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) {
-gpu.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) {
-gpu.func @test_create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) {
-gpu.func @test_create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
+// CHECK: gpu.func @prefetch_nd(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @prefetch_nd(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
@@ -125,17 +64,9 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+// CHECK: func @subgroup_load_nd(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @subgroup_load_nd(%src: memref<8x16xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
@@ -144,8 +75,8 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
+// CHECK: func @simt_load_nd(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @simt_load_nd(%src: memref<8x16xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
@@ -154,8 +85,8 @@ gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_2(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
+// CHECK: func @subgroup_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @subgroup_load_nd_2(%src: memref<8x16xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
@@ -163,8 +94,8 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
+// CHECK: func @simt_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @simt_load_nd_2(%src: memref<8x16xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
@@ -172,8 +103,8 @@ gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
+// CHECK: func @subgroup_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @subgroup_load_nd_3(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
@@ -181,8 +112,8 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
+// CHECK: func @simt_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @simt_load_nd_3(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
@@ -190,8 +121,8 @@ gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_4(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
+// CHECK: func @subgroup_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_load_nd_4(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
@@ -199,8 +130,8 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
+// CHECK: func @simt_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_load_nd_4(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
@@ -208,8 +139,8 @@ gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_5(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
+// CHECK: func @subgroup_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @subgroup_load_nd_5(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32>
@@ -217,8 +148,8 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
+// CHECK: func @simt_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @simt_load_nd_5(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
@@ -226,8 +157,8 @@ gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_6(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
+// CHECK: func @subgroup_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_load_nd_6(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
@@ -235,8 +166,8 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
+// CHECK: func @simt_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_load_nd_6(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
@@ -245,8 +176,8 @@ gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_7(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
+// CHECK: func @subgroup_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_load_nd_7(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x8x16x2xf16>
@@ -254,8 +185,8 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
+// CHECK: func @simt_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_load_nd_7(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
@@ -264,8 +195,8 @@ gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_8(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
+// CHECK: func @subgroup_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @subgroup_load_nd_8(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
@@ -273,8 +204,8 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
+// CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
@@ -282,8 +213,8 @@ gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+// CHECK: func @subgroup_store_nd(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_store_nd(%dst: memref<24x32xf16>) {
   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
   %1 = arith.constant dense<1.0>: vector<24x32xf16>
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
@@ -293,8 +224,8 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_store_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
+// CHECK: func @simt_store_nd(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_store_nd(%src: memref<24x32xf16>) {
   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48xf16>
   %1 = arith.constant dense<1.0>: vector<48xf16>
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
@@ -306,8 +237,8 @@ gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
 
 
-// CHECK: func @test_store_nd_vc_2(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
+// CHECK: func @subgroup_store_nd_2(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_store_nd_2(%dst: memref<24x32xf16>) {
   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16>
   %1 = arith.constant dense<1.0>: vector<32xf16>
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
@@ -318,8 +249,8 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
 }
 
 
-// CHECK: func @test_store_nd_simt_2(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
+// CHECK: func @simt_store_nd_2(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_store_nd_2(%src: memref<24x32xf16>) {
   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16>
   %1 = arith.constant dense<1.0>: vector<2xf16>
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
@@ -329,8 +260,8 @@ gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
+// CHECK: gpu.func @update_nd_tdesc(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32>
@@ -338,17 +269,9 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_tdesc_vc(%src: ui64) {
+// CHECK: gpu.func @create_tdesc(%[[arg0:.*]]: ui64) {
+gpu.func @create_tdesc(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
@@ -356,18 +279,9 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_tdesc_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_tdesc_simt(%src: ui64) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  gpu.return
-}
 
-
-// CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref<?xf32, 3>) {
-gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
+// CHECK: gpu.func @create_tdesc_1(%[[arg0:.*]]: memref<?xf32, 3>) {
+gpu.func @create_tdesc_1(%src: memref<?xf32, 3>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>>
@@ -375,18 +289,9 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_tdesc_simt_1(%[[arg0:.*]]: memref<?xf32, 3>) {
-gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex>  -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  gpu.return
-}
 
-
-// CHECK: gpu.func @test_create_tdesc_vc_2(%[[arg0:.*]]: memref<?xf32>) {
-gpu.func @test_create_tdesc_vc_2(%src: memref<?xf32>) {
+// CHECK: gpu.func @create_tdesc_2(%[[arg0:.*]]: memref<?xf32>) {
+gpu.func @create_tdesc_2(%src: memref<?xf32>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
@@ -394,17 +299,9 @@ gpu.func @test_create_tdesc_vc_2(%src: memref<?xf32>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_tdesc_simt_2(%[[arg0:.*]]: memref<?xf32>) {
-gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>  -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_tdesc_vc_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_tdesc_vc_3(%src: ui64) {
+// CHECK: gpu.func @create_tdesc_3(%[[arg0:.*]]: ui64) {
+gpu.func @create_tdesc_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
@@ -413,17 +310,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
 }
 
 
-// CHECK: gpu.func @test_create_tdesc_simt_3(%arg0: ui64) {
-gpu.func @test_create_tdesc_simt_3(%src: ui64) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_load_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_vc(%src: ui64) {
+// CHECK: gpu.func @subgroup_load(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_load(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -435,8 +323,8 @@ gpu.func @test_load_vc(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_simt(%src: ui64) {
+// CHECK: gpu.func @simt_load(%[[arg0:.*]]: ui64) {
+gpu.func @simt_load(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -448,8 +336,8 @@ gpu.func @test_load_simt(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_vc_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_vc_2(%src: ui64) {
+// CHECK: gpu.func @subgroup_load_2(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_load_2(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -461,8 +349,8 @@ gpu.func @test_load_vc_2(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_simt_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_simt_2(%src: ui64) {
+// CHECK: gpu.func @simt_load_2(%[[arg0:.*]]: ui64) {
+gpu.func @simt_load_2(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -474,8 +362,8 @@ gpu.func @test_load_simt_2(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_vc_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_vc_3(%src: ui64) {
+// CHECK: gpu.func @subgroup_load_3(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_load_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -487,8 +375,8 @@ gpu.func @test_load_vc_3(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_simt_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_simt_3(%src: ui64) {
+// CHECK: gpu.func @simt_load_3(%[[arg0:.*]]: ui64) {
+gpu.func @simt_load_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -500,8 +388,8 @@ gpu.func @test_load_simt_3(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_store_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_vc(%src: ui64) {
+// CHECK: gpu.func @subgroup_store(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_store(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -517,8 +405,8 @@ gpu.func @test_store_vc(%src: ui64) {
 
 
-// CHECK: gpu.func @test_store_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_simt(%src: ui64) {
+// CHECK: gpu.func @simt_store(%[[arg0:.*]]: ui64) {
+gpu.func @simt_store(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -532,8 +420,8 @@ gpu.func @test_store_simt(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_store_vc_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_vc_2(%src: ui64) {
+// CHECK: gpu.func @subgroup_store_2(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_store_2(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -549,8 +437,8 @@ gpu.func @test_store_vc_2(%src: ui64) {
 
 
-// CHECK: gpu.func @test_store_simt_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_simt_2(%src: ui64) {
+// CHECK: gpu.func @simt_store_2(%[[arg0:.*]]: ui64) {
+gpu.func @simt_store_2(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -564,8 +452,8 @@ gpu.func @test_store_simt_2(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_store_vc_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_vc_3(%src: ui64) {
+// CHECK: gpu.func @subgroup_store_3(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_store_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -580,8 +468,8 @@ gpu.func @test_store_vc_3(%src: ui64) {
 }
 
 
-// CHECK: gpu.func @test_store_simt_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_simt_3(%src: ui64) {
+// CHECK: gpu.func @simt_store_3(%[[arg0:.*]]: ui64) {
+gpu.func @simt_store_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -595,20 +483,8 @@ gpu.func @test_store_simt_3(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_prefetch_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_prefetch_simt(%src: ui64) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  gpu.return
-}
-
-
-// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_prefetch_vc(%src: ui64) {
+// CHECK: gpu.func @prefetch(%[[arg0:.*]]: ui64) {
+gpu.func @prefetch(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
@@ -618,21 +494,9 @@ gpu.func @test_prefetch_vc(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_update_tdesc_simt(%src: ui64) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
-  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
-  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_update_tdesc_vc(%src: ui64) {
+// CHECK: gpu.func @create_update_tdesc(%[[arg0:.*]]: ui64) {
+gpu.func @create_update_tdesc(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
   //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
@@ -644,29 +508,29 @@ gpu.func @test_create_update_tdesc_vc(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_dpas_vc(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
-gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
+// CHECK: gpu.func @subgroup_dpas(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
+gpu.func @subgroup_dpas(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
   // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
   %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>)
-gpu.func @test_dpas_simt(%a : vector<8xf16>, %b: vector<16xf16>) {
+// CHECK: gpu.func @simt_dpas(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>)
+gpu.func @simt_dpas(%a : vector<8xf16>, %b: vector<16xf16>) {
   // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
   %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<16xf16> -> vector<8xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_dpas_vc_with_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
-gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
+// CHECK: gpu.func @subgroup_dpas_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
+gpu.func @subgroup_dpas_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
   // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
-gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
+// CHECK: gpu.func @subgroup_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
+gpu.func @subgroup_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
   //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
   %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
@@ -715,23 +579,4 @@ gpu.func @fence() {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
-  gpu.return
-}
-
-gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-                                resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
-  gpu.return
-}
-
-gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
-                                resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
-  gpu.return
-}
-
 }
diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
index c7c82fc8dbb3..35ac39d074c7 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt -xegpu-subgroup-distribute='print-analysis-only=true' -split-input-file %s | FileCheck %s
 
-// CHECK: function: test_dpas_f16:
+// CHECK: function: dpas_f16:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
@@ -23,7 +23,7 @@
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -38,7 +38,7 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg
 
 
 // -----
-// CHECK: function: test_dpas_i8:
+// CHECK: function: dpas_i8:
 // CHECK-NEXT: argument: <block argument> of type 'vector<8x32xi8>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 2]
 // CHECK-NEXT: argument: <block argument> of type 'vector<32x16xi8>' at index: 1
@@ -51,7 +51,7 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
+func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
   %1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
@@ -60,7 +60,7 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2:
 }
 
 // -----
-// CHECK: function: test_load_with_transpose_effect:
+// CHECK: function: load_with_transpose_effect:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
@@ -83,7 +83,7 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2:
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -97,7 +97,7 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
 }
 
 // -----
-// CHECK: function: test_vector_transpose:
+// CHECK: function: vector_transpose:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
@@ -122,7 +122,7 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -137,7 +137,7 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1
 }
 
 // -----
-// CHECK: function: test_extf_truncf:
+// CHECK: function: extf_truncf:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -152,7 +152,7 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: Not assigned.
-func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
+func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   %2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32>
@@ -162,7 +162,7 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
 }
 
 // -----
-// CHECK: function: test_load_gather_with_transpose_effect:
+// CHECK: function: load_gather_with_transpose_effect:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<256xf16>' at index: 1
@@ -187,7 +187,7 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
+func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
@@ -202,7 +202,7 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1
 }
 
 // -----
-// CHECK: function: test_load_gather_1d:
+// CHECK: function: load_gather_1d:
 // CHECK: argument: <block argument> of type 'memref<256xf32>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
@@ -215,7 +215,7 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T1]] = xegpu.load %[[T0]], %[[CST0]]  : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
   %cst_0 = arith.constant dense<true> : vector<16xi1>
   %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
@@ -225,7 +225,7 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc
 }
 
 // -----
-// CHECK: function: test_store_scatter_with_transpose_effect:
+// CHECK: function: store_scatter_with_transpose_effect:
 // CHECK-NEXT: argument: <block argument> of type 'memref<128xf32>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32>
@@ -236,7 +236,7 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
 // CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1]
-func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
+func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
   %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32>
   %cst_0 = arith.constant dense<true> : vector<16xi1>
   %cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
@@ -246,7 +246,7 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
 }
 
 // -----
-// CHECK: function: test_store_scatter_1d:
+// CHECK: function: store_scatter_1d:
 // CHECK-NEXT: argument: <block argument> of type 'vector<16xf32>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: argument: <block argument> of type 'memref<256xf32>' at index: 1
@@ -257,7 +257,7 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
+func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
   %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
   %cst_0 = arith.constant dense<true> : vector<16xi1>
   %0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
@@ -266,7 +266,7 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>)
 }
 
 // -----
-// CHECK: function: test_vector_bitcast_i16_to_i8:
+// CHECK: function: vector_bitcast_i16_to_i8:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<32x16xi8>' at index: 1
@@ -289,7 +289,7 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>)
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
+func.func @vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
   %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8>
@@ -303,7 +303,7 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<
 }
 
 // -----
-// CHECK: function: test_vector_bitcast_i8_to_f16:
+// CHECK: function: vector_bitcast_i8_to_f16:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x32xi8>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x32xi8>' at index: 1
@@ -328,7 +328,7 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
+func.func @vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
   %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8>
@@ -343,7 +343,7 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
 }
 
 // -----
-// CHECK: function: test_binary_op_one_use:
+// CHECK: function: binary_op_one_use:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -360,7 +360,7 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
+func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   %2 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -371,7 +371,7 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
 }
 
 // -----
-// CHECK: function: test_binary_op_multiple_uses:
+// CHECK: function: binary_op_multiple_uses:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -390,7 +390,7 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
+func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   %cst = arith.constant dense<1.000000e+00> : vector<16x16xf16>
@@ -402,7 +402,7 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
 }
 
 // -----
-// CHECK: function: test_for_op:
+// CHECK: function: for_op:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x128xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<128x16xf16>' at index: 1
@@ -437,7 +437,7 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
 // CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c16 = arith.constant 16 : index
@@ -458,7 +458,7 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
 }
 
 // -----
-// CHECK: function: test_if_single_use:
+// CHECK: function: if_single_use:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -477,7 +477,7 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
+func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = scf.if %arg2 -> (vector<16x16xf16>) {
     %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -492,7 +492,7 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
 }
 
 // -----
-// CHECK: function: test_if_multiple_uses:
+// CHECK: function: if_multiple_uses:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -513,7 +513,7 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
+func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = scf.if %arg2 -> (vector<16x16xf16>) {
     %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -529,7 +529,7 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
 }
 
 // -----
-// CHECK: function: test_vector_outer_reduction:
+// CHECK: function: vector_outer_reduction:
 // CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
@@ -538,7 +538,7 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
   %0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
   xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
@@ -546,7 +546,7 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
 }
 
 // -----
-// CHECK: function: test_vector_inner_reduction:
+// CHECK: function: vector_inner_reduction:
 // CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
@@ -555,7 +555,7 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
   %0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>
   xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 8e3673d04eac..67d3bd9b393c 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -4,7 +4,7 @@
 #b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
 #c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
 gpu.module @test_kernel {
-  gpu.func @test_gemm_with_one_to_n_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+  gpu.func @gemm_with_one_to_n_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
@@ -45,7 +45,7 @@ gpu.module @test_kernel {
 #l1 = #xegpu.layout<inst_data = [8, 16]>
 #l2 = #xegpu.layout<inst_data = [16, 16]>
 gpu.module @test_kernel {
-  gpu.func @test_gemm_with_inst_data_only_attribute(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+  gpu.func @gemm_with_inst_data_only_attribute(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
@@ -86,7 +86,7 @@ gpu.module @test_kernel {
 #l1 = #xegpu.layout<inst_data = [8, 16]>
 #l2 = #xegpu.layout<inst_data = [16, 16]>
 gpu.module @test_kernel {
-  gpu.func @test_gemm_with_one_to_one_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+  gpu.func @gemm_with_one_to_one_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c8 = arith.constant 8 : index
     %c16 = arith.constant 16 : index
@@ -130,7 +130,7 @@ gpu.module @test_kernel {
 #b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
 #c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
 gpu.module @test_kernel {
-  gpu.func @test_gemm_with_elemwise_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+  gpu.func @gemm_with_elemwise_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
@@ -172,7 +172,7 @@ gpu.module @test_kernel {
 // -----
 #l = #xegpu.layout<inst_data = [8, 16]>
 gpu.module @test_kernel {
-  gpu.func @test_elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+  gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
     %c0 = arith.constant 0 : index
     %c32 = arith.constant 32 : index
     %c1024 = arith.constant 1024 : index
@@ -211,7 +211,7 @@ gpu.module @test_kernel {
 // -----
 #l = #xegpu.layout<inst_data = [8]>
 gpu.module @test_kernel {
-  gpu.func @test_elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+  gpu.func @elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
     %c0 = arith.constant 0 : index
     %c32 = arith.constant 32 : index
     %c1024 = arith.constant 1024 : index
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 35ad16d8cd9a..c6124f90e0f4 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -1,9 +1,9 @@
 // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
 
 gpu.module @test_round_robin_assignment {
-  // CHECK-LABEL: test_create_nd_tdesc
+  // CHECK-LABEL: create_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
       // CHECK-COUNT-12: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<24x32xf32>
       // CHECK-SAME: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
       // CHECK-NOT: xegpu.create_nd_tdesc
@@ -12,9 +12,9 @@ gpu.module @test_round_robin_assignment {
       gpu.return
     }
 
-  // CHECK-LABEL: test_load_nd_tdesc
+  // CHECK-LABEL: load_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) {
       %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
         -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
       // CHECK-COUNT-12: xegpu.load_nd %{{.*}}
@@ -27,9 +27,9 @@ gpu.module @test_round_robin_assignment {
       gpu.return
     }
 
-  // CHECK-LABEL: test_store_nd
+  // CHECK-LABEL: store_nd
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_store_nd(%src: memref<24x32xf32>) {
+  gpu.func @store_nd(%src: memref<24x32xf32>) {
       %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
         -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
       // CHECK-COUNT-12: xegpu.store_nd %{{.*}}, %{{.*}}
@@ -43,9 +43,9 @@ gpu.module @test_round_robin_assignment {
       gpu.return
   }
 
-  // CHECK-LABEL: test_update_nd
+  // CHECK-LABEL: update_nd
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_update_nd(%src: memref<24x32xf32>){
+  gpu.func @update_nd(%src: memref<24x32xf32>){
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
       ->  !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
     // CHECK-COUNT-12: xegpu.update_nd_offset %{{.*}}, [0, 16]
@@ -56,9 +56,9 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  // CHECK-LABEL: test_dpas
+  // CHECK-LABEL: dpas
   // CHECK-SAME: (%[[ARG_0:.*]]: memref<8x8xf32>, %[[ARG_1:.*]]: memref<8x8xf32>, %[[ARG_2:.*]]: memref<8x8xf32>)
-  gpu.func @test_dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) {
+  gpu.func @dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) {
     // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<8x8xf32>
     // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
     // CHECK-NOT: xegpu.create_nd_tdesc
@@ -90,9 +90,9 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  // CHECK-LABEL: test_prefetch_nd_tdesc
+  // CHECK-LABEL: prefetch_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
     // CHECK-COUNT-12: xegpu.prefetch_nd %{{.*}}
     // CHECK-SAME-COUNT-12 : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
     // CHECK-NOT: xegpu.prefetch_nd
@@ -103,7 +103,7 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  gpu.func @test_scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c1 = arith.constant 1 : index
     %c10 = arith.constant 10 : index
     %c0 = arith.constant 0 : index
@@ -126,7 +126,7 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c1_i32 = arith.constant 1 : i32
     %c10_i32 = arith.constant 10 : i32
     %c0_i32 = arith.constant 0 : i32
@@ -150,7 +150,7 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c10 = arith.constant 10 : index
     %0 = gpu.subgroup_id : index
     %1 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
@@ -173,7 +173,7 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c10 = arith.constant 10 : index
     %id = gpu.subgroup_id : index
 
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 466842c96844..44b11c304cc8 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -3,9 +3,9 @@
 //CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)>
 //CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)>
 gpu.module @test_1_1_assignment {
-  // CHECK-LABEL: test_create_nd_tdesc
+  // CHECK-LABEL: create_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
   // CHECK: %[[SGID:.*]] = gpu.subgroup_id
   // CHECK: %[[C12:.*]] = arith.constant 12 : index
   // CHECK: %[[C4:.*]] = arith.constant 4 : index
@@ -30,9 +30,9 @@ gpu.module @test_1_1_assignment {
   gpu.return
   }
 
-  // CHECK-LABEL: test_load_nd_tdesc
+  // CHECK-LABEL: load_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) {
     // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
@@ -46,9 +46,9 @@ gpu.module @test_1_1_assignment {
     gpu.return
   }
 
-  // CHECK-LABEL: test_store_nd
+  // CHECK-LABEL: store_nd
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_store_nd(%src: memref<24x32xf32>) {
+  gpu.func @store_nd(%src: memref<24x32xf32>) {
     // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
@@ -66,9 +66,9 @@ gpu.module @test_1_1_assignment {
     gpu.return
 }
 
-// CHECK-LABEL: test_update_nd
+// CHECK-LABEL: update_nd
 // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-gpu.func @test_update_nd(%src: memref<24x32xf32>){
+gpu.func @update_nd(%src: memref<24x32xf32>){
   // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
   // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
   // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16]
@@ -80,10 +80,10 @@ gpu.func @test_update_nd(%src: memref<24x32xf32>){
   gpu.return
 }
 
-// CHECK-LABEL: test_dpas
+// CHECK-LABEL: dpas
 // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
 // CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
-gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
+gpu.func @dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
@@ -114,10 +114,10 @@ gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
   }
 
 
-// CHECK-LABEL: test_dpas_no_sg_data
+// CHECK-LABEL: dpas_no_sg_data
 // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
 // CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
-gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
+gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
@@ -147,9 +147,9 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  // CHECK-LABEL: test_prefetch_nd_tdesc
+  // CHECK-LABEL: prefetch_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
     // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: xegpu.prefetch_nd %[[TDESC]]
@@ -161,8 +161,8 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  // CHECK-LABEL: test_dpas_with_no_create_nd_desc
-  gpu.func @test_dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) {
+  // CHECK-LABEL: dpas_with_no_create_nd_desc
+  gpu.func @dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) {
     // CHECK-NOT: vector<12x12xf32>
     %dpas = xegpu.dpas %a, %b
       {layout =  #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
@@ -170,7 +170,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  gpu.func @test_scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
+  gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
     //CHECK: [[c0:%.+]] = arith.constant 0 : index
     //CHECK: [[c128:%.+]] = arith.constant 128 : index
     //CHECK: [[c1024:%.+]] = arith.constant 1024 : index
@@ -213,7 +213,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c1_i32 = arith.constant 1 : i32
     %c10_i32 = arith.constant 10 : i32
     %c0_i32 = arith.constant 0 : i32
@@ -238,7 +238,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c10 = arith.constant 10 : index
     %id = gpu.subgroup_id : index
 
@@ -267,7 +267,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c10 = arith.constant 10 : index
     %id = gpu.subgroup_id : index
 

From 0defde8e06338cbe968d55d1d9e8581d55f3ae2b Mon Sep 17 00:00:00 2001
From: Harrison Hao <57025411+harrisonGPU@users.noreply.github.com>
Date: Wed, 18 Jun 2025 09:00:07 +0800
Subject: [PATCH 0748/1322] [AMDGPU] Support D16 folding for image.sample with
 multiple extractelement and fptrunc users (#141758)

Now we only support D16 folding for `image sample` instructions with a
single user: a `fptrunc` to half.
However, we can actually support D16 folding for image.sample
instructions with multiple users,
as long as each user follows the pattern of extractelement followed by
fptrunc to half.
For example:
```
  %sample = call <4 x float> @llvm.amdgcn.image.sample
  %e0 = extractelement <4 x float> %sample, i32 0
  %h0 = fptrunc float %e0 to half
  %e1 = extractelement <4 x float> %sample, i32 1
  %h1 = fptrunc float %e1 to half
  %e2 = extractelement <4 x float> %sample, i32 2
  %h2 = fptrunc float %e2 to half
```
This change enables D16 folding for such cases and avoids generating
`v_cvt_f16_f32_e32` instructions.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |  60 +++++++++
 .../InstCombine/AMDGPU/image-d16.ll           | 118 ++++++++++++++++++
 2 files changed, 178 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index d12170a60905..5477c5eae939 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -248,6 +248,66 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                                      });
         }
       }
+
+      // Only perform D16 folding if every user of the image sample is
+      // an ExtractElementInst immediately followed by an FPTrunc to half.
+      SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
+          ExtractTruncPairs;
+      bool AllHalfExtracts = true;
+
+      for (User *U : II.users()) {
+        auto *Ext = dyn_cast<ExtractElementInst>(U);
+        if (!Ext || !Ext->hasOneUse()) {
+          AllHalfExtracts = false;
+          break;
+        }
+
+        auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
+        if (!Tr || !Tr->getType()->isHalfTy()) {
+          AllHalfExtracts = false;
+          break;
+        }
+
+        ExtractTruncPairs.emplace_back(Ext, Tr);
+      }
+
+      if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
+        auto *VecTy = cast<VectorType>(II.getType());
+        Type *HalfVecTy =
+            VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
+
+        // Obtain the original image sample intrinsic's signature
+        // and replace its return type with the half-vector for D16 folding
+        SmallVector<Type *, 8> SigTys;
+        Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys);
+        SigTys[0] = HalfVecTy;
+
+        Module *M = II.getModule();
+        Function *HalfDecl =
+            Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys);
+
+        II.mutateType(HalfVecTy);
+        II.setCalledFunction(HalfDecl);
+
+        IRBuilder<> Builder(II.getContext());
+        for (auto &[Ext, Tr] : ExtractTruncPairs) {
+          Value *Idx = Ext->getIndexOperand();
+
+          Builder.SetInsertPoint(Tr);
+
+          Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
+          HalfExtract->takeName(Tr);
+
+          Tr->replaceAllUsesWith(HalfExtract);
+        }
+
+        for (auto &[Ext, Tr] : ExtractTruncPairs) {
+          IC.eraseInstFromFunction(*Tr);
+          IC.eraseInstFromFunction(*Ext);
+        }
+
+        return &II;
+      }
     }
   }
 
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
index 30431ad72484..ee5ccf5af987 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
@@ -3,6 +3,7 @@
 ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx810 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
 ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx900 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
 ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1010 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
+; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1100 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
 
 define amdgpu_ps half @image_sample_2d_fptrunc_to_d16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX7-LABEL: @image_sample_2d_fptrunc_to_d16(
@@ -121,6 +122,123 @@ main_body:
   ret half %addf_sum.2
 }
 
+define amdgpu_ps half @image_sample_2d_multi_fptrunc_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
+; GFX7-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to half
+; GFX7-NEXT:    [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
+; GFX7-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to half
+; GFX7-NEXT:    [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
+; GFX7-NEXT:    [[H2:%.*]] = fptrunc float [[E2]] to half
+; GFX7-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT:    [[RES:%.*]] = fadd half [[MUL]], [[H2]]
+; GFX7-NEXT:    ret half [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[H0:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX81PLUS-NEXT:    [[H1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX81PLUS-NEXT:    [[H2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fadd half [[MUL]], [[H2]]
+; GFX81PLUS-NEXT:    ret half [[RES]]
+;
+main_body:
+  %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %e0 = extractelement <4 x float> %sample, i32 0
+  %h0 = fptrunc float %e0 to half
+  %e1 = extractelement <4 x float> %sample, i32 1
+  %h1 = fptrunc float %e1 to half
+  %e2 = extractelement <4 x float> %sample, i32 2
+  %h2 = fptrunc float %e2 to half
+  %mul = fmul half %h0, %h1
+  %res = fadd half %mul, %h2
+  ret half %res
+}
+
+define amdgpu_ps half @image_sample_2d_extractelement_multi_use_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_extractelement_multi_use_no_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0
+; GFX7-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to half
+; GFX7-NEXT:    [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00
+; GFX7-NEXT:    [[HALF:%.*]] = fptrunc float [[USER2]] to half
+; GFX7-NEXT:    [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1
+; GFX7-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to half
+; GFX7-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT:    [[RES:%.*]] = fadd half [[MUL]], [[HALF]]
+; GFX7-NEXT:    ret half [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_extractelement_multi_use_no_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0
+; GFX81PLUS-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to half
+; GFX81PLUS-NEXT:    [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00
+; GFX81PLUS-NEXT:    [[HALF:%.*]] = fptrunc float [[USER2]] to half
+; GFX81PLUS-NEXT:    [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1
+; GFX81PLUS-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to half
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fadd half [[MUL]], [[HALF]]
+; GFX81PLUS-NEXT:    ret half [[RES]]
+;
+main_body:
+  %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0)
+  %e0 = extractelement <4 x float> %sample, i32 0
+  %h0 = fptrunc float %e0 to half
+  %user2 = fadd float %e0, 1.0
+  %half = fptrunc float %user2 to half
+  %e1 = extractelement <4 x float> %sample, i32 1
+  %h1 = fptrunc float %e1 to half
+  %mul = fmul half %h0, %h1
+  %res = fadd half %mul, %half
+  ret half %res
+}
+
+define amdgpu_ps bfloat @image_sample_2d_multi_fptrunc_non_half_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
+; GFX7-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to bfloat
+; GFX7-NEXT:    [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
+; GFX7-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to bfloat
+; GFX7-NEXT:    [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
+; GFX7-NEXT:    [[H2:%.*]] = fptrunc float [[E2]] to bfloat
+; GFX7-NEXT:    [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]]
+; GFX7-NEXT:    [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]]
+; GFX7-NEXT:    ret bfloat [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
+; GFX81PLUS-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to bfloat
+; GFX81PLUS-NEXT:    [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
+; GFX81PLUS-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to bfloat
+; GFX81PLUS-NEXT:    [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
+; GFX81PLUS-NEXT:    [[H2:%.*]] = fptrunc float [[E2]] to bfloat
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]]
+; GFX81PLUS-NEXT:    ret bfloat [[RES]]
+;
+main_body:
+  %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0)
+  %e0 = extractelement <4 x float> %sample, i32 0
+  %h0 = fptrunc float %e0 to bfloat
+  %e1 = extractelement <4 x float> %sample, i32 1
+  %h1 = fptrunc float %e1 to bfloat
+  %e2 = extractelement <4 x float> %sample, i32 2
+  %h2 = fptrunc float %e2 to bfloat
+  %mul = fmul bfloat %h0, %h1
+  %res = fadd bfloat %mul, %h2
+  ret bfloat %res
+}
+
 define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
 ; GFX7-LABEL: @image_gather4_2d_v4f32(
 ; GFX7-NEXT:  main_body:

From 9265b1f0cff74c929214efb64f41183299f31772 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Tue, 17 Jun 2025 18:15:06 -0700
Subject: [PATCH 0749/1322] LowerTypeTests: Use jump table entry type as value
 type of jump table alias.

The motivation for this is that it causes the jump table entry's symbol
to have an st_size equal to the jump table entry size, instead of being
equal to the size of the entire jump table, which is incorrect and can
lead to unexpected behavior in binary analysis tools that rely on the
size field such as Bloaty.

Reviewers: fmayer

Reviewed By: fmayer

Pull Request: https://github.com/llvm/llvm-project/pull/144462
---
 llvm/lib/Transforms/IPO/LowerTypeTests.cpp          | 10 ++++++----
 .../Transforms/LowerTypeTests/blockaddress-2.ll     |  2 +-
 .../Transforms/LowerTypeTests/cfi-icall-alias.ll    |  2 +-
 llvm/test/Transforms/LowerTypeTests/export-alias.ll |  4 ++--
 llvm/test/Transforms/LowerTypeTests/export-icall.ll | 12 ++++++------
 .../Transforms/LowerTypeTests/function-disjoint.ll  |  4 ++--
 llvm/test/Transforms/LowerTypeTests/function.ll     | 13 ++++++-------
 .../LowerTypeTests/icall-branch-funnel.ll           |  4 ++--
 llvm/test/Transforms/LowerTypeTests/pr37625.ll      |  2 +-
 llvm/test/Transforms/LowerTypeTests/section.ll      |  2 +-
 10 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 20b54c056cc2..86e1ebf937db 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1701,8 +1701,9 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
                        GlobalValue::PrivateLinkage,
                        M.getDataLayout().getProgramAddressSpace(),
                        ".cfi.jumptable", &M);
+  ArrayType *JumpTableEntryType = ArrayType::get(Int8Ty, EntrySize);
   ArrayType *JumpTableType =
-      ArrayType::get(ArrayType::get(Int8Ty, EntrySize), Functions.size());
+      ArrayType::get(JumpTableEntryType, Functions.size());
   auto JumpTable = ConstantExpr::getPointerCast(
       JumpTableFn, PointerType::getUnqual(M.getContext()));
 
@@ -1723,7 +1724,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
     if (!IsJumpTableCanonical) {
       GlobalValue::LinkageTypes LT = IsExported ? GlobalValue::ExternalLinkage
                                                 : GlobalValue::InternalLinkage;
-      GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT,
+      GlobalAlias *JtAlias = GlobalAlias::create(JumpTableEntryType, 0, LT,
                                                  F->getName() + ".cfi_jt",
                                                  CombinedGlobalElemPtr, &M);
       if (IsExported)
@@ -1748,8 +1749,9 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
     } else {
       assert(F->getType()->getAddressSpace() == 0);
 
-      GlobalAlias *FAlias = GlobalAlias::create(
-          F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M);
+      GlobalAlias *FAlias =
+          GlobalAlias::create(JumpTableEntryType, 0, F->getLinkage(), "",
+                              CombinedGlobalElemPtr, &M);
       FAlias->setVisibility(F->getVisibility());
       FAlias->takeName(F);
       if (FAlias->hasName()) {
diff --git a/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll b/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll
index 51a2a5936543..34e740771fe2 100644
--- a/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll
+++ b/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S %s -passes=lowertypetests | FileCheck %s
 
 ; CHECK: @badfileops = internal global %struct.f { ptr @bad_f, ptr @bad_f }
-; CHECK: @bad_f = internal alias void (), ptr @.cfi.jumptable
+; CHECK: @bad_f = internal alias [8 x i8], ptr @.cfi.jumptable
 ; CHECK: define internal void @bad_f.cfi() !type !0 {
 ; CHECK-NEXT:  ret void
 
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll
index 0c5324ee96c9..6b821186b0ad 100644
--- a/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll
@@ -12,7 +12,7 @@ RUN: opt test1.bc -passes=lowertypetests -lowertypetests-read-summary=in.yaml \
 RUN:   -lowertypetests-summary-action=export -lowertypetests-write-summary=exported.yaml \
 RUN:   -S -o - | FileCheck %s --check-prefix=REGULAR
 REGULAR: @__typeid__ZTSFvvE_global_addr = hidden alias i8, ptr @.cfi.jumptable
-REGULAR: @f = alias void (), ptr @.cfi.jumptable
+REGULAR: @f = alias [8 x i8], ptr @.cfi.jumptable
 REGULAR: define private void @.cfi.jumptable()
 
 ;; CHECK that @llvm.type.test() is lowered to an actual check.
diff --git a/llvm/test/Transforms/LowerTypeTests/export-alias.ll b/llvm/test/Transforms/LowerTypeTests/export-alias.ll
index 255e6b6ca4d1..45b4db63def1 100644
--- a/llvm/test/Transforms/LowerTypeTests/export-alias.ll
+++ b/llvm/test/Transforms/LowerTypeTests/export-alias.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S %s -passes=lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/exported-funcs.yaml | FileCheck %s
 ;
-; CHECK: @alias1 = weak alias void (), ptr @external_addrtaken
-; CHECK: @alias2 = hidden alias void (), ptr @external_addrtaken
+; CHECK: @alias1 = weak alias [8 x i8], ptr @external_addrtaken
+; CHECK: @alias2 = hidden alias [8 x i8], ptr @external_addrtaken
 ; CHECK-NOT: @alias3 = alias
 ; CHECK-NOT: @not_present
 
diff --git a/llvm/test/Transforms/LowerTypeTests/export-icall.ll b/llvm/test/Transforms/LowerTypeTests/export-icall.ll
index 47156deb57de..f8adb2d69910 100644
--- a/llvm/test/Transforms/LowerTypeTests/export-icall.ll
+++ b/llvm/test/Transforms/LowerTypeTests/export-icall.ll
@@ -40,15 +40,15 @@ define void @f3(i32 %x) !type !8 {
 ; CHECK-DAG: @__typeid_typeid1_align = hidden alias i8, inttoptr (i64 3 to ptr)
 ; CHECK-DAG: @__typeid_typeid1_size_m1 = hidden alias i8, inttoptr (i64 4 to ptr)
 
-; CHECK-DAG: @h                    = alias void (i8), ptr [[JT1]]
-; CHECK-DAG: @f                    = alias void (i32), {{.*}}getelementptr {{.*}}ptr [[JT1]]
-; CHECK-DAG: @f2                   = alias void (i32), {{.*}}getelementptr {{.*}}ptr [[JT1]]
-; CHECK-DAG: @external.cfi_jt      = hidden alias void (), {{.*}}getelementptr {{.*}}ptr [[JT1]]
-; CHECK-DAG: @external_weak.cfi_jt = hidden alias void (), {{.*}}getelementptr {{.*}}ptr [[JT1]]
+; CHECK-DAG: @h                    = alias [8 x i8], ptr [[JT1]]
+; CHECK-DAG: @f                    = alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]]
+; CHECK-DAG: @f2                   = alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]]
+; CHECK-DAG: @external.cfi_jt      = hidden alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]]
+; CHECK-DAG: @external_weak.cfi_jt = hidden alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]]
 
 ; CHECK-DAG: @__typeid_typeid2_global_addr = hidden alias i8, ptr [[JT2:.*]]
 
-; CHECK-DAG: @g                    = alias void (), ptr [[JT2]]
+; CHECK-DAG: @g                    = alias [8 x i8], ptr [[JT2]]
 
 ; CHECK-DAG: define hidden void @h.cfi(i8 {{.*}}) !type !{{.*}}
 ; CHECK-DAG: declare !type !{{.*}} void @external()
diff --git a/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll b/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll
index d7ba3a681419..ae676df6e9f3 100644
--- a/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll
+++ b/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll
@@ -5,8 +5,8 @@
 
 target datalayout = "e-p:64:64"
 
-; X64: @g = alias void (), ptr @[[JT1:.*]]
-; X64: @f = alias void (), ptr @[[JT0:.*]]
+; X64: @g = alias [8 x i8], ptr @[[JT1:.*]]
+; X64: @f = alias [8 x i8], ptr @[[JT0:.*]]
 
 ; WASM32: private constant [0 x i8] zeroinitializer
 @0 = private unnamed_addr constant [2 x ptr] [ptr @f, ptr @g], align 16
diff --git a/llvm/test/Transforms/LowerTypeTests/function.ll b/llvm/test/Transforms/LowerTypeTests/function.ll
index 5b0852c82ea6..ab3cfb6acccf 100644
--- a/llvm/test/Transforms/LowerTypeTests/function.ll
+++ b/llvm/test/Transforms/LowerTypeTests/function.ll
@@ -28,14 +28,13 @@ target datalayout = "e-p:64:64"
 ; NATIVE: private constant [0 x i8] zeroinitializer
 ; WASM32: private constant [0 x i8] zeroinitializer
 
-; NATIVE: @f = alias void (), ptr @[[JT:.*]]
+; JT4: @f = alias [4 x i8], ptr @[[JT:.*]]
+; JT8: @f = alias [8 x i8], ptr @[[JT:.*]]
+; JT16: @f = alias [16 x i8], ptr @[[JT:.*]]
 
-; X86: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1)
-; ARM: @g = internal alias void (), getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1)
-; THUMB: @g = internal alias void (), getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1)
-; THUMBV6M: @g = internal alias void (), getelementptr inbounds ([2 x [16 x i8]], ptr @[[JT]], i64 0, i64 1)
-; RISCV: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1)
-; LOONGARCH64: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1)
+; JT4: @g = internal alias [4 x i8], getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1)
+; JT8: @g = internal alias [8 x i8], getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1)
+; JT16: @g = internal alias [16 x i8], getelementptr inbounds ([2 x [16 x i8]], ptr @[[JT]], i64 0, i64 1)
 
 ; NATIVE: define hidden void @f.cfi()
 ; WASM32: define void @f() !type !{{[0-9]+}} !wasm.index ![[I0:[0-9]+]]
diff --git a/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll b/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll
index f67e0b171165..8cb41398e8f5 100644
--- a/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll
+++ b/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux"
 ; CHECK: @0 = private constant { i32, [0 x i8], i32 } { i32 1, [0 x i8] zeroinitializer, i32 2 }
 ; CHECK: @g1 = alias i32, ptr @0
 ; CHECK: @g2 = alias i32, getelementptr inbounds ({ i32, [0 x i8], i32 }, ptr @0, i32 0, i32 2)
-; CHECK: @f1 = alias void (), ptr @.cfi.jumptable
-; CHECK: @f2 = alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @.cfi.jumptable, i64 0, i64 1)
+; CHECK: @f1 = alias [8 x i8], ptr @.cfi.jumptable
+; CHECK: @f2 = alias [8 x i8], getelementptr inbounds ([2 x [8 x i8]], ptr @.cfi.jumptable, i64 0, i64 1)
 
 @g1 = constant i32 1
 @g2 = constant i32 2
diff --git a/llvm/test/Transforms/LowerTypeTests/pr37625.ll b/llvm/test/Transforms/LowerTypeTests/pr37625.ll
index 639cc3fa32bc..cf52cdf0759a 100644
--- a/llvm/test/Transforms/LowerTypeTests/pr37625.ll
+++ b/llvm/test/Transforms/LowerTypeTests/pr37625.ll
@@ -11,4 +11,4 @@ declare !type !2 extern_weak void @external_addrtaken(i8)
 !1 = !{!"external_addrtaken", i8 0, !2}
 !2 = !{i64 0, !"typeid1"}
 
-; CHECK-DAG: @external_addrtaken = alias void (i8), ptr @.cfi.jumptable
+; CHECK-DAG: @external_addrtaken = alias [8 x i8], ptr @.cfi.jumptable
diff --git a/llvm/test/Transforms/LowerTypeTests/section.ll b/llvm/test/Transforms/LowerTypeTests/section.ll
index d0d3c212c826..bd91389c60ef 100644
--- a/llvm/test/Transforms/LowerTypeTests/section.ll
+++ b/llvm/test/Transforms/LowerTypeTests/section.ll
@@ -5,7 +5,7 @@
 
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK: @f = alias void (), ptr @[[JT:.*]]
+; CHECK: @f = alias [8 x i8], ptr @[[JT:.*]]
 ; CHECK: define hidden void @f.cfi() section "xxx"
 
 define void @f() section "xxx" !type !0 {

From 8ddada41df0488358373cff1d31a47e5ef4961e0 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Wed, 18 Jun 2025 09:17:46 +0800
Subject: [PATCH 0750/1322] [RISCV] Add Andes XAndesVBFHCvt (Andes Vector
 BFLOAT16 Conversion) extension (#144320)

The spec can be found at:
https://github.com/andestech/andes-v5-isa/releases/tag/ast-v5_4_0-release.

This patch only supports assembler. The instructions are similar to
`Zvfbfmin` and the only difference with `Zvfbfmin` is that
`XAndesVBFHCvt` doesn't have mask variant.
---
 .../Driver/print-supported-extensions-riscv.c |  1 +
 .../riscv-target-features-andes.c             |  8 +++++
 llvm/docs/RISCVUsage.rst                      |  3 ++
 llvm/docs/ReleaseNotes.md                     |  1 +
 .../RISCV/Disassembler/RISCVDisassembler.cpp  |  4 +--
 llvm/lib/Target/RISCV/RISCVFeatures.td        |  9 ++++++
 llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 31 +++++++++++++++++++
 llvm/test/CodeGen/RISCV/attributes.ll         |  4 +++
 llvm/test/CodeGen/RISCV/features-info.ll      |  1 +
 llvm/test/MC/RISCV/xandesvbfhcvt-valid.s      | 27 ++++++++++++++++
 .../TargetParser/RISCVISAInfoTest.cpp         |  1 +
 11 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/MC/RISCV/xandesvbfhcvt-valid.s

diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index e1f5a7a0105d..5008c2b7f789 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -159,6 +159,7 @@
 // CHECK-NEXT:     svpbmt               1.0       'Svpbmt' (Page-Based Memory Types)
 // CHECK-NEXT:     svvptc               1.0       'Svvptc' (Obviating Memory-Management Instructions after Marking PTEs Valid)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
+// CHECK-NEXT:     xandesvbfhcvt        5.0       'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension)
 // CHECK-NEXT:     xandesvdot           5.0       'XAndesVDot' (Andes Vector Dot Product Extension)
 // CHECK-NEXT:     xandesvpackfph       5.0       'XAndesVPackFPH' (Andes Vector Packed FP16 Extension)
 // CHECK-NEXT:     xcvalu               1.0       'XCValu' (CORE-V ALU Operations)
diff --git a/clang/test/Preprocessor/riscv-target-features-andes.c b/clang/test/Preprocessor/riscv-target-features-andes.c
index 3cd9b0435413..c66d4427b5cf 100644
--- a/clang/test/Preprocessor/riscv-target-features-andes.c
+++ b/clang/test/Preprocessor/riscv-target-features-andes.c
@@ -15,6 +15,14 @@
 // RUN:   -o - | FileCheck --check-prefix=CHECK-XANDESPERF %s
 // CHECK-XANDESPERF: __riscv_xandesperf  5000000{{$}}
 
+// RUN: %clang --target=riscv32 \
+// RUN:   -march=rv32i_xandesvbfhcvt -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XANDESVBFHCVT %s
+// RUN: %clang --target=riscv64 \
+// RUN:   -march=rv64i_xandesvbfhcvt -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XANDESVBFHCVT %s
+// CHECK-XANDESVBFHCVT: __riscv_xandesvbfhcvt  5000000{{$}}
+
 // RUN: %clang --target=riscv32 \
 // RUN:   -march=rv32i_xandesvpackfph -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-XANDESVPACKFPH %s
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index aadda309feab..81684ba30f12 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -513,6 +513,9 @@ The current vendor extensions supported are:
 ``XAndesPerf``
   LLVM implements `version 5.0.0 of the Andes Performance Extension specification <https://github.com/andestech/andes-v5-isa/releases/download/ast-v5_4_0-release/AndeStar_V5_ISA_Spec_UM165-v1.5.08-20250317.pdf>`__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification.
 
+``XAndesVBFHCvt``
+  LLVM implements `version 5.0.0 of the Andes Vector BFLOAT16 Conversion Extension specification <https://github.com/andestech/andes-v5-isa/releases/download/ast-v5_4_0-release/AndeStar_V5_ISA_Spec_UM165-v1.5.08-20250317.pdf>`__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification.
+
 ``XAndesVPackFPH``
   LLVM implements `version 5.0.0 of the Andes Vector Packed FP16 Extension specification <https://github.com/andestech/andes-v5-isa/releases/download/ast-v5_4_0-release/AndeStar_V5_ISA_Spec_UM165-v1.5.08-20250317.pdf>`__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification.
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 5c9ed181af59..0395f43c6195 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -210,6 +210,7 @@ Changes to the RISC-V Backend
 * The `Shlcofideleg` extension was added.
 * `-mcpu=sifive-x390` was added.
 * `-mtune=andes-45-series` was added.
+* Adds assembler support for the Andes `XAndesvbfhcvt` (Andes Vector BFLOAT16 Conversion extension).
 
 Changes to the WebAssembly Backend
 ----------------------------------
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index cbab081a6731..27e04c0cb1f8 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -774,8 +774,8 @@ static constexpr FeatureBitset XTHeadGroup = {
     RISCV::FeatureVendorXTHeadVdot};
 
 static constexpr FeatureBitset XAndesGroup = {
-    RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVPackFPH,
-    RISCV::FeatureVendorXAndesVDot};
+    RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVBFHCvt,
+    RISCV::FeatureVendorXAndesVPackFPH, RISCV::FeatureVendorXAndesVDot};
 
 static constexpr DecoderListEntry DecoderList32[]{
     // Vendor Extensions
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 0b3508426732..6df6368929da 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1599,6 +1599,15 @@ def HasVendorXAndesPerf
       AssemblerPredicate<(all_of FeatureVendorXAndesPerf),
                          "'XAndesPerf' (Andes Performance Extension)">;
 
+def FeatureVendorXAndesVBFHCvt
+    : RISCVExtension<5, 0, "Andes Vector BFLOAT16 Conversion Extension",
+                     [FeatureStdExtZve32f]>;
+def HasVendorXAndesVBFHCvt
+    : Predicate<"Subtarget->hasVendorXAndesVBFHCvt()">,
+      AssemblerPredicate<(all_of FeatureVendorXAndesVBFHCvt),
+                         "'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension)">;
+
+
 def FeatureVendorXAndesVPackFPH
     : RISCVExtension<5, 0, "Andes Vector Packed FP16 Extension",
                      [FeatureStdExtZvfhmin]>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 4cf8309ea17f..3ba21e51e7c6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -361,6 +361,25 @@ class NDSRVInstVD4DOT<bits<6> funct6, string opcodestr>
   let RVVConstraint = VMConstraint;
 }
 
+class NDSRVInstVBFHCvt<bits<7> funct7, bits<5> vs1, string opcodestr>
+    : RVInst<(outs VR:$vd), (ins VR:$vs2, VMaskOp:$vm),
+             opcodestr, "$vd, $vs2", [], InstFormatR> {
+  bits<5> vs2;
+  bits<5> vd;
+
+  let Inst{31-25} = funct7;
+  let Inst{24-20} = vs2;
+  let Inst{19-15} = vs1;
+  let Inst{14-12} = 0b100;
+  let Inst{11-7} = vd;
+  let Inst{6-0} = OPC_CUSTOM_2.Value;
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+
+  let Uses = [VL, VTYPE];
+}
+
 //===----------------------------------------------------------------------===//
 // Multiclass
 //===----------------------------------------------------------------------===//
@@ -460,6 +479,18 @@ def NDS_LDGP  : NDSRVInstLDGP<0b011, "nds.ldgp">;
 def NDS_SDGP  : NDSRVInstSDGP<0b111, "nds.sdgp">;
 } // Predicates = [HasVendorXAndesPerf, IsRV64]
 
+//===----------------------------------------------------------------------===//
+// XAndesVBFHCvt
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasVendorXAndesVBFHCvt], Constraints = "@earlyclobber $vd",
+    mayRaiseFPException = true in {
+let RVVConstraint = VS2Constraint, DestEEW = EEWSEWx2 in
+def NDS_VFWCVT_S_BF16 : NDSRVInstVBFHCvt<0b0000000, 0b00000, "nds.vfwcvt.s.bf16">;
+let Uses = [FRM, VL, VTYPE] in
+def NDS_VFNCVT_BF16_S : NDSRVInstVBFHCvt<0b0000000, 0b00001, "nds.vfncvt.bf16.s">;
+}
+
 //===----------------------------------------------------------------------===//
 // XAndesVPackFPH
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index cdbf1caff5d8..c5188aa1918b 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -106,6 +106,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisync %s -o - | FileCheck --check-prefix=RV32XQCISYNC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV32XANDESPERF %s
+; RUN: llc -mtriple=riscv32 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV32XANDESVBFHCVT %s
 ; RUN: llc -mtriple=riscv32 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV32XANDESVDOT %s
 ; RUN: llc -mtriple=riscv32 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV32XANDESVPACKFPH %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s
@@ -260,6 +261,7 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV64XTHEADSYNC %s
 ; RUN: llc -mtriple=riscv64 -mattr=+xtheadvdot %s -o - | FileCheck --check-prefixes=CHECK,RV64XTHEADVDOT %s
 ; RUN: llc -mtriple=riscv64 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV64XANDESPERF %s
+; RUN: llc -mtriple=riscv64 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV64XANDESVBFHCVT %s
 ; RUN: llc -mtriple=riscv64 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV64XANDESVDOT %s
 ; RUN: llc -mtriple=riscv64 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV64XANDESVPACKFPH %s
 ; RUN: llc -mtriple=riscv64 -mattr=+za64rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA64RS %s
@@ -457,6 +459,7 @@
 ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2"
 ; RV32XQCISYNC: attribute 5, "rv32i2p1_zca1p0_xqcisync0p3"
 ; RV32XANDESPERF: .attribute 5, "rv32i2p1_xandesperf5p0"
+; RV32XANDESVBFHCVT: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0"
 ; RV32XANDESVDOT: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0"
 ; RV32XANDESVPACKFPH: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfhmin1p0_zvl32b1p0_xandesvpackfph5p0"
 ; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0"
@@ -612,6 +615,7 @@
 ; RV64XTHEADSYNC: .attribute 5, "rv64i2p1_xtheadsync1p0"
 ; RV64XTHEADVDOT: .attribute 5, "rv64i2p1_f2p2_d2p2_v1p0_zicsr2p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xtheadvdot1p0"
 ; RV64XANDESPERF: .attribute 5, "rv64i2p1_xandesperf5p0"
+; RV64XANDESVBFHCVT: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0"
 ; RV64XANDESVDOT: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0"
 ; RV64XANDESVPACKFPH: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfhmin1p0_zvl32b1p0_xandesvpackfph5p0"
 ; RV64ZTSO: .attribute 5, "rv64i2p1_ztso1p0"
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index fab2e9495930..8b931f70aa5c 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -174,6 +174,7 @@
 ; CHECK-NEXT:   ventana-veyron                   - Ventana Veyron-Series processors.
 ; CHECK-NEXT:   vxrm-pipeline-flush              - VXRM writes causes pipeline flush.
 ; CHECK-NEXT:   xandesperf                       - 'XAndesPerf' (Andes Performance Extension).
+; CHECK-NEXT:   xandesvbfhcvt                    - 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension).
 ; CHECK-NEXT:   xandesvdot                       - 'XAndesVDot' (Andes Vector Dot Product Extension).
 ; CHECK-NEXT:   xandesvpackfph                   - 'XAndesVPackFPH' (Andes Vector Packed FP16 Extension).
 ; CHECK-NEXT:   xcvalu                           - 'XCValu' (CORE-V ALU Operations).
diff --git a/llvm/test/MC/RISCV/xandesvbfhcvt-valid.s b/llvm/test/MC/RISCV/xandesvbfhcvt-valid.s
new file mode 100644
index 000000000000..355846719e46
--- /dev/null
+++ b/llvm/test/MC/RISCV/xandesvbfhcvt-valid.s
@@ -0,0 +1,27 @@
+# XAndesVBFHCvt - Andes Vector BFLOAT16 Conversion Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+xandesvbfhcvt -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xandesvbfhcvt < %s \
+# RUN:     | llvm-objdump --mattr=+xandesvbfhcvt -M no-aliases -d -r - \
+# RUN:     | FileCheck -check-prefixes=CHECK-OBJ %s
+# RUN: not llvm-mc -triple=riscv32 -show-encoding %s 2>&1 \
+# RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+xandesvbfhcvt -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
+# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+xandesvbfhcvt < %s \
+# RUN:     | llvm-objdump --mattr=+xandesvbfhcvt -M no-aliases -d -r - \
+# RUN:     | FileCheck -check-prefixes=CHECK-OBJ %s
+# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
+# RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+
+# CHECK-OBJ: nds.vfwcvt.s.bf16 v8, v10
+# CHECK-ASM: nds.vfwcvt.s.bf16 v8, v10
+# CHECK-ASM: encoding: [0x5b,0x44,0xa0,0x00]
+# CHECK-ERROR: instruction requires the following: 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension){{$}}
+nds.vfwcvt.s.bf16 v8, v10
+
+# CHECK-OBJ: nds.vfncvt.bf16.s v8, v10
+# CHECK-ASM: nds.vfncvt.bf16.s v8, v10
+# CHECK-ASM: encoding: [0x5b,0xc4,0xa0,0x00]
+# CHECK-ERROR: instruction requires the following: 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension){{$}}
+nds.vfncvt.bf16.s v8, v10
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index a0910a164ea0..66e335a33a3f 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -1130,6 +1130,7 @@ R"(All available -march extensions for RISC-V
     svpbmt               1.0
     svvptc               1.0
     xandesperf           5.0
+    xandesvbfhcvt        5.0
     xandesvdot           5.0
     xandesvpackfph       5.0
     xcvalu               1.0

From a96a3f1b26baa8e5ee0abbac629f02566b7e9d1c Mon Sep 17 00:00:00 2001
From: Jacob Lalonde <jalalonde@fb.com>
Date: Tue, 17 Jun 2025 18:37:15 -0700
Subject: [PATCH 0751/1322] [lldb][Minidump Parser] Implement a range data
 vector for minidump memory ranges (#136040)

Recently I was debugging a Minidump with a few thousand ranges, and came
across the (now deleted) comment:

```
  // I don't have a sense of how frequently this is called or how many memory
  // ranges a Minidump typically has, so I'm not sure if searching for the
  // appropriate range linearly each time is stupid.  Perhaps we should build
  // an index for faster lookups.
```

blaming this comment, it's 9 years old! Much overdue for this simple fix
with a range data vector.

I had to add a default constructor to Range in order to implement the
RangeDataVector, but otherwise this just a replacement of look up logic.
---
 .../Process/minidump/MinidumpParser.cpp       | 72 ++++++++++---------
 .../Plugins/Process/minidump/MinidumpParser.h | 21 +++++-
 2 files changed, 55 insertions(+), 38 deletions(-)

diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
index 94c0a5f11e43..ef691b77193c 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
@@ -20,8 +20,8 @@
 #include <algorithm>
 #include <map>
 #include <optional>
-#include <vector>
 #include <utility>
+#include <vector>
 
 using namespace lldb_private;
 using namespace minidump;
@@ -75,8 +75,7 @@ UUID MinidumpParser::GetModuleUUID(const minidump::Module *module) {
     if (GetArchitecture().GetTriple().isOSBinFormatELF()) {
       if (pdb70_uuid->Age != 0)
         return UUID(pdb70_uuid, sizeof(*pdb70_uuid));
-      return UUID(&pdb70_uuid->Uuid,
-                                    sizeof(pdb70_uuid->Uuid));
+      return UUID(&pdb70_uuid->Uuid, sizeof(pdb70_uuid->Uuid));
     }
     return UUID(*pdb70_uuid);
   } else if (cv_signature == CvSignature::ElfBuildId)
@@ -429,62 +428,65 @@ MinidumpParser::GetExceptionStreams() {
 
 std::optional<minidump::Range>
 MinidumpParser::FindMemoryRange(lldb::addr_t addr) {
-  Log *log = GetLog(LLDBLog::Modules);
+  if (m_memory_ranges.IsEmpty())
+    PopulateMemoryRanges();
 
+  const MemoryRangeVector::Entry *entry =
+      m_memory_ranges.FindEntryThatContains(addr);
+  if (!entry)
+    return std::nullopt;
+
+  return entry->data;
+}
+
+void MinidumpParser::PopulateMemoryRanges() {
+  Log *log = GetLog(LLDBLog::Modules);
   auto ExpectedMemory = GetMinidumpFile().getMemoryList();
-  if (!ExpectedMemory) {
-    LLDB_LOG_ERROR(log, ExpectedMemory.takeError(),
-                   "Failed to read memory list: {0}");
-  } else {
+  if (ExpectedMemory) {
     for (const auto &memory_desc : *ExpectedMemory) {
       const LocationDescriptor &loc_desc = memory_desc.Memory;
       const lldb::addr_t range_start = memory_desc.StartOfMemoryRange;
       const size_t range_size = loc_desc.DataSize;
-
-      if (loc_desc.RVA + loc_desc.DataSize > GetData().size())
-        return std::nullopt;
-
-      if (range_start <= addr && addr < range_start + range_size) {
-        auto ExpectedSlice = GetMinidumpFile().getRawData(loc_desc);
-        if (!ExpectedSlice) {
-          LLDB_LOG_ERROR(log, ExpectedSlice.takeError(),
-                         "Failed to get memory slice: {0}");
-          return std::nullopt;
-        }
-        return minidump::Range(range_start, *ExpectedSlice);
+      auto ExpectedSlice = GetMinidumpFile().getRawData(loc_desc);
+      if (!ExpectedSlice) {
+        LLDB_LOG_ERROR(log, ExpectedSlice.takeError(),
+                       "Failed to get memory slice: {0}");
+        continue;
       }
+      m_memory_ranges.Append(MemoryRangeVector::Entry(
+          range_start, range_size,
+          minidump::Range(range_start, *ExpectedSlice)));
     }
+  } else {
+    LLDB_LOG_ERROR(log, ExpectedMemory.takeError(),
+                   "Failed to read memory list: {0}");
   }
 
   if (!GetStream(StreamType::Memory64List).empty()) {
     llvm::Error err = llvm::Error::success();
-    for (const auto &memory_desc :  GetMinidumpFile().getMemory64List(err)) {
-      if (memory_desc.first.StartOfMemoryRange <= addr 
-          && addr < memory_desc.first.StartOfMemoryRange + memory_desc.first.DataSize) {
-        return minidump::Range(memory_desc.first.StartOfMemoryRange, memory_desc.second);
-      }
+    for (const auto &memory_desc : GetMinidumpFile().getMemory64List(err)) {
+      m_memory_ranges.Append(MemoryRangeVector::Entry(
+          memory_desc.first.StartOfMemoryRange, memory_desc.first.DataSize,
+          minidump::Range(memory_desc.first.StartOfMemoryRange,
+                          memory_desc.second)));
     }
 
     if (err)
       LLDB_LOG_ERROR(log, std::move(err), "Failed to read memory64 list: {0}");
   }
 
-  return std::nullopt;
+  m_memory_ranges.Sort();
 }
 
 llvm::ArrayRef<uint8_t> MinidumpParser::GetMemory(lldb::addr_t addr,
                                                   size_t size) {
-  // I don't have a sense of how frequently this is called or how many memory
-  // ranges a Minidump typically has, so I'm not sure if searching for the
-  // appropriate range linearly each time is stupid.  Perhaps we should build
-  // an index for faster lookups.
   std::optional<minidump::Range> range = FindMemoryRange(addr);
   if (!range)
     return {};
 
   // There's at least some overlap between the beginning of the desired range
-  // (addr) and the current range.  Figure out where the overlap begins and how
-  // much overlap there is.
+  // (addr) and the current range.  Figure out where the overlap begins and
+  // how much overlap there is.
 
   const size_t offset = addr - range->start;
 
@@ -495,7 +497,8 @@ llvm::ArrayRef<uint8_t> MinidumpParser::GetMemory(lldb::addr_t addr,
   return range->range_ref.slice(offset, overlap);
 }
 
-llvm::iterator_range<FallibleMemory64Iterator> MinidumpParser::GetMemory64Iterator(llvm::Error &err) {
+llvm::iterator_range<FallibleMemory64Iterator>
+MinidumpParser::GetMemory64Iterator(llvm::Error &err) {
   llvm::ErrorAsOutParameter ErrAsOutParam(&err);
   return m_file->getMemory64List(err);
 }
@@ -607,8 +610,7 @@ std::pair<MemoryRegionInfos, bool> MinidumpParser::BuildMemoryRegions() {
   case StreamType::ST:                                                         \
     return #ST
 
-llvm::StringRef
-MinidumpParser::GetStreamTypeAsString(StreamType stream_type) {
+llvm::StringRef MinidumpParser::GetStreamTypeAsString(StreamType stream_type) {
   switch (stream_type) {
     ENUM_TO_CSTR(Unused);
     ENUM_TO_CSTR(ThreadList);
diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
index 2c5e6f19ff9a..14599f8d572a 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
@@ -17,6 +17,7 @@
 #include "lldb/Utility/Status.h"
 #include "lldb/Utility/UUID.h"
 
+#include "lldb/Utility/RangeMap.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringRef.h"
@@ -35,6 +36,9 @@ namespace minidump {
 
 // Describes a range of memory captured in the Minidump
 struct Range {
+  // Default constructor required for range data vector
+  // but unusued.
+  Range() = default;
   lldb::addr_t start; // virtual address of the beginning of the range
   // range_ref - absolute pointer to the first byte of the range and size
   llvm::ArrayRef<uint8_t> range_ref;
@@ -45,9 +49,18 @@ struct Range {
   friend bool operator==(const Range &lhs, const Range &rhs) {
     return lhs.start == rhs.start && lhs.range_ref == rhs.range_ref;
   }
+
+  friend bool operator<(const Range &lhs, const Range &rhs) {
+    if (lhs.start == rhs.start)
+      return lhs.range_ref.size() < rhs.range_ref.size();
+    return lhs.start < rhs.start;
+  }
 };
 
-using FallibleMemory64Iterator = llvm::object::MinidumpFile::FallibleMemory64Iterator;
+using MemoryRangeVector =
+    lldb_private::RangeDataVector<lldb::addr_t, lldb::addr_t, minidump::Range>;
+using FallibleMemory64Iterator =
+    llvm::object::MinidumpFile::FallibleMemory64Iterator;
 using ExceptionStreamsIterator =
     llvm::object::MinidumpFile::ExceptionStreamsIterator;
 
@@ -97,7 +110,8 @@ public:
   /// complete (includes all regions mapped into the process memory).
   std::pair<MemoryRegionInfos, bool> BuildMemoryRegions();
 
-  llvm::iterator_range<FallibleMemory64Iterator> GetMemory64Iterator(llvm::Error &err);
+  llvm::iterator_range<FallibleMemory64Iterator>
+  GetMemory64Iterator(llvm::Error &err);
 
   static llvm::StringRef GetStreamTypeAsString(StreamType stream_type);
 
@@ -109,10 +123,11 @@ public:
 private:
   MinidumpParser(lldb::DataBufferSP data_sp,
                  std::unique_ptr<llvm::object::MinidumpFile> file);
-
+  void PopulateMemoryRanges();
   lldb::DataBufferSP m_data_sp;
   std::unique_ptr<llvm::object::MinidumpFile> m_file;
   ArchSpec m_arch;
+  MemoryRangeVector m_memory_ranges;
 };
 
 } // end namespace minidump

From a2ad65661ad560b04952d4d992248d2db3be36c8 Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Wed, 18 Jun 2025 07:54:08 +0530
Subject: [PATCH 0752/1322] [RISCV] Add patterns for generating QC_CTO and
 QC_CLO (#144532)

These instructions count leading/trailing ones in the register.

Currently these are only generated when we have `Zbb` enabled (along
with `Xqcibm`) since it contains the `CTTZ/CTLZ` instructions.
---
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td |   5 +
 llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll   | 958 ++++++++++++++++++++
 2 files changed, 963 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index b94fee3c6e57..09852c6fd596 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1452,6 +1452,11 @@ def : Pat<(i32 (and GPRNoX0:$rs, 1023)), (QC_EXTU GPRNoX0:$rs, 10, 0)>;
 def : Pat<(i32 (and GPRNoX0:$rs, 2047)), (QC_EXTU GPRNoX0:$rs, 11, 0)>;
 } // Predicates = [HasVendorXqcibm, IsRV32]
 
+let Predicates = [HasVendorXqcibm, HasStdExtZbb, IsRV32] in {
+def: Pat<(i32 (cttz (not (i32 GPR:$rs1)))), (QC_CTO GPR:$rs1)>;
+def: Pat<(i32 (ctlz (not (i32 GPR:$rs1)))), (QC_CLO GPR:$rs1)>;
+} // Predicates = [HasVendorXqcibm, HasStdExtZbb, IsRV32]
+
 let Predicates = [HasVendorXqciint, IsRV32] in
 def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>;
 
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll
new file mode 100644
index 000000000000..fe2bcf00ba7d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll
@@ -0,0 +1,958 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV32I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV32ZBB
+; RUN: llc -mtriple=riscv32 -mattr=+zbb,experimental-xqcibm -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32ZBBXQCIBM
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @test_cttz_i8(i8 %a) nounwind {
+; RV32I-LABEL: test_cttz_i8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    zext.b a1, a0
+; RV32I-NEXT:    beqz a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB0_2:
+; RV32I-NEXT:    li a0, 8
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i8:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    li a1, 256
+; RV32ZBB-NEXT:    orn a0, a1, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i8:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    li a1, 256
+; RV32ZBBXQCIBM-NEXT:    orn a0, a1, a0
+; RV32ZBBXQCIBM-NEXT:    ctz a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i8 %a, -1
+  %tmp = call i8 @llvm.cttz.i8(i8 %1, i1 false)
+  ret i8 %tmp
+}
+
+define i16 @test_cttz_i16(i16 %a) nounwind {
+; RV32I-LABEL: test_cttz_i16:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    beqz a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB1_2:
+; RV32I-NEXT:    li a0, 16
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i16:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    lui a1, 16
+; RV32ZBB-NEXT:    orn a0, a1, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i16:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    lui a1, 16
+; RV32ZBBXQCIBM-NEXT:    orn a0, a1, a0
+; RV32ZBBXQCIBM-NEXT:    ctz a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i16 %a, -1
+  %tmp = call i16 @llvm.cttz.i16(i16 %1, i1 false)
+  ret i16 %tmp
+}
+
+define i32 @test_cttz_i32(i32 %a) nounwind {
+; RV32I-LABEL: test_cttz_i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    beqz a0, .LBB2_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    lui a1, 30667
+; RV32I-NEXT:    addi a1, a1, 1329
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    lui a1, %hi(.LCPI2_0)
+; RV32I-NEXT:    addi a1, a1, %lo(.LCPI2_0)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB2_2:
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i32:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i32 %a, -1
+  %tmp = call i32 @llvm.cttz.i32(i32 %1, i1 false)
+  ret i32 %tmp
+}
+
+define i64 @test_cttz_i64(i64 %a) nounwind {
+; RV32I-LABEL: test_cttz_i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    not s3, a1
+; RV32I-NEXT:    not s2, a0
+; RV32I-NEXT:    or a0, s2, s3
+; RV32I-NEXT:    beqz a0, .LBB3_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    neg a0, s2
+; RV32I-NEXT:    and a0, s2, a0
+; RV32I-NEXT:    lui a1, 30667
+; RV32I-NEXT:    addi s1, a1, 1329
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
+; RV32I-NEXT:    neg a0, s3
+; RV32I-NEXT:    and a0, s3, a0
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    bnez s2, .LBB3_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    add a0, s4, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    j .LBB3_5
+; RV32I-NEXT:  .LBB3_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a0, 64
+; RV32I-NEXT:    j .LBB3_6
+; RV32I-NEXT:  .LBB3_4:
+; RV32I-NEXT:    srli s0, s0, 27
+; RV32I-NEXT:    add s0, s4, s0
+; RV32I-NEXT:    lbu a0, 0(s0)
+; RV32I-NEXT:  .LBB3_5: # %cond.false
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:  .LBB3_6: # %cond.end
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    bnez a0, .LBB3_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    not a0, a1
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    addi a0, a0, 32
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+; RV32ZBB-NEXT:  .LBB3_2:
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i64:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a2, a0
+; RV32ZBBXQCIBM-NEXT:    bnez a2, .LBB3_2
+; RV32ZBBXQCIBM-NEXT:  # %bb.1:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a1
+; RV32ZBBXQCIBM-NEXT:    addi a0, a0, 32
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+; RV32ZBBXQCIBM-NEXT:  .LBB3_2:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i64 %a, -1
+  %tmp = call i64 @llvm.cttz.i64(i64 %1, i1 false)
+  ret i64 %tmp
+}
+
+define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
+; RV32I-LABEL: test_cttz_i8_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a1, a0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i8_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i8_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i8 %a, -1
+  %tmp = call i8 @llvm.cttz.i8(i8 %1, i1 true)
+  ret i8 %tmp
+}
+
+define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
+; RV32I-LABEL: test_cttz_i16_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a1, a0
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    addi a2, a2, 1365
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i16_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i16_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i16 %a, -1
+  %tmp = call i16 @llvm.cttz.i16(i16 %1, i1 true)
+  ret i16 %tmp
+}
+
+define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
+; RV32I-LABEL: test_cttz_i32_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    lui a1, 30667
+; RV32I-NEXT:    addi a1, a1, 1329
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    lui a1, %hi(.LCPI6_0)
+; RV32I-NEXT:    addi a1, a1, %lo(.LCPI6_0)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i32_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i32_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i32 %a, -1
+  %tmp = call i32 @llvm.cttz.i32(i32 %1, i1 true)
+  ret i32 %tmp
+}
+
+define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
+; RV32I-LABEL: test_cttz_i64_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    not s3, a1
+; RV32I-NEXT:    not s4, a0
+; RV32I-NEXT:    neg a0, s4
+; RV32I-NEXT:    and a0, s4, a0
+; RV32I-NEXT:    lui a1, 30667
+; RV32I-NEXT:    addi s1, a1, 1329
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lui s2, %hi(.LCPI7_0)
+; RV32I-NEXT:    addi s2, s2, %lo(.LCPI7_0)
+; RV32I-NEXT:    neg a0, s3
+; RV32I-NEXT:    and a0, s3, a0
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    bnez s4, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    add a0, s2, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    j .LBB7_3
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    srli s0, s0, 27
+; RV32I-NEXT:    add s0, s2, s0
+; RV32I-NEXT:    lbu a0, 0(s0)
+; RV32I-NEXT:  .LBB7_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i64_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    bnez a0, .LBB7_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    not a0, a1
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    addi a0, a0, 32
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+; RV32ZBB-NEXT:  .LBB7_2:
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i64_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a2, a0
+; RV32ZBBXQCIBM-NEXT:    bnez a2, .LBB7_2
+; RV32ZBBXQCIBM-NEXT:  # %bb.1:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a1
+; RV32ZBBXQCIBM-NEXT:    addi a0, a0, 32
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+; RV32ZBBXQCIBM-NEXT:  .LBB7_2:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i64 %a, -1
+  %tmp = call i64 @llvm.cttz.i64(i64 %1, i1 true)
+  ret i64 %tmp
+}
+
+define i8 @test_ctlz_i8(i8 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    zext.b a1, a0
+; RV32I-NEXT:    beqz a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 25
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 26
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB8_2:
+; RV32I-NEXT:    li a0, 8
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i8:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    slli a0, a0, 24
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i8:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    slli a0, a0, 24
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i8 %a, -1
+  %tmp = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+  ret i8 %tmp
+}
+
+define i16 @test_ctlz_i16(i16 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i16:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    beqz a1, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    srli a1, a1, 17
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 18
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 20
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 24
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB9_2:
+; RV32I-NEXT:    li a0, 16
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i16:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    slli a0, a0, 16
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i16:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    slli a0, a0, 16
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i16 %a, -1
+  %tmp = call i16 @llvm.ctlz.i16(i16 %1, i1 false)
+  ret i16 %tmp
+}
+
+define i32 @test_ctlz_i32(i32 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    beqz a0, .LBB10_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, -241
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB10_2:
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i32:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i32 %a, -1
+  %tmp = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+  ret i32 %tmp
+}
+
+define i64 @test_ctlz_i64(i64 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a3, a1
+; RV32I-NEXT:    not a4, a0
+; RV32I-NEXT:    or a0, a4, a3
+; RV32I-NEXT:    beqz a0, .LBB11_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    lui a1, 209715
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a2, a0, 1365
+; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    addi a0, a5, -241
+; RV32I-NEXT:    bnez a3, .LBB11_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
+; RV32I-NEXT:    srli a3, a4, 1
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    srli a4, a3, 2
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 4
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    not a3, a3
+; RV32I-NEXT:    srli a4, a3, 1
+; RV32I-NEXT:    and a2, a4, a2
+; RV32I-NEXT:    sub a3, a3, a2
+; RV32I-NEXT:    and a2, a3, a1
+; RV32I-NEXT:    srli a3, a3, 2
+; RV32I-NEXT:    and a1, a3, a1
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB11_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a0, 64
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB11_4:
+; RV32I-NEXT:    srli a4, a3, 1
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 2
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 4
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    not a3, a3
+; RV32I-NEXT:    srli a4, a3, 1
+; RV32I-NEXT:    and a2, a4, a2
+; RV32I-NEXT:    sub a3, a3, a2
+; RV32I-NEXT:    and a2, a3, a1
+; RV32I-NEXT:    srli a3, a3, 2
+; RV32I-NEXT:    and a1, a3, a1
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a1, a1
+; RV32ZBB-NEXT:    bnez a1, .LBB11_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    addi a0, a0, 32
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+; RV32ZBB-NEXT:  .LBB11_2:
+; RV32ZBB-NEXT:    clz a0, a1
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i64:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a2, a1
+; RV32ZBBXQCIBM-NEXT:    bnez a2, .LBB11_2
+; RV32ZBBXQCIBM-NEXT:  # %bb.1:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    addi a0, a0, 32
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+; RV32ZBBXQCIBM-NEXT:  .LBB11_2:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a1
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i64 %a, -1
+  %tmp = call i64 @llvm.ctlz.i64(i64 %1, i1 false)
+  ret i64 %tmp
+}
+
+define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i8_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 25
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 26
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i8_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    slli a0, a0, 24
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i8_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a0, a0
+; RV32ZBBXQCIBM-NEXT:    slli a0, a0, 24
+; RV32ZBBXQCIBM-NEXT:    clz a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i8 %a, -1
+  %tmp = call i8 @llvm.ctlz.i8(i8 %1, i1 true)
+  ret i8 %tmp
+}
+
+define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i16_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    lui a1, 5
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    addi a1, a1, 1365
+; RV32I-NEXT:    srli a2, a2, 17
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 18
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 20
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 24
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i16_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    slli a0, a0, 16
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i16_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a0, a0
+; RV32ZBBXQCIBM-NEXT:    slli a0, a0, 16
+; RV32ZBBXQCIBM-NEXT:    clz a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i16 %a, -1
+  %tmp = call i16 @llvm.ctlz.i16(i16 %1, i1 true)
+  ret i16 %tmp
+}
+
+define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i32_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    lui a1, 349525
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    addi a1, a1, 1365
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, -241
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i32_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i32_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i32 %a, -1
+  %tmp = call i32 @llvm.ctlz.i32(i32 %1, i1 true)
+  ret i32 %tmp
+}
+
+define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i64_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a4, a1
+; RV32I-NEXT:    lui a1, 349525
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a3, a1, 1365
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    addi a1, a5, -241
+; RV32I-NEXT:    bnez a4, .LBB15_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a4, a0, 1
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 2
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 4
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a4, a0, 1
+; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    sub a0, a0, a3
+; RV32I-NEXT:    and a3, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB15_2:
+; RV32I-NEXT:    srli a0, a4, 1
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srli a4, a0, 2
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 4
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a4, a0, 1
+; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    sub a0, a0, a3
+; RV32I-NEXT:    and a3, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i64_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a1, a1
+; RV32ZBB-NEXT:    bnez a1, .LBB15_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    addi a0, a0, 32
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+; RV32ZBB-NEXT:  .LBB15_2:
+; RV32ZBB-NEXT:    clz a0, a1
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i64_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a2, a1
+; RV32ZBBXQCIBM-NEXT:    bnez a2, .LBB15_2
+; RV32ZBBXQCIBM-NEXT:  # %bb.1:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    addi a0, a0, 32
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+; RV32ZBBXQCIBM-NEXT:  .LBB15_2:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a1
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i64 %a, -1
+  %tmp = call i64 @llvm.ctlz.i64(i64 %1, i1 true)
+  ret i64 %tmp
+}

From e14f327d8094e02134efa98625acaf6fd43fee08 Mon Sep 17 00:00:00 2001
From: Liao Chunyu <chunyu@iscas.ac.cn>
Date: Tue, 17 Jun 2025 23:32:01 -0400
Subject: [PATCH 0753/1322] [RISCV] Pre-test for #144461

---
 llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 26 +++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 371ec7c790dd..522c83fd9fa9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -470,6 +470,28 @@ define <vscale x 2 x i64> @select_nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i6
   ret <vscale x 2 x i64> %v
 }
 
+define <vscale x 2 x i64> @select_nxv2i64_constant_true(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_true:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> splat (i64 -1), <vscale x 2 x i64> %b, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @select_nxv2i64_constant_false(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_false:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 100
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> splat (i64 100), i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
 declare <vscale x 4 x i64> @llvm.vp.select.nxv4i64(<vscale x 4 x i1>, <vscale x 4 x i64>, <vscale x 4 x i64>, i32)
 
 define <vscale x 4 x i64> @select_nxv4i64(<vscale x 4 x i1> %a, <vscale x 4 x i64> %b, <vscale x 4 x i64> %c, i32 zeroext %evl) {
@@ -702,10 +724,10 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
 ; CHECK-NEXT:    and a4, a5, a4
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT:    bltu a2, a1, .LBB48_2
+; CHECK-NEXT:    bltu a2, a1, .LBB50_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:  .LBB48_2:
+; CHECK-NEXT:  .LBB50_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload

From af49a650e172d56d684581b66afa9ab0368ec8f9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 13:23:17 +0900
Subject: [PATCH 0754/1322] PowerPC: Add baseline tests for more f128 libcall
 handling (#144381)

Some of these incorrectly call the l suffixed version of libm
functions and others assert.
---
 llvm/test/CodeGen/PowerPC/f128-arith.ll | 445 ++++++++++++++++++++++++
 1 file changed, 445 insertions(+)

diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index decc4a38f7cc..ffa7ac6cb007 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -1403,3 +1403,448 @@ entry:
   ret fp128 %3
 }
 declare { fp128, i32 } @llvm.frexp.f128.i32(fp128)
+
+
+define dso_local fp128 @acos_f128(fp128 %x) {
+; CHECK-LABEL: acos_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl acosl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: acos_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl acosl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.acos.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @asin_f128(fp128 %x) {
+; CHECK-LABEL: asin_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl asinl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: asin_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl asinl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.asin.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @atan_f128(fp128 %x) {
+; CHECK-LABEL: atan_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl atanl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: atan_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl atanl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.atan.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @atan2_f128(fp128 %x, fp128 %y) {
+; CHECK-LABEL: atan2_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl atan2l
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: atan2_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl atan2l
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.atan2.f128(fp128 %x, fp128 %y)
+  ret fp128 %result
+}
+
+define dso_local fp128 @copysign_f128(fp128 %x, fp128 %y) {
+; CHECK-LABEL: copysign_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xscpsgnqp v2, v3, v2
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: copysign_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    xxswapd vs0, v3
+; CHECK-P8-NEXT:    addi r3, r1, -16
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    addi r3, r1, -32
+; CHECK-P8-NEXT:    stxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    lbz r4, -1(r1)
+; CHECK-P8-NEXT:    lbz r5, -17(r1)
+; CHECK-P8-NEXT:    rlwimi r5, r4, 0, 0, 24
+; CHECK-P8-NEXT:    stb r5, -17(r1)
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.copysign.f128(fp128 %x, fp128 %y)
+  ret fp128 %result
+}
+
+define dso_local fp128 @cosh_f128(fp128 %x) {
+; CHECK-LABEL: cosh_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl coshl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: cosh_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl coshl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.cosh.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @exp10_f128(fp128 %x) {
+; CHECK-LABEL: exp10_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl exp10l
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: exp10_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl exp10l
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.exp10.f128(fp128 %x)
+  ret fp128 %result
+}
+
+; FIXME: Asserts
+; define dso_local fp128 @maximum_f128(fp128 %x, fp128 %y) {
+;   %result = call fp128 @llvm.maximum.f128(fp128 %x, fp128 %y)
+;   ret fp128 %result
+; }
+
+; FIXME: Asserts
+; define dso_local fp128 @minimum_f128(fp128 %x, fp128 %y) {
+;   %result = call fp128 @llvm.minimum.f128(fp128 %x, fp128 %y)
+;   ret fp128 %result
+; }
+
+; FIXME: Asserts
+; define dso_local fp128 @maximumnum_f128(fp128 %x, fp128 %y) {
+;   %result = call fp128 @llvm.maximumnum.f128(fp128 %x, fp128 %y)
+;   ret fp128 %result
+; }
+
+; FIXME: Asserts
+; define dso_local fp128 @minimumnum_f128(fp128 %x, fp128 %y) {
+;   %result = call fp128 @llvm.minimumnum.f128(fp128 %x, fp128 %y)
+;   ret fp128 %result
+; }
+
+define dso_local fp128 @ldexp_f128(fp128 %x, i32 %y) {
+; CHECK-LABEL: ldexp_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    extsw r5, r5
+; CHECK-NEXT:    bl ldexpl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: ldexp_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    extsw r5, r5
+; CHECK-P8-NEXT:    bl ldexpl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.ldexp.f128.i32(fp128 %x, i32 %y)
+  ret fp128 %result
+}
+
+define dso_local { fp128, fp128 } @modf_f128(fp128 %x) {
+; CHECK-LABEL: modf_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -48(r1)
+; CHECK-NEXT:    std r0, 64(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    addi r5, r1, 32
+; CHECK-NEXT:    bl modfl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    lxv v3, 32(r1)
+; CHECK-NEXT:    addi r1, r1, 48
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: modf_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    .cfi_offset r30, -16
+; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT:    stdu r1, -64(r1)
+; CHECK-P8-NEXT:    addi r30, r1, 32
+; CHECK-P8-NEXT:    std r0, 80(r1)
+; CHECK-P8-NEXT:    mr r5, r30
+; CHECK-P8-NEXT:    bl modfl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r30
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    addi r1, r1, 64
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call { fp128, fp128 } @llvm.modf.f128(fp128 %x)
+  ret { fp128, fp128 } %result
+}
+
+define dso_local fp128 @roundeven_f128(fp128 %x) {
+; CHECK-LABEL: roundeven_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl roundevenl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: roundeven_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl roundevenl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.roundeven.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @sinh_f128(fp128 %x) {
+; CHECK-LABEL: sinh_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl sinhl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: sinh_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl sinhl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.sinh.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @tanh_f128(fp128 %x) {
+; CHECK-LABEL: tanh_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl tanhl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: tanh_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl tanhl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.tanh.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @tan_f128(fp128 %x) {
+; CHECK-LABEL: tan_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl tanl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: tan_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl tanl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.tan.f128(fp128 %x)
+  ret fp128 %result
+}

From 7b9d10d2e6410029fd0750b2e0566432dbf03dc7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 13:26:15 +0900
Subject: [PATCH 0755/1322] PowerPC: Fix using long double libm functions for
 f128 intrinsics (#144382)

This wasn't setting the correct libcall names, which default to the
l suffixed libm names.
---
 llvm/lib/IR/RuntimeLibcalls.cpp         | 143 +++++++++++-------------
 llvm/test/CodeGen/PowerPC/f128-arith.ll |  48 ++++----
 2 files changed, 91 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 7396626a03d4..a57b08919346 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -281,6 +281,69 @@ void RuntimeLibcallsInfo::initSoftFloatCmpLibcallPredicates() {
   SoftFloatCompareLibcallPredicates[RTLIB::UO_PPCF128] = CmpInst::ICMP_NE;
 }
 
+static void setLongDoubleIsF128Libm(RuntimeLibcallsInfo &Info,
+                                    bool FiniteOnlyFuncs = false) {
+  Info.setLibcallName(RTLIB::REM_F128, "fmodf128");
+  Info.setLibcallName(RTLIB::FMA_F128, "fmaf128");
+  Info.setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
+  Info.setLibcallName(RTLIB::CBRT_F128, "cbrtf128");
+  Info.setLibcallName(RTLIB::LOG_F128, "logf128");
+  Info.setLibcallName(RTLIB::LOG2_F128, "log2f128");
+  Info.setLibcallName(RTLIB::LOG10_F128, "log10f128");
+  Info.setLibcallName(RTLIB::EXP_F128, "expf128");
+  Info.setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+  Info.setLibcallName(RTLIB::EXP10_F128, "exp10f128");
+  Info.setLibcallName(RTLIB::SIN_F128, "sinf128");
+  Info.setLibcallName(RTLIB::COS_F128, "cosf128");
+  Info.setLibcallName(RTLIB::TAN_F128, "tanf128");
+  Info.setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
+  Info.setLibcallName(RTLIB::ASIN_F128, "asinf128");
+  Info.setLibcallName(RTLIB::ACOS_F128, "acosf128");
+  Info.setLibcallName(RTLIB::ATAN_F128, "atanf128");
+  Info.setLibcallName(RTLIB::ATAN2_F128, "atan2f128");
+  Info.setLibcallName(RTLIB::SINH_F128, "sinhf128");
+  Info.setLibcallName(RTLIB::COSH_F128, "coshf128");
+  Info.setLibcallName(RTLIB::TANH_F128, "tanhf128");
+  Info.setLibcallName(RTLIB::POW_F128, "powf128");
+  Info.setLibcallName(RTLIB::CEIL_F128, "ceilf128");
+  Info.setLibcallName(RTLIB::TRUNC_F128, "truncf128");
+  Info.setLibcallName(RTLIB::RINT_F128, "rintf128");
+  Info.setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
+  Info.setLibcallName(RTLIB::ROUND_F128, "roundf128");
+  Info.setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128");
+  Info.setLibcallName(RTLIB::FLOOR_F128, "floorf128");
+  Info.setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128");
+  Info.setLibcallName(RTLIB::FMIN_F128, "fminf128");
+  Info.setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+  Info.setLibcallName(RTLIB::FMINIMUM_F128, "fminimumf128");
+  Info.setLibcallName(RTLIB::FMAXIMUM_F128, "fmaximumf128");
+  Info.setLibcallName(RTLIB::FMINIMUM_NUM_F128, "fminimum_numf128");
+  Info.setLibcallName(RTLIB::FMAXIMUM_NUM_F128, "fmaximum_numf128");
+  Info.setLibcallName(RTLIB::LROUND_F128, "lroundf128");
+  Info.setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
+  Info.setLibcallName(RTLIB::LRINT_F128, "lrintf128");
+  Info.setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
+  Info.setLibcallName(RTLIB::LDEXP_F128, "ldexpf128");
+  Info.setLibcallName(RTLIB::FREXP_F128, "frexpf128");
+  Info.setLibcallName(RTLIB::MODF_F128, "modff128");
+
+  if (FiniteOnlyFuncs) {
+    Info.setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite");
+    Info.setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite");
+    Info.setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite");
+    Info.setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite");
+    Info.setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite");
+    Info.setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite");
+  } else {
+    Info.setLibcallName(RTLIB::LOG_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::LOG2_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::LOG10_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::EXP_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::EXP2_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::POW_FINITE_F128, nullptr);
+  }
+}
+
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
@@ -295,57 +358,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
 #undef LIBCALL_NO_NAME
 
   // Use the f128 variants of math functions on x86
-  if (TT.isX86() && TT.isGNUEnvironment()) {
-    setLibcallName(RTLIB::REM_F128, "fmodf128");
-    setLibcallName(RTLIB::FMA_F128, "fmaf128");
-    setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
-    setLibcallName(RTLIB::CBRT_F128, "cbrtf128");
-    setLibcallName(RTLIB::LOG_F128, "logf128");
-    setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite");
-    setLibcallName(RTLIB::LOG2_F128, "log2f128");
-    setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite");
-    setLibcallName(RTLIB::LOG10_F128, "log10f128");
-    setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite");
-    setLibcallName(RTLIB::EXP_F128, "expf128");
-    setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite");
-    setLibcallName(RTLIB::EXP2_F128, "exp2f128");
-    setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite");
-    setLibcallName(RTLIB::EXP10_F128, "exp10f128");
-    setLibcallName(RTLIB::SIN_F128, "sinf128");
-    setLibcallName(RTLIB::COS_F128, "cosf128");
-    setLibcallName(RTLIB::TAN_F128, "tanf128");
-    setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
-    setLibcallName(RTLIB::ASIN_F128, "asinf128");
-    setLibcallName(RTLIB::ACOS_F128, "acosf128");
-    setLibcallName(RTLIB::ATAN_F128, "atanf128");
-    setLibcallName(RTLIB::ATAN2_F128, "atan2f128");
-    setLibcallName(RTLIB::SINH_F128, "sinhf128");
-    setLibcallName(RTLIB::COSH_F128, "coshf128");
-    setLibcallName(RTLIB::TANH_F128, "tanhf128");
-    setLibcallName(RTLIB::POW_F128, "powf128");
-    setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite");
-    setLibcallName(RTLIB::CEIL_F128, "ceilf128");
-    setLibcallName(RTLIB::TRUNC_F128, "truncf128");
-    setLibcallName(RTLIB::RINT_F128, "rintf128");
-    setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
-    setLibcallName(RTLIB::ROUND_F128, "roundf128");
-    setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128");
-    setLibcallName(RTLIB::FLOOR_F128, "floorf128");
-    setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128");
-    setLibcallName(RTLIB::FMIN_F128, "fminf128");
-    setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
-    setLibcallName(RTLIB::FMINIMUM_F128, "fminimumf128");
-    setLibcallName(RTLIB::FMAXIMUM_F128, "fmaximumf128");
-    setLibcallName(RTLIB::FMINIMUM_NUM_F128, "fminimum_numf128");
-    setLibcallName(RTLIB::FMAXIMUM_NUM_F128, "fmaximum_numf128");
-    setLibcallName(RTLIB::LROUND_F128, "lroundf128");
-    setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
-    setLibcallName(RTLIB::LRINT_F128, "lrintf128");
-    setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
-    setLibcallName(RTLIB::LDEXP_F128, "ldexpf128");
-    setLibcallName(RTLIB::FREXP_F128, "frexpf128");
-    setLibcallName(RTLIB::MODF_F128, "modff128");
-  }
+  if (TT.isX86() && TT.isGNUEnvironment())
+    setLongDoubleIsF128Libm(*this, /*FiniteOnlyFuncs=*/true);
 
   // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf".
   if (TT.isPPC()) {
@@ -379,31 +393,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     setLibcallName(RTLIB::OGT_F128, "__gtkf2");
     setLibcallName(RTLIB::UO_F128, "__unordkf2");
 
-    setLibcallName(RTLIB::LOG_F128, "logf128");
-    setLibcallName(RTLIB::LOG2_F128, "log2f128");
-    setLibcallName(RTLIB::LOG10_F128, "log10f128");
-    setLibcallName(RTLIB::EXP_F128, "expf128");
-    setLibcallName(RTLIB::EXP2_F128, "exp2f128");
-    setLibcallName(RTLIB::SIN_F128, "sinf128");
-    setLibcallName(RTLIB::COS_F128, "cosf128");
-    setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
-    setLibcallName(RTLIB::POW_F128, "powf128");
-    setLibcallName(RTLIB::FMIN_F128, "fminf128");
-    setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
-    setLibcallName(RTLIB::REM_F128, "fmodf128");
-    setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
-    setLibcallName(RTLIB::CEIL_F128, "ceilf128");
-    setLibcallName(RTLIB::FLOOR_F128, "floorf128");
-    setLibcallName(RTLIB::TRUNC_F128, "truncf128");
-    setLibcallName(RTLIB::ROUND_F128, "roundf128");
-    setLibcallName(RTLIB::LROUND_F128, "lroundf128");
-    setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
-    setLibcallName(RTLIB::RINT_F128, "rintf128");
-    setLibcallName(RTLIB::LRINT_F128, "lrintf128");
-    setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
-    setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
-    setLibcallName(RTLIB::FMA_F128, "fmaf128");
-    setLibcallName(RTLIB::FREXP_F128, "frexpf128");
+    // TODO: Do the finite only functions exist?
+    setLongDoubleIsF128Libm(*this, /*FiniteOnlyFuncs=*/false);
 
     if (TT.isOSAIX()) {
       bool isPPC64 = TT.isPPC64();
diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index ffa7ac6cb007..f9c953d483ff 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -1413,7 +1413,7 @@ define dso_local fp128 @acos_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl acosl
+; CHECK-NEXT:    bl acosf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1427,7 +1427,7 @@ define dso_local fp128 @acos_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl acosl
+; CHECK-P8-NEXT:    bl acosf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1445,7 +1445,7 @@ define dso_local fp128 @asin_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl asinl
+; CHECK-NEXT:    bl asinf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1459,7 +1459,7 @@ define dso_local fp128 @asin_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl asinl
+; CHECK-P8-NEXT:    bl asinf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1477,7 +1477,7 @@ define dso_local fp128 @atan_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl atanl
+; CHECK-NEXT:    bl atanf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1491,7 +1491,7 @@ define dso_local fp128 @atan_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl atanl
+; CHECK-P8-NEXT:    bl atanf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1509,7 +1509,7 @@ define dso_local fp128 @atan2_f128(fp128 %x, fp128 %y) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl atan2l
+; CHECK-NEXT:    bl atan2f128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1523,7 +1523,7 @@ define dso_local fp128 @atan2_f128(fp128 %x, fp128 %y) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl atan2l
+; CHECK-P8-NEXT:    bl atan2f128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1566,7 +1566,7 @@ define dso_local fp128 @cosh_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl coshl
+; CHECK-NEXT:    bl coshf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1580,7 +1580,7 @@ define dso_local fp128 @cosh_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl coshl
+; CHECK-P8-NEXT:    bl coshf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1598,7 +1598,7 @@ define dso_local fp128 @exp10_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl exp10l
+; CHECK-NEXT:    bl exp10f128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1612,7 +1612,7 @@ define dso_local fp128 @exp10_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl exp10l
+; CHECK-P8-NEXT:    bl exp10f128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1655,7 +1655,7 @@ define dso_local fp128 @ldexp_f128(fp128 %x, i32 %y) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
 ; CHECK-NEXT:    extsw r5, r5
-; CHECK-NEXT:    bl ldexpl
+; CHECK-NEXT:    bl ldexpf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1670,7 +1670,7 @@ define dso_local fp128 @ldexp_f128(fp128 %x, i32 %y) {
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    extsw r5, r5
-; CHECK-P8-NEXT:    bl ldexpl
+; CHECK-P8-NEXT:    bl ldexpf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1689,7 +1689,7 @@ define dso_local { fp128, fp128 } @modf_f128(fp128 %x) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    .cfi_offset lr, 16
 ; CHECK-NEXT:    addi r5, r1, 32
-; CHECK-NEXT:    bl modfl
+; CHECK-NEXT:    bl modff128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    lxv v3, 32(r1)
 ; CHECK-NEXT:    addi r1, r1, 48
@@ -1708,7 +1708,7 @@ define dso_local { fp128, fp128 } @modf_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    addi r30, r1, 32
 ; CHECK-P8-NEXT:    std r0, 80(r1)
 ; CHECK-P8-NEXT:    mr r5, r30
-; CHECK-P8-NEXT:    bl modfl
+; CHECK-P8-NEXT:    bl modff128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r30
 ; CHECK-P8-NEXT:    xxswapd v3, vs0
@@ -1729,7 +1729,7 @@ define dso_local fp128 @roundeven_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl roundevenl
+; CHECK-NEXT:    bl roundevenf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1743,7 +1743,7 @@ define dso_local fp128 @roundeven_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl roundevenl
+; CHECK-P8-NEXT:    bl roundevenf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1761,7 +1761,7 @@ define dso_local fp128 @sinh_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl sinhl
+; CHECK-NEXT:    bl sinhf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1775,7 +1775,7 @@ define dso_local fp128 @sinh_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl sinhl
+; CHECK-P8-NEXT:    bl sinhf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1793,7 +1793,7 @@ define dso_local fp128 @tanh_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl tanhl
+; CHECK-NEXT:    bl tanhf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1807,7 +1807,7 @@ define dso_local fp128 @tanh_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl tanhl
+; CHECK-P8-NEXT:    bl tanhf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1825,7 +1825,7 @@ define dso_local fp128 @tan_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl tanl
+; CHECK-NEXT:    bl tanf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1839,7 +1839,7 @@ define dso_local fp128 @tan_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl tanl
+; CHECK-P8-NEXT:    bl tanf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)

From ad9e591fd53f2cf91a2744973b59669d873658af Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 17 Jun 2025 21:33:50 -0700
Subject: [PATCH 0756/1322] [SelectionDAG][RISCV] Fold (add (vscale * C0),
 (vscale * C1)) to (vscale * (C0 + C1)) in getNode. (#144565)

We already have shl/mul vscale related folds in getNode.

This is an alternative to the DAGCombine proposed in #144507.
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   7 +
 .../CodeGen/RISCV/rvv/extract-subvector.ll    |   4 -
 .../CodeGen/RISCV/rvv/insert-subvector.ll     |  32 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll    | 283 +++---
 llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll       |  12 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll   |   9 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll  |   3 +-
 .../RISCV/rvv/undef-earlyclobber-chain.ll     |   2 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    | 184 ++--
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |  18 -
 .../RISCV/rvv/vector-interleave-store.ll      |   9 +-
 .../CodeGen/RISCV/rvv/vector-interleave.ll    | 817 ++++++++----------
 llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll  |  12 +-
 .../RISCV/rvv/vp-vector-interleaved-access.ll | 100 +--
 14 files changed, 682 insertions(+), 810 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 45a37622a531..b0e3f534e2aa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7377,6 +7377,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if ((Opcode == ISD::ADD || Opcode == ISD::SUB) &&
         VT.getScalarType() == MVT::i1)
       return getNode(ISD::XOR, DL, VT, N1, N2);
+    // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)).
+    if (Opcode == ISD::ADD && N1.getOpcode() == ISD::VSCALE &&
+        N2.getOpcode() == ISD::VSCALE) {
+      const APInt &C1 = N1->getConstantOperandAPInt(0);
+      const APInt &C2 = N2->getConstantOperandAPInt(0);
+      return getVScale(DL, VT, C1 + C2);
+    }
     break;
   case ISD::MUL:
     assert(VT.isInteger() && "This operator does not apply to FP types!");
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index d42c42c7ce03..7c9a283dd54b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -488,8 +488,6 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v13, v10, a0
 ; CHECK-NEXT:    vslidedown.vx v12, v9, a0
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v12, v10, a0
 ; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    ret
@@ -543,8 +541,6 @@ define <vscale x 6 x bfloat> @extract_nxv6bf16_nxv12bf16_6(<vscale x 12 x bfloat
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v13, v10, a0
 ; CHECK-NEXT:    vslidedown.vx v12, v9, a0
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v12, v10, a0
 ; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index 61cf1f56aee3..22ddd4f8a95d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -81,8 +81,7 @@ define <vscale x 4 x i8> @insert_nxv1i8_nxv4i8_3(<vscale x 4 x i8> %vec, <vscale
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vector.insert.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
@@ -246,8 +245,7 @@ define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
@@ -282,8 +280,8 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_1(<vscale x 16 x i8> %vec, <vsc
 ; CHECK-LABEL: insert_nxv16i8_nxv1i8_1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
@@ -310,11 +308,11 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_3(<vscale x 16 x i8> %vec, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    srli a2, a0, 2
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
   ret <vscale x 16 x i8> %v
@@ -363,8 +361,7 @@ define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_2(<vscale x 32 x half> %vec
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 2)
@@ -376,8 +373,7 @@ define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_26(<vscale x 32 x half> %ve
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v14, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 26)
@@ -422,8 +418,8 @@ define <vscale x 32 x i1> @insert_nxv32i1_nxv8i1_8(<vscale x 32 x i1> %v, <vscal
 ; CHECK-LABEL: insert_nxv32i1_nxv8i1_8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v0, v8, a0
 ; CHECK-NEXT:    ret
@@ -570,8 +566,7 @@ define <vscale x 32 x bfloat> @insert_nxv32bf16_nxv2bf16_2(<vscale x 32 x bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vector.insert.nxv2bf16.nxv32bf16(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec, i64 2)
@@ -583,8 +578,7 @@ define <vscale x 32 x bfloat> @insert_nxv32bf16_nxv2bf16_26(<vscale x 32 x bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v14, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vector.insert.nxv2bf16.nxv32bf16(<vscale x 32 x bfloat> %vec, <vscale x 2 x bfloat> %subvec, i64 26)
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 28b27bb75f21..9972df97ad9f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1371,6 +1371,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
@@ -1378,9 +1380,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv8r.v v0, v16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
@@ -1406,6 +1407,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    slli t0, t0, 1
 ; CHECK-NEXT:    mv t1, t0
 ; CHECK-NEXT:    slli t0, t0, 2
+; CHECK-NEXT:    add t1, t1, t0
+; CHECK-NEXT:    slli t0, t0, 1
 ; CHECK-NEXT:    add t0, t0, t1
 ; CHECK-NEXT:    add t0, sp, t0
 ; CHECK-NEXT:    addi t0, t0, 16
@@ -1413,9 +1416,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    vslidedown.vx v16, v8, a1
 ; CHECK-NEXT:    vl8re16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    mv t0, a0
 ; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add t0, t0, a0
+; CHECK-NEXT:    mv t0, a0
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add a0, a0, t0
 ; CHECK-NEXT:    add a0, sp, a0
@@ -1445,10 +1447,6 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v5, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1457,85 +1455,95 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v6, v24, v16, v0.t
-; CHECK-NEXT:    add a0, a3, a3
+; CHECK-NEXT:    vmfeq.vv v7, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a2, a5, .LBB85_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a5
 ; CHECK-NEXT:  .LBB85_4:
-; CHECK-NEXT:    sub a5, a2, a4
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    mv a7, a6
-; CHECK-NEXT:    slli a6, a6, 2
-; CHECK-NEXT:    add a6, a6, a7
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl1r.v v7, (a6) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a3
-; CHECK-NEXT:    sltu a6, a2, a5
-; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a5, a6, a5
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    mv a7, a6
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    add a7, a7, a6
-; CHECK-NEXT:    slli a6, a6, 3
-; CHECK-NEXT:    add a6, a6, a7
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a5, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    sub a0, a2, a4
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    mv a6, a5
 ; CHECK-NEXT:    slli a5, a5, 1
-; CHECK-NEXT:    add a6, a6, a5
+; CHECK-NEXT:    mv a6, a5
 ; CHECK-NEXT:    slli a5, a5, 2
+; CHECK-NEXT:    add a6, a6, a5
+; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    add a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a3
+; CHECK-NEXT:    sltu a5, a2, a0
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a0, a5, a0
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 1
+; CHECK-NEXT:    mv a6, a5
+; CHECK-NEXT:    slli a5, a5, 3
+; CHECK-NEXT:    add a5, a5, a6
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    mv a5, a0
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a5
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v4, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v6, v5, a3
+; CHECK-NEXT:    vmfeq.vv v10, v16, v24, v0.t
+; CHECK-NEXT:    vmv1r.v v9, v7
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v5, a3
 ; CHECK-NEXT:    bltu a2, a4, .LBB85_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a4
 ; CHECK-NEXT:  .LBB85_6:
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a5, a5, a4
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    add a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    mv a4, a0
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a4, a4, a2
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add a2, a2, a0
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v4, a3
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v6, a1
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a3
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a1
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    mv a1, a0
@@ -3546,8 +3554,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFH-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v16, v24, v8, v0.t
-; ZVFH-NEXT:    add a0, a1, a1
-; ZVFH-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; ZVFH-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; ZVFH-NEXT:    vslideup.vx v16, v6, a1
 ; ZVFH-NEXT:    vmv.v.v v0, v16
 ; ZVFH-NEXT:    csrr a0, vlenb
@@ -3576,6 +3583,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 2
+; ZVFHMIN-NEXT:    add a3, a3, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
@@ -3583,9 +3592,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a3, a3, a1
+; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, a1, a3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
@@ -3611,6 +3619,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli t0, t0, 1
 ; ZVFHMIN-NEXT:    mv t1, t0
 ; ZVFHMIN-NEXT:    slli t0, t0, 2
+; ZVFHMIN-NEXT:    add t1, t1, t0
+; ZVFHMIN-NEXT:    slli t0, t0, 1
 ; ZVFHMIN-NEXT:    add t0, t0, t1
 ; ZVFHMIN-NEXT:    add t0, sp, t0
 ; ZVFHMIN-NEXT:    addi t0, t0, 16
@@ -3618,9 +3628,8 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vslidedown.vx v16, v8, a1
 ; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv t0, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add t0, t0, a0
+; ZVFHMIN-NEXT:    mv t0, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 2
 ; ZVFHMIN-NEXT:    add a0, a0, t0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -3650,10 +3659,6 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v5, v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
@@ -3662,85 +3667,95 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v6, v24, v16, v0.t
-; ZVFHMIN-NEXT:    add a0, a3, a3
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
 ; ZVFHMIN-NEXT:  # %bb.3:
 ; ZVFHMIN-NEXT:    mv a2, a5
 ; ZVFHMIN-NEXT:  .LBB171_4:
-; ZVFHMIN-NEXT:    sub a5, a2, a4
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a6, a6, 1
-; ZVFHMIN-NEXT:    mv a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 2
-; ZVFHMIN-NEXT:    add a6, a6, a7
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl1r.v v7, (a6) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT:    sltu a6, a2, a5
-; ZVFHMIN-NEXT:    addi a6, a6, -1
-; ZVFHMIN-NEXT:    and a5, a6, a5
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    mv a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 1
-; ZVFHMIN-NEXT:    add a7, a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 3
-; ZVFHMIN-NEXT:    add a6, a6, a7
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a5, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    sub a0, a2, a4
 ; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    mv a6, a5
 ; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    add a6, a6, a5
+; ZVFHMIN-NEXT:    mv a6, a5
 ; ZVFHMIN-NEXT:    slli a5, a5, 2
+; ZVFHMIN-NEXT:    add a6, a6, a5
+; ZVFHMIN-NEXT:    slli a5, a5, 1
 ; ZVFHMIN-NEXT:    add a5, a5, a6
 ; ZVFHMIN-NEXT:    add a5, sp, a5
 ; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
+; ZVFHMIN-NEXT:    sltu a5, a2, a0
+; ZVFHMIN-NEXT:    addi a5, a5, -1
+; ZVFHMIN-NEXT:    and a0, a5, a0
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a5, a5, 1
+; ZVFHMIN-NEXT:    mv a6, a5
+; ZVFHMIN-NEXT:    slli a5, a5, 3
+; ZVFHMIN-NEXT:    add a5, a5, a6
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    mv a5, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a5
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v4, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v6, v5, a3
+; ZVFHMIN-NEXT:    vmfeq.vv v10, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v9, v7
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslideup.vx v9, v5, a3
 ; ZVFHMIN-NEXT:    bltu a2, a4, .LBB171_6
 ; ZVFHMIN-NEXT:  # %bb.5:
 ; ZVFHMIN-NEXT:    mv a2, a4
 ; ZVFHMIN-NEXT:  .LBB171_6:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a5, a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, a4, a5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    mv a4, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, a0, a4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a4, a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 2
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a2, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v4, a3
-; ZVFHMIN-NEXT:    add a0, a1, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v6, a1
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslideup.vx v8, v10, a3
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; ZVFHMIN-NEXT:    vslideup.vx v8, v9, a1
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    mv a1, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
index ae868fed68ca..ff923efe8eb4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
@@ -4280,8 +4280,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; RV32-NEXT:    vmfeq.vf v24, v16, fa5
 ; RV32-NEXT:    vmfeq.vf v0, v8, fa5
 ; RV32-NEXT:    srli a0, a0, 3
-; RV32-NEXT:    add a1, a0, a0
-; RV32-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslideup.vx v0, v24, a0
 ; RV32-NEXT:    ret
 ;
@@ -4293,8 +4292,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; RV64-NEXT:    vmfeq.vf v24, v16, fa5
 ; RV64-NEXT:    vmfeq.vf v0, v8, fa5
 ; RV64-NEXT:    srli a0, a0, 3
-; RV64-NEXT:    add a1, a0, a0
-; RV64-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslideup.vx v0, v24, a0
 ; RV64-NEXT:    ret
 ;
@@ -4306,8 +4304,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; ZVFHMIN32-NEXT:    vmfeq.vf v24, v16, fa5
 ; ZVFHMIN32-NEXT:    vmfeq.vf v0, v8, fa5
 ; ZVFHMIN32-NEXT:    srli a0, a0, 3
-; ZVFHMIN32-NEXT:    add a1, a0, a0
-; ZVFHMIN32-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; ZVFHMIN32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; ZVFHMIN32-NEXT:    vslideup.vx v0, v24, a0
 ; ZVFHMIN32-NEXT:    ret
 ;
@@ -4319,8 +4316,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; ZVFHMIN64-NEXT:    vmfeq.vf v24, v16, fa5
 ; ZVFHMIN64-NEXT:    vmfeq.vf v0, v8, fa5
 ; ZVFHMIN64-NEXT:    srli a0, a0, 3
-; ZVFHMIN64-NEXT:    add a1, a0, a0
-; ZVFHMIN64-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; ZVFHMIN64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; ZVFHMIN64-NEXT:    vslideup.vx v0, v24, a0
 ; ZVFHMIN64-NEXT:    ret
   %vc = fcmp oeq <vscale x 16 x double> %va, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index ef560a7631de..13c63d9c80a9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -2246,8 +2246,7 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v16, v6, a1
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -2283,8 +2282,7 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    add a0, a2, a2
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v16, v25, a2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    ret
@@ -2316,8 +2314,7 @@ define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    add a0, a2, a2
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v16, v25, a2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
index bd3c29b0c6ef..a85b471530cc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
@@ -3001,9 +3001,8 @@ define <vscale x 16 x i1> @icmp_eq_vi_nx16i64(<vscale x 16 x i64> %va) {
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vi v24, v16, 0
 ; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v24, a0
 ; CHECK-NEXT:    ret
   %vc = icmp eq <vscale x 16 x i64> %va, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
index c9f9a7973300..790cd56ee952 100644
--- a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
@@ -48,10 +48,10 @@ define internal void @SubRegLivenessUndefInPhi(i64 %cond) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    vadd.vi v10, v9, 1
 ; CHECK-NEXT:    vadd.vi v11, v9, 3
-; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    vslideup.vx v12, v10, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index baace6d26f14..4753ab915bdf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -191,8 +191,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 4
 ; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v10
@@ -222,8 +221,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave4_v2i32_
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 2
 ; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v12, v10, a0
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -254,15 +252,13 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle
 ; CHECK-NEXT:    vslidedown.vi v14, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 8
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v13, v12, a1
-; CHECK-NEXT:    vslideup.vx v8, v14, a1
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v13, a0
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v13, v12, a0
+; CHECK-NEXT:    vslideup.vx v8, v14, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v13, a1
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v10
 ; CHECK-NEXT:    vs2r.v v8, (a0)
@@ -292,16 +288,14 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vecto
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 10
 ; CHECK-NEXT:    vslidedown.vi v12, v8, 8
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    add a3, a0, a0
-; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v15, v14, a1
-; CHECK-NEXT:    vslideup.vx v8, v16, a1
-; CHECK-NEXT:    vslideup.vx v12, v10, a1
-; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v15, a0
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v15, v14, a0
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    vslideup.vx v12, v10, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v15, a1
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v12
 ; CHECK-NEXT:    vs2r.v v8, (a0)
@@ -330,22 +324,19 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
 ; CHECK-NEXT:    vslidedown.vi v12, v8, 2
 ; CHECK-NEXT:    vslidedown.vi v13, v8, 4
 ; CHECK-NEXT:    vslidedown.vi v14, v8, 6
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a2, a0, 2
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a2, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    add a3, a1, a1
-; CHECK-NEXT:    add a4, a2, a1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vx v10, v9, a2
+; CHECK-NEXT:    add a3, a1, a2
+; CHECK-NEXT:    vslideup.vx v8, v12, a2
 ; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v10, v9, a1
-; CHECK-NEXT:    vslideup.vx v8, v12, a1
-; CHECK-NEXT:    add a3, a0, a0
-; CHECK-NEXT:    add a1, a4, a1
-; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v10, v11, a2
-; CHECK-NEXT:    vslideup.vx v8, v13, a2
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v14, a4
-; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v11, a1
+; CHECK-NEXT:    vslideup.vx v8, v13, a1
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v14, a3
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs1r.v v8, (a0)
@@ -374,23 +365,20 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2
 ; CHECK-NEXT:    vslidedown.vi v13, v8, 2
 ; CHECK-NEXT:    vslidedown.vi v14, v8, 4
 ; CHECK-NEXT:    vslidedown.vi v15, v8, 6
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a2, a0, 2
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a2, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    add a3, a1, a1
-; CHECK-NEXT:    add a4, a2, a1
-; CHECK-NEXT:    add a5, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vx v10, v9, a2
+; CHECK-NEXT:    add a3, a1, a2
+; CHECK-NEXT:    vslideup.vx v8, v13, a2
 ; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v10, v9, a1
-; CHECK-NEXT:    vslideup.vx v8, v13, a1
-; CHECK-NEXT:    add a1, a4, a1
-; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v10, v11, a2
-; CHECK-NEXT:    vslideup.vx v8, v14, a2
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v10, v12, a4
-; CHECK-NEXT:    vslideup.vx v8, v15, a4
-; CHECK-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v11, a1
+; CHECK-NEXT:    vslideup.vx v8, v14, a1
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v12, a3
+; CHECK-NEXT:    vslideup.vx v8, v15, a3
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs1r.v v8, (a0)
@@ -551,8 +539,7 @@ define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v6f32_v2f32
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 4
 ; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v10
@@ -586,8 +573,7 @@ define {<2 x float>, <2 x float>, <2 x float>, <2 x float>} @vector_deinterleave
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 2
 ; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v12, v10, a0
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -622,15 +608,13 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_dein
 ; CHECK-NEXT:    vslidedown.vi v14, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 8
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v13, v12, a1
-; CHECK-NEXT:    vslideup.vx v8, v14, a1
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v13, a0
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v13, v12, a0
+; CHECK-NEXT:    vslideup.vx v8, v14, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v13, a1
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v10
 ; CHECK-NEXT:    vs2r.v v8, (a0)
@@ -664,16 +648,14 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>}
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 10
 ; CHECK-NEXT:    vslidedown.vi v12, v8, 8
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    add a3, a0, a0
-; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v15, v14, a1
-; CHECK-NEXT:    vslideup.vx v8, v16, a1
-; CHECK-NEXT:    vslideup.vx v12, v10, a1
-; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v15, a0
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v15, v14, a0
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    vslideup.vx v12, v10, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v15, a1
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v12
 ; CHECK-NEXT:    vs2r.v v8, (a0)
@@ -707,21 +689,18 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>,
 ; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    vslidedown.vi v13, v8, 5
 ; CHECK-NEXT:    vslidedown.vi v14, v8, 6
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    add a3, a0, a0
-; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v11, v9, a1
-; CHECK-NEXT:    vslideup.vx v10, v12, a1
-; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v10, v11, a0
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v9, a0
+; CHECK-NEXT:    vslideup.vx v10, v12, a0
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v11, a1
 ; CHECK-NEXT:    vslidedown.vi v11, v8, 4
-; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v11, v13, a1
-; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v11, v14, a0
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v13, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v14, a1
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs2r.v v10, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
@@ -751,25 +730,22 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>,
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 7
 ; CHECK-NEXT:    vslidedown.vi v11, v8, 6
 ; CHECK-NEXT:    vslidedown.vi v12, v8, 5
-; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 4
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    add a3, a0, a0
-; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v11, v10, a1
-; CHECK-NEXT:    vslideup.vx v9, v12, a1
-; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v11, a0
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v10, a0
+; CHECK-NEXT:    vslideup.vx v9, v12, a0
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v11, a1
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 3
 ; CHECK-NEXT:    vslidedown.vi v11, v8, 2
 ; CHECK-NEXT:    vslidedown.vi v12, v8, 1
-; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v11, v10, a1
-; CHECK-NEXT:    vslideup.vx v8, v12, a1
-; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v10, a0
+; CHECK-NEXT:    vslideup.vx v8, v12, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v11, a1
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 75f92c86ff09..6144f916ea52 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -2705,16 +2705,10 @@ define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v9, a0
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v11, a0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v8, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v11, a0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v10, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v11, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs4r.v v8, (a0)
@@ -2801,16 +2795,10 @@ define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vs
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v9, a0
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v11, a0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v8, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v11, a0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v10, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v11, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs4r.v v8, (a0)
@@ -2897,16 +2885,10 @@ define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscal
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v9, a0
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v11, a0
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v8, a0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v11, a0
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v10, a0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v11, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs4r.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index 3751967f18aa..a5811e697634 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -14,18 +14,17 @@ define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vsc
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    li a1, -1
-; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
-; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    vwaddu.vv v8, v14, v12
 ; CHECK-NEXT:    vwmaccu.vx v8, a1, v12
+; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vmsne.vi v12, v10, 0
 ; CHECK-NEXT:    vmsne.vi v10, v8, 0
-; CHECK-NEXT:    add a1, a2, a2
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v10, v12, a2
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v12, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsm.v v10, (a0)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index e297e88c71f1..01cc5c58b24c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -17,18 +17,17 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ; V-NEXT:    vmv1r.v v0, v8
 ; V-NEXT:    vmv.v.i v10, 0
 ; V-NEXT:    li a0, -1
-; V-NEXT:    csrr a1, vlenb
 ; V-NEXT:    vmerge.vim v12, v10, 1, v0
 ; V-NEXT:    vmv1r.v v0, v9
 ; V-NEXT:    vmerge.vim v14, v10, 1, v0
-; V-NEXT:    srli a1, a1, 2
 ; V-NEXT:    vwaddu.vv v8, v14, v12
 ; V-NEXT:    vwmaccu.vx v8, a0, v12
+; V-NEXT:    csrr a0, vlenb
 ; V-NEXT:    vmsne.vi v12, v10, 0
 ; V-NEXT:    vmsne.vi v0, v8, 0
-; V-NEXT:    add a0, a1, a1
-; V-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; V-NEXT:    vslideup.vx v0, v12, a1
+; V-NEXT:    srli a0, a0, 2
+; V-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; V-NEXT:    vslideup.vx v0, v12, a0
 ; V-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv32i1_nxv16i1:
@@ -38,17 +37,16 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    vmv1r.v v0, v8
 ; ZVBB-NEXT:    vmv.v.i v10, 0
 ; ZVBB-NEXT:    li a0, 1
-; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    vmerge.vim v10, v10, 1, v0
-; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    vwsll.vi v12, v10, 8
 ; ZVBB-NEXT:    vmv1r.v v0, v9
 ; ZVBB-NEXT:    vwaddu.wx v12, v12, a0, v0.t
+; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    vmsne.vi v8, v14, 0
 ; ZVBB-NEXT:    vmsne.vi v0, v12, 0
-; ZVBB-NEXT:    add a0, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v8, a1
+; ZVBB-NEXT:    srli a0, a0, 2
+; ZVBB-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v8, a0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave_nxv32i1_nxv16i1:
@@ -61,13 +59,12 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ; ZIP-NEXT:    vmerge.vim v12, v10, 1, v0
 ; ZIP-NEXT:    vmv1r.v v0, v9
 ; ZIP-NEXT:    vmerge.vim v8, v10, 1, v0
-; ZIP-NEXT:    srli a0, a0, 2
 ; ZIP-NEXT:    ri.vzip2b.vv v10, v8, v12
 ; ZIP-NEXT:    ri.vzip2a.vv v14, v8, v12
 ; ZIP-NEXT:    vmsne.vi v8, v10, 0
 ; ZIP-NEXT:    vmsne.vi v0, v14, 0
-; ZIP-NEXT:    add a1, a0, a0
-; ZIP-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; ZIP-NEXT:    srli a0, a0, 2
+; ZIP-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZIP-NEXT:    vslideup.vx v0, v8, a0
 ; ZIP-NEXT:    ret
   %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
@@ -508,19 +505,17 @@ define <vscale x 48 x i1> @vector_interleave_nxv48i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    vsseg3e8.v v14, (a0)
 ; CHECK-NEXT:    vl2r.v v8, (a2)
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    srli a2, a1, 1
 ; CHECK-NEXT:    vl2r.v v10, (a3)
 ; CHECK-NEXT:    vl2r.v v12, (a0)
-; CHECK-NEXT:    add a0, a2, a2
+; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    vmsne.vi v14, v8, 0
 ; CHECK-NEXT:    vmsne.vi v8, v10, 0
 ; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v8, a2
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v14, a1
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v8, a1
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v14, a2
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 6
 ; CHECK-NEXT:    mul a0, a0, a1
@@ -551,19 +546,17 @@ define <vscale x 48 x i1> @vector_interleave_nxv48i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    add a2, a3, a2
 ; ZVBB-NEXT:    vsseg3e8.v v14, (a0)
 ; ZVBB-NEXT:    vl2r.v v8, (a2)
-; ZVBB-NEXT:    srli a2, a1, 2
-; ZVBB-NEXT:    srli a1, a1, 1
+; ZVBB-NEXT:    srli a2, a1, 1
 ; ZVBB-NEXT:    vl2r.v v10, (a3)
 ; ZVBB-NEXT:    vl2r.v v12, (a0)
-; ZVBB-NEXT:    add a0, a2, a2
+; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    vmsne.vi v14, v8, 0
 ; ZVBB-NEXT:    vmsne.vi v8, v10, 0
 ; ZVBB-NEXT:    vmsne.vi v0, v12, 0
-; ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v8, a2
-; ZVBB-NEXT:    add a0, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v14, a1
+; ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v8, a1
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v14, a2
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    li a1, 6
 ; ZVBB-NEXT:    mul a0, a0, a1
@@ -812,22 +805,20 @@ define <vscale x 64 x i1> @vector_interleave_nxv64i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    add a2, a4, a2
 ; CHECK-NEXT:    vsseg4e8.v v14, (a0)
 ; CHECK-NEXT:    vl2r.v v8, (a2)
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    vl2r.v v10, (a4)
-; CHECK-NEXT:    add a4, a2, a2
 ; CHECK-NEXT:    vl2r.v v12, (a3)
 ; CHECK-NEXT:    vl2r.v v14, (a0)
 ; CHECK-NEXT:    vmsne.vi v16, v8, 0
 ; CHECK-NEXT:    vmsne.vi v8, v10, 0
 ; CHECK-NEXT:    vmsne.vi v9, v12, 0
 ; CHECK-NEXT:    vmsne.vi v0, v14, 0
-; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v16, a2
-; CHECK-NEXT:    vslideup.vx v0, v9, a2
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v8, a1
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v16, a1
+; CHECK-NEXT:    vslideup.vx v0, v9, a1
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v8, a2
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -859,22 +850,20 @@ define <vscale x 64 x i1> @vector_interleave_nxv64i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    add a2, a4, a2
 ; ZVBB-NEXT:    vsseg4e8.v v14, (a0)
 ; ZVBB-NEXT:    vl2r.v v8, (a2)
-; ZVBB-NEXT:    srli a2, a1, 2
-; ZVBB-NEXT:    srli a1, a1, 1
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    vl2r.v v10, (a4)
-; ZVBB-NEXT:    add a4, a2, a2
 ; ZVBB-NEXT:    vl2r.v v12, (a3)
 ; ZVBB-NEXT:    vl2r.v v14, (a0)
 ; ZVBB-NEXT:    vmsne.vi v16, v8, 0
 ; ZVBB-NEXT:    vmsne.vi v8, v10, 0
 ; ZVBB-NEXT:    vmsne.vi v9, v12, 0
 ; ZVBB-NEXT:    vmsne.vi v0, v14, 0
-; ZVBB-NEXT:    vsetvli zero, a4, e8, mf2, ta, ma
-; ZVBB-NEXT:    vslideup.vx v8, v16, a2
-; ZVBB-NEXT:    vslideup.vx v0, v9, a2
-; ZVBB-NEXT:    add a0, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v8, a1
+; ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v16, a1
+; ZVBB-NEXT:    vslideup.vx v0, v9, a1
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v8, a2
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 3
 ; ZVBB-NEXT:    add sp, sp, a0
@@ -1114,7 +1103,7 @@ define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
 ; CHECK-NEXT:    add a2, a4, a1
-; CHECK-NEXT:    srli a3, a1, 2
+; CHECK-NEXT:    srli a3, a1, 1
 ; CHECK-NEXT:    vmv2r.v v20, v14
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
@@ -1144,11 +1133,9 @@ define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    add a5, a4, a1
 ; CHECK-NEXT:    vl1r.v v16, (a5)
 ; CHECK-NEXT:    add a5, a5, a1
-; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    vl1r.v v11, (a2)
-; CHECK-NEXT:    add a2, a3, a3
 ; CHECK-NEXT:    vl1r.v v15, (a4)
-; CHECK-NEXT:    add a4, a1, a1
 ; CHECK-NEXT:    vl1r.v v13, (a0)
 ; CHECK-NEXT:    vl1r.v v17, (a5)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
@@ -1156,11 +1143,11 @@ define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
 ; CHECK-NEXT:    vmsne.vi v8, v14, 0
 ; CHECK-NEXT:    vmsne.vi v9, v12, 0
-; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v18, a3
-; CHECK-NEXT:    vslideup.vx v9, v8, a3
-; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v9, a1
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v18, a1
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v9, a3
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmsne.vi v8, v16, 0
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1190,7 +1177,7 @@ define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    vmv1r.v v0, v8
 ; ZVBB-NEXT:    vmerge.vim v18, v12, 1, v0
 ; ZVBB-NEXT:    add a2, a4, a1
-; ZVBB-NEXT:    srli a3, a1, 2
+; ZVBB-NEXT:    srli a3, a1, 1
 ; ZVBB-NEXT:    vmv2r.v v20, v14
 ; ZVBB-NEXT:    vmv1r.v v0, v9
 ; ZVBB-NEXT:    vmerge.vim v16, v12, 1, v0
@@ -1220,11 +1207,9 @@ define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    add a5, a4, a1
 ; ZVBB-NEXT:    vl1r.v v16, (a5)
 ; ZVBB-NEXT:    add a5, a5, a1
-; ZVBB-NEXT:    srli a1, a1, 1
+; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    vl1r.v v11, (a2)
-; ZVBB-NEXT:    add a2, a3, a3
 ; ZVBB-NEXT:    vl1r.v v15, (a4)
-; ZVBB-NEXT:    add a4, a1, a1
 ; ZVBB-NEXT:    vl1r.v v13, (a0)
 ; ZVBB-NEXT:    vl1r.v v17, (a5)
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
@@ -1232,11 +1217,11 @@ define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    vmsne.vi v0, v10, 0
 ; ZVBB-NEXT:    vmsne.vi v8, v14, 0
 ; ZVBB-NEXT:    vmsne.vi v9, v12, 0
-; ZVBB-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v18, a3
-; ZVBB-NEXT:    vslideup.vx v9, v8, a3
-; ZVBB-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v9, a1
+; ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v18, a1
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v9, a3
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; ZVBB-NEXT:    vmsne.vi v8, v16, 0
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -2340,47 +2325,45 @@ define <vscale x 96 x i1> @vector_interleave_nxv96i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    vmv1r.v v17, v9
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmerge.vim v24, v20, 1, v0
-; CHECK-NEXT:    addi a5, sp, 16
+; CHECK-NEXT:    addi a4, sp, 16
 ; CHECK-NEXT:    vmv1r.v v18, v25
 ; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    vmerge.vim v26, v20, 1, v0
-; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vmv1r.v v19, v27
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmerge.vim v10, v20, 1, v0
-; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    vmv1r.v v20, v11
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vsseg6e8.v v15, (a0)
 ; CHECK-NEXT:    vmv1r.v v15, v22
-; CHECK-NEXT:    add a4, a5, a2
+; CHECK-NEXT:    add a5, a4, a1
 ; CHECK-NEXT:    vmv1r.v v16, v8
-; CHECK-NEXT:    srli a1, a2, 2
+; CHECK-NEXT:    srli a3, a1, 1
 ; CHECK-NEXT:    vmv1r.v v17, v24
-; CHECK-NEXT:    add a6, a4, a2
+; CHECK-NEXT:    add a6, a5, a1
 ; CHECK-NEXT:    vmv1r.v v18, v26
-; CHECK-NEXT:    add a7, a3, a2
+; CHECK-NEXT:    add a7, a2, a1
 ; CHECK-NEXT:    vmv1r.v v19, v10
-; CHECK-NEXT:    vsseg6e8.v v14, (a5)
+; CHECK-NEXT:    vsseg6e8.v v14, (a4)
 ; CHECK-NEXT:    vl1r.v v8, (a0)
-; CHECK-NEXT:    add a0, a6, a2
+; CHECK-NEXT:    add a0, a6, a1
 ; CHECK-NEXT:    vl1r.v v10, (a6)
-; CHECK-NEXT:    add a6, a7, a2
-; CHECK-NEXT:    vl1r.v v12, (a5)
-; CHECK-NEXT:    add a5, a0, a2
+; CHECK-NEXT:    add a6, a7, a1
+; CHECK-NEXT:    vl1r.v v12, (a4)
+; CHECK-NEXT:    add a4, a0, a1
 ; CHECK-NEXT:    vl1r.v v14, (a7)
-; CHECK-NEXT:    add a7, a6, a2
-; CHECK-NEXT:    vl1r.v v16, (a5)
-; CHECK-NEXT:    add a5, a5, a2
+; CHECK-NEXT:    add a7, a6, a1
+; CHECK-NEXT:    vl1r.v v16, (a4)
+; CHECK-NEXT:    add a4, a4, a1
 ; CHECK-NEXT:    vl1r.v v18, (a7)
-; CHECK-NEXT:    add a7, a7, a2
-; CHECK-NEXT:    srli a2, a2, 1
-; CHECK-NEXT:    vl1r.v v9, (a3)
-; CHECK-NEXT:    add a3, a1, a1
-; CHECK-NEXT:    vl1r.v v17, (a5)
-; CHECK-NEXT:    add a5, a2, a2
+; CHECK-NEXT:    add a7, a7, a1
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    vl1r.v v9, (a2)
+; CHECK-NEXT:    vl1r.v v17, (a4)
 ; CHECK-NEXT:    vl1r.v v11, (a0)
-; CHECK-NEXT:    vl1r.v v13, (a4)
+; CHECK-NEXT:    vl1r.v v13, (a5)
 ; CHECK-NEXT:    vl1r.v v19, (a7)
 ; CHECK-NEXT:    vl1r.v v15, (a6)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
@@ -2390,12 +2373,12 @@ define <vscale x 96 x i1> @vector_interleave_nxv96i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    vmsne.vi v10, v18, 0
 ; CHECK-NEXT:    vmsne.vi v8, v14, 0
-; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v20, a1
 ; CHECK-NEXT:    vslideup.vx v0, v16, a1
-; CHECK-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v9, a2
-; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v9, a3
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 12
@@ -2427,47 +2410,45 @@ define <vscale x 96 x i1> @vector_interleave_nxv96i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    vmv1r.v v17, v9
 ; ZVBB-NEXT:    vmv1r.v v0, v10
 ; ZVBB-NEXT:    vmerge.vim v24, v20, 1, v0
-; ZVBB-NEXT:    addi a5, sp, 16
+; ZVBB-NEXT:    addi a4, sp, 16
 ; ZVBB-NEXT:    vmv1r.v v18, v25
 ; ZVBB-NEXT:    vmv1r.v v0, v11
 ; ZVBB-NEXT:    vmerge.vim v26, v20, 1, v0
-; ZVBB-NEXT:    csrr a2, vlenb
+; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    vmv1r.v v19, v27
 ; ZVBB-NEXT:    vmv1r.v v0, v12
 ; ZVBB-NEXT:    vmerge.vim v10, v20, 1, v0
-; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    add a2, a0, a1
 ; ZVBB-NEXT:    vmv1r.v v20, v11
-; ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; ZVBB-NEXT:    vsseg6e8.v v15, (a0)
 ; ZVBB-NEXT:    vmv1r.v v15, v22
-; ZVBB-NEXT:    add a4, a5, a2
+; ZVBB-NEXT:    add a5, a4, a1
 ; ZVBB-NEXT:    vmv1r.v v16, v8
-; ZVBB-NEXT:    srli a1, a2, 2
+; ZVBB-NEXT:    srli a3, a1, 1
 ; ZVBB-NEXT:    vmv1r.v v17, v24
-; ZVBB-NEXT:    add a6, a4, a2
+; ZVBB-NEXT:    add a6, a5, a1
 ; ZVBB-NEXT:    vmv1r.v v18, v26
-; ZVBB-NEXT:    add a7, a3, a2
+; ZVBB-NEXT:    add a7, a2, a1
 ; ZVBB-NEXT:    vmv1r.v v19, v10
-; ZVBB-NEXT:    vsseg6e8.v v14, (a5)
+; ZVBB-NEXT:    vsseg6e8.v v14, (a4)
 ; ZVBB-NEXT:    vl1r.v v8, (a0)
-; ZVBB-NEXT:    add a0, a6, a2
+; ZVBB-NEXT:    add a0, a6, a1
 ; ZVBB-NEXT:    vl1r.v v10, (a6)
-; ZVBB-NEXT:    add a6, a7, a2
-; ZVBB-NEXT:    vl1r.v v12, (a5)
-; ZVBB-NEXT:    add a5, a0, a2
+; ZVBB-NEXT:    add a6, a7, a1
+; ZVBB-NEXT:    vl1r.v v12, (a4)
+; ZVBB-NEXT:    add a4, a0, a1
 ; ZVBB-NEXT:    vl1r.v v14, (a7)
-; ZVBB-NEXT:    add a7, a6, a2
-; ZVBB-NEXT:    vl1r.v v16, (a5)
-; ZVBB-NEXT:    add a5, a5, a2
+; ZVBB-NEXT:    add a7, a6, a1
+; ZVBB-NEXT:    vl1r.v v16, (a4)
+; ZVBB-NEXT:    add a4, a4, a1
 ; ZVBB-NEXT:    vl1r.v v18, (a7)
-; ZVBB-NEXT:    add a7, a7, a2
-; ZVBB-NEXT:    srli a2, a2, 1
-; ZVBB-NEXT:    vl1r.v v9, (a3)
-; ZVBB-NEXT:    add a3, a1, a1
-; ZVBB-NEXT:    vl1r.v v17, (a5)
-; ZVBB-NEXT:    add a5, a2, a2
+; ZVBB-NEXT:    add a7, a7, a1
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    vl1r.v v9, (a2)
+; ZVBB-NEXT:    vl1r.v v17, (a4)
 ; ZVBB-NEXT:    vl1r.v v11, (a0)
-; ZVBB-NEXT:    vl1r.v v13, (a4)
+; ZVBB-NEXT:    vl1r.v v13, (a5)
 ; ZVBB-NEXT:    vl1r.v v19, (a7)
 ; ZVBB-NEXT:    vl1r.v v15, (a6)
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
@@ -2477,12 +2458,12 @@ define <vscale x 96 x i1> @vector_interleave_nxv96i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    vmsne.vi v0, v12, 0
 ; ZVBB-NEXT:    vmsne.vi v10, v18, 0
 ; ZVBB-NEXT:    vmsne.vi v8, v14, 0
-; ZVBB-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v20, a1
 ; ZVBB-NEXT:    vslideup.vx v0, v16, a1
-; ZVBB-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v9, a2
-; ZVBB-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v9, a3
+; ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v10, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    li a1, 12
@@ -3676,23 +3657,21 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v14, 0
-; CHECK-NEXT:    addi a4, sp, 16
+; CHECK-NEXT:    addi a3, sp, 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 3
 ; CHECK-NEXT:    sub a0, a1, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vmerge.vim v16, v14, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v22, v14, 1, v0
-; CHECK-NEXT:    add a3, a4, a2
-; CHECK-NEXT:    srli a1, a2, 2
-; CHECK-NEXT:    add a5, a0, a2
+; CHECK-NEXT:    add a2, a3, a1
 ; CHECK-NEXT:    vmv4r.v v24, v16
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v18, v14, 1, v0
-; CHECK-NEXT:    add a6, a3, a2
+; CHECK-NEXT:    add a4, a2, a1
 ; CHECK-NEXT:    vmv1r.v v25, v22
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmerge.vim v8, v14, 1, v0
@@ -3704,41 +3683,41 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; CHECK-NEXT:    vmerge.vim v10, v14, 1, v0
 ; CHECK-NEXT:    vmv1r.v v28, v20
 ; CHECK-NEXT:    vmv1r.v v18, v23
-; CHECK-NEXT:    add a7, a6, a2
+; CHECK-NEXT:    add a5, a4, a1
 ; CHECK-NEXT:    vmv1r.v v29, v10
 ; CHECK-NEXT:    vmv1r.v v20, v9
 ; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    vmerge.vim v30, v14, 1, v0
 ; CHECK-NEXT:    vmv1r.v v22, v11
-; CHECK-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vsseg7e8.v v24, (a4)
+; CHECK-NEXT:    vsetvli a6, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsseg7e8.v v24, (a3)
 ; CHECK-NEXT:    vmv1r.v v23, v31
 ; CHECK-NEXT:    vsseg7e8.v v17, (a0)
-; CHECK-NEXT:    vl1r.v v8, (a6)
-; CHECK-NEXT:    add a6, a7, a2
-; CHECK-NEXT:    vl1r.v v10, (a4)
-; CHECK-NEXT:    add a4, a6, a2
-; CHECK-NEXT:    vl1r.v v12, (a6)
-; CHECK-NEXT:    add a6, a4, a2
-; CHECK-NEXT:    vl1r.v v14, (a6)
-; CHECK-NEXT:    add a6, a5, a2
-; CHECK-NEXT:    vl1r.v v16, (a5)
-; CHECK-NEXT:    add a5, a6, a2
-; CHECK-NEXT:    vl1r.v v18, (a5)
-; CHECK-NEXT:    add a5, a5, a2
-; CHECK-NEXT:    vl1r.v v9, (a7)
-; CHECK-NEXT:    add a7, a5, a2
-; CHECK-NEXT:    vl1r.v v20, (a7)
-; CHECK-NEXT:    add a7, a7, a2
-; CHECK-NEXT:    srli a2, a2, 1
-; CHECK-NEXT:    vl1r.v v11, (a3)
-; CHECK-NEXT:    add a3, a1, a1
-; CHECK-NEXT:    vl1r.v v13, (a4)
-; CHECK-NEXT:    add a4, a2, a2
+; CHECK-NEXT:    vl1r.v v8, (a4)
+; CHECK-NEXT:    add a4, a5, a1
+; CHECK-NEXT:    vl1r.v v10, (a3)
+; CHECK-NEXT:    add a6, a4, a1
+; CHECK-NEXT:    vl1r.v v12, (a4)
+; CHECK-NEXT:    add a3, a6, a1
+; CHECK-NEXT:    vl1r.v v14, (a3)
+; CHECK-NEXT:    srli a3, a1, 1
+; CHECK-NEXT:    vl1r.v v9, (a5)
+; CHECK-NEXT:    add a4, a0, a1
+; CHECK-NEXT:    vl1r.v v16, (a4)
+; CHECK-NEXT:    add a4, a4, a1
+; CHECK-NEXT:    vl1r.v v11, (a2)
+; CHECK-NEXT:    add a2, a4, a1
+; CHECK-NEXT:    vl1r.v v18, (a2)
+; CHECK-NEXT:    add a2, a2, a1
+; CHECK-NEXT:    vl1r.v v13, (a6)
+; CHECK-NEXT:    add a5, a2, a1
+; CHECK-NEXT:    vl1r.v v20, (a5)
+; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    vl1r.v v15, (a0)
-; CHECK-NEXT:    vl1r.v v19, (a5)
-; CHECK-NEXT:    vl1r.v v17, (a6)
-; CHECK-NEXT:    vl1r.v v21, (a7)
+; CHECK-NEXT:    vl1r.v v19, (a2)
+; CHECK-NEXT:    vl1r.v v17, (a4)
+; CHECK-NEXT:    vl1r.v v21, (a5)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmsne.vi v22, v8, 0
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
@@ -3747,13 +3726,13 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; CHECK-NEXT:    vmsne.vi v11, v18, 0
 ; CHECK-NEXT:    vmsne.vi v8, v16, 0
 ; CHECK-NEXT:    vmsne.vi v12, v20, 0
-; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v22, a1
 ; CHECK-NEXT:    vslideup.vx v9, v10, a1
 ; CHECK-NEXT:    vslideup.vx v8, v11, a1
-; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v9, a2
-; CHECK-NEXT:    vslideup.vx v8, v12, a2
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v9, a3
+; CHECK-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 14
 ; CHECK-NEXT:    mul a0, a0, a1
@@ -3770,23 +3749,21 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; ZVBB-NEXT:    vmv.v.i v14, 0
-; ZVBB-NEXT:    addi a4, sp, 16
+; ZVBB-NEXT:    addi a3, sp, 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a1, a0, 3
 ; ZVBB-NEXT:    sub a0, a1, a0
 ; ZVBB-NEXT:    add a0, sp, a0
 ; ZVBB-NEXT:    addi a0, a0, 16
-; ZVBB-NEXT:    csrr a2, vlenb
+; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    vmerge.vim v16, v14, 1, v0
 ; ZVBB-NEXT:    vmv1r.v v0, v8
 ; ZVBB-NEXT:    vmerge.vim v22, v14, 1, v0
-; ZVBB-NEXT:    add a3, a4, a2
-; ZVBB-NEXT:    srli a1, a2, 2
-; ZVBB-NEXT:    add a5, a0, a2
+; ZVBB-NEXT:    add a2, a3, a1
 ; ZVBB-NEXT:    vmv4r.v v24, v16
 ; ZVBB-NEXT:    vmv1r.v v0, v9
 ; ZVBB-NEXT:    vmerge.vim v18, v14, 1, v0
-; ZVBB-NEXT:    add a6, a3, a2
+; ZVBB-NEXT:    add a4, a2, a1
 ; ZVBB-NEXT:    vmv1r.v v25, v22
 ; ZVBB-NEXT:    vmv1r.v v0, v10
 ; ZVBB-NEXT:    vmerge.vim v8, v14, 1, v0
@@ -3798,41 +3775,41 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; ZVBB-NEXT:    vmerge.vim v10, v14, 1, v0
 ; ZVBB-NEXT:    vmv1r.v v28, v20
 ; ZVBB-NEXT:    vmv1r.v v18, v23
-; ZVBB-NEXT:    add a7, a6, a2
+; ZVBB-NEXT:    add a5, a4, a1
 ; ZVBB-NEXT:    vmv1r.v v29, v10
 ; ZVBB-NEXT:    vmv1r.v v20, v9
 ; ZVBB-NEXT:    vmv1r.v v0, v13
 ; ZVBB-NEXT:    vmerge.vim v30, v14, 1, v0
 ; ZVBB-NEXT:    vmv1r.v v22, v11
-; ZVBB-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; ZVBB-NEXT:    vsseg7e8.v v24, (a4)
+; ZVBB-NEXT:    vsetvli a6, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vsseg7e8.v v24, (a3)
 ; ZVBB-NEXT:    vmv1r.v v23, v31
 ; ZVBB-NEXT:    vsseg7e8.v v17, (a0)
-; ZVBB-NEXT:    vl1r.v v8, (a6)
-; ZVBB-NEXT:    add a6, a7, a2
-; ZVBB-NEXT:    vl1r.v v10, (a4)
-; ZVBB-NEXT:    add a4, a6, a2
-; ZVBB-NEXT:    vl1r.v v12, (a6)
-; ZVBB-NEXT:    add a6, a4, a2
-; ZVBB-NEXT:    vl1r.v v14, (a6)
-; ZVBB-NEXT:    add a6, a5, a2
-; ZVBB-NEXT:    vl1r.v v16, (a5)
-; ZVBB-NEXT:    add a5, a6, a2
-; ZVBB-NEXT:    vl1r.v v18, (a5)
-; ZVBB-NEXT:    add a5, a5, a2
-; ZVBB-NEXT:    vl1r.v v9, (a7)
-; ZVBB-NEXT:    add a7, a5, a2
-; ZVBB-NEXT:    vl1r.v v20, (a7)
-; ZVBB-NEXT:    add a7, a7, a2
-; ZVBB-NEXT:    srli a2, a2, 1
-; ZVBB-NEXT:    vl1r.v v11, (a3)
-; ZVBB-NEXT:    add a3, a1, a1
-; ZVBB-NEXT:    vl1r.v v13, (a4)
-; ZVBB-NEXT:    add a4, a2, a2
+; ZVBB-NEXT:    vl1r.v v8, (a4)
+; ZVBB-NEXT:    add a4, a5, a1
+; ZVBB-NEXT:    vl1r.v v10, (a3)
+; ZVBB-NEXT:    add a6, a4, a1
+; ZVBB-NEXT:    vl1r.v v12, (a4)
+; ZVBB-NEXT:    add a3, a6, a1
+; ZVBB-NEXT:    vl1r.v v14, (a3)
+; ZVBB-NEXT:    srli a3, a1, 1
+; ZVBB-NEXT:    vl1r.v v9, (a5)
+; ZVBB-NEXT:    add a4, a0, a1
+; ZVBB-NEXT:    vl1r.v v16, (a4)
+; ZVBB-NEXT:    add a4, a4, a1
+; ZVBB-NEXT:    vl1r.v v11, (a2)
+; ZVBB-NEXT:    add a2, a4, a1
+; ZVBB-NEXT:    vl1r.v v18, (a2)
+; ZVBB-NEXT:    add a2, a2, a1
+; ZVBB-NEXT:    vl1r.v v13, (a6)
+; ZVBB-NEXT:    add a5, a2, a1
+; ZVBB-NEXT:    vl1r.v v20, (a5)
+; ZVBB-NEXT:    add a5, a5, a1
+; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    vl1r.v v15, (a0)
-; ZVBB-NEXT:    vl1r.v v19, (a5)
-; ZVBB-NEXT:    vl1r.v v17, (a6)
-; ZVBB-NEXT:    vl1r.v v21, (a7)
+; ZVBB-NEXT:    vl1r.v v19, (a2)
+; ZVBB-NEXT:    vl1r.v v17, (a4)
+; ZVBB-NEXT:    vl1r.v v21, (a5)
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; ZVBB-NEXT:    vmsne.vi v22, v8, 0
 ; ZVBB-NEXT:    vmsne.vi v0, v10, 0
@@ -3841,13 +3818,13 @@ define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1
 ; ZVBB-NEXT:    vmsne.vi v11, v18, 0
 ; ZVBB-NEXT:    vmsne.vi v8, v16, 0
 ; ZVBB-NEXT:    vmsne.vi v12, v20, 0
-; ZVBB-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v0, v22, a1
 ; ZVBB-NEXT:    vslideup.vx v9, v10, a1
 ; ZVBB-NEXT:    vslideup.vx v8, v11, a1
-; ZVBB-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v9, a2
-; ZVBB-NEXT:    vslideup.vx v8, v12, a2
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v9, a3
+; ZVBB-NEXT:    vslideup.vx v8, v12, a3
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    li a1, 14
 ; ZVBB-NEXT:    mul a0, a0, a1
@@ -5569,54 +5546,52 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv16i1(<vscale x 16 x i1
 ; CHECK-NEXT:    add a6, a4, a0
 ; CHECK-NEXT:    add a7, a5, a0
 ; CHECK-NEXT:    add t0, a6, a0
-; CHECK-NEXT:    add t1, a7, a0
-; CHECK-NEXT:    add t2, t0, a0
 ; CHECK-NEXT:    vmv1r.v v20, v9
-; CHECK-NEXT:    add t3, t1, a0
+; CHECK-NEXT:    add t1, a7, a0
 ; CHECK-NEXT:    vmv1r.v v22, v11
 ; CHECK-NEXT:    vsseg8e8.v v16, (a1)
-; CHECK-NEXT:    vl1r.v v10, (t1)
-; CHECK-NEXT:    add t1, t2, a0
-; CHECK-NEXT:    vl1r.v v12, (a5)
-; CHECK-NEXT:    add a5, t3, a0
+; CHECK-NEXT:    vl1r.v v8, (a5)
+; CHECK-NEXT:    add a5, t0, a0
+; CHECK-NEXT:    vl1r.v v12, (t1)
+; CHECK-NEXT:    add t1, t1, a0
 ; CHECK-NEXT:    vl1r.v v14, (a2)
-; CHECK-NEXT:    add a2, t1, a0
+; CHECK-NEXT:    add a2, a5, a0
+; CHECK-NEXT:    vl1r.v v10, (a5)
+; CHECK-NEXT:    add a5, t1, a0
 ; CHECK-NEXT:    vl1r.v v16, (a5)
 ; CHECK-NEXT:    add a5, a5, a0
-; CHECK-NEXT:    vl1r.v v8, (a2)
-; CHECK-NEXT:    add a2, a2, a0
-; CHECK-NEXT:    vl1r.v v18, (t2)
 ; CHECK-NEXT:    vl1r.v v17, (a5)
-; CHECK-NEXT:    vl1r.v v11, (t3)
-; CHECK-NEXT:    vl1r.v v13, (a7)
+; CHECK-NEXT:    add a5, a2, a0
+; CHECK-NEXT:    vl1r.v v18, (a5)
+; CHECK-NEXT:    add a5, a5, a0
+; CHECK-NEXT:    vl1r.v v13, (t1)
+; CHECK-NEXT:    vl1r.v v9, (a7)
 ; CHECK-NEXT:    vl1r.v v15, (a3)
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmsne.vi v20, v16, 0
-; CHECK-NEXT:    vmsne.vi v16, v10, 0
-; CHECK-NEXT:    vl1r.v v10, (a6)
-; CHECK-NEXT:    vmsne.vi v17, v12, 0
+; CHECK-NEXT:    vmsne.vi v16, v12, 0
+; CHECK-NEXT:    vl1r.v v12, (a6)
+; CHECK-NEXT:    vmsne.vi v17, v8, 0
 ; CHECK-NEXT:    vmsne.vi v0, v14, 0
-; CHECK-NEXT:    vl1r.v v12, (a1)
-; CHECK-NEXT:    vl1r.v v9, (a2)
-; CHECK-NEXT:    vl1r.v v19, (t1)
-; CHECK-NEXT:    vl1r.v v11, (t0)
-; CHECK-NEXT:    vl1r.v v13, (a4)
-; CHECK-NEXT:    vmsne.vi v14, v8, 0
+; CHECK-NEXT:    vl1r.v v14, (a1)
+; CHECK-NEXT:    vl1r.v v19, (a5)
+; CHECK-NEXT:    vl1r.v v11, (a2)
+; CHECK-NEXT:    vl1r.v v13, (t0)
+; CHECK-NEXT:    vl1r.v v15, (a4)
 ; CHECK-NEXT:    vmsne.vi v9, v18, 0
-; CHECK-NEXT:    vmsne.vi v15, v10, 0
-; CHECK-NEXT:    vmsne.vi v8, v12, 0
+; CHECK-NEXT:    vmsne.vi v18, v10, 0
+; CHECK-NEXT:    vmsne.vi v10, v12, 0
+; CHECK-NEXT:    vmsne.vi v8, v14, 0
 ; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v16, v20, a1
 ; CHECK-NEXT:    vslideup.vx v0, v17, a1
-; CHECK-NEXT:    vslideup.vx v9, v14, a1
-; CHECK-NEXT:    vslideup.vx v8, v15, a1
+; CHECK-NEXT:    vslideup.vx v18, v9, a1
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v16, a0
-; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    vslideup.vx v8, v18, a0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
@@ -5670,54 +5645,52 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv16i1(<vscale x 16 x i1
 ; ZVBB-NEXT:    add a6, a4, a0
 ; ZVBB-NEXT:    add a7, a5, a0
 ; ZVBB-NEXT:    add t0, a6, a0
-; ZVBB-NEXT:    add t1, a7, a0
-; ZVBB-NEXT:    add t2, t0, a0
 ; ZVBB-NEXT:    vmv1r.v v20, v9
-; ZVBB-NEXT:    add t3, t1, a0
+; ZVBB-NEXT:    add t1, a7, a0
 ; ZVBB-NEXT:    vmv1r.v v22, v11
 ; ZVBB-NEXT:    vsseg8e8.v v16, (a1)
-; ZVBB-NEXT:    vl1r.v v10, (t1)
-; ZVBB-NEXT:    add t1, t2, a0
-; ZVBB-NEXT:    vl1r.v v12, (a5)
-; ZVBB-NEXT:    add a5, t3, a0
+; ZVBB-NEXT:    vl1r.v v8, (a5)
+; ZVBB-NEXT:    add a5, t0, a0
+; ZVBB-NEXT:    vl1r.v v12, (t1)
+; ZVBB-NEXT:    add t1, t1, a0
 ; ZVBB-NEXT:    vl1r.v v14, (a2)
-; ZVBB-NEXT:    add a2, t1, a0
+; ZVBB-NEXT:    add a2, a5, a0
+; ZVBB-NEXT:    vl1r.v v10, (a5)
+; ZVBB-NEXT:    add a5, t1, a0
 ; ZVBB-NEXT:    vl1r.v v16, (a5)
 ; ZVBB-NEXT:    add a5, a5, a0
-; ZVBB-NEXT:    vl1r.v v8, (a2)
-; ZVBB-NEXT:    add a2, a2, a0
-; ZVBB-NEXT:    vl1r.v v18, (t2)
 ; ZVBB-NEXT:    vl1r.v v17, (a5)
-; ZVBB-NEXT:    vl1r.v v11, (t3)
-; ZVBB-NEXT:    vl1r.v v13, (a7)
+; ZVBB-NEXT:    add a5, a2, a0
+; ZVBB-NEXT:    vl1r.v v18, (a5)
+; ZVBB-NEXT:    add a5, a5, a0
+; ZVBB-NEXT:    vl1r.v v13, (t1)
+; ZVBB-NEXT:    vl1r.v v9, (a7)
 ; ZVBB-NEXT:    vl1r.v v15, (a3)
 ; ZVBB-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
 ; ZVBB-NEXT:    vmsne.vi v20, v16, 0
-; ZVBB-NEXT:    vmsne.vi v16, v10, 0
-; ZVBB-NEXT:    vl1r.v v10, (a6)
-; ZVBB-NEXT:    vmsne.vi v17, v12, 0
+; ZVBB-NEXT:    vmsne.vi v16, v12, 0
+; ZVBB-NEXT:    vl1r.v v12, (a6)
+; ZVBB-NEXT:    vmsne.vi v17, v8, 0
 ; ZVBB-NEXT:    vmsne.vi v0, v14, 0
-; ZVBB-NEXT:    vl1r.v v12, (a1)
-; ZVBB-NEXT:    vl1r.v v9, (a2)
-; ZVBB-NEXT:    vl1r.v v19, (t1)
-; ZVBB-NEXT:    vl1r.v v11, (t0)
-; ZVBB-NEXT:    vl1r.v v13, (a4)
-; ZVBB-NEXT:    vmsne.vi v14, v8, 0
+; ZVBB-NEXT:    vl1r.v v14, (a1)
+; ZVBB-NEXT:    vl1r.v v19, (a5)
+; ZVBB-NEXT:    vl1r.v v11, (a2)
+; ZVBB-NEXT:    vl1r.v v13, (t0)
+; ZVBB-NEXT:    vl1r.v v15, (a4)
 ; ZVBB-NEXT:    vmsne.vi v9, v18, 0
-; ZVBB-NEXT:    vmsne.vi v15, v10, 0
-; ZVBB-NEXT:    vmsne.vi v8, v12, 0
+; ZVBB-NEXT:    vmsne.vi v18, v10, 0
+; ZVBB-NEXT:    vmsne.vi v10, v12, 0
+; ZVBB-NEXT:    vmsne.vi v8, v14, 0
 ; ZVBB-NEXT:    srli a1, a0, 2
-; ZVBB-NEXT:    add a2, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v16, v20, a1
 ; ZVBB-NEXT:    vslideup.vx v0, v17, a1
-; ZVBB-NEXT:    vslideup.vx v9, v14, a1
-; ZVBB-NEXT:    vslideup.vx v8, v15, a1
+; ZVBB-NEXT:    vslideup.vx v18, v9, a1
+; ZVBB-NEXT:    vslideup.vx v8, v10, a1
 ; ZVBB-NEXT:    srli a0, a0, 1
-; ZVBB-NEXT:    add a1, a0, a0
-; ZVBB-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v0, v16, a0
-; ZVBB-NEXT:    vslideup.vx v8, v9, a0
+; ZVBB-NEXT:    vslideup.vx v8, v18, a0
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 4
 ; ZVBB-NEXT:    add sp, sp, a0
@@ -6294,14 +6267,12 @@ define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x
 ; V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; V-NEXT:    vwaddu.vv v10, v8, v9
 ; V-NEXT:    li a0, -1
-; V-NEXT:    csrr a1, vlenb
 ; V-NEXT:    vwmaccu.vx v10, a0, v9
-; V-NEXT:    srli a1, a1, 2
-; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT:    vslidedown.vx v8, v10, a1
-; V-NEXT:    add a0, a1, a1
-; V-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; V-NEXT:    vslideup.vx v10, v8, a1
+; V-NEXT:    csrr a0, vlenb
+; V-NEXT:    srli a0, a0, 2
+; V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; V-NEXT:    vslidedown.vx v8, v10, a0
+; V-NEXT:    vslideup.vx v10, v8, a0
 ; V-NEXT:    vmv.v.v v8, v10
 ; V-NEXT:    ret
 ;
@@ -6314,8 +6285,6 @@ define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x
 ; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
-; ZVBB-NEXT:    add a1, a0, a0
-; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a0
 ; ZVBB-NEXT:    vmv.v.v v8, v10
 ; ZVBB-NEXT:    ret
@@ -6327,8 +6296,7 @@ define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x
 ; ZIP-NEXT:    ri.vzip2a.vv v10, v8, v9
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    srli a0, a0, 2
-; ZIP-NEXT:    add a1, a0, a0
-; ZIP-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; ZIP-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZIP-NEXT:    vslideup.vx v10, v11, a0
 ; ZIP-NEXT:    vmv.v.v v8, v10
 ; ZIP-NEXT:    ret
@@ -6374,14 +6342,12 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
 ; V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; V-NEXT:    vwaddu.vv v10, v8, v9
 ; V-NEXT:    li a0, -1
-; V-NEXT:    csrr a1, vlenb
 ; V-NEXT:    vwmaccu.vx v10, a0, v9
-; V-NEXT:    srli a1, a1, 2
-; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT:    vslidedown.vx v8, v10, a1
-; V-NEXT:    add a0, a1, a1
-; V-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; V-NEXT:    vslideup.vx v10, v8, a1
+; V-NEXT:    csrr a0, vlenb
+; V-NEXT:    srli a0, a0, 2
+; V-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; V-NEXT:    vslidedown.vx v8, v10, a0
+; V-NEXT:    vslideup.vx v10, v8, a0
 ; V-NEXT:    vmv.v.v v8, v10
 ; V-NEXT:    ret
 ;
@@ -6394,8 +6360,6 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
 ; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
-; ZVBB-NEXT:    add a1, a0, a0
-; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a0
 ; ZVBB-NEXT:    vmv.v.v v8, v10
 ; ZVBB-NEXT:    ret
@@ -6407,8 +6371,7 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
 ; ZIP-NEXT:    ri.vzip2a.vv v10, v8, v9
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    srli a0, a0, 2
-; ZIP-NEXT:    add a1, a0, a0
-; ZIP-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; ZIP-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZIP-NEXT:    vslideup.vx v10, v11, a0
 ; ZIP-NEXT:    vmv.v.v v8, v10
 ; ZIP-NEXT:    ret
@@ -6807,8 +6770,7 @@ define <vscale x 6 x half> @vector_interleave_nxv6f16_nxv2f16(<vscale x 2 x half
 ; CHECK-NEXT:    vle16.v v9, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
 ; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -6834,8 +6796,7 @@ define <vscale x 6 x half> @vector_interleave_nxv6f16_nxv2f16(<vscale x 2 x half
 ; ZVBB-NEXT:    vle16.v v9, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a0, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v9, a1
 ; ZVBB-NEXT:    add a2, a3, a2
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -6967,8 +6928,7 @@ define <vscale x 6 x bfloat> @vector_interleave_nxv6bf16_nxv2bf16(<vscale x 2 x
 ; CHECK-NEXT:    vle16.v v9, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
 ; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -6994,8 +6954,7 @@ define <vscale x 6 x bfloat> @vector_interleave_nxv6bf16_nxv2bf16(<vscale x 2 x
 ; ZVBB-NEXT:    vle16.v v9, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a0, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v9, a1
 ; ZVBB-NEXT:    add a2, a3, a2
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -7127,8 +7086,7 @@ define <vscale x 3 x float> @vector_interleave_nxv3f32_nxv1f32(<vscale x 1 x flo
 ; CHECK-NEXT:    vle32.v v9, (a3)
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
 ; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
@@ -7154,8 +7112,7 @@ define <vscale x 3 x float> @vector_interleave_nxv3f32_nxv1f32(<vscale x 1 x flo
 ; ZVBB-NEXT:    vle32.v v9, (a3)
 ; ZVBB-NEXT:    vle32.v v8, (a0)
 ; ZVBB-NEXT:    srli a1, a1, 3
-; ZVBB-NEXT:    add a0, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v9, a1
 ; ZVBB-NEXT:    add a2, a3, a2
 ; ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
@@ -7391,13 +7348,12 @@ define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv2f16(<vscale x 2 x half
 ; CHECK-NEXT:    vle16.v v9, (a4)
 ; CHECK-NEXT:    vle16.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
-; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v10, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
@@ -7422,13 +7378,12 @@ define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv2f16(<vscale x 2 x half
 ; ZVBB-NEXT:    vle16.v v9, (a4)
 ; ZVBB-NEXT:    vle16.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a2, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v8, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v10, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v10, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
@@ -7559,13 +7514,12 @@ define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv2bf16(<vscale x 2 x
 ; CHECK-NEXT:    vle16.v v9, (a4)
 ; CHECK-NEXT:    vle16.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
-; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v10, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
@@ -7590,13 +7544,12 @@ define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv2bf16(<vscale x 2 x
 ; ZVBB-NEXT:    vle16.v v9, (a4)
 ; ZVBB-NEXT:    vle16.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a2, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v8, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v10, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v10, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
@@ -7727,13 +7680,12 @@ define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv1f32(<vscale x 1 x flo
 ; CHECK-NEXT:    vle32.v v9, (a4)
 ; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
-; CHECK-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v10, (a3)
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
@@ -7758,13 +7710,12 @@ define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv1f32(<vscale x 1 x flo
 ; ZVBB-NEXT:    vle32.v v9, (a4)
 ; ZVBB-NEXT:    vle32.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 3
-; ZVBB-NEXT:    add a2, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v8, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vle32.v v10, (a3)
 ; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v10, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
@@ -7998,13 +7949,12 @@ define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x ha
 ; CHECK-NEXT:    vle16.v v8, (a5)
 ; CHECK-NEXT:    vle16.v v9, (a4)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a4, a1, a1
 ; CHECK-NEXT:    vle16.v v10, (a3)
-; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a3, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    add a2, a5, a2
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -8034,13 +7984,12 @@ define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x ha
 ; ZVBB-NEXT:    vle16.v v8, (a5)
 ; ZVBB-NEXT:    vle16.v v9, (a4)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a4, a1, a1
 ; ZVBB-NEXT:    vle16.v v10, (a3)
-; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a3, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v8, a1
 ; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v10, a1
 ; ZVBB-NEXT:    add a2, a5, a2
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -8466,13 +8415,12 @@ define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2
 ; CHECK-NEXT:    vle16.v v8, (a5)
 ; CHECK-NEXT:    vle16.v v9, (a4)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a4, a1, a1
 ; CHECK-NEXT:    vle16.v v10, (a3)
-; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a3, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    add a2, a5, a2
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -8502,13 +8450,12 @@ define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2
 ; ZVBB-NEXT:    vle16.v v8, (a5)
 ; ZVBB-NEXT:    vle16.v v9, (a4)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a4, a1, a1
 ; ZVBB-NEXT:    vle16.v v10, (a3)
-; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a3, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v8, a1
 ; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v10, a1
 ; ZVBB-NEXT:    add a2, a5, a2
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -8934,13 +8881,12 @@ define <vscale x 5 x float> @vector_interleave_nxv5f32_nxv1f32(<vscale x 1 x flo
 ; CHECK-NEXT:    vle32.v v8, (a5)
 ; CHECK-NEXT:    vle32.v v9, (a4)
 ; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    add a4, a1, a1
 ; CHECK-NEXT:    vle32.v v10, (a3)
-; CHECK-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    add a2, a5, a2
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
@@ -8970,13 +8916,12 @@ define <vscale x 5 x float> @vector_interleave_nxv5f32_nxv1f32(<vscale x 1 x flo
 ; ZVBB-NEXT:    vle32.v v8, (a5)
 ; ZVBB-NEXT:    vle32.v v9, (a4)
 ; ZVBB-NEXT:    srli a1, a1, 3
-; ZVBB-NEXT:    add a4, a1, a1
 ; ZVBB-NEXT:    vle32.v v10, (a3)
-; ZVBB-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v8, a1
 ; ZVBB-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v10, a1
 ; ZVBB-NEXT:    add a2, a5, a2
 ; ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
@@ -9796,18 +9741,17 @@ define <vscale x 12 x half> @vector_interleave_nxv12f16_nxv2f16(<vscale x 2 x ha
 ; CHECK-NEXT:    vle16.v v10, (a6)
 ; CHECK-NEXT:    vle16.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vle16.v v11, (a5)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v8, a1
-; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v11, a1
-; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v11, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v11, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
@@ -9836,18 +9780,17 @@ define <vscale x 12 x half> @vector_interleave_nxv12f16_nxv2f16(<vscale x 2 x ha
 ; ZVBB-NEXT:    vle16.v v10, (a6)
 ; ZVBB-NEXT:    vle16.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a2, a1, a1
 ; ZVBB-NEXT:    vle16.v v11, (a5)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a1
-; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v11, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v11, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v11, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a1, a0, 1
@@ -10311,18 +10254,17 @@ define <vscale x 12 x bfloat> @vector_interleave_nxv12bf16_nxv2bf16(<vscale x 2
 ; CHECK-NEXT:    vle16.v v10, (a6)
 ; CHECK-NEXT:    vle16.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vle16.v v11, (a5)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v8, a1
-; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v11, a1
-; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v11, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v11, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
@@ -10351,18 +10293,17 @@ define <vscale x 12 x bfloat> @vector_interleave_nxv12bf16_nxv2bf16(<vscale x 2
 ; ZVBB-NEXT:    vle16.v v10, (a6)
 ; ZVBB-NEXT:    vle16.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a2, a1, a1
 ; ZVBB-NEXT:    vle16.v v11, (a5)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a1
-; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v11, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v11, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v11, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a1, a0, 1
@@ -10826,18 +10767,17 @@ define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv1f32(<vscale x 1 x flo
 ; CHECK-NEXT:    vle32.v v10, (a6)
 ; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vle32.v v11, (a5)
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v8, a1
-; CHECK-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v11, a1
-; CHECK-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v11, (a3)
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v11, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
@@ -10866,18 +10806,17 @@ define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv1f32(<vscale x 1 x flo
 ; ZVBB-NEXT:    vle32.v v10, (a6)
 ; ZVBB-NEXT:    vle32.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 3
-; ZVBB-NEXT:    add a2, a1, a1
 ; ZVBB-NEXT:    vle32.v v11, (a5)
-; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a1
-; ZVBB-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vle32.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v11, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vle32.v v11, (a3)
 ; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v11, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a1, a0, 1
@@ -11761,7 +11700,6 @@ define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x ha
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    add a3, a0, a2
 ; CHECK-NEXT:    add a4, a3, a2
 ; CHECK-NEXT:    add a5, a4, a2
@@ -11771,20 +11709,20 @@ define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x ha
 ; CHECK-NEXT:    add a7, a6, a2
 ; CHECK-NEXT:    vle16.v v8, (a7)
 ; CHECK-NEXT:    vle16.v v10, (a6)
-; CHECK-NEXT:    add a6, a1, a1
+; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    add a2, a7, a2
 ; CHECK-NEXT:    vle16.v v12, (a5)
-; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v8, a1
 ; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v11, (a2)
 ; CHECK-NEXT:    vle16.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v12, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v12, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
@@ -11801,7 +11739,6 @@ define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x ha
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a2, a1, 1
-; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    add a3, a0, a2
 ; ZVBB-NEXT:    add a4, a3, a2
 ; ZVBB-NEXT:    add a5, a4, a2
@@ -11811,20 +11748,20 @@ define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x ha
 ; ZVBB-NEXT:    add a7, a6, a2
 ; ZVBB-NEXT:    vle16.v v8, (a7)
 ; ZVBB-NEXT:    vle16.v v10, (a6)
-; ZVBB-NEXT:    add a6, a1, a1
+; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    add a2, a7, a2
 ; ZVBB-NEXT:    vle16.v v12, (a5)
-; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a5, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a1
 ; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v11, (a2)
 ; ZVBB-NEXT:    vle16.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v12, a1
 ; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v12, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v12, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 2
@@ -12325,7 +12262,6 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    add a3, a0, a2
 ; CHECK-NEXT:    add a4, a3, a2
 ; CHECK-NEXT:    add a5, a4, a2
@@ -12335,20 +12271,20 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
 ; CHECK-NEXT:    add a7, a6, a2
 ; CHECK-NEXT:    vle16.v v8, (a7)
 ; CHECK-NEXT:    vle16.v v10, (a6)
-; CHECK-NEXT:    add a6, a1, a1
+; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    add a2, a7, a2
 ; CHECK-NEXT:    vle16.v v12, (a5)
-; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v8, a1
 ; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v11, (a2)
 ; CHECK-NEXT:    vle16.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v12, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v12, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
@@ -12365,7 +12301,6 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a2, a1, 1
-; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    add a3, a0, a2
 ; ZVBB-NEXT:    add a4, a3, a2
 ; ZVBB-NEXT:    add a5, a4, a2
@@ -12375,20 +12310,20 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
 ; ZVBB-NEXT:    add a7, a6, a2
 ; ZVBB-NEXT:    vle16.v v8, (a7)
 ; ZVBB-NEXT:    vle16.v v10, (a6)
-; ZVBB-NEXT:    add a6, a1, a1
+; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    add a2, a7, a2
 ; ZVBB-NEXT:    vle16.v v12, (a5)
-; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a5, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a1
 ; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v11, (a2)
 ; ZVBB-NEXT:    vle16.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v12, a1
 ; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v12, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v12, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 2
@@ -12889,7 +12824,6 @@ define <vscale x 7 x float> @vector_interleave_nxv7f32_nxv1f32(<vscale x 1 x flo
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    srli a1, a1, 3
 ; CHECK-NEXT:    add a3, a0, a2
 ; CHECK-NEXT:    add a4, a3, a2
 ; CHECK-NEXT:    add a5, a4, a2
@@ -12899,20 +12833,20 @@ define <vscale x 7 x float> @vector_interleave_nxv7f32_nxv1f32(<vscale x 1 x flo
 ; CHECK-NEXT:    add a7, a6, a2
 ; CHECK-NEXT:    vle32.v v8, (a7)
 ; CHECK-NEXT:    vle32.v v10, (a6)
-; CHECK-NEXT:    add a6, a1, a1
+; CHECK-NEXT:    srli a1, a1, 3
 ; CHECK-NEXT:    add a2, a7, a2
 ; CHECK-NEXT:    vle32.v v12, (a5)
-; CHECK-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v8, a1
 ; CHECK-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v11, (a2)
 ; CHECK-NEXT:    vle32.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v12, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v12, (a3)
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
@@ -12929,7 +12863,6 @@ define <vscale x 7 x float> @vector_interleave_nxv7f32_nxv1f32(<vscale x 1 x flo
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a2, a1, 1
-; ZVBB-NEXT:    srli a1, a1, 3
 ; ZVBB-NEXT:    add a3, a0, a2
 ; ZVBB-NEXT:    add a4, a3, a2
 ; ZVBB-NEXT:    add a5, a4, a2
@@ -12939,20 +12872,20 @@ define <vscale x 7 x float> @vector_interleave_nxv7f32_nxv1f32(<vscale x 1 x flo
 ; ZVBB-NEXT:    add a7, a6, a2
 ; ZVBB-NEXT:    vle32.v v8, (a7)
 ; ZVBB-NEXT:    vle32.v v10, (a6)
-; ZVBB-NEXT:    add a6, a1, a1
+; ZVBB-NEXT:    srli a1, a1, 3
 ; ZVBB-NEXT:    add a2, a7, a2
 ; ZVBB-NEXT:    vle32.v v12, (a5)
-; ZVBB-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a5, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a1
 ; ZVBB-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vle32.v v11, (a2)
 ; ZVBB-NEXT:    vle32.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v12, a1
 ; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vle32.v v12, (a3)
 ; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v12, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 2
@@ -13945,23 +13878,22 @@ define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv2f16(<vscale x 2 x ha
 ; CHECK-NEXT:    vle16.v v11, (t0)
 ; CHECK-NEXT:    vle16.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vle16.v v9, (a7)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v11, v8, a1
-; CHECK-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v10, (a6)
 ; CHECK-NEXT:    vle16.v v8, (a5)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v9, a1
-; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
-; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v12, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
@@ -13990,23 +13922,22 @@ define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv2f16(<vscale x 2 x ha
 ; ZVBB-NEXT:    vle16.v v11, (t0)
 ; ZVBB-NEXT:    vle16.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a2, a1, a1
 ; ZVBB-NEXT:    vle16.v v9, (a7)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v11, v8, a1
-; ZVBB-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v10, (a6)
 ; ZVBB-NEXT:    vle16.v v8, (a5)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v9, a1
-; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v8, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v12, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v12, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 2
@@ -14243,23 +14174,22 @@ define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv2bf16(<vscale x 2
 ; CHECK-NEXT:    vle16.v v11, (t0)
 ; CHECK-NEXT:    vle16.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vle16.v v9, (a7)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v11, v8, a1
-; CHECK-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v10, (a6)
 ; CHECK-NEXT:    vle16.v v8, (a5)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v9, a1
-; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
-; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v12, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
@@ -14288,23 +14218,22 @@ define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv2bf16(<vscale x 2
 ; ZVBB-NEXT:    vle16.v v11, (t0)
 ; ZVBB-NEXT:    vle16.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a2, a1, a1
 ; ZVBB-NEXT:    vle16.v v9, (a7)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v11, v8, a1
-; ZVBB-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v10, (a6)
 ; ZVBB-NEXT:    vle16.v v8, (a5)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v9, a1
-; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v8, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vle16.v v12, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v12, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 2
@@ -14541,23 +14470,22 @@ define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv1f32(<vscale x 1 x flo
 ; CHECK-NEXT:    vle32.v v11, (t0)
 ; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    add a2, a1, a1
 ; CHECK-NEXT:    vle32.v v9, (a7)
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v11, v8, a1
-; CHECK-NEXT:    vsetvli a7, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v10, (a6)
 ; CHECK-NEXT:    vle32.v v8, (a5)
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v9, a1
-; CHECK-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
-; CHECK-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v12, (a3)
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
@@ -14586,23 +14514,22 @@ define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv1f32(<vscale x 1 x flo
 ; ZVBB-NEXT:    vle32.v v11, (t0)
 ; ZVBB-NEXT:    vle32.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 3
-; ZVBB-NEXT:    add a2, a1, a1
 ; ZVBB-NEXT:    vle32.v v9, (a7)
-; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v11, v8, a1
-; ZVBB-NEXT:    vsetvli a7, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vle32.v v10, (a6)
 ; ZVBB-NEXT:    vle32.v v8, (a5)
-; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v9, a1
-; ZVBB-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vle32.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v8, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vle32.v v12, (a3)
 ; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v12, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
index df7af4d8b166..111fa368ac15 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
@@ -634,12 +634,11 @@ define <vscale x 32 x i1> @vfptosi_nxv32bf16_nxv32i1(<vscale x 32 x bfloat> %va)
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vfncvt.rtz.x.f.w v8, v16
 ; CHECK-NEXT:    vfncvt.rtz.x.f.w v12, v24
-; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vand.vi v12, v12, 1
 ; CHECK-NEXT:    vmsne.vi v16, v8, 0
 ; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v16, a0
 ; CHECK-NEXT:    ret
   %evec = fptosi <vscale x 32 x bfloat> %va to <vscale x 32 x i1>
@@ -656,12 +655,11 @@ define <vscale x 32 x i1> @vfptoui_nxv32bf16_nxv32i1(<vscale x 32 x bfloat> %va)
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vfncvt.rtz.xu.f.w v8, v16
 ; CHECK-NEXT:    vfncvt.rtz.xu.f.w v12, v24
-; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vand.vi v12, v12, 1
 ; CHECK-NEXT:    vmsne.vi v16, v8, 0
 ; CHECK-NEXT:    vmsne.vi v0, v12, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v16, a0
 ; CHECK-NEXT:    ret
   %evec = fptoui <vscale x 32 x bfloat> %va to <vscale x 32 x i1>
@@ -1654,12 +1652,11 @@ define <vscale x 32 x i1> @vfptosi_nxv32f16_nxv32i1(<vscale x 32 x half> %va) {
 ; ZVFHMIN-NEXT:    srli a0, a0, 2
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v16
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v12, v24
-; ZVFHMIN-NEXT:    add a1, a0, a0
 ; ZVFHMIN-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN-NEXT:    vand.vi v12, v12, 1
 ; ZVFHMIN-NEXT:    vmsne.vi v16, v8, 0
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v12, 0
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v0, v16, a0
 ; ZVFHMIN-NEXT:    ret
   %evec = fptosi <vscale x 32 x half> %va to <vscale x 32 x i1>
@@ -1684,12 +1681,11 @@ define <vscale x 32 x i1> @vfptoui_nxv32f16_nxv32i1(<vscale x 32 x half> %va) {
 ; ZVFHMIN-NEXT:    srli a0, a0, 2
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v16
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v12, v24
-; ZVFHMIN-NEXT:    add a1, a0, a0
 ; ZVFHMIN-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN-NEXT:    vand.vi v12, v12, 1
 ; ZVFHMIN-NEXT:    vmsne.vi v16, v8, 0
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v12, 0
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v0, v16, a0
 ; ZVFHMIN-NEXT:    ret
   %evec = fptoui <vscale x 32 x half> %va to <vscale x 32 x i1>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 142ee5256f9e..186815405227 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -567,38 +567,37 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract(<vscale x 2 x i1> %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmv1r.v v8, v0
-; RV32-NEXT:    slli a2, a1, 1
 ; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    li a1, -1
+; RV32-NEXT:    li a2, -1
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmerge.vim v11, v9, 1, v0
-; RV32-NEXT:    srli a3, a3, 2
 ; RV32-NEXT:    vwaddu.vv v12, v11, v11
-; RV32-NEXT:    vwmaccu.vx v12, a1, v11
+; RV32-NEXT:    vwmaccu.vx v12, a2, v11
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    srli a2, a2, 2
 ; RV32-NEXT:    vmsne.vi v0, v12, 0
-; RV32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vx v11, v12, a3
+; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vslidedown.vx v11, v12, a2
 ; RV32-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV32-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v11, 0
-; RV32-NEXT:    add a1, a3, a3
+; RV32-NEXT:    slli a3, a1, 1
 ; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
-; RV32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vx v10, v9, a3
-; RV32-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
+; RV32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v10, v9, a2
+; RV32-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
 ; RV32-NEXT:    vle32.v v10, (a0), v0.t
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vnsrl.wx v13, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v10
 ; RV32-NEXT:    vnsrl.wi v12, v10, 0
-; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    srli a3, a3, 1
 ; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a3, e32, m1, ta, ma
 ; RV32-NEXT:    vsseg2e32.v v12, (a0), v0.t
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
@@ -611,26 +610,24 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract(<vscale x 2 x i1> %
 ; RV64-NEXT:    li a2, -1
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.i v10, 0
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a4, a1, 33
-; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmerge.vim v11, v9, 1, v0
-; RV64-NEXT:    srli a3, a3, 2
 ; RV64-NEXT:    vwaddu.vv v12, v11, v11
 ; RV64-NEXT:    vwmaccu.vx v12, a2, v11
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    srli a2, a2, 2
 ; RV64-NEXT:    vmsne.vi v0, v12, 0
-; RV64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v11, v12, a3
+; RV64-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslidedown.vx v11, v12, a2
 ; RV64-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v11, 0
-; RV64-NEXT:    add a1, a3, a3
+; RV64-NEXT:    slli a3, a1, 33
 ; RV64-NEXT:    vmerge.vim v9, v9, 1, v0
-; RV64-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vx v10, v9, a3
 ; RV64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v10, v9, a2
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
-; RV64-NEXT:    srli a1, a4, 32
+; RV64-NEXT:    srli a1, a3, 32
 ; RV64-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v10, (a0), v0.t
 ; RV64-NEXT:    li a1, 32
@@ -638,9 +635,9 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract(<vscale x 2 x i1> %
 ; RV64-NEXT:    vnsrl.wx v13, v10, a1
 ; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vnsrl.wi v12, v10, 0
-; RV64-NEXT:    srli a4, a4, 33
+; RV64-NEXT:    srli a3, a3, 33
 ; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
+; RV64-NEXT:    vsetvli zero, a3, e32, m1, ta, ma
 ; RV64-NEXT:    vsseg2e32.v v12, (a0), v0.t
 ; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
@@ -807,10 +804,7 @@ define void @not_balanced_store_tree(<vscale x 1 x i32> %v0, <vscale x 2 x i32>
 ; RV32-NEXT:    srli a3, a3, 3
 ; RV32-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vx v8, v12, a3
-; RV32-NEXT:    add a4, a3, a3
-; RV32-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
 ; RV32-NEXT:    vslideup.vx v12, v8, a3
-; RV32-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vwaddu.vv v16, v12, v9
 ; RV32-NEXT:    vwmaccu.vx v16, a2, v9
 ; RV32-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
@@ -831,10 +825,7 @@ define void @not_balanced_store_tree(<vscale x 1 x i32> %v0, <vscale x 2 x i32>
 ; RV64-NEXT:    srli a3, a3, 3
 ; RV64-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vx v8, v12, a3
-; RV64-NEXT:    add a4, a3, a3
-; RV64-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
 ; RV64-NEXT:    vslideup.vx v12, v8, a3
-; RV64-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
 ; RV64-NEXT:    vwaddu.vv v16, v12, v9
 ; RV64-NEXT:    vwmaccu.vx v16, a2, v9
 ; RV64-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
@@ -858,29 +849,28 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @not_same_mask(<vscale x 2 x i1>
 ; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmv1r.v v9, v0
 ; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    slli a1, a1, 1
 ; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    li a2, -1
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmerge.vim v11, v8, 1, v0
 ; RV32-NEXT:    vmv1r.v v0, v9
 ; RV32-NEXT:    vmerge.vim v9, v8, 1, v0
-; RV32-NEXT:    srli a3, a3, 2
 ; RV32-NEXT:    vwaddu.vv v12, v9, v11
 ; RV32-NEXT:    vwmaccu.vx v12, a2, v11
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    srli a2, a2, 2
 ; RV32-NEXT:    vmsne.vi v0, v12, 0
-; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vx v9, v12, a3
+; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vslidedown.vx v9, v12, a2
 ; RV32-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v9, 0
-; RV32-NEXT:    add a2, a3, a3
+; RV32-NEXT:    slli a1, a1, 1
 ; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
-; RV32-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vx v10, v8, a3
+; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v10, v8, a2
 ; RV32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v10, 0
 ; RV32-NEXT:    vle32.v v10, (a0), v0.t
@@ -899,26 +889,24 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @not_same_mask(<vscale x 2 x i1>
 ; RV64-NEXT:    li a2, -1
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.i v10, 0
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a1, a1, 33
-; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmerge.vim v11, v8, 1, v0
 ; RV64-NEXT:    vmv1r.v v0, v9
 ; RV64-NEXT:    vmerge.vim v9, v8, 1, v0
-; RV64-NEXT:    srli a3, a3, 2
 ; RV64-NEXT:    vwaddu.vv v12, v9, v11
 ; RV64-NEXT:    vwmaccu.vx v12, a2, v11
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    srli a2, a2, 2
 ; RV64-NEXT:    vmsne.vi v0, v12, 0
-; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v9, v12, a3
+; RV64-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslidedown.vx v9, v12, a2
 ; RV64-NEXT:    vmerge.vim v10, v10, 1, v0
-; RV64-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v9, 0
-; RV64-NEXT:    add a2, a3, a3
+; RV64-NEXT:    slli a1, a1, 33
 ; RV64-NEXT:    vmerge.vim v8, v8, 1, v0
-; RV64-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vx v10, v8, a3
-; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v10, v8, a2
 ; RV64-NEXT:    vmsne.vi v0, v10, 0
 ; RV64-NEXT:    srli a1, a1, 32
 ; RV64-NEXT:    vsetvli zero, a1, e32, m2, ta, ma

From 74687180dde07312521db09c6f6454fe9d1e5662 Mon Sep 17 00:00:00 2001
From: Kirill Chibisov <contact@kchibisov.com>
Date: Wed, 18 Jun 2025 14:38:47 +0900
Subject: [PATCH 0757/1322] [mlir][emitc] Make CExpression trait into interface
 (#142771)

By defining `CExpressionInterface`, we move the side effect detection
logic from `emitc.expression` into the individual operations
implementing the interface allowing operations to gradually tune the
side effect.

It also allows checking for side effects each operation individually.
---
 .../mlir/Dialect/EmitC/IR/CMakeLists.txt      |   6 +
 mlir/include/mlir/Dialect/EmitC/IR/EmitC.h    |   2 +-
 mlir/include/mlir/Dialect/EmitC/IR/EmitC.td   | 107 +++++++++++-------
 .../mlir/Dialect/EmitC/IR/EmitCInterfaces.h   |  31 +++++
 .../mlir/Dialect/EmitC/IR/EmitCInterfaces.td  |  48 ++++++++
 .../mlir/Dialect/EmitC/IR/EmitCTraits.h       |  30 -----
 mlir/lib/Dialect/EmitC/IR/EmitC.cpp           |   6 +-
 .../EmitC/Transforms/FormExpressions.cpp      |   2 +-
 .../Dialect/EmitC/Transforms/Transforms.cpp   |   3 +-
 mlir/lib/Target/Cpp/TranslateToCpp.cpp        |   6 +-
 10 files changed, 159 insertions(+), 82 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.h
 create mode 100644 mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td
 delete mode 100644 mlir/include/mlir/Dialect/EmitC/IR/EmitCTraits.h

diff --git a/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt
index 610170f5944e..299cee76cb1b 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt
@@ -1,6 +1,12 @@
 add_mlir_dialect(EmitC emitc)
 add_mlir_doc(EmitC EmitC Dialects/ -gen-dialect-doc -dialect emitc)
 
+set(LLVM_TARGET_DEFINITIONS EmitCInterfaces.td)
+mlir_tablegen(EmitCInterfaces.h.inc -gen-op-interface-decls)
+mlir_tablegen(EmitCInterfaces.cpp.inc -gen-op-interface-defs)
+add_public_tablegen_target(MLIREmitCInterfacesIncGen)
+add_dependencies(mlir-generic-headers MLIREmitCInterfacesIncGen)
+
 set(LLVM_TARGET_DEFINITIONS EmitCAttributes.td)
 mlir_tablegen(EmitCEnums.h.inc -gen-enum-decls)
 mlir_tablegen(EmitCEnums.cpp.inc -gen-enum-defs)
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h
index 57029c64ffd0..1984ed8a7f06 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h
@@ -14,7 +14,7 @@
 #define MLIR_DIALECT_EMITC_IR_EMITC_H
 
 #include "mlir/Bytecode/BytecodeOpInterface.h"
-#include "mlir/Dialect/EmitC/IR/EmitCTraits.h"
+#include "mlir/Dialect/EmitC/IR/EmitCInterfaces.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index e53d3e45875d..9ecdb74f4d82 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -14,6 +14,7 @@
 #define MLIR_DIALECT_EMITC_IR_EMITC
 
 include "mlir/Dialect/EmitC/IR/EmitCAttributes.td"
+include "mlir/Dialect/EmitC/IR/EmitCInterfaces.td"
 include "mlir/Dialect/EmitC/IR/EmitCTypes.td"
 
 include "mlir/Interfaces/CallInterfaces.td"
@@ -35,22 +36,31 @@ class EmitC_Op<string mnemonic, list<Trait> traits = []>
 
 // Base class for unary operations.
 class EmitC_UnaryOp<string mnemonic, list<Trait> traits = []> :
-    EmitC_Op<mnemonic, traits> {
+    EmitC_Op<mnemonic, !listconcat(traits, [CExpressionInterface])> {
   let arguments = (ins EmitCType);
   let results = (outs EmitCType);
   let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)";
+
+  let extraClassDeclaration = [{
+    bool hasSideEffects() {
+      return false;
+    }
+  }];
 }
 
 // Base class for binary operations.
 class EmitC_BinaryOp<string mnemonic, list<Trait> traits = []> :
-    EmitC_Op<mnemonic, traits> {
+    EmitC_Op<mnemonic, !listconcat(traits, [CExpressionInterface])> {
   let arguments = (ins EmitCType:$lhs, EmitCType:$rhs);
   let results = (outs EmitCType);
   let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)";
-}
 
-// EmitC OpTrait
-def CExpression : NativeOpTrait<"emitc::CExpression">;
+  let extraClassDeclaration = [{
+    bool hasSideEffects() {
+      return false;
+    }
+  }];
+}
 
 // Types only used in binary arithmetic operations.
 def IntegerIndexOrOpaqueType : Type<CPred<"emitc::isIntegerIndexOrOpaqueType($_self)">,
@@ -103,7 +113,7 @@ def EmitC_FileOp
   let skipDefaultBuilders = 1;
 }
 
-def EmitC_AddOp : EmitC_BinaryOp<"add", [CExpression]> {
+def EmitC_AddOp : EmitC_BinaryOp<"add", []> {
   let summary = "Addition operation";
   let description = [{
     With the `emitc.add` operation the arithmetic operator + (addition) can
@@ -126,7 +136,7 @@ def EmitC_AddOp : EmitC_BinaryOp<"add", [CExpression]> {
   let hasVerifier = 1;
 }
 
-def EmitC_ApplyOp : EmitC_Op<"apply", [CExpression]> {
+def EmitC_ApplyOp : EmitC_Op<"apply", [CExpressionInterface]> {
   let summary = "Apply operation";
   let description = [{
     With the `emitc.apply` operation the operators & (address of) and * (contents of)
@@ -152,10 +162,17 @@ def EmitC_ApplyOp : EmitC_Op<"apply", [CExpression]> {
   let assemblyFormat = [{
     $applicableOperator `(` $operand `)` attr-dict `:` functional-type($operand, results)
   }];
+
+  let extraClassDeclaration = [{
+    bool hasSideEffects() {
+      return getApplicableOperator() == "*";
+    }
+  }];
+
   let hasVerifier = 1;
 }
 
-def EmitC_BitwiseAndOp : EmitC_BinaryOp<"bitwise_and", [CExpression]> {
+def EmitC_BitwiseAndOp : EmitC_BinaryOp<"bitwise_and", []> {
   let summary = "Bitwise and operation";
   let description = [{
     With the `emitc.bitwise_and` operation the bitwise operator & (and) can
@@ -173,8 +190,7 @@ def EmitC_BitwiseAndOp : EmitC_BinaryOp<"bitwise_and", [CExpression]> {
   }];
 }
 
-def EmitC_BitwiseLeftShiftOp : EmitC_BinaryOp<"bitwise_left_shift",
-    [CExpression]> {
+def EmitC_BitwiseLeftShiftOp : EmitC_BinaryOp<"bitwise_left_shift", []> {
   let summary = "Bitwise left shift operation";
   let description = [{
     With the `emitc.bitwise_left_shift` operation the bitwise operator <<
@@ -192,7 +208,7 @@ def EmitC_BitwiseLeftShiftOp : EmitC_BinaryOp<"bitwise_left_shift",
   }];
 }
 
-def EmitC_BitwiseNotOp : EmitC_UnaryOp<"bitwise_not", [CExpression]> {
+def EmitC_BitwiseNotOp : EmitC_UnaryOp<"bitwise_not", []> {
   let summary = "Bitwise not operation";
   let description = [{
     With the `emitc.bitwise_not` operation the bitwise operator ~ (not) can
@@ -210,7 +226,7 @@ def EmitC_BitwiseNotOp : EmitC_UnaryOp<"bitwise_not", [CExpression]> {
   }];
 }
 
-def EmitC_BitwiseOrOp : EmitC_BinaryOp<"bitwise_or", [CExpression]> {
+def EmitC_BitwiseOrOp : EmitC_BinaryOp<"bitwise_or", []> {
   let summary = "Bitwise or operation";
   let description = [{
     With the `emitc.bitwise_or` operation the bitwise operator | (or)
@@ -228,8 +244,7 @@ def EmitC_BitwiseOrOp : EmitC_BinaryOp<"bitwise_or", [CExpression]> {
   }];
 }
 
-def EmitC_BitwiseRightShiftOp : EmitC_BinaryOp<"bitwise_right_shift",
-    [CExpression]> {
+def EmitC_BitwiseRightShiftOp : EmitC_BinaryOp<"bitwise_right_shift", []> {
   let summary = "Bitwise right shift operation";
   let description = [{
     With the `emitc.bitwise_right_shift` operation the bitwise operator >>
@@ -247,7 +262,7 @@ def EmitC_BitwiseRightShiftOp : EmitC_BinaryOp<"bitwise_right_shift",
   }];
 }
 
-def EmitC_BitwiseXorOp : EmitC_BinaryOp<"bitwise_xor", [CExpression]> {
+def EmitC_BitwiseXorOp : EmitC_BinaryOp<"bitwise_xor", []> {
   let summary = "Bitwise xor operation";
   let description = [{
     With the `emitc.bitwise_xor` operation the bitwise operator ^ (xor)
@@ -265,7 +280,7 @@ def EmitC_BitwiseXorOp : EmitC_BinaryOp<"bitwise_xor", [CExpression]> {
   }];
 }
 
-def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", [CExpression]> {
+def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", [CExpressionInterface]> {
   let summary = "Opaque call operation";
   let description = [{
     The `emitc.call_opaque` operation represents a C++ function call. The callee
@@ -312,7 +327,7 @@ def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", [CExpression]> {
 }
 
 def EmitC_CastOp : EmitC_Op<"cast",
-    [CExpression,
+    [CExpressionInterface,
      DeclareOpInterfaceMethods<CastOpInterface>]> {
   let summary = "Cast operation";
   let description = [{
@@ -335,9 +350,15 @@ def EmitC_CastOp : EmitC_Op<"cast",
   let arguments = (ins EmitCType:$source);
   let results = (outs EmitCType:$dest);
   let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)";
+
+  let extraClassDeclaration = [{
+    bool hasSideEffects() {
+      return false;
+    }
+  }];
 }
 
-def EmitC_CmpOp : EmitC_BinaryOp<"cmp", [CExpression]> {
+def EmitC_CmpOp : EmitC_BinaryOp<"cmp", []> {
   let summary = "Comparison operation";
   let description = [{
     With the `emitc.cmp` operation the comparison operators ==, !=, <, <=, >, >=, <=> 
@@ -407,7 +428,7 @@ def EmitC_ConstantOp : EmitC_Op<"constant", [ConstantLike]> {
   let hasVerifier = 1;
 }
 
-def EmitC_DivOp : EmitC_BinaryOp<"div", [CExpression]> {
+def EmitC_DivOp : EmitC_BinaryOp<"div", []> {
   let summary = "Division operation";
   let description = [{
     With the `emitc.div` operation the arithmetic operator / (division) can
@@ -462,7 +483,7 @@ def EmitC_ExpressionOp : EmitC_Op<"expression",
     ```
 
     The operations allowed within expression body are EmitC operations with the
-    CExpression trait.
+    CExpressionInterface interface.
 
     When specified, the optional `do_not_inline` indicates that the expression is
     to be emitted as seen above, i.e. as the rhs of an EmitC SSA value
@@ -480,18 +501,8 @@ def EmitC_ExpressionOp : EmitC_Op<"expression",
   let extraClassDeclaration = [{
     bool hasSideEffects() {
       auto predicate = [](Operation &op) {
-        assert(op.hasTrait<OpTrait::emitc::CExpression>() && "Expected a C expression");
-        // Conservatively assume calls to read and write memory.
-        if (isa<emitc::CallOpaqueOp>(op))
-          return true;
-        // De-referencing reads modifiable memory, address-taking has no
-        // side-effect.
-        auto applyOp = dyn_cast<emitc::ApplyOp>(op);
-        if (applyOp)
-          return applyOp.getApplicableOperator() == "*";
-        // Any load operation is assumed to read from memory and thus perform
-        // a side effect.
-        return isa<emitc::LoadOp>(op);
+        assert(isa<emitc::CExpressionInterface>(op) && "Expected a C expression");
+        return cast<emitc::CExpressionInterface>(op).hasSideEffects();
       };
       return llvm::any_of(getRegion().front().without_terminator(), predicate);
     };
@@ -579,7 +590,7 @@ def EmitC_ForOp : EmitC_Op<"for",
 }
 
 def EmitC_CallOp : EmitC_Op<"call",
-    [CallOpInterface, CExpression,
+    [CallOpInterface, CExpressionInterface,
      DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
   let summary = "Call operation";
   let description = [{
@@ -649,6 +660,10 @@ def EmitC_CallOp : EmitC_Op<"call",
     void setCalleeFromCallable(CallInterfaceCallable callee) {
       (*this)->setAttr("callee", cast<SymbolRefAttr>(callee));
     }
+
+    bool hasSideEffects() {
+      return false;
+    }
   }];
 
   let assemblyFormat = [{
@@ -861,7 +876,7 @@ def EmitC_LiteralOp : EmitC_Op<"literal", [Pure]> {
   let assemblyFormat = "$value attr-dict `:` type($result)";
 }
 
-def EmitC_LogicalAndOp : EmitC_BinaryOp<"logical_and", [CExpression]> {
+def EmitC_LogicalAndOp : EmitC_BinaryOp<"logical_and", []> {
   let summary = "Logical and operation";
   let description = [{
     With the `emitc.logical_and` operation the logical operator && (and) can
@@ -882,7 +897,7 @@ def EmitC_LogicalAndOp : EmitC_BinaryOp<"logical_and", [CExpression]> {
   let assemblyFormat = "operands attr-dict `:` type(operands)";
 }
 
-def EmitC_LogicalNotOp : EmitC_UnaryOp<"logical_not", [CExpression]> {
+def EmitC_LogicalNotOp : EmitC_UnaryOp<"logical_not", []> {
   let summary = "Logical not operation";
   let description = [{
     With the `emitc.logical_not` operation the logical operator ! (negation) can
@@ -903,7 +918,7 @@ def EmitC_LogicalNotOp : EmitC_UnaryOp<"logical_not", [CExpression]> {
   let assemblyFormat = "operands attr-dict `:` type(operands)";
 }
 
-def EmitC_LogicalOrOp : EmitC_BinaryOp<"logical_or", [CExpression]> {
+def EmitC_LogicalOrOp : EmitC_BinaryOp<"logical_or", []> {
   let summary = "Logical or operation";
   let description = [{
     With the `emitc.logical_or` operation the logical operator || (inclusive or)
@@ -924,7 +939,7 @@ def EmitC_LogicalOrOp : EmitC_BinaryOp<"logical_or", [CExpression]> {
   let assemblyFormat = "operands attr-dict `:` type(operands)";
 }
 
-def EmitC_LoadOp : EmitC_Op<"load", [CExpression,
+def EmitC_LoadOp : EmitC_Op<"load", [CExpressionInterface,
   TypesMatchWith<"result type matches value type of 'operand'",
                   "operand", "result",
                   "::llvm::cast<LValueType>($_self).getValueType()">
@@ -953,7 +968,7 @@ def EmitC_LoadOp : EmitC_Op<"load", [CExpression,
   let assemblyFormat = "$operand attr-dict `:` type($operand)"; 
 }
 
-def EmitC_MulOp : EmitC_BinaryOp<"mul", [CExpression]> {
+def EmitC_MulOp : EmitC_BinaryOp<"mul", []> {
   let summary = "Multiplication operation";
   let description = [{
     With the `emitc.mul` operation the arithmetic operator * (multiplication) can
@@ -977,7 +992,7 @@ def EmitC_MulOp : EmitC_BinaryOp<"mul", [CExpression]> {
   let results = (outs FloatIntegerIndexOrOpaqueType);
 }
 
-def EmitC_RemOp : EmitC_BinaryOp<"rem", [CExpression]> {
+def EmitC_RemOp : EmitC_BinaryOp<"rem", []> {
   let summary = "Remainder operation";
   let description = [{
     With the `emitc.rem` operation the arithmetic operator % (remainder) can
@@ -999,7 +1014,7 @@ def EmitC_RemOp : EmitC_BinaryOp<"rem", [CExpression]> {
   let results = (outs IntegerIndexOrOpaqueType);
 }
 
-def EmitC_SubOp : EmitC_BinaryOp<"sub", [CExpression]> {
+def EmitC_SubOp : EmitC_BinaryOp<"sub", []> {
   let summary = "Subtraction operation";
   let description = [{
     With the `emitc.sub` operation the arithmetic operator - (subtraction) can
@@ -1069,7 +1084,7 @@ def EmitC_MemberOfPtrOp : EmitC_Op<"member_of_ptr"> {
 }
 
 def EmitC_ConditionalOp : EmitC_Op<"conditional",
-    [AllTypesMatch<["true_value", "false_value", "result"]>, CExpression]> {
+    [AllTypesMatch<["true_value", "false_value", "result"]>, CExpressionInterface]> {
   let summary = "Conditional (ternary) operation";
   let description = [{
     With the `emitc.conditional` operation the ternary conditional operator can
@@ -1096,9 +1111,15 @@ def EmitC_ConditionalOp : EmitC_Op<"conditional",
   let arguments = (ins I1:$condition, EmitCType:$true_value, EmitCType:$false_value);
   let results = (outs EmitCType:$result);
   let assemblyFormat = "operands attr-dict `:` type($result)";
+
+  let extraClassDeclaration = [{
+    bool hasSideEffects() {
+      return false;
+    }
+  }];
 }
 
-def EmitC_UnaryMinusOp : EmitC_UnaryOp<"unary_minus", [CExpression]> {
+def EmitC_UnaryMinusOp : EmitC_UnaryOp<"unary_minus", []> {
   let summary = "Unary minus operation";
   let description = [{
     With the `emitc.unary_minus` operation the unary operator - (minus) can be
@@ -1116,7 +1137,7 @@ def EmitC_UnaryMinusOp : EmitC_UnaryOp<"unary_minus", [CExpression]> {
   }];
 }
 
-def EmitC_UnaryPlusOp : EmitC_UnaryOp<"unary_plus", [CExpression]> {
+def EmitC_UnaryPlusOp : EmitC_UnaryOp<"unary_plus", []> {
   let summary = "Unary plus operation";
   let description = [{
     With the `emitc.unary_plus` operation the unary operator + (plus) can be
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.h b/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.h
new file mode 100644
index 000000000000..51efe76aceb5
--- /dev/null
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.h
@@ -0,0 +1,31 @@
+//===- EmitCInterfaces.h - EmitC interfaces definitions ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares C++ classes for some of the interfaces used in the EmitC
+// dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_EMITC_IR_EMITCINTERFACES_H
+#define MLIR_DIALECT_EMITC_IR_EMITCINTERFACES_H
+
+#include "mlir/IR/OpDefinition.h"
+
+namespace mlir {
+namespace emitc {
+//
+} // namespace emitc
+} // namespace mlir
+
+//===----------------------------------------------------------------------===//
+// EmitC Dialect Interfaces
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/EmitC/IR/EmitCInterfaces.h.inc"
+
+#endif // MLIR_DIALECT_EMITC_IR_EMITCINTERFACES_H
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td
new file mode 100644
index 000000000000..777784e56202
--- /dev/null
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td
@@ -0,0 +1,48 @@
+//===- EmitCInterfaces.td - EmitC Interfaces ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the interfaces used by EmitC.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_EMITC_IR_EMITCINTERFACES
+#define MLIR_DIALECT_EMITC_IR_EMITCINTERFACES
+
+include "mlir/IR/OpBase.td"
+
+def CExpressionInterface : OpInterface<"CExpressionInterface"> {
+  let description = [{
+    Interface to mark operations that can be part of the CExpression.
+  }];
+
+  let cppNamespace = "::mlir::emitc";
+  let methods = [
+    InterfaceMethod<[{
+      Check whether operation has side effects that may affect the expression
+      evaluation.
+
+      By default operation is marked as having side effects.
+
+      ```c++
+      class ConcreteOp ... {
+      public:
+        bool hasSideEffects() {
+          // That way we can override the default implementation.
+          return false;
+        }
+      };
+      ```
+    }],
+      "bool", "hasSideEffects", (ins), /*methodBody=*/[{}],
+       /*defaultImplementation=*/[{
+        return true;
+    }]>,
+  ];
+}
+
+#endif // MLIR_DIALECT_EMITC_IR_EMITCINTERFACES
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTraits.h b/mlir/include/mlir/Dialect/EmitC/IR/EmitCTraits.h
deleted file mode 100644
index c1602dfce4b4..000000000000
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTraits.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- EmitCTraits.h - EmitC trait definitions ------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares C++ classes for some of the traits used in the EmitC
-// dialect.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_DIALECT_EMITC_IR_EMITCTRAITS_H
-#define MLIR_DIALECT_EMITC_IR_EMITCTRAITS_H
-
-#include "mlir/IR/OpDefinition.h"
-
-namespace mlir {
-namespace OpTrait {
-namespace emitc {
-
-template <typename ConcreteType>
-class CExpression : public TraitBase<ConcreteType, CExpression> {};
-
-} // namespace emitc
-} // namespace OpTrait
-} // namespace mlir
-
-#endif // MLIR_DIALECT_EMITC_IR_EMITCTRAITS_H
diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
index f82b20712b8c..e602210c2dc6 100644
--- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/EmitC/IR/EmitC.h"
-#include "mlir/Dialect/EmitC/IR/EmitCTraits.h"
+#include "mlir/Dialect/EmitC/IR/EmitCInterfaces.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -418,7 +418,7 @@ LogicalResult ExpressionOp::verify() {
     return emitOpError("requires yielded type to match return type");
 
   for (Operation &op : region.front().without_terminator()) {
-    if (!op.hasTrait<OpTrait::emitc::CExpression>())
+    if (!isa<emitc::CExpressionInterface>(op))
       return emitOpError("contains an unsupported operation");
     if (op.getNumResults() != 1)
       return emitOpError("requires exactly one result for each operation");
@@ -1404,5 +1404,7 @@ void FileOp::build(OpBuilder &builder, OperationState &state, StringRef id) {
 // TableGen'd op method definitions
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/EmitC/IR/EmitCInterfaces.cpp.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/EmitC/IR/EmitC.cpp.inc"
diff --git a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp
index 224d68ab8b4a..2f3e2618f4d7 100644
--- a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp
+++ b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp
@@ -36,7 +36,7 @@ struct FormExpressionsPass
     // Wrap each C operator op with an expression op.
     OpBuilder builder(context);
     auto matchFun = [&](Operation *op) {
-      if (op->hasTrait<OpTrait::emitc::CExpression>() &&
+      if (isa<emitc::CExpressionInterface>(*op) &&
           !op->getParentOfType<emitc::ExpressionOp>() &&
           op->getNumResults() == 1)
         createExpression(op, builder);
diff --git a/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp b/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp
index 87350ecdceaa..a578a86b499a 100644
--- a/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp
@@ -16,8 +16,7 @@ namespace mlir {
 namespace emitc {
 
 ExpressionOp createExpression(Operation *op, OpBuilder &builder) {
-  assert(op->hasTrait<OpTrait::emitc::CExpression>() &&
-         "Expected a C expression");
+  assert(isa<emitc::CExpressionInterface>(op) && "Expected a C expression");
 
   // Create an expression yielding the value returned by op.
   assert(op->getNumResults() == 1 && "Expected exactly one result");
diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index 5abc112ab8c7..067a0470b14e 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -329,9 +329,9 @@ static bool shouldBeInlined(ExpressionOp expressionOp) {
   if (hasDeferredEmission(user))
     return false;
 
-  // Do not inline expressions used by ops with the CExpression trait. If this
-  // was intended, the user could have been merged into the expression op.
-  return !user->hasTrait<OpTrait::emitc::CExpression>();
+  // Do not inline expressions used by ops with the CExpressionInterface. If
+  // this was intended, the user could have been merged into the expression op.
+  return !isa<emitc::CExpressionInterface>(*user);
 }
 
 static LogicalResult printConstantOp(CppEmitter &emitter, Operation *operation,

From 10f29a607205c0c17ee9249a66feb63f0fdae182 Mon Sep 17 00:00:00 2001
From: Kunqiu Chen <camsyn@foxmail.com>
Date: Wed, 18 Jun 2025 14:53:33 +0800
Subject: [PATCH 0758/1322] [MSan] Fix wrong unpoison size in SignalAction
 (#144071)

MSan should unpoison the parameters of extended signal handlers.
However, MSan unpoisoned the second parameter with the wrong size
`sizeof(__sanitizer_sigaction)`, inconsistent with its real type
`siginfo_t`.

This commit fixes this issue by correcting the size to
`sizeof(__sanitizer_siginfo)`.
---
 compiler-rt/lib/msan/msan_interceptors.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp
index 76255cdb742a..f94d3cb79aa0 100644
--- a/compiler-rt/lib/msan/msan_interceptors.cpp
+++ b/compiler-rt/lib/msan/msan_interceptors.cpp
@@ -1127,7 +1127,7 @@ static void SignalAction(int signo, void *si, void *uc) {
   SignalHandlerScope signal_handler_scope;
   ScopedThreadLocalStateBackup stlsb;
   UnpoisonParam(3);
-  __msan_unpoison(si, sizeof(__sanitizer_sigaction));
+  __msan_unpoison(si, sizeof(__sanitizer_siginfo));
   __msan_unpoison(uc, ucontext_t_sz(uc));
 
   typedef void (*sigaction_cb)(int, void *, void *);

From 4d71f20b287e398f10bbff55d52bec9683ef89d2 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Wed, 18 Jun 2025 09:07:08 +0200
Subject: [PATCH 0759/1322] [GlobalISel] prevent G_UNMERGE_VALUES for vectors
 with different elements (#133335)

This commit prevents building a G_UNMERGE_VALUES instruction with
different source and destination vector elements in
`LegalizationArtifactCombiner::ArtifactValueFinder::tryCombineMergeLike()`,
e.g.:
`%1:_(<2 x s8>), %2:_(<2 x s8>) = G_UNMERGE_VALUES %0:_(<2 x s16>)`

This LLVM defect was identified via the AMD Fuzzing project.
---
 .../GlobalISel/LegalizationArtifactCombiner.h |  5 +-
 .../AMDGPU/GlobalISel/insertelement.ll        | 55 +++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 22f6a5fde546..8f560c42082f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -997,6 +997,7 @@ public:
 
       // Recognize UnmergeSrc that can be unmerged to DstTy directly.
       // Types have to be either both vector or both non-vector types.
+      // In case of vector types, the scalar elements need to match.
       // Merge-like opcodes are combined one at the time. First one creates new
       // unmerge, following should use the same unmerge (builder performs CSE).
       //
@@ -1005,7 +1006,9 @@ public:
       // %AnotherDst:_(DstTy) = G_merge_like_opcode %2:_(EltTy), %3
       //
       // %Dst:_(DstTy), %AnotherDst = G_UNMERGE_VALUES %UnmergeSrc
-      if ((DstTy.isVector() == UnmergeSrcTy.isVector()) &&
+      if (((!DstTy.isVector() && !UnmergeSrcTy.isVector()) ||
+           (DstTy.isVector() && UnmergeSrcTy.isVector() &&
+            DstTy.getScalarType() == UnmergeSrcTy.getScalarType())) &&
           (Elt0UnmergeIdx % NumMIElts == 0) &&
           getCoverTy(UnmergeSrcTy, DstTy) == UnmergeSrcTy) {
         if (!isSequenceFromUnmerge(MI, 0, Unmerge, Elt0UnmergeIdx, NumMIElts,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 8134eb3ca2af..132a89478c5f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -6506,3 +6506,58 @@ entry:
   %insert = insertelement <5 x double> %vec, double %val, i32 %idx
   ret <5 x double> %insert
 }
+
+; Found by fuzzer, reduced with llvm-reduce.
+define amdgpu_kernel void @insert_very_small_from_very_large(<32 x i16> %L3, ptr %ptr) {
+; GPRIDX-LABEL: insert_very_small_from_very_large:
+; GPRIDX:       ; %bb.0: ; %bb
+; GPRIDX-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x0
+; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x40
+; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
+; GPRIDX-NEXT:    s_lshr_b32 s2, s12, 1
+; GPRIDX-NEXT:    s_and_b32 s2, s2, 1
+; GPRIDX-NEXT:    s_lshl_b32 s2, s2, 1
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    flat_store_byte v[0:1], v2
+; GPRIDX-NEXT:    s_endpgm
+;
+; GFX10-LABEL: insert_very_small_from_very_large:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x40
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_lshr_b32 s2, s12, 1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    flat_store_byte v[0:1], v2
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: insert_very_small_from_very_large:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x40
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshr_b32 s2, s8, 1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    flat_store_b8 v[0:1], v2
+; GFX11-NEXT:    s_endpgm
+bb:
+  %a = bitcast <32 x i16> %L3 to i512
+  %b = trunc i512 %a to i8
+  %c = trunc i8 %b to i2
+  %d = bitcast i2 %c to <2 x i1>
+  %insert = insertelement <2 x i1> %d, i1 false, i32 0
+  store <2 x i1> %insert, ptr %ptr, align 1
+  ret void
+}

From 896e187a6e923b8441428f9db63c412d989fc51d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 17:00:21 +0100
Subject: [PATCH 0760/1322] [X86] combineAndMaskToShift - pull out repeated
 SDLoc(). NFC.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7f425b3d479d..335481b97d47 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -51225,7 +51225,8 @@ static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
 /// with a shift-right to eliminate loading the vector constant mask value.
-static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
+static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
+                                     SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
@@ -51255,7 +51256,6 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
       Y = Op1;
     }
     if (X && Y) {
-      SDLoc DL(N);
       SDValue Sra =
           getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
                                      VT.getScalarSizeInBits() - 1, DAG);
@@ -51278,7 +51278,6 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
   if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
     return SDValue();
 
-  SDLoc DL(N);
   unsigned ShiftVal = SplatVal.countr_one();
   SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
@@ -51845,7 +51844,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineAndNotIntoANDNP(N, DAG))
     return R;
 
-  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
+  if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
     return ShiftRight;
 
   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))

From dac94f28e696e8234ec69bbed549533ea6b00227 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 17:06:02 +0100
Subject: [PATCH 0761/1322] [X86] combineAndNotOrIntoAndNotAnd - pull out
 repeated SDLoc(). NFC.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 335481b97d47..62912f1c8d12 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -51308,13 +51308,11 @@ static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
 
 /// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
 /// This undoes the inverse fold performed in InstCombine
-static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, SelectionDAG &DAG) {
-
+static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
+                                            SelectionDAG &DAG) {
   using namespace llvm::SDPatternMatch;
   MVT VT = N->getSimpleValueType(0);
-  SDLoc DL(N);
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (!TLI.hasAndNot(SDValue(N, 0)))
+  if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
     return SDValue();
 
   SDValue X, Y, Z;
@@ -51850,7 +51848,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
     return R;
 
-  if (SDValue R = combineAndNotOrIntoAndNotAnd(N, DAG))
+  if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
     return R;
 
   // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))

From 0875bee2b10185eca40aea3b3f49eb8462522eda Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 17:14:13 +0100
Subject: [PATCH 0762/1322] [X86] combineAndNotIntoANDNP - pull out repeated
 SDLoc(). NFC.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 62912f1c8d12..1ca5fc5376f0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -50839,7 +50839,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
 }
 
 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
-static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
+                                      SelectionDAG &DAG) {
   assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
 
   MVT VT = N->getSimpleValueType(0);
@@ -50861,7 +50862,7 @@ static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
 
   X = DAG.getBitcast(VT, X);
   Y = DAG.getBitcast(VT, Y);
-  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
+  return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
 }
 
 /// Try to fold:
@@ -51839,7 +51840,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
-  if (SDValue R = combineAndNotIntoANDNP(N, DAG))
+  if (SDValue R = combineAndNotIntoANDNP(N, dl ,DAG))
     return R;
 
   if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))

From 44b715293fcad79ef4a54474627ac574a759fa5a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 18:38:40 +0100
Subject: [PATCH 0763/1322] [PhaseOrdering][X86] Copy FMUL+ADDSUB/FMADDSUB
 build vector patterns from codegen tests

As detailed on #144489 - confirm the vectorisation of scalar FMUL+ADDSUB/FMADDSUB on various targets
---
 .../Transforms/PhaseOrdering/X86/fmaddsub.ll  | 720 ++++++++++++++++++
 1 file changed, 720 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
new file mode 100644
index 000000000000..ad4452431a48
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
@@ -0,0 +1,720 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=bdver2    | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA3
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=bdver2    | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA3
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+
+; This test checks the vectorisation of FMUL+ADDSUB/FMADDSUB patterns, including cases with undef elements.
+
+; Ideally, this should reach the backend with 1 fmul, 1 fsub, 1 fadd, and 1 shuffle.
+; That may require some coordination between VectorCombine, SLP, and other passes.
+
+define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
+; CHECK-LABEL: @buildvector_mul_addsub_ps128(
+; CHECK-NEXT:    [[A:%.*]] = fmul <4 x float> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fsub <4 x float> [[A]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %A = fmul <4 x float> %C, %D
+  %A0 = extractelement <4 x float> %A, i32 0
+  %B0 = extractelement <4 x float> %B, i32 0
+  %sub0 = fsub float %A0, %B0
+  %A2 = extractelement <4 x float> %A, i32 2
+  %B2 = extractelement <4 x float> %B, i32 2
+  %sub2 = fsub float %A2, %B2
+  %A1 = extractelement <4 x float> %A, i32 1
+  %B1 = extractelement <4 x float> %B, i32 1
+  %add1 = fadd float %A1, %B1
+  %A3 = extractelement <4 x float> %A, i32 3
+  %B3 = extractelement <4 x float> %B, i32 3
+  %add3 = fadd float %A3, %B3
+  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
+  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
+  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
+  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
+  ret <4 x float> %vecinsert4
+}
+
+define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
+; CHECK-LABEL: @buildvector_mul_addsub_pd128(
+; CHECK-NEXT:    [[A:%.*]] = fmul <2 x double> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fsub <2 x double> [[A]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %A = fmul <2 x double> %C, %D
+  %A0 = extractelement <2 x double> %A, i32 0
+  %B0 = extractelement <2 x double> %B, i32 0
+  %sub0 = fsub double %A0, %B0
+  %A1 = extractelement <2 x double> %A, i32 1
+  %B1 = extractelement <2 x double> %B, i32 1
+  %add1 = fadd double %A1, %B1
+  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
+  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
+  ret <2 x double> %vecinsert2
+}
+
+define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
+; SSE2-LABEL: @buildvector_mul_addsub_ps256(
+; SSE2-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
+; SSE2-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE2-NEXT:    ret <8 x float> [[TMP4]]
+;
+; SSE4-LABEL: @buildvector_mul_addsub_ps256(
+; SSE4-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
+; SSE4-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
+; SSE4-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; SSE4-NEXT:    ret <8 x float> [[TMP2]]
+;
+; AVX-LABEL: @buildvector_mul_addsub_ps256(
+; AVX-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
+; AVX-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; AVX-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %A = fmul <8 x float> %C, %D
+  %A0 = extractelement <8 x float> %A, i32 0
+  %B0 = extractelement <8 x float> %B, i32 0
+  %sub0 = fsub float %A0, %B0
+  %A2 = extractelement <8 x float> %A, i32 2
+  %B2 = extractelement <8 x float> %B, i32 2
+  %sub2 = fsub float %A2, %B2
+  %A4 = extractelement <8 x float> %A, i32 4
+  %B4 = extractelement <8 x float> %B, i32 4
+  %sub4 = fsub float %A4, %B4
+  %A6 = extractelement <8 x float> %A, i32 6
+  %B6 = extractelement <8 x float> %B, i32 6
+  %sub6 = fsub float %A6, %B6
+  %A1 = extractelement <8 x float> %A, i32 1
+  %B1 = extractelement <8 x float> %B, i32 1
+  %add1 = fadd float %A1, %B1
+  %A3 = extractelement <8 x float> %A, i32 3
+  %B3 = extractelement <8 x float> %B, i32 3
+  %add3 = fadd float %A3, %B3
+  %A5 = extractelement <8 x float> %A, i32 5
+  %B5 = extractelement <8 x float> %B, i32 5
+  %add5 = fadd float %A5, %B5
+  %A7 = extractelement <8 x float> %A, i32 7
+  %B7 = extractelement <8 x float> %B, i32 7
+  %add7 = fadd float %A7, %B7
+  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
+  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
+  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
+  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
+  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
+  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
+  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
+  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
+  ret <8 x float> %vecinsert8
+}
+
+define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
+; CHECK-LABEL: @buildvector_mul_addsub_pd256(
+; CHECK-NEXT:    [[A:%.*]] = fmul <4 x double> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fsub <4 x double> [[A]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %A = fmul <4 x double> %C, %D
+  %A0 = extractelement <4 x double> %A, i32 0
+  %B0 = extractelement <4 x double> %B, i32 0
+  %sub0 = fsub double %A0, %B0
+  %A2 = extractelement <4 x double> %A, i32 2
+  %B2 = extractelement <4 x double> %B, i32 2
+  %sub2 = fsub double %A2, %B2
+  %A1 = extractelement <4 x double> %A, i32 1
+  %B1 = extractelement <4 x double> %B, i32 1
+  %add1 = fadd double %A1, %B1
+  %A3 = extractelement <4 x double> %A, i32 3
+  %B3 = extractelement <4 x double> %B, i32 3
+  %add3 = fadd double %A3, %B3
+  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
+  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
+  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
+  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
+  ret <4 x double> %vecinsert4
+}
+
+define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+; SSE-LABEL: @buildvector_mul_addsub_ps512(
+; SSE-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
+; SSE-NEXT:    [[TMP2:%.*]] = fsub <12 x float> [[TMP0]], [[TMP1]]
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <12 x float> [[TMP0]], [[TMP1]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <12 x float> [[TMP2]], <12 x float> [[TMP3]], <12 x i32> <i32 0, i32 13, i32 2, i32 15, i32 4, i32 5, i32 18, i32 7, i32 20, i32 9, i32 22, i32 23>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
+; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]]
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <12 x float> [[TMP4]], <12 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 12, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 13, i32 11, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP8]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[VECINSERT161:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:    ret <16 x float> [[VECINSERT161]]
+;
+; AVX-LABEL: @buildvector_mul_addsub_ps512(
+; AVX-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; AVX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
+; AVX-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[TMP0]], [[TMP1]]
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP0]], [[TMP1]]
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP3]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 5, i32 14, i32 7>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
+; AVX-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP8:%.*]] = fsub <4 x float> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; AVX-NEXT:    [[TMP11:%.*]] = fsub <2 x float> [[TMP9]], [[TMP10]]
+; AVX-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP9]], [[TMP10]]
+; AVX-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <16 x i32> <i32 0, i32 5, i32 2, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[VECINSERT141:%.*]] = shufflevector <16 x float> [[TMP13]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 12, i32 20, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP12]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[VECINSERT162:%.*]] = shufflevector <16 x float> [[VECINSERT141]], <16 x float> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; AVX-NEXT:    ret <16 x float> [[VECINSERT162]]
+;
+  %A = fmul <16 x float> %C, %D
+  %A0 = extractelement <16 x float> %A, i32 0
+  %B0 = extractelement <16 x float> %B, i32 0
+  %sub0 = fsub float %A0, %B0
+  %A2 = extractelement <16 x float> %A, i32 2
+  %B2 = extractelement <16 x float> %B, i32 2
+  %sub2 = fsub float %A2, %B2
+  %A4 = extractelement <16 x float> %A, i32 4
+  %B4 = extractelement <16 x float> %B, i32 4
+  %sub4 = fsub float %A4, %B4
+  %A6 = extractelement <16 x float> %A, i32 6
+  %B6 = extractelement <16 x float> %B, i32 6
+  %sub6 = fsub float %A6, %B6
+  %A8 = extractelement <16 x float> %A, i32 8
+  %B8 = extractelement <16 x float> %B, i32 8
+  %sub8 = fsub float %A8, %B8
+  %A10 = extractelement <16 x float> %A, i32 10
+  %B10 = extractelement <16 x float> %B, i32 10
+  %sub10 = fsub float %A10, %B10
+  %A12 = extractelement <16 x float> %A, i32 12
+  %B12 = extractelement <16 x float> %B, i32 12
+  %sub12 = fsub float %A12, %B12
+  %A14 = extractelement <16 x float> %A, i32 14
+  %B14 = extractelement <16 x float> %B, i32 14
+  %sub14 = fsub float %A14, %B14
+  %A1 = extractelement <16 x float> %A, i32 1
+  %B1 = extractelement <16 x float> %B, i32 1
+  %add1 = fadd float %A1, %B1
+  %A3 = extractelement <16 x float> %A, i32 3
+  %B3 = extractelement <16 x float> %B, i32 3
+  %add3 = fadd float %A3, %B3
+  %A5 = extractelement <16 x float> %A, i32 5
+  %B5 = extractelement <16 x float> %B, i32 5
+  %add5 = fadd float %A5, %B5
+  %A7 = extractelement <16 x float> %A, i32 7
+  %B7 = extractelement <16 x float> %B, i32 7
+  %add7 = fadd float %A7, %B7
+  %A9 = extractelement <16 x float> %A, i32 9
+  %B9 = extractelement <16 x float> %B, i32 9
+  %add9 = fadd float %A9, %B9
+  %A11 = extractelement <16 x float> %A, i32 11
+  %B11 = extractelement <16 x float> %B, i32 11
+  %add11 = fadd float %A11, %B11
+  %A13 = extractelement <16 x float> %A, i32 13
+  %B13 = extractelement <16 x float> %B, i32 13
+  %add13 = fadd float %A13, %B13
+  %A15 = extractelement <16 x float> %A, i32 15
+  %B15 = extractelement <16 x float> %B, i32 15
+  %add15 = fadd float %A15, %B15
+  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
+  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
+  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
+  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
+  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
+  ; element 5 is undef
+  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
+  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
+  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
+  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
+  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
+  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
+  ; element 12 is undef
+  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
+  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
+  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
+  ret <16 x float> %vecinsert16
+}
+
+define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+; SSE-LABEL: @buildvector_mul_addsub_pd512(
+; SSE-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP0:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]]
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <2 x i32> <i32 1, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> [[TMP5]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
+; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
+; SSE-NEXT:    [[ADD7:%.*]] = fadd double [[A7]], [[B7]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP6]], <6 x double> <double undef, double poison, double poison, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 poison>
+; SSE-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP7]], double [[ADD7]], i64 7
+; SSE-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+; AVX_FMA4-LABEL: @buildvector_mul_addsub_pd512(
+; AVX_FMA4-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX_FMA4-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA4-NEXT:    [[TMP2:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
+; AVX_FMA4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
+; AVX_FMA4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX_FMA4-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
+; AVX_FMA4-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
+; AVX_FMA4-NEXT:    [[ADD7:%.*]] = fadd double [[A7]], [[B7]]
+; AVX_FMA4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[TMP6:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX_FMA4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX_FMA4-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[VECINSERT71]], double [[ADD7]], i64 7
+; AVX_FMA4-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+; AVX_FMA3-LABEL: @buildvector_mul_addsub_pd512(
+; AVX_FMA3-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX_FMA3-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA3-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA3-NEXT:    [[TMP2:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
+; AVX_FMA3-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
+; AVX_FMA3-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX_FMA3-NEXT:    [[TMP5:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX_FMA3-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX_FMA3-NEXT:    [[TMP7:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX_FMA3-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA3-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX_FMA3-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX_FMA3-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+; AVX512-LABEL: @buildvector_mul_addsub_pd512(
+; AVX512-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
+; AVX512-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
+; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX512-NEXT:    [[TMP5:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP7:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX512-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX512-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+  %A = fmul <8 x double> %C, %D
+  %A0 = extractelement <8 x double> %A, i32 0
+  %B0 = extractelement <8 x double> %B, i32 0
+  %sub0 = fsub double %A0, %B0
+  %A2 = extractelement <8 x double> %A, i32 2
+  %B2 = extractelement <8 x double> %B, i32 2
+  %sub2 = fsub double %A2, %B2
+  %A4 = extractelement <8 x double> %A, i32 4
+  %B4 = extractelement <8 x double> %B, i32 4
+  %sub4 = fsub double %A4, %B4
+  %A6 = extractelement <8 x double> %A, i32 6
+  %B6 = extractelement <8 x double> %B, i32 6
+  %sub6 = fsub double %A6, %B6
+  %A1 = extractelement <8 x double> %A, i32 1
+  %B1 = extractelement <8 x double> %B, i32 1
+  %add1 = fadd double %A1, %B1
+  %A3 = extractelement <8 x double> %A, i32 3
+  %B3 = extractelement <8 x double> %B, i32 3
+  %add3 = fadd double %A3, %B3
+  %A7 = extractelement <8 x double> %A, i32 7
+  %B7 = extractelement <8 x double> %B, i32 7
+  %add7 = fadd double %A7, %B7
+  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
+  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
+  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
+  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
+  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
+  ; element 5 is undef
+  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
+  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
+  ret <8 x double> %vecinsert8
+}
+
+define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
+; CHECK-LABEL: @buildvector_mul_subadd_ps128(
+; CHECK-NEXT:    [[A:%.*]] = fmul <4 x float> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x float> [[A]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %A = fmul <4 x float> %C, %D
+  %A0 = extractelement <4 x float> %A, i32 0
+  %B0 = extractelement <4 x float> %B, i32 0
+  %sub0 = fadd float %A0, %B0
+  %A2 = extractelement <4 x float> %A, i32 2
+  %B2 = extractelement <4 x float> %B, i32 2
+  %sub2 = fadd float %A2, %B2
+  %A1 = extractelement <4 x float> %A, i32 1
+  %B1 = extractelement <4 x float> %B, i32 1
+  %add1 = fsub float %A1, %B1
+  %A3 = extractelement <4 x float> %A, i32 3
+  %B3 = extractelement <4 x float> %B, i32 3
+  %add3 = fsub float %A3, %B3
+  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
+  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
+  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
+  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
+  ret <4 x float> %vecinsert4
+}
+
+define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
+; CHECK-LABEL: @buildvector_mul_subadd_pd128(
+; CHECK-NEXT:    [[A:%.*]] = fmul <2 x double> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> [[A]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %A = fmul <2 x double> %C, %D
+  %A0 = extractelement <2 x double> %A, i32 0
+  %B0 = extractelement <2 x double> %B, i32 0
+  %sub0 = fadd double %A0, %B0
+  %A1 = extractelement <2 x double> %A, i32 1
+  %B1 = extractelement <2 x double> %B, i32 1
+  %add1 = fsub double %A1, %B1
+  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
+  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
+  ret <2 x double> %vecinsert2
+}
+
+define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
+; SSE2-LABEL: @buildvector_mul_subadd_ps256(
+; SSE2-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
+; SSE2-NEXT:    [[TMP0:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE2-NEXT:    ret <8 x float> [[TMP4]]
+;
+; SSE4-LABEL: @buildvector_mul_subadd_ps256(
+; SSE4-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
+; SSE4-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE4-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE4-NEXT:    ret <8 x float> [[TMP6]]
+;
+; AVX_FMA4-LABEL: @buildvector_mul_subadd_ps256(
+; AVX_FMA4-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
+; AVX_FMA4-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
+; AVX_FMA4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX_FMA4-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX_FMA4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX_FMA4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; AVX_FMA4-NEXT:    ret <8 x float> [[TMP6]]
+;
+; AVX_FMA3-LABEL: @buildvector_mul_subadd_ps256(
+; AVX_FMA3-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
+; AVX_FMA3-NEXT:    [[TMP0:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
+; AVX_FMA3-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A]], [[B]]
+; AVX_FMA3-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; AVX_FMA3-NEXT:    ret <8 x float> [[TMP2]]
+;
+; AVX512-LABEL: @buildvector_mul_subadd_ps256(
+; AVX512-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[TMP0:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; AVX512-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %A = fmul <8 x float> %C, %D
+  %A0 = extractelement <8 x float> %A, i32 0
+  %B0 = extractelement <8 x float> %B, i32 0
+  %sub0 = fadd float %A0, %B0
+  %A2 = extractelement <8 x float> %A, i32 2
+  %B2 = extractelement <8 x float> %B, i32 2
+  %sub2 = fadd float %A2, %B2
+  %A4 = extractelement <8 x float> %A, i32 4
+  %B4 = extractelement <8 x float> %B, i32 4
+  %sub4 = fadd float %A4, %B4
+  %A6 = extractelement <8 x float> %A, i32 6
+  %B6 = extractelement <8 x float> %B, i32 6
+  %sub6 = fadd float %A6, %B6
+  %A1 = extractelement <8 x float> %A, i32 1
+  %B1 = extractelement <8 x float> %B, i32 1
+  %add1 = fsub float %A1, %B1
+  %A3 = extractelement <8 x float> %A, i32 3
+  %B3 = extractelement <8 x float> %B, i32 3
+  %add3 = fsub float %A3, %B3
+  %A5 = extractelement <8 x float> %A, i32 5
+  %B5 = extractelement <8 x float> %B, i32 5
+  %add5 = fsub float %A5, %B5
+  %A7 = extractelement <8 x float> %A, i32 7
+  %B7 = extractelement <8 x float> %B, i32 7
+  %add7 = fsub float %A7, %B7
+  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
+  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
+  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
+  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
+  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
+  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
+  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
+  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
+  ret <8 x float> %vecinsert8
+}
+
+define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
+; CHECK-LABEL: @buildvector_mul_subadd_pd256(
+; CHECK-NEXT:    [[A:%.*]] = fmul <4 x double> [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x double> [[A]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %A = fmul <4 x double> %C, %D
+  %A0 = extractelement <4 x double> %A, i32 0
+  %B0 = extractelement <4 x double> %B, i32 0
+  %sub0 = fadd double %A0, %B0
+  %A2 = extractelement <4 x double> %A, i32 2
+  %B2 = extractelement <4 x double> %B, i32 2
+  %sub2 = fadd double %A2, %B2
+  %A1 = extractelement <4 x double> %A, i32 1
+  %B1 = extractelement <4 x double> %B, i32 1
+  %add1 = fsub double %A1, %B1
+  %A3 = extractelement <4 x double> %A, i32 3
+  %B3 = extractelement <4 x double> %B, i32 3
+  %add3 = fsub double %A3, %B3
+  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
+  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
+  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
+  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
+  ret <4 x double> %vecinsert4
+}
+
+define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+; SSE-LABEL: @buildvector_mul_subadd_ps512(
+; SSE-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
+; SSE-NEXT:    [[TMP2:%.*]] = fadd <12 x float> [[TMP0]], [[TMP1]]
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <12 x float> [[TMP0]], [[TMP1]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <12 x float> [[TMP2]], <12 x float> [[TMP3]], <12 x i32> <i32 0, i32 13, i32 2, i32 15, i32 4, i32 5, i32 18, i32 7, i32 20, i32 9, i32 22, i32 23>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]]
+; SSE-NEXT:    [[TMP8:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <12 x float> [[TMP4]], <12 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 12, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 13, i32 11, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP8]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[VECINSERT161:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:    ret <16 x float> [[VECINSERT161]]
+;
+; AVX-LABEL: @buildvector_mul_subadd_ps512(
+; AVX-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; AVX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
+; AVX-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[TMP0]], [[TMP1]]
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP0]], [[TMP1]]
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP3]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 5, i32 14, i32 7>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
+; AVX-NEXT:    [[TMP7:%.*]] = fsub <4 x float> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; AVX-NEXT:    [[TMP11:%.*]] = fadd <2 x float> [[TMP9]], [[TMP10]]
+; AVX-NEXT:    [[TMP12:%.*]] = fsub <2 x float> [[TMP9]], [[TMP10]]
+; AVX-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <16 x i32> <i32 0, i32 5, i32 2, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[VECINSERT141:%.*]] = shufflevector <16 x float> [[TMP13]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 12, i32 20, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP12]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[VECINSERT162:%.*]] = shufflevector <16 x float> [[VECINSERT141]], <16 x float> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; AVX-NEXT:    ret <16 x float> [[VECINSERT162]]
+;
+  %A = fmul <16 x float> %C, %D
+  %A0 = extractelement <16 x float> %A, i32 0
+  %B0 = extractelement <16 x float> %B, i32 0
+  %sub0 = fadd float %A0, %B0
+  %A2 = extractelement <16 x float> %A, i32 2
+  %B2 = extractelement <16 x float> %B, i32 2
+  %sub2 = fadd float %A2, %B2
+  %A4 = extractelement <16 x float> %A, i32 4
+  %B4 = extractelement <16 x float> %B, i32 4
+  %sub4 = fadd float %A4, %B4
+  %A6 = extractelement <16 x float> %A, i32 6
+  %B6 = extractelement <16 x float> %B, i32 6
+  %sub6 = fadd float %A6, %B6
+  %A8 = extractelement <16 x float> %A, i32 8
+  %B8 = extractelement <16 x float> %B, i32 8
+  %sub8 = fadd float %A8, %B8
+  %A10 = extractelement <16 x float> %A, i32 10
+  %B10 = extractelement <16 x float> %B, i32 10
+  %sub10 = fadd float %A10, %B10
+  %A12 = extractelement <16 x float> %A, i32 12
+  %B12 = extractelement <16 x float> %B, i32 12
+  %sub12 = fadd float %A12, %B12
+  %A14 = extractelement <16 x float> %A, i32 14
+  %B14 = extractelement <16 x float> %B, i32 14
+  %sub14 = fadd float %A14, %B14
+  %A1 = extractelement <16 x float> %A, i32 1
+  %B1 = extractelement <16 x float> %B, i32 1
+  %add1 = fsub float %A1, %B1
+  %A3 = extractelement <16 x float> %A, i32 3
+  %B3 = extractelement <16 x float> %B, i32 3
+  %add3 = fsub float %A3, %B3
+  %A5 = extractelement <16 x float> %A, i32 5
+  %B5 = extractelement <16 x float> %B, i32 5
+  %add5 = fsub float %A5, %B5
+  %A7 = extractelement <16 x float> %A, i32 7
+  %B7 = extractelement <16 x float> %B, i32 7
+  %add7 = fsub float %A7, %B7
+  %A9 = extractelement <16 x float> %A, i32 9
+  %B9 = extractelement <16 x float> %B, i32 9
+  %add9 = fsub float %A9, %B9
+  %A11 = extractelement <16 x float> %A, i32 11
+  %B11 = extractelement <16 x float> %B, i32 11
+  %add11 = fsub float %A11, %B11
+  %A13 = extractelement <16 x float> %A, i32 13
+  %B13 = extractelement <16 x float> %B, i32 13
+  %add13 = fsub float %A13, %B13
+  %A15 = extractelement <16 x float> %A, i32 15
+  %B15 = extractelement <16 x float> %B, i32 15
+  %add15 = fsub float %A15, %B15
+  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
+  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
+  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
+  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
+  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
+  ; element 5 is undef
+  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
+  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
+  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
+  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
+  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
+  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
+  ; element 12 is undef
+  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
+  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
+  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
+  ret <16 x float> %vecinsert16
+}
+
+define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+; SSE-LABEL: @buildvector_mul_subadd_pd512(
+; SSE-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP0:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[TMP2:%.*]] = fsub <8 x double> [[A]], [[B]]
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <2 x i32> <i32 1, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> [[TMP5]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
+; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
+; SSE-NEXT:    [[ADD7:%.*]] = fsub double [[A7]], [[B7]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP6]], <6 x double> <double undef, double poison, double poison, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 poison>
+; SSE-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP7]], double [[ADD7]], i64 7
+; SSE-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+; AVX_FMA4-LABEL: @buildvector_mul_subadd_pd512(
+; AVX_FMA4-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX_FMA4-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA4-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
+; AVX_FMA4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
+; AVX_FMA4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX_FMA4-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
+; AVX_FMA4-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
+; AVX_FMA4-NEXT:    [[ADD7:%.*]] = fsub double [[A7]], [[B7]]
+; AVX_FMA4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[TMP6:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX_FMA4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX_FMA4-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[VECINSERT71]], double [[ADD7]], i64 7
+; AVX_FMA4-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+; AVX_FMA3-LABEL: @buildvector_mul_subadd_pd512(
+; AVX_FMA3-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX_FMA3-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA3-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA3-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
+; AVX_FMA3-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
+; AVX_FMA3-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX_FMA3-NEXT:    [[TMP5:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX_FMA3-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX_FMA3-NEXT:    [[TMP7:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX_FMA3-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA3-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX_FMA3-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX_FMA3-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+; AVX512-LABEL: @buildvector_mul_subadd_pd512(
+; AVX512-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
+; AVX512-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
+; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX512-NEXT:    [[TMP5:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP7:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX512-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX512-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+  %A = fmul <8 x double> %C, %D
+  %A0 = extractelement <8 x double> %A, i32 0
+  %B0 = extractelement <8 x double> %B, i32 0
+  %sub0 = fadd double %A0, %B0
+  %A2 = extractelement <8 x double> %A, i32 2
+  %B2 = extractelement <8 x double> %B, i32 2
+  %sub2 = fadd double %A2, %B2
+  %A4 = extractelement <8 x double> %A, i32 4
+  %B4 = extractelement <8 x double> %B, i32 4
+  %sub4 = fadd double %A4, %B4
+  %A6 = extractelement <8 x double> %A, i32 6
+  %B6 = extractelement <8 x double> %B, i32 6
+  %sub6 = fadd double %A6, %B6
+  %A1 = extractelement <8 x double> %A, i32 1
+  %B1 = extractelement <8 x double> %B, i32 1
+  %add1 = fsub double %A1, %B1
+  %A3 = extractelement <8 x double> %A, i32 3
+  %B3 = extractelement <8 x double> %B, i32 3
+  %add3 = fsub double %A3, %B3
+  %A7 = extractelement <8 x double> %A, i32 7
+  %B7 = extractelement <8 x double> %B, i32 7
+  %add7 = fsub double %A7, %B7
+  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
+  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
+  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
+  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
+  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
+  ; element 5 is undef
+  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
+  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
+  ret <8 x double> %vecinsert8
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="true" }

From 45ea46c44636094e9fcdbbeabfd11f9d0fad5e38 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Wed, 18 Jun 2025 12:50:48 +0530
Subject: [PATCH 0764/1322] Reland [Driver] Add support for GCC installation
 detection in Baremetal toolchain (#144640)

This patch introduces enhancements to the Baremetal toolchain to support
GCC toolchain detection.
- If the --gcc-install-dir or --gcc-toolchain options are provided and
point to valid paths, the sysroot is derived from those locations.
- If not, the logic falls back to the existing sysroot inference
mechanism already present in the Baremetal toolchain.
- Support for adding include paths for the libstdc++ library has also
been added.

Additionally, the restriction to always use the integrated assembler has
been removed. With a valid GCC installation, the GNU assembler can now
be used as well.

This patch currently updates and adds tests for the ARM target only.
RISC-V-specific tests will be introduced in a later patch, once the
RISCVToolChain is fully merged into the Baremetal toolchain. At this
stage, there is no way to test the RISC-V target within this PR.

RFC:

https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524
---
 clang/docs/Toolchain.rst                      |   5 +
 .../clang/Basic/DiagnosticDriverKinds.td      |   3 +
 clang/lib/Driver/ToolChains/BareMetal.cpp     | 245 +++++++++++++-----
 clang/lib/Driver/ToolChains/BareMetal.h       |  19 +-
 .../aarch64-none-elf/include/c++/8.2.1/.keep  |   0
 .../aarch64-none-elf/lib/.keep                |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../bin/aarch64-none-elf-ld                   |   1 +
 .../lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o |   0
 .../lib/gcc/aarch64-none-elf/8.2.1/crtend.o   |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../aarch64-none-elf/lib/crtbegin.o           |   0
 .../aarch64-none-elf/lib/crtend.o             |   0
 .../bin/aarch64-none-elf-ld                   |   1 +
 .../armv6m-none-eabi/include/c++/8.2.1/.keep  |   0
 .../armv6m-none-eabi/lib/.keep                |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../bin/armv6m-none-eabi-ld                   |   1 +
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o |   0
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtend.o   |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../armv6m-none-eabi/lib/crtbegin.o           |   0
 .../armv6m-none-eabi/lib/crtend.o             |   0
 .../bin/armv6m-none-eabi-ld                   |   1 +
 clang/test/Driver/aarch64-gnutools.c          |   4 +
 clang/test/Driver/aarch64-toolchain-extra.c   |  28 ++
 clang/test/Driver/aarch64-toolchain.c         |  61 +++++
 clang/test/Driver/arm-gnutools.c              |   6 +
 clang/test/Driver/arm-toolchain-extra.c       |  29 +++
 clang/test/Driver/arm-toolchain.c             |  62 +++++
 clang/test/Driver/baremetal.cpp               |  16 ++
 clang/test/Driver/check-no-multlib-warning.c  |  10 +
 32 files changed, 423 insertions(+), 69 deletions(-)
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
 create mode 100755 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
 create mode 100755 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
 create mode 100755 clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
 create mode 100755 clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
 create mode 100644 clang/test/Driver/aarch64-gnutools.c
 create mode 100644 clang/test/Driver/aarch64-toolchain-extra.c
 create mode 100644 clang/test/Driver/aarch64-toolchain.c
 create mode 100644 clang/test/Driver/arm-gnutools.c
 create mode 100644 clang/test/Driver/arm-toolchain-extra.c
 create mode 100644 clang/test/Driver/arm-toolchain.c
 create mode 100644 clang/test/Driver/check-no-multlib-warning.c

diff --git a/clang/docs/Toolchain.rst b/clang/docs/Toolchain.rst
index 958199eb7a2e..d56b21d74c7e 100644
--- a/clang/docs/Toolchain.rst
+++ b/clang/docs/Toolchain.rst
@@ -347,3 +347,8 @@ workarounds for issues discovered in libstdc++, and these are removed
 as fixed libstdc++ becomes sufficiently old.
 
 You can instruct Clang to use libstdc++ with the ``-stdlib=libstdc++`` flag.
+
+GCC Installation
+=================
+Users can point to their GCC installation by using the ``-gcc-toolchain`` or by
+using ``-gcc-install-dir`` flag.
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 29f6480ba935..94224e103875 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -847,6 +847,9 @@ def note_drv_available_multilibs : Note<
   "available multilibs are:%0">;
 def err_drv_multilib_custom_error : Error<
   "multilib configuration error: %0">;
+def warn_drv_multilib_not_available_for_target: Warning<
+  "no multilib structure encoded for Arm, Aarch64 and PPC targets">,
+  InGroup<DiagGroup<"multilib-not-found">>;
 
 def err_drv_experimental_crel : Error<
   "-Wa,--allow-experimental-crel must be specified to use -Wa,--crel. "
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index d8168ed15feb..0fbfe6c77f34 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -31,6 +31,40 @@ using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
 
+/// Is the triple {aarch64.aarch64_be}-none-elf?
+static bool isAArch64BareMetal(const llvm::Triple &Triple) {
+  if (Triple.getArch() != llvm::Triple::aarch64 &&
+      Triple.getArch() != llvm::Triple::aarch64_be)
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+static bool isRISCVBareMetal(const llvm::Triple &Triple) {
+  if (!Triple.isRISCV())
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+/// Is the triple powerpc[64][le]-*-none-eabi?
+static bool isPPCBareMetal(const llvm::Triple &Triple) {
+  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
+         Triple.getEnvironment() == llvm::Triple::EABI;
+}
+
 static bool findRISCVMultilibs(const Driver &D,
                                const llvm::Triple &TargetTriple,
                                const ArgList &Args, DetectedMultilibs &Result) {
@@ -95,7 +129,8 @@ static bool findRISCVMultilibs(const Driver &D,
   return false;
 }
 
-static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) {
+static std::string computeClangRuntimesSysRoot(const Driver &D,
+                                               bool IncludeTriple) {
   if (!D.SysRoot.empty())
     return D.SysRoot;
 
@@ -108,58 +143,125 @@ static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) {
   return std::string(SysRootDir);
 }
 
+// Only consider the GCC toolchain based on the values provided through the
+// `--gcc-toolchain` and `--gcc-install-dir` flags. The function below returns
+// whether the GCC toolchain was initialized successfully.
+bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
+                                    const llvm::opt::ArgList &Args) {
+  if (Args.getLastArg(options::OPT_gcc_toolchain) ||
+      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
+    GCCInstallation.init(Triple, Args);
+    return GCCInstallation.isValid();
+  }
+  return false;
+}
+
+// This logic is adapted from RISCVToolChain.cpp as part of the ongoing effort
+// to merge RISCVToolChain into the Baremetal toolchain. It infers the presence
+// of a valid GCC toolchain by checking whether the `crt0.o` file exists in the
+// `bin/../<target-triple>/lib` directory.
+static bool detectGCCToolchainAdjacent(const Driver &D) {
+  SmallString<128> GCCDir;
+  llvm::sys::path::append(GCCDir, D.Dir, "..", D.getTargetTriple(),
+                          "lib/crt0.o");
+  return llvm::sys::fs::exists(GCCDir);
+}
+
+// If no sysroot is provided the driver will first attempt to infer it from the
+// values of `--gcc-install-dir` or `--gcc-toolchain`, which specify the
+// location of a GCC toolchain.
+// If neither flag is used, the sysroot defaults to either:
+//    - `bin/../<target-triple>`
+//    - `bin/../lib/clang-runtimes/<target-triple>`
+//
+// To use the `clang-runtimes` path, ensure that `../<target-triple>/lib/crt0.o`
+// does not exist relative to the driver.
+std::string BareMetal::computeSysRoot() const {
+  // Use Baremetal::sysroot if it has already been set.
+  if (!SysRoot.empty())
+    return SysRoot;
+
+  // Use the sysroot specified via the `--sysroot` command-line flag, if
+  // provided.
+  const Driver &D = getDriver();
+  if (!D.SysRoot.empty())
+    return D.SysRoot;
+
+  // Attempt to infer sysroot from a valid GCC installation.
+  // If no valid GCC installation, check for a GCC toolchain alongside Clang.
+  SmallString<128> inferredSysRoot;
+  if (IsGCCInstallationValid) {
+    llvm::sys::path::append(inferredSysRoot, GCCInstallation.getParentLibPath(),
+                            "..", GCCInstallation.getTriple().str());
+  } else if (detectGCCToolchainAdjacent(D)) {
+    // Use the triple as provided to the driver. Unlike the parsed triple
+    // this has not been normalized to always contain every field.
+    llvm::sys::path::append(inferredSysRoot, D.Dir, "..", D.getTargetTriple());
+  }
+  // If a valid sysroot was inferred and exists, use it
+  if (!inferredSysRoot.empty() && llvm::sys::fs::exists(inferredSysRoot))
+    return std::string(inferredSysRoot);
+
+  // Use the clang-runtimes path.
+  return computeClangRuntimesSysRoot(D, /*IncludeTriple*/ true);
+}
+
+static void addMultilibsFilePaths(const Driver &D, const MultilibSet &Multilibs,
+                                  const Multilib &Multilib,
+                                  StringRef InstallPath,
+                                  ToolChain::path_list &Paths) {
+  if (const auto &PathsCallback = Multilibs.filePathsCallback())
+    for (const auto &Path : PathsCallback(Multilib))
+      addPathIfExists(D, InstallPath + Path, Paths);
+}
+
+// GCC mutltilibs will only work for those targets that have their multlib
+// structure encoded into GCCInstallation. Baremetal toolchain supports ARM,
+// AArch64, RISCV and PPC and of these only RISCV have GCC multilibs hardcoded
+// in GCCInstallation.
 BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple,
                      const ArgList &Args)
-    : ToolChain(D, Triple, Args),
-      SysRoot(computeBaseSysRoot(D, /*IncludeTriple=*/true)) {
-  getProgramPaths().push_back(getDriver().Dir);
+    : Generic_ELF(D, Triple, Args) {
+  IsGCCInstallationValid = initGCCInstallation(Triple, Args);
+  std::string ComputedSysRoot = computeSysRoot();
+  if (IsGCCInstallationValid) {
+    if (!isRISCVBareMetal(Triple))
+      D.Diag(clang::diag::warn_drv_multilib_not_available_for_target);
 
-  findMultilibs(D, Triple, Args);
-  SmallString<128> SysRoot(computeSysRoot());
-  if (!SysRoot.empty()) {
-    for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRoot);
-      llvm::sys::path::append(Dir, M.osSuffix(), "lib");
-      getFilePaths().push_back(std::string(Dir));
-      getLibraryPaths().push_back(std::string(Dir));
+    Multilibs = GCCInstallation.getMultilibs();
+    SelectedMultilibs.assign({GCCInstallation.getMultilib()});
+
+    path_list &Paths = getFilePaths();
+    // Add toolchain/multilib specific file paths.
+    addMultilibsFilePaths(D, Multilibs, SelectedMultilibs.back(),
+                          GCCInstallation.getInstallPath(), Paths);
+    // Adding filepath for locating crt{begin,end}.o files.
+    Paths.push_back(GCCInstallation.getInstallPath().str());
+    // Adding filepath for locating crt0.o file.
+    Paths.push_back(ComputedSysRoot + "/lib");
+
+    ToolChain::path_list &PPaths = getProgramPaths();
+    // Multilib cross-compiler GCC installations put ld in a triple-prefixed
+    // directory off of the parent of the GCC installation.
+    PPaths.push_back(Twine(GCCInstallation.getParentLibPath() + "/../" +
+                           GCCInstallation.getTriple().str() + "/bin")
+                         .str());
+    PPaths.push_back((GCCInstallation.getParentLibPath() + "/../bin").str());
+  } else {
+    getProgramPaths().push_back(getDriver().Dir);
+    findMultilibs(D, Triple, Args);
+    const SmallString<128> SysRootDir(computeSysRoot());
+    if (!SysRootDir.empty()) {
+      for (const Multilib &M : getOrderedMultilibs()) {
+        SmallString<128> Dir(SysRootDir);
+        llvm::sys::path::append(Dir, M.osSuffix(), "lib");
+        getFilePaths().push_back(std::string(Dir));
+        getLibraryPaths().push_back(std::string(Dir));
+      }
     }
   }
 }
 
-/// Is the triple {aarch64.aarch64_be}-none-elf?
-static bool isAArch64BareMetal(const llvm::Triple &Triple) {
-  if (Triple.getArch() != llvm::Triple::aarch64 &&
-      Triple.getArch() != llvm::Triple::aarch64_be)
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-static bool isRISCVBareMetal(const llvm::Triple &Triple) {
-  if (!Triple.isRISCV())
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-/// Is the triple powerpc[64][le]-*-none-eabi?
-static bool isPPCBareMetal(const llvm::Triple &Triple) {
-  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
-         Triple.getEnvironment() == llvm::Triple::EABI;
-}
-
 static void
 findMultilibsFromYAML(const ToolChain &TC, const Driver &D,
                       StringRef MultilibPath, const ArgList &Args,
@@ -216,7 +318,7 @@ getMultilibConfigPath(const Driver &D, const llvm::Triple &Triple,
       return {};
     }
   } else {
-    MultilibPath = computeBaseSysRoot(D, /*IncludeTriple=*/false);
+    MultilibPath = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
     llvm::sys::path::append(MultilibPath, MultilibFilename);
   }
   return MultilibPath;
@@ -234,7 +336,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
   if (D.getVFS().exists(*MultilibPath)) {
     // If multilib.yaml is found, update sysroot so it doesn't use a target
     // specific suffix
-    SysRoot = computeBaseSysRoot(D, /*IncludeTriple=*/false);
+    SysRoot = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
     SmallVector<StringRef> CustomFlagMacroDefines;
     findMultilibsFromYAML(*this, D, *MultilibPath, Args, Result,
                           CustomFlagMacroDefines);
@@ -242,7 +344,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
     Multilibs = Result.Multilibs;
     MultilibMacroDefines.append(CustomFlagMacroDefines.begin(),
                                 CustomFlagMacroDefines.end());
-  } else if (isRISCVBareMetal(Triple)) {
+  } else if (isRISCVBareMetal(Triple) && !detectGCCToolchainAdjacent(D)) {
     if (findRISCVMultilibs(D, Triple, Args, Result)) {
       SelectedMultilibs = Result.SelectedMultilibs;
       Multilibs = Result.Multilibs;
@@ -263,8 +365,6 @@ Tool *BareMetal::buildStaticLibTool() const {
   return new tools::baremetal::StaticLibTool(*this);
 }
 
-std::string BareMetal::computeSysRoot() const { return SysRoot; }
-
 BareMetal::OrderedMultilibs BareMetal::getOrderedMultilibs() const {
   // Get multilibs in reverse order because they're ordered most-specific last.
   if (!SelectedMultilibs.empty())
@@ -292,10 +392,10 @@ void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   if (std::optional<std::string> Path = getStdlibIncludePath())
     addSystemInclude(DriverArgs, CC1Args, *Path);
 
-  const SmallString<128> SysRoot(computeSysRoot());
-  if (!SysRoot.empty()) {
+  const SmallString<128> SysRootDir(computeSysRoot());
+  if (!SysRootDir.empty()) {
     for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRoot);
+      SmallString<128> Dir(SysRootDir);
       llvm::sys::path::append(Dir, M.includeSuffix());
       llvm::sys::path::append(Dir, "include");
       addSystemInclude(DriverArgs, CC1Args, Dir.str());
@@ -309,6 +409,19 @@ void BareMetal::addClangTargetOptions(const ArgList &DriverArgs,
   CC1Args.push_back("-nostdsysteminc");
 }
 
+void BareMetal::addLibStdCxxIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  if (!IsGCCInstallationValid)
+    return;
+  const GCCVersion &Version = GCCInstallation.getVersion();
+  StringRef TripleStr = GCCInstallation.getTriple().str();
+  const Multilib &Multilib = GCCInstallation.getMultilib();
+  addLibStdCXXIncludePaths(computeSysRoot() + "/include/c++/" + Version.Text,
+                           TripleStr, Multilib.includeSuffix(), DriverArgs,
+                           CC1Args);
+}
+
 void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                              ArgStringList &CC1Args) const {
   if (DriverArgs.hasArg(options::OPT_nostdinc, options::OPT_nostdlibinc,
@@ -339,23 +452,23 @@ void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
   };
 
   switch (GetCXXStdlibType(DriverArgs)) {
-    case ToolChain::CST_Libcxx: {
-      SmallString<128> P(D.Dir);
-      llvm::sys::path::append(P, "..", "include");
-      AddCXXIncludePath(P);
-      break;
-    }
-    case ToolChain::CST_Libstdcxx:
-      // We only support libc++ toolchain installation.
-      break;
+  case ToolChain::CST_Libcxx: {
+    SmallString<128> P(D.Dir);
+    llvm::sys::path::append(P, "..", "include");
+    AddCXXIncludePath(P);
+    break;
+  }
+  case ToolChain::CST_Libstdcxx:
+    addLibStdCxxIncludePaths(DriverArgs, CC1Args);
+    break;
   }
 
-  std::string SysRoot(computeSysRoot());
-  if (SysRoot.empty())
+  std::string SysRootDir(computeSysRoot());
+  if (SysRootDir.empty())
     return;
 
   for (const Multilib &M : getOrderedMultilibs()) {
-    SmallString<128> Dir(SysRoot);
+    SmallString<128> Dir(SysRootDir);
     llvm::sys::path::append(Dir, M.gccSuffix());
     switch (GetCXXStdlibType(DriverArgs)) {
     case ToolChain::CST_Libcxx: {
diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h
index f6295bda0a6a..930f8584e643 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.h
+++ b/clang/lib/Driver/ToolChains/BareMetal.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 
+#include "ToolChains/Gnu.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 
@@ -19,7 +20,7 @@ namespace driver {
 
 namespace toolchains {
 
-class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
+class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
 public:
   BareMetal(const Driver &D, const llvm::Triple &Triple,
             const llvm::opt::ArgList &Args);
@@ -35,7 +36,8 @@ protected:
   Tool *buildStaticLibTool() const override;
 
 public:
-  bool useIntegratedAs() const override { return true; }
+  bool initGCCInstallation(const llvm::Triple &Triple,
+                           const llvm::opt::ArgList &Args);
   bool isBareMetal() const override { return true; }
   bool isCrossCompiling() const override { return true; }
   bool HasNativeLLVMSupport() const override { return true; }
@@ -48,9 +50,15 @@ public:
 
   StringRef getOSLibName() const override { return "baremetal"; }
 
+  UnwindTableLevel
+  getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override {
+    return UnwindTableLevel::None;
+  }
+
   RuntimeLibType GetDefaultRuntimeLibType() const override {
     return ToolChain::RLT_CompilerRT;
   }
+
   CXXStdlibType GetDefaultCXXStdlibType() const override {
     return ToolChain::CST_Libcxx;
   }
@@ -67,6 +75,9 @@ public:
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
+  void
+  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                           llvm::opt::ArgStringList &CC1Args) const override;
   std::string computeSysRoot() const override;
   SanitizerMask getSupportedSanitizers() const override;
 
@@ -80,6 +91,8 @@ private:
 
   std::string SysRoot;
 
+  bool IsGCCInstallationValid;
+
   SmallVector<std::string> MultilibMacroDefines;
 };
 
@@ -104,7 +117,7 @@ public:
 
 class LLVM_LIBRARY_VISIBILITY Linker final : public Tool {
 public:
-  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "ld.lld", TC) {}
+  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "linker", TC) {}
   bool isLinkJob() const override { return true; }
   bool hasIntegratedCPP() const override { return false; }
   void ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/aarch64-gnutools.c b/clang/test/Driver/aarch64-gnutools.c
new file mode 100644
index 000000000000..0214639ed380
--- /dev/null
+++ b/clang/test/Driver/aarch64-gnutools.c
@@ -0,0 +1,4 @@
+// RUN: %clang --target=aarch64-none-elf  --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -fno-integrated-as %s -### -c \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
new file mode 100644
index 000000000000..2610e962bd69
--- /dev/null
+++ b/clang/test/Driver/aarch64-toolchain-extra.c
@@ -0,0 +1,28 @@
+// A basic clang -cc1 command-line, and simple environment check.
+
+// The tests here are similar to those in aarch64-toolchain.c, however
+// these tests need to create symlinks to test directory trees in order to
+// set up the environment and therefore shell support is required.
+// REQUIRES: shell
+// UNSUPPORTED: system-windows
+
+// If there is no GCC install detected then the driver searches for executables
+// and runtime starting from the directory tree above the driver itself.
+// The test below checks that the driver correctly finds the linker and
+// runtime if and only if they exist.
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/aarch64-nogcc/bin
+// RUN: ln -s %clang %t/aarch64-nogcc/bin/clang
+// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf %t/aarch64-nogcc/aarch64-none-elf
+// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --gcc-toolchain=%t/aarch64-nogcc/invalid \
+// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --sysroot=%t/aarch64-nogcc/bin/../aarch64-none-elf \
+// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
new file mode 100644
index 000000000000..7f2c01d928e4
--- /dev/null
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -0,0 +1,61 @@
+// UNSUPPORTED: system-windows
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL %s
+
+// C-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
+// C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
+// C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOSYSROOT %s
+
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL %s
+
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/backward"
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT %s
+
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/backward"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-LIBCXX %s
+
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX %s
+
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
diff --git a/clang/test/Driver/arm-gnutools.c b/clang/test/Driver/arm-gnutools.c
new file mode 100644
index 000000000000..6e107f19dabc
--- /dev/null
+++ b/clang/test/Driver/arm-gnutools.c
@@ -0,0 +1,6 @@
+// check that gnu assembler is invoked with arm baremetal as well
+
+// RUN: %clang --target=armv6m-none-eabi  --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -fno-integrated-as %s -### -c \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
new file mode 100644
index 000000000000..114de0a8154a
--- /dev/null
+++ b/clang/test/Driver/arm-toolchain-extra.c
@@ -0,0 +1,29 @@
+// A basic clang -cc1 command-line, and simple environment check.
+
+// The tests here are similar to those in arm-toolchain.c, however
+// these tests need to create symlinks to test directory trees in order to
+// set up the environment and therefore shell support is required.
+// REQUIRES: shell
+// UNSUPPORTED: system-windows
+
+// If there is no GCC install detected then the driver searches for executables
+// and runtime starting from the directory tree above the driver itself.
+// The test below checks that the driver correctly finds the linker and
+// runtime if and only if they exist.
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/arm-nogcc/bin
+// RUN: ln -s %clang %t/arm-nogcc/bin/clang
+// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi %t/arm-nogcc/armv6m-none-eabi
+// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --gcc-toolchain=%t/arm-nogcc/invalid \
+// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --sysroot=%t/arm-nogcc/bin/../armv6m-none-eabi \
+// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/include"
+
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
new file mode 100644
index 000000000000..2e38461fb7a3
--- /dev/null
+++ b/clang/test/Driver/arm-toolchain.c
@@ -0,0 +1,62 @@
+// UNSUPPORTED: system-windows
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL %s
+
+// C-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
+// C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL-NOSYSROOT %s
+
+// C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
+// C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL %s
+
+// CXX-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/backward"
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1" 
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT %s
+
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/backward"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-LIBCXX %s
+
+// CXX-ARM-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
+// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX %s
+
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index a80aa9b43711..2ac83402dda3 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -196,6 +196,22 @@
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 
+// RUN: %clang -no-canonical-prefixes %s -### --target=riscv32-unknown-elf 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-RISCV32-NO-HOST-INC %s
+// CHECK-RISCV32-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
+// CHECK-RISCV32-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
+// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
+// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
+// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
+
+// RUN: %clang -no-canonical-prefixes %s -### --target=riscv64-unknown-elf 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-RISCV64-NO-HOST-INC %s
+// CHECK-RISCV64-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
+// CHECK-RISCV64-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
+// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
+// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
+// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
+
 // RUN: %clang %s -### --target=riscv64-unknown-elf -o %t.out -L some/directory/user/asked/for \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64 %s
diff --git a/clang/test/Driver/check-no-multlib-warning.c b/clang/test/Driver/check-no-multlib-warning.c
new file mode 100644
index 000000000000..9a0d7cee450a
--- /dev/null
+++ b/clang/test/Driver/check-no-multlib-warning.c
@@ -0,0 +1,10 @@
+// UNSUPPORTED: system-windows
+
+
+// RUN: %clang --target=armv6m-none-eabi --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64-none-elf --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv32-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv32_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
+// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv64_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
+
+// CHECK: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets
+// NOCHECK-NOT: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets

From e07b1b26c38ba48af247b370a29eeb9879cefc97 Mon Sep 17 00:00:00 2001
From: Rajveer Singh Bharadwaj <rajveer.developer@icloud.com>
Date: Wed, 18 Jun 2025 12:59:27 +0530
Subject: [PATCH 0765/1322] [DAG] Implement SDPatternMatch `m_Abs()` matcher
 (#144512)

---
 llvm/include/llvm/CodeGen/SDPatternMatch.h           |  4 ++++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp        | 12 +++---------
 .../CodeGen/SelectionDAGPatternMatchTest.cpp         |  4 ++++
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 2e3807a2dfff..d413227c4d96 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -938,6 +938,10 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_Trunc(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::TRUNCATE, Op);
 }
 
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_Abs(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::ABS, Op);
+}
+
 /// Match a zext or identity
 /// Allows to peek through optional extensions
 template <typename Opnd> inline auto m_ZExtOrSelf(const Opnd &Op) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d14615dcbc5e..934199e414c7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11260,19 +11260,13 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N, const SDLoc &DL) {
   if (N->getOpcode() == ISD::TRUNCATE)
     N = N->getOperand(0).getNode();
 
-  if (N->getOpcode() != ISD::ABS)
-    return SDValue();
-
   EVT VT = N->getValueType(0);
-  SDValue AbsOp1 = N->getOperand(0);
   SDValue Op0, Op1;
 
-  if (AbsOp1.getOpcode() != ISD::SUB)
+  if (!sd_match(N, m_Abs(m_Sub(m_Value(Op0), m_Value(Op1)))))
     return SDValue();
 
-  Op0 = AbsOp1.getOperand(0);
-  Op1 = AbsOp1.getOperand(1);
-
+  SDValue AbsOp0 = N->getOperand(0);
   unsigned Opc0 = Op0.getOpcode();
 
   // Check if the operands of the sub are (zero|sign)-extended.
@@ -11282,7 +11276,7 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N, const SDLoc &DL) {
        Opc0 != ISD::SIGN_EXTEND_INREG)) {
     // fold (abs (sub nsw x, y)) -> abds(x, y)
     // Don't fold this for unsupported types as we lose the NSW handling.
-    if (AbsOp1->getFlags().hasNoSignedWrap() && hasOperation(ISD::ABDS, VT) &&
+    if (AbsOp0->getFlags().hasNoSignedWrap() && hasOperation(ISD::ABDS, VT) &&
         TLI.preferABDSToABSWithNSW(VT)) {
       SDValue ABD = DAG.getNode(ISD::ABDS, DL, VT, Op0, Op1);
       return DAG.getZExtOrTrunc(ABD, DL, SrcVT);
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 1b590aa33bd8..2162588aadfd 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -388,6 +388,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   SDValue SExt = DAG->getNode(ISD::SIGN_EXTEND, DL, Int64VT, Op0);
   SDValue Trunc = DAG->getNode(ISD::TRUNCATE, DL, Int32VT, Op1);
 
+  SDValue Abs = DAG->getNode(ISD::ABS, DL, Int32VT, Op0);
+
   SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Trunc, Op0);
   SDValue Neg = DAG->getNegative(Op0, DL, Int32VT);
   SDValue Not = DAG->getNOT(DL, Op0, Int32VT);
@@ -417,6 +419,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   EXPECT_FALSE(sd_match(ZExt, m_SExtLike(m_Value())));
   EXPECT_TRUE(sd_match(Trunc, m_Trunc(m_Specific(Op1))));
 
+  EXPECT_TRUE(sd_match(Abs, m_Abs(m_Specific(Op0))));
+
   EXPECT_TRUE(sd_match(Neg, m_Neg(m_Value())));
   EXPECT_TRUE(sd_match(Not, m_Not(m_Value())));
   EXPECT_FALSE(sd_match(ZExt, m_Neg(m_Value())));

From a38932ac3c0a16226e3dde7f1532f117959c58df Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Wed, 18 Jun 2025 09:49:32 +0200
Subject: [PATCH 0766/1322] Revert "[GlobalISel] prevent G_UNMERGE_VALUES for
 vectors with different elements" (#144650)

Reverts llvm/llvm-project#133335
---
 .../GlobalISel/LegalizationArtifactCombiner.h |  5 +-
 .../AMDGPU/GlobalISel/insertelement.ll        | 55 -------------------
 2 files changed, 1 insertion(+), 59 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 8f560c42082f..22f6a5fde546 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -997,7 +997,6 @@ public:
 
       // Recognize UnmergeSrc that can be unmerged to DstTy directly.
       // Types have to be either both vector or both non-vector types.
-      // In case of vector types, the scalar elements need to match.
       // Merge-like opcodes are combined one at the time. First one creates new
       // unmerge, following should use the same unmerge (builder performs CSE).
       //
@@ -1006,9 +1005,7 @@ public:
       // %AnotherDst:_(DstTy) = G_merge_like_opcode %2:_(EltTy), %3
       //
       // %Dst:_(DstTy), %AnotherDst = G_UNMERGE_VALUES %UnmergeSrc
-      if (((!DstTy.isVector() && !UnmergeSrcTy.isVector()) ||
-           (DstTy.isVector() && UnmergeSrcTy.isVector() &&
-            DstTy.getScalarType() == UnmergeSrcTy.getScalarType())) &&
+      if ((DstTy.isVector() == UnmergeSrcTy.isVector()) &&
           (Elt0UnmergeIdx % NumMIElts == 0) &&
           getCoverTy(UnmergeSrcTy, DstTy) == UnmergeSrcTy) {
         if (!isSequenceFromUnmerge(MI, 0, Unmerge, Elt0UnmergeIdx, NumMIElts,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 132a89478c5f..8134eb3ca2af 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -6506,58 +6506,3 @@ entry:
   %insert = insertelement <5 x double> %vec, double %val, i32 %idx
   ret <5 x double> %insert
 }
-
-; Found by fuzzer, reduced with llvm-reduce.
-define amdgpu_kernel void @insert_very_small_from_very_large(<32 x i16> %L3, ptr %ptr) {
-; GPRIDX-LABEL: insert_very_small_from_very_large:
-; GPRIDX:       ; %bb.0: ; %bb
-; GPRIDX-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x0
-; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x40
-; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
-; GPRIDX-NEXT:    s_lshr_b32 s2, s12, 1
-; GPRIDX-NEXT:    s_and_b32 s2, s2, 1
-; GPRIDX-NEXT:    s_lshl_b32 s2, s2, 1
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT:    flat_store_byte v[0:1], v2
-; GPRIDX-NEXT:    s_endpgm
-;
-; GFX10-LABEL: insert_very_small_from_very_large:
-; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x0
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x40
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s2, s12, 1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    flat_store_byte v[0:1], v2
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: insert_very_small_from_very_large:
-; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x40
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s2, s8, 1
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_and_b32 s2, s2, 1
-; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX11-NEXT:    v_mov_b32_e32 v2, s2
-; GFX11-NEXT:    flat_store_b8 v[0:1], v2
-; GFX11-NEXT:    s_endpgm
-bb:
-  %a = bitcast <32 x i16> %L3 to i512
-  %b = trunc i512 %a to i8
-  %c = trunc i8 %b to i2
-  %d = bitcast i2 %c to <2 x i1>
-  %insert = insertelement <2 x i1> %d, i1 false, i32 0
-  store <2 x i1> %insert, ptr %ptr, align 1
-  ret void
-}

From 49df87e71b73b230ecb21335dcb5f5390eebdab3 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham@arm.com>
Date: Wed, 18 Jun 2025 08:57:51 +0100
Subject: [PATCH 0767/1322] [libc][printf] Fix out-of-range shift in float320
 printf (#144542)

If you enable `LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_FLOAT320` and use a
`%f` style printf format directive to print a nonzero number too small
to show up in the output digits, e.g. `printf("%.2f", 0.001)`, then the
output would be intermittently incorrect, because
`DyadicFloat::as_mantissa_type_rounded` would try to shift the 320-bit
mantissa right by more than 320 bits, invoking the 'undefined behavior'
clause commented in the `shift()` function in `big_int.h`.

There were already tests in the libc test suite exercising this case,
e.g. the subnormal tests in `LlvmLibcSPrintfTest.FloatDecimalConv` use
`%f` at the default precision of 6 decimal places on tiny numbers such
as 2^-1027. But because the behavior is undefined, they don't visibly
fail all the time, and in all previous test runs we'd tried with
USE_FLOAT320, they had got lucky.

The fix is simply to detect an out-of-range right shift before doing it,
and instead just set the output value to zero.
---
 libc/src/__support/FPUtil/dyadic_float.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h
index 6c3e1520e5af..4c77d3c541cd 100644
--- a/libc/src/__support/FPUtil/dyadic_float.h
+++ b/libc/src/__support/FPUtil/dyadic_float.h
@@ -465,7 +465,10 @@ template <size_t Bits> struct DyadicFloat {
         // exponents coming in to this function _shouldn't_ be that large). The
         // result should always end up as a positive size_t.
         size_t shift = -static_cast<size_t>(exponent);
-        new_mant >>= shift;
+        if (shift >= Bits)
+          new_mant = 0;
+        else
+          new_mant >>= shift;
         round_dir = rounding_direction(mantissa, shift, sign);
         if (round_dir > 0)
           ++new_mant;

From ba40a7bc2e65be86ac23c9cf6038ac085dda77eb Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen@sifive.com>
Date: Wed, 18 Jun 2025 16:03:20 +0800
Subject: [PATCH 0768/1322] [LoopVectorize] Vectorize fixed-order recurrence
 with vscale x 1. (#142772)

When the fixed-order recurrence phi is live-out from the loop, the
vectorizer uses VPInstruction::ExtractPenultimateElement to extract the
penultimate element from the recurrence vector. However, this is not
feasible when the VF is vscale x 1, since vscale could be 1, making the
vector contain only one element.

This patch changes the behavior for vscale x 1 by extracting the last
element from the vector produced by splicing the recurrence phi and the
previous value. This ensures we can still determine the correct live-out
value of the recurrence phi.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 22 +++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  3 -
 .../first-order-recurrence-scalable-vf1.ll    | 57 ++++++------
 .../first-order-recurrence-scalable-vf1.ll    | 90 +++++++++++++++++--
 4 files changed, 130 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f887b34e7642..16d48b06dce4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6163,11 +6163,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
 
     // First-order recurrences are replaced by vector shuffles inside the loop.
     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
-      // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
-      // penultimate value of the recurrence.
-      // TODO: Consider vscale_range info.
-      if (VF.isScalable() && VF.getKnownMinValue() == 1)
-        return InstructionCost::getInvalid();
       SmallVector<int> Mask(VF.getKnownMinValue());
       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
@@ -8556,13 +8551,17 @@ addUsersInExitBlocks(VPlan &Plan,
 /// users in the original exit block using the VPIRInstruction wrapping to the
 /// LCSSA phi.
 static void addExitUsersForFirstOrderRecurrences(
-    VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
+    VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix, VFRange &Range) {
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
   auto *ScalarPHVPBB = Plan.getScalarPreheader();
   auto *MiddleVPBB = Plan.getMiddleBlock();
   VPBuilder ScalarPHBuilder(ScalarPHVPBB);
   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
 
+  auto IsScalableOne = [](ElementCount VF) -> bool {
+    return VF == ElementCount::getScalable(1);
+  };
+
   for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
     auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
     if (!FOR)
@@ -8644,6 +8643,15 @@ static void addExitUsersForFirstOrderRecurrences(
     for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
       if (ExitIRI->getOperand(0) != FOR)
         continue;
+      // For VF vscale x 1, if vscale = 1, we are unable to extract the
+      // penultimate value of the recurrence. Instead, we rely on function
+      // addUsersInExitBlocks to extract the last element from the result of
+      // VPInstruction::FirstOrderRecurrenceSplice by leaving the user of the
+      // recurrence phi in ExitUsersToFix.
+      // TODO: Consider vscale_range info and UF.
+      if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
+                                                             Range))
+        return;
       VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
           VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
           {}, "vector.recur.extract.for.phi");
@@ -8858,7 +8866,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
   SetVector<VPIRInstruction *> ExitUsersToFix =
       collectUsersInLatchExitBlock(*Plan);
-  addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+  addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix, Range);
   addUsersInExitBlocks(*Plan, ExitUsersToFix);
 
   // ---------------------------------------------------------------------------
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ed0b97849a8..f3b5c8cfa988 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3680,9 +3680,6 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
   if (VF.isScalar())
     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 
-  if (VF == ElementCount::getScalable(1))
-    return InstructionCost::getInvalid();
-
   return 0;
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index d34098545716..e3f9540ff3df 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -1,56 +1,63 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+; RUN: opt -p loop-vectorize -scalable-vectorization=on -S %s | FileCheck %s
 
 target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
 target triple = "riscv64-unknown-linux-gnu"
 
-; Make sure we do not pick <vscale x 1 x i64> as VF for a loop with a
-; first-order recurrence.
 define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
 ; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for(
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 0, i32 [[TMP4]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[WIDE_LOAD1]] = load <4 x i64>, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 1 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> [[VECTOR_RECUR]], <vscale x 1 x i64> [[WIDE_LOAD]], i32 -1)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 4
-; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP9]], align 8
-; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    store <vscale x 1 x i64> [[TMP7]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 3
-; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 [[TMP11]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 1 x i64> [[TMP7]], i32 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP15:%.*]] = sub i32 [[TMP14]], 1
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP15]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L]] = load i64, ptr [[GEP_SRC]], align 8
 ; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
 ; CHECK-NEXT:    store i64 [[FOR]], ptr [[GEP_DST]], align 8
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 22
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[RES]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
index 98a942a50107..b20d59bd5760 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
@@ -8,17 +8,51 @@ define i64 @pr97452_scalable_vf1_for_live_out(ptr %src) {
 ; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for_live_out(
 ; CHECK-SAME: ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 0, i32 [[TMP4]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 1 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> [[VECTOR_RECUR]], <vscale x 1 x i64> [[WIDE_LOAD]], i32 -1)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 1 x i64> [[TMP7]], i32 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 1
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP13]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[FOR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[L:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT:    [[L]] = load i64, ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 22
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[RES]]
 ;
 entry:
@@ -43,17 +77,51 @@ define void @pr97452_scalable_vf1_for_no_live_out(ptr %src, ptr noalias %dst) {
 ; CHECK-LABEL: define void @pr97452_scalable_vf1_for_no_live_out(
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 0, i32 [[TMP4]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 1 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> [[VECTOR_RECUR]], <vscale x 1 x i64> [[WIDE_LOAD]], i32 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    store <vscale x 1 x i64> [[TMP7]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 [[TMP11]], 1
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP12]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[FOR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[L:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT:    [[L]] = load i64, ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
 ; CHECK-NEXT:    store i64 [[FOR]], ptr [[GEP_DST]], align 8
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 22
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -74,3 +142,11 @@ loop:
 exit:
   ret void
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.

From ca29c632f06fc0e02ebbbb9fbdc73e3abd6b096b Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp@bytedance.com>
Date: Wed, 18 Jun 2025 16:11:18 +0800
Subject: [PATCH 0769/1322] [RISCV] Support non-power-of-2 types when expanding
 memcmp

We can convert non-power-of-2 types into extended value types
and then they will be widen.

Reviewers: lukel97

Reviewed By: lukel97

Pull Request: https://github.com/llvm/llvm-project/pull/114971
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  23 +-
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  21 +-
 .../test/CodeGen/RISCV/icmp-non-byte-sized.ll |  41 +
 llvm/test/CodeGen/RISCV/memcmp-optsize.ll     | 800 +++++++-----------
 llvm/test/CodeGen/RISCV/memcmp.ll             | 800 +++++++-----------
 5 files changed, 709 insertions(+), 976 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/icmp-non-byte-sized.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e670567bd184..b8ef221742a2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16190,10 +16190,6 @@ combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
     return SDValue();
 
   unsigned OpSize = OpVT.getSizeInBits();
-  // TODO: Support non-power-of-2 types.
-  if (!isPowerOf2_32(OpSize))
-    return SDValue();
-
   // The size should be larger than XLen and smaller than the maximum vector
   // size.
   if (OpSize <= Subtarget.getXLen() ||
@@ -16214,14 +16210,25 @@ combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
           Attribute::NoImplicitFloat))
     return SDValue();
 
+  // Bail out for non-byte-sized types.
+  if (!OpVT.isByteSized())
+    return SDValue();
+
   unsigned VecSize = OpSize / 8;
-  EVT VecVT = MVT::getVectorVT(MVT::i8, VecSize);
-  EVT CmpVT = MVT::getVectorVT(MVT::i1, VecSize);
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, VecSize);
+  EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VecSize);
 
   SDValue VecX = DAG.getBitcast(VecVT, X);
   SDValue VecY = DAG.getBitcast(VecVT, Y);
-  SDValue Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
-  return DAG.getSetCC(DL, VT, DAG.getNode(ISD::VECREDUCE_OR, DL, XLenVT, Cmp),
+  SDValue Mask = DAG.getAllOnesConstant(DL, CmpVT);
+  SDValue VL = DAG.getConstant(VecSize, DL, XLenVT);
+
+  SDValue Cmp = DAG.getNode(ISD::VP_SETCC, DL, CmpVT, VecX, VecY,
+                            DAG.getCondCode(ISD::SETNE), Mask, VL);
+  return DAG.getSetCC(DL, VT,
+                      DAG.getNode(ISD::VP_REDUCE_OR, DL, XLenVT,
+                                  DAG.getConstant(0, DL, XLenVT), Cmp, Mask,
+                                  VL),
                       DAG.getConstant(0, DL, XLenVT), CC);
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index aadda2ce8552..46e30ce4c18a 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2985,20 +2985,13 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   }
 
   if (IsZeroCmp && ST->hasVInstructions()) {
-    unsigned RealMinVLen = ST->getRealMinVLen();
-    // Support Fractional LMULs if the lengths are larger than XLen.
-    // TODO: Support non-power-of-2 types.
-    for (unsigned FLMUL = 8; FLMUL >= 2; FLMUL /= 2) {
-      unsigned Len = RealMinVLen / FLMUL;
-      if (Len > ST->getXLen())
-        Options.LoadSizes.insert(Options.LoadSizes.begin(), Len / 8);
-    }
-    for (unsigned LMUL = 1; LMUL <= ST->getMaxLMULForFixedLengthVectors();
-         LMUL *= 2) {
-      unsigned Len = RealMinVLen * LMUL;
-      if (Len > ST->getXLen())
-        Options.LoadSizes.insert(Options.LoadSizes.begin(), Len / 8);
-    }
+    unsigned VLenB = ST->getRealMinVLen() / 8;
+    // The minimum size should be `XLen / 8 + 1`, and the maxinum size should be
+    // `VLenB * MaxLMUL` so that it fits in a single register group.
+    unsigned MinSize = ST->getXLen() / 8 + 1;
+    unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
+    for (unsigned Size = MinSize; Size <= MaxSize; Size++)
+      Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
   }
   return Options;
 }
diff --git a/llvm/test/CodeGen/RISCV/icmp-non-byte-sized.ll b/llvm/test/CodeGen/RISCV/icmp-non-byte-sized.ll
new file mode 100644
index 000000000000..fca6238548aa
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/icmp-non-byte-sized.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -mattr=+v -O2 < %s | FileCheck %s --check-prefix=CHECK-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -O2 < %s | FileCheck %s --check-prefix=CHECK-RV64
+
+define i1 @icmp_non_byte_type(ptr %p1, ptr %p2) nounwind {
+; CHECK-RV32-LABEL: icmp_non_byte_type:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    lw a3, 4(a0)
+; CHECK-RV32-NEXT:    lw a4, 8(a0)
+; CHECK-RV32-NEXT:    lw a0, 12(a0)
+; CHECK-RV32-NEXT:    lw a5, 12(a1)
+; CHECK-RV32-NEXT:    lw a6, 4(a1)
+; CHECK-RV32-NEXT:    lw a7, 8(a1)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    xor a0, a0, a5
+; CHECK-RV32-NEXT:    xor a3, a3, a6
+; CHECK-RV32-NEXT:    xor a4, a4, a7
+; CHECK-RV32-NEXT:    xor a1, a2, a1
+; CHECK-RV32-NEXT:    or a0, a3, a0
+; CHECK-RV32-NEXT:    or a1, a1, a4
+; CHECK-RV32-NEXT:    or a0, a1, a0
+; CHECK-RV32-NEXT:    seqz a0, a0
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: icmp_non_byte_type:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-RV64-NEXT:    ld a0, 8(a0)
+; CHECK-RV64-NEXT:    ld a3, 8(a1)
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    xor a0, a0, a3
+; CHECK-RV64-NEXT:    xor a1, a2, a1
+; CHECK-RV64-NEXT:    or a0, a1, a0
+; CHECK-RV64-NEXT:    seqz a0, a0
+; CHECK-RV64-NEXT:    ret
+  %v1 = load i127, ptr %p1
+  %v2 = load i127, ptr %p2
+  %ret = icmp eq i127 %v1, %v2
+  ret i1 %ret
+}
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index 3742383675b9..0d57e4201512 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -517,17 +517,99 @@ define i32 @bcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: bcmp_size_5:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lbu a0, 4(a0)
-; CHECK-UNALIGNED-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 5, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 5)
   ret i32 %bcmp
@@ -614,17 +696,99 @@ define i32 @bcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: bcmp_size_6:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lhu a0, 4(a0)
-; CHECK-UNALIGNED-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-NEXT:    lhu a1, 4(a1)
-; CHECK-UNALIGNED-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 6, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 6)
   ret i32 %bcmp
@@ -711,17 +875,99 @@ define i32 @bcmp_size_7(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: bcmp_size_7:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lw a0, 3(a0)
-; CHECK-UNALIGNED-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-NEXT:    lw a1, 3(a1)
-; CHECK-UNALIGNED-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 7, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 7)
   ret i32 %bcmp
@@ -1069,33 +1315,21 @@ define i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_15:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 7(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 11(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 7(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 11(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a3, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 15, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_15:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 7(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 7(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 15, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1477,57 +1711,21 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 15(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 19(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 23(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 15(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 19(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 23(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, t3, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, t2, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t4, t5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a2, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a5, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a1, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 31, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 15(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 23(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 15(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 23(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a3, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 31, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1875,129 +2073,23 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -48
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 28(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 24(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 28(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 47(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 51(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 55(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 59(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 31(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 35(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 39(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 43(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, t1, s2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, s0
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 31(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 35(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 39(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 43(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, t3
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, s1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t4
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 47(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 51(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 55(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 59(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, s10
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, s6, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, s8, s0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, s4, t4
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, s9, s2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s5, s1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, s7, t5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, s3, t3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, t3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, a6, t5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, s0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t0, t6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, t4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, a7, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t1, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a7, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t0, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a6, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v16
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 48
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 31(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 39(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 47(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 31(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 39(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 47(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, t3, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, t2, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t4, t5
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a2, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a5, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a3, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a1, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v16
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -2315,270 +2407,24 @@ define i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_127:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -96
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 84(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 80(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 76(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 72(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 68(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 64(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 60(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 56(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 52(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 48(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s11, 44(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 32(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 36(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 40(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 44(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 48(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 52(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 56(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 60(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 28(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 60(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 24(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 28(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 36(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 40(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 44(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s9
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 40(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a3, s5
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 36(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a4, s4
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 32(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 56(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 48(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a6, ra
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 28(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a7, s7
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 24(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t0, s9
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 20(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t1, ra
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 16(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 107(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t5, s10
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 75(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, t6, s8
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 123(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s2, s2, s4
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s0, s9
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, t4, s11
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 83(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 87(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 91(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s1, s1, s6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 107(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, t3, s5
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 91(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, t2, s7
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 123(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, a5, s3
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 75(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s5, s11, s5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s7, s8, s7
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 87(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 83(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s3, s10, s3
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 115(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s6, ra, s6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 115(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s4, s4, s11
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 119(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 119(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s10, s10, ra
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 71(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 67(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 67(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 71(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 99(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 99(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, a5, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 103(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 103(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, a4, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, s9, s8
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, s11, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, ra, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a5, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 95(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 63(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 111(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 79(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 79(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 111(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 63(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 95(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s11
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s9, s9, ra
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, s8, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, a5, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t2, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t3, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t4, s9
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, s1, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t5, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, s0, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, s2, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, t6, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 12(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t2, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t1, t2, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t2, s10
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t3, t3, s4
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 28(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t4, s6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 32(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t5, t5, s3
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 36(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, t6, s7
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or s0, s0, s5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, s0, t6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t5, t4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t3, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t1, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a6, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t4, t6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t0, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v24
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 84(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 80(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 76(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 72(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 68(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 64(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 60(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 56(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 48(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 44(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 96
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_127:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -96
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s0, 88(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s1, 80(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s2, 72(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s3, 64(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s4, 56(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s5, 48(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s6, 40(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s7, 32(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s8, 24(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s9, 16(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s10, 8(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 48(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 56(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 48(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 56(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t6, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 95(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 103(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 111(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 119(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 63(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 71(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 79(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 87(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t1, t1, s2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, s0
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 63(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 71(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 79(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 87(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, t3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, s1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t4
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 95(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 103(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 111(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 119(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, t2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, s10
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, s6, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t2, s8, s0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t4, s4, t4
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t6, s9, s2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor s0, s5, s1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t5, s7, t5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t3, s3, t3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, t3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a6, a6, t5
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, s0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or t0, t0, t6
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, a3, t4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a7, a7, t2
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a5, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t1, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a7, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, t0, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a6, a2
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v24
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 88(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 72(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 56(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 48(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 40(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 32(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 24(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 16(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 8(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 96
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 127)
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index f9a6dbba04fc..0caab1f5ce2f 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -517,17 +517,99 @@ define i32 @bcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: bcmp_size_5:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lbu a0, 4(a0)
-; CHECK-UNALIGNED-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-NEXT:    lbu a1, 4(a1)
-; CHECK-UNALIGNED-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 5, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_5:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lbu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lbu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 5)
   ret i32 %bcmp
@@ -614,17 +696,99 @@ define i32 @bcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: bcmp_size_6:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lhu a0, 4(a0)
-; CHECK-UNALIGNED-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-NEXT:    lhu a1, 4(a1)
-; CHECK-UNALIGNED-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 6, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_6:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lhu a0, 4(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lhu a1, 4(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 6)
   ret i32 %bcmp
@@ -711,17 +875,99 @@ define i32 @bcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: bcmp_size_7:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lw a0, 3(a0)
-; CHECK-UNALIGNED-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-NEXT:    lw a1, 3(a1)
-; CHECK-UNALIGNED-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 7, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_7:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a0, 3(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    lw a1, 3(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 7)
   ret i32 %bcmp
@@ -1069,33 +1315,21 @@ define i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_15:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 7(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 11(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 7(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 11(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a3, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 15, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_15:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 7(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 7(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 15, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1555,57 +1789,21 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 15(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 19(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 23(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 15(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 19(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 23(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, t3, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, t2, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t4, t5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a2, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a5, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a1, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 31, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 15(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 23(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 15(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 23(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a3, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 31, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -2109,129 +2307,23 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -48
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 28(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 24(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 28(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 47(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 51(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 55(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 59(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 31(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 35(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 39(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 43(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, t1, s2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, s0
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 31(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 35(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 39(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 43(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, t3
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, s1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t4
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 47(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 51(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 55(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 59(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, s10
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, s6, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, s8, s0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, s4, t4
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, s9, s2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s5, s1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, s7, t5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, s3, t3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, t3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, a6, t5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, s0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t0, t6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, t4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, a7, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t1, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a7, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t0, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a6, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v16
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 48
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 31(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 39(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 47(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 31(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 39(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 47(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, t3, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, t2, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t4, t5
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a2, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a5, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a3, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a1, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v16
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -2627,270 +2719,24 @@ define i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_127:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -96
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 84(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 80(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 76(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 72(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 68(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 64(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 60(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 56(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 52(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 48(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw s11, 44(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 32(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 36(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 40(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 44(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 48(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 52(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 56(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 60(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 28(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 60(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 24(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 28(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 36(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 40(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 44(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s9
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 40(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a3, s5
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 36(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a4, s4
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 32(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 56(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 48(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a6, ra
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 28(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a7, s7
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 24(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t0, s9
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 20(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t1, ra
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 16(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 107(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t5, s10
-; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 75(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, t6, s8
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 123(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s2, s2, s4
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s0, s9
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, t4, s11
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 83(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 87(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 91(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s1, s1, s6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 107(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, t3, s5
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 91(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, t2, s7
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 123(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, a5, s3
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 75(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s5, s11, s5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s7, s8, s7
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 87(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 83(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s3, s10, s3
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 115(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s6, ra, s6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 115(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s4, s4, s11
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 119(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 119(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s10, s10, ra
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 71(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 67(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 67(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 71(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 99(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 99(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, a5, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 103(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 103(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, a4, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, s9, s8
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, s11, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, ra, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a5, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 95(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 63(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 111(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 79(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 79(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 111(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 63(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 95(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s11
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor s9, s9, ra
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, s8, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, a5, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t2, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t3, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t4, s9
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, s1, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t5, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, s0, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, s2, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, t6, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 12(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t2, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t1, t2, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t2, s10
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t3, t3, s4
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 28(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t4, s6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 32(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t5, t5, s3
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 36(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, t6, s7
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    or s0, s0, s5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, s0, t6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t5, t4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t3, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t1, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a6, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t4, t6
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t0, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a2
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v24
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 84(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 80(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 76(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 72(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 68(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 64(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 60(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 56(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 48(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 44(sp) # 4-byte Folded Reload
-; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 96
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_127:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -96
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s0, 88(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s1, 80(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s2, 72(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s3, 64(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s4, 56(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s5, 48(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s6, 40(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s7, 32(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s8, 24(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s9, 16(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    sd s10, 8(sp) # 8-byte Folded Spill
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 48(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 56(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 48(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 56(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t6, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 95(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 103(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 111(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 119(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 63(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 71(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 79(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 87(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t1, t1, s2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, s0
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 63(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 71(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 79(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 87(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, t3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, s1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t4
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 95(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 103(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 111(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 119(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, t2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, s10
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, s6, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t2, s8, s0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t4, s4, t4
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t6, s9, s2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor s0, s5, s1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t5, s7, t5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t3, s3, t3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, t3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a6, a6, t5
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, s0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or t0, t0, t6
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, a3, t4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a7, a7, t2
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a5, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t1, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a7, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, t0, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a6, a2
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v24
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 88(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 72(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 56(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 48(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 40(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 32(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 24(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 16(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 8(sp) # 8-byte Folded Reload
-; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 96
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 127)

From 59d6fbb8ffe03ceecfcc07ebe22e256c97ef70dd Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Wed, 18 Jun 2025 10:24:08 +0200
Subject: [PATCH 0770/1322] [flang][fir] Provide allocation block for
 `fir.local` when required (#144521)

Extends `fir::FirOpBuilder::getAllocaBlock()` to support `fir.local`.
This allows us to retrieve an allocation block when needed for
`fir.local`.
---
 flang/lib/Optimizer/Builder/FIRBuilder.cpp  |  3 ++
 flang/test/HLFIR/fir-local-alloca-block.fir | 34 +++++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 flang/test/HLFIR/fir-local-alloca-block.fir

diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 584f3c8ee310..6ac87067f651 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -283,6 +283,9 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
   if (auto doConcurentOp = getRegion().getParentOfType<fir::DoConcurrentOp>())
     return doConcurentOp.getBody();
 
+  if (auto firLocalOp = getRegion().getParentOfType<fir::LocalitySpecifierOp>())
+    return &getRegion().front();
+
   return getEntryBlock();
 }
 
diff --git a/flang/test/HLFIR/fir-local-alloca-block.fir b/flang/test/HLFIR/fir-local-alloca-block.fir
new file mode 100644
index 000000000000..9d76e86fec3d
--- /dev/null
+++ b/flang/test/HLFIR/fir-local-alloca-block.fir
@@ -0,0 +1,34 @@
+// Tests that `fir.local` ops are able to provide an alloca block when required.
+
+// RUN: fir-opt %s -convert-hlfir-to-fir | FileCheck %s
+
+fir.local {type = local_init} @localizer : !fir.box<!fir.array<1xi32>> copy {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.array<1xi32>>>, %arg1: !fir.ref<!fir.box<!fir.array<1xi32>>>):
+  %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.array<1xi32>>>
+  hlfir.assign %0 to %arg1 : !fir.box<!fir.array<1xi32>>, !fir.ref<!fir.box<!fir.array<1xi32>>>
+  fir.yield(%arg1 : !fir.ref<!fir.box<!fir.array<1xi32>>>)
+}
+
+func.func @foo() {
+  %c1 = arith.constant 1 : index
+  %0 = fir.alloca !fir.box<!fir.array<1xi32>>
+  fir.do_concurrent {
+    fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) local(@localizer %0 -> %arg1 : !fir.ref<!fir.box<!fir.array<1xi32>>>) {
+    }
+  }
+  return
+}
+
+// CHECK:  fir.local {type = local_init} @localizer : ![[TYPE:fir.box<!fir.array<1xi32>>]] copy {
+// CHECK:    ^bb0(%[[VAL_0:.*]]: !fir.ref<![[TYPE]]>, %[[VAL_1:.*]]: !fir.ref<![[TYPE]]>):
+// CHECK:      %[[VAL_2:.*]] = fir.alloca ![[TYPE]]
+// CHECK:      %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<![[TYPE]]>
+// CHECK:      %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:      %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_3]], %[[VAL_4]] : (![[TYPE]], index) -> (index, index, index)
+// CHECK:      %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref<![[TYPE]]>
+// CHECK:      fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<![[TYPE]]>
+// CHECK:      %[[VAL_10:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<![[TYPE]]>) -> !fir.ref<!fir.box<none>>
+// CHECK:      %[[VAL_11:.*]] = fir.convert %[[VAL_3]] : (![[TYPE]]) -> !fir.box<none>
+// CHECK:      fir.call @_FortranAAssign(%[[VAL_10]], %[[VAL_11]], %{{.*}}, %{{.*}})
+// CHECK:      fir.yield(%[[VAL_1]] : !fir.ref<![[TYPE]]>)
+// CHECK:  }

From 255b55c602f73964262893859a543a115b278e21 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 18 Jun 2025 01:35:56 -0700
Subject: [PATCH 0771/1322] [GlobalOpt] Use cast instead of dyn_cast. NFC
 (#144634)

The dyn_cast was not checked for null, and the cast is guaranteed to
succeed by an earlier check.
---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 7db058638650..4a06e0fa619c 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2169,7 +2169,7 @@ static bool tryWidenGlobalArraysUsedByMemcpy(
 
     unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue();
 
-    auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
+    auto *Alloca = cast<AllocaInst>(CI->getArgOperand(0));
     uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
     uint64_t SZSize = SourceDataArray->getType()->getNumElements();
     unsigned ElementByteWidth = SourceDataArray->getElementByteSize();

From 7ea7ccd24d603ceec6eb5194d98911e6ab7c0717 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 18 Jun 2025 10:50:17 +0200
Subject: [PATCH 0772/1322] [PowerPC][AIX] Specify pointer info and alignment
 for stack store (#144526)

When lowering call arguments to stack, specify a stack MPI, as well as
the stack alignment, instead of using the defaults (which would be an
unknown location with ABI alignment).

I believe the asm diffs are just changes in scheduling.
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |   4 +-
 llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll   | 366 +++++++++---------
 llvm/test/CodeGen/PowerPC/aix-cc-abi.ll       | 176 +++++----
 llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll |   4 +-
 .../CodeGen/PowerPC/aix-vec-arg-spills-mir.ll |  34 +-
 .../CodeGen/PowerPC/aix-vec-arg-spills.ll     |  38 +-
 .../PowerPC/aix-vector-vararg-caller.ll       |  26 +-
 .../PowerPC/aix-vector-vararg-fixed-caller.ll |   8 +-
 8 files changed, 341 insertions(+), 315 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 0f8e5e57c58b..f502d8570425 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7767,7 +7767,9 @@ SDValue PPCTargetLowering::LowerCall_AIX(
           DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
       PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
       MemOpChains.push_back(
-          DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+          DAG.getStore(Chain, dl, Arg, PtrOff,
+                       MachinePointerInfo::getStack(MF, VA.getLocMemOffset()),
+                       Subtarget.getFrameLowering()->getStackAlign()));
 
       continue;
     }
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
index aead5762d092..9ffb4fd5eae4 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll
@@ -474,14 +474,14 @@ define void @call_test_fpr_max() {
   ; 32BIT-NEXT:   renamable $r3 = LWZtoc @d1, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d1)
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT-NEXT:   STFD renamable $f1, 120, $r1 :: (store (s64))
-  ; 32BIT-NEXT:   STFD renamable $f1, 112, $r1 :: (store (s64))
-  ; 32BIT-NEXT:   STFD renamable $f1, 104, $r1 :: (store (s64))
-  ; 32BIT-NEXT:   STFD renamable $f1, 96, $r1 :: (store (s64))
-  ; 32BIT-NEXT:   STFD renamable $f1, 88, $r1 :: (store (s64))
-  ; 32BIT-NEXT:   STFD renamable $f1, 80, $r1 :: (store (s64))
-  ; 32BIT-NEXT:   STFD renamable $f1, 72, $r1 :: (store (s64))
-  ; 32BIT-NEXT:   STFD renamable $f1, 64, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f1, 120, $r1 :: (store (s64) into stack + 120, basealign 16)
+  ; 32BIT-NEXT:   STFD renamable $f1, 112, $r1 :: (store (s64) into stack + 112, align 16)
+  ; 32BIT-NEXT:   STFD renamable $f1, 104, $r1 :: (store (s64) into stack + 104, basealign 16)
+  ; 32BIT-NEXT:   STFD renamable $f1, 96, $r1 :: (store (s64) into stack + 96, align 16)
+  ; 32BIT-NEXT:   STFD renamable $f1, 88, $r1 :: (store (s64) into stack + 88, basealign 16)
+  ; 32BIT-NEXT:   STFD renamable $f1, 80, $r1 :: (store (s64) into stack + 80, align 16)
+  ; 32BIT-NEXT:   STFD renamable $f1, 72, $r1 :: (store (s64) into stack + 72, basealign 16)
+  ; 32BIT-NEXT:   STFD renamable $f1, 64, $r1 :: (store (s64) into stack + 64, align 16)
   ; 32BIT-NEXT:   $f2 = COPY renamable $f1
   ; 32BIT-NEXT:   $f3 = COPY renamable $f1
   ; 32BIT-NEXT:   $f4 = COPY renamable $f1
@@ -494,7 +494,7 @@ define void @call_test_fpr_max() {
   ; 32BIT-NEXT:   $f11 = COPY renamable $f1
   ; 32BIT-NEXT:   $f12 = COPY renamable $f1
   ; 32BIT-NEXT:   $f13 = COPY renamable $f1
-  ; 32BIT-NEXT:   STFD renamable $f1, 56, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f1, 56, $r1 :: (store (s64) into stack + 56, basealign 16)
   ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_fpr_max>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $r2, implicit-def $r1, implicit-def dead $f1
   ; 32BIT-NEXT:   ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
@@ -504,10 +504,10 @@ define void @call_test_fpr_max() {
   ; 64BIT-NEXT:   renamable $x3 = LDtoc @d1, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $x3 :: (dereferenceable load (s64) from @d1)
   ; 64BIT-NEXT:   ADJCALLSTACKDOWN 152, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT-NEXT:   STFD renamable $f1, 144, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STFD renamable $f1, 136, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STFD renamable $f1, 128, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STFD renamable $f1, 120, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STFD renamable $f1, 144, $x1 :: (store (s64) into stack + 144, align 16)
+  ; 64BIT-NEXT:   STFD renamable $f1, 136, $x1 :: (store (s64) into stack + 136, basealign 16)
+  ; 64BIT-NEXT:   STFD renamable $f1, 128, $x1 :: (store (s64) into stack + 128, align 16)
+  ; 64BIT-NEXT:   STFD renamable $f1, 120, $x1 :: (store (s64) into stack + 120, basealign 16)
   ; 64BIT-NEXT:   $f2 = COPY renamable $f1
   ; 64BIT-NEXT:   $f3 = COPY renamable $f1
   ; 64BIT-NEXT:   $f4 = COPY renamable $f1
@@ -520,7 +520,7 @@ define void @call_test_fpr_max() {
   ; 64BIT-NEXT:   $f11 = COPY renamable $f1
   ; 64BIT-NEXT:   $f12 = COPY renamable $f1
   ; 64BIT-NEXT:   $f13 = COPY renamable $f1
-  ; 64BIT-NEXT:   STFD renamable $f1, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STFD renamable $f1, 112, $x1 :: (store (s64) into stack + 112, align 16)
   ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_fpr_max>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $x2, implicit-def $r1, implicit-def dead $f1
   ; 64BIT-NEXT:   ADJCALLSTACKUP 152, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
@@ -889,11 +889,11 @@ define void @call_test_stackarg_int() {
   ; 32BIT-NEXT:   renamable $r6 = LWZ 0, renamable $r3 :: (dereferenceable load (s32) from @lli, align 8)
   ; 32BIT-NEXT:   renamable $r3 = LWZ 4, killed renamable $r3 :: (dereferenceable load (s32) from @lli + 4, basealign 8)
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 80, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT-NEXT:   STW renamable $r5, 76, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r3, 72, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r6, 68, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r5, 64, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r4, 60, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW renamable $r5, 76, $r1 :: (store (s32) into stack + 76, basealign 16)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 72, $r1 :: (store (s32) into stack + 72, align 8, basealign 16)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 68, $r1 :: (store (s32) into stack + 68, basealign 16)
+  ; 32BIT-NEXT:   STW killed renamable $r5, 64, $r1 :: (store (s32) into stack + 64, align 16)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 60, $r1 :: (store (s32) into stack + 60, basealign 16)
   ; 32BIT-NEXT:   $r3 = LI 1
   ; 32BIT-NEXT:   $r4 = LI 2
   ; 32BIT-NEXT:   $r5 = LI 3
@@ -902,7 +902,7 @@ define void @call_test_stackarg_int() {
   ; 32BIT-NEXT:   $r8 = LI 6
   ; 32BIT-NEXT:   $r9 = LI 7
   ; 32BIT-NEXT:   $r10 = LI 8
-  ; 32BIT-NEXT:   STW killed renamable $r11, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r11, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16)
   ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_int[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
   ; 32BIT-NEXT:   ADJCALLSTACKUP 80, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
@@ -926,11 +926,11 @@ define void @call_test_stackarg_int() {
   ; 64BIT-NEXT:   $x8 = LI8 6
   ; 64BIT-NEXT:   $x9 = LI8 7
   ; 64BIT-NEXT:   $x10 = LI8 8
-  ; 64BIT-NEXT:   STD killed renamable $x31, 136, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD renamable $x0, 144, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x0, 128, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x12, 120, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x11, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x31, 136, $x1 :: (store (s64) into stack + 136, basealign 16)
+  ; 64BIT-NEXT:   STD renamable $x0, 144, $x1 :: (store (s64) into stack + 144, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x0, 128, $x1 :: (store (s64) into stack + 128, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x12, 120, $x1 :: (store (s64) into stack + 120, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x11, 112, $x1 :: (store (s64) into stack + 112, align 16)
   ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_stackarg_int[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1
   ; 64BIT-NEXT:   ADJCALLSTACKUP 152, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
@@ -956,7 +956,11 @@ define void @call_test_stackarg_float() {
   ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f)
   ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r4 :: (dereferenceable load (s64) from @d)
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 68, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT-NEXT:   STFD renamable $f2, 60, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STFD renamable $f2, 0, %stack.0 :: (store (s64) into %stack.0)
+  ; 32BIT-NEXT:   STFS renamable $f1, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 64, $r1 :: (store (s32) into stack + 64, align 16)
+  ; 32BIT-NEXT:   renamable $r11 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
   ; 32BIT-NEXT:   $r3 = LI 1
   ; 32BIT-NEXT:   $r4 = LI 2
   ; 32BIT-NEXT:   $r5 = LI 3
@@ -965,8 +969,8 @@ define void @call_test_stackarg_float() {
   ; 32BIT-NEXT:   $r8 = LI 6
   ; 32BIT-NEXT:   $r9 = LI 7
   ; 32BIT-NEXT:   $r10 = LI 8
-  ; 32BIT-NEXT:   STFS renamable $f1, 56, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_float[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $f1, implicit $f2, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   STW killed renamable $r11, 60, $r1 :: (store (s32) into stack + 60, basealign 16)
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_float[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $f1, implicit $f2, implicit $r2, implicit-def $r1
   ; 32BIT-NEXT:   ADJCALLSTACKUP 68, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
   ;
@@ -977,7 +981,7 @@ define void @call_test_stackarg_float() {
   ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f)
   ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x4 :: (dereferenceable load (s64) from @d)
   ; 64BIT-NEXT:   ADJCALLSTACKDOWN 128, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT-NEXT:   STFD renamable $f2, 120, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STFD renamable $f2, 120, $x1 :: (store (s64) into stack + 120, basealign 16)
   ; 64BIT-NEXT:   $x3 = LI8 1
   ; 64BIT-NEXT:   $x4 = LI8 2
   ; 64BIT-NEXT:   $x5 = LI8 3
@@ -986,7 +990,7 @@ define void @call_test_stackarg_float() {
   ; 64BIT-NEXT:   $x8 = LI8 6
   ; 64BIT-NEXT:   $x9 = LI8 7
   ; 64BIT-NEXT:   $x10 = LI8 8
-  ; 64BIT-NEXT:   STFS renamable $f1, 112, $x1 :: (store (s32))
+  ; 64BIT-NEXT:   STFS renamable $f1, 112, $x1 :: (store (s32) into stack + 112, align 16)
   ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_stackarg_float[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit killed $x10, implicit $f1, implicit $f2, implicit $x2, implicit-def $r1
   ; 64BIT-NEXT:   ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
@@ -1053,7 +1057,11 @@ define void @call_test_stackarg_float3() {
   ; 32BIT-NEXT:   renamable $r10 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
   ; 32BIT-NEXT:   renamable $f2 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f)
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 64, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT-NEXT:   STFS renamable $f2, 60, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STFD renamable $f1, 0, %stack.1 :: (store (s64) into %stack.1)
+  ; 32BIT-NEXT:   STFS renamable $f2, 60, $r1 :: (store (s32) into stack + 60, basealign 16)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 4, %stack.1 :: (load (s32) from %stack.1 + 4)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16)
+  ; 32BIT-NEXT:   renamable $r11 = LWZ 0, %stack.1 :: (load (s32) from %stack.1, align 8)
   ; 32BIT-NEXT:   $r3 = LI 1
   ; 32BIT-NEXT:   $r4 = LI 2
   ; 32BIT-NEXT:   $r5 = LI 3
@@ -1061,8 +1069,8 @@ define void @call_test_stackarg_float3() {
   ; 32BIT-NEXT:   $r7 = LI 5
   ; 32BIT-NEXT:   $r8 = LI 6
   ; 32BIT-NEXT:   $r9 = LI 7
-  ; 32BIT-NEXT:   STFD renamable $f1, 52, $r1 :: (store (s64))
-  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_float3[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $f1, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1
+  ; 32BIT-NEXT:   STW killed renamable $r11, 52, $r1 :: (store (s32) into stack + 52, basealign 16)
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_stackarg_float3[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $f1, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1
   ; 32BIT-NEXT:   ADJCALLSTACKUP 64, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
   ;
@@ -1082,7 +1090,7 @@ define void @call_test_stackarg_float3() {
   ; 64BIT-NEXT:   $x7 = LI8 5
   ; 64BIT-NEXT:   $x8 = LI8 6
   ; 64BIT-NEXT:   $x9 = LI8 7
-  ; 64BIT-NEXT:   STFS renamable $f2, 112, $x1 :: (store (s32))
+  ; 64BIT-NEXT:   STFS renamable $f2, 112, $x1 :: (store (s32) into stack + 112, align 16)
   ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_stackarg_float3[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $f1, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1
   ; 64BIT-NEXT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
@@ -1225,15 +1233,15 @@ define void @caller_ints_stack() {
   ; 32BIT-NEXT:   renamable $r9 = LBZ 0, killed renamable $r9 :: (dereferenceable load (s8) from @uc1)
   ; 32BIT-NEXT:   renamable $r12 = LWZ 0, killed renamable $r12 :: (dereferenceable load (s32) from @i1)
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 96, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT-NEXT:   STW killed renamable $r12, 92, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r9, 88, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r8, 84, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r10, 80, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r7, 76, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r6, 72, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r5, 68, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r4, 64, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STW killed renamable $r3, 60, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r12, 92, $r1 :: (store (s32) into stack + 92, basealign 16)
+  ; 32BIT-NEXT:   STW killed renamable $r9, 88, $r1 :: (store (s32) into stack + 88, align 8, basealign 16)
+  ; 32BIT-NEXT:   STW killed renamable $r8, 84, $r1 :: (store (s32) into stack + 84, basealign 16)
+  ; 32BIT-NEXT:   STW killed renamable $r10, 80, $r1 :: (store (s32) into stack + 80, align 16)
+  ; 32BIT-NEXT:   STW killed renamable $r7, 76, $r1 :: (store (s32) into stack + 76, basealign 16)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 72, $r1 :: (store (s32) into stack + 72, align 8, basealign 16)
+  ; 32BIT-NEXT:   STW killed renamable $r5, 68, $r1 :: (store (s32) into stack + 68, basealign 16)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 64, $r1 :: (store (s32) into stack + 64, align 16)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 60, $r1 :: (store (s32) into stack + 60, basealign 16)
   ; 32BIT-NEXT:   $r3 = LI 1
   ; 32BIT-NEXT:   $r4 = LI 2
   ; 32BIT-NEXT:   $r5 = LI 3
@@ -1242,7 +1250,7 @@ define void @caller_ints_stack() {
   ; 32BIT-NEXT:   $r8 = LI 6
   ; 32BIT-NEXT:   $r9 = LI 7
   ; 32BIT-NEXT:   $r10 = LI 8
-  ; 32BIT-NEXT:   STW killed renamable $r11, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r11, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16)
   ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_ints_stack>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3, implicit-def dead $r4
   ; 32BIT-NEXT:   ADJCALLSTACKUP 96, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
@@ -1274,14 +1282,14 @@ define void @caller_ints_stack() {
   ; 64BIT-NEXT:   $x8 = LI8 6
   ; 64BIT-NEXT:   $x9 = LI8 7
   ; 64BIT-NEXT:   $x10 = LI8 8
-  ; 64BIT-NEXT:   STD killed renamable $x27, 168, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x30, 160, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x28, 152, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x31, 144, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x0, 136, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x12, 128, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x11, 120, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x29, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x27, 168, $x1 :: (store (s64) into stack + 168, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x30, 160, $x1 :: (store (s64) into stack + 160, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x28, 152, $x1 :: (store (s64) into stack + 152, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x31, 144, $x1 :: (store (s64) into stack + 144, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x0, 136, $x1 :: (store (s64) into stack + 136, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x12, 128, $x1 :: (store (s64) into stack + 128, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x11, 120, $x1 :: (store (s64) into stack + 120, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x29, 112, $x1 :: (store (s64) into stack + 112, align 16)
   ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_ints_stack>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3
   ; 64BIT-NEXT:   ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
@@ -1333,7 +1341,7 @@ define void @call_test_i1_stack() {
   ; 32BIT-NEXT:   $r8 = LI 6
   ; 32BIT-NEXT:   $r9 = LI 7
   ; 32BIT-NEXT:   $r10 = LI 8
-  ; 32BIT-NEXT:   STW killed renamable $r11, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r11, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16)
   ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_i1_stack>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
   ; 32BIT-NEXT:   ADJCALLSTACKUP 60, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
@@ -1350,7 +1358,7 @@ define void @call_test_i1_stack() {
   ; 64BIT-NEXT:   $x8 = LI8 6
   ; 64BIT-NEXT:   $x9 = LI8 7
   ; 64BIT-NEXT:   $x10 = LI8 8
-  ; 64BIT-NEXT:   STD killed renamable $x11, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x11, 112, $x1 :: (store (s64) into stack + 112, align 16)
   ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_i1_stack>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1
   ; 64BIT-NEXT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
@@ -1441,88 +1449,92 @@ define void @caller_fpr_stack() {
   ; 32BIT-NEXT:   renamable $r3 = LWZtoc @d15, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   renamable $r4 = LWZtoc @f14, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   renamable $f0 = LFD 0, killed renamable $r3 :: (dereferenceable load (s64) from @d15)
-  ; 32BIT-NEXT:   renamable $r5 = LWZtoc @f16, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r4 :: (dereferenceable load (s32) from @f14)
-  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, killed renamable $r5 :: (dereferenceable load (s32) from @f16)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f16, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, killed renamable $r4 :: (dereferenceable load (s32) from @f14)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (dereferenceable load (s32) from @f16)
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 144, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   STFD killed renamable $f0, 0, %stack.0 :: (store (s64) into %stack.0)
   ; 32BIT-NEXT:   renamable $r5 = LI 0
   ; 32BIT-NEXT:   renamable $r6 = LIS 16352
-  ; 32BIT-NEXT:   STW killed renamable $r5, 60, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-  ; 32BIT-NEXT:   renamable $r5 = LIS 13107
-  ; 32BIT-NEXT:   STW killed renamable $r6, 56, $r1 :: (store (s32), align 8)
-  ; 32BIT-NEXT:   renamable $r6 = LIS 16355
-  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 13107
-  ; 32BIT-NEXT:   STW killed renamable $r5, 68, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 140, $r1 :: (store (s32) into stack + 140, basealign 16)
+  ; 32BIT-NEXT:   renamable $r3 = LIS 13107
+  ; 32BIT-NEXT:   STW killed renamable $r4, 128, $r1 :: (store (s32) into stack + 128, align 16)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16355
+  ; 32BIT-NEXT:   STW killed renamable $r5, 60, $r1 :: (store (s32) into stack + 60, basealign 16)
   ; 32BIT-NEXT:   renamable $r5 = LIS 26214
-  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 13107
-  ; 32BIT-NEXT:   STW killed renamable $r6, 64, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r6 = LIS 16358
+  ; 32BIT-NEXT:   renamable $r3 = ORI killed renamable $r3, 13107
+  ; 32BIT-NEXT:   STW killed renamable $r3, 68, $r1 :: (store (s32) into stack + 68, basealign 16)
+  ; 32BIT-NEXT:   renamable $r3 = LIS 39321
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 13107
+  ; 32BIT-NEXT:   STW killed renamable $r4, 64, $r1 :: (store (s32) into stack + 64, align 16)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16361
   ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 26214
-  ; 32BIT-NEXT:   STW killed renamable $r5, 76, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-  ; 32BIT-NEXT:   renamable $r5 = LIS 39321
+  ; 32BIT-NEXT:   STW killed renamable $r5, 76, $r1 :: (store (s32) into stack + 76, basealign 16)
+  ; 32BIT-NEXT:   renamable $r5 = LIS 52428
   ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 26214
-  ; 32BIT-NEXT:   STW killed renamable $r6, 72, $r1 :: (store (s32), align 8)
-  ; 32BIT-NEXT:   renamable $r6 = LIS 16361
-  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 39321
-  ; 32BIT-NEXT:   STW killed renamable $r6, 80, $r1 :: (store (s32), align 8)
-  ; 32BIT-NEXT:   renamable $r6 = LIS 52428
-  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 52429
-  ; 32BIT-NEXT:   STW killed renamable $r6, 92, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 72, $r1 :: (store (s32) into stack + 72, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r6 = LIS 16364
-  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 39322
-  ; 32BIT-NEXT:   STW renamable $r5, 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 52428
-  ; 32BIT-NEXT:   STW killed renamable $r6, 88, $r1 :: (store (s32), align 8)
-  ; 32BIT-NEXT:   renamable $r6 = LIS 16313
-  ; 32BIT-NEXT:   STW killed renamable $r5, 100, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 39321
+  ; 32BIT-NEXT:   STW killed renamable $r4, 80, $r1 :: (store (s32) into stack + 80, align 16)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16313
+  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 52429
+  ; 32BIT-NEXT:   STW killed renamable $r5, 92, $r1 :: (store (s32) into stack + 92, basealign 16)
   ; 32BIT-NEXT:   renamable $r5 = LIS 49807
-  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 39321
-  ; 32BIT-NEXT:   STW killed renamable $r6, 96, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r3 = ORI killed renamable $r3, 39322
+  ; 32BIT-NEXT:   STW renamable $r3, 84, $r1 :: (store (s32) into stack + 84, basealign 16)
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 52428
+  ; 32BIT-NEXT:   STW killed renamable $r6, 88, $r1 :: (store (s32) into stack + 88, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r6 = LIS 16316
+  ; 32BIT-NEXT:   STW killed renamable $r3, 100, $r1 :: (store (s32) into stack + 100, basealign 16)
+  ; 32BIT-NEXT:   renamable $r3 = LIS 60293
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 39321
+  ; 32BIT-NEXT:   STW killed renamable $r4, 96, $r1 :: (store (s32) into stack + 96, align 16)
+  ; 32BIT-NEXT:   renamable $r4 = LIS 16318
   ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 23593
-  ; 32BIT-NEXT:   STW killed renamable $r5, 108, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-  ; 32BIT-NEXT:   renamable $r5 = LIS 60293
-  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 10485
-  ; 32BIT-NEXT:   STW killed renamable $r6, 104, $r1 :: (store (s32), align 8)
-  ; 32BIT-NEXT:   renamable $r6 = LIS 16318
-  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 7864
-  ; 32BIT-NEXT:   STW killed renamable $r5, 116, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r5, 108, $r1 :: (store (s32) into stack + 108, basealign 16)
   ; 32BIT-NEXT:   renamable $r5 = LIS 2621
-  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 47185
-  ; 32BIT-NEXT:   STW killed renamable $r6, 112, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 10485
+  ; 32BIT-NEXT:   STW killed renamable $r6, 104, $r1 :: (store (s32) into stack + 104, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r6 = LIS 16320
-  ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r5, 28836
-  ; 32BIT-NEXT:   STW killed renamable $r5, 124, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.0, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r6, 41943
-  ; 32BIT-NEXT:   STW killed renamable $r6, 120, $r1 :: (store (s32), align 8)
-  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.1, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
-  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.2, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f3 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
-  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.3, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f4 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
-  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.4, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f6 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = ORI killed renamable $r3, 7864
+  ; 32BIT-NEXT:   STW killed renamable $r3, 116, $r1 :: (store (s32) into stack + 116, basealign 16)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 47185
+  ; 32BIT-NEXT:   STW killed renamable $r4, 112, $r1 :: (store (s32) into stack + 112, align 16)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r5, 28836
+  ; 32BIT-NEXT:   STW killed renamable $r4, 124, $r1 :: (store (s32) into stack + 124, basealign 16)
+  ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r6, 41943
+  ; 32BIT-NEXT:   STW killed renamable $r4, 120, $r1 :: (store (s32) into stack + 120, align 8, basealign 16)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 4, %stack.0 :: (load (s32) from %stack.0 + 4)
+  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f3 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.4, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f4 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
   ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.5, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f7 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
-  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.6, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f6 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.6, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f7 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.7, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   renamable $f8 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
-  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.7, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f9 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
-  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.8, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
-  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.9, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f11 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
-  ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.10, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f12 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.8, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f9 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.9, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.10, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f11 = LFD 0, killed renamable $r6 :: (load (s64) from constant-pool)
   ; 32BIT-NEXT:   renamable $r6 = LWZtoc %const.11, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   renamable $f13 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $f12 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   renamable $f13 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
   ; 32BIT-NEXT:   renamable $f5 = LFS 0, killed renamable $r6 :: (load (s32) from constant-pool)
-  ; 32BIT-NEXT:   STW killed renamable $r4, 140, $r1 :: (store (s32))
-  ; 32BIT-NEXT:   STFD killed renamable $f0, 132, $r1 :: (store (s64))
+  ; 32BIT-NEXT:   STW killed renamable $r5, 136, $r1 :: (store (s32) into stack + 136, align 8, basealign 16)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, %stack.0 :: (load (s32) from %stack.0, align 8)
   ; 32BIT-NEXT:   $f10 = COPY renamable $f1
-  ; 32BIT-NEXT:   STW killed renamable $r3, 128, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 132, $r1 :: (store (s32) into stack + 132, basealign 16)
   ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_fpr_stack>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit killed $f10, implicit $f11, implicit $f12, implicit $f13, implicit $r2, implicit-def $r1, implicit-def dead $f1
   ; 32BIT-NEXT:   ADJCALLSTACKUP 144, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
@@ -1537,7 +1549,7 @@ define void @caller_fpr_stack() {
   ; 64BIT-NEXT:   renamable $r5 = LWZ 0, killed renamable $x5 :: (dereferenceable load (s32) from @f16)
   ; 64BIT-NEXT:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   renamable $x6 = LDtocCPT %const.0, $x2 :: (load (s64) from got)
-  ; 64BIT-NEXT:   STW killed renamable $r5, 168, $x1 :: (store (s32))
+  ; 64BIT-NEXT:   STW killed renamable $r5, 168, $x1 :: (store (s32) into stack + 168, align 8, basealign 16)
   ; 64BIT-NEXT:   renamable $x5 = LDtocCPT %const.1, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   renamable $x7 = LDtocCPT %const.2, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   renamable $f2 = LFD 0, killed renamable $x6 :: (load (s64) from constant-pool)
@@ -1549,7 +1561,7 @@ define void @caller_fpr_stack() {
   ; 64BIT-NEXT:   renamable $f6 = LFD 0, killed renamable $x6 :: (load (s64) from constant-pool)
   ; 64BIT-NEXT:   renamable $x6 = LDtocCPT %const.6, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   renamable $f7 = LFD 0, killed renamable $x5 :: (load (s64) from constant-pool)
-  ; 64BIT-NEXT:   STD killed renamable $x4, 160, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x4, 160, $x1 :: (store (s64) into stack + 160, align 16)
   ; 64BIT-NEXT:   renamable $x4 = LDtocCPT %const.7, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   renamable $f8 = LFD 0, killed renamable $x7 :: (load (s64) from constant-pool)
   ; 64BIT-NEXT:   renamable $x5 = LIS8 16320
@@ -1588,12 +1600,12 @@ define void @caller_fpr_stack() {
   ; 64BIT-NEXT:   renamable $x8 = ORIS8 killed renamable $x8, 52428
   ; 64BIT-NEXT:   renamable $x8 = ORI8 killed renamable $x8, 52429
   ; 64BIT-NEXT:   $f10 = COPY renamable $f1
-  ; 64BIT-NEXT:   STW killed renamable $r3, 152, $x1 :: (store (s32))
-  ; 64BIT-NEXT:   STD killed renamable $x5, 144, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x6, 136, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x4, 128, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x7, 120, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x8, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STW killed renamable $r3, 152, $x1 :: (store (s32) into stack + 152, align 8, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x5, 144, $x1 :: (store (s64) into stack + 144, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x6, 136, $x1 :: (store (s64) into stack + 136, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x4, 128, $x1 :: (store (s64) into stack + 128, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x7, 120, $x1 :: (store (s64) into stack + 120, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x8, 112, $x1 :: (store (s64) into stack + 112, align 16)
   ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_fpr_stack>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit killed $f10, implicit $f11, implicit $f12, implicit $f13, implicit $x2, implicit-def $r1, implicit-def dead $f1
   ; 64BIT-NEXT:   ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
@@ -1688,16 +1700,16 @@ define void @caller_mix() {
   ; 32BIT: bb.0.entry:
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 84, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   renamable $r3 = LI 60
-  ; 32BIT-NEXT:   STW killed renamable $r3, 80, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 80, $r1 :: (store (s32) into stack + 80, align 16)
   ; 32BIT-NEXT:   renamable $r3 = LI 50
-  ; 32BIT-NEXT:   STW killed renamable $r3, 76, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 76, $r1 :: (store (s32) into stack + 76, basealign 16)
   ; 32BIT-NEXT:   renamable $r3 = LI 40
-  ; 32BIT-NEXT:   STW killed renamable $r3, 72, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 72, $r1 :: (store (s32) into stack + 72, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r3 = LI 0
   ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.0, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   STW killed renamable $r3, 64, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 64, $r1 :: (store (s32) into stack + 64, align 16)
   ; 32BIT-NEXT:   renamable $r3 = LI 2
-  ; 32BIT-NEXT:   STW killed renamable $r3, 60, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 60, $r1 :: (store (s32) into stack + 60, basealign 16)
   ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.1, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
   ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.2, $r2 :: (load (s32) from got)
@@ -1706,10 +1718,10 @@ define void @caller_mix() {
   ; 32BIT-NEXT:   renamable $f3 = LFD 0, killed renamable $r4 :: (load (s64) from constant-pool)
   ; 32BIT-NEXT:   renamable $f4 = LFD 0, killed renamable $r3 :: (load (s64) from constant-pool)
   ; 32BIT-NEXT:   renamable $r3 = LI 1
-  ; 32BIT-NEXT:   STW killed renamable $r3, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r3 = LIS 457
   ; 32BIT-NEXT:   renamable $r3 = ORI killed renamable $r3, 50048
-  ; 32BIT-NEXT:   STW killed renamable $r3, 68, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 68, $r1 :: (store (s32) into stack + 68, basealign 16)
   ; 32BIT-NEXT:   BL_NOP <mcsymbol .mix_callee>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $r2, implicit-def $r1, implicit-def dead $r3
   ; 32BIT-NEXT:   ADJCALLSTACKUP 84, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
@@ -1732,8 +1744,8 @@ define void @caller_mix() {
   ; 64BIT-NEXT:   $x7 = LI8 1
   ; 64BIT-NEXT:   $x8 = LI8 2
   ; 64BIT-NEXT:   $x10 = LI8 40
-  ; 64BIT-NEXT:   STD killed renamable $x4, 120, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x5, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x4, 120, $x1 :: (store (s64) into stack + 120, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x5, 112, $x1 :: (store (s64) into stack + 112, align 16)
   ; 64BIT-NEXT:   BL8_NOP <mcsymbol .mix_callee>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit killed $x7, implicit killed $x8, implicit $x9, implicit killed $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3
   ; 64BIT-NEXT:   ADJCALLSTACKUP 128, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
@@ -1855,60 +1867,60 @@ define void @caller_mix() {
   ; 32BIT-NEXT:   renamable $r8 = LIS 16329
   ; 32BIT-NEXT:   renamable $r9 = LIS 13107
   ; 32BIT-NEXT:   renamable $r10 = LIS 16339
-  ; 32BIT-NEXT:   STW renamable $r3, 92, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW renamable $r3, 92, $r1 :: (store (s32) into stack + 92, basealign 16)
   ; 32BIT-NEXT:   renamable $r11 = LIS 16345
-  ; 32BIT-NEXT:   STW killed renamable $r4, 88, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 88, $r1 :: (store (s32) into stack + 88, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = LIS 16355
-  ; 32BIT-NEXT:   STW killed renamable $r3, 132, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 132, $r1 :: (store (s32) into stack + 132, basealign 16)
   ; 32BIT-NEXT:   renamable $r3 = LIS 26214
-  ; 32BIT-NEXT:   STW killed renamable $r5, 128, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r5, 128, $r1 :: (store (s32) into stack + 128, align 16)
   ; 32BIT-NEXT:   renamable $r5 = ORI killed renamable $r6, 39322
-  ; 32BIT-NEXT:   STW renamable $r5, 60, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW renamable $r5, 60, $r1 :: (store (s32) into stack + 60, basealign 16)
   ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r7, 39321
-  ; 32BIT-NEXT:   STW killed renamable $r6, 56, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r6 = LIS 16358
-  ; 32BIT-NEXT:   STW renamable $r5, 68, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW renamable $r5, 68, $r1 :: (store (s32) into stack + 68, basealign 16)
   ; 32BIT-NEXT:   renamable $r7 = ORI killed renamable $r8, 39321
-  ; 32BIT-NEXT:   STW killed renamable $r7, 64, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r7, 64, $r1 :: (store (s32) into stack + 64, align 16)
   ; 32BIT-NEXT:   renamable $r7 = ORI killed renamable $r9, 13107
-  ; 32BIT-NEXT:   STW renamable $r7, 76, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW renamable $r7, 76, $r1 :: (store (s32) into stack + 76, basealign 16)
   ; 32BIT-NEXT:   renamable $r8 = ORI killed renamable $r10, 13107
-  ; 32BIT-NEXT:   STW killed renamable $r8, 72, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r8, 72, $r1 :: (store (s32) into stack + 72, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r8 = LIS 16361
-  ; 32BIT-NEXT:   STW renamable $r5, 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW renamable $r5, 84, $r1 :: (store (s32) into stack + 84, basealign 16)
   ; 32BIT-NEXT:   renamable $r9 = ORI killed renamable $r11, 39321
-  ; 32BIT-NEXT:   STW killed renamable $r9, 80, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r9, 80, $r1 :: (store (s32) into stack + 80, align 16)
   ; 32BIT-NEXT:   renamable $r9 = LIS 52428
-  ; 32BIT-NEXT:   STW renamable $r7, 100, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW renamable $r7, 100, $r1 :: (store (s32) into stack + 100, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 13107
-  ; 32BIT-NEXT:   STW killed renamable $r4, 96, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 96, $r1 :: (store (s32) into stack + 96, align 16)
   ; 32BIT-NEXT:   renamable $r3 = ORI killed renamable $r3, 26214
-  ; 32BIT-NEXT:   STW renamable $r3, 108, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW renamable $r3, 108, $r1 :: (store (s32) into stack + 108, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r6, 26214
-  ; 32BIT-NEXT:   STW killed renamable $r4, 104, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 104, $r1 :: (store (s32) into stack + 104, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = LIS 16364
-  ; 32BIT-NEXT:   STW renamable $r5, 116, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW renamable $r5, 116, $r1 :: (store (s32) into stack + 116, basealign 16)
   ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r8, 39321
-  ; 32BIT-NEXT:   STW killed renamable $r6, 112, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 112, $r1 :: (store (s32) into stack + 112, align 16)
   ; 32BIT-NEXT:   renamable $r6 = ORI killed renamable $r9, 52429
-  ; 32BIT-NEXT:   STW renamable $r6, 124, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW renamable $r6, 124, $r1 :: (store (s32) into stack + 124, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 52428
-  ; 32BIT-NEXT:   STW killed renamable $r4, 120, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 120, $r1 :: (store (s32) into stack + 120, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = LIS 16369
-  ; 32BIT-NEXT:   STW killed renamable $r5, 140, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r5, 140, $r1 :: (store (s32) into stack + 140, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 39321
-  ; 32BIT-NEXT:   STW killed renamable $r4, 136, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 136, $r1 :: (store (s32) into stack + 136, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = LIS 16371
-  ; 32BIT-NEXT:   STW killed renamable $r7, 148, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r7, 148, $r1 :: (store (s32) into stack + 148, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 13107
-  ; 32BIT-NEXT:   STW killed renamable $r4, 144, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 144, $r1 :: (store (s32) into stack + 144, align 16)
   ; 32BIT-NEXT:   renamable $r4 = LIS 16372
   ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.0, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   STW killed renamable $r6, 156, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 156, $r1 :: (store (s32) into stack + 156, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = ORI killed renamable $r4, 52428
-  ; 32BIT-NEXT:   STW killed renamable $r4, 152, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 152, $r1 :: (store (s32) into stack + 152, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r4 = LWZtoc %const.1, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   STW killed renamable $r3, 164, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 164, $r1 :: (store (s32) into stack + 164, basealign 16)
   ; 32BIT-NEXT:   renamable $r3 = LWZtoc %const.2, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   renamable $f1 = LFD 0, killed renamable $r5 :: (load (s64) from constant-pool)
   ; 32BIT-NEXT:   renamable $r5 = LWZtoc %const.3, $r2 :: (load (s32) from got)
@@ -1943,7 +1955,7 @@ define void @caller_mix() {
   ; 32BIT-NEXT:   $r8 = LI 6
   ; 32BIT-NEXT:   $r9 = LI 7
   ; 32BIT-NEXT:   $r10 = LI 8
-  ; 32BIT-NEXT:   STW killed renamable $r11, 160, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r11, 160, $r1 :: (store (s32) into stack + 160, align 16)
   ; 32BIT-NEXT:   BL_NOP <mcsymbol .mix_floats>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit $f10, implicit $f11, implicit $f12, implicit $f13, implicit $r2, implicit-def $r1, implicit-def dead $r3
   ; 32BIT-NEXT:   ADJCALLSTACKUP 168, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
@@ -2044,20 +2056,20 @@ define void @caller_mix() {
   ; 64BIT-NEXT:   $x8 = LI8 6
   ; 64BIT-NEXT:   $x9 = LI8 7
   ; 64BIT-NEXT:   $x10 = LI8 8
-  ; 64BIT-NEXT:   STD killed renamable $x29, 184, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x28, 144, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x11, 216, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x12, 200, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x0, 160, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x31, 152, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x30, 128, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x27, 208, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x25, 192, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x24, 176, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x23, 168, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x22, 136, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x21, 120, $x1 :: (store (s64))
-  ; 64BIT-NEXT:   STD killed renamable $x20, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x29, 184, $x1 :: (store (s64) into stack + 184, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x28, 144, $x1 :: (store (s64) into stack + 144, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x11, 216, $x1 :: (store (s64) into stack + 216, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x12, 200, $x1 :: (store (s64) into stack + 200, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x0, 160, $x1 :: (store (s64) into stack + 160, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x31, 152, $x1 :: (store (s64) into stack + 152, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x30, 128, $x1 :: (store (s64) into stack + 128, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x27, 208, $x1 :: (store (s64) into stack + 208, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x25, 192, $x1 :: (store (s64) into stack + 192, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x24, 176, $x1 :: (store (s64) into stack + 176, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x23, 168, $x1 :: (store (s64) into stack + 168, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x22, 136, $x1 :: (store (s64) into stack + 136, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x21, 120, $x1 :: (store (s64) into stack + 120, basealign 16)
+  ; 64BIT-NEXT:   STD killed renamable $x20, 112, $x1 :: (store (s64) into stack + 112, align 16)
   ; 64BIT-NEXT:   BL8_NOP <mcsymbol .mix_floats>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $f1, implicit $f2, implicit $f3, implicit $f4, implicit $f5, implicit $f6, implicit $f7, implicit $f8, implicit $f9, implicit $f10, implicit $f11, implicit $f12, implicit $f13, implicit $x2, implicit-def $r1, implicit-def dead $x3
   ; 64BIT-NEXT:   ADJCALLSTACKUP 224, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
index 8f33f5ef863e..03770d22d9f4 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll
@@ -1012,18 +1012,22 @@ define void @call_test_stackarg_float() {
 ; ASM32PWR4-NEXT:    lwz 3, L..C8(2) # @f
 ; ASM32PWR4-NEXT:    stw 0, 88(1)
 ; ASM32PWR4-NEXT:    li 4, 2
-; ASM32PWR4-NEXT:    li 5, 3
 ; ASM32PWR4-NEXT:    li 6, 4
 ; ASM32PWR4-NEXT:    li 7, 5
+; ASM32PWR4-NEXT:    li 8, 6
 ; ASM32PWR4-NEXT:    lfs 1, 0(3)
 ; ASM32PWR4-NEXT:    lwz 3, L..C9(2) # @d
-; ASM32PWR4-NEXT:    li 8, 6
 ; ASM32PWR4-NEXT:    li 9, 7
+; ASM32PWR4-NEXT:    li 10, 8
 ; ASM32PWR4-NEXT:    lfd 2, 0(3)
 ; ASM32PWR4-NEXT:    li 3, 1
-; ASM32PWR4-NEXT:    li 10, 8
-; ASM32PWR4-NEXT:    stfd 2, 60(1)
+; ASM32PWR4-NEXT:    stfd 2, 72(1)
+; ASM32PWR4-NEXT:    lwz 5, 76(1)
+; ASM32PWR4-NEXT:    lwz 11, 72(1)
+; ASM32PWR4-NEXT:    stw 5, 64(1)
+; ASM32PWR4-NEXT:    li 5, 3
 ; ASM32PWR4-NEXT:    stfs 1, 56(1)
+; ASM32PWR4-NEXT:    stw 11, 60(1)
 ; ASM32PWR4-NEXT:    bl .test_stackarg_float[PR]
 ; ASM32PWR4-NEXT:    nop
 ; ASM32PWR4-NEXT:    addi 1, 1, 80
@@ -1126,20 +1130,24 @@ define void @call_test_stackarg_float3() {
 ; ASM32PWR4-NEXT:    stwu 1, -80(1)
 ; ASM32PWR4-NEXT:    lwz 3, L..C9(2) # @d
 ; ASM32PWR4-NEXT:    stw 0, 88(1)
-; ASM32PWR4-NEXT:    li 4, 2
 ; ASM32PWR4-NEXT:    li 5, 3
 ; ASM32PWR4-NEXT:    li 6, 4
 ; ASM32PWR4-NEXT:    li 7, 5
+; ASM32PWR4-NEXT:    li 8, 6
 ; ASM32PWR4-NEXT:    lfd 1, 0(3)
 ; ASM32PWR4-NEXT:    lwz 3, L..C8(2) # @f
-; ASM32PWR4-NEXT:    li 8, 6
 ; ASM32PWR4-NEXT:    li 9, 7
 ; ASM32PWR4-NEXT:    stfd 1, 72(1)
-; ASM32PWR4-NEXT:    lwz 10, 72(1)
 ; ASM32PWR4-NEXT:    lfs 2, 0(3)
 ; ASM32PWR4-NEXT:    li 3, 1
+; ASM32PWR4-NEXT:    stfd 1, 64(1)
+; ASM32PWR4-NEXT:    lwz 4, 68(1)
+; ASM32PWR4-NEXT:    lwz 10, 72(1)
+; ASM32PWR4-NEXT:    lwz 11, 64(1)
+; ASM32PWR4-NEXT:    stw 4, 56(1)
+; ASM32PWR4-NEXT:    li 4, 2
 ; ASM32PWR4-NEXT:    stfs 2, 60(1)
-; ASM32PWR4-NEXT:    stfd 1, 52(1)
+; ASM32PWR4-NEXT:    stw 11, 52(1)
 ; ASM32PWR4-NEXT:    bl .test_stackarg_float3[PR]
 ; ASM32PWR4-NEXT:    nop
 ; ASM32PWR4-NEXT:    addi 1, 1, 80
@@ -1562,95 +1570,99 @@ define void @caller_fpr_stack() {
 ; ASM32PWR4-LABEL: caller_fpr_stack:
 ; ASM32PWR4:       # %bb.0: # %entry
 ; ASM32PWR4-NEXT:    mflr 0
-; ASM32PWR4-NEXT:    stwu 1, -144(1)
+; ASM32PWR4-NEXT:    stwu 1, -160(1)
 ; ASM32PWR4-NEXT:    lwz 3, L..C19(2) # @d15
-; ASM32PWR4-NEXT:    lwz 4, L..C20(2) # @f14
-; ASM32PWR4-NEXT:    lwz 5, L..C21(2) # @f16
-; ASM32PWR4-NEXT:    stw 0, 152(1)
-; ASM32PWR4-NEXT:    lis 6, 16361
-; ASM32PWR4-NEXT:    ori 6, 6, 39321
+; ASM32PWR4-NEXT:    stw 0, 168(1)
+; ASM32PWR4-NEXT:    lwz 5, L..C20(2) # %const.1
+; ASM32PWR4-NEXT:    lwz 4, L..C21(2) # @f14
 ; ASM32PWR4-NEXT:    lfd 0, 0(3)
-; ASM32PWR4-NEXT:    lwz 3, 0(4)
-; ASM32PWR4-NEXT:    lwz 4, 0(5)
-; ASM32PWR4-NEXT:    li 5, 0
-; ASM32PWR4-NEXT:    stw 5, 60(1)
-; ASM32PWR4-NEXT:    lis 5, 16352
-; ASM32PWR4-NEXT:    stw 5, 56(1)
-; ASM32PWR4-NEXT:    lis 5, 13107
-; ASM32PWR4-NEXT:    ori 5, 5, 13107
-; ASM32PWR4-NEXT:    stw 5, 68(1)
-; ASM32PWR4-NEXT:    lis 5, 16355
-; ASM32PWR4-NEXT:    ori 5, 5, 13107
-; ASM32PWR4-NEXT:    stw 5, 64(1)
-; ASM32PWR4-NEXT:    lis 5, 26214
-; ASM32PWR4-NEXT:    ori 5, 5, 26214
-; ASM32PWR4-NEXT:    stw 5, 76(1)
-; ASM32PWR4-NEXT:    lis 5, 16358
-; ASM32PWR4-NEXT:    ori 5, 5, 26214
-; ASM32PWR4-NEXT:    stw 5, 72(1)
-; ASM32PWR4-NEXT:    lis 5, -26215
-; ASM32PWR4-NEXT:    ori 5, 5, 39322
-; ASM32PWR4-NEXT:    stw 5, 84(1)
-; ASM32PWR4-NEXT:    stw 5, 100(1)
-; ASM32PWR4-NEXT:    lis 5, 16313
-; ASM32PWR4-NEXT:    ori 5, 5, 39321
-; ASM32PWR4-NEXT:    stw 5, 96(1)
-; ASM32PWR4-NEXT:    lis 5, -15729
-; ASM32PWR4-NEXT:    ori 5, 5, 23593
-; ASM32PWR4-NEXT:    stw 5, 108(1)
-; ASM32PWR4-NEXT:    lis 5, 16316
-; ASM32PWR4-NEXT:    ori 5, 5, 10485
-; ASM32PWR4-NEXT:    stw 5, 104(1)
-; ASM32PWR4-NEXT:    lis 5, -5243
-; ASM32PWR4-NEXT:    ori 5, 5, 7864
-; ASM32PWR4-NEXT:    stw 5, 116(1)
-; ASM32PWR4-NEXT:    lis 5, 16318
-; ASM32PWR4-NEXT:    ori 5, 5, 47185
-; ASM32PWR4-NEXT:    stw 6, 80(1)
-; ASM32PWR4-NEXT:    lis 6, -13108
-; ASM32PWR4-NEXT:    ori 6, 6, 52429
-; ASM32PWR4-NEXT:    stw 5, 112(1)
-; ASM32PWR4-NEXT:    lis 5, 2621
-; ASM32PWR4-NEXT:    ori 5, 5, 28836
-; ASM32PWR4-NEXT:    stw 6, 92(1)
-; ASM32PWR4-NEXT:    lis 6, 16364
-; ASM32PWR4-NEXT:    ori 6, 6, 52428
-; ASM32PWR4-NEXT:    stw 5, 124(1)
-; ASM32PWR4-NEXT:    lis 5, 16320
-; ASM32PWR4-NEXT:    ori 5, 5, 41943
-; ASM32PWR4-NEXT:    stw 6, 88(1)
-; ASM32PWR4-NEXT:    lwz 6, L..C22(2) # %const.0
-; ASM32PWR4-NEXT:    stw 5, 120(1)
-; ASM32PWR4-NEXT:    lwz 5, L..C23(2) # %const.1
-; ASM32PWR4-NEXT:    lfd 2, 0(6)
-; ASM32PWR4-NEXT:    lwz 6, L..C24(2) # %const.2
+; ASM32PWR4-NEXT:    lwz 3, L..C22(2) # @f16
+; ASM32PWR4-NEXT:    lwz 3, 0(3)
+; ASM32PWR4-NEXT:    stw 3, 140(1)
+; ASM32PWR4-NEXT:    li 3, 0
+; ASM32PWR4-NEXT:    stw 3, 60(1)
+; ASM32PWR4-NEXT:    lis 3, 16352
+; ASM32PWR4-NEXT:    stw 3, 56(1)
+; ASM32PWR4-NEXT:    lis 3, 13107
+; ASM32PWR4-NEXT:    ori 3, 3, 13107
+; ASM32PWR4-NEXT:    stw 3, 68(1)
+; ASM32PWR4-NEXT:    lis 3, 16355
+; ASM32PWR4-NEXT:    ori 3, 3, 13107
+; ASM32PWR4-NEXT:    stw 3, 64(1)
+; ASM32PWR4-NEXT:    lis 3, 26214
+; ASM32PWR4-NEXT:    ori 3, 3, 26214
+; ASM32PWR4-NEXT:    stw 3, 76(1)
+; ASM32PWR4-NEXT:    lis 3, 16358
+; ASM32PWR4-NEXT:    ori 3, 3, 26214
+; ASM32PWR4-NEXT:    stw 3, 72(1)
+; ASM32PWR4-NEXT:    lis 3, -26215
+; ASM32PWR4-NEXT:    ori 3, 3, 39322
+; ASM32PWR4-NEXT:    stw 3, 84(1)
+; ASM32PWR4-NEXT:    stw 3, 100(1)
+; ASM32PWR4-NEXT:    lis 3, 16313
+; ASM32PWR4-NEXT:    ori 3, 3, 39321
+; ASM32PWR4-NEXT:    stw 3, 96(1)
+; ASM32PWR4-NEXT:    lis 3, -15729
+; ASM32PWR4-NEXT:    ori 3, 3, 23593
+; ASM32PWR4-NEXT:    stw 3, 108(1)
+; ASM32PWR4-NEXT:    lis 3, 16316
+; ASM32PWR4-NEXT:    ori 3, 3, 10485
+; ASM32PWR4-NEXT:    stw 3, 104(1)
+; ASM32PWR4-NEXT:    lis 3, -5243
+; ASM32PWR4-NEXT:    ori 3, 3, 7864
+; ASM32PWR4-NEXT:    stw 3, 116(1)
+; ASM32PWR4-NEXT:    lis 3, 16318
+; ASM32PWR4-NEXT:    ori 3, 3, 47185
+; ASM32PWR4-NEXT:    stw 3, 112(1)
+; ASM32PWR4-NEXT:    lis 3, 2621
+; ASM32PWR4-NEXT:    ori 3, 3, 28836
+; ASM32PWR4-NEXT:    stw 3, 124(1)
+; ASM32PWR4-NEXT:    lis 3, 16320
+; ASM32PWR4-NEXT:    ori 3, 3, 41943
+; ASM32PWR4-NEXT:    stw 3, 120(1)
+; ASM32PWR4-NEXT:    lwz 3, L..C23(2) # %const.0
+; ASM32PWR4-NEXT:    lfd 2, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C24(2) # %const.2
 ; ASM32PWR4-NEXT:    lfd 3, 0(5)
 ; ASM32PWR4-NEXT:    lwz 5, L..C25(2) # %const.3
-; ASM32PWR4-NEXT:    lfd 4, 0(6)
-; ASM32PWR4-NEXT:    lwz 6, L..C26(2) # %const.4
+; ASM32PWR4-NEXT:    lfd 4, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C26(2) # %const.4
 ; ASM32PWR4-NEXT:    lfd 6, 0(5)
 ; ASM32PWR4-NEXT:    lwz 5, L..C27(2) # %const.5
-; ASM32PWR4-NEXT:    lfd 7, 0(6)
-; ASM32PWR4-NEXT:    lwz 6, L..C28(2) # %const.6
+; ASM32PWR4-NEXT:    lwz 4, 0(4)
+; ASM32PWR4-NEXT:    lfd 7, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C28(2) # %const.6
 ; ASM32PWR4-NEXT:    lfd 8, 0(5)
 ; ASM32PWR4-NEXT:    lwz 5, L..C29(2) # %const.7
-; ASM32PWR4-NEXT:    lfd 9, 0(6)
-; ASM32PWR4-NEXT:    lwz 6, L..C30(2) # %const.8
+; ASM32PWR4-NEXT:    stw 4, 128(1)
+; ASM32PWR4-NEXT:    lis 4, 16361
+; ASM32PWR4-NEXT:    ori 4, 4, 39321
+; ASM32PWR4-NEXT:    lfd 9, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C30(2) # %const.8
 ; ASM32PWR4-NEXT:    lfd 1, 0(5)
 ; ASM32PWR4-NEXT:    lwz 5, L..C31(2) # %const.9
-; ASM32PWR4-NEXT:    lfd 11, 0(6)
-; ASM32PWR4-NEXT:    lwz 6, L..C32(2) # %const.10
+; ASM32PWR4-NEXT:    stw 4, 80(1)
+; ASM32PWR4-NEXT:    lis 4, -13108
 ; ASM32PWR4-NEXT:    fmr 10, 1
+; ASM32PWR4-NEXT:    ori 4, 4, 52429
+; ASM32PWR4-NEXT:    lfd 11, 0(3)
+; ASM32PWR4-NEXT:    lwz 3, L..C32(2) # %const.10
 ; ASM32PWR4-NEXT:    lfd 12, 0(5)
 ; ASM32PWR4-NEXT:    lwz 5, L..C33(2) # %const.11
-; ASM32PWR4-NEXT:    lfd 13, 0(6)
+; ASM32PWR4-NEXT:    stw 4, 92(1)
+; ASM32PWR4-NEXT:    lis 4, 16364
+; ASM32PWR4-NEXT:    ori 4, 4, 52428
+; ASM32PWR4-NEXT:    stfd 0, 152(1)
+; ASM32PWR4-NEXT:    stw 4, 88(1)
+; ASM32PWR4-NEXT:    lwz 4, 156(1)
+; ASM32PWR4-NEXT:    lfd 13, 0(3)
 ; ASM32PWR4-NEXT:    lfs 5, 0(5)
-; ASM32PWR4-NEXT:    stfd 0, 132(1)
-; ASM32PWR4-NEXT:    stw 4, 140(1)
-; ASM32PWR4-NEXT:    stw 3, 128(1)
+; ASM32PWR4-NEXT:    lwz 3, 152(1)
+; ASM32PWR4-NEXT:    stw 4, 136(1)
+; ASM32PWR4-NEXT:    stw 3, 132(1)
 ; ASM32PWR4-NEXT:    bl .test_fpr_stack
 ; ASM32PWR4-NEXT:    nop
-; ASM32PWR4-NEXT:    addi 1, 1, 144
+; ASM32PWR4-NEXT:    addi 1, 1, 160
 ; ASM32PWR4-NEXT:    lwz 0, 8(1)
 ; ASM32PWR4-NEXT:    mtlr 0
 ; ASM32PWR4-NEXT:    blr
@@ -1667,7 +1679,6 @@ define void @caller_fpr_stack() {
 ; ASM64PWR4-NEXT:    lis 7, 16313
 ; ASM64PWR4-NEXT:    lwz 3, 0(3)
 ; ASM64PWR4-NEXT:    ld 4, 0(4)
-; ASM64PWR4-NEXT:    lwz 5, 0(5)
 ; ASM64PWR4-NEXT:    stw 3, 152(1)
 ; ASM64PWR4-NEXT:    ld 3, L..C22(2) # %const.0
 ; ASM64PWR4-NEXT:    std 4, 160(1)
@@ -1686,6 +1697,7 @@ define void @caller_fpr_stack() {
 ; ASM64PWR4-NEXT:    ld 4, L..C29(2) # %const.7
 ; ASM64PWR4-NEXT:    lfd 9, 0(3)
 ; ASM64PWR4-NEXT:    ld 3, L..C30(2) # %const.8
+; ASM64PWR4-NEXT:    lwz 5, 0(5)
 ; ASM64PWR4-NEXT:    lfd 1, 0(4)
 ; ASM64PWR4-NEXT:    lis 4, 16320
 ; ASM64PWR4-NEXT:    ori 4, 4, 41943
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll
index 67800df6ed4b..95fed680e696 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll
@@ -134,7 +134,7 @@ define void @call_test_byval_3Byte() {
   ; 32BIT-NEXT:   ADJCALLSTACKDOWN 60, 0, implicit-def dead $r1, implicit $r1
   ; 32BIT-NEXT:   renamable $r3 = LI 42
   ; 32BIT-NEXT:   renamable $r4 = LWZtoc @gS3, $r2 :: (load (s32) from got)
-  ; 32BIT-NEXT:   STW killed renamable $r3, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   STW killed renamable $r3, 56, $r1 :: (store (s32) into stack + 56, align 8, basealign 16)
   ; 32BIT-NEXT:   renamable $r3 = LBZ 2, renamable $r4 :: (load (s8))
   ; 32BIT-NEXT:   renamable $r4 = LHZ 0, killed renamable $r4 :: (load (s16))
   ; 32BIT-NEXT:   renamable $r10 = RLWINM killed renamable $r3, 8, 16, 23
@@ -155,7 +155,7 @@ define void @call_test_byval_3Byte() {
   ; 64BIT-NEXT:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
   ; 64BIT-NEXT:   renamable $x3 = LI8 42
   ; 64BIT-NEXT:   renamable $x4 = LDtoc @gS3, $x2 :: (load (s64) from got)
-  ; 64BIT-NEXT:   STD killed renamable $x3, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed renamable $x3, 112, $x1 :: (store (s64) into stack + 112, align 16)
   ; 64BIT-NEXT:   renamable $x3 = LBZ8 2, renamable $x4 :: (load (s8))
   ; 64BIT-NEXT:   renamable $x4 = LHZ8 0, killed renamable $x4 :: (load (s16))
   ; 64BIT-NEXT:   renamable $x10 = RLDIC killed renamable $x3, 40, 16
diff --git a/llvm/test/CodeGen/PowerPC/aix-vec-arg-spills-mir.ll b/llvm/test/CodeGen/PowerPC/aix-vec-arg-spills-mir.ll
index 7c45958a1c2f..7ee854c2ae22 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vec-arg-spills-mir.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vec-arg-spills-mir.ll
@@ -34,17 +34,17 @@ define double @caller() {
   ; MIR32-NEXT:   STXVW4X renamable $vsl0, $r1, killed renamable $r3 :: (store (s128), align 8)
   ; MIR32-NEXT:   renamable $r3 = LI 104
   ; MIR32-NEXT:   STXVW4X renamable $vsl0, $r1, killed renamable $r4 :: (store (s128), align 8)
-  ; MIR32-NEXT:   renamable $r4 = LI 88
   ; MIR32-NEXT:   STXVW4X renamable $vsl0, $r1, killed renamable $r3 :: (store (s128), align 8)
-  ; MIR32-NEXT:   STXVW4X renamable $vsl0, $r1, killed renamable $r4 :: (store (s128), align 8)
-  ; MIR32-NEXT:   renamable $r3 = LI 72
-  ; MIR32-NEXT:   renamable $r4 = LWZtoc %const.0, $r2 :: (load (s32) from got)
-  ; MIR32-NEXT:   STXVW4X killed renamable $vsl0, $r1, killed renamable $r3 :: (store (s128), align 8)
+  ; MIR32-NEXT:   renamable $r3 = LI 88
+  ; MIR32-NEXT:   renamable $r4 = LI 72
+  ; MIR32-NEXT:   STXVW4X renamable $vsl0, $r1, killed renamable $r3 :: (store (s128), align 8)
   ; MIR32-NEXT:   renamable $r3 = LI 48
-  ; MIR32-NEXT:   renamable $vsl0 = LXVD2X $zero, killed renamable $r4 :: (load (s128) from constant-pool)
+  ; MIR32-NEXT:   STXVW4X killed renamable $vsl0, $r1, killed renamable $r4 :: (store (s128), align 8)
   ; MIR32-NEXT:   renamable $r4 = LI 512
-  ; MIR32-NEXT:   STXVD2X killed renamable $vsl0, $r1, killed renamable $r3 :: (store (s128))
-  ; MIR32-NEXT:   STW killed renamable $r4, 152, $r1 :: (store (s32))
+  ; MIR32-NEXT:   STW killed renamable $r4, 152, $r1 :: (store (s32) into stack + 152, align 8, basealign 16)
+  ; MIR32-NEXT:   renamable $r4 = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; MIR32-NEXT:   renamable $vsl0 = LXVD2X $zero, killed renamable $r4 :: (load (s128) from constant-pool)
+  ; MIR32-NEXT:   STXVD2X killed renamable $vsl0, $r1, killed renamable $r3 :: (store (s128) into stack + 48)
   ; MIR32-NEXT:   $f1 = XXLXORdpz
   ; MIR32-NEXT:   $f2 = XXLXORdpz
   ; MIR32-NEXT:   $v2 = XXLXORz
@@ -92,18 +92,18 @@ define double @caller() {
   ; MIR64-NEXT:   ADJCALLSTACKDOWN 224, 0, implicit-def dead $r1, implicit $r1
   ; MIR64-NEXT:   renamable $vsl0 = XXLXORz
   ; MIR64-NEXT:   renamable $x3 = LI8 160
-  ; MIR64-NEXT:   STXVW4X renamable $vsl0, $x1, killed renamable $x3 :: (store (s128), align 8)
+  ; MIR64-NEXT:   STXVW4X renamable $vsl0, $x1, killed renamable $x3 :: (store (s128))
   ; MIR64-NEXT:   renamable $x3 = LI8 144
-  ; MIR64-NEXT:   STXVW4X renamable $vsl0, $x1, killed renamable $x3 :: (store (s128), align 8)
+  ; MIR64-NEXT:   STXVW4X renamable $vsl0, $x1, killed renamable $x3 :: (store (s128))
   ; MIR64-NEXT:   renamable $x3 = LI8 128
-  ; MIR64-NEXT:   STXVW4X killed renamable $vsl0, $x1, killed renamable $x3 :: (store (s128), align 8)
-  ; MIR64-NEXT:   renamable $x3 = LDtocCPT %const.0, $x2 :: (load (s64) from got)
-  ; MIR64-NEXT:   renamable $vsl0 = LXVD2X $zero8, killed renamable $x3 :: (load (s128) from constant-pool)
-  ; MIR64-NEXT:   renamable $x3 = LI8 80
-  ; MIR64-NEXT:   STXVD2X killed renamable $vsl0, $x1, killed renamable $x3 :: (store (s128))
+  ; MIR64-NEXT:   STXVW4X killed renamable $vsl0, $x1, killed renamable $x3 :: (store (s128))
   ; MIR64-NEXT:   renamable $x3 = LI8 512
-  ; MIR64-NEXT:   STD killed renamable $x3, 184, $x1 :: (store (s64))
-  ; MIR64-NEXT:   STD killed renamable $x4, 176, $x1 :: (store (s64))
+  ; MIR64-NEXT:   STD killed renamable $x3, 184, $x1 :: (store (s64) into stack + 184, basealign 16)
+  ; MIR64-NEXT:   renamable $x3 = LI8 80
+  ; MIR64-NEXT:   STD killed renamable $x4, 176, $x1 :: (store (s64) into stack + 176, align 16)
+  ; MIR64-NEXT:   renamable $x4 = LDtocCPT %const.0, $x2 :: (load (s64) from got)
+  ; MIR64-NEXT:   renamable $vsl0 = LXVD2X $zero8, killed renamable $x4 :: (load (s128) from constant-pool)
+  ; MIR64-NEXT:   STXVD2X killed renamable $vsl0, $x1, killed renamable $x3 :: (store (s128) into stack + 80)
   ; MIR64-NEXT:   $f1 = XXLXORdpz
   ; MIR64-NEXT:   $f2 = XXLXORdpz
   ; MIR64-NEXT:   $v2 = XXLXORz
diff --git a/llvm/test/CodeGen/PowerPC/aix-vec-arg-spills.ll b/llvm/test/CodeGen/PowerPC/aix-vec-arg-spills.ll
index 66f88b4e3d5a..294f074807f1 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vec-arg-spills.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vec-arg-spills.ll
@@ -25,42 +25,42 @@ define double @caller() {
 ; 32BIT-NEXT:    stw 3, 184(1)
 ; 32BIT-NEXT:    stw 3, 176(1)
 ; 32BIT-NEXT:    stw 4, 172(1)
-; 32BIT-NEXT:    lis 4, 16368
 ; 32BIT-NEXT:    stw 3, 168(1)
+; 32BIT-NEXT:    lis 4, 16368
 ; 32BIT-NEXT:    stw 3, 160(1)
-; 32BIT-NEXT:    stw 4, 164(1)
 ; 32BIT-NEXT:    stw 3, 156(1)
 ; 32BIT-NEXT:    li 3, 136
+; 32BIT-NEXT:    stw 4, 164(1)
 ; 32BIT-NEXT:    li 4, 120
-; 32BIT-NEXT:    xxlxor 2, 2, 2
 ; 32BIT-NEXT:    stxvw4x 0, 1, 3
 ; 32BIT-NEXT:    li 3, 104
 ; 32BIT-NEXT:    stxvw4x 0, 1, 4
-; 32BIT-NEXT:    li 4, 88
+; 32BIT-NEXT:    stxvw4x 0, 1, 3
+; 32BIT-NEXT:    li 3, 88
+; 32BIT-NEXT:    li 4, 72
+; 32BIT-NEXT:    xxlxor 2, 2, 2
 ; 32BIT-NEXT:    stxvw4x 0, 1, 3
 ; 32BIT-NEXT:    stxvw4x 0, 1, 4
+; 32BIT-NEXT:    li 4, 512
+; 32BIT-NEXT:    stw 4, 152(1)
 ; 32BIT-NEXT:    lwz 4, L..C0(2) # %const.0
-; 32BIT-NEXT:    li 3, 72
-; 32BIT-NEXT:    stxvw4x 0, 1, 3
 ; 32BIT-NEXT:    li 3, 48
 ; 32BIT-NEXT:    xxlxor 34, 34, 34
 ; 32BIT-NEXT:    xxlxor 35, 35, 35
-; 32BIT-NEXT:    lxvd2x 0, 0, 4
-; 32BIT-NEXT:    li 4, 512
 ; 32BIT-NEXT:    xxlxor 36, 36, 36
 ; 32BIT-NEXT:    xxlxor 37, 37, 37
 ; 32BIT-NEXT:    xxlxor 38, 38, 38
+; 32BIT-NEXT:    lxvd2x 0, 0, 4
 ; 32BIT-NEXT:    xxlxor 39, 39, 39
 ; 32BIT-NEXT:    xxlxor 40, 40, 40
+; 32BIT-NEXT:    li 4, 256
 ; 32BIT-NEXT:    xxlxor 41, 41, 41
 ; 32BIT-NEXT:    xxlxor 42, 42, 42
-; 32BIT-NEXT:    stxvd2x 0, 1, 3
-; 32BIT-NEXT:    stw 4, 152(1)
-; 32BIT-NEXT:    li 3, 128
-; 32BIT-NEXT:    li 4, 256
 ; 32BIT-NEXT:    xxlxor 43, 43, 43
 ; 32BIT-NEXT:    xxlxor 44, 44, 44
 ; 32BIT-NEXT:    xxlxor 45, 45, 45
+; 32BIT-NEXT:    stxvd2x 0, 1, 3
+; 32BIT-NEXT:    li 3, 128
 ; 32BIT-NEXT:    xxlxor 3, 3, 3
 ; 32BIT-NEXT:    xxlxor 4, 4, 4
 ; 32BIT-NEXT:    xxlxor 5, 5, 5
@@ -114,23 +114,23 @@ define double @caller() {
 ; 64BIT-NEXT:    li 3, 128
 ; 64BIT-NEXT:    xxlxor 43, 43, 43
 ; 64BIT-NEXT:    stxvw4x 0, 1, 3
-; 64BIT-NEXT:    ld 3, L..C0(2) # %const.0
+; 64BIT-NEXT:    std 4, 176(1)
+; 64BIT-NEXT:    ld 4, L..C0(2) # %const.0
+; 64BIT-NEXT:    li 3, 512
 ; 64BIT-NEXT:    xxlxor 44, 44, 44
 ; 64BIT-NEXT:    xxlxor 45, 45, 45
-; 64BIT-NEXT:    lxvd2x 0, 0, 3
+; 64BIT-NEXT:    lxvd2x 0, 0, 4
+; 64BIT-NEXT:    std 3, 184(1)
 ; 64BIT-NEXT:    li 3, 80
+; 64BIT-NEXT:    li 4, 256
 ; 64BIT-NEXT:    xxlxor 3, 3, 3
 ; 64BIT-NEXT:    xxlxor 4, 4, 4
 ; 64BIT-NEXT:    xxlxor 5, 5, 5
 ; 64BIT-NEXT:    stxvd2x 0, 1, 3
-; 64BIT-NEXT:    li 3, 512
-; 64BIT-NEXT:    std 4, 176(1)
-; 64BIT-NEXT:    li 4, 256
+; 64BIT-NEXT:    li 3, 128
 ; 64BIT-NEXT:    xxlxor 6, 6, 6
 ; 64BIT-NEXT:    xxlxor 7, 7, 7
 ; 64BIT-NEXT:    xxlxor 8, 8, 8
-; 64BIT-NEXT:    std 3, 184(1)
-; 64BIT-NEXT:    li 3, 128
 ; 64BIT-NEXT:    xxlxor 9, 9, 9
 ; 64BIT-NEXT:    xxlxor 10, 10, 10
 ; 64BIT-NEXT:    xxlxor 11, 11, 11
diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll
index 4697a093e5d6..0ba345fb5275 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll
@@ -23,31 +23,31 @@ define <4 x i32> @caller() {
   ; 32BIT-NEXT:   [[LWZtoc2:%[0-9]+]]:gprc = LWZtoc %const.2, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc2]] :: (load (s128) from constant-pool)
   ; 32BIT-NEXT:   [[LI2:%[0-9]+]]:gprc = LI 160
-  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X2]], $r1, killed [[LI2]] :: (store (s128))
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X2]], $r1, killed [[LI2]] :: (store (s128) into stack + 160)
   ; 32BIT-NEXT:   [[LWZtoc3:%[0-9]+]]:gprc = LWZtoc %const.3, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc3]] :: (load (s128) from constant-pool)
   ; 32BIT-NEXT:   [[LI3:%[0-9]+]]:gprc = LI 144
-  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X3]], $r1, killed [[LI3]] :: (store (s128))
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X3]], $r1, killed [[LI3]] :: (store (s128) into stack + 144)
   ; 32BIT-NEXT:   [[LWZtoc4:%[0-9]+]]:gprc = LWZtoc %const.4, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc4]] :: (load (s128) from constant-pool)
   ; 32BIT-NEXT:   [[LI4:%[0-9]+]]:gprc = LI 128
-  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X4]], $r1, killed [[LI4]] :: (store (s128))
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X4]], $r1, killed [[LI4]] :: (store (s128) into stack + 128)
   ; 32BIT-NEXT:   [[LWZtoc5:%[0-9]+]]:gprc = LWZtoc %const.5, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc5]] :: (load (s128) from constant-pool)
   ; 32BIT-NEXT:   [[LI5:%[0-9]+]]:gprc = LI 112
-  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X5]], $r1, killed [[LI5]] :: (store (s128))
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X5]], $r1, killed [[LI5]] :: (store (s128) into stack + 112)
   ; 32BIT-NEXT:   [[LWZtoc6:%[0-9]+]]:gprc = LWZtoc %const.6, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc6]] :: (load (s128) from constant-pool)
   ; 32BIT-NEXT:   [[LI6:%[0-9]+]]:gprc = LI 96
-  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X6]], $r1, killed [[LI6]] :: (store (s128))
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X6]], $r1, killed [[LI6]] :: (store (s128) into stack + 96)
   ; 32BIT-NEXT:   [[LWZtoc7:%[0-9]+]]:gprc = LWZtoc %const.7, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc7]] :: (load (s128) from constant-pool)
   ; 32BIT-NEXT:   [[LI7:%[0-9]+]]:gprc = LI 80
-  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X7]], $r1, killed [[LI7]] :: (store (s128))
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X7]], $r1, killed [[LI7]] :: (store (s128) into stack + 80)
   ; 32BIT-NEXT:   [[LWZtoc8:%[0-9]+]]:gprc = LWZtoc %const.8, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc8]] :: (load (s128) from constant-pool)
   ; 32BIT-NEXT:   [[LI8:%[0-9]+]]:gprc = LI 64
-  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X8]], $r1, killed [[LI8]] :: (store (s128))
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X8]], $r1, killed [[LI8]] :: (store (s128) into stack + 64)
   ; 32BIT-NEXT:   [[LWZ:%[0-9]+]]:gprc = LWZ 52, $r1 :: (load (s32))
   ; 32BIT-NEXT:   [[LWZ1:%[0-9]+]]:gprc = LWZ 48, $r1 :: (load (s32))
   ; 32BIT-NEXT:   [[LWZ2:%[0-9]+]]:gprc = LWZ 44, $r1 :: (load (s32))
@@ -86,27 +86,27 @@ define <4 x i32> @caller() {
   ; 64BIT-NEXT:   [[LDtocCPT3:%[0-9]+]]:g8rc = LDtocCPT %const.3, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT3]] :: (load (s128) from constant-pool)
   ; 64BIT-NEXT:   [[LI8_3:%[0-9]+]]:g8rc = LI8 192
-  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X3]], $x1, killed [[LI8_3]] :: (store (s128))
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X3]], $x1, killed [[LI8_3]] :: (store (s128) into stack + 192)
   ; 64BIT-NEXT:   [[LDtocCPT4:%[0-9]+]]:g8rc = LDtocCPT %const.4, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT4]] :: (load (s128) from constant-pool)
   ; 64BIT-NEXT:   [[LI8_4:%[0-9]+]]:g8rc = LI8 176
-  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X4]], $x1, killed [[LI8_4]] :: (store (s128))
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X4]], $x1, killed [[LI8_4]] :: (store (s128) into stack + 176)
   ; 64BIT-NEXT:   [[LDtocCPT5:%[0-9]+]]:g8rc = LDtocCPT %const.5, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT5]] :: (load (s128) from constant-pool)
   ; 64BIT-NEXT:   [[LI8_5:%[0-9]+]]:g8rc = LI8 160
-  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X5]], $x1, killed [[LI8_5]] :: (store (s128))
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X5]], $x1, killed [[LI8_5]] :: (store (s128) into stack + 160)
   ; 64BIT-NEXT:   [[LDtocCPT6:%[0-9]+]]:g8rc = LDtocCPT %const.6, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT6]] :: (load (s128) from constant-pool)
   ; 64BIT-NEXT:   [[LI8_6:%[0-9]+]]:g8rc = LI8 144
-  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X6]], $x1, killed [[LI8_6]] :: (store (s128))
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X6]], $x1, killed [[LI8_6]] :: (store (s128) into stack + 144)
   ; 64BIT-NEXT:   [[LDtocCPT7:%[0-9]+]]:g8rc = LDtocCPT %const.7, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT7]] :: (load (s128) from constant-pool)
   ; 64BIT-NEXT:   [[LI8_7:%[0-9]+]]:g8rc = LI8 128
-  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X7]], $x1, killed [[LI8_7]] :: (store (s128))
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X7]], $x1, killed [[LI8_7]] :: (store (s128) into stack + 128)
   ; 64BIT-NEXT:   [[LDtocCPT8:%[0-9]+]]:g8rc = LDtocCPT %const.8, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT8]] :: (load (s128) from constant-pool)
   ; 64BIT-NEXT:   [[LI8_8:%[0-9]+]]:g8rc = LI8 112
-  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X8]], $x1, killed [[LI8_8]] :: (store (s128))
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X8]], $x1, killed [[LI8_8]] :: (store (s128) into stack + 112)
   ; 64BIT-NEXT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))
   ; 64BIT-NEXT:   [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64))
   ; 64BIT-NEXT:   [[LD2:%[0-9]+]]:g8rc = LD 88, $x1 :: (load (s64))
diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll
index fad275f58cd0..b39a94e17563 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll
@@ -16,13 +16,13 @@ define void @caller() {
   ; 32BIT-NEXT:   [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool)
   ; 32BIT-NEXT:   [[LI:%[0-9]+]]:gprc = LI 64
-  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128))
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128) into stack + 64)
   ; 32BIT-NEXT:   [[LIS:%[0-9]+]]:gprc = LIS 38314
   ; 32BIT-NEXT:   [[ORI:%[0-9]+]]:gprc = ORI killed [[LIS]], 63376
-  ; 32BIT-NEXT:   STW killed [[ORI]], 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed [[ORI]], 84, $r1 :: (store (s32) into stack + 84, basealign 16)
   ; 32BIT-NEXT:   [[LIS1:%[0-9]+]]:gprc = LIS 16389
   ; 32BIT-NEXT:   [[ORI1:%[0-9]+]]:gprc = ORI killed [[LIS1]], 48905
-  ; 32BIT-NEXT:   STW killed [[ORI1]], 80, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   STW killed [[ORI1]], 80, $r1 :: (store (s32) into stack + 80, align 16)
   ; 32BIT-NEXT:   [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got)
   ; 32BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool)
   ; 32BIT-NEXT:   [[LWZtoc2:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.2, $r2 :: (load (s32) from got)
@@ -57,7 +57,7 @@ define void @caller() {
   ; 64BIT-NEXT:   [[RLDIC:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_]], 32, 1
   ; 64BIT-NEXT:   [[ORIS8_:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC]], 38314
   ; 64BIT-NEXT:   [[ORI8_1:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_]], 63376
-  ; 64BIT-NEXT:   STD killed [[ORI8_1]], 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   STD killed [[ORI8_1]], 112, $x1 :: (store (s64) into stack + 112, align 16)
   ; 64BIT-NEXT:   [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got)
   ; 64BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool)
   ; 64BIT-NEXT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))

From c16dc63b44ae039f2ac123a8ffbc90031767d00b Mon Sep 17 00:00:00 2001
From: Mikael Holmen <mikael.holmen@ericsson.com>
Date: Wed, 18 Jun 2025 09:23:25 +0200
Subject: [PATCH 0773/1322] [OMPIRBuilder] Fix gcc -Wparentheses warning [NFC]

Without this gcc warned like
 /repo/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp:7559:68: warning: suggest parentheses around '&&' within '||' [-Wparentheses]
  7559 |         NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2) &&
       |         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~
  7560 |             "Wrong number of arguments for StaleCI when shareds are present");
       |             ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 7cbbbff511c8..ddc9c5392f92 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -7558,10 +7558,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
     // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
     const unsigned int NumStaleCIArgs = StaleCI->arg_size();
     bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
-    assert(
-        !HasShareds ||
-        NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2) &&
-            "Wrong number of arguments for StaleCI when shareds are present");
+    assert((!HasShareds ||
+            NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
+           "Wrong number of arguments for StaleCI when shareds are present");
     int SharedArgOperandNo =
         HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
 

From 669627d0c77ed8408358bc8c5973255fe28a36ea Mon Sep 17 00:00:00 2001
From: Philipp Jung <philippjung2010@live.de>
Date: Wed, 18 Jun 2025 11:02:53 +0200
Subject: [PATCH 0774/1322] Add check 'cppcoreguidelines-use-enum-class'
 (#138282)

Warn on non-class enum definitions as suggested by the Core Guidelines:
https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#Renum-class
---
 .../cppcoreguidelines/CMakeLists.txt          |  1 +
 .../CppCoreGuidelinesTidyModule.cpp           |  3 +
 .../cppcoreguidelines/UseEnumClassCheck.cpp   | 42 +++++++++++++
 .../cppcoreguidelines/UseEnumClassCheck.h     | 40 ++++++++++++
 clang-tools-extra/docs/ReleaseNotes.rst       |  6 ++
 .../cppcoreguidelines/use-enum-class.rst      | 35 +++++++++++
 .../docs/clang-tidy/checks/list.rst           |  1 +
 .../cppcoreguidelines/use-enum-class.cpp      | 62 +++++++++++++++++++
 8 files changed, 190 insertions(+)
 create mode 100644 clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.cpp
 create mode 100644 clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h
 create mode 100644 clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/use-enum-class.rst
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/use-enum-class.cpp

diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
index b023f76a2543..2fb4d7f1d734 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
@@ -33,6 +33,7 @@ add_clang_library(clangTidyCppCoreGuidelinesModule STATIC
   RvalueReferenceParamNotMovedCheck.cpp
   SlicingCheck.cpp
   SpecialMemberFunctionsCheck.cpp
+  UseEnumClassCheck.cpp
   VirtualClassDestructorCheck.cpp
 
   LINK_LIBS
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
index 4dd9b0904f07..4b3b7bf963fd 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
@@ -48,6 +48,7 @@
 #include "RvalueReferenceParamNotMovedCheck.h"
 #include "SlicingCheck.h"
 #include "SpecialMemberFunctionsCheck.h"
+#include "UseEnumClassCheck.h"
 #include "VirtualClassDestructorCheck.h"
 
 namespace clang::tidy {
@@ -131,6 +132,8 @@ public:
     CheckFactories.registerCheck<SlicingCheck>("cppcoreguidelines-slicing");
     CheckFactories.registerCheck<modernize::UseDefaultMemberInitCheck>(
         "cppcoreguidelines-use-default-member-init");
+    CheckFactories.registerCheck<UseEnumClassCheck>(
+        "cppcoreguidelines-use-enum-class");
     CheckFactories.registerCheck<misc::UnconventionalAssignOperatorCheck>(
         "cppcoreguidelines-c-copy-assignment-signature");
     CheckFactories.registerCheck<VirtualClassDestructorCheck>(
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.cpp
new file mode 100644
index 000000000000..ec7d9237afa3
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.cpp
@@ -0,0 +1,42 @@
+//===--- UseEnumClassCheck.cpp - clang-tidy -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "UseEnumClassCheck.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+
+using namespace clang::ast_matchers;
+
+namespace clang::tidy::cppcoreguidelines {
+
+UseEnumClassCheck::UseEnumClassCheck(StringRef Name, ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      IgnoreUnscopedEnumsInClasses(
+          Options.get("IgnoreUnscopedEnumsInClasses", false)) {}
+
+void UseEnumClassCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "IgnoreUnscopedEnumsInClasses",
+                IgnoreUnscopedEnumsInClasses);
+}
+
+void UseEnumClassCheck::registerMatchers(MatchFinder *Finder) {
+  auto EnumDecl =
+      IgnoreUnscopedEnumsInClasses
+          ? enumDecl(unless(isScoped()), unless(hasParent(recordDecl())))
+          : enumDecl(unless(isScoped()));
+  Finder->addMatcher(EnumDecl.bind("unscoped_enum"), this);
+}
+
+void UseEnumClassCheck::check(const MatchFinder::MatchResult &Result) {
+  const auto *UnscopedEnum = Result.Nodes.getNodeAs<EnumDecl>("unscoped_enum");
+
+  diag(UnscopedEnum->getLocation(),
+       "enum %0 is unscoped, use 'enum class' instead")
+      << UnscopedEnum;
+}
+
+} // namespace clang::tidy::cppcoreguidelines
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h
new file mode 100644
index 000000000000..dfa4b7e3fda6
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h
@@ -0,0 +1,40 @@
+//===--- UseEnumClassCheck.h - clang-tidy -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_USEENUMCLASSCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_USEENUMCLASSCHECK_H
+
+#include "../ClangTidyCheck.h"
+
+namespace clang::tidy::cppcoreguidelines {
+
+/// Finds unscoped (non-class) enum declarations and suggests using enum class
+/// instead.
+///
+/// For the user-facing documentation see:
+/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/use-enum-class.html
+class UseEnumClassCheck : public ClangTidyCheck {
+public:
+  UseEnumClassCheck(StringRef Name, ClangTidyContext *Context);
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+  bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
+    return LangOpts.CPlusPlus11;
+  }
+  std::optional<TraversalKind> getCheckTraversalKind() const override {
+    return TraversalKind::TK_IgnoreUnlessSpelledInSource;
+  }
+
+private:
+  const bool IgnoreUnscopedEnumsInClasses;
+};
+
+} // namespace clang::tidy::cppcoreguidelines
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_USEENUMCLASSCHECK_H
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 3c1ca2f92904..7c0c534dbc73 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -136,6 +136,12 @@ New checks
   Finds unintended character output from ``unsigned char`` and ``signed char``
   to an ``ostream``.
 
+- New :doc:`cppcoreguidelines-use-enum-class
+  <clang-tidy/checks/cppcoreguidelines/use-enum-class>` check.
+
+  Finds unscoped (non-class) ``enum`` declarations and suggests using
+  ``enum class`` instead.
+
 - New :doc:`portability-avoid-pragma-once
   <clang-tidy/checks/portability/avoid-pragma-once>` check.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/use-enum-class.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/use-enum-class.rst
new file mode 100644
index 000000000000..9e9f4c99dc24
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/use-enum-class.rst
@@ -0,0 +1,35 @@
+.. title:: clang-tidy - cppcoreguidelines-use-enum-class
+
+cppcoreguidelines-use-enum-class
+================================
+
+Finds unscoped (non-class) ``enum`` declarations and suggests using
+``enum class`` instead.
+
+This check implements `Enum.3
+<https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#Renum-class>`_
+from the C++ Core Guidelines."
+
+Example:
+
+.. code-block:: c++
+
+  enum E {};        // use "enum class E {};" instead
+  enum class E {};  // OK
+
+  struct S {
+      enum E {};    // use "enum class E {};" instead
+                    // OK with option IgnoreUnscopedEnumsInClasses
+  };
+
+  namespace N {
+      enum E {};    // use "enum class E {};" instead
+  }
+
+Options
+-------
+
+.. option:: IgnoreUnscopedEnumsInClasses
+
+   When `true`, ignores unscoped ``enum`` declarations in classes.
+   Default is `false`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 5a79d61b1fd7..ccb78ee45e9c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -212,6 +212,7 @@ Clang-Tidy Checks
    :doc:`cppcoreguidelines-rvalue-reference-param-not-moved <cppcoreguidelines/rvalue-reference-param-not-moved>`,
    :doc:`cppcoreguidelines-slicing <cppcoreguidelines/slicing>`,
    :doc:`cppcoreguidelines-special-member-functions <cppcoreguidelines/special-member-functions>`,
+   :doc:`cppcoreguidelines-use-enum-class <cppcoreguidelines/use-enum-class>`,
    :doc:`cppcoreguidelines-virtual-class-destructor <cppcoreguidelines/virtual-class-destructor>`, "Yes"
    :doc:`darwin-avoid-spinlock <darwin/avoid-spinlock>`,
    :doc:`darwin-dispatch-once-nonstatic <darwin/dispatch-once-nonstatic>`, "Yes"
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/use-enum-class.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/use-enum-class.cpp
new file mode 100644
index 000000000000..f53d787f80ef
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/use-enum-class.cpp
@@ -0,0 +1,62 @@
+// RUN: %check_clang_tidy -std=c++11-or-later -check-suffix=ALL,DEFAULT %s \
+// RUN: cppcoreguidelines-use-enum-class %t --
+
+// RUN: %check_clang_tidy -std=c++11-or-later -check-suffix=ALL %s \
+// RUN: cppcoreguidelines-use-enum-class %t -- \
+// RUN: -config="{CheckOptions: { \
+// RUN: cppcoreguidelines-use-enum-class.IgnoreUnscopedEnumsInClasses: true \
+// RUN: }}" --
+
+enum E {};
+// CHECK-MESSAGES-ALL: :[[@LINE-1]]:6: warning: enum 'E' is unscoped, use 'enum class' instead
+
+enum class EC {};
+
+enum struct ES {};
+
+struct S {
+  enum E {};
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: enum 'E' is unscoped, use 'enum class' instead
+  enum class EC {};
+};
+
+class C {
+  enum E {};
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: enum 'E' is unscoped, use 'enum class' instead
+  enum class EC {};
+};
+
+template<class T>
+class TC {
+  enum E {};
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: enum 'E' is unscoped, use 'enum class' instead
+  enum class EC {};
+};
+
+union U {
+  enum E {};
+  // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:8: warning: enum 'E' is unscoped, use 'enum class' instead
+  enum class EC {};
+};
+
+namespace {
+enum E {};
+// CHECK-MESSAGES-ALL: :[[@LINE-1]]:6: warning: enum 'E' is unscoped, use 'enum class' instead
+enum class EC {};
+} // namespace
+
+namespace N {
+enum E {};
+// CHECK-MESSAGES-ALL: :[[@LINE-1]]:6: warning: enum 'E' is unscoped, use 'enum class' instead
+enum class EC {};
+} // namespace N
+
+template<enum ::EC>
+static void foo();
+
+enum ForwardE : int;
+// CHECK-MESSAGES-ALL: :[[@LINE-1]]:6: warning: enum 'ForwardE' is unscoped, use 'enum class' instead
+
+enum class ForwardEC : int;
+
+enum struct ForwardES : int;

From 43e1a5a411d972fe06a1afb86ffd5ba21fd2a376 Mon Sep 17 00:00:00 2001
From: Frank Schlimbach <frank.schlimbach@intel.com>
Date: Wed, 18 Jun 2025 11:06:48 +0200
Subject: [PATCH 0775/1322] [mlir][mesh] adding option for traversal order in
 sharding propagation (#144079)

The traversal order in sharding propagation was hard-coded. This PR
provides options to the pass to select a suitable order
- forward-only
- backward-only
- forward-backward
- backward-forward

Default is the previous behavior (backward-forward).
---
 mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h   |  3 --
 .../mlir/Dialect/Mesh/Transforms/Passes.h     | 12 +++++
 .../mlir/Dialect/Mesh/Transforms/Passes.td    | 15 ++++++
 mlir/lib/Dialect/Mesh/IR/MeshOps.cpp          | 27 +++++-----
 .../Mesh/Transforms/ShardingPropagation.cpp   | 38 +++++++++-----
 .../Mesh/backward-sharding-propagation.mlir   | 26 ++++++++++
 ...forward-backward-sharding-propagation.mlir | 27 ++++++++++
 .../Mesh/forward-sharding-propagation.mlir    | 49 +++++++++++++++++++
 8 files changed, 171 insertions(+), 26 deletions(-)
 create mode 100644 mlir/test/Dialect/Mesh/backward-sharding-propagation.mlir
 create mode 100644 mlir/test/Dialect/Mesh/forward-backward-sharding-propagation.mlir
 create mode 100644 mlir/test/Dialect/Mesh/forward-sharding-propagation.mlir

diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
index 32c2eca2cefa..3878505f8f93 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
@@ -206,9 +206,6 @@ Type shardType(Type type, MeshOp mesh, MeshSharding sharding);
 // Use newShardOp if it is not null. Otherwise create a new one.
 // May insert resharding if required.
 // Potentially updates newShardOp.
-void maybeInsertTargetShardingAnnotation(MeshSharding sharding,
-                                         OpOperand &operand, OpBuilder &builder,
-                                         ShardOp &newShardOp);
 void maybeInsertTargetShardingAnnotation(MeshSharding sharding, OpResult result,
                                          OpBuilder &builder);
 void maybeInsertSourceShardingAnnotation(MeshSharding sharding,
diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Passes.h b/mlir/include/mlir/Dialect/Mesh/Transforms/Passes.h
index 83399d10beaa..a2424d43a8ba 100644
--- a/mlir/include/mlir/Dialect/Mesh/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Passes.h
@@ -19,6 +19,18 @@ class FuncOp;
 
 namespace mesh {
 
+/// This enum controls the traversal order for the sharding propagation.
+enum class TraversalOrder {
+  /// Forward traversal.
+  Forward,
+  /// Backward traversal.
+  Backward,
+  /// Forward then backward traversal.
+  ForwardBackward,
+  /// Backward then forward traversal.
+  BackwardForward
+};
+
 //===----------------------------------------------------------------------===//
 // Passes
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Passes.td b/mlir/include/mlir/Dialect/Mesh/Transforms/Passes.td
index 06ebf151e7d6..11ec7e78cd5e 100644
--- a/mlir/include/mlir/Dialect/Mesh/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Passes.td
@@ -24,6 +24,21 @@ def ShardingPropagation : InterfacePass<"sharding-propagation", "mlir::FunctionO
     operation, and the operations themselves are added with sharding option
     attributes.
   }];
+  let options = [
+    Option<"traversal", "traversal",
+           "mlir::mesh::TraversalOrder", /*default=*/"mlir::mesh::TraversalOrder::BackwardForward",
+           "Traversal order to use for sharding propagation:",
+            [{::llvm::cl::values(
+              clEnumValN(mlir::mesh::TraversalOrder::Forward, "forward",
+              "Forward only traversal."),
+              clEnumValN(mlir::mesh::TraversalOrder::Backward, "backward",
+              "backward only traversal."),
+              clEnumValN(mlir::mesh::TraversalOrder::ForwardBackward, "forward-backward",
+              "forward-backward traversal."),
+              clEnumValN(mlir::mesh::TraversalOrder::BackwardForward, "backward-forward",
+              "backward-forward traversal.")
+            )}]>,
+  ];
   let dependentDialects = [
     "mesh::MeshDialect"
   ];
diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
index 304cb55a3508..a2c2d1a7470c 100644
--- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
+++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
@@ -275,13 +275,12 @@ Type mesh::shardType(Type type, MeshOp mesh, MeshSharding sharding) {
   return type;
 }
 
-void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshSharding sharding,
-                                                     OpOperand &operand,
-                                                     OpBuilder &builder,
-                                                     ShardOp &newShardOp) {
+static void maybeInsertTargetShardingAnnotationImpl(MeshSharding sharding,
+                                                    Value &operandValue,
+                                                    Operation *operandOp,
+                                                    OpBuilder &builder,
+                                                    ShardOp &newShardOp) {
   OpBuilder::InsertionGuard insertionGuard(builder);
-  Value operandValue = operand.get();
-  Operation *operandOp = operand.getOwner();
   builder.setInsertionPointAfterValue(operandValue);
   ShardOp shardOp = dyn_cast<ShardOp>(operandOp);
   if (shardOp && sharding == shardOp.getSharding() &&
@@ -300,9 +299,8 @@ void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshSharding sharding,
         builder.create<ShardOp>(operandValue.getLoc(), operandValue, shardingOp,
                                 /*annotate_for_users*/ false);
   }
-  IRRewriter rewriter(builder);
-  rewriter.replaceUsesWithIf(
-      operandValue, newShardOp, [operandOp, operandValue](OpOperand &use) {
+  operandValue.replaceUsesWithIf(
+      newShardOp, [operandOp, operandValue](OpOperand &use) {
         return use.getOwner() == operandOp && use.get() == operandValue;
       });
 
@@ -313,15 +311,20 @@ void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshSharding sharding,
   auto newShardOp2 = builder.create<ShardOp>(operandValue.getLoc(), newShardOp,
                                              newShardOp.getSharding(),
                                              /*annotate_for_users*/ true);
-  rewriter.replaceAllUsesExcept(newShardOp, newShardOp2, newShardOp2);
+  newShardOp.getResult().replaceAllUsesExcept(newShardOp2, newShardOp2);
 }
 
 void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshSharding sharding,
                                                      OpResult result,
                                                      OpBuilder &builder) {
   ShardOp newShardOp;
-  for (auto &use : llvm::make_early_inc_range(result.getUses())) {
-    maybeInsertTargetShardingAnnotation(sharding, use, builder, newShardOp);
+  SmallVector<std::pair<Value, Operation *>> uses;
+  for (auto &use : result.getUses()) {
+    uses.emplace_back(use.get(), use.getOwner());
+  }
+  for (auto &[operandValue, operandOp] : uses) {
+    maybeInsertTargetShardingAnnotationImpl(sharding, operandValue, operandOp,
+                                            builder, newShardOp);
   }
 }
 
diff --git a/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp b/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp
index 4452dd65fce9..6751fafaf177 100644
--- a/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp
+++ b/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp
@@ -362,6 +362,9 @@ static LogicalResult visitOp(Operation *op, OpBuilder &builder) {
 //===----------------------------------------------------------------------===//
 struct ShardingPropagation
     : public mesh::impl::ShardingPropagationBase<ShardingPropagation> {
+
+  using ShardingPropagationBase<ShardingPropagation>::ShardingPropagationBase;
+
   void runOnOperation() override {
     FunctionOpInterface funcOp = getOperation();
     MLIRContext *ctx = funcOp.getContext();
@@ -382,18 +385,31 @@ struct ShardingPropagation
             shardingOp.printLoopTypesAndIndexingMaps(llvm::dbgs());
         });
 
-    // 1. propagate in reversed order
-    for (Operation &op : llvm::make_early_inc_range(llvm::reverse(block)))
-      if (failed(visitOp(&op, builder)))
-        return signalPassFailure();
+    auto traverse = [&](auto &&range, OpBuilder &builder,
+                        const char *order) -> bool {
+      for (Operation &op : range) {
+        if (failed(visitOp(&op, builder))) {
+          signalPassFailure();
+          return true;
+        }
+      }
+      LLVM_DEBUG(DBGS() << "After " << order << " order propagation:\n"
+                        << funcOp << "\n");
+      LLVM_DEBUG(assert(succeeded(mlir::verify(funcOp))));
+      return false;
+    };
 
-    LLVM_DEBUG(DBGS() << "After reversed order propagation:\n"
-                      << funcOp << "\n");
-    LLVM_DEBUG(assert(succeeded(mlir::verify(funcOp))));
+    // 1. Propagate in reversed order.
+    if (traversal == TraversalOrder::Backward ||
+        traversal == TraversalOrder::BackwardForward)
+      traverse(llvm::reverse(block), builder, "backward");
 
-    // 2. propagate in original order
-    for (Operation &op : llvm::make_early_inc_range(block))
-      if (failed(visitOp(&op, builder)))
-        return signalPassFailure();
+    // 2. Propagate in original order.
+    if (traversal != TraversalOrder::Backward)
+      traverse(block, builder, "forward");
+
+    // 3. Propagate in backward order if needed.
+    if (traversal == TraversalOrder::ForwardBackward)
+      traverse(llvm::reverse(block), builder, "backward");
   }
 };
diff --git a/mlir/test/Dialect/Mesh/backward-sharding-propagation.mlir b/mlir/test/Dialect/Mesh/backward-sharding-propagation.mlir
new file mode 100644
index 000000000000..4223d01d6511
--- /dev/null
+++ b/mlir/test/Dialect/Mesh/backward-sharding-propagation.mlir
@@ -0,0 +1,26 @@
+// RUN: mlir-opt --pass-pipeline="builtin.module(func.func(sharding-propagation{traversal=backward}))" %s | FileCheck %s
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  mesh.mesh @mesh(shape = 1) {sym_visibility = "private"}
+  func.func @test_forward() -> tensor<6x6xi32> {
+    %c1_i32 = arith.constant 1 : i32
+    // CHECK: tensor.empty()
+    %0 = tensor.empty() : tensor<6x6xi32>
+    %sharding = mesh.sharding @mesh split_axes = [[0]] : !mesh.sharding
+    // CHECK-COUNT-2: mesh.shard
+    %sharding_annotated = mesh.shard %0 to %sharding : tensor<6x6xi32>
+    %1 = linalg.fill ins(%c1_i32 : i32) outs(%sharding_annotated : tensor<6x6xi32>) -> tensor<6x6xi32>
+    // CHECK: tensor.empty()
+    // CHECK-NOT: mesh.shard @
+    %2 = tensor.empty() : tensor<6x6xi32>
+    %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%1, %1
+        : tensor<6x6xi32>, tensor<6x6xi32>) outs(%2 : tensor<6x6xi32>) {
+    ^bb0(%in: i32, %in_2: i32, %out: i32):
+      %9 = arith.addi %in, %in_2 : i32
+      linalg.yield %9 : i32
+    } -> tensor<6x6xi32>
+    // CHECK: return
+    return %3 : tensor<6x6xi32>
+  }
+}
diff --git a/mlir/test/Dialect/Mesh/forward-backward-sharding-propagation.mlir b/mlir/test/Dialect/Mesh/forward-backward-sharding-propagation.mlir
new file mode 100644
index 000000000000..dd2eee2f7def
--- /dev/null
+++ b/mlir/test/Dialect/Mesh/forward-backward-sharding-propagation.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt --pass-pipeline="builtin.module(func.func(sharding-propagation{traversal=forward-backward}))" %s | FileCheck %s
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  mesh.mesh @mesh(shape = 1) {sym_visibility = "private"}
+  func.func @test_forward() -> tensor<6x6xi32> {
+    %c1_i32 = arith.constant 1 : i32
+    // CHECK: tensor.empty()
+    %0 = tensor.empty() : tensor<6x6xi32>
+    // CHECK-COUNT-3: mesh.sharding @mesh split_axes = {{\[\[0}}]]
+    %sharding_row = mesh.sharding @mesh split_axes = [[0]] : !mesh.sharding
+    %annotated_row = mesh.shard %0 to %sharding_row : tensor<6x6xi32>
+    %1 = linalg.fill ins(%c1_i32 : i32) outs(%annotated_row : tensor<6x6xi32>) -> tensor<6x6xi32>
+    %2 = tensor.empty() : tensor<6x6xi32>
+    // CHECK-COUNT-4: mesh.sharding @mesh split_axes = {{\[\[1}}]]
+    %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %1
+        : tensor<6x6xi32>, tensor<6x6xi32>) outs(%2 : tensor<6x6xi32>) {
+    ^bb0(%in: i32, %in_2: i32, %out: i32):
+      %9 = arith.addi %in, %in_2 : i32
+      linalg.yield %9 : i32
+    } -> tensor<6x6xi32>
+    %sharding_col = mesh.sharding @mesh split_axes = [[1]] : !mesh.sharding
+    %annotated_col = mesh.shard %3 to %sharding_col : tensor<6x6xi32>
+    // CHECK: return
+    return %annotated_col : tensor<6x6xi32>
+  }
+}
diff --git a/mlir/test/Dialect/Mesh/forward-sharding-propagation.mlir b/mlir/test/Dialect/Mesh/forward-sharding-propagation.mlir
new file mode 100644
index 000000000000..98e9931b8de9
--- /dev/null
+++ b/mlir/test/Dialect/Mesh/forward-sharding-propagation.mlir
@@ -0,0 +1,49 @@
+// RUN: mlir-opt --pass-pipeline="builtin.module(func.func(sharding-propagation{traversal=forward}))" %s | FileCheck %s
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "mpich", "MPI:comm_world_rank" = 0 : i32>} {
+  mesh.mesh @mesh(shape = 1) {sym_visibility = "private"}
+  func.func @test_forward() -> (tensor<6x6xi32>, tensor<6x6xi32>, tensor<i32>) attributes {llvm.emit_c_interface} {
+    %c1_i32 = arith.constant 1 : i32
+    // CHECK: [[v3:%.*]] = tensor.empty() : tensor<6x6xi32>
+    %0 = tensor.empty() : tensor<6x6xi32>
+    // CHECK: [[v1:%.*]] = linalg.fill ins
+    // CHECK: [[vsharding_0:%.*]] = mesh.sharding @mesh split_axes = {{\[\[}}0]] : !mesh.sharding
+    // CHECK: [[vsharding_annotated_1:%.*]] = mesh.shard [[v1]] to [[vsharding_0]] : tensor<6x6xi32>
+    %1 = linalg.fill ins(%c1_i32 : i32) outs(%0 : tensor<6x6xi32>) -> tensor<6x6xi32>
+    %sharding = mesh.sharding @mesh split_axes = [[0]] : !mesh.sharding
+    %sharding_annotated = mesh.shard %1 to %sharding : tensor<6x6xi32>
+    // CHECK: [[v2:%.*]] = tensor.empty() : tensor<6x6xi32>
+    // CHECK: [[vsharding_2:%.*]] = mesh.sharding @mesh split_axes = {{\[\[}}0]] : !mesh.sharding
+    // CHECK: [[vsharding_annotated_3:%.*]] = mesh.shard [[vsharding_annotated_1]] to [[vsharding_2]] annotate_for_users : tensor<6x6xi32>
+    %3 = tensor.empty() : tensor<6x6xi32>
+    // CHECK: [[vsharding_4:%.*]] = mesh.sharding @mesh split_axes = {{\[\[}}0]] : !mesh.sharding
+    // CHECK: [[vsharding_annotated_5:%.*]] = mesh.shard [[v2]] to [[vsharding_4]] annotate_for_users : tensor<6x6xi32>
+    // CHECK: [[v3:%.*]] = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]}
+    // CHECK-SAME: ins([[vsharding_annotated_3]], [[vsharding_annotated_3]] : tensor<6x6xi32>, tensor<6x6xi32>) outs([[vsharding_annotated_5]] : tensor<6x6xi32>) {
+    // CHECK: [[vsharding_6:%.*]] = mesh.sharding @mesh split_axes = {{\[\[}}0]] : !mesh.sharding
+    // CHECK: [[vsharding_annotated_7:%.*]] = mesh.shard [[v3]] to [[vsharding_6]] : tensor<6x6xi32>
+    %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%sharding_annotated, %sharding_annotated
+        : tensor<6x6xi32>, tensor<6x6xi32>) outs(%3 : tensor<6x6xi32>) {
+    ^bb0(%in: i32, %in_2: i32, %out: i32):
+      %9 = arith.addi %in, %in_2 : i32
+      linalg.yield %9 : i32
+    } -> tensor<6x6xi32>
+    %c0_i32 = arith.constant 0 : i32
+    %6 = tensor.empty() : tensor<i32>
+    %7 = linalg.fill ins(%c0_i32 : i32) outs(%6 : tensor<i32>) -> tensor<i32>
+    // CHECK: [[vreduced:%.*]] = linalg.reduce ins
+    // CHECK: [[vsharding_12:%.*]] = mesh.sharding @mesh split_axes = [] partial =  sum [0] : !mesh.sharding
+    // CHECK: [[vsharding_annotated_13:%.*]] = mesh.shard [[vreduced]] to [[vsharding_12]] : tensor<i32>
+    %reduced = linalg.reduce ins(%4 : tensor<6x6xi32>) outs(%7 : tensor<i32>) dimensions = [0, 1] 
+      (%in: i32, %init: i32) {
+        %9 = arith.addi %in, %init : i32
+        linalg.yield %9 : i32
+      }
+    // CHECK: [[vsharding_14:%.*]] = mesh.sharding @mesh split_axes = {{\[\[}}]] : !mesh.sharding
+    %sharding_0 = mesh.sharding @mesh split_axes = [[]] : !mesh.sharding
+    // CHECK: [[vsharding_annotated_15:%.*]] = mesh.shard [[vsharding_annotated_13]] to [[vsharding_14]] annotate_for_users : tensor<i32>
+    %sharding_annotated_1 = mesh.shard %reduced to %sharding_0 annotate_for_users : tensor<i32>
+    return %sharding_annotated, %4, %sharding_annotated_1 : tensor<6x6xi32>, tensor<6x6xi32>, tensor<i32>
+  }
+}

From 355725a25e6be38d7a97cab9e206d2a16a1bd849 Mon Sep 17 00:00:00 2001
From: Kunqiu Chen <camsyn@foxmail.com>
Date: Wed, 18 Jun 2025 17:09:32 +0800
Subject: [PATCH 0776/1322] [TSan] Fix missing inst cleanup (#144067)

Commit 44e875ad5b2ce26826dd53f9e7d1a71436c86212 introduced a change that
replaces `ReplaceInstWithInst` with `Instruction::replaceAllUsesWith`,
without subsequent instruction cleanup.

This results in TSan leaving behind useless `load atomic` instructions
after 'replacing' them.

This commit adds cleanup back, consistent with the context.
---
 llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp        | 1 +
 .../test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 8ae6f7745a9e..5485998164f1 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -728,6 +728,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
     Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
     Value *Cast = IRB.CreateBitOrPointerCast(C, OrigTy);
     I->replaceAllUsesWith(Cast);
+    I->eraseFromParent();
   } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
     Value *Addr = SI->getPointerOperand();
     int Idx =
diff --git a/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll b/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll
index 8bcabaecf0fd..015ee2fe711e 100644
--- a/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll
+++ b/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll
@@ -10,7 +10,6 @@ define float @load_float(ptr %fptr) {
 ; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__tsan_atomic32_load(ptr [[FPTR]], i32 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
-; CHECK-NEXT:    [[V:%.*]] = load atomic float, ptr [[FPTR]] unordered, align 4
 ; CHECK-NEXT:    call void @__tsan_func_exit()
 ; CHECK-NEXT:    ret float [[TMP3]]
 ;
@@ -25,7 +24,6 @@ define double @load_double(ptr %fptr) {
 ; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @__tsan_atomic64_load(ptr [[FPTR]], i32 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[TMP2]] to double
-; CHECK-NEXT:    [[V:%.*]] = load atomic double, ptr [[FPTR]] unordered, align 8
 ; CHECK-NEXT:    call void @__tsan_func_exit()
 ; CHECK-NEXT:    ret double [[TMP3]]
 ;
@@ -40,7 +38,6 @@ define fp128 @load_fp128(ptr %fptr) {
 ; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i128 @__tsan_atomic128_load(ptr [[FPTR]], i32 0)
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i128 [[TMP2]] to fp128
-; CHECK-NEXT:    [[V:%.*]] = load atomic fp128, ptr [[FPTR]] unordered, align 16
 ; CHECK-NEXT:    call void @__tsan_func_exit()
 ; CHECK-NEXT:    ret fp128 [[TMP3]]
 ;

From 8e157fdbb7b4af9f67b139a9f05feaa9b338d3f5 Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern@gmail.com>
Date: Wed, 18 Jun 2025 17:10:29 +0800
Subject: [PATCH 0777/1322] [CIR] Add support for __builtin_assume (#144376)

This patch adds support for the `__builtin_assume` builtin function.
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  | 22 +++++++++++
 clang/include/clang/CIR/MissingFeatures.h     |  3 ++
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp       | 38 +++++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |  4 ++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  9 +++++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   | 10 +++++
 clang/test/CIR/CodeGen/builtin_call.cpp       | 16 ++++++++
 7 files changed, 102 insertions(+)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 8dd1f0ce361d..4655cebc82ee 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2387,4 +2387,26 @@ def ComplexCreateOp : CIR_Op<"complex.create", [Pure, SameTypeOperands]> {
   let hasFolder = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// Assume Operations
+//===----------------------------------------------------------------------===//
+
+def AssumeOp : CIR_Op<"assume"> {
+  let summary = "Tell the optimizer that a boolean value is true";
+  let description = [{
+    The `cir.assume` operation takes a single boolean prediate as its only
+    argument and does not have any results. The operation tells the optimizer
+    that the predicate is always true.
+
+    This operation corresponds to the `__assume` and the `__builtin_assume`
+    builtin functions.
+  }];
+
+  let arguments = (ins CIR_BoolType:$predicate);
+
+  let assemblyFormat = [{
+    $predicate `:` type($predicate) attr-dict
+  }];
+}
+
 #endif // CLANG_CIR_DIALECT_IR_CIROPS_TD
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 3dc28e6f2e5b..3d120903dea1 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -237,6 +237,9 @@ struct MissingFeatures {
   static bool lowerAggregateLoadStore() { return false; }
   static bool dataLayoutTypeAllocSize() { return false; }
   static bool asmLabelAttr() { return false; }
+  static bool builtinCall() { return false; }
+  static bool builtinCallF128() { return false; }
+  static bool builtinCallMathErrno() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 19fac00ab873..83825f0835a1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CIRGenCall.h"
+#include "CIRGenConstantEmitter.h"
 #include "CIRGenFunction.h"
 #include "CIRGenModule.h"
 #include "CIRGenValue.h"
@@ -66,6 +67,32 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     return emitLibraryCall(*this, fd, e,
                            cgm.getBuiltinLibFunction(fd, builtinID));
 
+  assert(!cir::MissingFeatures::builtinCallF128());
+
+  // If the builtin has been declared explicitly with an assembler label,
+  // disable the specialized emitting below. Ideally we should communicate the
+  // rename in IR, or at least avoid generating the intrinsic calls that are
+  // likely to get lowered to the renamed library functions.
+  unsigned builtinIDIfNoAsmLabel = fd->hasAttr<AsmLabelAttr>() ? 0 : builtinID;
+
+  assert(!cir::MissingFeatures::builtinCallMathErrno());
+  assert(!cir::MissingFeatures::builtinCall());
+
+  switch (builtinIDIfNoAsmLabel) {
+  default:
+    break;
+
+  case Builtin::BI__assume:
+  case Builtin::BI__builtin_assume: {
+    if (e->getArg(0)->HasSideEffects(getContext()))
+      return RValue::get(nullptr);
+
+    mlir::Value argValue = emitCheckedArgForAssume(e->getArg(0));
+    builder.create<cir::AssumeOp>(getLoc(e->getExprLoc()), argValue);
+    return RValue::get(nullptr);
+  }
+  }
+
   cgm.errorNYI(e->getSourceRange(), "unimplemented builtin call");
   return getUndefRValue(e->getType());
 }
@@ -88,3 +115,14 @@ cir::FuncOp CIRGenModule::getBuiltinLibFunction(const FunctionDecl *fd,
   mlir::Type type = convertType(fd->getType());
   return getOrCreateCIRFunction(name, type, d, /*forVTable=*/false);
 }
+
+mlir::Value CIRGenFunction::emitCheckedArgForAssume(const Expr *e) {
+  mlir::Value argValue = evaluateExprAsBool(e);
+  if (!sanOpts.has(SanitizerKind::Builtin))
+    return argValue;
+
+  assert(!cir::MissingFeatures::sanitizers());
+  cgm.errorNYI(e->getSourceRange(),
+               "emitCheckedArgForAssume: sanitizers are NYI");
+  return {};
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index de6ef2a69faf..6c490a72b2e9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -772,6 +772,10 @@ public:
 
   LValue emitCastLValue(const CastExpr *e);
 
+  /// Emits an argument for a call to a `__builtin_assume`. If the builtin
+  /// sanitizer is enabled, a runtime check is also emitted.
+  mlir::Value emitCheckedArgForAssume(const Expr *e);
+
   LValue emitCompoundAssignmentLValue(const clang::CompoundAssignOperator *e);
 
   void emitConstructorBody(FunctionArgList &args);
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 6a4e4e4a7df3..a96501ab2c38 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -407,6 +407,14 @@ struct ConvertCIRToLLVMPass
   StringRef getArgument() const override { return "cir-flat-to-llvm"; }
 };
 
+mlir::LogicalResult CIRToLLVMAssumeOpLowering::matchAndRewrite(
+    cir::AssumeOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  auto cond = adaptor.getPredicate();
+  rewriter.replaceOpWithNewOp<mlir::LLVM::AssumeOp>(op, cond);
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMBrCondOpLowering::matchAndRewrite(
     cir::BrCondOp brOp, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
@@ -1811,6 +1819,7 @@ void ConvertCIRToLLVMPass::runOnOperation() {
                                              dl);
   patterns.add<
       // clang-format off
+               CIRToLLVMAssumeOpLowering,
                CIRToLLVMBaseClassAddrOpLowering,
                CIRToLLVMBinOpLowering,
                CIRToLLVMBrCondOpLowering,
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index a80981806354..a80c66ac1abf 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -29,6 +29,16 @@ mlir::Value lowerCirAttrAsValue(mlir::Operation *parentOp, mlir::Attribute attr,
 
 mlir::LLVM::Linkage convertLinkage(cir::GlobalLinkageKind linkage);
 
+class CIRToLLVMAssumeOpLowering
+    : public mlir::OpConversionPattern<cir::AssumeOp> {
+public:
+  using mlir::OpConversionPattern<cir::AssumeOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::AssumeOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 class CIRToLLVMBrCondOpLowering
     : public mlir::OpConversionPattern<cir::BrCondOp> {
 public:
diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp
index 322c13c8f081..0a2226a2cc59 100644
--- a/clang/test/CIR/CodeGen/builtin_call.cpp
+++ b/clang/test/CIR/CodeGen/builtin_call.cpp
@@ -94,3 +94,19 @@ void library_builtins() {
 // OGCG: define dso_local void @_Z16library_builtinsv()
 // OGCG: call i32 (ptr, ...) @printf(ptr noundef null)
 // OGCG: call void @abort()
+
+void assume(bool arg) {
+  __builtin_assume(arg);
+}
+
+// CIR: cir.func @_Z6assumeb
+// CIR:   cir.assume %{{.+}} : !cir.bool
+// CIR: }
+
+// LLVM: define void @_Z6assumeb
+// LLVM:   call void @llvm.assume(i1 %{{.+}})
+// LLVM: }
+
+// OGCG: define {{.*}}void @_Z6assumeb
+// OGCG:   call void @llvm.assume(i1 %{{.+}})
+// OGCG: }

From fe42d34274cac79794637bf2f69f85537dde8b74 Mon Sep 17 00:00:00 2001
From: Ying Yi <ying.yi@sony.com>
Date: Wed, 18 Jun 2025 10:13:46 +0100
Subject: [PATCH 0778/1322] [clang][headers]Remove unnecessary guard of
 !defined(__SCE__). (#144522)

Sony PlayStation now supports C++20, and we wish to change the default
C++ mode to C++20 sometime in the future. As such, the !defined(__SCE__)
guards are redundant and we want to remove them. This in turn makes the
entire guard lines redundant (always true), so this patch removes them
entirely.
---
 clang/lib/Headers/bmiintrin.h       |   4 -
 clang/lib/Headers/immintrin.h       | 224 ----------------------------
 clang/lib/Headers/keylockerintrin.h |   9 --
 clang/lib/Headers/x86gprintrin.h    |  14 --
 clang/lib/Headers/x86intrin.h       |  18 ---
 5 files changed, 269 deletions(-)

diff --git a/clang/lib/Headers/bmiintrin.h b/clang/lib/Headers/bmiintrin.h
index 59c5ece3977f..8024da55379c 100644
--- a/clang/lib/Headers/bmiintrin.h
+++ b/clang/lib/Headers/bmiintrin.h
@@ -161,8 +161,6 @@ _mm_tzcnt_64(unsigned long long __X) {
 
 #undef __RELAXED_FN_ATTRS
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__)
-
 /* Define the default attributes for the functions in this file. */
 #if defined(__cplusplus) && (__cplusplus >= 201103L)
 #define __DEFAULT_FN_ATTRS                                                     \
@@ -603,6 +601,4 @@ __blsr_u64(unsigned long long __X) {
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__BMI__) */
-
 #endif /* __BMIINTRIN_H */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 19c5987257a2..35f012cc7004 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -16,231 +16,112 @@
 
 #include <x86gprintrin.h>
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__MMX__)
 #include <mmintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE__)
 #include <xmmintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE2__)
 #include <emmintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE3__)
 #include <pmmintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSSE3__)
 #include <tmmintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__SSE4_2__) || defined(__SSE4_1__))
 #include <smmintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AES__) || defined(__PCLMUL__))
 #include <wmmintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CLFLUSHOPT__)
 #include <clflushoptintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CLWB__)
 #include <clwbintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX__)
 #include <avxintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX2__)
 #include <avx2intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__F16C__)
 #include <f16cintrin.h>
-#endif
 
-/* No feature check desired due to internal checks */
 #include <bmiintrin.h>
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI2__)
 #include <bmi2intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__LZCNT__)
 #include <lzcntintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__POPCNT__)
 #include <popcntintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA__)
 #include <fmaintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512F__)
 #include <avx512fintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VL__)
 #include <avx512vlintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BW__)
 #include <avx512bwintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BITALG__)
 #include <avx512bitalgintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512CD__)
 #include <avx512cdintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
 #include <avx512vpopcntdqintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
 #include <avx512vpopcntdqvlintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VNNI__)
 #include <avx512vnniintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512VNNI__))
 #include <avx512vlvnniintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNI__)
 #include <avxvnniintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512BITALG__))
 #include <avx512vlbitalgintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512BW__))
 #include <avx512vlbwintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512CD__))
 #include <avx512vlcdintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512DQ__))
 #include <avx512vldqintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
 #include <avx512ifmaintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
 #include <avx512ifmavlintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXIFMA__)
 #include <avxifmaintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI__)
 #include <avx512vbmiintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
 #include <avx512vbmivlintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI2__)
 #include <avx512vbmi2intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
 #include <avx512vlvbmi2intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
 #include <avx512fp16intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512FP16__))
 #include <avx512vlfp16intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BF16__)
 #include <avx512bf16intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512BF16__))
 #include <avx512vlbf16intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PKU__)
 #include <pkuintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__VPCLMULQDQ__)
 #include <vpclmulqdqintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__VAES__)
 #include <vaesintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__GFNI__)
 #include <gfniintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT8__)
 #include <avxvnniint8intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXNECONVERT__)
 #include <avxneconvertintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA512__)
 #include <sha512intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SM3__)
 #include <sm3intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SM4__)
 #include <sm4intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT16__)
 #include <avxvnniint16intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPID__)
 /// Reads the value of the IA32_TSC_AUX MSR (0xc0000103).
 ///
 /// \headerfile <immintrin.h>
@@ -252,9 +133,7 @@ static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __
 _rdpid_u32(void) {
   return __builtin_ia32_rdpid();
 }
-#endif // __RDPID__
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RDRND__)
 /// Returns a 16-bit hardware-generated random value.
 ///
 /// \headerfile <immintrin.h>
@@ -314,9 +193,7 @@ _rdrand64_step(unsigned long long *__p)
   }
 #endif
 }
-#endif /* __RDRND__ */
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__FSGSBASE__)
 #ifdef __x86_64__
 /// Reads the FS base register.
 ///
@@ -427,9 +304,6 @@ _writegsbase_u64(unsigned long long __V)
 }
 
 #endif
-#endif /* __FSGSBASE__ */
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVBE__)
 
 /* The structs used below are to force the load/store to be unaligned. This
  * is accomplished with the __packed__ attribute. The __may_alias__ prevents
@@ -543,172 +417,86 @@ _storebe_i64(void * __P, long long __D) {
   ((struct __storeu_i64*)__P)->__v = __builtin_bswap64((unsigned long long)__D);
 }
 #endif
-#endif /* __MOVBE */
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RTM__)
 #include <rtmintrin.h>
 #include <xtestintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA__)
 #include <shaintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__FXSR__)
 #include <fxsrintrin.h>
-#endif
 
 /* No feature check desired due to internal MSC_VER checks */
 #include <xsaveintrin.h>
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEOPT__)
 #include <xsaveoptintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEC__)
 #include <xsavecintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVES__)
 #include <xsavesintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SHSTK__)
 #include <cetintrin.h>
-#endif
 
 /* Intrinsics inside adcintrin.h are available at all times. */
 #include <adcintrin.h>
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__ADX__)
 #include <adxintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RDSEED__)
 #include <rdseedintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__WBNOINVD__)
 #include <wbnoinvdintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CLDEMOTE__)
 #include <cldemoteintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__WAITPKG__)
 #include <waitpkgintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVDIRI__) ||     \
-    defined(__MOVDIR64B__)
 #include <movdirintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVRS__)
 #include <movrsintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX10_2__) && defined(__MOVRS__))
 #include <movrs_avx10_2intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX10_2_512__) && defined(__MOVRS__))
 #include <movrs_avx10_2_512intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
 #include <pconfigintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SGX__)
 #include <sgxintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PTWRITE__)
 #include <ptwriteintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
 #include <invpcidintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) ||          \
-    defined(__WIDEKL__)
 #include <keylockerintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TILE__) ||    \
-    defined(__AMX_INT8__) || defined(__AMX_BF16__)
 #include <amxintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
 #include <amxfp16intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
 #include <amxcomplexintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP8__)
 #include <amxfp8intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TRANSPOSE__)
 #include <amxtransposeintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__)
 #include <amxmovrsintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__))
 #include <amxmovrstransposeintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
 #include <amxavx512intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__)
 #include <amxtf32intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AMX_TF32__) && defined(__AMX_TRANSPOSE__))
 #include <amxtf32transposeintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AMX_BF16__) && defined(__AMX_TRANSPOSE__))
 #include <amxbf16transposeintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AMX_FP16__) && defined(__AMX_TRANSPOSE__))
 #include <amxfp16transposeintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AMX_COMPLEX__) && defined(__AMX_TRANSPOSE__))
 #include <amxcomplextransposeintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
 #include <avx512vlvp2intersectintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
 #include <avx10_2bf16intrin.h>
 #include <avx10_2convertintrin.h>
 #include <avx10_2copyintrin.h>
@@ -716,33 +504,21 @@ _storebe_i64(void * __P, long long __D) {
 #include <avx10_2niintrin.h>
 #include <avx10_2satcvtdsintrin.h>
 #include <avx10_2satcvtintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
 #include <avx10_2_512bf16intrin.h>
 #include <avx10_2_512convertintrin.h>
 #include <avx10_2_512minmaxintrin.h>
 #include <avx10_2_512niintrin.h>
 #include <avx10_2_512satcvtdsintrin.h>
 #include <avx10_2_512satcvtintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) ||                             \
-    (defined(__AVX10_2_512__) && defined(__SM4__))
 #include <sm4evexintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
 #include <enqcmdintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SERIALIZE__)
 #include <serializeintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__TSXLDTRK__)
 #include <tsxldtrkintrin.h>
-#endif
 
 #if defined(_MSC_VER) && __has_extension(gnu_asm)
 /* Define the default attributes for these intrinsics */
diff --git a/clang/lib/Headers/keylockerintrin.h b/clang/lib/Headers/keylockerintrin.h
index f76e91b4d4b3..4e9e6bec20c0 100644
--- a/clang/lib/Headers/keylockerintrin.h
+++ b/clang/lib/Headers/keylockerintrin.h
@@ -28,8 +28,6 @@
 #ifndef _KEYLOCKERINTRIN_H
 #define _KEYLOCKERINTRIN_H
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__)
-
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
   __attribute__((__always_inline__, __nodebug__, __target__("kl"),\
@@ -326,10 +324,6 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* !defined(__SCE__ || __has_feature(modules) || defined(__KL__) */
-
-#if !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)
-
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
   __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\
@@ -521,7 +515,4 @@ _mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)   \
-        */
-
 #endif /* _KEYLOCKERINTRIN_H */
diff --git a/clang/lib/Headers/x86gprintrin.h b/clang/lib/Headers/x86gprintrin.h
index 3d5cc606d7e6..8d513ceffb6d 100644
--- a/clang/lib/Headers/x86gprintrin.h
+++ b/clang/lib/Headers/x86gprintrin.h
@@ -10,33 +10,19 @@
 #ifndef __X86GPRINTRIN_H
 #define __X86GPRINTRIN_H
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__HRESET__)
 #include <hresetintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__UINTR__)
 #include <uintrintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__USERMSR__)
 #include <usermsrintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CRC32__)
 #include <crc32intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHI__)
 #include <prfchiintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RAOINT__)
 #include <raointintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CMPCCXADD__)
 #include <cmpccxaddintrin.h>
-#endif
 
 #if defined(__i386__)
 #define __SAVE_GPRBX "mov {%%ebx, %%eax |eax, ebx};"
diff --git a/clang/lib/Headers/x86intrin.h b/clang/lib/Headers/x86intrin.h
index f42e9e580f88..aaa84365ce3e 100644
--- a/clang/lib/Headers/x86intrin.h
+++ b/clang/lib/Headers/x86intrin.h
@@ -14,40 +14,22 @@
 
 #include <immintrin.h>
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHW__)
 #include <prfchwintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE4A__)
 #include <ammintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA4__)
 #include <fma4intrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__XOP__)
 #include <xopintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__TBM__)
 #include <tbmintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__LWP__)
 #include <lwpintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__MWAITX__)
 #include <mwaitxintrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__CLZERO__)
 #include <clzerointrin.h>
-#endif
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPRU__)
 #include <rdpruintrin.h>
-#endif
 
 #endif /* __X86INTRIN_H */

From 58c4fa96cb111ea8d399296838f4cb6a294115ca Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Wed, 18 Jun 2025 10:21:37 +0100
Subject: [PATCH 0779/1322] Fix bazel build for #142771 (#144659)

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index cb0f9d8c7413..c750eb733b3b 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -1650,6 +1650,7 @@ td_library(
     srcs = [
         "include/mlir/Dialect/EmitC/IR/EmitC.td",
         "include/mlir/Dialect/EmitC/IR/EmitCAttributes.td",
+        "include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td",
         "include/mlir/Dialect/EmitC/IR/EmitCBase.td",
         "include/mlir/Dialect/EmitC/IR/EmitCTypes.td",
     ],
@@ -1665,6 +1666,17 @@ td_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "EmitCInterfacesIncGen",
+    tbl_outs = {
+        "include/mlir/Dialect/EmitC/IR/EmitCInterfaces.h.inc": ["-gen-op-interface-decls"],
+        "include/mlir/Dialect/EmitC/IR/EmitCInterfaces.cpp.inc": ["-gen-op-interface-defs"],
+    },
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td",
+    deps = [":EmitCTdFiles"],
+)
+
 gentbl_cc_library(
     name = "EmitCAttributesIncGen",
     tbl_outs = {
@@ -3679,6 +3691,7 @@ cc_library(
         ":BytecodeOpInterface",
         ":CastInterfaces",
         ":ControlFlowInterfaces",
+        ":EmitCInterfacesIncGen",
         ":EmitCAttributesIncGen",
         ":EmitCOpsIncGen",
         ":FunctionInterfaces",

From 6fcdde2a4eb9eaf34511ac3a35075be329fe1fae Mon Sep 17 00:00:00 2001
From: Lucas Duarte Prates <lucas.prates@arm.com>
Date: Wed, 18 Jun 2025 10:26:46 +0100
Subject: [PATCH 0780/1322] [runtimes] Allow use of external llvm-lit on
 standalone builds (#144347)

When creating a standalone build of the runtimes sub-project, the
current CMake implementation looks for a lit executable that might
potentially exist in the build tree and unconditionally overrides the
value of `LLVM_EXTERNAL_LIT`. Due to this, any value passed via
`-DLLVM_EXTERNAL_LIT` when configuring the CMake project is ignored.
This change adds the `ALLOW_EXTERNAL` argument to the
`get_llvm_lit_path` call in the runtimes' CMakeLists.txt, allowing any
value previously set to be considered.
---
 runtimes/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index 878b2eee3861..e4dd4ebfc678 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -259,7 +259,7 @@ if(LLVM_INCLUDE_TESTS)
     # dir rather than ${LLVM_INSTALL_DIR}/bin/llvm-lit (which may not exist if
     # LLVM_BINARY_DIR points at an installed LLVM tree rather than a build tree).
     set(LLVM_LIT_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/bin)
-    get_llvm_lit_path(_base_dir _file_name)
+    get_llvm_lit_path(_base_dir _file_name ALLOW_EXTERNAL)
     set(LLVM_EXTERNAL_LIT "${_base_dir}/${_file_name}" CACHE STRING "Command used to spawn lit" FORCE)
     # Avoid warning about missing llvm-lit from runtimes CMake files. This is
     # fine since we call configure_file() to create llvm-lit at the end of this

From 757a0e6d3b6130a984960ee413a3c8a6f99c7cb5 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Wed, 18 Jun 2025 11:29:23 +0200
Subject: [PATCH 0781/1322] [SystemZ] Treat FAKE_USE instructions as
 instructions without a size (#144390)

This patch fixes an error in which `FAKE_USE` instructions would trigger
an assertion in SystemZLongBranch due to them having a size of 0 without
being excepted in the assertion that each instruction, other than a set
of known 0-size instruction types, should have a non-0 size.

`FAKE_USE` instructions are no-op instructions that are emitted into
LLVM by the `-fextend-variable-liveness` clang flag to help preserve the
liveness of source variables in optimized code, and therefore they
should be understood as being valid size 0 instructions.
---
 llvm/lib/Target/SystemZ/SystemZLongBranch.cpp |  2 +-
 llvm/test/CodeGen/SystemZ/fake-use-size.ll    | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/SystemZ/fake-use-size.ll

diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
index 54e1eb095494..21a233b2ffa1 100644
--- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -215,7 +215,7 @@ static unsigned getInstSizeInBytes(const MachineInstr &MI,
           // These do not have a size:
           MI.isDebugOrPseudoInstr() || MI.isPosition() || MI.isKill() ||
           MI.isImplicitDef() || MI.getOpcode() == TargetOpcode::MEMBARRIER ||
-          MI.getOpcode() == TargetOpcode::INIT_UNDEF ||
+          MI.getOpcode() == TargetOpcode::INIT_UNDEF || MI.isFakeUse() ||
           // These have a size that may be zero:
           MI.isInlineAsm() || MI.getOpcode() == SystemZ::STACKMAP ||
           MI.getOpcode() == SystemZ::PATCHPOINT ||
diff --git a/llvm/test/CodeGen/SystemZ/fake-use-size.ll b/llvm/test/CodeGen/SystemZ/fake-use-size.ll
new file mode 100644
index 000000000000..1690a046aad4
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fake-use-size.ll
@@ -0,0 +1,14 @@
+; RUN: llc -O0 < %s -mtriple=s390x-linux-gnu 2>&1 | FileCheck %s
+
+;; Tests that we can handle FAKE_USE instructions, emitting a comment for them
+;; in the resulting assembly.
+
+; CHECK:      .type   idd,@function
+; CHECK:      # %bb.0:
+; CHECK-NEXT: # fake_use:
+
+define double @idd(double %d) {
+entry:
+  notail call void (...) @llvm.fake.use(double %d)
+  ret double %d
+}

From bb00fd087a3c3e02fb812e41218ad0a85d9f0fe1 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 18 Jun 2025 09:35:13 +0000
Subject: [PATCH 0782/1322] [gn build] Port 669627d0c77e

---
 .../clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn
index a06b2f11b452..4f3ef5bf174b 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn
@@ -45,6 +45,7 @@ static_library("cppcoreguidelines") {
     "RvalueReferenceParamNotMovedCheck.cpp",
     "SlicingCheck.cpp",
     "SpecialMemberFunctionsCheck.cpp",
+    "UseEnumClassCheck.cpp",
     "VirtualClassDestructorCheck.cpp",
   ]
 }

From cd8248f3e856a37cc1addcb74475b4d37dc8aa42 Mon Sep 17 00:00:00 2001
From: Scott Constable <scott.d.constable@intel.com>
Date: Wed, 18 Jun 2025 02:52:55 -0700
Subject: [PATCH 0783/1322] Fixed a bug in `-fsanitize-kcfi-arity` (#142867)

Compiling with `fsanitize-kcfi-arity` can crash the compiler if a
function has more than 6 arguments, including floating-point arguments
passed in XMM registers. This patch fixes the feature by only counter
integer and stack arguments toward kCFI arity. For example, the compiler
crashed when it attempted to generate kCFI arity information for this
function:
https://github.com/torvalds/linux/blob/16b70698aa3ae7888826d0c84567c72241cf6713/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_util_32.h#L680

As noted in a comment, floating-point registers are not relevant to
enforcing kCFI at this time.
---
 llvm/lib/Target/X86/X86AsmPrinter.cpp | 23 ++++++++++++++++-----
 llvm/test/CodeGen/X86/kcfi-arity.ll   | 29 +++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 24eda602effd..c7238839c26b 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -198,14 +198,27 @@ void X86AsmPrinter::emitKCFITypeId(const MachineFunction &MF) {
     // Determine the function's arity (i.e., the number of arguments) at the ABI
     // level by counting the number of parameters that are passed
     // as registers, such as pointers and 64-bit (or smaller) integers. The
-    // Linux x86-64 ABI allows up to 6 parameters to be passed in GPRs.
+    // Linux x86-64 ABI allows up to 6 integer parameters to be passed in GPRs.
     // Additional parameters or parameters larger than 64 bits may be passed on
-    // the stack, in which case the arity is denoted as 7.
+    // the stack, in which case the arity is denoted as 7. Floating-point
+    // arguments passed in XMM0-XMM7 are not counted toward arity because
+    // floating-point values are not relevant to enforcing kCFI at this time.
     const unsigned ArityToRegMap[8] = {X86::EAX, X86::ECX, X86::EDX, X86::EBX,
                                        X86::ESP, X86::EBP, X86::ESI, X86::EDI};
-    int Arity = MF.getInfo<X86MachineFunctionInfo>()->getArgumentStackSize() > 0
-                    ? 7
-                    : MF.getRegInfo().liveins().size();
+    int Arity;
+    if (MF.getInfo<X86MachineFunctionInfo>()->getArgumentStackSize() > 0) {
+      Arity = 7;
+    } else {
+      Arity = 0;
+      for (const auto &LI : MF.getRegInfo().liveins()) {
+        auto Reg = LI.first;
+        if (X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg) ||
+            X86::GR32RegClass.contains(Reg) ||
+            X86::GR64RegClass.contains(Reg)) {
+          ++Arity;
+        }
+      }
+    }
     DestReg = ArityToRegMap[Arity];
   }
 
diff --git a/llvm/test/CodeGen/X86/kcfi-arity.ll b/llvm/test/CodeGen/X86/kcfi-arity.ll
index 68d90adaf2a1..009fa7d2dc0a 100644
--- a/llvm/test/CodeGen/X86/kcfi-arity.ll
+++ b/llvm/test/CodeGen/X86/kcfi-arity.ll
@@ -192,9 +192,33 @@ entry:
   ret void
 }
 
+;; Ensure that floating-point values are not counted toward the arity
+; ASM-LABEL: __cfi_f12:
+; ASM: movl $2253188362, %ebp
+define dso_local void @f12(i32 noundef %v1, i32 noundef %v2, float noundef %v3, double noundef %v4, float noundef %v5, i32 noundef %v6, i32 noundef %v7, i32 noundef %v8) #0 !kcfi_type !7 {
+entry:
+  %v1.addr = alloca i32, align 4
+  %v2.addr = alloca i32, align 4
+  %v3.addr = alloca float, align 4
+  %v4.addr = alloca double, align 4
+  %v5.addr = alloca float, align 4
+  %v6.addr = alloca i32, align 4
+  %v7.addr = alloca i32, align 4
+  %v8.addr = alloca i32, align 4
+  store i32 %v1, ptr %v1.addr, align 4
+  store i32 %v2, ptr %v2.addr, align 4
+  store float %v3, ptr %v3.addr, align 4
+  store double %v4, ptr %v4.addr, align 4
+  store float %v5, ptr %v5.addr, align 4
+  store i32 %v6, ptr %v6.addr, align 4
+  store i32 %v7, ptr %v7.addr, align 4
+  store i32 %v8, ptr %v8.addr, align 4
+  ret void
+}
+
 attributes #0 = { "target-features"="+retpoline-indirect-branches,+retpoline-indirect-calls" }
 
-!llvm.module.flags = !{!0, !7}
+!llvm.module.flags = !{!0, !8}
 !0 = !{i32 4, !"kcfi", i32 1}
 !1 = !{i32 12345678}
 !2 = !{i32 4196274163}
@@ -202,4 +226,5 @@ attributes #0 = { "target-features"="+retpoline-indirect-branches,+retpoline-ind
 !4 = !{i32 199571451}
 !5 = !{i32 1046421190}
 !6 = !{i32 1342488295}
-!7 = !{i32 4, !"kcfi-arity", i32 1}
+!7 = !{i32 2253188362}
+!8 = !{i32 4, !"kcfi-arity", i32 1}

From dac0820b277835b7506a9c0d1dc5e077597f6742 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson@ericsson.com>
Date: Wed, 18 Jun 2025 11:37:51 +0200
Subject: [PATCH 0784/1322] [Thumb2] Regenerate some test checks. NFC

---
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll    |  15 +-
 llvm/test/CodeGen/Thumb2/mve-vld3.ll       | 431 +++++++--------------
 llvm/test/CodeGen/Thumb2/schedm7-hazard.ll |   1 +
 3 files changed, 159 insertions(+), 288 deletions(-)

diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index 82c8d50e518b..94d5490cead2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -235,7 +235,7 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
 ; CHECK-LV-NEXT:    vmov.f32 s7, s1
 ; CHECK-LV-NEXT:    vmov q0, q1
 ; CHECK-LV-NEXT:    bx lr
-
+;
 ; CHECK-LIS-LABEL: shuffle3_i16:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    vmov q1, q0
@@ -248,6 +248,7 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
 ; CHECK-LIS-NEXT:    vmov.f32 s3, s5
 ; CHECK-LIS-NEXT:    vins.f16 s1, s7
 ; CHECK-LIS-NEXT:    bx lr
+
 entry:
   %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
   ret <8 x i16> %out
@@ -1170,7 +1171,7 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
 ; CHECK-LV-NEXT:    vmov.f32 s7, s1
 ; CHECK-LV-NEXT:    vmov q0, q1
 ; CHECK-LV-NEXT:    bx lr
-
+;
 ; CHECK-LIS-LABEL: shuffle3_f16:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    vmov q1, q0
@@ -1183,6 +1184,7 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
 ; CHECK-LIS-NEXT:    vmov.f32 s3, s5
 ; CHECK-LIS-NEXT:    vins.f16 s1, s7
 ; CHECK-LIS-NEXT:    bx lr
+
 entry:
   %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
   ret <8 x half> %out
@@ -1514,7 +1516,7 @@ define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x doubl
 ; CHECK-LV-NEXT:    vmov q1, q5
 ; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-LV-NEXT:    bx lr
-
+;
 ; CHECK-LIS-LABEL: shuffle9_f64:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
@@ -1534,6 +1536,7 @@ define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x doubl
 ; CHECK-LIS-NEXT:    vmov q1, q5
 ; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-LIS-NEXT:    bx lr
+
 entry:
   %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   ret <8 x double> %out
@@ -1627,7 +1630,7 @@ define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2)
 ; CHECK-LV-NEXT:    vmov q1, q5
 ; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-LV-NEXT:    bx lr
-
+;
 ; CHECK-LIS-LABEL: shuffle9_i64:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
@@ -1647,6 +1650,7 @@ define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2)
 ; CHECK-LIS-NEXT:    vmov q1, q5
 ; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-LIS-NEXT:    bx lr
+
 entry:
   %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   ret <8 x i64> %out
@@ -1886,6 +1890,3 @@ entry:
   ret double %res
 }
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-LIS: {{.*}}
-; CHECK-LV: {{.*}}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index b6c8056891f8..4dd9173e2d41 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -56,7 +56,7 @@ define void @vld3_v4i32(ptr %src, ptr %dst) {
 ; CHECK-LV-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LV-NEXT:    vpop {d8, d9}
 ; CHECK-LV-NEXT:    bx lr
-
+;
 ; CHECK-LIS-LABEL: vld3_v4i32:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    .vsave {d8, d9}
@@ -80,6 +80,7 @@ define void @vld3_v4i32(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LIS-NEXT:    vpop {d8, d9}
 ; CHECK-LIS-NEXT:    bx lr
+
 entry:
   %l1 = load <12 x i32>, ptr %src, align 4
   %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -132,7 +133,7 @@ define void @vld3_v8i32(ptr %src, ptr %dst) {
 ; CHECK-LV-NEXT:    vstrw.32 q1, [r1]
 ; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-LV-NEXT:    bx lr
-
+;
 ; CHECK-LIS-LABEL: vld3_v8i32:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
@@ -173,6 +174,7 @@ define void @vld3_v8i32(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vstrw.32 q1, [r1]
 ; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-LIS-NEXT:    bx lr
+
 entry:
   %l1 = load <24 x i32>, ptr %src, align 4
   %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -259,7 +261,7 @@ define void @vld3_v16i32(ptr %src, ptr %dst) {
 ; CHECK-LV-NEXT:    vstrw.32 q3, [r1, #32]
 ; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-LV-NEXT:    bx lr
-
+;
 ; CHECK-LIS-LABEL: vld3_v16i32:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -334,6 +336,7 @@ define void @vld3_v16i32(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vstrw.32 q3, [r1, #32]
 ; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-LIS-NEXT:    bx lr
+
 entry:
   %l1 = load <48 x i32>, ptr %src, align 4
   %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
@@ -961,7 +964,7 @@ define void @vld3_v2i64(ptr %src, ptr %dst) {
 ; CHECK-LV-NEXT:    vmov q0[3], q0[1], r7, r2
 ; CHECK-LV-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LV-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
-
+;
 ; CHECK-LIS-LABEL: vld3_v2i64:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    .save {r4, r5, r6, r7, r8, lr}
@@ -991,6 +994,7 @@ define void @vld3_v2i64(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vmov q0[3], q0[1], r7, r2
 ; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LIS-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+
 entry:
   %l1 = load <6 x i64>, ptr %src, align 4
   %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
@@ -1147,7 +1151,7 @@ define void @vld3_v2f32(ptr %src, ptr %dst) {
 ; CHECK-LV-NEXT:    vadd.f32 q0, q0, q1
 ; CHECK-LV-NEXT:    vstmia r1, {s0, s1}
 ; CHECK-LV-NEXT:    bx lr
-
+;
 ; CHECK-LIS-LABEL: vld3_v2f32:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
@@ -1161,6 +1165,7 @@ define void @vld3_v2f32(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vadd.f32 q0, q1, q0
 ; CHECK-LIS-NEXT:    vstmia r1, {s0, s1}
 ; CHECK-LIS-NEXT:    bx lr
+
 entry:
   %l1 = load <6 x float>, ptr %src, align 4
   %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 0, i32 3>
@@ -1173,53 +1178,30 @@ entry:
 }
 
 define void @vld3_v4f32(ptr %src, ptr %dst) {
-; CHECK-LV-LABEL: vld3_v4f32:
-; CHECK-LV:       @ %bb.0: @ %entry
-; CHECK-LV-NEXT:    .vsave {d8, d9}
-; CHECK-LV-NEXT:    vpush {d8, d9}
-; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-LV-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-LV-NEXT:    vmov.f32 s10, s2
-; CHECK-LV-NEXT:    vmov.f32 s13, s0
-; CHECK-LV-NEXT:    vmov.f32 s14, s3
-; CHECK-LV-NEXT:    vmov.f32 s8, s4
-; CHECK-LV-NEXT:    vmov.f32 s9, s7
-; CHECK-LV-NEXT:    vmov.f32 s12, s5
-; CHECK-LV-NEXT:    vmov.f32 s15, s18
-; CHECK-LV-NEXT:    vmov.f32 s11, s17
-; CHECK-LV-NEXT:    vadd.f32 q2, q2, q3
-; CHECK-LV-NEXT:    vmov.f32 s0, s6
-; CHECK-LV-NEXT:    vmov.f32 s2, s16
-; CHECK-LV-NEXT:    vmov.f32 s3, s19
-; CHECK-LV-NEXT:    vadd.f32 q0, q2, q0
-; CHECK-LV-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LV-NEXT:    vpop {d8, d9}
-; CHECK-LV-NEXT:    bx lr
+; CHECK-LABEL: vld3_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s15, s18
+; CHECK-NEXT:    vmov.f32 s11, s17
+; CHECK-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vadd.f32 q0, q2, q0
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
 
-; CHECK-LIS-LABEL: vld3_v4f32:
-; CHECK-LIS:       @ %bb.0: @ %entry
-; CHECK-LIS-NEXT:    .vsave {d8, d9}
-; CHECK-LIS-NEXT:    vpush {d8, d9}
-; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-LIS-NEXT:    vmov.f32 s10, s2
-; CHECK-LIS-NEXT:    vmov.f32 s13, s0
-; CHECK-LIS-NEXT:    vmov.f32 s14, s3
-; CHECK-LIS-NEXT:    vmov.f32 s8, s4
-; CHECK-LIS-NEXT:    vmov.f32 s9, s7
-; CHECK-LIS-NEXT:    vmov.f32 s12, s5
-; CHECK-LIS-NEXT:    vmov.f32 s15, s18
-; CHECK-LIS-NEXT:    vmov.f32 s11, s17
-; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q3
-; CHECK-LIS-NEXT:    vmov.f32 s0, s6
-; CHECK-LIS-NEXT:    vmov.f32 s2, s16
-; CHECK-LIS-NEXT:    vmov.f32 s3, s19
-; CHECK-LIS-NEXT:    vadd.f32 q0, q2, q0
-; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
-; CHECK-LIS-NEXT:    vpop {d8, d9}
-; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <12 x float>, ptr %src, align 4
   %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1232,87 +1214,47 @@ entry:
 }
 
 define void @vld3_v8f32(ptr %src, ptr %dst) {
-; CHECK-LV-LABEL: vld3_v8f32:
-; CHECK-LV:       @ %bb.0: @ %entry
-; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-LV-NEXT:    vmov.f32 s10, s2
-; CHECK-LV-NEXT:    vmov.f32 s13, s0
-; CHECK-LV-NEXT:    vmov.f32 s14, s3
-; CHECK-LV-NEXT:    vmov.f32 s8, s4
-; CHECK-LV-NEXT:    vmov.f32 s9, s7
-; CHECK-LV-NEXT:    vmov.f32 s12, s5
-; CHECK-LV-NEXT:    vmov.f32 s15, s18
-; CHECK-LV-NEXT:    vmov.f32 s11, s17
-; CHECK-LV-NEXT:    vadd.f32 q2, q2, q3
-; CHECK-LV-NEXT:    vmov.f32 s0, s6
-; CHECK-LV-NEXT:    vmov.f32 s2, s16
-; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-LV-NEXT:    vmov.f32 s3, s19
-; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-LV-NEXT:    vadd.f32 q0, q2, q0
-; CHECK-LV-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-LV-NEXT:    vmov.f32 s17, s4
-; CHECK-LV-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-LV-NEXT:    vmov.f32 s18, s7
-; CHECK-LV-NEXT:    vmov.f32 s22, s6
-; CHECK-LV-NEXT:    vmov.f32 s16, s9
-; CHECK-LV-NEXT:    vmov.f32 s19, s14
-; CHECK-LV-NEXT:    vmov.f32 s20, s8
-; CHECK-LV-NEXT:    vmov.f32 s21, s11
-; CHECK-LV-NEXT:    vmov.f32 s23, s13
-; CHECK-LV-NEXT:    vadd.f32 q4, q5, q4
-; CHECK-LV-NEXT:    vmov.f32 s4, s10
-; CHECK-LV-NEXT:    vmov.f32 s6, s12
-; CHECK-LV-NEXT:    vmov.f32 s7, s15
-; CHECK-LV-NEXT:    vadd.f32 q1, q4, q1
-; CHECK-LV-NEXT:    vstrw.32 q1, [r1]
-; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-LV-NEXT:    bx lr
+; CHECK-LABEL: vld3_v8f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s15, s18
+; CHECK-NEXT:    vmov.f32 s11, s17
+; CHECK-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vadd.f32 q0, q2, q0
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vmov.f32 s20, s8
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s23, s13
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vadd.f32 q1, q4, q1
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
 
-; CHECK-LIS-LABEL: vld3_v8f32:
-; CHECK-LIS:       @ %bb.0: @ %entry
-; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-LIS-NEXT:    vmov.f32 s10, s2
-; CHECK-LIS-NEXT:    vmov.f32 s13, s0
-; CHECK-LIS-NEXT:    vmov.f32 s14, s3
-; CHECK-LIS-NEXT:    vmov.f32 s8, s4
-; CHECK-LIS-NEXT:    vmov.f32 s9, s7
-; CHECK-LIS-NEXT:    vmov.f32 s12, s5
-; CHECK-LIS-NEXT:    vmov.f32 s15, s18
-; CHECK-LIS-NEXT:    vmov.f32 s11, s17
-; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q3
-; CHECK-LIS-NEXT:    vmov.f32 s0, s6
-; CHECK-LIS-NEXT:    vmov.f32 s2, s16
-; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-LIS-NEXT:    vmov.f32 s3, s19
-; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-LIS-NEXT:    vadd.f32 q0, q2, q0
-; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-LIS-NEXT:    vmov.f32 s17, s4
-; CHECK-LIS-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-LIS-NEXT:    vmov.f32 s18, s7
-; CHECK-LIS-NEXT:    vmov.f32 s22, s6
-; CHECK-LIS-NEXT:    vmov.f32 s16, s9
-; CHECK-LIS-NEXT:    vmov.f32 s19, s14
-; CHECK-LIS-NEXT:    vmov.f32 s20, s8
-; CHECK-LIS-NEXT:    vmov.f32 s21, s11
-; CHECK-LIS-NEXT:    vmov.f32 s23, s13
-; CHECK-LIS-NEXT:    vadd.f32 q4, q5, q4
-; CHECK-LIS-NEXT:    vmov.f32 s4, s10
-; CHECK-LIS-NEXT:    vmov.f32 s6, s12
-; CHECK-LIS-NEXT:    vmov.f32 s7, s15
-; CHECK-LIS-NEXT:    vadd.f32 q1, q4, q1
-; CHECK-LIS-NEXT:    vstrw.32 q1, [r1]
-; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <24 x float>, ptr %src, align 4
   %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -1325,155 +1267,81 @@ entry:
 }
 
 define void @vld3_v16f32(ptr %src, ptr %dst) {
-; CHECK-LV-LABEL: vld3_v16f32:
-; CHECK-LV:       @ %bb.0: @ %entry
-; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-LV-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-LV-NEXT:    vmov.f32 s10, s2
-; CHECK-LV-NEXT:    vmov.f32 s13, s0
-; CHECK-LV-NEXT:    vmov.f32 s14, s3
-; CHECK-LV-NEXT:    vmov.f32 s8, s4
-; CHECK-LV-NEXT:    vmov.f32 s9, s7
-; CHECK-LV-NEXT:    vmov.f32 s12, s5
-; CHECK-LV-NEXT:    vmov.f32 s15, s18
-; CHECK-LV-NEXT:    vmov.f32 s11, s17
-; CHECK-LV-NEXT:    vadd.f32 q2, q2, q3
-; CHECK-LV-NEXT:    vmov.f32 s0, s6
-; CHECK-LV-NEXT:    vmov.f32 s2, s16
-; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-LV-NEXT:    vmov.f32 s3, s19
-; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-LV-NEXT:    vadd.f32 q0, q2, q0
-; CHECK-LV-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-LV-NEXT:    vmov.f32 s17, s4
-; CHECK-LV-NEXT:    vmov.f32 s18, s7
-; CHECK-LV-NEXT:    vmov.f32 s22, s6
-; CHECK-LV-NEXT:    vmov.f32 s16, s9
-; CHECK-LV-NEXT:    vmov.f32 s19, s14
-; CHECK-LV-NEXT:    vmov.f32 s20, s8
-; CHECK-LV-NEXT:    vmov.f32 s21, s11
-; CHECK-LV-NEXT:    vmov.f32 s23, s13
-; CHECK-LV-NEXT:    vmov.f32 s4, s10
-; CHECK-LV-NEXT:    vldrw.u32 q2, [r0, #160]
-; CHECK-LV-NEXT:    vmov.f32 s6, s12
-; CHECK-LV-NEXT:    vadd.f32 q4, q5, q4
-; CHECK-LV-NEXT:    vmov.f32 s7, s15
-; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #144]
-; CHECK-LV-NEXT:    vadd.f32 q1, q4, q1
-; CHECK-LV-NEXT:    vmov.f32 s18, s10
-; CHECK-LV-NEXT:    vmov.f32 s21, s8
-; CHECK-LV-NEXT:    vmov.f32 s22, s11
-; CHECK-LV-NEXT:    vmov.f32 s16, s12
-; CHECK-LV-NEXT:    vmov.f32 s17, s15
-; CHECK-LV-NEXT:    vmov.f32 s20, s13
-; CHECK-LV-NEXT:    vmov.f32 s23, s26
-; CHECK-LV-NEXT:    vmov.f32 s19, s25
-; CHECK-LV-NEXT:    vadd.f32 q4, q4, q5
-; CHECK-LV-NEXT:    vmov.f32 s8, s14
-; CHECK-LV-NEXT:    vmov.f32 s10, s24
-; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #112]
-; CHECK-LV-NEXT:    vmov.f32 s11, s27
-; CHECK-LV-NEXT:    vldrw.u32 q5, [r0, #128]
-; CHECK-LV-NEXT:    vadd.f32 q2, q4, q2
-; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #96]
-; CHECK-LV-NEXT:    vmov.f32 s25, s12
-; CHECK-LV-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-LV-NEXT:    vmov.f32 s26, s15
-; CHECK-LV-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-LV-NEXT:    vmov.f32 s30, s14
-; CHECK-LV-NEXT:    vstrw.32 q1, [r1]
-; CHECK-LV-NEXT:    vmov.f32 s24, s17
-; CHECK-LV-NEXT:    vmov.f32 s27, s22
-; CHECK-LV-NEXT:    vmov.f32 s28, s16
-; CHECK-LV-NEXT:    vmov.f32 s29, s19
-; CHECK-LV-NEXT:    vmov.f32 s31, s21
-; CHECK-LV-NEXT:    vadd.f32 q6, q7, q6
-; CHECK-LV-NEXT:    vmov.f32 s12, s18
-; CHECK-LV-NEXT:    vmov.f32 s14, s20
-; CHECK-LV-NEXT:    vmov.f32 s15, s23
-; CHECK-LV-NEXT:    vadd.f32 q3, q6, q3
-; CHECK-LV-NEXT:    vstrw.32 q3, [r1, #32]
-; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LV-NEXT:    bx lr
+; CHECK-LABEL: vld3_v16f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
+; CHECK-NEXT:    vmov.f32 s10, s2
+; CHECK-NEXT:    vmov.f32 s13, s0
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s12, s5
+; CHECK-NEXT:    vmov.f32 s15, s18
+; CHECK-NEXT:    vmov.f32 s11, s17
+; CHECK-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s16
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vadd.f32 q0, q2, q0
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s17, s4
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vmov.f32 s20, s8
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s23, s13
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-NEXT:    vadd.f32 q1, q4, q1
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vmov.f32 s21, s8
+; CHECK-NEXT:    vmov.f32 s22, s11
+; CHECK-NEXT:    vmov.f32 s16, s12
+; CHECK-NEXT:    vmov.f32 s17, s15
+; CHECK-NEXT:    vmov.f32 s20, s13
+; CHECK-NEXT:    vmov.f32 s23, s26
+; CHECK-NEXT:    vmov.f32 s19, s25
+; CHECK-NEXT:    vadd.f32 q4, q4, q5
+; CHECK-NEXT:    vmov.f32 s8, s14
+; CHECK-NEXT:    vmov.f32 s10, s24
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT:    vmov.f32 s11, s27
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
+; CHECK-NEXT:    vadd.f32 q2, q4, q2
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s25, s12
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s26, s15
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s30, s14
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vmov.f32 s24, s17
+; CHECK-NEXT:    vmov.f32 s27, s22
+; CHECK-NEXT:    vmov.f32 s28, s16
+; CHECK-NEXT:    vmov.f32 s29, s19
+; CHECK-NEXT:    vmov.f32 s31, s21
+; CHECK-NEXT:    vadd.f32 q6, q7, q6
+; CHECK-NEXT:    vmov.f32 s12, s18
+; CHECK-NEXT:    vmov.f32 s14, s20
+; CHECK-NEXT:    vmov.f32 s15, s23
+; CHECK-NEXT:    vadd.f32 q3, q6, q3
+; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
 
-; CHECK-LIS-LABEL: vld3_v16f32:
-; CHECK-LIS:       @ %bb.0: @ %entry
-; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-LIS-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-LIS-NEXT:    vmov.f32 s10, s2
-; CHECK-LIS-NEXT:    vmov.f32 s13, s0
-; CHECK-LIS-NEXT:    vmov.f32 s14, s3
-; CHECK-LIS-NEXT:    vmov.f32 s8, s4
-; CHECK-LIS-NEXT:    vmov.f32 s9, s7
-; CHECK-LIS-NEXT:    vmov.f32 s12, s5
-; CHECK-LIS-NEXT:    vmov.f32 s15, s18
-; CHECK-LIS-NEXT:    vmov.f32 s11, s17
-; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q3
-; CHECK-LIS-NEXT:    vmov.f32 s0, s6
-; CHECK-LIS-NEXT:    vmov.f32 s2, s16
-; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-LIS-NEXT:    vmov.f32 s3, s19
-; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-LIS-NEXT:    vadd.f32 q0, q2, q0
-; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-LIS-NEXT:    vmov.f32 s17, s4
-; CHECK-LIS-NEXT:    vmov.f32 s18, s7
-; CHECK-LIS-NEXT:    vmov.f32 s22, s6
-; CHECK-LIS-NEXT:    vmov.f32 s16, s9
-; CHECK-LIS-NEXT:    vmov.f32 s19, s14
-; CHECK-LIS-NEXT:    vmov.f32 s20, s8
-; CHECK-LIS-NEXT:    vmov.f32 s21, s11
-; CHECK-LIS-NEXT:    vmov.f32 s23, s13
-; CHECK-LIS-NEXT:    vmov.f32 s4, s10
-; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0, #160]
-; CHECK-LIS-NEXT:    vmov.f32 s6, s12
-; CHECK-LIS-NEXT:    vadd.f32 q4, q5, q4
-; CHECK-LIS-NEXT:    vmov.f32 s7, s15
-; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #144]
-; CHECK-LIS-NEXT:    vadd.f32 q1, q4, q1
-; CHECK-LIS-NEXT:    vmov.f32 s18, s10
-; CHECK-LIS-NEXT:    vmov.f32 s21, s8
-; CHECK-LIS-NEXT:    vmov.f32 s22, s11
-; CHECK-LIS-NEXT:    vmov.f32 s16, s12
-; CHECK-LIS-NEXT:    vmov.f32 s17, s15
-; CHECK-LIS-NEXT:    vmov.f32 s20, s13
-; CHECK-LIS-NEXT:    vmov.f32 s23, s26
-; CHECK-LIS-NEXT:    vmov.f32 s19, s25
-; CHECK-LIS-NEXT:    vadd.f32 q4, q4, q5
-; CHECK-LIS-NEXT:    vmov.f32 s8, s14
-; CHECK-LIS-NEXT:    vmov.f32 s10, s24
-; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #112]
-; CHECK-LIS-NEXT:    vmov.f32 s11, s27
-; CHECK-LIS-NEXT:    vldrw.u32 q5, [r0, #128]
-; CHECK-LIS-NEXT:    vadd.f32 q2, q4, q2
-; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #96]
-; CHECK-LIS-NEXT:    vmov.f32 s25, s12
-; CHECK-LIS-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-LIS-NEXT:    vmov.f32 s26, s15
-; CHECK-LIS-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-LIS-NEXT:    vmov.f32 s30, s14
-; CHECK-LIS-NEXT:    vstrw.32 q1, [r1]
-; CHECK-LIS-NEXT:    vmov.f32 s24, s17
-; CHECK-LIS-NEXT:    vmov.f32 s27, s22
-; CHECK-LIS-NEXT:    vmov.f32 s28, s16
-; CHECK-LIS-NEXT:    vmov.f32 s29, s19
-; CHECK-LIS-NEXT:    vmov.f32 s31, s21
-; CHECK-LIS-NEXT:    vadd.f32 q6, q7, q6
-; CHECK-LIS-NEXT:    vmov.f32 s12, s18
-; CHECK-LIS-NEXT:    vmov.f32 s14, s20
-; CHECK-LIS-NEXT:    vmov.f32 s15, s23
-; CHECK-LIS-NEXT:    vadd.f32 q3, q6, q3
-; CHECK-LIS-NEXT:    vstrw.32 q3, [r1, #32]
-; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <48 x float>, ptr %src, align 4
   %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
@@ -1692,7 +1560,7 @@ define void @vld3_v16f16(ptr %src, ptr %dst) {
 ; CHECK-LV-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LV-NEXT:    vpop {d8, d9}
 ; CHECK-LV-NEXT:    bx lr
-
+;
 ; CHECK-LIS-LABEL: vld3_v16f16:
 ; CHECK-LIS:       @ %bb.0: @ %entry
 ; CHECK-LIS-NEXT:    .vsave {d8, d9}
@@ -1773,6 +1641,7 @@ define void @vld3_v16f16(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LIS-NEXT:    vpop {d8, d9}
 ; CHECK-LIS-NEXT:    bx lr
+
 entry:
   %l1 = load <48 x half>, ptr %src, align 4
   %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
@@ -1841,4 +1710,4 @@ entry:
   %a = fadd <4 x double> %a1, %s3
   store <4 x double> %a, ptr %dst
   ret void
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/Thumb2/schedm7-hazard.ll b/llvm/test/CodeGen/Thumb2/schedm7-hazard.ll
index d3e31d192d57..1b222b2a131a 100644
--- a/llvm/test/CodeGen/Thumb2/schedm7-hazard.ll
+++ b/llvm/test/CodeGen/Thumb2/schedm7-hazard.ll
@@ -17,6 +17,7 @@ define i32 @test(ptr %x0, i32 %y, i32 %z) {
 ; CHECK-NEXT:    adds r1, #1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
+;
 ; NOBANK-LABEL: test:
 ; NOBANK:       @ %bb.0: @ %entry
 ; NOBANK-NEXT:    ldr r3, [r0]

From 5a9cc93a2058e2c26d766f7be6aee63e928bf825 Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Wed, 18 Jun 2025 10:57:03 +0100
Subject: [PATCH 0785/1322] Fix for bazel build #142079 (#144665)

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index c750eb733b3b..0b4441c15794 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -323,7 +323,6 @@ cc_library(
     ]) + [
         "include/mlir/IR/OpAsmOpInterface.h.inc",
         "include/mlir/IR/PDLPatternMatch.h.inc",
-        "include/mlir/Interfaces/CallInterfaces.h",
         "include/mlir/Interfaces/DataLayoutInterfaces.h",
         "include/mlir/Interfaces/InferIntRangeInterface.h",
         "include/mlir/Interfaces/SideEffectInterfaces.h",
@@ -332,6 +331,7 @@ cc_library(
         "include/mlir/IR/*.h",
     ]) + [
         "include/mlir/Interfaces/FoldInterfaces.h",
+        "include/mlir/Interfaces/CallInterfaces.h",
     ],
     includes = ["include"],
     deps = [

From a13b7cc00c5f4b9d2636ed7a22c1390cf8033baf Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 18 Jun 2025 12:24:55 +0200
Subject: [PATCH 0786/1322] [LICM] Support hoisting of non-argmemonly readonly
 calls (#144497)

The code checking whether a readonly call is safe to hoist is
currently limited to only argmemonly calls. However, the actual
implementation does not depend on this in any way. It either
does an MSSA clobber walk on the memory access (which will take
all locations accessed by the call into account), or it will
look at all MemoryDefs in an entirely location-independent manner.

The current restriction dates back to the time when LICM still
supported AST, in which case this code *did* reason about the
individual pointer arguments.
---
 llvm/lib/Transforms/Scalar/LICM.cpp        | 30 ++++----------------
 llvm/test/Transforms/LICM/call-hoisting.ll | 32 ++++++++++++++++++++++
 llvm/test/Transforms/LICM/funclet.ll       |  2 +-
 3 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index abb6ff1dcfe6..a6bb8b8a21b0 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1117,13 +1117,6 @@ bool isHoistableAndSinkableInst(Instruction &I) {
           isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
           isa<InsertValueInst>(I) || isa<FreezeInst>(I));
 }
-/// Return true if MSSA knows there are no MemoryDefs in the loop.
-bool isReadOnly(const MemorySSAUpdater &MSSAU, const Loop *L) {
-  for (auto *BB : L->getBlocks())
-    if (MSSAU.getMemorySSA()->getBlockDefs(BB))
-      return false;
-  return true;
-}
 
 /// Return true if I is the only Instruction with a MemoryAccess in L.
 bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
@@ -1234,24 +1227,11 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
     if (Behavior.doesNotAccessMemory())
       return true;
     if (Behavior.onlyReadsMemory()) {
-      // A readonly argmemonly function only reads from memory pointed to by
-      // it's arguments with arbitrary offsets.  If we can prove there are no
-      // writes to this memory in the loop, we can hoist or sink.
-      if (Behavior.onlyAccessesArgPointees()) {
-        // TODO: expand to writeable arguments
-        for (Value *Op : CI->args())
-          if (Op->getType()->isPointerTy() &&
-              pointerInvalidatedByLoop(
-                  MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop, I,
-                  Flags, /*InvariantGroup=*/false))
-            return false;
-        return true;
-      }
-
-      // If this call only reads from memory and there are no writes to memory
-      // in the loop, we can hoist or sink the call as appropriate.
-      if (isReadOnly(MSSAU, CurLoop))
-        return true;
+      // If we can prove there are no writes to the memory read by the call, we
+      // can hoist or sink.
+      return !pointerInvalidatedByLoop(
+          MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop, I, Flags,
+          /*InvariantGroup=*/false);
     }
 
     // FIXME: This should use mod/ref information to see if we can hoist or
diff --git a/llvm/test/Transforms/LICM/call-hoisting.ll b/llvm/test/Transforms/LICM/call-hoisting.ll
index 907f13438623..7124b4e445eb 100644
--- a/llvm/test/Transforms/LICM/call-hoisting.ll
+++ b/llvm/test/Transforms/LICM/call-hoisting.ll
@@ -84,6 +84,38 @@ exit:
   ret void
 }
 
+declare i32 @load_not_argmemonly() readonly nounwind willreturn
+
+define void @test_load_not_argmemonly(ptr noalias %sink) {
+; CHECK-LABEL: define void @test_load_not_argmemonly(
+; CHECK-SAME: ptr noalias [[SINK:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[RET:%.*]] = call i32 @load_not_argmemonly()
+; CHECK-NEXT:    store i32 [[RET]], ptr [[SINK]], align 4
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  %ret = call i32 @load_not_argmemonly()
+  store i32 %ret, ptr %sink
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
 declare void @store(i32 %val, ptr %p) argmemonly writeonly nounwind
 
 define void @test(ptr %loc) {
diff --git a/llvm/test/Transforms/LICM/funclet.ll b/llvm/test/Transforms/LICM/funclet.ll
index 1cdd12ddc98e..03a49d8ddf69 100644
--- a/llvm/test/Transforms/LICM/funclet.ll
+++ b/llvm/test/Transforms/LICM/funclet.ll
@@ -153,6 +153,6 @@ else:                                             ; preds = %postinvoke
 
 declare void @may_throw()
 
-declare i32 @pure_computation() nounwind argmemonly readonly willreturn
+declare i32 @pure_computation() nounwind willreturn memory(none)
 
 declare i32 @__CxxFrameHandler3(...)

From ee4c2bb68752a6c4b463f3873cde278b8d348628 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Wed, 18 Jun 2025 11:32:22 +0100
Subject: [PATCH 0787/1322] [lldb][test] explicit-member-function-quals.cpp:
 add -glldb

This will get un-XFAILed but requires `-glldb` in an upcoming patch.
---
 .../SymbolFile/DWARF/x86/explicit-member-function-quals.cpp     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp
index 5d1222795dd8..33001db69f83 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp
@@ -3,7 +3,7 @@
 // Tests that we correctly deduce the CV-quals and storage
 // class of explicit object member functions.
 //
-// RUN: %clangxx_host %s -target x86_64-pc-linux -g -std=c++23 -c -o %t
+// RUN: %clangxx_host %s -glldb -target x86_64-pc-linux -g -std=c++23 -c -o %t
 // RUN: %lldb %t -b -o "type lookup Foo" 2>&1 | FileCheck %s
 //
 // CHECK:      (lldb) type lookup Foo

From 561eca44e7639ee8805d0bf65a59b9898d782538 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 18 Jun 2025 12:32:33 +0200
Subject: [PATCH 0788/1322] [PowerPC] Split tests into asm and mir parts (NFC)

To allow both to be generated.
---
 .../CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll | 221 ++++++++
 .../CodeGen/PowerPC/aix32-cc-abi-vaarg.ll     | 346 +++---------
 .../CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll | 192 +++++++
 .../CodeGen/PowerPC/aix64-cc-abi-vaarg.ll     | 522 ++++++------------
 4 files changed, 681 insertions(+), 600 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll

diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll
new file mode 100644
index 000000000000..3eef8d5ff90f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg-mir.ll
@@ -0,0 +1,221 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O2 -mtriple powerpc-ibm-aix-xcoff -mcpu=ppc -stop-after=machine-cp -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @int_va_arg(i32 %a, ...) local_unnamed_addr  {
+  ; CHECK-LABEL: name: int_va_arg
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r11 = ADDI %fixed-stack.0, 0
+  ; CHECK-NEXT:   STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0)
+  ; CHECK-NEXT:   STW killed renamable $r6, 8, %fixed-stack.0 :: (store (s32))
+  ; CHECK-NEXT:   STW killed renamable $r7, 12, %fixed-stack.0 :: (store (s32))
+  ; CHECK-NEXT:   STW killed renamable $r8, 16, %fixed-stack.0 :: (store (s32))
+  ; CHECK-NEXT:   STW killed renamable $r9, 20, %fixed-stack.0 :: (store (s32))
+  ; CHECK-NEXT:   STW killed renamable $r10, 24, %fixed-stack.0 :: (store (s32))
+  ; CHECK-NEXT:   STW renamable $r11, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
+  ; CHECK-NEXT:   STW killed renamable $r11, 0, %stack.1.arg2 :: (store (s32) into %ir.arg2)
+  ; CHECK-NEXT:   renamable $r4 = ADDI %fixed-stack.0, 4
+  ; CHECK-NEXT:   STW renamable $r4, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
+  ; CHECK-NEXT:   renamable $r6 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur)
+  ; CHECK-NEXT:   STW killed renamable $r4, 0, %stack.1.arg2 :: (store (s32) into %ir.arg2)
+  ; CHECK-NEXT:   renamable $r4 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur2)
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r6, killed renamable $r3
+  ; CHECK-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 1, 0, 30
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4
+  ; CHECK-NEXT:   STW killed renamable $r5, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+  ; CHECK-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+entry:
+  %arg1 = alloca ptr, align 4
+  %arg2 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %argp.cur = load ptr, ptr %arg1, align 4
+  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+  store ptr %argp.next, ptr %arg1, align 4
+  %0 = load i32, ptr %argp.cur, align 4
+  %add = add nsw i32 %0, %a
+  %argp.cur2 = load ptr, ptr %arg2, align 4
+  %argp.next3 = getelementptr inbounds i8, ptr %argp.cur2, i32 4
+  store ptr %argp.next3, ptr %arg2, align 4
+  %1 = load i32, ptr %argp.cur2, align 4
+  %mul = shl i32 %1, 1
+  %add4 = add nsw i32 %add, %mul
+  call void @llvm.va_end(ptr nonnull %arg1)
+  call void @llvm.va_end(ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %arg1)
+  ret i32 %add4
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare void @llvm.va_start(ptr)
+declare void @llvm.va_copy(ptr, ptr)
+declare void @llvm.va_end(ptr)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+
+define i32 @int_stack_va_arg(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, ...) local_unnamed_addr {
+  ; CHECK-LABEL: name: int_stack_va_arg
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r11 = LI 4
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r4, killed renamable $r3
+  ; CHECK-NEXT:   renamable $r4 = ADDI %fixed-stack.0, 0
+  ; CHECK-NEXT:   renamable $r4 = RLWIMI killed renamable $r4, killed renamable $r11, 0, 29, 29
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r5
+  ; CHECK-NEXT:   STW killed renamable $r4, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r6
+  ; CHECK-NEXT:   renamable $r4 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur9, align 8)
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r7
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r8
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r9
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r10
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r4
+  ; CHECK-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 1, 0, 30
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4
+  ; CHECK-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+entry:
+  %arg1 = alloca ptr, align 4
+  %arg2 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %add = add nsw i32 %two, %one
+  %add2 = add nsw i32 %add, %three
+  %add3 = add nsw i32 %add2, %four
+  %add4 = add nsw i32 %add3, %five
+  %add5 = add nsw i32 %add4, %six
+  %add6 = add nsw i32 %add5, %seven
+  %add7 = add nsw i32 %add6, %eight
+  %argp.cur = load ptr, ptr %arg1, align 4
+  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+  store ptr %argp.next, ptr %arg1, align 4
+  %0 = load i32, ptr %argp.cur, align 4
+  %add8 = add nsw i32 %add7, %0
+  %argp.cur9 = load ptr, ptr %arg2, align 4
+  %argp.next10 = getelementptr inbounds i8, ptr %argp.cur9, i32 4
+  store ptr %argp.next10, ptr %arg2, align 4
+  %1 = load i32, ptr %argp.cur9, align 4
+  %mul = shl i32 %1, 1
+  %add11 = add nsw i32 %add8, %mul
+  call void @llvm.va_end(ptr nonnull %arg1)
+  call void @llvm.va_end(ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %arg1)
+  ret i32 %add11
+}
+
+define double @double_va_arg(double %a, ...) local_unnamed_addr  {
+  ; CHECK-LABEL: name: double_va_arg
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $f1, $r5, $r6, $r7, $r8, $r9, $r10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r3 = ADDI %fixed-stack.0, 0
+  ; CHECK-NEXT:   STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32), align 8)
+  ; CHECK-NEXT:   STW renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
+  ; CHECK-NEXT:   STW renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+  ; CHECK-NEXT:   STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32))
+  ; CHECK-NEXT:   STW killed renamable $r9, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 16)
+  ; CHECK-NEXT:   STW killed renamable $r10, 20, %fixed-stack.0 :: (store (s32))
+  ; CHECK-NEXT:   STW renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
+  ; CHECK-NEXT:   STW killed renamable $r3, 0, %stack.1.arg2 :: (store (s32) into %ir.arg2)
+  ; CHECK-NEXT:   STW renamable $r5, 0, %stack.2 :: (store (s32) into %stack.2, align 8)
+  ; CHECK-NEXT:   STW renamable $r6, 4, %stack.2 :: (store (s32) into %stack.2 + 4)
+  ; CHECK-NEXT:   renamable $f0 = LFD 0, %stack.2 :: (load (s64) from %stack.2)
+  ; CHECK-NEXT:   STW killed renamable $r5, 0, %stack.3 :: (store (s32) into %stack.3, align 8)
+  ; CHECK-NEXT:   STW killed renamable $r6, 4, %stack.3 :: (store (s32) into %stack.3 + 4)
+  ; CHECK-NEXT:   renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3)
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
+  ; CHECK-NEXT:   BLR implicit $lr, implicit $rm, implicit $f1
+entry:
+  %arg1 = alloca ptr, align 4
+  %arg2 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %argp.cur = load ptr, ptr %arg1, align 4
+  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 8
+  store ptr %argp.next, ptr %arg1, align 4
+  %0 = load double, ptr %argp.cur, align 4
+  %add = fadd double %0, %a
+  %argp.cur2 = load ptr, ptr %arg2, align 4
+  %argp.next3 = getelementptr inbounds i8, ptr %argp.cur2, i32 8
+  store ptr %argp.next3, ptr %arg2, align 4
+  %1 = load double, ptr %argp.cur2, align 4
+  %mul = fmul double %1, 2.000000e+00
+  %add4 = fadd double %add, %mul
+  call void @llvm.va_end(ptr nonnull %arg1)
+  call void @llvm.va_end(ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %arg1)
+  ret double %add4
+}
+
+define double @double_stack_va_arg(double %one, double %two, double %three, double %four, double %five, double %six, double %seven, double %eight, double %nine, double %ten, double %eleven, double %twelve, double %thirteen, ...) local_unnamed_addr  {
+  ; CHECK-LABEL: name: double_stack_va_arg
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r3 = ADDI %fixed-stack.0, 0
+  ; CHECK-NEXT:   STW killed renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
+  ; CHECK-NEXT:   renamable $r3 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142, align 16)
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm
+  ; CHECK-NEXT:   STW renamable $r3, 0, %stack.2 :: (store (s32) into %stack.2, align 8)
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f4, implicit $rm
+  ; CHECK-NEXT:   renamable $r4 = LWZ 4, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142 + 4)
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f5, implicit $rm
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f6, implicit $rm
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f7, implicit $rm
+  ; CHECK-NEXT:   STW renamable $r4, 4, %stack.2 :: (store (s32) into %stack.2 + 4)
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f8, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = LFD 0, %stack.2 :: (load (s64) from %stack.2)
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f9, implicit $rm
+  ; CHECK-NEXT:   STW killed renamable $r3, 0, %stack.3 :: (store (s32) into %stack.3, align 8)
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f10, implicit $rm
+  ; CHECK-NEXT:   STW killed renamable $r4, 4, %stack.3 :: (store (s32) into %stack.3 + 4)
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f11, implicit $rm
+  ; CHECK-NEXT:   renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3)
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f12, implicit $rm
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f13, implicit $rm
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
+  ; CHECK-NEXT:   BLR implicit $lr, implicit $rm, implicit $f1
+entry:
+  %arg1 = alloca ptr, align 4
+  %arg2 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %add = fadd double %one, %two
+  %add2 = fadd double %add, %three
+  %add3 = fadd double %add2, %four
+  %add4 = fadd double %add3, %five
+  %add5 = fadd double %add4, %six
+  %add6 = fadd double %add5, %seven
+  %add7 = fadd double %add6, %eight
+  %add8 = fadd double %add7, %nine
+  %add9 = fadd double %add8, %ten
+  %add10 = fadd double %add9, %eleven
+  %add11 = fadd double %add10, %twelve
+  %add12 = fadd double %add11, %thirteen
+  %argp.cur1 = load ptr, ptr %arg1, align 4
+  %0 = load double, ptr %argp.cur1, align 4
+  %add13 = fadd double %add12, %0
+  %argp.cur142 = load ptr, ptr %arg2, align 4
+  %1 = load double, ptr %argp.cur142, align 4
+  %mul = fmul double %1, 2.000000e+00
+  %add16 = fadd double %add13, %mul
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %arg1)
+  ret double %add16
+}
diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll
index 3c1b28a4eff1..6ec56ffe3e25 100644
--- a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll
+++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll
@@ -1,29 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O2 -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff < %s | FileCheck --check-prefix=ASM32 %s
-; RUN: llc -O2 -mtriple powerpc-ibm-aix-xcoff -mcpu=ppc -stop-after=machine-cp -verify-machineinstrs < %s | FileCheck --check-prefix=32BIT %s
+; RUN: llc -O2 -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff < %s | FileCheck %s
 
 define i32 @int_va_arg(i32 %a, ...) local_unnamed_addr  {
-; ASM32-LABEL: int_va_arg:
-; ASM32:       # %bb.0: # %entry
-; ASM32-NEXT:    addi 11, 1, 28
-; ASM32-NEXT:    stw 4, 28(1)
-; ASM32-NEXT:    addi 4, 1, 32
-; ASM32-NEXT:    stw 6, 36(1)
-; ASM32-NEXT:    stw 11, -4(1)
-; ASM32-NEXT:    stw 11, -8(1)
-; ASM32-NEXT:    stw 4, -4(1)
-; ASM32-NEXT:    lwz 6, 28(1)
-; ASM32-NEXT:    stw 4, -8(1)
-; ASM32-NEXT:    add 3, 6, 3
-; ASM32-NEXT:    lwz 4, 28(1)
-; ASM32-NEXT:    slwi 4, 4, 1
-; ASM32-NEXT:    stw 7, 40(1)
-; ASM32-NEXT:    add 3, 3, 4
-; ASM32-NEXT:    stw 8, 44(1)
-; ASM32-NEXT:    stw 9, 48(1)
-; ASM32-NEXT:    stw 10, 52(1)
-; ASM32-NEXT:    stw 5, 32(1)
-; ASM32-NEXT:    blr
+; CHECK-LABEL: int_va_arg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi 11, 1, 28
+; CHECK-NEXT:    stw 4, 28(1)
+; CHECK-NEXT:    addi 4, 1, 32
+; CHECK-NEXT:    stw 6, 36(1)
+; CHECK-NEXT:    stw 11, -4(1)
+; CHECK-NEXT:    stw 11, -8(1)
+; CHECK-NEXT:    stw 4, -4(1)
+; CHECK-NEXT:    lwz 6, 28(1)
+; CHECK-NEXT:    stw 4, -8(1)
+; CHECK-NEXT:    add 3, 6, 3
+; CHECK-NEXT:    lwz 4, 28(1)
+; CHECK-NEXT:    slwi 4, 4, 1
+; CHECK-NEXT:    stw 7, 40(1)
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    stw 8, 44(1)
+; CHECK-NEXT:    stw 9, 48(1)
+; CHECK-NEXT:    stw 10, 52(1)
+; CHECK-NEXT:    stw 5, 32(1)
+; CHECK-NEXT:    blr
 entry:
   %arg1 = alloca ptr, align 4
   %arg2 = alloca ptr, align 4
@@ -49,45 +48,6 @@ entry:
   ret i32 %add4
 }
 
-; 32BIT-LABEL:   name:            int_va_arg
-; 32BIT-LABEL:   liveins:
-; 32BIT-DAG:     - { reg: '$r3', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r4', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r5', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r6', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r7', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r8', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r9', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r10', virtual-reg: '' }
-
-; 32BIT-LABEL:   fixedStack:
-; 32BIT-DAG:     - { id: 0, type: default, offset: 28, size: 4
-
-; 32BIT-LABEL:   stack:
-; 32BIT-DAG:     - { id: 0, name: arg1, type: default, offset: 0, size: 4
-; 32BIT-DAG:     - { id: 1, name: arg2, type: default, offset: 0, size: 4
-
-; 32BIT-LABEL:   body:             |
-; 32BIT-DAG:     liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-; 32BIT-DAG:     STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0)
-; 32BIT-DAG:     STW killed renamable $r5, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
-; 32BIT-DAG:     STW killed renamable $r6, 8, %fixed-stack.0 :: (store (s32))
-; 32BIT-DAG:     STW killed renamable $r7, 12, %fixed-stack.0 :: (store (s32))
-; 32BIT-DAG:     STW killed renamable $r8, 16, %fixed-stack.0 :: (store (s32))
-; 32BIT-DAG:     STW killed renamable $r9, 20, %fixed-stack.0 :: (store (s32))
-; 32BIT-DAG:     STW killed renamable $r10, 24, %fixed-stack.0 :: (store (s32))
-; 32BIT-DAG:     STW killed renamable $r4, 0, %stack.1.arg2 :: (store (s32) into %ir.arg2)
-; 32BIT-DAG:     renamable $r4 = ADDI %fixed-stack.0, 4
-; 32BIT-DAG:     STW killed renamable $r11, 0, %stack.1.arg2 :: (store (s32) into %ir.arg2)
-; 32BIT-DAG:     renamable $r11 = ADDI %fixed-stack.0, 0
-; 32BIT-DAG:     STW renamable $r11, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
-; 32BIT-DAG:     STW renamable $r4, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
-; 32BIT-DAG:     renamable $r6 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur)
-; 32BIT-DAG:     renamable $r4 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur2)
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r6, killed renamable $r3
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4
-; 32BIT-DAG:     BLR implicit $lr, implicit $rm, implicit $r3
-
 declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
 declare void @llvm.va_start(ptr)
 declare void @llvm.va_copy(ptr, ptr)
@@ -95,24 +55,24 @@ declare void @llvm.va_end(ptr)
 declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
 
 define i32 @int_stack_va_arg(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, ...) local_unnamed_addr {
-; ASM32-LABEL: int_stack_va_arg:
-; ASM32:       # %bb.0: # %entry
-; ASM32-NEXT:    add 3, 4, 3
-; ASM32-NEXT:    lwz 4, 56(1)
-; ASM32-NEXT:    li 11, 4
-; ASM32-NEXT:    add 3, 3, 5
-; ASM32-NEXT:    addi 12, 1, 56
-; ASM32-NEXT:    add 3, 3, 6
-; ASM32-NEXT:    rlwimi 12, 11, 0, 29, 29
-; ASM32-NEXT:    stw 12, -4(1)
-; ASM32-NEXT:    add 3, 3, 7
-; ASM32-NEXT:    add 3, 3, 8
-; ASM32-NEXT:    add 3, 3, 9
-; ASM32-NEXT:    add 3, 3, 10
-; ASM32-NEXT:    add 3, 3, 4
-; ASM32-NEXT:    slwi 4, 4, 1
-; ASM32-NEXT:    add 3, 3, 4
-; ASM32-NEXT:    blr
+; CHECK-LABEL: int_stack_va_arg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    add 3, 4, 3
+; CHECK-NEXT:    lwz 4, 56(1)
+; CHECK-NEXT:    li 11, 4
+; CHECK-NEXT:    add 3, 3, 5
+; CHECK-NEXT:    addi 12, 1, 56
+; CHECK-NEXT:    add 3, 3, 6
+; CHECK-NEXT:    rlwimi 12, 11, 0, 29, 29
+; CHECK-NEXT:    stw 12, -4(1)
+; CHECK-NEXT:    add 3, 3, 7
+; CHECK-NEXT:    add 3, 3, 8
+; CHECK-NEXT:    add 3, 3, 9
+; CHECK-NEXT:    add 3, 3, 10
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    slwi 4, 4, 1
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    blr
 entry:
   %arg1 = alloca ptr, align 4
   %arg2 = alloca ptr, align 4
@@ -145,63 +105,28 @@ entry:
   ret i32 %add11
 }
 
-; 32BIT-LABEL:   name:            int_stack_va_arg
-; 32BIT-LABEL:   liveins:
-; 32BIT-DAG:     - { reg: '$r3', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r4', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r5', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r6', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r7', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r8', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r9', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r10', virtual-reg: '' }
-
-; 32BIT-LABEL:   fixedStack:
-; 32BIT-DAG:     - { id: 0, type: default, offset: 56, size: 4
-
-; 32BIT-LABEL:   stack:
-; 32BIT-DAG:     - { id: 0, name: arg1, type: default, offset: 0, size: 4
-; 32BIT-DAG:     - { id: 1, name: arg2, type: default, offset: 0, size: 4
-
-; 32BIT-LABEL:   body:             |
-; 32BIT-DAG:     liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r5
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r6
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r7
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r8
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r9
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r10
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r4, killed renamable $r3
-; 32BIT-DAG:     renamable $r4 = ADDI %fixed-stack.0, 0
-; 32BIT-DAG:     STW killed renamable $r4, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
-; 32BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r4
-; 32BIT-DAG:     renamable $r4 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur9, align 8)
-; 32BIT-DAG:     renamable $r11 = LI 4
-; 32BIT-DAG:     BLR implicit $lr, implicit $rm, implicit $r3
-
 define double @double_va_arg(double %a, ...) local_unnamed_addr  {
-; ASM32-LABEL: double_va_arg:
-; ASM32:       # %bb.0: # %entry
-; ASM32-NEXT:    stw 5, -16(1)
-; ASM32-NEXT:    addi 3, 1, 32
-; ASM32-NEXT:    stw 6, -12(1)
-; ASM32-NEXT:    lfd 0, -16(1)
-; ASM32-NEXT:    stw 5, -24(1)
-; ASM32-NEXT:    fadd 0, 0, 1
-; ASM32-NEXT:    stw 6, -20(1)
-; ASM32-NEXT:    lfd 1, -24(1)
-; ASM32-NEXT:    fadd 1, 1, 1
-; ASM32-NEXT:    stw 7, 40(1)
-; ASM32-NEXT:    fadd 1, 0, 1
-; ASM32-NEXT:    stw 5, 32(1)
-; ASM32-NEXT:    stw 6, 36(1)
-; ASM32-NEXT:    stw 8, 44(1)
-; ASM32-NEXT:    stw 9, 48(1)
-; ASM32-NEXT:    stw 10, 52(1)
-; ASM32-NEXT:    stw 3, -4(1)
-; ASM32-NEXT:    stw 3, -8(1)
-; ASM32-NEXT:    blr
+; CHECK-LABEL: double_va_arg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    stw 5, -16(1)
+; CHECK-NEXT:    addi 3, 1, 32
+; CHECK-NEXT:    stw 6, -12(1)
+; CHECK-NEXT:    lfd 0, -16(1)
+; CHECK-NEXT:    stw 5, -24(1)
+; CHECK-NEXT:    fadd 0, 0, 1
+; CHECK-NEXT:    stw 6, -20(1)
+; CHECK-NEXT:    lfd 1, -24(1)
+; CHECK-NEXT:    fadd 1, 1, 1
+; CHECK-NEXT:    stw 7, 40(1)
+; CHECK-NEXT:    fadd 1, 0, 1
+; CHECK-NEXT:    stw 5, 32(1)
+; CHECK-NEXT:    stw 6, 36(1)
+; CHECK-NEXT:    stw 8, 44(1)
+; CHECK-NEXT:    stw 9, 48(1)
+; CHECK-NEXT:    stw 10, 52(1)
+; CHECK-NEXT:    stw 3, -4(1)
+; CHECK-NEXT:    stw 3, -8(1)
+; CHECK-NEXT:    blr
 entry:
   %arg1 = alloca ptr, align 4
   %arg2 = alloca ptr, align 4
@@ -227,74 +152,35 @@ entry:
   ret double %add4
 }
 
-; 32BIT-LABEL:   name:            double_va_arg
-; 32BIT-LABEL:   liveins:
-; 32BIT-DAG:     - { reg: '$f1', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r5', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r6', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r7', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r8', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r9', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$r10', virtual-reg: '' }
-
-; 32BIT-LABEL:   fixedStack:
-; 32BIT-DAG:     - { id: 0, type: default, offset: 32, size: 4
-
-; 32BIT-LABEL:   stack:
-; 32BIT-DAG:     - { id: 0, name: arg1, type: default, offset: 0, size: 4
-; 32BIT-DAG:     - { id: 1, name: arg2, type: default, offset: 0, size: 4
-
-; 32BIT-LABEL:   body:             |
-; 32BIT-DAG:     liveins: $f1, $r5, $r6, $r7, $r8, $r9, $r10
-; 32BIT-DAG:     renamable $r3 = ADDI %fixed-stack.0, 0
-; 32BIT-DAG:     STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32), align 8)
-; 32BIT-DAG:     STW renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
-; 32BIT-DAG:     STW renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
-; 32BIT-DAG:     STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32))
-; 32BIT-DAG:     STW killed renamable $r9, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 16)
-; 32BIT-DAG:     STW killed renamable $r10, 20, %fixed-stack.0 :: (store (s32))
-; 32BIT-DAG:     STW renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
-; 32BIT-DAG:     STW killed renamable $r3, 0, %stack.1.arg2 :: (store (s32) into %ir.arg2)
-; 32BIT-DAG:     STW renamable $r5, 0, %stack.2 :: (store (s32) into %stack.2, align 8)
-; 32BIT-DAG:     STW renamable $r6, 4, %stack.2 :: (store (s32) into %stack.2 + 4)
-; 32BIT-DAG:     renamable $f0 = LFD 0, %stack.2 :: (load (s64) from %stack.2)
-; 32BIT-DAG:     STW killed renamable $r5, 0, %stack.3 :: (store (s32) into %stack.3, align 8)
-; 32BIT-DAG:     STW killed renamable $r6, 4, %stack.3 :: (store (s32) into %stack.3 + 4)
-; 32BIT-DAG:     renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3)
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
-; 32BIT-DAG:     renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm
-; 32BIT-DAG:     renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
-; 32BIT-DAG:     BLR implicit $lr, implicit $rm, implicit $f1
-
 define double @double_stack_va_arg(double %one, double %two, double %three, double %four, double %five, double %six, double %seven, double %eight, double %nine, double %ten, double %eleven, double %twelve, double %thirteen, ...) local_unnamed_addr  {
-; ASM32-LABEL: double_stack_va_arg:
-; ASM32:       # %bb.0: # %entry
-; ASM32-NEXT:    fadd 0, 1, 2
-; ASM32-NEXT:    addi 3, 1, 128
-; ASM32-NEXT:    lwz 4, 132(1)
-; ASM32-NEXT:    fadd 0, 0, 3
-; ASM32-NEXT:    stw 3, -4(1)
-; ASM32-NEXT:    fadd 0, 0, 4
-; ASM32-NEXT:    lwz 3, 128(1)
-; ASM32-NEXT:    fadd 0, 0, 5
-; ASM32-NEXT:    stw 3, -16(1)
-; ASM32-NEXT:    fadd 0, 0, 6
-; ASM32-NEXT:    stw 4, -12(1)
-; ASM32-NEXT:    fadd 0, 0, 7
-; ASM32-NEXT:    lfd 1, -16(1)
-; ASM32-NEXT:    fadd 0, 0, 8
-; ASM32-NEXT:    stw 3, -24(1)
-; ASM32-NEXT:    fadd 0, 0, 9
-; ASM32-NEXT:    stw 4, -20(1)
-; ASM32-NEXT:    fadd 0, 0, 10
-; ASM32-NEXT:    fadd 0, 0, 11
-; ASM32-NEXT:    fadd 0, 0, 12
-; ASM32-NEXT:    fadd 0, 0, 13
-; ASM32-NEXT:    fadd 0, 0, 1
-; ASM32-NEXT:    lfd 1, -24(1)
-; ASM32-NEXT:    fadd 1, 1, 1
-; ASM32-NEXT:    fadd 1, 0, 1
-; ASM32-NEXT:    blr
+; CHECK-LABEL: double_stack_va_arg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fadd 0, 1, 2
+; CHECK-NEXT:    addi 3, 1, 128
+; CHECK-NEXT:    lwz 4, 132(1)
+; CHECK-NEXT:    fadd 0, 0, 3
+; CHECK-NEXT:    stw 3, -4(1)
+; CHECK-NEXT:    fadd 0, 0, 4
+; CHECK-NEXT:    lwz 3, 128(1)
+; CHECK-NEXT:    fadd 0, 0, 5
+; CHECK-NEXT:    stw 3, -16(1)
+; CHECK-NEXT:    fadd 0, 0, 6
+; CHECK-NEXT:    stw 4, -12(1)
+; CHECK-NEXT:    fadd 0, 0, 7
+; CHECK-NEXT:    lfd 1, -16(1)
+; CHECK-NEXT:    fadd 0, 0, 8
+; CHECK-NEXT:    stw 3, -24(1)
+; CHECK-NEXT:    fadd 0, 0, 9
+; CHECK-NEXT:    stw 4, -20(1)
+; CHECK-NEXT:    fadd 0, 0, 10
+; CHECK-NEXT:    fadd 0, 0, 11
+; CHECK-NEXT:    fadd 0, 0, 12
+; CHECK-NEXT:    fadd 0, 0, 13
+; CHECK-NEXT:    fadd 0, 0, 1
+; CHECK-NEXT:    lfd 1, -24(1)
+; CHECK-NEXT:    fadd 1, 1, 1
+; CHECK-NEXT:    fadd 1, 0, 1
+; CHECK-NEXT:    blr
 entry:
   %arg1 = alloca ptr, align 4
   %arg2 = alloca ptr, align 4
@@ -325,57 +211,3 @@ entry:
   call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %arg1)
   ret double %add16
 }
-
-; 32BIT-LABEL:   name:            double_stack_va_arg
-; 32BIT-LABEL:   liveins:
-; 32BIT-DAG:     - { reg: '$f1', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f2', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f3', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f4', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f5', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f6', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f7', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f8', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f9', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f10', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f11', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f12', virtual-reg: '' }
-; 32BIT-DAG:     - { reg: '$f13', virtual-reg: '' }
-
-; 32BIT-LABEL:   fixedStack:
-; 32BIT-DAG:     - { id: 0, type: default, offset: 128, size: 4
-
-; 32BIT-LABEL:   stack:
-; 32BIT-DAG:     - { id: 0, name: arg1, type: default, offset: 0, size: 4, alignment: 4,
-; 32BIT-DAG:     - { id: 1, name: arg2, type: default, offset: 0, size: 4, alignment: 4,
-; 32BIT-DAG:     - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8,
-; 32BIT-DAG:     - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8,
-
-; 32BIT-LABEL:   body:             |
-; 32BIT-DAG:     liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
-; 32BIT-DAG:     renamable $r3 = ADDI %fixed-stack.0, 0
-; 32BIT-DAG:     STW killed renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.arg1)
-; 32BIT-DAG:     renamable $r3 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142, align 16)
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm
-; 32BIT-DAG:     STW renamable $r3, 0, %stack.2 :: (store (s32) into %stack.2, align 8)
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f4, implicit $rm
-; 32BIT-DAG:     renamable $r4 = LWZ 4, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142 + 4)
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f5, implicit $rm
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f6, implicit $rm
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f7, implicit $rm
-; 32BIT-DAG:     STW renamable $r4, 4, %stack.2 :: (store (s32) into %stack.2 + 4)
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f8, implicit $rm
-; 32BIT-DAG:     renamable $f1 = LFD 0, %stack.2 :: (load (s64) from %stack.2)
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f9, implicit $rm
-; 32BIT-DAG:     STW killed renamable $r3, 0, %stack.3 :: (store (s32) into %stack.3, align 8)
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f10, implicit $rm
-; 32BIT-DAG:     STW killed renamable $r4, 4, %stack.3 :: (store (s32) into %stack.3 + 4)
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f11, implicit $rm
-; 32BIT-DAG:     renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3)
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f12, implicit $rm
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f13, implicit $rm
-; 32BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
-; 32BIT-DAG:     renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm
-; 32BIT-DAG:     renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
-; 32BIT-DAG:     BLR implicit $lr, implicit $rm, implicit $f1
diff --git a/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll
new file mode 100644
index 000000000000..4d7c6fb6fa31
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg-mir.ll
@@ -0,0 +1,192 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O2 -mtriple powerpc64-ibm-aix-xcoff -mcpu=ppc -stop-after=machine-cp -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @int_va_arg(i32 %a, ...) local_unnamed_addr  {
+  ; CHECK-LABEL: name: int_va_arg
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $x11 = ADDI8 %fixed-stack.0, 0
+  ; CHECK-NEXT:   STD killed renamable $x6, 16, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD killed renamable $x7, 24, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD killed renamable $x8, 32, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD killed renamable $x9, 40, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD killed renamable $x10, 48, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD renamable $x11, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2)
+  ; CHECK-NEXT:   renamable $x6 = LD 0, %stack.1.arg2 :: (load (s64) from %ir.arg2)
+  ; CHECK-NEXT:   renamable $x7 = disjoint ADDI8 %fixed-stack.0, 4
+  ; CHECK-NEXT:   renamable $r8 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0, align 8)
+  ; CHECK-NEXT:   renamable $x9 = ADDI8 renamable $x6, 4
+  ; CHECK-NEXT:   STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0)
+  ; CHECK-NEXT:   STD killed renamable $x5, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
+  ; CHECK-NEXT:   STD killed renamable $x11, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1)
+  ; CHECK-NEXT:   STD killed renamable $x7, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1)
+  ; CHECK-NEXT:   STD killed renamable $x9, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2)
+  ; CHECK-NEXT:   renamable $r4 = LWZ 0, killed renamable $x6 :: (load (s32))
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r8, renamable $r3, implicit killed $x3
+  ; CHECK-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 1, 0, 30
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4, implicit-def $x3
+  ; CHECK-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %arg1 = alloca ptr, align 8
+  %arg2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %0 = va_arg ptr %arg1, i32
+  %add = add nsw i32 %0, %a
+  %1 = va_arg ptr %arg2, i32
+  %mul = shl i32 %1, 1
+  %add3 = add nsw i32 %add, %mul
+  call void @llvm.va_end(ptr nonnull %arg1)
+  call void @llvm.va_end(ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
+  ret i32 %add3
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare void @llvm.va_start(ptr)
+declare void @llvm.va_copy(ptr, ptr)
+declare void @llvm.va_end(ptr)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+
+define i32 @int_stack_va_arg(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, ...) local_unnamed_addr {
+  ; CHECK-LABEL: name: int_stack_va_arg
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r11 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0, align 16)
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 renamable $r4, renamable $r3, implicit killed $x3, implicit killed $x4
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r5, implicit killed $x5
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r6, implicit killed $x6
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r7, implicit killed $x7
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r8, implicit killed $x8
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r9, implicit killed $x9
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r10, implicit killed $x10
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r11
+  ; CHECK-NEXT:   renamable $r4 = RLWINM killed renamable $r11, 1, 0, 30
+  ; CHECK-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4, implicit-def $x3
+  ; CHECK-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %arg1 = alloca ptr, align 8
+  %arg2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %add = add nsw i32 %two, %one
+  %add2 = add nsw i32 %add, %three
+  %add3 = add nsw i32 %add2, %four
+  %add4 = add nsw i32 %add3, %five
+  %add5 = add nsw i32 %add4, %six
+  %add6 = add nsw i32 %add5, %seven
+  %add7 = add nsw i32 %add6, %eight
+  %0 = va_arg ptr %arg1, i32
+  %add8 = add nsw i32 %add7, %0
+  %1 = va_arg ptr %arg2, i32
+  %mul = shl i32 %1, 1
+  %add10 = add nsw i32 %add8, %mul
+  call void @llvm.va_end(ptr nonnull %arg1)
+  call void @llvm.va_end(ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
+  ret i32 %add10
+}
+
+define double @double_va_arg(double %a, ...) local_unnamed_addr  {
+  ; CHECK-LABEL: name: double_va_arg
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $f1, $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $x3 = ADDI8 %fixed-stack.0, 0
+  ; CHECK-NEXT:   STD killed renamable $x6, 16, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD killed renamable $x7, 24, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD killed renamable $x8, 32, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD killed renamable $x9, 40, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD killed renamable $x10, 48, %fixed-stack.0 :: (store (s64))
+  ; CHECK-NEXT:   STD renamable $x3, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2)
+  ; CHECK-NEXT:   renamable $x6 = LD 0, %stack.1.arg2 :: (load (s64) from %ir.arg2)
+  ; CHECK-NEXT:   renamable $x7 = ADDI8 %fixed-stack.0, 8
+  ; CHECK-NEXT:   STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0)
+  ; CHECK-NEXT:   STD killed renamable $x3, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1)
+  ; CHECK-NEXT:   STD killed renamable $x7, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1)
+  ; CHECK-NEXT:   renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64))
+  ; CHECK-NEXT:   renamable $x3 = ADDI8 renamable $x6, 8
+  ; CHECK-NEXT:   STD killed renamable $x5, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
+  ; CHECK-NEXT:   STD killed renamable $x3, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2)
+  ; CHECK-NEXT:   renamable $f2 = LFD 0, killed renamable $x6 :: (load (s64))
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
+  ; CHECK-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $f1
+entry:
+  %arg1 = alloca ptr, align 8
+  %arg2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %0 = va_arg ptr %arg1, double
+  %add = fadd double %0, %a
+  %1 = va_arg ptr %arg2, double
+  %mul = fmul double %1, 2.000000e+00
+  %add3 = fadd double %add, %mul
+  call void @llvm.va_end(ptr nonnull %arg1)
+  call void @llvm.va_end(ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
+  ret double %add3
+}
+
+define double @double_stack_va_arg(double %one, double %two, double %three, double %four, double %five, double %six, double %seven, double %eight, double %nine, double %ten, double %eleven, double %twelve, double %thirteen, ...) local_unnamed_addr  {
+  ; CHECK-LABEL: name: double_stack_va_arg
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64))
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f5, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f6, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f7, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f8, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f9, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f10, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f11, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f12, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f13, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, renamable $f0, implicit $rm
+  ; CHECK-NEXT:   renamable $f0 = nofpexcept FADD killed renamable $f0, renamable $f0, implicit $rm
+  ; CHECK-NEXT:   renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm
+  ; CHECK-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $f1
+entry:
+  %arg1 = alloca ptr, align 8
+  %arg2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %add = fadd double %one, %two
+  %add2 = fadd double %add, %three
+  %add3 = fadd double %add2, %four
+  %add4 = fadd double %add3, %five
+  %add5 = fadd double %add4, %six
+  %add6 = fadd double %add5, %seven
+  %add7 = fadd double %add6, %eight
+  %add8 = fadd double %add7, %nine
+  %add9 = fadd double %add8, %ten
+  %add10 = fadd double %add9, %eleven
+  %add11 = fadd double %add10, %twelve
+  %add12 = fadd double %add11, %thirteen
+  %0 = va_arg ptr %arg1, double
+  %add13 = fadd double %add12, %0
+  %1 = va_arg ptr %arg2, double
+  %mul = fmul double %1, 2.000000e+00
+  %add15 = fadd double %add13, %mul
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
+  ret double %add15
+}
diff --git a/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg.ll b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg.ll
index 1b9c66ad0b23..87f46fe3aca8 100644
--- a/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg.ll
+++ b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg.ll
@@ -1,349 +1,185 @@
-; RUN: llc -O2 -mtriple powerpc64-ibm-aix-xcoff -mcpu=ppc -stop-after=machine-cp -verify-machineinstrs < %s | \
-; RUN: FileCheck --check-prefix=64BIT %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -O2 -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \
-; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s | \
-; RUN: FileCheck --check-prefix=ASM64 %s
+; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s | FileCheck %s
 
-  define i32 @int_va_arg(i32 %a, ...) local_unnamed_addr  {
-  entry:
-    %arg1 = alloca ptr, align 8
-    %arg2 = alloca ptr, align 8
-    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
-    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
-    call void @llvm.va_start(ptr nonnull %arg1)
-    call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
-    %0 = va_arg ptr %arg1, i32
-    %add = add nsw i32 %0, %a
-    %1 = va_arg ptr %arg2, i32
-    %mul = shl i32 %1, 1
-    %add3 = add nsw i32 %add, %mul
-    call void @llvm.va_end(ptr nonnull %arg1)
-    call void @llvm.va_end(ptr nonnull %arg2)
-    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
-    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
-    ret i32 %add3
-  }
+define i32 @int_va_arg(i32 %a, ...) local_unnamed_addr  {
+; CHECK-LABEL: int_va_arg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    std 4, 56(1)
+; CHECK-NEXT:    addi 4, 1, 56
+; CHECK-NEXT:    std 4, -16(1)
+; CHECK-NEXT:    std 4, -8(1)
+; CHECK-NEXT:    ld 4, -16(1)
+; CHECK-NEXT:    std 5, 64(1)
+; CHECK-NEXT:    addi 5, 1, 60
+; CHECK-NEXT:    std 5, -8(1)
+; CHECK-NEXT:    addi 5, 4, 4
+; CHECK-NEXT:    std 6, 72(1)
+; CHECK-NEXT:    std 7, 80(1)
+; CHECK-NEXT:    std 8, 88(1)
+; CHECK-NEXT:    std 9, 96(1)
+; CHECK-NEXT:    std 10, 104(1)
+; CHECK-NEXT:    std 5, -16(1)
+; CHECK-NEXT:    lwz 11, 56(1)
+; CHECK-NEXT:    lwz 4, 0(4)
+; CHECK-NEXT:    add 3, 11, 3
+; CHECK-NEXT:    slwi 4, 4, 1
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    blr
+entry:
+  %arg1 = alloca ptr, align 8
+  %arg2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %0 = va_arg ptr %arg1, i32
+  %add = add nsw i32 %0, %a
+  %1 = va_arg ptr %arg2, i32
+  %mul = shl i32 %1, 1
+  %add3 = add nsw i32 %add, %mul
+  call void @llvm.va_end(ptr nonnull %arg1)
+  call void @llvm.va_end(ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
+  ret i32 %add3
+}
 
-  declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
-  declare void @llvm.va_start(ptr)
-  declare void @llvm.va_copy(ptr, ptr)
-  declare void @llvm.va_end(ptr)
-  declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare void @llvm.va_start(ptr)
+declare void @llvm.va_copy(ptr, ptr)
+declare void @llvm.va_end(ptr)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
 
-; 64BIT-LABEL:   name:            int_va_arg
-; 64BIT-LABEL:   liveins:
-; 64BIT-DAG:     - { reg: '$x3', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x4', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x5', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x6', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x7', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x8', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x9', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x10', virtual-reg: '' }
+define i32 @int_stack_va_arg(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, ...) local_unnamed_addr {
+; CHECK-LABEL: int_stack_va_arg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    add 3, 4, 3
+; CHECK-NEXT:    lwz 11, 112(1)
+; CHECK-NEXT:    add 3, 3, 5
+; CHECK-NEXT:    add 3, 3, 6
+; CHECK-NEXT:    add 3, 3, 7
+; CHECK-NEXT:    add 3, 3, 8
+; CHECK-NEXT:    add 3, 3, 9
+; CHECK-NEXT:    add 3, 3, 10
+; CHECK-NEXT:    add 3, 3, 11
+; CHECK-NEXT:    slwi 4, 11, 1
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    blr
+entry:
+  %arg1 = alloca ptr, align 8
+  %arg2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %add = add nsw i32 %two, %one
+  %add2 = add nsw i32 %add, %three
+  %add3 = add nsw i32 %add2, %four
+  %add4 = add nsw i32 %add3, %five
+  %add5 = add nsw i32 %add4, %six
+  %add6 = add nsw i32 %add5, %seven
+  %add7 = add nsw i32 %add6, %eight
+  %0 = va_arg ptr %arg1, i32
+  %add8 = add nsw i32 %add7, %0
+  %1 = va_arg ptr %arg2, i32
+  %mul = shl i32 %1, 1
+  %add10 = add nsw i32 %add8, %mul
+  call void @llvm.va_end(ptr nonnull %arg1)
+  call void @llvm.va_end(ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
+  ret i32 %add10
+}
 
-; 64BIT-LABEL:   fixedStack:
-; 64BIT-DAG:     - { id: 0, type: default, offset: 56, size: 8
+define double @double_va_arg(double %a, ...) local_unnamed_addr  {
+; CHECK-LABEL: double_va_arg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi 3, 1, 56
+; CHECK-NEXT:    std 4, 56(1)
+; CHECK-NEXT:    std 3, -8(1)
+; CHECK-NEXT:    std 3, -16(1)
+; CHECK-NEXT:    addi 3, 1, 64
+; CHECK-NEXT:    std 3, -8(1)
+; CHECK-NEXT:    ld 3, -16(1)
+; CHECK-NEXT:    lfd 0, 56(1)
+; CHECK-NEXT:    addi 4, 3, 8
+; CHECK-NEXT:    std 5, 64(1)
+; CHECK-NEXT:    fadd 0, 0, 1
+; CHECK-NEXT:    std 6, 72(1)
+; CHECK-NEXT:    std 7, 80(1)
+; CHECK-NEXT:    std 8, 88(1)
+; CHECK-NEXT:    std 9, 96(1)
+; CHECK-NEXT:    std 10, 104(1)
+; CHECK-NEXT:    std 4, -16(1)
+; CHECK-NEXT:    lfd 1, 0(3)
+; CHECK-NEXT:    fadd 1, 1, 1
+; CHECK-NEXT:    fadd 1, 0, 1
+; CHECK-NEXT:    blr
+entry:
+  %arg1 = alloca ptr, align 8
+  %arg2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %0 = va_arg ptr %arg1, double
+  %add = fadd double %0, %a
+  %1 = va_arg ptr %arg2, double
+  %mul = fmul double %1, 2.000000e+00
+  %add3 = fadd double %add, %mul
+  call void @llvm.va_end(ptr nonnull %arg1)
+  call void @llvm.va_end(ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
+  ret double %add3
+}
 
-; 64BIT-LABEL:   stack:
-; 64BIT-DAG:     - { id: 0, name: arg1, type: default, offset: 0, size: 8
-; 64BIT-DAG:     - { id: 1, name: arg2, type: default, offset: 0, size: 8
-
-; 64BIT-LABEL:   body:             |
-; 64BIT-DAG:     bb.0.entry:
-; 64BIT-DAG:     liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
-; 64BIT-DAG:     STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0)
-; 64BIT-DAG:     STD killed renamable $x5, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
-; 64BIT-DAG:     STD killed renamable $x6, 16, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     STD killed renamable $x7, 24, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     STD killed renamable $x8, 32, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     STD killed renamable $x9, 40, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     STD killed renamable $x10, 48, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     renamable $x11 = ADDI8 %fixed-stack.0, 0
-; 64BIT-DAG:     STD renamable $x11, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2)
-; 64BIT-DAG:     renamable $x6 = LD 0, %stack.1.arg2 :: (load (s64) from %ir.arg2)
-; 64BIT-DAG:     renamable $x9 = ADDI8 renamable $x6, 4
-; 64BIT-DAG:     renamable $x7 = disjoint ADDI8 %fixed-stack.0, 4
-; 64BIT-DAG:     renamable $r8 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0, align 8)
-; 64BIT-DAG:     STD killed renamable $x11, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1)
-; 64BIT-DAG:     STD killed renamable $x7, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1)
-; 64BIT-DAG:     STD killed renamable $x9, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2)
-; 64BIT-DAG:     renamable $r4 = LWZ 0, killed renamable $x6 :: (load (s32))
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r8, renamable $r3, implicit killed $x3
-; 64BIT-DAG:     renamable $r4 = RLWINM killed renamable $r4, 1, 0, 30
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4, implicit-def $x3
-; 64BIT-DAG:     BLR8 implicit $lr8, implicit $rm, implicit $x3
-
-; ASM64-LABEL:   .int_va_arg:
-; ASM64-DAG:     std 4, 56(1)
-; ASM64-DAG:     addi 4, 1, 56
-; ASM64-DAG:     std 4, -16(1)
-; ASM64-DAG:     std 4, -8(1)
-; ASM64-DAG:     ld 4, -16(1)
-; ASM64-DAG:     std 5, 64(1)
-; ASM64-DAG:     addi 5, 1, 60
-; ASM64-DAG:     std 5, -8(1)
-; ASM64-DAG:     addi 5, 4, 4
-; ASM64-DAG:     std 6, 72(1)
-; ASM64-DAG:     std 7, 80(1)
-; ASM64-DAG:     std 8, 88(1)
-; ASM64-DAG:     std 9, 96(1)
-; ASM64-DAG:     std 10, 104(1)
-; ASM64-DAG:     std 5, -16(1)
-; ASM64-DAG:     lwz 11, 56(1)
-; ASM64-DAG:     lwz 4, 0(4)
-; ASM64-DAG:     add 3, 11, 3
-; ASM64-DAG:     slwi 4, 4, 1
-; ASM64-DAG:     add 3, 3, 4
-; ASM64-DAG:     blr
-
-  define i32 @int_stack_va_arg(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, ...) local_unnamed_addr {
-  entry:
-    %arg1 = alloca ptr, align 8
-    %arg2 = alloca ptr, align 8
-    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
-    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
-    call void @llvm.va_start(ptr nonnull %arg1)
-    call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
-    %add = add nsw i32 %two, %one
-    %add2 = add nsw i32 %add, %three
-    %add3 = add nsw i32 %add2, %four
-    %add4 = add nsw i32 %add3, %five
-    %add5 = add nsw i32 %add4, %six
-    %add6 = add nsw i32 %add5, %seven
-    %add7 = add nsw i32 %add6, %eight
-    %0 = va_arg ptr %arg1, i32
-    %add8 = add nsw i32 %add7, %0
-    %1 = va_arg ptr %arg2, i32
-    %mul = shl i32 %1, 1
-    %add10 = add nsw i32 %add8, %mul
-    call void @llvm.va_end(ptr nonnull %arg1)
-    call void @llvm.va_end(ptr nonnull %arg2)
-    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
-    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
-    ret i32 %add10
-  }
-
-; 64BIT-LABEL:    name:            int_stack_va_arg
-; 64BIT-LABEL:    liveins:
-; 64BIT-DAG:       - { reg: '$x3', virtual-reg: '' }
-; 64BIT-DAG:       - { reg: '$x4', virtual-reg: '' }
-; 64BIT-DAG:       - { reg: '$x5', virtual-reg: '' }
-; 64BIT-DAG:       - { reg: '$x6', virtual-reg: '' }
-; 64BIT-DAG:       - { reg: '$x7', virtual-reg: '' }
-; 64BIT-DAG:       - { reg: '$x8', virtual-reg: '' }
-; 64BIT-DAG:       - { reg: '$x9', virtual-reg: '' }
-; 64BIT-DAG:       - { reg: '$x10', virtual-reg: '' }
-
-; 64BIT-LABEL:   fixedStack:
-; 64BIT-DAG:     - { id: 0, type: default, offset: 112, size: 8, alignment: 16, stack-id: default,
-
-; 64BIT-LABEL:   stack:
-; 64BIT-DAG:     - { id: 0, name: arg1, type: default, offset: 0, size: 8, alignment: 8,
-; 64BIT-DAG:     - { id: 1, name: arg2, type: default, offset: 0, size: 8, alignment: 8,
-
-; 64BIT-LABEL:   body:             |
-; 64BIT-DAG:     liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10
-; 64BIT-DAG:     renamable $r11 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0, align 16)
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 renamable $r4, renamable $r3, implicit killed $x3, implicit killed $x4
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r5, implicit killed $x5
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r6, implicit killed $x6
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r7, implicit killed $x7
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r8, implicit killed $x8
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r9, implicit killed $x9
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r10, implicit killed $x10
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, renamable $r11
-; 64BIT-DAG:     renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4, implicit-def $x3
-; 64BIT-DAG:     BLR8 implicit $lr8, implicit $rm, implicit $x3
-
-; ASM64-LABEL:   .int_stack_va_arg:
-; ASM64-DAG:     add 3, 4, 3
-; ASM64-DAG:     add 3, 3, 5
-; ASM64-DAG:     add 3, 3, 6
-; ASM64-DAG:     add 3, 3, 7
-; ASM64-DAG:     add 3, 3, 8
-; ASM64-DAG:     add 3, 3, 9
-; ASM64-DAG:     add 3, 3, 10
-; ASM64-DAG:     lwz 11, 112(1)
-; ASM64-DAG:     slwi 4, 11, 1
-; ASM64-DAG:     add 3, 3, 11
-; ASM64-DAG:     add 3, 3, 4
-; ASM64-DAG:     blr
-
-  define double @double_va_arg(double %a, ...) local_unnamed_addr  {
-  entry:
-    %arg1 = alloca ptr, align 8
-    %arg2 = alloca ptr, align 8
-    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
-    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
-    call void @llvm.va_start(ptr nonnull %arg1)
-    call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
-    %0 = va_arg ptr %arg1, double
-    %add = fadd double %0, %a
-    %1 = va_arg ptr %arg2, double
-    %mul = fmul double %1, 2.000000e+00
-    %add3 = fadd double %add, %mul
-    call void @llvm.va_end(ptr nonnull %arg1)
-    call void @llvm.va_end(ptr nonnull %arg2)
-    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
-    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
-    ret double %add3
-  }
-
-; 64BIT-LABEL:   name:            double_va_arg
-; 64BIT-LABEL:   liveins:
-; 64BIT-DAG:     - { reg: '$f1', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x4', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x5', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x6', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x7', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x8', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x9', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$x10', virtual-reg: '' }
-
-; 64BIT-LABEL:   fixedStack:
-; 64BIT-DAG:     - { id: 0, type: default, offset: 56, size: 8
-
-; 64BIT-LABEL:   stack:
-; 64BIT-DAG:     - { id: 0, name: arg1, type: default, offset: 0, size: 8
-; 64BIT-DAG:     - { id: 1, name: arg2, type: default, offset: 0, size: 8
-
-; 64BIT-LABEL:   body:             |
-; 64BIT-DAG:     liveins: $f1, $x4, $x5, $x6, $x7, $x8, $x9, $x10
-; 64BIT-DAG:     renamable $x3 = ADDI8 %fixed-stack.0, 0
-; 64BIT-DAG:     STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0)
-; 64BIT-DAG:     STD killed renamable $x5, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
-; 64BIT-DAG:     STD killed renamable $x6, 16, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     STD killed renamable $x7, 24, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     STD killed renamable $x8, 32, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     STD killed renamable $x9, 40, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     STD killed renamable $x10, 48, %fixed-stack.0 :: (store (s64))
-; 64BIT-DAG:     STD renamable $x3, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2)
-; 64BIT-DAG:     renamable $x6 = LD 0, %stack.1.arg2 :: (load (s64) from %ir.arg2)
-; 64BIT-DAG:     renamable $x7 = ADDI8 %fixed-stack.0, 8
-; 64BIT-DAG:     STD killed renamable $x3, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1)
-; 64BIT-DAG:     STD killed renamable $x7, 0, %stack.0.arg1 :: (store (s64) into %ir.arg1)
-; 64BIT-DAG:     renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64))
-; 64BIT-DAG:     renamable $x3 = ADDI8 renamable $x6, 8
-; 64BIT-DAG:     STD killed renamable $x3, 0, %stack.1.arg2 :: (store (s64) into %ir.arg2)
-; 64BIT-DAG:     renamable $f2 = LFD 0, killed renamable $x6 :: (load (s64))
-; 64BIT-DAG:     renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
-; 64BIT-DAG:     renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm
-; 64BIT-DAG:     renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm
-; 64BIT-DAG:     BLR8 implicit $lr8, implicit $rm, implicit $f1
-
-; ASM64-LABEL:  .double_va_arg:
-; ASM64-DAG:    addi 3, 1, 56
-; ASM64-DAG:    std 4, 56(1)
-; ASM64-DAG:    std 3, -8(1)
-; ASM64-DAG:    std 3, -16(1)
-; ASM64-DAG:    addi 3, 1, 64
-; ASM64-DAG:    std 3, -8(1)
-; ASM64-DAG:    ld 3, -16(1)
-; ASM64-DAG:    lfd 0, 56(1)
-; ASM64-DAG:    addi 4, 3, 8
-; ASM64-DAG:    std 5, 64(1)
-; ASM64-DAG:    fadd 0, 0, 1
-; ASM64-DAG:    std 6, 72(1)
-; ASM64-DAG:    std 7, 80(1)
-; ASM64-DAG:    std 8, 88(1)
-; ASM64-DAG:    std 9, 96(1)
-; ASM64-DAG:    std 10, 104(1)
-; ASM64-DAG:    std 4, -16(1)
-; ASM64-DAG:    lfd 1, 0(3)
-; ASM64-DAG:    fadd 1, 1, 1
-; ASM64-DAG:    fadd 1, 0, 1
-; ASM64-DAG:    blr
-
-  define double @double_stack_va_arg(double %one, double %two, double %three, double %four, double %five, double %six, double %seven, double %eight, double %nine, double %ten, double %eleven, double %twelve, double %thirteen, ...) local_unnamed_addr  {
-  entry:
-    %arg1 = alloca ptr, align 8
-    %arg2 = alloca ptr, align 8
-    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
-    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
-    call void @llvm.va_start(ptr nonnull %arg1)
-    call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
-    %add = fadd double %one, %two
-    %add2 = fadd double %add, %three
-    %add3 = fadd double %add2, %four
-    %add4 = fadd double %add3, %five
-    %add5 = fadd double %add4, %six
-    %add6 = fadd double %add5, %seven
-    %add7 = fadd double %add6, %eight
-    %add8 = fadd double %add7, %nine
-    %add9 = fadd double %add8, %ten
-    %add10 = fadd double %add9, %eleven
-    %add11 = fadd double %add10, %twelve
-    %add12 = fadd double %add11, %thirteen
-    %0 = va_arg ptr %arg1, double
-    %add13 = fadd double %add12, %0
-    %1 = va_arg ptr %arg2, double
-    %mul = fmul double %1, 2.000000e+00
-    %add15 = fadd double %add13, %mul
-    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
-    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
-    ret double %add15
-  }
-
-
-; 64BIT-LABEL:   name:            double_stack_va_arg
-; 64BIT-LABEL:   liveins:
-; 64BIT-DAG:     - { reg: '$f1', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f2', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f3', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f4', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f5', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f6', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f7', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f8', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f9', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f10', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f11', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f12', virtual-reg: '' }
-; 64BIT-DAG:     - { reg: '$f13', virtual-reg: '' }
-
-; 64BIT-LABEL:   fixedStack:
-; 64BIT-DAG:       - { id: 0, type: default, offset: 152, size: 8
-
-; 64BIT-LABEL:   stack:
-; 64BIT-DAG:       - { id: 0, name: arg1, type: default, offset: 0, size: 8
-; 64BIT-DAG:       - { id: 1, name: arg2, type: default, offset: 0, size: 8
-
-; 64BIT-LABEL:     body:             |
-; 64BIT-DAG:       liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13
-; 64BIT-DAG:       renamable $f0 = LFD 0, %fixed-stack.0 :: (load (s64))
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f3, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f4, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f5, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f6, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f7, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f8, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f9, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f10, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f11, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f12, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f13, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, renamable $f0, implicit $rm
-; 64BIT-DAG:       renamable $f0 = nofpexcept FADD killed renamable $f0, renamable $f0, implicit $rm
-; 64BIT-DAG:       renamable $f1 = nofpexcept FADD killed renamable $f1, killed renamable $f0, implicit $rm
-; 64BIT-DAG:       BLR8 implicit $lr8, implicit $rm, implicit $f1
-
-; ASM64-LABEL:   .double_stack_va_arg:
-; ASM64-DAG:     fadd 1, 1, 2
-; ASM64-DAG:     fadd 1, 1, 3
-; ASM64-DAG:     fadd 1, 1, 4
-; ASM64-DAG:     fadd 1, 1, 5
-; ASM64-DAG:     fadd 1, 1, 6
-; ASM64-DAG:     fadd 1, 1, 7
-; ASM64-DAG:     fadd 1, 1, 8
-; ASM64-DAG:     fadd 1, 1, 9
-; ASM64-DAG:     fadd 1, 1, 10
-; ASM64-DAG:     fadd 1, 1, 11
-; ASM64-DAG:     fadd 1, 1, 12
-; ASM64-DAG:     fadd 1, 1, 13
-; ASM64-DAG:     lfd 0, 152(1)
-; ASM64-DAG:     fadd 1, 1, 0
-; ASM64-DAG:     fadd 0, 0, 0
-; ASM64-DAG:     fadd 1, 1, 0
-; ASM64-DAG:     blr
+define double @double_stack_va_arg(double %one, double %two, double %three, double %four, double %five, double %six, double %seven, double %eight, double %nine, double %ten, double %eleven, double %twelve, double %thirteen, ...) local_unnamed_addr  {
+; CHECK-LABEL: double_stack_va_arg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fadd 1, 1, 2
+; CHECK-NEXT:    lfd 0, 152(1)
+; CHECK-NEXT:    fadd 1, 1, 3
+; CHECK-NEXT:    fadd 1, 1, 4
+; CHECK-NEXT:    fadd 1, 1, 5
+; CHECK-NEXT:    fadd 1, 1, 6
+; CHECK-NEXT:    fadd 1, 1, 7
+; CHECK-NEXT:    fadd 1, 1, 8
+; CHECK-NEXT:    fadd 1, 1, 9
+; CHECK-NEXT:    fadd 1, 1, 10
+; CHECK-NEXT:    fadd 1, 1, 11
+; CHECK-NEXT:    fadd 1, 1, 12
+; CHECK-NEXT:    fadd 1, 1, 13
+; CHECK-NEXT:    fadd 1, 1, 0
+; CHECK-NEXT:    fadd 0, 0, 0
+; CHECK-NEXT:    fadd 1, 1, 0
+; CHECK-NEXT:    blr
+entry:
+  %arg1 = alloca ptr, align 8
+  %arg2 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg1)
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.va_start(ptr nonnull %arg1)
+  call void @llvm.va_copy(ptr nonnull %arg2, ptr nonnull %arg1)
+  %add = fadd double %one, %two
+  %add2 = fadd double %add, %three
+  %add3 = fadd double %add2, %four
+  %add4 = fadd double %add3, %five
+  %add5 = fadd double %add4, %six
+  %add6 = fadd double %add5, %seven
+  %add7 = fadd double %add6, %eight
+  %add8 = fadd double %add7, %nine
+  %add9 = fadd double %add8, %ten
+  %add10 = fadd double %add9, %eleven
+  %add11 = fadd double %add10, %twelve
+  %add12 = fadd double %add11, %thirteen
+  %0 = va_arg ptr %arg1, double
+  %add13 = fadd double %add12, %0
+  %1 = va_arg ptr %arg2, double
+  %mul = fmul double %1, 2.000000e+00
+  %add15 = fadd double %add13, %mul
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg2)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %arg1)
+  ret double %add15
+}

From acde20b5605f3a3a8da2217e4526fc045e6603ed Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Wed, 18 Jun 2025 06:39:52 -0400
Subject: [PATCH 0789/1322] [HLSL][SPIRV] Add vk::constant_id attribute.
 (#143544)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The vk::constant_id attribute is used to indicate that a global const
variable
represents a specialization constant in SPIR-V. This PR adds this
attribute to clang.

The documentation for the attribute is
[here](https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/SPIR-V.rst#specialization-constants).

The strategy is to to modify the initializer to get the value of a
specialize constant for a builtin defined in the SPIR-V backend.

Implements https://github.com/llvm/wg-hlsl/pull/287

Fixes https://github.com/llvm/llvm-project/issues/142448

---------

Co-authored-by: Nathan Gauër <github@keenuts.net>
---
 clang/include/clang/Basic/Attr.td             |   8 +
 clang/include/clang/Basic/AttrDocs.td         |  15 ++
 clang/include/clang/Basic/Builtins.td         |  13 ++
 .../clang/Basic/DiagnosticSemaKinds.td        |   4 +
 clang/include/clang/Sema/SemaHLSL.h           |   5 +-
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          |  74 ++++++
 clang/lib/CodeGen/CodeGenFunction.h           |   6 +
 clang/lib/Sema/SemaDecl.cpp                   |  13 ++
 clang/lib/Sema/SemaDeclAttr.cpp               |   3 +
 clang/lib/Sema/SemaHLSL.cpp                   | 120 +++++++++-
 .../test/AST/HLSL/vk.spec-constant.usage.hlsl | 130 +++++++++++
 .../SpirvType.alignment.hlsl                  |   0
 .../SpirvType.hlsl                            |   0
 .../vk-features/vk.spec-constant.hlsl         | 210 ++++++++++++++++++
 .../test/SemaHLSL/vk.spec-constant.error.hlsl |  37 +++
 15 files changed, 636 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
 rename clang/test/CodeGenHLSL/{inline-spirv => vk-features}/SpirvType.alignment.hlsl (100%)
 rename clang/test/CodeGenHLSL/{inline-spirv => vk-features}/SpirvType.hlsl (100%)
 create mode 100644 clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
 create mode 100644 clang/test/SemaHLSL/vk.spec-constant.error.hlsl

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index f113cd2ba2fb..27fea7dea0a5 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -5023,6 +5023,14 @@ def HLSLVkExtBuiltinInput : InheritableAttr {
   let Documentation = [HLSLVkExtBuiltinInputDocs];
 }
 
+def HLSLVkConstantId : InheritableAttr {
+  let Spellings = [CXX11<"vk", "constant_id">];
+  let Args = [IntArgument<"Id">];
+  let Subjects = SubjectList<[ExternalGlobalVar]>;
+  let LangOpts = [HLSL];
+  let Documentation = [VkConstantIdDocs];
+}
+
 def RandomizeLayout : InheritableAttr {
   let Spellings = [GCC<"randomize_layout">];
   let Subjects = SubjectList<[Record]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 6051e1fc4511..43442f177ab7 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -8252,6 +8252,21 @@ and https://microsoft.github.io/hlsl-specs/proposals/0013-wave-size-range.html
   }];
 }
 
+def VkConstantIdDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``vk::constant_id`` attribute specifies the id for a SPIR-V specialization
+constant. The attribute applies to const global scalar variables. The variable must be initialized with a C++11 constexpr.
+In SPIR-V, the
+variable will be replaced with an `OpSpecConstant` with the given id.
+The syntax is:
+
+.. code-block:: text
+
+  ``[[vk::constant_id(<Id>)]] const T Name = <Init>``
+}];
+}
+
 def RootSignatureDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 68cd3d790e78..d65b3a5d2f44 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -5065,6 +5065,19 @@ def HLSLGroupMemoryBarrierWithGroupSync: LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void()";
 }
 
+class HLSLScalarTemplate
+    : Template<["bool", "char", "short", "int", "long long int",
+                "unsigned short", "unsigned int", "unsigned long long int",
+                "__fp16", "float", "double"],
+               ["_bool", "_char", "_short", "_int", "_longlong", "_ushort",
+                "_uint", "_ulonglong", "_half", "_float", "_double"]>;
+
+def HLSLGetSpirvSpecConstant : LangBuiltin<"HLSL_LANG">, HLSLScalarTemplate {
+  let Spellings = ["__builtin_get_spirv_spec_constant"];
+  let Attributes = [NoThrow, Const, Pure];
+  let Prototype = "T(unsigned int, T)";
+}
+
 // Builtins for XRay.
 def XRayCustomEvent : Builtin {
   let Spellings = ["__xray_customevent"];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 979ff60b73b7..34b798a09c21 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12927,6 +12927,10 @@ def err_spirv_enum_not_int : Error<
 def err_spirv_enum_not_valid : Error<
    "invalid value for %select{storage class}0 argument">;
 
+def err_specialization_const
+    : Error<"variable with 'vk::constant_id' attribute must be a const "
+            "int/float/enum/bool and be initialized with a literal">;
+
 // errors of expect.with.probability
 def err_probability_not_constant_float : Error<
    "probability argument to __builtin_expect_with_probability must be constant "
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index 33c4b8d1568b..97091792ba23 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -98,6 +98,8 @@ public:
   HLSLWaveSizeAttr *mergeWaveSizeAttr(Decl *D, const AttributeCommonInfo &AL,
                                       int Min, int Max, int Preferred,
                                       int SpelledArgsCount);
+  HLSLVkConstantIdAttr *
+  mergeVkConstantIdAttr(Decl *D, const AttributeCommonInfo &AL, int Id);
   HLSLShaderAttr *mergeShaderAttr(Decl *D, const AttributeCommonInfo &AL,
                                   llvm::Triple::EnvironmentType ShaderType);
   HLSLParamModifierAttr *
@@ -135,6 +137,7 @@ public:
   void handleRootSignatureAttr(Decl *D, const ParsedAttr &AL);
   void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL);
   void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL);
+  void handleVkConstantIdAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_GroupThreadIDAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_GroupIDAttr(Decl *D, const ParsedAttr &AL);
@@ -171,7 +174,7 @@ public:
   QualType getInoutParameterType(QualType Ty);
 
   bool transformInitList(const InitializedEntity &Entity, InitListExpr *Init);
-
+  bool handleInitialization(VarDecl *VDecl, Expr *&Init);
   void deduceAddressSpace(VarDecl *Decl);
 
 private:
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index ccf45c0c6ff1..cbc5ef9cb0d5 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -12,6 +12,7 @@
 
 #include "CGBuiltin.h"
 #include "CGHLSLRuntime.h"
+#include "CodeGenFunction.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -214,6 +215,43 @@ static Intrinsic::ID getWaveActiveMaxIntrinsic(llvm::Triple::ArchType Arch,
   }
 }
 
+// Returns the mangled name for a builtin function that the SPIR-V backend
+// will expand into a spec Constant.
+static std::string getSpecConstantFunctionName(clang::QualType SpecConstantType,
+                                               ASTContext &Context) {
+  // The parameter types for our conceptual intrinsic function.
+  QualType ClangParamTypes[] = {Context.IntTy, SpecConstantType};
+
+  // Create a temporary FunctionDecl for the builtin fuction. It won't be
+  // added to the AST.
+  FunctionProtoType::ExtProtoInfo EPI;
+  QualType FnType =
+      Context.getFunctionType(SpecConstantType, ClangParamTypes, EPI);
+  DeclarationName FuncName = &Context.Idents.get("__spirv_SpecConstant");
+  FunctionDecl *FnDeclForMangling = FunctionDecl::Create(
+      Context, Context.getTranslationUnitDecl(), SourceLocation(),
+      SourceLocation(), FuncName, FnType, /*TSI=*/nullptr, SC_Extern);
+
+  // Attach the created parameter declarations to the function declaration.
+  SmallVector<ParmVarDecl *, 2> ParamDecls;
+  for (QualType ParamType : ClangParamTypes) {
+    ParmVarDecl *PD = ParmVarDecl::Create(
+        Context, FnDeclForMangling, SourceLocation(), SourceLocation(),
+        /*IdentifierInfo*/ nullptr, ParamType, /*TSI*/ nullptr, SC_None,
+        /*DefaultArg*/ nullptr);
+    ParamDecls.push_back(PD);
+  }
+  FnDeclForMangling->setParams(ParamDecls);
+
+  // Get the mangled name.
+  std::string Name;
+  llvm::raw_string_ostream MangledNameStream(Name);
+  MangleContext *Mangler = Context.createMangleContext();
+  Mangler->mangleName(FnDeclForMangling, MangledNameStream);
+  MangledNameStream.flush();
+  return Name;
+}
+
 Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
                                             const CallExpr *E,
                                             ReturnValueSlot ReturnValue) {
@@ -773,6 +811,42 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     return EmitRuntimeCall(
         Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID));
   }
+  case Builtin::BI__builtin_get_spirv_spec_constant_bool:
+  case Builtin::BI__builtin_get_spirv_spec_constant_short:
+  case Builtin::BI__builtin_get_spirv_spec_constant_ushort:
+  case Builtin::BI__builtin_get_spirv_spec_constant_int:
+  case Builtin::BI__builtin_get_spirv_spec_constant_uint:
+  case Builtin::BI__builtin_get_spirv_spec_constant_longlong:
+  case Builtin::BI__builtin_get_spirv_spec_constant_ulonglong:
+  case Builtin::BI__builtin_get_spirv_spec_constant_half:
+  case Builtin::BI__builtin_get_spirv_spec_constant_float:
+  case Builtin::BI__builtin_get_spirv_spec_constant_double: {
+    llvm::Function *SpecConstantFn = getSpecConstantFunction(E->getType());
+    llvm::Value *SpecId = EmitScalarExpr(E->getArg(0));
+    llvm::Value *DefaultVal = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Args[] = {SpecId, DefaultVal};
+    return Builder.CreateCall(SpecConstantFn, Args);
+  }
   }
   return nullptr;
 }
+
+llvm::Function *clang::CodeGen::CodeGenFunction::getSpecConstantFunction(
+    const clang::QualType &SpecConstantType) {
+
+  // Find or create the declaration for the function.
+  llvm::Module *M = &CGM.getModule();
+  std::string MangledName =
+      getSpecConstantFunctionName(SpecConstantType, getContext());
+  llvm::Function *SpecConstantFn = M->getFunction(MangledName);
+
+  if (!SpecConstantFn) {
+    llvm::Type *IntType = ConvertType(getContext().IntTy);
+    llvm::Type *RetTy = ConvertType(SpecConstantType);
+    llvm::Type *ArgTypes[] = {IntType, RetTy};
+    llvm::FunctionType *FnTy = llvm::FunctionType::get(RetTy, ArgTypes, false);
+    SpecConstantFn = llvm::Function::Create(
+        FnTy, llvm::GlobalValue::ExternalLinkage, MangledName, M);
+  }
+  return SpecConstantFn;
+}
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index a5ab9df01dba..59f14b3e35fd 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4850,6 +4850,12 @@ public:
   llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitHLSLBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
                                    ReturnValueSlot ReturnValue);
+
+  // Returns a builtin function that the SPIR-V backend will expand into a spec
+  // constant.
+  llvm::Function *
+  getSpecConstantFunction(const clang::QualType &SpecConstantType);
+
   llvm::Value *EmitDirectXBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitSPIRVBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitScalarOrConstFoldImmArg(unsigned ICEArguments, unsigned Idx,
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 1bf72e5bb7b9..e1cccf068b5a 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -2890,6 +2890,8 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
     NewAttr = S.HLSL().mergeWaveSizeAttr(D, *WS, WS->getMin(), WS->getMax(),
                                          WS->getPreferred(),
                                          WS->getSpelledArgsCount());
+  else if (const auto *CI = dyn_cast<HLSLVkConstantIdAttr>(Attr))
+    NewAttr = S.HLSL().mergeVkConstantIdAttr(D, *CI, CI->getId());
   else if (const auto *SA = dyn_cast<HLSLShaderAttr>(Attr))
     NewAttr = S.HLSL().mergeShaderAttr(D, *SA, SA->getType());
   else if (isa<SuppressAttr>(Attr))
@@ -13757,6 +13759,10 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
     return;
   }
 
+  if (getLangOpts().HLSL)
+    if (!HLSL().handleInitialization(VDecl, Init))
+      return;
+
   // Get the decls type and save a reference for later, since
   // CheckInitializerTypes may change it.
   QualType DclT = VDecl->getType(), SavT = DclT;
@@ -14179,6 +14185,13 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
       }
     }
 
+    // HLSL variable with the `vk::constant_id` attribute must be initialized.
+    if (!Var->isInvalidDecl() && Var->hasAttr<HLSLVkConstantIdAttr>()) {
+      Diag(Var->getLocation(), diag::err_specialization_const);
+      Var->setInvalidDecl();
+      return;
+    }
+
     if (!Var->isInvalidDecl() && RealDecl->hasAttr<LoaderUninitializedAttr>()) {
       if (Var->getStorageClass() == SC_Extern) {
         Diag(Var->getLocation(), diag::err_loader_uninitialized_extern_decl)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 1c2fa80e782d..eba29e609cb0 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7590,6 +7590,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_HLSLVkExtBuiltinInput:
     S.HLSL().handleVkExtBuiltinInputAttr(D, AL);
     break;
+  case ParsedAttr::AT_HLSLVkConstantId:
+    S.HLSL().handleVkConstantIdAttr(D, AL);
+    break;
   case ParsedAttr::AT_HLSLSV_GroupThreadID:
     S.HLSL().handleSV_GroupThreadIDAttr(D, AL);
     break;
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index b55f4fd786b5..9b43ee00810b 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -119,6 +119,40 @@ static ResourceClass getResourceClass(RegisterType RT) {
   llvm_unreachable("unexpected RegisterType value");
 }
 
+static Builtin::ID getSpecConstBuiltinId(QualType Type) {
+  const auto *BT = dyn_cast<BuiltinType>(Type);
+  if (!BT) {
+    if (!Type->isEnumeralType())
+      return Builtin::NotBuiltin;
+    return Builtin::BI__builtin_get_spirv_spec_constant_int;
+  }
+
+  switch (BT->getKind()) {
+  case BuiltinType::Bool:
+    return Builtin::BI__builtin_get_spirv_spec_constant_bool;
+  case BuiltinType::Short:
+    return Builtin::BI__builtin_get_spirv_spec_constant_short;
+  case BuiltinType::Int:
+    return Builtin::BI__builtin_get_spirv_spec_constant_int;
+  case BuiltinType::LongLong:
+    return Builtin::BI__builtin_get_spirv_spec_constant_longlong;
+  case BuiltinType::UShort:
+    return Builtin::BI__builtin_get_spirv_spec_constant_ushort;
+  case BuiltinType::UInt:
+    return Builtin::BI__builtin_get_spirv_spec_constant_uint;
+  case BuiltinType::ULongLong:
+    return Builtin::BI__builtin_get_spirv_spec_constant_ulonglong;
+  case BuiltinType::Half:
+    return Builtin::BI__builtin_get_spirv_spec_constant_half;
+  case BuiltinType::Float:
+    return Builtin::BI__builtin_get_spirv_spec_constant_float;
+  case BuiltinType::Double:
+    return Builtin::BI__builtin_get_spirv_spec_constant_double;
+  default:
+    return Builtin::NotBuiltin;
+  }
+}
+
 DeclBindingInfo *ResourceBindings::addDeclBindingInfo(const VarDecl *VD,
                                                       ResourceClass ResClass) {
   assert(getDeclBindingInfo(VD, ResClass) == nullptr &&
@@ -607,6 +641,41 @@ HLSLWaveSizeAttr *SemaHLSL::mergeWaveSizeAttr(Decl *D,
   return Result;
 }
 
+HLSLVkConstantIdAttr *
+SemaHLSL::mergeVkConstantIdAttr(Decl *D, const AttributeCommonInfo &AL,
+                                int Id) {
+
+  auto &TargetInfo = getASTContext().getTargetInfo();
+  if (TargetInfo.getTriple().getArch() != llvm::Triple::spirv) {
+    Diag(AL.getLoc(), diag::warn_attribute_ignored) << AL;
+    return nullptr;
+  }
+
+  auto *VD = cast<VarDecl>(D);
+
+  if (getSpecConstBuiltinId(VD->getType()) == Builtin::NotBuiltin) {
+    Diag(VD->getLocation(), diag::err_specialization_const);
+    return nullptr;
+  }
+
+  if (!VD->getType().isConstQualified()) {
+    Diag(VD->getLocation(), diag::err_specialization_const);
+    return nullptr;
+  }
+
+  if (HLSLVkConstantIdAttr *CI = D->getAttr<HLSLVkConstantIdAttr>()) {
+    if (CI->getId() != Id) {
+      Diag(CI->getLocation(), diag::err_hlsl_attribute_param_mismatch) << AL;
+      Diag(AL.getLoc(), diag::note_conflicting_attribute);
+    }
+    return nullptr;
+  }
+
+  HLSLVkConstantIdAttr *Result =
+      ::new (getASTContext()) HLSLVkConstantIdAttr(getASTContext(), AL, Id);
+  return Result;
+}
+
 HLSLShaderAttr *
 SemaHLSL::mergeShaderAttr(Decl *D, const AttributeCommonInfo &AL,
                           llvm::Triple::EnvironmentType ShaderType) {
@@ -1157,6 +1226,15 @@ void SemaHLSL::handleVkExtBuiltinInputAttr(Decl *D, const ParsedAttr &AL) {
                  HLSLVkExtBuiltinInputAttr(getASTContext(), AL, ID));
 }
 
+void SemaHLSL::handleVkConstantIdAttr(Decl *D, const ParsedAttr &AL) {
+  uint32_t Id;
+  if (!SemaRef.checkUInt32Argument(AL, AL.getArgAsExpr(0), Id))
+    return;
+  HLSLVkConstantIdAttr *NewAttr = mergeVkConstantIdAttr(D, AL, Id);
+  if (NewAttr)
+    D->addAttr(NewAttr);
+}
+
 bool SemaHLSL::diagnoseInputIDType(QualType T, const ParsedAttr &AL) {
   const auto *VT = T->getAs<VectorType>();
 
@@ -3206,6 +3284,7 @@ static bool IsDefaultBufferConstantDecl(VarDecl *VD) {
   return VD->getDeclContext()->isTranslationUnit() &&
          QT.getAddressSpace() == LangAS::Default &&
          VD->getStorageClass() != SC_Static &&
+         !VD->hasAttr<HLSLVkConstantIdAttr>() &&
          !isInvalidConstantBufferLeafElementType(QT.getTypePtr());
 }
 
@@ -3273,7 +3352,8 @@ void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) {
     const Type *VarType = VD->getType().getTypePtr();
     while (VarType->isArrayType())
       VarType = VarType->getArrayElementTypeNoTypeQual();
-    if (VarType->isHLSLResourceRecord()) {
+    if (VarType->isHLSLResourceRecord() ||
+        VD->hasAttr<HLSLVkConstantIdAttr>()) {
       // Make the variable for resources static. The global externally visible
       // storage is accessed through the handle, which is a member. The variable
       // itself is not externally visible.
@@ -3696,3 +3776,41 @@ bool SemaHLSL::transformInitList(const InitializedEntity &Entity,
     Init->updateInit(Ctx, I, NewInit->getInit(I));
   return true;
 }
+
+bool SemaHLSL::handleInitialization(VarDecl *VDecl, Expr *&Init) {
+  const HLSLVkConstantIdAttr *ConstIdAttr =
+      VDecl->getAttr<HLSLVkConstantIdAttr>();
+  if (!ConstIdAttr)
+    return true;
+
+  ASTContext &Context = SemaRef.getASTContext();
+
+  APValue InitValue;
+  if (!Init->isCXX11ConstantExpr(Context, &InitValue)) {
+    Diag(VDecl->getLocation(), diag::err_specialization_const);
+    VDecl->setInvalidDecl();
+    return false;
+  }
+
+  Builtin::ID BID = getSpecConstBuiltinId(VDecl->getType());
+
+  // Argument 1: The ID from the attribute
+  int ConstantID = ConstIdAttr->getId();
+  llvm::APInt IDVal(Context.getIntWidth(Context.IntTy), ConstantID);
+  Expr *IdExpr = IntegerLiteral::Create(Context, IDVal, Context.IntTy,
+                                        ConstIdAttr->getLocation());
+
+  SmallVector<Expr *, 2> Args = {IdExpr, Init};
+  Expr *C = SemaRef.BuildBuiltinCallExpr(Init->getExprLoc(), BID, Args);
+  if (C->getType()->getCanonicalTypeUnqualified() !=
+      VDecl->getType()->getCanonicalTypeUnqualified()) {
+    C = SemaRef
+            .BuildCStyleCastExpr(SourceLocation(),
+                                 Context.getTrivialTypeSourceInfo(
+                                     Init->getType(), Init->getExprLoc()),
+                                 SourceLocation(), C)
+            .get();
+  }
+  Init = C;
+  return true;
+}
diff --git a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
new file mode 100644
index 000000000000..c0955c1ea7b4
--- /dev/null
+++ b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
@@ -0,0 +1,130 @@
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s
+
+// CHECK: VarDecl {{.*}} bool_const 'const hlsl_private bool' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'bool'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'bool (*)(unsigned int, bool) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'bool (unsigned int, bool) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_bool' 'bool (unsigned int, bool) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 1
+// CHECK-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+[[vk::constant_id(1)]]
+const bool bool_const = true;
+
+// CHECK: VarDecl {{.*}} short_const 'const hlsl_private short' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'short'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'short (*)(unsigned int, short) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'short (unsigned int, short) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_short' 'short (unsigned int, short) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'short' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4
+[[vk::constant_id(2)]]
+const short short_const = 4;
+
+// CHECK: VarDecl {{.*}} int_const 'const hlsl_private int' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int (*)(unsigned int, int) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int (unsigned int, int) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_int' 'int (unsigned int, int) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 5
+[[vk::constant_id(3)]]
+const int int_const = 5;
+
+// CHECK: VarDecl {{.*}} long_const 'const hlsl_private long long' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'long long'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'long long (*)(unsigned int, long long) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'long long (unsigned int, long long) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_longlong' 'long long (unsigned int, long long) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'long long' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8
+[[vk::constant_id(4)]]
+const long long long_const = 8;
+
+// CHECK: VarDecl {{.*}} ushort_const 'const hlsl_private unsigned short' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'unsigned short'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned short (*)(unsigned int, unsigned short) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned short (unsigned int, unsigned short) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_ushort' 'unsigned short (unsigned int, unsigned short) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 5
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned short' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 10
+[[vk::constant_id(5)]]
+const unsigned short ushort_const = 10;
+
+// CHECK: VarDecl {{.*}} uint_const 'const hlsl_private unsigned int' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'unsigned int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int (*)(unsigned int, unsigned int) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int (unsigned int, unsigned int) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_uint' 'unsigned int (unsigned int, unsigned int) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 12
+[[vk::constant_id(6)]]
+const unsigned int uint_const = 12;
+
+
+// CHECK: VarDecl {{.*}} ulong_const 'const hlsl_private unsigned long long' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'unsigned long long'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned long long (*)(unsigned int, unsigned long long) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned long long (unsigned int, unsigned long long) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_ulonglong' 'unsigned long long (unsigned int, unsigned long long) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 7
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned long long' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 25
+[[vk::constant_id(7)]]
+const unsigned long long ulong_const = 25;
+
+// CHECK: VarDecl {{.*}} half_const 'const hlsl_private half' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'half'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half (*)(unsigned int, half) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'half (unsigned int, half) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_half' 'half (unsigned int, half) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half' <FloatingCast>
+// CHECK-NEXT: FloatingLiteral {{.*}} 'float' 4.040000e+01
+[[vk::constant_id(8)]]
+const half half_const = 40.4;
+
+// CHECK: VarDecl {{.*}} float_const 'const hlsl_private float' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'float'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float (*)(unsigned int, float) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float (unsigned int, float) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_float' 'float (unsigned int, float) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 50
+[[vk::constant_id(8)]]
+const float float_const = 50;
+
+// CHECK: VarDecl {{.*}} double_const 'const hlsl_private double' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'double'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'double (*)(unsigned int, double) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'double (unsigned int, double) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_double' 'double (unsigned int, double) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 9
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'double' <IntegralToFloating>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 100
+[[vk::constant_id(9)]]
+const double double_const = 100;
+
+// CHECK: VarDecl {{.*}} enum_const 'const hlsl_private E' static cinit
+// CHECK-NEXT: CStyleCastExpr {{.*}} 'E' <IntegralCast>
+// CHECK-NEXT: CallExpr {{.*}} 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int (*)(unsigned int, int) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int (unsigned int, int) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_int' 'int (unsigned int, int) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 10 
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <IntegralCast>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'E' EnumConstant {{.*}} 'e2' 'E' 
+enum E {
+    e0 = 10,
+    e1 = 20,
+    e2 = 30
+};
+
+[[vk::constant_id(10)]]
+const E enum_const = e2;
+
+// CHECK-NOT: CXXRecordDecl {{.*}} implicit struct __cblayout_$Globals definition
diff --git a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.alignment.hlsl b/clang/test/CodeGenHLSL/vk-features/SpirvType.alignment.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/inline-spirv/SpirvType.alignment.hlsl
rename to clang/test/CodeGenHLSL/vk-features/SpirvType.alignment.hlsl
diff --git a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl b/clang/test/CodeGenHLSL/vk-features/SpirvType.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
rename to clang/test/CodeGenHLSL/vk-features/SpirvType.hlsl
diff --git a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
new file mode 100644
index 000000000000..cbc1fa61eae2
--- /dev/null
+++ b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
@@ -0,0 +1,210 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s
+
+[[vk::constant_id(1)]]
+const bool bool_const = true;
+
+[[vk::constant_id(1)]]
+const short short_const = 4;
+
+[[vk::constant_id(3)]]
+const int int_const = 5;
+
+[[vk::constant_id(4)]]
+const long long long_const = 8;
+
+[[vk::constant_id(5)]]
+const unsigned short ushort_const = 10;
+
+[[vk::constant_id(6)]]
+const unsigned int uint_const = 12;
+
+[[vk::constant_id(7)]]
+const unsigned long long ulong_const = 25;
+
+[[vk::constant_id(8)]]
+const half half_const = 40.4;
+
+[[vk::constant_id(8)]]
+const float float_const = 50.5;
+
+[[vk::constant_id(9)]]
+const double double_const = 100.2;
+
+enum E {
+    e0 = 10,
+    e1 = 20,
+    e2 = 30
+};
+
+[[vk::constant_id(10)]]
+const E enum_const = e2;
+
+[numthreads(1,1,1)]
+void main() {
+    bool b = bool_const;
+    short s = short_const;
+    int i = int_const;
+    long long l = long_const;
+    unsigned short us = ushort_const;
+    unsigned int ui = uint_const;
+    unsigned long long ul = ulong_const;
+    half h = half_const;
+    float f = float_const;
+    double d = double_const;
+    E e = enum_const;
+}
+//.
+// CHECK: @_ZL10bool_const = internal addrspace(10) global i32 0, align 4
+// CHECK: @_ZL11short_const = internal addrspace(10) global i16 0, align 2
+// CHECK: @_ZL9int_const = internal addrspace(10) global i32 0, align 4
+// CHECK: @_ZL10long_const = internal addrspace(10) global i64 0, align 8
+// CHECK: @_ZL12ushort_const = internal addrspace(10) global i16 0, align 2
+// CHECK: @_ZL10uint_const = internal addrspace(10) global i32 0, align 4
+// CHECK: @_ZL11ulong_const = internal addrspace(10) global i64 0, align 8
+// CHECK: @_ZL10half_const = internal addrspace(10) global float 0.000000e+00, align 4
+// CHECK: @_ZL11float_const = internal addrspace(10) global float 0.000000e+00, align 4
+// CHECK: @_ZL12double_const = internal addrspace(10) global double 0.000000e+00, align 8
+// CHECK: @_ZL10enum_const = internal addrspace(10) global i32 0, align 4
+//.
+// CHECK-LABEL: define internal spir_func void @_Z4mainv(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[S:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[L:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[US:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[UI:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[UL:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[H:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[F:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[D:%.*]] = alloca double, align 8
+// CHECK-NEXT:    [[E:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(10) @_ZL10bool_const, align 4
+// CHECK-NEXT:    [[LOADEDV:%.*]] = trunc i32 [[TMP1]] to i1
+// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i32
+// CHECK-NEXT:    store i32 [[STOREDV]], ptr [[B]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(10) @_ZL11short_const, align 2
+// CHECK-NEXT:    store i16 [[TMP2]], ptr [[S]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(10) @_ZL9int_const, align 4
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr addrspace(10) @_ZL10long_const, align 8
+// CHECK-NEXT:    store i64 [[TMP4]], ptr [[L]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(10) @_ZL12ushort_const, align 2
+// CHECK-NEXT:    store i16 [[TMP5]], ptr [[US]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(10) @_ZL10uint_const, align 4
+// CHECK-NEXT:    store i32 [[TMP6]], ptr [[UI]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(10) @_ZL11ulong_const, align 8
+// CHECK-NEXT:    store i64 [[TMP7]], ptr [[UL]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr addrspace(10) @_ZL10half_const, align 4
+// CHECK-NEXT:    store float [[TMP8]], ptr [[H]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(10) @_ZL11float_const, align 4
+// CHECK-NEXT:    store float [[TMP9]], ptr [[F]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr addrspace(10) @_ZL12double_const, align 8
+// CHECK-NEXT:    store double [[TMP10]], ptr [[D]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(10) @_ZL10enum_const, align 4
+// CHECK-NEXT:    store i32 [[TMP11]], ptr [[E]], align 4
+// CHECK-NEXT:    ret void
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init(
+// CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i1 @_Z20__spirv_SpecConstantib(i32 1, i1 true)
+// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP1]] to i32
+// CHECK-NEXT:    store i32 [[STOREDV]], ptr addrspace(10) @_ZL10bool_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.1(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @_Z20__spirv_SpecConstantis(i32 1, i16 4)
+// CHECK-NEXT:    store i16 [[TMP1]], ptr addrspace(10) @_ZL11short_const, align 2
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.2(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @_Z20__spirv_SpecConstantii(i32 3, i32 5)
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(10) @_ZL9int_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.3(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @_Z20__spirv_SpecConstantix(i32 4, i64 8)
+// CHECK-NEXT:    store i64 [[TMP1]], ptr addrspace(10) @_ZL10long_const, align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.4(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @_Z20__spirv_SpecConstantit(i32 5, i16 10)
+// CHECK-NEXT:    store i16 [[TMP1]], ptr addrspace(10) @_ZL12ushort_const, align 2
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.5(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @_Z20__spirv_SpecConstantij(i32 6, i32 12)
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(10) @_ZL10uint_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.6(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @_Z20__spirv_SpecConstantiy(i32 7, i64 25)
+// CHECK-NEXT:    store i64 [[TMP1]], ptr addrspace(10) @_ZL11ulong_const, align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.7(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz arcp afn float @_Z20__spirv_SpecConstantiDh(i32 8, float 0x4044333340000000)
+// CHECK-NEXT:    store float [[TMP1]], ptr addrspace(10) @_ZL10half_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.8(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz arcp afn float @_Z20__spirv_SpecConstantif(i32 8, float 5.050000e+01)
+// CHECK-NEXT:    store float [[TMP1]], ptr addrspace(10) @_ZL11float_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.9(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz arcp afn double @_Z20__spirv_SpecConstantid(i32 9, double 0x40590CCCC0000000)
+// CHECK-NEXT:    store double [[TMP1]], ptr addrspace(10) @_ZL12double_const, align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.10(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @_Z20__spirv_SpecConstantii(i32 10, i32 30)
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(10) @_ZL10enum_const, align 4
+// CHECK-NEXT:    ret void
diff --git a/clang/test/SemaHLSL/vk.spec-constant.error.hlsl b/clang/test/SemaHLSL/vk.spec-constant.error.hlsl
new file mode 100644
index 000000000000..24873d272a54
--- /dev/null
+++ b/clang/test/SemaHLSL/vk.spec-constant.error.hlsl
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan1.3-compute -verify %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.8-compute -verify %s
+
+#ifndef __spirv__
+// expected-warning@+2{{'constant_id' attribute ignored}}
+#endif
+[[vk::constant_id(0)]]
+const bool sc0 = true;
+
+#ifdef __spirv__
+// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
+[[vk::constant_id(1)]]
+const bool sc1 = sc0; // error
+
+// expected-warning@+1{{'constant_id' attribute only applies to external global variables}}
+[[vk::constant_id(2)]]
+static const bool sc2 = false; // error
+
+// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
+[[vk::constant_id(3)]]
+const bool sc3; // error
+
+// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
+[[vk::constant_id(4)]]
+bool sc4 = false; // error
+
+// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
+[[vk::constant_id(5)]]
+const int2 sc5 = {0,0}; // error
+
+[numthreads(1,1,1)]
+void main() {
+  // expected-warning@+1{{'constant_id' attribute only applies to external global variables}}
+  [[vk::constant_id(6)]]
+  const bool sc6 = false; // error
+}
+#endif

From d3441f7348203cc2a1d9c44fd24c1113954aa2b2 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Wed, 18 Jun 2025 11:45:20 +0100
Subject: [PATCH 0790/1322] [LV] Change getSmallBestKnownTC to return an
 ElementCount (NFC) (#141793)

This is prep work for enabling better UF calculations when using vscale
based VFs to vectorise loops with vscale based tripcounts.

NOTE: NFC because All uses remain fixed-length until a following PR
changes LoopVectorize's version of getSmallConstantTripCount().
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 50 +++++++++++--------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 16d48b06dce4..2f4416d2782e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -419,6 +419,13 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
 }
 
+/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
+/// ElementCount to include loops whose trip count is a function of vscale.
+static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
+                                              const Loop *L) {
+  return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
+}
+
 /// Returns "best known" trip count, which is either a valid positive trip count
 /// or std::nullopt when an estimate cannot be made (including when the trip
 /// count would overflow), for the specified loop \p L as defined by the
@@ -427,24 +434,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
 ///   2) Returns expected trip count according to profile data if any.
 ///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
 ///   4) Returns std::nullopt if all of the above failed.
-static std::optional<unsigned>
+static std::optional<ElementCount>
 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
                     bool CanUseConstantMax = true) {
   // Check if exact trip count is known.
-  if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
+  if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
     return ExpectedTC;
 
   // Check if there is an expected trip count available from profile data.
   if (LoopVectorizeWithBlockFrequency)
     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
-      return *EstimatedTC;
+      return ElementCount::getFixed(*EstimatedTC);
 
   if (!CanUseConstantMax)
     return std::nullopt;
 
   // Check if upper bound estimate is known.
   if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
-    return ExpectedTC;
+    return ElementCount::getFixed(ExpectedTC);
 
   return std::nullopt;
 }
@@ -1960,7 +1967,8 @@ public:
           // Get the best known TC estimate.
           if (auto EstimatedTC = getSmallBestKnownTC(
                   PSE, OuterLoop, /* CanUseConstantMax = */ false))
-            BestTripCount = *EstimatedTC;
+            if (EstimatedTC->isFixed())
+              BestTripCount = EstimatedTC->getFixedValue();
 
           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
 
@@ -3750,12 +3758,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   }
 
   ScalarEvolution *SE = PSE.getSE();
-  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
+  ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-  if (TC != MaxTC)
+  if (TC != ElementCount::getFixed(MaxTC))
     LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
-  if (TC == 1) {
+  if (TC.isScalar()) {
     reportVectorizationFailure("Single iteration (non) loop",
         "loop trip count is one, irrelevant for vectorization",
         "SingleIterationLoop", ORE, TheLoop);
@@ -3869,7 +3877,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   }
 
   auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
-  if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+  if (ExpectedTC && ExpectedTC->isFixed() &&
+      ExpectedTC->getFixedValue() <=
+          TTI.getMinTripCountTailFoldingThreshold()) {
     if (MaxPowerOf2RuntimeVF > 0u) {
       // If we have a low-trip-count, and the fixed-width VF is known to divide
       // the trip count but the scalable factor does not, use the fixed-width
@@ -3927,7 +3937,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return FixedScalableVFPair::getNone();
   }
 
-  if (TC == 0) {
+  if (TC.isZero()) {
     reportVectorizationFailure(
         "unable to calculate the loop count due to complex control flow",
         "UnknownLoopCountComplexCFG", ORE, TheLoop);
@@ -4816,13 +4826,13 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
     // At least one iteration must be scalar when this constraint holds. So the
     // maximum available iterations for interleaving is one less.
     unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
-                               ? (*BestKnownTC) - 1
-                               : *BestKnownTC;
+                               ? BestKnownTC->getFixedValue() - 1
+                               : BestKnownTC->getFixedValue();
 
     unsigned InterleaveCountLB = bit_floor(std::max(
         1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
 
-    if (PSE.getSE()->getSmallConstantTripCount(TheLoop) > 0) {
+    if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
       // If the best known trip count is exact, we select between two
       // prospective ICs, where
       //
@@ -5182,8 +5192,8 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
   // costs of comparison and induction instructions, as they'll get simplified
   // away.
   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
-  auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
+  auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
+  if (TC == VF && !foldTailByMasking())
     addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
                                          ValuesToIgnoreForVF);
 
@@ -6878,8 +6888,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     // simplified away.
     // TODO: Remove this code after stepping away from the legacy cost model and
     // adding code to simplify VPlans before calculating their costs.
-    auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
-    if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
+    auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
+    if (TC == VF && !CM.foldTailByMasking())
       addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
                                            CostCtx.SkipCostComputation);
 
@@ -9647,8 +9657,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
   // Skip vectorization if the expected trip count is less than the minimum
   // required trip count.
   if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
-    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
-                                VF.MinProfitableTripCount)) {
+    if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                            "trip count < minimum profitable VF ("
                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
@@ -10018,7 +10027,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
   auto ExpectedTC = getSmallBestKnownTC(PSE, L);
-  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
+  if (ExpectedTC && ExpectedTC->isFixed() &&
+      ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                       << "This loop is worth vectorizing only if no scalar "
                       << "iteration overheads are incurred.");

From b7ef5dbac91f9ccaf335ae4dd998e5783523f24e Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Wed, 18 Jun 2025 11:53:27 +0100
Subject: [PATCH 0791/1322] [LLVM][ComplexDeinterleaving] Update splat
 identification to include vector ConstantInt/FP. (#144516)

---
 llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp                 | 3 +++
 .../CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll   | 1 +
 2 files changed, 4 insertions(+)

diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index ae12423d827d..8855740f0cc8 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -2005,6 +2005,9 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
     if (isa<ConstantDataVector>(V))
       return true;
 
+    if (isa<ConstantInt>(V) || isa<ConstantFP>(V))
+      return isa<VectorType>(V->getType());
+
     VectorType *VTy;
     ArrayRef<int> Mask;
     // Splats are represented differently depending on whether the repeated
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
index 661531361315..e7a00fc90e31 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+; RUN: llc -use-constant-int-for-scalable-splat -use-constant-fp-for-scalable-splat < %s --mattr=+sve -o - | FileCheck %s
 
 target triple = "aarch64"
 

From b5967264b0fbfd502b3a7edec27409e966fb68be Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 19:56:26 +0900
Subject: [PATCH 0792/1322] CodeGen: Move ABI option enums to support (#142912)

Move these out of TargetOptions and into Support to avoid
the dependency on Target. There are similar ABI options
already in Support/CodeGen.h.
---
 llvm/include/llvm/Support/CodeGen.h      | 16 ++++++++++++++++
 llvm/include/llvm/Target/TargetOptions.h | 17 +----------------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h
index 48745f7f4d2a..90733b50385a 100644
--- a/llvm/include/llvm/Support/CodeGen.h
+++ b/llvm/include/llvm/Support/CodeGen.h
@@ -50,6 +50,22 @@ namespace llvm {
     };
   }
 
+  namespace FloatABI {
+  enum ABIType {
+    Default, // Target-specific (either soft or hard depending on triple, etc).
+    Soft,    // Soft float.
+    Hard     // Hard float.
+  };
+  }
+
+  enum class EABI {
+    Unknown,
+    Default, // Default means not specified
+    EABI4,   // Target-specific (either 4, 5 or gnu depending on triple).
+    EABI5,
+    GNU
+  };
+
   /// Code generation optimization level.
   enum class CodeGenOptLevel {
     None = 0,      ///< -O0
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index a7c46921255b..f420798aa46f 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Compiler.h"
 
 #include <memory>
@@ -25,14 +26,6 @@ struct fltSemantics;
 class MachineFunction;
 class MemoryBuffer;
 
-namespace FloatABI {
-enum ABIType {
-  Default, // Target-specific (either soft or hard depending on triple, etc).
-  Soft,    // Soft float.
-  Hard     // Hard float.
-};
-}
-
 namespace FPOpFusion {
 enum FPOpFusionMode {
   Fast,     // Enable fusion of FP ops wherever it's profitable.
@@ -71,14 +64,6 @@ enum class BasicBlockSection {
   None    // Do not use Basic Block Sections.
 };
 
-enum class EABI {
-  Unknown,
-  Default, // Default means not specified
-  EABI4,   // Target-specific (either 4, 5 or gnu depending on triple).
-  EABI5,
-  GNU
-};
-
 /// Identify a debugger for "tuning" the debug info.
 ///
 /// The "debugger tuning" concept allows us to present a more intuitive

From 4aca3dc48b0919b81bd86302b141f29869266c45 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Wed, 18 Jun 2025 13:04:24 +0200
Subject: [PATCH 0793/1322] Reland: [GlobalISel] prevent G_UNMERGE_VALUES for
 vectors with different elements (#144661)

This commit prevents building a G_UNMERGE_VALUES instruction with
different source and destination vector elements in
`LegalizationArtifactCombiner::ArtifactValueFinder::tryCombineMergeLike()`,
e.g.:
`%1:_(<2 x s8>), %2:_(<2 x s8>) = G_UNMERGE_VALUES %0:_(<2 x s16>)`

This LLVM defect was identified via the AMD Fuzzing project.
---
 .../GlobalISel/LegalizationArtifactCombiner.h |  5 ++-
 .../AMDGPU/GlobalISel/insertelement.ll        | 44 +++++++++++++++++++
 ...ffer-fat-pointers-contents-legalization.ll | 10 ++---
 3 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 22f6a5fde546..8f560c42082f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -997,6 +997,7 @@ public:
 
       // Recognize UnmergeSrc that can be unmerged to DstTy directly.
       // Types have to be either both vector or both non-vector types.
+      // In case of vector types, the scalar elements need to match.
       // Merge-like opcodes are combined one at the time. First one creates new
       // unmerge, following should use the same unmerge (builder performs CSE).
       //
@@ -1005,7 +1006,9 @@ public:
       // %AnotherDst:_(DstTy) = G_merge_like_opcode %2:_(EltTy), %3
       //
       // %Dst:_(DstTy), %AnotherDst = G_UNMERGE_VALUES %UnmergeSrc
-      if ((DstTy.isVector() == UnmergeSrcTy.isVector()) &&
+      if (((!DstTy.isVector() && !UnmergeSrcTy.isVector()) ||
+           (DstTy.isVector() && UnmergeSrcTy.isVector() &&
+            DstTy.getScalarType() == UnmergeSrcTy.getScalarType())) &&
           (Elt0UnmergeIdx % NumMIElts == 0) &&
           getCoverTy(UnmergeSrcTy, DstTy) == UnmergeSrcTy) {
         if (!isSequenceFromUnmerge(MI, 0, Unmerge, Elt0UnmergeIdx, NumMIElts,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 8134eb3ca2af..51d0b225b2a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -6506,3 +6506,47 @@ entry:
   %insert = insertelement <5 x double> %vec, double %val, i32 %idx
   ret <5 x double> %insert
 }
+
+; Found by fuzzer, reduced with llvm-reduce.
+define void @insert_very_small_from_very_large(<32 x i16> %L3, ptr %ptr) {
+; GPRIDX-LABEL: insert_very_small_from_very_large:
+; GPRIDX:       ; %bb.0: ; %bb
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GPRIDX-NEXT:    v_and_b32_e32 v0, 1, v0
+; GPRIDX-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GPRIDX-NEXT:    v_and_b32_e32 v0, 3, v0
+; GPRIDX-NEXT:    flat_store_byte v[16:17], v0
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: insert_very_small_from_very_large:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX10-NEXT:    flat_store_byte v[16:17], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: insert_very_small_from_very_large:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b16 v0.l, 1, v0.l
+; GFX11-NEXT:    v_and_b16 v0.l, v0.l, 1
+; GFX11-NEXT:    v_lshlrev_b16 v0.l, 1, v0.l
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    flat_store_b8 v[16:17], v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %a = bitcast <32 x i16> %L3 to i512
+  %b = trunc i512 %a to i8
+  %c = trunc i8 %b to i2
+  %d = bitcast i2 %c to <2 x i1>
+  %insert = insertelement <2 x i1> %d, i1 false, i32 0
+  store <2 x i1> %insert, ptr %ptr, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index fdc1dd6cce8e..53b2542cf9a7 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -2166,14 +2166,14 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v6i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <6 x i8>, ptr addrspace(7) %p
@@ -3630,10 +3630,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load volatile <6 x i8>, ptr addrspace(7) %p

From c3efe7d64cebcd8679bec3ba7ff8154f8b0a1fa4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 18 Jun 2025 14:12:14 +0300
Subject: [PATCH 0794/1322] [libcxx] [test] Fix odr_signature tests with
 optimizations enabled (#144317)

If optimization is enabled, the inline `f()` function actually gets
inlined, meaning that the functions `tu1()` and `tu2()` trivially return
1 and 2, instead of actually referencing the potentially linker
deduplicated function `f()`, which is what the test tries to test.

Therefore, this test previously actually failed to test what it was
supposed to test, if optimization was enabled.

Mark the inline functions with `TEST_NOINLINE` to make sure that they
don't get inlined even with optimizations enabled.

Also update the TODO comments to explain why we have an XFAIL for msvc
mode here.

This avoids these tests unexpectedly passing if building in msvc mode,
with optimizations enabled
(`-DLIBCXX_TEST_PARAMS="optimization=speed"`).
---
 libcxx/test/libcxx/odr_signature.exceptions.sh.cpp | 10 +++++++---
 libcxx/test/libcxx/odr_signature.hardening.sh.cpp  | 14 +++++++++-----
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/libcxx/test/libcxx/odr_signature.exceptions.sh.cpp b/libcxx/test/libcxx/odr_signature.exceptions.sh.cpp
index 6bf60b5e82d3..c0ba48eb245d 100644
--- a/libcxx/test/libcxx/odr_signature.exceptions.sh.cpp
+++ b/libcxx/test/libcxx/odr_signature.exceptions.sh.cpp
@@ -6,9 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO: Investigate
+// ABI tags have no effect in MSVC mode.
 // XFAIL: msvc
 
+// XFAIL: FROZEN-CXX03-HEADERS-FIXME
+
 // Test that we encode whether exceptions are supported in an ABI tag to avoid
 // ODR violations when linking TUs that have different values for it.
 
@@ -18,17 +20,19 @@
 // RUN: %{cxx} %t.tu1.o %t.tu2.o %t.main.o %{flags} %{link_flags} -o %t.exe
 // RUN: %{exec} %t.exe
 
+#include "test_macros.h"
+
 // -fno-exceptions
 #ifdef TU1
 #  include <__config>
-_LIBCPP_HIDE_FROM_ABI inline int f() { return 1; }
+_LIBCPP_HIDE_FROM_ABI TEST_NOINLINE inline int f() { return 1; }
 int tu1() { return f(); }
 #endif // TU1
 
 // -fexceptions
 #ifdef TU2
 #  include <__config>
-_LIBCPP_HIDE_FROM_ABI inline int f() { return 2; }
+_LIBCPP_HIDE_FROM_ABI TEST_NOINLINE inline int f() { return 2; }
 int tu2() { return f(); }
 #endif // TU2
 
diff --git a/libcxx/test/libcxx/odr_signature.hardening.sh.cpp b/libcxx/test/libcxx/odr_signature.hardening.sh.cpp
index 0dc280bf2818..8daf3f3fd046 100644
--- a/libcxx/test/libcxx/odr_signature.hardening.sh.cpp
+++ b/libcxx/test/libcxx/odr_signature.hardening.sh.cpp
@@ -6,9 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO: Investigate
+// ABI tags have no effect in MSVC mode.
 // XFAIL: msvc
 
+// XFAIL: FROZEN-CXX03-HEADERS-FIXME
+
 // Test that we encode the hardening mode in an ABI tag to avoid ODR violations
 // when linking TUs that have different values for it.
 
@@ -21,31 +23,33 @@
 // RUN: %{cxx} %t.tu1.o %t.tu2.o %t.tu3.o %t.tu4.o %t.main.o %{flags} %{link_flags} -o %t.exe
 // RUN: %{exec} %t.exe
 
+#include "test_macros.h"
+
 // fast hardening mode
 #ifdef TU1
 #  include <__config>
-_LIBCPP_HIDE_FROM_ABI inline int f() { return 1; }
+_LIBCPP_HIDE_FROM_ABI TEST_NOINLINE inline int f() { return 1; }
 int tu1() { return f(); }
 #endif // TU1
 
 // extensive hardening mode
 #ifdef TU2
 #  include <__config>
-_LIBCPP_HIDE_FROM_ABI inline int f() { return 2; }
+_LIBCPP_HIDE_FROM_ABI TEST_NOINLINE inline int f() { return 2; }
 int tu2() { return f(); }
 #endif // TU2
 
 // debug hardening mode
 #ifdef TU3
 #  include <__config>
-_LIBCPP_HIDE_FROM_ABI inline int f() { return 3; }
+_LIBCPP_HIDE_FROM_ABI TEST_NOINLINE inline int f() { return 3; }
 int tu3() { return f(); }
 #endif // TU3
 
 // No hardening
 #ifdef TU4
 #  include <__config>
-_LIBCPP_HIDE_FROM_ABI inline int f() { return 4; }
+_LIBCPP_HIDE_FROM_ABI TEST_NOINLINE inline int f() { return 4; }
 int tu4() { return f(); }
 #endif // TU4
 

From 66d6964a55014e7fabb7c80fbba19d2145262b6b Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Wed, 18 Jun 2025 16:50:48 +0530
Subject: [PATCH 0795/1322] Fix tests failing on fuchsia clang x86_64 builders
 (#144655)

Fuchsia sets CLANG_DEFAULT_UNWINDLIB to libunwind. As a result, when
rtlib is set to libgcc and unwindlib is not explicitly specified, tests
using Fuchsia as the default platform will fail. To address this, the
affected tests are now xfailed

This change fixes the following tests introduced in
https://github.com/llvm/llvm-project/commit/45ea46c44636094e9fcdbbeabfd11f9d0fad5e38:

clang/test/Driver/aarch64-toolchain-extra.c
clang/test/Driver/arm-toolchain-extra.c
clang/test/Driver/aarch64-toolchain.c
clang/test/Driver/arm-toolchain.c
---
 clang/test/Driver/aarch64-toolchain-extra.c | 1 +
 clang/test/Driver/aarch64-toolchain.c       | 1 +
 clang/test/Driver/arm-toolchain-extra.c     | 1 +
 clang/test/Driver/arm-toolchain.c           | 1 +
 4 files changed, 4 insertions(+)

diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
index 2610e962bd69..eb8c741ae1ad 100644
--- a/clang/test/Driver/aarch64-toolchain-extra.c
+++ b/clang/test/Driver/aarch64-toolchain-extra.c
@@ -3,6 +3,7 @@
 // The tests here are similar to those in aarch64-toolchain.c, however
 // these tests need to create symlinks to test directory trees in order to
 // set up the environment and therefore shell support is required.
+// XFAIL: target={{.*}}-fuchsia{{.*}}
 // REQUIRES: shell
 // UNSUPPORTED: system-windows
 
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
index 7f2c01d928e4..74841eec598b 100644
--- a/clang/test/Driver/aarch64-toolchain.c
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-fuchsia{{.*}}
 // UNSUPPORTED: system-windows
 
 // RUN: %clang -### %s -fuse-ld= \
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
index 114de0a8154a..67206818f211 100644
--- a/clang/test/Driver/arm-toolchain-extra.c
+++ b/clang/test/Driver/arm-toolchain-extra.c
@@ -3,6 +3,7 @@
 // The tests here are similar to those in arm-toolchain.c, however
 // these tests need to create symlinks to test directory trees in order to
 // set up the environment and therefore shell support is required.
+// XFAIL: target={{.*}}-fuchsia{{.*}}
 // REQUIRES: shell
 // UNSUPPORTED: system-windows
 
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
index 2e38461fb7a3..56a0e0de7ba7 100644
--- a/clang/test/Driver/arm-toolchain.c
+++ b/clang/test/Driver/arm-toolchain.c
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-fuchsia{{.*}}
 // UNSUPPORTED: system-windows
 
 // RUN: %clang -### %s -fuse-ld= \

From 8a469da8b2342dd9104faf25deeddd8ad66ca6a6 Mon Sep 17 00:00:00 2001
From: "Oleksandr \"Alex\" Zinenko" <git@ozinenko.com>
Date: Wed, 18 Jun 2025 13:32:46 +0200
Subject: [PATCH 0796/1322] [mlir] remove unnecessary atomic_rmw expansions
 (#144515)

The expansion of `memref.atomic_rmw` into a `memref.generic_atomic_rmw`
for floating-point min/max operations is no longer necessary as those
are now supported by the LLVM dialect and LLVM IR.

Furthermore, combining this expansion with direct lowering of
`generic_atomic_rmw` could leads to invalid LLVM dialect IR with
`cmpxchg` operating on floating-point values that it does not support.
---
 .../Dialect/MemRef/Transforms/ExpandOps.cpp   | 56 +------------------
 mlir/test/Dialect/MemRef/expand-ops.mlir      | 38 +------------
 2 files changed, 5 insertions(+), 89 deletions(-)

diff --git a/mlir/lib/Dialect/MemRef/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/ExpandOps.cpp
index 020aabd9db6d..a617029ce470 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/ExpandOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/ExpandOps.cpp
@@ -1,16 +1,10 @@
-//===- StdExpandDivs.cpp - Code to prepare Std for lowering Divs to LLVM  -===//
+//===- ExpandDivs.cpp - Expansion patterns for MemRef operations ----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// This file Std transformations to expand Divs operation to help for the
-// lowering to LLVM. Currently implemented transformations are Ceil and Floor
-// for Signed Integers.
-//
-//===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 
@@ -33,44 +27,6 @@ using namespace mlir;
 
 namespace {
 
-/// Converts `atomic_rmw` that cannot be lowered to a simple atomic op with
-/// AtomicRMWOpLowering pattern, such as minimum and maximum operations for
-/// floating-point numbers, to `memref.generic_atomic_rmw` with the expanded
-/// code.
-///
-/// %x = atomic_rmw maximumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
-///
-/// will be lowered to
-///
-/// %x = memref.generic_atomic_rmw %F[%i] : memref<10xf32> {
-/// ^bb0(%current: f32):
-///   %1 = arith.maximumf %current, %fval : f32
-///   memref.atomic_yield %1 : f32
-/// }
-struct AtomicRMWOpConverter : public OpRewritePattern<memref::AtomicRMWOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(memref::AtomicRMWOp op,
-                                PatternRewriter &rewriter) const final {
-    auto loc = op.getLoc();
-    auto genericOp = rewriter.create<memref::GenericAtomicRMWOp>(
-        loc, op.getMemref(), op.getIndices());
-    OpBuilder bodyBuilder =
-        OpBuilder::atBlockEnd(genericOp.getBody(), rewriter.getListener());
-
-    Value lhs = genericOp.getCurrentValue();
-    Value rhs = op.getValue();
-
-    Value arithOp =
-        mlir::arith::getReductionOp(op.getKind(), bodyBuilder, loc, lhs, rhs);
-    bodyBuilder.create<memref::AtomicYieldOp>(loc, arithOp);
-
-    rewriter.replaceOp(op, genericOp.getResult());
-    return success();
-  }
-};
-
 /// Converts `memref.reshape` that has a target shape of a statically-known
 /// size to `memref.reinterpret_cast`.
 struct MemRefReshapeOpConverter : public OpRewritePattern<memref::ReshapeOp> {
@@ -139,13 +95,6 @@ struct ExpandOpsPass : public memref::impl::ExpandOpsPassBase<ExpandOpsPass> {
     ConversionTarget target(ctx);
 
     target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect>();
-    target.addDynamicallyLegalOp<memref::AtomicRMWOp>(
-        [](memref::AtomicRMWOp op) {
-          constexpr std::array shouldBeExpandedKinds = {
-              arith::AtomicRMWKind::maximumf, arith::AtomicRMWKind::minimumf,
-              arith::AtomicRMWKind::minnumf, arith::AtomicRMWKind::maxnumf};
-          return !llvm::is_contained(shouldBeExpandedKinds, op.getKind());
-        });
     target.addDynamicallyLegalOp<memref::ReshapeOp>([](memref::ReshapeOp op) {
       return !cast<MemRefType>(op.getShape().getType()).hasStaticShape();
     });
@@ -158,6 +107,5 @@ struct ExpandOpsPass : public memref::impl::ExpandOpsPassBase<ExpandOpsPass> {
 } // namespace
 
 void mlir::memref::populateExpandOpsPatterns(RewritePatternSet &patterns) {
-  patterns.add<AtomicRMWOpConverter, MemRefReshapeOpConverter>(
-      patterns.getContext());
+  patterns.add<MemRefReshapeOpConverter>(patterns.getContext());
 }
diff --git a/mlir/test/Dialect/MemRef/expand-ops.mlir b/mlir/test/Dialect/MemRef/expand-ops.mlir
index 65932b5814a6..fc8db546d918 100644
--- a/mlir/test/Dialect/MemRef/expand-ops.mlir
+++ b/mlir/test/Dialect/MemRef/expand-ops.mlir
@@ -1,42 +1,10 @@
 // RUN: mlir-opt -memref-expand %s -split-input-file | FileCheck %s
 
-// CHECK-LABEL: func @atomic_rmw_to_generic
-// CHECK-SAME: ([[F:%.*]]: memref<10xf32>, [[f:%.*]]: f32, [[i:%.*]]: index)
-func.func @atomic_rmw_to_generic(%F: memref<10xf32>, %f: f32, %i: index) -> f32 {
-  %a = memref.atomic_rmw maximumf %f, %F[%i] : (f32, memref<10xf32>) -> f32
-  %b = memref.atomic_rmw minimumf %f, %F[%i] : (f32, memref<10xf32>) -> f32
-  %c = memref.atomic_rmw maxnumf %f, %F[%i] : (f32, memref<10xf32>) -> f32
-  %d = memref.atomic_rmw minnumf %f, %F[%i] : (f32, memref<10xf32>) -> f32
-  return %a : f32
-}
-// CHECK: [[RESULT:%.*]] = memref.generic_atomic_rmw %arg0[%arg2] : memref<10xf32> {
-// CHECK: ^bb0([[CUR_VAL:%.*]]: f32):
-// CHECK:   [[MAXIMUM:%.*]] = arith.maximumf [[CUR_VAL]], [[f]] : f32
-// CHECK:   memref.atomic_yield [[MAXIMUM]] : f32
-// CHECK: }
-// CHECK: memref.generic_atomic_rmw %arg0[%arg2] : memref<10xf32> {
-// CHECK: ^bb0([[CUR_VAL:%.*]]: f32):
-// CHECK:   [[MINIMUM:%.*]] = arith.minimumf [[CUR_VAL]], [[f]] : f32
-// CHECK:   memref.atomic_yield [[MINIMUM]] : f32
-// CHECK: }
-// CHECK: memref.generic_atomic_rmw %arg0[%arg2] : memref<10xf32> {
-// CHECK: ^bb0([[CUR_VAL:%.*]]: f32):
-// CHECK:   [[MAXNUM:%.*]] = arith.maxnumf [[CUR_VAL]], [[f]] : f32
-// CHECK:   memref.atomic_yield [[MAXNUM]] : f32
-// CHECK: }
-// CHECK: memref.generic_atomic_rmw %arg0[%arg2] : memref<10xf32> {
-// CHECK: ^bb0([[CUR_VAL:%.*]]: f32):
-// CHECK:   [[MINNUM:%.*]] = arith.minnumf [[CUR_VAL]], [[f]] : f32
-// CHECK:   memref.atomic_yield [[MINNUM]] : f32
-// CHECK: }
-// CHECK: return [[RESULT]] : f32
-
-// -----
-
 // CHECK-LABEL: func @atomic_rmw_no_conversion
-func.func @atomic_rmw_no_conversion(%F: memref<10xf32>, %f: f32, %i: index) -> f32 {
+func.func @atomic_rmw_no_conversion(%F: memref<10xf32>, %f: f32, %i: index) -> (f32, f32) {
   %x = memref.atomic_rmw addf %f, %F[%i] : (f32, memref<10xf32>) -> f32
-  return %x : f32
+  %y = memref.atomic_rmw maximumf %f, %F[%i] : (f32, memref<10xf32>) -> f32
+  return %x, %y : f32, f32
 }
 // CHECK-NOT: generic_atomic_rmw
 

From d8e8ab79773f739c602c5869f80c6c5b5962c558 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Wed, 18 Jun 2025 12:58:17 +0100
Subject: [PATCH 0797/1322] [AArch64][SME] Fix restoring callee-saves from FP
 with hazard padding (#143371)

Currently, when hazard-padding is enabled a (fixed-size) hazard slot is
placed in the CS area, just after the frame record. The size of this
slot is part of the "CalleeSaveBaseToFrameRecordOffset". The SVE
epilogue emission code assumed this offset was always zero, and
incorrectly setting the stack pointer, resulting in all SVE registers
being reloaded from incorrect offsets.

```
| prev_lr                           |
| prev_fp                           |
| (a.k.a. "frame record")           |
|-----------------------------------| <- fp(=x29)
|   <hazard padding>                |
|-----------------------------------| <- callee-saved base
|                                   |
| callee-saved fp/simd/SVE regs     |
|                                   |
|-----------------------------------| <- SVE callee-save base
```

i.e. in the above diagram, the code assumed `fp == callee-saved base`.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |   37 +-
 llvm/test/CodeGen/AArch64/stack-hazard.ll     | 1173 +++++++++++++++++
 2 files changed, 1198 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 7ffe779f2408..a71668e71c23 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2578,20 +2578,33 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
                     DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
                     NeedsWinCFI, &HasWinCFI);
   } else if (SVEStackSize) {
-    // If we have stack realignment or variable sized objects on the stack,
-    // restore the stack pointer from the frame pointer prior to SVE CSR
-    // restoration.
-    if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) {
-      if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
-        // Set SP to start of SVE callee-save area from which they can
-        // be reloaded. The code below will deallocate the stack space
-        // space by moving FP -> SP.
-        emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
-                        StackOffset::getScalable(-CalleeSavedSize), TII,
+    int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize();
+    // If we have stack realignment or variable-sized objects we must use the
+    // FP to restore SVE callee saves (as there is an unknown amount of
+    // data/padding between the SP and SVE CS area).
+    Register BaseForSVEDealloc =
+        (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
+                                                              : AArch64::SP;
+    if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) {
+      Register CalleeSaveBase = AArch64::FP;
+      if (int64_t CalleeSaveBaseOffset =
+              AFI->getCalleeSaveBaseToFrameRecordOffset()) {
+        // If we have have an non-zero offset to the non-SVE CS base we need to
+        // compute the base address by subtracting the offest in a temporary
+        // register first (to avoid briefly deallocating the SVE CS).
+        CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
+            &AArch64::GPR64RegClass);
+        emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
+                        StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
                         MachineInstr::FrameDestroy);
       }
-    } else {
-      if (AFI->getSVECalleeSavedStackSize()) {
+      // The code below will deallocate the stack space space by moving the
+      // SP to the start of the SVE callee-save area.
+      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
+                      StackOffset::getScalable(-SVECalleeSavedSize), TII,
+                      MachineInstr::FrameDestroy);
+    } else if (BaseForSVEDealloc == AArch64::SP) {
+      if (SVECalleeSavedSize) {
         // Deallocate the non-SVE locals first before we can deallocate (and
         // restore callee saves) from the SVE area.
         emitFrameOffset(
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index e169b199733b..3a3340520013 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -3143,3 +3143,1176 @@ entry:
   call void @bar(ptr noundef nonnull %b)
   ret i32 0
 }
+
+
+define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: svecc_call_dynamic_alloca:
+; CHECK0:       // %bb.0: // %entry
+; CHECK0-NEXT:    stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_def_cfa_offset 64
+; CHECK0-NEXT:    cntd x9
+; CHECK0-NEXT:    stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT:    mov x29, sp
+; CHECK0-NEXT:    .cfi_def_cfa w29, 64
+; CHECK0-NEXT:    .cfi_offset w19, -8
+; CHECK0-NEXT:    .cfi_offset w20, -16
+; CHECK0-NEXT:    .cfi_offset w26, -24
+; CHECK0-NEXT:    .cfi_offset w27, -32
+; CHECK0-NEXT:    .cfi_offset w28, -40
+; CHECK0-NEXT:    .cfi_offset w30, -56
+; CHECK0-NEXT:    .cfi_offset w29, -64
+; CHECK0-NEXT:    addvl sp, sp, #-18
+; CHECK0-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 64 - 8 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 64 - 16 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 64 - 24 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 64 - 32 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 64 - 40 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 64 - 48 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG
+; CHECK0-NEXT:    mov w9, w0
+; CHECK0-NEXT:    mov x8, sp
+; CHECK0-NEXT:    mov w2, w1
+; CHECK0-NEXT:    add x9, x9, #15
+; CHECK0-NEXT:    mov x19, sp
+; CHECK0-NEXT:    and x9, x9, #0x1fffffff0
+; CHECK0-NEXT:    sub x8, x8, x9
+; CHECK0-NEXT:    mov sp, x8
+; CHECK0-NEXT:    //APP
+; CHECK0-NEXT:    //NO_APP
+; CHECK0-NEXT:    bl __arm_sme_state
+; CHECK0-NEXT:    and x20, x0, #0x1
+; CHECK0-NEXT:    .cfi_offset vg, -48
+; CHECK0-NEXT:    tbz w20, #0, .LBB35_2
+; CHECK0-NEXT:  // %bb.1: // %entry
+; CHECK0-NEXT:    smstop sm
+; CHECK0-NEXT:  .LBB35_2: // %entry
+; CHECK0-NEXT:    mov x0, x8
+; CHECK0-NEXT:    mov w1, #45 // =0x2d
+; CHECK0-NEXT:    bl memset
+; CHECK0-NEXT:    tbz w20, #0, .LBB35_4
+; CHECK0-NEXT:  // %bb.3: // %entry
+; CHECK0-NEXT:    smstart sm
+; CHECK0-NEXT:  .LBB35_4: // %entry
+; CHECK0-NEXT:    mov w0, #22647 // =0x5877
+; CHECK0-NEXT:    movk w0, #59491, lsl #16
+; CHECK0-NEXT:    .cfi_restore vg
+; CHECK0-NEXT:    addvl sp, x29, #-18
+; CHECK0-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    .cfi_restore z8
+; CHECK0-NEXT:    .cfi_restore z9
+; CHECK0-NEXT:    .cfi_restore z10
+; CHECK0-NEXT:    .cfi_restore z11
+; CHECK0-NEXT:    .cfi_restore z12
+; CHECK0-NEXT:    .cfi_restore z13
+; CHECK0-NEXT:    .cfi_restore z14
+; CHECK0-NEXT:    .cfi_restore z15
+; CHECK0-NEXT:    mov sp, x29
+; CHECK0-NEXT:    .cfi_def_cfa wsp, 64
+; CHECK0-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK0-NEXT:    ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK0-NEXT:    .cfi_def_cfa_offset 0
+; CHECK0-NEXT:    .cfi_restore w19
+; CHECK0-NEXT:    .cfi_restore w20
+; CHECK0-NEXT:    .cfi_restore w26
+; CHECK0-NEXT:    .cfi_restore w27
+; CHECK0-NEXT:    .cfi_restore w28
+; CHECK0-NEXT:    .cfi_restore w30
+; CHECK0-NEXT:    .cfi_restore w29
+; CHECK0-NEXT:    ret
+;
+; CHECK64-LABEL: svecc_call_dynamic_alloca:
+; CHECK64:       // %bb.0: // %entry
+; CHECK64-NEXT:    sub sp, sp, #128
+; CHECK64-NEXT:    .cfi_def_cfa_offset 128
+; CHECK64-NEXT:    cntd x9
+; CHECK64-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x27, x26, [sp, #96] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK64-NEXT:    add x29, sp, #64
+; CHECK64-NEXT:    .cfi_def_cfa w29, 64
+; CHECK64-NEXT:    .cfi_offset w19, -8
+; CHECK64-NEXT:    .cfi_offset w20, -16
+; CHECK64-NEXT:    .cfi_offset w26, -24
+; CHECK64-NEXT:    .cfi_offset w27, -32
+; CHECK64-NEXT:    .cfi_offset w28, -40
+; CHECK64-NEXT:    .cfi_offset w30, -56
+; CHECK64-NEXT:    .cfi_offset w29, -64
+; CHECK64-NEXT:    addvl sp, sp, #-18
+; CHECK64-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 128 - 8 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 128 - 16 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 128 - 24 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 128 - 32 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 128 - 40 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 128 - 48 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 128 - 56 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG
+; CHECK64-NEXT:    sub sp, sp, #64
+; CHECK64-NEXT:    mov w9, w0
+; CHECK64-NEXT:    mov x8, sp
+; CHECK64-NEXT:    mov w2, w1
+; CHECK64-NEXT:    add x9, x9, #15
+; CHECK64-NEXT:    mov x19, sp
+; CHECK64-NEXT:    and x9, x9, #0x1fffffff0
+; CHECK64-NEXT:    sub x8, x8, x9
+; CHECK64-NEXT:    mov sp, x8
+; CHECK64-NEXT:    //APP
+; CHECK64-NEXT:    //NO_APP
+; CHECK64-NEXT:    bl __arm_sme_state
+; CHECK64-NEXT:    and x20, x0, #0x1
+; CHECK64-NEXT:    .cfi_offset vg, -48
+; CHECK64-NEXT:    tbz w20, #0, .LBB35_2
+; CHECK64-NEXT:  // %bb.1: // %entry
+; CHECK64-NEXT:    smstop sm
+; CHECK64-NEXT:  .LBB35_2: // %entry
+; CHECK64-NEXT:    mov x0, x8
+; CHECK64-NEXT:    mov w1, #45 // =0x2d
+; CHECK64-NEXT:    bl memset
+; CHECK64-NEXT:    tbz w20, #0, .LBB35_4
+; CHECK64-NEXT:  // %bb.3: // %entry
+; CHECK64-NEXT:    smstart sm
+; CHECK64-NEXT:  .LBB35_4: // %entry
+; CHECK64-NEXT:    mov w0, #22647 // =0x5877
+; CHECK64-NEXT:    movk w0, #59491, lsl #16
+; CHECK64-NEXT:    .cfi_restore vg
+; CHECK64-NEXT:    sub x8, x29, #64
+; CHECK64-NEXT:    addvl sp, x8, #-18
+; CHECK64-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    .cfi_restore z8
+; CHECK64-NEXT:    .cfi_restore z9
+; CHECK64-NEXT:    .cfi_restore z10
+; CHECK64-NEXT:    .cfi_restore z11
+; CHECK64-NEXT:    .cfi_restore z12
+; CHECK64-NEXT:    .cfi_restore z13
+; CHECK64-NEXT:    .cfi_restore z14
+; CHECK64-NEXT:    .cfi_restore z15
+; CHECK64-NEXT:    sub sp, x29, #64
+; CHECK64-NEXT:    .cfi_def_cfa wsp, 128
+; CHECK64-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr x28, [sp, #88] // 8-byte Folded Reload
+; CHECK64-NEXT:    ldp x27, x26, [sp, #96] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK64-NEXT:    add sp, sp, #128
+; CHECK64-NEXT:    .cfi_def_cfa_offset 0
+; CHECK64-NEXT:    .cfi_restore w19
+; CHECK64-NEXT:    .cfi_restore w20
+; CHECK64-NEXT:    .cfi_restore w26
+; CHECK64-NEXT:    .cfi_restore w27
+; CHECK64-NEXT:    .cfi_restore w28
+; CHECK64-NEXT:    .cfi_restore w30
+; CHECK64-NEXT:    .cfi_restore w29
+; CHECK64-NEXT:    ret
+;
+; CHECK1024-LABEL: svecc_call_dynamic_alloca:
+; CHECK1024:       // %bb.0: // %entry
+; CHECK1024-NEXT:    sub sp, sp, #1088
+; CHECK1024-NEXT:    .cfi_def_cfa_offset 1088
+; CHECK1024-NEXT:    cntd x9
+; CHECK1024-NEXT:    str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x20, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1080] // 8-byte Folded Spill
+; CHECK1024-NEXT:    add x29, sp, #1024
+; CHECK1024-NEXT:    .cfi_def_cfa w29, 64
+; CHECK1024-NEXT:    .cfi_offset w19, -8
+; CHECK1024-NEXT:    .cfi_offset w20, -16
+; CHECK1024-NEXT:    .cfi_offset w26, -24
+; CHECK1024-NEXT:    .cfi_offset w27, -32
+; CHECK1024-NEXT:    .cfi_offset w28, -40
+; CHECK1024-NEXT:    .cfi_offset w30, -56
+; CHECK1024-NEXT:    .cfi_offset w29, -64
+; CHECK1024-NEXT:    addvl sp, sp, #-18
+; CHECK1024-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1088 - 8 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1088 - 16 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1088 - 24 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1088 - 32 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1088 - 40 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1088 - 48 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1088 - 56 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG
+; CHECK1024-NEXT:    sub sp, sp, #1024
+; CHECK1024-NEXT:    mov w9, w0
+; CHECK1024-NEXT:    mov x8, sp
+; CHECK1024-NEXT:    mov w2, w1
+; CHECK1024-NEXT:    add x9, x9, #15
+; CHECK1024-NEXT:    mov x19, sp
+; CHECK1024-NEXT:    and x9, x9, #0x1fffffff0
+; CHECK1024-NEXT:    sub x8, x8, x9
+; CHECK1024-NEXT:    mov sp, x8
+; CHECK1024-NEXT:    //APP
+; CHECK1024-NEXT:    //NO_APP
+; CHECK1024-NEXT:    bl __arm_sme_state
+; CHECK1024-NEXT:    and x20, x0, #0x1
+; CHECK1024-NEXT:    .cfi_offset vg, -48
+; CHECK1024-NEXT:    tbz w20, #0, .LBB35_2
+; CHECK1024-NEXT:  // %bb.1: // %entry
+; CHECK1024-NEXT:    smstop sm
+; CHECK1024-NEXT:  .LBB35_2: // %entry
+; CHECK1024-NEXT:    mov x0, x8
+; CHECK1024-NEXT:    mov w1, #45 // =0x2d
+; CHECK1024-NEXT:    bl memset
+; CHECK1024-NEXT:    tbz w20, #0, .LBB35_4
+; CHECK1024-NEXT:  // %bb.3: // %entry
+; CHECK1024-NEXT:    smstart sm
+; CHECK1024-NEXT:  .LBB35_4: // %entry
+; CHECK1024-NEXT:    mov w0, #22647 // =0x5877
+; CHECK1024-NEXT:    movk w0, #59491, lsl #16
+; CHECK1024-NEXT:    .cfi_restore vg
+; CHECK1024-NEXT:    sub x8, x29, #1024
+; CHECK1024-NEXT:    addvl sp, x8, #-18
+; CHECK1024-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    .cfi_restore z8
+; CHECK1024-NEXT:    .cfi_restore z9
+; CHECK1024-NEXT:    .cfi_restore z10
+; CHECK1024-NEXT:    .cfi_restore z11
+; CHECK1024-NEXT:    .cfi_restore z12
+; CHECK1024-NEXT:    .cfi_restore z13
+; CHECK1024-NEXT:    .cfi_restore z14
+; CHECK1024-NEXT:    .cfi_restore z15
+; CHECK1024-NEXT:    sub sp, x29, #1024
+; CHECK1024-NEXT:    .cfi_def_cfa wsp, 1088
+; CHECK1024-NEXT:    ldr x19, [sp, #1080] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x20, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT:    add sp, sp, #1088
+; CHECK1024-NEXT:    .cfi_def_cfa_offset 0
+; CHECK1024-NEXT:    .cfi_restore w19
+; CHECK1024-NEXT:    .cfi_restore w20
+; CHECK1024-NEXT:    .cfi_restore w26
+; CHECK1024-NEXT:    .cfi_restore w27
+; CHECK1024-NEXT:    .cfi_restore w28
+; CHECK1024-NEXT:    .cfi_restore w30
+; CHECK1024-NEXT:    .cfi_restore w29
+; CHECK1024-NEXT:    ret
+entry:
+  %ptr = alloca i8, i32 %P1
+  tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+  %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2)
+  ret i32 -396142473
+}
+
+
+define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: svecc_call_realign:
+; CHECK0:       // %bb.0: // %entry
+; CHECK0-NEXT:    stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_def_cfa_offset 64
+; CHECK0-NEXT:    cntd x9
+; CHECK0-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK0-NEXT:    stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT:    mov x29, sp
+; CHECK0-NEXT:    .cfi_def_cfa w29, 64
+; CHECK0-NEXT:    .cfi_offset w19, -8
+; CHECK0-NEXT:    .cfi_offset w26, -16
+; CHECK0-NEXT:    .cfi_offset w27, -24
+; CHECK0-NEXT:    .cfi_offset w28, -32
+; CHECK0-NEXT:    .cfi_offset w30, -56
+; CHECK0-NEXT:    .cfi_offset w29, -64
+; CHECK0-NEXT:    addvl sp, sp, #-18
+; CHECK0-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 64 - 8 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 64 - 16 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 64 - 24 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 64 - 32 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 64 - 40 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 64 - 48 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG
+; CHECK0-NEXT:    sub x9, sp, #1024
+; CHECK0-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK0-NEXT:    mov w2, w1
+; CHECK0-NEXT:    //APP
+; CHECK0-NEXT:    //NO_APP
+; CHECK0-NEXT:    bl __arm_sme_state
+; CHECK0-NEXT:    and x19, x0, #0x1
+; CHECK0-NEXT:    .cfi_offset vg, -48
+; CHECK0-NEXT:    tbz w19, #0, .LBB36_2
+; CHECK0-NEXT:  // %bb.1: // %entry
+; CHECK0-NEXT:    smstop sm
+; CHECK0-NEXT:  .LBB36_2: // %entry
+; CHECK0-NEXT:    mov x0, sp
+; CHECK0-NEXT:    mov w1, #45 // =0x2d
+; CHECK0-NEXT:    bl memset
+; CHECK0-NEXT:    tbz w19, #0, .LBB36_4
+; CHECK0-NEXT:  // %bb.3: // %entry
+; CHECK0-NEXT:    smstart sm
+; CHECK0-NEXT:  .LBB36_4: // %entry
+; CHECK0-NEXT:    mov w0, #22647 // =0x5877
+; CHECK0-NEXT:    movk w0, #59491, lsl #16
+; CHECK0-NEXT:    .cfi_restore vg
+; CHECK0-NEXT:    addvl sp, x29, #-18
+; CHECK0-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    .cfi_restore z8
+; CHECK0-NEXT:    .cfi_restore z9
+; CHECK0-NEXT:    .cfi_restore z10
+; CHECK0-NEXT:    .cfi_restore z11
+; CHECK0-NEXT:    .cfi_restore z12
+; CHECK0-NEXT:    .cfi_restore z13
+; CHECK0-NEXT:    .cfi_restore z14
+; CHECK0-NEXT:    .cfi_restore z15
+; CHECK0-NEXT:    mov sp, x29
+; CHECK0-NEXT:    .cfi_def_cfa wsp, 64
+; CHECK0-NEXT:    ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK0-NEXT:    .cfi_def_cfa_offset 0
+; CHECK0-NEXT:    .cfi_restore w19
+; CHECK0-NEXT:    .cfi_restore w26
+; CHECK0-NEXT:    .cfi_restore w27
+; CHECK0-NEXT:    .cfi_restore w28
+; CHECK0-NEXT:    .cfi_restore w30
+; CHECK0-NEXT:    .cfi_restore w29
+; CHECK0-NEXT:    ret
+;
+; CHECK64-LABEL: svecc_call_realign:
+; CHECK64:       // %bb.0: // %entry
+; CHECK64-NEXT:    sub sp, sp, #128
+; CHECK64-NEXT:    .cfi_def_cfa_offset 128
+; CHECK64-NEXT:    cntd x9
+; CHECK64-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x27, x26, [sp, #96] // 16-byte Folded Spill
+; CHECK64-NEXT:    str x19, [sp, #112] // 8-byte Folded Spill
+; CHECK64-NEXT:    add x29, sp, #64
+; CHECK64-NEXT:    .cfi_def_cfa w29, 64
+; CHECK64-NEXT:    .cfi_offset w19, -16
+; CHECK64-NEXT:    .cfi_offset w26, -24
+; CHECK64-NEXT:    .cfi_offset w27, -32
+; CHECK64-NEXT:    .cfi_offset w28, -40
+; CHECK64-NEXT:    .cfi_offset w30, -56
+; CHECK64-NEXT:    .cfi_offset w29, -64
+; CHECK64-NEXT:    addvl sp, sp, #-18
+; CHECK64-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 128 - 8 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 128 - 16 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 128 - 24 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 128 - 32 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 128 - 40 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 128 - 48 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 128 - 56 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG
+; CHECK64-NEXT:    sub x9, sp, #1088
+; CHECK64-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK64-NEXT:    mov w2, w1
+; CHECK64-NEXT:    //APP
+; CHECK64-NEXT:    //NO_APP
+; CHECK64-NEXT:    bl __arm_sme_state
+; CHECK64-NEXT:    and x19, x0, #0x1
+; CHECK64-NEXT:    .cfi_offset vg, -48
+; CHECK64-NEXT:    tbz w19, #0, .LBB36_2
+; CHECK64-NEXT:  // %bb.1: // %entry
+; CHECK64-NEXT:    smstop sm
+; CHECK64-NEXT:  .LBB36_2: // %entry
+; CHECK64-NEXT:    mov x0, sp
+; CHECK64-NEXT:    mov w1, #45 // =0x2d
+; CHECK64-NEXT:    bl memset
+; CHECK64-NEXT:    tbz w19, #0, .LBB36_4
+; CHECK64-NEXT:  // %bb.3: // %entry
+; CHECK64-NEXT:    smstart sm
+; CHECK64-NEXT:  .LBB36_4: // %entry
+; CHECK64-NEXT:    mov w0, #22647 // =0x5877
+; CHECK64-NEXT:    movk w0, #59491, lsl #16
+; CHECK64-NEXT:    .cfi_restore vg
+; CHECK64-NEXT:    sub x8, x29, #64
+; CHECK64-NEXT:    addvl sp, x8, #-18
+; CHECK64-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    .cfi_restore z8
+; CHECK64-NEXT:    .cfi_restore z9
+; CHECK64-NEXT:    .cfi_restore z10
+; CHECK64-NEXT:    .cfi_restore z11
+; CHECK64-NEXT:    .cfi_restore z12
+; CHECK64-NEXT:    .cfi_restore z13
+; CHECK64-NEXT:    .cfi_restore z14
+; CHECK64-NEXT:    .cfi_restore z15
+; CHECK64-NEXT:    sub sp, x29, #64
+; CHECK64-NEXT:    .cfi_def_cfa wsp, 128
+; CHECK64-NEXT:    ldp x26, x19, [sp, #104] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x28, x27, [sp, #88] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK64-NEXT:    add sp, sp, #128
+; CHECK64-NEXT:    .cfi_def_cfa_offset 0
+; CHECK64-NEXT:    .cfi_restore w19
+; CHECK64-NEXT:    .cfi_restore w26
+; CHECK64-NEXT:    .cfi_restore w27
+; CHECK64-NEXT:    .cfi_restore w28
+; CHECK64-NEXT:    .cfi_restore w30
+; CHECK64-NEXT:    .cfi_restore w29
+; CHECK64-NEXT:    ret
+;
+; CHECK1024-LABEL: svecc_call_realign:
+; CHECK1024:       // %bb.0: // %entry
+; CHECK1024-NEXT:    sub sp, sp, #1088
+; CHECK1024-NEXT:    .cfi_def_cfa_offset 1088
+; CHECK1024-NEXT:    cntd x9
+; CHECK1024-NEXT:    str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NEXT:    add x29, sp, #1024
+; CHECK1024-NEXT:    .cfi_def_cfa w29, 64
+; CHECK1024-NEXT:    .cfi_offset w19, -16
+; CHECK1024-NEXT:    .cfi_offset w26, -24
+; CHECK1024-NEXT:    .cfi_offset w27, -32
+; CHECK1024-NEXT:    .cfi_offset w28, -40
+; CHECK1024-NEXT:    .cfi_offset w30, -56
+; CHECK1024-NEXT:    .cfi_offset w29, -64
+; CHECK1024-NEXT:    addvl sp, sp, #-18
+; CHECK1024-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1088 - 8 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1088 - 16 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1088 - 24 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1088 - 32 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1088 - 40 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1088 - 48 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1088 - 56 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG
+; CHECK1024-NEXT:    sub x9, sp, #2048
+; CHECK1024-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK1024-NEXT:    mov w2, w1
+; CHECK1024-NEXT:    //APP
+; CHECK1024-NEXT:    //NO_APP
+; CHECK1024-NEXT:    bl __arm_sme_state
+; CHECK1024-NEXT:    and x19, x0, #0x1
+; CHECK1024-NEXT:    .cfi_offset vg, -48
+; CHECK1024-NEXT:    tbz w19, #0, .LBB36_2
+; CHECK1024-NEXT:  // %bb.1: // %entry
+; CHECK1024-NEXT:    smstop sm
+; CHECK1024-NEXT:  .LBB36_2: // %entry
+; CHECK1024-NEXT:    mov x0, sp
+; CHECK1024-NEXT:    mov w1, #45 // =0x2d
+; CHECK1024-NEXT:    bl memset
+; CHECK1024-NEXT:    tbz w19, #0, .LBB36_4
+; CHECK1024-NEXT:  // %bb.3: // %entry
+; CHECK1024-NEXT:    smstart sm
+; CHECK1024-NEXT:  .LBB36_4: // %entry
+; CHECK1024-NEXT:    mov w0, #22647 // =0x5877
+; CHECK1024-NEXT:    movk w0, #59491, lsl #16
+; CHECK1024-NEXT:    .cfi_restore vg
+; CHECK1024-NEXT:    sub x8, x29, #1024
+; CHECK1024-NEXT:    addvl sp, x8, #-18
+; CHECK1024-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    .cfi_restore z8
+; CHECK1024-NEXT:    .cfi_restore z9
+; CHECK1024-NEXT:    .cfi_restore z10
+; CHECK1024-NEXT:    .cfi_restore z11
+; CHECK1024-NEXT:    .cfi_restore z12
+; CHECK1024-NEXT:    .cfi_restore z13
+; CHECK1024-NEXT:    .cfi_restore z14
+; CHECK1024-NEXT:    .cfi_restore z15
+; CHECK1024-NEXT:    sub sp, x29, #1024
+; CHECK1024-NEXT:    .cfi_def_cfa wsp, 1088
+; CHECK1024-NEXT:    ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT:    add sp, sp, #1088
+; CHECK1024-NEXT:    .cfi_def_cfa_offset 0
+; CHECK1024-NEXT:    .cfi_restore w19
+; CHECK1024-NEXT:    .cfi_restore w26
+; CHECK1024-NEXT:    .cfi_restore w27
+; CHECK1024-NEXT:    .cfi_restore w28
+; CHECK1024-NEXT:    .cfi_restore w30
+; CHECK1024-NEXT:    .cfi_restore w29
+; CHECK1024-NEXT:    ret
+entry:
+  %ptr = alloca i8, i32 1000, align 32
+  tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+  %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2)
+  ret i32 -396142473
+}
+
+
+define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK0-LABEL: svecc_call_dynamic_and_scalable_alloca:
+; CHECK0:       // %bb.0: // %entry
+; CHECK0-NEXT:    stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK0-NEXT:    str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK0-NEXT:    mov x29, sp
+; CHECK0-NEXT:    stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT:    addvl sp, sp, #-18
+; CHECK0-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK0-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK0-NEXT:    sub sp, sp, #48
+; CHECK0-NEXT:    addvl sp, sp, #-1
+; CHECK0-NEXT:    mov x19, sp
+; CHECK0-NEXT:    .cfi_def_cfa w29, 64
+; CHECK0-NEXT:    .cfi_offset w19, -8
+; CHECK0-NEXT:    .cfi_offset w20, -16
+; CHECK0-NEXT:    .cfi_offset w26, -24
+; CHECK0-NEXT:    .cfi_offset w27, -32
+; CHECK0-NEXT:    .cfi_offset w28, -48
+; CHECK0-NEXT:    .cfi_offset w30, -56
+; CHECK0-NEXT:    .cfi_offset w29, -64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 64 - 8 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 64 - 16 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 64 - 24 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 64 - 32 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 64 - 40 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 64 - 48 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG
+; CHECK0-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK0-NEXT:    ubfiz x8, x0, #2, #32
+; CHECK0-NEXT:    mov x9, sp
+; CHECK0-NEXT:    add x8, x8, #15
+; CHECK0-NEXT:    and x8, x8, #0x7fffffff0
+; CHECK0-NEXT:    sub x20, x9, x8
+; CHECK0-NEXT:    mov sp, x20
+; CHECK0-NEXT:    //APP
+; CHECK0-NEXT:    //NO_APP
+; CHECK0-NEXT:    add x0, x19, #8
+; CHECK0-NEXT:    bl bar
+; CHECK0-NEXT:    addvl x0, x29, #-19
+; CHECK0-NEXT:    bl bar
+; CHECK0-NEXT:    mov x0, x20
+; CHECK0-NEXT:    bl bar
+; CHECK0-NEXT:    mov w0, #22647 // =0x5877
+; CHECK0-NEXT:    movk w0, #59491, lsl #16
+; CHECK0-NEXT:    addvl sp, x29, #-18
+; CHECK0-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK0-NEXT:    mov sp, x29
+; CHECK0-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK0-NEXT:    ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK0-NEXT:    ret
+;
+; CHECK64-LABEL: svecc_call_dynamic_and_scalable_alloca:
+; CHECK64:       // %bb.0: // %entry
+; CHECK64-NEXT:    sub sp, sp, #128
+; CHECK64-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK64-NEXT:    add x29, sp, #64
+; CHECK64-NEXT:    stp x28, x27, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x26, x20, [sp, #96] // 16-byte Folded Spill
+; CHECK64-NEXT:    str x19, [sp, #112] // 8-byte Folded Spill
+; CHECK64-NEXT:    addvl sp, sp, #-18
+; CHECK64-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK64-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT:    sub sp, sp, #112
+; CHECK64-NEXT:    addvl sp, sp, #-1
+; CHECK64-NEXT:    mov x19, sp
+; CHECK64-NEXT:    .cfi_def_cfa w29, 64
+; CHECK64-NEXT:    .cfi_offset w19, -16
+; CHECK64-NEXT:    .cfi_offset w20, -24
+; CHECK64-NEXT:    .cfi_offset w26, -32
+; CHECK64-NEXT:    .cfi_offset w27, -40
+; CHECK64-NEXT:    .cfi_offset w28, -48
+; CHECK64-NEXT:    .cfi_offset w30, -56
+; CHECK64-NEXT:    .cfi_offset w29, -64
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 128 - 8 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 128 - 16 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 128 - 24 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 128 - 32 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 128 - 40 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 128 - 48 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 128 - 56 * VG
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG
+; CHECK64-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK64-NEXT:    ubfiz x8, x0, #2, #32
+; CHECK64-NEXT:    mov x9, sp
+; CHECK64-NEXT:    add x8, x8, #15
+; CHECK64-NEXT:    and x8, x8, #0x7fffffff0
+; CHECK64-NEXT:    sub x20, x9, x8
+; CHECK64-NEXT:    mov sp, x20
+; CHECK64-NEXT:    //APP
+; CHECK64-NEXT:    //NO_APP
+; CHECK64-NEXT:    add x0, x19, #8
+; CHECK64-NEXT:    bl bar
+; CHECK64-NEXT:    sub x0, x29, #64
+; CHECK64-NEXT:    addvl x0, x0, #-19
+; CHECK64-NEXT:    bl bar
+; CHECK64-NEXT:    mov x0, x20
+; CHECK64-NEXT:    bl bar
+; CHECK64-NEXT:    mov w0, #22647 // =0x5877
+; CHECK64-NEXT:    sub x8, x29, #64
+; CHECK64-NEXT:    movk w0, #59491, lsl #16
+; CHECK64-NEXT:    addvl sp, x8, #-18
+; CHECK64-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK64-NEXT:    sub sp, x29, #64
+; CHECK64-NEXT:    ldp x20, x19, [sp, #104] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK64-NEXT:    ldp x27, x26, [sp, #88] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x30, x28, [sp, #72] // 16-byte Folded Reload
+; CHECK64-NEXT:    add sp, sp, #128
+; CHECK64-NEXT:    ret
+;
+; CHECK1024-LABEL: svecc_call_dynamic_and_scalable_alloca:
+; CHECK1024:       // %bb.0: // %entry
+; CHECK1024-NEXT:    sub sp, sp, #1088
+; CHECK1024-NEXT:    str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NEXT:    add x29, sp, #1024
+; CHECK1024-NEXT:    str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x28, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x27, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x26, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x20, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NEXT:    addvl sp, sp, #-18
+; CHECK1024-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NEXT:    sub sp, sp, #1072
+; CHECK1024-NEXT:    addvl sp, sp, #-1
+; CHECK1024-NEXT:    mov x19, sp
+; CHECK1024-NEXT:    .cfi_def_cfa w29, 64
+; CHECK1024-NEXT:    .cfi_offset w19, -16
+; CHECK1024-NEXT:    .cfi_offset w20, -24
+; CHECK1024-NEXT:    .cfi_offset w26, -32
+; CHECK1024-NEXT:    .cfi_offset w27, -40
+; CHECK1024-NEXT:    .cfi_offset w28, -48
+; CHECK1024-NEXT:    .cfi_offset w30, -56
+; CHECK1024-NEXT:    .cfi_offset w29, -64
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 1088 - 8 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 1088 - 16 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 1088 - 24 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 1088 - 32 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 1088 - 40 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 1088 - 48 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 1088 - 56 * VG
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG
+; CHECK1024-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK1024-NEXT:    ubfiz x8, x0, #2, #32
+; CHECK1024-NEXT:    mov x9, sp
+; CHECK1024-NEXT:    add x8, x8, #15
+; CHECK1024-NEXT:    and x8, x8, #0x7fffffff0
+; CHECK1024-NEXT:    sub x20, x9, x8
+; CHECK1024-NEXT:    mov sp, x20
+; CHECK1024-NEXT:    //APP
+; CHECK1024-NEXT:    //NO_APP
+; CHECK1024-NEXT:    add x0, x19, #8
+; CHECK1024-NEXT:    bl bar
+; CHECK1024-NEXT:    sub x0, x29, #1024
+; CHECK1024-NEXT:    addvl x0, x0, #-19
+; CHECK1024-NEXT:    bl bar
+; CHECK1024-NEXT:    mov x0, x20
+; CHECK1024-NEXT:    bl bar
+; CHECK1024-NEXT:    mov w0, #22647 // =0x5877
+; CHECK1024-NEXT:    sub x8, x29, #1024
+; CHECK1024-NEXT:    movk w0, #59491, lsl #16
+; CHECK1024-NEXT:    addvl sp, x8, #-18
+; CHECK1024-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NEXT:    sub sp, x29, #1024
+; CHECK1024-NEXT:    ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x20, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x26, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x27, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1040] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT:    add sp, sp, #1088
+; CHECK1024-NEXT:    ret
+entry:
+  %a = alloca i32, i32 10
+  %b = alloca <vscale x 4 x i32>
+  %c = alloca i32, i32 %P1, align 4
+  tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+  call void @bar(ptr noundef nonnull %a)
+  call void @bar(ptr noundef nonnull %b)
+  call void @bar(ptr noundef nonnull %c)
+  ret i32 -396142473
+}

From 34a48941498d95ec2682f7adaeb6115b7b4d70ba Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 18 Jun 2025 13:06:49 +0100
Subject: [PATCH 0798/1322] [X86] detectZextAbsDiff - use
 SDPatternMatch::m_Abs() matcher. NFC.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1ca5fc5376f0..4751361c71f2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46009,11 +46009,9 @@ static bool detectZextAbsDiff(SDValue Abs, SDValue &Op0, SDValue &Op1) {
 
   // Check if the operands of the sub are zero-extended from vectors of i8.
   EVT SrcVT0, SrcVT1;
-  return sd_match(
-             Abs,
-             m_UnaryOp(ISD::ABS,
-                       m_Sub(m_AllOf(m_Value(Op0), m_ZExt(m_VT(SrcVT0))),
-                             m_AllOf(m_Value(Op1), m_ZExt(m_VT(SrcVT1)))))) &&
+  return sd_match(Abs,
+                  m_Abs(m_Sub(m_AllOf(m_Value(Op0), m_ZExt(m_VT(SrcVT0))),
+                              m_AllOf(m_Value(Op1), m_ZExt(m_VT(SrcVT1)))))) &&
          SrcVT0.getVectorElementType() == MVT::i8 &&
          SrcVT1.getVectorElementType() == MVT::i8;
 }

From 7c15edb306932e41c159f3d69c161ed0d89d47b7 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Wed, 18 Jun 2025 14:37:29 +0200
Subject: [PATCH 0799/1322] =?UTF-8?q?Reapply=20"[clang][bytecode]=20Alloca?=
 =?UTF-8?q?te=20IntegralAP=20and=20Floating=20types=20usi=E2=80=A6=20(#144?=
 =?UTF-8?q?676)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ng an allocator (#144246)"

This reverts commit 57828fec760f086b334ce0cb1c465fc559dcaea4.
---
 clang/lib/AST/ByteCode/Compiler.cpp           | 122 ++++---
 clang/lib/AST/ByteCode/Compiler.h             |   1 +
 clang/lib/AST/ByteCode/Descriptor.cpp         |   2 +-
 clang/lib/AST/ByteCode/Disasm.cpp             |  58 ++-
 clang/lib/AST/ByteCode/Floating.h             | 252 ++++++++-----
 clang/lib/AST/ByteCode/Integral.h             |   3 +
 clang/lib/AST/ByteCode/IntegralAP.h           | 234 +++++++-----
 clang/lib/AST/ByteCode/Interp.cpp             | 106 +++++-
 clang/lib/AST/ByteCode/Interp.h               | 341 ++++++++++++++----
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  55 ++-
 .../lib/AST/ByteCode/InterpBuiltinBitCast.cpp |   4 +-
 clang/lib/AST/ByteCode/InterpState.h          |  30 ++
 clang/lib/AST/ByteCode/Opcodes.td             |  14 +-
 clang/lib/AST/ByteCode/PrimType.h             |  17 +
 clang/lib/AST/ByteCode/Program.h              |  24 +-
 .../ByteCode/builtin-bit-cast-long-double.cpp |  10 +-
 clang/test/AST/ByteCode/builtin-functions.cpp |  12 +-
 17 files changed, 936 insertions(+), 349 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 9fe4803ce98e..3f884ed8d094 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -748,7 +748,8 @@ bool Compiler<Emitter>::VisitFloatingLiteral(const FloatingLiteral *E) {
   if (DiscardResult)
     return true;
 
-  return this->emitConstFloat(E->getValue(), E);
+  APFloat F = E->getValue();
+  return this->emitFloat(F, E);
 }
 
 template <class Emitter>
@@ -4185,8 +4186,10 @@ bool Compiler<Emitter>::visitZeroInitializer(PrimType T, QualType QT,
                              nullptr, E);
   case PT_MemberPtr:
     return this->emitNullMemberPtr(0, nullptr, E);
-  case PT_Float:
-    return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E);
+  case PT_Float: {
+    APFloat F = APFloat::getZero(Ctx.getFloatSemantics(QT));
+    return this->emitFloat(F, E);
+  }
   case PT_FixedPoint: {
     auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType());
     return this->emitConstFixedPoint(FixedPoint::zero(Sem), E);
@@ -4674,10 +4677,7 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       if (!visitInitializer(Init))
         return false;
 
-      if (!this->emitFinishInit(Init))
-        return false;
-
-      return this->emitPopPtr(Init);
+      return this->emitFinishInitGlobal(Init);
     };
 
     DeclScope<Emitter> LocalScope(this, VD);
@@ -4698,51 +4698,45 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       return false;
 
     return !Init || (checkDecl() && initGlobal(*GlobalIndex));
-  } else {
-    InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
-
-    if (VarT) {
-      unsigned Offset = this->allocateLocalPrimitive(
-          VD, *VarT, VD->getType().isConstQualified(), nullptr,
-          ScopeKind::Block, IsConstexprUnknown);
-      if (Init) {
-        // If this is a toplevel declaration, create a scope for the
-        // initializer.
-        if (Toplevel) {
-          LocalScope<Emitter> Scope(this);
-          if (!this->visit(Init))
-            return false;
-          return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
-        } else {
-          if (!this->visit(Init))
-            return false;
-          return this->emitSetLocal(*VarT, Offset, VD);
-        }
-      }
-    } else {
-      if (std::optional<unsigned> Offset =
-              this->allocateLocal(VD, VD->getType(), nullptr, ScopeKind::Block,
-                                  IsConstexprUnknown)) {
-        if (!Init)
-          return true;
-
-        if (!this->emitGetPtrLocal(*Offset, Init))
-          return false;
-
-        if (!visitInitializer(Init))
-          return false;
-
-        if (!this->emitFinishInit(Init))
-          return false;
-
-        return this->emitPopPtr(Init);
-      }
-      return false;
-    }
-    return true;
   }
+  // Local variables.
+  InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
 
-  return false;
+  if (VarT) {
+    unsigned Offset = this->allocateLocalPrimitive(
+        VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block,
+        IsConstexprUnknown);
+    if (Init) {
+      // If this is a toplevel declaration, create a scope for the
+      // initializer.
+      if (Toplevel) {
+        LocalScope<Emitter> Scope(this);
+        if (!this->visit(Init))
+          return false;
+        return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
+      } else {
+        if (!this->visit(Init))
+          return false;
+        return this->emitSetLocal(*VarT, Offset, VD);
+      }
+    }
+  } else {
+    if (std::optional<unsigned> Offset = this->allocateLocal(
+            VD, VD->getType(), nullptr, ScopeKind::Block, IsConstexprUnknown)) {
+      if (!Init)
+        return true;
+
+      if (!this->emitGetPtrLocal(*Offset, Init))
+        return false;
+
+      if (!visitInitializer(Init))
+        return false;
+
+      return this->emitFinishInitPop(Init);
+    }
+    return false;
+  }
+  return true;
 }
 
 template <class Emitter>
@@ -4751,8 +4745,10 @@ bool Compiler<Emitter>::visitAPValue(const APValue &Val, PrimType ValType,
   assert(!DiscardResult);
   if (Val.isInt())
     return this->emitConst(Val.getInt(), ValType, E);
-  else if (Val.isFloat())
-    return this->emitConstFloat(Val.getFloat(), E);
+  else if (Val.isFloat()) {
+    APFloat F = Val.getFloat();
+    return this->emitFloat(F, E);
+  }
 
   if (Val.isLValue()) {
     if (Val.isNullPointer())
@@ -6133,8 +6129,10 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
+      APFloat F(TargetSemantics, 1);
+      if (!this->emitFloat(F, E))
         return false;
+
       if (!this->emitAddf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6176,8 +6174,10 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
+      APFloat F(TargetSemantics, 1);
+      if (!this->emitFloat(F, E))
         return false;
+
       if (!this->emitSubf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6953,6 +6953,20 @@ bool Compiler<Emitter>::emitDummyPtr(const DeclTy &D, const Expr *E) {
   return true;
 }
 
+template <class Emitter>
+bool Compiler<Emitter>::emitFloat(const APFloat &F, const Expr *E) {
+  assert(!DiscardResult && "Should've been checked before");
+
+  if (Floating::singleWord(F.getSemantics()))
+    return this->emitConstFloat(Floating(F), E);
+
+  APInt I = F.bitcastToAPInt();
+  return this->emitConstFloat(
+      Floating(const_cast<uint64_t *>(I.getRawData()),
+               llvm::APFloatBase::SemanticsToEnum(F.getSemantics())),
+      E);
+}
+
 //  This function is constexpr if and only if To, From, and the types of
 //  all subobjects of To and From are types T such that...
 //  (3.1) - is_union_v<T> is false;
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index ac3ad84766dc..a1d068cc7e0a 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -391,6 +391,7 @@ private:
   bool emitRecordDestruction(const Record *R, SourceInfo Loc);
   bool emitDestruction(const Descriptor *Desc, SourceInfo Loc);
   bool emitDummyPtr(const DeclTy &D, const Expr *E);
+  bool emitFloat(const APFloat &F, const Expr *E);
   unsigned collectBaseOffset(const QualType BaseType,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp
index 5531295dfa2f..46e4d0d940b3 100644
--- a/clang/lib/AST/ByteCode/Descriptor.cpp
+++ b/clang/lib/AST/ByteCode/Descriptor.cpp
@@ -368,7 +368,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsTemporary, bool IsConst, UnknownSize)
     : Source(D), ElemSize(primSize(Type)), Size(UnknownSizeMark),
       MDSize(MD.value_or(0)),
-      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)),
+      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)), PrimT(Type),
       IsConst(IsConst), IsMutable(false), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
       DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 846dc2fe92a7..7c6b78386b14 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -50,34 +50,56 @@ inline static std::string printArg(Program &P, CodePtr &OpPC) {
 }
 
 template <> inline std::string printArg<Floating>(Program &P, CodePtr &OpPC) {
-  auto F = Floating::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  auto Sem = Floating::deserializeSemantics(*OpPC);
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  unsigned BitWidth = llvm::APFloatBase::semanticsSizeInBits(
+      llvm::APFloatBase::EnumToSemantics(Sem));
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
+  Floating Result(Memory.get(), Sem);
+  Floating::deserialize(*OpPC, &Result);
+
+  OpPC += align(Result.bytesToSerialize());
+
+  std::string S;
+  llvm::raw_string_ostream SS(S);
+  SS << Result;
+  return S;
 }
 
 template <>
 inline std::string printArg<IntegralAP<false>>(Program &P, CodePtr &OpPC) {
-  auto F = IntegralAP<false>::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  using T = IntegralAP<false>;
+  unsigned BitWidth = T::deserializeSize(*OpPC);
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  T Result(Memory.get(), BitWidth);
+  T::deserialize(*OpPC, &Result);
+
+  OpPC += Result.bytesToSerialize();
+  std::string Str;
+  llvm::raw_string_ostream SS(Str);
+  SS << Result;
+  return Str;
 }
+
 template <>
 inline std::string printArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
-  auto F = IntegralAP<true>::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  using T = IntegralAP<true>;
+  unsigned BitWidth = T::deserializeSize(*OpPC);
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  T Result(Memory.get(), BitWidth);
+  T::deserialize(*OpPC, &Result);
+
+  std::string Str;
+  llvm::raw_string_ostream SS(Str);
+  SS << Result;
+
+  OpPC += Result.bytesToSerialize();
+  return Str;
 }
 
 template <> inline std::string printArg<FixedPoint>(Program &P, CodePtr &OpPC) {
diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
index 3750568fc23c..659892e720ab 100644
--- a/clang/lib/AST/ByteCode/Floating.h
+++ b/clang/lib/AST/ByteCode/Floating.h
@@ -17,63 +17,79 @@
 #include "clang/AST/APValue.h"
 #include "llvm/ADT/APFloat.h"
 
+// XXX This is just a debugging help. Setting this to 1 will heap-allocate ALL
+// floating values.
+#define ALLOCATE_ALL 0
+
 namespace clang {
 namespace interp {
 
 using APFloat = llvm::APFloat;
 using APSInt = llvm::APSInt;
+using APInt = llvm::APInt;
 
+/// If a Floating is constructed from Memory, it DOES NOT OWN THAT MEMORY.
+/// It will NOT copy the memory (unless, of course, copy() is called) and it
+/// won't alllocate anything. The allocation should happen via InterpState or
+/// Program.
 class Floating final {
 private:
-  // The underlying value storage.
-  APFloat F;
+  union {
+    uint64_t Val = 0;
+    uint64_t *Memory;
+  };
+  llvm::APFloatBase::Semantics Semantics;
+
+  APFloat getValue() const {
+    unsigned BitWidth = bitWidth();
+    if (singleWord())
+      return APFloat(getSemantics(), APInt(BitWidth, Val));
+    unsigned NumWords = numWords();
+    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
+  }
 
 public:
-  /// Zero-initializes a Floating.
-  Floating() : F(0.0f) {}
-  Floating(const APFloat &F) : F(F) {}
+  Floating() = default;
+  Floating(llvm::APFloatBase::Semantics Semantics)
+      : Val(0), Semantics(Semantics) {}
+  Floating(const APFloat &F) {
 
-  // Static constructors for special floating point values.
-  static Floating getInf(const llvm::fltSemantics &Sem) {
-    return Floating(APFloat::getInf(Sem));
+    Semantics = llvm::APFloatBase::SemanticsToEnum(F.getSemantics());
+    this->copy(F);
   }
-  const APFloat &getAPFloat() const { return F; }
+  Floating(uint64_t *Memory, llvm::APFloatBase::Semantics Semantics)
+      : Memory(Memory), Semantics(Semantics) {}
 
-  bool operator<(Floating RHS) const { return F < RHS.F; }
-  bool operator>(Floating RHS) const { return F > RHS.F; }
-  bool operator<=(Floating RHS) const { return F <= RHS.F; }
-  bool operator>=(Floating RHS) const { return F >= RHS.F; }
-  bool operator==(Floating RHS) const { return F == RHS.F; }
-  bool operator!=(Floating RHS) const { return F != RHS.F; }
-  Floating operator-() const { return Floating(-F); }
+  APFloat getAPFloat() const { return getValue(); }
+
+  bool operator<(Floating RHS) const { return getValue() < RHS.getValue(); }
+  bool operator>(Floating RHS) const { return getValue() > RHS.getValue(); }
+  bool operator<=(Floating RHS) const { return getValue() <= RHS.getValue(); }
+  bool operator>=(Floating RHS) const { return getValue() >= RHS.getValue(); }
 
   APFloat::opStatus convertToInteger(APSInt &Result) const {
     bool IsExact;
-    return F.convertToInteger(Result, llvm::APFloat::rmTowardZero, &IsExact);
+    return getValue().convertToInteger(Result, llvm::APFloat::rmTowardZero,
+                                       &IsExact);
   }
 
-  Floating toSemantics(const llvm::fltSemantics *Sem,
-                       llvm::RoundingMode RM) const {
-    APFloat Copy = F;
+  void toSemantics(const llvm::fltSemantics *Sem, llvm::RoundingMode RM,
+                   Floating *Result) const {
+    APFloat Copy = getValue();
     bool LosesInfo;
     Copy.convert(*Sem, RM, &LosesInfo);
     (void)LosesInfo;
-    return Floating(Copy);
-  }
-
-  /// Convert this Floating to one with the same semantics as \Other.
-  Floating toSemantics(const Floating &Other, llvm::RoundingMode RM) const {
-    return toSemantics(&Other.F.getSemantics(), RM);
+    Result->copy(Copy);
   }
 
   APSInt toAPSInt(unsigned NumBits = 0) const {
-    return APSInt(F.bitcastToAPInt());
+    return APSInt(getValue().bitcastToAPInt());
   }
-  APValue toAPValue(const ASTContext &) const { return APValue(F); }
+  APValue toAPValue(const ASTContext &) const { return APValue(getValue()); }
   void print(llvm::raw_ostream &OS) const {
     // Can't use APFloat::print() since it appends a newline.
     SmallVector<char, 16> Buffer;
-    F.toString(Buffer);
+    getValue().toString(Buffer);
     OS << Buffer;
   }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
@@ -83,25 +99,62 @@ public:
     return NameStr;
   }
 
-  unsigned bitWidth() const { return F.semanticsSizeInBits(F.getSemantics()); }
+  unsigned bitWidth() const {
+    return llvm::APFloatBase::semanticsSizeInBits(getSemantics());
+  }
+  unsigned numWords() const { return llvm::APInt::getNumWords(bitWidth()); }
+  bool singleWord() const {
+#if ALLOCATE_ALL
+    return false;
+#endif
+    return numWords() == 1;
+  }
+  static bool singleWord(const llvm::fltSemantics &Sem) {
+#if ALLOCATE_ALL
+    return false;
+#endif
+    return APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem)) == 1;
+  }
+  const llvm::fltSemantics &getSemantics() const {
+    return llvm::APFloatBase::EnumToSemantics(Semantics);
+  }
+
+  void copy(const APFloat &F) {
+    if (singleWord()) {
+      Val = F.bitcastToAPInt().getZExtValue();
+    } else {
+      assert(Memory);
+      std::memcpy(Memory, F.bitcastToAPInt().getRawData(),
+                  numWords() * sizeof(uint64_t));
+    }
+  }
+
+  void take(uint64_t *NewMemory) {
+    if (singleWord())
+      return;
+
+    if (Memory)
+      std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
+    Memory = NewMemory;
+  }
 
   bool isSigned() const { return true; }
-  bool isNegative() const { return F.isNegative(); }
-  bool isZero() const { return F.isZero(); }
-  bool isNonZero() const { return F.isNonZero(); }
-  bool isMin() const { return F.isSmallest(); }
-  bool isMinusOne() const { return F.isExactlyValue(-1.0); }
-  bool isNan() const { return F.isNaN(); }
-  bool isSignaling() const { return F.isSignaling(); }
-  bool isInf() const { return F.isInfinity(); }
-  bool isFinite() const { return F.isFinite(); }
-  bool isNormal() const { return F.isNormal(); }
-  bool isDenormal() const { return F.isDenormal(); }
-  llvm::FPClassTest classify() const { return F.classify(); }
-  APFloat::fltCategory getCategory() const { return F.getCategory(); }
+  bool isNegative() const { return getValue().isNegative(); }
+  bool isZero() const { return getValue().isZero(); }
+  bool isNonZero() const { return getValue().isNonZero(); }
+  bool isMin() const { return getValue().isSmallest(); }
+  bool isMinusOne() const { return getValue().isExactlyValue(-1.0); }
+  bool isNan() const { return getValue().isNaN(); }
+  bool isSignaling() const { return getValue().isSignaling(); }
+  bool isInf() const { return getValue().isInfinity(); }
+  bool isFinite() const { return getValue().isFinite(); }
+  bool isNormal() const { return getValue().isNormal(); }
+  bool isDenormal() const { return getValue().isDenormal(); }
+  llvm::FPClassTest classify() const { return getValue().classify(); }
+  APFloat::fltCategory getCategory() const { return getValue().getCategory(); }
 
   ComparisonCategoryResult compare(const Floating &RHS) const {
-    llvm::APFloatBase::cmpResult CmpRes = F.compare(RHS.F);
+    llvm::APFloatBase::cmpResult CmpRes = getValue().compare(RHS.getValue());
     switch (CmpRes) {
     case llvm::APFloatBase::cmpLessThan:
       return ComparisonCategoryResult::Less;
@@ -118,97 +171,130 @@ public:
   static APFloat::opStatus fromIntegral(APSInt Val,
                                         const llvm::fltSemantics &Sem,
                                         llvm::RoundingMode RM,
-                                        Floating &Result) {
+                                        Floating *Result) {
     APFloat F = APFloat(Sem);
     APFloat::opStatus Status = F.convertFromAPInt(Val, Val.isSigned(), RM);
-    Result = Floating(F);
+    Result->copy(F);
     return Status;
   }
 
-  static Floating bitcastFromMemory(const std::byte *Buff,
-                                    const llvm::fltSemantics &Sem) {
+  static void bitcastFromMemory(const std::byte *Buff,
+                                const llvm::fltSemantics &Sem,
+                                Floating *Result) {
     size_t Size = APFloat::semanticsSizeInBits(Sem);
     llvm::APInt API(Size, true);
     llvm::LoadIntFromMemory(API, (const uint8_t *)Buff, Size / 8);
-
-    return Floating(APFloat(Sem, API));
+    Result->copy(APFloat(Sem, API));
   }
 
   void bitcastToMemory(std::byte *Buff) const {
-    llvm::APInt API = F.bitcastToAPInt();
+    llvm::APInt API = getValue().bitcastToAPInt();
     llvm::StoreIntToMemory(API, (uint8_t *)Buff, bitWidth() / 8);
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    return sizeof(llvm::fltSemantics *) +
-           (APFloat::semanticsSizeInBits(F.getSemantics()) / 8);
+    return sizeof(Semantics) + (numWords() * sizeof(uint64_t));
   }
 
   void serialize(std::byte *Buff) const {
-    // Semantics followed by an APInt.
-    *reinterpret_cast<const llvm::fltSemantics **>(Buff) = &F.getSemantics();
-
-    llvm::APInt API = F.bitcastToAPInt();
-    llvm::StoreIntToMemory(API, (uint8_t *)(Buff + sizeof(void *)),
-                           bitWidth() / 8);
+    std::memcpy(Buff, &Semantics, sizeof(Semantics));
+    if (singleWord()) {
+      std::memcpy(Buff + sizeof(Semantics), &Val, sizeof(uint64_t));
+    } else {
+      std::memcpy(Buff + sizeof(Semantics), Memory,
+                  numWords() * sizeof(uint64_t));
+    }
   }
 
-  static Floating deserialize(const std::byte *Buff) {
-    const llvm::fltSemantics *Sem;
-    std::memcpy((void *)&Sem, Buff, sizeof(void *));
-    return bitcastFromMemory(Buff + sizeof(void *), *Sem);
+  static llvm::APFloatBase::Semantics
+  deserializeSemantics(const std::byte *Buff) {
+    return *reinterpret_cast<const llvm::APFloatBase::Semantics *>(Buff);
   }
 
-  static Floating abs(const Floating &F) {
-    APFloat V = F.F;
-    if (V.isNegative())
-      V.changeSign();
-    return Floating(V);
+  static void deserialize(const std::byte *Buff, Floating *Result) {
+    llvm::APFloatBase::Semantics Semantics;
+    std::memcpy(&Semantics, Buff, sizeof(Semantics));
+
+    unsigned BitWidth = llvm::APFloat::semanticsSizeInBits(
+        llvm::APFloatBase::EnumToSemantics(Semantics));
+    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
+
+    Result->Semantics = Semantics;
+    if (NumWords == 1 && !ALLOCATE_ALL) {
+      std::memcpy(&Result->Val, Buff + sizeof(Semantics), sizeof(uint64_t));
+    } else {
+      assert(Result->Memory);
+      std::memcpy(Result->Memory, Buff + sizeof(Semantics),
+                  NumWords * sizeof(uint64_t));
+    }
   }
 
   // -------
 
   static APFloat::opStatus add(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.add(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.add(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus increment(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.F.getSemantics(), 1);
-    *R = Floating(A.F);
-    return R->F.add(One, RM);
+    APFloat One(A.getSemantics(), 1);
+    APFloat LHS = A.getValue();
+
+    auto Status = LHS.add(One, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus sub(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.subtract(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.subtract(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus decrement(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.F.getSemantics(), 1);
-    *R = Floating(A.F);
-    return R->F.subtract(One, RM);
+    APFloat One(A.getSemantics(), 1);
+    APFloat LHS = A.getValue();
+
+    auto Status = LHS.subtract(One, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus mul(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.multiply(B.F, RM);
+
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.multiply(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus div(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.divide(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.divide(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static bool neg(const Floating &A, Floating *R) {
-    *R = -A;
+    R->copy(-A.getValue());
     return false;
   }
 };
diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h
index 13fdb5369f2b..af5cd2d13ecc 100644
--- a/clang/lib/AST/ByteCode/Integral.h
+++ b/clang/lib/AST/ByteCode/Integral.h
@@ -99,6 +99,9 @@ public:
   bool operator>=(Integral RHS) const { return V >= RHS.V; }
   bool operator==(Integral RHS) const { return V == RHS.V; }
   bool operator!=(Integral RHS) const { return V != RHS.V; }
+  bool operator>=(unsigned RHS) const {
+    return static_cast<unsigned>(V) >= RHS;
+  }
 
   bool operator>(unsigned RHS) const {
     return V >= 0 && static_cast<unsigned>(V) > RHS;
diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 8ee08dfb5cfe..61cbd14ad174 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -28,12 +28,19 @@ namespace interp {
 
 using APInt = llvm::APInt;
 using APSInt = llvm::APSInt;
-template <unsigned Bits, bool Signed> class Integral;
 
+/// If an IntegralAP is constructed from Memory, it DOES NOT OWN THAT MEMORY.
+/// It will NOT copy the memory (unless, of course, copy() is called) and it
+/// won't alllocate anything. The allocation should happen via InterpState or
+/// Program.
 template <bool Signed> class IntegralAP final {
-private:
+public:
+  union {
+    uint64_t *Memory = nullptr;
+    uint64_t Val;
+  };
+  uint32_t BitWidth = 0;
   friend IntegralAP<!Signed>;
-  APInt V;
 
   template <typename T, bool InputSigned>
   static T truncateCast(const APInt &V) {
@@ -52,106 +59,133 @@ private:
                                : V.trunc(BitSize).getZExtValue();
   }
 
+  APInt getValue() const {
+    if (singleWord())
+      return APInt(BitWidth, Val, Signed);
+    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
+    return llvm::APInt(BitWidth, NumWords, Memory);
+  }
+
 public:
   using AsUnsigned = IntegralAP<false>;
 
-  template <typename T>
-  IntegralAP(T Value, unsigned BitWidth)
-      : V(APInt(BitWidth, static_cast<uint64_t>(Value), Signed)) {}
+  void take(uint64_t *NewMemory) {
+    assert(!singleWord());
+    std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
+    Memory = NewMemory;
+  }
 
-  IntegralAP(APInt V) : V(V) {}
-  /// Arbitrary value for uninitialized variables.
-  IntegralAP() : IntegralAP(Signed ? -1 : 7, 3) {}
+  void copy(const APInt &V) {
+    assert(BitWidth == V.getBitWidth());
+    assert(numWords() == V.getNumWords());
 
-  IntegralAP operator-() const { return IntegralAP(-V); }
+    if (V.isSingleWord()) {
+      if constexpr (Signed)
+        Val = V.getSExtValue();
+      else
+        Val = V.getZExtValue();
+      return;
+    }
+    assert(Memory);
+    std::memcpy(Memory, V.getRawData(), V.getNumWords() * sizeof(uint64_t));
+  }
+
+  // Constructors.
+  IntegralAP() = default;
+  IntegralAP(unsigned BitWidth) : BitWidth(BitWidth) {}
+  IntegralAP(uint64_t *Memory, unsigned BitWidth)
+      : Memory(Memory), BitWidth(BitWidth) {}
+  IntegralAP(const APInt &V) : BitWidth(V.getBitWidth()) {
+    if (V.isSingleWord()) {
+      Val = Signed ? V.getSExtValue() : V.getZExtValue();
+    } else {
+      Memory = const_cast<uint64_t *>(V.getRawData());
+    }
+  }
+
+  IntegralAP operator-() const { return IntegralAP(-getValue()); }
   IntegralAP operator-(const IntegralAP &Other) const {
-    return IntegralAP(V - Other.V);
+    return IntegralAP(getValue() - Other.getValue());
   }
   bool operator>(const IntegralAP &RHS) const {
     if constexpr (Signed)
-      return V.ugt(RHS.V);
-    return V.sgt(RHS.V);
+      return getValue().sgt(RHS.getValue());
+    return getValue().ugt(RHS.getValue());
   }
-  bool operator>=(IntegralAP RHS) const {
+  bool operator>=(unsigned RHS) const {
     if constexpr (Signed)
-      return V.uge(RHS.V);
-    return V.sge(RHS.V);
+      return getValue().sge(RHS);
+    return getValue().uge(RHS);
   }
   bool operator<(IntegralAP RHS) const {
     if constexpr (Signed)
-      return V.slt(RHS.V);
-    return V.slt(RHS.V);
-  }
-  bool operator<=(IntegralAP RHS) const {
-    if constexpr (Signed)
-      return V.ult(RHS.V);
-    return V.ult(RHS.V);
+      return getValue().slt(RHS.getValue());
+    return getValue().ult(RHS.getValue());
   }
 
   template <typename Ty, typename = std::enable_if_t<std::is_integral_v<Ty>>>
   explicit operator Ty() const {
-    return truncateCast<Ty, Signed>(V);
+    return truncateCast<Ty, Signed>(getValue());
   }
 
   template <typename T> static IntegralAP from(T Value, unsigned NumBits = 0) {
+    if (NumBits == 0)
+      NumBits = sizeof(T) * 8;
     assert(NumBits > 0);
     APInt Copy = APInt(NumBits, static_cast<uint64_t>(Value), Signed);
-
+    assert(false);
     return IntegralAP<Signed>(Copy);
   }
 
+  static IntegralAP from(const APInt &Value) {
+    return IntegralAP<Signed>(Value);
+  }
+
   template <bool InputSigned>
   static IntegralAP from(IntegralAP<InputSigned> V, unsigned NumBits = 0) {
     if (NumBits == 0)
       NumBits = V.bitWidth();
 
     if constexpr (InputSigned)
-      return IntegralAP<Signed>(V.V.sextOrTrunc(NumBits));
-    return IntegralAP<Signed>(V.V.zextOrTrunc(NumBits));
+      return IntegralAP<Signed>(V.getValue().sextOrTrunc(NumBits));
+    return IntegralAP<Signed>(V.getValue().zextOrTrunc(NumBits));
   }
 
-  template <unsigned Bits, bool InputSigned>
-  static IntegralAP from(Integral<Bits, InputSigned> I, unsigned BitWidth) {
-    return IntegralAP<Signed>(I.toAPInt(BitWidth));
-  }
-
-  static IntegralAP zero(int32_t BitWidth) {
-    APInt V = APInt(BitWidth, 0LL, Signed);
-    return IntegralAP(V);
-  }
-
-  constexpr unsigned bitWidth() const { return V.getBitWidth(); }
+  constexpr unsigned bitWidth() const { return BitWidth; }
+  constexpr unsigned numWords() const { return APInt::getNumWords(BitWidth); }
+  constexpr bool singleWord() const { return numWords() == 1; }
 
   APSInt toAPSInt(unsigned Bits = 0) const {
     if (Bits == 0)
       Bits = bitWidth();
 
+    APInt V = getValue();
     if constexpr (Signed)
-      return APSInt(V.sext(Bits), !Signed);
+      return APSInt(getValue().sext(Bits), !Signed);
     else
-      return APSInt(V.zext(Bits), !Signed);
+      return APSInt(getValue().zext(Bits), !Signed);
   }
   APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
-  bool isZero() const { return V.isZero(); }
+  bool isZero() const { return getValue().isZero(); }
   bool isPositive() const {
     if constexpr (Signed)
-      return V.isNonNegative();
+      return getValue().isNonNegative();
     return true;
   }
   bool isNegative() const {
     if constexpr (Signed)
-      return !V.isNonNegative();
+      return !getValue().isNonNegative();
     return false;
   }
-  bool isMin() const { return V.isMinValue(); }
-  bool isMax() const { return V.isMaxValue(); }
+  bool isMin() const { return getValue().isMinValue(); }
+  bool isMax() const { return getValue().isMaxValue(); }
   static constexpr bool isSigned() { return Signed; }
-  bool isMinusOne() const { return Signed && V == -1; }
+  bool isMinusOne() const { return Signed && getValue().isAllOnes(); }
 
-  unsigned countLeadingZeros() const { return V.countl_zero(); }
+  unsigned countLeadingZeros() const { return getValue().countl_zero(); }
 
-  void print(llvm::raw_ostream &OS) const { V.print(OS, Signed);}
+  void print(llvm::raw_ostream &OS) const { getValue().print(OS, Signed); }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
     std::string NameStr;
     llvm::raw_string_ostream OS(NameStr);
@@ -161,53 +195,57 @@ public:
 
   IntegralAP truncate(unsigned BitWidth) const {
     if constexpr (Signed)
-      return IntegralAP(V.trunc(BitWidth).sextOrTrunc(this->bitWidth()));
+      return IntegralAP(
+          getValue().trunc(BitWidth).sextOrTrunc(this->bitWidth()));
     else
-      return IntegralAP(V.trunc(BitWidth).zextOrTrunc(this->bitWidth()));
+      return IntegralAP(
+          getValue().trunc(BitWidth).zextOrTrunc(this->bitWidth()));
   }
 
   IntegralAP<false> toUnsigned() const {
-    APInt Copy = V;
-    return IntegralAP<false>(Copy);
+    return IntegralAP<false>(Memory, BitWidth);
   }
 
   void bitcastToMemory(std::byte *Dest) const {
-    llvm::StoreIntToMemory(V, (uint8_t *)Dest, bitWidth() / 8);
+    llvm::StoreIntToMemory(getValue(), (uint8_t *)Dest, bitWidth() / 8);
   }
 
-  static IntegralAP bitcastFromMemory(const std::byte *Src, unsigned BitWidth) {
+  static void bitcastFromMemory(const std::byte *Src, unsigned BitWidth,
+                                IntegralAP *Result) {
     APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
     llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
-    return IntegralAP(V);
+    Result->copy(V);
   }
 
   ComparisonCategoryResult compare(const IntegralAP &RHS) const {
     assert(Signed == RHS.isSigned());
     assert(bitWidth() == RHS.bitWidth());
+    APInt V1 = getValue();
+    APInt V2 = RHS.getValue();
     if constexpr (Signed) {
-      if (V.slt(RHS.V))
+      if (V1.slt(V2))
         return ComparisonCategoryResult::Less;
-      if (V.sgt(RHS.V))
+      if (V1.sgt(V2))
         return ComparisonCategoryResult::Greater;
       return ComparisonCategoryResult::Equal;
     }
 
     assert(!Signed);
-    if (V.ult(RHS.V))
+    if (V1.ult(V2))
       return ComparisonCategoryResult::Less;
-    if (V.ugt(RHS.V))
+    if (V1.ugt(V2))
       return ComparisonCategoryResult::Greater;
     return ComparisonCategoryResult::Equal;
   }
 
   static bool increment(IntegralAP A, IntegralAP *R) {
-    IntegralAP<Signed> One(1, A.bitWidth());
-    return add(A, One, A.bitWidth() + 1, R);
+    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
+    return add(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
   }
 
   static bool decrement(IntegralAP A, IntegralAP *R) {
-    IntegralAP<Signed> One(1, A.bitWidth());
-    return sub(A, One, A.bitWidth() + 1, R);
+    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
+    return sub(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
   }
 
   static bool add(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
@@ -224,87 +262,97 @@ public:
 
   static bool rem(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      *R = IntegralAP(A.V.srem(B.V));
+      R->copy(A.getValue().srem(B.getValue()));
     else
-      *R = IntegralAP(A.V.urem(B.V));
+      R->copy(A.getValue().urem(B.getValue()));
     return false;
   }
 
   static bool div(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      *R = IntegralAP(A.V.sdiv(B.V));
+      R->copy(A.getValue().sdiv(B.getValue()));
     else
-      *R = IntegralAP(A.V.udiv(B.V));
+      R->copy(A.getValue().udiv(B.getValue()));
     return false;
   }
 
   static bool bitAnd(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    *R = IntegralAP(A.V & B.V);
+    R->copy(A.getValue() & B.getValue());
     return false;
   }
 
   static bool bitOr(IntegralAP A, IntegralAP B, unsigned OpBits,
                     IntegralAP *R) {
-    *R = IntegralAP(A.V | B.V);
+    R->copy(A.getValue() | B.getValue());
     return false;
   }
 
   static bool bitXor(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    *R = IntegralAP(A.V ^ B.V);
+    R->copy(A.getValue() ^ B.getValue());
     return false;
   }
 
   static bool neg(const IntegralAP &A, IntegralAP *R) {
-    APInt AI = A.V;
+    APInt AI = A.getValue();
     AI.negate();
-    *R = IntegralAP(AI);
+    R->copy(AI);
     return false;
   }
 
   static bool comp(IntegralAP A, IntegralAP *R) {
-    *R = IntegralAP(~A.V);
+    R->copy(~A.getValue());
     return false;
   }
 
   static void shiftLeft(const IntegralAP A, const IntegralAP B, unsigned OpBits,
                         IntegralAP *R) {
-    *R = IntegralAP(A.V.shl(B.V.getZExtValue()));
+    *R = IntegralAP(A.getValue().shl(B.getValue().getZExtValue()));
   }
 
   static void shiftRight(const IntegralAP A, const IntegralAP B,
                          unsigned OpBits, IntegralAP *R) {
-    unsigned ShiftAmount = B.V.getZExtValue();
+    unsigned ShiftAmount = B.getValue().getZExtValue();
     if constexpr (Signed)
-      *R = IntegralAP(A.V.ashr(ShiftAmount));
+      R->copy(A.getValue().ashr(ShiftAmount));
     else
-      *R = IntegralAP(A.V.lshr(ShiftAmount));
+      R->copy(A.getValue().lshr(ShiftAmount));
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    // 4 bytes for the BitWidth followed by N bytes for the actual APInt.
-    return sizeof(uint32_t) + (V.getBitWidth() / CHAR_BIT);
+    assert(BitWidth != 0);
+    uint64_t NumWords = llvm::APInt::getNumWords(bitWidth());
+    return sizeof(uint32_t) + (NumWords * sizeof(uint64_t));
   }
 
   void serialize(std::byte *Buff) const {
-    assert(V.getBitWidth() < std::numeric_limits<uint8_t>::max());
-    uint32_t BitWidth = V.getBitWidth();
-
     std::memcpy(Buff, &BitWidth, sizeof(uint32_t));
-    llvm::StoreIntToMemory(V, (uint8_t *)(Buff + sizeof(uint32_t)),
-                           BitWidth / CHAR_BIT);
+    if (singleWord())
+      std::memcpy(Buff + sizeof(uint32_t), &Val, sizeof(uint64_t));
+    else {
+      uint64_t NumWords = llvm::APInt::getNumWords(bitWidth());
+      std::memcpy(Buff + sizeof(uint32_t), Memory, NumWords * sizeof(uint64_t));
+    }
   }
 
-  static IntegralAP<Signed> deserialize(const std::byte *Buff) {
-    uint32_t BitWidth;
-    std::memcpy(&BitWidth, Buff, sizeof(uint32_t));
-    IntegralAP<Signed> Val(APInt(BitWidth, 0ull, !Signed));
+  static uint64_t deserializeSize(const std::byte *Buff) {
+    return *reinterpret_cast<const uint32_t *>(Buff);
+  }
 
-    llvm::LoadIntFromMemory(Val.V, (const uint8_t *)Buff + sizeof(uint32_t),
-                            BitWidth / CHAR_BIT);
-    return Val;
+  static void deserialize(const std::byte *Buff, IntegralAP<Signed> *Result) {
+    uint64_t BitWidth = Result->BitWidth;
+    uint64_t NumWords = llvm::APInt::getNumWords(BitWidth);
+    assert(BitWidth == Result->BitWidth);
+
+    if (NumWords == 1)
+      std::memcpy(&Result->Val, Buff + sizeof(uint32_t), sizeof(uint64_t));
+    else {
+      assert(Result->Memory);
+      std::memcpy(Result->Memory, Buff + sizeof(uint32_t),
+                  NumWords * sizeof(uint64_t));
+    }
   }
 
 private:
@@ -312,7 +360,7 @@ private:
   static bool CheckAddSubMulUB(const IntegralAP &A, const IntegralAP &B,
                                unsigned BitWidth, IntegralAP *R) {
     if constexpr (!Signed) {
-      R->V = Op<APInt>{}(A.V, B.V);
+      R->copy(Op<APInt>{}(A.getValue(), B.getValue()));
       return false;
     }
 
@@ -320,7 +368,7 @@ private:
     const APSInt &RHS = B.toAPSInt();
     APSInt Value = Op<APSInt>{}(LHS.extend(BitWidth), RHS.extend(BitWidth));
     APSInt Result = Value.trunc(LHS.getBitWidth());
-    R->V = Result;
+    R->copy(Result);
 
     return Result.extend(BitWidth) != Value;
   }
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 5c8abffb3a99..1e2032feabb6 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1935,8 +1935,10 @@ bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  S.Stk.push<IntegralAP<false>>(
-      IntegralAP<false>::from(Ptr.getIntegerRepresentation(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
+
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
@@ -1946,8 +1948,10 @@ bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  S.Stk.push<IntegralAP<true>>(
-      IntegralAP<true>::from(Ptr.getIntegerRepresentation(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
+
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2053,6 +2057,100 @@ bool arePotentiallyOverlappingStringLiterals(const Pointer &LHS,
   return Shorter == Longer.take_front(Shorter.size());
 }
 
+static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr,
+                                PrimType T) {
+
+  if (T == PT_IntAPS) {
+    auto &Val = Ptr.deref<IntegralAP<true>>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  } else if (T == PT_IntAP) {
+    auto &Val = Ptr.deref<IntegralAP<false>>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  } else if (T == PT_Float) {
+    auto &Val = Ptr.deref<Floating>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  }
+}
+
+template <typename T>
+static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr) {
+  assert(needsAlloc<T>());
+  auto &Val = Ptr.deref<T>();
+  if (!Val.singleWord()) {
+    uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+    Val.take(NewMemory);
+  }
+}
+
+static void finishGlobalRecurse(InterpState &S, const Pointer &Ptr) {
+  if (const Record *R = Ptr.getRecord()) {
+    for (const Record::Field &Fi : R->fields()) {
+      if (Fi.Desc->isPrimitive()) {
+        TYPE_SWITCH_ALLOC(Fi.Desc->getPrimType(), {
+          copyPrimitiveMemory<T>(S, Ptr.atField(Fi.Offset));
+        });
+        copyPrimitiveMemory(S, Ptr.atField(Fi.Offset), Fi.Desc->getPrimType());
+      } else
+        finishGlobalRecurse(S, Ptr.atField(Fi.Offset));
+    }
+    return;
+  }
+
+  if (const Descriptor *D = Ptr.getFieldDesc(); D && D->isArray()) {
+    unsigned NumElems = D->getNumElems();
+    if (NumElems == 0)
+      return;
+
+    if (D->isPrimitiveArray()) {
+      PrimType PT = D->getPrimType();
+      if (!needsAlloc(PT))
+        return;
+      assert(NumElems >= 1);
+      const Pointer EP = Ptr.atIndex(0);
+      bool AllSingleWord = true;
+      TYPE_SWITCH_ALLOC(PT, {
+        if (!EP.deref<T>().singleWord()) {
+          copyPrimitiveMemory<T>(S, EP);
+          AllSingleWord = false;
+        }
+      });
+      if (AllSingleWord)
+        return;
+      for (unsigned I = 1; I != D->getNumElems(); ++I) {
+        const Pointer EP = Ptr.atIndex(I);
+        copyPrimitiveMemory(S, EP, PT);
+      }
+    } else {
+      assert(D->isCompositeArray());
+      for (unsigned I = 0; I != D->getNumElems(); ++I) {
+        const Pointer EP = Ptr.atIndex(I).narrow();
+        finishGlobalRecurse(S, EP);
+      }
+    }
+  }
+}
+
+bool FinishInitGlobal(InterpState &S, CodePtr OpPC) {
+  const Pointer &Ptr = S.Stk.pop<Pointer>();
+
+  finishGlobalRecurse(S, Ptr);
+  if (Ptr.canBeInitialized()) {
+    Ptr.initialize();
+    Ptr.activate();
+  }
+
+  return true;
+}
+
 // https://github.com/llvm/llvm-project/issues/102513
 #if defined(_MSC_VER) && !defined(__clang__) && !defined(NDEBUG)
 #pragma optimize("", off)
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index ae3d4a441a79..66d3e6d79e8b 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -189,7 +189,7 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
 
   // C++11 [expr.shift]p1: Shift width must be less than the bit width of
   // the shifted type.
-  if (Bits > 1 && RHS >= RT::from(Bits, RHS.bitWidth())) {
+  if (Bits > 1 && RHS >= Bits) {
     const Expr *E = S.Current->getExpr(OpPC);
     const APSInt Val = RHS.toAPSInt();
     QualType Ty = E->getType();
@@ -370,6 +370,9 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS,
                      const T &RHS) {
   // Fast path - add the numbers with fixed width.
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!OpFW(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -408,6 +411,7 @@ bool Add(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
+
   return AddSubMulHelper<T, T::add, std::plus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -423,7 +427,7 @@ inline bool Addf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::add(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -434,6 +438,7 @@ bool Sub(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
+
   return AddSubMulHelper<T, T::sub, std::minus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -442,7 +447,7 @@ inline bool Subf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::sub(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -453,6 +458,7 @@ bool Mul(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() * 2;
+
   return AddSubMulHelper<T, T::mul, std::multiplies>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -461,8 +467,10 @@ inline bool Mulf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
+
   auto Status = Floating::mul(LHS, RHS, getRoundingMode(FPO), &Result);
+
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -484,9 +492,14 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexMul(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    Floating RA = S.allocFloat(A.getSemantics());
+    RA.copy(ResR);
+    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
     Result.atIndex(0).initialize();
-    Result.atIndex(1).deref<Floating>() = Floating(ResI);
+
+    Floating RI = S.allocFloat(A.getSemantics());
+    RI.copy(ResI);
+    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
     Result.atIndex(1).initialize();
     Result.initialize();
   } else {
@@ -539,10 +552,20 @@ inline bool Divc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexDiv(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    // Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    // Result.atIndex(0).initialize();
+    // Result.atIndex(1).deref<Floating>() = Floating(ResI);
+    // Result.atIndex(1).initialize();
+
+    Floating RA = S.allocFloat(A.getSemantics());
+    RA.copy(ResR);
+    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
     Result.atIndex(0).initialize();
-    Result.atIndex(1).deref<Floating>() = Floating(ResI);
-    Result.atIndex(1).initialize();
+
+    Floating RI = S.allocFloat(A.getSemantics());
+    RI.copy(ResI);
+    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
+
     Result.initialize();
   } else {
     // Integer element type.
@@ -608,9 +631,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitAnd(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitAnd(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -625,9 +651,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitOr(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitOr(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -644,7 +673,11 @@ bool BitXor(InterpState &S, CodePtr OpPC) {
   const T &LHS = S.Stk.pop<T>();
 
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitXor(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -659,12 +692,15 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Rem(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
-  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!T::rem(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -679,12 +715,15 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Div(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
-  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!T::div(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -707,8 +746,10 @@ inline bool Divf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
     return false;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::div(LHS, RHS, getRoundingMode(FPO), &Result);
+
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -730,31 +771,44 @@ inline bool Inv(InterpState &S, CodePtr OpPC) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Neg(InterpState &S, CodePtr OpPC) {
   const T &Value = S.Stk.pop<T>();
-  T Result;
 
-  if (!T::neg(Value, &Result)) {
+  if constexpr (std::is_same_v<T, Floating>) {
+    T Result = S.allocFloat(Value.getSemantics());
+
+    if (!T::neg(Value, &Result)) {
+      S.Stk.push<T>(Result);
+      return true;
+    }
+    return false;
+  } else {
+    T Result;
+    if constexpr (needsAlloc<T>())
+      Result = S.allocAP<T>(Value.bitWidth());
+
+    if (!T::neg(Value, &Result)) {
+      S.Stk.push<T>(Result);
+      return true;
+    }
+
+    assert(isIntegralType(Name) &&
+           "don't expect other types to fail at constexpr negation");
     S.Stk.push<T>(Result);
-    return true;
+
+    APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
+    if (S.checkingForUndefinedBehavior()) {
+      const Expr *E = S.Current->getExpr(OpPC);
+      QualType Type = E->getType();
+      SmallString<32> Trunc;
+      NegatedValue.trunc(Result.bitWidth())
+          .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
+                    /*UpperCase=*/true, /*InsertSeparators=*/true);
+      S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
+          << Trunc << Type << E->getSourceRange();
+      return true;
+    }
+
+    return handleOverflow(S, OpPC, NegatedValue);
   }
-
-  assert(isIntegralType(Name) &&
-         "don't expect other types to fail at constexpr negation");
-  S.Stk.push<T>(Result);
-
-  APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
-  if (S.checkingForUndefinedBehavior()) {
-    const Expr *E = S.Current->getExpr(OpPC);
-    QualType Type = E->getType();
-    SmallString<32> Trunc;
-    NegatedValue.trunc(Result.bitWidth())
-        .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
-                  /*UpperCase=*/true, /*InsertSeparators=*/true);
-    S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
-        << Trunc << Type << E->getSourceRange();
-    return true;
-  }
-
-  return handleOverflow(S, OpPC, NegatedValue);
 }
 
 enum class PushVal : bool {
@@ -783,6 +837,8 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 
   const T &Value = Ptr.deref<T>();
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Value.bitWidth());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<T>(Value);
@@ -890,7 +946,6 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
   if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
     return false;
-
   return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow);
 }
 
@@ -898,7 +953,7 @@ template <IncDecOp Op, PushVal DoPush>
 bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                        uint32_t FPOI) {
   Floating Value = Ptr.deref<Floating>();
-  Floating Result;
+  Floating Result = S.allocFloat(Value.getSemantics());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<Floating>(Value);
@@ -952,12 +1007,15 @@ inline bool DecfPop(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Comp(InterpState &S, CodePtr OpPC) {
   const T &Val = S.Stk.pop<T>();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Val.bitWidth());
+
   if (!T::comp(Val, &Result)) {
     S.Stk.push<T>(Result);
     return true;
   }
-
   return false;
 }
 
@@ -1325,10 +1383,23 @@ bool Flip(InterpState &S, CodePtr OpPC) {
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Const(InterpState &S, CodePtr OpPC, const T &Arg) {
+  if constexpr (needsAlloc<T>()) {
+    T Result = S.allocAP<T>(Arg.bitWidth());
+    Result.copy(Arg.toAPSInt());
+    S.Stk.push<T>(Result);
+    return true;
+  }
   S.Stk.push<T>(Arg);
   return true;
 }
 
+inline bool ConstFloat(InterpState &S, CodePtr OpPC, const Floating &F) {
+  Floating Result = S.allocFloat(F.getSemantics());
+  Result.copy(F.getAPFloat());
+  S.Stk.push<Floating>(Result);
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Get/Set Local/Param/Global/This
 //===----------------------------------------------------------------------===//
@@ -1483,7 +1554,24 @@ bool SetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool InitGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
   const Pointer &P = S.P.getGlobal(I);
+
   P.deref<T>() = S.Stk.pop<T>();
+
+  if constexpr (std::is_same_v<T, Floating>) {
+    auto &Val = P.deref<Floating>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+
+  } else if constexpr (needsAlloc<T>()) {
+    auto &Val = P.deref<T>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  }
+
   P.initialize();
   return true;
 }
@@ -1585,7 +1673,22 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) {
   assert(F->isBitField());
   const T &Value = S.Stk.pop<T>();
   const Pointer &Field = S.Stk.peek<Pointer>().atField(F->Offset);
-  Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
+
+  if constexpr (needsAlloc<T>()) {
+    T Result = S.allocAP<T>(Value.bitWidth());
+    if (T::isSigned())
+      Result.copy(Value.toAPSInt()
+                      .trunc(F->Decl->getBitWidthValue())
+                      .sextOrTrunc(Value.bitWidth()));
+    else
+      Result.copy(Value.toAPSInt()
+                      .trunc(F->Decl->getBitWidthValue())
+                      .zextOrTrunc(Value.bitWidth()));
+
+    Field.deref<T>() = Result;
+  } else {
+    Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
+  }
   Field.activate();
   Field.initialize();
   return true;
@@ -1765,6 +1868,8 @@ inline bool FinishInit(InterpState &S, CodePtr OpPC) {
   return true;
 }
 
+bool FinishInitGlobal(InterpState &S, CodePtr OpPC);
+
 inline bool Dump(InterpState &S, CodePtr OpPC) {
   S.Stk.dump();
   return true;
@@ -2271,7 +2376,8 @@ template <PrimType TIn, PrimType TOut> bool Cast(InterpState &S, CodePtr OpPC) {
 inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem,
                    llvm::RoundingMode RM) {
   Floating F = S.Stk.pop<Floating>();
-  Floating Result = F.toSemantics(Sem, RM);
+  Floating Result = S.allocFloat(*Sem);
+  F.toSemantics(Sem, RM, &Result);
   S.Stk.push<Floating>(Result);
   return true;
 }
@@ -2295,15 +2401,25 @@ inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) {
 /// to know what bitwidth the result should be.
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<false>>(
-      IntegralAP<false>::from(S.Stk.pop<T>(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  // Copy data.
+  {
+    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
+    Result.copy(Source);
+  }
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<true>>(
-      IntegralAP<true>::from(S.Stk.pop<T>(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  // Copy data.
+  {
+    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
+    Result.copy(Source);
+  }
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2312,11 +2428,11 @@ bool CastIntegralFloating(InterpState &S, CodePtr OpPC,
                           const llvm::fltSemantics *Sem, uint32_t FPOI) {
   const T &From = S.Stk.pop<T>();
   APSInt FromAP = From.toAPSInt();
-  Floating Result;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
+  Floating Result = S.allocFloat(*Sem);
   auto Status =
-      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), Result);
+      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
 
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -2365,7 +2481,12 @@ static inline bool CastFloatingIntegralAP(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
+
+  auto ResultAP = S.allocAP<IntegralAP<false>>(BitWidth);
+  ResultAP.copy(Result);
+
+  S.Stk.push<IntegralAP<false>>(ResultAP);
+
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2381,7 +2502,12 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
+
+  auto ResultAP = S.allocAP<IntegralAP<true>>(BitWidth);
+  ResultAP.copy(Result);
+
+  S.Stk.push<IntegralAP<true>>(ResultAP);
+
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2441,8 +2567,9 @@ static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC,
 static inline bool CastFixedPointFloating(InterpState &S, CodePtr OpPC,
                                           const llvm::fltSemantics *Sem) {
   const auto &Fixed = S.Stk.pop<FixedPoint>();
-
-  S.Stk.push<Floating>(Fixed.toFloat(Sem));
+  Floating Result = S.allocFloat(*Sem);
+  Result.copy(Fixed.toFloat(Sem));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -2506,12 +2633,18 @@ bool Zero(InterpState &S, CodePtr OpPC) {
 }
 
 static inline bool ZeroIntAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<false>>(IntegralAP<false>::zero(BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  if (!Result.singleWord())
+    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
 static inline bool ZeroIntAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>::zero(BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  if (!Result.singleWord())
+    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2578,7 +2711,9 @@ inline bool RVOPtr(InterpState &S, CodePtr OpPC) {
 //===----------------------------------------------------------------------===//
 
 template <class LT, class RT, ShiftDir Dir>
-inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
+inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
+                    LT *Result) {
+
   const unsigned Bits = LHS.bitWidth();
 
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
@@ -2596,7 +2731,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
     RHS = -RHS;
     return DoShift<LT, RT,
                    Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS);
+        S, OpPC, LHS, RHS, Result);
   }
 
   if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
@@ -2644,6 +2779,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
       // Do the shift on potentially signed LT, then convert to unsigned type.
       LT A;
       LT::shiftRight(LHS, LT::from(RHS, Bits), Bits, &A);
+      // LT::shiftRight(LHS, LT(RHSTemp), Bits, &A);
       R = LT::AsUnsigned::from(A);
     }
   }
@@ -2652,6 +2788,48 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
   return true;
 }
 
+/// A version of DoShift that works on IntegralAP.
+template <class LT, class RT, ShiftDir Dir>
+inline bool DoShiftAP(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
+                      LT *Result) {
+  const unsigned Bits = LHS.bitWidth();
+  const APSInt &LHSAP = LHS.toAPSInt();
+  APSInt RHSAP = RHS.toAPSInt();
+
+  // OpenCL 6.3j: shift values are effectively % word size of LHS.
+  if (S.getLangOpts().OpenCL)
+    RHSAP &= APSInt(llvm::APInt(RHSAP.getBitWidth(),
+                                static_cast<uint64_t>(LHSAP.getBitWidth() - 1)),
+                    RHSAP.isUnsigned());
+
+  if (RHS.isNegative()) {
+    // During constant-folding, a negative shift is an opposite shift. Such a
+    // shift is not a constant expression.
+    const SourceInfo &Loc = S.Current->getSource(OpPC);
+    S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
+    if (!S.noteUndefinedBehavior())
+      return false;
+    RHS = -RHS;
+    return DoShiftAP<LT, RT,
+                     Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
+        S, OpPC, LHS, RHS, Result);
+  }
+
+  if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
+    return false;
+
+  if constexpr (Dir == ShiftDir::Left) {
+    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
+    Result->copy(LHSAP << SA);
+  } else {
+    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
+    Result->copy(LHSAP >> SA);
+  }
+
+  S.Stk.push<LT>(*Result);
+  return true;
+}
+
 template <PrimType NameL, PrimType NameR>
 inline bool Shr(InterpState &S, CodePtr OpPC) {
   using LT = typename PrimConv<NameL>::T;
@@ -2659,7 +2837,13 @@ inline bool Shr(InterpState &S, CodePtr OpPC) {
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
 
-  return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS);
+  if constexpr (needsAlloc<LT>()) {
+    LT Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
+  } else {
+    LT Result;
+    return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
+  }
 }
 
 template <PrimType NameL, PrimType NameR>
@@ -2668,8 +2852,13 @@ inline bool Shl(InterpState &S, CodePtr OpPC) {
   using RT = typename PrimConv<NameR>::T;
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
-
-  return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS);
+  if constexpr (needsAlloc<LT>()) {
+    LT Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
+  } else {
+    LT Result;
+    return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
+  }
 }
 
 static inline bool ShiftFixedPoint(InterpState &S, CodePtr OpPC, bool Left) {
@@ -3252,7 +3441,15 @@ inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte,
 
     if constexpr (std::is_same_v<T, Floating>) {
       assert(Sem);
-      S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
+      Floating Result = S.allocFloat(*Sem);
+      Floating::bitcastFromMemory(Buff.data(), *Sem, &Result);
+      S.Stk.push<Floating>(Result);
+
+      // S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
+    } else if constexpr (needsAlloc<T>()) {
+      T Result = S.allocAP<T>(ResultBitWidth);
+      T::bitcastFromMemory(Buff.data(), ResultBitWidth, &Result);
+      S.Stk.push<T>(Result);
     } else {
       assert(!Sem);
       S.Stk.push<T>(T::bitcastFromMemory(Buff.data(), ResultBitWidth));
@@ -3310,7 +3507,11 @@ template <typename T> inline T ReadArg(InterpState &S, CodePtr &OpPC) {
 }
 
 template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
-  Floating F = Floating::deserialize(*OpPC);
+  auto &Semantics =
+      llvm::APFloatBase::EnumToSemantics(Floating::deserializeSemantics(*OpPC));
+
+  auto F = S.allocFloat(Semantics);
+  Floating::deserialize(*OpPC, &F);
   OpPC += align(F.bytesToSerialize());
   return F;
 }
@@ -3318,17 +3519,25 @@ template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
 template <>
 inline IntegralAP<false> ReadArg<IntegralAP<false>>(InterpState &S,
                                                     CodePtr &OpPC) {
-  IntegralAP<false> I = IntegralAP<false>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+  uint32_t BitWidth = IntegralAP<false>::deserializeSize(*OpPC);
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  assert(Result.bitWidth() == BitWidth);
+
+  IntegralAP<false>::deserialize(*OpPC, &Result);
+  OpPC += align(Result.bytesToSerialize());
+  return Result;
 }
 
 template <>
 inline IntegralAP<true> ReadArg<IntegralAP<true>>(InterpState &S,
                                                   CodePtr &OpPC) {
-  IntegralAP<true> I = IntegralAP<true>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+  uint32_t BitWidth = IntegralAP<true>::deserializeSize(*OpPC);
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  assert(Result.bitWidth() == BitWidth);
+
+  IntegralAP<true>::deserialize(*OpPC, &Result);
+  OpPC += align(Result.bytesToSerialize());
+  return Result;
 }
 
 template <>
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d01e3d042a8b..5304bd77f2c0 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -57,6 +57,21 @@ static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) {
   assert(T);
 
   unsigned BitWidth = S.getASTContext().getTypeSize(QT);
+
+  if (T == PT_IntAPS) {
+    auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+    Result.copy(Val);
+    S.Stk.push<IntegralAP<true>>(Result);
+    return;
+  }
+
+  if (T == PT_IntAP) {
+    auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+    Result.copy(Val);
+    S.Stk.push<IntegralAP<false>>(Result);
+    return;
+  }
+
   if (QT->isSignedIntegerOrEnumerationType()) {
     int64_t V = Val.getSExtValue();
     INT_TYPE_SWITCH(*T, { S.Stk.push<T>(T::from(V, BitWidth)); });
@@ -327,13 +342,13 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result;
+  Floating Result = S.allocFloat(TargetSemantics);
   if (S.getASTContext().getTargetInfo().isNan2008()) {
     if (Signaling)
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
   } else {
     // Prior to IEEE 754-2008, architectures were allowed to choose whether
@@ -342,10 +357,10 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
     // 2008 revisions, MIPS interpreted sNaN-2008 as qNan and qNaN-2008 as
     // sNaN. This is now known as "legacy NaN" encoding.
     if (Signaling)
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
   }
 
@@ -360,7 +375,9 @@ static bool interp__builtin_inf(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  S.Stk.push<Floating>(Floating::getInf(TargetSemantics));
+  Floating Result = S.allocFloat(TargetSemantics);
+  Result.copy(APFloat::getInf(TargetSemantics));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -368,10 +385,12 @@ static bool interp__builtin_copysign(InterpState &S, CodePtr OpPC,
                                      const InterpFrame *Frame) {
   const Floating &Arg2 = S.Stk.pop<Floating>();
   const Floating &Arg1 = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(Arg1.getSemantics());
 
   APFloat Copy = Arg1.getAPFloat();
   Copy.copySign(Arg2.getAPFloat());
-  S.Stk.push<Floating>(Floating(Copy));
+  Result.copy(Copy);
+  S.Stk.push<Floating>(Result);
 
   return true;
 }
@@ -380,11 +399,13 @@ static bool interp__builtin_fmin(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    S.Stk.push<Floating>(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    S.Stk.push<Floating>(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -392,11 +413,13 @@ static bool interp__builtin_fmax(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    S.Stk.push<Floating>(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    S.Stk.push<Floating>(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -571,8 +594,16 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC,
 static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame) {
   const Floating &Val = S.Stk.pop<Floating>();
+  APFloat F = Val.getAPFloat();
+  if (!F.isNegative()) {
+    S.Stk.push<Floating>(Val);
+    return true;
+  }
 
-  S.Stk.push<Floating>(Floating::abs(Val));
+  Floating Result = S.allocFloat(Val.getSemantics());
+  F.changeSign();
+  Result.copy(F);
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index 239b3104e89f..2569cac018b3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -402,7 +402,9 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC,
           if (llvm::sys::IsBigEndianHost)
             swapBytes(M.get(), NumBits.roundToBytes());
 
-          P.deref<Floating>() = Floating::bitcastFromMemory(M.get(), Semantics);
+          Floating R = S.allocFloat(Semantics);
+          Floating::bitcastFromMemory(M.get(), Semantics, &R);
+          P.deref<Floating>() = R;
           P.initialize();
           return true;
         }
diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h
index e8dc6f0483d6..08765561985e 100644
--- a/clang/lib/AST/ByteCode/InterpState.h
+++ b/clang/lib/AST/ByteCode/InterpState.h
@@ -15,6 +15,7 @@
 
 #include "Context.h"
 #include "DynamicAllocator.h"
+#include "Floating.h"
 #include "Function.h"
 #include "InterpFrame.h"
 #include "InterpStack.h"
@@ -126,6 +127,33 @@ public:
 
   StdAllocatorCaller getStdAllocatorCaller(StringRef Name) const;
 
+  void *allocate(size_t Size, unsigned Align = 8) const {
+    return Allocator.Allocate(Size, Align);
+  }
+  template <typename T> T *allocate(size_t Num = 1) const {
+    return static_cast<T *>(allocate(Num * sizeof(T), alignof(T)));
+  }
+
+  template <typename T> T allocAP(unsigned BitWidth) {
+    unsigned NumWords = APInt::getNumWords(BitWidth);
+    if (NumWords == 1)
+      return T(BitWidth);
+    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
+    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
+    return T(Mem, BitWidth);
+  }
+
+  Floating allocFloat(const llvm::fltSemantics &Sem) {
+    if (Floating::singleWord(Sem))
+      return Floating(llvm::APFloatBase::SemanticsToEnum(Sem));
+
+    unsigned NumWords =
+        APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem));
+    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
+    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
+    return Floating(Mem, llvm::APFloatBase::SemanticsToEnum(Sem));
+  }
+
 private:
   friend class EvaluationResult;
   friend class InterpStateCCOverride;
@@ -161,6 +189,8 @@ public:
   llvm::SmallVector<
       std::pair<const Expr *, const LifetimeExtendedTemporaryDecl *>>
       SeenGlobalTemporaries;
+
+  mutable llvm::BumpPtrAllocator Allocator;
 };
 
 class InterpStateCCOverride final {
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index c76ac5f8ae86..57e01f7bd9da 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -48,6 +48,7 @@ def ArgUint64 : ArgType { let Name = "uint64_t"; }
 def ArgIntAP : ArgType { let Name = "IntegralAP<false>"; let AsRef = true; }
 def ArgIntAPS : ArgType { let Name = "IntegralAP<true>"; let AsRef = true; }
 def ArgFloat : ArgType { let Name = "Floating"; let AsRef = true; }
+
 def ArgBool : ArgType { let Name = "bool"; }
 def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
 
@@ -88,6 +89,9 @@ def IntegerAndFixedTypeClass : TypeClass {
                Uint32, Sint64, Uint64, IntAP, IntAPS, FixedPoint];
 }
 
+def IntegralTypeClass : TypeClass {
+  let Types = !listconcat(IntegerTypeClass.Types, [Bool]);
+}
 def FixedSizeIntegralTypeClass : TypeClass {
   let Types = [Sint8, Uint8, Sint16, Uint16, Sint32,
                Uint32, Sint64, Uint64, Bool];
@@ -265,12 +269,13 @@ def ConstSint32 : ConstOpcode<Sint32, ArgSint32>;
 def ConstUint32 : ConstOpcode<Uint32, ArgUint32>;
 def ConstSint64 : ConstOpcode<Sint64, ArgSint64>;
 def ConstUint64 : ConstOpcode<Uint64, ArgUint64>;
-def ConstFloat : ConstOpcode<Float, ArgFloat>;
-def constIntAP : ConstOpcode<IntAP, ArgIntAP>;
-def constIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
+def ConstIntAP : ConstOpcode<IntAP, ArgIntAP>;
+def ConstIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
 def ConstBool : ConstOpcode<Bool, ArgBool>;
 def ConstFixedPoint : ConstOpcode<FixedPoint, ArgFixedPoint>;
 
+def ConstFloat : Opcode { let Args = [ArgFloat]; }
+
 // [] -> [Integer]
 def Zero : Opcode {
   let Types = [FixedSizeIntegralTypeClass];
@@ -328,6 +333,7 @@ def GetMemberPtrBasePop : Opcode {
 
 def FinishInitPop : Opcode;
 def FinishInit    : Opcode;
+def FinishInitGlobal : Opcode;
 
 def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool, ArgTypePtr]; }
 
@@ -389,7 +395,7 @@ class AccessOpcode : Opcode {
 }
 
 class BitFieldOpcode : Opcode {
-  let Types = [AluTypeClass];
+  let Types = [IntegralTypeClass];
   let Args = [ArgRecordField];
   let HasGroup = 1;
 }
diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
index 6152fbfbe3a7..a156cccbb3c1 100644
--- a/clang/lib/AST/ByteCode/PrimType.h
+++ b/clang/lib/AST/ByteCode/PrimType.h
@@ -76,6 +76,13 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
 }
 
 constexpr bool isIntegralType(PrimType T) { return T <= PT_FixedPoint; }
+template <typename T> constexpr bool needsAlloc() {
+  return std::is_same_v<T, IntegralAP<false>> ||
+         std::is_same_v<T, IntegralAP<true>> || std::is_same_v<T, Floating>;
+}
+constexpr bool needsAlloc(PrimType T) {
+  return T == PT_IntAP || T == PT_IntAPS || T == PT_Float;
+}
 
 /// Mapping from primitive types to their representation.
 template <PrimType T> struct PrimConv;
@@ -209,6 +216,16 @@ static inline bool aligned(const void *P) {
     }                                                                          \
   } while (0)
 
+#define TYPE_SWITCH_ALLOC(Expr, B)                                             \
+  do {                                                                         \
+    switch (Expr) {                                                            \
+      TYPE_SWITCH_CASE(PT_Float, B)                                            \
+      TYPE_SWITCH_CASE(PT_IntAP, B)                                            \
+      TYPE_SWITCH_CASE(PT_IntAPS, B)                                           \
+    default:;                                                                  \
+    }                                                                          \
+  } while (0)
+
 #define COMPOSITE_TYPE_SWITCH(Expr, B, D)                                      \
   do {                                                                         \
     switch (Expr) {                                                            \
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
index 23ba1bbd193b..5d9c42244749 100644
--- a/clang/lib/AST/ByteCode/Program.h
+++ b/clang/lib/AST/ByteCode/Program.h
@@ -132,6 +132,14 @@ public:
                                bool IsMutable = false, bool IsVolatile = false,
                                const Expr *Init = nullptr);
 
+  void *Allocate(size_t Size, unsigned Align = 8) const {
+    return Allocator.Allocate(Size, Align);
+  }
+  template <typename T> T *Allocate(size_t Num = 1) const {
+    return static_cast<T *>(Allocate(Num * sizeof(T), alignof(T)));
+  }
+  void Deallocate(void *Ptr) const {}
+
   /// Context to manage declaration lifetimes.
   class DeclScope {
   public:
@@ -204,7 +212,7 @@ private:
   };
 
   /// Allocator for globals.
-  PoolAllocTy Allocator;
+  mutable PoolAllocTy Allocator;
 
   /// Global objects.
   std::vector<Global *> Globals;
@@ -238,4 +246,18 @@ public:
 } // namespace interp
 } // namespace clang
 
+inline void *operator new(size_t Bytes, const clang::interp::Program &C,
+                          size_t Alignment = 8) {
+  return C.Allocate(Bytes, Alignment);
+}
+
+inline void operator delete(void *Ptr, const clang::interp::Program &C,
+                            size_t) {
+  C.Deallocate(Ptr);
+}
+inline void *operator new[](size_t Bytes, const clang::interp::Program &C,
+                            size_t Alignment = 8) {
+  return C.Allocate(Bytes, Alignment);
+}
+
 #endif
diff --git a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
index 710612bef8fd..1013a771d13b 100644
--- a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
+++ b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
@@ -21,6 +21,9 @@ template <class To, class From>
 constexpr To bit_cast(const From &from) {
   static_assert(sizeof(To) == sizeof(From));
   return __builtin_bit_cast(To, from);
+#if __x86_64
+  // both-note@-2 {{indeterminate value can only initialize an object of type}}
+#endif
 }
 
 template <class Intermediate, class Init>
@@ -38,11 +41,8 @@ constexpr Init round_trip(const Init &init) {
 
 namespace test_long_double {
 #if __x86_64
-/// FIXME: We could enable this, but since it aborts, it causes the usual mempory leak.
-#if 0
-constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // expected-error{{must be initialized by a constant expression}}\
-                                                                                 // expected-note{{in call}}
-#endif
+constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // both-error{{must be initialized by a constant expression}}\
+                                                                                 // both-note{{in call}}
 constexpr long double ld = 3.1425926539;
 
 struct bytes {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 21dca15a4577..174c1ffa79a4 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -208,7 +208,7 @@ namespace nan {
 
   constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}}
   constexpr float NaN4 = __builtin_nanf("");
-  //constexpr long double NaN5 = __builtin_nanf128("");
+  constexpr long double NaN5 = __builtin_nanf128("");
 
   /// FIXME: This should be accepted by the current interpreter as well.
   constexpr char f[] = {'0', 'x', 'A', 'E', '\0'};
@@ -655,8 +655,6 @@ void test_noexcept(int *i) {
 } // end namespace test_launder
 
 
-/// FIXME: The commented out tests here use a IntAP value and fail.
-/// This currently means we will leak the IntAP value since nothing cleans it up.
 namespace clz {
   char clz1[__builtin_clz(1) == BITSIZE(int) - 1 ? 1 : -1];
   char clz2[__builtin_clz(7) == BITSIZE(int) - 3 ? 1 : -1];
@@ -709,7 +707,7 @@ namespace clz {
   char clz48[__builtin_clzg(1ULL << (BITSIZE(long long) - 1)) == 0 ? 1 : -1];
   char clz49[__builtin_clzg(1ULL << (BITSIZE(long long) - 1), 42) == 0 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  // int clz50 = __builtin_clzg((unsigned __int128)0);
+  int clz50 = __builtin_clzg((unsigned __int128)0);
   char clz51[__builtin_clzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char clz52[__builtin_clzg((unsigned __int128)0x1) == BITSIZE(__int128) - 1 ? 1 : -1];
   char clz53[__builtin_clzg((unsigned __int128)0x1, 42) == BITSIZE(__int128) - 1 ? 1 : -1];
@@ -717,7 +715,7 @@ namespace clz {
   char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  // int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
+  int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
   char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
   char clz61[__builtin_clzg((unsigned _BitInt(128))0x1, 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
@@ -775,7 +773,7 @@ namespace ctz {
   char ctz46[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1)) == BITSIZE(long long) - 1 ? 1 : -1];
   char ctz47[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1), 42) == BITSIZE(long long) - 1 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  // int ctz48 = __builtin_ctzg((unsigned __int128)0);
+  int ctz48 = __builtin_ctzg((unsigned __int128)0);
   char ctz49[__builtin_ctzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char ctz50[__builtin_ctzg((unsigned __int128)0x1) == 0 ? 1 : -1];
   char ctz51[__builtin_ctzg((unsigned __int128)0x1, 42) == 0 ? 1 : -1];
@@ -785,7 +783,7 @@ namespace ctz {
   char ctz55[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1), 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  // int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
+  int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
   char ctz57[__builtin_ctzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char ctz58[__builtin_ctzg((unsigned _BitInt(128))0x1) == 0 ? 1 : -1];
   char ctz59[__builtin_ctzg((unsigned _BitInt(128))0x1, 42) == 0 ? 1 : -1];

From 6265ca686dfe18e6032e59637f144bad7ea6cf2b Mon Sep 17 00:00:00 2001
From: Ties Stuij <ties.stuij@arm.com>
Date: Wed, 18 Jun 2025 13:38:49 +0100
Subject: [PATCH 0800/1322] [AArch64] Add Cortex-A320 scheduling model
 (#144385)

Instead of using the Cortex-A510 scheduling model, Cortex-A320 now uses
its own scheduling model, based off of the Cortex-A320 Software
Optimization Guide:

https://developer.arm.com/documentation/110285/r0p1

---------

Co-authored-by: Nashe Mncube <Nashe.Mncube@arm.com>
---
 llvm/lib/Target/AArch64/AArch64.td            |     1 +
 llvm/lib/Target/AArch64/AArch64Processors.td  |     2 +-
 llvm/lib/Target/AArch64/AArch64SchedA320.td   |  1415 +++
 .../AArch64/Cortex/A320-basic-instructions.s  |  3721 ++++++
 .../AArch64/Cortex/A320-neon-instructions.s   |  3208 +++++
 .../AArch64/Cortex/A320-sve-instructions.s    | 10258 ++++++++++++++++
 6 files changed, 18604 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/Target/AArch64/AArch64SchedA320.td
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/A320-basic-instructions.s
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/A320-neon-instructions.s
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/A320-sve-instructions.s

diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index f303819f411d..eb5a5199b895 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -103,6 +103,7 @@ def MTEUnsupported : AArch64Unsupported {
 let F = [HasPAuth, HasPAuthLR] in
 def PAUnsupported : AArch64Unsupported;
 
+include "AArch64SchedA320.td"
 include "AArch64SchedA53.td"
 include "AArch64SchedA55.td"
 include "AArch64SchedA510.td"
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index c7ea6393e2ad..e1b82953aad8 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -1131,7 +1131,7 @@ def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53,
                      [TuneA35]>;
 def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53,
                      [TuneA35]>;
-def : ProcessorModel<"cortex-a320", CortexA510Model, ProcessorFeatures.A320,
+def : ProcessorModel<"cortex-a320", CortexA320Model, ProcessorFeatures.A320,
                      [TuneA320]>;
 def : ProcessorModel<"cortex-a53", CortexA53Model, ProcessorFeatures.A53,
                      [TuneA53]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA320.td b/llvm/lib/Target/AArch64/AArch64SchedA320.td
new file mode 100644
index 000000000000..89ed13389daf
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedA320.td
@@ -0,0 +1,1415 @@
+//==- AArch64SchedCortexA320.td - ARM Cortex-A320 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-A320 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A320 machine model for scheduling and other instruction cost heuristics.
+def CortexA320Model : SchedMachineModel {
+  let MicroOpBufferSize = 0;  // Cortex-A320 is an in-order processor
+  let IssueWidth = 1;         // Cortex-A320 is a single-issue processor
+  let LoadLatency = 5;
+  let PostRAScheduler = 1;    // Enable PostRA scheduler pass.
+  let CompleteModel = 0;      // Covers instructions applicable to Cortex-A320.
+
+  let FullInstRWOverlapCheck = 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types
+
+let SchedModel = CortexA320Model in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
+// Cortex-A320 is in-order.
+let BufferSize = 0 in {
+  def CortexA320UnitALU    : ProcResource<1>;    // Int ALU
+  def CortexA320UnitMAC    : ProcResource<1>;    // Int MAC, 64-bit wide
+  def CortexA320UnitDiv    : ProcResource<1>;    // Int Division, not pipelined
+  def CortexA320UnitLdSt   : ProcResource<1>;    // Load/Store shared pipe
+  def CortexA320UnitB      : ProcResource<1>;    // Branch
+  def CortexA320UnitPAC    : ProcResource<1>;    // Pointer Authentication (PAC) pipe
+
+  // The FP DIV/SQRT instructions execute totally differently from the FP ALU
+  // instructions; that's why for now we model them with 2 resources.
+  def CortexA320UnitVALU  : ProcResource<1>;    // SIMD/FP/SVE ALU
+  def CortexA320UnitVMAC   : ProcResource<1>;    // SIMD/FP/SVE MAC
+  def CortexA320UnitVMC    : ProcResource<1>;    // SIMD/FP/SVE multicycle instrs  (e.g Div, SQRT, cryptography)
+}
+
+// These latencies are modeled without taking into account forwarding paths
+// (the software optimisation guide lists latencies taking into account
+// typical forwarding paths).
+def : WriteRes<WriteImm,   [CortexA320UnitALU]> { let Latency = 1; }  // MOVN, MOVZ
+def : WriteRes<WriteI,     [CortexA320UnitALU]> { let Latency = 1; }  // ALU
+def : WriteRes<WriteISReg, [CortexA320UnitALU]> { let Latency = 2; }  // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [CortexA320UnitALU]> { let Latency = 2; }  // ALU of Extended-Reg
+def : WriteRes<WriteExtr, [CortexA320UnitALU]> { let Latency = 2; }   // EXTR from a reg pair
+def : WriteRes<WriteIS, [CortexA320UnitALU]> { let Latency = 2; }     // Shift/Scale
+
+// MAC
+def : WriteRes<WriteIM32, [CortexA320UnitMAC]> { let Latency = 3; }   // 32-bit Multiply
+def : WriteRes<WriteIM64, [CortexA320UnitMAC]> { let Latency = 5; let ReleaseAtCycles = [2];}   // 64-bit Multiply
+
+// Div
+def : WriteRes<WriteID32, [CortexA320UnitDiv]> {
+  let Latency = 12; let ReleaseAtCycles = [12];
+}
+def : WriteRes<WriteID64, [CortexA320UnitDiv]> {
+  let Latency = 20; let ReleaseAtCycles = [20];
+}
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to Cortex-A320
+
+//===----------------------------------------------------------------------===//
+class CortexA320Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
+  let Latency = n;
+}
+
+class CortexA320MCWrite<int n, int m, ProcResourceKind res> : SchedWriteRes<[res]> {
+  let Latency = n;
+  let ReleaseAtCycles = [m];
+  let BeginGroup = 1;
+}
+
+class CortexA320MC_RC0Write<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
+  let Latency = n;
+  let BeginGroup = 1;
+}
+
+//===----------------------------------------------------------------------===//
+
+// Define generic 2 micro-op types
+def CortexA320Write_11cyc_1VMAC_1VALU : SchedWriteRes<[CortexA320UnitVALU, CortexA320UnitVMAC]> {
+  let Latency     = 11;
+  let NumMicroOps = 2;
+}
+
+def CortexA320Write_16cyc_1VMAC_1VALU : SchedWriteRes<[CortexA320UnitVALU, CortexA320UnitVMAC]> {
+  let Latency     = 16;
+  let NumMicroOps = 2;
+}
+
+class CortexA320Write_PAC_B <int lat> : SchedWriteRes<[CortexA320UnitPAC, CortexA320UnitB]> {
+  let Latency = lat;
+  let NumMicroOps = 2;
+}
+
+// Load
+def : WriteRes<WriteLD, [CortexA320UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDIdx, [CortexA320UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, [CortexA320UnitLdSt]> { let Latency = 4; }
+
+def CortexA320WriteVLD1 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 3; }
+def CortexA320WriteVLD1SI : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 3; let SingleIssue = 1; }
+
+def CortexA320WriteVLD2 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4;
+                                                  let ReleaseAtCycles = [2]; }
+
+def CortexA320WriteVLD3 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+                                                  let ReleaseAtCycles = [3]; }
+
+def CortexA320WriteVLD4 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 6;
+                                                  let ReleaseAtCycles = [4]; }
+
+def CortexA320WriteVLD6 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+                                                  let ReleaseAtCycles = [3]; }
+
+def CortexA320WriteVLD8 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 6;
+                                                  let ReleaseAtCycles = [4]; }
+
+def CortexA320WriteLDP1 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; }
+def CortexA320WriteLDP2 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; }
+def CortexA320WriteLDP4 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; }
+
+// Pre/Post Indexing - Performed as part of address generation
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+let RetireOOO = 1 in {
+def : WriteRes<WriteST, [CortexA320UnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTP, [CortexA320UnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTIdx, [CortexA320UnitLdSt]> { let Latency = 1; }
+}
+def : WriteRes<WriteSTX, [CortexA320UnitLdSt]> { let Latency = 3; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes<WriteVST, [CortexA320UnitLdSt]> { let Latency = 5;
+                                          let ReleaseAtCycles = [2];}
+def CortexA320WriteVST1 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; }
+def CortexA320WriteVST2 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+                                                  let ReleaseAtCycles = [2]; }
+def CortexA320WriteVST3 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+                                                  let ReleaseAtCycles = [3]; }
+def CortexA320WriteVST4 : SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5;
+                                                  let ReleaseAtCycles = [4]; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [CortexA320UnitB]>;
+def : WriteRes<WriteBrReg, [CortexA320UnitB]>;
+def : WriteRes<WriteSys, [CortexA320UnitB]>;
+def : WriteRes<WriteBarrier, [CortexA320UnitB]>;
+def : WriteRes<WriteHint, [CortexA320UnitB]>;
+
+// FP ALU
+//   As WriteF result is produced in F5 and it can be mostly forwarded
+//   to consumer at F1, the effectively Latency is set as 4.
+def : WriteRes<WriteF, [CortexA320UnitVALU]> { let Latency = 4; }
+def : WriteRes<WriteFCmp, [CortexA320UnitVALU]> { let Latency = 3; }
+def : WriteRes<WriteFCvt, [CortexA320UnitVALU]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [CortexA320UnitVALU]> { let Latency = 3; }
+def : WriteRes<WriteFImm, [CortexA320UnitVALU]> { let Latency = 3; }
+
+class CortexA320VSt<int n> : SchedWriteRes<[CortexA320UnitLdSt]> {
+  let RetireOOO = 1;
+  let ReleaseAtCycles = [n];
+}
+
+def CortexA320VSt0      : SchedWriteRes<[CortexA320UnitLdSt]> {
+  let RetireOOO = 1;
+}
+
+def : SchedAlias<WriteVd, CortexA320Write<4, CortexA320UnitVALU>>;
+def : SchedAlias<WriteVq, CortexA320Write<4, CortexA320UnitVALU>>;
+
+// FP ALU specific new schedwrite definitions
+def CortexA320WriteFPALU_F3 : SchedWriteRes<[CortexA320UnitVALU]> { let Latency = 3;}
+def CortexA320WriteFPALU_F4 : SchedWriteRes<[CortexA320UnitVALU]> { let Latency = 4;}
+
+// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
+def : WriteRes<WriteFMul, [CortexA320UnitVMAC]> { let Latency = 4; }
+
+let RetireOOO = 1 in {
+def : WriteRes<WriteFDiv, [CortexA320UnitVMC]> { let Latency = 22;
+                                            let ReleaseAtCycles = [29]; }
+def CortexA320WriteVMAC : SchedWriteRes<[CortexA320UnitVMAC]> { let Latency = 4; }
+def CortexA320WriteFDivHP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 8;
+                                                     let ReleaseAtCycles = [5]; }
+def CortexA320WriteFDivSP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 13;
+                                                     let ReleaseAtCycles = [10]; }
+def CortexA320WriteFDivDP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 22;
+                                                     let ReleaseAtCycles = [19]; }
+def CortexA320WriteFSqrtHP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 8;
+                                                      let ReleaseAtCycles = [5]; }
+def CortexA320WriteFSqrtSP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 12;
+                                                      let ReleaseAtCycles = [9]; }
+def CortexA320WriteFSqrtDP : SchedWriteRes<[CortexA320UnitVMC]> { let Latency = 22;
+                                                      let ReleaseAtCycles = [19]; }
+}
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+def : ReadAdvance<ReadVLD, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 1>;
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+
+
+// MUL
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 2>;
+
+// Div
+def : ReadAdvance<ReadID, 0>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
+
+def CortexA320WriteISReg : SchedWriteVariant<[
+       SchedVar<RegShiftedPred, [WriteISReg]>,
+       SchedVar<NoSchedPred, [WriteI]>]>;
+def : InstRW<[CortexA320WriteISReg], (instregex ".*rs$")>;
+def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>;
+
+// Pointer Authentication Instructions (v8.3 PAC)
+// -----------------------------------------------------------------------------
+
+// Authenticate data address
+// Authenticate instruction address
+// Compute pointer authentication code for data address
+// Compute pointer authentication code, using generic key
+// Compute pointer authentication code for instruction address
+def : InstRW<[CortexA320Write<4, CortexA320UnitPAC>], (instregex "^AUT", "^PAC")>;
+
+// Branch and link, register, with pointer authentication
+// Branch, register, with pointer authentication
+// Branch, return, with pointer authentication
+def : InstRW<[CortexA320Write_PAC_B<1>], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
+                                            BRAAZ, BRAB, BRABZ, RETAA, RETAB,
+                                            ERETAA, ERETAB)>;
+
+// Load register, with pointer authentication
+def : InstRW<[CortexA320Write<2, CortexA320UnitPAC>], (instregex "^LDRA[AB](indexed|writeback)")>;
+
+// Strip pointer authentication code
+def : InstRW<[CortexA320Write<5, CortexA320UnitPAC>], (instrs XPACD, XPACI, XPACLRI)>;
+//---
+// Miscellaneous
+//---
+def : InstRW<[CortexA320WriteVLD1SI,CortexA320WriteLDP1], (instregex "LDPS?Wi")>;
+def : InstRW<[CortexA320WriteVLD1,CortexA320WriteLDP1], (instregex "LDPSi")>;
+def : InstRW<[CortexA320WriteVLD1,CortexA320WriteLDP2], (instregex "LDP(X|D)i")>;
+def : InstRW<[CortexA320WriteVLD1,CortexA320WriteLDP4], (instregex "LDPQi")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1SI,CortexA320WriteLDP1], (instregex "LDPS?W(pre|post)")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1,CortexA320WriteLDP1], (instregex "LDPS(pre|post)")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1,CortexA320WriteLDP2], (instregex "LDP(X|D)(pre|post)")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1,CortexA320WriteLDP4], (instregex "LDPQ(pre|post)")>;
+def : InstRW<[WriteI], (instrs COPY)>;
+//---
+// Vector Loads - 128-bit per cycle
+//---
+//   1-element structures
+def CortexA320WriteVLD1Latency3: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 3; let ReleaseAtCycles = [1]; }
+def CortexA320WriteVLD1Latency4: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; }
+def CortexA320WriteVLD1Latency5: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [3]; }
+def CortexA320WriteVLD1Latency6: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 6; let ReleaseAtCycles = [4]; }
+
+def : InstRW<[CortexA320WriteVLD1Latency3], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency3], (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency4], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency4], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency5], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency4], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency6], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD1Latency3], (instregex "LD1i(8|16|32|64)$")>;                // single element
+def : InstRW<[CortexA320WriteVLD1Latency3], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate
+
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency3], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency3], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency4], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency4], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency5], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency4], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency6], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency3], (instregex "LD1i(8|16|32|64)_POST$")>;                // single element
+def : InstRW<[WriteAdr, CortexA320WriteVLD1Latency3], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // replicate
+
+//    2-element structures
+def CortexA320WriteVLD2Latency3: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 3; let ReleaseAtCycles = [2]; }
+def CortexA320WriteVLD2Latency4Release1: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [1]; }
+def CortexA320WriteVLD2Latency4Release2: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [2]; }
+def CortexA320WriteVLD2Latency4Release6: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [6]; }
+
+def : InstRW<[CortexA320WriteVLD2Latency4Release1], (instregex "LD2Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD2Latency4Release2], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD2Latency4Release6], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVLD2Latency3], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+
+def : InstRW<[WriteAdr, CortexA320WriteVLD2Latency4Release1], (instregex "LD2Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD2Latency4Release2], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD2Latency4Release6], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD2Latency3], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+
+//    3-element structures
+def CortexA320WriteVLD3Latency4: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [3]; }
+def CortexA320WriteVLD3Latency5Release6: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [6]; }
+def CortexA320WriteVLD3Latency5Release7: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [7]; }
+
+def : InstRW<[CortexA320WriteVLD3Latency5Release6], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[CortexA320WriteVLD3Latency5Release7], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVLD3Latency4], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+
+def : InstRW<[WriteAdr, CortexA320WriteVLD3Latency5Release6], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD3Latency5Release7], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD3Latency4], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+
+//    4-element structures
+def CortexA320WriteVLD4Latency4: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 4; let ReleaseAtCycles = [4]; }
+def CortexA320WriteVLD4Latency5Release7: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [7]; }
+def CortexA320WriteVLD4Latency5Release8: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 5; let ReleaseAtCycles = [8]; }
+def CortexA320WriteVLD4Latency6: SchedWriteRes<[CortexA320UnitLdSt]> { let Latency = 6; let ReleaseAtCycles = [7]; }
+
+def : InstRW<[CortexA320WriteVLD4Latency5Release7], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVLD4Latency5Release8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVLD4Latency6], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVLD4Latency4], (instregex "LD4Rv(8b|16b|4h|8b|2s|4s|1d|2d)$")>;
+ 
+def : InstRW<[WriteAdr, CortexA320WriteVLD4Latency5Release7], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD4Latency5Release8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; 
+def : InstRW<[WriteAdr, CortexA320WriteVLD4Latency6], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVLD4Latency4], (instregex "LD4Rv(8b|16b|4h|8b|2s|4s|1d|2d)_POST$")>;
+//---
+// Vector Stores
+//---
+// 1 Element structures
+def : InstRW<[CortexA320WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVST2], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[CortexA320WriteVST2], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[CortexA320WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST2], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST2], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// 2 Element structures
+def : InstRW<[CortexA320WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[CortexA320WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+
+def : InstRW<[WriteAdr, CortexA320WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+// 3 Element structures
+def : InstRW<[CortexA320WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+
+def : InstRW<[WriteAdr, CortexA320WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
+
+// 4 Element structures
+def : InstRW<[CortexA320WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[CortexA320WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+
+def : InstRW<[WriteAdr, CortexA320WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA320WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//---
+// Floating Point Conversions, MAC, DIV, SQRT
+//---
+def : InstRW<[CortexA320WriteFPALU_F3], (instregex "^DUP(v2i64|v4i32|v8i16|v16i8)")>;
+def : InstRW<[CortexA320WriteFPALU_F4], (instregex "^XTN")>;
+def : InstRW<[CortexA320WriteFPALU_F4], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>;
+def : InstRW<[CortexA320WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>;
+
+def : InstRW<[CortexA320WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>;
+def : InstRW<[CortexA320WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>;
+def : InstRW<[CortexA320WriteFPALU_F4], (instregex "^(S|U)CVTFv")>;
+
+def : InstRW<[CortexA320WriteVMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[CortexA320WriteVMAC], (instregex "^FML(A|S)v.*")>;
+def : InstRW<[CortexA320WriteFDivHP], (instrs FDIVHrr)>;
+def : InstRW<[CortexA320WriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[CortexA320WriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[CortexA320WriteFDivHP], (instregex "^FDIVv.*16$")>;
+def : InstRW<[CortexA320WriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[CortexA320WriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[CortexA320WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
+def : InstRW<[CortexA320WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[CortexA320WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+def : InstRW<[CortexA320WriteFPALU_F3], (instrs FCSELHrrr, FCSELSrrr, FCSELDrrr)>;
+
+// Advanced SIMD integer instructions
+// ASIMD absolute diff
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
+// ASIMD absolute diff accum
+def : InstRW<[CortexA320Write<6, CortexA320UnitVALU>], (instregex "[SU]ABAL?v")>;
+// ASIMD absolute diff long
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]ABDLv")>;
+// ASIMD arith #1
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "(ADD|SUB|NEG)v",
+  "[SU]R?HADDv", "[SU]HSUBv")>;
+// ASIMD arith #2
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
+  "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
+  "ADDPv(2i32|4i16|8i8)$")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
+  "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
+  "ADDPv(16i8|2i64|4i32|8i16)$")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$")>;
+// ASIMD arith #3
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex  "SADDLv", "UADDLv", "SADDWv",
+  "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex  "ADDHNv", "SUBHNv")>;
+// ASIMD arith #5
+def : InstRW<[CortexA320Write<8, CortexA320UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>;
+// ASIMD arith, reduce
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex  "ADDVv")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex  "SADDLVv", "UADDLVv")>;
+// ASIMD compare #1
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
+// ASIMD compare #2
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
+// ASIMD logical $1
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v8i8",
+  "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8",
+  "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
+// ASIMD max/min, basic
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>;
+// SIMD max/min, reduce
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU](MAX|MIN)Vv")>;
+// ASIMD multiply, by element
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
+  "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instrs PMULv8i8)>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instrs PMULv16i8)>;
+// ASIMD multiply accumulate
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply accumulate half
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "SQRDML[AS]H[vi]")>;
+// ASIMD multiply accumulate long
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU]ML[AS]Lv")>;
+// ASIMD multiply accumulate long #2
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "SQDML[AS]L[iv]")>;
+// ASIMD dot product
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU]DOTv8i8")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU]DOTv16i8")>;
+// ASIMD dot product, by scalar
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU]DOTlanev")>;
+// ASIMD multiply long
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU]MULLv", "SQDMULL[iv]")>;
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>;
+// ASIMD pairwise add and accumulate
+def : InstRW<[CortexA320MCWrite<7, 2, CortexA320UnitVALU>], (instregex "[SU]ADALPv")>;
+// ASIMD shift accumulate
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>;
+// ASIMD shift accumulate #2
+def : InstRW<[CortexA320MCWrite<7, 2, CortexA320UnitVALU>], (instregex "[SU]RSRA[vd]")>;
+// ASIMD shift by immed
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "SHLd$", "SHLv",
+  "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>;
+// ASIMD shift by immed
+// SXTL and UXTL are aliases for SHLL
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[US]?SHLLv")>;
+// ASIMD shift by immed #2
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)",
+  "[SU]RSHRv(16i8|2i64|4i32|8i16)")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "RSHRNv(2i32|4i16|8i8)",
+  "RSHRNv(16i8|4i32|8i16)")>;
+// ASIMD shift by register
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>;
+// ASIMD shift by register #2
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>;
+
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU]QSHLv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU]QSHLv(2i64|4i32|8i16|16i8)")>;
+
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU]QRSHLv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "[SU]QRSHLv(2i64|4i32|8i16|16i8)")>;
+
+// Cryptography extensions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[CortexA320MCWrite<4, 0, CortexA320UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 hash acceleration op
+// Crypto SHA1 schedule acceleration ops
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^SHA1(H|SU0|SU1)")>;
+
+// Crypto SHA1 hash acceleration ops
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[CortexA320MCWrite<4, 0, CortexA320UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
+
+// Crypto SHA256 schedule acceleration ops
+def : InstRW<[CortexA320MCWrite<4, 0, CortexA320UnitVMC>], (instregex "^SHA256SU[01]")>;
+
+// Crypto SHA512 hash acceleration ops
+def : InstRW<[CortexA320MCWrite<9, 0, CortexA320UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>;
+
+// Crypto SHA3 ops
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instrs BCAX, EOR3)>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instrs XAR)>;
+def : InstRW<[CortexA320MCWrite<9, 0, CortexA320UnitVMC>], (instrs RAX1)>;
+
+
+// Crypto SM3 ops
+def : InstRW<[CortexA320MCWrite<9, 0, CortexA320UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
+                                                            "^SM3TT[12][AB]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[CortexA320MCWrite<9, 0, CortexA320UnitVMC>], (instrs SM4E, SM4ENCKEY)>;
+
+// CRC
+// -----------------------------------------------------------------------------
+
+def : InstRW<[CortexA320MCWrite<2, 0, CortexA320UnitMAC>], (instregex "^CRC32")>;
+
+// SVE Predicate instructions
+
+// Loop control, based on predicate
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instrs BRKA_PPmP, BRKA_PPzP,
+                                                  BRKB_PPmP, BRKB_PPzP)>;
+
+// Loop control, based on predicate and flag setting
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
+
+// Loop control, propagating
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>;
+
+// Loop control, propagating and flag setting
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instrs BRKNS_PPzP)>;
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>;
+
+
+// Loop control, based on GPR
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>],
+             (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>;
+
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>;
+
+// Loop terminate
+def : InstRW<[CortexA320Write<1, CortexA320UnitALU>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>;
+
+// Predicate counting scalar
+def : InstRW<[CortexA320Write<1, CortexA320UnitALU>], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
+
+def : InstRW<[CortexA320Write<3, CortexA320UnitALU>],
+             (instregex "^CNT[BHWD]_XPiI")>;
+
+def : InstRW<[CortexA320Write<3, CortexA320UnitALU>],
+             (instregex "^(INC|DEC)[BHWD]_XPiI")>;
+
+def : InstRW<[CortexA320Write<5, CortexA320UnitALU>],
+             (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>;
+
+// Predicate counting scalar, active predicate
+def : InstRW<[CortexA320Write<1, CortexA320UnitVALU>],
+             (instregex "^CNTP_XPP_[BHSD]")>;
+
+def : InstRW<[CortexA320Write<1, CortexA320UnitVALU>],
+             (instregex "^(DEC|INC)P_XP_[BHSD]")>;
+
+def : InstRW<[CortexA320Write<9, CortexA320UnitVALU>],
+             (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]",
+                        "^(UQDEC|UQINC)P_WP_[BHSD]",
+                        "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]")>;
+
+
+// Predicate counting vector, active predicate
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>],
+             (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>;
+
+// Predicate logical
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>],
+             (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>;
+
+// Predicate logical, flag setting
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>],
+             (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>;
+
+// Predicate reverse
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instregex "^REV_PP_[BHSD]")>;
+
+// Predicate select
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instrs SEL_PPPP)>;
+
+// Predicate set
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
+
+// Predicate set/initialize, set flags
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instregex "^PTRUES_[BHSD]")>;
+
+// Predicate find first/next
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
+
+// Predicate test
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instrs PTEST_PP)>;
+
+// Predicate transpose
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instregex "^TRN[12]_PPP_[BHSDQ]")>;
+
+// Predicate unpack and widen
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
+
+// Predicate zip/unzip
+def : InstRW<[CortexA320Write<2, CortexA320UnitVALU>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>;
+
+
+// SVE integer instructions
+// -----------------------------------------------------------------------------
+// Arithmetic, absolute diff
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>;
+
+// Arithmetic, absolute diff accum
+def : InstRW<[CortexA320MCWrite<6, 2, CortexA320UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>;
+
+// Arithmetic, absolute diff accum long
+def : InstRW<[CortexA320MCWrite<6, 2, CortexA320UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, absolute diff long
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>;
+
+// Arithmetic, basic
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>],
+             (instregex "^(ABS|CNOT|NEG)_ZPmZ_[BHSD]",
+                        "^(ADD|SUB|SUBR)_ZPmZ_[BHSD]",
+                        "^(ADD|SUB|SUBR)_ZPZZ_[BHSD]",
+                        "^(ADD|SUB)_ZZZ_[BHSD]",
+                        "^(ADD|SUB|SUBR)_ZI_[BHSD]",
+                        "^ADR_[SU]XTW_ZZZ_D_[0123]",
+                        "^ADR_LSL_ZZZ_[SD]_[0123]",
+                        "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]",
+                        "^SADDLBT_ZZZ_[HSD]",
+                        "^SSUBL(BT|TB)_ZZZ_[HSD]")>;
+
+// Arithmetic, complex
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^SQ(ABS|NEG)_ZPmZ_[BHSD]",
+                        "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]",
+                        "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]",
+                        "^[SU]Q(ADD|SUB)_ZI_[BHSD]",
+                        "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]",
+                        "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>;
+def : InstRW<[CortexA320Write<8, CortexA320UnitVALU>],
+             (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]")>;
+
+// Arithmetic, large integer
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>;
+
+// Arithmetic, pairwise add
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^ADDP_ZPmZ_[BHSD]")>;
+
+// Arithmetic, pairwise add and accum long
+def : InstRW<[CortexA320MCWrite<7, 2, CortexA320UnitVALU>], (instregex "^[SU]ADALP_ZPmZ_[HSD]")>;
+
+// Arithmetic, shift
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>],
+             (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]",
+                        "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]",
+                        "^(ASR|LSL|LSR)_ZPmI_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZPZI_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZPmZ_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZPZZ_[BHSD]",
+                        "^(ASR|LSL|LSR)_ZZI_[BHSD]",
+                        "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
+// Arithmetic, shift right for divide
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^ASRD_ZPmI_[BHSD]",
+                        "^ASRD_ZPZI_[BHSD]")>;
+
+// Arithmetic, shift and accumulate
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^(SSRA|USRA)_ZZI_[BHSD]")>;
+
+def : InstRW<[CortexA320MCWrite<7, 2, CortexA320UnitVALU>],
+             (instregex "^(SRSRA|URSRA)_ZZI_[BHSD]")>;
+
+
+// Arithmetic, shift by immediate
+// Arithmetic, shift by immediate and insert
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>],
+             (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]")>;
+
+// Arithmetic, shift complex
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]",
+                        "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_(ZPmZ|ZPZZ)_[BHSD]",
+                        "^(SQSHL|SQSHLU|UQSHL)_(ZPmI|ZPZI)_[BHSD]",
+                        "^SQSHRU?N[BT]_ZZI_[BHS]",
+                        "^UQR?SHRN[BT]_ZZI_[BHS]")>;
+
+// Arithmetic, shift rounding
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^(SRSHL|SRSHR|SRSHLR|URSHL|URSHLR|URSHR)_(ZPmZ|ZPZZ|ZPZI)_[BHSD]",
+                        "^[SU]RSHR_ZPmI_[BHSD]")>;
+
+// Bit manipulation
+def : InstRW<[CortexA320MCWrite<13, 12, CortexA320UnitVMC>],
+             (instregex "^(BDEP|BEXT|BGRP)_ZZZ_B")>;
+
+def : InstRW<[CortexA320MCWrite<21, 20, CortexA320UnitVMC>],
+             (instregex "^(BDEP|BEXT|BGRP)_ZZZ_H")>;
+
+def : InstRW<[CortexA320MCWrite<37, 36, CortexA320UnitVMC>],
+             (instregex "^(BDEP|BEXT|BGRP)_ZZZ_S")>;
+
+def : InstRW<[CortexA320MCWrite<68, 67, CortexA320UnitVMC>],
+             (instregex "^(BDEP|BEXT|BGRP)_ZZZ_D")>;
+
+
+// Bitwise select
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ")>;
+
+// Count/reverse bits
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>;
+def : InstRW<[CortexA320Write<8, CortexA320UnitVALU>], (instregex "^CNT_ZPmZ_S")>;
+def : InstRW<[CortexA320Write<12, CortexA320UnitVALU>], (instregex "^CNT_ZPmZ_D")>;
+// Broadcast logical bitmask immediate to vector
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instrs DUPM_ZI)>;
+
+// Compare and set flags
+def : InstRW<[CortexA320Write<5, CortexA320UnitVALU>],
+             (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]",
+                        "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>;
+
+// Complex add
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^CADD_ZZI_[BHSD]")>;
+
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^SQCADD_ZZI_[BHSD]")>;
+
+// Complex dot product 8-bit element
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
+
+// Complex dot product 16-bit element
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
+
+// Complex multiply-add B, H, S element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^CMLA_ZZZ_[BHS]",
+                                            "^CMLA_ZZZI_[HS]")>;
+
+// Complex multiply-add D element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instrs CMLA_ZZZ_D)>;
+
+// Conditional extract operations, scalar form
+def : InstRW<[CortexA320MCWrite<8, 2, CortexA320UnitVALU>], (instregex "^CLAST[AB]_RPZ_[BHSD]")>;
+
+// Conditional extract operations, SIMD&FP scalar and vector forms
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]",
+                                            "^COMPACT_ZPZ_[SD]",
+                                            "^SPLICE_ZPZZ?_[BHSD]")>;
+
+// Convert to floating point, 64b to float or convert to double
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]")>;
+
+// Convert to floating point, 64b to half
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_DtoH")>;
+
+// Convert to floating point, 32b to single or half
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]")>;
+
+// Convert to floating point, 32b to double
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_StoD")>;
+
+// Convert to floating point, 16b to half
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]CVTF_ZPmZ_HtoH")>;
+
+// Copy, scalar
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>],(instregex "^CPY_ZPmR_[BHSD]")>;
+
+// Copy, scalar SIMD&FP or imm
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^CPY_ZPm[IV]_[BHSD]",
+                                           "^CPY_ZPzI_[BHSD]")>;
+
+// Divides, 32 bit
+def : InstRW<[CortexA320MCWrite<15, 12, CortexA320UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_S")>;
+
+// Divides, 64 bit
+def : InstRW<[CortexA320MCWrite<26, 23, CortexA320UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_D")>;
+
+// Dot product, 8 bit
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_S")>;
+
+// Dot product, 8 bit, using signed and unsigned integers
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
+
+// Dot product, 16 bit
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_D")>;
+
+// Duplicate, immediate and indexed form
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^DUP_ZI_[BHSD]",
+                                           "^DUP_ZZI_[BHSDQ]")>;
+
+// Duplicate, scalar form
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^DUP_ZR_[BHSD]")>;
+
+// Extend, sign or zero
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^[SU]XTB_ZPmZ_[HSD]",
+                                            "^[SU]XTH_ZPmZ_[SD]",
+                                            "^[SU]XTW_ZPmZ_[D]")>;
+
+// Extract
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instrs EXT_ZZI, EXT_ZZI_B)>;
+
+// Extract narrow saturating
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]",
+                                            "^SQXTUN[BT]_ZZ_[BHS]")>;
+
+// Extract/insert operation, SIMD and FP scalar form
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^LAST[AB]_VPZ_[BHSD]",
+                                            "^INSR_ZV_[BHSD]")>;
+
+// Extract/insert operation, scalar
+def : InstRW<[CortexA320MCWrite<8, 2, CortexA320UnitVALU>], (instregex "^LAST[AB]_RPZ_[BHSD]",
+                                                "^INSR_ZR_[BHSD]")>;
+
+// Histogram operations
+def : InstRW<[CortexA320MCWrite<8, 2, CortexA320UnitVALU>], (instregex "^HISTCNT_ZPzZZ_[SD]",
+                                                  "^HISTSEG_ZZZ")>;
+
+// Horizontal operations, B, H, S form, immediate operands only
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^INDEX_II_[BHS]")>;
+
+// Horizontal operations, B, H, S form, scalar, immediate operands/ scalar
+// operands only / immediate, scalar operands
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_[BHS]")>;
+
+// Horizontal operations, D form, immediate operands only
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instrs INDEX_II_D)>;
+
+// Horizontal operations, D form, scalar, immediate operands)/ scalar operands
+// only / immediate, scalar operands
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^INDEX_(IR|RI|RR)_D")>;
+
+// Logical
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>],
+             (instregex "^(AND|EOR|ORR)_ZI",
+                        "^(AND|BIC|EOR|EOR|ORR)_ZZZ",
+                        "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]",
+                        "^(AND|BIC|EOR|NOT|ORR)_ZPZZ_[BHSD]")>;
+
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^EOR(BT|TB)_ZZZ_[BHSD]")>;
+
+// Max/min, basic and pairwise
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
+                                           "^[SU](MAX|MIN)P?_(ZPmZ|ZPZZ)_[BHSD]")>;
+
+// Matching operations
+def : InstRW<[CortexA320MCWrite<9, 2, CortexA320UnitVALU>], (instregex "^N?MATCH_PPzZZ_[BH]")>;
+
+// Matrix multiply-accumulate
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+
+// Move prefix
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]",
+                                           "^MOVPRFX_ZZ")>;
+
+// Multiply, B, H, S element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_[BHS]",
+                                            "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_[BHS]")>;
+
+// Multiply, D element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ|ZPZZ)_D",
+                                            "^[SU]MULH_(ZPmZ|ZZZ|ZPZZ)_D")>;
+
+// Multiply long
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^[SU]MULL[BT]_ZZZI_[SD]",
+                                            "^[SU]MULL[BT]_ZZZ_[HSD]")>;
+
+// Multiply accumulate, B, H, S element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_[BHS]",
+                                            "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^ML[AS]_(ZZZI|ZPZZZ)_D",
+                                            "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
+
+// Multiply accumulate long
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]",
+                                            "^[SU]ML[AS]L[BT]_ZZZI_[SD]")>;
+
+// Multiply accumulate saturating doubling long regular
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]",
+                                            "^SQDML[AS](LB|LT)_ZZZI_[SD]")>;
+
+// Multiply saturating doubling high, B, H, S element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^SQDMULH_ZZZ_[BHS]",
+                                            "^SQDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating doubling high, D element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
+
+// Multiply saturating doubling long
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^SQDMULL[BT]_ZZZ_[HSD]",
+                                            "^SQDMULL[BT]_ZZZI_[SD]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, B, H, S
+// element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZ_[BHS]",
+                                            "^SQRDCMLAH_ZZZ_[BHS]",
+                                            "^SQRDML[AS]H_ZZZI_[HS]",
+                                            "^SQRDCMLAH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, D element
+// size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^SQRDML[AS]H_ZZZI?_D",
+                                            "^SQRDCMLAH_ZZZ_D")>;
+
+// Multiply saturating rounding doubling regular/complex, B, H, S element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^SQRDMULH_ZZZ_[BHS]",
+                                            "^SQRDMULH_ZZZI_[HS]")>;
+
+// Multiply saturating rounding doubling regular/complex, D element size
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^SQRDMULH_ZZZI?_D")>;
+
+// Multiply/multiply long, (8x8) polynomial
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^PMUL_ZZZ_B")>;
+
+def : InstRW<[CortexA320Write<9, CortexA320UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>;
+
+
+// Predicate counting vector
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>],
+             (instregex "^(DEC|INC)[HWD]_ZPiI")>;
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^(SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>;
+
+// Reciprocal estimate
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>;
+
+// Reduction, arithmetic, B form
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
+
+// Reduction, arithmetic, H form
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
+
+// Reduction, arithmetic, S form
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
+
+// Reduction, arithmetic, D form
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
+
+// Reduction, logical
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>;
+
+// Reverse, vector
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^REV_ZZ_[BHSD]",
+                                           "^REVB_ZPmZ_[HSD]",
+                                           "^REVH_ZPmZ_[SD]",
+                                           "^REVW_ZPmZ_D")>;
+
+// Select, vector form
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^SEL_ZPZZ_[BHSD]")>;
+
+// Table lookup
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^TBL_ZZZZ?_[BHSD]")>;
+
+// Table lookup extension
+def : InstRW<[CortexA320Write<8, CortexA320UnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>;
+
+// Transpose, vector form
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>;
+
+// Unpack and extend
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>;
+
+// Zip/unzip
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>;
+
+// SVE floating-point instructions
+// -----------------------------------------------------------------------------
+
+// Floating point absolute value/difference
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FAB[SD]_ZPmZ_[HSD]",
+                                                                  "^FAB[SD]_ZPZZ_[HSD]")>;
+
+// Floating point arithmetic
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ|ZPZI|ZPZZ)_[HSD]",
+                                           "^FADDP_ZPmZZ_[HSD]",
+                                           "^FNEG_ZPmZ_[HSD]",
+                                           "^FSUBR_(ZPm[IZ]|ZPZ[IZ])_[HSD]")>;
+
+// Floating point associative add, F16
+def : InstRW<[CortexA320MCWrite<32, 29, CortexA320UnitVALU>], (instrs FADDA_VPZ_H)>;
+
+// Floating point associative add, F32
+def : InstRW<[CortexA320MCWrite<16, 13, CortexA320UnitVALU>], (instrs FADDA_VPZ_S)>;
+
+// Floating point associative add, F64
+def : InstRW<[CortexA320MCWrite<8, 5, CortexA320UnitVALU>], (instrs FADDA_VPZ_D)>;
+
+// Floating point compare
+def : InstRW<[CortexA320Write<5, CortexA320UnitVALU>], (instregex "^FACG[ET]_PPzZZ_[HSD]",
+                                            "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]",
+                                            "^FCM(LE|LT)_PPzZ0_[HSD]",
+                                            "^FCMUO_PPzZZ_[HSD]")>;
+
+// Floating point complex add
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FCADD_ZPmZ_[HSD]")>;
+
+// Floating point complex multiply add
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FCMLA_ZPmZZ_[HSD]",
+                                           "^FCMLA_ZZZI_[HS]")>;
+
+// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
+                                            "^FCVTLT_ZPmZ_HtoS",
+                                            "^FCVTNT_ZPmZ_StoH")>;
+
+// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
+// or F64 to F16)
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)",
+                                            "^FCVTLT_ZPmZ_StoD",
+                                            "^FCVTNT_ZPmZ_DtoS")>;
+
+// Floating point convert, round to odd
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FCVTX_ZPmZ_DtoS", "FCVTXNT_ZPmZ_DtoS")>;
+
+// Floating point base2 log, F16
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point base2 log, F32
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point base2 log, F64
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FLOGB_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point convert to integer, F16
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_HtoH")>;
+
+// Floating point convert to integer, F32
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)")>;
+
+// Floating point convert to integer, F64
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)")>;
+
+// Floating point copy
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^FCPY_ZPmI_[HSD]",
+                                           "^FDUP_ZI_[HSD]")>;
+
+// Floating point divide, F16
+def : InstRW<[CortexA320MCWrite<8, 5, CortexA320UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_H")>;
+
+// Floating point divide, F32
+def : InstRW<[CortexA320MCWrite<13, 10, CortexA320UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
+
+// Floating point divide, F64
+def : InstRW<[CortexA320MCWrite<22, 19, CortexA320UnitVMC>], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
+
+// Floating point min/max pairwise
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
+
+// Floating point min/max
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^F(MAX|MIN)(NM)?_(ZPm[IZ]|ZPZZ|ZPZI)_[HSD]")>;
+
+// Floating point multiply
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^(FSCALE|FMULX)_(ZPmZ|ZPZZ)_[HSD]",
+                                           "^FMUL_(ZPm[IZ]|ZZZI?|ZPZI|ZPZZ)_[HSD]")>;
+
+// Floating point multiply accumulate
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>],
+             (instregex "^FML[AS]_(ZPmZZ|ZZZI|ZPZZZ)_[HSD]",
+                        "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_(ZPmZZ|ZPZZZ)_[HSD]")>;
+
+// Floating point multiply add/sub accumulate long
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FML[AS]L[BT]_ZZZI?_SHH")>;
+
+// Floating point reciprocal estimate, F16
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FRECPE_ZZ_H", "^FRECPX_ZPmZ_H",
+                                         "^FRSQRTE_ZZ_H")>;
+
+// Floating point reciprocal estimate, F32
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FRECPE_ZZ_S", "^FRECPX_ZPmZ_S",
+                                         "^FRSQRTE_ZZ_S")>;
+// Floating point reciprocal estimate, F64
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>],(instregex "^FRECPE_ZZ_D", "^FRECPX_ZPmZ_D",
+                                         "^FRSQRTE_ZZ_D")>;
+
+// Floating point reciprocal step
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
+
+// Floating point reduction, F16
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^(FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_[HSD]")>;
+
+// Floating point reduction, F32
+def : InstRW<[CortexA320MCWrite<12, 11, CortexA320UnitVALU>],
+             (instregex "^FADDV_VPZ_H")>;
+
+def : InstRW<[CortexA320MCWrite<8, 5, CortexA320UnitVALU>],
+             (instregex "^FADDV_VPZ_S")>;
+
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>],
+             (instregex "^FADDV_VPZ_D")>;
+
+
+// Floating point round to integral, F16
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H")>;
+
+// Floating point round to integral, F32
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S")>;
+
+// Floating point round to integral, F64
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D")>;
+
+// Floating point square root, F16
+def : InstRW<[CortexA320MCWrite<11, 5, CortexA320UnitVMC>], (instregex "^FSQRT_ZPmZ_H")>;
+
+// Floating point square root, F32
+def : InstRW<[CortexA320MCWrite<14, 9, CortexA320UnitVMC>], (instregex "^FSQRT_ZPmZ_S")>;
+
+// Floating point square root, F64
+def : InstRW<[CortexA320MCWrite<25, 19, CortexA320UnitVMC>], (instregex "^FSQRT_ZPmZ_D")>;
+
+// Floating point trigonometric exponentiation
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FEXPA_ZZ_[HSD]")>;
+
+// Floating point trigonometric multiply add
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FTMAD_ZZI_[HSD]")>;
+
+// Floating point trigonometric, miscellaneous
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>;
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>;
+
+
+// SVE BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// Convert, F32 to BF16
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
+
+// Dot product
+def : InstRW<[CortexA320Write_11cyc_1VMAC_1VALU], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+
+// Matrix multiply accumulate
+def : InstRW<[CortexA320Write_16cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ)>;
+
+// Multiply accumulate long
+def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^BFMLAL[BT]_ZZZ(I)?")>;
+
+// SVE Load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instrs LDR_ZXI)>;
+
+// Load predicate
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instrs LDR_PXI)>;
+
+// Contiguous load, scalar + imm
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instregex "^LD1[BHWD]_IMM$",
+                                           "^LD1S?B_[HSD]_IMM$",
+                                           "^LD1S?H_[SD]_IMM$",
+                                           "^LD1S?W_D_IMM$" )>;
+// Contiguous load, scalar + scalar
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instregex "^LD1[BHWD]$",
+                                             "^LD1S?B_[HSD]$",
+                                             "^LD1S?H_[SD]$",
+                                             "^LD1S?W_D$" )>;
+
+// Contiguous load broadcast, scalar + imm
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instregex "^LD1R[BHWD]_IMM$",
+                                           "^LD1RSW_IMM$",
+                                           "^LD1RS?B_[HSD]_IMM$",
+                                           "^LD1RS?H_[SD]_IMM$",
+                                           "^LD1RS?W_D_IMM$",
+                                           "^LD1RQ_[BHWD]_IMM$")>;
+
+// Contiguous load broadcast, scalar + scalar
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instregex "^LD1RQ_[BHWD]$")>;
+
+// Non temporal load, scalar + imm
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRI$")>;
+
+// Non temporal load, scalar + scalar
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instregex "^LDNT1[BHWD]_ZRR$")>;
+
+// Non temporal gather load, vector + scalar 32-bit element size
+def : InstRW<[CortexA320MCWrite<9, 9, CortexA320UnitLdSt>], (instregex "^LDNT1[BHW]_ZZR_S$",
+                                              "^LDNT1S[BH]_ZZR_S$")>;
+
+// Non temporal gather load, vector + scalar 64-bit element size
+def : InstRW<[CortexA320MCWrite<7, 7, CortexA320UnitLdSt>], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;
+def : InstRW<[CortexA320MCWrite<7, 7, CortexA320UnitLdSt>], (instrs LDNT1D_ZZR_D)>;
+
+// Contiguous first faulting load, scalar + scalar
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instregex "^LDFF1[BHWD]$",
+                                              "^LDFF1S?B_[HSD]$",
+                                              "^LDFF1S?H_[SD]$",
+                                              "^LDFF1S?W_D$")>;
+
+// Contiguous non faulting load, scalar + imm
+def : InstRW<[CortexA320Write<3, CortexA320UnitLdSt>], (instregex "^LDNF1[BHWD]_IMM$",
+                                           "^LDNF1S?B_[HSD]_IMM$",
+                                           "^LDNF1S?H_[SD]_IMM$",
+                                           "^LDNF1S?W_D_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + imm
+def : InstRW<[CortexA320MCWrite<3, 1, CortexA320UnitLdSt>], (instregex "^LD2[BHWD]_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + scalar
+def : InstRW<[CortexA320MCWrite<3, 2, CortexA320UnitLdSt>], (instregex "^LD2[BHWD]$")>;
+
+// Contiguous Load three structures to three vectors, scalar + imm
+def : InstRW<[CortexA320MCWrite<5, 3, CortexA320UnitLdSt>], (instregex "^LD3[BHWD]_IMM$")>;
+
+// Contiguous Load three structures to three vectors, scalar + scalar
+def : InstRW<[CortexA320MCWrite<5, 3, CortexA320UnitLdSt>], (instregex "^LD3[BHWD]$")>;
+
+// Contiguous Load four structures to four vectors, scalar + imm
+def : InstRW<[CortexA320MCWrite<5, 3, CortexA320UnitLdSt>], (instregex "^LD4[BHWD]_IMM$")>;
+
+// Contiguous Load four structures to four vectors, scalar + scalar
+def : InstRW<[CortexA320MCWrite<5, 3, CortexA320UnitLdSt>], (instregex "^LD4[BHWD]$")>;
+
+// Gather load, vector + imm, 32-bit element size
+def : InstRW<[CortexA320MCWrite<9, 9, CortexA320UnitLdSt>], (instregex "^GLD(FF)?1S?[BH]_S_IMM$",
+                                              "^GLD(FF)?1W_IMM$")>;
+
+// Gather load, vector + imm, 64-bit element size
+def : InstRW<[CortexA320MCWrite<7, 7, CortexA320UnitLdSt>], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
+                                              "^GLD(FF)?1D_IMM$")>;
+
+// Gather load, 64-bit element size
+def : InstRW<[CortexA320MCWrite<7, 7, CortexA320UnitLdSt>],
+             (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW(_SCALED)?$",
+                        "^GLD(FF)?1S?[BHW]_D(_SCALED)?$",
+                        "^GLD(FF)?1D_[SU]XTW(_SCALED)?$",
+                        "^GLD(FF)?1D(_SCALED)?$")>;
+
+// Gather load, 32-bit scaled offset
+def : InstRW<[CortexA320MCWrite<7, 7, CortexA320UnitLdSt>],
+             (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED$",
+                        "^GLD(FF)?1W_[SU]XTW_SCALED")>;
+
+// Gather load, 32-bit unpacked unscaled offset
+def : InstRW<[CortexA320MCWrite<7, 7, CortexA320UnitLdSt>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$",
+                                              "^GLD(FF)?1W_[SU]XTW$")>;
+
+def : InstRW<[CortexA320Write<0, CortexA320UnitVALU>], (instregex "^PRF(B|H|W|D).*")>;
+// SVE Store instructions
+// -----------------------------------------------------------------------------
+
+// Store from predicate reg
+def : InstRW<[CortexA320VSt0], (instrs STR_PXI)>;
+
+// Store from vector reg
+def : InstRW<[CortexA320VSt0], (instrs STR_ZXI)>;
+
+// Contiguous store, scalar + imm
+def : InstRW<[CortexA320VSt0], (instregex "^ST1[BHWD]_IMM$",
+                                                "^ST1B_[HSD]_IMM$",
+                                                "^ST1H_[SD]_IMM$",
+                                                "^ST1W_D_IMM$")>;
+
+// Contiguous store, scalar + scalar
+def : InstRW<[CortexA320VSt0], (instregex "^ST1H(_[SD])?$")>;
+def : InstRW<[CortexA320VSt0], (instregex "^ST1[BWD]$",
+                                                "^ST1B_[HSD]$",
+                                                "^ST1W_D$")>;
+
+// Contiguous store two structures from two vectors, scalar + imm
+def : InstRW<[CortexA320VSt<11>], (instregex "^ST2[BHWD]_IMM$")>;
+
+// Contiguous store two structures from two vectors, scalar + scalar
+def : InstRW<[CortexA320VSt<11>], (instrs ST2H)>;
+
+// Contiguous store two structures from two vectors, scalar + scalar
+def : InstRW<[CortexA320VSt<11>], (instregex "^ST2[BWD]$")>;
+
+// Contiguous store three structures from three vectors, scalar + imm
+def : InstRW<[CortexA320VSt<25>], (instregex "^ST3[BHW]_IMM$")>;
+def : InstRW<[CortexA320VSt<14>], (instregex "^ST3D_IMM$")>;
+
+// Contiguous store three structures from three vectors, scalar + scalar
+def : InstRW<[CortexA320VSt<25>], (instregex "^ST3[BHW]$")>;
+def : InstRW<[CortexA320VSt<14>], (instregex "^ST3D$")>;
+
+// Contiguous store four structures from four vectors, scalar + imm
+def : InstRW<[CortexA320VSt<50>], (instregex "^ST4[BHW]_IMM$")>;
+def : InstRW<[CortexA320VSt<25>], (instregex "^ST4D_IMM$")>;
+
+// Contiguous store four structures from four vectors, scalar + scalar
+def : InstRW<[CortexA320VSt<50>], (instregex "^ST4[BHW]$")>;
+
+// Contiguous store four structures from four vectors, scalar + scalar
+def : InstRW<[CortexA320VSt<25>], (instregex "^ST4D$")>;
+
+// Non temporal store, scalar + imm
+def : InstRW<[CortexA320VSt0], (instregex "^STNT1[BHWD]_ZRI$")>;
+
+// Non temporal store, scalar + scalar
+def : InstRW<[CortexA320VSt0], (instrs STNT1H_ZRR)>;
+def : InstRW<[CortexA320VSt0], (instregex "^STNT1[BWD]_ZRR$")>;
+
+// Scatter non temporal store, vector + scalar 32-bit element size
+def : InstRW<[CortexA320VSt<9>], (instregex "^STNT1[BHW]_ZZR_S")>;
+
+// Scatter non temporal store, vector + scalar 64-bit element size
+def : InstRW<[CortexA320VSt<7>], (instregex "^STNT1[BHWD]_ZZR_D")>;
+
+// Scatter store vector + imm 32-bit element size
+def : InstRW<[CortexA320VSt<9>], (instregex "^SST1[BH]_S_IMM$",
+                                                "^SST1W_IMM$")>;
+
+// Scatter store vector + imm 64-bit element size
+def : InstRW<[CortexA320VSt<7>], (instregex "^SST1[BHW]_D_IMM$",
+                                                "^SST1D_IMM$")>;
+
+// Scatter store, 32-bit scaled offset
+def : InstRW<[CortexA320VSt<8>],
+             (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unpacked unscaled offset
+def : InstRW<[CortexA320VSt<8>], (instregex "^SST1[BHW]_D_[SU]XTW$",
+                                                "^SST1D_[SU]XTW$")>;
+
+// Scatter store, 32-bit unpacked scaled offset
+def : InstRW<[CortexA320VSt<8>], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
+                                                "^SST1D_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unscaled offset
+def : InstRW<[CortexA320VSt<8>], (instregex "^SST1[BH]_S_[SU]XTW$",
+                                                "^SST1W_[SU]XTW$")>;
+
+// Scatter store, 64-bit scaled offset
+def : InstRW<[CortexA320VSt<8>], (instregex "^SST1[HW]_D_SCALED$",
+                                                "^SST1D_SCALED$")>;
+
+// Scatter store, 64-bit unscaled offset
+def : InstRW<[CortexA320VSt<8>], (instregex "^SST1[BHW]_D$",
+                                                "^SST1D$")>;
+
+// SVE Miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// Read first fault register, unpredicated
+def : InstRW<[CortexA320Write<1, CortexA320UnitALU>], (instrs RDFFR_P)>;
+
+// Read first fault register, predicated
+def : InstRW<[CortexA320Write<3, CortexA320UnitALU>], (instrs RDFFR_PPz)>;
+
+// Read first fault register and set flags
+def : InstRW<[CortexA320Write<3, CortexA320UnitALU>], (instrs RDFFRS_PPz)>;
+
+// Set first fault register
+// Write to first fault register
+def : InstRW<[CortexA320Write<1, CortexA320UnitALU>], (instrs SETFFR, WRFFR)>;
+
+// SVE Cryptographic instructions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^AES[DE]_ZZZ_B$",
+                                           "^AESI?MC_ZZ_B$")>;
+
+// Crypto SHA3 ops
+def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$",
+                                            "^XAR_ZZZI_[BHSD]$")>;
+
+def : InstRW<[CortexA320MC_RC0Write<9, CortexA320UnitVMC>], (instregex "^RAX1_ZZZ_D$")>;
+
+// Crypto SM4 ops
+def : InstRW<[CortexA320MC_RC0Write<9, CortexA320UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>;
+
+}
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A320-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A320-basic-instructions.s
new file mode 100644
index 000000000000..35b5d5b2ce43
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A320-basic-instructions.s
@@ -0,0 +1,3721 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a320 -instruction-tables < %s | FileCheck %s
+
+#------------------------------------------------------------------------------
+# Add/sub (immediate)
+#------------------------------------------------------------------------------
+
+add      w2, w3, #4095
+add      w30, w29, #1, lsl #12
+add      w13, w5, #4095, lsl #12
+add      x5, x7, #1638
+add      w20, wsp, #801
+add      wsp, wsp, #1104
+add      wsp, w30, #4084
+add      x0, x24, #291
+add      x3, x24, #4095, lsl #12
+add      x8, sp, #1074
+add      sp, x29, #3816
+sub      w0, wsp, #4077
+sub      w4, w20, #546, lsl #12
+sub      sp, sp, #288
+sub      wsp, w19, #16
+adds     w13, w23, #291, lsl #12
+cmn      w2, #4095
+adds     w20, wsp, #0
+cmn      x3, #1, lsl #12
+cmp      sp, #20, lsl #12
+cmp      x30, #4095
+subs     x4, sp, #3822
+cmn      w3, #291, lsl #12
+cmn      wsp, #1365
+cmn      sp, #1092, lsl #12
+mov      sp, x30
+mov      wsp, w20
+mov      x11, sp
+mov      w24, wsp
+
+#------------------------------------------------------------------------------
+# Add-subtract (shifted register)
+#------------------------------------------------------------------------------
+
+add      w3, w5, w7
+add      wzr, w3, w5
+add      w20, wzr, w4
+add      w4, w6, wzr
+add      w11, w13, w15
+add      w9, w3, wzr, lsl #10
+add      w17, w29, w20, lsl #31
+add      w21, w22, w23, lsr #0
+add      w24, w25, w26, lsr #18
+add      w27, w28, w29, lsr #31
+add      w2, w3, w4, asr #0
+add      w5, w6, w7, asr #21
+add      w8, w9, w10, asr #31
+add      x3, x5, x7
+add      xzr, x3, x5
+add      x20, xzr, x4
+add      x4, x6, xzr
+add      x11, x13, x15
+add      x9, x3, xzr, lsl #10
+add      x17, x29, x20, lsl #63
+add      x21, x22, x23, lsr #0
+add      x24, x25, x26, lsr #18
+add      x27, x28, x29, lsr #63
+add      x2, x3, x4, asr #0
+add      x5, x6, x7, asr #21
+add      x8, x9, x10, asr #63
+adds     w3, w5, w7
+cmn      w3, w5
+adds     w20, wzr, w4
+adds     w4, w6, wzr
+adds     w11, w13, w15
+adds     w9, w3, wzr, lsl #10
+adds     w17, w29, w20, lsl #31
+adds     w21, w22, w23, lsr #0
+adds     w24, w25, w26, lsr #18
+adds     w27, w28, w29, lsr #31
+adds     w2, w3, w4, asr #0
+adds     w5, w6, w7, asr #21
+adds     w8, w9, w10, asr #31
+adds     x3, x5, x7
+cmn      x3, x5
+adds     x20, xzr, x4
+adds     x4, x6, xzr
+adds     x11, x13, x15
+adds     x9, x3, xzr, lsl #10
+adds     x17, x29, x20, lsl #63
+adds     x21, x22, x23, lsr #0
+adds     x24, x25, x26, lsr #18
+adds     x27, x28, x29, lsr #63
+adds     x2, x3, x4, asr #0
+adds     x5, x6, x7, asr #21
+adds     x8, x9, x10, asr #63
+sub      w3, w5, w7
+sub      wzr, w3, w5
+sub      w4, w6, wzr
+sub      w11, w13, w15
+sub      w9, w3, wzr, lsl #10
+sub      w17, w29, w20, lsl #31
+sub      w21, w22, w23, lsr #0
+sub      w24, w25, w26, lsr #18
+sub      w27, w28, w29, lsr #31
+sub      w2, w3, w4, asr #0
+sub      w5, w6, w7, asr #21
+sub      w8, w9, w10, asr #31
+sub      x3, x5, x7
+sub      xzr, x3, x5
+sub      x4, x6, xzr
+sub      x11, x13, x15
+sub      x9, x3, xzr, lsl #10
+sub      x17, x29, x20, lsl #63
+sub      x21, x22, x23, lsr #0
+sub      x24, x25, x26, lsr #18
+sub      x27, x28, x29, lsr #63
+sub      x2, x3, x4, asr #0
+sub      x5, x6, x7, asr #21
+sub      x8, x9, x10, asr #63
+subs     w3, w5, w7
+cmp      w3, w5
+subs     w4, w6, wzr
+subs     w11, w13, w15
+subs     w9, w3, wzr, lsl #10
+subs     w17, w29, w20, lsl #31
+subs     w21, w22, w23, lsr #0
+subs     w24, w25, w26, lsr #18
+subs     w27, w28, w29, lsr #31
+subs     w2, w3, w4, asr #0
+subs     w5, w6, w7, asr #21
+subs     w8, w9, w10, asr #31
+subs     x3, x5, x7
+cmp      x3, x5
+subs     x4, x6, xzr
+subs     x11, x13, x15
+subs     x9, x3, xzr, lsl #10
+subs     x17, x29, x20, lsl #63
+subs     x21, x22, x23, lsr #0
+subs     x24, x25, x26, lsr #18
+subs     x27, x28, x29, lsr #63
+subs     x2, x3, x4, asr #0
+subs     x5, x6, x7, asr #21
+subs     x8, x9, x10, asr #63
+cmn      wzr, w4
+cmn      w5, wzr
+cmn      w6, w7
+cmn      w8, w9, lsl #15
+cmn      w10, w11, lsl #31
+cmn      w12, w13, lsr #0
+cmn      w14, w15, lsr #21
+cmn      w16, w17, lsr #31
+cmn      w18, w19, asr #0
+cmn      w20, w21, asr #22
+cmn      w22, w23, asr #31
+cmn      x0, x3
+cmn      xzr, x4
+cmn      x5, xzr
+cmn      x6, x7
+cmn      x8, x9, lsl #15
+cmn      x10, x11, lsl #63
+cmn      x12, x13, lsr #0
+cmn      x14, x15, lsr #41
+cmn      x16, x17, lsr #63
+cmn      x18, x19, asr #0
+cmn      x20, x21, asr #55
+cmn      x22, x23, asr #63
+cmp      w0, w3
+cmp      wzr, w4
+cmp      w5, wzr
+cmp      w6, w7
+cmp      w8, w9, lsl #15
+cmp      w10, w11, lsl #31
+cmp      w12, w13, lsr #0
+cmp      w14, w15, lsr #21
+cmp      w18, w19, asr #0
+cmp      w20, w21, asr #22
+cmp      w22, w23, asr #31
+cmp      x0, x3
+cmp      xzr, x4
+cmp      x5, xzr
+cmp      x6, x7
+cmp      x8, x9, lsl #15
+cmp      x10, x11, lsl #63
+cmp      x12, x13, lsr #0
+cmp      x14, x15, lsr #41
+cmp      x16, x17, lsr #63
+cmp      x18, x19, asr #0
+cmp      x20, x21, asr #55
+cmp      x22, x23, asr #63
+cmp      wzr, w0
+cmp      xzr, x0
+
+#------------------------------------------------------------------------------
+# Add-subtract (shifted register)
+#------------------------------------------------------------------------------
+
+adc      w29, w27, w25
+adc      wzr, w3, w4
+adc      w9, wzr, w10
+adc      w20, w0, wzr
+adc      x29, x27, x25
+adc      xzr, x3, x4
+adc      x9, xzr, x10
+adc      x20, x0, xzr
+adcs     w29, w27, w25
+adcs     wzr, w3, w4
+adcs     w9, wzr, w10
+adcs     w20, w0, wzr
+adcs     x29, x27, x25
+adcs     xzr, x3, x4
+adcs     x9, xzr, x10
+adcs     x20, x0, xzr
+sbc      w29, w27, w25
+sbc      wzr, w3, w4
+ngc      w9, w10
+sbc      w20, w0, wzr
+sbc      x29, x27, x25
+sbc      xzr, x3, x4
+ngc      x9, x10
+sbc      x20, x0, xzr
+sbcs     w29, w27, w25
+sbcs     wzr, w3, w4
+ngcs     w9, w10
+sbcs     w20, w0, wzr
+sbcs     x29, x27, x25
+sbcs     xzr, x3, x4
+ngcs     x9, x10
+sbcs     x20, x0, xzr
+ngc      w3, w12
+ngc      wzr, w9
+ngc      w23, wzr
+ngc      x29, x30
+ngc      xzr, x0
+ngc      x0, xzr
+ngcs     w3, w12
+ngcs     wzr, w9
+ngcs     w23, wzr
+ngcs     x29, x30
+ngcs     xzr, x0
+ngcs     x0, xzr
+
+#------------------------------------------------------------------------------
+# Compare and branch (immediate)
+#------------------------------------------------------------------------------
+
+sbfx     x1, x2, #3, #2
+asr      x3, x4, #63
+asr      wzr, wzr, #31
+sbfx     w12, w9, #0, #1
+ubfiz    x4, x5, #52, #11
+ubfx     xzr, x4, #0, #1
+ubfiz    x4, xzr, #1, #6
+lsr      x5, x6, #12
+bfi      x4, x5, #52, #11
+bfxil    xzr, x4, #0, #1
+bfi      x4, xzr, #1, #6
+bfxil    x5, x6, #12, #52
+sxtb     w1, w2
+sxtb     xzr, w3
+sxth     w9, w10
+sxth     x0, w1
+sxtw     x3, w30
+uxtb     w1, w2
+uxth     w9, w10
+ubfx     x3, x30, #0, #32
+asr      w3, w2, #0
+asr      w9, w10, #31
+asr      x20, x21, #63
+asr      w1, wzr, #3
+lsr      w3, w2, #0
+lsr      w9, w10, #31
+lsr      x20, x21, #63
+lsr      wzr, wzr, #3
+lsr      w3, w2, #0
+lsl      w9, w10, #31
+lsl      x20, x21, #63
+lsl      w1, wzr, #3
+sbfx     w9, w10, #0, #1
+sbfiz    x2, x3, #63, #1
+asr      x19, x20, #0
+sbfiz    x9, x10, #5, #59
+asr      w9, w10, #0
+sbfiz    w11, w12, #31, #1
+sbfiz    w13, w14, #29, #3
+sbfiz    xzr, xzr, #10, #11
+sbfx     w9, w10, #0, #1
+asr      x2, x3, #63
+asr      x19, x20, #0
+asr      x9, x10, #5
+asr      w9, w10, #0
+asr      w11, w12, #31
+asr      w13, w14, #29
+sbfx     xzr, xzr, #10, #11
+bfxil    w9, w10, #0, #1
+bfi      x2, x3, #63, #1
+bfxil    x19, x20, #0, #64
+bfi      x9, x10, #5, #59
+bfxil    w9, w10, #0, #32
+bfi      w11, w12, #31, #1
+bfi      w13, w14, #29, #3
+bfi      xzr, xzr, #10, #11
+bfxil    w9, w10, #0, #1
+bfxil    x2, x3, #63, #1
+bfxil    x19, x20, #0, #64
+bfxil    x9, x10, #5, #59
+bfxil    w9, w10, #0, #32
+bfxil    w11, w12, #31, #1
+bfxil    w13, w14, #29, #3
+bfxil    xzr, xzr, #10, #11
+ubfx     w9, w10, #0, #1
+lsl      x2, x3, #63
+lsr      x19, x20, #0
+lsl      x9, x10, #5
+lsr      w9, w10, #0
+lsl      w11, w12, #31
+lsl      w13, w14, #29
+ubfiz    xzr, xzr, #10, #11
+ubfx     w9, w10, #0, #1
+lsr      x2, x3, #63
+lsr      x19, x20, #0
+lsr      x9, x10, #5
+lsr      w9, w10, #0
+lsr      w11, w12, #31
+lsr      w13, w14, #29
+ubfx     xzr, xzr, #10, #11
+
+#------------------------------------------------------------------------------
+# Compare and branch (immediate)
+#------------------------------------------------------------------------------
+
+cbz      w5, #4
+cbz      x5, #0
+cbnz     x2, #-4
+cbnz     x26, #1048572
+cbz      wzr, #0
+cbnz     xzr, #0
+
+#------------------------------------------------------------------------------
+# Conditional branch (immediate)
+#------------------------------------------------------------------------------
+
+b.ne #4
+b.ge #1048572
+b.ge #-4
+
+#------------------------------------------------------------------------------
+# Conditional compare (immediate)
+#------------------------------------------------------------------------------
+
+ccmp w1, #31, #0, eq
+ccmp w3, #0, #15, hs
+ccmp wzr, #15, #13, hs
+ccmp x9, #31, #0, le
+ccmp x3, #0, #15, gt
+ccmp xzr, #5, #7, ne
+ccmn w1, #31, #0, eq
+ccmn w3, #0, #15, hs
+ccmn wzr, #15, #13, hs
+ccmn x9, #31, #0, le
+ccmn x3, #0, #15, gt
+ccmn xzr, #5, #7, ne
+
+#------------------------------------------------------------------------------
+# Conditional compare (register)
+#------------------------------------------------------------------------------
+
+ccmp w1, wzr, #0, eq
+ccmp w3, w0, #15, hs
+ccmp wzr, w15, #13, hs
+ccmp x9, xzr, #0, le
+ccmp x3, x0, #15, gt
+ccmp xzr, x5, #7, ne
+ccmn w1, wzr, #0, eq
+ccmn w3, w0, #15, hs
+ccmn wzr, w15, #13, hs
+ccmn x9, xzr, #0, le
+ccmn x3, x0, #15, gt
+ccmn xzr, x5, #7, ne
+
+#------------------------------------------------------------------------------
+# Conditional branch (immediate)
+#------------------------------------------------------------------------------
+
+csel     w1, w0, w19, ne
+csel     wzr, w5, w9, eq
+csel     w9, wzr, w30, gt
+csel     w1, w28, wzr, mi
+csel     x19, x23, x29, lt
+csel     xzr, x3, x4, ge
+csel     x5, xzr, x6, hs
+csel     x7, x8, xzr, lo
+csinc    w1, w0, w19, ne
+csinc    wzr, w5, w9, eq
+csinc    w9, wzr, w30, gt
+csinc    w1, w28, wzr, mi
+csinc    x19, x23, x29, lt
+csinc    xzr, x3, x4, ge
+csinc    x5, xzr, x6, hs
+csinc    x7, x8, xzr, lo
+csinv    w1, w0, w19, ne
+csinv    wzr, w5, w9, eq
+csinv    w9, wzr, w30, gt
+csinv    w1, w28, wzr, mi
+csinv    x19, x23, x29, lt
+csinv    xzr, x3, x4, ge
+csinv    x5, xzr, x6, hs
+csinv    x7, x8, xzr, lo
+csneg    w1, w0, w19, ne
+csneg    wzr, w5, w9, eq
+csneg    w9, wzr, w30, gt
+csneg    w1, w28, wzr, mi
+csneg    x19, x23, x29, lt
+csneg    xzr, x3, x4, ge
+csneg    x5, xzr, x6, hs
+csneg    x7, x8, xzr, lo
+cset    w3, eq
+cset    x9, pl
+csetm    w20, ne
+csetm    x30, ge
+csinc    w2, wzr, wzr, al
+csinv    x3, xzr, xzr, nv
+cinc    w3, w5, gt
+cinc    wzr, w4, le
+cset    w9, lt
+cinc    x3, x5, gt
+cinc    xzr, x4, le
+cset    x9, lt
+csinc   w5, w6, w6, nv
+csinc   x1, x2, x2, al
+cinv    w3, w5, gt
+cinv    wzr, w4, le
+csetm   w9, lt
+cinv    x3, x5, gt
+cinv    xzr, x4, le
+csetm   x9, lt
+csinv   x1, x0, x0, al
+csinv   w9, w8, w8, nv
+cneg     w3, w5, gt
+cneg     wzr, w4, le
+cneg     w9, wzr, lt
+cneg     x3, x5, gt
+cneg     xzr, x4, le
+cneg     x9, xzr, lt
+csneg    x4, x8, x8, al
+csinv    w9, w8, w8, nv
+
+#------------------------------------------------------------------------------
+# Data-processing (1 source)
+#------------------------------------------------------------------------------
+
+rbit	w0, w7
+rbit   x18, x3
+rev16	w17, w1
+rev16	x5, x2
+rev	w18, w0
+rev32	x20, x1
+rev	x22, x2
+clz	w24, w3
+clz	x26, x4
+cls	w3, w5
+cls	x20, x5
+
+#------------------------------------------------------------------------------
+# Data-processing (2 source)
+#------------------------------------------------------------------------------
+
+udiv	w0, w7, w10
+udiv	x9, x22, x4
+sdiv	w12, w21, w0
+sdiv	x13, x2, x1
+lsl	w11, w12, w13
+lsl	x14, x15, x16
+lsr	w17, w18, w19
+lsr	x20, x21, x22
+asr	w23, w24, w25
+asr	x26, x27, x28
+ror	w0, w1, w2
+ror    x3, x4, x5
+lsl	w6, w7, w8
+lsl	x9, x10, x11
+lsr	w12, w13, w14
+lsr	x15, x16, x17
+asr	w18, w19, w20
+asr	x21, x22, x23
+ror	w24, w25, w26
+ror	x27, x28, x29
+
+#------------------------------------------------------------------------------
+# Data-processing (3 sources)
+#------------------------------------------------------------------------------
+
+smulh    x30, x29, x28
+smulh    xzr, x27, x26
+umulh    x30, x29, x28
+umulh    x23, x30, xzr
+madd     w1, w3, w7, w4
+madd     wzr, w0, w9, w11
+madd     w13, wzr, w4, w4
+madd     w19, w30, wzr, w29
+mul      w4, w5, w6
+madd     x1, x3, x7, x4
+madd     xzr, x0, x9, x11
+madd     x13, xzr, x4, x4
+madd     x19, x30, xzr, x29
+mul      x4, x5, x6
+msub     w1, w3, w7, w4
+msub     wzr, w0, w9, w11
+msub     w13, wzr, w4, w4
+msub     w19, w30, wzr, w29
+mneg     w4, w5, w6
+msub     x1, x3, x7, x4
+msub     xzr, x0, x9, x11
+msub     x13, xzr, x4, x4
+msub     x19, x30, xzr, x29
+mneg     x4, x5, x6
+smaddl   x3, w5, w2, x9
+smaddl   xzr, w10, w11, x12
+smaddl   x13, wzr, w14, x15
+smaddl   x16, w17, wzr, x18
+smull    x19, w20, w21
+smsubl   x3, w5, w2, x9
+smsubl   xzr, w10, w11, x12
+smsubl   x13, wzr, w14, x15
+smsubl   x16, w17, wzr, x18
+smnegl   x19, w20, w21
+umaddl   x3, w5, w2, x9
+umaddl   xzr, w10, w11, x12
+umaddl   x13, wzr, w14, x15
+umaddl   x16, w17, wzr, x18
+umull    x19, w20, w21
+umsubl   x3, w5, w2, x9
+umsubl   x16, w17, wzr, x18
+umnegl   x19, w20, w21
+smulh    x30, x29, x28
+smulh    x23, x22, xzr
+umulh    x23, x22, xzr
+mul      x19, x20, xzr
+mneg     w21, w22, w23
+smull    x11, w13, w17
+umull    x11, w13, w17
+smnegl   x11, w13, w17
+umnegl   x11, w13, w17
+
+#------------------------------------------------------------------------------
+# Extract (immediate)
+#------------------------------------------------------------------------------
+
+extr     w3, w5, w7, #0
+extr     w11, w13, w17, #31
+extr     x3, x5, x7, #15
+extr     x11, x13, x17, #63
+ror     x19, x23, #24
+ror     x29, xzr, #63
+ror     w9, w13, #31
+
+#------------------------------------------------------------------------------
+# Floating-point compare
+#------------------------------------------------------------------------------
+
+fcmp    s3, s5
+fcmp    s31, #0.0
+fcmp    s31, #0.0
+fcmpe   s29, s30
+fcmpe   s15, #0.0
+fcmpe   s15, #0.0
+fcmp    d4, d12
+fcmp    d23, #0.0
+fcmp    d23, #0.0
+fcmpe   d26, d22
+fcmpe   d29, #0.0
+fcmpe   d29, #0.0
+
+#------------------------------------------------------------------------------
+# Floating-point conditional compare
+#------------------------------------------------------------------------------
+
+fccmp s1, s31, #0, eq
+fccmp s3, s0, #15, hs
+fccmp s31, s15, #13, hs
+fccmp d9, d31, #0, le
+fccmp d3, d0, #15, gt
+fccmp d31, d5, #7, ne
+fccmpe s1, s31, #0, eq
+fccmpe s3, s0, #15, hs
+fccmpe s31, s15, #13, hs
+fccmpe d9, d31, #0, le
+fccmpe d3, d0, #15, gt
+fccmpe d31, d5, #7, ne
+
+#-------------------------------------------------------------------------------
+# Floating-point conditional compare
+#-------------------------------------------------------------------------------
+
+fcsel s3, s20, s9, pl
+fcsel d9, d10, d11, mi
+
+#------------------------------------------------------------------------------
+# Floating-point data-processing (1 source)
+#------------------------------------------------------------------------------
+
+fmov     s0, s1
+fabs     s2, s3
+fneg     s4, s5
+fsqrt    s6, s7
+fcvt     d8, s9
+fcvt     h10, s11
+frintn   s12, s13
+frintp   s14, s15
+frintm   s16, s17
+frintz   s18, s19
+frinta   s20, s21
+frintx   s22, s23
+frinti   s24, s25
+fmov     d0, d1
+fabs     d2, d3
+fneg     d4, d5
+fsqrt    d6, d7
+fcvt     s8, d9
+fcvt     h10, d11
+frintn   d12, d13
+frintp   d14, d15
+frintm   d16, d17
+frintz   d18, d19
+frinta   d20, d21
+frintx   d22, d23
+frinti   d24, d25
+fcvt     s26, h27
+fcvt     d28, h29
+
+#------------------------------------------------------------------------------
+# Floating-point data-processing (2 sources)
+#------------------------------------------------------------------------------
+
+fmul     s20, s19, s17
+fdiv     s1, s2, s3
+fadd     s4, s5, s6
+fsub     s7, s8, s9
+fmax     s10, s11, s12
+fmin     s13, s14, s15
+fmaxnm   s16, s17, s18
+fminnm   s19, s20, s21
+fnmul    s22, s23, s2
+fmul     d20, d19, d17
+fdiv     d1, d2, d3
+fadd     d4, d5, d6
+fsub     d7, d8, d9
+fmax     d10, d11, d12
+fmin     d13, d14, d15
+fmaxnm   d16, d17, d18
+fminnm   d19, d20, d21
+fnmul    d22, d23, d24
+
+#------------------------------------------------------------------------------
+# Floating-point data-processing (1 source)
+#------------------------------------------------------------------------------
+
+fmadd s3, s5, s6, s31
+fmadd d3, d13, d0, d23
+fmsub s3, s5, s6, s31
+fmsub d3, d13, d0, d23
+fnmadd s3, s5, s6, s31
+fnmadd d3, d13, d0, d23
+fnmsub s3, s5, s6, s31
+fnmsub d3, d13, d0, d23
+
+#------------------------------------------------------------------------------
+# Floating-point <-> fixed-point conversion
+#------------------------------------------------------------------------------
+
+fcvtzs  w3, h5, #1
+fcvtzs  wzr, h20, #13
+fcvtzs  w19, h0, #32
+fcvtzs  x3, h5, #1
+fcvtzs  x12, h30, #45
+fcvtzs  x19, h0, #64
+fcvtzs  w3, s5, #1
+fcvtzs  wzr, s20, #13
+fcvtzs  w19, s0, #32
+fcvtzs  x3, s5, #1
+fcvtzs  x12, s30, #45
+fcvtzs  x19, s0, #64
+fcvtzs  w3, d5, #1
+fcvtzs  wzr, d20, #13
+fcvtzs  w19, d0, #32
+fcvtzs  x3, d5, #1
+fcvtzs  x12, d30, #45
+fcvtzs  x19, d0, #64
+fcvtzu  w3, h5, #1
+fcvtzu  wzr, h20, #13
+fcvtzu  w19, h0, #32
+fcvtzu  x3, h5, #1
+fcvtzu  x12, h30, #45
+fcvtzu  x19, h0, #64
+fcvtzu  w3, s5, #1
+fcvtzu  wzr, s20, #13
+fcvtzu  w19, s0, #32
+fcvtzu  x3, s5, #1
+fcvtzu  x12, s30, #45
+fcvtzu  x19, s0, #64
+fcvtzu  w3, d5, #1
+fcvtzu  wzr, d20, #13
+fcvtzu  w19, d0, #32
+fcvtzu  x3, d5, #1
+fcvtzu  x12, d30, #45
+fcvtzu  x19, d0, #64
+scvtf   h23, w19, #1
+scvtf   h31, wzr, #20
+scvtf   h14, w0, #32
+scvtf   h23, x19, #1
+scvtf   h31, xzr, #20
+scvtf   h14, x0, #64
+scvtf   s23, w19, #1
+scvtf   s31, wzr, #20
+scvtf   s14, w0, #32
+scvtf   s23, x19, #1
+scvtf   s31, xzr, #20
+scvtf   s14, x0, #64
+scvtf   d23, w19, #1
+scvtf   d31, wzr, #20
+scvtf   d14, w0, #32
+scvtf   d23, x19, #1
+scvtf   d31, xzr, #20
+scvtf   d14, x0, #64
+ucvtf   h23, w19, #1
+ucvtf   h31, wzr, #20
+ucvtf   h14, w0, #32
+ucvtf   h23, x19, #1
+ucvtf   h31, xzr, #20
+ucvtf   h14, x0, #64
+ucvtf   s23, w19, #1
+ucvtf   s31, wzr, #20
+ucvtf   s14, w0, #32
+ucvtf   s23, x19, #1
+ucvtf   s31, xzr, #20
+ucvtf   s14, x0, #64
+ucvtf   d23, w19, #1
+ucvtf   d31, wzr, #20
+ucvtf   d14, w0, #32
+ucvtf   d23, x19, #1
+ucvtf   d31, xzr, #20
+ucvtf   d14, x0, #64
+
+#------------------------------------------------------------------------------
+# Floating-point <-> integer conversion
+#------------------------------------------------------------------------------
+
+fcvtns   w3, h31
+fcvtns   xzr, h12
+fcvtnu   wzr, h12
+fcvtnu   x0, h0
+fcvtps   wzr, h9
+fcvtps   x12, h20
+fcvtpu   w30, h23
+fcvtpu   x29, h3
+fcvtms   w2, h3
+fcvtms   x4, h5
+fcvtmu   w6, h7
+fcvtmu   x8, h9
+fcvtzs   w10, h11
+fcvtzs   x12, h13
+fcvtzu   w14, h15
+fcvtzu   x15, h16
+scvtf    h17, w18
+scvtf    h19, x20
+ucvtf    h21, w22
+scvtf    h23, x24
+fcvtas   w25, h26
+fcvtas   x27, h28
+fcvtau   w29, h30
+fcvtau   xzr, h0
+fcvtns   w3, s31
+fcvtns   xzr, s12
+fcvtnu   wzr, s12
+fcvtnu   x0, s0
+fcvtps   wzr, s9
+fcvtps   x12, s20
+fcvtpu   w30, s23
+fcvtpu   x29, s3
+fcvtms   w2, s3
+fcvtms   x4, s5
+fcvtmu   w6, s7
+fcvtmu   x8, s9
+fcvtzs   w10, s11
+fcvtzs   x12, s13
+fcvtzu   w14, s15
+fcvtzu   x15, s16
+scvtf    s17, w18
+scvtf    s19, x20
+ucvtf    s21, w22
+scvtf    s23, x24
+fcvtas   w25, s26
+fcvtas   x27, s28
+fcvtau   w29, s30
+fcvtau   xzr, s0
+fcvtns   w3, d31
+fcvtns   xzr, d12
+fcvtnu   wzr, d12
+fcvtnu   x0, d0
+fcvtps   wzr, d9
+fcvtps   x12, d20
+fcvtpu   w30, d23
+fcvtpu   x29, d3
+fcvtms   w2, d3
+fcvtms   x4, d5
+fcvtmu   w6, d7
+fcvtmu   x8, d9
+fcvtzs   w10, d11
+fcvtzs   x12, d13
+fcvtzu   w14, d15
+fcvtzu   x15, d16
+scvtf    d17, w18
+scvtf    d19, x20
+ucvtf    d21, w22
+ucvtf    d23, x24
+fcvtas   w25, d26
+fcvtas   x27, d28
+fcvtau   w29, d30
+fcvtau   xzr, d0
+fmov     w3, s9
+fmov     s9, w3
+fmov     x20, d31
+fmov     d1, x15
+fmov     x3, v12.d[1]
+fmov     v1.d[1], x19
+
+#------------------------------------------------------------------------------
+# Floating-point immediate
+#------------------------------------------------------------------------------
+
+fmov     s2, #0.12500000
+fmov     s3, #1.00000000
+fmov     d30, #16.00000000
+fmov     s4, #1.06250000
+fmov     d10, #1.93750000
+fmov     s12, #-1.00000000
+fmov     d16, #8.50000000
+
+#------------------------------------------------------------------------------
+# Load-register (literal)
+#------------------------------------------------------------------------------
+
+ldr       w3, #0
+ldr       x29, #4
+ldrsw     xzr, #-4
+ldr       s0, #8
+ldr       d0, #1048572
+ldr       q0, #-1048576
+prfm      pldl1strm, #0
+prfm      #22, #0
+
+#------------------------------------------------------------------------------
+# Load/store exclusive
+#------------------------------------------------------------------------------
+
+stxrb      w18, w8, [sp]
+stxrh      w24, w15, [x16]
+stxr       w5, w6, [x17]
+stxr       w1, x10, [x21]
+ldxrb      w30, [x0]
+ldxrh      w17, [x4]
+ldxr       w22, [sp]
+ldxr       x11, [x29]
+ldxr       x11, [x29]
+ldxr       x11, [x29]
+stxp       w12, w11, w10, [sp]
+stxp       wzr, x27, x9, [x12]
+ldxp       w0, wzr, [sp]
+ldxp       x17, x0, [x18]
+ldxp       x17, x0, [x18]
+stlxrb     w12, w22, [x0]
+stlxrh     w10, w1, [x1]
+stlxr      w9, w2, [x2]
+stlxr      w9, x3, [sp]
+ldaxrb     w8, [x4]
+ldaxrh     w7, [x5]
+ldaxr      w6, [sp]
+ldaxr      x5, [x6]
+ldaxr      x5, [x6]
+ldaxr      x5, [x6]
+stlxp      w4, w5, w6, [sp]
+stlxp      wzr, x6, x7, [x1]
+ldaxp      w5, w18, [sp]
+ldaxp      x6, x19, [x22]
+ldaxp      x6, x19, [x22]
+stlrb      w24, [sp]
+stlrh      w25, [x30]
+stlr       w26, [x29]
+stlr       x27, [x28]
+stlr       x27, [x28]
+stlr       x27, [x28]
+ldarb      w23, [sp]
+ldarh      w22, [x30]
+ldar       wzr, [x29]
+ldar       x21, [x28]
+ldar       x21, [x28]
+ldar       x21, [x28]
+
+#------------------------------------------------------------------------------
+# Load/store (unscaled  immediate)
+#------------------------------------------------------------------------------
+
+sturb    w9, [sp]
+sturh    wzr, [x12, #255]
+stur     w16, [x0, #-256]
+stur     x28, [x14, #1]
+ldurb    w1, [x20, #255]
+ldurh    w20, [x1, #255]
+ldur     w12, [sp, #255]
+ldur     xzr, [x12, #255]
+ldursb   x9, [x7, #-256]
+ldursh   x17, [x19, #-256]
+ldursw   x20, [x15, #-256]
+prfum    pldl2keep, [sp, #-256]
+ldursb   w19, [x1, #-256]
+ldursh   w15, [x21, #-256]
+stur     b0, [sp, #1]
+stur     h12, [x12, #-1]
+stur     s15, [x0, #255]
+stur     d31, [x5, #25]
+stur     q9, [x5]
+ldur     b3, [sp]
+ldur     h5, [x4, #-256]
+ldur     s7, [x12, #-1]
+ldur     d11, [x19, #4]
+ldur     q13, [x1, #2]
+
+#------------------------------------------------------------------------------
+# Load/store (immediate post-indexed)
+#------------------------------------------------------------------------------
+
+strb     w9, [x2], #255
+strb     w10, [x3], #1
+strb     w10, [x3], #-256
+strh     w9, [x2], #255
+strh     w9, [x2], #1
+strh     w10, [x3], #-256
+str      w19, [sp], #255
+str      w20, [x30], #1
+str      w21, [x12], #-256
+str      xzr, [x9], #255
+str      x2, [x3], #1
+str      x19, [x12], #-256
+ldrb     w9, [x2], #255
+ldrb     w10, [x3], #1
+ldrb     w10, [x3], #-256
+ldrh     w9, [x2], #255
+ldrh     w9, [x2], #1
+ldrh     w10, [x3], #-256
+ldr      w19, [sp], #255
+ldr      w20, [x30], #1
+ldr      w21, [x12], #-256
+ldr      xzr, [x9], #255
+ldr      x2, [x3], #1
+ldr      x19, [x12], #-256
+ldrsb    xzr, [x9], #255
+ldrsb    x2, [x3], #1
+ldrsb    x19, [x12], #-256
+ldrsh    xzr, [x9], #255
+ldrsh    x2, [x3], #1
+ldrsh    x19, [x12], #-256
+ldrsw    xzr, [x9], #255
+ldrsw    x2, [x3], #1
+ldrsw    x19, [x12], #-256
+ldrsb    wzr, [x9], #255
+ldrsb    w2, [x3], #1
+ldrsb    w19, [x12], #-256
+ldrsh    wzr, [x9], #255
+ldrsh    w2, [x3], #1
+ldrsh    w19, [x12], #-256
+str      b0, [x0], #255
+str      b3, [x3], #1
+str      b5, [sp], #-256
+str      h10, [x10], #255
+str      h13, [x23], #1
+str      h15, [sp], #-256
+str      s20, [x20], #255
+str      s23, [x23], #1
+str      s25, [x0], #-256
+str      d20, [x20], #255
+str      d23, [x23], #1
+str      d25, [x0], #-256
+ldr      b0, [x0], #255
+ldr      b3, [x3], #1
+ldr      b5, [sp], #-256
+ldr      h10, [x10], #255
+ldr      h13, [x23], #1
+ldr      h15, [sp], #-256
+ldr      s20, [x20], #255
+ldr      s23, [x23], #1
+ldr      s25, [x0], #-256
+ldr      d20, [x20], #255
+ldr      d23, [x23], #1
+ldr      d25, [x0], #-256
+ldr      q20, [x1], #255
+ldr      q23, [x9], #1
+ldr      q25, [x20], #-256
+str      q10, [x1], #255
+str      q22, [sp], #1
+str      q21, [x20], #-256
+
+#-------------------------------------------------------------------------------
+# Load-store register (immediate pre-indexed)
+#-------------------------------------------------------------------------------
+
+ldr      x3, [x4, #0]!
+strb     w9, [x2, #255]!
+strb     w10, [x3, #1]!
+strb     w10, [x3, #-256]!
+strh     w9, [x2, #255]!
+strh     w9, [x2, #1]!
+strh     w10, [x3, #-256]!
+str      w19, [sp, #255]!
+str      w20, [x30, #1]!
+str      w21, [x12, #-256]!
+str      xzr, [x9, #255]!
+str      x2, [x3, #1]!
+str      x19, [x12, #-256]!
+ldrb     w9, [x2, #255]!
+ldrb     w10, [x3, #1]!
+ldrb     w10, [x3, #-256]!
+ldrh     w9, [x2, #255]!
+ldrh     w9, [x2, #1]!
+ldrh     w10, [x3, #-256]!
+ldr      w19, [sp, #255]!
+ldr      w20, [x30, #1]!
+ldr      w21, [x12, #-256]!
+ldr      xzr, [x9, #255]!
+ldr      x2, [x3, #1]!
+ldr      x19, [x12, #-256]!
+ldrsb    xzr, [x9, #255]!
+ldrsb    x2, [x3, #1]!
+ldrsb    x19, [x12, #-256]!
+ldrsh    xzr, [x9, #255]!
+ldrsh    x2, [x3, #1]!
+ldrsh    x19, [x12, #-256]!
+ldrsw    xzr, [x9, #255]!
+ldrsw    x2, [x3, #1]!
+ldrsw    x19, [x12, #-256]!
+ldrsb    wzr, [x9, #255]!
+ldrsb    w2, [x3, #1]!
+ldrsb    w19, [x12, #-256]!
+ldrsh    wzr, [x9, #255]!
+ldrsh    w2, [x3, #1]!
+ldrsh    w19, [x12, #-256]!
+str      b0, [x0, #255]!
+str      b3, [x3, #1]!
+str      b5, [sp, #-256]!
+str      h10, [x10, #255]!
+str      h13, [x23, #1]!
+str      h15, [sp, #-256]!
+str      s20, [x20, #255]!
+str      s23, [x23, #1]!
+str      s25, [x0, #-256]!
+str      d20, [x20, #255]!
+str      d23, [x23, #1]!
+str      d25, [x0, #-256]!
+ldr      b0, [x0, #255]!
+ldr      b3, [x3, #1]!
+ldr      b5, [sp, #-256]!
+ldr      h10, [x10, #255]!
+ldr      h13, [x23, #1]!
+ldr      h15, [sp, #-256]!
+ldr      s20, [x20, #255]!
+ldr      s23, [x23, #1]!
+ldr      s25, [x0, #-256]!
+ldr      d20, [x20, #255]!
+ldr      d23, [x23, #1]!
+ldr      d25, [x0, #-256]!
+ldr      q20, [x1, #255]!
+ldr      q23, [x9, #1]!
+ldr      q25, [x20, #-256]!
+str      q10, [x1, #255]!
+str      q22, [sp, #1]!
+str      q21, [x20, #-256]!
+
+#------------------------------------------------------------------------------
+# Load/store (unprivileged)
+#------------------------------------------------------------------------------
+
+sttrb    w9, [sp]
+sttrh    wzr, [x12, #255]
+sttr     w16, [x0, #-256]
+sttr     x28, [x14, #1]
+ldtrb    w1, [x20, #255]
+ldtrh    w20, [x1, #255]
+ldtr     w12, [sp, #255]
+ldtr     xzr, [x12, #255]
+ldtrsb   x9, [x7, #-256]
+ldtrsh   x17, [x19, #-256]
+ldtrsw   x20, [x15, #-256]
+ldtrsb   w19, [x1, #-256]
+ldtrsh   w15, [x21, #-256]
+
+#------------------------------------------------------------------------------
+# Load/store (unsigned  immediate)
+#------------------------------------------------------------------------------
+
+ldr      x4, [x29]
+ldr      x30, [x12, #32760]
+ldr      x20, [sp, #8]
+ldr      xzr, [sp]
+ldr      w2, [sp]
+ldr      w17, [sp, #16380]
+ldr      w13, [x2, #4]
+ldrsw    x2, [x5, #4]
+ldrsw    x23, [sp, #16380]
+ldrh     w2, [x4]
+ldrsh    w23, [x6, #8190]
+ldrsh    wzr, [sp, #2]
+ldrsh    x29, [x2, #2]
+ldrb     w26, [x3, #121]
+ldrb     w12, [x2]
+ldrsb    w27, [sp, #4095]
+ldrsb    xzr, [x15]
+str      x30, [sp]
+str      w20, [x4, #16380]
+strh     w17, [sp, #8190]
+strb     w23, [x3, #4095]
+strb     wzr, [x2]
+ldr      b31, [sp, #4095]
+ldr      h20, [x2, #8190]
+ldr      s10, [x19, #16380]
+ldr      d3, [x10, #32760]
+str      q12, [sp, #65520]
+
+#------------------------------------------------------------------------------
+# Load/store (register offset)
+#------------------------------------------------------------------------------
+
+ldrb     w3, [sp, x5]
+ldrb     w9, [x27, x6]
+ldrsb    w10, [x30, x7]
+ldrb     w11, [x29, x3, sxtx]
+strb     w12, [x28, xzr, sxtx]
+ldrb     w14, [x26, w6, uxtw]
+ldrsb    w15, [x25, w7, uxtw]
+ldrb     w17, [x23, w9, sxtw]
+ldrsb    x18, [x22, w10, sxtw]
+ldrsh    w3, [sp, x5]
+ldrsh    w9, [x27, x6]
+ldrh     w10, [x30, x7, lsl #1]
+strh     w11, [x29, x3, sxtx]
+ldrh     w12, [x28, xzr, sxtx]
+ldrsh    x13, [x27, x5, sxtx #1]
+ldrh     w14, [x26, w6, uxtw]
+ldrh     w15, [x25, w7, uxtw]
+ldrsh    w16, [x24, w8, uxtw #1]
+ldrh     w17, [x23, w9, sxtw]
+ldrh     w18, [x22, w10, sxtw]
+strh     w19, [x21, wzr, sxtw #1]
+ldr      w3, [sp, x5]
+ldr      s9, [x27, x6]
+ldr      w10, [x30, x7, lsl #2]
+ldr      w11, [x29, x3, sxtx]
+str      s12, [x28, xzr, sxtx]
+str      w13, [x27, x5, sxtx #2]
+str      w14, [x26, w6, uxtw]
+ldr      w15, [x25, w7, uxtw]
+ldr      w16, [x24, w8, uxtw #2]
+ldrsw    x17, [x23, w9, sxtw]
+ldr      w18, [x22, w10, sxtw]
+ldrsw    x19, [x21, wzr, sxtw #2]
+ldr      x3, [sp, x5]
+str      x9, [x27, x6]
+ldr      d10, [x30, x7, lsl #3]
+str      x11, [x29, x3, sxtx]
+ldr      x12, [x28, xzr, sxtx]
+ldr      x13, [x27, x5, sxtx #3]
+prfm     pldl1keep, [x26, w6, uxtw]
+ldr      x15, [x25, w7, uxtw]
+ldr      x16, [x24, w8, uxtw #3]
+ldr      x17, [x23, w9, sxtw]
+ldr      x18, [x22, w10, sxtw]
+str      d19, [x21, wzr, sxtw #3]
+ldr      q3, [sp, x5]
+ldr      q9, [x27, x6]
+ldr      q10, [x30, x7, lsl #4]
+str      q11, [x29, x3, sxtx]
+str      q12, [x28, xzr, sxtx]
+str      q13, [x27, x5, sxtx #4]
+ldr      q14, [x26, w6, uxtw]
+ldr      q15, [x25, w7, uxtw]
+ldr      q16, [x24, w8, uxtw #4]
+ldr      q17, [x23, w9, sxtw]
+str      q18, [x22, w10, sxtw]
+ldr      q19, [x21, wzr, sxtw #4]
+
+#------------------------------------------------------------------------------
+# Load/store register pair (offset)
+#------------------------------------------------------------------------------
+
+ldp      w3, w5, [sp]
+stp      wzr, w9, [sp, #252]
+ldp      w2, wzr, [sp, #-256]
+ldp      w9, w10, [sp, #4]
+ldpsw    x9, x10, [sp, #4]
+ldpsw    x9, x10, [x2, #-256]
+ldpsw    x20, x30, [sp, #252]
+ldp      x21, x29, [x2, #504]
+ldp      x22, x23, [x3, #-512]
+ldp      x24, x25, [x4, #8]
+ldp      s29, s28, [sp, #252]
+stp      s27, s26, [sp, #-256]
+ldp      s1, s2, [x3, #44]
+stp      d3, d5, [x9, #504]
+stp      d7, d11, [x10, #-512]
+ldp      d2, d3, [x30, #-8]
+stp      q3, q5, [sp]
+stp      q17, q19, [sp, #1008]
+ldp      q23, q29, [x1, #-1024]
+
+#------------------------------------------------------------------------------
+# Load/store register pair (post-indexed)
+#------------------------------------------------------------------------------
+
+ldp      w3, w5, [sp], #0
+stp      wzr, w9, [sp], #252
+ldp      w2, wzr, [sp], #-256
+ldp      w9, w10, [sp], #4
+ldpsw    x9, x10, [sp], #4
+ldpsw    x9, x10, [x2], #-256
+ldpsw    x20, x30, [sp], #252
+ldp      x21, x29, [x2], #504
+ldp      x22, x23, [x3], #-512
+ldp      x24, x25, [x4], #8
+ldp      s29, s28, [sp], #252
+stp      s27, s26, [sp], #-256
+ldp      s1, s2, [x3], #44
+stp      d3, d5, [x9], #504
+stp      d7, d11, [x10], #-512
+ldp      d2, d3, [x30], #-8
+stp      q3, q5, [sp], #0
+stp      q17, q19, [sp], #1008
+ldp      q23, q29, [x1], #-1024
+
+#------------------------------------------------------------------------------
+# Load/store register pair (pre-indexed)
+#------------------------------------------------------------------------------
+
+ldp      w3, w5, [sp, #0]!
+stp      wzr, w9, [sp, #252]!
+ldp      w2, wzr, [sp, #-256]!
+ldp      w9, w10, [sp, #4]!
+ldpsw    x9, x10, [sp, #4]!
+ldpsw    x9, x10, [x2, #-256]!
+ldpsw    x20, x30, [sp, #252]!
+ldp      x21, x29, [x2, #504]!
+ldp      x22, x23, [x3, #-512]!
+ldp      x24, x25, [x4, #8]!
+ldp      s29, s28, [sp, #252]!
+stp      s27, s26, [sp, #-256]!
+ldp      s1, s2, [x3, #44]!
+stp      d3, d5, [x9, #504]!
+stp      d7, d11, [x10, #-512]!
+ldp      d2, d3, [x30, #-8]!
+stp      q3, q5, [sp, #0]!
+stp      q17, q19, [sp, #1008]!
+ldp      q23, q29, [x1, #-1024]!
+
+#------------------------------------------------------------------------------
+# Load/store register pair (offset)
+#------------------------------------------------------------------------------
+
+ldnp      w3, w5, [sp]
+stnp      wzr, w9, [sp, #252]
+ldnp      w2, wzr, [sp, #-256]
+ldnp      w9, w10, [sp, #4]
+ldnp      x21, x29, [x2, #504]
+ldnp      x22, x23, [x3, #-512]
+ldnp      x24, x25, [x4, #8]
+ldnp      s29, s28, [sp, #252]
+stnp      s27, s26, [sp, #-256]
+ldnp      s1, s2, [x3, #44]
+stnp      d3, d5, [x9, #504]
+stnp      d7, d11, [x10, #-512]
+ldnp      d2, d3, [x30, #-8]
+stnp      q3, q5, [sp]
+stnp      q17, q19, [sp, #1008]
+ldnp      q23, q29, [x1, #-1024]
+
+#------------------------------------------------------------------------------
+# Logical (immediate)
+#------------------------------------------------------------------------------
+
+mov      w3, #983055
+mov      x10, #-6148914691236517206
+
+#------------------------------------------------------------------------------
+# Logical (shifted register)
+#------------------------------------------------------------------------------
+
+and      w12, w23, w21
+and      w16, w15, w1, lsl #1
+and      w9, w4, w10, lsl #31
+and      w3, w30, w11
+and      x3, x5, x7, lsl #63
+and      x5, x14, x19, asr #4
+and      w3, w17, w19, ror #31
+and      w0, w2, wzr, lsr #17
+and      w3, w30, w11, asr #2
+and      xzr, x4, x26
+and      w3, wzr, w20, ror #2
+and      x7, x20, xzr, asr #63
+bic      x13, x20, x14, lsl #47
+bic      w2, w7, w9
+orr      w2, w7, w0, asr #31
+orr      x8, x9, x10, lsl #12
+orn      x3, x5, x7, asr #2
+orn      w2, w5, w29
+ands     w7, wzr, w9, lsl #1
+ands     x3, x5, x20, ror #63
+bics     w3, w5, w7
+bics     x3, xzr, x3, lsl #1
+tst      w3, w7, lsl #31
+tst      x2, x20, asr #2
+mov      x3, x6
+mov      x3, xzr
+mov      wzr, w2
+mov      w3, w5
+
+#------------------------------------------------------------------------------
+# Move wide (immediate)
+#------------------------------------------------------------------------------
+
+movz     w2, #0, lsl #16
+mov     w2, #-1235
+mov     x2, #5299989643264
+mov      x2, #0
+movk     w3, #0
+movz     x4, #0, lsl #16
+movk     w5, #0, lsl #16
+movz     x6, #0, lsl #32
+movk     x7, #0, lsl #32
+movz     x8, #0, lsl #48
+movk     x9, #0, lsl #48
+
+#------------------------------------------------------------------------------
+# PC-relative addressing
+#------------------------------------------------------------------------------
+
+adr      x2, #1600
+adrp     x21, #6553600
+adr      x0, #262144
+
+#------------------------------------------------------------------------------
+# Test and branch (immediate)
+#------------------------------------------------------------------------------
+
+tbz     x12, #62, #0
+tbz     x12, #62, #4
+tbz     x12, #62, #-32768
+tbnz    x12, #60, #32764
+
+#------------------------------------------------------------------------------
+# Unconditional branch (immediate)
+#------------------------------------------------------------------------------
+
+b        #4
+b        #-4
+b        #134217724
+
+#------------------------------------------------------------------------------
+# Unconditional branch (register)
+#------------------------------------------------------------------------------
+
+br       x20
+blr      xzr
+ret      x10
+ret
+eret
+drps
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                        add	w2, w3, #4095
+# CHECK-NEXT:  1      1     1.00                        add	w30, w29, #1, lsl #12
+# CHECK-NEXT:  1      1     1.00                        add	w13, w5, #4095, lsl #12
+# CHECK-NEXT:  1      1     1.00                        add	x5, x7, #1638
+# CHECK-NEXT:  1      1     1.00                        add	w20, wsp, #801
+# CHECK-NEXT:  1      1     1.00                        add	wsp, wsp, #1104
+# CHECK-NEXT:  1      1     1.00                        add	wsp, w30, #4084
+# CHECK-NEXT:  1      1     1.00                        add	x0, x24, #291
+# CHECK-NEXT:  1      1     1.00                        add	x3, x24, #4095, lsl #12
+# CHECK-NEXT:  1      1     1.00                        add	x8, sp, #1074
+# CHECK-NEXT:  1      1     1.00                        add	sp, x29, #3816
+# CHECK-NEXT:  1      1     1.00                        sub	w0, wsp, #4077
+# CHECK-NEXT:  1      1     1.00                        sub	w4, w20, #546, lsl #12
+# CHECK-NEXT:  1      1     1.00                        sub	sp, sp, #288
+# CHECK-NEXT:  1      1     1.00                        sub	wsp, w19, #16
+# CHECK-NEXT:  1      1     1.00                        adds	w13, w23, #291, lsl #12
+# CHECK-NEXT:  1      1     1.00                        cmn	w2, #4095
+# CHECK-NEXT:  1      1     1.00                        adds	w20, wsp, #0
+# CHECK-NEXT:  1      1     1.00                        cmn	x3, #1, lsl #12
+# CHECK-NEXT:  1      1     1.00                        cmp	sp, #20, lsl #12
+# CHECK-NEXT:  1      1     1.00                        cmp	x30, #4095
+# CHECK-NEXT:  1      1     1.00                        subs	x4, sp, #3822
+# CHECK-NEXT:  1      1     1.00                        cmn	w3, #291, lsl #12
+# CHECK-NEXT:  1      1     1.00                        cmn	wsp, #1365
+# CHECK-NEXT:  1      1     1.00                        cmn	sp, #1092, lsl #12
+# CHECK-NEXT:  1      1     1.00                        mov	sp, x30
+# CHECK-NEXT:  1      1     1.00                        mov	wsp, w20
+# CHECK-NEXT:  1      1     1.00                        mov	x11, sp
+# CHECK-NEXT:  1      1     1.00                        mov	w24, wsp
+# CHECK-NEXT:  1      1     1.00                        add	w3, w5, w7
+# CHECK-NEXT:  1      1     1.00                        add	wzr, w3, w5
+# CHECK-NEXT:  1      1     1.00                        add	w20, wzr, w4
+# CHECK-NEXT:  1      1     1.00                        add	w4, w6, wzr
+# CHECK-NEXT:  1      1     1.00                        add	w11, w13, w15
+# CHECK-NEXT:  1      2     1.00                        add	w9, w3, wzr, lsl #10
+# CHECK-NEXT:  1      2     1.00                        add	w17, w29, w20, lsl #31
+# CHECK-NEXT:  1      2     1.00                        add	w21, w22, w23, lsr #0
+# CHECK-NEXT:  1      2     1.00                        add	w24, w25, w26, lsr #18
+# CHECK-NEXT:  1      2     1.00                        add	w27, w28, w29, lsr #31
+# CHECK-NEXT:  1      2     1.00                        add	w2, w3, w4, asr #0
+# CHECK-NEXT:  1      2     1.00                        add	w5, w6, w7, asr #21
+# CHECK-NEXT:  1      2     1.00                        add	w8, w9, w10, asr #31
+# CHECK-NEXT:  1      1     1.00                        add	x3, x5, x7
+# CHECK-NEXT:  1      1     1.00                        add	xzr, x3, x5
+# CHECK-NEXT:  1      1     1.00                        add	x20, xzr, x4
+# CHECK-NEXT:  1      1     1.00                        add	x4, x6, xzr
+# CHECK-NEXT:  1      1     1.00                        add	x11, x13, x15
+# CHECK-NEXT:  1      2     1.00                        add	x9, x3, xzr, lsl #10
+# CHECK-NEXT:  1      2     1.00                        add	x17, x29, x20, lsl #63
+# CHECK-NEXT:  1      2     1.00                        add	x21, x22, x23, lsr #0
+# CHECK-NEXT:  1      2     1.00                        add	x24, x25, x26, lsr #18
+# CHECK-NEXT:  1      2     1.00                        add	x27, x28, x29, lsr #63
+# CHECK-NEXT:  1      2     1.00                        add	x2, x3, x4, asr #0
+# CHECK-NEXT:  1      2     1.00                        add	x5, x6, x7, asr #21
+# CHECK-NEXT:  1      2     1.00                        add	x8, x9, x10, asr #63
+# CHECK-NEXT:  1      1     1.00                        adds	w3, w5, w7
+# CHECK-NEXT:  1      1     1.00                        cmn	w3, w5
+# CHECK-NEXT:  1      1     1.00                        adds	w20, wzr, w4
+# CHECK-NEXT:  1      1     1.00                        adds	w4, w6, wzr
+# CHECK-NEXT:  1      1     1.00                        adds	w11, w13, w15
+# CHECK-NEXT:  1      2     1.00                        adds	w9, w3, wzr, lsl #10
+# CHECK-NEXT:  1      2     1.00                        adds	w17, w29, w20, lsl #31
+# CHECK-NEXT:  1      2     1.00                        adds	w21, w22, w23, lsr #0
+# CHECK-NEXT:  1      2     1.00                        adds	w24, w25, w26, lsr #18
+# CHECK-NEXT:  1      2     1.00                        adds	w27, w28, w29, lsr #31
+# CHECK-NEXT:  1      2     1.00                        adds	w2, w3, w4, asr #0
+# CHECK-NEXT:  1      2     1.00                        adds	w5, w6, w7, asr #21
+# CHECK-NEXT:  1      2     1.00                        adds	w8, w9, w10, asr #31
+# CHECK-NEXT:  1      1     1.00                        adds	x3, x5, x7
+# CHECK-NEXT:  1      1     1.00                        cmn	x3, x5
+# CHECK-NEXT:  1      1     1.00                        adds	x20, xzr, x4
+# CHECK-NEXT:  1      1     1.00                        adds	x4, x6, xzr
+# CHECK-NEXT:  1      1     1.00                        adds	x11, x13, x15
+# CHECK-NEXT:  1      2     1.00                        adds	x9, x3, xzr, lsl #10
+# CHECK-NEXT:  1      2     1.00                        adds	x17, x29, x20, lsl #63
+# CHECK-NEXT:  1      2     1.00                        adds	x21, x22, x23, lsr #0
+# CHECK-NEXT:  1      2     1.00                        adds	x24, x25, x26, lsr #18
+# CHECK-NEXT:  1      2     1.00                        adds	x27, x28, x29, lsr #63
+# CHECK-NEXT:  1      2     1.00                        adds	x2, x3, x4, asr #0
+# CHECK-NEXT:  1      2     1.00                        adds	x5, x6, x7, asr #21
+# CHECK-NEXT:  1      2     1.00                        adds	x8, x9, x10, asr #63
+# CHECK-NEXT:  1      1     1.00                        sub	w3, w5, w7
+# CHECK-NEXT:  1      1     1.00                        sub	wzr, w3, w5
+# CHECK-NEXT:  1      1     1.00                        sub	w4, w6, wzr
+# CHECK-NEXT:  1      1     1.00                        sub	w11, w13, w15
+# CHECK-NEXT:  1      2     1.00                        sub	w9, w3, wzr, lsl #10
+# CHECK-NEXT:  1      2     1.00                        sub	w17, w29, w20, lsl #31
+# CHECK-NEXT:  1      2     1.00                        sub	w21, w22, w23, lsr #0
+# CHECK-NEXT:  1      2     1.00                        sub	w24, w25, w26, lsr #18
+# CHECK-NEXT:  1      2     1.00                        sub	w27, w28, w29, lsr #31
+# CHECK-NEXT:  1      2     1.00                        sub	w2, w3, w4, asr #0
+# CHECK-NEXT:  1      2     1.00                        sub	w5, w6, w7, asr #21
+# CHECK-NEXT:  1      2     1.00                        sub	w8, w9, w10, asr #31
+# CHECK-NEXT:  1      1     1.00                        sub	x3, x5, x7
+# CHECK-NEXT:  1      1     1.00                        sub	xzr, x3, x5
+# CHECK-NEXT:  1      1     1.00                        sub	x4, x6, xzr
+# CHECK-NEXT:  1      1     1.00                        sub	x11, x13, x15
+# CHECK-NEXT:  1      2     1.00                        sub	x9, x3, xzr, lsl #10
+# CHECK-NEXT:  1      2     1.00                        sub	x17, x29, x20, lsl #63
+# CHECK-NEXT:  1      2     1.00                        sub	x21, x22, x23, lsr #0
+# CHECK-NEXT:  1      2     1.00                        sub	x24, x25, x26, lsr #18
+# CHECK-NEXT:  1      2     1.00                        sub	x27, x28, x29, lsr #63
+# CHECK-NEXT:  1      2     1.00                        sub	x2, x3, x4, asr #0
+# CHECK-NEXT:  1      2     1.00                        sub	x5, x6, x7, asr #21
+# CHECK-NEXT:  1      2     1.00                        sub	x8, x9, x10, asr #63
+# CHECK-NEXT:  1      1     1.00                        subs	w3, w5, w7
+# CHECK-NEXT:  1      1     1.00                        cmp	w3, w5
+# CHECK-NEXT:  1      1     1.00                        subs	w4, w6, wzr
+# CHECK-NEXT:  1      1     1.00                        subs	w11, w13, w15
+# CHECK-NEXT:  1      2     1.00                        subs	w9, w3, wzr, lsl #10
+# CHECK-NEXT:  1      2     1.00                        subs	w17, w29, w20, lsl #31
+# CHECK-NEXT:  1      2     1.00                        subs	w21, w22, w23, lsr #0
+# CHECK-NEXT:  1      2     1.00                        subs	w24, w25, w26, lsr #18
+# CHECK-NEXT:  1      2     1.00                        subs	w27, w28, w29, lsr #31
+# CHECK-NEXT:  1      2     1.00                        subs	w2, w3, w4, asr #0
+# CHECK-NEXT:  1      2     1.00                        subs	w5, w6, w7, asr #21
+# CHECK-NEXT:  1      2     1.00                        subs	w8, w9, w10, asr #31
+# CHECK-NEXT:  1      1     1.00                        subs	x3, x5, x7
+# CHECK-NEXT:  1      1     1.00                        cmp	x3, x5
+# CHECK-NEXT:  1      1     1.00                        subs	x4, x6, xzr
+# CHECK-NEXT:  1      1     1.00                        subs	x11, x13, x15
+# CHECK-NEXT:  1      2     1.00                        subs	x9, x3, xzr, lsl #10
+# CHECK-NEXT:  1      2     1.00                        subs	x17, x29, x20, lsl #63
+# CHECK-NEXT:  1      2     1.00                        subs	x21, x22, x23, lsr #0
+# CHECK-NEXT:  1      2     1.00                        subs	x24, x25, x26, lsr #18
+# CHECK-NEXT:  1      2     1.00                        subs	x27, x28, x29, lsr #63
+# CHECK-NEXT:  1      2     1.00                        subs	x2, x3, x4, asr #0
+# CHECK-NEXT:  1      2     1.00                        subs	x5, x6, x7, asr #21
+# CHECK-NEXT:  1      2     1.00                        subs	x8, x9, x10, asr #63
+# CHECK-NEXT:  1      1     1.00                        cmn	wzr, w4
+# CHECK-NEXT:  1      1     1.00                        cmn	w5, wzr
+# CHECK-NEXT:  1      1     1.00                        cmn	w6, w7
+# CHECK-NEXT:  1      2     1.00                        cmn	w8, w9, lsl #15
+# CHECK-NEXT:  1      2     1.00                        cmn	w10, w11, lsl #31
+# CHECK-NEXT:  1      2     1.00                        cmn	w12, w13, lsr #0
+# CHECK-NEXT:  1      2     1.00                        cmn	w14, w15, lsr #21
+# CHECK-NEXT:  1      2     1.00                        cmn	w16, w17, lsr #31
+# CHECK-NEXT:  1      2     1.00                        cmn	w18, w19, asr #0
+# CHECK-NEXT:  1      2     1.00                        cmn	w20, w21, asr #22
+# CHECK-NEXT:  1      2     1.00                        cmn	w22, w23, asr #31
+# CHECK-NEXT:  1      1     1.00                        cmn	x0, x3
+# CHECK-NEXT:  1      1     1.00                        cmn	xzr, x4
+# CHECK-NEXT:  1      1     1.00                        cmn	x5, xzr
+# CHECK-NEXT:  1      1     1.00                        cmn	x6, x7
+# CHECK-NEXT:  1      2     1.00                        cmn	x8, x9, lsl #15
+# CHECK-NEXT:  1      2     1.00                        cmn	x10, x11, lsl #63
+# CHECK-NEXT:  1      2     1.00                        cmn	x12, x13, lsr #0
+# CHECK-NEXT:  1      2     1.00                        cmn	x14, x15, lsr #41
+# CHECK-NEXT:  1      2     1.00                        cmn	x16, x17, lsr #63
+# CHECK-NEXT:  1      2     1.00                        cmn	x18, x19, asr #0
+# CHECK-NEXT:  1      2     1.00                        cmn	x20, x21, asr #55
+# CHECK-NEXT:  1      2     1.00                        cmn	x22, x23, asr #63
+# CHECK-NEXT:  1      1     1.00                        cmp	w0, w3
+# CHECK-NEXT:  1      1     1.00                        cmp	wzr, w4
+# CHECK-NEXT:  1      1     1.00                        cmp	w5, wzr
+# CHECK-NEXT:  1      1     1.00                        cmp	w6, w7
+# CHECK-NEXT:  1      2     1.00                        cmp	w8, w9, lsl #15
+# CHECK-NEXT:  1      2     1.00                        cmp	w10, w11, lsl #31
+# CHECK-NEXT:  1      2     1.00                        cmp	w12, w13, lsr #0
+# CHECK-NEXT:  1      2     1.00                        cmp	w14, w15, lsr #21
+# CHECK-NEXT:  1      2     1.00                        cmp	w18, w19, asr #0
+# CHECK-NEXT:  1      2     1.00                        cmp	w20, w21, asr #22
+# CHECK-NEXT:  1      2     1.00                        cmp	w22, w23, asr #31
+# CHECK-NEXT:  1      1     1.00                        cmp	x0, x3
+# CHECK-NEXT:  1      1     1.00                        cmp	xzr, x4
+# CHECK-NEXT:  1      1     1.00                        cmp	x5, xzr
+# CHECK-NEXT:  1      1     1.00                        cmp	x6, x7
+# CHECK-NEXT:  1      2     1.00                        cmp	x8, x9, lsl #15
+# CHECK-NEXT:  1      2     1.00                        cmp	x10, x11, lsl #63
+# CHECK-NEXT:  1      2     1.00                        cmp	x12, x13, lsr #0
+# CHECK-NEXT:  1      2     1.00                        cmp	x14, x15, lsr #41
+# CHECK-NEXT:  1      2     1.00                        cmp	x16, x17, lsr #63
+# CHECK-NEXT:  1      2     1.00                        cmp	x18, x19, asr #0
+# CHECK-NEXT:  1      2     1.00                        cmp	x20, x21, asr #55
+# CHECK-NEXT:  1      2     1.00                        cmp	x22, x23, asr #63
+# CHECK-NEXT:  1      1     1.00                        cmp	wzr, w0
+# CHECK-NEXT:  1      1     1.00                        cmp	xzr, x0
+# CHECK-NEXT:  1      1     1.00                        adc	w29, w27, w25
+# CHECK-NEXT:  1      1     1.00                        adc	wzr, w3, w4
+# CHECK-NEXT:  1      1     1.00                        adc	w9, wzr, w10
+# CHECK-NEXT:  1      1     1.00                        adc	w20, w0, wzr
+# CHECK-NEXT:  1      1     1.00                        adc	x29, x27, x25
+# CHECK-NEXT:  1      1     1.00                        adc	xzr, x3, x4
+# CHECK-NEXT:  1      1     1.00                        adc	x9, xzr, x10
+# CHECK-NEXT:  1      1     1.00                        adc	x20, x0, xzr
+# CHECK-NEXT:  1      1     1.00                        adcs	w29, w27, w25
+# CHECK-NEXT:  1      1     1.00                        adcs	wzr, w3, w4
+# CHECK-NEXT:  1      1     1.00                        adcs	w9, wzr, w10
+# CHECK-NEXT:  1      1     1.00                        adcs	w20, w0, wzr
+# CHECK-NEXT:  1      1     1.00                        adcs	x29, x27, x25
+# CHECK-NEXT:  1      1     1.00                        adcs	xzr, x3, x4
+# CHECK-NEXT:  1      1     1.00                        adcs	x9, xzr, x10
+# CHECK-NEXT:  1      1     1.00                        adcs	x20, x0, xzr
+# CHECK-NEXT:  1      1     1.00                        sbc	w29, w27, w25
+# CHECK-NEXT:  1      1     1.00                        sbc	wzr, w3, w4
+# CHECK-NEXT:  1      1     1.00                        ngc	w9, w10
+# CHECK-NEXT:  1      1     1.00                        sbc	w20, w0, wzr
+# CHECK-NEXT:  1      1     1.00                        sbc	x29, x27, x25
+# CHECK-NEXT:  1      1     1.00                        sbc	xzr, x3, x4
+# CHECK-NEXT:  1      1     1.00                        ngc	x9, x10
+# CHECK-NEXT:  1      1     1.00                        sbc	x20, x0, xzr
+# CHECK-NEXT:  1      1     1.00                        sbcs	w29, w27, w25
+# CHECK-NEXT:  1      1     1.00                        sbcs	wzr, w3, w4
+# CHECK-NEXT:  1      1     1.00                        ngcs	w9, w10
+# CHECK-NEXT:  1      1     1.00                        sbcs	w20, w0, wzr
+# CHECK-NEXT:  1      1     1.00                        sbcs	x29, x27, x25
+# CHECK-NEXT:  1      1     1.00                        sbcs	xzr, x3, x4
+# CHECK-NEXT:  1      1     1.00                        ngcs	x9, x10
+# CHECK-NEXT:  1      1     1.00                        sbcs	x20, x0, xzr
+# CHECK-NEXT:  1      1     1.00                        ngc	w3, w12
+# CHECK-NEXT:  1      1     1.00                        ngc	wzr, w9
+# CHECK-NEXT:  1      1     1.00                        ngc	w23, wzr
+# CHECK-NEXT:  1      1     1.00                        ngc	x29, x30
+# CHECK-NEXT:  1      1     1.00                        ngc	xzr, x0
+# CHECK-NEXT:  1      1     1.00                        ngc	x0, xzr
+# CHECK-NEXT:  1      1     1.00                        ngcs	w3, w12
+# CHECK-NEXT:  1      1     1.00                        ngcs	wzr, w9
+# CHECK-NEXT:  1      1     1.00                        ngcs	w23, wzr
+# CHECK-NEXT:  1      1     1.00                        ngcs	x29, x30
+# CHECK-NEXT:  1      1     1.00                        ngcs	xzr, x0
+# CHECK-NEXT:  1      1     1.00                        ngcs	x0, xzr
+# CHECK-NEXT:  1      2     1.00                        sbfx	x1, x2, #3, #2
+# CHECK-NEXT:  1      2     1.00                        asr	x3, x4, #63
+# CHECK-NEXT:  1      2     1.00                        asr	wzr, wzr, #31
+# CHECK-NEXT:  1      2     1.00                        sbfx	w12, w9, #0, #1
+# CHECK-NEXT:  1      2     1.00                        ubfiz	x4, x5, #52, #11
+# CHECK-NEXT:  1      2     1.00                        ubfx	xzr, x4, #0, #1
+# CHECK-NEXT:  1      2     1.00                        ubfiz	x4, xzr, #1, #6
+# CHECK-NEXT:  1      2     1.00                        lsr	x5, x6, #12
+# CHECK-NEXT:  1      2     1.00                        bfi	x4, x5, #52, #11
+# CHECK-NEXT:  1      2     1.00                        bfxil	xzr, x4, #0, #1
+# CHECK-NEXT:  1      2     1.00                        bfc	x4, #1, #6
+# CHECK-NEXT:  1      2     1.00                        bfxil	x5, x6, #12, #52
+# CHECK-NEXT:  1      2     1.00                        sxtb	w1, w2
+# CHECK-NEXT:  1      2     1.00                        sxtb	xzr, w3
+# CHECK-NEXT:  1      2     1.00                        sxth	w9, w10
+# CHECK-NEXT:  1      2     1.00                        sxth	x0, w1
+# CHECK-NEXT:  1      2     1.00                        sxtw	x3, w30
+# CHECK-NEXT:  1      2     1.00                        uxtb	w1, w2
+# CHECK-NEXT:  1      2     1.00                        uxth	w9, w10
+# CHECK-NEXT:  1      2     1.00                        ubfx	x3, x30, #0, #32
+# CHECK-NEXT:  1      2     1.00                        asr	w3, w2, #0
+# CHECK-NEXT:  1      2     1.00                        asr	w9, w10, #31
+# CHECK-NEXT:  1      2     1.00                        asr	x20, x21, #63
+# CHECK-NEXT:  1      2     1.00                        asr	w1, wzr, #3
+# CHECK-NEXT:  1      2     1.00                        lsr	w3, w2, #0
+# CHECK-NEXT:  1      2     1.00                        lsr	w9, w10, #31
+# CHECK-NEXT:  1      2     1.00                        lsr	x20, x21, #63
+# CHECK-NEXT:  1      2     1.00                        lsr	wzr, wzr, #3
+# CHECK-NEXT:  1      2     1.00                        lsr	w3, w2, #0
+# CHECK-NEXT:  1      2     1.00                        lsl	w9, w10, #31
+# CHECK-NEXT:  1      2     1.00                        lsl	x20, x21, #63
+# CHECK-NEXT:  1      2     1.00                        lsl	w1, wzr, #3
+# CHECK-NEXT:  1      2     1.00                        sbfx	w9, w10, #0, #1
+# CHECK-NEXT:  1      2     1.00                        sbfiz	x2, x3, #63, #1
+# CHECK-NEXT:  1      2     1.00                        asr	x19, x20, #0
+# CHECK-NEXT:  1      2     1.00                        sbfiz	x9, x10, #5, #59
+# CHECK-NEXT:  1      2     1.00                        asr	w9, w10, #0
+# CHECK-NEXT:  1      2     1.00                        sbfiz	w11, w12, #31, #1
+# CHECK-NEXT:  1      2     1.00                        sbfiz	w13, w14, #29, #3
+# CHECK-NEXT:  1      2     1.00                        sbfiz	xzr, xzr, #10, #11
+# CHECK-NEXT:  1      2     1.00                        sbfx	w9, w10, #0, #1
+# CHECK-NEXT:  1      2     1.00                        asr	x2, x3, #63
+# CHECK-NEXT:  1      2     1.00                        asr	x19, x20, #0
+# CHECK-NEXT:  1      2     1.00                        asr	x9, x10, #5
+# CHECK-NEXT:  1      2     1.00                        asr	w9, w10, #0
+# CHECK-NEXT:  1      2     1.00                        asr	w11, w12, #31
+# CHECK-NEXT:  1      2     1.00                        asr	w13, w14, #29
+# CHECK-NEXT:  1      2     1.00                        sbfx	xzr, xzr, #10, #11
+# CHECK-NEXT:  1      2     1.00                        bfxil	w9, w10, #0, #1
+# CHECK-NEXT:  1      2     1.00                        bfi	x2, x3, #63, #1
+# CHECK-NEXT:  1      2     1.00                        bfxil	x19, x20, #0, #64
+# CHECK-NEXT:  1      2     1.00                        bfi	x9, x10, #5, #59
+# CHECK-NEXT:  1      2     1.00                        bfxil	w9, w10, #0, #32
+# CHECK-NEXT:  1      2     1.00                        bfi	w11, w12, #31, #1
+# CHECK-NEXT:  1      2     1.00                        bfi	w13, w14, #29, #3
+# CHECK-NEXT:  1      2     1.00                        bfc	xzr, #10, #11
+# CHECK-NEXT:  1      2     1.00                        bfxil	w9, w10, #0, #1
+# CHECK-NEXT:  1      2     1.00                        bfxil	x2, x3, #63, #1
+# CHECK-NEXT:  1      2     1.00                        bfxil	x19, x20, #0, #64
+# CHECK-NEXT:  1      2     1.00                        bfxil	x9, x10, #5, #59
+# CHECK-NEXT:  1      2     1.00                        bfxil	w9, w10, #0, #32
+# CHECK-NEXT:  1      2     1.00                        bfxil	w11, w12, #31, #1
+# CHECK-NEXT:  1      2     1.00                        bfxil	w13, w14, #29, #3
+# CHECK-NEXT:  1      2     1.00                        bfxil	xzr, xzr, #10, #11
+# CHECK-NEXT:  1      2     1.00                        ubfx	w9, w10, #0, #1
+# CHECK-NEXT:  1      2     1.00                        lsl	x2, x3, #63
+# CHECK-NEXT:  1      2     1.00                        lsr	x19, x20, #0
+# CHECK-NEXT:  1      2     1.00                        lsl	x9, x10, #5
+# CHECK-NEXT:  1      2     1.00                        lsr	w9, w10, #0
+# CHECK-NEXT:  1      2     1.00                        lsl	w11, w12, #31
+# CHECK-NEXT:  1      2     1.00                        lsl	w13, w14, #29
+# CHECK-NEXT:  1      2     1.00                        ubfiz	xzr, xzr, #10, #11
+# CHECK-NEXT:  1      2     1.00                        ubfx	w9, w10, #0, #1
+# CHECK-NEXT:  1      2     1.00                        lsr	x2, x3, #63
+# CHECK-NEXT:  1      2     1.00                        lsr	x19, x20, #0
+# CHECK-NEXT:  1      2     1.00                        lsr	x9, x10, #5
+# CHECK-NEXT:  1      2     1.00                        lsr	w9, w10, #0
+# CHECK-NEXT:  1      2     1.00                        lsr	w11, w12, #31
+# CHECK-NEXT:  1      2     1.00                        lsr	w13, w14, #29
+# CHECK-NEXT:  1      2     1.00                        ubfx	xzr, xzr, #10, #11
+# CHECK-NEXT:  1      1     1.00                        cbz	w5, #4
+# CHECK-NEXT:  1      1     1.00                        cbz	x5, #0
+# CHECK-NEXT:  1      1     1.00                        cbnz	x2, #-4
+# CHECK-NEXT:  1      1     1.00                        cbnz	x26, #1048572
+# CHECK-NEXT:  1      1     1.00                        cbz	wzr, #0
+# CHECK-NEXT:  1      1     1.00                        cbnz	xzr, #0
+# CHECK-NEXT:  1      1     1.00                        b.ne	#4
+# CHECK-NEXT:  1      1     1.00                        b.ge	#1048572
+# CHECK-NEXT:  1      1     1.00                        b.ge	#-4
+# CHECK-NEXT:  1      1     1.00                        ccmp	w1, #31, #0, eq
+# CHECK-NEXT:  1      1     1.00                        ccmp	w3, #0, #15, hs
+# CHECK-NEXT:  1      1     1.00                        ccmp	wzr, #15, #13, hs
+# CHECK-NEXT:  1      1     1.00                        ccmp	x9, #31, #0, le
+# CHECK-NEXT:  1      1     1.00                        ccmp	x3, #0, #15, gt
+# CHECK-NEXT:  1      1     1.00                        ccmp	xzr, #5, #7, ne
+# CHECK-NEXT:  1      1     1.00                        ccmn	w1, #31, #0, eq
+# CHECK-NEXT:  1      1     1.00                        ccmn	w3, #0, #15, hs
+# CHECK-NEXT:  1      1     1.00                        ccmn	wzr, #15, #13, hs
+# CHECK-NEXT:  1      1     1.00                        ccmn	x9, #31, #0, le
+# CHECK-NEXT:  1      1     1.00                        ccmn	x3, #0, #15, gt
+# CHECK-NEXT:  1      1     1.00                        ccmn	xzr, #5, #7, ne
+# CHECK-NEXT:  1      1     1.00                        ccmp	w1, wzr, #0, eq
+# CHECK-NEXT:  1      1     1.00                        ccmp	w3, w0, #15, hs
+# CHECK-NEXT:  1      1     1.00                        ccmp	wzr, w15, #13, hs
+# CHECK-NEXT:  1      1     1.00                        ccmp	x9, xzr, #0, le
+# CHECK-NEXT:  1      1     1.00                        ccmp	x3, x0, #15, gt
+# CHECK-NEXT:  1      1     1.00                        ccmp	xzr, x5, #7, ne
+# CHECK-NEXT:  1      1     1.00                        ccmn	w1, wzr, #0, eq
+# CHECK-NEXT:  1      1     1.00                        ccmn	w3, w0, #15, hs
+# CHECK-NEXT:  1      1     1.00                        ccmn	wzr, w15, #13, hs
+# CHECK-NEXT:  1      1     1.00                        ccmn	x9, xzr, #0, le
+# CHECK-NEXT:  1      1     1.00                        ccmn	x3, x0, #15, gt
+# CHECK-NEXT:  1      1     1.00                        ccmn	xzr, x5, #7, ne
+# CHECK-NEXT:  1      1     1.00                        csel	w1, w0, w19, ne
+# CHECK-NEXT:  1      1     1.00                        csel	wzr, w5, w9, eq
+# CHECK-NEXT:  1      1     1.00                        csel	w9, wzr, w30, gt
+# CHECK-NEXT:  1      1     1.00                        csel	w1, w28, wzr, mi
+# CHECK-NEXT:  1      1     1.00                        csel	x19, x23, x29, lt
+# CHECK-NEXT:  1      1     1.00                        csel	xzr, x3, x4, ge
+# CHECK-NEXT:  1      1     1.00                        csel	x5, xzr, x6, hs
+# CHECK-NEXT:  1      1     1.00                        csel	x7, x8, xzr, lo
+# CHECK-NEXT:  1      1     1.00                        csinc	w1, w0, w19, ne
+# CHECK-NEXT:  1      1     1.00                        csinc	wzr, w5, w9, eq
+# CHECK-NEXT:  1      1     1.00                        csinc	w9, wzr, w30, gt
+# CHECK-NEXT:  1      1     1.00                        csinc	w1, w28, wzr, mi
+# CHECK-NEXT:  1      1     1.00                        csinc	x19, x23, x29, lt
+# CHECK-NEXT:  1      1     1.00                        csinc	xzr, x3, x4, ge
+# CHECK-NEXT:  1      1     1.00                        csinc	x5, xzr, x6, hs
+# CHECK-NEXT:  1      1     1.00                        csinc	x7, x8, xzr, lo
+# CHECK-NEXT:  1      1     1.00                        csinv	w1, w0, w19, ne
+# CHECK-NEXT:  1      1     1.00                        csinv	wzr, w5, w9, eq
+# CHECK-NEXT:  1      1     1.00                        csinv	w9, wzr, w30, gt
+# CHECK-NEXT:  1      1     1.00                        csinv	w1, w28, wzr, mi
+# CHECK-NEXT:  1      1     1.00                        csinv	x19, x23, x29, lt
+# CHECK-NEXT:  1      1     1.00                        csinv	xzr, x3, x4, ge
+# CHECK-NEXT:  1      1     1.00                        csinv	x5, xzr, x6, hs
+# CHECK-NEXT:  1      1     1.00                        csinv	x7, x8, xzr, lo
+# CHECK-NEXT:  1      1     1.00                        csneg	w1, w0, w19, ne
+# CHECK-NEXT:  1      1     1.00                        csneg	wzr, w5, w9, eq
+# CHECK-NEXT:  1      1     1.00                        csneg	w9, wzr, w30, gt
+# CHECK-NEXT:  1      1     1.00                        csneg	w1, w28, wzr, mi
+# CHECK-NEXT:  1      1     1.00                        csneg	x19, x23, x29, lt
+# CHECK-NEXT:  1      1     1.00                        csneg	xzr, x3, x4, ge
+# CHECK-NEXT:  1      1     1.00                        csneg	x5, xzr, x6, hs
+# CHECK-NEXT:  1      1     1.00                        csneg	x7, x8, xzr, lo
+# CHECK-NEXT:  1      1     1.00                        cset	w3, eq
+# CHECK-NEXT:  1      1     1.00                        cset	x9, pl
+# CHECK-NEXT:  1      1     1.00                        csetm	w20, ne
+# CHECK-NEXT:  1      1     1.00                        csetm	x30, ge
+# CHECK-NEXT:  1      1     1.00                        csinc	w2, wzr, wzr, al
+# CHECK-NEXT:  1      1     1.00                        csinv	x3, xzr, xzr, nv
+# CHECK-NEXT:  1      1     1.00                        cinc	w3, w5, gt
+# CHECK-NEXT:  1      1     1.00                        cinc	wzr, w4, le
+# CHECK-NEXT:  1      1     1.00                        cset	w9, lt
+# CHECK-NEXT:  1      1     1.00                        cinc	x3, x5, gt
+# CHECK-NEXT:  1      1     1.00                        cinc	xzr, x4, le
+# CHECK-NEXT:  1      1     1.00                        cset	x9, lt
+# CHECK-NEXT:  1      1     1.00                        csinc	w5, w6, w6, nv
+# CHECK-NEXT:  1      1     1.00                        csinc	x1, x2, x2, al
+# CHECK-NEXT:  1      1     1.00                        cinv	w3, w5, gt
+# CHECK-NEXT:  1      1     1.00                        cinv	wzr, w4, le
+# CHECK-NEXT:  1      1     1.00                        csetm	w9, lt
+# CHECK-NEXT:  1      1     1.00                        cinv	x3, x5, gt
+# CHECK-NEXT:  1      1     1.00                        cinv	xzr, x4, le
+# CHECK-NEXT:  1      1     1.00                        csetm	x9, lt
+# CHECK-NEXT:  1      1     1.00                        csinv	x1, x0, x0, al
+# CHECK-NEXT:  1      1     1.00                        csinv	w9, w8, w8, nv
+# CHECK-NEXT:  1      1     1.00                        cneg	w3, w5, gt
+# CHECK-NEXT:  1      1     1.00                        cneg	wzr, w4, le
+# CHECK-NEXT:  1      1     1.00                        cneg	w9, wzr, lt
+# CHECK-NEXT:  1      1     1.00                        cneg	x3, x5, gt
+# CHECK-NEXT:  1      1     1.00                        cneg	xzr, x4, le
+# CHECK-NEXT:  1      1     1.00                        cneg	x9, xzr, lt
+# CHECK-NEXT:  1      1     1.00                        csneg	x4, x8, x8, al
+# CHECK-NEXT:  1      1     1.00                        csinv	w9, w8, w8, nv
+# CHECK-NEXT:  1      2     1.00                        rbit	w0, w7
+# CHECK-NEXT:  1      2     1.00                        rbit	x18, x3
+# CHECK-NEXT:  1      1     1.00                        rev16	w17, w1
+# CHECK-NEXT:  1      1     1.00                        rev16	x5, x2
+# CHECK-NEXT:  1      1     1.00                        rev	w18, w0
+# CHECK-NEXT:  1      1     1.00                        rev32	x20, x1
+# CHECK-NEXT:  1      1     1.00                        rev	x22, x2
+# CHECK-NEXT:  1      1     1.00                        clz	w24, w3
+# CHECK-NEXT:  1      1     1.00                        clz	x26, x4
+# CHECK-NEXT:  1      1     1.00                        cls	w3, w5
+# CHECK-NEXT:  1      1     1.00                        cls	x20, x5
+# CHECK-NEXT:  1      12    12.00                       udiv	w0, w7, w10
+# CHECK-NEXT:  1      20    20.00                       udiv	x9, x22, x4
+# CHECK-NEXT:  1      12    12.00                       sdiv	w12, w21, w0
+# CHECK-NEXT:  1      20    20.00                       sdiv	x13, x2, x1
+# CHECK-NEXT:  1      2     1.00                        lsl	w11, w12, w13
+# CHECK-NEXT:  1      2     1.00                        lsl	x14, x15, x16
+# CHECK-NEXT:  1      2     1.00                        lsr	w17, w18, w19
+# CHECK-NEXT:  1      2     1.00                        lsr	x20, x21, x22
+# CHECK-NEXT:  1      2     1.00                        asr	w23, w24, w25
+# CHECK-NEXT:  1      2     1.00                        asr	x26, x27, x28
+# CHECK-NEXT:  1      2     1.00                        ror	w0, w1, w2
+# CHECK-NEXT:  1      2     1.00                        ror	x3, x4, x5
+# CHECK-NEXT:  1      2     1.00                        lsl	w6, w7, w8
+# CHECK-NEXT:  1      2     1.00                        lsl	x9, x10, x11
+# CHECK-NEXT:  1      2     1.00                        lsr	w12, w13, w14
+# CHECK-NEXT:  1      2     1.00                        lsr	x15, x16, x17
+# CHECK-NEXT:  1      2     1.00                        asr	w18, w19, w20
+# CHECK-NEXT:  1      2     1.00                        asr	x21, x22, x23
+# CHECK-NEXT:  1      2     1.00                        ror	w24, w25, w26
+# CHECK-NEXT:  1      2     1.00                        ror	x27, x28, x29
+# CHECK-NEXT:  1      5     2.00                        smulh	x30, x29, x28
+# CHECK-NEXT:  1      5     2.00                        smulh	xzr, x27, x26
+# CHECK-NEXT:  1      5     2.00                        umulh	x30, x29, x28
+# CHECK-NEXT:  1      5     2.00                        umulh	x23, x30, xzr
+# CHECK-NEXT:  1      3     1.00                        madd	w1, w3, w7, w4
+# CHECK-NEXT:  1      3     1.00                        madd	wzr, w0, w9, w11
+# CHECK-NEXT:  1      3     1.00                        madd	w13, wzr, w4, w4
+# CHECK-NEXT:  1      3     1.00                        madd	w19, w30, wzr, w29
+# CHECK-NEXT:  1      3     1.00                        mul	w4, w5, w6
+# CHECK-NEXT:  1      5     2.00                        madd	x1, x3, x7, x4
+# CHECK-NEXT:  1      5     2.00                        madd	xzr, x0, x9, x11
+# CHECK-NEXT:  1      5     2.00                        madd	x13, xzr, x4, x4
+# CHECK-NEXT:  1      5     2.00                        madd	x19, x30, xzr, x29
+# CHECK-NEXT:  1      5     2.00                        mul	x4, x5, x6
+# CHECK-NEXT:  1      3     1.00                        msub	w1, w3, w7, w4
+# CHECK-NEXT:  1      3     1.00                        msub	wzr, w0, w9, w11
+# CHECK-NEXT:  1      3     1.00                        msub	w13, wzr, w4, w4
+# CHECK-NEXT:  1      3     1.00                        msub	w19, w30, wzr, w29
+# CHECK-NEXT:  1      3     1.00                        mneg	w4, w5, w6
+# CHECK-NEXT:  1      5     2.00                        msub	x1, x3, x7, x4
+# CHECK-NEXT:  1      5     2.00                        msub	xzr, x0, x9, x11
+# CHECK-NEXT:  1      5     2.00                        msub	x13, xzr, x4, x4
+# CHECK-NEXT:  1      5     2.00                        msub	x19, x30, xzr, x29
+# CHECK-NEXT:  1      5     2.00                        mneg	x4, x5, x6
+# CHECK-NEXT:  1      3     1.00                        smaddl	x3, w5, w2, x9
+# CHECK-NEXT:  1      3     1.00                        smaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      3     1.00                        smaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      3     1.00                        smaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      3     1.00                        smull	x19, w20, w21
+# CHECK-NEXT:  1      3     1.00                        smsubl	x3, w5, w2, x9
+# CHECK-NEXT:  1      3     1.00                        smsubl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      3     1.00                        smsubl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      3     1.00                        smsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      3     1.00                        smnegl	x19, w20, w21
+# CHECK-NEXT:  1      3     1.00                        umaddl	x3, w5, w2, x9
+# CHECK-NEXT:  1      3     1.00                        umaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      3     1.00                        umaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      3     1.00                        umaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      3     1.00                        umull	x19, w20, w21
+# CHECK-NEXT:  1      3     1.00                        umsubl	x3, w5, w2, x9
+# CHECK-NEXT:  1      3     1.00                        umsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      3     1.00                        umnegl	x19, w20, w21
+# CHECK-NEXT:  1      5     2.00                        smulh	x30, x29, x28
+# CHECK-NEXT:  1      5     2.00                        smulh	x23, x22, xzr
+# CHECK-NEXT:  1      5     2.00                        umulh	x23, x22, xzr
+# CHECK-NEXT:  1      5     2.00                        mul	x19, x20, xzr
+# CHECK-NEXT:  1      3     1.00                        mneg	w21, w22, w23
+# CHECK-NEXT:  1      3     1.00                        smull	x11, w13, w17
+# CHECK-NEXT:  1      3     1.00                        umull	x11, w13, w17
+# CHECK-NEXT:  1      3     1.00                        smnegl	x11, w13, w17
+# CHECK-NEXT:  1      3     1.00                        umnegl	x11, w13, w17
+# CHECK-NEXT:  1      2     1.00                        extr	w3, w5, w7, #0
+# CHECK-NEXT:  1      2     1.00                        extr	w11, w13, w17, #31
+# CHECK-NEXT:  1      2     1.00                        extr	x3, x5, x7, #15
+# CHECK-NEXT:  1      2     1.00                        extr	x11, x13, x17, #63
+# CHECK-NEXT:  1      2     1.00                        ror	x19, x23, #24
+# CHECK-NEXT:  1      2     1.00                        ror	x29, xzr, #63
+# CHECK-NEXT:  1      2     1.00                        ror	w9, w13, #31
+# CHECK-NEXT:  1      3     1.00                        fcmp	s3, s5
+# CHECK-NEXT:  1      3     1.00                        fcmp	s31, #0.0
+# CHECK-NEXT:  1      3     1.00                        fcmp	s31, #0.0
+# CHECK-NEXT:  1      3     1.00                        fcmpe	s29, s30
+# CHECK-NEXT:  1      3     1.00                        fcmpe	s15, #0.0
+# CHECK-NEXT:  1      3     1.00                        fcmpe	s15, #0.0
+# CHECK-NEXT:  1      3     1.00                        fcmp	d4, d12
+# CHECK-NEXT:  1      3     1.00                        fcmp	d23, #0.0
+# CHECK-NEXT:  1      3     1.00                        fcmp	d23, #0.0
+# CHECK-NEXT:  1      3     1.00                        fcmpe	d26, d22
+# CHECK-NEXT:  1      3     1.00                        fcmpe	d29, #0.0
+# CHECK-NEXT:  1      3     1.00                        fcmpe	d29, #0.0
+# CHECK-NEXT:  1      3     1.00                        fccmp	s1, s31, #0, eq
+# CHECK-NEXT:  1      3     1.00                        fccmp	s3, s0, #15, hs
+# CHECK-NEXT:  1      3     1.00                        fccmp	s31, s15, #13, hs
+# CHECK-NEXT:  1      3     1.00                        fccmp	d9, d31, #0, le
+# CHECK-NEXT:  1      3     1.00                        fccmp	d3, d0, #15, gt
+# CHECK-NEXT:  1      3     1.00                        fccmp	d31, d5, #7, ne
+# CHECK-NEXT:  1      3     1.00                        fccmpe	s1, s31, #0, eq
+# CHECK-NEXT:  1      3     1.00                        fccmpe	s3, s0, #15, hs
+# CHECK-NEXT:  1      3     1.00                        fccmpe	s31, s15, #13, hs
+# CHECK-NEXT:  1      3     1.00                        fccmpe	d9, d31, #0, le
+# CHECK-NEXT:  1      3     1.00                        fccmpe	d3, d0, #15, gt
+# CHECK-NEXT:  1      3     1.00                        fccmpe	d31, d5, #7, ne
+# CHECK-NEXT:  1      3     1.00                        fcsel	s3, s20, s9, pl
+# CHECK-NEXT:  1      3     1.00                        fcsel	d9, d10, d11, mi
+# CHECK-NEXT:  1      4     1.00                        fmov	s0, s1
+# CHECK-NEXT:  1      4     1.00                        fabs	s2, s3
+# CHECK-NEXT:  1      4     1.00                        fneg	s4, s5
+# CHECK-NEXT:  1      22    29.00                       fsqrt	s6, s7
+# CHECK-NEXT:  1      4     1.00                        fcvt	d8, s9
+# CHECK-NEXT:  1      4     1.00                        fcvt	h10, s11
+# CHECK-NEXT:  1      4     1.00                        frintn	s12, s13
+# CHECK-NEXT:  1      4     1.00                        frintp	s14, s15
+# CHECK-NEXT:  1      4     1.00                        frintm	s16, s17
+# CHECK-NEXT:  1      4     1.00                        frintz	s18, s19
+# CHECK-NEXT:  1      4     1.00                        frinta	s20, s21
+# CHECK-NEXT:  1      4     1.00                        frintx	s22, s23
+# CHECK-NEXT:  1      4     1.00                        frinti	s24, s25
+# CHECK-NEXT:  1      4     1.00                        fmov	d0, d1
+# CHECK-NEXT:  1      4     1.00                        fabs	d2, d3
+# CHECK-NEXT:  1      4     1.00                        fneg	d4, d5
+# CHECK-NEXT:  1      22    29.00                       fsqrt	d6, d7
+# CHECK-NEXT:  1      4     1.00                        fcvt	s8, d9
+# CHECK-NEXT:  1      4     1.00                        fcvt	h10, d11
+# CHECK-NEXT:  1      4     1.00                        frintn	d12, d13
+# CHECK-NEXT:  1      4     1.00                        frintp	d14, d15
+# CHECK-NEXT:  1      4     1.00                        frintm	d16, d17
+# CHECK-NEXT:  1      4     1.00                        frintz	d18, d19
+# CHECK-NEXT:  1      4     1.00                        frinta	d20, d21
+# CHECK-NEXT:  1      4     1.00                        frintx	d22, d23
+# CHECK-NEXT:  1      4     1.00                        frinti	d24, d25
+# CHECK-NEXT:  1      4     1.00                        fcvt	s26, h27
+# CHECK-NEXT:  1      4     1.00                        fcvt	d28, h29
+# CHECK-NEXT:  1      4     1.00                        fmul	s20, s19, s17
+# CHECK-NEXT:  1      13    10.00                       fdiv	s1, s2, s3
+# CHECK-NEXT:  1      4     1.00                        fadd	s4, s5, s6
+# CHECK-NEXT:  1      4     1.00                        fsub	s7, s8, s9
+# CHECK-NEXT:  1      4     1.00                        fmax	s10, s11, s12
+# CHECK-NEXT:  1      4     1.00                        fmin	s13, s14, s15
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	s16, s17, s18
+# CHECK-NEXT:  1      4     1.00                        fminnm	s19, s20, s21
+# CHECK-NEXT:  1      4     1.00                        fnmul	s22, s23, s2
+# CHECK-NEXT:  1      4     1.00                        fmul	d20, d19, d17
+# CHECK-NEXT:  1      22    19.00                       fdiv	d1, d2, d3
+# CHECK-NEXT:  1      4     1.00                        fadd	d4, d5, d6
+# CHECK-NEXT:  1      4     1.00                        fsub	d7, d8, d9
+# CHECK-NEXT:  1      4     1.00                        fmax	d10, d11, d12
+# CHECK-NEXT:  1      4     1.00                        fmin	d13, d14, d15
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	d16, d17, d18
+# CHECK-NEXT:  1      4     1.00                        fminnm	d19, d20, d21
+# CHECK-NEXT:  1      4     1.00                        fnmul	d22, d23, d24
+# CHECK-NEXT:  1      4     1.00                        fmadd	s3, s5, s6, s31
+# CHECK-NEXT:  1      4     1.00                        fmadd	d3, d13, d0, d23
+# CHECK-NEXT:  1      4     1.00                        fmsub	s3, s5, s6, s31
+# CHECK-NEXT:  1      4     1.00                        fmsub	d3, d13, d0, d23
+# CHECK-NEXT:  1      4     1.00                        fnmadd	s3, s5, s6, s31
+# CHECK-NEXT:  1      4     1.00                        fnmadd	d3, d13, d0, d23
+# CHECK-NEXT:  1      4     1.00                        fnmsub	s3, s5, s6, s31
+# CHECK-NEXT:  1      4     1.00                        fnmsub	d3, d13, d0, d23
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	w3, h5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	wzr, h20, #13
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	w19, h0, #32
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x3, h5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x12, h30, #45
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x19, h0, #64
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	w3, s5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	wzr, s20, #13
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	w19, s0, #32
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x3, s5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x12, s30, #45
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x19, s0, #64
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	w3, d5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	wzr, d20, #13
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	w19, d0, #32
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x3, d5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x12, d30, #45
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x19, d0, #64
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	w3, h5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	wzr, h20, #13
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	w19, h0, #32
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x3, h5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x12, h30, #45
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x19, h0, #64
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	w3, s5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	wzr, s20, #13
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	w19, s0, #32
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x3, s5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x12, s30, #45
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x19, s0, #64
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	w3, d5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	wzr, d20, #13
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	w19, d0, #32
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x3, d5, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x12, d30, #45
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x19, d0, #64
+# CHECK-NEXT:  1      4     1.00                        scvtf	h23, w19, #1
+# CHECK-NEXT:  1      4     1.00                        scvtf	h31, wzr, #20
+# CHECK-NEXT:  1      4     1.00                        scvtf	h14, w0, #32
+# CHECK-NEXT:  1      4     1.00                        scvtf	h23, x19, #1
+# CHECK-NEXT:  1      4     1.00                        scvtf	h31, xzr, #20
+# CHECK-NEXT:  1      4     1.00                        scvtf	h14, x0, #64
+# CHECK-NEXT:  1      4     1.00                        scvtf	s23, w19, #1
+# CHECK-NEXT:  1      4     1.00                        scvtf	s31, wzr, #20
+# CHECK-NEXT:  1      4     1.00                        scvtf	s14, w0, #32
+# CHECK-NEXT:  1      4     1.00                        scvtf	s23, x19, #1
+# CHECK-NEXT:  1      4     1.00                        scvtf	s31, xzr, #20
+# CHECK-NEXT:  1      4     1.00                        scvtf	s14, x0, #64
+# CHECK-NEXT:  1      4     1.00                        scvtf	d23, w19, #1
+# CHECK-NEXT:  1      4     1.00                        scvtf	d31, wzr, #20
+# CHECK-NEXT:  1      4     1.00                        scvtf	d14, w0, #32
+# CHECK-NEXT:  1      4     1.00                        scvtf	d23, x19, #1
+# CHECK-NEXT:  1      4     1.00                        scvtf	d31, xzr, #20
+# CHECK-NEXT:  1      4     1.00                        scvtf	d14, x0, #64
+# CHECK-NEXT:  1      4     1.00                        ucvtf	h23, w19, #1
+# CHECK-NEXT:  1      4     1.00                        ucvtf	h31, wzr, #20
+# CHECK-NEXT:  1      4     1.00                        ucvtf	h14, w0, #32
+# CHECK-NEXT:  1      4     1.00                        ucvtf	h23, x19, #1
+# CHECK-NEXT:  1      4     1.00                        ucvtf	h31, xzr, #20
+# CHECK-NEXT:  1      4     1.00                        ucvtf	h14, x0, #64
+# CHECK-NEXT:  1      4     1.00                        ucvtf	s23, w19, #1
+# CHECK-NEXT:  1      4     1.00                        ucvtf	s31, wzr, #20
+# CHECK-NEXT:  1      4     1.00                        ucvtf	s14, w0, #32
+# CHECK-NEXT:  1      4     1.00                        ucvtf	s23, x19, #1
+# CHECK-NEXT:  1      4     1.00                        ucvtf	s31, xzr, #20
+# CHECK-NEXT:  1      4     1.00                        ucvtf	s14, x0, #64
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d23, w19, #1
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d31, wzr, #20
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d14, w0, #32
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d23, x19, #1
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d31, xzr, #20
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d14, x0, #64
+# CHECK-NEXT:  1      4     1.00                        fcvtns	w3, h31
+# CHECK-NEXT:  1      4     1.00                        fcvtns	xzr, h12
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	wzr, h12
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	x0, h0
+# CHECK-NEXT:  1      4     1.00                        fcvtps	wzr, h9
+# CHECK-NEXT:  1      4     1.00                        fcvtps	x12, h20
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	w30, h23
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	x29, h3
+# CHECK-NEXT:  1      4     1.00                        fcvtms	w2, h3
+# CHECK-NEXT:  1      4     1.00                        fcvtms	x4, h5
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	w6, h7
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	x8, h9
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	w10, h11
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x12, h13
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	w14, h15
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x15, h16
+# CHECK-NEXT:  1      4     1.00                        scvtf	h17, w18
+# CHECK-NEXT:  1      4     1.00                        scvtf	h19, x20
+# CHECK-NEXT:  1      4     1.00                        ucvtf	h21, w22
+# CHECK-NEXT:  1      4     1.00                        scvtf	h23, x24
+# CHECK-NEXT:  1      4     1.00                        fcvtas	w25, h26
+# CHECK-NEXT:  1      4     1.00                        fcvtas	x27, h28
+# CHECK-NEXT:  1      4     1.00                        fcvtau	w29, h30
+# CHECK-NEXT:  1      4     1.00                        fcvtau	xzr, h0
+# CHECK-NEXT:  1      4     1.00                        fcvtns	w3, s31
+# CHECK-NEXT:  1      4     1.00                        fcvtns	xzr, s12
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	wzr, s12
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	x0, s0
+# CHECK-NEXT:  1      4     1.00                        fcvtps	wzr, s9
+# CHECK-NEXT:  1      4     1.00                        fcvtps	x12, s20
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	w30, s23
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	x29, s3
+# CHECK-NEXT:  1      4     1.00                        fcvtms	w2, s3
+# CHECK-NEXT:  1      4     1.00                        fcvtms	x4, s5
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	w6, s7
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	x8, s9
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	w10, s11
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x12, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	w14, s15
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x15, s16
+# CHECK-NEXT:  1      4     1.00                        scvtf	s17, w18
+# CHECK-NEXT:  1      4     1.00                        scvtf	s19, x20
+# CHECK-NEXT:  1      4     1.00                        ucvtf	s21, w22
+# CHECK-NEXT:  1      4     1.00                        scvtf	s23, x24
+# CHECK-NEXT:  1      4     1.00                        fcvtas	w25, s26
+# CHECK-NEXT:  1      4     1.00                        fcvtas	x27, s28
+# CHECK-NEXT:  1      4     1.00                        fcvtau	w29, s30
+# CHECK-NEXT:  1      4     1.00                        fcvtau	xzr, s0
+# CHECK-NEXT:  1      4     1.00                        fcvtns	w3, d31
+# CHECK-NEXT:  1      4     1.00                        fcvtns	xzr, d12
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	wzr, d12
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	x0, d0
+# CHECK-NEXT:  1      4     1.00                        fcvtps	wzr, d9
+# CHECK-NEXT:  1      4     1.00                        fcvtps	x12, d20
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	w30, d23
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	x29, d3
+# CHECK-NEXT:  1      4     1.00                        fcvtms	w2, d3
+# CHECK-NEXT:  1      4     1.00                        fcvtms	x4, d5
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	w6, d7
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	x8, d9
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	w10, d11
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	x12, d13
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	w14, d15
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	x15, d16
+# CHECK-NEXT:  1      4     1.00                        scvtf	d17, w18
+# CHECK-NEXT:  1      4     1.00                        scvtf	d19, x20
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d21, w22
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d23, x24
+# CHECK-NEXT:  1      4     1.00                        fcvtas	w25, d26
+# CHECK-NEXT:  1      4     1.00                        fcvtas	x27, d28
+# CHECK-NEXT:  1      4     1.00                        fcvtau	w29, d30
+# CHECK-NEXT:  1      4     1.00                        fcvtau	xzr, d0
+# CHECK-NEXT:  1      3     1.00                        fmov	w3, s9
+# CHECK-NEXT:  1      3     1.00                        fmov	s9, w3
+# CHECK-NEXT:  1      3     1.00                        fmov	x20, d31
+# CHECK-NEXT:  1      3     1.00                        fmov	d1, x15
+# CHECK-NEXT:  1      3     1.00                        fmov	x3, v12.d[1]
+# CHECK-NEXT:  1      3     1.00                        fmov	v1.d[1], x19
+# CHECK-NEXT:  1      3     1.00                        fmov	s2, #0.12500000
+# CHECK-NEXT:  1      3     1.00                        fmov	s3, #1.00000000
+# CHECK-NEXT:  1      3     1.00                        fmov	d30, #16.00000000
+# CHECK-NEXT:  1      3     1.00                        fmov	s4, #1.06250000
+# CHECK-NEXT:  1      3     1.00                        fmov	d10, #1.93750000
+# CHECK-NEXT:  1      3     1.00                        fmov	s12, #-1.00000000
+# CHECK-NEXT:  1      3     1.00                        fmov	d16, #8.50000000
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w3, #0
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x29, #4
+# CHECK-NEXT:  1      4     1.00    *                   ldrsw	xzr, #-4
+# CHECK-NEXT:  1      4     1.00    *                   ldr	s0, #8
+# CHECK-NEXT:  1      4     1.00    *                   ldr	d0, #1048572
+# CHECK-NEXT:  1      4     1.00    *                   ldr	q0, #-1048576
+# CHECK-NEXT:  1      4     1.00                  U     prfm	pldl1strm, #0
+# CHECK-NEXT:  1      4     1.00                  U     prfm	#22, #0
+# CHECK-NEXT:  2      5     2.00    *      *      U     stxrb	w18, w8, [sp]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stxrh	w24, w15, [x16]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stxr	w5, w6, [x17]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stxr	w1, x10, [x21]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldxrb	w30, [x0]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldxrh	w17, [x4]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldxr	w22, [sp]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldxr	x11, [x29]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldxr	x11, [x29]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldxr	x11, [x29]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stxp	w12, w11, w10, [sp]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stxp	wzr, x27, x9, [x12]
+# CHECK-NEXT:  2      4     2.00    *      *      U     ldxp	w0, wzr, [sp]
+# CHECK-NEXT:  2      4     2.00    *      *      U     ldxp	x17, x0, [x18]
+# CHECK-NEXT:  2      4     2.00    *      *      U     ldxp	x17, x0, [x18]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stlxrb	w12, w22, [x0]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stlxrh	w10, w1, [x1]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stlxr	w9, w2, [x2]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stlxr	w9, x3, [sp]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldaxrb	w8, [x4]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldaxrh	w7, [x5]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldaxr	w6, [sp]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldaxr	x5, [x6]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldaxr	x5, [x6]
+# CHECK-NEXT:  1      4     1.00    *      *      U     ldaxr	x5, [x6]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stlxp	w4, w5, w6, [sp]
+# CHECK-NEXT:  2      5     2.00    *      *      U     stlxp	wzr, x6, x7, [x1]
+# CHECK-NEXT:  2      4     2.00    *      *      U     ldaxp	w5, w18, [sp]
+# CHECK-NEXT:  2      4     2.00    *      *      U     ldaxp	x6, x19, [x22]
+# CHECK-NEXT:  2      4     2.00    *      *      U     ldaxp	x6, x19, [x22]
+# CHECK-NEXT:  1      1     1.00           *      U     stlrb	w24, [sp]
+# CHECK-NEXT:  1      1     1.00           *      U     stlrh	w25, [x30]
+# CHECK-NEXT:  1      1     1.00           *      U     stlr	w26, [x29]
+# CHECK-NEXT:  1      1     1.00           *      U     stlr	x27, [x28]
+# CHECK-NEXT:  1      1     1.00           *      U     stlr	x27, [x28]
+# CHECK-NEXT:  1      1     1.00           *      U     stlr	x27, [x28]
+# CHECK-NEXT:  1      4     1.00    *             U     ldarb	w23, [sp]
+# CHECK-NEXT:  1      4     1.00    *             U     ldarh	w22, [x30]
+# CHECK-NEXT:  1      4     1.00    *             U     ldar	wzr, [x29]
+# CHECK-NEXT:  1      4     1.00    *             U     ldar	x21, [x28]
+# CHECK-NEXT:  1      4     1.00    *             U     ldar	x21, [x28]
+# CHECK-NEXT:  1      4     1.00    *             U     ldar	x21, [x28]
+# CHECK-NEXT:  1      1     1.00           *            sturb	w9, [sp]
+# CHECK-NEXT:  1      1     1.00           *            sturh	wzr, [x12, #255]
+# CHECK-NEXT:  1      1     1.00           *            stur	w16, [x0, #-256]
+# CHECK-NEXT:  1      1     1.00           *            stur	x28, [x14, #1]
+# CHECK-NEXT:  1      4     1.00    *                   ldurb	w1, [x20, #255]
+# CHECK-NEXT:  1      4     1.00    *                   ldurh	w20, [x1, #255]
+# CHECK-NEXT:  1      4     1.00    *                   ldur	w12, [sp, #255]
+# CHECK-NEXT:  1      4     1.00    *                   ldur	xzr, [x12, #255]
+# CHECK-NEXT:  1      4     1.00    *                   ldursb	x9, [x7, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldursh	x17, [x19, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldursw	x20, [x15, #-256]
+# CHECK-NEXT:  1      4     1.00                  U     prfum	pldl2keep, [sp, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldursb	w19, [x1, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldursh	w15, [x21, #-256]
+# CHECK-NEXT:  1      1     1.00           *            stur	b0, [sp, #1]
+# CHECK-NEXT:  1      1     1.00           *            stur	h12, [x12, #-1]
+# CHECK-NEXT:  1      1     1.00           *            stur	s15, [x0, #255]
+# CHECK-NEXT:  1      1     1.00           *            stur	d31, [x5, #25]
+# CHECK-NEXT:  1      1     1.00           *            stur	q9, [x5]
+# CHECK-NEXT:  1      4     1.00    *                   ldur	b3, [sp]
+# CHECK-NEXT:  1      4     1.00    *                   ldur	h5, [x4, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldur	s7, [x12, #-1]
+# CHECK-NEXT:  1      4     1.00    *                   ldur	d11, [x19, #4]
+# CHECK-NEXT:  1      4     1.00    *                   ldur	q13, [x1, #2]
+# CHECK-NEXT:  2      1     1.00           *            strb	w9, [x2], #255
+# CHECK-NEXT:  2      1     1.00           *            strb	w10, [x3], #1
+# CHECK-NEXT:  2      1     1.00           *            strb	w10, [x3], #-256
+# CHECK-NEXT:  2      1     1.00           *            strh	w9, [x2], #255
+# CHECK-NEXT:  2      1     1.00           *            strh	w9, [x2], #1
+# CHECK-NEXT:  2      1     1.00           *            strh	w10, [x3], #-256
+# CHECK-NEXT:  2      1     1.00           *            str	w19, [sp], #255
+# CHECK-NEXT:  2      1     1.00           *            str	w20, [x30], #1
+# CHECK-NEXT:  2      1     1.00           *            str	w21, [x12], #-256
+# CHECK-NEXT:  2      1     1.00           *            str	xzr, [x9], #255
+# CHECK-NEXT:  2      1     1.00           *            str	x2, [x3], #1
+# CHECK-NEXT:  2      1     1.00           *            str	x19, [x12], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldrb	w9, [x2], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldrb	w10, [x3], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldrb	w10, [x3], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldrh	w9, [x2], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldrh	w9, [x2], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldrh	w10, [x3], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldr	w19, [sp], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldr	w20, [x30], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldr	w21, [x12], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldr	xzr, [x9], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldr	x2, [x3], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldr	x19, [x12], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	xzr, [x9], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	x2, [x3], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	x19, [x12], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	xzr, [x9], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	x2, [x3], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	x19, [x12], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldrsw	xzr, [x9], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldrsw	x2, [x3], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldrsw	x19, [x12], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	wzr, [x9], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	w2, [x3], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	w19, [x12], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	wzr, [x9], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	w2, [x3], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	w19, [x12], #-256
+# CHECK-NEXT:  2      1     1.00           *            str	b0, [x0], #255
+# CHECK-NEXT:  2      1     1.00           *            str	b3, [x3], #1
+# CHECK-NEXT:  2      1     1.00           *            str	b5, [sp], #-256
+# CHECK-NEXT:  2      1     1.00           *            str	h10, [x10], #255
+# CHECK-NEXT:  2      1     1.00           *            str	h13, [x23], #1
+# CHECK-NEXT:  2      1     1.00           *            str	h15, [sp], #-256
+# CHECK-NEXT:  2      1     1.00           *            str	s20, [x20], #255
+# CHECK-NEXT:  2      1     1.00           *            str	s23, [x23], #1
+# CHECK-NEXT:  2      1     1.00           *            str	s25, [x0], #-256
+# CHECK-NEXT:  2      1     1.00           *            str	d20, [x20], #255
+# CHECK-NEXT:  2      1     1.00           *            str	d23, [x23], #1
+# CHECK-NEXT:  2      1     1.00           *            str	d25, [x0], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldr	b0, [x0], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldr	b3, [x3], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldr	b5, [sp], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldr	h10, [x10], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldr	h13, [x23], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldr	h15, [sp], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldr	s20, [x20], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldr	s23, [x23], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldr	s25, [x0], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldr	d20, [x20], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldr	d23, [x23], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldr	d25, [x0], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldr	q20, [x1], #255
+# CHECK-NEXT:  2      4     1.00    *                   ldr	q23, [x9], #1
+# CHECK-NEXT:  2      4     1.00    *                   ldr	q25, [x20], #-256
+# CHECK-NEXT:  2      1     1.00           *            str	q10, [x1], #255
+# CHECK-NEXT:  2      1     1.00           *            str	q22, [sp], #1
+# CHECK-NEXT:  2      1     1.00           *            str	q21, [x20], #-256
+# CHECK-NEXT:  2      4     1.00    *                   ldr	x3, [x4, #0]!
+# CHECK-NEXT:  2      1     1.00           *            strb	w9, [x2, #255]!
+# CHECK-NEXT:  2      1     1.00           *            strb	w10, [x3, #1]!
+# CHECK-NEXT:  2      1     1.00           *            strb	w10, [x3, #-256]!
+# CHECK-NEXT:  2      1     1.00           *            strh	w9, [x2, #255]!
+# CHECK-NEXT:  2      1     1.00           *            strh	w9, [x2, #1]!
+# CHECK-NEXT:  2      1     1.00           *            strh	w10, [x3, #-256]!
+# CHECK-NEXT:  2      1     1.00           *            str	w19, [sp, #255]!
+# CHECK-NEXT:  2      1     1.00           *            str	w20, [x30, #1]!
+# CHECK-NEXT:  2      1     1.00           *            str	w21, [x12, #-256]!
+# CHECK-NEXT:  2      1     1.00           *            str	xzr, [x9, #255]!
+# CHECK-NEXT:  2      1     1.00           *            str	x2, [x3, #1]!
+# CHECK-NEXT:  2      1     1.00           *            str	x19, [x12, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrb	w9, [x2, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrb	w10, [x3, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrb	w10, [x3, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrh	w9, [x2, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrh	w9, [x2, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrh	w10, [x3, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	w19, [sp, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	w20, [x30, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	w21, [x12, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	xzr, [x9, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	x2, [x3, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	x19, [x12, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	xzr, [x9, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	x2, [x3, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	x19, [x12, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	xzr, [x9, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	x2, [x3, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	x19, [x12, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsw	xzr, [x9, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsw	x2, [x3, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsw	x19, [x12, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	wzr, [x9, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	w2, [x3, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsb	w19, [x12, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	wzr, [x9, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	w2, [x3, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldrsh	w19, [x12, #-256]!
+# CHECK-NEXT:  2      1     1.00           *            str	b0, [x0, #255]!
+# CHECK-NEXT:  2      1     1.00           *            str	b3, [x3, #1]!
+# CHECK-NEXT:  2      1     1.00           *            str	b5, [sp, #-256]!
+# CHECK-NEXT:  2      1     1.00           *            str	h10, [x10, #255]!
+# CHECK-NEXT:  2      1     1.00           *            str	h13, [x23, #1]!
+# CHECK-NEXT:  2      1     1.00           *            str	h15, [sp, #-256]!
+# CHECK-NEXT:  2      1     1.00           *            str	s20, [x20, #255]!
+# CHECK-NEXT:  2      1     1.00           *            str	s23, [x23, #1]!
+# CHECK-NEXT:  2      1     1.00           *            str	s25, [x0, #-256]!
+# CHECK-NEXT:  2      1     1.00           *            str	d20, [x20, #255]!
+# CHECK-NEXT:  2      1     1.00           *            str	d23, [x23, #1]!
+# CHECK-NEXT:  2      1     1.00           *            str	d25, [x0, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	b0, [x0, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	b3, [x3, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	b5, [sp, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	h10, [x10, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	h13, [x23, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	h15, [sp, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	s20, [x20, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	s23, [x23, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	s25, [x0, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	d20, [x20, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	d23, [x23, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	d25, [x0, #-256]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	q20, [x1, #255]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	q23, [x9, #1]!
+# CHECK-NEXT:  2      4     1.00    *                   ldr	q25, [x20, #-256]!
+# CHECK-NEXT:  2      1     1.00           *            str	q10, [x1, #255]!
+# CHECK-NEXT:  2      1     1.00           *            str	q22, [sp, #1]!
+# CHECK-NEXT:  2      1     1.00           *            str	q21, [x20, #-256]!
+# CHECK-NEXT:  1      1     1.00           *            sttrb	w9, [sp]
+# CHECK-NEXT:  1      1     1.00           *            sttrh	wzr, [x12, #255]
+# CHECK-NEXT:  1      1     1.00           *            sttr	w16, [x0, #-256]
+# CHECK-NEXT:  1      1     1.00           *            sttr	x28, [x14, #1]
+# CHECK-NEXT:  1      4     1.00    *                   ldtrb	w1, [x20, #255]
+# CHECK-NEXT:  1      4     1.00    *                   ldtrh	w20, [x1, #255]
+# CHECK-NEXT:  1      4     1.00    *                   ldtr	w12, [sp, #255]
+# CHECK-NEXT:  1      4     1.00    *                   ldtr	xzr, [x12, #255]
+# CHECK-NEXT:  1      4     1.00    *                   ldtrsb	x9, [x7, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldtrsh	x17, [x19, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldtrsw	x20, [x15, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldtrsb	w19, [x1, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldtrsh	w15, [x21, #-256]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x4, [x29]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x30, [x12, #32760]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x20, [sp, #8]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	xzr, [sp]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w2, [sp]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w17, [sp, #16380]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w13, [x2, #4]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsw	x2, [x5, #4]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsw	x23, [sp, #16380]
+# CHECK-NEXT:  1      4     1.00    *                   ldrh	w2, [x4]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsh	w23, [x6, #8190]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsh	wzr, [sp, #2]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsh	x29, [x2, #2]
+# CHECK-NEXT:  1      4     1.00    *                   ldrb	w26, [x3, #121]
+# CHECK-NEXT:  1      4     1.00    *                   ldrb	w12, [x2]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsb	w27, [sp, #4095]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsb	xzr, [x15]
+# CHECK-NEXT:  1      1     1.00           *            str	x30, [sp]
+# CHECK-NEXT:  1      1     1.00           *            str	w20, [x4, #16380]
+# CHECK-NEXT:  1      1     1.00           *            strh	w17, [sp, #8190]
+# CHECK-NEXT:  1      1     1.00           *            strb	w23, [x3, #4095]
+# CHECK-NEXT:  1      1     1.00           *            strb	wzr, [x2]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	b31, [sp, #4095]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	h20, [x2, #8190]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	s10, [x19, #16380]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	d3, [x10, #32760]
+# CHECK-NEXT:  1      1     1.00           *            str	q12, [sp, #65520]
+# CHECK-NEXT:  1      4     1.00    *                   ldrb	w3, [sp, x5]
+# CHECK-NEXT:  1      4     1.00    *                   ldrb	w9, [x27, x6]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsb	w10, [x30, x7]
+# CHECK-NEXT:  1      4     1.00    *                   ldrb	w11, [x29, x3, sxtx]
+# CHECK-NEXT:  1      1     1.00           *            strb	w12, [x28, xzr, sxtx]
+# CHECK-NEXT:  1      4     1.00    *                   ldrb	w14, [x26, w6, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsb	w15, [x25, w7, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldrb	w17, [x23, w9, sxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsb	x18, [x22, w10, sxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsh	w3, [sp, x5]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsh	w9, [x27, x6]
+# CHECK-NEXT:  1      4     1.00    *                   ldrh	w10, [x30, x7, lsl #1]
+# CHECK-NEXT:  1      1     1.00           *            strh	w11, [x29, x3, sxtx]
+# CHECK-NEXT:  1      4     1.00    *                   ldrh	w12, [x28, xzr, sxtx]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsh	x13, [x27, x5, sxtx #1]
+# CHECK-NEXT:  1      4     1.00    *                   ldrh	w14, [x26, w6, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldrh	w15, [x25, w7, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsh	w16, [x24, w8, uxtw #1]
+# CHECK-NEXT:  1      4     1.00    *                   ldrh	w17, [x23, w9, sxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldrh	w18, [x22, w10, sxtw]
+# CHECK-NEXT:  1      1     1.00           *            strh	w19, [x21, wzr, sxtw #1]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w3, [sp, x5]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	s9, [x27, x6]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w10, [x30, x7, lsl #2]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w11, [x29, x3, sxtx]
+# CHECK-NEXT:  1      1     1.00           *            str	s12, [x28, xzr, sxtx]
+# CHECK-NEXT:  1      1     1.00           *            str	w13, [x27, x5, sxtx #2]
+# CHECK-NEXT:  1      1     1.00           *            str	w14, [x26, w6, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w15, [x25, w7, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w16, [x24, w8, uxtw #2]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsw	x17, [x23, w9, sxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	w18, [x22, w10, sxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldrsw	x19, [x21, wzr, sxtw #2]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x3, [sp, x5]
+# CHECK-NEXT:  1      1     1.00           *            str	x9, [x27, x6]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	d10, [x30, x7, lsl #3]
+# CHECK-NEXT:  1      1     1.00           *            str	x11, [x29, x3, sxtx]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x12, [x28, xzr, sxtx]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x13, [x27, x5, sxtx #3]
+# CHECK-NEXT:  1      4     1.00                  U     prfm	pldl1keep, [x26, w6, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x15, [x25, w7, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x16, [x24, w8, uxtw #3]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x17, [x23, w9, sxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	x18, [x22, w10, sxtw]
+# CHECK-NEXT:  1      1     1.00           *            str	d19, [x21, wzr, sxtw #3]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	q3, [sp, x5]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	q9, [x27, x6]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	q10, [x30, x7, lsl #4]
+# CHECK-NEXT:  1      1     1.00           *            str	q11, [x29, x3, sxtx]
+# CHECK-NEXT:  1      1     1.00           *            str	q12, [x28, xzr, sxtx]
+# CHECK-NEXT:  1      1     1.00           *            str	q13, [x27, x5, sxtx #4]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	q14, [x26, w6, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	q15, [x25, w7, uxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	q16, [x24, w8, uxtw #4]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	q17, [x23, w9, sxtw]
+# CHECK-NEXT:  1      1     1.00           *            str	q18, [x22, w10, sxtw]
+# CHECK-NEXT:  1      4     1.00    *                   ldr	q19, [x21, wzr, sxtw #4]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	w3, w5, [sp]
+# CHECK-NEXT:  1      1     1.00           *            stp	wzr, w9, [sp, #252]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	w2, wzr, [sp, #-256]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	w9, w10, [sp, #4]
+# CHECK-NEXT:  2      4     2.00    *                   ldpsw	x9, x10, [sp, #4]
+# CHECK-NEXT:  2      4     2.00    *                   ldpsw	x9, x10, [x2, #-256]
+# CHECK-NEXT:  2      4     2.00    *                   ldpsw	x20, x30, [sp, #252]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	x21, x29, [x2, #504]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	x22, x23, [x3, #-512]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	x24, x25, [x4, #8]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	s29, s28, [sp, #252]
+# CHECK-NEXT:  1      1     1.00           *            stp	s27, s26, [sp, #-256]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	s1, s2, [x3, #44]
+# CHECK-NEXT:  1      1     1.00           *            stp	d3, d5, [x9, #504]
+# CHECK-NEXT:  1      1     1.00           *            stp	d7, d11, [x10, #-512]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	d2, d3, [x30, #-8]
+# CHECK-NEXT:  1      1     1.00           *            stp	q3, q5, [sp]
+# CHECK-NEXT:  1      1     1.00           *            stp	q17, q19, [sp, #1008]
+# CHECK-NEXT:  2      4     2.00    *                   ldp	q23, q29, [x1, #-1024]
+# CHECK-NEXT:  3      4     2.00    *                   ldp	w3, w5, [sp], #0
+# CHECK-NEXT:  2      1     1.00           *            stp	wzr, w9, [sp], #252
+# CHECK-NEXT:  3      4     2.00    *                   ldp	w2, wzr, [sp], #-256
+# CHECK-NEXT:  3      4     2.00    *                   ldp	w9, w10, [sp], #4
+# CHECK-NEXT:  3      4     2.00    *                   ldpsw	x9, x10, [sp], #4
+# CHECK-NEXT:  3      4     2.00    *                   ldpsw	x9, x10, [x2], #-256
+# CHECK-NEXT:  3      4     2.00    *                   ldpsw	x20, x30, [sp], #252
+# CHECK-NEXT:  3      4     2.00    *                   ldp	x21, x29, [x2], #504
+# CHECK-NEXT:  3      4     2.00    *                   ldp	x22, x23, [x3], #-512
+# CHECK-NEXT:  3      4     2.00    *                   ldp	x24, x25, [x4], #8
+# CHECK-NEXT:  3      4     2.00    *                   ldp	s29, s28, [sp], #252
+# CHECK-NEXT:  2      1     1.00           *            stp	s27, s26, [sp], #-256
+# CHECK-NEXT:  3      4     2.00    *                   ldp	s1, s2, [x3], #44
+# CHECK-NEXT:  2      1     1.00           *            stp	d3, d5, [x9], #504
+# CHECK-NEXT:  2      1     1.00           *            stp	d7, d11, [x10], #-512
+# CHECK-NEXT:  3      4     2.00    *                   ldp	d2, d3, [x30], #-8
+# CHECK-NEXT:  2      1     1.00           *            stp	q3, q5, [sp], #0
+# CHECK-NEXT:  2      1     1.00           *            stp	q17, q19, [sp], #1008
+# CHECK-NEXT:  3      4     2.00    *                   ldp	q23, q29, [x1], #-1024
+# CHECK-NEXT:  3      4     2.00    *                   ldp	w3, w5, [sp, #0]!
+# CHECK-NEXT:  2      1     1.00           *            stp	wzr, w9, [sp, #252]!
+# CHECK-NEXT:  3      4     2.00    *                   ldp	w2, wzr, [sp, #-256]!
+# CHECK-NEXT:  3      4     2.00    *                   ldp	w9, w10, [sp, #4]!
+# CHECK-NEXT:  3      4     2.00    *                   ldpsw	x9, x10, [sp, #4]!
+# CHECK-NEXT:  3      4     2.00    *                   ldpsw	x9, x10, [x2, #-256]!
+# CHECK-NEXT:  3      4     2.00    *                   ldpsw	x20, x30, [sp, #252]!
+# CHECK-NEXT:  3      4     2.00    *                   ldp	x21, x29, [x2, #504]!
+# CHECK-NEXT:  3      4     2.00    *                   ldp	x22, x23, [x3, #-512]!
+# CHECK-NEXT:  3      4     2.00    *                   ldp	x24, x25, [x4, #8]!
+# CHECK-NEXT:  3      4     2.00    *                   ldp	s29, s28, [sp, #252]!
+# CHECK-NEXT:  2      1     1.00           *            stp	s27, s26, [sp, #-256]!
+# CHECK-NEXT:  3      4     2.00    *                   ldp	s1, s2, [x3, #44]!
+# CHECK-NEXT:  2      1     1.00           *            stp	d3, d5, [x9, #504]!
+# CHECK-NEXT:  2      1     1.00           *            stp	d7, d11, [x10, #-512]!
+# CHECK-NEXT:  3      4     2.00    *                   ldp	d2, d3, [x30, #-8]!
+# CHECK-NEXT:  2      1     1.00           *            stp	q3, q5, [sp, #0]!
+# CHECK-NEXT:  2      1     1.00           *            stp	q17, q19, [sp, #1008]!
+# CHECK-NEXT:  3      4     2.00    *                   ldp	q23, q29, [x1, #-1024]!
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	w3, w5, [sp]
+# CHECK-NEXT:  1      1     1.00           *            stnp	wzr, w9, [sp, #252]
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	w2, wzr, [sp, #-256]
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	w9, w10, [sp, #4]
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	x21, x29, [x2, #504]
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	x22, x23, [x3, #-512]
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	x24, x25, [x4, #8]
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	s29, s28, [sp, #252]
+# CHECK-NEXT:  1      1     1.00           *            stnp	s27, s26, [sp, #-256]
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	s1, s2, [x3, #44]
+# CHECK-NEXT:  1      1     1.00           *            stnp	d3, d5, [x9, #504]
+# CHECK-NEXT:  1      1     1.00           *            stnp	d7, d11, [x10, #-512]
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	d2, d3, [x30, #-8]
+# CHECK-NEXT:  1      1     1.00           *            stnp	q3, q5, [sp]
+# CHECK-NEXT:  1      1     1.00           *            stnp	q17, q19, [sp, #1008]
+# CHECK-NEXT:  2      4     2.00    *                   ldnp	q23, q29, [x1, #-1024]
+# CHECK-NEXT:  1      1     1.00                        mov	w3, #983055
+# CHECK-NEXT:  1      1     1.00                        mov	x10, #-6148914691236517206
+# CHECK-NEXT:  1      1     1.00                        and	w12, w23, w21
+# CHECK-NEXT:  1      2     1.00                        and	w16, w15, w1, lsl #1
+# CHECK-NEXT:  1      2     1.00                        and	w9, w4, w10, lsl #31
+# CHECK-NEXT:  1      1     1.00                        and	w3, w30, w11
+# CHECK-NEXT:  1      2     1.00                        and	x3, x5, x7, lsl #63
+# CHECK-NEXT:  1      2     1.00                        and	x5, x14, x19, asr #4
+# CHECK-NEXT:  1      2     1.00                        and	w3, w17, w19, ror #31
+# CHECK-NEXT:  1      2     1.00                        and	w0, w2, wzr, lsr #17
+# CHECK-NEXT:  1      2     1.00                        and	w3, w30, w11, asr #2
+# CHECK-NEXT:  1      1     1.00                        and	xzr, x4, x26
+# CHECK-NEXT:  1      2     1.00                        and	w3, wzr, w20, ror #2
+# CHECK-NEXT:  1      2     1.00                        and	x7, x20, xzr, asr #63
+# CHECK-NEXT:  1      2     1.00                        bic	x13, x20, x14, lsl #47
+# CHECK-NEXT:  1      1     1.00                        bic	w2, w7, w9
+# CHECK-NEXT:  1      2     1.00                        orr	w2, w7, w0, asr #31
+# CHECK-NEXT:  1      2     1.00                        orr	x8, x9, x10, lsl #12
+# CHECK-NEXT:  1      2     1.00                        orn	x3, x5, x7, asr #2
+# CHECK-NEXT:  1      1     1.00                        orn	w2, w5, w29
+# CHECK-NEXT:  1      2     1.00                        ands	w7, wzr, w9, lsl #1
+# CHECK-NEXT:  1      2     1.00                        ands	x3, x5, x20, ror #63
+# CHECK-NEXT:  1      1     1.00                        bics	w3, w5, w7
+# CHECK-NEXT:  1      2     1.00                        bics	x3, xzr, x3, lsl #1
+# CHECK-NEXT:  1      2     1.00                        tst	w3, w7, lsl #31
+# CHECK-NEXT:  1      2     1.00                        tst	x2, x20, asr #2
+# CHECK-NEXT:  1      1     1.00                        mov	x3, x6
+# CHECK-NEXT:  1      1     1.00                        mov	x3, xzr
+# CHECK-NEXT:  1      1     1.00                        mov	wzr, w2
+# CHECK-NEXT:  1      1     1.00                        mov	w3, w5
+# CHECK-NEXT:  1      1     1.00                        movz	w2, #0, lsl #16
+# CHECK-NEXT:  1      1     1.00                        mov	w2, #-1235
+# CHECK-NEXT:  1      1     1.00                        mov	x2, #5299989643264
+# CHECK-NEXT:  1      1     1.00                        mov	x2, #0
+# CHECK-NEXT:  1      1     1.00                        movk	w3, #0
+# CHECK-NEXT:  1      1     1.00                        movz	x4, #0, lsl #16
+# CHECK-NEXT:  1      1     1.00                        movk	w5, #0, lsl #16
+# CHECK-NEXT:  1      1     1.00                        movz	x6, #0, lsl #32
+# CHECK-NEXT:  1      1     1.00                        movk	x7, #0, lsl #32
+# CHECK-NEXT:  1      1     1.00                        movz	x8, #0, lsl #48
+# CHECK-NEXT:  1      1     1.00                        movk	x9, #0, lsl #48
+# CHECK-NEXT:  1      1     1.00                        adr	x2, #1600
+# CHECK-NEXT:  1      1     1.00                        adrp	x21, #6553600
+# CHECK-NEXT:  1      1     1.00                        adr	x0, #262144
+# CHECK-NEXT:  1      1     1.00                        tbz	x12, #62, #0
+# CHECK-NEXT:  1      1     1.00                        tbz	x12, #62, #4
+# CHECK-NEXT:  1      1     1.00                        tbz	x12, #62, #-32768
+# CHECK-NEXT:  1      1     1.00                        tbnz	x12, #60, #32764
+# CHECK-NEXT:  1      1     1.00                        b	#4
+# CHECK-NEXT:  1      1     1.00                        b	#-4
+# CHECK-NEXT:  1      1     1.00                        b	#134217724
+# CHECK-NEXT:  1      1     1.00                        br	x20
+# CHECK-NEXT:  1      1     1.00                        blr	xzr
+# CHECK-NEXT:  1      1     1.00                  U     ret	x10
+# CHECK-NEXT:  1      1     1.00                  U     ret
+# CHECK-NEXT:  1      1     1.00                  U     eret
+# CHECK-NEXT:  1      1     1.00                  U     drps
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - CortexA320UnitALU
+# CHECK-NEXT: [1]   - CortexA320UnitB
+# CHECK-NEXT: [2]   - CortexA320UnitDiv
+# CHECK-NEXT: [3]   - CortexA320UnitLdSt
+# CHECK-NEXT: [4]   - CortexA320UnitMAC
+# CHECK-NEXT: [5]   - CortexA320UnitPAC
+# CHECK-NEXT: [6]   - CortexA320UnitVALU
+# CHECK-NEXT: [7]   - CortexA320UnitVMAC
+# CHECK-NEXT: [8]   - CortexA320UnitVMC
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT: 465.00 22.00  64.00  450.00 69.00   -     221.00 12.00  87.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w2, w3, #4095
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w30, w29, #1, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w13, w5, #4095, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x5, x7, #1638
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w20, wsp, #801
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	wsp, wsp, #1104
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	wsp, w30, #4084
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x0, x24, #291
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x3, x24, #4095, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x8, sp, #1074
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	sp, x29, #3816
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w0, wsp, #4077
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w4, w20, #546, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	sp, sp, #288
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	wsp, w19, #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w13, w23, #291, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w2, #4095
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w20, wsp, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x3, #1, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	sp, #20, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x30, #4095
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x4, sp, #3822
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w3, #291, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	wsp, #1365
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	sp, #1092, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	sp, x30
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	wsp, w20
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	x11, sp
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	w24, wsp
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w3, w5, w7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	wzr, w3, w5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w20, wzr, w4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w4, w6, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w11, w13, w15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w9, w3, wzr, lsl #10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w17, w29, w20, lsl #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w21, w22, w23, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w24, w25, w26, lsr #18
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w27, w28, w29, lsr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w2, w3, w4, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w5, w6, w7, asr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	w8, w9, w10, asr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x3, x5, x7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	xzr, x3, x5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x20, xzr, x4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x4, x6, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x11, x13, x15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x9, x3, xzr, lsl #10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x17, x29, x20, lsl #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x21, x22, x23, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x24, x25, x26, lsr #18
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x27, x28, x29, lsr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x2, x3, x4, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x5, x6, x7, asr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     add	x8, x9, x10, asr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w3, w5, w7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w3, w5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w20, wzr, w4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w4, w6, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w11, w13, w15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w9, w3, wzr, lsl #10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w17, w29, w20, lsl #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w21, w22, w23, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w24, w25, w26, lsr #18
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w27, w28, w29, lsr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w2, w3, w4, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w5, w6, w7, asr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	w8, w9, w10, asr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x3, x5, x7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x3, x5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x20, xzr, x4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x4, x6, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x11, x13, x15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x9, x3, xzr, lsl #10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x17, x29, x20, lsl #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x21, x22, x23, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x24, x25, x26, lsr #18
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x27, x28, x29, lsr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x2, x3, x4, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x5, x6, x7, asr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adds	x8, x9, x10, asr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w3, w5, w7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	wzr, w3, w5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w4, w6, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w11, w13, w15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w9, w3, wzr, lsl #10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w17, w29, w20, lsl #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w21, w22, w23, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w24, w25, w26, lsr #18
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w27, w28, w29, lsr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w2, w3, w4, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w5, w6, w7, asr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	w8, w9, w10, asr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x3, x5, x7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	xzr, x3, x5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x4, x6, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x11, x13, x15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x9, x3, xzr, lsl #10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x17, x29, x20, lsl #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x21, x22, x23, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x24, x25, x26, lsr #18
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x27, x28, x29, lsr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x2, x3, x4, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x5, x6, x7, asr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sub	x8, x9, x10, asr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w3, w5, w7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w3, w5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w4, w6, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w11, w13, w15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w9, w3, wzr, lsl #10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w17, w29, w20, lsl #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w21, w22, w23, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w24, w25, w26, lsr #18
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w27, w28, w29, lsr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w2, w3, w4, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w5, w6, w7, asr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	w8, w9, w10, asr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x3, x5, x7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x3, x5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x4, x6, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x11, x13, x15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x9, x3, xzr, lsl #10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x17, x29, x20, lsl #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x21, x22, x23, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x24, x25, x26, lsr #18
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x27, x28, x29, lsr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x2, x3, x4, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x5, x6, x7, asr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     subs	x8, x9, x10, asr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	wzr, w4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w5, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w6, w7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w8, w9, lsl #15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w10, w11, lsl #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w12, w13, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w14, w15, lsr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w16, w17, lsr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w18, w19, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w20, w21, asr #22
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	w22, w23, asr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x0, x3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	xzr, x4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x5, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x6, x7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x8, x9, lsl #15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x10, x11, lsl #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x12, x13, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x14, x15, lsr #41
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x16, x17, lsr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x18, x19, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x20, x21, asr #55
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmn	x22, x23, asr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w0, w3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	wzr, w4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w5, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w6, w7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w8, w9, lsl #15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w10, w11, lsl #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w12, w13, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w14, w15, lsr #21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w18, w19, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w20, w21, asr #22
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	w22, w23, asr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x0, x3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	xzr, x4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x5, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x6, x7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x8, x9, lsl #15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x10, x11, lsl #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x12, x13, lsr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x14, x15, lsr #41
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x16, x17, lsr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x18, x19, asr #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x20, x21, asr #55
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	x22, x23, asr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	wzr, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cmp	xzr, x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adc	w29, w27, w25
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adc	wzr, w3, w4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adc	w9, wzr, w10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adc	w20, w0, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adc	x29, x27, x25
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adc	xzr, x3, x4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adc	x9, xzr, x10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adc	x20, x0, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adcs	w29, w27, w25
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adcs	wzr, w3, w4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adcs	w9, wzr, w10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adcs	w20, w0, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adcs	x29, x27, x25
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adcs	xzr, x3, x4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adcs	x9, xzr, x10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adcs	x20, x0, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbc	w29, w27, w25
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbc	wzr, w3, w4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngc	w9, w10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbc	w20, w0, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbc	x29, x27, x25
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbc	xzr, x3, x4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngc	x9, x10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbc	x20, x0, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbcs	w29, w27, w25
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbcs	wzr, w3, w4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngcs	w9, w10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbcs	w20, w0, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbcs	x29, x27, x25
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbcs	xzr, x3, x4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngcs	x9, x10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbcs	x20, x0, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngc	w3, w12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngc	wzr, w9
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngc	w23, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngc	x29, x30
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngc	xzr, x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngc	x0, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngcs	w3, w12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngcs	wzr, w9
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngcs	w23, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngcs	x29, x30
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngcs	xzr, x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ngcs	x0, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfx	x1, x2, #3, #2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	x3, x4, #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	wzr, wzr, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfx	w12, w9, #0, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ubfiz	x4, x5, #52, #11
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ubfx	xzr, x4, #0, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ubfiz	x4, xzr, #1, #6
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	x5, x6, #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfi	x4, x5, #52, #11
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	xzr, x4, #0, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfc	x4, #1, #6
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	x5, x6, #12, #52
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sxtb	w1, w2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sxtb	xzr, w3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sxth	w9, w10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sxth	x0, w1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sxtw	x3, w30
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uxtb	w1, w2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uxth	w9, w10
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ubfx	x3, x30, #0, #32
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	w3, w2, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	w9, w10, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	x20, x21, #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	w1, wzr, #3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	w3, w2, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	w9, w10, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	x20, x21, #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	wzr, wzr, #3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	w3, w2, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	w9, w10, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	x20, x21, #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	w1, wzr, #3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfx	w9, w10, #0, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfiz	x2, x3, #63, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	x19, x20, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfiz	x9, x10, #5, #59
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	w9, w10, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfiz	w11, w12, #31, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfiz	w13, w14, #29, #3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfiz	xzr, xzr, #10, #11
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfx	w9, w10, #0, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	x2, x3, #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	x19, x20, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	x9, x10, #5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	w9, w10, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	w11, w12, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	w13, w14, #29
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sbfx	xzr, xzr, #10, #11
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	w9, w10, #0, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfi	x2, x3, #63, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	x19, x20, #0, #64
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfi	x9, x10, #5, #59
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	w9, w10, #0, #32
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfi	w11, w12, #31, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfi	w13, w14, #29, #3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfc	xzr, #10, #11
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	w9, w10, #0, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	x2, x3, #63, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	x19, x20, #0, #64
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	x9, x10, #5, #59
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	w9, w10, #0, #32
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	w11, w12, #31, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	w13, w14, #29, #3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bfxil	xzr, xzr, #10, #11
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ubfx	w9, w10, #0, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	x2, x3, #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	x19, x20, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	x9, x10, #5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	w9, w10, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	w11, w12, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	w13, w14, #29
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ubfiz	xzr, xzr, #10, #11
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ubfx	w9, w10, #0, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	x2, x3, #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	x19, x20, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	x9, x10, #5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	w9, w10, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	w11, w12, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	w13, w14, #29
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ubfx	xzr, xzr, #10, #11
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     cbz	w5, #4
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     cbz	x5, #0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     cbnz	x2, #-4
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     cbnz	x26, #1048572
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     cbz	wzr, #0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     cbnz	xzr, #0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     b.ne	#4
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     b.ge	#1048572
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     b.ge	#-4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	w1, #31, #0, eq
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	w3, #0, #15, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	wzr, #15, #13, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	x9, #31, #0, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	x3, #0, #15, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	xzr, #5, #7, ne
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	w1, #31, #0, eq
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	w3, #0, #15, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	wzr, #15, #13, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	x9, #31, #0, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	x3, #0, #15, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	xzr, #5, #7, ne
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	w1, wzr, #0, eq
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	w3, w0, #15, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	wzr, w15, #13, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	x9, xzr, #0, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	x3, x0, #15, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmp	xzr, x5, #7, ne
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	w1, wzr, #0, eq
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	w3, w0, #15, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	wzr, w15, #13, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	x9, xzr, #0, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	x3, x0, #15, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ccmn	xzr, x5, #7, ne
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csel	w1, w0, w19, ne
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csel	wzr, w5, w9, eq
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csel	w9, wzr, w30, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csel	w1, w28, wzr, mi
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csel	x19, x23, x29, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csel	xzr, x3, x4, ge
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csel	x5, xzr, x6, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csel	x7, x8, xzr, lo
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	w1, w0, w19, ne
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	wzr, w5, w9, eq
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	w9, wzr, w30, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	w1, w28, wzr, mi
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	x19, x23, x29, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	xzr, x3, x4, ge
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	x5, xzr, x6, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	x7, x8, xzr, lo
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	w1, w0, w19, ne
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	wzr, w5, w9, eq
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	w9, wzr, w30, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	w1, w28, wzr, mi
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	x19, x23, x29, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	xzr, x3, x4, ge
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	x5, xzr, x6, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	x7, x8, xzr, lo
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csneg	w1, w0, w19, ne
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csneg	wzr, w5, w9, eq
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csneg	w9, wzr, w30, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csneg	w1, w28, wzr, mi
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csneg	x19, x23, x29, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csneg	xzr, x3, x4, ge
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csneg	x5, xzr, x6, hs
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csneg	x7, x8, xzr, lo
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cset	w3, eq
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cset	x9, pl
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csetm	w20, ne
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csetm	x30, ge
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	w2, wzr, wzr, al
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	x3, xzr, xzr, nv
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cinc	w3, w5, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cinc	wzr, w4, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cset	w9, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cinc	x3, x5, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cinc	xzr, x4, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cset	x9, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	w5, w6, w6, nv
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinc	x1, x2, x2, al
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cinv	w3, w5, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cinv	wzr, w4, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csetm	w9, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cinv	x3, x5, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cinv	xzr, x4, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csetm	x9, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	x1, x0, x0, al
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	w9, w8, w8, nv
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cneg	w3, w5, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cneg	wzr, w4, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cneg	w9, wzr, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cneg	x3, x5, gt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cneg	xzr, x4, le
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cneg	x9, xzr, lt
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csneg	x4, x8, x8, al
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     csinv	w9, w8, w8, nv
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rbit	w0, w7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rbit	x18, x3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rev16	w17, w1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rev16	x5, x2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rev	w18, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rev32	x20, x1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rev	x22, x2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     clz	w24, w3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     clz	x26, x4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cls	w3, w5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cls	x20, x5
+# CHECK-NEXT:  -      -     12.00   -      -      -      -      -      -     udiv	w0, w7, w10
+# CHECK-NEXT:  -      -     20.00   -      -      -      -      -      -     udiv	x9, x22, x4
+# CHECK-NEXT:  -      -     12.00   -      -      -      -      -      -     sdiv	w12, w21, w0
+# CHECK-NEXT:  -      -     20.00   -      -      -      -      -      -     sdiv	x13, x2, x1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	w11, w12, w13
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	x14, x15, x16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	w17, w18, w19
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	x20, x21, x22
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	w23, w24, w25
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	x26, x27, x28
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ror	w0, w1, w2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ror	x3, x4, x5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	w6, w7, w8
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsl	x9, x10, x11
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	w12, w13, w14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     lsr	x15, x16, x17
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	w18, w19, w20
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     asr	x21, x22, x23
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ror	w24, w25, w26
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ror	x27, x28, x29
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     smulh	x30, x29, x28
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     smulh	xzr, x27, x26
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     umulh	x30, x29, x28
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     umulh	x23, x30, xzr
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     madd	w1, w3, w7, w4
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     madd	wzr, w0, w9, w11
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     madd	w13, wzr, w4, w4
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     madd	w19, w30, wzr, w29
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     mul	w4, w5, w6
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     madd	x1, x3, x7, x4
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     madd	xzr, x0, x9, x11
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     madd	x13, xzr, x4, x4
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     madd	x19, x30, xzr, x29
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     mul	x4, x5, x6
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     msub	w1, w3, w7, w4
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     msub	wzr, w0, w9, w11
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     msub	w13, wzr, w4, w4
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     msub	w19, w30, wzr, w29
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     mneg	w4, w5, w6
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     msub	x1, x3, x7, x4
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     msub	xzr, x0, x9, x11
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     msub	x13, xzr, x4, x4
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     msub	x19, x30, xzr, x29
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     mneg	x4, x5, x6
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smaddl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smull	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smsubl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smsubl	xzr, w10, w11, x12
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smsubl	x13, wzr, w14, x15
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smnegl	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umaddl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umull	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umsubl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umnegl	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     smulh	x30, x29, x28
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     smulh	x23, x22, xzr
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     umulh	x23, x22, xzr
+# CHECK-NEXT:  -      -      -      -     2.00    -      -      -      -     mul	x19, x20, xzr
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     mneg	w21, w22, w23
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smull	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umull	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     smnegl	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -     umnegl	x11, w13, w17
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     extr	w3, w5, w7, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     extr	w11, w13, w17, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     extr	x3, x5, x7, #15
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     extr	x11, x13, x17, #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ror	x19, x23, #24
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ror	x29, xzr, #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ror	w9, w13, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmp	s3, s5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmp	s31, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmp	s31, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmpe	s29, s30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmpe	s15, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmpe	s15, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmp	d4, d12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmp	d23, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmp	d23, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmpe	d26, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmpe	d29, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmpe	d29, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmp	s1, s31, #0, eq
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmp	s3, s0, #15, hs
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmp	s31, s15, #13, hs
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmp	d9, d31, #0, le
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmp	d3, d0, #15, gt
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmp	d31, d5, #7, ne
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmpe	s1, s31, #0, eq
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmpe	s3, s0, #15, hs
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmpe	s31, s15, #13, hs
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmpe	d9, d31, #0, le
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmpe	d3, d0, #15, gt
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fccmpe	d31, d5, #7, ne
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcsel	s3, s20, s9, pl
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcsel	d9, d10, d11, mi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	s0, s1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	s2, s3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	s4, s5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     29.00  fsqrt	s6, s7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	d8, s9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	h10, s11
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	s12, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	s14, s15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	s16, s17
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	s18, s19
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	s20, s21
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	s22, s23
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	s24, s25
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	d0, d1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	d2, d3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	d4, d5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     29.00  fsqrt	d6, d7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	s8, d9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	h10, d11
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	d12, d13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	d14, d15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	d16, d17
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	d18, d19
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	d20, d21
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	d22, d23
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	d24, d25
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	s26, h27
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	d28, h29
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	s20, s19, s17
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.00  fdiv	s1, s2, s3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	s4, s5, s6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	s7, s8, s9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	s10, s11, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	s13, s14, s15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	s16, s17, s18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	s19, s20, s21
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmul	s22, s23, s2
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	d20, d19, d17
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  fdiv	d1, d2, d3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	d4, d5, d6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	d7, d8, d9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	d10, d11, d12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	d13, d14, d15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	d16, d17, d18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	d19, d20, d21
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmul	d22, d23, d24
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmadd	s3, s5, s6, s31
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmadd	d3, d13, d0, d23
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmsub	s3, s5, s6, s31
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmsub	d3, d13, d0, d23
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmadd	s3, s5, s6, s31
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmadd	d3, d13, d0, d23
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmsub	s3, s5, s6, s31
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmsub	d3, d13, d0, d23
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	w3, h5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	wzr, h20, #13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	w19, h0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x3, h5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x12, h30, #45
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x19, h0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	w3, s5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	wzr, s20, #13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	w19, s0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x3, s5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x12, s30, #45
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x19, s0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	w3, d5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	wzr, d20, #13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	w19, d0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x3, d5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x12, d30, #45
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x19, d0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	w3, h5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	wzr, h20, #13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	w19, h0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x3, h5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x12, h30, #45
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x19, h0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	w3, s5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	wzr, s20, #13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	w19, s0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x3, s5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x12, s30, #45
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x19, s0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	w3, d5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	wzr, d20, #13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	w19, d0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x3, d5, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x12, d30, #45
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x19, d0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	h23, w19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	h31, wzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	h14, w0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	h23, x19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	h31, xzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	h14, x0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s23, w19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s31, wzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s14, w0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s23, x19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s31, xzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s14, x0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d23, w19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d31, wzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d14, w0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d23, x19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d31, xzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d14, x0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	h23, w19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	h31, wzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	h14, w0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	h23, x19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	h31, xzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	h14, x0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	s23, w19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	s31, wzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	s14, w0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	s23, x19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	s31, xzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	s14, x0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d23, w19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d31, wzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d14, w0, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d23, x19, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d31, xzr, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d14, x0, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	w3, h31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	xzr, h12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	wzr, h12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	x0, h0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	wzr, h9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	x12, h20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	w30, h23
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	x29, h3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	w2, h3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	x4, h5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	w6, h7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	x8, h9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	w10, h11
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x12, h13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	w14, h15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x15, h16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	h17, w18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	h19, x20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	h21, w22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	h23, x24
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	w25, h26
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	x27, h28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	w29, h30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	xzr, h0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	w3, s31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	xzr, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	wzr, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	x0, s0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	wzr, s9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	x12, s20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	w30, s23
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	x29, s3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	w2, s3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	x4, s5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	w6, s7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	x8, s9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	w10, s11
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x12, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	w14, s15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x15, s16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s17, w18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s19, x20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	s21, w22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s23, x24
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	w25, s26
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	x27, s28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	w29, s30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	xzr, s0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	w3, d31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	xzr, d12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	wzr, d12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	x0, d0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	wzr, d9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	x12, d20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	w30, d23
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	x29, d3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	w2, d3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	x4, d5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	w6, d7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	x8, d9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	w10, d11
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	x12, d13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	w14, d15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	x15, d16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d17, w18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d19, x20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d21, w22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d23, x24
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	w25, d26
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	x27, d28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	w29, d30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	xzr, d0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	w3, s9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	s9, w3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	x20, d31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	d1, x15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	x3, v12.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	v1.d[1], x19
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	s2, #0.12500000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	s3, #1.00000000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	d30, #16.00000000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	s4, #1.06250000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	d10, #1.93750000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	s12, #-1.00000000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	d16, #8.50000000
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w3, #0
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x29, #4
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	xzr, #-4
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	s0, #8
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	d0, #1048572
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q0, #-1048576
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     prfm	pldl1strm, #0
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     prfm	#22, #0
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stxrb	w18, w8, [sp]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stxrh	w24, w15, [x16]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stxr	w5, w6, [x17]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stxr	w1, x10, [x21]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldxrb	w30, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldxrh	w17, [x4]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldxr	w22, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldxr	x11, [x29]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldxr	x11, [x29]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldxr	x11, [x29]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stxp	w12, w11, w10, [sp]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stxp	wzr, x27, x9, [x12]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldxp	w0, wzr, [sp]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldxp	x17, x0, [x18]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldxp	x17, x0, [x18]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stlxrb	w12, w22, [x0]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stlxrh	w10, w1, [x1]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stlxr	w9, w2, [x2]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stlxr	w9, x3, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldaxrb	w8, [x4]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldaxrh	w7, [x5]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldaxr	w6, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldaxr	x5, [x6]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldaxr	x5, [x6]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldaxr	x5, [x6]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stlxp	w4, w5, w6, [sp]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     stlxp	wzr, x6, x7, [x1]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldaxp	w5, w18, [sp]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldaxp	x6, x19, [x22]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldaxp	x6, x19, [x22]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stlrb	w24, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stlrh	w25, [x30]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stlr	w26, [x29]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stlr	x27, [x28]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stlr	x27, [x28]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stlr	x27, [x28]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldarb	w23, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldarh	w22, [x30]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldar	wzr, [x29]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldar	x21, [x28]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldar	x21, [x28]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldar	x21, [x28]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     sturb	w9, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     sturh	wzr, [x12, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stur	w16, [x0, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stur	x28, [x14, #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldurb	w1, [x20, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldurh	w20, [x1, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldur	w12, [sp, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldur	xzr, [x12, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldursb	x9, [x7, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldursh	x17, [x19, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldursw	x20, [x15, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     prfum	pldl2keep, [sp, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldursb	w19, [x1, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldursh	w15, [x21, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stur	b0, [sp, #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stur	h12, [x12, #-1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stur	s15, [x0, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stur	d31, [x5, #25]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stur	q9, [x5]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldur	b3, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldur	h5, [x4, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldur	s7, [x12, #-1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldur	d11, [x19, #4]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldur	q13, [x1, #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strb	w9, [x2], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strb	w10, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strb	w10, [x3], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strh	w9, [x2], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strh	w9, [x2], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strh	w10, [x3], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	w19, [sp], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	w20, [x30], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	w21, [x12], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	xzr, [x9], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	x2, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	x19, [x12], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w9, [x2], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w10, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w10, [x3], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w9, [x2], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w9, [x2], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w10, [x3], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w19, [sp], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w20, [x30], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w21, [x12], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	xzr, [x9], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x2, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x19, [x12], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	xzr, [x9], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	x2, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	x19, [x12], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	xzr, [x9], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	x2, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	x19, [x12], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	xzr, [x9], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	x2, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	x19, [x12], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	wzr, [x9], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	w2, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	w19, [x12], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	wzr, [x9], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	w2, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	w19, [x12], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	b0, [x0], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	b3, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	b5, [sp], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	h10, [x10], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	h13, [x23], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	h15, [sp], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	s20, [x20], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	s23, [x23], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	s25, [x0], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	d20, [x20], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	d23, [x23], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	d25, [x0], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	b0, [x0], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	b3, [x3], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	b5, [sp], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	h10, [x10], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	h13, [x23], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	h15, [sp], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	s20, [x20], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	s23, [x23], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	s25, [x0], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	d20, [x20], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	d23, [x23], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	d25, [x0], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q20, [x1], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q23, [x9], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q25, [x20], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q10, [x1], #255
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q22, [sp], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q21, [x20], #-256
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x3, [x4, #0]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strb	w9, [x2, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strb	w10, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strb	w10, [x3, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strh	w9, [x2, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strh	w9, [x2, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strh	w10, [x3, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	w19, [sp, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	w20, [x30, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	w21, [x12, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	xzr, [x9, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	x2, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	x19, [x12, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w9, [x2, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w10, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w10, [x3, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w9, [x2, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w9, [x2, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w10, [x3, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w19, [sp, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w20, [x30, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w21, [x12, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	xzr, [x9, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x2, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x19, [x12, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	xzr, [x9, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	x2, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	x19, [x12, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	xzr, [x9, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	x2, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	x19, [x12, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	xzr, [x9, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	x2, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	x19, [x12, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	wzr, [x9, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	w2, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	w19, [x12, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	wzr, [x9, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	w2, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	w19, [x12, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	b0, [x0, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	b3, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	b5, [sp, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	h10, [x10, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	h13, [x23, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	h15, [sp, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	s20, [x20, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	s23, [x23, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	s25, [x0, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	d20, [x20, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	d23, [x23, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	d25, [x0, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	b0, [x0, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	b3, [x3, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	b5, [sp, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	h10, [x10, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	h13, [x23, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	h15, [sp, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	s20, [x20, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	s23, [x23, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	s25, [x0, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	d20, [x20, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	d23, [x23, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	d25, [x0, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q20, [x1, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q23, [x9, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q25, [x20, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q10, [x1, #255]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q22, [sp, #1]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q21, [x20, #-256]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     sttrb	w9, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     sttrh	wzr, [x12, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     sttr	w16, [x0, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     sttr	x28, [x14, #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldtrb	w1, [x20, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldtrh	w20, [x1, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldtr	w12, [sp, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldtr	xzr, [x12, #255]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldtrsb	x9, [x7, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldtrsh	x17, [x19, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldtrsw	x20, [x15, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldtrsb	w19, [x1, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldtrsh	w15, [x21, #-256]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x4, [x29]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x30, [x12, #32760]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x20, [sp, #8]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	xzr, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w2, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w17, [sp, #16380]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w13, [x2, #4]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	x2, [x5, #4]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	x23, [sp, #16380]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w2, [x4]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	w23, [x6, #8190]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	wzr, [sp, #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	x29, [x2, #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w26, [x3, #121]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w12, [x2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	w27, [sp, #4095]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	xzr, [x15]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	x30, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	w20, [x4, #16380]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strh	w17, [sp, #8190]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strb	w23, [x3, #4095]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strb	wzr, [x2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	b31, [sp, #4095]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	h20, [x2, #8190]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	s10, [x19, #16380]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	d3, [x10, #32760]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q12, [sp, #65520]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w3, [sp, x5]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w9, [x27, x6]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	w10, [x30, x7]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w11, [x29, x3, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strb	w12, [x28, xzr, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w14, [x26, w6, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	w15, [x25, w7, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrb	w17, [x23, w9, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsb	x18, [x22, w10, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	w3, [sp, x5]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	w9, [x27, x6]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w10, [x30, x7, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strh	w11, [x29, x3, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w12, [x28, xzr, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	x13, [x27, x5, sxtx #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w14, [x26, w6, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w15, [x25, w7, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsh	w16, [x24, w8, uxtw #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w17, [x23, w9, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrh	w18, [x22, w10, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     strh	w19, [x21, wzr, sxtw #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w3, [sp, x5]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	s9, [x27, x6]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w10, [x30, x7, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w11, [x29, x3, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	s12, [x28, xzr, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	w13, [x27, x5, sxtx #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	w14, [x26, w6, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w15, [x25, w7, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w16, [x24, w8, uxtw #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	x17, [x23, w9, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	w18, [x22, w10, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldrsw	x19, [x21, wzr, sxtw #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x3, [sp, x5]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	x9, [x27, x6]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	d10, [x30, x7, lsl #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	x11, [x29, x3, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x12, [x28, xzr, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x13, [x27, x5, sxtx #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     prfm	pldl1keep, [x26, w6, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x15, [x25, w7, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x16, [x24, w8, uxtw #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x17, [x23, w9, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	x18, [x22, w10, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	d19, [x21, wzr, sxtw #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q3, [sp, x5]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q9, [x27, x6]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q10, [x30, x7, lsl #4]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q11, [x29, x3, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q12, [x28, xzr, sxtx]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q13, [x27, x5, sxtx #4]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q14, [x26, w6, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q15, [x25, w7, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q16, [x24, w8, uxtw #4]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q17, [x23, w9, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	q18, [x22, w10, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	q19, [x21, wzr, sxtw #4]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	w3, w5, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	wzr, w9, [sp, #252]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	w2, wzr, [sp, #-256]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	w9, w10, [sp, #4]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldpsw	x9, x10, [sp, #4]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldpsw	x9, x10, [x2, #-256]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldpsw	x20, x30, [sp, #252]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	x21, x29, [x2, #504]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	x22, x23, [x3, #-512]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	x24, x25, [x4, #8]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	s29, s28, [sp, #252]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	s27, s26, [sp, #-256]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	s1, s2, [x3, #44]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	d3, d5, [x9, #504]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	d7, d11, [x10, #-512]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	d2, d3, [x30, #-8]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	q3, q5, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	q17, q19, [sp, #1008]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	q23, q29, [x1, #-1024]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	w3, w5, [sp], #0
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	wzr, w9, [sp], #252
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	w2, wzr, [sp], #-256
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	w9, w10, [sp], #4
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldpsw	x9, x10, [sp], #4
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldpsw	x9, x10, [x2], #-256
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldpsw	x20, x30, [sp], #252
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	x21, x29, [x2], #504
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	x22, x23, [x3], #-512
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	x24, x25, [x4], #8
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	s29, s28, [sp], #252
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	s27, s26, [sp], #-256
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	s1, s2, [x3], #44
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	d3, d5, [x9], #504
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	d7, d11, [x10], #-512
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	d2, d3, [x30], #-8
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	q3, q5, [sp], #0
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	q17, q19, [sp], #1008
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	q23, q29, [x1], #-1024
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	w3, w5, [sp, #0]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	wzr, w9, [sp, #252]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	w2, wzr, [sp, #-256]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	w9, w10, [sp, #4]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldpsw	x9, x10, [sp, #4]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldpsw	x9, x10, [x2, #-256]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldpsw	x20, x30, [sp, #252]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	x21, x29, [x2, #504]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	x22, x23, [x3, #-512]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	x24, x25, [x4, #8]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	s29, s28, [sp, #252]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	s27, s26, [sp, #-256]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	s1, s2, [x3, #44]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	d3, d5, [x9, #504]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	d7, d11, [x10, #-512]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	d2, d3, [x30, #-8]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	q3, q5, [sp, #0]!
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stp	q17, q19, [sp, #1008]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldp	q23, q29, [x1, #-1024]!
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	w3, w5, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnp	wzr, w9, [sp, #252]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	w2, wzr, [sp, #-256]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	w9, w10, [sp, #4]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	x21, x29, [x2, #504]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	x22, x23, [x3, #-512]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	x24, x25, [x4, #8]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	s29, s28, [sp, #252]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnp	s27, s26, [sp, #-256]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	s1, s2, [x3, #44]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnp	d3, d5, [x9, #504]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnp	d7, d11, [x10, #-512]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	d2, d3, [x30, #-8]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnp	q3, q5, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnp	q17, q19, [sp, #1008]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ldnp	q23, q29, [x1, #-1024]
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	w3, #983055
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	x10, #-6148914691236517206
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	w12, w23, w21
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	w16, w15, w1, lsl #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	w9, w4, w10, lsl #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	w3, w30, w11
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	x3, x5, x7, lsl #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	x5, x14, x19, asr #4
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	w3, w17, w19, ror #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	w0, w2, wzr, lsr #17
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	w3, w30, w11, asr #2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	xzr, x4, x26
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	w3, wzr, w20, ror #2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     and	x7, x20, xzr, asr #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bic	x13, x20, x14, lsl #47
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bic	w2, w7, w9
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     orr	w2, w7, w0, asr #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     orr	x8, x9, x10, lsl #12
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     orn	x3, x5, x7, asr #2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     orn	w2, w5, w29
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ands	w7, wzr, w9, lsl #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ands	x3, x5, x20, ror #63
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bics	w3, w5, w7
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     bics	x3, xzr, x3, lsl #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     tst	w3, w7, lsl #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     tst	x2, x20, asr #2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	x3, x6
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	x3, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	wzr, w2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	w3, w5
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     movz	w2, #0, lsl #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	w2, #-1235
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	x2, #5299989643264
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     mov	x2, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     movk	w3, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     movz	x4, #0, lsl #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     movk	w5, #0, lsl #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     movz	x6, #0, lsl #32
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     movk	x7, #0, lsl #32
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     movz	x8, #0, lsl #48
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     movk	x9, #0, lsl #48
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adr	x2, #1600
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adrp	x21, #6553600
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     adr	x0, #262144
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     tbz	x12, #62, #0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     tbz	x12, #62, #4
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     tbz	x12, #62, #-32768
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     tbnz	x12, #60, #32764
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     b	#4
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     b	#-4
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     b	#134217724
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     br	x20
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     blr	xzr
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     ret	x10
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     ret
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     eret
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     drps
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A320-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A320-neon-instructions.s
new file mode 100644
index 000000000000..147da4d2ef07
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A320-neon-instructions.s
@@ -0,0 +1,3208 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a320 -instruction-tables < %s | FileCheck %s
+
+abs d29, d24
+abs v0.16b, v0.16b
+abs v0.2d, v0.2d
+abs v0.2s, v0.2s
+abs v0.4h, v0.4h
+abs v0.4s, v0.4s
+abs v0.8b, v0.8b
+abs v0.8h, v0.8h
+add d17, d31, d29
+add v0.8b, v0.8b, v0.8b
+addhn v0.2s, v0.2d, v0.2d
+addhn v0.4h, v0.4s, v0.4s
+addhn v0.8b, v0.8h, v0.8h
+addhn2 v0.16b, v0.8h, v0.8h
+addhn2 v0.4s, v0.2d, v0.2d
+addhn2 v0.8h, v0.4s, v0.4s
+addp v0.2d, v0.2d, v0.2d
+addp v0.8b, v0.8b, v0.8b
+and v0.8b, v0.8b, v0.8b
+bic v0.4h, #15, lsl #8
+bic v0.8b, v0.8b, v0.8b
+bif v0.16b, v0.16b, v0.16b
+bit v0.16b, v0.16b, v0.16b
+bsl v0.8b, v0.8b, v0.8b
+cls v0.16b, v0.16b
+cls v0.2s, v0.2s
+cls v0.4h, v0.4h
+cls v0.4s, v0.4s
+cls v0.8b, v0.8b
+cls v0.8h, v0.8h
+clz v0.16b, v0.16b
+clz v0.2s, v0.2s
+clz v0.4h, v0.4h
+clz v0.4s, v0.4s
+clz v0.8b, v0.8b
+clz v0.8h, v0.8h
+cmeq d20, d21, 0
+cmeq d20, d21, d22
+cmeq v0.16b, v0.16b, 0
+cmeq v0.16b, v0.16b, v0.16b
+cmge d20, d21, 0
+cmge d20, d21, d22
+cmge v0.4h, v0.4h, v0.4h
+cmge v0.8b, v0.8b, 0
+cmgt d20, d21, 0
+cmgt d20, d21, d22
+cmgt v0.2s, v0.2s, 0
+cmgt v0.4s, v0.4s, v0.4s
+cmhi d20, d21, d22
+cmhi v0.8h, v0.8h, v0.8h
+cmhs d20, d21, d22
+cmhs v0.8b, v0.8b, v0.8b
+cmle d20, d21, 0
+cmle v0.2d, v0.2d, 0
+cmlt d20, d21, 0
+cmlt v0.8h, v0.8h, 0
+cmtst d20, d21, d22
+cmtst v0.2s, v0.2s, v0.2s
+cnt v0.16b, v0.16b
+cnt v0.8b, v0.8b
+dup v0.16b,w28
+dup v0.2d,x28
+dup v0.2s,w28
+dup v0.4h,w28
+dup v0.4s,w28
+dup v0.8b,w28
+dup v0.8h,w28
+eor v0.16b, v0.16b, v0.16b
+ext v0.16b, v0.16b, v0.16b, #3
+ext v0.8b, v0.8b, v0.8b, #3
+fabd d29, d24, d20
+fabd s29, s24, s20
+fabd v0.4s, v0.4s, v0.4s
+fabs v0.2d, v0.2d
+fabs v0.2s, v0.2s
+fabs v0.4h, v0.4h
+fabs v0.4s, v0.4s
+fabs v0.8h, v0.8h
+facge d20, d21, d22
+facge s10, s11, s12
+facge v0.4s, v0.4s, v0.4s
+facgt d20, d21, d22
+facgt s10, s11, s12
+facgt v0.2d, v0.2d, v0.2d
+fadd v0.4s, v0.4s, v0.4s
+faddp v0.2s, v0.2s, v0.2s
+faddp v0.4s, v0.4s, v0.4s
+fcmeq d20, d21, #0.0
+fcmeq d20, d21, d22
+fcmeq s10, s11, #0.0
+fcmeq s10, s11, s12
+fcmeq v0.2s, v0.2s, #0.0
+fcmeq v0.2s, v0.2s, v0.2s
+fcmge d20, d21, #0.0
+fcmge d20, d21, d22
+fcmge s10, s11, #0.0
+fcmge s10, s11, s12
+fcmge v0.2d, v0.2d, #0.0
+fcmge v0.4s, v0.4s, v0.4s
+fcmgt d20, d21, #0.0
+fcmgt d20, d21, d22
+fcmgt s10, s11, #0.0
+fcmgt s10, s11, s12
+fcmgt v0.4s, v0.4s, #0.0
+fcmgt v0.4s, v0.4s, v0.4s
+fcmle d20, d21, #0.0
+fcmle s10, s11, #0.0
+fcmle v0.2d, v0.2d, #0.0
+fcmlt d20, d21, #0.0
+fcmlt s10, s11, #0.0
+fcmlt v0.4s, v0.4s, #0.0
+fcvtas d21, d14
+fcvtas s12, s13
+fcvtas v0.2d, v0.2d
+fcvtas v0.2s, v0.2s
+fcvtas v0.4h, v0.4h
+fcvtas v0.4s, v0.4s
+fcvtas v0.8h, v0.8h
+fcvtau d21, d14
+fcvtau s12, s13
+fcvtau v0.2d, v0.2d
+fcvtau v0.2s, v0.2s
+fcvtau v0.4h, v0.4h
+fcvtau v0.4s, v0.4s
+fcvtau v0.8h, v0.8h
+fcvtl v0.2d, v0.2s
+fcvtl v0.4s, v0.4h
+fcvtl2 v0.2d, v0.4s
+fcvtl2 v0.4s, v0.8h
+fcvtms d21, d14
+fcvtms s22, s13
+fcvtms v0.2d, v0.2d
+fcvtms v0.2s, v0.2s
+fcvtms v0.4h, v0.4h
+fcvtms v0.4s, v0.4s
+fcvtms v0.8h, v0.8h
+fcvtmu d21, d14
+fcvtmu s12, s13
+fcvtmu v0.2d, v0.2d
+fcvtmu v0.2s, v0.2s
+fcvtmu v0.4h, v0.4h
+fcvtmu v0.4s, v0.4s
+fcvtmu v0.8h, v0.8h
+fcvtn v0.2s, v0.2d
+fcvtn v0.4h, v0.4s
+fcvtn2 v0.4s, v0.2d
+fcvtn2 v0.8h, v0.4s
+fcvtns d21, d14
+fcvtns s22, s13
+fcvtns v0.2d, v0.2d
+fcvtns v0.2s, v0.2s
+fcvtns v0.4h, v0.4h
+fcvtns v0.4s, v0.4s
+fcvtns v0.8h, v0.8h
+fcvtnu d21, d14
+fcvtnu s12, s13
+fcvtnu v0.2d, v0.2d
+fcvtnu v0.2s, v0.2s
+fcvtnu v0.4h, v0.4h
+fcvtnu v0.4s, v0.4s
+fcvtnu v0.8h, v0.8h
+fcvtps d21, d14
+fcvtps s22, s13
+fcvtps v0.2d, v0.2d
+fcvtps v0.2s, v0.2s
+fcvtps v0.4h, v0.4h
+fcvtps v0.4s, v0.4s
+fcvtps v0.8h, v0.8h
+fcvtpu d21, d14
+fcvtpu s12, s13
+fcvtpu v0.2d, v0.2d
+fcvtpu v0.2s, v0.2s
+fcvtpu v0.4h, v0.4h
+fcvtpu v0.4s, v0.4s
+fcvtpu v0.8h, v0.8h
+fcvtxn s22, d13
+fcvtxn v0.2s, v0.2d
+fcvtxn2 v0.4s, v0.2d
+fcvtzs d21, d12, #1
+fcvtzs d21, d14
+fcvtzs s12, s13
+fcvtzs s21, s12, #1
+fcvtzs v0.2d, v0.2d
+fcvtzs v0.2d, v0.2d, #3
+fcvtzs v0.2s, v0.2s
+fcvtzs v0.2s, v0.2s, #3
+fcvtzs v0.4h, v0.4h
+fcvtzs v0.4s, v0.4s
+fcvtzs v0.4s, v0.4s, #3
+fcvtzs v0.8h, v0.8h
+fcvtzu d21, d12, #1
+fcvtzu d21, d14
+fcvtzu s12, s13
+fcvtzu s21, s12, #1
+fcvtzu v0.2d, v0.2d
+fcvtzu v0.2d, v0.2d, #3
+fcvtzu v0.2s, v0.2s
+fcvtzu v0.2s, v0.2s, #3
+fcvtzu v0.4h, v0.4h
+fcvtzu v0.4s, v0.4s
+fcvtzu v0.4s, v0.4s, #3
+fcvtzu v0.8h, v0.8h
+fdiv v0.2s, v0.2s, v0.2s
+fmax v0.2d, v0.2d, v0.2d
+fmax v0.2s, v0.2s, v0.2s
+fmax v0.4s, v0.4s, v0.4s
+fmaxnm v0.2d, v0.2d, v0.2d
+fmaxnm v0.2s, v0.2s, v0.2s
+fmaxnm v0.4s, v0.4s, v0.4s
+fmaxnmp v0.2d, v0.2d, v0.2d
+fmaxnmp v0.2s, v0.2s, v0.2s
+fmaxnmp v0.4s, v0.4s, v0.4s
+fmaxp v0.2d, v0.2d, v0.2d
+fmaxp v0.2s, v0.2s, v0.2s
+fmaxp v0.4s, v0.4s, v0.4s
+fmin v0.2d, v0.2d, v0.2d
+fmin v0.2s, v0.2s, v0.2s
+fmin v0.4s, v0.4s, v0.4s
+fminnm v0.2d, v0.2d, v0.2d
+fminnm v0.2s, v0.2s, v0.2s
+fminnm v0.4s, v0.4s, v0.4s
+fminnmp v0.2d, v0.2d, v0.2d
+fminnmp v0.2s, v0.2s, v0.2s
+fminnmp v0.4s, v0.4s, v0.4s
+fminp v0.2d, v0.2d, v0.2d
+fminp v0.2s, v0.2s, v0.2s
+fminp v0.4s, v0.4s, v0.4s
+fmla d0, d1, v0.d[1]
+fmla s0, s1, v0.s[3]
+fmla v0.2s, v0.2s, v0.2s
+fmls d0, d4, v0.d[1]
+fmls s3, s5, v0.s[3]
+fmls v0.2s, v0.2s, v0.2s
+fmov v0.2d, #-1.25
+fmov v0.2s, #13.0
+fmov v0.4s, #1.0
+fmul d0, d1, v0.d[1]
+fmul s0, s1, v0.s[3]
+fmul v0.2s, v0.2s, v0.2s
+fmulx d0, d4, v0.d[1]
+fmulx d23, d11, d1
+fmulx s20, s22, s15
+fmulx s3, s5, v0.s[3]
+fmulx v0.2d, v0.2d, v0.2d
+fmulx v0.2s, v0.2s, v0.2s
+fmulx v0.4s, v0.4s, v0.4s
+fneg v0.2d, v0.2d
+fneg v0.2s, v0.2s
+fneg v0.4h, v0.4h
+fneg v0.4s, v0.4s
+fneg v0.8h, v0.8h
+frecpe d13, d13
+frecpe s19, s14
+frecpe v0.2d, v0.2d
+frecpe v0.2s, v0.2s
+frecpe v0.4h, v0.4h
+frecpe v0.4s, v0.4s
+frecpe v0.8h, v0.8h
+frecps  v0.4s, v0.4s, v0.4s
+frecps d22, d30, d21
+frecps s21, s16, s13
+frecpx d16, d19
+frecpx s18, s10
+frinta v0.2d, v0.2d
+frinta v0.2s, v0.2s
+frinta v0.4h, v0.4h
+frinta v0.4s, v0.4s
+frinta v0.8h, v0.8h
+frinti v0.2d, v0.2d
+frinti v0.2s, v0.2s
+frinti v0.4h, v0.4h
+frinti v0.4s, v0.4s
+frinti v0.8h, v0.8h
+frintm v0.2d, v0.2d
+frintm v0.2s, v0.2s
+frintm v0.4h, v0.4h
+frintm v0.4s, v0.4s
+frintm v0.8h, v0.8h
+frintn v0.2d, v0.2d
+frintn v0.2s, v0.2s
+frintn v0.4h, v0.4h
+frintn v0.4s, v0.4s
+frintn v0.8h, v0.8h
+frintp v0.2d, v0.2d
+frintp v0.2s, v0.2s
+frintp v0.4h, v0.4h
+frintp v0.4s, v0.4s
+frintp v0.8h, v0.8h
+frintx v0.2d, v0.2d
+frintx v0.2s, v0.2s
+frintx v0.4h, v0.4h
+frintx v0.4s, v0.4s
+frintx v0.8h, v0.8h
+frintz v0.2d, v0.2d
+frintz v0.2s, v0.2s
+frintz v0.4h, v0.4h
+frintz v0.4s, v0.4s
+frintz v0.8h, v0.8h
+frsqrte d21, d12
+frsqrte s22, s13
+frsqrte v0.2d, v0.2d
+frsqrte v0.2s, v0.2s
+frsqrte v0.4h, v0.4h
+frsqrte v0.4s, v0.4s
+frsqrte v0.8h, v0.8h
+frsqrts d8, d22, d18
+frsqrts s21, s5, s12
+frsqrts v0.2d, v0.2d, v0.2d
+fsqrt v0.2d, v0.2d
+fsqrt v0.2s, v0.2s
+fsqrt v0.4h, v0.4h
+fsqrt v0.4s, v0.4s
+fsqrt v0.8h, v0.8h
+fsub v0.2s, v0.2s, v0.2s
+ld1 { v0.16b }, [x0]
+ld1 { v0.2d, v1.2d, v2.2d }, [x0], #48
+ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+ld1 { v0.4s, v1.4s }, [sp], #32
+ld1 { v0.4s, v1.4s, v2.4s }, [sp]
+ld1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3
+ld1 { v0.8h }, [x15], x2
+ld1 { v0.8h, v1.8h }, [x15]
+ld1 { v0.b }[9], [x0]
+ld1 { v0.b }[9], [x0], #1
+ld1r { v0.16b }, [x0]
+ld1r { v0.16b }, [x0], #1
+ld1r { v0.8h }, [x15]
+ld1r { v0.8h }, [x15], #2
+ld2 { v0.16b, v1.16b }, [x0], x1
+ld2 { v0.8b, v1.8b }, [x0]
+ld2 { v0.h, v1.h }[7], [x15]
+ld2 { v0.h, v1.h }[7], [x15], #4
+ld2r { v0.2d, v1.2d }, [x0]
+ld2r { v0.2d, v1.2d }, [x0], #16
+ld2r { v0.4s, v1.4s }, [sp]
+ld2r { v0.4s, v1.4s }, [sp], #8
+ld3 { v0.4h, v1.4h, v2.4h }, [x15]
+ld3 { v0.8h, v1.8h, v2.8h }, [x15], x2
+ld3 { v0.s, v1.s, v2.s }[3], [sp]
+ld3 { v0.s, v1.s, v2.s }[3], [sp], x3
+ld3r { v0.4h, v1.4h, v2.4h }, [x15]
+ld3r { v0.4h, v1.4h, v2.4h }, [x15], #6
+ld3r { v0.8b, v1.8b, v2.8b }, [x0]
+ld3r { v0.8b, v1.8b, v2.8b }, [x0], #3
+ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp]
+ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64
+ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0]
+ld4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], #32
+ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0], x0
+ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp]
+ld4r { v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7
+ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp]
+ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30
+mla v0.8b, v0.8b, v0.8b
+mls v0.4h, v0.4h, v0.4h
+mov b0, v0.b[15]
+mov d6, v0.d[1]
+mov h2, v0.h[5]
+mov s17, v0.s[2]
+mov v0.16b, v0.16b
+mov v0.8b, v0.8b
+movi d15, #0xff00ff00ff00ff
+movi v0.16b, #31
+movi v0.2d, #0xff0000ff0000ffff
+movi v0.2s, #8, msl #8
+movi v0.4s, #255, lsl #24
+movi v0.8b, #255
+mul v0.8b, v0.8b, v0.8b
+mvni v0.2s, 0
+mvni v0.4s, #16, msl #16
+neg d29, d24
+neg v0.16b, v0.16b
+neg v0.2d, v0.2d
+neg v0.2s, v0.2s
+neg v0.4h, v0.4h
+neg v0.4s, v0.4s
+neg v0.8b, v0.8b
+neg v0.8h, v0.8h
+not v0.16b, v0.16b
+not v0.8b, v0.8b
+orn v0.16b, v0.16b, v0.16b
+orr v0.16b, v0.16b, v0.16b
+orr v0.8h, #31
+pmul v0.16b, v0.16b, v0.16b
+pmul v0.8b, v0.8b, v0.8b
+pmull v0.8h, v0.8b, v0.8b
+pmull2 v0.8h, v0.16b, v0.16b
+raddhn v0.2s, v0.2d, v0.2d
+raddhn v0.4h, v0.4s, v0.4s
+raddhn v0.8b, v0.8h, v0.8h
+raddhn2 v0.16b, v0.8h, v0.8h
+raddhn2 v0.4s, v0.2d, v0.2d
+raddhn2 v0.8h, v0.4s, v0.4s
+rbit v0.16b, v0.16b
+rbit v0.8b, v0.8b
+rev16 v21.8b, v1.8b
+rev16 v30.16b, v31.16b
+rev32 v0.4h, v9.4h
+rev32 v21.8b, v1.8b
+rev32 v30.16b, v31.16b
+rev32 v4.8h, v7.8h
+rev64 v0.16b, v31.16b
+rev64 v1.8b, v9.8b
+rev64 v13.4h, v21.4h
+rev64 v2.8h, v4.8h
+rev64 v4.2s, v0.2s
+rev64 v6.4s, v8.4s
+rshrn v0.2s, v0.2d, #3
+rshrn v0.4h, v0.4s, #3
+rshrn v0.8b, v0.8h, #3
+rshrn2 v0.16b, v0.8h, #3
+rshrn2 v0.4s, v0.2d, #3
+rshrn2 v0.8h, v0.4s, #3
+rsubhn v0.2s, v0.2d, v0.2d
+rsubhn v0.4h, v0.4s, v0.4s
+rsubhn v0.8b, v0.8h, v0.8h
+rsubhn2 v0.16b, v0.8h, v0.8h
+rsubhn2 v0.4s, v0.2d, v0.2d
+rsubhn2 v0.8h, v0.4s, v0.4s
+saba v0.16b, v0.16b, v0.16b
+sabal v0.2d, v0.2s, v0.2s
+sabal v0.4s, v0.4h, v0.4h
+sabal v0.8h, v0.8b, v0.8b
+sabal2 v0.2d, v0.4s, v0.4s
+sabal2 v0.4s, v0.8h, v0.8h
+sabal2 v0.8h, v0.16b, v0.16b
+sabd v0.4h, v0.4h, v0.4h
+sabdl v0.2d, v0.2s, v0.2s
+sabdl v0.4s, v0.4h, v0.4h
+sabdl v0.8h, v0.8b, v0.8b
+sabdl2 v0.2d, v0.4s, v0.4s
+sabdl2 v0.4s, v0.8h, v0.8h
+sabdl2 v0.8h, v0.16b, v0.16b
+sadalp v0.1d, v0.2s
+sadalp v0.2d, v0.4s
+sadalp v0.2s, v0.4h
+sadalp v0.4h, v0.8b
+sadalp v0.4s, v0.8h
+sadalp v0.8h, v0.16b
+saddl v0.2d, v0.2s, v0.2s
+saddl v0.4s, v0.4h, v0.4h
+saddl v0.8h, v0.8b, v0.8b
+saddl2 v0.2d, v0.4s, v0.4s
+saddl2 v0.4s, v0.8h, v0.8h
+saddl2 v0.8h, v0.16b, v0.16b
+saddlp v0.1d, v0.2s
+saddlp v0.2d, v0.4s
+saddlp v0.2s, v0.4h
+saddlp v0.4h, v0.8b
+saddlp v0.4s, v0.8h
+saddlp v0.8h, v0.16b
+saddw v0.2d, v0.2d, v0.2s
+saddw v0.4s, v0.4s, v0.4h
+saddw v0.8h, v0.8h, v0.8b
+saddw2 v0.2d, v0.2d, v0.4s
+saddw2 v0.4s, v0.4s, v0.8h
+saddw2 v0.8h, v0.8h, v0.16b
+scvtf d21, d12
+scvtf d21, d12, #64
+scvtf s22, s13
+scvtf s22, s13, #32
+scvtf v0.2d, v0.2d
+scvtf v0.2d, v0.2d, #3
+scvtf v0.2s, v0.2s
+scvtf v0.2s, v0.2s, #3
+scvtf v0.4h, v0.4h
+scvtf v0.4s, v0.4s
+scvtf v0.4s, v0.4s, #3
+scvtf v0.8h, v0.8h
+shadd v0.8b, v0.8b, v0.8b
+shl d7, d10, #12
+shl v0.16b, v0.16b, #3
+shl v0.2d, v0.2d, #3
+shl v0.4h, v0.4h, #3
+shl v0.4s, v0.4s, #3
+shll	v0.2d, v0.2s, #32
+shll	v0.4s, v0.4h, #16
+shll	v0.8h, v0.8b, #8
+shll v0.2d, v0.2s, #32
+shll v0.4s, v0.4h, #16
+shll v0.8h, v0.8b, #8
+shll2	v0.2d, v0.4s, #32
+shll2	v0.4s, v0.8h, #16
+shll2	v0.8h, v0.16b, #8
+shll2 v0.2d, v0.4s, #32
+shll2 v0.4s, v0.8h, #16
+shll2 v0.8h, v0.16b, #8
+shrn v0.2s, v0.2d, #3
+shrn v0.4h, v0.4s, #3
+shrn v0.8b, v0.8h, #3
+shrn2 v0.16b, v0.8h, #3
+shrn2 v0.4s, v0.2d, #3
+shrn2 v0.8h, v0.4s, #3
+shsub v0.2s, v0.2s, v0.2s
+shsub v0.4h, v0.4h, v0.4h
+sli d10, d14, #12
+sli v0.16b, v0.16b, #3
+sli v0.2d, v0.2d, #3
+sli v0.2s, v0.2s, #3
+sli v0.4h, v0.4h, #3
+sli v0.4s, v0.4s, #3
+sli v0.8b, v0.8b, #3
+sli v0.8h, v0.8h, #3
+smax v0.2s, v0.2s, v0.2s
+smax v0.4h, v0.4h, v0.4h
+smax v0.8b, v0.8b, v0.8b
+smaxp v0.2s, v0.2s, v0.2s
+smaxp v0.4h, v0.4h, v0.4h
+smaxp v0.8b, v0.8b, v0.8b
+smin v0.16b, v0.16b, v0.16b
+smin v0.4s, v0.4s, v0.4s
+smin v0.8h, v0.8h, v0.8h
+sminp v0.16b, v0.16b, v0.16b
+sminp v0.4s, v0.4s, v0.4s
+sminp v0.8h, v0.8h, v0.8h
+smlal v0.2d, v0.2s, v0.2s
+smlal v0.4s, v0.4h, v0.4h
+smlal v0.8h, v0.8b, v0.8b
+smlal2 v0.2d, v0.4s, v0.4s
+smlal2 v0.4s, v0.8h, v0.8h
+smlal2 v0.8h, v0.16b, v0.16b
+smlsl v0.2d, v0.2s, v0.2s
+smlsl v0.4s, v0.4h, v0.4h
+smlsl v0.8h, v0.8b, v0.8b
+smlsl2 v0.2d, v0.4s, v0.4s
+smlsl2 v0.4s, v0.8h, v0.8h
+smlsl2 v0.8h, v0.16b, v0.16b
+smull v0.2d, v0.2s, v0.2s
+smull v0.4s, v0.4h, v0.4h
+smull v0.8h, v0.8b, v0.8b
+smull2 v0.2d, v0.4s, v0.4s
+smull2 v0.4s, v0.8h, v0.8h
+smull2 v0.8h, v0.16b, v0.16b
+sqabs b19, b14
+sqabs d18, d12
+sqabs h21, h15
+sqabs s20, s12
+sqabs v0.16b, v0.16b
+sqabs v0.2d, v0.2d
+sqabs v0.2s, v0.2s
+sqabs v0.4h, v0.4h
+sqabs v0.4s, v0.4s
+sqabs v0.8b, v0.8b
+sqabs v0.8h, v0.8h
+sqadd b20, b11, b15
+sqadd v0.16b, v0.16b, v0.16b
+sqadd v0.2s, v0.2s, v0.2s
+sqdmlal d19, s24, s12
+sqdmlal d8, s9, v0.s[1]
+sqdmlal s0, h0, v0.h[3]
+sqdmlal s17, h27, h12
+sqdmlal v0.2d, v0.2s, v0.2s
+sqdmlal v0.4s, v0.4h, v0.4h
+sqdmlal2 v0.2d, v0.4s, v0.4s
+sqdmlal2 v0.4s, v0.8h, v0.8h
+sqdmlsl d12, s23, s13
+sqdmlsl d8, s9, v0.s[1]
+sqdmlsl s0, h0, v0.h[3]
+sqdmlsl s14, h12, h25
+sqdmlsl v0.2d, v0.2s, v0.2s
+sqdmlsl v0.4s, v0.4h, v0.4h
+sqdmlsl2 v0.2d, v0.4s, v0.4s
+sqdmlsl2 v0.4s, v0.8h, v0.8h
+sqdmulh h10, h11, h12
+sqdmulh h7, h15, v0.h[3]
+sqdmulh s15, s14, v0.s[1]
+sqdmulh s20, s21, s2
+sqdmulh v0.2s, v0.2s, v0.2s
+sqdmulh v0.4s, v0.4s, v0.4s
+sqdmull d1, s1, v0.s[1]
+sqdmull d15, s22, s12
+sqdmull s1, h1, v0.h[3]
+sqdmull s12, h22, h12
+sqdmull v0.2d, v0.2s, v0.2s
+sqdmull v0.4s, v0.4h, v0.4h
+sqdmull2 v0.2d, v0.4s, v0.4s
+sqdmull2 v0.4s, v0.8h, v0.8h
+sqneg b19, b14
+sqneg d18, d12
+sqneg h21, h15
+sqneg s20, s12
+sqneg v0.16b, v0.16b
+sqneg v0.2d, v0.2d
+sqneg v0.2s, v0.2s
+sqneg v0.4h, v0.4h
+sqneg v0.4s, v0.4s
+sqneg v0.8b, v0.8b
+sqneg v0.8h, v0.8h
+sqrdmulh h10, h11, h12
+sqrdmulh h7, h15, v0.h[3]
+sqrdmulh s15, s14, v0.s[1]
+sqrdmulh s20, s21, s2
+sqrdmulh v0.4h, v0.4h, v0.4h
+sqrdmulh v0.8h, v0.8h, v0.8h
+sqrshl d31, d31, d31
+sqrshl h3, h4, h15
+sqrshl v0.2s, v0.2s, v0.2s
+sqrshl v0.4h, v0.4h, v0.4h
+sqrshl v0.8b, v0.8b, v0.8b
+sqrshrn b10, h13, #2
+sqrshrn h15, s10, #6
+sqrshrn s15, d12, #9
+sqrshrn v0.2s, v0.2d, #3
+sqrshrn v0.4h, v0.4s, #3
+sqrshrn v0.8b, v0.8h, #3
+sqrshrn2 v0.16b, v0.8h, #3
+sqrshrn2 v0.4s, v0.2d, #3
+sqrshrn2 v0.8h, v0.4s, #3
+sqrshrun b17, h10, #6
+sqrshrun h10, s13, #15
+sqrshrun s22, d16, #31
+sqrshrun v0.2s, v0.2d, #3
+sqrshrun v0.4h, v0.4s, #3
+sqrshrun v0.8b, v0.8h, #3
+sqrshrun2 v0.16b, v0.8h, #3
+sqrshrun2 v0.4s, v0.2d, #3
+sqrshrun2 v0.8h, v0.4s, #3
+sqshl b11, b19, #7
+sqshl d15, d16, #51
+sqshl d31, d31, d31
+sqshl h13, h18, #11
+sqshl h3, h4, h15
+sqshl s14, s17, #22
+sqshl v0.16b, v0.16b, #3
+sqshl v0.2d, v0.2d, #3
+sqshl v0.2s, v0.2s, #3
+sqshl v0.2s, v0.2s, v0.2s
+sqshl v0.4h, v0.4h, #3
+sqshl v0.4h, v0.4h, v0.4h
+sqshl v0.4s, v0.4s, #3
+sqshl v0.8b, v0.8b, #3
+sqshl v0.8b, v0.8b, v0.8b
+sqshl v0.8h, v0.8h, #3
+sqshlu b15, b18, #6
+sqshlu d11, d13, #32
+sqshlu h19, h17, #6
+sqshlu s16, s14, #25
+sqshlu v0.16b, v0.16b, #3
+sqshlu v0.2d, v0.2d, #3
+sqshlu v0.2s, v0.2s, #3
+sqshlu v0.4h, v0.4h, #3
+sqshlu v0.4s, v0.4s, #3
+sqshlu v0.8b, v0.8b, #3
+sqshlu v0.8h, v0.8h, #3
+sqshrn b10, h15, #5
+sqshrn h17, s10, #4
+sqshrn s18, d10, #31
+sqshrn v0.2s, v0.2d, #3
+sqshrn v0.4h, v0.4s, #3
+sqshrn v0.8b, v0.8h, #3
+sqshrn2 v0.16b, v0.8h, #3
+sqshrn2 v0.4s, v0.2d, #3
+sqshrn2 v0.8h, v0.4s, #3
+sqshrun b15, h10, #7
+sqshrun h20, s14, #3
+sqshrun s10, d15, #15
+sqshrun v0.2s, v0.2d, #3
+sqshrun v0.4h, v0.4s, #3
+sqshrun v0.8b, v0.8h, #3
+sqshrun2 v0.16b, v0.8h, #3
+sqshrun2 v0.4s, v0.2d, #3
+sqshrun2 v0.8h, v0.4s, #3
+sqsub s20, s10, s7
+sqsub v0.2d, v0.2d, v0.2d
+sqsub v0.4s, v0.4s, v0.4s
+sqsub v0.8b, v0.8b, v0.8b
+sqxtn b18, h18
+sqxtn h20, s17
+sqxtn s19, d14
+sqxtn v0.2s, v0.2d
+sqxtn v0.4h, v0.4s
+sqxtn v0.8b, v0.8h
+sqxtn2 v0.16b, v0.8h
+sqxtn2 v0.4s, v0.2d
+sqxtn2 v0.8h, v0.4s
+sqxtun b19, h14
+sqxtun h21, s15
+sqxtun s20, d12
+sqxtun v0.2s, v0.2d
+sqxtun v0.4h, v0.4s
+sqxtun v0.8b, v0.8h
+sqxtun2 v0.16b, v0.8h
+sqxtun2 v0.4s, v0.2d
+sqxtun2 v0.8h, v0.4s
+srhadd v0.2s, v0.2s, v0.2s
+srhadd v0.4h, v0.4h, v0.4h
+srhadd v0.8b, v0.8b, v0.8b
+sri d10, d12, #14
+sri v0.16b, v0.16b, #3
+sri v0.2d, v0.2d, #3
+sri v0.2s, v0.2s, #3
+sri v0.4h, v0.4h, #3
+sri v0.4s, v0.4s, #3
+sri v0.8b, v0.8b, #3
+sri v0.8h, v0.8h, #3
+srshl d16, d16, d16
+srshl v0.2s, v0.2s, v0.2s
+srshl v0.4h, v0.4h, v0.4h
+srshl v0.8b, v0.8b, v0.8b
+srshr d19, d18, #7
+srshr v0.16b, v0.16b, #3
+srshr v0.2d, v0.2d, #3
+srshr v0.2s, v0.2s, #3
+srshr v0.4h, v0.4h, #3
+srshr v0.4s, v0.4s, #3
+srshr v0.8b, v0.8b, #3
+srshr v0.8h, v0.8h, #3
+srsra d15, d11, #19
+srsra v0.16b, v0.16b, #3
+srsra v0.2d, v0.2d, #3
+srsra v0.2s, v0.2s, #3
+srsra v0.4h, v0.4h, #3
+srsra v0.4s, v0.4s, #3
+srsra v0.8b, v0.8b, #3
+srsra v0.8h, v0.8h, #3
+sshl d31, d31, d31
+sshl v0.2d, v0.2d, v0.2d
+sshl v0.2s, v0.2s, v0.2s
+sshl v0.4h, v0.4h, v0.4h
+sshl v0.8b, v0.8b, v0.8b
+sshll v0.2d, v0.2s, #3
+sshll2 v0.4s, v0.8h, #3
+sshr d15, d16, #12
+sshr v0.16b, v0.16b, #3
+sshr v0.2d, v0.2d, #3
+sshr v0.2s, v0.2s, #3
+sshr v0.4h, v0.4h, #3
+sshr v0.4s, v0.4s, #3
+sshr v0.8b, v0.8b, #3
+sshr v0.8h, v0.8h, #3
+ssra d18, d12, #21
+ssra v0.16b, v0.16b, #3
+ssra v0.2d, v0.2d, #3
+ssra v0.2s, v0.2s, #3
+ssra v0.4h, v0.4h, #3
+ssra v0.4s, v0.4s, #3
+ssra v0.8b, v0.8b, #3
+ssra v0.8h, v0.8h, #3
+ssubl v0.2d, v0.2s, v0.2s
+ssubl v0.4s, v0.4h, v0.4h
+ssubl v0.8h, v0.8b, v0.8b
+ssubl2 v0.2d, v0.4s, v0.4s
+ssubl2 v0.4s, v0.8h, v0.8h
+ssubl2 v0.8h, v0.16b, v0.16b
+ssubw v0.2d, v0.2d, v0.2s
+ssubw v0.4s, v0.4s, v0.4h
+ssubw v0.8h, v0.8h, v0.8b
+ssubw2 v0.2d, v0.2d, v0.4s
+ssubw2 v0.4s, v0.4s, v0.8h
+ssubw2 v0.8h, v0.8h, v0.16b
+st1 { v0.16b }, [x0]
+st1 { v0.2d, v1.2d, v2.2d }, [x0], #48
+st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+st1 { v0.4s, v1.4s }, [sp], #32
+st1 { v0.4s, v1.4s, v2.4s }, [sp]
+st1 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3
+st1 { v0.8h }, [x15], x2
+st1 { v0.8h, v1.8h }, [x15]
+st1 { v0.d }[1], [x0]
+st1 { v0.d }[1], [x0], #8
+st2 { v0.16b, v1.16b }, [x0], x1
+st2 { v0.8b, v1.8b }, [x0]
+st2 { v0.s, v1.s }[3], [sp]
+st2 { v0.s, v1.s }[3], [sp], #8
+st3 { v0.4h, v1.4h, v2.4h }, [x15]
+st3 { v0.8h, v1.8h, v2.8h }, [x15], x2
+st3 { v0.h, v1.h, v2.h }[7], [x15]
+st3 { v0.h, v1.h, v2.h }[7], [x15], #6
+st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [sp]
+st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64
+st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0]
+st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5
+sub d15, d5, d16
+sub v0.2d, v0.2d, v0.2d
+suqadd b19, b14
+suqadd d18, d22
+suqadd h20, h15
+suqadd s21, s12
+suqadd v0.16b, v0.16b
+suqadd v0.2d, v0.2d
+suqadd v0.2s, v0.2s
+suqadd v0.4h, v0.4h
+suqadd v0.4s, v0.4s
+suqadd v0.8b, v0.8b
+suqadd v0.8h, v0.8h
+tbl v0.16b, { v0.16b }, v0.16b
+tbl v0.16b, { v0.16b, v1.16b }, v0.16b
+tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b
+tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b
+tbl v0.8b, { v0.16b }, v0.8b
+tbl v0.8b, { v0.16b, v1.16b }, v0.8b
+tbl v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b
+tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b
+tbx v0.16b, { v0.16b }, v0.16b
+tbx v0.16b, { v0.16b, v1.16b }, v0.16b
+tbx v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b
+tbx v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b
+tbx v0.8b, { v0.16b }, v0.8b
+tbx v0.8b, { v0.16b, v1.16b }, v0.8b
+tbx v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b
+tbx v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b
+trn1	v0.16b, v0.16b, v0.16b
+trn1	v0.2d, v0.2d, v0.2d
+trn1	v0.2s, v0.2s, v0.2s
+trn1	v0.4h, v0.4h, v0.4h
+trn1	v0.4s, v0.4s, v0.4s
+trn1	v0.8b, v0.8b, v0.8b
+trn1	v0.8h, v0.8h, v0.8h
+trn2	v0.16b, v0.16b, v0.16b
+trn2	v0.2d, v0.2d, v0.2d
+trn2	v0.2s, v0.2s, v0.2s
+trn2	v0.4h, v0.4h, v0.4h
+trn2	v0.4s, v0.4s, v0.4s
+trn2	v0.8b, v0.8b, v0.8b
+trn2	v0.8h, v0.8h, v0.8h
+uaba v0.8b, v0.8b, v0.8b
+uabal v0.2d, v0.2s, v0.2s
+uabal v0.4s, v0.4h, v0.4h
+uabal v0.8h, v0.8b, v0.8b
+uabal2 v0.2d, v0.4s, v0.4s
+uabal2 v0.4s, v0.8h, v0.8h
+uabal2 v0.8h, v0.16b, v0.16b
+uabd v0.4h, v0.4h, v0.4h
+uabdl v0.2d, v0.2s, v0.2s
+uabdl v0.4s, v0.4h, v0.4h
+uabdl v0.8h, v0.8b, v0.8b
+uabdl2 v0.2d, v0.4s, v0.4s
+uabdl2 v0.4s, v0.8h, v0.8h
+uabdl2 v0.8h, v0.16b, v0.16b
+uadalp v0.1d, v0.2s
+uadalp v0.2d, v0.4s
+uadalp v0.2s, v0.4h
+uadalp v0.4h, v0.8b
+uadalp v0.4s, v0.8h
+uadalp v0.8h, v0.16b
+uaddl v0.2d, v0.2s, v0.2s
+uaddl v0.4s, v0.4h, v0.4h
+uaddl v0.8h, v0.8b, v0.8b
+uaddl2 v0.2d, v0.4s, v0.4s
+uaddl2 v0.4s, v0.8h, v0.8h
+uaddl2 v0.8h, v0.16b, v0.16b
+uaddlp v0.1d, v0.2s
+uaddlp v0.2d, v0.4s
+uaddlp v0.2s, v0.4h
+uaddlp v0.4h, v0.8b
+uaddlp v0.4s, v0.8h
+uaddlp v0.8h, v0.16b
+uaddw v0.2d, v0.2d, v0.2s
+uaddw v0.4s, v0.4s, v0.4h
+uaddw v0.8h, v0.8h, v0.8b
+uaddw2 v0.2d, v0.2d, v0.4s
+uaddw2 v0.4s, v0.4s, v0.8h
+uaddw2 v0.8h, v0.8h, v0.16b
+ucvtf d21, d14
+ucvtf d21, d14, #64
+ucvtf s22, s13
+ucvtf s22, s13, #32
+ucvtf v0.2d, v0.2d
+ucvtf v0.2d, v0.2d, #3
+ucvtf v0.2s, v0.2s
+ucvtf v0.2s, v0.2s, #3
+ucvtf v0.4h, v0.4h
+ucvtf v0.4s, v0.4s
+ucvtf v0.4s, v0.4s, #3
+ucvtf v0.8h, v0.8h
+uhadd v0.16b, v0.16b, v0.16b
+uhadd v0.8h, v0.8h, v0.8h
+uhsub v0.4s, v0.4s, v0.4s
+umax v0.16b, v0.16b, v0.16b
+umax v0.4s, v0.4s, v0.4s
+umax v0.8h, v0.8h, v0.8h
+umaxp v0.16b, v0.16b, v0.16b
+umaxp v0.4s, v0.4s, v0.4s
+umaxp v0.8h, v0.8h, v0.8h
+umin v0.2s, v0.2s, v0.2s
+umin v0.4h, v0.4h, v0.4h
+umin v0.8b, v0.8b, v0.8b
+uminp v0.2s, v0.2s, v0.2s
+uminp v0.4h, v0.4h, v0.4h
+uminp v0.8b, v0.8b, v0.8b
+umlal v0.2d, v0.2s, v0.2s
+umlal v0.4s, v0.4h, v0.4h
+umlal v0.8h, v0.8b, v0.8b
+umlal2 v0.2d, v0.4s, v0.4s
+umlal2 v0.4s, v0.8h, v0.8h
+umlal2 v0.8h, v0.16b, v0.16b
+umlsl v0.2d, v0.2s, v0.2s
+umlsl v0.4s, v0.4h, v0.4h
+umlsl v0.8h, v0.8b, v0.8b
+umlsl2 v0.2d, v0.4s, v0.4s
+umlsl2 v0.4s, v0.8h, v0.8h
+umlsl2 v0.8h, v0.16b, v0.16b
+umull v0.2d, v0.2s, v0.2s
+umull v0.4s, v0.4h, v0.4h
+umull v0.8h, v0.8b, v0.8b
+umull2 v0.2d, v0.4s, v0.4s
+umull2 v0.4s, v0.8h, v0.8h
+umull2 v0.8h, v0.16b, v0.16b
+uqadd h0, h1, h5
+uqadd v0.8h, v0.8h, v0.8h
+uqrshl b11, b20, b30
+uqrshl s23, s20, s16
+uqrshl v0.16b, v0.16b, v0.16b
+uqrshl v0.4s, v0.4s, v0.4s
+uqrshl v0.4s, v0.4s, v0.4s
+uqrshl v0.8h, v0.8h, v0.8h
+uqrshrn b10, h12, #5
+uqrshrn h12, s10, #14
+uqrshrn s10, d10, #25
+uqrshrn v0.2s, v0.2d, #3
+uqrshrn v0.4h, v0.4s, #3
+uqrshrn v0.8b, v0.8h, #3
+uqrshrn2 v0.16b, v0.8h, #3
+uqrshrn2 v0.4s, v0.2d, #3
+uqrshrn2 v0.8h, v0.4s, #3
+uqshl b11, b20, b30
+uqshl b18, b15, #6
+uqshl d15, d12, #19
+uqshl h11, h18, #7
+uqshl s14, s19, #18
+uqshl s23, s20, s16
+uqshl v0.16b, v0.16b, #3
+uqshl v0.16b, v0.16b, v0.16b
+uqshl v0.2d, v0.2d, #3
+uqshl v0.2d, v0.2d, v0.2d
+uqshl v0.2s, v0.2s, #3
+uqshl v0.4h, v0.4h, #3
+uqshl v0.4s, v0.4s, #3
+uqshl v0.4s, v0.4s, v0.4s
+uqshl v0.8b, v0.8b, #3
+uqshl v0.8h, v0.8h, #3
+uqshl v0.8h, v0.8h, v0.8h
+uqshrn b12, h10, #7
+uqshrn h10, s14, #5
+uqshrn s10, d12, #13
+uqshrn v0.2s, v0.2d, #3
+uqshrn v0.4h, v0.4s, #3
+uqshrn v0.8b, v0.8h, #3
+uqshrn2 v0.16b, v0.8h, #3
+uqshrn2 v0.4s, v0.2d, #3
+uqshrn2 v0.8h, v0.4s, #3
+uqsub d16, d16, d16
+uqsub v0.4h, v0.4h, v0.4h
+uqxtn b18, h18
+uqxtn h20, s17
+uqxtn s19, d14
+uqxtn v0.2s, v0.2d
+uqxtn v0.4h, v0.4s
+uqxtn v0.8b, v0.8h
+uqxtn2 v0.16b, v0.8h
+uqxtn2 v0.4s, v0.2d
+uqxtn2 v0.8h, v0.4s
+urecpe v0.2s, v0.2s
+urecpe v0.4s, v0.4s
+urhadd v0.16b, v0.16b, v0.16b
+urhadd v0.4s, v0.4s, v0.4s
+urhadd v0.8h, v0.8h, v0.8h
+urshl d8, d7, d4
+urshl v0.16b, v0.16b, v0.16b
+urshl v0.2d, v0.2d, v0.2d
+urshl v0.4s, v0.4s, v0.4s
+urshl v0.8h, v0.8h, v0.8h
+urshr d20, d23, #31
+urshr v0.16b, v0.16b, #3
+urshr v0.2d, v0.2d, #3
+urshr v0.2s, v0.2s, #3
+urshr v0.4h, v0.4h, #3
+urshr v0.4s, v0.4s, #3
+urshr v0.8b, v0.8b, #3
+urshr v0.8h, v0.8h, #3
+ursqrte v0.2s, v0.2s
+ursqrte v0.4s, v0.4s
+ursra d18, d10, #13
+ursra v0.16b, v0.16b, #3
+ursra v0.2d, v0.2d, #3
+ursra v0.2s, v0.2s, #3
+ursra v0.4h, v0.4h, #3
+ursra v0.4s, v0.4s, #3
+ursra v0.8b, v0.8b, #3
+ursra v0.8h, v0.8h, #3
+ushl d0, d0, d0
+ushl v0.16b, v0.16b, v0.16b
+ushl v0.4s, v0.4s, v0.4s
+ushl v0.8h, v0.8h, v0.8h
+ushll v0.4s, v0.4h, #3
+ushll2 v0.8h, v0.16b, #3
+ushr d10, d17, #18
+ushr v0.16b, v0.16b, #3
+ushr v0.2d, v0.2d, #3
+ushr v0.2s, v0.2s, #3
+ushr v0.4h, v0.4h, #3
+ushr v0.4s, v0.4s, #3
+ushr v0.8b, v0.8b, #3
+ushr v0.8h, v0.8h, #3
+usqadd b19, b14
+usqadd d18, d22
+usqadd h20, h15
+usqadd s21, s12
+usqadd v0.16b, v0.16b
+usqadd v0.2d, v0.2d
+usqadd v0.2s, v0.2s
+usqadd v0.4h, v0.4h
+usqadd v0.4s, v0.4s
+usqadd v0.8b, v0.8b
+usqadd v0.8h, v0.8h
+usra d20, d13, #61
+usra v0.16b, v0.16b, #3
+usra v0.2d, v0.2d, #3
+usra v0.2s, v0.2s, #3
+usra v0.4h, v0.4h, #3
+usra v0.4s, v0.4s, #3
+usra v0.8b, v0.8b, #3
+usra v0.8h, v0.8h, #3
+usubl v0.2d, v0.2s, v0.2s
+usubl v0.4s, v0.4h, v0.4h
+usubl v0.8h, v0.8b, v0.8b
+usubl2 v0.2d, v0.4s, v0.4s
+usubl2 v0.4s, v0.8h, v0.8h
+usubl2 v0.8h, v0.16b, v0.16b
+usubw v0.2d, v0.2d, v0.2s
+usubw v0.4s, v0.4s, v0.4h
+usubw v0.8h, v0.8h, v0.8b
+usubw2 v0.2d, v0.2d, v0.4s
+usubw2 v0.4s, v0.4s, v0.8h
+usubw2 v0.8h, v0.8h, v0.16b
+uzp1	v0.16b, v0.16b, v0.16b
+uzp1	v0.2d, v0.2d, v0.2d
+uzp1	v0.2s, v0.2s, v0.2s
+uzp1	v0.4h, v0.4h, v0.4h
+uzp1	v0.4s, v0.4s, v0.4s
+uzp1	v0.8b, v0.8b, v0.8b
+uzp1	v0.8h, v0.8h, v0.8h
+uzp2	v0.16b, v0.16b, v0.16b
+uzp2	v0.2d, v0.2d, v0.2d
+uzp2	v0.2s, v0.2s, v0.2s
+uzp2	v0.4h, v0.4h, v0.4h
+uzp2	v0.4s, v0.4s, v0.4s
+uzp2	v0.8b, v0.8b, v0.8b
+uzp2	v0.8h, v0.8h, v0.8h
+xtn v0.2s, v0.2d
+xtn v0.4h, v0.4s
+xtn v0.8b, v0.8h
+xtn2 v0.16b, v0.8h
+xtn2 v0.4s, v0.2d
+xtn2 v0.8h, v0.4s
+zip1	v0.16b, v0.16b, v0.16b
+zip1	v0.2d, v0.2d, v0.2d
+zip1	v0.2s, v0.2s, v0.2s
+zip1	v0.4h, v0.4h, v0.4h
+zip1	v0.4s, v0.4s, v0.4s
+zip1	v0.8b, v0.8b, v0.8b
+zip1	v0.8h, v0.8h, v0.8h
+zip2	v0.16b, v0.16b, v0.16b
+zip2	v0.2d, v0.2d, v0.2d
+zip2	v0.2s, v0.2s, v0.2s
+zip2	v0.4h, v0.4h, v0.4h
+zip2	v0.4s, v0.4s, v0.4s
+zip2	v0.8b, v0.8b, v0.8b
+zip2	v0.8h, v0.8h, v0.8h
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        abs	d29, d24
+# CHECK-NEXT:  1      3     1.00                        abs	v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        abs	v0.2d, v0.2d
+# CHECK-NEXT:  1      3     1.00                        abs	v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        abs	v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        abs	v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        abs	v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        abs	v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        add	d17, d31, d29
+# CHECK-NEXT:  1      3     1.00                        add	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        addhn	v0.2s, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        addhn	v0.4h, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        addhn	v0.8b, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        addhn2	v0.16b, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        addhn2	v0.4s, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        addhn2	v0.8h, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        addp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      3     1.00                        addp	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        and	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        bic	v0.4h, #15, lsl #8
+# CHECK-NEXT:  1      3     1.00                        bic	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        bif	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        bit	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        bsl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        cls	v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        cls	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        cls	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        cls	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        cls	v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        cls	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        clz	v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        clz	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        clz	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        clz	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        clz	v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        clz	v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        cmeq	d20, d21, #0
+# CHECK-NEXT:  1      3     1.00                        cmeq	d20, d21, d22
+# CHECK-NEXT:  1      3     1.00                        cmeq	v0.16b, v0.16b, #0
+# CHECK-NEXT:  1      3     1.00                        cmeq	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        cmge	d20, d21, #0
+# CHECK-NEXT:  1      3     1.00                        cmge	d20, d21, d22
+# CHECK-NEXT:  1      3     1.00                        cmge	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        cmge	v0.8b, v0.8b, #0
+# CHECK-NEXT:  1      3     1.00                        cmgt	d20, d21, #0
+# CHECK-NEXT:  1      3     1.00                        cmgt	d20, d21, d22
+# CHECK-NEXT:  1      3     1.00                        cmgt	v0.2s, v0.2s, #0
+# CHECK-NEXT:  1      3     1.00                        cmgt	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        cmhi	d20, d21, d22
+# CHECK-NEXT:  1      3     1.00                        cmhi	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        cmhs	d20, d21, d22
+# CHECK-NEXT:  1      3     1.00                        cmhs	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        cmle	d20, d21, #0
+# CHECK-NEXT:  1      3     1.00                        cmle	v0.2d, v0.2d, #0
+# CHECK-NEXT:  1      3     1.00                        cmlt	d20, d21, #0
+# CHECK-NEXT:  1      3     1.00                        cmlt	v0.8h, v0.8h, #0
+# CHECK-NEXT:  1      4     1.00                        cmtst	d20, d21, d22
+# CHECK-NEXT:  1      4     1.00                        cmtst	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        cnt	v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        cnt	v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        dup	v0.16b, w28
+# CHECK-NEXT:  1      3     1.00                        dup	v0.2d, x28
+# CHECK-NEXT:  1      4     1.00                        dup	v0.2s, w28
+# CHECK-NEXT:  1      4     1.00                        dup	v0.4h, w28
+# CHECK-NEXT:  1      3     1.00                        dup	v0.4s, w28
+# CHECK-NEXT:  1      4     1.00                        dup	v0.8b, w28
+# CHECK-NEXT:  1      3     1.00                        dup	v0.8h, w28
+# CHECK-NEXT:  1      3     1.00                        eor	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        ext	v0.16b, v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      4     1.00                        ext	v0.8b, v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      4     1.00                        fabd	d29, d24, d20
+# CHECK-NEXT:  1      4     1.00                        fabd	s29, s24, s20
+# CHECK-NEXT:  1      4     1.00                        fabd	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fabs	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fabs	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fabs	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fabs	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fabs	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        facge	d20, d21, d22
+# CHECK-NEXT:  1      4     1.00                        facge	s10, s11, s12
+# CHECK-NEXT:  1      4     1.00                        facge	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        facgt	d20, d21, d22
+# CHECK-NEXT:  1      4     1.00                        facgt	s10, s11, s12
+# CHECK-NEXT:  1      4     1.00                        facgt	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fadd	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        faddp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        faddp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcmeq	d20, d21, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmeq	d20, d21, d22
+# CHECK-NEXT:  1      4     1.00                        fcmeq	s10, s11, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmeq	s10, s11, s12
+# CHECK-NEXT:  1      4     1.00                        fcmeq	v0.2s, v0.2s, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmeq	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcmge	d20, d21, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmge	d20, d21, d22
+# CHECK-NEXT:  1      4     1.00                        fcmge	s10, s11, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmge	s10, s11, s12
+# CHECK-NEXT:  1      4     1.00                        fcmge	v0.2d, v0.2d, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmge	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcmgt	d20, d21, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmgt	d20, d21, d22
+# CHECK-NEXT:  1      4     1.00                        fcmgt	s10, s11, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmgt	s10, s11, s12
+# CHECK-NEXT:  1      4     1.00                        fcmgt	v0.4s, v0.4s, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmgt	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcmle	d20, d21, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmle	s10, s11, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmle	v0.2d, v0.2d, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmlt	d20, d21, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmlt	s10, s11, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcmlt	v0.4s, v0.4s, #0.0
+# CHECK-NEXT:  1      4     1.00                        fcvtas	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtas	s12, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtas	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtas	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtas	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtas	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtas	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtau	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtau	s12, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtau	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtau	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtau	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtau	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtau	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtl	v0.2d, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtl	v0.4s, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtl2	v0.2d, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtl2	v0.4s, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtms	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtms	s22, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtms	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtms	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtms	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtms	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtms	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	s12, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtmu	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtn	v0.2s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtn	v0.4h, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtn2	v0.4s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtn2	v0.8h, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtns	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtns	s22, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtns	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtns	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtns	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtns	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtns	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	s12, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtnu	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtps	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtps	s22, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtps	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtps	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtps	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtps	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtps	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	s12, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtpu	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtxn	s22, d13
+# CHECK-NEXT:  1      4     1.00                        fcvtxn	v0.2s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtxn2	v0.4s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	d21, d12, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	s12, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	s21, s12, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	d21, d12, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	d21, d14
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	s12, s13
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	s21, s12, #1
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	v0.8h, v0.8h
+# CHECK-NEXT:  1      13    10.00                       fdiv	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmax	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fmax	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmax	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fmaxnmp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fmaxnmp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmaxnmp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fmaxp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fmaxp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmaxp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fmin	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fmin	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmin	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fminnm	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fminnm	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fminnm	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fminnmp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fminnmp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fminnmp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fminp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fminp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fminp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fmla	d0, d1, v0.d[1]
+# CHECK-NEXT:  1      4     1.00                        fmla	s0, s1, v0.s[3]
+# CHECK-NEXT:  1      4     1.00                        fmla	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmls	d0, d4, v0.d[1]
+# CHECK-NEXT:  1      4     1.00                        fmls	s3, s5, v0.s[3]
+# CHECK-NEXT:  1      4     1.00                        fmls	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmov	v0.2d, #-1.25000000
+# CHECK-NEXT:  1      4     1.00                        fmov	v0.2s, #13.00000000
+# CHECK-NEXT:  1      4     1.00                        fmov	v0.4s, #1.00000000
+# CHECK-NEXT:  1      4     1.00                        fmul	d0, d1, v0.d[1]
+# CHECK-NEXT:  1      4     1.00                        fmul	s0, s1, v0.s[3]
+# CHECK-NEXT:  1      4     1.00                        fmul	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmulx	d0, d4, v0.d[1]
+# CHECK-NEXT:  1      4     1.00                        fmulx	d23, d11, d1
+# CHECK-NEXT:  1      4     1.00                        fmulx	s20, s22, s15
+# CHECK-NEXT:  1      4     1.00                        fmulx	s3, s5, v0.s[3]
+# CHECK-NEXT:  1      4     1.00                        fmulx	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fmulx	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fmulx	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fneg	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        fneg	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        fneg	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        fneg	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        fneg	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        frecpe	d13, d13
+# CHECK-NEXT:  1      4     1.00                        frecpe	s19, s14
+# CHECK-NEXT:  1      4     1.00                        frecpe	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        frecpe	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        frecpe	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        frecpe	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        frecpe	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        frecps	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        frecps	d22, d30, d21
+# CHECK-NEXT:  1      4     1.00                        frecps	s21, s16, s13
+# CHECK-NEXT:  1      4     1.00                        frecpx	d16, d19
+# CHECK-NEXT:  1      4     1.00                        frecpx	s18, s10
+# CHECK-NEXT:  1      4     1.00                        frinta	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        frinta	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        frinta	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        frinta	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        frinta	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        frinti	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        frinti	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        frinti	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        frinti	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        frinti	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        frintm	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        frintm	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        frintm	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        frintm	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        frintm	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        frintn	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        frintn	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        frintn	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        frintn	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        frintn	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        frintp	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        frintp	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        frintp	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        frintp	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        frintp	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        frintx	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        frintx	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        frintx	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        frintx	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        frintx	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        frintz	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        frintz	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        frintz	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        frintz	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        frintz	v0.8h, v0.8h
+# CHECK-NEXT:  1      22    19.00                       frsqrte	d21, d12
+# CHECK-NEXT:  1      12    9.00                        frsqrte	s22, s13
+# CHECK-NEXT:  1      22    19.00                       frsqrte	v0.2d, v0.2d
+# CHECK-NEXT:  1      12    9.00                        frsqrte	v0.2s, v0.2s
+# CHECK-NEXT:  1      8     5.00                        frsqrte	v0.4h, v0.4h
+# CHECK-NEXT:  1      12    9.00                        frsqrte	v0.4s, v0.4s
+# CHECK-NEXT:  1      8     5.00                        frsqrte	v0.8h, v0.8h
+# CHECK-NEXT:  1      22    19.00                       frsqrts	d8, d22, d18
+# CHECK-NEXT:  1      12    9.00                        frsqrts	s21, s5, s12
+# CHECK-NEXT:  1      22    19.00                       frsqrts	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      22    19.00                       fsqrt	v0.2d, v0.2d
+# CHECK-NEXT:  1      12    9.00                        fsqrt	v0.2s, v0.2s
+# CHECK-NEXT:  1      8     5.00                        fsqrt	v0.4h, v0.4h
+# CHECK-NEXT:  1      12    9.00                        fsqrt	v0.4s, v0.4s
+# CHECK-NEXT:  1      8     5.00                        fsqrt	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        fsub	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00    *                   ld1	{ v0.16b }, [x0]
+# CHECK-NEXT:  2      5     3.00    *                   ld1	{ v0.2d, v1.2d, v2.2d }, [x0], #48
+# CHECK-NEXT:  1      6     4.00    *                   ld1	{ v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+# CHECK-NEXT:  2      4     2.00    *                   ld1	{ v0.4s, v1.4s }, [sp], #32
+# CHECK-NEXT:  1      5     3.00    *                   ld1	{ v0.4s, v1.4s, v2.4s }, [sp]
+# CHECK-NEXT:  2      4     2.00    *                   ld1	{ v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3
+# CHECK-NEXT:  2      3     1.00    *                   ld1	{ v0.8h }, [x15], x2
+# CHECK-NEXT:  1      4     2.00    *                   ld1	{ v0.8h, v1.8h }, [x15]
+# CHECK-NEXT:  1      3     1.00    *                   ld1	{ v0.b }[9], [x0]
+# CHECK-NEXT:  2      3     1.00    *                   ld1	{ v0.b }[9], [x0], #1
+# CHECK-NEXT:  1      3     1.00    *                   ld1r	{ v0.16b }, [x0]
+# CHECK-NEXT:  2      3     1.00    *                   ld1r	{ v0.16b }, [x0], #1
+# CHECK-NEXT:  1      3     1.00    *                   ld1r	{ v0.8h }, [x15]
+# CHECK-NEXT:  2      3     1.00    *                   ld1r	{ v0.8h }, [x15], #2
+# CHECK-NEXT:  2      4     2.00    *                   ld2	{ v0.16b, v1.16b }, [x0], x1
+# CHECK-NEXT:  1      4     1.00    *                   ld2	{ v0.8b, v1.8b }, [x0]
+# CHECK-NEXT:  1      4     6.00    *                   ld2	{ v0.h, v1.h }[7], [x15]
+# CHECK-NEXT:  2      4     6.00    *                   ld2	{ v0.h, v1.h }[7], [x15], #4
+# CHECK-NEXT:  1      3     2.00    *                   ld2r	{ v0.2d, v1.2d }, [x0]
+# CHECK-NEXT:  2      3     2.00    *                   ld2r	{ v0.2d, v1.2d }, [x0], #16
+# CHECK-NEXT:  1      3     2.00    *                   ld2r	{ v0.4s, v1.4s }, [sp]
+# CHECK-NEXT:  2      3     2.00    *                   ld2r	{ v0.4s, v1.4s }, [sp], #8
+# CHECK-NEXT:  1      5     6.00    *                   ld3	{ v0.4h, v1.4h, v2.4h }, [x15]
+# CHECK-NEXT:  2      5     6.00    *                   ld3	{ v0.8h, v1.8h, v2.8h }, [x15], x2
+# CHECK-NEXT:  1      5     7.00    *                   ld3	{ v0.s, v1.s, v2.s }[3], [sp]
+# CHECK-NEXT:  2      5     7.00    *                   ld3	{ v0.s, v1.s, v2.s }[3], [sp], x3
+# CHECK-NEXT:  1      4     3.00    *                   ld3r	{ v0.4h, v1.4h, v2.4h }, [x15]
+# CHECK-NEXT:  2      4     3.00    *                   ld3r	{ v0.4h, v1.4h, v2.4h }, [x15], #6
+# CHECK-NEXT:  1      4     3.00    *                   ld3r	{ v0.8b, v1.8b, v2.8b }, [x0]
+# CHECK-NEXT:  2      4     3.00    *                   ld3r	{ v0.8b, v1.8b, v2.8b }, [x0], #3
+# CHECK-NEXT:  1      5     7.00    *                   ld4	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp]
+# CHECK-NEXT:  2      5     8.00    *                   ld4	{ v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64
+# CHECK-NEXT:  1      6     7.00    *                   ld4	{ v0.d, v1.d, v2.d, v3.d }[1], [x0]
+# CHECK-NEXT:  2      6     7.00    *                   ld4	{ v0.d, v1.d, v2.d, v3.d }[1], [x0], #32
+# CHECK-NEXT:  2      6     7.00    *                   ld4	{ v0.h, v1.h, v2.h, v3.h }[7], [x0], x0
+# CHECK-NEXT:  1      4     4.00    *                   ld4r	{ v0.1d, v1.1d, v2.1d, v3.1d }, [sp]
+# CHECK-NEXT:  2      4     4.00    *                   ld4r	{ v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7
+# CHECK-NEXT:  1      4     4.00    *                   ld4r	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp]
+# CHECK-NEXT:  2      4     4.00    *                   ld4r	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30
+# CHECK-NEXT:  1      4     1.00                        mla	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        mls	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        mov	b0, v0.b[15]
+# CHECK-NEXT:  1      4     1.00                        mov	d6, v0.d[1]
+# CHECK-NEXT:  1      4     1.00                        mov	h2, v0.h[5]
+# CHECK-NEXT:  1      4     1.00                        mov	s17, v0.s[2]
+# CHECK-NEXT:  1      3     1.00                        mov	v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        mov	v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        movi	d15, #0xff00ff00ff00ff
+# CHECK-NEXT:  1      4     1.00                        movi	v0.16b, #31
+# CHECK-NEXT:  1      4     1.00                        movi	v0.2d, #0xff0000ff0000ffff
+# CHECK-NEXT:  1      4     1.00                        movi	v0.2s, #8, msl #8
+# CHECK-NEXT:  1      4     1.00                        movi	v0.4s, #255, lsl #24
+# CHECK-NEXT:  1      4     1.00                        movi	v0.8b, #255
+# CHECK-NEXT:  1      4     1.00                        mul	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        mvni	v0.2s, #0
+# CHECK-NEXT:  1      3     1.00                        mvni	v0.4s, #16, msl #16
+# CHECK-NEXT:  1      3     1.00                        neg	d29, d24
+# CHECK-NEXT:  1      3     1.00                        neg	v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        neg	v0.2d, v0.2d
+# CHECK-NEXT:  1      3     1.00                        neg	v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        neg	v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        neg	v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        neg	v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        neg	v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        mvn	v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        mvn	v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        orn	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        mov	v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        orr	v0.8h, #31
+# CHECK-NEXT:  1      4     1.00                        pmul	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        pmul	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        pmull	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        pmull2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      8     1.00                        raddhn	v0.2s, v0.2d, v0.2d
+# CHECK-NEXT:  1      8     1.00                        raddhn	v0.4h, v0.4s, v0.4s
+# CHECK-NEXT:  1      8     1.00                        raddhn	v0.8b, v0.8h, v0.8h
+# CHECK-NEXT:  1      8     1.00                        raddhn2	v0.16b, v0.8h, v0.8h
+# CHECK-NEXT:  1      8     1.00                        raddhn2	v0.4s, v0.2d, v0.2d
+# CHECK-NEXT:  1      8     1.00                        raddhn2	v0.8h, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        rbit	v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        rbit	v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        rev16	v21.8b, v1.8b
+# CHECK-NEXT:  1      4     1.00                        rev16	v30.16b, v31.16b
+# CHECK-NEXT:  1      4     1.00                        rev32	v0.4h, v9.4h
+# CHECK-NEXT:  1      4     1.00                        rev32	v21.8b, v1.8b
+# CHECK-NEXT:  1      4     1.00                        rev32	v30.16b, v31.16b
+# CHECK-NEXT:  1      4     1.00                        rev32	v4.8h, v7.8h
+# CHECK-NEXT:  1      4     1.00                        rev64	v0.16b, v31.16b
+# CHECK-NEXT:  1      4     1.00                        rev64	v1.8b, v9.8b
+# CHECK-NEXT:  1      4     1.00                        rev64	v13.4h, v21.4h
+# CHECK-NEXT:  1      4     1.00                        rev64	v2.8h, v4.8h
+# CHECK-NEXT:  1      4     1.00                        rev64	v4.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        rev64	v6.4s, v8.4s
+# CHECK-NEXT:  1      4     1.00                        rshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        rshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        rshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        rshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        rshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        rshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  1      8     1.00                        rsubhn	v0.2s, v0.2d, v0.2d
+# CHECK-NEXT:  1      8     1.00                        rsubhn	v0.4h, v0.4s, v0.4s
+# CHECK-NEXT:  1      8     1.00                        rsubhn	v0.8b, v0.8h, v0.8h
+# CHECK-NEXT:  1      8     1.00                        rsubhn2	v0.16b, v0.8h, v0.8h
+# CHECK-NEXT:  1      8     1.00                        rsubhn2	v0.4s, v0.2d, v0.2d
+# CHECK-NEXT:  1      8     1.00                        rsubhn2	v0.8h, v0.4s, v0.4s
+# CHECK-NEXT:  1      6     1.00                        saba	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      6     1.00                        sabal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      6     1.00                        sabal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      6     1.00                        sabal	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      6     1.00                        sabal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      6     1.00                        sabal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      6     1.00                        sabal2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        sabd	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        sabdl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        sabdl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        sabdl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        sabdl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        sabdl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        sabdl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      7     2.00                        sadalp	v0.1d, v0.2s
+# CHECK-NEXT:  1      7     2.00                        sadalp	v0.2d, v0.4s
+# CHECK-NEXT:  1      7     2.00                        sadalp	v0.2s, v0.4h
+# CHECK-NEXT:  1      7     2.00                        sadalp	v0.4h, v0.8b
+# CHECK-NEXT:  1      7     2.00                        sadalp	v0.4s, v0.8h
+# CHECK-NEXT:  1      7     2.00                        sadalp	v0.8h, v0.16b
+# CHECK-NEXT:  1      3     1.00                        saddl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        saddl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        saddl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        saddl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        saddl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        saddl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        saddlp	v0.1d, v0.2s
+# CHECK-NEXT:  1      3     1.00                        saddlp	v0.2d, v0.4s
+# CHECK-NEXT:  1      3     1.00                        saddlp	v0.2s, v0.4h
+# CHECK-NEXT:  1      3     1.00                        saddlp	v0.4h, v0.8b
+# CHECK-NEXT:  1      3     1.00                        saddlp	v0.4s, v0.8h
+# CHECK-NEXT:  1      3     1.00                        saddlp	v0.8h, v0.16b
+# CHECK-NEXT:  1      3     1.00                        saddw	v0.2d, v0.2d, v0.2s
+# CHECK-NEXT:  1      3     1.00                        saddw	v0.4s, v0.4s, v0.4h
+# CHECK-NEXT:  1      3     1.00                        saddw	v0.8h, v0.8h, v0.8b
+# CHECK-NEXT:  1      3     1.00                        saddw2	v0.2d, v0.2d, v0.4s
+# CHECK-NEXT:  1      3     1.00                        saddw2	v0.4s, v0.4s, v0.8h
+# CHECK-NEXT:  1      3     1.00                        saddw2	v0.8h, v0.8h, v0.16b
+# CHECK-NEXT:  1      4     1.00                        scvtf	d21, d12
+# CHECK-NEXT:  1      4     1.00                        scvtf	d21, d12, #64
+# CHECK-NEXT:  1      4     1.00                        scvtf	s22, s13
+# CHECK-NEXT:  1      4     1.00                        scvtf	s22, s13, #32
+# CHECK-NEXT:  1      4     1.00                        scvtf	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        scvtf	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        scvtf	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        scvtf	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      4     1.00                        scvtf	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        scvtf	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        scvtf	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        scvtf	v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        shadd	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        shl	d7, d10, #12
+# CHECK-NEXT:  1      3     1.00                        shl	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      3     1.00                        shl	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      3     1.00                        shl	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      3     1.00                        shl	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      3     1.00                        shll	v0.2d, v0.2s, #32
+# CHECK-NEXT:  1      3     1.00                        shll	v0.4s, v0.4h, #16
+# CHECK-NEXT:  1      3     1.00                        shll	v0.8h, v0.8b, #8
+# CHECK-NEXT:  1      3     1.00                        shll	v0.2d, v0.2s, #32
+# CHECK-NEXT:  1      3     1.00                        shll	v0.4s, v0.4h, #16
+# CHECK-NEXT:  1      3     1.00                        shll	v0.8h, v0.8b, #8
+# CHECK-NEXT:  1      3     1.00                        shll2	v0.2d, v0.4s, #32
+# CHECK-NEXT:  1      3     1.00                        shll2	v0.4s, v0.8h, #16
+# CHECK-NEXT:  1      3     1.00                        shll2	v0.8h, v0.16b, #8
+# CHECK-NEXT:  1      3     1.00                        shll2	v0.2d, v0.4s, #32
+# CHECK-NEXT:  1      3     1.00                        shll2	v0.4s, v0.8h, #16
+# CHECK-NEXT:  1      3     1.00                        shll2	v0.8h, v0.16b, #8
+# CHECK-NEXT:  1      3     1.00                        shrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  1      3     1.00                        shrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  1      3     1.00                        shrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        shrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        shrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  1      3     1.00                        shrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  1      3     1.00                        shsub	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        shsub	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        sli	d10, d14, #12
+# CHECK-NEXT:  1      4     1.00                        sli	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      4     1.00                        sli	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sli	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      4     1.00                        sli	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      4     1.00                        sli	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sli	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      4     1.00                        sli	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        smax	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        smax	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        smax	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        smaxp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        smaxp	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        smaxp	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        smin	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        smin	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        smin	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        sminp	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        sminp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        sminp	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        smlal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        smlal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        smlal	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        smlal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        smlal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        smlal2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        smlsl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        smlsl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        smlsl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        smlsl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        smlsl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        smlsl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        smull	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        smull	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        smull	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        smull2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        smull2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        smull2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        sqabs	b19, b14
+# CHECK-NEXT:  1      4     1.00                        sqabs	d18, d12
+# CHECK-NEXT:  1      4     1.00                        sqabs	h21, h15
+# CHECK-NEXT:  1      4     1.00                        sqabs	s20, s12
+# CHECK-NEXT:  1      4     1.00                        sqabs	v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        sqabs	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        sqabs	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        sqabs	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        sqabs	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqabs	v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        sqabs	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqadd	b20, b11, b15
+# CHECK-NEXT:  1      4     1.00                        sqadd	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        sqadd	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        sqdmlal	d19, s24, s12
+# CHECK-NEXT:  1      4     1.00                        sqdmlal	d8, s9, v0.s[1]
+# CHECK-NEXT:  1      4     1.00                        sqdmlal	s0, h0, v0.h[3]
+# CHECK-NEXT:  1      4     1.00                        sqdmlal	s17, h27, h12
+# CHECK-NEXT:  1      4     1.00                        sqdmlal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        sqdmlal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        sqdmlal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqdmlal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqdmlsl	d12, s23, s13
+# CHECK-NEXT:  1      4     1.00                        sqdmlsl	d8, s9, v0.s[1]
+# CHECK-NEXT:  1      4     1.00                        sqdmlsl	s0, h0, v0.h[3]
+# CHECK-NEXT:  1      4     1.00                        sqdmlsl	s14, h12, h25
+# CHECK-NEXT:  1      4     1.00                        sqdmlsl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        sqdmlsl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        sqdmlsl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqdmlsl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	h10, h11, h12
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	h7, h15, v0.h[3]
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	s15, s14, v0.s[1]
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	s20, s21, s2
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqdmull	d1, s1, v0.s[1]
+# CHECK-NEXT:  1      4     1.00                        sqdmull	d15, s22, s12
+# CHECK-NEXT:  1      4     1.00                        sqdmull	s1, h1, v0.h[3]
+# CHECK-NEXT:  1      4     1.00                        sqdmull	s12, h22, h12
+# CHECK-NEXT:  1      4     1.00                        sqdmull	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        sqdmull	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        sqdmull2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqdmull2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqneg	b19, b14
+# CHECK-NEXT:  1      4     1.00                        sqneg	d18, d12
+# CHECK-NEXT:  1      4     1.00                        sqneg	h21, h15
+# CHECK-NEXT:  1      4     1.00                        sqneg	s20, s12
+# CHECK-NEXT:  1      4     1.00                        sqneg	v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        sqneg	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        sqneg	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        sqneg	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        sqneg	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqneg	v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        sqneg	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	h10, h11, h12
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	h7, h15, v0.h[3]
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	s15, s14, v0.s[1]
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	s20, s21, s2
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqrshl	d31, d31, d31
+# CHECK-NEXT:  1      4     1.00                        sqrshl	h3, h4, h15
+# CHECK-NEXT:  1      4     1.00                        sqrshl	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        sqrshl	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        sqrshl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        sqrshrn	b10, h13, #2
+# CHECK-NEXT:  1      4     1.00                        sqrshrn	h15, s10, #6
+# CHECK-NEXT:  1      4     1.00                        sqrshrn	s15, d12, #9
+# CHECK-NEXT:  1      4     1.00                        sqrshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrun	b17, h10, #6
+# CHECK-NEXT:  1      4     1.00                        sqrshrun	h10, s13, #15
+# CHECK-NEXT:  1      4     1.00                        sqrshrun	s22, d16, #31
+# CHECK-NEXT:  1      4     1.00                        sqrshrun	v0.2s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrun	v0.4h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrun	v0.8b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrun2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrun2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqrshrun2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqshl	b11, b19, #7
+# CHECK-NEXT:  1      4     1.00                        sqshl	d15, d16, #51
+# CHECK-NEXT:  1      4     1.00                        sqshl	d31, d31, d31
+# CHECK-NEXT:  1      4     1.00                        sqshl	h13, h18, #11
+# CHECK-NEXT:  1      4     1.00                        sqshl	h3, h4, h15
+# CHECK-NEXT:  1      4     1.00                        sqshl	s14, s17, #22
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        sqshl	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqshlu	b15, b18, #6
+# CHECK-NEXT:  1      4     1.00                        sqshlu	d11, d13, #32
+# CHECK-NEXT:  1      4     1.00                        sqshlu	h19, h17, #6
+# CHECK-NEXT:  1      4     1.00                        sqshlu	s16, s14, #25
+# CHECK-NEXT:  1      4     1.00                        sqshlu	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      4     1.00                        sqshlu	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqshlu	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      4     1.00                        sqshlu	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      4     1.00                        sqshlu	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqshlu	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      4     1.00                        sqshlu	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrn	b10, h15, #5
+# CHECK-NEXT:  1      4     1.00                        sqshrn	h17, s10, #4
+# CHECK-NEXT:  1      4     1.00                        sqshrn	s18, d10, #31
+# CHECK-NEXT:  1      4     1.00                        sqshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrun	b15, h10, #7
+# CHECK-NEXT:  1      4     1.00                        sqshrun	h20, s14, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrun	s10, d15, #15
+# CHECK-NEXT:  1      4     1.00                        sqshrun	v0.2s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrun	v0.4h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrun	v0.8b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrun2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrun2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sqshrun2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sqsub	s20, s10, s7
+# CHECK-NEXT:  1      4     1.00                        sqsub	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        sqsub	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqsub	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        sqxtn	b18, h18
+# CHECK-NEXT:  1      4     1.00                        sqxtn	h20, s17
+# CHECK-NEXT:  1      4     1.00                        sqxtn	s19, d14
+# CHECK-NEXT:  1      4     1.00                        sqxtn	v0.2s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        sqxtn	v0.4h, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqxtn	v0.8b, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqxtn2	v0.16b, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqxtn2	v0.4s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        sqxtn2	v0.8h, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqxtun	b19, h14
+# CHECK-NEXT:  1      4     1.00                        sqxtun	h21, s15
+# CHECK-NEXT:  1      4     1.00                        sqxtun	s20, d12
+# CHECK-NEXT:  1      4     1.00                        sqxtun	v0.2s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        sqxtun	v0.4h, v0.4s
+# CHECK-NEXT:  1      4     1.00                        sqxtun	v0.8b, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqxtun2	v0.16b, v0.8h
+# CHECK-NEXT:  1      4     1.00                        sqxtun2	v0.4s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        sqxtun2	v0.8h, v0.4s
+# CHECK-NEXT:  1      3     1.00                        srhadd	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        srhadd	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        srhadd	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        sri	d10, d12, #14
+# CHECK-NEXT:  1      4     1.00                        sri	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      4     1.00                        sri	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        sri	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      4     1.00                        sri	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      4     1.00                        sri	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        sri	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      4     1.00                        sri	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        srshl	d16, d16, d16
+# CHECK-NEXT:  1      3     1.00                        srshl	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        srshl	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        srshl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        srshr	d19, d18, #7
+# CHECK-NEXT:  1      3     1.00                        srshr	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      3     1.00                        srshr	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      3     1.00                        srshr	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      3     1.00                        srshr	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      3     1.00                        srshr	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      3     1.00                        srshr	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      3     1.00                        srshr	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      7     2.00                        srsra	d15, d11, #19
+# CHECK-NEXT:  1      7     2.00                        srsra	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      7     2.00                        srsra	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      7     2.00                        srsra	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      7     2.00                        srsra	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      7     2.00                        srsra	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      7     2.00                        srsra	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      7     2.00                        srsra	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        sshl	d31, d31, d31
+# CHECK-NEXT:  1      3     1.00                        sshl	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      3     1.00                        sshl	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        sshl	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        sshl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        sshll	v0.2d, v0.2s, #3
+# CHECK-NEXT:  1      3     1.00                        sshll2	v0.4s, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        sshr	d15, d16, #12
+# CHECK-NEXT:  1      3     1.00                        sshr	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      3     1.00                        sshr	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      3     1.00                        sshr	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      3     1.00                        sshr	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      3     1.00                        sshr	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      3     1.00                        sshr	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      3     1.00                        sshr	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        ssra	d18, d12, #21
+# CHECK-NEXT:  1      3     1.00                        ssra	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      3     1.00                        ssra	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      3     1.00                        ssra	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      3     1.00                        ssra	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      3     1.00                        ssra	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      3     1.00                        ssra	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      3     1.00                        ssra	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        ssubl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        ssubl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        ssubl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        ssubl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        ssubl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        ssubl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        ssubw	v0.2d, v0.2d, v0.2s
+# CHECK-NEXT:  1      3     1.00                        ssubw	v0.4s, v0.4s, v0.4h
+# CHECK-NEXT:  1      3     1.00                        ssubw	v0.8h, v0.8h, v0.8b
+# CHECK-NEXT:  1      3     1.00                        ssubw2	v0.2d, v0.2d, v0.4s
+# CHECK-NEXT:  1      3     1.00                        ssubw2	v0.4s, v0.4s, v0.8h
+# CHECK-NEXT:  1      3     1.00                        ssubw2	v0.8h, v0.8h, v0.16b
+# CHECK-NEXT:  1      5     2.00           *            st1	{ v0.16b }, [x0]
+# CHECK-NEXT:  2      5     2.00           *            st1	{ v0.2d, v1.2d, v2.2d }, [x0], #48
+# CHECK-NEXT:  1      5     4.00           *            st1	{ v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+# CHECK-NEXT:  2      5     2.00           *            st1	{ v0.4s, v1.4s }, [sp], #32
+# CHECK-NEXT:  1      5     2.00           *            st1	{ v0.4s, v1.4s, v2.4s }, [sp]
+# CHECK-NEXT:  2      5     4.00           *            st1	{ v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3
+# CHECK-NEXT:  2      5     2.00           *            st1	{ v0.8h }, [x15], x2
+# CHECK-NEXT:  1      5     2.00           *            st1	{ v0.8h, v1.8h }, [x15]
+# CHECK-NEXT:  1      4     1.00           *            st1	{ v0.d }[1], [x0]
+# CHECK-NEXT:  2      4     1.00           *            st1	{ v0.d }[1], [x0], #8
+# CHECK-NEXT:  2      5     4.00           *            st2	{ v0.16b, v1.16b }, [x0], x1
+# CHECK-NEXT:  1      5     2.00           *            st2	{ v0.8b, v1.8b }, [x0]
+# CHECK-NEXT:  1      5     2.00           *            st2	{ v0.s, v1.s }[3], [sp]
+# CHECK-NEXT:  2      5     2.00           *            st2	{ v0.s, v1.s }[3], [sp], #8
+# CHECK-NEXT:  1      5     4.00           *            st3	{ v0.4h, v1.4h, v2.4h }, [x15]
+# CHECK-NEXT:  2      5     4.00           *            st3	{ v0.8h, v1.8h, v2.8h }, [x15], x2
+# CHECK-NEXT:  1      5     2.00           *            st3	{ v0.h, v1.h, v2.h }[7], [x15]
+# CHECK-NEXT:  2      5     2.00           *            st3	{ v0.h, v1.h, v2.h }[7], [x15], #6
+# CHECK-NEXT:  1      5     4.00           *            st4	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp]
+# CHECK-NEXT:  2      5     4.00           *            st4	{ v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64
+# CHECK-NEXT:  1      5     2.00           *            st4	{ v0.b, v1.b, v2.b, v3.b }[9], [x0]
+# CHECK-NEXT:  2      5     2.00           *            st4	{ v0.b, v1.b, v2.b, v3.b }[9], [x0], x5
+# CHECK-NEXT:  1      3     1.00                        sub	d15, d5, d16
+# CHECK-NEXT:  1      3     1.00                        sub	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        suqadd	b19, b14
+# CHECK-NEXT:  1      4     1.00                        suqadd	d18, d22
+# CHECK-NEXT:  1      4     1.00                        suqadd	h20, h15
+# CHECK-NEXT:  1      4     1.00                        suqadd	s21, s12
+# CHECK-NEXT:  1      4     1.00                        suqadd	v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        suqadd	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        suqadd	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        suqadd	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        suqadd	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        suqadd	v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        suqadd	v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        tbl	v0.16b, { v0.16b }, v0.16b
+# CHECK-NEXT:  1      4     1.00                        tbl	v0.16b, { v0.16b, v1.16b }, v0.16b
+# CHECK-NEXT:  1      4     1.00                        tbl	v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b
+# CHECK-NEXT:  1      4     1.00                        tbl	v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b
+# CHECK-NEXT:  1      4     1.00                        tbl	v0.8b, { v0.16b }, v0.8b
+# CHECK-NEXT:  1      4     1.00                        tbl	v0.8b, { v0.16b, v1.16b }, v0.8b
+# CHECK-NEXT:  1      4     1.00                        tbl	v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b
+# CHECK-NEXT:  1      4     1.00                        tbl	v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b
+# CHECK-NEXT:  1      4     1.00                        tbx	v0.16b, { v0.16b }, v0.16b
+# CHECK-NEXT:  1      4     1.00                        tbx	v0.16b, { v0.16b, v1.16b }, v0.16b
+# CHECK-NEXT:  1      4     1.00                        tbx	v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b
+# CHECK-NEXT:  1      4     1.00                        tbx	v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b
+# CHECK-NEXT:  1      4     1.00                        tbx	v0.8b, { v0.16b }, v0.8b
+# CHECK-NEXT:  1      4     1.00                        tbx	v0.8b, { v0.16b, v1.16b }, v0.8b
+# CHECK-NEXT:  1      4     1.00                        tbx	v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b
+# CHECK-NEXT:  1      4     1.00                        tbx	v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b
+# CHECK-NEXT:  1      4     1.00                        trn1	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        trn1	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        trn1	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        trn1	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        trn1	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        trn1	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        trn1	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        trn2	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        trn2	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        trn2	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        trn2	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        trn2	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        trn2	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        trn2	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      6     1.00                        uaba	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      6     1.00                        uabal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      6     1.00                        uabal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      6     1.00                        uabal	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      6     1.00                        uabal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      6     1.00                        uabal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      6     1.00                        uabal2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        uabd	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        uabdl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        uabdl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        uabdl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        uabdl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        uabdl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        uabdl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      7     2.00                        uadalp	v0.1d, v0.2s
+# CHECK-NEXT:  1      7     2.00                        uadalp	v0.2d, v0.4s
+# CHECK-NEXT:  1      7     2.00                        uadalp	v0.2s, v0.4h
+# CHECK-NEXT:  1      7     2.00                        uadalp	v0.4h, v0.8b
+# CHECK-NEXT:  1      7     2.00                        uadalp	v0.4s, v0.8h
+# CHECK-NEXT:  1      7     2.00                        uadalp	v0.8h, v0.16b
+# CHECK-NEXT:  1      3     1.00                        uaddl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        uaddl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        uaddl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        uaddl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        uaddl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        uaddl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        uaddlp	v0.1d, v0.2s
+# CHECK-NEXT:  1      3     1.00                        uaddlp	v0.2d, v0.4s
+# CHECK-NEXT:  1      3     1.00                        uaddlp	v0.2s, v0.4h
+# CHECK-NEXT:  1      3     1.00                        uaddlp	v0.4h, v0.8b
+# CHECK-NEXT:  1      3     1.00                        uaddlp	v0.4s, v0.8h
+# CHECK-NEXT:  1      3     1.00                        uaddlp	v0.8h, v0.16b
+# CHECK-NEXT:  1      3     1.00                        uaddw	v0.2d, v0.2d, v0.2s
+# CHECK-NEXT:  1      3     1.00                        uaddw	v0.4s, v0.4s, v0.4h
+# CHECK-NEXT:  1      3     1.00                        uaddw	v0.8h, v0.8h, v0.8b
+# CHECK-NEXT:  1      3     1.00                        uaddw2	v0.2d, v0.2d, v0.4s
+# CHECK-NEXT:  1      3     1.00                        uaddw2	v0.4s, v0.4s, v0.8h
+# CHECK-NEXT:  1      3     1.00                        uaddw2	v0.8h, v0.8h, v0.16b
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d21, d14
+# CHECK-NEXT:  1      4     1.00                        ucvtf	d21, d14, #64
+# CHECK-NEXT:  1      4     1.00                        ucvtf	s22, s13
+# CHECK-NEXT:  1      4     1.00                        ucvtf	s22, s13, #32
+# CHECK-NEXT:  1      4     1.00                        ucvtf	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        ucvtf	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        ucvtf	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        ucvtf	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      4     1.00                        ucvtf	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        ucvtf	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        ucvtf	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        ucvtf	v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        uhadd	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        uhadd	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        uhsub	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        umax	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        umax	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        umax	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        umaxp	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        umaxp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        umaxp	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        umin	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        umin	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        umin	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        uminp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        uminp	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        uminp	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        umlal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        umlal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        umlal	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        umlal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        umlal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        umlal2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        umlsl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        umlsl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        umlsl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        umlsl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        umlsl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        umlsl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        umull	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        umull	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        umull	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        umull2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        umull2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        umull2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        uqadd	h0, h1, h5
+# CHECK-NEXT:  1      4     1.00                        uqadd	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        uqrshl	b11, b20, b30
+# CHECK-NEXT:  1      4     1.00                        uqrshl	s23, s20, s16
+# CHECK-NEXT:  1      4     1.00                        uqrshl	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        uqrshl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        uqrshl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        uqrshl	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        uqrshrn	b10, h12, #5
+# CHECK-NEXT:  1      4     1.00                        uqrshrn	h12, s10, #14
+# CHECK-NEXT:  1      4     1.00                        uqrshrn	s10, d10, #25
+# CHECK-NEXT:  1      4     1.00                        uqrshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        uqrshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        uqrshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        uqrshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        uqrshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        uqrshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        uqshl	b11, b20, b30
+# CHECK-NEXT:  1      4     1.00                        uqshl	b18, b15, #6
+# CHECK-NEXT:  1      4     1.00                        uqshl	d15, d12, #19
+# CHECK-NEXT:  1      4     1.00                        uqshl	h11, h18, #7
+# CHECK-NEXT:  1      4     1.00                        uqshl	s14, s19, #18
+# CHECK-NEXT:  1      4     1.00                        uqshl	s23, s20, s16
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        uqshl	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        uqshrn	b12, h10, #7
+# CHECK-NEXT:  1      4     1.00                        uqshrn	h10, s14, #5
+# CHECK-NEXT:  1      4     1.00                        uqshrn	s10, d12, #13
+# CHECK-NEXT:  1      4     1.00                        uqshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        uqshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        uqshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        uqshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        uqshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  1      4     1.00                        uqshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  1      4     1.00                        uqsub	d16, d16, d16
+# CHECK-NEXT:  1      4     1.00                        uqsub	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        uqxtn	b18, h18
+# CHECK-NEXT:  1      4     1.00                        uqxtn	h20, s17
+# CHECK-NEXT:  1      4     1.00                        uqxtn	s19, d14
+# CHECK-NEXT:  1      4     1.00                        uqxtn	v0.2s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        uqxtn	v0.4h, v0.4s
+# CHECK-NEXT:  1      4     1.00                        uqxtn	v0.8b, v0.8h
+# CHECK-NEXT:  1      4     1.00                        uqxtn2	v0.16b, v0.8h
+# CHECK-NEXT:  1      4     1.00                        uqxtn2	v0.4s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        uqxtn2	v0.8h, v0.4s
+# CHECK-NEXT:  1      4     1.00                        urecpe	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        urecpe	v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        urhadd	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        urhadd	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        urhadd	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        urshl	d8, d7, d4
+# CHECK-NEXT:  1      3     1.00                        urshl	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        urshl	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      3     1.00                        urshl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        urshl	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        urshr	d20, d23, #31
+# CHECK-NEXT:  1      3     1.00                        urshr	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      3     1.00                        urshr	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      3     1.00                        urshr	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      3     1.00                        urshr	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      3     1.00                        urshr	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      3     1.00                        urshr	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      3     1.00                        urshr	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      12    9.00                        ursqrte	v0.2s, v0.2s
+# CHECK-NEXT:  1      12    9.00                        ursqrte	v0.4s, v0.4s
+# CHECK-NEXT:  1      7     2.00                        ursra	d18, d10, #13
+# CHECK-NEXT:  1      7     2.00                        ursra	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      7     2.00                        ursra	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      7     2.00                        ursra	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      7     2.00                        ursra	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      7     2.00                        ursra	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      7     2.00                        ursra	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      7     2.00                        ursra	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        ushl	d0, d0, d0
+# CHECK-NEXT:  1      3     1.00                        ushl	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        ushl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        ushl	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        ushll	v0.4s, v0.4h, #3
+# CHECK-NEXT:  1      3     1.00                        ushll2	v0.8h, v0.16b, #3
+# CHECK-NEXT:  1      3     1.00                        ushr	d10, d17, #18
+# CHECK-NEXT:  1      3     1.00                        ushr	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      3     1.00                        ushr	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      3     1.00                        ushr	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      3     1.00                        ushr	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      3     1.00                        ushr	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      3     1.00                        ushr	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      3     1.00                        ushr	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      4     1.00                        usqadd	b19, b14
+# CHECK-NEXT:  1      4     1.00                        usqadd	d18, d22
+# CHECK-NEXT:  1      4     1.00                        usqadd	h20, h15
+# CHECK-NEXT:  1      4     1.00                        usqadd	s21, s12
+# CHECK-NEXT:  1      4     1.00                        usqadd	v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        usqadd	v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        usqadd	v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        usqadd	v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        usqadd	v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        usqadd	v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        usqadd	v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        usra	d20, d13, #61
+# CHECK-NEXT:  1      3     1.00                        usra	v0.16b, v0.16b, #3
+# CHECK-NEXT:  1      3     1.00                        usra	v0.2d, v0.2d, #3
+# CHECK-NEXT:  1      3     1.00                        usra	v0.2s, v0.2s, #3
+# CHECK-NEXT:  1      3     1.00                        usra	v0.4h, v0.4h, #3
+# CHECK-NEXT:  1      3     1.00                        usra	v0.4s, v0.4s, #3
+# CHECK-NEXT:  1      3     1.00                        usra	v0.8b, v0.8b, #3
+# CHECK-NEXT:  1      3     1.00                        usra	v0.8h, v0.8h, #3
+# CHECK-NEXT:  1      3     1.00                        usubl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  1      3     1.00                        usubl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  1      3     1.00                        usubl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     1.00                        usubl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  1      3     1.00                        usubl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  1      3     1.00                        usubl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     1.00                        usubw	v0.2d, v0.2d, v0.2s
+# CHECK-NEXT:  1      3     1.00                        usubw	v0.4s, v0.4s, v0.4h
+# CHECK-NEXT:  1      3     1.00                        usubw	v0.8h, v0.8h, v0.8b
+# CHECK-NEXT:  1      3     1.00                        usubw2	v0.2d, v0.2d, v0.4s
+# CHECK-NEXT:  1      3     1.00                        usubw2	v0.4s, v0.4s, v0.8h
+# CHECK-NEXT:  1      3     1.00                        usubw2	v0.8h, v0.8h, v0.16b
+# CHECK-NEXT:  1      4     1.00                        uzp1	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        uzp1	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        uzp1	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        uzp1	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        uzp1	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        uzp1	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        uzp1	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        uzp2	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        uzp2	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        uzp2	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        uzp2	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        uzp2	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        uzp2	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        uzp2	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        xtn	v0.2s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        xtn	v0.4h, v0.4s
+# CHECK-NEXT:  1      4     1.00                        xtn	v0.8b, v0.8h
+# CHECK-NEXT:  1      4     1.00                        xtn2	v0.16b, v0.8h
+# CHECK-NEXT:  1      4     1.00                        xtn2	v0.4s, v0.2d
+# CHECK-NEXT:  1      4     1.00                        xtn2	v0.8h, v0.4s
+# CHECK-NEXT:  1      4     1.00                        zip1	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        zip1	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        zip1	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        zip1	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        zip1	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        zip1	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        zip1	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  1      4     1.00                        zip2	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  1      4     1.00                        zip2	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  1      4     1.00                        zip2	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  1      4     1.00                        zip2	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        zip2	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  1      4     1.00                        zip2	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     1.00                        zip2	v0.8h, v0.8h, v0.8h
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - CortexA320UnitALU
+# CHECK-NEXT: [1]   - CortexA320UnitB
+# CHECK-NEXT: [2]   - CortexA320UnitDiv
+# CHECK-NEXT: [3]   - CortexA320UnitLdSt
+# CHECK-NEXT: [4]   - CortexA320UnitMAC
+# CHECK-NEXT: [5]   - CortexA320UnitPAC
+# CHECK-NEXT: [6]   - CortexA320UnitVALU
+# CHECK-NEXT: [7]   - CortexA320UnitVMAC
+# CHECK-NEXT: [8]   - CortexA320UnitVMC
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT:  -      -      -     193.00  -      -     1002.00 6.00  197.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	d29, d24
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	d17, d31, d29
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhn	v0.2s, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhn	v0.4h, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhn	v0.8b, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhn2	v0.16b, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhn2	v0.4s, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhn2	v0.8h, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addp	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	v0.4h, #15, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bif	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bit	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bsl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmeq	d20, d21, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmeq	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmeq	v0.16b, v0.16b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmeq	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmge	d20, d21, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmge	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmge	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmge	v0.8b, v0.8b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmgt	d20, d21, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmgt	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmgt	v0.2s, v0.2s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmgt	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmhi	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmhi	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmhs	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmhs	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmle	d20, d21, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmle	v0.2d, v0.2d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmlt	d20, d21, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmlt	v0.8h, v0.8h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmtst	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmtst	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnt	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnt	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dup	v0.16b, w28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dup	v0.2d, x28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dup	v0.2s, w28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dup	v0.4h, w28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dup	v0.4s, w28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dup	v0.8b, w28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dup	v0.8h, w28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ext	v0.16b, v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ext	v0.8b, v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabd	d29, d24, d20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabd	s29, s24, s20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabd	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facge	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facge	s10, s11, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facge	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facgt	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facgt	s10, s11, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facgt	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     faddp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     faddp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	d20, d21, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	s10, s11, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	s10, s11, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	v0.2s, v0.2s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	d20, d21, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	s10, s11, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	s10, s11, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	v0.2d, v0.2d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	d20, d21, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	d20, d21, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	s10, s11, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	s10, s11, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	v0.4s, v0.4s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmle	d20, d21, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmle	s10, s11, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmle	v0.2d, v0.2d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmlt	d20, d21, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmlt	s10, s11, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmlt	v0.4s, v0.4s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	s12, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtas	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	s12, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtau	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtl	v0.2d, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtl	v0.4s, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtl2	v0.2d, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtl2	v0.4s, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	s22, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtms	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	s12, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtmu	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtn	v0.2s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtn	v0.4h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtn2	v0.4s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtn2	v0.8h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	s22, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtns	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	s12, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnu	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	s22, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtps	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	s12, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtpu	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtxn	s22, d13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtxn	v0.2s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtxn2	v0.4s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	d21, d12, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	s12, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	s21, s12, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	d21, d12, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	s12, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	s21, s12, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.00  fdiv	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnmp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnmp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnmp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnmp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnmp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnmp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminp	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmla	d0, d1, v0.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmla	s0, s1, v0.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmla	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmls	d0, d4, v0.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmls	s3, s5, v0.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmls	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	v0.2d, #-1.25000000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	v0.2s, #13.00000000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	v0.4s, #1.00000000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmul	d0, d1, v0.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmul	s0, s1, v0.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmul	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmulx	d0, d4, v0.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmulx	d23, d11, d1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmulx	s20, s22, s15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmulx	s3, s5, v0.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmulx	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmulx	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmulx	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecpe	d13, d13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecpe	s19, s14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecpe	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecpe	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecpe	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecpe	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecpe	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecps	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecps	d22, d30, d21
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecps	s21, s16, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecpx	d16, d19
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frecpx	s18, s10
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  frsqrte	d21, d12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.00   frsqrte	s22, s13
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  frsqrte	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.00   frsqrte	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     5.00   frsqrte	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.00   frsqrte	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     5.00   frsqrte	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  frsqrts	d8, d22, d18
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.00   frsqrts	s21, s5, s12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  frsqrts	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  fsqrt	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.00   fsqrt	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     5.00   fsqrt	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.00   fsqrt	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     5.00   fsqrt	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1	{ v0.16b }, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld1	{ v0.2d, v1.2d, v2.2d }, [x0], #48
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     ld1	{ v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld1	{ v0.4s, v1.4s }, [sp], #32
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld1	{ v0.4s, v1.4s, v2.4s }, [sp]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld1	{ v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1	{ v0.8h }, [x15], x2
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld1	{ v0.8h, v1.8h }, [x15]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1	{ v0.b }[9], [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1	{ v0.b }[9], [x0], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1r	{ v0.16b }, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1r	{ v0.16b }, [x0], #1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1r	{ v0.8h }, [x15]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1r	{ v0.8h }, [x15], #2
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2	{ v0.16b, v1.16b }, [x0], x1
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2	{ v0.8b, v1.8b }, [x0]
+# CHECK-NEXT:  -      -      -     6.00    -      -      -      -      -     ld2	{ v0.h, v1.h }[7], [x15]
+# CHECK-NEXT:  -      -      -     6.00    -      -      -      -      -     ld2	{ v0.h, v1.h }[7], [x15], #4
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2r	{ v0.2d, v1.2d }, [x0]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2r	{ v0.2d, v1.2d }, [x0], #16
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2r	{ v0.4s, v1.4s }, [sp]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2r	{ v0.4s, v1.4s }, [sp], #8
+# CHECK-NEXT:  -      -      -     6.00    -      -      -      -      -     ld3	{ v0.4h, v1.4h, v2.4h }, [x15]
+# CHECK-NEXT:  -      -      -     6.00    -      -      -      -      -     ld3	{ v0.8h, v1.8h, v2.8h }, [x15], x2
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld3	{ v0.s, v1.s, v2.s }[3], [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld3	{ v0.s, v1.s, v2.s }[3], [sp], x3
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3r	{ v0.4h, v1.4h, v2.4h }, [x15]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3r	{ v0.4h, v1.4h, v2.4h }, [x15], #6
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3r	{ v0.8b, v1.8b, v2.8b }, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3r	{ v0.8b, v1.8b, v2.8b }, [x0], #3
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld4	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     ld4	{ v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld4	{ v0.d, v1.d, v2.d, v3.d }[1], [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld4	{ v0.d, v1.d, v2.d, v3.d }[1], [x0], #32
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld4	{ v0.h, v1.h, v2.h, v3.h }[7], [x0], x0
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     ld4r	{ v0.1d, v1.1d, v2.1d, v3.1d }, [sp]
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     ld4r	{ v0.1d, v1.1d, v2.1d, v3.1d }, [sp], x7
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     ld4r	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp]
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     ld4r	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp], x30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mla	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mls	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	b0, v0.b[15]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	d6, v0.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	h2, v0.h[5]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	s17, v0.s[2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movi	d15, #0xff00ff00ff00ff
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movi	v0.16b, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movi	v0.2d, #0xff0000ff0000ffff
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movi	v0.2s, #8, msl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movi	v0.4s, #255, lsl #24
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movi	v0.8b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mul	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mvni	v0.2s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mvni	v0.4s, #16, msl #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	d29, d24
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mvn	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mvn	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orn	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	v0.8h, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pmul	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pmul	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pmull	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pmull2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhn	v0.2s, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhn	v0.4h, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhn	v0.8b, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhn2	v0.16b, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhn2	v0.4s, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhn2	v0.8h, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rbit	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rbit	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev16	v21.8b, v1.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev16	v30.16b, v31.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev32	v0.4h, v9.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev32	v21.8b, v1.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev32	v30.16b, v31.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev32	v4.8h, v7.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev64	v0.16b, v31.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev64	v1.8b, v9.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev64	v13.4h, v21.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev64	v2.8h, v4.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev64	v4.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev64	v6.4s, v8.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhn	v0.2s, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhn	v0.4h, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhn	v0.8b, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhn2	v0.16b, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhn2	v0.4s, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhn2	v0.8h, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saba	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabal	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabal2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabd	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sadalp	v0.1d, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sadalp	v0.2d, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sadalp	v0.2s, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sadalp	v0.4h, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sadalp	v0.4s, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sadalp	v0.8h, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlp	v0.1d, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlp	v0.2d, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlp	v0.2s, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlp	v0.4h, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlp	v0.4s, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlp	v0.8h, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddw	v0.2d, v0.2d, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddw	v0.4s, v0.4s, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddw	v0.8h, v0.8h, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddw2	v0.2d, v0.2d, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddw2	v0.4s, v0.4s, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddw2	v0.8h, v0.8h, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d21, d12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	d21, d12, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s22, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	s22, s13, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shadd	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shl	d7, d10, #12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shl	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shl	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shl	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shl	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll	v0.2d, v0.2s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll	v0.4s, v0.4h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll	v0.8h, v0.8b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll	v0.2d, v0.2s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll	v0.4s, v0.4h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll	v0.8h, v0.8b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll2	v0.2d, v0.4s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll2	v0.4s, v0.8h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll2	v0.8h, v0.16b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll2	v0.2d, v0.4s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll2	v0.4s, v0.8h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shll2	v0.8h, v0.16b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsub	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsub	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	d10, d14, #12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxp	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxp	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminp	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminp	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlal	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlal2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlsl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlsl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlsl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlsl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlsl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smlsl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smull	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smull	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smull	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smull2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smull2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smull2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	b19, b14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	d18, d12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	h21, h15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	s20, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	b20, b11, b15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlal	d19, s24, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlal	d8, s9, v0.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlal	s0, h0, v0.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlal	s17, h27, h12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlsl	d12, s23, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlsl	d8, s9, v0.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlsl	s0, h0, v0.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlsl	s14, h12, h25
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlsl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlsl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlsl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmlsl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmulh	h10, h11, h12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmulh	h7, h15, v0.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmulh	s15, s14, v0.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmulh	s20, s21, s2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmulh	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmulh	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmull	d1, s1, v0.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmull	d15, s22, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmull	s1, h1, v0.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmull	s12, h22, h12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmull	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmull	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmull2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdmull2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	b19, b14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	d18, d12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	h21, h15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	s20, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrdmulh	h10, h11, h12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrdmulh	h7, h15, v0.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrdmulh	s15, s14, v0.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrdmulh	s20, s21, s2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrdmulh	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrdmulh	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshl	d31, d31, d31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshl	h3, h4, h15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshl	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshl	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrn	b10, h13, #2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrn	h15, s10, #6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrn	s15, d12, #9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrun	b17, h10, #6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrun	h10, s13, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrun	s22, d16, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrun	v0.2s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrun	v0.4h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrun	v0.8b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrun2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrun2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrun2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	b11, b19, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	d15, d16, #51
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	d31, d31, d31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	h13, h18, #11
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	h3, h4, h15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	s14, s17, #22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	b15, b18, #6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	d11, d13, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	h19, h17, #6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	s16, s14, #25
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrn	b10, h15, #5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrn	h17, s10, #4
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrn	s18, d10, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrun	b15, h10, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrun	h20, s14, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrun	s10, d15, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrun	v0.2s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrun	v0.4h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrun	v0.8b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrun2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrun2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrun2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	s20, s10, s7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtn	b18, h18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtn	h20, s17
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtn	s19, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtn	v0.2s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtn	v0.4h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtn	v0.8b, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtn2	v0.16b, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtn2	v0.4s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtn2	v0.8h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtun	b19, h14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtun	h21, s15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtun	s20, d12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtun	v0.2s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtun	v0.4h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtun	v0.8b, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtun2	v0.16b, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtun2	v0.4s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtun2	v0.8h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srhadd	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srhadd	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srhadd	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	d10, d12, #14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshl	d16, d16, d16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshl	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshl	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	d19, d18, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	d15, d11, #19
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshl	d31, d31, d31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshl	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshl	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshl	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshl	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshll	v0.2d, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshll2	v0.4s, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshr	d15, d16, #12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshr	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshr	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshr	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshr	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshr	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshr	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshr	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	d18, d12, #21
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubw	v0.2d, v0.2d, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubw	v0.4s, v0.4s, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubw	v0.8h, v0.8h, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubw2	v0.2d, v0.2d, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubw2	v0.4s, v0.4s, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubw2	v0.8h, v0.8h, v0.16b
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st1	{ v0.16b }, [x0]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st1	{ v0.2d, v1.2d, v2.2d }, [x0], #48
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     st1	{ v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st1	{ v0.4s, v1.4s }, [sp], #32
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st1	{ v0.4s, v1.4s, v2.4s }, [sp]
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     st1	{ v0.8b, v1.8b, v2.8b, v3.8b }, [x0], x3
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st1	{ v0.8h }, [x15], x2
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st1	{ v0.8h, v1.8h }, [x15]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1	{ v0.d }[1], [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1	{ v0.d }[1], [x0], #8
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     st2	{ v0.16b, v1.16b }, [x0], x1
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st2	{ v0.8b, v1.8b }, [x0]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st2	{ v0.s, v1.s }[3], [sp]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st2	{ v0.s, v1.s }[3], [sp], #8
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     st3	{ v0.4h, v1.4h, v2.4h }, [x15]
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     st3	{ v0.8h, v1.8h, v2.8h }, [x15], x2
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st3	{ v0.h, v1.h, v2.h }[7], [x15]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st3	{ v0.h, v1.h, v2.h }[7], [x15], #6
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     st4	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp]
+# CHECK-NEXT:  -      -      -     4.00    -      -      -      -      -     st4	{ v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st4	{ v0.b, v1.b, v2.b, v3.b }[9], [x0]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     st4	{ v0.b, v1.b, v2.b, v3.b }[9], [x0], x5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	d15, d5, d16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	b19, b14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	d18, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	h20, h15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	s21, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	v0.16b, { v0.16b }, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	v0.16b, { v0.16b, v1.16b }, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	v0.8b, { v0.16b }, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	v0.8b, { v0.16b, v1.16b }, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	v0.16b, { v0.16b }, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	v0.16b, { v0.16b, v1.16b }, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	v0.8b, { v0.16b }, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	v0.8b, { v0.16b, v1.16b }, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	v0.8b, { v0.16b, v1.16b, v2.16b }, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaba	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabal	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabal2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabd	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uadalp	v0.1d, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uadalp	v0.2d, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uadalp	v0.2s, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uadalp	v0.4h, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uadalp	v0.4s, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uadalp	v0.8h, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlp	v0.1d, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlp	v0.2d, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlp	v0.2s, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlp	v0.4h, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlp	v0.4s, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlp	v0.8h, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddw	v0.2d, v0.2d, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddw	v0.4s, v0.4s, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddw	v0.8h, v0.8h, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddw2	v0.2d, v0.2d, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddw2	v0.4s, v0.4s, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddw2	v0.8h, v0.8h, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d21, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	d21, d14, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	s22, s13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	s22, s13, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhadd	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhadd	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhsub	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umax	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umax	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umax	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxp	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxp	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxp	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umin	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umin	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umin	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminp	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminp	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminp	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlal	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlal	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlal	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlal2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlal2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlal2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlsl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlsl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlsl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlsl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlsl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umlsl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umull	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umull	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umull	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umull2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umull2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umull2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	h0, h1, h5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	b11, b20, b30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	s23, s20, s16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrn	b10, h12, #5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrn	h12, s10, #14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrn	s10, d10, #25
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	b11, b20, b30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	b18, b15, #6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	d15, d12, #19
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	h11, h18, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	s14, s19, #18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	s23, s20, s16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrn	b12, h10, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrn	h10, s14, #5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrn	s10, d12, #13
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrn	v0.2s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrn	v0.4h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrn	v0.8b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrn2	v0.16b, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrn2	v0.4s, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrn2	v0.8h, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	d16, d16, d16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtn	b18, h18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtn	h20, s17
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtn	s19, d14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtn	v0.2s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtn	v0.4h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtn	v0.8b, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtn2	v0.16b, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtn2	v0.4s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtn2	v0.8h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urecpe	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urecpe	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urhadd	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urhadd	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urhadd	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshl	d8, d7, d4
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshl	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshl	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshl	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	d20, d23, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.00   ursqrte	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.00   ursqrte	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	d18, d10, #13
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushl	d0, d0, d0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushl	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushl	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushl	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushll	v0.4s, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushll2	v0.8h, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushr	d10, d17, #18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushr	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushr	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushr	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushr	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushr	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushr	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushr	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	b19, b14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	d18, d22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	h20, h15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	s21, s12
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	d20, d13, #61
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	v0.16b, v0.16b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	v0.2d, v0.2d, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	v0.2s, v0.2s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	v0.4h, v0.4h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	v0.4s, v0.4s, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	v0.8b, v0.8b, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	v0.8h, v0.8h, #3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubl	v0.2d, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubl	v0.4s, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubl	v0.8h, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubl2	v0.2d, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubl2	v0.4s, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubl2	v0.8h, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubw	v0.2d, v0.2d, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubw	v0.4s, v0.4s, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubw	v0.8h, v0.8h, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubw2	v0.2d, v0.2d, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubw2	v0.4s, v0.4s, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubw2	v0.8h, v0.8h, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xtn	v0.2s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xtn	v0.4h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xtn	v0.8b, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xtn2	v0.16b, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xtn2	v0.4s, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xtn2	v0.8h, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	v0.8h, v0.8h, v0.8h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	v0.16b, v0.16b, v0.16b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	v0.2s, v0.2s, v0.2s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	v0.8h, v0.8h, v0.8h
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A320-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A320-sve-instructions.s
new file mode 100644
index 000000000000..ad8d57bdb32d
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A320-sve-instructions.s
@@ -0,0 +1,10258 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a320 -mattr=+sve2-aes,+sve2-sha3,+sve2-sm4 -instruction-tables < %s | FileCheck %s
+
+abs	z0.b, p0/m, z0.b
+abs	z0.d, p0/m, z0.d
+abs	z0.h, p0/m, z0.h
+abs	z0.s, p0/m, z0.s
+abs	z31.b, p7/m, z31.b
+abs	z31.d, p7/m, z31.d
+abs	z31.h, p7/m, z31.h
+abs	z31.s, p7/m, z31.s
+adclb	z0.d, z1.d, z31.d
+adclb	z0.s, z1.s, z31.s
+adclt	z0.d, z1.d, z31.d
+adclt	z0.s, z1.s, z31.s
+add	z0.b, p0/m, z0.b, z0.b
+add	z0.b, z0.b, #0
+add	z0.b, z0.b, z0.b
+add	z0.d, p0/m, z0.d, z0.d
+add	z0.d, z0.d, #0
+add	z0.d, z0.d, #0, lsl #8
+add	z0.d, z0.d, z0.d
+add	z0.h, p0/m, z0.h, z0.h
+add	z0.h, z0.h, #0
+add	z0.h, z0.h, #0, lsl #8
+add	z0.h, z0.h, z0.h
+add	z0.s, p0/m, z0.s, z0.s
+add	z0.s, z0.s, #0
+add	z0.s, z0.s, #0, lsl #8
+add	z0.s, z0.s, z0.s
+add	z0.s, z1.s, z2.s
+add	z21.b, p5/m, z21.b, z10.b
+add	z21.b, z10.b, z21.b
+add	z21.d, p5/m, z21.d, z10.d
+add	z21.d, z10.d, z21.d
+add	z21.h, p5/m, z21.h, z10.h
+add	z21.h, z10.h, z21.h
+add	z21.s, p5/m, z21.s, z10.s
+add	z21.s, z10.s, z21.s
+add	z23.b, p3/m, z23.b, z13.b
+add	z23.b, z13.b, z8.b
+add	z23.d, p3/m, z23.d, z13.d
+add	z23.d, z13.d, z8.d
+add	z23.h, p3/m, z23.h, z13.h
+add	z23.h, z13.h, z8.h
+add	z23.s, p3/m, z23.s, z13.s
+add	z23.s, z13.s, z8.s
+add	z31.b, p7/m, z31.b, z31.b
+add	z31.b, z31.b, #255
+add	z31.b, z31.b, z31.b
+add	z31.d, p7/m, z31.d, z31.d
+add	z31.d, z31.d, #65280
+add	z31.d, z31.d, z31.d
+add	z31.h, p7/m, z31.h, z31.h
+add	z31.h, z31.h, #65280
+add	z31.h, z31.h, z31.h
+add	z31.s, p7/m, z31.s, z31.s
+add	z31.s, z31.s, #65280
+add	z31.s, z31.s, z31.s
+addhnb	z0.b, z1.h, z31.h
+addhnb	z0.h, z1.s, z31.s
+addhnb	z0.s, z1.d, z31.d
+addhnt	z0.b, z1.h, z31.h
+addhnt	z0.h, z1.s, z31.s
+addhnt	z0.s, z1.d, z31.d
+addp	z0.b, p0/m, z0.b, z1.b
+addp	z0.h, p0/m, z0.h, z1.h
+addp	z29.s, p7/m, z29.s, z30.s
+addp	z31.d, p7/m, z31.d, z30.d
+addpl	sp, sp, #31
+addpl	x0, x0, #-32
+addpl	x21, x21, #0
+addpl	x23, x8, #-1
+addvl	sp, sp, #31
+addvl	x0, x0, #-32
+addvl	x21, x21, #0
+addvl	x23, x8, #-1
+adr	z0.d, [z0.d, z0.d, lsl #1]
+adr	z0.d, [z0.d, z0.d, lsl #2]
+adr	z0.d, [z0.d, z0.d, lsl #3]
+adr	z0.d, [z0.d, z0.d, sxtw #1]
+adr	z0.d, [z0.d, z0.d, sxtw #2]
+adr	z0.d, [z0.d, z0.d, sxtw #3]
+adr	z0.d, [z0.d, z0.d, sxtw]
+adr	z0.d, [z0.d, z0.d, uxtw #1]
+adr	z0.d, [z0.d, z0.d, uxtw #2]
+adr	z0.d, [z0.d, z0.d, uxtw #3]
+adr	z0.d, [z0.d, z0.d, uxtw]
+adr	z0.d, [z0.d, z0.d]
+adr	z0.s, [z0.s, z0.s, lsl #1]
+adr	z0.s, [z0.s, z0.s, lsl #2]
+adr	z0.s, [z0.s, z0.s, lsl #3]
+adr	z0.s, [z0.s, z0.s]
+aesd	z0.b, z0.b, z31.b
+aese	z0.b, z0.b, z31.b
+aesimc	z0.b, z0.b
+aesimc	z31.b, z31.b
+aesmc	z0.b, z0.b
+aesmc	z31.b, z31.b
+and	p0.b, p0/z, p0.b, p1.b
+and	z0.d, z0.d, #0x6
+and	z0.d, z0.d, #0xfffffffffffffff9
+and	z0.d, z0.d, z0.d
+and	z0.s, z0.s, #0x6
+and	z0.s, z0.s, #0xfffffff9
+and	z23.d, z13.d, z8.d
+and	z23.h, z23.h, #0x6
+and	z23.h, z23.h, #0xfff9
+and	z31.b, p7/m, z31.b, z31.b
+and	z31.d, p7/m, z31.d, z31.d
+and	z31.h, p7/m, z31.h, z31.h
+and	z31.s, p7/m, z31.s, z31.s
+and	z5.b, z5.b, #0x6
+and	z5.b, z5.b, #0xf9
+ands	p0.b, p0/z, p0.b, p1.b
+andv	b0, p7, z31.b
+andv	d0, p7, z31.d
+andv	h0, p7, z31.h
+andv	s0, p7, z31.s
+asr	z0.b, p0/m, z0.b, #1
+asr	z0.b, p0/m, z0.b, z0.b
+asr	z0.b, p0/m, z0.b, z1.d
+asr	z0.b, z0.b, #1
+asr	z0.b, z1.b, z2.d
+asr	z0.d, p0/m, z0.d, #1
+asr	z0.d, p0/m, z0.d, z0.d
+asr	z0.d, z0.d, #1
+asr	z0.h, p0/m, z0.h, #1
+asr	z0.h, p0/m, z0.h, z0.h
+asr	z0.h, p0/m, z0.h, z1.d
+asr	z0.h, z0.h, #1
+asr	z0.h, z1.h, z2.d
+asr	z0.s, p0/m, z0.s, #1
+asr	z0.s, p0/m, z0.s, z0.s
+asr	z0.s, p0/m, z0.s, z1.d
+asr	z0.s, z0.s, #1
+asr	z0.s, z1.s, z2.d
+asr	z31.b, p0/m, z31.b, #8
+asr	z31.b, z31.b, #8
+asr	z31.d, p0/m, z31.d, #64
+asr	z31.d, z31.d, #64
+asr	z31.h, p0/m, z31.h, #16
+asr	z31.h, z31.h, #16
+asr	z31.s, p0/m, z31.s, #32
+asr	z31.s, z31.s, #32
+asrd	z0.b, p0/m, z0.b, #1
+asrd	z0.d, p0/m, z0.d, #1
+asrd	z0.h, p0/m, z0.h, #1
+asrd	z0.s, p0/m, z0.s, #1
+asrd	z31.b, p0/m, z31.b, #8
+asrd	z31.d, p0/m, z31.d, #64
+asrd	z31.h, p0/m, z31.h, #16
+asrd	z31.s, p0/m, z31.s, #32
+asrr	z0.b, p0/m, z0.b, z0.b
+asrr	z0.d, p0/m, z0.d, z0.d
+asrr	z0.h, p0/m, z0.h, z0.h
+asrr	z0.s, p0/m, z0.s, z0.s
+bcax	z29.d, z29.d, z30.d, z31.d
+bdep	z0.b, z1.b, z31.b
+bdep	z0.d, z1.d, z31.d
+bdep	z0.h, z1.h, z31.h
+bdep	z0.s, z1.s, z31.s
+bext	z0.b, z1.b, z31.b
+bext	z0.d, z1.d, z31.d
+bext	z0.h, z1.h, z31.h
+bext	z0.s, z1.s, z31.s
+bfcvt	z0.h, p0/m, z1.s
+bfcvtnt	z0.h, p0/m, z1.s
+bfdot	z0.s, z1.h, z2.h
+bfdot	z0.s, z1.h, z2.h[0]
+bfdot	z0.s, z1.h, z2.h[3]
+bfmlalb	z0.s, z1.h, z2.h
+bfmlalb	z0.s, z1.h, z2.h[0]
+bfmlalb	z0.s, z1.h, z2.h[7]
+bfmlalb	z10.s, z21.h, z14.h
+bfmlalb	z21.s, z14.h, z3.h[2]
+bfmlalt	z0.s, z1.h, z2.h
+bfmlalt	z0.s, z1.h, z2.h[0]
+bfmlalt	z0.s, z1.h, z2.h[7]
+bfmlalt	z0.s, z1.h, z7.h[7]
+bfmlalt	z14.s, z10.h, z21.h
+bfmmla	z0.s, z1.h, z2.h
+bgrp	z0.b, z1.b, z31.b
+bgrp	z0.d, z1.d, z31.d
+bgrp	z0.h, z1.h, z31.h
+bgrp	z0.s, z1.s, z31.s
+bic	p0.b, p0/z, p0.b, p0.b
+bic	p15.b, p15/z, p15.b, p15.b
+bic	z0.d, z0.d, z0.d
+bic	z23.d, z13.d, z8.d
+bic	z31.b, p7/m, z31.b, z31.b
+bic	z31.d, p7/m, z31.d, z31.d
+bic	z31.h, p7/m, z31.h, z31.h
+bic	z31.s, p7/m, z31.s, z31.s
+bics	p0.b, p0/z, p0.b, p0.b
+bics	p15.b, p15/z, p15.b, p15.b
+brka	p0.b, p15/m, p15.b
+brka	p0.b, p15/z, p15.b
+brkas	p0.b, p15/z, p15.b
+brkb	p0.b, p15/m, p15.b
+brkb	p0.b, p15/z, p15.b
+brkbs	p0.b, p15/z, p15.b
+brkn	p0.b, p15/z, p1.b, p0.b
+brkn	p15.b, p15/z, p15.b, p15.b
+brkns	p0.b, p15/z, p1.b, p0.b
+brkns	p15.b, p15/z, p15.b, p15.b
+brkpa	p0.b, p15/z, p1.b, p2.b
+brkpa	p15.b, p15/z, p15.b, p15.b
+brkpas	p0.b, p15/z, p1.b, p2.b
+brkpas	p15.b, p15/z, p15.b, p15.b
+brkpb	p0.b, p15/z, p1.b, p2.b
+brkpb	p15.b, p15/z, p15.b, p15.b
+brkpbs	p0.b, p15/z, p1.b, p2.b
+brkpbs	p15.b, p15/z, p15.b, p15.b
+bsl	z0.d, z0.d, z1.d, z2.d
+bsl1n	z0.d, z0.d, z1.d, z2.d
+bsl2n	z0.d, z0.d, z1.d, z2.d
+cadd	z0.b, z0.b, z0.b, #90
+cadd	z0.d, z0.d, z0.d, #90
+cadd	z0.h, z0.h, z0.h, #90
+cadd	z0.s, z0.s, z0.s, #90
+cadd	z31.b, z31.b, z31.b, #270
+cadd	z31.d, z31.d, z31.d, #270
+cadd	z31.h, z31.h, z31.h, #270
+cadd	z31.s, z31.s, z31.s, #270
+cdot	z0.d, z1.h, z15.h[1], #0
+cdot	z0.d, z1.h, z31.h, #0
+cdot	z0.d, z1.h, z31.h, #180
+cdot	z0.d, z1.h, z31.h, #270
+cdot	z0.d, z1.h, z31.h, #90
+cdot	z0.s, z1.b, z31.b, #0
+cdot	z0.s, z1.b, z7.b[3], #0
+cdot	z29.d, z30.h, z0.h[0], #180
+cdot	z31.d, z30.h, z7.h[1], #270
+cdot	z5.d, z6.h, z3.h[0], #90
+clasta	b0, p7, b0, z31.b
+clasta	d0, p7, d0, z31.d
+clasta	h0, p7, h0, z31.h
+clasta	s0, p7, s0, z31.s
+clasta	w0, p7, w0, z31.b
+clasta	w0, p7, w0, z31.h
+clasta	w0, p7, w0, z31.s
+clasta	x0, p7, x0, z31.d
+clasta	z0.b, p7, z0.b, z31.b
+clasta	z0.d, p7, z0.d, z31.d
+clasta	z0.h, p7, z0.h, z31.h
+clasta	z0.s, p7, z0.s, z31.s
+clastb	b0, p7, b0, z31.b
+clastb	d0, p7, d0, z31.d
+clastb	h0, p7, h0, z31.h
+clastb	s0, p7, s0, z31.s
+clastb	w0, p7, w0, z31.b
+clastb	w0, p7, w0, z31.h
+clastb	w0, p7, w0, z31.s
+clastb	x0, p7, x0, z31.d
+clastb	z0.b, p7, z0.b, z31.b
+clastb	z0.d, p7, z0.d, z31.d
+clastb	z0.h, p7, z0.h, z31.h
+clastb	z0.s, p7, z0.s, z31.s
+cls	z31.b, p7/m, z31.b
+cls	z31.d, p7/m, z31.d
+cls	z31.h, p7/m, z31.h
+cls	z31.s, p7/m, z31.s
+clz	z31.b, p7/m, z31.b
+clz	z31.d, p7/m, z31.d
+clz	z31.h, p7/m, z31.h
+clz	z31.s, p7/m, z31.s
+cmla	z0.b, z1.b, z2.b, #0
+cmla	z0.d, z1.d, z2.d, #0
+cmla	z0.h, z1.h, z2.h, #0
+cmla	z0.h, z1.h, z2.h[0], #0
+cmla	z0.s, z1.s, z2.s, #0
+cmla	z0.s, z1.s, z2.s[0], #0
+cmla	z15.b, z16.b, z17.b, #270
+cmla	z15.d, z16.d, z17.d, #270
+cmla	z15.h, z16.h, z17.h, #270
+cmla	z15.s, z16.s, z17.s, #270
+cmla	z29.b, z30.b, z31.b, #90
+cmla	z29.d, z30.d, z31.d, #90
+cmla	z29.h, z30.h, z31.h, #90
+cmla	z29.s, z30.s, z31.s, #90
+cmla	z31.b, z31.b, z31.b, #180
+cmla	z31.d, z31.d, z31.d, #180
+cmla	z31.h, z30.h, z7.h[0], #180
+cmla	z31.h, z31.h, z31.h, #180
+cmla	z31.s, z30.s, z7.s[0], #180
+cmla	z31.s, z31.s, z31.s, #180
+cmpeq	p0.b, p0/z, z0.b, #-16
+cmpeq	p0.b, p0/z, z0.b, #15
+cmpeq	p0.b, p0/z, z0.b, z0.b
+cmpeq	p0.b, p0/z, z0.b, z0.d
+cmpeq	p0.d, p0/z, z0.d, #-16
+cmpeq	p0.d, p0/z, z0.d, #15
+cmpeq	p0.d, p0/z, z0.d, z0.d
+cmpeq	p0.h, p0/z, z0.h, #-16
+cmpeq	p0.h, p0/z, z0.h, #15
+cmpeq	p0.h, p0/z, z0.h, z0.d
+cmpeq	p0.h, p0/z, z0.h, z0.h
+cmpeq	p0.s, p0/z, z0.s, #-16
+cmpeq	p0.s, p0/z, z0.s, #15
+cmpeq	p0.s, p0/z, z0.s, z0.d
+cmpeq	p0.s, p0/z, z0.s, z0.s
+cmpge	p0.b, p0/z, z0.b, #-16
+cmpge	p0.b, p0/z, z0.b, #15
+cmpge	p0.b, p0/z, z0.b, z0.b
+cmpge	p0.b, p0/z, z0.b, z0.d
+cmpge	p0.b, p0/z, z1.b, z0.b
+cmpge	p0.d, p0/z, z0.d, #-16
+cmpge	p0.d, p0/z, z0.d, #15
+cmpge	p0.d, p0/z, z0.d, z0.d
+cmpge	p0.d, p0/z, z1.d, z0.d
+cmpge	p0.h, p0/z, z0.h, #-16
+cmpge	p0.h, p0/z, z0.h, #15
+cmpge	p0.h, p0/z, z0.h, z0.d
+cmpge	p0.h, p0/z, z0.h, z0.h
+cmpge	p0.h, p0/z, z1.h, z0.h
+cmpge	p0.s, p0/z, z0.s, #-16
+cmpge	p0.s, p0/z, z0.s, #15
+cmpge	p0.s, p0/z, z0.s, z0.d
+cmpge	p0.s, p0/z, z0.s, z0.s
+cmpge	p0.s, p0/z, z1.s, z0.s
+cmpgt	p0.b, p0/z, z0.b, #-16
+cmpgt	p0.b, p0/z, z0.b, #15
+cmpgt	p0.b, p0/z, z0.b, z0.b
+cmpgt	p0.b, p0/z, z0.b, z0.d
+cmpgt	p0.b, p0/z, z1.b, z0.b
+cmpgt	p0.d, p0/z, z0.d, #-16
+cmpgt	p0.d, p0/z, z0.d, #15
+cmpgt	p0.d, p0/z, z0.d, z0.d
+cmpgt	p0.d, p0/z, z1.d, z0.d
+cmpgt	p0.h, p0/z, z0.h, #-16
+cmpgt	p0.h, p0/z, z0.h, #15
+cmpgt	p0.h, p0/z, z0.h, z0.d
+cmpgt	p0.h, p0/z, z0.h, z0.h
+cmpgt	p0.h, p0/z, z1.h, z0.h
+cmpgt	p0.s, p0/z, z0.s, #-16
+cmpgt	p0.s, p0/z, z0.s, #15
+cmpgt	p0.s, p0/z, z0.s, z0.d
+cmpgt	p0.s, p0/z, z0.s, z0.s
+cmpgt	p0.s, p0/z, z1.s, z0.s
+cmphi	p0.b, p0/z, z0.b, #0
+cmphi	p0.b, p0/z, z0.b, #127
+cmphi	p0.b, p0/z, z0.b, z0.b
+cmphi	p0.b, p0/z, z0.b, z0.d
+cmphi	p0.b, p0/z, z1.b, z0.b
+cmphi	p0.d, p0/z, z0.d, #0
+cmphi	p0.d, p0/z, z0.d, #127
+cmphi	p0.d, p0/z, z0.d, z0.d
+cmphi	p0.d, p0/z, z1.d, z0.d
+cmphi	p0.h, p0/z, z0.h, #0
+cmphi	p0.h, p0/z, z0.h, #127
+cmphi	p0.h, p0/z, z0.h, z0.d
+cmphi	p0.h, p0/z, z0.h, z0.h
+cmphi	p0.h, p0/z, z1.h, z0.h
+cmphi	p0.s, p0/z, z0.s, #0
+cmphi	p0.s, p0/z, z0.s, #127
+cmphi	p0.s, p0/z, z0.s, z0.d
+cmphi	p0.s, p0/z, z0.s, z0.s
+cmphi	p0.s, p0/z, z1.s, z0.s
+cmphs	p0.b, p0/z, z0.b, #0
+cmphs	p0.b, p0/z, z0.b, #127
+cmphs	p0.b, p0/z, z0.b, z0.b
+cmphs	p0.b, p0/z, z0.b, z0.d
+cmphs	p0.b, p0/z, z1.b, z0.b
+cmphs	p0.d, p0/z, z0.d, #0
+cmphs	p0.d, p0/z, z0.d, #127
+cmphs	p0.d, p0/z, z0.d, z0.d
+cmphs	p0.d, p0/z, z1.d, z0.d
+cmphs	p0.h, p0/z, z0.h, #0
+cmphs	p0.h, p0/z, z0.h, #127
+cmphs	p0.h, p0/z, z0.h, z0.d
+cmphs	p0.h, p0/z, z0.h, z0.h
+cmphs	p0.h, p0/z, z1.h, z0.h
+cmphs	p0.s, p0/z, z0.s, #0
+cmphs	p0.s, p0/z, z0.s, #127
+cmphs	p0.s, p0/z, z0.s, z0.d
+cmphs	p0.s, p0/z, z0.s, z0.s
+cmphs	p0.s, p0/z, z1.s, z0.s
+cmple	p0.b, p0/z, z0.b, #-16
+cmple	p0.b, p0/z, z0.b, #15
+cmple	p0.b, p0/z, z0.b, z0.d
+cmple	p0.d, p0/z, z0.d, #-16
+cmple	p0.d, p0/z, z0.d, #15
+cmple	p0.h, p0/z, z0.h, #-16
+cmple	p0.h, p0/z, z0.h, #15
+cmple	p0.h, p0/z, z0.h, z0.d
+cmple	p0.s, p0/z, z0.s, #-16
+cmple	p0.s, p0/z, z0.s, #15
+cmple	p0.s, p0/z, z0.s, z0.d
+cmplo	p0.b, p0/z, z0.b, #0
+cmplo	p0.b, p0/z, z0.b, #127
+cmplo	p0.b, p0/z, z0.b, z0.d
+cmplo	p0.d, p0/z, z0.d, #0
+cmplo	p0.d, p0/z, z0.d, #127
+cmplo	p0.h, p0/z, z0.h, #0
+cmplo	p0.h, p0/z, z0.h, #127
+cmplo	p0.h, p0/z, z0.h, z0.d
+cmplo	p0.s, p0/z, z0.s, #0
+cmplo	p0.s, p0/z, z0.s, #127
+cmplo	p0.s, p0/z, z0.s, z0.d
+cmpls	p0.b, p0/z, z0.b, #0
+cmpls	p0.b, p0/z, z0.b, #127
+cmpls	p0.b, p0/z, z0.b, z0.d
+cmpls	p0.d, p0/z, z0.d, #0
+cmpls	p0.d, p0/z, z0.d, #127
+cmpls	p0.h, p0/z, z0.h, #0
+cmpls	p0.h, p0/z, z0.h, #127
+cmpls	p0.h, p0/z, z0.h, z0.d
+cmpls	p0.s, p0/z, z0.s, #0
+cmpls	p0.s, p0/z, z0.s, #127
+cmpls	p0.s, p0/z, z0.s, z0.d
+cmplt	p0.b, p0/z, z0.b, #-16
+cmplt	p0.b, p0/z, z0.b, #15
+cmplt	p0.b, p0/z, z0.b, z0.d
+cmplt	p0.d, p0/z, z0.d, #-16
+cmplt	p0.d, p0/z, z0.d, #15
+cmplt	p0.h, p0/z, z0.h, #-16
+cmplt	p0.h, p0/z, z0.h, #15
+cmplt	p0.h, p0/z, z0.h, z0.d
+cmplt	p0.s, p0/z, z0.s, #-16
+cmplt	p0.s, p0/z, z0.s, #15
+cmplt	p0.s, p0/z, z0.s, z0.d
+cmpne	p0.b, p0/z, z0.b, #-16
+cmpne	p0.b, p0/z, z0.b, #15
+cmpne	p0.b, p0/z, z0.b, z0.b
+cmpne	p0.b, p0/z, z0.b, z0.d
+cmpne	p0.d, p0/z, z0.d, #-16
+cmpne	p0.d, p0/z, z0.d, #15
+cmpne	p0.d, p0/z, z0.d, z0.d
+cmpne	p0.h, p0/z, z0.h, #-16
+cmpne	p0.h, p0/z, z0.h, #15
+cmpne	p0.h, p0/z, z0.h, z0.d
+cmpne	p0.h, p0/z, z0.h, z0.h
+cmpne	p0.s, p0/z, z0.s, #-16
+cmpne	p0.s, p0/z, z0.s, #15
+cmpne	p0.s, p0/z, z0.s, z0.d
+cmpne	p0.s, p0/z, z0.s, z0.s
+cnot	z31.b, p7/m, z31.b
+cnot	z31.d, p7/m, z31.d
+cnot	z31.h, p7/m, z31.h
+cnot	z31.s, p7/m, z31.s
+cnt	z31.b, p7/m, z31.b
+cnt	z31.d, p7/m, z31.d
+cnt	z31.h, p7/m, z31.h
+cnt	z31.s, p7/m, z31.s
+cntb	x0
+cntb	x0, #28
+cntb	x0, all, mul #16
+cntb	x0, pow2
+cntd	x0
+cntd	x0, #28
+cntd	x0, all, mul #16
+cntd	x0, pow2
+cnth	x0
+cnth	x0, #28
+cnth	x0, all, mul #16
+cnth	x0, pow2
+cntp	x0, p15, p0.b
+cntp	x0, p15, p0.d
+cntp	x0, p15, p0.h
+cntp	x0, p15, p0.s
+cntw	x0
+cntw	x0, #28
+cntw	x0, all, mul #16
+cntw	x0, pow2
+compact	z31.d, p7, z31.d
+compact	z31.s, p7, z31.s
+ctermeq	w30, wzr
+ctermeq	wzr, w30
+ctermeq	x30, xzr
+ctermeq	xzr, x30
+ctermne	w30, wzr
+ctermne	wzr, w30
+ctermne	x30, xzr
+ctermne	xzr, x30
+decb	x0
+decb	x0, #14
+decb	x0, all, mul #16
+decb	x0, pow2
+decb	x0, vl1
+decd	x0
+decd	x0, #14
+decd	x0, all, mul #16
+decd	x0, pow2
+decd	x0, vl1
+dech	x0
+dech	x0, #14
+dech	x0, all, mul #16
+dech	x0, pow2
+dech	x0, vl1
+decp	x0, p0.b
+decp	x0, p0.d
+decp	x0, p0.h
+decp	x0, p0.s
+decp	xzr, p15.b
+decp	xzr, p15.d
+decp	xzr, p15.h
+decp	xzr, p15.s
+decp	z31.d, p15.d
+decp	z31.h, p15.h
+decp	z31.s, p15.s
+decw	x0
+decw	x0, #14
+decw	x0, all, mul #16
+decw	x0, pow2
+decw	x0, vl1
+dupm	z0.d, #0xfffffffffffffff9
+dupm	z0.s, #0xfffffff9
+dupm	z23.h, #0xfff9
+dupm	z5.b, #0xf9
+eor	p0.b, p0/z, p0.b, p1.b
+eor	z0.d, z0.d, #0x6
+eor	z0.d, z0.d, #0xfffffffffffffff9
+eor	z0.d, z0.d, z0.d
+eor	z0.s, z0.s, #0x6
+eor	z0.s, z0.s, #0xfffffff9
+eor	z23.d, z13.d, z8.d
+eor	z23.h, z23.h, #0x6
+eor	z23.h, z23.h, #0xfff9
+eor	z31.b, p7/m, z31.b, z31.b
+eor	z31.d, p7/m, z31.d, z31.d
+eor	z31.h, p7/m, z31.h, z31.h
+eor	z31.s, p7/m, z31.s, z31.s
+eor	z5.b, z5.b, #0x6
+eor	z5.b, z5.b, #0xf9
+eor3	z29.d, z29.d, z30.d, z31.d
+eorbt	z0.b, z1.b, z31.b
+eorbt	z0.d, z1.d, z31.d
+eorbt	z0.h, z1.h, z31.h
+eorbt	z0.s, z1.s, z31.s
+eors	p0.b, p0/z, p0.b, p1.b
+eortb	z0.b, z1.b, z31.b
+eortb	z0.d, z1.d, z31.d
+eortb	z0.h, z1.h, z31.h
+eortb	z0.s, z1.s, z31.s
+eorv	b0, p7, z31.b
+eorv	d0, p7, z31.d
+eorv	h0, p7, z31.h
+eorv	s0, p7, z31.s
+ext	z0.b, { z1.b, z2.b }, #0
+ext	z31.b, z31.b, z0.b, #0
+ext	z31.b, z31.b, z0.b, #255
+ext	z31.b, { z30.b, z31.b }, #255
+fabd	z0.d, p7/m, z0.d, z31.d
+fabd	z0.h, p7/m, z0.h, z31.h
+fabd	z0.s, p7/m, z0.s, z31.s
+fabs	z31.d, p7/m, z31.d
+fabs	z31.h, p7/m, z31.h
+fabs	z31.s, p7/m, z31.s
+facge	p0.d, p0/z, z0.d, z1.d
+facge	p0.d, p0/z, z1.d, z0.d
+facge	p0.h, p0/z, z0.h, z1.h
+facge	p0.h, p0/z, z1.h, z0.h
+facge	p0.s, p0/z, z0.s, z1.s
+facge	p0.s, p0/z, z1.s, z0.s
+facgt	p0.d, p0/z, z0.d, z1.d
+facgt	p0.d, p0/z, z1.d, z0.d
+facgt	p0.h, p0/z, z0.h, z1.h
+facgt	p0.h, p0/z, z1.h, z0.h
+facgt	p0.s, p0/z, z0.s, z1.s
+facgt	p0.s, p0/z, z1.s, z0.s
+fadd	z0.d, p0/m, z0.d, #0.5
+fadd	z0.d, p7/m, z0.d, z31.d
+fadd	z0.d, z1.d, z31.d
+fadd	z0.h, p0/m, z0.h, #0.5
+fadd	z0.h, p7/m, z0.h, z31.h
+fadd	z0.h, z1.h, z31.h
+fadd	z0.s, p0/m, z0.s, #0.5
+fadd	z0.s, p7/m, z0.s, z31.s
+fadd	z0.s, z1.s, z31.s
+fadd	z31.d, p7/m, z31.d, #1.0
+fadd	z31.h, p7/m, z31.h, #1.0
+fadd	z31.s, p7/m, z31.s, #1.0
+fadda	d0, p7, d0, z31.d
+fadda	h0, p7, h0, z31.h
+fadda	s0, p7, s0, z31.s
+faddp	z0.h, p0/m, z0.h, z1.h
+faddp	z29.s, p3/m, z29.s, z30.s
+faddp	z31.d, p7/m, z31.d, z30.d
+faddv	d0, p7, z31.d
+faddv	h0, p7, z31.h
+faddv	s0, p7, z31.s
+fcadd	z0.d, p0/m, z0.d, z0.d, #90
+fcadd	z0.h, p0/m, z0.h, z0.h, #90
+fcadd	z0.s, p0/m, z0.s, z0.s, #90
+fcadd	z31.d, p7/m, z31.d, z31.d, #270
+fcadd	z31.h, p7/m, z31.h, z31.h, #270
+fcadd	z31.s, p7/m, z31.s, z31.s, #270
+fcmeq	p0.d, p0/z, z0.d, #0.0
+fcmeq	p0.d, p0/z, z0.d, z1.d
+fcmeq	p0.h, p0/z, z0.h, #0.0
+fcmeq	p0.h, p0/z, z0.h, z1.h
+fcmeq	p0.s, p0/z, z0.s, #0.0
+fcmeq	p0.s, p0/z, z0.s, z1.s
+fcmge	p0.d, p0/z, z0.d, #0.0
+fcmge	p0.d, p0/z, z0.d, z1.d
+fcmge	p0.d, p0/z, z1.d, z0.d
+fcmge	p0.h, p0/z, z0.h, #0.0
+fcmge	p0.h, p0/z, z0.h, z1.h
+fcmge	p0.h, p0/z, z1.h, z0.h
+fcmge	p0.s, p0/z, z0.s, #0.0
+fcmge	p0.s, p0/z, z0.s, z1.s
+fcmge	p0.s, p0/z, z1.s, z0.s
+fcmgt	p0.d, p0/z, z0.d, #0.0
+fcmgt	p0.d, p0/z, z0.d, z1.d
+fcmgt	p0.d, p0/z, z1.d, z0.d
+fcmgt	p0.h, p0/z, z0.h, #0.0
+fcmgt	p0.h, p0/z, z0.h, z1.h
+fcmgt	p0.h, p0/z, z1.h, z0.h
+fcmgt	p0.s, p0/z, z0.s, #0.0
+fcmgt	p0.s, p0/z, z0.s, z1.s
+fcmgt	p0.s, p0/z, z1.s, z0.s
+fcmla	z0.d, p0/m, z0.d, z0.d, #0
+fcmla	z0.d, p0/m, z1.d, z2.d, #90
+fcmla	z0.h, p0/m, z0.h, z0.h, #0
+fcmla	z0.h, p0/m, z1.h, z2.h, #90
+fcmla	z0.h, z0.h, z0.h[0], #0
+fcmla	z0.s, p0/m, z0.s, z0.s, #0
+fcmla	z0.s, p0/m, z1.s, z2.s, #90
+fcmla	z21.s, z10.s, z5.s[1], #90
+fcmla	z23.s, z13.s, z8.s[0], #270
+fcmla	z29.d, p7/m, z30.d, z31.d, #180
+fcmla	z29.h, p7/m, z30.h, z31.h, #180
+fcmla	z29.s, p7/m, z30.s, z31.s, #180
+fcmla	z31.d, p7/m, z31.d, z31.d, #270
+fcmla	z31.h, p7/m, z31.h, z31.h, #270
+fcmla	z31.h, z31.h, z7.h[3], #270
+fcmla	z31.s, p7/m, z31.s, z31.s, #270
+fcmle	p0.d, p0/z, z0.d, #0.0
+fcmle	p0.h, p0/z, z0.h, #0.0
+fcmle	p0.s, p0/z, z0.s, #0.0
+fcmlt	p0.d, p0/z, z0.d, #0.0
+fcmlt	p0.h, p0/z, z0.h, #0.0
+fcmlt	p0.s, p0/z, z0.s, #0.0
+fcmne	p0.d, p0/z, z0.d, #0.0
+fcmne	p0.d, p0/z, z0.d, z1.d
+fcmne	p0.h, p0/z, z0.h, #0.0
+fcmne	p0.h, p0/z, z0.h, z1.h
+fcmne	p0.s, p0/z, z0.s, #0.0
+fcmne	p0.s, p0/z, z0.s, z1.s
+fcmuo	p0.d, p0/z, z0.d, z1.d
+fcmuo	p0.h, p0/z, z0.h, z1.h
+fcmuo	p0.s, p0/z, z0.s, z1.s
+fcvt	z0.d, p0/m, z0.h
+fcvt	z0.d, p0/m, z0.s
+fcvt	z0.h, p0/m, z0.d
+fcvt	z0.h, p0/m, z0.s
+fcvt	z0.s, p0/m, z0.d
+fcvt	z0.s, p0/m, z0.h
+fcvtlt	z0.s, p0/m, z1.h
+fcvtlt	z30.d, p7/m, z31.s
+fcvtnt	z0.h, p0/m, z1.s
+fcvtnt	z30.s, p7/m, z31.d
+fcvtx	z0.s, p0/m, z0.d
+fcvtx	z30.s, p7/m, z31.d
+fcvtxnt	z0.s, p0/m, z1.d
+fcvtxnt	z30.s, p7/m, z31.d
+fcvtzs	z0.d, p0/m, z0.d
+fcvtzs	z0.d, p0/m, z0.h
+fcvtzs	z0.d, p0/m, z0.s
+fcvtzs	z0.h, p0/m, z0.h
+fcvtzs	z0.s, p0/m, z0.d
+fcvtzs	z0.s, p0/m, z0.h
+fcvtzs	z0.s, p0/m, z0.s
+fcvtzu	z0.d, p0/m, z0.d
+fcvtzu	z0.d, p0/m, z0.h
+fcvtzu	z0.d, p0/m, z0.s
+fcvtzu	z0.h, p0/m, z0.h
+fcvtzu	z0.s, p0/m, z0.d
+fcvtzu	z0.s, p0/m, z0.h
+fcvtzu	z0.s, p0/m, z0.s
+fdiv	z0.d, p7/m, z0.d, z31.d
+fdiv	z0.h, p7/m, z0.h, z31.h
+fdiv	z0.s, p7/m, z0.s, z31.s
+fdivr	z0.d, p7/m, z0.d, z31.d
+fdivr	z0.h, p7/m, z0.h, z31.h
+fdivr	z0.s, p7/m, z0.s, z31.s
+fexpa	z0.d, z31.d
+fexpa	z0.h, z31.h
+fexpa	z0.s, z31.s
+flogb	z31.d, p7/m, z31.d
+flogb	z31.h, p7/m, z31.h
+flogb	z31.s, p7/m, z31.s
+fmad	z0.d, p7/m, z1.d, z31.d
+fmad	z0.h, p7/m, z1.h, z31.h
+fmad	z0.s, p7/m, z1.s, z31.s
+fmax	z0.d, p0/m, z0.d, #0.0
+fmax	z0.d, p7/m, z0.d, z31.d
+fmax	z0.h, p0/m, z0.h, #0.0
+fmax	z0.h, p7/m, z0.h, z31.h
+fmax	z0.s, p0/m, z0.s, #0.0
+fmax	z0.s, p7/m, z0.s, z31.s
+fmax	z31.d, p7/m, z31.d, #1.0
+fmax	z31.h, p7/m, z31.h, #1.0
+fmax	z31.s, p7/m, z31.s, #1.0
+fmaxnm	z0.d, p0/m, z0.d, #0.0
+fmaxnm	z0.d, p7/m, z0.d, z31.d
+fmaxnm	z0.h, p0/m, z0.h, #0.0
+fmaxnm	z0.h, p7/m, z0.h, z31.h
+fmaxnm	z0.s, p0/m, z0.s, #0.0
+fmaxnm	z0.s, p7/m, z0.s, z31.s
+fmaxnm	z31.d, p7/m, z31.d, #1.0
+fmaxnm	z31.h, p7/m, z31.h, #1.0
+fmaxnm	z31.s, p7/m, z31.s, #1.0
+fmaxnmp	z0.h, p0/m, z0.h, z1.h
+fmaxnmp	z29.s, p3/m, z29.s, z30.s
+fmaxnmp	z31.d, p7/m, z31.d, z30.d
+fmaxnmv	d0, p7, z31.d
+fmaxnmv	h0, p7, z31.h
+fmaxnmv	s0, p7, z31.s
+fmaxp	z0.h, p0/m, z0.h, z1.h
+fmaxp	z29.s, p3/m, z29.s, z30.s
+fmaxp	z31.d, p7/m, z31.d, z30.d
+fmaxv	d0, p7, z31.d
+fmaxv	h0, p7, z31.h
+fmaxv	s0, p7, z31.s
+fmin	z0.d, p0/m, z0.d, #0.0
+fmin	z0.d, p7/m, z0.d, z31.d
+fmin	z0.h, p0/m, z0.h, #0.0
+fmin	z0.h, p7/m, z0.h, z31.h
+fmin	z0.s, p0/m, z0.s, #0.0
+fmin	z0.s, p7/m, z0.s, z31.s
+fmin	z31.d, p7/m, z31.d, #1.0
+fmin	z31.h, p7/m, z31.h, #1.0
+fmin	z31.s, p7/m, z31.s, #1.0
+fminnm	z0.d, p0/m, z0.d, #0.0
+fminnm	z0.d, p7/m, z0.d, z31.d
+fminnm	z0.h, p0/m, z0.h, #0.0
+fminnm	z0.h, p7/m, z0.h, z31.h
+fminnm	z0.s, p0/m, z0.s, #0.0
+fminnm	z0.s, p7/m, z0.s, z31.s
+fminnm	z31.d, p7/m, z31.d, #1.0
+fminnm	z31.h, p7/m, z31.h, #1.0
+fminnm	z31.s, p7/m, z31.s, #1.0
+fminnmp	z0.h, p0/m, z0.h, z1.h
+fminnmp	z29.s, p3/m, z29.s, z30.s
+fminnmp	z31.d, p7/m, z31.d, z30.d
+fminnmv	d0, p7, z31.d
+fminnmv	h0, p7, z31.h
+fminnmv	s0, p7, z31.s
+fminp	z0.h, p0/m, z0.h, z1.h
+fminp	z29.s, p3/m, z29.s, z30.s
+fminp	z31.d, p7/m, z31.d, z30.d
+fminv	d0, p7, z31.d
+fminv	h0, p7, z31.h
+fminv	s0, p7, z31.s
+fmla	z0.d, p7/m, z1.d, z31.d
+fmla	z0.d, z1.d, z7.d[1]
+fmla	z0.h, p7/m, z1.h, z31.h
+fmla	z0.h, z1.h, z7.h[7]
+fmla	z0.s, p7/m, z1.s, z31.s
+fmla	z0.s, z1.s, z7.s[3]
+fmlalb	z0.s, z1.h, z7.h[0]
+fmlalb	z29.s, z30.h, z31.h
+fmlalb	z30.s, z31.h, z7.h[7]
+fmlalt	z0.s, z1.h, z7.h[0]
+fmlalt	z29.s, z30.h, z31.h
+fmlalt	z30.s, z31.h, z7.h[7]
+fmls	z0.d, p7/m, z1.d, z31.d
+fmls	z0.d, z1.d, z7.d[1]
+fmls	z0.h, p7/m, z1.h, z31.h
+fmls	z0.h, z1.h, z7.h[7]
+fmls	z0.s, p7/m, z1.s, z31.s
+fmls	z0.s, z1.s, z7.s[3]
+fmlslb	z0.s, z1.h, z7.h[0]
+fmlslb	z29.s, z30.h, z31.h
+fmlslb	z30.s, z31.h, z7.h[7]
+fmlslt	z0.s, z1.h, z7.h[0]
+fmlslt	z29.s, z30.h, z31.h
+fmlslt	z30.s, z31.h, z7.h[7]
+fmov	z0.d, #-10.00000000
+fmov	z0.d, #0.12500000
+fmov	z0.d, p0/m, #-10.00000000
+fmov	z0.d, p0/m, #0.12500000
+fmov	z0.h, #-0.12500000
+fmov	z0.h, p0/m, #-0.12500000
+fmov	z0.s, #-0.12500000
+fmov	z0.s, p0/m, #-0.12500000
+fmsb	z0.d, p7/m, z1.d, z31.d
+fmsb	z0.h, p7/m, z1.h, z31.h
+fmsb	z0.s, p7/m, z1.s, z31.s
+fmul	z0.d, p0/m, z0.d, #0.5
+fmul	z0.d, p7/m, z0.d, z31.d
+fmul	z0.d, z0.d, z0.d[0]
+fmul	z0.d, z1.d, z31.d
+fmul	z0.h, p0/m, z0.h, #0.5
+fmul	z0.h, p7/m, z0.h, z31.h
+fmul	z0.h, z0.h, z0.h[0]
+fmul	z0.h, z1.h, z31.h
+fmul	z0.s, p0/m, z0.s, #0.5
+fmul	z0.s, p7/m, z0.s, z31.s
+fmul	z0.s, z0.s, z0.s[0]
+fmul	z0.s, z1.s, z31.s
+fmul	z31.d, p7/m, z31.d, #2.0
+fmul	z31.d, z31.d, z15.d[1]
+fmul	z31.h, p7/m, z31.h, #2.0
+fmul	z31.h, z31.h, z7.h[7]
+fmul	z31.s, p7/m, z31.s, #2.0
+fmul	z31.s, z31.s, z7.s[3]
+fmulx	z0.d, p7/m, z0.d, z31.d
+fmulx	z0.h, p7/m, z0.h, z31.h
+fmulx	z0.s, p7/m, z0.s, z31.s
+fneg	z31.d, p7/m, z31.d
+fneg	z31.h, p7/m, z31.h
+fneg	z31.s, p7/m, z31.s
+fnmad	z0.d, p7/m, z1.d, z31.d
+fnmad	z0.h, p7/m, z1.h, z31.h
+fnmad	z0.s, p7/m, z1.s, z31.s
+fnmla	z0.d, p7/m, z1.d, z31.d
+fnmla	z0.h, p7/m, z1.h, z31.h
+fnmla	z0.s, p7/m, z1.s, z31.s
+fnmls	z0.d, p7/m, z1.d, z31.d
+fnmls	z0.h, p7/m, z1.h, z31.h
+fnmls	z0.s, p7/m, z1.s, z31.s
+fnmsb	z0.d, p7/m, z1.d, z31.d
+fnmsb	z0.h, p7/m, z1.h, z31.h
+fnmsb	z0.s, p7/m, z1.s, z31.s
+frecpe	z0.d, z31.d
+frecpe	z0.h, z31.h
+frecpe	z0.s, z31.s
+frecps	z0.d, z1.d, z31.d
+frecps	z0.h, z1.h, z31.h
+frecps	z0.s, z1.s, z31.s
+frecpx	z31.d, p7/m, z31.d
+frecpx	z31.h, p7/m, z31.h
+frecpx	z31.s, p7/m, z31.s
+frinta	z31.d, p7/m, z31.d
+frinta	z31.h, p7/m, z31.h
+frinta	z31.s, p7/m, z31.s
+frinti	z31.d, p7/m, z31.d
+frinti	z31.h, p7/m, z31.h
+frinti	z31.s, p7/m, z31.s
+frintm	z31.d, p7/m, z31.d
+frintm	z31.h, p7/m, z31.h
+frintm	z31.s, p7/m, z31.s
+frintn	z31.d, p7/m, z31.d
+frintn	z31.h, p7/m, z31.h
+frintn	z31.s, p7/m, z31.s
+frintp	z31.d, p7/m, z31.d
+frintp	z31.h, p7/m, z31.h
+frintp	z31.s, p7/m, z31.s
+frintx	z31.d, p7/m, z31.d
+frintx	z31.h, p7/m, z31.h
+frintx	z31.s, p7/m, z31.s
+frintz	z31.d, p7/m, z31.d
+frintz	z31.h, p7/m, z31.h
+frintz	z31.s, p7/m, z31.s
+frsqrte	z0.d, z31.d
+frsqrte	z0.h, z31.h
+frsqrte	z0.s, z31.s
+frsqrts	z0.d, z1.d, z31.d
+frsqrts	z0.h, z1.h, z31.h
+frsqrts	z0.s, z1.s, z31.s
+fscale	z0.d, p7/m, z0.d, z31.d
+fscale	z0.h, p7/m, z0.h, z31.h
+fscale	z0.s, p7/m, z0.s, z31.s
+fsqrt	z31.d, p7/m, z31.d
+fsqrt	z31.h, p7/m, z31.h
+fsqrt	z31.s, p7/m, z31.s
+fsub	z0.d, p0/m, z0.d, #0.5
+fsub	z0.d, p7/m, z0.d, z31.d
+fsub	z0.d, z1.d, z31.d
+fsub	z0.h, p0/m, z0.h, #0.5
+fsub	z0.h, p7/m, z0.h, z31.h
+fsub	z0.h, z1.h, z31.h
+fsub	z0.s, p0/m, z0.s, #0.5
+fsub	z0.s, p7/m, z0.s, z31.s
+fsub	z0.s, z1.s, z31.s
+fsub	z31.d, p7/m, z31.d, #1.0
+fsub	z31.h, p7/m, z31.h, #1.0
+fsub	z31.s, p7/m, z31.s, #1.0
+fsubr	z0.d, p0/m, z0.d, #0.5
+fsubr	z0.d, p7/m, z0.d, z31.d
+fsubr	z0.h, p0/m, z0.h, #0.5
+fsubr	z0.h, p7/m, z0.h, z31.h
+fsubr	z0.s, p0/m, z0.s, #0.5
+fsubr	z0.s, p7/m, z0.s, z31.s
+fsubr	z31.d, p7/m, z31.d, #1.0
+fsubr	z31.h, p7/m, z31.h, #1.0
+fsubr	z31.s, p7/m, z31.s, #1.0
+ftmad	z0.d, z0.d, z31.d, #7
+ftmad	z0.h, z0.h, z31.h, #7
+ftmad	z0.s, z0.s, z31.s, #7
+ftsmul	z0.d, z1.d, z31.d
+ftsmul	z0.h, z1.h, z31.h
+ftsmul	z0.s, z1.s, z31.s
+ftssel	z0.d, z1.d, z31.d
+ftssel	z0.h, z1.h, z31.h
+ftssel	z0.s, z1.s, z31.s
+histcnt	z0.s, p0/z, z1.s, z2.s
+histcnt	z29.d, p7/z, z30.d, z31.d
+histseg	z0.b, z1.b, z31.b
+incb	x0
+incb	x0, #14
+incb	x0, all, mul #16
+incb	x0, pow2
+incb	x0, vl1
+incd	x0
+incd	x0, #14
+incd	x0, all, mul #16
+incd	x0, pow2
+incd	x0, vl1
+incd	z0.d
+incd	z0.d, all, mul #16
+inch	x0
+inch	x0, #14
+inch	x0, all, mul #16
+inch	x0, pow2
+inch	x0, vl1
+inch	z0.h
+inch	z0.h, all, mul #16
+incp	x0, p0.b
+incp	x0, p0.d
+incp	x0, p0.h
+incp	x0, p0.s
+incp	xzr, p15.b
+incp	xzr, p15.d
+incp	xzr, p15.h
+incp	xzr, p15.s
+incp	z31.d, p15.d
+incp	z31.h, p15.h
+incp	z31.s, p15.s
+incw	x0
+incw	x0, #14
+incw	x0, all, mul #16
+incw	x0, pow2
+incw	x0, vl1
+incw	z0.s
+incw	z0.s, all, mul #16
+index	z0.b, #0, #0
+index	z0.d, #0, #0
+index	z0.h, #0, #0
+index	z0.h, w0, w0
+index	z0.s, #0, #0
+index	z21.b, w10, w21
+index	z21.d, x10, x21
+index	z21.s, w10, w21
+index	z23.b, #13, w8
+index	z23.b, w13, #8
+index	z23.d, #13, x8
+index	z23.d, x13, #8
+index	z23.h, #13, w8
+index	z23.h, w13, #8
+index	z23.s, #13, w8
+index	z23.s, w13, #8
+index	z31.b, #-1, #-1
+index	z31.b, #-1, wzr
+index	z31.b, wzr, #-1
+index	z31.b, wzr, wzr
+index	z31.d, #-1, #-1
+index	z31.d, #-1, xzr
+index	z31.d, xzr, #-1
+index	z31.d, xzr, xzr
+index	z31.h, #-1, #-1
+index	z31.h, #-1, wzr
+index	z31.h, wzr, #-1
+index	z31.h, wzr, wzr
+index	z31.s, #-1, #-1
+index	z31.s, #-1, wzr
+index	z31.s, wzr, #-1
+index	z31.s, wzr, wzr
+insr	z0.b, w0
+insr	z0.d, x0
+insr	z0.h, w0
+insr	z0.s, w0
+insr	z31.b, b31
+insr	z31.b, wzr
+insr	z31.d, d31
+insr	z31.d, xzr
+insr	z31.h, h31
+insr	z31.h, wzr
+insr	z31.s, s31
+insr	z31.s, wzr
+lasta	b0, p7, z31.b
+lasta	d0, p7, z31.d
+lasta	h0, p7, z31.h
+lasta	s0, p7, z31.s
+lasta	w0, p7, z31.b
+lasta	w0, p7, z31.h
+lasta	w0, p7, z31.s
+lasta	x0, p7, z31.d
+lastb	b0, p7, z31.b
+lastb	d0, p7, z31.d
+lastb	h0, p7, z31.h
+lastb	s0, p7, z31.s
+lastb	w0, p7, z31.b
+lastb	w0, p7, z31.h
+lastb	w0, p7, z31.s
+lastb	x0, p7, z31.d
+ld1b	{ z0.b }, p0/z, [sp, x0]
+ld1b	{ z0.b }, p0/z, [x0, x0]
+ld1b	{ z0.b }, p0/z, [x0]
+ld1b	{ z0.d }, p0/z, [x0]
+ld1b	{ z0.d }, p0/z, [z0.d]
+ld1b	{ z0.h }, p0/z, [x0]
+ld1b	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ld1b	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+ld1b	{ z0.s }, p0/z, [x0]
+ld1b	{ z0.s }, p0/z, [z0.s]
+ld1b	{ z21.b }, p5/z, [x10, #5, mul vl]
+ld1b	{ z21.d }, p5/z, [x10, #5, mul vl]
+ld1b	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ld1b	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ld1b	{ z21.h }, p5/z, [x10, #5, mul vl]
+ld1b	{ z21.s }, p5/z, [x10, #5, mul vl]
+ld1b	{ z21.s }, p5/z, [x10, x21]
+ld1b	{ z23.d }, p3/z, [x13, x8]
+ld1b	{ z31.b }, p7/z, [sp, #-1, mul vl]
+ld1b	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ld1b	{ z31.d }, p7/z, [sp, z31.d]
+ld1b	{ z31.d }, p7/z, [z31.d, #31]
+ld1b	{ z31.h }, p7/z, [sp, #-1, mul vl]
+ld1b	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ld1b	{ z31.s }, p7/z, [z31.s, #31]
+ld1b	{ z5.h }, p3/z, [x17, x16]
+ld1d	{ z0.d }, p0/z, [x0, z0.d, sxtw #3]
+ld1d	{ z0.d }, p0/z, [x0, z0.d, uxtw #3]
+ld1d	{ z0.d }, p0/z, [x0]
+ld1d	{ z0.d }, p0/z, [z0.d]
+ld1d	{ z21.d }, p5/z, [x10, #5, mul vl]
+ld1d	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ld1d	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ld1d	{ z23.d }, p3/z, [sp, x8, lsl #3]
+ld1d	{ z23.d }, p3/z, [x13, x8, lsl #3]
+ld1d	{ z23.d }, p3/z, [x13, z8.d, lsl #3]
+ld1d	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ld1d	{ z31.d }, p7/z, [sp, z31.d]
+ld1d	{ z31.d }, p7/z, [z31.d, #248]
+ld1h	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+ld1h	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+ld1h	{ z0.d }, p0/z, [x0]
+ld1h	{ z0.d }, p0/z, [z0.d]
+ld1h	{ z0.h }, p0/z, [x0]
+ld1h	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ld1h	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+ld1h	{ z0.s }, p0/z, [x0]
+ld1h	{ z0.s }, p0/z, [z0.s]
+ld1h	{ z21.d }, p5/z, [x10, #5, mul vl]
+ld1h	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ld1h	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ld1h	{ z21.h }, p5/z, [x10, #5, mul vl]
+ld1h	{ z21.s }, p5/z, [x10, #5, mul vl]
+ld1h	{ z21.s }, p5/z, [x10, x21, lsl #1]
+ld1h	{ z23.d }, p3/z, [x13, x8, lsl #1]
+ld1h	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+ld1h	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ld1h	{ z31.d }, p7/z, [sp, z31.d]
+ld1h	{ z31.d }, p7/z, [z31.d, #62]
+ld1h	{ z31.h }, p7/z, [sp, #-1, mul vl]
+ld1h	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ld1h	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+ld1h	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+ld1h	{ z31.s }, p7/z, [z31.s, #62]
+ld1h	{ z5.h }, p3/z, [sp, x16, lsl #1]
+ld1h	{ z5.h }, p3/z, [x17, x16, lsl #1]
+ld1rb	{ z0.b }, p0/z, [x0]
+ld1rb	{ z0.d }, p0/z, [x0]
+ld1rb	{ z0.h }, p0/z, [x0]
+ld1rb	{ z0.s }, p0/z, [x0]
+ld1rb	{ z31.b }, p7/z, [sp, #63]
+ld1rb	{ z31.d }, p7/z, [sp, #63]
+ld1rb	{ z31.h }, p7/z, [sp, #63]
+ld1rb	{ z31.s }, p7/z, [sp, #63]
+ld1rd	{ z0.d }, p0/z, [x0]
+ld1rd	{ z31.d }, p7/z, [sp, #504]
+ld1rh	{ z0.d }, p0/z, [x0]
+ld1rh	{ z0.h }, p0/z, [x0]
+ld1rh	{ z0.s }, p0/z, [x0]
+ld1rh	{ z31.d }, p7/z, [sp, #126]
+ld1rh	{ z31.h }, p7/z, [sp, #126]
+ld1rh	{ z31.s }, p7/z, [sp, #126]
+ld1rqb	{ z0.b }, p0/z, [x0, x0]
+ld1rqb	{ z0.b }, p0/z, [x0]
+ld1rqb	{ z21.b }, p5/z, [x10, #112]
+ld1rqb	{ z23.b }, p3/z, [x13, #-128]
+ld1rqb	{ z31.b }, p7/z, [sp, #-16]
+ld1rqd	{ z0.d }, p0/z, [x0, x0, lsl #3]
+ld1rqd	{ z0.d }, p0/z, [x0]
+ld1rqd	{ z23.d }, p3/z, [x13, #-128]
+ld1rqd	{ z23.d }, p3/z, [x13, #112]
+ld1rqd	{ z31.d }, p7/z, [sp, #-16]
+ld1rqh	{ z0.h }, p0/z, [x0, x0, lsl #1]
+ld1rqh	{ z0.h }, p0/z, [x0]
+ld1rqh	{ z23.h }, p3/z, [x13, #-128]
+ld1rqh	{ z23.h }, p3/z, [x13, #112]
+ld1rqh	{ z31.h }, p7/z, [sp, #-16]
+ld1rqw	{ z0.s }, p0/z, [x0, x0, lsl #2]
+ld1rqw	{ z0.s }, p0/z, [x0]
+ld1rqw	{ z23.s }, p3/z, [x13, #-128]
+ld1rqw	{ z23.s }, p3/z, [x13, #112]
+ld1rqw	{ z31.s }, p7/z, [sp, #-16]
+ld1rsb	{ z0.d }, p0/z, [x0]
+ld1rsb	{ z0.h }, p0/z, [x0]
+ld1rsb	{ z0.s }, p0/z, [x0]
+ld1rsb	{ z31.d }, p7/z, [sp, #63]
+ld1rsb	{ z31.h }, p7/z, [sp, #63]
+ld1rsb	{ z31.s }, p7/z, [sp, #63]
+ld1rsh	{ z0.d }, p0/z, [x0]
+ld1rsh	{ z0.s }, p0/z, [x0]
+ld1rsh	{ z31.d }, p7/z, [sp, #126]
+ld1rsh	{ z31.s }, p7/z, [sp, #126]
+ld1rsw	{ z0.d }, p0/z, [x0]
+ld1rsw	{ z31.d }, p7/z, [sp, #252]
+ld1rw	{ z0.d }, p0/z, [x0]
+ld1rw	{ z0.s }, p0/z, [x0]
+ld1rw	{ z31.d }, p7/z, [sp, #252]
+ld1rw	{ z31.s }, p7/z, [sp, #252]
+ld1sb	{ z0.d }, p0/z, [x0]
+ld1sb	{ z0.d }, p0/z, [z0.d]
+ld1sb	{ z0.h }, p0/z, [sp, x0]
+ld1sb	{ z0.h }, p0/z, [x0, x0]
+ld1sb	{ z0.h }, p0/z, [x0]
+ld1sb	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ld1sb	{ z0.s }, p0/z, [x0]
+ld1sb	{ z0.s }, p0/z, [z0.s]
+ld1sb	{ z21.d }, p5/z, [x10, #5, mul vl]
+ld1sb	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ld1sb	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ld1sb	{ z21.h }, p5/z, [x10, #5, mul vl]
+ld1sb	{ z21.s }, p5/z, [x10, #5, mul vl]
+ld1sb	{ z21.s }, p5/z, [x10, x21]
+ld1sb	{ z23.d }, p3/z, [x13, x8]
+ld1sb	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ld1sb	{ z31.d }, p7/z, [sp, z31.d]
+ld1sb	{ z31.d }, p7/z, [z31.d, #31]
+ld1sb	{ z31.h }, p7/z, [sp, #-1, mul vl]
+ld1sb	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ld1sb	{ z31.s }, p7/z, [z31.s, #31]
+ld1sh	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+ld1sh	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+ld1sh	{ z0.d }, p0/z, [x0]
+ld1sh	{ z0.d }, p0/z, [z0.d]
+ld1sh	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ld1sh	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+ld1sh	{ z0.s }, p0/z, [x0]
+ld1sh	{ z0.s }, p0/z, [z0.s]
+ld1sh	{ z21.d }, p5/z, [x10, #5, mul vl]
+ld1sh	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ld1sh	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ld1sh	{ z21.s }, p5/z, [sp, x21, lsl #1]
+ld1sh	{ z21.s }, p5/z, [x10, #5, mul vl]
+ld1sh	{ z21.s }, p5/z, [x10, x21, lsl #1]
+ld1sh	{ z23.d }, p3/z, [x13, x8, lsl #1]
+ld1sh	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+ld1sh	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ld1sh	{ z31.d }, p7/z, [sp, z31.d]
+ld1sh	{ z31.d }, p7/z, [z31.d, #62]
+ld1sh	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ld1sh	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+ld1sh	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+ld1sh	{ z31.s }, p7/z, [z31.s, #62]
+ld1sw	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+ld1sw	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+ld1sw	{ z0.d }, p0/z, [x0]
+ld1sw	{ z0.d }, p0/z, [z0.d]
+ld1sw	{ z21.d }, p5/z, [x10, #5, mul vl]
+ld1sw	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ld1sw	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ld1sw	{ z23.d }, p3/z, [sp, x8, lsl #2]
+ld1sw	{ z23.d }, p3/z, [x13, x8, lsl #2]
+ld1sw	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+ld1sw	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ld1sw	{ z31.d }, p7/z, [sp, z31.d]
+ld1sw	{ z31.d }, p7/z, [z31.d, #124]
+ld1w	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+ld1w	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+ld1w	{ z0.d }, p0/z, [x0]
+ld1w	{ z0.d }, p0/z, [z0.d]
+ld1w	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ld1w	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+ld1w	{ z0.s }, p0/z, [x0]
+ld1w	{ z0.s }, p0/z, [z0.s]
+ld1w	{ z21.d }, p5/z, [x10, #5, mul vl]
+ld1w	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ld1w	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ld1w	{ z21.s }, p5/z, [sp, x21, lsl #2]
+ld1w	{ z21.s }, p5/z, [x10, #5, mul vl]
+ld1w	{ z21.s }, p5/z, [x10, x21, lsl #2]
+ld1w	{ z23.d }, p3/z, [x13, x8, lsl #2]
+ld1w	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+ld1w	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ld1w	{ z31.d }, p7/z, [sp, z31.d]
+ld1w	{ z31.d }, p7/z, [z31.d, #124]
+ld1w	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ld1w	{ z31.s }, p7/z, [sp, z31.s, sxtw #2]
+ld1w	{ z31.s }, p7/z, [sp, z31.s, uxtw #2]
+ld1w	{ z31.s }, p7/z, [z31.s, #124]
+ld2b	{ z0.b, z1.b }, p0/z, [x0, x0]
+ld2b	{ z0.b, z1.b }, p0/z, [x0]
+ld2b	{ z21.b, z22.b }, p5/z, [x10, #10, mul vl]
+ld2b	{ z23.b, z24.b }, p3/z, [x13, #-16, mul vl]
+ld2b	{ z5.b, z6.b }, p3/z, [x17, x16]
+ld2d	{ z0.d, z1.d }, p0/z, [x0, x0, lsl #3]
+ld2d	{ z0.d, z1.d }, p0/z, [x0]
+ld2d	{ z21.d, z22.d }, p5/z, [x10, #10, mul vl]
+ld2d	{ z23.d, z24.d }, p3/z, [x13, #-16, mul vl]
+ld2d	{ z5.d, z6.d }, p3/z, [x17, x16, lsl #3]
+ld2h	{ z0.h, z1.h }, p0/z, [x0, x0, lsl #1]
+ld2h	{ z0.h, z1.h }, p0/z, [x0]
+ld2h	{ z21.h, z22.h }, p5/z, [x10, #10, mul vl]
+ld2h	{ z23.h, z24.h }, p3/z, [x13, #-16, mul vl]
+ld2h	{ z5.h, z6.h }, p3/z, [x17, x16, lsl #1]
+ld2w	{ z0.s, z1.s }, p0/z, [x0, x0, lsl #2]
+ld2w	{ z0.s, z1.s }, p0/z, [x0]
+ld2w	{ z21.s, z22.s }, p5/z, [x10, #10, mul vl]
+ld2w	{ z23.s, z24.s }, p3/z, [x13, #-16, mul vl]
+ld2w	{ z5.s, z6.s }, p3/z, [x17, x16, lsl #2]
+ld3b	{ z0.b, z1.b, z2.b }, p0/z, [x0, x0]
+ld3b	{ z0.b, z1.b, z2.b }, p0/z, [x0]
+ld3b	{ z21.b, z22.b, z23.b }, p5/z, [x10, #15, mul vl]
+ld3b	{ z23.b, z24.b, z25.b }, p3/z, [x13, #-24, mul vl]
+ld3b	{ z5.b, z6.b, z7.b }, p3/z, [x17, x16]
+ld3d	{ z0.d, z1.d, z2.d }, p0/z, [x0, x0, lsl #3]
+ld3d	{ z0.d, z1.d, z2.d }, p0/z, [x0]
+ld3d	{ z21.d, z22.d, z23.d }, p5/z, [x10, #15, mul vl]
+ld3d	{ z23.d, z24.d, z25.d }, p3/z, [x13, #-24, mul vl]
+ld3d	{ z5.d, z6.d, z7.d }, p3/z, [x17, x16, lsl #3]
+ld3h	{ z0.h, z1.h, z2.h }, p0/z, [x0, x0, lsl #1]
+ld3h	{ z0.h, z1.h, z2.h }, p0/z, [x0]
+ld3h	{ z21.h, z22.h, z23.h }, p5/z, [x10, #15, mul vl]
+ld3h	{ z23.h, z24.h, z25.h }, p3/z, [x13, #-24, mul vl]
+ld3h	{ z5.h, z6.h, z7.h }, p3/z, [x17, x16, lsl #1]
+ld3w	{ z0.s, z1.s, z2.s }, p0/z, [x0, x0, lsl #2]
+ld3w	{ z0.s, z1.s, z2.s }, p0/z, [x0]
+ld3w	{ z21.s, z22.s, z23.s }, p5/z, [x10, #15, mul vl]
+ld3w	{ z23.s, z24.s, z25.s }, p3/z, [x13, #-24, mul vl]
+ld3w	{ z5.s, z6.s, z7.s }, p3/z, [x17, x16, lsl #2]
+ld4b	{ z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x0]
+ld4b	{ z0.b, z1.b, z2.b, z3.b }, p0/z, [x0]
+ld4b	{ z21.b, z22.b, z23.b, z24.b }, p5/z, [x10, #20, mul vl]
+ld4b	{ z23.b, z24.b, z25.b, z26.b }, p3/z, [x13, #-32, mul vl]
+ld4b	{ z5.b, z6.b, z7.b, z8.b }, p3/z, [x17, x16]
+ld4d	{ z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x0, lsl #3]
+ld4d	{ z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
+ld4d	{ z21.d, z22.d, z23.d, z24.d }, p5/z, [x10, #20, mul vl]
+ld4d	{ z23.d, z24.d, z25.d, z26.d }, p3/z, [x13, #-32, mul vl]
+ld4d	{ z5.d, z6.d, z7.d, z8.d }, p3/z, [x17, x16, lsl #3]
+ld4h	{ z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x0, lsl #1]
+ld4h	{ z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
+ld4h	{ z21.h, z22.h, z23.h, z24.h }, p5/z, [x10, #20, mul vl]
+ld4h	{ z23.h, z24.h, z25.h, z26.h }, p3/z, [x13, #-32, mul vl]
+ld4h	{ z5.h, z6.h, z7.h, z8.h }, p3/z, [x17, x16, lsl #1]
+ld4w	{ z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x0, lsl #2]
+ld4w	{ z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
+ld4w	{ z21.s, z22.s, z23.s, z24.s }, p5/z, [x10, #20, mul vl]
+ld4w	{ z23.s, z24.s, z25.s, z26.s }, p3/z, [x13, #-32, mul vl]
+ld4w	{ z5.s, z6.s, z7.s, z8.s }, p3/z, [x17, x16, lsl #2]
+ldff1b	{ z0.d }, p0/z, [x0, x0]
+ldff1b	{ z0.d }, p0/z, [z0.d]
+ldff1b	{ z0.h }, p0/z, [x0, x0]
+ldff1b	{ z0.s }, p0/z, [x0, x0]
+ldff1b	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ldff1b	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+ldff1b	{ z0.s }, p0/z, [z0.s]
+ldff1b	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ldff1b	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ldff1b	{ z31.b }, p7/z, [sp]
+ldff1b	{ z31.d }, p7/z, [sp, z31.d]
+ldff1b	{ z31.d }, p7/z, [sp]
+ldff1b	{ z31.d }, p7/z, [z31.d, #31]
+ldff1b	{ z31.h }, p7/z, [sp]
+ldff1b	{ z31.s }, p7/z, [sp]
+ldff1b	{ z31.s }, p7/z, [z31.s, #31]
+ldff1d	{ z0.d }, p0/z, [x0, x0, lsl #3]
+ldff1d	{ z0.d }, p0/z, [x0, z0.d, sxtw #3]
+ldff1d	{ z0.d }, p0/z, [x0, z0.d, uxtw #3]
+ldff1d	{ z0.d }, p0/z, [z0.d]
+ldff1d	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ldff1d	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ldff1d	{ z23.d }, p3/z, [x13, z8.d, lsl #3]
+ldff1d	{ z31.d }, p7/z, [sp, z31.d]
+ldff1d	{ z31.d }, p7/z, [sp]
+ldff1d	{ z31.d }, p7/z, [z31.d, #248]
+ldff1h	{ z0.d }, p0/z, [x0, x0, lsl #1]
+ldff1h	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+ldff1h	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+ldff1h	{ z0.d }, p0/z, [z0.d]
+ldff1h	{ z0.h }, p0/z, [x0, x0, lsl #1]
+ldff1h	{ z0.s }, p0/z, [x0, x0, lsl #1]
+ldff1h	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ldff1h	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+ldff1h	{ z0.s }, p0/z, [z0.s]
+ldff1h	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ldff1h	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ldff1h	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+ldff1h	{ z31.d }, p7/z, [sp, z31.d]
+ldff1h	{ z31.d }, p7/z, [sp]
+ldff1h	{ z31.d }, p7/z, [z31.d, #62]
+ldff1h	{ z31.h }, p7/z, [sp]
+ldff1h	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+ldff1h	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+ldff1h	{ z31.s }, p7/z, [sp]
+ldff1h	{ z31.s }, p7/z, [z31.s, #62]
+ldff1sb	{ z0.d }, p0/z, [x0, x0]
+ldff1sb	{ z0.d }, p0/z, [z0.d]
+ldff1sb	{ z0.h }, p0/z, [x0, x0]
+ldff1sb	{ z0.s }, p0/z, [x0, x0]
+ldff1sb	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ldff1sb	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+ldff1sb	{ z0.s }, p0/z, [z0.s]
+ldff1sb	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ldff1sb	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ldff1sb	{ z31.d }, p7/z, [sp, z31.d]
+ldff1sb	{ z31.d }, p7/z, [sp]
+ldff1sb	{ z31.d }, p7/z, [z31.d, #31]
+ldff1sb	{ z31.h }, p7/z, [sp]
+ldff1sb	{ z31.s }, p7/z, [sp]
+ldff1sb	{ z31.s }, p7/z, [z31.s, #31]
+ldff1sh	{ z0.d }, p0/z, [x0, x0, lsl #1]
+ldff1sh	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+ldff1sh	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+ldff1sh	{ z0.d }, p0/z, [z0.d]
+ldff1sh	{ z0.s }, p0/z, [x0, x0, lsl #1]
+ldff1sh	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ldff1sh	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+ldff1sh	{ z0.s }, p0/z, [z0.s]
+ldff1sh	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ldff1sh	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ldff1sh	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+ldff1sh	{ z31.d }, p7/z, [sp, z31.d]
+ldff1sh	{ z31.d }, p7/z, [sp]
+ldff1sh	{ z31.d }, p7/z, [z31.d, #62]
+ldff1sh	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+ldff1sh	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+ldff1sh	{ z31.s }, p7/z, [sp]
+ldff1sh	{ z31.s }, p7/z, [z31.s, #62]
+ldff1sw	{ z0.d }, p0/z, [x0, x0, lsl #2]
+ldff1sw	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+ldff1sw	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+ldff1sw	{ z0.d }, p0/z, [z0.d]
+ldff1sw	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ldff1sw	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ldff1sw	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+ldff1sw	{ z31.d }, p7/z, [sp, z31.d]
+ldff1sw	{ z31.d }, p7/z, [sp]
+ldff1sw	{ z31.d }, p7/z, [z31.d, #124]
+ldff1w	{ z0.d }, p0/z, [x0, x0, lsl #2]
+ldff1w	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+ldff1w	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+ldff1w	{ z0.d }, p0/z, [z0.d]
+ldff1w	{ z0.s }, p0/z, [x0, x0, lsl #2]
+ldff1w	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+ldff1w	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+ldff1w	{ z0.s }, p0/z, [z0.s]
+ldff1w	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+ldff1w	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+ldff1w	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+ldff1w	{ z31.d }, p7/z, [sp, z31.d]
+ldff1w	{ z31.d }, p7/z, [sp]
+ldff1w	{ z31.d }, p7/z, [z31.d, #124]
+ldff1w	{ z31.s }, p7/z, [sp, z31.s, sxtw #2]
+ldff1w	{ z31.s }, p7/z, [sp, z31.s, uxtw #2]
+ldff1w	{ z31.s }, p7/z, [sp]
+ldff1w	{ z31.s }, p7/z, [z31.s, #124]
+ldnf1b	{ z0.b }, p0/z, [x0]
+ldnf1b	{ z0.d }, p0/z, [x0]
+ldnf1b	{ z0.h }, p0/z, [x0]
+ldnf1b	{ z0.s }, p0/z, [x0]
+ldnf1b	{ z21.b }, p5/z, [x10, #5, mul vl]
+ldnf1b	{ z21.d }, p5/z, [x10, #5, mul vl]
+ldnf1b	{ z21.h }, p5/z, [x10, #5, mul vl]
+ldnf1b	{ z21.s }, p5/z, [x10, #5, mul vl]
+ldnf1b	{ z31.b }, p7/z, [sp, #-1, mul vl]
+ldnf1b	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ldnf1b	{ z31.h }, p7/z, [sp, #-1, mul vl]
+ldnf1b	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ldnf1d	{ z0.d }, p0/z, [x0]
+ldnf1d	{ z21.d }, p5/z, [x10, #5, mul vl]
+ldnf1d	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ldnf1h	{ z0.d }, p0/z, [x0]
+ldnf1h	{ z0.h }, p0/z, [x0]
+ldnf1h	{ z0.s }, p0/z, [x0]
+ldnf1h	{ z21.d }, p5/z, [x10, #5, mul vl]
+ldnf1h	{ z21.h }, p5/z, [x10, #5, mul vl]
+ldnf1h	{ z21.s }, p5/z, [x10, #5, mul vl]
+ldnf1h	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ldnf1h	{ z31.h }, p7/z, [sp, #-1, mul vl]
+ldnf1h	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ldnf1sb	{ z0.d }, p0/z, [x0]
+ldnf1sb	{ z0.h }, p0/z, [x0]
+ldnf1sb	{ z0.s }, p0/z, [x0]
+ldnf1sb	{ z21.d }, p5/z, [x10, #5, mul vl]
+ldnf1sb	{ z21.h }, p5/z, [x10, #5, mul vl]
+ldnf1sb	{ z21.s }, p5/z, [x10, #5, mul vl]
+ldnf1sb	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ldnf1sb	{ z31.h }, p7/z, [sp, #-1, mul vl]
+ldnf1sb	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ldnf1sh	{ z0.d }, p0/z, [x0]
+ldnf1sh	{ z0.s }, p0/z, [x0]
+ldnf1sh	{ z21.d }, p5/z, [x10, #5, mul vl]
+ldnf1sh	{ z21.s }, p5/z, [x10, #5, mul vl]
+ldnf1sh	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ldnf1sh	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ldnf1sw	{ z0.d }, p0/z, [x0]
+ldnf1sw	{ z21.d }, p5/z, [x10, #5, mul vl]
+ldnf1sw	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ldnf1w	{ z0.d }, p0/z, [x0]
+ldnf1w	{ z0.s }, p0/z, [x0]
+ldnf1w	{ z21.d }, p5/z, [x10, #5, mul vl]
+ldnf1w	{ z21.s }, p5/z, [x10, #5, mul vl]
+ldnf1w	{ z31.d }, p7/z, [sp, #-1, mul vl]
+ldnf1w	{ z31.s }, p7/z, [sp, #-1, mul vl]
+ldnt1b	{ z0.b }, p0/z, [x0, x0]
+ldnt1b	{ z0.b }, p0/z, [x0]
+ldnt1b	{ z0.d }, p0/z, [z1.d]
+ldnt1b	{ z0.s }, p0/z, [z1.s]
+ldnt1b	{ z21.b }, p5/z, [x10, #7, mul vl]
+ldnt1b	{ z23.b }, p3/z, [x13, #-8, mul vl]
+ldnt1b	{ z31.d }, p7/z, [z31.d, x0]
+ldnt1b	{ z31.d }, p7/z, [z31.d]
+ldnt1b	{ z31.s }, p7/z, [z31.s, x0]
+ldnt1b	{ z31.s }, p7/z, [z31.s]
+ldnt1d	{ z0.d }, p0/z, [x0, x0, lsl #3]
+ldnt1d	{ z0.d }, p0/z, [x0]
+ldnt1d	{ z0.d }, p0/z, [z1.d]
+ldnt1d	{ z21.d }, p5/z, [x10, #7, mul vl]
+ldnt1d	{ z23.d }, p3/z, [x13, #-8, mul vl]
+ldnt1d	{ z31.d }, p7/z, [z31.d, x0]
+ldnt1d	{ z31.d }, p7/z, [z31.d]
+ldnt1h	{ z0.d }, p0/z, [z1.d]
+ldnt1h	{ z0.h }, p0/z, [x0, x0, lsl #1]
+ldnt1h	{ z0.h }, p0/z, [x0]
+ldnt1h	{ z0.s }, p0/z, [z1.s]
+ldnt1h	{ z21.h }, p5/z, [x10, #7, mul vl]
+ldnt1h	{ z23.h }, p3/z, [x13, #-8, mul vl]
+ldnt1h	{ z31.d }, p7/z, [z31.d, x0]
+ldnt1h	{ z31.d }, p7/z, [z31.d]
+ldnt1h	{ z31.s }, p7/z, [z31.s, x0]
+ldnt1h	{ z31.s }, p7/z, [z31.s]
+ldnt1sb	{ z0.d }, p0/z, [z1.d]
+ldnt1sb	{ z0.s }, p0/z, [z1.s]
+ldnt1sb	{ z31.d }, p7/z, [z31.d, x0]
+ldnt1sb	{ z31.d }, p7/z, [z31.d]
+ldnt1sb	{ z31.s }, p7/z, [z31.s, x0]
+ldnt1sb	{ z31.s }, p7/z, [z31.s]
+ldnt1sh	{ z0.d }, p0/z, [z1.d]
+ldnt1sh	{ z0.s }, p0/z, [z1.s]
+ldnt1sh	{ z31.d }, p7/z, [z31.d, x0]
+ldnt1sh	{ z31.d }, p7/z, [z31.d]
+ldnt1sh	{ z31.s }, p7/z, [z31.s, x0]
+ldnt1sh	{ z31.s }, p7/z, [z31.s]
+ldnt1sw	{ z0.d }, p0/z, [z1.d]
+ldnt1sw	{ z31.d }, p7/z, [z31.d, x0]
+ldnt1sw	{ z31.d }, p7/z, [z31.d]
+ldnt1w	{ z0.d }, p0/z, [z1.d]
+ldnt1w	{ z0.s }, p0/z, [x0, x0, lsl #2]
+ldnt1w	{ z0.s }, p0/z, [x0]
+ldnt1w	{ z0.s }, p0/z, [z1.s]
+ldnt1w	{ z21.s }, p5/z, [x10, #7, mul vl]
+ldnt1w	{ z23.s }, p3/z, [x13, #-8, mul vl]
+ldnt1w	{ z31.d }, p7/z, [z31.d, x0]
+ldnt1w	{ z31.d }, p7/z, [z31.d]
+ldnt1w	{ z31.s }, p7/z, [z31.s, x0]
+ldnt1w	{ z31.s }, p7/z, [z31.s]
+ldr	p0, [x0]
+ldr	p5, [x10, #255, mul vl]
+ldr	p7, [x13, #-256, mul vl]
+ldr	z0, [x0]
+ldr	z23, [x13, #255, mul vl]
+ldr	z31, [sp, #-256, mul vl]
+lsl	z0.b, p0/m, z0.b, #0
+lsl	z0.b, p0/m, z0.b, z0.b
+lsl	z0.b, p0/m, z0.b, z1.d
+lsl	z0.b, z0.b, #0
+lsl	z0.b, z1.b, z2.d
+lsl	z0.d, p0/m, z0.d, #0
+lsl	z0.d, p0/m, z0.d, z0.d
+lsl	z0.d, z0.d, #0
+lsl	z0.h, p0/m, z0.h, #0
+lsl	z0.h, p0/m, z0.h, z0.h
+lsl	z0.h, p0/m, z0.h, z1.d
+lsl	z0.h, z0.h, #0
+lsl	z0.h, z1.h, z2.d
+lsl	z0.s, p0/m, z0.s, #0
+lsl	z0.s, p0/m, z0.s, z0.s
+lsl	z0.s, p0/m, z0.s, z1.d
+lsl	z0.s, z0.s, #0
+lsl	z0.s, z1.s, z2.d
+lsl	z31.b, p0/m, z31.b, #7
+lsl	z31.b, z31.b, #7
+lsl	z31.d, p0/m, z31.d, #63
+lsl	z31.d, z31.d, #63
+lsl	z31.h, p0/m, z31.h, #15
+lsl	z31.h, z31.h, #15
+lsl	z31.s, p0/m, z31.s, #31
+lsl	z31.s, z31.s, #31
+lslr	z0.b, p0/m, z0.b, z0.b
+lslr	z0.d, p0/m, z0.d, z0.d
+lslr	z0.h, p0/m, z0.h, z0.h
+lslr	z0.s, p0/m, z0.s, z0.s
+lsr	z0.b, p0/m, z0.b, #1
+lsr	z0.b, p0/m, z0.b, z0.b
+lsr	z0.b, p0/m, z0.b, z1.d
+lsr	z0.b, z0.b, #1
+lsr	z0.b, z1.b, z2.d
+lsr	z0.d, p0/m, z0.d, #1
+lsr	z0.d, p0/m, z0.d, z0.d
+lsr	z0.d, z0.d, #1
+lsr	z0.h, p0/m, z0.h, #1
+lsr	z0.h, p0/m, z0.h, z0.h
+lsr	z0.h, p0/m, z0.h, z1.d
+lsr	z0.h, z0.h, #1
+lsr	z0.h, z1.h, z2.d
+lsr	z0.s, p0/m, z0.s, #1
+lsr	z0.s, p0/m, z0.s, z0.s
+lsr	z0.s, p0/m, z0.s, z1.d
+lsr	z0.s, z0.s, #1
+lsr	z0.s, z1.s, z2.d
+lsr	z31.b, p0/m, z31.b, #8
+lsr	z31.b, z31.b, #8
+lsr	z31.d, p0/m, z31.d, #64
+lsr	z31.d, z31.d, #64
+lsr	z31.h, p0/m, z31.h, #16
+lsr	z31.h, z31.h, #16
+lsr	z31.s, p0/m, z31.s, #32
+lsr	z31.s, z31.s, #32
+lsrr	z0.b, p0/m, z0.b, z0.b
+lsrr	z0.d, p0/m, z0.d, z0.d
+lsrr	z0.h, p0/m, z0.h, z0.h
+lsrr	z0.s, p0/m, z0.s, z0.s
+mad	z0.b, p7/m, z1.b, z31.b
+mad	z0.d, p7/m, z1.d, z31.d
+mad	z0.h, p7/m, z1.h, z31.h
+mad	z0.s, p7/m, z1.s, z31.s
+match	p0.b, p0/z, z0.b, z0.b
+match	p0.h, p0/z, z0.h, z0.h
+match	p15.b, p7/z, z30.b, z31.b
+match	p15.h, p7/z, z30.h, z31.h
+mla	z0.b, p7/m, z1.b, z31.b
+mla	z0.d, p7/m, z1.d, z31.d
+mla	z0.d, z1.d, z7.d[1]
+mla	z0.h, p7/m, z1.h, z31.h
+mla	z0.h, z1.h, z7.h[7]
+mla	z0.s, p7/m, z1.s, z31.s
+mla	z0.s, z1.s, z7.s[3]
+mls	z0.b, p7/m, z1.b, z31.b
+mls	z0.d, p7/m, z1.d, z31.d
+mls	z0.d, z1.d, z7.d[1]
+mls	z0.h, p7/m, z1.h, z31.h
+mls	z0.h, z1.h, z7.h[7]
+mls	z0.s, p7/m, z1.s, z31.s
+mls	z0.s, z1.s, z7.s[3]
+mov	p0.b, p0.b
+mov	p0.b, p0/m, p0.b
+mov	p0.b, p0/z, p0.b
+mov	p15.b, p15.b
+mov	p15.b, p15/m, p15.b
+mov	p15.b, p15/z, p15.b
+mov	z0.b, #127
+mov	z0.b, b0
+mov	z0.b, p0/m, b0
+mov	z0.b, p0/m, w0
+mov	z0.b, p0/z, #127
+mov	z0.b, w0
+mov	z0.d, #0
+mov	z0.d, #0xe0000000000003ff
+mov	z0.d, #0xffffffffffff7fff
+mov	z0.d, #32768
+mov	z0.d, d0
+mov	z0.d, p0/m, d0
+mov	z0.d, p0/m, x0
+mov	z0.d, x0
+mov	z0.d, z0.d
+mov	z0.h, #-256
+mov	z0.h, #-32768
+mov	z0.h, #0
+mov	z0.h, #32512
+mov	z0.h, #32767
+mov	z0.h, h0
+mov	z0.h, p0/m, h0
+mov	z0.h, p0/m, w0
+mov	z0.h, p0/z, #32512
+mov	z0.h, w0
+mov	z0.q, q0
+mov	z0.s, #0
+mov	z0.s, #0xffff7fff
+mov	z0.s, #32768
+mov	z0.s, p0/m, s0
+mov	z0.s, p0/m, w0
+mov	z0.s, s0
+mov	z0.s, w0
+mov	z21.d, #-128
+mov	z21.d, #-32768
+mov	z21.d, #127
+mov	z21.d, #32512
+mov	z21.d, p0/z, #-128
+mov	z21.d, p0/z, #-32768
+mov	z21.d, p0/z, #127
+mov	z21.d, p0/z, #32512
+mov	z21.d, p15/m, #-128
+mov	z21.d, p15/m, #-32768
+mov	z21.h, #-128
+mov	z21.h, #-32768
+mov	z21.h, #127
+mov	z21.h, #32512
+mov	z21.h, p0/z, #-128
+mov	z21.h, p0/z, #-32768
+mov	z21.h, p0/z, #127
+mov	z21.h, p0/z, #32512
+mov	z21.h, p15/m, #-128
+mov	z21.h, p15/m, #-32768
+mov	z21.s, #-128
+mov	z21.s, #-32768
+mov	z21.s, #127
+mov	z21.s, #32512
+mov	z21.s, p0/z, #-128
+mov	z21.s, p0/z, #-32768
+mov	z21.s, p0/z, #127
+mov	z21.s, p0/z, #32512
+mov	z21.s, p15/m, #-128
+mov	z21.s, p15/m, #-32768
+mov	z31.b, p15/m, z31.b
+mov	z31.b, p7/m, b31
+movprfx z31, z6
+mov	z31.b, p7/m, wsp
+mov	z31.b, wsp
+mov	z31.b, z31.b[63]
+mov	z31.d, p15/m, z31.d
+mov	z31.d, p7/m, d31
+movprfx z31.d, p7/z, z6.d
+mov	z31.d, p7/m, sp
+mov	z31.d, sp
+mov	z31.d, z0.d
+mov	z31.d, z31.d[7]
+mov	z31.h, p15/m, z31.h
+mov	z31.h, p7/m, h31
+mov	z31.h, p7/m, wsp
+mov	z31.h, wsp
+mov	z31.h, z31.h[31]
+mov	z31.s, p15/m, z31.s
+mov	z31.s, p7/m, s31
+mov	z31.s, p7/m, wsp
+mov	z31.s, wsp
+mov	z31.s, z31.s[15]
+mov	z5.b, #-1
+mov	z5.b, #-128
+mov	z5.b, #127
+mov	z5.b, p0/z, #-1
+mov	z5.b, p0/z, #-128
+mov	z5.b, p0/z, #127
+mov	z5.b, p15/m, #-128
+mov	z5.d, #-6
+mov	z5.h, #-6
+mov	z5.q, z17.q[3]
+mov	z5.s, #-6
+movs	p0.b, p0.b
+movs	p0.b, p0/z, p0.b
+movs	p15.b, p15.b
+movs	p15.b, p15/z, p15.b
+mrs	x3, ID_AA64ZFR0_EL1
+mrs	x3, ZCR_EL1
+mrs	x3, ZCR_EL12
+mrs	x3, ZCR_EL2
+mrs	x3, ZCR_EL3
+msb	z0.b, p7/m, z1.b, z31.b
+msb	z0.d, p7/m, z1.d, z31.d
+msb	z0.h, p7/m, z1.h, z31.h
+msb	z0.s, p7/m, z1.s, z31.s
+msr	ZCR_EL1, x3
+msr	ZCR_EL12, x3
+msr	ZCR_EL2, x3
+msr	ZCR_EL3, x3
+mul	z0.b, p7/m, z0.b, z31.b
+mul	z0.b, z1.b, z2.b
+mul	z0.d, p7/m, z0.d, z31.d
+mul	z0.d, z1.d, z15.d[1]
+mul	z0.h, p7/m, z0.h, z31.h
+mul	z0.h, z1.h, z2.h
+mul	z0.h, z1.h, z7.h[7]
+mul	z0.s, p7/m, z0.s, z31.s
+mul	z0.s, z1.s, z7.s[3]
+mul	z29.s, z30.s, z31.s
+mul	z31.b, z31.b, #-128
+mul	z31.b, z31.b, #127
+mul	z31.d, z31.d, #-128
+mul	z31.d, z31.d, #127
+mul	z31.d, z31.d, z31.d
+mul	z31.h, z31.h, #-128
+mul	z31.h, z31.h, #127
+mul	z31.s, z31.s, #-128
+mul	z31.s, z31.s, #127
+nand	p0.b, p0/z, p0.b, p0.b
+nand	p15.b, p15/z, p15.b, p15.b
+nands	p0.b, p0/z, p0.b, p0.b
+nands	p15.b, p15/z, p15.b, p15.b
+nbsl	z0.d, z0.d, z1.d, z2.d
+neg	z0.b, p0/m, z0.b
+neg	z0.d, p0/m, z0.d
+neg	z0.h, p0/m, z0.h
+neg	z0.s, p0/m, z0.s
+neg	z31.b, p7/m, z31.b
+neg	z31.d, p7/m, z31.d
+neg	z31.h, p7/m, z31.h
+neg	z31.s, p7/m, z31.s
+nmatch	p0.b, p0/z, z0.b, z0.b
+nmatch	p0.h, p0/z, z0.h, z0.h
+nmatch	p15.b, p7/z, z30.b, z31.b
+nmatch	p15.h, p7/z, z30.h, z31.h
+nor	p0.b, p0/z, p0.b, p0.b
+nor	p15.b, p15/z, p15.b, p15.b
+nors	p0.b, p0/z, p0.b, p0.b
+nors	p15.b, p15/z, p15.b, p15.b
+not	p0.b, p0/z, p0.b
+not	p15.b, p15/z, p15.b
+not	z31.b, p7/m, z31.b
+not	z31.d, p7/m, z31.d
+not	z31.h, p7/m, z31.h
+not	z31.s, p7/m, z31.s
+nots	p0.b, p0/z, p0.b
+nots	p15.b, p15/z, p15.b
+orn	p0.b, p0/z, p0.b, p0.b
+orn	p15.b, p15/z, p15.b, p15.b
+orns	p0.b, p0/z, p0.b, p0.b
+orns	p15.b, p15/z, p15.b, p15.b
+orr	p0.b, p0/z, p0.b, p1.b
+orr	z0.d, z0.d, #0x6
+orr	z0.d, z0.d, #0xfffffffffffffff9
+orr	z0.s, z0.s, #0x6
+orr	z0.s, z0.s, #0xfffffff9
+orr	z23.d, z13.d, z8.d
+orr	z23.h, z23.h, #0x6
+orr	z23.h, z23.h, #0xfff9
+orr	z31.b, p7/m, z31.b, z31.b
+orr	z31.d, p7/m, z31.d, z31.d
+orr	z31.h, p7/m, z31.h, z31.h
+orr	z31.s, p7/m, z31.s, z31.s
+orr	z5.b, z5.b, #0x6
+orr	z5.b, z5.b, #0xf9
+orrs	p0.b, p0/z, p0.b, p1.b
+orv	b0, p7, z31.b
+orv	d0, p7, z31.d
+orv	h0, p7, z31.h
+orv	s0, p7, z31.s
+pfalse	p15.b
+pfirst	p0.b, p15, p0.b
+pfirst	p15.b, p15, p15.b
+pmul	z0.b, z1.b, z2.b
+pmul	z29.b, z30.b, z31.b
+pmullb	z0.h, z1.b, z2.b
+pmullb	z29.q, z30.d, z31.d
+pmullb	z31.d, z31.s, z31.s
+pmullt	z0.h, z1.b, z2.b
+pmullt	z29.q, z30.d, z31.d
+pmullt	z31.d, z31.s, z31.s
+pnext	p0.b, p15, p0.b
+pnext	p0.d, p15, p0.d
+pnext	p0.h, p15, p0.h
+pnext	p0.s, p15, p0.s
+pnext	p15.b, p15, p15.b
+prfb	#14, p0, [x0]
+prfb	#15, p0, [x0]
+prfb	#6, p0, [x0]
+prfb	#7, p0, [x0]
+prfb	#7, p3, [z13.s, #31]
+prfb	#7, p3, [z13.s]
+prfb	pldl1keep, p0, [x0, z0.d, uxtw]
+prfb	pldl1keep, p0, [x0, z0.d]
+prfb	pldl1keep, p0, [x0, z0.s, uxtw]
+prfb	pldl1keep, p0, [x0]
+prfb	pldl1strm, p0, [x0, #-32, mul vl]
+prfb	pldl1strm, p0, [x0, #31, mul vl]
+prfb	pldl1strm, p0, [x0]
+prfb	pldl2keep, p0, [x0]
+prfb	pldl2strm, p0, [x0]
+prfb	pldl3keep, p0, [x0]
+prfb	pldl3strm, p0, [x0]
+prfb	pldl3strm, p5, [x10, z21.d, sxtw]
+prfb	pldl3strm, p5, [x10, z21.s, uxtw]
+prfb	pldl3strm, p5, [z10.d, #31]
+prfb	pldl3strm, p5, [z10.d]
+prfb	pstl1keep, p0, [x0]
+prfb	pstl1strm, p0, [x0]
+prfb	pstl2keep, p0, [x0]
+prfb	pstl2strm, p0, [x0]
+prfb	pstl3keep, p0, [x0]
+prfb	pstl3strm, p0, [x0]
+prfd	#14, p0, [x0]
+prfd	#15, p0, [x0]
+prfd	#15, p7, [z31.d, #248]
+prfd	#15, p7, [z31.d]
+prfd	#15, p7, [z31.s, #248]
+prfd	#15, p7, [z31.s]
+prfd	#6, p0, [x0]
+prfd	#7, p0, [x0]
+prfd	pldl1keep, p0, [x0, z0.d, lsl #3]
+prfd	pldl1keep, p0, [x0, z0.d, sxtw #3]
+prfd	pldl1keep, p0, [x0, z0.d, uxtw #3]
+prfd	pldl1keep, p0, [x0, z0.s, sxtw #3]
+prfd	pldl1keep, p0, [x0, z0.s, uxtw #3]
+prfd	pldl1keep, p0, [x0]
+prfd	pldl1strm, p0, [x0, #-32, mul vl]
+prfd	pldl1strm, p0, [x0, #31, mul vl]
+prfd	pldl1strm, p0, [x0]
+prfd	pldl2keep, p0, [x0]
+prfd	pldl2strm, p0, [x0]
+prfd	pldl3keep, p0, [x0]
+prfd	pldl3strm, p0, [x0]
+prfd	pstl1keep, p0, [x0]
+prfd	pstl1strm, p0, [x0]
+prfd	pstl2keep, p0, [x0]
+prfd	pstl2strm, p0, [x0]
+prfd	pstl3keep, p0, [x0]
+prfd	pstl3strm, p0, [x0]
+prfh	#14, p0, [x0]
+prfh	#15, p0, [x0]
+prfh	#15, p7, [z31.d, #62]
+prfh	#15, p7, [z31.d]
+prfh	#15, p7, [z31.s, #62]
+prfh	#15, p7, [z31.s]
+prfh	#6, p0, [x0]
+prfh	#7, p0, [x0]
+prfh	pldl1keep, p0, [x0, z0.d, lsl #1]
+prfh	pldl1keep, p0, [x0]
+prfh	pldl1strm, p0, [x0, #-32, mul vl]
+prfh	pldl1strm, p0, [x0, #31, mul vl]
+prfh	pldl1strm, p0, [x0]
+prfh	pldl2keep, p0, [x0]
+prfh	pldl2strm, p0, [x0]
+prfh	pldl3keep, p0, [x0]
+prfh	pldl3strm, p0, [x0]
+prfh	pldl3strm, p5, [x10, z21.d, sxtw #1]
+prfh	pldl3strm, p5, [x10, z21.d, uxtw #1]
+prfh	pldl3strm, p5, [x10, z21.s, sxtw #1]
+prfh	pldl3strm, p5, [x10, z21.s, uxtw #1]
+prfh	pstl1keep, p0, [x0]
+prfh	pstl1strm, p0, [x0]
+prfh	pstl2keep, p0, [x0]
+prfh	pstl2strm, p0, [x0]
+prfh	pstl3keep, p0, [x0]
+prfh	pstl3strm, p0, [x0]
+prfw	#14, p0, [x0]
+prfw	#15, p0, [x0]
+prfw	#15, p7, [z31.d, #124]
+prfw	#15, p7, [z31.d]
+prfw	#15, p7, [z31.s, #124]
+prfw	#15, p7, [z31.s]
+prfw	#6, p0, [x0]
+prfw	#7, p0, [x0]
+prfw	#7, p3, [x13, z8.d, uxtw #2]
+prfw	pldl1keep, p0, [x0, z0.d, sxtw #2]
+prfw	pldl1keep, p0, [x0, z0.s, uxtw #2]
+prfw	pldl1keep, p0, [x0]
+prfw	pldl1strm, p0, [x0, #-32, mul vl]
+prfw	pldl1strm, p0, [x0, #31, mul vl]
+prfw	pldl1strm, p0, [x0]
+prfw	pldl2keep, p0, [x0]
+prfw	pldl2strm, p0, [x0]
+prfw	pldl3keep, p0, [x0]
+prfw	pldl3strm, p0, [x0]
+prfw	pldl3strm, p5, [x10, z21.d, lsl #2]
+prfw	pldl3strm, p5, [x10, z21.s, sxtw #2]
+prfw	pstl1keep, p0, [x0]
+prfw	pstl1strm, p0, [x0]
+prfw	pstl2keep, p0, [x0]
+prfw	pstl2strm, p0, [x0]
+prfw	pstl3keep, p0, [x0]
+prfw	pstl3strm, p0, [x0]
+ptest	p15, p0.b
+ptest	p15, p15.b
+ptrue	p0.b, pow2
+ptrue	p0.d, pow2
+ptrue	p0.h, pow2
+ptrue	p0.s, pow2
+ptrue	p15.b
+ptrue	p15.d
+ptrue	p15.h
+ptrue	p15.s
+ptrue	p7.s
+ptrue	p7.s, #14
+ptrue	p7.s, #15
+ptrue	p7.s, #16
+ptrue	p7.s, #17
+ptrue	p7.s, #18
+ptrue	p7.s, #19
+ptrue	p7.s, #20
+ptrue	p7.s, #21
+ptrue	p7.s, #22
+ptrue	p7.s, #23
+ptrue	p7.s, #24
+ptrue	p7.s, #25
+ptrue	p7.s, #26
+ptrue	p7.s, #27
+ptrue	p7.s, #28
+ptrue	p7.s, mul3
+ptrue	p7.s, mul4
+ptrue	p7.s, vl1
+ptrue	p7.s, vl128
+ptrue	p7.s, vl16
+ptrue	p7.s, vl2
+ptrue	p7.s, vl256
+ptrue	p7.s, vl3
+ptrue	p7.s, vl32
+ptrue	p7.s, vl4
+ptrue	p7.s, vl5
+ptrue	p7.s, vl6
+ptrue	p7.s, vl64
+ptrue	p7.s, vl7
+ptrue	p7.s, vl8
+ptrues	p0.b, pow2
+ptrues	p0.d, pow2
+ptrues	p0.h, pow2
+ptrues	p0.s, pow2
+ptrues	p15.b
+ptrues	p15.d
+ptrues	p15.h
+ptrues	p15.s
+ptrues	p7.s
+ptrues	p7.s, #14
+ptrues	p7.s, #15
+ptrues	p7.s, #16
+ptrues	p7.s, #17
+ptrues	p7.s, #18
+ptrues	p7.s, #19
+ptrues	p7.s, #20
+ptrues	p7.s, #21
+ptrues	p7.s, #22
+ptrues	p7.s, #23
+ptrues	p7.s, #24
+ptrues	p7.s, #25
+ptrues	p7.s, #26
+ptrues	p7.s, #27
+ptrues	p7.s, #28
+ptrues	p7.s, mul3
+ptrues	p7.s, mul4
+ptrues	p7.s, vl1
+ptrues	p7.s, vl128
+ptrues	p7.s, vl16
+ptrues	p7.s, vl2
+ptrues	p7.s, vl256
+ptrues	p7.s, vl3
+ptrues	p7.s, vl32
+ptrues	p7.s, vl4
+ptrues	p7.s, vl5
+ptrues	p7.s, vl6
+ptrues	p7.s, vl64
+ptrues	p7.s, vl7
+ptrues	p7.s, vl8
+punpkhi	p0.h, p0.b
+punpkhi	p15.h, p15.b
+punpklo	p0.h, p0.b
+punpklo	p15.h, p15.b
+raddhnb	z0.b, z1.h, z31.h
+raddhnb	z0.h, z1.s, z31.s
+raddhnb	z0.s, z1.d, z31.d
+raddhnt	z0.b, z1.h, z31.h
+raddhnt	z0.h, z1.s, z31.s
+raddhnt	z0.s, z1.d, z31.d
+rax1	z0.d, z1.d, z31.d
+rbit	z0.b, p7/m, z31.b
+rbit	z0.d, p7/m, z31.d
+rbit	z0.h, p7/m, z31.h
+rbit	z0.s, p7/m, z31.s
+rdffr	p0.b
+rdffr	p0.b, p0/z
+rdffr	p15.b
+rdffr	p15.b, p15/z
+rdffrs	p0.b, p0/z
+rdffrs	p15.b, p15/z
+rdvl	x0, #0
+rdvl	x21, #-32
+rdvl	x23, #31
+rdvl	xzr, #-1
+rev	z0.b, z31.b
+rev	z0.d, z31.d
+rev	z0.h, z31.h
+rev	z0.s, z31.s
+revb	z0.d, p7/m, z31.d
+revb	z0.h, p7/m, z31.h
+revb	z0.s, p7/m, z31.s
+revh	z0.d, p7/m, z31.d
+revh	z0.s, p7/m, z31.s
+revw	z0.d, p7/m, z31.d
+rshrnb	z0.b, z0.h, #1
+rshrnb	z0.h, z0.s, #1
+rshrnb	z0.s, z0.d, #1
+rshrnb	z31.b, z31.h, #8
+rshrnb	z31.h, z31.s, #16
+rshrnb	z31.s, z31.d, #32
+rshrnt	z0.b, z0.h, #1
+rshrnt	z0.h, z0.s, #1
+rshrnt	z0.s, z0.d, #1
+rshrnt	z31.b, z31.h, #8
+rshrnt	z31.h, z31.s, #16
+rshrnt	z31.s, z31.d, #32
+rsubhnb	z0.b, z1.h, z31.h
+rsubhnb	z0.h, z1.s, z31.s
+rsubhnb	z0.s, z1.d, z31.d
+rsubhnt	z0.b, z1.h, z31.h
+rsubhnt	z0.h, z1.s, z31.s
+rsubhnt	z0.s, z1.d, z31.d
+saba	z0.b, z1.b, z31.b
+saba	z0.d, z1.d, z31.d
+saba	z0.h, z1.h, z31.h
+saba	z0.s, z1.s, z31.s
+sabalb	z0.d, z1.s, z31.s
+sabalb	z0.h, z1.b, z31.b
+sabalb	z0.s, z1.h, z31.h
+sabalt	z0.d, z1.s, z31.s
+sabalt	z0.h, z1.b, z31.b
+sabalt	z0.s, z1.h, z31.h
+sabd	z31.b, p7/m, z31.b, z31.b
+sabd	z31.d, p7/m, z31.d, z31.d
+sabd	z31.h, p7/m, z31.h, z31.h
+sabd	z31.s, p7/m, z31.s, z31.s
+sabdlb	z0.h, z1.b, z2.b
+sabdlb	z29.s, z30.h, z31.h
+sabdlb	z31.d, z31.s, z31.s
+sabdlt	z0.h, z1.b, z2.b
+sabdlt	z29.s, z30.h, z31.h
+sabdlt	z31.d, z31.s, z31.s
+sadalp	z0.h, p0/m, z1.b
+sadalp	z29.s, p0/m, z30.h
+sadalp	z30.d, p7/m, z31.s
+saddlb	z0.h, z1.b, z2.b
+saddlb	z29.s, z30.h, z31.h
+saddlb	z31.d, z31.s, z31.s
+saddlbt	z0.d, z1.s, z31.s
+saddlbt	z0.h, z1.b, z31.b
+saddlbt	z0.s, z1.h, z31.h
+saddlt	z0.h, z1.b, z2.b
+saddlt	z29.s, z30.h, z31.h
+saddlt	z31.d, z31.s, z31.s
+saddv	d0, p7, z31.b
+saddv	d0, p7, z31.h
+saddv	d0, p7, z31.s
+saddwb	z0.h, z1.h, z2.b
+saddwb	z29.s, z30.s, z31.h
+saddwb	z31.d, z31.d, z31.s
+saddwt	z0.h, z1.h, z2.b
+saddwt	z29.s, z30.s, z31.h
+saddwt	z31.d, z31.d, z31.s
+sbclb	z0.d, z1.d, z31.d
+sbclb	z0.s, z1.s, z31.s
+sbclt	z0.d, z1.d, z31.d
+sbclt	z0.s, z1.s, z31.s
+scvtf	z0.d, p0/m, z0.d
+scvtf	z0.d, p0/m, z0.s
+scvtf	z0.h, p0/m, z0.d
+scvtf	z0.h, p0/m, z0.h
+scvtf	z0.h, p0/m, z0.s
+scvtf	z0.s, p0/m, z0.d
+scvtf	z0.s, p0/m, z0.s
+sdiv	z0.d, p7/m, z0.d, z31.d
+sdiv	z0.s, p7/m, z0.s, z31.s
+sdivr	z0.d, p7/m, z0.d, z31.d
+sdivr	z0.s, p7/m, z0.s, z31.s
+sdot	z0.d, z1.h, z15.h[1]
+sdot	z0.d, z1.h, z31.h
+sdot	z0.s, z1.b, z31.b
+sdot	z0.s, z1.b, z7.b[3]
+sel	z23.b, p11, z13.b, z8.b
+sel	z23.d, p11, z13.d, z8.d
+sel	z23.h, p11, z13.h, z8.h
+sel	z23.s, p11, z13.s, z8.s
+setffr
+shadd	z0.b, p0/m, z0.b, z1.b
+shadd	z0.h, p0/m, z0.h, z1.h
+shadd	z29.s, p7/m, z29.s, z30.s
+shadd	z31.d, p7/m, z31.d, z30.d
+shrnb	z0.b, z0.h, #1
+shrnb	z0.h, z0.s, #1
+shrnb	z0.s, z0.d, #1
+shrnb	z31.b, z31.h, #8
+shrnb	z31.h, z31.s, #16
+shrnb	z31.s, z31.d, #32
+shrnt	z0.b, z0.h, #1
+shrnt	z0.h, z0.s, #1
+shrnt	z0.s, z0.d, #1
+shrnt	z31.b, z31.h, #8
+shrnt	z31.h, z31.s, #16
+shrnt	z31.s, z31.d, #32
+shsub	z0.b, p0/m, z0.b, z1.b
+shsub	z0.h, p0/m, z0.h, z1.h
+shsub	z29.s, p7/m, z29.s, z30.s
+shsub	z31.d, p7/m, z31.d, z30.d
+shsubr	z0.b, p0/m, z0.b, z1.b
+shsubr	z0.h, p0/m, z0.h, z1.h
+shsubr	z29.s, p7/m, z29.s, z30.s
+shsubr	z31.d, p7/m, z31.d, z30.d
+sli	z0.b, z0.b, #0
+sli	z0.d, z0.d, #0
+sli	z0.h, z0.h, #0
+sli	z0.s, z0.s, #0
+sli	z31.b, z31.b, #7
+sli	z31.d, z31.d, #63
+sli	z31.h, z31.h, #15
+sli	z31.s, z31.s, #31
+sm4e	z0.s, z0.s, z31.s
+sm4ekey	z0.s, z1.s, z31.s
+smax	z0.b, z0.b, #-128
+smax	z0.d, z0.d, #-128
+smax	z0.h, z0.h, #-128
+smax	z0.s, z0.s, #-128
+smax	z31.b, p7/m, z31.b, z31.b
+smax	z31.b, z31.b, #127
+smax	z31.d, p7/m, z31.d, z31.d
+smax	z31.d, z31.d, #127
+smax	z31.h, p7/m, z31.h, z31.h
+smax	z31.h, z31.h, #127
+smax	z31.s, p7/m, z31.s, z31.s
+smax	z31.s, z31.s, #127
+smaxp	z0.b, p0/m, z0.b, z1.b
+smaxp	z0.h, p0/m, z0.h, z1.h
+smaxp	z29.s, p7/m, z29.s, z30.s
+smaxp	z31.d, p7/m, z31.d, z30.d
+smaxv	b0, p7, z31.b
+smaxv	d0, p7, z31.d
+smaxv	h0, p7, z31.h
+smaxv	s0, p7, z31.s
+smin	z0.b, z0.b, #-128
+smin	z0.d, z0.d, #-128
+smin	z0.h, z0.h, #-128
+smin	z0.s, z0.s, #-128
+smin	z31.b, p7/m, z31.b, z31.b
+smin	z31.b, z31.b, #127
+smin	z31.d, p7/m, z31.d, z31.d
+smin	z31.d, z31.d, #127
+smin	z31.h, p7/m, z31.h, z31.h
+smin	z31.h, z31.h, #127
+smin	z31.s, p7/m, z31.s, z31.s
+smin	z31.s, z31.s, #127
+sminp	z0.b, p0/m, z0.b, z1.b
+sminp	z0.h, p0/m, z0.h, z1.h
+sminp	z29.s, p7/m, z29.s, z30.s
+sminp	z31.d, p7/m, z31.d, z30.d
+sminv	b0, p7, z31.b
+sminv	d0, p7, z31.d
+sminv	h0, p7, z31.h
+sminv	s0, p7, z31.s
+smlalb	z0.d, z1.s, z15.s[1]
+smlalb	z0.d, z1.s, z31.s
+smlalb	z0.h, z1.b, z31.b
+smlalb	z0.s, z1.h, z31.h
+smlalb	z0.s, z1.h, z7.h[7]
+smlalt	z0.d, z1.s, z15.s[1]
+smlalt	z0.d, z1.s, z31.s
+smlalt	z0.h, z1.b, z31.b
+smlalt	z0.s, z1.h, z31.h
+smlalt	z0.s, z1.h, z7.h[7]
+smlslb	z0.d, z1.s, z15.s[1]
+smlslb	z0.d, z1.s, z31.s
+smlslb	z0.h, z1.b, z31.b
+smlslb	z0.s, z1.h, z31.h
+smlslb	z0.s, z1.h, z7.h[7]
+smlslt	z0.d, z1.s, z15.s[1]
+smlslt	z0.d, z1.s, z31.s
+smlslt	z0.h, z1.b, z31.b
+smlslt	z0.s, z1.h, z31.h
+smlslt	z0.s, z1.h, z7.h[7]
+smmla	z0.s, z1.b, z2.b
+smulh	z0.b, p7/m, z0.b, z31.b
+smulh	z0.b, z1.b, z2.b
+smulh	z0.d, p7/m, z0.d, z31.d
+smulh	z0.h, p7/m, z0.h, z31.h
+smulh	z0.h, z1.h, z2.h
+smulh	z0.s, p7/m, z0.s, z31.s
+smulh	z29.s, z30.s, z31.s
+smulh	z31.d, z31.d, z31.d
+smullb	z0.d, z1.s, z15.s[1]
+smullb	z0.h, z1.b, z2.b
+smullb	z0.s, z1.h, z7.h[7]
+smullb	z29.s, z30.h, z31.h
+smullb	z31.d, z31.s, z31.s
+smullt	z0.d, z1.s, z15.s[1]
+smullt	z0.h, z1.b, z2.b
+smullt	z0.s, z1.h, z7.h[7]
+smullt	z29.s, z30.h, z31.h
+smullt	z31.d, z31.s, z31.s
+splice	z29.b, p7, { z30.b, z31.b }
+splice	z29.d, p7, { z30.d, z31.d }
+splice	z29.h, p7, { z30.h, z31.h }
+splice	z29.s, p7, { z30.s, z31.s }
+splice	z31.b, p7, z31.b, z31.b
+splice	z31.d, p7, z31.d, z31.d
+splice	z31.h, p7, z31.h, z31.h
+splice	z31.s, p7, z31.s, z31.s
+sqabs	z31.b, p7/m, z31.b
+sqabs	z31.d, p7/m, z31.d
+sqabs	z31.h, p7/m, z31.h
+sqabs	z31.s, p7/m, z31.s
+sqadd	z0.b, p0/m, z0.b, z1.b
+sqadd	z0.b, z0.b, #0
+sqadd	z0.b, z0.b, z0.b
+sqadd	z0.d, z0.d, #0
+sqadd	z0.d, z0.d, #0, lsl #8
+sqadd	z0.d, z0.d, z0.d
+sqadd	z0.h, p0/m, z0.h, z1.h
+sqadd	z0.h, z0.h, #0
+sqadd	z0.h, z0.h, #0, lsl #8
+sqadd	z0.h, z0.h, z0.h
+sqadd	z0.s, z0.s, #0
+sqadd	z0.s, z0.s, #0, lsl #8
+sqadd	z0.s, z0.s, z0.s
+sqadd	z29.s, p7/m, z29.s, z30.s
+sqadd	z31.b, z31.b, #255
+sqadd	z31.d, p7/m, z31.d, z30.d
+sqadd	z31.d, z31.d, #65280
+sqadd	z31.h, z31.h, #65280
+sqadd	z31.s, z31.s, #65280
+sqcadd	z0.b, z0.b, z0.b, #90
+sqcadd	z0.d, z0.d, z0.d, #90
+sqcadd	z0.h, z0.h, z0.h, #90
+sqcadd	z0.s, z0.s, z0.s, #90
+sqcadd	z31.b, z31.b, z31.b, #270
+sqcadd	z31.d, z31.d, z31.d, #270
+sqcadd	z31.h, z31.h, z31.h, #270
+sqcadd	z31.s, z31.s, z31.s, #270
+sqdecb	x0
+sqdecb	x0, #14
+sqdecb	x0, all, mul #16
+sqdecb	x0, pow2
+sqdecb	x0, vl1
+sqdecb	x0, w0
+sqdecb	x0, w0, all, mul #16
+sqdecb	x0, w0, pow2
+sqdecb	x0, w0, pow2, mul #16
+sqdecd	x0
+sqdecd	x0, #14
+sqdecd	x0, all, mul #16
+sqdecd	x0, pow2
+sqdecd	x0, vl1
+sqdecd	x0, w0
+sqdecd	x0, w0, all, mul #16
+sqdecd	x0, w0, pow2
+sqdecd	x0, w0, pow2, mul #16
+sqdecd	z0.d
+sqdecd	z0.d, all, mul #16
+sqdecd	z0.d, pow2
+sqdecd	z0.d, pow2, mul #16
+sqdech	x0
+sqdech	x0, #14
+sqdech	x0, all, mul #16
+sqdech	x0, pow2
+sqdech	x0, vl1
+sqdech	x0, w0
+sqdech	x0, w0, all, mul #16
+sqdech	x0, w0, pow2
+sqdech	x0, w0, pow2, mul #16
+sqdech	z0.h
+sqdech	z0.h, all, mul #16
+sqdech	z0.h, pow2
+sqdech	z0.h, pow2, mul #16
+sqdecp	x0, p0.b
+sqdecp	x0, p0.d
+sqdecp	x0, p0.h
+sqdecp	x0, p0.s
+sqdecp	xzr, p15.b, wzr
+sqdecp	xzr, p15.d, wzr
+sqdecp	xzr, p15.h, wzr
+sqdecp	xzr, p15.s, wzr
+sqdecp	z0.d, p0.d
+sqdecp	z0.h, p0.h
+sqdecp	z0.s, p0.s
+sqdecw	x0
+sqdecw	x0, #14
+sqdecw	x0, all, mul #16
+sqdecw	x0, pow2
+sqdecw	x0, vl1
+sqdecw	x0, w0
+sqdecw	x0, w0, all, mul #16
+sqdecw	x0, w0, pow2
+sqdecw	x0, w0, pow2, mul #16
+sqdecw	z0.s
+sqdecw	z0.s, all, mul #16
+sqdecw	z0.s, pow2
+sqdecw	z0.s, pow2, mul #16
+sqdmlalb	z0.d, z1.s, z15.s[3]
+sqdmlalb	z0.d, z1.s, z31.s
+sqdmlalb	z0.h, z1.b, z31.b
+sqdmlalb	z0.s, z1.h, z31.h
+sqdmlalb	z0.s, z1.h, z7.h[7]
+sqdmlalbt	z0.d, z1.s, z31.s
+sqdmlalbt	z0.h, z1.b, z31.b
+sqdmlalbt	z0.s, z1.h, z31.h
+sqdmlalt	z0.d, z1.s, z15.s[3]
+sqdmlalt	z0.d, z1.s, z31.s
+sqdmlalt	z0.h, z1.b, z31.b
+sqdmlalt	z0.s, z1.h, z31.h
+sqdmlalt	z0.s, z1.h, z7.h[7]
+sqdmlslb	z0.d, z1.s, z15.s[3]
+sqdmlslb	z0.d, z1.s, z31.s
+sqdmlslb	z0.h, z1.b, z31.b
+sqdmlslb	z0.s, z1.h, z31.h
+sqdmlslb	z0.s, z1.h, z7.h[7]
+sqdmlslbt	z0.d, z1.s, z31.s
+sqdmlslbt	z0.h, z1.b, z31.b
+sqdmlslbt	z0.s, z1.h, z31.h
+sqdmlslt	z0.d, z1.s, z15.s[3]
+sqdmlslt	z0.d, z1.s, z31.s
+sqdmlslt	z0.h, z1.b, z31.b
+sqdmlslt	z0.s, z1.h, z31.h
+sqdmlslt	z0.s, z1.h, z7.h[7]
+sqdmulh	z0.b, z1.b, z2.b
+sqdmulh	z0.d, z1.d, z15.d[1]
+sqdmulh	z0.h, z1.h, z2.h
+sqdmulh	z0.h, z1.h, z7.h[7]
+sqdmulh	z0.s, z1.s, z7.s[3]
+sqdmulh	z29.s, z30.s, z31.s
+sqdmulh	z31.d, z31.d, z31.d
+sqdmullb	z0.d, z1.s, z15.s[1]
+sqdmullb	z0.h, z1.b, z2.b
+sqdmullb	z0.s, z1.h, z7.h[7]
+sqdmullb	z29.s, z30.h, z31.h
+sqdmullb	z31.d, z31.s, z31.s
+sqdmullt	z0.d, z1.s, z15.s[1]
+sqdmullt	z0.h, z1.b, z2.b
+sqdmullt	z0.s, z1.h, z7.h[7]
+sqdmullt	z29.s, z30.h, z31.h
+sqdmullt	z31.d, z31.s, z31.s
+sqincb	x0
+sqincb	x0, #14
+sqincb	x0, all, mul #16
+sqincb	x0, pow2
+sqincb	x0, vl1
+sqincb	x0, w0
+sqincb	x0, w0, all, mul #16
+sqincb	x0, w0, pow2
+sqincb	x0, w0, pow2, mul #16
+sqincd	x0
+sqincd	x0, #14
+sqincd	x0, all, mul #16
+sqincd	x0, pow2
+sqincd	x0, vl1
+sqincd	x0, w0
+sqincd	x0, w0, all, mul #16
+sqincd	x0, w0, pow2
+sqincd	x0, w0, pow2, mul #16
+sqincd	z0.d
+sqincd	z0.d, all, mul #16
+sqincd	z0.d, pow2
+sqincd	z0.d, pow2, mul #16
+sqinch	x0
+sqinch	x0, #14
+sqinch	x0, all, mul #16
+sqinch	x0, pow2
+sqinch	x0, vl1
+sqinch	x0, w0
+sqinch	x0, w0, all, mul #16
+sqinch	x0, w0, pow2
+sqinch	x0, w0, pow2, mul #16
+sqinch	z0.h
+sqinch	z0.h, all, mul #16
+sqinch	z0.h, pow2
+sqinch	z0.h, pow2, mul #16
+sqincp	x0, p0.b
+sqincp	x0, p0.d
+sqincp	x0, p0.h
+sqincp	x0, p0.s
+sqincp	xzr, p15.b, wzr
+sqincp	xzr, p15.d, wzr
+sqincp	xzr, p15.h, wzr
+sqincp	xzr, p15.s, wzr
+sqincp	z0.d, p0.d
+sqincp	z0.h, p0.h
+sqincp	z0.s, p0.s
+sqincw	x0
+sqincw	x0, #14
+sqincw	x0, all, mul #16
+sqincw	x0, pow2
+sqincw	x0, vl1
+sqincw	x0, w0
+sqincw	x0, w0, all, mul #16
+sqincw	x0, w0, pow2
+sqincw	x0, w0, pow2, mul #16
+sqincw	z0.s
+sqincw	z0.s, all, mul #16
+sqincw	z0.s, pow2
+sqincw	z0.s, pow2, mul #16
+sqneg	z31.b, p7/m, z31.b
+sqneg	z31.d, p7/m, z31.d
+sqneg	z31.h, p7/m, z31.h
+sqneg	z31.s, p7/m, z31.s
+sqrdcmlah	z0.b, z1.b, z2.b, #0
+sqrdcmlah	z0.d, z1.d, z2.d, #0
+sqrdcmlah	z0.h, z1.h, z2.h, #0
+sqrdcmlah	z0.h, z1.h, z2.h[0], #0
+sqrdcmlah	z0.s, z1.s, z2.s, #0
+sqrdcmlah	z0.s, z1.s, z2.s[0], #0
+sqrdcmlah	z15.b, z16.b, z17.b, #270
+sqrdcmlah	z15.d, z16.d, z17.d, #270
+sqrdcmlah	z15.h, z16.h, z17.h, #270
+sqrdcmlah	z15.s, z16.s, z17.s, #270
+sqrdcmlah	z29.b, z30.b, z31.b, #90
+sqrdcmlah	z29.d, z30.d, z31.d, #90
+sqrdcmlah	z29.h, z30.h, z31.h, #90
+sqrdcmlah	z29.s, z30.s, z31.s, #90
+sqrdcmlah	z31.b, z31.b, z31.b, #180
+sqrdcmlah	z31.d, z31.d, z31.d, #180
+sqrdcmlah	z31.h, z30.h, z7.h[0], #180
+sqrdcmlah	z31.h, z31.h, z31.h, #180
+sqrdcmlah	z31.s, z30.s, z7.s[0], #180
+sqrdcmlah	z31.s, z31.s, z31.s, #180
+sqrdmlah	z0.b, z1.b, z31.b
+sqrdmlah	z0.d, z1.d, z15.d[1]
+sqrdmlah	z0.d, z1.d, z31.d
+sqrdmlah	z0.h, z1.h, z31.h
+sqrdmlah	z0.h, z1.h, z7.h[7]
+sqrdmlah	z0.s, z1.s, z31.s
+sqrdmlah	z0.s, z1.s, z7.s[3]
+sqrdmlsh	z0.b, z1.b, z31.b
+sqrdmlsh	z0.d, z1.d, z15.d[1]
+sqrdmlsh	z0.d, z1.d, z31.d
+sqrdmlsh	z0.h, z1.h, z31.h
+sqrdmlsh	z0.h, z1.h, z7.h[7]
+sqrdmlsh	z0.s, z1.s, z31.s
+sqrdmlsh	z0.s, z1.s, z7.s[3]
+sqrdmulh	z0.b, z1.b, z2.b
+sqrdmulh	z0.d, z1.d, z15.d[1]
+sqrdmulh	z0.h, z1.h, z2.h
+sqrdmulh	z0.h, z1.h, z7.h[7]
+sqrdmulh	z0.s, z1.s, z7.s[3]
+sqrdmulh	z29.s, z30.s, z31.s
+sqrdmulh	z31.d, z31.d, z31.d
+sqrshl	z0.b, p0/m, z0.b, z1.b
+sqrshl	z0.h, p0/m, z0.h, z1.h
+sqrshl	z29.s, p7/m, z29.s, z30.s
+sqrshl	z31.d, p7/m, z31.d, z30.d
+sqrshlr	z0.b, p0/m, z0.b, z1.b
+sqrshlr	z0.h, p0/m, z0.h, z1.h
+sqrshlr	z29.s, p7/m, z29.s, z30.s
+sqrshlr	z31.d, p7/m, z31.d, z30.d
+sqrshrnb	z0.b, z0.h, #1
+sqrshrnb	z0.h, z0.s, #1
+sqrshrnb	z0.s, z0.d, #1
+sqrshrnb	z31.b, z31.h, #8
+sqrshrnb	z31.h, z31.s, #16
+sqrshrnb	z31.s, z31.d, #32
+sqrshrnt	z0.b, z0.h, #1
+sqrshrnt	z0.h, z0.s, #1
+sqrshrnt	z0.s, z0.d, #1
+sqrshrnt	z31.b, z31.h, #8
+sqrshrnt	z31.h, z31.s, #16
+sqrshrnt	z31.s, z31.d, #32
+sqrshrunb	z0.b, z0.h, #1
+sqrshrunb	z0.h, z0.s, #1
+sqrshrunb	z0.s, z0.d, #1
+sqrshrunb	z31.b, z31.h, #8
+sqrshrunb	z31.h, z31.s, #16
+sqrshrunb	z31.s, z31.d, #32
+sqrshrunt	z0.b, z0.h, #1
+sqrshrunt	z0.h, z0.s, #1
+sqrshrunt	z0.s, z0.d, #1
+sqrshrunt	z31.b, z31.h, #8
+sqrshrunt	z31.h, z31.s, #16
+sqrshrunt	z31.s, z31.d, #32
+sqshl	z0.b, p0/m, z0.b, #0
+sqshl	z0.b, p0/m, z0.b, z1.b
+sqshl	z0.d, p0/m, z0.d, #0
+sqshl	z0.h, p0/m, z0.h, #0
+sqshl	z0.h, p0/m, z0.h, z1.h
+sqshl	z0.s, p0/m, z0.s, #0
+sqshl	z29.s, p7/m, z29.s, z30.s
+sqshl	z31.b, p0/m, z31.b, #7
+sqshl	z31.d, p0/m, z31.d, #63
+sqshl	z31.d, p7/m, z31.d, z30.d
+sqshl	z31.h, p0/m, z31.h, #15
+sqshl	z31.s, p0/m, z31.s, #31
+sqshlr	z0.b, p0/m, z0.b, z1.b
+sqshlr	z0.h, p0/m, z0.h, z1.h
+sqshlr	z29.s, p7/m, z29.s, z30.s
+sqshlr	z31.d, p7/m, z31.d, z30.d
+sqshlu	z0.b, p0/m, z0.b, #0
+sqshlu	z0.d, p0/m, z0.d, #0
+sqshlu	z0.h, p0/m, z0.h, #0
+sqshlu	z0.s, p0/m, z0.s, #0
+sqshlu	z31.b, p0/m, z31.b, #7
+sqshlu	z31.d, p0/m, z31.d, #63
+sqshlu	z31.h, p0/m, z31.h, #15
+sqshlu	z31.s, p0/m, z31.s, #31
+sqshrnb	z0.b, z0.h, #1
+sqshrnb	z0.h, z0.s, #1
+sqshrnb	z0.s, z0.d, #1
+sqshrnb	z31.b, z31.h, #8
+sqshrnb	z31.h, z31.s, #16
+sqshrnb	z31.s, z31.d, #32
+sqshrnt	z0.b, z0.h, #1
+sqshrnt	z0.h, z0.s, #1
+sqshrnt	z0.s, z0.d, #1
+sqshrnt	z31.b, z31.h, #8
+sqshrnt	z31.h, z31.s, #16
+sqshrnt	z31.s, z31.d, #32
+sqshrunb	z0.b, z0.h, #1
+sqshrunb	z0.h, z0.s, #1
+sqshrunb	z0.s, z0.d, #1
+sqshrunb	z31.b, z31.h, #8
+sqshrunb	z31.h, z31.s, #16
+sqshrunb	z31.s, z31.d, #32
+sqshrunt	z0.b, z0.h, #1
+sqshrunt	z0.h, z0.s, #1
+sqshrunt	z0.s, z0.d, #1
+sqshrunt	z31.b, z31.h, #8
+sqshrunt	z31.h, z31.s, #16
+sqshrunt	z31.s, z31.d, #32
+sqsub	z0.b, p0/m, z0.b, z1.b
+sqsub	z0.b, z0.b, #0
+sqsub	z0.b, z0.b, z0.b
+sqsub	z0.d, z0.d, #0
+sqsub	z0.d, z0.d, #0, lsl #8
+sqsub	z0.d, z0.d, z0.d
+sqsub	z0.h, p0/m, z0.h, z1.h
+sqsub	z0.h, z0.h, #0
+sqsub	z0.h, z0.h, #0, lsl #8
+sqsub	z0.h, z0.h, z0.h
+sqsub	z0.s, z0.s, #0
+sqsub	z0.s, z0.s, #0, lsl #8
+sqsub	z0.s, z0.s, z0.s
+sqsub	z29.s, p7/m, z29.s, z30.s
+sqsub	z31.b, z31.b, #255
+sqsub	z31.d, p7/m, z31.d, z30.d
+sqsub	z31.d, z31.d, #65280
+sqsub	z31.h, z31.h, #65280
+sqsub	z31.s, z31.s, #65280
+sqsubr	z0.b, p0/m, z0.b, z1.b
+sqsubr	z0.h, p0/m, z0.h, z1.h
+sqsubr	z29.s, p7/m, z29.s, z30.s
+sqsubr	z31.d, p7/m, z31.d, z30.d
+sqxtnb	z0.b, z31.h
+sqxtnb	z0.h, z31.s
+sqxtnb	z0.s, z31.d
+sqxtnt	z0.b, z31.h
+sqxtnt	z0.h, z31.s
+sqxtnt	z0.s, z31.d
+sqxtunb	z0.b, z31.h
+sqxtunb	z0.h, z31.s
+sqxtunb	z0.s, z31.d
+sqxtunt	z0.b, z31.h
+sqxtunt	z0.h, z31.s
+sqxtunt	z0.s, z31.d
+srhadd	z0.b, p0/m, z0.b, z1.b
+srhadd	z0.h, p0/m, z0.h, z1.h
+srhadd	z29.s, p7/m, z29.s, z30.s
+srhadd	z31.d, p7/m, z31.d, z30.d
+sri	z0.b, z0.b, #1
+sri	z0.d, z0.d, #1
+sri	z0.h, z0.h, #1
+sri	z0.s, z0.s, #1
+sri	z31.b, z31.b, #8
+sri	z31.d, z31.d, #64
+sri	z31.h, z31.h, #16
+sri	z31.s, z31.s, #32
+srshl	z0.b, p0/m, z0.b, z1.b
+srshl	z0.h, p0/m, z0.h, z1.h
+srshl	z29.s, p7/m, z29.s, z30.s
+srshl	z31.d, p7/m, z31.d, z30.d
+srshlr	z0.b, p0/m, z0.b, z1.b
+srshlr	z0.h, p0/m, z0.h, z1.h
+srshlr	z29.s, p7/m, z29.s, z30.s
+srshlr	z31.d, p7/m, z31.d, z30.d
+srshr	z0.b, p0/m, z0.b, #1
+srshr	z0.d, p0/m, z0.d, #1
+srshr	z0.h, p0/m, z0.h, #1
+srshr	z0.s, p0/m, z0.s, #1
+srshr	z31.b, p0/m, z31.b, #8
+srshr	z31.d, p0/m, z31.d, #64
+srshr	z31.h, p0/m, z31.h, #16
+srshr	z31.s, p0/m, z31.s, #32
+srsra	z0.b, z0.b, #1
+srsra	z0.d, z0.d, #1
+srsra	z0.h, z0.h, #1
+srsra	z0.s, z0.s, #1
+srsra	z31.b, z31.b, #8
+srsra	z31.d, z31.d, #64
+srsra	z31.h, z31.h, #16
+srsra	z31.s, z31.s, #32
+sshllb	z0.d, z0.s, #0
+sshllb	z0.h, z0.b, #0
+sshllb	z0.s, z0.h, #0
+sshllb	z31.d, z31.s, #31
+sshllb	z31.h, z31.b, #7
+sshllb	z31.s, z31.h, #15
+sshllt	z0.d, z0.s, #0
+sshllt	z0.h, z0.b, #0
+sshllt	z0.s, z0.h, #0
+sshllt	z31.d, z31.s, #31
+sshllt	z31.h, z31.b, #7
+sshllt	z31.s, z31.h, #15
+ssra	z0.b, z0.b, #1
+ssra	z0.d, z0.d, #1
+ssra	z0.h, z0.h, #1
+ssra	z0.s, z0.s, #1
+ssra	z31.b, z31.b, #8
+ssra	z31.d, z31.d, #64
+ssra	z31.h, z31.h, #16
+ssra	z31.s, z31.s, #32
+ssublb	z0.h, z1.b, z2.b
+ssublb	z29.s, z30.h, z31.h
+ssublb	z31.d, z31.s, z31.s
+ssublbt	z0.d, z1.s, z31.s
+ssublbt	z0.h, z1.b, z31.b
+ssublbt	z0.s, z1.h, z31.h
+ssublt	z0.h, z1.b, z2.b
+ssublt	z29.s, z30.h, z31.h
+ssublt	z31.d, z31.s, z31.s
+ssubltb	z0.d, z1.s, z31.s
+ssubltb	z0.h, z1.b, z31.b
+ssubltb	z0.s, z1.h, z31.h
+ssubwb	z0.h, z1.h, z2.b
+ssubwb	z29.s, z30.s, z31.h
+ssubwb	z31.d, z31.d, z31.s
+ssubwt	z0.h, z1.h, z2.b
+ssubwt	z29.s, z30.s, z31.h
+ssubwt	z31.d, z31.d, z31.s
+st1b	{ z0.b }, p0, [x0, x0]
+st1b	{ z0.b }, p0, [x0]
+st1b	{ z0.d }, p0, [x0, x0]
+st1b	{ z0.d }, p0, [x0, z0.d, sxtw]
+st1b	{ z0.d }, p0, [x0, z0.d, uxtw]
+st1b	{ z0.d }, p0, [x0, z0.d]
+st1b	{ z0.d }, p0, [x0]
+st1b	{ z0.d }, p7, [z0.d]
+st1b	{ z0.h }, p0, [x0, x0]
+st1b	{ z0.h }, p0, [x0]
+st1b	{ z0.s }, p0, [x0, x0]
+st1b	{ z0.s }, p0, [x0, z0.s, sxtw]
+st1b	{ z0.s }, p0, [x0, z0.s, uxtw]
+st1b	{ z0.s }, p0, [x0]
+st1b	{ z0.s }, p7, [z0.s]
+st1b	{ z21.b }, p5, [x10, #5, mul vl]
+st1b	{ z21.d }, p5, [x10, #5, mul vl]
+st1b	{ z21.h }, p5, [x10, #5, mul vl]
+st1b	{ z21.s }, p5, [x10, #5, mul vl]
+st1b	{ z31.b }, p7, [sp, #-1, mul vl]
+st1b	{ z31.d }, p7, [sp, #-1, mul vl]
+st1b	{ z31.d }, p7, [z31.d, #31]
+st1b	{ z31.h }, p7, [sp, #-1, mul vl]
+st1b	{ z31.s }, p7, [sp, #-1, mul vl]
+st1b	{ z31.s }, p7, [z31.s, #31]
+st1d	{ z0.d }, p0, [x0, x0, lsl #3]
+st1d	{ z0.d }, p0, [x0, z0.d, lsl #3]
+st1d	{ z0.d }, p0, [x0, z0.d, sxtw #3]
+st1d	{ z0.d }, p0, [x0, z0.d, sxtw]
+st1d	{ z0.d }, p0, [x0, z0.d, uxtw #3]
+st1d	{ z0.d }, p0, [x0, z0.d, uxtw]
+st1d	{ z0.d }, p0, [x0, z0.d]
+st1d	{ z0.d }, p0, [x0]
+st1d	{ z0.d }, p7, [z0.d]
+st1d	{ z21.d }, p5, [x10, #5, mul vl]
+st1d	{ z31.d }, p7, [sp, #-1, mul vl]
+st1d	{ z31.d }, p7, [z31.d, #248]
+st1h	{ z0.d }, p0, [x0, x0, lsl #1]
+st1h	{ z0.d }, p0, [x0, z0.d, lsl #1]
+st1h	{ z0.d }, p0, [x0, z0.d, sxtw #1]
+st1h	{ z0.d }, p0, [x0, z0.d, sxtw]
+st1h	{ z0.d }, p0, [x0, z0.d, uxtw #1]
+st1h	{ z0.d }, p0, [x0, z0.d, uxtw]
+st1h	{ z0.d }, p0, [x0, z0.d]
+st1h	{ z0.d }, p0, [x0]
+st1h	{ z0.d }, p7, [z0.d]
+st1h	{ z0.h }, p0, [x0, x0, lsl #1]
+st1h	{ z0.h }, p0, [x0]
+st1h	{ z0.s }, p0, [x0, x0, lsl #1]
+st1h	{ z0.s }, p0, [x0, z0.s, sxtw #1]
+st1h	{ z0.s }, p0, [x0, z0.s, sxtw]
+st1h	{ z0.s }, p0, [x0, z0.s, uxtw #1]
+st1h	{ z0.s }, p0, [x0, z0.s, uxtw]
+st1h	{ z0.s }, p0, [x0]
+st1h	{ z0.s }, p7, [z0.s]
+st1h	{ z21.d }, p5, [x10, #5, mul vl]
+st1h	{ z21.h }, p5, [x10, #5, mul vl]
+st1h	{ z21.s }, p5, [x10, #5, mul vl]
+st1h	{ z31.d }, p7, [sp, #-1, mul vl]
+st1h	{ z31.d }, p7, [z31.d, #62]
+st1h	{ z31.h }, p7, [sp, #-1, mul vl]
+st1h	{ z31.s }, p7, [sp, #-1, mul vl]
+st1h	{ z31.s }, p7, [z31.s, #62]
+st1w	{ z0.d }, p0, [x0, x0, lsl #2]
+st1w	{ z0.d }, p0, [x0, z0.d, lsl #2]
+st1w	{ z0.d }, p0, [x0, z0.d, sxtw #2]
+st1w	{ z0.d }, p0, [x0, z0.d, sxtw]
+st1w	{ z0.d }, p0, [x0, z0.d, uxtw #2]
+st1w	{ z0.d }, p0, [x0, z0.d, uxtw]
+st1w	{ z0.d }, p0, [x0, z0.d]
+st1w	{ z0.d }, p0, [x0]
+st1w	{ z0.d }, p7, [z0.d]
+st1w	{ z0.s }, p0, [x0, x0, lsl #2]
+st1w	{ z0.s }, p0, [x0, z0.s, sxtw #2]
+st1w	{ z0.s }, p0, [x0, z0.s, sxtw]
+st1w	{ z0.s }, p0, [x0, z0.s, uxtw #2]
+st1w	{ z0.s }, p0, [x0, z0.s, uxtw]
+st1w	{ z0.s }, p0, [x0]
+st1w	{ z0.s }, p7, [z0.s]
+st1w	{ z21.d }, p5, [x10, #5, mul vl]
+st1w	{ z21.s }, p5, [x10, #5, mul vl]
+st1w	{ z31.d }, p7, [sp, #-1, mul vl]
+st1w	{ z31.d }, p7, [z31.d, #124]
+st1w	{ z31.s }, p7, [sp, #-1, mul vl]
+st1w	{ z31.s }, p7, [z31.s, #124]
+st2b	{ z0.b, z1.b }, p0, [x0, x0]
+st2b	{ z0.b, z1.b }, p0, [x0]
+st2b	{ z21.b, z22.b }, p5, [x10, #10, mul vl]
+st2b	{ z23.b, z24.b }, p3, [x13, #-16, mul vl]
+st2b	{ z5.b, z6.b }, p3, [x17, x16]
+st2d	{ z0.d, z1.d }, p0, [x0, x0, lsl #3]
+st2d	{ z0.d, z1.d }, p0, [x0]
+st2d	{ z21.d, z22.d }, p5, [x10, #10, mul vl]
+st2d	{ z23.d, z24.d }, p3, [x13, #-16, mul vl]
+st2d	{ z5.d, z6.d }, p3, [x17, x16, lsl #3]
+st2h	{ z0.h, z1.h }, p0, [x0, x0, lsl #1]
+st2h	{ z0.h, z1.h }, p0, [x0]
+st2h	{ z21.h, z22.h }, p5, [x10, #10, mul vl]
+st2h	{ z23.h, z24.h }, p3, [x13, #-16, mul vl]
+st2h	{ z5.h, z6.h }, p3, [x17, x16, lsl #1]
+st2w	{ z0.s, z1.s }, p0, [x0, x0, lsl #2]
+st2w	{ z0.s, z1.s }, p0, [x0]
+st2w	{ z21.s, z22.s }, p5, [x10, #10, mul vl]
+st2w	{ z23.s, z24.s }, p3, [x13, #-16, mul vl]
+st2w	{ z5.s, z6.s }, p3, [x17, x16, lsl #2]
+st3b	{ z0.b, z1.b, z2.b }, p0, [x0, x0]
+st3b	{ z0.b, z1.b, z2.b }, p0, [x0]
+st3b	{ z21.b, z22.b, z23.b }, p5, [x10, #15, mul vl]
+st3b	{ z23.b, z24.b, z25.b }, p3, [x13, #-24, mul vl]
+st3b	{ z5.b, z6.b, z7.b }, p3, [x17, x16]
+st3d	{ z0.d, z1.d, z2.d }, p0, [x0, x0, lsl #3]
+st3d	{ z0.d, z1.d, z2.d }, p0, [x0]
+st3d	{ z21.d, z22.d, z23.d }, p5, [x10, #15, mul vl]
+st3d	{ z23.d, z24.d, z25.d }, p3, [x13, #-24, mul vl]
+st3d	{ z5.d, z6.d, z7.d }, p3, [x17, x16, lsl #3]
+st3h	{ z0.h, z1.h, z2.h }, p0, [x0, x0, lsl #1]
+st3h	{ z0.h, z1.h, z2.h }, p0, [x0]
+st3h	{ z21.h, z22.h, z23.h }, p5, [x10, #15, mul vl]
+st3h	{ z23.h, z24.h, z25.h }, p3, [x13, #-24, mul vl]
+st3h	{ z5.h, z6.h, z7.h }, p3, [x17, x16, lsl #1]
+st3w	{ z0.s, z1.s, z2.s }, p0, [x0, x0, lsl #2]
+st3w	{ z0.s, z1.s, z2.s }, p0, [x0]
+st3w	{ z21.s, z22.s, z23.s }, p5, [x10, #15, mul vl]
+st3w	{ z23.s, z24.s, z25.s }, p3, [x13, #-24, mul vl]
+st3w	{ z5.s, z6.s, z7.s }, p3, [x17, x16, lsl #2]
+st4b	{ z0.b, z1.b, z2.b, z3.b }, p0, [x0, x0]
+st4b	{ z0.b, z1.b, z2.b, z3.b }, p0, [x0]
+st4b	{ z21.b, z22.b, z23.b, z24.b }, p5, [x10, #20, mul vl]
+st4b	{ z23.b, z24.b, z25.b, z26.b }, p3, [x13, #-32, mul vl]
+st4b	{ z5.b, z6.b, z7.b, z8.b }, p3, [x17, x16]
+st4d	{ z0.d, z1.d, z2.d, z3.d }, p0, [x0, x0, lsl #3]
+st4d	{ z0.d, z1.d, z2.d, z3.d }, p0, [x0]
+st4d	{ z21.d, z22.d, z23.d, z24.d }, p5, [x10, #20, mul vl]
+st4d	{ z23.d, z24.d, z25.d, z26.d }, p3, [x13, #-32, mul vl]
+st4d	{ z5.d, z6.d, z7.d, z8.d }, p3, [x17, x16, lsl #3]
+st4h	{ z0.h, z1.h, z2.h, z3.h }, p0, [x0, x0, lsl #1]
+st4h	{ z0.h, z1.h, z2.h, z3.h }, p0, [x0]
+st4h	{ z21.h, z22.h, z23.h, z24.h }, p5, [x10, #20, mul vl]
+st4h	{ z23.h, z24.h, z25.h, z26.h }, p3, [x13, #-32, mul vl]
+st4h	{ z5.h, z6.h, z7.h, z8.h }, p3, [x17, x16, lsl #1]
+st4w	{ z0.s, z1.s, z2.s, z3.s }, p0, [x0, x0, lsl #2]
+st4w	{ z0.s, z1.s, z2.s, z3.s }, p0, [x0]
+st4w	{ z21.s, z22.s, z23.s, z24.s }, p5, [x10, #20, mul vl]
+st4w	{ z23.s, z24.s, z25.s, z26.s }, p3, [x13, #-32, mul vl]
+st4w	{ z5.s, z6.s, z7.s, z8.s }, p3, [x17, x16, lsl #2]
+stnt1b	{ z0.b }, p0, [x0, x0]
+stnt1b	{ z0.b }, p0, [x0]
+stnt1b	{ z0.d }, p0, [z1.d]
+stnt1b	{ z0.s }, p0, [z1.s]
+stnt1b	{ z21.b }, p5, [x10, #7, mul vl]
+stnt1b	{ z23.b }, p3, [x13, #-8, mul vl]
+stnt1b	{ z31.d }, p7, [z31.d, x0]
+stnt1b	{ z31.d }, p7, [z31.d]
+stnt1b	{ z31.s }, p7, [z31.s, x0]
+stnt1b	{ z31.s }, p7, [z31.s]
+stnt1d	{ z0.d }, p0, [x0, x0, lsl #3]
+stnt1d	{ z0.d }, p0, [x0]
+stnt1d	{ z0.d }, p0, [z1.d]
+stnt1d	{ z21.d }, p5, [x10, #7, mul vl]
+stnt1d	{ z23.d }, p3, [x13, #-8, mul vl]
+stnt1d	{ z31.d }, p7, [z31.d, x0]
+stnt1d	{ z31.d }, p7, [z31.d]
+stnt1h	{ z0.d }, p0, [z1.d]
+stnt1h	{ z0.h }, p0, [x0, x0, lsl #1]
+stnt1h	{ z0.h }, p0, [x0]
+stnt1h	{ z0.s }, p0, [z1.s]
+stnt1h	{ z21.h }, p5, [x10, #7, mul vl]
+stnt1h	{ z23.h }, p3, [x13, #-8, mul vl]
+stnt1h	{ z31.d }, p7, [z31.d, x0]
+stnt1h	{ z31.d }, p7, [z31.d]
+stnt1h	{ z31.s }, p7, [z31.s, x0]
+stnt1h	{ z31.s }, p7, [z31.s]
+stnt1w	{ z0.d }, p0, [z1.d]
+stnt1w	{ z0.s }, p0, [x0, x0, lsl #2]
+stnt1w	{ z0.s }, p0, [x0]
+stnt1w	{ z0.s }, p0, [z1.s]
+stnt1w	{ z21.s }, p5, [x10, #7, mul vl]
+stnt1w	{ z23.s }, p3, [x13, #-8, mul vl]
+stnt1w	{ z31.d }, p7, [z31.d, x0]
+stnt1w	{ z31.d }, p7, [z31.d]
+stnt1w	{ z31.s }, p7, [z31.s, x0]
+stnt1w	{ z31.s }, p7, [z31.s]
+str	p0, [x0]
+str	p15, [sp, #-256, mul vl]
+str	p5, [x10, #255, mul vl]
+str	z0, [x0]
+str	z21, [x10, #-256, mul vl]
+str	z31, [sp, #255, mul vl]
+sub	z0.b, p0/m, z0.b, z0.b
+sub	z0.b, z0.b, #0
+sub	z0.b, z0.b, z0.b
+sub	z0.d, p0/m, z0.d, z0.d
+sub	z0.d, z0.d, #0
+sub	z0.d, z0.d, #0, lsl #8
+sub	z0.d, z0.d, z0.d
+sub	z0.h, p0/m, z0.h, z0.h
+sub	z0.h, z0.h, #0
+sub	z0.h, z0.h, #0, lsl #8
+sub	z0.h, z0.h, z0.h
+sub	z0.s, p0/m, z0.s, z0.s
+sub	z0.s, z0.s, #0
+sub	z0.s, z0.s, #0, lsl #8
+sub	z0.s, z0.s, z0.s
+sub	z21.b, p5/m, z21.b, z10.b
+sub	z21.b, z10.b, z21.b
+sub	z21.d, p5/m, z21.d, z10.d
+sub	z21.d, z10.d, z21.d
+sub	z21.h, p5/m, z21.h, z10.h
+sub	z21.h, z10.h, z21.h
+sub	z21.s, p5/m, z21.s, z10.s
+sub	z21.s, z10.s, z21.s
+sub	z23.b, p3/m, z23.b, z13.b
+sub	z23.b, z13.b, z8.b
+sub	z23.d, p3/m, z23.d, z13.d
+sub	z23.d, z13.d, z8.d
+sub	z23.h, p3/m, z23.h, z13.h
+sub	z23.h, z13.h, z8.h
+sub	z23.s, p3/m, z23.s, z13.s
+sub	z23.s, z13.s, z8.s
+sub	z31.b, p7/m, z31.b, z31.b
+sub	z31.b, z31.b, #255
+sub	z31.b, z31.b, z31.b
+sub	z31.d, p7/m, z31.d, z31.d
+sub	z31.d, z31.d, #65280
+sub	z31.d, z31.d, z31.d
+sub	z31.h, p7/m, z31.h, z31.h
+sub	z31.h, z31.h, #65280
+sub	z31.h, z31.h, z31.h
+sub	z31.s, p7/m, z31.s, z31.s
+sub	z31.s, z31.s, #65280
+sub	z31.s, z31.s, z31.s
+subhnb	z0.b, z1.h, z31.h
+subhnb	z0.h, z1.s, z31.s
+subhnb	z0.s, z1.d, z31.d
+subhnt	z0.b, z1.h, z31.h
+subhnt	z0.h, z1.s, z31.s
+subhnt	z0.s, z1.d, z31.d
+subr	z0.b, p0/m, z0.b, z0.b
+subr	z0.b, z0.b, #0
+subr	z0.d, p0/m, z0.d, z0.d
+subr	z0.d, z0.d, #0
+subr	z0.d, z0.d, #0, lsl #8
+subr	z0.h, p0/m, z0.h, z0.h
+subr	z0.h, z0.h, #0
+subr	z0.h, z0.h, #0, lsl #8
+subr	z0.s, p0/m, z0.s, z0.s
+subr	z0.s, z0.s, #0
+subr	z0.s, z0.s, #0, lsl #8
+subr	z31.b, z31.b, #255
+subr	z31.d, z31.d, #65280
+subr	z31.h, z31.h, #65280
+subr	z31.s, z31.s, #65280
+sunpkhi	z31.d, z31.s
+sunpkhi	z31.h, z31.b
+sunpkhi	z31.s, z31.h
+sunpklo	z31.d, z31.s
+sunpklo	z31.h, z31.b
+sunpklo	z31.s, z31.h
+suqadd	z0.b, p0/m, z0.b, z1.b
+suqadd	z0.h, p0/m, z0.h, z1.h
+suqadd	z29.s, p7/m, z29.s, z30.s
+suqadd	z31.d, p7/m, z31.d, z30.d
+sxtb	z0.d, p0/m, z0.d
+sxtb	z0.h, p0/m, z0.h
+sxtb	z0.s, p0/m, z0.s
+sxtb	z31.d, p7/m, z31.d
+sxtb	z31.h, p7/m, z31.h
+sxtb	z31.s, p7/m, z31.s
+sxth	z0.d, p0/m, z0.d
+sxth	z0.s, p0/m, z0.s
+sxth	z31.d, p7/m, z31.d
+sxth	z31.s, p7/m, z31.s
+sxtw	z0.d, p0/m, z0.d
+sxtw	z31.d, p7/m, z31.d
+tbl	z28.b, { z29.b, z30.b }, z31.b
+tbl	z28.d, { z29.d, z30.d }, z31.d
+tbl	z28.h, { z29.h, z30.h }, z31.h
+tbl	z28.s, { z29.s, z30.s }, z31.s
+tbl	z31.b, { z31.b }, z31.b
+tbl	z31.d, { z31.d }, z31.d
+tbl	z31.h, { z31.h }, z31.h
+tbl	z31.s, { z31.s }, z31.s
+tbx	z31.b, z31.b, z31.b
+tbx	z31.d, z31.d, z31.d
+tbx	z31.h, z31.h, z31.h
+tbx	z31.s, z31.s, z31.s
+trn1	p15.b, p15.b, p15.b
+trn1	p15.d, p15.d, p15.d
+trn1	p15.h, p15.h, p15.h
+trn1	p15.s, p15.s, p15.s
+trn1	z31.b, z31.b, z31.b
+trn1	z31.d, z31.d, z31.d
+trn1	z31.h, z31.h, z31.h
+trn1	z31.s, z31.s, z31.s
+trn2	p15.b, p15.b, p15.b
+trn2	p15.d, p15.d, p15.d
+trn2	p15.h, p15.h, p15.h
+trn2	p15.s, p15.s, p15.s
+trn2	z31.b, z31.b, z31.b
+trn2	z31.d, z31.d, z31.d
+trn2	z31.h, z31.h, z31.h
+trn2	z31.s, z31.s, z31.s
+uaba	z0.b, z1.b, z31.b
+uaba	z0.d, z1.d, z31.d
+uaba	z0.h, z1.h, z31.h
+uaba	z0.s, z1.s, z31.s
+uabalb	z0.d, z1.s, z31.s
+uabalb	z0.h, z1.b, z31.b
+uabalb	z0.s, z1.h, z31.h
+uabalt	z0.d, z1.s, z31.s
+uabalt	z0.h, z1.b, z31.b
+uabalt	z0.s, z1.h, z31.h
+uabd	z31.b, p7/m, z31.b, z31.b
+uabd	z31.d, p7/m, z31.d, z31.d
+uabd	z31.h, p7/m, z31.h, z31.h
+uabd	z31.s, p7/m, z31.s, z31.s
+uabdlb	z0.h, z1.b, z2.b
+uabdlb	z29.s, z30.h, z31.h
+uabdlb	z31.d, z31.s, z31.s
+uabdlt	z0.h, z1.b, z2.b
+uabdlt	z29.s, z30.h, z31.h
+uabdlt	z31.d, z31.s, z31.s
+uadalp	z0.h, p0/m, z1.b
+uadalp	z29.s, p0/m, z30.h
+uadalp	z30.d, p7/m, z31.s
+uaddlb	z0.h, z1.b, z2.b
+uaddlb	z29.s, z30.h, z31.h
+uaddlb	z31.d, z31.s, z31.s
+uaddlt	z0.h, z1.b, z2.b
+uaddlt	z29.s, z30.h, z31.h
+uaddlt	z31.d, z31.s, z31.s
+uaddv	d0, p7, z31.b
+uaddv	d0, p7, z31.d
+uaddv	d0, p7, z31.h
+uaddv	d0, p7, z31.s
+uaddwb	z0.h, z1.h, z2.b
+uaddwb	z29.s, z30.s, z31.h
+uaddwb	z31.d, z31.d, z31.s
+uaddwt	z0.h, z1.h, z2.b
+uaddwt	z29.s, z30.s, z31.h
+uaddwt	z31.d, z31.d, z31.s
+ucvtf	z0.d, p0/m, z0.d
+ucvtf	z0.d, p0/m, z0.s
+ucvtf	z0.h, p0/m, z0.d
+ucvtf	z0.h, p0/m, z0.h
+ucvtf	z0.h, p0/m, z0.s
+ucvtf	z0.s, p0/m, z0.d
+ucvtf	z0.s, p0/m, z0.s
+udiv	z0.d, p7/m, z0.d, z31.d
+udiv	z0.s, p7/m, z0.s, z31.s
+udivr	z0.d, p7/m, z0.d, z31.d
+udivr	z0.s, p7/m, z0.s, z31.s
+udot	z0.d, z1.h, z15.h[1]
+udot	z0.d, z1.h, z31.h
+udot	z0.s, z1.b, z31.b
+udot	z0.s, z1.b, z7.b[3]
+uhadd	z0.b, p0/m, z0.b, z1.b
+uhadd	z0.h, p0/m, z0.h, z1.h
+uhadd	z29.s, p7/m, z29.s, z30.s
+uhadd	z31.d, p7/m, z31.d, z30.d
+uhsub	z0.b, p0/m, z0.b, z1.b
+uhsub	z0.h, p0/m, z0.h, z1.h
+uhsub	z29.s, p7/m, z29.s, z30.s
+uhsub	z31.d, p7/m, z31.d, z30.d
+uhsubr	z0.b, p0/m, z0.b, z1.b
+uhsubr	z0.h, p0/m, z0.h, z1.h
+uhsubr	z29.s, p7/m, z29.s, z30.s
+uhsubr	z31.d, p7/m, z31.d, z30.d
+umax	z0.b, z0.b, #0
+umax	z31.b, p7/m, z31.b, z31.b
+umax	z31.b, z31.b, #255
+umax	z31.d, p7/m, z31.d, z31.d
+umax	z31.h, p7/m, z31.h, z31.h
+umax	z31.s, p7/m, z31.s, z31.s
+umaxp	z0.b, p0/m, z0.b, z1.b
+umaxp	z0.h, p0/m, z0.h, z1.h
+umaxp	z29.s, p7/m, z29.s, z30.s
+umaxp	z31.d, p7/m, z31.d, z30.d
+umaxv	b0, p7, z31.b
+umaxv	d0, p7, z31.d
+umaxv	h0, p7, z31.h
+umaxv	s0, p7, z31.s
+umin	z0.b, z0.b, #0
+umin	z31.b, p7/m, z31.b, z31.b
+umin	z31.b, z31.b, #255
+umin	z31.d, p7/m, z31.d, z31.d
+umin	z31.h, p7/m, z31.h, z31.h
+umin	z31.s, p7/m, z31.s, z31.s
+uminp	z0.b, p0/m, z0.b, z1.b
+uminp	z0.h, p0/m, z0.h, z1.h
+uminp	z29.s, p7/m, z29.s, z30.s
+uminp	z31.d, p7/m, z31.d, z30.d
+uminv	b0, p7, z31.b
+uminv	d0, p7, z31.d
+uminv	h0, p7, z31.h
+uminv	s0, p7, z31.s
+umlalb	z0.d, z1.s, z15.s[1]
+umlalb	z0.d, z1.s, z31.s
+umlalb	z0.h, z1.b, z31.b
+umlalb	z0.s, z1.h, z31.h
+umlalb	z0.s, z1.h, z7.h[7]
+umlalt	z0.d, z1.s, z15.s[1]
+umlalt	z0.d, z1.s, z31.s
+umlalt	z0.h, z1.b, z31.b
+umlalt	z0.s, z1.h, z31.h
+umlalt	z0.s, z1.h, z7.h[7]
+umlslb	z0.d, z1.s, z15.s[1]
+umlslb	z0.d, z1.s, z31.s
+umlslb	z0.h, z1.b, z31.b
+umlslb	z0.s, z1.h, z31.h
+umlslb	z0.s, z1.h, z7.h[7]
+umlslt	z0.d, z1.s, z15.s[1]
+umlslt	z0.d, z1.s, z31.s
+umlslt	z0.h, z1.b, z31.b
+umlslt	z0.s, z1.h, z31.h
+umlslt	z0.s, z1.h, z7.h[7]
+ummla	z0.s, z1.b, z2.b
+umulh	z0.b, p7/m, z0.b, z31.b
+umulh	z0.b, z1.b, z2.b
+umulh	z0.d, p7/m, z0.d, z31.d
+umulh	z0.h, p7/m, z0.h, z31.h
+umulh	z0.h, z1.h, z2.h
+umulh	z0.s, p7/m, z0.s, z31.s
+umulh	z29.s, z30.s, z31.s
+umulh	z31.d, z31.d, z31.d
+umullb	z0.d, z1.s, z15.s[1]
+umullb	z0.h, z1.b, z2.b
+umullb	z0.s, z1.h, z7.h[7]
+umullb	z29.s, z30.h, z31.h
+umullb	z31.d, z31.s, z31.s
+umullt	z0.d, z1.s, z15.s[1]
+umullt	z0.h, z1.b, z2.b
+umullt	z0.s, z1.h, z7.h[7]
+umullt	z29.s, z30.h, z31.h
+umullt	z31.d, z31.s, z31.s
+uqadd	z0.b, p0/m, z0.b, z1.b
+uqadd	z0.b, z0.b, #0
+uqadd	z0.b, z0.b, z0.b
+uqadd	z0.d, z0.d, #0
+uqadd	z0.d, z0.d, #0, lsl #8
+uqadd	z0.d, z0.d, z0.d
+uqadd	z0.h, p0/m, z0.h, z1.h
+uqadd	z0.h, z0.h, #0
+uqadd	z0.h, z0.h, #0, lsl #8
+uqadd	z0.h, z0.h, z0.h
+uqadd	z0.s, z0.s, #0
+uqadd	z0.s, z0.s, #0, lsl #8
+uqadd	z0.s, z0.s, z0.s
+uqadd	z29.s, p7/m, z29.s, z30.s
+uqadd	z31.b, z31.b, #255
+uqadd	z31.d, p7/m, z31.d, z30.d
+uqadd	z31.d, z31.d, #65280
+uqadd	z31.h, z31.h, #65280
+uqadd	z31.s, z31.s, #65280
+uqdecb	w0
+uqdecb	w0, all, mul #16
+uqdecb	w0, pow2
+uqdecb	w0, pow2, mul #16
+uqdecb	x0
+uqdecb	x0, #14
+uqdecb	x0, all, mul #16
+uqdecb	x0, pow2
+uqdecb	x0, vl1
+uqdecd	w0
+uqdecd	w0, all, mul #16
+uqdecd	w0, pow2
+uqdecd	w0, pow2, mul #16
+uqdecd	x0
+uqdecd	x0, #14
+uqdecd	x0, all, mul #16
+uqdecd	x0, pow2
+uqdecd	x0, vl1
+uqdecd	z0.d
+uqdecd	z0.d, all, mul #16
+uqdecd	z0.d, pow2
+uqdecd	z0.d, pow2, mul #16
+uqdech	w0
+uqdech	w0, all, mul #16
+uqdech	w0, pow2
+uqdech	w0, pow2, mul #16
+uqdech	x0
+uqdech	x0, #14
+uqdech	x0, all, mul #16
+uqdech	x0, pow2
+uqdech	x0, vl1
+uqdech	z0.h
+uqdech	z0.h, all, mul #16
+uqdech	z0.h, pow2
+uqdech	z0.h, pow2, mul #16
+uqdecp	wzr, p15.b
+uqdecp	wzr, p15.d
+uqdecp	wzr, p15.h
+uqdecp	wzr, p15.s
+uqdecp	x0, p0.b
+uqdecp	x0, p0.d
+uqdecp	x0, p0.h
+uqdecp	x0, p0.s
+uqdecp	z0.d, p0.d
+uqdecp	z0.h, p0.h
+uqdecp	z0.s, p0.s
+uqdecw	w0
+uqdecw	w0, all, mul #16
+uqdecw	w0, pow2
+uqdecw	w0, pow2, mul #16
+uqdecw	x0
+uqdecw	x0, #14
+uqdecw	x0, all, mul #16
+uqdecw	x0, pow2
+uqdecw	x0, vl1
+uqdecw	z0.s
+uqdecw	z0.s, all, mul #16
+uqdecw	z0.s, pow2
+uqdecw	z0.s, pow2, mul #16
+uqincb	w0
+uqincb	w0, all, mul #16
+uqincb	w0, pow2
+uqincb	w0, pow2, mul #16
+uqincb	x0
+uqincb	x0, #14
+uqincb	x0, all, mul #16
+uqincb	x0, pow2
+uqincb	x0, vl1
+uqincd	w0
+uqincd	w0, all, mul #16
+uqincd	w0, pow2
+uqincd	w0, pow2, mul #16
+uqincd	x0
+uqincd	x0, #14
+uqincd	x0, all, mul #16
+uqincd	x0, pow2
+uqincd	x0, vl1
+uqincd	z0.d
+uqincd	z0.d, all, mul #16
+uqincd	z0.d, pow2
+uqincd	z0.d, pow2, mul #16
+uqinch	w0
+uqinch	w0, all, mul #16
+uqinch	w0, pow2
+uqinch	w0, pow2, mul #16
+uqinch	x0
+uqinch	x0, #14
+uqinch	x0, all, mul #16
+uqinch	x0, pow2
+uqinch	x0, vl1
+uqinch	z0.h
+uqinch	z0.h, all, mul #16
+uqinch	z0.h, pow2
+uqinch	z0.h, pow2, mul #16
+uqincp	wzr, p15.b
+uqincp	wzr, p15.d
+uqincp	wzr, p15.h
+uqincp	wzr, p15.s
+uqincp	x0, p0.b
+uqincp	x0, p0.d
+uqincp	x0, p0.h
+uqincp	x0, p0.s
+uqincp	z0.d, p0.d
+uqincp	z0.h, p0.h
+uqincp	z0.s, p0.s
+uqincw	w0
+uqincw	w0, all, mul #16
+uqincw	w0, pow2
+uqincw	w0, pow2, mul #16
+uqincw	x0
+uqincw	x0, #14
+uqincw	x0, all, mul #16
+uqincw	x0, pow2
+uqincw	x0, vl1
+uqincw	z0.s
+uqincw	z0.s, all, mul #16
+uqincw	z0.s, pow2
+uqincw	z0.s, pow2, mul #16
+uqrshl	z0.b, p0/m, z0.b, z1.b
+uqrshl	z0.h, p0/m, z0.h, z1.h
+uqrshl	z29.s, p7/m, z29.s, z30.s
+uqrshl	z31.d, p7/m, z31.d, z30.d
+uqrshlr	z0.b, p0/m, z0.b, z1.b
+uqrshlr	z0.h, p0/m, z0.h, z1.h
+uqrshlr	z29.s, p7/m, z29.s, z30.s
+uqrshlr	z31.d, p7/m, z31.d, z30.d
+uqrshrnb	z0.b, z0.h, #1
+uqrshrnb	z0.h, z0.s, #1
+uqrshrnb	z0.s, z0.d, #1
+uqrshrnb	z31.b, z31.h, #8
+uqrshrnb	z31.h, z31.s, #16
+uqrshrnb	z31.s, z31.d, #32
+uqrshrnt	z0.b, z0.h, #1
+uqrshrnt	z0.h, z0.s, #1
+uqrshrnt	z0.s, z0.d, #1
+uqrshrnt	z31.b, z31.h, #8
+uqrshrnt	z31.h, z31.s, #16
+uqrshrnt	z31.s, z31.d, #32
+uqshl	z0.b, p0/m, z0.b, #0
+uqshl	z0.b, p0/m, z0.b, z1.b
+uqshl	z0.d, p0/m, z0.d, #0
+uqshl	z0.h, p0/m, z0.h, #0
+uqshl	z0.h, p0/m, z0.h, z1.h
+uqshl	z0.s, p0/m, z0.s, #0
+uqshl	z29.s, p7/m, z29.s, z30.s
+uqshl	z31.b, p0/m, z31.b, #7
+uqshl	z31.d, p0/m, z31.d, #63
+uqshl	z31.d, p7/m, z31.d, z30.d
+uqshl	z31.h, p0/m, z31.h, #15
+uqshl	z31.s, p0/m, z31.s, #31
+uqshlr	z0.b, p0/m, z0.b, z1.b
+uqshlr	z0.h, p0/m, z0.h, z1.h
+uqshlr	z29.s, p7/m, z29.s, z30.s
+uqshlr	z31.d, p7/m, z31.d, z30.d
+uqshrnb	z0.b, z0.h, #1
+uqshrnb	z0.h, z0.s, #1
+uqshrnb	z0.s, z0.d, #1
+uqshrnb	z31.b, z31.h, #8
+uqshrnb	z31.h, z31.s, #16
+uqshrnb	z31.s, z31.d, #32
+uqshrnt	z0.b, z0.h, #1
+uqshrnt	z0.h, z0.s, #1
+uqshrnt	z0.s, z0.d, #1
+uqshrnt	z31.b, z31.h, #8
+uqshrnt	z31.h, z31.s, #16
+uqshrnt	z31.s, z31.d, #32
+uqsub	z0.b, p0/m, z0.b, z1.b
+uqsub	z0.b, z0.b, #0
+uqsub	z0.b, z0.b, z0.b
+uqsub	z0.d, z0.d, #0
+uqsub	z0.d, z0.d, #0, lsl #8
+uqsub	z0.d, z0.d, z0.d
+uqsub	z0.h, p0/m, z0.h, z1.h
+uqsub	z0.h, z0.h, #0
+uqsub	z0.h, z0.h, #0, lsl #8
+uqsub	z0.h, z0.h, z0.h
+uqsub	z0.s, z0.s, #0
+uqsub	z0.s, z0.s, #0, lsl #8
+uqsub	z0.s, z0.s, z0.s
+uqsub	z29.s, p7/m, z29.s, z30.s
+uqsub	z31.b, z31.b, #255
+uqsub	z31.d, p7/m, z31.d, z30.d
+uqsub	z31.d, z31.d, #65280
+uqsub	z31.h, z31.h, #65280
+uqsub	z31.s, z31.s, #65280
+uqsubr	z0.b, p0/m, z0.b, z1.b
+uqsubr	z0.h, p0/m, z0.h, z1.h
+uqsubr	z29.s, p7/m, z29.s, z30.s
+uqsubr	z31.d, p7/m, z31.d, z30.d
+uqxtnb	z0.b, z31.h
+uqxtnb	z0.h, z31.s
+uqxtnb	z0.s, z31.d
+uqxtnt	z0.b, z31.h
+uqxtnt	z0.h, z31.s
+uqxtnt	z0.s, z31.d
+urecpe	z31.s, p7/m, z31.s
+urhadd	z0.b, p0/m, z0.b, z1.b
+urhadd	z0.h, p0/m, z0.h, z1.h
+urhadd	z29.s, p7/m, z29.s, z30.s
+urhadd	z31.d, p7/m, z31.d, z30.d
+urshl	z0.b, p0/m, z0.b, z1.b
+urshl	z0.h, p0/m, z0.h, z1.h
+urshl	z29.s, p7/m, z29.s, z30.s
+urshl	z31.d, p7/m, z31.d, z30.d
+urshlr	z0.b, p0/m, z0.b, z1.b
+urshlr	z0.h, p0/m, z0.h, z1.h
+urshlr	z29.s, p7/m, z29.s, z30.s
+urshlr	z31.d, p7/m, z31.d, z30.d
+urshr	z0.b, p0/m, z0.b, #1
+urshr	z0.d, p0/m, z0.d, #1
+urshr	z0.h, p0/m, z0.h, #1
+urshr	z0.s, p0/m, z0.s, #1
+urshr	z31.b, p0/m, z31.b, #8
+urshr	z31.d, p0/m, z31.d, #64
+urshr	z31.h, p0/m, z31.h, #16
+urshr	z31.s, p0/m, z31.s, #32
+ursqrte	z31.s, p7/m, z31.s
+ursra	z0.b, z0.b, #1
+ursra	z0.d, z0.d, #1
+ursra	z0.h, z0.h, #1
+ursra	z0.s, z0.s, #1
+ursra	z31.b, z31.b, #8
+ursra	z31.d, z31.d, #64
+ursra	z31.h, z31.h, #16
+ursra	z31.s, z31.s, #32
+ushllb	z0.d, z0.s, #0
+ushllb	z0.h, z0.b, #0
+ushllb	z0.s, z0.h, #0
+ushllb	z31.d, z31.s, #31
+ushllb	z31.h, z31.b, #7
+ushllb	z31.s, z31.h, #15
+ushllt	z0.d, z0.s, #0
+ushllt	z0.h, z0.b, #0
+ushllt	z0.s, z0.h, #0
+ushllt	z31.d, z31.s, #31
+ushllt	z31.h, z31.b, #7
+ushllt	z31.s, z31.h, #15
+usmmla	z0.s, z1.b, z2.b
+usqadd	z0.b, p0/m, z0.b, z1.b
+usqadd	z0.h, p0/m, z0.h, z1.h
+usqadd	z29.s, p7/m, z29.s, z30.s
+usqadd	z31.d, p7/m, z31.d, z30.d
+usra	z0.b, z0.b, #1
+usra	z0.d, z0.d, #1
+usra	z0.h, z0.h, #1
+usra	z0.s, z0.s, #1
+usra	z31.b, z31.b, #8
+usra	z31.d, z31.d, #64
+usra	z31.h, z31.h, #16
+usra	z31.s, z31.s, #32
+usublb	z0.h, z1.b, z2.b
+usublb	z29.s, z30.h, z31.h
+usublb	z31.d, z31.s, z31.s
+usublt	z0.h, z1.b, z2.b
+usublt	z29.s, z30.h, z31.h
+usublt	z31.d, z31.s, z31.s
+usubwb	z0.h, z1.h, z2.b
+usubwb	z29.s, z30.s, z31.h
+usubwb	z31.d, z31.d, z31.s
+usubwt	z0.h, z1.h, z2.b
+usubwt	z29.s, z30.s, z31.h
+usubwt	z31.d, z31.d, z31.s
+uunpkhi	z31.d, z31.s
+uunpkhi	z31.h, z31.b
+uunpkhi	z31.s, z31.h
+uunpklo	z31.d, z31.s
+uunpklo	z31.h, z31.b
+uunpklo	z31.s, z31.h
+uxtb	z0.d, p0/m, z0.d
+uxtb	z0.h, p0/m, z0.h
+uxtb	z0.s, p0/m, z0.s
+uxtb	z31.d, p7/m, z31.d
+uxtb	z31.h, p7/m, z31.h
+uxtb	z31.s, p7/m, z31.s
+uxth	z0.d, p0/m, z0.d
+uxth	z0.s, p0/m, z0.s
+uxth	z31.d, p7/m, z31.d
+uxth	z31.s, p7/m, z31.s
+uxtw	z0.d, p0/m, z0.d
+uxtw	z31.d, p7/m, z31.d
+uzp1	p15.b, p15.b, p15.b
+uzp1	p15.d, p15.d, p15.d
+uzp1	p15.h, p15.h, p15.h
+uzp1	p15.s, p15.s, p15.s
+uzp1	z31.b, z31.b, z31.b
+uzp1	z31.d, z31.d, z31.d
+uzp1	z31.h, z31.h, z31.h
+uzp1	z31.s, z31.s, z31.s
+uzp2	p15.b, p15.b, p15.b
+uzp2	p15.d, p15.d, p15.d
+uzp2	p15.h, p15.h, p15.h
+uzp2	p15.s, p15.s, p15.s
+uzp2	z31.b, z31.b, z31.b
+uzp2	z31.d, z31.d, z31.d
+uzp2	z31.h, z31.h, z31.h
+uzp2	z31.s, z31.s, z31.s
+whilege	p15.b, w0, wzr
+whilege	p15.b, wzr, w0
+whilege	p15.b, x0, xzr
+whilege	p15.b, xzr, x0
+whilege	p15.d, w0, wzr
+whilege	p15.d, x0, xzr
+whilege	p15.h, w0, wzr
+whilege	p15.h, x0, xzr
+whilege	p15.s, w0, wzr
+whilege	p15.s, x0, xzr
+whilerw	p15.b, x30, x30
+whilerw	p15.d, x30, x30
+whilerw	p15.h, x30, x30
+whilerw	p15.s, x30, x30
+whilewr	p15.b, x30, x30
+whilewr	p15.d, x30, x30
+whilewr	p15.h, x30, x30
+whilewr	p15.s, x30, x30
+wrffr	p0.b
+wrffr	p15.b
+xar	z0.b, z0.b, z1.b, #1
+xar	z0.d, z0.d, z1.d, #1
+xar	z0.h, z0.h, z1.h, #1
+xar	z0.s, z0.s, z1.s, #1
+xar	z31.b, z31.b, z30.b, #8
+xar	z31.d, z31.d, z30.d, #64
+xar	z31.h, z31.h, z30.h, #16
+xar	z31.s, z31.s, z30.s, #32
+zip1	p0.b, p0.b, p0.b
+zip1	p0.d, p0.d, p0.d
+zip1	p0.h, p0.h, p0.h
+zip1	p0.s, p0.s, p0.s
+zip1	p15.b, p15.b, p15.b
+zip1	p15.d, p15.d, p15.d
+zip1	p15.h, p15.h, p15.h
+zip1	p15.s, p15.s, p15.s
+zip1	z0.b, z0.b, z0.b
+zip1	z0.d, z0.d, z0.d
+zip1	z0.h, z0.h, z0.h
+zip1	z0.s, z0.s, z0.s
+zip1	z31.b, z31.b, z31.b
+zip1	z31.d, z31.d, z31.d
+zip1	z31.h, z31.h, z31.h
+zip1	z31.s, z31.s, z31.s
+zip2	p0.b, p0.b, p0.b
+zip2	p0.d, p0.d, p0.d
+zip2	p0.h, p0.h, p0.h
+zip2	p0.s, p0.s, p0.s
+zip2	p15.b, p15.b, p15.b
+zip2	p15.d, p15.d, p15.d
+zip2	p15.h, p15.h, p15.h
+zip2	p15.s, p15.s, p15.s
+zip2	z0.b, z0.b, z0.b
+zip2	z0.d, z0.d, z0.d
+zip2	z0.h, z0.h, z0.h
+zip2	z0.s, z0.s, z0.s
+zip2	z31.b, z31.b, z31.b
+zip2	z31.d, z31.d, z31.d
+zip2	z31.h, z31.h, z31.h
+zip2	z31.s, z31.s, z31.s
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     1.00                        abs	z0.b, p0/m, z0.b
+# CHECK-NEXT:  1      3     1.00                        abs	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      3     1.00                        abs	z0.h, p0/m, z0.h
+# CHECK-NEXT:  1      3     1.00                        abs	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      3     1.00                        abs	z31.b, p7/m, z31.b
+# CHECK-NEXT:  1      3     1.00                        abs	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        abs	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        abs	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        adclb	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        adclb	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        adclt	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        adclt	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        add	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        add	z0.b, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        add	z0.b, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        add	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        add	z0.d, z0.d, #0
+# CHECK-NEXT:  1      3     1.00                        add	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  1      3     1.00                        add	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        add	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        add	z0.h, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        add	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  1      3     1.00                        add	z0.h, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        add	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        add	z0.s, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        add	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  1      3     1.00                        add	z0.s, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        add	z0.s, z1.s, z2.s
+# CHECK-NEXT:  1      3     1.00                        add	z21.b, p5/m, z21.b, z10.b
+# CHECK-NEXT:  1      3     1.00                        add	z21.b, z10.b, z21.b
+# CHECK-NEXT:  1      3     1.00                        add	z21.d, p5/m, z21.d, z10.d
+# CHECK-NEXT:  1      3     1.00                        add	z21.d, z10.d, z21.d
+# CHECK-NEXT:  1      3     1.00                        add	z21.h, p5/m, z21.h, z10.h
+# CHECK-NEXT:  1      3     1.00                        add	z21.h, z10.h, z21.h
+# CHECK-NEXT:  1      3     1.00                        add	z21.s, p5/m, z21.s, z10.s
+# CHECK-NEXT:  1      3     1.00                        add	z21.s, z10.s, z21.s
+# CHECK-NEXT:  1      3     1.00                        add	z23.b, p3/m, z23.b, z13.b
+# CHECK-NEXT:  1      3     1.00                        add	z23.b, z13.b, z8.b
+# CHECK-NEXT:  1      3     1.00                        add	z23.d, p3/m, z23.d, z13.d
+# CHECK-NEXT:  1      3     1.00                        add	z23.d, z13.d, z8.d
+# CHECK-NEXT:  1      3     1.00                        add	z23.h, p3/m, z23.h, z13.h
+# CHECK-NEXT:  1      3     1.00                        add	z23.h, z13.h, z8.h
+# CHECK-NEXT:  1      3     1.00                        add	z23.s, p3/m, z23.s, z13.s
+# CHECK-NEXT:  1      3     1.00                        add	z23.s, z13.s, z8.s
+# CHECK-NEXT:  1      3     1.00                        add	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        add	z31.b, z31.b, #255
+# CHECK-NEXT:  1      3     1.00                        add	z31.b, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        add	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        add	z31.d, z31.d, #65280
+# CHECK-NEXT:  1      3     1.00                        add	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        add	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        add	z31.h, z31.h, #65280
+# CHECK-NEXT:  1      3     1.00                        add	z31.h, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        add	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        add	z31.s, z31.s, #65280
+# CHECK-NEXT:  1      3     1.00                        add	z31.s, z31.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        addhnb	z0.b, z1.h, z31.h
+# CHECK-NEXT:  1      8     1.00                        addhnb	z0.h, z1.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        addhnb	z0.s, z1.d, z31.d
+# CHECK-NEXT:  1      8     1.00                        addhnt	z0.b, z1.h, z31.h
+# CHECK-NEXT:  1      8     1.00                        addhnt	z0.h, z1.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        addhnt	z0.s, z1.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        addp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        addp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        addp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        addp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      1     1.00                        addpl	sp, sp, #31
+# CHECK-NEXT:  1      1     1.00                        addpl	x0, x0, #-32
+# CHECK-NEXT:  1      1     1.00                        addpl	x21, x21, #0
+# CHECK-NEXT:  1      1     1.00                        addpl	x23, x8, #-1
+# CHECK-NEXT:  1      1     1.00                        addvl	sp, sp, #31
+# CHECK-NEXT:  1      1     1.00                        addvl	x0, x0, #-32
+# CHECK-NEXT:  1      1     1.00                        addvl	x21, x21, #0
+# CHECK-NEXT:  1      1     1.00                        addvl	x23, x8, #-1
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, lsl #1]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, lsl #2]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, lsl #3]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, sxtw #1]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, sxtw #2]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, sxtw #3]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, sxtw]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, uxtw #1]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, uxtw #2]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, uxtw #3]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d, uxtw]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.d, [z0.d, z0.d]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.s, [z0.s, z0.s, lsl #1]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.s, [z0.s, z0.s, lsl #2]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.s, [z0.s, z0.s, lsl #3]
+# CHECK-NEXT:  1      3     1.00                        adr	z0.s, [z0.s, z0.s]
+# CHECK-NEXT:  1      3     1.00                        aesd	z0.b, z0.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        aese	z0.b, z0.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        aesimc	z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        aesimc	z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        aesmc	z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        aesmc	z31.b, z31.b
+# CHECK-NEXT:  1      2     1.00                        and	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  1      3     1.00                        and	z0.d, z0.d, #0x6
+# CHECK-NEXT:  1      3     1.00                        and	z0.d, z0.d, #0xfffffffffffffff9
+# CHECK-NEXT:  1      3     1.00                        and	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        and	z0.s, z0.s, #0x6
+# CHECK-NEXT:  1      3     1.00                        and	z0.s, z0.s, #0xfffffff9
+# CHECK-NEXT:  1      3     1.00                        and	z23.d, z13.d, z8.d
+# CHECK-NEXT:  1      3     1.00                        and	z23.h, z23.h, #0x6
+# CHECK-NEXT:  1      3     1.00                        and	z23.h, z23.h, #0xfff9
+# CHECK-NEXT:  1      3     1.00                        and	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        and	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        and	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        and	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        and	z5.b, z5.b, #0x6
+# CHECK-NEXT:  1      3     1.00                        and	z5.b, z5.b, #0xf9
+# CHECK-NEXT:  1      2     1.00                        ands	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  1      4     1.00                        andv	b0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        andv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        andv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        andv	s0, p7, z31.s
+# CHECK-NEXT:  1      3     1.00                        asr	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  1      3     1.00                        asr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        asr	z0.b, p0/m, z0.b, z1.d
+# CHECK-NEXT:  1      3     1.00                        asr	z0.b, z0.b, #1
+# CHECK-NEXT:  1      3     1.00                        asr	z0.b, z1.b, z2.d
+# CHECK-NEXT:  1      3     1.00                        asr	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  1      3     1.00                        asr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        asr	z0.d, z0.d, #1
+# CHECK-NEXT:  1      3     1.00                        asr	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  1      3     1.00                        asr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        asr	z0.h, p0/m, z0.h, z1.d
+# CHECK-NEXT:  1      3     1.00                        asr	z0.h, z0.h, #1
+# CHECK-NEXT:  1      3     1.00                        asr	z0.h, z1.h, z2.d
+# CHECK-NEXT:  1      3     1.00                        asr	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  1      3     1.00                        asr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        asr	z0.s, p0/m, z0.s, z1.d
+# CHECK-NEXT:  1      3     1.00                        asr	z0.s, z0.s, #1
+# CHECK-NEXT:  1      3     1.00                        asr	z0.s, z1.s, z2.d
+# CHECK-NEXT:  1      3     1.00                        asr	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  1      3     1.00                        asr	z31.b, z31.b, #8
+# CHECK-NEXT:  1      3     1.00                        asr	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  1      3     1.00                        asr	z31.d, z31.d, #64
+# CHECK-NEXT:  1      3     1.00                        asr	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  1      3     1.00                        asr	z31.h, z31.h, #16
+# CHECK-NEXT:  1      3     1.00                        asr	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  1      3     1.00                        asr	z31.s, z31.s, #32
+# CHECK-NEXT:  1      4     1.00                        asrd	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  1      4     1.00                        asrd	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        asrd	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        asrd	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        asrd	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  1      4     1.00                        asrd	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  1      4     1.00                        asrd	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  1      4     1.00                        asrd	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  1      3     1.00                        asrr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        asrr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        asrr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        asrr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  1      4     1.00                        bcax	z29.d, z29.d, z30.d, z31.d
+# CHECK-NEXT:  1      13    12.00                       bdep	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      68    67.00                       bdep	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      21    20.00                       bdep	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      37    36.00                       bdep	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      13    12.00                       bext	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      68    67.00                       bext	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      21    20.00                       bext	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      37    36.00                       bext	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        bfcvt	z0.h, p0/m, z1.s
+# CHECK-NEXT:  1      4     1.00                        bfcvtnt	z0.h, p0/m, z1.s
+# CHECK-NEXT:  2      11    1.00                        bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT:  2      11    1.00                        bfdot	z0.s, z1.h, z2.h[0]
+# CHECK-NEXT:  2      11    1.00                        bfdot	z0.s, z1.h, z2.h[3]
+# CHECK-NEXT:  1      4     1.00                        bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT:  1      4     1.00                        bfmlalb	z0.s, z1.h, z2.h[0]
+# CHECK-NEXT:  1      4     1.00                        bfmlalb	z0.s, z1.h, z2.h[7]
+# CHECK-NEXT:  1      4     1.00                        bfmlalb	z10.s, z21.h, z14.h
+# CHECK-NEXT:  1      4     1.00                        bfmlalb	z21.s, z14.h, z3.h[2]
+# CHECK-NEXT:  1      4     1.00                        bfmlalt	z0.s, z1.h, z2.h
+# CHECK-NEXT:  1      4     1.00                        bfmlalt	z0.s, z1.h, z2.h[0]
+# CHECK-NEXT:  1      4     1.00                        bfmlalt	z0.s, z1.h, z2.h[7]
+# CHECK-NEXT:  1      4     1.00                        bfmlalt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        bfmlalt	z14.s, z10.h, z21.h
+# CHECK-NEXT:  2      16    1.00                        bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT:  1      13    12.00                       bgrp	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      68    67.00                       bgrp	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      21    20.00                       bgrp	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      37    36.00                       bgrp	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      2     1.00                        bic	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        bic	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      3     1.00                        bic	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        bic	z23.d, z13.d, z8.d
+# CHECK-NEXT:  1      3     1.00                        bic	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        bic	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        bic	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        bic	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      2     1.00                        bics	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        bics	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        brka	p0.b, p15/m, p15.b
+# CHECK-NEXT:  1      2     1.00                        brka	p0.b, p15/z, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkas	p0.b, p15/z, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkb	p0.b, p15/m, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkb	p0.b, p15/z, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkbs	p0.b, p15/z, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkn	p0.b, p15/z, p1.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        brkn	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkns	p0.b, p15/z, p1.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        brkns	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkpa	p0.b, p15/z, p1.b, p2.b
+# CHECK-NEXT:  1      2     1.00                        brkpa	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkpas	p0.b, p15/z, p1.b, p2.b
+# CHECK-NEXT:  1      2     1.00                        brkpas	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkpb	p0.b, p15/z, p1.b, p2.b
+# CHECK-NEXT:  1      2     1.00                        brkpb	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        brkpbs	p0.b, p15/z, p1.b, p2.b
+# CHECK-NEXT:  1      2     1.00                        brkpbs	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      3     1.00                        bsl	z0.d, z0.d, z1.d, z2.d
+# CHECK-NEXT:  1      3     1.00                        bsl1n	z0.d, z0.d, z1.d, z2.d
+# CHECK-NEXT:  1      3     1.00                        bsl2n	z0.d, z0.d, z1.d, z2.d
+# CHECK-NEXT:  1      3     1.00                        cadd	z0.b, z0.b, z0.b, #90
+# CHECK-NEXT:  1      3     1.00                        cadd	z0.d, z0.d, z0.d, #90
+# CHECK-NEXT:  1      3     1.00                        cadd	z0.h, z0.h, z0.h, #90
+# CHECK-NEXT:  1      3     1.00                        cadd	z0.s, z0.s, z0.s, #90
+# CHECK-NEXT:  1      3     1.00                        cadd	z31.b, z31.b, z31.b, #270
+# CHECK-NEXT:  1      3     1.00                        cadd	z31.d, z31.d, z31.d, #270
+# CHECK-NEXT:  1      3     1.00                        cadd	z31.h, z31.h, z31.h, #270
+# CHECK-NEXT:  1      3     1.00                        cadd	z31.s, z31.s, z31.s, #270
+# CHECK-NEXT:  1      4     1.00                        cdot	z0.d, z1.h, z15.h[1], #0
+# CHECK-NEXT:  1      4     1.00                        cdot	z0.d, z1.h, z31.h, #0
+# CHECK-NEXT:  1      4     1.00                        cdot	z0.d, z1.h, z31.h, #180
+# CHECK-NEXT:  1      4     1.00                        cdot	z0.d, z1.h, z31.h, #270
+# CHECK-NEXT:  1      4     1.00                        cdot	z0.d, z1.h, z31.h, #90
+# CHECK-NEXT:  1      4     1.00                        cdot	z0.s, z1.b, z31.b, #0
+# CHECK-NEXT:  1      4     1.00                        cdot	z0.s, z1.b, z7.b[3], #0
+# CHECK-NEXT:  1      4     1.00                        cdot	z29.d, z30.h, z0.h[0], #180
+# CHECK-NEXT:  1      4     1.00                        cdot	z31.d, z30.h, z7.h[1], #270
+# CHECK-NEXT:  1      4     1.00                        cdot	z5.d, z6.h, z3.h[0], #90
+# CHECK-NEXT:  1      4     1.00                        clasta	b0, p7, b0, z31.b
+# CHECK-NEXT:  1      4     1.00                        clasta	d0, p7, d0, z31.d
+# CHECK-NEXT:  1      4     1.00                        clasta	h0, p7, h0, z31.h
+# CHECK-NEXT:  1      4     1.00                        clasta	s0, p7, s0, z31.s
+# CHECK-NEXT:  1      8     2.00                        clasta	w0, p7, w0, z31.b
+# CHECK-NEXT:  1      8     2.00                        clasta	w0, p7, w0, z31.h
+# CHECK-NEXT:  1      8     2.00                        clasta	w0, p7, w0, z31.s
+# CHECK-NEXT:  1      8     2.00                        clasta	x0, p7, x0, z31.d
+# CHECK-NEXT:  1      4     1.00                        clasta	z0.b, p7, z0.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        clasta	z0.d, p7, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        clasta	z0.h, p7, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        clasta	z0.s, p7, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        clastb	b0, p7, b0, z31.b
+# CHECK-NEXT:  1      4     1.00                        clastb	d0, p7, d0, z31.d
+# CHECK-NEXT:  1      4     1.00                        clastb	h0, p7, h0, z31.h
+# CHECK-NEXT:  1      4     1.00                        clastb	s0, p7, s0, z31.s
+# CHECK-NEXT:  1      8     2.00                        clastb	w0, p7, w0, z31.b
+# CHECK-NEXT:  1      8     2.00                        clastb	w0, p7, w0, z31.h
+# CHECK-NEXT:  1      8     2.00                        clastb	w0, p7, w0, z31.s
+# CHECK-NEXT:  1      8     2.00                        clastb	x0, p7, x0, z31.d
+# CHECK-NEXT:  1      4     1.00                        clastb	z0.b, p7, z0.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        clastb	z0.d, p7, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        clastb	z0.h, p7, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        clastb	z0.s, p7, z0.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        cls	z31.b, p7/m, z31.b
+# CHECK-NEXT:  1      3     1.00                        cls	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        cls	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        cls	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        clz	z31.b, p7/m, z31.b
+# CHECK-NEXT:  1      3     1.00                        clz	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        clz	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        clz	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        cmla	z0.b, z1.b, z2.b, #0
+# CHECK-NEXT:  1      4     1.00                        cmla	z0.d, z1.d, z2.d, #0
+# CHECK-NEXT:  1      4     1.00                        cmla	z0.h, z1.h, z2.h, #0
+# CHECK-NEXT:  1      4     1.00                        cmla	z0.h, z1.h, z2.h[0], #0
+# CHECK-NEXT:  1      4     1.00                        cmla	z0.s, z1.s, z2.s, #0
+# CHECK-NEXT:  1      4     1.00                        cmla	z0.s, z1.s, z2.s[0], #0
+# CHECK-NEXT:  1      4     1.00                        cmla	z15.b, z16.b, z17.b, #270
+# CHECK-NEXT:  1      4     1.00                        cmla	z15.d, z16.d, z17.d, #270
+# CHECK-NEXT:  1      4     1.00                        cmla	z15.h, z16.h, z17.h, #270
+# CHECK-NEXT:  1      4     1.00                        cmla	z15.s, z16.s, z17.s, #270
+# CHECK-NEXT:  1      4     1.00                        cmla	z29.b, z30.b, z31.b, #90
+# CHECK-NEXT:  1      4     1.00                        cmla	z29.d, z30.d, z31.d, #90
+# CHECK-NEXT:  1      4     1.00                        cmla	z29.h, z30.h, z31.h, #90
+# CHECK-NEXT:  1      4     1.00                        cmla	z29.s, z30.s, z31.s, #90
+# CHECK-NEXT:  1      4     1.00                        cmla	z31.b, z31.b, z31.b, #180
+# CHECK-NEXT:  1      4     1.00                        cmla	z31.d, z31.d, z31.d, #180
+# CHECK-NEXT:  1      4     1.00                        cmla	z31.h, z30.h, z7.h[0], #180
+# CHECK-NEXT:  1      4     1.00                        cmla	z31.h, z31.h, z31.h, #180
+# CHECK-NEXT:  1      4     1.00                        cmla	z31.s, z30.s, z7.s[0], #180
+# CHECK-NEXT:  1      4     1.00                        cmla	z31.s, z31.s, z31.s, #180
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpeq	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.b, p0/z, z1.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        cmpge	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.b, p0/z, z1.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        cmpgt	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.b, p0/z, z0.b, #0
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.b, p0/z, z0.b, #127
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.b, p0/z, z1.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.d, p0/z, z0.d, #0
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.d, p0/z, z0.d, #127
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.h, p0/z, z0.h, #0
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.h, p0/z, z0.h, #127
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.s, p0/z, z0.s, #0
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.s, p0/z, z0.s, #127
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        cmphi	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.b, p0/z, z0.b, #0
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.b, p0/z, z0.b, #127
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.b, p0/z, z1.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.d, p0/z, z0.d, #0
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.d, p0/z, z0.d, #127
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.h, p0/z, z0.h, #0
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.h, p0/z, z0.h, #127
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.s, p0/z, z0.s, #0
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.s, p0/z, z0.s, #127
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        cmphs	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  1      5     1.00                        cmple	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.b, p0/z, z0.b, #0
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.b, p0/z, z0.b, #127
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.d, p0/z, z0.d, #0
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.d, p0/z, z0.d, #127
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.h, p0/z, z0.h, #0
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.h, p0/z, z0.h, #127
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.s, p0/z, z0.s, #0
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.s, p0/z, z0.s, #127
+# CHECK-NEXT:  1      5     1.00                        cmplo	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.b, p0/z, z0.b, #0
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.b, p0/z, z0.b, #127
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.d, p0/z, z0.d, #0
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.d, p0/z, z0.d, #127
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.h, p0/z, z0.h, #0
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.h, p0/z, z0.h, #127
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.s, p0/z, z0.s, #0
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.s, p0/z, z0.s, #127
+# CHECK-NEXT:  1      5     1.00                        cmpls	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  1      5     1.00                        cmplt	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  1      5     1.00                        cmpne	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        cnot	z31.b, p7/m, z31.b
+# CHECK-NEXT:  1      3     1.00                        cnot	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        cnot	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        cnot	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        cnt	z31.b, p7/m, z31.b
+# CHECK-NEXT:  1      12    1.00                        cnt	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        cnt	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      8     1.00                        cnt	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        cntb	x0
+# CHECK-NEXT:  1      3     1.00                        cntb	x0, #28
+# CHECK-NEXT:  1      3     1.00                        cntb	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        cntb	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        cntd	x0
+# CHECK-NEXT:  1      3     1.00                        cntd	x0, #28
+# CHECK-NEXT:  1      3     1.00                        cntd	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        cntd	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        cnth	x0
+# CHECK-NEXT:  1      3     1.00                        cnth	x0, #28
+# CHECK-NEXT:  1      3     1.00                        cnth	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        cnth	x0, pow2
+# CHECK-NEXT:  1      1     1.00                        cntp	x0, p15, p0.b
+# CHECK-NEXT:  1      1     1.00                        cntp	x0, p15, p0.d
+# CHECK-NEXT:  1      1     1.00                        cntp	x0, p15, p0.h
+# CHECK-NEXT:  1      1     1.00                        cntp	x0, p15, p0.s
+# CHECK-NEXT:  1      3     1.00                        cntw	x0
+# CHECK-NEXT:  1      3     1.00                        cntw	x0, #28
+# CHECK-NEXT:  1      3     1.00                        cntw	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        cntw	x0, pow2
+# CHECK-NEXT:  1      4     1.00                        compact	z31.d, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        compact	z31.s, p7, z31.s
+# CHECK-NEXT:  1      1     1.00                        ctermeq	w30, wzr
+# CHECK-NEXT:  1      1     1.00                        ctermeq	wzr, w30
+# CHECK-NEXT:  1      1     1.00                        ctermeq	x30, xzr
+# CHECK-NEXT:  1      1     1.00                        ctermeq	xzr, x30
+# CHECK-NEXT:  1      1     1.00                        ctermne	w30, wzr
+# CHECK-NEXT:  1      1     1.00                        ctermne	wzr, w30
+# CHECK-NEXT:  1      1     1.00                        ctermne	x30, xzr
+# CHECK-NEXT:  1      1     1.00                        ctermne	xzr, x30
+# CHECK-NEXT:  1      3     1.00                        decb	x0
+# CHECK-NEXT:  1      3     1.00                        decb	x0, #14
+# CHECK-NEXT:  1      3     1.00                        decb	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        decb	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        decb	x0, vl1
+# CHECK-NEXT:  1      3     1.00                        decd	x0
+# CHECK-NEXT:  1      3     1.00                        decd	x0, #14
+# CHECK-NEXT:  1      3     1.00                        decd	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        decd	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        decd	x0, vl1
+# CHECK-NEXT:  1      3     1.00                        dech	x0
+# CHECK-NEXT:  1      3     1.00                        dech	x0, #14
+# CHECK-NEXT:  1      3     1.00                        dech	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        dech	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        dech	x0, vl1
+# CHECK-NEXT:  1      1     1.00                        decp	x0, p0.b
+# CHECK-NEXT:  1      1     1.00                        decp	x0, p0.d
+# CHECK-NEXT:  1      1     1.00                        decp	x0, p0.h
+# CHECK-NEXT:  1      1     1.00                        decp	x0, p0.s
+# CHECK-NEXT:  1      1     1.00                        decp	xzr, p15.b
+# CHECK-NEXT:  1      1     1.00                        decp	xzr, p15.d
+# CHECK-NEXT:  1      1     1.00                        decp	xzr, p15.h
+# CHECK-NEXT:  1      1     1.00                        decp	xzr, p15.s
+# CHECK-NEXT:  1      3     1.00                        decp	z31.d, p15.d
+# CHECK-NEXT:  1      3     1.00                        decp	z31.h, p15.h
+# CHECK-NEXT:  1      3     1.00                        decp	z31.s, p15.s
+# CHECK-NEXT:  1      3     1.00                        decw	x0
+# CHECK-NEXT:  1      3     1.00                        decw	x0, #14
+# CHECK-NEXT:  1      3     1.00                        decw	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        decw	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        decw	x0, vl1
+# CHECK-NEXT:  1      4     1.00                        dupm	z0.d, #0xfffffffffffffff9
+# CHECK-NEXT:  1      4     1.00                        dupm	z0.s, #0xfffffff9
+# CHECK-NEXT:  1      4     1.00                        dupm	z23.h, #0xfff9
+# CHECK-NEXT:  1      4     1.00                        dupm	z5.b, #0xf9
+# CHECK-NEXT:  1      2     1.00                        eor	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  1      3     1.00                        eor	z0.d, z0.d, #0x6
+# CHECK-NEXT:  1      3     1.00                        eor	z0.d, z0.d, #0xfffffffffffffff9
+# CHECK-NEXT:  1      3     1.00                        eor	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        eor	z0.s, z0.s, #0x6
+# CHECK-NEXT:  1      3     1.00                        eor	z0.s, z0.s, #0xfffffff9
+# CHECK-NEXT:  1      3     1.00                        eor	z23.d, z13.d, z8.d
+# CHECK-NEXT:  1      3     1.00                        eor	z23.h, z23.h, #0x6
+# CHECK-NEXT:  1      3     1.00                        eor	z23.h, z23.h, #0xfff9
+# CHECK-NEXT:  1      3     1.00                        eor	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        eor	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        eor	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        eor	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        eor	z5.b, z5.b, #0x6
+# CHECK-NEXT:  1      3     1.00                        eor	z5.b, z5.b, #0xf9
+# CHECK-NEXT:  1      4     1.00                        eor3	z29.d, z29.d, z30.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        eorbt	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        eorbt	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        eorbt	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        eorbt	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      2     1.00                        eors	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  1      4     1.00                        eortb	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        eortb	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        eortb	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        eortb	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        eorv	b0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        eorv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        eorv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        eorv	s0, p7, z31.s
+# CHECK-NEXT:  1      3     1.00                        ext	z0.b, { z1.b, z2.b }, #0
+# CHECK-NEXT:  1      3     1.00                        ext	z31.b, z31.b, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        ext	z31.b, z31.b, z0.b, #255
+# CHECK-NEXT:  1      3     1.00                        ext	z31.b, { z30.b, z31.b }, #255
+# CHECK-NEXT:  1      4     1.00                        fabd	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fabd	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fabd	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fabs	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        fabs	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        fabs	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      5     1.00                        facge	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  1      5     1.00                        facge	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        facge	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  1      5     1.00                        facge	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        facge	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  1      5     1.00                        facge	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        facgt	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  1      5     1.00                        facgt	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        facgt	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  1      5     1.00                        facgt	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        facgt	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  1      5     1.00                        facgt	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  1      4     1.00                        fadd	z0.d, p0/m, z0.d, #0.5
+# CHECK-NEXT:  1      4     1.00                        fadd	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fadd	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fadd	z0.h, p0/m, z0.h, #0.5
+# CHECK-NEXT:  1      4     1.00                        fadd	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fadd	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fadd	z0.s, p0/m, z0.s, #0.5
+# CHECK-NEXT:  1      4     1.00                        fadd	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fadd	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fadd	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  1      4     1.00                        fadd	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  1      4     1.00                        fadd	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  1      8     5.00                        fadda	d0, p7, d0, z31.d
+# CHECK-NEXT:  1      32    29.00                       fadda	h0, p7, h0, z31.h
+# CHECK-NEXT:  1      16    13.00                       fadda	s0, p7, s0, z31.s
+# CHECK-NEXT:  1      4     1.00                        faddp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        faddp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        faddp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        faddv	d0, p7, z31.d
+# CHECK-NEXT:  1      12    11.00                       faddv	h0, p7, z31.h
+# CHECK-NEXT:  1      8     5.00                        faddv	s0, p7, z31.s
+# CHECK-NEXT:  1      4     1.00                        fcadd	z0.d, p0/m, z0.d, z0.d, #90
+# CHECK-NEXT:  1      4     1.00                        fcadd	z0.h, p0/m, z0.h, z0.h, #90
+# CHECK-NEXT:  1      4     1.00                        fcadd	z0.s, p0/m, z0.s, z0.s, #90
+# CHECK-NEXT:  1      4     1.00                        fcadd	z31.d, p7/m, z31.d, z31.d, #270
+# CHECK-NEXT:  1      4     1.00                        fcadd	z31.h, p7/m, z31.h, z31.h, #270
+# CHECK-NEXT:  1      4     1.00                        fcadd	z31.s, p7/m, z31.s, z31.s, #270
+# CHECK-NEXT:  1      5     1.00                        fcmeq	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmeq	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  1      5     1.00                        fcmeq	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmeq	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  1      5     1.00                        fcmeq	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmeq	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  1      5     1.00                        fcmge	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmge	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  1      5     1.00                        fcmge	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        fcmge	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmge	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  1      5     1.00                        fcmge	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        fcmge	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmge	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  1      5     1.00                        fcmge	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  1      5     1.00                        fcmgt	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmgt	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  1      5     1.00                        fcmgt	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  1      5     1.00                        fcmgt	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmgt	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  1      5     1.00                        fcmgt	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  1      5     1.00                        fcmgt	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmgt	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  1      5     1.00                        fcmgt	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  1      4     1.00                        fcmla	z0.d, p0/m, z0.d, z0.d, #0
+# CHECK-NEXT:  1      4     1.00                        fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT:  1      4     1.00                        fcmla	z0.h, p0/m, z0.h, z0.h, #0
+# CHECK-NEXT:  1      4     1.00                        fcmla	z0.h, p0/m, z1.h, z2.h, #90
+# CHECK-NEXT:  1      4     1.00                        fcmla	z0.h, z0.h, z0.h[0], #0
+# CHECK-NEXT:  1      4     1.00                        fcmla	z0.s, p0/m, z0.s, z0.s, #0
+# CHECK-NEXT:  1      4     1.00                        fcmla	z0.s, p0/m, z1.s, z2.s, #90
+# CHECK-NEXT:  1      4     1.00                        fcmla	z21.s, z10.s, z5.s[1], #90
+# CHECK-NEXT:  1      4     1.00                        fcmla	z23.s, z13.s, z8.s[0], #270
+# CHECK-NEXT:  1      4     1.00                        fcmla	z29.d, p7/m, z30.d, z31.d, #180
+# CHECK-NEXT:  1      4     1.00                        fcmla	z29.h, p7/m, z30.h, z31.h, #180
+# CHECK-NEXT:  1      4     1.00                        fcmla	z29.s, p7/m, z30.s, z31.s, #180
+# CHECK-NEXT:  1      4     1.00                        fcmla	z31.d, p7/m, z31.d, z31.d, #270
+# CHECK-NEXT:  1      4     1.00                        fcmla	z31.h, p7/m, z31.h, z31.h, #270
+# CHECK-NEXT:  1      4     1.00                        fcmla	z31.h, z31.h, z7.h[3], #270
+# CHECK-NEXT:  1      4     1.00                        fcmla	z31.s, p7/m, z31.s, z31.s, #270
+# CHECK-NEXT:  1      5     1.00                        fcmle	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmle	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmle	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmlt	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmlt	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmlt	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmne	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmne	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  1      5     1.00                        fcmne	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmne	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  1      5     1.00                        fcmne	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  1      5     1.00                        fcmne	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  1      5     1.00                        fcmuo	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  1      5     1.00                        fcmuo	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  1      5     1.00                        fcmuo	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  1      4     1.00                        fcvt	z0.d, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        fcvt	z0.d, p0/m, z0.s
+# CHECK-NEXT:  1      4     1.00                        fcvt	z0.h, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        fcvt	z0.h, p0/m, z0.s
+# CHECK-NEXT:  1      4     1.00                        fcvt	z0.s, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        fcvt	z0.s, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        fcvtlt	z0.s, p0/m, z1.h
+# CHECK-NEXT:  1      4     1.00                        fcvtlt	z30.d, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        fcvtnt	z0.h, p0/m, z1.s
+# CHECK-NEXT:  1      4     1.00                        fcvtnt	z30.s, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        fcvtx	z0.s, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        fcvtx	z30.s, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        fcvtxnt	z0.s, p0/m, z1.d
+# CHECK-NEXT:  1      4     1.00                        fcvtxnt	z30.s, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	z0.d, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	z0.d, p0/m, z0.s
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	z0.h, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	z0.s, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	z0.s, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        fcvtzs	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	z0.d, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	z0.d, p0/m, z0.s
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	z0.h, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	z0.s, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	z0.s, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        fcvtzu	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      22    19.00                       fdiv	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      8     5.00                        fdiv	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      13    10.00                       fdiv	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      22    19.00                       fdivr	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      8     5.00                        fdivr	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      13    10.00                       fdivr	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fexpa	z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fexpa	z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fexpa	z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        flogb	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        flogb	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        flogb	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmad	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmad	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmad	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmax	z0.d, p0/m, z0.d, #0.0
+# CHECK-NEXT:  1      4     1.00                        fmax	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmax	z0.h, p0/m, z0.h, #0.0
+# CHECK-NEXT:  1      4     1.00                        fmax	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmax	z0.s, p0/m, z0.s, #0.0
+# CHECK-NEXT:  1      4     1.00                        fmax	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmax	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  1      4     1.00                        fmax	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  1      4     1.00                        fmax	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	z0.d, p0/m, z0.d, #0.0
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	z0.h, p0/m, z0.h, #0.0
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	z0.s, p0/m, z0.s, #0.0
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  1      4     1.00                        fmaxnm	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  1      4     1.00                        fmaxnmp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        fmaxnmp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        fmaxnmp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        fmaxnmv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmaxnmv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmaxnmv	s0, p7, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmaxp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        fmaxp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        fmaxp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        fmaxv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmaxv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmaxv	s0, p7, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmin	z0.d, p0/m, z0.d, #0.0
+# CHECK-NEXT:  1      4     1.00                        fmin	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmin	z0.h, p0/m, z0.h, #0.0
+# CHECK-NEXT:  1      4     1.00                        fmin	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmin	z0.s, p0/m, z0.s, #0.0
+# CHECK-NEXT:  1      4     1.00                        fmin	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmin	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  1      4     1.00                        fmin	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  1      4     1.00                        fmin	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  1      4     1.00                        fminnm	z0.d, p0/m, z0.d, #0.0
+# CHECK-NEXT:  1      4     1.00                        fminnm	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fminnm	z0.h, p0/m, z0.h, #0.0
+# CHECK-NEXT:  1      4     1.00                        fminnm	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fminnm	z0.s, p0/m, z0.s, #0.0
+# CHECK-NEXT:  1      4     1.00                        fminnm	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fminnm	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  1      4     1.00                        fminnm	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  1      4     1.00                        fminnm	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  1      4     1.00                        fminnmp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        fminnmp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        fminnmp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        fminnmv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        fminnmv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        fminnmv	s0, p7, z31.s
+# CHECK-NEXT:  1      4     1.00                        fminp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        fminp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        fminp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        fminv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        fminv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        fminv	s0, p7, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmla	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmla	z0.d, z1.d, z7.d[1]
+# CHECK-NEXT:  1      4     1.00                        fmla	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmla	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        fmla	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmla	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  1      4     1.00                        fmlalb	z0.s, z1.h, z7.h[0]
+# CHECK-NEXT:  1      4     1.00                        fmlalb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmlalb	z30.s, z31.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        fmlalt	z0.s, z1.h, z7.h[0]
+# CHECK-NEXT:  1      4     1.00                        fmlalt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmlalt	z30.s, z31.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        fmls	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmls	z0.d, z1.d, z7.d[1]
+# CHECK-NEXT:  1      4     1.00                        fmls	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmls	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        fmls	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmls	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  1      4     1.00                        fmlslb	z0.s, z1.h, z7.h[0]
+# CHECK-NEXT:  1      4     1.00                        fmlslb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmlslb	z30.s, z31.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        fmlslt	z0.s, z1.h, z7.h[0]
+# CHECK-NEXT:  1      4     1.00                        fmlslt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmlslt	z30.s, z31.h, z7.h[7]
+# CHECK-NEXT:  1      3     1.00                        fmov	z0.d, #-10.00000000
+# CHECK-NEXT:  1      3     1.00                        fmov	z0.d, #0.12500000
+# CHECK-NEXT:  1      3     1.00                        fmov	z0.d, p0/m, #-10.00000000
+# CHECK-NEXT:  1      3     1.00                        fmov	z0.d, p0/m, #0.12500000
+# CHECK-NEXT:  1      3     1.00                        fmov	z0.h, #-0.12500000
+# CHECK-NEXT:  1      3     1.00                        fmov	z0.h, p0/m, #-0.12500000
+# CHECK-NEXT:  1      3     1.00                        fmov	z0.s, #-0.12500000
+# CHECK-NEXT:  1      3     1.00                        fmov	z0.s, p0/m, #-0.12500000
+# CHECK-NEXT:  1      4     1.00                        fmsb	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmsb	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmsb	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.d, p0/m, z0.d, #0.5
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.d, z0.d, z0.d[0]
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.h, p0/m, z0.h, #0.5
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.h, z0.h, z0.h[0]
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.s, p0/m, z0.s, #0.5
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.s, z0.s, z0.s[0]
+# CHECK-NEXT:  1      4     1.00                        fmul	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fmul	z31.d, p7/m, z31.d, #2.0
+# CHECK-NEXT:  1      4     1.00                        fmul	z31.d, z31.d, z15.d[1]
+# CHECK-NEXT:  1      4     1.00                        fmul	z31.h, p7/m, z31.h, #2.0
+# CHECK-NEXT:  1      4     1.00                        fmul	z31.h, z31.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        fmul	z31.s, p7/m, z31.s, #2.0
+# CHECK-NEXT:  1      4     1.00                        fmul	z31.s, z31.s, z7.s[3]
+# CHECK-NEXT:  1      4     1.00                        fmulx	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fmulx	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fmulx	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fneg	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        fneg	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        fneg	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        fnmad	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fnmad	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fnmad	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fnmla	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fnmla	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fnmla	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fnmls	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fnmls	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fnmls	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fnmsb	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fnmsb	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fnmsb	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        frecpe	z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        frecpe	z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        frecpe	z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        frecps	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        frecps	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        frecps	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        frecpx	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        frecpx	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        frecpx	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        frinta	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        frinta	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        frinta	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        frinti	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        frinti	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        frinti	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        frintm	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        frintm	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        frintm	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        frintn	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        frintn	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        frintn	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        frintp	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        frintp	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        frintp	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        frintx	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        frintx	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        frintx	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        frintz	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        frintz	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        frintz	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        frsqrte	z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        frsqrte	z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        frsqrte	z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        frsqrts	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        frsqrts	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        frsqrts	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fscale	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fscale	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fscale	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      25    19.00                       fsqrt	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      11    5.00                        fsqrt	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      14    9.00                        fsqrt	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        fsub	z0.d, p0/m, z0.d, #0.5
+# CHECK-NEXT:  1      4     1.00                        fsub	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fsub	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fsub	z0.h, p0/m, z0.h, #0.5
+# CHECK-NEXT:  1      4     1.00                        fsub	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fsub	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fsub	z0.s, p0/m, z0.s, #0.5
+# CHECK-NEXT:  1      4     1.00                        fsub	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fsub	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fsub	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  1      4     1.00                        fsub	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  1      4     1.00                        fsub	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  1      4     1.00                        fsubr	z0.d, p0/m, z0.d, #0.5
+# CHECK-NEXT:  1      4     1.00                        fsubr	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        fsubr	z0.h, p0/m, z0.h, #0.5
+# CHECK-NEXT:  1      4     1.00                        fsubr	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        fsubr	z0.s, p0/m, z0.s, #0.5
+# CHECK-NEXT:  1      4     1.00                        fsubr	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        fsubr	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  1      4     1.00                        fsubr	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  1      4     1.00                        fsubr	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  1      4     1.00                        ftmad	z0.d, z0.d, z31.d, #7
+# CHECK-NEXT:  1      4     1.00                        ftmad	z0.h, z0.h, z31.h, #7
+# CHECK-NEXT:  1      4     1.00                        ftmad	z0.s, z0.s, z31.s, #7
+# CHECK-NEXT:  1      4     1.00                        ftsmul	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        ftsmul	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        ftsmul	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        ftssel	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        ftssel	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        ftssel	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      8     2.00                        histcnt	z0.s, p0/z, z1.s, z2.s
+# CHECK-NEXT:  1      8     2.00                        histcnt	z29.d, p7/z, z30.d, z31.d
+# CHECK-NEXT:  1      8     2.00                        histseg	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        incb	x0
+# CHECK-NEXT:  1      3     1.00                        incb	x0, #14
+# CHECK-NEXT:  1      3     1.00                        incb	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        incb	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        incb	x0, vl1
+# CHECK-NEXT:  1      3     1.00                        incd	x0
+# CHECK-NEXT:  1      3     1.00                        incd	x0, #14
+# CHECK-NEXT:  1      3     1.00                        incd	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        incd	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        incd	x0, vl1
+# CHECK-NEXT:  1      3     1.00                        incd	z0.d
+# CHECK-NEXT:  1      3     1.00                        incd	z0.d, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        inch	x0
+# CHECK-NEXT:  1      3     1.00                        inch	x0, #14
+# CHECK-NEXT:  1      3     1.00                        inch	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        inch	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        inch	x0, vl1
+# CHECK-NEXT:  1      3     1.00                        inch	z0.h
+# CHECK-NEXT:  1      3     1.00                        inch	z0.h, all, mul #16
+# CHECK-NEXT:  1      1     1.00                        incp	x0, p0.b
+# CHECK-NEXT:  1      1     1.00                        incp	x0, p0.d
+# CHECK-NEXT:  1      1     1.00                        incp	x0, p0.h
+# CHECK-NEXT:  1      1     1.00                        incp	x0, p0.s
+# CHECK-NEXT:  1      1     1.00                        incp	xzr, p15.b
+# CHECK-NEXT:  1      1     1.00                        incp	xzr, p15.d
+# CHECK-NEXT:  1      1     1.00                        incp	xzr, p15.h
+# CHECK-NEXT:  1      1     1.00                        incp	xzr, p15.s
+# CHECK-NEXT:  1      3     1.00                        incp	z31.d, p15.d
+# CHECK-NEXT:  1      3     1.00                        incp	z31.h, p15.h
+# CHECK-NEXT:  1      3     1.00                        incp	z31.s, p15.s
+# CHECK-NEXT:  1      3     1.00                        incw	x0
+# CHECK-NEXT:  1      3     1.00                        incw	x0, #14
+# CHECK-NEXT:  1      3     1.00                        incw	x0, all, mul #16
+# CHECK-NEXT:  1      3     1.00                        incw	x0, pow2
+# CHECK-NEXT:  1      3     1.00                        incw	x0, vl1
+# CHECK-NEXT:  1      3     1.00                        incw	z0.s
+# CHECK-NEXT:  1      3     1.00                        incw	z0.s, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        index	z0.b, #0, #0
+# CHECK-NEXT:  1      4     1.00                        index	z0.d, #0, #0
+# CHECK-NEXT:  1      4     1.00                        index	z0.h, #0, #0
+# CHECK-NEXT:  1      4     1.00                        index	z0.h, w0, w0
+# CHECK-NEXT:  1      4     1.00                        index	z0.s, #0, #0
+# CHECK-NEXT:  1      4     1.00                        index	z21.b, w10, w21
+# CHECK-NEXT:  1      4     1.00                        index	z21.d, x10, x21
+# CHECK-NEXT:  1      4     1.00                        index	z21.s, w10, w21
+# CHECK-NEXT:  1      4     1.00                        index	z23.b, #13, w8
+# CHECK-NEXT:  1      4     1.00                        index	z23.b, w13, #8
+# CHECK-NEXT:  1      4     1.00                        index	z23.d, #13, x8
+# CHECK-NEXT:  1      4     1.00                        index	z23.d, x13, #8
+# CHECK-NEXT:  1      4     1.00                        index	z23.h, #13, w8
+# CHECK-NEXT:  1      4     1.00                        index	z23.h, w13, #8
+# CHECK-NEXT:  1      4     1.00                        index	z23.s, #13, w8
+# CHECK-NEXT:  1      4     1.00                        index	z23.s, w13, #8
+# CHECK-NEXT:  1      4     1.00                        index	z31.b, #-1, #-1
+# CHECK-NEXT:  1      4     1.00                        index	z31.b, #-1, wzr
+# CHECK-NEXT:  1      4     1.00                        index	z31.b, wzr, #-1
+# CHECK-NEXT:  1      4     1.00                        index	z31.b, wzr, wzr
+# CHECK-NEXT:  1      4     1.00                        index	z31.d, #-1, #-1
+# CHECK-NEXT:  1      4     1.00                        index	z31.d, #-1, xzr
+# CHECK-NEXT:  1      4     1.00                        index	z31.d, xzr, #-1
+# CHECK-NEXT:  1      4     1.00                        index	z31.d, xzr, xzr
+# CHECK-NEXT:  1      4     1.00                        index	z31.h, #-1, #-1
+# CHECK-NEXT:  1      4     1.00                        index	z31.h, #-1, wzr
+# CHECK-NEXT:  1      4     1.00                        index	z31.h, wzr, #-1
+# CHECK-NEXT:  1      4     1.00                        index	z31.h, wzr, wzr
+# CHECK-NEXT:  1      4     1.00                        index	z31.s, #-1, #-1
+# CHECK-NEXT:  1      4     1.00                        index	z31.s, #-1, wzr
+# CHECK-NEXT:  1      4     1.00                        index	z31.s, wzr, #-1
+# CHECK-NEXT:  1      4     1.00                        index	z31.s, wzr, wzr
+# CHECK-NEXT:  1      8     2.00                        insr	z0.b, w0
+# CHECK-NEXT:  1      8     2.00                        insr	z0.d, x0
+# CHECK-NEXT:  1      8     2.00                        insr	z0.h, w0
+# CHECK-NEXT:  1      8     2.00                        insr	z0.s, w0
+# CHECK-NEXT:  1      4     1.00                        insr	z31.b, b31
+# CHECK-NEXT:  1      8     2.00                        insr	z31.b, wzr
+# CHECK-NEXT:  1      4     1.00                        insr	z31.d, d31
+# CHECK-NEXT:  1      8     2.00                        insr	z31.d, xzr
+# CHECK-NEXT:  1      4     1.00                        insr	z31.h, h31
+# CHECK-NEXT:  1      8     2.00                        insr	z31.h, wzr
+# CHECK-NEXT:  1      4     1.00                        insr	z31.s, s31
+# CHECK-NEXT:  1      8     2.00                        insr	z31.s, wzr
+# CHECK-NEXT:  1      4     1.00                        lasta	b0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        lasta	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        lasta	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        lasta	s0, p7, z31.s
+# CHECK-NEXT:  1      8     2.00                        lasta	w0, p7, z31.b
+# CHECK-NEXT:  1      8     2.00                        lasta	w0, p7, z31.h
+# CHECK-NEXT:  1      8     2.00                        lasta	w0, p7, z31.s
+# CHECK-NEXT:  1      8     2.00                        lasta	x0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        lastb	b0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        lastb	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        lastb	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        lastb	s0, p7, z31.s
+# CHECK-NEXT:  1      8     2.00                        lastb	w0, p7, z31.b
+# CHECK-NEXT:  1      8     2.00                        lastb	w0, p7, z31.h
+# CHECK-NEXT:  1      8     2.00                        lastb	w0, p7, z31.s
+# CHECK-NEXT:  1      8     2.00                        lastb	x0, p7, z31.d
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z0.b }, p0/z, [sp, x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1b	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1b	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1b	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      9     9.00    *                   ld1b	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z21.b }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1b	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1b	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z21.s }, p5/z, [x10, x21]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z23.d }, p3/z, [x13, x8]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z31.b }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1b	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      7     7.00    *                   ld1b	{ z31.d }, p7/z, [z31.d, #31]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      9     9.00    *                   ld1b	{ z31.s }, p7/z, [z31.s, #31]
+# CHECK-NEXT:  1      3     1.00    *                   ld1b	{ z5.h }, p3/z, [x17, x16]
+# CHECK-NEXT:  1      7     7.00    *                   ld1d	{ z0.d }, p0/z, [x0, z0.d, sxtw #3]
+# CHECK-NEXT:  1      7     7.00    *                   ld1d	{ z0.d }, p0/z, [x0, z0.d, uxtw #3]
+# CHECK-NEXT:  1      3     1.00    *                   ld1d	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1d	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *                   ld1d	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1d	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1d	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1d	{ z23.d }, p3/z, [sp, x8, lsl #3]
+# CHECK-NEXT:  1      3     1.00    *                   ld1d	{ z23.d }, p3/z, [x13, x8, lsl #3]
+# CHECK-NEXT:  1      7     7.00    *                   ld1d	{ z23.d }, p3/z, [x13, z8.d, lsl #3]
+# CHECK-NEXT:  1      3     1.00    *                   ld1d	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1d	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      7     7.00    *                   ld1d	{ z31.d }, p7/z, [z31.d, #248]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      9     9.00    *                   ld1h	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z21.s }, p5/z, [x10, x21, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z23.d }, p3/z, [x13, x8, lsl #1]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z31.d }, p7/z, [z31.d, #62]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT:  1      7     7.00    *                   ld1h	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT:  1      9     9.00    *                   ld1h	{ z31.s }, p7/z, [z31.s, #62]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z5.h }, p3/z, [sp, x16, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1h	{ z5.h }, p3/z, [x17, x16, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rb	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rb	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rb	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rb	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rb	{ z31.b }, p7/z, [sp, #63]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rb	{ z31.d }, p7/z, [sp, #63]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rb	{ z31.h }, p7/z, [sp, #63]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rb	{ z31.s }, p7/z, [sp, #63]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rd	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rd	{ z31.d }, p7/z, [sp, #504]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rh	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rh	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rh	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rh	{ z31.d }, p7/z, [sp, #126]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rh	{ z31.h }, p7/z, [sp, #126]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rh	{ z31.s }, p7/z, [sp, #126]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqb	{ z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqb	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqb	{ z21.b }, p5/z, [x10, #112]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqb	{ z23.b }, p3/z, [x13, #-128]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqb	{ z31.b }, p7/z, [sp, #-16]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqd	{ z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqd	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqd	{ z23.d }, p3/z, [x13, #-128]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqd	{ z23.d }, p3/z, [x13, #112]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqd	{ z31.d }, p7/z, [sp, #-16]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqh	{ z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqh	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqh	{ z23.h }, p3/z, [x13, #-128]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqh	{ z23.h }, p3/z, [x13, #112]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqh	{ z31.h }, p7/z, [sp, #-16]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqw	{ z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqw	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqw	{ z23.s }, p3/z, [x13, #-128]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqw	{ z23.s }, p3/z, [x13, #112]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rqw	{ z31.s }, p7/z, [sp, #-16]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsb	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsb	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsb	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsb	{ z31.d }, p7/z, [sp, #63]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsb	{ z31.h }, p7/z, [sp, #63]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsb	{ z31.s }, p7/z, [sp, #63]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsh	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsh	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsh	{ z31.d }, p7/z, [sp, #126]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsh	{ z31.s }, p7/z, [sp, #126]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsw	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rsw	{ z31.d }, p7/z, [sp, #252]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rw	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rw	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rw	{ z31.d }, p7/z, [sp, #252]
+# CHECK-NEXT:  1      3     1.00    *                   ld1rw	{ z31.s }, p7/z, [sp, #252]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sb	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z0.h }, p0/z, [sp, x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z0.h }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sb	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      9     9.00    *                   ld1sb	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sb	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sb	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z21.s }, p5/z, [x10, x21]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z23.d }, p3/z, [x13, x8]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sb	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sb	{ z31.d }, p7/z, [z31.d, #31]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sb	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      9     9.00    *                   ld1sb	{ z31.s }, p7/z, [z31.s, #31]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sh	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sh	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      9     9.00    *                   ld1sh	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sh	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sh	{ z21.s }, p5/z, [sp, x21, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sh	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sh	{ z21.s }, p5/z, [x10, x21, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sh	{ z23.d }, p3/z, [x13, x8, lsl #1]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sh	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z31.d }, p7/z, [z31.d, #62]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sh	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sh	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT:  1      9     9.00    *                   ld1sh	{ z31.s }, p7/z, [z31.s, #62]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sw	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sw	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sw	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sw	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sw	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sw	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sw	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sw	{ z23.d }, p3/z, [sp, x8, lsl #2]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sw	{ z23.d }, p3/z, [x13, x8, lsl #2]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sw	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+# CHECK-NEXT:  1      3     1.00    *                   ld1sw	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sw	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      7     7.00    *                   ld1sw	{ z31.d }, p7/z, [z31.d, #124]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  1      3     1.00    *                   ld1w	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1w	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      9     9.00    *                   ld1w	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      3     1.00    *                   ld1w	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      3     1.00    *                   ld1w	{ z21.s }, p5/z, [sp, x21, lsl #2]
+# CHECK-NEXT:  1      3     1.00    *                   ld1w	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld1w	{ z21.s }, p5/z, [x10, x21, lsl #2]
+# CHECK-NEXT:  1      3     1.00    *                   ld1w	{ z23.d }, p3/z, [x13, x8, lsl #2]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+# CHECK-NEXT:  1      3     1.00    *                   ld1w	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z31.d }, p7/z, [z31.d, #124]
+# CHECK-NEXT:  1      3     1.00    *                   ld1w	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z31.s }, p7/z, [sp, z31.s, sxtw #2]
+# CHECK-NEXT:  1      7     7.00    *                   ld1w	{ z31.s }, p7/z, [sp, z31.s, uxtw #2]
+# CHECK-NEXT:  1      9     9.00    *                   ld1w	{ z31.s }, p7/z, [z31.s, #124]
+# CHECK-NEXT:  1      3     2.00    *                   ld2b	{ z0.b, z1.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld2b	{ z0.b, z1.b }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld2b	{ z21.b, z22.b }, p5/z, [x10, #10, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld2b	{ z23.b, z24.b }, p3/z, [x13, #-16, mul vl]
+# CHECK-NEXT:  1      3     2.00    *                   ld2b	{ z5.b, z6.b }, p3/z, [x17, x16]
+# CHECK-NEXT:  1      3     2.00    *                   ld2d	{ z0.d, z1.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      3     1.00    *                   ld2d	{ z0.d, z1.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld2d	{ z21.d, z22.d }, p5/z, [x10, #10, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld2d	{ z23.d, z24.d }, p3/z, [x13, #-16, mul vl]
+# CHECK-NEXT:  1      3     2.00    *                   ld2d	{ z5.d, z6.d }, p3/z, [x17, x16, lsl #3]
+# CHECK-NEXT:  1      3     2.00    *                   ld2h	{ z0.h, z1.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ld2h	{ z0.h, z1.h }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld2h	{ z21.h, z22.h }, p5/z, [x10, #10, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld2h	{ z23.h, z24.h }, p3/z, [x13, #-16, mul vl]
+# CHECK-NEXT:  1      3     2.00    *                   ld2h	{ z5.h, z6.h }, p3/z, [x17, x16, lsl #1]
+# CHECK-NEXT:  1      3     2.00    *                   ld2w	{ z0.s, z1.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      3     1.00    *                   ld2w	{ z0.s, z1.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ld2w	{ z21.s, z22.s }, p5/z, [x10, #10, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ld2w	{ z23.s, z24.s }, p3/z, [x13, #-16, mul vl]
+# CHECK-NEXT:  1      3     2.00    *                   ld2w	{ z5.s, z6.s }, p3/z, [x17, x16, lsl #2]
+# CHECK-NEXT:  1      5     3.00    *                   ld3b	{ z0.b - z2.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld3b	{ z0.b - z2.b }, p0/z, [x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld3b	{ z21.b - z23.b }, p5/z, [x10, #15, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld3b	{ z23.b - z25.b }, p3/z, [x13, #-24, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld3b	{ z5.b - z7.b }, p3/z, [x17, x16]
+# CHECK-NEXT:  1      5     3.00    *                   ld3d	{ z0.d - z2.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      5     3.00    *                   ld3d	{ z0.d - z2.d }, p0/z, [x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld3d	{ z21.d - z23.d }, p5/z, [x10, #15, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld3d	{ z23.d - z25.d }, p3/z, [x13, #-24, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld3d	{ z5.d - z7.d }, p3/z, [x17, x16, lsl #3]
+# CHECK-NEXT:  1      5     3.00    *                   ld3h	{ z0.h - z2.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      5     3.00    *                   ld3h	{ z0.h - z2.h }, p0/z, [x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld3h	{ z21.h - z23.h }, p5/z, [x10, #15, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld3h	{ z23.h - z25.h }, p3/z, [x13, #-24, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld3h	{ z5.h - z7.h }, p3/z, [x17, x16, lsl #1]
+# CHECK-NEXT:  1      5     3.00    *                   ld3w	{ z0.s - z2.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      5     3.00    *                   ld3w	{ z0.s - z2.s }, p0/z, [x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld3w	{ z21.s - z23.s }, p5/z, [x10, #15, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld3w	{ z23.s - z25.s }, p3/z, [x13, #-24, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld3w	{ z5.s - z7.s }, p3/z, [x17, x16, lsl #2]
+# CHECK-NEXT:  1      5     3.00    *                   ld4b	{ z0.b - z3.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld4b	{ z0.b - z3.b }, p0/z, [x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld4b	{ z21.b - z24.b }, p5/z, [x10, #20, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld4b	{ z23.b - z26.b }, p3/z, [x13, #-32, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld4b	{ z5.b - z8.b }, p3/z, [x17, x16]
+# CHECK-NEXT:  1      5     3.00    *                   ld4d	{ z0.d - z3.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      5     3.00    *                   ld4d	{ z0.d - z3.d }, p0/z, [x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld4d	{ z21.d - z24.d }, p5/z, [x10, #20, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld4d	{ z23.d - z26.d }, p3/z, [x13, #-32, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld4d	{ z5.d - z8.d }, p3/z, [x17, x16, lsl #3]
+# CHECK-NEXT:  1      5     3.00    *                   ld4h	{ z0.h - z3.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      5     3.00    *                   ld4h	{ z0.h - z3.h }, p0/z, [x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld4h	{ z21.h - z24.h }, p5/z, [x10, #20, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld4h	{ z23.h - z26.h }, p3/z, [x13, #-32, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld4h	{ z5.h - z8.h }, p3/z, [x17, x16, lsl #1]
+# CHECK-NEXT:  1      5     3.00    *                   ld4w	{ z0.s - z3.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      5     3.00    *                   ld4w	{ z0.s - z3.s }, p0/z, [x0]
+# CHECK-NEXT:  1      5     3.00    *                   ld4w	{ z21.s - z24.s }, p5/z, [x10, #20, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld4w	{ z23.s - z26.s }, p3/z, [x13, #-32, mul vl]
+# CHECK-NEXT:  1      5     3.00    *                   ld4w	{ z5.s - z8.s }, p3/z, [x17, x16, lsl #2]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1b	{ z0.d }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1b	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1b	{ z0.h }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1b	{ z0.s }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1b	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1b	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1b	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1b	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1b	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1b	{ z31.b }, p7/z, [sp]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1b	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1b	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1b	{ z31.d }, p7/z, [z31.d, #31]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1b	{ z31.h }, p7/z, [sp]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1b	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1b	{ z31.s }, p7/z, [z31.s, #31]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1d	{ z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1d	{ z0.d }, p0/z, [x0, z0.d, sxtw #3]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1d	{ z0.d }, p0/z, [x0, z0.d, uxtw #3]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1d	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1d	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1d	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1d	{ z23.d }, p3/z, [x13, z8.d, lsl #3]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1d	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1d	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1d	{ z31.d }, p7/z, [z31.d, #248]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1h	{ z0.d }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1h	{ z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1h	{ z0.s }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1h	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1h	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z31.d }, p7/z, [z31.d, #62]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1h	{ z31.h }, p7/z, [sp]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1h	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1h	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1h	{ z31.s }, p7/z, [z31.s, #62]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sb	{ z0.d }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sb	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sb	{ z0.h }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sb	{ z0.s }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sb	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sb	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1sb	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sb	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sb	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sb	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sb	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sb	{ z31.d }, p7/z, [z31.d, #31]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sb	{ z31.h }, p7/z, [sp]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sb	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1sb	{ z31.s }, p7/z, [z31.s, #31]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sh	{ z0.d }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sh	{ z0.s }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1sh	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sh	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z31.d }, p7/z, [z31.d, #62]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sh	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sh	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1sh	{ z31.s }, p7/z, [z31.s, #62]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sw	{ z0.d }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sw	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sw	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sw	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sw	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sw	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sw	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sw	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1sw	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1sw	{ z31.d }, p7/z, [z31.d, #124]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1w	{ z0.d }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1w	{ z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1w	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1w	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z31.d }, p7/z, [z31.d, #124]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z31.s }, p7/z, [sp, z31.s, sxtw #2]
+# CHECK-NEXT:  1      7     7.00    *             U     ldff1w	{ z31.s }, p7/z, [sp, z31.s, uxtw #2]
+# CHECK-NEXT:  1      3     1.00    *             U     ldff1w	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  1      9     9.00    *             U     ldff1w	{ z31.s }, p7/z, [z31.s, #124]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z21.b }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z31.b }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1b	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1d	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1d	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1d	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1h	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1h	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1h	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1h	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1h	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1h	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1h	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1h	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1h	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sb	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sb	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sb	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sb	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sb	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sb	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sb	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sb	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sb	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sh	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sh	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sh	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sh	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sh	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sh	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sw	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sw	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1sw	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1w	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1w	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1w	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1w	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1w	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *             U     ldnf1w	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1b	{ z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1b	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1b	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1b	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1b	{ z21.b }, p5/z, [x10, #7, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1b	{ z23.b }, p3/z, [x13, #-8, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1b	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1b	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1b	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1b	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1d	{ z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1d	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1d	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1d	{ z21.d }, p5/z, [x10, #7, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1d	{ z23.d }, p3/z, [x13, #-8, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1d	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1d	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1h	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1h	{ z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1h	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1h	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1h	{ z21.h }, p5/z, [x10, #7, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1h	{ z23.h }, p3/z, [x13, #-8, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1h	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1h	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1h	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1h	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1sb	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1sb	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1sb	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1sb	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1sb	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1sb	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1sh	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1sh	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1sh	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1sh	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1sh	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1sh	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1sw	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1sw	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1sw	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1w	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1w	{ z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1w	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1w	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1w	{ z21.s }, p5/z, [x10, #7, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ldnt1w	{ z23.s }, p3/z, [x13, #-8, mul vl]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1w	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  1      7     7.00    *                   ldnt1w	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1w	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  1      9     9.00    *                   ldnt1w	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  1      3     1.00    *                   ldr	p0, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ldr	p5, [x10, #255, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ldr	p7, [x13, #-256, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ldr	z0, [x0]
+# CHECK-NEXT:  1      3     1.00    *                   ldr	z23, [x13, #255, mul vl]
+# CHECK-NEXT:  1      3     1.00    *                   ldr	z31, [sp, #-256, mul vl]
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.b, p0/m, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.b, p0/m, z0.b, z1.d
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.b, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.b, z1.b, z2.d
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.d, p0/m, z0.d, #0
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.d, z0.d, #0
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.h, p0/m, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.h, p0/m, z0.h, z1.d
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.h, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.h, z1.h, z2.d
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.s, p0/m, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.s, p0/m, z0.s, z1.d
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.s, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        lsl	z0.s, z1.s, z2.d
+# CHECK-NEXT:  1      3     1.00                        lsl	z31.b, p0/m, z31.b, #7
+# CHECK-NEXT:  1      3     1.00                        lsl	z31.b, z31.b, #7
+# CHECK-NEXT:  1      3     1.00                        lsl	z31.d, p0/m, z31.d, #63
+# CHECK-NEXT:  1      3     1.00                        lsl	z31.d, z31.d, #63
+# CHECK-NEXT:  1      3     1.00                        lsl	z31.h, p0/m, z31.h, #15
+# CHECK-NEXT:  1      3     1.00                        lsl	z31.h, z31.h, #15
+# CHECK-NEXT:  1      3     1.00                        lsl	z31.s, p0/m, z31.s, #31
+# CHECK-NEXT:  1      3     1.00                        lsl	z31.s, z31.s, #31
+# CHECK-NEXT:  1      3     1.00                        lslr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        lslr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        lslr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        lslr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.b, p0/m, z0.b, z1.d
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.b, z0.b, #1
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.b, z1.b, z2.d
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.d, z0.d, #1
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.h, p0/m, z0.h, z1.d
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.h, z0.h, #1
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.h, z1.h, z2.d
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.s, p0/m, z0.s, z1.d
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.s, z0.s, #1
+# CHECK-NEXT:  1      3     1.00                        lsr	z0.s, z1.s, z2.d
+# CHECK-NEXT:  1      3     1.00                        lsr	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  1      3     1.00                        lsr	z31.b, z31.b, #8
+# CHECK-NEXT:  1      3     1.00                        lsr	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  1      3     1.00                        lsr	z31.d, z31.d, #64
+# CHECK-NEXT:  1      3     1.00                        lsr	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  1      3     1.00                        lsr	z31.h, z31.h, #16
+# CHECK-NEXT:  1      3     1.00                        lsr	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  1      3     1.00                        lsr	z31.s, z31.s, #32
+# CHECK-NEXT:  1      3     1.00                        lsrr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        lsrr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        lsrr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        lsrr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  1      4     1.00                        mad	z0.b, p7/m, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        mad	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        mad	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        mad	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      9     2.00                        match	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  1      9     2.00                        match	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  1      9     2.00                        match	p15.b, p7/z, z30.b, z31.b
+# CHECK-NEXT:  1      9     2.00                        match	p15.h, p7/z, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        mla	z0.b, p7/m, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        mla	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        mla	z0.d, z1.d, z7.d[1]
+# CHECK-NEXT:  1      4     1.00                        mla	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        mla	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        mla	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        mla	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  1      4     1.00                        mls	z0.b, p7/m, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        mls	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        mls	z0.d, z1.d, z7.d[1]
+# CHECK-NEXT:  1      4     1.00                        mls	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        mls	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        mls	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        mls	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  1      2     1.00                        mov	p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        mov	p0.b, p0/m, p0.b
+# CHECK-NEXT:  1      2     1.00                        mov	p0.b, p0/z, p0.b
+# CHECK-NEXT:  1      2     1.00                        mov	p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        mov	p15.b, p15/m, p15.b
+# CHECK-NEXT:  1      2     1.00                        mov	p15.b, p15/z, p15.b
+# CHECK-NEXT:  1      3     1.00                        mov	z0.b, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z0.b, b0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.b, p0/m, b0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.b, p0/m, w0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.b, p0/z, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z0.b, w0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.d, #0
+# CHECK-NEXT:  1      4     1.00                        mov	z0.d, #0xe0000000000003ff
+# CHECK-NEXT:  1      4     1.00                        mov	z0.d, #0xffffffffffff7fff
+# CHECK-NEXT:  1      4     1.00                        mov	z0.d, #32768
+# CHECK-NEXT:  1      3     1.00                        mov	z0.d, d0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.d, p0/m, d0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.d, p0/m, x0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.d, x0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        mov	z0.h, #-256
+# CHECK-NEXT:  1      3     1.00                        mov	z0.h, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.h, #32512
+# CHECK-NEXT:  1      4     1.00                        mov	z0.h, #32767
+# CHECK-NEXT:  1      3     1.00                        mov	z0.h, h0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.h, p0/m, h0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.h, p0/m, w0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.h, p0/z, #32512
+# CHECK-NEXT:  1      3     1.00                        mov	z0.h, w0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.q, q0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.s, #0
+# CHECK-NEXT:  1      4     1.00                        mov	z0.s, #0xffff7fff
+# CHECK-NEXT:  1      4     1.00                        mov	z0.s, #32768
+# CHECK-NEXT:  1      3     1.00                        mov	z0.s, p0/m, s0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.s, p0/m, w0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.s, s0
+# CHECK-NEXT:  1      3     1.00                        mov	z0.s, w0
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, #32512
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, p0/z, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, p0/z, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, p0/z, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, p0/z, #32512
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, p15/m, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z21.d, p15/m, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, #32512
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, p0/z, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, p0/z, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, p0/z, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, p0/z, #32512
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, p15/m, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z21.h, p15/m, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, #32512
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, p0/z, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, p0/z, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, p0/z, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, p0/z, #32512
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, p15/m, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z21.s, p15/m, #-32768
+# CHECK-NEXT:  1      3     1.00                        mov	z31.b, p15/m, z31.b
+# CHECK-NEXT:  1      3     1.00                        mov	z31.b, p7/m, b31
+# CHECK-NEXT:  1      3     1.00                        movprfx	z31, z6
+# CHECK-NEXT:  1      3     1.00                        mov	z31.b, p7/m, wsp
+# CHECK-NEXT:  1      3     1.00                        mov	z31.b, wsp
+# CHECK-NEXT:  1      3     1.00                        mov	z31.b, z31.b[63]
+# CHECK-NEXT:  1      3     1.00                        mov	z31.d, p15/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        mov	z31.d, p7/m, d31
+# CHECK-NEXT:  1      3     1.00                        movprfx	z31.d, p7/z, z6.d
+# CHECK-NEXT:  1      3     1.00                        mov	z31.d, p7/m, sp
+# CHECK-NEXT:  1      3     1.00                        mov	z31.d, sp
+# CHECK-NEXT:  1      3     1.00                        mov	z31.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        mov	z31.d, z31.d[7]
+# CHECK-NEXT:  1      3     1.00                        mov	z31.h, p15/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        mov	z31.h, p7/m, h31
+# CHECK-NEXT:  1      3     1.00                        mov	z31.h, p7/m, wsp
+# CHECK-NEXT:  1      3     1.00                        mov	z31.h, wsp
+# CHECK-NEXT:  1      3     1.00                        mov	z31.h, z31.h[31]
+# CHECK-NEXT:  1      3     1.00                        mov	z31.s, p15/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        mov	z31.s, p7/m, s31
+# CHECK-NEXT:  1      3     1.00                        mov	z31.s, p7/m, wsp
+# CHECK-NEXT:  1      3     1.00                        mov	z31.s, wsp
+# CHECK-NEXT:  1      3     1.00                        mov	z31.s, z31.s[15]
+# CHECK-NEXT:  1      3     1.00                        mov	z5.b, #-1
+# CHECK-NEXT:  1      3     1.00                        mov	z5.b, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z5.b, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z5.b, p0/z, #-1
+# CHECK-NEXT:  1      3     1.00                        mov	z5.b, p0/z, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z5.b, p0/z, #127
+# CHECK-NEXT:  1      3     1.00                        mov	z5.b, p15/m, #-128
+# CHECK-NEXT:  1      3     1.00                        mov	z5.d, #-6
+# CHECK-NEXT:  1      3     1.00                        mov	z5.h, #-6
+# CHECK-NEXT:  1      3     1.00                        mov	z5.q, z17.q[3]
+# CHECK-NEXT:  1      3     1.00                        mov	z5.s, #-6
+# CHECK-NEXT:  1      2     1.00                        movs	p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        movs	p0.b, p0/z, p0.b
+# CHECK-NEXT:  1      2     1.00                        movs	p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        movs	p15.b, p15/z, p15.b
+# CHECK-NEXT:  1      1     1.00                  U     mrs	x3, ID_AA64ZFR0_EL1
+# CHECK-NEXT:  1      1     1.00                  U     mrs	x3, ZCR_EL1
+# CHECK-NEXT:  1      1     1.00                  U     mrs	x3, ZCR_EL12
+# CHECK-NEXT:  1      1     1.00                  U     mrs	x3, ZCR_EL2
+# CHECK-NEXT:  1      1     1.00                  U     mrs	x3, ZCR_EL3
+# CHECK-NEXT:  1      4     1.00                        msb	z0.b, p7/m, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        msb	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        msb	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        msb	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  1      1     1.00                  U     msr	ZCR_EL1, x3
+# CHECK-NEXT:  1      1     1.00                  U     msr	ZCR_EL12, x3
+# CHECK-NEXT:  1      1     1.00                  U     msr	ZCR_EL2, x3
+# CHECK-NEXT:  1      1     1.00                  U     msr	ZCR_EL3, x3
+# CHECK-NEXT:  1      4     1.00                        mul	z0.b, p7/m, z0.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        mul	z0.b, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        mul	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        mul	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  1      4     1.00                        mul	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        mul	z0.h, z1.h, z2.h
+# CHECK-NEXT:  1      4     1.00                        mul	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        mul	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        mul	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  1      4     1.00                        mul	z29.s, z30.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        mul	z31.b, z31.b, #-128
+# CHECK-NEXT:  1      4     1.00                        mul	z31.b, z31.b, #127
+# CHECK-NEXT:  1      4     1.00                        mul	z31.d, z31.d, #-128
+# CHECK-NEXT:  1      4     1.00                        mul	z31.d, z31.d, #127
+# CHECK-NEXT:  1      4     1.00                        mul	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        mul	z31.h, z31.h, #-128
+# CHECK-NEXT:  1      4     1.00                        mul	z31.h, z31.h, #127
+# CHECK-NEXT:  1      4     1.00                        mul	z31.s, z31.s, #-128
+# CHECK-NEXT:  1      4     1.00                        mul	z31.s, z31.s, #127
+# CHECK-NEXT:  1      2     1.00                        nand	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        nand	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        nands	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        nands	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      3     1.00                        nbsl	z0.d, z0.d, z1.d, z2.d
+# CHECK-NEXT:  1      3     1.00                        neg	z0.b, p0/m, z0.b
+# CHECK-NEXT:  1      3     1.00                        neg	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      3     1.00                        neg	z0.h, p0/m, z0.h
+# CHECK-NEXT:  1      3     1.00                        neg	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      3     1.00                        neg	z31.b, p7/m, z31.b
+# CHECK-NEXT:  1      3     1.00                        neg	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        neg	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        neg	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      9     2.00                        nmatch	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  1      9     2.00                        nmatch	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  1      9     2.00                        nmatch	p15.b, p7/z, z30.b, z31.b
+# CHECK-NEXT:  1      9     2.00                        nmatch	p15.h, p7/z, z30.h, z31.h
+# CHECK-NEXT:  1      2     1.00                        nor	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        nor	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        nors	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        nors	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        not	p0.b, p0/z, p0.b
+# CHECK-NEXT:  1      2     1.00                        not	p15.b, p15/z, p15.b
+# CHECK-NEXT:  1      3     1.00                        not	z31.b, p7/m, z31.b
+# CHECK-NEXT:  1      3     1.00                        not	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        not	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        not	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      2     1.00                        nots	p0.b, p0/z, p0.b
+# CHECK-NEXT:  1      2     1.00                        nots	p15.b, p15/z, p15.b
+# CHECK-NEXT:  1      2     1.00                        orn	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        orn	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        orns	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        orns	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        orr	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  1      3     1.00                        orr	z0.d, z0.d, #0x6
+# CHECK-NEXT:  1      3     1.00                        orr	z0.d, z0.d, #0xfffffffffffffff9
+# CHECK-NEXT:  1      3     1.00                        orr	z0.s, z0.s, #0x6
+# CHECK-NEXT:  1      3     1.00                        orr	z0.s, z0.s, #0xfffffff9
+# CHECK-NEXT:  1      3     1.00                        orr	z23.d, z13.d, z8.d
+# CHECK-NEXT:  1      3     1.00                        orr	z23.h, z23.h, #0x6
+# CHECK-NEXT:  1      3     1.00                        orr	z23.h, z23.h, #0xfff9
+# CHECK-NEXT:  1      3     1.00                        orr	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        orr	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        orr	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        orr	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        orr	z5.b, z5.b, #0x6
+# CHECK-NEXT:  1      3     1.00                        orr	z5.b, z5.b, #0xf9
+# CHECK-NEXT:  1      2     1.00                        orrs	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  1      4     1.00                        orv	b0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        orv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        orv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        orv	s0, p7, z31.s
+# CHECK-NEXT:  1      2     1.00                        pfalse	p15.b
+# CHECK-NEXT:  1      2     1.00                        pfirst	p0.b, p15, p0.b
+# CHECK-NEXT:  1      2     1.00                        pfirst	p15.b, p15, p15.b
+# CHECK-NEXT:  1      4     1.00                        pmul	z0.b, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        pmul	z29.b, z30.b, z31.b
+# CHECK-NEXT:  1      9     1.00                        pmullb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      9     1.00                        pmullb	z29.q, z30.d, z31.d
+# CHECK-NEXT:  1      9     1.00                        pmullb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      9     1.00                        pmullt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      9     1.00                        pmullt	z29.q, z30.d, z31.d
+# CHECK-NEXT:  1      9     1.00                        pmullt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      2     1.00                        pnext	p0.b, p15, p0.b
+# CHECK-NEXT:  1      2     1.00                        pnext	p0.d, p15, p0.d
+# CHECK-NEXT:  1      2     1.00                        pnext	p0.h, p15, p0.h
+# CHECK-NEXT:  1      2     1.00                        pnext	p0.s, p15, p0.s
+# CHECK-NEXT:  1      2     1.00                        pnext	p15.b, p15, p15.b
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	#14, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	#15, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	#6, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	#7, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	#7, p3, [z13.s, #31]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	#7, p3, [z13.s]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl1keep, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl1keep, p0, [x0, z0.d]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl1keep, p0, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl1keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl1strm, p0, [x0, #-32, mul vl]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl1strm, p0, [x0, #31, mul vl]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl1strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl2keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl2strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl3keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl3strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl3strm, p5, [x10, z21.d, sxtw]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl3strm, p5, [x10, z21.s, uxtw]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl3strm, p5, [z10.d, #31]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pldl3strm, p5, [z10.d]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pstl1keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pstl1strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pstl2keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pstl2strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pstl3keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfb	pstl3strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	#14, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	#15, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	#15, p7, [z31.d, #248]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	#15, p7, [z31.d]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	#15, p7, [z31.s, #248]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	#15, p7, [z31.s]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	#6, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	#7, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl1keep, p0, [x0, z0.d, lsl #3]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl1keep, p0, [x0, z0.d, sxtw #3]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl1keep, p0, [x0, z0.d, uxtw #3]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl1keep, p0, [x0, z0.s, sxtw #3]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl1keep, p0, [x0, z0.s, uxtw #3]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl1keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl1strm, p0, [x0, #-32, mul vl]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl1strm, p0, [x0, #31, mul vl]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl1strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl2keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl2strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl3keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pldl3strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pstl1keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pstl1strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pstl2keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pstl2strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pstl3keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfd	pstl3strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	#14, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	#15, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	#15, p7, [z31.d, #62]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	#15, p7, [z31.d]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	#15, p7, [z31.s, #62]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	#15, p7, [z31.s]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	#6, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	#7, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl1keep, p0, [x0, z0.d, lsl #1]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl1keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl1strm, p0, [x0, #-32, mul vl]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl1strm, p0, [x0, #31, mul vl]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl1strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl2keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl2strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl3keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl3strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl3strm, p5, [x10, z21.d, sxtw #1]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl3strm, p5, [x10, z21.d, uxtw #1]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl3strm, p5, [x10, z21.s, sxtw #1]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pldl3strm, p5, [x10, z21.s, uxtw #1]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pstl1keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pstl1strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pstl2keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pstl2strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pstl3keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfh	pstl3strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	#14, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	#15, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	#15, p7, [z31.d, #124]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	#15, p7, [z31.d]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	#15, p7, [z31.s, #124]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	#15, p7, [z31.s]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	#6, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	#7, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	#7, p3, [x13, z8.d, uxtw #2]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl1keep, p0, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl1keep, p0, [x0, z0.s, uxtw #2]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl1keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl1strm, p0, [x0, #-32, mul vl]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl1strm, p0, [x0, #31, mul vl]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl1strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl2keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl2strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl3keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl3strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl3strm, p5, [x10, z21.d, lsl #2]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pldl3strm, p5, [x10, z21.s, sxtw #2]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pstl1keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pstl1strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pstl2keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pstl2strm, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pstl3keep, p0, [x0]
+# CHECK-NEXT:  1      0     1.00    *      *      U     prfw	pstl3strm, p0, [x0]
+# CHECK-NEXT:  1      2     1.00                        ptest	p15, p0.b
+# CHECK-NEXT:  1      2     1.00                        ptest	p15, p15.b
+# CHECK-NEXT:  1      2     1.00                        ptrue	p0.b, pow2
+# CHECK-NEXT:  1      2     1.00                        ptrue	p0.d, pow2
+# CHECK-NEXT:  1      2     1.00                        ptrue	p0.h, pow2
+# CHECK-NEXT:  1      2     1.00                        ptrue	p0.s, pow2
+# CHECK-NEXT:  1      2     1.00                        ptrue	p15.b
+# CHECK-NEXT:  1      2     1.00                        ptrue	p15.d
+# CHECK-NEXT:  1      2     1.00                        ptrue	p15.h
+# CHECK-NEXT:  1      2     1.00                        ptrue	p15.s
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #14
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #15
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #16
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #17
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #18
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #19
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #20
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #21
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #22
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #23
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #24
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #25
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #26
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #27
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, #28
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, mul3
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, mul4
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl1
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl128
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl16
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl2
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl256
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl3
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl32
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl4
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl5
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl6
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl64
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl7
+# CHECK-NEXT:  1      2     1.00                        ptrue	p7.s, vl8
+# CHECK-NEXT:  1      2     1.00                        ptrues	p0.b, pow2
+# CHECK-NEXT:  1      2     1.00                        ptrues	p0.d, pow2
+# CHECK-NEXT:  1      2     1.00                        ptrues	p0.h, pow2
+# CHECK-NEXT:  1      2     1.00                        ptrues	p0.s, pow2
+# CHECK-NEXT:  1      2     1.00                        ptrues	p15.b
+# CHECK-NEXT:  1      2     1.00                        ptrues	p15.d
+# CHECK-NEXT:  1      2     1.00                        ptrues	p15.h
+# CHECK-NEXT:  1      2     1.00                        ptrues	p15.s
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #14
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #15
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #16
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #17
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #18
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #19
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #20
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #21
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #22
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #23
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #24
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #25
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #26
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #27
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, #28
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, mul3
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, mul4
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl1
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl128
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl16
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl2
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl256
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl3
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl32
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl4
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl5
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl6
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl64
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl7
+# CHECK-NEXT:  1      2     1.00                        ptrues	p7.s, vl8
+# CHECK-NEXT:  1      2     1.00                        punpkhi	p0.h, p0.b
+# CHECK-NEXT:  1      2     1.00                        punpkhi	p15.h, p15.b
+# CHECK-NEXT:  1      2     1.00                        punpklo	p0.h, p0.b
+# CHECK-NEXT:  1      2     1.00                        punpklo	p15.h, p15.b
+# CHECK-NEXT:  1      8     1.00                        raddhnb	z0.b, z1.h, z31.h
+# CHECK-NEXT:  1      8     1.00                        raddhnb	z0.h, z1.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        raddhnb	z0.s, z1.d, z31.d
+# CHECK-NEXT:  1      8     1.00                        raddhnt	z0.b, z1.h, z31.h
+# CHECK-NEXT:  1      8     1.00                        raddhnt	z0.h, z1.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        raddhnt	z0.s, z1.d, z31.d
+# CHECK-NEXT:  1      9     1.00                        rax1	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        rbit	z0.b, p7/m, z31.b
+# CHECK-NEXT:  1      3     1.00                        rbit	z0.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        rbit	z0.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        rbit	z0.s, p7/m, z31.s
+# CHECK-NEXT:  1      1     1.00    *             U     rdffr	p0.b
+# CHECK-NEXT:  1      3     1.00    *             U     rdffr	p0.b, p0/z
+# CHECK-NEXT:  1      1     1.00    *             U     rdffr	p15.b
+# CHECK-NEXT:  1      3     1.00    *             U     rdffr	p15.b, p15/z
+# CHECK-NEXT:  1      3     1.00                  U     rdffrs	p0.b, p0/z
+# CHECK-NEXT:  1      3     1.00                  U     rdffrs	p15.b, p15/z
+# CHECK-NEXT:  1      1     1.00                        rdvl	x0, #0
+# CHECK-NEXT:  1      1     1.00                        rdvl	x21, #-32
+# CHECK-NEXT:  1      1     1.00                        rdvl	x23, #31
+# CHECK-NEXT:  1      1     1.00                        rdvl	xzr, #-1
+# CHECK-NEXT:  1      3     1.00                        rev	z0.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        rev	z0.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        rev	z0.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        rev	z0.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        revb	z0.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        revb	z0.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        revb	z0.s, p7/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        revh	z0.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        revh	z0.s, p7/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        revw	z0.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        rshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        rshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        rshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        rshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        rshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        rshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        rshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        rshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        rshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        rshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        rshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        rshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  1      8     1.00                        rsubhnb	z0.b, z1.h, z31.h
+# CHECK-NEXT:  1      8     1.00                        rsubhnb	z0.h, z1.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        rsubhnb	z0.s, z1.d, z31.d
+# CHECK-NEXT:  1      8     1.00                        rsubhnt	z0.b, z1.h, z31.h
+# CHECK-NEXT:  1      8     1.00                        rsubhnt	z0.h, z1.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        rsubhnt	z0.s, z1.d, z31.d
+# CHECK-NEXT:  1      6     2.00                        saba	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      6     2.00                        saba	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      6     2.00                        saba	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      6     2.00                        saba	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      6     2.00                        sabalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      6     2.00                        sabalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      6     2.00                        sabalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      6     2.00                        sabalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      6     2.00                        sabalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      6     2.00                        sabalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        sabd	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        sabd	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        sabd	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        sabd	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        sabdlb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      3     1.00                        sabdlb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        sabdlb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        sabdlt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      3     1.00                        sabdlt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        sabdlt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      7     2.00                        sadalp	z0.h, p0/m, z1.b
+# CHECK-NEXT:  1      7     2.00                        sadalp	z29.s, p0/m, z30.h
+# CHECK-NEXT:  1      7     2.00                        sadalp	z30.d, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        saddlb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        saddlb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        saddlb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        saddlbt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        saddlbt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        saddlbt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        saddlt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        saddlt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        saddlt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        saddv	d0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        saddv	d0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        saddv	d0, p7, z31.s
+# CHECK-NEXT:  1      4     1.00                        saddwb	z0.h, z1.h, z2.b
+# CHECK-NEXT:  1      4     1.00                        saddwb	z29.s, z30.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        saddwb	z31.d, z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        saddwt	z0.h, z1.h, z2.b
+# CHECK-NEXT:  1      4     1.00                        saddwt	z29.s, z30.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        saddwt	z31.d, z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        sbclb	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        sbclb	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sbclt	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        sbclt	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        scvtf	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        scvtf	z0.d, p0/m, z0.s
+# CHECK-NEXT:  1      4     1.00                        scvtf	z0.h, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        scvtf	z0.h, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        scvtf	z0.h, p0/m, z0.s
+# CHECK-NEXT:  1      4     1.00                        scvtf	z0.s, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        scvtf	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      26    23.00                       sdiv	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      15    12.00                       sdiv	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      26    23.00                       sdivr	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      15    12.00                       sdivr	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sdot	z0.d, z1.h, z15.h[1]
+# CHECK-NEXT:  1      4     1.00                        sdot	z0.d, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sdot	z0.s, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        sdot	z0.s, z1.b, z7.b[3]
+# CHECK-NEXT:  1      3     1.00                        sel	z23.b, p11, z13.b, z8.b
+# CHECK-NEXT:  1      3     1.00                        sel	z23.d, p11, z13.d, z8.d
+# CHECK-NEXT:  1      3     1.00                        sel	z23.h, p11, z13.h, z8.h
+# CHECK-NEXT:  1      3     1.00                        sel	z23.s, p11, z13.s, z8.s
+# CHECK-NEXT:  1      1     1.00           *      U     setffr
+# CHECK-NEXT:  1      3     1.00                        shadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        shadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        shadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        shadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      3     1.00                        shrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  1      3     1.00                        shrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  1      3     1.00                        shrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  1      3     1.00                        shrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  1      3     1.00                        shrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  1      3     1.00                        shrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  1      3     1.00                        shrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  1      3     1.00                        shrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  1      3     1.00                        shrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  1      3     1.00                        shrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  1      3     1.00                        shrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  1      3     1.00                        shrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  1      3     1.00                        shsub	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        shsub	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        shsub	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        shsub	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      3     1.00                        shsubr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        shsubr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        shsubr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        shsubr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      3     1.00                        sli	z0.b, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        sli	z0.d, z0.d, #0
+# CHECK-NEXT:  1      3     1.00                        sli	z0.h, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        sli	z0.s, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        sli	z31.b, z31.b, #7
+# CHECK-NEXT:  1      3     1.00                        sli	z31.d, z31.d, #63
+# CHECK-NEXT:  1      3     1.00                        sli	z31.h, z31.h, #15
+# CHECK-NEXT:  1      3     1.00                        sli	z31.s, z31.s, #31
+# CHECK-NEXT:  1      9     1.00                        sm4e	z0.s, z0.s, z31.s
+# CHECK-NEXT:  1      9     1.00                        sm4ekey	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        smax	z0.b, z0.b, #-128
+# CHECK-NEXT:  1      3     1.00                        smax	z0.d, z0.d, #-128
+# CHECK-NEXT:  1      3     1.00                        smax	z0.h, z0.h, #-128
+# CHECK-NEXT:  1      3     1.00                        smax	z0.s, z0.s, #-128
+# CHECK-NEXT:  1      3     1.00                        smax	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        smax	z31.b, z31.b, #127
+# CHECK-NEXT:  1      3     1.00                        smax	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        smax	z31.d, z31.d, #127
+# CHECK-NEXT:  1      3     1.00                        smax	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        smax	z31.h, z31.h, #127
+# CHECK-NEXT:  1      3     1.00                        smax	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        smax	z31.s, z31.s, #127
+# CHECK-NEXT:  1      3     1.00                        smaxp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        smaxp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        smaxp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        smaxp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        smaxv	b0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        smaxv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        smaxv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        smaxv	s0, p7, z31.s
+# CHECK-NEXT:  1      3     1.00                        smin	z0.b, z0.b, #-128
+# CHECK-NEXT:  1      3     1.00                        smin	z0.d, z0.d, #-128
+# CHECK-NEXT:  1      3     1.00                        smin	z0.h, z0.h, #-128
+# CHECK-NEXT:  1      3     1.00                        smin	z0.s, z0.s, #-128
+# CHECK-NEXT:  1      3     1.00                        smin	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        smin	z31.b, z31.b, #127
+# CHECK-NEXT:  1      3     1.00                        smin	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        smin	z31.d, z31.d, #127
+# CHECK-NEXT:  1      3     1.00                        smin	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        smin	z31.h, z31.h, #127
+# CHECK-NEXT:  1      3     1.00                        smin	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        smin	z31.s, z31.s, #127
+# CHECK-NEXT:  1      3     1.00                        sminp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        sminp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        sminp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        sminp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        sminv	b0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        sminv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        sminv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        sminv	s0, p7, z31.s
+# CHECK-NEXT:  1      4     1.00                        smlalb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        smlalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        smlalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        smlalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        smlalb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        smlalt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        smlalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        smlalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        smlalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        smlalt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        smlslb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        smlslb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        smlslb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        smlslb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        smlslb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        smlslt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        smlslt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        smlslt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        smlslt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        smlslt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        smulh	z0.b, p7/m, z0.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        smulh	z0.b, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        smulh	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        smulh	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        smulh	z0.h, z1.h, z2.h
+# CHECK-NEXT:  1      4     1.00                        smulh	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        smulh	z29.s, z30.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        smulh	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        smullb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        smullb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        smullb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        smullb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        smullb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        smullt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        smullt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        smullt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        smullt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        smullt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        splice	z29.b, p7, { z30.b, z31.b }
+# CHECK-NEXT:  1      4     1.00                        splice	z29.d, p7, { z30.d, z31.d }
+# CHECK-NEXT:  1      4     1.00                        splice	z29.h, p7, { z30.h, z31.h }
+# CHECK-NEXT:  1      4     1.00                        splice	z29.s, p7, { z30.s, z31.s }
+# CHECK-NEXT:  1      4     1.00                        splice	z31.b, p7, z31.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        splice	z31.d, p7, z31.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        splice	z31.h, p7, z31.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        splice	z31.s, p7, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqabs	z31.b, p7/m, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqabs	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        sqabs	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqabs	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.b, z0.b, #0
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.b, z0.b, z0.b
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.d, z0.d, #0
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.h, z0.h, #0
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.h, z0.h, z0.h
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.s, z0.s, #0
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        sqadd	z0.s, z0.s, z0.s
+# CHECK-NEXT:  1      4     1.00                        sqadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        sqadd	z31.b, z31.b, #255
+# CHECK-NEXT:  1      4     1.00                        sqadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        sqadd	z31.d, z31.d, #65280
+# CHECK-NEXT:  1      4     1.00                        sqadd	z31.h, z31.h, #65280
+# CHECK-NEXT:  1      4     1.00                        sqadd	z31.s, z31.s, #65280
+# CHECK-NEXT:  1      4     1.00                        sqcadd	z0.b, z0.b, z0.b, #90
+# CHECK-NEXT:  1      4     1.00                        sqcadd	z0.d, z0.d, z0.d, #90
+# CHECK-NEXT:  1      4     1.00                        sqcadd	z0.h, z0.h, z0.h, #90
+# CHECK-NEXT:  1      4     1.00                        sqcadd	z0.s, z0.s, z0.s, #90
+# CHECK-NEXT:  1      4     1.00                        sqcadd	z31.b, z31.b, z31.b, #270
+# CHECK-NEXT:  1      4     1.00                        sqcadd	z31.d, z31.d, z31.d, #270
+# CHECK-NEXT:  1      4     1.00                        sqcadd	z31.h, z31.h, z31.h, #270
+# CHECK-NEXT:  1      4     1.00                        sqcadd	z31.s, z31.s, z31.s, #270
+# CHECK-NEXT:  1      5     1.00                        sqdecb	x0
+# CHECK-NEXT:  1      5     1.00                        sqdecb	x0, #14
+# CHECK-NEXT:  1      5     1.00                        sqdecb	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdecb	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqdecb	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        sqdecb	x0, w0
+# CHECK-NEXT:  1      5     1.00                        sqdecb	x0, w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdecb	x0, w0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqdecb	x0, w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdecd	x0
+# CHECK-NEXT:  1      5     1.00                        sqdecd	x0, #14
+# CHECK-NEXT:  1      5     1.00                        sqdecd	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdecd	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqdecd	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        sqdecd	x0, w0
+# CHECK-NEXT:  1      5     1.00                        sqdecd	x0, w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdecd	x0, w0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqdecd	x0, w0, pow2, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqdecd	z0.d
+# CHECK-NEXT:  1      4     1.00                        sqdecd	z0.d, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqdecd	z0.d, pow2
+# CHECK-NEXT:  1      4     1.00                        sqdecd	z0.d, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdech	x0
+# CHECK-NEXT:  1      5     1.00                        sqdech	x0, #14
+# CHECK-NEXT:  1      5     1.00                        sqdech	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdech	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqdech	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        sqdech	x0, w0
+# CHECK-NEXT:  1      5     1.00                        sqdech	x0, w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdech	x0, w0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqdech	x0, w0, pow2, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqdech	z0.h
+# CHECK-NEXT:  1      4     1.00                        sqdech	z0.h, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqdech	z0.h, pow2
+# CHECK-NEXT:  1      4     1.00                        sqdech	z0.h, pow2, mul #16
+# CHECK-NEXT:  1      9     1.00                        sqdecp	x0, p0.b
+# CHECK-NEXT:  1      9     1.00                        sqdecp	x0, p0.d
+# CHECK-NEXT:  1      9     1.00                        sqdecp	x0, p0.h
+# CHECK-NEXT:  1      9     1.00                        sqdecp	x0, p0.s
+# CHECK-NEXT:  1      9     1.00                        sqdecp	xzr, p15.b, wzr
+# CHECK-NEXT:  1      9     1.00                        sqdecp	xzr, p15.d, wzr
+# CHECK-NEXT:  1      9     1.00                        sqdecp	xzr, p15.h, wzr
+# CHECK-NEXT:  1      9     1.00                        sqdecp	xzr, p15.s, wzr
+# CHECK-NEXT:  1      3     1.00                        sqdecp	z0.d, p0.d
+# CHECK-NEXT:  1      3     1.00                        sqdecp	z0.h, p0.h
+# CHECK-NEXT:  1      3     1.00                        sqdecp	z0.s, p0.s
+# CHECK-NEXT:  1      5     1.00                        sqdecw	x0
+# CHECK-NEXT:  1      5     1.00                        sqdecw	x0, #14
+# CHECK-NEXT:  1      5     1.00                        sqdecw	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdecw	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqdecw	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        sqdecw	x0, w0
+# CHECK-NEXT:  1      5     1.00                        sqdecw	x0, w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqdecw	x0, w0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqdecw	x0, w0, pow2, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqdecw	z0.s
+# CHECK-NEXT:  1      4     1.00                        sqdecw	z0.s, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqdecw	z0.s, pow2
+# CHECK-NEXT:  1      4     1.00                        sqdecw	z0.s, pow2, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqdmlalb	z0.d, z1.s, z15.s[3]
+# CHECK-NEXT:  1      4     1.00                        sqdmlalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqdmlalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqdmlalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqdmlalb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqdmlalbt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqdmlalbt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqdmlalbt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqdmlalt	z0.d, z1.s, z15.s[3]
+# CHECK-NEXT:  1      4     1.00                        sqdmlalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqdmlalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqdmlalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqdmlalt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqdmlslb	z0.d, z1.s, z15.s[3]
+# CHECK-NEXT:  1      4     1.00                        sqdmlslb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqdmlslb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqdmlslb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqdmlslb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqdmlslbt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqdmlslbt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqdmlslbt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqdmlslt	z0.d, z1.s, z15.s[3]
+# CHECK-NEXT:  1      4     1.00                        sqdmlslt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqdmlslt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqdmlslt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqdmlslt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	z0.b, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	z0.h, z1.h, z2.h
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	z29.s, z30.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqdmulh	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        sqdmullb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        sqdmullb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        sqdmullb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqdmullb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqdmullb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqdmullt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        sqdmullt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        sqdmullt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqdmullt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqdmullt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      5     1.00                        sqincb	x0
+# CHECK-NEXT:  1      5     1.00                        sqincb	x0, #14
+# CHECK-NEXT:  1      5     1.00                        sqincb	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqincb	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqincb	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        sqincb	x0, w0
+# CHECK-NEXT:  1      5     1.00                        sqincb	x0, w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqincb	x0, w0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqincb	x0, w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqincd	x0
+# CHECK-NEXT:  1      5     1.00                        sqincd	x0, #14
+# CHECK-NEXT:  1      5     1.00                        sqincd	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqincd	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqincd	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        sqincd	x0, w0
+# CHECK-NEXT:  1      5     1.00                        sqincd	x0, w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqincd	x0, w0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqincd	x0, w0, pow2, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqincd	z0.d
+# CHECK-NEXT:  1      4     1.00                        sqincd	z0.d, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqincd	z0.d, pow2
+# CHECK-NEXT:  1      4     1.00                        sqincd	z0.d, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqinch	x0
+# CHECK-NEXT:  1      5     1.00                        sqinch	x0, #14
+# CHECK-NEXT:  1      5     1.00                        sqinch	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqinch	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqinch	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        sqinch	x0, w0
+# CHECK-NEXT:  1      5     1.00                        sqinch	x0, w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqinch	x0, w0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqinch	x0, w0, pow2, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqinch	z0.h
+# CHECK-NEXT:  1      4     1.00                        sqinch	z0.h, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqinch	z0.h, pow2
+# CHECK-NEXT:  1      4     1.00                        sqinch	z0.h, pow2, mul #16
+# CHECK-NEXT:  1      9     1.00                        sqincp	x0, p0.b
+# CHECK-NEXT:  1      9     1.00                        sqincp	x0, p0.d
+# CHECK-NEXT:  1      9     1.00                        sqincp	x0, p0.h
+# CHECK-NEXT:  1      9     1.00                        sqincp	x0, p0.s
+# CHECK-NEXT:  1      9     1.00                        sqincp	xzr, p15.b, wzr
+# CHECK-NEXT:  1      9     1.00                        sqincp	xzr, p15.d, wzr
+# CHECK-NEXT:  1      9     1.00                        sqincp	xzr, p15.h, wzr
+# CHECK-NEXT:  1      9     1.00                        sqincp	xzr, p15.s, wzr
+# CHECK-NEXT:  1      3     1.00                        sqincp	z0.d, p0.d
+# CHECK-NEXT:  1      3     1.00                        sqincp	z0.h, p0.h
+# CHECK-NEXT:  1      3     1.00                        sqincp	z0.s, p0.s
+# CHECK-NEXT:  1      5     1.00                        sqincw	x0
+# CHECK-NEXT:  1      5     1.00                        sqincw	x0, #14
+# CHECK-NEXT:  1      5     1.00                        sqincw	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqincw	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqincw	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        sqincw	x0, w0
+# CHECK-NEXT:  1      5     1.00                        sqincw	x0, w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        sqincw	x0, w0, pow2
+# CHECK-NEXT:  1      5     1.00                        sqincw	x0, w0, pow2, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqincw	z0.s
+# CHECK-NEXT:  1      4     1.00                        sqincw	z0.s, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqincw	z0.s, pow2
+# CHECK-NEXT:  1      4     1.00                        sqincw	z0.s, pow2, mul #16
+# CHECK-NEXT:  1      4     1.00                        sqneg	z31.b, p7/m, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqneg	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        sqneg	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqneg	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z0.b, z1.b, z2.b, #0
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z0.d, z1.d, z2.d, #0
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z0.h, z1.h, z2.h, #0
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z0.h, z1.h, z2.h[0], #0
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z0.s, z1.s, z2.s, #0
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z0.s, z1.s, z2.s[0], #0
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z15.b, z16.b, z17.b, #270
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z15.d, z16.d, z17.d, #270
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z15.h, z16.h, z17.h, #270
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z15.s, z16.s, z17.s, #270
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z29.b, z30.b, z31.b, #90
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z29.d, z30.d, z31.d, #90
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z29.h, z30.h, z31.h, #90
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z29.s, z30.s, z31.s, #90
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z31.b, z31.b, z31.b, #180
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z31.d, z31.d, z31.d, #180
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z31.h, z30.h, z7.h[0], #180
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z31.h, z31.h, z31.h, #180
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z31.s, z30.s, z7.s[0], #180
+# CHECK-NEXT:  1      4     1.00                        sqrdcmlah	z31.s, z31.s, z31.s, #180
+# CHECK-NEXT:  1      4     1.00                        sqrdmlah	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqrdmlah	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  1      4     1.00                        sqrdmlah	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        sqrdmlah	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqrdmlah	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqrdmlah	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqrdmlah	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  1      4     1.00                        sqrdmlsh	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        sqrdmlsh	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  1      4     1.00                        sqrdmlsh	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        sqrdmlsh	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqrdmlsh	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqrdmlsh	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqrdmlsh	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	z0.b, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	z0.h, z1.h, z2.h
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	z29.s, z30.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqrdmulh	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        sqrshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        sqrshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        sqrshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        sqrshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        sqrshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        sqrshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        sqrshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        sqrshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        sqrshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        sqrshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        sqrshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        sqrshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        sqrshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        sqrshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        sqrshrunb	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrunb	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrunb	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrunb	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        sqrshrunb	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        sqrshrunb	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        sqrshrunt	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrunt	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrunt	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        sqrshrunt	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        sqrshrunt	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        sqrshrunt	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        sqshl	z0.b, p0/m, z0.b, #0
+# CHECK-NEXT:  1      4     1.00                        sqshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        sqshl	z0.d, p0/m, z0.d, #0
+# CHECK-NEXT:  1      4     1.00                        sqshl	z0.h, p0/m, z0.h, #0
+# CHECK-NEXT:  1      4     1.00                        sqshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        sqshl	z0.s, p0/m, z0.s, #0
+# CHECK-NEXT:  1      4     1.00                        sqshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        sqshl	z31.b, p0/m, z31.b, #7
+# CHECK-NEXT:  1      4     1.00                        sqshl	z31.d, p0/m, z31.d, #63
+# CHECK-NEXT:  1      4     1.00                        sqshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        sqshl	z31.h, p0/m, z31.h, #15
+# CHECK-NEXT:  1      4     1.00                        sqshl	z31.s, p0/m, z31.s, #31
+# CHECK-NEXT:  1      4     1.00                        sqshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        sqshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        sqshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        sqshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        sqshlu	z0.b, p0/m, z0.b, #0
+# CHECK-NEXT:  1      4     1.00                        sqshlu	z0.d, p0/m, z0.d, #0
+# CHECK-NEXT:  1      4     1.00                        sqshlu	z0.h, p0/m, z0.h, #0
+# CHECK-NEXT:  1      4     1.00                        sqshlu	z0.s, p0/m, z0.s, #0
+# CHECK-NEXT:  1      4     1.00                        sqshlu	z31.b, p0/m, z31.b, #7
+# CHECK-NEXT:  1      4     1.00                        sqshlu	z31.d, p0/m, z31.d, #63
+# CHECK-NEXT:  1      4     1.00                        sqshlu	z31.h, p0/m, z31.h, #15
+# CHECK-NEXT:  1      4     1.00                        sqshlu	z31.s, p0/m, z31.s, #31
+# CHECK-NEXT:  1      4     1.00                        sqshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        sqshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        sqshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        sqshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        sqshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        sqshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        sqshrunb	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrunb	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrunb	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrunb	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        sqshrunb	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        sqshrunb	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        sqshrunt	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrunt	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrunt	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        sqshrunt	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        sqshrunt	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        sqshrunt	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.b, z0.b, #0
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.b, z0.b, z0.b
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.d, z0.d, #0
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.h, z0.h, #0
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.h, z0.h, z0.h
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.s, z0.s, #0
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        sqsub	z0.s, z0.s, z0.s
+# CHECK-NEXT:  1      4     1.00                        sqsub	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        sqsub	z31.b, z31.b, #255
+# CHECK-NEXT:  1      4     1.00                        sqsub	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        sqsub	z31.d, z31.d, #65280
+# CHECK-NEXT:  1      4     1.00                        sqsub	z31.h, z31.h, #65280
+# CHECK-NEXT:  1      4     1.00                        sqsub	z31.s, z31.s, #65280
+# CHECK-NEXT:  1      4     1.00                        sqsubr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        sqsubr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        sqsubr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        sqsubr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        sqxtnb	z0.b, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqxtnb	z0.h, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqxtnb	z0.s, z31.d
+# CHECK-NEXT:  1      4     1.00                        sqxtnt	z0.b, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqxtnt	z0.h, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqxtnt	z0.s, z31.d
+# CHECK-NEXT:  1      4     1.00                        sqxtunb	z0.b, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqxtunb	z0.h, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqxtunb	z0.s, z31.d
+# CHECK-NEXT:  1      4     1.00                        sqxtunt	z0.b, z31.h
+# CHECK-NEXT:  1      4     1.00                        sqxtunt	z0.h, z31.s
+# CHECK-NEXT:  1      4     1.00                        sqxtunt	z0.s, z31.d
+# CHECK-NEXT:  1      4     1.00                        srhadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        srhadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        srhadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        srhadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      3     1.00                        sri	z0.b, z0.b, #1
+# CHECK-NEXT:  1      3     1.00                        sri	z0.d, z0.d, #1
+# CHECK-NEXT:  1      3     1.00                        sri	z0.h, z0.h, #1
+# CHECK-NEXT:  1      3     1.00                        sri	z0.s, z0.s, #1
+# CHECK-NEXT:  1      3     1.00                        sri	z31.b, z31.b, #8
+# CHECK-NEXT:  1      3     1.00                        sri	z31.d, z31.d, #64
+# CHECK-NEXT:  1      3     1.00                        sri	z31.h, z31.h, #16
+# CHECK-NEXT:  1      3     1.00                        sri	z31.s, z31.s, #32
+# CHECK-NEXT:  1      4     1.00                        srshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        srshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        srshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        srshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        srshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        srshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        srshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        srshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        srshr	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  1      4     1.00                        srshr	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        srshr	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        srshr	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        srshr	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  1      4     1.00                        srshr	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  1      4     1.00                        srshr	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  1      4     1.00                        srshr	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  1      7     2.00                        srsra	z0.b, z0.b, #1
+# CHECK-NEXT:  1      7     2.00                        srsra	z0.d, z0.d, #1
+# CHECK-NEXT:  1      7     2.00                        srsra	z0.h, z0.h, #1
+# CHECK-NEXT:  1      7     2.00                        srsra	z0.s, z0.s, #1
+# CHECK-NEXT:  1      7     2.00                        srsra	z31.b, z31.b, #8
+# CHECK-NEXT:  1      7     2.00                        srsra	z31.d, z31.d, #64
+# CHECK-NEXT:  1      7     2.00                        srsra	z31.h, z31.h, #16
+# CHECK-NEXT:  1      7     2.00                        srsra	z31.s, z31.s, #32
+# CHECK-NEXT:  1      3     1.00                        sshllb	z0.d, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        sshllb	z0.h, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        sshllb	z0.s, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        sshllb	z31.d, z31.s, #31
+# CHECK-NEXT:  1      3     1.00                        sshllb	z31.h, z31.b, #7
+# CHECK-NEXT:  1      3     1.00                        sshllb	z31.s, z31.h, #15
+# CHECK-NEXT:  1      3     1.00                        sshllt	z0.d, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        sshllt	z0.h, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        sshllt	z0.s, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        sshllt	z31.d, z31.s, #31
+# CHECK-NEXT:  1      3     1.00                        sshllt	z31.h, z31.b, #7
+# CHECK-NEXT:  1      3     1.00                        sshllt	z31.s, z31.h, #15
+# CHECK-NEXT:  1      4     1.00                        ssra	z0.b, z0.b, #1
+# CHECK-NEXT:  1      4     1.00                        ssra	z0.d, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        ssra	z0.h, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        ssra	z0.s, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        ssra	z31.b, z31.b, #8
+# CHECK-NEXT:  1      4     1.00                        ssra	z31.d, z31.d, #64
+# CHECK-NEXT:  1      4     1.00                        ssra	z31.h, z31.h, #16
+# CHECK-NEXT:  1      4     1.00                        ssra	z31.s, z31.s, #32
+# CHECK-NEXT:  1      4     1.00                        ssublb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        ssublb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        ssublb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        ssublbt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        ssublbt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        ssublbt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        ssublt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        ssublt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        ssublt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        ssubltb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        ssubltb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        ssubltb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        ssubwb	z0.h, z1.h, z2.b
+# CHECK-NEXT:  1      4     1.00                        ssubwb	z29.s, z30.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        ssubwb	z31.d, z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        ssubwt	z0.h, z1.h, z2.b
+# CHECK-NEXT:  1      4     1.00                        ssubwt	z29.s, z30.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        ssubwt	z31.d, z31.d, z31.s
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z0.b }, p0, [x0, x0]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z0.b }, p0, [x0]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z0.d }, p0, [x0, x0]
+# CHECK-NEXT:  1      1     8.00           *            st1b	{ z0.d }, p0, [x0, z0.d, sxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1b	{ z0.d }, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1b	{ z0.d }, p0, [x0, z0.d]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  1      1     7.00           *            st1b	{ z0.d }, p7, [z0.d]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z0.h }, p0, [x0, x0]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z0.h }, p0, [x0]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z0.s }, p0, [x0, x0]
+# CHECK-NEXT:  1      1     8.00           *            st1b	{ z0.s }, p0, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1b	{ z0.s }, p0, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z0.s }, p0, [x0]
+# CHECK-NEXT:  1      1     9.00           *            st1b	{ z0.s }, p7, [z0.s]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z21.b }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z21.d }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z21.h }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z21.s }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z31.b }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z31.d }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     7.00           *            st1b	{ z31.d }, p7, [z31.d, #31]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z31.h }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1b	{ z31.s }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     9.00           *            st1b	{ z31.s }, p7, [z31.s, #31]
+# CHECK-NEXT:  1      1     1.00           *            st1d	{ z0.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      1     8.00           *            st1d	{ z0.d }, p0, [x0, z0.d, lsl #3]
+# CHECK-NEXT:  1      1     8.00           *            st1d	{ z0.d }, p0, [x0, z0.d, sxtw #3]
+# CHECK-NEXT:  1      1     8.00           *            st1d	{ z0.d }, p0, [x0, z0.d, sxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1d	{ z0.d }, p0, [x0, z0.d, uxtw #3]
+# CHECK-NEXT:  1      1     8.00           *            st1d	{ z0.d }, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1d	{ z0.d }, p0, [x0, z0.d]
+# CHECK-NEXT:  1      1     1.00           *            st1d	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  1      1     7.00           *            st1d	{ z0.d }, p7, [z0.d]
+# CHECK-NEXT:  1      1     1.00           *            st1d	{ z21.d }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1d	{ z31.d }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     7.00           *            st1d	{ z31.d }, p7, [z31.d, #248]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z0.d }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.d }, p0, [x0, z0.d, lsl #1]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.d }, p0, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.d }, p0, [x0, z0.d, sxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.d }, p0, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.d }, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.d }, p0, [x0, z0.d]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  1      1     7.00           *            st1h	{ z0.d }, p7, [z0.d]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z0.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z0.h }, p0, [x0]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z0.s }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.s }, p0, [x0, z0.s, sxtw #1]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.s }, p0, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.s }, p0, [x0, z0.s, uxtw #1]
+# CHECK-NEXT:  1      1     8.00           *            st1h	{ z0.s }, p0, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z0.s }, p0, [x0]
+# CHECK-NEXT:  1      1     9.00           *            st1h	{ z0.s }, p7, [z0.s]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z21.d }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z21.h }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z21.s }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z31.d }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     7.00           *            st1h	{ z31.d }, p7, [z31.d, #62]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z31.h }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1h	{ z31.s }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     9.00           *            st1h	{ z31.s }, p7, [z31.s, #62]
+# CHECK-NEXT:  1      1     1.00           *            st1w	{ z0.d }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.d }, p0, [x0, z0.d, lsl #2]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.d }, p0, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.d }, p0, [x0, z0.d, sxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.d }, p0, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.d }, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.d }, p0, [x0, z0.d]
+# CHECK-NEXT:  1      1     1.00           *            st1w	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  1      1     7.00           *            st1w	{ z0.d }, p7, [z0.d]
+# CHECK-NEXT:  1      1     1.00           *            st1w	{ z0.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.s }, p0, [x0, z0.s, sxtw #2]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.s }, p0, [x0, z0.s, sxtw]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.s }, p0, [x0, z0.s, uxtw #2]
+# CHECK-NEXT:  1      1     8.00           *            st1w	{ z0.s }, p0, [x0, z0.s, uxtw]
+# CHECK-NEXT:  1      1     1.00           *            st1w	{ z0.s }, p0, [x0]
+# CHECK-NEXT:  1      1     9.00           *            st1w	{ z0.s }, p7, [z0.s]
+# CHECK-NEXT:  1      1     1.00           *            st1w	{ z21.d }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1w	{ z21.s }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            st1w	{ z31.d }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     7.00           *            st1w	{ z31.d }, p7, [z31.d, #124]
+# CHECK-NEXT:  1      1     1.00           *            st1w	{ z31.s }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  1      1     9.00           *            st1w	{ z31.s }, p7, [z31.s, #124]
+# CHECK-NEXT:  1      1     11.00          *            st2b	{ z0.b, z1.b }, p0, [x0, x0]
+# CHECK-NEXT:  1      1     11.00          *            st2b	{ z0.b, z1.b }, p0, [x0]
+# CHECK-NEXT:  1      1     11.00          *            st2b	{ z21.b, z22.b }, p5, [x10, #10, mul vl]
+# CHECK-NEXT:  1      1     11.00          *            st2b	{ z23.b, z24.b }, p3, [x13, #-16, mul vl]
+# CHECK-NEXT:  1      1     11.00          *            st2b	{ z5.b, z6.b }, p3, [x17, x16]
+# CHECK-NEXT:  1      1     11.00          *            st2d	{ z0.d, z1.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      1     11.00          *            st2d	{ z0.d, z1.d }, p0, [x0]
+# CHECK-NEXT:  1      1     11.00          *            st2d	{ z21.d, z22.d }, p5, [x10, #10, mul vl]
+# CHECK-NEXT:  1      1     11.00          *            st2d	{ z23.d, z24.d }, p3, [x13, #-16, mul vl]
+# CHECK-NEXT:  1      1     11.00          *            st2d	{ z5.d, z6.d }, p3, [x17, x16, lsl #3]
+# CHECK-NEXT:  1      1     11.00          *            st2h	{ z0.h, z1.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      1     11.00          *            st2h	{ z0.h, z1.h }, p0, [x0]
+# CHECK-NEXT:  1      1     11.00          *            st2h	{ z21.h, z22.h }, p5, [x10, #10, mul vl]
+# CHECK-NEXT:  1      1     11.00          *            st2h	{ z23.h, z24.h }, p3, [x13, #-16, mul vl]
+# CHECK-NEXT:  1      1     11.00          *            st2h	{ z5.h, z6.h }, p3, [x17, x16, lsl #1]
+# CHECK-NEXT:  1      1     11.00          *            st2w	{ z0.s, z1.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      1     11.00          *            st2w	{ z0.s, z1.s }, p0, [x0]
+# CHECK-NEXT:  1      1     11.00          *            st2w	{ z21.s, z22.s }, p5, [x10, #10, mul vl]
+# CHECK-NEXT:  1      1     11.00          *            st2w	{ z23.s, z24.s }, p3, [x13, #-16, mul vl]
+# CHECK-NEXT:  1      1     11.00          *            st2w	{ z5.s, z6.s }, p3, [x17, x16, lsl #2]
+# CHECK-NEXT:  1      1     25.00          *            st3b	{ z0.b - z2.b }, p0, [x0, x0]
+# CHECK-NEXT:  1      1     25.00          *            st3b	{ z0.b - z2.b }, p0, [x0]
+# CHECK-NEXT:  1      1     25.00          *            st3b	{ z21.b - z23.b }, p5, [x10, #15, mul vl]
+# CHECK-NEXT:  1      1     25.00          *            st3b	{ z23.b - z25.b }, p3, [x13, #-24, mul vl]
+# CHECK-NEXT:  1      1     25.00          *            st3b	{ z5.b - z7.b }, p3, [x17, x16]
+# CHECK-NEXT:  1      1     14.00          *            st3d	{ z0.d - z2.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      1     14.00          *            st3d	{ z0.d - z2.d }, p0, [x0]
+# CHECK-NEXT:  1      1     14.00          *            st3d	{ z21.d - z23.d }, p5, [x10, #15, mul vl]
+# CHECK-NEXT:  1      1     14.00          *            st3d	{ z23.d - z25.d }, p3, [x13, #-24, mul vl]
+# CHECK-NEXT:  1      1     14.00          *            st3d	{ z5.d - z7.d }, p3, [x17, x16, lsl #3]
+# CHECK-NEXT:  1      1     25.00          *            st3h	{ z0.h - z2.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      1     25.00          *            st3h	{ z0.h - z2.h }, p0, [x0]
+# CHECK-NEXT:  1      1     25.00          *            st3h	{ z21.h - z23.h }, p5, [x10, #15, mul vl]
+# CHECK-NEXT:  1      1     25.00          *            st3h	{ z23.h - z25.h }, p3, [x13, #-24, mul vl]
+# CHECK-NEXT:  1      1     25.00          *            st3h	{ z5.h - z7.h }, p3, [x17, x16, lsl #1]
+# CHECK-NEXT:  1      1     25.00          *            st3w	{ z0.s - z2.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      1     25.00          *            st3w	{ z0.s - z2.s }, p0, [x0]
+# CHECK-NEXT:  1      1     25.00          *            st3w	{ z21.s - z23.s }, p5, [x10, #15, mul vl]
+# CHECK-NEXT:  1      1     25.00          *            st3w	{ z23.s - z25.s }, p3, [x13, #-24, mul vl]
+# CHECK-NEXT:  1      1     25.00          *            st3w	{ z5.s - z7.s }, p3, [x17, x16, lsl #2]
+# CHECK-NEXT:  1      1     50.00          *            st4b	{ z0.b - z3.b }, p0, [x0, x0]
+# CHECK-NEXT:  1      1     50.00          *            st4b	{ z0.b - z3.b }, p0, [x0]
+# CHECK-NEXT:  1      1     50.00          *            st4b	{ z21.b - z24.b }, p5, [x10, #20, mul vl]
+# CHECK-NEXT:  1      1     50.00          *            st4b	{ z23.b - z26.b }, p3, [x13, #-32, mul vl]
+# CHECK-NEXT:  1      1     50.00          *            st4b	{ z5.b - z8.b }, p3, [x17, x16]
+# CHECK-NEXT:  1      1     25.00          *            st4d	{ z0.d - z3.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      1     25.00          *            st4d	{ z0.d - z3.d }, p0, [x0]
+# CHECK-NEXT:  1      1     25.00          *            st4d	{ z21.d - z24.d }, p5, [x10, #20, mul vl]
+# CHECK-NEXT:  1      1     25.00          *            st4d	{ z23.d - z26.d }, p3, [x13, #-32, mul vl]
+# CHECK-NEXT:  1      1     25.00          *            st4d	{ z5.d - z8.d }, p3, [x17, x16, lsl #3]
+# CHECK-NEXT:  1      1     50.00          *            st4h	{ z0.h - z3.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      1     50.00          *            st4h	{ z0.h - z3.h }, p0, [x0]
+# CHECK-NEXT:  1      1     50.00          *            st4h	{ z21.h - z24.h }, p5, [x10, #20, mul vl]
+# CHECK-NEXT:  1      1     50.00          *            st4h	{ z23.h - z26.h }, p3, [x13, #-32, mul vl]
+# CHECK-NEXT:  1      1     50.00          *            st4h	{ z5.h - z8.h }, p3, [x17, x16, lsl #1]
+# CHECK-NEXT:  1      1     50.00          *            st4w	{ z0.s - z3.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      1     50.00          *            st4w	{ z0.s - z3.s }, p0, [x0]
+# CHECK-NEXT:  1      1     50.00          *            st4w	{ z21.s - z24.s }, p5, [x10, #20, mul vl]
+# CHECK-NEXT:  1      1     50.00          *            st4w	{ z23.s - z26.s }, p3, [x13, #-32, mul vl]
+# CHECK-NEXT:  1      1     50.00          *            st4w	{ z5.s - z8.s }, p3, [x17, x16, lsl #2]
+# CHECK-NEXT:  1      1     1.00           *            stnt1b	{ z0.b }, p0, [x0, x0]
+# CHECK-NEXT:  1      1     1.00           *            stnt1b	{ z0.b }, p0, [x0]
+# CHECK-NEXT:  1      1     7.00           *            stnt1b	{ z0.d }, p0, [z1.d]
+# CHECK-NEXT:  1      1     9.00           *            stnt1b	{ z0.s }, p0, [z1.s]
+# CHECK-NEXT:  1      1     1.00           *            stnt1b	{ z21.b }, p5, [x10, #7, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            stnt1b	{ z23.b }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT:  1      1     7.00           *            stnt1b	{ z31.d }, p7, [z31.d, x0]
+# CHECK-NEXT:  1      1     7.00           *            stnt1b	{ z31.d }, p7, [z31.d]
+# CHECK-NEXT:  1      1     9.00           *            stnt1b	{ z31.s }, p7, [z31.s, x0]
+# CHECK-NEXT:  1      1     9.00           *            stnt1b	{ z31.s }, p7, [z31.s]
+# CHECK-NEXT:  1      1     1.00           *            stnt1d	{ z0.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  1      1     1.00           *            stnt1d	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  1      1     7.00           *            stnt1d	{ z0.d }, p0, [z1.d]
+# CHECK-NEXT:  1      1     1.00           *            stnt1d	{ z21.d }, p5, [x10, #7, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            stnt1d	{ z23.d }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT:  1      1     7.00           *            stnt1d	{ z31.d }, p7, [z31.d, x0]
+# CHECK-NEXT:  1      1     7.00           *            stnt1d	{ z31.d }, p7, [z31.d]
+# CHECK-NEXT:  1      1     7.00           *            stnt1h	{ z0.d }, p0, [z1.d]
+# CHECK-NEXT:  1      1     1.00           *            stnt1h	{ z0.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  1      1     1.00           *            stnt1h	{ z0.h }, p0, [x0]
+# CHECK-NEXT:  1      1     9.00           *            stnt1h	{ z0.s }, p0, [z1.s]
+# CHECK-NEXT:  1      1     1.00           *            stnt1h	{ z21.h }, p5, [x10, #7, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            stnt1h	{ z23.h }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT:  1      1     7.00           *            stnt1h	{ z31.d }, p7, [z31.d, x0]
+# CHECK-NEXT:  1      1     7.00           *            stnt1h	{ z31.d }, p7, [z31.d]
+# CHECK-NEXT:  1      1     9.00           *            stnt1h	{ z31.s }, p7, [z31.s, x0]
+# CHECK-NEXT:  1      1     9.00           *            stnt1h	{ z31.s }, p7, [z31.s]
+# CHECK-NEXT:  1      1     7.00           *            stnt1w	{ z0.d }, p0, [z1.d]
+# CHECK-NEXT:  1      1     1.00           *            stnt1w	{ z0.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  1      1     1.00           *            stnt1w	{ z0.s }, p0, [x0]
+# CHECK-NEXT:  1      1     9.00           *            stnt1w	{ z0.s }, p0, [z1.s]
+# CHECK-NEXT:  1      1     1.00           *            stnt1w	{ z21.s }, p5, [x10, #7, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            stnt1w	{ z23.s }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT:  1      1     7.00           *            stnt1w	{ z31.d }, p7, [z31.d, x0]
+# CHECK-NEXT:  1      1     7.00           *            stnt1w	{ z31.d }, p7, [z31.d]
+# CHECK-NEXT:  1      1     9.00           *            stnt1w	{ z31.s }, p7, [z31.s, x0]
+# CHECK-NEXT:  1      1     9.00           *            stnt1w	{ z31.s }, p7, [z31.s]
+# CHECK-NEXT:  1      1     1.00           *            str	p0, [x0]
+# CHECK-NEXT:  1      1     1.00           *            str	p15, [sp, #-256, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            str	p5, [x10, #255, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            str	z0, [x0]
+# CHECK-NEXT:  1      1     1.00           *            str	z21, [x10, #-256, mul vl]
+# CHECK-NEXT:  1      1     1.00           *            str	z31, [sp, #255, mul vl]
+# CHECK-NEXT:  1      3     1.00                        sub	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        sub	z0.b, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        sub	z0.b, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        sub	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        sub	z0.d, z0.d, #0
+# CHECK-NEXT:  1      3     1.00                        sub	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  1      3     1.00                        sub	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        sub	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        sub	z0.h, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        sub	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  1      3     1.00                        sub	z0.h, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        sub	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        sub	z0.s, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        sub	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  1      3     1.00                        sub	z0.s, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        sub	z21.b, p5/m, z21.b, z10.b
+# CHECK-NEXT:  1      3     1.00                        sub	z21.b, z10.b, z21.b
+# CHECK-NEXT:  1      3     1.00                        sub	z21.d, p5/m, z21.d, z10.d
+# CHECK-NEXT:  1      3     1.00                        sub	z21.d, z10.d, z21.d
+# CHECK-NEXT:  1      3     1.00                        sub	z21.h, p5/m, z21.h, z10.h
+# CHECK-NEXT:  1      3     1.00                        sub	z21.h, z10.h, z21.h
+# CHECK-NEXT:  1      3     1.00                        sub	z21.s, p5/m, z21.s, z10.s
+# CHECK-NEXT:  1      3     1.00                        sub	z21.s, z10.s, z21.s
+# CHECK-NEXT:  1      3     1.00                        sub	z23.b, p3/m, z23.b, z13.b
+# CHECK-NEXT:  1      3     1.00                        sub	z23.b, z13.b, z8.b
+# CHECK-NEXT:  1      3     1.00                        sub	z23.d, p3/m, z23.d, z13.d
+# CHECK-NEXT:  1      3     1.00                        sub	z23.d, z13.d, z8.d
+# CHECK-NEXT:  1      3     1.00                        sub	z23.h, p3/m, z23.h, z13.h
+# CHECK-NEXT:  1      3     1.00                        sub	z23.h, z13.h, z8.h
+# CHECK-NEXT:  1      3     1.00                        sub	z23.s, p3/m, z23.s, z13.s
+# CHECK-NEXT:  1      3     1.00                        sub	z23.s, z13.s, z8.s
+# CHECK-NEXT:  1      3     1.00                        sub	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        sub	z31.b, z31.b, #255
+# CHECK-NEXT:  1      3     1.00                        sub	z31.b, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        sub	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        sub	z31.d, z31.d, #65280
+# CHECK-NEXT:  1      3     1.00                        sub	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        sub	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        sub	z31.h, z31.h, #65280
+# CHECK-NEXT:  1      3     1.00                        sub	z31.h, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        sub	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        sub	z31.s, z31.s, #65280
+# CHECK-NEXT:  1      3     1.00                        sub	z31.s, z31.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        subhnb	z0.b, z1.h, z31.h
+# CHECK-NEXT:  1      8     1.00                        subhnb	z0.h, z1.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        subhnb	z0.s, z1.d, z31.d
+# CHECK-NEXT:  1      8     1.00                        subhnt	z0.b, z1.h, z31.h
+# CHECK-NEXT:  1      8     1.00                        subhnt	z0.h, z1.s, z31.s
+# CHECK-NEXT:  1      8     1.00                        subhnt	z0.s, z1.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        subr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        subr	z0.b, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        subr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        subr	z0.d, z0.d, #0
+# CHECK-NEXT:  1      3     1.00                        subr	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  1      3     1.00                        subr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        subr	z0.h, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        subr	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  1      3     1.00                        subr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        subr	z0.s, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        subr	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  1      3     1.00                        subr	z31.b, z31.b, #255
+# CHECK-NEXT:  1      3     1.00                        subr	z31.d, z31.d, #65280
+# CHECK-NEXT:  1      3     1.00                        subr	z31.h, z31.h, #65280
+# CHECK-NEXT:  1      3     1.00                        subr	z31.s, z31.s, #65280
+# CHECK-NEXT:  1      4     1.00                        sunpkhi	z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        sunpkhi	z31.h, z31.b
+# CHECK-NEXT:  1      4     1.00                        sunpkhi	z31.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        sunpklo	z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        sunpklo	z31.h, z31.b
+# CHECK-NEXT:  1      4     1.00                        sunpklo	z31.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        suqadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        suqadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        suqadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        suqadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      3     1.00                        sxtb	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      3     1.00                        sxtb	z0.h, p0/m, z0.h
+# CHECK-NEXT:  1      3     1.00                        sxtb	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      3     1.00                        sxtb	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        sxtb	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        sxtb	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        sxth	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      3     1.00                        sxth	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      3     1.00                        sxth	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        sxth	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        sxtw	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      3     1.00                        sxtw	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      4     1.00                        tbl	z28.b, { z29.b, z30.b }, z31.b
+# CHECK-NEXT:  1      4     1.00                        tbl	z28.d, { z29.d, z30.d }, z31.d
+# CHECK-NEXT:  1      4     1.00                        tbl	z28.h, { z29.h, z30.h }, z31.h
+# CHECK-NEXT:  1      4     1.00                        tbl	z28.s, { z29.s, z30.s }, z31.s
+# CHECK-NEXT:  1      4     1.00                        tbl	z31.b, { z31.b }, z31.b
+# CHECK-NEXT:  1      4     1.00                        tbl	z31.d, { z31.d }, z31.d
+# CHECK-NEXT:  1      4     1.00                        tbl	z31.h, { z31.h }, z31.h
+# CHECK-NEXT:  1      4     1.00                        tbl	z31.s, { z31.s }, z31.s
+# CHECK-NEXT:  1      8     1.00                        tbx	z31.b, z31.b, z31.b
+# CHECK-NEXT:  1      8     1.00                        tbx	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      8     1.00                        tbx	z31.h, z31.h, z31.h
+# CHECK-NEXT:  1      8     1.00                        tbx	z31.s, z31.s, z31.s
+# CHECK-NEXT:  1      2     1.00                        trn1	p15.b, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        trn1	p15.d, p15.d, p15.d
+# CHECK-NEXT:  1      2     1.00                        trn1	p15.h, p15.h, p15.h
+# CHECK-NEXT:  1      2     1.00                        trn1	p15.s, p15.s, p15.s
+# CHECK-NEXT:  1      3     1.00                        trn1	z31.b, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        trn1	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        trn1	z31.h, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        trn1	z31.s, z31.s, z31.s
+# CHECK-NEXT:  1      2     1.00                        trn2	p15.b, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        trn2	p15.d, p15.d, p15.d
+# CHECK-NEXT:  1      2     1.00                        trn2	p15.h, p15.h, p15.h
+# CHECK-NEXT:  1      2     1.00                        trn2	p15.s, p15.s, p15.s
+# CHECK-NEXT:  1      3     1.00                        trn2	z31.b, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        trn2	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        trn2	z31.h, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        trn2	z31.s, z31.s, z31.s
+# CHECK-NEXT:  1      6     2.00                        uaba	z0.b, z1.b, z31.b
+# CHECK-NEXT:  1      6     2.00                        uaba	z0.d, z1.d, z31.d
+# CHECK-NEXT:  1      6     2.00                        uaba	z0.h, z1.h, z31.h
+# CHECK-NEXT:  1      6     2.00                        uaba	z0.s, z1.s, z31.s
+# CHECK-NEXT:  1      6     2.00                        uabalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      6     2.00                        uabalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      6     2.00                        uabalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      6     2.00                        uabalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      6     2.00                        uabalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      6     2.00                        uabalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        uabd	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        uabd	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        uabd	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        uabd	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        uabdlb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      3     1.00                        uabdlb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        uabdlb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        uabdlt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      3     1.00                        uabdlt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        uabdlt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      7     2.00                        uadalp	z0.h, p0/m, z1.b
+# CHECK-NEXT:  1      7     2.00                        uadalp	z29.s, p0/m, z30.h
+# CHECK-NEXT:  1      7     2.00                        uadalp	z30.d, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        uaddlb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        uaddlb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        uaddlb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        uaddlt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        uaddlt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        uaddlt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        uaddv	d0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        uaddv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        uaddv	d0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        uaddv	d0, p7, z31.s
+# CHECK-NEXT:  1      4     1.00                        uaddwb	z0.h, z1.h, z2.b
+# CHECK-NEXT:  1      4     1.00                        uaddwb	z29.s, z30.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        uaddwb	z31.d, z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        uaddwt	z0.h, z1.h, z2.b
+# CHECK-NEXT:  1      4     1.00                        uaddwt	z29.s, z30.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        uaddwt	z31.d, z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        ucvtf	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        ucvtf	z0.d, p0/m, z0.s
+# CHECK-NEXT:  1      4     1.00                        ucvtf	z0.h, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        ucvtf	z0.h, p0/m, z0.h
+# CHECK-NEXT:  1      4     1.00                        ucvtf	z0.h, p0/m, z0.s
+# CHECK-NEXT:  1      4     1.00                        ucvtf	z0.s, p0/m, z0.d
+# CHECK-NEXT:  1      4     1.00                        ucvtf	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      26    23.00                       udiv	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      15    12.00                       udiv	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      26    23.00                       udivr	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      15    12.00                       udivr	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        udot	z0.d, z1.h, z15.h[1]
+# CHECK-NEXT:  1      4     1.00                        udot	z0.d, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        udot	z0.s, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        udot	z0.s, z1.b, z7.b[3]
+# CHECK-NEXT:  1      3     1.00                        uhadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        uhadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        uhadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        uhadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      3     1.00                        uhsub	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        uhsub	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        uhsub	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        uhsub	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      3     1.00                        uhsubr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        uhsubr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        uhsubr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        uhsubr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      3     1.00                        umax	z0.b, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        umax	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        umax	z31.b, z31.b, #255
+# CHECK-NEXT:  1      3     1.00                        umax	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        umax	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        umax	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        umaxp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        umaxp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        umaxp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        umaxp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        umaxv	b0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        umaxv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        umaxv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        umaxv	s0, p7, z31.s
+# CHECK-NEXT:  1      3     1.00                        umin	z0.b, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        umin	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        umin	z31.b, z31.b, #255
+# CHECK-NEXT:  1      3     1.00                        umin	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        umin	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        umin	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  1      3     1.00                        uminp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      3     1.00                        uminp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     1.00                        uminp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     1.00                        uminp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        uminv	b0, p7, z31.b
+# CHECK-NEXT:  1      4     1.00                        uminv	d0, p7, z31.d
+# CHECK-NEXT:  1      4     1.00                        uminv	h0, p7, z31.h
+# CHECK-NEXT:  1      4     1.00                        uminv	s0, p7, z31.s
+# CHECK-NEXT:  1      4     1.00                        umlalb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        umlalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        umlalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        umlalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        umlalb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        umlalt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        umlalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        umlalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        umlalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        umlalt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        umlslb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        umlslb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        umlslb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        umlslb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        umlslb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        umlslt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        umlslt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        umlslt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        umlslt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        umlslt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        ummla	z0.s, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        umulh	z0.b, p7/m, z0.b, z31.b
+# CHECK-NEXT:  1      4     1.00                        umulh	z0.b, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        umulh	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        umulh	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        umulh	z0.h, z1.h, z2.h
+# CHECK-NEXT:  1      4     1.00                        umulh	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        umulh	z29.s, z30.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        umulh	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      4     1.00                        umullb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        umullb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        umullb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        umullb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        umullb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        umullt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  1      4     1.00                        umullt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        umullt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  1      4     1.00                        umullt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        umullt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.b, z0.b, #0
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.b, z0.b, z0.b
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.d, z0.d, #0
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.h, z0.h, #0
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.h, z0.h, z0.h
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.s, z0.s, #0
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        uqadd	z0.s, z0.s, z0.s
+# CHECK-NEXT:  1      4     1.00                        uqadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        uqadd	z31.b, z31.b, #255
+# CHECK-NEXT:  1      4     1.00                        uqadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        uqadd	z31.d, z31.d, #65280
+# CHECK-NEXT:  1      4     1.00                        uqadd	z31.h, z31.h, #65280
+# CHECK-NEXT:  1      4     1.00                        uqadd	z31.s, z31.s, #65280
+# CHECK-NEXT:  1      5     1.00                        uqdecb	w0
+# CHECK-NEXT:  1      5     1.00                        uqdecb	w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdecb	w0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqdecb	w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdecb	x0
+# CHECK-NEXT:  1      5     1.00                        uqdecb	x0, #14
+# CHECK-NEXT:  1      5     1.00                        uqdecb	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdecb	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqdecb	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        uqdecd	w0
+# CHECK-NEXT:  1      5     1.00                        uqdecd	w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdecd	w0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqdecd	w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdecd	x0
+# CHECK-NEXT:  1      5     1.00                        uqdecd	x0, #14
+# CHECK-NEXT:  1      5     1.00                        uqdecd	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdecd	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqdecd	x0, vl1
+# CHECK-NEXT:  1      4     1.00                        uqdecd	z0.d
+# CHECK-NEXT:  1      4     1.00                        uqdecd	z0.d, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        uqdecd	z0.d, pow2
+# CHECK-NEXT:  1      4     1.00                        uqdecd	z0.d, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdech	w0
+# CHECK-NEXT:  1      5     1.00                        uqdech	w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdech	w0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqdech	w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdech	x0
+# CHECK-NEXT:  1      5     1.00                        uqdech	x0, #14
+# CHECK-NEXT:  1      5     1.00                        uqdech	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdech	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqdech	x0, vl1
+# CHECK-NEXT:  1      4     1.00                        uqdech	z0.h
+# CHECK-NEXT:  1      4     1.00                        uqdech	z0.h, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        uqdech	z0.h, pow2
+# CHECK-NEXT:  1      4     1.00                        uqdech	z0.h, pow2, mul #16
+# CHECK-NEXT:  1      9     1.00                        uqdecp	wzr, p15.b
+# CHECK-NEXT:  1      9     1.00                        uqdecp	wzr, p15.d
+# CHECK-NEXT:  1      9     1.00                        uqdecp	wzr, p15.h
+# CHECK-NEXT:  1      9     1.00                        uqdecp	wzr, p15.s
+# CHECK-NEXT:  1      9     1.00                        uqdecp	x0, p0.b
+# CHECK-NEXT:  1      9     1.00                        uqdecp	x0, p0.d
+# CHECK-NEXT:  1      9     1.00                        uqdecp	x0, p0.h
+# CHECK-NEXT:  1      9     1.00                        uqdecp	x0, p0.s
+# CHECK-NEXT:  1      3     1.00                        uqdecp	z0.d, p0.d
+# CHECK-NEXT:  1      3     1.00                        uqdecp	z0.h, p0.h
+# CHECK-NEXT:  1      3     1.00                        uqdecp	z0.s, p0.s
+# CHECK-NEXT:  1      5     1.00                        uqdecw	w0
+# CHECK-NEXT:  1      5     1.00                        uqdecw	w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdecw	w0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqdecw	w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdecw	x0
+# CHECK-NEXT:  1      5     1.00                        uqdecw	x0, #14
+# CHECK-NEXT:  1      5     1.00                        uqdecw	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqdecw	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqdecw	x0, vl1
+# CHECK-NEXT:  1      4     1.00                        uqdecw	z0.s
+# CHECK-NEXT:  1      4     1.00                        uqdecw	z0.s, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        uqdecw	z0.s, pow2
+# CHECK-NEXT:  1      4     1.00                        uqdecw	z0.s, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincb	w0
+# CHECK-NEXT:  1      5     1.00                        uqincb	w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincb	w0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqincb	w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincb	x0
+# CHECK-NEXT:  1      5     1.00                        uqincb	x0, #14
+# CHECK-NEXT:  1      5     1.00                        uqincb	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincb	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqincb	x0, vl1
+# CHECK-NEXT:  1      5     1.00                        uqincd	w0
+# CHECK-NEXT:  1      5     1.00                        uqincd	w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincd	w0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqincd	w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincd	x0
+# CHECK-NEXT:  1      5     1.00                        uqincd	x0, #14
+# CHECK-NEXT:  1      5     1.00                        uqincd	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincd	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqincd	x0, vl1
+# CHECK-NEXT:  1      4     1.00                        uqincd	z0.d
+# CHECK-NEXT:  1      4     1.00                        uqincd	z0.d, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        uqincd	z0.d, pow2
+# CHECK-NEXT:  1      4     1.00                        uqincd	z0.d, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqinch	w0
+# CHECK-NEXT:  1      5     1.00                        uqinch	w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqinch	w0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqinch	w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqinch	x0
+# CHECK-NEXT:  1      5     1.00                        uqinch	x0, #14
+# CHECK-NEXT:  1      5     1.00                        uqinch	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqinch	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqinch	x0, vl1
+# CHECK-NEXT:  1      4     1.00                        uqinch	z0.h
+# CHECK-NEXT:  1      4     1.00                        uqinch	z0.h, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        uqinch	z0.h, pow2
+# CHECK-NEXT:  1      4     1.00                        uqinch	z0.h, pow2, mul #16
+# CHECK-NEXT:  1      9     1.00                        uqincp	wzr, p15.b
+# CHECK-NEXT:  1      9     1.00                        uqincp	wzr, p15.d
+# CHECK-NEXT:  1      9     1.00                        uqincp	wzr, p15.h
+# CHECK-NEXT:  1      9     1.00                        uqincp	wzr, p15.s
+# CHECK-NEXT:  1      9     1.00                        uqincp	x0, p0.b
+# CHECK-NEXT:  1      9     1.00                        uqincp	x0, p0.d
+# CHECK-NEXT:  1      9     1.00                        uqincp	x0, p0.h
+# CHECK-NEXT:  1      9     1.00                        uqincp	x0, p0.s
+# CHECK-NEXT:  1      3     1.00                        uqincp	z0.d, p0.d
+# CHECK-NEXT:  1      3     1.00                        uqincp	z0.h, p0.h
+# CHECK-NEXT:  1      3     1.00                        uqincp	z0.s, p0.s
+# CHECK-NEXT:  1      5     1.00                        uqincw	w0
+# CHECK-NEXT:  1      5     1.00                        uqincw	w0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincw	w0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqincw	w0, pow2, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincw	x0
+# CHECK-NEXT:  1      5     1.00                        uqincw	x0, #14
+# CHECK-NEXT:  1      5     1.00                        uqincw	x0, all, mul #16
+# CHECK-NEXT:  1      5     1.00                        uqincw	x0, pow2
+# CHECK-NEXT:  1      5     1.00                        uqincw	x0, vl1
+# CHECK-NEXT:  1      4     1.00                        uqincw	z0.s
+# CHECK-NEXT:  1      4     1.00                        uqincw	z0.s, all, mul #16
+# CHECK-NEXT:  1      4     1.00                        uqincw	z0.s, pow2
+# CHECK-NEXT:  1      4     1.00                        uqincw	z0.s, pow2, mul #16
+# CHECK-NEXT:  1      4     1.00                        uqrshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        uqrshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        uqrshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        uqrshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        uqrshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        uqrshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        uqrshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        uqrshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        uqrshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        uqrshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        uqrshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        uqrshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        uqrshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        uqrshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        uqrshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        uqrshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        uqrshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        uqrshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        uqrshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        uqrshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        uqshl	z0.b, p0/m, z0.b, #0
+# CHECK-NEXT:  1      4     1.00                        uqshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        uqshl	z0.d, p0/m, z0.d, #0
+# CHECK-NEXT:  1      4     1.00                        uqshl	z0.h, p0/m, z0.h, #0
+# CHECK-NEXT:  1      4     1.00                        uqshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        uqshl	z0.s, p0/m, z0.s, #0
+# CHECK-NEXT:  1      4     1.00                        uqshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        uqshl	z31.b, p0/m, z31.b, #7
+# CHECK-NEXT:  1      4     1.00                        uqshl	z31.d, p0/m, z31.d, #63
+# CHECK-NEXT:  1      4     1.00                        uqshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        uqshl	z31.h, p0/m, z31.h, #15
+# CHECK-NEXT:  1      4     1.00                        uqshl	z31.s, p0/m, z31.s, #31
+# CHECK-NEXT:  1      4     1.00                        uqshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        uqshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        uqshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        uqshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        uqshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        uqshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        uqshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        uqshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        uqshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        uqshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        uqshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        uqshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        uqshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        uqshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  1      4     1.00                        uqshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  1      4     1.00                        uqshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.b, z0.b, #0
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.b, z0.b, z0.b
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.d, z0.d, #0
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.h, z0.h, #0
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.h, z0.h, z0.h
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.s, z0.s, #0
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  1      4     1.00                        uqsub	z0.s, z0.s, z0.s
+# CHECK-NEXT:  1      4     1.00                        uqsub	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        uqsub	z31.b, z31.b, #255
+# CHECK-NEXT:  1      4     1.00                        uqsub	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        uqsub	z31.d, z31.d, #65280
+# CHECK-NEXT:  1      4     1.00                        uqsub	z31.h, z31.h, #65280
+# CHECK-NEXT:  1      4     1.00                        uqsub	z31.s, z31.s, #65280
+# CHECK-NEXT:  1      4     1.00                        uqsubr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        uqsubr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        uqsubr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        uqsubr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        uqxtnb	z0.b, z31.h
+# CHECK-NEXT:  1      4     1.00                        uqxtnb	z0.h, z31.s
+# CHECK-NEXT:  1      4     1.00                        uqxtnb	z0.s, z31.d
+# CHECK-NEXT:  1      4     1.00                        uqxtnt	z0.b, z31.h
+# CHECK-NEXT:  1      4     1.00                        uqxtnt	z0.h, z31.s
+# CHECK-NEXT:  1      4     1.00                        uqxtnt	z0.s, z31.d
+# CHECK-NEXT:  1      4     1.00                        urecpe	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      4     1.00                        urhadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        urhadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        urhadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        urhadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        urshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        urshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        urshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        urshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        urshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        urshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        urshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        urshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        urshr	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  1      4     1.00                        urshr	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        urshr	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        urshr	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        urshr	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  1      4     1.00                        urshr	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  1      4     1.00                        urshr	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  1      4     1.00                        urshr	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  1      4     1.00                        ursqrte	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      7     2.00                        ursra	z0.b, z0.b, #1
+# CHECK-NEXT:  1      7     2.00                        ursra	z0.d, z0.d, #1
+# CHECK-NEXT:  1      7     2.00                        ursra	z0.h, z0.h, #1
+# CHECK-NEXT:  1      7     2.00                        ursra	z0.s, z0.s, #1
+# CHECK-NEXT:  1      7     2.00                        ursra	z31.b, z31.b, #8
+# CHECK-NEXT:  1      7     2.00                        ursra	z31.d, z31.d, #64
+# CHECK-NEXT:  1      7     2.00                        ursra	z31.h, z31.h, #16
+# CHECK-NEXT:  1      7     2.00                        ursra	z31.s, z31.s, #32
+# CHECK-NEXT:  1      3     1.00                        ushllb	z0.d, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        ushllb	z0.h, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        ushllb	z0.s, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        ushllb	z31.d, z31.s, #31
+# CHECK-NEXT:  1      3     1.00                        ushllb	z31.h, z31.b, #7
+# CHECK-NEXT:  1      3     1.00                        ushllb	z31.s, z31.h, #15
+# CHECK-NEXT:  1      3     1.00                        ushllt	z0.d, z0.s, #0
+# CHECK-NEXT:  1      3     1.00                        ushllt	z0.h, z0.b, #0
+# CHECK-NEXT:  1      3     1.00                        ushllt	z0.s, z0.h, #0
+# CHECK-NEXT:  1      3     1.00                        ushllt	z31.d, z31.s, #31
+# CHECK-NEXT:  1      3     1.00                        ushllt	z31.h, z31.b, #7
+# CHECK-NEXT:  1      3     1.00                        ushllt	z31.s, z31.h, #15
+# CHECK-NEXT:  1      4     1.00                        usmmla	z0.s, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        usqadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  1      4     1.00                        usqadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      4     1.00                        usqadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  1      4     1.00                        usqadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      4     1.00                        usra	z0.b, z0.b, #1
+# CHECK-NEXT:  1      4     1.00                        usra	z0.d, z0.d, #1
+# CHECK-NEXT:  1      4     1.00                        usra	z0.h, z0.h, #1
+# CHECK-NEXT:  1      4     1.00                        usra	z0.s, z0.s, #1
+# CHECK-NEXT:  1      4     1.00                        usra	z31.b, z31.b, #8
+# CHECK-NEXT:  1      4     1.00                        usra	z31.d, z31.d, #64
+# CHECK-NEXT:  1      4     1.00                        usra	z31.h, z31.h, #16
+# CHECK-NEXT:  1      4     1.00                        usra	z31.s, z31.s, #32
+# CHECK-NEXT:  1      4     1.00                        usublb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        usublb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        usublb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        usublt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  1      4     1.00                        usublt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  1      4     1.00                        usublt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  1      4     1.00                        usubwb	z0.h, z1.h, z2.b
+# CHECK-NEXT:  1      4     1.00                        usubwb	z29.s, z30.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        usubwb	z31.d, z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        usubwt	z0.h, z1.h, z2.b
+# CHECK-NEXT:  1      4     1.00                        usubwt	z29.s, z30.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        usubwt	z31.d, z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        uunpkhi	z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        uunpkhi	z31.h, z31.b
+# CHECK-NEXT:  1      4     1.00                        uunpkhi	z31.s, z31.h
+# CHECK-NEXT:  1      4     1.00                        uunpklo	z31.d, z31.s
+# CHECK-NEXT:  1      4     1.00                        uunpklo	z31.h, z31.b
+# CHECK-NEXT:  1      4     1.00                        uunpklo	z31.s, z31.h
+# CHECK-NEXT:  1      3     1.00                        uxtb	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      3     1.00                        uxtb	z0.h, p0/m, z0.h
+# CHECK-NEXT:  1      3     1.00                        uxtb	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      3     1.00                        uxtb	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        uxtb	z31.h, p7/m, z31.h
+# CHECK-NEXT:  1      3     1.00                        uxtb	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        uxth	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      3     1.00                        uxth	z0.s, p0/m, z0.s
+# CHECK-NEXT:  1      3     1.00                        uxth	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      3     1.00                        uxth	z31.s, p7/m, z31.s
+# CHECK-NEXT:  1      3     1.00                        uxtw	z0.d, p0/m, z0.d
+# CHECK-NEXT:  1      3     1.00                        uxtw	z31.d, p7/m, z31.d
+# CHECK-NEXT:  1      2     1.00                        uzp1	p15.b, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        uzp1	p15.d, p15.d, p15.d
+# CHECK-NEXT:  1      2     1.00                        uzp1	p15.h, p15.h, p15.h
+# CHECK-NEXT:  1      2     1.00                        uzp1	p15.s, p15.s, p15.s
+# CHECK-NEXT:  1      3     1.00                        uzp1	z31.b, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        uzp1	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        uzp1	z31.h, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        uzp1	z31.s, z31.s, z31.s
+# CHECK-NEXT:  1      2     1.00                        uzp2	p15.b, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        uzp2	p15.d, p15.d, p15.d
+# CHECK-NEXT:  1      2     1.00                        uzp2	p15.h, p15.h, p15.h
+# CHECK-NEXT:  1      2     1.00                        uzp2	p15.s, p15.s, p15.s
+# CHECK-NEXT:  1      3     1.00                        uzp2	z31.b, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        uzp2	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        uzp2	z31.h, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        uzp2	z31.s, z31.s, z31.s
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.b, w0, wzr
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.b, wzr, w0
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.b, x0, xzr
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.b, xzr, x0
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.d, w0, wzr
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.d, x0, xzr
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.h, w0, wzr
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.h, x0, xzr
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.s, w0, wzr
+# CHECK-NEXT:  1      2     1.00                        whilege	p15.s, x0, xzr
+# CHECK-NEXT:  1      2     1.00                        whilerw	p15.b, x30, x30
+# CHECK-NEXT:  1      2     1.00                        whilerw	p15.d, x30, x30
+# CHECK-NEXT:  1      2     1.00                        whilerw	p15.h, x30, x30
+# CHECK-NEXT:  1      2     1.00                        whilerw	p15.s, x30, x30
+# CHECK-NEXT:  1      2     1.00                        whilewr	p15.b, x30, x30
+# CHECK-NEXT:  1      2     1.00                        whilewr	p15.d, x30, x30
+# CHECK-NEXT:  1      2     1.00                        whilewr	p15.h, x30, x30
+# CHECK-NEXT:  1      2     1.00                        whilewr	p15.s, x30, x30
+# CHECK-NEXT:  1      1     1.00           *      U     wrffr	p0.b
+# CHECK-NEXT:  1      1     1.00           *      U     wrffr	p15.b
+# CHECK-NEXT:  1      4     1.00                        xar	z0.b, z0.b, z1.b, #1
+# CHECK-NEXT:  1      4     1.00                        xar	z0.d, z0.d, z1.d, #1
+# CHECK-NEXT:  1      4     1.00                        xar	z0.h, z0.h, z1.h, #1
+# CHECK-NEXT:  1      4     1.00                        xar	z0.s, z0.s, z1.s, #1
+# CHECK-NEXT:  1      4     1.00                        xar	z31.b, z31.b, z30.b, #8
+# CHECK-NEXT:  1      4     1.00                        xar	z31.d, z31.d, z30.d, #64
+# CHECK-NEXT:  1      4     1.00                        xar	z31.h, z31.h, z30.h, #16
+# CHECK-NEXT:  1      4     1.00                        xar	z31.s, z31.s, z30.s, #32
+# CHECK-NEXT:  1      2     1.00                        zip1	p0.b, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        zip1	p0.d, p0.d, p0.d
+# CHECK-NEXT:  1      2     1.00                        zip1	p0.h, p0.h, p0.h
+# CHECK-NEXT:  1      2     1.00                        zip1	p0.s, p0.s, p0.s
+# CHECK-NEXT:  1      2     1.00                        zip1	p15.b, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        zip1	p15.d, p15.d, p15.d
+# CHECK-NEXT:  1      2     1.00                        zip1	p15.h, p15.h, p15.h
+# CHECK-NEXT:  1      2     1.00                        zip1	p15.s, p15.s, p15.s
+# CHECK-NEXT:  1      3     1.00                        zip1	z0.b, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        zip1	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        zip1	z0.h, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        zip1	z0.s, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        zip1	z31.b, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        zip1	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        zip1	z31.h, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        zip1	z31.s, z31.s, z31.s
+# CHECK-NEXT:  1      2     1.00                        zip2	p0.b, p0.b, p0.b
+# CHECK-NEXT:  1      2     1.00                        zip2	p0.d, p0.d, p0.d
+# CHECK-NEXT:  1      2     1.00                        zip2	p0.h, p0.h, p0.h
+# CHECK-NEXT:  1      2     1.00                        zip2	p0.s, p0.s, p0.s
+# CHECK-NEXT:  1      2     1.00                        zip2	p15.b, p15.b, p15.b
+# CHECK-NEXT:  1      2     1.00                        zip2	p15.d, p15.d, p15.d
+# CHECK-NEXT:  1      2     1.00                        zip2	p15.h, p15.h, p15.h
+# CHECK-NEXT:  1      2     1.00                        zip2	p15.s, p15.s, p15.s
+# CHECK-NEXT:  1      3     1.00                        zip2	z0.b, z0.b, z0.b
+# CHECK-NEXT:  1      3     1.00                        zip2	z0.d, z0.d, z0.d
+# CHECK-NEXT:  1      3     1.00                        zip2	z0.h, z0.h, z0.h
+# CHECK-NEXT:  1      3     1.00                        zip2	z0.s, z0.s, z0.s
+# CHECK-NEXT:  1      3     1.00                        zip2	z31.b, z31.b, z31.b
+# CHECK-NEXT:  1      3     1.00                        zip2	z31.d, z31.d, z31.d
+# CHECK-NEXT:  1      3     1.00                        zip2	z31.h, z31.h, z31.h
+# CHECK-NEXT:  1      3     1.00                        zip2	z31.s, z31.s, z31.s
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - CortexA320UnitALU
+# CHECK-NEXT: [1]   - CortexA320UnitB
+# CHECK-NEXT: [2]   - CortexA320UnitDiv
+# CHECK-NEXT: [3]   - CortexA320UnitLdSt
+# CHECK-NEXT: [4]   - CortexA320UnitMAC
+# CHECK-NEXT: [5]   - CortexA320UnitPAC
+# CHECK-NEXT: [6]   - CortexA320UnitVALU
+# CHECK-NEXT: [7]   - CortexA320UnitVMAC
+# CHECK-NEXT: [8]   - CortexA320UnitVMC
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT: 229.00 9.00    -     3876.00  -     -     2214.00 399.00 655.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	z0.b, p0/m, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	z0.h, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	z31.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     abs	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adclb	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adclb	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adclt	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adclt	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.b, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.h, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.s, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z0.s, z1.s, z2.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z21.b, p5/m, z21.b, z10.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z21.b, z10.b, z21.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z21.d, p5/m, z21.d, z10.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z21.d, z10.d, z21.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z21.h, p5/m, z21.h, z10.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z21.h, z10.h, z21.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z21.s, p5/m, z21.s, z10.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z21.s, z10.s, z21.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z23.b, p3/m, z23.b, z13.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z23.b, z13.b, z8.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z23.d, p3/m, z23.d, z13.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z23.d, z13.d, z8.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z23.h, p3/m, z23.h, z13.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z23.h, z13.h, z8.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z23.s, p3/m, z23.s, z13.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z23.s, z13.s, z8.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.b, z31.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.b, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.d, z31.d, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.h, z31.h, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.h, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.s, z31.s, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     add	z31.s, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhnb	z0.b, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhnb	z0.h, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhnb	z0.s, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhnt	z0.b, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhnt	z0.h, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addhnt	z0.s, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     addp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     addpl	sp, sp, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     addpl	x0, x0, #-32
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     addpl	x21, x21, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     addpl	x23, x8, #-1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     addvl	sp, sp, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     addvl	x0, x0, #-32
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     addvl	x21, x21, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     addvl	x23, x8, #-1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, lsl #1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, lsl #2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, lsl #3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, sxtw #1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, sxtw #2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, sxtw #3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, sxtw]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, uxtw #1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, uxtw #2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, uxtw #3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d, uxtw]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.d, [z0.d, z0.d]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.s, [z0.s, z0.s, lsl #1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.s, [z0.s, z0.s, lsl #2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.s, [z0.s, z0.s, lsl #3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     adr	z0.s, [z0.s, z0.s]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     aesd	z0.b, z0.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     aese	z0.b, z0.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     aesimc	z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     aesimc	z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     aesmc	z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     aesmc	z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z0.d, z0.d, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z0.d, z0.d, #0xfffffffffffffff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z0.s, z0.s, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z0.s, z0.s, #0xfffffff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z23.d, z13.d, z8.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z23.h, z23.h, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z23.h, z23.h, #0xfff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z5.b, z5.b, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     and	z5.b, z5.b, #0xf9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ands	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     andv	b0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     andv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     andv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     andv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.b, p0/m, z0.b, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.b, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.b, z1.b, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.d, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.h, p0/m, z0.h, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.h, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.h, z1.h, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.s, p0/m, z0.s, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.s, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z0.s, z1.s, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z31.b, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z31.d, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z31.h, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asr	z31.s, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrd	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrd	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrd	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrd	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrd	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrd	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrd	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrd	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     asrr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bcax	z29.d, z29.d, z30.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     12.00  bdep	z0.b, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     67.00  bdep	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     20.00  bdep	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     36.00  bdep	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     12.00  bext	z0.b, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     67.00  bext	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     20.00  bext	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     36.00  bext	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bfcvt	z0.h, p0/m, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bfcvtnt	z0.h, p0/m, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   1.00    -     bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   1.00    -     bfdot	z0.s, z1.h, z2.h[0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   1.00    -     bfdot	z0.s, z1.h, z2.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalb	z0.s, z1.h, z2.h[0]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalb	z0.s, z1.h, z2.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalb	z10.s, z21.h, z14.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalb	z21.s, z14.h, z3.h[2]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalt	z0.s, z1.h, z2.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalt	z0.s, z1.h, z2.h[0]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalt	z0.s, z1.h, z2.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     bfmlalt	z14.s, z10.h, z21.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   1.00    -     bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     12.00  bgrp	z0.b, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     67.00  bgrp	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     20.00  bgrp	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     36.00  bgrp	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	z23.d, z13.d, z8.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bic	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bics	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bics	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brka	p0.b, p15/m, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brka	p0.b, p15/z, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkas	p0.b, p15/z, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkb	p0.b, p15/m, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkb	p0.b, p15/z, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkbs	p0.b, p15/z, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkn	p0.b, p15/z, p1.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkn	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkns	p0.b, p15/z, p1.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkns	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkpa	p0.b, p15/z, p1.b, p2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkpa	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkpas	p0.b, p15/z, p1.b, p2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkpas	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkpb	p0.b, p15/z, p1.b, p2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkpb	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkpbs	p0.b, p15/z, p1.b, p2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     brkpbs	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bsl	z0.d, z0.d, z1.d, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bsl1n	z0.d, z0.d, z1.d, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     bsl2n	z0.d, z0.d, z1.d, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cadd	z0.b, z0.b, z0.b, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cadd	z0.d, z0.d, z0.d, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cadd	z0.h, z0.h, z0.h, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cadd	z0.s, z0.s, z0.s, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cadd	z31.b, z31.b, z31.b, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cadd	z31.d, z31.d, z31.d, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cadd	z31.h, z31.h, z31.h, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cadd	z31.s, z31.s, z31.s, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z0.d, z1.h, z15.h[1], #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z0.d, z1.h, z31.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z0.d, z1.h, z31.h, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z0.d, z1.h, z31.h, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z0.d, z1.h, z31.h, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z0.s, z1.b, z31.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z0.s, z1.b, z7.b[3], #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z29.d, z30.h, z0.h[0], #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z31.d, z30.h, z7.h[1], #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cdot	z5.d, z6.h, z3.h[0], #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clasta	b0, p7, b0, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clasta	d0, p7, d0, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clasta	h0, p7, h0, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clasta	s0, p7, s0, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     clasta	w0, p7, w0, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     clasta	w0, p7, w0, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     clasta	w0, p7, w0, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     clasta	x0, p7, x0, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clasta	z0.b, p7, z0.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clasta	z0.d, p7, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clasta	z0.h, p7, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clasta	z0.s, p7, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clastb	b0, p7, b0, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clastb	d0, p7, d0, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clastb	h0, p7, h0, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clastb	s0, p7, s0, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     clastb	w0, p7, w0, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     clastb	w0, p7, w0, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     clastb	w0, p7, w0, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     clastb	x0, p7, x0, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clastb	z0.b, p7, z0.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clastb	z0.d, p7, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clastb	z0.h, p7, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clastb	z0.s, p7, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	z31.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cls	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	z31.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     clz	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z0.b, z1.b, z2.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z0.d, z1.d, z2.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z0.h, z1.h, z2.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z0.h, z1.h, z2.h[0], #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z0.s, z1.s, z2.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z0.s, z1.s, z2.s[0], #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z15.b, z16.b, z17.b, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z15.d, z16.d, z17.d, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z15.h, z16.h, z17.h, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z15.s, z16.s, z17.s, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z29.b, z30.b, z31.b, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z29.d, z30.d, z31.d, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z29.h, z30.h, z31.h, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z29.s, z30.s, z31.s, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z31.b, z31.b, z31.b, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z31.d, z31.d, z31.d, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z31.h, z30.h, z7.h[0], #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z31.h, z31.h, z31.h, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z31.s, z30.s, z7.s[0], #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     cmla	z31.s, z31.s, z31.s, #180
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpeq	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.b, p0/z, z1.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpge	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.b, p0/z, z1.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpgt	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.b, p0/z, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.b, p0/z, z0.b, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.b, p0/z, z1.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.d, p0/z, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.d, p0/z, z0.d, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.h, p0/z, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.h, p0/z, z0.h, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.s, p0/z, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.s, p0/z, z0.s, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphi	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.b, p0/z, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.b, p0/z, z0.b, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.b, p0/z, z1.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.d, p0/z, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.d, p0/z, z0.d, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.h, p0/z, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.h, p0/z, z0.h, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.s, p0/z, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.s, p0/z, z0.s, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmphs	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmple	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.b, p0/z, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.b, p0/z, z0.b, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.d, p0/z, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.d, p0/z, z0.d, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.h, p0/z, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.h, p0/z, z0.h, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.s, p0/z, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.s, p0/z, z0.s, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplo	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.b, p0/z, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.b, p0/z, z0.b, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.d, p0/z, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.d, p0/z, z0.d, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.h, p0/z, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.h, p0/z, z0.h, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.s, p0/z, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.s, p0/z, z0.s, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpls	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmplt	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.b, p0/z, z0.b, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.b, p0/z, z0.b, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.b, p0/z, z0.b, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.d, p0/z, z0.d, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.d, p0/z, z0.d, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.d, p0/z, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.h, p0/z, z0.h, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.h, p0/z, z0.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.h, p0/z, z0.h, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.s, p0/z, z0.s, #-16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.s, p0/z, z0.s, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.s, p0/z, z0.s, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cmpne	p0.s, p0/z, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnot	z31.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnot	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnot	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnot	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnt	z31.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnt	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnt	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cnt	z31.s, p7/m, z31.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntb	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntb	x0, #28
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntb	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntb	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntd	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntd	x0, #28
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntd	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntd	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cnth	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cnth	x0, #28
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cnth	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cnth	x0, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cntp	x0, p15, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cntp	x0, p15, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cntp	x0, p15, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     cntp	x0, p15, p0.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntw	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntw	x0, #28
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntw	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     cntw	x0, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     compact	z31.d, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     compact	z31.s, p7, z31.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ctermeq	w30, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ctermeq	wzr, w30
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ctermeq	x30, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ctermeq	xzr, x30
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ctermne	w30, wzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ctermne	wzr, w30
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ctermne	x30, xzr
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     ctermne	xzr, x30
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decb	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decb	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decb	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decb	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decb	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decd	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decd	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decd	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decd	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decd	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     dech	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     dech	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     dech	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     dech	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     dech	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	x0, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	x0, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	x0, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	x0, p0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	xzr, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	xzr, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	xzr, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	xzr, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	z31.d, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	z31.h, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     decp	z31.s, p15.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decw	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decw	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decw	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decw	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     decw	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dupm	z0.d, #0xfffffffffffffff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dupm	z0.s, #0xfffffff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dupm	z23.h, #0xfff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     dupm	z5.b, #0xf9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z0.d, z0.d, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z0.d, z0.d, #0xfffffffffffffff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z0.s, z0.s, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z0.s, z0.s, #0xfffffff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z23.d, z13.d, z8.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z23.h, z23.h, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z23.h, z23.h, #0xfff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z5.b, z5.b, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor	z5.b, z5.b, #0xf9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eor3	z29.d, z29.d, z30.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eorbt	z0.b, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eorbt	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eorbt	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eorbt	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eors	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eortb	z0.b, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eortb	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eortb	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eortb	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eorv	b0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eorv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eorv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     eorv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ext	z0.b, { z1.b, z2.b }, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ext	z31.b, z31.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ext	z31.b, z31.b, z0.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ext	z31.b, { z30.b, z31.b }, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabd	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabd	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabd	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fabs	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facge	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facge	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facge	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facge	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facge	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facge	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facgt	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facgt	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facgt	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facgt	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facgt	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     facgt	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z0.d, p0/m, z0.d, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z0.h, p0/m, z0.h, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z0.s, p0/m, z0.s, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fadd	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     5.00    -      -     fadda	d0, p7, d0, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     29.00   -      -     fadda	h0, p7, h0, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     13.00   -      -     fadda	s0, p7, s0, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     faddp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     faddp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     faddp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     faddv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     11.00   -      -     faddv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     5.00    -      -     faddv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcadd	z0.d, p0/m, z0.d, z0.d, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcadd	z0.h, p0/m, z0.h, z0.h, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcadd	z0.s, p0/m, z0.s, z0.s, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcadd	z31.d, p7/m, z31.d, z31.d, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcadd	z31.h, p7/m, z31.h, z31.h, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcadd	z31.s, p7/m, z31.s, z31.s, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmeq	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmge	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	p0.d, p0/z, z1.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	p0.h, p0/z, z1.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmgt	p0.s, p0/z, z1.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z0.d, p0/m, z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z0.h, p0/m, z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z0.h, p0/m, z1.h, z2.h, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z0.h, z0.h, z0.h[0], #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z0.s, p0/m, z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z0.s, p0/m, z1.s, z2.s, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z21.s, z10.s, z5.s[1], #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z23.s, z13.s, z8.s[0], #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z29.d, p7/m, z30.d, z31.d, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z29.h, p7/m, z30.h, z31.h, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z29.s, p7/m, z30.s, z31.s, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z31.d, p7/m, z31.d, z31.d, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z31.h, p7/m, z31.h, z31.h, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z31.h, z31.h, z7.h[3], #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fcmla	z31.s, p7/m, z31.s, z31.s, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmle	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmle	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmle	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmlt	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmlt	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmlt	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmne	p0.d, p0/z, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmne	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmne	p0.h, p0/z, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmne	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmne	p0.s, p0/z, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmne	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmuo	p0.d, p0/z, z0.d, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmuo	p0.h, p0/z, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcmuo	p0.s, p0/z, z0.s, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	z0.d, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	z0.d, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	z0.h, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	z0.h, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	z0.s, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvt	z0.s, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtlt	z0.s, p0/m, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtlt	z30.d, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnt	z0.h, p0/m, z1.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtnt	z30.s, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtx	z0.s, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtx	z30.s, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtxnt	z0.s, p0/m, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtxnt	z30.s, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	z0.d, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	z0.d, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	z0.h, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	z0.s, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	z0.s, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzs	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	z0.d, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	z0.d, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	z0.h, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	z0.s, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	z0.s, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fcvtzu	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  fdiv	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     5.00   fdiv	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.00  fdiv	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  fdivr	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     5.00   fdivr	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.00  fdivr	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fexpa	z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fexpa	z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fexpa	z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     flogb	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     flogb	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     flogb	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmad	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmad	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmad	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	z0.d, p0/m, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	z0.h, p0/m, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	z0.s, p0/m, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmax	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	z0.d, p0/m, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	z0.h, p0/m, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	z0.s, p0/m, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnm	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnmp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnmp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnmp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnmv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnmv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxnmv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmaxv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	z0.d, p0/m, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	z0.h, p0/m, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	z0.s, p0/m, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmin	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	z0.d, p0/m, z0.d, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	z0.h, p0/m, z0.h, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	z0.s, p0/m, z0.s, #0.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnm	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnmp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnmp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnmp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnmv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnmv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminnmv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fminv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmla	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmla	z0.d, z1.d, z7.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmla	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmla	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmla	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmla	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlalb	z0.s, z1.h, z7.h[0]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlalb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlalb	z30.s, z31.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlalt	z0.s, z1.h, z7.h[0]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlalt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlalt	z30.s, z31.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmls	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmls	z0.d, z1.d, z7.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmls	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmls	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmls	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmls	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlslb	z0.s, z1.h, z7.h[0]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlslb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlslb	z30.s, z31.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlslt	z0.s, z1.h, z7.h[0]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlslt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmlslt	z30.s, z31.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	z0.d, #-10.00000000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	z0.d, #0.12500000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	z0.d, p0/m, #-10.00000000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	z0.d, p0/m, #0.12500000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	z0.h, #-0.12500000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	z0.h, p0/m, #-0.12500000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	z0.s, #-0.12500000
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fmov	z0.s, p0/m, #-0.12500000
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmsb	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmsb	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmsb	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.d, p0/m, z0.d, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.d, z0.d, z0.d[0]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.h, p0/m, z0.h, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.h, z0.h, z0.h[0]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.s, p0/m, z0.s, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.s, z0.s, z0.s[0]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z31.d, p7/m, z31.d, #2.0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z31.d, z31.d, z15.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z31.h, p7/m, z31.h, #2.0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z31.h, z31.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z31.s, p7/m, z31.s, #2.0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmul	z31.s, z31.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmulx	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmulx	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fmulx	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fneg	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmad	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmad	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmad	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmla	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmla	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmla	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmls	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmls	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmls	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmsb	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmsb	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fnmsb	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frecpe	z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frecpe	z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frecpe	z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frecps	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frecps	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frecps	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frecpx	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frecpx	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frecpx	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinta	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frinti	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintm	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintn	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintp	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintx	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     frintz	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frsqrte	z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frsqrte	z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frsqrte	z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frsqrts	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frsqrts	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     frsqrts	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fscale	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fscale	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     fscale	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  fsqrt	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     5.00   fsqrt	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.00   fsqrt	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z0.d, p0/m, z0.d, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z0.h, p0/m, z0.h, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z0.s, p0/m, z0.s, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsub	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsubr	z0.d, p0/m, z0.d, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsubr	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsubr	z0.h, p0/m, z0.h, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsubr	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsubr	z0.s, p0/m, z0.s, #0.5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsubr	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsubr	z31.d, p7/m, z31.d, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsubr	z31.h, p7/m, z31.h, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     fsubr	z31.s, p7/m, z31.s, #1.0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     ftmad	z0.d, z0.d, z31.d, #7
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     ftmad	z0.h, z0.h, z31.h, #7
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     ftmad	z0.s, z0.s, z31.s, #7
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     ftsmul	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     ftsmul	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     ftsmul	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ftssel	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ftssel	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ftssel	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     histcnt	z0.s, p0/z, z1.s, z2.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     histcnt	z29.d, p7/z, z30.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     histseg	z0.b, z1.b, z31.b
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incb	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incb	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incb	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incb	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incb	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incd	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incd	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incd	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incd	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incd	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incd	z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incd	z0.d, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     inch	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     inch	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     inch	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     inch	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     inch	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     inch	z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     inch	z0.h, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	x0, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	x0, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	x0, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	x0, p0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	xzr, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	xzr, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	xzr, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	xzr, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	z31.d, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	z31.h, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incp	z31.s, p15.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incw	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incw	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incw	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incw	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     incw	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incw	z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     incw	z0.s, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z0.b, #0, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z0.d, #0, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z0.h, #0, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z0.h, w0, w0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z0.s, #0, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z21.b, w10, w21
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z21.d, x10, x21
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z21.s, w10, w21
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z23.b, #13, w8
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z23.b, w13, #8
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z23.d, #13, x8
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z23.d, x13, #8
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z23.h, #13, w8
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z23.h, w13, #8
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z23.s, #13, w8
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z23.s, w13, #8
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.b, #-1, #-1
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.b, #-1, wzr
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.b, wzr, #-1
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.b, wzr, wzr
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.d, #-1, #-1
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.d, #-1, xzr
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.d, xzr, #-1
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.d, xzr, xzr
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.h, #-1, #-1
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.h, #-1, wzr
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.h, wzr, #-1
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.h, wzr, wzr
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.s, #-1, #-1
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.s, #-1, wzr
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.s, wzr, #-1
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     index	z31.s, wzr, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     insr	z0.b, w0
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     insr	z0.d, x0
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     insr	z0.h, w0
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     insr	z0.s, w0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     insr	z31.b, b31
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     insr	z31.b, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     insr	z31.d, d31
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     insr	z31.d, xzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     insr	z31.h, h31
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     insr	z31.h, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     insr	z31.s, s31
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     insr	z31.s, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lasta	b0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lasta	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lasta	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lasta	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     lasta	w0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     lasta	w0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     lasta	w0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     lasta	x0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lastb	b0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lastb	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lastb	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lastb	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     lastb	w0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     lastb	w0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     lastb	w0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     lastb	x0, p7, z31.d
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z0.b }, p0/z, [sp, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1b	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1b	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1b	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1b	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z21.b }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1b	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1b	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z21.s }, p5/z, [x10, x21]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z23.d }, p3/z, [x13, x8]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z31.b }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1b	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1b	{ z31.d }, p7/z, [z31.d, #31]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1b	{ z31.s }, p7/z, [z31.s, #31]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1b	{ z5.h }, p3/z, [x17, x16]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1d	{ z0.d }, p0/z, [x0, z0.d, sxtw #3]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1d	{ z0.d }, p0/z, [x0, z0.d, uxtw #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1d	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1d	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1d	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1d	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1d	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1d	{ z23.d }, p3/z, [sp, x8, lsl #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1d	{ z23.d }, p3/z, [x13, x8, lsl #3]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1d	{ z23.d }, p3/z, [x13, z8.d, lsl #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1d	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1d	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1d	{ z31.d }, p7/z, [z31.d, #248]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1h	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z21.s }, p5/z, [x10, x21, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z23.d }, p3/z, [x13, x8, lsl #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z31.d }, p7/z, [z31.d, #62]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1h	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1h	{ z31.s }, p7/z, [z31.s, #62]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z5.h }, p3/z, [sp, x16, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1h	{ z5.h }, p3/z, [x17, x16, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rb	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rb	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rb	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rb	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rb	{ z31.b }, p7/z, [sp, #63]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rb	{ z31.d }, p7/z, [sp, #63]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rb	{ z31.h }, p7/z, [sp, #63]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rb	{ z31.s }, p7/z, [sp, #63]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rd	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rd	{ z31.d }, p7/z, [sp, #504]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rh	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rh	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rh	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rh	{ z31.d }, p7/z, [sp, #126]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rh	{ z31.h }, p7/z, [sp, #126]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rh	{ z31.s }, p7/z, [sp, #126]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqb	{ z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqb	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqb	{ z21.b }, p5/z, [x10, #112]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqb	{ z23.b }, p3/z, [x13, #-128]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqb	{ z31.b }, p7/z, [sp, #-16]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqd	{ z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqd	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqd	{ z23.d }, p3/z, [x13, #-128]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqd	{ z23.d }, p3/z, [x13, #112]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqd	{ z31.d }, p7/z, [sp, #-16]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqh	{ z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqh	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqh	{ z23.h }, p3/z, [x13, #-128]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqh	{ z23.h }, p3/z, [x13, #112]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqh	{ z31.h }, p7/z, [sp, #-16]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqw	{ z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqw	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqw	{ z23.s }, p3/z, [x13, #-128]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqw	{ z23.s }, p3/z, [x13, #112]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rqw	{ z31.s }, p7/z, [sp, #-16]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsb	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsb	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsb	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsb	{ z31.d }, p7/z, [sp, #63]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsb	{ z31.h }, p7/z, [sp, #63]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsb	{ z31.s }, p7/z, [sp, #63]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsh	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsh	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsh	{ z31.d }, p7/z, [sp, #126]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsh	{ z31.s }, p7/z, [sp, #126]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsw	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rsw	{ z31.d }, p7/z, [sp, #252]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rw	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rw	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rw	{ z31.d }, p7/z, [sp, #252]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1rw	{ z31.s }, p7/z, [sp, #252]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sb	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z0.h }, p0/z, [sp, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z0.h }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sb	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1sb	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sb	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sb	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z21.s }, p5/z, [x10, x21]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z23.d }, p3/z, [x13, x8]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sb	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sb	{ z31.d }, p7/z, [z31.d, #31]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sb	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1sb	{ z31.s }, p7/z, [z31.s, #31]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sh	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sh	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1sh	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sh	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sh	{ z21.s }, p5/z, [sp, x21, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sh	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sh	{ z21.s }, p5/z, [x10, x21, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sh	{ z23.d }, p3/z, [x13, x8, lsl #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sh	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z31.d }, p7/z, [z31.d, #62]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sh	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sh	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1sh	{ z31.s }, p7/z, [z31.s, #62]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sw	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sw	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sw	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sw	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sw	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sw	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sw	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sw	{ z23.d }, p3/z, [sp, x8, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sw	{ z23.d }, p3/z, [x13, x8, lsl #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sw	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1sw	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sw	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1sw	{ z31.d }, p7/z, [z31.d, #124]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1w	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1w	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1w	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1w	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1w	{ z21.s }, p5/z, [sp, x21, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1w	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1w	{ z21.s }, p5/z, [x10, x21, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1w	{ z23.d }, p3/z, [x13, x8, lsl #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1w	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z31.d }, p7/z, [z31.d, #124]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld1w	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z31.s }, p7/z, [sp, z31.s, sxtw #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ld1w	{ z31.s }, p7/z, [sp, z31.s, uxtw #2]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ld1w	{ z31.s }, p7/z, [z31.s, #124]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2b	{ z0.b, z1.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2b	{ z0.b, z1.b }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2b	{ z21.b, z22.b }, p5/z, [x10, #10, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2b	{ z23.b, z24.b }, p3/z, [x13, #-16, mul vl]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2b	{ z5.b, z6.b }, p3/z, [x17, x16]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2d	{ z0.d, z1.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2d	{ z0.d, z1.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2d	{ z21.d, z22.d }, p5/z, [x10, #10, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2d	{ z23.d, z24.d }, p3/z, [x13, #-16, mul vl]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2d	{ z5.d, z6.d }, p3/z, [x17, x16, lsl #3]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2h	{ z0.h, z1.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2h	{ z0.h, z1.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2h	{ z21.h, z22.h }, p5/z, [x10, #10, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2h	{ z23.h, z24.h }, p3/z, [x13, #-16, mul vl]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2h	{ z5.h, z6.h }, p3/z, [x17, x16, lsl #1]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2w	{ z0.s, z1.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2w	{ z0.s, z1.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2w	{ z21.s, z22.s }, p5/z, [x10, #10, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ld2w	{ z23.s, z24.s }, p3/z, [x13, #-16, mul vl]
+# CHECK-NEXT:  -      -      -     2.00    -      -      -      -      -     ld2w	{ z5.s, z6.s }, p3/z, [x17, x16, lsl #2]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3b	{ z0.b - z2.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3b	{ z0.b - z2.b }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3b	{ z21.b - z23.b }, p5/z, [x10, #15, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3b	{ z23.b - z25.b }, p3/z, [x13, #-24, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3b	{ z5.b - z7.b }, p3/z, [x17, x16]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3d	{ z0.d - z2.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3d	{ z0.d - z2.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3d	{ z21.d - z23.d }, p5/z, [x10, #15, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3d	{ z23.d - z25.d }, p3/z, [x13, #-24, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3d	{ z5.d - z7.d }, p3/z, [x17, x16, lsl #3]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3h	{ z0.h - z2.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3h	{ z0.h - z2.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3h	{ z21.h - z23.h }, p5/z, [x10, #15, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3h	{ z23.h - z25.h }, p3/z, [x13, #-24, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3h	{ z5.h - z7.h }, p3/z, [x17, x16, lsl #1]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3w	{ z0.s - z2.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3w	{ z0.s - z2.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3w	{ z21.s - z23.s }, p5/z, [x10, #15, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3w	{ z23.s - z25.s }, p3/z, [x13, #-24, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld3w	{ z5.s - z7.s }, p3/z, [x17, x16, lsl #2]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4b	{ z0.b - z3.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4b	{ z0.b - z3.b }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4b	{ z21.b - z24.b }, p5/z, [x10, #20, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4b	{ z23.b - z26.b }, p3/z, [x13, #-32, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4b	{ z5.b - z8.b }, p3/z, [x17, x16]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4d	{ z0.d - z3.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4d	{ z0.d - z3.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4d	{ z21.d - z24.d }, p5/z, [x10, #20, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4d	{ z23.d - z26.d }, p3/z, [x13, #-32, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4d	{ z5.d - z8.d }, p3/z, [x17, x16, lsl #3]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4h	{ z0.h - z3.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4h	{ z0.h - z3.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4h	{ z21.h - z24.h }, p5/z, [x10, #20, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4h	{ z23.h - z26.h }, p3/z, [x13, #-32, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4h	{ z5.h - z8.h }, p3/z, [x17, x16, lsl #1]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4w	{ z0.s - z3.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4w	{ z0.s - z3.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4w	{ z21.s - z24.s }, p5/z, [x10, #20, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4w	{ z23.s - z26.s }, p3/z, [x13, #-32, mul vl]
+# CHECK-NEXT:  -      -      -     3.00    -      -      -      -      -     ld4w	{ z5.s - z8.s }, p3/z, [x17, x16, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1b	{ z0.d }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1b	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1b	{ z0.h }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1b	{ z0.s }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1b	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1b	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1b	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1b	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1b	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1b	{ z31.b }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1b	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1b	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1b	{ z31.d }, p7/z, [z31.d, #31]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1b	{ z31.h }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1b	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1b	{ z31.s }, p7/z, [z31.s, #31]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1d	{ z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1d	{ z0.d }, p0/z, [x0, z0.d, sxtw #3]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1d	{ z0.d }, p0/z, [x0, z0.d, uxtw #3]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1d	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1d	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1d	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1d	{ z23.d }, p3/z, [x13, z8.d, lsl #3]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1d	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1d	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1d	{ z31.d }, p7/z, [z31.d, #248]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1h	{ z0.d }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1h	{ z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1h	{ z0.s }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1h	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1h	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z31.d }, p7/z, [z31.d, #62]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1h	{ z31.h }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1h	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1h	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1h	{ z31.s }, p7/z, [z31.s, #62]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sb	{ z0.d }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sb	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sb	{ z0.h }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sb	{ z0.s }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sb	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sb	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1sb	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sb	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sb	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sb	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sb	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sb	{ z31.d }, p7/z, [z31.d, #31]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sb	{ z31.h }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sb	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1sb	{ z31.s }, p7/z, [z31.s, #31]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sh	{ z0.d }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z0.d }, p0/z, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z0.d }, p0/z, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sh	{ z0.s }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1sh	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z23.d }, p3/z, [x13, z8.d, lsl #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sh	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z31.d }, p7/z, [z31.d, #62]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sh	{ z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sh	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1sh	{ z31.s }, p7/z, [z31.s, #62]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sw	{ z0.d }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sw	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sw	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sw	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sw	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sw	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sw	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sw	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1sw	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1sw	{ z31.d }, p7/z, [z31.d, #124]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1w	{ z0.d }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z0.d }, p0/z, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z0.d }, p0/z, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z0.d }, p0/z, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1w	{ z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1w	{ z0.s }, p0/z, [z0.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z21.d }, p5/z, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z21.d }, p5/z, [x10, z21.d, uxtw]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z23.d }, p3/z, [x13, z8.d, lsl #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z31.d }, p7/z, [sp, z31.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1w	{ z31.d }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z31.d }, p7/z, [z31.d, #124]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z31.s }, p7/z, [sp, z31.s, sxtw #2]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldff1w	{ z31.s }, p7/z, [sp, z31.s, uxtw #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldff1w	{ z31.s }, p7/z, [sp]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldff1w	{ z31.s }, p7/z, [z31.s, #124]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z21.b }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z31.b }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1b	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1d	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1d	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1d	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1h	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1h	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1h	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1h	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1h	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1h	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1h	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1h	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1h	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sb	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sb	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sb	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sb	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sb	{ z21.h }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sb	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sb	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sb	{ z31.h }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sb	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sh	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sh	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sh	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sh	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sh	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sh	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sw	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sw	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1sw	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1w	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1w	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1w	{ z21.d }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1w	{ z21.s }, p5/z, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1w	{ z31.d }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnf1w	{ z31.s }, p7/z, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1b	{ z0.b }, p0/z, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1b	{ z0.b }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1b	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1b	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1b	{ z21.b }, p5/z, [x10, #7, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1b	{ z23.b }, p3/z, [x13, #-8, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1b	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1b	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1b	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1b	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1d	{ z0.d }, p0/z, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1d	{ z0.d }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1d	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1d	{ z21.d }, p5/z, [x10, #7, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1d	{ z23.d }, p3/z, [x13, #-8, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1d	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1d	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1h	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1h	{ z0.h }, p0/z, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1h	{ z0.h }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1h	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1h	{ z21.h }, p5/z, [x10, #7, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1h	{ z23.h }, p3/z, [x13, #-8, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1h	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1h	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1h	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1h	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1sb	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1sb	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1sb	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1sb	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1sb	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1sb	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1sh	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1sh	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1sh	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1sh	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1sh	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1sh	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1sw	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1sw	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1sw	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1w	{ z0.d }, p0/z, [z1.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1w	{ z0.s }, p0/z, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1w	{ z0.s }, p0/z, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1w	{ z0.s }, p0/z, [z1.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1w	{ z21.s }, p5/z, [x10, #7, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldnt1w	{ z23.s }, p3/z, [x13, #-8, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1w	{ z31.d }, p7/z, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     ldnt1w	{ z31.d }, p7/z, [z31.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1w	{ z31.s }, p7/z, [z31.s, x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     ldnt1w	{ z31.s }, p7/z, [z31.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	p0, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	p5, [x10, #255, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	p7, [x13, #-256, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	z0, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	z23, [x13, #255, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     ldr	z31, [sp, #-256, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.b, p0/m, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.b, p0/m, z0.b, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.b, z1.b, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.d, p0/m, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.h, p0/m, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.h, p0/m, z0.h, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.h, z1.h, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.s, p0/m, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.s, p0/m, z0.s, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z0.s, z1.s, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z31.b, p0/m, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z31.b, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z31.d, p0/m, z31.d, #63
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z31.d, z31.d, #63
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z31.h, p0/m, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z31.h, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z31.s, p0/m, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsl	z31.s, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lslr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lslr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lslr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lslr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.b, p0/m, z0.b, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.b, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.b, z1.b, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.d, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.h, p0/m, z0.h, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.h, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.h, z1.h, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.s, p0/m, z0.s, z1.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.s, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z0.s, z1.s, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z31.b, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z31.d, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z31.h, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsr	z31.s, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsrr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsrr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsrr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     lsrr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mad	z0.b, p7/m, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mad	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mad	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mad	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     match	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     match	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     match	p15.b, p7/z, z30.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     match	p15.h, p7/z, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mla	z0.b, p7/m, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mla	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mla	z0.d, z1.d, z7.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mla	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mla	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mla	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mla	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mls	z0.b, p7/m, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mls	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mls	z0.d, z1.d, z7.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mls	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mls	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mls	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mls	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	p0.b, p0/m, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	p0.b, p0/z, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	p15.b, p15/m, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	p15.b, p15/z, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.b, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.b, b0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.b, p0/m, b0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.b, p0/m, w0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.b, p0/z, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.b, w0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.d, #0xe0000000000003ff
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.d, #0xffffffffffff7fff
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.d, #32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.d, d0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.d, p0/m, d0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.d, p0/m, x0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.d, x0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, #-256
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, #32512
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, #32767
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, h0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, p0/m, h0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, p0/m, w0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, p0/z, #32512
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.h, w0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.q, q0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.s, #0xffff7fff
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.s, #32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.s, p0/m, s0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.s, p0/m, w0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.s, s0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z0.s, w0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, #32512
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, p0/z, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, p0/z, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, p0/z, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, p0/z, #32512
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, p15/m, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.d, p15/m, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, #32512
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, p0/z, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, p0/z, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, p0/z, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, p0/z, #32512
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, p15/m, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.h, p15/m, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, #32512
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, p0/z, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, p0/z, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, p0/z, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, p0/z, #32512
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, p15/m, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z21.s, p15/m, #-32768
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.b, p15/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.b, p7/m, b31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movprfx	z31, z6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.b, p7/m, wsp
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.b, wsp
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.b, z31.b[63]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.d, p15/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.d, p7/m, d31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movprfx	z31.d, p7/z, z6.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.d, p7/m, sp
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.d, sp
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.d, z31.d[7]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.h, p15/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.h, p7/m, h31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.h, p7/m, wsp
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.h, wsp
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.h, z31.h[31]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.s, p15/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.s, p7/m, s31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.s, p7/m, wsp
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.s, wsp
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z31.s, z31.s[15]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.b, #-1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.b, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.b, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.b, p0/z, #-1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.b, p0/z, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.b, p0/z, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.b, p15/m, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.d, #-6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.h, #-6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.q, z17.q[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     mov	z5.s, #-6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movs	p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movs	p0.b, p0/z, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movs	p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     movs	p15.b, p15/z, p15.b
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     mrs	x3, ID_AA64ZFR0_EL1
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     mrs	x3, ZCR_EL1
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     mrs	x3, ZCR_EL12
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     mrs	x3, ZCR_EL2
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     mrs	x3, ZCR_EL3
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     msb	z0.b, p7/m, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     msb	z0.d, p7/m, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     msb	z0.h, p7/m, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     msb	z0.s, p7/m, z1.s, z31.s
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     msr	ZCR_EL1, x3
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     msr	ZCR_EL12, x3
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     msr	ZCR_EL2, x3
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     msr	ZCR_EL3, x3
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z0.b, p7/m, z0.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z0.b, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z0.h, z1.h, z2.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z29.s, z30.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z31.b, z31.b, #-128
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z31.b, z31.b, #127
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z31.d, z31.d, #-128
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z31.d, z31.d, #127
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z31.h, z31.h, #-128
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z31.h, z31.h, #127
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z31.s, z31.s, #-128
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     mul	z31.s, z31.s, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nand	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nand	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nands	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nands	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nbsl	z0.d, z0.d, z1.d, z2.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	z0.b, p0/m, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	z0.h, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	z31.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     neg	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     nmatch	p0.b, p0/z, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     nmatch	p0.h, p0/z, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     nmatch	p15.b, p7/z, z30.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     nmatch	p15.h, p7/z, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nor	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nor	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nors	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nors	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     not	p0.b, p0/z, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     not	p15.b, p15/z, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     not	z31.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     not	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     not	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     not	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nots	p0.b, p0/z, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     nots	p15.b, p15/z, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orn	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orn	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orns	p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orns	p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z0.d, z0.d, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z0.d, z0.d, #0xfffffffffffffff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z0.s, z0.s, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z0.s, z0.s, #0xfffffff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z23.d, z13.d, z8.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z23.h, z23.h, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z23.h, z23.h, #0xfff9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z5.b, z5.b, #0x6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orr	z5.b, z5.b, #0xf9
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orrs	p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orv	b0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     orv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pfalse	p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pfirst	p0.b, p15, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pfirst	p15.b, p15, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pmul	z0.b, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pmul	z29.b, z30.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   pmullb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   pmullb	z29.q, z30.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   pmullb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   pmullt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   pmullt	z29.q, z30.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   pmullt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pnext	p0.b, p15, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pnext	p0.d, p15, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pnext	p0.h, p15, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pnext	p0.s, p15, p0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     pnext	p15.b, p15, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	#14, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	#15, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	#6, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	#7, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	#7, p3, [z13.s, #31]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	#7, p3, [z13.s]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl1keep, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl1keep, p0, [x0, z0.d]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl1keep, p0, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl1keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl1strm, p0, [x0, #-32, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl1strm, p0, [x0, #31, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl1strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl2keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl2strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl3keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl3strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl3strm, p5, [x10, z21.d, sxtw]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl3strm, p5, [x10, z21.s, uxtw]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl3strm, p5, [z10.d, #31]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pldl3strm, p5, [z10.d]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pstl1keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pstl1strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pstl2keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pstl2strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pstl3keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfb	pstl3strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	#14, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	#15, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	#15, p7, [z31.d, #248]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	#15, p7, [z31.d]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	#15, p7, [z31.s, #248]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	#15, p7, [z31.s]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	#6, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	#7, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl1keep, p0, [x0, z0.d, lsl #3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl1keep, p0, [x0, z0.d, sxtw #3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl1keep, p0, [x0, z0.d, uxtw #3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl1keep, p0, [x0, z0.s, sxtw #3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl1keep, p0, [x0, z0.s, uxtw #3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl1keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl1strm, p0, [x0, #-32, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl1strm, p0, [x0, #31, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl1strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl2keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl2strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl3keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pldl3strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pstl1keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pstl1strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pstl2keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pstl2strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pstl3keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfd	pstl3strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	#14, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	#15, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	#15, p7, [z31.d, #62]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	#15, p7, [z31.d]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	#15, p7, [z31.s, #62]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	#15, p7, [z31.s]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	#6, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	#7, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl1keep, p0, [x0, z0.d, lsl #1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl1keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl1strm, p0, [x0, #-32, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl1strm, p0, [x0, #31, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl1strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl2keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl2strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl3keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl3strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl3strm, p5, [x10, z21.d, sxtw #1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl3strm, p5, [x10, z21.d, uxtw #1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl3strm, p5, [x10, z21.s, sxtw #1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pldl3strm, p5, [x10, z21.s, uxtw #1]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pstl1keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pstl1strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pstl2keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pstl2strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pstl3keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfh	pstl3strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	#14, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	#15, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	#15, p7, [z31.d, #124]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	#15, p7, [z31.d]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	#15, p7, [z31.s, #124]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	#15, p7, [z31.s]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	#6, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	#7, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	#7, p3, [x13, z8.d, uxtw #2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl1keep, p0, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl1keep, p0, [x0, z0.s, uxtw #2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl1keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl1strm, p0, [x0, #-32, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl1strm, p0, [x0, #31, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl1strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl2keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl2strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl3keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl3strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl3strm, p5, [x10, z21.d, lsl #2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pldl3strm, p5, [x10, z21.s, sxtw #2]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pstl1keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pstl1strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pstl2keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pstl2strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pstl3keep, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     prfw	pstl3strm, p0, [x0]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptest	p15, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptest	p15, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p0.b, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p0.d, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p0.h, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p0.s, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #17
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #19
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #21
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #23
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #24
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #25
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #26
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #27
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, #28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, mul3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, mul4
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl256
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl4
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrue	p7.s, vl8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p0.b, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p0.d, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p0.h, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p0.s, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #14
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #17
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #18
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #19
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #20
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #21
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #22
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #23
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #24
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #25
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #26
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #27
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, #28
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, mul3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, mul4
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl256
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl3
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl4
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl5
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl6
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ptrues	p7.s, vl8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     punpkhi	p0.h, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     punpkhi	p15.h, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     punpklo	p0.h, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     punpklo	p15.h, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhnb	z0.b, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhnb	z0.h, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhnb	z0.s, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhnt	z0.b, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhnt	z0.h, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     raddhnt	z0.s, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   rax1	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rbit	z0.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rbit	z0.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rbit	z0.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rbit	z0.s, p7/m, z31.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdffr	p0.b
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdffr	p0.b, p0/z
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdffr	p15.b
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdffr	p15.b, p15/z
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdffrs	p0.b, p0/z
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdffrs	p15.b, p15/z
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdvl	x0, #0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdvl	x21, #-32
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdvl	x23, #31
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     rdvl	xzr, #-1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev	z0.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev	z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev	z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rev	z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     revb	z0.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     revb	z0.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     revb	z0.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     revh	z0.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     revh	z0.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     revw	z0.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhnb	z0.b, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhnb	z0.h, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhnb	z0.s, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhnt	z0.b, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhnt	z0.h, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     rsubhnt	z0.s, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     saba	z0.b, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     saba	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     saba	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     saba	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sabalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sabalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sabalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sabalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sabalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sabalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabd	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabd	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabd	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabd	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdlb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdlb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdlb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdlt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdlt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sabdlt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sadalp	z0.h, p0/m, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sadalp	z29.s, p0/m, z30.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     sadalp	z30.d, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlbt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlbt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlbt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddlt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddv	d0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddv	d0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddv	d0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddwb	z0.h, z1.h, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddwb	z29.s, z30.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddwb	z31.d, z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddwt	z0.h, z1.h, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddwt	z29.s, z30.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     saddwt	z31.d, z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sbclb	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sbclb	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sbclt	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sbclt	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	z0.d, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	z0.h, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	z0.h, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	z0.h, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	z0.s, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     scvtf	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     23.00  sdiv	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     12.00  sdiv	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     23.00  sdivr	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     12.00  sdivr	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sdot	z0.d, z1.h, z15.h[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sdot	z0.d, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sdot	z0.s, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sdot	z0.s, z1.b, z7.b[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sel	z23.b, p11, z13.b, z8.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sel	z23.d, p11, z13.d, z8.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sel	z23.h, p11, z13.h, z8.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sel	z23.s, p11, z13.s, z8.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     setffr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsub	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsub	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsub	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsub	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsubr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsubr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsubr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     shsubr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	z31.b, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	z31.d, z31.d, #63
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	z31.h, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sli	z31.s, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   sm4e	z0.s, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   sm4ekey	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z0.b, z0.b, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z0.d, z0.d, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z0.h, z0.h, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z0.s, z0.s, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z31.b, z31.b, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z31.d, z31.d, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z31.h, z31.h, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smax	z31.s, z31.s, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxv	b0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smaxv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z0.b, z0.b, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z0.d, z0.d, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z0.h, z0.h, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z0.s, z0.s, #-128
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z31.b, z31.b, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z31.d, z31.d, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z31.h, z31.h, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     smin	z31.s, z31.s, #127
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminv	b0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sminv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlalt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smlslt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smulh	z0.b, p7/m, z0.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smulh	z0.b, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smulh	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smulh	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smulh	z0.h, z1.h, z2.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smulh	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smulh	z29.s, z30.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smulh	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     smullt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     splice	z29.b, p7, { z30.b, z31.b }
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     splice	z29.d, p7, { z30.d, z31.d }
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     splice	z29.h, p7, { z30.h, z31.h }
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     splice	z29.s, p7, { z30.s, z31.s }
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     splice	z31.b, p7, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     splice	z31.d, p7, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     splice	z31.h, p7, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     splice	z31.s, p7, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	z31.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqabs	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.b, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.h, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z0.s, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z31.b, z31.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z31.d, z31.d, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z31.h, z31.h, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqadd	z31.s, z31.s, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqcadd	z0.b, z0.b, z0.b, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqcadd	z0.d, z0.d, z0.d, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqcadd	z0.h, z0.h, z0.h, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqcadd	z0.s, z0.s, z0.s, #90
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqcadd	z31.b, z31.b, z31.b, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqcadd	z31.d, z31.d, z31.d, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqcadd	z31.h, z31.h, z31.h, #270
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqcadd	z31.s, z31.s, z31.s, #270
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecb	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecb	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecb	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecb	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecb	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecb	x0, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecb	x0, w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecb	x0, w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecb	x0, w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecd	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecd	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecd	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecd	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecd	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecd	x0, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecd	x0, w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecd	x0, w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecd	x0, w0, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecd	z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecd	z0.d, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecd	z0.d, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecd	z0.d, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdech	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdech	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdech	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdech	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdech	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdech	x0, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdech	x0, w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdech	x0, w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdech	x0, w0, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdech	z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdech	z0.h, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdech	z0.h, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdech	z0.h, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	x0, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	x0, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	x0, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	x0, p0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	xzr, p15.b, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	xzr, p15.d, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	xzr, p15.h, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	xzr, p15.s, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	z0.d, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	z0.h, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecp	z0.s, p0.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecw	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecw	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecw	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecw	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecw	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecw	x0, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecw	x0, w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecw	x0, w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqdecw	x0, w0, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecw	z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecw	z0.s, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecw	z0.s, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqdecw	z0.s, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalb	z0.d, z1.s, z15.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalbt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalbt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalbt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalt	z0.d, z1.s, z15.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlalt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslb	z0.d, z1.s, z15.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslbt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslbt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslbt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslt	z0.d, z1.s, z15.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmlslt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmulh	z0.b, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmulh	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmulh	z0.h, z1.h, z2.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmulh	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmulh	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmulh	z29.s, z30.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmulh	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqdmullt	z31.d, z31.s, z31.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincb	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincb	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincb	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincb	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincb	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincb	x0, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincb	x0, w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincb	x0, w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincb	x0, w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincd	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincd	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincd	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincd	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincd	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincd	x0, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincd	x0, w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincd	x0, w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincd	x0, w0, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincd	z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincd	z0.d, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincd	z0.d, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincd	z0.d, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqinch	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqinch	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqinch	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqinch	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqinch	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqinch	x0, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqinch	x0, w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqinch	x0, w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqinch	x0, w0, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqinch	z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqinch	z0.h, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqinch	z0.h, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqinch	z0.h, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	x0, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	x0, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	x0, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	x0, p0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	xzr, p15.b, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	xzr, p15.d, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	xzr, p15.h, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	xzr, p15.s, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	z0.d, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	z0.h, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincp	z0.s, p0.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincw	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincw	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincw	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincw	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincw	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincw	x0, w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincw	x0, w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincw	x0, w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     sqincw	x0, w0, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincw	z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincw	z0.s, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincw	z0.s, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqincw	z0.s, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	z31.b, p7/m, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqneg	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z0.b, z1.b, z2.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z0.d, z1.d, z2.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z0.h, z1.h, z2.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z0.h, z1.h, z2.h[0], #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z0.s, z1.s, z2.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z0.s, z1.s, z2.s[0], #0
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z15.b, z16.b, z17.b, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z15.d, z16.d, z17.d, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z15.h, z16.h, z17.h, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z15.s, z16.s, z17.s, #270
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z29.b, z30.b, z31.b, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z29.d, z30.d, z31.d, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z29.h, z30.h, z31.h, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z29.s, z30.s, z31.s, #90
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z31.b, z31.b, z31.b, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z31.d, z31.d, z31.d, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z31.h, z30.h, z7.h[0], #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z31.h, z31.h, z31.h, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z31.s, z30.s, z7.s[0], #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdcmlah	z31.s, z31.s, z31.s, #180
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlah	z0.b, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlah	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlah	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlah	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlah	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlah	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlah	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlsh	z0.b, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlsh	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlsh	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlsh	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlsh	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlsh	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmlsh	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmulh	z0.b, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmulh	z0.d, z1.d, z15.d[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmulh	z0.h, z1.h, z2.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmulh	z0.h, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmulh	z0.s, z1.s, z7.s[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmulh	z29.s, z30.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     sqrdmulh	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunb	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunb	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunb	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunb	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunb	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunb	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunt	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunt	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunt	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunt	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunt	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqrshrunt	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z0.b, p0/m, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z0.d, p0/m, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z0.h, p0/m, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z0.s, p0/m, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z31.b, p0/m, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z31.d, p0/m, z31.d, #63
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z31.h, p0/m, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshl	z31.s, p0/m, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	z0.b, p0/m, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	z0.d, p0/m, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	z0.h, p0/m, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	z0.s, p0/m, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	z31.b, p0/m, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	z31.d, p0/m, z31.d, #63
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	z31.h, p0/m, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshlu	z31.s, p0/m, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunb	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunb	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunb	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunb	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunb	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunb	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunt	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunt	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunt	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunt	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunt	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqshrunt	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.b, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.h, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z0.s, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z31.b, z31.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z31.d, z31.d, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z31.h, z31.h, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsub	z31.s, z31.s, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsubr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsubr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsubr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqsubr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtnb	z0.b, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtnb	z0.h, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtnb	z0.s, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtnt	z0.b, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtnt	z0.h, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtnt	z0.s, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtunb	z0.b, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtunb	z0.h, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtunb	z0.s, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtunt	z0.b, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtunt	z0.h, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sqxtunt	z0.s, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srhadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srhadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srhadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srhadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	z0.b, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	z0.d, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	z0.h, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	z0.s, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	z31.b, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	z31.d, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	z31.h, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sri	z31.s, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     srshr	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	z0.b, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	z0.d, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	z0.h, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	z0.s, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	z31.b, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	z31.d, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	z31.h, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     srsra	z31.s, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllb	z0.d, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllb	z0.h, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllb	z0.s, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllb	z31.d, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllb	z31.h, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllb	z31.s, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllt	z0.d, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllt	z0.h, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllt	z0.s, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllt	z31.d, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllt	z31.h, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sshllt	z31.s, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	z0.b, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	z0.d, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	z0.h, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	z0.s, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	z31.b, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	z31.d, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	z31.h, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssra	z31.s, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssublb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssublb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssublb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssublbt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssublbt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssublbt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssublt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssublt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssublt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubltb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubltb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubltb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubwb	z0.h, z1.h, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubwb	z29.s, z30.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubwb	z31.d, z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubwt	z0.h, z1.h, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubwt	z29.s, z30.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ssubwt	z31.d, z31.d, z31.s
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z0.b }, p0, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z0.b }, p0, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z0.d }, p0, [x0, x0]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1b	{ z0.d }, p0, [x0, z0.d, sxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1b	{ z0.d }, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1b	{ z0.d }, p0, [x0, z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     st1b	{ z0.d }, p7, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z0.h }, p0, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z0.h }, p0, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z0.s }, p0, [x0, x0]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1b	{ z0.s }, p0, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1b	{ z0.s }, p0, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z0.s }, p0, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     st1b	{ z0.s }, p7, [z0.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z21.b }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z21.d }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z21.h }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z21.s }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z31.b }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z31.d }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     st1b	{ z31.d }, p7, [z31.d, #31]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z31.h }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1b	{ z31.s }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     st1b	{ z31.s }, p7, [z31.s, #31]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1d	{ z0.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1d	{ z0.d }, p0, [x0, z0.d, lsl #3]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1d	{ z0.d }, p0, [x0, z0.d, sxtw #3]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1d	{ z0.d }, p0, [x0, z0.d, sxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1d	{ z0.d }, p0, [x0, z0.d, uxtw #3]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1d	{ z0.d }, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1d	{ z0.d }, p0, [x0, z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1d	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     st1d	{ z0.d }, p7, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1d	{ z21.d }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1d	{ z31.d }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     st1d	{ z31.d }, p7, [z31.d, #248]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z0.d }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.d }, p0, [x0, z0.d, lsl #1]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.d }, p0, [x0, z0.d, sxtw #1]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.d }, p0, [x0, z0.d, sxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.d }, p0, [x0, z0.d, uxtw #1]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.d }, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.d }, p0, [x0, z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     st1h	{ z0.d }, p7, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z0.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z0.h }, p0, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z0.s }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.s }, p0, [x0, z0.s, sxtw #1]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.s }, p0, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.s }, p0, [x0, z0.s, uxtw #1]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1h	{ z0.s }, p0, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z0.s }, p0, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     st1h	{ z0.s }, p7, [z0.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z21.d }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z21.h }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z21.s }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z31.d }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     st1h	{ z31.d }, p7, [z31.d, #62]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z31.h }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1h	{ z31.s }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     st1h	{ z31.s }, p7, [z31.s, #62]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1w	{ z0.d }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.d }, p0, [x0, z0.d, lsl #2]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.d }, p0, [x0, z0.d, sxtw #2]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.d }, p0, [x0, z0.d, sxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.d }, p0, [x0, z0.d, uxtw #2]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.d }, p0, [x0, z0.d, uxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.d }, p0, [x0, z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1w	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     st1w	{ z0.d }, p7, [z0.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1w	{ z0.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.s }, p0, [x0, z0.s, sxtw #2]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.s }, p0, [x0, z0.s, sxtw]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.s }, p0, [x0, z0.s, uxtw #2]
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -     st1w	{ z0.s }, p0, [x0, z0.s, uxtw]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1w	{ z0.s }, p0, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     st1w	{ z0.s }, p7, [z0.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1w	{ z21.d }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1w	{ z21.s }, p5, [x10, #5, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1w	{ z31.d }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     st1w	{ z31.d }, p7, [z31.d, #124]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     st1w	{ z31.s }, p7, [sp, #-1, mul vl]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     st1w	{ z31.s }, p7, [z31.s, #124]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2b	{ z0.b, z1.b }, p0, [x0, x0]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2b	{ z0.b, z1.b }, p0, [x0]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2b	{ z21.b, z22.b }, p5, [x10, #10, mul vl]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2b	{ z23.b, z24.b }, p3, [x13, #-16, mul vl]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2b	{ z5.b, z6.b }, p3, [x17, x16]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2d	{ z0.d, z1.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2d	{ z0.d, z1.d }, p0, [x0]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2d	{ z21.d, z22.d }, p5, [x10, #10, mul vl]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2d	{ z23.d, z24.d }, p3, [x13, #-16, mul vl]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2d	{ z5.d, z6.d }, p3, [x17, x16, lsl #3]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2h	{ z0.h, z1.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2h	{ z0.h, z1.h }, p0, [x0]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2h	{ z21.h, z22.h }, p5, [x10, #10, mul vl]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2h	{ z23.h, z24.h }, p3, [x13, #-16, mul vl]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2h	{ z5.h, z6.h }, p3, [x17, x16, lsl #1]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2w	{ z0.s, z1.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2w	{ z0.s, z1.s }, p0, [x0]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2w	{ z21.s, z22.s }, p5, [x10, #10, mul vl]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2w	{ z23.s, z24.s }, p3, [x13, #-16, mul vl]
+# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -     st2w	{ z5.s, z6.s }, p3, [x17, x16, lsl #2]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3b	{ z0.b - z2.b }, p0, [x0, x0]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3b	{ z0.b - z2.b }, p0, [x0]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3b	{ z21.b - z23.b }, p5, [x10, #15, mul vl]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3b	{ z23.b - z25.b }, p3, [x13, #-24, mul vl]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3b	{ z5.b - z7.b }, p3, [x17, x16]
+# CHECK-NEXT:  -      -      -     14.00   -      -      -      -      -     st3d	{ z0.d - z2.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     14.00   -      -      -      -      -     st3d	{ z0.d - z2.d }, p0, [x0]
+# CHECK-NEXT:  -      -      -     14.00   -      -      -      -      -     st3d	{ z21.d - z23.d }, p5, [x10, #15, mul vl]
+# CHECK-NEXT:  -      -      -     14.00   -      -      -      -      -     st3d	{ z23.d - z25.d }, p3, [x13, #-24, mul vl]
+# CHECK-NEXT:  -      -      -     14.00   -      -      -      -      -     st3d	{ z5.d - z7.d }, p3, [x17, x16, lsl #3]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3h	{ z0.h - z2.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3h	{ z0.h - z2.h }, p0, [x0]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3h	{ z21.h - z23.h }, p5, [x10, #15, mul vl]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3h	{ z23.h - z25.h }, p3, [x13, #-24, mul vl]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3h	{ z5.h - z7.h }, p3, [x17, x16, lsl #1]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3w	{ z0.s - z2.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3w	{ z0.s - z2.s }, p0, [x0]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3w	{ z21.s - z23.s }, p5, [x10, #15, mul vl]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3w	{ z23.s - z25.s }, p3, [x13, #-24, mul vl]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st3w	{ z5.s - z7.s }, p3, [x17, x16, lsl #2]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4b	{ z0.b - z3.b }, p0, [x0, x0]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4b	{ z0.b - z3.b }, p0, [x0]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4b	{ z21.b - z24.b }, p5, [x10, #20, mul vl]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4b	{ z23.b - z26.b }, p3, [x13, #-32, mul vl]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4b	{ z5.b - z8.b }, p3, [x17, x16]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st4d	{ z0.d - z3.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st4d	{ z0.d - z3.d }, p0, [x0]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st4d	{ z21.d - z24.d }, p5, [x10, #20, mul vl]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st4d	{ z23.d - z26.d }, p3, [x13, #-32, mul vl]
+# CHECK-NEXT:  -      -      -     25.00   -      -      -      -      -     st4d	{ z5.d - z8.d }, p3, [x17, x16, lsl #3]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4h	{ z0.h - z3.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4h	{ z0.h - z3.h }, p0, [x0]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4h	{ z21.h - z24.h }, p5, [x10, #20, mul vl]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4h	{ z23.h - z26.h }, p3, [x13, #-32, mul vl]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4h	{ z5.h - z8.h }, p3, [x17, x16, lsl #1]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4w	{ z0.s - z3.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4w	{ z0.s - z3.s }, p0, [x0]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4w	{ z21.s - z24.s }, p5, [x10, #20, mul vl]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4w	{ z23.s - z26.s }, p3, [x13, #-32, mul vl]
+# CHECK-NEXT:  -      -      -     50.00   -      -      -      -      -     st4w	{ z5.s - z8.s }, p3, [x17, x16, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1b	{ z0.b }, p0, [x0, x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1b	{ z0.b }, p0, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1b	{ z0.d }, p0, [z1.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     stnt1b	{ z0.s }, p0, [z1.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1b	{ z21.b }, p5, [x10, #7, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1b	{ z23.b }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1b	{ z31.d }, p7, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1b	{ z31.d }, p7, [z31.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     stnt1b	{ z31.s }, p7, [z31.s, x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     stnt1b	{ z31.s }, p7, [z31.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1d	{ z0.d }, p0, [x0, x0, lsl #3]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1d	{ z0.d }, p0, [x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1d	{ z0.d }, p0, [z1.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1d	{ z21.d }, p5, [x10, #7, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1d	{ z23.d }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1d	{ z31.d }, p7, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1d	{ z31.d }, p7, [z31.d]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1h	{ z0.d }, p0, [z1.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1h	{ z0.h }, p0, [x0, x0, lsl #1]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1h	{ z0.h }, p0, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     stnt1h	{ z0.s }, p0, [z1.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1h	{ z21.h }, p5, [x10, #7, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1h	{ z23.h }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1h	{ z31.d }, p7, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1h	{ z31.d }, p7, [z31.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     stnt1h	{ z31.s }, p7, [z31.s, x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     stnt1h	{ z31.s }, p7, [z31.s]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1w	{ z0.d }, p0, [z1.d]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1w	{ z0.s }, p0, [x0, x0, lsl #2]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1w	{ z0.s }, p0, [x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     stnt1w	{ z0.s }, p0, [z1.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1w	{ z21.s }, p5, [x10, #7, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     stnt1w	{ z23.s }, p3, [x13, #-8, mul vl]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1w	{ z31.d }, p7, [z31.d, x0]
+# CHECK-NEXT:  -      -      -     7.00    -      -      -      -      -     stnt1w	{ z31.d }, p7, [z31.d]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     stnt1w	{ z31.s }, p7, [z31.s, x0]
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -     stnt1w	{ z31.s }, p7, [z31.s]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	p0, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	p15, [sp, #-256, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	p5, [x10, #255, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	z0, [x0]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	z21, [x10, #-256, mul vl]
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -      -     str	z31, [sp, #255, mul vl]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.b, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.h, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z0.s, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z21.b, p5/m, z21.b, z10.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z21.b, z10.b, z21.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z21.d, p5/m, z21.d, z10.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z21.d, z10.d, z21.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z21.h, p5/m, z21.h, z10.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z21.h, z10.h, z21.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z21.s, p5/m, z21.s, z10.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z21.s, z10.s, z21.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z23.b, p3/m, z23.b, z13.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z23.b, z13.b, z8.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z23.d, p3/m, z23.d, z13.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z23.d, z13.d, z8.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z23.h, p3/m, z23.h, z13.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z23.h, z13.h, z8.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z23.s, p3/m, z23.s, z13.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z23.s, z13.s, z8.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.b, z31.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.b, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.d, z31.d, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.h, z31.h, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.h, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.s, z31.s, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sub	z31.s, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subhnb	z0.b, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subhnb	z0.h, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subhnb	z0.s, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subhnt	z0.b, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subhnt	z0.h, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subhnt	z0.s, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.b, p0/m, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.h, p0/m, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.s, p0/m, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z31.b, z31.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z31.d, z31.d, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z31.h, z31.h, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     subr	z31.s, z31.s, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sunpkhi	z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sunpkhi	z31.h, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sunpkhi	z31.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sunpklo	z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sunpklo	z31.h, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sunpklo	z31.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     suqadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxtb	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxtb	z0.h, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxtb	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxtb	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxtb	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxtb	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxth	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxth	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxth	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxth	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxtw	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     sxtw	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	z28.b, { z29.b, z30.b }, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	z28.d, { z29.d, z30.d }, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	z28.h, { z29.h, z30.h }, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	z28.s, { z29.s, z30.s }, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	z31.b, { z31.b }, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	z31.d, { z31.d }, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	z31.h, { z31.h }, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbl	z31.s, { z31.s }, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	z31.b, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	z31.h, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     tbx	z31.s, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	p15.b, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	p15.d, p15.d, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	p15.h, p15.h, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	p15.s, p15.s, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	z31.b, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	z31.h, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn1	z31.s, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	p15.b, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	p15.d, p15.d, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	p15.h, p15.h, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	p15.s, p15.s, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	z31.b, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	z31.h, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     trn2	z31.s, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uaba	z0.b, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uaba	z0.d, z1.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uaba	z0.h, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uaba	z0.s, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uabalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uabalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uabalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uabalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uabalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uabalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabd	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabd	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabd	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabd	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdlb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdlb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdlb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdlt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdlt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uabdlt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uadalp	z0.h, p0/m, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uadalp	z29.s, p0/m, z30.h
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     uadalp	z30.d, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddlt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddv	d0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddv	d0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddv	d0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddwb	z0.h, z1.h, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddwb	z29.s, z30.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddwb	z31.d, z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddwt	z0.h, z1.h, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddwt	z29.s, z30.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uaddwt	z31.d, z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	z0.d, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	z0.h, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	z0.h, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	z0.h, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	z0.s, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ucvtf	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     23.00  udiv	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     12.00  udiv	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     23.00  udivr	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     12.00  udivr	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     udot	z0.d, z1.h, z15.h[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     udot	z0.d, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     udot	z0.s, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     udot	z0.s, z1.b, z7.b[3]
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhsub	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhsub	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhsub	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhsub	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhsubr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhsubr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhsubr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uhsubr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umax	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umax	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umax	z31.b, z31.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umax	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umax	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umax	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxv	b0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umaxv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umin	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umin	z31.b, p7/m, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umin	z31.b, z31.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umin	z31.d, p7/m, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umin	z31.h, p7/m, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     umin	z31.s, p7/m, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminp	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminp	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminv	b0, p7, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminv	d0, p7, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminv	h0, p7, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uminv	s0, p7, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlalt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslb	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslb	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslb	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslt	z0.d, z1.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslt	z0.h, z1.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslt	z0.s, z1.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umlslt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     ummla	z0.s, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umulh	z0.b, p7/m, z0.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umulh	z0.b, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umulh	z0.d, p7/m, z0.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umulh	z0.h, p7/m, z0.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umulh	z0.h, z1.h, z2.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umulh	z0.s, p7/m, z0.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umulh	z29.s, z30.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umulh	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullb	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullb	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullt	z0.d, z1.s, z15.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullt	z0.s, z1.h, z7.h[7]
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     umullt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.b, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.h, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z0.s, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z31.b, z31.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z31.d, z31.d, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z31.h, z31.h, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqadd	z31.s, z31.s, #65280
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecb	w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecb	w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecb	w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecb	w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecb	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecb	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecb	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecb	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecb	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecd	w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecd	w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecd	w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecd	w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecd	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecd	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecd	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecd	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecd	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecd	z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecd	z0.d, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecd	z0.d, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecd	z0.d, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdech	w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdech	w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdech	w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdech	w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdech	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdech	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdech	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdech	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdech	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdech	z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdech	z0.h, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdech	z0.h, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdech	z0.h, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	wzr, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	wzr, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	wzr, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	wzr, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	x0, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	x0, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	x0, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	x0, p0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	z0.d, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	z0.h, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecp	z0.s, p0.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecw	w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecw	w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecw	w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecw	w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecw	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecw	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecw	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecw	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqdecw	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecw	z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecw	z0.s, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecw	z0.s, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqdecw	z0.s, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincb	w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincb	w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincb	w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincb	w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincb	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincb	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincb	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincb	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincb	x0, vl1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincd	w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincd	w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincd	w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincd	w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincd	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincd	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincd	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincd	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincd	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincd	z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincd	z0.d, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincd	z0.d, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincd	z0.d, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqinch	w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqinch	w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqinch	w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqinch	w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqinch	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqinch	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqinch	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqinch	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqinch	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqinch	z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqinch	z0.h, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqinch	z0.h, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqinch	z0.h, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	wzr, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	wzr, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	wzr, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	wzr, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	x0, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	x0, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	x0, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	x0, p0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	z0.d, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	z0.h, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincp	z0.s, p0.s
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincw	w0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincw	w0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincw	w0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincw	w0, pow2, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincw	x0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincw	x0, #14
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincw	x0, all, mul #16
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincw	x0, pow2
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     uqincw	x0, vl1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincw	z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincw	z0.s, all, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincw	z0.s, pow2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqincw	z0.s, pow2, mul #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqrshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z0.b, p0/m, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z0.d, p0/m, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z0.h, p0/m, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z0.s, p0/m, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z31.b, p0/m, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z31.d, p0/m, z31.d, #63
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z31.h, p0/m, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshl	z31.s, p0/m, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnb	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnb	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnb	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnb	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnb	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnb	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnt	z0.b, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnt	z0.h, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnt	z0.s, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnt	z31.b, z31.h, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnt	z31.h, z31.s, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqshrnt	z31.s, z31.d, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.b, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.b, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.d, z0.d, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.d, z0.d, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.h, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.h, z0.h, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.h, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.s, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.s, z0.s, #0, lsl #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z0.s, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z31.b, z31.b, #255
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z31.d, z31.d, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z31.h, z31.h, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsub	z31.s, z31.s, #65280
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsubr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsubr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsubr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqsubr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtnb	z0.b, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtnb	z0.h, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtnb	z0.s, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtnt	z0.b, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtnt	z0.h, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uqxtnt	z0.s, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     urecpe	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urhadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urhadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urhadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urhadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshl	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshl	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshl	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshl	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshlr	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshlr	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshlr	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshlr	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	z0.b, p0/m, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	z0.d, p0/m, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	z0.h, p0/m, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	z0.s, p0/m, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	z31.b, p0/m, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	z31.d, p0/m, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	z31.h, p0/m, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     urshr	z31.s, p0/m, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     ursqrte	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	z0.b, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	z0.d, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	z0.h, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	z0.s, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	z31.b, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	z31.d, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	z31.h, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -     ursra	z31.s, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllb	z0.d, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllb	z0.h, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllb	z0.s, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllb	z31.d, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllb	z31.h, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllb	z31.s, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllt	z0.d, z0.s, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllt	z0.h, z0.b, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllt	z0.s, z0.h, #0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllt	z31.d, z31.s, #31
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllt	z31.h, z31.b, #7
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     ushllt	z31.s, z31.h, #15
+# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -     usmmla	z0.s, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	z29.s, p7/m, z29.s, z30.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usqadd	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	z0.b, z0.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	z0.d, z0.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	z0.h, z0.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	z0.s, z0.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	z31.b, z31.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	z31.d, z31.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	z31.h, z31.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usra	z31.s, z31.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usublb	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usublb	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usublb	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usublt	z0.h, z1.b, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usublt	z29.s, z30.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usublt	z31.d, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubwb	z0.h, z1.h, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubwb	z29.s, z30.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubwb	z31.d, z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubwt	z0.h, z1.h, z2.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubwt	z29.s, z30.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     usubwt	z31.d, z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uunpkhi	z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uunpkhi	z31.h, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uunpkhi	z31.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uunpklo	z31.d, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uunpklo	z31.h, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uunpklo	z31.s, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxtb	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxtb	z0.h, p0/m, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxtb	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxtb	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxtb	z31.h, p7/m, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxtb	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxth	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxth	z0.s, p0/m, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxth	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxth	z31.s, p7/m, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxtw	z0.d, p0/m, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uxtw	z31.d, p7/m, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	p15.b, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	p15.d, p15.d, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	p15.h, p15.h, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	p15.s, p15.s, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	z31.b, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	z31.h, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp1	z31.s, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	p15.b, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	p15.d, p15.d, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	p15.h, p15.h, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	p15.s, p15.s, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	z31.b, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	z31.h, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     uzp2	z31.s, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.b, w0, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.b, wzr, w0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.b, x0, xzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.b, xzr, x0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.d, w0, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.d, x0, xzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.h, w0, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.h, x0, xzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.s, w0, wzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilege	p15.s, x0, xzr
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilerw	p15.b, x30, x30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilerw	p15.d, x30, x30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilerw	p15.h, x30, x30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilerw	p15.s, x30, x30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilewr	p15.b, x30, x30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilewr	p15.d, x30, x30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilewr	p15.h, x30, x30
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     whilewr	p15.s, x30, x30
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     wrffr	p0.b
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -     wrffr	p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xar	z0.b, z0.b, z1.b, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xar	z0.d, z0.d, z1.d, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xar	z0.h, z0.h, z1.h, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xar	z0.s, z0.s, z1.s, #1
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xar	z31.b, z31.b, z30.b, #8
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xar	z31.d, z31.d, z30.d, #64
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xar	z31.h, z31.h, z30.h, #16
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     xar	z31.s, z31.s, z30.s, #32
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	p0.b, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	p0.d, p0.d, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	p0.h, p0.h, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	p0.s, p0.s, p0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	p15.b, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	p15.d, p15.d, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	p15.h, p15.h, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	p15.s, p15.s, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	z0.b, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	z0.h, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	z0.s, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	z31.b, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	z31.h, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip1	z31.s, z31.s, z31.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	p0.b, p0.b, p0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	p0.d, p0.d, p0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	p0.h, p0.h, p0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	p0.s, p0.s, p0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	p15.b, p15.b, p15.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	p15.d, p15.d, p15.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	p15.h, p15.h, p15.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	p15.s, p15.s, p15.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	z0.b, z0.b, z0.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	z0.d, z0.d, z0.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	z0.h, z0.h, z0.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	z0.s, z0.s, z0.s
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	z31.b, z31.b, z31.b
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	z31.d, z31.d, z31.d
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	z31.h, z31.h, z31.h
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -     zip2	z31.s, z31.s, z31.s

From a1c2a712939897251729b6fc436a2db7db6f03fc Mon Sep 17 00:00:00 2001
From: Andrei Golubev <andrey.golubev@intel.com>
Date: Wed, 18 Jun 2025 14:38:58 +0200
Subject: [PATCH 0801/1322] [mlir][bufferization] Use Type instead of Value in
 unknown conversion (#144658)

Generally, bufferization should be able to create a memref from a tensor
without needing to know more than just a mlir::Type. Thus, change
BufferizationOptions::UnknownTypeConverterFn to accept just a type
(mlir::TensorType for now) instead of mlir::Value. Additionally, apply
the same rationale to getMemRefType() helper function.

Both changes are prerequisites to enable custom types support in
one-shot bufferization.
---
 .../IR/BufferizableOpInterface.h              |  9 +++++----
 .../IR/BufferizableOpInterface.cpp            | 19 +++++++++----------
 .../Bufferization/Transforms/Bufferize.cpp    |  4 ++--
 .../SparsificationAndBufferizationPass.cpp    |  6 +++---
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index adccbef754ec..2fb795f16ae2 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -265,9 +265,9 @@ struct BufferizationOptions {
       std::function<BaseMemRefType(TensorType, Attribute memorySpace,
                                    func::FuncOp, const BufferizationOptions &)>;
   /// Tensor -> MemRef type converter.
-  /// Parameters: Value, memory space, bufferization options
+  /// Parameters: tensor type, memory space, bufferization options
   using UnknownTypeConverterFn = std::function<BaseMemRefType(
-      Value, Attribute memorySpace, const BufferizationOptions &)>;
+      TensorType, Attribute memorySpace, const BufferizationOptions &)>;
   // Produce a MemorySpace attribute from a tensor type
   using DefaultMemorySpaceFn =
       std::function<std::optional<Attribute>(TensorType t)>;
@@ -655,7 +655,7 @@ OpTy replaceOpWithNewBufferizedOp(RewriterBase &rewriter, Operation *op,
   return newOp;
 }
 
-/// Return a MemRefType to which the type of the given value can be bufferized.
+/// Return a MemRefType to which the TensorType can be bufferized.
 ///
 /// If possible, op bufferization implementations should not use this function
 /// and instead infer precise memref types for tensor results by themselves.
@@ -667,7 +667,8 @@ OpTy replaceOpWithNewBufferizedOp(RewriterBase &rewriter, Operation *op,
 /// Note: Canonicalization patterns could clean up layout maps and infer more
 /// precise layout maps after bufferization. However, many possible
 /// canonicalizations are currently not implemented.
-BaseMemRefType getMemRefType(Value value, const BufferizationOptions &options,
+BaseMemRefType getMemRefType(TensorType tensorType,
+                             const BufferizationOptions &options,
                              MemRefLayoutAttrInterface layout = {},
                              Attribute memorySpace = nullptr);
 
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index 1d6e1bdaf80f..dd43647682ea 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -345,10 +345,9 @@ defaultFunctionArgTypeConverter(TensorType type, Attribute memorySpace,
 }
 /// Default unknown type converter: Use a fully dynamic layout map.
 BaseMemRefType
-defaultUnknownTypeConverter(Value value, Attribute memorySpace,
+defaultUnknownTypeConverter(TensorType tensorType, Attribute memorySpace,
                             const BufferizationOptions &options) {
-  return getMemRefTypeWithFullyDynamicLayout(
-      llvm::cast<TensorType>(value.getType()), memorySpace);
+  return getMemRefTypeWithFullyDynamicLayout(tensorType, memorySpace);
 }
 
 } // namespace
@@ -724,7 +723,8 @@ bufferization::getBufferType(Value value, const BufferizationOptions &options,
   if (!memSpace.has_value())
     return op->emitError("could not infer memory space");
 
-  return getMemRefType(value, options, /*layout=*/{}, *memSpace);
+  return getMemRefType(cast<TensorType>(value.getType()), options,
+                       /*layout=*/{}, *memSpace);
 }
 
 bool bufferization::hasTensorSemantics(Operation *op) {
@@ -797,12 +797,10 @@ LogicalResult BufferizationOptions::createMemCpy(OpBuilder &b, Location loc,
 // Bufferization-specific IRMapping support with debugging.
 //===----------------------------------------------------------------------===//
 
-BaseMemRefType bufferization::getMemRefType(Value value,
+BaseMemRefType bufferization::getMemRefType(TensorType tensorType,
                                             const BufferizationOptions &options,
                                             MemRefLayoutAttrInterface layout,
                                             Attribute memorySpace) {
-  auto tensorType = llvm::cast<TensorType>(value.getType());
-
   // Case 1: Unranked memref type.
   if (auto unrankedTensorType =
           llvm::dyn_cast<UnrankedTensorType>(tensorType)) {
@@ -819,7 +817,7 @@ BaseMemRefType bufferization::getMemRefType(Value value,
                            memorySpace);
   }
 
-  return options.unknownTypeConverterFn(value, memorySpace, options);
+  return options.unknownTypeConverterFn(tensorType, memorySpace, options);
 }
 
 BaseMemRefType
@@ -955,10 +953,11 @@ FailureOr<BaseMemRefType> bufferization::detail::defaultGetBufferType(
     const BufferizationState &bufferizationState,
     SmallVector<Value> &invocationStack) {
   assert(llvm::isa<TensorType>(value.getType()) && "expected tensor type");
+  auto tensorType = cast<TensorType>(value.getType());
 
   // No further analysis is possible for a block argument.
   if (llvm::isa<BlockArgument>(value))
-    return bufferization::getMemRefType(value, options);
+    return bufferization::getMemRefType(tensorType, options);
 
   // Value is an OpResult.
   Operation *op = getOwnerOfValue(value);
@@ -981,7 +980,7 @@ FailureOr<BaseMemRefType> bufferization::detail::defaultGetBufferType(
   if (!memSpace.has_value())
     return op->emitError("could not infer memory space");
 
-  return getMemRefType(value, options, /*layout=*/{}, *memSpace);
+  return getMemRefType(tensorType, options, /*layout=*/{}, *memSpace);
 }
 
 bool bufferization::detail::defaultIsRepetitiveRegion(
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index c7681d309a4a..7e9b9119ce94 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -109,9 +109,9 @@ struct OneShotBufferizePass
                   "'unknown-type-conversion'");
         return signalPassFailure();
       }
-      opt.unknownTypeConverterFn = [=](Value value, Attribute memorySpace,
+      opt.unknownTypeConverterFn = [=](TensorType tensorType,
+                                       Attribute memorySpace,
                                        const BufferizationOptions &options) {
-        auto tensorType = cast<TensorType>(value.getType());
         if (unknownTypeConversionOption == LayoutMapOption::IdentityLayoutMap)
           return bufferization::getMemRefTypeWithStaticIdentityLayout(
               tensorType, memorySpace);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
index a3ab53d81811..15e5102462ad 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
@@ -223,10 +223,10 @@ mlir::getBufferizationOptionsForSparsification(bool analysisOnly) {
   OneShotBufferizationOptions options;
   options.bufferizeFunctionBoundaries = true;
   options.setFunctionBoundaryTypeConversion(LayoutMapOption::IdentityLayoutMap);
-  options.unknownTypeConverterFn = [](Value value, Attribute memorySpace,
+  options.unknownTypeConverterFn = [](TensorType tensorType,
+                                      Attribute memorySpace,
                                       const BufferizationOptions &options) {
-    return getMemRefTypeWithStaticIdentityLayout(
-        cast<TensorType>(value.getType()), memorySpace);
+    return getMemRefTypeWithStaticIdentityLayout(tensorType, memorySpace);
   };
   if (analysisOnly) {
     options.testAnalysisOnly = true;

From 66580f77b826e71a9727f1d6287bec6a6101f620 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Wed, 18 Jun 2025 14:42:09 +0200
Subject: [PATCH 0802/1322] [mlir][Transforms][NFC] Dialect Conversion: Keep
 `unresolvedMaterializations` up to date (#144254)

`unresolvedMaterializations` is a mapping from
`UnrealizedConversionCastOp` to `UnresolvedMaterializationRewrite`. This
mapping is needed to find the correct type converter for an unresolved
materialization.

With this commit, `unresolvedMaterializations` is updated immediately
when an op is being erased. This also cleans up the code base a bit:
`SingleEraseRewriter` is now used only during the "cleanup" phase and no
longer needed as a field of `ConversionRewriterImpl`.

This commit is in preparation of the One-Shot Dialect Conversion
refactoring: `allowPatternRollback = false` will in the future trigger
immediate materialization of all IR changes.
---
 .../Transforms/Utils/DialectConversion.cpp    | 33 +++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 7de26d7cfa84..c4b85ec4f67d 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -848,7 +848,7 @@ namespace detail {
 struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   explicit ConversionPatternRewriterImpl(MLIRContext *ctx,
                                          const ConversionConfig &config)
-      : context(ctx), eraseRewriter(ctx), config(config) {}
+      : context(ctx), config(config) {}
 
   //===--------------------------------------------------------------------===//
   // State Management
@@ -981,8 +981,11 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   /// no new IR is created between calls to `eraseOp`/`eraseBlock`.
   struct SingleEraseRewriter : public RewriterBase, RewriterBase::Listener {
   public:
-    SingleEraseRewriter(MLIRContext *context)
-        : RewriterBase(context, /*listener=*/this) {}
+    SingleEraseRewriter(
+        MLIRContext *context,
+        std::function<void(Operation *)> opErasedCallback = nullptr)
+        : RewriterBase(context, /*listener=*/this),
+          opErasedCallback(opErasedCallback) {}
 
     /// Erase the given op (unless it was already erased).
     void eraseOp(Operation *op) override {
@@ -1003,13 +1006,20 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
 
     bool wasErased(void *ptr) const { return erased.contains(ptr); }
 
-    void notifyOperationErased(Operation *op) override { erased.insert(op); }
+    void notifyOperationErased(Operation *op) override {
+      erased.insert(op);
+      if (opErasedCallback)
+        opErasedCallback(op);
+    }
 
     void notifyBlockErased(Block *block) override { erased.insert(block); }
 
   private:
     /// Pointers to all erased operations and blocks.
     DenseSet<void *> erased;
+
+    /// A callback that is invoked when an operation is erased.
+    std::function<void(Operation *)> opErasedCallback;
   };
 
   //===--------------------------------------------------------------------===//
@@ -1019,11 +1029,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   /// MLIR context.
   MLIRContext *context;
 
-  /// A rewriter that keeps track of ops/block that were already erased and
-  /// skips duplicate op/block erasures. This rewriter is used during the
-  /// "cleanup" phase.
-  SingleEraseRewriter eraseRewriter;
-
   // Mapping between replaced values that differ in type. This happens when
   // replacing a value with one of a different type.
   ConversionValueMapping mapping;
@@ -1195,6 +1200,11 @@ void ConversionPatternRewriterImpl::applyRewrites() {
     rewrites[i]->commit(rewriter);
 
   // Clean up all rewrites.
+  SingleEraseRewriter eraseRewriter(
+      context, /*opErasedCallback=*/[&](Operation *op) {
+        if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op))
+          unresolvedMaterializations.erase(castOp);
+      });
   for (auto &rewrite : rewrites)
     rewrite->cleanup(eraseRewriter);
 }
@@ -2714,11 +2724,8 @@ LogicalResult OperationConverter::convertOperations(ArrayRef<Operation *> ops) {
   SmallVector<UnrealizedConversionCastOp> allCastOps;
   const DenseMap<UnrealizedConversionCastOp, UnresolvedMaterializationRewrite *>
       &materializations = rewriterImpl.unresolvedMaterializations;
-  for (auto it : materializations) {
-    if (rewriterImpl.eraseRewriter.wasErased(it.first))
-      continue;
+  for (auto it : materializations)
     allCastOps.push_back(it.first);
-  }
 
   // Reconcile all UnrealizedConversionCastOps that were inserted by the
   // dialect conversion frameworks. (Not the one that were inserted by

From 4b2ab1494bc07493087252dff4e5e19808703048 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Wed, 18 Jun 2025 07:46:49 -0500
Subject: [PATCH 0803/1322] [flang][OpenMP] Don't crash on iterator modifier in
 declare mapper (#144359)

Both the declare mapper directive argument, and the iterator modifier
can contain declaration-type-spec, so make sure that the processing of
one ends before processing of the other begins in semantic analysis.
---
 flang/lib/Semantics/resolve-names.cpp                 |  2 +-
 .../Lower/OpenMP/Todo/declare-mapper-iterator.f90     | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Lower/OpenMP/Todo/declare-mapper-iterator.f90

diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index f66918e5c140..9e465f8ff3e1 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1800,9 +1800,9 @@ void OmpVisitor::ProcessMapperSpecifier(const parser::OmpMapperSpecifier &spec,
   Walk(std::get<parser::TypeSpec>(spec.t));
   auto &varName{std::get<parser::Name>(spec.t)};
   DeclareObjectEntity(varName);
+  EndDeclTypeSpec();
 
   Walk(clauses);
-  EndDeclTypeSpec();
   PopScope();
 }
 
diff --git a/flang/test/Lower/OpenMP/Todo/declare-mapper-iterator.f90 b/flang/test/Lower/OpenMP/Todo/declare-mapper-iterator.f90
new file mode 100644
index 000000000000..dacd6d624659
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/declare-mapper-iterator.f90
@@ -0,0 +1,11 @@
+!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s
+
+!CHECK: Support for iterator modifiers is not implemented yet
+subroutine f(arg)
+  type :: s
+    integer :: a(10)
+  end type
+  type(s) :: arg(:)
+
+  !$omp declare mapper(m: s :: v) map(mapper(m), iterator(i = 1:10): v%a(i))
+end

From a83d3362f686725bac76bfb9562663908de25f15 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Wed, 18 Jun 2025 14:02:11 +0100
Subject: [PATCH 0804/1322] [flang][OpenMP] Don't allow DO CONCURRENT inside of
 a loop nest (#144506)

I don't think DO CONCURRENT fits the definition of a Canonical Loop Nest
(OpenMP 6.0 section 6.4.1).
It is however explicitly allowed for the LOOP construct (6.0 section
13.8).

There's some obscure language in OpenMP 6.0 for the LOOP construct:

> If the collapsed loop is a DO CONCURRENT loop, neither the
> data-sharing attribute clauses nor the collapse clause may be
specified.

From the surrounding context, I think "collapsed loop" just means the
loop that the LOOP construct applies to. So I will interpret this to
mean that DO CONCURRENT can only be used with the LOOP construct if it
does not contain the COLLAPSE clause.

This also fixes a bug where the associated clause was never cleared
after it was set.

Fixes #144178
---
 flang/lib/Semantics/resolve-directives.cpp    | 38 +++++++++++++++---
 .../Lower/OpenMP/Todo/omp-doconcurrent.f90    | 10 -----
 .../OpenMP/do-concurrent-collapse.f90         | 39 +++++++++++++++++++
 3 files changed, 71 insertions(+), 16 deletions(-)
 delete mode 100644 flang/test/Lower/OpenMP/Todo/omp-doconcurrent.f90
 create mode 100644 flang/test/Semantics/OpenMP/do-concurrent-collapse.f90

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 282660684e78..57db76e2160d 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -23,6 +23,7 @@
 #include "flang/Semantics/openmp-modifiers.h"
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/tools.h"
+#include "llvm/Frontend/OpenMP/OMP.h.inc"
 #include "llvm/Support/Debug.h"
 #include <list>
 #include <map>
@@ -740,9 +741,7 @@ public:
   }
 
   const parser::OmpClause *associatedClause{nullptr};
-  void SetAssociatedClause(const parser::OmpClause &c) {
-    associatedClause = &c;
-  }
+  void SetAssociatedClause(const parser::OmpClause *c) { associatedClause = c; }
   const parser::OmpClause *GetAssociatedClause() { return associatedClause; }
 
 private:
@@ -1919,12 +1918,17 @@ std::int64_t OmpAttributeVisitor::GetAssociatedLoopLevelFromClauses(
   }
 
   if (orderedLevel && (!collapseLevel || orderedLevel >= collapseLevel)) {
-    SetAssociatedClause(*ordClause);
+    SetAssociatedClause(ordClause);
     return orderedLevel;
   } else if (!orderedLevel && collapseLevel) {
-    SetAssociatedClause(*collClause);
+    SetAssociatedClause(collClause);
     return collapseLevel;
-  } // orderedLevel < collapseLevel is an error handled in structural checks
+  } else {
+    SetAssociatedClause(nullptr);
+  }
+  // orderedLevel < collapseLevel is an error handled in structural
+  // checks
+
   return 1; // default is outermost loop
 }
 
@@ -1952,9 +1956,31 @@ void OmpAttributeVisitor::PrivatizeAssociatedLoopIndexAndCheckLoopLevel(
     ivDSA = Symbol::Flag::OmpLastPrivate;
   }
 
+  bool isLoopConstruct{
+      GetContext().directive == llvm::omp::Directive::OMPD_loop};
+  const parser::OmpClause *clause{GetAssociatedClause()};
+  bool hasCollapseClause{
+      clause ? (clause->Id() == llvm::omp::OMPC_collapse) : false};
+
   const auto &outer{std::get<std::optional<parser::DoConstruct>>(x.t)};
   if (outer.has_value()) {
     for (const parser::DoConstruct *loop{&*outer}; loop && level > 0; --level) {
+      if (loop->IsDoConcurrent()) {
+        // DO CONCURRENT is explicitly allowed for the LOOP construct so long as
+        // there isn't a COLLAPSE clause
+        if (isLoopConstruct) {
+          if (hasCollapseClause) {
+            // hasCollapseClause implies clause != nullptr
+            context_.Say(clause->source,
+                "DO CONCURRENT loops cannot be used with the COLLAPSE clause."_err_en_US);
+          }
+        } else {
+          auto &stmt =
+              std::get<parser::Statement<parser::NonLabelDoStmt>>(loop->t);
+          context_.Say(stmt.source,
+              "DO CONCURRENT loops cannot form part of a loop nest."_err_en_US);
+        }
+      }
       // go through all the nested do-loops and resolve index variables
       const parser::Name *iv{GetLoopIndex(*loop)};
       if (iv) {
diff --git a/flang/test/Lower/OpenMP/Todo/omp-doconcurrent.f90 b/flang/test/Lower/OpenMP/Todo/omp-doconcurrent.f90
deleted file mode 100644
index a6d70fa44592..000000000000
--- a/flang/test/Lower/OpenMP/Todo/omp-doconcurrent.f90
+++ /dev/null
@@ -1,10 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-! CHECK: not yet implemented: Do Concurrent in Worksharing loop construct
-subroutine sb()
-  !$omp do
-  do concurrent(i=1:10)
-    print *, i
-  end do
-end subroutine
diff --git a/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90 b/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
new file mode 100644
index 000000000000..bb1929249183
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
@@ -0,0 +1,39 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp
+
+integer :: i, j
+!$omp parallel do collapse(2)
+do i = 1, 1
+  ! ERROR: DO CONCURRENT loops cannot form part of a loop nest.
+  do concurrent (j = 1:2)
+    print *, j
+  end do
+end do
+
+!$omp parallel do
+do i = 1, 1
+  ! This should not lead to an error because it is not part of a loop nest:
+  do concurrent (j = 1:2)
+    print *, j
+  end do
+end do
+
+!$omp parallel do
+! ERROR: DO CONCURRENT loops cannot form part of a loop nest.
+do concurrent (j = 1:2)
+  print *, j
+end do
+
+!$omp loop
+! Do concurrent is explicitly allowed inside of omp loop
+do concurrent (j = 1:2)
+  print *, j
+end do
+
+! ERROR: DO CONCURRENT loops cannot be used with the COLLAPSE clause.
+!$omp loop collapse(2)
+do i = 1, 1
+  do concurrent (j = 1:2)
+    print *, j
+  end do
+end do
+end

From 8584abb05a84d3bf4e84cdfe4154d7ade8bdfd04 Mon Sep 17 00:00:00 2001
From: Frank Schlimbach <frank.schlimbach@intel.com>
Date: Wed, 18 Jun 2025 15:04:55 +0200
Subject: [PATCH 0805/1322] [mlir] mlir/test/lit.local.cfg ->
 mlir/test/Target/SPIRV/lit.local.cfg (#144685)

renamed: mlir/test/lit.local.cfg -> mlir/test/Target/SPIRV/lit.local.cfg
---
 mlir/test/{ => Target/SPIRV}/lit.local.cfg | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename mlir/test/{ => Target/SPIRV}/lit.local.cfg (100%)

diff --git a/mlir/test/lit.local.cfg b/mlir/test/Target/SPIRV/lit.local.cfg
similarity index 100%
rename from mlir/test/lit.local.cfg
rename to mlir/test/Target/SPIRV/lit.local.cfg

From 68471d29eed2c49f9b439e505b3f24d387d54f97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder@redhat.com>
Date: Wed, 18 Jun 2025 15:17:53 +0200
Subject: [PATCH 0806/1322] =?UTF-8?q?Revert=20"Reapply=20"[clang][bytecode?=
 =?UTF-8?q?]=20Allocate=20IntegralAP=20and=20Floating=20types=20usi?=
 =?UTF-8?q?=E2=80=A6=20(#144676)"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 7c15edb306932e41c159f3d69c161ed0d89d47b7.

This still breaks clang-armv8-quick:
https://lab.llvm.org/buildbot/#/builders/154/builds/17587
---
 clang/lib/AST/ByteCode/Compiler.cpp           | 114 +++---
 clang/lib/AST/ByteCode/Compiler.h             |   1 -
 clang/lib/AST/ByteCode/Descriptor.cpp         |   2 +-
 clang/lib/AST/ByteCode/Disasm.cpp             |  58 +--
 clang/lib/AST/ByteCode/Floating.h             | 252 +++++--------
 clang/lib/AST/ByteCode/Integral.h             |   3 -
 clang/lib/AST/ByteCode/IntegralAP.h           | 234 +++++-------
 clang/lib/AST/ByteCode/Interp.cpp             | 106 +-----
 clang/lib/AST/ByteCode/Interp.h               | 341 ++++--------------
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  55 +--
 .../lib/AST/ByteCode/InterpBuiltinBitCast.cpp |   4 +-
 clang/lib/AST/ByteCode/InterpState.h          |  30 --
 clang/lib/AST/ByteCode/Opcodes.td             |  14 +-
 clang/lib/AST/ByteCode/PrimType.h             |  17 -
 clang/lib/AST/ByteCode/Program.h              |  24 +-
 .../ByteCode/builtin-bit-cast-long-double.cpp |  10 +-
 clang/test/AST/ByteCode/builtin-functions.cpp |  12 +-
 17 files changed, 345 insertions(+), 932 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 3f884ed8d094..9fe4803ce98e 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -748,8 +748,7 @@ bool Compiler<Emitter>::VisitFloatingLiteral(const FloatingLiteral *E) {
   if (DiscardResult)
     return true;
 
-  APFloat F = E->getValue();
-  return this->emitFloat(F, E);
+  return this->emitConstFloat(E->getValue(), E);
 }
 
 template <class Emitter>
@@ -4186,10 +4185,8 @@ bool Compiler<Emitter>::visitZeroInitializer(PrimType T, QualType QT,
                              nullptr, E);
   case PT_MemberPtr:
     return this->emitNullMemberPtr(0, nullptr, E);
-  case PT_Float: {
-    APFloat F = APFloat::getZero(Ctx.getFloatSemantics(QT));
-    return this->emitFloat(F, E);
-  }
+  case PT_Float:
+    return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E);
   case PT_FixedPoint: {
     auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType());
     return this->emitConstFixedPoint(FixedPoint::zero(Sem), E);
@@ -4677,7 +4674,10 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       if (!visitInitializer(Init))
         return false;
 
-      return this->emitFinishInitGlobal(Init);
+      if (!this->emitFinishInit(Init))
+        return false;
+
+      return this->emitPopPtr(Init);
     };
 
     DeclScope<Emitter> LocalScope(this, VD);
@@ -4698,45 +4698,51 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       return false;
 
     return !Init || (checkDecl() && initGlobal(*GlobalIndex));
-  }
-  // Local variables.
-  InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
-
-  if (VarT) {
-    unsigned Offset = this->allocateLocalPrimitive(
-        VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block,
-        IsConstexprUnknown);
-    if (Init) {
-      // If this is a toplevel declaration, create a scope for the
-      // initializer.
-      if (Toplevel) {
-        LocalScope<Emitter> Scope(this);
-        if (!this->visit(Init))
-          return false;
-        return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
-      } else {
-        if (!this->visit(Init))
-          return false;
-        return this->emitSetLocal(*VarT, Offset, VD);
-      }
-    }
   } else {
-    if (std::optional<unsigned> Offset = this->allocateLocal(
-            VD, VD->getType(), nullptr, ScopeKind::Block, IsConstexprUnknown)) {
-      if (!Init)
-        return true;
+    InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
 
-      if (!this->emitGetPtrLocal(*Offset, Init))
-        return false;
+    if (VarT) {
+      unsigned Offset = this->allocateLocalPrimitive(
+          VD, *VarT, VD->getType().isConstQualified(), nullptr,
+          ScopeKind::Block, IsConstexprUnknown);
+      if (Init) {
+        // If this is a toplevel declaration, create a scope for the
+        // initializer.
+        if (Toplevel) {
+          LocalScope<Emitter> Scope(this);
+          if (!this->visit(Init))
+            return false;
+          return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
+        } else {
+          if (!this->visit(Init))
+            return false;
+          return this->emitSetLocal(*VarT, Offset, VD);
+        }
+      }
+    } else {
+      if (std::optional<unsigned> Offset =
+              this->allocateLocal(VD, VD->getType(), nullptr, ScopeKind::Block,
+                                  IsConstexprUnknown)) {
+        if (!Init)
+          return true;
 
-      if (!visitInitializer(Init))
-        return false;
+        if (!this->emitGetPtrLocal(*Offset, Init))
+          return false;
 
-      return this->emitFinishInitPop(Init);
+        if (!visitInitializer(Init))
+          return false;
+
+        if (!this->emitFinishInit(Init))
+          return false;
+
+        return this->emitPopPtr(Init);
+      }
+      return false;
     }
-    return false;
+    return true;
   }
-  return true;
+
+  return false;
 }
 
 template <class Emitter>
@@ -4745,10 +4751,8 @@ bool Compiler<Emitter>::visitAPValue(const APValue &Val, PrimType ValType,
   assert(!DiscardResult);
   if (Val.isInt())
     return this->emitConst(Val.getInt(), ValType, E);
-  else if (Val.isFloat()) {
-    APFloat F = Val.getFloat();
-    return this->emitFloat(F, E);
-  }
+  else if (Val.isFloat())
+    return this->emitConstFloat(Val.getFloat(), E);
 
   if (Val.isLValue()) {
     if (Val.isNullPointer())
@@ -6129,10 +6133,8 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      APFloat F(TargetSemantics, 1);
-      if (!this->emitFloat(F, E))
+      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
         return false;
-
       if (!this->emitAddf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6174,10 +6176,8 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      APFloat F(TargetSemantics, 1);
-      if (!this->emitFloat(F, E))
+      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
         return false;
-
       if (!this->emitSubf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6953,20 +6953,6 @@ bool Compiler<Emitter>::emitDummyPtr(const DeclTy &D, const Expr *E) {
   return true;
 }
 
-template <class Emitter>
-bool Compiler<Emitter>::emitFloat(const APFloat &F, const Expr *E) {
-  assert(!DiscardResult && "Should've been checked before");
-
-  if (Floating::singleWord(F.getSemantics()))
-    return this->emitConstFloat(Floating(F), E);
-
-  APInt I = F.bitcastToAPInt();
-  return this->emitConstFloat(
-      Floating(const_cast<uint64_t *>(I.getRawData()),
-               llvm::APFloatBase::SemanticsToEnum(F.getSemantics())),
-      E);
-}
-
 //  This function is constexpr if and only if To, From, and the types of
 //  all subobjects of To and From are types T such that...
 //  (3.1) - is_union_v<T> is false;
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index a1d068cc7e0a..ac3ad84766dc 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -391,7 +391,6 @@ private:
   bool emitRecordDestruction(const Record *R, SourceInfo Loc);
   bool emitDestruction(const Descriptor *Desc, SourceInfo Loc);
   bool emitDummyPtr(const DeclTy &D, const Expr *E);
-  bool emitFloat(const APFloat &F, const Expr *E);
   unsigned collectBaseOffset(const QualType BaseType,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp
index 46e4d0d940b3..5531295dfa2f 100644
--- a/clang/lib/AST/ByteCode/Descriptor.cpp
+++ b/clang/lib/AST/ByteCode/Descriptor.cpp
@@ -368,7 +368,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsTemporary, bool IsConst, UnknownSize)
     : Source(D), ElemSize(primSize(Type)), Size(UnknownSizeMark),
       MDSize(MD.value_or(0)),
-      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)), PrimT(Type),
+      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)),
       IsConst(IsConst), IsMutable(false), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
       DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 7c6b78386b14..846dc2fe92a7 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -50,56 +50,34 @@ inline static std::string printArg(Program &P, CodePtr &OpPC) {
 }
 
 template <> inline std::string printArg<Floating>(Program &P, CodePtr &OpPC) {
-  auto Sem = Floating::deserializeSemantics(*OpPC);
+  auto F = Floating::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
-  unsigned BitWidth = llvm::APFloatBase::semanticsSizeInBits(
-      llvm::APFloatBase::EnumToSemantics(Sem));
-  auto Memory =
-      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
-  Floating Result(Memory.get(), Sem);
-  Floating::deserialize(*OpPC, &Result);
-
-  OpPC += align(Result.bytesToSerialize());
-
-  std::string S;
-  llvm::raw_string_ostream SS(S);
-  SS << Result;
-  return S;
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
 
 template <>
 inline std::string printArg<IntegralAP<false>>(Program &P, CodePtr &OpPC) {
-  using T = IntegralAP<false>;
-  unsigned BitWidth = T::deserializeSize(*OpPC);
-  auto Memory =
-      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
+  auto F = IntegralAP<false>::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
-  T Result(Memory.get(), BitWidth);
-  T::deserialize(*OpPC, &Result);
-
-  OpPC += Result.bytesToSerialize();
-  std::string Str;
-  llvm::raw_string_ostream SS(Str);
-  SS << Result;
-  return Str;
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
-
 template <>
 inline std::string printArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
-  using T = IntegralAP<true>;
-  unsigned BitWidth = T::deserializeSize(*OpPC);
-  auto Memory =
-      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
+  auto F = IntegralAP<true>::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
-  T Result(Memory.get(), BitWidth);
-  T::deserialize(*OpPC, &Result);
-
-  std::string Str;
-  llvm::raw_string_ostream SS(Str);
-  SS << Result;
-
-  OpPC += Result.bytesToSerialize();
-  return Str;
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
 
 template <> inline std::string printArg<FixedPoint>(Program &P, CodePtr &OpPC) {
diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
index 659892e720ab..3750568fc23c 100644
--- a/clang/lib/AST/ByteCode/Floating.h
+++ b/clang/lib/AST/ByteCode/Floating.h
@@ -17,79 +17,63 @@
 #include "clang/AST/APValue.h"
 #include "llvm/ADT/APFloat.h"
 
-// XXX This is just a debugging help. Setting this to 1 will heap-allocate ALL
-// floating values.
-#define ALLOCATE_ALL 0
-
 namespace clang {
 namespace interp {
 
 using APFloat = llvm::APFloat;
 using APSInt = llvm::APSInt;
-using APInt = llvm::APInt;
 
-/// If a Floating is constructed from Memory, it DOES NOT OWN THAT MEMORY.
-/// It will NOT copy the memory (unless, of course, copy() is called) and it
-/// won't alllocate anything. The allocation should happen via InterpState or
-/// Program.
 class Floating final {
 private:
-  union {
-    uint64_t Val = 0;
-    uint64_t *Memory;
-  };
-  llvm::APFloatBase::Semantics Semantics;
-
-  APFloat getValue() const {
-    unsigned BitWidth = bitWidth();
-    if (singleWord())
-      return APFloat(getSemantics(), APInt(BitWidth, Val));
-    unsigned NumWords = numWords();
-    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
-  }
+  // The underlying value storage.
+  APFloat F;
 
 public:
-  Floating() = default;
-  Floating(llvm::APFloatBase::Semantics Semantics)
-      : Val(0), Semantics(Semantics) {}
-  Floating(const APFloat &F) {
+  /// Zero-initializes a Floating.
+  Floating() : F(0.0f) {}
+  Floating(const APFloat &F) : F(F) {}
 
-    Semantics = llvm::APFloatBase::SemanticsToEnum(F.getSemantics());
-    this->copy(F);
+  // Static constructors for special floating point values.
+  static Floating getInf(const llvm::fltSemantics &Sem) {
+    return Floating(APFloat::getInf(Sem));
   }
-  Floating(uint64_t *Memory, llvm::APFloatBase::Semantics Semantics)
-      : Memory(Memory), Semantics(Semantics) {}
+  const APFloat &getAPFloat() const { return F; }
 
-  APFloat getAPFloat() const { return getValue(); }
-
-  bool operator<(Floating RHS) const { return getValue() < RHS.getValue(); }
-  bool operator>(Floating RHS) const { return getValue() > RHS.getValue(); }
-  bool operator<=(Floating RHS) const { return getValue() <= RHS.getValue(); }
-  bool operator>=(Floating RHS) const { return getValue() >= RHS.getValue(); }
+  bool operator<(Floating RHS) const { return F < RHS.F; }
+  bool operator>(Floating RHS) const { return F > RHS.F; }
+  bool operator<=(Floating RHS) const { return F <= RHS.F; }
+  bool operator>=(Floating RHS) const { return F >= RHS.F; }
+  bool operator==(Floating RHS) const { return F == RHS.F; }
+  bool operator!=(Floating RHS) const { return F != RHS.F; }
+  Floating operator-() const { return Floating(-F); }
 
   APFloat::opStatus convertToInteger(APSInt &Result) const {
     bool IsExact;
-    return getValue().convertToInteger(Result, llvm::APFloat::rmTowardZero,
-                                       &IsExact);
+    return F.convertToInteger(Result, llvm::APFloat::rmTowardZero, &IsExact);
   }
 
-  void toSemantics(const llvm::fltSemantics *Sem, llvm::RoundingMode RM,
-                   Floating *Result) const {
-    APFloat Copy = getValue();
+  Floating toSemantics(const llvm::fltSemantics *Sem,
+                       llvm::RoundingMode RM) const {
+    APFloat Copy = F;
     bool LosesInfo;
     Copy.convert(*Sem, RM, &LosesInfo);
     (void)LosesInfo;
-    Result->copy(Copy);
+    return Floating(Copy);
+  }
+
+  /// Convert this Floating to one with the same semantics as \Other.
+  Floating toSemantics(const Floating &Other, llvm::RoundingMode RM) const {
+    return toSemantics(&Other.F.getSemantics(), RM);
   }
 
   APSInt toAPSInt(unsigned NumBits = 0) const {
-    return APSInt(getValue().bitcastToAPInt());
+    return APSInt(F.bitcastToAPInt());
   }
-  APValue toAPValue(const ASTContext &) const { return APValue(getValue()); }
+  APValue toAPValue(const ASTContext &) const { return APValue(F); }
   void print(llvm::raw_ostream &OS) const {
     // Can't use APFloat::print() since it appends a newline.
     SmallVector<char, 16> Buffer;
-    getValue().toString(Buffer);
+    F.toString(Buffer);
     OS << Buffer;
   }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
@@ -99,62 +83,25 @@ public:
     return NameStr;
   }
 
-  unsigned bitWidth() const {
-    return llvm::APFloatBase::semanticsSizeInBits(getSemantics());
-  }
-  unsigned numWords() const { return llvm::APInt::getNumWords(bitWidth()); }
-  bool singleWord() const {
-#if ALLOCATE_ALL
-    return false;
-#endif
-    return numWords() == 1;
-  }
-  static bool singleWord(const llvm::fltSemantics &Sem) {
-#if ALLOCATE_ALL
-    return false;
-#endif
-    return APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem)) == 1;
-  }
-  const llvm::fltSemantics &getSemantics() const {
-    return llvm::APFloatBase::EnumToSemantics(Semantics);
-  }
-
-  void copy(const APFloat &F) {
-    if (singleWord()) {
-      Val = F.bitcastToAPInt().getZExtValue();
-    } else {
-      assert(Memory);
-      std::memcpy(Memory, F.bitcastToAPInt().getRawData(),
-                  numWords() * sizeof(uint64_t));
-    }
-  }
-
-  void take(uint64_t *NewMemory) {
-    if (singleWord())
-      return;
-
-    if (Memory)
-      std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
-    Memory = NewMemory;
-  }
+  unsigned bitWidth() const { return F.semanticsSizeInBits(F.getSemantics()); }
 
   bool isSigned() const { return true; }
-  bool isNegative() const { return getValue().isNegative(); }
-  bool isZero() const { return getValue().isZero(); }
-  bool isNonZero() const { return getValue().isNonZero(); }
-  bool isMin() const { return getValue().isSmallest(); }
-  bool isMinusOne() const { return getValue().isExactlyValue(-1.0); }
-  bool isNan() const { return getValue().isNaN(); }
-  bool isSignaling() const { return getValue().isSignaling(); }
-  bool isInf() const { return getValue().isInfinity(); }
-  bool isFinite() const { return getValue().isFinite(); }
-  bool isNormal() const { return getValue().isNormal(); }
-  bool isDenormal() const { return getValue().isDenormal(); }
-  llvm::FPClassTest classify() const { return getValue().classify(); }
-  APFloat::fltCategory getCategory() const { return getValue().getCategory(); }
+  bool isNegative() const { return F.isNegative(); }
+  bool isZero() const { return F.isZero(); }
+  bool isNonZero() const { return F.isNonZero(); }
+  bool isMin() const { return F.isSmallest(); }
+  bool isMinusOne() const { return F.isExactlyValue(-1.0); }
+  bool isNan() const { return F.isNaN(); }
+  bool isSignaling() const { return F.isSignaling(); }
+  bool isInf() const { return F.isInfinity(); }
+  bool isFinite() const { return F.isFinite(); }
+  bool isNormal() const { return F.isNormal(); }
+  bool isDenormal() const { return F.isDenormal(); }
+  llvm::FPClassTest classify() const { return F.classify(); }
+  APFloat::fltCategory getCategory() const { return F.getCategory(); }
 
   ComparisonCategoryResult compare(const Floating &RHS) const {
-    llvm::APFloatBase::cmpResult CmpRes = getValue().compare(RHS.getValue());
+    llvm::APFloatBase::cmpResult CmpRes = F.compare(RHS.F);
     switch (CmpRes) {
     case llvm::APFloatBase::cmpLessThan:
       return ComparisonCategoryResult::Less;
@@ -171,130 +118,97 @@ public:
   static APFloat::opStatus fromIntegral(APSInt Val,
                                         const llvm::fltSemantics &Sem,
                                         llvm::RoundingMode RM,
-                                        Floating *Result) {
+                                        Floating &Result) {
     APFloat F = APFloat(Sem);
     APFloat::opStatus Status = F.convertFromAPInt(Val, Val.isSigned(), RM);
-    Result->copy(F);
+    Result = Floating(F);
     return Status;
   }
 
-  static void bitcastFromMemory(const std::byte *Buff,
-                                const llvm::fltSemantics &Sem,
-                                Floating *Result) {
+  static Floating bitcastFromMemory(const std::byte *Buff,
+                                    const llvm::fltSemantics &Sem) {
     size_t Size = APFloat::semanticsSizeInBits(Sem);
     llvm::APInt API(Size, true);
     llvm::LoadIntFromMemory(API, (const uint8_t *)Buff, Size / 8);
-    Result->copy(APFloat(Sem, API));
+
+    return Floating(APFloat(Sem, API));
   }
 
   void bitcastToMemory(std::byte *Buff) const {
-    llvm::APInt API = getValue().bitcastToAPInt();
+    llvm::APInt API = F.bitcastToAPInt();
     llvm::StoreIntToMemory(API, (uint8_t *)Buff, bitWidth() / 8);
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    return sizeof(Semantics) + (numWords() * sizeof(uint64_t));
+    return sizeof(llvm::fltSemantics *) +
+           (APFloat::semanticsSizeInBits(F.getSemantics()) / 8);
   }
 
   void serialize(std::byte *Buff) const {
-    std::memcpy(Buff, &Semantics, sizeof(Semantics));
-    if (singleWord()) {
-      std::memcpy(Buff + sizeof(Semantics), &Val, sizeof(uint64_t));
-    } else {
-      std::memcpy(Buff + sizeof(Semantics), Memory,
-                  numWords() * sizeof(uint64_t));
-    }
+    // Semantics followed by an APInt.
+    *reinterpret_cast<const llvm::fltSemantics **>(Buff) = &F.getSemantics();
+
+    llvm::APInt API = F.bitcastToAPInt();
+    llvm::StoreIntToMemory(API, (uint8_t *)(Buff + sizeof(void *)),
+                           bitWidth() / 8);
   }
 
-  static llvm::APFloatBase::Semantics
-  deserializeSemantics(const std::byte *Buff) {
-    return *reinterpret_cast<const llvm::APFloatBase::Semantics *>(Buff);
+  static Floating deserialize(const std::byte *Buff) {
+    const llvm::fltSemantics *Sem;
+    std::memcpy((void *)&Sem, Buff, sizeof(void *));
+    return bitcastFromMemory(Buff + sizeof(void *), *Sem);
   }
 
-  static void deserialize(const std::byte *Buff, Floating *Result) {
-    llvm::APFloatBase::Semantics Semantics;
-    std::memcpy(&Semantics, Buff, sizeof(Semantics));
-
-    unsigned BitWidth = llvm::APFloat::semanticsSizeInBits(
-        llvm::APFloatBase::EnumToSemantics(Semantics));
-    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
-
-    Result->Semantics = Semantics;
-    if (NumWords == 1 && !ALLOCATE_ALL) {
-      std::memcpy(&Result->Val, Buff + sizeof(Semantics), sizeof(uint64_t));
-    } else {
-      assert(Result->Memory);
-      std::memcpy(Result->Memory, Buff + sizeof(Semantics),
-                  NumWords * sizeof(uint64_t));
-    }
+  static Floating abs(const Floating &F) {
+    APFloat V = F.F;
+    if (V.isNegative())
+      V.changeSign();
+    return Floating(V);
   }
 
   // -------
 
   static APFloat::opStatus add(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.add(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.add(B.F, RM);
   }
 
   static APFloat::opStatus increment(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.getSemantics(), 1);
-    APFloat LHS = A.getValue();
-
-    auto Status = LHS.add(One, RM);
-    R->copy(LHS);
-    return Status;
+    APFloat One(A.F.getSemantics(), 1);
+    *R = Floating(A.F);
+    return R->F.add(One, RM);
   }
 
   static APFloat::opStatus sub(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.subtract(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.subtract(B.F, RM);
   }
 
   static APFloat::opStatus decrement(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.getSemantics(), 1);
-    APFloat LHS = A.getValue();
-
-    auto Status = LHS.subtract(One, RM);
-    R->copy(LHS);
-    return Status;
+    APFloat One(A.F.getSemantics(), 1);
+    *R = Floating(A.F);
+    return R->F.subtract(One, RM);
   }
 
   static APFloat::opStatus mul(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.multiply(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.multiply(B.F, RM);
   }
 
   static APFloat::opStatus div(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.divide(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.divide(B.F, RM);
   }
 
   static bool neg(const Floating &A, Floating *R) {
-    R->copy(-A.getValue());
+    *R = -A;
     return false;
   }
 };
diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h
index af5cd2d13ecc..13fdb5369f2b 100644
--- a/clang/lib/AST/ByteCode/Integral.h
+++ b/clang/lib/AST/ByteCode/Integral.h
@@ -99,9 +99,6 @@ public:
   bool operator>=(Integral RHS) const { return V >= RHS.V; }
   bool operator==(Integral RHS) const { return V == RHS.V; }
   bool operator!=(Integral RHS) const { return V != RHS.V; }
-  bool operator>=(unsigned RHS) const {
-    return static_cast<unsigned>(V) >= RHS;
-  }
 
   bool operator>(unsigned RHS) const {
     return V >= 0 && static_cast<unsigned>(V) > RHS;
diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 61cbd14ad174..8ee08dfb5cfe 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -28,19 +28,12 @@ namespace interp {
 
 using APInt = llvm::APInt;
 using APSInt = llvm::APSInt;
+template <unsigned Bits, bool Signed> class Integral;
 
-/// If an IntegralAP is constructed from Memory, it DOES NOT OWN THAT MEMORY.
-/// It will NOT copy the memory (unless, of course, copy() is called) and it
-/// won't alllocate anything. The allocation should happen via InterpState or
-/// Program.
 template <bool Signed> class IntegralAP final {
-public:
-  union {
-    uint64_t *Memory = nullptr;
-    uint64_t Val;
-  };
-  uint32_t BitWidth = 0;
+private:
   friend IntegralAP<!Signed>;
+  APInt V;
 
   template <typename T, bool InputSigned>
   static T truncateCast(const APInt &V) {
@@ -59,86 +52,52 @@ public:
                                : V.trunc(BitSize).getZExtValue();
   }
 
-  APInt getValue() const {
-    if (singleWord())
-      return APInt(BitWidth, Val, Signed);
-    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
-    return llvm::APInt(BitWidth, NumWords, Memory);
-  }
-
 public:
   using AsUnsigned = IntegralAP<false>;
 
-  void take(uint64_t *NewMemory) {
-    assert(!singleWord());
-    std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
-    Memory = NewMemory;
-  }
+  template <typename T>
+  IntegralAP(T Value, unsigned BitWidth)
+      : V(APInt(BitWidth, static_cast<uint64_t>(Value), Signed)) {}
 
-  void copy(const APInt &V) {
-    assert(BitWidth == V.getBitWidth());
-    assert(numWords() == V.getNumWords());
+  IntegralAP(APInt V) : V(V) {}
+  /// Arbitrary value for uninitialized variables.
+  IntegralAP() : IntegralAP(Signed ? -1 : 7, 3) {}
 
-    if (V.isSingleWord()) {
-      if constexpr (Signed)
-        Val = V.getSExtValue();
-      else
-        Val = V.getZExtValue();
-      return;
-    }
-    assert(Memory);
-    std::memcpy(Memory, V.getRawData(), V.getNumWords() * sizeof(uint64_t));
-  }
-
-  // Constructors.
-  IntegralAP() = default;
-  IntegralAP(unsigned BitWidth) : BitWidth(BitWidth) {}
-  IntegralAP(uint64_t *Memory, unsigned BitWidth)
-      : Memory(Memory), BitWidth(BitWidth) {}
-  IntegralAP(const APInt &V) : BitWidth(V.getBitWidth()) {
-    if (V.isSingleWord()) {
-      Val = Signed ? V.getSExtValue() : V.getZExtValue();
-    } else {
-      Memory = const_cast<uint64_t *>(V.getRawData());
-    }
-  }
-
-  IntegralAP operator-() const { return IntegralAP(-getValue()); }
+  IntegralAP operator-() const { return IntegralAP(-V); }
   IntegralAP operator-(const IntegralAP &Other) const {
-    return IntegralAP(getValue() - Other.getValue());
+    return IntegralAP(V - Other.V);
   }
   bool operator>(const IntegralAP &RHS) const {
     if constexpr (Signed)
-      return getValue().sgt(RHS.getValue());
-    return getValue().ugt(RHS.getValue());
+      return V.ugt(RHS.V);
+    return V.sgt(RHS.V);
   }
-  bool operator>=(unsigned RHS) const {
+  bool operator>=(IntegralAP RHS) const {
     if constexpr (Signed)
-      return getValue().sge(RHS);
-    return getValue().uge(RHS);
+      return V.uge(RHS.V);
+    return V.sge(RHS.V);
   }
   bool operator<(IntegralAP RHS) const {
     if constexpr (Signed)
-      return getValue().slt(RHS.getValue());
-    return getValue().ult(RHS.getValue());
+      return V.slt(RHS.V);
+    return V.slt(RHS.V);
+  }
+  bool operator<=(IntegralAP RHS) const {
+    if constexpr (Signed)
+      return V.ult(RHS.V);
+    return V.ult(RHS.V);
   }
 
   template <typename Ty, typename = std::enable_if_t<std::is_integral_v<Ty>>>
   explicit operator Ty() const {
-    return truncateCast<Ty, Signed>(getValue());
+    return truncateCast<Ty, Signed>(V);
   }
 
   template <typename T> static IntegralAP from(T Value, unsigned NumBits = 0) {
-    if (NumBits == 0)
-      NumBits = sizeof(T) * 8;
     assert(NumBits > 0);
     APInt Copy = APInt(NumBits, static_cast<uint64_t>(Value), Signed);
-    assert(false);
-    return IntegralAP<Signed>(Copy);
-  }
 
-  static IntegralAP from(const APInt &Value) {
-    return IntegralAP<Signed>(Value);
+    return IntegralAP<Signed>(Copy);
   }
 
   template <bool InputSigned>
@@ -147,45 +106,52 @@ public:
       NumBits = V.bitWidth();
 
     if constexpr (InputSigned)
-      return IntegralAP<Signed>(V.getValue().sextOrTrunc(NumBits));
-    return IntegralAP<Signed>(V.getValue().zextOrTrunc(NumBits));
+      return IntegralAP<Signed>(V.V.sextOrTrunc(NumBits));
+    return IntegralAP<Signed>(V.V.zextOrTrunc(NumBits));
   }
 
-  constexpr unsigned bitWidth() const { return BitWidth; }
-  constexpr unsigned numWords() const { return APInt::getNumWords(BitWidth); }
-  constexpr bool singleWord() const { return numWords() == 1; }
+  template <unsigned Bits, bool InputSigned>
+  static IntegralAP from(Integral<Bits, InputSigned> I, unsigned BitWidth) {
+    return IntegralAP<Signed>(I.toAPInt(BitWidth));
+  }
+
+  static IntegralAP zero(int32_t BitWidth) {
+    APInt V = APInt(BitWidth, 0LL, Signed);
+    return IntegralAP(V);
+  }
+
+  constexpr unsigned bitWidth() const { return V.getBitWidth(); }
 
   APSInt toAPSInt(unsigned Bits = 0) const {
     if (Bits == 0)
       Bits = bitWidth();
 
-    APInt V = getValue();
     if constexpr (Signed)
-      return APSInt(getValue().sext(Bits), !Signed);
+      return APSInt(V.sext(Bits), !Signed);
     else
-      return APSInt(getValue().zext(Bits), !Signed);
+      return APSInt(V.zext(Bits), !Signed);
   }
   APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
-  bool isZero() const { return getValue().isZero(); }
+  bool isZero() const { return V.isZero(); }
   bool isPositive() const {
     if constexpr (Signed)
-      return getValue().isNonNegative();
+      return V.isNonNegative();
     return true;
   }
   bool isNegative() const {
     if constexpr (Signed)
-      return !getValue().isNonNegative();
+      return !V.isNonNegative();
     return false;
   }
-  bool isMin() const { return getValue().isMinValue(); }
-  bool isMax() const { return getValue().isMaxValue(); }
+  bool isMin() const { return V.isMinValue(); }
+  bool isMax() const { return V.isMaxValue(); }
   static constexpr bool isSigned() { return Signed; }
-  bool isMinusOne() const { return Signed && getValue().isAllOnes(); }
+  bool isMinusOne() const { return Signed && V == -1; }
 
-  unsigned countLeadingZeros() const { return getValue().countl_zero(); }
+  unsigned countLeadingZeros() const { return V.countl_zero(); }
 
-  void print(llvm::raw_ostream &OS) const { getValue().print(OS, Signed); }
+  void print(llvm::raw_ostream &OS) const { V.print(OS, Signed);}
   std::string toDiagnosticString(const ASTContext &Ctx) const {
     std::string NameStr;
     llvm::raw_string_ostream OS(NameStr);
@@ -195,57 +161,53 @@ public:
 
   IntegralAP truncate(unsigned BitWidth) const {
     if constexpr (Signed)
-      return IntegralAP(
-          getValue().trunc(BitWidth).sextOrTrunc(this->bitWidth()));
+      return IntegralAP(V.trunc(BitWidth).sextOrTrunc(this->bitWidth()));
     else
-      return IntegralAP(
-          getValue().trunc(BitWidth).zextOrTrunc(this->bitWidth()));
+      return IntegralAP(V.trunc(BitWidth).zextOrTrunc(this->bitWidth()));
   }
 
   IntegralAP<false> toUnsigned() const {
-    return IntegralAP<false>(Memory, BitWidth);
+    APInt Copy = V;
+    return IntegralAP<false>(Copy);
   }
 
   void bitcastToMemory(std::byte *Dest) const {
-    llvm::StoreIntToMemory(getValue(), (uint8_t *)Dest, bitWidth() / 8);
+    llvm::StoreIntToMemory(V, (uint8_t *)Dest, bitWidth() / 8);
   }
 
-  static void bitcastFromMemory(const std::byte *Src, unsigned BitWidth,
-                                IntegralAP *Result) {
+  static IntegralAP bitcastFromMemory(const std::byte *Src, unsigned BitWidth) {
     APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
     llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
-    Result->copy(V);
+    return IntegralAP(V);
   }
 
   ComparisonCategoryResult compare(const IntegralAP &RHS) const {
     assert(Signed == RHS.isSigned());
     assert(bitWidth() == RHS.bitWidth());
-    APInt V1 = getValue();
-    APInt V2 = RHS.getValue();
     if constexpr (Signed) {
-      if (V1.slt(V2))
+      if (V.slt(RHS.V))
         return ComparisonCategoryResult::Less;
-      if (V1.sgt(V2))
+      if (V.sgt(RHS.V))
         return ComparisonCategoryResult::Greater;
       return ComparisonCategoryResult::Equal;
     }
 
     assert(!Signed);
-    if (V1.ult(V2))
+    if (V.ult(RHS.V))
       return ComparisonCategoryResult::Less;
-    if (V1.ugt(V2))
+    if (V.ugt(RHS.V))
       return ComparisonCategoryResult::Greater;
     return ComparisonCategoryResult::Equal;
   }
 
   static bool increment(IntegralAP A, IntegralAP *R) {
-    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
-    return add(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
+    IntegralAP<Signed> One(1, A.bitWidth());
+    return add(A, One, A.bitWidth() + 1, R);
   }
 
   static bool decrement(IntegralAP A, IntegralAP *R) {
-    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
-    return sub(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
+    IntegralAP<Signed> One(1, A.bitWidth());
+    return sub(A, One, A.bitWidth() + 1, R);
   }
 
   static bool add(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
@@ -262,97 +224,87 @@ public:
 
   static bool rem(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      R->copy(A.getValue().srem(B.getValue()));
+      *R = IntegralAP(A.V.srem(B.V));
     else
-      R->copy(A.getValue().urem(B.getValue()));
+      *R = IntegralAP(A.V.urem(B.V));
     return false;
   }
 
   static bool div(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      R->copy(A.getValue().sdiv(B.getValue()));
+      *R = IntegralAP(A.V.sdiv(B.V));
     else
-      R->copy(A.getValue().udiv(B.getValue()));
+      *R = IntegralAP(A.V.udiv(B.V));
     return false;
   }
 
   static bool bitAnd(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    R->copy(A.getValue() & B.getValue());
+    *R = IntegralAP(A.V & B.V);
     return false;
   }
 
   static bool bitOr(IntegralAP A, IntegralAP B, unsigned OpBits,
                     IntegralAP *R) {
-    R->copy(A.getValue() | B.getValue());
+    *R = IntegralAP(A.V | B.V);
     return false;
   }
 
   static bool bitXor(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    R->copy(A.getValue() ^ B.getValue());
+    *R = IntegralAP(A.V ^ B.V);
     return false;
   }
 
   static bool neg(const IntegralAP &A, IntegralAP *R) {
-    APInt AI = A.getValue();
+    APInt AI = A.V;
     AI.negate();
-    R->copy(AI);
+    *R = IntegralAP(AI);
     return false;
   }
 
   static bool comp(IntegralAP A, IntegralAP *R) {
-    R->copy(~A.getValue());
+    *R = IntegralAP(~A.V);
     return false;
   }
 
   static void shiftLeft(const IntegralAP A, const IntegralAP B, unsigned OpBits,
                         IntegralAP *R) {
-    *R = IntegralAP(A.getValue().shl(B.getValue().getZExtValue()));
+    *R = IntegralAP(A.V.shl(B.V.getZExtValue()));
   }
 
   static void shiftRight(const IntegralAP A, const IntegralAP B,
                          unsigned OpBits, IntegralAP *R) {
-    unsigned ShiftAmount = B.getValue().getZExtValue();
+    unsigned ShiftAmount = B.V.getZExtValue();
     if constexpr (Signed)
-      R->copy(A.getValue().ashr(ShiftAmount));
+      *R = IntegralAP(A.V.ashr(ShiftAmount));
     else
-      R->copy(A.getValue().lshr(ShiftAmount));
+      *R = IntegralAP(A.V.lshr(ShiftAmount));
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    assert(BitWidth != 0);
-    uint64_t NumWords = llvm::APInt::getNumWords(bitWidth());
-    return sizeof(uint32_t) + (NumWords * sizeof(uint64_t));
+    // 4 bytes for the BitWidth followed by N bytes for the actual APInt.
+    return sizeof(uint32_t) + (V.getBitWidth() / CHAR_BIT);
   }
 
   void serialize(std::byte *Buff) const {
+    assert(V.getBitWidth() < std::numeric_limits<uint8_t>::max());
+    uint32_t BitWidth = V.getBitWidth();
+
     std::memcpy(Buff, &BitWidth, sizeof(uint32_t));
-    if (singleWord())
-      std::memcpy(Buff + sizeof(uint32_t), &Val, sizeof(uint64_t));
-    else {
-      uint64_t NumWords = llvm::APInt::getNumWords(bitWidth());
-      std::memcpy(Buff + sizeof(uint32_t), Memory, NumWords * sizeof(uint64_t));
-    }
+    llvm::StoreIntToMemory(V, (uint8_t *)(Buff + sizeof(uint32_t)),
+                           BitWidth / CHAR_BIT);
   }
 
-  static uint64_t deserializeSize(const std::byte *Buff) {
-    return *reinterpret_cast<const uint32_t *>(Buff);
-  }
+  static IntegralAP<Signed> deserialize(const std::byte *Buff) {
+    uint32_t BitWidth;
+    std::memcpy(&BitWidth, Buff, sizeof(uint32_t));
+    IntegralAP<Signed> Val(APInt(BitWidth, 0ull, !Signed));
 
-  static void deserialize(const std::byte *Buff, IntegralAP<Signed> *Result) {
-    uint64_t BitWidth = Result->BitWidth;
-    uint64_t NumWords = llvm::APInt::getNumWords(BitWidth);
-    assert(BitWidth == Result->BitWidth);
-
-    if (NumWords == 1)
-      std::memcpy(&Result->Val, Buff + sizeof(uint32_t), sizeof(uint64_t));
-    else {
-      assert(Result->Memory);
-      std::memcpy(Result->Memory, Buff + sizeof(uint32_t),
-                  NumWords * sizeof(uint64_t));
-    }
+    llvm::LoadIntFromMemory(Val.V, (const uint8_t *)Buff + sizeof(uint32_t),
+                            BitWidth / CHAR_BIT);
+    return Val;
   }
 
 private:
@@ -360,7 +312,7 @@ private:
   static bool CheckAddSubMulUB(const IntegralAP &A, const IntegralAP &B,
                                unsigned BitWidth, IntegralAP *R) {
     if constexpr (!Signed) {
-      R->copy(Op<APInt>{}(A.getValue(), B.getValue()));
+      R->V = Op<APInt>{}(A.V, B.V);
       return false;
     }
 
@@ -368,7 +320,7 @@ private:
     const APSInt &RHS = B.toAPSInt();
     APSInt Value = Op<APSInt>{}(LHS.extend(BitWidth), RHS.extend(BitWidth));
     APSInt Result = Value.trunc(LHS.getBitWidth());
-    R->copy(Result);
+    R->V = Result;
 
     return Result.extend(BitWidth) != Value;
   }
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 1e2032feabb6..5c8abffb3a99 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1935,10 +1935,8 @@ bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
-
-  S.Stk.push<IntegralAP<false>>(Result);
+  S.Stk.push<IntegralAP<false>>(
+      IntegralAP<false>::from(Ptr.getIntegerRepresentation(), BitWidth));
   return true;
 }
 
@@ -1948,10 +1946,8 @@ bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
-
-  S.Stk.push<IntegralAP<true>>(Result);
+  S.Stk.push<IntegralAP<true>>(
+      IntegralAP<true>::from(Ptr.getIntegerRepresentation(), BitWidth));
   return true;
 }
 
@@ -2057,100 +2053,6 @@ bool arePotentiallyOverlappingStringLiterals(const Pointer &LHS,
   return Shorter == Longer.take_front(Shorter.size());
 }
 
-static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr,
-                                PrimType T) {
-
-  if (T == PT_IntAPS) {
-    auto &Val = Ptr.deref<IntegralAP<true>>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  } else if (T == PT_IntAP) {
-    auto &Val = Ptr.deref<IntegralAP<false>>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  } else if (T == PT_Float) {
-    auto &Val = Ptr.deref<Floating>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  }
-}
-
-template <typename T>
-static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr) {
-  assert(needsAlloc<T>());
-  auto &Val = Ptr.deref<T>();
-  if (!Val.singleWord()) {
-    uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-    Val.take(NewMemory);
-  }
-}
-
-static void finishGlobalRecurse(InterpState &S, const Pointer &Ptr) {
-  if (const Record *R = Ptr.getRecord()) {
-    for (const Record::Field &Fi : R->fields()) {
-      if (Fi.Desc->isPrimitive()) {
-        TYPE_SWITCH_ALLOC(Fi.Desc->getPrimType(), {
-          copyPrimitiveMemory<T>(S, Ptr.atField(Fi.Offset));
-        });
-        copyPrimitiveMemory(S, Ptr.atField(Fi.Offset), Fi.Desc->getPrimType());
-      } else
-        finishGlobalRecurse(S, Ptr.atField(Fi.Offset));
-    }
-    return;
-  }
-
-  if (const Descriptor *D = Ptr.getFieldDesc(); D && D->isArray()) {
-    unsigned NumElems = D->getNumElems();
-    if (NumElems == 0)
-      return;
-
-    if (D->isPrimitiveArray()) {
-      PrimType PT = D->getPrimType();
-      if (!needsAlloc(PT))
-        return;
-      assert(NumElems >= 1);
-      const Pointer EP = Ptr.atIndex(0);
-      bool AllSingleWord = true;
-      TYPE_SWITCH_ALLOC(PT, {
-        if (!EP.deref<T>().singleWord()) {
-          copyPrimitiveMemory<T>(S, EP);
-          AllSingleWord = false;
-        }
-      });
-      if (AllSingleWord)
-        return;
-      for (unsigned I = 1; I != D->getNumElems(); ++I) {
-        const Pointer EP = Ptr.atIndex(I);
-        copyPrimitiveMemory(S, EP, PT);
-      }
-    } else {
-      assert(D->isCompositeArray());
-      for (unsigned I = 0; I != D->getNumElems(); ++I) {
-        const Pointer EP = Ptr.atIndex(I).narrow();
-        finishGlobalRecurse(S, EP);
-      }
-    }
-  }
-}
-
-bool FinishInitGlobal(InterpState &S, CodePtr OpPC) {
-  const Pointer &Ptr = S.Stk.pop<Pointer>();
-
-  finishGlobalRecurse(S, Ptr);
-  if (Ptr.canBeInitialized()) {
-    Ptr.initialize();
-    Ptr.activate();
-  }
-
-  return true;
-}
-
 // https://github.com/llvm/llvm-project/issues/102513
 #if defined(_MSC_VER) && !defined(__clang__) && !defined(NDEBUG)
 #pragma optimize("", off)
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 66d3e6d79e8b..ae3d4a441a79 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -189,7 +189,7 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
 
   // C++11 [expr.shift]p1: Shift width must be less than the bit width of
   // the shifted type.
-  if (Bits > 1 && RHS >= Bits) {
+  if (Bits > 1 && RHS >= RT::from(Bits, RHS.bitWidth())) {
     const Expr *E = S.Current->getExpr(OpPC);
     const APSInt Val = RHS.toAPSInt();
     QualType Ty = E->getType();
@@ -370,9 +370,6 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS,
                      const T &RHS) {
   // Fast path - add the numbers with fixed width.
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(LHS.bitWidth());
-
   if (!OpFW(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -411,7 +408,6 @@ bool Add(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
-
   return AddSubMulHelper<T, T::add, std::plus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -427,7 +423,7 @@ inline bool Addf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(LHS.getSemantics());
+  Floating Result;
   auto Status = Floating::add(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -438,7 +434,6 @@ bool Sub(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
-
   return AddSubMulHelper<T, T::sub, std::minus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -447,7 +442,7 @@ inline bool Subf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(LHS.getSemantics());
+  Floating Result;
   auto Status = Floating::sub(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -458,7 +453,6 @@ bool Mul(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() * 2;
-
   return AddSubMulHelper<T, T::mul, std::multiplies>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -467,10 +461,8 @@ inline bool Mulf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(LHS.getSemantics());
-
+  Floating Result;
   auto Status = Floating::mul(LHS, RHS, getRoundingMode(FPO), &Result);
-
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -492,14 +484,9 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexMul(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Floating RA = S.allocFloat(A.getSemantics());
-    RA.copy(ResR);
-    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
+    Result.atIndex(0).deref<Floating>() = Floating(ResR);
     Result.atIndex(0).initialize();
-
-    Floating RI = S.allocFloat(A.getSemantics());
-    RI.copy(ResI);
-    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
+    Result.atIndex(1).deref<Floating>() = Floating(ResI);
     Result.atIndex(1).initialize();
     Result.initialize();
   } else {
@@ -552,20 +539,10 @@ inline bool Divc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexDiv(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    // Result.atIndex(0).deref<Floating>() = Floating(ResR);
-    // Result.atIndex(0).initialize();
-    // Result.atIndex(1).deref<Floating>() = Floating(ResI);
-    // Result.atIndex(1).initialize();
-
-    Floating RA = S.allocFloat(A.getSemantics());
-    RA.copy(ResR);
-    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
+    Result.atIndex(0).deref<Floating>() = Floating(ResR);
     Result.atIndex(0).initialize();
-
-    Floating RI = S.allocFloat(A.getSemantics());
-    RI.copy(ResI);
-    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
-
+    Result.atIndex(1).deref<Floating>() = Floating(ResI);
+    Result.atIndex(1).initialize();
     Result.initialize();
   } else {
     // Integer element type.
@@ -631,12 +608,9 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitAnd(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+
   unsigned Bits = RHS.bitWidth();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Bits);
-
   if (!T::bitAnd(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -651,12 +625,9 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitOr(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+
   unsigned Bits = RHS.bitWidth();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Bits);
-
   if (!T::bitOr(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -673,11 +644,7 @@ bool BitXor(InterpState &S, CodePtr OpPC) {
   const T &LHS = S.Stk.pop<T>();
 
   unsigned Bits = RHS.bitWidth();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Bits);
-
   if (!T::bitXor(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -692,15 +659,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Rem(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
+  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(LHS.bitWidth());
-
   if (!T::rem(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -715,15 +679,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Div(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
+  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(LHS.bitWidth());
-
   if (!T::div(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -746,10 +707,8 @@ inline bool Divf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
     return false;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-
-  Floating Result = S.allocFloat(LHS.getSemantics());
+  Floating Result;
   auto Status = Floating::div(LHS, RHS, getRoundingMode(FPO), &Result);
-
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -771,44 +730,31 @@ inline bool Inv(InterpState &S, CodePtr OpPC) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Neg(InterpState &S, CodePtr OpPC) {
   const T &Value = S.Stk.pop<T>();
+  T Result;
 
-  if constexpr (std::is_same_v<T, Floating>) {
-    T Result = S.allocFloat(Value.getSemantics());
-
-    if (!T::neg(Value, &Result)) {
-      S.Stk.push<T>(Result);
-      return true;
-    }
-    return false;
-  } else {
-    T Result;
-    if constexpr (needsAlloc<T>())
-      Result = S.allocAP<T>(Value.bitWidth());
-
-    if (!T::neg(Value, &Result)) {
-      S.Stk.push<T>(Result);
-      return true;
-    }
-
-    assert(isIntegralType(Name) &&
-           "don't expect other types to fail at constexpr negation");
+  if (!T::neg(Value, &Result)) {
     S.Stk.push<T>(Result);
-
-    APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
-    if (S.checkingForUndefinedBehavior()) {
-      const Expr *E = S.Current->getExpr(OpPC);
-      QualType Type = E->getType();
-      SmallString<32> Trunc;
-      NegatedValue.trunc(Result.bitWidth())
-          .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
-                    /*UpperCase=*/true, /*InsertSeparators=*/true);
-      S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
-          << Trunc << Type << E->getSourceRange();
-      return true;
-    }
-
-    return handleOverflow(S, OpPC, NegatedValue);
+    return true;
   }
+
+  assert(isIntegralType(Name) &&
+         "don't expect other types to fail at constexpr negation");
+  S.Stk.push<T>(Result);
+
+  APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
+  if (S.checkingForUndefinedBehavior()) {
+    const Expr *E = S.Current->getExpr(OpPC);
+    QualType Type = E->getType();
+    SmallString<32> Trunc;
+    NegatedValue.trunc(Result.bitWidth())
+        .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
+                  /*UpperCase=*/true, /*InsertSeparators=*/true);
+    S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
+        << Trunc << Type << E->getSourceRange();
+    return true;
+  }
+
+  return handleOverflow(S, OpPC, NegatedValue);
 }
 
 enum class PushVal : bool {
@@ -837,8 +783,6 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 
   const T &Value = Ptr.deref<T>();
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Value.bitWidth());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<T>(Value);
@@ -946,6 +890,7 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
   if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
     return false;
+
   return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow);
 }
 
@@ -953,7 +898,7 @@ template <IncDecOp Op, PushVal DoPush>
 bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                        uint32_t FPOI) {
   Floating Value = Ptr.deref<Floating>();
-  Floating Result = S.allocFloat(Value.getSemantics());
+  Floating Result;
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<Floating>(Value);
@@ -1007,15 +952,12 @@ inline bool DecfPop(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Comp(InterpState &S, CodePtr OpPC) {
   const T &Val = S.Stk.pop<T>();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Val.bitWidth());
-
   if (!T::comp(Val, &Result)) {
     S.Stk.push<T>(Result);
     return true;
   }
+
   return false;
 }
 
@@ -1383,23 +1325,10 @@ bool Flip(InterpState &S, CodePtr OpPC) {
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Const(InterpState &S, CodePtr OpPC, const T &Arg) {
-  if constexpr (needsAlloc<T>()) {
-    T Result = S.allocAP<T>(Arg.bitWidth());
-    Result.copy(Arg.toAPSInt());
-    S.Stk.push<T>(Result);
-    return true;
-  }
   S.Stk.push<T>(Arg);
   return true;
 }
 
-inline bool ConstFloat(InterpState &S, CodePtr OpPC, const Floating &F) {
-  Floating Result = S.allocFloat(F.getSemantics());
-  Result.copy(F.getAPFloat());
-  S.Stk.push<Floating>(Result);
-  return true;
-}
-
 //===----------------------------------------------------------------------===//
 // Get/Set Local/Param/Global/This
 //===----------------------------------------------------------------------===//
@@ -1554,24 +1483,7 @@ bool SetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool InitGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
   const Pointer &P = S.P.getGlobal(I);
-
   P.deref<T>() = S.Stk.pop<T>();
-
-  if constexpr (std::is_same_v<T, Floating>) {
-    auto &Val = P.deref<Floating>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-
-  } else if constexpr (needsAlloc<T>()) {
-    auto &Val = P.deref<T>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  }
-
   P.initialize();
   return true;
 }
@@ -1673,22 +1585,7 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) {
   assert(F->isBitField());
   const T &Value = S.Stk.pop<T>();
   const Pointer &Field = S.Stk.peek<Pointer>().atField(F->Offset);
-
-  if constexpr (needsAlloc<T>()) {
-    T Result = S.allocAP<T>(Value.bitWidth());
-    if (T::isSigned())
-      Result.copy(Value.toAPSInt()
-                      .trunc(F->Decl->getBitWidthValue())
-                      .sextOrTrunc(Value.bitWidth()));
-    else
-      Result.copy(Value.toAPSInt()
-                      .trunc(F->Decl->getBitWidthValue())
-                      .zextOrTrunc(Value.bitWidth()));
-
-    Field.deref<T>() = Result;
-  } else {
-    Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
-  }
+  Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
   Field.activate();
   Field.initialize();
   return true;
@@ -1868,8 +1765,6 @@ inline bool FinishInit(InterpState &S, CodePtr OpPC) {
   return true;
 }
 
-bool FinishInitGlobal(InterpState &S, CodePtr OpPC);
-
 inline bool Dump(InterpState &S, CodePtr OpPC) {
   S.Stk.dump();
   return true;
@@ -2376,8 +2271,7 @@ template <PrimType TIn, PrimType TOut> bool Cast(InterpState &S, CodePtr OpPC) {
 inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem,
                    llvm::RoundingMode RM) {
   Floating F = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(*Sem);
-  F.toSemantics(Sem, RM, &Result);
+  Floating Result = F.toSemantics(Sem, RM);
   S.Stk.push<Floating>(Result);
   return true;
 }
@@ -2401,25 +2295,15 @@ inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) {
 /// to know what bitwidth the result should be.
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  // Copy data.
-  {
-    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
-    Result.copy(Source);
-  }
-  S.Stk.push<IntegralAP<false>>(Result);
+  S.Stk.push<IntegralAP<false>>(
+      IntegralAP<false>::from(S.Stk.pop<T>(), BitWidth));
   return true;
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  // Copy data.
-  {
-    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
-    Result.copy(Source);
-  }
-  S.Stk.push<IntegralAP<true>>(Result);
+  S.Stk.push<IntegralAP<true>>(
+      IntegralAP<true>::from(S.Stk.pop<T>(), BitWidth));
   return true;
 }
 
@@ -2428,11 +2312,11 @@ bool CastIntegralFloating(InterpState &S, CodePtr OpPC,
                           const llvm::fltSemantics *Sem, uint32_t FPOI) {
   const T &From = S.Stk.pop<T>();
   APSInt FromAP = From.toAPSInt();
+  Floating Result;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(*Sem);
   auto Status =
-      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), &Result);
+      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), Result);
   S.Stk.push<Floating>(Result);
 
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -2481,12 +2365,7 @@ static inline bool CastFloatingIntegralAP(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-
-  auto ResultAP = S.allocAP<IntegralAP<false>>(BitWidth);
-  ResultAP.copy(Result);
-
-  S.Stk.push<IntegralAP<false>>(ResultAP);
-
+  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2502,12 +2381,7 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-
-  auto ResultAP = S.allocAP<IntegralAP<true>>(BitWidth);
-  ResultAP.copy(Result);
-
-  S.Stk.push<IntegralAP<true>>(ResultAP);
-
+  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2567,9 +2441,8 @@ static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC,
 static inline bool CastFixedPointFloating(InterpState &S, CodePtr OpPC,
                                           const llvm::fltSemantics *Sem) {
   const auto &Fixed = S.Stk.pop<FixedPoint>();
-  Floating Result = S.allocFloat(*Sem);
-  Result.copy(Fixed.toFloat(Sem));
-  S.Stk.push<Floating>(Result);
+
+  S.Stk.push<Floating>(Fixed.toFloat(Sem));
   return true;
 }
 
@@ -2633,18 +2506,12 @@ bool Zero(InterpState &S, CodePtr OpPC) {
 }
 
 static inline bool ZeroIntAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  if (!Result.singleWord())
-    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
-  S.Stk.push<IntegralAP<false>>(Result);
+  S.Stk.push<IntegralAP<false>>(IntegralAP<false>::zero(BitWidth));
   return true;
 }
 
 static inline bool ZeroIntAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  if (!Result.singleWord())
-    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
-  S.Stk.push<IntegralAP<true>>(Result);
+  S.Stk.push<IntegralAP<true>>(IntegralAP<true>::zero(BitWidth));
   return true;
 }
 
@@ -2711,9 +2578,7 @@ inline bool RVOPtr(InterpState &S, CodePtr OpPC) {
 //===----------------------------------------------------------------------===//
 
 template <class LT, class RT, ShiftDir Dir>
-inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
-                    LT *Result) {
-
+inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
   const unsigned Bits = LHS.bitWidth();
 
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
@@ -2731,7 +2596,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
     RHS = -RHS;
     return DoShift<LT, RT,
                    Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS, Result);
+        S, OpPC, LHS, RHS);
   }
 
   if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
@@ -2779,7 +2644,6 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
       // Do the shift on potentially signed LT, then convert to unsigned type.
       LT A;
       LT::shiftRight(LHS, LT::from(RHS, Bits), Bits, &A);
-      // LT::shiftRight(LHS, LT(RHSTemp), Bits, &A);
       R = LT::AsUnsigned::from(A);
     }
   }
@@ -2788,48 +2652,6 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
   return true;
 }
 
-/// A version of DoShift that works on IntegralAP.
-template <class LT, class RT, ShiftDir Dir>
-inline bool DoShiftAP(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
-                      LT *Result) {
-  const unsigned Bits = LHS.bitWidth();
-  const APSInt &LHSAP = LHS.toAPSInt();
-  APSInt RHSAP = RHS.toAPSInt();
-
-  // OpenCL 6.3j: shift values are effectively % word size of LHS.
-  if (S.getLangOpts().OpenCL)
-    RHSAP &= APSInt(llvm::APInt(RHSAP.getBitWidth(),
-                                static_cast<uint64_t>(LHSAP.getBitWidth() - 1)),
-                    RHSAP.isUnsigned());
-
-  if (RHS.isNegative()) {
-    // During constant-folding, a negative shift is an opposite shift. Such a
-    // shift is not a constant expression.
-    const SourceInfo &Loc = S.Current->getSource(OpPC);
-    S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
-    if (!S.noteUndefinedBehavior())
-      return false;
-    RHS = -RHS;
-    return DoShiftAP<LT, RT,
-                     Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS, Result);
-  }
-
-  if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
-    return false;
-
-  if constexpr (Dir == ShiftDir::Left) {
-    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
-    Result->copy(LHSAP << SA);
-  } else {
-    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
-    Result->copy(LHSAP >> SA);
-  }
-
-  S.Stk.push<LT>(*Result);
-  return true;
-}
-
 template <PrimType NameL, PrimType NameR>
 inline bool Shr(InterpState &S, CodePtr OpPC) {
   using LT = typename PrimConv<NameL>::T;
@@ -2837,13 +2659,7 @@ inline bool Shr(InterpState &S, CodePtr OpPC) {
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
 
-  if constexpr (needsAlloc<LT>()) {
-    LT Result = S.allocAP<LT>(LHS.bitWidth());
-    return DoShiftAP<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
-  } else {
-    LT Result;
-    return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
-  }
+  return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS);
 }
 
 template <PrimType NameL, PrimType NameR>
@@ -2852,13 +2668,8 @@ inline bool Shl(InterpState &S, CodePtr OpPC) {
   using RT = typename PrimConv<NameR>::T;
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
-  if constexpr (needsAlloc<LT>()) {
-    LT Result = S.allocAP<LT>(LHS.bitWidth());
-    return DoShiftAP<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
-  } else {
-    LT Result;
-    return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
-  }
+
+  return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS);
 }
 
 static inline bool ShiftFixedPoint(InterpState &S, CodePtr OpPC, bool Left) {
@@ -3441,15 +3252,7 @@ inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte,
 
     if constexpr (std::is_same_v<T, Floating>) {
       assert(Sem);
-      Floating Result = S.allocFloat(*Sem);
-      Floating::bitcastFromMemory(Buff.data(), *Sem, &Result);
-      S.Stk.push<Floating>(Result);
-
-      // S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
-    } else if constexpr (needsAlloc<T>()) {
-      T Result = S.allocAP<T>(ResultBitWidth);
-      T::bitcastFromMemory(Buff.data(), ResultBitWidth, &Result);
-      S.Stk.push<T>(Result);
+      S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
     } else {
       assert(!Sem);
       S.Stk.push<T>(T::bitcastFromMemory(Buff.data(), ResultBitWidth));
@@ -3507,11 +3310,7 @@ template <typename T> inline T ReadArg(InterpState &S, CodePtr &OpPC) {
 }
 
 template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
-  auto &Semantics =
-      llvm::APFloatBase::EnumToSemantics(Floating::deserializeSemantics(*OpPC));
-
-  auto F = S.allocFloat(Semantics);
-  Floating::deserialize(*OpPC, &F);
+  Floating F = Floating::deserialize(*OpPC);
   OpPC += align(F.bytesToSerialize());
   return F;
 }
@@ -3519,25 +3318,17 @@ template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
 template <>
 inline IntegralAP<false> ReadArg<IntegralAP<false>>(InterpState &S,
                                                     CodePtr &OpPC) {
-  uint32_t BitWidth = IntegralAP<false>::deserializeSize(*OpPC);
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  assert(Result.bitWidth() == BitWidth);
-
-  IntegralAP<false>::deserialize(*OpPC, &Result);
-  OpPC += align(Result.bytesToSerialize());
-  return Result;
+  IntegralAP<false> I = IntegralAP<false>::deserialize(*OpPC);
+  OpPC += align(I.bytesToSerialize());
+  return I;
 }
 
 template <>
 inline IntegralAP<true> ReadArg<IntegralAP<true>>(InterpState &S,
                                                   CodePtr &OpPC) {
-  uint32_t BitWidth = IntegralAP<true>::deserializeSize(*OpPC);
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  assert(Result.bitWidth() == BitWidth);
-
-  IntegralAP<true>::deserialize(*OpPC, &Result);
-  OpPC += align(Result.bytesToSerialize());
-  return Result;
+  IntegralAP<true> I = IntegralAP<true>::deserialize(*OpPC);
+  OpPC += align(I.bytesToSerialize());
+  return I;
 }
 
 template <>
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5304bd77f2c0..d01e3d042a8b 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -57,21 +57,6 @@ static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) {
   assert(T);
 
   unsigned BitWidth = S.getASTContext().getTypeSize(QT);
-
-  if (T == PT_IntAPS) {
-    auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-    Result.copy(Val);
-    S.Stk.push<IntegralAP<true>>(Result);
-    return;
-  }
-
-  if (T == PT_IntAP) {
-    auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-    Result.copy(Val);
-    S.Stk.push<IntegralAP<false>>(Result);
-    return;
-  }
-
   if (QT->isSignedIntegerOrEnumerationType()) {
     int64_t V = Val.getSExtValue();
     INT_TYPE_SWITCH(*T, { S.Stk.push<T>(T::from(V, BitWidth)); });
@@ -342,13 +327,13 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result = S.allocFloat(TargetSemantics);
+  Floating Result;
   if (S.getASTContext().getTargetInfo().isNan2008()) {
     if (Signaling)
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
   } else {
     // Prior to IEEE 754-2008, architectures were allowed to choose whether
@@ -357,10 +342,10 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
     // 2008 revisions, MIPS interpreted sNaN-2008 as qNan and qNaN-2008 as
     // sNaN. This is now known as "legacy NaN" encoding.
     if (Signaling)
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
   }
 
@@ -375,9 +360,7 @@ static bool interp__builtin_inf(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result = S.allocFloat(TargetSemantics);
-  Result.copy(APFloat::getInf(TargetSemantics));
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(Floating::getInf(TargetSemantics));
   return true;
 }
 
@@ -385,12 +368,10 @@ static bool interp__builtin_copysign(InterpState &S, CodePtr OpPC,
                                      const InterpFrame *Frame) {
   const Floating &Arg2 = S.Stk.pop<Floating>();
   const Floating &Arg1 = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(Arg1.getSemantics());
 
   APFloat Copy = Arg1.getAPFloat();
   Copy.copySign(Arg2.getAPFloat());
-  Result.copy(Copy);
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(Floating(Copy));
 
   return true;
 }
@@ -399,13 +380,11 @@ static bool interp__builtin_fmin(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    Result.copy(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    S.Stk.push<Floating>(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    Result.copy(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
-  S.Stk.push<Floating>(Result);
+    S.Stk.push<Floating>(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
   return true;
 }
 
@@ -413,13 +392,11 @@ static bool interp__builtin_fmax(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    Result.copy(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    S.Stk.push<Floating>(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    Result.copy(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
-  S.Stk.push<Floating>(Result);
+    S.Stk.push<Floating>(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
   return true;
 }
 
@@ -594,16 +571,8 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC,
 static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame) {
   const Floating &Val = S.Stk.pop<Floating>();
-  APFloat F = Val.getAPFloat();
-  if (!F.isNegative()) {
-    S.Stk.push<Floating>(Val);
-    return true;
-  }
 
-  Floating Result = S.allocFloat(Val.getSemantics());
-  F.changeSign();
-  Result.copy(F);
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(Floating::abs(Val));
   return true;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index 2569cac018b3..239b3104e89f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -402,9 +402,7 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC,
           if (llvm::sys::IsBigEndianHost)
             swapBytes(M.get(), NumBits.roundToBytes());
 
-          Floating R = S.allocFloat(Semantics);
-          Floating::bitcastFromMemory(M.get(), Semantics, &R);
-          P.deref<Floating>() = R;
+          P.deref<Floating>() = Floating::bitcastFromMemory(M.get(), Semantics);
           P.initialize();
           return true;
         }
diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h
index 08765561985e..e8dc6f0483d6 100644
--- a/clang/lib/AST/ByteCode/InterpState.h
+++ b/clang/lib/AST/ByteCode/InterpState.h
@@ -15,7 +15,6 @@
 
 #include "Context.h"
 #include "DynamicAllocator.h"
-#include "Floating.h"
 #include "Function.h"
 #include "InterpFrame.h"
 #include "InterpStack.h"
@@ -127,33 +126,6 @@ public:
 
   StdAllocatorCaller getStdAllocatorCaller(StringRef Name) const;
 
-  void *allocate(size_t Size, unsigned Align = 8) const {
-    return Allocator.Allocate(Size, Align);
-  }
-  template <typename T> T *allocate(size_t Num = 1) const {
-    return static_cast<T *>(allocate(Num * sizeof(T), alignof(T)));
-  }
-
-  template <typename T> T allocAP(unsigned BitWidth) {
-    unsigned NumWords = APInt::getNumWords(BitWidth);
-    if (NumWords == 1)
-      return T(BitWidth);
-    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
-    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
-    return T(Mem, BitWidth);
-  }
-
-  Floating allocFloat(const llvm::fltSemantics &Sem) {
-    if (Floating::singleWord(Sem))
-      return Floating(llvm::APFloatBase::SemanticsToEnum(Sem));
-
-    unsigned NumWords =
-        APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem));
-    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
-    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
-    return Floating(Mem, llvm::APFloatBase::SemanticsToEnum(Sem));
-  }
-
 private:
   friend class EvaluationResult;
   friend class InterpStateCCOverride;
@@ -189,8 +161,6 @@ public:
   llvm::SmallVector<
       std::pair<const Expr *, const LifetimeExtendedTemporaryDecl *>>
       SeenGlobalTemporaries;
-
-  mutable llvm::BumpPtrAllocator Allocator;
 };
 
 class InterpStateCCOverride final {
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 57e01f7bd9da..c76ac5f8ae86 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -48,7 +48,6 @@ def ArgUint64 : ArgType { let Name = "uint64_t"; }
 def ArgIntAP : ArgType { let Name = "IntegralAP<false>"; let AsRef = true; }
 def ArgIntAPS : ArgType { let Name = "IntegralAP<true>"; let AsRef = true; }
 def ArgFloat : ArgType { let Name = "Floating"; let AsRef = true; }
-
 def ArgBool : ArgType { let Name = "bool"; }
 def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
 
@@ -89,9 +88,6 @@ def IntegerAndFixedTypeClass : TypeClass {
                Uint32, Sint64, Uint64, IntAP, IntAPS, FixedPoint];
 }
 
-def IntegralTypeClass : TypeClass {
-  let Types = !listconcat(IntegerTypeClass.Types, [Bool]);
-}
 def FixedSizeIntegralTypeClass : TypeClass {
   let Types = [Sint8, Uint8, Sint16, Uint16, Sint32,
                Uint32, Sint64, Uint64, Bool];
@@ -269,13 +265,12 @@ def ConstSint32 : ConstOpcode<Sint32, ArgSint32>;
 def ConstUint32 : ConstOpcode<Uint32, ArgUint32>;
 def ConstSint64 : ConstOpcode<Sint64, ArgSint64>;
 def ConstUint64 : ConstOpcode<Uint64, ArgUint64>;
-def ConstIntAP : ConstOpcode<IntAP, ArgIntAP>;
-def ConstIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
+def ConstFloat : ConstOpcode<Float, ArgFloat>;
+def constIntAP : ConstOpcode<IntAP, ArgIntAP>;
+def constIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
 def ConstBool : ConstOpcode<Bool, ArgBool>;
 def ConstFixedPoint : ConstOpcode<FixedPoint, ArgFixedPoint>;
 
-def ConstFloat : Opcode { let Args = [ArgFloat]; }
-
 // [] -> [Integer]
 def Zero : Opcode {
   let Types = [FixedSizeIntegralTypeClass];
@@ -333,7 +328,6 @@ def GetMemberPtrBasePop : Opcode {
 
 def FinishInitPop : Opcode;
 def FinishInit    : Opcode;
-def FinishInitGlobal : Opcode;
 
 def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool, ArgTypePtr]; }
 
@@ -395,7 +389,7 @@ class AccessOpcode : Opcode {
 }
 
 class BitFieldOpcode : Opcode {
-  let Types = [IntegralTypeClass];
+  let Types = [AluTypeClass];
   let Args = [ArgRecordField];
   let HasGroup = 1;
 }
diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
index a156cccbb3c1..6152fbfbe3a7 100644
--- a/clang/lib/AST/ByteCode/PrimType.h
+++ b/clang/lib/AST/ByteCode/PrimType.h
@@ -76,13 +76,6 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
 }
 
 constexpr bool isIntegralType(PrimType T) { return T <= PT_FixedPoint; }
-template <typename T> constexpr bool needsAlloc() {
-  return std::is_same_v<T, IntegralAP<false>> ||
-         std::is_same_v<T, IntegralAP<true>> || std::is_same_v<T, Floating>;
-}
-constexpr bool needsAlloc(PrimType T) {
-  return T == PT_IntAP || T == PT_IntAPS || T == PT_Float;
-}
 
 /// Mapping from primitive types to their representation.
 template <PrimType T> struct PrimConv;
@@ -216,16 +209,6 @@ static inline bool aligned(const void *P) {
     }                                                                          \
   } while (0)
 
-#define TYPE_SWITCH_ALLOC(Expr, B)                                             \
-  do {                                                                         \
-    switch (Expr) {                                                            \
-      TYPE_SWITCH_CASE(PT_Float, B)                                            \
-      TYPE_SWITCH_CASE(PT_IntAP, B)                                            \
-      TYPE_SWITCH_CASE(PT_IntAPS, B)                                           \
-    default:;                                                                  \
-    }                                                                          \
-  } while (0)
-
 #define COMPOSITE_TYPE_SWITCH(Expr, B, D)                                      \
   do {                                                                         \
     switch (Expr) {                                                            \
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
index 5d9c42244749..23ba1bbd193b 100644
--- a/clang/lib/AST/ByteCode/Program.h
+++ b/clang/lib/AST/ByteCode/Program.h
@@ -132,14 +132,6 @@ public:
                                bool IsMutable = false, bool IsVolatile = false,
                                const Expr *Init = nullptr);
 
-  void *Allocate(size_t Size, unsigned Align = 8) const {
-    return Allocator.Allocate(Size, Align);
-  }
-  template <typename T> T *Allocate(size_t Num = 1) const {
-    return static_cast<T *>(Allocate(Num * sizeof(T), alignof(T)));
-  }
-  void Deallocate(void *Ptr) const {}
-
   /// Context to manage declaration lifetimes.
   class DeclScope {
   public:
@@ -212,7 +204,7 @@ private:
   };
 
   /// Allocator for globals.
-  mutable PoolAllocTy Allocator;
+  PoolAllocTy Allocator;
 
   /// Global objects.
   std::vector<Global *> Globals;
@@ -246,18 +238,4 @@ public:
 } // namespace interp
 } // namespace clang
 
-inline void *operator new(size_t Bytes, const clang::interp::Program &C,
-                          size_t Alignment = 8) {
-  return C.Allocate(Bytes, Alignment);
-}
-
-inline void operator delete(void *Ptr, const clang::interp::Program &C,
-                            size_t) {
-  C.Deallocate(Ptr);
-}
-inline void *operator new[](size_t Bytes, const clang::interp::Program &C,
-                            size_t Alignment = 8) {
-  return C.Allocate(Bytes, Alignment);
-}
-
 #endif
diff --git a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
index 1013a771d13b..710612bef8fd 100644
--- a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
+++ b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
@@ -21,9 +21,6 @@ template <class To, class From>
 constexpr To bit_cast(const From &from) {
   static_assert(sizeof(To) == sizeof(From));
   return __builtin_bit_cast(To, from);
-#if __x86_64
-  // both-note@-2 {{indeterminate value can only initialize an object of type}}
-#endif
 }
 
 template <class Intermediate, class Init>
@@ -41,8 +38,11 @@ constexpr Init round_trip(const Init &init) {
 
 namespace test_long_double {
 #if __x86_64
-constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // both-error{{must be initialized by a constant expression}}\
-                                                                                 // both-note{{in call}}
+/// FIXME: We could enable this, but since it aborts, it causes the usual mempory leak.
+#if 0
+constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // expected-error{{must be initialized by a constant expression}}\
+                                                                                 // expected-note{{in call}}
+#endif
 constexpr long double ld = 3.1425926539;
 
 struct bytes {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 174c1ffa79a4..21dca15a4577 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -208,7 +208,7 @@ namespace nan {
 
   constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}}
   constexpr float NaN4 = __builtin_nanf("");
-  constexpr long double NaN5 = __builtin_nanf128("");
+  //constexpr long double NaN5 = __builtin_nanf128("");
 
   /// FIXME: This should be accepted by the current interpreter as well.
   constexpr char f[] = {'0', 'x', 'A', 'E', '\0'};
@@ -655,6 +655,8 @@ void test_noexcept(int *i) {
 } // end namespace test_launder
 
 
+/// FIXME: The commented out tests here use a IntAP value and fail.
+/// This currently means we will leak the IntAP value since nothing cleans it up.
 namespace clz {
   char clz1[__builtin_clz(1) == BITSIZE(int) - 1 ? 1 : -1];
   char clz2[__builtin_clz(7) == BITSIZE(int) - 3 ? 1 : -1];
@@ -707,7 +709,7 @@ namespace clz {
   char clz48[__builtin_clzg(1ULL << (BITSIZE(long long) - 1)) == 0 ? 1 : -1];
   char clz49[__builtin_clzg(1ULL << (BITSIZE(long long) - 1), 42) == 0 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  int clz50 = __builtin_clzg((unsigned __int128)0);
+  // int clz50 = __builtin_clzg((unsigned __int128)0);
   char clz51[__builtin_clzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char clz52[__builtin_clzg((unsigned __int128)0x1) == BITSIZE(__int128) - 1 ? 1 : -1];
   char clz53[__builtin_clzg((unsigned __int128)0x1, 42) == BITSIZE(__int128) - 1 ? 1 : -1];
@@ -715,7 +717,7 @@ namespace clz {
   char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
+  // int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
   char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
   char clz61[__builtin_clzg((unsigned _BitInt(128))0x1, 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
@@ -773,7 +775,7 @@ namespace ctz {
   char ctz46[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1)) == BITSIZE(long long) - 1 ? 1 : -1];
   char ctz47[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1), 42) == BITSIZE(long long) - 1 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  int ctz48 = __builtin_ctzg((unsigned __int128)0);
+  // int ctz48 = __builtin_ctzg((unsigned __int128)0);
   char ctz49[__builtin_ctzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char ctz50[__builtin_ctzg((unsigned __int128)0x1) == 0 ? 1 : -1];
   char ctz51[__builtin_ctzg((unsigned __int128)0x1, 42) == 0 ? 1 : -1];
@@ -783,7 +785,7 @@ namespace ctz {
   char ctz55[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1), 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
+  // int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
   char ctz57[__builtin_ctzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char ctz58[__builtin_ctzg((unsigned _BitInt(128))0x1) == 0 ? 1 : -1];
   char ctz59[__builtin_ctzg((unsigned _BitInt(128))0x1, 42) == 0 ? 1 : -1];

From 6729da647afa2b0ee040ccd4f06153e45d6ca738 Mon Sep 17 00:00:00 2001
From: Kunwar Grover <groverkss@gmail.com>
Date: Wed, 18 Jun 2025 14:19:17 +0100
Subject: [PATCH 0807/1322] [mlir][amdgpu][nfc] Add PatternBenefit to populate
 methods (#144663)

---
 mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h   | 10 +++++++---
 mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp  |  5 +++--
 .../AMDGPU/Transforms/ResolveStridedMetadata.cpp       |  4 ++--
 .../Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp   |  4 ++--
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index 94dd9e3a2933..a52ee2ee89ca 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -14,6 +14,7 @@
 #define MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_H_
 
 #include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
@@ -28,11 +29,14 @@ namespace amdgpu {
 
 void populateAmdgpuEmulateAtomicsPatterns(ConversionTarget &target,
                                           RewritePatternSet &patterns,
-                                          Chipset chipset);
+                                          Chipset chipset,
+                                          PatternBenefit benefit = 1);
 
-void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns);
+void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
+                                                  PatternBenefit benefit = 1);
 
-void populateAmdgpuTransferReadToLoadPatterns(RewritePatternSet &patterns);
+void populateAmdgpuTransferReadToLoadPatterns(RewritePatternSet &patterns,
+                                              PatternBenefit benefit = 1);
 
 } // namespace amdgpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
index 7dd4be66d2bd..fd2ba0683786 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -164,7 +164,8 @@ LogicalResult RawBufferAtomicByCasPattern<AtomicOp, ArithOp>::matchAndRewrite(
 }
 
 void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
-    ConversionTarget &target, RewritePatternSet &patterns, Chipset chipset) {
+    ConversionTarget &target, RewritePatternSet &patterns, Chipset chipset,
+    PatternBenefit benefit) {
   // gfx10 has no atomic adds.
   if (chipset.majorVersion == 10 || chipset < Chipset(9, 0, 8)) {
     target.addIllegalOp<RawBufferAtomicFaddOp>();
@@ -204,7 +205,7 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
       RawBufferAtomicByCasPattern<RawBufferAtomicFmaxOp, arith::MaximumFOp>,
       RawBufferAtomicByCasPattern<RawBufferAtomicSmaxOp, arith::MaxSIOp>,
       RawBufferAtomicByCasPattern<RawBufferAtomicUminOp, arith::MinUIOp>>(
-      patterns.getContext());
+      patterns.getContext(), benefit);
 }
 
 void AmdgpuEmulateAtomicsPass::runOnOperation() {
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/ResolveStridedMetadata.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/ResolveStridedMetadata.cpp
index 4b3d94b4ce2a..195f59d62555 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/ResolveStridedMetadata.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/ResolveStridedMetadata.cpp
@@ -66,9 +66,9 @@ struct ExtractStridedMetadataOnFatRawBufferCastFolder final
 } // namespace
 
 void mlir::amdgpu::populateAmdgpuResolveStridedMetadataPatterns(
-    RewritePatternSet &patterns) {
+    RewritePatternSet &patterns, PatternBenefit benefit) {
   patterns.add<ExtractStridedMetadataOnFatRawBufferCastFolder>(
-      patterns.getContext());
+      patterns.getContext(), benefit);
 }
 
 void AmdgpuResolveStridedMetadataPass::runOnOperation() {
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp
index 96925dbf9f28..f5b12a9524cc 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp
@@ -222,8 +222,8 @@ struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
 } // namespace
 
 void mlir::amdgpu::populateAmdgpuTransferReadToLoadPatterns(
-    RewritePatternSet &patterns) {
-  patterns.add<TransferReadLowering>(patterns.getContext());
+    RewritePatternSet &patterns, PatternBenefit benefit) {
+  patterns.add<TransferReadLowering>(patterns.getContext(), benefit);
 }
 
 struct AmdgpuTransferReadToLoadPass final

From c4d99704e22097703c57ee67baea96fdabfd68ab Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Wed, 18 Jun 2025 18:53:45 +0530
Subject: [PATCH 0808/1322] =?UTF-8?q?Revert=20"Reland=20[Driver]=20Add=20s?=
 =?UTF-8?q?upport=20for=20GCC=20installation=20detection=20in=E2=80=A6=20(?=
 =?UTF-8?q?#144684)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

… Baremetal toolchain (#144640)"

This reverts commit 45ea46c44636094e9fcdbbeabfd11f9d0fad5e38.
---
 clang/docs/Toolchain.rst                      |   5 -
 .../clang/Basic/DiagnosticDriverKinds.td      |   3 -
 clang/lib/Driver/ToolChains/BareMetal.cpp     | 245 +++++-------------
 clang/lib/Driver/ToolChains/BareMetal.h       |  19 +-
 .../aarch64-none-elf/include/c++/8.2.1/.keep  |   0
 .../aarch64-none-elf/lib/.keep                |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../bin/aarch64-none-elf-ld                   |   1 -
 .../lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o |   0
 .../lib/gcc/aarch64-none-elf/8.2.1/crtend.o   |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../aarch64-none-elf/lib/crtbegin.o           |   0
 .../aarch64-none-elf/lib/crtend.o             |   0
 .../bin/aarch64-none-elf-ld                   |   1 -
 .../armv6m-none-eabi/include/c++/8.2.1/.keep  |   0
 .../armv6m-none-eabi/lib/.keep                |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../bin/armv6m-none-eabi-ld                   |   1 -
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o |   0
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtend.o   |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../armv6m-none-eabi/lib/crtbegin.o           |   0
 .../armv6m-none-eabi/lib/crtend.o             |   0
 .../bin/armv6m-none-eabi-ld                   |   1 -
 clang/test/Driver/aarch64-gnutools.c          |   4 -
 clang/test/Driver/aarch64-toolchain-extra.c   |  29 ---
 clang/test/Driver/aarch64-toolchain.c         |  62 -----
 clang/test/Driver/arm-gnutools.c              |   6 -
 clang/test/Driver/arm-toolchain-extra.c       |  30 ---
 clang/test/Driver/arm-toolchain.c             |  63 -----
 clang/test/Driver/baremetal.cpp               |  16 --
 clang/test/Driver/check-no-multlib-warning.c  |  10 -
 32 files changed, 69 insertions(+), 427 deletions(-)
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
 delete mode 100755 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
 delete mode 100755 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
 delete mode 100755 clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
 delete mode 100755 clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
 delete mode 100644 clang/test/Driver/aarch64-gnutools.c
 delete mode 100644 clang/test/Driver/aarch64-toolchain-extra.c
 delete mode 100644 clang/test/Driver/aarch64-toolchain.c
 delete mode 100644 clang/test/Driver/arm-gnutools.c
 delete mode 100644 clang/test/Driver/arm-toolchain-extra.c
 delete mode 100644 clang/test/Driver/arm-toolchain.c
 delete mode 100644 clang/test/Driver/check-no-multlib-warning.c

diff --git a/clang/docs/Toolchain.rst b/clang/docs/Toolchain.rst
index d56b21d74c7e..958199eb7a2e 100644
--- a/clang/docs/Toolchain.rst
+++ b/clang/docs/Toolchain.rst
@@ -347,8 +347,3 @@ workarounds for issues discovered in libstdc++, and these are removed
 as fixed libstdc++ becomes sufficiently old.
 
 You can instruct Clang to use libstdc++ with the ``-stdlib=libstdc++`` flag.
-
-GCC Installation
-=================
-Users can point to their GCC installation by using the ``-gcc-toolchain`` or by
-using ``-gcc-install-dir`` flag.
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 94224e103875..29f6480ba935 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -847,9 +847,6 @@ def note_drv_available_multilibs : Note<
   "available multilibs are:%0">;
 def err_drv_multilib_custom_error : Error<
   "multilib configuration error: %0">;
-def warn_drv_multilib_not_available_for_target: Warning<
-  "no multilib structure encoded for Arm, Aarch64 and PPC targets">,
-  InGroup<DiagGroup<"multilib-not-found">>;
 
 def err_drv_experimental_crel : Error<
   "-Wa,--allow-experimental-crel must be specified to use -Wa,--crel. "
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index 0fbfe6c77f34..d8168ed15feb 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -31,40 +31,6 @@ using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
 
-/// Is the triple {aarch64.aarch64_be}-none-elf?
-static bool isAArch64BareMetal(const llvm::Triple &Triple) {
-  if (Triple.getArch() != llvm::Triple::aarch64 &&
-      Triple.getArch() != llvm::Triple::aarch64_be)
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-static bool isRISCVBareMetal(const llvm::Triple &Triple) {
-  if (!Triple.isRISCV())
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-/// Is the triple powerpc[64][le]-*-none-eabi?
-static bool isPPCBareMetal(const llvm::Triple &Triple) {
-  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
-         Triple.getEnvironment() == llvm::Triple::EABI;
-}
-
 static bool findRISCVMultilibs(const Driver &D,
                                const llvm::Triple &TargetTriple,
                                const ArgList &Args, DetectedMultilibs &Result) {
@@ -129,8 +95,7 @@ static bool findRISCVMultilibs(const Driver &D,
   return false;
 }
 
-static std::string computeClangRuntimesSysRoot(const Driver &D,
-                                               bool IncludeTriple) {
+static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) {
   if (!D.SysRoot.empty())
     return D.SysRoot;
 
@@ -143,125 +108,58 @@ static std::string computeClangRuntimesSysRoot(const Driver &D,
   return std::string(SysRootDir);
 }
 
-// Only consider the GCC toolchain based on the values provided through the
-// `--gcc-toolchain` and `--gcc-install-dir` flags. The function below returns
-// whether the GCC toolchain was initialized successfully.
-bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
-                                    const llvm::opt::ArgList &Args) {
-  if (Args.getLastArg(options::OPT_gcc_toolchain) ||
-      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
-    GCCInstallation.init(Triple, Args);
-    return GCCInstallation.isValid();
-  }
-  return false;
-}
-
-// This logic is adapted from RISCVToolChain.cpp as part of the ongoing effort
-// to merge RISCVToolChain into the Baremetal toolchain. It infers the presence
-// of a valid GCC toolchain by checking whether the `crt0.o` file exists in the
-// `bin/../<target-triple>/lib` directory.
-static bool detectGCCToolchainAdjacent(const Driver &D) {
-  SmallString<128> GCCDir;
-  llvm::sys::path::append(GCCDir, D.Dir, "..", D.getTargetTriple(),
-                          "lib/crt0.o");
-  return llvm::sys::fs::exists(GCCDir);
-}
-
-// If no sysroot is provided the driver will first attempt to infer it from the
-// values of `--gcc-install-dir` or `--gcc-toolchain`, which specify the
-// location of a GCC toolchain.
-// If neither flag is used, the sysroot defaults to either:
-//    - `bin/../<target-triple>`
-//    - `bin/../lib/clang-runtimes/<target-triple>`
-//
-// To use the `clang-runtimes` path, ensure that `../<target-triple>/lib/crt0.o`
-// does not exist relative to the driver.
-std::string BareMetal::computeSysRoot() const {
-  // Use Baremetal::sysroot if it has already been set.
-  if (!SysRoot.empty())
-    return SysRoot;
-
-  // Use the sysroot specified via the `--sysroot` command-line flag, if
-  // provided.
-  const Driver &D = getDriver();
-  if (!D.SysRoot.empty())
-    return D.SysRoot;
-
-  // Attempt to infer sysroot from a valid GCC installation.
-  // If no valid GCC installation, check for a GCC toolchain alongside Clang.
-  SmallString<128> inferredSysRoot;
-  if (IsGCCInstallationValid) {
-    llvm::sys::path::append(inferredSysRoot, GCCInstallation.getParentLibPath(),
-                            "..", GCCInstallation.getTriple().str());
-  } else if (detectGCCToolchainAdjacent(D)) {
-    // Use the triple as provided to the driver. Unlike the parsed triple
-    // this has not been normalized to always contain every field.
-    llvm::sys::path::append(inferredSysRoot, D.Dir, "..", D.getTargetTriple());
-  }
-  // If a valid sysroot was inferred and exists, use it
-  if (!inferredSysRoot.empty() && llvm::sys::fs::exists(inferredSysRoot))
-    return std::string(inferredSysRoot);
-
-  // Use the clang-runtimes path.
-  return computeClangRuntimesSysRoot(D, /*IncludeTriple*/ true);
-}
-
-static void addMultilibsFilePaths(const Driver &D, const MultilibSet &Multilibs,
-                                  const Multilib &Multilib,
-                                  StringRef InstallPath,
-                                  ToolChain::path_list &Paths) {
-  if (const auto &PathsCallback = Multilibs.filePathsCallback())
-    for (const auto &Path : PathsCallback(Multilib))
-      addPathIfExists(D, InstallPath + Path, Paths);
-}
-
-// GCC mutltilibs will only work for those targets that have their multlib
-// structure encoded into GCCInstallation. Baremetal toolchain supports ARM,
-// AArch64, RISCV and PPC and of these only RISCV have GCC multilibs hardcoded
-// in GCCInstallation.
 BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple,
                      const ArgList &Args)
-    : Generic_ELF(D, Triple, Args) {
-  IsGCCInstallationValid = initGCCInstallation(Triple, Args);
-  std::string ComputedSysRoot = computeSysRoot();
-  if (IsGCCInstallationValid) {
-    if (!isRISCVBareMetal(Triple))
-      D.Diag(clang::diag::warn_drv_multilib_not_available_for_target);
+    : ToolChain(D, Triple, Args),
+      SysRoot(computeBaseSysRoot(D, /*IncludeTriple=*/true)) {
+  getProgramPaths().push_back(getDriver().Dir);
 
-    Multilibs = GCCInstallation.getMultilibs();
-    SelectedMultilibs.assign({GCCInstallation.getMultilib()});
-
-    path_list &Paths = getFilePaths();
-    // Add toolchain/multilib specific file paths.
-    addMultilibsFilePaths(D, Multilibs, SelectedMultilibs.back(),
-                          GCCInstallation.getInstallPath(), Paths);
-    // Adding filepath for locating crt{begin,end}.o files.
-    Paths.push_back(GCCInstallation.getInstallPath().str());
-    // Adding filepath for locating crt0.o file.
-    Paths.push_back(ComputedSysRoot + "/lib");
-
-    ToolChain::path_list &PPaths = getProgramPaths();
-    // Multilib cross-compiler GCC installations put ld in a triple-prefixed
-    // directory off of the parent of the GCC installation.
-    PPaths.push_back(Twine(GCCInstallation.getParentLibPath() + "/../" +
-                           GCCInstallation.getTriple().str() + "/bin")
-                         .str());
-    PPaths.push_back((GCCInstallation.getParentLibPath() + "/../bin").str());
-  } else {
-    getProgramPaths().push_back(getDriver().Dir);
-    findMultilibs(D, Triple, Args);
-    const SmallString<128> SysRootDir(computeSysRoot());
-    if (!SysRootDir.empty()) {
-      for (const Multilib &M : getOrderedMultilibs()) {
-        SmallString<128> Dir(SysRootDir);
-        llvm::sys::path::append(Dir, M.osSuffix(), "lib");
-        getFilePaths().push_back(std::string(Dir));
-        getLibraryPaths().push_back(std::string(Dir));
-      }
+  findMultilibs(D, Triple, Args);
+  SmallString<128> SysRoot(computeSysRoot());
+  if (!SysRoot.empty()) {
+    for (const Multilib &M : getOrderedMultilibs()) {
+      SmallString<128> Dir(SysRoot);
+      llvm::sys::path::append(Dir, M.osSuffix(), "lib");
+      getFilePaths().push_back(std::string(Dir));
+      getLibraryPaths().push_back(std::string(Dir));
     }
   }
 }
 
+/// Is the triple {aarch64.aarch64_be}-none-elf?
+static bool isAArch64BareMetal(const llvm::Triple &Triple) {
+  if (Triple.getArch() != llvm::Triple::aarch64 &&
+      Triple.getArch() != llvm::Triple::aarch64_be)
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+static bool isRISCVBareMetal(const llvm::Triple &Triple) {
+  if (!Triple.isRISCV())
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+/// Is the triple powerpc[64][le]-*-none-eabi?
+static bool isPPCBareMetal(const llvm::Triple &Triple) {
+  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
+         Triple.getEnvironment() == llvm::Triple::EABI;
+}
+
 static void
 findMultilibsFromYAML(const ToolChain &TC, const Driver &D,
                       StringRef MultilibPath, const ArgList &Args,
@@ -318,7 +216,7 @@ getMultilibConfigPath(const Driver &D, const llvm::Triple &Triple,
       return {};
     }
   } else {
-    MultilibPath = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
+    MultilibPath = computeBaseSysRoot(D, /*IncludeTriple=*/false);
     llvm::sys::path::append(MultilibPath, MultilibFilename);
   }
   return MultilibPath;
@@ -336,7 +234,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
   if (D.getVFS().exists(*MultilibPath)) {
     // If multilib.yaml is found, update sysroot so it doesn't use a target
     // specific suffix
-    SysRoot = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
+    SysRoot = computeBaseSysRoot(D, /*IncludeTriple=*/false);
     SmallVector<StringRef> CustomFlagMacroDefines;
     findMultilibsFromYAML(*this, D, *MultilibPath, Args, Result,
                           CustomFlagMacroDefines);
@@ -344,7 +242,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
     Multilibs = Result.Multilibs;
     MultilibMacroDefines.append(CustomFlagMacroDefines.begin(),
                                 CustomFlagMacroDefines.end());
-  } else if (isRISCVBareMetal(Triple) && !detectGCCToolchainAdjacent(D)) {
+  } else if (isRISCVBareMetal(Triple)) {
     if (findRISCVMultilibs(D, Triple, Args, Result)) {
       SelectedMultilibs = Result.SelectedMultilibs;
       Multilibs = Result.Multilibs;
@@ -365,6 +263,8 @@ Tool *BareMetal::buildStaticLibTool() const {
   return new tools::baremetal::StaticLibTool(*this);
 }
 
+std::string BareMetal::computeSysRoot() const { return SysRoot; }
+
 BareMetal::OrderedMultilibs BareMetal::getOrderedMultilibs() const {
   // Get multilibs in reverse order because they're ordered most-specific last.
   if (!SelectedMultilibs.empty())
@@ -392,10 +292,10 @@ void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   if (std::optional<std::string> Path = getStdlibIncludePath())
     addSystemInclude(DriverArgs, CC1Args, *Path);
 
-  const SmallString<128> SysRootDir(computeSysRoot());
-  if (!SysRootDir.empty()) {
+  const SmallString<128> SysRoot(computeSysRoot());
+  if (!SysRoot.empty()) {
     for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRootDir);
+      SmallString<128> Dir(SysRoot);
       llvm::sys::path::append(Dir, M.includeSuffix());
       llvm::sys::path::append(Dir, "include");
       addSystemInclude(DriverArgs, CC1Args, Dir.str());
@@ -409,19 +309,6 @@ void BareMetal::addClangTargetOptions(const ArgList &DriverArgs,
   CC1Args.push_back("-nostdsysteminc");
 }
 
-void BareMetal::addLibStdCxxIncludePaths(
-    const llvm::opt::ArgList &DriverArgs,
-    llvm::opt::ArgStringList &CC1Args) const {
-  if (!IsGCCInstallationValid)
-    return;
-  const GCCVersion &Version = GCCInstallation.getVersion();
-  StringRef TripleStr = GCCInstallation.getTriple().str();
-  const Multilib &Multilib = GCCInstallation.getMultilib();
-  addLibStdCXXIncludePaths(computeSysRoot() + "/include/c++/" + Version.Text,
-                           TripleStr, Multilib.includeSuffix(), DriverArgs,
-                           CC1Args);
-}
-
 void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                              ArgStringList &CC1Args) const {
   if (DriverArgs.hasArg(options::OPT_nostdinc, options::OPT_nostdlibinc,
@@ -452,23 +339,23 @@ void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
   };
 
   switch (GetCXXStdlibType(DriverArgs)) {
-  case ToolChain::CST_Libcxx: {
-    SmallString<128> P(D.Dir);
-    llvm::sys::path::append(P, "..", "include");
-    AddCXXIncludePath(P);
-    break;
-  }
-  case ToolChain::CST_Libstdcxx:
-    addLibStdCxxIncludePaths(DriverArgs, CC1Args);
-    break;
+    case ToolChain::CST_Libcxx: {
+      SmallString<128> P(D.Dir);
+      llvm::sys::path::append(P, "..", "include");
+      AddCXXIncludePath(P);
+      break;
+    }
+    case ToolChain::CST_Libstdcxx:
+      // We only support libc++ toolchain installation.
+      break;
   }
 
-  std::string SysRootDir(computeSysRoot());
-  if (SysRootDir.empty())
+  std::string SysRoot(computeSysRoot());
+  if (SysRoot.empty())
     return;
 
   for (const Multilib &M : getOrderedMultilibs()) {
-    SmallString<128> Dir(SysRootDir);
+    SmallString<128> Dir(SysRoot);
     llvm::sys::path::append(Dir, M.gccSuffix());
     switch (GetCXXStdlibType(DriverArgs)) {
     case ToolChain::CST_Libcxx: {
diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h
index 930f8584e643..f6295bda0a6a 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.h
+++ b/clang/lib/Driver/ToolChains/BareMetal.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 
-#include "ToolChains/Gnu.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 
@@ -20,7 +19,7 @@ namespace driver {
 
 namespace toolchains {
 
-class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
+class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
 public:
   BareMetal(const Driver &D, const llvm::Triple &Triple,
             const llvm::opt::ArgList &Args);
@@ -36,8 +35,7 @@ protected:
   Tool *buildStaticLibTool() const override;
 
 public:
-  bool initGCCInstallation(const llvm::Triple &Triple,
-                           const llvm::opt::ArgList &Args);
+  bool useIntegratedAs() const override { return true; }
   bool isBareMetal() const override { return true; }
   bool isCrossCompiling() const override { return true; }
   bool HasNativeLLVMSupport() const override { return true; }
@@ -50,15 +48,9 @@ public:
 
   StringRef getOSLibName() const override { return "baremetal"; }
 
-  UnwindTableLevel
-  getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override {
-    return UnwindTableLevel::None;
-  }
-
   RuntimeLibType GetDefaultRuntimeLibType() const override {
     return ToolChain::RLT_CompilerRT;
   }
-
   CXXStdlibType GetDefaultCXXStdlibType() const override {
     return ToolChain::CST_Libcxx;
   }
@@ -75,9 +67,6 @@ public:
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
-  void
-  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
-                           llvm::opt::ArgStringList &CC1Args) const override;
   std::string computeSysRoot() const override;
   SanitizerMask getSupportedSanitizers() const override;
 
@@ -91,8 +80,6 @@ private:
 
   std::string SysRoot;
 
-  bool IsGCCInstallationValid;
-
   SmallVector<std::string> MultilibMacroDefines;
 };
 
@@ -117,7 +104,7 @@ public:
 
 class LLVM_LIBRARY_VISIBILITY Linker final : public Tool {
 public:
-  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "linker", TC) {}
+  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "ld.lld", TC) {}
   bool isLinkJob() const override { return true; }
   bool hasIntegratedCPP() const override { return false; }
   void ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
deleted file mode 100755
index b23e55619b2f..000000000000
--- a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
deleted file mode 100755
index b23e55619b2f..000000000000
--- a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
deleted file mode 100755
index b23e55619b2f..000000000000
--- a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
deleted file mode 100755
index b23e55619b2f..000000000000
--- a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/aarch64-gnutools.c b/clang/test/Driver/aarch64-gnutools.c
deleted file mode 100644
index 0214639ed380..000000000000
--- a/clang/test/Driver/aarch64-gnutools.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: %clang --target=aarch64-none-elf  --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -fno-integrated-as %s -### -c \
-// RUN: 2>&1 | FileCheck %s
-
-// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
deleted file mode 100644
index eb8c741ae1ad..000000000000
--- a/clang/test/Driver/aarch64-toolchain-extra.c
+++ /dev/null
@@ -1,29 +0,0 @@
-// A basic clang -cc1 command-line, and simple environment check.
-
-// The tests here are similar to those in aarch64-toolchain.c, however
-// these tests need to create symlinks to test directory trees in order to
-// set up the environment and therefore shell support is required.
-// XFAIL: target={{.*}}-fuchsia{{.*}}
-// REQUIRES: shell
-// UNSUPPORTED: system-windows
-
-// If there is no GCC install detected then the driver searches for executables
-// and runtime starting from the directory tree above the driver itself.
-// The test below checks that the driver correctly finds the linker and
-// runtime if and only if they exist.
-//
-// RUN: rm -rf %t
-// RUN: mkdir -p %t/aarch64-nogcc/bin
-// RUN: ln -s %clang %t/aarch64-nogcc/bin/clang
-// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf %t/aarch64-nogcc/aarch64-none-elf
-// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --gcc-toolchain=%t/aarch64-nogcc/invalid \
-// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
-
-// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --sysroot=%t/aarch64-nogcc/bin/../aarch64-none-elf \
-// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
-
-// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
deleted file mode 100644
index 74841eec598b..000000000000
--- a/clang/test/Driver/aarch64-toolchain.c
+++ /dev/null
@@ -1,62 +0,0 @@
-// XFAIL: target={{.*}}-fuchsia{{.*}}
-// UNSUPPORTED: system-windows
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL %s
-
-// C-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
-// C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOSYSROOT %s
-
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL %s
-
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/backward"
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT %s
-
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/backward"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-LIBCXX %s
-
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX %s
-
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
diff --git a/clang/test/Driver/arm-gnutools.c b/clang/test/Driver/arm-gnutools.c
deleted file mode 100644
index 6e107f19dabc..000000000000
--- a/clang/test/Driver/arm-gnutools.c
+++ /dev/null
@@ -1,6 +0,0 @@
-// check that gnu assembler is invoked with arm baremetal as well
-
-// RUN: %clang --target=armv6m-none-eabi  --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -fno-integrated-as %s -### -c \
-// RUN: 2>&1 | FileCheck %s
-
-// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
deleted file mode 100644
index 67206818f211..000000000000
--- a/clang/test/Driver/arm-toolchain-extra.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// A basic clang -cc1 command-line, and simple environment check.
-
-// The tests here are similar to those in arm-toolchain.c, however
-// these tests need to create symlinks to test directory trees in order to
-// set up the environment and therefore shell support is required.
-// XFAIL: target={{.*}}-fuchsia{{.*}}
-// REQUIRES: shell
-// UNSUPPORTED: system-windows
-
-// If there is no GCC install detected then the driver searches for executables
-// and runtime starting from the directory tree above the driver itself.
-// The test below checks that the driver correctly finds the linker and
-// runtime if and only if they exist.
-//
-// RUN: rm -rf %t
-// RUN: mkdir -p %t/arm-nogcc/bin
-// RUN: ln -s %clang %t/arm-nogcc/bin/clang
-// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi %t/arm-nogcc/armv6m-none-eabi
-// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --gcc-toolchain=%t/arm-nogcc/invalid \
-// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
-
-// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --sysroot=%t/arm-nogcc/bin/../armv6m-none-eabi \
-// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
-
-// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/include"
-
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
deleted file mode 100644
index 56a0e0de7ba7..000000000000
--- a/clang/test/Driver/arm-toolchain.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// XFAIL: target={{.*}}-fuchsia{{.*}}
-// UNSUPPORTED: system-windows
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL %s
-
-// C-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
-// C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL-NOSYSROOT %s
-
-// C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
-// C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL %s
-
-// CXX-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/backward"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1" 
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT %s
-
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/backward"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-LIBCXX %s
-
-// CXX-ARM-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
-// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX %s
-
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index 2ac83402dda3..a80aa9b43711 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -196,22 +196,6 @@
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 
-// RUN: %clang -no-canonical-prefixes %s -### --target=riscv32-unknown-elf 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-RISCV32-NO-HOST-INC %s
-// CHECK-RISCV32-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
-// CHECK-RISCV32-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
-// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
-// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-
-// RUN: %clang -no-canonical-prefixes %s -### --target=riscv64-unknown-elf 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-RISCV64-NO-HOST-INC %s
-// CHECK-RISCV64-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
-// CHECK-RISCV64-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
-// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
-// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-
 // RUN: %clang %s -### --target=riscv64-unknown-elf -o %t.out -L some/directory/user/asked/for \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64 %s
diff --git a/clang/test/Driver/check-no-multlib-warning.c b/clang/test/Driver/check-no-multlib-warning.c
deleted file mode 100644
index 9a0d7cee450a..000000000000
--- a/clang/test/Driver/check-no-multlib-warning.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// UNSUPPORTED: system-windows
-
-
-// RUN: %clang --target=armv6m-none-eabi --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -### 2>&1 | FileCheck %s
-// RUN: %clang --target=aarch64-none-elf --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -### 2>&1 | FileCheck %s
-// RUN: %clang --target=riscv32-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv32_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
-// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv64_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
-
-// CHECK: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets
-// NOCHECK-NOT: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets

From 1f34d68c4f086e7ea6ef9a529f9606476b38bbbb Mon Sep 17 00:00:00 2001
From: Tobias Stadler <mail@stadler-tobias.de>
Date: Wed, 18 Jun 2025 14:25:41 +0100
Subject: [PATCH 0809/1322] [Remarks] Remove yaml-strtab format (#144527)

Background: The yaml-strtab format looks just like the yaml format,
except that the values in the key/value pairs of the remarks are
deduplicated and replaced by indices into a string table (see removed
test cases for examples). The motivation behind this format was to
reduce size of the remarks files. However, it was quickly superseded by
the bitstream format.

Therefore, remove the yaml-strtab format, as it doesn't have a good
usecase anymore:
  - It isn't particularly efficient
  - It isn't human-readable
  - It isn't straightforward to parse in external tools that can't use the
remarks library. We don't even support it in opt-viewer.

llvm-remarkutil is also missing options to parse/convert yaml-strtab, so
the chance that anyone is actually using this format is low.
---
 llvm/docs/CommandGuide/llvm-opt-report.rst    |   1 -
 llvm/docs/Remarks.rst                         |  32 +---
 llvm/include/llvm/Remarks/RemarkFormat.h      |   2 +-
 llvm/include/llvm/Remarks/RemarkParser.h      |   5 -
 .../llvm/Remarks/YAMLRemarkSerializer.h       |  41 -----
 llvm/lib/Remarks/BitstreamRemarkParser.cpp    |   7 +-
 llvm/lib/Remarks/BitstreamRemarkParser.h      |   7 +-
 llvm/lib/Remarks/RemarkFormat.cpp             |   4 +-
 llvm/lib/Remarks/RemarkLinker.cpp             |   2 +-
 llvm/lib/Remarks/RemarkParser.cpp             |  40 +----
 llvm/lib/Remarks/RemarkSerializer.cpp         |   5 -
 llvm/lib/Remarks/RemarkStreamer.cpp           |   4 +-
 llvm/lib/Remarks/YAMLRemarkParser.cpp         |  71 +-------
 llvm/lib/Remarks/YAMLRemarkParser.h           |  19 +-
 llvm/lib/Remarks/YAMLRemarkSerializer.cpp     | 103 ++---------
 llvm/test/CodeGen/X86/remarks-section.ll      |   8 -
 llvm/unittests/Remarks/RemarksLinkingTest.cpp |  35 ++--
 .../Remarks/YAMLRemarksParsingTest.cpp        | 133 +-------------
 .../Remarks/YAMLRemarksSerializerTest.cpp     | 163 ------------------
 19 files changed, 64 insertions(+), 618 deletions(-)

diff --git a/llvm/docs/CommandGuide/llvm-opt-report.rst b/llvm/docs/CommandGuide/llvm-opt-report.rst
index 4a666a4aa7af..ba10ba34578a 100644
--- a/llvm/docs/CommandGuide/llvm-opt-report.rst
+++ b/llvm/docs/CommandGuide/llvm-opt-report.rst
@@ -94,7 +94,6 @@ be sent to standard output.
  The Argument is one of the following:
 
  - yaml
- - yaml-strtab
  - bitstream
 
 .. option:: --no-demangle
diff --git a/llvm/docs/Remarks.rst b/llvm/docs/Remarks.rst
index b6cec12b326f..3be66e5adac9 100644
--- a/llvm/docs/Remarks.rst
+++ b/llvm/docs/Remarks.rst
@@ -112,7 +112,6 @@ following options:
       Supported formats:
 
       * :ref:`yaml <yamlremarks>` (default)
-      * :ref:`yaml-strtab <yamlstrtabremarks>`
       * :ref:`bitstream <bitstreamremarks>`
 
 ``Content configuration``
@@ -213,30 +212,6 @@ fields are required:
 * ``<arg-line>``
 * ``<arg-column>``
 
-.. _yamlstrtabremarks:
-
-YAML with a string table
-------------------------
-
-The YAML serialization supports the usage of a string table by using the
-``yaml-strtab`` format.
-
-This format replaces strings in the YAML output with integers representing the
-index in the string table that can be provided separately through metadata.
-
-The following entries can take advantage of the string table while respecting
-YAML rules:
-
-* ``<pass>``
-* ``<name>``
-* ``<function>``
-* ``<file>``
-* ``<value>``
-* ``<arg-file>``
-
-Currently, none of the tools in :ref:`the opt-viewer directory <optviewer>`
-support this format.
-
 .. _optviewer:
 
 YAML metadata
@@ -246,9 +221,9 @@ The metadata used together with the YAML format is:
 
 * a magic number: "REMARKS\\0"
 * the version number: a little-endian uint64_t
-* the total size of the string table (the size itself excluded):
-  little-endian uint64_t
-* a list of null-terminated strings
+* 8 zero bytes. This space was previously used to encode the size of a string
+  table. String table support for YAML remarks has been removed, use the
+  bitstream format instead.
 
 Optional:
 
@@ -584,7 +559,6 @@ Emitting remark diagnostics in the object file
 A section containing metadata on remark diagnostics will be emitted for the
 following formats:
 
-* ``yaml-strtab``
 * ``bitstream``
 
 This can be overridden by using the flag ``-remarks-section=<bool>``.
diff --git a/llvm/include/llvm/Remarks/RemarkFormat.h b/llvm/include/llvm/Remarks/RemarkFormat.h
index 64d08bcc9b8a..a39a013dcf90 100644
--- a/llvm/include/llvm/Remarks/RemarkFormat.h
+++ b/llvm/include/llvm/Remarks/RemarkFormat.h
@@ -23,7 +23,7 @@ namespace remarks {
 constexpr StringLiteral Magic("REMARKS");
 
 /// The format used for serializing/deserializing remarks.
-enum class Format { Unknown, YAML, YAMLStrTab, Bitstream };
+enum class Format { Unknown, YAML, Bitstream };
 
 /// Parse and validate a string for the remark format.
 LLVM_ABI Expected<Format> parseFormat(StringRef FormatStr);
diff --git a/llvm/include/llvm/Remarks/RemarkParser.h b/llvm/include/llvm/Remarks/RemarkParser.h
index abb1fb86a87e..e3df74436348 100644
--- a/llvm/include/llvm/Remarks/RemarkParser.h
+++ b/llvm/include/llvm/Remarks/RemarkParser.h
@@ -80,13 +80,8 @@ struct ParsedStringTable {
 LLVM_ABI Expected<std::unique_ptr<RemarkParser>>
 createRemarkParser(Format ParserFormat, StringRef Buf);
 
-LLVM_ABI Expected<std::unique_ptr<RemarkParser>>
-createRemarkParser(Format ParserFormat, StringRef Buf,
-                   ParsedStringTable StrTab);
-
 LLVM_ABI Expected<std::unique_ptr<RemarkParser>> createRemarkParserFromMeta(
     Format ParserFormat, StringRef Buf,
-    std::optional<ParsedStringTable> StrTab = std::nullopt,
     std::optional<StringRef> ExternalFilePrependPath = std::nullopt);
 
 } // end namespace remarks
diff --git a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h
index a2214c349e1c..d80464c0fe74 100644
--- a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h
+++ b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h
@@ -63,47 +63,6 @@ struct LLVM_ABI YAMLMetaSerializer : public MetaSerializer {
   void emit() override;
 };
 
-/// Serialize the remarks to YAML using a string table. An remark entry looks
-/// like the regular YAML remark but instead of string entries it's using
-/// numbers that map to an index in the string table.
-struct LLVM_ABI YAMLStrTabRemarkSerializer : public YAMLRemarkSerializer {
-  /// Wether we already emitted the metadata in standalone mode.
-  /// This should be set to true after the first invocation of `emit`.
-  bool DidEmitMeta = false;
-
-  YAMLStrTabRemarkSerializer(raw_ostream &OS, SerializerMode Mode)
-      : YAMLRemarkSerializer(Format::YAMLStrTab, OS, Mode) {
-    // We always need a string table for this type of serializer.
-    StrTab.emplace();
-  }
-  YAMLStrTabRemarkSerializer(raw_ostream &OS, SerializerMode Mode,
-                             StringTable StrTab)
-      : YAMLRemarkSerializer(Format::YAMLStrTab, OS, Mode, std::move(StrTab)) {}
-
-  /// Override to emit the metadata if necessary.
-  void emit(const Remark &Remark) override;
-
-  std::unique_ptr<MetaSerializer> metaSerializer(
-      raw_ostream &OS,
-      std::optional<StringRef> ExternalFilename = std::nullopt) override;
-
-  static bool classof(const RemarkSerializer *S) {
-    return S->SerializerFormat == Format::YAMLStrTab;
-  }
-};
-
-struct LLVM_ABI YAMLStrTabMetaSerializer : public YAMLMetaSerializer {
-  /// The string table is part of the metadata.
-  const StringTable &StrTab;
-
-  YAMLStrTabMetaSerializer(raw_ostream &OS,
-                           std::optional<StringRef> ExternalFilename,
-                           const StringTable &StrTab)
-      : YAMLMetaSerializer(OS, ExternalFilename), StrTab(StrTab) {}
-
-  void emit() override;
-};
-
 } // end namespace remarks
 } // end namespace llvm
 
diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.cpp b/llvm/lib/Remarks/BitstreamRemarkParser.cpp
index 6dd032f07e72..312886013598 100644
--- a/llvm/lib/Remarks/BitstreamRemarkParser.cpp
+++ b/llvm/lib/Remarks/BitstreamRemarkParser.cpp
@@ -308,8 +308,7 @@ static Error advanceToMetaBlock(BitstreamParserHelper &Helper) {
 
 Expected<std::unique_ptr<BitstreamRemarkParser>>
 remarks::createBitstreamParserFromMeta(
-    StringRef Buf, std::optional<ParsedStringTable> StrTab,
-    std::optional<StringRef> ExternalFilePrependPath) {
+    StringRef Buf, std::optional<StringRef> ExternalFilePrependPath) {
   BitstreamParserHelper Helper(Buf);
   Expected<std::array<char, 4>> MagicNumber = Helper.parseMagic();
   if (!MagicNumber)
@@ -319,9 +318,7 @@ remarks::createBitstreamParserFromMeta(
           StringRef(MagicNumber->data(), MagicNumber->size())))
     return std::move(E);
 
-  auto Parser =
-      StrTab ? std::make_unique<BitstreamRemarkParser>(Buf, std::move(*StrTab))
-             : std::make_unique<BitstreamRemarkParser>(Buf);
+  auto Parser = std::make_unique<BitstreamRemarkParser>(Buf);
 
   if (ExternalFilePrependPath)
     Parser->ExternalFilePrependPath = std::string(*ExternalFilePrependPath);
diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.h b/llvm/lib/Remarks/BitstreamRemarkParser.h
index fc786fc57622..f6f79ef199f7 100644
--- a/llvm/lib/Remarks/BitstreamRemarkParser.h
+++ b/llvm/lib/Remarks/BitstreamRemarkParser.h
@@ -48,11 +48,6 @@ struct BitstreamRemarkParser : public RemarkParser {
   explicit BitstreamRemarkParser(StringRef Buf)
       : RemarkParser(Format::Bitstream), ParserHelper(Buf) {}
 
-  /// Create a parser that uses a pre-parsed string table.
-  BitstreamRemarkParser(StringRef Buf, ParsedStringTable StrTab)
-      : RemarkParser(Format::Bitstream), ParserHelper(Buf),
-        StrTab(std::move(StrTab)) {}
-
   Expected<std::unique_ptr<Remark>> next() override;
 
   static bool classof(const RemarkParser *P) {
@@ -77,7 +72,7 @@ private:
 };
 
 Expected<std::unique_ptr<BitstreamRemarkParser>> createBitstreamParserFromMeta(
-    StringRef Buf, std::optional<ParsedStringTable> StrTab = std::nullopt,
+    StringRef Buf,
     std::optional<StringRef> ExternalFilePrependPath = std::nullopt);
 
 } // end namespace remarks
diff --git a/llvm/lib/Remarks/RemarkFormat.cpp b/llvm/lib/Remarks/RemarkFormat.cpp
index 5006421a3c63..800f5bffe70d 100644
--- a/llvm/lib/Remarks/RemarkFormat.cpp
+++ b/llvm/lib/Remarks/RemarkFormat.cpp
@@ -20,7 +20,6 @@ using namespace llvm::remarks;
 Expected<Format> llvm::remarks::parseFormat(StringRef FormatStr) {
   auto Result = StringSwitch<Format>(FormatStr)
                     .Cases("", "yaml", Format::YAML)
-                    .Case("yaml-strtab", Format::YAMLStrTab)
                     .Case("bitstream", Format::Bitstream)
                     .Default(Format::Unknown);
 
@@ -36,7 +35,8 @@ Expected<Format> llvm::remarks::magicToFormat(StringRef MagicStr) {
   auto Result =
       StringSwitch<Format>(MagicStr)
           .StartsWith("--- ", Format::YAML) // This is only an assumption.
-          .StartsWith(remarks::Magic, Format::YAMLStrTab)
+          .StartsWith(remarks::Magic,
+                      Format::YAML) // Needed for remark meta section
           .StartsWith(remarks::ContainerMagic, Format::Bitstream)
           .Default(Format::Unknown);
 
diff --git a/llvm/lib/Remarks/RemarkLinker.cpp b/llvm/lib/Remarks/RemarkLinker.cpp
index b70b06d706bd..b8395aa135d8 100644
--- a/llvm/lib/Remarks/RemarkLinker.cpp
+++ b/llvm/lib/Remarks/RemarkLinker.cpp
@@ -76,7 +76,7 @@ Error RemarkLinker::link(StringRef Buffer, std::optional<Format> RemarkFormat) {
 
   Expected<std::unique_ptr<RemarkParser>> MaybeParser =
       createRemarkParserFromMeta(
-          *RemarkFormat, Buffer, /*StrTab=*/std::nullopt,
+          *RemarkFormat, Buffer,
           PrependPath ? std::optional<StringRef>(StringRef(*PrependPath))
                       : std::optional<StringRef>());
   if (!MaybeParser)
diff --git a/llvm/lib/Remarks/RemarkParser.cpp b/llvm/lib/Remarks/RemarkParser.cpp
index 7fccb94014b9..5c1690aaa0fe 100644
--- a/llvm/lib/Remarks/RemarkParser.cpp
+++ b/llvm/lib/Remarks/RemarkParser.cpp
@@ -53,10 +53,6 @@ llvm::remarks::createRemarkParser(Format ParserFormat, StringRef Buf) {
   switch (ParserFormat) {
   case Format::YAML:
     return std::make_unique<YAMLRemarkParser>(Buf);
-  case Format::YAMLStrTab:
-    return createStringError(
-        std::make_error_code(std::errc::invalid_argument),
-        "The YAML with string table format requires a parsed string table.");
   case Format::Bitstream:
     return std::make_unique<BitstreamRemarkParser>(Buf);
   case Format::Unknown:
@@ -66,38 +62,15 @@ llvm::remarks::createRemarkParser(Format ParserFormat, StringRef Buf) {
   llvm_unreachable("unhandled ParseFormat");
 }
 
-Expected<std::unique_ptr<RemarkParser>>
-llvm::remarks::createRemarkParser(Format ParserFormat, StringRef Buf,
-                                  ParsedStringTable StrTab) {
-  switch (ParserFormat) {
-  case Format::YAML:
-    return createStringError(std::make_error_code(std::errc::invalid_argument),
-                             "The YAML format can't be used with a string "
-                             "table. Use yaml-strtab instead.");
-  case Format::YAMLStrTab:
-    return std::make_unique<YAMLStrTabRemarkParser>(Buf, std::move(StrTab));
-  case Format::Bitstream:
-    return std::make_unique<BitstreamRemarkParser>(Buf, std::move(StrTab));
-  case Format::Unknown:
-    return createStringError(std::make_error_code(std::errc::invalid_argument),
-                             "Unknown remark parser format.");
-  }
-  llvm_unreachable("unhandled ParseFormat");
-}
-
 Expected<std::unique_ptr<RemarkParser>>
 llvm::remarks::createRemarkParserFromMeta(
-    Format ParserFormat, StringRef Buf, std::optional<ParsedStringTable> StrTab,
+    Format ParserFormat, StringRef Buf,
     std::optional<StringRef> ExternalFilePrependPath) {
   switch (ParserFormat) {
-  // Depending on the metadata, the format can be either yaml or yaml-strtab,
-  // regardless of the input argument.
   case Format::YAML:
-  case Format::YAMLStrTab:
-    return createYAMLParserFromMeta(Buf, std::move(StrTab),
-                                    std::move(ExternalFilePrependPath));
+    return createYAMLParserFromMeta(Buf, std::move(ExternalFilePrependPath));
   case Format::Bitstream:
-    return createBitstreamParserFromMeta(Buf, std::move(StrTab),
+    return createBitstreamParserFromMeta(Buf,
                                          std::move(ExternalFilePrependPath));
   case Format::Unknown:
     return createStringError(std::make_error_code(std::errc::invalid_argument),
@@ -112,11 +85,8 @@ struct CParser {
   std::unique_ptr<RemarkParser> TheParser;
   std::optional<std::string> Err;
 
-  CParser(Format ParserFormat, StringRef Buf,
-          std::optional<ParsedStringTable> StrTab = std::nullopt)
-      : TheParser(cantFail(
-            StrTab ? createRemarkParser(ParserFormat, Buf, std::move(*StrTab))
-                   : createRemarkParser(ParserFormat, Buf))) {}
+  CParser(Format ParserFormat, StringRef Buf)
+      : TheParser(cantFail(createRemarkParser(ParserFormat, Buf))) {}
 
   void handleError(Error E) { Err.emplace(toString(std::move(E))); }
   bool hasError() const { return Err.has_value(); }
diff --git a/llvm/lib/Remarks/RemarkSerializer.cpp b/llvm/lib/Remarks/RemarkSerializer.cpp
index ab19c84bbadb..cc10b91f287a 100644
--- a/llvm/lib/Remarks/RemarkSerializer.cpp
+++ b/llvm/lib/Remarks/RemarkSerializer.cpp
@@ -26,8 +26,6 @@ remarks::createRemarkSerializer(Format RemarksFormat, SerializerMode Mode,
                              "Unknown remark serializer format.");
   case Format::YAML:
     return std::make_unique<YAMLRemarkSerializer>(OS, Mode);
-  case Format::YAMLStrTab:
-    return std::make_unique<YAMLStrTabRemarkSerializer>(OS, Mode);
   case Format::Bitstream:
     return std::make_unique<BitstreamRemarkSerializer>(OS, Mode);
   }
@@ -43,9 +41,6 @@ remarks::createRemarkSerializer(Format RemarksFormat, SerializerMode Mode,
                              "Unknown remark serializer format.");
   case Format::YAML:
     return std::make_unique<YAMLRemarkSerializer>(OS, Mode, std::move(StrTab));
-  case Format::YAMLStrTab:
-    return std::make_unique<YAMLStrTabRemarkSerializer>(OS, Mode,
-                                                        std::move(StrTab));
   case Format::Bitstream:
     return std::make_unique<BitstreamRemarkSerializer>(OS, Mode,
                                                        std::move(StrTab));
diff --git a/llvm/lib/Remarks/RemarkStreamer.cpp b/llvm/lib/Remarks/RemarkStreamer.cpp
index 9f4676ce37ab..bb62c8b5c2fd 100644
--- a/llvm/lib/Remarks/RemarkStreamer.cpp
+++ b/llvm/lib/Remarks/RemarkStreamer.cpp
@@ -21,7 +21,7 @@ static cl::opt<cl::boolOrDefault> EnableRemarksSection(
     "remarks-section",
     cl::desc(
         "Emit a section containing remark diagnostics metadata. By default, "
-        "this is enabled for the following formats: yaml-strtab, bitstream."),
+        "this is enabled for the following formats: bitstream."),
     cl::init(cl::BOU_UNSET), cl::Hidden);
 
 RemarkStreamer::RemarkStreamer(
@@ -63,9 +63,7 @@ bool RemarkStreamer::needsSection() const {
 
   // Only some formats need a section:
   // * bitstream
-  // * yaml-strtab
   switch (RemarkSerializer->SerializerFormat) {
-  case remarks::Format::YAMLStrTab:
   case remarks::Format::Bitstream:
     return true;
   default:
diff --git a/llvm/lib/Remarks/YAMLRemarkParser.cpp b/llvm/lib/Remarks/YAMLRemarkParser.cpp
index a287ef574255..5ff42fe6b9a9 100644
--- a/llvm/lib/Remarks/YAMLRemarkParser.cpp
+++ b/llvm/lib/Remarks/YAMLRemarkParser.cpp
@@ -95,21 +95,8 @@ static Expected<uint64_t> parseStrTabSize(StringRef &Buf) {
   return StrTabSize;
 }
 
-static Expected<ParsedStringTable> parseStrTab(StringRef &Buf,
-                                               uint64_t StrTabSize) {
-  if (Buf.size() < StrTabSize)
-    return createStringError(std::errc::illegal_byte_sequence,
-                             "Expecting string table.");
-
-  // Attach the string table to the parser.
-  ParsedStringTable Result(StringRef(Buf.data(), StrTabSize));
-  Buf = Buf.drop_front(StrTabSize);
-  return Expected<ParsedStringTable>(std::move(Result));
-}
-
 Expected<std::unique_ptr<YAMLRemarkParser>> remarks::createYAMLParserFromMeta(
-    StringRef Buf, std::optional<ParsedStringTable> StrTab,
-    std::optional<StringRef> ExternalFilePrependPath) {
+    StringRef Buf, std::optional<StringRef> ExternalFilePrependPath) {
   // We now have a magic number. The metadata has to be correct.
   Expected<bool> isMeta = parseMagic(Buf);
   if (!isMeta)
@@ -125,15 +112,9 @@ Expected<std::unique_ptr<YAMLRemarkParser>> remarks::createYAMLParserFromMeta(
     if (!StrTabSize)
       return StrTabSize.takeError();
 
-    // If the size of string table is not 0, try to build one.
     if (*StrTabSize != 0) {
-      if (StrTab)
-        return createStringError(std::errc::illegal_byte_sequence,
-                                 "String table already provided.");
-      Expected<ParsedStringTable> MaybeStrTab = parseStrTab(Buf, *StrTabSize);
-      if (!MaybeStrTab)
-        return MaybeStrTab.takeError();
-      StrTab = std::move(*MaybeStrTab);
+      return createStringError(std::errc::illegal_byte_sequence,
+                               "String table unsupported for YAML format.");
     }
     // If it starts with "---", there is no external file.
     if (!Buf.starts_with("---")) {
@@ -157,21 +138,15 @@ Expected<std::unique_ptr<YAMLRemarkParser>> remarks::createYAMLParserFromMeta(
   }
 
   std::unique_ptr<YAMLRemarkParser> Result =
-      StrTab
-          ? std::make_unique<YAMLStrTabRemarkParser>(Buf, std::move(*StrTab))
-          : std::make_unique<YAMLRemarkParser>(Buf);
+      std::make_unique<YAMLRemarkParser>(Buf);
   if (SeparateBuf)
     Result->SeparateBuf = std::move(SeparateBuf);
   return std::move(Result);
 }
 
 YAMLRemarkParser::YAMLRemarkParser(StringRef Buf)
-    : YAMLRemarkParser(Buf, std::nullopt) {}
-
-YAMLRemarkParser::YAMLRemarkParser(StringRef Buf,
-                                   std::optional<ParsedStringTable> StrTab)
-    : RemarkParser{Format::YAML}, StrTab(std::move(StrTab)),
-      SM(setupSM(LastErrorMessage)), Stream(Buf, SM), YAMLIt(Stream.begin()) {}
+    : RemarkParser{Format::YAML}, SM(setupSM(LastErrorMessage)),
+      Stream(Buf, SM), YAMLIt(Stream.begin()) {}
 
 Error YAMLRemarkParser::error(StringRef Message, yaml::Node &Node) {
   return make_error<YAMLParseError>(Message, SM, Stream, Node);
@@ -208,8 +183,8 @@ YAMLRemarkParser::parseRemark(yaml::Document &RemarkEntry) {
   Expected<Type> T = parseType(*Root);
   if (!T)
     return T.takeError();
-  else
-    TheRemark.RemarkType = *T;
+
+  TheRemark.RemarkType = *T;
 
   // Then, parse the fields, one by one.
   for (yaml::KeyValueNode &RemarkField : *Root) {
@@ -428,33 +403,3 @@ Expected<std::unique_ptr<Remark>> YAMLRemarkParser::next() {
 
   return std::move(*MaybeResult);
 }
-
-Expected<StringRef> YAMLStrTabRemarkParser::parseStr(yaml::KeyValueNode &Node) {
-  auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue());
-  yaml::BlockScalarNode *ValueBlock;
-  StringRef Result;
-  if (!Value) {
-    // Try to parse the value as a block node.
-    ValueBlock = dyn_cast<yaml::BlockScalarNode>(Node.getValue());
-    if (!ValueBlock)
-      return error("expected a value of scalar type.", Node);
-    Result = ValueBlock->getValue();
-  } else
-    Result = Value->getRawValue();
-  // If we have a string table, parse it as an unsigned.
-  unsigned StrID = 0;
-  if (Expected<unsigned> MaybeStrID = parseUnsigned(Node))
-    StrID = *MaybeStrID;
-  else
-    return MaybeStrID.takeError();
-
-  if (Expected<StringRef> Str = (*StrTab)[StrID])
-    Result = *Str;
-  else
-    return Str.takeError();
-
-  Result.consume_front("\'");
-  Result.consume_back("\'");
-
-  return Result;
-}
diff --git a/llvm/lib/Remarks/YAMLRemarkParser.h b/llvm/lib/Remarks/YAMLRemarkParser.h
index 8ef72e16be74..9a30e9e295cb 100644
--- a/llvm/lib/Remarks/YAMLRemarkParser.h
+++ b/llvm/lib/Remarks/YAMLRemarkParser.h
@@ -46,8 +46,6 @@ private:
 
 /// Regular YAML to Remark parser.
 struct YAMLRemarkParser : public RemarkParser {
-  /// The string table used for parsing strings.
-  std::optional<ParsedStringTable> StrTab;
   /// Last error message that can come from the YAML parser diagnostics.
   /// We need this for catching errors in the constructor.
   std::string LastErrorMessage;
@@ -70,7 +68,6 @@ struct YAMLRemarkParser : public RemarkParser {
   }
 
 protected:
-  YAMLRemarkParser(StringRef Buf, std::optional<ParsedStringTable> StrTab);
   /// Create a YAMLParseError error from an existing error generated by the YAML
   /// parser.
   /// If there is no error, this returns Success.
@@ -93,22 +90,8 @@ protected:
   Expected<Argument> parseArg(yaml::Node &Node);
 };
 
-/// YAML with a string table to Remark parser.
-struct YAMLStrTabRemarkParser : public YAMLRemarkParser {
-  YAMLStrTabRemarkParser(StringRef Buf, ParsedStringTable StrTab)
-      : YAMLRemarkParser(Buf, std::move(StrTab)) {}
-
-  static bool classof(const RemarkParser *P) {
-    return P->ParserFormat == Format::YAMLStrTab;
-  }
-
-protected:
-  /// Parse one value to a string.
-  Expected<StringRef> parseStr(yaml::KeyValueNode &Node) override;
-};
-
 Expected<std::unique_ptr<YAMLRemarkParser>> createYAMLParserFromMeta(
-    StringRef Buf, std::optional<ParsedStringTable> StrTab = std::nullopt,
+    StringRef Buf,
     std::optional<StringRef> ExternalFilePrependPath = std::nullopt);
 
 } // end namespace remarks
diff --git a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp
index 68285c3dde1b..846a72182d8f 100644
--- a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp
+++ b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp
@@ -21,11 +21,10 @@ using namespace llvm::remarks;
 
 // Use the same keys whether we use a string table or not (respectively, T is an
 // unsigned or a StringRef).
-template <typename T>
-static void mapRemarkHeader(yaml::IO &io, T PassName, T RemarkName,
-                            std::optional<RemarkLocation> RL, T FunctionName,
-                            std::optional<uint64_t> Hotness,
-                            ArrayRef<Argument> Args) {
+static void
+mapRemarkHeader(yaml::IO &io, StringRef PassName, StringRef RemarkName,
+                std::optional<RemarkLocation> RL, StringRef FunctionName,
+                std::optional<uint64_t> Hotness, ArrayRef<Argument> Args) {
   io.mapRequired("Pass", PassName);
   io.mapRequired("Name", RemarkName);
   io.mapOptional("DebugLoc", RL);
@@ -58,19 +57,8 @@ template <> struct MappingTraits<remarks::Remark *> {
     else
       llvm_unreachable("Unknown remark type");
 
-    if (auto *Serializer = dyn_cast<YAMLStrTabRemarkSerializer>(
-            reinterpret_cast<RemarkSerializer *>(io.getContext()))) {
-      assert(Serializer->StrTab && "YAMLStrTabSerializer with no StrTab.");
-      StringTable &StrTab = *Serializer->StrTab;
-      unsigned PassID = StrTab.add(Remark->PassName).first;
-      unsigned NameID = StrTab.add(Remark->RemarkName).first;
-      unsigned FunctionID = StrTab.add(Remark->FunctionName).first;
-      mapRemarkHeader(io, PassID, NameID, Remark->Loc, FunctionID,
-                      Remark->Hotness, Remark->Args);
-    } else {
-      mapRemarkHeader(io, Remark->PassName, Remark->RemarkName, Remark->Loc,
-                      Remark->FunctionName, Remark->Hotness, Remark->Args);
-    }
+    mapRemarkHeader(io, Remark->PassName, Remark->RemarkName, Remark->Loc,
+                    Remark->FunctionName, Remark->Hotness, Remark->Args);
   }
 };
 
@@ -82,15 +70,7 @@ template <> struct MappingTraits<RemarkLocation> {
     unsigned Line = RL.SourceLine;
     unsigned Col = RL.SourceColumn;
 
-    if (auto *Serializer = dyn_cast<YAMLStrTabRemarkSerializer>(
-            reinterpret_cast<RemarkSerializer *>(io.getContext()))) {
-      assert(Serializer->StrTab && "YAMLStrTabSerializer with no StrTab.");
-      StringTable &StrTab = *Serializer->StrTab;
-      unsigned FileID = StrTab.add(File).first;
-      io.mapRequired("File", FileID);
-    } else {
-      io.mapRequired("File", File);
-    }
+    io.mapRequired("File", File);
 
     io.mapRequired("Line", Line);
     io.mapRequired("Column", Col);
@@ -136,13 +116,7 @@ template <> struct MappingTraits<Argument> {
   static void mapping(IO &io, Argument &A) {
     assert(io.outputting() && "input not yet implemented");
 
-    if (auto *Serializer = dyn_cast<YAMLStrTabRemarkSerializer>(
-            reinterpret_cast<RemarkSerializer *>(io.getContext()))) {
-      assert(Serializer->StrTab && "YAMLStrTabSerializer with no StrTab.");
-      StringTable &StrTab = *Serializer->StrTab;
-      auto ValueID = StrTab.add(A.Val).first;
-      io.mapRequired(A.Key.data(), ValueID);
-    } else if (StringRef(A.Val).count('\n') > 1) {
+    if (StringRef(A.Val).count('\n') > 1) {
       StringBlockVal S(A.Val);
       io.mapRequired(A.Key.data(), S);
     } else {
@@ -159,12 +133,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(Argument)
 
 YAMLRemarkSerializer::YAMLRemarkSerializer(raw_ostream &OS, SerializerMode Mode,
                                            std::optional<StringTable> StrTabIn)
-    : YAMLRemarkSerializer(Format::YAML, OS, Mode, std::move(StrTabIn)) {}
-
-YAMLRemarkSerializer::YAMLRemarkSerializer(Format SerializerFormat,
-                                           raw_ostream &OS, SerializerMode Mode,
-                                           std::optional<StringTable> StrTabIn)
-    : RemarkSerializer(SerializerFormat, OS, Mode),
+    : RemarkSerializer(Format::YAML, OS, Mode),
       YAMLOutput(OS, reinterpret_cast<void *>(this)) {
   StrTab = std::move(StrTabIn);
 }
@@ -172,7 +141,7 @@ YAMLRemarkSerializer::YAMLRemarkSerializer(Format SerializerFormat,
 void YAMLRemarkSerializer::emit(const Remark &Remark) {
   // Again, YAMLTraits expect a non-const object for inputting, but we're not
   // using that here.
-  auto R = const_cast<remarks::Remark *>(&Remark);
+  auto *R = const_cast<remarks::Remark *>(&Remark);
   YAMLOutput << R;
 }
 
@@ -181,27 +150,6 @@ std::unique_ptr<MetaSerializer> YAMLRemarkSerializer::metaSerializer(
   return std::make_unique<YAMLMetaSerializer>(OS, ExternalFilename);
 }
 
-void YAMLStrTabRemarkSerializer::emit(const Remark &Remark) {
-  // In standalone mode, for the serializer with a string table, emit the
-  // metadata first and set DidEmitMeta to avoid emitting it again.
-  if (Mode == SerializerMode::Standalone && !DidEmitMeta) {
-    std::unique_ptr<MetaSerializer> MetaSerializer =
-        metaSerializer(OS, /*ExternalFilename=*/std::nullopt);
-    MetaSerializer->emit();
-    DidEmitMeta = true;
-  }
-
-  // Then do the usual remark emission.
-  YAMLRemarkSerializer::emit(Remark);
-}
-
-std::unique_ptr<MetaSerializer> YAMLStrTabRemarkSerializer::metaSerializer(
-    raw_ostream &OS, std::optional<StringRef> ExternalFilename) {
-  assert(StrTab);
-  return std::make_unique<YAMLStrTabMetaSerializer>(OS, ExternalFilename,
-                                                    *StrTab);
-}
-
 static void emitMagic(raw_ostream &OS) {
   // Emit the magic number.
   OS << remarks::Magic;
@@ -216,20 +164,6 @@ static void emitVersion(raw_ostream &OS) {
   OS.write(Version.data(), Version.size());
 }
 
-static void emitStrTab(raw_ostream &OS,
-                       std::optional<const StringTable *> StrTab) {
-  // Emit the string table in the section.
-  uint64_t StrTabSize = StrTab ? (*StrTab)->SerializedSize : 0;
-  // Emit the total size of the string table (the size itself excluded):
-  // little-endian uint64_t.
-  // Note: even if no string table is used, emit 0.
-  std::array<char, 8> StrTabSizeBuf;
-  support::endian::write64le(StrTabSizeBuf.data(), StrTabSize);
-  OS.write(StrTabSizeBuf.data(), StrTabSizeBuf.size());
-  if (StrTab)
-    (*StrTab)->serialize(OS);
-}
-
 static void emitExternalFile(raw_ostream &OS, StringRef Filename) {
   // Emit the null-terminated absolute path to the remark file.
   SmallString<128> FilenameBuf = Filename;
@@ -242,15 +176,16 @@ static void emitExternalFile(raw_ostream &OS, StringRef Filename) {
 void YAMLMetaSerializer::emit() {
   emitMagic(OS);
   emitVersion(OS);
-  emitStrTab(OS, std::nullopt);
-  if (ExternalFilename)
-    emitExternalFile(OS, *ExternalFilename);
-}
 
-void YAMLStrTabMetaSerializer::emit() {
-  emitMagic(OS);
-  emitVersion(OS);
-  emitStrTab(OS, &StrTab);
+  // Emit StringTable with size 0. This is left over after removing StringTable
+  // support from the YAML format. For now, don't unnecessarily change how the
+  // the metadata is serialized. When changing the format, we should think about
+  // just reusing the bitstream remark meta for this.
+  uint64_t StrTabSize = 0;
+  std::array<char, 8> StrTabSizeBuf;
+  support::endian::write64le(StrTabSizeBuf.data(), StrTabSize);
+
+  OS.write(StrTabSizeBuf.data(), StrTabSizeBuf.size());
   if (ExternalFilename)
     emitExternalFile(OS, *ExternalFilename);
 }
diff --git a/llvm/test/CodeGen/X86/remarks-section.ll b/llvm/test/CodeGen/X86/remarks-section.ll
index dba20d428a69..e67c3579b759 100644
--- a/llvm/test/CodeGen/X86/remarks-section.ll
+++ b/llvm/test/CodeGen/X86/remarks-section.ll
@@ -1,8 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-darwin -remarks-section -pass-remarks-output=%/t.yaml | FileCheck --check-prefix=CHECK-DARWIN -DPATH=%/t.yaml %s
-; RUN: llc < %s -mtriple=x86_64-darwin --pass-remarks-format=yaml-strtab -remarks-section -pass-remarks-output=%/t.yaml | FileCheck --check-prefix=CHECK-DARWIN-STRTAB -DPATH=%/t.yaml %s
 
 ; RUN: llc < %s -mtriple=x86_64-darwin -pass-remarks-output=%/t.yaml | FileCheck --check-prefix=CHECK-DARWIN-DEFAULT %s
-; RUN: llc < %s -mtriple=x86_64-darwin --pass-remarks-format=yaml-strtab -pass-remarks-output=%/t.yaml | FileCheck --check-prefix=CHECK-DARWIN-DEFAULT-YAML-STRTAB %s
 ; RUN: llc < %s -mtriple=x86_64-darwin --pass-remarks-format=bitstream -pass-remarks-output=%/t.yaml | FileCheck --check-prefix=CHECK-DARWIN-DEFAULT-BITSTREAM %s
 ; RUN: llc < %s -mtriple=x86_64-darwin --pass-remarks-format=bitstream -remarks-section=false -pass-remarks-output=%/t.yaml | FileCheck --check-prefix=CHECK-DARWIN-OVERRIDE-BITSTREAM %s
 ; RUN: llc < %s -mtriple=x86_64-darwin --pass-remarks-format=yaml -remarks-section=true -pass-remarks-output=%/t.yaml | FileCheck --check-prefix=CHECK-DARWIN-OVERRIDE-YAML %s
@@ -10,15 +8,9 @@
 ; CHECK-DARWIN: .section __LLVM,__remarks,regular,debug
 ; CHECK-DARWIN-NEXT: .byte
 
-; CHECK-DARWIN-STRTAB: .section __LLVM,__remarks,regular,debug
-; CHECK-DARWIN-STRTAB-NEXT: .byte
-
 ; By default, the format is YAML which does not need a section.
 ; CHECK-DARWIN-DEFAULT-NOT: .section __LLVM,__remarks
 
-; yaml-strtab needs a section.
-; CHECK-DARWIN-DEFAULT-YAML-STRTAB: .section __LLVM,__remarks
-
 ; bitstream needs a section.
 ; CHECK-DARWIN-DEFAULT-BITSTREAM: .section __LLVM,__remarks
 
diff --git a/llvm/unittests/Remarks/RemarksLinkingTest.cpp b/llvm/unittests/Remarks/RemarksLinkingTest.cpp
index ff2aec669f2f..dcd598aaeb5c 100644
--- a/llvm/unittests/Remarks/RemarksLinkingTest.cpp
+++ b/llvm/unittests/Remarks/RemarksLinkingTest.cpp
@@ -207,22 +207,22 @@ TEST(Remarks, LinkingGoodStrTab) {
         "DebugLoc:        { File: file.c, Line: 3, Column: 12 }\n"
         "Function:        foo\n"
         "...\n",
-        remarks::Format::YAMLStrTab,
-        StringRef("REMARKS\0\0\0\0\0\0\0\0\0\x22\0\0\0\0\0\0\0"
-                  "inline\0NoDefinition\0foo\0file.c\0Ok\0"
-                  "--- !Passed\n"
-                  "Pass:            0\n"
-                  "Name:            4\n"
-                  "DebugLoc:        { File: 3, Line: 3, Column: 12 }\n"
-                  "Function:        2\n"
-                  "...\n"
-                  "--- !Missed\n"
-                  "Pass:            0\n"
-                  "Name:            1\n"
-                  "DebugLoc:        { File: 3, Line: 3, Column: 12 }\n"
-                  "Function:        2\n"
-                  "...\n",
-                  304));
+        remarks::Format::Bitstream,
+        "<BLOCKINFO_BLOCK/>\n"
+        "<Meta BlockID=8 NumWords=13 BlockCodeSize=3>\n"
+        "  <Container info codeid=1 abbrevid=4 op0=0 op1=2/>\n"
+        "  <Remark version codeid=2 abbrevid=5 op0=0/>\n"
+        "  <String table codeid=3 abbrevid=6/> blob data = "
+        "'inline\\x00NoDefinition\\x00foo\\x00file.c\\x00Ok\\x00'\n"
+        "</Meta>\n"
+        "<Remark BlockID=9 NumWords=4 BlockCodeSize=4>\n"
+        "  <Remark header codeid=5 abbrevid=4 op0=1 op1=4 op2=0 op3=2/>\n"
+        "  <Remark debug location codeid=6 abbrevid=5 op0=3 op1=3 op2=12/>\n"
+        "</Remark>\n"
+        "<Remark BlockID=9 NumWords=4 BlockCodeSize=4>\n"
+        "  <Remark header codeid=5 abbrevid=4 op0=2 op1=1 op2=0 op3=2/>\n"
+        "  <Remark debug location codeid=6 abbrevid=5 op0=3 op1=3 op2=12/>\n"
+        "</Remark>\n");
 }
 
 // Check that we propagate parsing errors.
@@ -241,11 +241,12 @@ TEST(Remarks, LinkingError) {
 
   {
     // Check that the prepend path is propagated and fails with the full path.
+    // Also ensures that the remark format is correctly auto-detected.
     RL.setExternalFilePrependPath("/baddir/");
     Error E = RL.link(
         StringRef("REMARKS\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0badfile.opt.yaml",
                   40),
-        remarks::Format::YAMLStrTab);
+        /*RemarkFormat=*/std::nullopt);
     EXPECT_TRUE(static_cast<bool>(E));
     std::string ErrorMessage = toString(std::move(E));
     EXPECT_EQ(StringRef(ErrorMessage).lower(),
diff --git a/llvm/unittests/Remarks/YAMLRemarksParsingTest.cpp b/llvm/unittests/Remarks/YAMLRemarksParsingTest.cpp
index 3c740ddc8a55..824813aa5af7 100644
--- a/llvm/unittests/Remarks/YAMLRemarksParsingTest.cpp
+++ b/llvm/unittests/Remarks/YAMLRemarksParsingTest.cpp
@@ -77,7 +77,6 @@ void parseExpectErrorMeta(
 
   Expected<std::unique_ptr<remarks::RemarkParser>> MaybeParser =
       remarks::createRemarkParserFromMeta(remarks::Format::YAML, Buf,
-                                          /*StrTab=*/std::nullopt,
                                           std::move(ExternalFilePrependPath));
   handleAllErrors(MaybeParser.takeError(),
                   [&](const ErrorInfoBase &EIB) { EIB.log(Stream); });
@@ -558,124 +557,6 @@ TEST(YAMLRemarks, ContentsCAPI) {
   LLVMRemarkParserDispose(Parser);
 }
 
-TEST(YAMLRemarks, ContentsStrTab) {
-  StringRef Buf = "\n"
-                  "--- !Missed\n"
-                  "Pass: 0\n"
-                  "Name: 1\n"
-                  "DebugLoc: { File: 2, Line: 3, Column: 12 }\n"
-                  "Function: 3\n"
-                  "Hotness: 4\n"
-                  "Args:\n"
-                  "  - Callee: 5\n"
-                  "  - String: 7\n"
-                  "  - Caller: 3\n"
-                  "    DebugLoc: { File: 2, Line: 2, Column: 0 }\n"
-                  "  - String: 8\n"
-                  "\n";
-
-  StringRef StrTabBuf =
-      StringRef("inline\0NoDefinition\0file.c\0foo\0Callee\0bar\0String\0 "
-                "will not be inlined into \0 because its definition is "
-                "unavailable",
-                115);
-
-  remarks::ParsedStringTable StrTab(StrTabBuf);
-  Expected<std::unique_ptr<remarks::RemarkParser>> MaybeParser =
-      remarks::createRemarkParser(remarks::Format::YAMLStrTab, Buf,
-                                  std::move(StrTab));
-  EXPECT_FALSE(errorToBool(MaybeParser.takeError()));
-  EXPECT_TRUE(*MaybeParser != nullptr);
-
-  remarks::RemarkParser &Parser = **MaybeParser;
-  Expected<std::unique_ptr<remarks::Remark>> MaybeRemark = Parser.next();
-  EXPECT_FALSE(
-      errorToBool(MaybeRemark.takeError())); // Check for parsing errors.
-  EXPECT_TRUE(*MaybeRemark != nullptr);      // At least one remark.
-
-  const remarks::Remark &Remark = **MaybeRemark;
-  EXPECT_EQ(Remark.RemarkType, remarks::Type::Missed);
-  EXPECT_EQ(checkStr(Remark.PassName, 6), "inline");
-  EXPECT_EQ(checkStr(Remark.RemarkName, 12), "NoDefinition");
-  EXPECT_EQ(checkStr(Remark.FunctionName, 3), "foo");
-  EXPECT_TRUE(Remark.Loc);
-  const remarks::RemarkLocation &RL = *Remark.Loc;
-  EXPECT_EQ(checkStr(RL.SourceFilePath, 6), "file.c");
-  EXPECT_EQ(RL.SourceLine, 3U);
-  EXPECT_EQ(RL.SourceColumn, 12U);
-  EXPECT_TRUE(Remark.Hotness);
-  EXPECT_EQ(*Remark.Hotness, 4U);
-  EXPECT_EQ(Remark.Args.size(), 4U);
-
-  unsigned ArgID = 0;
-  for (const remarks::Argument &Arg : Remark.Args) {
-    switch (ArgID) {
-    case 0:
-      EXPECT_EQ(checkStr(Arg.Key, 6), "Callee");
-      EXPECT_EQ(checkStr(Arg.Val, 3), "bar");
-      EXPECT_FALSE(Arg.Loc);
-      break;
-    case 1:
-      EXPECT_EQ(checkStr(Arg.Key, 6), "String");
-      EXPECT_EQ(checkStr(Arg.Val, 26), " will not be inlined into ");
-      EXPECT_FALSE(Arg.Loc);
-      break;
-    case 2: {
-      EXPECT_EQ(checkStr(Arg.Key, 6), "Caller");
-      EXPECT_EQ(checkStr(Arg.Val, 3), "foo");
-      EXPECT_TRUE(Arg.Loc);
-      const remarks::RemarkLocation &RL = *Arg.Loc;
-      EXPECT_EQ(checkStr(RL.SourceFilePath, 6), "file.c");
-      EXPECT_EQ(RL.SourceLine, 2U);
-      EXPECT_EQ(RL.SourceColumn, 0U);
-      break;
-    }
-    case 3:
-      EXPECT_EQ(checkStr(Arg.Key, 6), "String");
-      EXPECT_EQ(checkStr(Arg.Val, 38),
-                " because its definition is unavailable");
-      EXPECT_FALSE(Arg.Loc);
-      break;
-    default:
-      break;
-    }
-    ++ArgID;
-  }
-
-  MaybeRemark = Parser.next();
-  Error E = MaybeRemark.takeError();
-  EXPECT_TRUE(E.isA<remarks::EndOfFileError>());
-  EXPECT_TRUE(errorToBool(std::move(E))); // Check for parsing errors.
-}
-
-TEST(YAMLRemarks, ParsingBadStringTableIndex) {
-  StringRef Buf = "\n"
-                  "--- !Missed\n"
-                  "Pass: 50\n"
-                  "\n";
-
-  StringRef StrTabBuf = StringRef("inline");
-
-  remarks::ParsedStringTable StrTab(StrTabBuf);
-  Expected<std::unique_ptr<remarks::RemarkParser>> MaybeParser =
-      remarks::createRemarkParser(remarks::Format::YAMLStrTab, Buf,
-                                  std::move(StrTab));
-  EXPECT_FALSE(errorToBool(MaybeParser.takeError()));
-  EXPECT_TRUE(*MaybeParser != nullptr);
-
-  remarks::RemarkParser &Parser = **MaybeParser;
-  Expected<std::unique_ptr<remarks::Remark>> MaybeRemark = Parser.next();
-  EXPECT_FALSE(MaybeRemark); // Expect an error here.
-
-  std::string ErrorStr;
-  raw_string_ostream Stream(ErrorStr);
-  handleAllErrors(MaybeRemark.takeError(),
-                  [&](const ErrorInfoBase &EIB) { EIB.log(Stream); });
-  EXPECT_TRUE(
-      StringRef(Stream.str())
-          .contains("String with index 50 is out of bounds (size = 1)."));
-}
-
 TEST(YAMLRemarks, ParsingGoodMeta) {
   // No metadata should also work.
   parseGoodMeta("--- !Missed\n"
@@ -692,17 +573,6 @@ TEST(YAMLRemarks, ParsingGoodMeta) {
                           "Name: NoDefinition\n"
                           "Function: foo\n",
                           82));
-
-  // Use the string table from the metadata.
-  parseGoodMeta(StringRef("REMARKS\0"
-                          "\0\0\0\0\0\0\0\0"
-                          "\x02\0\0\0\0\0\0\0"
-                          "a\0"
-                          "--- !Missed\n"
-                          "Pass: 0\n"
-                          "Name: 0\n"
-                          "Function: 0\n",
-                          66));
 }
 
 TEST(YAMLRemarks, ParsingBadMeta) {
@@ -727,7 +597,8 @@ TEST(YAMLRemarks, ParsingBadMeta) {
                                  "\0\0\0\0\0\0\0\0"
                                  "\x01\0\0\0\0\0\0\0",
                                  24),
-                       "Expecting string table.", CmpType::Equal);
+                       "String table unsupported for YAML format.",
+                       CmpType::Equal);
 
   parseExpectErrorMeta(StringRef("REMARKS\0"
                                  "\0\0\0\0\0\0\0\0"
diff --git a/llvm/unittests/Remarks/YAMLRemarksSerializerTest.cpp b/llvm/unittests/Remarks/YAMLRemarksSerializerTest.cpp
index 442c24b9fd95..7e994ac4d58b 100644
--- a/llvm/unittests/Remarks/YAMLRemarksSerializerTest.cpp
+++ b/llvm/unittests/Remarks/YAMLRemarksSerializerTest.cpp
@@ -131,78 +131,6 @@ TEST(YAMLRemarks, SerializerRemarkStandalone) {
                 "...\n"));
 }
 
-TEST(YAMLRemarks, SerializerRemarkStrTab) {
-  remarks::Remark R;
-  R.RemarkType = remarks::Type::Missed;
-  R.PassName = "pass";
-  R.RemarkName = "name";
-  R.FunctionName = "func";
-  R.Loc = remarks::RemarkLocation{"path", 3, 4};
-  R.Hotness = 5;
-  R.Args.emplace_back();
-  R.Args.back().Key = "key";
-  R.Args.back().Val = "value";
-  R.Args.emplace_back();
-  R.Args.back().Key = "keydebug";
-  R.Args.back().Val = "valuedebug";
-  R.Args.back().Loc = remarks::RemarkLocation{"argpath", 6, 7};
-  check(remarks::Format::YAMLStrTab, R,
-        "--- !Missed\n"
-        "Pass:            0\n"
-        "Name:            1\n"
-        "DebugLoc:        { File: 3, Line: 3, Column: 4 }\n"
-        "Function:        2\n"
-        "Hotness:         5\n"
-        "Args:\n"
-        "  - key:             4\n"
-        "  - keydebug:        5\n"
-        "    DebugLoc:        { File: 6, Line: 6, Column: 7 }\n"
-        "...\n",
-        StringRef("REMARKS\0"
-                  "\0\0\0\0\0\0\0\0"
-                  "\x2d\0\0\0\0\0\0\0"
-                  "pass\0name\0func\0path\0value\0valuedebug\0argpath"
-                  "\0" EXTERNALFILETESTPATH "\0",
-                  83));
-}
-
-TEST(YAMLRemarks, SerializerRemarkParsedStrTab) {
-  StringRef StrTab("pass\0name\0func\0path\0value\0valuedebug\0argpath\0", 45);
-  remarks::Remark R;
-  R.RemarkType = remarks::Type::Missed;
-  R.PassName = "pass";
-  R.RemarkName = "name";
-  R.FunctionName = "func";
-  R.Loc = remarks::RemarkLocation{"path", 3, 4};
-  R.Hotness = 5;
-  R.Args.emplace_back();
-  R.Args.back().Key = "key";
-  R.Args.back().Val = "value";
-  R.Args.emplace_back();
-  R.Args.back().Key = "keydebug";
-  R.Args.back().Val = "valuedebug";
-  R.Args.back().Loc = remarks::RemarkLocation{"argpath", 6, 7};
-  check(remarks::Format::YAMLStrTab, R,
-        "--- !Missed\n"
-        "Pass:            0\n"
-        "Name:            1\n"
-        "DebugLoc:        { File: 3, Line: 3, Column: 4 }\n"
-        "Function:        2\n"
-        "Hotness:         5\n"
-        "Args:\n"
-        "  - key:             4\n"
-        "  - keydebug:        5\n"
-        "    DebugLoc:        { File: 6, Line: 6, Column: 7 }\n"
-        "...\n",
-        StringRef("REMARKS\0"
-                  "\0\0\0\0\0\0\0\0"
-                  "\x2d\0\0\0\0\0\0\0"
-                  "pass\0name\0func\0path\0value\0valuedebug\0argpath"
-                  "\0" EXTERNALFILETESTPATH "\0",
-                  83),
-        remarks::StringTable(remarks::ParsedStringTable(StrTab)));
-}
-
 TEST(YAMLRemarks, SerializerRemarkParsedStrTabStandaloneNoStrTab) {
   // Check that we don't use the string table even if it was provided.
   StringRef StrTab("pass\0name\0func\0path\0value\0valuedebug\0argpath\0", 45);
@@ -237,94 +165,3 @@ TEST(YAMLRemarks, SerializerRemarkParsedStrTabStandaloneNoStrTab) {
                 "...\n"),
       std::move(PreFilledStrTab));
 }
-
-TEST(YAMLRemarks, SerializerRemarkParsedStrTabStandalone) {
-  StringRef StrTab("pass\0name\0func\0path\0value\0valuedebug\0argpath\0", 45);
-  remarks::ParsedStringTable ParsedStrTab(StrTab);
-  remarks::StringTable PreFilledStrTab(ParsedStrTab);
-  remarks::Remark R;
-  R.RemarkType = remarks::Type::Missed;
-  R.PassName = "pass";
-  R.RemarkName = "name";
-  R.FunctionName = "func";
-  R.Loc = remarks::RemarkLocation{"path", 3, 4};
-  R.Hotness = 5;
-  R.Args.emplace_back();
-  R.Args.back().Key = "key";
-  R.Args.back().Val = "value";
-  R.Args.emplace_back();
-  R.Args.back().Key = "keydebug";
-  R.Args.back().Val = "valuedebug";
-  R.Args.back().Loc = remarks::RemarkLocation{"argpath", 6, 7};
-  checkStandalone(
-      remarks::Format::YAMLStrTab, R,
-      StringRef("REMARKS\0"
-                "\0\0\0\0\0\0\0\0"
-                "\x2d\0\0\0\0\0\0\0"
-                "pass\0name\0func\0path\0value\0valuedebug\0argpath\0"
-                "--- !Missed\n"
-                "Pass:            0\n"
-                "Name:            1\n"
-                "DebugLoc:        { File: 3, Line: 3, Column: 4 }\n"
-                "Function:        2\n"
-                "Hotness:         5\n"
-                "Args:\n"
-                "  - key:             4\n"
-                "  - keydebug:        5\n"
-                "    DebugLoc:        { File: 6, Line: 6, Column: 7 }\n"
-                "...\n",
-                315),
-      std::move(PreFilledStrTab));
-}
-
-TEST(YAMLRemarks, SerializerRemarkParsedStrTabStandaloneMultipleRemarks) {
-  StringRef StrTab("pass\0name\0func\0path\0value\0valuedebug\0argpath\0", 45);
-  remarks::ParsedStringTable ParsedStrTab(StrTab);
-  remarks::StringTable PreFilledStrTab(ParsedStrTab);
-  SmallVector<remarks::Remark, 2> Rs;
-  remarks::Remark R;
-  R.RemarkType = remarks::Type::Missed;
-  R.PassName = "pass";
-  R.RemarkName = "name";
-  R.FunctionName = "func";
-  R.Loc = remarks::RemarkLocation{"path", 3, 4};
-  R.Hotness = 5;
-  R.Args.emplace_back();
-  R.Args.back().Key = "key";
-  R.Args.back().Val = "value";
-  R.Args.emplace_back();
-  R.Args.back().Key = "keydebug";
-  R.Args.back().Val = "valuedebug";
-  R.Args.back().Loc = remarks::RemarkLocation{"argpath", 6, 7};
-  Rs.emplace_back(R.clone());
-  Rs.emplace_back(std::move(R));
-  check(remarks::Format::YAMLStrTab, remarks::SerializerMode::Standalone, Rs,
-        StringRef("REMARKS\0"
-                  "\0\0\0\0\0\0\0\0"
-                  "\x2d\0\0\0\0\0\0\0"
-                  "pass\0name\0func\0path\0value\0valuedebug\0argpath\0"
-                  "--- !Missed\n"
-                  "Pass:            0\n"
-                  "Name:            1\n"
-                  "DebugLoc:        { File: 3, Line: 3, Column: 4 }\n"
-                  "Function:        2\n"
-                  "Hotness:         5\n"
-                  "Args:\n"
-                  "  - key:             4\n"
-                  "  - keydebug:        5\n"
-                  "    DebugLoc:        { File: 6, Line: 6, Column: 7 }\n"
-                  "...\n"
-                  "--- !Missed\n"
-                  "Pass:            0\n"
-                  "Name:            1\n"
-                  "DebugLoc:        { File: 3, Line: 3, Column: 4 }\n"
-                  "Function:        2\n"
-                  "Hotness:         5\n"
-                  "Args:\n"
-                  "  - key:             4\n"
-                  "  - keydebug:        5\n"
-                  "    DebugLoc:        { File: 6, Line: 6, Column: 7 }\n"
-                  "...\n",
-                  561),
-        /*ExpectedMeta=*/std::nullopt, std::move(PreFilledStrTab));
-}

From 671caef379c603d2bcc428a00e3535b230162941 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles@arm.com>
Date: Wed, 18 Jun 2025 14:35:53 +0100
Subject: [PATCH 0810/1322] [Flang][OpenMP] Update relevant warnings to emit
 when OMP >= v5.2 (#144492)

There has been a number of deprecation warnings that have been added to
Flang, however these features are only deprecated when the OpenMP
Version being used is 5.2 or later. Previously, flang did not consider
the version with the warnings so would always be emitted.

Flang now ensures warnings are emitted for the appropriate version of
OpenMP, and tests are updated to reflect this change.
---
 flang/lib/Semantics/resolve-directives.cpp    | 20 ++++++++++++-------
 .../Semantics/OpenMP/allocate-align01.f90     |  2 +-
 flang/test/Semantics/OpenMP/allocate01.f90    |  2 +-
 flang/test/Semantics/OpenMP/allocate02.f90    |  2 --
 flang/test/Semantics/OpenMP/allocate03.f90    |  1 -
 flang/test/Semantics/OpenMP/allocate05.f90    |  2 --
 flang/test/Semantics/OpenMP/allocate06.f90    |  1 -
 flang/test/Semantics/OpenMP/allocate09.f90    |  5 -----
 .../Semantics/OpenMP/clause-validity01.f90    |  9 ++++++++-
 flang/test/Semantics/OpenMP/deprecation.f90   |  2 +-
 flang/test/Semantics/OpenMP/flush02.f90       |  1 -
 .../test/Semantics/OpenMP/nested-barrier.f90  |  2 --
 flang/test/Semantics/OpenMP/nested-master.f90 | 12 -----------
 flang/test/Semantics/OpenMP/nested-teams.f90  |  1 -
 flang/test/Semantics/OpenMP/ordered-simd.f90  |  4 ----
 .../Semantics/OpenMP/parallel-master-goto.f90 |  1 -
 16 files changed, 24 insertions(+), 43 deletions(-)

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 57db76e2160d..885c02e6ec74 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -835,8 +835,8 @@ private:
 
   void AddOmpRequiresToScope(Scope &, WithOmpDeclarative::RequiresFlags,
       std::optional<common::OmpMemoryOrderType>);
-  void IssueNonConformanceWarning(
-      llvm::omp::Directive D, parser::CharBlock source);
+  void IssueNonConformanceWarning(llvm::omp::Directive D,
+      parser::CharBlock source, unsigned EmitFromVersion);
 
   void CreateImplicitSymbols(const Symbol *symbol);
 
@@ -1668,7 +1668,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPBlockConstruct &x) {
   }
   if (beginDir.v == llvm::omp::Directive::OMPD_master ||
       beginDir.v == llvm::omp::Directive::OMPD_parallel_master)
-    IssueNonConformanceWarning(beginDir.v, beginDir.source);
+    IssueNonConformanceWarning(beginDir.v, beginDir.source, 52);
   ClearDataSharingAttributeObjects();
   ClearPrivateDataSharingAttributeObjects();
   ClearAllocateNames();
@@ -1791,7 +1791,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) {
       beginDir.v == llvm::omp::OMPD_parallel_master_taskloop ||
       beginDir.v == llvm::omp::OMPD_parallel_master_taskloop_simd ||
       beginDir.v == llvm::omp::Directive::OMPD_target_loop)
-    IssueNonConformanceWarning(beginDir.v, beginDir.source);
+    IssueNonConformanceWarning(beginDir.v, beginDir.source, 52);
   ClearDataSharingAttributeObjects();
   SetContextAssociatedLoopLevel(GetAssociatedLoopLevelFromClauses(clauseList));
 
@@ -2108,7 +2108,8 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) {
 }
 
 bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) {
-  IssueNonConformanceWarning(llvm::omp::Directive::OMPD_allocate, x.source);
+  IssueNonConformanceWarning(llvm::omp::Directive::OMPD_allocate, x.source, 52);
+
   PushContext(x.source, llvm::omp::Directive::OMPD_allocate);
   const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)};
   if (list) {
@@ -3172,11 +3173,16 @@ void OmpAttributeVisitor::AddOmpRequiresToScope(Scope &scope,
   } while (!scopeIter->IsGlobal());
 }
 
-void OmpAttributeVisitor::IssueNonConformanceWarning(
-    llvm::omp::Directive D, parser::CharBlock source) {
+void OmpAttributeVisitor::IssueNonConformanceWarning(llvm::omp::Directive D,
+    parser::CharBlock source, unsigned EmitFromVersion) {
   std::string warnStr;
   llvm::raw_string_ostream warnStrOS(warnStr);
   unsigned version{context_.langOptions().OpenMPVersion};
+  // We only want to emit the warning when the version being used has the
+  // directive deprecated
+  if (version < EmitFromVersion) {
+    return;
+  }
   warnStrOS << "OpenMP directive "
             << parser::ToUpperCaseLetters(
                    llvm::omp::getOpenMPDirectiveName(D, version).str())
diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90
index 4974f5e18397..bc17d7047bbb 100644
--- a/flang/test/Semantics/OpenMP/allocate-align01.f90
+++ b/flang/test/Semantics/OpenMP/allocate-align01.f90
@@ -1,6 +1,6 @@
 ! REQUIRES: openmp_runtime
 
-! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=52
 ! OpenMP Version 5.2
 ! The allocate clause's allocator modifier must be of type allocator_handle
 ! and the align modifier must be constant, positive integer expression
diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90
index 8a680eee743e..b205b2c79d65 100644
--- a/flang/test/Semantics/OpenMP/allocate01.f90
+++ b/flang/test/Semantics/OpenMP/allocate01.f90
@@ -1,6 +1,6 @@
 ! REQUIRES: openmp_runtime
 
-! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=52
 ! OpenMP Version 5.0
 ! 2.11.3 allocate Directive
 ! The allocate directive must appear in the same scope as the declarations of
diff --git a/flang/test/Semantics/OpenMP/allocate02.f90 b/flang/test/Semantics/OpenMP/allocate02.f90
index 80ef60b31e70..8f0579e810bb 100644
--- a/flang/test/Semantics/OpenMP/allocate02.f90
+++ b/flang/test/Semantics/OpenMP/allocate02.f90
@@ -16,11 +16,9 @@ use omp_lib
   !ERROR: At most one ALLOCATOR clause can appear on the ALLOCATE directive
   !$omp allocate(x, y) allocator(omp_default_mem_alloc) allocator(omp_default_mem_alloc)
 
-  !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
   !$omp allocate(darray) allocator(omp_default_mem_alloc)
       allocate ( darray(a, b) )
 
-  !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
   !ERROR: At most one ALLOCATOR clause can appear on the ALLOCATE directive
   !$omp allocate(darray) allocator(omp_default_mem_alloc) allocator(omp_default_mem_alloc)
       allocate ( darray(a, b) )
diff --git a/flang/test/Semantics/OpenMP/allocate03.f90 b/flang/test/Semantics/OpenMP/allocate03.f90
index b8c6b8e5dee7..e35115f3897c 100644
--- a/flang/test/Semantics/OpenMP/allocate03.f90
+++ b/flang/test/Semantics/OpenMP/allocate03.f90
@@ -18,7 +18,6 @@ use omp_lib
   !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive
   !$omp allocate(my_var%array)
 
-  !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
   !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive
   !$omp allocate(darray, my_var%array) allocator(omp_default_mem_alloc)
     allocate ( darray(a, b) )
diff --git a/flang/test/Semantics/OpenMP/allocate05.f90 b/flang/test/Semantics/OpenMP/allocate05.f90
index 2c81c4dbc82c..a787e8bb32a4 100644
--- a/flang/test/Semantics/OpenMP/allocate05.f90
+++ b/flang/test/Semantics/OpenMP/allocate05.f90
@@ -13,13 +13,11 @@ use omp_lib
   real, dimension (:,:), allocatable :: darray
 
   !$omp target
-      !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
       !$omp allocate allocator(omp_default_mem_alloc)
           allocate ( darray(a, b) )
   !$omp end target
 
   !$omp target
-      !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
       !ERROR: ALLOCATE directives that appear in a TARGET region must specify an allocator clause
       !$omp allocate
           allocate ( darray(a, b) )
diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90
index 7196bcac2b9b..e14134cd0730 100644
--- a/flang/test/Semantics/OpenMP/allocate06.f90
+++ b/flang/test/Semantics/OpenMP/allocate06.f90
@@ -14,7 +14,6 @@ use omp_lib
   !ERROR: List items specified in the ALLOCATE directive must not have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement
   !$omp allocate(darray) allocator(omp_default_mem_alloc)
 
-  !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
   !$omp allocate(darray) allocator(omp_default_mem_alloc)
     allocate(darray(a, b))
 
diff --git a/flang/test/Semantics/OpenMP/allocate09.f90 b/flang/test/Semantics/OpenMP/allocate09.f90
index 645e97a3a33f..0f93a340fe1e 100644
--- a/flang/test/Semantics/OpenMP/allocate09.f90
+++ b/flang/test/Semantics/OpenMP/allocate09.f90
@@ -12,28 +12,23 @@ use omp_lib
   integer, dimension(:), allocatable :: a, b, c, d, e, f, &
                                         g, h, i, j, k, l
 
-  !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
   !$omp allocate(a) allocator(omp_default_mem_alloc)
     allocate(a(1), b(2))
 
-  !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
   !$omp allocate(c, d) allocator(omp_default_mem_alloc)
     allocate(c(3), d(4))
 
   !$omp allocate(e) allocator(omp_default_mem_alloc)
   !$omp allocate(f, g) allocator(omp_default_mem_alloc)
-  !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
   !$omp allocate
     allocate(e(5), f(6), g(7))
 
-  !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
   !ERROR: Object 'i' in ALLOCATE directive not found in corresponding ALLOCATE statement
   !$omp allocate(h, i) allocator(omp_default_mem_alloc)
     allocate(h(8))
 
   !ERROR: Object 'j' in ALLOCATE directive not found in corresponding ALLOCATE statement
   !$omp allocate(j, k) allocator(omp_default_mem_alloc)
-  !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead.
   !$omp allocate(l) allocator(omp_default_mem_alloc)
     allocate(k(9), l(10))
 
diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90
index 5e0d91914c44..6989a183e83e 100644
--- a/flang/test/Semantics/OpenMP/clause-validity01.f90
+++ b/flang/test/Semantics/OpenMP/clause-validity01.f90
@@ -1,6 +1,6 @@
 ! REQUIRES: openmp_runtime
 
-! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags %openmp_module_flag -fopenmp-version=51
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags %openmp_module_flag -fopenmp-version=52
 use omp_lib
 ! Check OpenMP clause validity for the following directives:
 !
@@ -502,6 +502,7 @@ use omp_lib
   !$omp taskyield
   !$omp barrier
   !$omp taskwait
+  !WARNING: SOURCE dependence type is deprecated in OpenMP v5.2
   !ERROR: The SINK and SOURCE dependence types can only be used with the ORDERED directive, used here in the TASKWAIT construct
   !$omp taskwait depend(source)
   ! !$omp taskwait depend(sink:i-1)
@@ -509,12 +510,18 @@ use omp_lib
   ! !$omp target update from(arrayA) to(arrayB)
   ! !$omp target exit data map(from:arrayA) map(delete:arrayB)
   !$omp flush (c)
+  !WARNING: The syntax "FLUSH clause (object, ...)" has been deprecated, use "FLUSH(object, ...) clause" instead
   !$omp flush acq_rel
+  !WARNING: The syntax "FLUSH clause (object, ...)" has been deprecated, use "FLUSH(object, ...) clause" instead
   !$omp flush release
+  !WARNING: The syntax "FLUSH clause (object, ...)" has been deprecated, use "FLUSH(object, ...) clause" instead
   !$omp flush acquire
+  !WARNING: The syntax "FLUSH clause (object, ...)" has been deprecated, use "FLUSH(object, ...) clause" instead
   !ERROR: If memory-order-clause is RELEASE, ACQUIRE, or ACQ_REL, list items must not be specified on the FLUSH directive
   !$omp flush release (c)
+  !WARNING: The syntax "FLUSH clause (object, ...)" has been deprecated, use "FLUSH(object, ...) clause" instead
   !$omp flush seq_cst
+  !WARNING: The syntax "FLUSH clause (object, ...)" has been deprecated, use "FLUSH(object, ...) clause" instead
   !ERROR: RELAXED clause is not allowed on the FLUSH directive
   !$omp flush relaxed
 
diff --git a/flang/test/Semantics/OpenMP/deprecation.f90 b/flang/test/Semantics/OpenMP/deprecation.f90
index e04f43026bbc..df15c3bcc0b1 100644
--- a/flang/test/Semantics/OpenMP/deprecation.f90
+++ b/flang/test/Semantics/OpenMP/deprecation.f90
@@ -1,4 +1,4 @@
-! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -Werror
+! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -Werror -fopenmp-version=52
 
 ! Check for deprecation of master directive and its combined/composite variants
 
diff --git a/flang/test/Semantics/OpenMP/flush02.f90 b/flang/test/Semantics/OpenMP/flush02.f90
index 615332c6cf31..a7b170d58db5 100644
--- a/flang/test/Semantics/OpenMP/flush02.f90
+++ b/flang/test/Semantics/OpenMP/flush02.f90
@@ -78,7 +78,6 @@ use omp_lib
 
   !$omp parallel num_threads(4)
     array = (/1, 2, 3, 4, 5, 6, 7, 8, 9, 10/)
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !$omp master
       !$omp flush (array)
     !$omp end master
diff --git a/flang/test/Semantics/OpenMP/nested-barrier.f90 b/flang/test/Semantics/OpenMP/nested-barrier.f90
index 5f51363d59e5..8565a09a18cd 100644
--- a/flang/test/Semantics/OpenMP/nested-barrier.f90
+++ b/flang/test/Semantics/OpenMP/nested-barrier.f90
@@ -75,7 +75,6 @@ program omp_nest_barrier
   end do
   !$omp end critical
 
-  !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
   !$omp master
   do i = 1, 10
     k = k + 1
@@ -108,7 +107,6 @@ program omp_nest_barrier
   end do
   !$omp end ordered
 
-  !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
   !$omp master
   do i = 1, 10
     !ERROR: `DISTRIBUTE` region has to be strictly nested inside `TEAMS` region.
diff --git a/flang/test/Semantics/OpenMP/nested-master.f90 b/flang/test/Semantics/OpenMP/nested-master.f90
index d51e366eb584..7e4bb32bb7be 100644
--- a/flang/test/Semantics/OpenMP/nested-master.f90
+++ b/flang/test/Semantics/OpenMP/nested-master.f90
@@ -9,7 +9,6 @@ program omp_nest_master
   !$omp do
   do i = 1, 10
     k = k + 1
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     j = j -1
@@ -17,7 +16,6 @@ program omp_nest_master
   end do
 
   !$omp sections 
-  !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
   !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     do i = 1, 10
@@ -27,7 +25,6 @@ program omp_nest_master
   !$omp end sections
 
   !$omp single 
-  !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
   !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     do i = 1, 10
@@ -41,7 +38,6 @@ program omp_nest_master
   !$omp task
   do i = 1, 10
     k = k + 1
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     j = j -1
@@ -52,7 +48,6 @@ program omp_nest_master
   !$omp taskloop
   do i = 1, 10
     k = k + 1
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
     j = j -1
@@ -63,7 +58,6 @@ program omp_nest_master
   !$omp target parallel do simd
   do i = 1, 10
     k = k + 1
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !ERROR: The only OpenMP constructs that can be encountered during execution of a 'SIMD' region are the `ATOMIC` construct, the `LOOP` construct, the `SIMD` construct, the `SCAN` construct and the `ORDERED` construct with the `SIMD` clause.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$omp master
@@ -75,7 +69,6 @@ program omp_nest_master
   !$omp critical
   do i = 1, 10
     k = k + 1
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !$omp master
     j = j -1
     !$omp end master
@@ -85,7 +78,6 @@ program omp_nest_master
   !$omp ordered
   do i = 1, 10
     k = k + 1
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !$omp master
     j = j -1
     !$omp end master
@@ -99,7 +91,6 @@ program omp_nest_master
     !$omp distribute
     do k =1, 10
       print *, "hello"
-      !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
       !$omp master
       j = j -1
       !$omp end master
@@ -116,7 +107,6 @@ program omp_nest_master
     !$omp distribute
     do k =1, 10
       print *, "hello"
-      !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
       !$omp master
       j = j -1
       !$omp end master
@@ -133,7 +123,6 @@ program omp_nest_master
     !$omp distribute
     do k =1, 10
       print *, "hello"
-      !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
       !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
       !$omp master
       j = j -1
@@ -151,7 +140,6 @@ program omp_nest_master
     !$omp distribute
     do k =1, 10
       print *, "hello"
-      !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
       !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
       !$omp master
       j = j -1
diff --git a/flang/test/Semantics/OpenMP/nested-teams.f90 b/flang/test/Semantics/OpenMP/nested-teams.f90
index 974172ee9717..3c193ee00b95 100644
--- a/flang/test/Semantics/OpenMP/nested-teams.f90
+++ b/flang/test/Semantics/OpenMP/nested-teams.f90
@@ -42,7 +42,6 @@ program main
   !$omp end teams
   end do
 
-  !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
   !$omp master
   !ERROR: TEAMS region can only be strictly nested within the implicit parallel region or TARGET region
   !$omp teams
diff --git a/flang/test/Semantics/OpenMP/ordered-simd.f90 b/flang/test/Semantics/OpenMP/ordered-simd.f90
index c90ffb3bd1c5..50560139ea24 100644
--- a/flang/test/Semantics/OpenMP/ordered-simd.f90
+++ b/flang/test/Semantics/OpenMP/ordered-simd.f90
@@ -95,7 +95,6 @@ SUBROUTINE ORDERED_BAD(N)
 
   !$OMP CRITICAL  
     C =  C - A * B
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !$OMP MASTER
     DO I = 1,N
       !ERROR: `ORDERED` region may not be closely nested inside of `CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region.
@@ -108,7 +107,6 @@ SUBROUTINE ORDERED_BAD(N)
 
   !$OMP ORDERED  
     C =  C - A * B
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !$OMP MASTER
     DO I = 1,N
       !ERROR: `ORDERED` region may not be closely nested inside of `CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region.
@@ -121,7 +119,6 @@ SUBROUTINE ORDERED_BAD(N)
 
   !$OMP TASK  
     C =  C - A * B
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$OMP MASTER
     DO I = 1,N
@@ -136,7 +133,6 @@ SUBROUTINE ORDERED_BAD(N)
   !$OMP TASKLOOP
   DO J= 1,N  
     C =  C - A * B
-    !WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
     !ERROR: `MASTER` region may not be closely nested inside of `WORKSHARING`, `LOOP`, `TASK`, `TASKLOOP`, or `ATOMIC` region.
     !$OMP MASTER
     DO I = 1,N
diff --git a/flang/test/Semantics/OpenMP/parallel-master-goto.f90 b/flang/test/Semantics/OpenMP/parallel-master-goto.f90
index 72c8002ab4c5..01d14aaa46d3 100644
--- a/flang/test/Semantics/OpenMP/parallel-master-goto.f90
+++ b/flang/test/Semantics/OpenMP/parallel-master-goto.f90
@@ -7,7 +7,6 @@ do i = 1, 2
 !ERROR: invalid branch leaving an OpenMP structured block
   goto 10
 end do
-!WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
 !$omp master
 10 print *, i
 !$omp end master

From fda6b751f1b1356e65816f85fbc5b98e78337940 Mon Sep 17 00:00:00 2001
From: Eric Fiselier <eric@efcs.ca>
Date: Wed, 18 Jun 2025 09:30:18 -0400
Subject: [PATCH 0811/1322] Fix libc++ restarter job.

A while ago, the test workflow was updated with a new preemption regex,
however it was only applied to the test job, and not the job
that's actually restarting the failed libc++ test runs.

This fix should correct the issue and get the restarter working
again.
---
 .github/workflows/libcxx-restart-preempted-jobs.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/libcxx-restart-preempted-jobs.yaml b/.github/workflows/libcxx-restart-preempted-jobs.yaml
index 7b341d7f22e4..9706f0459922 100644
--- a/.github/workflows/libcxx-restart-preempted-jobs.yaml
+++ b/.github/workflows/libcxx-restart-preempted-jobs.yaml
@@ -33,7 +33,7 @@ jobs:
         with:
           script: |
             const failure_regex = /Process completed with exit code 1./
-            const preemption_regex = /The runner has received a shutdown signal/
+            const preemption_regex = /(The runner has received a shutdown signal)|(The operation was canceled)/
 
             const wf_run = context.payload.workflow_run
             core.notice(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`)

From bdac9580f3bc341ccbeeb743ecca656756f5aaec Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 18 Jun 2025 06:40:06 -0700
Subject: [PATCH 0812/1322] [nfc][jt] Drop `std::optional` pointers (#144548)

The `std::optional` didn't add any semantics that couldn't be modeled with the pointers being `nullptr`.
---
 .../include/llvm/Transforms/Scalar/JumpThreading.h |  7 +++----
 llvm/lib/Transforms/Scalar/JumpThreading.cpp       | 14 +++++++-------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
index 75b5cf2371fd..a03a38466b27 100644
--- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -85,8 +85,8 @@ class JumpThreadingPass : public PassInfoMixin<JumpThreadingPass> {
   LazyValueInfo *LVI = nullptr;
   AAResults *AA = nullptr;
   std::unique_ptr<DomTreeUpdater> DTU;
-  std::optional<BlockFrequencyInfo *> BFI;
-  std::optional<BranchProbabilityInfo *> BPI;
+  BlockFrequencyInfo *BFI = nullptr;
+  BranchProbabilityInfo *BPI = nullptr;
   bool ChangedSinceLastAnalysisUpdate = false;
   bool HasGuards = false;
 #ifndef LLVM_ENABLE_ABI_BREAKING_CHECKS
@@ -110,8 +110,7 @@ public:
                         TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
                         LazyValueInfo *LVI, AAResults *AA,
                         std::unique_ptr<DomTreeUpdater> DTU,
-                        std::optional<BlockFrequencyInfo *> BFI,
-                        std::optional<BranchProbabilityInfo *> BPI);
+                        BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI);
 
   LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 37b85bf9de81..b5dbef13289a 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -249,7 +249,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
       runImpl(F, &AM, &TLI, &TTI, &LVI, &AA,
               std::make_unique<DomTreeUpdater>(
                   &DT, nullptr, DomTreeUpdater::UpdateStrategy::Lazy),
-              std::nullopt, std::nullopt);
+              nullptr, nullptr);
 
   if (!Changed)
     return PreservedAnalyses::all();
@@ -283,8 +283,8 @@ bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_,
                                 TargetTransformInfo *TTI_, LazyValueInfo *LVI_,
                                 AliasAnalysis *AA_,
                                 std::unique_ptr<DomTreeUpdater> DTU_,
-                                std::optional<BlockFrequencyInfo *> BFI_,
-                                std::optional<BranchProbabilityInfo *> BPI_) {
+                                BlockFrequencyInfo *BFI_,
+                                BranchProbabilityInfo *BPI_) {
   LLVM_DEBUG(dbgs() << "Jump threading on function '" << F_.getName() << "'\n");
   F = &F_;
   FAM = FAM_;
@@ -3215,7 +3215,7 @@ BranchProbabilityInfo *JumpThreadingPass::getBPI() {
     assert(FAM && "Can't create BPI without FunctionAnalysisManager");
     BPI = FAM->getCachedResult<BranchProbabilityAnalysis>(*F);
   }
-  return *BPI;
+  return BPI;
 }
 
 BlockFrequencyInfo *JumpThreadingPass::getBFI() {
@@ -3223,7 +3223,7 @@ BlockFrequencyInfo *JumpThreadingPass::getBFI() {
     assert(FAM && "Can't create BFI without FunctionAnalysisManager");
     BFI = FAM->getCachedResult<BlockFrequencyAnalysis>(*F);
   }
-  return *BFI;
+  return BFI;
 }
 
 // Important note on validity of BPI/BFI. JumpThreading tries to preserve
@@ -3237,7 +3237,7 @@ BranchProbabilityInfo *JumpThreadingPass::getOrCreateBPI(bool Force) {
   if (Force)
     BPI = runExternalAnalysis<BranchProbabilityAnalysis>();
 
-  return *BPI;
+  return BPI;
 }
 
 BlockFrequencyInfo *JumpThreadingPass::getOrCreateBFI(bool Force) {
@@ -3248,5 +3248,5 @@ BlockFrequencyInfo *JumpThreadingPass::getOrCreateBFI(bool Force) {
   if (Force)
     BFI = runExternalAnalysis<BlockFrequencyAnalysis>();
 
-  return *BFI;
+  return BFI;
 }

From c5613dc8635000bc0e8396b8156d5639195776ab Mon Sep 17 00:00:00 2001
From: lorenzo chelini <l.chelini@icloud.com>
Date: Wed, 18 Jun 2025 15:49:00 +0200
Subject: [PATCH 0813/1322] [MLIR] Mark LLVM::FMAOp as legal (#144671)

Mark LLVM::FMAOp as legal in configureGpuToNVVMConversionLegality, since
we can handle intrinsic lowering in the NVPTX backend and emit
fma.rn.f32.
---
 .../Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp  |  8 ++++----
 mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir    |  2 +-
 mlir/test/Integration/GPU/CUDA/dump-ptx.mlir       | 14 +++++++++++++-
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 958d0d085fce..cef250232daf 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -429,10 +429,10 @@ void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
   target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
   target.addIllegalDialect<gpu::GPUDialect>();
   target.addIllegalOp<LLVM::CopySignOp, LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op,
-                      LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FMAOp,
-                      LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op,
-                      LLVM::PowOp, LLVM::RoundEvenOp, LLVM::RoundOp,
-                      LLVM::SinOp, LLVM::SqrtOp>();
+                      LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp,
+                      LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp,
+                      LLVM::RoundEvenOp, LLVM::RoundOp, LLVM::SinOp,
+                      LLVM::SqrtOp>();
 
   // TODO: Remove once we support replacing non-root ops.
   target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 6d4555e815b6..ef06af3ad316 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1027,7 +1027,7 @@ module attributes {transform.with_named_sequence} {
       legal_ops = ["func.func", "gpu.module", "gpu.yield"],
       illegal_dialects = ["gpu"],
       illegal_ops = ["llvm.copysign", "llvm.cos", "llvm.exp", "llvm.exp2", "llvm.fabs", "llvm.fceil",
-                    "llvm.ffloor", "llvm.fma", "llvm.frem", "llvm.log", "llvm.log10", "llvm.log2", "llvm.pow",
+                    "llvm.ffloor", "llvm.frem", "llvm.log", "llvm.log10", "llvm.log2", "llvm.pow",
                     "llvm.roundeven", "llvm.round", "llvm.sin", "llvm.sqrt"],
       partial_conversion
     } : !transform.any_op
diff --git a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir
index 0cc5d8645bb3..27ec1ec435fe 100644
--- a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir
+++ b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir
@@ -2,7 +2,7 @@
 // RUN:  | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=serialize-to-isa \
 // RUN:  2>&1 | FileCheck %s
 
-// CHECK: Generated by LLVM NVPTX Back-End
+// CHECK-LABEL: Generated by LLVM NVPTX Back-End
 // CHECK: .visible .func kernel_a()
 // CHECK: ret;
 gpu.module @bar {
@@ -11,3 +11,15 @@ gpu.module @bar {
     llvm.return
   }
 }
+
+// CHECK-LABEL: Generated by LLVM NVPTX Back-End
+// CHECK: .visible .func  ({{.+}}) fma(
+// CHECK: fma.rn.f32
+
+gpu.module @foo {
+  llvm.func @fma(%arg0: f32, %arg1: f32) -> f32
+    attributes { gpu.kernel } {
+    %res = llvm.intr.fma (%arg0, %arg1, %arg1) : (f32, f32, f32) -> f32
+    llvm.return %res : f32
+  }
+}

From 1d6f1029f7e8cf5468309078da3e85201844b625 Mon Sep 17 00:00:00 2001
From: Sergei Lebedev <185856+superbobry@users.noreply.github.com>
Date: Wed, 18 Jun 2025 14:53:20 +0100
Subject: [PATCH 0814/1322] [mlir] [python] Fixed the return type of
 `MemRefType.get_strides_and_offset` (#144523)

Previously, the return type for `offset` was `list[int]`, which clearly
is not right.
---
 mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
index 6c5f91d757cd..70bca3c75d84 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
@@ -2119,7 +2119,7 @@ class MemRefType(ShapedType):
         """
     @property
     def typeid(self) -> TypeID: ...
-    def get_strides_and_offset(self) -> tuple[list[int], list[int]]:
+    def get_strides_and_offset(self) -> tuple[list[int], int]:
         """
         The strides and offset of the MemRef type.
         """

From 9db7502d229b48817521429c2a5d3fb84543fdf9 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 18 Jun 2025 15:55:06 +0200
Subject: [PATCH 0815/1322] [libc++] Move __has_iterator_typedefs to the
 up-to-C++17 implementation of iterator_traits (#144265)

`__has_iterator_typedefs` is only used in the up-to-C++17 implementation
of `type_traits`. To make that clearer the struct is moved into that
code block.
---
 libcxx/include/__iterator/iterator_traits.h | 34 ++++++++++-----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/libcxx/include/__iterator/iterator_traits.h b/libcxx/include/__iterator/iterator_traits.h
index 221d36614db0..f727e8ff36df 100644
--- a/libcxx/include/__iterator/iterator_traits.h
+++ b/libcxx/include/__iterator/iterator_traits.h
@@ -71,23 +71,6 @@ struct random_access_iterator_tag : public bidirectional_iterator_tag {};
 struct contiguous_iterator_tag : public random_access_iterator_tag {};
 #endif
 
-template <class _Tp>
-struct __has_iterator_typedefs {
-private:
-  template <class _Up>
-  static false_type __test(...);
-  template <class _Up>
-  static true_type
-  __test(__void_t<typename _Up::iterator_category>* = nullptr,
-         __void_t<typename _Up::difference_type>*   = nullptr,
-         __void_t<typename _Up::value_type>*        = nullptr,
-         __void_t<typename _Up::reference>*         = nullptr,
-         __void_t<typename _Up::pointer>*           = nullptr);
-
-public:
-  static const bool value = decltype(__test<_Tp>(nullptr, nullptr, nullptr, nullptr, nullptr))::value;
-};
-
 #if _LIBCPP_STD_VER >= 20
 
 // The `cpp17-*-iterator` exposition-only concepts have very similar names to the `Cpp17*Iterator` named requirements
@@ -322,6 +305,23 @@ struct __iterator_traits<_Iter, true>
                               is_convertible<typename _Iter::iterator_category, input_iterator_tag>::value ||
                                   is_convertible<typename _Iter::iterator_category, output_iterator_tag>::value > {};
 
+template <class _Tp>
+struct __has_iterator_typedefs {
+private:
+  template <class _Up>
+  static false_type __test(...);
+  template <class _Up>
+  static true_type
+  __test(__void_t<typename _Up::iterator_category>* = nullptr,
+         __void_t<typename _Up::difference_type>*   = nullptr,
+         __void_t<typename _Up::value_type>*        = nullptr,
+         __void_t<typename _Up::reference>*         = nullptr,
+         __void_t<typename _Up::pointer>*           = nullptr);
+
+public:
+  static const bool value = decltype(__test<_Tp>(nullptr, nullptr, nullptr, nullptr, nullptr))::value;
+};
+
 // iterator_traits<Iterator> will only have the nested types if Iterator::iterator_category
 //    exists.  Else iterator_traits<Iterator> will be an empty class.  This is a
 //    conforming extension which allows some programs to compile and behave as

From 40d2f392106f43a60eea79f433b47a5ce44fc4a4 Mon Sep 17 00:00:00 2001
From: Akira Hatanaka <ahatanak@gmail.com>
Date: Wed, 18 Jun 2025 07:08:32 -0700
Subject: [PATCH 0816/1322] [Sema][ObjC] Loosen restrictions on
 reinterpret_cast involving indirect ARC-managed pointers (#144458)

Allow using reinterpret_cast for conversions between indirect ARC
pointers and other pointer types.

rdar://152905399
---
 clang/docs/ReleaseNotes.rst                  |  3 +++
 clang/include/clang/Sema/SemaObjC.h          |  3 ++-
 clang/lib/Sema/SemaCast.cpp                  | 11 ++++++----
 clang/lib/Sema/SemaExprObjC.cpp              | 12 ++++++----
 clang/test/SemaObjCXX/arc-type-conversion.mm | 23 ++++++++++++++++++--
 5 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 12816eed2e8b..18234188101f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -327,6 +327,9 @@ Non-comprehensive list of changes in this release
   ``__reference_constructs_from_temporary`` should be used instead. (#GH44056)
 - Added `__builtin_get_vtable_pointer` to directly load the primary vtable pointer from a
   polymorphic object.
+- Clang no longer rejects reinterpret_cast conversions between indirect
+  ARC-managed pointers and other pointer types. The prior behavior was overly
+  strict and inconsistent with the ARC specification.
 
 New Compiler Flags
 ------------------
diff --git a/clang/include/clang/Sema/SemaObjC.h b/clang/include/clang/Sema/SemaObjC.h
index b629c6d29140..ed08ff0acf89 100644
--- a/clang/include/clang/Sema/SemaObjC.h
+++ b/clang/include/clang/Sema/SemaObjC.h
@@ -812,7 +812,8 @@ public:
                                           CheckedConversionKind CCK,
                                           bool Diagnose = true,
                                           bool DiagnoseCFAudited = false,
-                                          BinaryOperatorKind Opc = BO_PtrMemD);
+                                          BinaryOperatorKind Opc = BO_PtrMemD,
+                                          bool IsReinterpretCast = false);
 
   Expr *stripARCUnbridgedCast(Expr *e);
   void diagnoseARCUnbridgedCast(Expr *e);
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 14e16bc39eb3..e15a43c11651 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -161,12 +161,14 @@ namespace {
       Self.CheckCastAlign(SrcExpr.get(), DestType, OpRange);
     }
 
-    void checkObjCConversion(CheckedConversionKind CCK) {
+    void checkObjCConversion(CheckedConversionKind CCK,
+                             bool IsReinterpretCast = false) {
       assert(Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers());
 
       Expr *src = SrcExpr.get();
-      if (Self.ObjC().CheckObjCConversion(OpRange, DestType, src, CCK) ==
-          SemaObjC::ACR_unbridged)
+      if (Self.ObjC().CheckObjCConversion(
+              OpRange, DestType, src, CCK, true, false, BO_PtrMemD,
+              IsReinterpretCast) == SemaObjC::ACR_unbridged)
         IsARCUnbridgedCast = true;
       SrcExpr = src;
     }
@@ -1263,7 +1265,8 @@ void CastOperation::CheckReinterpretCast() {
 
   if (isValidCast(tcr)) {
     if (Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers())
-      checkObjCConversion(CheckedConversionKind::OtherCast);
+      checkObjCConversion(CheckedConversionKind::OtherCast,
+                          /*IsReinterpretCast=*/true);
     DiagnoseReinterpretUpDownCast(Self, SrcExpr.get(), DestType, OpRange);
 
     if (unsigned DiagID = checkCastFunctionType(Self, SrcExpr, DestType))
diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp
index 3505d9f38d23..395f2f340dbd 100644
--- a/clang/lib/Sema/SemaExprObjC.cpp
+++ b/clang/lib/Sema/SemaExprObjC.cpp
@@ -4390,7 +4390,7 @@ SemaObjC::ARCConversionResult
 SemaObjC::CheckObjCConversion(SourceRange castRange, QualType castType,
                               Expr *&castExpr, CheckedConversionKind CCK,
                               bool Diagnose, bool DiagnoseCFAudited,
-                              BinaryOperatorKind Opc) {
+                              BinaryOperatorKind Opc, bool IsReinterpretCast) {
   ASTContext &Context = getASTContext();
   QualType castExprType = castExpr->getType();
 
@@ -4450,13 +4450,17 @@ SemaObjC::CheckObjCConversion(SourceRange castRange, QualType castType,
   // must be explicit.
   // Allow conversions between pointers to lifetime types and coreFoundation
   // pointers too, but only when the conversions are explicit.
+  // Allow conversions requested with a reinterpret_cast that converts an
+  // expression of type T* to type U*.
   if (exprACTC == ACTC_indirectRetainable &&
       (castACTC == ACTC_voidPtr ||
-       (castACTC == ACTC_coreFoundation && SemaRef.isCast(CCK))))
+       (castACTC == ACTC_coreFoundation && SemaRef.isCast(CCK)) ||
+       (IsReinterpretCast && effCastType->isAnyPointerType())))
     return ACR_okay;
   if (castACTC == ACTC_indirectRetainable &&
-      (exprACTC == ACTC_voidPtr || exprACTC == ACTC_coreFoundation) &&
-      SemaRef.isCast(CCK))
+      (((exprACTC == ACTC_voidPtr || exprACTC == ACTC_coreFoundation) &&
+        SemaRef.isCast(CCK)) ||
+       (IsReinterpretCast && castExprType->isAnyPointerType())))
     return ACR_okay;
 
   switch (ARCCastChecker(Context, exprACTC, castACTC, false).Visit(castExpr)) {
diff --git a/clang/test/SemaObjCXX/arc-type-conversion.mm b/clang/test/SemaObjCXX/arc-type-conversion.mm
index 64cfd02ec18c..0d281bf3e5c4 100644
--- a/clang/test/SemaObjCXX/arc-type-conversion.mm
+++ b/clang/test/SemaObjCXX/arc-type-conversion.mm
@@ -1,5 +1,8 @@
 // RUN: %clang_cc1 -fobjc-runtime-has-weak -fsyntax-only -fobjc-arc -verify -fblocks %s
 
+@class NSString;
+typedef unsigned __INTPTR_TYPE__ uintptr_t;
+
 void * cvt(id arg) // expected-note{{candidate function not viable: cannot convert argument of incomplete type 'void *' to '__strong id'}}
 {
   void* voidp_val;
@@ -72,6 +75,24 @@ void test_reinterpret_cast(__strong id *sip, __weak id *wip,
   (void)reinterpret_cast<__weak id *>(cwip); // expected-error{{reinterpret_cast from '__weak id const *' to '__weak id *' casts away qualifiers}}
   (void)reinterpret_cast<__weak id *>(csip); // expected-error{{reinterpret_cast from '__strong id const *' to '__weak id *' casts away qualifiers}}
   (void)reinterpret_cast<__strong id *>(cwip); // expected-error{{reinterpret_cast from '__weak id const *' to '__strong id *' casts away qualifiers}}
+
+  auto *ul = reinterpret_cast<unsigned long *>(sip);
+  (void)reinterpret_cast<__strong id *>(ul);
+  auto *wp = reinterpret_cast<__weak NSString *>(sip);
+  (void)reinterpret_cast<__strong id *>(wp);
+  (void)reinterpret_cast<unsigned long *>(csip); // expected-error {{reinterpret_cast from '__strong id const *' to 'unsigned long *' casts away qualifiers}}
+  (void)reinterpret_cast<const unsigned long *>(csip);
+  const unsigned long *cul = nullptr;
+  (void)reinterpret_cast<__strong id *>(cul); // expected-error {{reinterpret_cast from 'const unsigned long *' to '__strong id *' casts away qualifiers}}
+  (void)reinterpret_cast<const __strong id *>(cul);
+  volatile __strong id *vsip = nullptr;
+  (void)reinterpret_cast<unsigned long *>(vsip); // expected-error {{reinterpret_cast from '__strong id volatile *' to 'unsigned long *' casts away qualifiers}}
+  (void)reinterpret_cast<volatile unsigned long *>(vsip);
+  volatile unsigned long *vul = nullptr;
+  (void)reinterpret_cast<__strong id *>(vul); // expected-error {{reinterpret_cast from 'volatile unsigned long *' to '__strong id *' casts away qualifiers}}
+  (void)reinterpret_cast<volatile __strong id *>(vul);
+  auto uip = reinterpret_cast<uintptr_t>(sip);
+  (void)reinterpret_cast<__strong id *>(uip); // expected-error {{to '__strong id *' is disallowed with ARC}}
 }
 
 void test_cstyle_cast(__strong id *sip, __weak id *wip, 
@@ -194,8 +215,6 @@ typedef void (^Block)();
 typedef void (^Block_strong)() __strong;
 typedef void (^Block_autoreleasing)() __autoreleasing;
 
-@class NSString;
-
 void ownership_transfer_in_cast(void *vp, Block *pblk) {
   __strong NSString **sip2 = static_cast<NSString **>(static_cast<__strong id *>(vp));
   __strong NSString **&si2pref = static_cast<NSString **&>(sip2);

From ee070d08163ac09842d9bf0c1315f311df39faf1 Mon Sep 17 00:00:00 2001
From: Andrei Golubev <andrey.golubev@intel.com>
Date: Wed, 18 Jun 2025 16:18:12 +0200
Subject: [PATCH 0817/1322] [mlir][bufferization] Support custom types (1/N)
 (#142986)

Following the addition of TensorLike and BufferLike type interfaces (see
00eaff3e9c897c263a879416d0f151d7ca7eeaff), introduce minimal changes
required to bufferize a custom tensor operation into a custom buffer
operation.

To achieve this, new interface methods are added to TensorLike type
interface that abstract away the differences between existing (tensor ->
memref) and custom conversions.

The scope of the changes is intentionally limited (for example,
BufferizableOpInterface is untouched) in order to first understand the
basics and reach consensus design-wise.

---
Notable changes:
* mlir::bufferization::getBufferType() returns BufferLikeType (instead
of BaseMemRefType)
* ToTensorOp / ToBufferOp operate on TensorLikeType / BufferLikeType.
Operation argument "memref" renamed to "buffer"
* ToTensorOp's tensor type inferring builder is dropped (users now need
to provide the tensor type explicitly)
---
 .../IR/BufferizableOpInterface.h              | 18 ++++-
 .../Bufferization/IR/BufferizationOps.td      | 59 ++++++++---------
 .../IR/BufferizationTypeInterfaces.h          |  7 ++
 .../IR/BufferizationTypeInterfaces.td         | 26 +++++++-
 .../IR/UnstructuredControlFlow.h              |  5 +-
 .../BufferizableOpInterfaceImpl.cpp           | 14 ++--
 .../IR/BufferizableOpInterface.cpp            | 65 +++++++++++--------
 .../Bufferization/IR/BufferizationDialect.cpp | 32 ++++++++-
 .../Bufferization/IR/BufferizationOps.cpp     | 26 ++++----
 .../IR/BufferizationTypeInterfaces.cpp        | 21 ++++++
 .../Dialect/Bufferization/IR/CMakeLists.txt   |  1 +
 .../Bufferization/Transforms/Bufferize.cpp    |  8 +--
 .../FuncBufferizableOpInterfaceImpl.cpp       |  8 +--
 .../Transforms/ConvertToDestinationStyle.cpp  |  9 ++-
 .../BufferizableOpInterfaceImpl.cpp           | 51 ++++++++-------
 .../BufferizableOpInterfaceImpl.cpp           |  2 +-
 .../Transforms/SparseGPUCodegen.cpp           | 15 +++--
 .../Transforms/SparseTensorCodegen.cpp        |  3 +-
 .../Transforms/SparseTensorConversion.cpp     |  4 +-
 .../Transforms/Utils/CodegenUtils.cpp         |  4 +-
 .../BufferizableOpInterfaceImpl.cpp           | 14 ++--
 .../Transforms/one-shot-bufferize.mlir        | 21 +++++-
 mlir/test/lib/Dialect/Test/TestOpDefs.cpp     | 23 +++++++
 mlir/test/lib/Dialect/Test/TestOps.h          |  1 +
 mlir/test/lib/Dialect/Test/TestOps.td         | 58 ++++++++++++++++-
 mlir/test/lib/Dialect/Test/TestTypeDefs.td    |  9 +++
 mlir/test/lib/Dialect/Test/TestTypes.cpp      | 20 ++++++
 27 files changed, 389 insertions(+), 135 deletions(-)
 create mode 100644 mlir/lib/Dialect/Bufferization/IR/BufferizationTypeInterfaces.cpp

diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 2fb795f16ae2..c1529a36465a 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -17,6 +17,7 @@
 #include <optional>
 
 #include "mlir/Dialect/Bufferization/IR/BufferizationEnums.h.inc"
+#include "mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h"
 
 namespace mlir {
 class OpBuilder;
@@ -615,7 +616,7 @@ FailureOr<Value> getBuffer(RewriterBase &rewriter, Value value,
 /// IR, this function can be used.
 ///
 /// This function is a wrapper around BufferizableOpInterface::getBufferType.
-FailureOr<BaseMemRefType> getBufferType(Value value,
+FailureOr<BufferLikeType> getBufferType(Value value,
                                         const BufferizationOptions &options,
                                         const BufferizationState &state);
 
@@ -629,7 +630,7 @@ FailureOr<BaseMemRefType> getBufferType(Value value,
 /// IR, this function can be used.
 ///
 /// This function is a wrapper around `BufferizableOpInterface::getBufferType`.
-FailureOr<BaseMemRefType> getBufferType(Value value,
+FailureOr<BufferLikeType> getBufferType(Value value,
                                         const BufferizationOptions &options,
                                         const BufferizationState &state,
                                         SmallVector<Value> &invocationStack);
@@ -739,6 +740,19 @@ AliasingValueList unknownGetAliasingValues(OpOperand &opOperand);
 /// This is the default implementation of
 /// BufferizableOpInterface::hasTensorSemantics
 bool defaultHasTensorSemantics(Operation *op);
+
+/// This is a helper function used when buffer type is guaranteed to be memref.
+/// It performs two actions: failure state checking and an explicit llvm::cast<>
+/// from the buffer-like type interface to a BaseMemRefType. This allows easier
+/// management of differences in C++ types at the API boundaries. Valid buffer
+/// type is casted to the memref type. Otherwise, the failure state is
+/// propagated i.e. asMemRefType(mlir::failure()) returns mlir::failure().
+FailureOr<BaseMemRefType> asMemRefType(FailureOr<BufferLikeType> bufferType);
+
+/// This function is a free-standing helper that relies on
+/// bufferization::TensorLikeTypeInterface to verify the types in tensor and
+/// buffer worlds match.
+bool typesMatchAfterBufferization(Operation &op, Value tensor, Value buffer);
 } // namespace detail
 
 } // namespace bufferization
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td
index 6051aea84997..32c53ea9c494 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td
@@ -12,6 +12,7 @@
 include "mlir/Dialect/Bufferization/IR/AllocationOpInterface.td"
 include "mlir/Dialect/Bufferization/IR/BufferViewFlowOpInterface.td"
 include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td"
+include "mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.td"
 include "mlir/Dialect/Bufferization/IR/BufferizationBase.td"
 include "mlir/Interfaces/DestinationStyleOpInterface.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
@@ -386,20 +387,31 @@ def Bufferization_DeallocTensorOp : Bufferization_Op<"dealloc_tensor",
 // ToTensorOp
 //===----------------------------------------------------------------------===//
 
+class Bufferization_TensorAndBufferMatch<string tensor, string buffer> : PredOpTrait<
+  "specified tensor and buffer types match",
+  CPred<
+    "::mlir::bufferization::detail::typesMatchAfterBufferization("
+        "$_op, $" # tensor # ", $" # buffer #")"
+  >
+>;
+
 def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [
     BufferizableOpInterface,
     SameOperandsAndResultShape,
     SameOperandsAndResultElementType,
-    AllElementTypesMatch<["memref", "result"]>
+    Bufferization_TensorAndBufferMatch<"result", "buffer">
   ]> {
-  let summary = "create a tensor from a `memref`";
+  let summary = "create a buffer-like type from a tensor-like type";
   let description = [{
-    An operation that creates a tensor from a `memref`. The result value is a
-    tensor whose shape and element type match the memref operand.
+    An operation that creates a tensor from a buffer. The result value is a
+    tensor-like type that must match the corresponding buffer-like operand as
+    per TensorLikeType::verifyCompatibleBufferType(). For builtins (TensorType
+    and BaseMemRefType), this means that shapes and element types match between
+    the tensor and the buffer.
 
     The opposite of this op is `to_buffer`. Together, these two ops are
     useful for source/target materializations when doing type conversions
-    involving tensors and memrefs.
+    involving tensors and buffers.
 
     Example:
 
@@ -441,19 +453,16 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [
     away. However, such IR is no longer bufferizable with One-Shot Bufferize.
   }];
 
-  let arguments = (ins Arg<AnyRankedOrUnrankedMemRef,
+  let arguments = (ins Arg<Bufferization_BufferLikeTypeInterface,
                            "the reference to load from",
-                           [MemReadAt<0, FullEffect>]>:$memref,
+                           [MemReadAt<0, FullEffect>]>:$buffer,
                        UnitAttr:$restrict, UnitAttr:$writable);
-  let results = (outs AnyTensor:$result);
+  let results = (outs Bufferization_TensorLikeTypeInterface:$result);
 
   let extraClassDeclaration = [{
     /// The result of a to_tensor is always a tensor.
-    TensorType getType() {
-      Type resultType = getResult().getType();
-      if (::llvm::isa<TensorType>(resultType))
-        return ::llvm::cast<TensorType>(resultType);
-      return {};
+    ::mlir::bufferization::TensorLikeType getType() {
+      return getResult().getType();
     }
 
     //===------------------------------------------------------------------===//
@@ -472,22 +481,15 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [
     FailureOr<BaseMemRefType> getBufferType(
         Value value, const BufferizationOptions &options,
         const BufferizationState &state, SmallVector<Value> &invocationStack) {
-      return ::llvm::cast<BaseMemRefType>(getMemref().getType());
+      return ::llvm::cast<BaseMemRefType>(getBuffer().getType());
     }
   }];
 
   let assemblyFormat = [{
-    $memref (`restrict` $restrict^)? (`writable` $writable^)? attr-dict
-      `:` type($memref) `to` type($result)
+    $buffer (`restrict` $restrict^)? (`writable` $writable^)? attr-dict
+      `:` type($buffer) `to` type($result)
   }];
 
-  let builders = [
-    OpBuilder<(ins "Value":$memref, CArg<"bool", "false">:$restrict, CArg<"bool", "false">:$writeable), [{
-      auto rtt = memref::getTensorTypeFromMemRefType(memref.getType());
-      build($_builder, $_state, rtt, memref, restrict, writeable);
-    }]>
-  ];
-
   let hasCanonicalizer = 1;
   let hasFolder = 1;
 }
@@ -502,10 +504,9 @@ def Bufferization_ToBufferOp : Bufferization_Op<"to_buffer", [
     SameOperandsAndResultShape,
     SameOperandsAndResultElementType,
     Pure,
-    AllShapesMatch<["memref", "tensor"]>,
-    AllElementTypesMatch<["memref", "tensor"]>
+    Bufferization_TensorAndBufferMatch<"tensor", "buffer">
   ]> {
-  let summary = "cast a tensor to memref";
+  let summary = "cast a tensor-like type to buffer-like type";
   let description = [{
     An operation that returns the future buffer of a `tensor`.
 
@@ -523,8 +524,8 @@ def Bufferization_ToBufferOp : Bufferization_Op<"to_buffer", [
     the returned buffer) will not be written to.
   }];
 
-  let arguments = (ins AnyTensor:$tensor, UnitAttr:$read_only);
-  let results = (outs AnyRankedOrUnrankedMemRef:$memref);
+  let arguments = (ins Bufferization_TensorLikeTypeInterface:$tensor, UnitAttr:$read_only);
+  let results = (outs Bufferization_BufferLikeTypeInterface:$buffer);
 
   let extraClassDeclaration = [{
     //===------------------------------------------------------------------===//
@@ -559,7 +560,7 @@ def Bufferization_ToBufferOp : Bufferization_Op<"to_buffer", [
   }];
 
   let assemblyFormat = [{
-    $tensor (`read_only` $read_only^)? attr-dict `:` type($tensor) `to` type($memref)
+    $tensor (`read_only` $read_only^)? attr-dict `:` type($tensor) `to` type($buffer)
   }];
 
   let hasFolder = 1;
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h
index 5faa1479ee54..cbb6054fcf88 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h
@@ -13,8 +13,15 @@
 // Bufferization Type Interfaces
 //===----------------------------------------------------------------------===//
 
+#include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/Types.h"
 
+namespace mlir::bufferization {
+struct BufferizationOptions;
+class BufferizationState;
+class BufferLikeType;
+} // namespace mlir::bufferization
+
 #include "mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h.inc"
 
 #endif // MLIR_DIALECT_BUFFERIZATION_IR_BUFFERIZATIONTYPEINTERFACES_H_
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.td
index f19224a29564..fb6fc4f5ad96 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.td
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.td
@@ -21,10 +21,30 @@ def Bufferization_TensorLikeTypeInterface
   let description = [{
     Indicates that this type is a tensor type (similarly to a MLIR builtin
     tensor) for bufferization purposes.
-
-    The interface currently has no methods as it is used by types to opt into
-    being supported by the bufferization procedures.
   }];
+
+  let methods = [
+    InterfaceMethod<[{
+        Returns a BufferLike type for this TensorLike type.
+      }],
+      /*retTy=*/"::mlir::FailureOr<::mlir::bufferization::BufferLikeType>",
+      /*methodName=*/"getBufferType",
+      /*args=*/(ins
+        "const ::mlir::bufferization::BufferizationOptions &":$options,
+        "::llvm::function_ref<::mlir::InFlightDiagnostic()>":$emitError
+      )
+    >,
+    InterfaceMethod<[{
+        Returns whether a BufferLike type is compatible to this TensorLike type.
+        The BufferLike type is assumed to be created by getBufferType().
+      }],
+      /*retTy=*/"::mlir::LogicalResult",
+      /*methodName=*/"verifyCompatibleBufferType",
+      /*args=*/(ins
+        "::mlir::bufferization::BufferLikeType":$bufferType,
+        "::llvm::function_ref<::mlir::InFlightDiagnostic()>":$emitError)
+    >
+  ];
 }
 
 def Bufferization_BufferLikeTypeInterface
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h b/mlir/include/mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h
index a441b8b66659..f56c10555f02 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h
@@ -65,12 +65,13 @@ struct OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel
         // The operand was already bufferized. Take its type directly.
         callerType = memrefType;
       } else {
-        FailureOr<BaseMemRefType> maybeCallerType =
+        FailureOr<BufferLikeType> maybeCallerType =
             bufferization::getBufferType(opOperand->get(), options, state,
                                          invocationStack);
         if (failed(maybeCallerType))
           return failure();
-        callerType = *maybeCallerType;
+        assert(isa<BaseMemRefType>(*maybeCallerType) && "expected memref type");
+        callerType = cast<BaseMemRefType>(*maybeCallerType);
       }
 
       if (!bufferType) {
diff --git a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp
index a57d58ab28d2..85d1b5ac73bf 100644
--- a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -164,8 +164,8 @@ struct SelectOpInterface
     // buffers have different types, they differ only in their layout map. Cast
     // both of them to the most dynamic MemRef type.
     if (trueBuffer.getType() != falseBuffer.getType()) {
-      auto targetType =
-          bufferization::getBufferType(selectOp.getResult(), options, state);
+      auto targetType = bufferization::detail::asMemRefType(
+          bufferization::getBufferType(selectOp.getResult(), options, state));
       if (failed(targetType))
         return failure();
       if (trueBuffer.getType() != *targetType)
@@ -187,10 +187,12 @@ struct SelectOpInterface
                 SmallVector<Value> &invocationStack) const {
     auto selectOp = cast<arith::SelectOp>(op);
     assert(value == selectOp.getResult() && "invalid value");
-    auto trueType = bufferization::getBufferType(
-        selectOp.getTrueValue(), options, state, invocationStack);
-    auto falseType = bufferization::getBufferType(
-        selectOp.getFalseValue(), options, state, invocationStack);
+    auto trueType =
+        bufferization::detail::asMemRefType(bufferization::getBufferType(
+            selectOp.getTrueValue(), options, state, invocationStack));
+    auto falseType =
+        bufferization::detail::asMemRefType(bufferization::getBufferType(
+            selectOp.getFalseValue(), options, state, invocationStack));
     if (failed(trueType) || failed(falseType))
       return failure();
     if (*trueType == *falseType)
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index dd43647682ea..2ab182c9b7b2 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -171,7 +171,9 @@ FailureOr<Value> bufferization::allocateTensorForShapedValue(
   if (llvm::isa<RankedTensorType>(shapedValue.getType())) {
     tensor = shapedValue;
   } else if (llvm::isa<MemRefType>(shapedValue.getType())) {
-    tensor = b.create<ToTensorOp>(loc, shapedValue);
+    tensor = b.create<ToTensorOp>(
+        loc, memref::getTensorTypeFromMemRefType(shapedValue.getType()),
+        shapedValue);
   } else if (llvm::isa<UnrankedTensorType>(shapedValue.getType()) ||
              llvm::isa<UnrankedMemRefType>(shapedValue.getType())) {
     return getOwnerOfValue(shapedValue)
@@ -211,8 +213,8 @@ FailureOr<Value> bufferization::allocateTensorForShapedValue(
   // Add 'memory_space' attribute. Not needed if 'copy' operand is specified.
   if (copy)
     return allocTensorOp.getResult();
-  FailureOr<BaseMemRefType> copyBufferType =
-      getBufferType(tensor, options, state);
+  auto copyBufferType =
+      detail::asMemRefType(getBufferType(tensor, options, state));
   if (failed(copyBufferType))
     return failure();
   std::optional<Attribute> memorySpace = copyBufferType->getMemorySpace();
@@ -672,28 +674,28 @@ FailureOr<Value> bufferization::getBuffer(RewriterBase &rewriter, Value value,
                                           const BufferizationOptions &options,
                                           const BufferizationState &state) {
 #ifndef NDEBUG
-  auto tensorType = llvm::dyn_cast<TensorType>(value.getType());
+  auto tensorType = llvm::dyn_cast<TensorLikeType>(value.getType());
   assert(tensorType && "unexpected non-tensor type");
 #endif // NDEBUG
 
   // Replace "%t = to_tensor %m" with %m.
   if (auto toTensorOp = value.getDefiningOp<bufferization::ToTensorOp>())
-    return toTensorOp.getMemref();
+    return toTensorOp.getBuffer();
 
   // Insert to_buffer op.
   OpBuilder::InsertionGuard g(rewriter);
   setInsertionPointAfter(rewriter, value);
-  FailureOr<BaseMemRefType> memrefType = getBufferType(value, options, state);
-  if (failed(memrefType))
+  FailureOr<BufferLikeType> bufferType = getBufferType(value, options, state);
+  if (failed(bufferType))
     return failure();
-  ensureToBufferOpIsValid(value, *memrefType);
+  ensureToBufferOpIsValid(value, *bufferType);
   return rewriter
-      .create<bufferization::ToBufferOp>(value.getLoc(), *memrefType, value)
+      .create<bufferization::ToBufferOp>(value.getLoc(), *bufferType, value)
       .getResult();
 }
 
 /// Return the buffer type for a given Value (tensor) after bufferization.
-FailureOr<BaseMemRefType>
+FailureOr<BufferLikeType>
 bufferization::getBufferType(Value value, const BufferizationOptions &options,
                              const BufferizationState &state) {
   SmallVector<Value> invocationStack;
@@ -701,11 +703,11 @@ bufferization::getBufferType(Value value, const BufferizationOptions &options,
 }
 
 /// Return the buffer type for a given Value (tensor) after bufferization.
-FailureOr<BaseMemRefType>
+FailureOr<BufferLikeType>
 bufferization::getBufferType(Value value, const BufferizationOptions &options,
                              const BufferizationState &state,
                              SmallVector<Value> &invocationStack) {
-  assert(llvm::isa<TensorType>(value.getType()) &&
+  assert(llvm::isa<TensorLikeType>(value.getType()) &&
          "unexpected non-tensor type");
   invocationStack.push_back(value);
   auto popFromStack =
@@ -718,13 +720,9 @@ bufferization::getBufferType(Value value, const BufferizationOptions &options,
     return bufferizableOp.getBufferType(value, options, state, invocationStack);
 
   // Op is not bufferizable.
-  auto memSpace =
-      options.defaultMemorySpaceFn(cast<TensorType>(value.getType()));
-  if (!memSpace.has_value())
-    return op->emitError("could not infer memory space");
-
-  return getMemRefType(cast<TensorType>(value.getType()), options,
-                       /*layout=*/{}, *memSpace);
+  return cast<TensorLikeType>(value.getType()).getBufferType(options, [&]() {
+    return op->emitError();
+  });
 }
 
 bool bufferization::hasTensorSemantics(Operation *op) {
@@ -744,12 +742,11 @@ void bufferization::replaceOpWithBufferizedValues(RewriterBase &rewriter,
   SmallVector<Value> replacements;
   for (OpResult opResult : op->getOpResults()) {
     Value replacement = values[opResult.getResultNumber()];
-    if (llvm::isa<TensorType>(opResult.getType())) {
+    if (llvm::isa<TensorLikeType>(opResult.getType())) {
       // The OpResult is a tensor. Such values are replaced with memrefs during
       // bufferization.
-      assert((llvm::isa<MemRefType>(replacement.getType()) ||
-              llvm::isa<UnrankedMemRefType>(replacement.getType())) &&
-             "tensor op result should be replaced with a memref value");
+      assert(llvm::isa<BufferLikeType>(replacement.getType()) &&
+             "tensor op result should be replaced with a buffer value");
       // The existing uses of the OpResult still expect a tensor. Insert a
       // ToTensorOp. Throughout bufferization, this ToTensorOp will gradually
       // loose all of its users and eventually DCE away.
@@ -969,8 +966,8 @@ FailureOr<BaseMemRefType> bufferization::detail::defaultGetBufferType(
     // If the OpResult has an equivalent OpOperand, both OpResult and
     // OpOperand bufferize to the exact same buffer type.
     Value equivalentOperand = aliases.getAliases().front().opOperand->get();
-    return getBufferType(equivalentOperand, options, bufferizationState,
-                         invocationStack);
+    return asMemRefType(getBufferType(equivalentOperand, options,
+                                      bufferizationState, invocationStack));
   }
 
   // If we do not know the memory space and there is no default memory space,
@@ -1030,7 +1027,7 @@ bufferization::detail::unknownGetAliasingValues(OpOperand &opOperand) {
 }
 
 bool bufferization::detail::defaultHasTensorSemantics(Operation *op) {
-  auto isaTensor = [](Type t) { return isa<TensorType>(t); };
+  auto isaTensor = [](Type t) { return isa<TensorLikeType>(t); };
   bool hasTensorBlockArgument = any_of(op->getRegions(), [&](Region &r) {
     return any_of(r.getBlocks(), [&](Block &b) {
       return any_of(b.getArguments(), [&](BlockArgument bbArg) {
@@ -1045,3 +1042,19 @@ bool bufferization::detail::defaultHasTensorSemantics(Operation *op) {
     return true;
   return any_of(op->getOperandTypes(), isaTensor);
 }
+
+FailureOr<BaseMemRefType>
+bufferization::detail::asMemRefType(FailureOr<BufferLikeType> bufferType) {
+  if (failed(bufferType))
+    return failure();
+  return cast<BaseMemRefType>(*bufferType);
+}
+
+bool bufferization::detail::typesMatchAfterBufferization(Operation &op,
+                                                         Value tensor,
+                                                         Value buffer) {
+  return mlir::succeeded(
+      cast<TensorLikeType>(tensor.getType())
+          .verifyCompatibleBufferType(cast<BufferLikeType>(buffer.getType()),
+                                      [&]() { return op.emitError(); }));
+}
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp
index d8eac01c2dea..6c08cdfb669f 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationDialect.cpp
@@ -57,7 +57,37 @@ struct BufferizationInlinerInterface : public DialectInlinerInterface {
 template <typename Tensor>
 struct BuiltinTensorExternalModel
     : TensorLikeType::ExternalModel<BuiltinTensorExternalModel<Tensor>,
-                                    Tensor> {};
+                                    Tensor> {
+  llvm::FailureOr<BufferLikeType> getBufferType(
+      mlir::Type tensor, const BufferizationOptions &options,
+      llvm::function_ref<mlir::InFlightDiagnostic()> emitError) const {
+    auto tensorType = cast<TensorType>(tensor);
+    auto memSpace = options.defaultMemorySpaceFn(tensorType);
+    if (!memSpace.has_value())
+      return emitError() << "could not infer memory space";
+
+    return cast<BufferLikeType>(
+        getMemRefType(tensorType, options, /*layout=*/{}, *memSpace));
+  }
+
+  mlir::LogicalResult verifyCompatibleBufferType(
+      mlir::Type tensor, BufferLikeType bufferType,
+      llvm::function_ref<mlir::InFlightDiagnostic()> emitError) const {
+    assert(isa<TensorType>(tensor) && "expected tensor type");
+    assert(isa<BaseMemRefType>(bufferType) && "expected memref type");
+
+    auto tensorType = cast<ShapedType>(tensor);
+    auto memrefType = cast<ShapedType>(bufferType);
+
+    if (tensorType.getShape() != memrefType.getShape())
+      return emitError() << "shapes do not match";
+
+    if (tensorType.getElementType() != memrefType.getElementType())
+      return emitError() << "element types do not match";
+
+    return mlir::success();
+  }
+};
 
 template <typename MemRef>
 struct BuiltinMemRefExternalModel
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
index dc54ac94aed3..9bd87d66c7d3 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
@@ -90,12 +90,12 @@ LogicalResult mlir::bufferization::foldToBufferToTensorPair(
   if (!bufferToTensor)
     return failure();
 
-  Type srcType = bufferToTensor.getMemref().getType();
+  Type srcType = bufferToTensor.getBuffer().getType();
   Type destType = toBuffer.getType();
 
   // Directly rewrite if the type did not change.
   if (srcType == destType) {
-    rewriter.replaceOp(toBuffer, bufferToTensor.getMemref());
+    rewriter.replaceOp(toBuffer, bufferToTensor.getBuffer());
     return success();
   }
 
@@ -106,7 +106,7 @@ LogicalResult mlir::bufferization::foldToBufferToTensorPair(
   // Ranked memref -> Ranked memref cast.
   if (rankedSrcType && rankedDestType) {
     FailureOr<Value> replacement = castOrReallocMemRefValue(
-        rewriter, bufferToTensor.getMemref(), rankedDestType, options);
+        rewriter, bufferToTensor.getBuffer(), rankedDestType, options);
     if (failed(replacement))
       return failure();
 
@@ -124,7 +124,7 @@ LogicalResult mlir::bufferization::foldToBufferToTensorPair(
   assert(memref::CastOp::areCastCompatible(srcType, destType) &&
          "expected that types are cast compatible");
   rewriter.replaceOpWithNewOp<memref::CastOp>(toBuffer, destType,
-                                              bufferToTensor.getMemref());
+                                              bufferToTensor.getBuffer());
   return success();
 }
 
@@ -233,8 +233,9 @@ AllocTensorOp::getBufferType(Value value, const BufferizationOptions &options,
   if (getMemorySpace().has_value()) {
     memorySpace = *getMemorySpace();
   } else if (getCopy()) {
-    auto copyBufferType = bufferization::getBufferType(getCopy(), options,
-                                                       state, invocationStack);
+    auto copyBufferType =
+        bufferization::detail::asMemRefType(bufferization::getBufferType(
+            getCopy(), options, state, invocationStack));
     if (failed(copyBufferType))
       return failure();
     memorySpace = copyBufferType->getMemorySpace();
@@ -642,8 +643,9 @@ Value MaterializeInDestinationOp::buildSubsetExtraction(OpBuilder &builder,
   assert(getRestrict() &&
          "expected that ops with memrefs dest have 'restrict'");
   setRestrict(false);
-  return builder.create<ToTensorOp>(loc, getDest(), /*restrict=*/true,
-                                    getWritable());
+  return builder.create<ToTensorOp>(
+      loc, memref::getTensorTypeFromMemRefType(getDest().getType()), getDest(),
+      /*restrict=*/true, getWritable());
 }
 
 bool MaterializeInDestinationOp::isEquivalentSubset(
@@ -744,7 +746,7 @@ bool ToTensorOp::isWritable(Value value, const AnalysisState &state) {
 }
 
 OpFoldResult ToTensorOp::fold(FoldAdaptor) {
-  if (auto toBuffer = getMemref().getDefiningOp<ToBufferOp>())
+  if (auto toBuffer = getBuffer().getDefiningOp<ToBufferOp>())
     // Approximate alias analysis by conservatively folding only when no there
     // is no interleaved operation.
     if (toBuffer->getBlock() == this->getOperation()->getBlock() &&
@@ -764,7 +766,7 @@ struct DimOfToTensorFolder : public OpRewritePattern<tensor::DimOp> {
       return failure();
 
     rewriter.replaceOpWithNewOp<memref::DimOp>(
-        dimOp, memrefToTensorOp.getMemref(), dimOp.getIndex());
+        dimOp, memrefToTensorOp.getBuffer(), dimOp.getIndex());
     return success();
   }
 };
@@ -781,8 +783,8 @@ void ToTensorOp::getCanonicalizationPatterns(RewritePatternSet &results,
 
 OpFoldResult ToBufferOp::fold(FoldAdaptor) {
   if (auto memrefToTensor = getTensor().getDefiningOp<ToTensorOp>())
-    if (memrefToTensor.getMemref().getType() == getType())
-      return memrefToTensor.getMemref();
+    if (memrefToTensor.getBuffer().getType() == getType())
+      return memrefToTensor.getBuffer();
   return {};
 }
 
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationTypeInterfaces.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationTypeInterfaces.cpp
new file mode 100644
index 000000000000..0e973915c6fc
--- /dev/null
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationTypeInterfaces.cpp
@@ -0,0 +1,21 @@
+//===- BufferizationTypeInterfaces.cpp - Type Interfaces --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.h"
+
+//===----------------------------------------------------------------------===//
+// Bufferization Type Interfaces
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+namespace bufferization {
+
+#include "mlir/Dialect/Bufferization/IR/BufferizationTypeInterfaces.cpp.inc"
+
+} // namespace bufferization
+} // namespace mlir
diff --git a/mlir/lib/Dialect/Bufferization/IR/CMakeLists.txt b/mlir/lib/Dialect/Bufferization/IR/CMakeLists.txt
index 63dcc1eb233e..5d8f0060f2c3 100644
--- a/mlir/lib/Dialect/Bufferization/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Bufferization/IR/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRBufferizationDialect
   BufferizationDialect.cpp
   BufferViewFlowOpInterface.cpp
   UnstructuredControlFlow.cpp
+  BufferizationTypeInterfaces.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Bufferization
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 7e9b9119ce94..6472ef3eff2a 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -412,11 +412,11 @@ bufferization::bufferizeBlockSignature(Block *block, RewriterBase &rewriter,
       continue;
     }
 
-    FailureOr<BaseMemRefType> memrefType =
+    FailureOr<BufferLikeType> bufferType =
         bufferization::getBufferType(bbArg, options, state);
-    if (failed(memrefType))
+    if (failed(bufferType))
       return failure();
-    newTypes.push_back(*memrefType);
+    newTypes.push_back(*bufferType);
   }
 
   // Change the type of all block arguments.
@@ -463,7 +463,7 @@ bufferization::bufferizeBlockSignature(Block *block, RewriterBase &rewriter,
         newOperands.push_back(operand);
         continue;
       }
-      FailureOr<BaseMemRefType> operandBufferType =
+      FailureOr<BufferLikeType> operandBufferType =
           bufferization::getBufferType(operand, options, state);
       if (failed(operandBufferType))
         return failure();
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
index a0168da44b7b..453ed43bcadd 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
@@ -255,7 +255,7 @@ struct CallOpInterface
       }
 
       // Returning a memref.
-      FailureOr<BaseMemRefType> resultType =
+      FailureOr<BufferLikeType> resultType =
           bufferization::getBufferType(result, options, state);
       if (failed(resultType))
         return failure();
@@ -290,13 +290,13 @@ struct CallOpInterface
         // The called function was not bufferized yet. This can happen when
         // there cycles in the function call graph. Compute the bufferized
         // result type.
-        FailureOr<BaseMemRefType> maybeMemRefType =
+        FailureOr<BufferLikeType> maybeBufferType =
             bufferization::getBufferType(
                 funcOp.getArgument(opOperand.getOperandNumber()), options,
                 state);
-        if (failed(maybeMemRefType))
+        if (failed(maybeBufferType))
           return failure();
-        memRefType = *maybeMemRefType;
+        memRefType = *maybeBufferType;
       }
 
       // Since we don't yet have a clear layout story, to_buffer may
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
index 94a4b9011c16..573420f6a9aa 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
@@ -252,7 +252,8 @@ Value linalg::bufferizeToAllocation(
   // Create bufferization.to_tensor with "restrict" and "writable". The returned
   // tensor is a new buffer allocation, so it does not alias with any buffer.
   Value toTensorOp = rewriter.create<bufferization::ToTensorOp>(
-      loc, alloc, /*restrict=*/true, /*writable=*/true);
+      loc, padOp.getResult().getType(), alloc, /*restrict=*/true,
+      /*writable=*/true);
   rewriter.replaceOp(padOp, toTensorOp);
   return alloc;
 }
@@ -340,7 +341,8 @@ Value linalg::bufferizeToAllocation(
   // Create bufferization.to_tensor with "restrict" and "writable". The returned
   // tensor is a new buffer allocation, so it does not alias with any buffer.
   Value toTensorOp = rewriter.create<bufferization::ToTensorOp>(
-      loc, alloc, /*restrict=*/true, /*writable=*/true);
+      loc, allocTensorOp.getResult().getType(), alloc, /*restrict=*/true,
+      /*writable=*/true);
   rewriter.replaceOp(allocTensorOp, toTensorOp);
   return alloc;
 }
@@ -567,7 +569,8 @@ Value linalg::bufferizeToAllocation(
       createMemcpy(rewriter, op->getLoc(), operand->get(), alloc, options);
     }
     rewriter.modifyOpInPlace(op, [&]() {
-      auto toTensorOp = rewriter.create<ToTensorOp>(op->getLoc(), alloc);
+      auto toTensorOp = rewriter.create<ToTensorOp>(
+          op->getLoc(), operand->get().getType(), alloc);
       operand->set(toTensorOp);
       if (options.bufferizeDestinationOnly) {
         rewriter.modifyOpInPlace(toTensorOp, [&]() {
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index 46fa77a7dc4e..58562536be61 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -108,7 +108,7 @@ struct ConditionOpInterface
             getBuffer(rewriter, value, options, state);
         if (failed(maybeBuffer))
           return failure();
-        FailureOr<BaseMemRefType> resultType = bufferization::getBufferType(
+        FailureOr<BufferLikeType> resultType = bufferization::getBufferType(
             whileOp.getAfterArguments()[it.index()], options, state);
         if (failed(resultType))
           return failure();
@@ -292,8 +292,9 @@ struct IfOpInterface
       // True branch was already bufferized.
       thenBufferType = cast<BaseMemRefType>(thenValue.getType());
     } else {
-      auto maybeBufferType = bufferization::getBufferType(
-          thenValue, options, state, invocationStack);
+      auto maybeBufferType =
+          bufferization::detail::asMemRefType(bufferization::getBufferType(
+              thenValue, options, state, invocationStack));
       if (failed(maybeBufferType))
         return failure();
       thenBufferType = *maybeBufferType;
@@ -302,8 +303,9 @@ struct IfOpInterface
       // False branch was already bufferized.
       elseBufferType = cast<BaseMemRefType>(elseValue.getType());
     } else {
-      auto maybeBufferType = bufferization::getBufferType(
-          elseValue, options, state, invocationStack);
+      auto maybeBufferType =
+          bufferization::detail::asMemRefType(bufferization::getBufferType(
+              elseValue, options, state, invocationStack));
       if (failed(maybeBufferType))
         return failure();
       elseBufferType = *maybeBufferType;
@@ -406,9 +408,7 @@ struct IndexSwitchOpInterface
         return bufferType;
       auto maybeBufferType = bufferization::getBufferType(
           yieldedValue, options, state, invocationStack);
-      if (failed(maybeBufferType))
-        return failure();
-      return maybeBufferType;
+      return bufferization::detail::asMemRefType(maybeBufferType);
     };
 
     // Compute buffer type of the default case.
@@ -527,8 +527,8 @@ static FailureOr<BaseMemRefType> computeLoopRegionIterArgBufferType(
     const BufferizationOptions &options, const BufferizationState &state,
     SmallVector<Value> &invocationStack) {
   // Determine the buffer type of the init_arg.
-  auto initArgBufferType =
-      bufferization::getBufferType(initArg, options, state, invocationStack);
+  auto initArgBufferType = bufferization::detail::asMemRefType(
+      bufferization::getBufferType(initArg, options, state, invocationStack));
   if (failed(initArgBufferType))
     return failure();
 
@@ -554,8 +554,9 @@ static FailureOr<BaseMemRefType> computeLoopRegionIterArgBufferType(
   } else {
     // Note: This typically triggers a recursive call for the buffer type of
     // the iter_arg.
-    auto maybeBufferType = bufferization::getBufferType(yieldedValue, options,
-                                                        state, invocationStack);
+    auto maybeBufferType =
+        bufferization::detail::asMemRefType(bufferization::getBufferType(
+            yieldedValue, options, state, invocationStack));
     if (failed(maybeBufferType))
       return failure();
     yieldedValueBufferType = *maybeBufferType;
@@ -718,8 +719,12 @@ struct ForOpInterface
     if (auto opResult = dyn_cast<OpResult>(value)) {
       // The type of an OpResult must match the corresponding iter_arg type.
       BlockArgument bbArg = forOp.getTiedLoopRegionIterArg(opResult);
-      return bufferization::getBufferType(bbArg, options, state,
-                                          invocationStack);
+      auto bufferType =
+          bufferization::getBufferType(bbArg, options, state, invocationStack);
+      if (failed(bufferType))
+        return failure();
+      assert(isa<BaseMemRefType>(*bufferType) && "expected memref type");
+      return cast<BaseMemRefType>(*bufferType);
     }
 
     // Compute result/argument number.
@@ -1078,8 +1083,8 @@ struct WhileOpInterface
       // scf.condition was already bufferized.
       return cast<BaseMemRefType>(conditionYieldedVal.getType());
     }
-    return bufferization::getBufferType(conditionYieldedVal, options, state,
-                                        invocationStack);
+    return bufferization::detail::asMemRefType(bufferization::getBufferType(
+        conditionYieldedVal, options, state, invocationStack));
   }
 
   /// Assert that yielded values of an scf.while op are equivalent to their
@@ -1185,14 +1190,14 @@ struct YieldOpInterface
         // We may have to cast the value before yielding it.
         if (isa<scf::ForOp, scf::IfOp, scf::IndexSwitchOp>(
                 yieldOp->getParentOp())) {
-          FailureOr<BaseMemRefType> resultType = bufferization::getBufferType(
+          FailureOr<BufferLikeType> resultType = bufferization::getBufferType(
               yieldOp->getParentOp()->getResult(it.index()), options, state);
           if (failed(resultType))
             return failure();
           buffer = castBuffer(rewriter, buffer, *resultType);
         } else if (auto whileOp =
                        dyn_cast<scf::WhileOp>(yieldOp->getParentOp())) {
-          FailureOr<BaseMemRefType> resultType = bufferization::getBufferType(
+          FailureOr<BufferLikeType> resultType = bufferization::getBufferType(
               whileOp.getBeforeArguments()[it.index()], options, state);
           if (failed(resultType))
             return failure();
@@ -1307,15 +1312,15 @@ struct ForallOpInterface
     if (auto bbArg = dyn_cast<BlockArgument>(value))
       // A tensor block argument has the same bufferized type as the
       // corresponding output operand.
-      return bufferization::getBufferType(
-          forallOp.getTiedOpOperand(bbArg)->get(), options, state,
-          invocationStack);
+      return bufferization::detail::asMemRefType(
+          bufferization::getBufferType(forallOp.getTiedOpOperand(bbArg)->get(),
+                                       options, state, invocationStack));
 
     // The bufferized result type is the same as the bufferized type of the
     // corresponding output operand.
-    return bufferization::getBufferType(
+    return bufferization::detail::asMemRefType(bufferization::getBufferType(
         forallOp.getOutputs()[cast<OpResult>(value).getResultNumber()], options,
-        state, invocationStack);
+        state, invocationStack));
   }
 
   bool isRepetitiveRegion(Operation *op, unsigned index) const {
diff --git a/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp
index dc91117a5193..8a471c12d21e 100644
--- a/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -67,7 +67,7 @@ struct AssumingOpInterface
     for (const auto &it : llvm::enumerate(assumingOp->getResultTypes())) {
       if (isa<TensorType>(it.value())) {
         newResults.push_back(rewriter.create<bufferization::ToTensorOp>(
-            assumingOp.getLoc(), newOp->getResult(it.index())));
+            assumingOp.getLoc(), it.value(), newOp->getResult(it.index())));
       } else {
         newResults.push_back(newOp->getResult(it.index()));
       }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index e5f2418367a5..e89b34d457ff 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -651,7 +651,7 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
   tokens.clear();
 
   // Done.
-  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, memY);
+  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, y.getType(), memY);
   return success();
 }
 
@@ -752,7 +752,7 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
   tokens.clear();
 
   // Done.
-  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);
+  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, c.getType(), bufC);
   return success();
 }
 
@@ -925,9 +925,12 @@ static LogicalResult rewriteSpGEMM(PatternRewriter &rewriter,
   tokens.clear();
 
   // Done.
-  Value vt = rewriter.create<bufferization::ToTensorOp>(loc, valH);
-  Value rt = rewriter.create<bufferization::ToTensorOp>(loc, rowH);
-  Value ct = rewriter.create<bufferization::ToTensorOp>(loc, colH);
+  Value vt = rewriter.create<bufferization::ToTensorOp>(
+      loc, memref::getTensorTypeFromMemRefType(valH.getType()), valH);
+  Value rt = rewriter.create<bufferization::ToTensorOp>(
+      loc, memref::getTensorTypeFromMemRefType(rowH.getType()), rowH);
+  Value ct = rewriter.create<bufferization::ToTensorOp>(
+      loc, memref::getTensorTypeFromMemRefType(colH.getType()), colH);
   rewriter.replaceOpWithNewOp<AssembleOp>(op, c.getType(), ValueRange{rt, ct},
                                           vt);
   return success();
@@ -1043,7 +1046,7 @@ static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
   tokens.clear();
 
   // Done.
-  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);
+  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, C.getType(), bufC);
   return success();
 }
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index e5f9717c3fba..14ced56b8365 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -1471,7 +1471,8 @@ struct SparseDisassembleOpConverter
     // Converts MemRefs back to Tensors.
     SmallVector<Value> retValues = llvm::to_vector(
         llvm::map_range(retMem, [&rewriter, loc](Value v) -> Value {
-          return rewriter.create<bufferization::ToTensorOp>(loc, v);
+          return rewriter.create<bufferization::ToTensorOp>(
+              loc, memref::getTensorTypeFromMemRefType(v.getType()), v);
         }));
     // Appends the actual memory length used in each buffer returned.
     retValues.append(retLen.begin(), retLen.end());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
index 9ffa64dc821d..7f0b65768744 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -867,7 +867,9 @@ public:
     // Converts MemRefs back to Tensors.
     assert(retVal.size() + retLen.size() == op.getNumResults());
     for (unsigned i = 0, sz = retVal.size(); i < sz; i++) {
-      auto tensor = rewriter.create<bufferization::ToTensorOp>(loc, retVal[i]);
+      auto tensor = rewriter.create<bufferization::ToTensorOp>(
+          loc, memref::getTensorTypeFromMemRefType(retVal[i].getType()),
+          retVal[i]);
       retVal[i] =
           rewriter.create<tensor::CastOp>(loc, op.getResultTypes()[i], tensor);
     }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp
index 57291064eba2..1bd9563b3db0 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp
@@ -549,8 +549,8 @@ TypedValue<BaseMemRefType>
 sparse_tensor::genToMemref(OpBuilder &builder, Location loc, Value tensor) {
   auto tTp = llvm::cast<TensorType>(tensor.getType());
   auto mTp = MemRefType::get(tTp.getShape(), tTp.getElementType());
-  return builder.create<bufferization::ToBufferOp>(loc, mTp, tensor)
-      .getResult();
+  return cast<TypedValue<BaseMemRefType>>(
+      builder.create<bufferization::ToBufferOp>(loc, mTp, tensor).getResult());
 }
 
 Value sparse_tensor::createOrFoldSliceOffsetOp(OpBuilder &builder, Location loc,
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index 4b778b768d13..729c048db456 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -54,8 +54,9 @@ struct CastOpInterface
                 const BufferizationState &state,
                 SmallVector<Value> &invocationStack) const {
     auto castOp = cast<tensor::CastOp>(op);
-    auto maybeSrcBufferType = bufferization::getBufferType(
-        castOp.getSource(), options, state, invocationStack);
+    auto maybeSrcBufferType =
+        bufferization::detail::asMemRefType(bufferization::getBufferType(
+            castOp.getSource(), options, state, invocationStack));
     if (failed(maybeSrcBufferType))
       return failure();
     Attribute memorySpace = maybeSrcBufferType->getMemorySpace();
@@ -500,8 +501,8 @@ struct FromElementsOpInterface
         /*copy=*/false);
     if (failed(tensorAlloc))
       return failure();
-    FailureOr<BaseMemRefType> memrefType =
-        bufferization::getBufferType(*tensorAlloc, options, state);
+    FailureOr<BaseMemRefType> memrefType = bufferization::detail::asMemRefType(
+        bufferization::getBufferType(*tensorAlloc, options, state));
     if (failed(memrefType))
       return failure();
     Value buffer = rewriter.create<bufferization::ToBufferOp>(
@@ -758,8 +759,9 @@ struct PadOpInterface
                 SmallVector<Value> &invocationStack) const {
     // Infer memory space from the source tensor.
     auto padOp = cast<tensor::PadOp>(op);
-    auto maybeSrcBufferType = bufferization::getBufferType(
-        padOp.getSource(), options, state, invocationStack);
+    auto maybeSrcBufferType =
+        bufferization::detail::asMemRefType(bufferization::getBufferType(
+            padOp.getSource(), options, state, invocationStack));
     if (failed(maybeSrcBufferType))
       return failure();
     MemRefLayoutAttrInterface layout;
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir
index cd19e3a5e82a..da3c26ce36ba 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir
@@ -268,4 +268,23 @@ func.func @materialize_in_dest_raw(%f: f32, %f2: f32, %idx: index) -> (tensor<5x
   %r = tensor.extract %dest_filled[%idx] : tensor<5xf32>
 
   return %0, %r : tensor<5xf32>, f32
-}
\ No newline at end of file
+}
+
+// -----
+
+// CHECK-LABEL: func.func @test_dialect_op(
+// CHECK-SAME:    %[[ARG:.*]]: !test.test_tensor<[32, 64], f64>
+// CHECK-SAME:  ) -> !test.test_tensor<[32, 128], f64> {
+func.func @test_dialect_op(%arg: !test.test_tensor<[32, 64], f64>)
+    -> !test.test_tensor<[32, 128], f64> {
+  // CHECK: %[[MEMREF:.*]] = bufferization.to_buffer %[[ARG]]
+  // CHECK: %[[DUMMY:.*]] = "test.dummy_memref_op"(%[[MEMREF]])
+  // CHECK-SAME: : (!test.test_memref<[32, 64], f64>)
+  // CHECK-SAME: -> !test.test_memref<[32, 128], f64>
+  // CHECK: %[[OUT:.*]] = bufferization.to_tensor %[[DUMMY]]
+  %out = "test.dummy_tensor_op"(%arg) : (!test.test_tensor<[32, 64], f64>)
+    -> !test.test_tensor<[32, 128], f64>
+
+  // CHECK: return %[[OUT]]
+  return %out : !test.test_tensor<[32, 128], f64>
+}
diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
index b5a8bd10d6b6..78e44c6ec7a9 100644
--- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
+++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
@@ -8,6 +8,7 @@
 
 #include "TestDialect.h"
 #include "TestOps.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/Interfaces/FunctionImplementation.h"
@@ -1387,3 +1388,25 @@ TestMultiSlotAlloca::handleDestructuringComplete(
     const DestructurableMemorySlot &slot, OpBuilder &builder) {
   return createNewMultiAllocaWithoutSlot(slot, builder, *this);
 }
+
+::mlir::LogicalResult test::TestDummyTensorOp::bufferize(
+    ::mlir::RewriterBase &rewriter,
+    const ::mlir::bufferization::BufferizationOptions &options,
+    ::mlir::bufferization::BufferizationState &state) {
+  auto buffer =
+      mlir::bufferization::getBuffer(rewriter, getInput(), options, state);
+  if (mlir::failed(buffer))
+    return failure();
+
+  const auto outType = getOutput().getType();
+  const auto bufferizedOutType = test::TestMemrefType::get(
+      getContext(), outType.getShape(), outType.getElementType(), nullptr);
+  // replace op with memref analogy
+  auto dummyMemrefOp = rewriter.create<test::TestDummyMemrefOp>(
+      getLoc(), bufferizedOutType, *buffer);
+
+  mlir::bufferization::replaceOpWithBufferizedValues(rewriter, getOperation(),
+                                                     dummyMemrefOp.getResult());
+
+  return mlir::success();
+}
diff --git a/mlir/test/lib/Dialect/Test/TestOps.h b/mlir/test/lib/Dialect/Test/TestOps.h
index c2ee5f9ab9a5..b414b47c8742 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.h
+++ b/mlir/test/lib/Dialect/Test/TestOps.h
@@ -13,6 +13,7 @@
 #include "TestInterfaces.h"
 #include "TestTypes.h"
 #include "mlir/Bytecode/BytecodeImplementation.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/DLTI/Traits.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 59330fdb1bb2..79bcd9c2e0a9 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -31,7 +31,7 @@ include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/LoopLikeInterface.td"
 include "mlir/Interfaces/MemorySlotInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
-
+include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td"
 
 // Include the attribute definitions.
 include "TestAttrDefs.td"
@@ -2825,7 +2825,7 @@ def TestNVVMRequiresSMArchCondOp :
   let assemblyFormat = "attr-dict";
 }
 
-def TestNVVMRequirestSMArchCondMultiOp : 
+def TestNVVMRequirestSMArchCondMultiOp :
     TEST_Op<"nvvm_requires_sm_90a_or_sm_100a", [NVVMRequiresSMa<[90, 100]>]> {
   let arguments = (ins );
   let assemblyFormat = "attr-dict";
@@ -3552,4 +3552,58 @@ def TestAllocWithMultipleResults : TEST_Op<"alloc_with_multiple_results"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// Test Ops bufferization
+//===----------------------------------------------------------------------===//
+
+def TestDummyTensorOp : TEST_Op<"dummy_tensor_op", [BufferizableOpInterface]> {
+  let arguments = (ins
+    Arg<TestTensorType>:$input
+  );
+  let results = (outs
+    Arg<TestTensorType>:$output
+  );
+  let extraClassDeclaration = [{
+    // BufferizableOpInterface
+    bool bufferizesToMemoryRead(mlir::OpOperand&,
+      const mlir::bufferization::AnalysisState&);
+
+    bool bufferizesToMemoryWrite(mlir::OpOperand&,
+      const mlir::bufferization::AnalysisState&);
+
+    mlir::bufferization::AliasingValueList getAliasingValues(mlir::OpOperand&,
+      const mlir::bufferization::AnalysisState&);
+
+    mlir::LogicalResult bufferize(
+      mlir::RewriterBase& rewriter,
+      const mlir::bufferization::BufferizationOptions& options,
+      mlir::bufferization::BufferizationState &state);
+  }];
+
+  let extraClassDefinition = [{
+    bool test::TestDummyTensorOp::bufferizesToMemoryRead(::mlir::OpOperand&,
+        const ::mlir::bufferization::AnalysisState&) {
+      return true;
+    }
+    bool test::TestDummyTensorOp::bufferizesToMemoryWrite(::mlir::OpOperand&,
+        const ::mlir::bufferization::AnalysisState&) {
+      return true;
+    }
+    ::mlir::bufferization::AliasingValueList
+    test::TestDummyTensorOp::getAliasingValues(::mlir::OpOperand&,
+        const ::mlir::bufferization::AnalysisState&) {
+      return {};
+    }
+  }];
+}
+
+def TestDummyMemrefOp : TEST_Op<"dummy_memref_op", []> {
+  let arguments = (ins
+    Arg<TestMemrefType>:$input
+  );
+  let results = (outs
+    Arg<TestMemrefType>:$output
+  );
+}
+
 #endif // TEST_OPS
diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
index 09294e84960f..03261f37c815 100644
--- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
@@ -428,6 +428,15 @@ def TestTensorType : Test_Type<"TestTensor",
       return test::TestTensorType::get(
         getContext(), shape.value_or(getShape()), elementType);
     }
+
+    // TensorLikeTypeInterface:
+    ::mlir::FailureOr<::mlir::bufferization::BufferLikeType>
+    getBufferType(const ::mlir::bufferization::BufferizationOptions& options,
+                  ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError);
+
+    ::mlir::LogicalResult verifyCompatibleBufferType(
+        ::mlir::bufferization::BufferLikeType bufferType,
+        ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError);
   }];
 }
 
diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp
index 5c784dcee6e1..2fc2f90ef6bc 100644
--- a/mlir/test/lib/Dialect/Test/TestTypes.cpp
+++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp
@@ -545,3 +545,23 @@ TestTypeOpAsmTypeInterfaceType::getAlias(::llvm::raw_ostream &os) const {
   os << "op_asm_type_interface_type";
   return ::mlir::OpAsmDialectInterface::AliasResult::FinalAlias;
 }
+
+::mlir::FailureOr<::mlir::bufferization::BufferLikeType>
+TestTensorType::getBufferType(
+    const ::mlir::bufferization::BufferizationOptions &,
+    ::llvm::function_ref<::mlir::InFlightDiagnostic()>) {
+  return cast<bufferization::BufferLikeType>(
+      TestMemrefType::get(getContext(), getShape(), getElementType(), nullptr));
+}
+
+::mlir::LogicalResult TestTensorType::verifyCompatibleBufferType(
+    ::mlir::bufferization::BufferLikeType bufferType,
+    ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {
+  auto testMemref = dyn_cast<TestMemrefType>(bufferType);
+  if (!testMemref)
+    return emitError() << "expected TestMemrefType";
+
+  const bool valid = getShape() == testMemref.getShape() &&
+                     getElementType() == testMemref.getElementType();
+  return mlir::success(valid);
+}

From 6f4add34801e6ce02a5ebc96df4d1ca479125649 Mon Sep 17 00:00:00 2001
From: Omair Javaid <omair.javaid@linaro.org>
Date: Wed, 18 Jun 2025 19:23:54 +0500
Subject: [PATCH 0818/1322] [compiler-rt] [Fuzzer] Fix ARMv7 test link failure
 by linking unwinder (#144495)

compiler-rt/lib/fuzzer/tests build was failing on armv7, with undefined
references to unwinder symbols, such as __aeabi_unwind_cpp_pr0.

This occurs because the test is built with `-nostdlib++` but `libunwind`
is not explicitly linked to the final test executable.

This patch resolves the issue by adding CMake logic to explicitly link
the required unwinder to the fuzzer tests, inspired by the same solution
used to fix Scudo build failures by https://reviews.llvm.org/D142888.
---
 compiler-rt/lib/fuzzer/tests/CMakeLists.txt | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/compiler-rt/lib/fuzzer/tests/CMakeLists.txt b/compiler-rt/lib/fuzzer/tests/CMakeLists.txt
index adfae3d63e64..543f486a9d50 100644
--- a/compiler-rt/lib/fuzzer/tests/CMakeLists.txt
+++ b/compiler-rt/lib/fuzzer/tests/CMakeLists.txt
@@ -35,6 +35,27 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND
    COMPILER_RT_LIBCXXABI_PATH)
   list(APPEND LIBFUZZER_UNITTEST_CFLAGS -nostdinc++ -fno-exceptions)
   list(APPEND LIBFUZZER_UNITTEST_LINK_FLAGS -nostdlib++ -fno-exceptions)
+
+  # When we use -nostdlib++, we remove the default C++ runtime which normally
+  # provides the stack unwinding symbols (like __aeabi_unwind_cpp_pr0).
+  # We must now manually find and link a suitable unwinder library.
+  set(FUZZER_UNWINDER_LIBS)
+  if(COMPILER_RT_USE_LLVM_UNWINDER)
+    # Prefer LLVM's own libunwind.
+    list(APPEND FUZZER_UNWINDER_LIBS ${COMPILER_RT_UNWINDER_LINK_LIBS})
+  elseif(COMPILER_RT_HAS_GCC_S_LIB)
+    # As a fallback, use the shared libgcc_s library.
+    list(APPEND FUZZER_UNWINDER_LIBS gcc_s)
+  elseif(COMPILER_RT_HAS_GCC_LIB)
+    # As a final fallback, use the static libgcc library.
+    list(APPEND FUZZER_UNWINDER_LIBS gcc)
+  elseif(NOT COMPILER_RT_USE_BUILTINS_LIBRARY)
+    # If no unwinder is found and we aren't using the builtins library
+    message(FATAL_ERROR "Fuzzer tests require a suitable unwinder, but none was found.")
+  endif()
+  # Add the detected unwinder library to our link flags.
+  list(APPEND LIBFUZZER_UNITTEST_LINK_FLAGS ${FUZZER_UNWINDER_LIBS})
+
 endif()
 
 if ("-fvisibility=hidden" IN_LIST LIBFUZZER_CFLAGS)

From 36038a1048b2aab87ed18f982e960c044ad97670 Mon Sep 17 00:00:00 2001
From: Orlando Cazalet-Hyams <orlando.hyams@sony.com>
Date: Wed, 18 Jun 2025 16:04:18 +0100
Subject: [PATCH 0819/1322] [RemoveDIs][NFC] Remove dbg intrinsic handling code
 from SelectionDAG ISel (#144702)

---
 .../llvm/CodeGen/FunctionLoweringInfo.h       |  4 +-
 llvm/lib/CodeGen/SelectionDAG/FastISel.cpp    | 45 -------------
 .../SelectionDAG/FunctionLoweringInfo.cpp     |  1 -
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 63 -------------------
 .../SelectionDAG/SelectionDAGBuilder.h        |  1 -
 .../CodeGen/SelectionDAG/SelectionDAGISel.cpp |  4 --
 6 files changed, 1 insertion(+), 117 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index 392da3f512df..b892a0e222a4 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -35,7 +35,6 @@ namespace llvm {
 class Argument;
 class BasicBlock;
 class BranchProbabilityInfo;
-class DbgDeclareInst;
 class Function;
 class Instruction;
 class MachineFunction;
@@ -191,9 +190,8 @@ public:
   /// The current call site index being processed, if any. 0 if none.
   unsigned CurCallSite = 0;
 
-  /// Collection of dbg.declare instructions handled after argument
+  /// Collection of dbg_declare instructions handled after argument
   /// lowering and before ISel proper.
-  SmallPtrSet<const DbgDeclareInst *, 8> PreprocessedDbgDeclares;
   SmallPtrSet<const DbgVariableRecord *, 8> PreprocessedDVRDeclares;
 
   /// set - Initialize this FunctionLoweringInfo with the given Function
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index e8a3df3366b2..fb9eff942a46 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1395,51 +1395,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
   // Neither does the llvm.experimental.noalias.scope.decl intrinsic
   case Intrinsic::experimental_noalias_scope_decl:
     return true;
-  case Intrinsic::dbg_declare: {
-    const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
-    assert(DI->getVariable() && "Missing variable");
-    if (FuncInfo.PreprocessedDbgDeclares.contains(DI))
-      return true;
-
-    const Value *Address = DI->getAddress();
-    if (!lowerDbgDeclare(Address, DI->getExpression(), DI->getVariable(),
-                         MIMD.getDL()))
-      LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI);
-
-    return true;
-  }
-  case Intrinsic::dbg_assign:
-    // A dbg.assign is a dbg.value with more information, typically produced
-    // during optimisation. If one reaches fastisel then something odd has
-    // happened (such as an optimised function being always-inlined into an
-    // optnone function). We will not be using the extra information in the
-    // dbg.assign in that case, just use its dbg.value fields.
-    [[fallthrough]];
-  case Intrinsic::dbg_value: {
-    // This form of DBG_VALUE is target-independent.
-    const DbgValueInst *DI = cast<DbgValueInst>(II);
-    const Value *V = DI->getValue();
-    DIExpression *Expr = DI->getExpression();
-    DILocalVariable *Var = DI->getVariable();
-    if (DI->hasArgList())
-      // Signal that we don't have a location for this.
-      V = nullptr;
-
-    assert(Var->isValidLocationForIntrinsic(MIMD.getDL()) &&
-           "Expected inlined-at fields to agree");
-
-    if (!lowerDbgValue(V, Expr, Var, MIMD.getDL()))
-      LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
-
-    return true;
-  }
-  case Intrinsic::dbg_label: {
-    const DbgLabelInst *DI = cast<DbgLabelInst>(II);
-    assert(DI->getLabel() && "Missing label");
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
-            TII.get(TargetOpcode::DBG_LABEL)).addMetadata(DI->getLabel());
-    return true;
-  }
   case Intrinsic::objectsize:
     llvm_unreachable("llvm.objectsize.* should have been lowered already");
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index d4ed158729ca..098005b6adfa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -369,7 +369,6 @@ void FunctionLoweringInfo::clear() {
   StatepointStackSlots.clear();
   StatepointRelocationMaps.clear();
   PreferredExtendType.clear();
-  PreprocessedDbgDeclares.clear();
   PreprocessedDVRDeclares.clear();
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ec0c5473b0db..c01f1e792847 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6674,69 +6674,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     DAG.setRoot(Res.getValue(1));
     return;
   }
-  case Intrinsic::dbg_declare: {
-    const auto &DI = cast<DbgDeclareInst>(I);
-    // Debug intrinsics are handled separately in assignment tracking mode.
-    // Some intrinsics are handled right after Argument lowering.
-    if (AssignmentTrackingEnabled ||
-        FuncInfo.PreprocessedDbgDeclares.count(&DI))
-      return;
-    LLVM_DEBUG(dbgs() << "SelectionDAG visiting dbg_declare: " << DI << "\n");
-    DILocalVariable *Variable = DI.getVariable();
-    DIExpression *Expression = DI.getExpression();
-    dropDanglingDebugInfo(Variable, Expression);
-    // Assume dbg.declare can not currently use DIArgList, i.e.
-    // it is non-variadic.
-    assert(!DI.hasArgList() && "Only dbg.value should currently use DIArgList");
-    handleDebugDeclare(DI.getVariableLocationOp(0), Variable, Expression,
-                       DI.getDebugLoc());
-    return;
-  }
-  case Intrinsic::dbg_label: {
-    const DbgLabelInst &DI = cast<DbgLabelInst>(I);
-    DILabel *Label = DI.getLabel();
-    assert(Label && "Missing label");
-
-    SDDbgLabel *SDV;
-    SDV = DAG.getDbgLabel(Label, dl, SDNodeOrder);
-    DAG.AddDbgLabel(SDV);
-    return;
-  }
-  case Intrinsic::dbg_assign: {
-    // Debug intrinsics are handled separately in assignment tracking mode.
-    if (AssignmentTrackingEnabled)
-      return;
-    // If assignment tracking hasn't been enabled then fall through and treat
-    // the dbg.assign as a dbg.value.
-    [[fallthrough]];
-  }
-  case Intrinsic::dbg_value: {
-    // Debug intrinsics are handled separately in assignment tracking mode.
-    if (AssignmentTrackingEnabled)
-      return;
-    const DbgValueInst &DI = cast<DbgValueInst>(I);
-    assert(DI.getVariable() && "Missing variable");
-
-    DILocalVariable *Variable = DI.getVariable();
-    DIExpression *Expression = DI.getExpression();
-    dropDanglingDebugInfo(Variable, Expression);
-
-    if (DI.isKillLocation()) {
-      handleKillDebugValue(Variable, Expression, DI.getDebugLoc(), SDNodeOrder);
-      return;
-    }
-
-    SmallVector<Value *, 4> Values(DI.getValues());
-    if (Values.empty())
-      return;
-
-    bool IsVariadic = DI.hasArgList();
-    if (!handleDebugValue(Values, Variable, Expression, DI.getDebugLoc(),
-                          SDNodeOrder, IsVariadic))
-      addDanglingDebugInfo(Values, Variable, Expression, IsVariadic,
-                           DI.getDebugLoc(), SDNodeOrder);
-    return;
-  }
 
   case Intrinsic::eh_typeid_for: {
     // Find the type id for the given typeinfo.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 35c15bc269d4..1c278076a219 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -56,7 +56,6 @@ class CleanupPadInst;
 class CleanupReturnInst;
 class Constant;
 class ConstrainedFPIntrinsic;
-class DbgValueInst;
 class DataLayout;
 class DIExpression;
 class DILocalVariable;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index ac6d25f141ec..4b98d87fcc63 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1592,10 +1592,6 @@ static bool processDbgDeclare(FunctionLoweringInfo &FuncInfo,
 /// in case the declarations refer to arguments.
 static void processDbgDeclares(FunctionLoweringInfo &FuncInfo) {
   for (const auto &I : instructions(*FuncInfo.Fn)) {
-    const auto *DI = dyn_cast<DbgDeclareInst>(&I);
-    if (DI && processDbgDeclare(FuncInfo, DI->getAddress(), DI->getExpression(),
-                                DI->getVariable(), DI->getDebugLoc()))
-      FuncInfo.PreprocessedDbgDeclares.insert(DI);
     for (const DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
       if (DVR.Type == DbgVariableRecord::LocationType::Declare &&
           processDbgDeclare(FuncInfo, DVR.getVariableLocationOp(0),

From 8fc20bffabe7fe6cdc4a9ec1bc79202eba5f1f23 Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Wed, 18 Jun 2025 16:07:56 +0100
Subject: [PATCH 0820/1322] Fix bazel build issue caused by 142986 (#144721)

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel      | 2 ++
 utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 0b4441c15794..48f2d0900d3e 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -12630,6 +12630,7 @@ td_library(
         ":AllocationOpInterfaceTdFiles",
         ":BufferViewFlowOpInterfaceTdFiles",
         ":BufferizableOpInterfaceTdFiles",
+        ":BufferizationTypeInterfacesTdFiles",
         ":CopyOpInterfaceTdFiles",
         ":DestinationStyleOpInterfaceTdFiles",
         ":InferTypeOpInterfaceTdFiles",
@@ -12811,6 +12812,7 @@ cc_library(
         ":BufferDeallocationOpInterfaceIncGen",
         ":BufferViewFlowOpInterfaceIncGen",
         ":BufferizableOpInterfaceIncGen",
+        ":BufferizationTypeInterfacesIncGen",
         ":BufferizationBaseIncGen",
         ":BufferizationInterfaces",
         ":BufferizationOpsIncGen",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index a2fb5ade7324..0eaf86da7f27 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -111,6 +111,7 @@ td_library(
     name = "TestOpTdFiles",
     srcs = glob(["lib/Dialect/Test/*.td"]),
     deps = [
+        "//mlir:BufferizableOpInterfaceTdFiles",
         "//mlir:BufferizationTypeInterfacesTdFiles",
         "//mlir:BuiltinDialectTdFiles",
         "//mlir:CallInterfacesTdFiles",
@@ -242,6 +243,7 @@ gentbl_cc_library(
     test = True,
     deps = [
         ":TestOpTdFiles",
+        "//mlir:BufferizableOpInterfaceTdFiles",
         "//mlir:BufferizationTypeInterfacesTdFiles",
         "//mlir:BuiltinDialectTdFiles",
     ],

From e4c3b037bc7f5d9a8089de4c509d3e6034735891 Mon Sep 17 00:00:00 2001
From: amordo <iammorjj@gmail.com>
Date: Wed, 18 Jun 2025 17:12:31 +0200
Subject: [PATCH 0821/1322] [InstCombine] Fold `tan(x) * cos(x) => sin(x)`
 (#136319)

This patch enables folding `tan(x) * cos(x) -> sin(x)` under the `contract` flag.

Fixes https://github.com/llvm/llvm-project/issues/34950.
---
 .../InstCombine/InstCombineMulDivRem.cpp      |  12 ++
 .../Transforms/InstCombine/fmul-tan-cos.ll    | 182 ++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/fmul-tan-cos.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 457199a72510..fcf4613b5d13 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1072,6 +1072,18 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
     return Result;
   }
 
+  // tan(X) * cos(X) -> sin(X)
+  if (I.hasAllowContract() &&
+      match(&I,
+            m_c_FMul(m_OneUse(m_Intrinsic<Intrinsic::tan>(m_Value(X))),
+                     m_OneUse(m_Intrinsic<Intrinsic::cos>(m_Deferred(X)))))) {
+    auto *Sin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, &I);
+    if (auto *Metadata = I.getMetadata(LLVMContext::MD_fpmath)) {
+      Sin->setMetadata(LLVMContext::MD_fpmath, Metadata);
+    }
+    return replaceInstUsesWith(I, Sin);
+  }
+
   return nullptr;
 }
 
diff --git a/llvm/test/Transforms/InstCombine/fmul-tan-cos.ll b/llvm/test/Transforms/InstCombine/fmul-tan-cos.ll
new file mode 100644
index 000000000000..a85661f14670
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fmul-tan-cos.ll
@@ -0,0 +1,182 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define double @fmul_tan_cos(double %a) {
+; CHECK-LABEL: define double @fmul_tan_cos(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:    [[TAN:%.*]] = call double @llvm.tan.f64(double [[A]])
+; CHECK-NEXT:    [[COS:%.*]] = call double @llvm.cos.f64(double [[A]])
+; CHECK-NEXT:    [[RES:%.*]] = fmul double [[TAN]], [[COS]]
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %tan = call double @llvm.tan.f64(double %a)
+  %cos = call double @llvm.cos.f64(double %a)
+  %res = fmul double %tan, %cos
+  ret double %res
+}
+
+define double @fmul_strict_tan_strict_cos_contract(double %a) {
+; CHECK-LABEL: define double @fmul_strict_tan_strict_cos_contract(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:    [[TAN:%.*]] = call double @llvm.tan.f64(double [[A]])
+; CHECK-NEXT:    [[COS:%.*]] = call contract double @llvm.cos.f64(double [[A]])
+; CHECK-NEXT:    [[RES:%.*]] = fmul double [[TAN]], [[COS]]
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %tan = call double @llvm.tan.f64(double %a)
+  %cos = call contract double @llvm.cos.f64(double %a)
+  %res = fmul double %tan, %cos
+  ret double %res
+}
+
+define double @fmul_contract_tan_strict_cos_strict(double %a) {
+; CHECK-LABEL: define double @fmul_contract_tan_strict_cos_strict(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call contract double @llvm.sin.f64(double [[A]])
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %tan = call double @llvm.tan.f64(double %a)
+  %cos = call double @llvm.cos.f64(double %a)
+  %res = fmul contract double %tan, %cos
+  ret double %res
+}
+
+define double @fmul_contract_tan_contract_cos_strict(double %a) {
+; CHECK-LABEL: define double @fmul_contract_tan_contract_cos_strict(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call contract double @llvm.sin.f64(double [[A]])
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %tan = call contract double @llvm.tan.f64(double %a)
+  %cos = call double @llvm.cos.f64(double %a)
+  %res = fmul contract double %tan, %cos
+  ret double %res
+}
+
+define double @fmul_tan_cos_contract_multiple_uses(double %a) {
+; CHECK-LABEL: define double @fmul_tan_cos_contract_multiple_uses(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:    [[TAN:%.*]] = call contract double @llvm.tan.f64(double [[A]])
+; CHECK-NEXT:    [[COS:%.*]] = call contract double @llvm.cos.f64(double [[A]])
+; CHECK-NEXT:    [[RES:%.*]] = fmul contract double [[TAN]], [[COS]]
+; CHECK-NEXT:    call void @use(double [[COS]])
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %tan = call contract double @llvm.tan.f64(double %a)
+  %cos = call contract double @llvm.cos.f64(double %a)
+  %res = fmul contract double %tan, %cos
+  call void @use(double %cos)
+  ret double %res
+}
+
+define double @fmul_tan_cos_contract(double %a) {
+; CHECK-LABEL: define double @fmul_tan_cos_contract(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call contract double @llvm.sin.f64(double [[A]])
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %tan = call contract double @llvm.tan.f64(double %a)
+  %cos = call contract double @llvm.cos.f64(double %a)
+  %res = fmul contract double %tan, %cos
+  ret double %res
+}
+
+define float @fmul_tanf_cosf_contract(float %a) {
+; CHECK-LABEL: define float @fmul_tanf_cosf_contract(
+; CHECK-SAME: float [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call contract float @llvm.sin.f32(float [[A]])
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %tan = call contract float @llvm.tan.f32(float %a)
+  %cos = call contract float @llvm.cos.f32(float %a)
+  %res = fmul contract float %tan, %cos
+  ret float %res
+}
+
+define fp128 @fmul_tanfp128_cosfp128_contract(fp128 %a) {
+; CHECK-LABEL: define fp128 @fmul_tanfp128_cosfp128_contract(
+; CHECK-SAME: fp128 [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call contract fp128 @llvm.sin.f128(fp128 [[A]])
+; CHECK-NEXT:    ret fp128 [[RES]]
+;
+  %tan = call contract fp128 @llvm.tan.fp128(fp128 %a)
+  %cos = call contract fp128 @llvm.cos.fp128(fp128 %a)
+  %res = fmul contract fp128 %tan, %cos
+  ret fp128 %res
+}
+
+
+define double @commutativity_cos_tan(double %a) {
+; CHECK-LABEL: define double @commutativity_cos_tan(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call contract double @llvm.sin.f64(double [[A]])
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %cos = call contract double @llvm.cos.f64(double %a)
+  %tan = call contract double @llvm.tan.f64(double %a)
+  %res = fmul contract double %cos, %tan
+  ret double %res
+}
+
+
+define double @tan_cos_value_mismatch(double %a, double %b) {
+; CHECK-LABEL: define double @tan_cos_value_mismatch(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) {
+; CHECK-NEXT:    [[TAN:%.*]] = call contract double @llvm.tan.f64(double [[A]])
+; CHECK-NEXT:    [[COS:%.*]] = call contract double @llvm.cos.f64(double [[B]])
+; CHECK-NEXT:    [[RES:%.*]] = fmul contract double [[TAN]], [[COS]]
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %tan = call contract double @llvm.tan.f64(double %a)
+  %cos = call contract double @llvm.cos.f64(double %b)
+  %res = fmul contract double %tan, %cos
+  ret double %res
+}
+
+
+define <2 x double> @fmul_tan_cos_vector(<2 x double> %a) {
+; CHECK-LABEL: define <2 x double> @fmul_tan_cos_vector(
+; CHECK-SAME: <2 x double> [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call contract <2 x double> @llvm.sin.v2f64(<2 x double> [[A]])
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %tan = call contract <2 x double> @llvm.tan.v2f64(<2 x double> %a)
+  %cos = call contract <2 x double> @llvm.cos.v2f64(<2 x double> %a)
+  %res = fmul contract <2 x double> %tan, %cos
+  ret <2 x double> %res
+}
+
+
+define double @fmul_tan_cos_nnan_preservation(double %a) {
+; CHECK-LABEL: define double @fmul_tan_cos_nnan_preservation(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call nnan contract double @llvm.sin.f64(double [[A]])
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %tan = call contract double @llvm.tan.f64(double %a)
+  %cos = call contract double @llvm.cos.f64(double %a)
+  %res = fmul contract nnan double %tan, %cos
+  ret double %res
+}
+
+
+define double @fmul_tan_cos_fpmath_metadata_preservation(double %a) {
+; CHECK-LABEL: define double @fmul_tan_cos_fpmath_metadata_preservation(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call contract double @llvm.sin.f64(double [[A]]), !fpmath [[META0:![0-9]+]]
+; CHECK-NEXT:    ret double [[RES]]
+;
+  %tan = call contract double @llvm.tan.f64(double %a)
+  %cos = call contract double @llvm.cos.f64(double %a)
+  %res = fmul contract double %tan, %cos, !fpmath !0
+  ret double %res
+}
+
+declare void @use(double)
+
+!0 = !{ float 2.5 }
+
+
+;.
+; CHECK: [[META0]] = !{float 2.500000e+00}
+;.

From b53c1e4ee810ac21dab5d27413af1f31a6a4cbfa Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn@arm.com>
Date: Wed, 18 Jun 2025 16:16:52 +0100
Subject: [PATCH 0822/1322] [AArch64] Add ISel for postindex ld1/st1 in
 big-endian (#144387)

When big-endian we need to use ld1/st1 for vector loads and stores so
that we get the elements in the correct order, but this prevents
postindex addressing from being used. Fix this by adding the appropriate
ISel patterns, plus the relevant changes in ISelLowering and
ISelDAGToDAG to cause postindex addressing to be used.
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |   52 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   12 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   82 +-
 .../CodeGen/AArch64/vector-ldst-offset.ll     | 2108 +++++++++++++++++
 llvm/test/CodeGen/AArch64/zext-to-tbl.ll      |   30 +-
 5 files changed, 2230 insertions(+), 54 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/vector-ldst-offset.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 009d69b2b943..da617b7e1926 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1583,6 +1583,8 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
   EVT DstVT = N->getValueType(0);
   ISD::MemIndexedMode AM = LD->getAddressingMode();
   bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
+  ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
+  int OffsetVal = (int)OffsetOp->getZExtValue();
 
   // We're not doing validity checking here. That was done when checking
   // if we should mark the load as indexed or not. We're just selecting
@@ -1637,18 +1639,58 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
     Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
   } else if (VT == MVT::f32) {
     Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
-  } else if (VT == MVT::f64 || VT.is64BitVector()) {
+  } else if (VT == MVT::f64 ||
+             (VT.is64BitVector() && Subtarget->isLittleEndian())) {
     Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
-  } else if (VT.is128BitVector()) {
+  } else if (VT.is128BitVector() && Subtarget->isLittleEndian()) {
     Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
+  } else if (VT.is64BitVector()) {
+    if (IsPre || OffsetVal != 8)
+      return false;
+    switch (VT.getScalarSizeInBits()) {
+    case 8:
+      Opcode = AArch64::LD1Onev8b_POST;
+      break;
+    case 16:
+      Opcode = AArch64::LD1Onev4h_POST;
+      break;
+    case 32:
+      Opcode = AArch64::LD1Onev2s_POST;
+      break;
+    case 64:
+      Opcode = AArch64::LD1Onev1d_POST;
+      break;
+    default:
+      llvm_unreachable("Expected vector element to be a power of 2");
+    }
+  } else if (VT.is128BitVector()) {
+    if (IsPre || OffsetVal != 16)
+      return false;
+    switch (VT.getScalarSizeInBits()) {
+    case 8:
+      Opcode = AArch64::LD1Onev16b_POST;
+      break;
+    case 16:
+      Opcode = AArch64::LD1Onev8h_POST;
+      break;
+    case 32:
+      Opcode = AArch64::LD1Onev4s_POST;
+      break;
+    case 64:
+      Opcode = AArch64::LD1Onev2d_POST;
+      break;
+    default:
+      llvm_unreachable("Expected vector element to be a power of 2");
+    }
   } else
     return false;
   SDValue Chain = LD->getChain();
   SDValue Base = LD->getBasePtr();
-  ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
-  int OffsetVal = (int)OffsetOp->getZExtValue();
   SDLoc dl(N);
-  SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
+  // LD1 encodes an immediate offset by using XZR as the offset register.
+  SDValue Offset = (VT.isVector() && !Subtarget->isLittleEndian())
+                       ? CurDAG->getRegister(AArch64::XZR, MVT::i64)
+                       : CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
   SDValue Ops[] = { Base, Offset, Chain };
   SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
                                        MVT::Other, Ops);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1169efce3123..22c497d3de64 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2087,12 +2087,18 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
   setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
   setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
 
+  // When little-endian we can use ordinary d and q register loads/stores for
+  // vector types, but when big-endian we need to use structure load/store which
+  // only allow post-index addressing.
   if (Subtarget->isLittleEndian()) {
     for (unsigned im = (unsigned)ISD::PRE_INC;
          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
       setIndexedLoadAction(im, VT, Legal);
       setIndexedStoreAction(im, VT, Legal);
     }
+  } else {
+    setIndexedLoadAction(ISD::POST_INC, VT, Legal);
+    setIndexedStoreAction(ISD::POST_INC, VT, Legal);
   }
 
   if (Subtarget->hasD128()) {
@@ -27047,6 +27053,12 @@ bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
       RHSC = -(uint64_t)RHSC;
     if (!isInt<9>(RHSC))
       return false;
+    // When big-endian VLD1/VST1 are used for vector load and store, and these
+    // only allow an offset that's equal to the store size.
+    EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
+    if (!Subtarget->isLittleEndian() && MemType.isVector() &&
+        RHSC != MemType.getStoreSize())
+      return false;
     // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
     // when dealing with subtraction.
     Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f90f12b5ac3c..400ffff5d567 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4942,39 +4942,42 @@ def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
 def : Pat<(post_store (bf16 FPR16:$Rt), GPR64sp:$addr, simm9:$off),
           (STRHpost FPR16:$Rt, GPR64sp:$addr, simm9:$off)>;
 
-def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v4bf16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(post_store(v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v4bf16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
 
-def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
-def : Pat<(post_store (v8bf16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
-          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+  def : Pat<(post_store(v8bf16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+            (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+}
 
 //===----------------------------------------------------------------------===//
 // Load/store exclusive instructions.
@@ -8925,6 +8928,21 @@ def : St1Pat<v4i16, ST1Onev4h>;
 def : St1Pat<v2i32, ST1Onev2s>;
 def : St1Pat<v1i64, ST1Onev1d>;
 
+class St1PostPat<ValueType ty, Instruction INST, int off>
+    : Pat<(post_store ty:$Vt, GPR64sp:$Rn, (i64 off)),
+          (INST ty:$Vt, GPR64sp:$Rn, XZR)>;
+
+let Predicates = [IsBE] in {
+  def : St1PostPat<v16i8, ST1Onev16b_POST, 16>;
+  def : St1PostPat<v8i16, ST1Onev8h_POST, 16>;
+  def : St1PostPat<v4i32, ST1Onev4s_POST, 16>;
+  def : St1PostPat<v2i64, ST1Onev2d_POST, 16>;
+  def : St1PostPat<v8i8, ST1Onev8b_POST, 8>;
+  def : St1PostPat<v4i16, ST1Onev4h_POST, 8>;
+  def : St1PostPat<v2i32, ST1Onev2s_POST, 8>;
+  def : St1PostPat<v1i64, ST1Onev1d_POST, 8>;
+}
+
 //---
 // Single-element
 //---
diff --git a/llvm/test/CodeGen/AArch64/vector-ldst-offset.ll b/llvm/test/CodeGen/AArch64/vector-ldst-offset.ll
new file mode 100644
index 000000000000..b31ba46893bd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-ldst-offset.ll
@@ -0,0 +1,2108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s --check-prefixes=CHECK-LE
+; RUN: llc -mtriple=aarch64_be < %s -o - | FileCheck %s --check-prefixes=CHECK-BE
+
+; Check that we use the correct offset mode for vector loads and stores, and in
+; particular for big-endian we use ld1/st1 which only allows postindex immediate
+; offset of the same size as the memory access size.
+; FIXME: Currently we fail to make use of postindex register offset ld1/st1.
+
+define [2 x ptr] @v8i8_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8i8_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #8
+; CHECK-LE-NEXT:    str d0, [x1], #8
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i8_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.8b }, [x0], #8
+; CHECK-BE-NEXT:    st1 { v0.8b }, [x1], #8
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <8 x i8>, ptr %ldptr, align 2
+  store <8 x i8> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i8_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8i8_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #8]!
+; CHECK-LE-NEXT:    str d0, [x1, #8]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i8_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #8
+; CHECK-BE-NEXT:    add x1, x1, #8
+; CHECK-BE-NEXT:    ld1 { v0.8b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8b }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %val = load <8 x i8>, ptr %add.ldptr, align 2
+  store <8 x i8> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i8_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8i8_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #16
+; CHECK-LE-NEXT:    str d0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i8_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.8b }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    st1 { v0.8b }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <8 x i8>, ptr %ldptr, align 2
+  store <8 x i8> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i8_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8i8_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #16]!
+; CHECK-LE-NEXT:    str d0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i8_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.8b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8b }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <8 x i8>, ptr %add.ldptr, align 2
+  store <8 x i8> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i8_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v8i8_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i8_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.8b }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.8b }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <8 x i8>, ptr %ldptr, align 2
+  store <8 x i8> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i8_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v8i8_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr d0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i8_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.8b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8b }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <8 x i8>, ptr %add.ldptr, align 2
+  store <8 x i8> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i16_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4i16_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #8
+; CHECK-LE-NEXT:    str d0, [x1], #8
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i16_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0], #8
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1], #8
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x i16>, ptr %ldptr, align 2
+  store <4 x i16> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i16_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4i16_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #8]!
+; CHECK-LE-NEXT:    str d0, [x1, #8]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i16_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #8
+; CHECK-BE-NEXT:    add x1, x1, #8
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %val = load <4 x i16>, ptr %add.ldptr, align 2
+  store <4 x i16> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i16_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4i16_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #16
+; CHECK-LE-NEXT:    str d0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i16_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x i16>, ptr %ldptr, align 2
+  store <4 x i16> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i16_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4i16_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #16]!
+; CHECK-LE-NEXT:    str d0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i16_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <4 x i16>, ptr %add.ldptr, align 2
+  store <4 x i16> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i16_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v4i16_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i16_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x i16>, ptr %ldptr, align 2
+  store <4 x i16> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i16_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v4i16_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr d0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i16_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <4 x i16>, ptr %add.ldptr, align 2
+  store <4 x i16> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i32_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2i32_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #8
+; CHECK-LE-NEXT:    str d0, [x1], #8
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i32_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0], #8
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1], #8
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x i32>, ptr %ldptr, align 2
+  store <2 x i32> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i32_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2i32_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #8]!
+; CHECK-LE-NEXT:    str d0, [x1, #8]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i32_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #8
+; CHECK-BE-NEXT:    add x1, x1, #8
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %val = load <2 x i32>, ptr %add.ldptr, align 2
+  store <2 x i32> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i32_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2i32_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #16
+; CHECK-LE-NEXT:    str d0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i32_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x i32>, ptr %ldptr, align 2
+  store <2 x i32> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i32_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2i32_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #16]!
+; CHECK-LE-NEXT:    str d0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i32_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <2 x i32>, ptr %add.ldptr, align 2
+  store <2 x i32> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i32_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v2i32_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i32_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x i32>, ptr %ldptr, align 2
+  store <2 x i32> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i32_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v2i32_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr d0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i32_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <2 x i32>, ptr %add.ldptr, align 2
+  store <2 x i32> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1i64_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v1i64_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #8
+; CHECK-LE-NEXT:    str d0, [x1], #8
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1i64_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.1d }, [x0], #8
+; CHECK-BE-NEXT:    st1 { v0.1d }, [x1], #8
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <1 x i64>, ptr %ldptr, align 2
+  store <1 x i64> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1i64_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v1i64_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #8]!
+; CHECK-LE-NEXT:    str d0, [x1, #8]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1i64_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldr d0, [x0, #8]!
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x1, x1, #8
+; CHECK-BE-NEXT:    str d0, [x8, #8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %val = load <1 x i64>, ptr %add.ldptr, align 2
+  store <1 x i64> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1i64_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v1i64_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #16
+; CHECK-LE-NEXT:    str d0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1i64_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldr d0, [x0], #16
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    str d0, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <1 x i64>, ptr %ldptr, align 2
+  store <1 x i64> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1i64_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v1i64_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #16]!
+; CHECK-LE-NEXT:    str d0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1i64_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldr d0, [x0, #16]!
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    str d0, [x8, #16]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <1 x i64>, ptr %add.ldptr, align 2
+  store <1 x i64> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1i64_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v1i64_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1i64_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldr d0, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    str d0, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <1 x i64>, ptr %ldptr, align 2
+  store <1 x i64> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1i64_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v1i64_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr d0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1i64_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    ldr d0, [x0, x2]
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    str d0, [x8, x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <1 x i64>, ptr %add.ldptr, align 2
+  store <1 x i64> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f16_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4f16_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #8
+; CHECK-LE-NEXT:    str d0, [x1], #8
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f16_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0], #8
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1], #8
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x half>, ptr %ldptr, align 2
+  store <4 x half> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f16_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4f16_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #8]!
+; CHECK-LE-NEXT:    str d0, [x1, #8]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f16_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #8
+; CHECK-BE-NEXT:    add x1, x1, #8
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %val = load <4 x half>, ptr %add.ldptr, align 2
+  store <4 x half> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f16_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4f16_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #16
+; CHECK-LE-NEXT:    str d0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f16_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x half>, ptr %ldptr, align 2
+  store <4 x half> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f16_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4f16_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #16]!
+; CHECK-LE-NEXT:    str d0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f16_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <4 x half>, ptr %add.ldptr, align 2
+  store <4 x half> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f16_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v4f16_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f16_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x half>, ptr %ldptr, align 2
+  store <4 x half> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f16_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v4f16_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr d0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f16_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <4 x half>, ptr %add.ldptr, align 2
+  store <4 x half> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f32_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2f32_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #8
+; CHECK-LE-NEXT:    str d0, [x1], #8
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f32_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0], #8
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1], #8
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x float>, ptr %ldptr, align 2
+  store <2 x float> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f32_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2f32_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #8]!
+; CHECK-LE-NEXT:    str d0, [x1, #8]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f32_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #8
+; CHECK-BE-NEXT:    add x1, x1, #8
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %val = load <2 x float>, ptr %add.ldptr, align 2
+  store <2 x float> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f32_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2f32_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #16
+; CHECK-LE-NEXT:    str d0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f32_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x float>, ptr %ldptr, align 2
+  store <2 x float> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f32_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2f32_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #16]!
+; CHECK-LE-NEXT:    str d0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f32_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <2 x float>, ptr %add.ldptr, align 2
+  store <2 x float> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f32_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v2f32_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f32_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x float>, ptr %ldptr, align 2
+  store <2 x float> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f32_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v2f32_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr d0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f32_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <2 x float>, ptr %add.ldptr, align 2
+  store <2 x float> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1f64_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v1f64_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #8
+; CHECK-LE-NEXT:    str d0, [x1], #8
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1f64_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.1d }, [x0], #8
+; CHECK-BE-NEXT:    st1 { v0.1d }, [x1], #8
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <1 x double>, ptr %ldptr, align 2
+  store <1 x double> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1f64_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v1f64_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #8]!
+; CHECK-LE-NEXT:    str d0, [x1, #8]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1f64_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldr d0, [x0, #8]!
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x1, x1, #8
+; CHECK-BE-NEXT:    str d0, [x8, #8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 8
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 8
+  %val = load <1 x double>, ptr %add.ldptr, align 2
+  store <1 x double> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1f64_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v1f64_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0], #16
+; CHECK-LE-NEXT:    str d0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1f64_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldr d0, [x0], #16
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    str d0, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <1 x double>, ptr %ldptr, align 2
+  store <1 x double> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1f64_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v1f64_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0, #16]!
+; CHECK-LE-NEXT:    str d0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1f64_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldr d0, [x0, #16]!
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    str d0, [x8, #16]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <1 x double>, ptr %add.ldptr, align 2
+  store <1 x double> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1f64_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v1f64_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr d0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1f64_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldr d0, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    str d0, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <1 x double>, ptr %ldptr, align 2
+  store <1 x double> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v1f64_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v1f64_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr d0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str d0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v1f64_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    ldr d0, [x0, x2]
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    str d0, [x8, x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <1 x double>, ptr %add.ldptr, align 2
+  store <1 x double> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v16i8_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v16i8_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #16
+; CHECK-LE-NEXT:    str q0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v16i8_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0], #16
+; CHECK-BE-NEXT:    st1 { v0.16b }, [x1], #16
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <16 x i8>, ptr %ldptr, align 2
+  store <16 x i8> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v16i8_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v16i8_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #16]!
+; CHECK-LE-NEXT:    str q0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v16i8_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.16b }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <16 x i8>, ptr %add.ldptr, align 2
+  store <16 x i8> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v16i8_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v16i8_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #32
+; CHECK-LE-NEXT:    str q0, [x1], #32
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v16i8_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    st1 { v0.16b }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <16 x i8>, ptr %ldptr, align 2
+  store <16 x i8> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v16i8_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v16i8_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #32]!
+; CHECK-LE-NEXT:    str q0, [x1, #32]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v16i8_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.16b }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %val = load <16 x i8>, ptr %add.ldptr, align 2
+  store <16 x i8> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v16i8_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v16i8_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v16i8_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.16b }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <16 x i8>, ptr %ldptr, align 2
+  store <16 x i8> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v16i8_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v16i8_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr q0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v16i8_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.16b }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <16 x i8>, ptr %add.ldptr, align 2
+  store <16 x i8> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i16_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8i16_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #16
+; CHECK-LE-NEXT:    str q0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i16_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0], #16
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1], #16
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <8 x i16>, ptr %ldptr, align 2
+  store <8 x i16> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i16_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8i16_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #16]!
+; CHECK-LE-NEXT:    str q0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i16_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <8 x i16>, ptr %add.ldptr, align 2
+  store <8 x i16> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i16_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8i16_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #32
+; CHECK-LE-NEXT:    str q0, [x1], #32
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i16_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <8 x i16>, ptr %ldptr, align 2
+  store <8 x i16> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i16_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8i16_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #32]!
+; CHECK-LE-NEXT:    str q0, [x1, #32]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i16_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %val = load <8 x i16>, ptr %add.ldptr, align 2
+  store <8 x i16> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i16_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v8i16_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i16_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <8 x i16>, ptr %ldptr, align 2
+  store <8 x i16> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8i16_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v8i16_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr q0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8i16_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <8 x i16>, ptr %add.ldptr, align 2
+  store <8 x i16> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i32_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4i32_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #16
+; CHECK-LE-NEXT:    str q0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i32_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0], #16
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1], #16
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x i32>, ptr %ldptr, align 2
+  store <4 x i32> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i32_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4i32_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #16]!
+; CHECK-LE-NEXT:    str q0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i32_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <4 x i32>, ptr %add.ldptr, align 2
+  store <4 x i32> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i32_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4i32_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #32
+; CHECK-LE-NEXT:    str q0, [x1], #32
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i32_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x i32>, ptr %ldptr, align 2
+  store <4 x i32> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i32_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4i32_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #32]!
+; CHECK-LE-NEXT:    str q0, [x1, #32]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i32_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %val = load <4 x i32>, ptr %add.ldptr, align 2
+  store <4 x i32> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i32_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v4i32_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i32_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x i32>, ptr %ldptr, align 2
+  store <4 x i32> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4i32_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v4i32_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr q0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4i32_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <4 x i32>, ptr %add.ldptr, align 2
+  store <4 x i32> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i64_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2i64_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #16
+; CHECK-LE-NEXT:    str q0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i64_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0], #16
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1], #16
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x i64>, ptr %ldptr, align 2
+  store <2 x i64> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i64_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2i64_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #16]!
+; CHECK-LE-NEXT:    str q0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i64_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <2 x i64>, ptr %add.ldptr, align 2
+  store <2 x i64> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i64_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2i64_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #32
+; CHECK-LE-NEXT:    str q0, [x1], #32
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i64_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x i64>, ptr %ldptr, align 2
+  store <2 x i64> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i64_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2i64_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #32]!
+; CHECK-LE-NEXT:    str q0, [x1, #32]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i64_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %val = load <2 x i64>, ptr %add.ldptr, align 2
+  store <2 x i64> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i64_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v2i64_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i64_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x i64>, ptr %ldptr, align 2
+  store <2 x i64> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2i64_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v2i64_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr q0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2i64_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <2 x i64>, ptr %add.ldptr, align 2
+  store <2 x i64> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8f16_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8f16_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #16
+; CHECK-LE-NEXT:    str q0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8f16_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0], #16
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1], #16
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <8 x half>, ptr %ldptr, align 2
+  store <8 x half> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8f16_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8f16_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #16]!
+; CHECK-LE-NEXT:    str q0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8f16_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <8 x half>, ptr %add.ldptr, align 2
+  store <8 x half> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8f16_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8f16_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #32
+; CHECK-LE-NEXT:    str q0, [x1], #32
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8f16_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <8 x half>, ptr %ldptr, align 2
+  store <8 x half> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8f16_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v8f16_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #32]!
+; CHECK-LE-NEXT:    str q0, [x1, #32]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8f16_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %val = load <8 x half>, ptr %add.ldptr, align 2
+  store <8 x half> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8f16_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v8f16_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8f16_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <8 x half>, ptr %ldptr, align 2
+  store <8 x half> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v8f16_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v8f16_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr q0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v8f16_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <8 x half>, ptr %add.ldptr, align 2
+  store <8 x half> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f32_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4f32_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #16
+; CHECK-LE-NEXT:    str q0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f32_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0], #16
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1], #16
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x float>, ptr %ldptr, align 2
+  store <4 x float> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f32_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4f32_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #16]!
+; CHECK-LE-NEXT:    str q0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f32_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <4 x float>, ptr %add.ldptr, align 2
+  store <4 x float> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f32_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4f32_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #32
+; CHECK-LE-NEXT:    str q0, [x1], #32
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f32_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x float>, ptr %ldptr, align 2
+  store <4 x float> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f32_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v4f32_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #32]!
+; CHECK-LE-NEXT:    str q0, [x1, #32]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f32_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %val = load <4 x float>, ptr %add.ldptr, align 2
+  store <4 x float> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f32_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v4f32_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f32_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <4 x float>, ptr %ldptr, align 2
+  store <4 x float> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v4f32_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v4f32_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr q0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v4f32_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <4 x float>, ptr %add.ldptr, align 2
+  store <4 x float> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f64_postidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2f64_postidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #16
+; CHECK-LE-NEXT:    str q0, [x1], #16
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f64_postidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0], #16
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1], #16
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x double>, ptr %ldptr, align 2
+  store <2 x double> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f64_preidx_same_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2f64_preidx_same_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #16]!
+; CHECK-LE-NEXT:    str q0, [x1, #16]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f64_preidx_same_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x1, x1, #16
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 16
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 16
+  %val = load <2 x double>, ptr %add.ldptr, align 2
+  store <2 x double> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f64_postidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2f64_postidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0], #32
+; CHECK-LE-NEXT:    str q0, [x1], #32
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f64_postidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x double>, ptr %ldptr, align 2
+  store <2 x double> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f64_preidx_different_size(ptr %ldptr, ptr %stptr) {
+; CHECK-LE-LABEL: v2f64_preidx_different_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0, #32]!
+; CHECK-LE-NEXT:    str q0, [x1, #32]!
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f64_preidx_different_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, #32
+; CHECK-BE-NEXT:    add x1, x1, #32
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 32
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 32
+  %val = load <2 x double>, ptr %add.ldptr, align 2
+  store <2 x double> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f64_postidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v2f64_postidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldr q0, [x0]
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f64_postidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    mov x8, x1
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x8]
+; CHECK-BE-NEXT:    ret
+entry:
+  %val = load <2 x double>, ptr %ldptr, align 2
+  store <2 x double> %val, ptr %stptr, align 2
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
+
+define [2 x ptr] @v2f64_preidx_reg(ptr %ldptr, ptr %stptr, i64 %off) {
+; CHECK-LE-LABEL: v2f64_preidx_reg:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    mov x8, x1
+; CHECK-LE-NEXT:    ldr q0, [x0, x2]
+; CHECK-LE-NEXT:    add x0, x0, x2
+; CHECK-LE-NEXT:    add x1, x1, x2
+; CHECK-LE-NEXT:    str q0, [x8, x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: v2f64_preidx_reg:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    add x0, x0, x2
+; CHECK-BE-NEXT:    add x1, x1, x2
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %add.ldptr = getelementptr inbounds nuw i8, ptr %ldptr, i64 %off
+  %add.stptr = getelementptr inbounds nuw i8, ptr %stptr, i64 %off
+  %val = load <2 x double>, ptr %add.ldptr, align 2
+  store <2 x double> %val, ptr %add.stptr, align 2
+  %ret1 = insertvalue [2 x ptr] poison, ptr %add.ldptr, 0
+  %ret2 = insertvalue [2 x ptr] %ret1, ptr %add.stptr, 1
+  ret [2 x ptr] %ret2
+}
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 68a9dff81232..2a37183c47d5 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2835,14 +2835,13 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE:       // %bb.0: // %entry
 ; CHECK-BE-NEXT:  .LBB24_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1]
-; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1], #16
 ; CHECK-BE-NEXT:    add x8, x0, #16
+; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
 ; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
 ; CHECK-BE-NEXT:    add x9, x0, #48
 ; CHECK-BE-NEXT:    add x10, x0, #32
 ; CHECK-BE-NEXT:    subs w2, w2, #1
-; CHECK-BE-NEXT:    add x1, x1, #16
 ; CHECK-BE-NEXT:    ushll v2.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v0.8h, v0.16b, #0
 ; CHECK-BE-NEXT:    umull v4.4s, v1.4h, v2.4h
@@ -3094,7 +3093,7 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    ld1 { v3.16b }, [x8]
 ; CHECK-BE-NEXT:  .LBB26_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT:    ld1 { v4.16b }, [x1]
+; CHECK-BE-NEXT:    ld1 { v4.16b }, [x1], #16
 ; CHECK-BE-NEXT:    add x8, x0, #32
 ; CHECK-BE-NEXT:    ld1 { v16.4s }, [x0]
 ; CHECK-BE-NEXT:    add x9, x0, #48
@@ -3107,7 +3106,6 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v3.16b
 ; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v2.16b
 ; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
-; CHECK-BE-NEXT:    add x1, x1, #16
 ; CHECK-BE-NEXT:    rev32 v5.16b, v5.16b
 ; CHECK-BE-NEXT:    rev32 v6.16b, v6.16b
 ; CHECK-BE-NEXT:    rev32 v7.16b, v7.16b
@@ -3175,19 +3173,18 @@ define i32 @mul_zext_16i8_sext_16i8(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE:       // %bb.0: // %entry
 ; CHECK-BE-NEXT:  .LBB27_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0]
-; CHECK-BE-NEXT:    ld1 { v1.16b }, [x1]
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1], #16
 ; CHECK-BE-NEXT:    add x8, x0, #48
+; CHECK-BE-NEXT:    ld1 { v1.16b }, [x0]
 ; CHECK-BE-NEXT:    subs w2, w2, #1
-; CHECK-BE-NEXT:    add x1, x1, #16
-; CHECK-BE-NEXT:    sshll2 v2.8h, v0.16b, #0
-; CHECK-BE-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-BE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-BE-NEXT:    sshll2 v2.8h, v1.16b, #0
+; CHECK-BE-NEXT:    ushll2 v3.8h, v0.16b, #0
+; CHECK-BE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    smull2 v4.4s, v2.8h, v3.8h
 ; CHECK-BE-NEXT:    smull v2.4s, v2.4h, v3.4h
-; CHECK-BE-NEXT:    smull v3.4s, v0.4h, v1.4h
-; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
+; CHECK-BE-NEXT:    smull v3.4s, v1.4h, v0.4h
+; CHECK-BE-NEXT:    smull2 v0.4s, v1.8h, v0.8h
 ; CHECK-BE-NEXT:    st1 { v4.4s }, [x8]
 ; CHECK-BE-NEXT:    add x8, x0, #32
 ; CHECK-BE-NEXT:    st1 { v3.4s }, [x0]
@@ -3249,14 +3246,13 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE:       // %bb.0: // %entry
 ; CHECK-BE-NEXT:  .LBB28_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1]
-; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1], #16
 ; CHECK-BE-NEXT:    add x8, x0, #16
+; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
 ; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
 ; CHECK-BE-NEXT:    add x9, x0, #48
 ; CHECK-BE-NEXT:    add x10, x0, #32
 ; CHECK-BE-NEXT:    subs w2, w2, #1
-; CHECK-BE-NEXT:    add x1, x1, #16
 ; CHECK-BE-NEXT:    ushll v2.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v0.8h, v0.16b, #0
 ; CHECK-BE-NEXT:    smull v4.4s, v1.4h, v2.4h

From 3af4d4e8100fda2a7e1bd0dbbe0914b584ad08d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche@google.com>
Date: Wed, 18 Jun 2025 17:26:40 +0200
Subject: [PATCH 0823/1322] [HLSL][SPIR-V] Fix LinkageAttribute emission for
 BuiltIn (#144701)

BuiltIn variables were missing the visibility attribute, which caused
the Linkage capability to be emitted by the backend.
---
 clang/lib/CodeGen/CGHLSLRuntime.cpp           |  1 +
 .../CodeGenHLSL/semantics/SV_Position.ps.hlsl |  2 +-
 .../SPIRV/linkage/link-attribute-vk.ll        | 23 +++++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/linkage/link-attribute-vk.ll

diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 585411bc59e1..34960c34e109 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -375,6 +375,7 @@ static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M,
       llvm::GlobalVariable::GeneralDynamicTLSModel,
       /* AddressSpace */ 7, /* isExternallyInitialized= */ true);
   addSPIRVBuiltinDecoration(GV, BuiltInID);
+  GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
   return B.CreateLoad(Ty, GV);
 }
 
diff --git a/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl
index 58b91fc9264d..bdba38e028ed 100644
--- a/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl
+++ b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-pixel -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s
 
-// CHECK: @sv_position = external thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0
+// CHECK: @sv_position = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0
 
 // CHECK: define void @main() {{.*}} {
 float4 main(float4 p : SV_Position) {
diff --git a/llvm/test/CodeGen/SPIRV/linkage/link-attribute-vk.ll b/llvm/test/CodeGen/SPIRV/linkage/link-attribute-vk.ll
new file mode 100644
index 000000000000..d4ba61ff58d3
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/linkage/link-attribute-vk.ll
@@ -0,0 +1,23 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan1.3-pixel %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan1.3-pixel %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %}
+
+@sv_position = external hidden thread_local local_unnamed_addr addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0
+
+; CHECK-NOT: OpDecorate %[[#var]] LinkageAttributes "sv_position" Import
+
+; CHECK-DAG: %[[#float:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#float4:]] = OpTypeVector %[[#float]]
+; CHECK-DAG: %[[#type:]] = OpTypePointer Input %[[#float4]]
+; CHECK-DAG: %[[#var:]] = OpVariable %[[#type]] Input
+
+; CHECK-NOT: OpDecorate %[[#var]] LinkageAttributes "sv_position" Import
+
+define void @main() #1 {
+entry:
+  ret void
+}
+
+attributes #1 = { "hlsl.shader"="pixel" }
+
+!0 = !{!1}
+!1 = !{i32 11, i32 0}

From 8b8a3699dbdbb5d7865b0fe330d972c3fa380f1e Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Wed, 18 Jun 2025 16:27:29 +0100
Subject: [PATCH 0824/1322] [AArch64] Use dupq (SVE2.1) for segmented lane
 splats (#144482)

Use the dupq instructions (when available) to represent a splat of the
same lane within each 128b segment of a wider fixed vector.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  37 ++++++
 .../CodeGen/AArch64/sve2p1-vector-shuffles.ll | 115 ++++++++++++++++++
 2 files changed, 152 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 22c497d3de64..0e28ccd0f655 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13392,6 +13392,30 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   return true;
 }
 
+/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in
+/// the first vector operand.
+static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) {
+  assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size");
+  unsigned Lane = (unsigned)M[0];
+  unsigned Segments = VT.getFixedSizeInBits() / 128;
+  unsigned SegmentElts = VT.getVectorNumElements() / Segments;
+
+  // Make sure there's no size changes.
+  if (SegmentElts * Segments != M.size())
+    return std::nullopt;
+
+  // Check the first index corresponds to one of the lanes in the first segment.
+  if (Lane >= SegmentElts)
+    return std::nullopt;
+
+  // Check that all lanes match the first, adjusted for segment.
+  for (unsigned I = 0; I < M.size(); ++I)
+    if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts)))
+      return std::nullopt;
+
+  return Lane;
+}
+
 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
@@ -29981,6 +30005,19 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
       return convertFromScalableVector(
           DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
     }
+
+    if (Subtarget->hasSVE2p1()) {
+      if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) {
+        SDValue IID =
+            DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
+        return convertFromScalableVector(
+            DAG, VT,
+            DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+                        {IID, Op1,
+                         DAG.getConstant(*Lane, DL, MVT::i64,
+                                         /*isTarget=*/true)}));
+      }
+    }
   }
 
   // Try to widen the shuffle before generating a possibly expensive SVE TBL.
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
new file mode 100644
index 000000000000..40d4d0ff6014
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define void @dupq_i8_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_i8_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.b, z0.b[15]
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <32 x i8>, ptr %addr
+  %splat.lanes = shufflevector <32 x i8> %load, <32 x i8> poison, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15,
+                                                                              i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  store <32 x i8> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_i16_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_i16_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.h, z0.h[2]
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <16 x i16>, ptr %addr
+  %splat.lanes = shufflevector <16 x i16> %load, <16 x i16> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                                i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  store <16 x i16> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_i32_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_i32_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.s, z0.s[3]
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <8 x i32>, ptr %addr
+  %splat.lanes = shufflevector <8 x i32> %load, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
+                                                                             i32 7, i32 7, i32 7, i32 7>
+  store <8 x i32> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_i64_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_i64_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    trn1 z0.d, z0.d, z0.d
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <4 x i64>, ptr %addr
+  %splat.lanes = shufflevector <4 x i64> %load, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  store <4 x i64> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_f16_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_f16_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.h, z0.h[2]
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <16 x half>, ptr %addr
+  %splat.lanes = shufflevector <16 x half> %load, <16 x half> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                                  i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  store <16 x half> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_bf16_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_bf16_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    dup v0.8h, v0.h[2]
+; CHECK-NEXT:    dup v1.8h, v1.h[2]
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  %load = load <16 x bfloat>, ptr %addr
+  %splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                                      i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  store <16 x bfloat> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_f32_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_f32_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.s, z0.s[3]
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <8 x float>, ptr %addr
+  %splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
+                                                                                 i32 7, i32 7, i32 7, i32 7>
+  store <8 x float> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_f64_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_f64_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    trn1 z0.d, z0.d, z0.d
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <4 x double>, ptr %addr
+  %splat.lanes = shufflevector <4 x double> %load, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  store <4 x double> %splat.lanes, ptr %addr
+  ret void
+}
+
+attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" }

From 9da9d32670ddbf610f0788236e78b2382037f00b Mon Sep 17 00:00:00 2001
From: Brox Chen <guochen2@amd.com>
Date: Wed, 18 Jun 2025 11:30:53 -0400
Subject: [PATCH 0825/1322] [AMDGPU][True16][CodeGen] sext i16 inreg in true16
 mode (#144024)

update sext pattern in true16, setting up proper vgpr16 reg use
---
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  19 ++
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  24 ++
 llvm/test/CodeGen/AMDGPU/idot4s.ll            |  69 +++--
 llvm/test/CodeGen/AMDGPU/idot4u.ll            |  79 +++---
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  64 ++---
 .../AMDGPU/sext-in-reg-vector-shuffle.ll      |  86 ++++++
 .../test/CodeGen/AMDGPU/vector-reduce-smax.ll | 260 ++++++++++--------
 .../test/CodeGen/AMDGPU/vector-reduce-smin.ll | 260 ++++++++++--------
 8 files changed, 521 insertions(+), 340 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 897c30948cf0..56b15c11a669 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2623,6 +2623,8 @@ def : GCNPat<
   (i32 (DivergentSextInreg<i1> i32:$src)),
   (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (i16 (DivergentSextInreg<i1> i16:$src)),
   (V_BFE_I32_e64 $src, (i32 0), (i32 1))
@@ -2632,6 +2634,23 @@ def : GCNPat <
   (i16 (DivergentSextInreg<i8> i16:$src)),
   (V_BFE_I32_e64 $src, (i32 0), (i32 8))
 >;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (i16 (DivergentSextInreg<i1> i16:$src)),
+  (V_BFE_I32_e64
+   (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+   (i32 0), (i32 1))
+>;
+
+def : GCNPat <
+  (i16 (DivergentSextInreg<i8> i16:$src)),
+  (V_BFE_I32_e64
+   (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+   (i32 0), (i32 8))
+>;
+}
 
 def : GCNPat<
   (i32 (DivergentSextInreg<i8> i32:$src)),
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 2dbc119f65cd..89a9ecc27c6e 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -319,11 +319,21 @@ let SchedRW = [Write64Bit] in {
 } // End SchedRW = [Write64Bit]
 } // End isReMaterializable = 1
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 def : GCNPat<
   (i32 (DivergentUnaryFrag<sext> i16:$src)),
   (i32 (V_BFE_I32_e64 i16:$src, (i32 0), (i32 0x10)))
 >;
 
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<sext> i16:$src)),
+  (i32 (V_BFE_I32_e64
+       (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+       (i32 0), (i32 0x10)))
+>;
+
 let isReMaterializable = 1 in {
 let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
 defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -423,6 +433,8 @@ def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 def : GCNPat<
   (i64 (DivergentUnaryFrag<sext> i16:$src)),
     (REG_SEQUENCE VReg_64,
@@ -432,6 +444,18 @@ def : GCNPat<
       ), VGPR_32)), sub1)
 >;
 
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<sext> i16:$src)),
+    (REG_SEQUENCE VReg_64,
+      (i32 (V_BFE_I32_e64
+            (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+            (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
+      (i32 (COPY_TO_REGCLASS
+         (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+      ), VGPR_32)), sub1)
+>;
+
 let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in {
 def V_INTERP_P1_F32_e64  : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
 def V_INTERP_P2_F32_e64  : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 9e7968f1acb8..ab38bd21994e 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -1165,35 +1165,32 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[2:3]
 ; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v3, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v6.h, 8, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.h
-; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v8.h, 8, v1.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.h
-; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v2.h, 8, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v2, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v4.h, 8, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v7.h, 8, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v1.h, 8, v1.h
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v5.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v9, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v7, 0, 8
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v6, v6, v8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v2, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v2.h, 8, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v6.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v4.l, v0.l
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v2, v1
-; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v6.h
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v4.h
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.h
@@ -3435,35 +3432,31 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[0:1]
 ; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[2:3]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.h
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v0, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v3.l
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.h
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
 ; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.h, v2.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v3.l, v0.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-DL-TRUE16-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index f995f426c637..5e502882a264 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1669,40 +1669,38 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes:
 ; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v6, 0
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v5, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v5, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v6, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v5
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v6.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v2.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v1, v4, v4, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v2, v3, v3, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v1, v5, v5, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v3.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v2, v4, v4, 0xc0c0302
 ; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_dot4_u32_u8 v0, v2, v1, v0
-; GFX11-DL-TRUE16-NEXT:    global_store_b16 v5, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v6, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes:
@@ -1964,44 +1962,41 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes2:
 ; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v4, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v5, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v3.h
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.h
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.l
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v3.h
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v1.h, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v0.h, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v3.l, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    global_store_b16 v4, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v5, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes2:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index f44faf4f7edb..3a4bf1c81ed5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -424,15 +424,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -457,15 +457,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -534,15 +534,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -567,15 +567,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
-; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
-; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32:
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll
new file mode 100644
index 000000000000..49dec15f9f7d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN:  llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
+; RUN:  llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
+
+define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+;
+; GFX11-TRUE16-LABEL: v_sext_in_reg_i8_i16_shuffle_vector:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
+; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v5, 24, v2
+; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v6, 24, v1
+; GFX11-TRUE16-NEXT:    v_bfe_i32 v8, v3, 0, 8
+; GFX11-TRUE16-NEXT:    v_ashrrev_i16 v0.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-TRUE16-NEXT:    v_ashrrev_i16 v0.h, 8, v2.l
+; GFX11-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v2.h, v6.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v2.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v4.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.h, v1.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.h, v2.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v4.h, v4.l
+; GFX11-TRUE16-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_sext_in_reg_i8_i16_shuffle_vector:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v2, 24, v1
+; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v5, 24, v0
+; GFX11-FAKE16-NEXT:    v_ashrrev_i16 v6, 8, v1
+; GFX11-FAKE16-NEXT:    v_bfe_i32 v7, v0, 0, 8
+; GFX11-FAKE16-NEXT:    v_ashrrev_i16 v0, 8, v0
+; GFX11-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-FAKE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v7, v7
+; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v6, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v5, v5
+; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v8, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v2, v4
+; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v4, v3
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v3, v0, v7
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v6, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v8, v4
+; GFX11-FAKE16-NEXT:    global_store_b128 v9, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %in.gep = getelementptr <{ [0 x i8] }>, ptr addrspace(1) %ptr, i64 0, i32 0, i32 %tid
+  %load = load <8 x i8>, ptr addrspace(1) %in.gep
+  %shuff = shufflevector <8 x i8> %load, <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %cast = sitofp <8 x i8> %shuff to <8 x half>
+  store <8 x half> %cast, ptr addrspace(1) %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index 2d4c881b855e..16fbd1eabb30 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -208,11 +208,16 @@ define i8 @test_vector_reduce_smax_v3i8(<3 x i8> %v) {
 ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v3i8:
 ; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v0.l, 0xff80
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -245,11 +250,16 @@ define i8 @test_vector_reduce_smax_v3i8(<3 x i8> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v0.l, 0xff80
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -790,7 +800,7 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v1, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
@@ -798,24 +808,27 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v5.l, v3.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v0, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v5, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -897,7 +910,7 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v1, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
@@ -905,24 +918,27 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v5.l, v3.l, v1.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v0, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v1.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v5, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v2.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1291,51 +1307,59 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v16i8:
 ; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v11, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v17, v3, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v15, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v9, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v13, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v13.l
-; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v16.l
-; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.h, v5.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v3.h, v9.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v11, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v15, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v14, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v14, v6, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v3.l, v13.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v14, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v3.h, v1.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
-; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.h, v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.h, v2.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.h, v4.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.h, v1.l, v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v12.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v6.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v17, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.h, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v11, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.h, v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v16.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v5.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.h, v2.l, v3.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v0.l, v1.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
@@ -1444,51 +1468,59 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v11, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v17, v3, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v15, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v9, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v13, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v1, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v13.l
-; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v16.l
-; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.h, v5.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v3.h, v9.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v11, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v15, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v14, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v14, v6, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v3.l, v13.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v14, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v3.h, v1.h, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
-; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.h, v1.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.h, v2.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.h, v4.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.h, v1.l, v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v12.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v6.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v17, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.h, v0.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v11, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.h, v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v16.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v5.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.h, v2.l, v3.l, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.h, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v0.l, v1.l, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
index d9d9a6b9a4b1..bb868621c23d 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
@@ -208,11 +208,16 @@ define i8 @test_vector_reduce_smin_v3i8(<3 x i8> %v) {
 ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v3i8:
 ; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v0.l, 0x7f
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -245,11 +250,16 @@ define i8 @test_vector_reduce_smin_v3i8(<3 x i8> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v0.l, 0x7f
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -790,7 +800,7 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v1, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
@@ -798,24 +808,27 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v5.l, v3.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v0, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v5, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -897,7 +910,7 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v1, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
@@ -905,24 +918,27 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v5.l, v3.l, v1.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v0, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v1.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v5, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v2.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1291,51 +1307,59 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v16i8:
 ; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v11, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v17, v3, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v15, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v9, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v13, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v13.l
-; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v16.l
-; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.h, v5.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v3.h, v9.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v11, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v15, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v14, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v14, v6, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v3.l, v13.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v14, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v3.h, v1.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
-; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.h, v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.h, v2.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.h, v4.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.h, v1.l, v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v12.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v6.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v17, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.h, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v11, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.h, v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v16.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v5.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.h, v2.l, v3.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v0.l, v1.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
@@ -1444,51 +1468,59 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v11, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v17, v3, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v15, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v9, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v13, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v1, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v13.l
-; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v16.l
-; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.h, v5.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v3.h, v9.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v11, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v15, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v14, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v14, v6, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v3.l, v13.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v14, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v3.h, v1.h, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
-; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.h, v1.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.h, v2.l, v3.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.h, v4.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.h, v1.l, v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v12.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v6.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v17, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.h, v0.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v11, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.h, v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v16.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v5.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.h, v2.l, v3.l, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.h, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v0.l, v1.l, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l

From 5d502aeddf2a5d93c3fd93103054261acf4d92f3 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Wed, 18 Jun 2025 10:42:39 -0500
Subject: [PATCH 0826/1322] [flang][OpenMP] Clarify confusing error message
 (#144707)

The message "The atomic variable x should occur exactly once among the
arguments of the top-level [...] operator" was intended to convey that
(1) an atomic variable should be an argument, and (2) it should be
exactly one of the arguments. However, the wording turned out to be
sowing confusion instead.

Rework the corresponding check, and emit an individual error message for
each problematic situation:
- "atomic variable cannot be a proper subexpression of an argument",
- "atomic variable should appear as an argument",
- "atomic variable should be exactly one of the arguments".

Fixes https://github.com/llvm/llvm-project/issues/144599
---
 flang/lib/Semantics/check-omp-structure.cpp   | 59 +++++++++++-------
 .../Semantics/OpenMP/atomic-update-only.f90   |  3 +-
 flang/test/Semantics/OpenMP/atomic03.f90      | 32 +++++-----
 flang/test/Semantics/OpenMP/atomic04.f90      | 60 ++++++++++---------
 4 files changed, 89 insertions(+), 65 deletions(-)

diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 83f4d1edf3c4..36d4bcb5d99f 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -3510,37 +3510,56 @@ void OmpStructureChecker::CheckAtomicUpdateAssignment(
         operation::ToString(top.first));
     return;
   }
-  // Check if `atom` occurs exactly once in the argument list.
+  // Check how many times `atom` occurs as an argument, if it's a subexpression
+  // of an argument, and collect the non-atom arguments.
   std::vector<SomeExpr> nonAtom;
-  auto unique{[&]() { // -> iterator
-    auto found{top.second.end()};
-    for (auto i{top.second.begin()}, e{top.second.end()}; i != e; ++i) {
-      if (IsSameOrConvertOf(*i, atom)) {
-        if (found != top.second.end()) {
-          return top.second.end();
-        }
-        found = i;
+  MaybeExpr subExpr;
+  auto atomCount{[&]() {
+    int count{0};
+    for (const SomeExpr &arg : top.second) {
+      if (IsSameOrConvertOf(arg, atom)) {
+        ++count;
       } else {
-        nonAtom.push_back(*i);
+        if (!subExpr && IsSubexpressionOf(atom, arg)) {
+          subExpr = arg;
+        }
+        nonAtom.push_back(arg);
       }
     }
-    return found;
+    return count;
   }()};
 
-  if (unique == top.second.end()) {
-    if (top.first == operation::Operator::Identity) {
-      // This is "x = y".
+  bool hasError{false};
+  if (subExpr) {
+    context_.Say(rsrc,
+        "The atomic variable %s cannot be a proper subexpression of an argument (here: %s) in the update operation"_err_en_US,
+        atom.AsFortran(), subExpr->AsFortran());
+    hasError = true;
+  }
+  if (top.first == operation::Operator::Identity) {
+    // This is "x = y".
+    assert((atomCount == 0 || atomCount == 1) && "Unexpected count");
+    if (atomCount == 0) {
       context_.Say(rsrc,
           "The atomic variable %s should appear as an argument in the update operation"_err_en_US,
           atom.AsFortran());
-    } else {
-      assert(top.first != operation::Operator::Identity &&
-          "Handle this separately");
-      context_.Say(rsrc,
-          "The atomic variable %s should occur exactly once among the arguments of the top-level %s operator"_err_en_US,
-          atom.AsFortran(), operation::ToString(top.first));
+      hasError = true;
     }
   } else {
+    if (atomCount == 0) {
+      context_.Say(rsrc,
+          "The atomic variable %s should appear as an argument of the top-level %s operator"_err_en_US,
+          atom.AsFortran(), operation::ToString(top.first));
+      hasError = true;
+    } else if (atomCount > 1) {
+      context_.Say(rsrc,
+          "The atomic variable %s should be exactly one of the arguments of the top-level %s operator"_err_en_US,
+          atom.AsFortran(), operation::ToString(top.first));
+      hasError = true;
+    }
+  }
+
+  if (!hasError) {
     CheckStorageOverlap(atom, nonAtom, source);
   }
 }
diff --git a/flang/test/Semantics/OpenMP/atomic-update-only.f90 b/flang/test/Semantics/OpenMP/atomic-update-only.f90
index 28d0e264359c..3c027924a142 100644
--- a/flang/test/Semantics/OpenMP/atomic-update-only.f90
+++ b/flang/test/Semantics/OpenMP/atomic-update-only.f90
@@ -30,7 +30,8 @@ subroutine f03
   integer :: x, y
 
   !$omp atomic update
-  !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
+  !ERROR: The atomic variable x cannot be a proper subexpression of an argument (here: (x+y)) in the update operation
+  !ERROR: The atomic variable x should appear as an argument of the top-level + operator
   x = (x + y) + 1
 end
 
diff --git a/flang/test/Semantics/OpenMP/atomic03.f90 b/flang/test/Semantics/OpenMP/atomic03.f90
index b3a3c0d5e7a1..691a483e6e80 100644
--- a/flang/test/Semantics/OpenMP/atomic03.f90
+++ b/flang/test/Semantics/OpenMP/atomic03.f90
@@ -25,19 +25,19 @@ program OmpAtomic
    y = MIN(y, 8)
 
 !$omp atomic
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level AND operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level AND operator
    z = IAND(y, 4)
 !$omp atomic
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level OR operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level OR operator
    z = IOR(y, 5)
 !$omp atomic
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level NEQV/EOR operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level NEQV/EOR operator
    z = IEOR(y, 6)
 !$omp atomic
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MAX operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level MAX operator
    z = MAX(y, 7, b, c)
 !$omp atomic
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MIN operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level MIN operator
    z = MIN(y, 8, a, d)
 
 !$omp atomic
@@ -58,19 +58,19 @@ program OmpAtomic
    y = MIN(y, 8)
 
 !$omp atomic update
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level AND operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level AND operator
    z = IAND(y, 4)
 !$omp atomic update 
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level OR operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level OR operator
    z = IOR(y, 5)
 !$omp atomic update
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level NEQV/EOR operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level NEQV/EOR operator
    z = IEOR(y, 6)
 !$omp atomic update
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MAX operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level MAX operator
    z = MAX(y, 7)
 !$omp atomic update
-   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MIN operator
+   !ERROR: The atomic variable z should appear as an argument of the top-level MIN operator
    z = MIN(y, 8)
 
 !$omp atomic update
@@ -90,7 +90,7 @@ subroutine conflicting_types()
     type(simple) ::s
     z = 1
     !$omp atomic
-    !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level AND operator
+    !ERROR: The atomic variable z should appear as an argument of the top-level AND operator
     z = IAND(s%z, 4)
 end subroutine
 
@@ -103,22 +103,22 @@ subroutine more_invalid_atomic_update_stmts()
     type(some_type) :: s
  
     !$omp atomic update
-    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level MIN operator
+    !ERROR: The atomic variable a should be exactly one of the arguments of the top-level MIN operator
         a = min(a, a, b)
      
     !$omp atomic
-    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level MAX operator
+    !ERROR: The atomic variable a should be exactly one of the arguments of the top-level MAX operator
         a = max(b, a, b, a)
 
     !$omp atomic
         a = min(b, a, b)
 
     !$omp atomic
-    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level MAX operator
+    !ERROR: The atomic variable a should be exactly one of the arguments of the top-level MAX operator
         a = max(b, a, b, a, b)
     
     !$omp atomic update
-    !ERROR: The atomic variable y should occur exactly once among the arguments of the top-level MIN operator
+    !ERROR: The atomic variable y should appear as an argument of the top-level MIN operator
         y = min(z, x)
      
     !$omp atomic
@@ -126,7 +126,7 @@ subroutine more_invalid_atomic_update_stmts()
 
     !$omp atomic update
     !ERROR: Atomic variable k should be a scalar
-    !ERROR: The atomic variable k should occur exactly once among the arguments of the top-level MAX operator
+    !ERROR: The atomic variable k should appear as an argument of the top-level MAX operator
         k = max(x, y)
 
     !$omp atomic
diff --git a/flang/test/Semantics/OpenMP/atomic04.f90 b/flang/test/Semantics/OpenMP/atomic04.f90
index 0f69befed141..fb87ca518661 100644
--- a/flang/test/Semantics/OpenMP/atomic04.f90
+++ b/flang/test/Semantics/OpenMP/atomic04.f90
@@ -17,10 +17,10 @@ program OmpAtomic
 !$omp atomic
    x = 1 + x
 !$omp atomic
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level + operator
    x = y + 1
 !$omp atomic
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level + operator
    x = 1 + y
 
 !$omp atomic
@@ -28,10 +28,10 @@ program OmpAtomic
 !$omp atomic
    x = 1 - x
 !$omp atomic
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level - operator
    x = y - 1
 !$omp atomic
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level - operator
    x = 1 - y
 
 !$omp atomic
@@ -50,10 +50,10 @@ program OmpAtomic
 !$omp atomic
    x = 1/x
 !$omp atomic
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level / operator
    x = y/1
 !$omp atomic
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level / operator
    x = 1/y
 
 !$omp atomic
@@ -61,7 +61,7 @@ program OmpAtomic
 !$omp atomic
    m = n .AND. m
 !$omp atomic 
-   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level AND operator
+   !ERROR: The atomic variable m should appear as an argument of the top-level AND operator
    m = n .AND. l
 
 !$omp atomic
@@ -69,7 +69,7 @@ program OmpAtomic
 !$omp atomic
    m = n .OR. m
 !$omp atomic 
-   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level OR operator
+   !ERROR: The atomic variable m should appear as an argument of the top-level OR operator
    m = n .OR. l
 
 !$omp atomic
@@ -77,7 +77,7 @@ program OmpAtomic
 !$omp atomic
    m = n .EQV. m
 !$omp atomic
-   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level EQV operator
+   !ERROR: The atomic variable m should appear as an argument of the top-level EQV operator
    m = n .EQV. l
 
 !$omp atomic
@@ -85,7 +85,7 @@ program OmpAtomic
 !$omp atomic
    m = n .NEQV. m
 !$omp atomic
-   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level NEQV/EOR operator
+   !ERROR: The atomic variable m should appear as an argument of the top-level NEQV/EOR operator
    m = n .NEQV. l
 
 !$omp atomic update
@@ -93,10 +93,10 @@ program OmpAtomic
 !$omp atomic update
    x = 1 + x
 !$omp atomic update
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level + operator
    x = y + 1
 !$omp atomic update
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level + operator
    x = 1 + y
 
 !$omp atomic update
@@ -104,10 +104,10 @@ program OmpAtomic
 !$omp atomic update
    x = 1 - x
 !$omp atomic update
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level - operator
    x = y - 1
 !$omp atomic update
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level - operator
    x = 1 - y
 
 !$omp atomic update
@@ -126,10 +126,10 @@ program OmpAtomic
 !$omp atomic update
    x = 1/x
 !$omp atomic update
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level / operator
    x = y/1
 !$omp atomic update
-   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
+   !ERROR: The atomic variable x should appear as an argument of the top-level / operator
    x = 1/y
 
 !$omp atomic update
@@ -137,7 +137,7 @@ program OmpAtomic
 !$omp atomic update
    m = n .AND. m
 !$omp atomic update
-   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level AND operator
+   !ERROR: The atomic variable m should appear as an argument of the top-level AND operator
    m = n .AND. l
 
 !$omp atomic update
@@ -145,7 +145,7 @@ program OmpAtomic
 !$omp atomic update
    m = n .OR. m
 !$omp atomic update
-   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level OR operator
+   !ERROR: The atomic variable m should appear as an argument of the top-level OR operator
    m = n .OR. l
 
 !$omp atomic update
@@ -153,7 +153,7 @@ program OmpAtomic
 !$omp atomic update
    m = n .EQV. m
 !$omp atomic update
-   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level EQV operator
+   !ERROR: The atomic variable m should appear as an argument of the top-level EQV operator
    m = n .EQV. l
 
 !$omp atomic update
@@ -161,7 +161,7 @@ program OmpAtomic
 !$omp atomic update
    m = n .NEQV. m
 !$omp atomic update
-   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level NEQV/EOR operator
+   !ERROR: The atomic variable m should appear as an argument of the top-level NEQV/EOR operator
    m = n .NEQV. l
 
 end program OmpAtomic
@@ -184,27 +184,30 @@ subroutine more_invalid_atomic_update_stmts()
         x = 1    
 
     !$omp atomic update
-    !ERROR: Within atomic operation a and a*b access the same storage
+    !ERROR: The atomic variable a cannot be a proper subexpression of an argument (here: a*b) in the update operation
         a = a * b + a
 
     !$omp atomic
-    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level * operator
+    !ERROR: The atomic variable a cannot be a proper subexpression of an argument (here: (a+9_4)) in the update operation
+    !ERROR: The atomic variable a should appear as an argument of the top-level * operator
         a = b * (a + 9)
 
     !$omp atomic update
-    !ERROR: Within atomic operation a and (a+b) access the same storage
+    !ERROR: The atomic variable a cannot be a proper subexpression of an argument (here: (a+b)) in the update operation
         a = a * (a + b)
 
     !$omp atomic
-    !ERROR: Within atomic operation a and (b+a) access the same storage
+    !ERROR: The atomic variable a cannot be a proper subexpression of an argument (here: (b+a)) in the update operation
         a = (b + a) * a
 
     !$omp atomic
-    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level + operator
+    !ERROR: The atomic variable a cannot be a proper subexpression of an argument (here: a*b) in the update operation
+    !ERROR: The atomic variable a should appear as an argument of the top-level + operator
         a = a * b + c
 
     !$omp atomic update
-    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level + operator
+    !ERROR: The atomic variable a cannot be a proper subexpression of an argument (here: a+b) in the update operation
+    !ERROR: The atomic variable a should appear as an argument of the top-level + operator
         a = a + b + c
 
     !$omp atomic
@@ -219,11 +222,12 @@ subroutine more_invalid_atomic_update_stmts()
 
     !$omp atomic update
     !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
-    !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
+    !ERROR: The atomic variable x cannot be a proper subexpression of an argument (here: x*y) in the update operation
+    !ERROR: The atomic variable x should appear as an argument of the top-level / operator
         x = x * y / z
 
     !$omp atomic
-    !ERROR: The atomic variable p%m should occur exactly once among the arguments of the top-level + operator
+    !ERROR: The atomic variable p%m should appear as an argument of the top-level + operator
         p%m = x + y
 
     !$omp atomic update

From b5aaf9d988ff2dc652c86271b181bf0497eb97cb Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Wed, 18 Jun 2025 08:53:45 -0700
Subject: [PATCH 0827/1322] [InstCombine] Implement vp.reverse
 reordering/elimination through binop/unop (#143963)

This simply copies the structure of the vector.reverse patterns from
just above, and reimplements them for the vp.reverse intrinsics when the
mask is all ones and the EVLs exactly match.

Its unfortunate that we have three different ways to represent a reverse
(shuffle, vector.reverse, and vp.reverse) but I don't see an obvious way
to remove any them because the semantics are slightly different.

This significantly improves vectorization in TSVC_2's s112 and s1112
loops when using EVL tail folding.
---
 .../InstCombine/InstCombineCalls.cpp          | 19 ++++++
 .../InstCombine/InstructionCombining.cpp      | 33 +++++++++
 .../test/Transforms/InstCombine/vp-reverse.ll | 68 ++++++++++++-------
 3 files changed, 97 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 03897117861f..b6ed1dc4331d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3571,6 +3571,25 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::experimental_vp_reverse: {
+    Value *X;
+    Value *Vec = II->getArgOperand(0);
+    Value *Mask = II->getArgOperand(1);
+    if (!match(Mask, m_AllOnes()))
+      break;
+    Value *EVL = II->getArgOperand(2);
+    // rev(unop rev(X)) --> unop X
+    if (match(Vec,
+              m_OneUse(m_UnOp(m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                  m_Value(X), m_AllOnes(), m_Specific(EVL)))))) {
+      auto *OldUnOp = cast<UnaryOperator>(Vec);
+      auto *NewUnOp = UnaryOperator::CreateWithCopiedFlags(
+          OldUnOp->getOpcode(), X, OldUnOp, OldUnOp->getName(),
+          II->getIterator());
+      return replaceInstUsesWith(CI, NewUnOp);
+    }
+    break;
+  }
   case Intrinsic::vector_reduce_or:
   case Intrinsic::vector_reduce_and: {
     // Canonicalize logical or/and reductions:
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index e2cd2a59fab9..afd3359e22ff 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2231,6 +2231,39 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
   else if (isSplatValue(LHS) && match(RHS, m_OneUse(m_VecReverse(m_Value(V2)))))
     return createBinOpReverse(LHS, V2);
 
+  auto createBinOpVPReverse = [&](Value *X, Value *Y, Value *EVL) {
+    Value *V = Builder.CreateBinOp(Opcode, X, Y, Inst.getName());
+    if (auto *BO = dyn_cast<BinaryOperator>(V))
+      BO->copyIRFlags(&Inst);
+
+    ElementCount EC = cast<VectorType>(V->getType())->getElementCount();
+    Value *AllTrueMask = Builder.CreateVectorSplat(EC, Builder.getTrue());
+    Module *M = Inst.getModule();
+    Function *F = Intrinsic::getOrInsertDeclaration(
+        M, Intrinsic::experimental_vp_reverse, V->getType());
+    return CallInst::Create(F, {V, AllTrueMask, EVL});
+  };
+
+  Value *EVL;
+  if (match(LHS, m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                     m_Value(V1), m_AllOnes(), m_Value(EVL)))) {
+    // Op(rev(V1), rev(V2)) -> rev(Op(V1, V2))
+    if (match(RHS, m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                       m_Value(V2), m_AllOnes(), m_Specific(EVL))) &&
+        (LHS->hasOneUse() || RHS->hasOneUse() ||
+         (LHS == RHS && LHS->hasNUses(2))))
+      return createBinOpVPReverse(V1, V2, EVL);
+
+    // Op(rev(V1), RHSSplat)) -> rev(Op(V1, RHSSplat))
+    if (LHS->hasOneUse() && isSplatValue(RHS))
+      return createBinOpVPReverse(V1, RHS, EVL);
+  }
+  // Op(LHSSplat, rev(V2)) -> rev(Op(LHSSplat, V2))
+  else if (isSplatValue(LHS) &&
+           match(RHS, m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                          m_Value(V2), m_AllOnes(), m_Value(EVL))))
+    return createBinOpVPReverse(LHS, V2, EVL);
+
   // It may not be safe to reorder shuffles and things like div, urem, etc.
   // because we may trap when executing those ops on unknown vector elements.
   // See PR20059.
diff --git a/llvm/test/Transforms/InstCombine/vp-reverse.ll b/llvm/test/Transforms/InstCombine/vp-reverse.ll
index 79e6c47bdf1b..540b57da3475 100644
--- a/llvm/test/Transforms/InstCombine/vp-reverse.ll
+++ b/llvm/test/Transforms/InstCombine/vp-reverse.ll
@@ -3,11 +3,8 @@
 
 define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_elim(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[B:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -16,8 +13,10 @@ define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4
   ret <vscale x 4 x i32> %add.rev
 }
 
-define <vscale x 4 x i32> @binop_reverse_elim2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
-; CHECK-LABEL: @binop_reverse_elim2(
+; Negative test - the mask needs to be reversed between the inner and
+; the outer to be correct.
+define <vscale x 4 x i32> @binop_reverse_elim_samemask(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim_samemask(
 ; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
 ; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
@@ -48,10 +47,9 @@ define <vscale x 4 x i32> @binop_reverse_elim_diffmask(<vscale x 4 x i32> %a, <v
 
 define <vscale x 4 x i32> @binop_reverse_elim_diffevl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_elim_diffevl(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> splat (i1 true), i32 10)
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV:%.*]], [[B_REV:%.*]]
+; CHECK-NEXT:    [[ADD1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 10)
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -63,10 +61,8 @@ define <vscale x 4 x i32> @binop_reverse_elim_diffevl(<vscale x 4 x i32> %a, <vs
 
 define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_splat_elim(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], splat (i32 22)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %add = add nsw <vscale x 4 x i32> %a.rev, splat (i32 22)
@@ -76,10 +72,8 @@ define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %
 
 define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_splat_elim2(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], splat (i32 22)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %add = add nsw <vscale x 4 x i32> splat (i32 22), %a.rev
@@ -87,12 +81,40 @@ define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32
   ret <vscale x 4 x i32> %add.rev
 }
 
+define <vscale x 4 x i32> @binop_reverse_splat_elim3(<vscale x 4 x i32> %a, i32 %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim3(
+; CHECK-NEXT:    [[B_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT:    [[B_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[B_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[B_VEC]], [[A_REV:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %b.ins = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %b.vec = shufflevector <vscale x 4 x i32> %b.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %b.vec, %a.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_splat_elim4(<vscale x 4 x i32> %a, i32 %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim4(
+; CHECK-NEXT:    [[B_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT:    [[B_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[B_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B_VEC]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
+;
+  %b.ins = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %b.vec = shufflevector <vscale x 4 x i32> %b.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev,  %b.vec
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
 define <vscale x 4 x float> @unop_reverse_splat_elim(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 %evl) {
 ; CHECK-LABEL: @unop_reverse_splat_elim(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV]]
-; CHECK-NEXT:    [[OP_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[OP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[OP_REV]]
+; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OP]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %op = fneg <vscale x 4 x float> %a.rev

From 0fa373c77ded203eddb973c79244c75ee5957eaf Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Wed, 18 Jun 2025 12:00:48 -0400
Subject: [PATCH 0828/1322] [Matrix] Propagate shape information through PHI
 insts (#141681)

... and split them as we lower them, avoiding several shuffles in the
process.
---
 .../Scalar/LowerMatrixIntrinsics.cpp          |  57 +-
 .../Transforms/LowerMatrixIntrinsics/phi.ll   | 789 ++++++++++++++++++
 .../propagate-backwards-unsupported.ll        | 261 ------
 3 files changed, 844 insertions(+), 263 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/phi.ll

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 96b156494fd9..fa9e44617b7c 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -288,6 +288,7 @@ static bool isUniformShape(Value *V) {
     }
 
   switch (I->getOpcode()) {
+  case Instruction::PHI:
   case Instruction::FNeg:
     return true;
   default:
@@ -1136,7 +1137,27 @@ public:
 
     Changed |= !FusedInsts.empty();
 
-    // Fourth, lower remaining instructions with shape information.
+    // Fourth, pre-process all the PHINode's. The incoming values will be
+    // assigned later in VisitPHI.
+    for (Instruction *Inst : MatrixInsts) {
+      auto *PHI = dyn_cast<PHINode>(Inst);
+      if (!PHI)
+        continue;
+
+      const ShapeInfo &SI = ShapeMap.at(Inst);
+      auto *EltTy = cast<FixedVectorType>(PHI->getType())->getElementType();
+      MatrixTy PhiM(SI.NumRows, SI.NumColumns, EltTy);
+
+      IRBuilder<> Builder(Inst);
+      for (unsigned VI = 0, VE = PhiM.getNumVectors(); VI != VE; ++VI)
+        PhiM.setVector(VI, Builder.CreatePHI(PhiM.getVectorTy(),
+                                             PHI->getNumIncomingValues(),
+                                             PHI->getName()));
+      assert(!Inst2ColumnMatrix.contains(PHI) && "map already contains phi?");
+      Inst2ColumnMatrix[PHI] = PhiM;
+    }
+
+    // Fifth, lower remaining instructions with shape information.
     for (Instruction *Inst : MatrixInsts) {
       if (FusedInsts.count(Inst))
         continue;
@@ -1161,6 +1182,8 @@ public:
         Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1, Builder);
       else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
         Result = VisitStore(cast<StoreInst>(Inst), SI, Op1, Op2, Builder);
+      else if (auto *PHI = dyn_cast<PHINode>(Inst))
+        Result = VisitPHI(PHI, SI, Builder);
       else
         continue;
 
@@ -1458,7 +1481,8 @@ public:
                         IRBuilder<> &Builder) {
     auto inserted = Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
     (void)inserted;
-    assert(inserted.second && "multiple matrix lowering mapping");
+    assert((inserted.second || isa<PHINode>(Inst)) &&
+           "multiple matrix lowering mapping");
 
     ToRemove.push_back(Inst);
     Value *Flattened = nullptr;
@@ -2239,6 +2263,35 @@ public:
                       Builder);
   }
 
+  MatrixTy VisitPHI(PHINode *Inst, const ShapeInfo &SI, IRBuilder<> &Builder) {
+    auto BlockIP = Inst->getParent()->getFirstInsertionPt();
+    Builder.SetInsertPoint(BlockIP);
+    MatrixTy PhiM = getMatrix(Inst, SI, Builder);
+
+    for (auto [IncomingV, IncomingB] :
+         llvm::zip_equal(Inst->incoming_values(), Inst->blocks())) {
+      // getMatrix() may insert some instructions to help with reshaping. The
+      // safest place for those is at the top of the block after the rest of the
+      // PHI's. Even better, if we can put it in the incoming block.
+      Builder.SetInsertPoint(BlockIP);
+      if (auto *IncomingInst = dyn_cast<Instruction>(IncomingV))
+        if (auto MaybeIP = IncomingInst->getInsertionPointAfterDef())
+          Builder.SetInsertPoint(*MaybeIP);
+
+      MatrixTy OpM = getMatrix(IncomingV, SI, Builder);
+
+      for (unsigned VI = 0, VE = PhiM.getNumVectors(); VI != VE; ++VI) {
+        PHINode *NewPHI = cast<PHINode>(PhiM.getVector(VI));
+        NewPHI->addIncoming(OpM.getVector(VI), IncomingB);
+      }
+    }
+
+    // finalizeLowering() may also insert instructions in some cases. The safe
+    // place for those is at the end of the initial block of PHIs.
+    Builder.SetInsertPoint(BlockIP);
+    return PhiM;
+  }
+
   /// Lower binary operators.
   MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI,
                                IRBuilder<> &Builder) {
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/phi.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/phi.ll
new file mode 100644
index 000000000000..9fdb2ce4dfa7
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/phi.ll
@@ -0,0 +1,789 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -matrix-allow-contract=false -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define void @matrix_phi_loop(ptr %in1, ptr %in2, i32 %count, ptr %out) {
+; CHECK-LABEL: @matrix_phi_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN1:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN1]], i64 3
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN1]], i64 6
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI9:%.*]] = phi <3 x double> [ [[COL_LOAD]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI10:%.*]] = phi <3 x double> [ [[COL_LOAD1]], [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI11:%.*]] = phi <3 x double> [ [[COL_LOAD3]], [[ENTRY]] ], [ [[TMP2:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY]] ], [ [[DEC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <3 x double>, ptr [[IN2:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN2]], i64 3
+; CHECK-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr double, ptr [[IN2]], i64 6
+; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <3 x double>, ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <3 x double> [[PHI9]], [[COL_LOAD4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <3 x double> [[PHI10]], [[COL_LOAD6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <3 x double> [[PHI11]], [[COL_LOAD8]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <3 x double> [[TMP6]], <3 x double> [[TMP7]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x double> [[TMP8]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <6 x double> [[TMP3]], <6 x double> [[TMP4]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[TMP0]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP1]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP2]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[DEC]] = sub i32 [[CTR]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store <3 x double> [[TMP6]], ptr [[OUT:%.*]], align 128
+; CHECK-NEXT:    [[VEC_GEP12:%.*]] = getelementptr double, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <3 x double> [[TMP7]], ptr [[VEC_GEP12]], align 8
+; CHECK-NEXT:    [[VEC_GEP13:%.*]] = getelementptr double, ptr [[OUT]], i64 6
+; CHECK-NEXT:    store <3 x double> [[TMP8]], ptr [[VEC_GEP13]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in1v = call <9 x double> @llvm.matrix.column.major.load(ptr %in1, i64 3, i1 false, i32 3, i32 3)
+  br label %loop
+
+loop:
+  %phi = phi <9 x double> [%in1v, %entry], [%sum, %loop]
+  %ctr = phi i32 [%count, %entry], [%dec, %loop]
+
+  %in2v = call <9 x double> @llvm.matrix.column.major.load(ptr %in2, i64 3, i1 false, i32 3, i32 3)
+
+  %sum = fadd <9 x double> %phi, %in2v
+
+  %dec = sub i32 %ctr, 1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  store <9 x double> %sum, ptr %out
+  ret void
+}
+
+define void @matrix_phi_loop_zeroinitializer(ptr %in1, ptr %in2, i32 %count, ptr %out) {
+; CHECK-LABEL: @matrix_phi_loop_zeroinitializer(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI4:%.*]] = phi <3 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI5:%.*]] = phi <3 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI6:%.*]] = phi <3 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP2:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY]] ], [ [[DEC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN2:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN2]], i64 3
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN2]], i64 6
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <3 x double> [[PHI4]], [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <3 x double> [[PHI5]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <3 x double> [[PHI6]], [[COL_LOAD3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <3 x double> [[TMP6]], <3 x double> [[TMP7]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x double> [[TMP8]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <6 x double> [[TMP3]], <6 x double> [[TMP4]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[TMP0]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP1]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP2]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[DEC]] = sub i32 [[CTR]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store <3 x double> [[TMP6]], ptr [[OUT:%.*]], align 128
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr double, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <3 x double> [[TMP7]], ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr double, ptr [[OUT]], i64 6
+; CHECK-NEXT:    store <3 x double> [[TMP8]], ptr [[VEC_GEP8]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <9 x double> [zeroinitializer, %entry], [%sum, %loop]
+  %ctr = phi i32 [%count, %entry], [%dec, %loop]
+
+  %inv = call <9 x double> @llvm.matrix.column.major.load(ptr %in2, i64 3, i1 false, i32 3, i32 3)
+
+  %sum = fadd <9 x double> %phi, %inv
+
+  %dec = sub i32 %ctr, 1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  store <9 x double> %sum, ptr %out
+  ret void
+}
+
+define void @matrix_phi_loop_undef(ptr %in, i32 %count, ptr %out) {
+; CHECK-LABEL: @matrix_phi_loop_undef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI:%.*]] = phi <9 x double> [ undef, [[ENTRY:%.*]] ], [ [[SUM:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY]] ], [ [[DEC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INV:%.*]] = load <9 x double>, ptr [[IN:%.*]], align 128
+; CHECK-NEXT:    [[SUM]] = fadd <9 x double> [[PHI]], [[INV]]
+; CHECK-NEXT:    [[DEC]] = sub i32 [[CTR]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store <9 x double> [[SUM]], ptr [[OUT:%.*]], align 128
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <9 x double> [undef, %entry], [%sum, %loop]
+  %ctr = phi i32 [%count, %entry], [%dec, %loop]
+
+  %inv = load <9 x double>, ptr %in
+
+  %sum = fadd <9 x double> %phi, %inv
+
+  %dec = sub i32 %ctr, 1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  store <9 x double> %sum, ptr %out
+  ret void
+}
+
+define void @matrix_phi_loop_poison(ptr %in, i32 %count, ptr %out) {
+; CHECK-LABEL: @matrix_phi_loop_poison(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI4:%.*]] = phi <3 x double> [ poison, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI5:%.*]] = phi <3 x double> [ poison, [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI6:%.*]] = phi <3 x double> [ poison, [[ENTRY]] ], [ [[TMP2:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY]] ], [ [[DEC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN2:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN2]], i64 3
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN2]], i64 6
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <3 x double> [[PHI4]], [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <3 x double> [[PHI5]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <3 x double> [[PHI6]], [[COL_LOAD3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <3 x double> [[TMP6]], <3 x double> [[TMP7]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x double> [[TMP8]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <6 x double> [[TMP3]], <6 x double> [[TMP4]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[TMP0]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP1]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP2]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[DEC]] = sub i32 [[CTR]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store <3 x double> [[TMP6]], ptr [[OUT:%.*]], align 128
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr double, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <3 x double> [[TMP7]], ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr double, ptr [[OUT]], i64 6
+; CHECK-NEXT:    store <3 x double> [[TMP8]], ptr [[VEC_GEP8]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <9 x double> [poison, %entry], [%sum, %loop]
+  %ctr = phi i32 [%count, %entry], [%dec, %loop]
+
+  %inv = call <9 x double> @llvm.matrix.column.major.load(ptr %in, i64 3, i1 false, i32 3, i32 3)
+
+  %sum = fadd <9 x double> %phi, %inv
+
+  %dec = sub i32 %ctr, 1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  store <9 x double> %sum, ptr %out
+  ret void
+}
+
+define void @matrix_phi_loop_cdv(ptr %in, i32 %count, ptr %out) {
+; CHECK-LABEL: @matrix_phi_loop_cdv(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI4:%.*]] = phi <3 x double> [ <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00>, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI5:%.*]] = phi <3 x double> [ <double 3.000000e+00, double 4.000000e+00, double 5.000000e+00>, [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI6:%.*]] = phi <3 x double> [ <double 6.000000e+00, double 7.000000e+00, double 8.000000e+00>, [[ENTRY]] ], [ [[TMP2:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY]] ], [ [[DEC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN2:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN2]], i64 3
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN2]], i64 6
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <3 x double> [[PHI4]], [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <3 x double> [[PHI5]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <3 x double> [[PHI6]], [[COL_LOAD3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <3 x double> [[TMP6]], <3 x double> [[TMP7]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x double> [[TMP8]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <6 x double> [[TMP3]], <6 x double> [[TMP4]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[TMP0]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP1]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP2]] = shufflevector <9 x double> [[TMP5]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[DEC]] = sub i32 [[CTR]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store <3 x double> [[TMP6]], ptr [[OUT:%.*]], align 128
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr double, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <3 x double> [[TMP7]], ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr double, ptr [[OUT]], i64 6
+; CHECK-NEXT:    store <3 x double> [[TMP8]], ptr [[VEC_GEP8]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <9 x double> [<double 0., double 1., double 2., double 3., double 4., double 5., double 6., double 7., double 8.>, %entry], [%sum, %loop]
+  %ctr = phi i32 [%count, %entry], [%dec, %loop]
+
+  %inv = call <9 x double> @llvm.matrix.column.major.load(ptr %in, i64 3, i1 false, i32 3, i32 3)
+
+  %sum = fadd <9 x double> %phi, %inv
+
+  %dec = sub i32 %ctr, 1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  store <9 x double> %sum, ptr %out
+  ret void
+}
+
+define void @matrix_phi_loop_delay(ptr %in1, ptr %in2, i32 %count, ptr %out) {
+; CHECK-LABEL: @matrix_phi_loop_delay(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN1:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN1]], i64 3
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN1]], i64 6
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI14:%.*]] = phi <3 x double> [ [[COL_LOAD]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI15:%.*]] = phi <3 x double> [ [[COL_LOAD1]], [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI16:%.*]] = phi <3 x double> [ [[COL_LOAD3]], [[ENTRY]] ], [ [[TMP2:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP0]] = phi <3 x double> [ [[COL_LOAD]], [[ENTRY]] ], [ [[SPLIT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP1]] = phi <3 x double> [ [[COL_LOAD1]], [[ENTRY]] ], [ [[SPLIT10:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP2]] = phi <3 x double> [ [[COL_LOAD3]], [[ENTRY]] ], [ [[SPLIT11:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY]] ], [ [[DEC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <3 x double>, ptr [[IN2:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN2]], i64 3
+; CHECK-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr double, ptr [[IN2]], i64 6
+; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <3 x double>, ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <3 x double> [[PHI14]], [[COL_LOAD4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <3 x double> [[PHI15]], [[COL_LOAD6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <3 x double> [[PHI16]], [[COL_LOAD8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <3 x double> [[TMP6]], <3 x double> [[TMP7]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <3 x double> [[TMP8]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP9]], <6 x double> [[TMP10]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[SPLIT]] = shufflevector <9 x double> [[TMP11]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[SPLIT10]] = shufflevector <9 x double> [[TMP11]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[SPLIT11]] = shufflevector <9 x double> [[TMP11]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <3 x double> [[TMP6]], [[TMP0]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd <3 x double> [[TMP7]], [[TMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd <3 x double> [[TMP8]], [[TMP2]]
+; CHECK-NEXT:    [[DEC]] = sub i32 [[CTR]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store <3 x double> [[TMP12]], ptr [[OUT:%.*]], align 128
+; CHECK-NEXT:    [[VEC_GEP12:%.*]] = getelementptr double, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <3 x double> [[TMP13]], ptr [[VEC_GEP12]], align 8
+; CHECK-NEXT:    [[VEC_GEP13:%.*]] = getelementptr double, ptr [[OUT]], i64 6
+; CHECK-NEXT:    store <3 x double> [[TMP14]], ptr [[VEC_GEP13]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in1v = call <9 x double> @llvm.matrix.column.major.load(ptr %in1, i64 3, i1 false, i32 3, i32 3)
+  br label %loop
+
+loop:
+  %phi2 = phi <9 x double> [%in1v, %entry], [%phi, %loop]
+  %phi = phi <9 x double> [%in1v, %entry], [%sum, %loop]
+  %ctr = phi i32 [%count, %entry], [%dec, %loop]
+
+  %in2v = call <9 x double> @llvm.matrix.column.major.load(ptr %in2, i64 3, i1 false, i32 3, i32 3)
+
+  %sum = fadd <9 x double> %phi2, %in2v
+  %sum2 = fadd <9 x double> %sum, %phi
+
+  %dec = sub i32 %ctr, 1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  store <9 x double> %sum2, ptr %out
+  ret void
+}
+
+define void @matrix_phi_loop_delay_reshape(ptr %in1, ptr %in2, ptr %in3, i32 %count, ptr %out) {
+; CHECK-LABEL: @matrix_phi_loop_delay_reshape(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[IN3:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN3]], i64 2
+; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    [[VEC_GEP1:%.*]] = getelementptr double, ptr [[IN3]], i64 4
+; CHECK-NEXT:    [[COL_LOAD12:%.*]] = load <2 x double>, ptr [[VEC_GEP1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> [[COL_LOAD8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[COL_LOAD12]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <6 x double> [[TMP2]], <6 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[COL_LOAD10:%.*]] = shufflevector <6 x double> [[TMP2]], <6 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[COL_LOAD11:%.*]] = load <6 x double>, ptr [[IN2:%.*]], align 8
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN1:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN1]], i64 3
+; CHECK-NEXT:    [[COL_LOAD14:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI2:%.*]] = phi <3 x double> [ [[SPLIT]], [[ENTRY:%.*]] ], [ [[PHI1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI39:%.*]] = phi <3 x double> [ [[COL_LOAD10]], [[ENTRY]] ], [ [[PHI4:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI25:%.*]] = phi <6 x double> [ [[COL_LOAD11]], [[ENTRY]] ], [ [[PHI25]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI1]] = phi <3 x double> [ [[COL_LOAD]], [[ENTRY]] ], [ [[PHI2]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI4]] = phi <3 x double> [ [[COL_LOAD14]], [[ENTRY]] ], [ [[PHI39]], [[LOOP]] ]
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY]] ], [ [[DEC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <6 x double> [[PHI25]], <6 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <6 x double> [[PHI25]], <6 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <3 x double> [[TMP3]], [[PHI1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <3 x double> [[TMP4]], [[PHI4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <3 x double> [[TMP7]], [[PHI2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <3 x double> [[TMP8]], [[PHI39]]
+; CHECK-NEXT:    [[DEC]] = sub i32 [[CTR]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store <3 x double> [[TMP5]], ptr [[OUT:%.*]], align 64
+; CHECK-NEXT:    [[VEC_GEP30:%.*]] = getelementptr double, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <3 x double> [[TMP6]], ptr [[VEC_GEP30]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in1v = call <6 x double> @llvm.matrix.column.major.load(ptr %in3, i64 2, i1 false, i32 2, i32 3)
+  %in2v = call <6 x double> @llvm.matrix.column.major.load(ptr %in2, i64 6, i1 false, i32 6, i32 1)
+  %in3v = call <6 x double> @llvm.matrix.column.major.load(ptr %in1, i64 3, i1 false, i32 3, i32 2)
+  br label %loop
+
+loop:
+  %phi = phi <6 x double> [%in1v, %entry], [%phi3, %loop]
+  %phi2 = phi <6 x double> [%in2v, %entry], [%phi2, %loop]
+  %phi3 = phi <6 x double> [%in3v, %entry], [%phi, %loop]
+  %ctr = phi i32 [%count, %entry], [%dec, %loop]
+
+  %sum = fadd <6 x double> %phi2, %phi3
+  %sum2 = fadd <6 x double> %sum, %phi
+
+  %dec = sub i32 %ctr, 1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  store <6 x double> %sum2, ptr %out
+  ret void
+}
+
+define void @matrix_phi_three_preds(i1 %cond1, i1 %cond2, ptr %a, ptr %b, ptr %c, ptr %out) {
+; CHECK-LABEL: @matrix_phi_three_preds(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[BB1:%.*]], label [[BBA:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[BBB:%.*]], label [[BBC:%.*]]
+; CHECK:       bba:
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[A]], i64 3
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[A]], i64 6
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       bbb:
+; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load <3 x double>, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP10:%.*]] = getelementptr double, ptr [[B]], i64 3
+; CHECK-NEXT:    [[COL_LOAD11:%.*]] = load <3 x double>, ptr [[VEC_GEP10]], align 8
+; CHECK-NEXT:    [[VEC_GEP12:%.*]] = getelementptr double, ptr [[B]], i64 6
+; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load <3 x double>, ptr [[VEC_GEP12]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       bbc:
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <3 x double>, ptr [[C:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[C]], i64 3
+; CHECK-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr double, ptr [[C]], i64 6
+; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <3 x double>, ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[PHI14:%.*]] = phi <3 x double> [ [[COL_LOAD]], [[BBA]] ], [ [[COL_LOAD9]], [[BBB]] ], [ [[COL_LOAD4]], [[BBC]] ]
+; CHECK-NEXT:    [[PHI15:%.*]] = phi <3 x double> [ [[COL_LOAD1]], [[BBA]] ], [ [[COL_LOAD11]], [[BBB]] ], [ [[COL_LOAD6]], [[BBC]] ]
+; CHECK-NEXT:    [[PHI16:%.*]] = phi <3 x double> [ [[COL_LOAD3]], [[BBA]] ], [ [[COL_LOAD13]], [[BBB]] ], [ [[COL_LOAD8]], [[BBC]] ]
+; CHECK-NEXT:    store <3 x double> [[PHI14]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP17:%.*]] = getelementptr double, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <3 x double> [[PHI15]], ptr [[VEC_GEP17]], align 8
+; CHECK-NEXT:    [[VEC_GEP18:%.*]] = getelementptr double, ptr [[OUT]], i64 6
+; CHECK-NEXT:    store <3 x double> [[PHI16]], ptr [[VEC_GEP18]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %cond1, label %bb1, label %bba
+
+bb1:
+  br i1 %cond2, label %bbb, label %bbc
+
+bba:
+  %va = call <9 x double> @llvm.matrix.column.major.load(ptr %a, i64 3, i1 false, i32 3, i32 3)
+  br label %exit
+
+bbb:
+  %vb = call <9 x double> @llvm.matrix.column.major.load(ptr %b, i64 3, i1 false, i32 3, i32 3)
+  br label %exit
+
+bbc:
+  %vc = call <9 x double> @llvm.matrix.column.major.load(ptr %c, i64 3, i1 false, i32 3, i32 3)
+  br label %exit
+
+exit:
+  %phi = phi <9 x double> [%va, %bba], [%vb, %bbb], [%vc, %bbc]
+  call void @llvm.matrix.column.major.store(<9 x double> %phi, ptr %out, i64 3, i1 false, i32 3, i32 3)
+  ret void
+}
+
+define void @matrix_phi_two_preds_shape_mismatch1(i1 %cond1, ptr %a, ptr %b, ptr %out) {
+; CHECK-LABEL: @matrix_phi_two_preds_shape_mismatch1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[BBA:%.*]], label [[BBB:%.*]]
+; CHECK:       bba:
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[A]], i64 3
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[A]], i64 6
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <3 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD1]], <3 x double> [[COL_LOAD2]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD4]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <9 x double> [[TMP2]], <9 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       bbb:
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <9 x double>, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[PHI5:%.*]] = phi <9 x double> [ [[SPLIT]], [[BBA]] ], [ [[COL_LOAD]], [[BBB]] ]
+; CHECK-NEXT:    store <9 x double> [[PHI5]], ptr [[OUT:%.*]], align 128
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %cond1, label %bba, label %bbb
+
+bba:
+  %va = call <9 x double> @llvm.matrix.column.major.load(ptr %a, i64 3, i1 false, i32 3, i32 3)
+  br label %exit
+
+bbb:
+  %vb = call <9 x double> @llvm.matrix.column.major.load(ptr %b, i64 9, i1 false, i32 9, i32 1)
+  br label %exit
+
+exit:
+  %phi = phi <9 x double> [%va, %bba], [%vb, %bbb]
+  store <9 x double> %phi, ptr %out
+  ret void
+}
+
+define void @matrix_phi_two_preds_shape_mismatch2(i1 %cond1, ptr %a, ptr %b, ptr %out) {
+; CHECK-LABEL: @matrix_phi_two_preds_shape_mismatch2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[BBA:%.*]], label [[BBB:%.*]]
+; CHECK:       bba:
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <9 x double>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <9 x double> [[COL_LOAD4]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[SPLIT8:%.*]] = shufflevector <9 x double> [[COL_LOAD4]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[SPLIT9:%.*]] = shufflevector <9 x double> [[COL_LOAD4]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       bbb:
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[B]], i64 3
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[B]], i64 6
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[PHI5:%.*]] = phi <3 x double> [ [[SPLIT]], [[BBA]] ], [ [[COL_LOAD]], [[BBB]] ]
+; CHECK-NEXT:    [[PHI6:%.*]] = phi <3 x double> [ [[SPLIT8]], [[BBA]] ], [ [[COL_LOAD1]], [[BBB]] ]
+; CHECK-NEXT:    [[PHI7:%.*]] = phi <3 x double> [ [[SPLIT9]], [[BBA]] ], [ [[COL_LOAD3]], [[BBB]] ]
+; CHECK-NEXT:    store <3 x double> [[PHI5]], ptr [[OUT:%.*]], align 128
+; CHECK-NEXT:    [[VEC_GEP10:%.*]] = getelementptr double, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <3 x double> [[PHI6]], ptr [[VEC_GEP10]], align 8
+; CHECK-NEXT:    [[VEC_GEP11:%.*]] = getelementptr double, ptr [[OUT]], i64 6
+; CHECK-NEXT:    store <3 x double> [[PHI7]], ptr [[VEC_GEP11]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %cond1, label %bba, label %bbb
+
+bba:
+  %va = call <9 x double> @llvm.matrix.column.major.load(ptr %a, i64 9, i1 false, i32 9, i32 1)
+  br label %exit
+
+bbb:
+  %vb = call <9 x double> @llvm.matrix.column.major.load(ptr %b, i64 3, i1 false, i32 3, i32 3)
+  br label %exit
+
+exit:
+  %phi = phi <9 x double> [%va, %bba], [%vb, %bbb]
+  store <9 x double> %phi, ptr %out
+  ret void
+}
+
+define <9 x double> @matrix_phi_ifthenelse(i1 %cond, <9 x double> %A, <9 x double> %B, <9 x double> %C) {
+; CHECK-LABEL: @matrix_phi_ifthenelse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <9 x double> [[A:%.*]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[SPLIT4:%.*]] = shufflevector <9 x double> [[A]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[SPLIT5:%.*]] = shufflevector <9 x double> [[A]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x double> poison, double [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x double> poison, double [[TMP6]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <3 x double> poison, double [[TMP12]], i64 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <9 x double> [[B:%.*]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <9 x double> [[B]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <9 x double> [[B]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x double> poison, double [[TMP21]], i64 0
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x double> [[TMP22]], double [[TMP23]], i64 1
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x double> [[TMP24]], double [[TMP25]], i64 2
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <3 x double> poison, double [[TMP27]], i64 0
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <3 x double> [[TMP28]], double [[TMP29]], i64 1
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <3 x double> [[TMP30]], double [[TMP31]], i64 2
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2
+; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <3 x double> poison, double [[TMP33]], i64 0
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <3 x double> [[TMP34]], double [[TMP35]], i64 1
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <3 x double> [[TMP36]], double [[TMP37]], i64 2
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[MERGE9:%.*]] = phi <3 x double> [ [[TMP5]], [[IF_THEN]] ], [ [[TMP26]], [[IF_ELSE]] ]
+; CHECK-NEXT:    [[MERGE10:%.*]] = phi <3 x double> [ [[TMP11]], [[IF_THEN]] ], [ [[TMP32]], [[IF_ELSE]] ]
+; CHECK-NEXT:    [[MERGE11:%.*]] = phi <3 x double> [ [[TMP17]], [[IF_THEN]] ], [ [[TMP38]], [[IF_ELSE]] ]
+; CHECK-NEXT:    [[SPLIT9:%.*]] = shufflevector <9 x double> [[MERGE:%.*]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[SPLIT10:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[SPLIT11:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x double> [[SPLIT9]], <3 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x double> [[MERGE9]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP42]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <3 x double> [[SPLIT10]], <3 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <3 x double> [[MERGE9]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP44]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP45:%.*]] = fmul <1 x double> [[BLOCK12]], [[SPLAT_SPLAT14]]
+; CHECK-NEXT:    [[TMP46:%.*]] = fadd <1 x double> [[TMP43]], [[TMP45]]
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <3 x double> [[SPLIT11]], <3 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <3 x double> [[MERGE9]], i64 2
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP47]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP48:%.*]] = fmul <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[TMP49:%.*]] = fadd <1 x double> [[TMP46]], [[TMP48]]
+; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <1 x double> [[TMP49]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <3 x double> poison, <3 x double> [[TMP50]], <3 x i32> <i32 3, i32 1, i32 2>
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <3 x double> [[SPLIT9]], <3 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <3 x double> [[MERGE9]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP52]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP53:%.*]] = fmul <1 x double> [[BLOCK18]], [[SPLAT_SPLAT20]]
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <3 x double> [[SPLIT10]], <3 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x double> [[MERGE9]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT22:%.*]] = insertelement <1 x double> poison, double [[TMP54]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT23:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT22]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = fmul <1 x double> [[BLOCK21]], [[SPLAT_SPLAT23]]
+; CHECK-NEXT:    [[TMP56:%.*]] = fadd <1 x double> [[TMP53]], [[TMP55]]
+; CHECK-NEXT:    [[BLOCK24:%.*]] = shufflevector <3 x double> [[SPLIT11]], <3 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <3 x double> [[MERGE9]], i64 2
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT25:%.*]] = insertelement <1 x double> poison, double [[TMP57]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT26:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT25]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP58:%.*]] = fmul <1 x double> [[BLOCK24]], [[SPLAT_SPLAT26]]
+; CHECK-NEXT:    [[TMP59:%.*]] = fadd <1 x double> [[TMP56]], [[TMP58]]
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <1 x double> [[TMP59]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <3 x double> [[TMP51]], <3 x double> [[TMP60]], <3 x i32> <i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[BLOCK27:%.*]] = shufflevector <3 x double> [[SPLIT9]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <3 x double> [[MERGE9]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT28:%.*]] = insertelement <1 x double> poison, double [[TMP62]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT29:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT28]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP63:%.*]] = fmul <1 x double> [[BLOCK27]], [[SPLAT_SPLAT29]]
+; CHECK-NEXT:    [[BLOCK30:%.*]] = shufflevector <3 x double> [[SPLIT10]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <3 x double> [[MERGE9]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT31:%.*]] = insertelement <1 x double> poison, double [[TMP64]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT32:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT31]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP65:%.*]] = fmul <1 x double> [[BLOCK30]], [[SPLAT_SPLAT32]]
+; CHECK-NEXT:    [[TMP66:%.*]] = fadd <1 x double> [[TMP63]], [[TMP65]]
+; CHECK-NEXT:    [[BLOCK33:%.*]] = shufflevector <3 x double> [[SPLIT11]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <3 x double> [[MERGE9]], i64 2
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT34:%.*]] = insertelement <1 x double> poison, double [[TMP67]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT35:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT34]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP68:%.*]] = fmul <1 x double> [[BLOCK33]], [[SPLAT_SPLAT35]]
+; CHECK-NEXT:    [[TMP69:%.*]] = fadd <1 x double> [[TMP66]], [[TMP68]]
+; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <1 x double> [[TMP69]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <3 x double> [[TMP61]], <3 x double> [[TMP70]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT:    [[BLOCK36:%.*]] = shufflevector <3 x double> [[SPLIT9]], <3 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <3 x double> [[MERGE10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT37:%.*]] = insertelement <1 x double> poison, double [[TMP72]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT38:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT37]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP73:%.*]] = fmul <1 x double> [[BLOCK36]], [[SPLAT_SPLAT38]]
+; CHECK-NEXT:    [[BLOCK39:%.*]] = shufflevector <3 x double> [[SPLIT10]], <3 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <3 x double> [[MERGE10]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT40:%.*]] = insertelement <1 x double> poison, double [[TMP74]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT41:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT40]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP75:%.*]] = fmul <1 x double> [[BLOCK39]], [[SPLAT_SPLAT41]]
+; CHECK-NEXT:    [[TMP76:%.*]] = fadd <1 x double> [[TMP73]], [[TMP75]]
+; CHECK-NEXT:    [[BLOCK42:%.*]] = shufflevector <3 x double> [[SPLIT11]], <3 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <3 x double> [[MERGE10]], i64 2
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT43:%.*]] = insertelement <1 x double> poison, double [[TMP77]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT44:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT43]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP78:%.*]] = fmul <1 x double> [[BLOCK42]], [[SPLAT_SPLAT44]]
+; CHECK-NEXT:    [[TMP79:%.*]] = fadd <1 x double> [[TMP76]], [[TMP78]]
+; CHECK-NEXT:    [[TMP80:%.*]] = shufflevector <1 x double> [[TMP79]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP81:%.*]] = shufflevector <3 x double> poison, <3 x double> [[TMP80]], <3 x i32> <i32 3, i32 1, i32 2>
+; CHECK-NEXT:    [[BLOCK45:%.*]] = shufflevector <3 x double> [[SPLIT9]], <3 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <3 x double> [[MERGE10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT46:%.*]] = insertelement <1 x double> poison, double [[TMP82]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT47:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP83:%.*]] = fmul <1 x double> [[BLOCK45]], [[SPLAT_SPLAT47]]
+; CHECK-NEXT:    [[BLOCK48:%.*]] = shufflevector <3 x double> [[SPLIT10]], <3 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <3 x double> [[MERGE10]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT49:%.*]] = insertelement <1 x double> poison, double [[TMP84]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT50:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT49]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP85:%.*]] = fmul <1 x double> [[BLOCK48]], [[SPLAT_SPLAT50]]
+; CHECK-NEXT:    [[TMP86:%.*]] = fadd <1 x double> [[TMP83]], [[TMP85]]
+; CHECK-NEXT:    [[BLOCK51:%.*]] = shufflevector <3 x double> [[SPLIT11]], <3 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <3 x double> [[MERGE10]], i64 2
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT52:%.*]] = insertelement <1 x double> poison, double [[TMP87]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT53:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP88:%.*]] = fmul <1 x double> [[BLOCK51]], [[SPLAT_SPLAT53]]
+; CHECK-NEXT:    [[TMP89:%.*]] = fadd <1 x double> [[TMP86]], [[TMP88]]
+; CHECK-NEXT:    [[TMP90:%.*]] = shufflevector <1 x double> [[TMP89]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP91:%.*]] = shufflevector <3 x double> [[TMP81]], <3 x double> [[TMP90]], <3 x i32> <i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[BLOCK54:%.*]] = shufflevector <3 x double> [[SPLIT9]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <3 x double> [[MERGE10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT55:%.*]] = insertelement <1 x double> poison, double [[TMP92]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT56:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT55]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP93:%.*]] = fmul <1 x double> [[BLOCK54]], [[SPLAT_SPLAT56]]
+; CHECK-NEXT:    [[BLOCK57:%.*]] = shufflevector <3 x double> [[SPLIT10]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <3 x double> [[MERGE10]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT58:%.*]] = insertelement <1 x double> poison, double [[TMP94]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT59:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT58]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP95:%.*]] = fmul <1 x double> [[BLOCK57]], [[SPLAT_SPLAT59]]
+; CHECK-NEXT:    [[TMP96:%.*]] = fadd <1 x double> [[TMP93]], [[TMP95]]
+; CHECK-NEXT:    [[BLOCK60:%.*]] = shufflevector <3 x double> [[SPLIT11]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <3 x double> [[MERGE10]], i64 2
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT61:%.*]] = insertelement <1 x double> poison, double [[TMP97]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT62:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT61]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP98:%.*]] = fmul <1 x double> [[BLOCK60]], [[SPLAT_SPLAT62]]
+; CHECK-NEXT:    [[TMP99:%.*]] = fadd <1 x double> [[TMP96]], [[TMP98]]
+; CHECK-NEXT:    [[TMP100:%.*]] = shufflevector <1 x double> [[TMP99]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP101:%.*]] = shufflevector <3 x double> [[TMP91]], <3 x double> [[TMP100]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT:    [[BLOCK63:%.*]] = shufflevector <3 x double> [[SPLIT9]], <3 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <3 x double> [[MERGE11]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT64:%.*]] = insertelement <1 x double> poison, double [[TMP102]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT65:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT64]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP103:%.*]] = fmul <1 x double> [[BLOCK63]], [[SPLAT_SPLAT65]]
+; CHECK-NEXT:    [[BLOCK66:%.*]] = shufflevector <3 x double> [[SPLIT10]], <3 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP104:%.*]] = extractelement <3 x double> [[MERGE11]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT67:%.*]] = insertelement <1 x double> poison, double [[TMP104]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT68:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT67]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP105:%.*]] = fmul <1 x double> [[BLOCK66]], [[SPLAT_SPLAT68]]
+; CHECK-NEXT:    [[TMP106:%.*]] = fadd <1 x double> [[TMP103]], [[TMP105]]
+; CHECK-NEXT:    [[BLOCK69:%.*]] = shufflevector <3 x double> [[SPLIT11]], <3 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <3 x double> [[MERGE11]], i64 2
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT70:%.*]] = insertelement <1 x double> poison, double [[TMP107]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT71:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT70]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP108:%.*]] = fmul <1 x double> [[BLOCK69]], [[SPLAT_SPLAT71]]
+; CHECK-NEXT:    [[TMP109:%.*]] = fadd <1 x double> [[TMP106]], [[TMP108]]
+; CHECK-NEXT:    [[TMP110:%.*]] = shufflevector <1 x double> [[TMP109]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP111:%.*]] = shufflevector <3 x double> poison, <3 x double> [[TMP110]], <3 x i32> <i32 3, i32 1, i32 2>
+; CHECK-NEXT:    [[BLOCK72:%.*]] = shufflevector <3 x double> [[SPLIT9]], <3 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP112:%.*]] = extractelement <3 x double> [[MERGE11]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT73:%.*]] = insertelement <1 x double> poison, double [[TMP112]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT74:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT73]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP113:%.*]] = fmul <1 x double> [[BLOCK72]], [[SPLAT_SPLAT74]]
+; CHECK-NEXT:    [[BLOCK75:%.*]] = shufflevector <3 x double> [[SPLIT10]], <3 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP114:%.*]] = extractelement <3 x double> [[MERGE11]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT76:%.*]] = insertelement <1 x double> poison, double [[TMP114]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT77:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT76]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP115:%.*]] = fmul <1 x double> [[BLOCK75]], [[SPLAT_SPLAT77]]
+; CHECK-NEXT:    [[TMP116:%.*]] = fadd <1 x double> [[TMP113]], [[TMP115]]
+; CHECK-NEXT:    [[BLOCK78:%.*]] = shufflevector <3 x double> [[SPLIT11]], <3 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP117:%.*]] = extractelement <3 x double> [[MERGE11]], i64 2
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT79:%.*]] = insertelement <1 x double> poison, double [[TMP117]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT80:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT79]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP118:%.*]] = fmul <1 x double> [[BLOCK78]], [[SPLAT_SPLAT80]]
+; CHECK-NEXT:    [[TMP119:%.*]] = fadd <1 x double> [[TMP116]], [[TMP118]]
+; CHECK-NEXT:    [[TMP120:%.*]] = shufflevector <1 x double> [[TMP119]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP121:%.*]] = shufflevector <3 x double> [[TMP111]], <3 x double> [[TMP120]], <3 x i32> <i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[BLOCK81:%.*]] = shufflevector <3 x double> [[SPLIT9]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:    [[TMP122:%.*]] = extractelement <3 x double> [[MERGE11]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT82:%.*]] = insertelement <1 x double> poison, double [[TMP122]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT83:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT82]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP123:%.*]] = fmul <1 x double> [[BLOCK81]], [[SPLAT_SPLAT83]]
+; CHECK-NEXT:    [[BLOCK84:%.*]] = shufflevector <3 x double> [[SPLIT10]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <3 x double> [[MERGE11]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT85:%.*]] = insertelement <1 x double> poison, double [[TMP124]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT86:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT85]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP125:%.*]] = fmul <1 x double> [[BLOCK84]], [[SPLAT_SPLAT86]]
+; CHECK-NEXT:    [[TMP126:%.*]] = fadd <1 x double> [[TMP123]], [[TMP125]]
+; CHECK-NEXT:    [[BLOCK87:%.*]] = shufflevector <3 x double> [[SPLIT11]], <3 x double> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:    [[TMP127:%.*]] = extractelement <3 x double> [[MERGE11]], i64 2
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT88:%.*]] = insertelement <1 x double> poison, double [[TMP127]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT89:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT88]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP128:%.*]] = fmul <1 x double> [[BLOCK87]], [[SPLAT_SPLAT89]]
+; CHECK-NEXT:    [[TMP129:%.*]] = fadd <1 x double> [[TMP126]], [[TMP128]]
+; CHECK-NEXT:    [[TMP130:%.*]] = shufflevector <1 x double> [[TMP129]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP131:%.*]] = shufflevector <3 x double> [[TMP121]], <3 x double> [[TMP130]], <3 x i32> <i32 0, i32 1, i32 3>
+; CHECK-NEXT:    [[TMP132:%.*]] = shufflevector <3 x double> [[TMP71]], <3 x double> [[TMP101]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP133:%.*]] = shufflevector <3 x double> [[TMP131]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP134:%.*]] = shufflevector <6 x double> [[TMP132]], <6 x double> [[TMP133]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    ret <9 x double> [[TMP134]]
+;
+entry:
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:                    ; preds = %entry
+  %A.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %A, i32 3, i32 3)
+  br label %if.end
+
+if.else:                                       ; preds = %entry
+  %B.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %B, i32 3, i32 3)
+  br label %if.end
+
+if.end:                                        ; preds = %if.then, %if.else
+  %merge = phi <9 x double> [ %A.trans, %if.then], [ %B.trans, %if.else ]
+  %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %C, <9 x double> %merge, i32 3, i32 3, i32 3)
+  ret <9 x double> %res
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll
index 2af2c979f206..f07e1762d404 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll
@@ -1,267 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
 
-; Check that we we use flattened vectors for PHI operands and extract the columns afterwards.
-define <9 x double> @unsupported_phi(i1 %cond, <9 x double> %A, <9 x double> %B, <9 x double> %C) {
-; CHECK-LABEL: @unsupported_phi(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <9 x double> [[A:%.*]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[SPLIT4:%.*]] = shufflevector <9 x double> [[A]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[SPLIT5:%.*]] = shufflevector <9 x double> [[A]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x double> poison, double [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x double> poison, double [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <3 x double> poison, double [[TMP12]], i64 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> [[TMP11]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <6 x double> [[TMP18]], <6 x double> [[TMP19]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; CHECK-NEXT:    br label [[IF_END:%.*]]
-; CHECK:       if.else:
-; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <9 x double> [[B:%.*]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <9 x double> [[B]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <9 x double> [[B]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x double> poison, double [[TMP21]], i64 0
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x double> [[TMP22]], double [[TMP23]], i64 1
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x double> [[TMP24]], double [[TMP25]], i64 2
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <3 x double> poison, double [[TMP27]], i64 0
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <3 x double> [[TMP28]], double [[TMP29]], i64 1
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <3 x double> [[TMP30]], double [[TMP31]], i64 2
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <3 x double> poison, double [[TMP33]], i64 0
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <3 x double> [[TMP34]], double [[TMP35]], i64 1
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <3 x double> [[TMP36]], double [[TMP37]], i64 2
-; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <3 x double> [[TMP26]], <3 x double> [[TMP32]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <3 x double> [[TMP38]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <6 x double> [[TMP39]], <6 x double> [[TMP40]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[MERGE:%.*]] = phi <9 x double> [ [[TMP20]], [[IF_THEN]] ], [ [[TMP41]], [[IF_ELSE]] ]
-; CHECK-NEXT:    [[SPLIT6:%.*]] = shufflevector <9 x double> [[C:%.*]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[SPLIT7:%.*]] = shufflevector <9 x double> [[C]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[SPLIT8:%.*]] = shufflevector <9 x double> [[C]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
-; CHECK-NEXT:    [[SPLIT9:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[SPLIT10:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[SPLIT11:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> poison, <3 x i32> <i32 6, i32 7, i32 8>
-; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x double> [[SPLIT9]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP42]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP43:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
-; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <3 x double> [[SPLIT7]], <3 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <3 x double> [[SPLIT9]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP44]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP45:%.*]] = fmul <1 x double> [[BLOCK12]], [[SPLAT_SPLAT14]]
-; CHECK-NEXT:    [[TMP46:%.*]] = fadd <1 x double> [[TMP43]], [[TMP45]]
-; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <3 x double> [[SPLIT8]], <3 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <3 x double> [[SPLIT9]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP47]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP48:%.*]] = fmul <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
-; CHECK-NEXT:    [[TMP49:%.*]] = fadd <1 x double> [[TMP46]], [[TMP48]]
-; CHECK-NEXT:    [[TMP50:%.*]] = shufflevector <1 x double> [[TMP49]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <3 x double> poison, <3 x double> [[TMP50]], <3 x i32> <i32 3, i32 1, i32 2>
-; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <3 x double> [[SPLIT9]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP52]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP53:%.*]] = fmul <1 x double> [[BLOCK18]], [[SPLAT_SPLAT20]]
-; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <3 x double> [[SPLIT7]], <3 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x double> [[SPLIT9]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT22:%.*]] = insertelement <1 x double> poison, double [[TMP54]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT23:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT22]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP55:%.*]] = fmul <1 x double> [[BLOCK21]], [[SPLAT_SPLAT23]]
-; CHECK-NEXT:    [[TMP56:%.*]] = fadd <1 x double> [[TMP53]], [[TMP55]]
-; CHECK-NEXT:    [[BLOCK24:%.*]] = shufflevector <3 x double> [[SPLIT8]], <3 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <3 x double> [[SPLIT9]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT25:%.*]] = insertelement <1 x double> poison, double [[TMP57]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT26:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT25]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP58:%.*]] = fmul <1 x double> [[BLOCK24]], [[SPLAT_SPLAT26]]
-; CHECK-NEXT:    [[TMP59:%.*]] = fadd <1 x double> [[TMP56]], [[TMP58]]
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <1 x double> [[TMP59]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <3 x double> [[TMP51]], <3 x double> [[TMP60]], <3 x i32> <i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[BLOCK27:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <3 x double> [[SPLIT9]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT28:%.*]] = insertelement <1 x double> poison, double [[TMP62]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT29:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT28]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP63:%.*]] = fmul <1 x double> [[BLOCK27]], [[SPLAT_SPLAT29]]
-; CHECK-NEXT:    [[BLOCK30:%.*]] = shufflevector <3 x double> [[SPLIT7]], <3 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <3 x double> [[SPLIT9]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT31:%.*]] = insertelement <1 x double> poison, double [[TMP64]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT32:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT31]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP65:%.*]] = fmul <1 x double> [[BLOCK30]], [[SPLAT_SPLAT32]]
-; CHECK-NEXT:    [[TMP66:%.*]] = fadd <1 x double> [[TMP63]], [[TMP65]]
-; CHECK-NEXT:    [[BLOCK33:%.*]] = shufflevector <3 x double> [[SPLIT8]], <3 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <3 x double> [[SPLIT9]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT34:%.*]] = insertelement <1 x double> poison, double [[TMP67]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT35:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT34]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP68:%.*]] = fmul <1 x double> [[BLOCK33]], [[SPLAT_SPLAT35]]
-; CHECK-NEXT:    [[TMP69:%.*]] = fadd <1 x double> [[TMP66]], [[TMP68]]
-; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <1 x double> [[TMP69]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <3 x double> [[TMP61]], <3 x double> [[TMP70]], <3 x i32> <i32 0, i32 1, i32 3>
-; CHECK-NEXT:    [[BLOCK36:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <3 x double> [[SPLIT10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT37:%.*]] = insertelement <1 x double> poison, double [[TMP72]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT38:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT37]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP73:%.*]] = fmul <1 x double> [[BLOCK36]], [[SPLAT_SPLAT38]]
-; CHECK-NEXT:    [[BLOCK39:%.*]] = shufflevector <3 x double> [[SPLIT7]], <3 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <3 x double> [[SPLIT10]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT40:%.*]] = insertelement <1 x double> poison, double [[TMP74]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT41:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT40]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP75:%.*]] = fmul <1 x double> [[BLOCK39]], [[SPLAT_SPLAT41]]
-; CHECK-NEXT:    [[TMP76:%.*]] = fadd <1 x double> [[TMP73]], [[TMP75]]
-; CHECK-NEXT:    [[BLOCK42:%.*]] = shufflevector <3 x double> [[SPLIT8]], <3 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <3 x double> [[SPLIT10]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT43:%.*]] = insertelement <1 x double> poison, double [[TMP77]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT44:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT43]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP78:%.*]] = fmul <1 x double> [[BLOCK42]], [[SPLAT_SPLAT44]]
-; CHECK-NEXT:    [[TMP79:%.*]] = fadd <1 x double> [[TMP76]], [[TMP78]]
-; CHECK-NEXT:    [[TMP80:%.*]] = shufflevector <1 x double> [[TMP79]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP81:%.*]] = shufflevector <3 x double> poison, <3 x double> [[TMP80]], <3 x i32> <i32 3, i32 1, i32 2>
-; CHECK-NEXT:    [[BLOCK45:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <3 x double> [[SPLIT10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT46:%.*]] = insertelement <1 x double> poison, double [[TMP82]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT47:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP83:%.*]] = fmul <1 x double> [[BLOCK45]], [[SPLAT_SPLAT47]]
-; CHECK-NEXT:    [[BLOCK48:%.*]] = shufflevector <3 x double> [[SPLIT7]], <3 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <3 x double> [[SPLIT10]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT49:%.*]] = insertelement <1 x double> poison, double [[TMP84]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT50:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT49]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP85:%.*]] = fmul <1 x double> [[BLOCK48]], [[SPLAT_SPLAT50]]
-; CHECK-NEXT:    [[TMP86:%.*]] = fadd <1 x double> [[TMP83]], [[TMP85]]
-; CHECK-NEXT:    [[BLOCK51:%.*]] = shufflevector <3 x double> [[SPLIT8]], <3 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <3 x double> [[SPLIT10]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT52:%.*]] = insertelement <1 x double> poison, double [[TMP87]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT53:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP88:%.*]] = fmul <1 x double> [[BLOCK51]], [[SPLAT_SPLAT53]]
-; CHECK-NEXT:    [[TMP89:%.*]] = fadd <1 x double> [[TMP86]], [[TMP88]]
-; CHECK-NEXT:    [[TMP90:%.*]] = shufflevector <1 x double> [[TMP89]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP91:%.*]] = shufflevector <3 x double> [[TMP81]], <3 x double> [[TMP90]], <3 x i32> <i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[BLOCK54:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <3 x double> [[SPLIT10]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT55:%.*]] = insertelement <1 x double> poison, double [[TMP92]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT56:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT55]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP93:%.*]] = fmul <1 x double> [[BLOCK54]], [[SPLAT_SPLAT56]]
-; CHECK-NEXT:    [[BLOCK57:%.*]] = shufflevector <3 x double> [[SPLIT7]], <3 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <3 x double> [[SPLIT10]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT58:%.*]] = insertelement <1 x double> poison, double [[TMP94]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT59:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT58]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP95:%.*]] = fmul <1 x double> [[BLOCK57]], [[SPLAT_SPLAT59]]
-; CHECK-NEXT:    [[TMP96:%.*]] = fadd <1 x double> [[TMP93]], [[TMP95]]
-; CHECK-NEXT:    [[BLOCK60:%.*]] = shufflevector <3 x double> [[SPLIT8]], <3 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <3 x double> [[SPLIT10]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT61:%.*]] = insertelement <1 x double> poison, double [[TMP97]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT62:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT61]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP98:%.*]] = fmul <1 x double> [[BLOCK60]], [[SPLAT_SPLAT62]]
-; CHECK-NEXT:    [[TMP99:%.*]] = fadd <1 x double> [[TMP96]], [[TMP98]]
-; CHECK-NEXT:    [[TMP100:%.*]] = shufflevector <1 x double> [[TMP99]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP101:%.*]] = shufflevector <3 x double> [[TMP91]], <3 x double> [[TMP100]], <3 x i32> <i32 0, i32 1, i32 3>
-; CHECK-NEXT:    [[BLOCK63:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <3 x double> [[SPLIT11]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT64:%.*]] = insertelement <1 x double> poison, double [[TMP102]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT65:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT64]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP103:%.*]] = fmul <1 x double> [[BLOCK63]], [[SPLAT_SPLAT65]]
-; CHECK-NEXT:    [[BLOCK66:%.*]] = shufflevector <3 x double> [[SPLIT7]], <3 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP104:%.*]] = extractelement <3 x double> [[SPLIT11]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT67:%.*]] = insertelement <1 x double> poison, double [[TMP104]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT68:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT67]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP105:%.*]] = fmul <1 x double> [[BLOCK66]], [[SPLAT_SPLAT68]]
-; CHECK-NEXT:    [[TMP106:%.*]] = fadd <1 x double> [[TMP103]], [[TMP105]]
-; CHECK-NEXT:    [[BLOCK69:%.*]] = shufflevector <3 x double> [[SPLIT8]], <3 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <3 x double> [[SPLIT11]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT70:%.*]] = insertelement <1 x double> poison, double [[TMP107]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT71:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT70]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP108:%.*]] = fmul <1 x double> [[BLOCK69]], [[SPLAT_SPLAT71]]
-; CHECK-NEXT:    [[TMP109:%.*]] = fadd <1 x double> [[TMP106]], [[TMP108]]
-; CHECK-NEXT:    [[TMP110:%.*]] = shufflevector <1 x double> [[TMP109]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP111:%.*]] = shufflevector <3 x double> poison, <3 x double> [[TMP110]], <3 x i32> <i32 3, i32 1, i32 2>
-; CHECK-NEXT:    [[BLOCK72:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP112:%.*]] = extractelement <3 x double> [[SPLIT11]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT73:%.*]] = insertelement <1 x double> poison, double [[TMP112]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT74:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT73]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP113:%.*]] = fmul <1 x double> [[BLOCK72]], [[SPLAT_SPLAT74]]
-; CHECK-NEXT:    [[BLOCK75:%.*]] = shufflevector <3 x double> [[SPLIT7]], <3 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP114:%.*]] = extractelement <3 x double> [[SPLIT11]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT76:%.*]] = insertelement <1 x double> poison, double [[TMP114]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT77:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT76]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP115:%.*]] = fmul <1 x double> [[BLOCK75]], [[SPLAT_SPLAT77]]
-; CHECK-NEXT:    [[TMP116:%.*]] = fadd <1 x double> [[TMP113]], [[TMP115]]
-; CHECK-NEXT:    [[BLOCK78:%.*]] = shufflevector <3 x double> [[SPLIT8]], <3 x double> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[TMP117:%.*]] = extractelement <3 x double> [[SPLIT11]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT79:%.*]] = insertelement <1 x double> poison, double [[TMP117]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT80:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT79]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP118:%.*]] = fmul <1 x double> [[BLOCK78]], [[SPLAT_SPLAT80]]
-; CHECK-NEXT:    [[TMP119:%.*]] = fadd <1 x double> [[TMP116]], [[TMP118]]
-; CHECK-NEXT:    [[TMP120:%.*]] = shufflevector <1 x double> [[TMP119]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP121:%.*]] = shufflevector <3 x double> [[TMP111]], <3 x double> [[TMP120]], <3 x i32> <i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[BLOCK81:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[TMP122:%.*]] = extractelement <3 x double> [[SPLIT11]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT82:%.*]] = insertelement <1 x double> poison, double [[TMP122]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT83:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT82]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP123:%.*]] = fmul <1 x double> [[BLOCK81]], [[SPLAT_SPLAT83]]
-; CHECK-NEXT:    [[BLOCK84:%.*]] = shufflevector <3 x double> [[SPLIT7]], <3 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <3 x double> [[SPLIT11]], i64 1
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT85:%.*]] = insertelement <1 x double> poison, double [[TMP124]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT86:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT85]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP125:%.*]] = fmul <1 x double> [[BLOCK84]], [[SPLAT_SPLAT86]]
-; CHECK-NEXT:    [[TMP126:%.*]] = fadd <1 x double> [[TMP123]], [[TMP125]]
-; CHECK-NEXT:    [[BLOCK87:%.*]] = shufflevector <3 x double> [[SPLIT8]], <3 x double> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[TMP127:%.*]] = extractelement <3 x double> [[SPLIT11]], i64 2
-; CHECK-NEXT:    [[SPLAT_SPLATINSERT88:%.*]] = insertelement <1 x double> poison, double [[TMP127]], i64 0
-; CHECK-NEXT:    [[SPLAT_SPLAT89:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT88]], <1 x double> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP128:%.*]] = fmul <1 x double> [[BLOCK87]], [[SPLAT_SPLAT89]]
-; CHECK-NEXT:    [[TMP129:%.*]] = fadd <1 x double> [[TMP126]], [[TMP128]]
-; CHECK-NEXT:    [[TMP130:%.*]] = shufflevector <1 x double> [[TMP129]], <1 x double> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP131:%.*]] = shufflevector <3 x double> [[TMP121]], <3 x double> [[TMP130]], <3 x i32> <i32 0, i32 1, i32 3>
-; CHECK-NEXT:    [[TMP132:%.*]] = shufflevector <3 x double> [[TMP71]], <3 x double> [[TMP101]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP133:%.*]] = shufflevector <3 x double> [[TMP131]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP134:%.*]] = shufflevector <6 x double> [[TMP132]], <6 x double> [[TMP133]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-; CHECK-NEXT:    ret <9 x double> [[TMP134]]
-;
-
-
-
-entry:
-  br i1 %cond, label %if.then, label %if.else
-
-if.then:                    ; preds = %entry
-  %A.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %A, i32 3, i32 3)
-  br label %if.end
-
-if.else:                                       ; preds = %entry
-  %B.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %B, i32 3, i32 3)
-  br label %if.end
-
-if.end:                                        ; preds = %if.then, %if.else
-  %merge = phi <9 x double> [ %A.trans, %if.then], [ %B.trans, %if.else ]
-  %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %C, <9 x double> %merge, i32 3, i32 3, i32 3)
-  ret <9 x double> %res
-}
-
 ; Make sure we use a flattened vector when calling @foo and the use its flat vector result properly.
 define <9 x double> @unsupported_call(i1 %cond, <9 x double> %A, <9 x double> %B) {
 ; CHECK-LABEL: @unsupported_call(

From a2cee05449636c8e0d630b2ccdc71f2d422227a9 Mon Sep 17 00:00:00 2001
From: Christopher Ferris <cferris1000@users.noreply.github.com>
Date: Wed, 18 Jun 2025 09:12:53 -0700
Subject: [PATCH 0829/1322] [scudo] Make report pointers const. (#144624)

Mark as many of the reportXX functions that take pointers const. This
avoid the need to use const_cast when calling these functions on an
already const pointer.

Fix reportHeaderCorruption calls where an argument was passed into an
append call that didn't use them.
---
 compiler-rt/lib/scudo/standalone/chunk.h    |  2 +-
 compiler-rt/lib/scudo/standalone/combined.h |  2 +-
 compiler-rt/lib/scudo/standalone/report.cpp | 15 +++++++--------
 compiler-rt/lib/scudo/standalone/report.h   | 11 ++++++-----
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/chunk.h b/compiler-rt/lib/scudo/standalone/chunk.h
index a1b8e723d4cb..9da2dc57e71a 100644
--- a/compiler-rt/lib/scudo/standalone/chunk.h
+++ b/compiler-rt/lib/scudo/standalone/chunk.h
@@ -125,7 +125,7 @@ inline void loadHeader(u32 Cookie, const void *Ptr,
   *NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
   if (UNLIKELY(NewUnpackedHeader->Checksum !=
                computeHeaderChecksum(Cookie, Ptr, NewUnpackedHeader)))
-    reportHeaderCorruption(NewUnpackedHeader, const_cast<void *>(Ptr));
+    reportHeaderCorruption(NewUnpackedHeader, Ptr);
 }
 
 inline bool isValid(u32 Cookie, const void *Ptr,
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 43655642843c..87acdec2a3ba 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -775,7 +775,7 @@ public:
 
     // Getting the alloc size of a chunk only makes sense if it's allocated.
     if (UNLIKELY(Header.State != Chunk::State::Allocated))
-      reportInvalidChunkState(AllocatorAction::Sizing, const_cast<void *>(Ptr));
+      reportInvalidChunkState(AllocatorAction::Sizing, Ptr);
 
     return getSize(Ptr, &Header);
   }
diff --git a/compiler-rt/lib/scudo/standalone/report.cpp b/compiler-rt/lib/scudo/standalone/report.cpp
index 14a4066d3720..b97a74b078c2 100644
--- a/compiler-rt/lib/scudo/standalone/report.cpp
+++ b/compiler-rt/lib/scudo/standalone/report.cpp
@@ -66,17 +66,16 @@ void NORETURN reportInvalidFlag(const char *FlagType, const char *Value) {
 
 // The checksum of a chunk header is invalid. This could be caused by an
 // {over,under}write of the header, a pointer that is not an actual chunk.
-void NORETURN reportHeaderCorruption(void *Header, void *Ptr) {
+void NORETURN reportHeaderCorruption(void *Header, const void *Ptr) {
   ScopedErrorReport Report;
   Report.append("corrupted chunk header at address %p", Ptr);
   if (*static_cast<Chunk::PackedHeader *>(Header) == 0U) {
     // Header all zero, which could indicate that this might be a pointer that
     // has been double freed but the memory has been released to the kernel.
     Report.append(": chunk header is zero and might indicate memory corruption "
-                  "or a double free\n",
-                  Ptr);
+                  "or a double free\n");
   } else {
-    Report.append(": most likely due to memory corruption\n", Ptr);
+    Report.append(": most likely due to memory corruption\n");
   }
 }
 
@@ -131,13 +130,13 @@ static const char *stringifyAction(AllocatorAction Action) {
 
 // The chunk is not in a state congruent with the operation we want to perform.
 // This is usually the case with a double-free, a realloc of a freed pointer.
-void NORETURN reportInvalidChunkState(AllocatorAction Action, void *Ptr) {
+void NORETURN reportInvalidChunkState(AllocatorAction Action, const void *Ptr) {
   ScopedErrorReport Report;
   Report.append("invalid chunk state when %s address %p\n",
                 stringifyAction(Action), Ptr);
 }
 
-void NORETURN reportMisalignedPointer(AllocatorAction Action, void *Ptr) {
+void NORETURN reportMisalignedPointer(AllocatorAction Action, const void *Ptr) {
   ScopedErrorReport Report;
   Report.append("misaligned pointer when %s address %p\n",
                 stringifyAction(Action), Ptr);
@@ -145,7 +144,7 @@ void NORETURN reportMisalignedPointer(AllocatorAction Action, void *Ptr) {
 
 // The deallocation function used is at odds with the one used to allocate the
 // chunk (eg: new[]/delete or malloc/delete, and so on).
-void NORETURN reportDeallocTypeMismatch(AllocatorAction Action, void *Ptr,
+void NORETURN reportDeallocTypeMismatch(AllocatorAction Action, const void *Ptr,
                                         u8 TypeA, u8 TypeB) {
   ScopedErrorReport Report;
   Report.append("allocation type mismatch when %s address %p (%d vs %d)\n",
@@ -154,7 +153,7 @@ void NORETURN reportDeallocTypeMismatch(AllocatorAction Action, void *Ptr,
 
 // The size specified to the delete operator does not match the one that was
 // passed to new when allocating the chunk.
-void NORETURN reportDeleteSizeMismatch(void *Ptr, uptr Size,
+void NORETURN reportDeleteSizeMismatch(const void *Ptr, uptr Size,
                                        uptr ExpectedSize) {
   ScopedErrorReport Report;
   Report.append(
diff --git a/compiler-rt/lib/scudo/standalone/report.h b/compiler-rt/lib/scudo/standalone/report.h
index c0214b51560e..c397dd3fc9c6 100644
--- a/compiler-rt/lib/scudo/standalone/report.h
+++ b/compiler-rt/lib/scudo/standalone/report.h
@@ -24,7 +24,7 @@ void NORETURN reportRawError(const char *Message);
 void NORETURN reportInvalidFlag(const char *FlagType, const char *Value);
 
 // Chunk header related errors.
-void NORETURN reportHeaderCorruption(void *Header, void *Ptr);
+void NORETURN reportHeaderCorruption(void *Header, const void *Ptr);
 
 // Sanity checks related error.
 void NORETURN reportSanityCheckError(const char *Field);
@@ -41,11 +41,12 @@ enum class AllocatorAction : u8 {
   Reallocating,
   Sizing,
 };
-void NORETURN reportInvalidChunkState(AllocatorAction Action, void *Ptr);
-void NORETURN reportMisalignedPointer(AllocatorAction Action, void *Ptr);
-void NORETURN reportDeallocTypeMismatch(AllocatorAction Action, void *Ptr,
+void NORETURN reportInvalidChunkState(AllocatorAction Action, const void *Ptr);
+void NORETURN reportMisalignedPointer(AllocatorAction Action, const void *Ptr);
+void NORETURN reportDeallocTypeMismatch(AllocatorAction Action, const void *Ptr,
                                         u8 TypeA, u8 TypeB);
-void NORETURN reportDeleteSizeMismatch(void *Ptr, uptr Size, uptr ExpectedSize);
+void NORETURN reportDeleteSizeMismatch(const void *Ptr, uptr Size,
+                                       uptr ExpectedSize);
 
 // C wrappers errors.
 void NORETURN reportAlignmentNotPowerOfTwo(uptr Alignment);

From 13510c07364dc3ac30f34e73c98ac8dc75e7efc7 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245@hotmail.com>
Date: Wed, 18 Jun 2025 12:13:50 -0400
Subject: [PATCH 0830/1322] [libc++] Make list constexpr as part of P3372R3
 (#129799)

This patch makes `std::list` constexpr as part of P3372R3.

Fixes #128659.
---
 libcxx/docs/FeatureTestMacroTable.rst         |   2 +
 libcxx/include/list                           | 548 ++++++++++-------
 libcxx/include/version                        |   2 +
 .../sequences/list/compare.pass.cpp           |  24 +-
 .../sequences/list/compare.three_way.pass.cpp |   6 +-
 .../sequences/list/get_allocator.pass.cpp     |  13 +-
 .../sequences/list/incomplete_type.pass.cpp   |  13 +-
 .../sequences/list/iterators.pass.cpp         |  27 +-
 .../list/list.capacity/empty.pass.cpp         |  13 +-
 .../list/list.capacity/max_size.pass.cpp      |  13 +-
 .../list/list.capacity/resize_size.pass.cpp   |  62 +-
 .../list.capacity/resize_size_value.pass.cpp  |  13 +-
 .../list/list.capacity/size.pass.cpp          |  13 +-
 .../list/list.cons/assign_copy.pass.cpp       |  13 +-
 .../assign_initializer_list.pass.cpp          |  13 +-
 .../list/list.cons/assign_move.pass.cpp       |  13 +-
 .../sequences/list/list.cons/copy.pass.cpp    |  13 +-
 .../list/list.cons/copy_alloc.pass.cpp        |  13 +-
 .../sequences/list/list.cons/default.pass.cpp |  13 +-
 .../list.cons/default_stack_alloc.pass.cpp    |  13 +-
 .../list/list.cons/from_range.pass.cpp        |  20 +-
 .../list/list.cons/initializer_list.pass.cpp  |  13 +-
 .../list.cons/initializer_list_alloc.pass.cpp |  13 +-
 .../list/list.cons/input_iterator.pass.cpp    |  27 +-
 .../sequences/list/list.cons/move.pass.cpp    |  13 +-
 .../list/list.cons/move_alloc.pass.cpp        |  13 +-
 .../op_equal_initializer_list.pass.cpp        |  13 +-
 .../list/list.cons/size_type.pass.cpp         |  39 +-
 .../list/list.cons/size_value_alloc.pass.cpp  |  13 +-
 .../list/list.erasure/erase.pass.cpp          |  27 +-
 .../list/list.erasure/erase_if.pass.cpp       |  27 +-
 .../list/list.modifiers/append_range.pass.cpp |  20 +-
 .../list/list.modifiers/assign_range.pass.cpp |  20 +-
 .../list/list.modifiers/clear.pass.cpp        |  13 +-
 .../list/list.modifiers/emplace.pass.cpp      |  19 +-
 .../list/list.modifiers/emplace_back.pass.cpp |  19 +-
 .../list.modifiers/emplace_front.pass.cpp     |  19 +-
 .../list/list.modifiers/erase_iter.pass.cpp   |  13 +-
 .../list.modifiers/erase_iter_iter.pass.cpp   |  13 +-
 .../insert_iter_initializer_list.pass.cpp     |  13 +-
 .../insert_iter_iter_iter.pass.cpp            |  59 +-
 .../insert_iter_rvalue.pass.cpp               |  13 +-
 .../insert_iter_size_value.pass.cpp           |  33 +-
 .../list.modifiers/insert_iter_value.pass.cpp |  33 +-
 .../list/list.modifiers/insert_range.pass.cpp |  23 +-
 .../list/list.modifiers/pop_back.pass.cpp     |  13 +-
 .../list/list.modifiers/pop_front.pass.cpp    |  13 +-
 .../list.modifiers/prepend_range.pass.cpp     |  20 +-
 .../list/list.modifiers/push_back.pass.cpp    |  13 +-
 .../list.modifiers/push_back_rvalue.pass.cpp  |  13 +-
 .../list/list.modifiers/push_front.pass.cpp   |  13 +-
 .../list.modifiers/push_front_rvalue.pass.cpp |  13 +-
 .../sequences/list/list.ops/merge.pass.cpp    |  13 +-
 .../list/list.ops/merge_comp.pass.cpp         |  13 +-
 .../sequences/list/list.ops/remove.pass.cpp   |  25 +-
 .../list/list.ops/remove_if.pass.cpp          |  25 +-
 .../sequences/list/list.ops/reverse.pass.cpp  |  13 +-
 .../sequences/list/list.ops/sort.pass.cpp     |  19 +-
 .../list/list.ops/sort_comp.pass.cpp          |  57 +-
 .../list/list.ops/splice_pos_list.pass.cpp    |  13 +-
 .../list.ops/splice_pos_list_iter.pass.cpp    |  13 +-
 .../splice_pos_list_iter_iter.pass.cpp        |  13 +-
 .../sequences/list/list.ops/unique.pass.cpp   |  13 +-
 .../list/list.ops/unique_pred.pass.cpp        |  23 +-
 .../sequences/list/list.special/swap.pass.cpp |  13 +-
 .../list/list.special/swap_noexcept.pass.cpp  |  13 +-
 .../list.version.compile.pass.cpp             |  27 +
 .../version.version.compile.pass.cpp          |  27 +
 libcxx/test/support/min_allocator.h           | 561 +++++++++---------
 .../generate_feature_test_macro_components.py |   5 +
 70 files changed, 1555 insertions(+), 797 deletions(-)

diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index 3e6fd643f620..5ebc9bb7dcda 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -422,6 +422,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_forward_list``                       ``202502L``
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_constexpr_list``                               ``202502L``
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_new``                                ``202406L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_queue``                              ``202502L``
diff --git a/libcxx/include/list b/libcxx/include/list
index 98610f59ed74..2896231203d9 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -297,14 +297,20 @@ struct __list_node_base {
   __base_pointer __prev_;
   __base_pointer __next_;
 
-  _LIBCPP_HIDE_FROM_ABI __list_node_base() : __prev_(__self()), __next_(__self()) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_node_base() : __prev_(__self()), __next_(__self()) {}
 
+  _LIBCPP_CONSTEXPR_SINCE_CXX26
   _LIBCPP_HIDE_FROM_ABI explicit __list_node_base(__base_pointer __prev, __base_pointer __next)
       : __prev_(__prev), __next_(__next) {}
 
-  _LIBCPP_HIDE_FROM_ABI __base_pointer __self() { return pointer_traits<__base_pointer>::pointer_to(*this); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __base_pointer __self() {
+    return pointer_traits<__base_pointer>::pointer_to(*this);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI __node_pointer __as_node() { return static_cast<__node_pointer>(__self()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __as_node() {
+    return pointer_traits<__node_pointer>::pointer_to(
+        *static_cast<typename pointer_traits<__node_pointer>::element_type*>(this));
+  }
 };
 
 template <class _Tp, class _VoidPtr>
@@ -319,7 +325,7 @@ private:
   };
 
 public:
-  _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
 #  else
 
 private:
@@ -332,10 +338,14 @@ public:
   typedef __list_node_base<_Tp, _VoidPtr> __base;
   typedef typename __base::__base_pointer __base_pointer;
 
-  _LIBCPP_HIDE_FROM_ABI explicit __list_node(__base_pointer __prev, __base_pointer __next) : __base(__prev, __next) {}
-  _LIBCPP_HIDE_FROM_ABI ~__list_node() {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __list_node(__base_pointer __prev, __base_pointer __next)
+      : __base(__prev, __next) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__list_node() {}
 
-  _LIBCPP_HIDE_FROM_ABI __base_pointer __as_link() { return __base::__self(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __base_pointer __as_link() {
+    return pointer_traits<__base_pointer>::pointer_to(
+        *static_cast<typename pointer_traits<__base_pointer>::element_type*>(std::addressof(*this)));
+  }
 };
 
 template <class _Tp, class _Alloc = allocator<_Tp> >
@@ -352,7 +362,8 @@ class __list_iterator {
 
   __base_pointer __ptr_;
 
-  _LIBCPP_HIDE_FROM_ABI explicit __list_iterator(__base_pointer __p) _NOEXCEPT : __ptr_(__p) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __list_iterator(__base_pointer __p) _NOEXCEPT
+      : __ptr_(__p) {}
 
   template <class, class>
   friend class list;
@@ -368,37 +379,41 @@ public:
   typedef __rebind_pointer_t<_VoidPtr, value_type> pointer;
   typedef typename pointer_traits<pointer>::difference_type difference_type;
 
-  _LIBCPP_HIDE_FROM_ABI __list_iterator() _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_iterator() _NOEXCEPT : __ptr_(nullptr) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __ptr_->__as_node()->__get_value(); }
-  _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    return __ptr_->__as_node()->__get_value();
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
     return pointer_traits<pointer>::pointer_to(__ptr_->__as_node()->__get_value());
   }
 
-  _LIBCPP_HIDE_FROM_ABI __list_iterator& operator++() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_iterator& operator++() {
     __ptr_ = __ptr_->__next_;
     return *this;
   }
-  _LIBCPP_HIDE_FROM_ABI __list_iterator operator++(int) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_iterator operator++(int) {
     __list_iterator __t(*this);
     ++(*this);
     return __t;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __list_iterator& operator--() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_iterator& operator--() {
     __ptr_ = __ptr_->__prev_;
     return *this;
   }
-  _LIBCPP_HIDE_FROM_ABI __list_iterator operator--(int) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_iterator operator--(int) {
     __list_iterator __t(*this);
     --(*this);
     return __t;
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI bool operator==(const __list_iterator& __x, const __list_iterator& __y) {
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+  operator==(const __list_iterator& __x, const __list_iterator& __y) {
     return __x.__ptr_ == __y.__ptr_;
   }
-  friend _LIBCPP_HIDE_FROM_ABI bool operator!=(const __list_iterator& __x, const __list_iterator& __y) {
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+  operator!=(const __list_iterator& __x, const __list_iterator& __y) {
     return !(__x == __y);
   }
 };
@@ -410,7 +425,8 @@ class __list_const_iterator {
 
   __base_pointer __ptr_;
 
-  _LIBCPP_HIDE_FROM_ABI explicit __list_const_iterator(__base_pointer __p) _NOEXCEPT : __ptr_(__p) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __list_const_iterator(__base_pointer __p) _NOEXCEPT
+      : __ptr_(__p) {}
 
   template <class, class>
   friend class list;
@@ -424,39 +440,43 @@ public:
   typedef __rebind_pointer_t<_VoidPtr, const value_type> pointer;
   typedef typename pointer_traits<pointer>::difference_type difference_type;
 
-  _LIBCPP_HIDE_FROM_ABI __list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {}
-  _LIBCPP_HIDE_FROM_ABI __list_const_iterator(const __list_iterator<_Tp, _VoidPtr>& __p) _NOEXCEPT
-      : __ptr_(__p.__ptr_) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  __list_const_iterator(const __list_iterator<_Tp, _VoidPtr>& __p) _NOEXCEPT : __ptr_(__p.__ptr_) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __ptr_->__as_node()->__get_value(); }
-  _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    return __ptr_->__as_node()->__get_value();
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
     return pointer_traits<pointer>::pointer_to(__ptr_->__as_node()->__get_value());
   }
 
-  _LIBCPP_HIDE_FROM_ABI __list_const_iterator& operator++() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_const_iterator& operator++() {
     __ptr_ = __ptr_->__next_;
     return *this;
   }
-  _LIBCPP_HIDE_FROM_ABI __list_const_iterator operator++(int) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_const_iterator operator++(int) {
     __list_const_iterator __t(*this);
     ++(*this);
     return __t;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __list_const_iterator& operator--() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_const_iterator& operator--() {
     __ptr_ = __ptr_->__prev_;
     return *this;
   }
-  _LIBCPP_HIDE_FROM_ABI __list_const_iterator operator--(int) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_const_iterator operator--(int) {
     __list_const_iterator __t(*this);
     --(*this);
     return __t;
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI bool operator==(const __list_const_iterator& __x, const __list_const_iterator& __y) {
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+  operator==(const __list_const_iterator& __x, const __list_const_iterator& __y) {
     return __x.__ptr_ == __y.__ptr_;
   }
-  friend _LIBCPP_HIDE_FROM_ABI bool operator!=(const __list_const_iterator& __x, const __list_const_iterator& __y) {
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+  operator!=(const __list_const_iterator& __x, const __list_const_iterator& __y) {
     return !(__x == __y);
   }
 };
@@ -497,43 +517,49 @@ protected:
   __node_base __end_;
   _LIBCPP_COMPRESSED_PAIR(size_type, __size_, __node_allocator, __node_alloc_);
 
-  _LIBCPP_HIDE_FROM_ABI __base_pointer __end_as_link() const _NOEXCEPT {
-    return const_cast<__node_base&>(__end_).__self();
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __base_pointer __end_as_link() const _NOEXCEPT {
+    return pointer_traits<__base_pointer>::pointer_to(const_cast<__node_base&>(__end_));
   }
 
-  _LIBCPP_HIDE_FROM_ABI size_type __node_alloc_max_size() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type __node_alloc_max_size() const _NOEXCEPT {
     return __node_alloc_traits::max_size(__node_alloc_);
   }
-  _LIBCPP_HIDE_FROM_ABI static void __unlink_nodes(__base_pointer __f, __base_pointer __l) _NOEXCEPT;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI static void
+  __unlink_nodes(__base_pointer __f, __base_pointer __l) _NOEXCEPT;
 
-  _LIBCPP_HIDE_FROM_ABI __list_imp() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value);
-  _LIBCPP_HIDE_FROM_ABI __list_imp(const allocator_type& __a);
-  _LIBCPP_HIDE_FROM_ABI __list_imp(const __node_allocator& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_imp()
+      _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_imp(const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_imp(const __node_allocator& __a);
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI __list_imp(__node_allocator&& __a) _NOEXCEPT;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __list_imp(__node_allocator&& __a) _NOEXCEPT;
 #  endif
-  _LIBCPP_HIDE_FROM_ABI ~__list_imp();
-  _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT;
-  _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __size_ == 0; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__list_imp();
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __size_ == 0; }
 
-  _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__end_.__next_); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__end_.__next_); }
-  _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(__end_as_link()); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(__end_as_link()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__end_.__next_); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT {
+    return const_iterator(__end_.__next_);
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(__end_as_link()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT {
+    return const_iterator(__end_as_link());
+  }
 
-  _LIBCPP_HIDE_FROM_ABI void swap(__list_imp& __c)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(__list_imp& __c)
 #  if _LIBCPP_STD_VER >= 14
       _NOEXCEPT;
 #  else
       _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<allocator_type>);
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __list_imp& __c) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __list_imp& __c) {
     __copy_assign_alloc(
         __c, integral_constant<bool, __node_alloc_traits::propagate_on_container_copy_assignment::value>());
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp& __c)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp& __c)
       _NOEXCEPT_(!__node_alloc_traits::propagate_on_container_move_assignment::value ||
                  is_nothrow_move_assignable<__node_allocator>::value) {
     __move_assign_alloc(
@@ -541,7 +567,8 @@ protected:
   }
 
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI __node_pointer __create_node(__base_pointer __prev, __base_pointer __next, _Args&&... __args) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer
+  __create_node(__base_pointer __prev, __base_pointer __next, _Args&&... __args) {
     __allocation_guard<__node_allocator> __guard(__node_alloc_, 1);
     // Begin the lifetime of the node itself. Note that this doesn't begin the lifetime of the value
     // held inside the node, since we need to use the allocator's construct() method for that.
@@ -557,7 +584,7 @@ protected:
     return __guard.__release_ptr();
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
     // For the same reason as above, we use the allocator's destroy() method for the value_type,
     // but not for the node itself.
     __node_alloc_traits::destroy(__node_alloc_, std::addressof(__node->__get_value()));
@@ -566,54 +593,57 @@ protected:
   }
 
 private:
-  _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __list_imp& __c, true_type) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __list_imp& __c, true_type) {
     if (__node_alloc_ != __c.__node_alloc_)
       clear();
     __node_alloc_ = __c.__node_alloc_;
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __list_imp&, false_type) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __list_imp&, false_type) {}
 
-  _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp& __c, true_type)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp& __c, true_type)
       _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value) {
     __node_alloc_ = std::move(__c.__node_alloc_);
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp&, false_type) _NOEXCEPT {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp&, false_type) _NOEXCEPT {}
 };
 
 // Unlink nodes [__f, __l]
 template <class _Tp, class _Alloc>
-inline void __list_imp<_Tp, _Alloc>::__unlink_nodes(__base_pointer __f, __base_pointer __l) _NOEXCEPT {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void
+__list_imp<_Tp, _Alloc>::__unlink_nodes(__base_pointer __f, __base_pointer __l) _NOEXCEPT {
   __f->__prev_->__next_ = __l->__next_;
   __l->__next_->__prev_ = __f->__prev_;
 }
 
 template <class _Tp, class _Alloc>
-inline __list_imp<_Tp, _Alloc>::__list_imp() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __list_imp<_Tp, _Alloc>::__list_imp()
+    _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value)
     : __size_(0) {}
 
 template <class _Tp, class _Alloc>
-inline __list_imp<_Tp, _Alloc>::__list_imp(const allocator_type& __a)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __list_imp<_Tp, _Alloc>::__list_imp(const allocator_type& __a)
     : __size_(0), __node_alloc_(__node_allocator(__a)) {}
 
 template <class _Tp, class _Alloc>
-inline __list_imp<_Tp, _Alloc>::__list_imp(const __node_allocator& __a) : __size_(0), __node_alloc_(__a) {}
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __list_imp<_Tp, _Alloc>::__list_imp(const __node_allocator& __a)
+    : __size_(0), __node_alloc_(__a) {}
 
 #  ifndef _LIBCPP_CXX03_LANG
 template <class _Tp, class _Alloc>
-inline __list_imp<_Tp, _Alloc>::__list_imp(__node_allocator&& __a) _NOEXCEPT
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __list_imp<_Tp, _Alloc>::__list_imp(__node_allocator&& __a) _NOEXCEPT
     : __size_(0),
       __node_alloc_(std::move(__a)) {}
 #  endif
 
 template <class _Tp, class _Alloc>
-__list_imp<_Tp, _Alloc>::~__list_imp() {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 __list_imp<_Tp, _Alloc>::~__list_imp() {
   clear();
 }
 
 template <class _Tp, class _Alloc>
-void __list_imp<_Tp, _Alloc>::clear() _NOEXCEPT {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void __list_imp<_Tp, _Alloc>::clear() _NOEXCEPT {
   if (!empty()) {
     __base_pointer __f = __end_.__next_;
     __base_pointer __l = __end_as_link();
@@ -628,7 +658,7 @@ void __list_imp<_Tp, _Alloc>::clear() _NOEXCEPT {
 }
 
 template <class _Tp, class _Alloc>
-void __list_imp<_Tp, _Alloc>::swap(__list_imp& __c)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void __list_imp<_Tp, _Alloc>::swap(__list_imp& __c)
 #  if _LIBCPP_STD_VER >= 14
     _NOEXCEPT
 #  else
@@ -686,170 +716,204 @@ public:
   typedef void __remove_return_type;
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI list() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {}
-  _LIBCPP_HIDE_FROM_ABI explicit list(const allocator_type& __a) : __base(__a) {}
-  _LIBCPP_HIDE_FROM_ABI explicit list(size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list()
+      _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit list(const allocator_type& __a) : __base(__a) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit list(size_type __n);
 #  if _LIBCPP_STD_VER >= 14
-  _LIBCPP_HIDE_FROM_ABI explicit list(size_type __n, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit list(size_type __n, const allocator_type& __a);
 #  endif
-  _LIBCPP_HIDE_FROM_ABI list(size_type __n, const value_type& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list(size_type __n, const value_type& __x);
   template <__enable_if_t<__is_allocator<_Alloc>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI list(size_type __n, const value_type& __x, const allocator_type& __a) : __base(__a) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  list(size_type __n, const value_type& __x, const allocator_type& __a)
+      : __base(__a) {
     for (; __n > 0; --__n)
       push_back(__x);
   }
 
   template <class _InpIter, __enable_if_t<__has_input_iterator_category<_InpIter>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI list(_InpIter __f, _InpIter __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list(_InpIter __f, _InpIter __l);
 
   template <class _InpIter, __enable_if_t<__has_input_iterator_category<_InpIter>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI list(_InpIter __f, _InpIter __l, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list(_InpIter __f, _InpIter __l, const allocator_type& __a);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type())
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type())
       : __base(__a) {
     prepend_range(std::forward<_Range>(__range));
   }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI list(const list& __c);
-  _LIBCPP_HIDE_FROM_ABI list(const list& __c, const __type_identity_t<allocator_type>& __a);
-  _LIBCPP_HIDE_FROM_ABI list& operator=(const list& __c);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list(const list& __c);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  list(const list& __c, const __type_identity_t<allocator_type>& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list& operator=(const list& __c);
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI list(initializer_list<value_type> __il);
-  _LIBCPP_HIDE_FROM_ABI list(initializer_list<value_type> __il, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list(initializer_list<value_type> __il);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  list(initializer_list<value_type> __il, const allocator_type& __a);
 
-  _LIBCPP_HIDE_FROM_ABI list(list&& __c) _NOEXCEPT_(is_nothrow_move_constructible<__node_allocator>::value);
-  _LIBCPP_HIDE_FROM_ABI list(list&& __c, const __type_identity_t<allocator_type>& __a);
-  _LIBCPP_HIDE_FROM_ABI list& operator=(list&& __c) noexcept(
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list(list&& __c)
+      _NOEXCEPT_(is_nothrow_move_constructible<__node_allocator>::value);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list(list&& __c, const __type_identity_t<allocator_type>& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list& operator=(list&& __c) noexcept(
       (__node_alloc_traits::propagate_on_container_move_assignment::value &&
        is_nothrow_move_assignable<__node_allocator>::value) ||
       allocator_traits<allocator_type>::is_always_equal::value);
 
-  _LIBCPP_HIDE_FROM_ABI list& operator=(initializer_list<value_type> __il) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI list& operator=(initializer_list<value_type> __il) {
     assign(__il.begin(), __il.end());
     return *this;
   }
 
-  _LIBCPP_HIDE_FROM_ABI void assign(initializer_list<value_type> __il) { assign(__il.begin(), __il.end()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(initializer_list<value_type> __il) {
+    assign(__il.begin(), __il.end());
+  }
 #  endif // _LIBCPP_CXX03_LANG
 
   template <class _InpIter, __enable_if_t<__has_input_iterator_category<_InpIter>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI void assign(_InpIter __f, _InpIter __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(_InpIter __f, _InpIter __l);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) {
     __assign_with_sentinel(ranges::begin(__range), ranges::end(__range));
   }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __x);
 
-  _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT;
 
-  _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return this->__size_; }
-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __base::empty(); }
-  _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return this->__size_; }
+  [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT {
+    return __base::empty();
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT {
     return std::min<size_type>(this->__node_alloc_max_size(), numeric_limits<difference_type >::max());
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __base::begin(); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __base::begin(); }
-  _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __base::end(); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __base::end(); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return __base::begin(); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return __base::end(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __base::begin(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __base::begin(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __base::end(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __base::end(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT {
+    return __base::begin();
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return __base::end(); }
 
-  _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); }
-  _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { return const_reverse_iterator(end()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return const_reverse_iterator(begin()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT {
+    return reverse_iterator(end());
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT {
+    return const_reverse_iterator(end());
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT {
+    return reverse_iterator(begin());
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT {
+    return const_reverse_iterator(begin());
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT {
+    return const_reverse_iterator(end());
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT {
+    return const_reverse_iterator(begin());
+  }
 
-  _LIBCPP_HIDE_FROM_ABI reference front() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list");
     return __base::__end_.__next_->__as_node()->__get_value();
   }
-  _LIBCPP_HIDE_FROM_ABI const_reference front() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list");
     return __base::__end_.__next_->__as_node()->__get_value();
   }
-  _LIBCPP_HIDE_FROM_ABI reference back() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference back() {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list");
     return __base::__end_.__prev_->__as_node()->__get_value();
   }
-  _LIBCPP_HIDE_FROM_ABI const_reference back() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference back() const {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list");
     return __base::__end_.__prev_->__as_node()->__get_value();
   }
 
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __x);
-  _LIBCPP_HIDE_FROM_ABI void push_back(value_type&& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_back(value_type&& __x);
 
 #    if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) {
     insert_range(begin(), std::forward<_Range>(__range));
   }
 
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI void append_range(_Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void append_range(_Range&& __range) {
     insert_range(end(), std::forward<_Range>(__range));
   }
 #    endif
 
   template <class... _Args>
+  _LIBCPP_CONSTEXPR_SINCE_CXX26
 #    if _LIBCPP_STD_VER >= 17
-  _LIBCPP_HIDE_FROM_ABI reference emplace_front(_Args&&... __args);
+      _LIBCPP_HIDE_FROM_ABI reference
+      emplace_front(_Args&&... __args);
 #    else
-  _LIBCPP_HIDE_FROM_ABI void emplace_front(_Args&&... __args);
+      _LIBCPP_HIDE_FROM_ABI void
+      emplace_front(_Args&&... __args);
 #    endif
   template <class... _Args>
+  _LIBCPP_CONSTEXPR_SINCE_CXX26
 #    if _LIBCPP_STD_VER >= 17
-  _LIBCPP_HIDE_FROM_ABI reference emplace_back(_Args&&... __args);
+      _LIBCPP_HIDE_FROM_ABI reference
+      emplace_back(_Args&&... __args);
 #    else
-  _LIBCPP_HIDE_FROM_ABI void emplace_back(_Args&&... __args);
+      _LIBCPP_HIDE_FROM_ABI void
+      emplace_back(_Args&&... __args);
 #    endif
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __p, _Args&&... __args);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __p, _Args&&... __args);
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, value_type&& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, value_type&& __x);
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, initializer_list<value_type> __il) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert(const_iterator __p, initializer_list<value_type> __il) {
     return insert(__p, __il.begin(), __il.end());
   }
 #  endif // _LIBCPP_CXX03_LANG
 
-  _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __x);
-  _LIBCPP_HIDE_FROM_ABI void push_back(const value_type& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_back(const value_type& __x);
 
 #  ifndef _LIBCPP_CXX03_LANG
   template <class _Arg>
-  _LIBCPP_HIDE_FROM_ABI void __emplace_back(_Arg&& __arg) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __emplace_back(_Arg&& __arg) {
     emplace_back(std::forward<_Arg>(__arg));
   }
 #  else
   _LIBCPP_HIDE_FROM_ABI void __emplace_back(value_type const& __arg) { push_back(__arg); }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, const value_type& __x);
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, size_type __n, const value_type& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, const value_type& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert(const_iterator __p, size_type __n, const value_type& __x);
 
   template <class _InpIter, __enable_if_t<__has_input_iterator_category<_InpIter>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, _InpIter __f, _InpIter __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, _InpIter __f, _InpIter __l);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI iterator insert_range(const_iterator __position, _Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert_range(const_iterator __position, _Range&& __range) {
     return __insert_with_sentinel(__position, ranges::begin(__range), ranges::end(__range));
   }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI void swap(list& __c)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(list& __c)
 #  if _LIBCPP_STD_VER >= 14
       _NOEXCEPT
 #  else
@@ -858,72 +922,80 @@ public:
   {
     __base::swap(__c);
   }
-  _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); }
 
-  _LIBCPP_HIDE_FROM_ABI void pop_front();
-  _LIBCPP_HIDE_FROM_ABI void pop_back();
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void pop_front();
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void pop_back();
 
-  _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __p);
-  _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __f, const_iterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __p);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __f, const_iterator __l);
 
-  _LIBCPP_HIDE_FROM_ABI void resize(size_type __n);
-  _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __x);
 
-  _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list& __c);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list& __c);
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list&& __c) { splice(__p, __c); }
-  _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list&& __c, const_iterator __i) { splice(__p, __c, __i); }
-  _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list&& __c, const_iterator __f, const_iterator __l) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list&& __c) { splice(__p, __c); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list&& __c, const_iterator __i) {
+    splice(__p, __c, __i);
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  splice(const_iterator __p, list&& __c, const_iterator __f, const_iterator __l) {
     splice(__p, __c, __f, __l);
   }
 #  endif
-  _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list& __c, const_iterator __i);
-  _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list& __c, const_iterator __f, const_iterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice(const_iterator __p, list& __c, const_iterator __i);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  splice(const_iterator __p, list& __c, const_iterator __f, const_iterator __l);
 
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __x);
   template <class _Pred>
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Pred __pred);
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Pred __pred);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); }
   template <class _BinaryPred>
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPred __binary_pred);
-  _LIBCPP_HIDE_FROM_ABI void merge(list& __c);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPred __binary_pred);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(list& __c);
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void merge(list&& __c) { merge(__c); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(list&& __c) { merge(__c); }
 
   template <class _Comp>
-  _LIBCPP_HIDE_FROM_ABI void merge(list&& __c, _Comp __comp) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(list&& __c, _Comp __comp) {
     merge(__c, __comp);
   }
 #  endif
   template <class _Comp>
-  _LIBCPP_HIDE_FROM_ABI void merge(list& __c, _Comp __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(list& __c, _Comp __comp);
 
-  _LIBCPP_HIDE_FROM_ABI void sort();
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort();
   template <class _Comp>
-  _LIBCPP_HIDE_FROM_ABI void sort(_Comp __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort(_Comp __comp);
 
-  _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT;
 
-  _LIBCPP_HIDE_FROM_ABI bool __invariants() const;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool __invariants() const;
 
 private:
   template <class _Iterator, class _Sentinel>
-  _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __f, _Sentinel __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __f, _Sentinel __l);
 
   template <class _Iterator, class _Sentinel>
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_with_sentinel(const_iterator __p, _Iterator __f, _Sentinel __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  __insert_with_sentinel(const_iterator __p, _Iterator __f, _Sentinel __l);
 
-  _LIBCPP_HIDE_FROM_ABI static void __link_nodes(__base_pointer __p, __base_pointer __f, __base_pointer __l);
-  _LIBCPP_HIDE_FROM_ABI void __link_nodes_at_front(__base_pointer __f, __base_pointer __l);
-  _LIBCPP_HIDE_FROM_ABI void __link_nodes_at_back(__base_pointer __f, __base_pointer __l);
-  _LIBCPP_HIDE_FROM_ABI iterator __iterator(size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI static void
+  __link_nodes(__base_pointer __p, __base_pointer __f, __base_pointer __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  __link_nodes_at_front(__base_pointer __f, __base_pointer __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __link_nodes_at_back(__base_pointer __f, __base_pointer __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator __iterator(size_type __n);
   // TODO: Make this _LIBCPP_HIDE_FROM_ABI
   template <class _Comp>
-  _LIBCPP_HIDDEN static iterator __sort(iterator __f1, iterator __e2, size_type __n, _Comp& __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDDEN static iterator
+  __sort(iterator __f1, iterator __e2, size_type __n, _Comp& __comp);
 
-  _LIBCPP_HIDE_FROM_ABI void __move_assign(list& __c, true_type)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(list& __c, true_type)
       _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value);
-  _LIBCPP_HIDE_FROM_ABI void __move_assign(list& __c, false_type);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(list& __c, false_type);
 };
 
 #  if _LIBCPP_STD_VER >= 17
@@ -949,7 +1021,8 @@ list(from_range_t, _Range&&, _Alloc = _Alloc()) -> list<ranges::range_value_t<_R
 
 // Link in nodes [__f, __l] just prior to __p
 template <class _Tp, class _Alloc>
-inline void list<_Tp, _Alloc>::__link_nodes(__base_pointer __p, __base_pointer __f, __base_pointer __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void
+list<_Tp, _Alloc>::__link_nodes(__base_pointer __p, __base_pointer __f, __base_pointer __l) {
   __p->__prev_->__next_ = __f;
   __f->__prev_          = __p->__prev_;
   __p->__prev_          = __l;
@@ -958,7 +1031,8 @@ inline void list<_Tp, _Alloc>::__link_nodes(__base_pointer __p, __base_pointer _
 
 // Link in nodes [__f, __l] at the front of the list
 template <class _Tp, class _Alloc>
-inline void list<_Tp, _Alloc>::__link_nodes_at_front(__base_pointer __f, __base_pointer __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void
+list<_Tp, _Alloc>::__link_nodes_at_front(__base_pointer __f, __base_pointer __l) {
   __f->__prev_           = __base::__end_as_link();
   __l->__next_           = __base::__end_.__next_;
   __l->__next_->__prev_  = __l;
@@ -967,7 +1041,8 @@ inline void list<_Tp, _Alloc>::__link_nodes_at_front(__base_pointer __f, __base_
 
 // Link in nodes [__f, __l] at the back of the list
 template <class _Tp, class _Alloc>
-inline void list<_Tp, _Alloc>::__link_nodes_at_back(__base_pointer __f, __base_pointer __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void
+list<_Tp, _Alloc>::__link_nodes_at_back(__base_pointer __f, __base_pointer __l) {
   __l->__next_           = __base::__end_as_link();
   __f->__prev_           = __base::__end_.__prev_;
   __f->__prev_->__next_  = __f;
@@ -975,12 +1050,12 @@ inline void list<_Tp, _Alloc>::__link_nodes_at_back(__base_pointer __f, __base_p
 }
 
 template <class _Tp, class _Alloc>
-inline typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::__iterator(size_type __n) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::__iterator(size_type __n) {
   return __n <= this->__size_ / 2 ? std::next(begin(), __n) : std::prev(end(), this->__size_ - __n);
 }
 
 template <class _Tp, class _Alloc>
-list<_Tp, _Alloc>::list(size_type __n) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 list<_Tp, _Alloc>::list(size_type __n) {
   for (; __n > 0; --__n)
 #  ifndef _LIBCPP_CXX03_LANG
     emplace_back();
@@ -991,41 +1066,43 @@ list<_Tp, _Alloc>::list(size_type __n) {
 
 #  if _LIBCPP_STD_VER >= 14
 template <class _Tp, class _Alloc>
-list<_Tp, _Alloc>::list(size_type __n, const allocator_type& __a) : __base(__a) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 list<_Tp, _Alloc>::list(size_type __n, const allocator_type& __a) : __base(__a) {
   for (; __n > 0; --__n)
     emplace_back();
 }
 #  endif
 
 template <class _Tp, class _Alloc>
-list<_Tp, _Alloc>::list(size_type __n, const value_type& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 list<_Tp, _Alloc>::list(size_type __n, const value_type& __x) {
   for (; __n > 0; --__n)
     push_back(__x);
 }
 
 template <class _Tp, class _Alloc>
 template <class _InpIter, __enable_if_t<__has_input_iterator_category<_InpIter>::value, int> >
-list<_Tp, _Alloc>::list(_InpIter __f, _InpIter __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 list<_Tp, _Alloc>::list(_InpIter __f, _InpIter __l) {
   for (; __f != __l; ++__f)
     __emplace_back(*__f);
 }
 
 template <class _Tp, class _Alloc>
 template <class _InpIter, __enable_if_t<__has_input_iterator_category<_InpIter>::value, int> >
-list<_Tp, _Alloc>::list(_InpIter __f, _InpIter __l, const allocator_type& __a) : __base(__a) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 list<_Tp, _Alloc>::list(_InpIter __f, _InpIter __l, const allocator_type& __a)
+    : __base(__a) {
   for (; __f != __l; ++__f)
     __emplace_back(*__f);
 }
 
 template <class _Tp, class _Alloc>
-list<_Tp, _Alloc>::list(const list& __c)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 list<_Tp, _Alloc>::list(const list& __c)
     : __base(__node_alloc_traits::select_on_container_copy_construction(__c.__node_alloc_)) {
   for (const_iterator __i = __c.begin(), __e = __c.end(); __i != __e; ++__i)
     push_back(*__i);
 }
 
 template <class _Tp, class _Alloc>
-list<_Tp, _Alloc>::list(const list& __c, const __type_identity_t<allocator_type>& __a) : __base(__a) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 list<_Tp, _Alloc>::list(const list& __c, const __type_identity_t<allocator_type>& __a)
+    : __base(__a) {
   for (const_iterator __i = __c.begin(), __e = __c.end(); __i != __e; ++__i)
     push_back(*__i);
 }
@@ -1033,25 +1110,28 @@ list<_Tp, _Alloc>::list(const list& __c, const __type_identity_t<allocator_type>
 #  ifndef _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-list<_Tp, _Alloc>::list(initializer_list<value_type> __il, const allocator_type& __a) : __base(__a) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 list<_Tp, _Alloc>::list(initializer_list<value_type> __il, const allocator_type& __a)
+    : __base(__a) {
   for (typename initializer_list<value_type>::const_iterator __i = __il.begin(), __e = __il.end(); __i != __e; ++__i)
     push_back(*__i);
 }
 
 template <class _Tp, class _Alloc>
-list<_Tp, _Alloc>::list(initializer_list<value_type> __il) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 list<_Tp, _Alloc>::list(initializer_list<value_type> __il) {
   for (typename initializer_list<value_type>::const_iterator __i = __il.begin(), __e = __il.end(); __i != __e; ++__i)
     push_back(*__i);
 }
 
 template <class _Tp, class _Alloc>
-inline list<_Tp, _Alloc>::list(list&& __c) noexcept(is_nothrow_move_constructible<__node_allocator>::value)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline list<_Tp, _Alloc>::list(list&& __c) noexcept(
+    is_nothrow_move_constructible<__node_allocator>::value)
     : __base(std::move(__c.__node_alloc_)) {
   splice(end(), __c);
 }
 
 template <class _Tp, class _Alloc>
-inline list<_Tp, _Alloc>::list(list&& __c, const __type_identity_t<allocator_type>& __a) : __base(__a) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline list<_Tp, _Alloc>::list(list&& __c, const __type_identity_t<allocator_type>& __a)
+    : __base(__a) {
   if (__a == __c.get_allocator())
     splice(end(), __c);
   else {
@@ -1061,7 +1141,7 @@ inline list<_Tp, _Alloc>::list(list&& __c, const __type_identity_t<allocator_typ
 }
 
 template <class _Tp, class _Alloc>
-inline list<_Tp, _Alloc>& list<_Tp, _Alloc>::operator=(list&& __c) noexcept(
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline list<_Tp, _Alloc>& list<_Tp, _Alloc>::operator=(list&& __c) noexcept(
     (__node_alloc_traits::propagate_on_container_move_assignment::value &&
      is_nothrow_move_assignable<__node_allocator>::value) ||
     allocator_traits<allocator_type>::is_always_equal::value) {
@@ -1070,7 +1150,7 @@ inline list<_Tp, _Alloc>& list<_Tp, _Alloc>::operator=(list&& __c) noexcept(
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::__move_assign(list& __c, false_type) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::__move_assign(list& __c, false_type) {
   if (this->__node_alloc_ != __c.__node_alloc_) {
     typedef move_iterator<iterator> _Ip;
     assign(_Ip(__c.begin()), _Ip(__c.end()));
@@ -1079,8 +1159,8 @@ void list<_Tp, _Alloc>::__move_assign(list& __c, false_type) {
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::__move_assign(list& __c,
-                                      true_type) noexcept(is_nothrow_move_assignable<__node_allocator>::value) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void
+list<_Tp, _Alloc>::__move_assign(list& __c, true_type) noexcept(is_nothrow_move_assignable<__node_allocator>::value) {
   clear();
   __base::__move_assign_alloc(__c);
   splice(end(), __c);
@@ -1089,7 +1169,7 @@ void list<_Tp, _Alloc>::__move_assign(list& __c,
 #  endif // _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-inline list<_Tp, _Alloc>& list<_Tp, _Alloc>::operator=(const list& __c) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline list<_Tp, _Alloc>& list<_Tp, _Alloc>::operator=(const list& __c) {
   if (this != std::addressof(__c)) {
     __base::__copy_assign_alloc(__c);
     assign(__c.begin(), __c.end());
@@ -1099,13 +1179,14 @@ inline list<_Tp, _Alloc>& list<_Tp, _Alloc>::operator=(const list& __c) {
 
 template <class _Tp, class _Alloc>
 template <class _InpIter, __enable_if_t<__has_input_iterator_category<_InpIter>::value, int> >
-void list<_Tp, _Alloc>::assign(_InpIter __f, _InpIter __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::assign(_InpIter __f, _InpIter __l) {
   __assign_with_sentinel(__f, __l);
 }
 
 template <class _Tp, class _Alloc>
 template <class _Iterator, class _Sentinel>
-_LIBCPP_HIDE_FROM_ABI void list<_Tp, _Alloc>::__assign_with_sentinel(_Iterator __f, _Sentinel __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+list<_Tp, _Alloc>::__assign_with_sentinel(_Iterator __f, _Sentinel __l) {
   iterator __i = begin();
   iterator __e = end();
   for (; __f != __l && __i != __e; ++__f, (void)++__i)
@@ -1117,7 +1198,7 @@ _LIBCPP_HIDE_FROM_ABI void list<_Tp, _Alloc>::__assign_with_sentinel(_Iterator _
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::assign(size_type __n, const value_type& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::assign(size_type __n, const value_type& __x) {
   iterator __i = begin();
   iterator __e = end();
   for (; __n > 0 && __i != __e; --__n, (void)++__i)
@@ -1129,12 +1210,13 @@ void list<_Tp, _Alloc>::assign(size_type __n, const value_type& __x) {
 }
 
 template <class _Tp, class _Alloc>
-inline _Alloc list<_Tp, _Alloc>::get_allocator() const _NOEXCEPT {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _Alloc list<_Tp, _Alloc>::get_allocator() const _NOEXCEPT {
   return allocator_type(this->__node_alloc_);
 }
 
 template <class _Tp, class _Alloc>
-typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::insert(const_iterator __p, const value_type& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::iterator
+list<_Tp, _Alloc>::insert(const_iterator __p, const value_type& __x) {
   __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, __x);
   __link_nodes(__p.__ptr_, __node->__as_link(), __node->__as_link());
   ++this->__size_;
@@ -1142,7 +1224,7 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::insert(const_iterator __
 }
 
 template <class _Tp, class _Alloc>
-typename list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::iterator
 list<_Tp, _Alloc>::insert(const_iterator __p, size_type __n, const value_type& __x) {
   iterator __r(__p.__ptr_);
   if (__n > 0) {
@@ -1178,13 +1260,14 @@ list<_Tp, _Alloc>::insert(const_iterator __p, size_type __n, const value_type& _
 
 template <class _Tp, class _Alloc>
 template <class _InpIter, __enable_if_t<__has_input_iterator_category<_InpIter>::value, int> >
-typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::insert(const_iterator __p, _InpIter __f, _InpIter __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::iterator
+list<_Tp, _Alloc>::insert(const_iterator __p, _InpIter __f, _InpIter __l) {
   return __insert_with_sentinel(__p, __f, __l);
 }
 
 template <class _Tp, class _Alloc>
 template <class _Iterator, class _Sentinel>
-_LIBCPP_HIDE_FROM_ABI typename list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI typename list<_Tp, _Alloc>::iterator
 list<_Tp, _Alloc>::__insert_with_sentinel(const_iterator __p, _Iterator __f, _Sentinel __l) {
   iterator __r(__p.__ptr_);
   if (__f != __l) {
@@ -1219,7 +1302,7 @@ list<_Tp, _Alloc>::__insert_with_sentinel(const_iterator __p, _Iterator __f, _Se
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::push_front(const value_type& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::push_front(const value_type& __x) {
   __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, __x);
   __base_pointer __nl   = __node->__as_link();
   __link_nodes_at_front(__nl, __nl);
@@ -1227,7 +1310,7 @@ void list<_Tp, _Alloc>::push_front(const value_type& __x) {
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::push_back(const value_type& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::push_back(const value_type& __x) {
   __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, __x);
   __base_pointer __nl   = __node->__as_link();
   __link_nodes_at_back(__nl, __nl);
@@ -1237,7 +1320,7 @@ void list<_Tp, _Alloc>::push_back(const value_type& __x) {
 #  ifndef _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::push_front(value_type&& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::push_front(value_type&& __x) {
   __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::move(__x));
   __base_pointer __nl   = __node->__as_link();
   __link_nodes_at_front(__nl, __nl);
@@ -1245,7 +1328,7 @@ void list<_Tp, _Alloc>::push_front(value_type&& __x) {
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::push_back(value_type&& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::push_back(value_type&& __x) {
   __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::move(__x));
   __base_pointer __nl   = __node->__as_link();
   __link_nodes_at_back(__nl, __nl);
@@ -1254,12 +1337,13 @@ void list<_Tp, _Alloc>::push_back(value_type&& __x) {
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 #    if _LIBCPP_STD_VER >= 17
-typename list<_Tp, _Alloc>::reference
+    typename list<_Tp, _Alloc>::reference
 #    else
-void
+    void
 #    endif
-list<_Tp, _Alloc>::emplace_front(_Args&&... __args) {
+    list<_Tp, _Alloc>::emplace_front(_Args&&... __args) {
   __node_pointer __node =
       this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::forward<_Args>(__args)...);
   __base_pointer __nl = __node->__as_link();
@@ -1272,12 +1356,13 @@ list<_Tp, _Alloc>::emplace_front(_Args&&... __args) {
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 #    if _LIBCPP_STD_VER >= 17
-typename list<_Tp, _Alloc>::reference
+    typename list<_Tp, _Alloc>::reference
 #    else
-void
+    void
 #    endif
-list<_Tp, _Alloc>::emplace_back(_Args&&... __args) {
+    list<_Tp, _Alloc>::emplace_back(_Args&&... __args) {
   __node_pointer __node =
       this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::forward<_Args>(__args)...);
   __base_pointer __nl = __node->__as_link();
@@ -1290,7 +1375,8 @@ list<_Tp, _Alloc>::emplace_back(_Args&&... __args) {
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
-typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::emplace(const_iterator __p, _Args&&... __args) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::iterator
+list<_Tp, _Alloc>::emplace(const_iterator __p, _Args&&... __args) {
   __node_pointer __node =
       this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::forward<_Args>(__args)...);
   __base_pointer __nl = __node->__as_link();
@@ -1300,7 +1386,8 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::emplace(const_iterator _
 }
 
 template <class _Tp, class _Alloc>
-typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::insert(const_iterator __p, value_type&& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::iterator
+list<_Tp, _Alloc>::insert(const_iterator __p, value_type&& __x) {
   __node_pointer __node = this->__create_node(/* prev = */ nullptr, /* next = */ nullptr, std::move(__x));
   __base_pointer __nl   = __node->__as_link();
   __link_nodes(__p.__ptr_, __nl, __nl);
@@ -1311,7 +1398,7 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::insert(const_iterator __
 #  endif // _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::pop_front() {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::pop_front() {
   _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::pop_front() called with empty list");
   __base_pointer __n = __base::__end_.__next_;
   __base::__unlink_nodes(__n, __n);
@@ -1320,7 +1407,7 @@ void list<_Tp, _Alloc>::pop_front() {
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::pop_back() {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::pop_back() {
   _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::pop_back() called on an empty list");
   __base_pointer __n = __base::__end_.__prev_;
   __base::__unlink_nodes(__n, __n);
@@ -1329,7 +1416,7 @@ void list<_Tp, _Alloc>::pop_back() {
 }
 
 template <class _Tp, class _Alloc>
-typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::erase(const_iterator __p) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::erase(const_iterator __p) {
   _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__p != end(), "list::erase(iterator) called with a non-dereferenceable iterator");
   __base_pointer __n = __p.__ptr_;
   __base_pointer __r = __n->__next_;
@@ -1340,7 +1427,8 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::erase(const_iterator __p
 }
 
 template <class _Tp, class _Alloc>
-typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::erase(const_iterator __f, const_iterator __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::iterator
+list<_Tp, _Alloc>::erase(const_iterator __f, const_iterator __l) {
   if (__f != __l) {
     __base::__unlink_nodes(__f.__ptr_, __l.__ptr_->__prev_);
     while (__f != __l) {
@@ -1354,7 +1442,7 @@ typename list<_Tp, _Alloc>::iterator list<_Tp, _Alloc>::erase(const_iterator __f
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::resize(size_type __n) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::resize(size_type __n) {
   if (__n < this->__size_)
     erase(__iterator(__n), end());
   else if (__n > this->__size_) {
@@ -1389,7 +1477,7 @@ void list<_Tp, _Alloc>::resize(size_type __n) {
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::resize(size_type __n, const value_type& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::resize(size_type __n, const value_type& __x) {
   if (__n < this->__size_)
     erase(__iterator(__n), end());
   else if (__n > this->__size_) {
@@ -1425,7 +1513,7 @@ void list<_Tp, _Alloc>::resize(size_type __n, const value_type& __x) {
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c) {
   _LIBCPP_ASSERT_VALID_INPUT_RANGE(
       this != std::addressof(__c), "list::splice(iterator, list) called with this == &list");
   if (!__c.empty()) {
@@ -1439,7 +1527,7 @@ void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c) {
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c, const_iterator __i) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c, const_iterator __i) {
   if (__p.__ptr_ != __i.__ptr_ && __p.__ptr_ != __i.__ptr_->__next_) {
     __base_pointer __f = __i.__ptr_;
     __base::__unlink_nodes(__f, __f);
@@ -1450,7 +1538,8 @@ void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c, const_iterator __i
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c, const_iterator __f, const_iterator __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void
+list<_Tp, _Alloc>::splice(const_iterator __p, list& __c, const_iterator __f, const_iterator __l) {
   if (__f != __l) {
     __base_pointer __first = __f.__ptr_;
     --__l;
@@ -1466,7 +1555,8 @@ void list<_Tp, _Alloc>::splice(const_iterator __p, list& __c, const_iterator __f
 }
 
 template <class _Tp, class _Alloc>
-typename list<_Tp, _Alloc>::__remove_return_type list<_Tp, _Alloc>::remove(const value_type& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::__remove_return_type
+list<_Tp, _Alloc>::remove(const value_type& __x) {
   list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   for (const_iterator __i = begin(), __e = end(); __i != __e;) {
     if (*__i == __x) {
@@ -1486,7 +1576,8 @@ typename list<_Tp, _Alloc>::__remove_return_type list<_Tp, _Alloc>::remove(const
 
 template <class _Tp, class _Alloc>
 template <class _Pred>
-typename list<_Tp, _Alloc>::__remove_return_type list<_Tp, _Alloc>::remove_if(_Pred __pred) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::__remove_return_type
+list<_Tp, _Alloc>::remove_if(_Pred __pred) {
   list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   for (iterator __i = begin(), __e = end(); __i != __e;) {
     if (__pred(*__i)) {
@@ -1506,7 +1597,8 @@ typename list<_Tp, _Alloc>::__remove_return_type list<_Tp, _Alloc>::remove_if(_P
 
 template <class _Tp, class _Alloc>
 template <class _BinaryPred>
-typename list<_Tp, _Alloc>::__remove_return_type list<_Tp, _Alloc>::unique(_BinaryPred __binary_pred) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::__remove_return_type
+list<_Tp, _Alloc>::unique(_BinaryPred __binary_pred) {
   list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   for (iterator __i = begin(), __e = end(); __i != __e;) {
     iterator __j = std::next(__i);
@@ -1522,13 +1614,13 @@ typename list<_Tp, _Alloc>::__remove_return_type list<_Tp, _Alloc>::unique(_Bina
 }
 
 template <class _Tp, class _Alloc>
-inline void list<_Tp, _Alloc>::merge(list& __c) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void list<_Tp, _Alloc>::merge(list& __c) {
   merge(__c, __less<>());
 }
 
 template <class _Tp, class _Alloc>
 template <class _Comp>
-void list<_Tp, _Alloc>::merge(list& __c, _Comp __comp) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::merge(list& __c, _Comp __comp) {
   if (this != std::addressof(__c)) {
     iterator __f1 = begin();
     iterator __e1 = end();
@@ -1557,19 +1649,19 @@ void list<_Tp, _Alloc>::merge(list& __c, _Comp __comp) {
 }
 
 template <class _Tp, class _Alloc>
-inline void list<_Tp, _Alloc>::sort() {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void list<_Tp, _Alloc>::sort() {
   sort(__less<>());
 }
 
 template <class _Tp, class _Alloc>
 template <class _Comp>
-inline void list<_Tp, _Alloc>::sort(_Comp __comp) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void list<_Tp, _Alloc>::sort(_Comp __comp) {
   __sort(begin(), end(), this->__size_, __comp);
 }
 
 template <class _Tp, class _Alloc>
 template <class _Comp>
-typename list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename list<_Tp, _Alloc>::iterator
 list<_Tp, _Alloc>::__sort(iterator __f1, iterator __e2, size_type __n, _Comp& __comp) {
   switch (__n) {
   case 0:
@@ -1623,7 +1715,7 @@ list<_Tp, _Alloc>::__sort(iterator __f1, iterator __e2, size_type __n, _Comp& __
 }
 
 template <class _Tp, class _Alloc>
-void list<_Tp, _Alloc>::reverse() _NOEXCEPT {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void list<_Tp, _Alloc>::reverse() _NOEXCEPT {
   if (this->__size_ > 1) {
     iterator __e = end();
     for (iterator __i = begin(); __i.__ptr_ != __e.__ptr_;) {
@@ -1635,46 +1727,52 @@ void list<_Tp, _Alloc>::reverse() _NOEXCEPT {
 }
 
 template <class _Tp, class _Alloc>
-bool list<_Tp, _Alloc>::__invariants() const {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 bool list<_Tp, _Alloc>::__invariants() const {
   return size() == std::distance(begin(), end());
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool operator==(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
+operator==(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
   return __x.size() == __y.size() && std::equal(__x.begin(), __x.end(), __y.begin());
 }
 
 #  if _LIBCPP_STD_VER <= 17
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool operator<(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
+operator<(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
   return std::lexicographical_compare(__x.begin(), __x.end(), __y.begin(), __y.end());
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
+operator!=(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
   return !(__x == __y);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool operator>(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
+operator>(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
   return __y < __x;
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
+operator>=(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
   return !(__x < __y);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
+operator<=(const list<_Tp, _Alloc>& __x, const list<_Tp, _Alloc>& __y) {
   return !(__y < __x);
 }
 
 #  else // _LIBCPP_STD_VER <= 17
 
 template <class _Tp, class _Allocator>
-_LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp>
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp>
 operator<=>(const list<_Tp, _Allocator>& __x, const list<_Tp, _Allocator>& __y) {
   return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
 }
@@ -1682,20 +1780,20 @@ operator<=>(const list<_Tp, _Allocator>& __x, const list<_Tp, _Allocator>& __y)
 #  endif // _LIBCPP_STD_VER <= 17
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI void swap(list<_Tp, _Alloc>& __x, list<_Tp, _Alloc>& __y)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void swap(list<_Tp, _Alloc>& __x, list<_Tp, _Alloc>& __y)
     _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) {
   __x.swap(__y);
 }
 
 #  if _LIBCPP_STD_VER >= 20
 template <class _Tp, class _Allocator, class _Predicate>
-inline _LIBCPP_HIDE_FROM_ABI typename list<_Tp, _Allocator>::size_type
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename list<_Tp, _Allocator>::size_type
 erase_if(list<_Tp, _Allocator>& __c, _Predicate __pred) {
   return __c.remove_if(__pred);
 }
 
 template <class _Tp, class _Allocator, class _Up>
-inline _LIBCPP_HIDE_FROM_ABI typename list<_Tp, _Allocator>::size_type
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename list<_Tp, _Allocator>::size_type
 erase(list<_Tp, _Allocator>& __c, const _Up& __v) {
   return std::erase_if(__c, [&](const auto& __elem) -> bool { return __elem == __v; });
 }
diff --git a/libcxx/include/version b/libcxx/include/version
index 87c4ede9a7e5..7154cab92335 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -71,6 +71,7 @@ __cpp_lib_constexpr_dynamic_alloc                       201907L <memory>
 __cpp_lib_constexpr_forward_list                        202502L <forward_list>
 __cpp_lib_constexpr_functional                          201907L <functional>
 __cpp_lib_constexpr_iterator                            201811L <iterator>
+__cpp_lib_constexpr_list                                202502L <list>
 __cpp_lib_constexpr_memory                              202202L <memory>
                                                         201811L // C++20
 __cpp_lib_constexpr_new                                 202406L <new>
@@ -545,6 +546,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # undef  __cpp_lib_constexpr_algorithms
 # define __cpp_lib_constexpr_algorithms                 202306L
 # define __cpp_lib_constexpr_forward_list               202502L
+# define __cpp_lib_constexpr_list                       202502L
 # if !defined(_LIBCPP_ABI_VCRUNTIME)
 #   define __cpp_lib_constexpr_new                      202406L
 # endif
diff --git a/libcxx/test/std/containers/sequences/list/compare.pass.cpp b/libcxx/test/std/containers/sequences/list/compare.pass.cpp
index ce00f57733bf..9705fd916198 100644
--- a/libcxx/test/std/containers/sequences/list/compare.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/compare.pass.cpp
@@ -10,34 +10,34 @@
 
 // template< class T, class Alloc >
 // bool operator==( const std::list<T,Alloc>& lhs,
-//                  const std::list<T,Alloc>& rhs );
+//                  const std::list<T,Alloc>& rhs ); // constexpr since C++26
 
 // template< class T, class Alloc >
 // bool operator!=( const std::list<T,Alloc>& lhs,
-//                  const std::list<T,Alloc>& rhs );
+//                  const std::list<T,Alloc>& rhs ); // constexpr since C++26
 
 // template< class T, class Alloc >
 // bool operator<( const std::list<T,Alloc>& lhs,
-//                 const std::list<T,Alloc>& rhs );
+//                 const std::list<T,Alloc>& rhs ); // constexpr since C++26
 
 // template< class T, class Alloc >
 // bool operator<=( const std::list<T,Alloc>& lhs,
-//                  const std::list<T,Alloc>& rhs );
+//                  const std::list<T,Alloc>& rhs ); // constexpr since C++26
 
 // template< class T, class Alloc >
 // bool operator>( const std::list<T,Alloc>& lhs,
-//                 const std::list<T,Alloc>& rhs );
+//                 const std::list<T,Alloc>& rhs ); // constexpr since C++26
 
 // template< class T, class Alloc >
 // bool operator>=( const std::list<T,Alloc>& lhs,
-//                  const std::list<T,Alloc>& rhs );
+//                  const std::list<T,Alloc>& rhs ); // constexpr since C++26
 
 #include <list>
 #include <cassert>
 
 #include "test_comparisons.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     const std::list<int> l1, l2;
     assert(testComparisons(l1, l2, true, false));
@@ -113,5 +113,15 @@ int main(int, char**) {
     const std::list<LessAndEqComp> l2(items2, items2 + 2);
     assert(testComparisons(l1, l2, false, false));
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/compare.three_way.pass.cpp b/libcxx/test/std/containers/sequences/list/compare.three_way.pass.cpp
index 059fba3c2626..7a23a653c0aa 100644
--- a/libcxx/test/std/containers/sequences/list/compare.three_way.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/compare.three_way.pass.cpp
@@ -11,7 +11,7 @@
 
 // template <class T, class Allocator> constexpr
 //   synth-three-way-result<T>
-//     operator<=>(const list<T, Allocator>& x, const list<T, Allocator>& y);
+//     operator<=>(const list<T, Allocator>& x, const list<T, Allocator>& y); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -20,6 +20,8 @@
 
 int main(int, char**) {
   assert(test_sequence_container_spaceship<std::list>());
-  // `std::list` is not constexpr, so no `static_assert` test here.
+#if TEST_STD_VER >= 26
+  static_assert(test_sequence_container_spaceship<std::list>());
+#endif
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/get_allocator.pass.cpp b/libcxx/test/std/containers/sequences/list/get_allocator.pass.cpp
index f1002f2ca811..9d724673d31e 100644
--- a/libcxx/test/std/containers/sequences/list/get_allocator.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/get_allocator.pass.cpp
@@ -10,7 +10,7 @@
 
 // class list
 
-// allocator_type get_allocator() const
+// allocator_type get_allocator() const // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::allocator<int> alloc;
     const std::list<int> l(alloc);
@@ -30,5 +30,14 @@ int main(int, char**) {
     assert(l.get_allocator() == alloc);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/incomplete_type.pass.cpp b/libcxx/test/std/containers/sequences/list/incomplete_type.pass.cpp
index 1802e53ecf38..ac8a76097d0a 100644
--- a/libcxx/test/std/containers/sequences/list/incomplete_type.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/incomplete_type.pass.cpp
@@ -12,6 +12,7 @@
 // type.
 
 #include <list>
+#include <cassert>
 
 #include "test_macros.h"
 
@@ -23,8 +24,18 @@ struct A {
   std::list<A>::const_reverse_iterator crit;
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   A a;
+  (void)a;
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/iterators.pass.cpp b/libcxx/test/std/containers/sequences/list/iterators.pass.cpp
index deaae31f2d27..b41a1899f2ff 100644
--- a/libcxx/test/std/containers/sequences/list/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/iterators.pass.cpp
@@ -8,12 +8,12 @@
 
 // <list>
 
-// iterator       begin();
-// iterator       end();
-// const_iterator begin()  const;
-// const_iterator end()    const;
-// const_iterator cbegin() const;
-// const_iterator cend()   const;
+// iterator       begin();        // constexpr since C++26
+// iterator       end();          // constexpr since C++26
+// const_iterator begin()  const; // constexpr since C++26
+// const_iterator end()    const; // constexpr since C++26
+// const_iterator cbegin() const; // constexpr since C++26
+// const_iterator cend()   const; // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -27,7 +27,7 @@ struct A {
   int second;
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::list<T> C;
@@ -74,6 +74,8 @@ int main(int, char**) {
     typedef std::list<T> C;
     C::iterator i;
     C::const_iterator j;
+    (void)i;
+    (void)j;
   }
 #if TEST_STD_VER >= 11
   {
@@ -122,6 +124,8 @@ int main(int, char**) {
     typedef std::list<T, min_allocator<T>> C;
     C::iterator i;
     C::const_iterator j;
+    (void)i;
+    (void)j;
   }
   {
     typedef A T;
@@ -150,5 +154,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.capacity/empty.pass.cpp b/libcxx/test/std/containers/sequences/list/list.capacity/empty.pass.cpp
index 50ca23ff9c56..f368d8e700bb 100644
--- a/libcxx/test/std/containers/sequences/list/list.capacity/empty.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.capacity/empty.pass.cpp
@@ -10,7 +10,7 @@
 
 // class list
 
-// bool empty() const noexcept;
+// bool empty() const noexcept; // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef std::list<int> C;
     C c;
@@ -42,5 +42,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.capacity/max_size.pass.cpp b/libcxx/test/std/containers/sequences/list/list.capacity/max_size.pass.cpp
index 74c2ccfb1442..1f956b33e482 100644
--- a/libcxx/test/std/containers/sequences/list/list.capacity/max_size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.capacity/max_size.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// size_type max_size() const noexcept
+// size_type max_size() const noexcept // constexpr since C++26
 
 #include <cassert>
 #include <limits>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef limited_allocator<int, 10> A;
     typedef std::list<int, A> C;
@@ -42,5 +42,14 @@ int main(int, char**) {
     assert(c.max_size() <= alloc_max_size(c.get_allocator()));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.capacity/resize_size.pass.cpp b/libcxx/test/std/containers/sequences/list/list.capacity/resize_size.pass.cpp
index 754d931646cc..f694d9ab2650 100644
--- a/libcxx/test/std/containers/sequences/list/list.capacity/resize_size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.capacity/resize_size.pass.cpp
@@ -8,15 +8,16 @@
 
 // <list>
 
-// void resize(size_type sz);
+// void resize(size_type sz); // constexpr since C++26
 
 #include <list>
 #include <cassert>
+
 #include "test_macros.h"
 #include "DefaultOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> l(5, 2);
     l.resize(2);
@@ -33,17 +34,31 @@ int main(int, char**) {
     assert(l.back() == 0);
   }
 #if TEST_STD_VER >= 11
-  {
-    std::list<DefaultOnly> l(10);
-    l.resize(5);
-    assert(l.size() == 5);
-    assert(std::distance(l.begin(), l.end()) == 5);
-  }
-  {
-    std::list<DefaultOnly> l(10);
-    l.resize(20);
-    assert(l.size() == 20);
-    assert(std::distance(l.begin(), l.end()) == 20);
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    {
+      std::list<DefaultOnly> l(10);
+      l.resize(5);
+      assert(l.size() == 5);
+      assert(std::distance(l.begin(), l.end()) == 5);
+    }
+    {
+      std::list<DefaultOnly> l(10);
+      l.resize(20);
+      assert(l.size() == 20);
+      assert(std::distance(l.begin(), l.end()) == 20);
+    }
+    {
+      std::list<DefaultOnly, min_allocator<DefaultOnly>> l(10);
+      l.resize(5);
+      assert(l.size() == 5);
+      assert(std::distance(l.begin(), l.end()) == 5);
+    }
+    {
+      std::list<DefaultOnly, min_allocator<DefaultOnly>> l(10);
+      l.resize(20);
+      assert(l.size() == 20);
+      assert(std::distance(l.begin(), l.end()) == 20);
+    }
   }
   {
     std::list<int, min_allocator<int>> l(5, 2);
@@ -60,18 +75,15 @@ int main(int, char**) {
     assert(l.front() == 2);
     assert(l.back() == 0);
   }
-  {
-    std::list<DefaultOnly, min_allocator<DefaultOnly>> l(10);
-    l.resize(5);
-    assert(l.size() == 5);
-    assert(std::distance(l.begin(), l.end()) == 5);
-  }
-  {
-    std::list<DefaultOnly, min_allocator<DefaultOnly>> l(10);
-    l.resize(20);
-    assert(l.size() == 20);
-    assert(std::distance(l.begin(), l.end()) == 20);
-  }
+#endif
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
 #endif
 
   return 0;
diff --git a/libcxx/test/std/containers/sequences/list/list.capacity/resize_size_value.pass.cpp b/libcxx/test/std/containers/sequences/list/list.capacity/resize_size_value.pass.cpp
index 95fccddeca54..a93ec224bd6d 100644
--- a/libcxx/test/std/containers/sequences/list/list.capacity/resize_size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.capacity/resize_size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void resize(size_type sz, const value_type& x);
+// void resize(size_type sz, const value_type& x); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "DefaultOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<double> l(5, 2);
     l.resize(2, 3.5);
@@ -50,5 +50,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.capacity/size.pass.cpp b/libcxx/test/std/containers/sequences/list/list.capacity/size.pass.cpp
index 930331205a9a..8aecfcaea027 100644
--- a/libcxx/test/std/containers/sequences/list/list.capacity/size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.capacity/size.pass.cpp
@@ -10,7 +10,7 @@
 
 // class list
 
-// size_type size() const noexcept;
+// size_type size() const noexcept; // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef std::list<int> C;
     C c;
@@ -58,5 +58,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/assign_copy.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/assign_copy.pass.cpp
index ca468d870998..912975d55e1d 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/assign_copy.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/assign_copy.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// list& operator=(const list& c);
+// list& operator=(const list& c); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int, test_allocator<int> > l(3, 2, test_allocator<int>(5));
     std::list<int, test_allocator<int> > l2(l, test_allocator<int>(3));
@@ -41,5 +41,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/assign_initializer_list.pass.cpp
index d4c1120df622..07b25f189a11 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/assign_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/assign_initializer_list.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// void assign(initializer_list<value_type> il);
+// void assign(initializer_list<value_type> il); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> d;
     d.assign({3, 4, 5, 6});
@@ -40,5 +40,14 @@ int main(int, char**) {
     assert(*i++ == 6);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/assign_move.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/assign_move.pass.cpp
index 87faaaac2b21..aa199b05ed45 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/assign_move.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/assign_move.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// list& operator=(list&& c);
+// list& operator=(list&& c); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<MoveOnly, test_allocator<MoveOnly> > l(test_allocator<MoveOnly>(5));
     std::list<MoveOnly, test_allocator<MoveOnly> > lo(test_allocator<MoveOnly>(5));
@@ -79,5 +79,14 @@ int main(int, char**) {
     assert(it == l2.begin()); // Iterators remain valid
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/copy.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/copy.pass.cpp
index de52da0fefab..a3e510d4d6eb 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/copy.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/copy.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// list(const list& c);
+// list(const list& c); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> l(3, 2);
     std::list<int> l2 = l;
@@ -50,5 +50,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/copy_alloc.pass.cpp
index 0d6c6f431f09..5da17a9c9b59 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/copy_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/copy_alloc.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// list(const list& c, const allocator_type& a);
+// list(const list& c, const allocator_type& a); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int, test_allocator<int> > l(3, 2, test_allocator<int>(5));
     std::list<int, test_allocator<int> > l2(l, test_allocator<int>(3));
@@ -39,5 +39,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/default.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/default.pass.cpp
index 0cfd8f1e9c59..1256433659c6 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/default.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/default.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// explicit list(const Alloc& = Alloc());
+// explicit list(const Alloc& = Alloc()); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "DefaultOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> l;
     assert(l.size() == 0);
@@ -65,5 +65,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/default_stack_alloc.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/default_stack_alloc.pass.cpp
index e31a58d7b9a5..3a78d0e0e0d5 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/default_stack_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/default_stack_alloc.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// explicit list(const Alloc& = Alloc());
+// explicit list(const Alloc& = Alloc()); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> l;
     assert(l.size() == 0);
@@ -45,5 +45,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/from_range.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/from_range.pass.cpp
index cc5ed5729b57..311c72d815d1 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/from_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/from_range.pass.cpp
@@ -9,14 +9,15 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23
+//   list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23; constexpr since C++26
 
 #include <list>
+#include <type_traits>
 
 #include "../../from_range_sequence_containers.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   for_all_iterators_and_allocators<int>([]<class Iter, class Sent, class Alloc>() {
     test_sequence_container<std::list, int, Iter, Sent, Alloc>([](const auto&) {
       // No additional validation to do.
@@ -26,8 +27,19 @@ int main(int, char**) {
 
   static_assert(test_constraints<std::list, int, double>());
 
-  test_exception_safety_throwing_copy<std::list>();
-  test_exception_safety_throwing_allocator<std::list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_exception_safety_throwing_copy<std::list>();
+    test_exception_safety_throwing_allocator<std::list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/initializer_list.pass.cpp
index 3ba90d1337e9..9e3a71ed3bd1 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/initializer_list.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// list(initializer_list<value_type> il);
+// list(initializer_list<value_type> il); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> d = {3, 4, 5, 6};
     assert(d.size() == 4);
@@ -38,5 +38,14 @@ int main(int, char**) {
     assert(*i++ == 6);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/initializer_list_alloc.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/initializer_list_alloc.pass.cpp
index e4779eb5a640..1b6b1e19c6eb 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/initializer_list_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/initializer_list_alloc.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// list(initializer_list<value_type> il, const Allocator& a = allocator_type());
+// list(initializer_list<value_type> il, const Allocator& a = allocator_type()); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int, test_allocator<int>> d({3, 4, 5, 6}, test_allocator<int>(3));
     assert(d.get_allocator() == test_allocator<int>(3));
@@ -41,5 +41,14 @@ int main(int, char**) {
     assert(*i++ == 6);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/input_iterator.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/input_iterator.pass.cpp
index c99069f92f51..d92307283098 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/input_iterator.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/input_iterator.pass.cpp
@@ -9,10 +9,11 @@
 // <list>
 
 // template <class InputIterator>
-//   list(InputIterator first, InputIterator last, const Allocator& = Allocator());
+//   list(InputIterator first, InputIterator last, const Allocator& = Allocator()); // constexpr since C++26
 
 #include <list>
 #include <cassert>
+
 #include "test_macros.h"
 #include "test_iterators.h"
 #include "test_allocator.h"
@@ -22,7 +23,7 @@
 #  include "container_test_types.h"
 #endif
 
-void basic_test() {
+TEST_CONSTEXPR_CXX26 void basic_test() {
   {
     int a[] = {0, 1, 2, 3};
     std::list<int> l(
@@ -81,7 +82,7 @@ void basic_test() {
 #endif
 }
 
-void test_emplacable_concept() {
+TEST_CONSTEXPR_CXX26 void test_emplacable_concept() {
 #if TEST_STD_VER >= 11
   int arr1[] = {42};
   int arr2[] = {1, 101, 42};
@@ -126,7 +127,7 @@ void test_emplacable_concept() {
 #endif
 }
 
-void test_emplacable_concept_with_alloc() {
+TEST_CONSTEXPR_CXX26 void test_emplacable_concept_with_alloc() {
 #if TEST_STD_VER >= 11
   int arr1[] = {42};
   int arr2[] = {1, 101, 42};
@@ -239,12 +240,24 @@ void test_ctor_under_alloc_with_alloc() {
 #endif
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   basic_test();
   test_emplacable_concept();
   test_emplacable_concept_with_alloc();
-  test_ctor_under_alloc();
-  test_ctor_under_alloc_with_alloc();
+
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_ctor_under_alloc();
+    test_ctor_under_alloc_with_alloc();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/move.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/move.pass.cpp
index 6703390f10b9..cae2886cf08b 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/move.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/move.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// list(list&& c);
+// list(list&& c); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<MoveOnly, test_allocator<MoveOnly> > l(test_allocator<MoveOnly>(5));
     std::list<MoveOnly, test_allocator<MoveOnly> > lo(test_allocator<MoveOnly>(5));
@@ -63,5 +63,14 @@ int main(int, char**) {
     assert(it == l2.begin()); // Iterators remain valid
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/move_alloc.pass.cpp
index f6a1f2c33a63..dee0282c9978 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/move_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/move_alloc.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// list(list&& c, const allocator_type& a);
+// list(list&& c, const allocator_type& a); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<MoveOnly, test_allocator<MoveOnly> > l(test_allocator<MoveOnly>(5));
     std::list<MoveOnly, test_allocator<MoveOnly> > lo(test_allocator<MoveOnly>(5));
@@ -69,5 +69,14 @@ int main(int, char**) {
     assert(l2.get_allocator() == min_allocator<MoveOnly>());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/op_equal_initializer_list.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/op_equal_initializer_list.pass.cpp
index a9ab30b82640..d7679931ee71 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/op_equal_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/op_equal_initializer_list.pass.cpp
@@ -10,14 +10,14 @@
 
 // <list>
 
-// list& operator=(initializer_list<value_type> il);
+// list& operator=(initializer_list<value_type> il); // constexpr since C++26
 
 #include <list>
 #include <cassert>
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> d;
     d = {3, 4, 5, 6};
@@ -39,5 +39,14 @@ int main(int, char**) {
     assert(*i++ == 6);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/size_type.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/size_type.pass.cpp
index 937a86a27e05..55371e8354a9 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/size_type.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/size_type.pass.cpp
@@ -8,18 +8,19 @@
 
 // <list>
 
-// explicit list(size_type n);
+// explicit list(size_type n); // constexpr since C++26
 
 #include <list>
 #include <cassert>
 #include <cstddef>
+
 #include "test_macros.h"
 #include "DefaultOnly.h"
 #include "test_allocator.h"
 #include "min_allocator.h"
 
 template <class T, class Allocator>
-void test3(unsigned n, Allocator const& alloc = Allocator()) {
+TEST_CONSTEXPR_CXX26 void test1(unsigned n, Allocator const& alloc = Allocator()) {
 #if TEST_STD_VER > 11
   typedef std::list<T, Allocator> C;
   {
@@ -34,7 +35,7 @@ void test3(unsigned n, Allocator const& alloc = Allocator()) {
 #endif
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> l(3);
     assert(l.size() == 3);
@@ -70,15 +71,10 @@ int main(int, char**) {
     assert(*i == 0);
     ++i;
     assert(*i == 0);
-    test3<int, min_allocator<int>>(3);
+    test1<int, min_allocator<int>>(3);
   }
 #endif
 #if TEST_STD_VER >= 11
-  {
-    std::list<DefaultOnly> l(3);
-    assert(l.size() == 3);
-    assert(std::distance(l.begin(), l.end()) == 3);
-  }
   {
     std::list<int, min_allocator<int>> l(3);
     assert(l.size() == 3);
@@ -90,12 +86,29 @@ int main(int, char**) {
     ++i;
     assert(*i == 0);
   }
-  {
-    std::list<DefaultOnly, min_allocator<DefaultOnly>> l(3);
-    assert(l.size() == 3);
-    assert(std::distance(l.begin(), l.end()) == 3);
+
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    {
+      std::list<DefaultOnly> l(3);
+      assert(l.size() == 3);
+      assert(std::distance(l.begin(), l.end()) == 3);
+    }
+    {
+      std::list<DefaultOnly, min_allocator<DefaultOnly>> l(3);
+      assert(l.size() == 3);
+      assert(std::distance(l.begin(), l.end()) == 3);
+    }
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/size_value_alloc.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/size_value_alloc.pass.cpp
index ff7982ce147d..42700c3ed658 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/size_value_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/size_value_alloc.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// list(size_type n, const T& value, const Allocator& = Allocator());
+// list(size_type n, const T& value, const Allocator& = Allocator()); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> l(3, 2);
     assert(l.size() == 3);
@@ -77,5 +77,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.erasure/erase.pass.cpp b/libcxx/test/std/containers/sequences/list/list.erasure/erase.pass.cpp
index 77f9f8956037..babd4b2758e6 100644
--- a/libcxx/test/std/containers/sequences/list/list.erasure/erase.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.erasure/erase.pass.cpp
@@ -11,7 +11,7 @@
 
 // template <class T, class Allocator, class U>
 //   typename list<T, Allocator>::size_type
-//   erase(list<T, Allocator>& c, const U& value);
+//   erase(list<T, Allocator>& c, const U& value); // constexpr since C++26
 
 #include <list>
 #include <optional>
@@ -21,14 +21,14 @@
 #include "min_allocator.h"
 
 template <class S, class U>
-void test0(S s, U val, S expected, std::size_t expected_erased_count) {
+TEST_CONSTEXPR_CXX26 void test0(S s, U val, S expected, std::size_t expected_erased_count) {
   ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase(s, val)));
   assert(expected_erased_count == std::erase(s, val));
   assert(s == expected);
 }
 
 template <class S>
-void test() {
+TEST_CONSTEXPR_CXX26 void test1() {
   test0(S(), 1, S(), 0);
 
   test0(S({1}), 1, S(), 1);
@@ -62,13 +62,22 @@ void test() {
   test0(S({1, 2, 1}), opt(3), S({1, 2, 1}), 0);
 }
 
-int main(int, char**) {
-  test<std::list<int>>();
-  test<std::list<int, min_allocator<int>>>();
-  test<std::list<int, test_allocator<int>>>();
+TEST_CONSTEXPR_CXX26 bool test() {
+  test1<std::list<int>>();
+  test1<std::list<int, min_allocator<int>>>();
+  test1<std::list<int, test_allocator<int>>>();
 
-  test<std::list<long>>();
-  test<std::list<double>>();
+  test1<std::list<long>>();
+  test1<std::list<double>>();
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/sequences/list/list.erasure/erase_if.pass.cpp
index 5352a2f454f8..e396330bc68c 100644
--- a/libcxx/test/std/containers/sequences/list/list.erasure/erase_if.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.erasure/erase_if.pass.cpp
@@ -11,7 +11,7 @@
 
 // template <class T, class Allocator, class Predicate>
 //   typename list<T, Allocator>::size_type
-//   erase_if(list<T, Allocator>& c, Predicate pred);
+//   erase_if(list<T, Allocator>& c, Predicate pred); // constexpr since C++26
 
 #include <list>
 
@@ -20,14 +20,14 @@
 #include "min_allocator.h"
 
 template <class S, class Pred>
-void test0(S s, Pred p, S expected, std::size_t expected_erased_count) {
+TEST_CONSTEXPR_CXX26 void test0(S s, Pred p, S expected, std::size_t expected_erased_count) {
   ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase_if(s, p)));
   assert(expected_erased_count == std::erase_if(s, p));
   assert(s == expected);
 }
 
 template <typename S>
-void test() {
+TEST_CONSTEXPR_CXX26 void test1() {
   auto is1   = [](auto v) { return v == 1; };
   auto is2   = [](auto v) { return v == 2; };
   auto is3   = [](auto v) { return v == 3; };
@@ -64,13 +64,22 @@ void test() {
   test0(S({1, 2, 3}), False, S({1, 2, 3}), 0);
 }
 
-int main(int, char**) {
-  test<std::list<int>>();
-  test<std::list<int, min_allocator<int>>>();
-  test<std::list<int, test_allocator<int>>>();
+TEST_CONSTEXPR_CXX26 bool test() {
+  test1<std::list<int>>();
+  test1<std::list<int, min_allocator<int>>>();
+  test1<std::list<int, test_allocator<int>>>();
 
-  test<std::list<long>>();
-  test<std::list<double>>();
+  test1<std::list<long>>();
+  test1<std::list<double>>();
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/append_range.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/append_range.pass.cpp
index 46a99cb54844..4b47a8738e52 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/append_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/append_range.pass.cpp
@@ -9,9 +9,10 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   constexpr void append_range(R&& rg); // C++23
+//   constexpr void append_range(R&& rg); // C++23; constexpr since C++26
 
 #include <list>
+#include <type_traits>
 
 #include "../../insert_range_sequence_containers.h"
 #include "test_macros.h"
@@ -21,7 +22,7 @@
 //   {empty/one-element/full} container);
 // - appending move-only elements;
 // - an exception is thrown when copying the elements or when allocating new elements.
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_append_range<std::list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -31,8 +32,19 @@ int main(int, char**) {
   });
   test_sequence_append_range_move_only<std::list>();
 
-  test_append_range_exception_safety_throwing_copy<std::list>();
-  test_append_range_exception_safety_throwing_allocator<std::list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_append_range_exception_safety_throwing_copy<std::list>();
+    test_append_range_exception_safety_throwing_allocator<std::list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/assign_range.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/assign_range.pass.cpp
index d745786b6815..83a12879a041 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/assign_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/assign_range.pass.cpp
@@ -9,9 +9,10 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   constexpr void assign_range(R&& rg); // C++23
+//   constexpr void assign_range(R&& rg); // C++23; constexpr since C++26
 
 #include <list>
+#include <type_traits>
 
 #include "../../insert_range_sequence_containers.h"
 #include "test_macros.h"
@@ -21,7 +22,7 @@
 //   {empty/one-element/full} container);
 // - assigning move-only elements;
 // - an exception is thrown when copying the elements or when allocating new elements.
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_assign_range<std::list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -31,8 +32,19 @@ int main(int, char**) {
   });
   test_sequence_assign_range_move_only<std::list>();
 
-  test_assign_range_exception_safety_throwing_copy<std::list>();
-  test_assign_range_exception_safety_throwing_allocator<std::list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_assign_range_exception_safety_throwing_copy<std::list>();
+    test_assign_range_exception_safety_throwing_allocator<std::list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/clear.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/clear.pass.cpp
index 5931fd62d037..0b38ae05bd68 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/clear.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/clear.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void clear() noexcept;
+// void clear() noexcept; // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a[] = {1, 2, 3};
     std::list<int> c(a, a + 3);
@@ -34,5 +34,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/emplace.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/emplace.pass.cpp
index 2f83aa0d317b..9bd7a151d20e 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/emplace.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/emplace.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// template <class... Args> void emplace(const_iterator p, Args&&... args);
+// template <class... Args> void emplace(const_iterator p, Args&&... args); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -26,13 +26,13 @@ class A {
   A& operator=(const A&);
 
 public:
-  A(int i, double d) : i_(i), d_(d) {}
+  TEST_CONSTEXPR_CXX20 A(int i, double d) : i_(i), d_(d) {}
 
-  int geti() const { return i_; }
-  double getd() const { return d_; }
+  TEST_CONSTEXPR int geti() const { return i_; }
+  TEST_CONSTEXPR double getd() const { return d_; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<A> c;
     c.emplace(c.cbegin(), 2, 3.5);
@@ -60,5 +60,14 @@ int main(int, char**) {
     assert(c.back().getd() == 4.5);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/emplace_back.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/emplace_back.pass.cpp
index 900f8b83d3e6..5f84c4c7c05a 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/emplace_back.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/emplace_back.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// template <class... Args> reference emplace_back(Args&&... args);
+// template <class... Args> reference emplace_back(Args&&... args); // constexpr since C++26
 // return type is 'reference' in C++17; 'void' before
 
 #include <list>
@@ -27,13 +27,13 @@ class A {
   A& operator=(const A&);
 
 public:
-  A(int i, double d) : i_(i), d_(d) {}
+  TEST_CONSTEXPR_CXX20 A(int i, double d) : i_(i), d_(d) {}
 
-  int geti() const { return i_; }
-  double getd() const { return d_; }
+  TEST_CONSTEXPR int geti() const { return i_; }
+  TEST_CONSTEXPR double getd() const { return d_; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<A> c;
 #if TEST_STD_VER > 14
@@ -83,5 +83,14 @@ int main(int, char**) {
     assert(c.back().getd() == 4.5);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/emplace_front.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/emplace_front.pass.cpp
index 665f5077bd42..95474b52dbd0 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/emplace_front.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/emplace_front.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// template <class... Args> reference emplace_front(Args&&... args);
+// template <class... Args> reference emplace_front(Args&&... args); // constexpr since C++26
 // return type is 'reference' in C++17; 'void' before
 
 #include <list>
@@ -27,13 +27,13 @@ class A {
   A& operator=(const A&);
 
 public:
-  A(int i, double d) : i_(i), d_(d) {}
+  TEST_CONSTEXPR_CXX20 A(int i, double d) : i_(i), d_(d) {}
 
-  int geti() const { return i_; }
-  double getd() const { return d_; }
+  TEST_CONSTEXPR int geti() const { return i_; }
+  TEST_CONSTEXPR double getd() const { return d_; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<A> c;
 #if TEST_STD_VER > 14
@@ -84,5 +84,14 @@ int main(int, char**) {
     assert(c.back().getd() == 3.5);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/erase_iter.pass.cpp
index ba139b4367d7..79dae11a8263 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/erase_iter.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/erase_iter.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// iterator erase(const_iterator position);
+// iterator erase(const_iterator position); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {1, 2, 3};
     std::list<int> l1(a1, a1 + 3);
@@ -62,5 +62,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/erase_iter_iter.pass.cpp
index cc8d537032d0..fa6f8139ff75 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/erase_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/erase_iter_iter.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// iterator erase(const_iterator first, const_iterator last);
+// iterator erase(const_iterator first, const_iterator last); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   int a1[] = {1, 2, 3};
   {
     std::list<int> l1(a1, a1 + 3);
@@ -81,5 +81,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_initializer_list.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_initializer_list.pass.cpp
index 8bd01c940d95..4475d27a7e73 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_initializer_list.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// iterator insert(const_iterator p, initializer_list<value_type> il);
+// iterator insert(const_iterator p, initializer_list<value_type> il); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> d(10, 1);
     std::list<int>::iterator i = d.insert(std::next(d.cbegin(), 2), {3, 4, 5, 6});
@@ -62,5 +62,14 @@ int main(int, char**) {
     assert(*i++ == 1);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_iter_iter.pass.cpp
index bab125ca6209..27db218511aa 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_iter_iter.pass.cpp
@@ -9,7 +9,7 @@
 // <list>
 
 // template <InputIterator Iter>
-//   iterator insert(const_iterator position, Iter first, Iter last);
+//   iterator insert(const_iterator position, Iter first, Iter last); // constexpr since C++26
 
 #include <list>
 #include <cstdlib>
@@ -21,7 +21,7 @@
 #include "count_new.h"
 
 template <class List>
-void test() {
+TEST_CONSTEXPR_CXX26 void test() {
   int a1[] = {1, 2, 3};
   List l1;
   typename List::iterator i = l1.insert(l1.begin(), a1, a1 + 3);
@@ -53,36 +53,47 @@ void test() {
   assert(*i == 3);
 
 #if !defined(TEST_HAS_NO_EXCEPTIONS) && !defined(DISABLE_NEW_COUNT)
-  globalMemCounter.throw_after = 2;
-  int save_count               = globalMemCounter.outstanding_new;
-  try {
-    i = l1.insert(i, a2, a2 + 3);
-    assert(false);
-  } catch (...) {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    globalMemCounter.throw_after = 2;
+    int save_count               = globalMemCounter.outstanding_new;
+    try {
+      i = l1.insert(i, a2, a2 + 3);
+      assert(false);
+    } catch (...) {
+    }
+    assert(globalMemCounter.checkOutstandingNewEq(save_count));
+    assert(l1.size() == 6);
+    assert(std::distance(l1.begin(), l1.end()) == 6);
+    i = l1.begin();
+    assert(*i == 1);
+    ++i;
+    assert(*i == 2);
+    ++i;
+    assert(*i == 4);
+    ++i;
+    assert(*i == 5);
+    ++i;
+    assert(*i == 6);
+    ++i;
+    assert(*i == 3);
   }
-  assert(globalMemCounter.checkOutstandingNewEq(save_count));
-  assert(l1.size() == 6);
-  assert(std::distance(l1.begin(), l1.end()) == 6);
-  i = l1.begin();
-  assert(*i == 1);
-  ++i;
-  assert(*i == 2);
-  ++i;
-  assert(*i == 4);
-  ++i;
-  assert(*i == 5);
-  ++i;
-  assert(*i == 6);
-  ++i;
-  assert(*i == 3);
 #endif
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   test<std::list<int> >();
 #if TEST_STD_VER >= 11
   test<std::list<int, min_allocator<int>>>();
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_rvalue.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_rvalue.pass.cpp
index 8bb513208eb7..7d7b2f158a60 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_rvalue.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_rvalue.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// iterator insert(const_iterator position, value_type&& x);
+// iterator insert(const_iterator position, value_type&& x); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<MoveOnly> l1;
     l1.insert(l1.cend(), MoveOnly(1));
@@ -41,5 +41,14 @@ int main(int, char**) {
     assert(l1.back() == MoveOnly(1));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_size_value.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_size_value.pass.cpp
index 32ee7a73406d..1056d997f9d8 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// iterator insert(const_iterator position, size_type n, const value_type& x);
+// iterator insert(const_iterator position, size_type n, const value_type& x); // constexpr since C++26
 
 // UNSUPPORTED: sanitizer-new-delete
 
@@ -21,7 +21,7 @@
 #include "test_macros.h"
 
 template <class List>
-void test() {
+TEST_CONSTEXPR_CXX26 void test() {
   int a1[] = {1, 2, 3};
   int a2[] = {1, 4, 4, 4, 4, 4, 2, 3};
   List l1(a1, a1 + 3);
@@ -29,23 +29,34 @@ void test() {
   assert(i == std::next(l1.begin()));
   assert(l1 == List(a2, a2 + 8));
 #ifndef TEST_HAS_NO_EXCEPTIONS
-  globalMemCounter.throw_after = 4;
-  int save_count               = globalMemCounter.outstanding_new;
-  try {
-    i = l1.insert(i, 5, 5);
-    assert(false);
-  } catch (...) {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    globalMemCounter.throw_after = 4;
+    int save_count               = globalMemCounter.outstanding_new;
+    try {
+      i = l1.insert(i, 5, 5);
+      assert(false);
+    } catch (...) {
+    }
+    assert(globalMemCounter.checkOutstandingNewEq(save_count));
+    assert(l1 == List(a2, a2 + 8));
   }
-  assert(globalMemCounter.checkOutstandingNewEq(save_count));
-  assert(l1 == List(a2, a2 + 8));
 #endif
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   test<std::list<int> >();
 #if TEST_STD_VER >= 11
   test<std::list<int, min_allocator<int>>>();
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_value.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_value.pass.cpp
index 129fe05cb39d..615bb5bb2b42 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// iterator insert(const_iterator position, const value_type& x);
+// iterator insert(const_iterator position, const value_type& x); // constexpr since C++26
 
 #include <list>
 #include <cstdlib>
@@ -19,7 +19,7 @@
 #include "count_new.h"
 
 template <class List>
-void test() {
+TEST_CONSTEXPR_CXX26 void test() {
   int a1[] = {1, 2, 3};
   int a2[] = {1, 4, 2, 3};
   List l1(a1, a1 + 3);
@@ -30,23 +30,34 @@ void test() {
   assert(l1 == List(a2, a2 + 4));
 
 #if !defined(TEST_HAS_NO_EXCEPTIONS) && !defined(DISABLE_NEW_COUNT)
-  globalMemCounter.throw_after = 0;
-  int save_count               = globalMemCounter.outstanding_new;
-  try {
-    i = l1.insert(i, 5);
-    assert(false);
-  } catch (...) {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    globalMemCounter.throw_after = 0;
+    int save_count               = globalMemCounter.outstanding_new;
+    try {
+      i = l1.insert(i, 5);
+      assert(false);
+    } catch (...) {
+    }
+    assert(globalMemCounter.checkOutstandingNewEq(save_count));
+    assert(l1 == List(a2, a2 + 4));
   }
-  assert(globalMemCounter.checkOutstandingNewEq(save_count));
-  assert(l1 == List(a2, a2 + 4));
 #endif
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   test<std::list<int> >();
 #if TEST_STD_VER >= 11
   test<std::list<int, min_allocator<int>>>();
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_range.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_range.pass.cpp
index eb3937eb8f9e..5908d40d0cc9 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/insert_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/insert_range.pass.cpp
@@ -6,12 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
+
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   constexpr iterator insert_range(const_iterator position, R&& rg); // C++23
+//   constexpr iterator insert_range(const_iterator position, R&& rg); // C++23; constexpr since C++26
 
 #include <list>
+#include <type_traits>
 
 #include "../../insert_range_sequence_containers.h"
 #include "test_macros.h"
@@ -21,7 +25,7 @@
 //   {empty/one-element/full} container at the {beginning/middle/end});
 // - inserting move-only elements;
 // - an exception is thrown when copying the elements or when allocating new elements.
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_insert_range<std::list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -31,8 +35,19 @@ int main(int, char**) {
   });
   test_sequence_insert_range_move_only<std::list>();
 
-  test_insert_range_exception_safety_throwing_copy<std::list>();
-  test_insert_range_exception_safety_throwing_allocator<std::list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_insert_range_exception_safety_throwing_copy<std::list>();
+    test_insert_range_exception_safety_throwing_allocator<std::list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/pop_back.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/pop_back.pass.cpp
index aaa225b14776..5bbac428d8d5 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/pop_back.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/pop_back.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void pop_back();
+// void pop_back(); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a[] = {1, 2, 3};
     std::list<int> c(a, a + 3);
@@ -40,5 +40,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/pop_front.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/pop_front.pass.cpp
index 33b8ff35c524..74b6a1cc319b 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/pop_front.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/pop_front.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void pop_front();
+// void pop_front(); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a[] = {1, 2, 3};
     std::list<int> c(a, a + 3);
@@ -40,5 +40,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/prepend_range.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/prepend_range.pass.cpp
index d5e4d4fabb76..41f7061c09d2 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/prepend_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/prepend_range.pass.cpp
@@ -9,9 +9,10 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   constexpr void prepend_range(R&& rg); // C++23
+//   constexpr void prepend_range(R&& rg); // C++23; constexpr since C++26
 
 #include <list>
+#include <type_traits>
 
 #include "../../insert_range_sequence_containers.h"
 #include "test_macros.h"
@@ -21,7 +22,7 @@
 //   {empty/one-element/full} container);
 // - prepending move-only elements;
 // - an exception is thrown when copying the elements or when allocating new elements.
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_prepend_range<std::list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -31,8 +32,19 @@ int main(int, char**) {
   });
   test_sequence_prepend_range_move_only<std::list>();
 
-  test_prepend_range_exception_safety_throwing_copy<std::list>();
-  test_prepend_range_exception_safety_throwing_allocator<std::list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_prepend_range_exception_safety_throwing_copy<std::list>();
+    test_prepend_range_exception_safety_throwing_allocator<std::list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/push_back.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/push_back.pass.cpp
index 582f4a200ac2..3ac9a60e7901 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/push_back.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/push_back.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void push_back(const value_type& x);
+// void push_back(const value_type& x); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> c;
     for (int i = 0; i < 5; ++i)
@@ -34,5 +34,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/push_back_rvalue.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/push_back_rvalue.pass.cpp
index 6a31d81d694f..764dd7da1832 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/push_back_rvalue.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/push_back_rvalue.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// void push_back(value_type&& x);
+// void push_back(value_type&& x); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<MoveOnly> l1;
     l1.push_back(MoveOnly(1));
@@ -41,5 +41,14 @@ int main(int, char**) {
     assert(l1.back() == MoveOnly(2));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/push_front.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/push_front.pass.cpp
index 3b5f74a217a2..7ec18e841822 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/push_front.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/push_front.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void push_front(const value_type& x);
+// void push_front(const value_type& x); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<int> c;
     for (int i = 0; i < 5; ++i)
@@ -34,5 +34,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.modifiers/push_front_rvalue.pass.cpp b/libcxx/test/std/containers/sequences/list/list.modifiers/push_front_rvalue.pass.cpp
index 0d41b8fd8553..930b6af5f243 100644
--- a/libcxx/test/std/containers/sequences/list/list.modifiers/push_front_rvalue.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.modifiers/push_front_rvalue.pass.cpp
@@ -10,7 +10,7 @@
 
 // <list>
 
-// void push_front(value_type&& x);
+// void push_front(value_type&& x); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::list<MoveOnly> l1;
     l1.push_front(MoveOnly(1));
@@ -41,5 +41,14 @@ int main(int, char**) {
     assert(l1.back() == MoveOnly(1));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/merge.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/merge.pass.cpp
index 7f82f65fd493..19ea940cb2a0 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/merge.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/merge.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void merge(list& x);
+// void merge(list& x); // constexpr since C++26
 // If (addressof(x) == this) does nothing; otherwise ...
 
 #include <list>
@@ -17,7 +17,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {1, 3, 7, 9, 10};
     int a2[] = {0, 2, 4, 5, 6, 8, 11};
@@ -49,5 +49,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/merge_comp.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/merge_comp.pass.cpp
index 13241909c6e5..974700926db6 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/merge_comp.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/merge_comp.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// template <class Compare> void merge(list& x, Compare comp);
+// template <class Compare> void merge(list& x, Compare comp); // constexpr since C++26
 // If (addressof(x) == this) does nothing; otherwise ...
 
 #include <list>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {10, 9, 7, 3, 1};
     int a2[] = {11, 8, 6, 5, 4, 2, 0};
@@ -49,5 +49,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/remove.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/remove.pass.cpp
index 238ea9b69ea2..9bf677b8745c 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/remove.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/remove.pass.cpp
@@ -9,7 +9,7 @@
 // <list>
 
 // void      remove(const value_type& value); // pre-c++20
-// size_type remove(const value_type& value); // c++20 and later
+// size_type remove(const value_type& value); // c++20 and later; constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -18,22 +18,22 @@
 #include "min_allocator.h"
 
 struct S {
-  S(int i) : i_(new int(i)) {}
-  S(const S& rhs) : i_(new int(*rhs.i_)) {}
-  S& operator=(const S& rhs) {
+  TEST_CONSTEXPR_CXX20 S(int i) : i_(new int(i)) {}
+  TEST_CONSTEXPR_CXX20 S(const S& rhs) : i_(new int(*rhs.i_)) {}
+  TEST_CONSTEXPR_CXX14 S& operator=(const S& rhs) {
     *i_ = *rhs.i_;
     return *this;
   }
-  ~S() {
+  TEST_CONSTEXPR_CXX20 ~S() {
     delete i_;
     i_ = NULL;
   }
-  bool operator==(const S& rhs) const { return *i_ == *rhs.i_; }
-  int get() const { return *i_; }
+  TEST_CONSTEXPR bool operator==(const S& rhs) const { return *i_ == *rhs.i_; }
+  TEST_CONSTEXPR int get() const { return *i_; }
   int* i_;
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {1, 2, 3, 4};
     int a2[] = {1, 2, 4};
@@ -101,5 +101,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/remove_if.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/remove_if.pass.cpp
index 510cb361142b..c7ee09530ed9 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/remove_if.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/remove_if.pass.cpp
@@ -9,7 +9,7 @@
 // <list>
 
 // template <class Pred> void      remove_if(Pred pred); // before C++20
-// template <class Pred> size_type remove_if(Pred pred); // c++20 and later
+// template <class Pred> size_type remove_if(Pred pred); // c++20 and later; constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -19,22 +19,22 @@
 #include "min_allocator.h"
 #include "counting_predicates.h"
 
-bool even(int i) { return i % 2 == 0; }
+TEST_CONSTEXPR bool even(int i) { return i % 2 == 0; }
 
-bool g(int i) { return i < 3; }
+TEST_CONSTEXPR bool g(int i) { return i < 3; }
 
 struct PredLWG526 {
-  PredLWG526(int i) : i_(i) {}
-  ~PredLWG526() { i_ = -32767; }
-  bool operator()(const PredLWG526& p) const { return p.i_ == i_; }
+  TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {}
+  TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; }
+  TEST_CONSTEXPR bool operator()(const PredLWG526& p) const { return p.i_ == i_; }
 
-  bool operator==(int i) const { return i == i_; }
+  TEST_CONSTEXPR bool operator==(int i) const { return i == i_; }
   int i_;
 };
 
 typedef unary_counting_predicate<bool (*)(int), int> Predicate;
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {1, 2, 3, 4};
     int a2[] = {3, 4};
@@ -92,5 +92,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/reverse.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/reverse.pass.cpp
index 5b91ad0224be..43e894f5e83a 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/reverse.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/reverse.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void reverse();
+// void reverse(); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
     int a2[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
@@ -34,5 +34,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/sort.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/sort.pass.cpp
index 892419f6ac96..34ead09110a3 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/sort.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/sort.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void sort();
+// void sort(); // constexpr since C++26
 
 #include <algorithm>
 #include <list>
@@ -58,7 +58,7 @@ void test_stable(int N) {
   }
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {4, 8, 1, 0, 5, 7, 2, 3, 6, 11, 10, 9};
     int a2[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
@@ -76,8 +76,19 @@ int main(int, char**) {
   }
 #endif
 
-  for (int i = 0; i < 40; ++i)
-    test_stable(i);
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    for (int i = 0; i < 40; ++i)
+      test_stable(i);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/sort_comp.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/sort_comp.pass.cpp
index 499702281991..a24f187f4b4e 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/sort_comp.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/sort_comp.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// template <class Compare> sort(Compare comp);
+// template <class Compare> sort(Compare comp); // constexpr since C++26
 
 #include <list>
 #include <functional>
@@ -76,7 +76,7 @@ void test_stable(int N) {
   }
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {4, 8, 1, 0, 5, 7, 2, 3, 6, 11, 10, 9};
     int a2[] = {11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
@@ -85,37 +85,48 @@ int main(int, char**) {
     assert(c1 == std::list<int>(a2, a2 + sizeof(a2) / sizeof(a2[0])));
   }
 
+  if (!TEST_IS_CONSTANT_EVALUATED) {
 //  Test with throwing comparison; make sure that nothing is lost.
 //  This is (sort of) LWG #2824
 #ifndef TEST_HAS_NO_EXCEPTIONS
-  {
-    int a1[]     = {4, 8, 1, 0, 5, 7, 2, 3, 6, 11, 10, 9};
-    const int sz = sizeof(a1) / sizeof(a1[0]);
-    for (int i = 0; i < 10; ++i) {
-      std::list<int> c1(a1, a1 + sz);
-      try {
-        throwingLess<int> comp(i);
-        c1.sort(std::cref(comp));
-      } catch (int) {
+    {
+      int a1[]     = {4, 8, 1, 0, 5, 7, 2, 3, 6, 11, 10, 9};
+      const int sz = sizeof(a1) / sizeof(a1[0]);
+      for (int i = 0; i < 10; ++i) {
+        std::list<int> c1(a1, a1 + sz);
+        try {
+          throwingLess<int> comp(i);
+          c1.sort(std::cref(comp));
+        } catch (int) {
+        }
+        assert((c1.size() == sz));
+        assert((std::is_permutation(c1.begin(), c1.end(), a1)));
       }
-      assert((c1.size() == sz));
-      assert((std::is_permutation(c1.begin(), c1.end(), a1)));
     }
-  }
 #endif
 
 #if TEST_STD_VER >= 11
-  {
-    int a1[] = {4, 8, 1, 0, 5, 7, 2, 3, 6, 11, 10, 9};
-    int a2[] = {11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
-    std::list<int, min_allocator<int>> c1(a1, a1 + sizeof(a1) / sizeof(a1[0]));
-    c1.sort(std::greater<int>());
-    assert((c1 == std::list<int, min_allocator<int>>(a2, a2 + sizeof(a2) / sizeof(a2[0]))));
-  }
+    {
+      int a1[] = {4, 8, 1, 0, 5, 7, 2, 3, 6, 11, 10, 9};
+      int a2[] = {11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+      std::list<int, min_allocator<int>> c1(a1, a1 + sizeof(a1) / sizeof(a1[0]));
+      c1.sort(std::greater<int>());
+      assert((c1 == std::list<int, min_allocator<int>>(a2, a2 + sizeof(a2) / sizeof(a2[0]))));
+    }
 #endif
 
-  for (int i = 0; i < 40; ++i)
-    test_stable(i);
+    for (int i = 0; i < 40; ++i)
+      test_stable(i);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list.pass.cpp
index 4b40876e3bb7..0f1cfefab34e 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void splice(const_iterator position, list& x);
+// void splice(const_iterator position, list& x); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   int a1[] = {1, 2, 3};
   int a2[] = {4, 5, 6};
   {
@@ -780,5 +780,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list_iter.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list_iter.pass.cpp
index db71fe17a06e..38dce58dc390 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list_iter.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list_iter.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void splice(const_iterator position, list<T,Allocator>& x, iterator i);
+// void splice(const_iterator position, list<T,Allocator>& x, iterator i); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   int a1[] = {1, 2, 3};
   int a2[] = {4, 5, 6};
   {
@@ -334,5 +334,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list_iter_iter.pass.cpp
index b77b6a26440d..8fca21c81c66 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/splice_pos_list_iter_iter.pass.cpp
@@ -8,7 +8,7 @@
 
 // <list>
 
-// void splice(const_iterator position, list& x, iterator first, iterator last);
+// void splice(const_iterator position, list& x, iterator first, iterator last); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   int a1[] = {1, 2, 3};
   int a2[] = {4, 5, 6};
   {
@@ -214,5 +214,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/unique.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/unique.pass.cpp
index c08e348218f9..c2fa54f42553 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/unique.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/unique.pass.cpp
@@ -9,7 +9,7 @@
 // <list>
 
 // void      unique(); // before C++20
-// size_type unique(); // C++20 and later
+// size_type unique(); // C++20 and later; constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {2, 1, 1, 4, 4, 4, 4, 3, 3};
     int a2[] = {2, 1, 4, 3};
@@ -46,5 +46,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.ops/unique_pred.pass.cpp b/libcxx/test/std/containers/sequences/list/list.ops/unique_pred.pass.cpp
index 1d3a8e0c426a..830e54a3288d 100644
--- a/libcxx/test/std/containers/sequences/list/list.ops/unique_pred.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.ops/unique_pred.pass.cpp
@@ -9,7 +9,7 @@
 // <list>
 
 // template <class BinaryPred> void      unique(BinaryPred pred); // before C++20
-// template <class BinaryPred> size_type unique(BinaryPred pred); // C++20 and later
+// template <class BinaryPred> size_type unique(BinaryPred pred); // C++20 and later; constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -18,18 +18,18 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-bool g(int x, int y) { return x == y; }
+TEST_CONSTEXPR bool g(int x, int y) { return x == y; }
 
 struct PredLWG526 {
-  PredLWG526(int i) : i_(i) {}
-  ~PredLWG526() { i_ = -32767; }
-  bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; }
+  TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {}
+  TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; }
+  TEST_CONSTEXPR bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; }
 
-  bool operator==(int i) const { return i == i_; }
+  TEST_CONSTEXPR bool operator==(int i) const { return i == i_; }
   int i_;
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {2, 1, 1, 4, 4, 4, 4, 3, 3};
     int a2[] = {2, 1, 4, 3};
@@ -75,5 +75,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.special/swap.pass.cpp b/libcxx/test/std/containers/sequences/list/list.special/swap.pass.cpp
index 1e9c71131d80..32efddb06920 100644
--- a/libcxx/test/std/containers/sequences/list/list.special/swap.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.special/swap.pass.cpp
@@ -9,7 +9,7 @@
 // <list>
 
 // template <class T, class Alloc>
-//   void swap(list<T,Alloc>& x, list<T,Alloc>& y);
+//   void swap(list<T,Alloc>& x, list<T,Alloc>& y); // constexpr since C++26
 
 #include <list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     int a1[] = {1, 3, 7, 9, 10};
     int a2[] = {0, 2, 4, 5, 6, 8, 11};
@@ -133,5 +133,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/list/list.special/swap_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/list/list.special/swap_noexcept.pass.cpp
index a4b1622a04be..037c7d07c4cb 100644
--- a/libcxx/test/std/containers/sequences/list/list.special/swap_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.special/swap_noexcept.pass.cpp
@@ -12,7 +12,7 @@
 
 // void swap(list& c)
 //     noexcept(!allocator_type::propagate_on_container_swap::value ||
-//              __is_nothrow_swappable<allocator_type>::value);
+//              __is_nothrow_swappable<allocator_type>::value); // constexpr since C++26
 //
 //  In C++17, the standard says that swap shall have:
 //     noexcept(allocator_traits<Allocator>::is_always_equal::value);
@@ -52,7 +52,7 @@ struct some_alloc2 {
   typedef std::true_type is_always_equal;
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef std::list<MoveOnly> C;
     static_assert(noexcept(swap(std::declval<C&>(), std::declval<C&>())), "");
@@ -84,5 +84,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp
index 9fd638087fce..d10c61c0e9cf 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp
@@ -24,6 +24,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -54,6 +58,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -87,6 +95,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++17"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -126,6 +138,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -171,6 +187,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++23"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should be defined in c++23"
 #  endif
@@ -219,6 +239,13 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++26"
 #  endif
 
+#  ifndef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_list != 202502L
+#    error "__cpp_lib_constexpr_list should have the value 202502L in c++26"
+#  endif
+
 #  ifndef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should be defined in c++26"
 #  endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index a13edacd1e46..e4fe9f994e2e 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -208,6 +208,10 @@
 #    error "__cpp_lib_constexpr_iterator should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_memory
 #    error "__cpp_lib_constexpr_memory should not be defined before c++20"
 #  endif
@@ -1100,6 +1104,10 @@
 #    error "__cpp_lib_constexpr_iterator should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_memory
 #    error "__cpp_lib_constexpr_memory should not be defined before c++20"
 #  endif
@@ -2094,6 +2102,10 @@
 #    error "__cpp_lib_constexpr_iterator should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_memory
 #    error "__cpp_lib_constexpr_memory should not be defined before c++20"
 #  endif
@@ -3334,6 +3346,10 @@
 #    error "__cpp_lib_constexpr_iterator should have the value 201811L in c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_memory
 #    error "__cpp_lib_constexpr_memory should be defined in c++20"
 #  endif
@@ -4790,6 +4806,10 @@
 #    error "__cpp_lib_constexpr_iterator should have the value 201811L in c++23"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_memory
 #    error "__cpp_lib_constexpr_memory should be defined in c++23"
 #  endif
@@ -6468,6 +6488,13 @@
 #    error "__cpp_lib_constexpr_iterator should have the value 201811L in c++26"
 #  endif
 
+#  ifndef __cpp_lib_constexpr_list
+#    error "__cpp_lib_constexpr_list should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_list != 202502L
+#    error "__cpp_lib_constexpr_list should have the value 202502L in c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_memory
 #    error "__cpp_lib_constexpr_memory should be defined in c++26"
 #  endif
diff --git a/libcxx/test/support/min_allocator.h b/libcxx/test/support/min_allocator.h
index 3b7d12af24ce..16775649f55c 100644
--- a/libcxx/test/support/min_allocator.h
+++ b/libcxx/test/support/min_allocator.h
@@ -22,384 +22,381 @@
 #include "test_macros.h"
 
 template <class T>
-class bare_allocator
-{
+class bare_allocator {
 public:
-    typedef T value_type;
+  typedef T value_type;
 
-    bare_allocator() TEST_NOEXCEPT {}
+  bare_allocator() TEST_NOEXCEPT {}
 
-    template <class U>
-    bare_allocator(bare_allocator<U>) TEST_NOEXCEPT {}
+  template <class U>
+  bare_allocator(bare_allocator<U>) TEST_NOEXCEPT {}
 
-    T* allocate(std::size_t n)
-    {
-        return static_cast<T*>(::operator new(n*sizeof(T)));
-    }
+  T* allocate(std::size_t n) { return static_cast<T*>(::operator new(n * sizeof(T))); }
 
-    void deallocate(T* p, std::size_t)
-    {
-        return ::operator delete(static_cast<void*>(p));
-    }
+  void deallocate(T* p, std::size_t) { return ::operator delete(static_cast<void*>(p)); }
 
-    friend bool operator==(bare_allocator, bare_allocator) {return true;}
-    friend bool operator!=(bare_allocator x, bare_allocator y) {return !(x == y);}
+  friend bool operator==(bare_allocator, bare_allocator) { return true; }
+  friend bool operator!=(bare_allocator x, bare_allocator y) { return !(x == y); }
 };
 
-
 template <class T>
-class no_default_allocator
-{
+class no_default_allocator {
 #if TEST_STD_VER >= 11
-    no_default_allocator() = delete;
+  no_default_allocator() = delete;
 #else
-    no_default_allocator();
+  no_default_allocator();
 #endif
-    struct construct_tag {};
-    explicit no_default_allocator(construct_tag) {}
+  struct construct_tag {};
+  TEST_CONSTEXPR_CXX20 explicit no_default_allocator(construct_tag) {}
 
 public:
-    static no_default_allocator create() {
-      construct_tag tag;
-      return no_default_allocator(tag);
-    }
+  TEST_CONSTEXPR_CXX20 static no_default_allocator create() {
+    construct_tag tag;
+    return no_default_allocator(tag);
+  }
 
 public:
-    typedef T value_type;
+  typedef T value_type;
 
-    template <class U>
-    no_default_allocator(no_default_allocator<U>) TEST_NOEXCEPT {}
+  template <class U>
+  TEST_CONSTEXPR_CXX20 no_default_allocator(no_default_allocator<U>) TEST_NOEXCEPT {}
 
-    T* allocate(std::size_t n)
-    {
-        return static_cast<T*>(::operator new(n*sizeof(T)));
-    }
+  TEST_CONSTEXPR_CXX20 T* allocate(std::size_t n) { return static_cast<T*>(std::allocator<T>().allocate(n)); }
 
-    void deallocate(T* p, std::size_t)
-    {
-        return ::operator delete(static_cast<void*>(p));
-    }
+  TEST_CONSTEXPR_CXX20 void deallocate(T* p, std::size_t n) { std::allocator<T>().deallocate(p, n); }
 
-    friend bool operator==(no_default_allocator, no_default_allocator) {return true;}
-    friend bool operator!=(no_default_allocator x, no_default_allocator y) {return !(x == y);}
+  friend TEST_CONSTEXPR bool operator==(no_default_allocator, no_default_allocator) { return true; }
+  friend TEST_CONSTEXPR bool operator!=(no_default_allocator x, no_default_allocator y) { return !(x == y); }
 };
 
 struct malloc_allocator_base {
-    static std::size_t outstanding_bytes;
-    static std::size_t alloc_count;
-    static std::size_t dealloc_count;
-    static bool disable_default_constructor;
+  static std::size_t outstanding_bytes;
+  static std::size_t alloc_count;
+  static std::size_t dealloc_count;
+  static bool disable_default_constructor;
 
-    static std::size_t outstanding_alloc() {
-      assert(alloc_count >= dealloc_count);
-      return (alloc_count - dealloc_count);
-    }
+  static std::size_t outstanding_alloc() {
+    assert(alloc_count >= dealloc_count);
+    return (alloc_count - dealloc_count);
+  }
 
-    static void reset() {
-        assert(outstanding_alloc() == 0);
-        disable_default_constructor = false;
-        outstanding_bytes = 0;
-        alloc_count = 0;
-        dealloc_count = 0;
-    }
+  static void reset() {
+    assert(outstanding_alloc() == 0);
+    disable_default_constructor = false;
+    outstanding_bytes           = 0;
+    alloc_count                 = 0;
+    dealloc_count               = 0;
+  }
 };
 
-size_t malloc_allocator_base::outstanding_bytes = 0;
-size_t malloc_allocator_base::alloc_count = 0;
-size_t malloc_allocator_base::dealloc_count = 0;
+size_t malloc_allocator_base::outstanding_bytes         = 0;
+size_t malloc_allocator_base::alloc_count               = 0;
+size_t malloc_allocator_base::dealloc_count             = 0;
 bool malloc_allocator_base::disable_default_constructor = false;
 
-
 template <class T>
-class malloc_allocator : public malloc_allocator_base
-{
+class malloc_allocator : public malloc_allocator_base {
 public:
-    typedef T value_type;
+  typedef T value_type;
 
-    malloc_allocator() TEST_NOEXCEPT { assert(!disable_default_constructor); }
+  malloc_allocator() TEST_NOEXCEPT { assert(!disable_default_constructor); }
 
-    template <class U>
-    malloc_allocator(malloc_allocator<U>) TEST_NOEXCEPT {}
+  template <class U>
+  malloc_allocator(malloc_allocator<U>) TEST_NOEXCEPT {}
 
-    T* allocate(std::size_t n)
-    {
-        const std::size_t nbytes = n*sizeof(T);
-        ++alloc_count;
-        outstanding_bytes += nbytes;
-        return static_cast<T*>(std::malloc(nbytes));
-    }
+  T* allocate(std::size_t n) {
+    const std::size_t nbytes = n * sizeof(T);
+    ++alloc_count;
+    outstanding_bytes += nbytes;
+    return static_cast<T*>(std::malloc(nbytes));
+  }
 
-    void deallocate(T* p, std::size_t n)
-    {
-        const std::size_t nbytes = n*sizeof(T);
-        ++dealloc_count;
-        outstanding_bytes -= nbytes;
-        std::free(static_cast<void*>(p));
-    }
+  void deallocate(T* p, std::size_t n) {
+    const std::size_t nbytes = n * sizeof(T);
+    ++dealloc_count;
+    outstanding_bytes -= nbytes;
+    std::free(static_cast<void*>(p));
+  }
 
-    friend bool operator==(malloc_allocator, malloc_allocator) {return true;}
-    friend bool operator!=(malloc_allocator x, malloc_allocator y) {return !(x == y);}
+  friend bool operator==(malloc_allocator, malloc_allocator) { return true; }
+  friend bool operator!=(malloc_allocator x, malloc_allocator y) { return !(x == y); }
 };
 
 template <class T>
-struct cpp03_allocator : bare_allocator<T>
-{
-    typedef T value_type;
-    typedef value_type* pointer;
+struct cpp03_allocator : bare_allocator<T> {
+  typedef T value_type;
+  typedef value_type* pointer;
 
-    static bool construct_called;
+  static bool construct_called;
 
-    // Returned value is not used but it's not prohibited.
-    pointer construct(pointer p, const value_type& val)
-    {
-        ::new(p) value_type(val);
-        construct_called = true;
-        return p;
-    }
+  // Returned value is not used but it's not prohibited.
+  pointer construct(pointer p, const value_type& val) {
+    ::new (p) value_type(val);
+    construct_called = true;
+    return p;
+  }
 
-    std::size_t max_size() const
-    {
-        return UINT_MAX / sizeof(T);
-    }
+  std::size_t max_size() const { return UINT_MAX / sizeof(T); }
 };
-template <class T> bool cpp03_allocator<T>::construct_called = false;
+template <class T>
+bool cpp03_allocator<T>::construct_called = false;
 
 template <class T>
-struct cpp03_overload_allocator : bare_allocator<T>
-{
-    typedef T value_type;
-    typedef value_type* pointer;
+struct cpp03_overload_allocator : bare_allocator<T> {
+  typedef T value_type;
+  typedef value_type* pointer;
 
-    static bool construct_called;
+  static bool construct_called;
 
-    void construct(pointer p, const value_type& val)
-    {
-        construct(p, val, std::is_class<T>());
-    }
-    void construct(pointer p, const value_type& val, std::true_type)
-    {
-        ::new(p) value_type(val);
-        construct_called = true;
-    }
-    void construct(pointer p, const value_type& val, std::false_type)
-    {
-        ::new(p) value_type(val);
-        construct_called = true;
-    }
+  void construct(pointer p, const value_type& val) { construct(p, val, std::is_class<T>()); }
+  void construct(pointer p, const value_type& val, std::true_type) {
+    ::new (p) value_type(val);
+    construct_called = true;
+  }
+  void construct(pointer p, const value_type& val, std::false_type) {
+    ::new (p) value_type(val);
+    construct_called = true;
+  }
 
-    std::size_t max_size() const
-    {
-        return UINT_MAX / sizeof(T);
-    }
+  std::size_t max_size() const { return UINT_MAX / sizeof(T); }
 };
-template <class T> bool cpp03_overload_allocator<T>::construct_called = false;
+template <class T>
+bool cpp03_overload_allocator<T>::construct_called = false;
 
-template <class T, class = std::integral_constant<std::size_t, 0> > class min_pointer;
-template <class T, class ID> class min_pointer<const T, ID>;
-template <class ID> class min_pointer<void, ID>;
-template <class ID> class min_pointer<const void, ID>;
-template <class T> class min_allocator;
+template <class T, class = std::integral_constant<std::size_t, 0> >
+class min_pointer;
+template <class T, class ID>
+class min_pointer<const T, ID>;
+template <class ID>
+class min_pointer<void, ID>;
+template <class ID>
+class min_pointer<const void, ID>;
+template <class T>
+class min_allocator;
 
 template <class ID>
-class min_pointer<const void, ID>
-{
-    const void* ptr_;
+class min_pointer<const void, ID> {
+  const void* ptr_;
+
 public:
-    min_pointer() TEST_NOEXCEPT = default;
-    min_pointer(std::nullptr_t) TEST_NOEXCEPT : ptr_(nullptr) {}
-    template <class T>
-    min_pointer(min_pointer<T, ID> p) TEST_NOEXCEPT : ptr_(p.ptr_) {}
+  min_pointer() TEST_NOEXCEPT = default;
+  min_pointer(std::nullptr_t) TEST_NOEXCEPT : ptr_(nullptr) {}
+  template <class T>
+  min_pointer(min_pointer<T, ID> p) TEST_NOEXCEPT : ptr_(p.ptr_) {}
 
-    explicit operator bool() const {return ptr_ != nullptr;}
+  explicit operator bool() const { return ptr_ != nullptr; }
 
-    friend bool operator==(min_pointer x, min_pointer y) {return x.ptr_ == y.ptr_;}
-    friend bool operator!=(min_pointer x, min_pointer y) {return !(x == y);}
-    template <class U, class XID> friend class min_pointer;
+  friend bool operator==(min_pointer x, min_pointer y) { return x.ptr_ == y.ptr_; }
+  friend bool operator!=(min_pointer x, min_pointer y) { return !(x == y); }
+  template <class U, class XID>
+  friend class min_pointer;
 };
 
 template <class ID>
-class min_pointer<void, ID>
-{
-    void* ptr_;
+class min_pointer<void, ID> {
+  void* ptr_;
+
 public:
-    min_pointer() TEST_NOEXCEPT = default;
-    TEST_CONSTEXPR_CXX14 min_pointer(std::nullptr_t) TEST_NOEXCEPT : ptr_(nullptr) {}
-    template <class T,
-              class = typename std::enable_if
-                       <
-                            !std::is_const<T>::value
-                       >::type
-             >
-    TEST_CONSTEXPR_CXX14 min_pointer(min_pointer<T, ID> p) TEST_NOEXCEPT : ptr_(p.ptr_) {}
+  min_pointer() TEST_NOEXCEPT = default;
+  TEST_CONSTEXPR_CXX14 min_pointer(std::nullptr_t) TEST_NOEXCEPT : ptr_(nullptr) {}
+  template <class T, class = typename std::enable_if< !std::is_const<T>::value >::type >
+  TEST_CONSTEXPR_CXX14 min_pointer(min_pointer<T, ID> p) TEST_NOEXCEPT : ptr_(p.ptr_) {}
 
-    TEST_CONSTEXPR_CXX14 explicit operator bool() const {return ptr_ != nullptr;}
+  TEST_CONSTEXPR_CXX14 explicit operator bool() const { return ptr_ != nullptr; }
 
-    TEST_CONSTEXPR_CXX14 friend bool operator==(min_pointer x, min_pointer y) {return x.ptr_ == y.ptr_;}
-    TEST_CONSTEXPR_CXX14 friend bool operator!=(min_pointer x, min_pointer y) {return !(x == y);}
-    template <class U, class XID> friend class min_pointer;
+  TEST_CONSTEXPR_CXX14 friend bool operator==(min_pointer x, min_pointer y) { return x.ptr_ == y.ptr_; }
+  TEST_CONSTEXPR_CXX14 friend bool operator!=(min_pointer x, min_pointer y) { return !(x == y); }
+  template <class U, class XID>
+  friend class min_pointer;
 };
 
 template <class T, class ID>
-class min_pointer
-{
-    T* ptr_;
+class min_pointer {
+  T* ptr_;
+
+  TEST_CONSTEXPR_CXX14 explicit min_pointer(T* p) TEST_NOEXCEPT : ptr_(p) {}
 
-    TEST_CONSTEXPR_CXX14 explicit min_pointer(T* p) TEST_NOEXCEPT : ptr_(p) {}
 public:
-    min_pointer() TEST_NOEXCEPT = default;
-    TEST_CONSTEXPR_CXX14 min_pointer(std::nullptr_t) TEST_NOEXCEPT : ptr_(nullptr) {}
-    TEST_CONSTEXPR_CXX14 explicit min_pointer(min_pointer<void, ID> p) TEST_NOEXCEPT : ptr_(static_cast<T*>(p.ptr_)) {}
+  min_pointer() TEST_NOEXCEPT = default;
+  TEST_CONSTEXPR_CXX14 min_pointer(std::nullptr_t) TEST_NOEXCEPT : ptr_(nullptr) {}
+  TEST_CONSTEXPR_CXX14 explicit min_pointer(min_pointer<void, ID> p) TEST_NOEXCEPT : ptr_(static_cast<T*>(p.ptr_)) {}
 
-    TEST_CONSTEXPR_CXX14 explicit operator bool() const {return ptr_ != nullptr;}
+  TEST_CONSTEXPR_CXX14 explicit operator bool() const { return ptr_ != nullptr; }
 
-    typedef std::ptrdiff_t difference_type;
-    typedef T& reference;
-    typedef T* pointer;
-    typedef T value_type;
-    typedef std::random_access_iterator_tag iterator_category;
+  typedef std::ptrdiff_t difference_type;
+  typedef T& reference;
+  typedef T* pointer;
+  typedef T value_type;
+  typedef std::random_access_iterator_tag iterator_category;
 
-    TEST_CONSTEXPR_CXX14 reference operator*() const {return *ptr_;}
-    TEST_CONSTEXPR_CXX14 pointer operator->() const {return ptr_;}
+  TEST_CONSTEXPR_CXX14 reference operator*() const { return *ptr_; }
+  TEST_CONSTEXPR_CXX14 pointer operator->() const { return ptr_; }
 
-    TEST_CONSTEXPR_CXX14 min_pointer& operator++() {++ptr_; return *this;}
-    TEST_CONSTEXPR_CXX14 min_pointer operator++(int) {min_pointer tmp(*this); ++ptr_; return tmp;}
+  TEST_CONSTEXPR_CXX14 min_pointer& operator++() {
+    ++ptr_;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX14 min_pointer operator++(int) {
+    min_pointer tmp(*this);
+    ++ptr_;
+    return tmp;
+  }
 
-    TEST_CONSTEXPR_CXX14 min_pointer& operator--() {--ptr_; return *this;}
-    TEST_CONSTEXPR_CXX14 min_pointer operator--(int) {min_pointer tmp(*this); --ptr_; return tmp;}
+  TEST_CONSTEXPR_CXX14 min_pointer& operator--() {
+    --ptr_;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX14 min_pointer operator--(int) {
+    min_pointer tmp(*this);
+    --ptr_;
+    return tmp;
+  }
 
-    TEST_CONSTEXPR_CXX14 min_pointer& operator+=(difference_type n) {ptr_ += n; return *this;}
-    TEST_CONSTEXPR_CXX14 min_pointer& operator-=(difference_type n) {ptr_ -= n; return *this;}
+  TEST_CONSTEXPR_CXX14 min_pointer& operator+=(difference_type n) {
+    ptr_ += n;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX14 min_pointer& operator-=(difference_type n) {
+    ptr_ -= n;
+    return *this;
+  }
 
-    TEST_CONSTEXPR_CXX14 min_pointer operator+(difference_type n) const
-    {
-        min_pointer tmp(*this);
-        tmp += n;
-        return tmp;
-    }
+  TEST_CONSTEXPR_CXX14 min_pointer operator+(difference_type n) const {
+    min_pointer tmp(*this);
+    tmp += n;
+    return tmp;
+  }
 
-    friend TEST_CONSTEXPR_CXX14 min_pointer operator+(difference_type n, min_pointer x)
-    {
-        return x + n;
-    }
+  friend TEST_CONSTEXPR_CXX14 min_pointer operator+(difference_type n, min_pointer x) { return x + n; }
 
-    TEST_CONSTEXPR_CXX14 min_pointer operator-(difference_type n) const
-    {
-        min_pointer tmp(*this);
-        tmp -= n;
-        return tmp;
-    }
+  TEST_CONSTEXPR_CXX14 min_pointer operator-(difference_type n) const {
+    min_pointer tmp(*this);
+    tmp -= n;
+    return tmp;
+  }
 
-    friend TEST_CONSTEXPR_CXX14 difference_type operator-(min_pointer x, min_pointer y)
-    {
-        return x.ptr_ - y.ptr_;
-    }
+  friend TEST_CONSTEXPR_CXX14 difference_type operator-(min_pointer x, min_pointer y) { return x.ptr_ - y.ptr_; }
 
-    TEST_CONSTEXPR_CXX14 reference operator[](difference_type n) const {return ptr_[n];}
+  TEST_CONSTEXPR_CXX14 reference operator[](difference_type n) const { return ptr_[n]; }
 
-    friend TEST_CONSTEXPR_CXX14 bool operator< (min_pointer x, min_pointer y) {return x.ptr_ < y.ptr_;}
-    friend TEST_CONSTEXPR_CXX14 bool operator> (min_pointer x, min_pointer y) {return y < x;}
-    friend TEST_CONSTEXPR_CXX14 bool operator<=(min_pointer x, min_pointer y) {return !(y < x);}
-    friend TEST_CONSTEXPR_CXX14 bool operator>=(min_pointer x, min_pointer y) {return !(x < y);}
+  friend TEST_CONSTEXPR_CXX14 bool operator<(min_pointer x, min_pointer y) { return x.ptr_ < y.ptr_; }
+  friend TEST_CONSTEXPR_CXX14 bool operator>(min_pointer x, min_pointer y) { return y < x; }
+  friend TEST_CONSTEXPR_CXX14 bool operator<=(min_pointer x, min_pointer y) { return !(y < x); }
+  friend TEST_CONSTEXPR_CXX14 bool operator>=(min_pointer x, min_pointer y) { return !(x < y); }
 
-    static TEST_CONSTEXPR_CXX14 min_pointer pointer_to(T& t) {return min_pointer(std::addressof(t));}
+  static TEST_CONSTEXPR_CXX14 min_pointer pointer_to(T& t) { return min_pointer(std::addressof(t)); }
 
-    friend TEST_CONSTEXPR_CXX14 bool operator==(min_pointer x, min_pointer y) {return x.ptr_ == y.ptr_;}
-    friend TEST_CONSTEXPR_CXX14 bool operator!=(min_pointer x, min_pointer y) {return !(x == y);}
-    template <class U, class XID> friend class min_pointer;
-    template <class U> friend class min_allocator;
+  friend TEST_CONSTEXPR_CXX14 bool operator==(min_pointer x, min_pointer y) { return x.ptr_ == y.ptr_; }
+  friend TEST_CONSTEXPR_CXX14 bool operator!=(min_pointer x, min_pointer y) { return !(x == y); }
+  template <class U, class XID>
+  friend class min_pointer;
+  template <class U>
+  friend class min_allocator;
 };
 
 template <class T, class ID>
-class min_pointer<const T, ID>
-{
-    const T* ptr_;
+class min_pointer<const T, ID> {
+  const T* ptr_;
+
+  TEST_CONSTEXPR_CXX14 explicit min_pointer(const T* p) : ptr_(p) {}
 
-    TEST_CONSTEXPR_CXX14 explicit min_pointer(const T* p) : ptr_(p) {}
 public:
-    min_pointer() TEST_NOEXCEPT = default;
-    TEST_CONSTEXPR_CXX14 min_pointer(std::nullptr_t) : ptr_(nullptr) {}
-    TEST_CONSTEXPR_CXX14 min_pointer(min_pointer<T, ID> p) : ptr_(p.ptr_) {}
-    TEST_CONSTEXPR_CXX14 explicit min_pointer(min_pointer<const void, ID> p) : ptr_(static_cast<const T*>(p.ptr_)) {}
+  min_pointer() TEST_NOEXCEPT = default;
+  TEST_CONSTEXPR_CXX14 min_pointer(std::nullptr_t) : ptr_(nullptr) {}
+  TEST_CONSTEXPR_CXX14 min_pointer(min_pointer<T, ID> p) : ptr_(p.ptr_) {}
+  TEST_CONSTEXPR_CXX14 explicit min_pointer(min_pointer<const void, ID> p) : ptr_(static_cast<const T*>(p.ptr_)) {}
 
-    TEST_CONSTEXPR_CXX14 explicit operator bool() const {return ptr_ != nullptr;}
+  TEST_CONSTEXPR_CXX14 explicit operator bool() const { return ptr_ != nullptr; }
 
-    typedef std::ptrdiff_t difference_type;
-    typedef const T& reference;
-    typedef const T* pointer;
-    typedef const T value_type;
-    typedef std::random_access_iterator_tag iterator_category;
+  typedef std::ptrdiff_t difference_type;
+  typedef const T& reference;
+  typedef const T* pointer;
+  typedef const T value_type;
+  typedef std::random_access_iterator_tag iterator_category;
 
-    TEST_CONSTEXPR_CXX14 reference operator*() const {return *ptr_;}
-    TEST_CONSTEXPR_CXX14 pointer operator->() const {return ptr_;}
+  TEST_CONSTEXPR_CXX14 reference operator*() const { return *ptr_; }
+  TEST_CONSTEXPR_CXX14 pointer operator->() const { return ptr_; }
 
-    TEST_CONSTEXPR_CXX14 min_pointer& operator++() {++ptr_; return *this;}
-    TEST_CONSTEXPR_CXX14 min_pointer operator++(int) {min_pointer tmp(*this); ++ptr_; return tmp;}
+  TEST_CONSTEXPR_CXX14 min_pointer& operator++() {
+    ++ptr_;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX14 min_pointer operator++(int) {
+    min_pointer tmp(*this);
+    ++ptr_;
+    return tmp;
+  }
 
-    TEST_CONSTEXPR_CXX14 min_pointer& operator--() {--ptr_; return *this;}
-    TEST_CONSTEXPR_CXX14 min_pointer operator--(int) {min_pointer tmp(*this); --ptr_; return tmp;}
+  TEST_CONSTEXPR_CXX14 min_pointer& operator--() {
+    --ptr_;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX14 min_pointer operator--(int) {
+    min_pointer tmp(*this);
+    --ptr_;
+    return tmp;
+  }
 
-    TEST_CONSTEXPR_CXX14 min_pointer& operator+=(difference_type n) {ptr_ += n; return *this;}
-    TEST_CONSTEXPR_CXX14 min_pointer& operator-=(difference_type n) {ptr_ -= n; return *this;}
+  TEST_CONSTEXPR_CXX14 min_pointer& operator+=(difference_type n) {
+    ptr_ += n;
+    return *this;
+  }
+  TEST_CONSTEXPR_CXX14 min_pointer& operator-=(difference_type n) {
+    ptr_ -= n;
+    return *this;
+  }
 
-    TEST_CONSTEXPR_CXX14 min_pointer operator+(difference_type n) const
-    {
-        min_pointer tmp(*this);
-        tmp += n;
-        return tmp;
-    }
+  TEST_CONSTEXPR_CXX14 min_pointer operator+(difference_type n) const {
+    min_pointer tmp(*this);
+    tmp += n;
+    return tmp;
+  }
 
-    friend TEST_CONSTEXPR_CXX14 min_pointer operator+(difference_type n, min_pointer x)
-    {
-        return x + n;
-    }
+  friend TEST_CONSTEXPR_CXX14 min_pointer operator+(difference_type n, min_pointer x) { return x + n; }
 
-    TEST_CONSTEXPR_CXX14 min_pointer operator-(difference_type n) const
-    {
-        min_pointer tmp(*this);
-        tmp -= n;
-        return tmp;
-    }
+  TEST_CONSTEXPR_CXX14 min_pointer operator-(difference_type n) const {
+    min_pointer tmp(*this);
+    tmp -= n;
+    return tmp;
+  }
 
-    friend TEST_CONSTEXPR_CXX14 difference_type operator-(min_pointer x, min_pointer y)
-    {
-        return x.ptr_ - y.ptr_;
-    }
+  friend TEST_CONSTEXPR_CXX14 difference_type operator-(min_pointer x, min_pointer y) { return x.ptr_ - y.ptr_; }
 
-    TEST_CONSTEXPR_CXX14 reference operator[](difference_type n) const {return ptr_[n];}
+  TEST_CONSTEXPR_CXX14 reference operator[](difference_type n) const { return ptr_[n]; }
 
-    friend TEST_CONSTEXPR_CXX14 bool operator< (min_pointer x, min_pointer y) {return x.ptr_ < y.ptr_;}
-    friend TEST_CONSTEXPR_CXX14 bool operator> (min_pointer x, min_pointer y) {return y < x;}
-    friend TEST_CONSTEXPR_CXX14 bool operator<=(min_pointer x, min_pointer y) {return !(y < x);}
-    friend TEST_CONSTEXPR_CXX14 bool operator>=(min_pointer x, min_pointer y) {return !(x < y);}
+  friend TEST_CONSTEXPR_CXX14 bool operator<(min_pointer x, min_pointer y) { return x.ptr_ < y.ptr_; }
+  friend TEST_CONSTEXPR_CXX14 bool operator>(min_pointer x, min_pointer y) { return y < x; }
+  friend TEST_CONSTEXPR_CXX14 bool operator<=(min_pointer x, min_pointer y) { return !(y < x); }
+  friend TEST_CONSTEXPR_CXX14 bool operator>=(min_pointer x, min_pointer y) { return !(x < y); }
 
-    static TEST_CONSTEXPR_CXX14 min_pointer pointer_to(const T& t) {return min_pointer(std::addressof(t));}
+  static TEST_CONSTEXPR_CXX14 min_pointer pointer_to(const T& t) { return min_pointer(std::addressof(t)); }
 
-    friend TEST_CONSTEXPR_CXX14 bool operator==(min_pointer x, min_pointer y) {return x.ptr_ == y.ptr_;}
-    friend TEST_CONSTEXPR_CXX14 bool operator!=(min_pointer x, min_pointer y) {return x.ptr_ != y.ptr_;}
-    friend TEST_CONSTEXPR_CXX14 bool operator==(min_pointer x, std::nullptr_t) {return x.ptr_ == nullptr;}
-    friend TEST_CONSTEXPR_CXX14 bool operator!=(min_pointer x, std::nullptr_t) {return x.ptr_ != nullptr;}
-    friend TEST_CONSTEXPR_CXX14 bool operator==(std::nullptr_t, min_pointer x) {return x.ptr_ == nullptr;}
-    friend TEST_CONSTEXPR_CXX14 bool operator!=(std::nullptr_t, min_pointer x) {return x.ptr_ != nullptr;}
-    template <class U, class XID> friend class min_pointer;
+  friend TEST_CONSTEXPR_CXX14 bool operator==(min_pointer x, min_pointer y) { return x.ptr_ == y.ptr_; }
+  friend TEST_CONSTEXPR_CXX14 bool operator!=(min_pointer x, min_pointer y) { return x.ptr_ != y.ptr_; }
+  friend TEST_CONSTEXPR_CXX14 bool operator==(min_pointer x, std::nullptr_t) { return x.ptr_ == nullptr; }
+  friend TEST_CONSTEXPR_CXX14 bool operator!=(min_pointer x, std::nullptr_t) { return x.ptr_ != nullptr; }
+  friend TEST_CONSTEXPR_CXX14 bool operator==(std::nullptr_t, min_pointer x) { return x.ptr_ == nullptr; }
+  friend TEST_CONSTEXPR_CXX14 bool operator!=(std::nullptr_t, min_pointer x) { return x.ptr_ != nullptr; }
+  template <class U, class XID>
+  friend class min_pointer;
 };
 
 template <class T>
-class min_allocator
-{
+class min_allocator {
 public:
-    typedef T value_type;
-    typedef min_pointer<T> pointer;
+  typedef T value_type;
+  typedef min_pointer<T> pointer;
 
-    min_allocator() = default;
-    template <class U>
-    TEST_CONSTEXPR_CXX20 min_allocator(min_allocator<U>) {}
+  min_allocator() = default;
+  template <class U>
+  TEST_CONSTEXPR_CXX20 min_allocator(min_allocator<U>) {}
 
-    TEST_CONSTEXPR_CXX20 pointer allocate(std::size_t n) { return pointer(std::allocator<T>().allocate(n)); }
+  TEST_CONSTEXPR_CXX20 pointer allocate(std::size_t n) { return pointer(std::allocator<T>().allocate(n)); }
 
-    TEST_CONSTEXPR_CXX20 void deallocate(pointer p, std::size_t n) { std::allocator<T>().deallocate(p.ptr_, n); }
+  TEST_CONSTEXPR_CXX20 void deallocate(pointer p, std::size_t n) { std::allocator<T>().deallocate(p.ptr_, n); }
 
-    TEST_CONSTEXPR_CXX20 friend bool operator==(min_allocator, min_allocator) {return true;}
-    TEST_CONSTEXPR_CXX20 friend bool operator!=(min_allocator x, min_allocator y) {return !(x == y);}
+  TEST_CONSTEXPR_CXX20 friend bool operator==(min_allocator, min_allocator) { return true; }
+  TEST_CONSTEXPR_CXX20 friend bool operator!=(min_allocator x, min_allocator y) { return !(x == y); }
 };
 
 template <class T>
@@ -427,25 +424,19 @@ template <class T>
 class explicit_allocator
 {
 public:
-    typedef T value_type;
+  typedef T value_type;
 
-    TEST_CONSTEXPR_CXX20 explicit_allocator() TEST_NOEXCEPT {}
+  TEST_CONSTEXPR_CXX20 explicit_allocator() TEST_NOEXCEPT {}
 
-    template <class U>
-    TEST_CONSTEXPR_CXX20 explicit explicit_allocator(explicit_allocator<U>) TEST_NOEXCEPT {}
+  template <class U>
+  TEST_CONSTEXPR_CXX20 explicit explicit_allocator(explicit_allocator<U>) TEST_NOEXCEPT {}
 
-    TEST_CONSTEXPR_CXX20 T* allocate(std::size_t n)
-    {
-        return static_cast<T*>(std::allocator<T>().allocate(n));
-    }
+  TEST_CONSTEXPR_CXX20 T* allocate(std::size_t n) { return static_cast<T*>(std::allocator<T>().allocate(n)); }
 
-    TEST_CONSTEXPR_CXX20 void deallocate(T* p, std::size_t n)
-    {
-        std::allocator<T>().deallocate(p, n);
-    }
+  TEST_CONSTEXPR_CXX20 void deallocate(T* p, std::size_t n) { std::allocator<T>().deallocate(p, n); }
 
-    TEST_CONSTEXPR_CXX20 friend bool operator==(explicit_allocator, explicit_allocator) {return true;}
-    TEST_CONSTEXPR_CXX20 friend bool operator!=(explicit_allocator x, explicit_allocator y) {return !(x == y);}
+  TEST_CONSTEXPR_CXX20 friend bool operator==(explicit_allocator, explicit_allocator) { return true; }
+  TEST_CONSTEXPR_CXX20 friend bool operator!=(explicit_allocator x, explicit_allocator y) { return !(x == y); }
 };
 
 template <class T>
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index b59c7fdaf0a3..de06b9dd1bee 100644
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -372,6 +372,11 @@ feature_test_macros = [
             "values": {"c++20": 201811},
             "headers": ["iterator"],
         },
+        {
+            "name": "__cpp_lib_constexpr_list",
+            "values": {"c++26": 202502},
+            "headers": ["list"],
+        },
         {
             "name": "__cpp_lib_constexpr_memory",
             "values": {"c++20": 201811, "c++23": 202202},

From 6d785ca4218b18e77e39320bea7f8973c3ea2764 Mon Sep 17 00:00:00 2001
From: Ying Yi <ying.yi@sony.com>
Date: Wed, 18 Jun 2025 17:14:33 +0100
Subject: [PATCH 0831/1322] [Clang] Fix the clang/test/PCH/ignored-pch.c test.
 (#144737)

Change the test to check the exit status of the 'ls' command line
(instead of error message) since the error message is different when
running 'ls' command on the different Host machine.
---
 clang/test/PCH/ignored-pch.c | 58 +++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 31 deletions(-)

diff --git a/clang/test/PCH/ignored-pch.c b/clang/test/PCH/ignored-pch.c
index 5b64582cba61..c6ef3fe74cee 100644
--- a/clang/test/PCH/ignored-pch.c
+++ b/clang/test/PCH/ignored-pch.c
@@ -1,96 +1,96 @@
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -o %t.ll
-// RUN: ls %t.pch | FileCheck --check-prefix=CHECK-PCH %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: ls %t.pch
+// RUN: ls %t.ll
 
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch
 // RUN: %clang %s -emit-ast -include-pch %t.pch -o %t.ll
-// RUN: ls %t.pch | FileCheck --check-prefix=CHECK-PCH %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: ls %t.pch
+// RUN: ls %t.ll
 
 // Check that -ignore-pch causes -emit-pch and -include-pch options to be ignored.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
-// RUN: ls %t.ll 2>&1 | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -emit-ast %s -include-pch %t.pch -ignore-pch -o %t.ll
-// RUN: not ls %t.ll 2>&1 | FileCheck --check-prefix=CHECK-OBJ-ERROR %s
+// RUN: not ls %t.ll
 
 // Check that -ignore-pch works for multiple PCH related options.
 // Test with -building-pch-with-obj.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -building-pch-with-obj -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -building-pch-with-obj -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // Test with -fallow-pch-with-compiler-errors.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fallow-pch-with-compiler-errors -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fallow-pch-with-compiler-errors -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // Test with -fallow-pch-with-different-modules-cache-path.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fallow-pch-with-different-modules-cache-path -o %t.pch
 // RUN: %clang -S -emit-llvm %s -ignore-pch -include-pch %t.pch -Xclang -fallow-pch-with-different-modules-cache-path -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // Test with -fpch-codegen.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-codegen -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-codegen -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // Test with -fpch-debuginfo.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-debuginfo -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-debuginfo -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // Test with -fpch-instantiate-templates.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-instantiate-templates -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-instantiate-templates -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // Test with -fno-pch-timestamp.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fno-pch-timestamp -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fno-pch-timestamp -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // Test with -fno-validate-pch.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fno-validate-pch -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fno-validate-pch -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // Test with -relocatable-pch.
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -relocatable-pch -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -relocatable-pch -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 // Test with -pch-through-hdrstop-create/-pch-through-hdrstop-use
 // RUN: rm -rf %t.pch %t.ll
 // RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -pch-through-hdrstop-create -o %t.pch
 // RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -pch-through-hdrstop-use -o %t.ll
-// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
-// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+// RUN: not ls %t.pch
+// RUN: ls %t.ll
 
 
 // Test with AST dump output:
@@ -99,10 +99,6 @@
 // RUN: %clang %s -include-pch %t.pch -Xclang -ast-dump-all -c | FileCheck --check-prefix=CHECK-AST-PCH %s
 // RUN: %clang %s -include-pch %t.pch -ignore-pch -Xclang -ast-dump-all -c | FileCheck --check-prefix=CHECK-AST %s
 
-// CHECK-PCH: ignored-pch.c.{{.*}}.pch
-// CHECK-OBJ: ignored-pch.c.{{.*}}.ll
-// CHECK-PCH-ERROR: ignored-pch.c.{{.*}}.pch{{'?}}: No such file or directory
-// CHECK-OBJ-ERROR: ignored-pch.c.{{.*}}.ll{{'?}}: No such file or directory
 // CHECK-AST-PCH: <undeserialized declarations>
 // CHECK-AST-NOT: <undeserialized declarations>
 

From 2a41350aabd8b7d3e406141a55ce0bb6f5e70a76 Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Wed, 18 Jun 2025 17:15:12 +0100
Subject: [PATCH 0832/1322] =?UTF-8?q?Fix=20bazel=20build=20issue=20caused?=
 =?UTF-8?q?=20by=20#142986=20second=20attempt=20(#144721=20didnt=E2=80=A6?=
 =?UTF-8?q?=20(#144743)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

… cover everything)
---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel      | 1 +
 utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 1 +
 2 files changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 48f2d0900d3e..761a93ea7dfa 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -12796,6 +12796,7 @@ cc_library(
         "lib/Dialect/Bufferization/IR/BufferViewFlowOpInterface.cpp",
         "lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp",
         "lib/Dialect/Bufferization/IR/BufferizationDialect.cpp",
+        "lib/Dialect/Bufferization/IR/BufferizationTypeInterfaces.cpp",
         "lib/Dialect/Bufferization/IR/BufferizationOps.cpp",
         "lib/Dialect/Bufferization/IR/UnstructuredControlFlow.cpp",
     ],
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 0eaf86da7f27..a439fdd50d21 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -375,6 +375,7 @@ cc_library(
         "//llvm:IRReader",
         "//llvm:Support",
         "//mlir:ArithDialect",
+        "//mlir:BufferizationDialect",
         "//mlir:BufferizationInterfaces",
         "//mlir:BytecodeOpInterface",
         "//mlir:CallOpInterfaces",

From dd40c460c42d075c47f0d1a6d83f129655eafe10 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245@hotmail.com>
Date: Wed, 18 Jun 2025 12:16:01 -0400
Subject: [PATCH 0833/1322] [libc++] Clean up casts in std::forward_list
 (#130310)

The patch removes unnecessary casts to `void*` pointers, inline some
casts, and eliminates an identity cast.
---
 libcxx/include/forward_list | 97 +++++++++++++++----------------------
 1 file changed, 39 insertions(+), 58 deletions(-)

diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index e9b2c860b89c..bad0c11b7c7e 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -282,7 +282,6 @@ struct __forward_node_traits {
   typedef _NodePtr __node_pointer;
   typedef __forward_begin_node<_NodePtr> __begin_node;
   typedef __rebind_pointer_t<_NodePtr, __begin_node> __begin_node_pointer;
-  typedef __rebind_pointer_t<_NodePtr, void> __void_pointer;
 
 // TODO(LLVM 22): Remove this check
 #  ifndef _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB
@@ -294,10 +293,6 @@ struct __forward_node_traits {
                 "is being broken between LLVM 19 and LLVM 20. If you don't care about your ABI being broken, define "
                 "the _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB macro to silence this diagnostic.");
 #  endif
-
-  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI static __begin_node_pointer __as_iter_node(__node_pointer __p) {
-    return std::__static_fancy_pointer_cast<__begin_node_pointer>(__p);
-  }
 };
 
 template <class _NodePtr>
@@ -309,10 +304,6 @@ struct __forward_begin_node {
 
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_begin_node() : __next_(nullptr) {}
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_begin_node(pointer __n) : __next_(__n) {}
-
-  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __next_as_begin() const {
-    return std::__static_fancy_pointer_cast<__begin_node_pointer>(__next_);
-  }
 };
 
 template <class _Tp, class _VoidPtr>
@@ -361,15 +352,9 @@ class __forward_list_iterator {
   typedef typename __traits::__begin_node __begin_node_type;
   typedef typename __traits::__node_pointer __node_pointer;
   typedef typename __traits::__begin_node_pointer __begin_node_pointer;
-  typedef typename __traits::__void_pointer __void_pointer;
 
   __begin_node_pointer __ptr_;
 
-  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { return __ptr_; }
-  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
-    return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_);
-  }
-
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(nullptr_t) _NOEXCEPT
       : __ptr_(nullptr) {}
 
@@ -377,7 +362,7 @@ class __forward_list_iterator {
   _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__begin_node_pointer __p) _NOEXCEPT : __ptr_(__p) {}
 
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__node_pointer __p) _NOEXCEPT
-      : __ptr_(__traits::__as_iter_node(__p)) {}
+      : __ptr_(std::__static_fancy_pointer_cast<__begin_node_pointer>(__p)) {}
 
   template <class, class>
   friend class forward_list;
@@ -394,14 +379,14 @@ public:
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {}
 
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const {
-    return __get_unsafe_node_pointer()->__get_value();
+    return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_)->__get_value();
   }
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
-    return pointer_traits<pointer>::pointer_to(__get_unsafe_node_pointer()->__get_value());
+    return pointer_traits<pointer>::pointer_to(std::__static_fancy_pointer_cast<__node_pointer>(__ptr_)->__get_value());
   }
 
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator& operator++() {
-    __ptr_ = __traits::__as_iter_node(__ptr_->__next_);
+    __ptr_ = std::__static_fancy_pointer_cast<__begin_node_pointer>(__ptr_->__next_);
     return *this;
   }
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator operator++(int) {
@@ -430,15 +415,9 @@ class __forward_list_const_iterator {
   typedef typename __traits::__begin_node __begin_node_type;
   typedef typename __traits::__node_pointer __node_pointer;
   typedef typename __traits::__begin_node_pointer __begin_node_pointer;
-  typedef typename __traits::__void_pointer __void_pointer;
 
   __begin_node_pointer __ptr_;
 
-  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { return __ptr_; }
-  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
-    return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_);
-  }
-
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(nullptr_t) _NOEXCEPT
       : __ptr_(nullptr) {}
 
@@ -447,7 +426,7 @@ class __forward_list_const_iterator {
 
   _LIBCPP_CONSTEXPR_SINCE_CXX26
   _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(__node_pointer __p) _NOEXCEPT
-      : __ptr_(__traits::__as_iter_node(__p)) {}
+      : __ptr_(std::__static_fancy_pointer_cast<__begin_node_pointer>(__p)) {}
 
   template <class, class>
   friend class forward_list;
@@ -464,14 +443,14 @@ public:
   __forward_list_const_iterator(__forward_list_iterator<__node_pointer> __p) _NOEXCEPT : __ptr_(__p.__ptr_) {}
 
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const {
-    return __get_unsafe_node_pointer()->__get_value();
+    return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_)->__get_value();
   }
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
-    return pointer_traits<pointer>::pointer_to(__get_unsafe_node_pointer()->__get_value());
+    return pointer_traits<pointer>::pointer_to(std::__static_fancy_pointer_cast<__node_pointer>(__ptr_)->__get_value());
   }
 
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator& operator++() {
-    __ptr_ = __traits::__as_iter_node(__ptr_->__next_);
+    __ptr_ = std::__static_fancy_pointer_cast<__begin_node_pointer>(__ptr_->__next_);
     return *this;
   }
   _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator operator++(int) {
@@ -963,7 +942,8 @@ _LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>::forward_list(con
 template <class _Tp, class _Alloc>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n) {
   if (__n > 0) {
-    for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) {
+    for (__begin_node_pointer __p = __base::__before_begin(); __n > 0;
+         --__n, __p = std::__static_fancy_pointer_cast<__begin_node_pointer>(__p->__next_)) {
       __p->__next_ = this->__create_node(/* next = */ nullptr);
     }
   }
@@ -974,7 +954,8 @@ template <class _Tp, class _Alloc>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc)
     : __base(__base_alloc) {
   if (__n > 0) {
-    for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) {
+    for (__begin_node_pointer __p = __base::__before_begin(); __n > 0;
+         --__n, __p = std::__static_fancy_pointer_cast<__begin_node_pointer>(__p->__next_)) {
       __p->__next_ = this->__create_node(/* next = */ nullptr);
     }
   }
@@ -1167,7 +1148,7 @@ template <class _Tp, class _Alloc>
 template <class... _Args>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... __args) {
-  __begin_node_pointer const __r = __p.__get_begin();
+  __begin_node_pointer const __r = __p.__ptr_;
   __r->__next_                   = this->__create_node(/* next = */ __r->__next_, std::forward<_Args>(__args)...);
   return iterator(__r->__next_);
 }
@@ -1175,7 +1156,7 @@ forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... __args)
 template <class _Tp, class _Alloc>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) {
-  __begin_node_pointer const __r = __p.__get_begin();
+  __begin_node_pointer const __r = __p.__ptr_;
   __r->__next_                   = this->__create_node(/* next = */ __r->__next_, std::move(__v));
   return iterator(__r->__next_);
 }
@@ -1185,7 +1166,7 @@ forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) {
 template <class _Tp, class _Alloc>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, const value_type& __v) {
-  __begin_node_pointer const __r = __p.__get_begin();
+  __begin_node_pointer const __r = __p.__ptr_;
   __r->__next_                   = this->__create_node(/* next = */ __r->__next_, __v);
   return iterator(__r->__next_);
 }
@@ -1194,7 +1175,7 @@ template <class _Tp, class _Alloc>
 template <class... _Args>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Args&&... __args) {
-  __begin_node_pointer __r = __p.__get_begin();
+  __begin_node_pointer __r = __p.__ptr_;
   if (__n > 0) {
     __node_pointer __first = this->__create_node(/* next = */ nullptr, std::forward<_Args>(__args)...);
     __node_pointer __last  = __first;
@@ -1216,7 +1197,7 @@ forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Ar
 #  endif // _LIBCPP_HAS_EXCEPTIONS
     __last->__next_ = __r->__next_;
     __r->__next_    = __first;
-    __r             = __forward_node_traits<__node_pointer>::__as_iter_node(__last);
+    __r             = std::__static_fancy_pointer_cast<__begin_node_pointer>(__last);
   }
   return iterator(__r);
 }
@@ -1232,7 +1213,7 @@ template <class _Tp, class _Alloc>
 template <class _InputIterator, class _Sentinel>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l) {
-  __begin_node_pointer __r = __p.__get_begin();
+  __begin_node_pointer __r = __p.__ptr_;
 
   if (__f != __l) {
     __node_pointer __first = this->__create_node(/* next = */ nullptr, *__f);
@@ -1257,7 +1238,7 @@ forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _Inp
 
     __last->__next_ = __r->__next_;
     __r->__next_    = __first;
-    __r             = __forward_node_traits<__node_pointer>::__as_iter_node(__last);
+    __r             = std::__static_fancy_pointer_cast<__begin_node_pointer>(__last);
   }
 
   return iterator(__r);
@@ -1266,7 +1247,7 @@ forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _Inp
 template <class _Tp, class _Alloc>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) {
-  __begin_node_pointer __p = __f.__get_begin();
+  __begin_node_pointer __p = __f.__ptr_;
   __node_pointer __n       = __p->__next_;
   __p->__next_             = __n->__next_;
   this->__delete_node(__n);
@@ -1276,9 +1257,9 @@ forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) {
 template <class _Tp, class _Alloc>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::erase_after(const_iterator __f, const_iterator __l) {
-  __node_pointer __e = __l.__get_unsafe_node_pointer();
+  __node_pointer __e = std::__static_fancy_pointer_cast<__node_pointer>(__l.__ptr_);
   if (__f != __l) {
-    __begin_node_pointer __bp = __f.__get_begin();
+    __begin_node_pointer __bp = __f.__ptr_;
 
     __node_pointer __n = __bp->__next_;
     if (__n != __e) {
@@ -1324,13 +1305,13 @@ _LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::resize(size_type _
 template <class _Tp, class _Alloc>
 _LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& __x) {
   if (!__x.empty()) {
-    if (__p.__get_begin()->__next_ != nullptr) {
+    if (__p.__ptr_->__next_ != nullptr) {
       const_iterator __lm1 = __x.before_begin();
-      while (__lm1.__get_begin()->__next_ != nullptr)
+      while (__lm1.__ptr_->__next_ != nullptr)
         ++__lm1;
-      __lm1.__get_begin()->__next_ = __p.__get_begin()->__next_;
+      __lm1.__ptr_->__next_ = __p.__ptr_->__next_;
     }
-    __p.__get_begin()->__next_    = __x.__before_begin()->__next_;
+    __p.__ptr_->__next_           = __x.__before_begin()->__next_;
     __x.__before_begin()->__next_ = nullptr;
   }
 }
@@ -1340,9 +1321,9 @@ _LIBCPP_CONSTEXPR_SINCE_CXX26 void
 forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /*__other*/, const_iterator __i) {
   const_iterator __lm1 = std::next(__i);
   if (__p != __i && __p != __lm1) {
-    __i.__get_begin()->__next_   = __lm1.__get_begin()->__next_;
-    __lm1.__get_begin()->__next_ = __p.__get_begin()->__next_;
-    __p.__get_begin()->__next_   = __lm1.__get_unsafe_node_pointer();
+    __i.__ptr_->__next_   = __lm1.__ptr_->__next_;
+    __lm1.__ptr_->__next_ = __p.__ptr_->__next_;
+    __p.__ptr_->__next_   = std::__static_fancy_pointer_cast<__node_pointer>(__lm1.__ptr_);
   }
 }
 
@@ -1351,12 +1332,12 @@ _LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::splice_after(
     const_iterator __p, forward_list& /*__other*/, const_iterator __f, const_iterator __l) {
   if (__f != __l && __p != __f) {
     const_iterator __lm1 = __f;
-    while (__lm1.__get_begin()->__next_ != __l.__get_begin())
+    while (__lm1.__ptr_->__next_ != __l.__ptr_)
       ++__lm1;
     if (__f != __lm1) {
-      __lm1.__get_begin()->__next_ = __p.__get_begin()->__next_;
-      __p.__get_begin()->__next_   = __f.__get_begin()->__next_;
-      __f.__get_begin()->__next_   = __l.__get_unsafe_node_pointer();
+      __lm1.__ptr_->__next_ = __p.__ptr_->__next_;
+      __p.__ptr_->__next_   = __f.__ptr_->__next_;
+      __f.__ptr_->__next_   = std::__static_fancy_pointer_cast<__node_pointer>(__l.__ptr_);
     }
   }
 }
@@ -1385,8 +1366,8 @@ forward_list<_Tp, _Alloc>::remove(const value_type& __v) {
   forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0;
   const iterator __e                                            = end();
-  for (iterator __i = before_begin(); __i.__get_begin()->__next_ != nullptr;) {
-    if (__i.__get_begin()->__next_->__get_value() == __v) {
+  for (iterator __i = before_begin(); __i.__ptr_->__next_ != nullptr;) {
+    if (__i.__ptr_->__next_->__get_value() == __v) {
       ++__count_removed;
       iterator __j = std::next(__i, 2);
       for (; __j != __e && *__j == __v; ++__j)
@@ -1409,8 +1390,8 @@ forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred) {
   forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0;
   const iterator __e                                            = end();
-  for (iterator __i = before_begin(); __i.__get_begin()->__next_ != nullptr;) {
-    if (__pred(__i.__get_begin()->__next_->__get_value())) {
+  for (iterator __i = before_begin(); __i.__ptr_->__next_ != nullptr;) {
+    if (__pred(__i.__ptr_->__next_->__get_value())) {
       ++__count_removed;
       iterator __j = std::next(__i, 2);
       for (; __j != __e && __pred(*__j); ++__j)
@@ -1436,7 +1417,7 @@ forward_list<_Tp, _Alloc>::unique(_BinaryPredicate __binary_pred) {
     iterator __j = std::next(__i);
     for (; __j != __e && __binary_pred(*__i, *__j); ++__j)
       ++__count_removed;
-    if (__i.__get_begin()->__next_ != __j.__get_unsafe_node_pointer())
+    if (__i.__ptr_->__next_ != std::__static_fancy_pointer_cast<__node_pointer>(__j.__ptr_))
       __deleted_nodes.splice_after(__deleted_nodes.before_begin(), *this, __i, __j);
     __i = __j;
   }
@@ -1516,7 +1497,7 @@ forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Co
   }
   difference_type __sz1 = __sz / 2;
   difference_type __sz2 = __sz - __sz1;
-  __node_pointer __t    = std::next(iterator(__f1), __sz1 - 1).__get_unsafe_node_pointer();
+  __node_pointer __t    = std::__static_fancy_pointer_cast<__node_pointer>(std::next(iterator(__f1), __sz1 - 1).__ptr_);
   __node_pointer __f2   = __t->__next_;
   __t->__next_          = nullptr;
   return __merge(__sort(__f1, __sz1, __comp), __sort(__f2, __sz2, __comp), __comp);

From 9827440f1e723423baf4c235e844eb8ac48a8f97 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245@hotmail.com>
Date: Wed, 18 Jun 2025 12:22:47 -0400
Subject: [PATCH 0834/1322] [libc++] Optimize ranges::{for_each, for_each_n}
 for segmented iterators (#132896)

Previously, the segmented iterator optimization was limited to `std::{for_each, for_each_n}`. This patch
extends the optimization to `std::ranges::for_each` and `std::ranges::for_each_n`, ensuring consistent
optimizations across these algorithms. This patch first generalizes the `std` algorithms by introducing
a `Projection` parameter, which is set to `__identity` for the `std` algorithms. Then we let the `ranges`
algorithms to directly call their `std` counterparts with a general `__proj` argument. Benchmarks
demonstrate performance improvements of up to 21.4x for ``std::deque::iterator`` and 22.3x for
``join_view`` of ``vector<vector<char>>``.

Addresses a subtask of #102817.
---
 libcxx/docs/ReleaseNotes/21.rst               |  5 +-
 libcxx/include/__algorithm/for_each.h         | 35 +++++++++-----
 libcxx/include/__algorithm/for_each_n.h       | 26 +++++++----
 libcxx/include/__algorithm/ranges_for_each.h  | 18 ++++++--
 .../include/__algorithm/ranges_for_each_n.h   |  9 ++--
 libcxx/include/experimental/iterator          |  1 +
 libcxx/include/mutex                          |  1 +
 libcxx/include/shared_mutex                   |  1 +
 .../nonmodifying/for_each.bench.cpp           | 43 +++++++++++++++--
 .../nonmodifying/for_each_n.bench.cpp         | 23 ++++------
 .../alg.foreach/ranges.for_each.pass.cpp      | 46 +++++++++++++++++--
 .../alg.foreach/ranges.for_each_n.pass.cpp    | 46 ++++++++++++++++++-
 12 files changed, 197 insertions(+), 57 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 2a5b90750eaf..8661e5898fbc 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -70,8 +70,9 @@ Improvements and New Features
 - The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
   in C++23 and later.
 
-- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
-  up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
+- The ``std::for_each_n``, ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for
+  segmented iterators, resulting in a performance improvement of up to 17.7x for ``std::deque<short>`` iterators, and up
+  to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
 
 - The ``bitset::to_string`` function has been optimized, resulting in a performance improvement of up to 8.3x for bitsets
   with uniformly distributed zeros and ones, and up to 13.5x and 16.1x for sparse and dense bitsets, respectively.
diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index b6c2c7c056ed..4167eec3506e 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -12,41 +12,54 @@
 
 #include <__algorithm/for_each_segment.h>
 #include <__config>
+#include <__functional/identity.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
+#include <__type_traits/invoke.h>
+#include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _InputIterator, class _Sent, class _Func>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __for_each(_InputIterator __first, _Sent __last, _Func& __f) {
+template <class _InputIterator, class _Sent, class _Func, class _Proj>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+__for_each(_InputIterator __first, _Sent __last, _Func& __f, _Proj& __proj) {
   for (; __first != __last; ++__first)
-    __f(*__first);
+    std::__invoke(__f, std::__invoke(__proj, *__first));
+  return __first;
 }
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _SegmentedIterator,
-          class _Function,
+          class _Func,
+          class _Proj,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
+__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Func& __func, _Proj& __proj) {
   using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
   std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
-    std::__for_each(__lfirst, __llast, __func);
+    std::__for_each(__lfirst, __llast, __func, __proj);
   });
+  return __last;
 }
 #endif // !_LIBCPP_CXX03_LANG
 
-template <class _InputIterator, class _Function>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
-for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
-  std::__for_each(__first, __last, __f);
+template <class _InputIterator, class _Func>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Func
+for_each(_InputIterator __first, _InputIterator __last, _Func __f) {
+  __identity __proj;
+  std::__for_each(__first, __last, __f, __proj);
   return __f;
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index 29351ec39f4e..9a6c6bb5175d 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -13,10 +13,12 @@
 #include <__algorithm/for_each.h>
 #include <__algorithm/for_each_n_segment.h>
 #include <__config>
+#include <__functional/identity.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/disjunction.h>
 #include <__type_traits/enable_if.h>
+#include <__type_traits/invoke.h>
 #include <__type_traits/negation.h>
 #include <__utility/convert_to_integral.h>
 #include <__utility/move.h>
@@ -33,16 +35,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _InputIterator,
           class _Size,
           class _Func,
+          class _Proj,
           __enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
                             _Or< _Not<__is_segmented_iterator<_InputIterator> >,
                                  _Not<__has_random_access_local_iterator<_InputIterator> > >::value,
                         int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
+__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
   typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
   _IntegralSize __n = __orig_n;
   while (__n > 0) {
-    __f(*__first);
+    std::__invoke(__f, std::__invoke(__proj, *__first));
     ++__first;
     --__n;
   }
@@ -52,39 +55,42 @@ __for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
 template <class _RandIter,
           class _Size,
           class _Func,
+          class _Proj,
           __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
-__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
+__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
   typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
   auto __last                                                   = __first + __n;
-  std::__for_each(__first, __last, __f);
-  return std::move(__last);
+  std::__for_each(__first, __last, __f, __proj);
+  return __last;
 }
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _SegmentedIterator,
           class _Size,
           class _Func,
+          class _Proj,
           __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
                             __is_segmented_iterator<_SegmentedIterator>::value &&
                             __has_random_access_iterator_category<
                                 typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
                         int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
-__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
+__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
   using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
   return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
-    std::__for_each(__lfirst, __llast, __f);
+    std::__for_each(__lfirst, __llast, __f, __proj);
   });
 }
 #endif // !_LIBCPP_CXX03_LANG
 
 #if _LIBCPP_STD_VER >= 17
 
-template <class _InputIterator, class _Size, class _Function>
+template <class _InputIterator, class _Size, class _Func>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
-  return std::__for_each_n(__first, __orig_n, __f);
+for_each_n(_InputIterator __first, _Size __orig_n, _Func __f) {
+  __identity __proj;
+  return std::__for_each_n(__first, __orig_n, __f, __proj);
 }
 
 #endif // _LIBCPP_STD_VER >= 17
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index de39bc552275..e9c84e8583f8 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -9,10 +9,12 @@
 #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
 
+#include <__algorithm/for_each.h>
+#include <__algorithm/for_each_n.h>
 #include <__algorithm/in_fun_result.h>
+#include <__concepts/assignable.h>
 #include <__config>
 #include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/concepts.h>
 #include <__iterator/projected.h>
 #include <__ranges/access.h>
@@ -41,9 +43,17 @@ private:
   template <class _Iter, class _Sent, class _Proj, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
   __for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
-    for (; __first != __last; ++__first)
-      std::invoke(__func, std::invoke(__proj, *__first));
-    return {std::move(__first), std::move(__func)};
+    // In the case where we have different iterator and sentinel types, the segmented iterator optimization
+    // in std::for_each will not kick in. Therefore, we prefer std::for_each_n in that case (whenever we can
+    // obtain the `n`).
+    if constexpr (!std::assignable_from<_Iter&, _Sent> && std::sized_sentinel_for<_Sent, _Iter>) {
+      auto __n   = __last - __first;
+      auto __end = std::__for_each_n(std::move(__first), __n, __func, __proj);
+      return {std::move(__end), std::move(__func)};
+    } else {
+      auto __end = std::__for_each(std::move(__first), std::move(__last), __func, __proj);
+      return {std::move(__end), std::move(__func)};
+    }
   }
 
 public:
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index 603cb723233c..3aab1b79c10a 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -9,10 +9,10 @@
 #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 
+#include <__algorithm/for_each_n.h>
 #include <__algorithm/in_fun_result.h>
 #include <__config>
 #include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/concepts.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/iterator_traits.h>
@@ -40,11 +40,8 @@ struct __for_each_n {
   template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
-    while (__count-- > 0) {
-      std::invoke(__func, std::invoke(__proj, *__first));
-      ++__first;
-    }
-    return {std::move(__first), std::move(__func)};
+    auto __last = std::__for_each_n(std::move(__first), __count, __func, __proj);
+    return {std::move(__last), std::move(__func)};
   }
 };
 
diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator
index d92613845a66..565bb83903ac 100644
--- a/libcxx/include/experimental/iterator
+++ b/libcxx/include/experimental/iterator
@@ -127,6 +127,7 @@ _LIBCPP_POP_MACROS
 #  if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #    include <cstddef>
 #    include <iosfwd>
+#    include <optional>
 #    include <type_traits>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/mutex b/libcxx/include/mutex
index e058b3113073..f616bad3ac17 100644
--- a/libcxx/include/mutex
+++ b/libcxx/include/mutex
@@ -504,6 +504,7 @@ _LIBCPP_POP_MACROS
 #    include <initializer_list>
 #    include <iosfwd>
 #    include <new>
+#    include <optional>
 #    include <stdexcept>
 #    include <system_error>
 #    include <type_traits>
diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex
index e6759e413dfe..6469c02ca587 100644
--- a/libcxx/include/shared_mutex
+++ b/libcxx/include/shared_mutex
@@ -457,6 +457,7 @@ _LIBCPP_POP_MACROS
 #  endif // _LIBCPP_HAS_THREADS
 
 #  if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#    include <optional>
 #    include <system_error>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
index 760accbe4d92..f58f336f8b89 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
@@ -12,6 +12,7 @@
 #include <cstddef>
 #include <deque>
 #include <list>
+#include <ranges>
 #include <string>
 #include <vector>
 
@@ -23,6 +24,7 @@ int main(int argc, char** argv) {
   // {std,ranges}::for_each
   {
     auto bm = []<class Container>(std::string name, auto for_each) {
+      using ElemType = typename Container::value_type;
       benchmark::RegisterBenchmark(
           name,
           [for_each](auto& st) {
@@ -33,15 +35,14 @@ int main(int argc, char** argv) {
 
             for ([[maybe_unused]] auto _ : st) {
               benchmark::DoNotOptimize(c);
-              auto result = for_each(first, last, [](int& x) { x = std::clamp(x, 10, 100); });
+              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
               benchmark::DoNotOptimize(result);
             }
           })
           ->Arg(8)
           ->Arg(32)
           ->Arg(50) // non power-of-two
-          ->Arg(8192)
-          ->Arg(1 << 20);
+          ->Arg(8192);
     };
     bm.operator()<std::vector<int>>("std::for_each(vector<int>)", std_for_each);
     bm.operator()<std::deque<int>>("std::for_each(deque<int>)", std_for_each);
@@ -51,6 +52,42 @@ int main(int argc, char** argv) {
     bm.operator()<std::list<int>>("rng::for_each(list<int>)", std::ranges::for_each);
   }
 
+  // {std,ranges}::for_each for join_view
+  {
+    auto bm = []<class Container>(std::string name, auto for_each) {
+      using C1       = typename Container::value_type;
+      using ElemType = typename C1::value_type;
+
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const seg_size = 256;
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n), ElemType(1));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+            auto last  = view.end();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(8192);
+    };
+    bm.operator()<std::vector<std::vector<int>>>("std::for_each(join_view(vector<vector<int>>))", std_for_each);
+    bm.operator()<std::vector<std::vector<int>>>("rng::for_each(join_view(vector<vector<int>>)", std::ranges::for_each);
+  }
+
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
   benchmark::Shutdown();
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
index 784708c7e01e..e643e647722c 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -21,7 +21,7 @@
 int main(int argc, char** argv) {
   auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
 
-  // std::for_each_n
+  // {std,ranges}::for_each_n
   {
     auto bm = []<class Container>(std::string name, auto for_each_n) {
       using ElemType = typename Container::value_type;
@@ -41,19 +41,17 @@ int main(int argc, char** argv) {
           ->Arg(8)
           ->Arg(32)
           ->Arg(50) // non power-of-two
-          ->Arg(1024)
-          ->Arg(4096)
-          ->Arg(8192)
-          ->Arg(1 << 14)
-          ->Arg(1 << 16)
-          ->Arg(1 << 18);
+          ->Arg(8192);
     };
     bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
     bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
     bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
+    bm.operator()<std::vector<int>>("rng::for_each_n(vector<int>)", std::ranges::for_each_n);
+    bm.operator()<std::deque<int>>("rng::for_each_n(deque<int>)", std::ranges::for_each_n);
+    bm.operator()<std::list<int>>("rng::for_each_n(list<int>)", std::ranges::for_each_n);
   }
 
-  // std::for_each_n for join_view
+  // {std,ranges}::for_each_n for join_view
   {
     auto bm = []<class Container>(std::string name, auto for_each_n) {
       using C1       = typename Container::value_type;
@@ -81,14 +79,11 @@ int main(int argc, char** argv) {
           ->Arg(8)
           ->Arg(32)
           ->Arg(50) // non power-of-two
-          ->Arg(1024)
-          ->Arg(4096)
-          ->Arg(8192)
-          ->Arg(1 << 14)
-          ->Arg(1 << 16)
-          ->Arg(1 << 18);
+          ->Arg(8192);
     };
     bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
+    bm.operator()<std::vector<std::vector<int>>>(
+        "rng::for_each_n(join_view(vector<vector<int>>)", std::ranges::for_each_n);
   }
 
   benchmark::Initialize(&argc, argv);
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
index 8b9b6e82cbcb..a6d0afde3186 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
@@ -20,7 +20,10 @@
 
 #include <algorithm>
 #include <array>
+#include <cassert>
+#include <deque>
 #include <ranges>
+#include <vector>
 
 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
@@ -30,7 +33,7 @@ struct Callable {
 };
 
 template <class Iter, class Sent = Iter>
-concept HasForEachIt = requires (Iter iter, Sent sent) { std::ranges::for_each(iter, sent, Callable{}); };
+concept HasForEachIt = requires(Iter iter, Sent sent) { std::ranges::for_each(iter, sent, Callable{}); };
 
 static_assert(HasForEachIt<int*>);
 static_assert(!HasForEachIt<InputIteratorNotDerivedFrom>);
@@ -47,7 +50,7 @@ static_assert(!HasForEachItFunc<IndirectUnaryPredicateNotPredicate>);
 static_assert(!HasForEachItFunc<IndirectUnaryPredicateNotCopyConstructible>);
 
 template <class Range>
-concept HasForEachR = requires (Range range) { std::ranges::for_each(range, Callable{}); };
+concept HasForEachR = requires(Range range) { std::ranges::for_each(range, Callable{}); };
 
 static_assert(HasForEachR<UncheckedRange<int*>>);
 static_assert(!HasForEachR<InputRangeNotDerivedFrom>);
@@ -68,7 +71,7 @@ constexpr void test_iterator() {
   { // simple test
     {
       auto func = [i = 0](int& a) mutable { a += i++; };
-      int a[] = {1, 6, 3, 4};
+      int a[]   = {1, 6, 3, 4};
       std::same_as<std::ranges::for_each_result<Iter, decltype(func)>> decltype(auto) ret =
           std::ranges::for_each(Iter(a), Sent(Iter(a + 4)), func);
       assert(a[0] == 1);
@@ -81,8 +84,8 @@ constexpr void test_iterator() {
       assert(i == 4);
     }
     {
-      auto func = [i = 0](int& a) mutable { a += i++; };
-      int a[] = {1, 6, 3, 4};
+      auto func  = [i = 0](int& a) mutable { a += i++; };
+      int a[]    = {1, 6, 3, 4};
       auto range = std::ranges::subrange(Iter(a), Sent(Iter(a + 4)));
       std::same_as<std::ranges::for_each_result<Iter, decltype(func)>> decltype(auto) ret =
           std::ranges::for_each(range, func);
@@ -110,6 +113,30 @@ constexpr void test_iterator() {
   }
 }
 
+struct deque_test {
+  std::deque<int>* d_;
+  int* i_;
+
+  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
+
+  void operator()(int& v) {
+    assert(&(*d_)[*i_] == &v);
+    ++*i_;
+  }
+};
+
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+  // check that segmented deque iterators work properly
+  int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
+  for (const int size : sizes) {
+    std::deque<int> d(size);
+    int index = 0;
+
+    std::ranges::for_each(d, deque_test(d, index));
+  }
+}
+
 constexpr bool test() {
   test_iterator<cpp17_input_iterator<int*>, sentinel_wrapper<cpp17_input_iterator<int*>>>();
   test_iterator<cpp20_input_iterator<int*>, sentinel_wrapper<cpp20_input_iterator<int*>>>();
@@ -146,6 +173,15 @@ constexpr bool test() {
     }
   }
 
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_segmented_deque_iterator();
+
+  {
+    std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+    auto v                            = vec | std::views::join;
+    std::ranges::for_each(v, [i = 0](int x) mutable { assert(x == 2 * i++); }, [](int x) { return 2 * x; });
+  }
+
   return true;
 }
 
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
index d4b2d053d08c..157876369423 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
@@ -17,7 +17,12 @@
 
 #include <algorithm>
 #include <array>
+#include <cassert>
+#include <deque>
+#include <iterator>
 #include <ranges>
+#include <ranges>
+#include <vector>
 
 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
@@ -27,7 +32,7 @@ struct Callable {
 };
 
 template <class Iter>
-concept HasForEachN = requires (Iter iter) { std::ranges::for_each_n(iter, 0, Callable{}); };
+concept HasForEachN = requires(Iter iter) { std::ranges::for_each_n(iter, 0, Callable{}); };
 
 static_assert(HasForEachN<int*>);
 static_assert(!HasForEachN<InputIteratorNotDerivedFrom>);
@@ -45,7 +50,7 @@ template <class Iter>
 constexpr void test_iterator() {
   { // simple test
     auto func = [i = 0](int& a) mutable { a += i++; };
-    int a[] = {1, 6, 3, 4};
+    int a[]   = {1, 6, 3, 4};
     std::same_as<std::ranges::for_each_result<Iter, decltype(func)>> auto ret =
         std::ranges::for_each_n(Iter(a), 4, func);
     assert(a[0] == 1);
@@ -64,6 +69,30 @@ constexpr void test_iterator() {
   }
 }
 
+struct deque_test {
+  std::deque<int>* d_;
+  int* i_;
+
+  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
+
+  void operator()(int& v) {
+    assert(&(*d_)[*i_] == &v);
+    ++*i_;
+  }
+};
+
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+  // check that segmented deque iterators work properly
+  int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
+  for (const int size : sizes) {
+    std::deque<int> d(size);
+    int index = 0;
+
+    std::ranges::for_each_n(d.begin(), d.size(), deque_test(d, index));
+  }
+}
+
 constexpr bool test() {
   test_iterator<cpp17_input_iterator<int*>>();
   test_iterator<cpp20_input_iterator<int*>>();
@@ -89,6 +118,19 @@ constexpr bool test() {
     assert(a[2].other == 6);
   }
 
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_segmented_deque_iterator();
+
+  {
+    std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+    auto v                            = vec | std::views::join;
+    std::ranges::for_each_n(
+        v.begin(),
+        std::ranges::distance(v),
+        [i = 0](int x) mutable { assert(x == 2 * i++); },
+        [](int x) { return 2 * x; });
+  }
+
   return true;
 }
 

From 00189211486d052b25429f11790ef5486cf9d3ce Mon Sep 17 00:00:00 2001
From: woruyu <99597449+woruyu@users.noreply.github.com>
Date: Thu, 19 Jun 2025 00:22:53 +0800
Subject: [PATCH 0835/1322] [DAG] add (~a | x) & (a | y) -> (a & (x ^ y)) ^y
 for foldMaskedMerge (#144342)

### Summary
This PR resolves https://github.com/llvm/llvm-project/issues/143864

Add (~a | x) & (a | y) -> (a & (x ^ y)) ^y for foldMaskedMerge func
using SDPatternMatch

aftering adding this pattern, run ```ninja check-llvm-codegen```, all
other cases remain unchanged, so I add a
testcase(fold-masked-merge-demorgan.ll) for it

---------

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  24 +-
 .../CodeGen/X86/fold-masked-merge-demorgan.ll | 267 ++++++++++++++++++
 2 files changed, 284 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/fold-masked-merge-demorgan.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 934199e414c7..0e078f9dd88b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7206,24 +7206,30 @@ static SDValue foldLogicTreeOfShifts(SDNode *N, SDValue LeftHand,
   return DAG.getNode(LogicOpcode, DL, VT, CombinedShifts, W);
 }
 
-/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
-/// equivalent `((x ^ y) & m) ^ y)` pattern.
-/// This is typically a better representation for targets without a fused
-/// "and-not" operation.
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` and its DeMorgan
+/// variant `(~m | x) & (m | y)` into the equivalent `((x ^ y) & m) ^ y)`
+/// pattern. This is typically a better representation for targets without a
+/// fused "and-not" operation.
 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
                                const TargetLowering &TLI, const SDLoc &DL) {
   // Note that masked-merge variants using XOR or ADD expressions are
-  // normalized to OR by InstCombine so we only check for OR.
-  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+  // normalized to OR by InstCombine so we only check for OR or AND.
+  assert(Node->getOpcode() == ISD::OR ||
+         Node->getOpcode() == ISD::AND &&
+             "Must be called with ISD::OR or ISD::AND node");
 
   // If the target supports and-not, don't fold this.
   if (TLI.hasAndNot(SDValue(Node, 0)))
     return SDValue();
 
   SDValue M, X, Y;
+
   if (sd_match(Node,
                m_Or(m_OneUse(m_And(m_OneUse(m_Not(m_Value(M))), m_Value(Y))),
-                    m_OneUse(m_And(m_Deferred(M), m_Value(X)))))) {
+                    m_OneUse(m_And(m_Deferred(M), m_Value(X))))) ||
+      sd_match(Node,
+               m_And(m_OneUse(m_Or(m_OneUse(m_Not(m_Value(M))), m_Value(X))),
+                     m_OneUse(m_Or(m_Deferred(M), m_Value(Y)))))) {
     EVT VT = M.getValueType();
     SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, Y);
     SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor, M);
@@ -7678,6 +7684,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
       return R;
 
+  if (VT.isScalarInteger() && VT != MVT::i1)
+    if (SDValue R = foldMaskedMerge(N, DAG, TLI, DL))
+      return R;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge-demorgan.ll b/llvm/test/CodeGen/X86/fold-masked-merge-demorgan.ll
new file mode 100644
index 000000000000..fe27b3c73be0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-masked-merge-demorgan.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -o - %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=CHECK,NOBMI
+; RUN: llc -o - %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=CHECK,BMI
+;
+; test that masked-merge code is generated as "xor;and;xor" sequence or
+; "andn ; and; or" if and-not is available.
+
+define i32 @masked_merge0_demorgan(i32 %a0, i32 %a1, i32 %a2) {
+; NOBMI-LABEL: masked_merge0_demorgan:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
+; NOBMI-NEXT:    andl %edi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: masked_merge0_demorgan:
+; BMI:       # %bb.0:
+; BMI-NEXT:    orl %edi, %edx
+; BMI-NEXT:    andnl %edi, %esi, %eax
+; BMI-NEXT:    andnl %edx, %eax, %eax
+; BMI-NEXT:    retq
+  %not = xor i32 %a0, -1
+  %or0 = or i32 %not, %a1
+  %or1 = or i32 %a0, %a2
+  %and = and i32 %or0, %or1
+  ret i32 %and
+}
+
+define i16 @masked_merge1_demorgan(i16 %a0, i16 %a1, i16 %a2) {
+; NOBMI-LABEL: masked_merge1_demorgan:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
+; NOBMI-NEXT:    andl %edi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
+; NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: masked_merge1_demorgan:
+; BMI:       # %bb.0:
+; BMI-NEXT:    andnl %edx, %edi, %eax
+; BMI-NEXT:    andl %edi, %esi
+; BMI-NEXT:    orl %esi, %eax
+; BMI-NEXT:    # kill: def $ax killed $ax killed $eax
+; BMI-NEXT:    retq
+  %not = xor i16 %a0, -1
+  %or0 = or i16 %not, %a1
+  %or1 = or i16 %a0, %a2
+  %and = and i16 %or0, %or1
+  ret i16 %and
+}
+
+define i8 @masked_merge2_demorgan(i8 %a0, i8 %a1, i8 %a2) {
+; CHECK-LABEL: masked_merge2_demorgan:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %not = xor i8 %a0, -1
+  %or0 = or i8 %not, %a1
+  %or1 = or i8 %a0, %a1
+  %and = and i8 %or0, %or1
+  ret i8 %and
+}
+
+define i64 @masked_merge3_demorgan(i64 %a0, i64 %a1, i64 %a2) {
+; NOBMI-LABEL: masked_merge3_demorgan:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movq %rsi, %rax
+; NOBMI-NEXT:    notq %rdx
+; NOBMI-NEXT:    xorq %rdx, %rax
+; NOBMI-NEXT:    notq %rax
+; NOBMI-NEXT:    andq %rdi, %rax
+; NOBMI-NEXT:    xorq %rdx, %rax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: masked_merge3_demorgan:
+; BMI:       # %bb.0:
+; BMI-NEXT:    andnq %rdx, %rdi, %rax
+; BMI-NEXT:    andq %rdi, %rsi
+; BMI-NEXT:    notq %rsi
+; BMI-NEXT:    andnq %rsi, %rax, %rax
+; BMI-NEXT:    retq
+  %not_a0  = xor i64 %a0, -1
+  %not_a1  = xor i64 %a1, -1
+  %not_a2  = xor i64 %a2, -1
+  %or0     = or i64 %not_a0, %not_a1
+  %or1     = or i64 %a0, %not_a2
+  %and     = and i64 %or0, %or1
+  ret i64 %and
+}
+
+define i32 @not_a_masked_merge0_demorgan(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-LABEL: not_a_masked_merge0_demorgan:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    orl %edi, %edx
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    retq
+  %not_a_not = sub i32 0, %a0
+  %or0 = or i32 %not_a_not, %a1
+  %or1 = or i32 %a0, %a2
+  %and = and i32 %or0, %or1
+  ret i32 %and
+}
+
+; not a masked merge: `not` operand does not match another `and`-operand.
+define i32 @not_a_masked_merge1_demorgan(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; NOBMI-LABEL: not_a_masked_merge1_demorgan:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %ecx, %eax
+; NOBMI-NEXT:    orl %edx, %edi
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    orl %esi, %eax
+; NOBMI-NEXT:    andl %edi, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: not_a_masked_merge1_demorgan:
+; BMI:       # %bb.0:
+; BMI-NEXT:    orl %edx, %edi
+; BMI-NEXT:    andnl %ecx, %esi, %eax
+; BMI-NEXT:    andnl %edi, %eax, %eax
+; BMI-NEXT:    retq
+  %or1 = or i32 %a0, %a2
+  %not = xor i32 %a3, -1
+  %or0 = or i32 %not, %a1
+  %and = and i32 %or0, %or1
+  ret i32 %and
+}
+
+; not a masked merge: one of the operands of `and` is not an `or`.
+define i32 @not_a_masked_merge2_demorgan(i32 %a0, i32 %a1, i32 %a2) {
+; NOBMI-LABEL: not_a_masked_merge2_demorgan:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %edi, %eax
+; NOBMI-NEXT:    andl %edi, %edx
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    orl %esi, %eax
+; NOBMI-NEXT:    andl %edx, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: not_a_masked_merge2_demorgan:
+; BMI:       # %bb.0:
+; BMI-NEXT:    andl %edi, %edx
+; BMI-NEXT:    andnl %edi, %esi, %eax
+; BMI-NEXT:    andnl %edx, %eax, %eax
+; BMI-NEXT:    retq
+  %not_an_or1 = and i32 %a0, %a2
+  %not = xor i32 %a0, -1
+  %or0 = or i32 %not, %a1
+  %and = and i32 %or0, %not_an_or1
+  ret i32 %and
+}
+
+define i32 @not_a_masked_merge3_demorgan(i32 %a0, i32 %a1, i32 %a2) {
+; NOBMI-LABEL: not_a_masked_merge3_demorgan:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    orl %edi, %edx
+; NOBMI-NEXT:    xorl %edi, %eax
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    andl %edx, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: not_a_masked_merge3_demorgan:
+; BMI:       # %bb.0:
+; BMI-NEXT:    orl %edi, %edx
+; BMI-NEXT:    xorl %edi, %esi
+; BMI-NEXT:    andnl %edx, %esi, %eax
+; BMI-NEXT:    retq
+  %or1 = or i32 %a0, %a2
+  %not = xor i32 %a0, -1
+  %not_an_or0 = xor i32 %not, %a1
+  %and = and i32 %not_an_or0, %or1
+  ret i32 %and
+}
+
+; not a masked merge: `not` operand must not be on same `or`.
+define i32 @not_a_masked_merge4_demorgan(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-LABEL: not_a_masked_merge4_demorgan:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    retq
+  %or1 = or i32 %a0, %a2
+  %not = xor i32 %a1, -1
+  %or0 = or i32 %not, %a1
+  %and = and i32 %or0, %or1
+  ret i32 %and
+}
+
+; should not transform when operands have multiple users.
+define i32 @masked_merge_no_transform0_demorgan(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NOBMI-LABEL: masked_merge_no_transform0_demorgan:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    orl %edi, %edx
+; NOBMI-NEXT:    movl %edi, %eax
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    orl %esi, %eax
+; NOBMI-NEXT:    andl %edx, %eax
+; NOBMI-NEXT:    movl %edx, (%rcx)
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: masked_merge_no_transform0_demorgan:
+; BMI:       # %bb.0:
+; BMI-NEXT:    orl %edi, %edx
+; BMI-NEXT:    andnl %edi, %esi, %eax
+; BMI-NEXT:    andnl %edx, %eax, %eax
+; BMI-NEXT:    movl %edx, (%rcx)
+; BMI-NEXT:    retq
+  %not = xor i32 %a0, -1
+  %or0 = or i32 %not, %a1
+  %or1 = or i32 %a0, %a2
+  %and = and i32 %or0, %or1
+  store i32 %or1, ptr %p1
+  ret i32 %and
+}
+
+; should not transform when operands have multiple users.
+define i32 @masked_merge_no_transform1_demorgan(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NOBMI-LABEL: masked_merge_no_transform1_demorgan:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %edx, %eax
+; NOBMI-NEXT:    orl %edi, %eax
+; NOBMI-NEXT:    notl %edi
+; NOBMI-NEXT:    orl %edi, %esi
+; NOBMI-NEXT:    andl %esi, %eax
+; NOBMI-NEXT:    movl %edi, (%rcx)
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: masked_merge_no_transform1_demorgan:
+; BMI:       # %bb.0:
+; BMI-NEXT:    orl %edi, %edx
+; BMI-NEXT:    andnl %edi, %esi, %eax
+; BMI-NEXT:    notl %edi
+; BMI-NEXT:    andnl %edx, %eax, %eax
+; BMI-NEXT:    movl %edi, (%rcx)
+; BMI-NEXT:    retq
+  %not = xor i32 %a0, -1
+  %or0 = or i32 %not, %a1
+  %or1 = or i32 %a0, %a2
+  %and = and i32 %or0, %or1
+  store i32 %not, ptr %p1
+  ret i32 %and
+}
+
+; should not transform when operands have multiple users.
+define i32 @masked_merge_no_transform2_demorgan(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-LABEL: masked_merge_no_transform2_demorgan:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    notl %edi
+; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    movl %edi, (%rcx)
+; CHECK-NEXT:    retq
+  %not = xor i32 %a0, -1
+  %or0 = or i32 %not, %a1
+  %or1 = or i32 %a0, %a2
+  %and = and i32 %or0, %or1
+  store i32 %or0, ptr %p1
+  ret i32 %and
+}

From fe3933da15b5bc635bce156f1f8d11a784316a07 Mon Sep 17 00:00:00 2001
From: Yang Bai <baiyang0132@gmail.com>
Date: Thu, 19 Jun 2025 00:26:04 +0800
Subject: [PATCH 0836/1322] [mlir][vector] Support complete folding in single
 pass for vector.insert/vector.extract (#142124)

### Description

This patch improves the folding efficiency of `vector.insert` and
`vector.extract` operations by not returning early after successfully
converting dynamic indices to static indices.

This PR also renames the test pass `TestConstantFold` to
`TestSingleFold` and adds comprehensive documentation explaining the
single-pass folding behavior.

### Motivation

Since the `OpBuilder::createOrFold` function only calls `fold` **once**,
the current `fold` methods of `vector.insert` and `vector.extract` may
leave the op in a state that can be folded further. For example,
consider the following un-folded IR:
```
%v1 = vector.insert %e1, %v0 [0] : f32 into vector<128xf32>
%c0 = arith.constant 0 : index
%e2 = vector.extract %v1[%c0] : f32 from vector<128xf32>
```
If we use `createOrFold` to create the `vector.extract` op, then the
result will be:
```
%v1 = vector.insert %e1, %v0 [127] : f32 into vector<128xf32>
%e2 = vector.extract %v1[0] : f32 from vector<128xf32>
```
But this is not the optimal result. `createOrFold` should have returned
`%e1`.
The reason is that the execution of fold returns immediately after
`extractInsertFoldConstantOp`, causing subsequent folding logics to be
skipped.

---------

Co-authored-by: Yang Bai <yangb@nvidia.com>
---
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp      | 24 +++++++-----
 mlir/test/Dialect/Affine/constant-fold.mlir   |  2 +-
 .../test/Dialect/Linalg/mesh-spmdization.mlir |  2 +-
 mlir/test/Dialect/Mesh/spmdization.mlir       |  2 +-
 .../test/Dialect/Tensor/mesh-spmdization.mlir |  2 +-
 mlir/test/Dialect/Tosa/constant_folding.mlir  |  2 +-
 mlir/test/Dialect/Vector/constant-fold.mlir   |  4 +-
 mlir/test/Dialect/Vector/single-fold.mlir     | 38 +++++++++++++++++++
 .../Transforms/constant-fold-debuginfo.mlir   |  2 +-
 mlir/test/Transforms/constant-fold.mlir       |  2 +-
 mlir/test/lib/Transforms/CMakeLists.txt       |  2 +-
 ...estConstantFold.cpp => TestSingleFold.cpp} | 32 ++++++++++------
 mlir/tools/mlir-opt/mlir-opt.cpp              |  4 +-
 13 files changed, 86 insertions(+), 32 deletions(-)
 create mode 100644 mlir/test/Dialect/Vector/single-fold.mlir
 rename mlir/test/lib/Transforms/{TestConstantFold.cpp => TestSingleFold.cpp} (62%)

diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 2a2357319bd2..e576eeac2365 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -2063,6 +2063,7 @@ static Value extractInsertFoldConstantOp(OpType op, AdaptorType adaptor,
   if (opChange) {
     op.setStaticPosition(staticPosition);
     op.getOperation()->setOperands(operands);
+    // Return the original result to indicate an in-place folding happened.
     return op.getResult();
   }
   return {};
@@ -2146,11 +2147,12 @@ OpFoldResult ExtractOp::fold(FoldAdaptor adaptor) {
     return getVector();
   if (auto res = foldPoisonSrcExtractOp(adaptor.getVector()))
     return res;
-  // Fold `arith.constant` indices into the `vector.extract` operation. Make
-  // sure that patterns requiring constant indices are added after this fold.
+  // Fold `arith.constant` indices into the `vector.extract` operation.
+  // Do not stop here as this fold may enable subsequent folds that require
+  // constant indices.
   SmallVector<Value> operands = {getVector()};
-  if (auto val = extractInsertFoldConstantOp(*this, adaptor, operands))
-    return val;
+  auto inplaceFolded = extractInsertFoldConstantOp(*this, adaptor, operands);
+
   if (auto res = foldPoisonIndexInsertExtractOp(
           getContext(), adaptor.getStaticPosition(), kPoisonIndex))
     return res;
@@ -2172,7 +2174,8 @@ OpFoldResult ExtractOp::fold(FoldAdaptor adaptor) {
     return val;
   if (auto val = foldScalarExtractFromFromElements(*this))
     return val;
-  return OpFoldResult();
+
+  return inplaceFolded;
 }
 
 namespace {
@@ -3272,11 +3275,12 @@ OpFoldResult vector::InsertOp::fold(FoldAdaptor adaptor) {
   // (type mismatch).
   if (getNumIndices() == 0 && getValueToStoreType() == getType())
     return getValueToStore();
-  // Fold `arith.constant` indices into the `vector.insert` operation. Make
-  // sure that patterns requiring constant indices are added after this fold.
+  // Fold `arith.constant` indices into the `vector.insert` operation.
+  // Do not stop here as this fold may enable subsequent folds that require
+  // constant indices.
   SmallVector<Value> operands = {getValueToStore(), getDest()};
-  if (auto val = extractInsertFoldConstantOp(*this, adaptor, operands))
-    return val;
+  auto inplaceFolded = extractInsertFoldConstantOp(*this, adaptor, operands);
+
   if (auto res = foldPoisonIndexInsertExtractOp(
           getContext(), adaptor.getStaticPosition(), kPoisonIndex))
     return res;
@@ -3286,7 +3290,7 @@ OpFoldResult vector::InsertOp::fold(FoldAdaptor adaptor) {
     return res;
   }
 
-  return {};
+  return inplaceFolded;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Affine/constant-fold.mlir b/mlir/test/Dialect/Affine/constant-fold.mlir
index ffc3946db08d..8bddacc02475 100644
--- a/mlir/test/Dialect/Affine/constant-fold.mlir
+++ b/mlir/test/Dialect/Affine/constant-fold.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-constant-fold -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -test-single-fold -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: func @affine_apply
 func.func @affine_apply(%variable : index) -> (index, index, index) {
diff --git a/mlir/test/Dialect/Linalg/mesh-spmdization.mlir b/mlir/test/Dialect/Linalg/mesh-spmdization.mlir
index 487cec00de16..9805ee4ea552 100644
--- a/mlir/test/Dialect/Linalg/mesh-spmdization.mlir
+++ b/mlir/test/Dialect/Linalg/mesh-spmdization.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt \
-// RUN:  --pass-pipeline="builtin.module(func.func(mesh-spmdization,test-constant-fold))" \
+// RUN:  --pass-pipeline="builtin.module(func.func(mesh-spmdization,test-single-fold))" \
 // RUN:  --split-input-file \
 // RUN:  %s | FileCheck %s
 
diff --git a/mlir/test/Dialect/Mesh/spmdization.mlir b/mlir/test/Dialect/Mesh/spmdization.mlir
index 5c9fd29444f0..af4ab58ea50a 100644
--- a/mlir/test/Dialect/Mesh/spmdization.mlir
+++ b/mlir/test/Dialect/Mesh/spmdization.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt \
-// RUN:   --pass-pipeline="builtin.module(func.func(mesh-spmdization,test-constant-fold))" \
+// RUN:   --pass-pipeline="builtin.module(func.func(mesh-spmdization,test-single-fold))" \
 // RUN:   %s | FileCheck %s
 
 mesh.mesh @mesh_1d(shape = 2)
diff --git a/mlir/test/Dialect/Tensor/mesh-spmdization.mlir b/mlir/test/Dialect/Tensor/mesh-spmdization.mlir
index 3fb842474550..8598d81ff6cf 100644
--- a/mlir/test/Dialect/Tensor/mesh-spmdization.mlir
+++ b/mlir/test/Dialect/Tensor/mesh-spmdization.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt \
-// RUN:   --pass-pipeline="builtin.module(func.func(mesh-spmdization,test-constant-fold))" \
+// RUN:   --pass-pipeline="builtin.module(func.func(mesh-spmdization,test-single-fold))" \
 // RUN:   %s | FileCheck %s
 
 mesh.mesh @mesh_1d_4(shape = 4)
diff --git a/mlir/test/Dialect/Tosa/constant_folding.mlir b/mlir/test/Dialect/Tosa/constant_folding.mlir
index 9b6ccdb54c10..d477a2479e91 100644
--- a/mlir/test/Dialect/Tosa/constant_folding.mlir
+++ b/mlir/test/Dialect/Tosa/constant_folding.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --test-constant-fold %s | FileCheck %s
+// RUN: mlir-opt --test-single-fold %s | FileCheck %s
 
 // CHECK-LABEL: func @test_const
 func.func @test_const(%arg0 : index) -> tensor<4xi32> {
diff --git a/mlir/test/Dialect/Vector/constant-fold.mlir b/mlir/test/Dialect/Vector/constant-fold.mlir
index 66c91d6b2041..cbb159fd59ff 100644
--- a/mlir/test/Dialect/Vector/constant-fold.mlir
+++ b/mlir/test/Dialect/Vector/constant-fold.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -test-constant-fold | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -test-single-fold | FileCheck %s
 
 // CHECK-LABEL: fold_extract_transpose_negative
 func.func @fold_extract_transpose_negative(%arg0: vector<4x4xf16>) -> vector<4x4xf16> {
@@ -11,3 +11,5 @@ func.func @fold_extract_transpose_negative(%arg0: vector<4x4xf16>) -> vector<4x4
   %2 = vector.extract %1[0] : vector<4x4xf16> from vector<1x4x4xf16>
   return %2 : vector<4x4xf16>
 }
+
+
diff --git a/mlir/test/Dialect/Vector/single-fold.mlir b/mlir/test/Dialect/Vector/single-fold.mlir
new file mode 100644
index 000000000000..baccdc3f51c0
--- /dev/null
+++ b/mlir/test/Dialect/Vector/single-fold.mlir
@@ -0,0 +1,38 @@
+// RUN: mlir-opt %s -split-input-file -test-single-fold | FileCheck %s
+
+// The tests in this file verify that fold() methods can handle complex
+// optimization scenarios without requiring multiple folding iterations.
+// This is important because:
+//
+// 1. OpBuilder::createOrFold() only calls fold() once, so operations must
+//    be fully optimized in that single call
+// 2. Multiple rounds of folding would incur higher performance costs,
+//    so it's more efficient to complete all optimizations in one pass
+//
+// These tests ensure that folding implementations are robust and complete,
+// avoiding situations where operations are left in intermediate states
+// that could be further optimized.
+
+// CHECK-LABEL: fold_extract_in_single_pass
+// CHECK-SAME: (%{{.*}}: vector<4xf16>, %[[ARG1:.+]]: f16)
+func.func @fold_extract_in_single_pass(%arg0: vector<4xf16>, %arg1: f16) -> f16 {
+  %0 = vector.insert %arg1, %arg0 [1] : f16 into vector<4xf16>
+  %c1 = arith.constant 1 : index
+  // Verify that the fold is finished in a single pass even if the index is dynamic.
+  %1 = vector.extract %0[%c1] : f16 from vector<4xf16>
+  // CHECK: return %[[ARG1]] : f16
+  return %1 : f16
+}
+
+// -----
+
+// CHECK-LABEL: fold_insert_in_single_pass
+func.func @fold_insert_in_single_pass() -> vector<2xf16> {
+  %cst = arith.constant dense<0.000000e+00> : vector<2xf16>
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2.5 : f16
+  // Verify that the fold is finished in a single pass even if the index is dynamic.
+  // CHECK: arith.constant dense<[0.000000e+00, 2.500000e+00]> : vector<2xf16>
+  %0 = vector.insert %c2, %cst [%c1] : f16 into vector<2xf16>
+  return %0 : vector<2xf16>
+} 
\ No newline at end of file
diff --git a/mlir/test/Transforms/constant-fold-debuginfo.mlir b/mlir/test/Transforms/constant-fold-debuginfo.mlir
index c308bc477bee..4fa7fb6698a2 100644
--- a/mlir/test/Transforms/constant-fold-debuginfo.mlir
+++ b/mlir/test/Transforms/constant-fold-debuginfo.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -test-constant-fold -mlir-print-debuginfo | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -test-single-fold -mlir-print-debuginfo | FileCheck %s
 
 // CHECK-LABEL: func @fold_and_merge
 func.func @fold_and_merge() -> (i32, i32) {
diff --git a/mlir/test/Transforms/constant-fold.mlir b/mlir/test/Transforms/constant-fold.mlir
index 981757aed9b1..0b393bf0556b 100644
--- a/mlir/test/Transforms/constant-fold.mlir
+++ b/mlir/test/Transforms/constant-fold.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -test-constant-fold | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -test-single-fold | FileCheck %s
 
 // -----
 
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index 76041cd6cd79..ddc0a779e8f6 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -26,11 +26,11 @@ endif()
 add_mlir_library(MLIRTestTransforms
   TestCommutativityUtils.cpp
   TestCompositePass.cpp
-  TestConstantFold.cpp
   TestControlFlowSink.cpp
   TestInlining.cpp
   TestInliningCallback.cpp
   TestMakeIsolatedFromAbove.cpp
+  TestSingleFold.cpp
   TestTransformsOps.cpp
   ${MLIRTestTransformsPDLSrc}
 
diff --git a/mlir/test/lib/Transforms/TestConstantFold.cpp b/mlir/test/lib/Transforms/TestSingleFold.cpp
similarity index 62%
rename from mlir/test/lib/Transforms/TestConstantFold.cpp
rename to mlir/test/lib/Transforms/TestSingleFold.cpp
index c97ab9091cb6..5bd9dd2a1f07 100644
--- a/mlir/test/lib/Transforms/TestConstantFold.cpp
+++ b/mlir/test/lib/Transforms/TestSingleFold.cpp
@@ -1,4 +1,4 @@
-//===- TestConstantFold.cpp - Pass to test constant folding ---------------===//
+//===- TestSingleFold.cpp - Pass to test single-pass folding --------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,14 +12,23 @@
 using namespace mlir;
 
 namespace {
-/// Simple constant folding pass.
-struct TestConstantFold : public PassWrapper<TestConstantFold, OperationPass<>>,
-                          public RewriterBase::Listener {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestConstantFold)
+/// Test pass for single-pass constant folding.
+///
+/// This pass tests the behavior of operations when folded exactly once. Unlike
+/// canonicalization passes that may apply multiple rounds of folding, this pass
+/// ensures that each operation is folded at most once, which is useful for
+/// testing scenarios where the fold implementation should handle complex cases
+/// without requiring multiple iterations.
+///
+/// The pass also removes dead constants after folding to clean up unused
+/// intermediate results.
+struct TestSingleFold : public PassWrapper<TestSingleFold, OperationPass<>>,
+                        public RewriterBase::Listener {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestSingleFold)
 
-  StringRef getArgument() const final { return "test-constant-fold"; }
+  StringRef getArgument() const final { return "test-single-fold"; }
   StringRef getDescription() const final {
-    return "Test operation constant folding";
+    return "Test single-pass operation folding and dead constant elimination";
   }
   // All constants in the operation post folding.
   SmallVector<Operation *> existingConstants;
@@ -39,18 +48,19 @@ struct TestConstantFold : public PassWrapper<TestConstantFold, OperationPass<>>,
 };
 } // namespace
 
-void TestConstantFold::foldOperation(Operation *op, OperationFolder &helper) {
+void TestSingleFold::foldOperation(Operation *op, OperationFolder &helper) {
   // Attempt to fold the specified operation, including handling unused or
   // duplicated constants.
   (void)helper.tryToFold(op);
 }
 
-void TestConstantFold::runOnOperation() {
+void TestSingleFold::runOnOperation() {
   existingConstants.clear();
 
   // Collect and fold the operations within the operation.
   SmallVector<Operation *, 8> ops;
-  getOperation()->walk<mlir::WalkOrder::PreOrder>([&](Operation *op) { ops.push_back(op); });
+  getOperation()->walk<mlir::WalkOrder::PreOrder>(
+      [&](Operation *op) { ops.push_back(op); });
 
   // Fold the constants in reverse so that the last generated constants from
   // folding are at the beginning. This creates somewhat of a linear ordering to
@@ -70,6 +80,6 @@ void TestConstantFold::runOnOperation() {
 
 namespace mlir {
 namespace test {
-void registerTestConstantFold() { PassRegistration<TestConstantFold>(); }
+void registerTestSingleFold() { PassRegistration<TestSingleFold>(); }
 } // namespace test
 } // namespace mlir
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 6ef9ff8e8454..143a5e8e8f8d 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -87,7 +87,6 @@ void registerTestCfAssertPass();
 void registerTestCFGLoopInfoPass();
 void registerTestComposeSubView();
 void registerTestCompositePass();
-void registerTestConstantFold();
 void registerTestControlFlowSink();
 void registerTestConvertToSPIRVPass();
 void registerTestDataLayoutPropagation();
@@ -145,6 +144,7 @@ void registerTestSCFUtilsPass();
 void registerTestSCFWhileOpBuilderPass();
 void registerTestSCFWrapInZeroTripCheckPasses();
 void registerTestShapeMappingPass();
+void registerTestSingleFold();
 void registerTestSliceAnalysisPass();
 void registerTestSPIRVCPURunnerPipeline();
 void registerTestSPIRVFuncSignatureConversion();
@@ -233,7 +233,6 @@ void registerTestPasses() {
   mlir::test::registerTestCFGLoopInfoPass();
   mlir::test::registerTestComposeSubView();
   mlir::test::registerTestCompositePass();
-  mlir::test::registerTestConstantFold();
   mlir::test::registerTestControlFlowSink();
   mlir::test::registerTestConvertToSPIRVPass();
   mlir::test::registerTestDataLayoutPropagation();
@@ -291,6 +290,7 @@ void registerTestPasses() {
   mlir::test::registerTestSCFWhileOpBuilderPass();
   mlir::test::registerTestSCFWrapInZeroTripCheckPasses();
   mlir::test::registerTestShapeMappingPass();
+  mlir::test::registerTestSingleFold();
   mlir::test::registerTestSliceAnalysisPass();
   mlir::test::registerTestSPIRVCPURunnerPipeline();
   mlir::test::registerTestSPIRVFuncSignatureConversion();

From 4084ffcf1e69b962e864aa138bb54dabbcec912f Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Wed, 18 Jun 2025 11:31:03 -0500
Subject: [PATCH 0837/1322] [flang] Show types in DumpEvExpr (#143743)

When dumping evaluate::Expr, show type names which contain a lot of
useful information.

For example show
```
expr <Fortran::evaluate::SomeType> {
  expr <Fortran::evaluate::SomeKind<Fortran::common::TypeCategory::Integer>> {
    expr <Fortran::evaluate::Type<Fortran::common::TypeCategory::Integer, 4>> {
      ...
```
instead of
```
expr T {
  expr T {
    expr T {
      ...
```
---
 flang/include/flang/Semantics/dump-expr.h | 56 +++++++++++++++++++----
 flang/lib/Semantics/dump-expr.cpp         |  2 +-
 2 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/flang/include/flang/Semantics/dump-expr.h b/flang/include/flang/Semantics/dump-expr.h
index 2f445429a10b..9cc52b4da487 100644
--- a/flang/include/flang/Semantics/dump-expr.h
+++ b/flang/include/flang/Semantics/dump-expr.h
@@ -16,6 +16,7 @@
 
 #include <memory>
 #include <optional>
+#include <string>
 #include <variant>
 #include <vector>
 
@@ -38,6 +39,43 @@ public:
   }
 
 private:
+  template <typename T> struct TypeOf {
+    static constexpr std::string_view get() {
+#if defined(__GNUC__)
+#define DUMP_EXPR_SHOW_TYPE
+      std::string_view v(__PRETTY_FUNCTION__);
+      // Extract the "xyz" from the "pretty function" string:
+      // "... [with T = xyz; std::string_view = ...]"
+      std::string_view front("with T = ");
+      std::string_view back("; std::string_view =");
+
+#elif defined(_MSC_VER)
+#define DUMP_EXPR_SHOW_TYPE
+      std::string_view v(__FUNCSIG__);
+      // Extract the "xyz" from the "pretty function" string:
+      // "...TypeOf<xyz>::get(void)"
+      std::string_view front("TypeOf<");
+      std::string_view back(">::get(void)");
+
+#endif
+
+#if defined(DUMP_EXPR_SHOW_TYPE)
+#undef DUMP_EXPR_SHOW_TYPE
+      if (auto fpos{v.find(front)}; fpos != v.npos) {
+        v.remove_prefix(fpos + front.size());
+        if (auto bpos{v.find(back)}; bpos != v.npos) {
+          v.remove_suffix(v.size() - bpos);
+          return v;
+        }
+      }
+#endif
+
+      return "";
+    }
+
+    static constexpr std::string_view name{TypeOf<T>::get()};
+  };
+
   template <typename A, bool C> void Show(const common::Indirection<A, C> &x) {
     Show(x.value());
   }
@@ -76,7 +114,7 @@ private:
   void Show(const evaluate::NullPointer &);
   template <typename T> void Show(const evaluate::Constant<T> &x) {
     if constexpr (T::category == common::TypeCategory::Derived) {
-      Indent("derived constant");
+      Indent("derived constant "s + std::string(TypeOf<T>::name));
       for (const auto &map : x.values()) {
         for (const auto &pair : map) {
           Show(pair.second.value());
@@ -84,7 +122,7 @@ private:
       }
       Outdent();
     } else {
-      Print("constant");
+      Print("constant "s + std::string(TypeOf<T>::name));
     }
   }
   void Show(const Symbol &symbol);
@@ -102,7 +140,7 @@ private:
   void Show(const evaluate::Substring &x);
   void Show(const evaluate::ComplexPart &x);
   template <typename T> void Show(const evaluate::Designator<T> &x) {
-    Indent("designator");
+    Indent("designator "s + std::string(TypeOf<T>::name));
     Show(x.u);
     Outdent();
   }
@@ -117,7 +155,7 @@ private:
     Outdent();
   }
   template <typename T> void Show(const evaluate::FunctionRef<T> &x) {
-    Indent("function ref");
+    Indent("function ref "s + std::string(TypeOf<T>::name));
     Show(x.proc());
     Show(x.arguments());
     Outdent();
@@ -127,14 +165,14 @@ private:
   }
   template <typename T>
   void Show(const evaluate::ArrayConstructorValues<T> &x) {
-    Indent("array constructor value");
+    Indent("array constructor value "s + std::string(TypeOf<T>::name));
     for (auto &v : x) {
       Show(v);
     }
     Outdent();
   }
   template <typename T> void Show(const evaluate::ImpliedDo<T> &x) {
-    Indent("implied do");
+    Indent("implied do "s + std::string(TypeOf<T>::name));
     Show(x.lower());
     Show(x.upper());
     Show(x.stride());
@@ -148,20 +186,20 @@ private:
   void Show(const evaluate::StructureConstructor &x);
   template <typename D, typename R, typename O>
   void Show(const evaluate::Operation<D, R, O> &op) {
-    Indent("unary op");
+    Indent("unary op "s + std::string(TypeOf<D>::name));
     Show(op.left());
     Outdent();
   }
   template <typename D, typename R, typename LO, typename RO>
   void Show(const evaluate::Operation<D, R, LO, RO> &op) {
-    Indent("binary op");
+    Indent("binary op "s + std::string(TypeOf<D>::name));
     Show(op.left());
     Show(op.right());
     Outdent();
   }
   void Show(const evaluate::Relational<evaluate::SomeType> &x);
   template <typename T> void Show(const evaluate::Expr<T> &x) {
-    Indent("expr T");
+    Indent("expr <" + std::string(TypeOf<T>::name) + ">");
     Show(x.u);
     Outdent();
   }
diff --git a/flang/lib/Semantics/dump-expr.cpp b/flang/lib/Semantics/dump-expr.cpp
index aa0b4e0f0339..66cedab94bfb 100644
--- a/flang/lib/Semantics/dump-expr.cpp
+++ b/flang/lib/Semantics/dump-expr.cpp
@@ -151,7 +151,7 @@ void DumpEvaluateExpr::Show(const evaluate::StructureConstructor &x) {
 }
 
 void DumpEvaluateExpr::Show(const evaluate::Relational<evaluate::SomeType> &x) {
-  Indent("expr some type");
+  Indent("relational some type");
   Show(x.u);
   Outdent();
 }

From 2a8c65e983b3f4e1c83d8028d354f7bacc149015 Mon Sep 17 00:00:00 2001
From: Alexis Engelke <engelke@in.tum.de>
Date: Wed, 18 Jun 2025 18:56:30 +0200
Subject: [PATCH 0838/1322] [CodeGen][NFC] Fix quadratic c-t for large jump
 tables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Deleting a basic block removes all references from jump tables, which
is O(n). When freeing a MachineFunction, all basic blocks are deleted
before the jump tables, causing O(n^2) runtime. Fix this by deallocating
the jump table first.

Test case generator:

    import sys

    n = int(sys.argv[1])
    print("define void @f(i64 %c, ptr %p) {")
    print("  switch i64 %c, label %d [")
    for i in range(n):
        print(f"    i64 {i}, label %h{i}")
    print(f"  ]")
    for i in range(n):
        print(f'h{i}:')
        print(f'  store i64 {i*i}, ptr %p')
        print(f'  ret void')
    print('d:')
    print('  ret void')
    print('}')

Improvement at 5000 entries:

    Benchmark 1: ./llc.pre -filetype=obj -O0 <switch5k.bc
      Time (mean ± σ):      49.7 ms ±   1.0 ms
      Range (min … max):    48.0 ms …  52.1 ms    57 runs

    Benchmark 2: ./llc.post -filetype=obj -O0 <switch5k.bc
      Time (mean ± σ):      39.4 ms ±   0.8 ms
      Range (min … max):    37.1 ms …  41.1 ms    72 runs

    Summary
      ./llc.post -filetype=obj -O0 <switch5k.bc ran
        1.26 ± 0.04 times faster than ./llc.pre -filetype=obj -O0 <switch5k.bc

Improvement at 20000 entries:

    Benchmark 1: ./llc.pre -filetype=obj -O0 <switch20k.bc
      Time (mean ± σ):     281.7 ms ±   1.0 ms
      Range (min … max):   280.2 ms … 283.0 ms    10 runs

    Benchmark 2: ./llc.post -filetype=obj -O0 <switch20k.bc
      Time (mean ± σ):     123.9 ms ±   1.5 ms
      Range (min … max):   121.4 ms … 129.2 ms    23 runs

    Summary
      ./llc.post -filetype=obj -O0 <switch20k.bc ran
        2.27 ± 0.03 times faster than ./llc.pre -filetype=obj -O0 <switch20k.bc

Pull Request: https://github.com/llvm/llvm-project/pull/144108
---
 llvm/lib/CodeGen/MachineFunction.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 607e87a38274..38ad582ba923 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -259,6 +259,15 @@ MachineFunction::~MachineFunction() {
 
 void MachineFunction::clear() {
   Properties.reset();
+
+  // Clear JumpTableInfo first. Otherwise, every MBB we delete would do a
+  // linear search over the jump table entries to find and erase itself.
+  if (JumpTableInfo) {
+    JumpTableInfo->~MachineJumpTableInfo();
+    Allocator.Deallocate(JumpTableInfo);
+    JumpTableInfo = nullptr;
+  }
+
   // Don't call destructors on MachineInstr and MachineOperand. All of their
   // memory comes from the BumpPtrAllocator which is about to be purged.
   //
@@ -287,11 +296,6 @@ void MachineFunction::clear() {
   ConstantPool->~MachineConstantPool();
   Allocator.Deallocate(ConstantPool);
 
-  if (JumpTableInfo) {
-    JumpTableInfo->~MachineJumpTableInfo();
-    Allocator.Deallocate(JumpTableInfo);
-  }
-
   if (WinEHInfo) {
     WinEHInfo->~WinEHFuncInfo();
     Allocator.Deallocate(WinEHInfo);

From 77bc25485135b8a8cb2427910a8850fbc4e4be09 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn@arm.com>
Date: Wed, 18 Jun 2025 18:05:02 +0100
Subject: [PATCH 0839/1322] [AArch64] Fix build failure with -Werror (#144749)

PR#144387 caused buildbot failures with -Werror due to a comparison
between signed and unsigned types. Fix this with an explicit cast.
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0e28ccd0f655..d8b574719dae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27081,7 +27081,7 @@ bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
     // only allow an offset that's equal to the store size.
     EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
     if (!Subtarget->isLittleEndian() && MemType.isVector() &&
-        RHSC != MemType.getStoreSize())
+        (uint64_t)RHSC != MemType.getStoreSize())
       return false;
     // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
     // when dealing with subtraction.

From 298f1c276f4f9c18b25a79ffe6e619e89c5fbf7e Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Wed, 18 Jun 2025 10:08:27 -0700
Subject: [PATCH 0840/1322] Revert "Add missing intrinsics to cuda headers"
 (#144755)

Reverts llvm/llvm-project#143664
as it breaks CUDA compilation.
---
 clang/lib/Headers/__clang_cuda_intrinsics.h | 284 --------------------
 1 file changed, 284 deletions(-)

diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 5e13f3f78df7..8b230af6f664 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -479,290 +479,6 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
   return ret;
 }
 
-#pragma push_macro("__INTRINSIC_LOAD")
-#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType,  \
-                         __Clobber)                                            \
-  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
-    __TmpType __ret;                                                           \
-    asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber);        \
-    return (__DeclType)__ret;                                                  \
-  }
-
-#pragma push_macro("__INTRINSIC_LOAD2")
-#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
-                          __Clobber)                                           \
-  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
-    __DeclType __ret;                                                          \
-    __TmpType __tmp;                                                           \
-    asm(__AsmOp " {%0,%1}, [%2];"                                              \
-        : __AsmType(__tmp.x), __AsmType(__tmp.y)                               \
-        : "l"(__ptr)__Clobber);                                                \
-    using __ElementType = decltype(__ret.x);                                   \
-    __ret.x = (__ElementType)(__tmp.x);                                        \
-    __ret.y = (__ElementType)__tmp.y;                                          \
-    return __ret;                                                              \
-  }
-
-#pragma push_macro("__INTRINSIC_LOAD4")
-#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
-                          __Clobber)                                           \
-  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
-    __DeclType __ret;                                                          \
-    __TmpType __tmp;                                                           \
-    asm(__AsmOp " {%0,%1,%2,%3}, [%4];"                                        \
-        : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z),          \
-          __AsmType(__tmp.w)                                                   \
-        : "l"(__ptr)__Clobber);                                                \
-    using __ElementType = decltype(__ret.x);                                   \
-    __ret.x = (__ElementType)__tmp.x;                                          \
-    __ret.y = (__ElementType)__tmp.y;                                          \
-    __ret.z = (__ElementType)__tmp.z;                                          \
-    __ret.w = (__ElementType)__tmp.w;                                          \
-    return __ret;                                                              \
-  }
-
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
-                 "=l", );
-
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l", );
-
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
-                 "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
-                 "=h", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
-                 "=r", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
-                 unsigned long long, "=l", );
-
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
-                  "=l", );
-
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
-__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
-__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
-__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
-
-inline __device__ long __ldcg(const long *__ptr) {
-  unsigned long __ret;
-  if (sizeof(long) == 8) {
-    asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
-  } else {
-    asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
-  }
-  return (long)__ret;
-}
-
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
-                 "=h", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
-                 unsigned long long, "=l", : "memory");
-
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
-                 "=h", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
-                 "=r", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
-                 "=l", : "memory");
-
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
-                  "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
-                  "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
-                  "=h", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
-                  "=h", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
-                  "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
-                  "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
-                  "=l", : "memory");
-
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
-                  "=h", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
-                  "=h", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
-                  "=l", : "memory");
-
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
-__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");
-
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
-                  "=f", : "memory");
-__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
-                  "=f", : "memory");
-__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
-                  "=d", : "memory");
-
-inline __device__ long __ldcv(const long *__ptr) {
-  unsigned long __ret;
-  if (sizeof(long) == 8) {
-    asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
-  } else {
-    asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
-  }
-  return (long)__ret;
-}
-
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
-                 "=l", );
-
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );
-
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
-                 "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
-                 "=h", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
-                 "=r", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
-                 unsigned long long, "=l", );
-
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
-                  "=l", );
-
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
-__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
-__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
-__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );
-
-#pragma pop_macro("__INTRINSIC_LOAD")
-#pragma pop_macro("__INTRINSIC_LOAD2")
-#pragma pop_macro("__INTRINSIC_LOAD4")
-
-inline __device__ long __ldcs(const long *__ptr) {
-  unsigned long __ret;
-  if (sizeof(long) == 8) {
-    asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
-  } else {
-    asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
-  }
-  return (long)__ret;
-}
-
-#pragma push_macro("__INTRINSIC_STORE")
-#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
-  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
-    __TmpType __tmp = (__TmpType)__value;                                      \
-    asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory");       \
-  }
-
-#pragma push_macro("__INTRINSIC_STORE2")
-#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType,           \
-                           __AsmType)                                          \
-  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
-    __TmpType __tmp;                                                           \
-    using __ElementType = decltype(__tmp.x);                                   \
-    __tmp.x = (__ElementType)(__value.x);                                      \
-    __tmp.y = (__ElementType)(__value.y);                                      \
-    asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x),            \
-        __AsmType(__tmp.y)                                                     \
-        : "memory");                                                           \
-  }
-
-#pragma push_macro("__INTRINSIC_STORE4")
-#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType,           \
-                           __AsmType)                                          \
-  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
-    __TmpType __tmp;                                                           \
-    using __ElementType = decltype(__tmp.x);                                   \
-    __tmp.x = (__ElementType)(__value.x);                                      \
-    __tmp.y = (__ElementType)(__value.y);                                      \
-    __tmp.z = (__ElementType)(__value.z);                                      \
-    __tmp.w = (__ElementType)(__value.w);                                      \
-    asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x),      \
-        __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w)             \
-        : "memory");                                                           \
-  }
-
-__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
-
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
-
-__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
-                  "h");
-__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
-__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
-                  unsigned long long, "l");
-
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
-
-__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
-__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
-__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
-__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
-
-#pragma pop_macro("__INTRINSIC_STORE")
-#pragma pop_macro("__INTRINSIC_STORE2")
-#pragma pop_macro("__INTRINSIC_STORE4")
-
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
 
 #if CUDA_VERSION >= 11000

From d9f7979a63ceac88727632ecfd522c073288b6c1 Mon Sep 17 00:00:00 2001
From: Justin King <jcking@google.com>
Date: Wed, 18 Jun 2025 10:24:38 -0700
Subject: [PATCH 0841/1322] sanitizer_common: add unsupported test for
 free_sized and free_aligned_sized from C23 (#144727)

Signed-off-by: Justin King <jcking@google.com>
---
 .../TestCases/Linux/free_aligned_sized.c          | 13 +++++++++++++
 .../sanitizer_common/TestCases/Linux/free_sized.c | 15 +++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c
 create mode 100644 compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c

diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c
new file mode 100644
index 000000000000..f4c6c0f973bd
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c
@@ -0,0 +1,13 @@
+// RUN: %clang -std=c23 -O0 %s -o %t && %run %t
+// UNSUPPORTED: asan, hwasan, rtsan, tsan, msan, lsan, ubsan
+
+#include <stddef.h>
+#include <stdlib.h>
+
+extern void free_aligned_sized(void *p, size_t alignment, size_t size);
+
+int main() {
+  volatile void *p = aligned_alloc(128, 1024);
+  free_aligned_sized((void *)p, 128, 1024);
+  return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c
new file mode 100644
index 000000000000..0ee2289684d0
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c
@@ -0,0 +1,15 @@
+// RUN: %clang -std=c23 -O0 %s -o %t && %run %t
+// UNSUPPORTED: asan, hwasan, rtsan, tsan, msan, lsan, ubsan
+
+#include <stddef.h>
+#include <stdlib.h>
+
+extern void *aligned_alloc(size_t alignment, size_t size);
+
+extern void free_sized(void *p, size_t size);
+
+int main() {
+  volatile void *p = malloc(64);
+  free_sized((void *)p, 64);
+  return 0;
+}

From 82acd8c377e9ed267195afdbde16eedebabc648c Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Wed, 18 Jun 2025 13:50:57 -0400
Subject: [PATCH 0842/1322] [PowerPC]  Add code to spill and restore DMRp
 registers (#142443)

---
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp    |   2 +-
 llvm/lib/Target/PowerPC/PPCInstrInfo.h      |   9 +
 llvm/lib/Target/PowerPC/PPCInstrMMA.td      |   4 +
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 107 +++++-----
 llvm/test/CodeGen/PowerPC/dmr-spill.ll      |  36 ++--
 llvm/test/CodeGen/PowerPC/dmrp-spill.ll     | 213 ++++++++++++++++++++
 6 files changed, 300 insertions(+), 71 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/dmrp-spill.ll

diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 57c86d9e5de6..7c1550e99bae 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1926,7 +1926,7 @@ unsigned PPCInstrInfo::getSpillIndex(const TargetRegisterClass *RC) const {
   } else if (PPC::DMRROWpRCRegClass.hasSubClassEq(RC)) {
     llvm_unreachable("TODO: Implement spill DMRROWp regclass!");
   } else if (PPC::DMRpRCRegClass.hasSubClassEq(RC)) {
-    llvm_unreachable("TODO: Implement spill DMRp regclass!");
+    OpcodeIndex = SOK_DMRpSpill;
   } else if (PPC::DMRRCRegClass.hasSubClassEq(RC)) {
     OpcodeIndex = SOK_DMRSpill;
   } else {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index a27b5718ec89..7931a9e3ae13 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -81,6 +81,7 @@ enum SpillOpcodeKey {
   SOK_AccumulatorSpill,
   SOK_UAccumulatorSpill,
   SOK_WAccumulatorSpill,
+  SOK_DMRpSpill,
   SOK_DMRSpill,
   SOK_SPESpill,
   SOK_PairedG8Spill,
@@ -119,6 +120,7 @@ enum PPCMachineCombinerPattern : unsigned {
    NoInstr,                                                                    \
    NoInstr,                                                                    \
    NoInstr,                                                                    \
+   NoInstr,                                                                    \
    PPC::EVLDD,                                                                 \
    PPC::RESTORE_QUADWORD}
 
@@ -140,6 +142,7 @@ enum PPCMachineCombinerPattern : unsigned {
    NoInstr,                                                                    \
    NoInstr,                                                                    \
    NoInstr,                                                                    \
+   NoInstr,                                                                    \
    PPC::RESTORE_QUADWORD}
 
 #define Pwr10LoadOpcodes                                                       \
@@ -160,6 +163,7 @@ enum PPCMachineCombinerPattern : unsigned {
    NoInstr,                                                                    \
    NoInstr,                                                                    \
    NoInstr,                                                                    \
+   NoInstr,                                                                    \
    PPC::RESTORE_QUADWORD}
 
 #define FutureLoadOpcodes                                                      \
@@ -178,6 +182,7 @@ enum PPCMachineCombinerPattern : unsigned {
    PPC::RESTORE_ACC,                                                           \
    PPC::RESTORE_UACC,                                                          \
    PPC::RESTORE_WACC,                                                          \
+   PPC::RESTORE_DMRP,                                                          \
    PPC::RESTORE_DMR,                                                           \
    NoInstr,                                                                    \
    PPC::RESTORE_QUADWORD}
@@ -199,6 +204,7 @@ enum PPCMachineCombinerPattern : unsigned {
    NoInstr,                                                                    \
    NoInstr,                                                                    \
    NoInstr,                                                                    \
+   NoInstr,                                                                    \
    PPC::EVSTDD,                                                                \
    PPC::SPILL_QUADWORD}
 
@@ -220,6 +226,7 @@ enum PPCMachineCombinerPattern : unsigned {
    NoInstr,                                                                    \
    NoInstr,                                                                    \
    NoInstr,                                                                    \
+   NoInstr,                                                                    \
    PPC::SPILL_QUADWORD}
 
 #define Pwr10StoreOpcodes                                                      \
@@ -240,6 +247,7 @@ enum PPCMachineCombinerPattern : unsigned {
    NoInstr,                                                                    \
    NoInstr,                                                                    \
    NoInstr,                                                                    \
+   NoInstr,                                                                    \
    PPC::SPILL_QUADWORD}
 
 #define FutureStoreOpcodes                                                     \
@@ -258,6 +266,7 @@ enum PPCMachineCombinerPattern : unsigned {
    PPC::SPILL_ACC,                                                             \
    PPC::SPILL_UACC,                                                            \
    PPC::SPILL_WACC,                                                            \
+   PPC::SPILL_DMRP,                                                            \
    PPC::SPILL_DMR,                                                             \
    NoInstr,                                                                    \
    PPC::SPILL_QUADWORD}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
index 82e4a60e0a72..436715a0e4ab 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
@@ -565,12 +565,16 @@ let Predicates = [MMA, IsISAFuture], isCodeGenOnly = 1 in {
   let mayStore = 1 in {
     def SPILL_WACC: PPCEmitTimePseudo<(outs), (ins wacc:$AT, memrix16:$dst),
                                       "#SPILL_WACC", []>;
+    def SPILL_DMRP: PPCEmitTimePseudo<(outs), (ins dmrp:$AT, memrix16:$dst),
+                                      "#SPILL_DMRP", []>;
     def SPILL_DMR: PPCEmitTimePseudo<(outs), (ins dmr:$AT, memrix16:$dst),
                                       "#SPILL_DMR", []>;
   }
   let mayLoad = 1, hasSideEffects = 0 in {
     def RESTORE_WACC: PPCEmitTimePseudo<(outs wacc:$AT), (ins memrix16:$src),
                                         "#RESTORE_WACC", []>;
+    def RESTORE_DMRP: PPCEmitTimePseudo<(outs dmrp:$AT), (ins memrix16:$src),
+                                        "#RESTORE_DMRP", []>;
     def RESTORE_DMR: PPCEmitTimePseudo<(outs dmr:$AT), (ins memrix16:$src),
                                         "#RESTORE_DMR", []>;
   }
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index ea34c1aba82e..76dca4794e05 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1519,33 +1519,32 @@ void PPCRegisterInfo::lowerDMRSpilling(MachineBasicBlock::iterator II,
   // DMR is made up of WACC and WACC_HI, so DMXXEXTFDMR512 to spill
   // the corresponding 512 bits.
   const TargetRegisterClass *RC = &PPC::VSRpRCRegClass;
+  auto spillDMR = [&](Register SrcReg, int BEIdx, int LEIdx) {
+    auto spillWACC = [&](unsigned Opc, unsigned RegIdx, int IdxBE, int IdxLE) {
+      Register VSRpReg0 = MF.getRegInfo().createVirtualRegister(RC);
+      Register VSRpReg1 = MF.getRegInfo().createVirtualRegister(RC);
+
+      BuildMI(MBB, II, DL, TII.get(Opc), VSRpReg0)
+          .addDef(VSRpReg1)
+          .addReg(TargetRegisterInfo::getSubReg(SrcReg, RegIdx));
+
+      addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                            .addReg(VSRpReg0, RegState::Kill),
+                        FrameIndex, IsLittleEndian ? IdxLE : IdxBE);
+      addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                            .addReg(VSRpReg1, RegState::Kill),
+                        FrameIndex, IsLittleEndian ? IdxLE - 32 : IdxBE + 32);
+    };
+    spillWACC(PPC::DMXXEXTFDMR512, PPC::sub_wacc_lo, BEIdx, LEIdx);
+    spillWACC(PPC::DMXXEXTFDMR512_HI, PPC::sub_wacc_hi, BEIdx + 64, LEIdx - 64);
+  };
+
   Register SrcReg = MI.getOperand(0).getReg();
-
-  Register VSRpReg0 = MF.getRegInfo().createVirtualRegister(RC);
-  Register VSRpReg1 = MF.getRegInfo().createVirtualRegister(RC);
-  Register VSRpReg2 = MF.getRegInfo().createVirtualRegister(RC);
-  Register VSRpReg3 = MF.getRegInfo().createVirtualRegister(RC);
-
-  BuildMI(MBB, II, DL, TII.get(PPC::DMXXEXTFDMR512_HI), VSRpReg2)
-      .addDef(VSRpReg3)
-      .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_wacc_hi));
-
-  BuildMI(MBB, II, DL, TII.get(PPC::DMXXEXTFDMR512), VSRpReg0)
-      .addDef(VSRpReg1)
-      .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_wacc_lo));
-
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                        .addReg(VSRpReg0, RegState::Kill),
-                    FrameIndex, IsLittleEndian ? 96 : 0);
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                        .addReg(VSRpReg1, RegState::Kill),
-                    FrameIndex, IsLittleEndian ? 64 : 32);
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                        .addReg(VSRpReg2, RegState::Kill),
-                    FrameIndex, IsLittleEndian ? 32 : 64);
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                        .addReg(VSRpReg3, RegState::Kill),
-                    FrameIndex, IsLittleEndian ? 0 : 96);
+  if (MI.getOpcode() == PPC::SPILL_DMRP) {
+    spillDMR(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_dmr1), 0, 96);
+    spillDMR(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_dmr0), 128, 224);
+  } else
+    spillDMR(SrcReg, 0, 96);
 
   // Discard the pseudo instruction.
   MBB.erase(II);
@@ -1554,7 +1553,7 @@ void PPCRegisterInfo::lowerDMRSpilling(MachineBasicBlock::iterator II,
 /// lowerDMRRestore - Generate the code to restore the DMR register.
 void PPCRegisterInfo::lowerDMRRestore(MachineBasicBlock::iterator II,
                                       unsigned FrameIndex) const {
-  MachineInstr &MI = *II; // <DestReg> = RESTORE_WACC <offset>
+  MachineInstr &MI = *II; // <DestReg> = RESTORE_DMR[P] <offset>
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
   const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
@@ -1563,32 +1562,34 @@ void PPCRegisterInfo::lowerDMRRestore(MachineBasicBlock::iterator II,
   bool IsLittleEndian = Subtarget.isLittleEndian();
 
   const TargetRegisterClass *RC = &PPC::VSRpRCRegClass;
+  auto restoreDMR = [&](Register DestReg, int BEIdx, int LEIdx) {
+    auto restoreWACC = [&](unsigned Opc, unsigned RegIdx, int IdxBE,
+                           int IdxLE) {
+      Register VSRpReg0 = MF.getRegInfo().createVirtualRegister(RC);
+      Register VSRpReg1 = MF.getRegInfo().createVirtualRegister(RC);
+
+      addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg0),
+                        FrameIndex, IsLittleEndian ? IdxLE : IdxBE);
+      addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg1),
+                        FrameIndex, IsLittleEndian ? IdxLE - 32 : IdxBE + 32);
+
+      // Kill virtual registers (killedRegState::Killed).
+      BuildMI(MBB, II, DL, TII.get(Opc),
+              TargetRegisterInfo::getSubReg(DestReg, RegIdx))
+          .addReg(VSRpReg0, RegState::Kill)
+          .addReg(VSRpReg1, RegState::Kill);
+    };
+    restoreWACC(PPC::DMXXINSTDMR512, PPC::sub_wacc_lo, BEIdx, LEIdx);
+    restoreWACC(PPC::DMXXINSTDMR512_HI, PPC::sub_wacc_hi, BEIdx + 64,
+                LEIdx - 64);
+  };
+
   Register DestReg = MI.getOperand(0).getReg();
-
-  Register VSRpReg0 = MF.getRegInfo().createVirtualRegister(RC);
-  Register VSRpReg1 = MF.getRegInfo().createVirtualRegister(RC);
-  Register VSRpReg2 = MF.getRegInfo().createVirtualRegister(RC);
-  Register VSRpReg3 = MF.getRegInfo().createVirtualRegister(RC);
-
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg0),
-                    FrameIndex, IsLittleEndian ? 96 : 0);
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg1),
-                    FrameIndex, IsLittleEndian ? 64 : 32);
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg2),
-                    FrameIndex, IsLittleEndian ? 32 : 64);
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), VSRpReg3),
-                    FrameIndex, IsLittleEndian ? 0 : 96);
-
-  // Kill virtual registers (killedRegState::Killed).
-  BuildMI(MBB, II, DL, TII.get(PPC::DMXXINSTDMR512_HI),
-          TargetRegisterInfo::getSubReg(DestReg, PPC::sub_wacc_hi))
-      .addReg(VSRpReg2, RegState::Kill)
-      .addReg(VSRpReg3, RegState::Kill);
-
-  BuildMI(MBB, II, DL, TII.get(PPC::DMXXINSTDMR512),
-          TargetRegisterInfo::getSubReg(DestReg, PPC::sub_wacc_lo))
-      .addReg(VSRpReg0, RegState::Kill)
-      .addReg(VSRpReg1, RegState::Kill);
+  if (MI.getOpcode() == PPC::RESTORE_DMRP) {
+    restoreDMR(TargetRegisterInfo::getSubReg(DestReg, PPC::sub_dmr1), 0, 96);
+    restoreDMR(TargetRegisterInfo::getSubReg(DestReg, PPC::sub_dmr0), 128, 224);
+  } else
+    restoreDMR(DestReg, 0, 96);
 
   // Discard the pseudo instruction.
   MBB.erase(II);
@@ -1756,9 +1757,11 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   case PPC::RESTORE_WACC:
     lowerWACCRestore(II, FrameIndex);
     return true;
+  case PPC::SPILL_DMRP:
   case PPC::SPILL_DMR:
     lowerDMRSpilling(II, FrameIndex);
     return true;
+  case PPC::RESTORE_DMRP:
   case PPC::RESTORE_DMR:
     lowerDMRRestore(II, FrameIndex);
     return true;
diff --git a/llvm/test/CodeGen/PowerPC/dmr-spill.ll b/llvm/test/CodeGen/PowerPC/dmr-spill.ll
index c1b01cd2d3fd..983fce4127af 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-spill.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-spill.ll
@@ -30,19 +30,19 @@ define void @spillDMRreg(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) nounwind {
 ; CHECK-NEXT:    lxv v3, 0(r4)
 ; CHECK-NEXT:    lxv vs0, 0(r5)
 ; CHECK-NEXT:    dmxvbf16gerx2pp dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; CHECK-NEXT:    stxvp vsp36, 128(r1)
+; CHECK-NEXT:    stxvp vsp34, 96(r1)
 ; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc_hi0, 1
-; CHECK-NEXT:    dmxxextfdmr512 vsp38, vsp32, wacc0, 0
-; CHECK-NEXT:    stxvp vsp38, 128(r1)
-; CHECK-NEXT:    stxvp vsp32, 96(r1)
 ; CHECK-NEXT:    stxvp vsp36, 64(r1)
 ; CHECK-NEXT:    stxvp vsp34, 32(r1)
 ; CHECK-NEXT:    bl dummy_func@notoc
 ; CHECK-NEXT:    lxvp vsp34, 128(r1)
 ; CHECK-NEXT:    lxvp vsp36, 96(r1)
-; CHECK-NEXT:    lxvp vsp32, 64(r1)
-; CHECK-NEXT:    lxvp vsp38, 32(r1)
-; CHECK-NEXT:    dmxxinstdmr512 wacc_hi0, vsp32, vsp38, 1
 ; CHECK-NEXT:    dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT:    lxvp vsp34, 64(r1)
+; CHECK-NEXT:    lxvp vsp36, 32(r1)
+; CHECK-NEXT:    dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
 ; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
 ; CHECK-NEXT:    stxvp vsp34, 96(r30)
 ; CHECK-NEXT:    stxvp vsp36, 64(r30)
@@ -72,20 +72,20 @@ define void @spillDMRreg(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) nounwind {
 ; AIX-NEXT:    lxv v3, 16(r4)
 ; AIX-NEXT:    lxv vs0, 0(r5)
 ; AIX-NEXT:    dmxvbf16gerx2pp dmr0, vsp34, vs0
+; AIX-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; AIX-NEXT:    stxvp vsp36, 112(r1)
+; AIX-NEXT:    stxvp vsp34, 144(r1)
 ; AIX-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc_hi0, 1
-; AIX-NEXT:    dmxxextfdmr512 vsp38, vsp32, wacc0, 0
-; AIX-NEXT:    stxvp vsp38, 112(r1)
-; AIX-NEXT:    stxvp vsp32, 144(r1)
 ; AIX-NEXT:    stxvp vsp36, 176(r1)
 ; AIX-NEXT:    stxvp vsp34, 208(r1)
 ; AIX-NEXT:    bl .dummy_func[PR]
 ; AIX-NEXT:    nop
 ; AIX-NEXT:    lxvp vsp34, 112(r1)
 ; AIX-NEXT:    lxvp vsp36, 144(r1)
-; AIX-NEXT:    lxvp vsp32, 176(r1)
-; AIX-NEXT:    lxvp vsp38, 208(r1)
-; AIX-NEXT:    dmxxinstdmr512 wacc_hi0, vsp32, vsp38, 1
 ; AIX-NEXT:    dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; AIX-NEXT:    lxvp vsp34, 176(r1)
+; AIX-NEXT:    lxvp vsp36, 208(r1)
+; AIX-NEXT:    dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
 ; AIX-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
 ; AIX-NEXT:    stxvp vsp36, 96(r31)
 ; AIX-NEXT:    stxvp vsp34, 64(r31)
@@ -115,20 +115,20 @@ define void @spillDMRreg(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) nounwind {
 ; AIX32-NEXT:    lxv v3, 16(r4)
 ; AIX32-NEXT:    lxv vs0, 0(r5)
 ; AIX32-NEXT:    dmxvbf16gerx2pp dmr0, vsp34, vs0
+; AIX32-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; AIX32-NEXT:    stxvp vsp36, 64(r1)
+; AIX32-NEXT:    stxvp vsp34, 96(r1)
 ; AIX32-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc_hi0, 1
-; AIX32-NEXT:    dmxxextfdmr512 vsp38, vsp32, wacc0, 0
-; AIX32-NEXT:    stxvp vsp38, 64(r1)
-; AIX32-NEXT:    stxvp vsp32, 96(r1)
 ; AIX32-NEXT:    stxvp vsp36, 128(r1)
 ; AIX32-NEXT:    stxvp vsp34, 160(r1)
 ; AIX32-NEXT:    bl .dummy_func[PR]
 ; AIX32-NEXT:    nop
 ; AIX32-NEXT:    lxvp vsp34, 64(r1)
 ; AIX32-NEXT:    lxvp vsp36, 96(r1)
-; AIX32-NEXT:    lxvp vsp32, 128(r1)
-; AIX32-NEXT:    lxvp vsp38, 160(r1)
-; AIX32-NEXT:    dmxxinstdmr512 wacc_hi0, vsp32, vsp38, 1
 ; AIX32-NEXT:    dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; AIX32-NEXT:    lxvp vsp34, 128(r1)
+; AIX32-NEXT:    lxvp vsp36, 160(r1)
+; AIX32-NEXT:    dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
 ; AIX32-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
 ; AIX32-NEXT:    stxvp vsp36, 96(r31)
 ; AIX32-NEXT:    stxvp vsp34, 64(r31)
diff --git a/llvm/test/CodeGen/PowerPC/dmrp-spill.ll b/llvm/test/CodeGen/PowerPC/dmrp-spill.ll
new file mode 100644
index 000000000000..62d42d4a26d5
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/dmrp-spill.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -disable-auto-paired-vec-st=false -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-aix \
+; RUN:   -disable-auto-paired-vec-st=false -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s --check-prefix=AIX
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-aix \
+; RUN:   -disable-auto-paired-vec-st=false -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s --check-prefix=AIX32
+
+declare void @dummy_func()
+declare <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1>, i32)
+
+define dso_local void @test_dmsha3hash(ptr %vopp, ptr %resp) nounwind {
+; CHECK-LABEL: test_dmsha3hash:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    std r0, 16(r1)
+; CHECK-NEXT:    stdu r1, -304(r1)
+; CHECK-NEXT:    std r30, 288(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    mr r30, r4
+; CHECK-NEXT:    dmxxinstdmr512 wacc_hi1, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstdmr512 wacc1, vsp36, vsp34, 0
+; CHECK-NEXT:    lxvp vsp34, 128(r3)
+; CHECK-NEXT:    lxvp vsp36, 160(r3)
+; CHECK-NEXT:    dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 192(r3)
+; CHECK-NEXT:    lxvp vsp36, 224(r3)
+; CHECK-NEXT:    dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    dmsha3hash dmrp0, 5
+; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc1, 0
+; CHECK-NEXT:    stxvp vsp36, 128(r1)
+; CHECK-NEXT:    stxvp vsp34, 96(r1)
+; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc_hi1, 1
+; CHECK-NEXT:    stxvp vsp36, 64(r1)
+; CHECK-NEXT:    stxvp vsp34, 32(r1)
+; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; CHECK-NEXT:    stxvp vsp36, 256(r1)
+; CHECK-NEXT:    stxvp vsp34, 224(r1)
+; CHECK-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp36, 192(r1)
+; CHECK-NEXT:    stxvp vsp34, 160(r1)
+; CHECK-NEXT:    bl dummy_func@notoc
+; CHECK-NEXT:    lxvp vsp34, 128(r1)
+; CHECK-NEXT:    lxvp vsp36, 96(r1)
+; CHECK-NEXT:    dmxxinstdmr512 wacc1, vsp34, vsp36, 0
+; CHECK-NEXT:    lxvp vsp34, 64(r1)
+; CHECK-NEXT:    lxvp vsp36, 32(r1)
+; CHECK-NEXT:    dmxxinstdmr512 wacc_hi1, vsp34, vsp36, 1
+; CHECK-NEXT:    lxvp vsp34, 256(r1)
+; CHECK-NEXT:    lxvp vsp36, 224(r1)
+; CHECK-NEXT:    dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT:    lxvp vsp34, 192(r1)
+; CHECK-NEXT:    lxvp vsp36, 160(r1)
+; CHECK-NEXT:    dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 224(r30)
+; CHECK-NEXT:    stxvp vsp36, 192(r30)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 160(r30)
+; CHECK-NEXT:    stxvp vsp36, 128(r30)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc1, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r30)
+; CHECK-NEXT:    stxvp vsp36, 64(r30)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi1, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r30)
+; CHECK-NEXT:    stxvp vsp36, 0(r30)
+; CHECK-NEXT:    ld r30, 288(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    addi r1, r1, 304
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; AIX-LABEL: test_dmsha3hash:
+; AIX:       # %bb.0: # %entry
+; AIX-NEXT:    mflr r0
+; AIX-NEXT:    std r0, 16(r1)
+; AIX-NEXT:    stdu r1, -384(r1)
+; AIX-NEXT:    std r31, 376(r1) # 8-byte Folded Spill
+; AIX-NEXT:    lxvp vsp34, 224(r3)
+; AIX-NEXT:    lxvp vsp36, 192(r3)
+; AIX-NEXT:    mr r31, r4
+; AIX-NEXT:    dmxxinstdmr512 wacc_hi1, vsp36, vsp34, 1
+; AIX-NEXT:    lxvp vsp34, 160(r3)
+; AIX-NEXT:    lxvp vsp36, 128(r3)
+; AIX-NEXT:    dmxxinstdmr512 wacc1, vsp36, vsp34, 0
+; AIX-NEXT:    lxvp vsp34, 96(r3)
+; AIX-NEXT:    lxvp vsp36, 64(r3)
+; AIX-NEXT:    dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
+; AIX-NEXT:    lxvp vsp34, 32(r3)
+; AIX-NEXT:    lxvp vsp36, 0(r3)
+; AIX-NEXT:    dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; AIX-NEXT:    dmsha3hash dmrp0, 5
+; AIX-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc1, 0
+; AIX-NEXT:    stxvp vsp36, 112(r1)
+; AIX-NEXT:    stxvp vsp34, 144(r1)
+; AIX-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc_hi1, 1
+; AIX-NEXT:    stxvp vsp36, 176(r1)
+; AIX-NEXT:    stxvp vsp34, 208(r1)
+; AIX-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; AIX-NEXT:    stxvp vsp36, 240(r1)
+; AIX-NEXT:    stxvp vsp34, 272(r1)
+; AIX-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc_hi0, 1
+; AIX-NEXT:    stxvp vsp36, 304(r1)
+; AIX-NEXT:    stxvp vsp34, 336(r1)
+; AIX-NEXT:    bl .dummy_func[PR]
+; AIX-NEXT:    nop
+; AIX-NEXT:    lxvp vsp34, 112(r1)
+; AIX-NEXT:    lxvp vsp36, 144(r1)
+; AIX-NEXT:    dmxxinstdmr512 wacc1, vsp34, vsp36, 0
+; AIX-NEXT:    lxvp vsp34, 176(r1)
+; AIX-NEXT:    lxvp vsp36, 208(r1)
+; AIX-NEXT:    dmxxinstdmr512 wacc_hi1, vsp34, vsp36, 1
+; AIX-NEXT:    lxvp vsp34, 240(r1)
+; AIX-NEXT:    lxvp vsp36, 272(r1)
+; AIX-NEXT:    dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; AIX-NEXT:    lxvp vsp34, 304(r1)
+; AIX-NEXT:    lxvp vsp36, 336(r1)
+; AIX-NEXT:    dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
+; AIX-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi1, 1
+; AIX-NEXT:    stxvp vsp36, 224(r31)
+; AIX-NEXT:    stxvp vsp34, 192(r31)
+; AIX-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc1, 0
+; AIX-NEXT:    stxvp vsp36, 160(r31)
+; AIX-NEXT:    stxvp vsp34, 128(r31)
+; AIX-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; AIX-NEXT:    stxvp vsp36, 96(r31)
+; AIX-NEXT:    stxvp vsp34, 64(r31)
+; AIX-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; AIX-NEXT:    stxvp vsp36, 32(r31)
+; AIX-NEXT:    stxvp vsp34, 0(r31)
+; AIX-NEXT:    ld r31, 376(r1) # 8-byte Folded Reload
+; AIX-NEXT:    addi r1, r1, 384
+; AIX-NEXT:    ld r0, 16(r1)
+; AIX-NEXT:    mtlr r0
+; AIX-NEXT:    blr
+;
+; AIX32-LABEL: test_dmsha3hash:
+; AIX32:       # %bb.0: # %entry
+; AIX32-NEXT:    mflr r0
+; AIX32-NEXT:    stw r0, 8(r1)
+; AIX32-NEXT:    stwu r1, -336(r1)
+; AIX32-NEXT:    stw r31, 332(r1) # 4-byte Folded Spill
+; AIX32-NEXT:    lxvp vsp34, 224(r3)
+; AIX32-NEXT:    lxvp vsp36, 192(r3)
+; AIX32-NEXT:    mr r31, r4
+; AIX32-NEXT:    dmxxinstdmr512 wacc_hi1, vsp36, vsp34, 1
+; AIX32-NEXT:    lxvp vsp34, 160(r3)
+; AIX32-NEXT:    lxvp vsp36, 128(r3)
+; AIX32-NEXT:    dmxxinstdmr512 wacc1, vsp36, vsp34, 0
+; AIX32-NEXT:    lxvp vsp34, 96(r3)
+; AIX32-NEXT:    lxvp vsp36, 64(r3)
+; AIX32-NEXT:    dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
+; AIX32-NEXT:    lxvp vsp34, 32(r3)
+; AIX32-NEXT:    lxvp vsp36, 0(r3)
+; AIX32-NEXT:    dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; AIX32-NEXT:    dmsha3hash dmrp0, 5
+; AIX32-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc1, 0
+; AIX32-NEXT:    stxvp vsp36, 64(r1)
+; AIX32-NEXT:    stxvp vsp34, 96(r1)
+; AIX32-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc_hi1, 1
+; AIX32-NEXT:    stxvp vsp36, 128(r1)
+; AIX32-NEXT:    stxvp vsp34, 160(r1)
+; AIX32-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; AIX32-NEXT:    stxvp vsp36, 192(r1)
+; AIX32-NEXT:    stxvp vsp34, 224(r1)
+; AIX32-NEXT:    dmxxextfdmr512 vsp36, vsp34, wacc_hi0, 1
+; AIX32-NEXT:    stxvp vsp36, 256(r1)
+; AIX32-NEXT:    stxvp vsp34, 288(r1)
+; AIX32-NEXT:    bl .dummy_func[PR]
+; AIX32-NEXT:    nop
+; AIX32-NEXT:    lxvp vsp34, 64(r1)
+; AIX32-NEXT:    lxvp vsp36, 96(r1)
+; AIX32-NEXT:    dmxxinstdmr512 wacc1, vsp34, vsp36, 0
+; AIX32-NEXT:    lxvp vsp34, 128(r1)
+; AIX32-NEXT:    lxvp vsp36, 160(r1)
+; AIX32-NEXT:    dmxxinstdmr512 wacc_hi1, vsp34, vsp36, 1
+; AIX32-NEXT:    lxvp vsp34, 192(r1)
+; AIX32-NEXT:    lxvp vsp36, 224(r1)
+; AIX32-NEXT:    dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; AIX32-NEXT:    lxvp vsp34, 256(r1)
+; AIX32-NEXT:    lxvp vsp36, 288(r1)
+; AIX32-NEXT:    dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
+; AIX32-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi1, 1
+; AIX32-NEXT:    stxvp vsp36, 224(r31)
+; AIX32-NEXT:    stxvp vsp34, 192(r31)
+; AIX32-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc1, 0
+; AIX32-NEXT:    stxvp vsp36, 160(r31)
+; AIX32-NEXT:    stxvp vsp34, 128(r31)
+; AIX32-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; AIX32-NEXT:    stxvp vsp36, 96(r31)
+; AIX32-NEXT:    stxvp vsp34, 64(r31)
+; AIX32-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; AIX32-NEXT:    stxvp vsp36, 32(r31)
+; AIX32-NEXT:    stxvp vsp34, 0(r31)
+; AIX32-NEXT:    lwz r31, 332(r1) # 4-byte Folded Reload
+; AIX32-NEXT:    addi r1, r1, 336
+; AIX32-NEXT:    lwz r0, 8(r1)
+; AIX32-NEXT:    mtlr r0
+; AIX32-NEXT:    blr
+  entry:
+    %0 = load <2048 x i1>, ptr %vopp, align 64
+    %2 = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> %0, i32 5)
+    tail call void @dummy_func()
+    %3 = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> %0, i32 5)
+    store <2048 x i1> %2, ptr %resp, align 64
+    ret void
+}

From 835d3034fe96931cf907537b51b9cdd87b59d3ad Mon Sep 17 00:00:00 2001
From: Tomer Shafir <tomer.shafir8@gmail.com>
Date: Wed, 18 Jun 2025 20:56:33 +0300
Subject: [PATCH 0843/1322] [AArch64] improve zero-cycle regmov test (#143680)

- Add a `gpr32` suffix to test name to denote the specific register
class being checked
- Expand `-mtriple=arm64-apple-ios` to `-march=arm64` to broaden the
test context to the generic architecture, as the specific triple is not
required
- Port `bl` match to Linux too via the regex: `{{_?foo}}`
- Advance `-mcpu=cyclone` to the newer M series major `-mcpu=apple-m1`
- Use `-mcpu` so that `-mattr=-zcm` has a real effect
- Add a test that generic arm64 doesn't optimize for ZCM
- Distinguish 4 different assembly layouts: NOTCPU, CPU, NOTATTR, ATTR
- Fix broken test logic, for example: `; NOT: mov [[REG2:w[0-9]+]], w3`
matched `mov w1, w3` then `REG2` captured `w1` but then `; NOT: mov w1,
[[REG2]]` matched by prefix `mov, w1, w19` even though it should have
matched `mov w1, w1`. This change adds explicit matches for all of the
generated copies.
---
 .../AArch64/arm64-zero-cycle-regmov-gpr32.ll  | 45 +++++++++++++++++++
 .../AArch64/arm64-zero-cycle-regmov.ll        | 23 ----------
 2 files changed, 45 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr32.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll

diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr32.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr32.ll
new file mode 100644
index 000000000000..5ef6d3e84805
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr32.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s -check-prefixes=NOTCPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm | FileCheck %s -check-prefixes=ATTR --match-full-lines
+
+define void @t(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU: mov w0, w2
+; NOTCPU: mov w1, w3
+; NOTCPU: mov [[REG2:w[0-9]+]], w3
+; NOTCPU: mov [[REG1:w[0-9]+]], w2
+; NOTCPU-NEXT: bl {{_?foo}}
+; NOTCPU: mov w0, [[REG1]]
+; NOTCPU: mov w1, [[REG2]]
+
+; CPU: mov [[REG2:x[0-9]+]], x3
+; CPU: mov [[REG1:x[0-9]+]], x2
+; CPU: mov x0, x2
+; CPU: mov x1, x3
+; CPU-NEXT: bl {{_?foo}}
+; CPU: mov x0, [[REG1]]
+; CPU: mov x1, [[REG2]]
+
+; NOTATTR: mov [[REG2:w[0-9]+]], w3
+; NOTATTR: mov [[REG1:w[0-9]+]], w2
+; NOTATTR: mov w0, w2
+; NOTATTR: mov w1, w3
+; NOTATTR-NEXT: bl {{_?foo}}
+; NOTATTR: mov w0, [[REG1]]
+; NOTATTR: mov w1, [[REG2]]
+
+; ATTR: mov x0, x2
+; ATTR: mov x1, x3
+; ATTR: mov [[REG2:x[0-9]+]], x3
+; ATTR: mov [[REG1:x[0-9]+]], x2
+; ATTR-NEXT: bl {{_?foo}}
+; ATTR: mov x0, [[REG1]]
+; ATTR: mov x1, [[REG2]]
+  %call = call i32 @foo(i32 %c, i32 %d)
+  %call1 = call i32 @foo(i32 %c, i32 %d)
+  unreachable
+}
+
+declare i32 @foo(i32, i32)
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
deleted file mode 100644
index b390853d44bf..000000000000
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=-zcm   | FileCheck %s -check-prefixes=CHECK,NOT
-; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+zcm   | FileCheck %s -check-prefixes=CHECK,YES
-; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=CHECK,YES
-
-; rdar://12254953
-define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind ssp {
-entry:
-; CHECK-LABEL: t:
-; NOT: mov [[REG2:w[0-9]+]], w3
-; NOT: mov [[REG1:w[0-9]+]], w2
-; YES: mov [[REG2:x[0-9]+]], x3
-; YES: mov [[REG1:x[0-9]+]], x2
-; CHECK: bl _foo
-; NOT: mov w0, [[REG1]]
-; NOT: mov w1, [[REG2]]
-; YES: mov x0, [[REG1]]
-; YES: mov x1, [[REG2]]
-  %call = call i32 @foo(i32 %c, i32 %d) nounwind
-  %call1 = call i32 @foo(i32 %c, i32 %d) nounwind
-  unreachable
-}
-
-declare i32 @foo(i32, i32)

From 6f4e4ea17745d1414519651eb4067ce14031ea93 Mon Sep 17 00:00:00 2001
From: sribee8 <sriya.pratipati@gmail.com>
Date: Wed, 18 Jun 2025 10:56:57 -0700
Subject: [PATCH 0844/1322] [libc] Internal getrandom implementation (#144427)

Implemented an internal getrandom to avoid calls to the public one in
table.h

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 libc/src/__support/HashTable/CMakeLists.txt   |  3 +-
 libc/src/__support/HashTable/randomness.h     | 16 ++++-----
 .../src/__support/OSUtil/linux/CMakeLists.txt | 13 +++++++
 libc/src/__support/OSUtil/linux/getrandom.h   | 35 +++++++++++++++++++
 libc/src/sys/random/linux/getrandom.cpp       | 13 ++++---
 5 files changed, 63 insertions(+), 17 deletions(-)
 create mode 100644 libc/src/__support/OSUtil/linux/getrandom.h

diff --git a/libc/src/__support/HashTable/CMakeLists.txt b/libc/src/__support/HashTable/CMakeLists.txt
index a1de0680cc7d..698b8d0dfa68 100644
--- a/libc/src/__support/HashTable/CMakeLists.txt
+++ b/libc/src/__support/HashTable/CMakeLists.txt
@@ -15,7 +15,8 @@ if (NOT ${getrandom_index} EQUAL -1)
   message(STATUS "Using getrandom for hashtable randomness")
   set(randomness_compile_flags -DLIBC_HASHTABLE_USE_GETRANDOM)
   set(randomness_extra_depends
-    libc.src.sys.random.getrandom libc.src.errno.errno)
+    libc.src.__support.OSUtil.linux.getrandom
+    libc.hdr.errno_macros)
 endif()
 
 
diff --git a/libc/src/__support/HashTable/randomness.h b/libc/src/__support/HashTable/randomness.h
index 6b58a4125f78..7e54c9aa6ad1 100644
--- a/libc/src/__support/HashTable/randomness.h
+++ b/libc/src/__support/HashTable/randomness.h
@@ -14,8 +14,8 @@
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #if defined(LIBC_HASHTABLE_USE_GETRANDOM)
-#include "src/__support/libc_errno.h"
-#include "src/sys/random/getrandom.h"
+#include "hdr/errno_macros.h"
+#include "src/__support/OSUtil/linux/getrandom.h"
 #endif
 
 namespace LIBC_NAMESPACE_DECL {
@@ -35,20 +35,18 @@ LIBC_INLINE uint64_t next_random_seed() {
     entropy[0] = reinterpret_cast<uint64_t>(&entropy);
     entropy[1] = reinterpret_cast<uint64_t>(&state);
 #if defined(LIBC_HASHTABLE_USE_GETRANDOM)
-    int errno_backup = libc_errno;
     size_t count = sizeof(entropy);
     uint8_t *buffer = reinterpret_cast<uint8_t *>(entropy);
     while (count > 0) {
-      ssize_t len = getrandom(buffer, count, 0);
-      if (len == -1) {
-        if (libc_errno == ENOSYS)
+      auto len = internal::getrandom(buffer, count, 0);
+      if (!len.has_value()) {
+        if (len.error() == ENOSYS)
           break;
         continue;
       }
-      count -= len;
-      buffer += len;
+      count -= len.value();
+      buffer += len.value();
     }
-    libc_errno = errno_backup;
 #endif
     state.update(&entropy, sizeof(entropy));
   }
diff --git a/libc/src/__support/OSUtil/linux/CMakeLists.txt b/libc/src/__support/OSUtil/linux/CMakeLists.txt
index 4681d8c2bb73..f303e54ce7b3 100644
--- a/libc/src/__support/OSUtil/linux/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/CMakeLists.txt
@@ -24,6 +24,19 @@ add_object_library(
     libc.include.sys_syscall
 )
 
+add_header_library(
+  getrandom
+  HDRS
+    getrandom.h
+  DEPENDS
+    libc.src.__support.OSUtil.osutil
+    libc.src.__support.common
+    libc.src.__support.error_or
+    libc.src.__support.macros.config
+    libc.hdr.types.ssize_t
+    libc.include.sys_syscall
+)
+
 add_header_library(
   vdso_sym
   HDRS
diff --git a/libc/src/__support/OSUtil/linux/getrandom.h b/libc/src/__support/OSUtil/linux/getrandom.h
new file mode 100644
index 000000000000..793639472fee
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/getrandom.h
@@ -0,0 +1,35 @@
+//===------------ Implementation of getrandom function ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_GETRANDOM_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_GETRANDOM_H
+
+#include "hdr/types/ssize_t.h"
+#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include <sys/syscall.h> // For syscall numbers
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+LIBC_INLINE static ErrorOr<ssize_t> getrandom(void *buf, size_t buflen,
+                                              unsigned int flags) {
+  ssize_t ret =
+      LIBC_NAMESPACE::syscall_impl<ssize_t>(SYS_getrandom, buf, buflen, flags);
+  if (ret < 0) {
+    return Error(-static_cast<int>(ret));
+  }
+  return ret;
+}
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_GETRANDOM_H
diff --git a/libc/src/sys/random/linux/getrandom.cpp b/libc/src/sys/random/linux/getrandom.cpp
index 0b8471ed8b37..4a95bddfa428 100644
--- a/libc/src/sys/random/linux/getrandom.cpp
+++ b/libc/src/sys/random/linux/getrandom.cpp
@@ -8,24 +8,23 @@
 
 #include "src/sys/random/getrandom.h"
 
+#include "src/__support/OSUtil/linux/getrandom.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-
+#include "src/__support/error_or.h"
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(ssize_t, getrandom,
                    (void *buf, size_t buflen, unsigned int flags)) {
-  ssize_t ret =
-      LIBC_NAMESPACE::syscall_impl<ssize_t>(SYS_getrandom, buf, buflen, flags);
-  if (ret < 0) {
-    libc_errno = static_cast<int>(-ret);
+  auto rand = internal::getrandom(buf, buflen, flags);
+  if (!rand.has_value()) {
+    libc_errno = static_cast<int>(rand.error());
     return -1;
   }
-  return ret;
+  return rand.value();
 }
 
 } // namespace LIBC_NAMESPACE_DECL

From dfe4d44d8de645d151d3483272c1c1f80c27ab31 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Wed, 18 Jun 2025 11:00:13 -0700
Subject: [PATCH 0845/1322] Revert "[VPlan] Remove unnecessary DomTreeUpdater
 flush (NFC)." (#144758)

This reverts commit 2e337349f436d75af112c081df5ec683871cbcc8.

Causes breakages internally, will post reproducer later.
---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4332332ef5cc..773a5a4a829c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1015,6 +1015,8 @@ void VPlan::execute(VPTransformState *State) {
   for (VPBlockBase *Block : RPOT)
     Block->execute(State);
 
+  State->CFG.DTU.flush();
+
   VPBasicBlock *Header = vputils::getFirstLoopHeader(*this, State->VPDT);
   if (!Header)
     return;

From 071a6feabd7aeec2c1239719f50f6912cf94d00a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 18 Jun 2025 19:02:17 +0100
Subject: [PATCH 0846/1322] [TTI] Remove PPC hasActiveVectorLength impl,
 simplify interface (NFC). (#142310)

PPCTTIImpl defines hasActiveVectorLength and also getVPMemoryOpCost, but
they appear unused (i.e. no changes to tests).

Remove them, as they complicate the interface for hasActiveVectorLength.
This simplifies the only use in LV as now no placeholder values need to
be passed.

PR: https://github.com/llvm/llvm-project/pull/142310
---
 .../llvm/Analysis/TargetTransformInfo.h       |  8 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  5 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  5 +-
 .../Target/PowerPC/PPCTargetTransformInfo.cpp | 82 -------------------
 .../Target/PowerPC/PPCTargetTransformInfo.h   |  6 --
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  2 +-
 .../Target/RISCV/RISCVTargetTransformInfo.h   | 13 +--
 .../Transforms/Vectorize/LoopVectorize.cpp    |  4 +-
 8 files changed, 11 insertions(+), 114 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5e..9dc4eca82492 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1850,11 +1850,9 @@ public:
   /// \name Vector Predication Information
   /// @{
   /// Whether the target supports the %evl parameter of VP intrinsic efficiently
-  /// in hardware, for the given opcode and type/alignment. (see LLVM Language
-  /// Reference - "Vector Predication Intrinsics").
-  /// Use of %evl is discouraged when that is not the case.
-  LLVM_ABI bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
-                                      Align Alignment) const;
+  /// in hardware. (see LLVM Language Reference - "Vector Predication
+  /// Intrinsics"). Use of %evl is discouraged when that is not the case.
+  LLVM_ABI bool hasActiveVectorLength() const;
 
   /// Return true if sinking I's operands to the same basic block as I is
   /// profitable, e.g. because the operands can be folded into a target
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179ba..d93375218394 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1108,10 +1108,7 @@ public:
 
   virtual bool enableScalableVectorization() const { return false; }
 
-  virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
-                                     Align Alignment) const {
-    return false;
-  }
+  virtual bool hasActiveVectorLength() const { return false; }
 
   virtual bool isProfitableToSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2d053e55bdfa..d9cb11de9c09 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1454,9 +1454,8 @@ bool TargetTransformInfo::enableScalableVectorization() const {
   return TTIImpl->enableScalableVectorization();
 }
 
-bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
-                                                Align Alignment) const {
-  return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
+bool TargetTransformInfo::hasActiveVectorLength() const {
+  return TTIImpl->hasActiveVectorLength();
 }
 
 bool TargetTransformInfo::isProfitableToSinkOperands(
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f9e77f2abdca..cd9b226ca82d 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1027,88 +1027,6 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
   return false;
 }
 
-bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
-                                       Align Alignment) const {
-  // Only load and stores instructions can have variable vector length on Power.
-  if (Opcode != Instruction::Load && Opcode != Instruction::Store)
-    return false;
-  // Loads/stores with length instructions use bits 0-7 of the GPR operand and
-  // therefore cannot be used in 32-bit mode.
-  if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
-    return false;
-  if (isa<FixedVectorType>(DataType)) {
-    unsigned VecWidth = DataType->getPrimitiveSizeInBits();
-    return VecWidth == 128;
-  }
-  Type *ScalarTy = DataType->getScalarType();
-
-  if (ScalarTy->isPointerTy())
-    return true;
-
-  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
-    return true;
-
-  if (!ScalarTy->isIntegerTy())
-    return false;
-
-  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
-  return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
-}
-
-InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
-                                              Align Alignment,
-                                              unsigned AddressSpace,
-                                              TTI::TargetCostKind CostKind,
-                                              const Instruction *I) const {
-  InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
-                                                  AddressSpace, CostKind, I);
-  if (TLI->getValueType(DL, Src, true) == MVT::Other)
-    return Cost;
-  // TODO: Handle other cost kinds.
-  if (CostKind != TTI::TCK_RecipThroughput)
-    return Cost;
-
-  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
-         "Invalid Opcode");
-
-  auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
-  assert(SrcVTy && "Expected a vector type for VP memory operations");
-
-  if (hasActiveVectorLength(Opcode, Src, Alignment)) {
-    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
-
-    InstructionCost CostFactor =
-        vectorCostAdjustmentFactor(Opcode, Src, nullptr);
-    if (!CostFactor.isValid())
-      return InstructionCost::getMax();
-
-    InstructionCost Cost = LT.first * CostFactor;
-    assert(Cost.isValid() && "Expected valid cost");
-
-    // On P9 but not on P10, if the op is misaligned then it will cause a
-    // pipeline flush. Otherwise the VSX masked memops cost the same as unmasked
-    // ones.
-    const Align DesiredAlignment(16);
-    if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9)
-      return Cost;
-
-    // Since alignment may be under estimated, we try to compute the probability
-    // that the actual address is aligned to the desired boundary. For example
-    // an 8-byte aligned load is assumed to be actually 16-byte aligned half the
-    // time, while a 4-byte aligned load has a 25% chance of being 16-byte
-    // aligned.
-    float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value();
-    float MisalignmentProb = 1.0 - AlignmentProb;
-    return (MisalignmentProb * P9PipelineFlushEstimate) +
-           (AlignmentProb * Cost.getValue());
-  }
-
-  // Usually we should not get to this point, but the following is an attempt to
-  // model the cost of legalization. Currently we can only lower intrinsics with
-  // evl but no mask, on Power 9/10. Otherwise, we must scalarize.
-  return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
-}
-
 bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
   return TLI->supportsTailCallFor(CB);
 }
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 8618f3064c18..bc5f7a4d06de 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -148,12 +148,6 @@ public:
                            const Function *Callee) const override;
   bool areTypesABICompatible(const Function *Caller, const Function *Callee,
                              const ArrayRef<Type *> &Types) const override;
-  bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
-                             Align Alignment) const override;
-  InstructionCost
-  getVPMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
-                    unsigned AddressSpace, TTI::TargetCostKind CostKind,
-                    const Instruction *I = nullptr) const override;
   bool supportsTailCallFor(const CallBase *CB) const override;
 
 private:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 46e30ce4c18a..63c5f17a8487 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -282,7 +282,7 @@ RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
   return TTI::TCC_Free;
 }
 
-bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
+bool RISCVTTIImpl::hasActiveVectorLength() const {
   return ST->hasVInstructions();
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index dd7e9f7709f8..75d377abb0e7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -90,19 +90,12 @@ public:
 
   /// \name EVL Support for predicated vectorization.
   /// Whether the target supports the %evl parameter of VP intrinsic efficiently
-  /// in hardware, for the given opcode and type/alignment. (see LLVM Language
-  /// Reference - "Vector Predication Intrinsics",
+  /// in hardware. (see LLVM Language Reference - "Vector Predication
+  /// Intrinsics",
   /// https://llvm.org/docs/LangRef.html#vector-predication-intrinsics and
   /// "IR-level VP intrinsics",
   /// https://llvm.org/docs/Proposals/VectorPredication.html#ir-level-vp-intrinsics).
-  /// \param Opcode the opcode of the instruction checked for predicated version
-  /// support.
-  /// \param DataType the type of the instruction with the \p Opcode checked for
-  /// prediction support.
-  /// \param Alignment the alignment for memory access operation checked for
-  /// predicated version support.
-  bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
-                             Align Alignment) const override;
+  bool hasActiveVectorLength() const override;
 
   TargetTransformInfo::PopcntSupportKind
   getPopcntSupport(unsigned TyWidth) const override;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2f4416d2782e..3b16248f962b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1377,11 +1377,9 @@ public:
     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
       return;
     // Override forced styles if needed.
-    // FIXME: use actual opcode/data type for analysis here.
     // FIXME: Investigate opportunity for fixed vector factor.
     bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
-                      TTI.hasActiveVectorLength(0, nullptr, Align()) &&
-                      !EnableVPlanNativePath;
+                      TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
     if (!EVLIsLegal) {
       // If for some reason EVL mode is unsupported, fallback to
       // DataWithoutLaneMask to try to vectorize the loop with folded tail

From 3f3526f36d23eac8d099e8e887a924c94000bbfa Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Wed, 18 Jun 2025 14:15:30 -0400
Subject: [PATCH 0847/1322] [NFC][PowerPC] pre-commit running the
 update_llc_test_checks.py for all-atomics.ll,loop-comment.ll etc (#144411)

Run the update_llc_test_checks.py for all-atomics.ll,loop-comment.ll
,PR35812-neg-cmpxchg.ll (Pre-commit patch for the
https://github.com/llvm/llvm-project/pull/144089)
---
 .../CodeGen/PowerPC/PR35812-neg-cmpxchg.ll    |   94 +-
 llvm/test/CodeGen/PowerPC/all-atomics.ll      | 1929 +++++++++--------
 llvm/test/CodeGen/PowerPC/loop-comment.ll     |   14 +-
 3 files changed, 1020 insertions(+), 1017 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll b/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
index 1a8dabc5ad71..dac17dc3225e 100644
--- a/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
+++ b/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
@@ -18,54 +18,54 @@ define signext i32 @main() nounwind {
 ; CHECK-NEXT:    sth 3, 46(1)
 ; CHECK-NEXT:    addi 3, 1, 46
 ; CHECK-NEXT:    lharx 4, 0, 3
-; CHECK-NEXT:    clrlwi  4, 4, 16
-; CHECK-NEXT:    cmplwi  4, 33059
-; CHECK-NEXT:    bne     0, .LBB0_4
-; CHECK-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:    clrlwi 4, 4, 16
+; CHECK-NEXT:    cmplwi 4, 33059
+; CHECK-NEXT:    bne 0, .LBB0_4
+; CHECK-NEXT:  # %bb.1: # %cmpxchg.fencedstore
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    li 4, 234
-; CHECK-NEXT:    .p2align        5
-; CHECK-NEXT:  .LBB0_2:                                # %cmpxchg.trystore
-; CHECK-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB0_2: # %cmpxchg.trystore
+; CHECK-NEXT:    #
 ; CHECK-NEXT:    sthcx. 4, 0, 3
-; CHECK-NEXT:    beq     0, .LBB0_7
-; CHECK-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
-; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    beq 0, .LBB0_7
+; CHECK-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; CHECK-NEXT:    #
 ; CHECK-NEXT:    lharx 5, 0, 3
-; CHECK-NEXT:    clrlwi  5, 5, 16
-; CHECK-NEXT:    cmplwi  5, 33059
-; CHECK-NEXT:    beq     0, .LBB0_2
-; CHECK-NEXT:  .LBB0_4:                                # %cmpxchg.nostore
+; CHECK-NEXT:    clrlwi 5, 5, 16
+; CHECK-NEXT:    cmplwi 5, 33059
+; CHECK-NEXT:    beq 0, .LBB0_2
+; CHECK-NEXT:  .LBB0_4: # %cmpxchg.nostore
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    b .LBB0_8
-; CHECK-NEXT:  .LBB0_5:                                # %L.B0000
+; CHECK-NEXT:  .LBB0_5: # %L.B0000
 ; CHECK-NEXT:    lhz 3, 46(1)
-; CHECK-NEXT:    cmplwi  3, 234
-; CHECK-NEXT:    bne     0, .LBB0_9
-; CHECK-NEXT:  # %bb.6:                                # %L.B0001
+; CHECK-NEXT:    cmplwi 3, 234
+; CHECK-NEXT:    bne 0, .LBB0_9
+; CHECK-NEXT:  # %bb.6: # %L.B0001
 ; CHECK-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-NEXT:    bl puts
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    li 3, 0
 ; CHECK-NEXT:    b .LBB0_11
-; CHECK-NEXT:  .LBB0_7:                                # %cmpxchg.success
+; CHECK-NEXT:  .LBB0_7: # %cmpxchg.success
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    b .LBB0_5
-; CHECK-NEXT:  .LBB0_8:                                # %L.B0003
+; CHECK-NEXT:  .LBB0_8: # %L.B0003
 ; CHECK-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-NEXT:    addi 3, 3, 16
 ; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_9:                                # %L.B0005
+; CHECK-NEXT:  .LBB0_9: # %L.B0005
 ; CHECK-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-NEXT:    addi 3, 3, 64
-; CHECK-NEXT:  .LBB0_10:                               # %L.B0003
+; CHECK-NEXT:  .LBB0_10: # %L.B0003
 ; CHECK-NEXT:    bl puts
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:  .LBB0_11:                               # %L.B0003
+; CHECK-NEXT:  .LBB0_11: # %L.B0003
 ; CHECK-NEXT:    addi 1, 1, 48
 ; CHECK-NEXT:    ld 0, 16(1)
 ; CHECK-NEXT:    mtlr 0
@@ -83,62 +83,62 @@ define signext i32 @main() nounwind {
 ; CHECK-P7-NEXT:    rlwinm 4, 4, 3, 27, 27
 ; CHECK-P7-NEXT:    lwarx 5, 0, 3
 ; CHECK-P7-NEXT:    srw 6, 5, 4
-; CHECK-P7-NEXT:    clrlwi  6, 6, 16
-; CHECK-P7-NEXT:    cmplwi  6, 33059
-; CHECK-P7-NEXT:    bne     0, .LBB0_4
-; CHECK-P7-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
+; CHECK-P7-NEXT:    clrlwi 6, 6, 16
+; CHECK-P7-NEXT:    cmplwi 6, 33059
+; CHECK-P7-NEXT:    bne 0, .LBB0_4
+; CHECK-P7-NEXT:  # %bb.1: # %cmpxchg.fencedstore
 ; CHECK-P7-NEXT:    lis 6, 0
 ; CHECK-P7-NEXT:    li 7, 234
 ; CHECK-P7-NEXT:    sync
 ; CHECK-P7-NEXT:    ori 6, 6, 65535
 ; CHECK-P7-NEXT:    slw 7, 7, 4
 ; CHECK-P7-NEXT:    slw 6, 6, 4
-; CHECK-P7-NEXT:    not     6, 6
-; CHECK-P7-NEXT:    .p2align        4
-; CHECK-P7-NEXT:  .LBB0_2:                                # %cmpxchg.trystore
-; CHECK-P7-NEXT:                                        # =>This Inner Loop Header: Depth=1
+; CHECK-P7-NEXT:    not 6, 6
+; CHECK-P7-NEXT:    .p2align 4
+; CHECK-P7-NEXT:  .LBB0_2: # %cmpxchg.trystore
+; CHECK-P7-NEXT:    #
 ; CHECK-P7-NEXT:    and 5, 5, 6
 ; CHECK-P7-NEXT:    or 5, 5, 7
 ; CHECK-P7-NEXT:    stwcx. 5, 0, 3
-; CHECK-P7-NEXT:    beq     0, .LBB0_7
-; CHECK-P7-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
-; CHECK-P7-NEXT:                                        #   in Loop: Header=BB0_2 Depth=1
+; CHECK-P7-NEXT:    beq 0, .LBB0_7
+; CHECK-P7-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; CHECK-P7-NEXT:    #
 ; CHECK-P7-NEXT:    lwarx 5, 0, 3
 ; CHECK-P7-NEXT:    srw 8, 5, 4
-; CHECK-P7-NEXT:    clrlwi  8, 8, 16
-; CHECK-P7-NEXT:    cmplwi  8, 33059
-; CHECK-P7-NEXT:    beq     0, .LBB0_2
-; CHECK-P7-NEXT:  .LBB0_4:                                # %cmpxchg.nostore
+; CHECK-P7-NEXT:    clrlwi 8, 8, 16
+; CHECK-P7-NEXT:    cmplwi 8, 33059
+; CHECK-P7-NEXT:    beq 0, .LBB0_2
+; CHECK-P7-NEXT:  .LBB0_4: # %cmpxchg.nostore
 ; CHECK-P7-NEXT:    lwsync
 ; CHECK-P7-NEXT:    b .LBB0_8
-; CHECK-P7-NEXT:  .LBB0_5:                                # %L.B0000
+; CHECK-P7-NEXT:  .LBB0_5: # %L.B0000
 ; CHECK-P7-NEXT:    lhz 3, 46(1)
-; CHECK-P7-NEXT:    cmplwi  3, 234
-; CHECK-P7-NEXT:    bne     0, .LBB0_9
-; CHECK-P7-NEXT:  # %bb.6:                                # %L.B0001
+; CHECK-P7-NEXT:    cmplwi 3, 234
+; CHECK-P7-NEXT:    bne 0, .LBB0_9
+; CHECK-P7-NEXT:  # %bb.6: # %L.B0001
 ; CHECK-P7-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-P7-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-P7-NEXT:    bl puts
 ; CHECK-P7-NEXT:    nop
 ; CHECK-P7-NEXT:    li 3, 0
 ; CHECK-P7-NEXT:    b .LBB0_11
-; CHECK-P7-NEXT:  .LBB0_7:                                # %cmpxchg.success
+; CHECK-P7-NEXT:  .LBB0_7: # %cmpxchg.success
 ; CHECK-P7-NEXT:    lwsync
 ; CHECK-P7-NEXT:    b .LBB0_5
-; CHECK-P7-NEXT:  .LBB0_8:                                # %L.B0003
+; CHECK-P7-NEXT:  .LBB0_8: # %L.B0003
 ; CHECK-P7-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-P7-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-P7-NEXT:    addi 3, 3, 16
 ; CHECK-P7-NEXT:    b .LBB0_10
-; CHECK-P7-NEXT:  .LBB0_9:                                # %L.B0005
+; CHECK-P7-NEXT:  .LBB0_9: # %L.B0005
 ; CHECK-P7-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-P7-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-P7-NEXT:    addi 3, 3, 64
-; CHECK-P7-NEXT:  .LBB0_10:                               # %L.B0003
+; CHECK-P7-NEXT:  .LBB0_10: # %L.B0003
 ; CHECK-P7-NEXT:    bl puts
 ; CHECK-P7-NEXT:    nop
 ; CHECK-P7-NEXT:    li 3, 1
-; CHECK-P7-NEXT:  .LBB0_11:                               # %L.B0003
+; CHECK-P7-NEXT:  .LBB0_11: # %L.B0003
 ; CHECK-P7-NEXT:    addi 1, 1, 48
 ; CHECK-P7-NEXT:    ld 0, 16(1)
 ; CHECK-P7-NEXT:    mtlr 0
diff --git a/llvm/test/CodeGen/PowerPC/all-atomics.ll b/llvm/test/CodeGen/PowerPC/all-atomics.ll
index 67cee358882f..5e14fbbb6ad6 100644
--- a/llvm/test/CodeGen/PowerPC/all-atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/all-atomics.ll
@@ -4336,959 +4336,959 @@ entry:
 define dso_local void @test_compare_and_swap() local_unnamed_addr #0 {
 ; CHECK-LABEL: test_compare_and_swap:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:   addis 4, 2, sc@toc@ha
-; CHECK-NEXT:   addis 3, 2, uc@toc@ha
-; CHECK-NEXT:   std 27, -40(1)                          # 8-byte Folded Spill
-; CHECK-NEXT:   std 28, -32(1)                          # 8-byte Folded Spill
-; CHECK-NEXT:   std 29, -24(1)                          # 8-byte Folded Spill
-; CHECK-NEXT:   std 30, -16(1)                          # 8-byte Folded Spill
-; CHECK-NEXT:   addi 6, 4, sc@toc@l
-; CHECK-NEXT:   lbz 7, uc@toc@l(3)
-; CHECK-NEXT:   lbz 8, sc@toc@l(4)
-; CHECK-NEXT:   lbarx 5, 0, 6
-; CHECK-NEXT:   clrlwi  9, 5, 24
-; CHECK-NEXT:   cmplw   9, 7
-; CHECK-NEXT:   bne     0, .LBB3_4
-; CHECK-NEXT: # %bb.1:                                # %cmpxchg.fencedstore276
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_2:                                # %cmpxchg.trystore275
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stbcx. 8, 0, 6
-; CHECK-NEXT:   beq     0, .LBB3_4
-; CHECK-NEXT: # %bb.3:                                # %cmpxchg.releasedload274
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_2 Depth=1
-; CHECK-NEXT:   lbarx 5, 0, 6
-; CHECK-NEXT:   clrlwi  9, 5, 24
-; CHECK-NEXT:   cmplw   9, 7
-; CHECK-NEXT:   beq     0, .LBB3_2
-; CHECK-NEXT: .LBB3_4:                                # %cmpxchg.nostore272
-; CHECK-NEXT:   addi 7, 3, uc@toc@l
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   stb 5, sc@toc@l(4)
-; CHECK-NEXT:   lbz 9, uc@toc@l(3)
-; CHECK-NEXT:   lbarx 8, 0, 7
-; CHECK-NEXT:   clrlwi  10, 8, 24
-; CHECK-NEXT:   cmplw   10, 9
-; CHECK-NEXT:   bne     0, .LBB3_8
-; CHECK-NEXT: # %bb.5:                                # %cmpxchg.fencedstore257
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   clrlwi  5, 5, 24
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_6:                                # %cmpxchg.trystore256
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stbcx. 5, 0, 7
-; CHECK-NEXT:   beq     0, .LBB3_8
-; CHECK-NEXT: # %bb.7:                                # %cmpxchg.releasedload255
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_6 Depth=1
-; CHECK-NEXT:   lbarx 8, 0, 7
-; CHECK-NEXT:   clrlwi  10, 8, 24
-; CHECK-NEXT:   cmplw   10, 9
-; CHECK-NEXT:   beq     0, .LBB3_6
-; CHECK-NEXT: .LBB3_8:                                # %cmpxchg.nostore253
-; CHECK-NEXT:   addis 5, 2, ss@toc@ha
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   stb 8, uc@toc@l(3)
-; CHECK-NEXT:   clrlwi  10, 8, 24
-; CHECK-NEXT:   lbz 11, sc@toc@l(4)
-; CHECK-NEXT:   addi 8, 5, ss@toc@l
-; CHECK-NEXT:   lharx 9, 0, 8
-; CHECK-NEXT:   clrlwi  12, 9, 16
-; CHECK-NEXT:   cmplw   12, 10
-; CHECK-NEXT:   bne     0, .LBB3_12
-; CHECK-NEXT: # %bb.9:                                # %cmpxchg.fencedstore238
-; CHECK-NEXT:   extsb 11, 11
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   clrlwi  11, 11, 16
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_10:                               # %cmpxchg.trystore237
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   sthcx. 11, 0, 8
-; CHECK-NEXT:   beq     0, .LBB3_12
-; CHECK-NEXT: # %bb.11:                               # %cmpxchg.releasedload236
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_10 Depth=1
-; CHECK-NEXT:   lharx 9, 0, 8
-; CHECK-NEXT:   clrlwi  12, 9, 16
-; CHECK-NEXT:   cmplw   12, 10
-; CHECK-NEXT:   beq     0, .LBB3_10
-; CHECK-NEXT: .LBB3_12:                               # %cmpxchg.nostore234
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   sth 9, ss@toc@l(5)
-; CHECK-NEXT:   addis 5, 2, us@toc@ha
-; CHECK-NEXT:   lbz 11, uc@toc@l(3)
-; CHECK-NEXT:   lbz 12, sc@toc@l(4)
-; CHECK-NEXT:   addi 9, 5, us@toc@l
-; CHECK-NEXT:   lharx 10, 0, 9
-; CHECK-NEXT:   clrlwi  0, 10, 16
-; CHECK-NEXT:   cmplw   0, 11
-; CHECK-NEXT:   bne     0, .LBB3_16
-; CHECK-NEXT: # %bb.13:                               # %cmpxchg.fencedstore219
-; CHECK-NEXT:   extsb 12, 12
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   clrlwi  12, 12, 16
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_14:                               # %cmpxchg.trystore218
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   sthcx. 12, 0, 9
-; CHECK-NEXT:   beq     0, .LBB3_16
-; CHECK-NEXT: # %bb.15:                               # %cmpxchg.releasedload217
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_14 Depth=1
-; CHECK-NEXT:   lharx 10, 0, 9
-; CHECK-NEXT:   clrlwi  0, 10, 16
-; CHECK-NEXT:   cmplw   0, 11
-; CHECK-NEXT:   beq     0, .LBB3_14
-; CHECK-NEXT: .LBB3_16:                               # %cmpxchg.nostore215
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   sth 10, us@toc@l(5)
-; CHECK-NEXT:   addis 5, 2, si@toc@ha
-; CHECK-NEXT:   lbz 12, uc@toc@l(3)
-; CHECK-NEXT:   lbz 0, sc@toc@l(4)
-; CHECK-NEXT:   addi 10, 5, si@toc@l
-; CHECK-NEXT:   lwarx 11, 0, 10
-; CHECK-NEXT:   cmplw   11, 12
-; CHECK-NEXT:   bne     0, .LBB3_20
-; CHECK-NEXT: # %bb.17:                               # %cmpxchg.fencedstore200
-; CHECK-NEXT:   extsb 0, 0
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_18:                               # %cmpxchg.trystore199
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stwcx. 0, 0, 10
-; CHECK-NEXT:   beq     0, .LBB3_20
-; CHECK-NEXT: # %bb.19:                               # %cmpxchg.releasedload198
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_18 Depth=1
-; CHECK-NEXT:   lwarx 11, 0, 10
-; CHECK-NEXT:   cmplw   11, 12
-; CHECK-NEXT:   beq     0, .LBB3_18
-; CHECK-NEXT: .LBB3_20:                               # %cmpxchg.nostore196
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   stw 11, si@toc@l(5)
-; CHECK-NEXT:   addis 5, 2, ui@toc@ha
-; CHECK-NEXT:   lbz 0, uc@toc@l(3)
-; CHECK-NEXT:   lbz 30, sc@toc@l(4)
-; CHECK-NEXT:   addi 11, 5, ui@toc@l
-; CHECK-NEXT:   lwarx 12, 0, 11
-; CHECK-NEXT:   cmplw   12, 0
-; CHECK-NEXT:   bne     0, .LBB3_24
-; CHECK-NEXT: # %bb.21:                               # %cmpxchg.fencedstore181
-; CHECK-NEXT:   extsb 30, 30
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_22:                               # %cmpxchg.trystore180
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stwcx. 30, 0, 11
-; CHECK-NEXT:   beq     0, .LBB3_24
-; CHECK-NEXT: # %bb.23:                               # %cmpxchg.releasedload179
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_22 Depth=1
-; CHECK-NEXT:   lwarx 12, 0, 11
-; CHECK-NEXT:   cmplw   12, 0
-; CHECK-NEXT:   beq     0, .LBB3_22
-; CHECK-NEXT: .LBB3_24:                               # %cmpxchg.nostore177
-; CHECK-NEXT:   addis 30, 2, sll@toc@ha
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   stw 12, ui@toc@l(5)
-; CHECK-NEXT:   lbz 29, uc@toc@l(3)
-; CHECK-NEXT:   lbz 28, sc@toc@l(4)
-; CHECK-NEXT:   addi 12, 30, sll@toc@l
-; CHECK-NEXT:   ldarx 0, 0, 12
-; CHECK-NEXT:   cmpld   0, 29
-; CHECK-NEXT:   bne     0, .LBB3_28
-; CHECK-NEXT: # %bb.25:                               # %cmpxchg.fencedstore162
-; CHECK-NEXT:   extsb 28, 28
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_26:                               # %cmpxchg.trystore161
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stdcx. 28, 0, 12
-; CHECK-NEXT:   beq     0, .LBB3_28
-; CHECK-NEXT: # %bb.27:                               # %cmpxchg.releasedload160
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_26 Depth=1
-; CHECK-NEXT:   ldarx 0, 0, 12
-; CHECK-NEXT:   cmpld   0, 29
-; CHECK-NEXT:   beq     0, .LBB3_26
-; CHECK-NEXT: .LBB3_28:                               # %cmpxchg.nostore158
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   std 0, sll@toc@l(30)
-; CHECK-NEXT:   addis 30, 2, ull@toc@ha
-; CHECK-NEXT:   lbz 28, uc@toc@l(3)
-; CHECK-NEXT:   lbz 27, sc@toc@l(4)
-; CHECK-NEXT:   addi 0, 30, ull@toc@l
-; CHECK-NEXT:   ldarx 29, 0, 0
-; CHECK-NEXT:   cmpld   29, 28
-; CHECK-NEXT:   bne     0, .LBB3_32
-; CHECK-NEXT: # %bb.29:                               # %cmpxchg.fencedstore143
-; CHECK-NEXT:   extsb 27, 27
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_30:                               # %cmpxchg.trystore142
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stdcx. 27, 0, 0
-; CHECK-NEXT:   beq     0, .LBB3_32
-; CHECK-NEXT: # %bb.31:                               # %cmpxchg.releasedload141
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_30 Depth=1
-; CHECK-NEXT:   ldarx 29, 0, 0
-; CHECK-NEXT:   cmpld   29, 28
-; CHECK-NEXT:   beq     0, .LBB3_30
-; CHECK-NEXT: .LBB3_32:                               # %cmpxchg.nostore139
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   std 29, ull@toc@l(30)
-; CHECK-NEXT:   lbz 30, uc@toc@l(3)
-; CHECK-NEXT:   lbz 29, sc@toc@l(4)
-; CHECK-NEXT:   lbarx 28, 0, 6
-; CHECK-NEXT:   clrlwi  28, 28, 24
-; CHECK-NEXT:   cmplw   28, 30
-; CHECK-NEXT:   bne     0, .LBB3_36
-; CHECK-NEXT: # %bb.33:                               # %cmpxchg.fencedstore124
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_34:                               # %cmpxchg.trystore123
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stbcx. 29, 0, 6
-; CHECK-NEXT:   beq     0, .LBB3_37
-; CHECK-NEXT: # %bb.35:                               # %cmpxchg.releasedload122
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_34 Depth=1
-; CHECK-NEXT:   lbarx 28, 0, 6
-; CHECK-NEXT:   clrlwi  28, 28, 24
-; CHECK-NEXT:   cmplw   28, 30
-; CHECK-NEXT:   beq     0, .LBB3_34
-; CHECK-NEXT: .LBB3_36:                               # %cmpxchg.nostore120
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   crxor 20, 20, 20
-; CHECK-NEXT:   b .LBB3_38
-; CHECK-NEXT: .LBB3_37:                               # %cmpxchg.success121
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   creqv 20, 20, 20
-; CHECK-NEXT: .LBB3_38:                               # %cmpxchg.end118
-; CHECK-NEXT:   li 6, 0
-; CHECK-NEXT:   li 30, 1
-; CHECK-NEXT:   isel 6, 30, 6, 20
-; CHECK-NEXT:   lbz 30, sc@toc@l(4)
-; CHECK-NEXT:   stw 6, ui@toc@l(5)
-; CHECK-NEXT:   lbz 6, uc@toc@l(3)
-; CHECK-NEXT:   lbarx 29, 0, 7
-; CHECK-NEXT:   clrlwi  29, 29, 24
-; CHECK-NEXT:   cmplw   29, 6
-; CHECK-NEXT:   bne     0, .LBB3_42
-; CHECK-NEXT: # %bb.39:                               # %cmpxchg.fencedstore105
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_40:                               # %cmpxchg.trystore104
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stbcx. 30, 0, 7
-; CHECK-NEXT:   beq     0, .LBB3_43
-; CHECK-NEXT: # %bb.41:                               # %cmpxchg.releasedload103
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_40 Depth=1
-; CHECK-NEXT:   lbarx 29, 0, 7
-; CHECK-NEXT:   clrlwi  29, 29, 24
-; CHECK-NEXT:   cmplw   29, 6
-; CHECK-NEXT:   beq     0, .LBB3_40
-; CHECK-NEXT: .LBB3_42:                               # %cmpxchg.nostore101
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   crxor 20, 20, 20
-; CHECK-NEXT:   b .LBB3_44
-; CHECK-NEXT: .LBB3_43:                               # %cmpxchg.success102
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   creqv 20, 20, 20
-; CHECK-NEXT: .LBB3_44:                               # %cmpxchg.end99
-; CHECK-NEXT:   li 6, 0
-; CHECK-NEXT:   li 7, 1
-; CHECK-NEXT:   isel 6, 7, 6, 20
-; CHECK-NEXT:   lbz 7, sc@toc@l(4)
-; CHECK-NEXT:   stw 6, ui@toc@l(5)
-; CHECK-NEXT:   lbz 6, uc@toc@l(3)
-; CHECK-NEXT:   lharx 30, 0, 8
-; CHECK-NEXT:   clrlwi  30, 30, 16
-; CHECK-NEXT:   cmplw   30, 6
-; CHECK-NEXT:   bne     0, .LBB3_48
-; CHECK-NEXT: # %bb.45:                               # %cmpxchg.fencedstore86
-; CHECK-NEXT:   extsb 7, 7
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   clrlwi  7, 7, 16
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_46:                               # %cmpxchg.trystore85
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   sthcx. 7, 0, 8
-; CHECK-NEXT:   beq     0, .LBB3_49
-; CHECK-NEXT: # %bb.47:                               # %cmpxchg.releasedload84
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_46 Depth=1
-; CHECK-NEXT:   lharx 30, 0, 8
-; CHECK-NEXT:   clrlwi  30, 30, 16
-; CHECK-NEXT:   cmplw   30, 6
-; CHECK-NEXT:   beq     0, .LBB3_46
-; CHECK-NEXT: .LBB3_48:                               # %cmpxchg.nostore82
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   crxor 20, 20, 20
-; CHECK-NEXT:   b .LBB3_50
-; CHECK-NEXT: .LBB3_49:                               # %cmpxchg.success83
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   creqv 20, 20, 20
-; CHECK-NEXT: .LBB3_50:                               # %cmpxchg.end80
-; CHECK-NEXT:   li 6, 0
-; CHECK-NEXT:   li 7, 1
-; CHECK-NEXT:   isel 6, 7, 6, 20
-; CHECK-NEXT:   lbz 7, sc@toc@l(4)
-; CHECK-NEXT:   stw 6, ui@toc@l(5)
-; CHECK-NEXT:   lbz 6, uc@toc@l(3)
-; CHECK-NEXT:   lharx 8, 0, 9
-; CHECK-NEXT:   clrlwi  8, 8, 16
-; CHECK-NEXT:   cmplw   8, 6
-; CHECK-NEXT:   bne     0, .LBB3_54
-; CHECK-NEXT: # %bb.51:                               # %cmpxchg.fencedstore67
-; CHECK-NEXT:   extsb 7, 7
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   clrlwi  7, 7, 16
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_52:                               # %cmpxchg.trystore66
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   sthcx. 7, 0, 9
-; CHECK-NEXT:   beq     0, .LBB3_55
-; CHECK-NEXT: # %bb.53:                               # %cmpxchg.releasedload65
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_52 Depth=1
-; CHECK-NEXT:   lharx 8, 0, 9
-; CHECK-NEXT:   clrlwi  8, 8, 16
-; CHECK-NEXT:   cmplw   8, 6
-; CHECK-NEXT:   beq     0, .LBB3_52
-; CHECK-NEXT: .LBB3_54:                               # %cmpxchg.nostore63
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   crxor 20, 20, 20
-; CHECK-NEXT:   b .LBB3_56
-; CHECK-NEXT: .LBB3_55:                               # %cmpxchg.success64
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   creqv 20, 20, 20
-; CHECK-NEXT: .LBB3_56:                               # %cmpxchg.end61
-; CHECK-NEXT:   li 6, 0
-; CHECK-NEXT:   li 7, 1
-; CHECK-NEXT:   isel 6, 7, 6, 20
-; CHECK-NEXT:   lbz 7, sc@toc@l(4)
-; CHECK-NEXT:   stw 6, ui@toc@l(5)
-; CHECK-NEXT:   lbz 6, uc@toc@l(3)
-; CHECK-NEXT:   lwarx 8, 0, 10
-; CHECK-NEXT:   cmplw   8, 6
-; CHECK-NEXT:   bne     0, .LBB3_60
-; CHECK-NEXT: # %bb.57:                               # %cmpxchg.fencedstore48
-; CHECK-NEXT:   extsb 7, 7
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_58:                               # %cmpxchg.trystore47
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stwcx. 7, 0, 10
-; CHECK-NEXT:   beq     0, .LBB3_61
-; CHECK-NEXT: # %bb.59:                               # %cmpxchg.releasedload46
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_58 Depth=1
-; CHECK-NEXT:   lwarx 8, 0, 10
-; CHECK-NEXT:   cmplw   8, 6
-; CHECK-NEXT:   beq     0, .LBB3_58
-; CHECK-NEXT: .LBB3_60:                               # %cmpxchg.nostore44
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   crxor 20, 20, 20
-; CHECK-NEXT:   b .LBB3_62
-; CHECK-NEXT: .LBB3_61:                               # %cmpxchg.success45
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   creqv 20, 20, 20
-; CHECK-NEXT: .LBB3_62:                               # %cmpxchg.end42
-; CHECK-NEXT:   li 6, 0
-; CHECK-NEXT:   li 7, 1
-; CHECK-NEXT:   isel 6, 7, 6, 20
-; CHECK-NEXT:   lbz 7, sc@toc@l(4)
-; CHECK-NEXT:   stw 6, ui@toc@l(5)
-; CHECK-NEXT:   lbz 6, uc@toc@l(3)
-; CHECK-NEXT:   lwarx 8, 0, 11
-; CHECK-NEXT:   cmplw   8, 6
-; CHECK-NEXT:   bne     0, .LBB3_66
-; CHECK-NEXT: # %bb.63:                               # %cmpxchg.fencedstore29
-; CHECK-NEXT:   extsb 7, 7
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_64:                               # %cmpxchg.trystore28
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stwcx. 7, 0, 11
-; CHECK-NEXT:   beq     0, .LBB3_67
-; CHECK-NEXT: # %bb.65:                               # %cmpxchg.releasedload27
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_64 Depth=1
-; CHECK-NEXT:   lwarx 8, 0, 11
-; CHECK-NEXT:   cmplw   8, 6
-; CHECK-NEXT:   beq     0, .LBB3_64
-; CHECK-NEXT: .LBB3_66:                               # %cmpxchg.nostore25
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   crxor 20, 20, 20
-; CHECK-NEXT:   b .LBB3_68
-; CHECK-NEXT: .LBB3_67:                               # %cmpxchg.success26
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   creqv 20, 20, 20
-; CHECK-NEXT: .LBB3_68:                               # %cmpxchg.end23
-; CHECK-NEXT:   li 6, 0
-; CHECK-NEXT:   li 7, 1
-; CHECK-NEXT:   isel 6, 7, 6, 20
-; CHECK-NEXT:   lbz 7, sc@toc@l(4)
-; CHECK-NEXT:   stw 6, ui@toc@l(5)
-; CHECK-NEXT:   lbz 6, uc@toc@l(3)
-; CHECK-NEXT:   ldarx 8, 0, 12
-; CHECK-NEXT:   cmpld   8, 6
-; CHECK-NEXT:   bne     0, .LBB3_72
-; CHECK-NEXT: # %bb.69:                               # %cmpxchg.fencedstore10
-; CHECK-NEXT:   extsb 7, 7
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_70:                               # %cmpxchg.trystore9
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stdcx. 7, 0, 12
-; CHECK-NEXT:   beq     0, .LBB3_73
-; CHECK-NEXT: # %bb.71:                               # %cmpxchg.releasedload8
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_70 Depth=1
-; CHECK-NEXT:   ldarx 8, 0, 12
-; CHECK-NEXT:   cmpld   8, 6
-; CHECK-NEXT:   beq     0, .LBB3_70
-; CHECK-NEXT: .LBB3_72:                               # %cmpxchg.nostore6
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   crxor 20, 20, 20
-; CHECK-NEXT:   b .LBB3_74
-; CHECK-NEXT: .LBB3_73:                               # %cmpxchg.success7
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   creqv 20, 20, 20
-; CHECK-NEXT: .LBB3_74:                               # %cmpxchg.end4
-; CHECK-NEXT:   li 6, 0
-; CHECK-NEXT:   li 7, 1
-; CHECK-NEXT:   lbz 3, uc@toc@l(3)
-; CHECK-NEXT:   lbz 4, sc@toc@l(4)
-; CHECK-NEXT:   isel 6, 7, 6, 20
-; CHECK-NEXT:   stw 6, ui@toc@l(5)
-; CHECK-NEXT:   ldarx 6, 0, 0
-; CHECK-NEXT:   cmpld   6, 3
-; CHECK-NEXT:   bne     0, .LBB3_78
-; CHECK-NEXT: # %bb.75:                               # %cmpxchg.fencedstore
-; CHECK-NEXT:   extsb 4, 4
-; CHECK-NEXT:   sync
-; CHECK-NEXT:   .p2align        5
-; CHECK-NEXT: .LBB3_76:                               # %cmpxchg.trystore
-; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:   stdcx. 4, 0, 0
-; CHECK-NEXT:   beq     0, .LBB3_79
-; CHECK-NEXT: # %bb.77:                               # %cmpxchg.releasedload
-; CHECK-NEXT:                                         #   in Loop: Header=BB3_76 Depth=1
-; CHECK-NEXT:   ldarx 6, 0, 0
-; CHECK-NEXT:   cmpld   6, 3
-; CHECK-NEXT:   beq     0, .LBB3_76
-; CHECK-NEXT: .LBB3_78:                               # %cmpxchg.nostore
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   crxor 20, 20, 20
-; CHECK-NEXT:   b .LBB3_80
-; CHECK-NEXT: .LBB3_79:                               # %cmpxchg.success
-; CHECK-NEXT:   lwsync
-; CHECK-NEXT:   creqv 20, 20, 20
-; CHECK-NEXT: .LBB3_80:                               # %cmpxchg.end
-; CHECK-NEXT:   li 3, 0
-; CHECK-NEXT:   li 4, 1
-; CHECK-NEXT:   ld 30, -16(1)                           # 8-byte Folded Reload
-; CHECK-NEXT:   ld 29, -24(1)                           # 8-byte Folded Reload
-; CHECK-NEXT:   ld 28, -32(1)                           # 8-byte Folded Reload
-; CHECK-NEXT:   ld 27, -40(1)                           # 8-byte Folded Reload
-; CHECK-NEXT:   isel 3, 4, 3, 20
-; CHECK-NEXT:   stw 3, ui@toc@l(5)
-; CHECK-NEXT:   blr
+; CHECK-NEXT:    addis 4, 2, sc@toc@ha
+; CHECK-NEXT:    addis 3, 2, uc@toc@ha
+; CHECK-NEXT:    std 27, -40(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; CHECK-NEXT:    addi 6, 4, sc@toc@l
+; CHECK-NEXT:    lbz 7, uc@toc@l(3)
+; CHECK-NEXT:    lbz 8, sc@toc@l(4)
+; CHECK-NEXT:    lbarx 5, 0, 6
+; CHECK-NEXT:    clrlwi 9, 5, 24
+; CHECK-NEXT:    cmplw 9, 7
+; CHECK-NEXT:    bne 0, .LBB3_4
+; CHECK-NEXT:  # %bb.1: # %cmpxchg.fencedstore276
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_2: # %cmpxchg.trystore275
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stbcx. 8, 0, 6
+; CHECK-NEXT:    beq 0, .LBB3_4
+; CHECK-NEXT:  # %bb.3: # %cmpxchg.releasedload274
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lbarx 5, 0, 6
+; CHECK-NEXT:    clrlwi 9, 5, 24
+; CHECK-NEXT:    cmplw 9, 7
+; CHECK-NEXT:    beq 0, .LBB3_2
+; CHECK-NEXT:  .LBB3_4: # %cmpxchg.nostore272
+; CHECK-NEXT:    addi 7, 3, uc@toc@l
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    stb 5, sc@toc@l(4)
+; CHECK-NEXT:    lbz 9, uc@toc@l(3)
+; CHECK-NEXT:    lbarx 8, 0, 7
+; CHECK-NEXT:    clrlwi 10, 8, 24
+; CHECK-NEXT:    cmplw 10, 9
+; CHECK-NEXT:    bne 0, .LBB3_8
+; CHECK-NEXT:  # %bb.5: # %cmpxchg.fencedstore257
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    clrlwi 5, 5, 24
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_6: # %cmpxchg.trystore256
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stbcx. 5, 0, 7
+; CHECK-NEXT:    beq 0, .LBB3_8
+; CHECK-NEXT:  # %bb.7: # %cmpxchg.releasedload255
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lbarx 8, 0, 7
+; CHECK-NEXT:    clrlwi 10, 8, 24
+; CHECK-NEXT:    cmplw 10, 9
+; CHECK-NEXT:    beq 0, .LBB3_6
+; CHECK-NEXT:  .LBB3_8: # %cmpxchg.nostore253
+; CHECK-NEXT:    addis 5, 2, ss@toc@ha
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    stb 8, uc@toc@l(3)
+; CHECK-NEXT:    clrlwi 10, 8, 24
+; CHECK-NEXT:    lbz 11, sc@toc@l(4)
+; CHECK-NEXT:    addi 8, 5, ss@toc@l
+; CHECK-NEXT:    lharx 9, 0, 8
+; CHECK-NEXT:    clrlwi 12, 9, 16
+; CHECK-NEXT:    cmplw 12, 10
+; CHECK-NEXT:    bne 0, .LBB3_12
+; CHECK-NEXT:  # %bb.9: # %cmpxchg.fencedstore238
+; CHECK-NEXT:    extsb 11, 11
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    clrlwi 11, 11, 16
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_10: # %cmpxchg.trystore237
+; CHECK-NEXT:    #
+; CHECK-NEXT:    sthcx. 11, 0, 8
+; CHECK-NEXT:    beq 0, .LBB3_12
+; CHECK-NEXT:  # %bb.11: # %cmpxchg.releasedload236
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lharx 9, 0, 8
+; CHECK-NEXT:    clrlwi 12, 9, 16
+; CHECK-NEXT:    cmplw 12, 10
+; CHECK-NEXT:    beq 0, .LBB3_10
+; CHECK-NEXT:  .LBB3_12: # %cmpxchg.nostore234
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    sth 9, ss@toc@l(5)
+; CHECK-NEXT:    addis 5, 2, us@toc@ha
+; CHECK-NEXT:    lbz 11, uc@toc@l(3)
+; CHECK-NEXT:    lbz 12, sc@toc@l(4)
+; CHECK-NEXT:    addi 9, 5, us@toc@l
+; CHECK-NEXT:    lharx 10, 0, 9
+; CHECK-NEXT:    clrlwi 0, 10, 16
+; CHECK-NEXT:    cmplw 0, 11
+; CHECK-NEXT:    bne 0, .LBB3_16
+; CHECK-NEXT:  # %bb.13: # %cmpxchg.fencedstore219
+; CHECK-NEXT:    extsb 12, 12
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    clrlwi 12, 12, 16
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_14: # %cmpxchg.trystore218
+; CHECK-NEXT:    #
+; CHECK-NEXT:    sthcx. 12, 0, 9
+; CHECK-NEXT:    beq 0, .LBB3_16
+; CHECK-NEXT:  # %bb.15: # %cmpxchg.releasedload217
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lharx 10, 0, 9
+; CHECK-NEXT:    clrlwi 0, 10, 16
+; CHECK-NEXT:    cmplw 0, 11
+; CHECK-NEXT:    beq 0, .LBB3_14
+; CHECK-NEXT:  .LBB3_16: # %cmpxchg.nostore215
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    sth 10, us@toc@l(5)
+; CHECK-NEXT:    addis 5, 2, si@toc@ha
+; CHECK-NEXT:    lbz 12, uc@toc@l(3)
+; CHECK-NEXT:    lbz 0, sc@toc@l(4)
+; CHECK-NEXT:    addi 10, 5, si@toc@l
+; CHECK-NEXT:    lwarx 11, 0, 10
+; CHECK-NEXT:    cmplw 11, 12
+; CHECK-NEXT:    bne 0, .LBB3_20
+; CHECK-NEXT:  # %bb.17: # %cmpxchg.fencedstore200
+; CHECK-NEXT:    extsb 0, 0
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_18: # %cmpxchg.trystore199
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stwcx. 0, 0, 10
+; CHECK-NEXT:    beq 0, .LBB3_20
+; CHECK-NEXT:  # %bb.19: # %cmpxchg.releasedload198
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lwarx 11, 0, 10
+; CHECK-NEXT:    cmplw 11, 12
+; CHECK-NEXT:    beq 0, .LBB3_18
+; CHECK-NEXT:  .LBB3_20: # %cmpxchg.nostore196
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    stw 11, si@toc@l(5)
+; CHECK-NEXT:    addis 5, 2, ui@toc@ha
+; CHECK-NEXT:    lbz 0, uc@toc@l(3)
+; CHECK-NEXT:    lbz 30, sc@toc@l(4)
+; CHECK-NEXT:    addi 11, 5, ui@toc@l
+; CHECK-NEXT:    lwarx 12, 0, 11
+; CHECK-NEXT:    cmplw 12, 0
+; CHECK-NEXT:    bne 0, .LBB3_24
+; CHECK-NEXT:  # %bb.21: # %cmpxchg.fencedstore181
+; CHECK-NEXT:    extsb 30, 30
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_22: # %cmpxchg.trystore180
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stwcx. 30, 0, 11
+; CHECK-NEXT:    beq 0, .LBB3_24
+; CHECK-NEXT:  # %bb.23: # %cmpxchg.releasedload179
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lwarx 12, 0, 11
+; CHECK-NEXT:    cmplw 12, 0
+; CHECK-NEXT:    beq 0, .LBB3_22
+; CHECK-NEXT:  .LBB3_24: # %cmpxchg.nostore177
+; CHECK-NEXT:    addis 30, 2, sll@toc@ha
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    stw 12, ui@toc@l(5)
+; CHECK-NEXT:    lbz 29, uc@toc@l(3)
+; CHECK-NEXT:    lbz 28, sc@toc@l(4)
+; CHECK-NEXT:    addi 12, 30, sll@toc@l
+; CHECK-NEXT:    ldarx 0, 0, 12
+; CHECK-NEXT:    cmpld 0, 29
+; CHECK-NEXT:    bne 0, .LBB3_28
+; CHECK-NEXT:  # %bb.25: # %cmpxchg.fencedstore162
+; CHECK-NEXT:    extsb 28, 28
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_26: # %cmpxchg.trystore161
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stdcx. 28, 0, 12
+; CHECK-NEXT:    beq 0, .LBB3_28
+; CHECK-NEXT:  # %bb.27: # %cmpxchg.releasedload160
+; CHECK-NEXT:    #
+; CHECK-NEXT:    ldarx 0, 0, 12
+; CHECK-NEXT:    cmpld 0, 29
+; CHECK-NEXT:    beq 0, .LBB3_26
+; CHECK-NEXT:  .LBB3_28: # %cmpxchg.nostore158
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    std 0, sll@toc@l(30)
+; CHECK-NEXT:    addis 30, 2, ull@toc@ha
+; CHECK-NEXT:    lbz 28, uc@toc@l(3)
+; CHECK-NEXT:    lbz 27, sc@toc@l(4)
+; CHECK-NEXT:    addi 0, 30, ull@toc@l
+; CHECK-NEXT:    ldarx 29, 0, 0
+; CHECK-NEXT:    cmpld 29, 28
+; CHECK-NEXT:    bne 0, .LBB3_32
+; CHECK-NEXT:  # %bb.29: # %cmpxchg.fencedstore143
+; CHECK-NEXT:    extsb 27, 27
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_30: # %cmpxchg.trystore142
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stdcx. 27, 0, 0
+; CHECK-NEXT:    beq 0, .LBB3_32
+; CHECK-NEXT:  # %bb.31: # %cmpxchg.releasedload141
+; CHECK-NEXT:    #
+; CHECK-NEXT:    ldarx 29, 0, 0
+; CHECK-NEXT:    cmpld 29, 28
+; CHECK-NEXT:    beq 0, .LBB3_30
+; CHECK-NEXT:  .LBB3_32: # %cmpxchg.nostore139
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    std 29, ull@toc@l(30)
+; CHECK-NEXT:    lbz 30, uc@toc@l(3)
+; CHECK-NEXT:    lbz 29, sc@toc@l(4)
+; CHECK-NEXT:    lbarx 28, 0, 6
+; CHECK-NEXT:    clrlwi 28, 28, 24
+; CHECK-NEXT:    cmplw 28, 30
+; CHECK-NEXT:    bne 0, .LBB3_36
+; CHECK-NEXT:  # %bb.33: # %cmpxchg.fencedstore124
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_34: # %cmpxchg.trystore123
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stbcx. 29, 0, 6
+; CHECK-NEXT:    beq 0, .LBB3_37
+; CHECK-NEXT:  # %bb.35: # %cmpxchg.releasedload122
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lbarx 28, 0, 6
+; CHECK-NEXT:    clrlwi 28, 28, 24
+; CHECK-NEXT:    cmplw 28, 30
+; CHECK-NEXT:    beq 0, .LBB3_34
+; CHECK-NEXT:  .LBB3_36: # %cmpxchg.nostore120
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    crxor 20, 20, 20
+; CHECK-NEXT:    b .LBB3_38
+; CHECK-NEXT:  .LBB3_37: # %cmpxchg.success121
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    creqv 20, 20, 20
+; CHECK-NEXT:  .LBB3_38: # %cmpxchg.end118
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 30, 1
+; CHECK-NEXT:    isel 6, 30, 6, 20
+; CHECK-NEXT:    lbz 30, sc@toc@l(4)
+; CHECK-NEXT:    stw 6, ui@toc@l(5)
+; CHECK-NEXT:    lbz 6, uc@toc@l(3)
+; CHECK-NEXT:    lbarx 29, 0, 7
+; CHECK-NEXT:    clrlwi 29, 29, 24
+; CHECK-NEXT:    cmplw 29, 6
+; CHECK-NEXT:    bne 0, .LBB3_42
+; CHECK-NEXT:  # %bb.39: # %cmpxchg.fencedstore105
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_40: # %cmpxchg.trystore104
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stbcx. 30, 0, 7
+; CHECK-NEXT:    beq 0, .LBB3_43
+; CHECK-NEXT:  # %bb.41: # %cmpxchg.releasedload103
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lbarx 29, 0, 7
+; CHECK-NEXT:    clrlwi 29, 29, 24
+; CHECK-NEXT:    cmplw 29, 6
+; CHECK-NEXT:    beq 0, .LBB3_40
+; CHECK-NEXT:  .LBB3_42: # %cmpxchg.nostore101
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    crxor 20, 20, 20
+; CHECK-NEXT:    b .LBB3_44
+; CHECK-NEXT:  .LBB3_43: # %cmpxchg.success102
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    creqv 20, 20, 20
+; CHECK-NEXT:  .LBB3_44: # %cmpxchg.end99
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 1
+; CHECK-NEXT:    isel 6, 7, 6, 20
+; CHECK-NEXT:    lbz 7, sc@toc@l(4)
+; CHECK-NEXT:    stw 6, ui@toc@l(5)
+; CHECK-NEXT:    lbz 6, uc@toc@l(3)
+; CHECK-NEXT:    lharx 30, 0, 8
+; CHECK-NEXT:    clrlwi 30, 30, 16
+; CHECK-NEXT:    cmplw 30, 6
+; CHECK-NEXT:    bne 0, .LBB3_48
+; CHECK-NEXT:  # %bb.45: # %cmpxchg.fencedstore86
+; CHECK-NEXT:    extsb 7, 7
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    clrlwi 7, 7, 16
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_46: # %cmpxchg.trystore85
+; CHECK-NEXT:    #
+; CHECK-NEXT:    sthcx. 7, 0, 8
+; CHECK-NEXT:    beq 0, .LBB3_49
+; CHECK-NEXT:  # %bb.47: # %cmpxchg.releasedload84
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lharx 30, 0, 8
+; CHECK-NEXT:    clrlwi 30, 30, 16
+; CHECK-NEXT:    cmplw 30, 6
+; CHECK-NEXT:    beq 0, .LBB3_46
+; CHECK-NEXT:  .LBB3_48: # %cmpxchg.nostore82
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    crxor 20, 20, 20
+; CHECK-NEXT:    b .LBB3_50
+; CHECK-NEXT:  .LBB3_49: # %cmpxchg.success83
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    creqv 20, 20, 20
+; CHECK-NEXT:  .LBB3_50: # %cmpxchg.end80
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 1
+; CHECK-NEXT:    isel 6, 7, 6, 20
+; CHECK-NEXT:    lbz 7, sc@toc@l(4)
+; CHECK-NEXT:    stw 6, ui@toc@l(5)
+; CHECK-NEXT:    lbz 6, uc@toc@l(3)
+; CHECK-NEXT:    lharx 8, 0, 9
+; CHECK-NEXT:    clrlwi 8, 8, 16
+; CHECK-NEXT:    cmplw 8, 6
+; CHECK-NEXT:    bne 0, .LBB3_54
+; CHECK-NEXT:  # %bb.51: # %cmpxchg.fencedstore67
+; CHECK-NEXT:    extsb 7, 7
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    clrlwi 7, 7, 16
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_52: # %cmpxchg.trystore66
+; CHECK-NEXT:    #
+; CHECK-NEXT:    sthcx. 7, 0, 9
+; CHECK-NEXT:    beq 0, .LBB3_55
+; CHECK-NEXT:  # %bb.53: # %cmpxchg.releasedload65
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lharx 8, 0, 9
+; CHECK-NEXT:    clrlwi 8, 8, 16
+; CHECK-NEXT:    cmplw 8, 6
+; CHECK-NEXT:    beq 0, .LBB3_52
+; CHECK-NEXT:  .LBB3_54: # %cmpxchg.nostore63
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    crxor 20, 20, 20
+; CHECK-NEXT:    b .LBB3_56
+; CHECK-NEXT:  .LBB3_55: # %cmpxchg.success64
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    creqv 20, 20, 20
+; CHECK-NEXT:  .LBB3_56: # %cmpxchg.end61
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 1
+; CHECK-NEXT:    isel 6, 7, 6, 20
+; CHECK-NEXT:    lbz 7, sc@toc@l(4)
+; CHECK-NEXT:    stw 6, ui@toc@l(5)
+; CHECK-NEXT:    lbz 6, uc@toc@l(3)
+; CHECK-NEXT:    lwarx 8, 0, 10
+; CHECK-NEXT:    cmplw 8, 6
+; CHECK-NEXT:    bne 0, .LBB3_60
+; CHECK-NEXT:  # %bb.57: # %cmpxchg.fencedstore48
+; CHECK-NEXT:    extsb 7, 7
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_58: # %cmpxchg.trystore47
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stwcx. 7, 0, 10
+; CHECK-NEXT:    beq 0, .LBB3_61
+; CHECK-NEXT:  # %bb.59: # %cmpxchg.releasedload46
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lwarx 8, 0, 10
+; CHECK-NEXT:    cmplw 8, 6
+; CHECK-NEXT:    beq 0, .LBB3_58
+; CHECK-NEXT:  .LBB3_60: # %cmpxchg.nostore44
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    crxor 20, 20, 20
+; CHECK-NEXT:    b .LBB3_62
+; CHECK-NEXT:  .LBB3_61: # %cmpxchg.success45
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    creqv 20, 20, 20
+; CHECK-NEXT:  .LBB3_62: # %cmpxchg.end42
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 1
+; CHECK-NEXT:    isel 6, 7, 6, 20
+; CHECK-NEXT:    lbz 7, sc@toc@l(4)
+; CHECK-NEXT:    stw 6, ui@toc@l(5)
+; CHECK-NEXT:    lbz 6, uc@toc@l(3)
+; CHECK-NEXT:    lwarx 8, 0, 11
+; CHECK-NEXT:    cmplw 8, 6
+; CHECK-NEXT:    bne 0, .LBB3_66
+; CHECK-NEXT:  # %bb.63: # %cmpxchg.fencedstore29
+; CHECK-NEXT:    extsb 7, 7
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_64: # %cmpxchg.trystore28
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stwcx. 7, 0, 11
+; CHECK-NEXT:    beq 0, .LBB3_67
+; CHECK-NEXT:  # %bb.65: # %cmpxchg.releasedload27
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lwarx 8, 0, 11
+; CHECK-NEXT:    cmplw 8, 6
+; CHECK-NEXT:    beq 0, .LBB3_64
+; CHECK-NEXT:  .LBB3_66: # %cmpxchg.nostore25
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    crxor 20, 20, 20
+; CHECK-NEXT:    b .LBB3_68
+; CHECK-NEXT:  .LBB3_67: # %cmpxchg.success26
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    creqv 20, 20, 20
+; CHECK-NEXT:  .LBB3_68: # %cmpxchg.end23
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 1
+; CHECK-NEXT:    isel 6, 7, 6, 20
+; CHECK-NEXT:    lbz 7, sc@toc@l(4)
+; CHECK-NEXT:    stw 6, ui@toc@l(5)
+; CHECK-NEXT:    lbz 6, uc@toc@l(3)
+; CHECK-NEXT:    ldarx 8, 0, 12
+; CHECK-NEXT:    cmpld 8, 6
+; CHECK-NEXT:    bne 0, .LBB3_72
+; CHECK-NEXT:  # %bb.69: # %cmpxchg.fencedstore10
+; CHECK-NEXT:    extsb 7, 7
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_70: # %cmpxchg.trystore9
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stdcx. 7, 0, 12
+; CHECK-NEXT:    beq 0, .LBB3_73
+; CHECK-NEXT:  # %bb.71: # %cmpxchg.releasedload8
+; CHECK-NEXT:    #
+; CHECK-NEXT:    ldarx 8, 0, 12
+; CHECK-NEXT:    cmpld 8, 6
+; CHECK-NEXT:    beq 0, .LBB3_70
+; CHECK-NEXT:  .LBB3_72: # %cmpxchg.nostore6
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    crxor 20, 20, 20
+; CHECK-NEXT:    b .LBB3_74
+; CHECK-NEXT:  .LBB3_73: # %cmpxchg.success7
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    creqv 20, 20, 20
+; CHECK-NEXT:  .LBB3_74: # %cmpxchg.end4
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 1
+; CHECK-NEXT:    lbz 3, uc@toc@l(3)
+; CHECK-NEXT:    lbz 4, sc@toc@l(4)
+; CHECK-NEXT:    isel 6, 7, 6, 20
+; CHECK-NEXT:    stw 6, ui@toc@l(5)
+; CHECK-NEXT:    ldarx 6, 0, 0
+; CHECK-NEXT:    cmpld 6, 3
+; CHECK-NEXT:    bne 0, .LBB3_78
+; CHECK-NEXT:  # %bb.75: # %cmpxchg.fencedstore
+; CHECK-NEXT:    extsb 4, 4
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB3_76: # %cmpxchg.trystore
+; CHECK-NEXT:    #
+; CHECK-NEXT:    stdcx. 4, 0, 0
+; CHECK-NEXT:    beq 0, .LBB3_79
+; CHECK-NEXT:  # %bb.77: # %cmpxchg.releasedload
+; CHECK-NEXT:    #
+; CHECK-NEXT:    ldarx 6, 0, 0
+; CHECK-NEXT:    cmpld 6, 3
+; CHECK-NEXT:    beq 0, .LBB3_76
+; CHECK-NEXT:  .LBB3_78: # %cmpxchg.nostore
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    crxor 20, 20, 20
+; CHECK-NEXT:    b .LBB3_80
+; CHECK-NEXT:  .LBB3_79: # %cmpxchg.success
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    creqv 20, 20, 20
+; CHECK-NEXT:  .LBB3_80: # %cmpxchg.end
+; CHECK-NEXT:    li 3, 0
+; CHECK-NEXT:    li 4, 1
+; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
+; CHECK-NEXT:    isel 3, 4, 3, 20
+; CHECK-NEXT:    stw 3, ui@toc@l(5)
+; CHECK-NEXT:    blr
 ;
 ; AIX32-LABEL: test_compare_and_swap:
 ; AIX32:       # %bb.0: # %entry
 ; AIX32-NEXT:    mflr 0
-; AIX32-NEXT:   stwu 1, -144(1)
-; AIX32-NEXT:   stw 0, 152(1)
-; AIX32-NEXT:   stw 29, 132(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   lwz 29, L..C0(2)                        # @sc
-; AIX32-NEXT:   stw 26, 120(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   not     3, 29
-; AIX32-NEXT:   stw 30, 136(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   lwz 30, L..C1(2)                        # @uc
-; AIX32-NEXT:   lbz 4, 0(30)
-; AIX32-NEXT:   lbz 5, 0(29)
-; AIX32-NEXT:   stw 27, 124(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   rlwinm 27, 29, 0, 0, 29
-; AIX32-NEXT:   stw 14, 72(1)                           # 4-byte Folded Spill
-; AIX32-NEXT:   stw 15, 76(1)                           # 4-byte Folded Spill
-; AIX32-NEXT:   rlwinm 26, 3, 3, 27, 28
-; AIX32-NEXT:   li 3, 255
-; AIX32-NEXT:   slw 3, 3, 26
-; AIX32-NEXT:   stw 16, 80(1)                           # 4-byte Folded Spill
-; AIX32-NEXT:   stw 17, 84(1)                           # 4-byte Folded Spill
-; AIX32-NEXT:   stw 18, 88(1)                           # 4-byte Folded Spill
-; AIX32-NEXT:   stw 19, 92(1)                           # 4-byte Folded Spill
-; AIX32-NEXT:   stw 20, 96(1)                           # 4-byte Folded Spill
-; AIX32-NEXT:   stw 21, 100(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   stw 22, 104(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   stw 23, 108(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   stw 24, 112(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   stw 25, 116(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   stw 28, 128(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   stw 31, 140(1)                          # 4-byte Folded Spill
-; AIX32-NEXT:   not     25, 3
-; AIX32-NEXT:   lwarx 3, 0, 27
-; AIX32-NEXT:   srw 6, 3, 26
-; AIX32-NEXT:   clrlwi  6, 6, 24
-; AIX32-NEXT:   cmplw   6, 4
-; AIX32-NEXT:   bne     0, L..BB3_4
-; AIX32-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore289
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   slw 5, 5, 26
-; AIX32-NEXT:   .align  4
-; AIX32-NEXT:  L..BB3_2:                               # %cmpxchg.trystore288
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   and 6, 3, 25
-; AIX32-NEXT:   or 6, 6, 5
-; AIX32-NEXT:   stwcx. 6, 0, 27
-; AIX32-NEXT:   beq     0, L..BB3_4
-; AIX32-NEXT:  # %bb.3:                                # %cmpxchg.releasedload287
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
-; AIX32-NEXT:   lwarx 3, 0, 27
-; AIX32-NEXT:   srw 6, 3, 26
-; AIX32-NEXT:   clrlwi  6, 6, 24
-; AIX32-NEXT:   cmplw   6, 4
-; AIX32-NEXT:   beq     0, L..BB3_2
-; AIX32-NEXT:  L..BB3_4:                               # %cmpxchg.nostore285
-; AIX32-NEXT:   not     4, 30
-; AIX32-NEXT:   srw 5, 3, 26
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   rlwinm 24, 30, 0, 0, 29
-; AIX32-NEXT:   rlwinm 23, 4, 3, 27, 28
-; AIX32-NEXT:   li 4, 255
-; AIX32-NEXT:   stb 5, 0(29)
-; AIX32-NEXT:   slw 4, 4, 23
-; AIX32-NEXT:   not     22, 4
-; AIX32-NEXT:   lwarx 4, 0, 24
-; AIX32-NEXT:   srw 6, 4, 23
-; AIX32-NEXT:   clrlwi  6, 6, 24
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   bne     0, L..BB3_8
-; AIX32-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore256
-; AIX32-NEXT:   clrlwi  5, 5, 24
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   slw 5, 5, 23
-; AIX32-NEXT:   .align  4
-; AIX32-NEXT:  L..BB3_6:                               # %cmpxchg.trystore255
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   and 6, 4, 22
-; AIX32-NEXT:   or 6, 6, 5
-; AIX32-NEXT:   stwcx. 6, 0, 24
-; AIX32-NEXT:   beq     0, L..BB3_8
-; AIX32-NEXT:  # %bb.7:                                # %cmpxchg.releasedload254
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_6 Depth=1
-; AIX32-NEXT:   lwarx 4, 0, 24
-; AIX32-NEXT:   srw 6, 4, 23
-; AIX32-NEXT:   clrlwi  6, 6, 24
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   beq     0, L..BB3_6
-; AIX32-NEXT:  L..BB3_8:                               # %cmpxchg.nostore252
-; AIX32-NEXT:   srw 4, 4, 23
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   lis 3, 0
-; AIX32-NEXT:   lbz 7, 0(29)
-; AIX32-NEXT:   stb 4, 0(30)
-; AIX32-NEXT:   clrlwi  6, 4, 24
-; AIX32-NEXT:   lwz 4, L..C2(2)                         # @ss
-; AIX32-NEXT:   ori 3, 3, 65535
-; AIX32-NEXT:   clrlwi  5, 4, 30
-; AIX32-NEXT:   rlwinm 21, 4, 0, 0, 29
-; AIX32-NEXT:   xori 5, 5, 2
-; AIX32-NEXT:   slwi 20, 5, 3
-; AIX32-NEXT:   slw 5, 3, 20
-; AIX32-NEXT:   not     19, 5
-; AIX32-NEXT:   lwarx 5, 0, 21
-; AIX32-NEXT:   srw 8, 5, 20
-; AIX32-NEXT:   clrlwi  8, 8, 16
-; AIX32-NEXT:   cmplw   8, 6
-; AIX32-NEXT:   bne     0, L..BB3_12
-; AIX32-NEXT:  # %bb.9:                                # %cmpxchg.fencedstore223
-; AIX32-NEXT:   extsb 7, 7
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   clrlwi  7, 7, 16
-; AIX32-NEXT:   slw 7, 7, 20
-; AIX32-NEXT:   .align  4
-; AIX32-NEXT:  L..BB3_10:                              # %cmpxchg.trystore222
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   and 8, 5, 19
-; AIX32-NEXT:   or 8, 8, 7
-; AIX32-NEXT:   stwcx. 8, 0, 21
-; AIX32-NEXT:   beq     0, L..BB3_12
-; AIX32-NEXT:  # %bb.11:                               # %cmpxchg.releasedload221
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_10 Depth=1
-; AIX32-NEXT:   lwarx 5, 0, 21
-; AIX32-NEXT:   srw 8, 5, 20
-; AIX32-NEXT:   clrlwi  8, 8, 16
-; AIX32-NEXT:   cmplw   8, 6
-; AIX32-NEXT:   beq     0, L..BB3_10
-; AIX32-NEXT:  L..BB3_12:                              # %cmpxchg.nostore219
-; AIX32-NEXT:   srw 5, 5, 20
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   lbz 6, 0(29)
-; AIX32-NEXT:   sth 5, 0(4)
-; AIX32-NEXT:   lwz 4, L..C3(2)                         # @us
-; AIX32-NEXT:   lbz 5, 0(30)
-; AIX32-NEXT:   clrlwi  7, 4, 30
-; AIX32-NEXT:   rlwinm 18, 4, 0, 0, 29
-; AIX32-NEXT:   xori 7, 7, 2
-; AIX32-NEXT:   slwi 17, 7, 3
-; AIX32-NEXT:   slw 3, 3, 17
-; AIX32-NEXT:   not     16, 3
-; AIX32-NEXT:   lwarx 3, 0, 18
-; AIX32-NEXT:   srw 7, 3, 17
-; AIX32-NEXT:   clrlwi  7, 7, 16
-; AIX32-NEXT:   cmplw   7, 5
-; AIX32-NEXT:   bne     0, L..BB3_16
-; AIX32-NEXT:  # %bb.13:                               # %cmpxchg.fencedstore190
-; AIX32-NEXT:   extsb 6, 6
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   clrlwi  6, 6, 16
-; AIX32-NEXT:   slw 6, 6, 17
-; AIX32-NEXT:   .align  4
-; AIX32-NEXT:  L..BB3_14:                              # %cmpxchg.trystore189
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   and 7, 3, 16
-; AIX32-NEXT:   or 7, 7, 6
-; AIX32-NEXT:   stwcx. 7, 0, 18
-; AIX32-NEXT:   beq     0, L..BB3_16
-; AIX32-NEXT:  # %bb.15:                               # %cmpxchg.releasedload188
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_14 Depth=1
-; AIX32-NEXT:   lwarx 3, 0, 18
-; AIX32-NEXT:   srw 7, 3, 17
-; AIX32-NEXT:   clrlwi  7, 7, 16
-; AIX32-NEXT:   cmplw   7, 5
-; AIX32-NEXT:   beq     0, L..BB3_14
-; AIX32-NEXT:  L..BB3_16:                              # %cmpxchg.nostore186
-; AIX32-NEXT:   srw 3, 3, 17
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   lwz 15, L..C4(2)                        # @si
-; AIX32-NEXT:   lbz 5, 0(29)
-; AIX32-NEXT:   sth 3, 0(4)
-; AIX32-NEXT:   lbz 4, 0(30)
-; AIX32-NEXT:   lwarx 3, 0, 15
-; AIX32-NEXT:   cmplw   3, 4
-; AIX32-NEXT:   bne     0, L..BB3_20
-; AIX32-NEXT:  # %bb.17:                               # %cmpxchg.fencedstore171
-; AIX32-NEXT:   extsb 5, 5
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   .align  5
-; AIX32-NEXT:  L..BB3_18:                              # %cmpxchg.trystore170
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   stwcx. 5, 0, 15
-; AIX32-NEXT:   beq     0, L..BB3_20
-; AIX32-NEXT:  # %bb.19:                               # %cmpxchg.releasedload169
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_18 Depth=1
-; AIX32-NEXT:   lwarx 3, 0, 15
-; AIX32-NEXT:   cmplw   3, 4
-; AIX32-NEXT:   beq     0, L..BB3_18
-; AIX32-NEXT:  L..BB3_20:                              # %cmpxchg.nostore167
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   lwz 28, L..C5(2)                        # @ui
-; AIX32-NEXT:   stw 3, 0(15)
-; AIX32-NEXT:   lbz 4, 0(30)
-; AIX32-NEXT:   lbz 5, 0(29)
-; AIX32-NEXT:   lwarx 3, 0, 28
-; AIX32-NEXT:   cmplw   3, 4
-; AIX32-NEXT:   bne     0, L..BB3_24
-; AIX32-NEXT:  # %bb.21:                               # %cmpxchg.fencedstore152
-; AIX32-NEXT:   extsb 5, 5
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   .align  5
-; AIX32-NEXT:  L..BB3_22:                              # %cmpxchg.trystore151
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   stwcx. 5, 0, 28
-; AIX32-NEXT:   beq     0, L..BB3_24
-; AIX32-NEXT:  # %bb.23:                               # %cmpxchg.releasedload150
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_22 Depth=1
-; AIX32-NEXT:   lwarx 3, 0, 28
-; AIX32-NEXT:   cmplw   3, 4
-; AIX32-NEXT:   beq     0, L..BB3_22
-; AIX32-NEXT:  L..BB3_24:                              # %cmpxchg.nostore148
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   stw 3, 0(28)
-; AIX32-NEXT:   lwz 31, L..C6(2)                        # @sll
-; AIX32-NEXT:   lbz 3, 0(29)
-; AIX32-NEXT:   li 14, 0
-; AIX32-NEXT:   addi 4, 1, 64
-; AIX32-NEXT:   li 7, 5
-; AIX32-NEXT:   li 8, 5
-; AIX32-NEXT:   stw 14, 64(1)
-; AIX32-NEXT:   extsb 6, 3
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   srawi 5, 6, 31
-; AIX32-NEXT:   stw 3, 68(1)
-; AIX32-NEXT:   mr      3, 31
-; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:   nop
-; AIX32-NEXT:   lwz 3, 68(1)
-; AIX32-NEXT:   lbz 4, 0(29)
-; AIX32-NEXT:   li 7, 5
-; AIX32-NEXT:   li 8, 5
-; AIX32-NEXT:   stw 3, 4(31)
-; AIX32-NEXT:   lwz 3, 64(1)
-; AIX32-NEXT:   extsb 6, 4
-; AIX32-NEXT:   addi 4, 1, 64
-; AIX32-NEXT:   stw 14, 64(1)
-; AIX32-NEXT:   srawi 5, 6, 31
-; AIX32-NEXT:   stw 3, 0(31)
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   lwz 31, L..C7(2)                        # @ull
-; AIX32-NEXT:   stw 3, 68(1)
-; AIX32-NEXT:   mr      3, 31
-; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:   nop
-; AIX32-NEXT:   lwz 3, 64(1)
-; AIX32-NEXT:   lwz 4, 68(1)
-; AIX32-NEXT:   lbz 5, 0(29)
-; AIX32-NEXT:   stw 4, 4(31)
-; AIX32-NEXT:   stw 3, 0(31)
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   lwarx 4, 0, 27
-; AIX32-NEXT:   srw 6, 4, 26
-; AIX32-NEXT:   clrlwi  6, 6, 24
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   bne     0, L..BB3_28
-; AIX32-NEXT:  # %bb.25:                               # %cmpxchg.fencedstore119
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   slw 5, 5, 26
-; AIX32-NEXT:   .align  4
-; AIX32-NEXT:  L..BB3_26:                              # %cmpxchg.trystore118
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   and 4, 4, 25
-; AIX32-NEXT:   or 4, 4, 5
-; AIX32-NEXT:   stwcx. 4, 0, 27
-; AIX32-NEXT:   beq     0, L..BB3_29
-; AIX32-NEXT:  # %bb.27:                               # %cmpxchg.releasedload117
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_26 Depth=1
-; AIX32-NEXT:   lwarx 4, 0, 27
-; AIX32-NEXT:   srw 6, 4, 26
-; AIX32-NEXT:   clrlwi  6, 6, 24
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   beq     0, L..BB3_26
-; AIX32-NEXT:  L..BB3_28:                              # %cmpxchg.nostore115
-; AIX32-NEXT:   crxor 20, 20, 20
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   b L..BB3_30
-; AIX32-NEXT:  L..BB3_29:                              # %cmpxchg.success116
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   creqv 20, 20, 20
-; AIX32-NEXT:  L..BB3_30:                              # %cmpxchg.end113
-; AIX32-NEXT:   li 3, 0
-; AIX32-NEXT:   li 4, 1
-; AIX32-NEXT:   lbz 5, 0(29)
-; AIX32-NEXT:   isel 3, 4, 3, 20
-; AIX32-NEXT:   stw 3, 0(28)
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   lwarx 4, 0, 24
-; AIX32-NEXT:   srw 6, 4, 23
-; AIX32-NEXT:   clrlwi  6, 6, 24
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   bne     0, L..BB3_34
-; AIX32-NEXT:  # %bb.31:                               # %cmpxchg.fencedstore86
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   slw 5, 5, 23
-; AIX32-NEXT:   .align  4
-; AIX32-NEXT:  L..BB3_32:                              # %cmpxchg.trystore85
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   and 4, 4, 22
-; AIX32-NEXT:   or 4, 4, 5
-; AIX32-NEXT:   stwcx. 4, 0, 24
-; AIX32-NEXT:   beq     0, L..BB3_35
-; AIX32-NEXT:  # %bb.33:                               # %cmpxchg.releasedload84
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_32 Depth=1
-; AIX32-NEXT:   lwarx 4, 0, 24
-; AIX32-NEXT:   srw 6, 4, 23
-; AIX32-NEXT:   clrlwi  6, 6, 24
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   beq     0, L..BB3_32
-; AIX32-NEXT:  L..BB3_34:                              # %cmpxchg.nostore82
-; AIX32-NEXT:   crxor 20, 20, 20
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   b L..BB3_36
-; AIX32-NEXT:  L..BB3_35:                              # %cmpxchg.success83
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   creqv 20, 20, 20
-; AIX32-NEXT:  L..BB3_36:                              # %cmpxchg.end80
-; AIX32-NEXT:   li 3, 0
-; AIX32-NEXT:   li 4, 1
-; AIX32-NEXT:   lbz 5, 0(29)
-; AIX32-NEXT:   isel 3, 4, 3, 20
-; AIX32-NEXT:   stw 3, 0(28)
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   lwarx 4, 0, 21
-; AIX32-NEXT:   srw 6, 4, 20
-; AIX32-NEXT:   clrlwi  6, 6, 16
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   bne     0, L..BB3_40
-; AIX32-NEXT:  # %bb.37:                               # %cmpxchg.fencedstore53
-; AIX32-NEXT:   extsb 5, 5
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   clrlwi  5, 5, 16
-; AIX32-NEXT:   slw 5, 5, 20
-; AIX32-NEXT:   .align  4
-; AIX32-NEXT:  L..BB3_38:                              # %cmpxchg.trystore52
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   and 4, 4, 19
-; AIX32-NEXT:   or 4, 4, 5
-; AIX32-NEXT:   stwcx. 4, 0, 21
-; AIX32-NEXT:   beq     0, L..BB3_41
-; AIX32-NEXT:  # %bb.39:                               # %cmpxchg.releasedload51
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_38 Depth=1
-; AIX32-NEXT:   lwarx 4, 0, 21
-; AIX32-NEXT:   srw 6, 4, 20
-; AIX32-NEXT:   clrlwi  6, 6, 16
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   beq     0, L..BB3_38
-; AIX32-NEXT:  L..BB3_40:                              # %cmpxchg.nostore49
-; AIX32-NEXT:   crxor 20, 20, 20
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   b L..BB3_42
-; AIX32-NEXT:  L..BB3_41:                              # %cmpxchg.success50
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   creqv 20, 20, 20
-; AIX32-NEXT:  L..BB3_42:                              # %cmpxchg.end47
-; AIX32-NEXT:   li 3, 0
-; AIX32-NEXT:   li 4, 1
-; AIX32-NEXT:   lbz 5, 0(29)
-; AIX32-NEXT:   isel 3, 4, 3, 20
-; AIX32-NEXT:   stw 3, 0(28)
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   lwarx 4, 0, 18
-; AIX32-NEXT:   srw 6, 4, 17
-; AIX32-NEXT:   clrlwi  6, 6, 16
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   bne     0, L..BB3_46
-; AIX32-NEXT:  # %bb.43:                               # %cmpxchg.fencedstore29
-; AIX32-NEXT:   extsb 5, 5
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   clrlwi  5, 5, 16
-; AIX32-NEXT:   slw 5, 5, 17
-; AIX32-NEXT:   .align  4
-; AIX32-NEXT:  L..BB3_44:                              # %cmpxchg.trystore28
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   and 4, 4, 16
-; AIX32-NEXT:   or 4, 4, 5
-; AIX32-NEXT:   stwcx. 4, 0, 18
-; AIX32-NEXT:   beq     0, L..BB3_47
-; AIX32-NEXT:  # %bb.45:                               # %cmpxchg.releasedload27
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_44 Depth=1
-; AIX32-NEXT:   lwarx 4, 0, 18
-; AIX32-NEXT:   srw 6, 4, 17
-; AIX32-NEXT:   clrlwi  6, 6, 16
-; AIX32-NEXT:   cmplw   6, 3
-; AIX32-NEXT:   beq     0, L..BB3_44
-; AIX32-NEXT:  L..BB3_46:                              # %cmpxchg.nostore25
-; AIX32-NEXT:   crxor 20, 20, 20
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   b L..BB3_48
-; AIX32-NEXT:  L..BB3_47:                              # %cmpxchg.success26
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   creqv 20, 20, 20
-; AIX32-NEXT:  L..BB3_48:                              # %cmpxchg.end23
-; AIX32-NEXT:   li 3, 0
-; AIX32-NEXT:   li 4, 1
-; AIX32-NEXT:   isel 3, 4, 3, 20
-; AIX32-NEXT:   lbz 4, 0(29)
-; AIX32-NEXT:   stw 3, 0(28)
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   lwarx 5, 0, 15
-; AIX32-NEXT:   cmplw   5, 3
-; AIX32-NEXT:   bne     0, L..BB3_52
-; AIX32-NEXT:  # %bb.49:                               # %cmpxchg.fencedstore10
-; AIX32-NEXT:   extsb 4, 4
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   .align  5
-; AIX32-NEXT:  L..BB3_50:                              # %cmpxchg.trystore9
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   stwcx. 4, 0, 15
-; AIX32-NEXT:   beq     0, L..BB3_53
-; AIX32-NEXT:  # %bb.51:                               # %cmpxchg.releasedload8
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_50 Depth=1
-; AIX32-NEXT:   lwarx 5, 0, 15
-; AIX32-NEXT:   cmplw   5, 3
-; AIX32-NEXT:   beq     0, L..BB3_50
-; AIX32-NEXT:  L..BB3_52:                              # %cmpxchg.nostore6
-; AIX32-NEXT:   crxor 20, 20, 20
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   b L..BB3_54
-; AIX32-NEXT:  L..BB3_53:                              # %cmpxchg.success7
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   creqv 20, 20, 20
-; AIX32-NEXT:  L..BB3_54:                              # %cmpxchg.end4
-; AIX32-NEXT:   li 3, 0
-; AIX32-NEXT:   li 4, 1
-; AIX32-NEXT:   isel 3, 4, 3, 20
-; AIX32-NEXT:   lbz 4, 0(29)
-; AIX32-NEXT:   stw 3, 0(28)
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   lwarx 5, 0, 28
-; AIX32-NEXT:   cmplw   5, 3
-; AIX32-NEXT:   bne     0, L..BB3_58
-; AIX32-NEXT:  # %bb.55:                               # %cmpxchg.fencedstore
-; AIX32-NEXT:   extsb 4, 4
-; AIX32-NEXT:   sync
-; AIX32-NEXT:   .align  5
-; AIX32-NEXT:  L..BB3_56:                              # %cmpxchg.trystore
-; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
-; AIX32-NEXT:   stwcx. 4, 0, 28
-; AIX32-NEXT:   beq     0, L..BB3_59
-; AIX32-NEXT:  # %bb.57:                               # %cmpxchg.releasedload
-; AIX32-NEXT:                                          #   in Loop: Header=BB3_56 Depth=1
-; AIX32-NEXT:   lwarx 5, 0, 28
-; AIX32-NEXT:   cmplw   5, 3
-; AIX32-NEXT:   beq     0, L..BB3_56
-; AIX32-NEXT:  L..BB3_58:                              # %cmpxchg.nostore
-; AIX32-NEXT:   crxor 20, 20, 20
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   b L..BB3_60
-; AIX32-NEXT:  L..BB3_59:                              # %cmpxchg.success
-; AIX32-NEXT:   lwsync
-; AIX32-NEXT:   creqv 20, 20, 20
-; AIX32-NEXT:  L..BB3_60:                              # %cmpxchg.end
-; AIX32-NEXT:   li 3, 1
-; AIX32-NEXT:   li 31, 0
-; AIX32-NEXT:   lbz 4, 0(29)
-; AIX32-NEXT:   isel 3, 3, 31, 20
-; AIX32-NEXT:   li 7, 5
-; AIX32-NEXT:   li 8, 5
-; AIX32-NEXT:   extsb 6, 4
-; AIX32-NEXT:   stw 3, 0(28)
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   addi 4, 1, 64
-; AIX32-NEXT:   stw 31, 64(1)
-; AIX32-NEXT:   srawi 5, 6, 31
-; AIX32-NEXT:   stw 3, 68(1)
-; AIX32-NEXT:   lwz 3, L..C6(2)                         # @sll
-; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:   nop
-; AIX32-NEXT:   lbz 4, 0(29)
-; AIX32-NEXT:   stw 3, 0(28)
-; AIX32-NEXT:   lbz 3, 0(30)
-; AIX32-NEXT:   li 7, 5
-; AIX32-NEXT:   li 8, 5
-; AIX32-NEXT:   extsb 6, 4
-; AIX32-NEXT:   stw 3, 68(1)
-; AIX32-NEXT:   lwz 3, L..C7(2)                         # @ull
-; AIX32-NEXT:   addi 4, 1, 64
-; AIX32-NEXT:   stw 31, 64(1)
-; AIX32-NEXT:   srawi 5, 6, 31
-; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:   nop
-; AIX32-NEXT:   stw 3, 0(28)
-; AIX32-NEXT:   lwz 31, 140(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 30, 136(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 29, 132(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 28, 128(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 27, 124(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 26, 120(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 25, 116(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 24, 112(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 23, 108(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 22, 104(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 21, 100(1)                          # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 20, 96(1)                           # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 19, 92(1)                           # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 18, 88(1)                           # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 17, 84(1)                           # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 16, 80(1)                           # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 15, 76(1)                           # 4-byte Folded Reload
-; AIX32-NEXT:   lwz 14, 72(1)                           # 4-byte Folded Reload
-; AIX32-NEXT:   addi 1, 1, 144
-; AIX32-NEXT:   lwz 0, 8(1)
-; AIX32-NEXT:   mtlr 0
+; AIX32-NEXT:    stwu 1, -144(1)
+; AIX32-NEXT:    stw 0, 152(1)
+; AIX32-NEXT:    stw 29, 132(1) # 4-byte Folded Spill
+; AIX32-NEXT:    lwz 29, L..C0(2) # @sc
+; AIX32-NEXT:    stw 26, 120(1) # 4-byte Folded Spill
+; AIX32-NEXT:    not 3, 29
+; AIX32-NEXT:    stw 30, 136(1) # 4-byte Folded Spill
+; AIX32-NEXT:    lwz 30, L..C1(2) # @uc
+; AIX32-NEXT:    lbz 4, 0(30)
+; AIX32-NEXT:    lbz 5, 0(29)
+; AIX32-NEXT:    stw 27, 124(1) # 4-byte Folded Spill
+; AIX32-NEXT:    rlwinm 27, 29, 0, 0, 29
+; AIX32-NEXT:    stw 14, 72(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 15, 76(1) # 4-byte Folded Spill
+; AIX32-NEXT:    rlwinm 26, 3, 3, 27, 28
+; AIX32-NEXT:    li 3, 255
+; AIX32-NEXT:    slw 3, 3, 26
+; AIX32-NEXT:    stw 16, 80(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 17, 84(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 18, 88(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 19, 92(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 20, 96(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 21, 100(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 22, 104(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 23, 108(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 24, 112(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 25, 116(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 28, 128(1) # 4-byte Folded Spill
+; AIX32-NEXT:    stw 31, 140(1) # 4-byte Folded Spill
+; AIX32-NEXT:    not 25, 3
+; AIX32-NEXT:    lwarx 3, 0, 27
+; AIX32-NEXT:    srw 6, 3, 26
+; AIX32-NEXT:    clrlwi 6, 6, 24
+; AIX32-NEXT:    cmplw 6, 4
+; AIX32-NEXT:    bne 0, L..BB3_4
+; AIX32-NEXT:  # %bb.1: # %cmpxchg.fencedstore289
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    slw 5, 5, 26
+; AIX32-NEXT:    .align 4
+; AIX32-NEXT:  L..BB3_2: # %cmpxchg.trystore288
+; AIX32-NEXT:    #
+; AIX32-NEXT:    and 6, 3, 25
+; AIX32-NEXT:    or 6, 6, 5
+; AIX32-NEXT:    stwcx. 6, 0, 27
+; AIX32-NEXT:    beq 0, L..BB3_4
+; AIX32-NEXT:  # %bb.3: # %cmpxchg.releasedload287
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 3, 0, 27
+; AIX32-NEXT:    srw 6, 3, 26
+; AIX32-NEXT:    clrlwi 6, 6, 24
+; AIX32-NEXT:    cmplw 6, 4
+; AIX32-NEXT:    beq 0, L..BB3_2
+; AIX32-NEXT:  L..BB3_4: # %cmpxchg.nostore285
+; AIX32-NEXT:    not 4, 30
+; AIX32-NEXT:    srw 5, 3, 26
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    rlwinm 24, 30, 0, 0, 29
+; AIX32-NEXT:    rlwinm 23, 4, 3, 27, 28
+; AIX32-NEXT:    li 4, 255
+; AIX32-NEXT:    stb 5, 0(29)
+; AIX32-NEXT:    slw 4, 4, 23
+; AIX32-NEXT:    not 22, 4
+; AIX32-NEXT:    lwarx 4, 0, 24
+; AIX32-NEXT:    srw 6, 4, 23
+; AIX32-NEXT:    clrlwi 6, 6, 24
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    bne 0, L..BB3_8
+; AIX32-NEXT:  # %bb.5: # %cmpxchg.fencedstore256
+; AIX32-NEXT:    clrlwi 5, 5, 24
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    slw 5, 5, 23
+; AIX32-NEXT:    .align 4
+; AIX32-NEXT:  L..BB3_6: # %cmpxchg.trystore255
+; AIX32-NEXT:    #
+; AIX32-NEXT:    and 6, 4, 22
+; AIX32-NEXT:    or 6, 6, 5
+; AIX32-NEXT:    stwcx. 6, 0, 24
+; AIX32-NEXT:    beq 0, L..BB3_8
+; AIX32-NEXT:  # %bb.7: # %cmpxchg.releasedload254
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 4, 0, 24
+; AIX32-NEXT:    srw 6, 4, 23
+; AIX32-NEXT:    clrlwi 6, 6, 24
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    beq 0, L..BB3_6
+; AIX32-NEXT:  L..BB3_8: # %cmpxchg.nostore252
+; AIX32-NEXT:    srw 4, 4, 23
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    lis 3, 0
+; AIX32-NEXT:    lbz 7, 0(29)
+; AIX32-NEXT:    stb 4, 0(30)
+; AIX32-NEXT:    clrlwi 6, 4, 24
+; AIX32-NEXT:    lwz 4, L..C2(2) # @ss
+; AIX32-NEXT:    ori 3, 3, 65535
+; AIX32-NEXT:    clrlwi 5, 4, 30
+; AIX32-NEXT:    rlwinm 21, 4, 0, 0, 29
+; AIX32-NEXT:    xori 5, 5, 2
+; AIX32-NEXT:    slwi 20, 5, 3
+; AIX32-NEXT:    slw 5, 3, 20
+; AIX32-NEXT:    not 19, 5
+; AIX32-NEXT:    lwarx 5, 0, 21
+; AIX32-NEXT:    srw 8, 5, 20
+; AIX32-NEXT:    clrlwi 8, 8, 16
+; AIX32-NEXT:    cmplw 8, 6
+; AIX32-NEXT:    bne 0, L..BB3_12
+; AIX32-NEXT:  # %bb.9: # %cmpxchg.fencedstore223
+; AIX32-NEXT:    extsb 7, 7
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    clrlwi 7, 7, 16
+; AIX32-NEXT:    slw 7, 7, 20
+; AIX32-NEXT:    .align 4
+; AIX32-NEXT:  L..BB3_10: # %cmpxchg.trystore222
+; AIX32-NEXT:    #
+; AIX32-NEXT:    and 8, 5, 19
+; AIX32-NEXT:    or 8, 8, 7
+; AIX32-NEXT:    stwcx. 8, 0, 21
+; AIX32-NEXT:    beq 0, L..BB3_12
+; AIX32-NEXT:  # %bb.11: # %cmpxchg.releasedload221
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 5, 0, 21
+; AIX32-NEXT:    srw 8, 5, 20
+; AIX32-NEXT:    clrlwi 8, 8, 16
+; AIX32-NEXT:    cmplw 8, 6
+; AIX32-NEXT:    beq 0, L..BB3_10
+; AIX32-NEXT:  L..BB3_12: # %cmpxchg.nostore219
+; AIX32-NEXT:    srw 5, 5, 20
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    lbz 6, 0(29)
+; AIX32-NEXT:    sth 5, 0(4)
+; AIX32-NEXT:    lwz 4, L..C3(2) # @us
+; AIX32-NEXT:    lbz 5, 0(30)
+; AIX32-NEXT:    clrlwi 7, 4, 30
+; AIX32-NEXT:    rlwinm 18, 4, 0, 0, 29
+; AIX32-NEXT:    xori 7, 7, 2
+; AIX32-NEXT:    slwi 17, 7, 3
+; AIX32-NEXT:    slw 3, 3, 17
+; AIX32-NEXT:    not 16, 3
+; AIX32-NEXT:    lwarx 3, 0, 18
+; AIX32-NEXT:    srw 7, 3, 17
+; AIX32-NEXT:    clrlwi 7, 7, 16
+; AIX32-NEXT:    cmplw 7, 5
+; AIX32-NEXT:    bne 0, L..BB3_16
+; AIX32-NEXT:  # %bb.13: # %cmpxchg.fencedstore190
+; AIX32-NEXT:    extsb 6, 6
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    clrlwi 6, 6, 16
+; AIX32-NEXT:    slw 6, 6, 17
+; AIX32-NEXT:    .align 4
+; AIX32-NEXT:  L..BB3_14: # %cmpxchg.trystore189
+; AIX32-NEXT:    #
+; AIX32-NEXT:    and 7, 3, 16
+; AIX32-NEXT:    or 7, 7, 6
+; AIX32-NEXT:    stwcx. 7, 0, 18
+; AIX32-NEXT:    beq 0, L..BB3_16
+; AIX32-NEXT:  # %bb.15: # %cmpxchg.releasedload188
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 3, 0, 18
+; AIX32-NEXT:    srw 7, 3, 17
+; AIX32-NEXT:    clrlwi 7, 7, 16
+; AIX32-NEXT:    cmplw 7, 5
+; AIX32-NEXT:    beq 0, L..BB3_14
+; AIX32-NEXT:  L..BB3_16: # %cmpxchg.nostore186
+; AIX32-NEXT:    srw 3, 3, 17
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    lwz 15, L..C4(2) # @si
+; AIX32-NEXT:    lbz 5, 0(29)
+; AIX32-NEXT:    sth 3, 0(4)
+; AIX32-NEXT:    lbz 4, 0(30)
+; AIX32-NEXT:    lwarx 3, 0, 15
+; AIX32-NEXT:    cmplw 3, 4
+; AIX32-NEXT:    bne 0, L..BB3_20
+; AIX32-NEXT:  # %bb.17: # %cmpxchg.fencedstore171
+; AIX32-NEXT:    extsb 5, 5
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    .align 5
+; AIX32-NEXT:  L..BB3_18: # %cmpxchg.trystore170
+; AIX32-NEXT:    #
+; AIX32-NEXT:    stwcx. 5, 0, 15
+; AIX32-NEXT:    beq 0, L..BB3_20
+; AIX32-NEXT:  # %bb.19: # %cmpxchg.releasedload169
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 3, 0, 15
+; AIX32-NEXT:    cmplw 3, 4
+; AIX32-NEXT:    beq 0, L..BB3_18
+; AIX32-NEXT:  L..BB3_20: # %cmpxchg.nostore167
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    lwz 28, L..C5(2) # @ui
+; AIX32-NEXT:    stw 3, 0(15)
+; AIX32-NEXT:    lbz 4, 0(30)
+; AIX32-NEXT:    lbz 5, 0(29)
+; AIX32-NEXT:    lwarx 3, 0, 28
+; AIX32-NEXT:    cmplw 3, 4
+; AIX32-NEXT:    bne 0, L..BB3_24
+; AIX32-NEXT:  # %bb.21: # %cmpxchg.fencedstore152
+; AIX32-NEXT:    extsb 5, 5
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    .align 5
+; AIX32-NEXT:  L..BB3_22: # %cmpxchg.trystore151
+; AIX32-NEXT:    #
+; AIX32-NEXT:    stwcx. 5, 0, 28
+; AIX32-NEXT:    beq 0, L..BB3_24
+; AIX32-NEXT:  # %bb.23: # %cmpxchg.releasedload150
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 3, 0, 28
+; AIX32-NEXT:    cmplw 3, 4
+; AIX32-NEXT:    beq 0, L..BB3_22
+; AIX32-NEXT:  L..BB3_24: # %cmpxchg.nostore148
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    stw 3, 0(28)
+; AIX32-NEXT:    lwz 31, L..C6(2) # @sll
+; AIX32-NEXT:    lbz 3, 0(29)
+; AIX32-NEXT:    li 14, 0
+; AIX32-NEXT:    addi 4, 1, 64
+; AIX32-NEXT:    li 7, 5
+; AIX32-NEXT:    li 8, 5
+; AIX32-NEXT:    stw 14, 64(1)
+; AIX32-NEXT:    extsb 6, 3
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    srawi 5, 6, 31
+; AIX32-NEXT:    stw 3, 68(1)
+; AIX32-NEXT:    mr 3, 31
+; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:    nop
+; AIX32-NEXT:    lwz 3, 68(1)
+; AIX32-NEXT:    lbz 4, 0(29)
+; AIX32-NEXT:    li 7, 5
+; AIX32-NEXT:    li 8, 5
+; AIX32-NEXT:    stw 3, 4(31)
+; AIX32-NEXT:    lwz 3, 64(1)
+; AIX32-NEXT:    extsb 6, 4
+; AIX32-NEXT:    addi 4, 1, 64
+; AIX32-NEXT:    stw 14, 64(1)
+; AIX32-NEXT:    srawi 5, 6, 31
+; AIX32-NEXT:    stw 3, 0(31)
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    lwz 31, L..C7(2) # @ull
+; AIX32-NEXT:    stw 3, 68(1)
+; AIX32-NEXT:    mr 3, 31
+; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:    nop
+; AIX32-NEXT:    lwz 3, 64(1)
+; AIX32-NEXT:    lwz 4, 68(1)
+; AIX32-NEXT:    lbz 5, 0(29)
+; AIX32-NEXT:    stw 4, 4(31)
+; AIX32-NEXT:    stw 3, 0(31)
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    lwarx 4, 0, 27
+; AIX32-NEXT:    srw 6, 4, 26
+; AIX32-NEXT:    clrlwi 6, 6, 24
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    bne 0, L..BB3_28
+; AIX32-NEXT:  # %bb.25: # %cmpxchg.fencedstore119
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    slw 5, 5, 26
+; AIX32-NEXT:    .align 4
+; AIX32-NEXT:  L..BB3_26: # %cmpxchg.trystore118
+; AIX32-NEXT:    #
+; AIX32-NEXT:    and 4, 4, 25
+; AIX32-NEXT:    or 4, 4, 5
+; AIX32-NEXT:    stwcx. 4, 0, 27
+; AIX32-NEXT:    beq 0, L..BB3_29
+; AIX32-NEXT:  # %bb.27: # %cmpxchg.releasedload117
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 4, 0, 27
+; AIX32-NEXT:    srw 6, 4, 26
+; AIX32-NEXT:    clrlwi 6, 6, 24
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    beq 0, L..BB3_26
+; AIX32-NEXT:  L..BB3_28: # %cmpxchg.nostore115
+; AIX32-NEXT:    crxor 20, 20, 20
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    b L..BB3_30
+; AIX32-NEXT:  L..BB3_29: # %cmpxchg.success116
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_30: # %cmpxchg.end113
+; AIX32-NEXT:    li 3, 0
+; AIX32-NEXT:    li 4, 1
+; AIX32-NEXT:    lbz 5, 0(29)
+; AIX32-NEXT:    isel 3, 4, 3, 20
+; AIX32-NEXT:    stw 3, 0(28)
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    lwarx 4, 0, 24
+; AIX32-NEXT:    srw 6, 4, 23
+; AIX32-NEXT:    clrlwi 6, 6, 24
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    bne 0, L..BB3_34
+; AIX32-NEXT:  # %bb.31: # %cmpxchg.fencedstore86
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    slw 5, 5, 23
+; AIX32-NEXT:    .align 4
+; AIX32-NEXT:  L..BB3_32: # %cmpxchg.trystore85
+; AIX32-NEXT:    #
+; AIX32-NEXT:    and 4, 4, 22
+; AIX32-NEXT:    or 4, 4, 5
+; AIX32-NEXT:    stwcx. 4, 0, 24
+; AIX32-NEXT:    beq 0, L..BB3_35
+; AIX32-NEXT:  # %bb.33: # %cmpxchg.releasedload84
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 4, 0, 24
+; AIX32-NEXT:    srw 6, 4, 23
+; AIX32-NEXT:    clrlwi 6, 6, 24
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    beq 0, L..BB3_32
+; AIX32-NEXT:  L..BB3_34: # %cmpxchg.nostore82
+; AIX32-NEXT:    crxor 20, 20, 20
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    b L..BB3_36
+; AIX32-NEXT:  L..BB3_35: # %cmpxchg.success83
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_36: # %cmpxchg.end80
+; AIX32-NEXT:    li 3, 0
+; AIX32-NEXT:    li 4, 1
+; AIX32-NEXT:    lbz 5, 0(29)
+; AIX32-NEXT:    isel 3, 4, 3, 20
+; AIX32-NEXT:    stw 3, 0(28)
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    lwarx 4, 0, 21
+; AIX32-NEXT:    srw 6, 4, 20
+; AIX32-NEXT:    clrlwi 6, 6, 16
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    bne 0, L..BB3_40
+; AIX32-NEXT:  # %bb.37: # %cmpxchg.fencedstore53
+; AIX32-NEXT:    extsb 5, 5
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    clrlwi 5, 5, 16
+; AIX32-NEXT:    slw 5, 5, 20
+; AIX32-NEXT:    .align 4
+; AIX32-NEXT:  L..BB3_38: # %cmpxchg.trystore52
+; AIX32-NEXT:    #
+; AIX32-NEXT:    and 4, 4, 19
+; AIX32-NEXT:    or 4, 4, 5
+; AIX32-NEXT:    stwcx. 4, 0, 21
+; AIX32-NEXT:    beq 0, L..BB3_41
+; AIX32-NEXT:  # %bb.39: # %cmpxchg.releasedload51
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 4, 0, 21
+; AIX32-NEXT:    srw 6, 4, 20
+; AIX32-NEXT:    clrlwi 6, 6, 16
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    beq 0, L..BB3_38
+; AIX32-NEXT:  L..BB3_40: # %cmpxchg.nostore49
+; AIX32-NEXT:    crxor 20, 20, 20
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    b L..BB3_42
+; AIX32-NEXT:  L..BB3_41: # %cmpxchg.success50
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_42: # %cmpxchg.end47
+; AIX32-NEXT:    li 3, 0
+; AIX32-NEXT:    li 4, 1
+; AIX32-NEXT:    lbz 5, 0(29)
+; AIX32-NEXT:    isel 3, 4, 3, 20
+; AIX32-NEXT:    stw 3, 0(28)
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    lwarx 4, 0, 18
+; AIX32-NEXT:    srw 6, 4, 17
+; AIX32-NEXT:    clrlwi 6, 6, 16
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    bne 0, L..BB3_46
+; AIX32-NEXT:  # %bb.43: # %cmpxchg.fencedstore29
+; AIX32-NEXT:    extsb 5, 5
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    clrlwi 5, 5, 16
+; AIX32-NEXT:    slw 5, 5, 17
+; AIX32-NEXT:    .align 4
+; AIX32-NEXT:  L..BB3_44: # %cmpxchg.trystore28
+; AIX32-NEXT:    #
+; AIX32-NEXT:    and 4, 4, 16
+; AIX32-NEXT:    or 4, 4, 5
+; AIX32-NEXT:    stwcx. 4, 0, 18
+; AIX32-NEXT:    beq 0, L..BB3_47
+; AIX32-NEXT:  # %bb.45: # %cmpxchg.releasedload27
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 4, 0, 18
+; AIX32-NEXT:    srw 6, 4, 17
+; AIX32-NEXT:    clrlwi 6, 6, 16
+; AIX32-NEXT:    cmplw 6, 3
+; AIX32-NEXT:    beq 0, L..BB3_44
+; AIX32-NEXT:  L..BB3_46: # %cmpxchg.nostore25
+; AIX32-NEXT:    crxor 20, 20, 20
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    b L..BB3_48
+; AIX32-NEXT:  L..BB3_47: # %cmpxchg.success26
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_48: # %cmpxchg.end23
+; AIX32-NEXT:    li 3, 0
+; AIX32-NEXT:    li 4, 1
+; AIX32-NEXT:    isel 3, 4, 3, 20
+; AIX32-NEXT:    lbz 4, 0(29)
+; AIX32-NEXT:    stw 3, 0(28)
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    lwarx 5, 0, 15
+; AIX32-NEXT:    cmplw 5, 3
+; AIX32-NEXT:    bne 0, L..BB3_52
+; AIX32-NEXT:  # %bb.49: # %cmpxchg.fencedstore10
+; AIX32-NEXT:    extsb 4, 4
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    .align 5
+; AIX32-NEXT:  L..BB3_50: # %cmpxchg.trystore9
+; AIX32-NEXT:    #
+; AIX32-NEXT:    stwcx. 4, 0, 15
+; AIX32-NEXT:    beq 0, L..BB3_53
+; AIX32-NEXT:  # %bb.51: # %cmpxchg.releasedload8
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 5, 0, 15
+; AIX32-NEXT:    cmplw 5, 3
+; AIX32-NEXT:    beq 0, L..BB3_50
+; AIX32-NEXT:  L..BB3_52: # %cmpxchg.nostore6
+; AIX32-NEXT:    crxor 20, 20, 20
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    b L..BB3_54
+; AIX32-NEXT:  L..BB3_53: # %cmpxchg.success7
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_54: # %cmpxchg.end4
+; AIX32-NEXT:    li 3, 0
+; AIX32-NEXT:    li 4, 1
+; AIX32-NEXT:    isel 3, 4, 3, 20
+; AIX32-NEXT:    lbz 4, 0(29)
+; AIX32-NEXT:    stw 3, 0(28)
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    lwarx 5, 0, 28
+; AIX32-NEXT:    cmplw 5, 3
+; AIX32-NEXT:    bne 0, L..BB3_58
+; AIX32-NEXT:  # %bb.55: # %cmpxchg.fencedstore
+; AIX32-NEXT:    extsb 4, 4
+; AIX32-NEXT:    sync
+; AIX32-NEXT:    .align 5
+; AIX32-NEXT:  L..BB3_56: # %cmpxchg.trystore
+; AIX32-NEXT:    #
+; AIX32-NEXT:    stwcx. 4, 0, 28
+; AIX32-NEXT:    beq 0, L..BB3_59
+; AIX32-NEXT:  # %bb.57: # %cmpxchg.releasedload
+; AIX32-NEXT:    #
+; AIX32-NEXT:    lwarx 5, 0, 28
+; AIX32-NEXT:    cmplw 5, 3
+; AIX32-NEXT:    beq 0, L..BB3_56
+; AIX32-NEXT:  L..BB3_58: # %cmpxchg.nostore
+; AIX32-NEXT:    crxor 20, 20, 20
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    b L..BB3_60
+; AIX32-NEXT:  L..BB3_59: # %cmpxchg.success
+; AIX32-NEXT:    lwsync
+; AIX32-NEXT:    creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_60: # %cmpxchg.end
+; AIX32-NEXT:    li 3, 1
+; AIX32-NEXT:    li 31, 0
+; AIX32-NEXT:    lbz 4, 0(29)
+; AIX32-NEXT:    isel 3, 3, 31, 20
+; AIX32-NEXT:    li 7, 5
+; AIX32-NEXT:    li 8, 5
+; AIX32-NEXT:    extsb 6, 4
+; AIX32-NEXT:    stw 3, 0(28)
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    addi 4, 1, 64
+; AIX32-NEXT:    stw 31, 64(1)
+; AIX32-NEXT:    srawi 5, 6, 31
+; AIX32-NEXT:    stw 3, 68(1)
+; AIX32-NEXT:    lwz 3, L..C6(2) # @sll
+; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:    nop
+; AIX32-NEXT:    lbz 4, 0(29)
+; AIX32-NEXT:    stw 3, 0(28)
+; AIX32-NEXT:    lbz 3, 0(30)
+; AIX32-NEXT:    li 7, 5
+; AIX32-NEXT:    li 8, 5
+; AIX32-NEXT:    extsb 6, 4
+; AIX32-NEXT:    stw 3, 68(1)
+; AIX32-NEXT:    lwz 3, L..C7(2) # @ull
+; AIX32-NEXT:    addi 4, 1, 64
+; AIX32-NEXT:    stw 31, 64(1)
+; AIX32-NEXT:    srawi 5, 6, 31
+; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:    nop
+; AIX32-NEXT:    stw 3, 0(28)
+; AIX32-NEXT:    lwz 31, 140(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 30, 136(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 29, 132(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 28, 128(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 27, 124(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 26, 120(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 25, 116(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 24, 112(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 23, 108(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 22, 104(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 21, 100(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 20, 96(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 19, 92(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 18, 88(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 17, 84(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 16, 80(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 15, 76(1) # 4-byte Folded Reload
+; AIX32-NEXT:    lwz 14, 72(1) # 4-byte Folded Reload
+; AIX32-NEXT:    addi 1, 1, 144
+; AIX32-NEXT:    lwz 0, 8(1)
+; AIX32-NEXT:    mtlr 0
 ; AIX32-NEXT:    blr
 entry:
   %0 = load i8, ptr @uc, align 1
@@ -5852,20 +5852,23 @@ entry:
 define dso_local i64 @cmpswplp(ptr noundef %ptr, ptr nocapture noundef readnone %oldval, i64 noundef %newval) local_unnamed_addr #0 {
 ; CHECK-LABEL: cmpswplp:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:   ldarx 4, 0, 3
-; CHECK-NEXT:   cmpld   4, 5
-; CHECK-NEXT:   bne     0, .LBB6_2
-; CHECK-NEXT: # %bb.1:                                # %cmpxchg.fencedstore
-; CHECK-NEXT:   addi 4, 5, 1
-; CHECK-NEXT:   stdcx. 4, 0, 3
-; CHECK-NEXT:   beq     0, .LBB6_4
-; CHECK-NEXT: .LBB6_2:                                # %cmpxchg.failure
-; CHECK-NEXT:   crxor 20, 20, 20
-; CHECK-NEXT: .LBB6_3:                                # %cmpxchg.end
-; CHECK-NEXT:   li 3, 66
-; CHECK-NEXT:   li 4, 55
-; CHECK-NEXT:   isel 3, 4, 3, 20
-; CHECK-NEXT:   blr
+; CHECK-NEXT:    ldarx 4, 0, 3
+; CHECK-NEXT:    cmpld 4, 5
+; CHECK-NEXT:    bne 0, .LBB6_2
+; CHECK-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; CHECK-NEXT:    addi 4, 5, 1
+; CHECK-NEXT:    stdcx. 4, 0, 3
+; CHECK-NEXT:    beq 0, .LBB6_4
+; CHECK-NEXT:  .LBB6_2: # %cmpxchg.failure
+; CHECK-NEXT:    crxor 20, 20, 20
+; CHECK-NEXT:  .LBB6_3: # %cmpxchg.end
+; CHECK-NEXT:    li 3, 66
+; CHECK-NEXT:    li 4, 55
+; CHECK-NEXT:    isel 3, 4, 3, 20
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB6_4:
+; CHECK-NEXT:    creqv 20, 20, 20
+; CHECK-NEXT:    b .LBB6_3
 ;
 ; AIX32-LABEL: cmpswplp:
 ; AIX32:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/PowerPC/loop-comment.ll b/llvm/test/CodeGen/PowerPC/loop-comment.ll
index 1fa9dda51ef9..34b29cbe901e 100644
--- a/llvm/test/CodeGen/PowerPC/loop-comment.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-comment.ll
@@ -6,18 +6,18 @@ define void @test(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:    .p2align        5
-; PPC64LE-NEXT:  .LBB0_1:                                # %cmpxchg.start
-; PPC64LE-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB0_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    clrlwi  6, 6, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
 ; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
-; PPC64LE-NEXT:                                          #   in Loop: Header=BB0_1 Depth=1
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB0_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val monotonic monotonic
   ret void

From 17f5b8b52a3552de1143efb42af6a94d47d8c7fd Mon Sep 17 00:00:00 2001
From: Andre Kuhlenschmidt <andre.kuhlenschmidt@gmail.com>
Date: Wed, 18 Jun 2025 11:21:35 -0700
Subject: [PATCH 0848/1322] [flang][driver] add ability to look up feature
 flags without setting them (#144559)

This just adds some convenience methods to feature control and rewrites
old code in terms of those methods. Also cleans up some names that I
just realize were overloads of another method.
---
 .../include/flang/Support/Fortran-features.h  | 14 ++++++++++-
 flang/lib/Frontend/CompilerInvocation.cpp     |  2 +-
 flang/lib/Support/Fortran-features.cpp        | 23 ++++++++++---------
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h
index ea0845b7d605..39356daa3606 100644
--- a/flang/include/flang/Support/Fortran-features.h
+++ b/flang/include/flang/Support/Fortran-features.h
@@ -81,6 +81,9 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
 
 using LanguageFeatures = EnumSet<LanguageFeature, LanguageFeature_enumSize>;
 using UsageWarnings = EnumSet<UsageWarning, UsageWarning_enumSize>;
+using LanguageFeatureOrWarning = std::variant<LanguageFeature, UsageWarning>;
+using LanguageControlFlag =
+    std::pair<LanguageFeatureOrWarning, /*shouldEnable=*/bool>;
 
 class LanguageFeatureControl {
 public:
@@ -94,6 +97,13 @@ public:
   void EnableWarning(UsageWarning w, bool yes = true) {
     warnUsage_.set(w, yes);
   }
+  void EnableWarning(LanguageFeatureOrWarning flag, bool yes = true) {
+    if (std::holds_alternative<LanguageFeature>(flag)) {
+      EnableWarning(std::get<LanguageFeature>(flag), yes);
+    } else {
+      EnableWarning(std::get<UsageWarning>(flag), yes);
+    }
+  }
   void WarnOnAllNonstandard(bool yes = true);
   bool IsWarnOnAllNonstandard() const { return warnAllLanguage_; }
   void WarnOnAllUsage(bool yes = true);
@@ -116,9 +126,11 @@ public:
   bool ShouldWarn(LanguageFeature f) const { return warnLanguage_.test(f); }
   bool ShouldWarn(UsageWarning w) const { return warnUsage_.test(w); }
   // Cli options
+  // Find a warning by its Cli spelling, i.e. '[no-]warning-name'.
+  std::optional<LanguageControlFlag> FindWarning(std::string_view input);
   // Take a string from the Cli and apply it to the LanguageFeatureControl.
   // Return true if the option was recognized (and hence applied).
-  bool ApplyCliOption(std::string input);
+  bool EnableWarning(std::string_view input);
   // The add and replace functions are not currently used but are provided
   // to allow a flexible many-to-one mapping from Cli spellings to enum values.
   // Taking a string by value because the functions own this string after the
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 147849b0b7d2..2603a3f6dc64 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1011,7 +1011,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
       if (wArg == "error") {
         res.setWarnAsErr(true);
         // -W(no-)<feature>
-      } else if (!features.ApplyCliOption(wArg)) {
+      } else if (!features.EnableWarning(wArg)) {
         const unsigned diagID = diags.getCustomDiagID(
             clang::DiagnosticsEngine::Error, "Unknown diagnostic option: -W%0");
         diags.Report(diagID) << wArg;
diff --git a/flang/lib/Support/Fortran-features.cpp b/flang/lib/Support/Fortran-features.cpp
index 08ded173de51..17b5f8368916 100644
--- a/flang/lib/Support/Fortran-features.cpp
+++ b/flang/lib/Support/Fortran-features.cpp
@@ -151,22 +151,23 @@ LanguageFeatureControl::LanguageFeatureControl() {
   warnLanguage_.set(LanguageFeature::NullActualForAllocatable);
 }
 
-// Take a string from the Cli and apply it to the LanguageFeatureControl.
-bool LanguageFeatureControl::ApplyCliOption(std::string input) {
+std::optional<LanguageControlFlag> LanguageFeatureControl::FindWarning(
+    std::string_view input) {
   bool negated{false};
   if (input.size() > 3 && input.substr(0, 3) == "no-") {
     negated = true;
     input = input.substr(3);
   }
-  if (auto it{cliOptions_.find(input)}; it != cliOptions_.end()) {
-    if (std::holds_alternative<LanguageFeature>(it->second)) {
-      EnableWarning(std::get<LanguageFeature>(it->second), !negated);
-      return true;
-    }
-    if (std::holds_alternative<UsageWarning>(it->second)) {
-      EnableWarning(std::get<UsageWarning>(it->second), !negated);
-      return true;
-    }
+  if (auto it{cliOptions_.find(std::string{input})}; it != cliOptions_.end()) {
+    return std::make_pair(it->second, !negated);
+  }
+  return std::nullopt;
+}
+
+bool LanguageFeatureControl::EnableWarning(std::string_view input) {
+  if (auto warningAndEnabled{FindWarning(input)}) {
+    EnableWarning(warningAndEnabled->first, warningAndEnabled->second);
+    return true;
   }
   return false;
 }

From 8c3fbaf0ee7322e948403d2234a7230bd6137c98 Mon Sep 17 00:00:00 2001
From: "Walter J.T.V" <81811777+eZWALT@users.noreply.github.com>
Date: Wed, 18 Jun 2025 20:52:41 +0200
Subject: [PATCH 0849/1322] [Clang][OpenMP][LoopTransformations] Fix incorrect
 number of generated loops for Tile and Reverse directives (#140532)

This patch is closely related to #139293 and addresses an existing issue
in the loop transformation codebase. Specifically, it corrects the
handling of the `NumGeneratedLoops` variable in
`OMPLoopTransformationDirective` AST nodes and its inheritors (such as
OMPUnrollDirective, OMPTileDirective, etc.).

Previously, this variable was inaccurately set for certain
transformations like reverse or tile. While this did not lead to
functional bugs, since the value was only checked to determine whether
it was greater than zero or equal to zero, the inconsistency could
introduce problems when supporting more complex directives in the
future.
---
 clang/include/clang/AST/StmtOpenMP.h      | 23 +++++++++++++++--------
 clang/lib/AST/StmtOpenMP.cpp              | 11 +++++++----
 clang/lib/Sema/SemaOpenMP.cpp             |  4 ++--
 clang/lib/Serialization/ASTReaderStmt.cpp |  5 ++---
 4 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
index 736bcabbad1f..e2fd2114026f 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -5787,10 +5787,13 @@ class OMPReverseDirective final : public OMPLoopTransformationDirective {
     TransformedStmtOffset,
   };
 
-  explicit OMPReverseDirective(SourceLocation StartLoc, SourceLocation EndLoc)
+  explicit OMPReverseDirective(SourceLocation StartLoc, SourceLocation EndLoc,
+                               unsigned NumLoops)
       : OMPLoopTransformationDirective(OMPReverseDirectiveClass,
                                        llvm::omp::OMPD_reverse, StartLoc,
-                                       EndLoc, 1) {}
+                                       EndLoc, NumLoops) {
+    setNumGeneratedLoops(NumLoops);
+  }
 
   void setPreInits(Stmt *PreInits) {
     Data->getChildren()[PreInitsOffset] = PreInits;
@@ -5806,19 +5809,23 @@ public:
   /// \param C         Context of the AST.
   /// \param StartLoc  Location of the introducer (e.g. the 'omp' token).
   /// \param EndLoc    Location of the directive's end (e.g. the tok::eod).
+  /// \param NumLoops Number of affected loops
   /// \param AssociatedStmt  The outermost associated loop.
   /// \param TransformedStmt The loop nest after tiling, or nullptr in
   ///                        dependent contexts.
   /// \param PreInits   Helper preinits statements for the loop nest.
-  static OMPReverseDirective *
-  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
-         Stmt *AssociatedStmt, Stmt *TransformedStmt, Stmt *PreInits);
+  static OMPReverseDirective *Create(const ASTContext &C,
+                                     SourceLocation StartLoc,
+                                     SourceLocation EndLoc,
+                                     Stmt *AssociatedStmt, unsigned NumLoops,
+                                     Stmt *TransformedStmt, Stmt *PreInits);
 
   /// Build an empty '#pragma omp reverse' AST node for deserialization.
   ///
   /// \param C          Context of the AST.
-  /// \param NumClauses Number of clauses to allocate.
-  static OMPReverseDirective *CreateEmpty(const ASTContext &C);
+  /// \param NumLoops   Number of associated loops to allocate
+  static OMPReverseDirective *CreateEmpty(const ASTContext &C,
+                                          unsigned NumLoops);
 
   /// Gets/sets the associated loops after the transformation, i.e. after
   /// de-sugaring.
@@ -5857,7 +5864,7 @@ class OMPInterchangeDirective final : public OMPLoopTransformationDirective {
       : OMPLoopTransformationDirective(OMPInterchangeDirectiveClass,
                                        llvm::omp::OMPD_interchange, StartLoc,
                                        EndLoc, NumLoops) {
-    setNumGeneratedLoops(3 * NumLoops);
+    setNumGeneratedLoops(NumLoops);
   }
 
   void setPreInits(Stmt *PreInits) {
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index 093e1f659916..2eeb5e45ab51 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -471,18 +471,21 @@ OMPUnrollDirective *OMPUnrollDirective::CreateEmpty(const ASTContext &C,
 OMPReverseDirective *
 OMPReverseDirective::Create(const ASTContext &C, SourceLocation StartLoc,
                             SourceLocation EndLoc, Stmt *AssociatedStmt,
-                            Stmt *TransformedStmt, Stmt *PreInits) {
+                            unsigned NumLoops, Stmt *TransformedStmt,
+                            Stmt *PreInits) {
   OMPReverseDirective *Dir = createDirective<OMPReverseDirective>(
-      C, {}, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc);
+      C, {}, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc,
+      NumLoops);
   Dir->setTransformedStmt(TransformedStmt);
   Dir->setPreInits(PreInits);
   return Dir;
 }
 
-OMPReverseDirective *OMPReverseDirective::CreateEmpty(const ASTContext &C) {
+OMPReverseDirective *OMPReverseDirective::CreateEmpty(const ASTContext &C,
+                                                      unsigned NumLoops) {
   return createEmptyDirective<OMPReverseDirective>(
       C, /*NumClauses=*/0, /*HasAssociatedStmt=*/true,
-      TransformedStmtOffset + 1, SourceLocation(), SourceLocation());
+      TransformedStmtOffset + 1, SourceLocation(), SourceLocation(), NumLoops);
 }
 
 OMPInterchangeDirective *OMPInterchangeDirective::Create(
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index d928b7ae2b4c..00f465818080 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -15140,7 +15140,7 @@ StmtResult SemaOpenMP::ActOnOpenMPReverseDirective(Stmt *AStmt,
   // instantiated.
   if (SemaRef.CurContext->isDependentContext())
     return OMPReverseDirective::Create(Context, StartLoc, EndLoc, AStmt,
-                                       nullptr, nullptr);
+                                       NumLoops, nullptr, nullptr);
 
   assert(LoopHelpers.size() == NumLoops &&
          "Expecting a single-dimensional loop iteration space");
@@ -15299,7 +15299,7 @@ StmtResult SemaOpenMP::ActOnOpenMPReverseDirective(Stmt *AStmt,
       ForStmt(Context, Init.get(), Cond.get(), nullptr, Incr.get(),
               ReversedBody, LoopHelper.Init->getBeginLoc(),
               LoopHelper.Init->getBeginLoc(), LoopHelper.Inc->getEndLoc());
-  return OMPReverseDirective::Create(Context, StartLoc, EndLoc, AStmt,
+  return OMPReverseDirective::Create(Context, StartLoc, EndLoc, AStmt, NumLoops,
                                      ReversedFor,
                                      buildPreInits(Context, PreInits));
 }
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 65102b64030c..44cfb83ad2db 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -3602,11 +3602,10 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
     }
 
     case STMT_OMP_REVERSE_DIRECTIVE: {
-      assert(Record[ASTStmtReader::NumStmtFields] == 1 &&
-             "Reverse directive accepts only a single loop");
+      unsigned NumLoops = Record[ASTStmtReader::NumStmtFields];
       assert(Record[ASTStmtReader::NumStmtFields + 1] == 0 &&
              "Reverse directive has no clauses");
-      S = OMPReverseDirective::CreateEmpty(Context);
+      S = OMPReverseDirective::CreateEmpty(Context, NumLoops);
       break;
     }
 

From ab6beeca9ccc1968661eea27c1a55e8734f7437b Mon Sep 17 00:00:00 2001
From: uthmanna <114300283+uthmanna@users.noreply.github.com>
Date: Wed, 18 Jun 2025 21:00:10 +0200
Subject: [PATCH 0850/1322] [llvm-cov] Export decision coverage to output json
 (#144335)

This commit adds decision coverage counts derived from MC/DC test vector execution to the JSON output of llvm-cov, as
discussed here: [Missing Decision Coverage (DC) in output
json](https://discourse.llvm.org/t/missing-decision-coverage-dc-in-output-json/86783)
with @evodius96

---------

Co-authored-by: uthmanna <andre.uthmann@vector.com>
---
 .../llvm/ProfileData/Coverage/CoverageMapping.h      | 12 ++++++++++++
 llvm/tools/llvm-cov/CoverageExporterJson.cpp         |  6 ++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index d1230b0ba7c5..8e6180be25b5 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -31,6 +31,7 @@
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <iterator>
@@ -494,6 +495,17 @@ public:
     return TV[TestVectorIndex].first[PosToID[Condition]];
   }
 
+  /// Return the number of True and False decisions for all executed test
+  /// vectors.
+  std::pair<unsigned, unsigned> getDecisions() const {
+    const unsigned TrueDecisions =
+        std::count_if(TV.begin(), TV.end(), [](const auto &TestVec) {
+          return TestVec.second == CondState::MCDC_True;
+        });
+
+    return {TrueDecisions, TV.size() - TrueDecisions};
+  }
+
   /// Return the Result evaluation for an executed test vector.
   /// See MCDCRecordProcessor::RecordTestVector().
   CondState getTVResult(unsigned TestVectorIndex) {
diff --git a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
index 4088c1b053aa..024693a24cc2 100644
--- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp
+++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
@@ -62,7 +62,7 @@
 #include <utility>
 
 /// The semantic version combined as a string.
-#define LLVM_COVERAGE_EXPORT_JSON_STR "2.0.1"
+#define LLVM_COVERAGE_EXPORT_JSON_STR "3.0.0"
 
 /// Unique type identifier for JSON coverage export.
 #define LLVM_COVERAGE_EXPORT_JSON_TYPE_STR "llvm.coverage.json.export"
@@ -110,8 +110,10 @@ json::Array gatherConditions(const coverage::MCDCRecord &Record) {
 
 json::Array renderMCDCRecord(const coverage::MCDCRecord &Record) {
   const llvm::coverage::CounterMappingRegion &CMR = Record.getDecisionRegion();
+  const auto [TrueDecisions, FalseDecisions] = Record.getDecisions();
   return json::Array({CMR.LineStart, CMR.ColumnStart, CMR.LineEnd,
-                      CMR.ColumnEnd, CMR.ExpandedFileID, int64_t(CMR.Kind),
+                      CMR.ColumnEnd, TrueDecisions, FalseDecisions,
+                      CMR.ExpandedFileID, int64_t(CMR.Kind),
                       gatherConditions(Record)});
 }
 

From ca9a09dbe679dbdd4d47cb7894977e04c3bb914e Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 18 Jun 2025 12:03:17 -0700
Subject: [PATCH 0851/1322] [libc++] Fix a typo in documentation (#144763)

---
 libcxx/docs/ABIGuarantees.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/docs/ABIGuarantees.rst b/libcxx/docs/ABIGuarantees.rst
index e6ac4f2b5b23..c7d5afe1080b 100644
--- a/libcxx/docs/ABIGuarantees.rst
+++ b/libcxx/docs/ABIGuarantees.rst
@@ -92,7 +92,7 @@ Linking TUs which have been compiled with different flags affecting code gen
 There are a lot of compiler (and library) flags which change the code generated for functions. This includes flags like
 ``-O1``, which are guaranteed by the compiler to not change the observable behaviour of a correct program, as well as
 flags like ``-fexceptions``, which **do** change the observable behaviour. libc++ allows linking of TUs which have been
-compiled whith specific flags only and makes no guarantees for any of the flags not listed below.
+compiled with specific flags only and makes no guarantees for any of the flags not listed below.
 
 The flags allowed (in any combination) are:
 - ``-f[no-]exceptions``

From a94eb27a29ef3aee5ccafc1d7bebee1c8efbaf38 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Wed, 18 Jun 2025 20:18:22 +0100
Subject: [PATCH 0852/1322] [HashRecognize] Fix big-endian CRC tables (#144754)

Big-endian CRC tables are incorrect due to the initial value of CRC in
genSarwateTable being hard-coded for CRC-8. 128 is the signed-min value
for CRC-8, but it should be generalized to APInt::getSignedMinValue. The
issue was found when writing CRC verification tests for llvm-test-suite.
---
 llvm/lib/Analysis/HashRecognize.cpp           |  2 +-
 .../HashRecognize/cyclic-redundancy-check.ll  | 96 +++++++++----------
 2 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp
index 1edb8b3bdc9a..987d13731276 100644
--- a/llvm/lib/Analysis/HashRecognize.cpp
+++ b/llvm/lib/Analysis/HashRecognize.cpp
@@ -478,7 +478,7 @@ CRCTable HashRecognize::genSarwateTable(const APInt &GenPoly,
   Table[0] = APInt::getZero(BW);
 
   if (ByteOrderSwapped) {
-    APInt CRCInit(BW, 128);
+    APInt CRCInit = APInt::getSignedMinValue(BW);
     for (unsigned I = 1; I < 256; I <<= 1) {
       CRCInit = CRCInit.shl(1) ^
                 (CRCInit.isSignBitSet() ? GenPoly : APInt::getZero(BW));
diff --git a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
index 7a3082056ad2..0fbc376c40d7 100644
--- a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
+++ b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
@@ -151,22 +151,22 @@ define i16 @crc16.be.tc8.crc.init.li(i16 %checksum, i8 %msg) {
 ; CHECK-NEXT:    Generating polynomial: 4129
 ; CHECK-NEXT:    Computed CRC: %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl
 ; CHECK-NEXT:    Computed CRC lookup table:
-; CHECK-NEXT:  0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328 3584 3840
-; CHECK-NEXT:  4096 4352 4608 4864 5120 5376 5632 5888 6144 6400 6656 6912 7168 7424 7680 7936
-; CHECK-NEXT:  8192 8448 8704 8960 9216 9472 9728 9984 10240 10496 10752 11008 11264 11520 11776 12032
-; CHECK-NEXT:  12288 12544 12800 13056 13312 13568 13824 14080 14336 14592 14848 15104 15360 15616 15872 16128
-; CHECK-NEXT:  16384 16640 16896 17152 17408 17664 17920 18176 18432 18688 18944 19200 19456 19712 19968 20224
-; CHECK-NEXT:  20480 20736 20992 21248 21504 21760 22016 22272 22528 22784 23040 23296 23552 23808 24064 24320
-; CHECK-NEXT:  24576 24832 25088 25344 25600 25856 26112 26368 26624 26880 27136 27392 27648 27904 28160 28416
-; CHECK-NEXT:  28672 28928 29184 29440 29696 29952 30208 30464 30720 30976 31232 31488 31744 32000 32256 32512
-; CHECK-NEXT:  32768 33024 33280 33536 33792 34048 34304 34560 34816 35072 35328 35584 35840 36096 36352 36608
-; CHECK-NEXT:  36864 37120 37376 37632 37888 38144 38400 38656 38912 39168 39424 39680 39936 40192 40448 40704
-; CHECK-NEXT:  40960 41216 41472 41728 41984 42240 42496 42752 43008 43264 43520 43776 44032 44288 44544 44800
-; CHECK-NEXT:  45056 45312 45568 45824 46080 46336 46592 46848 47104 47360 47616 47872 48128 48384 48640 48896
-; CHECK-NEXT:  49152 49408 49664 49920 50176 50432 50688 50944 51200 51456 51712 51968 52224 52480 52736 52992
-; CHECK-NEXT:  53248 53504 53760 54016 54272 54528 54784 55040 55296 55552 55808 56064 56320 56576 56832 57088
-; CHECK-NEXT:  57344 57600 57856 58112 58368 58624 58880 59136 59392 59648 59904 60160 60416 60672 60928 61184
-; CHECK-NEXT:  61440 61696 61952 62208 62464 62720 62976 63232 63488 63744 64000 64256 64512 64768 65024 65280
+; CHECK-NEXT:  0 4129 8258 12387 16516 20645 24774 28903 33032 37161 41290 45419 49548 53677 57806 61935
+; CHECK-NEXT:  4657 528 12915 8786 21173 17044 29431 25302 37689 33560 45947 41818 54205 50076 62463 58334
+; CHECK-NEXT:  9314 13379 1056 5121 25830 29895 17572 21637 42346 46411 34088 38153 58862 62927 50604 54669
+; CHECK-NEXT:  13907 9842 5649 1584 30423 26358 22165 18100 46939 42874 38681 34616 63455 59390 55197 51132
+; CHECK-NEXT:  18628 22757 26758 30887 2112 6241 10242 14371 51660 55789 59790 63919 35144 39273 43274 47403
+; CHECK-NEXT:  23285 19156 31415 27286 6769 2640 14899 10770 56317 52188 64447 60318 39801 35672 47931 43802
+; CHECK-NEXT:  27814 31879 19684 23749 11298 15363 3168 7233 60846 64911 52716 56781 44330 48395 36200 40265
+; CHECK-NEXT:  32407 28342 24277 20212 15891 11826 7761 3696 65439 61374 57309 53244 48923 44858 40793 36728
+; CHECK-NEXT:  37256 33193 45514 41451 53516 49453 61774 57711 4224 161 12482 8419 20484 16421 28742 24679
+; CHECK-NEXT:  33721 37784 41979 46042 49981 54044 58239 62302 689 4752 8947 13010 16949 21012 25207 29270
+; CHECK-NEXT:  46570 42443 38312 34185 62830 58703 54572 50445 13538 9411 5280 1153 29798 25671 21540 17413
+; CHECK-NEXT:  42971 47098 34713 38840 59231 63358 50973 55100 9939 14066 1681 5808 26199 30326 17941 22068
+; CHECK-NEXT:  55628 51565 63758 59695 39368 35305 47498 43435 22596 18533 30726 26663 6336 2273 14466 10403
+; CHECK-NEXT:  52093 56156 60223 64286 35833 39896 43963 48026 19061 23124 27191 31254 2801 6864 10931 14994
+; CHECK-NEXT:  64814 60687 56684 52557 48554 44427 40424 36297 31782 27655 23652 19525 15522 11395 7392 3265
+; CHECK-NEXT:  61215 65342 53085 57212 44955 49082 36825 40952 28183 32310 20053 24180 11923 16050 3793 7920
 ;
 entry:
   %msg.ext = zext i8 %msg to i16
@@ -196,22 +196,22 @@ define i16 @crc16.be.tc8.crc.init.arg(i16 %crc.init) {
 ; CHECK-NEXT:    Generating polynomial: 4129
 ; CHECK-NEXT:    Computed CRC: %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl
 ; CHECK-NEXT:    Computed CRC lookup table:
-; CHECK-NEXT:  0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328 3584 3840
-; CHECK-NEXT:  4096 4352 4608 4864 5120 5376 5632 5888 6144 6400 6656 6912 7168 7424 7680 7936
-; CHECK-NEXT:  8192 8448 8704 8960 9216 9472 9728 9984 10240 10496 10752 11008 11264 11520 11776 12032
-; CHECK-NEXT:  12288 12544 12800 13056 13312 13568 13824 14080 14336 14592 14848 15104 15360 15616 15872 16128
-; CHECK-NEXT:  16384 16640 16896 17152 17408 17664 17920 18176 18432 18688 18944 19200 19456 19712 19968 20224
-; CHECK-NEXT:  20480 20736 20992 21248 21504 21760 22016 22272 22528 22784 23040 23296 23552 23808 24064 24320
-; CHECK-NEXT:  24576 24832 25088 25344 25600 25856 26112 26368 26624 26880 27136 27392 27648 27904 28160 28416
-; CHECK-NEXT:  28672 28928 29184 29440 29696 29952 30208 30464 30720 30976 31232 31488 31744 32000 32256 32512
-; CHECK-NEXT:  32768 33024 33280 33536 33792 34048 34304 34560 34816 35072 35328 35584 35840 36096 36352 36608
-; CHECK-NEXT:  36864 37120 37376 37632 37888 38144 38400 38656 38912 39168 39424 39680 39936 40192 40448 40704
-; CHECK-NEXT:  40960 41216 41472 41728 41984 42240 42496 42752 43008 43264 43520 43776 44032 44288 44544 44800
-; CHECK-NEXT:  45056 45312 45568 45824 46080 46336 46592 46848 47104 47360 47616 47872 48128 48384 48640 48896
-; CHECK-NEXT:  49152 49408 49664 49920 50176 50432 50688 50944 51200 51456 51712 51968 52224 52480 52736 52992
-; CHECK-NEXT:  53248 53504 53760 54016 54272 54528 54784 55040 55296 55552 55808 56064 56320 56576 56832 57088
-; CHECK-NEXT:  57344 57600 57856 58112 58368 58624 58880 59136 59392 59648 59904 60160 60416 60672 60928 61184
-; CHECK-NEXT:  61440 61696 61952 62208 62464 62720 62976 63232 63488 63744 64000 64256 64512 64768 65024 65280
+; CHECK-NEXT:  0 4129 8258 12387 16516 20645 24774 28903 33032 37161 41290 45419 49548 53677 57806 61935
+; CHECK-NEXT:  4657 528 12915 8786 21173 17044 29431 25302 37689 33560 45947 41818 54205 50076 62463 58334
+; CHECK-NEXT:  9314 13379 1056 5121 25830 29895 17572 21637 42346 46411 34088 38153 58862 62927 50604 54669
+; CHECK-NEXT:  13907 9842 5649 1584 30423 26358 22165 18100 46939 42874 38681 34616 63455 59390 55197 51132
+; CHECK-NEXT:  18628 22757 26758 30887 2112 6241 10242 14371 51660 55789 59790 63919 35144 39273 43274 47403
+; CHECK-NEXT:  23285 19156 31415 27286 6769 2640 14899 10770 56317 52188 64447 60318 39801 35672 47931 43802
+; CHECK-NEXT:  27814 31879 19684 23749 11298 15363 3168 7233 60846 64911 52716 56781 44330 48395 36200 40265
+; CHECK-NEXT:  32407 28342 24277 20212 15891 11826 7761 3696 65439 61374 57309 53244 48923 44858 40793 36728
+; CHECK-NEXT:  37256 33193 45514 41451 53516 49453 61774 57711 4224 161 12482 8419 20484 16421 28742 24679
+; CHECK-NEXT:  33721 37784 41979 46042 49981 54044 58239 62302 689 4752 8947 13010 16949 21012 25207 29270
+; CHECK-NEXT:  46570 42443 38312 34185 62830 58703 54572 50445 13538 9411 5280 1153 29798 25671 21540 17413
+; CHECK-NEXT:  42971 47098 34713 38840 59231 63358 50973 55100 9939 14066 1681 5808 26199 30326 17941 22068
+; CHECK-NEXT:  55628 51565 63758 59695 39368 35305 47498 43435 22596 18533 30726 26663 6336 2273 14466 10403
+; CHECK-NEXT:  52093 56156 60223 64286 35833 39896 43963 48026 19061 23124 27191 31254 2801 6864 10931 14994
+; CHECK-NEXT:  64814 60687 56684 52557 48554 44427 40424 36297 31782 27655 23652 19525 15522 11395 7392 3265
+; CHECK-NEXT:  61215 65342 53085 57212 44955 49082 36825 40952 28183 32310 20053 24180 11923 16050 3793 7920
 ;
 entry:
   br label %loop
@@ -238,22 +238,22 @@ define i16 @crc16.be.tc8.crc.init.arg.flipped.sb.check(i16 %crc.init) {
 ; CHECK-NEXT:    Generating polynomial: 4129
 ; CHECK-NEXT:    Computed CRC: %crc.next = select i1 %check.sb, i16 %crc.shl, i16 %crc.xor
 ; CHECK-NEXT:    Computed CRC lookup table:
-; CHECK-NEXT:  0 256 512 768 1024 1280 1536 1792 2048 2304 2560 2816 3072 3328 3584 3840
-; CHECK-NEXT:  4096 4352 4608 4864 5120 5376 5632 5888 6144 6400 6656 6912 7168 7424 7680 7936
-; CHECK-NEXT:  8192 8448 8704 8960 9216 9472 9728 9984 10240 10496 10752 11008 11264 11520 11776 12032
-; CHECK-NEXT:  12288 12544 12800 13056 13312 13568 13824 14080 14336 14592 14848 15104 15360 15616 15872 16128
-; CHECK-NEXT:  16384 16640 16896 17152 17408 17664 17920 18176 18432 18688 18944 19200 19456 19712 19968 20224
-; CHECK-NEXT:  20480 20736 20992 21248 21504 21760 22016 22272 22528 22784 23040 23296 23552 23808 24064 24320
-; CHECK-NEXT:  24576 24832 25088 25344 25600 25856 26112 26368 26624 26880 27136 27392 27648 27904 28160 28416
-; CHECK-NEXT:  28672 28928 29184 29440 29696 29952 30208 30464 30720 30976 31232 31488 31744 32000 32256 32512
-; CHECK-NEXT:  32768 33024 33280 33536 33792 34048 34304 34560 34816 35072 35328 35584 35840 36096 36352 36608
-; CHECK-NEXT:  36864 37120 37376 37632 37888 38144 38400 38656 38912 39168 39424 39680 39936 40192 40448 40704
-; CHECK-NEXT:  40960 41216 41472 41728 41984 42240 42496 42752 43008 43264 43520 43776 44032 44288 44544 44800
-; CHECK-NEXT:  45056 45312 45568 45824 46080 46336 46592 46848 47104 47360 47616 47872 48128 48384 48640 48896
-; CHECK-NEXT:  49152 49408 49664 49920 50176 50432 50688 50944 51200 51456 51712 51968 52224 52480 52736 52992
-; CHECK-NEXT:  53248 53504 53760 54016 54272 54528 54784 55040 55296 55552 55808 56064 56320 56576 56832 57088
-; CHECK-NEXT:  57344 57600 57856 58112 58368 58624 58880 59136 59392 59648 59904 60160 60416 60672 60928 61184
-; CHECK-NEXT:  61440 61696 61952 62208 62464 62720 62976 63232 63488 63744 64000 64256 64512 64768 65024 65280
+; CHECK-NEXT:  0 4129 8258 12387 16516 20645 24774 28903 33032 37161 41290 45419 49548 53677 57806 61935
+; CHECK-NEXT:  4657 528 12915 8786 21173 17044 29431 25302 37689 33560 45947 41818 54205 50076 62463 58334
+; CHECK-NEXT:  9314 13379 1056 5121 25830 29895 17572 21637 42346 46411 34088 38153 58862 62927 50604 54669
+; CHECK-NEXT:  13907 9842 5649 1584 30423 26358 22165 18100 46939 42874 38681 34616 63455 59390 55197 51132
+; CHECK-NEXT:  18628 22757 26758 30887 2112 6241 10242 14371 51660 55789 59790 63919 35144 39273 43274 47403
+; CHECK-NEXT:  23285 19156 31415 27286 6769 2640 14899 10770 56317 52188 64447 60318 39801 35672 47931 43802
+; CHECK-NEXT:  27814 31879 19684 23749 11298 15363 3168 7233 60846 64911 52716 56781 44330 48395 36200 40265
+; CHECK-NEXT:  32407 28342 24277 20212 15891 11826 7761 3696 65439 61374 57309 53244 48923 44858 40793 36728
+; CHECK-NEXT:  37256 33193 45514 41451 53516 49453 61774 57711 4224 161 12482 8419 20484 16421 28742 24679
+; CHECK-NEXT:  33721 37784 41979 46042 49981 54044 58239 62302 689 4752 8947 13010 16949 21012 25207 29270
+; CHECK-NEXT:  46570 42443 38312 34185 62830 58703 54572 50445 13538 9411 5280 1153 29798 25671 21540 17413
+; CHECK-NEXT:  42971 47098 34713 38840 59231 63358 50973 55100 9939 14066 1681 5808 26199 30326 17941 22068
+; CHECK-NEXT:  55628 51565 63758 59695 39368 35305 47498 43435 22596 18533 30726 26663 6336 2273 14466 10403
+; CHECK-NEXT:  52093 56156 60223 64286 35833 39896 43963 48026 19061 23124 27191 31254 2801 6864 10931 14994
+; CHECK-NEXT:  64814 60687 56684 52557 48554 44427 40424 36297 31782 27655 23652 19525 15522 11395 7392 3265
+; CHECK-NEXT:  61215 65342 53085 57212 44955 49082 36825 40952 28183 32310 20053 24180 11923 16050 3793 7920
 ;
 entry:
   br label %loop

From f13b9e3643661ea2cda252c7e2c59ace036407c7 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Wed, 18 Jun 2025 20:18:53 +0100
Subject: [PATCH 0853/1322] [HashRecognize] Don't const-qualify Values in
 result (#144752)

Const-qualifying Values in the analysis result makes them unusable with
IRBuilder. The issue was discovered when attempting to use the result of
the analysis for a transform.
---
 llvm/include/llvm/Analysis/HashRecognize.h | 12 ++++++------
 llvm/lib/Analysis/HashRecognize.cpp        |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Analysis/HashRecognize.h b/llvm/include/llvm/Analysis/HashRecognize.h
index 8ab68a5dc2cb..c169383bf7b0 100644
--- a/llvm/include/llvm/Analysis/HashRecognize.h
+++ b/llvm/include/llvm/Analysis/HashRecognize.h
@@ -53,7 +53,7 @@ struct PolynomialInfo {
   // division in the case of CRC. Since polynomial division is an XOR in
   // GF(2^m), this variable must be XOR'ed with RHS in a loop to yield the
   // ComputedValue.
-  const Value *LHS;
+  Value *LHS;
 
   // The generating polynomial, or the RHS of the polynomial division in the
   // case of CRC.
@@ -61,7 +61,7 @@ struct PolynomialInfo {
 
   // The final computed value. This is a remainder of a polynomial division in
   // the case of CRC, which must be zero.
-  const Value *ComputedValue;
+  Value *ComputedValue;
 
   // Set to true in the case of big-endian.
   bool ByteOrderSwapped;
@@ -69,11 +69,11 @@ struct PolynomialInfo {
   // An optional auxiliary checksum that augments the LHS. In the case of CRC,
   // it is XOR'ed with the LHS, so that the computation's final remainder is
   // zero.
-  const Value *LHSAux;
+  Value *LHSAux;
 
-  PolynomialInfo(unsigned TripCount, const Value *LHS, const APInt &RHS,
-                 const Value *ComputedValue, bool ByteOrderSwapped,
-                 const Value *LHSAux = nullptr);
+  PolynomialInfo(unsigned TripCount, Value *LHS, const APInt &RHS,
+                 Value *ComputedValue, bool ByteOrderSwapped,
+                 Value *LHSAux = nullptr);
 };
 
 /// The analysis.
diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp
index 987d13731276..06a3738018e9 100644
--- a/llvm/lib/Analysis/HashRecognize.cpp
+++ b/llvm/lib/Analysis/HashRecognize.cpp
@@ -442,9 +442,9 @@ getRecurrences(BasicBlock *LoopLatch, const PHINode *IndVar, const Loop &L) {
   return std::make_pair(SimpleRecurrence, ConditionalRecurrence);
 }
 
-PolynomialInfo::PolynomialInfo(unsigned TripCount, const Value *LHS,
-                               const APInt &RHS, const Value *ComputedValue,
-                               bool ByteOrderSwapped, const Value *LHSAux)
+PolynomialInfo::PolynomialInfo(unsigned TripCount, Value *LHS, const APInt &RHS,
+                               Value *ComputedValue, bool ByteOrderSwapped,
+                               Value *LHSAux)
     : TripCount(TripCount), LHS(LHS), RHS(RHS), ComputedValue(ComputedValue),
       ByteOrderSwapped(ByteOrderSwapped), LHSAux(LHSAux) {}
 
@@ -623,7 +623,7 @@ HashRecognize::recognizeCRC() const {
   if (!checkExtractBits(ResultBits, TC, IsZero, *ByteOrderSwapped))
     return ErrBits(ResultBits, TC, *ByteOrderSwapped);
 
-  const Value *LHSAux = SimpleRecurrence ? SimpleRecurrence.Start : nullptr;
+  Value *LHSAux = SimpleRecurrence ? SimpleRecurrence.Start : nullptr;
   return PolynomialInfo(TC, ConditionalRecurrence.Start, GenPoly, ComputedValue,
                         *ByteOrderSwapped, LHSAux);
 }

From 156a64c585faf0870936b62ec85fae19ceb9ad3f Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Wed, 18 Jun 2025 20:19:25 +0100
Subject: [PATCH 0854/1322] [HashRecognize] Tighten pre-conditions for analysis
 (#144757)

Exit early if the TC is not a byte-multiple, as optimization works by
dividing TC by 8. Also delay the SCEV TC query.
---
 llvm/lib/Analysis/HashRecognize.cpp           |  8 +--
 .../HashRecognize/cyclic-redundancy-check.ll  | 62 +++++++++++++++++--
 2 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp
index 06a3738018e9..d11602f92187 100644
--- a/llvm/lib/Analysis/HashRecognize.cpp
+++ b/llvm/lib/Analysis/HashRecognize.cpp
@@ -561,14 +561,14 @@ std::variant<PolynomialInfo, ErrBits, StringRef>
 HashRecognize::recognizeCRC() const {
   if (!L.isInnermost())
     return "Loop is not innermost";
-  unsigned TC = SE.getSmallConstantMaxTripCount(&L);
-  if (!TC || TC > 256)
-    return "Unable to find a small constant trip count";
   BasicBlock *Latch = L.getLoopLatch();
   BasicBlock *Exit = L.getExitBlock();
   const PHINode *IndVar = L.getCanonicalInductionVariable();
-  if (!Latch || !Exit || !IndVar)
+  if (!Latch || !Exit || !IndVar || L.getNumBlocks() != 1)
     return "Loop not in canonical form";
+  unsigned TC = SE.getSmallConstantTripCount(&L);
+  if (!TC || TC > 256 || TC % 8)
+    return "Unable to find a small constant byte-multiple trip count";
 
   auto R = getRecurrences(Latch, IndVar, L);
   if (!R)
diff --git a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
index 0fbc376c40d7..0366684a13b5 100644
--- a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
+++ b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
@@ -384,7 +384,7 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.non.const.tc(i16 %crc.init, i32 %loop.limit) {
 ; CHECK-LABEL: 'not.crc.non.const.tc'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Unable to find a small constant trip count
+; CHECK-NEXT:  Reason: Unable to find a small constant byte-multiple trip count
 ;
 entry:
   br label %loop
@@ -404,8 +404,31 @@ exit:                                              ; preds = %loop
   ret i16 %crc.next
 }
 
-define i16 @not.crc.non.canonical.loop(i16 %crc.init) {
-; CHECK-LABEL: 'not.crc.non.canonical.loop'
+define i16 @not.crc.non.canonical.not.multiple.8(i16 %crc.init) {
+; CHECK-LABEL: 'not.crc.non.canonical.not.multiple.8'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Unable to find a small constant byte-multiple trip count
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ]
+  %crc.shl = shl i16 %crc, 1
+  %crc.xor = xor i16 %crc.shl, 4129
+  %check.sb = icmp slt i16 %crc, 0
+  %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exit.cond = icmp samesign eq i32 %iv, 3
+  br i1 %exit.cond, label %exit, label %loop
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}
+
+define i16 @not.crc.non.canonical.loop.countdown(i16 %crc.init) {
+; CHECK-LABEL: 'not.crc.non.canonical.loop.countdown'
 ; CHECK-NEXT:  Did not find a hash algorithm
 ; CHECK-NEXT:  Reason: Loop not in canonical form
 ;
@@ -427,10 +450,39 @@ exit:                                              ; preds = %loop
   ret i16 %crc.next
 }
 
+define i16 @not.crc.non.canonical.loop.multiple.blocks(i16 %crc.init) {
+; CHECK-LABEL: 'not.crc.non.canonical.loop.multiple.blocks'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Loop not in canonical form
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %continue ]
+  %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %continue ]
+  %check.sb = icmp slt i16 %crc, 0
+  %crc.shl = shl i16 %crc, 1
+  br i1 %check.sb, label %xor, label %continue
+
+xor:
+  %crc.xor = xor i16 %crc.shl, 4129
+  br label %continue
+
+continue:
+  %crc.next = phi i16 [ %crc.xor, %xor ], [ %crc.shl, %loop ]
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exit.cond = icmp samesign eq i32 %iv, 7
+  br i1 %exit.cond, label %exit, label %loop
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}
+
 define i16 @not.crc.tc.limit(i16 %crc.init) {
 ; CHECK-LABEL: 'not.crc.tc.limit'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Unable to find a small constant trip count
+; CHECK-NEXT:  Reason: Unable to find a small constant byte-multiple trip count
 ;
 entry:
   br label %loop
@@ -617,7 +669,7 @@ loop:                                              ; preds = %loop, %entry
   %crc.xor = xor i16 %crc.lshr, -24575
   %crc.next = select i1 %check.sb, i16 %crc.lshr, i16 %crc.xor
   %iv.next = add nuw nsw i8 %iv, 1
-  %exit.cond = icmp samesign ult i8 %iv, 20
+  %exit.cond = icmp samesign ult i8 %iv, 31
   br i1 %exit.cond, label %loop, label %exit
 
 exit:                                              ; preds = %loop

From 88d250729eb00842a41c946632bcacf1af106f64 Mon Sep 17 00:00:00 2001
From: Alan Phipps <a-phipps@ti.com>
Date: Wed, 18 Jun 2025 14:33:59 -0500
Subject: [PATCH 0855/1322] Revert "[llvm-cov] Export decision coverage to
 output json" (#144783)

Reverts llvm/llvm-project#144335

Need to resolve test failures
---
 .../llvm/ProfileData/Coverage/CoverageMapping.h      | 12 ------------
 llvm/tools/llvm-cov/CoverageExporterJson.cpp         |  6 ++----
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index 8e6180be25b5..d1230b0ba7c5 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -31,7 +31,6 @@
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <iterator>
@@ -495,17 +494,6 @@ public:
     return TV[TestVectorIndex].first[PosToID[Condition]];
   }
 
-  /// Return the number of True and False decisions for all executed test
-  /// vectors.
-  std::pair<unsigned, unsigned> getDecisions() const {
-    const unsigned TrueDecisions =
-        std::count_if(TV.begin(), TV.end(), [](const auto &TestVec) {
-          return TestVec.second == CondState::MCDC_True;
-        });
-
-    return {TrueDecisions, TV.size() - TrueDecisions};
-  }
-
   /// Return the Result evaluation for an executed test vector.
   /// See MCDCRecordProcessor::RecordTestVector().
   CondState getTVResult(unsigned TestVectorIndex) {
diff --git a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
index 024693a24cc2..4088c1b053aa 100644
--- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp
+++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
@@ -62,7 +62,7 @@
 #include <utility>
 
 /// The semantic version combined as a string.
-#define LLVM_COVERAGE_EXPORT_JSON_STR "3.0.0"
+#define LLVM_COVERAGE_EXPORT_JSON_STR "2.0.1"
 
 /// Unique type identifier for JSON coverage export.
 #define LLVM_COVERAGE_EXPORT_JSON_TYPE_STR "llvm.coverage.json.export"
@@ -110,10 +110,8 @@ json::Array gatherConditions(const coverage::MCDCRecord &Record) {
 
 json::Array renderMCDCRecord(const coverage::MCDCRecord &Record) {
   const llvm::coverage::CounterMappingRegion &CMR = Record.getDecisionRegion();
-  const auto [TrueDecisions, FalseDecisions] = Record.getDecisions();
   return json::Array({CMR.LineStart, CMR.ColumnStart, CMR.LineEnd,
-                      CMR.ColumnEnd, TrueDecisions, FalseDecisions,
-                      CMR.ExpandedFileID, int64_t(CMR.Kind),
+                      CMR.ColumnEnd, CMR.ExpandedFileID, int64_t(CMR.Kind),
                       gatherConditions(Record)});
 }
 

From fb0651959b1b6ae64f84cf5840adc95923af991f Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Wed, 18 Jun 2025 15:37:56 -0400
Subject: [PATCH 0856/1322] [AArch64] fix trampoline implementation: actually
 use X15 (#143892)

A incorrect switch statement caused it to try to use X4 instead of X15
in #126743, which would have not worked.
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp |  3 +--
 llvm/test/CodeGen/AArch64/trampoline.ll         | 12 ++++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d8b574719dae..581f15277602 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7138,8 +7138,7 @@ SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   switch (CC) {
   default:
     NestReg = 0x0f; // X15
-    LLVM_FALLTHROUGH;
-  case CallingConv::ARM64EC_Thunk_Native:
+    break;
   case CallingConv::ARM64EC_Thunk_X64:
     // Must be kept in sync with AArch64CallingConv.td
     NestReg = 0x04; // X4
diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
index d9016b02a0f8..0e682704afbf 100644
--- a/llvm/test/CodeGen/AArch64/trampoline.ll
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -83,7 +83,7 @@ define i64 @func1() {
 ; CHECK-LINUX-NEXT:    str w9, [sp, #16]
 ; CHECK-LINUX-NEXT:    add x9, sp, #56
 ; CHECK-LINUX-NEXT:    stp x9, x8, [sp, #24]
-; CHECK-LINUX-NEXT:    mov x8, #132 // =0x84
+; CHECK-LINUX-NEXT:    mov x8, #143 // =0x8f
 ; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #16
 ; CHECK-LINUX-NEXT:    movk x8, #177, lsl #32
 ; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #48
@@ -112,7 +112,7 @@ define i64 @func1() {
 ; CHECK-PC-NEXT:    add x0, sp, #8
 ; CHECK-PC-NEXT:    movk w8, #54815, lsl #16
 ; CHECK-PC-NEXT:    str w8, [sp, #16]
-; CHECK-PC-NEXT:    mov x8, #132 // =0x84
+; CHECK-PC-NEXT:    mov x8, #143 // =0x8f
 ; CHECK-PC-NEXT:    movk x8, #22528, lsl #16
 ; CHECK-PC-NEXT:    movk x8, #177, lsl #32
 ; CHECK-PC-NEXT:    movk x8, #22528, lsl #48
@@ -148,7 +148,7 @@ define i64 @func1() {
 ; CHECK-APPLE-NEXT:    mov x0, sp
 ; CHECK-APPLE-NEXT:    movk w8, #54815, lsl #16
 ; CHECK-APPLE-NEXT:    str w8, [sp, #8]
-; CHECK-APPLE-NEXT:    mov x8, #132 ; =0x84
+; CHECK-APPLE-NEXT:    mov x8, #143 ; =0x8f
 ; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #16
 ; CHECK-APPLE-NEXT:    movk x8, #177, lsl #32
 ; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #48
@@ -184,7 +184,7 @@ define i64 @func2() {
 ; CHECK-LINUX-NEXT:    add x9, sp, #8
 ; CHECK-LINUX-NEXT:    add x1, x0, #12
 ; CHECK-LINUX-NEXT:    stp x9, x8, [x0, #16]
-; CHECK-LINUX-NEXT:    mov x8, #132 // =0x84
+; CHECK-LINUX-NEXT:    mov x8, #143 // =0x8f
 ; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #16
 ; CHECK-LINUX-NEXT:    movk x8, #177, lsl #32
 ; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #48
@@ -210,7 +210,7 @@ define i64 @func2() {
 ; CHECK-PC-NEXT:    mov w8, #544 // =0x220
 ; CHECK-PC-NEXT:    movk w8, #54815, lsl #16
 ; CHECK-PC-NEXT:    str w8, [x0, #8]
-; CHECK-PC-NEXT:    mov x8, #132 // =0x84
+; CHECK-PC-NEXT:    mov x8, #143 // =0x8f
 ; CHECK-PC-NEXT:    movk x8, #22528, lsl #16
 ; CHECK-PC-NEXT:    movk x8, #177, lsl #32
 ; CHECK-PC-NEXT:    movk x8, #22528, lsl #48
@@ -246,7 +246,7 @@ define i64 @func2() {
 ; CHECK-APPLE-NEXT:    mov w8, #544 ; =0x220
 ; CHECK-APPLE-NEXT:    movk w8, #54815, lsl #16
 ; CHECK-APPLE-NEXT:    str w8, [x0, #8]
-; CHECK-APPLE-NEXT:    mov x8, #132 ; =0x84
+; CHECK-APPLE-NEXT:    mov x8, #143 ; =0x8f
 ; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #16
 ; CHECK-APPLE-NEXT:    movk x8, #177, lsl #32
 ; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #48

From c04fc5596ec8c197c75b92a086c31438bfb08faf Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Wed, 18 Jun 2025 15:38:34 -0400
Subject: [PATCH 0857/1322] [MemCpyOpt] allow some undef contents overread in
 processMemCpyMemCpyDependence (#143745)

Allows memcpy to memcpy forwarding in cases where the second memcpy is
larger, but the overread is known to be undef, by shrinking the memcpy
size.

Refs https://github.com/llvm/llvm-project/pull/140954 which laid some of
the groundwork for this.
---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 52 +++++++++++++------
 .../MemCpyOpt/memcpy-memcpy-offset.ll         | 33 +++++++++---
 .../MemCpyOpt/variable-sized-memcpy-memcpy.ll | 37 ++++++++++++-
 3 files changed, 98 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 1c4ec6aa08b4..2b0e221f341e 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -107,6 +107,9 @@ struct MemsetRange {
 
 } // end anonymous namespace
 
+static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
+                                  MemIntrinsic *MemSrc, BatchAAResults &BAA);
+
 bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
   // If we found more than 4 stores to merge or 16 bytes, use memset.
   if (TheStores.size() >= 4 || End - Start >= 16)
@@ -1129,14 +1132,29 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
     MForwardOffset = *Offset;
   }
 
-  // The length of the memcpy's must be the same, or the preceding one
-  // must be larger than the following one.
-  if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) {
+  Value *CopyLength = M->getLength();
+
+  // The length of the memcpy's must be the same, or the preceding one must be
+  // larger than the following one, or the contents of the overread must be
+  // undefined bytes of a defined size.
+  if (MForwardOffset != 0 || MDep->getLength() != CopyLength) {
     auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
-    auto *MLen = dyn_cast<ConstantInt>(M->getLength());
-    if (!MDepLen || !MLen ||
-        MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset)
+    auto *MLen = dyn_cast<ConstantInt>(CopyLength);
+    // This could be converted to a runtime test (%CopyLength =
+    // min(max(0, MDepLen - MForwardOffset), MLen)), but it is
+    // unclear if that is useful
+    if (!MDepLen || !MLen)
       return false;
+    if (MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset) {
+      if (!overreadUndefContents(MSSA, M, MDep, BAA))
+        return false;
+      if (MDepLen->getZExtValue() <= (uint64_t)MForwardOffset)
+        return false; // Should not reach here (there is obviously no aliasing
+                      // with MDep), so just bail in case it had incomplete info
+                      // somehow
+      CopyLength = ConstantInt::get(CopyLength->getType(),
+                                    MDepLen->getZExtValue() - MForwardOffset);
+    }
   }
 
   IRBuilder<> Builder(M);
@@ -1152,9 +1170,13 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
       eraseInstruction(NewCopySource);
   });
   MaybeAlign CopySourceAlign = MDep->getSourceAlign();
-  // We just need to calculate the actual size of the copy.
-  auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize(
-      MemoryLocation::getForSource(M).Size);
+  auto MCopyLoc = MemoryLocation::getForSource(MDep);
+  // Truncate the size of the MDep access to just the bytes read
+  if (MDep->getLength() != CopyLength) {
+    auto *ConstLength = cast<ConstantInt>(CopyLength);
+    MCopyLoc = MCopyLoc.getWithNewSize(
+        LocationSize::precise(ConstLength->getZExtValue()));
+  }
 
   // When the forwarding offset is greater than 0, we transform
   //    memcpy(d1 <- s1)
@@ -1223,20 +1245,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   // example we could be moving from movaps -> movq on x86.
   Instruction *NewM;
   if (UseMemMove)
-    NewM =
-        Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource,
-                              CopySourceAlign, M->getLength(), M->isVolatile());
+    NewM = Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource,
+                                 CopySourceAlign, CopyLength, M->isVolatile());
   else if (M->isForceInlined())
     // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
     // never allowed since that would allow the latter to be lowered as a call
     // to an external function.
     NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(),
-                                      CopySource, CopySourceAlign,
-                                      M->getLength(), M->isVolatile());
+                                      CopySource, CopySourceAlign, CopyLength,
+                                      M->isVolatile());
   else
     NewM = Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource,
-                                CopySourceAlign, M->getLength(),
-                                M->isVolatile());
+                                CopySourceAlign, CopyLength, M->isVolatile());
 
   NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID);
 
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
index da654438d7bd..7dc579aad02f 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
@@ -134,14 +134,15 @@ define void @forward_offset_memcpy_inline(ptr %src, ptr %dest) {
   ret void
 }
 
-; We cannot forward `memcpy` because it exceeds the size of `memcpy` it depends on.
-define void @do_not_forward_oversize_offset(ptr %src, ptr %dest) {
-; CHECK-LABEL: define void @do_not_forward_oversize_offset(
+; We can forward `memcpy` by shrinking it to the size of the `memcpy` it depends on.
+define void @forward_oversize_offset(ptr %src, ptr %dest) {
+; CHECK-LABEL: define void @forward_oversize_offset(
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
-; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 6, i1 false)
-; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP_OFFSET]], i64 6, i1 false)
+; CHECK-NEXT:    [[CPY_TMP:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[CPY_TMP]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    [[CPY_TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[CPY_TMP]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 5, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %cpy_tmp = alloca %buf, align 1
@@ -214,6 +215,24 @@ define void @pr98675(ptr noalias %p1, ptr noalias %p2) {
   ret void
 }
 
+define void @over_offset_cpy(ptr %src) {
+; CHECK-LABEL: define void @over_offset_cpy(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[TMP:%.*]] = alloca [2 x i8], align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC]], i64 1, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1
+; CHECK-NEXT:    ret void
+;
+  %tmp = alloca [2 x i8]
+  %dst = alloca i8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false)
+  %tmp_offset = getelementptr inbounds i8, ptr %tmp, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp_offset, i64 1, i1 false)
+
+  ret void
+}
+
 declare void @use(ptr)
 
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1)
diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll
index 4f6b734ec057..95402a8ea686 100644
--- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll
@@ -18,7 +18,42 @@ define void @test(ptr %src, i64 %size) {
   ret void
 }
 
-; Differing sizes, so left as it is.
+define void @dynalloca_test(ptr %src, i64 %size1) {
+; CHECK-LABEL: @dynalloca_test(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, i64 [[SIZE1]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC:%.*]], i64 31, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST]], ptr align 8 [[SRC]], i64 31, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %tmp = alloca i8, i64 %size1
+  %dst = alloca i8, i64 %size1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 31, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 32, i1 false)
+
+  ret void
+}
+
+define void @dynalloca_offset_test(ptr %src, i64 %size1) {
+; CHECK-LABEL: @dynalloca_offset_test(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, i64 [[SIZE1]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC:%.*]], i64 31, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST]], ptr align 1 [[TMP1]], i64 30, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %tmp = alloca i8, i64 %size1
+  %dst = alloca i8, i64 %size1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 31, i1 false)
+  %tmp_offset = getelementptr inbounds i8, ptr %tmp, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp_offset, i64 31, i1 false)
+
+  ret void
+}
+
+; Dynamic sizes, so left as it is.
 define void @negative_test(ptr %src, i64 %size1, i64 %size2) {
 ; CHECK-LABEL: @negative_test(
 ; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1

From 67c52aacae2aa698eb1d31d81d2376bd77723d3a Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Wed, 18 Jun 2025 21:47:50 +0200
Subject: [PATCH 0858/1322] [CIR] Upstream support for IncompleteArrayType
 (#144138)

This change adds the basic support for IncompleteArray

Issue https://github.com/llvm/llvm-project/issues/130197
---
 clang/lib/CIR/CodeGen/CIRGenTypes.cpp | 16 ++++++++++++++++
 clang/test/CIR/CodeGen/struct.c       | 14 ++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index bab47924dd71..621eb66962bf 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -416,6 +416,22 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     break;
   }
 
+  case Type::IncompleteArray: {
+    const IncompleteArrayType *arrTy = cast<IncompleteArrayType>(ty);
+    if (arrTy->getIndexTypeCVRQualifiers() != 0)
+      cgm.errorNYI(SourceLocation(), "non trivial array types", type);
+
+    mlir::Type elemTy = convertTypeForMem(arrTy->getElementType());
+    // int X[] -> [0 x int], unless the element type is not sized.  If it is
+    // unsized (e.g. an incomplete record) just use [0 x i8].
+    if (!builder.isSized(elemTy)) {
+      elemTy = cgm.SInt8Ty;
+    }
+
+    resultType = cir::ArrayType::get(elemTy, 0);
+    break;
+  }
+
   case Type::ConstantArray: {
     const ConstantArrayType *arrTy = cast<ConstantArrayType>(ty);
     mlir::Type elemTy = convertTypeForMem(arrTy->getElementType());
diff --git a/clang/test/CIR/CodeGen/struct.c b/clang/test/CIR/CodeGen/struct.c
index ed84edd97e5d..b722b64eeb58 100644
--- a/clang/test/CIR/CodeGen/struct.c
+++ b/clang/test/CIR/CodeGen/struct.c
@@ -19,6 +19,7 @@
 // CIR-DAG: !rec_CycleEnd = !cir.record<struct "CycleEnd" {!cir.ptr<!cir.record<struct "CycleStart" {!cir.ptr<!cir.record<struct "CycleMiddle" {!cir.ptr<!cir.record<struct "CycleEnd">>}>>}>>}>
 // CIR-DAG: !rec_CycleMiddle = !cir.record<struct "CycleMiddle" {!cir.ptr<!rec_CycleEnd>}>
 // CIR-DAG: !rec_CycleStart = !cir.record<struct "CycleStart" {!cir.ptr<!rec_CycleMiddle>}>
+// CIR-DAG: !rec_IncompleteArray = !cir.record<struct "IncompleteArray" {!cir.array<!s32i x 0>}>
 // LLVM-DAG: %struct.CompleteS = type { i32, i8 }
 // LLVM-DAG: %struct.OuterS = type { %struct.InnerS, i32 }
 // LLVM-DAG: %struct.InnerS = type { i32, i8 }
@@ -30,6 +31,7 @@
 // LLVM-DAG: %struct.CycleStart = type { ptr }
 // LLVM-DAG: %struct.CycleMiddle = type { ptr }
 // LLVM-DAG: %struct.CycleEnd = type { ptr }
+// LLVM-DAG: %struct.IncompleteArray = type { [0 x i32] }
 // OGCG-DAG: %struct.CompleteS = type { i32, i8 }
 // OGCG-DAG: %struct.OuterS = type { %struct.InnerS, i32 }
 // OGCG-DAG: %struct.InnerS = type { i32, i8 }
@@ -41,6 +43,7 @@
 // OGCG-DAG: %struct.CycleStart = type { ptr }
 // OGCG-DAG: %struct.CycleMiddle = type { ptr }
 // OGCG-DAG: %struct.CycleEnd = type { ptr }
+// OGCG-DAG: %struct.IncompleteArray = type { [0 x i32] }
 
 struct CompleteS {
   int a;
@@ -149,6 +152,16 @@ struct CycleEnd {
 // LLVM-DAG:  @end = global %struct.CycleEnd zeroinitializer
 // OGCG-DAG:  @end = global %struct.CycleEnd zeroinitializer
 
+struct IncompleteArray {
+  int array[];
+} incomplete;
+
+// CIR: cir.global external @incomplete = #cir.zero : !rec_IncompleteArray
+
+// LLVM-DAG: global %struct.IncompleteArray zeroinitializer
+
+// OGCG-DAG: global %struct.IncompleteArray zeroinitializer
+
 void f(void) {
   struct IncompleteS *p;
 }
@@ -313,3 +326,4 @@ void f6(struct CycleStart *start) {
 // OGCG:   %[[MIDDLE:.*]] = getelementptr inbounds nuw %struct.CycleStart, ptr %{{.*}}, i32 0, i32 0
 // OGCG:   %[[END:.*]] = getelementptr inbounds nuw %struct.CycleMiddle, ptr %{{.*}}, i32 0, i32 0
 // OGCG:   %[[START2:.*]] = getelementptr inbounds nuw %struct.CycleEnd, ptr %{{.*}}, i32 0, i32 0
+

From d4b7c0d8b437f50ea254d814a1aeecf87a17be91 Mon Sep 17 00:00:00 2001
From: Tobias Stadler <mail@stadler-tobias.de>
Date: Wed, 18 Jun 2025 20:49:55 +0100
Subject: [PATCH 0859/1322] [Remarks] Auto-detect remark parser format
 (#144554)

Add remark format 'Auto', which performs automatic detection of the
remark format using the magic numbers at the beginning of the remarks
files.

The RemarkLinker already did something similar, so we streamlined this
and exposed this to llvm-remarkutil.
---
 llvm/include/llvm/Remarks/RemarkFormat.h      |  5 ++++-
 llvm/include/llvm/Remarks/RemarkLinker.h      |  5 ++---
 llvm/lib/Remarks/RemarkFormat.cpp             | 18 +++++++++++++++-
 llvm/lib/Remarks/RemarkLinker.cpp             | 14 +++----------
 llvm/lib/Remarks/RemarkParser.cpp             | 21 +++++++++++++------
 llvm/lib/Remarks/RemarkSerializer.cpp         |  6 ++++--
 .../Inputs/broken-remark-magic.bitstream      |  1 +
 .../llvm-remarkutil/annotation-count.test     |  2 ++
 .../broken-bitstream-remark-magic.test        |  6 ++++++
 .../tools/llvm-remarkutil/empty-file.test     |  5 +++++
 .../llvm-remarkutil/instruction-count.test    |  4 +++-
 .../llvm-remarkutil/instruction-mix.test      |  4 +++-
 .../size-diff/no-difference.test              |  3 +++
 .../tools/llvm-remarkutil/RemarkUtilHelpers.h |  9 +++++---
 llvm/unittests/Remarks/RemarksLinkingTest.cpp |  6 ++----
 15 files changed, 76 insertions(+), 33 deletions(-)
 create mode 100644 llvm/test/tools/llvm-remarkutil/Inputs/broken-remark-magic.bitstream
 create mode 100644 llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test

diff --git a/llvm/include/llvm/Remarks/RemarkFormat.h b/llvm/include/llvm/Remarks/RemarkFormat.h
index a39a013dcf90..eda201d4ee6f 100644
--- a/llvm/include/llvm/Remarks/RemarkFormat.h
+++ b/llvm/include/llvm/Remarks/RemarkFormat.h
@@ -23,7 +23,7 @@ namespace remarks {
 constexpr StringLiteral Magic("REMARKS");
 
 /// The format used for serializing/deserializing remarks.
-enum class Format { Unknown, YAML, Bitstream };
+enum class Format { Unknown, Auto, YAML, Bitstream };
 
 /// Parse and validate a string for the remark format.
 LLVM_ABI Expected<Format> parseFormat(StringRef FormatStr);
@@ -31,6 +31,9 @@ LLVM_ABI Expected<Format> parseFormat(StringRef FormatStr);
 /// Parse and validate a magic number to a remark format.
 LLVM_ABI Expected<Format> magicToFormat(StringRef Magic);
 
+/// Detect format based on selected format and magic number
+LLVM_ABI Expected<Format> detectFormat(Format Selected, StringRef Magic);
+
 } // end namespace remarks
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/Remarks/RemarkLinker.h b/llvm/include/llvm/Remarks/RemarkLinker.h
index 5343c6214470..67208f40592a 100644
--- a/llvm/include/llvm/Remarks/RemarkLinker.h
+++ b/llvm/include/llvm/Remarks/RemarkLinker.h
@@ -80,13 +80,12 @@ public:
   /// \p Buffer.
   /// \p Buffer can be either a standalone remark container or just
   /// metadata. This takes care of uniquing and merging the remarks.
-  LLVM_ABI Error link(StringRef Buffer,
-                      std::optional<Format> RemarkFormat = std::nullopt);
+  LLVM_ABI Error link(StringRef Buffer, Format RemarkFormat = Format::Auto);
 
   /// Link the remarks found in \p Obj by looking for the right section and
   /// calling the method above.
   LLVM_ABI Error link(const object::ObjectFile &Obj,
-                      std::optional<Format> RemarkFormat = std::nullopt);
+                      Format RemarkFormat = Format::Auto);
 
   /// Serialize the linked remarks to the stream \p OS, using the format \p
   /// RemarkFormat.
diff --git a/llvm/lib/Remarks/RemarkFormat.cpp b/llvm/lib/Remarks/RemarkFormat.cpp
index 800f5bffe70d..1c52e352f939 100644
--- a/llvm/lib/Remarks/RemarkFormat.cpp
+++ b/llvm/lib/Remarks/RemarkFormat.cpp
@@ -42,6 +42,22 @@ Expected<Format> llvm::remarks::magicToFormat(StringRef MagicStr) {
 
   if (Result == Format::Unknown)
     return createStringError(std::make_error_code(std::errc::invalid_argument),
-                             "Unknown remark magic: '%s'", MagicStr.data());
+                             "Automatic detection of remark format failed. "
+                             "Unknown magic number: '%.4s'",
+                             MagicStr.data());
   return Result;
 }
+
+Expected<Format> llvm::remarks::detectFormat(Format Selected,
+                                             StringRef MagicStr) {
+  if (Selected == Format::Unknown)
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
+                             "Unknown remark parser format.");
+  if (Selected != Format::Auto)
+    return Selected;
+
+  // Empty files are valid bitstream files
+  if (MagicStr.empty())
+    return Format::Bitstream;
+  return magicToFormat(MagicStr);
+}
diff --git a/llvm/lib/Remarks/RemarkLinker.cpp b/llvm/lib/Remarks/RemarkLinker.cpp
index b8395aa135d8..0ca6217edfdd 100644
--- a/llvm/lib/Remarks/RemarkLinker.cpp
+++ b/llvm/lib/Remarks/RemarkLinker.cpp
@@ -66,17 +66,10 @@ void RemarkLinker::setExternalFilePrependPath(StringRef PrependPathIn) {
   PrependPath = std::string(PrependPathIn);
 }
 
-Error RemarkLinker::link(StringRef Buffer, std::optional<Format> RemarkFormat) {
-  if (!RemarkFormat) {
-    Expected<Format> ParserFormat = magicToFormat(Buffer);
-    if (!ParserFormat)
-      return ParserFormat.takeError();
-    RemarkFormat = *ParserFormat;
-  }
-
+Error RemarkLinker::link(StringRef Buffer, Format RemarkFormat) {
   Expected<std::unique_ptr<RemarkParser>> MaybeParser =
       createRemarkParserFromMeta(
-          *RemarkFormat, Buffer,
+          RemarkFormat, Buffer,
           PrependPath ? std::optional<StringRef>(StringRef(*PrependPath))
                       : std::optional<StringRef>());
   if (!MaybeParser)
@@ -102,8 +95,7 @@ Error RemarkLinker::link(StringRef Buffer, std::optional<Format> RemarkFormat) {
   return Error::success();
 }
 
-Error RemarkLinker::link(const object::ObjectFile &Obj,
-                         std::optional<Format> RemarkFormat) {
+Error RemarkLinker::link(const object::ObjectFile &Obj, Format RemarkFormat) {
   Expected<std::optional<StringRef>> SectionOrErr =
       getRemarksSectionContents(Obj);
   if (!SectionOrErr)
diff --git a/llvm/lib/Remarks/RemarkParser.cpp b/llvm/lib/Remarks/RemarkParser.cpp
index 5c1690aaa0fe..038fc1d3f485 100644
--- a/llvm/lib/Remarks/RemarkParser.cpp
+++ b/llvm/lib/Remarks/RemarkParser.cpp
@@ -15,6 +15,7 @@
 #include "BitstreamRemarkParser.h"
 #include "YAMLRemarkParser.h"
 #include "llvm-c/Remarks.h"
+#include "llvm/Remarks/RemarkFormat.h"
 #include "llvm/Support/CBindingWrapping.h"
 #include <optional>
 
@@ -50,14 +51,18 @@ Expected<StringRef> ParsedStringTable::operator[](size_t Index) const {
 
 Expected<std::unique_ptr<RemarkParser>>
 llvm::remarks::createRemarkParser(Format ParserFormat, StringRef Buf) {
-  switch (ParserFormat) {
+  auto DetectedFormat = detectFormat(ParserFormat, Buf);
+  if (!DetectedFormat)
+    return DetectedFormat.takeError();
+
+  switch (*DetectedFormat) {
   case Format::YAML:
     return std::make_unique<YAMLRemarkParser>(Buf);
   case Format::Bitstream:
     return std::make_unique<BitstreamRemarkParser>(Buf);
   case Format::Unknown:
-    return createStringError(std::make_error_code(std::errc::invalid_argument),
-                             "Unknown remark parser format.");
+  case Format::Auto:
+    break;
   }
   llvm_unreachable("unhandled ParseFormat");
 }
@@ -66,15 +71,19 @@ Expected<std::unique_ptr<RemarkParser>>
 llvm::remarks::createRemarkParserFromMeta(
     Format ParserFormat, StringRef Buf,
     std::optional<StringRef> ExternalFilePrependPath) {
-  switch (ParserFormat) {
+  auto DetectedFormat = detectFormat(ParserFormat, Buf);
+  if (!DetectedFormat)
+    return DetectedFormat.takeError();
+
+  switch (*DetectedFormat) {
   case Format::YAML:
     return createYAMLParserFromMeta(Buf, std::move(ExternalFilePrependPath));
   case Format::Bitstream:
     return createBitstreamParserFromMeta(Buf,
                                          std::move(ExternalFilePrependPath));
   case Format::Unknown:
-    return createStringError(std::make_error_code(std::errc::invalid_argument),
-                             "Unknown remark parser format.");
+  case Format::Auto:
+    break;
   }
   llvm_unreachable("unhandled ParseFormat");
 }
diff --git a/llvm/lib/Remarks/RemarkSerializer.cpp b/llvm/lib/Remarks/RemarkSerializer.cpp
index cc10b91f287a..df1da53d7c8a 100644
--- a/llvm/lib/Remarks/RemarkSerializer.cpp
+++ b/llvm/lib/Remarks/RemarkSerializer.cpp
@@ -22,8 +22,9 @@ remarks::createRemarkSerializer(Format RemarksFormat, SerializerMode Mode,
                                 raw_ostream &OS) {
   switch (RemarksFormat) {
   case Format::Unknown:
+  case Format::Auto:
     return createStringError(std::errc::invalid_argument,
-                             "Unknown remark serializer format.");
+                             "Invalid remark serializer format.");
   case Format::YAML:
     return std::make_unique<YAMLRemarkSerializer>(OS, Mode);
   case Format::Bitstream:
@@ -37,8 +38,9 @@ remarks::createRemarkSerializer(Format RemarksFormat, SerializerMode Mode,
                                 raw_ostream &OS, remarks::StringTable StrTab) {
   switch (RemarksFormat) {
   case Format::Unknown:
+  case Format::Auto:
     return createStringError(std::errc::invalid_argument,
-                             "Unknown remark serializer format.");
+                             "Invalid remark serializer format.");
   case Format::YAML:
     return std::make_unique<YAMLRemarkSerializer>(OS, Mode, std::move(StrTab));
   case Format::Bitstream:
diff --git a/llvm/test/tools/llvm-remarkutil/Inputs/broken-remark-magic.bitstream b/llvm/test/tools/llvm-remarkutil/Inputs/broken-remark-magic.bitstream
new file mode 100644
index 000000000000..97b5955f788b
--- /dev/null
+++ b/llvm/test/tools/llvm-remarkutil/Inputs/broken-remark-magic.bitstream
@@ -0,0 +1 @@
+12345678
diff --git a/llvm/test/tools/llvm-remarkutil/annotation-count.test b/llvm/test/tools/llvm-remarkutil/annotation-count.test
index e006220c64f3..ee44ed2035c8 100644
--- a/llvm/test/tools/llvm-remarkutil/annotation-count.test
+++ b/llvm/test/tools/llvm-remarkutil/annotation-count.test
@@ -1,5 +1,7 @@
 RUN: llvm-remarkutil annotation-count --parser=yaml --annotation-type=remark %p/Inputs/annotation-count.yaml | FileCheck %s
+RUN: llvm-remarkutil annotation-count --annotation-type=remark %p/Inputs/annotation-count.yaml | FileCheck %s
 RUN: llvm-remarkutil yaml2bitstream %p/Inputs/annotation-count.yaml | llvm-remarkutil annotation-count --parser=bitstream --annotation-type=remark | FileCheck %s
+RUN: llvm-remarkutil yaml2bitstream %p/Inputs/annotation-count.yaml | llvm-remarkutil annotation-count --annotation-type=remark | FileCheck %s
 RUN: llvm-remarkutil count --parser=yaml --count-by=arg --group-by=function --remark-name="AnnotationSummary" %p/Inputs/annotation-count.yaml | FileCheck %s --check-prefix=COUNT-CHECK
 RUN: llvm-remarkutil yaml2bitstream %p/Inputs/annotation-count.yaml | llvm-remarkutil count --parser=bitstream --count-by=arg --group-by=function --remark-name="AnnotationSummary" | FileCheck %s --check-prefix=COUNT-CHECK
 
diff --git a/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test
new file mode 100644
index 000000000000..f469eadc07f9
--- /dev/null
+++ b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test
@@ -0,0 +1,6 @@
+RUN: not llvm-remarkutil instruction-count %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s
+RUN: not llvm-remarkutil instruction-mix %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s
+RUN: not llvm-remarkutil annotation-count --annotation-type=remark %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s
+RUN: not llvm-remarkutil count %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s
+
+CHECK: error: Automatic detection of remark format failed. Unknown magic number: '1234'
diff --git a/llvm/test/tools/llvm-remarkutil/empty-file.test b/llvm/test/tools/llvm-remarkutil/empty-file.test
index bdc5fcf87f7b..d9820a088ea8 100644
--- a/llvm/test/tools/llvm-remarkutil/empty-file.test
+++ b/llvm/test/tools/llvm-remarkutil/empty-file.test
@@ -8,6 +8,11 @@ RUN: llvm-remarkutil instruction-count --parser=bitstream %p/Inputs/empty-file -
 RUN: llvm-remarkutil instruction-mix --parser=bitstream %p/Inputs/empty-file --report_style=csv -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=MIXBITSTREAM
 RUN: llvm-remarkutil annotation-count --parser=bitstream --annotation-type=remark %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=ANNOTATIONBITSTREAM
 RUN: llvm-remarkutil count --parser=bitstream %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=COUNTBITSTREAM
+; Parser format auto-detection should treat empty files as bitstream files
+RUN: llvm-remarkutil instruction-count %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=SIZEBITSTREAM
+RUN: llvm-remarkutil instruction-mix %p/Inputs/empty-file --report_style=csv -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=MIXBITSTREAM
+RUN: llvm-remarkutil annotation-count --annotation-type=remark %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=ANNOTATIONBITSTREAM
+RUN: llvm-remarkutil count %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=COUNTBITSTREAM
 
 ; YAMLPARSER: error: document root is not of mapping type.
 
diff --git a/llvm/test/tools/llvm-remarkutil/instruction-count.test b/llvm/test/tools/llvm-remarkutil/instruction-count.test
index d94f4f94cc1d..a0aa6dc98c44 100644
--- a/llvm/test/tools/llvm-remarkutil/instruction-count.test
+++ b/llvm/test/tools/llvm-remarkutil/instruction-count.test
@@ -1,5 +1,7 @@
 RUN: llvm-remarkutil instruction-count --parser=yaml %p/Inputs/instruction-count.yaml | FileCheck %s
+RUN: llvm-remarkutil instruction-count %p/Inputs/instruction-count.yaml | FileCheck %s
 RUN: llvm-remarkutil yaml2bitstream %p/Inputs/instruction-count.yaml | llvm-remarkutil instruction-count --parser=bitstream | FileCheck %s
+RUN: llvm-remarkutil yaml2bitstream %p/Inputs/instruction-count.yaml | llvm-remarkutil instruction-count | FileCheck %s
 RUN: llvm-remarkutil count --parser=yaml --count-by=arg --group-by=function --remark-name="InstructionCount" %p/Inputs/instruction-count.yaml | FileCheck %s --check-prefix=COUNT-CHECK
 RUN: llvm-remarkutil yaml2bitstream %p/Inputs/instruction-count.yaml  | llvm-remarkutil count --parser=bitstream --count-by=arg --group-by=function --remark-name="InstructionCount" | FileCheck %s --check-prefix=COUNT-CHECK
 RUN: not llvm-remarkutil count --parser=yaml --count-by=arg --group-by=function --rremark-name=* %p/Inputs/instruction-count.yaml 2>&1 | FileCheck %s --check-prefix=ERROR-REPOPERATOR -DARG=rremark-name
@@ -18,4 +20,4 @@ RUN: not llvm-remarkutil count --parser=yaml --count-by=arg --group-by=function
 ; COUNT-CHECK: func3,3
 
 ; ERROR-REPOPERATOR: error: invalid argument '--[[ARG]]=*': repetition-operator operand invalid
-; ERROR-BOTHFILTERS: error: conflicting arguments: --remark-name and --rremark-name
\ No newline at end of file
+; ERROR-BOTHFILTERS: error: conflicting arguments: --remark-name and --rremark-name
diff --git a/llvm/test/tools/llvm-remarkutil/instruction-mix.test b/llvm/test/tools/llvm-remarkutil/instruction-mix.test
index 178c1311b2fe..15994679f5d4 100644
--- a/llvm/test/tools/llvm-remarkutil/instruction-mix.test
+++ b/llvm/test/tools/llvm-remarkutil/instruction-mix.test
@@ -1,5 +1,7 @@
 RUN: llvm-remarkutil instruction-mix --parser=yaml %p/Inputs/instruction-mix.yaml | FileCheck %s
+RUN: llvm-remarkutil instruction-mix %p/Inputs/instruction-mix.yaml | FileCheck %s
 RUN: llvm-remarkutil yaml2bitstream %p/Inputs/instruction-mix.yaml | llvm-remarkutil instruction-mix --parser=bitstream | FileCheck %s
+RUN: llvm-remarkutil yaml2bitstream %p/Inputs/instruction-mix.yaml | llvm-remarkutil instruction-mix | FileCheck %s
 RUN: llvm-remarkutil instruction-mix --parser=yaml %p/Inputs/instruction-mix.yaml --report_style=human | FileCheck %s
 RUN: llvm-remarkutil instruction-mix --parser=yaml %p/Inputs/instruction-mix.yaml --report_style=csv | FileCheck %s --check-prefix=CSV
 RUN: llvm-remarkutil instruction-mix --parser=yaml %p/Inputs/instruction-mix.yaml --rfilter=meow | FileCheck %s --check-prefix=MEOW-RE
@@ -34,4 +36,4 @@ RUN: not llvm-remarkutil instruction-mix --parser=yaml %p/Inputs/instruction-mix
 ; NONE-EXACT: ----------- -----
 ; NONE-NOT: {{.*}}
 
-; ERROR: error: invalid argument '--rfilter=*': repetition-operator operand invalid
\ No newline at end of file
+; ERROR: error: invalid argument '--rfilter=*': repetition-operator operand invalid
diff --git a/llvm/test/tools/llvm-remarkutil/size-diff/no-difference.test b/llvm/test/tools/llvm-remarkutil/size-diff/no-difference.test
index a9b6ba4ae256..8550339bebc4 100644
--- a/llvm/test/tools/llvm-remarkutil/size-diff/no-difference.test
+++ b/llvm/test/tools/llvm-remarkutil/size-diff/no-difference.test
@@ -1,4 +1,7 @@
 RUN: llvm-remarkutil size-diff %p/Inputs/1-func-1-instr-1-stack.yaml %p/Inputs/1-func-1-instr-1-stack.yaml --parser=yaml | FileCheck -strict-whitespace %s
+RUN: llvm-remarkutil size-diff %p/Inputs/1-func-1-instr-1-stack.yaml %p/Inputs/1-func-1-instr-1-stack.yaml | FileCheck -strict-whitespace %s
+RUN: llvm-remarkutil yaml2bitstream -o %t.bitstream %p/Inputs/1-func-1-instr-1-stack.yaml
+RUN: llvm-remarkutil size-diff %t.bitstream %p/Inputs/1-func-1-instr-1-stack.yaml | FileCheck -strict-whitespace %s
 
 ; Same file passed twice -> no changes reported.
 
diff --git a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h
index eb393bc3e304..894ac8354e18 100644
--- a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h
+++ b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h
@@ -35,9 +35,12 @@
 // Keep Input format and names consistent accross the modes via a macro.
 #define INPUT_FORMAT_COMMAND_LINE_OPTIONS(SUBOPT)                              \
   static cl::opt<Format> InputFormat(                                          \
-      "parser", cl::desc("Input remark format to parse"),                      \
-      cl::values(clEnumValN(Format::YAML, "yaml", "YAML"),                     \
-                 clEnumValN(Format::Bitstream, "bitstream", "Bitstream")),     \
+      "parser", cl::init(Format::Auto),                                        \
+      cl::desc("Input remark format to parse"),                                \
+      cl::values(                                                              \
+          clEnumValN(Format::Auto, "auto", "Automatic detection (default)"),   \
+          clEnumValN(Format::YAML, "yaml", "YAML"),                            \
+          clEnumValN(Format::Bitstream, "bitstream", "Bitstream")),            \
       cl::sub(SUBOPT));
 
 #define DEBUG_LOC_INFO_COMMAND_LINE_OPTIONS(SUBOPT)                            \
diff --git a/llvm/unittests/Remarks/RemarksLinkingTest.cpp b/llvm/unittests/Remarks/RemarksLinkingTest.cpp
index dcd598aaeb5c..89de9e8f4f95 100644
--- a/llvm/unittests/Remarks/RemarksLinkingTest.cpp
+++ b/llvm/unittests/Remarks/RemarksLinkingTest.cpp
@@ -243,10 +243,8 @@ TEST(Remarks, LinkingError) {
     // Check that the prepend path is propagated and fails with the full path.
     // Also ensures that the remark format is correctly auto-detected.
     RL.setExternalFilePrependPath("/baddir/");
-    Error E = RL.link(
-        StringRef("REMARKS\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0badfile.opt.yaml",
-                  40),
-        /*RemarkFormat=*/std::nullopt);
+    Error E = RL.link(StringRef(
+        "REMARKS\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0badfile.opt.yaml", 40));
     EXPECT_TRUE(static_cast<bool>(E));
     std::string ErrorMessage = toString(std::move(E));
     EXPECT_EQ(StringRef(ErrorMessage).lower(),

From 22a69a266d8206b1585dd82d466cd96d01725a65 Mon Sep 17 00:00:00 2001
From: Justin King <jcking@google.com>
Date: Wed, 18 Jun 2025 12:57:49 -0700
Subject: [PATCH 0860/1322] lsan: Support free_sized and free_aligned_sized
 from C23 (#144604)

Adds support to LSan for `free_sized` and `free_aligned_sized` from C23.

Other sanitizers will be handled with their own separate PRs.

For https://github.com/llvm/llvm-project/issues/144435

This is attempt number 2.

Signed-off-by: Justin King <jcking@google.com>
---
 compiler-rt/lib/lsan/lsan_allocator.cpp       |  4 +++
 compiler-rt/lib/lsan/lsan_allocator.h         |  2 ++
 compiler-rt/lib/lsan/lsan_interceptors.cpp    | 31 +++++++++++++++++++
 compiler-rt/lib/lsan/lsan_malloc_mac.cpp      | 23 ++++++++------
 .../sanitizer_common/sanitizer_malloc_mac.inc | 16 ++++++++++
 .../sanitizer_platform_interceptors.h         | 11 +++++++
 .../TestCases/Linux/free_aligned_sized.c      |  2 +-
 .../TestCases/Linux/free_sized.c              |  2 +-
 8 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/compiler-rt/lib/lsan/lsan_allocator.cpp b/compiler-rt/lib/lsan/lsan_allocator.cpp
index 493bf5f9efc5..a436d9c07ac6 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.cpp
+++ b/compiler-rt/lib/lsan/lsan_allocator.cpp
@@ -220,6 +220,10 @@ void lsan_free(void *p) {
   Deallocate(p);
 }
 
+void lsan_free_sized(void *p, uptr) { Deallocate(p); }
+
+void lsan_free_aligned_sized(void *p, uptr, uptr) { Deallocate(p); }
+
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack) {
   return SetErrnoOnNull(Reallocate(stack, p, size, 1));
 }
diff --git a/compiler-rt/lib/lsan/lsan_allocator.h b/compiler-rt/lib/lsan/lsan_allocator.h
index 5eed0cbdb309..2342f11fb5d0 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.h
+++ b/compiler-rt/lib/lsan/lsan_allocator.h
@@ -127,6 +127,8 @@ void *lsan_aligned_alloc(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_memalign(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_malloc(uptr size, const StackTrace &stack);
 void lsan_free(void *p);
+void lsan_free_sized(void *p, uptr size);
+void lsan_free_aligned_sized(void *p, uptr alignment, uptr size);
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack);
 void *lsan_reallocarray(void *p, uptr nmemb, uptr size,
                         const StackTrace &stack);
diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp
index a8252cddacf2..6da9d0d9d24f 100644
--- a/compiler-rt/lib/lsan/lsan_interceptors.cpp
+++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp
@@ -84,6 +84,35 @@ INTERCEPTOR(void, free, void *p) {
   lsan_free(p);
 }
 
+#  if SANITIZER_INTERCEPT_FREE_SIZED
+INTERCEPTOR(void, free_sized, void *p, uptr size) {
+  if (UNLIKELY(!p))
+    return;
+  if (DlsymAlloc::PointerIsMine(p))
+    return DlsymAlloc::Free(p);
+  ENSURE_LSAN_INITED;
+  lsan_free_sized(p, size);
+}
+#    define LSAN_MAYBE_INTERCEPT_FREE_SIZED INTERCEPT_FUNCTION(free_sized)
+#  else
+#    define LSAN_MAYBE_INTERCEPT_FREE_SIZED
+#  endif
+
+#  if SANITIZER_INTERCEPT_FREE_ALIGNED_SIZED
+INTERCEPTOR(void, free_aligned_sized, void *p, uptr alignment, uptr size) {
+  if (UNLIKELY(!p))
+    return;
+  if (DlsymAlloc::PointerIsMine(p))
+    return DlsymAlloc::Free(p);
+  ENSURE_LSAN_INITED;
+  lsan_free_aligned_sized(p, alignment, size);
+}
+#    define LSAN_MAYBE_INTERCEPT_FREE_ALIGNED_SIZED \
+      INTERCEPT_FUNCTION(free_aligned_sized)
+#  else
+#    define LSAN_MAYBE_INTERCEPT_FREE_ALIGNED_SIZED
+#  endif
+
 INTERCEPTOR(void*, calloc, uptr nmemb, uptr size) {
   if (DlsymAlloc::Use())
     return DlsymAlloc::Callocate(nmemb, size);
@@ -547,6 +576,8 @@ void InitializeInterceptors() {
 
   INTERCEPT_FUNCTION(malloc);
   INTERCEPT_FUNCTION(free);
+  LSAN_MAYBE_INTERCEPT_FREE_SIZED;
+  LSAN_MAYBE_INTERCEPT_FREE_ALIGNED_SIZED;
   LSAN_MAYBE_INTERCEPT_CFREE;
   INTERCEPT_FUNCTION(calloc);
   INTERCEPT_FUNCTION(realloc);
diff --git a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
index 525c30272ccc..8a16c053da23 100644
--- a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
+++ b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
@@ -44,16 +44,19 @@ using namespace __lsan;
   void *p = lsan_valloc(size, stack)
 #define COMMON_MALLOC_FREE(ptr) \
   lsan_free(ptr)
-#define COMMON_MALLOC_SIZE(ptr) \
-  uptr size = lsan_mz_size(ptr)
-#define COMMON_MALLOC_FILL_STATS(zone, stats)
-#define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name) \
-  (void)zone_name; \
-  Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", ptr);
-#define COMMON_MALLOC_NAMESPACE __lsan
-#define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
-#define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
+#  define COMMON_MALLOC_FREE_SIZED(ptr, size) lsan_free_sized(ptr, size)
+#  define COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size) \
+    lsan_free_aligned_sized(ptr, alignment, size)
+#  define COMMON_MALLOC_SIZE(ptr) uptr size = lsan_mz_size(ptr)
+#  define COMMON_MALLOC_FILL_STATS(zone, stats)
+#  define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name)    \
+    (void)zone_name;                                                        \
+    Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", \
+           ptr);
+#  define COMMON_MALLOC_NAMESPACE __lsan
+#  define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
+#  define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
 
-#include "sanitizer_common/sanitizer_malloc_mac.inc"
+#  include "sanitizer_common/sanitizer_malloc_mac.inc"
 
 #endif // SANITIZER_APPLE
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
index 6343eb284afb..be27584f2053 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
@@ -144,6 +144,22 @@ INTERCEPTOR(void, free, void *ptr) {
   COMMON_MALLOC_FREE(ptr);
 }
 
+#if SANITIZER_INTERCEPT_FREE_SIZED && defined(COMMON_MALLOC_FREE_SIZED)
+INTERCEPTOR(void, free_sized, void *ptr, size_t size) {
+  COMMON_MALLOC_ENTER();
+  COMMON_MALLOC_FREE_SIZED(ptr, size);
+}
+#endif
+
+#if SANITIZER_INTERCEPT_FREE_ALIGNED_SIZED && \
+    defined(COMMON_MALLOC_FREE_ALIGNED_SIZED)
+INTERCEPTOR(void, free_aligned_sized, void *ptr, size_t alignment,
+            size_t size) {
+  COMMON_MALLOC_ENTER();
+  COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size);
+}
+#endif
+
 INTERCEPTOR(void *, realloc, void *ptr, size_t size) {
   COMMON_MALLOC_ENTER();
   COMMON_MALLOC_REALLOC(ptr, size);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
index ccc808b60ca7..29987decdff4 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -663,6 +663,17 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_GETSERVBYNAME_R SI_GLIBC
 #define SANITIZER_INTERCEPT_GETSERVBYPORT_R SI_GLIBC
 
+// Until free_sized and free_aligned_sized are more generally available,
+// we can only unconditionally intercept on ELF-based platforms where it
+// is okay to have undefined weak symbols.
+#ifdef __ELF__
+#  define SANITIZER_INTERCEPT_FREE_SIZED 1
+#  define SANITIZER_INTERCEPT_FREE_ALIGNED_SIZED 1
+#else
+#  define SANITIZER_INTERCEPT_FREE_SIZED 0
+#  define SANITIZER_INTERCEPT_FREE_ALIGNED_SIZED 0
+#endif
+
 // This macro gives a way for downstream users to override the above
 // interceptor macros irrespective of the platform they are on. They have
 // to do two things:
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c
index f4c6c0f973bd..e9cb6f20c5ea 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c
@@ -1,5 +1,5 @@
 // RUN: %clang -std=c23 -O0 %s -o %t && %run %t
-// UNSUPPORTED: asan, hwasan, rtsan, tsan, msan, lsan, ubsan
+// UNSUPPORTED: asan, hwasan, rtsan, tsan, msan, ubsan
 
 #include <stddef.h>
 #include <stdlib.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c
index 0ee2289684d0..8cdf3216e528 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c
@@ -1,5 +1,5 @@
 // RUN: %clang -std=c23 -O0 %s -o %t && %run %t
-// UNSUPPORTED: asan, hwasan, rtsan, tsan, msan, lsan, ubsan
+// UNSUPPORTED: asan, hwasan, rtsan, tsan, msan, ubsan
 
 #include <stddef.h>
 #include <stdlib.h>

From 23b8f11b27f1345cfdd9d03c9ebaccaf81897764 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 18 Jun 2025 20:59:15 +0100
Subject: [PATCH 0861/1322] [VPlan] Remove redundant VPWidenRecipe constructors
 (NFC)

Since the removal of VPWidenEVLRecipe, the constructors taking a
VPDefOpcode are not needed any more. Remove them.
---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 21 ++++++-------------
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  5 ++---
 2 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f3306ad7cb8e..ab5ff82a7720 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1304,24 +1304,15 @@ protected:
 class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
   unsigned Opcode;
 
-protected:
-  VPWidenRecipe(unsigned VPDefOpcode, Instruction &I,
-                ArrayRef<VPValue *> Operands)
-      : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I),
-        Opcode(I.getOpcode()) {}
-
-  VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode,
-                ArrayRef<VPValue *> Operands, bool NUW, bool NSW, DebugLoc DL)
-      : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL),
+public:
+  VPWidenRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands,
+                const VPIRFlags &Flags, DebugLoc DL)
+      : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL),
         Opcode(Opcode) {}
 
-public:
   VPWidenRecipe(Instruction &I, ArrayRef<VPValue *> Operands)
-      : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
-
-  VPWidenRecipe(unsigned Opcode, ArrayRef<VPValue *> Operands, bool NUW,
-                bool NSW, DebugLoc DL)
-      : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {}
+      : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPIRMetadata(I),
+        Opcode(I.getOpcode()) {}
 
   ~VPWidenRecipe() override = default;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 11f0f2a93032..cc73ae44f9c0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2703,9 +2703,8 @@ expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
   }
 
   std::array<VPValue *, 2> MulOps = {Op0, Op1};
-  auto *Mul = new VPWidenRecipe(
-      Instruction::Mul, ArrayRef(MulOps), MulAcc->hasNoUnsignedWrap(),
-      MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc());
+  auto *Mul = new VPWidenRecipe(Instruction::Mul, ArrayRef(MulOps), *MulAcc,
+                                MulAcc->getDebugLoc());
   Mul->insertBefore(MulAcc);
 
   auto *Red = new VPReductionRecipe(

From a630ca6f6c4727d852d60076d1179c3e9830ca2f Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Wed, 18 Jun 2025 13:06:20 -0700
Subject: [PATCH 0862/1322] [lldb][breakpoint] Grey out disabled breakpoints
 (#91404)

This commit adds colour settings to the list of breakpoints in order to
grey out breakpoints that have been disabled.
---
 lldb/include/lldb/Core/Debugger.h             |  4 +++
 lldb/source/Breakpoint/Breakpoint.cpp         | 15 +++++++++++
 lldb/source/Core/CoreProperties.td            | 16 ++++++++++++
 lldb/source/Core/Debugger.cpp                 | 12 +++++++++
 .../API/terminal/TestDisabledBreakpoints.py   | 25 +++++++++++++++++++
 5 files changed, 72 insertions(+)
 create mode 100644 lldb/test/API/terminal/TestDisabledBreakpoints.py

diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h
index d73aba1e3ce5..2087ef2a1156 100644
--- a/lldb/include/lldb/Core/Debugger.h
+++ b/lldb/include/lldb/Core/Debugger.h
@@ -307,6 +307,10 @@ public:
 
   llvm::StringRef GetShowProgressAnsiSuffix() const;
 
+  llvm::StringRef GetDisabledAnsiPrefix() const;
+
+  llvm::StringRef GetDisabledAnsiSuffix() const;
+
   bool GetUseAutosuggestion() const;
 
   llvm::StringRef GetAutosuggestionAnsiPrefix() const;
diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp
index 337c1a4ac401..2ed0c9314e3e 100644
--- a/lldb/source/Breakpoint/Breakpoint.cpp
+++ b/lldb/source/Breakpoint/Breakpoint.cpp
@@ -15,6 +15,7 @@
 #include "lldb/Breakpoint/BreakpointResolver.h"
 #include "lldb/Breakpoint/BreakpointResolverFileLine.h"
 #include "lldb/Core/Address.h"
+#include "lldb/Core/Debugger.h"
 #include "lldb/Core/Module.h"
 #include "lldb/Core/ModuleList.h"
 #include "lldb/Core/SearchFilter.h"
@@ -26,6 +27,7 @@
 #include "lldb/Target/SectionLoadList.h"
 #include "lldb/Target/Target.h"
 #include "lldb/Target/ThreadSpec.h"
+#include "lldb/Utility/AnsiTerminal.h"
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/Stream.h"
@@ -838,6 +840,13 @@ void Breakpoint::GetDescription(Stream *s, lldb::DescriptionLevel level,
                                 bool show_locations) {
   assert(s != nullptr);
 
+  const bool dim_breakpoint_description =
+      !IsEnabled() && s->AsRawOstream().colors_enabled();
+  if (dim_breakpoint_description)
+    s->Printf("%s", ansi::FormatAnsiTerminalCodes(
+                        GetTarget().GetDebugger().GetDisabledAnsiPrefix())
+                        .c_str());
+
   if (!m_kind_description.empty()) {
     if (level == eDescriptionLevelBrief) {
       s->PutCString(GetBreakpointKind());
@@ -934,6 +943,12 @@ void Breakpoint::GetDescription(Stream *s, lldb::DescriptionLevel level,
     }
     s->IndentLess();
   }
+
+  // Reset the colors back to normal if they were previously greyed out.
+  if (dim_breakpoint_description)
+    s->Printf("%s", ansi::FormatAnsiTerminalCodes(
+                        GetTarget().GetDebugger().GetDisabledAnsiSuffix())
+                        .c_str());
 }
 
 void Breakpoint::GetResolverDescription(Stream *s) {
diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td
index 4d1ea5dfec2e..53dd333f045c 100644
--- a/lldb/source/Core/CoreProperties.td
+++ b/lldb/source/Core/CoreProperties.td
@@ -191,6 +191,22 @@ let Definition = "debugger" in {
             "${separator}${thread.stop-reason}}{ "
             "${separator}{${progress.count} }${progress.message}}">,
         Desc<"The default statusline format string.">;
+
+  def ShowDisabledAnsiPrefix
+      : Property<"disable-ansi-prefix", "String">,
+        Global,
+        DefaultStringValue<"${ansi.faint}">,
+        Desc<"If something has been disabled in a color-enabled terminal, use "
+             "the ANSI terminal code specified immediately before whatever has "
+             "been disabled.">;
+  def ShowDisabledAnsiSuffix
+      : Property<"disable-ansi-suffix", "String">,
+        Global,
+        DefaultStringValue<"${ansi.normal}">,
+        Desc<"When somehing has been disabled in a color-enabled terminal, use "
+             "the ANSI terminal code specified immediately after whatever has "
+             "been disabled.">;
+
   def UseSourceCache: Property<"use-source-cache", "Boolean">,
     Global,
     DefaultTrue,
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index 81037d3def81..c9935f2d745f 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -510,6 +510,18 @@ llvm::StringRef Debugger::GetSeparator() const {
       idx, g_debugger_properties[idx].default_cstr_value);
 }
 
+llvm::StringRef Debugger::GetDisabledAnsiPrefix() const {
+  const uint32_t idx = ePropertyShowDisabledAnsiPrefix;
+  return GetPropertyAtIndexAs<llvm::StringRef>(
+      idx, g_debugger_properties[idx].default_cstr_value);
+}
+
+llvm::StringRef Debugger::GetDisabledAnsiSuffix() const {
+  const uint32_t idx = ePropertyShowDisabledAnsiSuffix;
+  return GetPropertyAtIndexAs<llvm::StringRef>(
+      idx, g_debugger_properties[idx].default_cstr_value);
+}
+
 bool Debugger::SetSeparator(llvm::StringRef s) {
   constexpr uint32_t idx = ePropertySeparator;
   bool ret = SetPropertyAtIndex(idx, s);
diff --git a/lldb/test/API/terminal/TestDisabledBreakpoints.py b/lldb/test/API/terminal/TestDisabledBreakpoints.py
new file mode 100644
index 000000000000..a644c94c8a17
--- /dev/null
+++ b/lldb/test/API/terminal/TestDisabledBreakpoints.py
@@ -0,0 +1,25 @@
+"""
+Test that disabling breakpoints and viewing them in a list uses the correct ANSI color settings when colors are enabled and disabled.
+"""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+from lldbsuite.test.lldbpexpect import PExpectTest
+
+import io
+
+
+class DisabledBreakpointsTest(PExpectTest):
+    @add_test_categories(["pexpect"])
+    def test_disabling_breakpoints_with_color(self):
+        """Test that disabling a breakpoint and viewing the breakpoints list uses the specified ANSI color prefix."""
+        ansi_red_color_code = "\x1b[31m"
+
+        self.launch(use_colors=True, dimensions=(100, 100))
+        self.expect('settings set disable-ansi-prefix "${ansi.fg.red}"')
+        self.expect("b main")
+        self.expect("br dis")
+        self.expect("br l", substrs=[ansi_red_color_code + "1:"])
+        self.quit()

From a88e655809655eec8fa85366318fb3c4a0baa113 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Wed, 18 Jun 2025 13:08:05 -0700
Subject: [PATCH 0863/1322] [llvm] build Blake3 source with LLVM_EXPORTS
 defined (#144753)

## Purpose
This patch ensures that the BLAKE3 implementation in the LLVM Support
library exports its public interface with `__declspec(dllexport)` when
building LLVM as a Windows DLL.

## Background
The effort to support building LLVM as a Windows DLL is tracked in
#109483. Additional context is provided in [this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307).

## Overview
Replicate [this
logic](https://github.com/llvm/llvm-project/blob/main/llvm/cmake/modules/AddLLVM.cmake#L662-L664)
from `llvm_add_library()` for the `LLVMSupportBlake3` target. Without
this change, the `llvm_blake_` functions will only be annotated with
`__declspec(dllimport)` when building LLVM as a Windows DLL which leads
to inconsistent DLL linkage warnings from MSVC and `clang-cl`.
---
 llvm/lib/Support/BLAKE3/CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/lib/Support/BLAKE3/CMakeLists.txt b/llvm/lib/Support/BLAKE3/CMakeLists.txt
index 99cb78881ec4..eae2b0280e5d 100644
--- a/llvm/lib/Support/BLAKE3/CMakeLists.txt
+++ b/llvm/lib/Support/BLAKE3/CMakeLists.txt
@@ -85,3 +85,9 @@ endif()
 add_library(LLVMSupportBlake3 OBJECT EXCLUDE_FROM_ALL ${LLVM_BLAKE3_FILES})
 set_target_properties(LLVMSupportBlake3 PROPERTIES FOLDER "LLVM/Libraries")
 llvm_update_compile_flags(LLVMSupportBlake3)
+if(LLVM_BUILD_LLVM_DYLIB OR BUILD_SHARED_LIBS)
+  # Since LLVMSupportBlake3 is not defined using llvm_add_library(), we must
+  # define LLVM_EXPORTS here so its public interface is annotated with
+  # __declspec(dllexport) when building as a DLL on Windows.
+  target_compile_definitions(LLVMSupportBlake3 PRIVATE LLVM_EXPORTS)
+endif()

From 96bbe472ef01e5f34bfeabedceea397889ff1119 Mon Sep 17 00:00:00 2001
From: Jakub Kuderski <jakub@nod-labs.com>
Date: Wed, 18 Jun 2025 16:15:06 -0400
Subject: [PATCH 0864/1322] Revert "[mlir][spirv] Fix int type declaration
 duplication when serializing" and follow up commits (#144773)

This reverts the following PRs:
* https://github.com/llvm/llvm-project/pull/143108
* https://github.com/llvm/llvm-project/pull/144538
* https://github.com/llvm/llvm-project/pull/144685

Reverting because this disabled tests when building without the llvm
spirv backend enabled.
---
 mlir/lib/Target/SPIRV/Serialization/Serializer.cpp  | 13 -------------
 mlir/test/CMakeLists.txt                            |  6 ------
 mlir/test/Target/SPIRV/constant.mlir                |  5 +----
 mlir/test/Target/SPIRV/lit.local.cfg                |  7 -------
 mlir/test/lit.cfg.py                                |  1 -
 mlir/test/lit.site.cfg.py.in                        |  4 +---
 .../llvm-project-overlay/mlir/test/BUILD.bazel      |  1 -
 7 files changed, 2 insertions(+), 35 deletions(-)
 delete mode 100644 mlir/test/Target/SPIRV/lit.local.cfg

diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
index 56c64f38fe29..d258bfd85296 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
@@ -446,19 +446,6 @@ LogicalResult Serializer::processType(Location loc, Type type,
 LogicalResult
 Serializer::processTypeImpl(Location loc, Type type, uint32_t &typeID,
                             SetVector<StringRef> &serializationCtx) {
-
-  // Map unsigned integer types to singless integer types.
-  // This is needed otherwise the generated spirv assembly will contain
-  // twice a type declaration (like OpTypeInt 32 0) which is no permitted and
-  // such module fails validation. Indeed at MLIR level the two types are
-  // different and lookup in the cache below misses.
-  // Note: This conversion needs to happen here before the type is looked up in
-  // the cache.
-  if (type.isUnsignedInteger()) {
-    type = IntegerType::get(loc->getContext(), type.getIntOrFloatBitWidth(),
-                            IntegerType::SignednessSemantics::Signless);
-  }
-
   typeID = getTypeID(type);
   if (typeID)
     return success();
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 89568e7766ae..ac8b44f53aeb 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -68,7 +68,6 @@ endif()
 llvm_canonicalize_cmake_booleans(
   LLVM_BUILD_EXAMPLES
   LLVM_HAS_NVPTX_TARGET
-  LLVM_INCLUDE_SPIRV_TOOLS_TESTS
   MLIR_ENABLE_BINDINGS_PYTHON
   MLIR_ENABLE_CUDA_RUNNER
   MLIR_ENABLE_ROCM_CONVERSIONS
@@ -218,11 +217,6 @@ if(MLIR_ENABLE_BINDINGS_PYTHON)
   )
 endif()
 
-if (LLVM_INCLUDE_SPIRV_TOOLS_TESTS)
-  list(APPEND MLIR_TEST_DEPENDS spirv-as)
-  list(APPEND MLIR_TEST_DEPENDS spirv-val)
-endif()
-
 # This target can be used to just build the dependencies
 # for the check-mlir target without executing the tests.
 # This is useful for bots when splitting the build step
diff --git a/mlir/test/Target/SPIRV/constant.mlir b/mlir/test/Target/SPIRV/constant.mlir
index 50d9b09ee004..8d4e53418b70 100644
--- a/mlir/test/Target/SPIRV/constant.mlir
+++ b/mlir/test/Target/SPIRV/constant.mlir
@@ -1,7 +1,6 @@
 // RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s
-// RUN: %if spirv-tools %{ mlir-translate -no-implicit-module -serialize-spirv %s | spirv-val %}
 
-spirv.module Logical Vulkan requires #spirv.vce<v1.3, [VulkanMemoryModel, Shader, Int64, Int16, Int8, Float64, Float16, CooperativeMatrixKHR], [SPV_KHR_vulkan_memory_model, SPV_KHR_cooperative_matrix]> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
   // CHECK-LABEL: @bool_const
   spirv.func @bool_const() -> () "None" {
     // CHECK: spirv.Constant true
@@ -306,6 +305,4 @@ spirv.module Logical Vulkan requires #spirv.vce<v1.3, [VulkanMemoryModel, Shader
     %coop = spirv.Constant dense<4> : !spirv.coopmatrix<16x16xi8, Subgroup, MatrixAcc>
     spirv.ReturnValue %coop : !spirv.coopmatrix<16x16xi8, Subgroup, MatrixAcc>
   }
-
-  spirv.EntryPoint "GLCompute" @bool_const
 }
diff --git a/mlir/test/Target/SPIRV/lit.local.cfg b/mlir/test/Target/SPIRV/lit.local.cfg
deleted file mode 100644
index 167c454db518..000000000000
--- a/mlir/test/Target/SPIRV/lit.local.cfg
+++ /dev/null
@@ -1,7 +0,0 @@
-if not "SPIRV" in config.root.targets:
-    config.unsupported = True
-
-if config.spirv_tools_tests:
-    config.available_features.add("spirv-tools")
-    config.substitutions.append(("spirv-as", os.path.join(config.llvm_tools_dir, "spirv-as")))
-    config.substitutions.append(("spirv-val", os.path.join(config.llvm_tools_dir, "spirv-val")))
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index a6f1ac0d568f..9b5cadd62bef 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -332,7 +332,6 @@ if config.enable_assertions:
 else:
     config.available_features.add("noasserts")
 
-config.targets = frozenset(config.targets_to_build.split())
 
 def have_host_jit_feature_support(feature_name):
     mlir_runner_exe = lit.util.which("mlir-runner", config.mlir_tools_dir)
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index 77f24e0f29b0..132aabe13594 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -5,8 +5,6 @@ import sys
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
-config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@
-config.targets_to_build = "@TARGETS_TO_BUILD@"
 config.llvm_shlib_ext = "@SHLIBEXT@"
 config.llvm_shlib_dir = lit_config.substitute(path(r"@SHLIBDIR@"))
 config.python_executable = "@Python3_EXECUTABLE@"
@@ -43,7 +41,7 @@ config.mlir_run_amx_tests = @MLIR_RUN_AMX_TESTS@
 config.mlir_run_arm_sve_tests = @MLIR_RUN_ARM_SVE_TESTS@
 # This is a workaround for the fact that LIT's:
 #   %if <cond>
-# requires <cond> to be in the set of available features.
+# requires <cond> to be in the set of available features. 
 # TODO: Update LIT's TestRunner so that this is not required.
 if config.mlir_run_arm_sve_tests:
     config.available_features.add("mlir_arm_sve_tests")
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index a439fdd50d21..51731b1e8f74 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -37,7 +37,6 @@ expand_template(
         # All disabled, but required to substituted because they are not in quotes.
         "@LLVM_BUILD_EXAMPLES@": "0",
         "@LLVM_HAS_NVPTX_TARGET@": "0",
-        "@LLVM_INCLUDE_SPIRV_TOOLS_TESTS@": "0",
         "@MLIR_ENABLE_CUDA_RUNNER@": "0",
         "@MLIR_ENABLE_ROCM_CONVERSIONS@": "0",
         "@MLIR_ENABLE_ROCM_RUNNER@": "0",

From b85e92990fdec32ec7512dec7bd36d945f8e0144 Mon Sep 17 00:00:00 2001
From: Alexey Karyakin <akaryaki@quicinc.com>
Date: Wed, 18 Jun 2025 15:26:38 -0500
Subject: [PATCH 0865/1322] Hexagon v87 v89 elf flags (#144584)

---
 llvm/include/llvm/BinaryFormat/ELF.h | 64 ++++++++++++++++------------
 1 file changed, 37 insertions(+), 27 deletions(-)

diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 97f417263aff..dcf1f1c6d7b2 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -614,33 +614,7 @@ enum {
 
 // Hexagon-specific e_flags
 enum {
-  // Object processor version flags, bits[11:0]
-  EF_HEXAGON_MACH_V2 = 0x00000001,   // Hexagon V2
-  EF_HEXAGON_MACH_V3 = 0x00000002,   // Hexagon V3
-  EF_HEXAGON_MACH_V4 = 0x00000003,   // Hexagon V4
-  EF_HEXAGON_MACH_V5 = 0x00000004,   // Hexagon V5
-  EF_HEXAGON_MACH_V55 = 0x00000005,  // Hexagon V55
-  EF_HEXAGON_MACH_V60 = 0x00000060,  // Hexagon V60
-  EF_HEXAGON_MACH_V61 = 0x00000061,  // Hexagon V61
-  EF_HEXAGON_MACH_V62 = 0x00000062,  // Hexagon V62
-  EF_HEXAGON_MACH_V65 = 0x00000065,  // Hexagon V65
-  EF_HEXAGON_MACH_V66 = 0x00000066,  // Hexagon V66
-  EF_HEXAGON_MACH_V67 = 0x00000067,  // Hexagon V67
-  EF_HEXAGON_MACH_V67T = 0x00008067, // Hexagon V67T
-  EF_HEXAGON_MACH_V68 = 0x00000068,  // Hexagon V68
-  EF_HEXAGON_MACH_V69 = 0x00000069,  // Hexagon V69
-  EF_HEXAGON_MACH_V71 = 0x00000071,  // Hexagon V71
-  EF_HEXAGON_MACH_V71T = 0x00008071, // Hexagon V71T
-  EF_HEXAGON_MACH_V73 = 0x00000073,  // Hexagon V73
-  EF_HEXAGON_MACH_V75 = 0x00000075,  // Hexagon V75
-  EF_HEXAGON_MACH_V77 = 0x00000077,  // Hexagon V77
-  EF_HEXAGON_MACH_V79 = 0x00000079,  // Hexagon V79
-  EF_HEXAGON_MACH_V81 = 0x00000081,  // Hexagon V81
-  EF_HEXAGON_MACH_V83 = 0x00000083,  // Hexagon V83
-  EF_HEXAGON_MACH_V85 = 0x00000085,  // Hexagon V85
-  EF_HEXAGON_MACH = 0x000003ff,      // Hexagon V..
-
-  // Highest ISA version flags
+  // Hexagon ISA version, bits[11:0]
   EF_HEXAGON_ISA_MACH = 0x00000000, // Same as specified in bits[11:0]
                                     // of e_flags
   EF_HEXAGON_ISA_V2 = 0x00000010,   // Hexagon V2 ISA
@@ -664,7 +638,43 @@ enum {
   EF_HEXAGON_ISA_V81 = 0x00000081,  // Hexagon V81 ISA
   EF_HEXAGON_ISA_V83 = 0x00000083,  // Hexagon V83 ISA
   EF_HEXAGON_ISA_V85 = 0x00000085,  // Hexagon V85 ISA
+  EF_HEXAGON_ISA_V87 = 0x00000087,  // Hexagon V87 ISA
+  EF_HEXAGON_ISA_V89 = 0x00000089,  // Hexagon V89 ISA
   EF_HEXAGON_ISA = 0x000003ff,      // Hexagon V.. ISA
+
+  // Tiny core flag, bit[15]
+  EF_HEXAGON_TINY_CORE = 0x00008000, // Hexagon Tiny Core
+
+  // Hexagon processor version, bits[15:0]
+  EF_HEXAGON_MACH_V2 = 0x00000001,          // Hexagon V2
+  EF_HEXAGON_MACH_V3 = 0x00000002,          // Hexagon V3
+  EF_HEXAGON_MACH_V4 = 0x00000003,          // Hexagon V4
+  EF_HEXAGON_MACH_V5 = 0x00000004,          // Hexagon V5
+  EF_HEXAGON_MACH_V55 = 0x00000005,         // Hexagon V55
+  EF_HEXAGON_MACH_V60 = EF_HEXAGON_ISA_V60, // Hexagon V60
+  EF_HEXAGON_MACH_V61 = EF_HEXAGON_ISA_V61, // Hexagon V61
+  EF_HEXAGON_MACH_V62 = EF_HEXAGON_ISA_V62, // Hexagon V62
+  EF_HEXAGON_MACH_V65 = EF_HEXAGON_ISA_V65, // Hexagon V65
+  EF_HEXAGON_MACH_V66 = EF_HEXAGON_ISA_V66, // Hexagon V66
+  EF_HEXAGON_MACH_V67 = EF_HEXAGON_ISA_V67, // Hexagon V67
+  EF_HEXAGON_MACH_V67T =
+      EF_HEXAGON_ISA_V67 | EF_HEXAGON_TINY_CORE, // Hexagon V67T
+  EF_HEXAGON_MACH_V68 = EF_HEXAGON_ISA_V68,      // Hexagon V68
+  EF_HEXAGON_MACH_V69 = EF_HEXAGON_ISA_V69,      // Hexagon V69
+  EF_HEXAGON_MACH_V71 = EF_HEXAGON_ISA_V71,      // Hexagon V71
+  EF_HEXAGON_MACH_V71T =
+      EF_HEXAGON_ISA_V71 | EF_HEXAGON_TINY_CORE, // Hexagon V71T
+  EF_HEXAGON_MACH_V73 = EF_HEXAGON_ISA_V73,      // Hexagon V73
+  EF_HEXAGON_MACH_V75 = EF_HEXAGON_ISA_V75,      // Hexagon V75
+  EF_HEXAGON_MACH_V77 = EF_HEXAGON_ISA_V77,      // Hexagon V77
+  EF_HEXAGON_MACH_V79 = EF_HEXAGON_ISA_V79,      // Hexagon V79
+  EF_HEXAGON_MACH_V81 = EF_HEXAGON_ISA_V81,      // Hexagon V81
+  EF_HEXAGON_MACH_V83 = EF_HEXAGON_ISA_V83,      // Hexagon V83
+  EF_HEXAGON_MACH_V85 = EF_HEXAGON_ISA_V85,      // Hexagon V85
+  EF_HEXAGON_MACH_V87 = EF_HEXAGON_ISA_V87,      // Hexagon V87
+  EF_HEXAGON_MACH_V89 = EF_HEXAGON_ISA_V89,      // Hexagon V89
+
+  EF_HEXAGON_MACH = 0x0000ffff, // Hexagon V..
 };
 
 // Hexagon-specific section indexes for common small data

From 7aecd7ecacb4b305b94149f3cfcef306a9da6beb Mon Sep 17 00:00:00 2001
From: Diego Caballero <dieg0ca6aller0@gmail.com>
Date: Wed, 18 Jun 2025 13:45:43 -0700
Subject: [PATCH 0866/1322] [mlir][Vector] Add `vector.to_elements` op
 (#141457)

This PR introduces the `vector.to_elements` op, which decomposes a
vector into its scalar elements. This operation is symmetrical to the
existing `vector.from_elements`.

Examples:

```
    // Decompose a 0-D vector.
    %0 = vector.to_elements %v0 : vector<f32>
    // %0 = %v0[0]

    // Decompose a 1-D vector.
    %0:2 = vector.to_elements %v1 : vector<2xf32>
    // %0#0 = %v1[0]
    // %0#1 = %v1[1]

    // Decompose a 2-D.
    %0:6 = vector.to_elements %v2 : vector<2x3xf32>
    // %0#0 = %v2[0, 0]
    // %0#1 = %v2[0, 1]
    // %0#2 = %v2[0, 2]
    // %0#3 = %v2[1, 0]
    // %0#4 = %v2[1, 1]
    // %0#5 = %v2[1, 2]
```

This op is aimed at reducing code size when modeling "structured" vector
extractions and simplifying canonicalizations of large sequences of
`vector.extract` and `vector.insert` ops into `vector.shuffle` and other
sophisticated ops that can re-arrange vector elements.
---
 .../mlir/Dialect/Vector/IR/VectorOps.td       | 91 ++++++++++++++-----
 mlir/include/mlir/IR/OpBase.td                | 19 ++++
 mlir/lib/TableGen/Operator.cpp                | 31 +++++++
 mlir/test/Dialect/Vector/invalid.mlir         | 24 ++++-
 mlir/test/Dialect/Vector/ops.mlir             | 18 ++++
 mlir/tools/mlir-tblgen/OpFormatGen.cpp        | 26 ++++++
 6 files changed, 184 insertions(+), 25 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 8353314ed958..125cd4645ccc 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -790,40 +790,89 @@ def Vector_FMAOp :
   }];
 }
 
+def Vector_ToElementsOp : Vector_Op<"to_elements", [
+    Pure,
+    ShapedTypeMatchesElementCountAndTypes<"source", "elements">]> {
+  let summary = "operation that decomposes a vector into all its scalar elements";
+  let description = [{
+    This operation decomposes all the scalar elements from a vector. The
+    decomposed scalar elements are returned in row-major order. The number of
+    scalar results must match the number of elements in the input vector type.
+    All the result elements have the same result type, which must match the
+    element type of the input vector. Scalable vectors are not supported.
+
+    Examples:
+
+    ```mlir
+    // Decompose a 0-D vector.
+    %0 = vector.to_elements %v0 : vector<f32>
+    // %0 = %v0[0]
+
+    // Decompose a 1-D vector.
+    %0:2 = vector.to_elements %v1 : vector<2xf32>
+    // %0#0 = %v1[0]
+    // %0#1 = %v1[1]
+
+    // Decompose a 2-D.
+    %0:6 = vector.to_elements %v2 : vector<2x3xf32>
+    // %0#0 = %v2[0, 0]
+    // %0#1 = %v2[0, 1]
+    // %0#2 = %v2[0, 2]
+    // %0#3 = %v2[1, 0]
+    // %0#4 = %v2[1, 1]
+    // %0#5 = %v2[1, 2]
+
+    // Decompose a 3-D vector.
+    %0:6 = vector.to_elements %v3 : vector<3x1x2xf32>
+    // %0#0 = %v3[0, 0, 0]
+    // %0#1 = %v3[0, 0, 1]
+    // %0#2 = %v3[1, 0, 0]
+    // %0#3 = %v3[1, 0, 1]
+    // %0#4 = %v3[2, 0, 0]
+    // %0#5 = %v3[2, 0, 1]
+    ```
+  }];
+
+  let arguments = (ins AnyVectorOfAnyRank:$source);
+  let results = (outs Variadic<AnyType>:$elements);
+  let assemblyFormat = "$source attr-dict `:` type($source)";
+}
+
 def Vector_FromElementsOp : Vector_Op<"from_elements", [
     Pure,
-    TypesMatchWith<"operand types match result element type",
-                   "result", "elements", "SmallVector<Type>("
-                   "::llvm::cast<VectorType>($_self).getNumElements(), "
-                   "::llvm::cast<VectorType>($_self).getElementType())">]> {
+    ShapedTypeMatchesElementCountAndTypes<"dest", "elements">]> {
   let summary = "operation that defines a vector from scalar elements";
   let description = [{
     This operation defines a vector from one or multiple scalar elements. The
-    number of elements must match the number of elements in the result type.
-    All elements must have the same type, which must match the element type of
-    the result vector type.
+    scalar elements are arranged in row-major within the vector. The number of
+    elements must match the number of elements in the result type. All elements
+    must have the same type, which must match the element type of the result
+    vector type. Scalable vectors are not supported.
 
-    `elements` are a flattened version of the result vector in row-major order.
-
-    Example:
+    Examples:
 
     ```mlir
-    // %f1
+    // Define a 0-D vector.
     %0 = vector.from_elements %f1 : vector<f32>
-    // [%f1, %f2]
-    %1 = vector.from_elements %f1, %f2 : vector<2xf32>
-    // [[%f1, %f2, %f3], [%f4, %f5, %f6]]
-    %2 = vector.from_elements %f1, %f2, %f3, %f4, %f5, %f6 : vector<2x3xf32>
-    // [[[%f1, %f2]], [[%f3, %f4]], [[%f5, %f6]]]
-    %3 = vector.from_elements %f1, %f2, %f3, %f4, %f5, %f6 : vector<3x1x2xf32>
-    ```
+    // [%f1]
 
-    Note, scalable vectors are not supported.
+    // Define a 1-D vector.
+    %1 = vector.from_elements %f1, %f2 : vector<2xf32>
+    // [%f1, %f2]
+
+    // Define a 2-D vector.
+    %2 = vector.from_elements %f1, %f2, %f3, %f4, %f5, %f6 : vector<2x3xf32>
+    // [[%f1, %f2, %f3], [%f4, %f5, %f6]]
+
+    // Define a 3-D vector.
+    %3 = vector.from_elements %f1, %f2, %f3, %f4, %f5, %f6 : vector<3x1x2xf32>
+    // [[[%f1, %f2]], [[%f3, %f4]], [[%f5, %f6]]]
+    ```
   }];
 
   let arguments = (ins Variadic<AnyType>:$elements);
-  let results = (outs AnyFixedVectorOfAnyRank:$result);
-  let assemblyFormat = "$elements attr-dict `:` type($result)";
+  let results = (outs AnyFixedVectorOfAnyRank:$dest);
+  let assemblyFormat = "$elements attr-dict `:` type($dest)";
   let hasCanonicalizer = 1;
 }
 
diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td
index 51b60972203e..b3fabe409806 100644
--- a/mlir/include/mlir/IR/OpBase.td
+++ b/mlir/include/mlir/IR/OpBase.td
@@ -556,6 +556,25 @@ class AllShapesMatch<list<string> names> :
 class AllTypesMatch<list<string> names> :
     AllMatchSameOperatorTrait<names, "$_self.getType()", "type">;
 
+// A type constraint that verifies that a shaped type matches the size and
+// element type of a container with element types. More specifically, it denotes
+// shapedArg.getType().getNumElements() == elementsArg.size() &&
+// shapedArg.getType().getElementType() == elementsArg[i].getType(), for i in
+// [0, elementsArg.size()).
+class ShapedTypeMatchesElementCountAndTypes<string shapedArg,
+                                            string elementsArg> :
+    PredOpTrait<"shaped type '" # shapedArg # "' matches '" # elementsArg # "' "
+                "element count and types",
+        And<[CPred<ElementCount<shapedArg>.result # " == "
+                     "$" # elementsArg # ".getTypes().size()">,
+             CPred<"::llvm::all_of($" # elementsArg # ".getTypes(), "
+                     "[&](::mlir::Type t) { return t == "
+                       # ElementType<shapedArg>.result # "; })">]>> {
+
+  string shaped = shapedArg;
+  string elements = elementsArg;
+}
+
 // A type constraint that denotes `transform(lhs.getType()) == rhs.getType()`.
 // An optional comparator function may be provided that changes the above form
 // into: `comparator(transform(lhs.getType()), rhs.getType())`.
diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp
index 2544f0a1b91b..07520a2f94d7 100644
--- a/mlir/lib/TableGen/Operator.cpp
+++ b/mlir/lib/TableGen/Operator.cpp
@@ -468,6 +468,37 @@ void Operator::populateTypeInferenceInfo(
       continue;
     }
 
+    // The `ShapedTypeMatchesElementCountAndTypes` trait represents a 1 -> 1
+    // type inference edge where a shaped type matches element count and types
+    // of variadic elements.
+    if (def.isSubClassOf("ShapedTypeMatchesElementCountAndTypes")) {
+      StringRef shapedArg = def.getValueAsString("shaped");
+      StringRef elementsArg = def.getValueAsString("elements");
+
+      int shapedIndex = argumentsAndResultsIndex.lookup(shapedArg);
+      int elementsIndex = argumentsAndResultsIndex.lookup(elementsArg);
+
+      // Handle result type inference from shaped type to variadic elements.
+      if (InferredResultType::isResultIndex(elementsIndex) &&
+          InferredResultType::isArgIndex(shapedIndex)) {
+        int resultIndex = InferredResultType::unmapResultIndex(elementsIndex);
+        ResultTypeInference &infer = inference[resultIndex];
+        if (!infer.inferred) {
+          infer.sources.emplace_back(
+              shapedIndex,
+              "::llvm::SmallVector<::mlir::Type>(::llvm::cast<::mlir::"
+              "ShapedType>($_self).getNumElements(), "
+              "::llvm::cast<::mlir::ShapedType>($_self).getElementType())");
+          infer.inferred = true;
+        }
+      }
+
+      // Type inference in the opposite direction is not possible as the actual
+      // shaped type can't be inferred from the variadic elements.
+
+      continue;
+    }
+
     if (!def.isSubClassOf("AllTypesMatch"))
       continue;
 
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index 04810ed52584..ec7cee7b2c64 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1896,7 +1896,24 @@ func.func @deinterleave_scalable_rank_fail(%vec : vector<2x[4]xf32>) {
 
 // -----
 
-func.func @invalid_from_elements(%a: f32) {
+func.func @to_elements_wrong_num_results(%a: vector<1x1x2xf32>) {
+  // expected-error @+1 {{operation defines 2 results but was provided 4 to bind}}
+  %0:4 = vector.to_elements %a : vector<1x1x2xf32>
+  return
+}
+
+// -----
+
+func.func @to_elements_wrong_result_type(%a: vector<2xf32>) -> i32 {
+  // expected-error @+3 {{use of value '%0' expects different type than prior uses: 'i32'}}
+  // expected-note @+1 {{prior use here}}
+  %0:2 = vector.to_elements %a : vector<2xf32>
+  return %0#0 : i32
+}
+
+// -----
+
+func.func @from_elements_wrong_num_operands(%a: f32) {
   // expected-error @+1 {{'vector.from_elements' number of operands and types do not match: got 1 operands and 2 types}}
   vector.from_elements %a : vector<2xf32>
   return
@@ -1905,16 +1922,15 @@ func.func @invalid_from_elements(%a: f32) {
 // -----
 
 // expected-note @+1 {{prior use here}}
-func.func @invalid_from_elements(%a: f32, %b: i32) {
+func.func @from_elements_wrong_operand_type(%a: f32, %b: i32) {
   // expected-error @+1 {{use of value '%b' expects different type than prior uses: 'f32' vs 'i32'}}
   vector.from_elements %a, %b : vector<2xf32>
   return
 }
-
 // -----
 
 func.func @invalid_from_elements_scalable(%a: f32, %b: i32) {
-  // expected-error @+1 {{'result' must be fixed-length vector of any type values, but got 'vector<[2]xf32>'}}
+  // expected-error @+1 {{'dest' must be fixed-length vector of any type values, but got 'vector<[2]xf32>'}}
   vector.from_elements %a, %b : vector<[2]xf32>
   return
 }
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index f3220aed4360..c59f7bd00190 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -1175,6 +1175,24 @@ func.func @deinterleave_nd_scalable(%arg:vector<2x3x4x[6]xf32>) -> (vector<2x3x4
   return %0, %1 : vector<2x3x4x[3]xf32>, vector<2x3x4x[3]xf32>
 }
 
+// CHECK-LABEL: func @to_elements(
+//  CHECK-SAME:     %[[A_VEC:.*]]: vector<f32>, %[[B_VEC:.*]]: vector<1xf32>,
+//  CHECK-SAME:     %[[C_VEC:.*]]: vector<1x2xf32>, %[[D_VEC:.*]]: vector<2x2xf32>)
+func.func @to_elements(%a_vec : vector<f32>, %b_vec : vector<1xf32>,
+                       %c_vec : vector<1x2xf32>, %d_vec : vector<2x2xf32>)
+                   -> (f32, f32, f32, f32, f32, f32, f32, f32) {
+  // CHECK: %[[A_ELEMS:.*]] = vector.to_elements %[[A_VEC]] : vector<f32>
+  %0 = vector.to_elements %a_vec : vector<f32>
+  // CHECK: %[[B_ELEMS:.*]] = vector.to_elements %[[B_VEC]] : vector<1xf32>
+  %1 = vector.to_elements %b_vec : vector<1xf32>
+  // CHECK: %[[C_ELEMS:.*]]:2 = vector.to_elements %[[C_VEC]] : vector<1x2xf32>
+  %2:2 = vector.to_elements %c_vec : vector<1x2xf32>
+  // CHECK: %[[D_ELEMS:.*]]:4 = vector.to_elements %[[D_VEC]] : vector<2x2xf32>
+  %3:4 = vector.to_elements %d_vec : vector<2x2xf32>
+  // CHECK: return %[[A_ELEMS]], %[[B_ELEMS]], %[[C_ELEMS]]#0, %[[C_ELEMS]]#1, %[[D_ELEMS]]#0, %[[D_ELEMS]]#1, %[[D_ELEMS]]#2, %[[D_ELEMS]]#3
+  return %0, %1, %2#0, %2#1, %3#0, %3#1, %3#2, %3#3: f32, f32, f32, f32, f32, f32, f32, f32
+}
+
 // CHECK-LABEL: func @from_elements(
 //  CHECK-SAME:     %[[a:.*]]: f32, %[[b:.*]]: f32)
 func.func @from_elements(%a: f32, %b: f32) -> (vector<f32>, vector<1xf32>, vector<1x2xf32>, vector<2x2xf32>) {
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index 0a9d14d6603a..ef3a18ba7df2 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -2787,6 +2787,11 @@ private:
   void handleTypesMatchConstraint(
       StringMap<TypeResolutionInstance> &variableTyResolver, const Record &def);
 
+  /// Check for inferable type resolution based on
+  /// `ShapedTypeMatchesElementCountAndTypes` constraint.
+  void handleShapedTypeMatchesElementCountAndTypesConstraint(
+      StringMap<TypeResolutionInstance> &variableTyResolver, const Record &def);
+
   /// Returns an argument or attribute with the given name that has been seen
   /// within the format.
   ConstArgument findSeenArg(StringRef name);
@@ -2850,6 +2855,9 @@ LogicalResult OpFormatParser::verify(SMLoc loc,
       handleSameTypesConstraint(variableTyResolver, /*includeResults=*/true);
     } else if (def.isSubClassOf("TypesMatchWith")) {
       handleTypesMatchConstraint(variableTyResolver, def);
+    } else if (def.isSubClassOf("ShapedTypeMatchesElementCountAndTypes")) {
+      handleShapedTypeMatchesElementCountAndTypesConstraint(variableTyResolver,
+                                                            def);
     } else if (!op.allResultTypesKnown()) {
       // This doesn't check the name directly to handle
       //    DeclareOpInterfaceMethods<InferTypeOpInterface>
@@ -3289,6 +3297,24 @@ void OpFormatParser::handleTypesMatchConstraint(
     variableTyResolver[rhsName] = {arg, transformer};
 }
 
+void OpFormatParser::handleShapedTypeMatchesElementCountAndTypesConstraint(
+    StringMap<TypeResolutionInstance> &variableTyResolver, const Record &def) {
+  StringRef shapedArg = def.getValueAsString("shaped");
+  StringRef elementsArg = def.getValueAsString("elements");
+
+  // Check if the 'shaped' argument is seen, then we can infer the 'elements'
+  // types.
+  if (ConstArgument arg = findSeenArg(shapedArg)) {
+    variableTyResolver[elementsArg] = {
+        arg, "::llvm::SmallVector<::mlir::Type>(::llvm::cast<::mlir::"
+             "ShapedType>($_self).getNumElements(), "
+             "::llvm::cast<::mlir::ShapedType>($_self).getElementType())"};
+  }
+
+  // Type inference in the opposite direction is not possible as the actual
+  // shaped type can't be inferred from the variadic elements.
+}
+
 ConstArgument OpFormatParser::findSeenArg(StringRef name) {
   if (const NamedTypeConstraint *arg = findArg(op.getOperands(), name))
     return seenOperandTypes.test(arg - op.operand_begin()) ? arg : nullptr;

From 86d1d6b2c0c1f03e82cb8e360f2672c6f0ea39d5 Mon Sep 17 00:00:00 2001
From: Nick Sarnie <nick.sarnie@intel.com>
Date: Wed, 18 Jun 2025 16:50:12 -0400
Subject: [PATCH 0867/1322] [clang] Use TargetInfo to determine device kernel
 calling convention (#144728)

We should abstract this logic away to `TargetInfo`. See
https://github.com/llvm/llvm-project/pull/137882 for more information.

---------

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 clang/lib/CodeGen/CGCall.cpp         | 13 ++-----------
 clang/lib/CodeGen/TargetInfo.cpp     | 27 +++++++++++++++------------
 clang/lib/CodeGen/TargetInfo.h       |  4 ++--
 clang/lib/CodeGen/Targets/AMDGPU.cpp |  4 ++--
 clang/lib/CodeGen/Targets/NVPTX.cpp  |  2 +-
 clang/lib/CodeGen/Targets/SPIR.cpp   |  4 ++--
 6 files changed, 24 insertions(+), 30 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index a06455d25b1e..fd75de42515d 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -83,17 +83,8 @@ unsigned CodeGenTypes::ClangCallConvToLLVMCallConv(CallingConv CC) {
     return llvm::CallingConv::AArch64_SVE_VectorCall;
   case CC_SpirFunction:
     return llvm::CallingConv::SPIR_FUNC;
-  case CC_DeviceKernel: {
-    if (CGM.getLangOpts().OpenCL)
-      return CGM.getTargetCodeGenInfo().getOpenCLKernelCallingConv();
-    if (CGM.getTriple().isSPIROrSPIRV())
-      return llvm::CallingConv::SPIR_KERNEL;
-    if (CGM.getTriple().isAMDGPU())
-      return llvm::CallingConv::AMDGPU_KERNEL;
-    if (CGM.getTriple().isNVPTX())
-      return llvm::CallingConv::PTX_Kernel;
-    llvm_unreachable("Unknown kernel calling convention");
-  }
+  case CC_DeviceKernel:
+    return CGM.getTargetCodeGenInfo().getDeviceKernelCallingConv();
   case CC_PreserveMost:
     return llvm::CallingConv::PreserveMost;
   case CC_PreserveAll:
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index f3df92c44bb6..277d69daf493 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -103,18 +103,21 @@ TargetCodeGenInfo::getDependentLibraryOption(llvm::StringRef Lib,
   Opt += Lib;
 }
 
-unsigned TargetCodeGenInfo::getOpenCLKernelCallingConv() const {
-  // OpenCL kernels are called via an explicit runtime API with arguments
-  // set with clSetKernelArg(), not as normal sub-functions.
-  // Return SPIR_KERNEL by default as the kernel calling convention to
-  // ensure the fingerprint is fixed such way that each OpenCL argument
-  // gets one matching argument in the produced kernel function argument
-  // list to enable feasible implementation of clSetKernelArg() with
-  // aggregates etc. In case we would use the default C calling conv here,
-  // clSetKernelArg() might break depending on the target-specific
-  // conventions; different targets might split structs passed as values
-  // to multiple function arguments etc.
-  return llvm::CallingConv::SPIR_KERNEL;
+unsigned TargetCodeGenInfo::getDeviceKernelCallingConv() const {
+  if (getABIInfo().getContext().getLangOpts().OpenCL) {
+    // Device kernels are called via an explicit runtime API with arguments,
+    // such as set with clSetKernelArg() for OpenCL, not as normal
+    // sub-functions. Return SPIR_KERNEL by default as the kernel calling
+    // convention to ensure the fingerprint is fixed such way that each kernel
+    // argument gets one matching argument in the produced kernel function
+    // argument list to enable feasible implementation of clSetKernelArg() with
+    // aggregates etc. In case we would use the default C calling conv here,
+    // clSetKernelArg() might break depending on the target-specific
+    // conventions; different targets might split structs passed as values
+    // to multiple function arguments etc.
+    return llvm::CallingConv::SPIR_KERNEL;
+  }
+  llvm_unreachable("Unknown kernel calling convention");
 }
 
 void TargetCodeGenInfo::setOCLKernelStubCallingConvention(
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 2783e222eb80..b4057d369f98 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -298,8 +298,8 @@ public:
                                        llvm::StringRef Value,
                                        llvm::SmallString<32> &Opt) const {}
 
-  /// Get LLVM calling convention for OpenCL kernel.
-  virtual unsigned getOpenCLKernelCallingConv() const;
+  /// Get LLVM calling convention for device kernels.
+  virtual unsigned getDeviceKernelCallingConv() const;
 
   /// Get target specific null pointer.
   /// \param T is the LLVM type of the null pointer.
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 8660373c3927..47a552a7bf49 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -304,7 +304,7 @@ public:
 
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
-  unsigned getOpenCLKernelCallingConv() const override;
+  unsigned getDeviceKernelCallingConv() const override;
 
   llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
       llvm::PointerType *T, QualType QT) const override;
@@ -431,7 +431,7 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
     F->addFnAttr("amdgpu-ieee", "false");
 }
 
-unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
+unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
   return llvm::CallingConv::AMDGPU_KERNEL;
 }
 
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index ad802c9131de..82bdfe2666b5 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -78,7 +78,7 @@ public:
     return true;
   }
 
-  unsigned getOpenCLKernelCallingConv() const override {
+  unsigned getDeviceKernelCallingConv() const override {
     return llvm::CallingConv::PTX_Kernel;
   }
 
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index 2f1e43cdc8cc..afa23bffcd07 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -51,7 +51,7 @@ public:
         getABIInfo().getDataLayout().getAllocaAddrSpace());
   }
 
-  unsigned getOpenCLKernelCallingConv() const override;
+  unsigned getDeviceKernelCallingConv() const override;
   llvm::Type *getOpenCLType(CodeGenModule &CGM, const Type *T) const override;
   llvm::Type *
   getHLSLType(CodeGenModule &CGM, const Type *Ty,
@@ -219,7 +219,7 @@ void computeSPIRKernelABIInfo(CodeGenModule &CGM, CGFunctionInfo &FI) {
 }
 }
 
-unsigned CommonSPIRTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
+unsigned CommonSPIRTargetCodeGenInfo::getDeviceKernelCallingConv() const {
   return llvm::CallingConv::SPIR_KERNEL;
 }
 

From 03bdc0a1f68adcddef80a4e7931dbfae914e5652 Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Wed, 18 Jun 2025 13:51:59 -0700
Subject: [PATCH 0868/1322] [lldb][target] Add progress report for
 wait-attaching to process (#144768)

This commit adds a progress report when wait-attaching to a process as
well as a test for this.
---
 lldb/source/Target/Target.cpp                 |  1 +
 .../TestProgressReporting.py                  | 31 +++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 45a9e1196a04..8f8d2ef21cc5 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -3546,6 +3546,7 @@ llvm::Expected<TraceSP> Target::GetTraceOrCreate() {
 }
 
 Status Target::Attach(ProcessAttachInfo &attach_info, Stream *stream) {
+  Progress attach_progress("Waiting to attach to process");
   m_stats.SetLaunchOrAttachTime();
   auto state = eStateInvalid;
   auto process_sp = GetProcessSP();
diff --git a/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
index 9af53845ca1b..8198c50a5ff0 100644
--- a/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
+++ b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
@@ -2,6 +2,7 @@
 Test that we are able to broadcast and receive progress events from lldb
 """
 import lldb
+import threading
 
 import lldbsuite.test.lldbutil as lldbutil
 
@@ -16,6 +17,36 @@ class TestProgressReporting(TestBase):
             self.broadcaster, lldb.SBDebugger.eBroadcastBitProgress
         )
 
+    def test_wait_attach_progress_reporting(self):
+        """Test that progress reports for wait attaching work as intended."""
+        self.build()
+        target = self.dbg.CreateTarget(None)
+
+        # Wait attach to a process, then check to see that a progress report was created
+        # and that its message is correct for waiting to attach to a process.
+        class AttachThread(threading.Thread):
+            def __init__(self, target):
+                threading.Thread.__init__(self)
+                self.target = target
+
+            def run(self):
+                self.target.AttachToProcessWithName(
+                    lldb.SBListener(), "a.out", True, lldb.SBError()
+                )
+
+        thread = AttachThread(target)
+        thread.start()
+
+        event = lldbutil.fetch_next_event(self, self.listener, self.broadcaster)
+        progress_data = lldb.SBDebugger.GetProgressDataFromEvent(event)
+        message = progress_data.GetValueForKey("message").GetStringValue(100)
+        self.assertEqual(message, "Waiting to attach to process")
+
+        # Interrupt the process attach to keep the test from stalling.
+        target.process.SendAsyncInterrupt()
+
+        thread.join()
+
     def test_dwarf_symbol_loading_progress_report(self):
         """Test that we are able to fetch dwarf symbol loading progress events"""
         self.build()

From 4dca4459a328b8d589d81cd1f203b798c36ebf35 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Wed, 18 Jun 2025 23:09:48 +0200
Subject: [PATCH 0869/1322] [CIR] Upstream ComplexType builtin_complex
 (#144225)

This change adds support for builtin_complex

https://github.com/llvm/llvm-project/issues/141365
---
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp     | 11 ++++++-
 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 33 ++++++++++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenValue.h         |  1 -
 clang/test/CIR/CodeGen/complex.cpp          | 25 ++++++++++++++++
 4 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 83825f0835a1..cff139a7802d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -78,6 +78,8 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
   assert(!cir::MissingFeatures::builtinCallMathErrno());
   assert(!cir::MissingFeatures::builtinCall());
 
+  mlir::Location loc = getLoc(e->getExprLoc());
+
   switch (builtinIDIfNoAsmLabel) {
   default:
     break;
@@ -88,9 +90,16 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
       return RValue::get(nullptr);
 
     mlir::Value argValue = emitCheckedArgForAssume(e->getArg(0));
-    builder.create<cir::AssumeOp>(getLoc(e->getExprLoc()), argValue);
+    builder.create<cir::AssumeOp>(loc, argValue);
     return RValue::get(nullptr);
   }
+
+  case Builtin::BI__builtin_complex: {
+    mlir::Value real = emitScalarExpr(e->getArg(0));
+    mlir::Value imag = emitScalarExpr(e->getArg(1));
+    mlir::Value complex = builder.createComplexCreate(loc, real, imag);
+    return RValue::get(complex);
+  }
   }
 
   cgm.errorNYI(e->getSourceRange(), "unimplemented builtin call");
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
index 26070a6ca307..12e8e27948cf 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -15,11 +15,25 @@ public:
   explicit ComplexExprEmitter(CIRGenFunction &cgf)
       : cgf(cgf), builder(cgf.getBuilder()) {}
 
+  //===--------------------------------------------------------------------===//
+  //                               Utilities
+  //===--------------------------------------------------------------------===//
+
+  /// Given an expression with complex type that represents a value l-value,
+  /// this method emits the address of the l-value, then loads and returns the
+  /// result.
+  mlir::Value emitLoadOfLValue(const Expr *e) {
+    return emitLoadOfLValue(cgf.emitLValue(e), e->getExprLoc());
+  }
+
+  mlir::Value emitLoadOfLValue(LValue lv, SourceLocation loc);
+
   /// Store the specified real/imag parts into the
   /// specified value pointer.
   void emitStoreOfComplex(mlir::Location loc, mlir::Value val, LValue lv,
                           bool isInit);
 
+  mlir::Value VisitCallExpr(const CallExpr *e);
   mlir::Value VisitInitListExpr(InitListExpr *e);
 
   mlir::Value VisitImaginaryLiteral(const ImaginaryLiteral *il);
@@ -34,11 +48,21 @@ static const ComplexType *getComplexType(QualType type) {
   return cast<ComplexType>(cast<AtomicType>(type)->getValueType());
 }
 
+mlir::Value ComplexExprEmitter::emitLoadOfLValue(LValue lv,
+                                                 SourceLocation loc) {
+  assert(lv.isSimple() && "non-simple complex l-value?");
+  if (lv.getType()->isAtomicType())
+    cgf.cgm.errorNYI(loc, "emitLoadOfLValue with Atomic LV");
+
+  const Address srcAddr = lv.getAddress();
+  return builder.createLoad(cgf.getLoc(loc), srcAddr);
+}
+
 void ComplexExprEmitter::emitStoreOfComplex(mlir::Location loc, mlir::Value val,
                                             LValue lv, bool isInit) {
   if (lv.getType()->isAtomicType() ||
       (!isInit && cgf.isLValueSuitableForInlineAtomic(lv))) {
-    cgf.cgm.errorNYI("StoreOfComplex with Atomic LV");
+    cgf.cgm.errorNYI(loc, "StoreOfComplex with Atomic LV");
     return;
   }
 
@@ -46,6 +70,13 @@ void ComplexExprEmitter::emitStoreOfComplex(mlir::Location loc, mlir::Value val,
   builder.createStore(loc, val, destAddr);
 }
 
+mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) {
+  if (e->getCallReturnType(cgf.getContext())->isReferenceType())
+    return emitLoadOfLValue(e);
+
+  return cgf.emitCallExpr(e).getValue();
+}
+
 mlir::Value ComplexExprEmitter::VisitInitListExpr(InitListExpr *e) {
   mlir::Location loc = cgf.getLoc(e->getExprLoc());
   if (e->getNumInits() == 2) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 84972fc7f911..7180d92f8c31 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -88,7 +88,6 @@ public:
     return er;
   }
 
-  // FIXME: Aggregate rvalues need to retain information about whether they are
   // volatile or not.  Remove default to find all places that probably get this
   // wrong.
 
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index db0b9111ab4f..721db235b37d 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -191,6 +191,31 @@ void foo8() {
 // OGCG: store double 0.000000e+00, ptr %[[C_REAL_PTR]], align 8
 // OGCG: store double 2.000000e+00, ptr %[[C_IMAG_PTR]], align 8
 
+void foo9(double a, double b) {
+  double _Complex c = __builtin_complex(a, b);
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!cir.double>, !cir.ptr<!cir.complex<!cir.double>>, ["c", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!cir.double>, !cir.double
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!cir.double>, !cir.double
+// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[TMP_A]], %[[TMP_B]] : !cir.double -> !cir.complex<!cir.double>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!cir.double>, !cir.ptr<!cir.complex<!cir.double>>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { double, double }, i64 1, align 8
+// LLVM: %[[TMP_A:.*]] = load double, ptr {{.*}}, align 8
+// LLVM: %[[TMP_B:.*]] = load double, ptr {{.*}}, align 8
+// LLVM: %[[TMP:.*]] = insertvalue { double, double } undef, double %[[TMP_A]], 0
+// LLVM: %[[TMP_2:.*]] = insertvalue { double, double } %[[TMP]], double %[[TMP_B]], 1
+// LLVM: store { double, double } %[[TMP_2]], ptr %[[COMPLEX]], align 8
+
+// OGCG: %[[COMPLEX]] = alloca { double, double }, align 8
+// OGCG: %[[TMP_A:.*]] = load double, ptr {{.*}}, align 8
+// OGCG: %[[TMP_B:.*]] = load double, ptr {{.*}}, align 8
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store double %[[TMP_A]], ptr %[[C_REAL_PTR]], align 8
+// OGCG: store double %[[TMP_B]], ptr %[[C_IMAG_PTR]], align 8
+
 void foo14() {
   int _Complex c = 2i;
 }

From ac37a0df949afc31d12de75f85306db32dd50713 Mon Sep 17 00:00:00 2001
From: Diego Caballero <dieg0ca6aller0@gmail.com>
Date: Wed, 18 Jun 2025 14:11:21 -0700
Subject: [PATCH 0870/1322] [mlir] Fix integer comparison warning (#144794)

Introduced by https://github.com/llvm/llvm-project/pull/141457
---
 mlir/include/mlir/IR/OpBase.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td
index b3fabe409806..43ef28624fb1 100644
--- a/mlir/include/mlir/IR/OpBase.td
+++ b/mlir/include/mlir/IR/OpBase.td
@@ -566,7 +566,7 @@ class ShapedTypeMatchesElementCountAndTypes<string shapedArg,
     PredOpTrait<"shaped type '" # shapedArg # "' matches '" # elementsArg # "' "
                 "element count and types",
         And<[CPred<ElementCount<shapedArg>.result # " == "
-                     "$" # elementsArg # ".getTypes().size()">,
+                     "static_cast<int64_t>($" # elementsArg # ".getTypes().size())">,
              CPred<"::llvm::all_of($" # elementsArg # ".getTypes(), "
                      "[&](::mlir::Type t) { return t == "
                        # ElementType<shapedArg>.result # "; })">]>> {

From d10079e305acae58b44dc773cb94f7127de197ef Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Wed, 18 Jun 2025 14:23:51 -0700
Subject: [PATCH 0871/1322] [RISCV] Reduce the VL of both operands in
 VMERGE_VVM (#144759)

The `tryToReduceVL` function in RISCVVectorPeephole currently only
reduces the VL of the instruction that defines the true operand in
VMERGE_VVM. We should be able to reduce VL of both operands. This patch
generalizes this function to support multiple operands from a single
instruction.
---
 llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 68 ++++++++++---------
 .../test/CodeGen/RISCV/rvv/masked-load-int.ll |  3 +-
 llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll  |  4 +-
 3 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index c9c2413d009b..f7acd676461f 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -112,7 +112,7 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const {
   //
   // TODO: We can handle a bunch more instructions here, and probably
   // recurse backwards through operands too.
-  unsigned SrcIdx = 0;
+  SmallVector<unsigned, 2> SrcIndices = {0};
   switch (RISCV::getRVVMCOpcode(MI.getOpcode())) {
   default:
     return false;
@@ -122,10 +122,10 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const {
   case RISCV::VSE64_V:
     break;
   case RISCV::VMV_V_V:
-    SrcIdx = 2;
+    SrcIndices[0] = 2;
     break;
   case RISCV::VMERGE_VVM:
-    SrcIdx = 3; // TODO: We can also handle the false operand.
+    SrcIndices.assign({2, 3});
     break;
   case RISCV::VREDSUM_VS:
   case RISCV::VREDMAXU_VS:
@@ -143,7 +143,7 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const {
   case RISCV::VFREDMIN_VS:
   case RISCV::VFWREDUSUM_VS:
   case RISCV::VFWREDOSUM_VS:
-    SrcIdx = 2;
+    SrcIndices[0] = 2;
     break;
   }
 
@@ -151,42 +151,48 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const {
   if (VL.isImm() && VL.getImm() == RISCV::VLMaxSentinel)
     return false;
 
-  Register SrcReg = MI.getOperand(SrcIdx).getReg();
-  // Note: one *use*, not one *user*.
-  if (!MRI->hasOneUse(SrcReg))
-    return false;
+  bool Changed = false;
+  for (unsigned SrcIdx : SrcIndices) {
+    Register SrcReg = MI.getOperand(SrcIdx).getReg();
+    // Note: one *use*, not one *user*.
+    if (!MRI->hasOneUse(SrcReg))
+      continue;
 
-  MachineInstr *Src = MRI->getVRegDef(SrcReg);
-  if (!Src || Src->hasUnmodeledSideEffects() ||
-      Src->getParent() != MI.getParent() || Src->getNumDefs() != 1 ||
-      !RISCVII::hasVLOp(Src->getDesc().TSFlags) ||
-      !RISCVII::hasSEWOp(Src->getDesc().TSFlags))
-    return false;
+    MachineInstr *Src = MRI->getVRegDef(SrcReg);
+    if (!Src || Src->hasUnmodeledSideEffects() ||
+        Src->getParent() != MI.getParent() || Src->getNumDefs() != 1 ||
+        !RISCVII::hasVLOp(Src->getDesc().TSFlags) ||
+        !RISCVII::hasSEWOp(Src->getDesc().TSFlags))
+      continue;
 
-  // Src's dest needs to have the same EEW as MI's input.
-  if (!hasSameEEW(MI, *Src))
-    return false;
+    // Src's dest needs to have the same EEW as MI's input.
+    if (!hasSameEEW(MI, *Src))
+      continue;
 
-  bool ElementsDependOnVL = RISCVII::elementsDependOnVL(
-      TII->get(RISCV::getRVVMCOpcode(Src->getOpcode())).TSFlags);
-  if (ElementsDependOnVL || Src->mayRaiseFPException())
-    return false;
+    bool ElementsDependOnVL = RISCVII::elementsDependOnVL(
+        TII->get(RISCV::getRVVMCOpcode(Src->getOpcode())).TSFlags);
+    if (ElementsDependOnVL || Src->mayRaiseFPException())
+      continue;
 
-  MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc()));
-  if (VL.isIdenticalTo(SrcVL) || !RISCV::isVLKnownLE(VL, SrcVL))
-    return false;
+    MachineOperand &SrcVL =
+        Src->getOperand(RISCVII::getVLOpNum(Src->getDesc()));
+    if (VL.isIdenticalTo(SrcVL) || !RISCV::isVLKnownLE(VL, SrcVL))
+      continue;
 
-  if (!ensureDominates(VL, *Src))
-    return false;
+    if (!ensureDominates(VL, *Src))
+      continue;
 
-  if (VL.isImm())
-    SrcVL.ChangeToImmediate(VL.getImm());
-  else if (VL.isReg())
-    SrcVL.ChangeToRegister(VL.getReg(), false);
+    if (VL.isImm())
+      SrcVL.ChangeToImmediate(VL.getImm());
+    else if (VL.isReg())
+      SrcVL.ChangeToRegister(VL.getReg(), false);
+
+    Changed = true;
+  }
 
   // TODO: For instructions with a passthru, we could clear the passthru
   // and tail policy since we've just proven the tail is not demanded.
-  return true;
+  return Changed;
 }
 
 /// Check if an operand is an immediate or a materialized ADDI $x0, imm.
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
index 75537406f351..372b07e0137b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
@@ -34,9 +34,8 @@ define <vscale x 1 x i8> @masked_load_passthru_nxv1i8(ptr %a, <vscale x 1 x i1>
 ; ZVE32:       # %bb.0:
 ; ZVE32-NEXT:    csrr a1, vlenb
 ; ZVE32-NEXT:    srli a1, a1, 3
-; ZVE32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
-; ZVE32-NEXT:    vmv.v.i v8, 0
 ; ZVE32-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
+; ZVE32-NEXT:    vmv.v.i v8, 0
 ; ZVE32-NEXT:    vle8.v v8, (a0), v0.t
 ; ZVE32-NEXT:    ret
   %load = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr %a, i32 1, <vscale x 1 x i1> %mask, <vscale x 1 x i8> zeroinitializer)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
index 10a92f0188a9..1cbb980aebff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
@@ -3063,9 +3063,9 @@ define <vscale x 4 x i32> @vmv_v_x(<vscale x 4 x i32> %a, i32 %x, iXLen %vl) {
 define <vscale x 1 x i8> @vmv_v_v(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, <vscale x 1 x i8> %c, <vscale x 1 x i1> %m, iXLen %vl) {
 ; NOVLOPT-LABEL: vmv_v_v:
 ; NOVLOPT:       # %bb.0:
-; NOVLOPT-NEXT:    vsetvli a1, zero, e8, mf8, tu, ma
+; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
 ; NOVLOPT-NEXT:    vmv.v.v v8, v9
-; NOVLOPT-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; NOVLOPT-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; NOVLOPT-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; NOVLOPT-NEXT:    ret
 ;

From c4d7ea8049688a1d6d6d93129893fd1700a9f7e5 Mon Sep 17 00:00:00 2001
From: Javier Lopez-Gomez <javier.lopez.gomez@proton.me>
Date: Wed, 18 Jun 2025 23:47:30 +0200
Subject: [PATCH 0872/1322] [llvm-debuginfo-analyzer] Apply various memory
 savings in Core/LVxxx base classes (#144399)

This small changelist reduces memory footprint of instances of the Core
classes. Specifically,

- For `LVProperties`, use underlying type of `uint32_t` if there are at
most 32 properties to keep track of. Otherwise, fallback to the generic
`std::bitset<N>`.
The use of `llvm::SmallBitVector` is disregarded in this case, as the
upper bound on the size of the bitset can be determined statically (no
heap alloc ever needed).
- Reorder members in `LVElement` s.t. padding between members is
reduced.
- `LVScopeCompileUnit`: fix a couple of members which should be `static
constexpr` instead.
---
 .../DebugInfo/LogicalView/Core/LVElement.h    | 10 +++----
 .../llvm/DebugInfo/LogicalView/Core/LVScope.h |  4 +--
 .../DebugInfo/LogicalView/Core/LVSupport.h    | 29 +++++++++++++++----
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVElement.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVElement.h
index b4501db190fe..0e7be45abfef 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVElement.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVElement.h
@@ -107,18 +107,18 @@ class LLVM_ABI LVElement : public LVObject {
     IsAnonymous,
     LastEntry
   };
-  // Typed bitvector with properties for this element.
-  LVProperties<Property> Properties;
   static LVElementDispatch Dispatch;
 
-  /// RTTI.
-  const LVSubclassID SubclassID;
-
   // Indexes in the String Pool.
   size_t NameIndex = 0;
   size_t QualifiedNameIndex = 0;
   size_t FilenameIndex = 0;
 
+  // Typed bitvector with properties for this element.
+  LVProperties<Property> Properties;
+  /// RTTI.
+  const LVSubclassID SubclassID;
+
   uint16_t AccessibilityCode : 2; // DW_AT_accessibility.
   uint16_t InlineCode : 2;        // DW_AT_inline.
   uint16_t VirtualityCode : 2;    // DW_AT_virtuality.
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h
index 5715a37185b2..a453923d032e 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h
@@ -473,7 +473,7 @@ class LLVM_ABI LVScopeCompileUnit final : public LVScope {
 
   // Record scope sizes indexed by lexical level.
   // Setting an initial size that will cover a very deep nested scopes.
-  const size_t TotalInitialSize = 8;
+  static constexpr size_t TotalInitialSize = 8;
   using LVTotalsEntry = std::pair<unsigned, float>;
   SmallVector<LVTotalsEntry> Totals;
   // Maximum seen lexical level. It is used to control how many entries
@@ -510,7 +510,7 @@ public:
   void addMapping(LVLine *Line, LVSectionIndex SectionIndex);
   LVLineRange lineRange(LVLocation *Location) const;
 
-  LVNameInfo NameNone = {UINT64_MAX, 0};
+  static constexpr LVNameInfo NameNone = {UINT64_MAX, 0};
   void addPublicName(LVScope *Scope, LVAddress LowPC, LVAddress HighPC) {
     PublicNames.emplace(std::piecewise_construct, std::forward_as_tuple(Scope),
                         std::forward_as_tuple(LowPC, HighPC - LowPC));
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSupport.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSupport.h
index 01137f80c0f8..058ca2da9a96 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSupport.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSupport.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_DEBUGINFO_LOGICALVIEW_CORE_LVSUPPORT_H
 #define LLVM_DEBUGINFO_LOGICALVIEW_CORE_LVSUPPORT_H
 
-#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/DebugInfo/LogicalView/Core/LVStringPool.h"
 #include "llvm/Support/Compiler.h"
@@ -21,9 +20,11 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
+#include <bitset>
 #include <cctype>
 #include <map>
 #include <sstream>
+#include <type_traits>
 
 namespace llvm {
 namespace logicalview {
@@ -38,14 +39,32 @@ using LVLexicalIndex =
 
 // Used to record specific characteristics about the objects.
 template <typename T> class LVProperties {
-  SmallBitVector Bits = SmallBitVector(static_cast<unsigned>(T::LastEntry) + 1);
+  static constexpr unsigned N_PROPS = static_cast<unsigned>(T::LastEntry);
+  // Use uint32_t as the underlying type if the `T` enum has at most 32
+  // enumerators; otherwise, fallback to the generic `std::bitset` case.
+  std::conditional_t<(N_PROPS > 32), std::bitset<N_PROPS>, uint32_t> Bits{};
 
 public:
   LVProperties() = default;
 
-  void set(T Idx) { Bits[static_cast<unsigned>(Idx)] = 1; }
-  void reset(T Idx) { Bits[static_cast<unsigned>(Idx)] = 0; }
-  bool get(T Idx) const { return Bits[static_cast<unsigned>(Idx)]; }
+  void set(T Idx) {
+    if constexpr (std::is_same_v<decltype(Bits), uint32_t>)
+      Bits |= 1 << static_cast<unsigned>(Idx);
+    else
+      Bits.set(static_cast<unsigned>(Idx));
+  }
+  void reset(T Idx) {
+    if constexpr (std::is_same_v<decltype(Bits), uint32_t>)
+      Bits &= ~(1 << static_cast<unsigned>(Idx));
+    else
+      Bits.reset(static_cast<unsigned>(Idx));
+  }
+  bool get(T Idx) const {
+    if constexpr (std::is_same_v<decltype(Bits), uint32_t>)
+      return Bits & (1 << static_cast<unsigned>(Idx));
+    else
+      return Bits[static_cast<unsigned>(Idx)];
+  }
 };
 
 // Generate get, set and reset 'bool' functions for LVProperties instances.

From 51aa6a4993ea18c968a087352d1cf569c840c41f Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Wed, 18 Jun 2025 22:48:24 +0100
Subject: [PATCH 0873/1322] [lldb-dap] Use protocol types for ReadMemory
 request (#144552)

Read memory from process instead of target.
---
 .../tools/lldb-dap/memory/TestDAP_memory.py   |  17 +-
 .../Handler/ReadMemoryRequestHandler.cpp      | 147 ++++--------------
 lldb/tools/lldb-dap/Handler/RequestHandler.h  |   9 +-
 .../lldb-dap/Protocol/ProtocolRequests.cpp    |  37 +++++
 .../lldb-dap/Protocol/ProtocolRequests.h      |  36 +++++
 lldb/unittests/DAP/ProtocolTypesTest.cpp      |  33 ++++
 6 files changed, 154 insertions(+), 125 deletions(-)

diff --git a/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py b/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py
index 74062f3ab216..55fb4a961e78 100644
--- a/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py
+++ b/lldb/test/API/tools/lldb-dap/memory/TestDAP_memory.py
@@ -111,8 +111,17 @@ class TestDAP_memory(lldbdap_testcase.DAPTestCaseBase):
         # VS Code sends those in order to check if a `memoryReference` can actually be dereferenced.
         mem = self.dap_server.request_readMemory(memref, 0, 0)
         self.assertEqual(mem["success"], True)
-        self.assertEqual(mem["body"]["data"], "")
+        self.assertNotIn(
+            "data", mem["body"], f"expects no data key in response: {mem!r}"
+        )
 
-        # Reads at offset 0x0 fail
-        mem = self.dap_server.request_readMemory("0x0", 0, 6)
-        self.assertEqual(mem["success"], False)
+        # Reads at offset 0x0 return unreadable bytes
+        bytes_to_read = 6
+        mem = self.dap_server.request_readMemory("0x0", 0, bytes_to_read)
+        self.assertEqual(mem["body"]["unreadableBytes"], bytes_to_read)
+
+        # Reads with invalid address fails.
+        mem = self.dap_server.request_readMemory("-3204", 0, 10)
+        self.assertFalse(mem["success"], "expect fail on reading memory.")
+
+        self.continue_to_exit()
diff --git a/lldb/tools/lldb-dap/Handler/ReadMemoryRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ReadMemoryRequestHandler.cpp
index 891c2af4f2f2..7065b6a24b55 100644
--- a/lldb/tools/lldb-dap/Handler/ReadMemoryRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/ReadMemoryRequestHandler.cpp
@@ -7,136 +7,47 @@
 //===----------------------------------------------------------------------===//
 
 #include "DAP.h"
-#include "EventHelper.h"
 #include "JSONUtils.h"
 #include "RequestHandler.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Base64.h"
 
 namespace lldb_dap {
 
-// "ReadMemoryRequest": {
-//   "allOf": [ { "$ref": "#/definitions/Request" }, {
-//     "type": "object",
-//     "description": "Reads bytes from memory at the provided location. Clients
-//                     should only call this request if the corresponding
-//                     capability `supportsReadMemoryRequest` is true.",
-//     "properties": {
-//       "command": {
-//         "type": "string",
-//         "enum": [ "readMemory" ]
-//       },
-//       "arguments": {
-//         "$ref": "#/definitions/ReadMemoryArguments"
-//       }
-//     },
-//     "required": [ "command", "arguments" ]
-//   }]
-// },
-// "ReadMemoryArguments": {
-//   "type": "object",
-//   "description": "Arguments for `readMemory` request.",
-//   "properties": {
-//     "memoryReference": {
-//       "type": "string",
-//       "description": "Memory reference to the base location from which data
-//                       should be read."
-//     },
-//     "offset": {
-//       "type": "integer",
-//       "description": "Offset (in bytes) to be applied to the reference
-//                       location before reading data. Can be negative."
-//     },
-//     "count": {
-//       "type": "integer",
-//       "description": "Number of bytes to read at the specified location and
-//                       offset."
-//     }
-//   },
-//   "required": [ "memoryReference", "count" ]
-// },
-// "ReadMemoryResponse": {
-//   "allOf": [ { "$ref": "#/definitions/Response" }, {
-//     "type": "object",
-//     "description": "Response to `readMemory` request.",
-//     "properties": {
-//       "body": {
-//         "type": "object",
-//         "properties": {
-//           "address": {
-//             "type": "string",
-//             "description": "The address of the first byte of data returned.
-//                             Treated as a hex value if prefixed with `0x`, or
-//                             as a decimal value otherwise."
-//           },
-//           "unreadableBytes": {
-//             "type": "integer",
-//             "description": "The number of unreadable bytes encountered after
-//                             the last successfully read byte.\nThis can be
-//                             used to determine the number of bytes that should
-//                             be skipped before a subsequent
-//             `readMemory` request succeeds."
-//           },
-//           "data": {
-//             "type": "string",
-//             "description": "The bytes read from memory, encoded using base64.
-//                             If the decoded length of `data` is less than the
-//                             requested `count` in the original `readMemory`
-//                             request, and `unreadableBytes` is zero or
-//                             omitted, then the client should assume it's
-//                             reached the end of readable memory."
-//           }
-//         },
-//         "required": [ "address" ]
-//       }
-//     }
-//   }]
-// },
-void ReadMemoryRequestHandler::operator()(
-    const llvm::json::Object &request) const {
-  llvm::json::Object response;
-  FillResponse(request, response);
-  auto *arguments = request.getObject("arguments");
+// Reads bytes from memory at the provided location.
+//
+// Clients should only call this request if the corresponding capability
+// `supportsReadMemoryRequest` is true
+llvm::Expected<protocol::ReadMemoryResponseBody>
+ReadMemoryRequestHandler::Run(const protocol::ReadMemoryArguments &args) const {
+  const lldb::addr_t raw_address = args.memoryReference + args.offset;
 
-  llvm::StringRef memoryReference =
-      GetString(arguments, "memoryReference").value_or("");
-  auto addr_opt = DecodeMemoryReference(memoryReference);
-  if (!addr_opt.has_value()) {
-    response["success"] = false;
-    response["message"] =
-        "Malformed memory reference: " + memoryReference.str();
-    dap.SendJSON(llvm::json::Value(std::move(response)));
-    return;
-  }
-  lldb::addr_t addr_int = *addr_opt;
-  addr_int += GetInteger<uint64_t>(arguments, "offset").value_or(0);
-  const uint64_t count_requested =
-      GetInteger<uint64_t>(arguments, "count").value_or(0);
+  lldb::SBProcess process = dap.target.GetProcess();
+  if (!lldb::SBDebugger::StateIsStoppedState(process.GetState()))
+    return llvm::make_error<NotStoppedError>();
 
+  const uint64_t count_read = std::max<uint64_t>(args.count, 1);
   // We also need support reading 0 bytes
   // VS Code sends those requests to check if a `memoryReference`
   // can be dereferenced.
-  const uint64_t count_read = std::max<uint64_t>(count_requested, 1);
-  std::vector<uint8_t> buf;
-  buf.resize(count_read);
-  lldb::SBError error;
-  lldb::SBAddress addr{addr_int, dap.target};
-  size_t count_result =
-      dap.target.ReadMemory(addr, buf.data(), count_read, error);
-  if (count_result == 0) {
-    response["success"] = false;
-    EmplaceSafeString(response, "message", error.GetCString());
-    dap.SendJSON(llvm::json::Value(std::move(response)));
-    return;
-  }
-  buf.resize(std::min<size_t>(count_result, count_requested));
+  protocol::ReadMemoryResponseBody response;
+  std::vector<std::byte> &buffer = response.data;
+  buffer.resize(count_read);
 
-  llvm::json::Object body;
-  std::string formatted_addr = "0x" + llvm::utohexstr(addr_int);
-  body.try_emplace("address", formatted_addr);
-  body.try_emplace("data", llvm::encodeBase64(buf));
-  response.try_emplace("body", std::move(body));
-  dap.SendJSON(llvm::json::Value(std::move(response)));
+  lldb::SBError error;
+  const size_t memory_count = dap.target.GetProcess().ReadMemory(
+      raw_address, buffer.data(), buffer.size(), error);
+
+  response.address = "0x" + llvm::utohexstr(raw_address);
+
+  // reading memory may fail for multiple reasons. memory not readable,
+  // reading out of memory range and gaps in memory. return from
+  // the last readable byte.
+  if (error.Fail() && (memory_count < count_read)) {
+    response.unreadableBytes = count_read - memory_count;
+  }
+
+  buffer.resize(std::min<size_t>(memory_count, args.count));
+  return response;
 }
 
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h
index 054cc7a32131..e35b9830ab60 100644
--- a/lldb/tools/lldb-dap/Handler/RequestHandler.h
+++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h
@@ -564,14 +564,17 @@ public:
   Run(const protocol::DisassembleArguments &args) const override;
 };
 
-class ReadMemoryRequestHandler : public LegacyRequestHandler {
+class ReadMemoryRequestHandler final
+    : public RequestHandler<protocol::ReadMemoryArguments,
+                            llvm::Expected<protocol::ReadMemoryResponseBody>> {
 public:
-  using LegacyRequestHandler::LegacyRequestHandler;
+  using RequestHandler::RequestHandler;
   static llvm::StringLiteral GetCommand() { return "readMemory"; }
   FeatureSet GetSupportedFeatures() const override {
     return {protocol::eAdapterFeatureReadMemoryRequest};
   }
-  void operator()(const llvm::json::Object &request) const override;
+  llvm::Expected<protocol::ReadMemoryResponseBody>
+  Run(const protocol::ReadMemoryArguments &args) const override;
 };
 
 class CancelRequestHandler : public RequestHandler<protocol::CancelArguments,
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
index e6ba54ed4dcd..9bd84a6c898f 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
@@ -7,9 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "Protocol/ProtocolRequests.h"
+#include "JSONUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Base64.h"
 #include "llvm/Support/JSON.h"
 #include <utility>
 
@@ -480,4 +482,39 @@ json::Value toJSON(const DisassembleResponseBody &DRB) {
   return json::Object{{"instructions", DRB.instructions}};
 }
 
+bool fromJSON(const json::Value &Params, ReadMemoryArguments &RMA,
+              json::Path P) {
+  json::ObjectMapper O(Params, P);
+
+  const json::Object *rma_obj = Params.getAsObject();
+  constexpr llvm::StringRef ref_key = "memoryReference";
+  const std::optional<llvm::StringRef> memory_ref = rma_obj->getString(ref_key);
+  if (!memory_ref) {
+    P.field(ref_key).report("missing value");
+    return false;
+  }
+
+  const std::optional<lldb::addr_t> addr_opt =
+      DecodeMemoryReference(*memory_ref);
+  if (!addr_opt) {
+    P.field(ref_key).report("Malformed memory reference");
+    return false;
+  }
+
+  RMA.memoryReference = *addr_opt;
+
+  return O && O.map("count", RMA.count) && O.mapOptional("offset", RMA.offset);
+}
+
+json::Value toJSON(const ReadMemoryResponseBody &RMR) {
+  json::Object result{{"address", RMR.address}};
+
+  if (RMR.unreadableBytes != 0)
+    result.insert({"unreadableBytes", RMR.unreadableBytes});
+  if (!RMR.data.empty())
+    result.insert({"data", llvm::encodeBase64(RMR.data)});
+
+  return result;
+}
+
 } // namespace lldb_dap::protocol
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index 01b8f2445c9f..7d9a99fdacce 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -839,6 +839,42 @@ bool fromJSON(const llvm::json::Value &, DisassembleResponseBody &,
               llvm::json::Path);
 llvm::json::Value toJSON(const DisassembleResponseBody &);
 
+/// Arguments for `readMemory` request.
+struct ReadMemoryArguments {
+  /// Memory reference to the base location from which data should be read.
+  lldb::addr_t memoryReference;
+
+  /// Offset (in bytes) to be applied to the reference location before reading
+  /// data. Can be negative.
+  int64_t offset = 0;
+
+  /// Number of bytes to read at the specified location and offset.
+  uint64_t count;
+};
+bool fromJSON(const llvm::json::Value &, ReadMemoryArguments &,
+              llvm::json::Path);
+
+/// Response to `readMemory` request.
+struct ReadMemoryResponseBody {
+  /// The address of the first byte of data returned.
+  /// Treated as a hex value if prefixed with `0x`, or as a decimal value
+  /// otherwise.
+  std::string address;
+
+  /// The number of unreadable bytes encountered after the last successfully
+  /// read byte.
+  /// This can be used to determine the number of bytes that should be skipped
+  /// before a subsequent `readMemory` request succeeds.
+  uint64_t unreadableBytes = 0;
+
+  /// The bytes read from memory, encoded using base64. If the decoded length
+  /// of `data` is less than the requested `count` in the original `readMemory`
+  /// request, and `unreadableBytes` is zero or omitted, then the client should
+  /// assume it's reached the end of readable memory.
+  std::vector<std::byte> data;
+};
+llvm::json::Value toJSON(const ReadMemoryResponseBody &);
+
 } // namespace lldb_dap::protocol
 
 #endif
diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp
index 46a09f090fea..9c93eb8c94b0 100644
--- a/lldb/unittests/DAP/ProtocolTypesTest.cpp
+++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp
@@ -765,3 +765,36 @@ TEST(ProtocolTypesTest, StepInTarget) {
   EXPECT_EQ(target.endLine, deserialized_target->endLine);
   EXPECT_EQ(target.endColumn, deserialized_target->endColumn);
 }
+
+TEST(ProtocolTypesTest, ReadMemoryArguments) {
+  ReadMemoryArguments args;
+  args.count = 20;
+  args.memoryReference = 43962;
+  args.offset = 0;
+
+  llvm::Expected<ReadMemoryArguments> expected =
+      parse<ReadMemoryArguments>(R"({"memoryReference":"-4000", "count": 20})");
+  ASSERT_THAT_EXPECTED(expected, llvm::Failed());
+  expected = parse<ReadMemoryArguments>(
+      R"({"memoryReference":"0xabba", "count": 20})");
+  ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+
+  EXPECT_EQ(args.count, expected->count);
+  EXPECT_EQ(args.memoryReference, expected->memoryReference);
+  EXPECT_EQ(args.offset, expected->offset);
+}
+
+TEST(ProtocolTypesTest, ReadMemoryResponseBody) {
+  ReadMemoryResponseBody response;
+  response.address = "0xdeadbeef";
+  const std::string data_str = "hello world!";
+  std::transform(data_str.begin(), data_str.end(),
+                 std::back_inserter(response.data),
+                 [](char letter) { return std::byte(letter); });
+  response.unreadableBytes = 1;
+
+  Expected<Value> expected = json::parse(
+      R"({ "address": "0xdeadbeef", "data": "aGVsbG8gd29ybGQh", "unreadableBytes": 1})");
+  ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+  EXPECT_EQ(pp(*expected), pp(response));
+}
\ No newline at end of file

From 118bfcda46c17349575217bc901e8e5942521955 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li@intel.com>
Date: Wed, 18 Jun 2025 14:52:03 -0700
Subject: [PATCH 0874/1322] [MLIR][XEGPU] Add blocking support for scatter ops
 (#144766)

Add blocking support for scatter ops: Create_tdesc, update, prefetch,
load and store. It also enables the load/store with chunk size.
---
 .../XeGPU/Transforms/XeGPUBlocking.cpp        |  42 +++++--
 mlir/test/Dialect/XeGPU/xegpu-blocking.mlir   | 113 ++++++++++++++++--
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp |  14 +--
 3 files changed, 142 insertions(+), 27 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index a3826c56e1f6..3950e8f70d1c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -134,11 +134,13 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
 
 std::optional<SmallVector<int64_t>>
 XeGPUBlockingPass::getTileShape(Operation *op) const {
-  if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp>(op))
+  if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
+          xegpu::UpdateOffsetOp>(op))
     return getTileShape(op->getOpResult(0));
-  if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp>(op))
+  if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
+          xegpu::LoadGatherOp>(op))
     return getTileShape(op->getOpOperand(0));
-  if (isa<xegpu::StoreNdOp>(op))
+  if (isa<xegpu::StoreNdOp, xegpu::StoreScatterOp>(op))
     return getTileShape(op->getOpOperand(1));
 
   if (isa<xegpu::DpasOp>(op)) {
@@ -295,12 +297,36 @@ void XeGPUBlockingPass::runOnOperation() {
     Type elemTy = type.getElementType();
     Type newTy;
 
-    if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type))
-      newTy = xegpu::TensorDescType::get(
-          ctx, tileShape, elemTy, tdescTy.getEncoding(),
-          tdescTy.getLayoutAttr().dropInstData());
-    else
+    if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type)) {
+
+      Attribute encoding = tdescTy.getEncoding();
+      // If the encoding is a ScatterTensorDescAttr, we need to
+      // potentially adjust the chunk size based on the inst_data.
+      if (tdescTy.isScattered()) {
+        auto scatterAttr =
+            llvm::dyn_cast_if_present<xegpu::ScatterTensorDescAttr>(encoding);
+        int64_t chunkSize = scatterAttr.getChunkSize().getInt();
+
+        if (chunkSize > 1) {
+          int64_t blockedChunkSize = chunkSize;
+          auto instData = tdescTy.getLayoutAttr().getInstData();
+          if (!instData.empty())
+            blockedChunkSize = instData.asArrayRef().back();
+
+          // To create a new attribute with a different chunk_size:
+          auto newEncoding = xegpu::ScatterTensorDescAttr::get(
+              ctx, scatterAttr.getMemorySpace().getValue(), blockedChunkSize);
+
+          encoding = newEncoding;
+        }
+      }
+
+      newTy =
+          xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
+                                     tdescTy.getLayoutAttr().dropInstData());
+    } else {
       newTy = type.clone(tileShape, elemTy);
+    }
 
     std::optional<SmallVector<int64_t>> ratio =
         computeShapeRatio(type.getShape(), tileShape);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 67d3bd9b393c..f977ba3c11bc 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -250,8 +250,7 @@ gpu.module @test_kernel {
 // -----
 #l = #xegpu.layout<inst_data = [16, 16]>
 #r = #xegpu.layout<inst_data = [16]>
-
-gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+gpu.module @test_kernel  {
   gpu.func @reduce_dim_0(%a: memref<16x512xf32>, %b: memref<512xf32>)  kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
     %acc = arith.constant dense<0.0> : vector<64xf32>
     %c64 = arith.constant 64 : index
@@ -271,8 +270,7 @@ gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<
 // -----
 #l = #xegpu.layout<inst_data = [16, 16]>
 #r = #xegpu.layout<inst_data = [16]>
-
-gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+gpu.module @test_kernel   {
   gpu.func @reduce_dim_1(%a: memref<512x32xf32>, %b: memref<512xf32>)  kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
     %c1 = arith.constant 1 : index
     %c32 = arith.constant 32 : index
@@ -299,8 +297,7 @@ gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<
 // -----
 #r = #xegpu.layout<inst_data = [16]>
 #l = #xegpu.layout<inst_data = [16, 16]>
-
-gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+gpu.module @test_kernel   {
   gpu.func @broadcast_dim_0(%a: memref<512xf32>, %b: memref<16x512xf32>)  kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
 
     %c64 = arith.constant 64 : index
@@ -319,8 +316,7 @@ gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<
 // -----
 #r = #xegpu.layout<inst_data = [16]>
 #l = #xegpu.layout<inst_data = [16, 16]>
-
-gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+gpu.module @test_kernel  {
   gpu.func @broadcast_dim_1(%a: memref<512xf32>, %b: memref<16x512xf32>)  kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
 
     %c32 = arith.constant 32 : index
@@ -340,8 +336,7 @@ gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<
 // -----
 #l = #xegpu.layout<inst_data = [16, 8]>
 #t = #xegpu.layout<inst_data = [8, 16]>
-
-gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+gpu.module @test_kernel   {
   gpu.func @transpose(%a: memref<512x8xf32>, %b: memref<8x512xf32>)  kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
 
     %c32 = arith.constant 32 : index
@@ -355,4 +350,100 @@ gpu.module @kernel  attributes {spirv.target_env = #spirv.target_env<#spirv.vce<
     xegpu.store_nd %2, %3: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
     gpu.return
   }
-}
\ No newline at end of file
+}
+
+// -----
+gpu.module @test_kernel {
+  // CHECK-LABEL: test_prefetch_load_store_update
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+   // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex>
+   // CHECK-COUNT-2: xegpu.load  {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
+  // CHECK-COUNT-2: xegpu.store  {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+
+  gpu.func @test_prefetch_load_store_update(%src: ui64)  {
+
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+   
+    %delta = arith.constant dense<[
+    32,   32,  32,  32,  32,  32,  32,  32,
+    32,   32,  32,  32,  32,  32,  32,  64,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 256 
+    ]> : vector<32xindex>
+    %new_tdesc = xegpu.update_offset %tdesc, %delta
+              : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xindex>     
+ 
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+
+    %ld_vec = xegpu.load %new_tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
+
+    %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32>
+    xegpu.store %st_vec, %tdesc, %mask: 
+                 vector<32xf32>, 
+                 !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, 
+                 vector<32xi1>
+  
+    gpu.return
+  }
+  
+}
+
+// -----
+
+gpu.module @test_kernel   {
+  // CHECK-LABEL: test_prefetch_load_store_update_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+   // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
+   // CHECK-COUNT-4: xegpu.load  {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<2x16xf32>
+  // CHECK-COUNT-4: xegpu.store  {{.*}} : vector<2x16xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
+
+  gpu.func @test_prefetch_load_store_update_chunk(%src: ui64)  {
+
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+   
+    %delta = arith.constant dense<[
+      32,   32,  32,  32,  32,  32,  32,  32,
+      32,   32,  32,  32,  32,  32,  32,  64,
+      128, 128, 128, 128, 128, 128, 128, 128,
+      128, 128, 128, 128, 128, 128, 128, 256 
+    ]> : vector<32xindex>
+    %new_tdesc = xegpu.update_offset %tdesc, %delta
+              : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xindex>     
+ 
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+
+    %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<4x32xf32>
+
+    %st_vec = arith.addf %ld_vec, %ld_vec : vector<4x32xf32>
+    xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>: 
+                 vector<4x32xf32>, 
+                 !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, 
+                 vector<32xi1>
+  
+    gpu.return
+  }
+}
+
+
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 4400d6d9625f..c84eb7419854 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -102,14 +102,14 @@ struct TestXeGPUUnrollingPatterns
           // attribute
           if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type)) {
             Attribute encoding = tdescTy.getEncoding();
-            auto layout = llvm::dyn_cast_if_present<xegpu::LayoutAttr>(
-                tdescTy.getLayout());
+            auto layout = tdescTy.getLayoutAttr();
 
             // If the encoding is a ScatterTensorDescAttr, we need to
             // potentially adjust the chunk size based on the inst_data.
-            if (encoding && mlir::isa<xegpu::ScatterTensorDescAttr>(encoding)) {
+            if (tdescTy.isScattered()) {
               auto scatterAttr =
-                  mlir::dyn_cast<xegpu::ScatterTensorDescAttr>(encoding);
+                  llvm::dyn_cast_if_present<xegpu::ScatterTensorDescAttr>(
+                      encoding);
               int64_t chunkSize = scatterAttr.getChunkSize().getInt();
 
               if (chunkSize > 1) {
@@ -118,12 +118,10 @@ struct TestXeGPUUnrollingPatterns
                 if (!instData.empty())
                   blockedChunkSize = instData.asArrayRef().back();
 
-                auto chunkSizeAttr = mlir::IntegerAttr::get(
-                    mlir::IntegerType::get(ctx, 64), blockedChunkSize);
-
                 // To create a new attribute with a different chunk_size:
                 auto newEncoding = xegpu::ScatterTensorDescAttr::get(
-                    ctx, scatterAttr.getMemorySpace(), chunkSizeAttr);
+                    ctx, scatterAttr.getMemorySpace().getValue(),
+                    blockedChunkSize);
 
                 encoding = newEncoding;
               }

From 7b6963ea672f8fedbbaefd15eaca943495709d37 Mon Sep 17 00:00:00 2001
From: Muhammad Omair Javaid <omair.javaid@linaro.org>
Date: Thu, 19 Jun 2025 03:06:46 +0500
Subject: [PATCH 0875/1322] [compiler-rt] [Fuzzer] Fix tests linking buildbot
 failure (#144495)

Fix for #144495 by 6f4add3 broke sanitizer-aarch64-linux buildbot.

compiler-rt/lib/fuzzer/tests build failed because the linker was
looking gcc_s without '-l' appended.

The CMake script was adding the library name without the required
'-l' prefix. This patch adds the -l prefix changing gcc_s to -lgcc_s
and gcc to -lgcc.

https://lab.llvm.org/buildbot/#/builders/51/builds/18170
---
 compiler-rt/lib/fuzzer/tests/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/fuzzer/tests/CMakeLists.txt b/compiler-rt/lib/fuzzer/tests/CMakeLists.txt
index 543f486a9d50..c5885ccccd20 100644
--- a/compiler-rt/lib/fuzzer/tests/CMakeLists.txt
+++ b/compiler-rt/lib/fuzzer/tests/CMakeLists.txt
@@ -45,10 +45,10 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND
     list(APPEND FUZZER_UNWINDER_LIBS ${COMPILER_RT_UNWINDER_LINK_LIBS})
   elseif(COMPILER_RT_HAS_GCC_S_LIB)
     # As a fallback, use the shared libgcc_s library.
-    list(APPEND FUZZER_UNWINDER_LIBS gcc_s)
+    list(APPEND FUZZER_UNWINDER_LIBS -lgcc_s)
   elseif(COMPILER_RT_HAS_GCC_LIB)
     # As a final fallback, use the static libgcc library.
-    list(APPEND FUZZER_UNWINDER_LIBS gcc)
+    list(APPEND FUZZER_UNWINDER_LIBS -lgcc)
   elseif(NOT COMPILER_RT_USE_BUILTINS_LIBRARY)
     # If no unwinder is found and we aren't using the builtins library
     message(FATAL_ERROR "Fuzzer tests require a suitable unwinder, but none was found.")

From 00ae89a1cbece94412cf832e47fdf449a611ad24 Mon Sep 17 00:00:00 2001
From: zGoldthorpe <Zach.Goldthorpe@amd.com>
Date: Wed, 18 Jun 2025 16:35:01 -0600
Subject: [PATCH 0876/1322] Revert "[IPO] Added attributor for identifying
 invariant loads" (#144808)

Reverts llvm/llvm-project#141800

The implementation critically misunderstands the `AAMemoryBehavior`
attributor, which it relies on heavily.

@shiltian, since I do not have commit permissions.
---
 llvm/include/llvm/Transforms/IPO/Attributor.h |  41 --
 llvm/lib/Transforms/IPO/Attributor.cpp        |   2 -
 .../Transforms/IPO/AttributorAttributes.cpp   | 339 ----------------
 .../Attributor/AMDGPU/tag-invariant-loads.ll  | 382 ------------------
 .../Attributor/dereferenceable-1.ll           |   1 +
 .../Attributor/value-simplify-local-remote.ll |  22 +-
 6 files changed, 15 insertions(+), 772 deletions(-)
 delete mode 100644 llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll

diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index f19f3292c479..e6eb756df987 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -6335,47 +6335,6 @@ struct AAUnderlyingObjects : AbstractAttribute {
                           AA::ValueScope Scope = AA::Interprocedural) const = 0;
 };
 
-/// An abstract interface for identifying pointers from which loads can be
-/// marked invariant.
-struct AAInvariantLoadPointer : public AbstractAttribute {
-  AAInvariantLoadPointer(const IRPosition &IRP) : AbstractAttribute(IRP) {}
-
-  /// See AbstractAttribute::isValidIRPositionForInit
-  static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) {
-    if (!IRP.getAssociatedType()->isPointerTy())
-      return false;
-
-    return AbstractAttribute::isValidIRPositionForInit(A, IRP);
-  }
-
-  /// Create an abstract attribute view for the position \p IRP.
-  static AAInvariantLoadPointer &createForPosition(const IRPosition &IRP,
-                                                   Attributor &A);
-
-  /// Return true if the pointer's contents are known to remain invariant.
-  virtual bool isKnownInvariant() const = 0;
-  virtual bool isKnownLocallyInvariant() const = 0;
-
-  /// Return true if the pointer's contents are assumed to remain invariant.
-  virtual bool isAssumedInvariant() const = 0;
-  virtual bool isAssumedLocallyInvariant() const = 0;
-
-  /// See AbstractAttribute::getName().
-  StringRef getName() const override { return "AAInvariantLoadPointer"; }
-
-  /// See AbstractAttribute::getIdAddr().
-  const char *getIdAddr() const override { return &ID; }
-
-  /// This function should return true if the type of the \p AA is
-  /// AAInvariantLoadPointer
-  static bool classof(const AbstractAttribute *AA) {
-    return (AA->getIdAddr() == &ID);
-  }
-
-  /// Unique ID (due to the unique address).
-  static const char ID;
-};
-
 /// An abstract interface for address space information.
 struct AAAddressSpace : public StateWrapper<BooleanState, AbstractAttribute> {
   AAAddressSpace(const IRPosition &IRP, Attributor &A)
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index a2548258ddaf..dac1f7a30c37 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -3612,8 +3612,6 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
       if (SimplifyAllLoads)
         getAssumedSimplified(IRPosition::value(I), nullptr,
                              UsedAssumedInformation, AA::Intraprocedural);
-      getOrCreateAAFor<AAInvariantLoadPointer>(
-          IRPosition::value(*LI->getPointerOperand()));
       getOrCreateAAFor<AAAddressSpace>(
           IRPosition::value(*LI->getPointerOperand()));
     } else {
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 5cb8f888354b..3799a696f67a 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -191,7 +191,6 @@ PIPE_OPERATOR(AAInterFnReachability)
 PIPE_OPERATOR(AAPointerInfo)
 PIPE_OPERATOR(AAAssumptionInfo)
 PIPE_OPERATOR(AAUnderlyingObjects)
-PIPE_OPERATOR(AAInvariantLoadPointer)
 PIPE_OPERATOR(AAAddressSpace)
 PIPE_OPERATOR(AAAllocationInfo)
 PIPE_OPERATOR(AAIndirectCallInfo)
@@ -12534,342 +12533,6 @@ private:
 };
 } // namespace
 
-/// --------------------- Invariant Load Pointer -------------------------------
-namespace {
-
-struct AAInvariantLoadPointerImpl
-    : public StateWrapper<BitIntegerState<uint8_t, 15>,
-                          AAInvariantLoadPointer> {
-
-  enum {
-    // pointer does not alias within the bounds of the function
-    IS_NOALIAS = 1 << 0,
-    // pointer is not involved in any effectful instructions within the bounds
-    // of the function
-    IS_NOEFFECT = 1 << 1,
-    // loads are invariant within the bounds of the function
-    IS_LOCALLY_INVARIANT = 1 << 2,
-    // memory lifetime is constrained within the bounds of the function
-    IS_LOCALLY_CONSTRAINED = 1 << 3,
-
-    IS_BEST_STATE = IS_NOALIAS | IS_NOEFFECT | IS_LOCALLY_INVARIANT |
-                    IS_LOCALLY_CONSTRAINED,
-  };
-  static_assert(getBestState() == IS_BEST_STATE, "Unexpected best state");
-
-  using Base =
-      StateWrapper<BitIntegerState<uint8_t, 15>, AAInvariantLoadPointer>;
-
-  // the BitIntegerState is optimistic about IS_NOALIAS and IS_NOEFFECT, but
-  // pessimistic about IS_KNOWN_INVARIANT
-  AAInvariantLoadPointerImpl(const IRPosition &IRP, Attributor &A)
-      : Base(IRP) {}
-
-  bool isKnownInvariant() const final {
-    return isKnownLocallyInvariant() && isKnown(IS_LOCALLY_CONSTRAINED);
-  }
-
-  bool isKnownLocallyInvariant() const final {
-    if (isKnown(IS_LOCALLY_INVARIANT))
-      return true;
-    return isKnown(IS_NOALIAS | IS_NOEFFECT);
-  }
-
-  bool isAssumedInvariant() const final {
-    return isAssumedLocallyInvariant() && isAssumed(IS_LOCALLY_CONSTRAINED);
-  }
-
-  bool isAssumedLocallyInvariant() const final {
-    if (isAssumed(IS_LOCALLY_INVARIANT))
-      return true;
-    return isAssumed(IS_NOALIAS | IS_NOEFFECT);
-  }
-
-  ChangeStatus updateImpl(Attributor &A) override {
-    ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
-    Changed |= updateNoAlias(A);
-    if (requiresNoAlias() && !isAssumed(IS_NOALIAS))
-      return indicatePessimisticFixpoint();
-
-    Changed |= updateNoEffect(A);
-
-    Changed |= updateLocalInvariance(A);
-
-    return Changed;
-  }
-
-  ChangeStatus manifest(Attributor &A) override {
-    if (!isKnownInvariant())
-      return ChangeStatus::UNCHANGED;
-
-    ChangeStatus Changed = ChangeStatus::UNCHANGED;
-    const Value *Ptr = &getAssociatedValue();
-    const auto TagInvariantLoads = [&](const Use &U, bool &) {
-      if (U.get() != Ptr)
-        return true;
-      auto *I = dyn_cast<Instruction>(U.getUser());
-      if (!I)
-        return true;
-
-      // Ensure that we are only changing uses from the corresponding callgraph
-      // SSC in the case that the AA isn't run on the entire module
-      if (!A.isRunOn(I->getFunction()))
-        return true;
-
-      if (I->hasMetadata(LLVMContext::MD_invariant_load))
-        return true;
-
-      if (auto *LI = dyn_cast<LoadInst>(I)) {
-        LI->setMetadata(LLVMContext::MD_invariant_load,
-                        MDNode::get(LI->getContext(), {}));
-        Changed = ChangeStatus::CHANGED;
-      }
-      return true;
-    };
-
-    (void)A.checkForAllUses(TagInvariantLoads, *this, *Ptr);
-    return Changed;
-  }
-
-  /// See AbstractAttribute::getAsStr().
-  const std::string getAsStr(Attributor *) const override {
-    if (isKnownInvariant())
-      return "load-invariant pointer";
-    return "non-invariant pointer";
-  }
-
-  /// See AbstractAttribute::trackStatistics().
-  void trackStatistics() const override {}
-
-private:
-  /// Indicate that noalias is required for the pointer to be invariant.
-  bool requiresNoAlias() const {
-    switch (getPositionKind()) {
-    default:
-      // Conservatively default to require noalias.
-      return true;
-    case IRP_FLOAT:
-    case IRP_RETURNED:
-    case IRP_CALL_SITE:
-      return false;
-    case IRP_CALL_SITE_RETURNED: {
-      const auto &CB = cast<CallBase>(getAnchorValue());
-      return !isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-          &CB, /*MustPreserveNullness=*/false);
-    }
-    case IRP_ARGUMENT: {
-      const Function *F = getAssociatedFunction();
-      assert(F && "no associated function for argument");
-      return !isCallableCC(F->getCallingConv());
-    }
-    }
-  }
-
-  bool isExternal() const {
-    const Function *F = getAssociatedFunction();
-    if (!F)
-      return true;
-    return isCallableCC(F->getCallingConv()) &&
-           getPositionKind() != IRP_CALL_SITE_RETURNED;
-  }
-
-  ChangeStatus updateNoAlias(Attributor &A) {
-    if (isKnown(IS_NOALIAS) || !isAssumed(IS_NOALIAS))
-      return ChangeStatus::UNCHANGED;
-
-    // Try to use AANoAlias.
-    if (const auto *ANoAlias = A.getOrCreateAAFor<AANoAlias>(
-            getIRPosition(), this, DepClassTy::REQUIRED)) {
-      if (ANoAlias->isKnownNoAlias()) {
-        addKnownBits(IS_NOALIAS);
-        return ChangeStatus::CHANGED;
-      }
-
-      if (!ANoAlias->isAssumedNoAlias()) {
-        removeAssumedBits(IS_NOALIAS);
-        return ChangeStatus::CHANGED;
-      }
-
-      return ChangeStatus::UNCHANGED;
-    }
-
-    // Try to infer noalias from argument attribute, since it is applicable for
-    // the duration of the function.
-    if (const Argument *Arg = getAssociatedArgument()) {
-      if (Arg->hasNoAliasAttr()) {
-        addKnownBits(IS_NOALIAS);
-        return ChangeStatus::UNCHANGED;
-      }
-
-      // Noalias information is not provided, and cannot be inferred,
-      // so we conservatively assume the pointer aliases.
-      removeAssumedBits(IS_NOALIAS);
-      return ChangeStatus::CHANGED;
-    }
-
-    return ChangeStatus::UNCHANGED;
-  }
-
-  ChangeStatus updateNoEffect(Attributor &A) {
-    if (isKnown(IS_NOEFFECT) || !isAssumed(IS_NOEFFECT))
-      return ChangeStatus::UNCHANGED;
-
-    if (!getAssociatedFunction())
-      return indicatePessimisticFixpoint();
-
-    const auto HasNoEffectLoads = [&](const Use &U, bool &) {
-      const auto *LI = dyn_cast<LoadInst>(U.getUser());
-      return !LI || !LI->mayHaveSideEffects();
-    };
-    if (!A.checkForAllUses(HasNoEffectLoads, *this, getAssociatedValue()))
-      return indicatePessimisticFixpoint();
-
-    // Try to use AAMemoryBehavior to infer readonly attribute.
-    if (const auto *AMemoryBehavior = A.getOrCreateAAFor<AAMemoryBehavior>(
-            getIRPosition(), this, DepClassTy::REQUIRED)) {
-      if (!AMemoryBehavior->isAssumedReadOnly())
-        return indicatePessimisticFixpoint();
-
-      if (AMemoryBehavior->isKnownReadOnly()) {
-        addKnownBits(IS_NOEFFECT);
-        return ChangeStatus::UNCHANGED;
-      }
-
-      return ChangeStatus::UNCHANGED;
-    }
-
-    if (const Argument *Arg = getAssociatedArgument()) {
-      if (Arg->onlyReadsMemory()) {
-        addKnownBits(IS_NOEFFECT);
-        return ChangeStatus::UNCHANGED;
-      }
-
-      // Readonly information is not provided, and cannot be inferred from
-      // AAMemoryBehavior.
-      return indicatePessimisticFixpoint();
-    }
-
-    return ChangeStatus::UNCHANGED;
-  }
-
-  ChangeStatus updateLocalInvariance(Attributor &A) {
-    if (isKnown(IS_LOCALLY_INVARIANT) || !isAssumed(IS_LOCALLY_INVARIANT))
-      return ChangeStatus::UNCHANGED;
-
-    // try to infer invariance from underlying objects
-    const auto *AUO = A.getOrCreateAAFor<AAUnderlyingObjects>(
-        getIRPosition(), this, DepClassTy::REQUIRED);
-    if (!AUO)
-      return ChangeStatus::UNCHANGED;
-
-    bool UsedAssumedInformation = false;
-    const auto IsLocallyInvariantLoadIfPointer = [&](const Value &V) {
-      if (!V.getType()->isPointerTy())
-        return true;
-      const auto *IsInvariantLoadPointer =
-          A.getOrCreateAAFor<AAInvariantLoadPointer>(IRPosition::value(V), this,
-                                                     DepClassTy::REQUIRED);
-      // Conservatively fail if invariance cannot be inferred.
-      if (!IsInvariantLoadPointer)
-        return false;
-
-      if (IsInvariantLoadPointer->isKnownLocallyInvariant())
-        return true;
-      if (!IsInvariantLoadPointer->isAssumedLocallyInvariant())
-        return false;
-
-      UsedAssumedInformation = true;
-      return true;
-    };
-    if (!AUO->forallUnderlyingObjects(IsLocallyInvariantLoadIfPointer))
-      return indicatePessimisticFixpoint();
-
-    if (const auto *CB = dyn_cast<CallBase>(&getAnchorValue())) {
-      if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-              CB, /*MustPreserveNullness=*/false)) {
-        for (const Value *Arg : CB->args()) {
-          if (!IsLocallyInvariantLoadIfPointer(*Arg))
-            return indicatePessimisticFixpoint();
-        }
-      }
-    }
-
-    if (!UsedAssumedInformation) {
-      // Pointer is known and not just assumed to be locally invariant.
-      addKnownBits(IS_LOCALLY_INVARIANT);
-      return ChangeStatus::CHANGED;
-    }
-
-    return ChangeStatus::UNCHANGED;
-  }
-};
-
-struct AAInvariantLoadPointerFloating final : AAInvariantLoadPointerImpl {
-  AAInvariantLoadPointerFloating(const IRPosition &IRP, Attributor &A)
-      : AAInvariantLoadPointerImpl(IRP, A) {}
-};
-
-struct AAInvariantLoadPointerReturned final : AAInvariantLoadPointerImpl {
-  AAInvariantLoadPointerReturned(const IRPosition &IRP, Attributor &A)
-      : AAInvariantLoadPointerImpl(IRP, A) {}
-
-  void initialize(Attributor &) override {
-    removeAssumedBits(IS_LOCALLY_CONSTRAINED);
-  }
-};
-
-struct AAInvariantLoadPointerCallSiteReturned final
-    : AAInvariantLoadPointerImpl {
-  AAInvariantLoadPointerCallSiteReturned(const IRPosition &IRP, Attributor &A)
-      : AAInvariantLoadPointerImpl(IRP, A) {}
-
-  void initialize(Attributor &A) override {
-    const Function *F = getAssociatedFunction();
-    assert(F && "no associated function for return from call");
-
-    if (!F->isDeclaration() && !F->isIntrinsic())
-      return AAInvariantLoadPointerImpl::initialize(A);
-
-    const auto &CB = cast<CallBase>(getAnchorValue());
-    if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
-            &CB, /*MustPreserveNullness=*/false))
-      return AAInvariantLoadPointerImpl::initialize(A);
-
-    if (F->onlyReadsMemory() && F->hasNoSync())
-      return AAInvariantLoadPointerImpl::initialize(A);
-
-    // At this point, the function is opaque, so we conservatively assume
-    // non-invariance.
-    indicatePessimisticFixpoint();
-  }
-};
-
-struct AAInvariantLoadPointerArgument final : AAInvariantLoadPointerImpl {
-  AAInvariantLoadPointerArgument(const IRPosition &IRP, Attributor &A)
-      : AAInvariantLoadPointerImpl(IRP, A) {}
-
-  void initialize(Attributor &) override {
-    const Function *F = getAssociatedFunction();
-    assert(F && "no associated function for argument");
-
-    if (!isCallableCC(F->getCallingConv())) {
-      addKnownBits(IS_LOCALLY_CONSTRAINED);
-      return;
-    }
-
-    if (!F->hasLocalLinkage())
-      removeAssumedBits(IS_LOCALLY_CONSTRAINED);
-  }
-};
-
-struct AAInvariantLoadPointerCallSiteArgument final
-    : AAInvariantLoadPointerImpl {
-  AAInvariantLoadPointerCallSiteArgument(const IRPosition &IRP, Attributor &A)
-      : AAInvariantLoadPointerImpl(IRP, A) {}
-};
-} // namespace
-
 /// ------------------------ Address Space  ------------------------------------
 namespace {
 
@@ -13375,7 +13038,6 @@ const char AAInterFnReachability::ID = 0;
 const char AAPointerInfo::ID = 0;
 const char AAAssumptionInfo::ID = 0;
 const char AAUnderlyingObjects::ID = 0;
-const char AAInvariantLoadPointer::ID = 0;
 const char AAAddressSpace::ID = 0;
 const char AAAllocationInfo::ID = 0;
 const char AAIndirectCallInfo::ID = 0;
@@ -13510,7 +13172,6 @@ CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialValues)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUndef)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFPClass)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPointerInfo)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAInvariantLoadPointer)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAddressSpace)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAllocationInfo)
 
diff --git a/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll
deleted file mode 100644
index ace68a19bf41..000000000000
--- a/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll
+++ /dev/null
@@ -1,382 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=attributor %s -S | FileCheck %s --check-prefix=AMDGCN
-
-@G = addrspace(1) global i32 zeroinitializer, align 4
-declare void @clobber(i32) #0
-declare ptr addrspace(1) @get_ptr() #0
-declare noalias ptr addrspace(1) @get_noalias_ptr() #0
-declare noalias ptr addrspace(1) @get_untouched_ptr() #1
-
-define void @test_nonkernel(ptr addrspace(1) noalias %ptr) {
-; AMDGCN-LABEL: define void @test_nonkernel(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6:[0-9]+]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; may not be !invariant.load, as the caller may modify %ptr
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_plain(ptr addrspace(1) %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_plain(
-; AMDGCN-SAME: ptr addrspace(1) nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; may not be !invariant.load, as %ptr may alias a pointer in @clobber
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_noalias_ptr(ptr addrspace(1) noalias %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_noalias_ptr(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0:![0-9]+]]
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_gep(ptr addrspace(1) %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_gep(
-; AMDGCN-SAME: ptr addrspace(1) nofree readonly align 4 captures(none) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
-  %val = load i32, ptr addrspace(1) %gep, align 4
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_noalias_gep(ptr addrspace(1) noalias %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_noalias_gep(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly align 4 captures(none) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4, !invariant.load [[META0]]
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
-  %val = load i32, ptr addrspace(1) %gep, align 4
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_swap(ptr addrspace(1) noalias %ptr, i32 inreg %swap) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_swap(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]], i32 inreg [[SWAP:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    store i32 [[SWAP]], ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; cannot be !invariant.load due to the write to %ptr
-  store i32 %swap, ptr addrspace(1) %ptr, align 4
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_volatile(ptr addrspace(1) noalias %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_volatile(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef align 4 [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load volatile i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = load volatile i32, ptr addrspace(1) %ptr, align 4
-  ;; volatiles loads cannot be !invariant.load
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_unordered(ptr addrspace(1) noalias %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_unordered(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] unordered, align 4, !invariant.load [[META0]]
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = load atomic i32, ptr addrspace(1) %ptr unordered, align 4
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_monotonic(ptr addrspace(1) noalias %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_monotonic(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] monotonic, align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = load atomic i32, ptr addrspace(1) %ptr monotonic, align 4
-  ;; atomic loads with ordering guarantees may have side effects
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_global() {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_global(
-; AMDGCN-SAME: ) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) @G, align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = load i32, ptr addrspace(1) @G, align 4
-  ;; is not an !invariant.load as global variables may change
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define internal i32 @test_internal_noalias_load(ptr addrspace(1) %ptr) {
-; AMDGCN-LABEL: define internal i32 @test_internal_noalias_load(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
-; AMDGCN-NEXT:    ret i32 [[VAL]]
-;
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; is an !invariant.load due to its only caller @test_call_internal_noalias
-  ret i32 %val
-}
-
-define amdgpu_kernel void @test_call_internal_noalias(ptr addrspace(1) noalias %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal_noalias(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = call i32 @test_internal_noalias_load(ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7:[0-9]+]]
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = call i32 @test_internal_noalias_load(ptr addrspace(1) %ptr)
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define internal i32 @test_internal_load(ptr addrspace(1) noalias %ptr) {
-; AMDGCN-LABEL: define internal i32 @test_internal_load(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    ret i32 [[VAL]]
-;
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; may not be an !invariant.load since the pointer in @test_call_internal may alias
-  ret i32 %val
-}
-
-define amdgpu_kernel void @test_call_internal(ptr addrspace(1) %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal(
-; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = call i32 @test_internal_load(ptr addrspace(1) nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7]]
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = call i32 @test_internal_load(ptr addrspace(1) %ptr)
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define internal i32 @test_internal_written(ptr addrspace(1) %ptr) {
-; AMDGCN-LABEL: define internal i32 @test_internal_written(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    ret i32 [[VAL]]
-;
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; cannot be an !invariant.load because of the write in caller @test_call_internal_written
-  ret i32 %val
-}
-
-define amdgpu_kernel void @test_call_internal_written(ptr addrspace(1) noalias %ptr, i32 inreg %x) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal_written(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree captures(none) [[PTR:%.*]], i32 inreg [[X:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[VAL:%.*]] = call i32 @test_internal_written(ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7]]
-; AMDGCN-NEXT:    store i32 [[X]], ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %val = call i32 @test_internal_written(ptr addrspace(1) %ptr)
-  store i32 %x, ptr addrspace(1) %ptr
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_call_ptr() {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_call_ptr(
-; AMDGCN-SAME: ) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[PTR:%.*]] = call align 4 ptr addrspace(1) @get_ptr() #[[ATTR6]]
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %ptr = call ptr addrspace(1) @get_ptr()
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; may not be an !invariant.load since %ptr may alias
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_call_noalias_ptr() {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_call_noalias_ptr(
-; AMDGCN-SAME: ) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[PTR:%.*]] = call align 4 ptr addrspace(1) @get_noalias_ptr() #[[ATTR6]]
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %ptr = call ptr addrspace(1) @get_noalias_ptr()
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; may not be an !invariant.load since %ptr may have been written to before returning
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_call_untouched_ptr() {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_call_untouched_ptr(
-; AMDGCN-SAME: ) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[PTR:%.*]] = call noalias align 4 ptr addrspace(1) @get_untouched_ptr() #[[ATTR8:[0-9]+]]
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %ptr = call ptr addrspace(1) @get_untouched_ptr()
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_make_buffer(ptr addrspace(1) %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer(
-; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 noundef 0) #[[ATTR9:[0-9]+]]
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0)
-  %val = load i32, ptr addrspace(7) %rsrc, align 4
-  ;; original %ptr may alias
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_make_buffer_noalias(ptr addrspace(1) noalias %ptr) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer_noalias(
-; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 noundef 0) #[[ATTR9]]
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4, !invariant.load [[META0]]
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0)
-  %val = load i32, ptr addrspace(7) %rsrc, align 4
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_selected_load(i1 inreg %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) noalias %ptr.false) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_selected_load(
-; AMDGCN-SAME: i1 inreg [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[PTR:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PTR_TRUE]], ptr addrspace(1) [[PTR_FALSE]]
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %ptr = select i1 %cond, ptr addrspace(1) %ptr.true, ptr addrspace(1) %ptr.false
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; either pointer yields an !invariant.load
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_selected_load_partial_noalias(i1 inreg %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) %ptr.false) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_selected_load_partial_noalias(
-; AMDGCN-SAME: i1 inreg [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:    [[PTR:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PTR_TRUE]], ptr addrspace(1) [[PTR_FALSE]]
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-  %ptr = select i1 %cond, ptr addrspace(1) %ptr.true, ptr addrspace(1) %ptr.false
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; %ptr.false may alias, so no !invariant.load
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_branch_load(i1 %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) noalias %ptr.false) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_branch_load(
-; AMDGCN-SAME: i1 noundef [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:  [[ENTRY:.*:]]
-; AMDGCN-NEXT:    br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]]
-; AMDGCN:       [[TRUE]]:
-; AMDGCN-NEXT:    call void @clobber(i32 noundef 1) #[[ATTR6]]
-; AMDGCN-NEXT:    br label %[[FINISH:.*]]
-; AMDGCN:       [[FALSE]]:
-; AMDGCN-NEXT:    br label %[[FINISH]]
-; AMDGCN:       [[FINISH]]:
-; AMDGCN-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ [[PTR_TRUE]], %[[TRUE]] ], [ [[PTR_FALSE]], %[[FALSE]] ]
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-entry:
-  br i1 %cond, label %true, label %false
-true:
-  call void @clobber(i32 1)
-  br label %finish
-false:
-  br label %finish
-finish:
-  %ptr = phi ptr addrspace(1) [ %ptr.true, %true ], [ %ptr.false, %false ]
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; either pointer yields an !invariant.load
-  call void @clobber(i32 %val)
-  ret void
-}
-
-define amdgpu_kernel void @test_branch_load_partial_noalias(i1 %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) %ptr.false) {
-; AMDGCN-LABEL: define amdgpu_kernel void @test_branch_load_partial_noalias(
-; AMDGCN-SAME: i1 noundef [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
-; AMDGCN-NEXT:  [[ENTRY:.*:]]
-; AMDGCN-NEXT:    br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]]
-; AMDGCN:       [[TRUE]]:
-; AMDGCN-NEXT:    call void @clobber(i32 noundef 1) #[[ATTR6]]
-; AMDGCN-NEXT:    br label %[[FINISH:.*]]
-; AMDGCN:       [[FALSE]]:
-; AMDGCN-NEXT:    br label %[[FINISH]]
-; AMDGCN:       [[FINISH]]:
-; AMDGCN-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ [[PTR_TRUE]], %[[TRUE]] ], [ [[PTR_FALSE]], %[[FALSE]] ]
-; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
-; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
-; AMDGCN-NEXT:    ret void
-;
-entry:
-  br i1 %cond, label %true, label %false
-true:
-  call void @clobber(i32 1)
-  br label %finish
-false:
-  br label %finish
-finish:
-  %ptr = phi ptr addrspace(1) [ %ptr.true, %true ], [ %ptr.false, %false ]
-  %val = load i32, ptr addrspace(1) %ptr, align 4
-  ;; ptr.false may alias, so no !invariant.load
-  call void @clobber(i32 %val)
-  ret void
-}
-
-attributes #0 = { nofree norecurse nosync nounwind willreturn }
-attributes #1 = { nofree norecurse nosync nounwind willreturn readonly }
-;.
-; AMDGCN: [[META0]] = !{}
-;.
diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
index 5bff2a2e6b20..07e2d5ea1575 100644
--- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll
+++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
@@ -207,6 +207,7 @@ define void @f7_1(ptr %ptr, i1 %cnd) {
 ; CHECK-LABEL: define {{[^@]+}}@f7_1
 ; CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[PTR:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[A:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
+; CHECK-NEXT:    [[PTR_0:%.*]] = load i32, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[B:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
 ; CHECK-NEXT:    br i1 [[CND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if.true:
diff --git a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll
index 4767244800d2..374d5ba7ff52 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll
@@ -135,7 +135,7 @@ define internal %S @foo.1(ptr %foo.this) {
 ; TUNIT-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; TUNIT-NEXT:    store ptr [[FOO_THIS]], ptr [[FOO_THIS]], align 8
 ; TUNIT-NEXT:    call void @bar.2(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[FOO_THIS]]) #[[ATTR5:[0-9]+]]
-; TUNIT-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8:![0-9]+]]
+; TUNIT-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
 ; TUNIT-NEXT:    ret [[S]] [[FOO_RET]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite)
@@ -145,7 +145,7 @@ define internal %S @foo.1(ptr %foo.this) {
 ; CGSCC-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; CGSCC-NEXT:    store ptr [[FOO_THIS]], ptr [[FOO_THIS]], align 8
 ; CGSCC-NEXT:    call void @bar.2(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[FOO_THIS]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8:![0-9]+]]
+; CGSCC-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
 ; CGSCC-NEXT:    ret [[S]] [[FOO_RET]]
 ;
 entry:
@@ -234,7 +234,7 @@ define internal %S @bar.5(ptr %this) {
 ; TUNIT-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; TUNIT-NEXT:    store ptr [[THIS]], ptr [[THIS]], align 8
 ; TUNIT-NEXT:    call void @baz.6(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) [[RETVAL]], ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4]]
-; TUNIT-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]]
+; TUNIT-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
 ; TUNIT-NEXT:    ret [[S]] [[BAR_RET]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite)
@@ -244,7 +244,7 @@ define internal %S @bar.5(ptr %this) {
 ; CGSCC-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; CGSCC-NEXT:    store ptr [[THIS]], ptr [[THIS]], align 8
 ; CGSCC-NEXT:    call void @baz.6(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR9:[0-9]+]]
-; CGSCC-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]]
+; CGSCC-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
 ; CGSCC-NEXT:    ret [[S]] [[BAR_RET]]
 ;
 entry:
@@ -286,7 +286,7 @@ define internal void @boom(ptr %this, ptr %data) {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    [[DATA_ADDR:%.*]] = alloca ptr, i32 0, align 8
 ; TUNIT-NEXT:    store ptr [[DATA]], ptr [[DATA_ADDR]], align 8
-; TUNIT-NEXT:    [[V:%.*]] = load ptr, ptr [[DATA_ADDR]], align 8, !invariant.load [[META8]]
+; TUNIT-NEXT:    [[V:%.*]] = load ptr, ptr [[DATA_ADDR]], align 8
 ; TUNIT-NEXT:    store ptr [[V]], ptr [[THIS]], align 8
 ; TUNIT-NEXT:    ret void
 ;
@@ -342,6 +342,14 @@ define %S.2 @t3.helper() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[S_2:%.*]], align 8
 ; CHECK-NEXT:    call void @ext1(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]])
+; CHECK-NEXT:    [[DOTFCA_0_LOAD:%.*]] = load ptr, ptr [[RETVAL]], align 8
+; CHECK-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [[S_2]] poison, ptr [[DOTFCA_0_LOAD]], 0
+; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [[S_2]], ptr [[RETVAL]], i32 0, i32 1
+; CHECK-NEXT:    [[DOTFCA_1_LOAD:%.*]] = load i64, ptr [[DOTFCA_1_GEP]], align 8
+; CHECK-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [[S_2]] [[DOTFCA_0_INSERT]], i64 [[DOTFCA_1_LOAD]], 1
+; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [[S_2]], ptr [[RETVAL]], i32 0, i32 2
+; CHECK-NEXT:    [[DOTFCA_2_LOAD:%.*]] = load i64, ptr [[DOTFCA_2_GEP]], align 8
+; CHECK-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [[S_2]] [[DOTFCA_1_INSERT]], i64 [[DOTFCA_2_LOAD]], 2
 ; CHECK-NEXT:    ret [[S_2]] zeroinitializer
 ;
 entry:
@@ -500,7 +508,7 @@ define internal %S @t4a(ptr %this) {
 ; CGSCC-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, i32 0, align 8
 ; CGSCC-NEXT:    store ptr [[THIS]], ptr [[THIS]], align 8
 ; CGSCC-NEXT:    call void @t4b(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[THIS]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TMP0:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]]
+; CGSCC-NEXT:    [[TMP0:%.*]] = load [[S]], ptr [[RETVAL]], align 8
 ; CGSCC-NEXT:    ret [[S]] [[TMP0]]
 ;
 entry:
@@ -615,7 +623,6 @@ entry:
 ; TUNIT: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
 ; TUNIT: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2}
 ; TUNIT: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; TUNIT: [[META8]] = !{}
 ;.
 ; CGSCC: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]}
 ; CGSCC: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
@@ -625,5 +632,4 @@ entry:
 ; CGSCC: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
 ; CGSCC: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2}
 ; CGSCC: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; CGSCC: [[META8]] = !{}
 ;.

From e0933ab5ae4856c4aa188a5ea16716b3a8d0840b Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Wed, 18 Jun 2025 15:39:25 -0700
Subject: [PATCH 0877/1322] Revert "[lldb][target] Add progress report for
 wait-attaching to process" (#144810)

This is breaking TestCreateAfterAttach.py on Ubuntu:

```
======================================================================
FAIL: test_create_after_attach_dwo (TestCreateAfterAttach.CreateAfterAttachTestCase.test_create_after_attach_dwo)
   Test thread creation after process attach.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/lldbtest.py", line 1804, in test_method
    return attrvalue(self)
           ^^^^^^^^^^^^^^^
  File "/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/decorators.py", line 149, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/test/API/functionalities/thread/create_after_attach/TestCreateAfterAttach.py", line 36, in test_create_after_attach
    self.runCmd("process attach -p " + str(pid))
  File "/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/lldbtest.py", line 1005, in runCmd
    self.assertTrue(self.res.Succeeded(), msg + output)
AssertionError: False is not true : Command 'process attach -p 1474309' did not return successfully
Error output:
error: attach failed: lost connection
```

on the buildbots for lldb-remote-linux-ubuntu, lldb-arm-ubuntu,
lldb-aarch64-ubuntu, lldb-arm-ubuntu.
---
 lldb/source/Target/Target.cpp                 |  1 -
 .../TestProgressReporting.py                  | 31 -------------------
 2 files changed, 32 deletions(-)

diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 8f8d2ef21cc5..45a9e1196a04 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -3546,7 +3546,6 @@ llvm::Expected<TraceSP> Target::GetTraceOrCreate() {
 }
 
 Status Target::Attach(ProcessAttachInfo &attach_info, Stream *stream) {
-  Progress attach_progress("Waiting to attach to process");
   m_stats.SetLaunchOrAttachTime();
   auto state = eStateInvalid;
   auto process_sp = GetProcessSP();
diff --git a/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
index 8198c50a5ff0..9af53845ca1b 100644
--- a/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
+++ b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
@@ -2,7 +2,6 @@
 Test that we are able to broadcast and receive progress events from lldb
 """
 import lldb
-import threading
 
 import lldbsuite.test.lldbutil as lldbutil
 
@@ -17,36 +16,6 @@ class TestProgressReporting(TestBase):
             self.broadcaster, lldb.SBDebugger.eBroadcastBitProgress
         )
 
-    def test_wait_attach_progress_reporting(self):
-        """Test that progress reports for wait attaching work as intended."""
-        self.build()
-        target = self.dbg.CreateTarget(None)
-
-        # Wait attach to a process, then check to see that a progress report was created
-        # and that its message is correct for waiting to attach to a process.
-        class AttachThread(threading.Thread):
-            def __init__(self, target):
-                threading.Thread.__init__(self)
-                self.target = target
-
-            def run(self):
-                self.target.AttachToProcessWithName(
-                    lldb.SBListener(), "a.out", True, lldb.SBError()
-                )
-
-        thread = AttachThread(target)
-        thread.start()
-
-        event = lldbutil.fetch_next_event(self, self.listener, self.broadcaster)
-        progress_data = lldb.SBDebugger.GetProgressDataFromEvent(event)
-        message = progress_data.GetValueForKey("message").GetStringValue(100)
-        self.assertEqual(message, "Waiting to attach to process")
-
-        # Interrupt the process attach to keep the test from stalling.
-        target.process.SendAsyncInterrupt()
-
-        thread.join()
-
     def test_dwarf_symbol_loading_progress_report(self):
         """Test that we are able to fetch dwarf symbol loading progress events"""
         self.build()

From 780c0ef7fb97027aa21c2ee6b02282693f908a20 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Wed, 18 Jun 2025 15:41:13 -0700
Subject: [PATCH 0878/1322] [llvm] explicitly link llvm-jitlink-executor with
 ExecutionEngine (#144778)

## Overview
Explicitly link `llvm-jitlink-executor` with `ExecutionEngine` to avoid
link failures when building LLVM as a Windows DLL. This link dependency
should probably have always been declared here, but didn't matter when
building against an LLVM static library because it was (presumably)
picked up as a transitive dependency.

This change is required to enable the Windows DLL build because
`llvm-jitlink-executor` is declared using `add_llvm_utility` which
invokes `add_llvm_executable` with
[`DISABLE_LLVM_LINK_LLVM_DYLIB`](https://github.com/llvm/llvm-project/blob/main/llvm/cmake/modules/AddLLVM.cmake#L500-L502)
so it links statically against its dependencies instead of against the
main LLVM library.

## Background
The effort to support building LLVM as a Windows DLL is tracked in
#109483. Additional context is provided in [this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307).
---
 llvm/tools/llvm-jitlink/llvm-jitlink-executor/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/CMakeLists.txt b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/CMakeLists.txt
index f2daa294eec0..792ecf544f61 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/CMakeLists.txt
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  ExecutionEngine
   OrcShared
   OrcTargetProcess
   Support

From bb1f5c3189c4d8d30e3b1273e0b774a7ccdbd86a Mon Sep 17 00:00:00 2001
From: Guy David <49722543+guy-david@users.noreply.github.com>
Date: Thu, 19 Jun 2025 01:53:36 +0300
Subject: [PATCH 0879/1322] [AArch64] Lower jump table cases threshold to 10
 (#143632)

Previous stabs at this setting
(https://github.com/llvm/llvm-project/pull/71166) hypertuned it for
SPEC2017, but Clang's own compilation can benefit from a slightly lower
threshold, yielding a 0.3% improvement in compile time, while still not
regressing SPEC.

Most notable beneficiaries of this change are:
 - `llvm::Instruction::getNumSuccessors` (11 cases)
 - `llvm::Instruction::getSuccessor` (11 cases)

Test Suite with a bootstrapped build:
```
Tests: 4316
Metric: compile_time

Program                                       compile_time
                                              lhs          rhs    diff
SingleSour...ce/UnitTests/SignlessTypes/div     0.02         0.02  3.0%
SingleSour.../UnitTests/SignlessTypes/cast2     0.02         0.02  2.8%
SingleSource/Benchmarks/Misc/flops-4            0.02         0.02  1.9%
SingleSour...ebra/solvers/cholesky/cholesky     0.05         0.05  1.8%
SingleSour...tTests/2020-01-06-coverage-006     0.02         0.02  1.7%
SingleSour...ce/Benchmarks/Stanford/FloatMM     0.03         0.03  1.7%
SingleSour...9-04-16-BitfieldInitialization     0.02         0.02  1.7%
SingleSour...nitTests/2003-07-08-BitOpsTest     0.02         0.02  1.7%
MultiSourc...marks/Prolangs-C++/vcirc/vcirc     0.02         0.02  1.6%
MultiSourc...Prolangs-C/fixoutput/fixoutput     0.05         0.05  1.5%
SingleSour...h/stencils/jacobi-1d/jacobi-1d     0.04         0.04  1.4%
MultiSourc...rks/Prolangs-C++/office/office     0.28         0.28  1.4%
SingleSour...arks/Adobe-C++/functionobjects     0.39         0.40  1.3%
SingleSour...Tests/2003-10-29-ScalarReplBug     0.02         0.02  1.2%
SingleSour...arks/Adobe-C++/stepanov_vector     0.41         0.42  1.2%
                           Geomean difference                     -0.3%
      compile_time
l/r            lhs          rhs        diff
count  4316.000000  4316.000000  469.000000
mean   0.057747     0.057595    -0.003034
std    0.544528     0.543139     0.007625
min    0.000000     0.000000    -0.035294
25%    0.000000     0.000000    -0.007006
50%    0.000000     0.000000    -0.003257
75%    0.000000     0.000000     0.000000
max    18.295300    18.252500    0.030151
```
---
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 2 +-
 llvm/test/CodeGen/AArch64/min-jump-table.ll  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index a28e6bad0dca..68ed10570a52 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -73,7 +73,7 @@ static cl::opt<AArch64PAuth::AuthCheckMethod>
                                cl::values(AUTH_CHECK_METHOD_CL_VALUES_LR));
 
 static cl::opt<unsigned> AArch64MinimumJumpTableEntries(
-    "aarch64-min-jump-table-entries", cl::init(13), cl::Hidden,
+    "aarch64-min-jump-table-entries", cl::init(10), cl::Hidden,
     cl::desc("Set minimum number of entries to use a jump table on AArch64"));
 
 static cl::opt<unsigned> AArch64StreamingHazardSize(
diff --git a/llvm/test/CodeGen/AArch64/min-jump-table.ll b/llvm/test/CodeGen/AArch64/min-jump-table.ll
index 98b89210f5a0..7cdff6e435f7 100644
--- a/llvm/test/CodeGen/AArch64/min-jump-table.ll
+++ b/llvm/test/CodeGen/AArch64/min-jump-table.ll
@@ -105,7 +105,7 @@ entry:
 ; CHECK4-NEXT: Jump Tables:
 ; CHECK8-NEXT: Jump Tables:
 ; CHECK12-NEXT: Jump Tables:
-; CHECK-DEFAULT-NOT: {{^}}Jump Tables:
+; CHECK-DEFAULT: {{^}}Jump Tables:
 
 bb1: tail call void @ext(i32 1, i32 0) br label %return
 bb2: tail call void @ext(i32 2, i32 2) br label %return

From 5f69d680e2cc94dcb30a7f29e8144725530a6da4 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Wed, 18 Jun 2025 19:30:43 -0400
Subject: [PATCH 0880/1322] Revert "[HLSL][SPIRV] Add vk::constant_id
 attribute." (#144812)

Reverts llvm/llvm-project#143544
---
 clang/include/clang/Basic/Attr.td             |   8 -
 clang/include/clang/Basic/AttrDocs.td         |  15 --
 clang/include/clang/Basic/Builtins.td         |  13 --
 .../clang/Basic/DiagnosticSemaKinds.td        |   4 -
 clang/include/clang/Sema/SemaHLSL.h           |   5 +-
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          |  74 ------
 clang/lib/CodeGen/CodeGenFunction.h           |   6 -
 clang/lib/Sema/SemaDecl.cpp                   |  13 --
 clang/lib/Sema/SemaDeclAttr.cpp               |   3 -
 clang/lib/Sema/SemaHLSL.cpp                   | 120 +---------
 .../test/AST/HLSL/vk.spec-constant.usage.hlsl | 130 -----------
 .../SpirvType.alignment.hlsl                  |   0
 .../SpirvType.hlsl                            |   0
 .../vk-features/vk.spec-constant.hlsl         | 210 ------------------
 .../test/SemaHLSL/vk.spec-constant.error.hlsl |  37 ---
 15 files changed, 2 insertions(+), 636 deletions(-)
 delete mode 100644 clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
 rename clang/test/CodeGenHLSL/{vk-features => inline-spirv}/SpirvType.alignment.hlsl (100%)
 rename clang/test/CodeGenHLSL/{vk-features => inline-spirv}/SpirvType.hlsl (100%)
 delete mode 100644 clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
 delete mode 100644 clang/test/SemaHLSL/vk.spec-constant.error.hlsl

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 27fea7dea0a5..f113cd2ba2fb 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -5023,14 +5023,6 @@ def HLSLVkExtBuiltinInput : InheritableAttr {
   let Documentation = [HLSLVkExtBuiltinInputDocs];
 }
 
-def HLSLVkConstantId : InheritableAttr {
-  let Spellings = [CXX11<"vk", "constant_id">];
-  let Args = [IntArgument<"Id">];
-  let Subjects = SubjectList<[ExternalGlobalVar]>;
-  let LangOpts = [HLSL];
-  let Documentation = [VkConstantIdDocs];
-}
-
 def RandomizeLayout : InheritableAttr {
   let Spellings = [GCC<"randomize_layout">];
   let Subjects = SubjectList<[Record]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 43442f177ab7..6051e1fc4511 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -8252,21 +8252,6 @@ and https://microsoft.github.io/hlsl-specs/proposals/0013-wave-size-range.html
   }];
 }
 
-def VkConstantIdDocs : Documentation {
-  let Category = DocCatFunction;
-  let Content = [{
-The ``vk::constant_id`` attribute specifies the id for a SPIR-V specialization
-constant. The attribute applies to const global scalar variables. The variable must be initialized with a C++11 constexpr.
-In SPIR-V, the
-variable will be replaced with an `OpSpecConstant` with the given id.
-The syntax is:
-
-.. code-block:: text
-
-  ``[[vk::constant_id(<Id>)]] const T Name = <Init>``
-}];
-}
-
 def RootSignatureDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index d65b3a5d2f44..68cd3d790e78 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -5065,19 +5065,6 @@ def HLSLGroupMemoryBarrierWithGroupSync: LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void()";
 }
 
-class HLSLScalarTemplate
-    : Template<["bool", "char", "short", "int", "long long int",
-                "unsigned short", "unsigned int", "unsigned long long int",
-                "__fp16", "float", "double"],
-               ["_bool", "_char", "_short", "_int", "_longlong", "_ushort",
-                "_uint", "_ulonglong", "_half", "_float", "_double"]>;
-
-def HLSLGetSpirvSpecConstant : LangBuiltin<"HLSL_LANG">, HLSLScalarTemplate {
-  let Spellings = ["__builtin_get_spirv_spec_constant"];
-  let Attributes = [NoThrow, Const, Pure];
-  let Prototype = "T(unsigned int, T)";
-}
-
 // Builtins for XRay.
 def XRayCustomEvent : Builtin {
   let Spellings = ["__xray_customevent"];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 34b798a09c21..979ff60b73b7 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12927,10 +12927,6 @@ def err_spirv_enum_not_int : Error<
 def err_spirv_enum_not_valid : Error<
    "invalid value for %select{storage class}0 argument">;
 
-def err_specialization_const
-    : Error<"variable with 'vk::constant_id' attribute must be a const "
-            "int/float/enum/bool and be initialized with a literal">;
-
 // errors of expect.with.probability
 def err_probability_not_constant_float : Error<
    "probability argument to __builtin_expect_with_probability must be constant "
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index 97091792ba23..33c4b8d1568b 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -98,8 +98,6 @@ public:
   HLSLWaveSizeAttr *mergeWaveSizeAttr(Decl *D, const AttributeCommonInfo &AL,
                                       int Min, int Max, int Preferred,
                                       int SpelledArgsCount);
-  HLSLVkConstantIdAttr *
-  mergeVkConstantIdAttr(Decl *D, const AttributeCommonInfo &AL, int Id);
   HLSLShaderAttr *mergeShaderAttr(Decl *D, const AttributeCommonInfo &AL,
                                   llvm::Triple::EnvironmentType ShaderType);
   HLSLParamModifierAttr *
@@ -137,7 +135,6 @@ public:
   void handleRootSignatureAttr(Decl *D, const ParsedAttr &AL);
   void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL);
   void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL);
-  void handleVkConstantIdAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_GroupThreadIDAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_GroupIDAttr(Decl *D, const ParsedAttr &AL);
@@ -174,7 +171,7 @@ public:
   QualType getInoutParameterType(QualType Ty);
 
   bool transformInitList(const InitializedEntity &Entity, InitListExpr *Init);
-  bool handleInitialization(VarDecl *VDecl, Expr *&Init);
+
   void deduceAddressSpace(VarDecl *Decl);
 
 private:
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index cbc5ef9cb0d5..ccf45c0c6ff1 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -12,7 +12,6 @@
 
 #include "CGBuiltin.h"
 #include "CGHLSLRuntime.h"
-#include "CodeGenFunction.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -215,43 +214,6 @@ static Intrinsic::ID getWaveActiveMaxIntrinsic(llvm::Triple::ArchType Arch,
   }
 }
 
-// Returns the mangled name for a builtin function that the SPIR-V backend
-// will expand into a spec Constant.
-static std::string getSpecConstantFunctionName(clang::QualType SpecConstantType,
-                                               ASTContext &Context) {
-  // The parameter types for our conceptual intrinsic function.
-  QualType ClangParamTypes[] = {Context.IntTy, SpecConstantType};
-
-  // Create a temporary FunctionDecl for the builtin fuction. It won't be
-  // added to the AST.
-  FunctionProtoType::ExtProtoInfo EPI;
-  QualType FnType =
-      Context.getFunctionType(SpecConstantType, ClangParamTypes, EPI);
-  DeclarationName FuncName = &Context.Idents.get("__spirv_SpecConstant");
-  FunctionDecl *FnDeclForMangling = FunctionDecl::Create(
-      Context, Context.getTranslationUnitDecl(), SourceLocation(),
-      SourceLocation(), FuncName, FnType, /*TSI=*/nullptr, SC_Extern);
-
-  // Attach the created parameter declarations to the function declaration.
-  SmallVector<ParmVarDecl *, 2> ParamDecls;
-  for (QualType ParamType : ClangParamTypes) {
-    ParmVarDecl *PD = ParmVarDecl::Create(
-        Context, FnDeclForMangling, SourceLocation(), SourceLocation(),
-        /*IdentifierInfo*/ nullptr, ParamType, /*TSI*/ nullptr, SC_None,
-        /*DefaultArg*/ nullptr);
-    ParamDecls.push_back(PD);
-  }
-  FnDeclForMangling->setParams(ParamDecls);
-
-  // Get the mangled name.
-  std::string Name;
-  llvm::raw_string_ostream MangledNameStream(Name);
-  MangleContext *Mangler = Context.createMangleContext();
-  Mangler->mangleName(FnDeclForMangling, MangledNameStream);
-  MangledNameStream.flush();
-  return Name;
-}
-
 Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
                                             const CallExpr *E,
                                             ReturnValueSlot ReturnValue) {
@@ -811,42 +773,6 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     return EmitRuntimeCall(
         Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID));
   }
-  case Builtin::BI__builtin_get_spirv_spec_constant_bool:
-  case Builtin::BI__builtin_get_spirv_spec_constant_short:
-  case Builtin::BI__builtin_get_spirv_spec_constant_ushort:
-  case Builtin::BI__builtin_get_spirv_spec_constant_int:
-  case Builtin::BI__builtin_get_spirv_spec_constant_uint:
-  case Builtin::BI__builtin_get_spirv_spec_constant_longlong:
-  case Builtin::BI__builtin_get_spirv_spec_constant_ulonglong:
-  case Builtin::BI__builtin_get_spirv_spec_constant_half:
-  case Builtin::BI__builtin_get_spirv_spec_constant_float:
-  case Builtin::BI__builtin_get_spirv_spec_constant_double: {
-    llvm::Function *SpecConstantFn = getSpecConstantFunction(E->getType());
-    llvm::Value *SpecId = EmitScalarExpr(E->getArg(0));
-    llvm::Value *DefaultVal = EmitScalarExpr(E->getArg(1));
-    llvm::Value *Args[] = {SpecId, DefaultVal};
-    return Builder.CreateCall(SpecConstantFn, Args);
-  }
   }
   return nullptr;
 }
-
-llvm::Function *clang::CodeGen::CodeGenFunction::getSpecConstantFunction(
-    const clang::QualType &SpecConstantType) {
-
-  // Find or create the declaration for the function.
-  llvm::Module *M = &CGM.getModule();
-  std::string MangledName =
-      getSpecConstantFunctionName(SpecConstantType, getContext());
-  llvm::Function *SpecConstantFn = M->getFunction(MangledName);
-
-  if (!SpecConstantFn) {
-    llvm::Type *IntType = ConvertType(getContext().IntTy);
-    llvm::Type *RetTy = ConvertType(SpecConstantType);
-    llvm::Type *ArgTypes[] = {IntType, RetTy};
-    llvm::FunctionType *FnTy = llvm::FunctionType::get(RetTy, ArgTypes, false);
-    SpecConstantFn = llvm::Function::Create(
-        FnTy, llvm::GlobalValue::ExternalLinkage, MangledName, M);
-  }
-  return SpecConstantFn;
-}
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 59f14b3e35fd..a5ab9df01dba 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4850,12 +4850,6 @@ public:
   llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitHLSLBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
                                    ReturnValueSlot ReturnValue);
-
-  // Returns a builtin function that the SPIR-V backend will expand into a spec
-  // constant.
-  llvm::Function *
-  getSpecConstantFunction(const clang::QualType &SpecConstantType);
-
   llvm::Value *EmitDirectXBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitSPIRVBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitScalarOrConstFoldImmArg(unsigned ICEArguments, unsigned Idx,
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index e1cccf068b5a..1bf72e5bb7b9 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -2890,8 +2890,6 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
     NewAttr = S.HLSL().mergeWaveSizeAttr(D, *WS, WS->getMin(), WS->getMax(),
                                          WS->getPreferred(),
                                          WS->getSpelledArgsCount());
-  else if (const auto *CI = dyn_cast<HLSLVkConstantIdAttr>(Attr))
-    NewAttr = S.HLSL().mergeVkConstantIdAttr(D, *CI, CI->getId());
   else if (const auto *SA = dyn_cast<HLSLShaderAttr>(Attr))
     NewAttr = S.HLSL().mergeShaderAttr(D, *SA, SA->getType());
   else if (isa<SuppressAttr>(Attr))
@@ -13759,10 +13757,6 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
     return;
   }
 
-  if (getLangOpts().HLSL)
-    if (!HLSL().handleInitialization(VDecl, Init))
-      return;
-
   // Get the decls type and save a reference for later, since
   // CheckInitializerTypes may change it.
   QualType DclT = VDecl->getType(), SavT = DclT;
@@ -14185,13 +14179,6 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
       }
     }
 
-    // HLSL variable with the `vk::constant_id` attribute must be initialized.
-    if (!Var->isInvalidDecl() && Var->hasAttr<HLSLVkConstantIdAttr>()) {
-      Diag(Var->getLocation(), diag::err_specialization_const);
-      Var->setInvalidDecl();
-      return;
-    }
-
     if (!Var->isInvalidDecl() && RealDecl->hasAttr<LoaderUninitializedAttr>()) {
       if (Var->getStorageClass() == SC_Extern) {
         Diag(Var->getLocation(), diag::err_loader_uninitialized_extern_decl)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index eba29e609cb0..1c2fa80e782d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7590,9 +7590,6 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_HLSLVkExtBuiltinInput:
     S.HLSL().handleVkExtBuiltinInputAttr(D, AL);
     break;
-  case ParsedAttr::AT_HLSLVkConstantId:
-    S.HLSL().handleVkConstantIdAttr(D, AL);
-    break;
   case ParsedAttr::AT_HLSLSV_GroupThreadID:
     S.HLSL().handleSV_GroupThreadIDAttr(D, AL);
     break;
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 9b43ee00810b..b55f4fd786b5 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -119,40 +119,6 @@ static ResourceClass getResourceClass(RegisterType RT) {
   llvm_unreachable("unexpected RegisterType value");
 }
 
-static Builtin::ID getSpecConstBuiltinId(QualType Type) {
-  const auto *BT = dyn_cast<BuiltinType>(Type);
-  if (!BT) {
-    if (!Type->isEnumeralType())
-      return Builtin::NotBuiltin;
-    return Builtin::BI__builtin_get_spirv_spec_constant_int;
-  }
-
-  switch (BT->getKind()) {
-  case BuiltinType::Bool:
-    return Builtin::BI__builtin_get_spirv_spec_constant_bool;
-  case BuiltinType::Short:
-    return Builtin::BI__builtin_get_spirv_spec_constant_short;
-  case BuiltinType::Int:
-    return Builtin::BI__builtin_get_spirv_spec_constant_int;
-  case BuiltinType::LongLong:
-    return Builtin::BI__builtin_get_spirv_spec_constant_longlong;
-  case BuiltinType::UShort:
-    return Builtin::BI__builtin_get_spirv_spec_constant_ushort;
-  case BuiltinType::UInt:
-    return Builtin::BI__builtin_get_spirv_spec_constant_uint;
-  case BuiltinType::ULongLong:
-    return Builtin::BI__builtin_get_spirv_spec_constant_ulonglong;
-  case BuiltinType::Half:
-    return Builtin::BI__builtin_get_spirv_spec_constant_half;
-  case BuiltinType::Float:
-    return Builtin::BI__builtin_get_spirv_spec_constant_float;
-  case BuiltinType::Double:
-    return Builtin::BI__builtin_get_spirv_spec_constant_double;
-  default:
-    return Builtin::NotBuiltin;
-  }
-}
-
 DeclBindingInfo *ResourceBindings::addDeclBindingInfo(const VarDecl *VD,
                                                       ResourceClass ResClass) {
   assert(getDeclBindingInfo(VD, ResClass) == nullptr &&
@@ -641,41 +607,6 @@ HLSLWaveSizeAttr *SemaHLSL::mergeWaveSizeAttr(Decl *D,
   return Result;
 }
 
-HLSLVkConstantIdAttr *
-SemaHLSL::mergeVkConstantIdAttr(Decl *D, const AttributeCommonInfo &AL,
-                                int Id) {
-
-  auto &TargetInfo = getASTContext().getTargetInfo();
-  if (TargetInfo.getTriple().getArch() != llvm::Triple::spirv) {
-    Diag(AL.getLoc(), diag::warn_attribute_ignored) << AL;
-    return nullptr;
-  }
-
-  auto *VD = cast<VarDecl>(D);
-
-  if (getSpecConstBuiltinId(VD->getType()) == Builtin::NotBuiltin) {
-    Diag(VD->getLocation(), diag::err_specialization_const);
-    return nullptr;
-  }
-
-  if (!VD->getType().isConstQualified()) {
-    Diag(VD->getLocation(), diag::err_specialization_const);
-    return nullptr;
-  }
-
-  if (HLSLVkConstantIdAttr *CI = D->getAttr<HLSLVkConstantIdAttr>()) {
-    if (CI->getId() != Id) {
-      Diag(CI->getLocation(), diag::err_hlsl_attribute_param_mismatch) << AL;
-      Diag(AL.getLoc(), diag::note_conflicting_attribute);
-    }
-    return nullptr;
-  }
-
-  HLSLVkConstantIdAttr *Result =
-      ::new (getASTContext()) HLSLVkConstantIdAttr(getASTContext(), AL, Id);
-  return Result;
-}
-
 HLSLShaderAttr *
 SemaHLSL::mergeShaderAttr(Decl *D, const AttributeCommonInfo &AL,
                           llvm::Triple::EnvironmentType ShaderType) {
@@ -1226,15 +1157,6 @@ void SemaHLSL::handleVkExtBuiltinInputAttr(Decl *D, const ParsedAttr &AL) {
                  HLSLVkExtBuiltinInputAttr(getASTContext(), AL, ID));
 }
 
-void SemaHLSL::handleVkConstantIdAttr(Decl *D, const ParsedAttr &AL) {
-  uint32_t Id;
-  if (!SemaRef.checkUInt32Argument(AL, AL.getArgAsExpr(0), Id))
-    return;
-  HLSLVkConstantIdAttr *NewAttr = mergeVkConstantIdAttr(D, AL, Id);
-  if (NewAttr)
-    D->addAttr(NewAttr);
-}
-
 bool SemaHLSL::diagnoseInputIDType(QualType T, const ParsedAttr &AL) {
   const auto *VT = T->getAs<VectorType>();
 
@@ -3284,7 +3206,6 @@ static bool IsDefaultBufferConstantDecl(VarDecl *VD) {
   return VD->getDeclContext()->isTranslationUnit() &&
          QT.getAddressSpace() == LangAS::Default &&
          VD->getStorageClass() != SC_Static &&
-         !VD->hasAttr<HLSLVkConstantIdAttr>() &&
          !isInvalidConstantBufferLeafElementType(QT.getTypePtr());
 }
 
@@ -3352,8 +3273,7 @@ void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) {
     const Type *VarType = VD->getType().getTypePtr();
     while (VarType->isArrayType())
       VarType = VarType->getArrayElementTypeNoTypeQual();
-    if (VarType->isHLSLResourceRecord() ||
-        VD->hasAttr<HLSLVkConstantIdAttr>()) {
+    if (VarType->isHLSLResourceRecord()) {
       // Make the variable for resources static. The global externally visible
       // storage is accessed through the handle, which is a member. The variable
       // itself is not externally visible.
@@ -3776,41 +3696,3 @@ bool SemaHLSL::transformInitList(const InitializedEntity &Entity,
     Init->updateInit(Ctx, I, NewInit->getInit(I));
   return true;
 }
-
-bool SemaHLSL::handleInitialization(VarDecl *VDecl, Expr *&Init) {
-  const HLSLVkConstantIdAttr *ConstIdAttr =
-      VDecl->getAttr<HLSLVkConstantIdAttr>();
-  if (!ConstIdAttr)
-    return true;
-
-  ASTContext &Context = SemaRef.getASTContext();
-
-  APValue InitValue;
-  if (!Init->isCXX11ConstantExpr(Context, &InitValue)) {
-    Diag(VDecl->getLocation(), diag::err_specialization_const);
-    VDecl->setInvalidDecl();
-    return false;
-  }
-
-  Builtin::ID BID = getSpecConstBuiltinId(VDecl->getType());
-
-  // Argument 1: The ID from the attribute
-  int ConstantID = ConstIdAttr->getId();
-  llvm::APInt IDVal(Context.getIntWidth(Context.IntTy), ConstantID);
-  Expr *IdExpr = IntegerLiteral::Create(Context, IDVal, Context.IntTy,
-                                        ConstIdAttr->getLocation());
-
-  SmallVector<Expr *, 2> Args = {IdExpr, Init};
-  Expr *C = SemaRef.BuildBuiltinCallExpr(Init->getExprLoc(), BID, Args);
-  if (C->getType()->getCanonicalTypeUnqualified() !=
-      VDecl->getType()->getCanonicalTypeUnqualified()) {
-    C = SemaRef
-            .BuildCStyleCastExpr(SourceLocation(),
-                                 Context.getTrivialTypeSourceInfo(
-                                     Init->getType(), Init->getExprLoc()),
-                                 SourceLocation(), C)
-            .get();
-  }
-  Init = C;
-  return true;
-}
diff --git a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
deleted file mode 100644
index c0955c1ea7b4..000000000000
--- a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
+++ /dev/null
@@ -1,130 +0,0 @@
-// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s
-
-// CHECK: VarDecl {{.*}} bool_const 'const hlsl_private bool' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'bool'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'bool (*)(unsigned int, bool) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'bool (unsigned int, bool) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_bool' 'bool (unsigned int, bool) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 1
-// CHECK-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
-[[vk::constant_id(1)]]
-const bool bool_const = true;
-
-// CHECK: VarDecl {{.*}} short_const 'const hlsl_private short' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'short'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'short (*)(unsigned int, short) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'short (unsigned int, short) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_short' 'short (unsigned int, short) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'short' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4
-[[vk::constant_id(2)]]
-const short short_const = 4;
-
-// CHECK: VarDecl {{.*}} int_const 'const hlsl_private int' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'int'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int (*)(unsigned int, int) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'int (unsigned int, int) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_int' 'int (unsigned int, int) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 5
-[[vk::constant_id(3)]]
-const int int_const = 5;
-
-// CHECK: VarDecl {{.*}} long_const 'const hlsl_private long long' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'long long'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'long long (*)(unsigned int, long long) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'long long (unsigned int, long long) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_longlong' 'long long (unsigned int, long long) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'long long' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8
-[[vk::constant_id(4)]]
-const long long long_const = 8;
-
-// CHECK: VarDecl {{.*}} ushort_const 'const hlsl_private unsigned short' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'unsigned short'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned short (*)(unsigned int, unsigned short) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned short (unsigned int, unsigned short) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_ushort' 'unsigned short (unsigned int, unsigned short) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 5
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned short' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 10
-[[vk::constant_id(5)]]
-const unsigned short ushort_const = 10;
-
-// CHECK: VarDecl {{.*}} uint_const 'const hlsl_private unsigned int' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'unsigned int'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int (*)(unsigned int, unsigned int) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int (unsigned int, unsigned int) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_uint' 'unsigned int (unsigned int, unsigned int) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 12
-[[vk::constant_id(6)]]
-const unsigned int uint_const = 12;
-
-
-// CHECK: VarDecl {{.*}} ulong_const 'const hlsl_private unsigned long long' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'unsigned long long'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned long long (*)(unsigned int, unsigned long long) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned long long (unsigned int, unsigned long long) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_ulonglong' 'unsigned long long (unsigned int, unsigned long long) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 7
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned long long' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 25
-[[vk::constant_id(7)]]
-const unsigned long long ulong_const = 25;
-
-// CHECK: VarDecl {{.*}} half_const 'const hlsl_private half' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'half'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half (*)(unsigned int, half) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'half (unsigned int, half) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_half' 'half (unsigned int, half) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half' <FloatingCast>
-// CHECK-NEXT: FloatingLiteral {{.*}} 'float' 4.040000e+01
-[[vk::constant_id(8)]]
-const half half_const = 40.4;
-
-// CHECK: VarDecl {{.*}} float_const 'const hlsl_private float' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'float'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float (*)(unsigned int, float) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'float (unsigned int, float) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_float' 'float (unsigned int, float) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 50
-[[vk::constant_id(8)]]
-const float float_const = 50;
-
-// CHECK: VarDecl {{.*}} double_const 'const hlsl_private double' static cinit
-// CHECK-NEXT: CallExpr {{.*}} 'double'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'double (*)(unsigned int, double) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'double (unsigned int, double) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_double' 'double (unsigned int, double) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 9
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'double' <IntegralToFloating>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 100
-[[vk::constant_id(9)]]
-const double double_const = 100;
-
-// CHECK: VarDecl {{.*}} enum_const 'const hlsl_private E' static cinit
-// CHECK-NEXT: CStyleCastExpr {{.*}} 'E' <IntegralCast>
-// CHECK-NEXT: CallExpr {{.*}} 'int'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int (*)(unsigned int, int) noexcept' <FunctionToPointerDecay>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'int (unsigned int, int) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_int' 'int (unsigned int, int) noexcept'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
-// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 10 
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <IntegralCast>
-// CHECK-NEXT: DeclRefExpr {{.*}} 'E' EnumConstant {{.*}} 'e2' 'E' 
-enum E {
-    e0 = 10,
-    e1 = 20,
-    e2 = 30
-};
-
-[[vk::constant_id(10)]]
-const E enum_const = e2;
-
-// CHECK-NOT: CXXRecordDecl {{.*}} implicit struct __cblayout_$Globals definition
diff --git a/clang/test/CodeGenHLSL/vk-features/SpirvType.alignment.hlsl b/clang/test/CodeGenHLSL/inline-spirv/SpirvType.alignment.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/vk-features/SpirvType.alignment.hlsl
rename to clang/test/CodeGenHLSL/inline-spirv/SpirvType.alignment.hlsl
diff --git a/clang/test/CodeGenHLSL/vk-features/SpirvType.hlsl b/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/vk-features/SpirvType.hlsl
rename to clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
diff --git a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
deleted file mode 100644
index cbc1fa61eae2..000000000000
--- a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
+++ /dev/null
@@ -1,210 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s
-
-[[vk::constant_id(1)]]
-const bool bool_const = true;
-
-[[vk::constant_id(1)]]
-const short short_const = 4;
-
-[[vk::constant_id(3)]]
-const int int_const = 5;
-
-[[vk::constant_id(4)]]
-const long long long_const = 8;
-
-[[vk::constant_id(5)]]
-const unsigned short ushort_const = 10;
-
-[[vk::constant_id(6)]]
-const unsigned int uint_const = 12;
-
-[[vk::constant_id(7)]]
-const unsigned long long ulong_const = 25;
-
-[[vk::constant_id(8)]]
-const half half_const = 40.4;
-
-[[vk::constant_id(8)]]
-const float float_const = 50.5;
-
-[[vk::constant_id(9)]]
-const double double_const = 100.2;
-
-enum E {
-    e0 = 10,
-    e1 = 20,
-    e2 = 30
-};
-
-[[vk::constant_id(10)]]
-const E enum_const = e2;
-
-[numthreads(1,1,1)]
-void main() {
-    bool b = bool_const;
-    short s = short_const;
-    int i = int_const;
-    long long l = long_const;
-    unsigned short us = ushort_const;
-    unsigned int ui = uint_const;
-    unsigned long long ul = ulong_const;
-    half h = half_const;
-    float f = float_const;
-    double d = double_const;
-    E e = enum_const;
-}
-//.
-// CHECK: @_ZL10bool_const = internal addrspace(10) global i32 0, align 4
-// CHECK: @_ZL11short_const = internal addrspace(10) global i16 0, align 2
-// CHECK: @_ZL9int_const = internal addrspace(10) global i32 0, align 4
-// CHECK: @_ZL10long_const = internal addrspace(10) global i64 0, align 8
-// CHECK: @_ZL12ushort_const = internal addrspace(10) global i16 0, align 2
-// CHECK: @_ZL10uint_const = internal addrspace(10) global i32 0, align 4
-// CHECK: @_ZL11ulong_const = internal addrspace(10) global i64 0, align 8
-// CHECK: @_ZL10half_const = internal addrspace(10) global float 0.000000e+00, align 4
-// CHECK: @_ZL11float_const = internal addrspace(10) global float 0.000000e+00, align 4
-// CHECK: @_ZL12double_const = internal addrspace(10) global double 0.000000e+00, align 8
-// CHECK: @_ZL10enum_const = internal addrspace(10) global i32 0, align 4
-//.
-// CHECK-LABEL: define internal spir_func void @_Z4mainv(
-// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[S:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[L:%.*]] = alloca i64, align 8
-// CHECK-NEXT:    [[US:%.*]] = alloca i16, align 2
-// CHECK-NEXT:    [[UI:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[UL:%.*]] = alloca i64, align 8
-// CHECK-NEXT:    [[H:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[F:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[D:%.*]] = alloca double, align 8
-// CHECK-NEXT:    [[E:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(10) @_ZL10bool_const, align 4
-// CHECK-NEXT:    [[LOADEDV:%.*]] = trunc i32 [[TMP1]] to i1
-// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i32
-// CHECK-NEXT:    store i32 [[STOREDV]], ptr [[B]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(10) @_ZL11short_const, align 2
-// CHECK-NEXT:    store i16 [[TMP2]], ptr [[S]], align 2
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(10) @_ZL9int_const, align 4
-// CHECK-NEXT:    store i32 [[TMP3]], ptr [[I]], align 4
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr addrspace(10) @_ZL10long_const, align 8
-// CHECK-NEXT:    store i64 [[TMP4]], ptr [[L]], align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(10) @_ZL12ushort_const, align 2
-// CHECK-NEXT:    store i16 [[TMP5]], ptr [[US]], align 2
-// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(10) @_ZL10uint_const, align 4
-// CHECK-NEXT:    store i32 [[TMP6]], ptr [[UI]], align 4
-// CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(10) @_ZL11ulong_const, align 8
-// CHECK-NEXT:    store i64 [[TMP7]], ptr [[UL]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr addrspace(10) @_ZL10half_const, align 4
-// CHECK-NEXT:    store float [[TMP8]], ptr [[H]], align 4
-// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(10) @_ZL11float_const, align 4
-// CHECK-NEXT:    store float [[TMP9]], ptr [[F]], align 4
-// CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr addrspace(10) @_ZL12double_const, align 8
-// CHECK-NEXT:    store double [[TMP10]], ptr [[D]], align 8
-// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(10) @_ZL10enum_const, align 4
-// CHECK-NEXT:    store i32 [[TMP11]], ptr [[E]], align 4
-// CHECK-NEXT:    ret void
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init(
-// CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call i1 @_Z20__spirv_SpecConstantib(i32 1, i1 true)
-// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP1]] to i32
-// CHECK-NEXT:    store i32 [[STOREDV]], ptr addrspace(10) @_ZL10bool_const, align 4
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.1(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @_Z20__spirv_SpecConstantis(i32 1, i16 4)
-// CHECK-NEXT:    store i16 [[TMP1]], ptr addrspace(10) @_ZL11short_const, align 2
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.2(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @_Z20__spirv_SpecConstantii(i32 3, i32 5)
-// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(10) @_ZL9int_const, align 4
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.3(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @_Z20__spirv_SpecConstantix(i32 4, i64 8)
-// CHECK-NEXT:    store i64 [[TMP1]], ptr addrspace(10) @_ZL10long_const, align 8
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.4(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @_Z20__spirv_SpecConstantit(i32 5, i16 10)
-// CHECK-NEXT:    store i16 [[TMP1]], ptr addrspace(10) @_ZL12ushort_const, align 2
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.5(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @_Z20__spirv_SpecConstantij(i32 6, i32 12)
-// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(10) @_ZL10uint_const, align 4
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.6(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @_Z20__spirv_SpecConstantiy(i32 7, i64 25)
-// CHECK-NEXT:    store i64 [[TMP1]], ptr addrspace(10) @_ZL11ulong_const, align 8
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.7(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz arcp afn float @_Z20__spirv_SpecConstantiDh(i32 8, float 0x4044333340000000)
-// CHECK-NEXT:    store float [[TMP1]], ptr addrspace(10) @_ZL10half_const, align 4
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.8(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz arcp afn float @_Z20__spirv_SpecConstantif(i32 8, float 5.050000e+01)
-// CHECK-NEXT:    store float [[TMP1]], ptr addrspace(10) @_ZL11float_const, align 4
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.9(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz arcp afn double @_Z20__spirv_SpecConstantid(i32 9, double 0x40590CCCC0000000)
-// CHECK-NEXT:    store double [[TMP1]], ptr addrspace(10) @_ZL12double_const, align 8
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.10(
-// CHECK-SAME: ) #[[ATTR3]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @_Z20__spirv_SpecConstantii(i32 10, i32 30)
-// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(10) @_ZL10enum_const, align 4
-// CHECK-NEXT:    ret void
diff --git a/clang/test/SemaHLSL/vk.spec-constant.error.hlsl b/clang/test/SemaHLSL/vk.spec-constant.error.hlsl
deleted file mode 100644
index 24873d272a54..000000000000
--- a/clang/test/SemaHLSL/vk.spec-constant.error.hlsl
+++ /dev/null
@@ -1,37 +0,0 @@
-// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan1.3-compute -verify %s
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.8-compute -verify %s
-
-#ifndef __spirv__
-// expected-warning@+2{{'constant_id' attribute ignored}}
-#endif
-[[vk::constant_id(0)]]
-const bool sc0 = true;
-
-#ifdef __spirv__
-// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
-[[vk::constant_id(1)]]
-const bool sc1 = sc0; // error
-
-// expected-warning@+1{{'constant_id' attribute only applies to external global variables}}
-[[vk::constant_id(2)]]
-static const bool sc2 = false; // error
-
-// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
-[[vk::constant_id(3)]]
-const bool sc3; // error
-
-// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
-[[vk::constant_id(4)]]
-bool sc4 = false; // error
-
-// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
-[[vk::constant_id(5)]]
-const int2 sc5 = {0,0}; // error
-
-[numthreads(1,1,1)]
-void main() {
-  // expected-warning@+1{{'constant_id' attribute only applies to external global variables}}
-  [[vk::constant_id(6)]]
-  const bool sc6 = false; // error
-}
-#endif

From d265105b8f50718a684d792d3ca957231d668533 Mon Sep 17 00:00:00 2001
From: David Justo <david.justo.1996@gmail.com>
Date: Wed, 18 Jun 2025 17:13:25 -0700
Subject: [PATCH 0881/1322] Augment `uncaught-exception.test` fuzzer test to be
 msvc-compatible (#125924)

Today, the `uncaught-exception.test` fuzzer test checks for the string
"libFuzzer: deadly signal" in the program output as the result of an
uncaught exception.

Although this is correct for `clang`, `msvc` reports a different error
message: "libFuzzer: uncaught C++ exception". Since `msvc` reuses the
`libFuzzer` infrastructure for ASan regression testing, it would help us
greatly if the test handled the `msvc` divergence more gracefully.

**This PR:** augments this test so check for a different string (namely
"libFuzzer: uncaught C++ exception") if the compiler target matches the
`msvc` naming scheme.

I understand if this is outside the scope of support for LLVM as well,
and I'm also open for different approaches here. Thanks!
---
 compiler-rt/test/fuzzer/uncaught-exception.test | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/test/fuzzer/uncaught-exception.test b/compiler-rt/test/fuzzer/uncaught-exception.test
index b055c88f6d90..d1b98cfb7c74 100644
--- a/compiler-rt/test/fuzzer/uncaught-exception.test
+++ b/compiler-rt/test/fuzzer/uncaught-exception.test
@@ -4,7 +4,10 @@
 REQUIRES: windows
 RUN: %cpp_compiler %S/UncaughtException.cpp -o %t-UncaughtException
 
-RUN: not %run %t-UncaughtException 2>&1 | FileCheck %s
+# Clang will fail the test with 'deadly signal', but other compilers may fail with different error messages.
+# For example, msvc fails with 'uncaught C++ exception'. So the error we check depends on the compiler target.
+RUN: not %run %t-UncaughtException 2>&1 | FileCheck %s --check-prefixes=CHECK-CRASH,%if target={{.*-windows-msvc.*}} %{CHECK-MSVC%} %else %{CHECK-ERROR%}
 
-CHECK: ERROR: libFuzzer: deadly signal
-CHECK: Test unit written to ./crash
+CHECK-ERROR: ERROR: libFuzzer: deadly signal
+CHECK-MSVC: ERROR: libFuzzer: uncaught C++ exception
+CHECK-CRASH: Test unit written to ./crash

From bc8908a4e93b0641e1c17f408885c8aebb308bbe Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 19 Jun 2025 09:36:10 +0900
Subject: [PATCH 0882/1322] ARM: Move declaration of supportSplitCSR to be
 public (#144679)

This is an implementation of a public method from the base
class, so it should also be public. Avoids unrelated diff
in a future patch.
---
 llvm/lib/Target/ARM/ARMISelLowering.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 87710ee29a24..357ca9ea5d20 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -716,6 +716,11 @@ class VectorType;
       return true;
     }
 
+    bool supportSplitCSR(MachineFunction *MF) const override {
+      return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
+             MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
+    }
+
     bool hasStandaloneRem(EVT VT) const override {
       return HasStandaloneRem;
     }
@@ -914,11 +919,6 @@ class VectorType;
                             SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                             SDValue ThisVal, bool isCmseNSCall) const;
 
-    bool supportSplitCSR(MachineFunction *MF) const override {
-      return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
-          MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
-    }
-
     void initializeSplitCSR(MachineBasicBlock *Entry) const override;
     void insertCopiesSplitCSR(
       MachineBasicBlock *Entry,

From 874a02f05b6ebb4b5dbe0ab09beb9c3d5b36e237 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 19 Jun 2025 09:38:22 +0900
Subject: [PATCH 0883/1322] ARM: Move ABI helpers from Subtarget to
 TargetMachine (#144680)

These are module level concepts, and attaching them to the
function level subtarget is confusing. Similarly these other
helpers that only operate on the triple should also be removed
from the subtarget.
---
 llvm/lib/Target/ARM/ARMAsmPrinter.cpp    |  6 +++++-
 llvm/lib/Target/ARM/ARMAsmPrinter.h      |  2 ++
 llvm/lib/Target/ARM/ARMFastISel.cpp      | 15 +++++++------
 llvm/lib/Target/ARM/ARMISelLowering.cpp  | 27 +++++++++++++++---------
 llvm/lib/Target/ARM/ARMISelLowering.h    |  3 +++
 llvm/lib/Target/ARM/ARMSubtarget.cpp     | 20 ++----------------
 llvm/lib/Target/ARM/ARMSubtarget.h       |  6 ------
 llvm/lib/Target/ARM/ARMTargetMachine.cpp | 16 ++++++++++++++
 llvm/lib/Target/ARM/ARMTargetMachine.h   |  4 ++++
 9 files changed, 57 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 1443747709b7..2fd784373f4a 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -54,6 +54,10 @@ ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
     : AsmPrinter(TM, std::move(Streamer), ID), Subtarget(nullptr), AFI(nullptr),
       MCP(nullptr), InConstantPool(false), OptimizationGoals(-1) {}
 
+const ARMBaseTargetMachine &ARMAsmPrinter::getTM() const {
+  return static_cast<const ARMBaseTargetMachine &>(TM);
+}
+
 void ARMAsmPrinter::emitFunctionBodyEnd() {
   // Make sure to terminate any constant pools that were at the end
   // of the function.
@@ -750,7 +754,7 @@ void ARMAsmPrinter::emitAttributes() {
   ATS.emitAttribute(ARMBuildAttrs::ABI_align_preserved, 1);
 
   // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
-  if (STI.isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
+  if (getTM().isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
     ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS);
 
   // FIXME: To support emitting this build attribute as GCC does, the
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.h b/llvm/lib/Target/ARM/ARMAsmPrinter.h
index 8a7ec4e2bcf2..2b067c753264 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.h
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -76,6 +76,8 @@ public:
     return "ARM Assembly Printer";
   }
 
+  const ARMBaseTargetMachine &getTM() const;
+
   void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
 
   void PrintSymbolOperand(const MachineOperand &MO, raw_ostream &O) override;
diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 765c65c5fcb2..06499a3945ee 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -20,6 +20,7 @@
 #include "ARMISelLowering.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "Utils/ARMBaseInfo.h"
@@ -134,9 +135,9 @@ class ARMFastISel final : public FastISel {
   /// make the right decision when generating code for different targets.
   const ARMSubtarget *Subtarget;
   Module &M;
-  const TargetMachine &TM;
-  const TargetInstrInfo &TII;
-  const TargetLowering &TLI;
+  const ARMBaseInstrInfo &TII;
+  const ARMTargetLowering &TLI;
+  const ARMBaseTargetMachine &TM;
   ARMFunctionInfo *AFI;
 
   // Convenience variables to avoid some queries.
@@ -149,8 +150,8 @@ class ARMFastISel final : public FastISel {
         : FastISel(funcInfo, libInfo),
           Subtarget(&funcInfo.MF->getSubtarget<ARMSubtarget>()),
           M(const_cast<Module &>(*funcInfo.Fn->getParent())),
-          TM(funcInfo.MF->getTarget()), TII(*Subtarget->getInstrInfo()),
-          TLI(*Subtarget->getTargetLowering()) {
+          TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()),
+          TM(TLI.getTM()) {
       AFI = funcInfo.MF->getInfo<ARMFunctionInfo>();
       isThumb2 = AFI->isThumbFunction();
       Context = &funcInfo.Fn->getContext();
@@ -1893,7 +1894,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
     report_fatal_error("Unsupported calling convention");
   case CallingConv::Fast:
     if (Subtarget->hasVFP2Base() && !isVarArg) {
-      if (!Subtarget->isAAPCS_ABI())
+      if (!TM.isAAPCS_ABI())
         return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
       // For AAPCS ABI targets, just use VFP variant of the calling convention.
       return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
@@ -1902,7 +1903,7 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::C:
   case CallingConv::CXX_FAST_TLS:
     // Use target triple & subtarget features to do actual dispatch.
-    if (Subtarget->isAAPCS_ABI()) {
+    if (TM.isAAPCS_ABI()) {
       if (Subtarget->hasFPRegs() &&
           TM.Options.FloatABIType == FloatABI::Hard && !isVarArg)
         return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 05d8a1190ada..6e653687dbcb 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -499,9 +499,16 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
   setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
 }
 
-ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
+const ARMBaseTargetMachine &ARMTargetLowering::getTM() const {
+  return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
+}
+
+ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
                                      const ARMSubtarget &STI)
-    : TargetLowering(TM), Subtarget(&STI) {
+    : TargetLowering(TM_), Subtarget(&STI) {
+
+  const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
+
   RegInfo = Subtarget->getRegisterInfo();
   Itins = Subtarget->getInstrItineraryData();
 
@@ -591,7 +598,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   }
 
   // RTLIB
-  if (Subtarget->isAAPCS_ABI() &&
+  if (TM.isAAPCS_ABI() &&
       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
     // clang-format off
@@ -716,7 +723,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   // non-watchos platforms, but are needed for some targets which use a
   // hard-float calling convention by default.
   if (!Subtarget->isTargetWatchABI()) {
-    if (Subtarget->isAAPCS_ABI()) {
+    if (TM.isAAPCS_ABI()) {
       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
@@ -2070,7 +2077,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
   case CallingConv::C:
   case CallingConv::Tail:
-    if (!Subtarget->isAAPCS_ABI())
+    if (!getTM().isAAPCS_ABI())
       return CallingConv::ARM_APCS;
     else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
@@ -2080,12 +2087,12 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
       return CallingConv::ARM_AAPCS;
   case CallingConv::Fast:
   case CallingConv::CXX_FAST_TLS:
-    if (!Subtarget->isAAPCS_ABI()) {
+    if (!getTM().isAAPCS_ABI()) {
       if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
         return CallingConv::Fast;
       return CallingConv::ARM_APCS;
-    } else if (Subtarget->hasVFP2Base() &&
-               !Subtarget->isThumb1Only() && !isVarArg)
+    } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
+               !isVarArg)
       return CallingConv::ARM_AAPCS_VFP;
     else
       return CallingConv::ARM_AAPCS;
@@ -3273,7 +3280,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     SDValue Arg = OutVals[realRVLocIdx];
     bool ReturnF16 = false;
 
-    if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
+    if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
       // Half-precision return values can be returned like this:
       //
       // t11 f16 = fadd ...
@@ -9937,7 +9944,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   auto &DL = DAG.getDataLayout();
 
   ArgListTy Args;
-  bool ShouldUseSRet = Subtarget->isAPCS_ABI();
+  bool ShouldUseSRet = getTM().isAPCS_ABI();
   SDValue SRet;
   if (ShouldUseSRet) {
     // Create stack object for sret.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 357ca9ea5d20..9c330e60a7d5 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -34,6 +34,7 @@
 
 namespace llvm {
 
+class ARMBaseTargetMachine;
 class ARMSubtarget;
 class DataLayout;
 class FastISel;
@@ -414,6 +415,8 @@ class VectorType;
     explicit ARMTargetLowering(const TargetMachine &TM,
                                const ARMSubtarget &STI);
 
+    const ARMBaseTargetMachine &getTM() const;
+
     unsigned getJumpTableEncoding() const override;
     bool useSoftFloat() const override;
 
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 91d385a0b595..abca4bb947bc 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -201,9 +201,9 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   if (isTargetWindows())
     NoARM = true;
 
-  if (isAAPCS_ABI())
+  if (TM.isAAPCS_ABI())
     stackAlignment = Align(8);
-  if (isTargetNaCl() || isAAPCS16_ABI())
+  if (isTargetNaCl() || TM.isAAPCS16_ABI())
     stackAlignment = Align(16);
 
   // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
@@ -320,22 +320,6 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   }
 }
 
-bool ARMSubtarget::isTargetHardFloat() const { return TM.isTargetHardFloat(); }
-
-bool ARMSubtarget::isAPCS_ABI() const {
-  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
-  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_APCS;
-}
-bool ARMSubtarget::isAAPCS_ABI() const {
-  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
-  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS ||
-         TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
-}
-bool ARMSubtarget::isAAPCS16_ABI() const {
-  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
-  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
-}
-
 bool ARMSubtarget::isROPI() const {
   return TM.getRelocationModel() == Reloc::ROPI ||
          TM.getRelocationModel() == Reloc::ROPI_RWPI;
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 890a22f574a6..7893796e313b 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -360,8 +360,6 @@ public:
     return TargetTriple.isTargetEHABICompatible();
   }
 
-  bool isTargetHardFloat() const;
-
   bool isReadTPSoft() const {
     return !(isReadTPTPIDRURW() || isReadTPTPIDRURO() || isReadTPTPIDRPRW());
   }
@@ -370,10 +368,6 @@ public:
 
   bool isXRaySupported() const override;
 
-  bool isAPCS_ABI() const;
-  bool isAAPCS_ABI() const;
-  bool isAAPCS16_ABI() const;
-
   bool isROPI() const;
   bool isRWPI() const;
 
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index fee77a44e5e8..0d947d924eb6 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -271,6 +271,22 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
 
 ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
 
+bool ARMBaseTargetMachine::isAPCS_ABI() const {
+  assert(TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TargetABI == ARMBaseTargetMachine::ARM_ABI_APCS;
+}
+
+bool ARMBaseTargetMachine::isAAPCS_ABI() const {
+  assert(TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS ||
+         TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+}
+
+bool ARMBaseTargetMachine::isAAPCS16_ABI() const {
+  assert(TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+}
+
 MachineFunctionInfo *ARMBaseTargetMachine::createMachineFunctionInfo(
     BumpPtrAllocator &Allocator, const Function &F,
     const TargetSubtargetInfo *STI) const {
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.h b/llvm/lib/Target/ARM/ARMTargetMachine.h
index 99fd817c81f8..513fe713c0bc 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -66,6 +66,10 @@ public:
     return TLOF.get();
   }
 
+  bool isAPCS_ABI() const;
+  bool isAAPCS_ABI() const;
+  bool isAAPCS16_ABI() const;
+
   bool isTargetHardFloat() const {
     return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
            TargetTriple.getEnvironment() == Triple::GNUEABIHFT64 ||

From 6e5ee4aa98f1dc16e6a75a7fd298a59f1edd1c6e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 18 Jun 2025 17:48:10 -0700
Subject: [PATCH 0884/1322] [RISCV] Save vector registers in interrupt handler.
 (#143808)

Corresponding gcc bug report
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110665

The generated code is pretty awful.
---
 llvm/lib/Target/RISCV/RISCVCallingConv.td     |   30 +-
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   |   10 +
 llvm/test/CodeGen/RISCV/interrupt-attr.ll     | 6820 +++++++++++++++++
 .../RISCV/rvv/interrupt-attr-nocall.ll        |  502 ++
 4 files changed, 7360 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll

diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index 98e05b7f8eca..cbf039edec27 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -56,14 +56,40 @@ def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
 def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
                                              (sequence "F%u_D", 0, 31))>;
 
+// Same as CSR_Interrupt, but including all vector registers.
+def CSR_XLEN_V_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
+                                           (sequence "V%u", 0, 31))>;
+
+// Same as CSR_Interrupt, but including all 32-bit FP registers and all vector
+// registers.
+def CSR_XLEN_F32_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F32_Interrupt,
+                                               (sequence "V%u", 0, 31))>;
+
+// Same as CSR_Interrupt, but including all 64-bit FP registers and all vector
+// registers.
+def CSR_XLEN_F64_V_Interrupt: CalleeSavedRegs<(add CSR_XLEN_F64_Interrupt,
+                                               (sequence "V%u", 0, 31))>;
+
 // Same as CSR_Interrupt, but excluding X16-X31.
 def CSR_Interrupt_RVE : CalleeSavedRegs<(sub CSR_Interrupt,
                                         (sequence "X%u", 16, 31))>;
 
 // Same as CSR_XLEN_F32_Interrupt, but excluding X16-X31.
 def CSR_XLEN_F32_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F32_Interrupt,
-                                                (sequence "X%u", 16, 31))>;
+                                                 (sequence "X%u", 16, 31))>;
 
 // Same as CSR_XLEN_F64_Interrupt, but excluding X16-X31.
 def CSR_XLEN_F64_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F64_Interrupt,
-                                                (sequence "X%u", 16, 31))>;
+                                                 (sequence "X%u", 16, 31))>;
+
+// Same as CSR_XLEN_V_Interrupt, but excluding X16-X31.
+def CSR_XLEN_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_V_Interrupt,
+                                               (sequence "X%u", 16, 31))>;
+
+// Same as CSR_XLEN_F32_V_Interrupt, but excluding X16-X31.
+def CSR_XLEN_F32_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F32_V_Interrupt,
+                                                   (sequence "X%u", 16, 31))>;
+
+// Same as CSR_XLEN_F64_V_Interrupt, but excluding X16-X31.
+def CSR_XLEN_F64_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F64_V_Interrupt,
+                                                   (sequence "X%u", 16, 31))>;
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 112142e1ef2f..7fdbf4be1ed1 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -69,6 +69,16 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   if (MF->getFunction().getCallingConv() == CallingConv::GHC)
     return CSR_NoRegs_SaveList;
   if (MF->getFunction().hasFnAttribute("interrupt")) {
+    if (Subtarget.hasVInstructions()) {
+      if (Subtarget.hasStdExtD())
+        return Subtarget.hasStdExtE() ? CSR_XLEN_F64_V_Interrupt_RVE_SaveList
+                                      : CSR_XLEN_F64_V_Interrupt_SaveList;
+      if (Subtarget.hasStdExtF())
+        return Subtarget.hasStdExtE() ? CSR_XLEN_F32_V_Interrupt_RVE_SaveList
+                                      : CSR_XLEN_F32_V_Interrupt_SaveList;
+      return Subtarget.hasStdExtE() ? CSR_XLEN_V_Interrupt_RVE_SaveList
+                                    : CSR_XLEN_V_Interrupt_SaveList;
+    }
     if (Subtarget.hasStdExtD())
       return Subtarget.hasStdExtE() ? CSR_XLEN_F64_Interrupt_RVE_SaveList
                                     : CSR_XLEN_F64_Interrupt_SaveList;
diff --git a/llvm/test/CodeGen/RISCV/interrupt-attr.ll b/llvm/test/CodeGen/RISCV/interrupt-attr.ll
index ba20ba77e6b2..e278b8d0b53b 100644
--- a/llvm/test/CodeGen/RISCV/interrupt-attr.ll
+++ b/llvm/test/CodeGen/RISCV/interrupt-attr.ll
@@ -19,6 +19,13 @@
 ; RUN: 2>&1 | FileCheck %s -check-prefixes=CHECK,CHECK-RV32E
 ; RUN: llc -mtriple riscv32-unknown-elf -mattr=+e,+f -o - %s \
 ; RUN: 2>&1 | FileCheck %s -check-prefixes=CHECK,CHECK-RV32E-F
+
+; RUN: llc -mtriple riscv32-unknown-elf -mattr=+zve32x -o - %s \
+; RUN: 2>&1 | FileCheck %s -check-prefix CHECK -check-prefix CHECK-RV32-V
+; RUN: llc -mtriple riscv32-unknown-elf -mattr=+zve32x,+f -o - %s \
+; RUN: 2>&1 | FileCheck %s -check-prefix CHECK -check-prefix CHECK-RV32-FV
+; RUN: llc -mtriple riscv32-unknown-elf -mattr=+zve32x,+f,+d -o - %s \
+; RUN: 2>&1 | FileCheck %s -check-prefix CHECK -check-prefix CHECK-RV32-FDV
 ;
 ; RUN: llc -mtriple riscv64-unknown-elf -o - %s \
 ; RUN: 2>&1 | FileCheck %s -check-prefix CHECK -check-prefix CHECK-RV64
@@ -42,6 +49,13 @@
 ; RUN: 2>&1 | FileCheck %s -check-prefixes=CHECK,CHECK-RV64E-F
 ; RUN: llc -mtriple riscv64-unknown-elf -mattr=+e,+f,+d -o - %s \
 ; RUN: 2>&1 | FileCheck %s -check-prefixes=CHECK,CHECK-RV64E-FD
+;
+; RUN: llc -mtriple riscv64-unknown-elf -mattr=+zve32x -o - %s \
+; RUN: 2>&1 | FileCheck %s -check-prefix CHECK -check-prefix CHECK-RV64-V
+; RUN: llc -mtriple riscv64-unknown-elf -mattr=+zve32x,+f -o - %s \
+; RUN: 2>&1 | FileCheck %s -check-prefix CHECK -check-prefix CHECK-RV64-FV
+; RUN: llc -mtriple riscv64-unknown-elf -mattr=+zve32x,+f,+d -o - %s \
+; RUN: 2>&1 | FileCheck %s -check-prefix CHECK -check-prefix CHECK-RV64-FDV
 
 ;
 ; Checking for special return instructions (sret, mret).
@@ -757,6 +771,1697 @@ define void @foo_with_call() #1 {
 ; CHECK-RV32E-F-NEXT:    addi sp, sp, 168
 ; CHECK-RV32E-F-NEXT:    mret
 ;
+; CHECK-RV32-V-LABEL: foo_with_call:
+; CHECK-RV32-V:       # %bb.0:
+; CHECK-RV32-V-NEXT:    addi sp, sp, -80
+; CHECK-RV32-V-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t0, 72(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t1, 68(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t2, 64(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a0, 60(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a1, 56(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a2, 52(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a3, 48(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a4, 44(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a5, 40(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a6, 36(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a7, 32(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t3, 28(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t4, 24(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t5, 20(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t6, 16(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 5
+; CHECK-RV32-V-NEXT:    sub sp, sp, a0
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 5
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 4
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 4
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 4
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 3
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    addi a0, sp, 16
+; CHECK-RV32-V-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    call otherfoo
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 5
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 4
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 4
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 4
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 3
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    add a0, sp, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    addi a0, sp, 16
+; CHECK-RV32-V-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 5
+; CHECK-RV32-V-NEXT:    add sp, sp, a0
+; CHECK-RV32-V-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t0, 72(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t1, 68(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t2, 64(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a0, 60(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a1, 56(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a2, 52(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a3, 48(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a4, 44(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a5, 40(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a6, 36(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a7, 32(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t3, 28(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t4, 24(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t5, 20(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t6, 16(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    addi sp, sp, 80
+; CHECK-RV32-V-NEXT:    mret
+;
+; CHECK-RV32-FV-LABEL: foo_with_call:
+; CHECK-RV32-FV:       # %bb.0:
+; CHECK-RV32-FV-NEXT:    addi sp, sp, -160
+; CHECK-RV32-FV-NEXT:    sw ra, 156(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t0, 152(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t1, 148(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t2, 144(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a0, 140(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a1, 136(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a2, 132(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a3, 128(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a4, 124(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a5, 120(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a6, 116(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a7, 112(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t3, 108(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t4, 104(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t5, 100(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t6, 96(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft0, 92(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft1, 88(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft2, 84(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft3, 80(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft4, 76(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft5, 72(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft6, 68(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft7, 64(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa0, 60(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa1, 56(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa2, 52(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa3, 48(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa4, 44(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa5, 40(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa6, 36(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa7, 32(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft8, 28(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft9, 24(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft10, 20(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft11, 16(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FV-NEXT:    sub sp, sp, a0
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 5
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 4
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    addi a0, sp, 16
+; CHECK-RV32-FV-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    call otherfoo
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 5
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 4
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    addi a0, sp, 16
+; CHECK-RV32-FV-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FV-NEXT:    add sp, sp, a0
+; CHECK-RV32-FV-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t0, 152(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t1, 148(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t2, 144(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a0, 140(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a1, 136(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a2, 132(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a3, 128(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a4, 124(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a5, 120(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a6, 116(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a7, 112(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t3, 108(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t4, 104(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t5, 100(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t6, 96(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft0, 92(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft1, 88(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft2, 84(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft3, 80(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft4, 76(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft5, 72(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft6, 68(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft7, 64(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa0, 60(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa1, 56(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa2, 52(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa3, 48(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa4, 44(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa5, 40(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa6, 36(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa7, 32(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft8, 28(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft9, 24(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft10, 20(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft11, 16(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    addi sp, sp, 160
+; CHECK-RV32-FV-NEXT:    mret
+;
+; CHECK-RV32-FDV-LABEL: foo_with_call:
+; CHECK-RV32-FDV:       # %bb.0:
+; CHECK-RV32-FDV-NEXT:    addi sp, sp, -240
+; CHECK-RV32-FDV-NEXT:    sw ra, 236(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t0, 232(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t1, 228(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t2, 224(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a0, 220(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a1, 216(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a2, 212(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a3, 208(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a4, 204(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a5, 200(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a6, 196(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a7, 192(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t3, 188(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t4, 184(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t5, 180(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t6, 176(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft0, 168(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft1, 160(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft2, 152(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft3, 144(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft4, 136(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft5, 128(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft6, 120(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft7, 112(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa0, 104(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa1, 96(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa2, 88(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa3, 80(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa4, 72(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa5, 64(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa6, 56(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa7, 48(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft8, 40(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft9, 32(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft10, 24(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft11, 16(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FDV-NEXT:    sub sp, sp, a0
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 5
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 4
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    addi a0, sp, 16
+; CHECK-RV32-FDV-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    call otherfoo
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 5
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 4
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    addi a0, sp, 16
+; CHECK-RV32-FDV-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FDV-NEXT:    add sp, sp, a0
+; CHECK-RV32-FDV-NEXT:    lw ra, 236(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t0, 232(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t1, 228(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t2, 224(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a0, 220(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a1, 216(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a2, 212(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a3, 208(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a4, 204(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a5, 200(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a6, 196(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a7, 192(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t3, 188(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t4, 184(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t5, 180(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t6, 176(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft0, 168(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft1, 160(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft2, 152(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft3, 144(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft4, 136(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft5, 128(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft6, 120(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft7, 112(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa0, 104(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa1, 96(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa2, 88(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa3, 80(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa4, 72(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa5, 64(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa6, 56(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa7, 48(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft8, 40(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft9, 32(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft10, 24(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft11, 16(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    addi sp, sp, 240
+; CHECK-RV32-FDV-NEXT:    mret
+;
 ; CHECK-RV64-LABEL: foo_with_call:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    addi sp, sp, -128
@@ -1530,6 +3235,1697 @@ define void @foo_with_call() #1 {
 ; CHECK-RV64E-FD-NEXT:    fld ft11, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64E-FD-NEXT:    addi sp, sp, 336
 ; CHECK-RV64E-FD-NEXT:    mret
+;
+; CHECK-RV64-V-LABEL: foo_with_call:
+; CHECK-RV64-V:       # %bb.0:
+; CHECK-RV64-V-NEXT:    addi sp, sp, -144
+; CHECK-RV64-V-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t0, 128(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t1, 120(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t2, 112(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a1, 96(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a3, 80(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a4, 72(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a5, 64(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a6, 56(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a7, 48(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t3, 40(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t4, 32(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t5, 24(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t6, 16(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 5
+; CHECK-RV64-V-NEXT:    sub sp, sp, a0
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 5
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 4
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 4
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 4
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 3
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    addi a0, sp, 16
+; CHECK-RV64-V-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    call otherfoo
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 5
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 4
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 4
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 4
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 3
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    add a0, sp, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    addi a0, sp, 16
+; CHECK-RV64-V-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 5
+; CHECK-RV64-V-NEXT:    add sp, sp, a0
+; CHECK-RV64-V-NEXT:    ld ra, 136(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t0, 128(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t1, 120(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t2, 112(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a1, 96(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a2, 88(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a3, 80(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a4, 72(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a5, 64(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a6, 56(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a7, 48(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t3, 40(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t4, 32(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t5, 24(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t6, 16(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    addi sp, sp, 144
+; CHECK-RV64-V-NEXT:    mret
+;
+; CHECK-RV64-FV-LABEL: foo_with_call:
+; CHECK-RV64-FV:       # %bb.0:
+; CHECK-RV64-FV-NEXT:    addi sp, sp, -224
+; CHECK-RV64-FV-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t0, 208(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t1, 200(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t2, 192(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a1, 176(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a2, 168(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a3, 160(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a4, 152(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a5, 144(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a6, 136(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a7, 128(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t3, 120(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t4, 112(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t5, 104(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t6, 96(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft0, 92(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft1, 88(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft2, 84(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft3, 80(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft4, 76(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft5, 72(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft6, 68(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft7, 64(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa0, 60(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa1, 56(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa2, 52(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa3, 48(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa4, 44(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa5, 40(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa6, 36(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa7, 32(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft8, 28(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft9, 24(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft10, 20(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft11, 16(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FV-NEXT:    sub sp, sp, a0
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 5
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 4
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    addi a0, sp, 16
+; CHECK-RV64-FV-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    call otherfoo
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 5
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 4
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    addi a0, sp, 16
+; CHECK-RV64-FV-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FV-NEXT:    add sp, sp, a0
+; CHECK-RV64-FV-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t0, 208(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t1, 200(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t2, 192(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a1, 176(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a2, 168(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a3, 160(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a4, 152(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a5, 144(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a6, 136(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a7, 128(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t3, 120(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t4, 112(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t5, 104(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t6, 96(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft0, 92(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft1, 88(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft2, 84(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft3, 80(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft4, 76(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft5, 72(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft6, 68(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft7, 64(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa0, 60(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa1, 56(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa2, 52(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa3, 48(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa4, 44(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa5, 40(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa6, 36(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa7, 32(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft8, 28(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft9, 24(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft10, 20(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft11, 16(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    addi sp, sp, 224
+; CHECK-RV64-FV-NEXT:    mret
+;
+; CHECK-RV64-FDV-LABEL: foo_with_call:
+; CHECK-RV64-FDV:       # %bb.0:
+; CHECK-RV64-FDV-NEXT:    addi sp, sp, -304
+; CHECK-RV64-FDV-NEXT:    sd ra, 296(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t0, 288(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t1, 280(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t2, 272(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a0, 264(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a1, 256(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a2, 248(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a3, 240(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a4, 232(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a5, 224(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a6, 216(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a7, 208(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t3, 200(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t4, 192(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t5, 184(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t6, 176(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft0, 168(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft1, 160(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft2, 152(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft3, 144(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft4, 136(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft5, 128(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft6, 120(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft7, 112(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa0, 104(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa1, 96(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa2, 88(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa3, 80(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa4, 72(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa5, 64(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa6, 56(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa7, 48(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft8, 40(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft9, 32(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft10, 24(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft11, 16(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FDV-NEXT:    sub sp, sp, a0
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 5
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 4
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    addi a0, sp, 16
+; CHECK-RV64-FDV-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    call otherfoo
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 5
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 4
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    add a0, sp, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    addi a0, sp, 16
+; CHECK-RV64-FDV-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FDV-NEXT:    add sp, sp, a0
+; CHECK-RV64-FDV-NEXT:    ld ra, 296(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t0, 288(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t1, 280(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t2, 272(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a0, 264(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a1, 256(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a2, 248(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a3, 240(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a4, 232(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a5, 224(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a6, 216(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a7, 208(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t3, 200(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t4, 192(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t5, 184(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t6, 176(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft0, 168(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft1, 160(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft2, 152(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft3, 144(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft4, 136(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft5, 128(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft6, 120(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft7, 112(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa0, 104(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa1, 96(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa2, 88(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa3, 80(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa4, 72(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa5, 64(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa6, 56(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa7, 48(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft8, 40(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft9, 32(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft10, 24(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft11, 16(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    addi sp, sp, 304
+; CHECK-RV64-FDV-NEXT:    mret
   %call = call i32 @otherfoo()
   ret void
 }
@@ -2249,6 +5645,1718 @@ define void @foo_fp_with_call() #2 {
 ; CHECK-RV32E-F-NEXT:    addi sp, sp, 172
 ; CHECK-RV32E-F-NEXT:    mret
 ;
+; CHECK-RV32-V-LABEL: foo_fp_with_call:
+; CHECK-RV32-V:       # %bb.0:
+; CHECK-RV32-V-NEXT:    addi sp, sp, -80
+; CHECK-RV32-V-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t0, 72(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t1, 68(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t2, 64(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a0, 56(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a1, 52(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a2, 48(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a3, 44(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a4, 40(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a5, 36(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a6, 32(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw a7, 28(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t3, 24(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t4, 20(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t5, 16(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    sw t6, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-V-NEXT:    addi s0, sp, 80
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 5
+; CHECK-RV32-V-NEXT:    sub sp, sp, a0
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 3
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 4
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 4
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 4
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 5
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 5
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT:    call otherfoo
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 3
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 4
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 4
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 4
+; CHECK-RV32-V-NEXT:    add a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 3
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 2
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    mv a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a1, a1, a0
+; CHECK-RV32-V-NEXT:    slli a0, a0, 1
+; CHECK-RV32-V-NEXT:    add a0, a0, a1
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a1, a0, 5
+; CHECK-RV32-V-NEXT:    sub a0, a1, a0
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    csrr a0, vlenb
+; CHECK-RV32-V-NEXT:    slli a0, a0, 5
+; CHECK-RV32-V-NEXT:    sub a0, s0, a0
+; CHECK-RV32-V-NEXT:    addi a0, a0, -80
+; CHECK-RV32-V-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT:    addi sp, s0, -80
+; CHECK-RV32-V-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t0, 72(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t1, 68(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t2, 64(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a0, 56(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a1, 52(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a2, 48(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a3, 44(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a4, 40(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a5, 36(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a6, 32(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw a7, 28(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t3, 24(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t4, 20(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t5, 16(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    lw t6, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-V-NEXT:    addi sp, sp, 80
+; CHECK-RV32-V-NEXT:    mret
+;
+; CHECK-RV32-FV-LABEL: foo_fp_with_call:
+; CHECK-RV32-FV:       # %bb.0:
+; CHECK-RV32-FV-NEXT:    addi sp, sp, -160
+; CHECK-RV32-FV-NEXT:    sw ra, 156(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t0, 152(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t1, 148(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t2, 144(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw s0, 140(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a0, 136(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a1, 132(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a2, 128(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a3, 124(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a4, 120(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a5, 116(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a6, 112(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw a7, 108(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t3, 104(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t4, 100(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t5, 96(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    sw t6, 92(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft0, 88(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft1, 84(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft2, 80(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft3, 76(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft4, 72(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft5, 68(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft6, 64(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft7, 60(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa0, 56(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa1, 52(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa2, 48(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa3, 44(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa4, 40(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa5, 36(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa6, 32(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw fa7, 28(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft8, 24(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft9, 20(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft10, 16(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    fsw ft11, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    addi s0, sp, 160
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FV-NEXT:    sub sp, sp, a0
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 4
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 5
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT:    call otherfoo
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 4
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    mv a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a1, a0, 5
+; CHECK-RV32-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FV-NEXT:    addi a0, a0, -160
+; CHECK-RV32-FV-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    addi sp, s0, -160
+; CHECK-RV32-FV-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t0, 152(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t1, 148(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t2, 144(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw s0, 140(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a0, 136(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a1, 132(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a2, 128(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a3, 124(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a4, 120(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a5, 116(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a6, 112(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw a7, 108(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t3, 104(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t4, 100(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t5, 96(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    lw t6, 92(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft0, 88(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft1, 84(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft2, 80(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft3, 76(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft4, 72(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft5, 68(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft6, 64(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft7, 60(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa0, 56(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa1, 52(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa2, 48(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa3, 44(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa4, 40(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa5, 36(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa6, 32(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw fa7, 28(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft8, 24(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft9, 20(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft10, 16(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    flw ft11, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-FV-NEXT:    addi sp, sp, 160
+; CHECK-RV32-FV-NEXT:    mret
+;
+; CHECK-RV32-FDV-LABEL: foo_fp_with_call:
+; CHECK-RV32-FDV:       # %bb.0:
+; CHECK-RV32-FDV-NEXT:    addi sp, sp, -240
+; CHECK-RV32-FDV-NEXT:    sw ra, 236(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t0, 232(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t1, 228(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t2, 224(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw s0, 220(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a0, 216(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a1, 212(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a2, 208(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a3, 204(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a4, 200(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a5, 196(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a6, 192(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw a7, 188(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t3, 184(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t4, 180(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t5, 176(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    sw t6, 172(sp) # 4-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft0, 160(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft1, 152(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft2, 144(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft3, 136(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft4, 128(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft5, 120(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft6, 112(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft7, 104(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa0, 96(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa1, 88(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa2, 80(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa3, 72(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa4, 64(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa5, 56(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa6, 48(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd fa7, 40(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft8, 32(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft9, 24(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft10, 16(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    fsd ft11, 8(sp) # 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    addi s0, sp, 240
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FDV-NEXT:    sub sp, sp, a0
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 4
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 5
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT:    call otherfoo
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 4
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV32-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    mv a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV32-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a1, a0, 5
+; CHECK-RV32-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV32-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV32-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV32-FDV-NEXT:    addi a0, a0, -240
+; CHECK-RV32-FDV-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    addi sp, s0, -240
+; CHECK-RV32-FDV-NEXT:    lw ra, 236(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t0, 232(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t1, 228(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t2, 224(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw s0, 220(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a0, 216(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a1, 212(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a2, 208(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a3, 204(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a4, 200(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a5, 196(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a6, 192(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw a7, 188(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t3, 184(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t4, 180(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t5, 176(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    lw t6, 172(sp) # 4-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft0, 160(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft1, 152(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft2, 144(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft3, 136(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft4, 128(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft5, 120(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft6, 112(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft7, 104(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa0, 96(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa1, 88(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa2, 80(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa3, 72(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa4, 64(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa5, 56(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa6, 48(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld fa7, 40(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft8, 32(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft9, 24(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft10, 16(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    fld ft11, 8(sp) # 8-byte Folded Reload
+; CHECK-RV32-FDV-NEXT:    addi sp, sp, 240
+; CHECK-RV32-FDV-NEXT:    mret
+;
 ; CHECK-RV64-LABEL: foo_fp_with_call:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    addi sp, sp, -144
@@ -3052,6 +8160,1718 @@ define void @foo_fp_with_call() #2 {
 ; CHECK-RV64E-FD-NEXT:    fld ft11, 0(sp) # 8-byte Folded Reload
 ; CHECK-RV64E-FD-NEXT:    addi sp, sp, 344
 ; CHECK-RV64E-FD-NEXT:    mret
+;
+; CHECK-RV64-V-LABEL: foo_fp_with_call:
+; CHECK-RV64-V:       # %bb.0:
+; CHECK-RV64-V-NEXT:    addi sp, sp, -160
+; CHECK-RV64-V-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t0, 144(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t1, 136(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t2, 128(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd s0, 120(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a1, 104(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a3, 88(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a4, 80(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a5, 72(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a6, 64(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd a7, 56(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t3, 48(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t4, 40(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t5, 32(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    sd t6, 24(sp) # 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    addi s0, sp, 160
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 5
+; CHECK-RV64-V-NEXT:    sub sp, sp, a0
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 3
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 4
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 4
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 4
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 5
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 5
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-V-NEXT:    call otherfoo
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 3
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 4
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 4
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 4
+; CHECK-RV64-V-NEXT:    add a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 3
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 2
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    mv a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a1, a1, a0
+; CHECK-RV64-V-NEXT:    slli a0, a0, 1
+; CHECK-RV64-V-NEXT:    add a0, a0, a1
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a1, a0, 5
+; CHECK-RV64-V-NEXT:    sub a0, a1, a0
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    csrr a0, vlenb
+; CHECK-RV64-V-NEXT:    slli a0, a0, 5
+; CHECK-RV64-V-NEXT:    sub a0, s0, a0
+; CHECK-RV64-V-NEXT:    addi a0, a0, -160
+; CHECK-RV64-V-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    addi sp, s0, -160
+; CHECK-RV64-V-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t0, 144(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t1, 136(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t2, 128(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld s0, 120(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a2, 96(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a3, 88(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a4, 80(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a5, 72(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a6, 64(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld a7, 56(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t3, 48(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t4, 40(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t5, 32(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    ld t6, 24(sp) # 8-byte Folded Reload
+; CHECK-RV64-V-NEXT:    addi sp, sp, 160
+; CHECK-RV64-V-NEXT:    mret
+;
+; CHECK-RV64-FV-LABEL: foo_fp_with_call:
+; CHECK-RV64-FV:       # %bb.0:
+; CHECK-RV64-FV-NEXT:    addi sp, sp, -240
+; CHECK-RV64-FV-NEXT:    sd ra, 232(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t0, 224(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t1, 216(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t2, 208(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd s0, 200(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a1, 184(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a2, 176(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a3, 168(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a4, 160(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a5, 152(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a6, 144(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd a7, 136(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t3, 128(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t4, 120(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t5, 112(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    sd t6, 104(sp) # 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft0, 100(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft1, 96(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft2, 92(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft3, 88(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft4, 84(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft5, 80(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft6, 76(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft7, 72(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa0, 68(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa1, 64(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa2, 60(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa3, 56(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa4, 52(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa5, 48(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa6, 44(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw fa7, 40(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft8, 36(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft9, 32(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft10, 28(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    fsw ft11, 24(sp) # 4-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    addi s0, sp, 240
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FV-NEXT:    sub sp, sp, a0
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 4
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 5
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FV-NEXT:    call otherfoo
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 4
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    mv a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a1, a0, 5
+; CHECK-RV64-FV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FV-NEXT:    addi a0, a0, -240
+; CHECK-RV64-FV-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    addi sp, s0, -240
+; CHECK-RV64-FV-NEXT:    ld ra, 232(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t0, 224(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t1, 216(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t2, 208(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld s0, 200(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a1, 184(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a2, 176(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a3, 168(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a4, 160(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a5, 152(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a6, 144(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld a7, 136(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t3, 128(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t4, 120(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t5, 112(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    ld t6, 104(sp) # 8-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft0, 100(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft1, 96(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft2, 92(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft3, 88(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft4, 84(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft5, 80(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft6, 76(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft7, 72(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa0, 68(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa1, 64(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa2, 60(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa3, 56(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa4, 52(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa5, 48(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa6, 44(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw fa7, 40(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft8, 36(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft9, 32(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft10, 28(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    flw ft11, 24(sp) # 4-byte Folded Reload
+; CHECK-RV64-FV-NEXT:    addi sp, sp, 240
+; CHECK-RV64-FV-NEXT:    mret
+;
+; CHECK-RV64-FDV-LABEL: foo_fp_with_call:
+; CHECK-RV64-FDV:       # %bb.0:
+; CHECK-RV64-FDV-NEXT:    addi sp, sp, -320
+; CHECK-RV64-FDV-NEXT:    sd ra, 312(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t0, 304(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t1, 296(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t2, 288(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd s0, 280(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a0, 272(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a1, 264(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a2, 256(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a3, 248(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a4, 240(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a5, 232(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a6, 224(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd a7, 216(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t3, 208(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t4, 200(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t5, 192(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    sd t6, 184(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft0, 176(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft1, 168(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft2, 160(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft3, 152(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft4, 144(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft5, 136(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft6, 128(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft7, 120(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa0, 112(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa1, 104(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa2, 96(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa3, 88(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa4, 80(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa5, 72(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa6, 64(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd fa7, 56(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft8, 48(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft9, 40(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft10, 32(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    fsd ft11, 24(sp) # 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    addi s0, sp, 320
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FDV-NEXT:    sub sp, sp, a0
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 4
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 5
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT:    call otherfoo
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 4
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 4
+; CHECK-RV64-FDV-NEXT:    add a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 3
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 2
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    mv a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a1, a1, a0
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 1
+; CHECK-RV64-FDV-NEXT:    add a0, a0, a1
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a1, a0, 5
+; CHECK-RV64-FDV-NEXT:    sub a0, a1, a0
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    csrr a0, vlenb
+; CHECK-RV64-FDV-NEXT:    slli a0, a0, 5
+; CHECK-RV64-FDV-NEXT:    sub a0, s0, a0
+; CHECK-RV64-FDV-NEXT:    addi a0, a0, -320
+; CHECK-RV64-FDV-NEXT:    vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    addi sp, s0, -320
+; CHECK-RV64-FDV-NEXT:    ld ra, 312(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t0, 304(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t1, 296(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t2, 288(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld s0, 280(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a0, 272(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a1, 264(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a2, 256(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a3, 248(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a4, 240(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a5, 232(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a6, 224(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld a7, 216(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t3, 208(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t4, 200(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t5, 192(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    ld t6, 184(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft0, 176(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft1, 168(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft2, 160(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft3, 152(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft4, 144(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft5, 136(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft6, 128(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft7, 120(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa0, 112(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa1, 104(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa2, 96(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa3, 88(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa4, 80(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa5, 72(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa6, 64(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld fa7, 56(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft8, 48(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft9, 40(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft10, 32(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    fld ft11, 24(sp) # 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT:    addi sp, sp, 320
+; CHECK-RV64-FDV-NEXT:    mret
   %call = call i32 @otherfoo()
   ret void
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll b/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll
new file mode 100644
index 000000000000..af2e8d384a44
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll
@@ -0,0 +1,502 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple riscv32-unknown-elf -mattr=+zve32x,+zvl128b -o - %s \
+; RUN: 2>&1 | FileCheck %s -check-prefix CHECK-RV32
+
+@a = external global <4 x i32>
+@b = external global <4 x i32>
+@c = external global <4 x i32>
+
+define void @foo_lmul1() nounwind #0 {
+; CHECK-RV32-LABEL: foo_lmul1:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    addi sp, sp, -32
+; CHECK-RV32-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    sub sp, sp, a0
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    addi a0, sp, 16
+; CHECK-RV32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    lui a0, %hi(a)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(a)
+; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32-NEXT:    vle32.v v8, (a0)
+; CHECK-RV32-NEXT:    lui a0, %hi(b)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(b)
+; CHECK-RV32-NEXT:    vle32.v v9, (a0)
+; CHECK-RV32-NEXT:    vadd.vv v8, v9, v8
+; CHECK-RV32-NEXT:    lui a0, %hi(c)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(c)
+; CHECK-RV32-NEXT:    vse32.v v8, (a0)
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    addi a0, sp, 16
+; CHECK-RV32-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add sp, sp, a0
+; CHECK-RV32-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    addi sp, sp, 32
+; CHECK-RV32-NEXT:    mret
+  %1 = load <4 x i32>, ptr @a
+  %2 = load <4 x i32>, ptr @b
+  %add = add nsw <4 x i32> %2, %1
+  store <4 x i32> %add, ptr @c
+  ret void
+}
+
+@d = external global <8 x i32>
+@e = external global <8 x i32>
+@f = external global <8 x i32>
+
+define void @foo_lmul2() nounwind #0 {
+; CHECK-RV32-LABEL: foo_lmul2:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    addi sp, sp, -32
+; CHECK-RV32-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    sub sp, sp, a0
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    slli a1, a0, 1
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    addi a0, sp, 16
+; CHECK-RV32-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    lui a0, %hi(d)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(d)
+; CHECK-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-RV32-NEXT:    vle32.v v8, (a0)
+; CHECK-RV32-NEXT:    lui a0, %hi(e)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(e)
+; CHECK-RV32-NEXT:    vle32.v v10, (a0)
+; CHECK-RV32-NEXT:    vadd.vv v8, v10, v8
+; CHECK-RV32-NEXT:    lui a0, %hi(f)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(f)
+; CHECK-RV32-NEXT:    vse32.v v8, (a0)
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    slli a1, a0, 1
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    addi a0, sp, 16
+; CHECK-RV32-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add sp, sp, a0
+; CHECK-RV32-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    addi sp, sp, 32
+; CHECK-RV32-NEXT:    mret
+  %1 = load <8 x i32>, ptr @d
+  %2 = load <8 x i32>, ptr @e
+  %add = add nsw <8 x i32> %2, %1
+  store <8 x i32> %add, ptr @f
+  ret void
+}
+
+@g = external global <16 x i32>
+@h = external global <16 x i32>
+@i = external global <16 x i32>
+
+define void @foo_lmul4() nounwind #0 {
+; CHECK-RV32-LABEL: foo_lmul4:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    addi sp, sp, -32
+; CHECK-RV32-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 3
+; CHECK-RV32-NEXT:    sub sp, sp, a0
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    slli a1, a0, 3
+; CHECK-RV32-NEXT:    sub a0, a1, a0
+; CHECK-RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    slli a1, a0, 2
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    slli a1, a0, 1
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    addi a0, sp, 16
+; CHECK-RV32-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    lui a0, %hi(g)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(g)
+; CHECK-RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-RV32-NEXT:    vle32.v v8, (a0)
+; CHECK-RV32-NEXT:    lui a0, %hi(h)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(h)
+; CHECK-RV32-NEXT:    vle32.v v12, (a0)
+; CHECK-RV32-NEXT:    vadd.vv v8, v12, v8
+; CHECK-RV32-NEXT:    lui a0, %hi(i)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(i)
+; CHECK-RV32-NEXT:    vse32.v v8, (a0)
+; CHECK-RV32-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 3
+; CHECK-RV32-NEXT:    sub a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 2
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 1
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    addi a0, sp, 16
+; CHECK-RV32-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 3
+; CHECK-RV32-NEXT:    add sp, sp, a0
+; CHECK-RV32-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    addi sp, sp, 32
+; CHECK-RV32-NEXT:    mret
+  %1 = load <16 x i32>, ptr @g
+  %2 = load <16 x i32>, ptr @h
+  %add = add nsw <16 x i32> %2, %1
+  store <16 x i32> %add, ptr @i
+  ret void
+}
+
+@j = external global <32 x i32>
+@k = external global <32 x i32>
+@l = external global <32 x i32>
+
+define void @foo_lmul8() nounwind #0 {
+; CHECK-RV32-LABEL: foo_lmul8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    addi sp, sp, -32
+; CHECK-RV32-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    sub sp, sp, a0
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 4
+; CHECK-RV32-NEXT:    sub a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a1, a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a1, a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a1, a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 3
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 3
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 3
+; CHECK-RV32-NEXT:    sub a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 2
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 1
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    addi a0, sp, 16
+; CHECK-RV32-NEXT:    vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-NEXT:    lui a0, %hi(j)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(j)
+; CHECK-RV32-NEXT:    li a1, 32
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-RV32-NEXT:    vle32.v v8, (a0)
+; CHECK-RV32-NEXT:    lui a0, %hi(k)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(k)
+; CHECK-RV32-NEXT:    vle32.v v16, (a0)
+; CHECK-RV32-NEXT:    vadd.vv v8, v16, v8
+; CHECK-RV32-NEXT:    lui a0, %hi(l)
+; CHECK-RV32-NEXT:    addi a0, a0, %lo(l)
+; CHECK-RV32-NEXT:    vse32.v v8, (a0)
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 4
+; CHECK-RV32-NEXT:    sub a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a1, a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a1, a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a1, a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 3
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 3
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 3
+; CHECK-RV32-NEXT:    sub a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    mv a1, a0
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 2
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 2
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a1, a0, 1
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 1
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    addi a0, sp, 16
+; CHECK-RV32-NEXT:    vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    add sp, sp, a0
+; CHECK-RV32-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    lw a1, 24(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT:    addi sp, sp, 32
+; CHECK-RV32-NEXT:    mret
+  %1 = load <32 x i32>, ptr @j
+  %2 = load <32 x i32>, ptr @k
+  %add = add nsw <32 x i32> %2, %1
+  store <32 x i32> %add, ptr @l
+  ret void
+}
+
+attributes #0 = { "interrupt"="machine" }

From 2bcdfa198aa511479c46144c5cf95c7c685384ef Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern@gmail.com>
Date: Thu, 19 Jun 2025 09:58:19 +0800
Subject: [PATCH 0885/1322] [CIR] Add side effect attribute to call operations
 (#144201)

This patch adds `side_effect` attribute to `cir.call` operation.

Other function call attributes will be added in later patches.
---
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      | 16 +++--
 .../include/clang/CIR/Dialect/IR/CIRAttrs.td  |  5 ++
 clang/include/clang/CIR/Dialect/IR/CIROps.td  | 43 +++++++++++-
 .../clang/CIR/Interfaces/CIROpInterfaces.td   |  2 +
 clang/include/clang/CIR/MissingFeatures.h     |  1 -
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          | 46 +++++++++++--
 clang/lib/CIR/CodeGen/CIRGenCall.h            |  6 ++
 clang/lib/CIR/CodeGen/CIRGenModule.h          | 10 +++
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       | 66 ++++++++++++++++++-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 51 ++++++++++++--
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   |  5 ++
 clang/test/CIR/CodeGen/call.c                 | 26 ++++++++
 clang/test/CIR/IR/call.cir                    |  4 ++
 13 files changed, 257 insertions(+), 24 deletions(-)

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 502d58d7db8b..3e052c564112 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -227,22 +227,26 @@ public:
   //===--------------------------------------------------------------------===//
 
   cir::CallOp createCallOp(mlir::Location loc, mlir::SymbolRefAttr callee,
-                           mlir::Type returnType, mlir::ValueRange operands) {
-    return create<cir::CallOp>(loc, callee, returnType, operands);
+                           mlir::Type returnType, mlir::ValueRange operands,
+                           cir::SideEffect sideEffect = cir::SideEffect::All) {
+    return create<cir::CallOp>(loc, callee, returnType, operands, sideEffect);
   }
 
   cir::CallOp createCallOp(mlir::Location loc, cir::FuncOp callee,
-                           mlir::ValueRange operands) {
+                           mlir::ValueRange operands,
+                           cir::SideEffect sideEffect = cir::SideEffect::All) {
     return createCallOp(loc, mlir::SymbolRefAttr::get(callee),
-                        callee.getFunctionType().getReturnType(), operands);
+                        callee.getFunctionType().getReturnType(), operands,
+                        sideEffect);
   }
 
   cir::CallOp createIndirectCallOp(mlir::Location loc,
                                    mlir::Value indirectTarget,
                                    cir::FuncType funcType,
-                                   mlir::ValueRange operands) {
+                                   mlir::ValueRange operands,
+                                   cir::SideEffect sideEffect) {
     return create<cir::CallOp>(loc, indirectTarget, funcType.getReturnType(),
-                               operands);
+                               operands, sideEffect);
   }
 
   //===--------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index b48f4ed461cc..9e01dde379d7 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -42,6 +42,11 @@ class CIR_TypedAttr<string name, string attrMnemonic, list<Trait> traits = []>
   let assemblyFormat = [{}];
 }
 
+class CIR_I32EnumAttr<string name, string summary, list<I32EnumAttrCase> cases>
+    : I32EnumAttr<name, summary, cases> {
+  let cppNamespace = "::cir";
+}
+
 class CIRUnitAttr<string name, string attrMnemonic, list<Trait> traits = []>
     : CIR_Attr<name, attrMnemonic, traits> {
   let returnType = "bool";
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 4655cebc82ee..852d3aa13114 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -1858,6 +1858,36 @@ def FuncOp : CIR_Op<"func", [
 // CallOp
 //===----------------------------------------------------------------------===//
 
+def CIR_SideEffect : CIR_I32EnumAttr<
+    "SideEffect", "allowed side effects of a function", [
+      I32EnumAttrCase<"All", 1, "all">,
+      I32EnumAttrCase<"Pure", 2, "pure">,
+      I32EnumAttrCase<"Const", 3, "const">
+    ]> {
+  let description = [{
+    The side effect attribute specifies the possible side effects of the callee
+    of a call operation. This is an enumeration attribute and all possible
+    enumerators are:
+
+    - all: The callee can have any side effects. This is the default if no side
+      effects are explicitly listed.
+    - pure: The callee may read data from memory, but it cannot write data to
+      memory. This has the same effect as the GNU C/C++ attribute
+      `__attribute__((pure))`.
+    - const: The callee may not read or write data from memory. This has the
+      same effect as the GNU C/C++ attribute `__attribute__((const))`.
+
+    Examples:
+
+    ```mlir
+    %2 = cir.call @add(%0, %1) : (!s32i, !s32i) -> !s32i
+    %2 = cir.call @add(%0, %1) : (!s32i, !s32i) -> !s32i side_effect(pure)
+    %2 = cir.call @add(%0, %1) : (!s32i, !s32i) -> !s32i side_effect(const)
+    ```
+  }];
+  let cppNamespace = "::cir";
+}
+
 class CIR_CallOpBase<string mnemonic, list<Trait> extra_traits = []>
     : Op<CIR_Dialect, mnemonic,
          !listconcat(extra_traits,
@@ -1911,7 +1941,8 @@ class CIR_CallOpBase<string mnemonic, list<Trait> extra_traits = []>
   // will add in the future.
 
   dag commonArgs = (ins OptionalAttr<FlatSymbolRefAttr>:$callee,
-      Variadic<CIR_AnyType>:$args);
+      Variadic<CIR_AnyType>:$args,
+      DefaultValuedAttr<CIR_SideEffect, "SideEffect::All">:$side_effect);
 }
 
 def CallOp : CIR_CallOpBase<"call", [NoRegionArguments]> {
@@ -1942,20 +1973,26 @@ def CallOp : CIR_CallOpBase<"call", [NoRegionArguments]> {
   let builders = [
     // Build a call op for a direct call
     OpBuilder<(ins "mlir::SymbolRefAttr":$callee, "mlir::Type":$resType,
-                   "mlir::ValueRange":$operands), [{
+                   "mlir::ValueRange":$operands,
+                   CArg<"SideEffect", "SideEffect::All">:$sideEffect), [{
       assert(callee && "callee attribute is required for direct call");
       $_state.addOperands(operands);
       $_state.addAttribute("callee", callee);
+      $_state.addAttribute("side_effect",
+        SideEffectAttr::get($_builder.getContext(), sideEffect));
       if (resType && !isa<VoidType>(resType))
         $_state.addTypes(resType);
     }]>,
     // Build a call op for an indirect call
     OpBuilder<(ins "mlir::Value":$calleePtr, "mlir::Type":$resType,
-                   "mlir::ValueRange":$operands), [{
+                   "mlir::ValueRange":$operands,
+                   CArg<"SideEffect", "SideEffect::All">:$sideEffect), [{
       $_state.addOperands(calleePtr);
       $_state.addOperands(operands);
       if (resType && !isa<VoidType>(resType))
         $_state.addTypes(resType);
+      $_state.addAttribute("side_effect",
+        SideEffectAttr::get($_builder.getContext(), sideEffect));
     }]>,
   ];
 }
diff --git a/clang/include/clang/CIR/Interfaces/CIROpInterfaces.td b/clang/include/clang/CIR/Interfaces/CIROpInterfaces.td
index 80d78b11c2ba..203e42f7c575 100644
--- a/clang/include/clang/CIR/Interfaces/CIROpInterfaces.td
+++ b/clang/include/clang/CIR/Interfaces/CIROpInterfaces.td
@@ -34,6 +34,8 @@ let cppNamespace = "::cir" in {
           "Return the number of operands, accounts for indirect call or "
           "exception info",
           "unsigned", "getNumArgOperands", (ins)>,
+      InterfaceMethod<"Return the side effects of the call operation",
+                      "cir::SideEffect", "getSideEffect", (ins)>,
     ];
   }
 
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 3d120903dea1..45452c5929a3 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -95,7 +95,6 @@ struct MissingFeatures {
   static bool opCallReturn() { return false; }
   static bool opCallArgEvaluationOrder() { return false; }
   static bool opCallCallConv() { return false; }
-  static bool opCallSideEffect() { return false; }
   static bool opCallNoPrototypeFunc() { return false; }
   static bool opCallMustTail() { return false; }
   static bool opCallVirtual() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index af0e6ca822b8..9c9c96604c16 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -77,6 +77,35 @@ void CIRGenFunction::emitAggregateStore(mlir::Value value, Address dest) {
   builder.createStore(*currSrcLoc, value, dest);
 }
 
+/// Construct the CIR attribute list of a function or call.
+void CIRGenModule::constructAttributeList(CIRGenCalleeInfo calleeInfo,
+                                          cir::SideEffect &sideEffect) {
+  assert(!cir::MissingFeatures::opCallCallConv());
+  sideEffect = cir::SideEffect::All;
+
+  assert(!cir::MissingFeatures::opCallAttrs());
+
+  const Decl *targetDecl = calleeInfo.getCalleeDecl().getDecl();
+
+  if (targetDecl) {
+    assert(!cir::MissingFeatures::opCallAttrs());
+
+    // 'const', 'pure' and 'noalias' attributed functions are also nounwind.
+    if (targetDecl->hasAttr<ConstAttr>()) {
+      // gcc specifies that 'const' functions have greater restrictions than
+      // 'pure' functions, so they also cannot have infinite loops.
+      sideEffect = cir::SideEffect::Const;
+    } else if (targetDecl->hasAttr<PureAttr>()) {
+      // gcc specifies that 'pure' functions cannot have infinite loops.
+      sideEffect = cir::SideEffect::Pure;
+    }
+
+    assert(!cir::MissingFeatures::opCallAttrs());
+  }
+
+  assert(!cir::MissingFeatures::opCallAttrs());
+}
+
 /// Returns the canonical formal type of the given C++ method.
 static CanQual<FunctionProtoType> getFormalType(const CXXMethodDecl *md) {
   return md->getType()
@@ -386,7 +415,8 @@ static cir::CIRCallOpInterface
 emitCallLikeOp(CIRGenFunction &cgf, mlir::Location callLoc,
                cir::FuncType indirectFuncTy, mlir::Value indirectFuncVal,
                cir::FuncOp directFuncOp,
-               const SmallVectorImpl<mlir::Value> &cirCallArgs) {
+               const SmallVectorImpl<mlir::Value> &cirCallArgs,
+               cir::SideEffect sideEffect) {
   CIRGenBuilderTy &builder = cgf.getBuilder();
 
   assert(!cir::MissingFeatures::opCallSurroundingTry());
@@ -397,11 +427,11 @@ emitCallLikeOp(CIRGenFunction &cgf, mlir::Location callLoc,
   if (indirectFuncTy) {
     // TODO(cir): Set calling convention for indirect calls.
     assert(!cir::MissingFeatures::opCallCallConv());
-    return builder.createIndirectCallOp(callLoc, indirectFuncVal,
-                                        indirectFuncTy, cirCallArgs);
+    return builder.createIndirectCallOp(
+        callLoc, indirectFuncVal, indirectFuncTy, cirCallArgs, sideEffect);
   }
 
-  return builder.createCallOp(callLoc, directFuncOp, cirCallArgs);
+  return builder.createCallOp(callLoc, directFuncOp, cirCallArgs, sideEffect);
 }
 
 const CIRGenFunctionInfo &
@@ -513,8 +543,9 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
     funcName = calleeFuncOp.getName();
 
   assert(!cir::MissingFeatures::opCallCallConv());
-  assert(!cir::MissingFeatures::opCallSideEffect());
   assert(!cir::MissingFeatures::opCallAttrs());
+  cir::SideEffect sideEffect;
+  cgm.constructAttributeList(callee.getAbstractInfo(), sideEffect);
 
   assert(!cir::MissingFeatures::invokeOp());
 
@@ -538,8 +569,9 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
   assert(!cir::MissingFeatures::opCallAttrs());
 
   mlir::Location callLoc = loc;
-  cir::CIRCallOpInterface theCall = emitCallLikeOp(
-      *this, loc, indirectFuncTy, indirectFuncVal, directFuncOp, cirCallArgs);
+  cir::CIRCallOpInterface theCall =
+      emitCallLikeOp(*this, loc, indirectFuncTy, indirectFuncVal, directFuncOp,
+                     cirCallArgs, sideEffect);
 
   if (callOp)
     *callOp = theCall;
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.h b/clang/lib/CIR/CodeGen/CIRGenCall.h
index 0353848f3ec0..56c76c51a46d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.h
@@ -105,6 +105,12 @@ public:
   /// callee
   CIRGenCallee prepareConcreteCallee(CIRGenFunction &cgf) const;
 
+  CIRGenCalleeInfo getAbstractInfo() const {
+    assert(!cir::MissingFeatures::opCallVirtual());
+    assert(isOrdinary());
+    return abstractInfo;
+  }
+
   mlir::Operation *getFunctionPointer() const {
     assert(isOrdinary());
     return reinterpret_cast<mlir::Operation *>(kindOrFunctionPtr);
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 0ea2d9f9c822..71806e3c5de2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -14,6 +14,7 @@
 #define LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENMODULE_H
 
 #include "CIRGenBuilder.h"
+#include "CIRGenCall.h"
 #include "CIRGenTypeCache.h"
 #include "CIRGenTypes.h"
 #include "CIRGenValue.h"
@@ -158,6 +159,15 @@ public:
       const CXXRecordDecl *derivedClass,
       llvm::iterator_range<CastExpr::path_const_iterator> path);
 
+  /// Get the CIR attributes and calling convention to use for a particular
+  /// function type.
+  ///
+  /// \param calleeInfo - The callee information these attributes are being
+  /// constructed for. If valid, the attributes applied to this decl may
+  /// contribute to the function attributes and calling convention.
+  void constructAttributeList(CIRGenCalleeInfo calleeInfo,
+                              cir::SideEffect &sideEffect);
+
   /// Return a constant array for the given string.
   mlir::Attribute getConstantArrayFromStringLiteral(const StringLiteral *e);
 
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 3fcb0213b219..16248059c497 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -92,6 +92,46 @@ Operation *cir::CIRDialect::materializeConstant(mlir::OpBuilder &builder,
 // Helpers
 //===----------------------------------------------------------------------===//
 
+// Parses one of the keywords provided in the list `keywords` and returns the
+// position of the parsed keyword in the list. If none of the keywords from the
+// list is parsed, returns -1.
+static int parseOptionalKeywordAlternative(AsmParser &parser,
+                                           ArrayRef<llvm::StringRef> keywords) {
+  for (auto en : llvm::enumerate(keywords)) {
+    if (succeeded(parser.parseOptionalKeyword(en.value())))
+      return en.index();
+  }
+  return -1;
+}
+
+namespace {
+template <typename Ty> struct EnumTraits {};
+
+#define REGISTER_ENUM_TYPE(Ty)                                                 \
+  template <> struct EnumTraits<cir::Ty> {                                     \
+    static llvm::StringRef stringify(cir::Ty value) {                          \
+      return stringify##Ty(value);                                             \
+    }                                                                          \
+    static unsigned getMaxEnumVal() { return cir::getMaxEnumValFor##Ty(); }    \
+  }
+
+REGISTER_ENUM_TYPE(SideEffect);
+} // namespace
+
+/// Parse an enum from the keyword, return failure if the keyword is not found.
+template <typename EnumTy, typename RetTy = EnumTy>
+static ParseResult parseCIRKeyword(AsmParser &parser, RetTy &result) {
+  llvm::SmallVector<llvm::StringRef, 10> names;
+  for (unsigned i = 0, e = EnumTraits<EnumTy>::getMaxEnumVal(); i <= e; ++i)
+    names.push_back(EnumTraits<EnumTy>::stringify(static_cast<EnumTy>(i)));
+
+  int index = parseOptionalKeywordAlternative(parser, names);
+  if (index == -1)
+    return failure();
+  result = static_cast<RetTy>(index);
+  return success();
+}
+
 // Check if a region's termination omission is valid and, if so, creates and
 // inserts the omitted terminator into the region.
 static LogicalResult ensureRegionTerm(OpAsmParser &parser, Region &region,
@@ -534,6 +574,18 @@ static mlir::ParseResult parseCallCommon(mlir::OpAsmParser &parser,
   if (parser.parseRParen())
     return mlir::failure();
 
+  if (parser.parseOptionalKeyword("side_effect").succeeded()) {
+    if (parser.parseLParen().failed())
+      return failure();
+    cir::SideEffect sideEffect;
+    if (parseCIRKeyword<cir::SideEffect>(parser, sideEffect).failed())
+      return failure();
+    if (parser.parseRParen().failed())
+      return failure();
+    auto attr = cir::SideEffectAttr::get(parser.getContext(), sideEffect);
+    result.addAttribute("side_effect", attr);
+  }
+
   if (parser.parseOptionalAttrDict(result.attributes))
     return ::mlir::failure();
 
@@ -556,7 +608,8 @@ static mlir::ParseResult parseCallCommon(mlir::OpAsmParser &parser,
 static void printCallCommon(mlir::Operation *op,
                             mlir::FlatSymbolRefAttr calleeSym,
                             mlir::Value indirectCallee,
-                            mlir::OpAsmPrinter &printer) {
+                            mlir::OpAsmPrinter &printer,
+                            cir::SideEffect sideEffect) {
   printer << ' ';
 
   auto callLikeOp = mlir::cast<cir::CIRCallOpInterface>(op);
@@ -572,7 +625,13 @@ static void printCallCommon(mlir::Operation *op,
   }
   printer << "(" << ops << ")";
 
-  printer.printOptionalAttrDict(op->getAttrs(), {"callee"});
+  if (sideEffect != cir::SideEffect::All) {
+    printer << " side_effect(";
+    printer << stringifySideEffect(sideEffect);
+    printer << ")";
+  }
+
+  printer.printOptionalAttrDict(op->getAttrs(), {"callee", "side_effect"});
 
   printer << " : ";
   printer.printFunctionalType(op->getOperands().getTypes(),
@@ -586,7 +645,8 @@ mlir::ParseResult cir::CallOp::parse(mlir::OpAsmParser &parser,
 
 void cir::CallOp::print(mlir::OpAsmPrinter &p) {
   mlir::Value indirectCallee = isIndirect() ? getIndirectCall() : nullptr;
-  printCallCommon(*this, getCalleeAttr(), indirectCallee, p);
+  cir::SideEffect sideEffect = getSideEffect();
+  printCallCommon(*this, getCalleeAttr(), indirectCallee, p, sideEffect);
 }
 
 static LogicalResult
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index a96501ab2c38..b73cb839828e 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -220,6 +220,39 @@ mlir::Value lowerCirAttrAsValue(mlir::Operation *parentOp,
   return value;
 }
 
+void convertSideEffectForCall(mlir::Operation *callOp,
+                              cir::SideEffect sideEffect,
+                              mlir::LLVM::MemoryEffectsAttr &memoryEffect,
+                              bool &noUnwind, bool &willReturn) {
+  using mlir::LLVM::ModRefInfo;
+
+  switch (sideEffect) {
+  case cir::SideEffect::All:
+    memoryEffect = {};
+    noUnwind = false;
+    willReturn = false;
+    break;
+
+  case cir::SideEffect::Pure:
+    memoryEffect = mlir::LLVM::MemoryEffectsAttr::get(
+        callOp->getContext(), /*other=*/ModRefInfo::Ref,
+        /*argMem=*/ModRefInfo::Ref,
+        /*inaccessibleMem=*/ModRefInfo::Ref);
+    noUnwind = true;
+    willReturn = true;
+    break;
+
+  case cir::SideEffect::Const:
+    memoryEffect = mlir::LLVM::MemoryEffectsAttr::get(
+        callOp->getContext(), /*other=*/ModRefInfo::NoModRef,
+        /*argMem=*/ModRefInfo::NoModRef,
+        /*inaccessibleMem=*/ModRefInfo::NoModRef);
+    noUnwind = true;
+    willReturn = true;
+    break;
+  }
+}
+
 /// IntAttr visitor.
 mlir::Value CIRAttrToValue::visitCirAttr(cir::IntAttr intAttr) {
   mlir::Location loc = parentOp->getLoc();
@@ -745,12 +778,18 @@ rewriteCallOrInvoke(mlir::Operation *op, mlir::ValueRange callOperands,
                     mlir::FlatSymbolRefAttr calleeAttr) {
   llvm::SmallVector<mlir::Type, 8> llvmResults;
   mlir::ValueTypeRange<mlir::ResultRange> cirResults = op->getResultTypes();
+  auto call = cast<cir::CIRCallOpInterface>(op);
 
   if (converter->convertTypes(cirResults, llvmResults).failed())
     return mlir::failure();
 
   assert(!cir::MissingFeatures::opCallCallConv());
-  assert(!cir::MissingFeatures::opCallSideEffect());
+
+  mlir::LLVM::MemoryEffectsAttr memoryEffects;
+  bool noUnwind = false;
+  bool willReturn = false;
+  convertSideEffectForCall(op, call.getSideEffect(), memoryEffects, noUnwind,
+                           willReturn);
 
   mlir::LLVM::LLVMFunctionType llvmFnTy;
   if (calleeAttr) { // direct call
@@ -775,10 +814,14 @@ rewriteCallOrInvoke(mlir::Operation *op, mlir::ValueRange callOperands,
   assert(!cir::MissingFeatures::opCallLandingPad());
   assert(!cir::MissingFeatures::opCallContinueBlock());
   assert(!cir::MissingFeatures::opCallCallConv());
-  assert(!cir::MissingFeatures::opCallSideEffect());
 
-  rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(op, llvmFnTy, calleeAttr,
-                                                  callOperands);
+  auto newOp = rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
+      op, llvmFnTy, calleeAttr, callOperands);
+  if (memoryEffects)
+    newOp.setMemoryEffectsAttr(memoryEffects);
+  newOp.setNoUnwind(noUnwind);
+  newOp.setWillReturn(willReturn);
+
   return mlir::success();
 }
 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index a80c66ac1abf..ae7247332c66 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -29,6 +29,11 @@ mlir::Value lowerCirAttrAsValue(mlir::Operation *parentOp, mlir::Attribute attr,
 
 mlir::LLVM::Linkage convertLinkage(cir::GlobalLinkageKind linkage);
 
+void convertSideEffectForCall(mlir::Operation *callOp,
+                              cir::SideEffect sideEffect,
+                              mlir::LLVM::MemoryEffectsAttr &memoryEffect,
+                              bool &noUnwind, bool &willReturn);
+
 class CIRToLLVMAssumeOpLowering
     : public mlir::OpConversionPattern<cir::AssumeOp> {
 public:
diff --git a/clang/test/CIR/CodeGen/call.c b/clang/test/CIR/CodeGen/call.c
index 13f3c5a21ceb..f6aa41df7439 100644
--- a/clang/test/CIR/CodeGen/call.c
+++ b/clang/test/CIR/CodeGen/call.c
@@ -109,3 +109,29 @@ void f9() {
 // OGCG-NEXT:    store i64 %[[RET]], ptr %[[SLOT]], align 4
 // OGCG-NEXT:    %[[ARG:.+]] = load i64, ptr %[[SLOT]], align 4
 // OGCG-NEXT:    call void @f1(i64 %[[ARG]])
+
+__attribute__((pure)) int f10(int);
+__attribute__((const)) int f11(int);
+int f12(void) {
+  return f10(1) + f11(2);
+}
+
+// CIR-LABEL: cir.func @f12() -> !s32i
+// CIR:         %[[A:.+]] = cir.const #cir.int<1> : !s32i
+// CIR-NEXT:    %{{.+}} = cir.call @f10(%[[A]]) side_effect(pure) : (!s32i) -> !s32i
+// CIR-NEXT:    %[[B:.+]] = cir.const #cir.int<2> : !s32i
+// CIR-NEXT:    %{{.+}} = cir.call @f11(%[[B]]) side_effect(const) : (!s32i) -> !s32i
+
+// LLVM-LABEL: define i32 @f12()
+// LLVM:         %{{.+}} = call i32 @f10(i32 1) #[[ATTR0:.+]]
+// LLVM-NEXT:    %{{.+}} = call i32 @f11(i32 2) #[[ATTR1:.+]]
+
+// OGCG-LABEL: define dso_local i32 @f12()
+// OGCG:         %{{.+}} = call i32 @f10(i32 noundef 1) #[[ATTR0:.+]]
+// OGCG-NEXT:    %{{.+}} = call i32 @f11(i32 noundef 2) #[[ATTR1:.+]]
+
+// LLVM: attributes #[[ATTR0]] = { nounwind willreturn memory(read, errnomem: none) }
+// LLVM: attributes #[[ATTR1]] = { nounwind willreturn memory(none) }
+
+// OGCG: attributes #[[ATTR0]] = { nounwind willreturn memory(read) }
+// OGCG: attributes #[[ATTR1]] = { nounwind willreturn memory(none) }
diff --git a/clang/test/CIR/IR/call.cir b/clang/test/CIR/IR/call.cir
index e35c201b6ed4..5f0916775479 100644
--- a/clang/test/CIR/IR/call.cir
+++ b/clang/test/CIR/IR/call.cir
@@ -8,11 +8,15 @@ cir.func @f1()
 
 cir.func @f2() {
   cir.call @f1() : () -> ()
+  cir.call @f1() side_effect(pure) : () -> ()
+  cir.call @f1() side_effect(const) : () -> ()
   cir.return
 }
 
 // CHECK:      cir.func @f2() {
 // CHECK-NEXT:   cir.call @f1() : () -> ()
+// CHECK-NEXT:   cir.call @f1() side_effect(pure) : () -> ()
+// CHECK-NEXT:   cir.call @f1() side_effect(const) : () -> ()
 // CHECK-NEXT:   cir.return
 // CHECK-NEXT: }
 

From faf9295f4e3a23a972d29e2be85052beef409d08 Mon Sep 17 00:00:00 2001
From: MingYan <99472920+NexMing@users.noreply.github.com>
Date: Thu, 19 Jun 2025 10:15:31 +0800
Subject: [PATCH 0886/1322] [RISCV] Fix a bug where AVL is the last MI in MBB.
 (#144668)

When `AVL` is the last MI, `std::next(II)` equals `MBB.end()`, and
calling `II->getParent()` at that point will cause an error.

---------

Co-authored-by: yanming <ming.yan@terapines.com>
---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp  | 15 +++++----
 .../RISCV/rvv/vsetvli-insert-crossbb.mir      | 32 +++++++++++++++++++
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 9a513891b765..78d64ea67324 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1119,25 +1119,26 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
     LIS->InsertMachineInstrInMaps(*MI);
     LiveInterval &LI = LIS->getInterval(AVLReg);
     SlotIndex SI = LIS->getInstructionIndex(*MI).getRegSlot();
+    const VNInfo *CurVNI = Info.getAVLVNInfo();
     // If the AVL value isn't live at MI, do a quick check to see if it's easily
     // extendable. Otherwise, we need to copy it.
-    if (LI.getVNInfoBefore(SI) != Info.getAVLVNInfo()) {
+    if (LI.getVNInfoBefore(SI) != CurVNI) {
       if (!LI.liveAt(SI) && LI.containsOneValue())
         LIS->extendToIndices(LI, SI);
       else {
         Register AVLCopyReg =
             MRI->createVirtualRegister(&RISCV::GPRNoX0RegClass);
+        MachineBasicBlock *MBB = LIS->getMBBFromIndex(CurVNI->def);
         MachineBasicBlock::iterator II;
-        if (Info.getAVLVNInfo()->isPHIDef())
-          II = LIS->getMBBFromIndex(Info.getAVLVNInfo()->def)->getFirstNonPHI();
+        if (CurVNI->isPHIDef())
+          II = MBB->getFirstNonPHI();
         else {
-          II = LIS->getInstructionFromIndex(Info.getAVLVNInfo()->def);
+          II = LIS->getInstructionFromIndex(CurVNI->def);
           II = std::next(II);
         }
         assert(II.isValid());
-        auto AVLCopy =
-            BuildMI(*II->getParent(), II, DL, TII->get(RISCV::COPY), AVLCopyReg)
-                .addReg(AVLReg);
+        auto AVLCopy = BuildMI(*MBB, II, DL, TII->get(RISCV::COPY), AVLCopyReg)
+                           .addReg(AVLReg);
         LIS->InsertMachineInstrInMaps(*AVLCopy);
         MI->getOperand(1).setReg(AVLCopyReg);
         LIS->createAndComputeVirtRegInterval(AVLCopyReg);
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
index 140875c4b24a..e09fc1828fec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
@@ -142,6 +142,10 @@
     ret void
   }
 
+  define void @avl_is_last_instr() {
+    ret void
+  }
+
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 
   declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
@@ -1099,3 +1103,31 @@ body: |
     renamable $v10m2 = PseudoVADD_VV_M2 undef renamable $v10m2, %v, %v, -1, 5, 0
     renamable $v8m2 = PseudoVADD_VV_M2 undef renamable $v8m2, killed renamable $v10m2, killed %v, %outvl:gprnox0, 5, 0
     PseudoRET implicit $v8m2
+...
+---
+name: avl_is_last_instr
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: avl_is_last_instr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $x10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %avl:gprnox0 = COPY $x10
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY %avl
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   dead %avl:gprnox0 = ADDI %avl, -1
+  ; CHECK-NEXT:   dead $x0 = PseudoVSETIVLI 1, 192 /* e8, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+  ; CHECK-NEXT:   $v8 = PseudoVMV_S_X undef renamable $v8, $x0, 1, 3 /* e8 */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead $x0 = PseudoVSETVLI [[COPY]], 192 /* e8, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+  ; CHECK-NEXT:   $v8 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, $noreg, 3 /* e8 */, 3 /* ta, ma */, implicit $vl, implicit $vtype
+  bb.0:
+    liveins: $x10
+    %avl:gprnox0 = COPY $x10
+
+  bb.1:
+    %vl:gprnox0 = PseudoVSETVLI %avl:gprnox0, 192, implicit-def dead $vl, implicit-def dead $vtype
+    %avl:gprnox0 = ADDI %avl:gprnox0, -1
+    $v8 = PseudoVMV_S_X undef renamable $v8, $x0, 1, 3
+    $v8 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, %vl:gprnox0, 3, 3

From bfee625821c07d9a05b48e4a8b0f3d73c1233107 Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr@nvidia.com>
Date: Thu, 19 Jun 2025 07:49:08 +0530
Subject: [PATCH 0887/1322] [NVPTX] Attach Range attr to setmaxnreg and fence
 intrinsics (#144120)

This patch attaches the range attribute to the setmaxnreg
and fence.proxy.tensormap.* intrinsics. The range checks
are now handled generically in the Verifier. So, this patch
removes the per-intrinsic error-handling for range-checks
from the Verifier.

This patch also adds more coverage tests for these cases.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
---
 llvm/include/llvm/IR/IntrinsicsNVVM.td          |  8 ++++++--
 llvm/lib/IR/Verifier.cpp                        | 10 ----------
 .../Verifier/NVPTX/fence-proxy.tensormap.ll     | 17 +++++++++++++++++
 llvm/test/Verifier/NVPTX/setmaxnreg.ll          |  4 +++-
 4 files changed, 26 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/Verifier/NVPTX/fence-proxy.tensormap.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 4efdff71c016..410a0dea2bf5 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1341,9 +1341,11 @@ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
         Intrinsic<[], [], [IntrNoCallback],
         "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>;
 
+  // The imm-arg 'size' can only be 128.
   def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope :
         Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty],
-                  [IntrNoCallback, IntrArgMemOnly, ImmArg<ArgIndex<1>>],
+                  [IntrNoCallback, IntrArgMemOnly, ImmArg<ArgIndex<1>>,
+                   Range<ArgIndex<1>, 128, 129>],
                   "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>;
 }
 
@@ -1989,10 +1991,12 @@ def int_nvvm_is_explicit_cluster
               "llvm.nvvm.is_explicit_cluster">;
 
 // Setmaxnreg inc/dec intrinsics
+// The imm-arg should be in the range: 24 <= val <= 256
 foreach op = ["dec", "inc"] in
   def int_nvvm_setmaxnreg_ # op # _sync_aligned_u32
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty],
-              [IntrConvergent, IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+              [IntrConvergent, IntrNoMem, IntrHasSideEffects,
+               ImmArg<ArgIndex<0>>, Range<ArgIndex<0>, 24, 257>]>;
 
 // Exit
 def int_nvvm_exit : NVVMBuiltin,
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 1f1041b25973..f0a4d7b6a4c1 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6557,8 +6557,6 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     unsigned RegCount = cast<ConstantInt>(V)->getZExtValue();
     Check(RegCount % 8 == 0,
           "reg_count argument to nvvm.setmaxnreg must be in multiples of 8");
-    Check((RegCount >= 24 && RegCount <= 256),
-          "reg_count argument to nvvm.setmaxnreg must be within [24, 256]");
     break;
   }
   case Intrinsic::experimental_convergence_entry:
@@ -6605,14 +6603,6 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           "llvm.threadlocal.address operand isThreadLocal() must be true");
     break;
   }
-  case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta:
-  case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cluster:
-  case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu:
-  case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys: {
-    unsigned size = cast<ConstantInt>(Call.getArgOperand(1))->getZExtValue();
-    Check(size == 128, " The only supported value for size operand is 128");
-    break;
-  }
   };
 
   // Verify that there aren't any unmediated control transfers between funclets.
diff --git a/llvm/test/Verifier/NVPTX/fence-proxy.tensormap.ll b/llvm/test/Verifier/NVPTX/fence-proxy.tensormap.ll
new file mode 100644
index 000000000000..4fa7a7ae7100
--- /dev/null
+++ b/llvm/test/Verifier/NVPTX/fence-proxy.tensormap.ll
@@ -0,0 +1,17 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) {
+  ; CHECK: immarg value 127 out of range [128, 129)
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 127);
+
+  ; CHECK: immarg value 129 out of range [128, 129)
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr addrspace(0) %addr, i32 129);
+
+  ; CHECK: immarg value 127 out of range [128, 129)
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr addrspace(0) %addr, i32 127);
+
+  ; CHECK: immarg value 129 out of range [128, 129)
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr addrspace(0) %addr, i32 129);
+
+  ret void
+}
diff --git a/llvm/test/Verifier/NVPTX/setmaxnreg.ll b/llvm/test/Verifier/NVPTX/setmaxnreg.ll
index 8999e4ffa667..1afebeab4742 100644
--- a/llvm/test/Verifier/NVPTX/setmaxnreg.ll
+++ b/llvm/test/Verifier/NVPTX/setmaxnreg.ll
@@ -7,8 +7,10 @@ define void @test_set_maxn_reg() {
   ; CHECK: reg_count argument to nvvm.setmaxnreg must be in multiples of 8
   call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 95)
 
-  ; CHECK: reg_count argument to nvvm.setmaxnreg must be within [24, 256]
+  ; CHECK: immarg value 16 out of range [24, 257)
   call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 16)
 
+  ; CHECK: immarg value 264 out of range [24, 257)
+  call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 264)
   ret void
 }

From 5875fafdc547889fb089c943a881a9ab6d8a23c0 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Thu, 19 Jun 2025 10:30:47 +0800
Subject: [PATCH 0888/1322] [X86] Remove CLDEMOTE from Alderlake and later
 hybrid processors (#144662)

SDM doesn't list any hybrid processors in this feature. Besides,
physical machine also reports not supported.
---
 clang/test/Preprocessor/predefined-arch-macros.c | 4 ++--
 llvm/lib/Target/X86/X86.td                       | 2 +-
 llvm/lib/TargetParser/X86TargetParser.cpp        | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index 2d17891071aa..9dfeddbd4d5a 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -2102,7 +2102,7 @@
 // CHECK_ADL_M32: #define __AVX__ 1
 // CHECK_ADL_M32: #define __BMI2__ 1
 // CHECK_ADL_M32: #define __BMI__ 1
-// CHECK_ADL_M32: #define __CLDEMOTE__ 1
+// CHECK_ADL_M32-NOT: #define __CLDEMOTE__ 1
 // CHECK_ADL_M32: #define __CLFLUSHOPT__ 1
 // CHECK_ADL_M32: #define __CLWB__ 1
 // CHECK_ADL_M32: #define __F16C__ 1
@@ -2173,7 +2173,7 @@
 // CHECK_ADL_M64: #define __AVX__ 1
 // CHECK_ADL_M64: #define __BMI2__ 1
 // CHECK_ADL_M64: #define __BMI__ 1
-// CHECK_ADL_M64: #define __CLDEMOTE__ 1
+// CHECK_ADL_M64-NOT: #define __CLDEMOTE__ 1
 // CHECK_ADL_M64: #define __CLFLUSHOPT__ 1
 // CHECK_ADL_M64: #define __CLWB__ 1
 // CHECK_ADL_M64: #define __F16C__ 1
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 2d635835e3ff..b09891652ad9 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1284,7 +1284,6 @@ def ProcessorFeatures {
                                                   FeatureAVXVNNI,
                                                   FeaturePKU,
                                                   FeatureHRESET,
-                                                  FeatureCLDEMOTE,
                                                   FeatureMOVDIRI,
                                                   FeatureMOVDIR64B,
                                                   FeatureWAITPKG];
@@ -1311,6 +1310,7 @@ def ProcessorFeatures {
                                                   FeatureAVXNECONVERT,
                                                   FeatureENQCMD,
                                                   FeatureUINTR,
+                                                  FeatureCLDEMOTE,
                                                   FeatureAVXVNNIINT8];
   list<SubtargetFeature> SRFFeatures =
     !listconcat(ADLFeatures, SRFAdditionalFeatures);
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 21d05ee389e6..4947b05cd037 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -165,11 +165,11 @@ constexpr FeatureBitset FeaturesAlderlake =
     FeaturesTremont | FeatureADX | FeatureBMI | FeatureBMI2 | FeatureF16C |
     FeatureFMA | FeatureINVPCID | FeatureLZCNT | FeaturePCONFIG | FeaturePKU |
     FeatureSERIALIZE | FeatureSHSTK | FeatureVAES | FeatureVPCLMULQDQ |
-    FeatureCLDEMOTE | FeatureMOVDIR64B | FeatureMOVDIRI | FeatureWAITPKG |
-    FeatureAVXVNNI | FeatureHRESET | FeatureWIDEKL;
+    FeatureMOVDIR64B | FeatureMOVDIRI | FeatureWAITPKG | FeatureAVXVNNI |
+    FeatureHRESET | FeatureWIDEKL;
 constexpr FeatureBitset FeaturesSierraforest =
     FeaturesAlderlake | FeatureCMPCCXADD | FeatureAVXIFMA | FeatureUINTR |
-    FeatureENQCMD | FeatureAVXNECONVERT | FeatureAVXVNNIINT8;
+    FeatureCLDEMOTE | FeatureENQCMD | FeatureAVXNECONVERT | FeatureAVXVNNIINT8;
 constexpr FeatureBitset FeaturesArrowlakeS = FeaturesSierraforest |
     FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4;
 constexpr FeatureBitset FeaturesPantherlake =

From 351303c28e8feb85c93d8e9480f534653b032735 Mon Sep 17 00:00:00 2001
From: Han-Chung Wang <hanhan0912@gmail.com>
Date: Wed, 18 Jun 2025 20:07:43 -0700
Subject: [PATCH 0889/1322] [mlir][docs] Fix broken links to Traits
 documentation. (#144820)

---
 mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td | 2 +-
 mlir/include/mlir/IR/BuiltinOps.td               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index 77e3074661ab..481b14cdb462 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -339,7 +339,7 @@ def MemRef_AllocaOp : AllocLikeOp<"alloca", AutomaticAllocationScopeResource,[
     The `alloca` operation allocates memory on the stack, to be automatically
     released when control transfers back from the region of its closest
     surrounding operation with an
-    [`AutomaticAllocationScope`](../Traits.md/#automaticallocationscope) trait.
+    [`AutomaticAllocationScope`](../Traits/#automaticallocationscope) trait.
     The amount of memory allocated is specified by its memref and additional
     operands. For example:
 
diff --git a/mlir/include/mlir/IR/BuiltinOps.td b/mlir/include/mlir/IR/BuiltinOps.td
index 56edd7519cd6..cdc09afe0b67 100644
--- a/mlir/include/mlir/IR/BuiltinOps.td
+++ b/mlir/include/mlir/IR/BuiltinOps.td
@@ -40,7 +40,7 @@ def ModuleOp : Builtin_Op<"module", [
     [graph region](../LangRef.md#control-flow-and-ssacfg-regions) containing a single block
     which can contain any operations and does not have a terminator. Operations
     within this region cannot implicitly capture values defined outside the module,
-    i.e. Modules are [IsolatedFromAbove](../Traits.md#isolatedfromabove). Modules have
+    i.e. Modules are [IsolatedFromAbove](../Traits#isolatedfromabove). Modules have
     an optional [symbol name](../SymbolsAndSymbolTables.md) which can be used to refer
     to them in operations.
 

From 7b989ade35a43357f9152198ee2c76899df9a56d Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail@bennani.ma>
Date: Wed, 18 Jun 2025 22:49:21 -0700
Subject: [PATCH 0890/1322] [lldb/crashlog] Make interactive mode the new
 default (#144839)

This patch makes interactive mode as the default when using the crashlog
command. It replaces the existing `-i|--interactive` flag with a new
`-m|--mode` option, that can either be `interactive` or `batch`.

By default, when the option is not explicitely set by the user, the
interactive mode is selected, however, lldb will fallback to batch mode
if the command interpreter is not interactive or if stdout is not a tty.

This also adds some railguards to prevent users from using interactive
only options with the batch mode and updates the tests accordingly.

rdar://97801509

Differential Revision: https://reviews.llvm.org/D141658

Signed-off-by: Med Ismail Bennani <ismail@bennani.ma>
---
 lldb/examples/python/crashlog.py              | 126 +++++++++++-------
 .../Python/Crashlog/altered_threadState.test  |   2 +-
 .../Python/Crashlog/json.test                 |   6 +-
 .../Python/Crashlog/no_threadState.test       |   2 +-
 .../skipped_status_interactive_crashlog.test  |   2 +-
 .../Python/Crashlog/text.test                 |   2 +-
 6 files changed, 85 insertions(+), 55 deletions(-)

diff --git a/lldb/examples/python/crashlog.py b/lldb/examples/python/crashlog.py
index 6615c3353ffe..5f07cda2892a 100755
--- a/lldb/examples/python/crashlog.py
+++ b/lldb/examples/python/crashlog.py
@@ -31,6 +31,7 @@ import argparse
 import concurrent.futures
 import contextlib
 import datetime
+import enum
 import json
 import os
 import platform
@@ -45,7 +46,6 @@ import threading
 import time
 import uuid
 
-
 print_lock = threading.RLock()
 
 try:
@@ -1582,9 +1582,12 @@ def load_crashlog_in_scripted_process(debugger, crashlog_path, options, result):
                 debugger.RunCommandInterpreter(True, False, run_options, 0, False, True)
 
 
-def CreateSymbolicateCrashLogOptions(
-    command_name, description, add_interactive_options
-):
+class CrashLogLoadingMode(str, enum.Enum):
+    batch = "batch"
+    interactive = "interactive"
+
+
+def CreateSymbolicateCrashLogOptions(command_name, description):
     usage = "crashlog [options] <FILE> [FILE ...]"
     arg_parser = argparse.ArgumentParser(
         description=description,
@@ -1600,6 +1603,12 @@ def CreateSymbolicateCrashLogOptions(
         help="crash report(s) to symbolicate",
     )
 
+    arg_parser.add_argument(
+        "-m",
+        "--mode",
+        choices=[mode.value for mode in CrashLogLoadingMode],
+        help="change how the symbolicated process and threads are displayed to the user (default: interactive)",
+    )
     arg_parser.add_argument(
         "--version",
         "-V",
@@ -1736,36 +1745,35 @@ def CreateSymbolicateCrashLogOptions(
         help=argparse.SUPPRESS,
         default=False,
     )
-    if add_interactive_options:
-        arg_parser.add_argument(
-            "-i",
-            "--interactive",
-            action="store_true",
-            help="parse a crash log and load it in a ScriptedProcess",
-            default=False,
-        )
-        arg_parser.add_argument(
-            "-b",
-            "--batch",
-            action="store_true",
-            help="dump symbolicated stackframes without creating a debug session",
-            default=True,
-        )
-        arg_parser.add_argument(
-            "--target",
-            "-t",
-            dest="target_path",
-            help="the target binary path that should be used for interactive crashlog (optional)",
-            default=None,
-        )
-        arg_parser.add_argument(
-            "--skip-status",
-            "-s",
-            dest="skip_status",
-            action="store_true",
-            help="prevent the interactive crashlog to dump the process status and thread backtrace at launch",
-            default=False,
-        )
+    arg_parser.add_argument(
+        "--target",
+        "-t",
+        dest="target_path",
+        help="the target binary path that should be used for interactive crashlog (optional)",
+        default=None,
+    )
+    arg_parser.add_argument(
+        "--skip-status",
+        "-s",
+        dest="skip_status",
+        action="store_true",
+        help="prevent the interactive crashlog to dump the process status and thread backtrace at launch",
+        default=False,
+    )
+    legacy_group = arg_parser.add_mutually_exclusive_group()
+    legacy_group.add_argument(
+        "-i",
+        "--interactive",
+        action="store_true",
+        help=argparse.SUPPRESS,
+    )
+    legacy_group.add_argument(
+        "-b",
+        "--batch",
+        action="store_true",
+        help=argparse.SUPPRESS,
+    )
+
     return arg_parser
 
 
@@ -1778,7 +1786,7 @@ for use at the LLDB command line. After a crash log has been parsed and symbolic
 created that has all of the shared libraries loaded at the load addresses found in the crash log file. This allows
 you to explore the program as if it were stopped at the locations described in the crash log and functions can
 be disassembled and lookups can be performed using the addresses found in the crash log."""
-    return CreateSymbolicateCrashLogOptions("crashlog", description, True)
+    return CreateSymbolicateCrashLogOptions("crashlog", description)
 
 
 def SymbolicateCrashLogs(debugger, command_args, result, is_command):
@@ -1794,8 +1802,35 @@ def SymbolicateCrashLogs(debugger, command_args, result, is_command):
         result.SetError(str(e))
         return
 
+    # To avoid breaking existing users, we should keep supporting legacy flags
+    # even if we don't use them / advertise them anymore.
+    if not options.mode:
+        if options.batch:
+            options.mode = CrashLogLoadingMode.batch
+        else:
+            options.mode = CrashLogLoadingMode.interactive
+
+    if options.mode != CrashLogLoadingMode.interactive and (
+        options.target_path or options.skip_status
+    ):
+        print(
+            "Target path (-t) and skipping process status (-s) options can only used in interactive mode (-m=interactive)."
+        )
+        print("Aborting symbolication.")
+        arg_parser.print_help()
+        return
+
+    if options.version:
+        print(debugger.GetVersionString())
+        return
+
+    if options.debug:
+        print("command_args = %s" % command_args)
+        print("options", options)
+        print("args", options.reports)
+
     # Interactive mode requires running the crashlog command from inside lldb.
-    if options.interactive and not is_command:
+    if options.mode == CrashLogLoadingMode.interactive and not is_command:
         lldb_exec = (
             subprocess.check_output(["/usr/bin/xcrun", "-f", "lldb"])
             .decode("utf-8")
@@ -1821,31 +1856,26 @@ def SymbolicateCrashLogs(debugger, command_args, result, is_command):
         print(debugger.GetVersionString())
         return
 
-    if options.debug:
-        print("command_args = %s" % command_args)
-        print("options", options)
-        print("args", options.reports)
-
     if options.debug_delay > 0:
         print("Waiting %u seconds for debugger to attach..." % options.debug_delay)
         time.sleep(options.debug_delay)
     error = lldb.SBError()
 
     def should_run_in_interactive_mode(options, ci):
-        if options.interactive:
+        if options.mode == CrashLogLoadingMode.batch:
+            return False
+        elif options.mode == CrashLogLoadingMode.interactive or (
+            ci and ci.IsInteractive()
+        ):
             return True
-        elif options.batch:
-            return False
-        # elif ci and ci.IsInteractive():
-        #     return True
         else:
-            return False
+            return sys.stdout.isatty()
 
     ci = debugger.GetCommandInterpreter()
 
     if options.reports:
         for crashlog_file in options.reports:
-            crashlog_path = os.path.expanduser(crashlog_file)
+            crashlog_path = os.path.normpath(os.path.expanduser(crashlog_file))
             if not os.path.exists(crashlog_path):
                 raise FileNotFoundError(
                     "crashlog file %s does not exist" % crashlog_path
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/altered_threadState.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/altered_threadState.test
index 5a946a38b195..d925324822de 100644
--- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/altered_threadState.test
+++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/altered_threadState.test
@@ -1,7 +1,7 @@
 # RUN: %clang_host -g %S/Inputs/test.c -o %t.out
 # RUN: cp %S/Inputs/altered_threadState.crash %t.crash
 # RUN: %python %S/patch-crashlog.py --binary %t.out --crashlog %t.crash --offsets '{"main":20, "bar":9, "foo":16}'
-# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog %t.crash' 2>&1 | FileCheck %s
+# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog -b %t.crash' 2>&1 | FileCheck %s
 
 # CHECK: "crashlog" {{.*}} commands have been installed, use the "--help" options on these commands
 
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/json.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/json.test
index c2e23e82be7f..d5c6d915316e 100644
--- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/json.test
+++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/json.test
@@ -2,12 +2,12 @@
 
 # RUN: cp %S/Inputs/a.out.ips %t.crash
 # RUN: %python %S/patch-crashlog.py --binary %t.out --crashlog %t.crash --offsets '{"main":20, "bar":9, "foo":16}' --json
-# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog %t.crash' 2>&1 | FileCheck %s
-# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog -c %t.crash' 2>&1 | FileCheck %s
+# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog --mode batch %t.crash' 2>&1 | FileCheck %s
+# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog --mode batch -c %t.crash' 2>&1 | FileCheck %s
 
 # RUN: cp %S/Inputs/a.out.ips %t.nometadata.crash
 # RUN: %python %S/patch-crashlog.py --binary %t.out --crashlog %t.nometadata.crash --offsets '{"main":20, "bar":9, "foo":16}' --json --no-metadata
-# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog %t.nometadata.crash' 2>&1 | FileCheck %s
+# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog --mode batch %t.nometadata.crash' 2>&1 | FileCheck %s
 
 # CHECK: "crashlog" {{.*}} commands have been installed, use the "--help" options on these commands
 
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/no_threadState.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/no_threadState.test
index 5b5cef40716c..2e4b46c8c240 100644
--- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/no_threadState.test
+++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/no_threadState.test
@@ -2,7 +2,7 @@
 
 # RUN: cp %S/Inputs/no_threadState.ips %t.crash
 # RUN: %python %S/patch-crashlog.py --binary %t.out --crashlog %t.crash --offsets '{"main":20, "bar":9, "foo":16}' --json
-# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog %t.crash' 2>&1 | FileCheck %s
+# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog --mode batch %t.crash' 2>&1 | FileCheck %s
 
 # CHECK: "crashlog" {{.*}} commands have been installed, use the "--help" options on these commands
 
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/skipped_status_interactive_crashlog.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/skipped_status_interactive_crashlog.test
index 64cd0904371a..52a185b8cf76 100644
--- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/skipped_status_interactive_crashlog.test
+++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/skipped_status_interactive_crashlog.test
@@ -3,7 +3,7 @@
 # RUN: mkdir -p %t.dir
 # RUN: yaml2obj %S/Inputs/interactive_crashlog/multithread-test.yaml > %t.dir/multithread-test
 # RUN: %lldb -b -o 'command script import lldb.macosx.crashlog' \
-# RUN: -o 'crashlog -a -i -s -t %t.dir/multithread-test %S/Inputs/interactive_crashlog/multithread-test.ips' \
+# RUN: -o 'crashlog -a -s -t %t.dir/multithread-test %S/Inputs/interactive_crashlog/multithread-test.ips' \
 # RUN: -o 'command source -s 0 %s' 2>&1 | FileCheck %s
 
 # CHECK: "crashlog" {{.*}} commands have been installed, use the "--help" options on these commands
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/text.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/text.test
index e9d1c5e98fb3..eec30a1da64c 100644
--- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/text.test
+++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/text.test
@@ -1,7 +1,7 @@
 # RUN: %clang_host -g %S/Inputs/test.c -o %t.out
 # RUN: cp %S/Inputs/a.out.crash %t.crash
 # RUN: %python %S/patch-crashlog.py --binary %t.out --crashlog %t.crash --offsets '{"main":20, "bar":9, "foo":16}'
-# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog %t.crash' 2>&1 | FileCheck %s
+# RUN: %lldb %t.out -o 'command script import lldb.macosx.crashlog' -o 'crashlog -b %t.crash' 2>&1 | FileCheck %s
 
 # CHECK: "crashlog" {{.*}} commands have been installed, use the "--help" options on these commands
 

From 590066bee70db37636311881c5b232464d6d4aec Mon Sep 17 00:00:00 2001
From: Rajat Bajpai <rbajpai@nvidia.com>
Date: Thu, 19 Jun 2025 12:18:17 +0530
Subject: [PATCH 0891/1322] [NVPTX] Add family-specific architectures support
 (#141899)

This change adds family-specific architecture variants support added in [PTX ISA
8.8](https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-version-8-8).
These architecture variants have "f" suffix. For example, sm_100f.

This change doesn't promote existing features to family-specific
architecture.
---
 llvm/docs/NVPTXUsage.rst                | 50 ++++++++++++++++
 llvm/lib/Target/NVPTX/NVPTX.td          | 76 +++++++++++++++++++++----
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td |  8 +--
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h  | 42 +++++++++-----
 llvm/test/CodeGen/NVPTX/sm-version.ll   | 20 +++++++
 5 files changed, 168 insertions(+), 28 deletions(-)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index abd7ca545364..11017fe4e01b 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -147,7 +147,57 @@ Example: 32-bit PTX for CUDA Driver API: ``nvptx-nvidia-cuda``
 
 Example: 64-bit PTX for CUDA Driver API: ``nvptx64-nvidia-cuda``
 
+.. _nvptx_arch_hierarchy:
 
+NVPTX Architecture Hierarchy and Ordering
+=========================================
+
+GPU architectures: sm_2Y/sm_3Y/sm_5Y/sm_6Y/sm_7Y/sm_8Y/sm_9Y/sm_10Y/sm_12Y
+('Y' represents version within the architecture)
+The architectures have name of form ``sm_XYz`` where ``X`` represent the generation
+number, ``Y`` represents the version within the architecture, and ``z`` represents
+the optional feature suffix.
+If ``X1Y1 <= X2Y2``, then GPU capabilities of ``sm_X1Y1`` are included in ``sm_X2Y2``.
+For example, take ``sm_90`` (9 represents ``X``, 0 represents ``Y``, and no feature
+suffix) and ``sm_103`` architectures (10 represents ``X``, 3 represents ``Y``, and no
+feature suffix). Since 90 <= 103, ``sm_90`` is compatible with ``sm_103``.
+
+The family-specific variants have ``f`` feature suffix and they follow
+following order:
+``sm_X{Y2}f > sm_X{Y1}f`` iff ``Y2 > Y1``
+``sm_XY{f} > sm_{XY}{}``
+
+For example, take ``sm_100f`` (10 represents ``X``, 0 represents ``Y``, and ``f``
+represents ``z``) and ``sm_103f`` (10 represents ``X``, 3 represents ``Y``, and ``f``
+represents ``z``) architecture variants. Since ``Y1 < Y2``, ``sm_100f`` is compatible with
+``sm_103f``. Similarly based on the second rule, ``sm_90`` is compatible with ``sm_103f``.
+
+Some counter examples, take ``sm_100f`` and ``sm_120f`` (12 represents ``X``, 0
+represents ``Y``, and ``f`` represents ``z``) architecture variants. Since both
+belongs to different family i.e. ``X1 != X2``, ``sm_100f`` is not compatible with
+``sm_120f``.
+
+The architecture-specific variants have ``a`` feature suffix and they follow
+following order:
+``sm_XY{a} > sm_XY{f} > sm_{XY}{}``
+
+For example, take ``sm_103a`` (10 represents ``X``, 3 represents ``Y``, and ``a``
+represents ``z``), ``sm_103f``, and ``sm_103`` architecture variants. The ``sm_103`` is
+compatible with ``sm_103a`` and ``sm_103f``, and ``sm_103f`` is compatible with ``sm_103a``.
+
+Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a')
+Arch := X * 10 + Y
+
+For example, ``sm_103f`` is encoded as 1032 (103 * 10 + 2) and ``sm_103a`` is
+encoded as 1033 (103 * 10 + 2 + 1).
+
+This encoding allows simple partial ordering of the architectures.
+
+* Compare Family and Arch by dividing FullSMVersion by 100 and 10
+  respectively before the comparison.
+* Compare within the family by comparing FullSMVersion, given both belongs to
+  the same family.
+* Detect ``a`` variants by checking FullSMVersion & 1.
 
 .. _nvptx_intrinsics:
 
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index ff9a187ecf72..83992606bc41 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -33,20 +33,69 @@ class FeaturePTX<int version>:
    SubtargetFeature<"ptx"# version, "PTXVersion",
                     "" # version,
                     "Use PTX version " # version>;
-
+// NVPTX Architecture Hierarchy and Ordering:
+//
+// GPU architectures: sm_2Y/sm_3Y/sm_5Y/sm_6Y/sm_7Y/sm_8Y/sm_9Y/sm_10Y/sm_12Y
+// ('Y' represents version within the architecture)
+// The architectures have name of form sm_XYz where 'X' represent the generation
+// number, 'Y' represents the version within the architecture, and 'z' represents
+// the optional feature suffix.
+// If X1Y1 <= X2Y2, then GPU capabilities of sm_X1Y1 are included in sm_X2Y2.
+// For example, take sm_90 (9 represents 'X', 0 represents 'Y', and no feature
+// suffix) and sm_103 architectures (10 represents 'X', 3 represents 'Y', and no
+// feature suffix). Since 90 <= 103, sm_90 is compatible with sm_103.
+//
+// The family-specific variants have 'f' feature suffix and they follow
+// following order:
+// sm_X{Y2}f > sm_X{Y1}f iff Y2 > Y1
+// sm_XY{f} > sm_{XY}{}
+//
+// For example, take sm_100f (10 represents 'X', 0 represents 'Y', and 'f'
+// represents 'z') and sm_103f (10 represents 'X', 3 represents 'Y', and 'f'
+// represents 'z') architecture variants. Since Y1 < Y2, sm_100f is compatible with
+// sm_103f. Similarly based on the second rule, sm_90 is compatible with sm_103f.
+//
+// Some counter examples, take sm_100f and sm_120f (12 represents 'X', 0
+// represents 'Y', and 'f' represents 'z') architecture variants. Since both
+// belongs to different family i.e. X1 != X2, sm_100f is not compatible with
+// sm_120f.
+//
+// The architecture-specific variants have 'a' feature suffix and they follow
+// following order:
+// sm_XY{a} > sm_XY{f} > sm_{XY}{}
+//
+// For example, take sm_103a (10 represents 'X', 3 represents 'Y', and 'a'
+// represents 'z'), sm_103f, and sm_103 architecture variants. The sm_103 is
+// compatible with sm_103a and sm_103f, and sm_103f is compatible with sm_103a.
+//
+// Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a')
+// Arch := X * 10 + Y
+//
+// For example, sm_103a is encoded as 1033 (103 * 10 + 2 + 1) and sm_103f is
+// encoded as 1032 (103 * 10 + 2).
+//
+// This encoding allows simple partial ordering of the architectures.
+//  + Compare Family and Arch by dividing FullSMVersion by 100 and 10
+//    respectively before the comparison.
+//  + Compare within the family by comparing FullSMVersion, given both belongs to
+//    the same family.
+//  + Detect 'a' variants by checking FullSMVersion & 1.
 foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
               60, 61, 62, 70, 72, 75, 80, 86, 87,
-              89, 90, 100, 101, 103, 120, 121] in
-  def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
+              89, 90, 100, 101, 103, 120, 121] in {
+  // Base SM version (e.g. FullSMVersion for sm_100 is 1000)
+  def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>;
 
-// Arch-specific targets. PTX for these is not compatible with any other
-// architectures.
-def SM90a : FeatureSM<"90a", 901>;
-def SM100a: FeatureSM<"100a", 1001>;
-def SM101a: FeatureSM<"101a", 1011>;
-def SM103a: FeatureSM<"103a", 1031>;
-def SM120a: FeatureSM<"120a", 1201>;
-def SM121a: FeatureSM<"121a", 1211>;
+  // Family-specific targets which are compatible within same family
+  // (e.g. FullSMVersion for sm_100f is 1002)
+  if !ge(sm, 100) then
+    def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>;
+
+  // Architecture-specific targets which are incompatible across architectures
+  // (e.g. FullSMVersion for sm_100a is 1003)
+  if !ge(sm, 90) then
+    def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>;
+}
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
                    70, 71, 72, 73, 74, 75, 76, 77, 78,
@@ -83,14 +132,19 @@ def : Proc<"sm_90",   [SM90, PTX78]>;
 def : Proc<"sm_90a",  [SM90a, PTX80]>;
 def : Proc<"sm_100",  [SM100, PTX86]>;
 def : Proc<"sm_100a", [SM100a, PTX86]>;
+def : Proc<"sm_100f", [SM100f, PTX88]>;
 def : Proc<"sm_101",  [SM101, PTX86]>;
 def : Proc<"sm_101a", [SM101a, PTX86]>;
+def : Proc<"sm_101f", [SM101f, PTX88]>;
 def : Proc<"sm_103",  [SM103, PTX88]>;
 def : Proc<"sm_103a", [SM103a, PTX88]>;
+def : Proc<"sm_103f", [SM103f, PTX88]>;
 def : Proc<"sm_120",  [SM120, PTX87]>;
 def : Proc<"sm_120a", [SM120a, PTX87]>;
+def : Proc<"sm_120f", [SM120f, PTX88]>;
 def : Proc<"sm_121",  [SM121, PTX88]>;
 def : Proc<"sm_121a", [SM121a, PTX88]>;
+def : Proc<"sm_121f", [SM121f, PTX88]>;
 
 def NVPTXInstrInfo : InstrInfo {
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5dbdce52f055..bbe99dec5c44 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -158,10 +158,10 @@ class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>
 class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
 
 // Explicit records for arch-accelerated SM versions
-def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;
-def hasSM100a : Predicate<"Subtarget->getFullSmVersion() == 1001">;
-def hasSM101a : Predicate<"Subtarget->getFullSmVersion() == 1011">;
-def hasSM120a : Predicate<"Subtarget->getFullSmVersion() == 1201">;
+def hasSM90a : Predicate<"Subtarget->getSmVersion() == 90 && Subtarget->hasArchAccelFeatures()">;
+def hasSM100a : Predicate<"Subtarget->getSmVersion() == 100 && Subtarget->hasArchAccelFeatures()">;
+def hasSM101a : Predicate<"Subtarget->getSmVersion() == 101 && Subtarget->hasArchAccelFeatures()">;
+def hasSM120a : Predicate<"Subtarget->getSmVersion() == 120 && Subtarget->hasArchAccelFeatures()">;
 
 // non-sync shfl instructions are not available on sm_70+ in PTX6.4+
 def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index d2eae4882682..8810feaee297 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -108,8 +108,8 @@ public:
     switch (FullSmVersion) {
     default:
       break;
-    case 1001: // sm_100a
-    case 1011: // sm_101a
+    case 1003: // sm_100a
+    case 1013: // sm_101a
       HasTcgen05 = true;
       break;
     }
@@ -120,9 +120,15 @@ public:
   // TMA G2S copy with cta_group::1/2 support
   bool hasCpAsyncBulkTensorCTAGroupSupport() const {
     // TODO: Update/tidy-up after the family-conditional support arrives
-    return ((FullSmVersion == 1001 || FullSmVersion == 1011) &&
-            PTXVersion >= 86) ||
-           (FullSmVersion == 1031 && PTXVersion >= 88);
+    switch (FullSmVersion) {
+    case 1003:
+    case 1013:
+      return PTXVersion >= 86;
+    case 1033:
+      return PTXVersion >= 88;
+    default:
+      return false;
+    }
   }
 
   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
@@ -136,14 +142,24 @@ public:
   bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
   unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
-  // GPUs with "a" suffix have include architecture-accelerated features that
-  // are supported on the specified architecture only, hence such targets do not
-  // follow the onion layer model. hasArchAccelFeatures() allows
-  // distinguishing such GPU variants from the base GPU architecture.
-  // - 0 represents base GPU model,
-  // - non-zero value identifies particular architecture-accelerated variant.
-  bool hasArchAccelFeatures() const { return getFullSmVersion() % 10; }
-
+  // GPUs with "a" suffix have architecture-accelerated features that are
+  // supported on the specified architecture only, hence such targets do not
+  // follow the onion layer model. hasArchAccelFeatures() allows distinguishing
+  // such GPU variants from the base GPU architecture.
+  // - false represents non-accelerated architecture.
+  // - true represents architecture-accelerated variant.
+  bool hasArchAccelFeatures() const {
+    return (getFullSmVersion() & 1) && PTXVersion >= 80;
+  }
+  // GPUs with 'f' suffix have architecture-accelerated features which are
+  // portable across all future architectures under same SM major. For example,
+  // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
+  // - false represents non-family-specific architecture.
+  // - true represents family-specific variant.
+  bool hasFamilySpecificFeatures() const {
+    return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
+                                        : hasArchAccelFeatures();
+  }
   // If the user did not provide a target we default to the `sm_30` target.
   std::string getTargetName() const {
     return TargetName.empty() ? "sm_30" : TargetName;
diff --git a/llvm/test/CodeGen/NVPTX/sm-version.ll b/llvm/test/CodeGen/NVPTX/sm-version.ll
index 9705a2f3ba73..3a154a1b9ac9 100644
--- a/llvm/test/CodeGen/NVPTX/sm-version.ll
+++ b/llvm/test/CodeGen/NVPTX/sm-version.ll
@@ -18,14 +18,19 @@
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100 | FileCheck %s --check-prefix=SM100
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100a | FileCheck %s --check-prefix=SM100a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100f | FileCheck %s --check-prefix=SM100f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101 | FileCheck %s --check-prefix=SM101
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101a | FileCheck %s --check-prefix=SM101a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101f | FileCheck %s --check-prefix=SM101f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103 | FileCheck %s --check-prefix=SM103
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120 | FileCheck %s --check-prefix=SM120
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121 | FileCheck %s --check-prefix=SM121
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121a | FileCheck %s --check-prefix=SM121a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121f | FileCheck %s --check-prefix=SM121f
 
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_21 | FileCheck %s --check-prefix=SM21
@@ -47,14 +52,19 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefix=SM100
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a | FileCheck %s --check-prefix=SM100a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f | FileCheck %s --check-prefix=SM100f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101 | FileCheck %s --check-prefix=SM101
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a | FileCheck %s --check-prefix=SM101a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101f | FileCheck %s --check-prefix=SM101f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103 | FileCheck %s --check-prefix=SM103
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120 | FileCheck %s --check-prefix=SM120
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121 | FileCheck %s --check-prefix=SM121
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121a | FileCheck %s --check-prefix=SM121a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121f | FileCheck %s --check-prefix=SM121f
 
 ; SM20: .version 3.2
 ; SM21: .version 3.2
@@ -76,14 +86,19 @@
 ; SM90a: .version 8.0
 ; SM100: .version 8.6
 ; SM100a: .version 8.6
+; SM100f: .version 8.8
 ; SM101: .version 8.6
 ; SM101a: .version 8.6
+; SM101f: .version 8.8
 ; SM103: .version 8.8
 ; SM103a: .version 8.8
+; SM103f: .version 8.8
 ; SM120: .version 8.7
 ; SM120a: .version 8.7
+; SM120f: .version 8.8
 ; SM121: .version 8.8
 ; SM121a: .version 8.8
+; SM121f: .version 8.8
 
 ; SM20: .target sm_20
 ; SM21: .target sm_21
@@ -105,11 +120,16 @@
 ; SM90a: .target sm_90a
 ; SM100: .target sm_100
 ; SM100a: .target sm_100a
+; SM100f: .target sm_100f
 ; SM101: .target sm_101
 ; SM101a: .target sm_101a
+; SM101f: .target sm_101f
 ; SM103: .target sm_103
 ; SM103a: .target sm_103a
+; SM103f: .target sm_103f
 ; SM120: .target sm_120
 ; SM120a: .target sm_120a
+; SM120f: .target sm_120f
 ; SM121: .target sm_121
 ; SM121a: .target sm_121a
+; SM121f: .target sm_121f

From 03461c9c6e21e43a6e1c699bfb254ddb3d575c93 Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang <hsiangkai.wang@arm.com>
Date: Thu, 19 Jun 2025 07:56:30 +0100
Subject: [PATCH 0892/1322] [mlir][gpu][spirv] Remove rotation semantics of
 gpu.shuffle up/down (#139105)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From the description of gpu.shuffle operation, shuffle up/down rotates
values in the subgroup because it applies modulo on the shifted value to
calculate the result lane ID. It is inconsistent with the definition of
SPIR-V shuffle up/down and NVVM data movement definitions within
subgroup.

In NVVM, it says

"If the computed source lane index j is in range, the returned i32 value
will be the value of %a from lane j; otherwise, it will be the the value
of %a from the current thread."

It will keep the original value if the result land ID is out of range.

In SPIR-V OpGroupNonUniformShuffleUp and OpGroupNonUniformShuffleDown,
it says

"The resulting value is undefined if Delta is greater than the current
invocation’s id within the scope or if the identified invocation is not
in scope restricted tangle."

It's an undefined value if the result land ID is out of range.

Anyway, there is no circular movement in shuffle up/down from these 2
specifications. This patch removes the circular movement in gpu.shuffle
up/down and lower gpu.shuffle up/down to SPIR-V
OpGroupNonUniformShuffleUp and OpGroupNonUniformShuffleDown directly.

Reference:

https://docs.nvidia.com/cuda/archive/12.2.1/nvvm-ir-spec/index.html#data-movement

https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpGroupNonUniformShuffleUp

https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpGroupNonUniformShuffleDown
---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |  6 +-
 mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp | 45 ++++++++++--
 mlir/test/Conversion/GPUToSPIRV/shuffle.mlir  | 71 ++++++++++++++++++-
 3 files changed, 111 insertions(+), 11 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 15b14c767b66..a81b2e83ddef 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1332,7 +1332,8 @@ def GPU_ShuffleOp : GPU_Op<
     %3, %4 = gpu.shuffle down %0, %cst1, %width : f32
     ```
 
-    For lane `k`, returns the value from lane `(k + 1) % width`.
+    For lane `k`, returns the value from lane `(k + cst1)`. If `(k + cst1)` is
+    bigger than or equal to `width`, the value is poison and `valid` is `false`.
 
     `up` example:
 
@@ -1341,7 +1342,8 @@ def GPU_ShuffleOp : GPU_Op<
     %5, %6 = gpu.shuffle up %0, %cst1, %width : f32
     ```
 
-    For lane `k`, returns the value from lane `(k - 1) % width`.
+    For lane `k`, returns the value from lane `(k - cst1)`. If `(k - cst1)` is
+    smaller than `0`, the value is poison and `valid` is `false`.
 
     `idx` example:
 
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
index 46db5d3fdca3..93c76d267c51 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
@@ -435,26 +435,57 @@ LogicalResult GPUShuffleConversion::matchAndRewrite(
     return rewriter.notifyMatchFailure(
         shuffleOp, "shuffle width and target subgroup size mismatch");
 
+  assert(!adaptor.getOffset().getType().isSignedInteger() &&
+         "shuffle offset must be a signless/unsigned integer");
+
   Location loc = shuffleOp.getLoc();
-  Value trueVal = spirv::ConstantOp::getOne(rewriter.getI1Type(),
-                                            shuffleOp.getLoc(), rewriter);
   auto scope = rewriter.getAttr<spirv::ScopeAttr>(spirv::Scope::Subgroup);
   Value result;
+  Value validVal;
 
   switch (shuffleOp.getMode()) {
-  case gpu::ShuffleMode::XOR:
+  case gpu::ShuffleMode::XOR: {
     result = rewriter.create<spirv::GroupNonUniformShuffleXorOp>(
         loc, scope, adaptor.getValue(), adaptor.getOffset());
+    validVal = spirv::ConstantOp::getOne(rewriter.getI1Type(),
+                                         shuffleOp.getLoc(), rewriter);
     break;
-  case gpu::ShuffleMode::IDX:
+  }
+  case gpu::ShuffleMode::IDX: {
     result = rewriter.create<spirv::GroupNonUniformShuffleOp>(
         loc, scope, adaptor.getValue(), adaptor.getOffset());
+    validVal = spirv::ConstantOp::getOne(rewriter.getI1Type(),
+                                         shuffleOp.getLoc(), rewriter);
     break;
-  default:
-    return rewriter.notifyMatchFailure(shuffleOp, "unimplemented shuffle mode");
+  }
+  case gpu::ShuffleMode::DOWN: {
+    result = rewriter.create<spirv::GroupNonUniformShuffleDownOp>(
+        loc, scope, adaptor.getValue(), adaptor.getOffset());
+
+    Value laneId = rewriter.create<gpu::LaneIdOp>(loc, widthAttr);
+    Value resultLaneId =
+        rewriter.create<arith::AddIOp>(loc, laneId, adaptor.getOffset());
+    validVal = rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
+                                              resultLaneId, adaptor.getWidth());
+    break;
+  }
+  case gpu::ShuffleMode::UP: {
+    result = rewriter.create<spirv::GroupNonUniformShuffleUpOp>(
+        loc, scope, adaptor.getValue(), adaptor.getOffset());
+
+    Value laneId = rewriter.create<gpu::LaneIdOp>(loc, widthAttr);
+    Value resultLaneId =
+        rewriter.create<arith::SubIOp>(loc, laneId, adaptor.getOffset());
+    auto i32Type = rewriter.getIntegerType(32);
+    validVal = rewriter.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::sge, resultLaneId,
+        rewriter.create<arith::ConstantOp>(
+            loc, i32Type, rewriter.getIntegerAttr(i32Type, 0)));
+    break;
+  }
   }
 
-  rewriter.replaceOp(shuffleOp, {result, trueVal});
+  rewriter.replaceOp(shuffleOp, {result, validVal});
   return success();
 }
 
diff --git a/mlir/test/Conversion/GPUToSPIRV/shuffle.mlir b/mlir/test/Conversion/GPUToSPIRV/shuffle.mlir
index d3d8ec0dab40..e93f69704f25 100644
--- a/mlir/test/Conversion/GPUToSPIRV/shuffle.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/shuffle.mlir
@@ -15,8 +15,8 @@ gpu.module @kernels {
 
     // CHECK: %[[MASK:.+]] = spirv.Constant 8 : i32
     // CHECK: %[[VAL:.+]] = spirv.Constant 4.200000e+01 : f32
-    // CHECK: %{{.+}} = spirv.Constant true
     // CHECK: %{{.+}} = spirv.GroupNonUniformShuffleXor <Subgroup> %[[VAL]], %[[MASK]] : f32, i32
+    // CHECK: %{{.+}} = spirv.Constant true
     %result, %valid = gpu.shuffle xor %val, %mask, %width : f32
     gpu.return
   }
@@ -64,11 +64,78 @@ gpu.module @kernels {
 
     // CHECK: %[[MASK:.+]] = spirv.Constant 8 : i32
     // CHECK: %[[VAL:.+]] = spirv.Constant 4.200000e+01 : f32
-    // CHECK: %{{.+}} = spirv.Constant true
     // CHECK: %{{.+}} = spirv.GroupNonUniformShuffle <Subgroup> %[[VAL]], %[[MASK]] : f32, i32
+    // CHECK: %{{.+}} = spirv.Constant true
     %result, %valid = gpu.shuffle idx %val, %mask, %width : f32
     gpu.return
   }
 }
 
 }
+
+// -----
+
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle, GroupNonUniformShuffleRelative], []>,
+    #spirv.resource_limits<subgroup_size = 16>>
+} {
+
+gpu.module @kernels {
+  // CHECK-LABEL:  spirv.func @shuffle_down()
+  gpu.func @shuffle_down() kernel
+    attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} {
+    %offset = arith.constant 4 : i32
+    %width = arith.constant 16 : i32
+    %val = arith.constant 42.0 : f32
+
+    // CHECK: %[[OFFSET:.+]] = spirv.Constant 4 : i32
+    // CHECK: %[[WIDTH:.+]] = spirv.Constant 16 : i32
+    // CHECK: %[[VAL:.+]] = spirv.Constant 4.200000e+01 : f32
+    // CHECK: %{{.+}} = spirv.GroupNonUniformShuffleDown <Subgroup> %[[VAL]], %[[OFFSET]] : f32, i32
+
+    // CHECK: %[[INVOCATION_ID_ADDR:.+]] = spirv.mlir.addressof @__builtin__SubgroupLocalInvocationId__ : !spirv.ptr<i32, Input>
+    // CHECK: %[[LANE_ID:.+]] = spirv.Load "Input" %[[INVOCATION_ID_ADDR]] : i32
+    // CHECK: %[[VAL_LANE_ID:.+]] = spirv.IAdd %[[LANE_ID]], %[[OFFSET]] : i32
+    // CHECK: %[[VALID:.+]] = spirv.ULessThan %[[VAL_LANE_ID]], %[[WIDTH]] : i32
+
+    %result, %valid = gpu.shuffle down %val, %offset, %width : f32
+    gpu.return
+  }
+}
+
+}
+
+// -----
+
+module attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle, GroupNonUniformShuffleRelative], []>,
+    #spirv.resource_limits<subgroup_size = 16>>
+} {
+
+gpu.module @kernels {
+  // CHECK-LABEL:  spirv.func @shuffle_up()
+  gpu.func @shuffle_up() kernel
+    attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} {
+    %offset = arith.constant 4 : i32
+    %width = arith.constant 16 : i32
+    %val = arith.constant 42.0 : f32
+
+    // CHECK: %[[OFFSET:.+]] = spirv.Constant 4 : i32
+    // CHECK: %[[WIDTH:.+]] = spirv.Constant 16 : i32
+    // CHECK: %[[VAL:.+]] = spirv.Constant 4.200000e+01 : f32
+    // CHECK: %{{.+}} = spirv.GroupNonUniformShuffleUp <Subgroup> %[[VAL]], %[[OFFSET]] : f32, i32
+
+    // CHECK: %[[INVOCATION_ID_ADDR:.+]] = spirv.mlir.addressof @__builtin__SubgroupLocalInvocationId__ : !spirv.ptr<i32, Input>
+    // CHECK: %[[LANE_ID:.+]] = spirv.Load "Input" %[[INVOCATION_ID_ADDR]] : i32
+    // CHECK: %[[VAL_LANE_ID:.+]] = spirv.ISub %[[LANE_ID]], %[[OFFSET]] : i32
+    // CHECK: %[[CST0:.+]] = spirv.Constant 0 : i32
+    // CHECK: %[[VALID:.+]] = spirv.SGreaterThanEqual %[[VAL_LANE_ID]], %[[CST0]] : i32
+
+    %result, %valid = gpu.shuffle up %val, %offset, %width : f32
+    gpu.return
+  }
+}
+
+}

From 2c2ad9a096e78e9129f8cb2d4ee260eb7e67473f Mon Sep 17 00:00:00 2001
From: Younan Zhang <zyn7109@gmail.com>
Date: Thu, 19 Jun 2025 14:59:32 +0800
Subject: [PATCH 0893/1322] Reapply "[Clang] Profile singly-resolved
 UnresolvedLookupExpr with the declaration" (#140680)

For a dependent variable template specialization, we don't build a
dependent Decl node or a DeclRefExpr to represent it. Instead, we
preserve the UnresolvedLookupExpr until instantiation.

However, this approach isn't ideal for constraint normalization. We
consider the qualifier during profiling, but since that's based on the
written code, it can introduce confusing differences, even when the
expressions resolve to the same declaration.

This change profiles the underlying VarTemplateDecl if
UnresolvedLookupExpr is used to model a dependent use of it.

Fixes https://github.com/llvm/llvm-project/issues/139476
---
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/lib/AST/StmtProfile.cpp                 | 10 +++++++--
 clang/test/SemaCXX/exception-spec.cpp         | 21 +++++++++++++++++++
 .../SemaTemplate/concepts-out-of-line-def.cpp | 15 +++++++++++++
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 18234188101f..dd748ab06873 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -852,6 +852,7 @@ Bug Fixes to C++ Support
 - Fixed the handling of pack indexing types in the constraints of a member function redeclaration. (#GH138255)
 - Clang now correctly parses arbitrary order of ``[[]]``, ``__attribute__`` and ``alignas`` attributes for declarations (#GH133107)
 - Fixed a crash when forming an invalid function type in a dependent context. (#GH138657) (#GH115725) (#GH68852)
+- Fixed a function declaration mismatch that caused inconsistencies between concepts and variable template declarations. (#GH139476)
 - Clang no longer segfaults when there is a configuration mismatch between modules and their users (http://crbug.com/400353616).
 - Fix an incorrect deduction when calling an explicit object member function template through an overload set address.
 - Fixed bug in constant evaluation that would allow using the value of a
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index c666d966a6e5..c61450e19f1b 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -2189,8 +2189,14 @@ StmtProfiler::VisitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *S) {
 
 void StmtProfiler::VisitOverloadExpr(const OverloadExpr *S) {
   VisitExpr(S);
-  VisitNestedNameSpecifier(S->getQualifier());
-  VisitName(S->getName(), /*TreatAsDecl*/ true);
+  bool DescribingDependentVarTemplate =
+      S->getNumDecls() == 1 && isa<VarTemplateDecl>(*S->decls_begin());
+  if (DescribingDependentVarTemplate) {
+    VisitDecl(*S->decls_begin());
+  } else {
+    VisitNestedNameSpecifier(S->getQualifier());
+    VisitName(S->getName(), /*TreatAsDecl*/ true);
+  }
   ID.AddBoolean(S->hasExplicitTemplateArgs());
   if (S->hasExplicitTemplateArgs())
     VisitTemplateArguments(S->getTemplateArgs(), S->getNumTemplateArgs());
diff --git a/clang/test/SemaCXX/exception-spec.cpp b/clang/test/SemaCXX/exception-spec.cpp
index 6ad19aab397b..31c691b28da4 100644
--- a/clang/test/SemaCXX/exception-spec.cpp
+++ b/clang/test/SemaCXX/exception-spec.cpp
@@ -52,3 +52,24 @@ namespace AssignmentOp {
     D2 &operator=(const D2&); // expected-error {{more lax}}
   };
 }
+
+namespace OverloadedFunctions {
+
+template <typename T>
+void f(T&) noexcept;
+
+template <typename T, int N>
+void f(T (&arr)[N]) noexcept(noexcept(f(*arr)));
+
+template <typename T>
+inline void f(T&) noexcept {}
+
+template <typename T, int N>
+inline void f(T (&arr)[N]) noexcept(noexcept(f(*arr))) {}
+
+void g() {
+    int x[1];
+    f(x);
+}
+
+}
diff --git a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp
index e5d00491d3fb..bf505dec0ca1 100644
--- a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp
+++ b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp
@@ -853,3 +853,18 @@ template <int... Ts>
 requires C<Ts...[0]>
 auto TplClass<int>::buggy() -> void {}
 }
+
+namespace GH139476 {
+
+namespace moo {
+  template <typename T>
+  constexpr bool baa = true;
+
+  template <typename T> requires baa<T>
+  void caw();
+}
+
+template <typename T> requires moo::baa<T>
+void moo::caw() {}
+
+}

From 9ee55e717308757b580dff182fc23b40d1c18a56 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Thu, 19 Jun 2025 09:01:51 +0200
Subject: [PATCH 0894/1322] [CIR] Implement folder for VecSplatOp (#143771)

This change adds a folder for the VecSplatOp

Issue https://github.com/llvm/llvm-project/issues/136487
---
 .../CIR/Dialect/Transforms/CIRSimplify.cpp    | 30 +++++++++++++++++--
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  4 +--
 clang/test/CIR/Transforms/vector-splat.cir    | 16 ++++++++++
 3 files changed, 45 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/CIR/Transforms/vector-splat.cir

diff --git a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
index 67ed4124f26c..3b7f08c44140 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
@@ -260,6 +260,31 @@ struct SimplifySwitch : public OpRewritePattern<SwitchOp> {
   }
 };
 
+struct SimplifyVecSplat : public OpRewritePattern<VecSplatOp> {
+  using OpRewritePattern<VecSplatOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(VecSplatOp op,
+                                PatternRewriter &rewriter) const override {
+    mlir::Value splatValue = op.getValue();
+    auto constant =
+        mlir::dyn_cast_if_present<cir::ConstantOp>(splatValue.getDefiningOp());
+    if (!constant)
+      return mlir::failure();
+
+    auto value = constant.getValue();
+    if (!mlir::isa_and_nonnull<cir::IntAttr>(value) &&
+        !mlir::isa_and_nonnull<cir::FPAttr>(value))
+      return mlir::failure();
+
+    cir::VectorType resultType = op.getResult().getType();
+    SmallVector<mlir::Attribute, 16> elements(resultType.getSize(), value);
+    auto constVecAttr = cir::ConstVectorAttr::get(
+        resultType, mlir::ArrayAttr::get(getContext(), elements));
+
+    rewriter.replaceOpWithNewOp<cir::ConstantOp>(op, constVecAttr);
+    return mlir::success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // CIRSimplifyPass
 //===----------------------------------------------------------------------===//
@@ -275,7 +300,8 @@ void populateMergeCleanupPatterns(RewritePatternSet &patterns) {
   patterns.add<
     SimplifyTernary,
     SimplifySelect,
-    SimplifySwitch
+    SimplifySwitch,
+    SimplifyVecSplat
   >(patterns.getContext());
   // clang-format on
 }
@@ -288,7 +314,7 @@ void CIRSimplifyPass::runOnOperation() {
   // Collect operations to apply patterns.
   llvm::SmallVector<Operation *, 16> ops;
   getOperation()->walk([&](Operation *op) {
-    if (isa<TernaryOp, SelectOp, SwitchOp>(op))
+    if (isa<TernaryOp, SelectOp, SwitchOp, VecSplatOp>(op))
       ops.push_back(op);
   });
 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index b73cb839828e..5f41e340e247 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -979,9 +979,7 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
     }
 
     attr = rewriter.getArrayAttr(components);
-  }
-
-  else {
+  } else {
     return op.emitError() << "unsupported constant type " << op.getType();
   }
 
diff --git a/clang/test/CIR/Transforms/vector-splat.cir b/clang/test/CIR/Transforms/vector-splat.cir
new file mode 100644
index 000000000000..e2274b8627b1
--- /dev/null
+++ b/clang/test/CIR/Transforms/vector-splat.cir
@@ -0,0 +1,16 @@
+// RUN: cir-opt %s -cir-simplify -o - | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_shuffle_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %v = cir.const #cir.int<3> : !s32i
+    %vec = cir.vec.splat %v : !s32i, !cir.vector<4 x !s32i>
+    cir.return %vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK: cir.func @fold_shuffle_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %0 = cir.const #cir.const_vector<[#cir.int<3> : !s32i, #cir.int<3> : !s32i,
+  // CHECK-SAME: #cir.int<3> : !s32i, #cir.int<3> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %0 : !cir.vector<4 x !s32i>
+}

From 408e55098d7d8f7064d7a288b5e3fe6fdbbc2ad4 Mon Sep 17 00:00:00 2001
From: quic_hchandel <quic_hchandel@quicinc.com>
Date: Thu, 19 Jun 2025 12:36:20 +0530
Subject: [PATCH 0895/1322] [RISCV] Add support for handling one tied operand
 in the source instruction for compress patterns (#143660)

This update enables compress patterns to handle one tied operand in
source instructions, which was previously unsupported. Qualcomm's uC
extension Xqci includes several instructions with tied operands that can
be compressed into smaller forms. This change adds the necessary support
to enable such compression. Additionally, a compress pattern for the
qc.muliadd instruction has been implemented.
---
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td |  5 ++
 llvm/test/MC/RISCV/xqciac-valid.s           | 21 ++++++--
 llvm/utils/TableGen/CompressInstEmitter.cpp | 58 +++++++++++++++------
 3 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 09852c6fd596..2856df47f704 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1570,6 +1570,11 @@ def : CompressPat<(QC_E_ADDI X2, X2, simm10_lsb0000nonzero:$imm),
                   (C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>;
 } // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32]
 
+let Predicates = [HasVendorXqciac, IsRV32] in {
+def : CompressPat<(QC_MULIADD GPRC:$rd, GPRC:$rs1, uimm5:$imm5),
+                  (QC_C_MULIADD GPRC:$rd, GPRC:$rs1, uimm5:$imm5)>;
+}
+
 let isCompressOnly = true, Predicates = [HasVendorXqcibi, IsRV32] in {
 def : CompressPat<(QC_E_BEQI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12),
                   (QC_BEQI GPRNoX0:$rs1, simm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>;
diff --git a/llvm/test/MC/RISCV/xqciac-valid.s b/llvm/test/MC/RISCV/xqciac-valid.s
index 438c4cafe0df..1afebc75cb45 100644
--- a/llvm/test/MC/RISCV/xqciac-valid.s
+++ b/llvm/test/MC/RISCV/xqciac-valid.s
@@ -1,24 +1,27 @@
 # Xqciac - Qualcomm uC Load-Store Address Calculation Extension
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciac -M no-aliases -show-encoding \
-# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST,CHECK-NOALIAS %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciac < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqciac -M no-aliases --no-print-imm-hex -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 # RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciac -show-encoding \
-# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST,CHECK-ALIAS %s
 # RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciac < %s \
 # RUN:     | llvm-objdump --mattr=+experimental-xqciac --no-print-imm-hex -d - \
 # RUN:     | FileCheck -check-prefix=CHECK-INST %s
 
-# CHECK-INST: qc.c.muliadd    a0, a1, 0
+# CHECK-NOALIAS: qc.c.muliadd    a0, a1, 0
+# CHECK-ALIAS: qc.muliadd    a0, a1, 0
 # CHECK-ENC: encoding: [0x8a,0x21]
 qc.c.muliadd x10, x11, 0
 
-# CHECK-INST: qc.c.muliadd    a0, a1, 31
+# CHECK-NOALIAS: qc.c.muliadd    a0, a1, 31
+# CHECK-ALIAS: qc.muliadd    a0, a1, 31
 # CHECK-ENC: encoding: [0xea,0x3d]
 qc.c.muliadd x10, x11, 31
 
-# CHECK-INST: qc.c.muliadd    a0, a1, 16
+# CHECK-NOALIAS: qc.c.muliadd    a0, a1, 16
+# CHECK-ALIAS: qc.muliadd    a0, a1, 16
 # CHECK-ENC: encoding: [0xaa,0x21]
 qc.c.muliadd x10, x11, 16
 
@@ -47,3 +50,11 @@ qc.shladd x10, x11, x12, 4
 # CHECK-INST: qc.shladd       a0, a1, a2, 31
 # CHECK-ENC: encoding: [0x0b,0xb5,0xc5,0x7e]
 qc.shladd x10, x11, x12, 31
+
+# Check that compress pattern for qc.muliadd works
+
+# CHECK-NOALIAS: qc.c.muliadd    a0, a1, 16
+# CHECK-ALIAS: qc.muliadd    a0, a1, 16
+# CHECK-ENC: encoding: [0xaa,0x21]
+qc.muliadd x10, x11, 16
+
diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp
index 4a0b6d79c53d..2dfeea36e213 100644
--- a/llvm/utils/TableGen/CompressInstEmitter.cpp
+++ b/llvm/utils/TableGen/CompressInstEmitter.cpp
@@ -75,6 +75,7 @@
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
+#include <limits>
 #include <set>
 #include <vector>
 using namespace llvm;
@@ -123,10 +124,10 @@ class CompressInstEmitter {
   const RecordKeeper &Records;
   const CodeGenTarget Target;
   std::vector<CompressPat> CompressPatterns;
-
   void addDagOperandMapping(const Record *Rec, const DagInit *Dag,
                             const CodeGenInstruction &Inst,
-                            IndexedMap<OpData> &OperandMap, bool IsSourceInst);
+                            IndexedMap<OpData> &OperandMap, bool IsSourceInst,
+                            unsigned *SourceLastTiedOpPtr);
   void evaluateCompressPat(const Record *Compress);
   void emitCompressInstEmitter(raw_ostream &OS, EmitterType EType);
   bool validateTypes(const Record *DagOpType, const Record *InstOpType,
@@ -143,7 +144,8 @@ class CompressInstEmitter {
                                 IndexedMap<OpData> &SourceOperandMap,
                                 IndexedMap<OpData> &DestOperandMap,
                                 StringMap<unsigned> &SourceOperands,
-                                const CodeGenInstruction &DestInst);
+                                const CodeGenInstruction &DestInst,
+                                unsigned SourceLastTiedOp);
 
 public:
   CompressInstEmitter(const RecordKeeper &R) : Records(R), Target(R) {}
@@ -206,7 +208,8 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec,
                                                const DagInit *Dag,
                                                const CodeGenInstruction &Inst,
                                                IndexedMap<OpData> &OperandMap,
-                                               bool IsSourceInst) {
+                                               bool IsSourceInst,
+                                               unsigned *SourceLastTiedOpPtr) {
   unsigned NumMIOperands = 0;
   for (const auto &Op : Inst.Operands)
     NumMIOperands += Op.MINumOperands;
@@ -219,12 +222,16 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec,
   // are represented.
   unsigned TiedCount = 0;
   unsigned OpNo = 0;
+  if (IsSourceInst)
+    *SourceLastTiedOpPtr = std::numeric_limits<unsigned int>::max();
   for (const auto &Opnd : Inst.Operands) {
     int TiedOpIdx = Opnd.getTiedRegister();
     if (-1 != TiedOpIdx) {
       // Set the entry in OperandMap for the tied operand we're skipping.
       OperandMap[OpNo].Kind = OperandMap[TiedOpIdx].Kind;
       OperandMap[OpNo].Data = OperandMap[TiedOpIdx].Data;
+      if (IsSourceInst)
+        *SourceLastTiedOpPtr = OpNo;
       ++OpNo;
       ++TiedCount;
       continue;
@@ -289,15 +296,23 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec,
 static bool verifyDagOpCount(const CodeGenInstruction &Inst, const DagInit *Dag,
                              bool IsSource) {
   unsigned NumMIOperands = 0;
-  for (const auto &Op : Inst.Operands)
+
+  // Use this to count number of tied Operands in Source Inst in this function.
+  // This counter is required here to error out when there is a Source
+  // Inst with two or more tied operands.
+  unsigned SourceInstTiedOpCount = 0;
+  for (const auto &Op : Inst.Operands) {
     NumMIOperands += Op.MINumOperands;
+    if (Op.getTiedRegister() != -1)
+      SourceInstTiedOpCount++;
+  }
 
   if (Dag->getNumArgs() == NumMIOperands)
     return true;
 
-  // Source instructions are non compressed instructions and don't have tied
-  // operands.
-  if (IsSource)
+  // Source instructions are non compressed instructions and have at most one
+  // tied operand.
+  if (IsSource && (SourceInstTiedOpCount >= 2))
     PrintFatalError(Inst.TheDef->getLoc(),
                     "Input operands for Inst '" + Inst.TheDef->getName() +
                         "' and input Dag operand count mismatch");
@@ -381,7 +396,8 @@ void CompressInstEmitter::createDagOperandMapping(
 void CompressInstEmitter::createInstOperandMapping(
     const Record *Rec, const DagInit *SourceDag, const DagInit *DestDag,
     IndexedMap<OpData> &SourceOperandMap, IndexedMap<OpData> &DestOperandMap,
-    StringMap<unsigned> &SourceOperands, const CodeGenInstruction &DestInst) {
+    StringMap<unsigned> &SourceOperands, const CodeGenInstruction &DestInst,
+    unsigned SourceLastTiedOp) {
   // TiedCount keeps track of the number of operands skipped in Inst
   // operands list to get to the corresponding Dag operand.
   unsigned TiedCount = 0;
@@ -422,10 +438,18 @@ void CompressInstEmitter::createInstOperandMapping(
       assert(DestDag->getArgNameStr(DagArgIdx) ==
                  SourceDag->getArgNameStr(SourceOp->getValue()) &&
              "Incorrect operand mapping detected!\n");
-      DestOperandMap[OpNo].Data.Operand = SourceOp->getValue();
-      SourceOperandMap[SourceOp->getValue()].Data.Operand = OpNo;
-      LLVM_DEBUG(dbgs() << "    " << SourceOp->getValue() << " ====> " << OpNo
-                        << "\n");
+
+      // Following four lines ensure the correct handling of a single tied
+      // operand in the Source Inst. SourceDagOp points to the position of
+      // appropriate Dag argument which is not correct in presence of tied
+      // operand in the Source Inst and must be incremented by 1 to reflect
+      // correct position of the operand in Source Inst
+      unsigned SourceDagOp = SourceOp->getValue();
+      if (SourceDagOp >= SourceLastTiedOp)
+        SourceDagOp++;
+      DestOperandMap[OpNo].Data.Operand = SourceDagOp;
+      SourceOperandMap[SourceDagOp].Data.Operand = OpNo;
+      LLVM_DEBUG(dbgs() << "    " << SourceDagOp << " ====> " << OpNo << "\n");
     }
   }
 }
@@ -484,15 +508,16 @@ void CompressInstEmitter::evaluateCompressPat(const Record *Rec) {
   // Fill the mapping from the source to destination instructions.
 
   IndexedMap<OpData> SourceOperandMap;
+  unsigned SourceLastTiedOp; // postion of the last tied operand in Source Inst
   // Create a mapping between source Dag operands and source Inst operands.
   addDagOperandMapping(Rec, SourceDag, SourceInst, SourceOperandMap,
-                       /*IsSourceInst*/ true);
+                       /*IsSourceInst*/ true, &SourceLastTiedOp);
 
   IndexedMap<OpData> DestOperandMap;
   // Create a mapping between destination Dag operands and destination Inst
   // operands.
   addDagOperandMapping(Rec, DestDag, DestInst, DestOperandMap,
-                       /*IsSourceInst*/ false);
+                       /*IsSourceInst*/ false, nullptr);
 
   StringMap<unsigned> SourceOperands;
   StringMap<unsigned> DestOperands;
@@ -500,7 +525,8 @@ void CompressInstEmitter::evaluateCompressPat(const Record *Rec) {
                           SourceOperandMap);
   // Create operand mapping between the source and destination instructions.
   createInstOperandMapping(Rec, SourceDag, DestDag, SourceOperandMap,
-                           DestOperandMap, SourceOperands, DestInst);
+                           DestOperandMap, SourceOperands, DestInst,
+                           SourceLastTiedOp);
 
   // Get the target features for the CompressPat.
   std::vector<const Record *> PatReqFeatures;

From 3e795c60c73e990fbbf254715cb47855c32bcfae Mon Sep 17 00:00:00 2001
From: Dmitry Vasilyev <dvassiliev@accesssoftek.com>
Date: Thu, 19 Jun 2025 11:12:34 +0400
Subject: [PATCH 0896/1322] [lldb] Disable TestTargetWatchAddress on Windows
 x86_64 (#144779)

See #144777 for details.
---
 .../watchpoint/watchlocation/TestTargetWatchAddress.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py b/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py
index 37fa911b3714..f1c7a60300df 100644
--- a/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py
+++ b/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py
@@ -21,6 +21,11 @@ class TargetWatchpointCreateByAddressPITestCase(TestBase):
         # This is for verifying that watch location works.
         self.violating_func = "do_bad_thing_with_location"
 
+    @skipIf(
+        oslist=["windows"],
+        archs=["x86_64"],
+        bugnumber="github.com/llvm/llvm-project/issues/144777",
+    )
     def test_watch_create_by_address(self):
         """Exercise SBTarget.WatchpointCreateByAddress() API to set a watchpoint."""
         self.build()
@@ -88,6 +93,11 @@ class TargetWatchpointCreateByAddressPITestCase(TestBase):
 
         # This finishes our test.
 
+    @skipIf(
+        oslist=["windows"],
+        archs=["x86_64"],
+        bugnumber="github.com/llvm/llvm-project/issues/144777",
+    )
     def test_watch_address(self):
         """Exercise SBTarget.WatchAddress() API to set a watchpoint.
         Same as test_watch_create_by_address, but uses the simpler API.

From a9a71b6d311892d6add6aab3790b20fe945cca38 Mon Sep 17 00:00:00 2001
From: "S. B. Tam" <cpplearner@outlook.com>
Date: Thu, 19 Jun 2025 15:21:28 +0800
Subject: [PATCH 0897/1322] [libc++][test] Don't pass ill-formed UTF-8 to
 MAKE_STRING_VIEW (#136403)

---
 .../escaped_output.unicode.pass.cpp           |  2 +-
 .../format.functions/fill.unicode.pass.cpp    | 38 ++++++++++++-------
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
index c4adf601c40a..eb27c7095466 100644
--- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
@@ -337,7 +337,7 @@ void test_string() {
 
   // Ill-formed
   if constexpr (sizeof(CharT) == 1)
-    test_format(SV(R"("\x{80}")"), SV("{:?}"), SV("\x80"));
+    test_format(SV(R"("\x{80}")"), SV("{:?}"), "\x80");
 
   // *** P2713R1 examples ***
   test_format(SV(R"(["\u{301}"])"), SV("[{:?}]"), SV("\u0301"));
diff --git a/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp
index cd555e1ab9ce..76f756ae9148 100644
--- a/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp
@@ -75,30 +75,40 @@ void test() {
 
   // Invalid Unicode Scalar Values
   if constexpr (std::same_as<CharT, char>) {
-    check_exception("The format specifier contains malformed Unicode characters", SV("{:\xed\xa0\x80^}"), 42); // U+D800
-    check_exception("The format specifier contains malformed Unicode characters", SV("{:\xed\xa0\xbf^}"), 42); // U+DBFF
-    check_exception("The format specifier contains malformed Unicode characters", SV("{:\xed\xbf\x80^}"), 42); // U+DC00
-    check_exception("The format specifier contains malformed Unicode characters", SV("{:\xed\xbf\xbf^}"), 42); // U+DFFF
-
-    check_exception(
-        "The format specifier contains malformed Unicode characters", SV("{:\xf4\x90\x80\x80^}"), 42); // U+110000
-    check_exception(
-        "The format specifier contains malformed Unicode characters", SV("{:\xf4\x90\xbf\xbf^}"), 42); // U+11FFFF
+    check_exception("The format specifier contains malformed Unicode characters",
+                    std::string_view{"{:\xed\xa0\x80^}"},
+                    42); // U+D800
+    check_exception("The format specifier contains malformed Unicode characters",
+                    std::string_view{"{:\xed\xa0\xbf^}"},
+                    42); // U+DBFF
+    check_exception("The format specifier contains malformed Unicode characters",
+                    std::string_view{"{:\xed\xbf\x80^}"},
+                    42); // U+DC00
+    check_exception("The format specifier contains malformed Unicode characters",
+                    std::string_view{"{:\xed\xbf\xbf^}"},
+                    42); // U+DFFF
 
     check_exception("The format specifier contains malformed Unicode characters",
-                    SV("{:\x80^}"),
+                    std::string_view{"{:\xf4\x90\x80\x80^}"},
+                    42); // U+110000
+    check_exception("The format specifier contains malformed Unicode characters",
+                    std::string_view{"{:\xf4\x90\xbf\xbf^}"},
+                    42); // U+11FFFF
+
+    check_exception("The format specifier contains malformed Unicode characters",
+                    std::string_view{"{:\x80^}"},
                     42); // Trailing code unit with no leading one.
     check_exception("The format specifier contains malformed Unicode characters",
-                    SV("{:\xc0^}"),
+                    std::string_view{"{:\xc0^}"},
                     42); // Missing trailing code unit.
     check_exception("The format specifier contains malformed Unicode characters",
-                    SV("{:\xe0\x80^}"),
+                    std::string_view{"{:\xe0\x80^}"},
                     42); // Missing trailing code unit.
     check_exception("The format specifier contains malformed Unicode characters",
-                    SV("{:\xf0\x80^}"),
+                    std::string_view{"{:\xf0\x80^}"},
                     42); // Missing two trailing code units.
     check_exception("The format specifier contains malformed Unicode characters",
-                    SV("{:\xf0\x80\x80^}"),
+                    std::string_view{"{:\xf0\x80\x80^}"},
                     42); // Missing trailing code unit.
 
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS

From 50a7511138a42d2c7a69b68237ce88cc027b91bc Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mpeis@arm.com>
Date: Thu, 19 Jun 2025 08:51:08 +0100
Subject: [PATCH 0898/1322] [BOLT][AArch64] Fix PREL Relocs on RHEL8 (#144505)

---
 bolt/test/AArch64/r_aarch64_prelxx.s | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bolt/test/AArch64/r_aarch64_prelxx.s b/bolt/test/AArch64/r_aarch64_prelxx.s
index 5cbe2c50b294..39f74301cedf 100644
--- a/bolt/test/AArch64/r_aarch64_prelxx.s
+++ b/bolt/test/AArch64/r_aarch64_prelxx.s
@@ -5,7 +5,7 @@
 // REQUIRES: system-linux
 
 // RUN: %clang %cflags -nostartfiles -nostdlib %s -o %t.exe -mlittle-endian \
-// RUN:     -Wl,-q -Wl,-z,max-page-size=4
+// RUN:     -Wl,-q -Wl,-z,max-page-size=4 -Wl,--no-relax
 // RUN: llvm-readelf -Wa %t.exe | FileCheck %s -check-prefix=CHECKPREL
 
 // CHECKPREL:       R_AARCH64_PREL16      {{.*}} .dummy + 0
@@ -36,9 +36,9 @@
   .type _start, %function
 _start:
   adrp x0, datatable
-  add x0, x0, :lo12:datable
+  add x0, x0, :lo12:datatable
   mov x0, #0
-  ret 
+  ret
 
 .section .dummy, "a", @progbits
 dummy:

From e73bff89ef8e3c8cdd8895cdc3d021fc4dcabd76 Mon Sep 17 00:00:00 2001
From: Pierre van Houtryve <pierre.vanhoutryve@amd.com>
Date: Thu, 19 Jun 2025 09:56:07 +0200
Subject: [PATCH 0899/1322] [AMDGPU] New RegBankSelect: Handle all 32/64 bit
 pointer types for B32/B64 rule (#142560)

The previous system explicitly enumerated the types. P0 was missing and thus we couldn't handle a select of P0s for example.
Generalize the logic to simply check the width of the pointer for 32/64 bit pointers, this should handle all common address spaces
---
 .../AMDGPU/AMDGPURegBankLegalizeRules.cpp     |   8 +-
 .../GlobalISel/regbankselect-select.mir       | 196 ++++++++++++++++++
 2 files changed, 199 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 89056b0271f1..5e21f44f7d54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -199,13 +199,11 @@ UniformityLLTOpPredicateID LLTToId(LLT Ty) {
 
 UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
   if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
-      Ty == LLT::pointer(3, 32) || Ty == LLT::pointer(5, 32) ||
-      Ty == LLT::pointer(6, 32))
+      (Ty.isPointer() && Ty.getSizeInBits() == 32))
     return B32;
   if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
-      Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(1, 64) ||
-      Ty == LLT::pointer(4, 64) ||
-      (Ty.isPointer() && Ty.getAddressSpace() > AMDGPUAS::MAX_AMDGPU_ADDRESS))
+      Ty == LLT::fixed_vector(4, 16) ||
+      (Ty.isPointer() && Ty.getSizeInBits() == 64))
     return B64;
   if (Ty == LLT::fixed_vector(3, 32))
     return B96;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-select.mir
index 762f7b950036..2fd2e03471f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-select.mir
@@ -896,6 +896,31 @@ body: |
     %5:_(<4 x s16>) = G_SELECT %4, %2, %3
 ...
 
+---
+name: select_p0_scc_ss
+legalized: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+    ; CHECK-LABEL: name: select_p0_scc_ss
+    ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p0) = COPY $sgpr2_sgpr3
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(p0) = COPY $sgpr4_sgpr5
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C]]
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(p0) = G_SELECT [[AND]](s32), [[COPY2]], [[COPY3]]
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(p0) = COPY $sgpr2_sgpr3
+    %3:_(p0) = COPY $sgpr4_sgpr5
+    %4:_(s1) = G_ICMP intpred(ne), %0, %1
+    %5:_(p0) = G_SELECT %4, %2, %3
+...
+
 ---
 name: select_p1_scc_ss
 legalized: true
@@ -946,6 +971,36 @@ body: |
     %5:_(p999) = G_SELECT %4, %2, %3
 ...
 
+---
+name: select_p0_scc_sv
+legalized: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1
+    ; CHECK-LABEL: name: select_p0_scc_sv
+    ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p0) = COPY $sgpr2_sgpr3
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[ICMP]](s32)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p0) = COPY [[COPY2]](p0)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY4]](p0)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](p0)
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_COPY_VCC_SCC]](s1), [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_COPY_VCC_SCC]](s1), [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p0) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(p0) = COPY $sgpr2_sgpr3
+    %3:_(p0) = COPY $vgpr0_vgpr1
+    %4:_(s1) = G_ICMP intpred(ne), %0, %1
+    %5:_(p0) = G_SELECT %4, %2, %3
+
+...
+
 ---
 name: select_p1_scc_sv
 legalized: true
@@ -976,6 +1031,35 @@ body: |
 
 ...
 
+---
+name: select_p0_scc_vs
+legalized: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1
+    ; CHECK-LABEL: name: select_p0_scc_vs
+    ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p0) = COPY $sgpr2_sgpr3
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[ICMP]](s32)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p0) = COPY [[COPY2]](p0)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](p0)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY4]](p0)
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_COPY_VCC_SCC]](s1), [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_COPY_VCC_SCC]](s1), [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p0) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(p0) = COPY $sgpr2_sgpr3
+    %3:_(p0) = COPY $vgpr0_vgpr1
+    %4:_(s1) = G_ICMP intpred(ne), %0, %1
+    %5:_(p0) = G_SELECT %4, %3, %2
+...
+
 ---
 name: select_p1_scc_vs
 legalized: true
@@ -1033,6 +1117,35 @@ body: |
     %5:_(p1) = G_SELECT %4, %2, %3
 ...
 
+---
+name: select_p0_vcc_ss
+legalized: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: select_p0_vcc_ss
+    ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p0) = COPY $sgpr2_sgpr3
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY2]](s32), [[COPY3]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p0) = COPY [[COPY]](p0)
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(p0) = COPY [[COPY1]](p0)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY4]](p0)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY5]](p0)
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p0) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32)
+    %0:_(p0) = COPY $sgpr0_sgpr1
+    %1:_(p0) = COPY $sgpr2_sgpr3
+    %2:_(s32) = COPY $vgpr0
+    %3:_(s32) = COPY $vgpr1
+    %4:_(s1) = G_ICMP intpred(ne), %2, %3
+    %5:_(p0) = G_SELECT %4, %0, %1
+...
+
 ---
 name: select_p1_vcc_ss
 legalized: true
@@ -1062,6 +1175,34 @@ body: |
     %5:_(p1) = G_SELECT %4, %0, %1
 ...
 
+---
+name: select_p0_vcc_sv
+legalized: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $vgpr0, $vgpr1, $vgpr2_vgpr3
+    ; CHECK-LABEL: name: select_p0_vcc_sv
+    ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0, $vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[COPY2]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p0) = COPY [[COPY]](p0)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY4]](p0)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](p0)
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p0) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32)
+    %0:_(p0) = COPY $sgpr0_sgpr1
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s32) = COPY $vgpr1
+    %3:_(p0) = COPY $vgpr2_vgpr3
+    %4:_(s1) = G_ICMP intpred(ne), %1, %2
+    %5:_(p0) = G_SELECT %4, %0, %3
+...
+
 ---
 name: select_p1_vcc_sv
 legalized: true
@@ -1090,6 +1231,34 @@ body: |
     %5:_(p1) = G_SELECT %4, %0, %3
 ...
 
+---
+name: select_p0_vcc_vs
+legalized: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $vgpr0, $vgpr1, $vgpr2_vgpr3
+    ; CHECK-LABEL: name: select_p0_vcc_vs
+    ; CHECK: liveins: $sgpr0_sgpr1, $vgpr0, $vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[COPY2]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p0) = COPY [[COPY]](p0)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](p0)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY4]](p0)
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p0) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32)
+    %0:_(p0) = COPY $sgpr0_sgpr1
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s32) = COPY $vgpr1
+    %3:_(p0) = COPY $vgpr2_vgpr3
+    %4:_(s1) = G_ICMP intpred(ne), %1, %2
+    %5:_(p0) = G_SELECT %4, %3, %0
+...
+
 ---
 name: select_p1_vcc_vs
 legalized: true
@@ -1118,6 +1287,33 @@ body: |
     %5:_(p1) = G_SELECT %4, %3, %0
 ...
 
+---
+name: select_p0_vcc_vv
+legalized: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-LABEL: name: select_p0_vcc_vv
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(p0) = COPY $vgpr4_vgpr5
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY2]](p0)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](p0)
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV]], [[UV2]]
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV3]]
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p0) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p0) = COPY $vgpr2_vgpr3
+    %3:_(p0) = COPY $vgpr4_vgpr5
+    %4:_(s1) = G_ICMP intpred(ne), %0, %1
+    %5:_(p0) = G_SELECT %4, %2, %3
+...
+
 ---
 name: select_p1_vcc_vv
 legalized: true

From db8e6fc64534e986f5bf96cceaa76cc5007ac1c7 Mon Sep 17 00:00:00 2001
From: Pierre van Houtryve <pierre.vanhoutryve@amd.com>
Date: Thu, 19 Jun 2025 09:58:57 +0200
Subject: [PATCH 0900/1322] [AMDGPU] New RegBanKSelect: Add S128 types
 (#142601)

---
 llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 9 +++++++++
 llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp  | 6 ++++++
 llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h    | 5 +++++
 3 files changed, 20 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 7ff822c6f658..89af98263659 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -556,6 +556,9 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
   case Sgpr64:
   case Vgpr64:
     return LLT::scalar(64);
+  case Sgpr128:
+  case Vgpr128:
+    return LLT::scalar(128);
   case VgprP0:
     return LLT::pointer(0, 64);
   case SgprP1:
@@ -646,6 +649,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
   case Sgpr16:
   case Sgpr32:
   case Sgpr64:
+  case Sgpr128:
   case SgprP1:
   case SgprP3:
   case SgprP4:
@@ -678,6 +682,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
   case Vgpr16:
   case Vgpr32:
   case Vgpr64:
+  case Vgpr128:
   case VgprP0:
   case VgprP1:
   case VgprP3:
@@ -718,6 +723,7 @@ void RegBankLegalizeHelper::applyMappingDst(
     case Sgpr16:
     case Sgpr32:
     case Sgpr64:
+    case Sgpr128:
     case SgprP1:
     case SgprP3:
     case SgprP4:
@@ -728,6 +734,7 @@ void RegBankLegalizeHelper::applyMappingDst(
     case Vgpr16:
     case Vgpr32:
     case Vgpr64:
+    case Vgpr128:
     case VgprP0:
     case VgprP1:
     case VgprP3:
@@ -839,6 +846,7 @@ void RegBankLegalizeHelper::applyMappingSrc(
     case Sgpr16:
     case Sgpr32:
     case Sgpr64:
+    case Sgpr128:
     case SgprP1:
     case SgprP3:
     case SgprP4:
@@ -865,6 +873,7 @@ void RegBankLegalizeHelper::applyMappingSrc(
     case Vgpr16:
     case Vgpr32:
     case Vgpr64:
+    case Vgpr128:
     case VgprP0:
     case VgprP1:
     case VgprP3:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 5e21f44f7d54..672fc5b79abc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg) == LLT::scalar(32);
   case S64:
     return MRI.getType(Reg) == LLT::scalar(64);
+  case S128:
+    return MRI.getType(Reg) == LLT::scalar(128);
   case P0:
     return MRI.getType(Reg) == LLT::pointer(0, 64);
   case P1:
@@ -84,6 +86,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg);
   case UniS64:
     return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg);
+  case UniS128:
+    return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniform(Reg);
   case UniP0:
     return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg);
   case UniP1:
@@ -116,6 +120,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg);
   case DivS64:
     return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg);
+  case DivS128:
+    return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergent(Reg);
   case DivP0:
     return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg);
   case DivP1:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index bddfb8dd1913..30b900d871f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -39,16 +39,19 @@ enum UniformityLLTOpPredicateID {
   S16,
   S32,
   S64,
+  S128,
 
   UniS1,
   UniS16,
   UniS32,
   UniS64,
+  UniS128,
 
   DivS1,
   DivS16,
   DivS32,
   DivS64,
+  DivS128,
 
   // pointers
   P0,
@@ -117,6 +120,7 @@ enum RegBankLLTMappingApplyID {
   Sgpr16,
   Sgpr32,
   Sgpr64,
+  Sgpr128,
   SgprP1,
   SgprP3,
   SgprP4,
@@ -135,6 +139,7 @@ enum RegBankLLTMappingApplyID {
   Vgpr16,
   Vgpr32,
   Vgpr64,
+  Vgpr128,
   VgprP0,
   VgprP1,
   VgprP3,

From 26d4b3cb4ca2f882384d940f3dad28f8d79451eb Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Thu, 19 Jun 2025 13:29:57 +0530
Subject: [PATCH 0901/1322] [RISCV] Don't prefer QC_EXT for SEXT_INREG patterns
 when Zbb is enabled (#144837)

`Zbb` has the `sext.b` and `sext.h` instructions that are compressible.
---
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td |   8 +-
 llvm/test/CodeGen/RISCV/xqcibm-extract.ll   | 141 ++++++++++++++++++++
 2 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 2856df47f704..3d0cad7884fd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1439,8 +1439,6 @@ def : SelectQCbi<SETUGE, uimm16nonzero, Select_GPRNoX0_Using_CC_UImm16NonZero_QC
 } // let Predicates = [HasVendorXqcibi, IsRV32], AddedComplexity = 2
 
 let Predicates = [HasVendorXqcibm, IsRV32] in {
-def : Pat<(sext_inreg (i32 GPR:$rs1), i16), (QC_EXT GPR:$rs1, 16, 0)>;
-def : Pat<(sext_inreg (i32 GPR:$rs1), i8), (QC_EXT GPR:$rs1, 8, 0)>;
 def : Pat<(sext_inreg (i32 GPR:$rs1), i1), (QC_EXT GPR:$rs1, 1, 0)>;
 
 // Prefer qc.extu to andi for the following cases since the former can be compressed
@@ -1452,6 +1450,12 @@ def : Pat<(i32 (and GPRNoX0:$rs, 1023)), (QC_EXTU GPRNoX0:$rs, 10, 0)>;
 def : Pat<(i32 (and GPRNoX0:$rs, 2047)), (QC_EXTU GPRNoX0:$rs, 11, 0)>;
 } // Predicates = [HasVendorXqcibm, IsRV32]
 
+// If Zbb is enabled sext.b/h is preferred since they are compressible
+let Predicates = [HasVendorXqcibm, NoStdExtZbb, IsRV32] in {
+def : Pat<(sext_inreg (i32 GPR:$rs1), i16), (QC_EXT GPR:$rs1, 16, 0)>;
+def : Pat<(sext_inreg (i32 GPR:$rs1), i8), (QC_EXT GPR:$rs1, 8, 0)>;
+} // Predicates = [HasVendorXqcibm, NoStdExtZbb, IsRV32]
+
 let Predicates = [HasVendorXqcibm, HasStdExtZbb, IsRV32] in {
 def: Pat<(i32 (cttz (not (i32 GPR:$rs1)))), (QC_CTO GPR:$rs1)>;
 def: Pat<(i32 (ctlz (not (i32 GPR:$rs1)))), (QC_CLO GPR:$rs1)>;
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
index edf6e9a2d501..481bfdd66643 100644
--- a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
+++ b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
@@ -3,6 +3,8 @@
 ; RUN:   | FileCheck %s -check-prefixes=RV32I
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcibm -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefixes=RV32XQCIBM
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcibm,+zbb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV32XQCIBMZBB
 
 define i32 @sexti1_i32(i1 %a) nounwind {
 ; RV32I-LABEL: sexti1_i32:
@@ -15,6 +17,11 @@ define i32 @sexti1_i32(i1 %a) nounwind {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 1, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti1_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.ext a0, a0, 1, 0
+; RV32XQCIBMZBB-NEXT:    ret
   %sext = sext i1 %a to i32
   ret i32 %sext
 }
@@ -30,6 +37,11 @@ define i32 @sexti1_i32_2(i32 %a) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 1, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti1_i32_2:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.ext a0, a0, 1, 0
+; RV32XQCIBMZBB-NEXT:    ret
   %shl = shl i32 %a, 31
   %shr = ashr exact i32 %shl, 31
   ret i32 %shr
@@ -47,6 +59,11 @@ define i32 @sexti8_i32(i8 %a) nounwind {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 8, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti8_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    sext.b a0, a0
+; RV32XQCIBMZBB-NEXT:    ret
   %sext = sext i8 %a to i32
   ret i32 %sext
 }
@@ -62,6 +79,11 @@ define i32 @sexti8_i32_2(i32 %a) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 8, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti8_i32_2:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    sext.b a0, a0
+; RV32XQCIBMZBB-NEXT:    ret
   %shl = shl i32 %a, 24
   %shr = ashr exact i32 %shl, 24
   ret i32 %shr
@@ -78,6 +100,11 @@ define i32 @sexti16_i32(i16 %a) nounwind {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 16, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti16_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    sext.h a0, a0
+; RV32XQCIBMZBB-NEXT:    ret
   %sext = sext i16 %a to i32
   ret i32 %sext
 }
@@ -93,6 +120,11 @@ define i32 @sexti16_i32_2(i32 %a) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 16, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti16_i32_2:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    sext.h a0, a0
+; RV32XQCIBMZBB-NEXT:    ret
   %shl = shl i32 %a, 16
   %shr = ashr exact i32 %shl, 16
   ret i32 %shr
@@ -111,6 +143,12 @@ define i64 @sexti1_i64(i64 %a) {
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 1, 0
 ; RV32XQCIBM-NEXT:    mv a1, a0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti1_i64:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.ext a0, a0, 1, 0
+; RV32XQCIBMZBB-NEXT:    mv a1, a0
+; RV32XQCIBMZBB-NEXT:    ret
   %shl = shl i64 %a, 63
   %shr = ashr exact i64 %shl, 63
   ret i64 %shr
@@ -129,6 +167,12 @@ define i64 @sexti1_i64_2(i1 %a) {
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 1, 0
 ; RV32XQCIBM-NEXT:    mv a1, a0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti1_i64_2:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.ext a0, a0, 1, 0
+; RV32XQCIBMZBB-NEXT:    mv a1, a0
+; RV32XQCIBMZBB-NEXT:    ret
   %1 = sext i1 %a to i64
   ret i64 %1
 }
@@ -146,6 +190,12 @@ define i64 @sexti8_i64(i64 %a) {
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 8, 0
 ; RV32XQCIBM-NEXT:    srai a1, a0, 31
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti8_i64:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    sext.b a0, a0
+; RV32XQCIBMZBB-NEXT:    srai a1, a0, 31
+; RV32XQCIBMZBB-NEXT:    ret
   %shl = shl i64 %a, 56
   %shr = ashr exact i64 %shl, 56
   ret i64 %shr
@@ -164,6 +214,12 @@ define i64 @sexti8_i64_2(i8 %a) {
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 8, 0
 ; RV32XQCIBM-NEXT:    srai a1, a0, 31
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti8_i64_2:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    sext.b a0, a0
+; RV32XQCIBMZBB-NEXT:    srai a1, a0, 31
+; RV32XQCIBMZBB-NEXT:    ret
   %1 = sext i8 %a to i64
   ret i64 %1
 }
@@ -181,6 +237,12 @@ define i64 @sexti16_i64(i64 %a) {
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 16, 0
 ; RV32XQCIBM-NEXT:    srai a1, a0, 31
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti16_i64:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    sext.h a0, a0
+; RV32XQCIBMZBB-NEXT:    srai a1, a0, 31
+; RV32XQCIBMZBB-NEXT:    ret
   %shl = shl i64 %a, 48
   %shr = ashr exact i64 %shl, 48
   ret i64 %shr
@@ -199,6 +261,12 @@ define i64 @sexti16_i64_2(i16 %a) {
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 16, 0
 ; RV32XQCIBM-NEXT:    srai a1, a0, 31
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti16_i64_2:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    sext.h a0, a0
+; RV32XQCIBMZBB-NEXT:    srai a1, a0, 31
+; RV32XQCIBMZBB-NEXT:    ret
   %1 = sext i16 %a to i64
   ret i64 %1
 }
@@ -213,6 +281,11 @@ define i64 @sexti32_i64(i64 %a) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    srai a1, a0, 31
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti32_i64:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    srai a1, a0, 31
+; RV32XQCIBMZBB-NEXT:    ret
   %shl = shl i64 %a, 32
   %shr = ashr exact i64 %shl, 32
   ret i64 %shr
@@ -228,6 +301,11 @@ define i64 @sexti32_i64_2(i32 %a) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    srai a1, a0, 31
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: sexti32_i64_2:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    srai a1, a0, 31
+; RV32XQCIBMZBB-NEXT:    ret
   %1 = sext i32 %a to i64
   ret i64 %1
 }
@@ -243,6 +321,11 @@ define i32 @extu_from_and_i32(i32 %x) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.extu a0, a0, 12, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: extu_from_and_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.extu a0, a0, 12, 0
+; RV32XQCIBMZBB-NEXT:    ret
   %a = and i32 %x, 4095
   ret i32 %a
 }
@@ -257,6 +340,11 @@ define i32 @no_extu_from_and_i32(i32 %x) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    andi a0, a0, 31
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: no_extu_from_and_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    andi a0, a0, 31
+; RV32XQCIBMZBB-NEXT:    ret
   %a = and i32 %x, 31
   ret i32 %a
 }
@@ -271,6 +359,11 @@ define i32 @extu_from_and_i32_simm12_lb(i32 %x) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.extu a0, a0, 6, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: extu_from_and_i32_simm12_lb:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.extu a0, a0, 6, 0
+; RV32XQCIBMZBB-NEXT:    ret
   %a = and i32 %x, 63
   ret i32 %a
 }
@@ -285,6 +378,11 @@ define i32 @extu_from_and_i32_simm12_ub(i32 %x) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.extu a0, a0, 11, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: extu_from_and_i32_simm12_ub:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.extu a0, a0, 11, 0
+; RV32XQCIBMZBB-NEXT:    ret
   %a = and i32 %x, 2047
   ret i32 %a
 }
@@ -302,6 +400,12 @@ define i64 @extu_from_and_i64(i64 %x) {
 ; RV32XQCIBM-NEXT:    qc.extu a0, a0, 12, 0
 ; RV32XQCIBM-NEXT:    li a1, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: extu_from_and_i64:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.extu a0, a0, 12, 0
+; RV32XQCIBMZBB-NEXT:    li a1, 0
+; RV32XQCIBMZBB-NEXT:    ret
   %a = and i64 %x, 4095
   ret i64 %a
 }
@@ -317,6 +421,11 @@ define i32 @extu_from_and_lshr_i32(i32 %x) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.extu a0, a0, 3, 23
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: extu_from_and_lshr_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.extu a0, a0, 3, 23
+; RV32XQCIBMZBB-NEXT:    ret
   %shifted = lshr i32 %x, 23
   %masked = and i32 %shifted, 7
   ret i32 %masked
@@ -335,6 +444,12 @@ define i64 @extu_from_and_lshr_i64(i64 %x) {
 ; RV32XQCIBM-NEXT:    qc.extu a0, a1, 12, 14
 ; RV32XQCIBM-NEXT:    li a1, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: extu_from_and_lshr_i64:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.extu a0, a1, 12, 14
+; RV32XQCIBMZBB-NEXT:    li a1, 0
+; RV32XQCIBMZBB-NEXT:    ret
   %shifted = lshr i64 %x, 46
   %masked = and i64 %shifted, 4095
   ret i64 %masked
@@ -351,6 +466,11 @@ define i32 @extu_from_lshr_and_i32(i32 %x) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.extu a0, a0, 12, 12
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: extu_from_lshr_and_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.extu a0, a0, 12, 12
+; RV32XQCIBMZBB-NEXT:    ret
   %masked = and i32 %x, 16773120
   %shifted = lshr i32 %masked, 12
   ret i32 %shifted
@@ -369,6 +489,12 @@ define i64 @extu_from_lshr_and_i64(i64 %x) {
 ; RV32XQCIBM-NEXT:    qc.extu a0, a0, 12, 12
 ; RV32XQCIBM-NEXT:    li a1, 0
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: extu_from_lshr_and_i64:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.extu a0, a0, 12, 12
+; RV32XQCIBMZBB-NEXT:    li a1, 0
+; RV32XQCIBMZBB-NEXT:    ret
   %masked = and i64 %x, 16773120
   %shifted = lshr i64 %masked, 12
   ret i64 %shifted
@@ -385,6 +511,11 @@ define i32 @ext_from_ashr_shl_i32(i32 %x) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 8, 16
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: ext_from_ashr_shl_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.ext a0, a0, 8, 16
+; RV32XQCIBMZBB-NEXT:    ret
   %shl = shl i32 %x, 8
   %ashr = ashr i32 %shl, 24
   ret i32 %ashr
@@ -401,6 +532,11 @@ define i32 @ext_from_ashr_sexti8_i32(i8 %x) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 3, 5
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: ext_from_ashr_sexti8_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.ext a0, a0, 3, 5
+; RV32XQCIBMZBB-NEXT:    ret
   %sext = sext i8 %x to i32
   %ashr = ashr i32 %sext, 5
   ret i32 %ashr
@@ -417,6 +553,11 @@ define i32 @ext_from_ashr_sexti16_i32(i16 %x) {
 ; RV32XQCIBM:       # %bb.0:
 ; RV32XQCIBM-NEXT:    qc.ext a0, a0, 1, 15
 ; RV32XQCIBM-NEXT:    ret
+;
+; RV32XQCIBMZBB-LABEL: ext_from_ashr_sexti16_i32:
+; RV32XQCIBMZBB:       # %bb.0:
+; RV32XQCIBMZBB-NEXT:    qc.ext a0, a0, 1, 15
+; RV32XQCIBMZBB-NEXT:    ret
   %sext = sext i16 %x to i32
   %ashr = ashr i32 %sext, 24
   ret i32 %ashr

From 7ceea22a7adad5d21328839facbc6a6d0151e056 Mon Sep 17 00:00:00 2001
From: Pierre van Houtryve <pierre.vanhoutryve@amd.com>
Date: Thu, 19 Jun 2025 10:06:38 +0200
Subject: [PATCH 0902/1322] [AMDGPU] New RegBankSelect: Add Ptr32/Ptr64/Ptr128
 (#142602)

There's quite a few opcodes that do not care about the exact AS of the pointer, just its size.
Adding generic types for these will help reduce duplication in the rule definitions.

I also moved the usual B types to use the new `isAnyPtr` helper I added to make sure they're supersets of the `Ptr` cases
---
 .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp    | 42 +++++++++++++++----
 .../AMDGPU/AMDGPURegBankLegalizeRules.cpp     | 29 +++++++++++--
 .../AMDGPU/AMDGPURegBankLegalizeRules.h       | 19 +++++++++
 3 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 89af98263659..b2ddc6e88966 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -595,17 +595,23 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
   case VgprB32:
   case UniInVgprB32:
     if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
-        Ty == LLT::pointer(3, 32) || Ty == LLT::pointer(5, 32) ||
-        Ty == LLT::pointer(6, 32))
+        isAnyPtr(Ty, 32))
       return Ty;
     return LLT();
+  case SgprPtr32:
+  case VgprPtr32:
+    return isAnyPtr(Ty, 32) ? Ty : LLT();
+  case SgprPtr64:
+  case VgprPtr64:
+    return isAnyPtr(Ty, 64) ? Ty : LLT();
+  case SgprPtr128:
+  case VgprPtr128:
+    return isAnyPtr(Ty, 128) ? Ty : LLT();
   case SgprB64:
   case VgprB64:
   case UniInVgprB64:
     if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
-        Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(0, 64) ||
-        Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64) ||
-        (Ty.isPointer() && Ty.getAddressSpace() > AMDGPUAS::MAX_AMDGPU_ADDRESS))
+        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
       return Ty;
     return LLT();
   case SgprB96:
@@ -619,7 +625,7 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
   case VgprB128:
   case UniInVgprB128:
     if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
-        Ty == LLT::fixed_vector(2, 64))
+        Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128))
       return Ty;
     return LLT();
   case SgprB256:
@@ -654,6 +660,9 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
   case SgprP3:
   case SgprP4:
   case SgprP5:
+  case SgprPtr32:
+  case SgprPtr64:
+  case SgprPtr128:
   case SgprV2S16:
   case SgprV2S32:
   case SgprV4S32:
@@ -688,6 +697,9 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
   case VgprP3:
   case VgprP4:
   case VgprP5:
+  case VgprPtr32:
+  case VgprPtr64:
+  case VgprPtr128:
   case VgprV2S16:
   case VgprV2S32:
   case VgprV4S32:
@@ -754,12 +766,18 @@ void RegBankLegalizeHelper::applyMappingDst(
     case SgprB128:
     case SgprB256:
     case SgprB512:
+    case SgprPtr32:
+    case SgprPtr64:
+    case SgprPtr128:
     case VgprB32:
     case VgprB64:
     case VgprB96:
     case VgprB128:
     case VgprB256:
-    case VgprB512: {
+    case VgprB512:
+    case VgprPtr32:
+    case VgprPtr64:
+    case VgprPtr128: {
       assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
       assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
       break;
@@ -864,7 +882,10 @@ void RegBankLegalizeHelper::applyMappingSrc(
     case SgprB96:
     case SgprB128:
     case SgprB256:
-    case SgprB512: {
+    case SgprB512:
+    case SgprPtr32:
+    case SgprPtr64:
+    case SgprPtr128: {
       assert(Ty == getBTyFromID(MethodIDs[i], Ty));
       assert(RB == getRegBankFromID(MethodIDs[i]));
       break;
@@ -895,7 +916,10 @@ void RegBankLegalizeHelper::applyMappingSrc(
     case VgprB96:
     case VgprB128:
     case VgprB256:
-    case VgprB512: {
+    case VgprB512:
+    case VgprPtr32:
+    case VgprPtr64:
+    case VgprPtr128: {
       assert(Ty == getBTyFromID(MethodIDs[i], Ty));
       if (RB != VgprRB) {
         auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 672fc5b79abc..5402129e4188 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -26,6 +26,10 @@
 using namespace llvm;
 using namespace AMDGPU;
 
+bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) {
+  return Ty.isPointer() && Ty.getSizeInBits() == Width;
+}
+
 RegBankLLTMapping::RegBankLLTMapping(
     std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
     std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
@@ -62,6 +66,12 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg) == LLT::pointer(4, 64);
   case P5:
     return MRI.getType(Reg) == LLT::pointer(5, 32);
+  case Ptr32:
+    return isAnyPtr(MRI.getType(Reg), 32);
+  case Ptr64:
+    return isAnyPtr(MRI.getType(Reg), 64);
+  case Ptr128:
+    return isAnyPtr(MRI.getType(Reg), 128);
   case V2S32:
     return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
   case V4S32:
@@ -98,6 +108,12 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
   case UniP5:
     return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
+  case UniPtr32:
+    return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg);
+  case UniPtr64:
+    return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniform(Reg);
+  case UniPtr128:
+    return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
   case UniV2S16:
     return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
   case UniB32:
@@ -132,6 +148,12 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
   case DivP5:
     return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
+  case DivPtr32:
+    return isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergent(Reg);
+  case DivPtr64:
+    return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergent(Reg);
+  case DivPtr128:
+    return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
   case DivV2S16:
     return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
   case DivB32:
@@ -205,15 +227,14 @@ UniformityLLTOpPredicateID LLTToId(LLT Ty) {
 
 UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
   if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
-      (Ty.isPointer() && Ty.getSizeInBits() == 32))
+      isAnyPtr(Ty, 32))
     return B32;
   if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
-      Ty == LLT::fixed_vector(4, 16) ||
-      (Ty.isPointer() && Ty.getSizeInBits() == 64))
+      Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
     return B64;
   if (Ty == LLT::fixed_vector(3, 32))
     return B96;
-  if (Ty == LLT::fixed_vector(4, 32))
+  if (Ty == LLT::fixed_vector(4, 32) || isAnyPtr(Ty, 128))
     return B128;
   return _;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 30b900d871f3..7243d75aa830 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -15,6 +15,7 @@
 
 namespace llvm {
 
+class LLT;
 class MachineRegisterInfo;
 class MachineInstr;
 class GCNSubtarget;
@@ -26,6 +27,9 @@ using MachineUniformityInfo = GenericUniformityInfo<MachineSSAContext>;
 
 namespace AMDGPU {
 
+/// \returns true if \p Ty is a pointer type with size \p Width.
+bool isAnyPtr(LLT Ty, unsigned Width);
+
 // IDs used to build predicate for RegBankLegalizeRule. Predicate can have one
 // or more IDs and each represents a check for 'uniform or divergent' + LLT or
 // just LLT on register operand.
@@ -59,18 +63,27 @@ enum UniformityLLTOpPredicateID {
   P3,
   P4,
   P5,
+  Ptr32,
+  Ptr64,
+  Ptr128,
 
   UniP0,
   UniP1,
   UniP3,
   UniP4,
   UniP5,
+  UniPtr32,
+  UniPtr64,
+  UniPtr128,
 
   DivP0,
   DivP1,
   DivP3,
   DivP4,
   DivP5,
+  DivPtr32,
+  DivPtr64,
+  DivPtr128,
 
   // vectors
   V2S16,
@@ -125,6 +138,9 @@ enum RegBankLLTMappingApplyID {
   SgprP3,
   SgprP4,
   SgprP5,
+  SgprPtr32,
+  SgprPtr64,
+  SgprPtr128,
   SgprV2S16,
   SgprV4S32,
   SgprV2S32,
@@ -145,6 +161,9 @@ enum RegBankLLTMappingApplyID {
   VgprP3,
   VgprP4,
   VgprP5,
+  VgprPtr32,
+  VgprPtr64,
+  VgprPtr128,
   VgprV2S16,
   VgprV2S32,
   VgprB32,

From 52ff58c3300338876ae63126ce0d33331000f1ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?=
 <jmartinezcaamao@gmail.com>
Date: Thu, 19 Jun 2025 10:06:50 +0200
Subject: [PATCH 0903/1322] Revert "[CUDA][HIP] Add a __device__ version of
 std::__glibcxx_assert_fail()" (#144850)

Reverts llvm/llvm-project#136133
---
 clang/lib/Headers/CMakeLists.txt              |  1 -
 .../Headers/cuda_wrappers/bits/c++config.h    | 51 -------------------
 2 files changed, 52 deletions(-)
 delete mode 100644 clang/lib/Headers/cuda_wrappers/bits/c++config.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index c96d209c1fc0..c1c9d2e8c7b7 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -341,7 +341,6 @@ set(cuda_wrapper_files
 )
 
 set(cuda_wrapper_bits_files
-  cuda_wrappers/bits/c++config.h
   cuda_wrappers/bits/shared_ptr_base.h
   cuda_wrappers/bits/basic_string.h
   cuda_wrappers/bits/basic_string.tcc
diff --git a/clang/lib/Headers/cuda_wrappers/bits/c++config.h b/clang/lib/Headers/cuda_wrappers/bits/c++config.h
deleted file mode 100644
index eafa13a9cc64..000000000000
--- a/clang/lib/Headers/cuda_wrappers/bits/c++config.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// libstdc++ uses the non-constexpr function std::__glibcxx_assert_fail()
-// to trigger compilation errors when the __glibcxx_assert(cond) macro
-// is used in a constexpr context.
-// Compilation fails when using code from the libstdc++ (such as std::array) on
-// device code, since these assertions invoke a non-constexpr host function from
-// device code.
-//
-// To work around this issue, we declare our own device version of the function
-
-#ifndef __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG
-#define __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG
-
-#include_next <bits/c++config.h>
-
-#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
-_LIBCPP_BEGIN_NAMESPACE_STD
-#else
-namespace std {
-#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
-_GLIBCXX_BEGIN_NAMESPACE_VERSION
-#endif
-
-#ifdef _GLIBCXX_VERBOSE_ASSERT
-__attribute__((device, noreturn)) inline void
-__glibcxx_assert_fail(const char *file, int line, const char *function,
-                      const char *condition) noexcept {
-  if (file && function && condition)
-    __builtin_printf("%s:%d: %s: Assertion '%s' failed.\n", file, line,
-                     function, condition);
-  else if (function)
-    __builtin_printf("%s: Undefined behavior detected.\n", function);
-  __builtin_abort();
-}
-#endif
-
-#endif
-__attribute__((device, noreturn, __always_inline__,
-               __visibility__("default"))) inline void
-__glibcxx_assert_fail(...) noexcept {
-  __builtin_abort();
-}
-#ifdef _LIBCPP_END_NAMESPACE_STD
-_LIBCPP_END_NAMESPACE_STD
-#else
-#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
-_GLIBCXX_END_NAMESPACE_VERSION
-#endif
-} // namespace std
-#endif
-
-#endif

From 650b451d0065c8ea6a1f87e7fdc6d07648729549 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 19 Jun 2025 10:06:59 +0200
Subject: [PATCH 0904/1322] [libc++] Simplify the implementation of
 pointer_traits a bit (#142260)

---
 libcxx/include/__memory/pointer_traits.h | 118 +++++++----------------
 1 file changed, 34 insertions(+), 84 deletions(-)

diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h
index 879b387b9ad1..8c7f8dff1b76 100644
--- a/libcxx/include/__memory/pointer_traits.h
+++ b/libcxx/include/__memory/pointer_traits.h
@@ -16,11 +16,13 @@
 #include <__type_traits/conditional.h>
 #include <__type_traits/conjunction.h>
 #include <__type_traits/decay.h>
+#include <__type_traits/detected_or.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_class.h>
 #include <__type_traits/is_function.h>
 #include <__type_traits/is_void.h>
+#include <__type_traits/nat.h>
 #include <__type_traits/void_t.h>
 #include <__utility/declval.h>
 #include <__utility/forward.h>
@@ -34,67 +36,37 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-// clang-format off
-#define _LIBCPP_CLASS_TRAITS_HAS_XXX(NAME, PROPERTY)                                                                   \
-  template <class _Tp, class = void>                                                                                   \
-  struct NAME : false_type {};                                                                                         \
-  template <class _Tp>                                                                                                 \
-  struct NAME<_Tp, __void_t<typename _Tp::PROPERTY> > : true_type {}
-// clang-format on
-
-_LIBCPP_CLASS_TRAITS_HAS_XXX(__has_pointer, pointer);
-_LIBCPP_CLASS_TRAITS_HAS_XXX(__has_element_type, element_type);
-
-template <class _Ptr, bool = __has_element_type<_Ptr>::value>
-struct __pointer_traits_element_type {};
-
 template <class _Ptr>
-struct __pointer_traits_element_type<_Ptr, true> {
-  using type _LIBCPP_NODEBUG = typename _Ptr::element_type;
-};
+struct __pointer_traits_element_type_impl {};
 
 template <template <class, class...> class _Sp, class _Tp, class... _Args>
-struct __pointer_traits_element_type<_Sp<_Tp, _Args...>, true> {
-  using type _LIBCPP_NODEBUG = typename _Sp<_Tp, _Args...>::element_type;
-};
-
-template <template <class, class...> class _Sp, class _Tp, class... _Args>
-struct __pointer_traits_element_type<_Sp<_Tp, _Args...>, false> {
+struct __pointer_traits_element_type_impl<_Sp<_Tp, _Args...> > {
   using type _LIBCPP_NODEBUG = _Tp;
 };
 
-template <class _Tp, class = void>
-struct __has_difference_type : false_type {};
-
-template <class _Tp>
-struct __has_difference_type<_Tp, __void_t<typename _Tp::difference_type> > : true_type {};
-
-template <class _Ptr, bool = __has_difference_type<_Ptr>::value>
-struct __pointer_traits_difference_type {
-  using type _LIBCPP_NODEBUG = ptrdiff_t;
-};
+template <class _Ptr, class = void>
+struct __pointer_traits_element_type : __pointer_traits_element_type_impl<_Ptr> {};
 
 template <class _Ptr>
-struct __pointer_traits_difference_type<_Ptr, true> {
-  using type _LIBCPP_NODEBUG = typename _Ptr::difference_type;
+struct __pointer_traits_element_type<_Ptr, __void_t<typename _Ptr::element_type> > {
+  using type _LIBCPP_NODEBUG = typename _Ptr::element_type;
 };
 
 template <class _Tp, class _Up>
-struct __has_rebind {
-private:
-  template <class _Xp>
-  static false_type __test(...);
-  _LIBCPP_SUPPRESS_DEPRECATED_PUSH
-  template <class _Xp>
-  static true_type __test(typename _Xp::template rebind<_Up>* = 0);
-  _LIBCPP_SUPPRESS_DEPRECATED_POP
-
-public:
-  static const bool value = decltype(__test<_Tp>(0))::value;
+struct __pointer_traits_rebind_impl {
+  static_assert(false, "Cannot rebind pointer; did you forget to add a rebind member to your pointer?");
 };
 
-template <class _Tp, class _Up, bool = __has_rebind<_Tp, _Up>::value>
-struct __pointer_traits_rebind {
+template <template <class, class...> class _Sp, class _Tp, class... _Args, class _Up>
+struct __pointer_traits_rebind_impl<_Sp<_Tp, _Args...>, _Up> {
+  using type _LIBCPP_NODEBUG = _Sp<_Up, _Args...>;
+};
+
+template <class _Tp, class _Up, class = void>
+struct __pointer_traits_rebind : __pointer_traits_rebind_impl<_Tp, _Up> {};
+
+template <class _Tp, class _Up>
+struct __pointer_traits_rebind<_Tp, _Up, __void_t<typename _Tp::template rebind<_Up> > > {
 #ifndef _LIBCPP_CXX03_LANG
   using type _LIBCPP_NODEBUG = typename _Tp::template rebind<_Up>;
 #else
@@ -102,19 +74,8 @@ struct __pointer_traits_rebind {
 #endif
 };
 
-template <template <class, class...> class _Sp, class _Tp, class... _Args, class _Up>
-struct __pointer_traits_rebind<_Sp<_Tp, _Args...>, _Up, true> {
-#ifndef _LIBCPP_CXX03_LANG
-  using type _LIBCPP_NODEBUG = typename _Sp<_Tp, _Args...>::template rebind<_Up>;
-#else
-  using type _LIBCPP_NODEBUG = typename _Sp<_Tp, _Args...>::template rebind<_Up>::other;
-#endif
-};
-
-template <template <class, class...> class _Sp, class _Tp, class... _Args, class _Up>
-struct __pointer_traits_rebind<_Sp<_Tp, _Args...>, _Up, false> {
-  typedef _Sp<_Up, _Args...> type;
-};
+template <class _Tp>
+using __difference_type_member _LIBCPP_NODEBUG = typename _Tp::difference_type;
 
 template <class _Ptr, class = void>
 struct __pointer_traits_impl {};
@@ -123,7 +84,7 @@ template <class _Ptr>
 struct __pointer_traits_impl<_Ptr, __void_t<typename __pointer_traits_element_type<_Ptr>::type> > {
   typedef _Ptr pointer;
   typedef typename __pointer_traits_element_type<pointer>::type element_type;
-  typedef typename __pointer_traits_difference_type<pointer>::type difference_type;
+  using difference_type = __detected_or_t<ptrdiff_t, __difference_type_member, pointer>;
 
 #ifndef _LIBCPP_CXX03_LANG
   template <class _Up>
@@ -135,9 +96,6 @@ struct __pointer_traits_impl<_Ptr, __void_t<typename __pointer_traits_element_ty
   };
 #endif // _LIBCPP_CXX03_LANG
 
-private:
-  struct __nat {};
-
 public:
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static pointer
   pointer_to(__conditional_t<is_void<element_type>::value, __nat, element_type>& __r) {
@@ -164,9 +122,6 @@ struct pointer_traits<_Tp*> {
   };
 #endif
 
-private:
-  struct __nat {};
-
 public:
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static pointer
   pointer_to(__conditional_t<is_void<element_type>::value, __nat, element_type>& __r) _NOEXCEPT {
@@ -257,20 +212,26 @@ template <class _Tp>
 struct __pointer_of {};
 
 template <class _Tp>
-  requires(__has_pointer<_Tp>::value)
+concept __has_pointer_member = requires { typename _Tp::pointer; };
+
+template <class _Tp>
+concept __has_element_type_member = requires { typename _Tp::element_type; };
+
+template <class _Tp>
+  requires __has_pointer_member<_Tp>
 struct __pointer_of<_Tp> {
   using type _LIBCPP_NODEBUG = typename _Tp::pointer;
 };
 
 template <class _Tp>
-  requires(!__has_pointer<_Tp>::value && __has_element_type<_Tp>::value)
+  requires(!__has_pointer_member<_Tp> && __has_element_type_member<_Tp>)
 struct __pointer_of<_Tp> {
   using type _LIBCPP_NODEBUG = typename _Tp::element_type*;
 };
 
 template <class _Tp>
-  requires(!__has_pointer<_Tp>::value && !__has_element_type<_Tp>::value &&
-           __has_element_type<pointer_traits<_Tp>>::value)
+  requires(!__has_pointer_member<_Tp> && !__has_element_type_member<_Tp> &&
+           __has_element_type_member<pointer_traits<_Tp>>)
 struct __pointer_of<_Tp> {
   using type _LIBCPP_NODEBUG = typename pointer_traits<_Tp>::element_type*;
 };
@@ -278,19 +239,8 @@ struct __pointer_of<_Tp> {
 template <typename _Tp>
 using __pointer_of_t _LIBCPP_NODEBUG = typename __pointer_of<_Tp>::type;
 
-template <class _Tp, class _Up>
-struct __pointer_of_or {
-  using type _LIBCPP_NODEBUG = _Up;
-};
-
-template <class _Tp, class _Up>
-  requires requires { typename __pointer_of_t<_Tp>; }
-struct __pointer_of_or<_Tp, _Up> {
-  using type _LIBCPP_NODEBUG = __pointer_of_t<_Tp>;
-};
-
 template <typename _Tp, typename _Up>
-using __pointer_of_or_t _LIBCPP_NODEBUG = typename __pointer_of_or<_Tp, _Up>::type;
+using __pointer_of_or_t _LIBCPP_NODEBUG = __detected_or_t<_Up, __pointer_of_t, _Tp>;
 
 template <class _Smart>
 concept __resettable_smart_pointer = requires(_Smart __s) { __s.reset(); };

From 1ab0e7dd60e26ac7c7fc64a273485522f5c5ba02 Mon Sep 17 00:00:00 2001
From: "Jiachen (Yangyang) Wang"
 <130888597+WanderingAura@users.noreply.github.com>
Date: Thu, 19 Jun 2025 09:09:19 +0100
Subject: [PATCH 0905/1322] [LICM] Hoisting writeonly calls (#143799)

Adds support for hoisting `writeonly` calls in LICM.

This patch adds a missing optimization that allows hoisting of
`writeonly` function calls out of loops when it is safe to do so.
Previously, such calls were conservatively retained inside the loop
body, and the redundant calls were only reduced through unrolling,
relying on target-dependent heuristics.

Closes #143267

Testing:
- Modified previously negative tests for hoisting writeonly calls to be
instead positive
- Added test cases for hoisting of two writeonly calls where the
pointers do/do not alias
- Added a test case for not argmemonly writeonly calls.
---
 llvm/lib/Transforms/Scalar/LICM.cpp           | 133 ++++++++++-------
 .../test/CodeGen/AMDGPU/loop_exit_with_xor.ll |   2 +-
 llvm/test/Transforms/LICM/call-hoisting.ll    | 134 +++++++++++++++++-
 3 files changed, 211 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index a6bb8b8a21b0..cf84366c4200 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -186,6 +186,9 @@ static bool isSafeToExecuteUnconditionally(
     const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
     OptimizationRemarkEmitter *ORE, const Instruction *CtxI,
     AssumptionCache *AC, bool AllowSpeculation);
+static bool noConflictingReadWrites(Instruction *I, MemorySSA *MSSA,
+                                    AAResults *AA, Loop *CurLoop,
+                                    SinkAndHoistLICMFlags &Flags);
 static bool pointerInvalidatedByLoop(MemorySSA *MSSA, MemoryUse *MU,
                                      Loop *CurLoop, Instruction &I,
                                      SinkAndHoistLICMFlags &Flags,
@@ -1234,8 +1237,11 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
           /*InvariantGroup=*/false);
     }
 
-    // FIXME: This should use mod/ref information to see if we can hoist or
-    // sink the call.
+    if (Behavior.onlyWritesMemory()) {
+      // can hoist or sink if there are no conflicting read/writes to the
+      // memory location written to by the call.
+      return noConflictingReadWrites(CI, MSSA, AA, CurLoop, Flags);
+    }
 
     return false;
   } else if (auto *FI = dyn_cast<FenceInst>(&I)) {
@@ -1253,57 +1259,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
     // arbitrary number of reads in the loop.
     if (isOnlyMemoryAccess(SI, CurLoop, MSSAU))
       return true;
-    // If there are more accesses than the Promotion cap, then give up as we're
-    // not walking a list that long.
-    if (Flags.tooManyMemoryAccesses())
-      return false;
-
-    auto *SIMD = MSSA->getMemoryAccess(SI);
-    BatchAAResults BAA(*AA);
-    auto *Source = getClobberingMemoryAccess(*MSSA, BAA, Flags, SIMD);
-    // Make sure there are no clobbers inside the loop.
-    if (!MSSA->isLiveOnEntryDef(Source) &&
-           CurLoop->contains(Source->getBlock()))
-      return false;
-
-    // If there are interfering Uses (i.e. their defining access is in the
-    // loop), or ordered loads (stored as Defs!), don't move this store.
-    // Could do better here, but this is conservatively correct.
-    // TODO: Cache set of Uses on the first walk in runOnLoop, update when
-    // moving accesses. Can also extend to dominating uses.
-    for (auto *BB : CurLoop->getBlocks())
-      if (auto *Accesses = MSSA->getBlockAccesses(BB)) {
-        for (const auto &MA : *Accesses)
-          if (const auto *MU = dyn_cast<MemoryUse>(&MA)) {
-            auto *MD = getClobberingMemoryAccess(*MSSA, BAA, Flags,
-                const_cast<MemoryUse *>(MU));
-            if (!MSSA->isLiveOnEntryDef(MD) &&
-                CurLoop->contains(MD->getBlock()))
-              return false;
-            // Disable hoisting past potentially interfering loads. Optimized
-            // Uses may point to an access outside the loop, as getClobbering
-            // checks the previous iteration when walking the backedge.
-            // FIXME: More precise: no Uses that alias SI.
-            if (!Flags.getIsSink() && !MSSA->dominates(SIMD, MU))
-              return false;
-          } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) {
-            if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) {
-              (void)LI; // Silence warning.
-              assert(!LI->isUnordered() && "Expected unordered load");
-              return false;
-            }
-            // Any call, while it may not be clobbering SI, it may be a use.
-            if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) {
-              // Check if the call may read from the memory location written
-              // to by SI. Check CI's attributes and arguments; the number of
-              // such checks performed is limited above by NoOfMemAccTooLarge.
-              ModRefInfo MRI = BAA.getModRefInfo(CI, MemoryLocation::get(SI));
-              if (isModOrRefSet(MRI))
-                return false;
-            }
-          }
-      }
-    return true;
+    return noConflictingReadWrites(SI, MSSA, AA, CurLoop, Flags);
   }
 
   assert(!I.mayReadOrWriteMemory() && "unhandled aliasing");
@@ -2330,6 +2286,77 @@ collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L) {
   return Result;
 }
 
+// For a given store instruction or writeonly call instruction, this function
+// checks that there are no read or writes that conflict with the memory
+// access in the instruction
+static bool noConflictingReadWrites(Instruction *I, MemorySSA *MSSA,
+                                    AAResults *AA, Loop *CurLoop,
+                                    SinkAndHoistLICMFlags &Flags) {
+  assert(isa<CallInst>(*I) || isa<StoreInst>(*I));
+  // If there are more accesses than the Promotion cap, then give up as we're
+  // not walking a list that long.
+  if (Flags.tooManyMemoryAccesses())
+    return false;
+
+  auto *IMD = MSSA->getMemoryAccess(I);
+  BatchAAResults BAA(*AA);
+  auto *Source = getClobberingMemoryAccess(*MSSA, BAA, Flags, IMD);
+  // Make sure there are no clobbers inside the loop.
+  if (!MSSA->isLiveOnEntryDef(Source) && CurLoop->contains(Source->getBlock()))
+    return false;
+
+  // If there are interfering Uses (i.e. their defining access is in the
+  // loop), or ordered loads (stored as Defs!), don't move this store.
+  // Could do better here, but this is conservatively correct.
+  // TODO: Cache set of Uses on the first walk in runOnLoop, update when
+  // moving accesses. Can also extend to dominating uses.
+  for (auto *BB : CurLoop->getBlocks()) {
+    auto *Accesses = MSSA->getBlockAccesses(BB);
+    if (!Accesses)
+      continue;
+    for (const auto &MA : *Accesses)
+      if (const auto *MU = dyn_cast<MemoryUse>(&MA)) {
+        auto *MD = getClobberingMemoryAccess(*MSSA, BAA, Flags,
+                                             const_cast<MemoryUse *>(MU));
+        if (!MSSA->isLiveOnEntryDef(MD) && CurLoop->contains(MD->getBlock()))
+          return false;
+        // Disable hoisting past potentially interfering loads. Optimized
+        // Uses may point to an access outside the loop, as getClobbering
+        // checks the previous iteration when walking the backedge.
+        // FIXME: More precise: no Uses that alias I.
+        if (!Flags.getIsSink() && !MSSA->dominates(IMD, MU))
+          return false;
+      } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) {
+        if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) {
+          (void)LI; // Silence warning.
+          assert(!LI->isUnordered() && "Expected unordered load");
+          return false;
+        }
+        // Any call, while it may not be clobbering I, it may be a use.
+        if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) {
+          // Check if the call may read from the memory location written
+          // to by I. Check CI's attributes and arguments; the number of
+          // such checks performed is limited above by NoOfMemAccTooLarge.
+          if (auto *SI = dyn_cast<StoreInst>(I)) {
+            ModRefInfo MRI = BAA.getModRefInfo(CI, MemoryLocation::get(SI));
+            if (isModOrRefSet(MRI))
+              return false;
+          } else {
+            auto *SCI = cast<CallInst>(I);
+            // If the instruction we are wanting to hoist is also a call
+            // instruction then we need not check mod/ref info with itself
+            if (SCI == CI)
+              continue;
+            ModRefInfo MRI = BAA.getModRefInfo(CI, SCI);
+            if (isModOrRefSet(MRI))
+              return false;
+          }
+        }
+      }
+  }
+  return true;
+}
+
 static bool pointerInvalidatedByLoop(MemorySSA *MSSA, MemoryUse *MU,
                                      Loop *CurLoop, Instruction &I,
                                      SinkAndHoistLICMFlags &Flags,
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
index e37dcf60506b..2864e0554a27 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -64,6 +64,7 @@ define void @doesnt_need_and(i32 %arg) {
 ; GCN-LABEL: doesnt_need_and:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], s4
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:  .LBB1_1: ; %loop
@@ -71,7 +72,6 @@ define void @doesnt_need_and(i32 %arg) {
 ; GCN-NEXT:    s_add_i32 s6, s6, 1
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], s4
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_cbranch_execnz .LBB1_1
 ; GCN-NEXT:  ; %bb.2: ; %loopexit
diff --git a/llvm/test/Transforms/LICM/call-hoisting.ll b/llvm/test/Transforms/LICM/call-hoisting.ll
index 7124b4e445eb..aa8c8bbed550 100644
--- a/llvm/test/Transforms/LICM/call-hoisting.ll
+++ b/llvm/test/Transforms/LICM/call-hoisting.ll
@@ -118,14 +118,16 @@ exit:
 
 declare void @store(i32 %val, ptr %p) argmemonly writeonly nounwind
 
+; loop invariant calls to writeonly functions such as the above
+; should be hoisted
 define void @test(ptr %loc) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr [[LOC:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
@@ -150,10 +152,10 @@ define void @test_multiexit(ptr %loc, i1 %earlycnd) {
 ; CHECK-LABEL: define void @test_multiexit(
 ; CHECK-SAME: ptr [[LOC:%.*]], i1 [[EARLYCND:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ]
-; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    br i1 [[EARLYCND]], label %[[EXIT1:.*]], label %[[BACKEDGE]]
 ; CHECK:       [[BACKEDGE]]:
@@ -183,6 +185,97 @@ exit2:
   ret void
 }
 
+; cannot be hoisted because the two pointers can alias one another
+define void @neg_two_pointer(ptr %loc, ptr %otherloc) {
+; CHECK-LABEL: define void @neg_two_pointer(
+; CHECK-SAME: ptr [[LOC:%.*]], ptr [[OTHERLOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    call void @store(i32 1, ptr [[OTHERLOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @store(i32 0, ptr %loc)
+  call void @store(i32 1, ptr %otherloc)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; hoisted due to pointers not aliasing
+define void @two_pointer_noalias(ptr noalias %loc, ptr noalias %otherloc) {
+; CHECK-LABEL: define void @two_pointer_noalias(
+; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[OTHERLOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    call void @store(i32 1, ptr [[OTHERLOC]])
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @store(i32 0, ptr %loc)
+  call void @store(i32 1, ptr %otherloc)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
+; when there's a conflicting read, store call should not be hoisted
+define void @neg_conflicting_read(ptr noalias %loc, ptr noalias %otherloc) {
+; CHECK-LABEL: define void @neg_conflicting_read(
+; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[OTHERLOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @load(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @store(i32 0, ptr %loc)
+  br label %loop
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @load(i32 0, ptr %loc)
+  call void @store(i32 0, ptr %loc)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret void
+}
+
 define void @neg_lv_value(ptr %loc) {
 ; CHECK-LABEL: define void @neg_lv_value(
 ; CHECK-SAME: ptr [[LOC:%.*]]) {
@@ -406,14 +499,47 @@ exit:
   ret void
 }
 
-define void @neg_not_argmemonly(ptr %loc) {
+; when the call is not argmemonly and is not the only memory access we
+; do not hoist
+define void @neg_not_argmemonly(ptr %loc, ptr %loc2) {
 ; CHECK-LABEL: define void @neg_not_argmemonly(
-; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-SAME: ptr [[LOC:%.*]], ptr [[LOC2:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    call void @not_argmemonly(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    call void @load(i32 0, ptr [[LOC2]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @not_argmemonly(i32 0, ptr %loc)
+  call void @load(i32 0, ptr %loc2)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; when the call is not argmemonly and is only memory access we hoist it
+define void @not_argmemonly_hoisted(ptr %loc, ptr %loc2) {
+; CHECK-LABEL: define void @not_argmemonly_hoisted(
+; CHECK-SAME: ptr [[LOC:%.*]], ptr [[LOC2:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    call void @not_argmemonly(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]

From 0e1aab1ec833d7f8e9897b0940c634385036fdee Mon Sep 17 00:00:00 2001
From: Pierre van Houtryve <pierre.vanhoutryve@amd.com>
Date: Thu, 19 Jun 2025 10:13:55 +0200
Subject: [PATCH 0906/1322] [AMDGPU] Improve test coverage for G_INTTOPTR and
 G_PTRTOINT (#142603)

Test P0 through P6 + P8 for both S/VGPRs.
---
 .../GlobalISel/regbankselect-inttoptr.mir     | 236 +++++++++++++++++-
 .../GlobalISel/regbankselect-ptrtoint.mir     | 232 ++++++++++++++++-
 2 files changed, 458 insertions(+), 10 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir
index 053aede615f8..42600d7d0dd7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir
@@ -3,13 +3,141 @@
 # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
 
 ---
-name: inttoptr_s
+name: inttoptr_s_p0
 legalized: true
 
 body: |
   bb.0:
     liveins: $sgpr0_sgpr1
-    ; CHECK-LABEL: name: inttoptr_s
+    ; CHECK-LABEL: name: inttoptr_s_p0
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p0) = G_INTTOPTR [[COPY]](s64)
+    %0:_(s64) = COPY $sgpr0_sgpr1
+    %1:_(p0) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_v_p0
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: inttoptr_v_p0
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p0) = G_INTTOPTR [[COPY]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(p0) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_s_p1
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: inttoptr_s_p1
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p1) = G_INTTOPTR [[COPY]](s64)
+    %0:_(s64) = COPY $sgpr0_sgpr1
+    %1:_(p1) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_v_p1
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: inttoptr_v_p1
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p1) = G_INTTOPTR [[COPY]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(p1) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_s_p2
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: inttoptr_s_p2
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p2) = G_INTTOPTR [[COPY]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(p2) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_v_p2
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: inttoptr_v_p2
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p2) = G_INTTOPTR [[COPY]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p2) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_s_p3
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: inttoptr_s_p3
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p3) = G_INTTOPTR [[COPY]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(p3) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_v_p3
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: inttoptr_v_p3
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p3) = G_INTTOPTR [[COPY]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p3) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_s_p4
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: inttoptr_s_p4
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
@@ -19,17 +147,113 @@ body: |
 ...
 
 ---
-name: inttoptr_v
+name: inttoptr_v_p4
 legalized: true
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1
-    ; CHECK-LABEL: name: inttoptr_v
+    ; CHECK-LABEL: name: inttoptr_v_p4
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p0) = G_INTTOPTR [[COPY]](s64)
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p4) = G_INTTOPTR [[COPY]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
-    %1:_(p0) = G_INTTOPTR %0
+    %1:_(p4) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_s_p5
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: inttoptr_s_p5
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p5) = G_INTTOPTR [[COPY]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(p5) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_v_p5
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: inttoptr_v_p5
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p5) = G_INTTOPTR [[COPY]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_s_p6
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: inttoptr_s_p6
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p6) = G_INTTOPTR [[COPY]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(p6) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_v_p6
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: inttoptr_v_p6
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p6) = G_INTTOPTR [[COPY]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p6) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_s_p8
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; CHECK-LABEL: name: inttoptr_s_p8
+    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:sgpr(p8) = G_INTTOPTR [[COPY]](s128)
+    %0:_(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(p8) = G_INTTOPTR %0
+...
+
+---
+name: inttoptr_v_p8
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-LABEL: name: inttoptr_v_p8
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:vgpr(p8) = G_INTTOPTR [[COPY]](s128)
+    %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:_(p8) = G_INTTOPTR %0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
index 2db1a5b1d2ba..9240c9f6df40 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
@@ -3,13 +3,45 @@
 # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
 
 ---
-name: ptrtoint_s
+name: ptrtoint_s_p0
 legalized: true
 
 body: |
   bb.0:
     liveins: $sgpr0_sgpr1
-    ; CHECK-LABEL: name: ptrtoint_s
+    ; CHECK-LABEL: name: ptrtoint_s_p0
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:sgpr(s64) = G_PTRTOINT [[COPY]](p0)
+    %0:_(p0) = COPY $sgpr0_sgpr1
+    %1:_(s64) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_v_p0
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: ptrtoint_v_p0
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:vgpr(s64) = G_PTRTOINT [[COPY]](p0)
+    %0:_(p0) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_s_p1
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: ptrtoint_s_p1
     ; CHECK: liveins: $sgpr0_sgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
@@ -19,13 +51,13 @@ body: |
 ...
 
 ---
-name: ptrtoint_v
+name: ptrtoint_v_p1
 legalized: true
 
 body: |
   bb.0:
     liveins: $vgpr0_vgpr1
-    ; CHECK-LABEL: name: ptrtoint_v
+    ; CHECK-LABEL: name: ptrtoint_v_p1
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -33,3 +65,195 @@ body: |
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_PTRTOINT %0
 ...
+
+---
+name: ptrtoint_s_p2
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: ptrtoint_s_p2
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p2) = COPY $sgpr0
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:sgpr(s32) = G_PTRTOINT [[COPY]](p2)
+    %0:_(p2) = COPY $sgpr0
+    %1:_(s32) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_v_p2
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: ptrtoint_v_p2
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p2) = COPY $vgpr0
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:vgpr(s32) = G_PTRTOINT [[COPY]](p2)
+    %0:_(p2) = COPY $vgpr0
+    %1:_(s32) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_s_p3
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: ptrtoint_s_p3
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:sgpr(s32) = G_PTRTOINT [[COPY]](p3)
+    %0:_(p3) = COPY $sgpr0
+    %1:_(s32) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_v_p3
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: ptrtoint_v_p3
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:vgpr(s32) = G_PTRTOINT [[COPY]](p3)
+    %0:_(p3) = COPY $vgpr0
+    %1:_(s32) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_s_p4
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: ptrtoint_s_p4
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:sgpr(s64) = G_PTRTOINT [[COPY]](p4)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(s64) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_v_p4
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: ptrtoint_v_p4
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:vgpr(s64) = G_PTRTOINT [[COPY]](p4)
+    %0:_(p4) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_s_p5
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: ptrtoint_s_p5
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sgpr0
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:sgpr(s32) = G_PTRTOINT [[COPY]](p5)
+    %0:_(p5) = COPY $sgpr0
+    %1:_(s32) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_v_p5
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: ptrtoint_v_p5
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY $vgpr0
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:vgpr(s32) = G_PTRTOINT [[COPY]](p5)
+    %0:_(p5) = COPY $vgpr0
+    %1:_(s32) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_s_p6
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+    ; CHECK-LABEL: name: ptrtoint_s_p6
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p6) = COPY $sgpr0
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:sgpr(s32) = G_PTRTOINT [[COPY]](p6)
+    %0:_(p6) = COPY $sgpr0
+    %1:_(s32) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_v_p6
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: ptrtoint_v_p6
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p6) = COPY $vgpr0
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:vgpr(s32) = G_PTRTOINT [[COPY]](p6)
+    %0:_(p6) = COPY $vgpr0
+    %1:_(s32) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_s_p8
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; CHECK-LABEL: name: ptrtoint_s_p8
+    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:sgpr(s128) = G_PTRTOINT [[COPY]](p8)
+    %0:_(p8) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(s128) = G_PTRTOINT %0
+...
+
+---
+name: ptrtoint_v_p8
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-LABEL: name: ptrtoint_v_p8
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p8) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:vgpr(s128) = G_PTRTOINT [[COPY]](p8)
+    %0:_(p8) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:_(s128) = G_PTRTOINT %0
+...

From 62fe5e428acc2c5ef9b144c5737d55b17b55feac Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com>
Date: Thu, 19 Jun 2025 13:47:37 +0530
Subject: [PATCH 0907/1322] [NFC][AMDGPU] print more info when debugging
 SIInsertWaitcnts pass (#144629)

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 112 ++++++++++++++------
 1 file changed, 80 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ca8e3244edd1..f7b88bf2d5eb 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -104,25 +104,38 @@ struct HardwareLimits {
   unsigned KmcntMax;     // gfx12+ only.
 };
 
+#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
+  DECL(VMEM_ACCESS)              /* vmem read & write */                       \
+  DECL(VMEM_READ_ACCESS)         /* vmem read */                               \
+  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
+  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
+  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
+  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
+  DECL(LDS_ACCESS)               /* lds read & write */                        \
+  DECL(GDS_ACCESS)               /* gds read & write */                        \
+  DECL(SQ_MESSAGE)               /* send message */                            \
+  DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
+  DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
+  DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
+  DECL(EXP_POS_ACCESS)           /* write to export position */                \
+  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */               \
+  DECL(VMW_GPR_LOCK)             /* vmem write holding on its data src */      \
+  DECL(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */
+
+// clang-format off
+#define AMDGPU_EVENT_ENUM(Name) Name,
 enum WaitEventType {
-  VMEM_ACCESS,              // vector-memory read & write
-  VMEM_READ_ACCESS,         // vector-memory read
-  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
-  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
-  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
-  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
-  LDS_ACCESS,               // lds read & write
-  GDS_ACCESS,               // gds read & write
-  SQ_MESSAGE,               // send message
-  SMEM_ACCESS,              // scalar-memory read & write
-  EXP_GPR_LOCK,             // export holding on its data src
-  GDS_GPR_LOCK,             // GDS holding on its data and addr src
-  EXP_POS_ACCESS,           // write to export position
-  EXP_PARAM_ACCESS,         // write to export parameter
-  VMW_GPR_LOCK,             // vector-memory write holding on its data src
-  EXP_LDS_ACCESS,           // read by ldsdir counting as export
-  NUM_WAIT_EVENTS,
+  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
+  NUM_WAIT_EVENTS
 };
+#undef AMDGPU_EVENT_ENUM
+
+#define AMDGPU_EVENT_NAME(Name) #Name,
+static constexpr StringLiteral WaitEventTypeName[] = {
+  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
+};
+#undef AMDGPU_EVENT_NAME
+// clang-format on
 
 // The mapping is:
 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
@@ -1100,6 +1113,20 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
     }
     OS << '\n';
   }
+
+  OS << "Pending Events: ";
+  if (hasPendingEvent()) {
+    ListSeparator LS;
+    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
+      if (hasPendingEvent((WaitEventType)I)) {
+        OS << LS << WaitEventTypeName[I];
+      }
+    }
+  } else {
+    OS << "none";
+  }
+  OS << '\n';
+
   OS << '\n';
 }
 
@@ -1265,10 +1292,15 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
   MachineInstr *WaitcntInstr = nullptr;
   MachineInstr *WaitcntVsCntInstr = nullptr;
 
+  LLVM_DEBUG(dbgs() << "PreGFX12::applyPreexistingWaitcnt at: " << *It);
+
   for (auto &II :
        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
-    if (II.isMetaInstruction())
+    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
+    if (II.isMetaInstruction()) {
+      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
       continue;
+    }
 
     unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
     bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
@@ -1320,9 +1352,9 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
 
     LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                    ? dbgs()
-                         << "applyPreexistingWaitcnt\n"
+                         << "applied pre-existing waitcnt\n"
                          << "New Instr at block end: " << *WaitcntInstr << '\n'
-                   : dbgs() << "applyPreexistingWaitcnt\n"
+                   : dbgs() << "applied pre-existing waitcnt\n"
                             << "Old Instr: " << *It
                             << "New Instr: " << *WaitcntInstr << '\n');
   }
@@ -1336,10 +1368,10 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
     Wait.StoreCnt = ~0u;
 
     LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
-                   ? dbgs() << "applyPreexistingWaitcnt\n"
+                   ? dbgs() << "applied pre-existing waitcnt\n"
                             << "New Instr at block end: " << *WaitcntVsCntInstr
                             << '\n'
-                   : dbgs() << "applyPreexistingWaitcnt\n"
+                   : dbgs() << "applied pre-existing waitcnt\n"
                             << "Old Instr: " << *It
                             << "New Instr: " << *WaitcntVsCntInstr << '\n');
   }
@@ -1413,10 +1445,15 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
   MachineInstr *CombinedStoreDsCntInstr = nullptr;
   MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
 
+  LLVM_DEBUG(dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: " << *It);
+
   for (auto &II :
        make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
-    if (II.isMetaInstruction())
+    LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
+    if (II.isMetaInstruction()) {
+      LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
       continue;
+    }
 
     MachineInstr **UpdatableInstr;
 
@@ -1486,10 +1523,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       Wait.DsCnt = ~0u;
 
       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
-                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                     ? dbgs() << "applied pre-existing waitcnt\n"
                               << "New Instr at block end: "
                               << *CombinedLoadDsCntInstr << '\n'
-                     : dbgs() << "applyPreexistingWaitcnt\n"
+                     : dbgs() << "applied pre-existing waitcnt\n"
                               << "Old Instr: " << *It << "New Instr: "
                               << *CombinedLoadDsCntInstr << '\n');
     } else {
@@ -1511,10 +1548,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       Wait.DsCnt = ~0u;
 
       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
-                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                     ? dbgs() << "applied pre-existing waitcnt\n"
                               << "New Instr at block end: "
                               << *CombinedStoreDsCntInstr << '\n'
-                     : dbgs() << "applyPreexistingWaitcnt\n"
+                     : dbgs() << "applied pre-existing waitcnt\n"
                               << "Old Instr: " << *It << "New Instr: "
                               << *CombinedStoreDsCntInstr << '\n');
     } else {
@@ -1570,10 +1607,10 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       setNoWait(Wait, CT);
 
       LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
-                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                     ? dbgs() << "applied pre-existing waitcnt\n"
                               << "New Instr at block end: " << *WaitInstrs[CT]
                               << '\n'
-                     : dbgs() << "applyPreexistingWaitcnt\n"
+                     : dbgs() << "applied pre-existing waitcnt\n"
                               << "Old Instr: " << *It
                               << "New Instr: " << *WaitInstrs[CT] << '\n');
     } else {
@@ -2306,7 +2343,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
   bool Modified = false;
 
   LLVM_DEBUG({
-    dbgs() << "*** Block" << Block.getNumber() << " ***";
+    dbgs() << "*** Begin Block: ";
+    Block.printName(dbgs());
     ScoreBrackets.dump();
   });
 
@@ -2437,6 +2475,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
   Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                               OldWaitcntInstr);
 
+  LLVM_DEBUG({
+    dbgs() << "*** End Block: ";
+    Block.printName(dbgs());
+    ScoreBrackets.dump();
+  });
+
   return Modified;
 }
 
@@ -2699,8 +2743,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
           BlockInfo &SuccBI = SuccBII->second;
           if (!SuccBI.Incoming) {
             SuccBI.Dirty = true;
-            if (SuccBII <= BII)
+            if (SuccBII <= BII) {
+              LLVM_DEBUG(dbgs() << "repeat on backedge\n");
               Repeat = true;
+            }
             if (!MoveBracketsToSucc) {
               MoveBracketsToSucc = &SuccBI;
             } else {
@@ -2708,8 +2754,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
             }
           } else if (SuccBI.Incoming->merge(*Brackets)) {
             SuccBI.Dirty = true;
-            if (SuccBII <= BII)
+            if (SuccBII <= BII) {
+              LLVM_DEBUG(dbgs() << "repeat on backedge\n");
               Repeat = true;
+            }
           }
         }
         if (MoveBracketsToSucc)

From 8f82c027c8969d965c43909da639e7790af19956 Mon Sep 17 00:00:00 2001
From: Pierre van Houtryve <pierre.vanhoutryve@amd.com>
Date: Thu, 19 Jun 2025 10:21:17 +0200
Subject: [PATCH 0908/1322] [AMDGPU] New RegBankSelect: Add rules for
 `G_PTRTOINT` and `G_INTTOPTR` (#142604)

---
 .../Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 16 +++++++++++++++-
 .../AMDGPU/GlobalISel/regbankselect-inttoptr.mir |  1 +
 .../AMDGPU/GlobalISel/regbankselect-ptrtoint.mir |  1 +
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 5402129e4188..db3f6bd360b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -718,7 +718,21 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
       .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}});
 
-  addRulesForGOpcs({G_INTTOPTR}).Any({{UniP4}, {{SgprP4}, {Sgpr64}}});
+  addRulesForGOpcs({G_INTTOPTR})
+      .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
+      .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}})
+      .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}})
+      .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}})
+      .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}})
+      .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}});
+
+  addRulesForGOpcs({G_PTRTOINT})
+      .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}})
+      .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}})
+      .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}})
+      .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}})
+      .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}})
+      .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}});
 
   addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir
index 42600d7d0dd7..aec68e5a5b71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir
@@ -1,6 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
 # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s
 
 ---
 name: inttoptr_s_p0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
index 9240c9f6df40..31cbae9ab47f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
@@ -1,6 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
 # RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s
 
 ---
 name: ptrtoint_s_p0

From 681db064d221e9eef024ce0aef6165caa37fbfd2 Mon Sep 17 00:00:00 2001
From: Kunqiu Chen <camsyn@foxmail.com>
Date: Thu, 19 Jun 2025 16:25:51 +0800
Subject: [PATCH 0909/1322] [TSan] Make Shadow/Meta region inclusive-exclusive
 (#144647)

This commit changes the interval shadow/meta address check from
inclusive-inclusive ( $[\mathrm{start}, \mathrm{end}]$ ) to
inclusive-exclusive ( $[\mathrm{start}, \mathrm{end})$ ), to resolve the
ambiguity of the end point address. This also aligns the logic with the
check for `isAppMem` (i.e., inclusive-exclusive), ensuring consistent
behavior across all memory classifications.

1. The `isShadowMem` and `isMetaMem` checks previously used an
inclusive-inclusive interval, i.e., $[\mathrm{start}, \mathrm{end}]$,
which could lead to a boundary address being incorrectly classified as
both Shadow and Meta memory, e.g., 0x3000_0000_0000 in
`Mapping48AddressSpace`.
- What's more, even when Shadow doesn't border Meta, `ShadowMem::end`
cannot be considered a legal shadow address, as TSan protects the gap,
i.e., `ProtectRange(ShadowEnd(), MetaShadowBeg());`

2. `ShadowMem`/`MetaMem` addresses are derived from `AppMem` using an
affine-like transformation (`* factor + bias`). This transformation
includes two extra modifications: high- and low-order bits are masked
out, and for Shadow Memory, an optional XOR operation may be applied to
prevent conflicts with certain AppMem regions.
- Given that all AppMem regions are defined as inclusive-exclusive
intervals, $[\mathrm{start}, \mathrm{end})$, the resulting Shadow/Meta
regions should logically also be inclusive-exclusive.

Note: This change is purely for improving code consistency and should
have no functional impact. In practice, the exact endpoint addresses of
the Shadow/Meta regions are generally not reached.
---
 compiler-rt/lib/tsan/rtl/tsan_platform.h     | 4 ++--
 compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform.h b/compiler-rt/lib/tsan/rtl/tsan_platform.h
index 354f6da6a64a..ada594bc11fc 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform.h
@@ -931,7 +931,7 @@ bool IsAppMem(uptr mem) { return SelectMapping<IsAppMemImpl>(mem); }
 struct IsShadowMemImpl {
   template <typename Mapping>
   static bool Apply(uptr mem) {
-    return mem >= Mapping::kShadowBeg && mem <= Mapping::kShadowEnd;
+    return mem >= Mapping::kShadowBeg && mem < Mapping::kShadowEnd;
   }
 };
 
@@ -943,7 +943,7 @@ bool IsShadowMem(RawShadow *p) {
 struct IsMetaMemImpl {
   template <typename Mapping>
   static bool Apply(uptr mem) {
-    return mem >= Mapping::kMetaShadowBeg && mem <= Mapping::kMetaShadowEnd;
+    return mem >= Mapping::kMetaShadowBeg && mem < Mapping::kMetaShadowEnd;
   }
 };
 
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp
index cf07686d968d..dbdc6359d92a 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp
@@ -525,7 +525,7 @@ SECOND:
 void ShadowSet(RawShadow* p, RawShadow* end, RawShadow v) {
   DCHECK_LE(p, end);
   DCHECK(IsShadowMem(p));
-  DCHECK(IsShadowMem(end));
+  DCHECK(p == end || IsShadowMem(end - 1));
   UNUSED const uptr kAlign = kShadowCnt * kShadowSize;
   DCHECK_EQ(reinterpret_cast<uptr>(p) % kAlign, 0);
   DCHECK_EQ(reinterpret_cast<uptr>(end) % kAlign, 0);
@@ -669,7 +669,7 @@ void MemoryAccessRangeT(ThreadState* thr, uptr pc, uptr addr, uptr size) {
   RawShadow* shadow_mem = MemToShadow(addr);
   DPrintf2("#%d: MemoryAccessRange: @%p %p size=%d is_read=%d\n", thr->tid,
            (void*)pc, (void*)addr, (int)size, is_read);
-
+  DCHECK_NE(size, 0);
 #if SANITIZER_DEBUG
   if (!IsAppMem(addr)) {
     Printf("Access to non app mem start: %p\n", (void*)addr);

From 584cc376870505821b5ff0b0e80be85ee563ff0c Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 19 Jun 2025 10:37:18 +0200
Subject: [PATCH 0910/1322] [libc++] Move std::abs into __math/abs.h (#139586)

`template <class = int>` is also added to our implementations to avoid
an ambiguity between the libc's version and our version when both are
visible.

This avoids including `<stdlib.h>` in `<math.h>`.
---
 libcxx/include/__math/abs.h                   | 24 +++++++++++++++++++
 libcxx/include/math.h                         | 11 +--------
 libcxx/include/stdlib.h                       | 19 ++-------------
 .../test/std/numerics/c.math/abs.verify.cpp   |  4 ++--
 4 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/libcxx/include/__math/abs.h b/libcxx/include/__math/abs.h
index fc3bf3a2c7c3..b780159f11eb 100644
--- a/libcxx/include/__math/abs.h
+++ b/libcxx/include/__math/abs.h
@@ -39,6 +39,30 @@ template <class _A1, __enable_if_t<is_integral<_A1>::value, int> = 0>
   return __builtin_fabs((double)__x);
 }
 
+// abs
+
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI inline float abs(float __x) _NOEXCEPT { return __builtin_fabsf(__x); }
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI inline double abs(double __x) _NOEXCEPT { return __builtin_fabs(__x); }
+
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI inline long double abs(long double __x) _NOEXCEPT {
+  return __builtin_fabsl(__x);
+}
+
+template <class = int>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI inline int abs(int __x) _NOEXCEPT {
+  return __builtin_abs(__x);
+}
+
+template <class = int>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI inline long abs(long __x) _NOEXCEPT {
+  return __builtin_labs(__x);
+}
+
+template <class = int>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI inline long long abs(long long __x) _NOEXCEPT {
+  return __builtin_llabs(__x);
+}
+
 } // namespace __math
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/math.h b/libcxx/include/math.h
index de2dacde282c..929bef638504 100644
--- a/libcxx/include/math.h
+++ b/libcxx/include/math.h
@@ -378,9 +378,7 @@ extern "C++" {
 #      include <__math/traits.h>
 #      include <__math/trigonometric_functions.h>
 #      include <__type_traits/enable_if.h>
-#      include <__type_traits/is_floating_point.h>
 #      include <__type_traits/is_integral.h>
-#      include <stdlib.h>
 
 // fpclassify relies on implementation-defined constants, so we can't move it to a detail header
 _LIBCPP_BEGIN_NAMESPACE_STD
@@ -431,19 +429,12 @@ using std::__math::isnormal;
 using std::__math::isunordered;
 #      endif // _LIBCPP_MSVCRT
 
-// abs
-//
-// handled in stdlib.h
-
-// div
-//
-// handled in stdlib.h
-
 // We have to provide double overloads for <math.h> to work on platforms that don't provide the full set of math
 // functions. To make the overload set work with multiple functions that take the same arguments, we make our overloads
 // templates. Functions are preferred over function templates during overload resolution, which means that our overload
 // will only be selected when the C library doesn't provide one.
 
+using std::__math::abs;
 using std::__math::acos;
 using std::__math::acosh;
 using std::__math::asin;
diff --git a/libcxx/include/stdlib.h b/libcxx/include/stdlib.h
index 39550f36bb6e..8dfdfa416f08 100644
--- a/libcxx/include/stdlib.h
+++ b/libcxx/include/stdlib.h
@@ -106,23 +106,8 @@ extern "C++" {
 #        undef llabs
 #      endif
 
-// MSVCRT already has the correct prototype in <stdlib.h> if __cplusplus is defined
-#      if !defined(_LIBCPP_MSVCRT)
-[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI long abs(long __x) _NOEXCEPT { return __builtin_labs(__x); }
-[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI long long abs(long long __x) _NOEXCEPT { return __builtin_llabs(__x); }
-#      endif // !defined(_LIBCPP_MSVCRT)
-
-[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI float abs(float __lcpp_x) _NOEXCEPT {
-  return __builtin_fabsf(__lcpp_x); // Use builtins to prevent needing math.h
-}
-
-[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI double abs(double __lcpp_x) _NOEXCEPT {
-  return __builtin_fabs(__lcpp_x);
-}
-
-[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI long double abs(long double __lcpp_x) _NOEXCEPT {
-  return __builtin_fabsl(__lcpp_x);
-}
+#      include <__math/abs.h>
+using std::__math::abs;
 
 // div
 
diff --git a/libcxx/test/std/numerics/c.math/abs.verify.cpp b/libcxx/test/std/numerics/c.math/abs.verify.cpp
index dec80762b214..cc30112f0c31 100644
--- a/libcxx/test/std/numerics/c.math/abs.verify.cpp
+++ b/libcxx/test/std/numerics/c.math/abs.verify.cpp
@@ -13,10 +13,10 @@ void f() {
     (void)std::abs(ui); // expected-error {{call to 'abs' is ambiguous}}
 
     unsigned char uc = -5;
-    (void)std::abs(uc); // expected-warning {{taking the absolute value of unsigned type 'unsigned char' has no effect}}
+    (void)std::abs(uc); // expected-warning 0-1 {{taking the absolute value of unsigned type 'unsigned char' has no effect}}
 
     unsigned short us = -5;
-    (void)std::abs(us); // expected-warning {{taking the absolute value of unsigned type 'unsigned short' has no effect}}
+    (void)std::abs(us); // expected-warning 0-1 {{taking the absolute value of unsigned type 'unsigned short' has no effect}}
 
     unsigned long ul = -5;
     (void)std::abs(ul); // expected-error {{call to 'abs' is ambiguous}}

From 20245bbf66977ca9de5a2b6e29e8617a3a5d9fb5 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 19 Jun 2025 10:41:10 +0200
Subject: [PATCH 0911/1322] [libc++][NFC] Format abs.pass.cpp test

I will modify the test in an upcoming PR. I'm formatting it now to avoid
a bunch of whitespace changes in that PR.
---
 libcxx/test/std/numerics/c.math/abs.pass.cpp | 80 +++++++++-----------
 1 file changed, 37 insertions(+), 43 deletions(-)

diff --git a/libcxx/test/std/numerics/c.math/abs.pass.cpp b/libcxx/test/std/numerics/c.math/abs.pass.cpp
index 03aae465c257..51aee6e98683 100644
--- a/libcxx/test/std/numerics/c.math/abs.pass.cpp
+++ b/libcxx/test/std/numerics/c.math/abs.pass.cpp
@@ -14,65 +14,59 @@
 
 #include "test_macros.h"
 
-template<class T>
-struct correct_size_int
-{
-    typedef typename std::conditional<sizeof(T) < sizeof(int), int, T>::type type;
+template <class T>
+struct correct_size_int {
+  typedef typename std::conditional<sizeof(T) < sizeof(int), int, T>::type type;
 };
 
 template <class Source, class Result>
-void test_abs()
-{
-    Source neg_val = -5;
-    Source pos_val = 5;
-    Result res = 5;
+void test_abs() {
+  Source neg_val = -5;
+  Source pos_val = 5;
+  Result res     = 5;
 
-    ASSERT_SAME_TYPE(decltype(std::abs(neg_val)), Result);
+  ASSERT_SAME_TYPE(decltype(std::abs(neg_val)), Result);
 
-    assert(std::abs(neg_val) == res);
-    assert(std::abs(pos_val) == res);
+  assert(std::abs(neg_val) == res);
+  assert(std::abs(pos_val) == res);
 }
 
-void test_big()
-{
-    long long int big_value = std::numeric_limits<long long int>::max(); // a value too big for ints to store
-    long long int negative_big_value = -big_value;
-    assert(std::abs(negative_big_value) == big_value); // make sure it doesn't get casted to a smaller type
+void test_big() {
+  long long int big_value          = std::numeric_limits<long long int>::max(); // a value too big for ints to store
+  long long int negative_big_value = -big_value;
+  assert(std::abs(negative_big_value) == big_value); // make sure it doesn't get casted to a smaller type
 }
 
 // The following is helpful to keep in mind:
 // 1byte == char <= short <= int <= long <= long long
 
-int main(int, char**)
-{
-    // On some systems char is unsigned.
-    // If that is the case, we should just test signed char twice.
-    typedef std::conditional<
-        std::is_signed<char>::value, char, signed char
-    >::type SignedChar;
+int main(int, char**) {
+  // On some systems char is unsigned.
+  // If that is the case, we should just test signed char twice.
+  typedef std::conditional< std::is_signed<char>::value, char, signed char >::type SignedChar;
 
-    // All types less than or equal to and not greater than int are promoted to int.
-    test_abs<short int, int>();
-    test_abs<SignedChar, int>();
-    test_abs<signed char, int>();
+  // All types less than or equal to and not greater than int are promoted to int.
+  test_abs<short int, int>();
+  test_abs<SignedChar, int>();
+  test_abs<signed char, int>();
 
-    // These three calls have specific overloads:
-    test_abs<int, int>();
-    test_abs<long int, long int>();
-    test_abs<long long int, long long int>();
+  // These three calls have specific overloads:
+  test_abs<int, int>();
+  test_abs<long int, long int>();
+  test_abs<long long int, long long int>();
 
-    // Here there is no guarantee that int is larger than int8_t so we
-    // use a helper type trait to conditional test against int.
-    test_abs<std::int8_t, correct_size_int<std::int8_t>::type>();
-    test_abs<std::int16_t, correct_size_int<std::int16_t>::type>();
-    test_abs<std::int32_t, correct_size_int<std::int32_t>::type>();
-    test_abs<std::int64_t, correct_size_int<std::int64_t>::type>();
+  // Here there is no guarantee that int is larger than int8_t so we
+  // use a helper type trait to conditional test against int.
+  test_abs<std::int8_t, correct_size_int<std::int8_t>::type>();
+  test_abs<std::int16_t, correct_size_int<std::int16_t>::type>();
+  test_abs<std::int32_t, correct_size_int<std::int32_t>::type>();
+  test_abs<std::int64_t, correct_size_int<std::int64_t>::type>();
 
-    test_abs<long double, long double>();
-    test_abs<double, double>();
-    test_abs<float, float>();
+  test_abs<long double, long double>();
+  test_abs<double, double>();
+  test_abs<float, float>();
 
-    test_big();
+  test_big();
 
-    return 0;
+  return 0;
 }

From 6273c5d4d3540204cb0d298cf1cf74ba94ed2a6c Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 19 Jun 2025 09:57:07 +0100
Subject: [PATCH 0912/1322] [lldb][AArch64] Correctly invalidate svg when vg is
 written (#140875)

Recently the Linux Kernel has fixed a bunch of issues in SME support and
while testing that, I found two tests failing:
FAIL: test_za_register_dynamic_config_main_disabled
(TestZAThreadedDynamic.AArch64ZAThreadedTestCase)
FAIL: test_za_register_dynamic_config_main_enabled
(TestZAThreadedDynamic.AArch64ZAThreadedTestCase)

These tests write to vg during streaming mode from lldb and expect to
see that za has been resized to match it. Instead, it was unavailable.
lldb-server was sending the correct amount of data but lldb client was
expecting the old size.

Turns out that instead of a write to vg invalidating svg, it was
invalidating... something else. I'm still not sure how these tests ever
worked but with this one line fix, they pass again.

I did not see this issue with SVE or streaming SVE Z registers because
those always resize using the value of vg, and vg always has the value
we just wrote.

(remember that vg is the vector length of the **current** mode, not of
non-streaming mode, whereas svg is the vector length of streaming mode,
even if you are currently in non-streaming mode)
---
 .../source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp
index c004c0f3c3cf..fbf128553fd5 100644
--- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp
+++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp
@@ -425,8 +425,7 @@ void RegisterInfoPOSIX_arm64::AddRegSetSME(bool has_zt) {
   //
   // This must be added now, rather than when vg is defined because SME is a
   // dynamic set that may or may not be present.
-  static uint32_t vg_invalidates[] = {sme_regnum + 1 /*svg*/,
-                                      LLDB_INVALID_REGNUM};
+  static uint32_t vg_invalidates[] = {GetRegNumSMESVG(), LLDB_INVALID_REGNUM};
   m_dynamic_reg_infos[GetRegNumSVEVG()].invalidate_regs = vg_invalidates;
 }
 

From c0a9c908a697a150f797d0dff7f0bcd3782abed9 Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr@nvidia.com>
Date: Thu, 19 Jun 2025 14:45:43 +0530
Subject: [PATCH 0913/1322] [MLIR][NVVM-Docs] Fix rendering of a few tables in
 NVVM Docs (#144764)

This patch corrects the formatting of tables
in the tcgen05 ld/st and smem_descriptor Ops.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 2dd7ac29cfed..418931b93126 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -3455,6 +3455,7 @@ def NVVM_Tcgen05MmaSmemDescOp : NVVM_Op<"tcgen05.mma_smem_desc", []> {
     properties of multiplicand matrix in shared memory including its location
     in the shared memory of the current CTA.
 
+    ```
     +-----------+------+------------------------------------------------------+
     | Bit-field | Size | Description                                          |
     +-----------+------+------------------------------------------------------+
@@ -3477,6 +3478,7 @@ def NVVM_Tcgen05MmaSmemDescOp : NVVM_Op<"tcgen05.mma_smem_desc", []> {
     |           |      |   6: 32-Byte swizzling                               |
     |           |      |   (Values 3, 5 and 7 are invalid)                    |
     +-----------+------+------------------------------------------------------+    
+    ```
 
     Example:
     ```mlir
@@ -3578,7 +3580,8 @@ def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld", [NVVMRequiresSMa<[100, 101]>]> {
     elements from adjacent columns into a single 32-bit element during the load.
 
     The following table describes the size of the vector for various combinations
-    of `num` and `shape` attributes
+    of `num` and `shape` attributes:
+    ```
     |=====================================================================|
     | num/shape      |     16x32bx2/16x64b/32x32b |  16x128b   | 16x256b  |
     |=====================================================================|
@@ -3591,6 +3594,7 @@ def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld", [NVVMRequiresSMa<[100, 101]>]> {
     | x64            |          64                |    128     |    NA    |
     | x128           |          128               |    NA      |    NA    |
     |=====================================================================|
+    ```
 
     Example:
     ```mlir
@@ -3666,7 +3670,8 @@ def NVVM_Tcgen05StOp : NVVM_Op<"tcgen05.st", [NVVMRequiresSMa<[100, 101]>]> {
     in the register into two 16-bit elements and store them in adjacent columns.
 
     The following table describes the size of the vector for various combinations
-    of `num` and `shape` attributes
+    of `num` and `shape` attributes:
+    ```
     |=====================================================================|
     | num/shape      |     16x32bx2/16x64b/32x32b |  16x128b   | 16x256b  |
     |=====================================================================|
@@ -3679,6 +3684,7 @@ def NVVM_Tcgen05StOp : NVVM_Op<"tcgen05.st", [NVVMRequiresSMa<[100, 101]>]> {
     | x64            |          64                |    128     |    NA    |
     | x128           |          128               |    NA      |    NA    |
     |=====================================================================|
+    ```
 
     Example:
     ```mlir

From 97c1a2444574b32dd7a283c53be248c5dbbf62e9 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev@gmail.com>
Date: Thu, 19 Jun 2025 05:47:44 -0400
Subject: [PATCH 0914/1322] [mlir][linalg] Add option to pad dynamic dims to
 `linalg::rewriteAsPaddedOp` (#144354)

This patch makes the following changes:

- Add a `ValueRange typeDynDims` argument to
`linalg::makeComposedPadHighOp`, allowing to pad a tensor with dynamic
dimensions using `tensor::createPadHighOp`.

- Add a `DenseMap<std::pair<unsigned, unsigned>, OpFoldResult>
sizeToPadTo;` option to `LinalgPaddingOptions`. This option allows
setting the size to use when padding a dimension of an operand, allowing
to pad operands even in the case they don't have a constant upper
bounding box. If the value is not provided, then the constant upper
bound is used by default.

- Add a `use_prescribed_tensor_shapes` option to
`transform.structured.pad`. If set to true then `tensor.dim` will be
used as dimensions to compute the size of the padded dim instead of
computing the constant upper bound.

- This patch also changes the behavior for computing the padded shape
`linalg::rewriteAsPaddedOp`, by using the newly added options in
`LinalgPaddingOptions`.

- Finally it adds tests verifying the behavior.
---
 .../Linalg/TransformOps/LinalgTransformOps.td |  10 +-
 .../Dialect/Linalg/Transforms/Transforms.h    |  17 ++
 .../include/mlir/Dialect/Linalg/Utils/Utils.h |  17 +-
 .../TransformOps/LinalgTransformOps.cpp       |  36 +++-
 .../lib/Dialect/Linalg/Transforms/Padding.cpp | 163 ++++++++++++++----
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp       |  24 ++-
 .../test/Dialect/Linalg/transform-op-pad.mlir |  39 ++++-
 7 files changed, 246 insertions(+), 60 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 15ea5e7bf715..6f6df350f1ba 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1134,7 +1134,8 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
          DefaultValuedAttr<
           TypedArrayAttrBase<I64ArrayAttr, "array of arrays of i64">,
           "{}">:$transpose_paddings,
-         DefaultValuedAttr<StrAttr, "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copy_back_op);
+         DefaultValuedAttr<StrAttr, "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copy_back_op,
+         DefaultValuedAttr<UnitAttr, "false">:$use_prescribed_tensor_shapes);
   let results = (outs TransformHandleTypeInterface:$padded,
                       TransformHandleTypeInterface:$pad,
                       TransformHandleTypeInterface:$copy);
@@ -1142,6 +1143,7 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
   let assemblyFormat = [{
     $target 
     (`pad_to_multiple_of` custom<DynamicIndexList>($pad_to_multiple_of, $static_pad_to_multiple_of)^)?
+    (`use_prescribed_tensor_shapes` $use_prescribed_tensor_shapes^)?
     attr-dict
     `:` functional-type(operands, results)
   }];
@@ -1159,13 +1161,15 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
                    CArg<"ArrayRef<int64_t>", "{}">:$staticPadToMultipleOf,
                    CArg<"ArrayRef<int64_t>", "{}">:$nofoldFlags,
                    CArg<"ArrayRef<Attribute>", "{}">:$transposePaddings,
-                   CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>,
+                   CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp,
+                   CArg<"bool", "false">:$usePrescribedTensorShapes)>,
     OpBuilder<(ins "Value":$target,
                    "ArrayRef<int64_t>":$paddingDimensions,
                    "ArrayRef<OpFoldResult>":$mixedPadToMultipleOf,
                    CArg<"ArrayRef<int64_t>", "{}">:$nofoldFlags,
                    CArg<"ArrayRef<Attribute>", "{}">:$transposePaddings,
-                   CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>
+                   CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp,
+                   CArg<"bool", "false">:$usePrescribedTensorShapes)>
   ];
 
   let extraClassDeclaration = [{
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 2eef0a06d0eb..147a2907f52e 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -295,6 +295,23 @@ struct LinalgPaddingOptions {
     padToMultipleOf.emplace(m.begin(), m.end());
     return *this;
   }
+  /// A mapping between an operand and shape dim, and a size for a padding
+  /// dimension. Each size is expected to be greater or equal than the
+  /// corresponding shape dim. If no value is provided then the constant upper
+  /// bound will be used.
+  DenseMap<std::pair<unsigned, unsigned>, OpFoldResult> sizeToPadTo;
+  LinalgPaddingOptions &setSizeToPadTo(unsigned operandIndex, unsigned dimIndex,
+                                       OpFoldResult size) {
+    assert(size && "expected non-null size");
+    sizeToPadTo[{operandIndex, dimIndex}] = size;
+    return *this;
+  }
+  /// Given the operand index and shape dim it returns the size to pad to.
+  OpFoldResult getSizeToPadTo(unsigned operandIndex, unsigned dimIndex) const {
+    return sizeToPadTo.lookup_or(
+        std::pair<unsigned, unsigned>(operandIndex, dimIndex), nullptr);
+  }
+
   /// A flag for every operand to mark the PadOp as nofold which enables
   /// packing for statically shaped operands.
   SmallVector<bool> nofoldFlags;
diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
index 80aa034d2199..fc151d02ceef 100644
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -71,12 +71,14 @@ bool isParallelIterator(utils::IteratorType iteratorType);
 /// Check if iterator type  has "reduction" semantics.
 bool isReductionIterator(utils::IteratorType iteratorType);
 
-/// Create a tensor::PadOp that pads `source` to the size of the statically
-/// sized `type` whose static sizes are assumed to be greater than the dynamic
-/// `source` size. The padding introduces trailing `pad` values until the
-/// target size is met. If `source` is defined by one or more LinalgOps that
-/// have been padded with the same value and sizes, return their padded result
-/// instead of creating a tensor::PadOp.
+/// Create a tensor::PadOp that pads `source` to the shape of `type` whose sizes
+/// are assumed to be greater than the dynamic `source` size. If `typeDynDims`
+/// is specified, then it must contain the sizes of all the dynamic dimensions
+/// in order of appearance in `type`, otherwise the function will pad those
+/// values to `0`. The padding introduces trailing `pad` values until the target
+/// size is met. If `source` is defined by one or more LinalgOps that have been
+/// padded with the same  value and sizes, return their padded result instead of
+/// creating a tensor::PadOp.
 ///
 /// Example:
 /// ```
@@ -91,7 +93,8 @@ bool isReductionIterator(utils::IteratorType iteratorType);
 /// %4 = tensor.pad %3 low[0, 0] high[...] { tensor.yield %other_cst }
 /// ```
 Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
-                            Value source, Value pad, bool nofold);
+                            Value source, Value padding, bool nofold,
+                            ValueRange typeDynDims = std::nullopt);
 
 /// Returns GenericOp that copies an n-D memref. Unlike the current
 /// implementation of memref::CopyOp, this op can further tile, lower to loops
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index b2c28f5eed33..d78c8847f884 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1907,7 +1907,8 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
                              ArrayRef<int64_t> padToMultipleOf,
                              ArrayRef<int64_t> nofoldFlags,
                              ArrayRef<Attribute> transposePaddings,
-                             StringRef copyBackOp) {
+                             StringRef copyBackOp,
+                             bool usePrescribedTensorShapes) {
   auto resultType = transform::AnyOpType::get(b.getContext());
   return build(/*builder=*/b,
                /*result=*/result,
@@ -1922,7 +1923,9 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
                     : b.getDenseI64ArrayAttr(padToMultipleOf)),
                /*nofoldFlags=*/b.getI64ArrayAttr(nofoldFlags),
                /*transposePaddings=*/b.getArrayAttr(transposePaddings),
-               /*copyBackOp=*/b.getStringAttr(copyBackOp));
+               /*copyBackOp=*/b.getStringAttr(copyBackOp),
+               /*usePrescribedTensorShapes=*/
+               usePrescribedTensorShapes ? b.getUnitAttr() : nullptr);
 }
 
 void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
@@ -1930,7 +1933,8 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
                              ArrayRef<OpFoldResult> mixedPadToMultipleOf,
                              ArrayRef<int64_t> nofoldFlags,
                              ArrayRef<Attribute> transposePaddings,
-                             StringRef copyBackOp) {
+                             StringRef copyBackOp,
+                             bool usePrescribedTensorShapes) {
   auto resultType = transform::AnyOpType::get(b.getContext());
   SmallVector<int64_t> staticPadToMultipleOf;
   SmallVector<Value> dynamicPadToMultipleOf;
@@ -1946,7 +1950,8 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target,
                /*padToMultipleOf=*/staticPadToMultipleOf,
                /*nofoldFlags=*/b.getI64ArrayAttr(nofoldFlags),
                /*transposePaddings=*/b.getArrayAttr(transposePaddings),
-               /*copyBackOp=*/b.getStringAttr(copyBackOp));
+               /*copyBackOp=*/copyBackOp,
+               /*usePrescribedTensorShapes=*/usePrescribedTensorShapes);
 }
 
 void PadOp::getEffects(
@@ -2051,11 +2056,34 @@ transform::PadOp::apply(transform::TransformRewriter &rewriter,
     } else {
       llvm_unreachable("unsupported copy_back op");
     }
+    // Populate `sizeToPadTo` with the dynamic tensor sizes for each operand.
+    bool irChanged = false;
+    if (getUsePrescribedTensorShapes() &&
+        linalgTarget.hasPureTensorSemantics()) {
+      OpBuilder::InsertionGuard g(rewriter);
+      rewriter.setInsertionPoint(linalgTarget);
+      for (OpOperand &operand : linalgTarget->getOpOperands()) {
+        for (auto [i, dim] : llvm::enumerate(linalgTarget.getShape(&operand))) {
+          if (!ShapedType::isDynamic(dim))
+            continue;
+          options.setSizeToPadTo(operand.getOperandNumber(), i,
+                                 tensor::getMixedSize(rewriter,
+                                                      operand.get().getLoc(),
+                                                      operand.get(), i));
+          irChanged = true;
+        }
+      }
+    }
 
     SmallVector<Value> replacements;
     SmallVector<tensor::PadOp> newPadOps;
     if (failed(rewriteAsPaddedOp(rewriter, linalgTarget, options, paddedOp,
                                  replacements, newPadOps))) {
+      if (irChanged) {
+        auto diag = emitDefiniteFailure() << "failed to pad op";
+        diag.attachNote(target->getLoc()) << "target op";
+        return diag;
+      }
       auto diag = emitSilenceableError() << "failed to pad op";
       diag.attachNote(target->getLoc()) << "target op";
       return diag;
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp
index 9a685f6dc96a..dc9e11eccac4 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp
@@ -8,6 +8,7 @@
 
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -22,53 +23,93 @@ using namespace mlir::linalg;
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")
 #define DBGSNL() (llvm::dbgs() << "\n")
 
-/// Compute the padded shape of the given operand. The operand is padded to a
-/// static bounding box according to the specified padding options.
-static LogicalResult computePaddedShape(linalg::LinalgOp opToPad,
+namespace {
+/// Helper class for storing padding information.
+struct PaddingInfo {
+  PaddingInfo(int64_t padToMultipleOf = 1, OpFoldResult size = {})
+      : padToMultipleOf(padToMultipleOf), size(size) {}
+  /// Pad the tensor to a multiple of.
+  int64_t padToMultipleOf = 1;
+  /// The size used for padding.
+  OpFoldResult size = {};
+};
+
+/// Helper class for storing and computing the padded shape.
+struct PaddedShape {
+  /// Initializes the shape information and on success it returns whether the
+  /// shape of the operand will change. Returns failure if the operand cannot be
+  /// padded.
+  FailureOr<bool> initialize(linalg::LinalgOp opToPad, OpOperand *opOperand,
+                             const LinalgPaddingOptions &options);
+
+  /// Computs the padded shape.
+  void computePadding(OpBuilder &builder, Value operand);
+
+  /// Returns the new tensor type.
+  RankedTensorType getType(Type elemTy) {
+    return RankedTensorType::get(shape, elemTy);
+  }
+
+  SmallVector<Value> dynDims;
+
+private:
+  SmallVector<int64_t> shape;
+  DenseMap<int64_t, PaddingInfo> dimToInfo;
+};
+} // namespace
+
+FailureOr<bool> PaddedShape::initialize(linalg::LinalgOp opToPad,
                                         OpOperand *opOperand,
-                                        const LinalgPaddingOptions &options,
-                                        SmallVector<int64_t> &paddedShape,
-                                        bool &alreadyHasRequestedShape) {
+                                        const LinalgPaddingOptions &options) {
   AffineMap indexingMap = opToPad.getMatchingIndexingMap(opOperand);
-  ArrayRef<int64_t> shape = opToPad.getShape(opOperand);
+
+  // Initialize the padded shape.
+  llvm::append_range(shape, opToPad.getShape(opOperand));
 
   // Collect the shape dimensions that are a function of "paddingDimensions",
   // along with the multiple that they should be padded to ("1" if none).
-  alreadyHasRequestedShape = true;
-  DenseMap<int64_t, int64_t> shapeDimToMultiple;
+  bool alreadyHasRequestedShape = true;
   for (const auto &dimEn : enumerate(options.paddingDimensions)) {
     for (const auto &en : enumerate(indexingMap.getResults())) {
       if (en.value().isFunctionOfDim(dimEn.value())) {
+        PaddingInfo paddingInfo;
         int64_t dimSize = shape[en.index()];
         if (options.padToMultipleOf.has_value()) {
-          shapeDimToMultiple[en.index()] =
+          paddingInfo.padToMultipleOf =
               (*options.padToMultipleOf)[dimEn.index()];
         } else {
-          shapeDimToMultiple[en.index()] = 1;
+          paddingInfo.padToMultipleOf = 1;
         }
-        if (ShapedType::isDynamic(dimSize)) {
-          alreadyHasRequestedShape = false;
-        } else if (dimSize % shapeDimToMultiple[en.index()] != 0) {
+
+        // Check if the user provided a size in the options.
+        paddingInfo.size =
+            options.getSizeToPadTo(opOperand->getOperandNumber(), en.index());
+
+        // Set the padding info.
+        dimToInfo[en.index()] = paddingInfo;
+        if (ShapedType::isDynamic(dimSize) ||
+            dimSize % paddingInfo.padToMultipleOf != 0 ||
+            !paddingInfo.size.isNull()) {
           alreadyHasRequestedShape = false;
         }
       }
     }
   }
 
-  // Helper function to round a number up to a given multiple.
-  auto ceil = [](int64_t val, int64_t multiple) {
-    return ((val + multiple - 1) / multiple) * multiple;
-  };
-
   // Upper bound the sizes to obtain a static bounding box.
-  paddedShape.assign(shape.begin(), shape.end());
   for (int64_t i = 0, e = shape.size(); i < e; ++i) {
-    LLVM_DEBUG(DBGS() << "--compute padded size for dim " << i << "\n");
+    LLVM_DEBUG(DBGS() << "--computing un-padded size for dim " << i << "\n");
     // Skip dimensions that do not require padding.
-    if (!shapeDimToMultiple.contains(i)) {
+    if (!dimToInfo.contains(i)) {
       LLVM_DEBUG(DBGS() << "----dim does not require padding, SKIP\n");
       continue;
     }
+    PaddingInfo &info = dimToInfo[i];
+    if (info.size) {
+      LLVM_DEBUG(DBGS() << "----the user provided the size: " << info.size
+                        << "\n");
+      continue;
+    }
     // Otherwise, try to compute a constant upper bound for the size value.
     FailureOr<int64_t> upperBound =
         ValueBoundsConstraintSet::computeConstantBound(
@@ -77,14 +118,58 @@ static LogicalResult computePaddedShape(linalg::LinalgOp opToPad,
              /*dim=*/i},
             /*stopCondition=*/nullptr, /*closedUB=*/true);
     if (failed(upperBound)) {
-      LLVM_DEBUG(DBGS() << "----could not compute a bounding box for padding");
+      LLVM_DEBUG(
+          DBGS() << "----could not compute a bounding box for padding\n");
       return failure();
     }
-    paddedShape[i] = ceil(*upperBound, shapeDimToMultiple[i]);
-    LLVM_DEBUG(DBGS() << "----new dim size: " << paddedShape[i] << "\n");
+    info.size =
+        IntegerAttr::get(IndexType::get(opToPad.getContext()), *upperBound);
+    LLVM_DEBUG(DBGS() << "----new un-padded size: " << info.size << "\n");
   }
+  return alreadyHasRequestedShape;
+}
 
-  return success();
+void PaddedShape::computePadding(OpBuilder &builder, Value operand) {
+  Location loc = operand.getLoc();
+  AffineExpr sizeSym = builder.getAffineSymbolExpr(0);
+
+  // Compute the padding for each dimension.
+  for (auto &&[i, dim] : llvm::enumerate(shape)) {
+    LLVM_DEBUG(DBGS() << "--computing padded size for dim " << i << "\n");
+
+    // Get the padding info or default info for the shape dimension.
+    PaddingInfo paddingInfo = dimToInfo.lookup(i);
+
+    // Skip dimensions that do not require padding.
+    if (paddingInfo.size.isNull()) {
+      LLVM_DEBUG(DBGS() << "----dim does not require padding, SKIP\n");
+
+      // We still need to push the size as `makeComposedPadHighOp` expects a
+      // range with all the dynamic sizes, whether they're being padded or not.
+      if (ShapedType::isDynamic(dim)) {
+        dynDims.push_back(
+            cast<Value>(tensor::getMixedSize(builder, loc, operand, i)));
+      }
+      continue;
+    }
+
+    // Compute the padded size to be a multiple of `padToMultipleOf`.
+    AffineExpr szExpr = (sizeSym).ceilDiv(paddingInfo.padToMultipleOf) *
+                        paddingInfo.padToMultipleOf;
+    OpFoldResult paddedSize = affine::makeComposedFoldedAffineApply(
+        builder, loc, szExpr, paddingInfo.size);
+    assert(paddedSize && "invalid arguments to affine apply");
+
+    if (auto cstSzAttr = dyn_cast<Attribute>(paddedSize)) {
+      // Update the shape as the size is static.
+      dim = cast<IntegerAttr>(cstSzAttr).getValue().getZExtValue();
+    } else {
+      // Add a dynamic dimension.
+      dim = ShapedType::kDynamic;
+      dynDims.push_back(cast<Value>(paddedSize));
+    }
+    LLVM_DEBUG(DBGS() << "----new dim size: " << paddedSize << "\n");
+  }
 }
 
 /// Pad the `opOperand` in the "paddingDimensions" using the padding value and
@@ -107,20 +192,21 @@ static FailureOr<Value> padOperandToSmallestStaticBoundingBox(
        options.padToMultipleOf->size() == options.paddingDimensions.size()) &&
       "invalid number of elements in padToMultipleOf");
 
-  // Compute padded shape.
-  SmallVector<int64_t> paddedShape;
-  bool alreadyHasRequestedShape = false;
-  if (failed(computePaddedShape(opToPad, opOperand, options, paddedShape,
-                                alreadyHasRequestedShape)))
+  // Initialize the padded shape and get whether it requires padding.
+  PaddedShape shape;
+  FailureOr<bool> alreadyHasRequestedShape =
+      shape.initialize(opToPad, opOperand, options);
+  if (failed(alreadyHasRequestedShape)) {
     return rewriter.notifyMatchFailure(opToPad,
                                        "--failed to compute padded shape");
+  }
 
-  // Return the unpadded operand if padding to a static shape is not needed and
+  // Return the un-padded operand if padding to a static shape is not needed and
   // if the nofold flag is not set.
   bool nofold = opOperand->getOperandNumber() < options.nofoldFlags.size()
                     ? bool(options.nofoldFlags[opOperand->getOperandNumber()])
                     : false;
-  if (!nofold && alreadyHasRequestedShape)
+  if (!nofold && *alreadyHasRequestedShape)
     return opOperand->get();
 
   // Fail if `paddingValues` specifies no padding value.
@@ -140,13 +226,18 @@ static FailureOr<Value> padOperandToSmallestStaticBoundingBox(
         opToPad.getLoc(), cast<TypedAttr>(paddingAttr));
   }
 
+  // Computes the padded shape.
+  if (!*alreadyHasRequestedShape)
+    shape.computePadding(rewriter, opOperand->get());
+
   // Pad the operand to the bounding box defined by `paddedShape`.
-  auto paddedTensorType = RankedTensorType::get(
-      paddedShape, getElementTypeOrSelf(opOperand->get()));
+  RankedTensorType paddedTensorType =
+      shape.getType(getElementTypeOrSelf(opOperand->get()));
   LLVM_DEBUG(DBGS() << "--SUCCESS, makeComposedPadHighOp with type: "
                     << paddedTensorType);
   return makeComposedPadHighOp(rewriter, opToPad->getLoc(), paddedTensorType,
-                               opOperand->get(), paddingValue, nofold);
+                               opOperand->get(), paddingValue, nofold,
+                               shape.dynDims);
 }
 
 LogicalResult
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 2527d90cfa2e..209309ddb413 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -244,11 +244,13 @@ bool isReductionIterator(utils::IteratorType iteratorType) {
 }
 
 Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
-                            Value source, Value pad, bool nofold) {
+                            Value source, Value pad, bool nofold,
+                            ValueRange typeDynDims) {
   // Exit if `source` is not defined by an ExtractSliceOp.
   auto sliceOp = source.getDefiningOp<tensor::ExtractSliceOp>();
   if (!sliceOp)
-    return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
+    return tensor::createPadHighOp(type, source, pad, nofold, loc, b,
+                                   typeDynDims);
 
   // Search the `source` use-def chain for padded LinalgOps.
   Value current = sliceOp.getSource();
@@ -264,24 +266,28 @@ Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
   // Exit if the search fails to match a tensor::PadOp at the end of the matched
   // LinalgOp sequence.
   if (!padOp)
-    return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
+    return tensor::createPadHighOp(type, source, pad, nofold, loc, b,
+                                   typeDynDims);
 
   // Exit if the padded result type does not match.
   if (sliceOp.getSource().getType() != type)
-    return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
+    return tensor::createPadHighOp(type, source, pad, nofold, loc, b,
+                                   typeDynDims);
 
   // Exit if the LinalgOps are not high padded.
   if (llvm::any_of(padOp.getMixedLowPad(), [](OpFoldResult ofr) {
         return getConstantIntValue(ofr) != static_cast<int64_t>(0);
       }))
-    return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
+    return tensor::createPadHighOp(type, source, pad, nofold, loc, b,
+                                   typeDynDims);
 
   // Exit if `padOpSliceOp`, which defines the slice used by
   // `padOp`, is rank-reducing.
   auto padOpSliceOp = padOp.getSource().getDefiningOp<tensor::ExtractSliceOp>();
   if (!padOpSliceOp ||
       sliceOp.getMixedSizes().size() != padOpSliceOp.getMixedSizes().size())
-    return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
+    return tensor::createPadHighOp(type, source, pad, nofold, loc, b,
+                                   typeDynDims);
 
   // Exit if the sizes of the dynamic sizes of `sliceOp` do not match the size
   // of the slice padded by `padOp`.
@@ -290,14 +296,16 @@ Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
           [](std::tuple<OpFoldResult, OpFoldResult> it) {
             return !isEqualConstantIntOrValue(std::get<0>(it), std::get<1>(it));
           }))
-    return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
+    return tensor::createPadHighOp(type, source, pad, nofold, loc, b,
+                                   typeDynDims);
 
   // Exit if the padding values do not match.
   Attribute padOpPadAttr, padAttr;
   Value padOpPad = padOp.getConstantPaddingValue();
   if (!padOpPad || !matchPattern(padOpPad, m_Constant(&padOpPadAttr)) ||
       !matchPattern(pad, m_Constant(&padAttr)) || padOpPadAttr != padAttr)
-    return tensor::createPadHighOp(type, source, pad, nofold, loc, b);
+    return tensor::createPadHighOp(type, source, pad, nofold, loc, b,
+                                   typeDynDims);
 
   // Return the padded result if the padding values and sizes match.
   return sliceOp.getSource();
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
index ab2711545405..bc684b53c9b6 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
@@ -300,7 +300,7 @@ func.func @negative_no_ub_estimate(%arg0: tensor<?x12xf32>,
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    // expected-error @below {{ailed to pad op}}
+    // expected-error @below {{failed to pad op}}
     %padded, %pad, %copy_back = transform.structured.pad %0 {
       padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
       // Note - attempting to pad non-static dim
@@ -313,6 +313,41 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// Test dynamic padding using `use_prescribed_tensor_shapes`
+
+// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (-s0 + (s0 ceildiv 7) * 7)>
+// CHECK: @use_prescribed_tensor_shapes
+// CHECK: (%[[ARG0:.*]]: tensor<?x12xf32>, %[[ARG1:.*]]: tensor<12x?xf32>
+func.func @use_prescribed_tensor_shapes(%arg0: tensor<?x12xf32>,
+                                   %arg1: tensor<12x?xf32>,
+                                   %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // CHECK: %[[C1_0:.*]] = arith.constant 1 : index
+  // CHECK: %[[DIM_0:.*]] = tensor.dim %[[ARG1]], %[[C1_0]] : tensor<12x?xf32>
+  // CHECK: %[[PADDING:.*]] = affine.apply #[[MAP]]()[%[[DIM_0]]]
+  // CHECK: %[[PADDED:.*]] = tensor.pad %[[ARG1]] low[0, 0] high[0, %[[PADDING]]] {
+  // CHECK: linalg.matmul ins(%[[ARG0]], %[[PADDED]] : tensor<?x12xf32>, tensor<12x?xf32>) 
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x12xf32>, tensor<12x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  func.return %0 : tensor<?x?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %padded, %pad, %copy_back = transform.structured.pad %0
+    pad_to_multiple_of [7] use_prescribed_tensor_shapes {
+      padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[1]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.canonicalization
+    } {apply_cse} : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
 // Check that the padding can be applied even when the output argument of the
 // linalg op is not produced by an empty op or an extract_slice op.
 
@@ -416,6 +451,6 @@ module attributes {transform.with_named_sequence} {
       padding_dimensions=[0, 1, 2],
       nofold_flags=[1, 1, 1]
     } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-    transform.yield
+    transform.yield 
   }
 }

From af51c9d9df9d482503fe30c80dd788a02161cea6 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Thu, 19 Jun 2025 10:49:40 +0100
Subject: [PATCH 0915/1322] [LV][NFC] Add branch weight test showing incorrect
 behaviour (#144682)

This patch adds a test that shows incorrect branch weights being set in
function

EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   2 +
 .../LoopVectorize/branch-weights.ll           | 184 ++++++++++++------
 2 files changed, 130 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3b16248f962b..e14f985efd96 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7683,6 +7683,8 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
   BranchInst &BI =
       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+    // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't
+    // think the MainLoopStep is correct.
     unsigned MainLoopStep = UF * VF.getKnownMinValue();
     unsigned EpilogueLoopStep =
         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
index e11f77d8aeae..6892709f085f 100644
--- a/llvm/test/Transforms/LoopVectorize/branch-weights.ll
+++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll
@@ -1,53 +1,103 @@
-; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4  -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 | FileCheck %s
-
-; CHECK-LABEL: @f0(
-;
-; CHECK: entry:
-; CHECK:   br i1 %cmp.entry, label %iter.check, label %exit, !prof [[PROF_F0_ENTRY:![0-9]+]]
-;
-; CHECK: iter.check:
-; CHECK:   br i1 %min.iters.check, label %vec.epilog.scalar.ph, label %vector.scevcheck, !prof [[PROF_F0_UNLIKELY:![0-9]+]]
-;
-; CHECK: vector.scevcheck:
-; CHECK:   br i1 %4, label %vec.epilog.scalar.ph, label %vector.main.loop.iter.check, !prof [[PROF_F0_UNLIKELY]]
-;
-; CHECK: vector.main.loop.iter.check:
-; CHECK:   br i1 %min.iters.check1, label %vec.epilog.ph, label %vector.ph, !prof [[PROF_F0_UNLIKELY]]
-;
-; CHECK: vector.ph:
-; CHECK:   br label %vector.body
-;
-; CHECK: vector.body:
-; CHECK:   br i1 {{.+}}, label %middle.block, label %vector.body, !prof [[PROF_F0_VECTOR_BODY:![0-9]+]]
-;
-; CHECK: middle.block:
-; CHECK:   br i1 %cmp.n, label %exit.loopexit, label %vec.epilog.iter.check, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
-;
-; CHECK: vec.epilog.iter.check:
-; CHECK:   br i1 %min.epilog.iters.check, label %vec.epilog.scalar.ph, label %vec.epilog.ph, !prof [[PROF_F0_VEC_EPILOGUE_SKIP:![0-9]+]]
-;
-; CHECK: vec.epilog.ph:
-; CHECK:   br label %vec.epilog.vector.body
-;
-; CHECK: vec.epilog.vector.body:
-; CHECK:   br i1 {{.+}}, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !prof [[PROF_F0_VEC_EPILOG_VECTOR_BODY:![0-9]+]]
-;
-; CHECK: vec.epilog.middle.block:
-; CHECK:   br i1 %cmp.n{{.+}}, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
-;
-; CHECK: vec.epilog.scalar.ph:
-; CHECK:   br label %loop
-;
-; CHECK: loop:
-; CHECK:   br i1 %cmp.loop, label %loop, label %exit.loopexit, !prof [[PROF_F0_LOOP:![0-9]+]]
-;
-; CHECK: exit.loopexit:
-; CHECK:   br label %exit
-;
-; CHECK: exit:
-; CHECK:   ret void
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br " --filter "^.*:" --filter "icmp" --version 5
+; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4  -enable-epilogue-vectorization \
+; RUN:   -epilogue-vectorization-force-VF=4 | FileCheck %s --check-prefix=MAINVF4IC1_EPI4
+; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=4  -enable-epilogue-vectorization \
+; RUN:   -epilogue-vectorization-force-VF=4 | FileCheck %s --check-prefix=MAINVF4IC2_EPI4
 
+; FIXME: For MAINVF4IC2_EPI4 the branch weights in the terminator of
+; the VEC_EPILOG_ITER_CHECK block should be [4,4] since we process 8
+; scalar iterations in the main loop, leaving the remaining count to
+; be in the range [0,7]. That gives a 4:4 chance of skipping the
+; vector epilogue. I believe the problem lies in
+; EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck
+; where the main loop VF is set to the same value as the epilogue VF.
 define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 {
+; MAINVF4IC1_EPI4-LABEL: define void @f0(
+; MAINVF4IC1_EPI4-SAME: i8 [[N:%.*]], i32 [[LEN:%.*]], ptr [[P:%.*]]) !prof [[PROF0:![0-9]+]] {
+; MAINVF4IC1_EPI4:  [[ENTRY:.*:]]
+; MAINVF4IC1_EPI4:    [[CMP_ENTRY:%.*]] = icmp sgt i32 [[LEN]], 0
+; MAINVF4IC1_EPI4:    br i1 [[CMP_ENTRY]], label %[[ITER_CHECK:.*]], label %[[EXIT:.*]], !prof [[PROF1:![0-9]+]]
+; MAINVF4IC1_EPI4:  [[ITER_CHECK]]:
+; MAINVF4IC1_EPI4:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0:%.*]], 4
+; MAINVF4IC1_EPI4:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]], !prof [[PROF2:![0-9]+]]
+; MAINVF4IC1_EPI4:  [[VECTOR_SCEVCHECK]]:
+; MAINVF4IC1_EPI4:    [[TMP2:%.*]] = icmp slt i8 [[TMP1:%.*]], 0
+; MAINVF4IC1_EPI4:    [[TMP3:%.*]] = icmp ugt i32 [[LEN]], 255
+; MAINVF4IC1_EPI4:    br i1 [[TMP4:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF2]]
+; MAINVF4IC1_EPI4:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; MAINVF4IC1_EPI4:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 4
+; MAINVF4IC1_EPI4:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF2]]
+; MAINVF4IC1_EPI4:  [[VECTOR_PH]]:
+; MAINVF4IC1_EPI4:    br label %[[VECTOR_BODY:.*]]
+; MAINVF4IC1_EPI4:  [[VECTOR_BODY]]:
+; MAINVF4IC1_EPI4:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT:%.*]], [[N_VEC:%.*]]
+; MAINVF4IC1_EPI4:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF3:![0-9]+]], !llvm.loop [[LOOP4:![0-9]+]]
+; MAINVF4IC1_EPI4:  [[MIDDLE_BLOCK]]:
+; MAINVF4IC1_EPI4:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
+; MAINVF4IC1_EPI4:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF7:![0-9]+]]
+; MAINVF4IC1_EPI4:  [[VEC_EPILOG_ITER_CHECK]]:
+; MAINVF4IC1_EPI4:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING:%.*]], 4
+; MAINVF4IC1_EPI4:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]]
+; MAINVF4IC1_EPI4:  [[VEC_EPILOG_PH]]:
+; MAINVF4IC1_EPI4:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; MAINVF4IC1_EPI4:  [[VEC_EPILOG_VECTOR_BODY]]:
+; MAINVF4IC1_EPI4:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT6:%.*]], [[N_VEC3:%.*]]
+; MAINVF4IC1_EPI4:    br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; MAINVF4IC1_EPI4:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; MAINVF4IC1_EPI4:    [[CMP_N8:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC3]]
+; MAINVF4IC1_EPI4:    br i1 [[CMP_N8]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF7]]
+; MAINVF4IC1_EPI4:  [[VEC_EPILOG_SCALAR_PH]]:
+; MAINVF4IC1_EPI4:    br label %[[LOOP:.*]]
+; MAINVF4IC1_EPI4:  [[LOOP]]:
+; MAINVF4IC1_EPI4:    [[CMP_LOOP:%.*]] = icmp ult i32 [[I32:%.*]], [[LEN]]
+; MAINVF4IC1_EPI4:    br i1 [[CMP_LOOP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !prof [[PROF11:![0-9]+]], !llvm.loop [[LOOP12:![0-9]+]]
+; MAINVF4IC1_EPI4:  [[EXIT_LOOPEXIT]]:
+; MAINVF4IC1_EPI4:    br label %[[EXIT]]
+; MAINVF4IC1_EPI4:  [[EXIT]]:
+;
+; MAINVF4IC2_EPI4-LABEL: define void @f0(
+; MAINVF4IC2_EPI4-SAME: i8 [[N:%.*]], i32 [[LEN:%.*]], ptr [[P:%.*]]) !prof [[PROF0:![0-9]+]] {
+; MAINVF4IC2_EPI4:  [[ENTRY:.*:]]
+; MAINVF4IC2_EPI4:    [[CMP_ENTRY:%.*]] = icmp sgt i32 [[LEN]], 0
+; MAINVF4IC2_EPI4:    br i1 [[CMP_ENTRY]], label %[[ITER_CHECK:.*]], label %[[EXIT:.*]], !prof [[PROF1:![0-9]+]]
+; MAINVF4IC2_EPI4:  [[ITER_CHECK]]:
+; MAINVF4IC2_EPI4:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0:%.*]], 4
+; MAINVF4IC2_EPI4:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]], !prof [[PROF2:![0-9]+]]
+; MAINVF4IC2_EPI4:  [[VECTOR_SCEVCHECK]]:
+; MAINVF4IC2_EPI4:    [[TMP2:%.*]] = icmp slt i8 [[TMP1:%.*]], 0
+; MAINVF4IC2_EPI4:    [[TMP3:%.*]] = icmp ugt i32 [[LEN]], 255
+; MAINVF4IC2_EPI4:    br i1 [[TMP4:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF2]]
+; MAINVF4IC2_EPI4:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; MAINVF4IC2_EPI4:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 8
+; MAINVF4IC2_EPI4:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF2]]
+; MAINVF4IC2_EPI4:  [[VECTOR_PH]]:
+; MAINVF4IC2_EPI4:    br label %[[VECTOR_BODY:.*]]
+; MAINVF4IC2_EPI4:  [[VECTOR_BODY]]:
+; MAINVF4IC2_EPI4:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT:%.*]], [[N_VEC:%.*]]
+; MAINVF4IC2_EPI4:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF3:![0-9]+]], !llvm.loop [[LOOP4:![0-9]+]]
+; MAINVF4IC2_EPI4:  [[MIDDLE_BLOCK]]:
+; MAINVF4IC2_EPI4:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
+; MAINVF4IC2_EPI4:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF7:![0-9]+]]
+; MAINVF4IC2_EPI4:  [[VEC_EPILOG_ITER_CHECK]]:
+; MAINVF4IC2_EPI4:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING:%.*]], 4
+; MAINVF4IC2_EPI4:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]]
+; MAINVF4IC2_EPI4:  [[VEC_EPILOG_PH]]:
+; MAINVF4IC2_EPI4:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; MAINVF4IC2_EPI4:  [[VEC_EPILOG_VECTOR_BODY]]:
+; MAINVF4IC2_EPI4:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT6:%.*]], [[N_VEC3:%.*]]
+; MAINVF4IC2_EPI4:    br i1 [[TMP13]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; MAINVF4IC2_EPI4:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; MAINVF4IC2_EPI4:    [[CMP_N8:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC3]]
+; MAINVF4IC2_EPI4:    br i1 [[CMP_N8]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF11:![0-9]+]]
+; MAINVF4IC2_EPI4:  [[VEC_EPILOG_SCALAR_PH]]:
+; MAINVF4IC2_EPI4:    br label %[[LOOP:.*]]
+; MAINVF4IC2_EPI4:  [[LOOP]]:
+; MAINVF4IC2_EPI4:    [[CMP_LOOP:%.*]] = icmp ult i32 [[I32:%.*]], [[LEN]]
+; MAINVF4IC2_EPI4:    br i1 [[CMP_LOOP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !prof [[PROF12:![0-9]+]], !llvm.loop [[LOOP13:![0-9]+]]
+; MAINVF4IC2_EPI4:  [[EXIT_LOOPEXIT]]:
+; MAINVF4IC2_EPI4:    br label %[[EXIT]]
+; MAINVF4IC2_EPI4:  [[EXIT]]:
+;
 entry:
   %cmp.entry = icmp sgt i32 %len, 0
   br i1 %cmp.entry, label %loop, label %exit, !prof !1
@@ -72,11 +122,33 @@ exit:
 !0 = !{!"function_entry_count", i64 13}
 !1 = !{!"branch_weights", i32 12, i32 1}
 !2 = !{!"branch_weights", i32 1234, i32 1}
-
-; CHECK: [[PROF_F0_ENTRY]] = !{!"branch_weights", i32 12, i32 1}
-; CHECK: [[PROF_F0_UNLIKELY]] = !{!"branch_weights", i32 1, i32 127}
-; CHECK: [[PROF_F0_VECTOR_BODY]] = !{!"branch_weights", i32 1, i32 307}
-; CHECK: [[PROF_F0_MIDDLE_BLOCKS]] =  !{!"branch_weights", i32 1, i32 3}
-; CHECK: [[PROF_F0_VEC_EPILOGUE_SKIP]] = !{!"branch_weights", i32 4, i32 0}
-; CHECK: [[PROF_F0_VEC_EPILOG_VECTOR_BODY]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK: [[PROF_F0_LOOP]] = !{!"branch_weights", i32 2, i32 1}
+;.
+; MAINVF4IC1_EPI4: [[PROF0]] = !{!"function_entry_count", i64 13}
+; MAINVF4IC1_EPI4: [[PROF1]] = !{!"branch_weights", i32 12, i32 1}
+; MAINVF4IC1_EPI4: [[PROF2]] = !{!"branch_weights", i32 1, i32 127}
+; MAINVF4IC1_EPI4: [[PROF3]] = !{!"branch_weights", i32 1, i32 307}
+; MAINVF4IC1_EPI4: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]]}
+; MAINVF4IC1_EPI4: [[META5]] = !{!"llvm.loop.isvectorized", i32 1}
+; MAINVF4IC1_EPI4: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MAINVF4IC1_EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 3}
+; MAINVF4IC1_EPI4: [[PROF8]] = !{!"branch_weights", i32 4, i32 0}
+; MAINVF4IC1_EPI4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
+; MAINVF4IC1_EPI4: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]], [[META6]]}
+; MAINVF4IC1_EPI4: [[PROF11]] = !{!"branch_weights", i32 2, i32 1}
+; MAINVF4IC1_EPI4: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]}
+;.
+; MAINVF4IC2_EPI4: [[PROF0]] = !{!"function_entry_count", i64 13}
+; MAINVF4IC2_EPI4: [[PROF1]] = !{!"branch_weights", i32 12, i32 1}
+; MAINVF4IC2_EPI4: [[PROF2]] = !{!"branch_weights", i32 1, i32 127}
+; MAINVF4IC2_EPI4: [[PROF3]] = !{!"branch_weights", i32 1, i32 153}
+; MAINVF4IC2_EPI4: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]]}
+; MAINVF4IC2_EPI4: [[META5]] = !{!"llvm.loop.isvectorized", i32 1}
+; MAINVF4IC2_EPI4: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MAINVF4IC2_EPI4: [[PROF7]] = !{!"branch_weights", i32 1, i32 7}
+; MAINVF4IC2_EPI4: [[PROF8]] = !{!"branch_weights", i32 4, i32 0}
+; MAINVF4IC2_EPI4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
+; MAINVF4IC2_EPI4: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]], [[META6]]}
+; MAINVF4IC2_EPI4: [[PROF11]] = !{!"branch_weights", i32 1, i32 3}
+; MAINVF4IC2_EPI4: [[PROF12]] = !{!"branch_weights", i32 2, i32 1}
+; MAINVF4IC2_EPI4: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]]}
+;.

From 0fe78c4a290517925acc03d59f235926f440f155 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Wed, 18 Jun 2025 13:44:46 +0800
Subject: [PATCH 0916/1322] [NFC] [Serialization] Some Code Cleanups for Name
 lookup table things

---
 clang/include/clang/Serialization/ASTReader.h |  26 ++-
 .../clang/Serialization/ASTRecordWriter.h     |   7 +
 clang/include/clang/Serialization/ASTWriter.h |  10 +-
 clang/lib/Serialization/ASTReader.cpp         |  30 ++-
 clang/lib/Serialization/ASTReaderDecl.cpp     |  73 +++----
 clang/lib/Serialization/ASTWriter.cpp         | 193 ++++++++++--------
 clang/lib/Serialization/ASTWriterDecl.cpp     |  17 +-
 7 files changed, 188 insertions(+), 168 deletions(-)

diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index ba676fd8698e..866986dcbf76 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -373,6 +373,25 @@ struct LazySpecializationInfoLookupTable;
 
 } // namespace serialization
 
+struct VisibleLookupBlockOffsets {
+  uint64_t VisibleOffset = 0;
+  uint64_t ModuleLocalOffset = 0;
+  uint64_t TULocalOffset = 0;
+
+  operator bool() const {
+    return VisibleOffset || ModuleLocalOffset || // ModuleUnitLocalOffset ||
+           TULocalOffset;
+  }
+};
+
+struct LookupBlockOffsets : VisibleLookupBlockOffsets {
+  uint64_t LexicalOffset = 0;
+
+  operator bool() const {
+    return VisibleLookupBlockOffsets::operator bool() || LexicalOffset;
+  }
+};
+
 /// Reads an AST files chain containing the contents of a translation
 /// unit.
 ///
@@ -535,13 +554,6 @@ private:
   /// in the chain.
   DeclUpdateOffsetsMap DeclUpdateOffsets;
 
-  struct LookupBlockOffsets {
-    uint64_t LexicalOffset;
-    uint64_t VisibleOffset;
-    uint64_t ModuleLocalOffset;
-    uint64_t TULocalOffset;
-  };
-
   using DelayedNamespaceOffsetMapTy =
       llvm::DenseMap<GlobalDeclID, LookupBlockOffsets>;
 
diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h
index 07f7e8d919d8..ad1ec2673812 100644
--- a/clang/include/clang/Serialization/ASTRecordWriter.h
+++ b/clang/include/clang/Serialization/ASTRecordWriter.h
@@ -115,6 +115,13 @@ public:
     Record->push_back(BitOffset);
   }
 
+  void AddLookupOffsets(const LookupBlockOffsets &Offsets) {
+    AddOffset(Offsets.LexicalOffset);
+    AddOffset(Offsets.VisibleOffset);
+    AddOffset(Offsets.ModuleLocalOffset);
+    AddOffset(Offsets.TULocalOffset);
+  }
+
   /// Add the given statement or expression to the queue of
   /// statements to emit.
   ///
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index 0f49646f3f02..97679ace8b61 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -75,6 +75,9 @@ class StoredDeclsList;
 class SwitchCase;
 class Token;
 
+struct VisibleLookupBlockOffsets;
+struct LookupBlockOffsets;
+
 namespace serialization {
 enum class DeclUpdateKind;
 } // namespace serialization
@@ -606,9 +609,7 @@ private:
   uint64_t WriteDeclContextLexicalBlock(ASTContext &Context,
                                         const DeclContext *DC);
   void WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC,
-                                    uint64_t &VisibleBlockOffset,
-                                    uint64_t &ModuleLocalBlockOffset,
-                                    uint64_t &TULocalBlockOffset);
+                                    VisibleLookupBlockOffsets &Offsets);
   void WriteTypeDeclOffsets();
   void WriteFileDeclIDsMap();
   void WriteComments(ASTContext &Context);
@@ -777,6 +778,9 @@ public:
     return (I == DeclIDs.end() || I->second >= clang::NUM_PREDEF_DECL_IDS);
   };
 
+  void AddLookupOffsets(const LookupBlockOffsets &Offsets,
+                        RecordDataImpl &Record);
+
   /// Emit a reference to a declaration.
   void AddDeclRef(const Decl *D, RecordDataImpl &Record);
   // Emit a reference to a declaration if the declaration was emitted.
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index acda5a7c879d..a3fbc3d25aca 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -4097,8 +4097,8 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
         uint64_t TULocalOffset =
             TULocalLocalOffset ? BaseOffset + TULocalLocalOffset : 0;
 
-        DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset,
-                                         ModuleLocalOffset, TULocalOffset};
+        DelayedNamespaceOffsetMap[ID] = {
+            {VisibleOffset, TULocalOffset, ModuleLocalOffset}, LexicalOffset};
 
         assert(!GetExistingDecl(ID) &&
                "We shouldn't load the namespace in the front of delayed "
@@ -8544,17 +8544,21 @@ bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC,
   SmallVector<NamedDecl *, 64> Decls;
   llvm::SmallPtrSet<NamedDecl *, 8> Found;
 
+  auto Find = [&, this](auto &&Table, auto &&Key) {
+    for (GlobalDeclID ID : Table.find(Key)) {
+      NamedDecl *ND = cast<NamedDecl>(GetDecl(ID));
+      if (ND->getDeclName() == Name && Found.insert(ND).second)
+        Decls.push_back(ND);
+    }
+  };
+
   Deserializing LookupResults(this);
 
   // FIXME: Clear the redundancy with templated lambda in C++20 when that's
   // available.
   if (auto It = Lookups.find(DC); It != Lookups.end()) {
     ++NumVisibleDeclContextsRead;
-    for (GlobalDeclID ID : It->second.Table.find(Name)) {
-      NamedDecl *ND = cast<NamedDecl>(GetDecl(ID));
-      if (ND->getDeclName() == Name && Found.insert(ND).second)
-        Decls.push_back(ND);
-    }
+    Find(It->second.Table, Name);
   }
 
   if (auto *NamedModule =
@@ -8562,21 +8566,13 @@ bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC,
                      : nullptr) {
     if (auto It = ModuleLocalLookups.find(DC); It != ModuleLocalLookups.end()) {
       ++NumModuleLocalVisibleDeclContexts;
-      for (GlobalDeclID ID : It->second.Table.find({Name, NamedModule})) {
-        NamedDecl *ND = cast<NamedDecl>(GetDecl(ID));
-        if (ND->getDeclName() == Name && Found.insert(ND).second)
-          Decls.push_back(ND);
-      }
+      Find(It->second.Table, std::make_pair(Name, NamedModule));
     }
   }
 
   if (auto It = TULocalLookups.find(DC); It != TULocalLookups.end()) {
     ++NumTULocalVisibleDeclContexts;
-    for (GlobalDeclID ID : It->second.Table.find(Name)) {
-      NamedDecl *ND = cast<NamedDecl>(GetDecl(ID));
-      if (ND->getDeclName() == Name && Found.insert(ND).second)
-        Decls.push_back(ND);
-    }
+    Find(It->second.Table, Name);
   }
 
   SetExternalVisibleDeclsForName(DC, Name, Decls);
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index a1368a48351c..259c772e4222 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -414,9 +414,7 @@ public:
   void VisitOpenACCDeclareDecl(OpenACCDeclareDecl *D);
   void VisitOpenACCRoutineDecl(OpenACCRoutineDecl *D);
 
-  void VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset,
-                        uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset,
-                        uint64_t &TULocalOffset);
+  void VisitDeclContext(DeclContext *DC, LookupBlockOffsets &Offsets);
 
   template <typename T>
   RedeclarableResult VisitRedeclarable(Redeclarable<T> *D);
@@ -1875,12 +1873,8 @@ void ASTDeclReader::VisitNamespaceDecl(NamespaceDecl *D) {
 
 void ASTDeclReader::VisitHLSLBufferDecl(HLSLBufferDecl *D) {
   VisitNamedDecl(D);
-  uint64_t LexicalOffset = 0;
-  uint64_t VisibleOffset = 0;
-  uint64_t ModuleLocalOffset = 0;
-  uint64_t TULocalOffset = 0;
-  VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset,
-                   TULocalOffset);
+  LookupBlockOffsets Offsets;
+  VisitDeclContext(D, Offsets);
   D->IsCBuffer = Record.readBool();
   D->KwLoc = readSourceLocation();
   D->LBraceLoc = readSourceLocation();
@@ -2794,14 +2788,12 @@ void ASTDeclReader::VisitLifetimeExtendedTemporaryDecl(
   mergeMergeable(D);
 }
 
-void ASTDeclReader::VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset,
-                                     uint64_t &VisibleOffset,
-                                     uint64_t &ModuleLocalOffset,
-                                     uint64_t &TULocalOffset) {
-  LexicalOffset = ReadLocalOffset();
-  VisibleOffset = ReadLocalOffset();
-  ModuleLocalOffset = ReadLocalOffset();
-  TULocalOffset = ReadLocalOffset();
+void ASTDeclReader::VisitDeclContext(DeclContext *DC,
+                                     LookupBlockOffsets &Offsets) {
+  Offsets.LexicalOffset = ReadLocalOffset();
+  Offsets.VisibleOffset = ReadLocalOffset();
+  Offsets.ModuleLocalOffset = ReadLocalOffset();
+  Offsets.TULocalOffset = ReadLocalOffset();
 }
 
 template <typename T>
@@ -4249,42 +4241,37 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) {
   // If this declaration is also a declaration context, get the
   // offsets for its tables of lexical and visible declarations.
   if (auto *DC = dyn_cast<DeclContext>(D)) {
-    uint64_t LexicalOffset = 0;
-    uint64_t VisibleOffset = 0;
-    uint64_t ModuleLocalOffset = 0;
-    uint64_t TULocalOffset = 0;
+    LookupBlockOffsets Offsets;
 
-    Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, ModuleLocalOffset,
-                            TULocalOffset);
+    Reader.VisitDeclContext(DC, Offsets);
 
     // Get the lexical and visible block for the delayed namespace.
     // It is sufficient to judge if ID is in DelayedNamespaceOffsetMap.
     // But it may be more efficient to filter the other cases.
-    if (!LexicalOffset && !VisibleOffset && !ModuleLocalOffset &&
-        isa<NamespaceDecl>(D))
+    if (!Offsets && isa<NamespaceDecl>(D))
       if (auto Iter = DelayedNamespaceOffsetMap.find(ID);
-          Iter != DelayedNamespaceOffsetMap.end()) {
-        LexicalOffset = Iter->second.LexicalOffset;
-        VisibleOffset = Iter->second.VisibleOffset;
-        ModuleLocalOffset = Iter->second.ModuleLocalOffset;
-        TULocalOffset = Iter->second.TULocalOffset;
-      }
+          Iter != DelayedNamespaceOffsetMap.end())
+        Offsets = Iter->second;
 
-    if (LexicalOffset &&
-        ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, LexicalOffset, DC))
-      return nullptr;
-    if (VisibleOffset && ReadVisibleDeclContextStorage(
-                             *Loc.F, DeclsCursor, VisibleOffset, ID,
-                             VisibleDeclContextStorageKind::GenerallyVisible))
-      return nullptr;
-    if (ModuleLocalOffset &&
+    if (Offsets.VisibleOffset &&
         ReadVisibleDeclContextStorage(
-            *Loc.F, DeclsCursor, ModuleLocalOffset, ID,
+            *Loc.F, DeclsCursor, Offsets.VisibleOffset, ID,
+            VisibleDeclContextStorageKind::GenerallyVisible))
+      return nullptr;
+    if (Offsets.ModuleLocalOffset &&
+        ReadVisibleDeclContextStorage(
+            *Loc.F, DeclsCursor, Offsets.ModuleLocalOffset, ID,
             VisibleDeclContextStorageKind::ModuleLocalVisible))
       return nullptr;
-    if (TULocalOffset && ReadVisibleDeclContextStorage(
-                             *Loc.F, DeclsCursor, TULocalOffset, ID,
-                             VisibleDeclContextStorageKind::TULocalVisible))
+    if (Offsets.TULocalOffset &&
+        ReadVisibleDeclContextStorage(
+            *Loc.F, DeclsCursor, Offsets.TULocalOffset, ID,
+            VisibleDeclContextStorageKind::TULocalVisible))
+      return nullptr;
+
+    if (Offsets.LexicalOffset &&
+        ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor,
+                                      Offsets.LexicalOffset, DC))
       return nullptr;
   }
   assert(Record.getIdx() == Record.size());
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index be22ee522191..af7229d74887 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -4089,11 +4089,9 @@ public:
   using hash_value_type = unsigned;
   using offset_type = unsigned;
 
-protected:
   explicit ASTDeclContextNameLookupTraitBase(ASTWriter &Writer)
       : Writer(Writer) {}
 
-public:
   data_type getData(const DeclIDsTy &LocalIDs) {
     unsigned Start = DeclIDs.size();
     for (auto ID : LocalIDs)
@@ -4231,6 +4229,38 @@ public:
   }
 };
 
+class ASTDeclContextNameTrivialLookupTrait
+    : public ASTDeclContextNameLookupTraitBase {
+public:
+  using key_type = DeclarationNameKey;
+  using key_type_ref = key_type;
+
+public:
+  using ASTDeclContextNameLookupTraitBase::ASTDeclContextNameLookupTraitBase;
+
+  using ASTDeclContextNameLookupTraitBase::getData;
+
+  static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; }
+
+  hash_value_type ComputeHash(key_type Name) { return Name.getHash(); }
+
+  std::pair<unsigned, unsigned> EmitKeyDataLength(raw_ostream &Out,
+                                                  DeclarationNameKey Name,
+                                                  data_type_ref Lookup) {
+    auto [KeyLen, DataLen] = EmitKeyDataLengthBase(Out, Name, Lookup);
+    return emitULEBKeyDataLength(KeyLen, DataLen, Out);
+  }
+
+  void EmitKey(raw_ostream &Out, DeclarationNameKey Name, unsigned) {
+    return EmitKeyBase(Out, Name);
+  }
+
+  void EmitData(raw_ostream &Out, key_type_ref, data_type Lookup,
+                unsigned DataLen) {
+    EmitDataBase(Out, Lookup, DataLen);
+  }
+};
+
 static bool isModuleLocalDecl(NamedDecl *D) {
   // For decls not in a file context, they should have the same visibility
   // with their parent.
@@ -4273,25 +4303,43 @@ static bool isTULocalInNamedModules(NamedDecl *D) {
   return D->getLinkageInternal() == Linkage::Internal;
 }
 
-// Trait used for the on-disk hash table used in the method pool.
-template <bool CollectingTULocalDecls>
-class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase {
+class ASTDeclContextNameLookupTrait
+    : public ASTDeclContextNameTrivialLookupTrait {
 public:
+  using TULocalDeclsMapTy = llvm::DenseMap<key_type, DeclIDsTy>;
+
   using ModuleLevelDeclsMapTy =
       llvm::DenseMap<ModuleLevelNameLookupTrait::key_type, DeclIDsTy>;
 
-  using key_type = DeclarationNameKey;
-  using key_type_ref = key_type;
-
-  using TULocalDeclsMapTy = llvm::DenseMap<key_type, DeclIDsTy>;
-
 private:
+  enum class LookupVisibility {
+    GenerallyVisibile,
+    // The decls can only be found by other TU in the same module.
+    // Note a clang::Module models a module unit instead of logical module
+    // in C++20.
+    ModuleLocalVisible,
+    // The decls can only be found by the TU itself that defines it.
+    TULocal,
+  };
+
+  LookupVisibility getLookupVisibility(NamedDecl *D) const {
+    // Only named modules have other lookup visibility.
+    if (!Writer.isWritingStdCXXNamedModules())
+      return LookupVisibility::GenerallyVisibile;
+
+    if (isModuleLocalDecl(D))
+      return LookupVisibility::ModuleLocalVisible;
+    if (isTULocalInNamedModules(D))
+      return LookupVisibility::TULocal;
+    return LookupVisibility::GenerallyVisibile;
+  }
+
   ModuleLevelDeclsMapTy ModuleLocalDeclsMap;
   TULocalDeclsMapTy TULocalDeclsMap;
 
 public:
-  explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer)
-      : ASTDeclContextNameLookupTraitBase(Writer) {}
+  using ASTDeclContextNameTrivialLookupTrait::
+      ASTDeclContextNameTrivialLookupTrait;
 
   template <typename Coll> data_type getData(const Coll &Decls) {
     unsigned Start = DeclIDs.size();
@@ -4312,7 +4360,8 @@ public:
 
       auto ID = Writer.GetDeclRef(DeclForLocalLookup);
 
-      if (isModuleLocalDecl(D)) {
+      switch (getLookupVisibility(DeclForLocalLookup)) {
+      case LookupVisibility::ModuleLocalVisible:
         if (UnsignedOrNone PrimaryModuleHash =
                 getPrimaryModuleHash(D->getOwningModule())) {
           auto Key = std::make_pair(D->getDeclName(), *PrimaryModuleHash);
@@ -4323,17 +4372,18 @@ public:
             Iter->second.push_back(ID);
           continue;
         }
+        break;
+      case LookupVisibility::TULocal: {
+        auto Iter = TULocalDeclsMap.find(D->getDeclName());
+        if (Iter == TULocalDeclsMap.end())
+          TULocalDeclsMap.insert({D->getDeclName(), DeclIDsTy{ID}});
+        else
+          Iter->second.push_back(ID);
+        continue;
       }
-
-      if constexpr (CollectingTULocalDecls) {
-        if (isTULocalInNamedModules(D)) {
-          auto Iter = TULocalDeclsMap.find(D->getDeclName());
-          if (Iter == TULocalDeclsMap.end())
-            TULocalDeclsMap.insert({D->getDeclName(), DeclIDsTy{ID}});
-          else
-            Iter->second.push_back(ID);
-          continue;
-        }
+      case LookupVisibility::GenerallyVisibile:
+        // Generally visible decls go into the general lookup table.
+        break;
       }
 
       DeclIDs.push_back(ID);
@@ -4341,33 +4391,11 @@ public:
     return std::make_pair(Start, DeclIDs.size());
   }
 
-  using ASTDeclContextNameLookupTraitBase::getData;
-
   const ModuleLevelDeclsMapTy &getModuleLocalDecls() {
     return ModuleLocalDeclsMap;
   }
 
   const TULocalDeclsMapTy &getTULocalDecls() { return TULocalDeclsMap; }
-
-  static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; }
-
-  hash_value_type ComputeHash(key_type Name) { return Name.getHash(); }
-
-  std::pair<unsigned, unsigned> EmitKeyDataLength(raw_ostream &Out,
-                                                  DeclarationNameKey Name,
-                                                  data_type_ref Lookup) {
-    auto [KeyLen, DataLen] = EmitKeyDataLengthBase(Out, Name, Lookup);
-    return emitULEBKeyDataLength(KeyLen, DataLen, Out);
-  }
-
-  void EmitKey(raw_ostream &Out, DeclarationNameKey Name, unsigned) {
-    return EmitKeyBase(Out, Name);
-  }
-
-  void EmitData(raw_ostream &Out, key_type_ref, data_type Lookup,
-                unsigned DataLen) {
-    EmitDataBase(Out, Lookup, DataLen);
-  }
 };
 
 } // namespace
@@ -4581,11 +4609,10 @@ void ASTWriter::GenerateNameLookupTable(
   assert(DC == DC->getPrimaryContext() && "only primary DC has lookup table");
 
   // Create the on-disk hash table representation.
-  MultiOnDiskHashTableGenerator<
-      reader::ASTDeclContextNameLookupTrait,
-      ASTDeclContextNameLookupTrait</*CollectingTULocal=*/true>>
+  MultiOnDiskHashTableGenerator<reader::ASTDeclContextNameLookupTrait,
+                                ASTDeclContextNameLookupTrait>
       Generator;
-  ASTDeclContextNameLookupTrait</*CollectingTULocal=*/true> Trait(*this);
+  ASTDeclContextNameLookupTrait Trait(*this);
 
   // The first step is to collect the declaration names which we need to
   // serialize into the name lookup table, and to collect them in a stable
@@ -4743,12 +4770,10 @@ void ASTWriter::GenerateNameLookupTable(
 
   const auto &TULocalDecls = Trait.getTULocalDecls();
   if (!TULocalDecls.empty() && !isGeneratingReducedBMI()) {
-    MultiOnDiskHashTableGenerator<
-        reader::ASTDeclContextNameLookupTrait,
-        ASTDeclContextNameLookupTrait</*CollectingTULocal=*/false>>
+    MultiOnDiskHashTableGenerator<reader::ASTDeclContextNameLookupTrait,
+                                  ASTDeclContextNameTrivialLookupTrait>
         TULookupGenerator;
-    ASTDeclContextNameLookupTrait</*CollectingTULocal=*/false> TULocalTrait(
-        *this);
+    ASTDeclContextNameTrivialLookupTrait TULocalTrait(*this);
 
     for (const auto &TULocalIter : TULocalDecls) {
       const auto &Key = TULocalIter.first;
@@ -4767,14 +4792,9 @@ void ASTWriter::GenerateNameLookupTable(
 ///
 /// \returns the offset of the DECL_CONTEXT_VISIBLE block within the
 /// bitstream, or 0 if no block was written.
-void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context,
-                                             DeclContext *DC,
-                                             uint64_t &VisibleBlockOffset,
-                                             uint64_t &ModuleLocalBlockOffset,
-                                             uint64_t &TULocalBlockOffset) {
-  assert(VisibleBlockOffset == 0);
-  assert(ModuleLocalBlockOffset == 0);
-  assert(TULocalBlockOffset == 0);
+void ASTWriter::WriteDeclContextVisibleBlock(
+    ASTContext &Context, DeclContext *DC, VisibleLookupBlockOffsets &Offsets) {
+  assert(!Offsets);
 
   // If we imported a key declaration of this namespace, write the visible
   // lookup results as an update record for it rather than including them
@@ -4858,7 +4878,7 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context,
   if (!Map || Map->empty())
     return;
 
-  VisibleBlockOffset = Stream.GetCurrentBitNo();
+  Offsets.VisibleOffset = Stream.GetCurrentBitNo();
   // Create the on-disk hash table in a buffer.
   SmallString<4096> LookupTable;
   SmallString<4096> ModuleLocalLookupTable;
@@ -4873,8 +4893,8 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context,
   ++NumVisibleDeclContexts;
 
   if (!ModuleLocalLookupTable.empty()) {
-    ModuleLocalBlockOffset = Stream.GetCurrentBitNo();
-    assert(ModuleLocalBlockOffset > VisibleBlockOffset);
+    Offsets.ModuleLocalOffset = Stream.GetCurrentBitNo();
+    assert(Offsets.ModuleLocalOffset > Offsets.VisibleOffset);
     // Write the lookup table
     RecordData::value_type ModuleLocalRecord[] = {
         DECL_CONTEXT_MODULE_LOCAL_VISIBLE};
@@ -4884,7 +4904,7 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context,
   }
 
   if (!TULookupTable.empty()) {
-    TULocalBlockOffset = Stream.GetCurrentBitNo();
+    Offsets.TULocalOffset = Stream.GetCurrentBitNo();
     // Write the lookup table
     RecordData::value_type TULocalDeclsRecord[] = {
         DECL_CONTEXT_TU_LOCAL_VISIBLE};
@@ -6203,31 +6223,26 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) {
   assert(DelayedNamespace.empty() || GeneratingReducedBMI);
   RecordData DelayedNamespaceRecord;
   for (NamespaceDecl *NS : DelayedNamespace) {
-    uint64_t LexicalOffset = WriteDeclContextLexicalBlock(Context, NS);
-    uint64_t VisibleOffset = 0;
-    uint64_t ModuleLocalOffset = 0;
-    uint64_t TULocalOffset = 0;
-    WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset,
-                                 TULocalOffset);
+    LookupBlockOffsets Offsets;
+
+    Offsets.LexicalOffset = WriteDeclContextLexicalBlock(Context, NS);
+    WriteDeclContextVisibleBlock(Context, NS, Offsets);
+
+    if (Offsets.LexicalOffset)
+      Offsets.LexicalOffset -= DeclTypesBlockStartOffset;
 
     // Write the offset relative to current block.
-    if (LexicalOffset)
-      LexicalOffset -= DeclTypesBlockStartOffset;
+    if (Offsets.VisibleOffset)
+      Offsets.VisibleOffset -= DeclTypesBlockStartOffset;
 
-    if (VisibleOffset)
-      VisibleOffset -= DeclTypesBlockStartOffset;
+    if (Offsets.ModuleLocalOffset)
+      Offsets.ModuleLocalOffset -= DeclTypesBlockStartOffset;
 
-    if (ModuleLocalOffset)
-      ModuleLocalOffset -= DeclTypesBlockStartOffset;
-
-    if (TULocalOffset)
-      TULocalOffset -= DeclTypesBlockStartOffset;
+    if (Offsets.TULocalOffset)
+      Offsets.TULocalOffset -= DeclTypesBlockStartOffset;
 
     AddDeclRef(NS, DelayedNamespaceRecord);
-    DelayedNamespaceRecord.push_back(LexicalOffset);
-    DelayedNamespaceRecord.push_back(VisibleOffset);
-    DelayedNamespaceRecord.push_back(ModuleLocalOffset);
-    DelayedNamespaceRecord.push_back(TULocalOffset);
+    AddLookupOffsets(Offsets, DelayedNamespaceRecord);
   }
 
   // The process of writing lexical and visible block for delayed namespace
@@ -6818,6 +6833,14 @@ TypeID ASTWriter::GetOrCreateTypeID(ASTContext &Context, QualType T) {
   });
 }
 
+void ASTWriter::AddLookupOffsets(const LookupBlockOffsets &Offsets,
+                                 RecordDataImpl &Record) {
+  Record.push_back(Offsets.LexicalOffset);
+  Record.push_back(Offsets.VisibleOffset);
+  Record.push_back(Offsets.ModuleLocalOffset);
+  Record.push_back(Offsets.TULocalOffset);
+}
+
 void ASTWriter::AddEmittedDeclRef(const Decl *D, RecordDataImpl &Record) {
   if (!wasDeclEmitted(D))
     return;
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 052cb5a253bf..2d93832a9ac3 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -2189,11 +2189,7 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) {
   static_assert(DeclContext::NumDeclContextBits == 13,
                 "You need to update the serializer after you change the "
                 "DeclContextBits");
-
-  uint64_t LexicalOffset = 0;
-  uint64_t VisibleOffset = 0;
-  uint64_t ModuleLocalOffset = 0;
-  uint64_t TULocalOffset = 0;
+  LookupBlockOffsets Offsets;
 
   if (Writer.isGeneratingReducedBMI() && isa<NamespaceDecl>(DC) &&
       cast<NamespaceDecl>(DC)->isFromExplicitGlobalModule()) {
@@ -2202,17 +2198,12 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) {
     // details.
     Writer.DelayedNamespace.push_back(cast<NamespaceDecl>(DC));
   } else {
-    LexicalOffset =
+    Offsets.LexicalOffset =
         Writer.WriteDeclContextLexicalBlock(Record.getASTContext(), DC);
-    Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC,
-                                        VisibleOffset, ModuleLocalOffset,
-                                        TULocalOffset);
+    Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC, Offsets);
   }
 
-  Record.AddOffset(LexicalOffset);
-  Record.AddOffset(VisibleOffset);
-  Record.AddOffset(ModuleLocalOffset);
-  Record.AddOffset(TULocalOffset);
+  Record.AddLookupOffsets(Offsets);
 }
 
 const Decl *ASTWriter::getFirstLocalDecl(const Decl *D) {

From 5bee2c34bde1aa8b0fb5aed7d5ce330f094f6436 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 19 Jun 2025 19:02:42 +0900
Subject: [PATCH 0917/1322] RuntimeLibcalls: Pass in FloatABI and EABI type
 (#144691)

We need the full set of ABI options to accurately compute
the full set of libcalls. This partially resolves missing
information required to compute the set of ARM calls.
---
 llvm/include/llvm/IR/RuntimeLibcalls.h  | 10 +++++++---
 llvm/lib/CodeGen/TargetLoweringBase.cpp |  3 ++-
 llvm/lib/IR/RuntimeLibcalls.cpp         | 18 +++++++++++++++---
 llvm/lib/Object/IRSymtab.cpp            |  2 +-
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 10 ----------
 5 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 3e1531ebfd9d..a6a180f5ed8d 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -19,6 +19,7 @@
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
 
@@ -53,8 +54,10 @@ static inline auto libcalls() {
 
 /// A simple container for information about the supported runtime calls.
 struct RuntimeLibcallsInfo {
-  explicit RuntimeLibcallsInfo(const Triple &TT) {
-    initLibcalls(TT);
+  explicit RuntimeLibcallsInfo(const Triple &TT,
+                               FloatABI::ABIType FloatABI = FloatABI::Default,
+                               EABI EABIVersion = EABI::Default) {
+    initLibcalls(TT, FloatABI, EABIVersion);
   }
 
   /// Rename the default libcall routine name for the specified libcall.
@@ -144,7 +147,8 @@ private:
 
   /// Set default libcall names. If a target wants to opt-out of a libcall it
   /// should be placed here.
-  LLVM_ABI void initLibcalls(const Triple &TT);
+  LLVM_ABI void initLibcalls(const Triple &TT, FloatABI::ABIType FloatABI,
+                             EABI ABIType);
 };
 
 } // namespace RTLIB
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index b1afdc2a3ac3..2b5087cd38f5 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -632,7 +632,8 @@ void RTLIB::initCmpLibcallCCs(ISD::CondCode *CmpLibcallCCs) {
 
 /// NOTE: The TargetMachine owns TLOF.
 TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
-    : TM(tm), Libcalls(TM.getTargetTriple()) {
+    : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.FloatABIType,
+                       TM.Options.EABIVersion) {
   initActions();
 
   // Perform these initializations only once.
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index a57b08919346..74dccdf172d4 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -65,7 +65,17 @@ static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
 #undef LCALLNAME5
 }
 
-static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
+static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT,
+                               FloatABI::ABIType FloatABIType,
+                               EABI EABIVersion) {
+  if (!TT.isOSDarwin() && !TT.isiOS() && !TT.isWatchOS() && !TT.isDriverKit()) {
+    CallingConv::ID DefaultCC = FloatABIType == FloatABI::Hard
+                                    ? CallingConv::ARM_AAPCS_VFP
+                                    : CallingConv::ARM_AAPCS;
+    for (RTLIB::Libcall LC : RTLIB::libcalls())
+      Info.setLibcallCallingConv(LC, DefaultCC);
+  }
+
   // Register based DivRem for AEABI (RTABI 4.2)
   if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
       TT.isTargetMuslAEABI() || TT.isOSWindows()) {
@@ -346,7 +356,9 @@ static void setLongDoubleIsF128Libm(RuntimeLibcallsInfo &Info,
 
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
-void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
+void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
+                                       FloatABI::ABIType FloatABI,
+                                       EABI EABIVersion) {
   initSoftFloatCmpLibcallPredicates();
 
   initSoftFloatCmpLibcallPredicates();
@@ -539,7 +551,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
   if (TT.isAArch64())
     setAArch64LibcallNames(*this, TT);
   else if (TT.isARM() || TT.isThumb())
-    setARMLibcallNames(*this, TT);
+    setARMLibcallNames(*this, TT, FloatABI, EABIVersion);
   else if (TT.getArch() == Triple::ArchType::avr) {
     // Division rtlib functions (not supported), use divmod functions instead
     setLibcallName(RTLIB::SDIV_I8, nullptr);
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 806477ae3de0..494ec089d7bd 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -216,7 +216,7 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) {
 static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) {
   DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols),
                                          std::end(PreservedSymbols));
-
+  // FIXME: Do we need to pass in ABI fields from TargetOptions?
   RTLIB::RuntimeLibcallsInfo Libcalls(TT);
   for (const char *Name : Libcalls.getLibcallNames()) {
     if (Name)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 6e653687dbcb..91fb7bc4578b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -515,16 +515,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
-  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
-      !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {
-    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
-
-    for (RTLIB::Libcall LC : RTLIB::libcalls()) {
-      setLibcallCallingConv(LC, IsHFTarget ? CallingConv::ARM_AAPCS_VFP
-                                           : CallingConv::ARM_AAPCS);
-    }
-  }
-
   if (Subtarget->isTargetMachO()) {
     // Uses VFP for Thumb libfuncs if available.
     if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&

From 305953a32ded8a43b22f65cf73d9214729feb1fc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 19 Jun 2025 19:05:56 +0900
Subject: [PATCH 0918/1322] MC: Move ExceptionHandling enum to Support
 (#144692)

Similar to b5967264b0fbfd502b3a7edec27409e966fb68be, we need
to use this in RuntimeLibcalls to compute the set of library
calls.
---
 llvm/include/llvm/MC/MCTargetOptions.h | 13 +------------
 llvm/include/llvm/Support/CodeGen.h    | 12 ++++++++++++
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/MC/MCTargetOptions.h b/llvm/include/llvm/MC/MCTargetOptions.h
index 3ee21d9cda4b..d95adf92b9a8 100644
--- a/llvm/include/llvm/MC/MCTargetOptions.h
+++ b/llvm/include/llvm/MC/MCTargetOptions.h
@@ -10,6 +10,7 @@
 #define LLVM_MC_MCTARGETOPTIONS_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Compression.h"
 #include <string>
@@ -17,18 +18,6 @@
 
 namespace llvm {
 
-enum class ExceptionHandling {
-  None,     ///< No exception support
-  DwarfCFI, ///< DWARF-like instruction based exceptions
-  SjLj,     ///< setjmp/longjmp based exceptions
-  ARM,      ///< ARM EHABI
-  WinEH,    ///< Windows Exception Handling
-  Wasm,     ///< WebAssembly Exception Handling
-  AIX,      ///< AIX Exception Handling
-  ZOS,      ///< z/OS MVS Exception Handling. Very similar to DwarfCFI, but the PPA1
-            ///< is used instead of an .eh_frame section.
-};
-
 enum class EmitDwarfUnwindType {
   Always,          // Always emit dwarf unwind
   NoCompactUnwind, // Only emit if compact unwind isn't available
diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h
index 90733b50385a..cd1f9167b996 100644
--- a/llvm/include/llvm/Support/CodeGen.h
+++ b/llvm/include/llvm/Support/CodeGen.h
@@ -50,6 +50,18 @@ namespace llvm {
     };
   }
 
+  enum class ExceptionHandling {
+    None,     ///< No exception support
+    DwarfCFI, ///< DWARF-like instruction based exceptions
+    SjLj,     ///< setjmp/longjmp based exceptions
+    ARM,      ///< ARM EHABI
+    WinEH,    ///< Windows Exception Handling
+    Wasm,     ///< WebAssembly Exception Handling
+    AIX,      ///< AIX Exception Handling
+    ZOS, ///< z/OS MVS Exception Handling. Very similar to DwarfCFI, but the
+         ///< PPA1 is used instead of an .eh_frame section.
+  };
+
   namespace FloatABI {
   enum ABIType {
     Default, // Target-specific (either soft or hard depending on triple, etc).

From 1c35fe4e6b2596d153da82b23d04a3779fb12730 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 19 Jun 2025 19:08:52 +0900
Subject: [PATCH 0919/1322] RuntimeLibcalls: Pass in exception handling type
 (#144696)

All of the ABI options that influence libcall decisions need
to be passed in.
---
 llvm/include/llvm/IR/RuntimeLibcalls.h  | 14 ++++++++------
 llvm/lib/CodeGen/TargetLoweringBase.cpp |  4 ++--
 llvm/lib/IR/RuntimeLibcalls.cpp         |  6 ++++++
 llvm/lib/Target/VE/VEISelLowering.cpp   |  2 --
 llvm/lib/Target/X86/X86ISelLowering.cpp |  4 ----
 5 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index a6a180f5ed8d..71f38bedf17e 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -54,10 +54,12 @@ static inline auto libcalls() {
 
 /// A simple container for information about the supported runtime calls.
 struct RuntimeLibcallsInfo {
-  explicit RuntimeLibcallsInfo(const Triple &TT,
-                               FloatABI::ABIType FloatABI = FloatABI::Default,
-                               EABI EABIVersion = EABI::Default) {
-    initLibcalls(TT, FloatABI, EABIVersion);
+  explicit RuntimeLibcallsInfo(
+      const Triple &TT,
+      ExceptionHandling ExceptionModel = ExceptionHandling::None,
+      FloatABI::ABIType FloatABI = FloatABI::Default,
+      EABI EABIVersion = EABI::Default) {
+    initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion);
   }
 
   /// Rename the default libcall routine name for the specified libcall.
@@ -147,8 +149,8 @@ private:
 
   /// Set default libcall names. If a target wants to opt-out of a libcall it
   /// should be placed here.
-  LLVM_ABI void initLibcalls(const Triple &TT, FloatABI::ABIType FloatABI,
-                             EABI ABIType);
+  LLVM_ABI void initLibcalls(const Triple &TT, ExceptionHandling ExceptionModel,
+                             FloatABI::ABIType FloatABI, EABI ABIType);
 };
 
 } // namespace RTLIB
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 2b5087cd38f5..41e73b853093 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -632,8 +632,8 @@ void RTLIB::initCmpLibcallCCs(ISD::CondCode *CmpLibcallCCs) {
 
 /// NOTE: The TargetMachine owns TLOF.
 TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
-    : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.FloatABIType,
-                       TM.Options.EABIVersion) {
+    : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.ExceptionModel,
+                       TM.Options.FloatABIType, TM.Options.EABIVersion) {
   initActions();
 
   // Perform these initializations only once.
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 74dccdf172d4..ad2904d6d2ea 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -357,6 +357,7 @@ static void setLongDoubleIsF128Libm(RuntimeLibcallsInfo &Info,
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
+                                       ExceptionHandling ExceptionModel,
                                        FloatABI::ABIType FloatABI,
                                        EABI EABIVersion) {
   initSoftFloatCmpLibcallPredicates();
@@ -373,6 +374,11 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
   if (TT.isX86() && TT.isGNUEnvironment())
     setLongDoubleIsF128Libm(*this, /*FiniteOnlyFuncs=*/true);
 
+  if (TT.isX86() || TT.isVE()) {
+    if (ExceptionModel == ExceptionHandling::SjLj)
+      setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+  }
+
   // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf".
   if (TT.isPPC()) {
     setLibcallName(RTLIB::ADD_F128, "__addkf3");
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index b5a0d26abbf8..98c5fdd13898 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -298,8 +298,6 @@ void VETargetLowering::initSPUActions() {
   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
-  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
-    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
   /// } SJLJ instructions
 
   // Intrinsic instructions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4751361c71f2..defb7730b4c7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -513,10 +513,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
 
-  // FIXME: This should be set in RuntimeLibcallsInfo
-  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
-    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
-
   // Darwin ABI issue.
   for (auto VT : { MVT::i32, MVT::i64 }) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())

From 74054cab7a3e04b323828850409343932e975737 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Thu, 19 Jun 2025 12:29:58 +0100
Subject: [PATCH 0920/1322] [HashRecognize] Make it a non-PM analysis (#144742)

Make HashRecognize a non-PassManager analysis that can be called to get
the result on-demand, creating a new getResult() entry-point. The issue
was discovered when attempting to use the analysis to perform a
transform in LoopIdiomRecognize.
---
 llvm/include/llvm/Analysis/HashRecognize.h | 14 +++-----------
 llvm/lib/Analysis/HashRecognize.cpp        | 18 +++++++++---------
 llvm/lib/Passes/PassRegistry.def           |  1 -
 3 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/llvm/include/llvm/Analysis/HashRecognize.h b/llvm/include/llvm/Analysis/HashRecognize.h
index c169383bf7b0..0361dfcd2352 100644
--- a/llvm/include/llvm/Analysis/HashRecognize.h
+++ b/llvm/include/llvm/Analysis/HashRecognize.h
@@ -84,12 +84,13 @@ class HashRecognize {
 public:
   HashRecognize(const Loop &L, ScalarEvolution &SE);
 
-  // The main analysis entry point.
+  // The main analysis entry points.
   std::variant<PolynomialInfo, ErrBits, StringRef> recognizeCRC() const;
+  std::optional<PolynomialInfo> getResult() const;
 
   // Auxilary entry point after analysis to interleave the generating polynomial
   // and return a 256-entry CRC table.
-  CRCTable genSarwateTable(const APInt &GenPoly, bool ByteOrderSwapped) const;
+  static CRCTable genSarwateTable(const APInt &GenPoly, bool ByteOrderSwapped);
 
   void print(raw_ostream &OS) const;
 
@@ -107,15 +108,6 @@ public:
   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                         LoopStandardAnalysisResults &AR, LPMUpdater &);
 };
-
-class HashRecognizeAnalysis : public AnalysisInfoMixin<HashRecognizeAnalysis> {
-  friend AnalysisInfoMixin<HashRecognizeAnalysis>;
-  static AnalysisKey Key;
-
-public:
-  using Result = HashRecognize;
-  Result run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR);
-};
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp
index d11602f92187..1a5dfbe4bed6 100644
--- a/llvm/lib/Analysis/HashRecognize.cpp
+++ b/llvm/lib/Analysis/HashRecognize.cpp
@@ -472,7 +472,7 @@ static bool checkExtractBits(const KnownBits &Known, unsigned N,
 /// polynomial. The optimization technique of table-lookup for CRC is also
 /// called the Sarwate algorithm.
 CRCTable HashRecognize::genSarwateTable(const APInt &GenPoly,
-                                        bool ByteOrderSwapped) const {
+                                        bool ByteOrderSwapped) {
   unsigned BW = GenPoly.getBitWidth();
   CRCTable Table;
   Table[0] = APInt::getZero(BW);
@@ -686,6 +686,13 @@ void HashRecognize::print(raw_ostream &OS) const {
 void HashRecognize::dump() const { print(dbgs()); }
 #endif
 
+std::optional<PolynomialInfo> HashRecognize::getResult() const {
+  auto Res = HashRecognize(L, SE).recognizeCRC();
+  if (std::holds_alternative<PolynomialInfo>(Res))
+    return std::get<PolynomialInfo>(Res);
+  return std::nullopt;
+}
+
 HashRecognize::HashRecognize(const Loop &L, ScalarEvolution &SE)
     : L(L), SE(SE) {}
 
@@ -693,13 +700,6 @@ PreservedAnalyses HashRecognizePrinterPass::run(Loop &L,
                                                 LoopAnalysisManager &AM,
                                                 LoopStandardAnalysisResults &AR,
                                                 LPMUpdater &) {
-  AM.getResult<HashRecognizeAnalysis>(L, AR).print(OS);
+  HashRecognize(L, AR.SE).print(OS);
   return PreservedAnalyses::all();
 }
-
-HashRecognize HashRecognizeAnalysis::run(Loop &L, LoopAnalysisManager &AM,
-                                         LoopStandardAnalysisResults &AR) {
-  return {L, AR.SE};
-}
-
-AnalysisKey HashRecognizeAnalysis::Key;
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index f761d0dab09a..ec14c6a9211d 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -672,7 +672,6 @@ LOOPNEST_PASS("no-op-loopnest", NoOpLoopNestPass())
 #define LOOP_ANALYSIS(NAME, CREATE_PASS)
 #endif
 LOOP_ANALYSIS("ddg", DDGAnalysis())
-LOOP_ANALYSIS("hash-recognize", HashRecognizeAnalysis())
 LOOP_ANALYSIS("iv-users", IVUsersAnalysis())
 LOOP_ANALYSIS("no-op-loop", NoOpLoopAnalysis())
 LOOP_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))

From 30824c449a893771c3f25f0eb29cfa9d2cfd4f15 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Thu, 19 Jun 2025 12:48:39 +0100
Subject: [PATCH 0921/1322] [lldb][DWARFASTParserClang] GetCXXObjectParameter
 to take DeclContext DIE parameter (#144876)

I'm trying to call `GetCXXObjectParameter` from unit-tests in a
follow-up patch and taking a `DWARFDIE` instead of `clang::DeclContext`
makes that much simpler. These should be equivalent, since all we're
trying to check is that the parent context is a record type.
---
 .../SymbolFile/DWARF/DWARFASTParserClang.cpp   | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 620501b304e6..7fc1d70898d1 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -163,14 +163,14 @@ static bool TagIsRecordType(dw_tag_t tag) {
 /// a default DWARFDIE. If \c containing_decl_ctx is not a valid
 /// C++ declaration context for class methods, assume no object
 /// parameter exists for the given \c subprogram.
-static DWARFDIE
-GetCXXObjectParameter(const DWARFDIE &subprogram,
-                      const clang::DeclContext &containing_decl_ctx) {
+static DWARFDIE GetCXXObjectParameter(const DWARFDIE &subprogram,
+                                      const DWARFDIE &decl_ctx_die) {
+  assert(subprogram);
   assert(subprogram.Tag() == DW_TAG_subprogram ||
          subprogram.Tag() == DW_TAG_inlined_subroutine ||
          subprogram.Tag() == DW_TAG_subroutine_type);
 
-  if (!DeclKindIsCXXClass(containing_decl_ctx.getDeclKind()))
+  if (!decl_ctx_die.IsStructUnionOrClass())
     return {};
 
   if (DWARFDIE object_parameter =
@@ -1304,8 +1304,7 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
   clang::CallingConv calling_convention =
       ConvertDWARFCallingConventionToClang(attrs);
 
-  const DWARFDIE object_parameter =
-      GetCXXObjectParameter(die, *containing_decl_ctx);
+  const DWARFDIE object_parameter = GetCXXObjectParameter(die, decl_ctx_die);
 
   // clang_type will get the function prototype clang type after this
   // call
@@ -2411,12 +2410,13 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) {
   DWARFDeclContext decl_ctx = die.GetDWARFDeclContext();
   sstr << decl_ctx.GetQualifiedName();
 
+  DWARFDIE decl_ctx_die;
   clang::DeclContext *containing_decl_ctx =
-      GetClangDeclContextContainingDIE(die, nullptr);
+      GetClangDeclContextContainingDIE(die, &decl_ctx_die);
   assert(containing_decl_ctx);
 
-  const unsigned cv_quals = GetCXXMethodCVQuals(
-      die, GetCXXObjectParameter(die, *containing_decl_ctx));
+  const unsigned cv_quals =
+      GetCXXMethodCVQuals(die, GetCXXObjectParameter(die, decl_ctx_die));
 
   ParseChildParameters(containing_decl_ctx, die, is_variadic,
                        has_template_params, param_types, param_names);

From 046e2f545ef568b2ce577c9172a0f147dc376071 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 19 Jun 2025 12:49:45 +0100
Subject: [PATCH 0922/1322] [LV] Add interleaving test with partial reductions
 and non-const start.

Add test coverage for mis-compile after
https://github.com/llvm/llvm-project/pull/142290.
---
 .../AArch64/partial-reduce-interleave.ll      | 154 ++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
new file mode 100644
index 000000000000..b5bc7bf80372
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=16 -force-vector-interleave=2 -mattr=+dotprod -S %s | FileCheck --check-prefix=IC2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=16 -force-vector-interleave=4 -mattr=+dotprod -S %s | FileCheck --check-prefix=IC4 %s
+
+target triple = "arm64-apple-macosx"
+
+define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.start, i64 %n) {
+; IC2-LABEL: define i32 @partial_reduce_with_non_constant_start_value(
+; IC2-SAME: ptr [[SRC:%.*]], i32 [[RDX_START:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; IC2-NEXT:  [[ENTRY:.*]]:
+; IC2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
+; IC2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC2:       [[VECTOR_PH]]:
+; IC2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
+; IC2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; IC2-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[RDX_START]], i32 0
+; IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; IC2:       [[VECTOR_BODY]]:
+; IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC2-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
+; IC2-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], %[[VECTOR_BODY]] ]
+; IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; IC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; IC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; IC2-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; IC2-NEXT:    [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; IC2-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; IC2-NEXT:    [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP4]], [[TMP4]]
+; IC2-NEXT:    [[TMP7:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP5]]
+; IC2-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
+; IC2-NEXT:    [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP7]])
+; IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; IC2-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC2-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IC2:       [[MIDDLE_BLOCK]]:
+; IC2-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; IC2-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; IC2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; IC2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC2:       [[SCALAR_PH]]:
+; IC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; IC2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC2-NEXT:    br label %[[LOOP:.*]]
+; IC2:       [[LOOP]]:
+; IC2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC2-NEXT:    [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
+; IC2-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
+; IC2-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; IC2-NEXT:    [[CONV:%.*]] = zext i8 [[L]] to i32
+; IC2-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; IC2-NEXT:    [[RDX_NEXT]] = add nsw i32 [[MUL]], [[RDX]]
+; IC2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; IC2-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IC2-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; IC2:       [[EXIT]]:
+; IC2-NEXT:    [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; IC2-NEXT:    ret i32 [[RDX_NEXT_LCSSA]]
+;
+; IC4-LABEL: define i32 @partial_reduce_with_non_constant_start_value(
+; IC4-SAME: ptr [[SRC:%.*]], i32 [[RDX_START:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; IC4-NEXT:  [[ENTRY:.*]]:
+; IC4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 64
+; IC4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC4:       [[VECTOR_PH]]:
+; IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 64
+; IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; IC4-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[RDX_START]], i32 0
+; IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; IC4:       [[VECTOR_BODY]]:
+; IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
+; IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], %[[VECTOR_BODY]] ]
+; IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], %[[VECTOR_BODY]] ]
+; IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], %[[VECTOR_BODY]] ]
+; IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
+; IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 32
+; IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 48
+; IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; IC4-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; IC4-NEXT:    [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; IC4-NEXT:    [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; IC4-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; IC4-NEXT:    [[TMP10:%.*]] = mul nuw nsw <16 x i32> [[TMP6]], [[TMP6]]
+; IC4-NEXT:    [[TMP11:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]]
+; IC4-NEXT:    [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]]
+; IC4-NEXT:    [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]]
+; IC4-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
+; IC4-NEXT:    [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
+; IC4-NEXT:    [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]])
+; IC4-NEXT:    [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]])
+; IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; IC4-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC4-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IC4:       [[MIDDLE_BLOCK]]:
+; IC4-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]]
+; IC4-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[PARTIAL_REDUCE8]], [[BIN_RDX]]
+; IC4-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[PARTIAL_REDUCE9]], [[BIN_RDX10]]
+; IC4-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; IC4-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; IC4-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4:       [[SCALAR_PH]]:
+; IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; IC4-NEXT:    br label %[[LOOP:.*]]
+; IC4:       [[LOOP]]:
+; IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4-NEXT:    [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
+; IC4-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
+; IC4-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; IC4-NEXT:    [[CONV:%.*]] = zext i8 [[L]] to i32
+; IC4-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; IC4-NEXT:    [[RDX_NEXT]] = add nsw i32 [[MUL]], [[RDX]]
+; IC4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; IC4-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IC4-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; IC4:       [[EXIT]]:
+; IC4-NEXT:    [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
+; IC4-NEXT:    ret i32 [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi i32 [ %rdx.start, %entry ], [ %rdx.next, %loop ]
+  %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv
+  %l = load i8, ptr %gep.src, align 1
+  %conv = zext i8 %l to i32
+  %mul = mul nuw nsw i32 %conv, %conv
+  %rdx.next = add nsw i32 %mul, %rdx
+  %iv.next = add nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %rdx.next
+}
+;.
+; IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

From e33f13ba4824d807e846e7783a48efd6c0bf58ee Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Thu, 19 Jun 2025 13:59:22 +0200
Subject: [PATCH 0923/1322] [mlir][arith] Add overflow flags to `arith.trunci`
 (#144863)

LLVM already supports overflow flags on `llvm.trunc` for a while. This
commit adds support for these flags to `arith.trunci`.
---
 .../include/mlir/Dialect/Arith/IR/ArithOps.td | 36 +++++++++++++++----
 .../Conversion/ArithToLLVM/ArithToLLVM.cpp    |  3 +-
 .../Dialect/Arith/IR/ArithCanonicalization.td | 12 +++----
 .../Conversion/ArithToLLVM/arith-to-llvm.mlir |  2 ++
 mlir/test/Dialect/Arith/ops.mlir              |  2 ++
 5 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index adc27ae6bdaf..993f36f556e8 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -226,7 +226,7 @@ def Arith_AddIOp : Arith_IntBinaryOpWithOverflowFlags<"addi", [Commutative]> {
     these is required to be the same type. This type may be an integer scalar type, 
     a vector whose element type is integer, or a tensor of integers.
 
-    This op supports `nuw`/`nsw` overflow flags which stands stand for
+    This op supports `nuw`/`nsw` overflow flags which stands for
     "No Unsigned Wrap" and "No Signed Wrap", respectively. If the `nuw` and/or
     `nsw` flags are present, and an unsigned/signed overflow occurs
     (respectively), the result is poison.
@@ -321,7 +321,7 @@ def Arith_SubIOp : Arith_IntBinaryOpWithOverflowFlags<"subi"> {
     these is required to be the same type. This type may be an integer scalar type,
     a vector whose element type is integer, or a tensor of integers.
 
-    This op supports `nuw`/`nsw` overflow flags which stands stand for
+    This op supports `nuw`/`nsw` overflow flags which stands for
     "No Unsigned Wrap" and "No Signed Wrap", respectively. If the `nuw` and/or
     `nsw` flags are present, and an unsigned/signed overflow occurs
     (respectively), the result is poison.
@@ -367,7 +367,7 @@ def Arith_MulIOp : Arith_IntBinaryOpWithOverflowFlags<"muli",
     these is required to be the same type. This type may be an integer scalar type,
     a vector whose element type is integer, or a tensor of integers.
 
-    This op supports `nuw`/`nsw` overflow flags which stands stand for
+    This op supports `nuw`/`nsw` overflow flags which stands for
     "No Unsigned Wrap" and "No Signed Wrap", respectively. If the `nuw` and/or
     `nsw` flags are present, and an unsigned/signed overflow occurs
     (respectively), the result is poison.
@@ -800,7 +800,7 @@ def Arith_ShLIOp : Arith_IntBinaryOpWithOverflowFlags<"shli"> {
     operand is greater or equal than the bitwidth of the first operand, then the
     operation returns poison.
 
-    This op supports `nuw`/`nsw` overflow flags which stands stand for
+    This op supports `nuw`/`nsw` overflow flags which stands for
     "No Unsigned Wrap" and "No Signed Wrap", respectively. If the `nuw` and/or
     `nsw` flags are present, and an unsigned/signed overflow occurs
     (respectively), the result is poison.
@@ -1271,7 +1271,11 @@ def Arith_ScalingExtFOp
 // TruncIOp
 //===----------------------------------------------------------------------===//
 
-def Arith_TruncIOp : Arith_IToICastOp<"trunci"> {
+def Arith_TruncIOp : Op<Arith_Dialect, "trunci",
+    [Pure, SameOperandsAndResultShape, SameInputOutputTensorDims,
+     DeclareOpInterfaceMethods<CastOpInterface>,
+     DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
+     DeclareOpInterfaceMethods<ArithIntegerOverflowFlagsInterface>]> {
   let summary = "integer truncation operation";
   let description = [{
     The integer truncation operation takes an integer input of
@@ -1279,17 +1283,37 @@ def Arith_TruncIOp : Arith_IToICastOp<"trunci"> {
     bit-width must be smaller than the input bit-width (N < M).
     The top-most (N - M) bits of the input are discarded.
 
+    This op supports `nuw`/`nsw` overflow flags which stands for "No Unsigned
+    Wrap" and "No Signed Wrap", respectively. If the nuw keyword is present,
+    and any of the truncated bits are non-zero, the result is a poison value.
+    If the nsw keyword is present, and any of the truncated bits are not the
+    same as the top bit of the truncation result, the result is a poison value.
+
     Example:
 
     ```mlir
+      // Scalar truncation.
       %1 = arith.constant 21 : i5     // %1 is 0b10101
       %2 = arith.trunci %1 : i5 to i4 // %2 is 0b0101
       %3 = arith.trunci %1 : i5 to i3 // %3 is 0b101
 
-      %5 = arith.trunci %0 : vector<2 x i32> to vector<2 x i16>
+      // Vector truncation.
+      %4 = arith.trunci %0 : vector<2 x i32> to vector<2 x i16>
+
+      // Scalar truncation with overflow flags.
+      %5 = arith.trunci %a overflow<nsw, nuw> : i32 to i16
     ```
   }];
 
+  let arguments = (ins
+      SignlessFixedWidthIntegerLike:$in,
+      DefaultValuedAttr<Arith_IntegerOverflowAttr,
+          "::mlir::arith::IntegerOverflowFlags::none">:$overflowFlags);
+  let results = (outs SignlessFixedWidthIntegerLike:$out);
+  let assemblyFormat = [{
+    $in (`overflow` `` $overflowFlags^)? attr-dict
+    `:` type($in) `to` type($out)
+  }];
   let hasFolder = 1;
   let hasCanonicalizer = 1;
   let hasVerifier = 1;
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index ced18a48766b..b8e5aa87244f 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -163,7 +163,8 @@ using ConstrainedTruncFOpLowering = ConstrainedVectorConvertToLLVMPattern<
     arith::TruncFOp, LLVM::ConstrainedFPTruncIntr, true,
     arith::AttrConverterConstrainedFPToLLVM>;
 using TruncIOpLowering =
-    VectorConvertToLLVMPattern<arith::TruncIOp, LLVM::TruncOp>;
+    VectorConvertToLLVMPattern<arith::TruncIOp, LLVM::TruncOp,
+                               arith::AttrConvertOverflowToLLVM>;
 using UIToFPOpLowering =
     VectorConvertToLLVMPattern<arith::UIToFPOp, LLVM::UIToFPOp>;
 using XOrIOpLowering = VectorConvertToLLVMPattern<arith::XOrIOp, LLVM::XOrOp>;
diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
index 13eb97a910bd..b61612436eb7 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
+++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
@@ -378,14 +378,14 @@ def TruncationMatchesShiftAmount :
 
 // trunci(extsi(x)) -> extsi(x), when only the sign-extension bits are truncated
 def TruncIExtSIToExtSI :
-    Pat<(Arith_TruncIOp:$tr (Arith_ExtSIOp:$ext $x)),
+    Pat<(Arith_TruncIOp:$tr (Arith_ExtSIOp:$ext $x), $overflow),
         (Arith_ExtSIOp $x),
         [(ValueWiderThan $ext, $tr),
          (ValueWiderThan $tr, $x)]>;
 
 // trunci(extui(x)) -> extui(x), when only the zero-extension bits are truncated
 def TruncIExtUIToExtUI :
-    Pat<(Arith_TruncIOp:$tr (Arith_ExtUIOp:$ext $x)),
+    Pat<(Arith_TruncIOp:$tr (Arith_ExtUIOp:$ext $x), $overflow),
         (Arith_ExtUIOp $x),
         [(ValueWiderThan $ext, $tr),
          (ValueWiderThan $tr, $x)]>;
@@ -393,8 +393,8 @@ def TruncIExtUIToExtUI :
 // trunci(shrsi(x, c)) -> trunci(shrui(x, c))
 def TruncIShrSIToTrunciShrUI :
     Pat<(Arith_TruncIOp:$tr
-          (Arith_ShRSIOp $x, (ConstantLikeMatcher TypedAttrInterface:$c0))),
-        (Arith_TruncIOp (Arith_ShRUIOp $x, (Arith_ConstantOp (cast<"TypedAttr"> $c0)))),
+          (Arith_ShRSIOp $x, (ConstantLikeMatcher TypedAttrInterface:$c0)), $overflow),
+        (Arith_TruncIOp (Arith_ShRUIOp $x, (Arith_ConstantOp (cast<"TypedAttr"> $c0))), $overflow),
         [(TruncationMatchesShiftAmount $x, $tr, $c0)]>;
 
 // trunci(shrui(mul(sext(x), sext(y)), c)) -> mulsi_extended(x, y)
@@ -402,7 +402,7 @@ def TruncIShrUIMulIToMulSIExtended :
     Pat<(Arith_TruncIOp:$tr (Arith_ShRUIOp
                               (Arith_MulIOp:$mul
                                 (Arith_ExtSIOp $x), (Arith_ExtSIOp $y), $ovf1),
-                              (ConstantLikeMatcher AnyAttr:$c0))),
+                              (ConstantLikeMatcher AnyAttr:$c0)), $overflow),
         (Arith_MulSIExtendedOp:$res__1 $x, $y),
       [(ValuesWithSameType $tr, $x, $y),
        (ValueWiderThan $mul, $x),
@@ -413,7 +413,7 @@ def TruncIShrUIMulIToMulUIExtended :
     Pat<(Arith_TruncIOp:$tr (Arith_ShRUIOp
                               (Arith_MulIOp:$mul
                                 (Arith_ExtUIOp $x), (Arith_ExtUIOp $y), $ovf1),
-                              (ConstantLikeMatcher AnyAttr:$c0))),
+                              (ConstantLikeMatcher AnyAttr:$c0)), $overflow),
         (Arith_MulUIExtendedOp:$res__1 $x, $y),
       [(ValuesWithSameType $tr, $x, $y),
        (ValueWiderThan $mul, $x),
diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir
index e0d974ea7404..83bdbe1f6711 100644
--- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir
+++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir
@@ -731,6 +731,8 @@ func.func @ops_supporting_overflow(%arg0: i64, %arg1: i64) {
   %2 = arith.muli %arg0, %arg1 overflow<nsw, nuw> : i64
   // CHECK: %{{.*}} = llvm.shl %{{.*}}, %{{.*}} overflow<nsw, nuw> : i64
   %3 = arith.shli %arg0, %arg1 overflow<nsw, nuw> : i64
+  // CHECK: %{{.*}} = llvm.trunc %{{.*}} overflow<nsw, nuw> : i64 to i32
+  %4 = arith.trunci %arg0 overflow<nsw, nuw> : i64 to i32
   return
 }
 
diff --git a/mlir/test/Dialect/Arith/ops.mlir b/mlir/test/Dialect/Arith/ops.mlir
index f684e02344a5..1e656e84da83 100644
--- a/mlir/test/Dialect/Arith/ops.mlir
+++ b/mlir/test/Dialect/Arith/ops.mlir
@@ -1159,5 +1159,7 @@ func.func @intflags_func(%arg0: i64, %arg1: i64) {
   %2 = arith.muli %arg0, %arg1 overflow<nsw, nuw> : i64
   // CHECK: %{{.*}} = arith.shli %{{.*}}, %{{.*}} overflow<nsw, nuw> : i64
   %3 = arith.shli %arg0, %arg1 overflow<nsw, nuw> : i64
+  // CHECK: %{{.*}} = arith.trunci %{{.*}} overflow<nsw, nuw> : i64 to i32
+  %4 = arith.trunci %arg0 overflow<nsw, nuw> : i64 to i32
   return
 }

From b73720cf6c5380854bf27d4453abf21cc87ae642 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy@ericsson.com>
Date: Thu, 19 Jun 2025 14:00:36 +0200
Subject: [PATCH 0924/1322] [analyzer] Conversion to CheckerFamily:
 DynamicTypePropagation (#144735)

This commit converts the class DynamicTypePropagation to a very simple
checker family, which has only one checker frontend -- but also supports
enabling the backend ("modeling checker") without the frontend.

As a tangentially related change, this commit adds the backend of
DynamicTypePropagation as a dependency of alpha.core.DynamicTypeChecker
in Checkers.td, because the header comment of DynamicTypeChecker.cpp
claims that it depends on DynamicTypePropagation and the source code
seems to confirm this.

(The lack of this dependency relationship didn't cause problems, because
'core.DynamicTypePropagation' is in the group 'core', so it is
practically always active. However, explicitly declaring the dependency
clarifies the fact that the separate existence of the modeling checker
is warranted.)
---
 .../clang/StaticAnalyzer/Checkers/Checkers.td | 10 ++--
 .../Checkers/DynamicTypePropagation.cpp       | 55 +++++++++----------
 2 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 211ce585fbac..2234143004b6 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -294,10 +294,12 @@ def TestAfterDivZeroChecker : Checker<"TestAfterDivZero">,
            "Either the comparison is useless or there is division by zero.">,
   Documentation<HasDocumentation>;
 
-def DynamicTypeChecker : Checker<"DynamicTypeChecker">,
-  HelpText<"Check for cases where the dynamic and the static type of an object "
-           "are unrelated.">,
-  Documentation<HasDocumentation>;
+def DynamicTypeChecker
+    : Checker<"DynamicTypeChecker">,
+      HelpText<"Check for cases where the dynamic and the static type of an "
+               "object are unrelated.">,
+      Dependencies<[DynamicTypePropagation]>,
+      Documentation<HasDocumentation>;
 
 def StackAddrAsyncEscapeChecker : Checker<"StackAddressAsyncEscape">,
   HelpText<"Check that addresses to stack memory do not escape the function">,
diff --git a/clang/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp b/clang/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
index 344be0b176c5..4982cd59b0a4 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
@@ -49,15 +49,19 @@ REGISTER_MAP_WITH_PROGRAMSTATE(MostSpecializedTypeArgsMap, SymbolRef,
                                const ObjCObjectPointerType *)
 
 namespace {
-class DynamicTypePropagation:
-    public Checker< check::PreCall,
-                    check::PostCall,
-                    check::DeadSymbols,
-                    check::PostStmt<CastExpr>,
-                    check::PostStmt<CXXNewExpr>,
-                    check::PreObjCMessage,
-                    check::PostObjCMessage > {
+class DynamicTypePropagation
+    : public CheckerFamily<check::PreCall, check::PostCall, check::DeadSymbols,
+                           check::PostStmt<CastExpr>,
+                           check::PostStmt<CXXNewExpr>, check::PreObjCMessage,
+                           check::PostObjCMessage> {
+public:
+  // This checker family implements only one frontend, but -- unlike a simple
+  // Checker -- its backend can be enabled (by the checker DynamicTypeChecker
+  // which depends on it) without enabling the frontend.
+  CheckerFrontendWithBugType ObjCGenericsChecker{
+      "Generics", categories::CoreFoundationObjectiveC};
 
+private:
   /// Return a better dynamic type if one can be derived from the cast.
   const ObjCObjectPointerType *getBetterObjCType(const Expr *CastE,
                                                  CheckerContext &C) const;
@@ -66,13 +70,6 @@ class DynamicTypePropagation:
                                               ProgramStateRef &State,
                                               CheckerContext &C) const;
 
-  mutable std::unique_ptr<BugType> ObjCGenericsBugType;
-  void initBugType() const {
-    if (!ObjCGenericsBugType)
-      ObjCGenericsBugType.reset(new BugType(
-          GenericCheckName, "Generics", categories::CoreFoundationObjectiveC));
-  }
-
   class GenericsBugVisitor : public BugReporterVisitor {
   public:
     GenericsBugVisitor(SymbolRef S) : Sym(S) {}
@@ -106,9 +103,8 @@ public:
   void checkPreObjCMessage(const ObjCMethodCall &M, CheckerContext &C) const;
   void checkPostObjCMessage(const ObjCMethodCall &M, CheckerContext &C) const;
 
-  /// This value is set to true, when the Generics checker is turned on.
-  bool CheckGenerics = false;
-  CheckerNameRef GenericCheckName;
+  /// Identifies this checker family for debugging purposes.
+  StringRef getDebugTag() const override { return "DynamicTypePropagation"; }
 };
 
 bool isObjCClassType(QualType Type) {
@@ -1026,10 +1022,9 @@ void DynamicTypePropagation::reportGenericsBug(
     const ObjCObjectPointerType *From, const ObjCObjectPointerType *To,
     ExplodedNode *N, SymbolRef Sym, CheckerContext &C,
     const Stmt *ReportedNode) const {
-  if (!CheckGenerics)
+  if (!ObjCGenericsChecker.isEnabled())
     return;
 
-  initBugType();
   SmallString<192> Buf;
   llvm::raw_svector_ostream OS(Buf);
   OS << "Conversion from value of type '";
@@ -1037,7 +1032,7 @@ void DynamicTypePropagation::reportGenericsBug(
   OS << "' to incompatible type '";
   QualType::print(To, Qualifiers(), OS, C.getLangOpts(), llvm::Twine());
   OS << "'";
-  auto R = std::make_unique<PathSensitiveBugReport>(*ObjCGenericsBugType,
+  auto R = std::make_unique<PathSensitiveBugReport>(ObjCGenericsChecker,
                                                     OS.str(), N);
   R->markInteresting(Sym);
   R->addVisitor(std::make_unique<GenericsBugVisitor>(Sym));
@@ -1102,20 +1097,22 @@ PathDiagnosticPieceRef DynamicTypePropagation::GenericsBugVisitor::VisitNode(
 }
 
 /// Register checkers.
-void ento::registerObjCGenericsChecker(CheckerManager &mgr) {
-  DynamicTypePropagation *checker = mgr.getChecker<DynamicTypePropagation>();
-  checker->CheckGenerics = true;
-  checker->GenericCheckName = mgr.getCurrentCheckerName();
+void ento::registerObjCGenericsChecker(CheckerManager &Mgr) {
+  Mgr.getChecker<DynamicTypePropagation>()->ObjCGenericsChecker.enable(Mgr);
 }
 
-bool ento::shouldRegisterObjCGenericsChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterObjCGenericsChecker(const CheckerManager &) {
   return true;
 }
 
-void ento::registerDynamicTypePropagation(CheckerManager &mgr) {
-  mgr.registerChecker<DynamicTypePropagation>();
+void ento::registerDynamicTypePropagation(CheckerManager &Mgr) {
+  // The checker 'core.DynamicTypeChecker' relies on the modeling implemented
+  // in the class 'DynamicTypePropagation', so this "modeling checker" can
+  // register the 'DynamicTypePropagation' backend for its callbacks without
+  // enabling its frontend.
+  Mgr.getChecker<DynamicTypePropagation>();
 }
 
-bool ento::shouldRegisterDynamicTypePropagation(const CheckerManager &mgr) {
+bool ento::shouldRegisterDynamicTypePropagation(const CheckerManager &) {
   return true;
 }

From 2b4d757290226e0185e17294339aae1588efd07e Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko <atrosinenko@accesssoftek.com>
Date: Thu, 19 Jun 2025 15:15:26 +0300
Subject: [PATCH 0925/1322] [BOLT] Gadget scanner: detect authentication
 oracles (#135663)

Implement the detection of authentication instructions whose results can
be inspected by an attacker to know whether authentication succeeded.

As the properties of output registers of authentication instructions are
inspected, add a second set of analysis-related classes to iterate over
the instructions in reverse order.
---
 bolt/include/bolt/Passes/PAuthGadgetScanner.h |  25 +-
 bolt/lib/Passes/PAuthGadgetScanner.cpp        | 652 +++++++++++++-
 .../AArch64/gs-pauth-authentication-oracles.s | 812 ++++++++++++++++++
 .../AArch64/gs-pauth-debug-output.s           |  84 +-
 4 files changed, 1534 insertions(+), 39 deletions(-)
 create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s

diff --git a/bolt/include/bolt/Passes/PAuthGadgetScanner.h b/bolt/include/bolt/Passes/PAuthGadgetScanner.h
index c6b9cc2eb4b9..721fd664a325 100644
--- a/bolt/include/bolt/Passes/PAuthGadgetScanner.h
+++ b/bolt/include/bolt/Passes/PAuthGadgetScanner.h
@@ -199,8 +199,7 @@ namespace PAuthGadgetScanner {
 // to distinguish intermediate and final results at the type level.
 //
 // Here is an overview of issue life-cycle:
-// * an analysis (SrcSafetyAnalysis at now, DstSafetyAnalysis will be added
-//   later to support the detection of authentication oracles) computes register
+// * an analysis (SrcSafetyAnalysis or DstSafetyAnalysis) computes register
 //   state for each instruction in the function.
 // * for each instruction, it is checked whether it is a gadget of some kind,
 //   taking the computed state into account. If a gadget is found, its kind
@@ -273,6 +272,11 @@ public:
   virtual ~ExtraInfo() {}
 };
 
+/// The set of instructions writing to the affected register in an unsafe
+/// manner.
+///
+/// This is a hint to be printed alongside the report. It should be further
+/// analyzed by the user.
 class ClobberingInfo : public ExtraInfo {
   SmallVector<MCInstReference> ClobberingInstrs;
 
@@ -282,6 +286,20 @@ public:
   void print(raw_ostream &OS, const MCInstReference Location) const override;
 };
 
+/// The set of instructions leaking the authenticated pointer before the
+/// result of authentication was checked.
+///
+/// This is a hint to be printed alongside the report. It should be further
+/// analyzed by the user.
+class LeakageInfo : public ExtraInfo {
+  SmallVector<MCInstReference> LeakingInstrs;
+
+public:
+  LeakageInfo(ArrayRef<MCInstReference> Instrs) : LeakingInstrs(Instrs) {}
+
+  void print(raw_ostream &OS, const MCInstReference Location) const override;
+};
+
 /// A brief version of a report that can be further augmented with the details.
 ///
 /// A half-baked report produced on the first run of the analysis. An extra,
@@ -322,6 +340,9 @@ class FunctionAnalysisContext {
   void findUnsafeUses(SmallVector<PartialReport<MCPhysReg>> &Reports);
   void augmentUnsafeUseReports(ArrayRef<PartialReport<MCPhysReg>> Reports);
 
+  void findUnsafeDefs(SmallVector<PartialReport<MCPhysReg>> &Reports);
+  void augmentUnsafeDefReports(ArrayRef<PartialReport<MCPhysReg>> Reports);
+
   /// Process the reports which do not have to be augmented, and remove them
   /// from Reports.
   void handleSimpleReports(SmallVector<PartialReport<MCPhysReg>> &Reports);
diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index 971ea5fdef42..7682d7fe2c54 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -152,6 +152,8 @@ public:
 //    in the gadgets to be reported. This information is used in the second run
 //    to also track which instructions last wrote to those registers.
 
+typedef SmallPtrSet<const MCInst *, 4> SetOfRelatedInsts;
+
 /// A state representing which registers are safe to use by an instruction
 /// at a given program point.
 ///
@@ -195,7 +197,7 @@ struct SrcState {
   /// pac-ret analysis, the expectation is that almost all return instructions
   /// only use register `X30`, and therefore, this vector will probably have
   /// length 1 in the second run.
-  std::vector<SmallPtrSet<const MCInst *, 4>> LastInstWritingReg;
+  std::vector<SetOfRelatedInsts> LastInstWritingReg;
 
   /// Construct an empty state.
   SrcState() {}
@@ -230,12 +232,11 @@ struct SrcState {
   bool operator!=(const SrcState &RHS) const { return !((*this) == RHS); }
 };
 
-static void
-printLastInsts(raw_ostream &OS,
-               ArrayRef<SmallPtrSet<const MCInst *, 4>> LastInstWritingReg) {
+static void printInstsShort(raw_ostream &OS,
+                            ArrayRef<SetOfRelatedInsts> Insts) {
   OS << "Insts: ";
-  for (unsigned I = 0; I < LastInstWritingReg.size(); ++I) {
-    auto &Set = LastInstWritingReg[I];
+  for (unsigned I = 0; I < Insts.size(); ++I) {
+    auto &Set = Insts[I];
     OS << "[" << I << "](";
     for (const MCInst *MCInstP : Set)
       OS << MCInstP << " ";
@@ -243,14 +244,14 @@ printLastInsts(raw_ostream &OS,
   }
 }
 
-raw_ostream &operator<<(raw_ostream &OS, const SrcState &S) {
+static raw_ostream &operator<<(raw_ostream &OS, const SrcState &S) {
   OS << "src-state<";
   if (S.empty()) {
     OS << "empty";
   } else {
     OS << "SafeToDerefRegs: " << S.SafeToDerefRegs << ", ";
     OS << "TrustedRegs: " << S.TrustedRegs << ", ";
-    printLastInsts(OS, S.LastInstWritingReg);
+    printInstsShort(OS, S.LastInstWritingReg);
   }
   OS << ">";
   return OS;
@@ -279,7 +280,7 @@ void SrcStatePrinter::print(raw_ostream &OS, const SrcState &S) const {
     OS << ", TrustedRegs: ";
     RegStatePrinter.print(OS, S.TrustedRegs);
     OS << ", ";
-    printLastInsts(OS, S.LastInstWritingReg);
+    printInstsShort(OS, S.LastInstWritingReg);
   }
   OS << ">";
 }
@@ -323,13 +324,12 @@ protected:
   DenseMap<const MCInst *, std::pair<MCPhysReg, const MCInst *>>
       CheckerSequenceInfo;
 
-  SmallPtrSet<const MCInst *, 4> &lastWritingInsts(SrcState &S,
-                                                   MCPhysReg Reg) const {
+  SetOfRelatedInsts &lastWritingInsts(SrcState &S, MCPhysReg Reg) const {
     unsigned Index = RegsToTrackInstsFor.getIndex(Reg);
     return S.LastInstWritingReg[Index];
   }
-  const SmallPtrSet<const MCInst *, 4> &lastWritingInsts(const SrcState &S,
-                                                         MCPhysReg Reg) const {
+  const SetOfRelatedInsts &lastWritingInsts(const SrcState &S,
+                                            MCPhysReg Reg) const {
     unsigned Index = RegsToTrackInstsFor.getIndex(Reg);
     return S.LastInstWritingReg[Index];
   }
@@ -433,8 +433,7 @@ protected:
     SrcStatePrinter P(BC);
     LLVM_DEBUG({
       dbgs() << "  SrcSafetyAnalysis::ComputeNext(";
-      BC.InstPrinter->printInst(&const_cast<MCInst &>(Point), 0, "", *BC.STI,
-                                dbgs());
+      BC.InstPrinter->printInst(&Point, 0, "", *BC.STI, dbgs());
       dbgs() << ", ";
       P.print(dbgs(), Cur);
       dbgs() << ")\n";
@@ -612,6 +611,42 @@ protected:
   StringRef getAnnotationName() const { return "DataflowSrcSafetyAnalysis"; }
 };
 
+/// A helper base class for implementing a simplified counterpart of a dataflow
+/// analysis for functions without CFG information.
+template <typename StateTy> class CFGUnawareAnalysis {
+  BinaryContext &BC;
+  BinaryFunction &BF;
+  MCPlusBuilder::AllocatorIdTy AllocId;
+  unsigned StateAnnotationIndex;
+
+  void cleanStateAnnotations() {
+    for (auto &I : BF.instrs())
+      BC.MIB->removeAnnotation(I.second, StateAnnotationIndex);
+  }
+
+protected:
+  CFGUnawareAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId,
+                     StringRef AnnotationName)
+      : BC(BF.getBinaryContext()), BF(BF), AllocId(AllocId) {
+    StateAnnotationIndex = BC.MIB->getOrCreateAnnotationIndex(AnnotationName);
+  }
+
+  void setState(MCInst &Inst, const StateTy &S) {
+    // Check if we need to remove an old annotation (this is the case if
+    // this is the second, detailed run of the analysis).
+    if (BC.MIB->hasAnnotation(Inst, StateAnnotationIndex))
+      BC.MIB->removeAnnotation(Inst, StateAnnotationIndex);
+    // Attach the state.
+    BC.MIB->addAnnotation(Inst, StateAnnotationIndex, S, AllocId);
+  }
+
+  const StateTy &getState(const MCInst &Inst) const {
+    return BC.MIB->getAnnotationAs<StateTy>(Inst, StateAnnotationIndex);
+  }
+
+  virtual ~CFGUnawareAnalysis() { cleanStateAnnotations(); }
+};
+
 // A simplified implementation of DataflowSrcSafetyAnalysis for functions
 // lacking CFG information.
 //
@@ -646,15 +681,10 @@ protected:
 // of instructions without labels in between. These sequences can be processed
 // the same way basic blocks are processed by data-flow analysis, assuming
 // pessimistically that all registers are unsafe at the start of each sequence.
-class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis {
+class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis,
+                                    public CFGUnawareAnalysis<SrcState> {
+  using SrcSafetyAnalysis::BC;
   BinaryFunction &BF;
-  MCPlusBuilder::AllocatorIdTy AllocId;
-  unsigned StateAnnotationIndex;
-
-  void cleanStateAnnotations() {
-    for (auto &I : BF.instrs())
-      BC.MIB->removeAnnotation(I.second, StateAnnotationIndex);
-  }
 
   /// Creates a state with all registers marked unsafe (not to be confused
   /// with empty state).
@@ -666,9 +696,8 @@ public:
   CFGUnawareSrcSafetyAnalysis(BinaryFunction &BF,
                               MCPlusBuilder::AllocatorIdTy AllocId,
                               ArrayRef<MCPhysReg> RegsToTrackInstsFor)
-      : SrcSafetyAnalysis(BF, RegsToTrackInstsFor), BF(BF), AllocId(AllocId) {
-    StateAnnotationIndex =
-        BC.MIB->getOrCreateAnnotationIndex("CFGUnawareSrcSafetyAnalysis");
+      : SrcSafetyAnalysis(BF, RegsToTrackInstsFor),
+        CFGUnawareAnalysis(BF, AllocId, "CFGUnawareSrcSafetyAnalysis"), BF(BF) {
   }
 
   void run() override {
@@ -687,12 +716,8 @@ public:
         S = createUnsafeState();
       }
 
-      // Check if we need to remove an old annotation (this is the case if
-      // this is the second, detailed, run of the analysis).
-      if (BC.MIB->hasAnnotation(Inst, StateAnnotationIndex))
-        BC.MIB->removeAnnotation(Inst, StateAnnotationIndex);
       // Attach the state *before* this instruction executes.
-      BC.MIB->addAnnotation(Inst, StateAnnotationIndex, S, AllocId);
+      setState(Inst, S);
 
       // Compute the state after this instruction executes.
       S = computeNext(Inst, S);
@@ -700,10 +725,8 @@ public:
   }
 
   const SrcState &getStateBefore(const MCInst &Inst) const override {
-    return BC.MIB->getAnnotationAs<SrcState>(Inst, StateAnnotationIndex);
+    return getState(Inst);
   }
-
-  ~CFGUnawareSrcSafetyAnalysis() { cleanStateAnnotations(); }
 };
 
 std::shared_ptr<SrcSafetyAnalysis>
@@ -717,6 +740,478 @@ SrcSafetyAnalysis::create(BinaryFunction &BF,
                                                        RegsToTrackInstsFor);
 }
 
+/// A state representing which registers are safe to be used as the destination
+/// operand of an authentication instruction.
+///
+/// Similar to SrcState, it is the responsibility of the analysis to take
+/// register aliasing into account.
+///
+/// Depending on the implementation (such as whether FEAT_FPAC is implemented
+/// by an AArch64 CPU or not), it may be possible that an authentication
+/// instruction returns an invalid pointer on failure instead of terminating
+/// the program immediately (assuming the program will crash as soon as that
+/// pointer is dereferenced). Since few bits are usually allocated for the PAC
+/// field (such as less than 16 bits on a typical AArch64 system), an attacker
+/// can try every possible signature and guess the correct one if there is a
+/// gadget that tells whether the particular pointer has a correct signature
+/// (a so called "authentication oracle"). For that reason, it should be
+/// impossible for an attacker to test if a pointer is correctly signed -
+/// either the program should be terminated on authentication failure or
+/// the result of authentication should not be accessible to an attacker.
+///
+/// Considering the instructions in forward order as they are executed, a
+/// restricted set of operations can be allowed on any register containing a
+/// value derived from the result of an authentication instruction until that
+/// value is checked not to contain the result of a failed authentication.
+/// In DstSafetyAnalysis, these rules are adapted, so that the safety property
+/// for a register is computed by iterating the instructions in backward order.
+/// Then the resulting properties are used at authentication instruction sites
+/// to check output registers and report the particular instruction if it writes
+/// to an unsafe register.
+///
+/// Another approach would be to simulate the above rules as-is, iterating over
+/// the instructions in forward direction. To make it possible to report the
+/// particular instructions as oracles, this would probably require tracking
+/// references to these instructions for each register currently containing
+/// sensitive data.
+///
+/// In DstSafetyAnalysis, the source register Xn of an instruction Inst is safe
+/// if at least one of the following is true:
+/// * Inst checks if Xn contains the result of a successful authentication and
+///   terminates the program on failure. Note that Inst can either naturally
+///   dereference Xn (load, branch, return, etc. instructions) or be the first
+///   instruction of an explicit checking sequence.
+/// * Inst performs safe address arithmetic AND both source and result
+///   registers, as well as any temporary registers, must be safe after
+///   execution of Inst (temporaries are not used on AArch64 and thus not
+///   currently supported/allowed).
+///   See MCPlusBuilder::analyzeAddressArithmeticsForPtrAuth for the details.
+/// * Inst fully overwrites Xn with a constant.
+struct DstState {
+  /// The set of registers whose values cannot be inspected by an attacker in
+  /// a way usable as an authentication oracle. The results of authentication
+  /// instructions should only be written to such registers.
+  BitVector CannotEscapeUnchecked;
+
+  /// A vector of sets, only used on the second analysis run.
+  /// Each element in this vector represents one of the tracked registers.
+  /// For each such register we track the set of first instructions that leak
+  /// the authenticated pointer before it was checked. This is intended to
+  /// provide clues on which instruction made the particular register unsafe.
+  ///
+  /// Please note that the mapping from MCPhysReg values to indexes in this
+  /// vector is provided by RegsToTrackInstsFor field of DstSafetyAnalysis.
+  std::vector<SetOfRelatedInsts> FirstInstLeakingReg;
+
+  /// Constructs an empty state.
+  DstState() {}
+
+  DstState(unsigned NumRegs, unsigned NumRegsToTrack)
+      : CannotEscapeUnchecked(NumRegs), FirstInstLeakingReg(NumRegsToTrack) {}
+
+  DstState &merge(const DstState &StateIn) {
+    if (StateIn.empty())
+      return *this;
+    if (empty())
+      return (*this = StateIn);
+
+    CannotEscapeUnchecked &= StateIn.CannotEscapeUnchecked;
+    for (unsigned I = 0; I < FirstInstLeakingReg.size(); ++I)
+      for (const MCInst *J : StateIn.FirstInstLeakingReg[I])
+        FirstInstLeakingReg[I].insert(J);
+    return *this;
+  }
+
+  /// Returns true if this object does not store state of any registers -
+  /// neither safe, nor unsafe ones.
+  bool empty() const { return CannotEscapeUnchecked.empty(); }
+
+  bool operator==(const DstState &RHS) const {
+    return CannotEscapeUnchecked == RHS.CannotEscapeUnchecked &&
+           FirstInstLeakingReg == RHS.FirstInstLeakingReg;
+  }
+  bool operator!=(const DstState &RHS) const { return !((*this) == RHS); }
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const DstState &S) {
+  OS << "dst-state<";
+  if (S.empty()) {
+    OS << "empty";
+  } else {
+    OS << "CannotEscapeUnchecked: " << S.CannotEscapeUnchecked << ", ";
+    printInstsShort(OS, S.FirstInstLeakingReg);
+  }
+  OS << ">";
+  return OS;
+}
+
+class DstStatePrinter {
+public:
+  void print(raw_ostream &OS, const DstState &S) const;
+  explicit DstStatePrinter(const BinaryContext &BC) : BC(BC) {}
+
+private:
+  const BinaryContext &BC;
+};
+
+void DstStatePrinter::print(raw_ostream &OS, const DstState &S) const {
+  RegStatePrinter RegStatePrinter(BC);
+  OS << "dst-state<";
+  if (S.empty()) {
+    assert(S.CannotEscapeUnchecked.empty());
+    assert(S.FirstInstLeakingReg.empty());
+    OS << "empty";
+  } else {
+    OS << "CannotEscapeUnchecked: ";
+    RegStatePrinter.print(OS, S.CannotEscapeUnchecked);
+    OS << ", ";
+    printInstsShort(OS, S.FirstInstLeakingReg);
+  }
+  OS << ">";
+}
+
+/// Computes which registers are safe to be written to by auth instructions.
+///
+/// This is the base class for two implementations: a dataflow-based analysis
+/// which is intended to be used for most functions and a simplified CFG-unaware
+/// version for functions without reconstructed CFG.
+class DstSafetyAnalysis {
+public:
+  DstSafetyAnalysis(BinaryFunction &BF, ArrayRef<MCPhysReg> RegsToTrackInstsFor)
+      : BC(BF.getBinaryContext()), NumRegs(BC.MRI->getNumRegs()),
+        RegsToTrackInstsFor(RegsToTrackInstsFor) {}
+
+  virtual ~DstSafetyAnalysis() {}
+
+  static std::shared_ptr<DstSafetyAnalysis>
+  create(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId,
+         ArrayRef<MCPhysReg> RegsToTrackInstsFor);
+
+  virtual void run() = 0;
+  virtual const DstState &getStateAfter(const MCInst &Inst) const = 0;
+
+protected:
+  BinaryContext &BC;
+  const unsigned NumRegs;
+
+  const TrackedRegisters RegsToTrackInstsFor;
+
+  /// Stores information about the detected instruction sequences emitted to
+  /// check an authenticated pointer. Specifically, if such sequence is detected
+  /// in a basic block, it maps the first instruction of that sequence to the
+  /// register being checked.
+  ///
+  /// As the detection of such sequences requires iterating over the adjacent
+  /// instructions, it should be done before calling computeNext(), which
+  /// operates on separate instructions.
+  DenseMap<const MCInst *, MCPhysReg> RegCheckedAt;
+
+  SetOfRelatedInsts &firstLeakingInsts(DstState &S, MCPhysReg Reg) const {
+    unsigned Index = RegsToTrackInstsFor.getIndex(Reg);
+    return S.FirstInstLeakingReg[Index];
+  }
+  const SetOfRelatedInsts &firstLeakingInsts(const DstState &S,
+                                             MCPhysReg Reg) const {
+    unsigned Index = RegsToTrackInstsFor.getIndex(Reg);
+    return S.FirstInstLeakingReg[Index];
+  }
+
+  /// Creates a state with all registers marked unsafe (not to be confused
+  /// with empty state).
+  DstState createUnsafeState() {
+    return DstState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters());
+  }
+
+  /// Returns the set of registers that can be leaked by this instruction.
+  /// A register is considered leaked if it has any intersection with any
+  /// register read by Inst. This is similar to how the set of clobbered
+  /// registers is computed, but taking input operands instead of outputs.
+  BitVector getLeakedRegs(const MCInst &Inst) const {
+    BitVector Leaked(NumRegs);
+
+    // Assume a call can read all registers.
+    if (BC.MIB->isCall(Inst)) {
+      Leaked.set();
+      return Leaked;
+    }
+
+    // Compute the set of registers overlapping with any register used by
+    // this instruction.
+
+    const MCInstrDesc &Desc = BC.MII->get(Inst.getOpcode());
+
+    for (MCPhysReg Reg : Desc.implicit_uses())
+      Leaked |= BC.MIB->getAliases(Reg, /*OnlySmaller=*/false);
+
+    for (const MCOperand &Op : BC.MIB->useOperands(Inst)) {
+      if (Op.isReg())
+        Leaked |= BC.MIB->getAliases(Op.getReg(), /*OnlySmaller=*/false);
+    }
+
+    return Leaked;
+  }
+
+  SmallVector<MCPhysReg> getRegsMadeProtected(const MCInst &Inst,
+                                              const BitVector &LeakedRegs,
+                                              const DstState &Cur) const {
+    SmallVector<MCPhysReg> Regs;
+
+    // A pointer can be checked, or
+    if (auto CheckedReg =
+            BC.MIB->getAuthCheckedReg(Inst, /*MayOverwrite=*/true))
+      Regs.push_back(*CheckedReg);
+    if (RegCheckedAt.contains(&Inst))
+      Regs.push_back(RegCheckedAt.at(&Inst));
+
+    // ... it can be used as a branch target, or
+    if (BC.MIB->isIndirectBranch(Inst) || BC.MIB->isIndirectCall(Inst)) {
+      bool IsAuthenticated;
+      MCPhysReg BranchDestReg =
+          BC.MIB->getRegUsedAsIndirectBranchDest(Inst, IsAuthenticated);
+      assert(BranchDestReg != BC.MIB->getNoRegister());
+      if (!IsAuthenticated)
+        Regs.push_back(BranchDestReg);
+    }
+
+    // ... it can be used as a return target, or
+    if (BC.MIB->isReturn(Inst)) {
+      bool IsAuthenticated = false;
+      std::optional<MCPhysReg> RetReg =
+          BC.MIB->getRegUsedAsRetDest(Inst, IsAuthenticated);
+      if (RetReg && !IsAuthenticated)
+        Regs.push_back(*RetReg);
+    }
+
+    // ... an address can be updated in a safe manner, or
+    if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Inst)) {
+      MCPhysReg DstReg, SrcReg;
+      std::tie(DstReg, SrcReg) = *DstAndSrc;
+      // Note that *all* registers containing the derived values must be safe,
+      // both source and destination ones. No temporaries are supported at now.
+      if (Cur.CannotEscapeUnchecked[SrcReg] &&
+          Cur.CannotEscapeUnchecked[DstReg])
+        Regs.push_back(SrcReg);
+    }
+
+    // ... the register can be overwritten in whole with a constant: for that
+    // purpose, look for the instructions with no register inputs (neither
+    // explicit nor implicit ones) and no side effects (to rule out reading
+    // not modelled locations).
+    const MCInstrDesc &Desc = BC.MII->get(Inst.getOpcode());
+    bool HasExplicitSrcRegs = llvm::any_of(BC.MIB->useOperands(Inst),
+                                           [](auto Op) { return Op.isReg(); });
+    if (!Desc.hasUnmodeledSideEffects() && !HasExplicitSrcRegs &&
+        Desc.implicit_uses().empty()) {
+      for (const MCOperand &Def : BC.MIB->defOperands(Inst))
+        Regs.push_back(Def.getReg());
+    }
+
+    return Regs;
+  }
+
+  DstState computeNext(const MCInst &Point, const DstState &Cur) {
+    DstStatePrinter P(BC);
+    LLVM_DEBUG({
+      dbgs() << "  DstSafetyAnalysis::ComputeNext(";
+      BC.InstPrinter->printInst(&Point, 0, "", *BC.STI, dbgs());
+      dbgs() << ", ";
+      P.print(dbgs(), Cur);
+      dbgs() << ")\n";
+    });
+
+    // If this instruction is reachable by the analysis, a non-empty state will
+    // be propagated to it sooner or later. Until then, skip computeNext().
+    if (Cur.empty()) {
+      LLVM_DEBUG(
+          { dbgs() << "Skipping computeNext(Point, Cur) as Cur is empty.\n"; });
+      return DstState();
+    }
+
+    // First, compute various properties of the instruction, taking the state
+    // after its execution into account, if necessary.
+
+    BitVector LeakedRegs = getLeakedRegs(Point);
+    SmallVector<MCPhysReg> NewProtectedRegs =
+        getRegsMadeProtected(Point, LeakedRegs, Cur);
+
+    // Then, compute the state before this instruction is executed.
+    DstState Next = Cur;
+
+    Next.CannotEscapeUnchecked.reset(LeakedRegs);
+    for (MCPhysReg Reg : RegsToTrackInstsFor.getRegisters()) {
+      if (LeakedRegs[Reg])
+        firstLeakingInsts(Next, Reg) = {&Point};
+    }
+
+    BitVector NewProtectedSubregs(NumRegs);
+    for (MCPhysReg Reg : NewProtectedRegs)
+      NewProtectedSubregs |= BC.MIB->getAliases(Reg, /*OnlySmaller=*/true);
+    Next.CannotEscapeUnchecked |= NewProtectedSubregs;
+    for (MCPhysReg Reg : RegsToTrackInstsFor.getRegisters()) {
+      if (NewProtectedSubregs[Reg])
+        firstLeakingInsts(Next, Reg).clear();
+    }
+
+    LLVM_DEBUG({
+      dbgs() << "    .. result: (";
+      P.print(dbgs(), Next);
+      dbgs() << ")\n";
+    });
+
+    return Next;
+  }
+
+public:
+  std::vector<MCInstReference> getLeakingInsts(const MCInst &Inst,
+                                               BinaryFunction &BF,
+                                               MCPhysReg LeakedReg) const {
+    const DstState &S = getStateAfter(Inst);
+
+    std::vector<MCInstReference> Result;
+    for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) {
+      MCInstReference Ref = MCInstReference::get(Inst, BF);
+      assert(Ref && "Expected Inst to be found");
+      Result.push_back(Ref);
+    }
+    return Result;
+  }
+};
+
+class DataflowDstSafetyAnalysis
+    : public DstSafetyAnalysis,
+      public DataflowAnalysis<DataflowDstSafetyAnalysis, DstState,
+                              /*Backward=*/true, DstStatePrinter> {
+  using DFParent = DataflowAnalysis<DataflowDstSafetyAnalysis, DstState, true,
+                                    DstStatePrinter>;
+  friend DFParent;
+
+  using DstSafetyAnalysis::BC;
+  using DstSafetyAnalysis::computeNext;
+
+public:
+  DataflowDstSafetyAnalysis(BinaryFunction &BF,
+                            MCPlusBuilder::AllocatorIdTy AllocId,
+                            ArrayRef<MCPhysReg> RegsToTrackInstsFor)
+      : DstSafetyAnalysis(BF, RegsToTrackInstsFor), DFParent(BF, AllocId) {}
+
+  const DstState &getStateAfter(const MCInst &Inst) const override {
+    // The dataflow analysis base class iterates backwards over the
+    // instructions, thus "after" vs. "before" difference.
+    return DFParent::getStateBefore(Inst).get();
+  }
+
+  void run() override {
+    for (BinaryBasicBlock &BB : Func) {
+      if (auto CheckerInfo = BC.MIB->getAuthCheckedReg(BB)) {
+        LLVM_DEBUG({
+          dbgs() << "Found pointer checking sequence in " << BB.getName()
+                 << ":\n";
+          traceReg(BC, "Checked register", CheckerInfo->first);
+          traceInst(BC, "First instruction", *CheckerInfo->second);
+        });
+        RegCheckedAt[CheckerInfo->second] = CheckerInfo->first;
+      }
+    }
+    DFParent::run();
+  }
+
+protected:
+  void preflight() {}
+
+  DstState getStartingStateAtBB(const BinaryBasicBlock &BB) {
+    // In general, the initial state should be empty, not everything-is-unsafe,
+    // to give a chance for some meaningful state to be propagated to BB from
+    // an indirectly reachable "exit basic block" ending with a return or tail
+    // call instruction.
+    //
+    // A basic block without any successors, on the other hand, can be
+    // pessimistically initialized to everything-is-unsafe: this will naturally
+    // handle both return and tail call instructions and is harmless for
+    // internal indirect branch instructions (such as computed gotos).
+    if (BB.succ_empty())
+      return createUnsafeState();
+
+    return DstState();
+  }
+
+  DstState getStartingStateAtPoint(const MCInst &Point) { return DstState(); }
+
+  void doConfluence(DstState &StateOut, const DstState &StateIn) {
+    DstStatePrinter P(BC);
+    LLVM_DEBUG({
+      dbgs() << "  DataflowDstSafetyAnalysis::Confluence(\n";
+      dbgs() << "    State 1: ";
+      P.print(dbgs(), StateOut);
+      dbgs() << "\n";
+      dbgs() << "    State 2: ";
+      P.print(dbgs(), StateIn);
+      dbgs() << ")\n";
+    });
+
+    StateOut.merge(StateIn);
+
+    LLVM_DEBUG({
+      dbgs() << "    merged state: ";
+      P.print(dbgs(), StateOut);
+      dbgs() << "\n";
+    });
+  }
+
+  StringRef getAnnotationName() const { return "DataflowDstSafetyAnalysis"; }
+};
+
+class CFGUnawareDstSafetyAnalysis : public DstSafetyAnalysis,
+                                    public CFGUnawareAnalysis<DstState> {
+  using DstSafetyAnalysis::BC;
+  BinaryFunction &BF;
+
+public:
+  CFGUnawareDstSafetyAnalysis(BinaryFunction &BF,
+                              MCPlusBuilder::AllocatorIdTy AllocId,
+                              ArrayRef<MCPhysReg> RegsToTrackInstsFor)
+      : DstSafetyAnalysis(BF, RegsToTrackInstsFor),
+        CFGUnawareAnalysis(BF, AllocId, "CFGUnawareDstSafetyAnalysis"), BF(BF) {
+  }
+
+  void run() override {
+    DstState S = createUnsafeState();
+    for (auto &I : llvm::reverse(BF.instrs())) {
+      MCInst &Inst = I.second;
+
+      // If Inst can change the control flow, we cannot be sure that the next
+      // instruction (to be executed in analyzed program) is the one processed
+      // on the previous iteration, thus pessimistically reset S before
+      // starting to analyze Inst.
+      if (BC.MIB->isCall(Inst) || BC.MIB->isBranch(Inst) ||
+          BC.MIB->isReturn(Inst)) {
+        LLVM_DEBUG({ traceInst(BC, "Control flow instruction", Inst); });
+        S = createUnsafeState();
+      }
+
+      // Attach the state *after* this instruction executes.
+      setState(Inst, S);
+
+      // Compute the next state.
+      S = computeNext(Inst, S);
+    }
+  }
+
+  const DstState &getStateAfter(const MCInst &Inst) const override {
+    return getState(Inst);
+  }
+};
+
+std::shared_ptr<DstSafetyAnalysis>
+DstSafetyAnalysis::create(BinaryFunction &BF,
+                          MCPlusBuilder::AllocatorIdTy AllocId,
+                          ArrayRef<MCPhysReg> RegsToTrackInstsFor) {
+  if (BF.hasCFG())
+    return std::make_shared<DataflowDstSafetyAnalysis>(BF, AllocId,
+                                                       RegsToTrackInstsFor);
+  return std::make_shared<CFGUnawareDstSafetyAnalysis>(BF, AllocId,
+                                                       RegsToTrackInstsFor);
+}
+
 // This function could return PartialReport<T>, but currently T is always
 // MCPhysReg, even though it is an implementation detail.
 static PartialReport<MCPhysReg> make_generic_report(MCInstReference Location,
@@ -808,6 +1303,37 @@ shouldReportSigningOracle(const BinaryContext &BC, const MCInstReference &Inst,
   return make_gadget_report(SigningOracleKind, Inst, *SignedReg);
 }
 
+static std::optional<PartialReport<MCPhysReg>>
+shouldReportAuthOracle(const BinaryContext &BC, const MCInstReference &Inst,
+                       const DstState &S) {
+  static const GadgetKind AuthOracleKind("authentication oracle found");
+
+  bool IsChecked = false;
+  std::optional<MCPhysReg> AuthReg =
+      BC.MIB->getWrittenAuthenticatedReg(Inst, IsChecked);
+  if (!AuthReg || IsChecked)
+    return std::nullopt;
+
+  LLVM_DEBUG({
+    traceInst(BC, "Found auth inst", Inst);
+    traceReg(BC, "Authenticated reg", *AuthReg);
+  });
+
+  if (S.empty()) {
+    LLVM_DEBUG({ dbgs() << "    DstState is empty!\n"; });
+    return make_generic_report(
+        Inst, "Warning: no state computed for an authentication instruction "
+              "(possibly unreachable)");
+  }
+
+  LLVM_DEBUG(
+      { traceRegMask(BC, "safe output registers", S.CannotEscapeUnchecked); });
+  if (S.CannotEscapeUnchecked[*AuthReg])
+    return std::nullopt;
+
+  return make_gadget_report(AuthOracleKind, Inst, *AuthReg);
+}
+
 template <typename T> static void iterateOverInstrs(BinaryFunction &BF, T Fn) {
   if (BF.hasCFG()) {
     for (BinaryBasicBlock &BB : BF)
@@ -889,6 +1415,52 @@ void FunctionAnalysisContext::augmentUnsafeUseReports(
   }
 }
 
+void FunctionAnalysisContext::findUnsafeDefs(
+    SmallVector<PartialReport<MCPhysReg>> &Reports) {
+  if (PacRetGadgetsOnly)
+    return;
+
+  auto Analysis = DstSafetyAnalysis::create(BF, AllocatorId, {});
+  LLVM_DEBUG({ dbgs() << "Running dst register safety analysis...\n"; });
+  Analysis->run();
+  LLVM_DEBUG({
+    dbgs() << "After dst register safety analysis:\n";
+    BF.dump();
+  });
+
+  iterateOverInstrs(BF, [&](MCInstReference Inst) {
+    const DstState &S = Analysis->getStateAfter(Inst);
+
+    if (auto Report = shouldReportAuthOracle(BC, Inst, S))
+      Reports.push_back(*Report);
+  });
+}
+
+void FunctionAnalysisContext::augmentUnsafeDefReports(
+    ArrayRef<PartialReport<MCPhysReg>> Reports) {
+  SmallVector<MCPhysReg> RegsToTrack = collectRegsToTrack(Reports);
+  // Re-compute the analysis with register tracking.
+  auto Analysis = DstSafetyAnalysis::create(BF, AllocatorId, RegsToTrack);
+  LLVM_DEBUG(
+      { dbgs() << "\nRunning detailed dst register safety analysis...\n"; });
+  Analysis->run();
+  LLVM_DEBUG({
+    dbgs() << "After detailed dst register safety analysis:\n";
+    BF.dump();
+  });
+
+  // Augment gadget reports.
+  for (auto &Report : Reports) {
+    MCInstReference Location = Report.Issue->Location;
+    LLVM_DEBUG({ traceInst(BC, "Attaching leakage info to", Location); });
+    assert(Report.RequestedDetails &&
+           "Should be removed by handleSimpleReports");
+    auto DetailedInfo = std::make_shared<LeakageInfo>(
+        Analysis->getLeakingInsts(Location, BF, *Report.RequestedDetails));
+    Result.Diagnostics.emplace_back(Report.Issue, DetailedInfo);
+  }
+}
+
 void FunctionAnalysisContext::handleSimpleReports(
     SmallVector<PartialReport<MCPhysReg>> &Reports) {
   // Before re-running the detailed analysis, process the reports which do not
@@ -912,6 +1484,12 @@ void FunctionAnalysisContext::run() {
   handleSimpleReports(UnsafeUses);
   if (!UnsafeUses.empty())
     augmentUnsafeUseReports(UnsafeUses);
+
+  SmallVector<PartialReport<MCPhysReg>> UnsafeDefs;
+  findUnsafeDefs(UnsafeDefs);
+  handleSimpleReports(UnsafeDefs);
+  if (!UnsafeDefs.empty())
+    augmentUnsafeDefReports(UnsafeDefs);
 }
 
 void Analysis::runOnFunction(BinaryFunction &BF,
@@ -1015,6 +1593,12 @@ void ClobberingInfo::print(raw_ostream &OS,
   printRelatedInstrs(OS, Location, ClobberingInstrs);
 }
 
+void LeakageInfo::print(raw_ostream &OS, const MCInstReference Location) const {
+  OS << "  The " << LeakingInstrs.size()
+     << " instructions that leak the affected registers are:\n";
+  printRelatedInstrs(OS, Location, LeakingInstrs);
+}
+
 void GenericDiagnostic::generateReport(raw_ostream &OS,
                                        const BinaryContext &BC) const {
   printBasicInfo(OS, BC, Text);
diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s b/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s
new file mode 100644
index 000000000000..717bf40df3d0
--- /dev/null
+++ b/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s
@@ -0,0 +1,812 @@
+// RUN: %clang %cflags -march=armv8.3-a %s -o %t.exe
+// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s
+// RUN: llvm-bolt-binary-analysis --scanners=pauth  %t.exe 2>&1 | FileCheck %s
+
+// The detection of compiler-generated explicit pointer checks is tested in
+// gs-pauth-address-checks.s, for that reason only test here "dummy-load" and
+// "high-bits-notbi" checkers, as the shortest examples of checkers that are
+// detected per-instruction and per-BB.
+
+// PACRET-NOT: authentication oracle found in function
+
+        .text
+
+        .type   sym,@function
+sym:
+        ret
+        .size sym, .-sym
+
+        .globl  callee
+        .type   callee,@function
+callee:
+        ret
+        .size callee, .-callee
+
+        .globl  good_ret
+        .type   good_ret,@function
+good_ret:
+// CHECK-NOT: good_ret
+        autia   x0, x1
+        ret     x0
+        .size good_ret, .-good_ret
+
+        .globl  good_call
+        .type   good_call,@function
+good_call:
+// CHECK-NOT: good_call
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        autia   x0, x1
+        blr     x0
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size good_call, .-good_call
+
+        .globl  good_branch
+        .type   good_branch,@function
+good_branch:
+// CHECK-NOT: good_branch
+        autia   x0, x1
+        br      x0
+        .size good_branch, .-good_branch
+
+        .globl  good_load_other_reg
+        .type   good_load_other_reg,@function
+good_load_other_reg:
+// CHECK-NOT: good_load_other_reg
+        autia   x0, x1
+        ldr     x2, [x0]
+        ret
+        .size good_load_other_reg, .-good_load_other_reg
+
+        .globl  good_load_same_reg
+        .type   good_load_same_reg,@function
+good_load_same_reg:
+// CHECK-NOT: good_load_same_reg
+        autia   x0, x1
+        ldr     x0, [x0]
+        ret
+        .size good_load_same_reg, .-good_load_same_reg
+
+        .globl  good_explicit_check
+        .type   good_explicit_check,@function
+good_explicit_check:
+// CHECK-NOT: good_explicit_check
+        autia   x0, x1
+        eor     x16, x0, x0, lsl #1
+        tbz     x16, #62, 1f
+        brk     0x1234
+1:
+        ret
+        .size good_explicit_check, .-good_explicit_check
+
+        .globl  bad_unchecked
+        .type   bad_unchecked,@function
+bad_unchecked:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unchecked, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        autia   x0, x1
+        ret
+        .size bad_unchecked, .-bad_unchecked
+
+        .globl  bad_leaked_to_subroutine
+        .type   bad_leaked_to_subroutine,@function
+bad_leaked_to_subroutine:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_leaked_to_subroutine, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      bl      callee
+// CHECK-NEXT:  This happens in the following basic block:
+// CHECK-NEXT:  {{[0-9a-f]+}}:   paciasp
+// CHECK-NEXT:  {{[0-9a-f]+}}:   stp     x29, x30, [sp, #-0x10]!
+// CHECK-NEXT:  {{[0-9a-f]+}}:   mov     x29, sp
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autia   x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   bl      callee
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldr     x2, [x0]
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldp     x29, x30, [sp], #0x10
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autiasp
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ret
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        autia   x0, x1
+        bl      callee
+        ldr     x2, [x0]
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size bad_leaked_to_subroutine, .-bad_leaked_to_subroutine
+
+        .globl  bad_unknown_usage_read
+        .type   bad_unknown_usage_read,@function
+bad_unknown_usage_read:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_read, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     x3, x0, x1
+// CHECK-NEXT:  This happens in the following basic block:
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autia   x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   mul     x3, x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldr     x2, [x0]
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ret
+        autia   x0, x1
+        // Registers are not accessible to an attacker under Pointer
+        // Authentication threat model, until spilled to memory.
+        // Thus, reporting the below MUL instruction is a false positive, since
+        // the next LDR instruction prevents any possible spilling of x3 unless
+        // the authentication succeeded. Though, rejecting anything except for
+        // a closed list of instruction types is the intended behavior of the
+        // analysis, so this false positive is by design.
+        mul     x3, x0, x1
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_read, .-bad_unknown_usage_read
+
+        .globl  bad_store_to_memory_and_wait
+        .type   bad_store_to_memory_and_wait,@function
+bad_store_to_memory_and_wait:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_store_to_memory_and_wait, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      str     x0, [x3]
+        autia   x0, x1
+        cbz     x3, 2f
+        str     x0, [x3]
+1:
+        // The thread performs a time-consuming computation while the result of
+        // authentication is accessible in memory.
+        nop
+2:
+        ldr     x2, [x0]
+        ret
+        .size bad_store_to_memory_and_wait, .-bad_store_to_memory_and_wait
+
+// FIXME: Known false negative: if no return instruction is reachable from a
+//        program point (this probably implies an infinite loop), such
+//        instruction cannot be detected as an authentication oracle.
+        .globl  bad_store_to_memory_and_hang
+        .type   bad_store_to_memory_and_hang,@function
+bad_store_to_memory_and_hang:
+// CHECK-NOT: bad_store_to_memory_and_hang
+        autia   x0, x1
+        cbz     x3, 2f
+        str     x0, [x3]
+1:
+        // The thread loops indefinitely while the result of authentication
+        // is accessible in memory.
+        b       1b
+2:
+        ldr     x2, [x0]
+        ret
+        .size bad_store_to_memory_and_hang, .-bad_store_to_memory_and_hang
+
+        .globl  bad_unknown_usage_subreg_read
+        .type   bad_unknown_usage_subreg_read,@function
+bad_unknown_usage_subreg_read:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_subreg_read, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     w3, w0, w1
+// CHECK-NEXT:  This happens in the following basic block:
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autia   x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   mul     w3, w0, w1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldr     x2, [x0]
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ret
+        autia   x0, x1
+        mul     w3, w0, w1
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_subreg_read, .-bad_unknown_usage_subreg_read
+
+        .globl  bad_unknown_usage_update
+        .type   bad_unknown_usage_update,@function
+bad_unknown_usage_update:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_update, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      movk    x0, #0x2a, lsl #16
+// CHECK-NEXT:  This happens in the following basic block:
+// CHECK-NEXT:  {{[0-9a-f]+}}:   autia   x0, x1
+// CHECK-NEXT:  {{[0-9a-f]+}}:   movk    x0, #0x2a, lsl #16
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ldr     x2, [x0]
+// CHECK-NEXT:  {{[0-9a-f]+}}:   ret
+        autia   x0, x1
+        movk    x0, #42, lsl #16 // does not overwrite x0 completely
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_update, .-bad_unknown_usage_update
+
+        .globl  good_overwrite_with_constant
+        .type   good_overwrite_with_constant,@function
+good_overwrite_with_constant:
+// CHECK-NOT: good_overwrite_with_constant
+        autia   x0, x1
+        mov     x0, #42
+        ret
+        .size good_overwrite_with_constant, .-good_overwrite_with_constant
+
+// Overwriting sensitive data by instructions with unmodelled side-effects is
+// explicitly rejected, even though this particular MRS is safe.
+        .globl  bad_overwrite_with_side_effects
+        .type   bad_overwrite_with_side_effects,@function
+bad_overwrite_with_side_effects:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_overwrite_with_side_effects, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        autia   x0, x1
+        mrs     x0, CTR_EL0
+        ret
+        .size bad_overwrite_with_side_effects, .-bad_overwrite_with_side_effects
+
+// Here the new value written by MUL to x0 is completely unrelated to the result
+// of authentication, so this is a false positive.
+// FIXME: Can/should we generalize overwriting by constant to handle such cases?
+        .globl  good_unknown_overwrite
+        .type   good_unknown_overwrite,@function
+good_unknown_overwrite:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function good_unknown_overwrite, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        autia   x0, x1
+        mul     x0, x1, x2
+        ret
+        .size good_unknown_overwrite, .-good_unknown_overwrite
+
+// This is a false positive: when a general-purpose register is written to as
+// a 32-bit register, its top 32 bits are zeroed, but according to LLVM
+// representation, the instruction only overwrites the Wn register.
+        .globl  good_wreg_overwrite
+        .type   good_wreg_overwrite,@function
+good_wreg_overwrite:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function good_wreg_overwrite, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+        autia   x0, x1
+        mov     w0, #42
+        ret
+        .size good_wreg_overwrite, .-good_wreg_overwrite
+
+        .globl  good_address_arith
+        .type   good_address_arith,@function
+good_address_arith:
+// CHECK-NOT: good_address_arith
+        autia   x0, x1
+
+        add     x1, x0, #8
+        sub     x2, x1, #16
+        mov     x3, x2
+
+        ldr     x4, [x3]
+        mov     x0, #0
+        mov     x1, #0
+        mov     x2, #0
+
+        ret
+        .size good_address_arith, .-good_address_arith
+
+        .globl  good_ret_multi_bb
+        .type   good_ret_multi_bb,@function
+good_ret_multi_bb:
+// CHECK-NOT: good_ret_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        ret     x0
+        .size good_ret_multi_bb, .-good_ret_multi_bb
+
+        .globl  good_call_multi_bb
+        .type   good_call_multi_bb,@function
+good_call_multi_bb:
+// CHECK-NOT: good_call_multi_bb
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        blr     x0
+        cbz     x1, 2f
+        nop
+2:
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size good_call_multi_bb, .-good_call_multi_bb
+
+        .globl  good_branch_multi_bb
+        .type   good_branch_multi_bb,@function
+good_branch_multi_bb:
+// CHECK-NOT: good_branch_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        br      x0
+        .size good_branch_multi_bb, .-good_branch_multi_bb
+
+        .globl  good_load_other_reg_multi_bb
+        .type   good_load_other_reg_multi_bb,@function
+good_load_other_reg_multi_bb:
+// CHECK-NOT: good_load_other_reg_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        ldr     x2, [x0]
+        cbz     x1, 2f
+        nop
+2:
+        ret
+        .size good_load_other_reg_multi_bb, .-good_load_other_reg_multi_bb
+
+        .globl  good_load_same_reg_multi_bb
+        .type   good_load_same_reg_multi_bb,@function
+good_load_same_reg_multi_bb:
+// CHECK-NOT: good_load_same_reg_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        ldr     x0, [x0]
+        cbz     x1, 2f
+        nop
+2:
+        ret
+        .size good_load_same_reg_multi_bb, .-good_load_same_reg_multi_bb
+
+        .globl  good_explicit_check_multi_bb
+        .type   good_explicit_check_multi_bb,@function
+good_explicit_check_multi_bb:
+// CHECK-NOT: good_explicit_check_multi_bb
+        autia   x0, x1
+        cbz     x1, 1f
+        nop
+1:
+        eor     x16, x0, x0, lsl #1
+        tbz     x16, #62, 2f
+        brk     0x1234
+2:
+        cbz     x1, 3f
+        nop
+3:
+        ret
+        .size good_explicit_check_multi_bb, .-good_explicit_check_multi_bb
+
+        .globl  bad_unchecked_multi_bb
+        .type   bad_unchecked_multi_bb,@function
+bad_unchecked_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unchecked_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        autia   x0, x1
+        cbz     x1, 1f
+        ldr     x2, [x0]
+1:
+        ret
+        .size bad_unchecked_multi_bb, .-bad_unchecked_multi_bb
+
+        .globl  bad_leaked_to_subroutine_multi_bb
+        .type   bad_leaked_to_subroutine_multi_bb,@function
+bad_leaked_to_subroutine_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_leaked_to_subroutine_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      bl      callee
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        autia   x0, x1
+        cbz     x1, 1f
+        ldr     x2, [x0]
+1:
+        bl      callee
+        ldr     x2, [x0]
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size bad_leaked_to_subroutine_multi_bb, .-bad_leaked_to_subroutine_multi_bb
+
+        .globl  bad_unknown_usage_read_multi_bb
+        .type   bad_unknown_usage_read_multi_bb,@function
+bad_unknown_usage_read_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_read_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     x3, x0, x1
+        autia   x0, x1
+        cbz     x3, 1f
+        mul     x3, x0, x1
+1:
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_read_multi_bb, .-bad_unknown_usage_read_multi_bb
+
+        .globl  bad_unknown_usage_subreg_read_multi_bb
+        .type   bad_unknown_usage_subreg_read_multi_bb,@function
+bad_unknown_usage_subreg_read_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_subreg_read_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     w3, w0, w1
+        autia   x0, x1
+        cbz     x3, 1f
+        mul     w3, w0, w1
+1:
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_subreg_read_multi_bb, .-bad_unknown_usage_subreg_read_multi_bb
+
+        .globl  bad_unknown_usage_update_multi_bb
+        .type   bad_unknown_usage_update_multi_bb,@function
+bad_unknown_usage_update_multi_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_update_multi_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      movk    x0, #0x2a, lsl #16
+        autia   x0, x1
+        cbz     x3, 1f
+        movk    x0, #42, lsl #16  // does not overwrite x0 completely
+1:
+        ldr     x2, [x0]
+        ret
+        .size bad_unknown_usage_update_multi_bb, .-bad_unknown_usage_update_multi_bb
+
+        .globl  good_overwrite_with_constant_multi_bb
+        .type   good_overwrite_with_constant_multi_bb,@function
+good_overwrite_with_constant_multi_bb:
+// CHECK-NOT: good_overwrite_with_constant_multi_bb
+        autia   x0, x1
+        cbz     x3, 1f
+1:
+        mov     x0, #42
+        ret
+        .size good_overwrite_with_constant_multi_bb, .-good_overwrite_with_constant_multi_bb
+
+        .globl  good_address_arith_multi_bb
+        .type   good_address_arith_multi_bb,@function
+good_address_arith_multi_bb:
+// CHECK-NOT: good_address_arith_multi_bb
+        autia   x0, x1
+        cbz     x3, 1f
+
+        add     x1, x0, #8
+        sub     x2, x1, #16
+        mov     x0, x2
+
+        mov     x1, #0
+        mov     x2, #0
+1:
+        ldr     x3, [x0]
+        ret
+        .size good_address_arith_multi_bb, .-good_address_arith_multi_bb
+
+// FIXME: Most *_nocfg test cases contain paciasp+autiasp instructions even if
+//        LR is not spilled - this is a workaround for RET instructions being
+//        reported as non-protected, because LR state is reset at every label.
+
+        .globl  good_ret_nocfg
+        .type   good_ret_nocfg,@function
+good_ret_nocfg:
+// CHECK-NOT: good_ret_nocfg
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+
+        ret     x0
+        .size good_ret_nocfg, .-good_ret_nocfg
+
+        .globl  good_call_nocfg
+        .type   good_call_nocfg,@function
+good_call_nocfg:
+// CHECK-NOT: good_call_nocfg
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        blr     x0
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size good_call_nocfg, .-good_call_nocfg
+
+        .globl  good_branch_nocfg
+        .type   good_branch_nocfg,@function
+good_branch_nocfg:
+// CHECK-NOT: good_branch_nocfg
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        br      x0
+        .size good_branch_nocfg, .-good_branch_nocfg
+
+        .globl  good_load_other_reg_nocfg
+        .type   good_load_other_reg_nocfg,@function
+good_load_other_reg_nocfg:
+// CHECK-NOT: good_load_other_reg_nocfg
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        ldr     x2, [x0]
+
+        autiasp
+        ret
+        .size good_load_other_reg_nocfg, .-good_load_other_reg_nocfg
+
+        .globl  good_load_same_reg_nocfg
+        .type   good_load_same_reg_nocfg,@function
+good_load_same_reg_nocfg:
+// CHECK-NOT: good_load_same_reg_nocfg
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        ldr     x0, [x0]
+
+        autiasp
+        ret
+        .size good_load_same_reg_nocfg, .-good_load_same_reg_nocfg
+
+// FIXME: Multi-instruction checker sequences are not supported without CFG.
+
+        .globl  bad_unchecked_nocfg
+        .type   bad_unchecked_nocfg,@function
+bad_unchecked_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unchecked_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+
+        autiasp
+        ret
+        .size bad_unchecked_nocfg, .-bad_unchecked_nocfg
+
+        .globl  bad_leaked_to_subroutine_nocfg
+        .type   bad_leaked_to_subroutine_nocfg,@function
+bad_leaked_to_subroutine_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_leaked_to_subroutine_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      bl      callee # Offset: 24
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        bl      callee
+        ldr     x2, [x0]
+
+        ldp     x29, x30, [sp], #16
+        autiasp
+        ret
+        .size bad_leaked_to_subroutine_nocfg, .-bad_leaked_to_subroutine_nocfg
+
+        .globl  bad_unknown_usage_read_nocfg
+        .type   bad_unknown_usage_read_nocfg,@function
+bad_unknown_usage_read_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_read_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     x3, x0, x1
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        mul     x3, x0, x1
+        ldr     x2, [x0]
+
+        autiasp
+        ret
+        .size bad_unknown_usage_read_nocfg, .-bad_unknown_usage_read_nocfg
+
+        .globl  bad_unknown_usage_subreg_read_nocfg
+        .type   bad_unknown_usage_subreg_read_nocfg,@function
+bad_unknown_usage_subreg_read_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_subreg_read_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      mul     w3, w0, w1
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        mul     w3, w0, w1
+        ldr     x2, [x0]
+
+        autiasp
+        ret
+        .size bad_unknown_usage_subreg_read_nocfg, .-bad_unknown_usage_subreg_read_nocfg
+
+        .globl  bad_unknown_usage_update_nocfg
+        .type   bad_unknown_usage_update_nocfg,@function
+bad_unknown_usage_update_nocfg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_unknown_usage_update_nocfg, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autia   x0, x1
+// CHECK-NEXT:  The 1 instructions that leak the affected registers are:
+// CHECK-NEXT:  1.     {{[0-9a-f]+}}:      movk    x0, #0x2a, lsl #16
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        movk    x0, #42, lsl #16  // does not overwrite x0 completely
+        ldr     x2, [x0]
+
+        autiasp
+        ret
+        .size bad_unknown_usage_update_nocfg, .-bad_unknown_usage_update_nocfg
+
+        .globl  good_overwrite_with_constant_nocfg
+        .type   good_overwrite_with_constant_nocfg,@function
+good_overwrite_with_constant_nocfg:
+// CHECK-NOT: good_overwrite_with_constant_nocfg
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        mov     x0, #42
+
+        autiasp
+        ret
+        .size good_overwrite_with_constant_nocfg, .-good_overwrite_with_constant_nocfg
+
+        .globl  good_address_arith_nocfg
+        .type   good_address_arith_nocfg,@function
+good_address_arith_nocfg:
+// CHECK-NOT: good_address_arith_nocfg
+        paciasp
+        adr     x2, 1f
+        br      x2
+1:
+        autia   x0, x1
+        add     x1, x0, #8
+        sub     x2, x1, #16
+        mov     x3, x2
+
+        ldr     x4, [x3]
+        mov     x0, #0
+        mov     x1, #0
+        mov     x2, #0
+
+        autiasp
+        ret
+        .size good_address_arith_nocfg, .-good_address_arith_nocfg
+
+        .globl  good_explicit_check_unrelated_reg
+        .type   good_explicit_check_unrelated_reg,@function
+good_explicit_check_unrelated_reg:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function good_explicit_check_unrelated_reg, basic block {{[^,]+}}, at address
+        // FIXME: The below instruction is not an authentication oracle
+        autia   x2, x3    // One of possible execution paths after this instruction
+                          // ends at BRK below, thus BRK used as a trap instruction
+                          // should formally "check everything" not to introduce
+                          // false-positive here.
+        autia   x0, x1
+        eor     x16, x0, x0, lsl #1
+        tbz     x16, #62, 1f
+        brk     0x1234
+1:
+        ldr     x4, [x2]  // Right before this instruction X2 is checked - this
+                          // should be propagated to the basic block ending with
+                          // TBZ instruction above.
+        ret
+        .size good_explicit_check_unrelated_reg, .-good_explicit_check_unrelated_reg
+
+// The last BB (in layout order) is processed first by the data-flow analysis.
+// Its initial state is usually filled in a special way (because it ends with
+// `ret` instruction), and then affects the state propagated to the other BBs
+// Thus, the case of the last instruction in a function being a jump somewhere
+// in the middle is special.
+
+        .globl  good_no_ret_from_last_bb
+        .type   good_no_ret_from_last_bb,@function
+good_no_ret_from_last_bb:
+// CHECK-NOT: good_no_ret_from_last_bb
+        paciasp
+        autiasp     // authenticates LR
+        b       2f
+1:
+        ret
+2:
+        b       1b  // LR is dereferenced by `ret`, which is executed next
+        .size good_no_ret_from_last_bb, .-good_no_ret_from_last_bb
+
+        .globl  bad_no_ret_from_last_bb
+        .type   bad_no_ret_from_last_bb,@function
+bad_no_ret_from_last_bb:
+// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_no_ret_from_last_bb, basic block {{[^,]+}}, at address
+// CHECK-NEXT:  The instruction is     {{[0-9a-f]+}}:      autiasp
+// CHECK-NEXT:  The 0 instructions that leak the affected registers are:
+        paciasp
+        autiasp     // authenticates LR
+        b       2f
+1:
+        ret     x0
+2:
+        b       1b  // X0 (but not LR) is dereferenced by `ret x0`
+        .size bad_no_ret_from_last_bb, .-bad_no_ret_from_last_bb
+
+// Test that combined auth+something instructions are not reported as
+// authentication oracles.
+
+        .globl  inst_retaa
+        .type   inst_retaa,@function
+inst_retaa:
+// CHECK-NOT: inst_retaa
+        paciasp
+        retaa
+        .size inst_retaa, .-inst_retaa
+
+        .globl  inst_blraa
+        .type   inst_blraa,@function
+inst_blraa:
+// CHECK-NOT: inst_blraa
+        paciasp
+        stp     x29, x30, [sp, #-16]!
+        mov     x29, sp
+
+        blraa   x0, x1
+
+        ldp     x29, x30, [sp], #16
+        retaa
+        .size inst_blraa, .-inst_blraa
+
+        .globl  inst_braa
+        .type   inst_braa,@function
+inst_braa:
+// CHECK-NOT: inst_braa
+        braa    x0, x1
+        .size inst_braa, .-inst_braa
+
+        .globl  inst_ldraa_no_wb
+        .type   inst_ldraa_no_wb,@function
+inst_ldraa_no_wb:
+// CHECK-NOT: inst_ldraa_no_wb
+        ldraa   x1, [x0]
+        ret
+        .size inst_ldraa_no_wb, .-inst_ldraa_no_wb
+
+        .globl  inst_ldraa_wb
+        .type   inst_ldraa_wb,@function
+inst_ldraa_wb:
+// CHECK-NOT: inst_ldraa_wb
+        ldraa   x1, [x0]!
+        ret
+        .size inst_ldraa_wb, .-inst_ldraa_wb
+
+        .globl  main
+        .type   main,@function
+main:
+        mov     x0, 0
+        ret
+        .size   main, .-main
diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s
index 82494d834a15..686557eb1e52 100644
--- a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s
+++ b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s
@@ -113,7 +113,7 @@ simple:
 // CHECK-EMPTY:
 // PAUTH-NEXT:   Found sign inst:     00000000:        paciasp # DataflowSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: >
 // PAUTH-NEXT:     Signed reg: LR
-// PAUTH-NEXT:     TrustedRegs: LR W30 W30_HI
+// PAUTH-NEXT:     TrustedRegs: LR W30 W30_HI{{[ \t]*$}}
 // PAUTH-NEXT:   Found call inst:     00000000:        blr     x0 # DataflowSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: >
 // PAUTH-NEXT:     Call destination reg: X0
 // PAUTH-NEXT:     SafeToDerefRegs: W0 X0 W0_HI{{[ \t]*$}}
@@ -220,10 +220,10 @@ nocfg:
 // CHECK-EMPTY:
 // PAUTH-NEXT:   Found call inst:     00000000:        br      x0 # UNKNOWN CONTROL FLOW # Offset: 4 # CFGUnawareSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: >
 // PAUTH-NEXT:     Call destination reg: X0
-// PAUTH-NEXT:     SafeToDerefRegs: LR W0 W30 X0 W0_HI W30_HI
+// PAUTH-NEXT:     SafeToDerefRegs: LR W0 W30 X0 W0_HI W30_HI{{[ \t]*$}}
 // CHECK-NEXT:   Found RET inst:     00000000:         ret # Offset: 8 # CFGUnawareSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: >
 // CHECK-NEXT:     RetReg: LR
-// CHECK-NEXT:     SafeToDerefRegs:
+// CHECK-NEXT:     SafeToDerefRegs:{{[ \t]*$}}
 // CHECK-EMPTY:
 // CHECK-NEXT: Running detailed src register safety analysis...
 // CHECK-NEXT:   SrcSafetyAnalysis::ComputeNext(   adr     x0, __ENTRY_nocfg@0x[[ENTRY_ADDR]], src-state<SafeToDerefRegs: LR W30 W30_HI , TrustedRegs: LR W30 W30_HI , Insts: [0]()>)
@@ -251,6 +251,84 @@ nocfg:
 // CHECK-EMPTY:
 // CHECK-NEXT:   Attaching clobbering info to:     00000000:   ret # Offset: 8 # CFGUnawareSrcSafetyAnalysis: src-state<SafeToDerefRegs: BitVector, TrustedRegs: BitVector, Insts: [0]()>
 
+        .globl  auth_oracle
+        .type   auth_oracle,@function
+auth_oracle:
+        autia   x0, x1
+        ret
+        .size auth_oracle, .-auth_oracle
+
+// CHECK-LABEL:Analyzing function auth_oracle, AllocatorId = 1
+// CHECK-NEXT: Binary Function "auth_oracle"  {
+// CHECK-NEXT:   Number      : 4
+// CHECK-NEXT:   State       : CFG constructed
+// ...
+// CHECK:        BB Layout   : [[BB0:[0-9a-zA-Z.]+]]
+// CHECK-NEXT: }
+// CHECK-NEXT: [[BB0]] (2 instructions, align : 1)
+// CHECK-NEXT:   Entry Point
+// CHECK-NEXT:     00000000:   autia   x0, x1
+// CHECK-NEXT:     00000004:   ret
+// CHECK-EMPTY:
+// CHECK-NEXT: DWARF CFI Instructions:
+// CHECK-NEXT:     <empty>
+// CHECK-NEXT: End of Function "auth_oracle"
+// CHECK-EMPTY:
+// CHECK-NEXT: Running src register safety analysis...
+// ...
+// CHECK:      After src register safety analysis:
+// CHECK-NEXT: Binary Function "auth_oracle"  {
+// ...
+// CHECK:      End of Function "auth_oracle"
+// ...
+// PAUTH:      Running dst register safety analysis...
+// PAUTH-NEXT:   DstSafetyAnalysis::ComputeNext(       ret     x30, dst-state<CannotEscapeUnchecked: , Insts: >)
+// PAUTH-NEXT:     .. result: (dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: >)
+// PAUTH-NEXT:   DstSafetyAnalysis::ComputeNext(       autia   x0, x1, dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: >)
+// PAUTH-NEXT:     .. result: (dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: >)
+// PAUTH-NEXT: After dst register safety analysis:
+// PAUTH-NEXT: Binary Function "auth_oracle"  {
+// PAUTH-NEXT:   Number      : 4
+// PAUTH-NEXT:   State       : CFG constructed
+// ...
+// PAUTH:        BB Layout   : [[BB0]]
+// PAUTH-NEXT: }
+// PAUTH-NEXT: [[BB0]] (2 instructions, align : 1)
+// PAUTH-NEXT:   Entry Point
+// PAUTH-NEXT:     00000000:   autia   x0, x1 # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: >
+// PAUTH-NEXT:     00000004:   ret # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: >
+// PAUTH-EMPTY:
+// PAUTH-NEXT: DWARF CFI Instructions:
+// PAUTH-NEXT:     <empty>
+// PAUTH-NEXT: End of Function "auth_oracle"
+// PAUTH-EMPTY:
+// PAUTH-NEXT:   Found auth inst:     00000000:        autia   x0, x1 # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: >
+// PAUTH-NEXT:     Authenticated reg: X0
+// PAUTH-NEXT:     safe output registers: LR W30 W30_HI{{[ \t]*$}}
+// PAUTH-EMPTY:
+// PAUTH-NEXT: Running detailed dst register safety analysis...
+// PAUTH-NEXT:   DstSafetyAnalysis::ComputeNext(       ret     x30, dst-state<CannotEscapeUnchecked: , Insts: [0]()>)
+// PAUTH-NEXT:     .. result: (dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: [0]()>)
+// PAUTH-NEXT:   DstSafetyAnalysis::ComputeNext(       autia   x0, x1, dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: [0]()>)
+// PAUTH-NEXT:     .. result: (dst-state<CannotEscapeUnchecked: LR W30 W30_HI , Insts: [0](0x{{[0-9a-f]+}} )>)
+// PAUTH-NEXT: After detailed dst register safety analysis:
+// PAUTH-NEXT: Binary Function "auth_oracle"  {
+// PAUTH-NEXT:   Number      : 4
+// PAUTH-NEXT:   State       : CFG constructed
+// ...
+// PAUTH:        BB Layout   : [[BB0]]
+// PAUTH-NEXT: }
+// PAUTH-NEXT: [[BB0]] (2 instructions, align : 1)
+// PAUTH-NEXT:   Entry Point
+// PAUTH-NEXT:     00000000:   autia   x0, x1 # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: [0](0x{{[0-9a-f]+}} )>
+// PAUTH-NEXT:     00000004:   ret # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: [0]()>
+// PAUTH-EMPTY:
+// PAUTH-NEXT: DWARF CFI Instructions:
+// PAUTH-NEXT:     <empty>
+// PAUTH-NEXT: End of Function "auth_oracle"
+// PAUTH-EMPTY:
+// PAUTH-NEXT:   Attaching leakage info to:     00000000:      autia   x0, x1 # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: [0](0x{{[0-9a-f]+}} )>
+
 // CHECK-LABEL:Analyzing function main, AllocatorId = 1
         .globl  main
         .type   main,@function

From 936c5566db013225dc098ff961395bb19e1bf2a4 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Thu, 19 Jun 2025 07:18:21 -0500
Subject: [PATCH 0926/1322] [flang][OpenMP] Handle REQUIRES ADMO in lowering
 (#144362)

The previous approach rewrote the atomic constructs in the AST based on
the REQUIRES ATOMIC_DEFAULT_MEM_ORDER directives. The new approach
checks for incorrect uses of REQUIRED ADMO in the semantic analysis, and
applies it in lowering, eliminating the need for a separate
tree-rewriting procedure.
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 206 +++++++++++++-----
 flang/lib/Semantics/CMakeLists.txt            |   1 -
 flang/lib/Semantics/check-omp-structure.cpp   |  19 ++
 flang/lib/Semantics/check-omp-structure.h     |   1 +
 flang/lib/Semantics/rewrite-directives.cpp    | 172 ---------------
 flang/lib/Semantics/rewrite-directives.h      |  24 --
 flang/lib/Semantics/rewrite-parse-tree.cpp    |   4 +-
 .../Lower/OpenMP/requires-admo-acqrel.f90     |  19 ++
 .../Lower/OpenMP/requires-admo-invalid1.f90   |  16 ++
 .../Lower/OpenMP/requires-admo-invalid2.f90   |  16 ++
 .../Semantics/OpenMP/requires-atomic01.f90    | 121 ----------
 .../Semantics/OpenMP/requires-atomic02.f90    | 121 ----------
 12 files changed, 220 insertions(+), 500 deletions(-)
 delete mode 100644 flang/lib/Semantics/rewrite-directives.cpp
 delete mode 100644 flang/lib/Semantics/rewrite-directives.h
 create mode 100644 flang/test/Lower/OpenMP/requires-admo-acqrel.f90
 create mode 100644 flang/test/Lower/OpenMP/requires-admo-invalid1.f90
 create mode 100644 flang/test/Lower/OpenMP/requires-admo-invalid2.f90
 delete mode 100644 flang/test/Semantics/OpenMP/requires-atomic01.f90
 delete mode 100644 flang/test/Semantics/OpenMP/requires-atomic02.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 3e865a1ee718..7ad886959727 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2722,58 +2722,129 @@ static mlir::IntegerAttr getAtomicHint(lower::AbstractConverter &converter,
   return nullptr;
 }
 
-static mlir::omp::ClauseMemoryOrderKindAttr
-getAtomicMemoryOrder(lower::AbstractConverter &converter,
-                     semantics::SemanticsContext &semaCtx,
-                     const List<Clause> &clauses) {
-  std::optional<mlir::omp::ClauseMemoryOrderKind> kind;
+static mlir::omp::ClauseMemoryOrderKind
+getMemoryOrderKind(common::OmpMemoryOrderType kind) {
+  switch (kind) {
+  case common::OmpMemoryOrderType::Acq_Rel:
+    return mlir::omp::ClauseMemoryOrderKind::Acq_rel;
+  case common::OmpMemoryOrderType::Acquire:
+    return mlir::omp::ClauseMemoryOrderKind::Acquire;
+  case common::OmpMemoryOrderType::Relaxed:
+    return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+  case common::OmpMemoryOrderType::Release:
+    return mlir::omp::ClauseMemoryOrderKind::Release;
+  case common::OmpMemoryOrderType::Seq_Cst:
+    return mlir::omp::ClauseMemoryOrderKind::Seq_cst;
+  }
+  llvm_unreachable("Unexpected kind");
+}
+
+static std::optional<mlir::omp::ClauseMemoryOrderKind>
+getMemoryOrderKind(llvm::omp::Clause clauseId) {
+  switch (clauseId) {
+  case llvm::omp::Clause::OMPC_acq_rel:
+    return mlir::omp::ClauseMemoryOrderKind::Acq_rel;
+  case llvm::omp::Clause::OMPC_acquire:
+    return mlir::omp::ClauseMemoryOrderKind::Acquire;
+  case llvm::omp::Clause::OMPC_relaxed:
+    return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+  case llvm::omp::Clause::OMPC_release:
+    return mlir::omp::ClauseMemoryOrderKind::Release;
+  case llvm::omp::Clause::OMPC_seq_cst:
+    return mlir::omp::ClauseMemoryOrderKind::Seq_cst;
+  default:
+    return std::nullopt;
+  }
+}
+
+static std::optional<mlir::omp::ClauseMemoryOrderKind>
+getMemoryOrderFromRequires(const semantics::Scope &scope) {
+  // The REQUIRES construct is only allowed in the main program scope
+  // and module scope, but seems like we also accept it in a subprogram
+  // scope.
+  // For safety, traverse all enclosing scopes and check if their symbol
+  // contains REQUIRES.
+  for (const auto *sc{&scope}; sc->kind() != semantics::Scope::Kind::Global;
+       sc = &sc->parent()) {
+    const semantics::Symbol *sym = sc->symbol();
+    if (!sym)
+      continue;
+
+    const common::OmpMemoryOrderType *admo = common::visit(
+        [](auto &&s) {
+          using WithOmpDeclarative = semantics::WithOmpDeclarative;
+          if constexpr (std::is_convertible_v<decltype(s),
+                                              const WithOmpDeclarative &>) {
+            return s.ompAtomicDefaultMemOrder();
+          }
+          return static_cast<const common::OmpMemoryOrderType *>(nullptr);
+        },
+        sym->details());
+    if (admo)
+      return getMemoryOrderKind(*admo);
+  }
+
+  return std::nullopt;
+}
+
+static std::optional<mlir::omp::ClauseMemoryOrderKind>
+getDefaultAtomicMemOrder(semantics::SemanticsContext &semaCtx) {
   unsigned version = semaCtx.langOptions().OpenMPVersion;
+  if (version > 50)
+    return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+  return std::nullopt;
+}
 
+static std::optional<mlir::omp::ClauseMemoryOrderKind>
+getAtomicMemoryOrder(semantics::SemanticsContext &semaCtx,
+                     const List<Clause> &clauses,
+                     const semantics::Scope &scope) {
   for (const Clause &clause : clauses) {
-    switch (clause.id) {
-    case llvm::omp::Clause::OMPC_acq_rel:
-      kind = mlir::omp::ClauseMemoryOrderKind::Acq_rel;
-      break;
-    case llvm::omp::Clause::OMPC_acquire:
-      kind = mlir::omp::ClauseMemoryOrderKind::Acquire;
-      break;
-    case llvm::omp::Clause::OMPC_relaxed:
-      kind = mlir::omp::ClauseMemoryOrderKind::Relaxed;
-      break;
-    case llvm::omp::Clause::OMPC_release:
-      kind = mlir::omp::ClauseMemoryOrderKind::Release;
-      break;
-    case llvm::omp::Clause::OMPC_seq_cst:
-      kind = mlir::omp::ClauseMemoryOrderKind::Seq_cst;
-      break;
-    default:
-      break;
-    }
+    if (auto maybeKind = getMemoryOrderKind(clause.id))
+      return *maybeKind;
   }
 
-  // Starting with 5.1, if no memory-order clause is present, the effect
-  // is as if "relaxed" was present.
-  if (!kind) {
-    if (version <= 50)
-      return nullptr;
-    kind = mlir::omp::ClauseMemoryOrderKind::Relaxed;
+  if (auto maybeKind = getMemoryOrderFromRequires(scope))
+    return *maybeKind;
+
+  return getDefaultAtomicMemOrder(semaCtx);
+}
+
+static mlir::omp::ClauseMemoryOrderKindAttr
+makeMemOrderAttr(lower::AbstractConverter &converter,
+                 std::optional<mlir::omp::ClauseMemoryOrderKind> maybeKind) {
+  if (maybeKind) {
+    return mlir::omp::ClauseMemoryOrderKindAttr::get(
+        converter.getFirOpBuilder().getContext(), *maybeKind);
   }
-  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  return mlir::omp::ClauseMemoryOrderKindAttr::get(builder.getContext(), *kind);
+  return nullptr;
 }
 
 static mlir::Operation * //
-genAtomicRead(lower::AbstractConverter &converter, mlir::Location loc,
+genAtomicRead(lower::AbstractConverter &converter,
+              semantics::SemanticsContext &semaCtx, mlir::Location loc,
               lower::StatementContext &stmtCtx, mlir::Value atomAddr,
               const semantics::SomeExpr &atom,
               const evaluate::Assignment &assign, mlir::IntegerAttr hint,
-              mlir::omp::ClauseMemoryOrderKindAttr memOrder,
+              std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
               fir::FirOpBuilder::InsertPoint preAt,
               fir::FirOpBuilder::InsertPoint atomicAt,
               fir::FirOpBuilder::InsertPoint postAt) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   builder.restoreInsertionPoint(preAt);
 
+  // If the atomic clause is read then the memory-order clause must
+  // not be release.
+  if (memOrder) {
+    if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Release) {
+      // Reset it back to the default.
+      memOrder = getDefaultAtomicMemOrder(semaCtx);
+    } else if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acq_rel) {
+      // The MLIR verifier doesn't like acq_rel either.
+      memOrder = mlir::omp::ClauseMemoryOrderKind::Acquire;
+    }
+  }
+
   mlir::Value storeAddr =
       fir::getBase(converter.genExprAddr(assign.lhs, stmtCtx, &loc));
   mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
@@ -2787,7 +2858,8 @@ genAtomicRead(lower::AbstractConverter &converter, mlir::Location loc,
 
   builder.restoreInsertionPoint(atomicAt);
   mlir::Operation *op = builder.create<mlir::omp::AtomicReadOp>(
-      loc, atomAddr, toAddr, mlir::TypeAttr::get(atomType), hint, memOrder);
+      loc, atomAddr, toAddr, mlir::TypeAttr::get(atomType), hint,
+      makeMemOrderAttr(converter, memOrder));
 
   if (atomType != storeType) {
     lower::ExprToValueMap overrides;
@@ -2808,17 +2880,30 @@ genAtomicRead(lower::AbstractConverter &converter, mlir::Location loc,
 }
 
 static mlir::Operation * //
-genAtomicWrite(lower::AbstractConverter &converter, mlir::Location loc,
+genAtomicWrite(lower::AbstractConverter &converter,
+               semantics::SemanticsContext &semaCtx, mlir::Location loc,
                lower::StatementContext &stmtCtx, mlir::Value atomAddr,
                const semantics::SomeExpr &atom,
                const evaluate::Assignment &assign, mlir::IntegerAttr hint,
-               mlir::omp::ClauseMemoryOrderKindAttr memOrder,
+               std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
                fir::FirOpBuilder::InsertPoint preAt,
                fir::FirOpBuilder::InsertPoint atomicAt,
                fir::FirOpBuilder::InsertPoint postAt) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   builder.restoreInsertionPoint(preAt);
 
+  // If the atomic clause is write then the memory-order clause must
+  // not be acquire.
+  if (memOrder) {
+    if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acquire) {
+      // Reset it back to the default.
+      memOrder = getDefaultAtomicMemOrder(semaCtx);
+    } else if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acq_rel) {
+      // The MLIR verifier doesn't like acq_rel either.
+      memOrder = mlir::omp::ClauseMemoryOrderKind::Release;
+    }
+  }
+
   mlir::Value value =
       fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
   mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
@@ -2826,16 +2911,17 @@ genAtomicWrite(lower::AbstractConverter &converter, mlir::Location loc,
 
   builder.restoreInsertionPoint(atomicAt);
   mlir::Operation *op = builder.create<mlir::omp::AtomicWriteOp>(
-      loc, atomAddr, converted, hint, memOrder);
+      loc, atomAddr, converted, hint, makeMemOrderAttr(converter, memOrder));
   return op;
 }
 
 static mlir::Operation *
-genAtomicUpdate(lower::AbstractConverter &converter, mlir::Location loc,
+genAtomicUpdate(lower::AbstractConverter &converter,
+                semantics::SemanticsContext &semaCtx, mlir::Location loc,
                 lower::StatementContext &stmtCtx, mlir::Value atomAddr,
                 const semantics::SomeExpr &atom,
                 const evaluate::Assignment &assign, mlir::IntegerAttr hint,
-                mlir::omp::ClauseMemoryOrderKindAttr memOrder,
+                std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
                 fir::FirOpBuilder::InsertPoint preAt,
                 fir::FirOpBuilder::InsertPoint atomicAt,
                 fir::FirOpBuilder::InsertPoint postAt) {
@@ -2858,8 +2944,8 @@ genAtomicUpdate(lower::AbstractConverter &converter, mlir::Location loc,
   }
 
   builder.restoreInsertionPoint(atomicAt);
-  auto updateOp =
-      builder.create<mlir::omp::AtomicUpdateOp>(loc, atomAddr, hint, memOrder);
+  auto updateOp = builder.create<mlir::omp::AtomicUpdateOp>(
+      loc, atomAddr, hint, makeMemOrderAttr(converter, memOrder));
 
   mlir::Region &region = updateOp->getRegion(0);
   mlir::Block *block = builder.createBlock(&region, {}, {atomType}, {loc});
@@ -2878,11 +2964,12 @@ genAtomicUpdate(lower::AbstractConverter &converter, mlir::Location loc,
 }
 
 static mlir::Operation *
-genAtomicOperation(lower::AbstractConverter &converter, mlir::Location loc,
+genAtomicOperation(lower::AbstractConverter &converter,
+                   semantics::SemanticsContext &semaCtx, mlir::Location loc,
                    lower::StatementContext &stmtCtx, int action,
                    mlir::Value atomAddr, const semantics::SomeExpr &atom,
                    const evaluate::Assignment &assign, mlir::IntegerAttr hint,
-                   mlir::omp::ClauseMemoryOrderKindAttr memOrder,
+                   std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
                    fir::FirOpBuilder::InsertPoint preAt,
                    fir::FirOpBuilder::InsertPoint atomicAt,
                    fir::FirOpBuilder::InsertPoint postAt) {
@@ -2894,14 +2981,14 @@ genAtomicOperation(lower::AbstractConverter &converter, mlir::Location loc,
   // builder's insertion point, or set it to anything specific.
   switch (action) {
   case parser::OpenMPAtomicConstruct::Analysis::Read:
-    return genAtomicRead(converter, loc, stmtCtx, atomAddr, atom, assign, hint,
-                         memOrder, preAt, atomicAt, postAt);
+    return genAtomicRead(converter, semaCtx, loc, stmtCtx, atomAddr, atom,
+                         assign, hint, memOrder, preAt, atomicAt, postAt);
   case parser::OpenMPAtomicConstruct::Analysis::Write:
-    return genAtomicWrite(converter, loc, stmtCtx, atomAddr, atom, assign, hint,
-                          memOrder, preAt, atomicAt, postAt);
+    return genAtomicWrite(converter, semaCtx, loc, stmtCtx, atomAddr, atom,
+                          assign, hint, memOrder, preAt, atomicAt, postAt);
   case parser::OpenMPAtomicConstruct::Analysis::Update:
-    return genAtomicUpdate(converter, loc, stmtCtx, atomAddr, atom, assign,
-                           hint, memOrder, preAt, atomicAt, postAt);
+    return genAtomicUpdate(converter, semaCtx, loc, stmtCtx, atomAddr, atom,
+                           assign, hint, memOrder, preAt, atomicAt, postAt);
   default:
     return nullptr;
   }
@@ -3899,8 +3986,9 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
   mlir::Value atomAddr =
       fir::getBase(converter.genExprAddr(atom, stmtCtx, &loc));
   mlir::IntegerAttr hint = getAtomicHint(converter, clauses);
-  mlir::omp::ClauseMemoryOrderKindAttr memOrder =
-      getAtomicMemoryOrder(converter, semaCtx, clauses);
+  std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder =
+      getAtomicMemoryOrder(semaCtx, clauses,
+                           semaCtx.FindScope(construct.source));
 
   if (auto *cond = get(analysis.cond)) {
     (void)cond;
@@ -3918,8 +4006,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
              "Expexcing two actions");
       (void)action0;
       (void)action1;
-      captureOp =
-          builder.create<mlir::omp::AtomicCaptureOp>(loc, hint, memOrder);
+      captureOp = builder.create<mlir::omp::AtomicCaptureOp>(
+          loc, hint, makeMemOrderAttr(converter, memOrder));
       // Set the non-atomic insertion point to before the atomic.capture.
       preAt = getInsertionPointBefore(captureOp);
 
@@ -3931,7 +4019,7 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
       atomicAt = getInsertionPointBefore(term);
       postAt = getInsertionPointAfter(captureOp);
       hint = nullptr;
-      memOrder = nullptr;
+      memOrder = std::nullopt;
     } else {
       // Non-capturing operation.
       assert(action0 != analysis.None && action1 == analysis.None &&
@@ -3943,16 +4031,16 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
     // The builder's insertion point needs to be specifically set before
     // each call to `genAtomicOperation`.
     mlir::Operation *firstOp = genAtomicOperation(
-        converter, loc, stmtCtx, analysis.op0.what, atomAddr, atom,
+        converter, semaCtx, loc, stmtCtx, analysis.op0.what, atomAddr, atom,
         *get(analysis.op0.assign), hint, memOrder, preAt, atomicAt, postAt);
     assert(firstOp && "Should have created an atomic operation");
     atomicAt = getInsertionPointAfter(firstOp);
 
     mlir::Operation *secondOp = nullptr;
     if (analysis.op1.what != analysis.None) {
-      secondOp = genAtomicOperation(converter, loc, stmtCtx, analysis.op1.what,
-                                    atomAddr, atom, *get(analysis.op1.assign),
-                                    hint, memOrder, preAt, atomicAt, postAt);
+      secondOp = genAtomicOperation(
+          converter, semaCtx, loc, stmtCtx, analysis.op1.what, atomAddr, atom,
+          *get(analysis.op1.assign), hint, memOrder, preAt, atomicAt, postAt);
     }
 
     if (construct.IsCapture()) {
diff --git a/flang/lib/Semantics/CMakeLists.txt b/flang/lib/Semantics/CMakeLists.txt
index 18c89587843a..c0fda3631c01 100644
--- a/flang/lib/Semantics/CMakeLists.txt
+++ b/flang/lib/Semantics/CMakeLists.txt
@@ -40,7 +40,6 @@ add_flang_library(FortranSemantics
   resolve-directives.cpp
   resolve-names-utils.cpp
   resolve-names.cpp
-  rewrite-directives.cpp
   rewrite-parse-tree.cpp
   runtime-type-info.cpp
   scope.cpp
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 36d4bcb5d99f..68cea6739830 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1719,6 +1719,22 @@ void OmpStructureChecker::Leave(const parser::OpenMPDepobjConstruct &x) {
 void OmpStructureChecker::Enter(const parser::OpenMPRequiresConstruct &x) {
   const auto &dir{std::get<parser::Verbatim>(x.t)};
   PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_requires);
+
+  if (visitedAtomicSource_.empty()) {
+    return;
+  }
+  const auto &clauseList{std::get<parser::OmpClauseList>(x.t)};
+  for (const parser::OmpClause &clause : clauseList.v) {
+    llvm::omp::Clause id{clause.Id()};
+    if (id == llvm::omp::Clause::OMPC_atomic_default_mem_order) {
+      parser::MessageFormattedText txt(
+          "REQUIRES directive with '%s' clause found lexically after atomic operation without a memory order clause"_err_en_US,
+          parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(id)));
+      parser::Message message(clause.source, txt);
+      message.Attach(visitedAtomicSource_, "Previous atomic construct"_en_US);
+      context_.Say(std::move(message));
+    }
+  }
 }
 
 void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) {
@@ -4056,6 +4072,9 @@ void OmpStructureChecker::CheckAtomicUpdate(
 }
 
 void OmpStructureChecker::Enter(const parser::OpenMPAtomicConstruct &x) {
+  if (visitedAtomicSource_.empty())
+    visitedAtomicSource_ = x.source;
+
   // All of the following groups have the "exclusive" property, i.e. at
   // most one clause from each group is allowed.
   // The exclusivity-checking code should eventually be unified for all
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index 2074ec611dc2..beb6e0528e81 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -360,6 +360,7 @@ private:
   };
   int directiveNest_[LastType + 1] = {0};
 
+  parser::CharBlock visitedAtomicSource_;
   SymbolSourceMap deferredNonVariables_;
 
   using LoopConstruct = std::variant<const parser::DoConstruct *,
diff --git a/flang/lib/Semantics/rewrite-directives.cpp b/flang/lib/Semantics/rewrite-directives.cpp
deleted file mode 100644
index 91b60ea151de..000000000000
--- a/flang/lib/Semantics/rewrite-directives.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-//===-- lib/Semantics/rewrite-directives.cpp ------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "rewrite-directives.h"
-#include "flang/Parser/parse-tree-visitor.h"
-#include "flang/Parser/parse-tree.h"
-#include "flang/Semantics/semantics.h"
-#include "flang/Semantics/symbol.h"
-#include "llvm/Frontend/OpenMP/OMP.h"
-#include <list>
-
-namespace Fortran::semantics {
-
-using namespace parser::literals;
-
-class DirectiveRewriteMutator {
-public:
-  explicit DirectiveRewriteMutator(SemanticsContext &context)
-      : context_{context} {}
-
-  // Default action for a parse tree node is to visit children.
-  template <typename T> bool Pre(T &) { return true; }
-  template <typename T> void Post(T &) {}
-
-protected:
-  SemanticsContext &context_;
-};
-
-// Rewrite atomic constructs to add an explicit memory ordering to all that do
-// not specify it, honoring in this way the `atomic_default_mem_order` clause of
-// the REQUIRES directive.
-class OmpRewriteMutator : public DirectiveRewriteMutator {
-public:
-  explicit OmpRewriteMutator(SemanticsContext &context)
-      : DirectiveRewriteMutator(context) {}
-
-  template <typename T> bool Pre(T &) { return true; }
-  template <typename T> void Post(T &) {}
-
-  bool Pre(parser::OpenMPAtomicConstruct &);
-  bool Pre(parser::OpenMPRequiresConstruct &);
-
-private:
-  bool atomicDirectiveDefaultOrderFound_{false};
-};
-
-bool OmpRewriteMutator::Pre(parser::OpenMPAtomicConstruct &x) {
-  // Find top-level parent of the operation.
-  Symbol *topLevelParent{[&]() {
-    Symbol *symbol{nullptr};
-    Scope *scope{&context_.FindScope(
-        std::get<parser::OmpDirectiveSpecification>(x.t).source)};
-    do {
-      if (Symbol * parent{scope->symbol()}) {
-        symbol = parent;
-      }
-      scope = &scope->parent();
-    } while (!scope->IsGlobal());
-
-    assert(symbol &&
-        "Atomic construct must be within a scope associated with a symbol");
-    return symbol;
-  }()};
-
-  // Get the `atomic_default_mem_order` clause from the top-level parent.
-  std::optional<common::OmpMemoryOrderType> defaultMemOrder;
-  common::visit(
-      [&](auto &details) {
-        if constexpr (std::is_convertible_v<decltype(&details),
-                          WithOmpDeclarative *>) {
-          if (details.has_ompAtomicDefaultMemOrder()) {
-            defaultMemOrder = *details.ompAtomicDefaultMemOrder();
-          }
-        }
-      },
-      topLevelParent->details());
-
-  if (!defaultMemOrder) {
-    return false;
-  }
-
-  auto findMemOrderClause{[](const parser::OmpClauseList &clauses) {
-    return llvm::any_of(
-        clauses.v, [](auto &clause) -> const parser::OmpClause * {
-          switch (clause.Id()) {
-          case llvm::omp::Clause::OMPC_acq_rel:
-          case llvm::omp::Clause::OMPC_acquire:
-          case llvm::omp::Clause::OMPC_relaxed:
-          case llvm::omp::Clause::OMPC_release:
-          case llvm::omp::Clause::OMPC_seq_cst:
-            return &clause;
-          default:
-            return nullptr;
-          }
-        });
-  }};
-
-  auto &dirSpec{std::get<parser::OmpDirectiveSpecification>(x.t)};
-  auto &clauseList{std::get<std::optional<parser::OmpClauseList>>(dirSpec.t)};
-  if (clauseList) {
-    if (findMemOrderClause(*clauseList)) {
-      return false;
-    }
-  } else {
-    clauseList = parser::OmpClauseList(decltype(parser::OmpClauseList::v){});
-  }
-
-  // Add a memory order clause to the atomic directive.
-  atomicDirectiveDefaultOrderFound_ = true;
-  llvm::omp::Clause kind{x.GetKind()};
-  switch (*defaultMemOrder) {
-  case common::OmpMemoryOrderType::Acq_Rel:
-    // FIXME: Implement 5.0 rules, pending clarification on later spec
-    // versions.
-    // [5.0:62:22-26]
-    if (kind == llvm::omp::Clause::OMPC_read) {
-      clauseList->v.emplace_back(
-          parser::OmpClause{parser::OmpClause::Acquire{}});
-    } else if (kind == llvm::omp::Clause::OMPC_update && x.IsCapture()) {
-      clauseList->v.emplace_back(
-          parser::OmpClause{parser::OmpClause::AcqRel{}});
-    } else {
-      clauseList->v.emplace_back(
-          parser::OmpClause{parser::OmpClause::Release{}});
-    }
-    break;
-  case common::OmpMemoryOrderType::Relaxed:
-    clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::Relaxed{}});
-    break;
-  case common::OmpMemoryOrderType::Seq_Cst:
-    clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::SeqCst{}});
-    break;
-  default:
-    // FIXME: Don't process other values at the moment since their validity
-    // depends on the OpenMP version (which is unavailable here).
-    break;
-  }
-
-  return false;
-}
-
-bool OmpRewriteMutator::Pre(parser::OpenMPRequiresConstruct &x) {
-  for (parser::OmpClause &clause : std::get<parser::OmpClauseList>(x.t).v) {
-    if (std::holds_alternative<parser::OmpClause::AtomicDefaultMemOrder>(
-            clause.u) &&
-        atomicDirectiveDefaultOrderFound_) {
-      context_.Say(clause.source,
-          "REQUIRES directive with '%s' clause found lexically after atomic "
-          "operation without a memory order clause"_err_en_US,
-          parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(
-              llvm::omp::OMPC_atomic_default_mem_order)
-                                         .str()));
-    }
-  }
-  return false;
-}
-
-bool RewriteOmpParts(SemanticsContext &context, parser::Program &program) {
-  if (!context.IsEnabled(common::LanguageFeature::OpenMP)) {
-    return true;
-  }
-  OmpRewriteMutator ompMutator{context};
-  parser::Walk(program, ompMutator);
-  return !context.AnyFatalError();
-}
-
-} // namespace Fortran::semantics
diff --git a/flang/lib/Semantics/rewrite-directives.h b/flang/lib/Semantics/rewrite-directives.h
deleted file mode 100644
index 675962192842..000000000000
--- a/flang/lib/Semantics/rewrite-directives.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===-- lib/Semantics/rewrite-directives.h ----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef FORTRAN_SEMANTICS_REWRITE_DIRECTIVES_H_
-#define FORTRAN_SEMANTICS_REWRITE_DIRECTIVES_H_
-
-namespace Fortran::parser {
-struct Program;
-} // namespace Fortran::parser
-
-namespace Fortran::semantics {
-class SemanticsContext;
-} // namespace Fortran::semantics
-
-namespace Fortran::semantics {
-bool RewriteOmpParts(SemanticsContext &, parser::Program &);
-} // namespace Fortran::semantics
-
-#endif // FORTRAN_SEMANTICS_REWRITE_DIRECTIVES_H_
diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp
index 577558e7e33b..4eeb1b9ed3c1 100644
--- a/flang/lib/Semantics/rewrite-parse-tree.cpp
+++ b/flang/lib/Semantics/rewrite-parse-tree.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "rewrite-parse-tree.h"
-#include "rewrite-directives.h"
+
 #include "flang/Common/indirection.h"
 #include "flang/Parser/parse-tree-visitor.h"
 #include "flang/Parser/parse-tree.h"
@@ -229,7 +229,7 @@ void RewriteMutator::Post(parser::WriteStmt &x) {
 bool RewriteParseTree(SemanticsContext &context, parser::Program &program) {
   RewriteMutator mutator{context};
   parser::Walk(program, mutator);
-  return !context.AnyFatalError() && RewriteOmpParts(context, program);
+  return !context.AnyFatalError();
 }
 
 } // namespace Fortran::semantics
diff --git a/flang/test/Lower/OpenMP/requires-admo-acqrel.f90 b/flang/test/Lower/OpenMP/requires-admo-acqrel.f90
new file mode 100644
index 000000000000..525a846f410d
--- /dev/null
+++ b/flang/test/Lower/OpenMP/requires-admo-acqrel.f90
@@ -0,0 +1,19 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s
+
+module m
+!$omp requires atomic_default_mem_order(acq_rel)
+
+contains
+
+subroutine f00(x, v)
+  integer :: x, v
+!CHECK: omp.atomic.read %{{[ %#=0-9]+}} memory_order(acquire)
+  !$omp atomic read
+    v = x
+
+!CHECK: omp.atomic.write %{{[ %#=0-9]+}} memory_order(release)
+  !$omp atomic write
+    x = v
+end
+
+end module
diff --git a/flang/test/Lower/OpenMP/requires-admo-invalid1.f90 b/flang/test/Lower/OpenMP/requires-admo-invalid1.f90
new file mode 100644
index 000000000000..b21d3bbbc786
--- /dev/null
+++ b/flang/test/Lower/OpenMP/requires-admo-invalid1.f90
@@ -0,0 +1,16 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s
+
+module m
+!$omp requires atomic_default_mem_order(acquire)
+
+contains
+
+subroutine f00(x, v)
+  integer :: x, v
+!CHECK: omp.atomic.write %{{[ %#=0-9]+}} memory_order(relaxed)
+  !$omp atomic write
+    x = v
+end
+
+end module
+
diff --git a/flang/test/Lower/OpenMP/requires-admo-invalid2.f90 b/flang/test/Lower/OpenMP/requires-admo-invalid2.f90
new file mode 100644
index 000000000000..33caa25dcc64
--- /dev/null
+++ b/flang/test/Lower/OpenMP/requires-admo-invalid2.f90
@@ -0,0 +1,16 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s
+
+module m
+!$omp requires atomic_default_mem_order(release)
+
+contains
+
+subroutine f00(x, v)
+  integer :: x, v
+!CHECK: omp.atomic.read {{[ %#=0-9]+}} memory_order(relaxed)
+  !$omp atomic read
+    v = x
+end
+
+end module
+
diff --git a/flang/test/Semantics/OpenMP/requires-atomic01.f90 b/flang/test/Semantics/OpenMP/requires-atomic01.f90
deleted file mode 100644
index e8817c3f5ef6..000000000000
--- a/flang/test/Semantics/OpenMP/requires-atomic01.f90
+++ /dev/null
@@ -1,121 +0,0 @@
-! RUN: %flang_fc1 -fopenmp -fopenmp-version=50 -fdebug-dump-parse-tree %s 2>&1 | FileCheck %s
-! Ensure that requires atomic_default_mem_order is used to update atomic
-! operations with no explicit memory order set.
-program requires
-  implicit none
-  !$omp requires atomic_default_mem_order(seq_cst)
-  integer :: i, j
-
-  ! ----------------------------------------------------------------------------
-  ! READ
-  ! ----------------------------------------------------------------------------
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Read
-  ! CHECK: OmpClause -> SeqCst
-  !$omp atomic read
-  i = j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> SeqCst
-  ! CHECK: OmpClause -> Relaxed
-  ! CHECK: OmpClause -> Read
-  !$omp atomic relaxed read
-  i = j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Read
-  ! CHECK-NOT: OmpClause -> SeqCst
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic read relaxed
-  i = j
-  
-  ! ----------------------------------------------------------------------------
-  ! WRITE
-  ! ----------------------------------------------------------------------------
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Write
-  ! CHECK: OmpClause -> SeqCst
-  !$omp atomic write
-  i = j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> SeqCst
-  ! CHECK: OmpClause -> Relaxed
-  ! CHECK: OmpClause -> Write
-  !$omp atomic relaxed write
-  i = j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Write
-  ! CHECK-NOT: OmpClause -> SeqCst
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic write relaxed
-  i = j
-
-  ! ----------------------------------------------------------------------------
-  ! UPDATE
-  ! ----------------------------------------------------------------------------
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Update
-  ! CHECK: OmpClause -> SeqCst
-  !$omp atomic update
-  i = i + j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> SeqCst
-  ! CHECK: OmpClause -> Relaxed
-  ! CHECK: OmpClause -> Update
-  !$omp atomic relaxed update
-  i = i + j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Update
-  ! CHECK-NOT: OmpClause -> SeqCst
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic update relaxed
-  i = i + j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> SeqCst
-  !$omp atomic
-  i = i + j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> SeqCst
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic relaxed
-  i = i + j
-
-  ! ----------------------------------------------------------------------------
-  ! CAPTURE
-  ! ----------------------------------------------------------------------------
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Capture
-  ! CHECK: OmpClause -> SeqCst
-  !$omp atomic capture
-  i = j
-  j = j + 1
-  !$omp end atomic
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> SeqCst
-  ! CHECK: OmpClause -> Relaxed
-  ! CHECK: OmpClause -> Capture
-  !$omp atomic relaxed capture
-  i = j
-  j = j + 1
-  !$omp end atomic
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Capture
-  ! CHECK-NOT: OmpClause -> SeqCst
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic capture relaxed
-  i = j
-  j = j + 1
-  !$omp end atomic
-end program requires
diff --git a/flang/test/Semantics/OpenMP/requires-atomic02.f90 b/flang/test/Semantics/OpenMP/requires-atomic02.f90
deleted file mode 100644
index 04a9b7a09aa9..000000000000
--- a/flang/test/Semantics/OpenMP/requires-atomic02.f90
+++ /dev/null
@@ -1,121 +0,0 @@
-! RUN: %flang_fc1 -fopenmp -fopenmp-version=50 -fdebug-dump-parse-tree %s 2>&1 | FileCheck %s
-! Ensure that requires atomic_default_mem_order is used to update atomic
-! operations with no explicit memory order set. ACQ_REL clause tested here.
-program requires
-  implicit none
-  !$omp requires atomic_default_mem_order(acq_rel)
-  integer :: i, j
-
-  ! ----------------------------------------------------------------------------
-  ! READ
-  ! ----------------------------------------------------------------------------
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Read
-  ! CHECK: OmpClause -> Acquire
-  !$omp atomic read
-  i = j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> AcqRel
-  ! CHECK: OmpClause -> Relaxed
-  ! CHECK: OmpClause -> Read
-  !$omp atomic relaxed read
-  i = j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Read
-  ! CHECK-NOT: OmpClause -> AcqRel
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic read relaxed
-  i = j
-  
-  ! ----------------------------------------------------------------------------
-  ! WRITE
-  ! ----------------------------------------------------------------------------
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Write
-  ! CHECK: OmpClause -> Release
-  !$omp atomic write
-  i = j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> AcqRel
-  ! CHECK: OmpClause -> Relaxed
-  ! CHECK: OmpClause -> Write
-  !$omp atomic relaxed write
-  i = j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Write
-  ! CHECK-NOT: OmpClause -> AcqRel
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic write relaxed
-  i = j
-
-  ! ----------------------------------------------------------------------------
-  ! UPDATE
-  ! ----------------------------------------------------------------------------
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Update
-  ! CHECK: OmpClause -> Release
-  !$omp atomic update
-  i = i + j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> AcqRel
-  ! CHECK: OmpClause -> Relaxed
-  ! CHECK: OmpClause -> Update
-  !$omp atomic relaxed update
-  i = i + j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Update
-  ! CHECK-NOT: OmpClause -> AcqRel
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic update relaxed
-  i = i + j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Release
-  !$omp atomic
-  i = i + j
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> AcqRel
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic relaxed
-  i = i + j
-
-  ! ----------------------------------------------------------------------------
-  ! CAPTURE
-  ! ----------------------------------------------------------------------------
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> Capture
-  ! CHECK: OmpClause -> AcqRel
-  !$omp atomic capture
-  i = j
-  j = j + 1
-  !$omp end atomic
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> AcqRel
-  ! CHECK: OmpClause -> Relaxed
-  ! CHECK: OmpClause -> Capture
-  !$omp atomic relaxed capture
-  i = j
-  j = j + 1
-  !$omp end atomic
-
-  ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK-NOT: OmpClause -> AcqRel
-  ! CHECK: OmpClause -> Capture
-  ! CHECK: OmpClause -> Relaxed
-  !$omp atomic capture relaxed
-  i = j
-  j = j + 1
-  !$omp end atomic
-end program requires

From e478a22d540d336632fb3c110c5377447cd7f3b2 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Thu, 19 Jun 2025 13:24:39 +0100
Subject: [PATCH 0927/1322] [LLVM][IRBuilder] Use NUW arithmetic for
 Create{ElementCount,TypeSize}. (#143532)

This put the onus on the caller to ensure the result type is big enough.
In the unlikely event a cropped result is required then explicitly
truncate a safe value.
---
 clang/test/CodeGen/builtin_vectorelements.c   |   8 +-
 llvm/include/llvm/IR/IRBuilder.h              |   6 +-
 llvm/lib/IR/IRBuilder.cpp                     |   2 +-
 .../Analysis/ValueTracking/phi-known-bits.ll  |   4 +-
 .../asan-masked-load-store.ll                 |  12 +-
 .../AddressSanitizer/asan-vp-load-store.ll    |  14 +-
 .../AddressSanitizer/vector-load-store.ll     |  44 +--
 .../Instrumentation/BoundsChecking/simple.ll  |  14 +-
 .../HWAddressSanitizer/vector-load-store.ll   |  20 +-
 .../MemorySanitizer/vector-load-store.ll      |  10 +-
 .../Instrumentation/MemorySanitizer/vscale.ll |  20 +-
 .../sve-intrinsic-opts-counting-elems.ll      |   8 +-
 .../test/Transforms/InstCombine/gep-vector.ll |   2 +-
 .../Transforms/InstCombine/gepofconstgepi8.ll |   2 +-
 .../Transforms/InstCombine/getelementptr.ll   |   4 +-
 llvm/test/Transforms/InstCombine/icmp-gep.ll  |   8 +-
 .../InstCombine/masked_intrinsics.ll          |   2 +-
 .../test/Transforms/InstCombine/opaque-ptr.ll |   4 +-
 .../InstCombine/scalable-vector-array.ll      |   4 +-
 .../InstCombine/scalable-vector-struct.ll     |  16 +-
 llvm/test/Transforms/InstCombine/sub-gep.ll   |   6 +-
 .../test/Transforms/InstCombine/vscale_gep.ll |   6 +-
 .../AArch64/clamped-trip-count.ll             |   8 +-
 .../AArch64/conditional-branches-cost.ll      |   6 +-
 .../AArch64/divs-with-scalable-vfs.ll         |  20 +-
 .../AArch64/eliminate-tail-predication.ll     |   6 +-
 .../gather-do-not-vectorize-addressing.ll     |   6 +-
 .../AArch64/induction-costs-sve.ll            |  20 +-
 .../AArch64/low_trip_count_predicates.ll      |  32 +-
 .../LoopVectorize/AArch64/masked-call.ll      | 100 +++---
 .../LoopVectorize/AArch64/optsize_minsize.ll  |  18 +-
 .../AArch64/outer_loop_prefer_scalable.ll     |   6 +-
 .../AArch64/partial-reduce-chained.ll         |  72 ++---
 .../partial-reduce-dot-product-epilogue.ll    |   8 +-
 .../partial-reduce-dot-product-mixed.ll       |  40 +--
 .../AArch64/partial-reduce-dot-product.ll     | 304 +++++++++---------
 .../AArch64/partial-reduce-sub.ll             |  22 +-
 .../AArch64/pr60831-sve-inv-store-crash.ll    |   8 +-
 .../AArch64/reduction-recurrence-costs-sve.ll |  44 +--
 .../AArch64/scalable-avoid-scalarization.ll   |   8 +-
 .../AArch64/scalable-reduction-inloop-cond.ll |  12 +-
 .../AArch64/scalable-strict-fadd.ll           | 258 +++++++--------
 .../AArch64/simple_early_exit.ll              |  12 +-
 .../LoopVectorize/AArch64/store-costs-sve.ll  |  26 +-
 .../sve-epilog-vect-inloop-reductions.ll      |   8 +-
 ...sve-epilog-vect-no-remaining-iterations.ll |   8 +-
 .../AArch64/sve-epilog-vect-reductions.ll     |   8 +-
 .../sve-epilog-vect-strict-reductions.ll      |   8 +-
 .../LoopVectorize/AArch64/sve-epilog-vect.ll  |  64 ++--
 .../AArch64/sve-extract-last-veclane.ll       |   4 +-
 .../LoopVectorize/AArch64/sve-fneg.ll         |  12 +-
 .../AArch64/sve-inductions-unusual-types.ll   |  12 +-
 .../LoopVectorize/AArch64/sve-inductions.ll   |   6 +-
 .../AArch64/sve-interleaved-accesses.ll       |  14 +-
 .../sve-interleaved-masked-accesses.ll        |  60 ++--
 .../LoopVectorize/AArch64/sve-inv-store.ll    |  14 +-
 .../AArch64/sve-live-out-pointer-induction.ll |   8 +-
 .../AArch64/sve-low-trip-count.ll             |   4 +-
 .../LoopVectorize/AArch64/sve-multiexit.ll    |  24 +-
 .../sve-runtime-check-size-based-threshold.ll |  16 +-
 .../AArch64/sve-tail-folding-forced.ll        |   6 +-
 .../AArch64/sve-tail-folding-optsize.ll       |   4 +-
 .../AArch64/sve-tail-folding-reductions.ll    |  36 +--
 .../AArch64/sve-tail-folding-unroll.ll        |  54 ++--
 .../LoopVectorize/AArch64/sve-tail-folding.ll |  60 ++--
 .../AArch64/sve-vector-reverse.ll             |  12 +-
 .../AArch64/sve-vscale-based-trip-counts.ll   |  60 ++--
 .../LoopVectorize/AArch64/sve-widen-gep.ll    |  16 +-
 .../AArch64/sve2-histcnt-epilogue.ll          |  14 +-
 .../AArch64/sve2-histcnt-too-many-deps.ll     |   6 +-
 .../AArch64/tail-folding-styles.ll            |  30 +-
 .../AArch64/uniform-args-call-variants.ll     |  24 +-
 .../AArch64/wider-VF-for-callinst.ll          |   4 +-
 .../Transforms/LoopVectorize/RISCV/bf16.ll    |  12 +-
 .../RISCV/blend-any-of-reduction-cost.ll      |   6 +-
 .../RISCV/blocks-with-dead-instructions.ll    |  54 ++--
 .../LoopVectorize/RISCV/dead-ops-cost.ll      |  24 +-
 .../LoopVectorize/RISCV/defaults.ll           |  12 +-
 .../Transforms/LoopVectorize/RISCV/divrem.ll  |  54 ++--
 .../Transforms/LoopVectorize/RISCV/f16.ll     |   6 +-
 .../LoopVectorize/RISCV/fminimumnum.ll        |  96 +++---
 .../LoopVectorize/RISCV/induction-costs.ll    |   6 +-
 .../LoopVectorize/RISCV/inloop-reduction.ll   |  44 +--
 .../RISCV/interleaved-accesses.ll             |  84 ++---
 .../RISCV/interleaved-masked-access.ll        |  22 +-
 .../Transforms/LoopVectorize/RISCV/lmul.ll    |  18 +-
 .../LoopVectorize/RISCV/low-trip-count.ll     |   4 +-
 .../LoopVectorize/RISCV/mask-index-type.ll    |   6 +-
 .../RISCV/masked_gather_scatter.ll            |  12 +-
 .../RISCV/partial-reduce-dot-product.ll       |  48 +--
 ...ruction-or-drop-poison-generating-flags.ll |   6 +-
 .../LoopVectorize/RISCV/remark-reductions.ll  |   6 +-
 .../RISCV/riscv-vector-reverse-output.ll      |  36 +--
 .../RISCV/riscv-vector-reverse.ll             |  28 +-
 .../LoopVectorize/RISCV/safe-dep-distance.ll  |  18 +-
 .../LoopVectorize/RISCV/scalable-basics.ll    |  72 ++---
 .../LoopVectorize/RISCV/scalable-tailfold.ll  |  24 +-
 .../RISCV/select-cmp-reduction.ll             |  24 +-
 .../LoopVectorize/RISCV/strided-accesses.ll   |  62 ++--
 .../truncate-to-minimal-bitwidth-cost.ll      |  22 +-
 .../truncate-to-minimal-bitwidth-evl-crash.ll |   4 +-
 .../RISCV/type-info-cache-evl-crash.ll        |   6 +-
 .../LoopVectorize/RISCV/uniform-load-store.ll |  78 ++---
 ...-force-tail-with-evl-bin-unary-ops-args.ll | 108 +++----
 ...ize-force-tail-with-evl-call-intrinsics.ll |  72 ++---
 ...ize-force-tail-with-evl-cast-intrinsics.ll |  70 ++--
 ...rize-force-tail-with-evl-cond-reduction.ll |  72 ++---
 .../vectorize-force-tail-with-evl-div.ll      |  16 +-
 ...ce-tail-with-evl-fixed-order-recurrence.ll |  84 ++---
 ...ze-force-tail-with-evl-inloop-reduction.ll | 168 +++++-----
 ...ectorize-force-tail-with-evl-interleave.ll |  16 +-
 ...-force-tail-with-evl-intermediate-store.ll |  24 +-
 .../vectorize-force-tail-with-evl-iv32.ll     |  12 +-
 ...e-force-tail-with-evl-known-no-overflow.ll |  16 +-
 ...ze-force-tail-with-evl-masked-loadstore.ll |   6 +-
 ...e-force-tail-with-evl-ordered-reduction.ll |   6 +-
 ...vectorize-force-tail-with-evl-reduction.ll | 168 +++++-----
 ...-force-tail-with-evl-reverse-load-store.ll |  12 +-
 ...e-force-tail-with-evl-safe-dep-distance.ll |   8 +-
 ...orize-force-tail-with-evl-uniform-store.ll |   6 +-
 .../RISCV/vectorize-vp-intrinsics.ll          |  12 +-
 .../RISCV/vplan-vp-select-intrinsics.ll       |   4 +-
 .../LoopVectorize/outer_loop_scalable.ll      |   6 +-
 .../scalable-first-order-recurrence.ll        | 134 ++++----
 .../LoopVectorize/scalable-inductions.ll      |  22 +-
 .../LoopVectorize/scalable-iv-outside-user.ll |  10 +-
 .../LoopVectorize/scalable-lifetime.ll        |  12 +-
 ...able-loop-unpredicated-body-scalar-tail.ll |  12 +-
 .../scalable-reduction-inloop.ll              |   8 +-
 .../scalable-trunc-min-bitwidth.ll            |  12 +-
 .../vectorize-force-tail-with-evl.ll          |   6 +-
 .../Transforms/MemCpyOpt/vscale-crashes.ll    |   4 +-
 .../AArch64/sve-interleave-vectorization.ll   |   6 +-
 .../AArch64/expand-exp.ll                     |   2 +-
 134 files changed, 1929 insertions(+), 1927 deletions(-)

diff --git a/clang/test/CodeGen/builtin_vectorelements.c b/clang/test/CodeGen/builtin_vectorelements.c
index b0ff6f83b1e4..45f7a3c34562 100644
--- a/clang/test/CodeGen/builtin_vectorelements.c
+++ b/clang/test/CodeGen/builtin_vectorelements.c
@@ -85,7 +85,7 @@ int test_builtin_vectorelements_neon64x1() {
 long test_builtin_vectorelements_sve32() {
   // SVE: i64 @test_builtin_vectorelements_sve32(
   // SVE: [[VSCALE:%.+]] = call i64 @llvm.vscale.i64()
-  // SVE: [[RES:%.+]] = mul i64 [[VSCALE]], 4
+  // SVE: [[RES:%.+]] = mul nuw i64 [[VSCALE]], 4
   // SVE: ret i64 [[RES]]
   return __builtin_vectorelements(svuint32_t);
 }
@@ -93,7 +93,7 @@ long test_builtin_vectorelements_sve32() {
 long test_builtin_vectorelements_sve8() {
   // SVE: i64 @test_builtin_vectorelements_sve8(
   // SVE: [[VSCALE:%.+]] = call i64 @llvm.vscale.i64()
-  // SVE: [[RES:%.+]] = mul i64 [[VSCALE]], 16
+  // SVE: [[RES:%.+]] = mul nuw i64 [[VSCALE]], 16
   // SVE: ret i64 [[RES]]
   return __builtin_vectorelements(svuint8_t);
 }
@@ -105,7 +105,7 @@ long test_builtin_vectorelements_sve8() {
 long test_builtin_vectorelements_riscv8() {
   // RISCV: i64 @test_builtin_vectorelements_riscv8(
   // RISCV: [[VSCALE:%.+]] = call i64 @llvm.vscale.i64()
-  // RISCV: [[RES:%.+]] = mul i64 [[VSCALE]], 8
+  // RISCV: [[RES:%.+]] = mul nuw i64 [[VSCALE]], 8
   // RISCV: ret i64 [[RES]]
   return __builtin_vectorelements(vuint8m1_t);
 }
@@ -120,7 +120,7 @@ long test_builtin_vectorelements_riscv64() {
 long test_builtin_vectorelements_riscv32m2() {
   // RISCV: i64 @test_builtin_vectorelements_riscv32m2(
   // RISCV: [[VSCALE:%.+]] = call i64 @llvm.vscale.i64()
-  // RISCV: [[RES:%.+]] = mul i64 [[VSCALE]], 4
+  // RISCV: [[RES:%.+]] = mul nuw i64 [[VSCALE]], 4
   // RISCV: ret i64 [[RES]]
   return __builtin_vectorelements(vuint32m2_t);
 }
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 59295089d6e9..8fe610835dca 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -965,11 +965,13 @@ public:
   }
 
   /// Create an expression which evaluates to the number of elements in \p EC
-  /// at runtime.
+  /// at runtime. This can result in poison if type \p Ty is not big enough to
+  /// hold the value.
   LLVM_ABI Value *CreateElementCount(Type *Ty, ElementCount EC);
 
   /// Create an expression which evaluates to the number of units in \p Size
-  /// at runtime.  This works for both units of bits and bytes.
+  /// at runtime. This works for both units of bits and bytes. This can result
+  /// in poison if type \p Ty is not big enough to hold the value.
   LLVM_ABI Value *CreateTypeSize(Type *Ty, TypeSize Size);
 
   /// Creates a vector of type \p DstType with the linear sequence <0, 1, ...>
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 0a8b26b5f3d8..beefd5e7d7ee 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -117,7 +117,7 @@ static Value *CreateVScaleMultiple(IRBuilderBase &B, Type *Ty, uint64_t Scale) {
   if (Scale == 1)
     return VScale;
 
-  return B.CreateMul(VScale, ConstantInt::get(Ty, Scale));
+  return B.CreateNUWMul(VScale, ConstantInt::get(Ty, Scale));
 }
 
 Value *IRBuilderBase::CreateElementCount(Type *Ty, ElementCount EC) {
diff --git a/llvm/test/Analysis/ValueTracking/phi-known-bits.ll b/llvm/test/Analysis/ValueTracking/phi-known-bits.ll
index 436aadbc25de..b8a5be583499 100644
--- a/llvm/test/Analysis/ValueTracking/phi-known-bits.ll
+++ b/llvm/test/Analysis/ValueTracking/phi-known-bits.ll
@@ -936,7 +936,7 @@ define i1 @recursiveGEP_withPtrSub_scalableGEP(ptr %val1) {
 ; CHECK:       while.cond.i:
 ; CHECK-NEXT:    [[A_PN_I:%.*]] = phi ptr [ [[TEST_0_I:%.*]], [[WHILE_COND_I]] ], [ [[VAL1:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TEST_0_I]] = getelementptr i8, ptr [[A_PN_I]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TEST_0_I]], align 1
 ; CHECK-NEXT:    [[CMP3_NOT_I:%.*]] = icmp eq i8 [[TMP2]], 0
@@ -970,7 +970,7 @@ define i1 @recursiveGEP_withPtrSub_scalableGEP_inbounds(ptr %val1) {
 ; CHECK:       while.cond.i:
 ; CHECK-NEXT:    [[A_PN_I:%.*]] = phi ptr [ [[TEST_0_I:%.*]], [[WHILE_COND_I]] ], [ [[VAL1:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TEST_0_I]] = getelementptr inbounds i8, ptr [[A_PN_I]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TEST_0_I]], align 1
 ; CHECK-NEXT:    [[CMP3_NOT_I:%.*]] = icmp eq i8 [[TMP2]], 0
diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
index a8da5a3740e5..afa3d0966b55 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
@@ -315,7 +315,7 @@ declare void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>, ptr, i32, <vsca
 define <vscale x 4 x float> @scalable.load.nxv4f32(ptr %p, <vscale x 4 x i1> %mask) sanitize_address {
 ; CHECK-LABEL: @scalable.load.nxv4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[IV_NEXT:%.*]], [[TMP7:%.*]] ]
@@ -345,7 +345,7 @@ define <vscale x 4 x float> @scalable.load.nxv4f32(ptr %p, <vscale x 4 x i1> %ma
 define void @scalable.store.nxv4f32(ptr %p, <vscale x 4 x float> %arg, <vscale x 4 x i1> %mask) sanitize_address {
 ; CHECK-LABEL: @scalable.store.nxv4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[IV_NEXT:%.*]], [[TMP7:%.*]] ]
@@ -379,7 +379,7 @@ declare void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float>, <vscale x
 define <vscale x 4 x float> @scalable.gather.nxv4f32(<vscale x 4 x ptr> %vp, <vscale x 4 x i1> %mask, i32 %evl) sanitize_address {
 ; CHECK-LABEL: @scalable.gather.nxv4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[IV_NEXT:%.*]], [[TMP7:%.*]] ]
@@ -409,7 +409,7 @@ define <vscale x 4 x float> @scalable.gather.nxv4f32(<vscale x 4 x ptr> %vp, <vs
 define void @scalable.scatter.nxv4f32(<vscale x 4 x float> %val, <vscale x 4 x ptr> %vp, <vscale x 4 x i1> %mask, i32 %evl) sanitize_address {
 ; CHECK-LABEL: @scalable.scatter.nxv4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[IV_NEXT:%.*]], [[TMP7:%.*]] ]
@@ -447,7 +447,7 @@ define <vscale x 4 x float> @scalable.expandload.nxv4f32(ptr align 4 %p, <vscale
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP13:%.*]]
 ; CHECK:       4:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 [[TMP6]])
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
@@ -485,7 +485,7 @@ define void @scalable.compressstore.nxv4f32(ptr align 4 %p, <vscale x 4 x float>
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP13:%.*]]
 ; CHECK:       4:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 [[TMP6]])
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll
index d22671aa84f8..f53aa44c8c90 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll
@@ -255,7 +255,7 @@ define <vscale x 4 x float> @scalable.load.nxv4f32(ptr align 4 %p, <vscale x 4 x
 ; CHECK:       2:
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[EVL]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP5]])
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
@@ -292,7 +292,7 @@ define void @scalable.store.nxv4f32(ptr align 4 %p, <vscale x 4 x float> %arg, <
 ; CHECK:       2:
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[EVL]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP5]])
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
@@ -333,7 +333,7 @@ define <vscale x 4 x float> @scalable.strided.load.nxv4f32(ptr align 4 %p, i32 %
 ; CHECK:       2:
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[EVL]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -372,7 +372,7 @@ define void @scalable.strided.store.nxv4f32(<vscale x 4 x float> %arg, ptr align
 ; CHECK:       2:
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[EVL]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -412,7 +412,7 @@ define <vscale x 4 x float> @scalable.strided.load.nxv4f32.align(ptr align 4 %p,
 ; CHECK:       2:
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[EVL]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP5]])
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
@@ -454,7 +454,7 @@ define <vscale x 4 x float> @scalable.gather.nxv4f32(<vscale x 4 x ptr> %vp, <vs
 ; CHECK:       2:
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[EVL]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP5]])
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
@@ -491,7 +491,7 @@ define void @scalable.scatter.nxv4f32(<vscale x 4 x float> %arg, <vscale x 4 x p
 ; CHECK:       2:
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[EVL]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP5]])
 ; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
 ; CHECK:       .split:
diff --git a/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll
index 120a4b235f85..373cb8a536a2 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll
@@ -402,7 +402,7 @@ define void @store.v2i32.align8(ptr %p) sanitize_address {
 define void @load.nxv1i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @load.nxv1i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 32
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -445,7 +445,7 @@ define void @load.nxv1i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @load.nxv1i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 32
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 32
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_loadN(i64 [[TMP4]], i64 [[TMP3]])
@@ -459,7 +459,7 @@ define void @load.nxv1i32(ptr %p) sanitize_address {
 define void @load.nxv2i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @load.nxv2i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 64
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 64
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -502,7 +502,7 @@ define void @load.nxv2i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @load.nxv2i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 64
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 64
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_loadN(i64 [[TMP4]], i64 [[TMP3]])
@@ -516,7 +516,7 @@ define void @load.nxv2i32(ptr %p) sanitize_address {
 define void @load.nxv4i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @load.nxv4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 128
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 128
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -559,7 +559,7 @@ define void @load.nxv4i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @load.nxv4i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 128
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 128
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_loadN(i64 [[TMP4]], i64 [[TMP3]])
@@ -573,7 +573,7 @@ define void @load.nxv4i32(ptr %p) sanitize_address {
 define void @load.nxv8i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @load.nxv8i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 256
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 256
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -616,7 +616,7 @@ define void @load.nxv8i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @load.nxv8i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 256
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 256
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_loadN(i64 [[TMP4]], i64 [[TMP3]])
@@ -630,7 +630,7 @@ define void @load.nxv8i32(ptr %p) sanitize_address {
 define void @load.nxv16i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @load.nxv16i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 512
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 512
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -673,7 +673,7 @@ define void @load.nxv16i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @load.nxv16i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 512
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 512
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_loadN(i64 [[TMP4]], i64 [[TMP3]])
@@ -688,7 +688,7 @@ define void @load.nxv16i32(ptr %p) sanitize_address {
 define void @store.nxv1i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @store.nxv1i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 32
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -731,7 +731,7 @@ define void @store.nxv1i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @store.nxv1i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 32
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 32
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_storeN(i64 [[TMP4]], i64 [[TMP3]])
@@ -745,7 +745,7 @@ define void @store.nxv1i32(ptr %p) sanitize_address {
 define void @store.nxv2i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @store.nxv2i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 64
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 64
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -788,7 +788,7 @@ define void @store.nxv2i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @store.nxv2i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 64
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 64
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_storeN(i64 [[TMP4]], i64 [[TMP3]])
@@ -802,7 +802,7 @@ define void @store.nxv2i32(ptr %p) sanitize_address {
 define void @store.nxv4i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @store.nxv4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 128
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 128
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -845,7 +845,7 @@ define void @store.nxv4i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @store.nxv4i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 128
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 128
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_storeN(i64 [[TMP4]], i64 [[TMP3]])
@@ -859,7 +859,7 @@ define void @store.nxv4i32(ptr %p) sanitize_address {
 define void @store.nxv8i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @store.nxv8i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 256
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 256
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -902,7 +902,7 @@ define void @store.nxv8i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @store.nxv8i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 256
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 256
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_storeN(i64 [[TMP4]], i64 [[TMP3]])
@@ -916,7 +916,7 @@ define void @store.nxv8i32(ptr %p) sanitize_address {
 define void @store.nxv16i32(ptr %p) sanitize_address {
 ; CHECK-LABEL: @store.nxv16i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 512
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 512
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -959,7 +959,7 @@ define void @store.nxv16i32(ptr %p) sanitize_address {
 ;
 ; CALLS-LABEL: @store.nxv16i32(
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 512
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 512
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CALLS-NEXT:    call void @__asan_storeN(i64 [[TMP4]], i64 [[TMP3]])
@@ -977,7 +977,7 @@ define <vscale x 2 x i32> @local_alloca() sanitize_address {
 ; CHECK-NEXT:    [[A:%.*]] = alloca <vscale x 2 x i32>, align 8
 ; CHECK-NEXT:    call void @clobber(ptr [[A]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 64
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 64
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP3]], 1
@@ -1022,7 +1022,7 @@ define <vscale x 2 x i32> @local_alloca() sanitize_address {
 ; CALLS-NEXT:    [[A:%.*]] = alloca <vscale x 2 x i32>, align 8
 ; CALLS-NEXT:    call void @clobber(ptr [[A]])
 ; CALLS-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CALLS-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 64
+; CALLS-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 64
 ; CALLS-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
 ; CALLS-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64
 ; CALLS-NEXT:    call void @__asan_loadN(i64 [[TMP4]], i64 [[TMP3]])
diff --git a/llvm/test/Instrumentation/BoundsChecking/simple.ll b/llvm/test/Instrumentation/BoundsChecking/simple.ll
index 5d8f76753e0c..ab2f9f66c82f 100644
--- a/llvm/test/Instrumentation/BoundsChecking/simple.ll
+++ b/llvm/test/Instrumentation/BoundsChecking/simple.ll
@@ -470,7 +470,7 @@ define <vscale x 1 x i32> @load_scalable_vector(i64 %y) nounwind {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 0, [[DOTIDX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i64 [[Y]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64(), !nosanitize [[META0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4, !nosanitize [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4, !nosanitize [[META0]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 256, [[TMP2]], !nosanitize [[META0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult i64 256, [[TMP2]], !nosanitize [[META0]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]], !nosanitize [[META0]]
@@ -492,16 +492,16 @@ define <vscale x 1 x i32> @load_scalable_vector(i64 %y) nounwind {
 define void @scalable_alloca(i64 %y) nounwind {
 ; CHECK-LABEL: @scalable_alloca(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 5
 ; CHECK-NEXT:    [[TMP4:%.*]] = alloca <vscale x 4 x i16>, i32 5, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[Y:%.*]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 0, [[DOTIDX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <vscale x 4 x i16>, ptr [[TMP4]], i64 [[Y]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64(), !nosanitize [[META0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8, !nosanitize [[META0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8, !nosanitize [[META0]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP3]], [[TMP7]], !nosanitize [[META0]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ult i64 [[TMP3]], [[TMP7]], !nosanitize [[META0]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], [[TMP10]], !nosanitize [[META0]]
@@ -525,16 +525,16 @@ define void @scalable_alloca(i64 %y) nounwind {
 define void @scalable_alloca2(i64 %y) nounwind {
 ; CHECK-LABEL: @scalable_alloca2(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 32
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = alloca <vscale x 4 x i64>, align 32
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 32
 ; CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[Y:%.*]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 0, [[DOTIDX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <vscale x 4 x i64>, ptr [[TMP4]], i64 [[Y]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64(), !nosanitize [[META0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 32, !nosanitize [[META0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 32, !nosanitize [[META0]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP3]], [[TMP7]], !nosanitize [[META0]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ult i64 [[TMP3]], [[TMP7]], !nosanitize [[META0]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[TMP11]], [[TMP10]], !nosanitize [[META0]]
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/vector-load-store.ll b/llvm/test/Instrumentation/HWAddressSanitizer/vector-load-store.ll
index 5312c7cc7336..e3794e37f8b2 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/vector-load-store.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/vector-load-store.ll
@@ -125,7 +125,7 @@ define void @load.nxv1i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 1 x i32>, ptr [[P]], align 4
@@ -140,7 +140,7 @@ define void @load.nxv2i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 64
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 2 x i32>, ptr [[P]], align 8
@@ -155,7 +155,7 @@ define void @load.nxv4i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 128
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16
@@ -170,7 +170,7 @@ define void @load.nxv8i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 256
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 256
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 8 x i32>, ptr [[P]], align 32
@@ -185,7 +185,7 @@ define void @load.nxv16i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 512
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 512
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_loadN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 16 x i32>, ptr [[P]], align 64
@@ -201,7 +201,7 @@ define void @store.nxv1i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    store <vscale x 1 x i32> zeroinitializer, ptr [[P]], align 4
@@ -216,7 +216,7 @@ define void @store.nxv2i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 64
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    store <vscale x 2 x i32> zeroinitializer, ptr [[P]], align 8
@@ -231,7 +231,7 @@ define void @store.nxv4i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 128
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, ptr [[P]], align 16
@@ -246,7 +246,7 @@ define void @store.nxv8i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 256
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 256
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    store <vscale x 8 x i32> zeroinitializer, ptr [[P]], align 32
@@ -261,7 +261,7 @@ define void @store.nxv16i32(ptr %p) sanitize_hwaddress {
 ; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 512
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 512
 ; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP3]], 8
 ; CHECK-NEXT:    call void @__hwasan_storeN(i64 [[TMP1]], i64 [[TMP4]])
 ; CHECK-NEXT:    store <vscale x 16 x i32> zeroinitializer, ptr [[P]], align 64
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll b/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll
index 0149b4f05559..d01974016f6c 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector-load-store.ll
@@ -672,7 +672,7 @@ define void @store.nxv1i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]]
 ; ORIGINS:       7:
 ; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
 ; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -732,7 +732,7 @@ define void @store.nxv2i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; ORIGINS:       7:
 ; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
 ; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -792,7 +792,7 @@ define void @store.nxv4i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; ORIGINS:       7:
 ; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
 ; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -852,7 +852,7 @@ define void @store.nxv8i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; ORIGINS:       7:
 ; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 32
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 32
 ; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
 ; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -912,7 +912,7 @@ define void @store.nxv16i32(ptr %p) sanitize_memory {
 ; ORIGINS-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
 ; ORIGINS:       7:
 ; ORIGINS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGINS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 64
+; ORIGINS-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 64
 ; ORIGINS-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 3
 ; ORIGINS-NEXT:    [[TMP11:%.*]] = udiv i64 [[TMP10]], 4
 ; ORIGINS-NEXT:    br label [[DOTSPLIT:%.*]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vscale.ll b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
index 61114af7ce90..0c0b393667bf 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
@@ -44,7 +44,7 @@ define void @test_load_store_i32(ptr %a, ptr %b) sanitize_memory {
 ; ORIGIN:       14:
 ; ORIGIN-NEXT:    [[TMP15:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP7]])
 ; ORIGIN-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGIN-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 16
+; ORIGIN-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 16
 ; ORIGIN-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], 3
 ; ORIGIN-NEXT:    [[TMP19:%.*]] = udiv i64 [[TMP18]], 4
 ; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -125,7 +125,7 @@ define void @test_load_store_add_int(ptr %a, ptr %b) sanitize_memory {
 ; ORIGIN:       25:
 ; ORIGIN-NEXT:    [[TMP26:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP14]])
 ; ORIGIN-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGIN-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 64
+; ORIGIN-NEXT:    [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 64
 ; ORIGIN-NEXT:    [[TMP29:%.*]] = add i64 [[TMP28]], 3
 ; ORIGIN-NEXT:    [[TMP30:%.*]] = udiv i64 [[TMP29]], 4
 ; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -188,7 +188,7 @@ define void @test_load_store_float(ptr %a, ptr %b) sanitize_memory {
 ; ORIGIN:       14:
 ; ORIGIN-NEXT:    [[TMP15:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP7]])
 ; ORIGIN-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGIN-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 16
+; ORIGIN-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 16
 ; ORIGIN-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], 3
 ; ORIGIN-NEXT:    [[TMP19:%.*]] = udiv i64 [[TMP18]], 4
 ; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -269,7 +269,7 @@ define void @test_load_store_add_float(ptr %a, ptr %b) sanitize_memory {
 ; ORIGIN:       25:
 ; ORIGIN-NEXT:    [[TMP26:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP14]])
 ; ORIGIN-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGIN-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; ORIGIN-NEXT:    [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 8
 ; ORIGIN-NEXT:    [[TMP29:%.*]] = add i64 [[TMP28]], 3
 ; ORIGIN-NEXT:    [[TMP30:%.*]] = udiv i64 [[TMP29]], 4
 ; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -363,7 +363,7 @@ define void @test_ret(ptr %a, ptr %b) sanitize_memory {
 ; ORIGIN:       11:
 ; ORIGIN-NEXT:    [[TMP12:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP4]])
 ; ORIGIN-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGIN-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; ORIGIN-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 8
 ; ORIGIN-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], 3
 ; ORIGIN-NEXT:    [[TMP16:%.*]] = udiv i64 [[TMP15]], 4
 ; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -411,7 +411,7 @@ define void @fn_param(<vscale x 2 x float> %a, ptr %b) sanitize_memory {
 ; ORIGIN:       7:
 ; ORIGIN-NEXT:    [[TMP8:%.*]] = call i32 @__msan_chain_origin(i32 0)
 ; ORIGIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ORIGIN-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; ORIGIN-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 3
 ; ORIGIN-NEXT:    [[TMP12:%.*]] = udiv i64 [[TMP11]], 4
 ; ORIGIN-NEXT:    br label [[DOTSPLIT:%.*]]
@@ -490,7 +490,7 @@ define void @test_alloca1() sanitize_memory {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[X:%.*]] = alloca <vscale x 64 x i1>, align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
 ; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
@@ -503,7 +503,7 @@ define void @test_alloca1() sanitize_memory {
 ; ORIGIN-NEXT:    call void @llvm.donothing()
 ; ORIGIN-NEXT:    [[X:%.*]] = alloca <vscale x 64 x i1>, align 4
 ; ORIGIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; ORIGIN-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; ORIGIN-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[X]] to i64
 ; ORIGIN-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
 ; ORIGIN-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
@@ -526,7 +526,7 @@ define void @test_alloca2() sanitize_memory {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[X:%.*]] = alloca <vscale x 64 x double>, align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 512
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 512
 ; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[X]] to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
 ; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
@@ -539,7 +539,7 @@ define void @test_alloca2() sanitize_memory {
 ; ORIGIN-NEXT:    call void @llvm.donothing()
 ; ORIGIN-NEXT:    [[X:%.*]] = alloca <vscale x 64 x double>, align 4
 ; ORIGIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ORIGIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 512
+; ORIGIN-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 512
 ; ORIGIN-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[X]] to i64
 ; ORIGIN-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
 ; ORIGIN-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll
index 46ca99f4bb27..0722e4cfcddd 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll
@@ -52,7 +52,7 @@ define i64 @cntb_mul4() {
 define i64 @cntb_all() {
 ; CHECK-LABEL: @cntb_all(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[OUT:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[OUT:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    ret i64 [[OUT]]
 ;
   %out = call i64 @llvm.aarch64.sve.cntb(i32 31)
@@ -110,7 +110,7 @@ define i64 @cnth_mul4() {
 define i64 @cnth_all() {
 ; CHECK-LABEL: @cnth_all(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[OUT:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT:    [[OUT:%.*]] = shl nuw i64 [[TMP1]], 3
 ; CHECK-NEXT:    ret i64 [[OUT]]
 ;
   %out = call i64 @llvm.aarch64.sve.cnth(i32 31)
@@ -170,7 +170,7 @@ define i64 @cntw_mul4() {
 define i64 @cntw_all() {
 ; CHECK-LABEL: @cntw_all(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[OUT:%.*]] = shl i64 [[TMP1]], 2
+; CHECK-NEXT:    [[OUT:%.*]] = shl nuw i64 [[TMP1]], 2
 ; CHECK-NEXT:    ret i64 [[OUT]]
 ;
   %out = call i64 @llvm.aarch64.sve.cntw(i32 31)
@@ -232,7 +232,7 @@ define i64 @cntd_mul4() {
 define i64 @cntd_all() {
 ; CHECK-LABEL: @cntd_all(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[OUT:%.*]] = shl i64 [[TMP1]], 1
+; CHECK-NEXT:    [[OUT:%.*]] = shl nuw i64 [[TMP1]], 1
 ; CHECK-NEXT:    ret i64 [[OUT]]
 ;
   %out = call i64 @llvm.aarch64.sve.cntd(i32 31)
diff --git a/llvm/test/Transforms/InstCombine/gep-vector.ll b/llvm/test/Transforms/InstCombine/gep-vector.ll
index 27624f790c4c..5546cb36d2f5 100644
--- a/llvm/test/Transforms/InstCombine/gep-vector.ll
+++ b/llvm/test/Transforms/InstCombine/gep-vector.ll
@@ -128,7 +128,7 @@ define ptr addrspace(3) @inbounds_bitcast_vec_to_array_addrspace_matching_alloc_
 define ptr @test_accumulate_constant_offset_vscale_nonzero(<vscale x 16 x i1> %pg, ptr %base) {
 ; CHECK-LABEL: @test_accumulate_constant_offset_vscale_nonzero(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 4
 ; CHECK-NEXT:    ret ptr [[GEP]]
diff --git a/llvm/test/Transforms/InstCombine/gepofconstgepi8.ll b/llvm/test/Transforms/InstCombine/gepofconstgepi8.ll
index 4c8c56a9262e..a92e0c263d35 100644
--- a/llvm/test/Transforms/InstCombine/gepofconstgepi8.ll
+++ b/llvm/test/Transforms/InstCombine/gepofconstgepi8.ll
@@ -282,7 +282,7 @@ define ptr @test_scalable(ptr %base, i64 %a) {
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
 ; CHECK-NEXT:    [[INDEX:%.*]] = add i64 [[A]], 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[P2_IDX:%.*]] = mul i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P1]], i64 [[P2_IDX]]
 ; CHECK-NEXT:    ret ptr [[P2]]
diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll
index 61236df80bfa..e78d70058c14 100644
--- a/llvm/test/Transforms/InstCombine/getelementptr.ll
+++ b/llvm/test/Transforms/InstCombine/getelementptr.ll
@@ -266,7 +266,7 @@ define <2 x i1> @test13_fixed_scalable(i64 %X, ptr %P, <2 x i64> %y) nounwind {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i64> [[DOTSPLATINSERT]], <i64 3, i64 0>
 ; CHECK-NEXT:    [[A_IDX:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[B_IDX:%.*]] = mul nsw <2 x i64> [[Y:%.*]], [[DOTSPLAT]]
@@ -285,7 +285,7 @@ define <vscale x 2 x i1> @test13_scalable_scalable(i64 %X, ptr %P, <vscale x 2 x
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nsw <vscale x 2 x i64> [[DOTSPLATINSERT]], splat (i64 3)
 ; CHECK-NEXT:    [[A_IDX:%.*]] = shufflevector <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP2]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[B_IDX:%.*]] = mul nsw <vscale x 2 x i64> [[Y:%.*]], [[DOTSPLAT2]]
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index 7f8f1ae73948..260462896c39 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -623,10 +623,10 @@ define i1 @test_scalable_xc(ptr %x) {
 define i1 @test_scalable_xy(ptr %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test_scalable_xy(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[GEP1_IDX:%.*]] = mul nsw i64 [[I:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[GEP2_IDX:%.*]] = mul nsw i64 [[J:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[GEP2_IDX]], [[GEP1_IDX]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
@@ -642,10 +642,10 @@ define i1 @test_scalable_xy(ptr %foo, i64 %i, i64 %j) {
 define i1 @test_scalable_ij(ptr %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test_scalable_ij(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[GEP1_IDX:%.*]] = mul nsw i64 [[I:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[GEP2_IDX:%.*]] = mul nsw i64 [[J:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[GEP1_IDX]], [[GEP2_IDX]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
index 0f8b06e12421..d9f022442a02 100644
--- a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
@@ -326,7 +326,7 @@ define void @scatter_nxv4i16_no_uniform_vals_uniform_ptrs_all_active_mask(ptr %d
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[SRC:%.*]], align 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], -1
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP2]]
 ; CHECK-NEXT:    store i16 [[TMP3]], ptr [[DST:%.*]], align 2
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index be734243d14a..99d1fa032db1 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -294,7 +294,7 @@ define ptr @geps_combinable_different_elem_type_extra_use2(ptr %a, i64 %idx) {
 define ptr @geps_combinable_scalable(ptr %a, i64 %idx) {
 ; CHECK-LABEL: @geps_combinable_scalable(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[A3:%.*]] = getelementptr inbounds nuw i8, ptr [[A2]], i64 4
 ; CHECK-NEXT:    ret ptr [[A3]]
@@ -307,7 +307,7 @@ define ptr @geps_combinable_scalable(ptr %a, i64 %idx) {
 define ptr @geps_combinable_scalable_vector_array(ptr %a, i64 %idx) {
 ; CHECK-LABEL: @geps_combinable_scalable_vector_array(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 5
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 5
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[A3:%.*]] = getelementptr inbounds nuw i8, ptr [[A2]], i64 4
 ; CHECK-NEXT:    ret ptr [[A3]]
diff --git a/llvm/test/Transforms/InstCombine/scalable-vector-array.ll b/llvm/test/Transforms/InstCombine/scalable-vector-array.ll
index 20e9f2d99dd9..290de9633fc8 100644
--- a/llvm/test/Transforms/InstCombine/scalable-vector-array.ll
+++ b/llvm/test/Transforms/InstCombine/scalable-vector-array.ll
@@ -5,7 +5,7 @@ define <vscale x 4 x i32> @load(ptr %x) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @load
 ; CHECK-SAME: (ptr [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[A_ELT1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[A_UNPACK2:%.*]] = load <vscale x 4 x i32>, ptr [[A_ELT1]], align 16
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[A_UNPACK2]]
@@ -20,7 +20,7 @@ define void @store(ptr %x, <vscale x 4 x i32> %y, <vscale x 4 x i32> %z) {
 ; CHECK-SAME: (ptr [[X:%.*]], <vscale x 4 x i32> [[Y:%.*]], <vscale x 4 x i32> [[Z:%.*]]) {
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[Y]], ptr [[X]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[X_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[Z]], ptr [[X_REPACK1]], align 16
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll
index a8790b579d75..9a0a6ae6324e 100644
--- a/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll
+++ b/llvm/test/Transforms/InstCombine/scalable-vector-struct.ll
@@ -7,7 +7,7 @@ define <vscale x 1 x i32> @load(ptr %x) {
 ; CHECK-LABEL: define <vscale x 1 x i32> @load
 ; CHECK-SAME: (ptr [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[A_ELT1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[A_UNPACK2:%.*]] = load <vscale x 1 x i32>, ptr [[A_ELT1]], align 4
 ; CHECK-NEXT:    ret <vscale x 1 x i32> [[A_UNPACK2]]
@@ -22,7 +22,7 @@ define void @store(ptr %x, <vscale x 1 x i32> %y, <vscale x 1 x i32> %z) {
 ; CHECK-SAME: (ptr [[X:%.*]], <vscale x 1 x i32> [[Y:%.*]], <vscale x 1 x i32> [[Z:%.*]]) {
 ; CHECK-NEXT:    store <vscale x 1 x i32> [[Y]], ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[X_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
 ; CHECK-NEXT:    store <vscale x 1 x i32> [[Z]], ptr [[X_REPACK1]], align 4
 ; CHECK-NEXT:    ret void
@@ -40,7 +40,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @split_load(ptr %p) nounwind {
 ; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[R_ELT1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[R_UNPACK2:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT1]], align 16
 ; CHECK-NEXT:    [[R3:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[R_UNPACK2]], 1
@@ -71,7 +71,7 @@ define void @split_store({<vscale x 4 x i32>, <vscale x 4 x i32>} %x, ptr %p) no
 ; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
@@ -123,14 +123,14 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @check_nxv16i8_nxv4i32({<vscale
 ; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
 ; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[R_UNPACK4:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT3]], align 16
 ; CHECK-NEXT:    [[R5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], <vscale x 16 x i8> [[R_UNPACK4]], 1
@@ -150,14 +150,14 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @alloca_nxv16i8_nxv4i32({<vscale
 ; CHECK-NEXT:    [[X_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 0
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT]], ptr [[P]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[P_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[X_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[X]], 1
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[X_ELT2]], ptr [[P_REPACK1]], align 16
 ; CHECK-NEXT:    [[R_UNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[R_UNPACK]], 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[R_ELT3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[R_UNPACK4:%.*]] = load <vscale x 16 x i8>, ptr [[R_ELT3]], align 16
 ; CHECK-NEXT:    [[R5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], <vscale x 16 x i8> [[R_UNPACK4]], 1
diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll
index 375be8a3d69c..45a30350aafe 100644
--- a/llvm/test/Transforms/InstCombine/sub-gep.ll
+++ b/llvm/test/Transforms/InstCombine/sub-gep.ll
@@ -703,7 +703,7 @@ define i64 @sub_scalable(ptr noundef %val1) {
 ; CHECK-LABEL: @sub_scalable(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
 entry:
@@ -718,7 +718,7 @@ define i64 @sub_scalable2(ptr noundef %val1) {
 ; CHECK-LABEL: @sub_scalable2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[GEP2_IDX:%.*]] = shl i64 [[TMP2]], 5
 ; CHECK-NEXT:    [[GEPDIFF:%.*]] = sub i64 [[TMP1]], [[GEP2_IDX]]
@@ -750,7 +750,7 @@ define i64 @nullptrtoint_scalable_x(i64 %x) {
 ; CHECK-LABEL: @nullptrtoint_scalable_x(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[PTR_IDX:%.*]] = mul nsw i64 [[X:%.*]], [[TMP1]]
 ; CHECK-NEXT:    ret i64 [[PTR_IDX]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/vscale_gep.ll b/llvm/test/Transforms/InstCombine/vscale_gep.ll
index 371ee71e45f2..84019e613d23 100644
--- a/llvm/test/Transforms/InstCombine/vscale_gep.ll
+++ b/llvm/test/Transforms/InstCombine/vscale_gep.ll
@@ -15,7 +15,7 @@ define <vscale x 2 x ptr> @gep_index_type_is_scalable(ptr %p) {
 define ptr @gep_num_of_indices_1(ptr %p) {
 ; CHECK-LABEL: @gep_num_of_indices_1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    ret ptr [[GEP]]
 ;
@@ -28,7 +28,7 @@ define void @gep_bitcast(ptr %p) {
 ; CHECK-LABEL: @gep_bitcast(
 ; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[P:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[GEP2]], align 16
 ; CHECK-NEXT:    ret void
@@ -59,7 +59,7 @@ define i32 @gep_alloca_inbounds_vscale_nonzero() {
 ; CHECK-LABEL: @gep_alloca_inbounds_vscale_nonzero(
 ; CHECK-NEXT:    [[A:%.*]] = alloca <vscale x 4 x i32>, align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[TMP]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 24c703ae42f0..95f3eb7b21f4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -8,13 +8,13 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
@@ -92,13 +92,13 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 976f95ff4f0b..5f72fa4b4e8e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -706,15 +706,15 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
 ; PRED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; PRED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; PRED-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; PRED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; PRED-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2
 ; PRED-NEXT:    [[TMP8:%.*]] = sub i64 257, [[TMP7]]
 ; PRED-NEXT:    [[TMP9:%.*]] = icmp ugt i64 257, [[TMP7]]
 ; PRED-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index d59607711b5b..d42be20ea1e7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -8,7 +8,7 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[CONV61:%.*]] = zext i32 [[X]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
@@ -21,11 +21,11 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT:    br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP9]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP18]] to i32
 ; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP18]], [[CONV61]]
@@ -40,7 +40,7 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr double, ptr [[TMP34]], i32 0
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP38:%.*]] = mul i64 [[TMP37]], 2
+; CHECK-NEXT:    [[TMP38:%.*]] = mul nuw i64 [[TMP37]], 2
 ; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr double, ptr [[TMP34]], i64 [[TMP38]]
 ; CHECK-NEXT:    store <vscale x 2 x double> zeroinitializer, ptr [[TMP36]], align 8
 ; CHECK-NEXT:    store <vscale x 2 x double> zeroinitializer, ptr [[TMP39]], align 8
@@ -110,15 +110,15 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP7]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
@@ -233,15 +233,15 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = sub i64 [[TMP0]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
index f957279a7944..c824bee916b0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@@ -10,15 +10,15 @@ define void @f1(ptr %A) #0 {
 ; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
index a65d36b51443..847155559c17 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
@@ -72,16 +72,16 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
 ; SVE:       for.body.preheader:
 ; SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
 ; SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SVE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SVE:       vector.ph:
 ; SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; SVE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; SVE-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
 ; SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SVE:       vector.body:
 ; SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index a4dc329c061e..dab14280a6b7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -13,23 +13,23 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
 ; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; DEFAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 16
+; DEFAULT-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
 ; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; DEFAULT:       [[VECTOR_MEMCHECK]]:
 ; DEFAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; DEFAULT-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; DEFAULT-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; DEFAULT-NEXT:    [[TMP6:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; DEFAULT-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
 ; DEFAULT-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; DEFAULT:       [[VECTOR_PH]]:
 ; DEFAULT-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; DEFAULT-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
 ; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
 ; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; DEFAULT-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; DEFAULT-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[TMP13:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
@@ -39,7 +39,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
 ; DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
 ; DEFAULT-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 8
+; DEFAULT-NEXT:    [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 8
 ; DEFAULT-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP24]]
 ; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP22]], align 1
 ; DEFAULT-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP25]], align 1
@@ -58,7 +58,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; DEFAULT-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[TMP38]], i32 0
 ; DEFAULT-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 8
+; DEFAULT-NEXT:    [[TMP42:%.*]] = mul nuw i64 [[TMP41]], 8
 ; DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[TMP38]], i64 [[TMP42]]
 ; DEFAULT-NEXT:    store <vscale x 8 x i8> [[TMP36]], ptr [[TMP40]], align 1
 ; DEFAULT-NEXT:    store <vscale x 8 x i8> [[TMP37]], ptr [[TMP43]], align 1
@@ -98,23 +98,23 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; PRED:       [[VECTOR_MEMCHECK]]:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 16
+; PRED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
 ; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
 ; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; PRED-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; PRED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; PRED-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; PRED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; PRED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
 ; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]]
 ; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index 641564560fc2..6499a1f7e52e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -49,7 +49,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-VS1-NEXT:    [[TMP3:%.*]] = sub i64 20, [[TMP2]]
 ; CHECK-VS1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-VS1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-VS1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
 ; CHECK-VS1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK-VS1:       [[VECTOR_SCEVCHECK]]:
@@ -64,16 +64,16 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    br i1 [[TMP13]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
 ; CHECK-VS1:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
 ; CHECK-VS1-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS1-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 16
+; CHECK-VS1-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 16
 ; CHECK-VS1-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], [[TMP15]]
 ; CHECK-VS1-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK-VS1:       [[VECTOR_PH]]:
 ; CHECK-VS1-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS1-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 16
+; CHECK-VS1-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 16
 ; CHECK-VS1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP17]]
 ; CHECK-VS1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; CHECK-VS1-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS1-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 16
+; CHECK-VS1-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; CHECK-VS1-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -95,17 +95,17 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-VS1-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
 ; CHECK-VS1-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS1-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP26]], 8
+; CHECK-VS1-NEXT:    [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 8
 ; CHECK-VS1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]]
 ; CHECK-VS1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
 ; CHECK-VS1:       [[VEC_EPILOG_PH]]:
 ; CHECK-VS1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-VS1-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS1-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 8
+; CHECK-VS1-NEXT:    [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 8
 ; CHECK-VS1-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS1-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
 ; CHECK-VS1-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS1-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 8
+; CHECK-VS1-NEXT:    [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 8
 ; CHECK-VS1-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS1-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -155,7 +155,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-VS2-NEXT:    [[TMP3:%.*]] = sub i64 20, [[TMP2]]
 ; CHECK-VS2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS2-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-VS2-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-VS2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
 ; CHECK-VS2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK-VS2:       [[VECTOR_SCEVCHECK]]:
@@ -170,16 +170,16 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    br i1 [[TMP13]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
 ; CHECK-VS2:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
 ; CHECK-VS2-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 8
+; CHECK-VS2-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 8
 ; CHECK-VS2-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], [[TMP15]]
 ; CHECK-VS2-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK-VS2:       [[VECTOR_PH]]:
 ; CHECK-VS2-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS2-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 8
+; CHECK-VS2-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 8
 ; CHECK-VS2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP17]]
 ; CHECK-VS2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; CHECK-VS2-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS2-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-VS2-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-VS2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -201,17 +201,17 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-VS2-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
 ; CHECK-VS2-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS2-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP26]], 4
+; CHECK-VS2-NEXT:    [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 4
 ; CHECK-VS2-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]]
 ; CHECK-VS2-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
 ; CHECK-VS2:       [[VEC_EPILOG_PH]]:
 ; CHECK-VS2-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-VS2-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS2-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
+; CHECK-VS2-NEXT:    [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
 ; CHECK-VS2-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
 ; CHECK-VS2-NEXT:    [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
 ; CHECK-VS2-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VS2-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 4
+; CHECK-VS2-NEXT:    [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 4
 ; CHECK-VS2-NEXT:    [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-VS2-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
@@ -419,13 +419,13 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    br i1 [[TMP28]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]])
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[CONV]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 2c0fb797d1d1..6029095bbe7b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -15,11 +15,11 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
 ; TFNONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TFNONE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; TFNONE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; TFNONE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFNONE:       [[VECTOR_BODY]]:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -53,7 +53,7 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; TFCOMMON-NEXT:  [[ENTRY:.*]]:
 ; TFCOMMON-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TFCOMMON-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFCOMMON-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFCOMMON:       [[VECTOR_BODY]]:
@@ -76,9 +76,9 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; TFA_INTERLEAVE-NEXT:  [[ENTRY:.*]]:
 ; TFA_INTERLEAVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -88,7 +88,7 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], %[[VECTOR_BODY]] ]
 ; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP7]], i64 [[TMP9]]
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison)
@@ -96,13 +96,13 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP15]]
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP11]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP12]], ptr [[TMP16]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX_NEXT]], [[TMP18]]
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP19]], i64 1025)
@@ -138,11 +138,11 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
 ; TFNONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TFNONE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; TFNONE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; TFNONE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFNONE:       [[VECTOR_BODY]]:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -184,7 +184,7 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] {
 ; TFCOMMON-NEXT:  [[ENTRY:.*]]:
 ; TFCOMMON-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TFCOMMON-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFCOMMON-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFCOMMON:       [[VECTOR_BODY]]:
@@ -210,9 +210,9 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] {
 ; TFA_INTERLEAVE-NEXT:  [[ENTRY:.*]]:
 ; TFA_INTERLEAVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -222,7 +222,7 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], %[[VECTOR_BODY]] ]
 ; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP9]]
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison)
@@ -236,13 +236,13 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 [[TMP19]]
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP17]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP20]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX_NEXT]], [[TMP22]]
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT5]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP23]], i64 1025)
@@ -289,11 +289,11 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
 ; TFNONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TFNONE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; TFNONE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; TFNONE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFNONE:       [[VECTOR_BODY]]:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -340,7 +340,7 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] {
 ; TFCOMMON-NEXT:  [[ENTRY:.*]]:
 ; TFCOMMON-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TFCOMMON-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFCOMMON-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFCOMMON:       [[VECTOR_BODY]]:
@@ -369,9 +369,9 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] {
 ; TFA_INTERLEAVE-NEXT:  [[ENTRY:.*]]:
 ; TFA_INTERLEAVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -381,7 +381,7 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], %[[VECTOR_BODY]] ]
 ; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP9]]
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison)
@@ -401,13 +401,13 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP20]], <vscale x 2 x i64> [[TMP22]], <vscale x 2 x i64> [[TMP18]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP24]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP23]], i64 [[TMP25]]
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP23]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP26]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP29:%.*]] = add i64 [[INDEX_NEXT]], [[TMP28]]
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT5]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP29]], i64 1025)
@@ -457,11 +457,11 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
 ; TFNONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TFNONE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; TFNONE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; TFNONE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFNONE:       [[VECTOR_BODY]]:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -512,11 +512,11 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] {
 ; TFFALLBACK-NEXT:  [[ENTRY:.*]]:
 ; TFFALLBACK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TFFALLBACK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; TFFALLBACK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFFALLBACK-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; TFFALLBACK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; TFFALLBACK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; TFFALLBACK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFFALLBACK:       [[VECTOR_BODY]]:
 ; TFFALLBACK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -586,11 +586,11 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
 ; TFNONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TFNONE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; TFNONE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; TFNONE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFNONE:       [[VECTOR_BODY]]:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -624,7 +624,7 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFALWAYS-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] {
 ; TFALWAYS-NEXT:  [[ENTRY:.*]]:
 ; TFALWAYS-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFALWAYS-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TFALWAYS-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TFALWAYS-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFALWAYS-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFALWAYS:       [[VECTOR_BODY]]:
@@ -647,7 +647,7 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] {
 ; TFFALLBACK-NEXT:  [[ENTRY:.*]]:
 ; TFFALLBACK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TFFALLBACK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TFFALLBACK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFFALLBACK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFFALLBACK:       [[VECTOR_BODY]]:
@@ -670,9 +670,9 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] {
 ; TFA_INTERLEAVE-NEXT:  [[ENTRY:.*]]:
 ; TFA_INTERLEAVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -682,7 +682,7 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], %[[VECTOR_BODY]] ]
 ; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP7]], i64 [[TMP9]]
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison)
@@ -690,13 +690,13 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP15]]
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP11]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP12]], ptr [[TMP16]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX_NEXT]], [[TMP18]]
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP19]], i64 1025)
@@ -734,11 +734,11 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFNONE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
 ; TFNONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TFNONE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; TFNONE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; TFNONE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[M]], i64 0
 ; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -783,7 +783,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFALWAYS-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], double [[M:%.*]]) #[[ATTR0]] {
 ; TFALWAYS-NEXT:  [[ENTRY:.*]]:
 ; TFALWAYS-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFALWAYS-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TFALWAYS-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TFALWAYS-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFALWAYS-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[M]], i64 0
 ; TFALWAYS-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -813,7 +813,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFFALLBACK-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], double [[M:%.*]]) #[[ATTR0]] {
 ; TFFALLBACK-NEXT:  [[ENTRY:.*]]:
 ; TFFALLBACK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TFFALLBACK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TFFALLBACK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFFALLBACK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[M]], i64 0
 ; TFFALLBACK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -843,9 +843,9 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], double [[M:%.*]]) #[[ATTR0]] {
 ; TFA_INTERLEAVE-NEXT:  [[ENTRY:.*]]:
 ; TFA_INTERLEAVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; TFA_INTERLEAVE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; TFA_INTERLEAVE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[M]], i64 0
@@ -858,7 +858,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFA_INTERLEAVE-NEXT:    [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ]
 ; TFA_INTERLEAVE-NEXT:    [[TMP7:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = getelementptr double, ptr [[TMP7]], i64 [[TMP9]]
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
 ; TFA_INTERLEAVE-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
@@ -870,7 +870,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFA_INTERLEAVE-NEXT:    [[TMP16:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 [[TMP19]]
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr [[TMP17]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP16]], ptr [[TMP20]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
@@ -880,7 +880,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFA_INTERLEAVE-NEXT:    [[TMP24]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[TMP22]], <vscale x 2 x double> [[TMP23]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 2
+; TFA_INTERLEAVE-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX_NEXT]], [[TMP26]]
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFA_INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP27]], i64 1025)
@@ -918,16 +918,16 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-NEXT:  [[ENTRY:.*]]:
 ; TFNONE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; TFNONE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; TFNONE-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; TFNONE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
 ; TFNONE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TFNONE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; TFNONE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; TFNONE-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFNONE:       [[VECTOR_BODY]]:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -938,7 +938,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
 ; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
 ; TFNONE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; TFNONE-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 2
+; TFNONE-NEXT:    [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
 ; TFNONE-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 1
 ; TFNONE-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
 ; TFNONE-NEXT:    store double [[TMP14]], ptr [[P]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
index e4718dc21635..f0835b818db5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
@@ -501,15 +501,15 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
 ; DEFAULT-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; DEFAULT:       [[VECTOR_PH]]:
 ; DEFAULT-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; DEFAULT-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; DEFAULT-NEXT:    [[N_RND_UP:%.*]] = add i64 15, [[TMP2]]
 ; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; DEFAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; DEFAULT-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16
 ; DEFAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; DEFAULT-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; DEFAULT-NEXT:    [[TMP7:%.*]] = sub i64 15, [[TMP6]]
 ; DEFAULT-NEXT:    [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]]
 ; DEFAULT-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -577,15 +577,15 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
 ; OPTSIZE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; OPTSIZE:       [[VECTOR_PH]]:
 ; OPTSIZE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; OPTSIZE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; OPTSIZE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; OPTSIZE-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; OPTSIZE-NEXT:    [[N_RND_UP:%.*]] = add i64 15, [[TMP2]]
 ; OPTSIZE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; OPTSIZE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; OPTSIZE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; OPTSIZE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; OPTSIZE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16
 ; OPTSIZE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; OPTSIZE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; OPTSIZE-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; OPTSIZE-NEXT:    [[TMP7:%.*]] = sub i64 15, [[TMP6]]
 ; OPTSIZE-NEXT:    [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]]
 ; OPTSIZE-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -653,15 +653,15 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
 ; MINSIZE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; MINSIZE:       [[VECTOR_PH]]:
 ; MINSIZE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; MINSIZE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; MINSIZE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; MINSIZE-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; MINSIZE-NEXT:    [[N_RND_UP:%.*]] = add i64 15, [[TMP2]]
 ; MINSIZE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; MINSIZE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; MINSIZE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; MINSIZE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; MINSIZE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16
 ; MINSIZE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; MINSIZE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; MINSIZE-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; MINSIZE-NEXT:    [[TMP7:%.*]] = sub i64 15, [[TMP6]]
 ; MINSIZE-NEXT:    [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]]
 ; MINSIZE-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
index 3302103873bd..2cec6ca498e4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
@@ -10,16 +10,16 @@ define void @foo() {
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index a229ca8c6e6d..295fe1b891e0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -56,16 +56,16 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
 ; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -102,16 +102,16 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-MAXBW-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-MAXBW-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE-MAXBW:       vector.ph:
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-SVE-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE-MAXBW:       vector.body:
 ; CHECK-SVE-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -221,16 +221,16 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
 ; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -267,16 +267,16 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-MAXBW-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-MAXBW-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE-MAXBW:       vector.ph:
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-SVE-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE-MAXBW:       vector.body:
 ; CHECK-SVE-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -386,16 +386,16 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
 ; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -432,16 +432,16 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-MAXBW-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-MAXBW-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE-MAXBW:       vector.ph:
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-SVE-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE-MAXBW:       vector.body:
 ; CHECK-SVE-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -555,16 +555,16 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
 ; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -601,16 +601,16 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
 ; CHECK-SVE-MAXBW-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-MAXBW-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE-MAXBW:       vector.ph:
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-SVE-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE-MAXBW:       vector.body:
 ; CHECK-SVE-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -726,16 +726,16 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
 ; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -774,16 +774,16 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
 ; CHECK-SVE-MAXBW-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-MAXBW-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE-MAXBW:       vector.ph:
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-SVE-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE-MAXBW:       vector.body:
 ; CHECK-SVE-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -903,16 +903,16 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
 ; CHECK-SVE-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE:       vector.ph:
 ; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE:       vector.body:
 ; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -951,16 +951,16 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
 ; CHECK-SVE-MAXBW-NEXT:    [[DIV27:%.*]] = lshr i32 [[N]], 1
 ; CHECK-SVE-MAXBW-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-SVE-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-SVE-MAXBW:       vector.ph:
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-SVE-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; CHECK-SVE-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-SVE-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-SVE-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-SVE-MAXBW:       vector.body:
 ; CHECK-SVE-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 66dbcff2c123..400b031917a7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -9,7 +9,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -36,18 +36,18 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP15]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
index eceff5ede34b..d2a82269e1b8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -10,15 +10,15 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -27,7 +27,7 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
@@ -36,7 +36,7 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
@@ -60,15 +60,15 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NOI8MM-NEXT:  entry:
 ; CHECK-NOI8MM-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NOI8MM-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-NOI8MM-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-NOI8MM:       vector.ph:
 ; CHECK-NOI8MM-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NOI8MM-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
 ; CHECK-NOI8MM-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NOI8MM-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NOI8MM-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NOI8MM-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; CHECK-NOI8MM-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-NOI8MM:       vector.body:
 ; CHECK-NOI8MM-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -77,7 +77,7 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NOI8MM-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-NOI8MM-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-NOI8MM-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-NOI8MM-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
@@ -86,7 +86,7 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NOI8MM-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
 ; CHECK-NOI8MM-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-NOI8MM-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
 ; CHECK-NOI8MM-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
@@ -133,15 +133,15 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -150,7 +150,7 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
@@ -159,7 +159,7 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
@@ -183,15 +183,15 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NOI8MM-NEXT:  entry:
 ; CHECK-NOI8MM-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NOI8MM-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-NOI8MM-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-NOI8MM:       vector.ph:
 ; CHECK-NOI8MM-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NOI8MM-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
 ; CHECK-NOI8MM-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NOI8MM-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NOI8MM-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NOI8MM-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; CHECK-NOI8MM-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-NOI8MM:       vector.body:
 ; CHECK-NOI8MM-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -200,7 +200,7 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NOI8MM-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
 ; CHECK-NOI8MM-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-NOI8MM-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-NOI8MM-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
@@ -209,7 +209,7 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
 ; CHECK-NOI8MM-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NOI8MM-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
 ; CHECK-NOI8MM-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-NOI8MM-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
 ; CHECK-NOI8MM-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
 ; CHECK-NOI8MM-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index b091452e28b4..a44b9bfb3b44 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -11,15 +11,15 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
@@ -47,15 +47,15 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
@@ -64,7 +64,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP14]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
@@ -73,7 +73,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP26]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
@@ -97,15 +97,15 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -156,15 +156,15 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
 ; CHECK-INTERLEAVE1-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP7]], 2
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP9]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP18]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP18]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -194,15 +194,15 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
 ; CHECK-INTERLEAVED-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP7]], 4
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP9]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP16]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP28]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP28]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -214,7 +214,7 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
 ; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP12]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP4]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
@@ -222,7 +222,7 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
 ; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 [[TMP18]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP6]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
@@ -246,15 +246,15 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
 ; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -309,15 +309,15 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
 ; CHECK-INTERLEAVE1-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP7]], 2
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP12]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP14]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP10]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP20]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP10]], 2
@@ -351,15 +351,15 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
 ; CHECK-INTERLEAVED-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP7]], 4
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP11]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP12]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP15]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP15]], 2
@@ -375,7 +375,7 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
 ; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 [[TMP14]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[TMP4]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i16>, ptr [[TMP30]], align 2
@@ -383,7 +383,7 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD4]] to <vscale x 2 x i64>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 [[TMP20]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i16>, ptr [[TMP8]], align 2
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 2 x i16>, ptr [[TMP21]], align 2
@@ -407,15 +407,15 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
 ; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = mul i64 [[N_VEC]], 2
@@ -833,17 +833,17 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -866,11 +866,11 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP22]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = mul i32 [[TMP24]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP25]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP26]]
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
@@ -881,22 +881,22 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
@@ -905,7 +905,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
@@ -920,11 +920,11 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP33]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], 1
 ; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP35]]
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
@@ -935,17 +935,17 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
 ; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -968,11 +968,11 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 8
+; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
 ; CHECK-MAXBW-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP22]]
 ; CHECK-MAXBW-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP24]], 8
+; CHECK-MAXBW-NEXT:    [[TMP30:%.*]] = mul nuw i32 [[TMP24]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
 ; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]]
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
@@ -1006,15 +1006,15 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1033,11 +1033,11 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = mul nuw i32 [[TMP19]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = sub i32 [[TMP20]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 8 x i32> [[TMP17]], i32 [[TMP21]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = mul i32 [[TMP23]], 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP24]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP17]], i32 [[TMP25]]
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
@@ -1048,27 +1048,27 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
@@ -1079,11 +1079,11 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP23]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = mul nuw i32 [[TMP23]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP31]], 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP25]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP28]], 1
 ; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP29]]
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
@@ -1094,15 +1094,15 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1121,11 +1121,11 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 8
+; CHECK-MAXBW-NEXT:    [[TMP27:%.*]] = mul nuw i32 [[TMP26]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP27]], 1
 ; CHECK-MAXBW-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP28]]
 ; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT:    [[TMP24:%.*]] = mul i32 [[TMP23]], 8
+; CHECK-MAXBW-NEXT:    [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP24]], 1
 ; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP21]], i32 [[TMP25]]
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
@@ -1159,16 +1159,16 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP13]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP20]], 4
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1235,16 +1235,16 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP13]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP16]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP34]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = mul nuw i64 [[TMP34]], 8
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1269,7 +1269,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP56]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP56]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
@@ -1277,7 +1277,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1
@@ -1289,7 +1289,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = mul i64 [[TMP35]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = mul nuw i64 [[TMP35]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1
@@ -1297,7 +1297,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = mul nuw i64 [[TMP41]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP19]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1
@@ -1309,7 +1309,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = mul nuw i64 [[TMP51]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1
@@ -1317,7 +1317,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = mul i64 [[TMP57]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = mul nuw i64 [[TMP57]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
@@ -1329,7 +1329,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = mul i64 [[TMP67]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = mul nuw i64 [[TMP67]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1
@@ -1337,7 +1337,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = mul i64 [[TMP73]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = mul nuw i64 [[TMP73]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP29]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1
@@ -1367,16 +1367,16 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]]
 ; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1501,15 +1501,15 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -1546,15 +1546,15 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-INTERLEAVED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -1591,15 +1591,15 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-MAXBW-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16
 ; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -1657,15 +1657,15 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP8]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP11]], 4
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1686,7 +1686,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[TMP12]], i32 [[TMP19]]
 ; CHECK-INTERLEAVE1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
@@ -1697,15 +1697,15 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP14]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP14]], 8
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1714,7 +1714,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
@@ -1723,7 +1723,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
@@ -1740,7 +1740,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP28]], 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]]
 ; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
@@ -1751,15 +1751,15 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1780,7 +1780,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
 ; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 8
+; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
 ; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 8 x i32> [[TMP20]], i32 [[TMP19]]
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
@@ -1815,16 +1815,16 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1853,16 +1853,16 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1871,7 +1871,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP8]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
@@ -1881,7 +1881,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
@@ -1905,16 +1905,16 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
 ; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2218,16 +2218,16 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-MAXBW:       for.ph:
 ; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = zext i16 [[B]] to i64
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0
 ; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2356,16 +2356,16 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
 ; CHECK-MAXBW:       for.ph:
 ; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = zext i16 [[B]] to i64
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0
 ; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -2421,16 +2421,16 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
 ; CHECK-INTERLEAVE1:       for.body.preheader:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
@@ -2466,16 +2466,16 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
 ; CHECK-INTERLEAVED:       for.body.preheader:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
@@ -2489,7 +2489,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
@@ -2497,7 +2497,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP21]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP22]], align 1
@@ -2525,16 +2525,16 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
 ; CHECK-MAXBW:       for.body.preheader:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
 ; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
@@ -2647,15 +2647,15 @@ define i32 @zext_add_reduc_i8_i32(ptr %a) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2748,15 +2748,15 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2849,15 +2849,15 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2950,15 +2950,15 @@ define i64 @zext_add_reduc_i8_i64_has_neon_dotprod(ptr %a) #1 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR1]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -3051,15 +3051,15 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
index 3514404d3b2d..a471c004a8de 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
@@ -11,15 +11,15 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -48,15 +48,15 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -65,7 +65,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
@@ -74,7 +74,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP15]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
@@ -100,15 +100,15 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll
index b92d8406f614..6d5bbde36642 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll
@@ -9,15 +9,15 @@ define void @test_invar_gep(ptr %dst) #0 {
 ; CHECK-LABEL: @test_invar_gep(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -32,7 +32,7 @@ define void @test_invar_gep(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i32 [[TMP15]], 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP16]], 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i64> [[TMP9]], i32 [[TMP17]]
 ; CHECK-NEXT:    store i64 [[TMP18]], ptr [[TMP14:%.*]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index c2fe37ad214c..492ab5632473 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -51,16 +51,16 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; VSCALEFORTUNING2-NEXT:  [[ENTRY:.*]]:
 ; VSCALEFORTUNING2-NEXT:    [[TMP0:%.*]] = add i64 [[Y]], 1
 ; VSCALEFORTUNING2-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VSCALEFORTUNING2-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; VSCALEFORTUNING2-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; VSCALEFORTUNING2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; VSCALEFORTUNING2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; VSCALEFORTUNING2:       [[VECTOR_PH]]:
 ; VSCALEFORTUNING2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; VSCALEFORTUNING2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; VSCALEFORTUNING2-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; VSCALEFORTUNING2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; VSCALEFORTUNING2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; VSCALEFORTUNING2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; VSCALEFORTUNING2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; VSCALEFORTUNING2-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; VSCALEFORTUNING2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
 ; VSCALEFORTUNING2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; VSCALEFORTUNING2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
@@ -76,11 +76,11 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; VSCALEFORTUNING2-NEXT:    [[TMP15:%.*]] = zext <vscale x 4 x i32> [[TMP14]] to <vscale x 4 x i64>
 ; VSCALEFORTUNING2-NEXT:    [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP15]]
 ; VSCALEFORTUNING2-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
-; VSCALEFORTUNING2-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], 4
+; VSCALEFORTUNING2-NEXT:    [[TMP19:%.*]] = mul nuw i32 [[TMP18]], 4
 ; VSCALEFORTUNING2-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP19]], 1
 ; VSCALEFORTUNING2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP20]]
 ; VSCALEFORTUNING2-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
-; VSCALEFORTUNING2-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], 4
+; VSCALEFORTUNING2-NEXT:    [[TMP22:%.*]] = mul nuw i32 [[TMP21]], 4
 ; VSCALEFORTUNING2-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP22]], 1
 ; VSCALEFORTUNING2-NEXT:    [[VECTOR_RECUR_INIT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP23]]
 ; VSCALEFORTUNING2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -128,7 +128,7 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; VSCALEFORTUNING2-NEXT:    [[BIN_RDX:%.*]] = or <vscale x 4 x i32> [[TMP48]], [[TMP47]]
 ; VSCALEFORTUNING2-NEXT:    [[TMP50:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
 ; VSCALEFORTUNING2-NEXT:    [[TMP51:%.*]] = call i32 @llvm.vscale.i32()
-; VSCALEFORTUNING2-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], 4
+; VSCALEFORTUNING2-NEXT:    [[TMP52:%.*]] = mul nuw i32 [[TMP51]], 4
 ; VSCALEFORTUNING2-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP52]], 1
 ; VSCALEFORTUNING2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP26]], i32 [[TMP53]]
 ; VSCALEFORTUNING2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -181,19 +181,19 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; PRED-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; PRED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[TMP2]], 1
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP3]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; PRED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; PRED-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; PRED-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP0]], [[TMP7]]
 ; PRED-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[TMP0]], [[TMP7]]
 ; PRED-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
@@ -209,11 +209,11 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
 ; PRED-NEXT:    [[TMP19:%.*]] = zext <vscale x 4 x i32> [[TMP18]] to <vscale x 4 x i64>
 ; PRED-NEXT:    [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP19]]
 ; PRED-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; PRED-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 4
+; PRED-NEXT:    [[TMP23:%.*]] = mul nuw i32 [[TMP22]], 4
 ; PRED-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP23]], 1
 ; PRED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP24]]
 ; PRED-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
-; PRED-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], 4
+; PRED-NEXT:    [[TMP26:%.*]] = mul nuw i32 [[TMP25]], 4
 ; PRED-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP26]], 1
 ; PRED-NEXT:    [[VECTOR_RECUR_INIT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP27]]
 ; PRED-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -335,16 +335,16 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:  [[ENTRY:.*]]:
 ; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; DEFAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; DEFAULT-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; DEFAULT:       [[VECTOR_PH]]:
 ; DEFAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; DEFAULT-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; DEFAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; DEFAULT-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
 ; DEFAULT-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -355,7 +355,7 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP13:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[INDEX]]
 ; DEFAULT-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[TMP13]], i32 0
 ; DEFAULT-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; DEFAULT-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4
 ; DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[TMP13]], i64 [[TMP17]]
 ; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP15]], align 2
 ; DEFAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP18]], align 2
@@ -394,16 +394,16 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; VSCALEFORTUNING2-NEXT:  [[ENTRY:.*]]:
 ; VSCALEFORTUNING2-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; VSCALEFORTUNING2-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; VSCALEFORTUNING2-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; VSCALEFORTUNING2-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; VSCALEFORTUNING2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; VSCALEFORTUNING2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; VSCALEFORTUNING2:       [[VECTOR_PH]]:
 ; VSCALEFORTUNING2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; VSCALEFORTUNING2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; VSCALEFORTUNING2-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; VSCALEFORTUNING2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; VSCALEFORTUNING2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; VSCALEFORTUNING2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; VSCALEFORTUNING2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; VSCALEFORTUNING2-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; VSCALEFORTUNING2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
 ; VSCALEFORTUNING2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
 ; VSCALEFORTUNING2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -414,7 +414,7 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; VSCALEFORTUNING2-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[INDEX]]
 ; VSCALEFORTUNING2-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0
 ; VSCALEFORTUNING2-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; VSCALEFORTUNING2-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; VSCALEFORTUNING2-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; VSCALEFORTUNING2-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i64 [[TMP11]]
 ; VSCALEFORTUNING2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP9]], align 2
 ; VSCALEFORTUNING2-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP12]], align 2
@@ -455,15 +455,15 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
 ; PRED-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; PRED:       [[VECTOR_PH]]:
 ; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; PRED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; PRED-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP2]], 1
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; PRED-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
 ; PRED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; PRED-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; PRED-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
 ; PRED-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
 ; PRED-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index ba58c32bb3f5..0f2eae10f4ac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -15,16 +15,16 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[SMAX]], [[IDX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[IDX]], i64 0
@@ -52,7 +52,7 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], 2
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw i32 [[TMP21]], 2
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP22]], 1
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <vscale x 2 x ptr> [[TMP15]], i32 [[TMP23]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
index 68276c2c8af7..1b489dd25b92 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
@@ -6,16 +6,16 @@ define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture
 ; CHECK-LABEL: @cond_fadd(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -92,16 +92,16 @@ define float @cond_cmp_sel(ptr noalias %a, ptr noalias %cond, i64 %N) {
 ; CHECK-LABEL: @cond_cmp_sel(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index 82f64e988e6d..d73cdc1228fe 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -34,16 +34,16 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-SAME: (ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-UNORDERED-NEXT:  entry:
 ; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-UNORDERED:       vector.ph:
 ; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -80,16 +80,16 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-SAME: (ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-ORDERED-NEXT:  entry:
 ; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED:       vector.ph:
 ; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -127,15 +127,15 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED-TF:       vector.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -217,16 +217,16 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-SAME: (ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-UNORDERED-NEXT:  entry:
 ; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-UNORDERED:       vector.ph:
 ; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -237,13 +237,13 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
 ; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
 ; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
@@ -284,16 +284,16 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-SAME: (ptr noalias readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-ORDERED-NEXT:  entry:
 ; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED:       vector.ph:
 ; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -301,13 +301,13 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
 ; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
 ; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 24
+; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
 ; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
@@ -346,26 +346,26 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED-TF:       vector.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP11]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP13]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP15]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
@@ -382,13 +382,13 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP19]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP22]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP24]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP25]]
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP17]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP20]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
@@ -404,13 +404,13 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP34]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP32]], <vscale x 8 x float> [[TMP33]])
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = mul i64 [[TMP35]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = mul nuw i64 [[TMP35]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP37:%.*]] = add i64 [[INDEX]], [[TMP36]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP38:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP39:%.*]] = mul i64 [[TMP38]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP39:%.*]] = mul nuw i64 [[TMP38]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[TMP40:%.*]] = add i64 [[INDEX]], [[TMP39]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP42:%.*]] = mul nuw i64 [[TMP41]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP43:%.*]] = add i64 [[INDEX]], [[TMP42]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP37]], i64 [[TMP9]])
@@ -498,16 +498,16 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
 ; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
 ; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-UNORDERED:       vector.ph:
 ; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[A2]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[A1]], i32 0
@@ -568,16 +568,16 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
 ; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
 ; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
 ; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED:       vector.ph:
 ; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
@@ -636,15 +636,15 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED-TF:       vector.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
@@ -768,16 +768,16 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-UNORDERED-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
 ; CHECK-UNORDERED:       for.body.preheader:
 ; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-UNORDERED:       vector.ph:
 ; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -829,16 +829,16 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
 ; CHECK-ORDERED:       for.body.preheader:
 ; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
 ; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED:       vector.ph:
 ; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -891,15 +891,15 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-TF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED-TF:       vector.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = sub i64 [[TMP2]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = sub i64 [[N]], [[TMP7]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
@@ -1006,16 +1006,16 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-UNORDERED-SAME: (ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-UNORDERED-NEXT:  entry:
 ; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-UNORDERED:       vector.ph:
 ; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1065,16 +1065,16 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-SAME: (ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-ORDERED-NEXT:  entry:
 ; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED:       vector.ph:
 ; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1125,15 +1125,15 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-TF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED-TF:       vector.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -1244,16 +1244,16 @@ define float @fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b,
 ; CHECK-UNORDERED-SAME: (ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-UNORDERED-NEXT:  entry:
 ; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-UNORDERED:       vector.ph:
 ; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1383,16 +1383,16 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-UNORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-UNORDERED-NEXT:  entry:
 ; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-UNORDERED:       vector.ph:
 ; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1403,13 +1403,13 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
 ; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
 ; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
@@ -1418,13 +1418,13 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
 ; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP20]]
 ; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
 ; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP23]]
 ; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 24
 ; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP26]]
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP18]], align 4
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP21]], align 4
@@ -1467,16 +1467,16 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-ORDERED-NEXT:  entry:
 ; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED:       vector.ph:
 ; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1484,13 +1484,13 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
 ; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
 ; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 24
+; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
 ; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
@@ -1499,13 +1499,13 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
 ; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
 ; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP20]]
 ; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
+; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
 ; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP23]]
 ; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 24
+; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 24
 ; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP26]]
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP18]], align 4
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP21]], align 4
@@ -1550,26 +1550,26 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED-TF:       vector.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP11]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP13]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP15]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
@@ -1586,13 +1586,13 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP19]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP22]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP24]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP25]]
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP17]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP20]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
@@ -1601,13 +1601,13 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP30]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = mul nuw i64 [[TMP32]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP33]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = mul i64 [[TMP35]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = mul nuw i64 [[TMP35]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP36]]
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP28]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP31]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
@@ -1627,13 +1627,13 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP49]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP47]], <vscale x 8 x float> [[TMP48]])
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP50:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = mul i64 [[TMP50]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = mul nuw i64 [[TMP50]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP52:%.*]] = add i64 [[INDEX]], [[TMP51]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = mul i64 [[TMP53]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = mul nuw i64 [[TMP53]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[TMP55:%.*]] = add i64 [[INDEX]], [[TMP54]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = mul i64 [[TMP56]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = mul nuw i64 [[TMP56]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP58:%.*]] = add i64 [[INDEX]], [[TMP57]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP52]], i64 [[TMP9]])
@@ -1711,16 +1711,16 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-UNORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-UNORDERED-NEXT:  entry:
 ; CHECK-UNORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-UNORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-UNORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-UNORDERED:       vector.ph:
 ; CHECK-UNORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-UNORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-UNORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-UNORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-UNORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED:       vector.body:
 ; CHECK-UNORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1731,13 +1731,13 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-UNORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
 ; CHECK-UNORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
 ; CHECK-UNORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
@@ -1746,13 +1746,13 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-UNORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-UNORDERED-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
 ; CHECK-UNORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP20]]
 ; CHECK-UNORDERED-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
+; CHECK-UNORDERED-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
 ; CHECK-UNORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP23]]
 ; CHECK-UNORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 24
+; CHECK-UNORDERED-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 24
 ; CHECK-UNORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP26]]
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP18]], align 4
 ; CHECK-UNORDERED-NEXT:    [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP21]], align 4
@@ -1795,16 +1795,16 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-ORDERED-NEXT:  entry:
 ; CHECK-ORDERED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-ORDERED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-ORDERED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-ORDERED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED:       vector.ph:
 ; CHECK-ORDERED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-ORDERED-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-ORDERED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-ORDERED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-ORDERED-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-ORDERED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED:       vector.body:
 ; CHECK-ORDERED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1812,13 +1812,13 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-ORDERED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
 ; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
 ; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; CHECK-ORDERED-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
 ; CHECK-ORDERED-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
 ; CHECK-ORDERED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 24
+; CHECK-ORDERED-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
 ; CHECK-ORDERED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
@@ -1827,13 +1827,13 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-ORDERED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
 ; CHECK-ORDERED-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-ORDERED-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
 ; CHECK-ORDERED-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP20]]
 ; CHECK-ORDERED-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
+; CHECK-ORDERED-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
 ; CHECK-ORDERED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP23]]
 ; CHECK-ORDERED-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 24
+; CHECK-ORDERED-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 24
 ; CHECK-ORDERED-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP26]]
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP18]], align 4
 ; CHECK-ORDERED-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP21]], align 4
@@ -1878,26 +1878,26 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-ORDERED-TF:       vector.ph:
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-ORDERED-TF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 32
+; CHECK-ORDERED-TF-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 32
 ; CHECK-ORDERED-TF-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP11]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP13]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP15]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
@@ -1914,13 +1914,13 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP19]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP22]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP24]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP25]]
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP17]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP20]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
@@ -1929,13 +1929,13 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i32 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP30]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP33:%.*]] = mul nuw i64 [[TMP32]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP33]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = mul i64 [[TMP35]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP36:%.*]] = mul nuw i64 [[TMP35]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP36]]
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP28]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP31]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
@@ -1955,13 +1955,13 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP49]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP47]], <vscale x 8 x float> [[TMP48]])
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP50:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = mul i64 [[TMP50]], 8
+; CHECK-ORDERED-TF-NEXT:    [[TMP51:%.*]] = mul nuw i64 [[TMP50]], 8
 ; CHECK-ORDERED-TF-NEXT:    [[TMP52:%.*]] = add i64 [[INDEX]], [[TMP51]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = mul i64 [[TMP53]], 16
+; CHECK-ORDERED-TF-NEXT:    [[TMP54:%.*]] = mul nuw i64 [[TMP53]], 16
 ; CHECK-ORDERED-TF-NEXT:    [[TMP55:%.*]] = add i64 [[INDEX]], [[TMP54]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = mul i64 [[TMP56]], 24
+; CHECK-ORDERED-TF-NEXT:    [[TMP57:%.*]] = mul nuw i64 [[TMP56]], 24
 ; CHECK-ORDERED-TF-NEXT:    [[TMP58:%.*]] = add i64 [[INDEX]], [[TMP57]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP52]], i64 [[TMP9]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index d02d03b4b437..5925b7014b12 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -14,16 +14,16 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
 ; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
 ; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 3, [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -260,16 +260,16 @@ define i64 @loop_contains_safe_div() #1 {
 ; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
 ; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP12]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP10]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[INDEX1:%.*]] = sub i64 64, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index 15c5258b57cc..77b768e45e89 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -10,28 +10,28 @@ define void @cost_store_i8(ptr %dst) #0 {
 ; DEFAULT-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
 ; DEFAULT-NEXT:  iter.check:
 ; DEFAULT-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; DEFAULT-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 101, [[TMP1]]
 ; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; DEFAULT:       vector.main.loop.iter.check:
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; DEFAULT-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; DEFAULT-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 101, [[TMP3]]
 ; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DEFAULT:       vector.ph:
 ; DEFAULT-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; DEFAULT-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 101, [[TMP5]]
 ; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 101, [[N_MOD_VF]]
 ; DEFAULT-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 32
+; DEFAULT-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 32
 ; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; DEFAULT:       vector.body:
 ; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; DEFAULT-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
 ; DEFAULT-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 16
+; DEFAULT-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
 ; DEFAULT-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP9]], i64 [[TMP23]]
 ; DEFAULT-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1
 ; DEFAULT-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[TMP24]], align 1
@@ -44,17 +44,17 @@ define void @cost_store_i8(ptr %dst) #0 {
 ; DEFAULT:       vec.epilog.iter.check:
 ; DEFAULT-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 101, [[N_VEC]]
 ; DEFAULT-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; DEFAULT-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 8
 ; DEFAULT-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP13]]
 ; DEFAULT-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; DEFAULT:       vec.epilog.ph:
 ; DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; DEFAULT-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 8
+; DEFAULT-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 8
 ; DEFAULT-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 101, [[TMP15]]
 ; DEFAULT-NEXT:    [[N_VEC3:%.*]] = sub i64 101, [[N_MOD_VF2]]
 ; DEFAULT-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 8
+; DEFAULT-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 8
 ; DEFAULT-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; DEFAULT:       vec.epilog.vector.body:
 ; DEFAULT-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
@@ -86,15 +86,15 @@ define void @cost_store_i8(ptr %dst) #0 {
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; PRED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; PRED-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 101, [[TMP4]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; PRED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; PRED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; PRED-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
 ; PRED-NEXT:    [[TMP9:%.*]] = sub i64 101, [[TMP8]]
 ; PRED-NEXT:    [[TMP10:%.*]] = icmp ugt i64 101, [[TMP8]]
 ; PRED-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
@@ -231,13 +231,13 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
 ; PRED-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
 ; PRED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP10]], 2
+; PRED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP10]], 2
 ; PRED-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 1000, [[TMP2]]
 ; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; PRED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; PRED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; PRED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[X]], i64 0
 ; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1000)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index 36e9befb606d..0fddadd4e3ac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -11,16 +11,16 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -29,7 +29,7 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll
index f8551d774de4..d85bc484af0b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll
@@ -8,7 +8,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ITER_CHECK:.*]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP3]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
 ; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
@@ -40,20 +40,20 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
 ; CHECK-NEXT:    br label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP14]]
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
 ; CHECK:       [[VEC_EPILOG_PH]]:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP31]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP31]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 16, [[TMP16]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP32]], i64 [[TMP16]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 16, [[TMP36]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index e5633462973a..de8fcb0aff7e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -11,16 +11,16 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -29,7 +29,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
index 7e49c69266d8..83f2b2a9080a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
@@ -11,16 +11,16 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -28,7 +28,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 5d0e6f72309c..520937454ce5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -22,28 +22,28 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
 ; CHECK-LABEL: @main_vf_vscale_x_16(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 1024, [[TMP3]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 32
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 32
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 16
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 16
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP18]]
 ; CHECK-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP16]], align 1
 ; CHECK-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP19]], align 1
@@ -56,17 +56,17 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 8
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 8
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP22]]
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 8
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 8
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 1024, [[TMP24]]
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 1024, [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 8
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
@@ -89,23 +89,23 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
 ; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK-VF8:       vector.main.loop.iter.check:
 ; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-VF8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-VF8:       vector.ph:
 ; CHECK-VF8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-VF8-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-VF8:       vector.body:
 ; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-VF8-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-VF8-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
 ; CHECK-VF8-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 16
+; CHECK-VF8-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 16
 ; CHECK-VF8-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP16]]
 ; CHECK-VF8-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP14]], align 1
 ; CHECK-VF8-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP17]], align 1
@@ -173,23 +173,23 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) {
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 2
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP14]], align 1
 ; CHECK-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP17]], align 1
@@ -226,23 +226,23 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) {
 ; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK-VF8:       vector.main.loop.iter.check:
 ; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-VF8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-VF8:       vector.ph:
 ; CHECK-VF8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-VF8-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-VF8:       vector.body:
 ; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-VF8-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-VF8-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
 ; CHECK-VF8-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-VF8-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 2
 ; CHECK-VF8-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]]
 ; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP14]], align 1
 ; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP17]], align 1
@@ -295,28 +295,28 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
 ; CHECK-LABEL: @test_pr57912_pointer_induction(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 10000, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 10000, [[TMP3]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 10000, [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 10000, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 32
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 32
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 16
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 16
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP18]]
 ; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[TMP16]], align 1
 ; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[TMP19]], align 1
@@ -330,17 +330,17 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
 ; CHECK-NEXT:    [[IND_END4:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
 ; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 10000, [[N_VEC]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 8
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 8
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP22]]
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 8
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 8
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 10000, [[TMP24]]
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 10000, [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 8
+; CHECK-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC3]]
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
@@ -365,23 +365,23 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
 ; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK-VF8:       vector.main.loop.iter.check:
 ; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
 ; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 10000, [[TMP1]]
 ; CHECK-VF8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-VF8:       vector.ph:
 ; CHECK-VF8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
 ; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 10000, [[TMP3]]
 ; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 10000, [[N_MOD_VF]]
 ; CHECK-VF8-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 32
 ; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-VF8:       vector.body:
 ; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-VF8-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[INDEX]]
 ; CHECK-VF8-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0
 ; CHECK-VF8-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF8-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 16
+; CHECK-VF8-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 16
 ; CHECK-VF8-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP16]]
 ; CHECK-VF8-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[TMP14]], align 1
 ; CHECK-VF8-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[TMP17]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
index a01e3e169298..7e24eb03c290 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
@@ -9,7 +9,7 @@ define void @inv_store_last_lane(ptr noalias nocapture %a, ptr noalias nocapture
 ; CHECK:  store <vscale x 4 x i32> %[[VEC_VAL:.*]], ptr
 ; CHECK: middle.block:
 ; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl nuw i32 %[[VSCALE]], 2
 ; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
 ; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x i32> %[[VEC_VAL]], i32 %[[LAST_LANE]]
 
@@ -39,7 +39,7 @@ define float @ret_last_lane(ptr noalias nocapture %a, ptr noalias nocapture read
 ; CHECK:  store <vscale x 4 x float> %[[VEC_VAL:.*]], ptr
 ; CHECK: middle.block:
 ; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl nuw i32 %[[VSCALE]], 2
 ; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
 ; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x float> %[[VEC_VAL]], i32 %[[LAST_LANE]]
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
index cfb96b4f5a61..0322f74ac343 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
@@ -16,30 +16,30 @@ define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef read
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[D1]], [[S2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 16
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 8
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP12]], align 2
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 8 x half>, ptr [[TMP15]], align 2
@@ -48,7 +48,7 @@ define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef read
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 8
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i64 [[TMP21]]
 ; CHECK-NEXT:    store <vscale x 8 x half> [[TMP16]], ptr [[TMP19]], align 2
 ; CHECK-NEXT:    store <vscale x 8 x half> [[TMP17]], ptr [[TMP22]], align 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index f44744071ae5..70042caaf961 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -13,11 +13,11 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-SAME: ptr [[DST:%.*]])
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP40:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
@@ -39,7 +39,7 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP24:%.*]] = zext <vscale x 2 x i7> [[TMP20]] to <vscale x 2 x i64>
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
 ; CHECK-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP26]], 2
+; CHECK-NEXT:    [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 2
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP27]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP23]], ptr [[TMP25]], align 8
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP24]], ptr [[TMP28]], align 8
@@ -78,11 +78,11 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-SAME: ptr [[DST:%.*]])
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP40:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
@@ -102,7 +102,7 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
 ; CHECK-NEXT:    [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP24]], 2
+; CHECK-NEXT:    [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 2
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP25]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP19]], ptr [[TMP23]], align 8
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP20]], ptr [[TMP26]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
index 2f90b5a332bd..4f0637fd8db2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -15,16 +15,16 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly
 ; CHECK-LABEL: @cond_ind64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 7e4edf739695..8c2958769a61 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -1369,16 +1369,15 @@ define void @interleave_deinterleave_factor3(ptr writeonly noalias %dst, ptr rea
 ; CHECK-LABEL: @interleave_deinterleave_factor3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP0]], 256
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -1464,16 +1463,15 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a
 ; CHECK-LABEL: @interleave_deinterleave(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP0]], 256
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index bd2bd5aa2795..f152dd308cb6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -27,16 +27,15 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:  entry:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.ph:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
@@ -78,10 +77,12 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; PREDICATED_TAIL_FOLDING:       vector.ph:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPA:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPB:%.*]] = icmp ult i32 [[TMP2]], 64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = select i1 [[TMPB]], i32 [[TMPA]], i32 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -172,16 +173,15 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:  entry:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.ph:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
@@ -216,10 +216,12 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING:       vector.ph:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPA:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPB:%.*]] = icmp ult i32 [[TMP2]], 64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = select i1 [[TMPB]], i32 [[TMPA]], i32 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -299,16 +301,15 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD1]] to i32
 ; SCALAR_TAIL_FOLDING-NEXT:    [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.ph:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV3]], i64 0
@@ -347,10 +348,12 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD1]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPA:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPB:%.*]] = icmp ult i32 [[TMP2]], 64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = select i1 [[TMPB]], i32 [[TMPA]], i32 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
@@ -446,16 +449,15 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; SCALAR_TAIL_FOLDING-NEXT:  entry:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.ph:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
@@ -501,10 +503,12 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; PREDICATED_TAIL_FOLDING:       vector.ph:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPA:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPB:%.*]] = icmp ult i32 [[TMP2]], 64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = select i1 [[TMPB]], i32 [[TMPA]], i32 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
index 8b009f1c9137..f8c635baf13c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -7,16 +7,16 @@ define void @inv_store_i16(ptr noalias %dst, ptr noalias readonly %src, i64 %N)
 ; CHECK-LABEL: @inv_store_i16(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -24,7 +24,7 @@ define void @inv_store_i16(ptr noalias %dst, ptr noalias readonly %src, i64 %N)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP11]]
 ; CHECK-NEXT:    store i16 [[TMP12]], ptr [[DST:%.*]], align 2
@@ -57,16 +57,16 @@ define void @cond_inv_store_i32(ptr noalias %dst, ptr noalias readonly %src, i64
 ; CHECK-LABEL: @cond_inv_store_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index be1ff0710a41..cac526f16213 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -11,16 +11,16 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[N_VEC]], 8
@@ -32,7 +32,7 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i64, ptr [[TMP30]], i32 0
 ; CHECK-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], 2
+; CHECK-NEXT:    [[TMP34:%.*]] = mul nuw i64 [[TMP33]], 2
 ; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i64, ptr [[TMP30]], i64 [[TMP34]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> zeroinitializer, ptr [[TMP32]], align 8
 ; CHECK-NEXT:    store <vscale x 2 x i64> zeroinitializer, ptr [[TMP35]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
index ca2f969552a2..ce7b78ecc248 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -5,9 +5,9 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
 ; CHECK-LABEL: @trip7_i64(
 ; CHECK:         = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    = mul i64
+; CHECK-NEXT:    = mul nuw i64
 ; CHECK:         [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[VF:%.*]] = mul i64 [[VSCALE]], 2
+; CHECK-NEXT:    [[VF:%.*]] = mul nuw i64 [[VSCALE]], 2
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK:         [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
index bc4533f3011c..246beb297cd2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
@@ -16,39 +16,39 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 {
 ; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i32 [[TMP1]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 8
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP0]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
 ; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
+; CHECK-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
+; CHECK-NEXT:    [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]]
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP27]], align 4
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[WIDE_LOAD3]], ptr [[TMP30]], align 4
@@ -90,39 +90,39 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 {
 ; CHECK-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i32 [[TMP1]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 8
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP0]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
 ; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
+; CHECK-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 4
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
+; CHECK-NEXT:    [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]]
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP27]], align 4
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[WIDE_LOAD3]], ptr [[TMP30]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
index 18d05d2a5b54..b6f723e049be 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
@@ -14,13 +14,13 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
 ; CHECK-NEXT:    [[DST_21:%.*]] = ptrtoint ptr [[DST_2:%.*]] to i64
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 20, i64 [[TMP1]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[DST_21]], [[DST_12]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
@@ -43,11 +43,11 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP16]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP49]], 4
+; CHECK-NEXT:    [[TMP50:%.*]] = mul nuw i64 [[TMP49]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -55,13 +55,13 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i64, ptr [[TMP23]], i32 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 2
+; CHECK-NEXT:    [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 2
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i64, ptr [[TMP23]], i64 [[TMP29]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP27]], align 8
 ; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <vscale x 2 x i64>, ptr [[TMP30]], align 8
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i64, ptr [[TMP25]], i32 0
 ; CHECK-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 2
+; CHECK-NEXT:    [[TMP33:%.*]] = mul nuw i64 [[TMP32]], 2
 ; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i64, ptr [[TMP25]], i64 [[TMP33]]
 ; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <vscale x 2 x i64>, ptr [[TMP31]], align 8
 ; CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <vscale x 2 x i64>, ptr [[TMP34]], align 8
@@ -71,13 +71,13 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
 ; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr i64, ptr [[TMP37]], i32 0
 ; CHECK-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 2
+; CHECK-NEXT:    [[TMP43:%.*]] = mul nuw i64 [[TMP42]], 2
 ; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i64, ptr [[TMP37]], i64 [[TMP43]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP35]], ptr [[TMP41]], align 8
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP36]], ptr [[TMP44]], align 8
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr i64, ptr [[TMP39]], i32 0
 ; CHECK-NEXT:    [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 2
+; CHECK-NEXT:    [[TMP47:%.*]] = mul nuw i64 [[TMP46]], 2
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr i64, ptr [[TMP39]], i64 [[TMP47]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP35]], ptr [[TMP45]], align 8
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP36]], ptr [[TMP48]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index 8fbb356c7974..eb8f218f9938 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -47,15 +47,15 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
index 5a4e0efb36cb..90b490148be8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
@@ -9,13 +9,13 @@ define void @trip1025_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapt
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
index f4639203d103..2de24b0f654d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
@@ -13,15 +13,15 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -67,15 +67,15 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
 ; CHECK-IN-LOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-IN-LOOP:       vector.ph:
 ; CHECK-IN-LOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-IN-LOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-IN-LOOP-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-IN-LOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-IN-LOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-IN-LOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-IN-LOOP-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-IN-LOOP-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4
 ; CHECK-IN-LOOP-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-IN-LOOP-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-IN-LOOP-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-IN-LOOP-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-IN-LOOP-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -139,15 +139,15 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -192,15 +192,15 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
 ; CHECK-IN-LOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-IN-LOOP:       vector.ph:
 ; CHECK-IN-LOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-IN-LOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-IN-LOOP-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-IN-LOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-IN-LOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-IN-LOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-IN-LOOP-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-IN-LOOP-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-IN-LOOP-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-IN-LOOP-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-IN-LOOP-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-IN-LOOP-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-IN-LOOP-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -262,15 +262,15 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -329,15 +329,15 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
 ; CHECK-IN-LOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-IN-LOOP:       vector.ph:
 ; CHECK-IN-LOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-IN-LOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-IN-LOOP-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-IN-LOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
 ; CHECK-IN-LOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-IN-LOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-IN-LOOP-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-IN-LOOP-NEXT:    [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
 ; CHECK-IN-LOOP-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-IN-LOOP-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-IN-LOOP-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-IN-LOOP-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-IN-LOOP-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-IN-LOOP-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index 977115ce5321..ea9cd3f5d854 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -11,26 +11,26 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP61]], 16
+; CHECK-NEXT:    [[TMP62:%.*]] = mul nuw i64 [[TMP61]], 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 4
 ; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-NEXT:    [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 8
 ; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 12
+; CHECK-NEXT:    [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 12
 ; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
@@ -48,13 +48,13 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX6]]
 ; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0
 ; CHECK-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP53:%.*]] = mul i64 [[TMP52]], 4
+; CHECK-NEXT:    [[TMP53:%.*]] = mul nuw i64 [[TMP52]], 4
 ; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP56:%.*]] = mul i64 [[TMP55]], 8
+; CHECK-NEXT:    [[TMP56:%.*]] = mul nuw i64 [[TMP55]], 8
 ; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]]
 ; CHECK-NEXT:    [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP58]], 12
+; CHECK-NEXT:    [[TMP59:%.*]] = mul nuw i64 [[TMP58]], 12
 ; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]]
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]])
@@ -62,13 +62,13 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
 ; CHECK-NEXT:    [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]]
 ; CHECK-NEXT:    [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP64:%.*]] = mul i64 [[TMP63]], 4
+; CHECK-NEXT:    [[TMP64:%.*]] = mul nuw i64 [[TMP63]], 4
 ; CHECK-NEXT:    [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]]
 ; CHECK-NEXT:    [[TMP66:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP67:%.*]] = mul i64 [[TMP66]], 8
+; CHECK-NEXT:    [[TMP67:%.*]] = mul nuw i64 [[TMP66]], 8
 ; CHECK-NEXT:    [[TMP68:%.*]] = add i64 [[INDEX6]], [[TMP67]]
 ; CHECK-NEXT:    [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP70:%.*]] = mul i64 [[TMP69]], 12
+; CHECK-NEXT:    [[TMP70:%.*]] = mul nuw i64 [[TMP69]], 12
 ; CHECK-NEXT:    [[TMP71:%.*]] = add i64 [[INDEX6]], [[TMP70]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT11]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP65]], i64 [[TMP9]])
@@ -103,26 +103,26 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP83:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP84:%.*]] = mul i64 [[TMP83]], 16
+; CHECK-NEXT:    [[TMP84:%.*]] = mul nuw i64 [[TMP83]], 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 4
 ; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-NEXT:    [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 8
 ; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 12
+; CHECK-NEXT:    [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 12
 ; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
@@ -140,13 +140,13 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[INDEX6]]
 ; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0
 ; CHECK-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP53:%.*]] = mul i64 [[TMP52]], 4
+; CHECK-NEXT:    [[TMP53:%.*]] = mul nuw i64 [[TMP52]], 4
 ; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP56:%.*]] = mul i64 [[TMP55]], 8
+; CHECK-NEXT:    [[TMP56:%.*]] = mul nuw i64 [[TMP55]], 8
 ; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]]
 ; CHECK-NEXT:    [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP58]], 12
+; CHECK-NEXT:    [[TMP59:%.*]] = mul nuw i64 [[TMP58]], 12
 ; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]]
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i32> poison)
@@ -163,13 +163,13 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
 ; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX6]]
 ; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
 ; CHECK-NEXT:    [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP74]], 4
+; CHECK-NEXT:    [[TMP75:%.*]] = mul nuw i64 [[TMP74]], 4
 ; CHECK-NEXT:    [[TMP76:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP75]]
 ; CHECK-NEXT:    [[TMP77:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP77]], 8
+; CHECK-NEXT:    [[TMP78:%.*]] = mul nuw i64 [[TMP77]], 8
 ; CHECK-NEXT:    [[TMP79:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP78]]
 ; CHECK-NEXT:    [[TMP80:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP81:%.*]] = mul i64 [[TMP80]], 12
+; CHECK-NEXT:    [[TMP81:%.*]] = mul nuw i64 [[TMP80]], 12
 ; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]]
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, <vscale x 4 x i1> [[TMP69]])
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, <vscale x 4 x i1> [[TMP70]])
@@ -177,13 +177,13 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, <vscale x 4 x i1> [[TMP72]])
 ; CHECK-NEXT:    [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]]
 ; CHECK-NEXT:    [[TMP85:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP86:%.*]] = mul i64 [[TMP85]], 4
+; CHECK-NEXT:    [[TMP86:%.*]] = mul nuw i64 [[TMP85]], 4
 ; CHECK-NEXT:    [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]]
 ; CHECK-NEXT:    [[TMP88:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP89:%.*]] = mul i64 [[TMP88]], 8
+; CHECK-NEXT:    [[TMP89:%.*]] = mul nuw i64 [[TMP88]], 8
 ; CHECK-NEXT:    [[TMP90:%.*]] = add i64 [[INDEX6]], [[TMP89]]
 ; CHECK-NEXT:    [[TMP91:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP92:%.*]] = mul i64 [[TMP91]], 12
+; CHECK-NEXT:    [[TMP92:%.*]] = mul nuw i64 [[TMP91]], 12
 ; CHECK-NEXT:    [[TMP93:%.*]] = add i64 [[INDEX6]], [[TMP92]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP87]], i64 [[TMP9]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index 46dcf23ee9cf..f6f8895c2c70 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -11,15 +11,15 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -112,15 +112,15 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -172,15 +172,15 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
@@ -235,15 +235,15 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -296,15 +296,15 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -356,15 +356,15 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -430,15 +430,15 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -487,15 +487,15 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -549,15 +549,15 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
@@ -607,15 +607,15 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset_trip1024(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
index 592dc1c4efd4..33fa3607730d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -15,16 +15,16 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{
 ; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -88,7 +88,7 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
@@ -99,11 +99,11 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP7]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll
index 685516a57680..e2c7469a9781 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll
@@ -10,37 +10,37 @@ define void @vscale_mul_4(ptr noalias noundef readonly captures(none) %a, ptr no
 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP10]], 8
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP18]], 4
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP26]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP27]], align 4
 ; CHECK-NEXT:    [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD3]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD4]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP21]]
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP19]], ptr [[TMP17]], align 4
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP28]], ptr [[TMP22]], align 4
@@ -95,22 +95,22 @@ define  void @vscale_mul_8(ptr noalias noundef readonly captures(none) %a, ptr n
 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[MUL1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[MUL1]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[MUL1]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
@@ -118,7 +118,7 @@ define  void @vscale_mul_8(ptr noalias noundef readonly captures(none) %a, ptr n
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP21]]
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP17]], ptr [[TMP19]], align 4
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP18]], ptr [[TMP22]], align 4
@@ -166,37 +166,37 @@ define void @vscale_mul_12(ptr noalias noundef readonly captures(none) %a, ptr n
 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[MUL1:%.*]] = mul nuw nsw i64 [[TMP0]], 12
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[MUL1]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[MUL1]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[MUL1]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 4
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]]
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP25]], ptr [[TMP21]], align 4
@@ -251,37 +251,37 @@ define void @vscale_mul_31(ptr noalias noundef readonly captures(none) %a, ptr n
 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[MUL1:%.*]] = mul nuw nsw i64 [[TMP0]], 31
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[MUL1]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[MUL1]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[MUL1]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 4
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]]
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP17]], ptr [[TMP13]], align 4
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP18]], ptr [[TMP21]], align 4
@@ -336,37 +336,37 @@ define void @vscale_mul_64(ptr noalias noundef readonly captures(none) %a, ptr n
 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[MUL1:%.*]] = mul nuw nsw i64 [[TMP0]], 64
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[MUL1]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[MUL1]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[MUL1]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 4
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]]
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP17]], ptr [[TMP13]], align 4
 ; CHECK-NEXT:    store <vscale x 4 x float> [[TMP18]], ptr [[TMP21]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index 166bdc7931cf..e58ea655d609 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -50,16 +50,16 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK-LABEL: @pointer_induction_used_as_vector(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, ptr [[START_2:%.*]], i64 [[N_VEC]]
@@ -68,7 +68,7 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP12]], i64 0
@@ -144,23 +144,23 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 1, [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP8]], 0
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll
index 54ba0a8c4d6b..5848d317ff14 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll
@@ -9,21 +9,21 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 %
 ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP6]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -40,17 +40,17 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 %
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP23:%.*]] = shl i64 [[TMP22]], 1
+; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw i64 [[TMP22]], 1
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP23]]
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP25:%.*]] = shl i64 [[TMP24]], 1
+; CHECK-NEXT:    [[TMP25:%.*]] = shl nuw i64 [[TMP24]], 1
 ; CHECK-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 [[N]], [[TMP25]]
 ; CHECK-NEXT:    [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP17:%.*]] = shl i64 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       vec.epilog.vector.body:
 ; CHECK-NEXT:    [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll
index b292e4304673..abee8b9340ca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll
@@ -44,7 +44,7 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe
 ; NORMAL_DEP_LIMIT-SAME: ptr noalias [[BUCKETS:%.*]], ptr [[ARRAY:%.*]], ptr [[INDICES:%.*]], ptr [[OTHER:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; NORMAL_DEP_LIMIT-NEXT:  entry:
 ; NORMAL_DEP_LIMIT-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NORMAL_DEP_LIMIT-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; NORMAL_DEP_LIMIT-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
 ; NORMAL_DEP_LIMIT-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8)
 ; NORMAL_DEP_LIMIT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
 ; NORMAL_DEP_LIMIT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
@@ -67,11 +67,11 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe
 ; NORMAL_DEP_LIMIT-NEXT:    br i1 [[CONFLICT_RDX9]], label [[SCALAR_PH]], label [[ENTRY:%.*]]
 ; NORMAL_DEP_LIMIT:       vector.ph:
 ; NORMAL_DEP_LIMIT-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NORMAL_DEP_LIMIT-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP4]], 2
+; NORMAL_DEP_LIMIT-NEXT:    [[TMP8:%.*]] = shl nuw i64 [[TMP4]], 2
 ; NORMAL_DEP_LIMIT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
 ; NORMAL_DEP_LIMIT-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NORMAL_DEP_LIMIT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; NORMAL_DEP_LIMIT-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
+; NORMAL_DEP_LIMIT-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2
 ; NORMAL_DEP_LIMIT-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NORMAL_DEP_LIMIT-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP6]] to i32
 ; NORMAL_DEP_LIMIT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP9]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 71d03afa6b6f..1cdc290757dd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -14,16 +14,16 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; NONE-NEXT:  entry:
 ; NONE-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
 ; NONE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NONE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NONE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP1]]
 ; NONE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NONE:       vector.ph:
 ; NONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NONE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP3]]
 ; NONE-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
 ; NONE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; NONE-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; NONE-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; NONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; NONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; NONE-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -56,18 +56,18 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
 ; DATA-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
 ; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; DATA-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; DATA-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; DATA-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DATA:       vector.ph:
 ; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; DATA-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
 ; DATA-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
 ; DATA-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; DATA-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; DATA-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; DATA-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; DATA-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; DATA-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; DATA-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -100,19 +100,19 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_NO_LANEMASK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
 ; DATA_NO_LANEMASK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
 ; DATA_NO_LANEMASK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; DATA_NO_LANEMASK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; DATA_NO_LANEMASK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; DATA_NO_LANEMASK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DATA_NO_LANEMASK:       vector.ph:
 ; DATA_NO_LANEMASK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA_NO_LANEMASK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; DATA_NO_LANEMASK-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
 ; DATA_NO_LANEMASK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
 ; DATA_NO_LANEMASK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; DATA_NO_LANEMASK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; DATA_NO_LANEMASK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
 ; DATA_NO_LANEMASK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; DATA_NO_LANEMASK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
 ; DATA_NO_LANEMASK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; DATA_NO_LANEMASK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; DATA_NO_LANEMASK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
@@ -152,18 +152,18 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
 ; DATA_AND_CONTROL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
 ; DATA_AND_CONTROL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; DATA_AND_CONTROL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; DATA_AND_CONTROL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; DATA_AND_CONTROL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DATA_AND_CONTROL:       vector.ph:
 ; DATA_AND_CONTROL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA_AND_CONTROL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; DATA_AND_CONTROL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
 ; DATA_AND_CONTROL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
 ; DATA_AND_CONTROL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; DATA_AND_CONTROL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; DATA_AND_CONTROL-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; DATA_AND_CONTROL-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; DATA_AND_CONTROL-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; DATA_AND_CONTROL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; DATA_AND_CONTROL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -200,15 +200,15 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DATA_AND_CONTROL_NO_RT_CHECK:       vector.ph:
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 4
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
index ce8492cd7736..e9de5e21228f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -11,9 +11,9 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -36,12 +36,12 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; INTERLEAVE-NEXT:  entry:
 ; INTERLEAVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
 ; INTERLEAVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
 ; INTERLEAVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
 ; INTERLEAVE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 1
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1
 ; INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
 ; INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 [[N]])
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -65,7 +65,7 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = shl i64 [[TMP15]], 1
+; INTERLEAVE-NEXT:    [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1
 ; INTERLEAVE-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
 ; INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
 ; INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]])
@@ -97,9 +97,9 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -122,12 +122,12 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; INTERLEAVE-NEXT:  entry:
 ; INTERLEAVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; INTERLEAVE-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
 ; INTERLEAVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; INTERLEAVE-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
 ; INTERLEAVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
 ; INTERLEAVE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 1
+; INTERLEAVE-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1
 ; INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
 ; INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 [[N]])
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -151,7 +151,7 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; INTERLEAVE-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT:    [[TMP16:%.*]] = shl i64 [[TMP15]], 1
+; INTERLEAVE-NEXT:    [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1
 ; INTERLEAVE-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
 ; INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
 ; INTERLEAVE-NEXT:    [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
index f178805608eb..3545c6b2239d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -10,11 +10,11 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ; WIDE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; WIDE:       vector.ph:
 ; WIDE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; WIDE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; WIDE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; WIDE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; WIDE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; WIDE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; WIDE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; WIDE-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; WIDE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; WIDE:       vector.body:
 ; WIDE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
index 2587b932dd76..2b4d8b99847d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
@@ -25,16 +25,16 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; ZVFBFMIN-NEXT:  [[ENTRY:.*]]:
 ; ZVFBFMIN-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFBFMIN-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; ZVFBFMIN-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
 ; ZVFBFMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]]
 ; ZVFBFMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; ZVFBFMIN:       [[VECTOR_PH]]:
 ; ZVFBFMIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFBFMIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ZVFBFMIN-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; ZVFBFMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
 ; ZVFBFMIN-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; ZVFBFMIN-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFBFMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP12]], 8
+; ZVFBFMIN-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP12]], 8
 ; ZVFBFMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; ZVFBFMIN:       [[VECTOR_BODY]]:
 ; ZVFBFMIN-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -142,16 +142,16 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
 ; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; ZVFBFMIN-NEXT:  [[ENTRY:.*]]:
 ; ZVFBFMIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFBFMIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVFBFMIN-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; ZVFBFMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; ZVFBFMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; ZVFBFMIN:       [[VECTOR_PH]]:
 ; ZVFBFMIN-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFBFMIN-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVFBFMIN-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; ZVFBFMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; ZVFBFMIN-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; ZVFBFMIN-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFBFMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVFBFMIN-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; ZVFBFMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; ZVFBFMIN:       [[VECTOR_BODY]]:
 ; ZVFBFMIN-NEXT:    [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
index 362cf093f180..75ae6df5fcd3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
@@ -63,16 +63,16 @@ define i32 @any_of_reduction_used_in_blend_with_multiple_phis(ptr %src, i64 %N,
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_0]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index d41caca97e1f..f8b83ff41f51 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -12,18 +12,18 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = udiv i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
@@ -97,18 +97,18 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 333, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 333, [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
@@ -182,18 +182,18 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 333, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 333, [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
@@ -277,18 +277,18 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = udiv i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
@@ -374,18 +374,18 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 333, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 333, [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 333, [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 3)
@@ -479,18 +479,18 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[TMP1:%.*]] = udiv i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <vscale x 8 x i64> [[TMP11]], splat (i64 3)
@@ -581,16 +581,16 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 {
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -660,16 +660,16 @@ define void @empty_block_with_phi_2(ptr %src, i64 %N) #0 {
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -746,7 +746,7 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[UMIN7]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umax.i64(i64 40, i64 [[TMP5]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP6]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
@@ -773,11 +773,11 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP15]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 2
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul <vscale x 2 x i64> [[TMP18]], splat (i64 3)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 4450678871ac..22c56c89fa16 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -19,18 +19,18 @@ define void @dead_load(ptr %p, i16 %start) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[UMIN]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 8
 ; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[START_EXT]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
@@ -93,7 +93,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 8, i32 [[TMP1]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
@@ -106,13 +106,13 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 4
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 4)
@@ -317,16 +317,16 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-SAME: ptr [[DST:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 37, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 37, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 37, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[N_VEC]], 9
@@ -402,16 +402,16 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP9]], splat (i64 2)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
index eccba717f747..a2faaaaf06df 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
@@ -14,16 +14,16 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) {
 ; CHECK-LABEL: @vector_add(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -76,16 +76,16 @@ define i64 @vector_add_reduce(ptr noalias nocapture %a) {
 ; CHECK-LABEL: @vector_add_reduce(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index 73a71f7557f3..db780c3c12c7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -11,16 +11,16 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @vector_udiv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -111,16 +111,16 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @vector_sdiv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -211,16 +211,16 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @vector_urem(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -311,16 +311,16 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @vector_srem(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -411,16 +411,16 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @predicated_udiv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
@@ -536,16 +536,16 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-LABEL: @predicated_sdiv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
@@ -661,16 +661,16 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-LABEL: @predicated_udiv_by_constant(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -781,16 +781,16 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-LABEL: @predicated_sdiv_by_constant(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -901,16 +901,16 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-LABEL: @predicated_sdiv_by_minus_one(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
index b8c030c0c60b..0b307c28cecc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
@@ -25,16 +25,16 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; ZVFHMIN-NEXT:  [[ENTRY:.*]]:
 ; ZVFHMIN-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; ZVFHMIN-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
 ; ZVFHMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]]
 ; ZVFHMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; ZVFHMIN:       [[VECTOR_PH]]:
 ; ZVFHMIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; ZVFHMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
 ; ZVFHMIN-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; ZVFHMIN-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP12]], 8
+; ZVFHMIN-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP12]], 8
 ; ZVFHMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; ZVFHMIN:       [[VECTOR_BODY]]:
 ; ZVFHMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
index 1319454b7a1a..283688c8e447 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
@@ -11,13 +11,13 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; CHECK-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP16]]
@@ -28,11 +28,11 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP9]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP18]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP19]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP19]], 4
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -77,13 +77,13 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; ZVFHMIN-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; ZVFHMIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; ZVFHMIN-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
 ; ZVFHMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
 ; ZVFHMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; ZVFHMIN:       [[VECTOR_MEMCHECK]]:
 ; ZVFHMIN-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; ZVFHMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; ZVFHMIN-NEXT:    [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; ZVFHMIN-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
@@ -94,11 +94,11 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; ZVFHMIN:       [[VECTOR_PH]]:
 ; ZVFHMIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; ZVFHMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]]
 ; ZVFHMIN-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; ZVFHMIN-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
 ; ZVFHMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; ZVFHMIN:       [[VECTOR_BODY]]:
 ; ZVFHMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -166,13 +166,13 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; CHECK-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP16]]
@@ -183,11 +183,11 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP9]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP18]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP19]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP19]], 4
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -232,13 +232,13 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; ZVFHMIN-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; ZVFHMIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; ZVFHMIN-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
 ; ZVFHMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
 ; ZVFHMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; ZVFHMIN:       [[VECTOR_MEMCHECK]]:
 ; ZVFHMIN-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; ZVFHMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; ZVFHMIN-NEXT:    [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; ZVFHMIN-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
@@ -249,11 +249,11 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; ZVFHMIN:       [[VECTOR_PH]]:
 ; ZVFHMIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; ZVFHMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]]
 ; ZVFHMIN-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; ZVFHMIN-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
 ; ZVFHMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; ZVFHMIN:       [[VECTOR_BODY]]:
 ; ZVFHMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -321,13 +321,13 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; CHECK-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP8]], 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP15]], 2
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP4]], 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP16]]
@@ -338,11 +338,11 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP9]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP18]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP19]], 2
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -387,13 +387,13 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; ZVFHMIN-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; ZVFHMIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; ZVFHMIN-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
 ; ZVFHMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
 ; ZVFHMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; ZVFHMIN:       [[VECTOR_MEMCHECK]]:
 ; ZVFHMIN-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; ZVFHMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
 ; ZVFHMIN-NEXT:    [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; ZVFHMIN-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
@@ -404,11 +404,11 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; ZVFHMIN:       [[VECTOR_PH]]:
 ; ZVFHMIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
 ; ZVFHMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]]
 ; ZVFHMIN-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; ZVFHMIN-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
 ; ZVFHMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; ZVFHMIN:       [[VECTOR_BODY]]:
 ; ZVFHMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -476,13 +476,13 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; CHECK-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP8]], 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP15]], 2
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP4]], 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP16]]
@@ -493,11 +493,11 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[TMP9]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP18]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP19]], 2
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -542,13 +542,13 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; ZVFHMIN-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; ZVFHMIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; ZVFHMIN-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
 ; ZVFHMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
 ; ZVFHMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; ZVFHMIN:       [[VECTOR_MEMCHECK]]:
 ; ZVFHMIN-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; ZVFHMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
 ; ZVFHMIN-NEXT:    [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; ZVFHMIN-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
@@ -559,11 +559,11 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; ZVFHMIN:       [[VECTOR_PH]]:
 ; ZVFHMIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
 ; ZVFHMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]]
 ; ZVFHMIN-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; ZVFHMIN-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
 ; ZVFHMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; ZVFHMIN:       [[VECTOR_BODY]]:
 ; ZVFHMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -631,13 +631,13 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; CHECK-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
 ; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP18]]
@@ -648,11 +648,11 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -697,13 +697,13 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; ZVFHMIN-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; ZVFHMIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; ZVFHMIN-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
 ; ZVFHMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
 ; ZVFHMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; ZVFHMIN:       [[VECTOR_MEMCHECK]]:
 ; ZVFHMIN-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; ZVFHMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; ZVFHMIN-NEXT:    [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; ZVFHMIN-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
@@ -714,11 +714,11 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; ZVFHMIN:       [[VECTOR_PH]]:
 ; ZVFHMIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; ZVFHMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]]
 ; ZVFHMIN-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; ZVFHMIN-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8
 ; ZVFHMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; ZVFHMIN:       [[VECTOR_BODY]]:
 ; ZVFHMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -786,13 +786,13 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; CHECK-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP13]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP14]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
 ; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP18]]
@@ -803,11 +803,11 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -852,13 +852,13 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    [[INPUT12:%.*]] = ptrtoint ptr [[INPUT1]] to i64
 ; ZVFHMIN-NEXT:    [[OUTPUT1:%.*]] = ptrtoint ptr [[OUTPUT]] to i64
 ; ZVFHMIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; ZVFHMIN-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; ZVFHMIN-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
 ; ZVFHMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP2]]
 ; ZVFHMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; ZVFHMIN:       [[VECTOR_MEMCHECK]]:
 ; ZVFHMIN-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; ZVFHMIN-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; ZVFHMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; ZVFHMIN-NEXT:    [[TMP6:%.*]] = sub i64 [[OUTPUT1]], [[INPUT12]]
 ; ZVFHMIN-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
@@ -869,11 +869,11 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
 ; ZVFHMIN-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; ZVFHMIN:       [[VECTOR_PH]]:
 ; ZVFHMIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ZVFHMIN-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; ZVFHMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP10]]
 ; ZVFHMIN-NEXT:    [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
 ; ZVFHMIN-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; ZVFHMIN-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8
 ; ZVFHMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; ZVFHMIN:       [[VECTOR_BODY]]:
 ; ZVFHMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 5c15660e8713..e6825faf3f8d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -21,7 +21,7 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[UMIN21]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.umax.i64(i64 128, i64 [[TMP7]])
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP5]], [[TMP8]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
@@ -58,13 +58,13 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 {
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP46:%.*]] = mul i64 [[TMP45]], 8
+; CHECK-NEXT:    [[TMP46:%.*]] = mul nuw i64 [[TMP45]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], [[TMP46]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP52:%.*]] = mul i64 [[TMP51]], 8
+; CHECK-NEXT:    [[TMP52:%.*]] = mul nuw i64 [[TMP51]], 8
 ; CHECK-NEXT:    [[TMP49:%.*]] = mul i64 [[N_VEC]], 3
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[X_I64]], [[TMP49]]
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 8e90287bac2a..a8e6ef7ebfec 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -15,16 +15,16 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; OUTLOOP-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; OUTLOOP:       for.body.preheader:
 ; OUTLOOP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; OUTLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 4
 ; OUTLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
 ; OUTLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; OUTLOOP:       vector.ph:
 ; OUTLOOP-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; OUTLOOP-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 4
 ; OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
 ; OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; OUTLOOP-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; OUTLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 4
 ; OUTLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; OUTLOOP:       vector.body:
 ; OUTLOOP-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -68,16 +68,16 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; INLOOP-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; INLOOP:       for.body.preheader:
 ; INLOOP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 8
+; INLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 8
 ; INLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
 ; INLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; INLOOP:       vector.ph:
 ; INLOOP-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 8
+; INLOOP-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 8
 ; INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
 ; INLOOP-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; INLOOP-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 8
+; INLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 8
 ; INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INLOOP:       vector.body:
 ; INLOOP-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -123,13 +123,13 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; IF-EVL-OUTLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-OUTLOOP:       vector.ph:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], 1
 ; IF-EVL-OUTLOOP-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
 ; IF-EVL-OUTLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL-OUTLOOP:       vector.body:
 ; IF-EVL-OUTLOOP-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -179,13 +179,13 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; IF-EVL-INLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-INLOOP:       vector.ph:
 ; IF-EVL-INLOOP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 8
+; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 8
 ; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], 1
 ; IF-EVL-INLOOP-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]]
 ; IF-EVL-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-INLOOP-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], 8
+; IF-EVL-INLOOP-NEXT:    [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 8
 ; IF-EVL-INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL-INLOOP:       vector.body:
 ; IF-EVL-INLOOP-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -250,16 +250,16 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; OUTLOOP-LABEL: @smin(
 ; OUTLOOP-NEXT:  entry:
 ; OUTLOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; OUTLOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; OUTLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; OUTLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; OUTLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; OUTLOOP:       vector.ph:
 ; OUTLOOP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; OUTLOOP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; OUTLOOP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; OUTLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; OUTLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; OUTLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; OUTLOOP-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; OUTLOOP-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; OUTLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -299,16 +299,16 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; INLOOP-LABEL: @smin(
 ; INLOOP-NEXT:  entry:
 ; INLOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; INLOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; INLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; INLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; INLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; INLOOP:       vector.ph:
 ; INLOOP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; INLOOP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; INLOOP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; INLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; INLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; INLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INLOOP:       vector.body:
 ; INLOOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -346,18 +346,18 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT:  entry:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-OUTLOOP:       vector.ph:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-OUTLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; IF-EVL-OUTLOOP-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-OUTLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -403,18 +403,18 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:  entry:
 ; IF-EVL-INLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-INLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-INLOOP:       vector.ph:
 ; IF-EVL-INLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-INLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL-INLOOP:       vector.body:
 ; IF-EVL-INLOOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 61a3e3561ad9..85ccbab0e670 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -7,16 +7,16 @@ define void @load_store_factor2_i32(ptr %p) {
 ; CHECK-LABEL: @load_store_factor2_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -103,16 +103,16 @@ define void @load_store_factor2_i32(ptr %p) {
 ; SCALABLE-LABEL: @load_store_factor2_i32(
 ; SCALABLE-NEXT:  entry:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -181,16 +181,16 @@ define void @load_store_factor2_i64(ptr %p) {
 ; CHECK-LABEL: @load_store_factor2_i64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -277,16 +277,16 @@ define void @load_store_factor2_i64(ptr %p) {
 ; SCALABLE-LABEL: @load_store_factor2_i64(
 ; SCALABLE-NEXT:  entry:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -355,16 +355,16 @@ define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -467,16 +467,16 @@ define void @load_store_factor3_i32(ptr %p) {
 ; SCALABLE-LABEL: @load_store_factor3_i32(
 ; SCALABLE-NEXT:  entry:
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; SCALABLE-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -558,16 +558,16 @@ define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -670,16 +670,16 @@ define void @load_store_factor3_i64(ptr %p) {
 ; SCALABLE-LABEL: @load_store_factor3_i64(
 ; SCALABLE-NEXT:  entry:
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; SCALABLE-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -761,16 +761,16 @@ define void @load_store_factor4(ptr %p) {
 ; CHECK-LABEL: @load_store_factor4(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -887,16 +887,16 @@ define void @load_store_factor4(ptr %p) {
 ; SCALABLE-LABEL: @load_store_factor4(
 ; SCALABLE-NEXT:  entry:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2171,16 +2171,16 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: @combine_load_factor2_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2273,16 +2273,16 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-LABEL: @combine_load_factor2_i32(
 ; SCALABLE-NEXT:  entry:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2351,16 +2351,16 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: @combine_load_factor2_i64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2453,16 +2453,16 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-LABEL: @combine_load_factor2_i64(
 ; SCALABLE-NEXT:  entry:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 79425ae3a67e..c0548dd8ca54 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -11,16 +11,15 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:  entry:
 ; SCALAR_EPILOGUE-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; SCALAR_EPILOGUE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_EPILOGUE:       vector.ph:
 ; SCALAR_EPILOGUE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
@@ -63,12 +62,12 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING:       vector.ph:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
@@ -180,16 +179,15 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:  entry:
 ; SCALAR_EPILOGUE-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; SCALAR_EPILOGUE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
-; SCALAR_EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
+; SCALAR_EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_EPILOGUE:       vector.ph:
 ; SCALAR_EPILOGUE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
@@ -248,12 +246,12 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING:       vector.ph:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
index 964615b03049..4d59f272d8b9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
@@ -48,16 +48,16 @@ define void @load_store(ptr %p) {
 ; LMUL2-LABEL: @load_store(
 ; LMUL2-NEXT:  entry:
 ; LMUL2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; LMUL2-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; LMUL2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; LMUL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; LMUL2:       vector.ph:
 ; LMUL2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL2-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; LMUL2-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; LMUL2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; LMUL2-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; LMUL2-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL2-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; LMUL2-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; LMUL2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; LMUL2:       vector.body:
 ; LMUL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -90,16 +90,16 @@ define void @load_store(ptr %p) {
 ; LMUL4-LABEL: @load_store(
 ; LMUL4-NEXT:  entry:
 ; LMUL4-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL4-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; LMUL4-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; LMUL4-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; LMUL4-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; LMUL4:       vector.ph:
 ; LMUL4-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL4-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; LMUL4-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; LMUL4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; LMUL4-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; LMUL4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL4-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; LMUL4-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; LMUL4-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; LMUL4:       vector.body:
 ; LMUL4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -132,16 +132,16 @@ define void @load_store(ptr %p) {
 ; LMUL8-LABEL: @load_store(
 ; LMUL8-NEXT:  entry:
 ; LMUL8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL8-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; LMUL8-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; LMUL8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; LMUL8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; LMUL8:       vector.ph:
 ; LMUL8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL8-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; LMUL8-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; LMUL8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; LMUL8-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; LMUL8-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL8-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; LMUL8-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
 ; LMUL8-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; LMUL8:       vector.body:
 ; LMUL8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 17c737157a56..158022f1879a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -126,13 +126,13 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 8, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
index 3e4d337c0706..4ccc45d4daf9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
@@ -12,16 +12,16 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {
 ; VLENUNK-LABEL: @test(
 ; VLENUNK-NEXT:  entry:
 ; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; VLENUNK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK:       vector.ph:
 ; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; VLENUNK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; VLENUNK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
 ; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; VLENUNK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 2276b592aac8..c94b2026c4f7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -17,7 +17,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-LABEL: @foo4(
 ; RV32-NEXT:  entry:
 ; RV32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; RV32-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; RV32-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
 ; RV32-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]]
 ; RV32-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
@@ -35,11 +35,11 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; RV32:       vector.ph:
 ; RV32-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; RV32-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
 ; RV32-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV32-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; RV32-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV32-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV32-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
@@ -96,7 +96,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-LABEL: @foo4(
 ; RV64-NEXT:  entry:
 ; RV64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; RV64-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; RV64-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
 ; RV64-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]]
 ; RV64-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
@@ -114,11 +114,11 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; RV64:       vector.ph:
 ; RV64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; RV64-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]]
 ; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]]
 ; RV64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; RV64-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; RV64-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 16
 ; RV64-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; RV64-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index 8c29da02b813..b8d5cbd5d47b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -11,16 +11,16 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
 ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; V-NEXT:  entry:
 ; V-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; V-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; V-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; V:       vector.ph:
 ; V-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; V-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; V-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; V-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; V-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; V:       vector.body:
 ; V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -48,16 +48,16 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
 ; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; ZVQDOTQ-NEXT:  entry:
 ; ZVQDOTQ-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; ZVQDOTQ-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; ZVQDOTQ-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; ZVQDOTQ:       vector.ph:
 ; ZVQDOTQ-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; ZVQDOTQ-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; ZVQDOTQ-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; ZVQDOTQ-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ZVQDOTQ:       vector.body:
 ; ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -183,16 +183,16 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
 ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; V-NEXT:  entry:
 ; V-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; V-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; V-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; V:       vector.ph:
 ; V-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; V-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; V-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; V-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; V-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; V:       vector.body:
 ; V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -220,16 +220,16 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
 ; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; ZVQDOTQ-NEXT:  entry:
 ; ZVQDOTQ-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; ZVQDOTQ-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; ZVQDOTQ-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; ZVQDOTQ:       vector.ph:
 ; ZVQDOTQ-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; ZVQDOTQ-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; ZVQDOTQ-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; ZVQDOTQ-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ZVQDOTQ:       vector.body:
 ; ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -355,16 +355,16 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
 ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; V-NEXT:  entry:
 ; V-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; V-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; V-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; V:       vector.ph:
 ; V-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; V-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; V-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; V-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; V-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; V:       vector.body:
 ; V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -392,16 +392,16 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
 ; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; ZVQDOTQ-NEXT:  entry:
 ; ZVQDOTQ-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; ZVQDOTQ-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; ZVQDOTQ-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; ZVQDOTQ:       vector.ph:
 ; ZVQDOTQ-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; ZVQDOTQ-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; ZVQDOTQ-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; ZVQDOTQ-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ZVQDOTQ:       vector.body:
 ; ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -526,16 +526,16 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
 ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; V-NEXT:  entry:
 ; V-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; V-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; V-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; V:       vector.ph:
 ; V-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; V-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; V-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; V-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; V-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; V-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; V:       vector.body:
 ; V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -563,16 +563,16 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
 ; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; ZVQDOTQ-NEXT:  entry:
 ; ZVQDOTQ-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; ZVQDOTQ-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; ZVQDOTQ-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; ZVQDOTQ:       vector.ph:
 ; ZVQDOTQ-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; ZVQDOTQ-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; ZVQDOTQ-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; ZVQDOTQ-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ZVQDOTQ:       vector.body:
 ; ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
index 2f9ff20bf0f9..8088a6507c25 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
@@ -10,16 +10,16 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64
 ; CHECK-SAME: ptr [[ARG:%.*]], i64 [[A:%.*]], i64 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1001, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1001, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1001, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[B]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll
index e3e727b41c02..85163c79072b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll
@@ -6,16 +6,16 @@ define float @s311(float %a_0, float %s311_sum) {
 ; CHECK-SAME: float [[A_0:%.*]], float [[S311_SUM:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1200, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1200, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 1200, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[A_0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
index a28673cf8e55..09b274de3021 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse-output.ll
@@ -22,16 +22,16 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; RV64-NEXT:  [[ENTRY:.*]]:
 ; RV64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; RV64-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; RV64-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
 ; RV64-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; RV64:       [[VECTOR_PH]]:
 ; RV64-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; RV64-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
 ; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
 ; RV64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV64-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; RV64-NEXT:    [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
 ; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64:       [[VECTOR_BODY]]:
@@ -79,16 +79,16 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; RV32-NEXT:  [[ENTRY:.*]]:
 ; RV32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; RV32-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; RV32-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
 ; RV32-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; RV32:       [[VECTOR_PH]]:
 ; RV32-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; RV32-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
 ; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
 ; RV32-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV32-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; RV32-NEXT:    [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
 ; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV32:       [[VECTOR_BODY]]:
@@ -138,16 +138,16 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; RV64-UF2-NEXT:  [[ENTRY:.*]]:
 ; RV64-UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; RV64-UF2-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; RV64-UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
 ; RV64-UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; RV64-UF2:       [[VECTOR_PH]]:
 ; RV64-UF2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; RV64-UF2-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; RV64-UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
 ; RV64-UF2-NEXT:    [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
 ; RV64-UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV64-UF2-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; RV64-UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; RV64-UF2-NEXT:    [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
 ; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -228,16 +228,16 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
 ; RV64-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
 ; RV64-NEXT:  [[ENTRY:.*]]:
 ; RV64-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; RV64-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; RV64-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
 ; RV64-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; RV64:       [[VECTOR_PH]]:
 ; RV64-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; RV64-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; RV64-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
 ; RV64-NEXT:    [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
 ; RV64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV64-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; RV64-NEXT:    [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
 ; RV64-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV64:       [[VECTOR_BODY]]:
@@ -285,16 +285,16 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
 ; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
 ; RV32-NEXT:  [[ENTRY:.*]]:
 ; RV32-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; RV32-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; RV32-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
 ; RV32-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; RV32:       [[VECTOR_PH]]:
 ; RV32-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; RV32-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; RV32-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
 ; RV32-NEXT:    [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
 ; RV32-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV32-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV32-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; RV32-NEXT:    [[TMP6:%.*]] = sub i64 1023, [[N_VEC]]
 ; RV32-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; RV32:       [[VECTOR_BODY]]:
@@ -344,16 +344,16 @@ define void @vector_reverse_f32(ptr noalias %A, ptr noalias %B) {
 ; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
 ; RV64-UF2-NEXT:  [[ENTRY:.*]]:
 ; RV64-UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; RV64-UF2-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; RV64-UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1023, [[TMP1]]
 ; RV64-UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; RV64-UF2:       [[VECTOR_PH]]:
 ; RV64-UF2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; RV64-UF2-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; RV64-UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1023, [[TMP3]]
 ; RV64-UF2-NEXT:    [[N_VEC:%.*]] = sub i64 1023, [[N_MOD_VF]]
 ; RV64-UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; RV64-UF2-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV64-UF2-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; RV64-UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; RV64-UF2-NEXT:    [[TMP7:%.*]] = sub i64 1023, [[N_VEC]]
 ; RV64-UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index ba4c4b6d58ad..41252f519155 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -214,7 +214,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.memcheck>:
 ; CHECK-NEXT:    IR %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR %12 = mul i64 %11, 4
+; CHECK-NEXT:    IR %12 = mul nuw i64 %11, 4
 ; CHECK-NEXT:    IR %13 = mul i64 %12, 4
 ; CHECK-NEXT:    IR %14 = sub i64 %B1, %A2
 ; CHECK-NEXT:    IR %diff.check = icmp ult i64 %14, %13
@@ -222,11 +222,11 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.ph>:
 ; CHECK-NEXT:    IR %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR %16 = mul i64 %15, 4
+; CHECK-NEXT:    IR %16 = mul nuw i64 %15, 4
 ; CHECK-NEXT:    IR %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR %n.vec = sub i64 %0, %n.mod.vf
 ; CHECK-NEXT:    IR %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR %18 = mul i64 %17, 4
+; CHECK-NEXT:    IR %18 = mul nuw i64 %17, 4
 ; CHECK-NEXT:    vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
 ; CHECK-NEXT:    vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector.body
@@ -279,7 +279,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  for.body.preheader: ; preds = %entry
 ; CHECK-NEXT:    %0 = zext i32 %n to i64
 ; CHECK-NEXT:    %1 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    %2 = mul i64 %1, 4
+; CHECK-NEXT:    %2 = mul nuw i64 %1, 4
 ; CHECK-NEXT:    %min.iters.check = icmp ult i64 %0, %2
 ; CHECK-NEXT:    br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
 ; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.scevcheck> in BB:vector.scevcheck
@@ -302,7 +302,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: filled BB:
 ; CHECK-NEXT:  vector.memcheck: ; preds = %vector.scevcheck
 ; CHECK-NEXT:    %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    %12 = mul i64 %11, 4
+; CHECK-NEXT:    %12 = mul nuw i64 %11, 4
 ; CHECK-NEXT:    %13 = mul i64 %12, 4
 ; CHECK-NEXT:    %14 = sub i64 %B1, %A2
 ; CHECK-NEXT:    %diff.check = icmp ult i64 %14, %13
@@ -312,11 +312,11 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: filled BB:
 ; CHECK-NEXT:  vector.ph: ; preds = %vector.memcheck
 ; CHECK-NEXT:    %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    %16 = mul i64 %15, 4
+; CHECK-NEXT:    %16 = mul nuw i64 %15, 4
 ; CHECK-NEXT:    %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    %n.vec = sub i64 %0, %n.mod.vf
 ; CHECK-NEXT:    %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    %18 = mul i64 %17, 4
+; CHECK-NEXT:    %18 = mul nuw i64 %17, 4
 ; CHECK-NEXT:    %19 = sub i64 %0, %n.vec
 ; CHECK-NEXT:    %.cast = trunc i64 %n.vec to i32
 ; CHECK-NEXT:    %20 = sub i32 %n, %.cast
@@ -623,7 +623,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.memcheck>:
 ; CHECK-NEXT:    IR %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR %12 = mul i64 %11, 4
+; CHECK-NEXT:    IR %12 = mul nuw i64 %11, 4
 ; CHECK-NEXT:    IR %13 = mul i64 %12, 4
 ; CHECK-NEXT:    IR %14 = sub i64 %B1, %A2
 ; CHECK-NEXT:    IR %diff.check = icmp ult i64 %14, %13
@@ -631,11 +631,11 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.ph>:
 ; CHECK-NEXT:    IR %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR %16 = mul i64 %15, 4
+; CHECK-NEXT:    IR %16 = mul nuw i64 %15, 4
 ; CHECK-NEXT:    IR %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    IR %n.vec = sub i64 %0, %n.mod.vf
 ; CHECK-NEXT:    IR %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR %18 = mul i64 %17, 4
+; CHECK-NEXT:    IR %18 = mul nuw i64 %17, 4
 ; CHECK-NEXT:    vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
 ; CHECK-NEXT:    vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector.body
@@ -688,7 +688,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  for.body.preheader: ; preds = %entry
 ; CHECK-NEXT:    %0 = zext i32 %n to i64
 ; CHECK-NEXT:    %1 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    %2 = mul i64 %1, 4
+; CHECK-NEXT:    %2 = mul nuw i64 %1, 4
 ; CHECK-NEXT:    %min.iters.check = icmp ult i64 %0, %2
 ; CHECK-NEXT:    br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
 ; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.scevcheck> in BB:vector.scevcheck
@@ -711,7 +711,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: filled BB:
 ; CHECK-NEXT:  vector.memcheck: ; preds = %vector.scevcheck
 ; CHECK-NEXT:    %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    %12 = mul i64 %11, 4
+; CHECK-NEXT:    %12 = mul nuw i64 %11, 4
 ; CHECK-NEXT:    %13 = mul i64 %12, 4
 ; CHECK-NEXT:    %14 = sub i64 %B1, %A2
 ; CHECK-NEXT:    %diff.check = icmp ult i64 %14, %13
@@ -721,11 +721,11 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: filled BB:
 ; CHECK-NEXT:  vector.ph: ; preds = %vector.memcheck
 ; CHECK-NEXT:    %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    %16 = mul i64 %15, 4
+; CHECK-NEXT:    %16 = mul nuw i64 %15, 4
 ; CHECK-NEXT:    %n.mod.vf = urem i64 %0, %16
 ; CHECK-NEXT:    %n.vec = sub i64 %0, %n.mod.vf
 ; CHECK-NEXT:    %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    %18 = mul i64 %17, 4
+; CHECK-NEXT:    %18 = mul nuw i64 %17, 4
 ; CHECK-NEXT:    %19 = sub i64 %0, %n.vec
 ; CHECK-NEXT:    %.cast = trunc i64 %n.vec to i32
 ; CHECK-NEXT:    %20 = sub i32 %n, %.cast
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
index 3cba3de13b03..4fb02827b829 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
@@ -11,16 +11,16 @@ define void @test(ptr %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -132,16 +132,16 @@ define void @trivial_due_max_vscale(ptr %p) {
 ; CHECK-LABEL: @trivial_due_max_vscale(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -197,16 +197,16 @@ define void @no_high_lmul_or_interleave(ptr %p) {
 ; CHECK-LABEL: @no_high_lmul_or_interleave(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
index 5a658dbf0400..d1757c7e81ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
@@ -16,16 +16,16 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; VLENUNK-LABEL: @vector_add(
 ; VLENUNK-NEXT:  entry:
 ; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK:       vector.ph:
 ; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLENUNK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -60,16 +60,16 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; VLEN128-LABEL: @vector_add(
 ; VLEN128-NEXT:  entry:
 ; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128:       vector.ph:
 ; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLEN128-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLEN128-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -125,16 +125,16 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
 ; VLENUNK-LABEL: @vector_add_i32(
 ; VLENUNK-NEXT:  entry:
 ; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; VLENUNK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK:       vector.ph:
 ; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; VLENUNK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; VLENUNK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
 ; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -169,16 +169,16 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
 ; VLEN128-LABEL: @vector_add_i32(
 ; VLEN128-NEXT:  entry:
 ; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; VLEN128-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128:       vector.ph:
 ; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; VLEN128-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLEN128-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; VLEN128-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
 ; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -289,16 +289,16 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; VLENUNK-LABEL: @indexed_store(
 ; VLENUNK-NEXT:  entry:
 ; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK:       vector.ph:
 ; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLENUNK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -333,16 +333,16 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; VLEN128-LABEL: @indexed_store(
 ; VLEN128-NEXT:  entry:
 ; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128:       vector.ph:
 ; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLEN128-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLEN128-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -395,16 +395,16 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
 ; VLENUNK-LABEL: @indexed_load(
 ; VLENUNK-NEXT:  entry:
 ; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK:       vector.ph:
 ; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; VLENUNK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
 ; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VLENUNK:       vector.body:
 ; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -444,16 +444,16 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
 ; VLEN128-LABEL: @indexed_load(
 ; VLEN128-NEXT:  entry:
 ; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128:       vector.ph:
 ; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLEN128-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; VLEN128-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VLEN128:       vector.body:
 ; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -513,16 +513,16 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; VLENUNK-LABEL: @splat_int(
 ; VLENUNK-NEXT:  entry:
 ; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK:       vector.ph:
 ; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; VLENUNK-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -553,16 +553,16 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; VLEN128-LABEL: @splat_int(
 ; VLEN128-NEXT:  entry:
 ; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128:       vector.ph:
 ; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLEN128-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; VLEN128-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -609,16 +609,16 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
 ; VLENUNK-LABEL: @splat_ptr(
 ; VLENUNK-NEXT:  entry:
 ; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLENUNK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK:       vector.ph:
 ; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLENUNK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; VLENUNK-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[V:%.*]], i64 0
 ; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -649,16 +649,16 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
 ; VLEN128-LABEL: @splat_ptr(
 ; VLEN128-NEXT:  entry:
 ; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; VLEN128-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128:       vector.ph:
 ; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; VLEN128-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLEN128-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; VLEN128-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[V:%.*]], i64 0
 ; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index c6bcd7201777..ff9c58525e51 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -12,13 +12,13 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -75,13 +75,13 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -136,13 +136,13 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -205,13 +205,13 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -260,13 +260,13 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -351,13 +351,13 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index 076ada825d0b..01b4ad2e66b8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -33,11 +33,11 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 {
 ; SCALABLE-LABEL: @select_icmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; SCALABLE-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -104,11 +104,11 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0
 ; SCALABLE-LABEL: @select_fcmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; SCALABLE-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[X:%.*]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -173,11 +173,11 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; SCALABLE-LABEL: @select_const_i32_from_icmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; SCALABLE-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -240,11 +240,11 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64
 ; SCALABLE-LABEL: @select_i32_from_icmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; SCALABLE-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -307,11 +307,11 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; SCALABLE-LABEL: @select_const_i32_from_fcmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; SCALABLE-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -404,11 +404,11 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; SCALABLE-LABEL: @pred_select_const_i32_from_icmp
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; SCALABLE-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 79590f5060ad..25dac366ef73 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -7,18 +7,18 @@ define void @single_constant_stride_int_scaled(ptr %p) {
 ; CHECK-LABEL: @single_constant_stride_int_scaled(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
@@ -78,16 +78,16 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-LABEL: @single_constant_stride_int_iv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 64)
@@ -152,18 +152,18 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-LABEL: @single_constant_stride_ptr_iv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[N_VEC]], 8
 ; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP18]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -171,7 +171,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 8, [[TMP8]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
@@ -232,7 +232,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
 ; NOSTRIDED-LABEL: @single_stride_int_scaled(
 ; NOSTRIDED-NEXT:  entry:
 ; NOSTRIDED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NOSTRIDED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NOSTRIDED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
 ; NOSTRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; NOSTRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
@@ -241,11 +241,11 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; NOSTRIDED:       vector.ph:
 ; NOSTRIDED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED:       vector.body:
 ; NOSTRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -314,7 +314,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-LABEL: @single_stride_int_iv(
 ; NOSTRIDED-NEXT:  entry:
 ; NOSTRIDED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NOSTRIDED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NOSTRIDED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
 ; NOSTRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; NOSTRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
@@ -323,11 +323,11 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; NOSTRIDED:       vector.ph:
 ; NOSTRIDED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED:       vector.body:
 ; NOSTRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -438,7 +438,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    [[P3:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; NOSTRIDED-NEXT:    [[P21:%.*]] = ptrtoint ptr [[P2:%.*]] to i64
 ; NOSTRIDED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NOSTRIDED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NOSTRIDED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
 ; NOSTRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; NOSTRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
@@ -447,18 +447,18 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; NOSTRIDED:       vector.memcheck:
 ; NOSTRIDED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; NOSTRIDED-NEXT:    [[TMP6:%.*]] = sub i64 [[P21]], [[P3]]
 ; NOSTRIDED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
 ; NOSTRIDED-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; NOSTRIDED:       vector.ph:
 ; NOSTRIDED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; NOSTRIDED-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP8]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; NOSTRIDED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NOSTRIDED-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED:       vector.body:
 ; NOSTRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -495,7 +495,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-LABEL: @double_stride_int_scaled(
 ; STRIDED-NEXT:  entry:
 ; STRIDED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; STRIDED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; STRIDED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 80, i64 [[TMP1]])
 ; STRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; STRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
@@ -548,11 +548,11 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; STRIDED:       vector.ph:
 ; STRIDED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; STRIDED-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; STRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP9]]
 ; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; STRIDED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; STRIDED-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
 ; STRIDED-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; STRIDED-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
@@ -618,7 +618,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-LABEL: @double_stride_int_iv(
 ; NOSTRIDED-NEXT:  entry:
 ; NOSTRIDED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NOSTRIDED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NOSTRIDED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
 ; NOSTRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; NOSTRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
@@ -627,11 +627,11 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; NOSTRIDED:       vector.ph:
 ; NOSTRIDED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; NOSTRIDED-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; NOSTRIDED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; NOSTRIDED-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; NOSTRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED:       vector.body:
 ; NOSTRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -726,7 +726,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-LABEL: @double_stride_ptr_iv(
 ; STRIDED-NEXT:  entry:
 ; STRIDED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; STRIDED-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; STRIDED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
 ; STRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; STRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
@@ -750,11 +750,11 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; STRIDED:       vector.ph:
 ; STRIDED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; STRIDED-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; STRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP9]]
 ; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; STRIDED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; STRIDED-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; STRIDED-NEXT:    [[TMP10:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]]
 ; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
@@ -765,7 +765,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT:    [[POINTER_PHI11:%.*]] = phi ptr [ [[P2]], [[VECTOR_PH]] ], [ [[PTR_IND12:%.*]], [[VECTOR_BODY]] ]
 ; STRIDED-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; STRIDED-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
 ; STRIDED-NEXT:    [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP15]], 0
 ; STRIDED-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP18]], i64 0
@@ -777,7 +777,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[TMP21:%.*]] = mul <vscale x 4 x i64> [[TMP20]], [[DOTSPLAT10]]
 ; STRIDED-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP21]]
 ; STRIDED-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
+; STRIDED-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 4
 ; STRIDED-NEXT:    [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP13]]
 ; STRIDED-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP23]], 0
 ; STRIDED-NEXT:    [[DOTSPLATINSERT13:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP26]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index f83b5782d5ad..528cec077d8a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -9,16 +9,16 @@ define void @test_pr98413_zext_removed(ptr %src, ptr noalias %dst, i64 %x) {
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 97, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 97, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 97, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <vscale x 8 x i64> [[BROADCAST_SPLAT]] to <vscale x 8 x i8>
@@ -82,16 +82,16 @@ define void @test_pr98413_sext_removed(ptr %src, ptr noalias %dst, i64 %x) {
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 97, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 97, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 97, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <vscale x 8 x i64> [[BROADCAST_SPLAT]] to <vscale x 8 x i8>
@@ -158,13 +158,13 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 [[TMP3]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 9, [[TMP12]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP10]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP10]], 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -269,16 +269,16 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK-NEXT:    [[T:%.*]] = trunc i64 [[N]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[V]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[N]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[T]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
index a94fcbffcf3b..72afff279e6b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
@@ -12,13 +12,13 @@ define void @truncate_to_minimal_bitwidths_widen_cast_recipe(ptr %src) {
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 9, [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
index 3a929f3e43a7..a256e92c823e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
@@ -12,7 +12,7 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 -1, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
@@ -25,13 +25,13 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count)
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x ptr> poison, ptr [[DSTV]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 8 x ptr> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 827612cfe36d..71018451f59a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -14,16 +14,16 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; SCALABLE-NEXT:  [[ENTRY:.*]]:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE:       [[VECTOR_PH]]:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; SCALABLE:       [[VECTOR_BODY]]:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -94,13 +94,13 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; TF-SCALABLE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TF-SCALABLE:       [[VECTOR_PH]]:
 ; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
 ; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TF-SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TF-SCALABLE:       [[VECTOR_BODY]]:
 ; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -185,16 +185,16 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT:  [[ENTRY:.*]]:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE:       [[VECTOR_PH]]:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; SCALABLE:       [[VECTOR_BODY]]:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -315,16 +315,16 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT:  [[ENTRY:.*]]:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE:       [[VECTOR_PH]]:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
@@ -425,13 +425,13 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TF-SCALABLE:       [[VECTOR_PH]]:
 ; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
 ; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
@@ -551,16 +551,16 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT:  [[ENTRY:.*]]:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE:       [[VECTOR_PH]]:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; SCALABLE:       [[VECTOR_BODY]]:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -631,13 +631,13 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; TF-SCALABLE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TF-SCALABLE:       [[VECTOR_PH]]:
 ; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
 ; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TF-SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TF-SCALABLE:       [[VECTOR_BODY]]:
 ; TF-SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -722,16 +722,16 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT:  [[ENTRY:.*]]:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE:       [[VECTOR_PH]]:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -802,13 +802,13 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-SCALABLE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TF-SCALABLE:       [[VECTOR_PH]]:
 ; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
 ; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -893,16 +893,16 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT:  [[ENTRY:.*]]:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE:       [[VECTOR_PH]]:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -917,7 +917,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 1
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
-; SCALABLE-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], 2
+; SCALABLE-NEXT:    [[TMP13:%.*]] = mul nuw i32 [[TMP12]], 2
 ; SCALABLE-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP13]], 1
 ; SCALABLE-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 2 x i64> [[TMP9]], i32 [[TMP14]]
 ; SCALABLE-NEXT:    store i64 [[TMP15]], ptr [[B]], align 8
@@ -989,13 +989,13 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TF-SCALABLE:       [[VECTOR_PH]]:
 ; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
 ; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
@@ -1117,16 +1117,16 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT:  [[ENTRY:.*]]:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE:       [[VECTOR_PH]]:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
@@ -1226,13 +1226,13 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TF-SCALABLE:       [[VECTOR_PH]]:
 ; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
 ; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
@@ -1351,16 +1351,16 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; SCALABLE-NEXT:  [[ENTRY:.*]]:
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
 ; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; SCALABLE:       [[VECTOR_PH]]:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -1431,13 +1431,13 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-SCALABLE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TF-SCALABLE:       [[VECTOR_PH]]:
 ; TF-SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; TF-SCALABLE-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; TF-SCALABLE-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
 ; TF-SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    br label %[[VECTOR_BODY:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
index e1132e7b8935..16c575f5a817 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
@@ -19,19 +19,19 @@ define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -112,19 +112,19 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -205,19 +205,19 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -298,19 +298,19 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -391,19 +391,19 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -484,19 +484,19 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -577,19 +577,19 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -670,19 +670,19 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -763,19 +763,19 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -856,19 +856,19 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -949,19 +949,19 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1042,19 +1042,19 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1135,19 +1135,19 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1230,20 +1230,20 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1324,20 +1324,20 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1418,20 +1418,20 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1512,20 +1512,20 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1659,20 +1659,20 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
index 3128e40144e3..325f3fd2b968 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
@@ -18,13 +18,13 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
@@ -35,13 +35,13 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 4
+; IF-EVL-NEXT:    [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP28]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP28]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -132,13 +132,13 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
@@ -149,13 +149,13 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 4
+; IF-EVL-NEXT:    [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP28]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP28]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -246,13 +246,13 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
@@ -263,13 +263,13 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 4
+; IF-EVL-NEXT:    [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP28]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP28]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -360,13 +360,13 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
@@ -377,13 +377,13 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 4
+; IF-EVL-NEXT:    [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP28]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP28]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -474,25 +474,25 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP21]], [[TMP20]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
+; IF-EVL-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP23]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP23]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -573,25 +573,25 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP7]], [[TMP6]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; IF-EVL-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP9]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP10]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP9]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; IF-EVL-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -672,26 +672,26 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 9, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; IF-EVL-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP26]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP26]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -780,26 +780,26 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 9, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP24]], [[TMP23]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; IF-EVL-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP26]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP26]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -888,26 +888,26 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP19]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP21]], [[TMP20]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP22]], 4
+; IF-EVL-NEXT:    [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP23]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP23]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll
index 50b600f8e8bd..107ca54c002f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll
@@ -15,7 +15,7 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 18, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
@@ -30,13 +30,13 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP9]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; IF-EVL-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -114,7 +114,7 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 18, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
@@ -129,13 +129,13 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP9]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; IF-EVL-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -213,7 +213,7 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 18, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
@@ -228,13 +228,13 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP9]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; IF-EVL-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -312,7 +312,7 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 14, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
@@ -327,13 +327,13 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP9]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; IF-EVL-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -411,7 +411,7 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 14, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
@@ -426,13 +426,13 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP9]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; IF-EVL-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -512,26 +512,26 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -611,26 +611,26 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -710,26 +710,26 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -809,26 +809,26 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -908,26 +908,26 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
 ; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[A1]], [[B2]]
 ; IF-EVL-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
 ; IF-EVL-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 2
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index 3f17c95f7ca9..8faec471cf5a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -27,18 +27,18 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT:  entry:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-OUTLOOP:       vector.ph:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-OUTLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
 ; IF-EVL-OUTLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL-OUTLOOP:       vector.body:
@@ -86,18 +86,18 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:  entry:
 ; IF-EVL-INLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-INLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-INLOOP:       vector.ph:
 ; IF-EVL-INLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
 ; IF-EVL-INLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
 ; IF-EVL-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
 ; IF-EVL-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL-INLOOP:       vector.body:
 ; IF-EVL-INLOOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -142,16 +142,16 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
 ; NO-VP-OUTLOOP-NEXT:  entry:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]]
 ; NO-VP-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP-OUTLOOP:       vector.ph:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
 ; NO-VP-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-OUTLOOP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
 ; NO-VP-OUTLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP-OUTLOOP:       vector.body:
@@ -193,16 +193,16 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
 ; NO-VP-INLOOP-NEXT:  entry:
 ; NO-VP-INLOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-INLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; NO-VP-INLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP-INLOOP:       vector.ph:
 ; NO-VP-INLOOP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-INLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP-INLOOP:       vector.body:
 ; NO-VP-INLOOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -264,19 +264,19 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT:  entry:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-OUTLOOP:       vector.ph:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-OUTLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
 ; IF-EVL-OUTLOOP-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; IF-EVL-OUTLOOP-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -337,18 +337,18 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:  entry:
 ; IF-EVL-INLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-INLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-INLOOP:       vector.ph:
 ; IF-EVL-INLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
 ; IF-EVL-INLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
 ; IF-EVL-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
 ; IF-EVL-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL-INLOOP:       vector.body:
 ; IF-EVL-INLOOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -396,16 +396,16 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
 ; NO-VP-OUTLOOP-NEXT:  entry:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]]
 ; NO-VP-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP-OUTLOOP:       vector.ph:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
 ; NO-VP-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-OUTLOOP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
 ; NO-VP-OUTLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP-OUTLOOP:       vector.body:
@@ -451,16 +451,16 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
 ; NO-VP-INLOOP-NEXT:  entry:
 ; NO-VP-INLOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-INLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; NO-VP-INLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP-INLOOP:       vector.ph:
 ; NO-VP-INLOOP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-INLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP-INLOOP:       vector.body:
 ; NO-VP-INLOOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -571,16 +571,16 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
 ; NO-VP-OUTLOOP-NEXT:  entry:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]]
 ; NO-VP-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP-OUTLOOP:       vector.ph:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
 ; NO-VP-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-OUTLOOP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
 ; NO-VP-OUTLOOP-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-VP-OUTLOOP-NEXT:    [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP12]], splat (i32 1)
@@ -632,16 +632,16 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
 ; NO-VP-INLOOP-NEXT:  entry:
 ; NO-VP-INLOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-INLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; NO-VP-INLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP-INLOOP:       vector.ph:
 ; NO-VP-INLOOP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-INLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-INLOOP-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-VP-INLOOP-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i32> [[TMP6]], splat (i32 1)
 ; NO-VP-INLOOP-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
@@ -761,16 +761,16 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
 ; NO-VP-OUTLOOP-NEXT:  entry:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[N]], [[TMP2]]
 ; NO-VP-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP-OUTLOOP:       vector.ph:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
 ; NO-VP-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-OUTLOOP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
 ; NO-VP-OUTLOOP-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-VP-OUTLOOP-NEXT:    [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP12]], splat (i32 1)
@@ -826,16 +826,16 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
 ; NO-VP-INLOOP-NEXT:  entry:
 ; NO-VP-INLOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-INLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; NO-VP-INLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP-INLOOP:       vector.ph:
 ; NO-VP-INLOOP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-INLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-INLOOP-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; NO-VP-INLOOP-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i32> [[TMP6]], splat (i32 1)
 ; NO-VP-INLOOP-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
index 2cada3a9ec16..3e83d8a757b5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
@@ -16,13 +16,13 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -112,13 +112,13 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -207,13 +207,13 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -302,13 +302,13 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; IF-EVL:       [[VECTOR_BODY]]:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
index a138fbf5b6e9..f651f2295b3b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
@@ -17,21 +17,21 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[TC]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP8]] to i32
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
 ; IF-EVL-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP11]]
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -79,18 +79,18 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] {
 ; NO-VP-NEXT:  [[ENTRY:.*]]:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; NO-VP:       [[VECTOR_PH]]:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; NO-VP-NEXT:    [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 4
 ; NO-VP-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP8]]
 ; NO-VP-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -110,7 +110,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; NO-VP-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; NO-VP:       [[MIDDLE_BLOCK]]:
 ; NO-VP-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
+; NO-VP-NEXT:    [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4
 ; NO-VP-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP19]]
 ; NO-VP-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]]
@@ -158,25 +158,25 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[TC]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP8]] to i32
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
 ; IF-EVL-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], 4
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul nuw i32 [[TMP12]], 4
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP13]], 1
 ; IF-EVL-NEXT:    [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 22, i32 [[TMP14]]
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -228,22 +228,22 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
 ; NO-VP-NEXT:  [[ENTRY:.*]]:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; NO-VP:       [[VECTOR_PH]]:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; NO-VP-NEXT:    [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 4
 ; NO-VP-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP8]]
 ; NO-VP-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; NO-VP-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
 ; NO-VP-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 22, i32 [[TMP11]]
 ; NO-VP-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -265,11 +265,11 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; NO-VP-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; NO-VP:       [[MIDDLE_BLOCK]]:
 ; NO-VP-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], 4
+; NO-VP-NEXT:    [[TMP22:%.*]] = mul nuw i32 [[TMP21]], 4
 ; NO-VP-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP22]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP23]]
 ; NO-VP-NEXT:    [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP25:%.*]] = mul i32 [[TMP24]], 4
+; NO-VP-NEXT:    [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 4
 ; NO-VP-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP25]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <vscale x 4 x i32> [[TMP15]], i32 [[TMP26]]
 ; NO-VP-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]]
@@ -320,29 +320,29 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[TC]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP39:%.*]] = trunc i64 [[TMP8]] to i32
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
 ; IF-EVL-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], 4
+; IF-EVL-NEXT:    [[TMP13:%.*]] = mul nuw i32 [[TMP12]], 4
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP13]], 1
 ; IF-EVL-NEXT:    [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 22, i32 [[TMP14]]
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], 4
+; IF-EVL-NEXT:    [[TMP16:%.*]] = mul nuw i32 [[TMP15]], 4
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP16]], 1
 ; IF-EVL-NEXT:    [[VECTOR_RECUR_INIT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 11, i32 [[TMP17]]
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -400,26 +400,26 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
 ; NO-VP-NEXT:  [[ENTRY:.*]]:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; NO-VP:       [[VECTOR_PH]]:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; NO-VP-NEXT:    [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 4
 ; NO-VP-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP8]]
 ; NO-VP-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; NO-VP-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
 ; NO-VP-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 22, i32 [[TMP11]]
 ; NO-VP-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], 4
+; NO-VP-NEXT:    [[TMP13:%.*]] = mul nuw i32 [[TMP12]], 4
 ; NO-VP-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP13]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_INIT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 11, i32 [[TMP14]]
 ; NO-VP-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -444,15 +444,15 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; NO-VP-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; NO-VP:       [[MIDDLE_BLOCK]]:
 ; NO-VP-NEXT:    [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], 4
+; NO-VP-NEXT:    [[TMP27:%.*]] = mul nuw i32 [[TMP26]], 4
 ; NO-VP-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP27]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP28]]
 ; NO-VP-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 4
+; NO-VP-NEXT:    [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 4
 ; NO-VP-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <vscale x 4 x i32> [[TMP18]], i32 [[TMP31]]
 ; NO-VP-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP32]], 4
+; NO-VP-NEXT:    [[TMP33:%.*]] = mul nuw i32 [[TMP32]], 4
 ; NO-VP-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP33]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <vscale x 4 x i32> [[TMP19]], i32 [[TMP34]]
 ; NO-VP-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]]
@@ -526,18 +526,18 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
 ; NO-VP-NEXT:  [[ENTRY:.*]]:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; NO-VP:       [[VECTOR_PH]]:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; NO-VP-NEXT:    [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 4
 ; NO-VP-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP8]]
 ; NO-VP-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -557,11 +557,11 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; NO-VP-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; NO-VP:       [[MIDDLE_BLOCK]]:
 ; NO-VP-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
+; NO-VP-NEXT:    [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4
 ; NO-VP-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 2
 ; NO-VP-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP19]]
 ; NO-VP-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 4
+; NO-VP-NEXT:    [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 4
 ; NO-VP-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
 ; NO-VP-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP22]]
 ; NO-VP-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
index d9b1981869b2..c8d52efde4ac 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
@@ -16,18 +16,18 @@ define i32 @add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -67,16 +67,16 @@ define i32 @add(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @add(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -241,18 +241,18 @@ define i32 @or(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -292,16 +292,16 @@ define i32 @or(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @or(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -356,18 +356,18 @@ define i32 @and(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -407,16 +407,16 @@ define i32 @and(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @and(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -471,18 +471,18 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -522,16 +522,16 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @xor(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -586,18 +586,18 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -638,16 +638,16 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @smin(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -704,18 +704,18 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -756,16 +756,16 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @smax(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -822,18 +822,18 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -874,16 +874,16 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @umin(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -940,18 +940,18 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -992,16 +992,16 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @umax(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1058,18 +1058,18 @@ define float @fadd(ptr %a, i64 %n, float %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1109,16 +1109,16 @@ define float @fadd(ptr %a, i64 %n, float %start) {
 ; NO-VP-LABEL: @fadd(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1283,18 +1283,18 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1336,16 +1336,16 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
 ; NO-VP-LABEL: @fmin(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1403,18 +1403,18 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1456,16 +1456,16 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
 ; NO-VP-LABEL: @fmax(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1747,18 +1747,18 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1804,16 +1804,16 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
 ; NO-VP-LABEL: @fmuladd(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1876,18 +1876,18 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1931,16 +1931,16 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; NO-VP-LABEL: @anyof_icmp(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2000,18 +2000,18 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2055,16 +2055,16 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; NO-VP-LABEL: @anyof_fcmp(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index cd246053bcb3..38ef88457c6c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -15,19 +15,19 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -56,7 +56,7 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; IF-EVL-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
 ; IF-EVL-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 4
+; IF-EVL-NEXT:    [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 4
 ; IF-EVL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP31]]
 ; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP25]], ptr [[TMP29]], i32 4, <vscale x 4 x i1> [[TMP19]])
 ; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr [[TMP32]], i32 4, <vscale x 4 x i1> [[TMP20]])
@@ -87,16 +87,16 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; NO-VP-LABEL: @interleave(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP8]], 2
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
@@ -119,7 +119,7 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; NO-VP-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; NO-VP-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
 ; NO-VP-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; NO-VP-NEXT:    [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 4
 ; NO-VP-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP26]]
 ; NO-VP-NEXT:    store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
 ; NO-VP-NEXT:    store <vscale x 4 x i32> [[TMP21]], ptr [[TMP27]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
index 3e7483143c88..6d54b219a346 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
@@ -27,7 +27,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; IF-EVL-OUTLOOP-NEXT:  entry:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; IF-EVL-OUTLOOP:       vector.memcheck:
@@ -40,13 +40,13 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; IF-EVL-OUTLOOP-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[ENTRY:%.*]]
 ; IF-EVL-OUTLOOP:       vector.ph:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; IF-EVL-OUTLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP7]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; IF-EVL-OUTLOOP-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP10:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
 ; IF-EVL-OUTLOOP-NEXT:    br label [[FOR_BODY:%.*]]
 ; IF-EVL-OUTLOOP:       vector.body:
@@ -91,7 +91,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; IF-EVL-INLOOP-NEXT:  entry:
 ; IF-EVL-INLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-INLOOP-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; IF-EVL-INLOOP:       vector.memcheck:
@@ -104,13 +104,13 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; IF-EVL-INLOOP-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-INLOOP:       vector.ph:
 ; IF-EVL-INLOOP-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP9]], 1
 ; IF-EVL-INLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP10]]
 ; IF-EVL-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP9]]
 ; IF-EVL-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; IF-EVL-INLOOP-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
 ; IF-EVL-INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL-INLOOP:       vector.body:
 ; IF-EVL-INLOOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -152,7 +152,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; NO-VP-OUTLOOP-NEXT:  entry:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; NO-VP-OUTLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; NO-VP-OUTLOOP:       vector.memcheck:
@@ -165,11 +165,11 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; NO-VP-OUTLOOP-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; NO-VP-OUTLOOP:       vector.ph:
 ; NO-VP-OUTLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]]
 ; NO-VP-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-OUTLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-OUTLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; NO-VP-OUTLOOP-NEXT:    [[TMP8:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
 ; NO-VP-OUTLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP-OUTLOOP:       vector.body:
@@ -208,7 +208,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; NO-VP-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; NO-VP-INLOOP-NEXT:  entry:
 ; NO-VP-INLOOP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-INLOOP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; NO-VP-INLOOP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; NO-VP-INLOOP:       vector.memcheck:
@@ -221,11 +221,11 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; NO-VP-INLOOP-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; NO-VP-INLOOP:       vector.ph:
 ; NO-VP-INLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]]
 ; NO-VP-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-INLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-INLOOP-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; NO-VP-INLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; NO-VP-INLOOP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP-INLOOP:       vector.body:
 ; NO-VP-INLOOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
index 64f2bab302b8..a05aabc063b1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
@@ -14,18 +14,18 @@ define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = sub i32 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i32 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP19]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
 ; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -62,16 +62,16 @@ define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
 ; NO-VP-LABEL: @iv32(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[TMP10]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP1]], 4
+; NO-VP-NEXT:    [[TMP11:%.*]] = mul nuw i32 [[TMP1]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP11]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; NO-VP-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP12:%.*]] = mul nuw i32 [[TMP2]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll
index 723ee6402672..dcd15087ce3d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll
@@ -17,18 +17,18 @@ define void @trip_count_max_1024(ptr %p, i64 %tc) vscale_range(2, 1024) {
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TC]], i64 1)
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -92,18 +92,18 @@ define void @overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) {
 ; CHECK:       [[LOOP_PREHEADER]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[TC]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -168,13 +168,13 @@ define void @no_overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) {
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TC_ADD]], [[TMP2]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
index aad20331e29f..7609182c7690 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
@@ -14,18 +14,18 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
index 24983c21a8ac..bc28918dac68 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
@@ -16,18 +16,18 @@ define float @fadd(ptr noalias nocapture readonly %a, i64 %n) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
index af36f184ea82..53e0b2f45aaa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -14,18 +14,18 @@ define i32 @add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START:%.*]], i32 0
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
@@ -67,16 +67,16 @@ define i32 @add(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @add(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START:%.*]], i32 0
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
@@ -243,18 +243,18 @@ define i32 @or(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START:%.*]], i32 0
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
@@ -296,16 +296,16 @@ define i32 @or(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @or(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START:%.*]], i32 0
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
@@ -361,18 +361,18 @@ define i32 @and(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x i32> splat (i32 -1), i32 [[START:%.*]], i32 0
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
@@ -414,16 +414,16 @@ define i32 @and(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @and(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = insertelement <vscale x 4 x i32> splat (i32 -1), i32 [[START:%.*]], i32 0
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
@@ -479,18 +479,18 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START:%.*]], i32 0
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
@@ -532,16 +532,16 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @xor(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START:%.*]], i32 0
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
@@ -597,18 +597,18 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -653,16 +653,16 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @smin(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; NO-VP-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -722,18 +722,18 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -778,16 +778,16 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @smax(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; NO-VP-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -847,18 +847,18 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -903,16 +903,16 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @umin(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; NO-VP-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -972,18 +972,18 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1028,16 +1028,16 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-LABEL: @umax(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
 ; NO-VP-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1097,18 +1097,18 @@ define float @fadd(ptr %a, i64 %n, float %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[START:%.*]], i32 0
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
@@ -1150,16 +1150,16 @@ define float @fadd(ptr %a, i64 %n, float %start) {
 ; NO-VP-LABEL: @fadd(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[START:%.*]], i32 0
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
@@ -1326,18 +1326,18 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[START:%.*]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1382,16 +1382,16 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
 ; NO-VP-LABEL: @fmin(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[START:%.*]], i64 0
 ; NO-VP-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1451,18 +1451,18 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[START:%.*]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1507,16 +1507,16 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
 ; NO-VP-LABEL: @fmax(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[START:%.*]], i64 0
 ; NO-VP-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1800,18 +1800,18 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[START:%.*]], i32 0
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
@@ -1858,16 +1858,16 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
 ; NO-VP-LABEL: @fmuladd(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    [[TMP6:%.*]] = insertelement <vscale x 4 x float> splat (float -0.000000e+00), float [[START:%.*]], i32 0
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
@@ -1930,18 +1930,18 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1985,16 +1985,16 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; NO-VP-LABEL: @anyof_icmp(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2054,18 +2054,18 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -2109,16 +2109,16 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; NO-VP-LABEL: @anyof_fcmp(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 4d8166eaa46f..96db5bf4e9ac 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -15,13 +15,13 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -113,13 +113,13 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -244,13 +244,13 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
index ee32127f2889..336c242c13ad 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
@@ -18,13 +18,13 @@ define void @test(ptr %p) {
 ; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -333,13 +333,13 @@ define void @trivial_due_max_vscale(ptr %p) {
 ; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 82e8d3d6c611..81f52627b179 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -15,18 +15,18 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 3, [[SPEC_SELECT]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 -1, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 2
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
index 579bc450b83d..f8f397212e0e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -14,18 +14,18 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
 ; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -70,16 +70,16 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; NO-VP-LABEL: @foo(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index 8a3c334db187..69d4f96f7ba2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -21,13 +21,13 @@
 
  ; IF-EVL:      ir-bb<vector.ph>:
  ; IF-EVL-NEXT:   IR   %4 = call i64 @llvm.vscale.i64()
- ; IF-EVL-NEXT:   IR   %5 = mul i64 %4, 4
+ ; IF-EVL-NEXT:   IR   %5 = mul nuw i64 %4, 4
  ; IF-EVL-NEXT:   IR   %6 = sub i64 %5, 1
  ; IF-EVL-NEXT:   IR   %n.rnd.up = add i64 %N, %6
  ; IF-EVL-NEXT:   IR   %n.mod.vf = urem i64 %n.rnd.up, %5
  ; IF-EVL-NEXT:   IR   %n.vec = sub i64 %n.rnd.up, %n.mod.vf
  ; IF-EVL-NEXT:   IR   %7 = call i64 @llvm.vscale.i64()
- ; IF-EVL-NEXT:   IR   %8 = mul i64 %7, 4
+ ; IF-EVL-NEXT:   IR   %8 = mul nuw i64 %7, 4
  ; IF-EVL-NEXT: Successor(s): vector.body
  ; IF-EVL-EMPTY:
  ; IF-EVL-NEXT: vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll
index 2135fc636791..4fda9d34b9dc 100644
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll
@@ -16,16 +16,16 @@ define void @foo() {
 ; CHECK-LABEL: define void @foo() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP4]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index bb84dbf8ed23..f429677e3875 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -20,12 +20,12 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
 ; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
 ; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
 ; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_MEMCHECK]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = add i64 [[B1]], -4
 ; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
@@ -33,13 +33,13 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
 ; CHECK-VF4UF1-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_PH]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
 ; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP11]]
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[PRE_LOAD]], i32 [[TMP16]]
 ; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -60,11 +60,11 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
 ; CHECK-VF4UF1-NEXT:    br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP26:%.*]] = mul nuw i32 [[TMP25]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP26]], 2
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP27]]
 ; CHECK-VF4UF1-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP29:%.*]] = mul nuw i32 [[TMP28]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP29]], 1
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP30]]
 ; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
@@ -87,12 +87,12 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
 ; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
 ; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_MEMCHECK]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
 ; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = add i64 [[B1]], -4
 ; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
@@ -100,13 +100,13 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
 ; CHECK-VF4UF2-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_PH]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 8
 ; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP11]]
 ; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 8
 ; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[PRE_LOAD]], i32 [[TMP16]]
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -117,7 +117,7 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
 ; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
 ; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
 ; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]]
 ; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
 ; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD3]] = load <vscale x 4 x i32>, ptr [[TMP22]], align 4
@@ -128,7 +128,7 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
 ; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD3]], [[TMP24]]
 ; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
 ; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP30]]
 ; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP26]], ptr [[TMP28]], align 4
 ; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP27]], ptr [[TMP31]], align 4
@@ -137,11 +137,11 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
 ; CHECK-VF4UF2-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP33]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], 2
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD3]], i32 [[TMP35]]
 ; CHECK-VF4UF2-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul i32 [[TMP36]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul nuw i32 [[TMP36]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], 1
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD3]], i32 [[TMP38]]
 ; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
@@ -194,18 +194,18 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF1-NEXT:    [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
 ; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
 ; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_PH]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], 1
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DOTPRE]], i32 [[TMP9]]
 ; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -228,7 +228,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP17]])
 ; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP22]]
 ; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -256,18 +256,18 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF2-NEXT:    [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
 ; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
 ; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_PH]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
 ; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], 1
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DOTPRE]], i32 [[TMP9]]
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -279,7 +279,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
 ; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
 ; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP13]]
 ; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
 ; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD2]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
@@ -302,7 +302,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF2-NEXT:    [[RDX_MINMAX:%.*]] = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> [[TMP25]], <vscale x 4 x i32> [[TMP26]])
 ; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[RDX_MINMAX]])
 ; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD2]], i32 [[TMP31]]
 ; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -371,7 +371,7 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
 ; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
 ; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
 ; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
 ; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_MEMCHECK]]:
@@ -391,16 +391,16 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
 ; CHECK-VF4UF1-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_PH]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
 ; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP12]]
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = add i64 1, [[N_VEC]]
 ; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x double> poison, double [[CONV1]], i64 0
 ; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x double> [[BROADCAST_SPLATINSERT]], <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = mul nuw i32 [[TMP16]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP17]], 1
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP0]], i32 [[TMP18]]
 ; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -424,7 +424,7 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
 ; CHECK-VF4UF1-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP31]]
 ; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
@@ -451,7 +451,7 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
 ; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
 ; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
 ; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_MEMCHECK]]:
@@ -471,16 +471,16 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
 ; CHECK-VF4UF2-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_PH]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8
 ; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP12]]
 ; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 8
 ; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = add i64 1, [[N_VEC]]
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x double> poison, double [[CONV1]], i64 0
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x double> [[BROADCAST_SPLATINSERT]], <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = mul nuw i32 [[TMP16]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP17]], 1
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP0]], i32 [[TMP18]]
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -491,7 +491,7 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
 ; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[OFFSET_IDX]]
 ; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0
 ; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i64 [[TMP22]]
 ; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP20]], align 2, !alias.scope [[META6:![0-9]+]]
 ; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD4]] = load <vscale x 4 x i16>, ptr [[TMP23]], align 2, !alias.scope [[META6]]
@@ -508,7 +508,7 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
 ; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
 ; CHECK-VF4UF2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i32 0
 ; CHECK-VF4UF2-NEXT:    [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul i64 [[TMP36]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul nuw i64 [[TMP36]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i64 [[TMP37]]
 ; CHECK-VF4UF2-NEXT:    store <vscale x 4 x double> [[TMP32]], ptr [[TMP35]], align 8, !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
 ; CHECK-VF4UF2-NEXT:    store <vscale x 4 x double> [[TMP33]], ptr [[TMP38]], align 8, !alias.scope [[META9]], !noalias [[META6]]
@@ -517,7 +517,7 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
 ; CHECK-VF4UF2-NEXT:    br i1 [[TMP39]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP40:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP41:%.*]] = mul i32 [[TMP40]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP41:%.*]] = mul nuw i32 [[TMP40]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP42:%.*]] = sub i32 [[TMP41]], 1
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD4]], i32 [[TMP42]]
 ; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
@@ -570,16 +570,16 @@ define i64 @constant_folded_previous_value() {
 ; CHECK-VF4UF1-LABEL: define i64 @constant_folded_previous_value() {
 ; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]]
 ; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_PH]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]]
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -598,16 +598,16 @@ define i64 @constant_folded_previous_value() {
 ; CHECK-VF4UF2-LABEL: define i64 @constant_folded_previous_value() {
 ; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_PH]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
 ; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]]
 ; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
 ; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -648,16 +648,16 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF1-SAME: ptr [[CVAL:%.*]], i32 [[X:%.*]]) {
 ; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 4
 ; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 96, [[TMP1]]
 ; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_PH]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 4
 ; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i32 96, [[TMP3]]
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i32 96, [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 4
 ; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
 ; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
@@ -677,11 +677,11 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF1-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 2
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP13]]
 ; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP16]]
 ; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i32 96, [[N_VEC]]
@@ -696,16 +696,16 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF2-SAME: ptr [[CVAL:%.*]], i32 [[X:%.*]]) {
 ; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 8
 ; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 96, [[TMP1]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_PH]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 8
 ; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i32 96, [[TMP3]]
 ; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i32 96, [[N_MOD_VF]]
 ; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -726,11 +726,11 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF2-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 2
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP13]]
 ; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP16]]
 ; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i32 96, [[N_VEC]]
@@ -769,7 +769,7 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
 ; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
 ; CHECK-VF4UF1-NEXT:    [[DOTPRE:%.*]] = load i16, ptr [[A]], align 2
 ; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_MEMCHECK]]:
@@ -785,13 +785,13 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
 ; CHECK-VF4UF1-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_PH]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
 ; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP11]]
 ; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -814,7 +814,7 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
 ; CHECK-VF4UF1-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = mul nuw i32 [[TMP22]], 4
 ; CHECK-VF4UF1-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP23]], 1
 ; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP24]]
 ; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -830,7 +830,7 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
 ; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
 ; CHECK-VF4UF2-NEXT:    [[DOTPRE:%.*]] = load i16, ptr [[A]], align 2
 ; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
 ; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_MEMCHECK]]:
@@ -846,13 +846,13 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
 ; CHECK-VF4UF2-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_PH]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
 ; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
 ; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
 ; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP11]]
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -863,7 +863,7 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
 ; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP12]]
 ; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
 ; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i64 [[TMP16]]
 ; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP14]], align 2, !alias.scope [[META17:![0-9]+]]
 ; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD3]] = load <vscale x 4 x i16>, ptr [[TMP17]], align 2, !alias.scope [[META17]]
@@ -878,7 +878,7 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
 ; CHECK-VF4UF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
 ; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
 ; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP29]]
 ; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP24]], ptr [[TMP27]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
 ; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP25]], ptr [[TMP30]], align 4, !alias.scope [[META20]], !noalias [[META17]]
@@ -887,7 +887,7 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
 ; CHECK-VF4UF2-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
 ; CHECK-VF4UF2-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP32]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = mul nuw i32 [[TMP32]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP33]], 1
 ; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD3]], i32 [[TMP34]]
 ; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index 10f96284c018..62649412deb8 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -11,16 +11,16 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture
 ; CHECK-LABEL: @add_ind64_unrolled(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -92,12 +92,12 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no
 ; CHECK-LABEL: @add_ind64_unrolled_nxv1i64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
@@ -177,16 +177,16 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-LABEL: @add_unique_ind32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
@@ -254,16 +254,16 @@ define void @add_unique_indf32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-LABEL: @add_unique_indf32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul float [[DOTCAST]], 2.000000e+00
 ; CHECK-NEXT:    [[IND_END:%.*]] = fadd float [[TMP4]], 0.000000e+00
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
index 15db687ba64f..b0029a4e0d06 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
@@ -8,16 +8,16 @@ define i32 @iv_live_out_wide(ptr %dst) {
 ; CHECK-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
 ; CHECK-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 2000, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 2000, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 2000, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[STEP_2]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -34,7 +34,7 @@ define i32 @iv_live_out_wide(ptr %dst) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP13]]
 ; CHECK-NEXT:    store <vscale x 2 x i16> zeroinitializer, ptr [[TMP11]], align 2
 ; CHECK-NEXT:    store <vscale x 2 x i16> zeroinitializer, ptr [[TMP14]], align 2
@@ -45,7 +45,7 @@ define i32 @iv_live_out_wide(ptr %dst) {
 ; CHECK-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 2
 ; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 2 x i32> [[TMP15]], i32 [[TMP19]]
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 2000, [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
index e1c14f9f6b71..4a1d7a2376dd 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
@@ -12,16 +12,16 @@ define void @test(ptr %d) {
 ; CHECK-NEXT:    [[ARR:%.*]] = alloca [1024 x i32], align 16
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4096, ptr [[ARR]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 128, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 128, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 128, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -83,16 +83,16 @@ define void @testloopvariant(ptr %d) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARR:%.*]] = alloca [1024 x i32], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 128, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 128, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 128, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
index 3f9ca05688de..f384d3c15ca6 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -4,16 +4,16 @@
 ; CHECKUF1: for.body.preheader:
 ; CHECKUF1-DAG: %wide.trip.count = zext nneg i32 %N to i64
 ; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl nuw i64 %[[VSCALE]], 2
 ; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
 
 ; CHECKUF1: vector.ph:
 ; CHECKUF1-DAG:  %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1-DAG:  %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG:  %[[VSCALEX4:.*]] = shl nuw i64 %[[VSCALE]], 2
 ; CHECKUF1-DAG:  %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
 ; CHECKUF1:      %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
 ; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1: %[[VSCALEX4:.*]] = shl nuw i64 %[[VSCALE]], 2
 
 ; CHECKUF1: vector.body:
 ; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
@@ -33,16 +33,16 @@
 ; CHECKUF2: for.body.preheader:
 ; CHECKUF2-DAG: %wide.trip.count = zext nneg i32 %N to i64
 ; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl nuw i64 %[[VSCALE]], 3
 ; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
 
 ; CHECKUF2: vector.ph:
 ; CHECKUF2-DAG:  %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2-DAG:  %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG:  %[[VSCALEX8:.*]] = shl nuw i64 %[[VSCALE]], 3
 ; CHECKUF2-DAG:  %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
 ; CHECKUF2:      %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
 ; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2: %[[VSCALEX8:.*]] = shl nuw i64 %[[VSCALE]], 3
 
 ; CHECKUF2: vector.body:
 ; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
index 04b4c6fe58db..079f6b73e886 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
@@ -7,16 +7,16 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
 ; CHECK-LABEL: @reduction_add_trunc(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP31:%.*]] = mul i32 [[TMP30]], 16
+; CHECK-NEXT:    [[TMP31:%.*]] = mul nuw i32 [[TMP30]], 16
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 256, [[TMP31]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 16
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 256, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 256, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 16
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -27,7 +27,7 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
index af83c13bdfdc..4b8ff8677468 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
@@ -5,16 +5,16 @@ define void @trunc_minimal_bitwidth(ptr %bptr, ptr noalias %hptr, i32 %val, i64
 ; CHECK-LABEL: @trunc_minimal_bitwidth(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT]] to <vscale x 4 x i16>
@@ -65,16 +65,16 @@ define void @trunc_minimal_bitwidths_shufflevector (ptr %p, i32 %arg1, i64 %len)
 ; CHECK-LABEL: @trunc_minimal_bitwidths_shufflevector(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[LEN:%.*]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[LEN]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[LEN]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[ARG1:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT]] to <vscale x 4 x i8>
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
index 18cb2257e7ec..17979e5f08a7 100644
--- a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
@@ -37,16 +37,16 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; NO-VP-LABEL: @foo(
 ; NO-VP-NEXT:  entry:
 ; NO-VP-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP0]], 4
 ; NO-VP-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP8]]
 ; NO-VP-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; NO-VP:       vector.ph:
 ; NO-VP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-NEXT:    [[TMP14:%.*]] = mul nuw i64 [[TMP1]], 4
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP14]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; NO-VP-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP2]], 4
 ; NO-VP-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; NO-VP:       vector.body:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
index 0f26a08db456..80f2977fcb3a 100644
--- a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
@@ -102,7 +102,7 @@ define void @memmove_vector(ptr %a, ptr %b) {
 define void @memmove_agg1(ptr %a, ptr %b) {
 ; CHECK-LABEL: @memmove_agg1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
 ; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[B:%.*]], ptr align 1 [[A:%.*]], i64 [[TMP2]], i1 false)
 ; CHECK-NEXT:    ret void
 ;
@@ -114,7 +114,7 @@ define void @memmove_agg1(ptr %a, ptr %b) {
 define void @memmove_agg2(ptr %a, ptr %b) {
 ; CHECK-LABEL: @memmove_agg2(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 16
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
 ; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[B:%.*]], ptr align 1 [[A:%.*]], i64 [[TMP2]], i1 false)
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
index 6993fb281bfe..f40afbda1246 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
@@ -16,16 +16,16 @@ define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) {
 ; CHECK-LABEL: @interleave_deinterleave(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll
index 284f2ad8072f..9acc6d660129 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll
@@ -7,7 +7,7 @@ define <vscale x 4 x float> @scalable_vec_exp(<vscale x 4 x float> %input) {
 ; CHECK-LABEL: define <vscale x 4 x float> @scalable_vec_exp(
 ; CHECK-SAME: <vscale x 4 x float> [[INPUT:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
 ; CHECK-NEXT:    br label %[[BB3:.*]]
 ; CHECK:       [[BB3]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP9:%.*]], %[[BB3]] ]

From c4c2d777f4aea07c59ff85ade75816df24b05389 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 19 Jun 2025 13:26:19 +0100
Subject: [PATCH 0928/1322] [VPlan] Fix handling of ReductionStartVector for
 rdxs when unrolling.

Update handling of ReductionStartVector in VPlanUnroll for partial
reductions. The new code makes sure all parts are properly set to the
cloned ReductionStartVector.

Fixes a mis-compile reported for
https://github.com/llvm/llvm-project/pull/142290.
---
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp         | 11 ++++++++---
 .../AArch64/partial-reduce-interleave.ll              |  8 ++++----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index dfb5bfabd22b..0bc683e557e7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -231,14 +231,19 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
       if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
         assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
                "unexpected start VPInstruction");
+        if (Part != 1)
+          continue;
+        VPValue *StartV;
         if (match(VPI->getOperand(2), m_SpecificInt(1))) {
-          Copy->setOperand(0, VPI->getOperand(1));
-        } else if (Part == 1) {
+          StartV = VPI->getOperand(1);
+        } else {
           auto *C = VPI->clone();
           C->setOperand(0, C->getOperand(1));
           C->insertAfter(VPI);
-          addUniformForAllParts(C);
+          StartV = C;
         }
+        for (unsigned Part = 1; Part != UF; ++Part)
+          VPV2Parts[VPI][Part - 1] = StartV;
       }
       Copy->addOperand(getConstantVPV(Part));
     } else {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
index b5bc7bf80372..3515365c7027 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
@@ -18,7 +18,7 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
 ; IC2:       [[VECTOR_BODY]]:
 ; IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC2-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
-; IC2-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], %[[VECTOR_BODY]] ]
+; IC2-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], %[[VECTOR_BODY]] ]
 ; IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
 ; IC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; IC2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
@@ -70,9 +70,9 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
 ; IC4:       [[VECTOR_BODY]]:
 ; IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; IC4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
-; IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], %[[VECTOR_BODY]] ]
-; IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], %[[VECTOR_BODY]] ]
-; IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], %[[VECTOR_BODY]] ]
+; IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], %[[VECTOR_BODY]] ]
+; IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], %[[VECTOR_BODY]] ]
+; IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], %[[VECTOR_BODY]] ]
 ; IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
 ; IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16

From 5148e085386fb1808fba055e170d88e3344220ca Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Thu, 19 Jun 2025 13:30:20 +0100
Subject: [PATCH 0929/1322] Fix build issue caused by commit #0fe78c4 (#144888)

Noticed internally in blaze build.
---
 clang/include/clang/Serialization/ASTRecordWriter.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h
index ad1ec2673812..964c9e6ea8a2 100644
--- a/clang/include/clang/Serialization/ASTRecordWriter.h
+++ b/clang/include/clang/Serialization/ASTRecordWriter.h
@@ -17,6 +17,7 @@
 #include "clang/AST/AbstractBasicWriter.h"
 #include "clang/AST/OpenACCClause.h"
 #include "clang/AST/OpenMPClause.h"
+#include "clang/Serialization/ASTReader.h"
 #include "clang/Serialization/ASTWriter.h"
 #include "clang/Serialization/SourceLocationEncoding.h"
 

From dae5104eed451fdd0354ff9639feba10f9dc5440 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Thu, 19 Jun 2025 13:42:23 +0100
Subject: [PATCH 0930/1322] [lldb][DWARFASTParserClang] Make
 GetCXXObjectParameter public and call it from unit-tests (#144879)

My goal is to remove the `object_pointer` member on
`ParsedDWARFTypeAttributes` since it's duplicating information that we
retrieve with `GetCXXObjectParameter` anyway. To continue having
coverage for the `DW_AT_object_pointer` code-paths, instead of checking
the
`attrs.object_pointer` I'm now calling `GetCXXObjectParameter` directly.
We could find some very roundabout way to go via the Clang AST to check
that the object parameter was parsed correctly, but that quickly became
quite painful.

Depends on https://github.com/llvm/llvm-project/pull/144876
---
 .../SymbolFile/DWARF/DWARFASTParserClang.cpp  |  5 +--
 .../SymbolFile/DWARF/DWARFASTParserClang.h    |  4 +++
 .../DWARF/DWARFASTParserClangTests.cpp        | 34 +++++++++++++------
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 7fc1d70898d1..4f79c8aa3f81 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -163,8 +163,9 @@ static bool TagIsRecordType(dw_tag_t tag) {
 /// a default DWARFDIE. If \c containing_decl_ctx is not a valid
 /// C++ declaration context for class methods, assume no object
 /// parameter exists for the given \c subprogram.
-static DWARFDIE GetCXXObjectParameter(const DWARFDIE &subprogram,
-                                      const DWARFDIE &decl_ctx_die) {
+DWARFDIE
+DWARFASTParserClang::GetCXXObjectParameter(const DWARFDIE &subprogram,
+                                           const DWARFDIE &decl_ctx_die) {
   assert(subprogram);
   assert(subprogram.Tag() == DW_TAG_subprogram ||
          subprogram.Tag() == DW_TAG_inlined_subroutine ||
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index 3994726aa6b3..111604ce4068 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -112,6 +112,10 @@ public:
   void MapDeclDIEToDefDIE(const lldb_private::plugin::dwarf::DWARFDIE &decl_die,
                           const lldb_private::plugin::dwarf::DWARFDIE &def_die);
 
+  lldb_private::plugin::dwarf::DWARFDIE GetCXXObjectParameter(
+      const lldb_private::plugin::dwarf::DWARFDIE &subprogram,
+      const lldb_private::plugin::dwarf::DWARFDIE &decl_ctx_die);
+
 protected:
   /// Protected typedefs and members.
   /// @{
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
index 6c77736113da..2d4b79fed4a5 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
@@ -889,18 +889,32 @@ DWARF:
   ASSERT_TRUE(context_die.IsValid());
   ASSERT_EQ(context_die.Tag(), DW_TAG_structure_type);
 
-  auto subprogram_definition = context_die.GetSibling();
-  ASSERT_TRUE(subprogram_definition.IsValid());
-  ASSERT_EQ(subprogram_definition.Tag(), DW_TAG_subprogram);
-  ASSERT_FALSE(subprogram_definition.GetAttributeValueAsOptionalUnsigned(
-      DW_AT_external));
+  {
+    auto decl_die = context_die.GetFirstChild();
+    ASSERT_TRUE(decl_die.IsValid());
+    ASSERT_EQ(decl_die.Tag(), DW_TAG_subprogram);
+    ASSERT_TRUE(decl_die.GetAttributeValueAsOptionalUnsigned(DW_AT_external));
 
-  auto param_die = subprogram_definition.GetFirstChild();
-  ASSERT_TRUE(param_die.IsValid());
+    auto param_die = decl_die.GetFirstChild();
+    ASSERT_TRUE(param_die.IsValid());
 
-  ParsedDWARFTypeAttributes attrs(subprogram_definition);
-  EXPECT_TRUE(attrs.object_pointer.IsValid());
-  EXPECT_EQ(attrs.object_pointer, param_die);
+    EXPECT_EQ(param_die,
+              ast_parser.GetCXXObjectParameter(decl_die, context_die));
+  }
+
+  {
+    auto subprogram_definition = context_die.GetSibling();
+    ASSERT_TRUE(subprogram_definition.IsValid());
+    ASSERT_EQ(subprogram_definition.Tag(), DW_TAG_subprogram);
+    ASSERT_FALSE(subprogram_definition.GetAttributeValueAsOptionalUnsigned(
+        DW_AT_external));
+
+    auto param_die = subprogram_definition.GetFirstChild();
+    ASSERT_TRUE(param_die.IsValid());
+
+    EXPECT_EQ(param_die, ast_parser.GetCXXObjectParameter(subprogram_definition,
+                                                          context_die));
+  }
 }
 
 TEST_F(DWARFASTParserClangTests, TestParseSubroutine_ExplicitObjectParameter) {

From c079040eea5ce75a97285003948d141ebaac69e6 Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Thu, 19 Jun 2025 13:44:41 +0100
Subject: [PATCH 0931/1322] [lldb] add has methods to all DemangledNameInfo
 attributes (#144549)

Add `hasX` methods to all the attributes of `DemangledNameInfo`.
---
 lldb/include/lldb/Core/DemangledNameInfo.h | 19 ++++++
 lldb/unittests/Core/MangledTest.cpp        | 76 ++++++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/lldb/include/lldb/Core/DemangledNameInfo.h b/lldb/include/lldb/Core/DemangledNameInfo.h
index 4b5ba5e42b3b..a2f3fde90c61 100644
--- a/lldb/include/lldb/Core/DemangledNameInfo.h
+++ b/lldb/include/lldb/Core/DemangledNameInfo.h
@@ -73,6 +73,25 @@ struct DemangledNameInfo {
   bool hasBasename() const {
     return BasenameRange.second > BasenameRange.first;
   }
+
+  /// Returns \c true if this object holds a valid scope range.
+  bool hasScope() const { return ScopeRange.second > ScopeRange.first; }
+
+  /// Returns \c true if this object holds a valid arguments range.
+  bool hasArguments() const {
+    return ArgumentsRange.second > ArgumentsRange.first;
+  }
+
+  /// Returns \c true if this object holds a valid qualifiers range.
+  bool hasQualifiers() const {
+    return QualifiersRange.second > QualifiersRange.first;
+  }
+
+  /// Returns \c true if this object holds a valid prefix range.
+  bool hasPrefix() const { return PrefixRange.second > PrefixRange.first; }
+
+  /// Returns \c true if this object holds a valid suffix range.
+  bool hasSuffix() const { return SuffixRange.second > SuffixRange.first; }
 };
 
 /// An OutputBuffer which keeps a record of where certain parts of a
diff --git a/lldb/unittests/Core/MangledTest.cpp b/lldb/unittests/Core/MangledTest.cpp
index 46adb6272209..5994d6072481 100644
--- a/lldb/unittests/Core/MangledTest.cpp
+++ b/lldb/unittests/Core/MangledTest.cpp
@@ -635,6 +635,82 @@ TEST_P(DemanglingPartsTestFixture, DemanglingParts) {
 INSTANTIATE_TEST_SUITE_P(DemanglingPartsTests, DemanglingPartsTestFixture,
                          ::testing::ValuesIn(g_demangling_parts_test_cases));
 
+struct DemangledNameInfoTestCase {
+  DemangledNameInfo expected_info;
+  bool valid_basename;
+  bool valid_scope;
+  bool valid_arguments;
+  bool valid_qualifiers;
+  bool valid_prefix;
+  bool valid_suffix;
+};
+
+DemangledNameInfoTestCase g_demangled_name_info_test_cases[] = {
+    // clang-format off
+   {
+    { /*.BasenameRange=*/{0, 10}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 0},
+      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0}
+    },
+      /*valid_basename=*/true, /*valid_scope=*/false, /*valid_arguments=*/false,
+      /*valid_qualifiers=*/false, /*valid_prefix=*/false, /*valid_suffix=*/false,
+   },
+   {
+    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 10}, /*.ArgumentsRange=*/{0, 0},
+      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0}
+    },
+      /*valid_basename=*/false, /*valid_scope=*/true, /*valid_arguments=*/false,
+      /*valid_qualifiers=*/false, /*valid_prefix=*/false, /*valid_suffix=*/false,
+   },
+   {
+    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 10},
+      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0}
+    },
+      /*valid_basename=*/false, /*valid_scope=*/false, /*valid_arguments=*/true,
+      /*valid_qualifiers=*/false, /*valid_prefix=*/false, /*valid_suffix=*/false,
+   },
+   {
+    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 0},
+      /*.QualifiersRange=*/{0, 10}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0}
+    },
+      /*valid_basename=*/false, /*valid_scope=*/false, /*valid_arguments=*/false,
+      /*valid_qualifiers=*/true, /*valid_prefix=*/false, /*valid_suffix=*/false,
+   },
+   {
+    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 0},
+      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 10}, /*.SuffixRange=*/{0, 0}
+    },
+      /*valid_basename=*/false, /*valid_scope=*/false, /*valid_arguments=*/false,
+      /*valid_qualifiers=*/false, /*valid_prefix=*/true, /*valid_suffix=*/false,
+   },
+   {
+    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 0},
+      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 10}
+    },
+      /*valid_basename=*/false, /*valid_scope=*/false, /*valid_arguments=*/false,
+      /*valid_qualifiers=*/false, /*valid_prefix=*/false, /*valid_suffix=*/true,
+   },
+    // clang-format on
+};
+
+struct DemangledNameInfoTestFixture
+    : public ::testing::TestWithParam<DemangledNameInfoTestCase> {};
+
+TEST_P(DemangledNameInfoTestFixture, DemangledNameInfoRanges) {
+  const auto &[info, valid_basename, valid_scope, valid_arguments,
+               valid_qualifiers, valid_prefix, valid_suffix] = GetParam();
+
+  ASSERT_EQ(info.hasBasename(), valid_basename);
+  ASSERT_EQ(info.hasScope(), valid_scope);
+  ASSERT_EQ(info.hasArguments(), valid_arguments);
+  ASSERT_EQ(info.hasQualifiers(), valid_qualifiers);
+  ASSERT_EQ(info.hasPrefix(), valid_prefix);
+  ASSERT_EQ(info.hasSuffix(), valid_suffix);
+}
+
+INSTANTIATE_TEST_SUITE_P(DemangledNameInfoRangesTests,
+                         DemangledNameInfoTestFixture,
+                         ::testing::ValuesIn(g_demangled_name_info_test_cases));
+
 struct DemanglingInfoCorrectnessTestCase {
   const char *mangled;
   const char *demangled;

From 09e794c4bb138e14b3156d7dbdac0164d9c0327b Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Thu, 19 Jun 2025 07:47:03 -0500
Subject: [PATCH 0932/1322] [HIP] Emit the CUID value in the module with the
 new driver (#144570)

Summary:
This is a weird point of divergence that was not updated when the new
driver
switched to using the CUID method, which is also apparently critical
for SPIR-V compilation not failing? Somehow if we don't emit this global
than the `llvm.compiler.used` global uses AS(0) which makes SPIR-V
unhappy, but with this global it's AS(4) which makes it happy. Either
way, this should be fixed.
---
 clang/lib/CodeGen/CodeGenModule.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c27168e4c4bf..16688810d068 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -969,7 +969,7 @@ void CodeGenModule::Release() {
         llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external");
     addCompilerUsedGlobal(GV);
   }
-  if (LangOpts.HIP && !getLangOpts().OffloadingNewDriver) {
+  if (LangOpts.HIP) {
     // Emit a unique ID so that host and device binaries from the same
     // compilation unit can be associated.
     auto *GV = new llvm::GlobalVariable(

From e873fd157eda617ffd42edad3c4a6ab495e6e375 Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko <atrosinenko@accesssoftek.com>
Date: Thu, 19 Jun 2025 15:52:54 +0300
Subject: [PATCH 0933/1322] [BOLT] Gadget scanner: do not crash on
 debug-printing CFI instructions (#136151)

Some instruction-printing code used under LLVM_DEBUG does not handle CFI
instructions well. While CFI instructions seem to be harmless for the
correctness of the analysis results, they do not convey any useful
information to the analysis either, so skip them early.
---
 bolt/lib/Passes/PAuthGadgetScanner.cpp        | 16 ++++++++++
 .../AArch64/gs-pauth-debug-output.s           | 32 +++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index 7682d7fe2c54..95e831fe9c8c 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -430,6 +430,9 @@ protected:
   }
 
   SrcState computeNext(const MCInst &Point, const SrcState &Cur) {
+    if (BC.MIB->isCFI(Point))
+      return Cur;
+
     SrcStatePrinter P(BC);
     LLVM_DEBUG({
       dbgs() << "  SrcSafetyAnalysis::ComputeNext(";
@@ -704,6 +707,8 @@ public:
     SrcState S = createEntryState();
     for (auto &I : BF.instrs()) {
       MCInst &Inst = I.second;
+      if (BC.MIB->isCFI(Inst))
+        continue;
 
       // If there is a label before this instruction, it is possible that it
       // can be jumped-to, thus conservatively resetting S. As an exception,
@@ -1010,6 +1015,9 @@ protected:
   }
 
   DstState computeNext(const MCInst &Point, const DstState &Cur) {
+    if (BC.MIB->isCFI(Point))
+      return Cur;
+
     DstStatePrinter P(BC);
     LLVM_DEBUG({
       dbgs() << "  DstSafetyAnalysis::ComputeNext(";
@@ -1177,6 +1185,8 @@ public:
     DstState S = createUnsafeState();
     for (auto &I : llvm::reverse(BF.instrs())) {
       MCInst &Inst = I.second;
+      if (BC.MIB->isCFI(Inst))
+        continue;
 
       // If Inst can change the control flow, we cannot be sure that the next
       // instruction (to be executed in analyzed program) is the one processed
@@ -1366,6 +1376,9 @@ void FunctionAnalysisContext::findUnsafeUses(
   });
 
   iterateOverInstrs(BF, [&](MCInstReference Inst) {
+    if (BC.MIB->isCFI(Inst))
+      return;
+
     const SrcState &S = Analysis->getStateBefore(Inst);
 
     // If non-empty state was never propagated from the entry basic block
@@ -1429,6 +1442,9 @@ void FunctionAnalysisContext::findUnsafeDefs(
   });
 
   iterateOverInstrs(BF, [&](MCInstReference Inst) {
+    if (BC.MIB->isCFI(Inst))
+      return;
+
     const DstState &S = Analysis->getStateAfter(Inst);
 
     if (auto Report = shouldReportAuthOracle(BC, Inst, S))
diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s
index 686557eb1e52..fbb96a63d41e 100644
--- a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s
+++ b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s
@@ -329,6 +329,38 @@ auth_oracle:
 // PAUTH-EMPTY:
 // PAUTH-NEXT:   Attaching leakage info to:     00000000:      autia   x0, x1 # DataflowDstSafetyAnalysis: dst-state<CannotEscapeUnchecked: BitVector, Insts: [0](0x{{[0-9a-f]+}} )>
 
+// Gadget scanner should not crash on CFI instructions, including when debug-printing them.
+// Note that the particular debug output is not checked, but BOLT should be
+// compiled with assertions enabled to support -debug-only argument.
+
+        .globl  cfi_inst_df
+        .type   cfi_inst_df,@function
+cfi_inst_df:
+        .cfi_startproc
+        sub     sp, sp, #16
+        .cfi_def_cfa_offset 16
+        add     sp, sp, #16
+        .cfi_def_cfa_offset 0
+        ret
+        .size   cfi_inst_df, .-cfi_inst_df
+        .cfi_endproc
+
+        .globl  cfi_inst_nocfg
+        .type   cfi_inst_nocfg,@function
+cfi_inst_nocfg:
+        .cfi_startproc
+        sub     sp, sp, #16
+        .cfi_def_cfa_offset 16
+
+        adr     x0, 1f
+        br      x0
+1:
+        add     sp, sp, #16
+        .cfi_def_cfa_offset 0
+        ret
+        .size   cfi_inst_nocfg, .-cfi_inst_nocfg
+        .cfi_endproc
+
 // CHECK-LABEL:Analyzing function main, AllocatorId = 1
         .globl  main
         .type   main,@function

From 493a359237e824216d5c572656481c42165a2cb7 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 19 Jun 2025 12:58:08 +0000
Subject: [PATCH 0934/1322] [lldb][AArch64] Fix live process test for Linux's
 mte_ctrl register

I forgot to update this when I changed the presentation of the
"TCF" field.
---
 .../register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py b/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py
index 3eaca00b0dcc..2570f267bf46 100644
--- a/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py
+++ b/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py
@@ -42,8 +42,9 @@ class MTECtrlRegisterTestCase(TestBase):
             expected = [value]
 
             if self.hasXMLSupport():
+                tfc_modes = ["NONE", "SYNC", "ASYNC", "ASYMM"]
                 expected.append(
-                    "(TAGS = 0, TCF_ASYNC = {}, TCF_SYNC = {}, TAGGED_ADDR_ENABLE = 1)".format(
+                    f"(TAGS = 0, TCF = TCF_{tfc_modes[async_err << 1 | sync_err]}, TAGGED_ADDR_ENABLE = 1)".format(
                         async_err, sync_err
                     )
                 )

From 83381ba832a5cf34b09e27a6154c7179fed2fc80 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Thu, 19 Jun 2025 18:10:56 +0500
Subject: [PATCH 0935/1322] [LLDB] Add negative number parsing to DIL (#144557)

---
 lldb/source/ValueObject/DILParser.cpp                 | 11 +++++++++--
 .../ArraySubscript/TestFrameVarDILArraySubscript.py   |  6 +-----
 .../frame/var-dil/basics/ArraySubscript/main.cpp      |  1 +
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/lldb/source/ValueObject/DILParser.cpp b/lldb/source/ValueObject/DILParser.cpp
index 32af0820acb9..5abbfeabcea3 100644
--- a/lldb/source/ValueObject/DILParser.cpp
+++ b/lldb/source/ValueObject/DILParser.cpp
@@ -348,8 +348,15 @@ void DILParser::BailOut(const std::string &error, uint32_t loc,
 //    ? Integer constant ?
 //
 std::optional<int64_t> DILParser::ParseIntegerConstant() {
-  auto spelling = CurToken().GetSpelling();
-  llvm::StringRef spelling_ref = spelling;
+  std::string number_spelling;
+  if (CurToken().GetKind() == Token::minus) {
+    // StringRef::getAsInteger<>() can parse negative numbers.
+    // FIXME: Remove this once unary minus operator is added.
+    number_spelling = "-";
+    m_dil_lexer.Advance();
+  }
+  number_spelling.append(CurToken().GetSpelling());
+  llvm::StringRef spelling_ref = number_spelling;
   int64_t raw_value;
   if (!spelling_ref.getAsInteger<int64_t>(0, raw_value)) {
     m_dil_lexer.Advance();
diff --git a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py
index c90e0eaa6363..c0ef29fab859 100644
--- a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py
+++ b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py
@@ -60,11 +60,7 @@ class TestFrameVarDILArraySubscript(TestBase):
         self.expect_var_path("*(&int_arr[1])", value="2")
 
         # Test for negative index.
-        self.expect(
-            "frame var 'int_arr[-1]'",
-            error=True,
-            substrs=["failed to parse integer constant"],
-        )
+        self.expect_var_path("int_ptr_1[-1]", True, value="1")
 
         # Test for floating point index
         self.expect(
diff --git a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/main.cpp b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/main.cpp
index 485666ae46c2..a9a3612dfae5 100644
--- a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/main.cpp
+++ b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/main.cpp
@@ -3,6 +3,7 @@
 int main(int argc, char **argv) {
   int int_arr[] = {1, 2, 3};
   int *int_ptr = int_arr;
+  int *int_ptr_1 = &int_arr[1];
   int(&int_arr_ref)[3] = int_arr;
   void *p_void = (void *)int_arr;
 

From 5645d6710904107d66a45f1c3ee0ee25924ff08a Mon Sep 17 00:00:00 2001
From: Aly ElAshram <71949028+AlyElashram@users.noreply.github.com>
Date: Thu, 19 Jun 2025 15:15:59 +0200
Subject: [PATCH 0936/1322] Implement `sigsetjmp` and `siglongjmp` for
 darwin/aarch64 (#139555)

---
 libc/config/darwin/aarch64/config.json        |  8 +++++++
 libc/config/darwin/aarch64/entrypoints.txt    | 11 ++++++++++
 libc/config/darwin/aarch64/headers.txt        |  1 +
 libc/src/setjmp/CMakeLists.txt                | 10 +++++----
 libc/src/setjmp/darwin/CMakeLists.txt         | 12 +++++++++++
 libc/src/setjmp/darwin/sigsetjmp_epilogue.cpp | 21 +++++++++++++++++++
 libc/test/src/CMakeLists.txt                  |  2 +-
 7 files changed, 60 insertions(+), 5 deletions(-)
 create mode 100644 libc/config/darwin/aarch64/config.json
 create mode 100644 libc/src/setjmp/darwin/CMakeLists.txt
 create mode 100644 libc/src/setjmp/darwin/sigsetjmp_epilogue.cpp

diff --git a/libc/config/darwin/aarch64/config.json b/libc/config/darwin/aarch64/config.json
new file mode 100644
index 000000000000..c82f13e5cbf7
--- /dev/null
+++ b/libc/config/darwin/aarch64/config.json
@@ -0,0 +1,8 @@
+{
+  "setjmp": {
+    "LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER": {
+      "value": false,
+      "doc": "Avoid setjmp saving the value of x18, and longjmp restoring it. The Apple AArch64 ABI specifies that this register is reserved and should not be used"
+    }
+  }
+}
diff --git a/libc/config/darwin/aarch64/entrypoints.txt b/libc/config/darwin/aarch64/entrypoints.txt
index 308fc49d681d..437eca79a76f 100644
--- a/libc/config/darwin/aarch64/entrypoints.txt
+++ b/libc/config/darwin/aarch64/entrypoints.txt
@@ -101,6 +101,17 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.free
 )
 
+if(LLVM_LIBC_FULL_BUILD)
+  list(APPEND TARGET_LIBC_ENTRYPOINTS
+    # setjmp.h entrypoints
+    libc.src.setjmp.longjmp
+    libc.src.setjmp.setjmp
+    libc.src.setjmp.siglongjmp
+    libc.src.setjmp.sigsetjmp
+  )
+endif()
+
+
 set(TARGET_LIBM_ENTRYPOINTS
     # complex.h entrypoints
     libc.src.complex.creal
diff --git a/libc/config/darwin/aarch64/headers.txt b/libc/config/darwin/aarch64/headers.txt
index 86e714597232..8f3d6029c9b6 100644
--- a/libc/config/darwin/aarch64/headers.txt
+++ b/libc/config/darwin/aarch64/headers.txt
@@ -7,6 +7,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.inttypes
     libc.include.limits
     libc.include.math
+    libc.include.setjmp
     libc.include.stdlib
     libc.include.string
     libc.include.strings
diff --git a/libc/src/setjmp/CMakeLists.txt b/libc/src/setjmp/CMakeLists.txt
index 239254fa57dc..50c827254da6 100644
--- a/libc/src/setjmp/CMakeLists.txt
+++ b/libc/src/setjmp/CMakeLists.txt
@@ -1,3 +1,9 @@
+# Process architecture-specific subdirectory FIRST to avoid missing targets.
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
+endif()
+
+# Then process OS-specific subdirectory
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   add_object_library(
@@ -8,10 +14,6 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   )
 endif()
 
-if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
-endif()
-
 add_entrypoint_object(
   setjmp
   ALIAS
diff --git a/libc/src/setjmp/darwin/CMakeLists.txt b/libc/src/setjmp/darwin/CMakeLists.txt
new file mode 100644
index 000000000000..b844c8c5ee55
--- /dev/null
+++ b/libc/src/setjmp/darwin/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_object_library(
+  sigsetjmp_epilogue
+  HDRS
+    ../sigsetjmp_epilogue.h
+  SRCS
+    sigsetjmp_epilogue.cpp
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.OSUtil.osutil
+    libc.hdr.types.jmp_buf
+    libc.hdr.types.sigset_t
+)
diff --git a/libc/src/setjmp/darwin/sigsetjmp_epilogue.cpp b/libc/src/setjmp/darwin/sigsetjmp_epilogue.cpp
new file mode 100644
index 000000000000..b2ca4d99ed82
--- /dev/null
+++ b/libc/src/setjmp/darwin/sigsetjmp_epilogue.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of sigsetjmp_epilogue ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/setjmp/sigsetjmp_epilogue.h"
+#include "src/__support/OSUtil/syscall.h"
+#include "src/__support/common.h"
+#include "src/signal/sigprocmask.h"
+
+namespace LIBC_NAMESPACE_DECL {
+[[gnu::returns_twice]] int sigsetjmp_epilogue(jmp_buf buffer, int retval) {
+  syscall_impl<long>(sigprocmask, SIG_SETMASK,
+                     /* set= */ retval ? &buffer->sigmask : nullptr,
+                     /* old_set= */ retval ? nullptr : &buffer->sigmask);
+  return retval;
+}
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt
index b7c145788c0c..6dca47b5343e 100644
--- a/libc/test/src/CMakeLists.txt
+++ b/libc/test/src/CMakeLists.txt
@@ -62,6 +62,7 @@ add_subdirectory(errno)
 add_subdirectory(fenv)
 add_subdirectory(math)
 add_subdirectory(search)
+add_subdirectory(setjmp)
 add_subdirectory(stdbit)
 add_subdirectory(stdfix)
 add_subdirectory(stdio)
@@ -92,7 +93,6 @@ add_subdirectory(assert)
 add_subdirectory(compiler)
 add_subdirectory(dirent)
 add_subdirectory(locale)
-add_subdirectory(setjmp)
 add_subdirectory(signal)
 add_subdirectory(spawn)
 

From bf79d4819edeb54c6cf528db63676110992908a8 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Thu, 19 Jun 2025 09:22:16 -0400
Subject: [PATCH 0937/1322] [Reland] [PowerPC] frontend get target feature from
 backend with cpu name (#144594)

1. The PR proceeds with a backend target hook to allow front-ends to
determine what target features are available in a compilation based on
the CPU name.
2. Fix a backend target feature bug that supports HTM for
Power8/9/10/11. However, HTM is only supported on Power8/9 according to
the ISA.
3. All target features that are hardcoded in PPC.cpp can be retrieved
from the backend target feature. I have double-checked that the
hardcoded logic for inferring target features from the CPU in the
frontend(PPC.cpp) is the same as in PPC.td.

The reland patch addressed the comment
https://github.com/llvm/llvm-project/pull/137670#discussion_r2143541120
---
 clang/lib/Basic/Targets/PPC.cpp               | 148 +--------------
 .../cxx11-thread-local-reference.cpp          |   2 +-
 .../Driver/aix-shared-lib-tls-model-opt.c     |   7 +-
 .../Driver/aix-small-local-exec-dynamic-tls.c |  39 ++--
 clang/test/Driver/ppc-crbits.cpp              |   4 -
 clang/test/Driver/ppc-isa-features.cpp        |  22 +--
 llvm/include/llvm/TargetParser/CMakeLists.txt |   3 +
 .../llvm/TargetParser/PPCTargetParser.h       |   6 +
 llvm/include/llvm/TargetParser/TargetParser.h |  27 +++
 llvm/lib/Target/PowerPC/PPC.td                |   4 +-
 llvm/lib/TargetParser/PPCTargetParser.cpp     |  25 +++
 llvm/lib/TargetParser/TargetParser.cpp        |  47 +++++
 llvm/utils/TableGen/Basic/CMakeLists.txt      |   1 +
 .../TableGen/Basic/TargetFeaturesEmitter.cpp  | 178 ++++++++++++++++++
 .../TableGen/Basic/TargetFeaturesEmitter.h    |  49 +++++
 llvm/utils/TableGen/SubtargetEmitter.cpp      |  84 +--------
 16 files changed, 383 insertions(+), 263 deletions(-)
 create mode 100644 llvm/utils/TableGen/Basic/TargetFeaturesEmitter.cpp
 create mode 100644 llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index e6ef0ecc526b..77145e2891a8 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -15,6 +15,7 @@
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/TargetParser/PPCTargetParser.h"
+#include <optional>
 
 using namespace clang;
 using namespace clang::targets;
@@ -516,129 +517,14 @@ static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags,
 bool PPCTargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeaturesVec) const {
-  Features["altivec"] = llvm::StringSwitch<bool>(CPU)
-                            .Case("7400", true)
-                            .Case("g4", true)
-                            .Case("7450", true)
-                            .Case("g4+", true)
-                            .Case("970", true)
-                            .Case("g5", true)
-                            .Case("pwr6", true)
-                            .Case("pwr7", true)
-                            .Case("pwr8", true)
-                            .Case("pwr9", true)
-                            .Case("ppc64", true)
-                            .Case("ppc64le", true)
-                            .Default(false);
 
-  Features["power9-vector"] = (CPU == "pwr9");
-  Features["crypto"] = llvm::StringSwitch<bool>(CPU)
-                           .Case("ppc64le", true)
-                           .Case("pwr9", true)
-                           .Case("pwr8", true)
-                           .Default(false);
-  Features["power8-vector"] = llvm::StringSwitch<bool>(CPU)
-                                  .Case("ppc64le", true)
-                                  .Case("pwr9", true)
-                                  .Case("pwr8", true)
-                                  .Default(false);
-  Features["bpermd"] = llvm::StringSwitch<bool>(CPU)
-                           .Case("ppc64le", true)
-                           .Case("pwr9", true)
-                           .Case("pwr8", true)
-                           .Case("pwr7", true)
-                           .Default(false);
-  Features["extdiv"] = llvm::StringSwitch<bool>(CPU)
-                           .Case("ppc64le", true)
-                           .Case("pwr9", true)
-                           .Case("pwr8", true)
-                           .Case("pwr7", true)
-                           .Default(false);
-  Features["direct-move"] = llvm::StringSwitch<bool>(CPU)
-                                .Case("ppc64le", true)
-                                .Case("pwr9", true)
-                                .Case("pwr8", true)
-                                .Default(false);
-  Features["crbits"] = llvm::StringSwitch<bool>(CPU)
-                                .Case("ppc64le", true)
-                                .Case("pwr9", true)
-                                .Case("pwr8", true)
-                                .Default(false);
-  Features["vsx"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("ppc64le", true)
-                        .Case("pwr9", true)
-                        .Case("pwr8", true)
-                        .Case("pwr7", true)
-                        .Default(false);
-  Features["htm"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("ppc64le", true)
-                        .Case("pwr9", true)
-                        .Case("pwr8", true)
-                        .Default(false);
+  const llvm::Triple &TheTriple = getTriple();
 
-  // ROP Protect is off by default.
-  Features["rop-protect"] = false;
-  // Privileged instructions are off by default.
-  Features["privileged"] = false;
-
-  if (getTriple().isOSAIX()) {
-    // The code generated by the -maix-small-local-[exec|dynamic]-tls option is
-    // turned off by default.
-    Features["aix-small-local-exec-tls"] = false;
-    Features["aix-small-local-dynamic-tls"] = false;
-
-    // Turn off TLS model opt by default.
-    Features["aix-shared-lib-tls-model-opt"] = false;
-  }
-
-  Features["spe"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("8548", true)
-                        .Case("e500", true)
-                        .Default(false);
-
-  Features["isa-v206-instructions"] = llvm::StringSwitch<bool>(CPU)
-                                          .Case("ppc64le", true)
-                                          .Case("pwr9", true)
-                                          .Case("pwr8", true)
-                                          .Case("pwr7", true)
-                                          .Case("a2", true)
-                                          .Default(false);
-
-  Features["isa-v207-instructions"] = llvm::StringSwitch<bool>(CPU)
-                                          .Case("ppc64le", true)
-                                          .Case("pwr9", true)
-                                          .Case("pwr8", true)
-                                          .Default(false);
-
-  Features["isa-v30-instructions"] =
-      llvm::StringSwitch<bool>(CPU).Case("pwr9", true).Default(false);
-
-  Features["quadword-atomics"] =
-      getTriple().isArch64Bit() && llvm::StringSwitch<bool>(CPU)
-                                       .Case("pwr9", true)
-                                       .Case("pwr8", true)
-                                       .Default(false);
-
-  // Power10 includes all the same features as Power9 plus any features specific
-  // to the Power10 core.
-  if (CPU == "pwr10" || CPU == "power10") {
-    initFeatureMap(Features, Diags, "pwr9", FeaturesVec);
-    addP10SpecificFeatures(Features);
-  }
-
-  // Power11 includes all the same features as Power10 plus any features
-  // specific to the Power11 core.
-  if (CPU == "pwr11" || CPU == "power11") {
-    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
-    addP11SpecificFeatures(Features);
-  }
-
-  // Future CPU should include all of the features of Power 11 as well as any
-  // additional features (yet to be determined) specific to it.
-  if (CPU == "future") {
-    initFeatureMap(Features, Diags, "pwr11", FeaturesVec);
-    addFutureSpecificFeatures(Features);
-  }
+  std::optional<llvm::StringMap<bool>> FeaturesOpt =
+      llvm::PPC::getPPCDefaultTargetFeatures(TheTriple,
+                                             llvm::PPC::normalizeCPUName(CPU));
+  if (FeaturesOpt)
+    Features = FeaturesOpt.value();
 
   if (!ppcUserFeaturesCheck(Diags, FeaturesVec))
     return false;
@@ -700,26 +586,6 @@ bool PPCTargetInfo::initFeatureMap(
   return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
 }
 
-// Add any Power10 specific features.
-void PPCTargetInfo::addP10SpecificFeatures(
-    llvm::StringMap<bool> &Features) const {
-  Features["htm"] = false; // HTM was removed for P10.
-  Features["paired-vector-memops"] = true;
-  Features["mma"] = true;
-  Features["power10-vector"] = true;
-  Features["pcrelative-memops"] = true;
-  Features["prefix-instrs"] = true;
-  Features["isa-v31-instructions"] = true;
-}
-
-// Add any Power11 specific features.
-void PPCTargetInfo::addP11SpecificFeatures(
-    llvm::StringMap<bool> &Features) const {}
-
-// Add features specific to the "Future" CPU.
-void PPCTargetInfo::addFutureSpecificFeatures(
-    llvm::StringMap<bool> &Features) const {}
-
 bool PPCTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Case("powerpc", true)
diff --git a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
index cd5a18f39060..a0e76e8a9a0b 100644
--- a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
+++ b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
@@ -35,5 +35,5 @@ int &g() { return r; }
 // DARWIN-LABEL: define internal cxx_fast_tlscc void @__tls_init()
 // CHECK: call void @[[R_INIT]]()
 
-// LINUX_AIX: attributes [[ATTR0]] = { {{.*}}"target-features"{{.*}} }
+// LINUX_AIX: attributes [[ATTR0]] = { {{.*}} }
 // DARWIN: attributes [[ATTR1]] = { {{.*}}nounwind{{.*}}"target-features"{{.*}}  }
diff --git a/clang/test/Driver/aix-shared-lib-tls-model-opt.c b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
index 7acf091f0a04..891caf4ed3fc 100644
--- a/clang/test/Driver/aix-shared-lib-tls-model-opt.c
+++ b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
@@ -1,5 +1,5 @@
-// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
-// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
+// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
+// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 // RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 
@@ -19,9 +19,8 @@ int test(void) {
 
 // CHECK-AIX: test() #0 {
 // CHECK-AIX: attributes #0 = {
-// CHECK-AIX-OFF-SAME: -aix-shared-lib-tls-model-opt
 // CHECK-AIX-ON-SAME: +aix-shared-lib-tls-model-opt
 
-// CHECK-LINUX-NOT: {{[-+]aix-shared-lib-tls-model-opt}}
+// CHECK-LINUX-NOT: {{[+]aix-shared-lib-tls-model-opt}}
 
 // CHECK-UNSUPPORTED-TARGET: option '-maix-shared-lib-tls-model-opt' cannot be specified on this target
diff --git a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
index 1a0619b58e89..6fc2b8efb4ae 100644
--- a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
+++ b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
@@ -1,37 +1,37 @@
-// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
-// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
-// RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
-// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
+// RUN: %clang --target=powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang --target=powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang --target=powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang --target=powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
 
-// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
+// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALEXEC_TLS
 
-// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
+// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALDYNAMIC_TLS
 
-// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
-// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
@@ -39,10 +39,9 @@ int test(void) {
   return 0;
 }
 
-// CHECK-AIX-DEFAULT: test() #0 {
-// CHECK-AIX-DEFAULT: attributes #0 = {
-// CHECK-AIX-DEFAULT-SAME: {{-aix-small-local-exec-tls,.*-aix-small-local-dynamic-tls|-aix-small-local-dynamic-tls,.*-aix-small-local-exec-tls}}
-// CHECK-LINUX-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
+// CHECK-DEFAULT: test() #0 {
+// CHECK-DEFAULT: attributes #0 = {
+// CHECK-DEFAULT-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
 
 // CHECK-UNSUPPORTED-AIX32: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
 // CHECK-UNSUPPORTED-LINUX: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
diff --git a/clang/test/Driver/ppc-crbits.cpp b/clang/test/Driver/ppc-crbits.cpp
index 3ed56308cb52..62893d3d0e87 100644
--- a/clang/test/Driver/ppc-crbits.cpp
+++ b/clang/test/Driver/ppc-crbits.cpp
@@ -64,8 +64,6 @@
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
-// RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -emit-llvm \
-// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mno-crbits \
@@ -92,8 +90,6 @@
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
-// RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -emit-llvm \
-// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mno-crbits \
diff --git a/clang/test/Driver/ppc-isa-features.cpp b/clang/test/Driver/ppc-isa-features.cpp
index 92c5bc82f72b..35dbfbcdf569 100644
--- a/clang/test/Driver/ppc-isa-features.cpp
+++ b/clang/test/Driver/ppc-isa-features.cpp
@@ -5,20 +5,20 @@
 // RUN: %clang -target powerpc64-unknown-aix -mcpu=pwr9 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR9
 // RUN: %clang -target powerpc-unknown-aix -mcpu=pwr10 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR10
 
-// CHECK-PWR6: -isa-v206-instructions
-// CHECK-PWR6: -isa-v207-instructions
-// CHECK-PWR6: -isa-v30-instructions
+// CHECK-PWR6-NOT: isa-v206-instructions
+// CHECK-PWR6-NOT: isa-v207-instructions
+// CHECK-PWR6-NOT: isa-v30-instructions
 
-// CHECK-A2: +isa-v206-instructions
-// CHECK-A2: -isa-v207-instructions
-// CHECK-A2: -isa-v30-instructions
+// CHECK-A2:     +isa-v206-instructions
+// CHECK-A2-NOT: isa-v207-instructions
+// CHECK-A2-NOT: isa-v30-instructions
 
-// CHECK-PWR7: +isa-v206-instructions
-// CHECK-PWR7: -isa-v207-instructions
-// CHECK-PWR7: -isa-v30-instructions
+// CHECK-PWR7:     +isa-v206-instructions
+// CHECK-PWR7-NOT: isa-v207-instructions
+// CHECK-PWR7-NOT: isa-v30-instructions
 
-// CHECK-PWR8: +isa-v207-instructions
-// CHECK-PWR8: -isa-v30-instructions
+// CHECK-PWR8:     +isa-v207-instructions
+// CHECK-PWR8-NOT: isa-v30-instructions
 
 // CHECK-PWR9: +isa-v207-instructions
 // CHECK-PWR9: +isa-v30-instructions
diff --git a/llvm/include/llvm/TargetParser/CMakeLists.txt b/llvm/include/llvm/TargetParser/CMakeLists.txt
index b456da66a022..bb6d58d74a35 100644
--- a/llvm/include/llvm/TargetParser/CMakeLists.txt
+++ b/llvm/include/llvm/TargetParser/CMakeLists.txt
@@ -7,5 +7,8 @@ tablegen(LLVM AArch64TargetParserDef.inc -gen-arm-target-def -I ${PROJECT_SOURCE
 set(LLVM_TARGET_DEFINITIONS ${PROJECT_SOURCE_DIR}/lib/Target/RISCV/RISCV.td)
 tablegen(LLVM RISCVTargetParserDef.inc -gen-riscv-target-def -I ${PROJECT_SOURCE_DIR}/lib/Target/RISCV/)
 
+set(LLVM_TARGET_DEFINITIONS ${PROJECT_SOURCE_DIR}/lib/Target/PowerPC/PPC.td)
+tablegen(LLVM PPCGenTargetFeatures.inc -gen-target-features -I${PROJECT_SOURCE_DIR}/lib/Target/PowerPC)
+
 # This covers all of the tablegen calls above.
 add_public_tablegen_target(target_parser_gen)
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h
index 59d9f867005a..d3d44afb5f54 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.h
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TARGETPARSER_PPCTARGETPARSER_H
 #define LLVM_TARGETPARSER_PPCTARGETPARSER_H
 
+#include "TargetParser.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
@@ -37,6 +39,10 @@ LLVM_ABI StringRef getNormalizedPPCTuneCPU(const Triple &T,
 // For PPC, there are some cpu names for same CPU, like pwr10 and power10,
 // normalize them.
 LLVM_ABI StringRef normalizeCPUName(StringRef CPUName);
+
+LLVM_ABI std::optional<llvm::StringMap<bool>>
+getPPCDefaultTargetFeatures(const Triple &T, StringRef CPUName);
+
 } // namespace PPC
 } // namespace llvm
 
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index 176205e17ae0..b4a92cc6b6c4 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TARGETPARSER_TARGETPARSER_H
 #define LLVM_TARGETPARSER_TARGETPARSER_H
 
+#include "SubtargetFeature.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
@@ -190,6 +192,31 @@ insertWaveSizeFeature(StringRef GPU, const Triple &T,
                       StringMap<bool> &Features);
 
 } // namespace AMDGPU
+
+struct BasicSubtargetFeatureKV {
+  const char *Key;         ///< K-V key string
+  unsigned Value;          ///< K-V integer value
+  FeatureBitArray Implies; ///< K-V bit mask
+};
+
+/// Used to provide key value pairs for feature and CPU bit flags.
+struct BasicSubtargetSubTypeKV {
+  const char *Key;         ///< K-V key string
+  FeatureBitArray Implies; ///< K-V bit mask
+
+  /// Compare routine for std::lower_bound
+  bool operator<(StringRef S) const { return StringRef(Key) < S; }
+
+  /// Compare routine for std::is_sorted.
+  bool operator<(const BasicSubtargetSubTypeKV &Other) const {
+    return StringRef(Key) < StringRef(Other.Key);
+  }
+};
+
+std::optional<llvm::StringMap<bool>>
+getCPUDefaultTargetFeatures(StringRef CPU,
+                            ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
+                            ArrayRef<BasicSubtargetFeatureKV> ProcFeatures);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index fd850faf7b2f..ea7c2203662b 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -411,7 +411,6 @@ def ProcessorFeatures {
      FeatureP8Altivec,
      FeatureP8Vector,
      FeatureP8Crypto,
-     FeatureHTM,
      FeatureDirectMove,
      FeatureICBT,
      FeaturePartwordAtomic,
@@ -422,6 +421,7 @@ def ProcessorFeatures {
     ];
 
   list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion,
+                                               FeatureHTM,
                                                FeatureAddisLoadFusion];
   list<SubtargetFeature> P8InheritableFeatures =
     !listconcat(P7InheritableFeatures, P8AdditionalFeatures);
@@ -443,7 +443,7 @@ def ProcessorFeatures {
   // dispatch for vector operations than scalar ones. For the time being,
   // this list also includes scheduling-related features since we do not have
   // enough info to create custom scheduling strategies for future CPUs.
-  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits];
+  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits, FeatureHTM];
   list<SubtargetFeature> P9InheritableFeatures =
     !listconcat(P8InheritableFeatures, P9AdditionalFeatures);
   list<SubtargetFeature> P9Features =
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index 422d758c772e..d51044529a49 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -15,6 +15,10 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/TargetParser/Host.h"
 
+#define GET_SUBTARGETFEATURES_ENUM
+#define GET_SUBTARGETFEATURES_KV
+#include "llvm/TargetParser/PPCGenTargetFeatures.inc"
+
 namespace llvm {
 namespace PPC {
 
@@ -117,5 +121,26 @@ StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName) {
   return getNormalizedPPCTargetCPU(T, CPUName);
 }
 
+std::optional<StringMap<bool>> getPPCDefaultTargetFeatures(const Triple &T,
+                                                           StringRef CPU) {
+  std::optional<StringMap<bool>> FeaturesOpt =
+      getCPUDefaultTargetFeatures(CPU, BasicPPCSubTypeKV, BasicPPCFeatureKV);
+
+  if (!FeaturesOpt.has_value())
+    return std::nullopt;
+
+  StringMap<bool> Features = FeaturesOpt.value();
+  // FIXME: We need to check for the processor model 8548, since the backend
+  // does not support this processor. When this processor model is implemented
+  // within the backend, the following code can be removed.
+  if (CPU == "8548")
+    Features["spe"] = true;
+
+  // The target feature `quadword-atomics` is only supported for 64-bit
+  // POWER8 and above.
+  if (Features.find("quadword-atomics") != Features.end() && !T.isArch64Bit())
+    Features["quadword-atomics"] = false;
+  return Features;
+}
 } // namespace PPC
 } // namespace llvm
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 7c54901dae47..03f7d3899c2e 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -18,6 +18,53 @@
 using namespace llvm;
 using namespace AMDGPU;
 
+/// Find KV in array using binary search.
+static const BasicSubtargetSubTypeKV *
+find(StringRef S, ArrayRef<BasicSubtargetSubTypeKV> A) {
+  // Binary search the array
+  auto F = llvm::lower_bound(A, S);
+  // If not found then return NULL
+  if (F == A.end() || StringRef(F->Key) != S)
+    return nullptr;
+  // Return the found array item
+  return F;
+}
+
+/// For each feature that is (transitively) implied by this feature, set it.
+static void setImpliedBits(FeatureBitset &Bits, const FeatureBitset &Implies,
+                           ArrayRef<BasicSubtargetFeatureKV> FeatureTable) {
+  // OR the Implies bits in outside the loop. This allows the Implies for CPUs
+  // which might imply features not in FeatureTable to use this.
+  Bits |= Implies;
+  for (const auto &FE : FeatureTable)
+    if (Implies.test(FE.Value))
+      setImpliedBits(Bits, FE.Implies.getAsBitset(), FeatureTable);
+}
+
+std::optional<llvm::StringMap<bool>> llvm::getCPUDefaultTargetFeatures(
+    StringRef CPU, ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
+    ArrayRef<BasicSubtargetFeatureKV> ProcFeatures) {
+  if (CPU.empty())
+    return std::nullopt;
+
+  const BasicSubtargetSubTypeKV *CPUEntry = ::find(CPU, ProcDesc);
+  if (!CPUEntry)
+    return std::nullopt;
+
+  // Set the features implied by this CPU feature if there is a match.
+  FeatureBitset Bits;
+  llvm::StringMap<bool> DefaultFeatures;
+  setImpliedBits(Bits, CPUEntry->Implies.getAsBitset(), ProcFeatures);
+
+  unsigned BitSize = Bits.size();
+  for (const BasicSubtargetFeatureKV &FE : ProcFeatures) {
+    assert(FE.Value < BitSize && "Target Feature is out of range");
+    if (Bits[FE.Value])
+      DefaultFeatures[FE.Key] = true;
+  }
+  return DefaultFeatures;
+}
+
 namespace {
 
 struct GPUInfo {
diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt
index b058fba78eb0..1f4d3a7bc123 100644
--- a/llvm/utils/TableGen/Basic/CMakeLists.txt
+++ b/llvm/utils/TableGen/Basic/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLV
   RISCVTargetDefEmitter.cpp
   SDNodeProperties.cpp
   TableGen.cpp
+  TargetFeaturesEmitter.cpp
   VTEmitter.cpp
 )
 
diff --git a/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.cpp b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.cpp
new file mode 100644
index 000000000000..f016cc43c0e3
--- /dev/null
+++ b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.cpp
@@ -0,0 +1,178 @@
+//===- TargetFeaturesEmitter.cpp - Generate CPU Target feature ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This tablegen backend exports cpu target features
+//  and cpu sub-type.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TargetFeaturesEmitter.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
+
+using namespace llvm;
+
+using FeatureMapTy = DenseMap<const Record *, unsigned>;
+using ConstRecVec = std::vector<const Record *>;
+
+TargetFeaturesEmitter::TargetFeaturesEmitter(const RecordKeeper &R)
+    : Records(R) {
+  ArrayRef<const Record *> Targets = Records.getAllDerivedDefinitions("Target");
+  if (Targets.size() == 0)
+    PrintFatalError("No 'Target' subclasses defined!");
+  if (Targets.size() != 1)
+    PrintFatalError("Multiple subclasses of Target defined!");
+  Target = Targets[0]->getName();
+}
+
+FeatureMapTy TargetFeaturesEmitter::enumeration(raw_ostream &OS) {
+  ArrayRef<const Record *> DefList =
+      Records.getAllDerivedDefinitions("SubtargetFeature");
+
+  unsigned N = DefList.size();
+  if (N == 0)
+    return FeatureMapTy();
+
+  if (N + 1 > MAX_SUBTARGET_FEATURES)
+    PrintFatalError(
+        "Too many subtarget features! Bump MAX_SUBTARGET_FEATURES.");
+
+  OS << "namespace " << Target << " {\n";
+
+  OS << "enum {\n";
+
+  FeatureMapTy FeatureMap;
+  for (unsigned I = 0; I < N; ++I) {
+    const Record *Def = DefList[I];
+    // Print the Feature Name.
+    OS << "  " << Def->getName() << " = " << I << ",\n";
+
+    FeatureMap[Def] = I;
+  }
+
+  OS << "  " << "NumSubtargetFeatures = " << N << "\n";
+
+  // Close enumeration and namespace
+  OS << "};\n";
+  OS << "} // end namespace " << Target << "\n";
+  return FeatureMap;
+}
+
+void TargetFeaturesEmitter::printFeatureMask(
+    raw_ostream &OS, ArrayRef<const Record *> FeatureList,
+    const FeatureMapTy &FeatureMap) {
+  std::array<uint64_t, MAX_SUBTARGET_WORDS> Mask = {};
+  for (const Record *Feature : FeatureList) {
+    unsigned Bit = FeatureMap.lookup(Feature);
+    Mask[Bit / 64] |= 1ULL << (Bit % 64);
+  }
+
+  OS << "{ { { ";
+  for (unsigned I = 0; I != Mask.size(); ++I) {
+    OS << "0x";
+    OS.write_hex(Mask[I]);
+    OS << "ULL, ";
+  }
+  OS << "} } }";
+}
+
+void TargetFeaturesEmitter::printFeatureKeyValues(
+    raw_ostream &OS, const FeatureMapTy &FeatureMap) {
+  std::vector<const Record *> FeatureList =
+      Records.getAllDerivedDefinitions("SubtargetFeature");
+
+  // Remove features with empty name.
+  llvm::erase_if(FeatureList, [](const Record *Rec) {
+    return Rec->getValueAsString("Name").empty();
+  });
+
+  if (FeatureList.empty())
+    return;
+
+  llvm::sort(FeatureList, LessRecordFieldName());
+
+  // Begin feature table.
+  OS << "// Sorted (by key) array of values for CPU features.\n"
+     << "extern const llvm::BasicSubtargetFeatureKV " << "Basic" << Target
+     << "FeatureKV[] = {\n";
+
+  for (const Record *Feature : FeatureList) {
+    StringRef Name = Feature->getName();
+    StringRef ValueName = Feature->getValueAsString("Name");
+
+    OS << "  { " << "\"" << ValueName << "\", " << Target << "::" << Name
+       << ", ";
+
+    ConstRecVec ImpliesList = Feature->getValueAsListOfDefs("Implies");
+
+    printFeatureMask(OS, ImpliesList, FeatureMap);
+
+    OS << " },\n";
+  }
+
+  // End feature table.
+  OS << "};\n";
+
+  return;
+}
+
+void TargetFeaturesEmitter::printCPUKeyValues(raw_ostream &OS,
+                                              const FeatureMapTy &FeatureMap) {
+  // Gather and sort processor information
+  std::vector<const Record *> ProcessorList =
+      Records.getAllDerivedDefinitions("Processor");
+  llvm::sort(ProcessorList, LessRecordFieldName());
+
+  // Begin processor table.
+  OS << "// Sorted (by key) array of values for CPU subtype.\n"
+     << "extern const llvm::BasicSubtargetSubTypeKV " << "Basic" << Target
+     << "SubTypeKV[] = {\n";
+
+  for (const Record *Processor : ProcessorList) {
+    StringRef Name = Processor->getValueAsString("Name");
+    ConstRecVec FeatureList = Processor->getValueAsListOfDefs("Features");
+
+    OS << " { " << "\"" << Name << "\", ";
+
+    printFeatureMask(OS, FeatureList, FeatureMap);
+    OS << " },\n";
+  }
+
+  // End processor table.
+  OS << "};\n";
+
+  return;
+}
+
+void TargetFeaturesEmitter::run(raw_ostream &OS) {
+  OS << "// Autogenerated by TargetFeatureEmitter.cpp\n\n";
+
+  OS << "\n#ifdef GET_SUBTARGETFEATURES_ENUM\n";
+  OS << "#undef GET_SUBTARGETFEATURES_ENUM\n\n";
+
+  OS << "namespace llvm {\n";
+  auto FeatureMap = enumeration(OS);
+  OS << "} // end namespace llvm\n\n";
+  OS << "#endif // GET_SUBTARGETFEATURES_ENUM\n\n";
+
+  OS << "\n#ifdef GET_SUBTARGETFEATURES_KV\n";
+  OS << "#undef GET_SUBTARGETFEATURES_KV\n\n";
+
+  OS << "namespace llvm {\n";
+  printFeatureKeyValues(OS, FeatureMap);
+  OS << "\n";
+
+  printCPUKeyValues(OS, FeatureMap);
+  OS << "\n";
+  OS << "} // end namespace llvm\n\n";
+  OS << "#endif // GET_SUBTARGETFEATURES_KV\n\n";
+}
+
+static TableGen::Emitter::OptClass<TargetFeaturesEmitter>
+    X("gen-target-features", "Generate subtarget enumerations");
diff --git a/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h
new file mode 100644
index 000000000000..99e4820c614c
--- /dev/null
+++ b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h
@@ -0,0 +1,49 @@
+//===- TargetFeaturesEmitter.h- Generate CPU Target features ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the TargetFeaturesEmitter class, which is used to export
+// CPU target features and CPU subtypes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_TABLEGEN_BASIC_EMITTARGETFEATURE_H
+#define LLVM_UTILS_TABLEGEN_BASIC_EMITTARGETFEATURE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/TableGen/Record.h"
+
+namespace llvm {
+/// Sorting predicate to sort record pointers by their
+/// FieldName field.
+struct LessRecordFieldFieldName {
+  bool operator()(const Record *Rec1, const Record *Rec2) const {
+    return Rec1->getValueAsString("FieldName") <
+           Rec2->getValueAsString("FieldName");
+  }
+};
+
+using FeatureMapTy = DenseMap<const Record *, unsigned>;
+
+class TargetFeaturesEmitter {
+protected:
+  const RecordKeeper &Records;
+  std::string Target;
+
+public:
+  TargetFeaturesEmitter(const RecordKeeper &R);
+  static void printFeatureMask(raw_ostream &OS,
+                               ArrayRef<const Record *> FeatureList,
+                               const FeatureMapTy &FeatureMap);
+  FeatureMapTy enumeration(raw_ostream &OS);
+  void printFeatureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
+  void printCPUKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
+  virtual void run(raw_ostream &O);
+  virtual ~TargetFeaturesEmitter() {};
+};
+} // namespace llvm
+#endif
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index ca008e256a70..77618b24cf11 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -10,12 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "Basic/TargetFeaturesEmitter.h"
 #include "Common/CodeGenHwModes.h"
 #include "Common/CodeGenSchedule.h"
 #include "Common/CodeGenTarget.h"
 #include "Common/PredicateExpander.h"
 #include "Common/Utils.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringExtras.h"
@@ -27,9 +27,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TableGen/Error.h"
-#include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
-#include "llvm/TargetParser/SubtargetFeature.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -43,18 +41,7 @@ using namespace llvm;
 
 namespace {
 
-using FeatureMapTy = DenseMap<const Record *, unsigned>;
-
-/// Sorting predicate to sort record pointers by their
-/// FieldName field.
-struct LessRecordFieldFieldName {
-  bool operator()(const Record *Rec1, const Record *Rec2) const {
-    return Rec1->getValueAsString("FieldName") <
-           Rec2->getValueAsString("FieldName");
-  }
-};
-
-class SubtargetEmitter {
+class SubtargetEmitter : TargetFeaturesEmitter {
   // Each processor has a SchedClassDesc table with an entry for each
   // SchedClass. The SchedClassDesc table indexes into a global write resource
   // table, write latency table, and read advance table.
@@ -83,11 +70,8 @@ class SubtargetEmitter {
   };
 
   CodeGenTarget TGT;
-  const RecordKeeper &Records;
   CodeGenSchedModels &SchedModels;
-  std::string Target;
 
-  FeatureMapTy enumeration(raw_ostream &OS);
   void emitSubtargetInfoMacroCalls(raw_ostream &OS);
   unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
   unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
@@ -143,73 +127,13 @@ class SubtargetEmitter {
 
 public:
   SubtargetEmitter(const RecordKeeper &R)
-      : TGT(R), Records(R), SchedModels(TGT.getSchedModels()),
-        Target(TGT.getName()) {}
+      : TargetFeaturesEmitter(R), TGT(R), SchedModels(TGT.getSchedModels()) {}
 
-  void run(raw_ostream &O);
+  void run(raw_ostream &O) override;
 };
 
 } // end anonymous namespace
 
-//
-// Enumeration - Emit the specified class as an enumeration.
-//
-FeatureMapTy SubtargetEmitter::enumeration(raw_ostream &OS) {
-  ArrayRef<const Record *> DefList =
-      Records.getAllDerivedDefinitions("SubtargetFeature");
-
-  unsigned N = DefList.size();
-  if (N == 0)
-    return FeatureMapTy();
-  if (N + 1 > MAX_SUBTARGET_FEATURES)
-    PrintFatalError(
-        "Too many subtarget features! Bump MAX_SUBTARGET_FEATURES.");
-
-  OS << "namespace " << Target << " {\n";
-
-  // Open enumeration.
-  OS << "enum {\n";
-
-  FeatureMapTy FeatureMap;
-  // For each record
-  for (unsigned I = 0; I < N; ++I) {
-    // Next record
-    const Record *Def = DefList[I];
-
-    // Get and emit name
-    OS << "  " << Def->getName() << " = " << I << ",\n";
-
-    // Save the index for this feature.
-    FeatureMap[Def] = I;
-  }
-
-  OS << "  "
-     << "NumSubtargetFeatures = " << N << "\n";
-
-  // Close enumeration and namespace
-  OS << "};\n";
-  OS << "} // end namespace " << Target << "\n";
-  return FeatureMap;
-}
-
-static void printFeatureMask(raw_ostream &OS,
-                             ArrayRef<const Record *> FeatureList,
-                             const FeatureMapTy &FeatureMap) {
-  std::array<uint64_t, MAX_SUBTARGET_WORDS> Mask = {};
-  for (const Record *Feature : FeatureList) {
-    unsigned Bit = FeatureMap.lookup(Feature);
-    Mask[Bit / 64] |= 1ULL << (Bit % 64);
-  }
-
-  OS << "{ { { ";
-  for (unsigned I = 0; I != Mask.size(); ++I) {
-    OS << "0x";
-    OS.write_hex(Mask[I]);
-    OS << "ULL, ";
-  }
-  OS << "} } }";
-}
-
 /// Emit some information about the SubtargetFeature as calls to a macro so
 /// that they can be used from C++.
 void SubtargetEmitter::emitSubtargetInfoMacroCalls(raw_ostream &OS) {

From f87b6625d64c4ba95cf26b249ce6bdbcb31d65c9 Mon Sep 17 00:00:00 2001
From: Abdul Raheem <abdulraheembeigh@gmail.com>
Date: Thu, 19 Jun 2025 18:54:34 +0530
Subject: [PATCH 0938/1322] [MLIR][NFC] Fixed some Typos (#144263)

-- Fixed some typos in Operation.h

Signed-off: Abdul Raheem Beigh abdulraheembeigh@gmail.com
---
 mlir/include/mlir/IR/Operation.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h
index 68ab1527b480..1c2c04e718bf 100644
--- a/mlir/include/mlir/IR/Operation.h
+++ b/mlir/include/mlir/IR/Operation.h
@@ -30,7 +30,7 @@ enum class OpProperties : char {};
 
 /// Operation is the basic unit of execution within MLIR.
 ///
-/// The following documentation are recommended to understand this class:
+/// The following documentations are recommended to understand this class:
 /// - https://mlir.llvm.org/docs/LangRef/#operations
 /// - https://mlir.llvm.org/docs/Tutorials/UnderstandingTheIRStructure/
 ///
@@ -66,14 +66,14 @@ enum class OpProperties : char {};
 /// tail allocated with the operation class itself, but can be dynamically moved
 /// out-of-line in a dynamic allocation as needed.
 ///
-/// An Operation may contain optionally one or multiple Regions, stored in a
+/// An Operation may optionally contain one or multiple Regions, stored in a
 /// tail allocated array. Each `Region` is a list of Blocks. Each `Block` is
 /// itself a list of Operations. This structure is effectively forming a tree.
 ///
 /// Some operations like branches also refer to other Block, in which case they
 /// would have an array of `BlockOperand`.
 ///
-/// An Operation may contain optionally a "Properties" object: this is a
+/// An Operation may optionally contain a "Properties" object: this is a
 /// pre-defined C++ object with a fixed size. This object is owned by the
 /// operation and deleted with the operation. It can be converted to an
 /// Attribute on demand, or loaded from an Attribute.
@@ -286,7 +286,7 @@ public:
   void destroy();
 
   /// This drops all operand uses from this operation, which is an essential
-  /// step in breaking cyclic dependences between references when they are to
+  /// step in breaking cyclic dependencies between references when they are to
   /// be deleted.
   void dropAllReferences();
 
@@ -448,11 +448,11 @@ public:
   /// to use Properties instead.
   void setInherentAttr(StringAttr name, Attribute value);
 
-  /// Access a discardable attribute by name, returns an null Attribute if the
+  /// Access a discardable attribute by name, returns a null Attribute if the
   /// discardable attribute does not exist.
   Attribute getDiscardableAttr(StringRef name) { return attrs.get(name); }
 
-  /// Access a discardable attribute by name, returns an null Attribute if the
+  /// Access a discardable attribute by name, returns a null Attribute if the
   /// discardable attribute does not exist.
   Attribute getDiscardableAttr(StringAttr name) { return attrs.get(name); }
 
@@ -515,7 +515,7 @@ public:
   DictionaryAttr getAttrDictionary();
 
   /// Set the attributes from a dictionary on this operation.
-  /// These methods are expensive: if the dictionnary only contains discardable
+  /// These methods are expensive: if the dictionary only contains discardable
   /// attributes, `setDiscardableAttrs` is more efficient.
   void setAttrs(DictionaryAttr newAttrs);
   void setAttrs(ArrayRef<NamedAttribute> newAttrs);
@@ -529,7 +529,7 @@ public:
   }
 
   /// Return the specified attribute if present, null otherwise.
-  /// These methods are expensive: if the dictionnary only contains discardable
+  /// These methods are expensive: if the dictionary only contains discardable
   /// attributes, `getDiscardableAttr` is more efficient.
   Attribute getAttr(StringAttr name) {
     if (getPropertiesStorageSize()) {
@@ -950,7 +950,7 @@ private:
   /// operation.
   static constexpr unsigned kOrderStride = 5;
 
-  /// Update the order index of this operation of this operation if necessary,
+  /// Update the order index of this operation if necessary,
   /// potentially recomputing the order of the parent block.
   void updateOrderIfNecessary();
 

From e75e2485f2e5e627d0bdf0306df4672f69ddd6eb Mon Sep 17 00:00:00 2001
From: Brox Chen <guochen2@amd.com>
Date: Thu, 19 Jun 2025 09:26:45 -0400
Subject: [PATCH 0939/1322] [AMDGPU][True16][Codegen] keep srcmod/clamp/omod
 from v_s_xxx_f16 when moved to VALU (#144781)

https://github.com/llvm/llvm-project/pull/141152 causes an issue in
v_s_xxx_f16 lowering in both true16/fake16 flow.

V_S_XXX_F16 are special insts which has scalar input/output but in VALU
VOP3 format. Need to keep the srcmod/clamp/omod when lower it to its
corresponding VALU inst with vector input/output.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  9 ++-
 .../fix-sgpr-copies-f16-gfx12-fake16.mir      | 78 +++++++++++++++++++
 .../fix-sgpr-copies-f16-gfx12-true16.mir      | 78 +++++++++++++++++++
 ...-to-valu-pseudo-scalar-trans-f16-fake16.ll | 10 +--
 ...-to-valu-pseudo-scalar-trans-f16-true16.ll | 10 +--
 5 files changed, 171 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-fake16.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-true16.mir

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ebf8b99e9d7..a538ec9df6f0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7744,11 +7744,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
                                                     ? &AMDGPU::VGPR_16RegClass
                                                     : &AMDGPU::VGPR_32RegClass);
     auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
-                        .addImm(0) // src0_modifiers
+                        .add(Inst.getOperand(1)) // src0_modifiers
                         .add(Inst.getOperand(2))
-                        .addImm(0)  // clamp
-                        .addImm(0); // omod
-    if (ST.useRealTrue16Insts())
+                        .add(Inst.getOperand(3)) // clamp
+                        .add(Inst.getOperand(4)) // omod
+                        .setMIFlags(Inst.getFlags());
+    if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
       NewInstr.addImm(0); // opsel0
     MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
     legalizeOperandsVALUt16(*NewInstr, MRI);
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-fake16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-fake16.mir
new file mode 100644
index 000000000000..1ec7249476ec
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-fake16.mir
@@ -0,0 +1,78 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
+
+---
+name:            v_s_exp_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_exp_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_EXP_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
+---
+name:            v_s_log_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_log_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_LOG_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_LOG_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
+---
+name:            v_s_rcp_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_rcp_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_RCP_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
+---
+name:            v_s_rsq_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_rsq_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_RSQ_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
+---
+name:            v_s_sqrt_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_sqrt_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SQRT_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_SQRT_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-true16.mir
new file mode 100644
index 000000000000..5194d2529597
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-true16.mir
@@ -0,0 +1,78 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
+
+---
+name:            v_s_exp_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_exp_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_EXP_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_EXP_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
+---
+name:            v_s_log_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_log_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_LOG_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_LOG_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
+---
+name:            v_s_rcp_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_rcp_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RCP_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_RCP_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
+---
+name:            v_s_rsq_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_rsq_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RSQ_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_RSQ_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
+---
+name:            v_s_sqrt_f16
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: v_s_sqrt_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SQRT_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32_xexec = V_S_SQRT_F16_e64 1, %2:sreg_32, 1, 1, implicit $mode, implicit $exec
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
index a6819359561b..2870af19f94c 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
@@ -11,7 +11,7 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
   ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
@@ -32,7 +32,7 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
   ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
@@ -53,7 +53,7 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
   ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
@@ -74,7 +74,7 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
   ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
@@ -95,7 +95,7 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
   ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
index b1b5b6b62296..c93eb1d9ef14 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
-  ; CHECK-NEXT:   [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_EXP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_EXP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
@@ -35,7 +35,7 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
-  ; CHECK-NEXT:   [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_LOG_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_LOG_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
@@ -58,7 +58,7 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
-  ; CHECK-NEXT:   [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RCP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RCP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
@@ -81,7 +81,7 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
-  ; CHECK-NEXT:   [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RSQ_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RSQ_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
@@ -104,7 +104,7 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
   ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
-  ; CHECK-NEXT:   [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SQRT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_SQRT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]

From 278ece7c80d36bb1074fa53e655a5ca8f31145dd Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik@intel.com>
Date: Thu, 19 Jun 2025 06:29:05 -0700
Subject: [PATCH 0940/1322] [InstCombine][NFC] Pre-commit tests for #125935
 (#144111)

Pre-commit tests for #125935

---------

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
---
 .../InstCombine/bitcast-known-bits.ll         | 134 ++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/bitcast-known-bits.ll

diff --git a/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll b/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
new file mode 100644
index 000000000000..3e47e775e3a2
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; PR125228
+
+define <16 x i8> @knownbits_bitcast_masked_shift(<16 x i8> %arg1, <16 x i8> %arg2)  {
+; CHECK-LABEL: define <16 x i8> @knownbits_bitcast_masked_shift(
+; CHECK-SAME: <16 x i8> [[ARG1:%.*]], <16 x i8> [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[AND:%.*]] = and <16 x i8> [[ARG1]], splat (i8 3)
+; CHECK-NEXT:    [[AND3:%.*]] = and <16 x i8> [[ARG2]], splat (i8 48)
+; CHECK-NEXT:    [[OR:%.*]] = or disjoint <16 x i8> [[AND3]], [[AND]]
+; CHECK-NEXT:    [[BITCAST4:%.*]] = bitcast <16 x i8> [[OR]] to <8 x i16>
+; CHECK-NEXT:    [[SHL5:%.*]] = shl nuw <8 x i16> [[BITCAST4]], splat (i16 2)
+; CHECK-NEXT:    [[BITCAST6:%.*]] = bitcast <8 x i16> [[SHL5]] to <16 x i8>
+; CHECK-NEXT:    [[AND7:%.*]] = and <16 x i8> [[BITCAST6]], splat (i8 -52)
+; CHECK-NEXT:    ret <16 x i8> [[AND7]]
+;
+  %and = and <16 x i8> %arg1, splat (i8 3)
+  %and3 = and <16 x i8> %arg2, splat (i8 48)
+  %or = or disjoint <16 x i8> %and3, %and
+  %bitcast4 = bitcast <16 x i8> %or to <8 x i16>
+  %shl5 = shl nuw <8 x i16> %bitcast4, splat (i16 2)
+  %bitcast6 = bitcast <8 x i16> %shl5 to <16 x i8>
+  %and7 = and <16 x i8> %bitcast6, splat (i8 -52)
+  ret <16 x i8> %and7
+}
+
+define <16 x i8> @knownbits_shuffle_masked_nibble_shift(<16 x i8> %arg)  {
+; CHECK-LABEL: define <16 x i8> @knownbits_shuffle_masked_nibble_shift(
+; CHECK-SAME: <16 x i8> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[AND:%.*]] = and <16 x i8> [[ARG]], splat (i8 15)
+; CHECK-NEXT:    [[SHUFFLEVECTOR:%.*]] = shufflevector <16 x i8> [[AND]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <16 x i8> [[SHUFFLEVECTOR]] to <8 x i16>
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <8 x i16> [[BITCAST1]], splat (i16 4)
+; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast <8 x i16> [[SHL]] to <16 x i8>
+; CHECK-NEXT:    [[AND3:%.*]] = and <16 x i8> [[BITCAST2]], splat (i8 -16)
+; CHECK-NEXT:    ret <16 x i8> [[AND3]]
+;
+  %and = and <16 x i8> %arg, splat (i8 15)
+  %shufflevector = shufflevector <16 x i8> %and, <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %bitcast1 = bitcast <16 x i8> %shufflevector to <8 x i16>
+  %shl = shl nuw <8 x i16> %bitcast1, splat (i16 4)
+  %bitcast2 = bitcast <8 x i16> %shl to <16 x i8>
+  %and3 = and <16 x i8> %bitcast2, splat (i8 -16)
+  ret <16 x i8> %and3
+}
+
+define <16 x i8> @knownbits_reverse_shuffle_masked_shift(<16 x i8> %arg)  {
+; CHECK-LABEL: define <16 x i8> @knownbits_reverse_shuffle_masked_shift(
+; CHECK-SAME: <16 x i8> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[AND:%.*]] = and <16 x i8> [[ARG]], splat (i8 15)
+; CHECK-NEXT:    [[SHUFFLEVECTOR:%.*]] = shufflevector <16 x i8> [[AND]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <16 x i8> [[SHUFFLEVECTOR]] to <8 x i16>
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <8 x i16> [[BITCAST1]], splat (i16 4)
+; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast <8 x i16> [[SHL]] to <16 x i8>
+; CHECK-NEXT:    [[AND3:%.*]] = and <16 x i8> [[BITCAST2]], splat (i8 -16)
+; CHECK-NEXT:    ret <16 x i8> [[AND3]]
+;
+  %and = and <16 x i8> %arg, splat (i8 15)
+  %shufflevector = shufflevector <16 x i8> %and, <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  %bitcast1 = bitcast <16 x i8> %shufflevector to <8 x i16>
+  %shl = shl nuw <8 x i16> %bitcast1, splat (i16 4)
+  %bitcast2 = bitcast <8 x i16> %shl to <16 x i8>
+  %and3 = and <16 x i8> %bitcast2, splat (i8 -16)
+  ret <16 x i8> %and3
+}
+
+define <16 x i8> @knownbits_extract_bit(<8 x i16> %arg)  {
+; CHECK-LABEL: define <16 x i8> @knownbits_extract_bit(
+; CHECK-SAME: <8 x i16> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr <8 x i16> [[ARG]], splat (i16 15)
+; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <8 x i16> [[LSHR]] to <16 x i8>
+; CHECK-NEXT:    [[AND:%.*]] = and <16 x i8> [[BITCAST1]], splat (i8 1)
+; CHECK-NEXT:    ret <16 x i8> [[AND]]
+;
+  %lshr = lshr <8 x i16> %arg, splat (i16 15)
+  %bitcast1 = bitcast <8 x i16> %lshr to <16 x i8>
+  %and = and <16 x i8> %bitcast1, splat (i8 1)
+  ret <16 x i8> %and
+}
+
+define { i32, i1 } @knownbits_popcount_add_with_overflow(<2 x i64> %arg1, <2 x i64> %arg2)  {
+; CHECK-LABEL: define { i32, i1 } @knownbits_popcount_add_with_overflow(
+; CHECK-SAME: <2 x i64> [[ARG1:%.*]], <2 x i64> [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call range(i64 0, 65) <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[ARG1]])
+; CHECK-NEXT:    [[BITCAST5:%.*]] = bitcast <2 x i64> [[CALL]] to <4 x i32>
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x i32> [[BITCAST5]], i64 0
+; CHECK-NEXT:    [[CALL9:%.*]] = tail call range(i64 0, 65) <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[ARG2]])
+; CHECK-NEXT:    [[BITCAST10:%.*]] = bitcast <2 x i64> [[CALL9]] to <4 x i32>
+; CHECK-NEXT:    [[EXTRACTELEMENT11:%.*]] = extractelement <4 x i32> [[BITCAST10]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[EXTRACTELEMENT]], i32 [[EXTRACTELEMENT11]])
+; CHECK-NEXT:    ret { i32, i1 } [[TMP1]]
+;
+  %call = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %arg1)
+  %bitcast5 = bitcast <2 x i64> %call to <4 x i32>
+  %extractelement = extractelement <4 x i32> %bitcast5, i64 0
+  %call9 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %arg2)
+  %bitcast10 = bitcast <2 x i64> %call9 to <4 x i32>
+  %extractelement11 = extractelement <4 x i32> %bitcast10, i64 0
+  %call12 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %extractelement, i32 %extractelement11)
+  ret { i32, i1 } %call12
+}
+
+define <16 x i8> @knownbits_shuffle_add_shift_v32i8(<16 x i8> %arg1, <8 x i16> %arg2, <8 x i16> %arg3)  {
+; CHECK-LABEL: define <16 x i8> @knownbits_shuffle_add_shift_v32i8(
+; CHECK-SAME: <16 x i8> [[ARG1:%.*]], <8 x i16> [[ARG2:%.*]], <8 x i16> [[ARG3:%.*]]) {
+; CHECK-NEXT:    [[SHL6:%.*]] = shl <8 x i16> [[ARG2]], splat (i16 8)
+; CHECK-NEXT:    [[BITCAST7:%.*]] = bitcast <8 x i16> [[SHL6]] to <16 x i8>
+; CHECK-NEXT:    [[SHL10:%.*]] = shl <8 x i16> [[ARG3]], splat (i16 8)
+; CHECK-NEXT:    [[BITCAST11:%.*]] = bitcast <8 x i16> [[SHL10]] to <16 x i8>
+; CHECK-NEXT:    [[ADD12:%.*]] = add <16 x i8> [[BITCAST11]], [[BITCAST7]]
+; CHECK-NEXT:    [[ADD14:%.*]] = add <16 x i8> [[ADD12]], [[ARG1]]
+; CHECK-NEXT:    [[BITCAST14:%.*]] = bitcast <16 x i8> [[ADD12]] to <8 x i16>
+; CHECK-NEXT:    [[SHL15:%.*]] = shl <8 x i16> [[BITCAST14]], splat (i16 8)
+; CHECK-NEXT:    [[BITCAST16:%.*]] = bitcast <8 x i16> [[SHL15]] to <16 x i8>
+; CHECK-NEXT:    [[ADD13:%.*]] = add <16 x i8> [[ADD14]], [[BITCAST16]]
+; CHECK-NEXT:    ret <16 x i8> [[ADD13]]
+;
+  %shl6 = shl <8 x i16> %arg2, splat (i16 8)
+  %bitcast7 = bitcast <8 x i16> %shl6 to <16 x i8>
+  %shl10 = shl <8 x i16> %arg3, splat (i16 8)
+  %bitcast11 = bitcast <8 x i16> %shl10 to <16 x i8>
+  %add12 = add <16 x i8> %bitcast11, %bitcast7
+  %add13 = add <16 x i8> %add12, %arg1
+  %bitcast14 = bitcast <16 x i8> %add12 to <8 x i16>
+  %shl15 = shl <8 x i16> %bitcast14, splat (i16 8)
+  %bitcast16 = bitcast <8 x i16> %shl15 to <16 x i8>
+  %add17 = add <16 x i8> %add13, %bitcast16
+  ret <16 x i8> %add17
+}
+
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)

From a4e4527c4b44be9a88168c0a4758de58fd1a770d Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Thu, 19 Jun 2025 15:39:06 +0200
Subject: [PATCH 0941/1322] [mlir][Transforms] Fix replaceUsesOfBlockArgument
 API (#144706)

Before this PR, users had to pass the "old" block argument when
replacing the uses of a block argument in a newly converted block. Users
can now pass the actual block argument that should be replaced.

Note for LLVM integration: Make sure to pass the current block argument
instead of the old one.
---
 mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp   | 2 +-
 mlir/lib/Transforms/Utils/DialectConversion.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index 328c605add65..c6bcb593eaad 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -295,7 +295,7 @@ static void restoreByValRefArgumentType(
         cast<TypeAttr>(byValRefAttr->getValue()).getValue());
 
     auto valueArg = rewriter.create<LLVM::LoadOp>(arg.getLoc(), resTy, arg);
-    rewriter.replaceUsesOfBlockArgument(oldArg, valueArg);
+    rewriter.replaceUsesOfBlockArgument(arg, valueArg);
   }
 }
 
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index c4b85ec4f67d..3b669f51a615 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -1743,7 +1743,7 @@ void ConversionPatternRewriter::replaceUsesOfBlockArgument(BlockArgument from,
   });
   impl->appendRewrite<ReplaceBlockArgRewrite>(from.getOwner(), from,
                                               impl->currentTypeConverter);
-  impl->mapping.map(impl->mapping.lookupOrDefault(from), to);
+  impl->mapping.map(from, to);
 }
 
 Value ConversionPatternRewriter::getRemappedValue(Value key) {

From 802fa92aee3565768887615108aa3e924d4e0fc7 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 19 Jun 2025 16:17:54 +0200
Subject: [PATCH 0942/1322] [PredicateInfo] Avoid duplicate hash lookup (NFC)

Use try_emplace to either look up the existing entry or insert it.
---
 llvm/lib/Transforms/Utils/PredicateInfo.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index d4b2bcf40fd8..4c87babbfb6f 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -728,16 +728,12 @@ void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
 
 PredicateInfoBuilder::ValueInfo &
 PredicateInfoBuilder::getOrCreateValueInfo(Value *Operand) {
-  auto OIN = ValueInfoNums.find(Operand);
-  if (OIN == ValueInfoNums.end()) {
-    // This will grow it
+  auto Res = ValueInfoNums.try_emplace(Operand, ValueInfos.size());
+  if (Res.second) {
+    // Allocate space for new ValueInfo.
     ValueInfos.resize(ValueInfos.size() + 1);
-    // This will use the new size and give us a 0 based number of the info
-    auto InsertResult = ValueInfoNums.insert({Operand, ValueInfos.size() - 1});
-    assert(InsertResult.second && "Value info number already existed?");
-    return ValueInfos[InsertResult.first->second];
   }
-  return ValueInfos[OIN->second];
+  return ValueInfos[Res.first->second];
 }
 
 const PredicateInfoBuilder::ValueInfo &

From 7e8f1f5f72753a1d9d3ae2810da52b82a787600b Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Thu, 19 Jun 2025 14:23:23 +0000
Subject: [PATCH 0943/1322] [HIP] Remove dots in HIP runtime path (#143792)

Remove the dots in the HIP path before passing to the rpath flag
---
 clang/lib/Driver/ToolChains/Linux.cpp        | 8 +++++---
 clang/test/Driver/hip-runtime-libs-linux.hip | 6 ++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 151b2bfced81..8ac8d4eb9181 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -743,9 +743,11 @@ void Linux::AddHIPRuntimeLibArgs(const ArgList &Args,
       Args.MakeArgString(StringRef("-L") + RocmInstallation->getLibPath()));
 
   if (Args.hasFlag(options::OPT_frtlib_add_rpath,
-                   options::OPT_fno_rtlib_add_rpath, false))
-    CmdArgs.append(
-        {"-rpath", Args.MakeArgString(RocmInstallation->getLibPath())});
+                   options::OPT_fno_rtlib_add_rpath, false)) {
+    SmallString<0> p = RocmInstallation->getLibPath();
+    llvm::sys::path::remove_dots(p, true);
+    CmdArgs.append({"-rpath", Args.MakeArgString(p)});
+  }
 
   CmdArgs.push_back("-lamdhip64");
 }
diff --git a/clang/test/Driver/hip-runtime-libs-linux.hip b/clang/test/Driver/hip-runtime-libs-linux.hip
index a4cd2733114b..eda87d0aa4b6 100644
--- a/clang/test/Driver/hip-runtime-libs-linux.hip
+++ b/clang/test/Driver/hip-runtime-libs-linux.hip
@@ -20,6 +20,11 @@
 // RUN:   --rocm-path=%S/Inputs/rocm %t.o -frtlib-add-rpath 2>&1 \
 // RUN:   | FileCheck -check-prefixes=ROCM-RPATH %s
 
+// Test that a canonical HIP runtime path is passed to the -rpath flag
+// RUN: %clang -### --hip-link --target=x86_64-linux-gnu \
+// RUN:   --rocm-path=%S/Inputs/rocm/./bin/../include/../ %t.o -frtlib-add-rpath 2>&1 \
+// RUN:   | FileCheck -check-prefixes=ROCM-RPATH-CANONICAL %s
+
 // Test detecting latest /opt/rocm-{release} directory.
 // RUN: rm -rf %t && mkdir -p %t/opt
 // RUN: cp -r %S/Inputs/rocm %t/opt/rocm-3.9.0-1234
@@ -55,6 +60,7 @@
 
 // ROCM-PATH: "-L[[HIPRT:.*/Inputs/rocm/lib]]" "-lamdhip64"
 // ROCM-RPATH: "-L[[HIPRT:.*/Inputs/rocm/lib]]" "-rpath" "[[HIPRT]]" "-lamdhip64"
+// ROCM-RPATH-CANONICAL: "-rpath" "{{.*/rocm/lib}}" "-lamdhip64"
 // ROCM-REL: "-L[[HIPRT:.*/opt/rocm-3.10.0/lib]]" "-lamdhip64"
 // NOHIPRT-NOT: "-L{{.*/Inputs/rocm/lib}}"
 // NOHIPRT-NOT: "-rpath" "{{.*/Inputs/rocm/lib}}"

From 4c6f398b866030c17fd94dcdca04f4df03c5214c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Thu, 19 Jun 2025 09:23:57 -0500
Subject: [PATCH 0944/1322] [Clang] Add standalone AMDGPU SPIR-V toolchain
 (#144576)

Summary:
The AMDGPU toolchain uses a different set of tools than the standard
SPIR-V toolchain. The linker wrapper prefers to invoke a linker via a
clang toolchain. To make that work we introduce
`--target=spirv64-amd-amdhsa` so that it creates the linking phases that
HIP prefers. Additionally, this can be used to make LLVM-IR / SPIR-V
from C/C++ that can be linked with the HIP output.
---
 clang/lib/Driver/Driver.cpp             | 16 +++++++++++-----
 clang/lib/Driver/ToolChains/HIPAMD.cpp  | 12 ++++++++++++
 clang/lib/Driver/ToolChains/HIPAMD.h    |  9 +++++++++
 clang/test/Driver/spirv-amd-toolchain.c | 19 +++++++++++++++++++
 4 files changed, 51 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/Driver/spirv-amd-toolchain.c

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 2f86b6633df1..061d60c62903 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6857,11 +6857,17 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
       TC = std::make_unique<toolchains::NVPTXToolChain>(*this, Target, Args);
       break;
     case llvm::Triple::AMDHSA: {
-      bool DL =
-          usesInput(Args, types::isOpenCL) || usesInput(Args, types::isLLVMIR);
-      TC = DL ? std::make_unique<toolchains::ROCMToolChain>(*this, Target, Args)
-              : std::make_unique<toolchains::AMDGPUToolChain>(*this, Target,
-                                                              Args);
+      if (Target.getArch() == llvm::Triple::spirv64) {
+        TC = std::make_unique<toolchains::SPIRVAMDToolChain>(*this, Target,
+                                                             Args);
+      } else {
+        bool DL = usesInput(Args, types::isOpenCL) ||
+                  usesInput(Args, types::isLLVMIR);
+        TC = DL ? std::make_unique<toolchains::ROCMToolChain>(*this, Target,
+                                                              Args)
+                : std::make_unique<toolchains::AMDGPUToolChain>(*this, Target,
+                                                                Args);
+      }
       break;
     }
     case llvm::Triple::AMDPAL:
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index a8f2b09b1b20..74ac8306e7cc 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -417,3 +417,15 @@ void HIPAMDToolChain::checkTargetID(
     getDriver().Diag(clang::diag::err_drv_bad_target_id)
         << *PTID.OptionalTargetID;
 }
+
+SPIRVAMDToolChain::SPIRVAMDToolChain(const Driver &D,
+                                     const llvm::Triple &Triple,
+                                     const ArgList &Args)
+    : ROCMToolChain(D, Triple, Args) {
+  getProgramPaths().push_back(getDriver().Dir);
+}
+
+Tool *SPIRVAMDToolChain::buildLinker() const {
+  assert(getTriple().getArch() == llvm::Triple::spirv64);
+  return new tools::AMDGCN::Linker(*this);
+}
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.h b/clang/lib/Driver/ToolChains/HIPAMD.h
index c31894e22c5c..3630b11cd8b1 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.h
+++ b/clang/lib/Driver/ToolChains/HIPAMD.h
@@ -97,6 +97,15 @@ protected:
   Tool *buildLinker() const override;
 };
 
+class LLVM_LIBRARY_VISIBILITY SPIRVAMDToolChain final : public ROCMToolChain {
+public:
+  SPIRVAMDToolChain(const Driver &D, const llvm::Triple &Triple,
+                    const llvm::opt::ArgList &Args);
+
+protected:
+  Tool *buildLinker() const override;
+};
+
 } // end namespace toolchains
 } // end namespace driver
 } // end namespace clang
diff --git a/clang/test/Driver/spirv-amd-toolchain.c b/clang/test/Driver/spirv-amd-toolchain.c
new file mode 100644
index 000000000000..c9417400a937
--- /dev/null
+++ b/clang/test/Driver/spirv-amd-toolchain.c
@@ -0,0 +1,19 @@
+// RUN: %clang -### -ccc-print-phases --target=spirv64-amd-amdhsa %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=PHASES
+// PHASES: 0: input, "[[INPUT:.+]]", c
+// PHASES: 1: preprocessor, {0}, cpp-output
+// PHASES: 2: compiler, {1}, ir
+// PHASES: 3: backend, {2}, assembler
+// PHASES: 4: assembler, {3}, object
+// PHASES: 5: linker, {4}, image
+
+// RUN: %clang -### -ccc-print-bindings --target=spirv64-amd-amdhsa %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=BINDINGS
+// BINDINGS: # "spirv64-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[OUTPUT:.+]]"
+// BINDINGS: # "spirv64-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OUTPUT]]"], output: "a.out"
+
+// RUN: %clang -### --target=spirv64-amd-amdhsa %s -nogpulib -nogpuinc 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=INVOCATION
+// INVOCATION: "-cc1" "-triple" "spirv64-amd-amdhsa" {{.*}} "-o" "[[OUTPUT:.+]]" "-x" "c"
+// INVOCATION: "{{.*}}llvm-link" "-o" "a.out" "[[OUTPUT]]"
+// INVOCATION: "{{.*}}llvm-spirv" "--spirv-max-version=1.6" "--spirv-ext=+all" "--spirv-allow-unknown-intrinsics" "--spirv-lower-const-expr" "--spirv-preserve-auxdata" "--spirv-debug-info-version=nonsemantic-shader-200" "a.out" "-o" "a.out"

From 19360e62d0d1a1dabf9f01736927ab8f1b72c7df Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Thu, 19 Jun 2025 10:20:36 -0400
Subject: [PATCH 0945/1322] [gn build] port bf79d4819ede (ppc
 -gen-target-features)

---
 .../gn/secondary/llvm/include/llvm/TargetParser/BUILD.gn  | 8 ++++++++
 .../utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn | 1 +
 2 files changed, 9 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/TargetParser/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/TargetParser/BUILD.gn
index 455a8265fce8..c3cdf46ccfb9 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/TargetParser/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/TargetParser/BUILD.gn
@@ -14,6 +14,13 @@ tablegen("AArch64TargetParserDef") {
   tblgen_target = "//llvm/utils/TableGen:llvm-min-tblgen"
 }
 
+tablegen("PPCGenTargetFeatures") {
+  visibility = [ ":gen" ]
+  args = [ "-gen-target-features" ]
+  td_file = "//llvm/lib/Target/PowerPC/PPC.td"
+  tblgen_target = "//llvm/utils/TableGen:llvm-min-tblgen"
+}
+
 tablegen("RISCVTargetParserDef") {
   visibility = [ ":gen" ]
   args = [ "-gen-riscv-target-def" ]
@@ -25,6 +32,7 @@ group("gen") {
   deps = [
     ":AArch64TargetParserDef",
     ":ARMTargetParserDef",
+    ":PPCGenTargetFeatures",
     ":RISCVTargetParserDef",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
index ef6d6e44b6f8..5838118f2711 100644
--- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
@@ -12,6 +12,7 @@ source_set("Basic") {
     "RISCVTargetDefEmitter.cpp",
     "SDNodeProperties.cpp",
     "TableGen.cpp",
+    "TargetFeaturesEmitter.cpp",
     "VTEmitter.cpp",
   ]
 }

From 89efae916a5de0387710b7dc06938423817e1503 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles@arm.com>
Date: Thu, 19 Jun 2025 15:32:27 +0100
Subject: [PATCH 0946/1322] [Flang][OpenMP] Update default MapType for Map
 Clauses and OpenMP 5.2 (#144715)

In OpenMP 5.2, the `target enter data` and `target exit data` constructs
now have default map types if the user does not define them in the Map
clause. For `target enter data`, this is `to` and `target exit data`
this is `from`. This behaviour is now enabled when OpenMP 5.2 or greater
is used when compiling. To enable this, the default value is now set in
the `processMap` clause, with any previous behaviour being maintained
for either older versions of OpenMP or other directives.

See also #110008
---
 flang/lib/Lower/OpenMP/ClauseProcessor.cpp    | 17 +++++++++---
 flang/lib/Lower/OpenMP/ClauseProcessor.h      |  1 +
 flang/lib/Lower/OpenMP/Clauses.cpp            |  2 +-
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  5 ++--
 .../target-enter-data-default-openmp52.f90    | 27 +++++++++++++++++++
 5 files changed, 45 insertions(+), 7 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/target-enter-data-default-openmp52.f90

diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index bc8fc14bcaeb..7bea427099a2 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -1269,7 +1269,7 @@ void ClauseProcessor::processMapObjects(
 
 bool ClauseProcessor::processMap(
     mlir::Location currentLocation, lower::StatementContext &stmtCtx,
-    mlir::omp::MapClauseOps &result,
+    mlir::omp::MapClauseOps &result, llvm::omp::Directive directive,
     llvm::SmallVectorImpl<const semantics::Symbol *> *mapSyms) const {
   // We always require tracking of symbols, even if the caller does not,
   // so we create an optionally used local set of symbols when the mapSyms
@@ -1287,9 +1287,18 @@ bool ClauseProcessor::processMap(
     llvm::omp::OpenMPOffloadMappingFlags mapTypeBits =
         llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE;
     std::string mapperIdName = "__implicit_mapper";
-    // If the map type is specified, then process it else Tofrom is the
-    // default.
-    Map::MapType type = mapType.value_or(Map::MapType::Tofrom);
+    // If the map type is specified, then process it else set the appropriate
+    // default value
+    Map::MapType type;
+    if (directive == llvm::omp::Directive::OMPD_target_enter_data &&
+        semaCtx.langOptions().OpenMPVersion >= 52)
+      type = mapType.value_or(Map::MapType::To);
+    else if (directive == llvm::omp::Directive::OMPD_target_exit_data &&
+             semaCtx.langOptions().OpenMPVersion >= 52)
+      type = mapType.value_or(Map::MapType::From);
+    else
+      type = mapType.value_or(Map::MapType::Tofrom);
+
     switch (type) {
     case Map::MapType::To:
       mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO;
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index c957a94d387e..3d8c4a337a4a 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -139,6 +139,7 @@ public:
   bool processMap(mlir::Location currentLocation,
                   lower::StatementContext &stmtCtx,
                   mlir::omp::MapClauseOps &result,
+                  llvm::omp::Directive directive = llvm::omp::OMPD_unknown,
                   llvm::SmallVectorImpl<const semantics::Symbol *> *mapSyms =
                       nullptr) const;
   bool processMotionClauses(lower::StatementContext &stmtCtx,
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index c0c57d1832d4..b599d69a3627 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -1043,7 +1043,7 @@ Map make(const parser::OmpClause::Map &inp,
   auto type = [&]() -> std::optional<Map::MapType> {
     if (t3)
       return convert1(t3->v);
-    return Map::MapType::Tofrom;
+    return std::nullopt;
   }();
 
   Map::MapTypeModifiers typeMods;
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 7ad886959727..2595a08f626e 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1831,7 +1831,8 @@ static void genTargetClauses(
   }
   cp.processIf(llvm::omp::Directive::OMPD_target, clauseOps);
   cp.processIsDevicePtr(clauseOps, isDevicePtrSyms);
-  cp.processMap(loc, stmtCtx, clauseOps, &mapSyms);
+  cp.processMap(loc, stmtCtx, clauseOps, llvm::omp::Directive::OMPD_unknown,
+                &mapSyms);
   cp.processNowait(clauseOps);
   cp.processThreadLimit(stmtCtx, clauseOps);
 
@@ -1884,7 +1885,7 @@ static void genTargetEnterExitUpdateDataClauses(
   if (directive == llvm::omp::Directive::OMPD_target_update)
     cp.processMotionClauses(stmtCtx, clauseOps);
   else
-    cp.processMap(loc, stmtCtx, clauseOps);
+    cp.processMap(loc, stmtCtx, clauseOps, directive);
 
   cp.processNowait(clauseOps);
 }
diff --git a/flang/test/Lower/OpenMP/target-enter-data-default-openmp52.f90 b/flang/test/Lower/OpenMP/target-enter-data-default-openmp52.f90
new file mode 100644
index 000000000000..0d4fd964b71e
--- /dev/null
+++ b/flang/test/Lower/OpenMP/target-enter-data-default-openmp52.f90
@@ -0,0 +1,27 @@
+! This test checks the lowering and application of default map types for the target enter/exit data constructs and map clauses
+
+!RUN: %flang -fc1 -emit-fir -fopenmp -fopenmp-version=52 -o - %s | FileCheck %s --check-prefix=CHECK-52
+!RUN: not %flang -fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1| FileCheck %s --check-prefix=CHECK-51
+
+module test
+  real, allocatable :: A
+
+contains
+  subroutine initialize()
+  allocate(A)
+  !$omp target enter data map(A)
+  !CHECK-52: omp.map.info var_ptr(%2 : !fir.ref<!fir.box<!fir.heap<f32>>>, f32) map_clauses(to) capture(ByRef) var_ptr_ptr(%5 : !fir.llvm_ptr<!fir.ref<f32>>) -> !fir.llvm_ptr<!fir.ref<f32>> {name = ""}
+  !CHECK-52: omp.map.info var_ptr(%2 : !fir.ref<!fir.box<!fir.heap<f32>>>, !fir.box<!fir.heap<f32>>) map_clauses(to) capture(ByRef) members(%6 : [0] : !fir.llvm_ptr<!fir.ref<f32>>) -> !fir.ref<!fir.box<!fir.heap<f32>>> {name = "a"}
+  !CHECK-51: to and alloc map types are permitted
+
+  end subroutine initialize
+
+  subroutine finalize()
+  !$omp target exit data map(A)
+  !CHECK-52: omp.map.info var_ptr(%2 : !fir.ref<!fir.box<!fir.heap<f32>>>, f32) map_clauses(from) capture(ByRef) var_ptr_ptr(%3 : !fir.llvm_ptr<!fir.ref<f32>>) -> !fir.llvm_ptr<!fir.ref<f32>> {name = ""}
+  !CHECK-52: omp.map.info var_ptr(%2 : !fir.ref<!fir.box<!fir.heap<f32>>>, !fir.box<!fir.heap<f32>>) map_clauses(from) capture(ByRef) members(%4 : [0] : !fir.llvm_ptr<!fir.ref<f32>>) -> !fir.ref<!fir.box<!fir.heap<f32>>> {name = "a"}
+  !CHECK-51: from, release and delete map types are permitted
+  deallocate(A)
+  
+  end subroutine finalize
+end module test

From eb694b28461fdbd5e347fca59829e8a9ad021773 Mon Sep 17 00:00:00 2001
From: Tobias Gysi <tobias.gysi@nextsilicon.com>
Date: Thu, 19 Jun 2025 16:32:48 +0200
Subject: [PATCH 0947/1322] [mlir][arith] Delete mul ext canonicalizations
 (#144844)

The Arith dialect includes patterns that canonicalize a sequence of:

- trunci(shrui(mul(sext(x), sext(y)), c)) -> mulsi_extended(x, y)
- trunci(shrui(mul(zext(x), zext(y)), c)) -> mului_extended(x, y)

These patterns return the high word of an extended multiplication, which
assumes that the shift amount is equal to the bit width of the original
operands. This check was missing, leading to incorrect canonicalizations
when the shift amount was less than the bit width.

For example, the following code:
```
  %x = arith.extui %a: i32 to i33
  %y = arith.extui %b: i32 to i33
  %m = arith.muli %x, %y: i33
  %c1 = arith.constant 1: i33
  %sh = arith.shrui %m, %c1 : i33
  %hi = arith.trunci %sh: i33 to i32
```
would incorrectly be canonicalized to:
```
_, %hi = arith.mului_extended %a, %b : i32
```
This commit removes the faulty canonicalizations since they are not
believed to be generally beneficial (c.f., the discussion of the
alternative https://github.com/llvm/llvm-project/pull/144787 which fixes
the canonicalizations).
---
 .../Dialect/Arith/IR/ArithCanonicalization.td |  28 +----
 mlir/lib/Dialect/Arith/IR/ArithOps.cpp        |   6 +-
 mlir/test/Dialect/Arith/canonicalize.mlir     | 114 +-----------------
 3 files changed, 5 insertions(+), 143 deletions(-)

diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
index b61612436eb7..64eccc76a664 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
+++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
@@ -273,7 +273,7 @@ def RedundantSelectFalse :
     Pat<(SelectOp $pred, $a, (SelectOp $pred, $b, $c)),
         (SelectOp $pred, $a, $c)>;
 
-// select(pred, false, true) => not(pred) 
+// select(pred, false, true) => not(pred)
 def SelectI1ToNot :
     Pat<(SelectOp $pred,
                   (ConstantLikeMatcher ConstantAttr<I1Attr, "0">),
@@ -361,10 +361,6 @@ def OrOfExtSI :
 // TruncIOp
 //===----------------------------------------------------------------------===//
 
-def ValuesWithSameType :
-    Constraint<
-      CPred<"llvm::all_equal({$0.getType(), $1.getType(), $2.getType()})">>;
-
 def ValueWiderThan :
     Constraint<And<[
       CPred<"getScalarOrElementWidth($0) > getScalarOrElementWidth($1)">,
@@ -397,28 +393,6 @@ def TruncIShrSIToTrunciShrUI :
         (Arith_TruncIOp (Arith_ShRUIOp $x, (Arith_ConstantOp (cast<"TypedAttr"> $c0))), $overflow),
         [(TruncationMatchesShiftAmount $x, $tr, $c0)]>;
 
-// trunci(shrui(mul(sext(x), sext(y)), c)) -> mulsi_extended(x, y)
-def TruncIShrUIMulIToMulSIExtended :
-    Pat<(Arith_TruncIOp:$tr (Arith_ShRUIOp
-                              (Arith_MulIOp:$mul
-                                (Arith_ExtSIOp $x), (Arith_ExtSIOp $y), $ovf1),
-                              (ConstantLikeMatcher AnyAttr:$c0)), $overflow),
-        (Arith_MulSIExtendedOp:$res__1 $x, $y),
-      [(ValuesWithSameType $tr, $x, $y),
-       (ValueWiderThan $mul, $x),
-       (TruncationMatchesShiftAmount $mul, $x, $c0)]>;
-
-// trunci(shrui(mul(zext(x), zext(y)), c)) -> mului_extended(x, y)
-def TruncIShrUIMulIToMulUIExtended :
-    Pat<(Arith_TruncIOp:$tr (Arith_ShRUIOp
-                              (Arith_MulIOp:$mul
-                                (Arith_ExtUIOp $x), (Arith_ExtUIOp $y), $ovf1),
-                              (ConstantLikeMatcher AnyAttr:$c0)), $overflow),
-        (Arith_MulUIExtendedOp:$res__1 $x, $y),
-      [(ValuesWithSameType $tr, $x, $y),
-       (ValueWiderThan $mul, $x),
-       (TruncationMatchesShiftAmount $mul, $x, $c0)]>;
-
 //===----------------------------------------------------------------------===//
 // TruncIOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index 9e53e195274a..12a1e18597ac 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -1509,9 +1509,9 @@ bool arith::TruncIOp::areCastCompatible(TypeRange inputs, TypeRange outputs) {
 
 void arith::TruncIOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
                                                   MLIRContext *context) {
-  patterns.add<TruncIExtSIToExtSI, TruncIExtUIToExtUI, TruncIShrSIToTrunciShrUI,
-               TruncIShrUIMulIToMulSIExtended, TruncIShrUIMulIToMulUIExtended>(
-      context);
+  patterns
+      .add<TruncIExtSIToExtSI, TruncIExtUIToExtUI, TruncIShrSIToTrunciShrUI>(
+          context);
 }
 
 LogicalResult arith::TruncIOp::verify() {
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index b6188c81ff91..076f3a99cd39 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -1000,7 +1000,7 @@ func.func @tripleAddAddOvf2(%arg0: index) -> index {
 
 
 // CHECK-LABEL: @foldSubXX_tensor
-//       CHECK:   %[[c0:.+]] = arith.constant dense<0> : tensor<10xi32> 
+//       CHECK:   %[[c0:.+]] = arith.constant dense<0> : tensor<10xi32>
 //       CHECK:   %[[sub:.+]] = arith.subi
 //       CHECK:   return %[[c0]], %[[sub]]
 func.func @foldSubXX_tensor(%static : tensor<10xi32>, %dyn : tensor<?x?xi32>) -> (tensor<10xi32>, tensor<?x?xi32>) {
@@ -2952,118 +2952,6 @@ func.func @truncIShrSIToTrunciShrUIBadShiftAmt2(%a: i64) -> i32 {
   return %hi : i32
 }
 
-// CHECK-LABEL: @wideMulToMulSIExtended
-//  CHECK-SAME:   (%[[A:.+]]: i32, %[[B:.+]]: i32)
-//  CHECK-NEXT:   %[[LOW:.+]], %[[HIGH:.+]] = arith.mulsi_extended %[[A]], %[[B]] : i32
-//  CHECK-NEXT:   return %[[HIGH]] : i32
-func.func @wideMulToMulSIExtended(%a: i32, %b: i32) -> i32 {
-  %x = arith.extsi %a: i32 to i64
-  %y = arith.extsi %b: i32 to i64
-  %m = arith.muli %x, %y: i64
-  %c32 = arith.constant 32: i64
-  %sh = arith.shrui %m, %c32 : i64
-  %hi = arith.trunci %sh: i64 to i32
-  return %hi : i32
-}
-
-// CHECK-LABEL: @wideMulToMulSIExtendedVector
-//  CHECK-SAME:   (%[[A:.+]]: vector<3xi32>, %[[B:.+]]: vector<3xi32>)
-//  CHECK-NEXT:   %[[LOW:.+]], %[[HIGH:.+]] = arith.mulsi_extended %[[A]], %[[B]] : vector<3xi32>
-//  CHECK-NEXT:   return %[[HIGH]] : vector<3xi32>
-func.func @wideMulToMulSIExtendedVector(%a: vector<3xi32>, %b: vector<3xi32>) -> vector<3xi32> {
-  %x = arith.extsi %a: vector<3xi32> to vector<3xi64>
-  %y = arith.extsi %b: vector<3xi32> to vector<3xi64>
-  %m = arith.muli %x, %y: vector<3xi64>
-  %c32 = arith.constant dense<32>: vector<3xi64>
-  %sh = arith.shrui %m, %c32 : vector<3xi64>
-  %hi = arith.trunci %sh: vector<3xi64> to vector<3xi32>
-  return %hi : vector<3xi32>
-}
-
-// CHECK-LABEL: @wideMulToMulUIExtended
-//  CHECK-SAME:   (%[[A:.+]]: i32, %[[B:.+]]: i32)
-//  CHECK-NEXT:   %[[LOW:.+]], %[[HIGH:.+]] = arith.mului_extended %[[A]], %[[B]] : i32
-//  CHECK-NEXT:   return %[[HIGH]] : i32
-func.func @wideMulToMulUIExtended(%a: i32, %b: i32) -> i32 {
-  %x = arith.extui %a: i32 to i64
-  %y = arith.extui %b: i32 to i64
-  %m = arith.muli %x, %y: i64
-  %c32 = arith.constant 32: i64
-  %sh = arith.shrui %m, %c32 : i64
-  %hi = arith.trunci %sh: i64 to i32
-  return %hi : i32
-}
-
-// CHECK-LABEL: @wideMulToMulUIExtendedVector
-//  CHECK-SAME:   (%[[A:.+]]: vector<3xi32>, %[[B:.+]]: vector<3xi32>)
-//  CHECK-NEXT:   %[[LOW:.+]], %[[HIGH:.+]] = arith.mului_extended %[[A]], %[[B]] : vector<3xi32>
-//  CHECK-NEXT:   return %[[HIGH]] : vector<3xi32>
-func.func @wideMulToMulUIExtendedVector(%a: vector<3xi32>, %b: vector<3xi32>) -> vector<3xi32> {
-  %x = arith.extui %a: vector<3xi32> to vector<3xi64>
-  %y = arith.extui %b: vector<3xi32> to vector<3xi64>
-  %m = arith.muli %x, %y: vector<3xi64>
-  %c32 = arith.constant dense<32>: vector<3xi64>
-  %sh = arith.shrui %m, %c32 : vector<3xi64>
-  %hi = arith.trunci %sh: vector<3xi64> to vector<3xi32>
-  return %hi : vector<3xi32>
-}
-
-// CHECK-LABEL: @wideMulToMulIExtendedMixedExt
-//       CHECK:   arith.muli
-//       CHECK:   arith.shrui
-//       CHECK:   arith.trunci
-func.func @wideMulToMulIExtendedMixedExt(%a: i32, %b: i32) -> i32 {
-  %x = arith.extsi %a: i32 to i64
-  %y = arith.extui %b: i32 to i64
-  %m = arith.muli %x, %y: i64
-  %c32 = arith.constant 32: i64
-  %sh = arith.shrui %m, %c32 : i64
-  %hi = arith.trunci %sh: i64 to i32
-  return %hi : i32
-}
-
-// CHECK-LABEL: @wideMulToMulSIExtendedBadExt
-//       CHECK:   arith.muli
-//       CHECK:   arith.shrui
-//       CHECK:   arith.trunci
-func.func @wideMulToMulSIExtendedBadExt(%a: i16, %b: i16) -> i32 {
-  %x = arith.extsi %a: i16 to i64
-  %y = arith.extsi %b: i16 to i64
-  %m = arith.muli %x, %y: i64
-  %c32 = arith.constant 32: i64
-  %sh = arith.shrui %m, %c32 : i64
-  %hi = arith.trunci %sh: i64 to i32
-  return %hi : i32
-}
-
-// CHECK-LABEL: @wideMulToMulSIExtendedBadShift1
-//       CHECK:   arith.muli
-//       CHECK:   arith.shrui
-//       CHECK:   arith.trunci
-func.func @wideMulToMulSIExtendedBadShift1(%a: i32, %b: i32) -> i32 {
-  %x = arith.extsi %a: i32 to i64
-  %y = arith.extsi %b: i32 to i64
-  %m = arith.muli %x, %y: i64
-  %c33 = arith.constant 33: i64
-  %sh = arith.shrui %m, %c33 : i64
-  %hi = arith.trunci %sh: i64 to i32
-  return %hi : i32
-}
-
-// CHECK-LABEL: @wideMulToMulSIExtendedBadShift2
-//       CHECK:   arith.muli
-//       CHECK:   arith.shrui
-//       CHECK:   arith.trunci
-func.func @wideMulToMulSIExtendedBadShift2(%a: i32, %b: i32) -> i32 {
-  %x = arith.extsi %a: i32 to i64
-  %y = arith.extsi %b: i32 to i64
-  %m = arith.muli %x, %y: i64
-  %c31 = arith.constant 31: i64
-  %sh = arith.shrui %m, %c31 : i64
-  %hi = arith.trunci %sh: i64 to i32
-  return %hi : i32
-}
-
 // CHECK-LABEL: @foldShli0
 // CHECK-SAME: (%[[ARG:.*]]: i64)
 //       CHECK:   return %[[ARG]] : i64

From 3516ad05dfd674d731487cb67bbfe48f7e1fda9c Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail@igalia.com>
Date: Thu, 19 Jun 2025 11:42:50 -0300
Subject: [PATCH 0948/1322] [RISCV] Update SpacemiT X60 scheduling latencies
 based on hardware measurements (#144730)

This patch updates the RISC-V SpacemiT X60 scheduling model with latency
values collected from the X60 hardware. The previous values were
empirically derived but were slightly off.

  Changes:
  - LoadLatency (baseline for load instructions): 5 --> 3 cycles
  - Memory operations: unified at 4 cycles
  - Atomic loads/stores: 5 --> 8 cycles
  - Atomic RMW operations: 5 --> 12 cycles

Hardware-measured values provide more accurate instruction scheduling
for the in-order X60 core. Testing shows NFC across benchmarks except
for 523.xalancbmk_r (known to be noisy).

https://lnt.lukelau.me/db_default/v4/nts/663?compare_to=657
---
 .../lib/Target/RISCV/RISCVSchedSpacemitX60.td |  15 +-
 .../tools/llvm-mca/RISCV/SpacemitX60/atomic.s | 176 +++++++++---------
 .../RISCV/SpacemitX60/floating-point.s        |  12 +-
 .../llvm-mca/RISCV/SpacemitX60/integer.s      |  22 +--
 4 files changed, 113 insertions(+), 112 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
index 8948694c420a..9059d5a4e497 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
@@ -16,7 +16,7 @@
 def SpacemitX60Model : SchedMachineModel {
   let IssueWidth        = 2; // dual-issue
   let MicroOpBufferSize = 0; // in-order
-  let LoadLatency       = 5; // worse case: >= 3
+  let LoadLatency       = 3; // worse case: >= 3
   let MispredictPenalty = 9; // nine-stage
 
   let CompleteModel = 0;
@@ -114,7 +114,7 @@ def : WriteRes<WriteBEXT, [SMX60_IEU]>;
 def : WriteRes<WriteBEXTI, [SMX60_IEU]>;
 
 // Memory/Atomic memory
-let Latency = 3 in {
+let Latency = 4 in {
   def : WriteRes<WriteSTB, [SMX60_LS]>;
   def : WriteRes<WriteSTH, [SMX60_LS]>;
   def : WriteRes<WriteSTW, [SMX60_LS]>;
@@ -122,11 +122,7 @@ let Latency = 3 in {
   def : WriteRes<WriteFST16, [SMX60_LS]>;
   def : WriteRes<WriteFST32, [SMX60_LS]>;
   def : WriteRes<WriteFST64, [SMX60_LS]>;
-  def : WriteRes<WriteAtomicSTW, [SMX60_LS]>;
-  def : WriteRes<WriteAtomicSTD, [SMX60_LS]>;
-}
 
-let Latency = 5 in {
   def : WriteRes<WriteLDB, [SMX60_LS]>;
   def : WriteRes<WriteLDH, [SMX60_LS]>;
   def : WriteRes<WriteLDW, [SMX60_LS]>;
@@ -137,9 +133,14 @@ let Latency = 5 in {
 }
 
 // Atomics
-let Latency = 5 in {
+let Latency = 8 in {
+  def : WriteRes<WriteAtomicSTW, [SMX60_LS]>;
+  def : WriteRes<WriteAtomicSTD, [SMX60_LS]>;
   def : WriteRes<WriteAtomicLDW, [SMX60_LS]>;
   def : WriteRes<WriteAtomicLDD, [SMX60_LS]>;
+}
+
+let Latency = 12 in {
   def : WriteRes<WriteAtomicW, [SMX60_LS]>;
   def : WriteRes<WriteAtomicD, [SMX60_LS]>;
 }
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s
index ceab015e2720..bc9229471b20 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s
@@ -120,94 +120,94 @@ amomaxu.d.aqrl s5, s4, (s3)
 # CHECK-NEXT: [9]: LLVM Opcode Name
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LR_W                       lr.w	t0, (t1)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LR_W_AQ                    lr.w.aq	t1, (t2)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LR_W_RL                    lr.w.rl	t2, (t3)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LR_W_AQ_RL                 lr.w.aqrl	t3, (t4)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SC_W                       sc.w	t6, t5, (t4)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SC_W_AQ                    sc.w.aq	t5, t4, (t3)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SC_W_RL                    sc.w.rl	t4, t3, (t2)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SC_W_AQ_RL                 sc.w.aqrl	t3, t2, (t1)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LR_D                       lr.d	t0, (t1)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LR_D_AQ                    lr.d.aq	t1, (t2)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LR_D_RL                    lr.d.rl	t2, (t3)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LR_D_AQ_RL                 lr.d.aqrl	t3, (t4)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SC_D                       sc.d	t6, t5, (t4)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SC_D_AQ                    sc.d.aq	t5, t4, (t3)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SC_D_RL                    sc.d.rl	t4, t3, (t2)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SC_D_AQ_RL                 sc.d.aqrl	t3, t2, (t1)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOSWAP_W                  amoswap.w	a4, ra, (s0)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOADD_W                   amoadd.w	a1, a2, (a3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOXOR_W                   amoxor.w	a2, a3, (a4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOAND_W                   amoand.w	a3, a4, (a5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOOR_W                    amoor.w	a4, a5, (a6)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMIN_W                   amomin.w	a5, a6, (a7)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAX_W                   amomax.w	s7, s6, (s5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMINU_W                  amominu.w	s6, s5, (s4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAXU_W                  amomaxu.w	s5, s4, (s3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOSWAP_W_AQ               amoswap.w.aq	a4, ra, (s0)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOADD_W_AQ                amoadd.w.aq	a1, a2, (a3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOXOR_W_AQ                amoxor.w.aq	a2, a3, (a4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOAND_W_AQ                amoand.w.aq	a3, a4, (a5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOOR_W_AQ                 amoor.w.aq	a4, a5, (a6)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMIN_W_AQ                amomin.w.aq	a5, a6, (a7)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAX_W_AQ                amomax.w.aq	s7, s6, (s5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMINU_W_AQ               amominu.w.aq	s6, s5, (s4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAXU_W_AQ               amomaxu.w.aq	s5, s4, (s3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOSWAP_W_RL               amoswap.w.rl	a4, ra, (s0)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOADD_W_RL                amoadd.w.rl	a1, a2, (a3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOXOR_W_RL                amoxor.w.rl	a2, a3, (a4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOAND_W_RL                amoand.w.rl	a3, a4, (a5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOOR_W_RL                 amoor.w.rl	a4, a5, (a6)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMIN_W_RL                amomin.w.rl	a5, a6, (a7)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAX_W_RL                amomax.w.rl	s7, s6, (s5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMINU_W_RL               amominu.w.rl	s6, s5, (s4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAXU_W_RL               amomaxu.w.rl	s5, s4, (s3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOSWAP_W_AQ_RL            amoswap.w.aqrl	a4, ra, (s0)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOADD_W_AQ_RL             amoadd.w.aqrl	a1, a2, (a3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOXOR_W_AQ_RL             amoxor.w.aqrl	a2, a3, (a4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOAND_W_AQ_RL             amoand.w.aqrl	a3, a4, (a5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOOR_W_AQ_RL              amoor.w.aqrl	a4, a5, (a6)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMIN_W_AQ_RL             amomin.w.aqrl	a5, a6, (a7)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAX_W_AQ_RL             amomax.w.aqrl	s7, s6, (s5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMINU_W_AQ_RL            amominu.w.aqrl	s6, s5, (s4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAXU_W_AQ_RL            amomaxu.w.aqrl	s5, s4, (s3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOSWAP_D                  amoswap.d	a4, ra, (s0)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOADD_D                   amoadd.d	a1, a2, (a3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOXOR_D                   amoxor.d	a2, a3, (a4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOAND_D                   amoand.d	a3, a4, (a5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOOR_D                    amoor.d	a4, a5, (a6)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMIN_D                   amomin.d	a5, a6, (a7)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAX_D                   amomax.d	s7, s6, (s5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMINU_D                  amominu.d	s6, s5, (s4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAXU_D                  amomaxu.d	s5, s4, (s3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOSWAP_D_AQ               amoswap.d.aq	a4, ra, (s0)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOADD_D_AQ                amoadd.d.aq	a1, a2, (a3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOXOR_D_AQ                amoxor.d.aq	a2, a3, (a4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOAND_D_AQ                amoand.d.aq	a3, a4, (a5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOOR_D_AQ                 amoor.d.aq	a4, a5, (a6)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMIN_D_AQ                amomin.d.aq	a5, a6, (a7)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAX_D_AQ                amomax.d.aq	s7, s6, (s5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMINU_D_AQ               amominu.d.aq	s6, s5, (s4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAXU_D_AQ               amomaxu.d.aq	s5, s4, (s3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOSWAP_D_RL               amoswap.d.rl	a4, ra, (s0)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOADD_D_RL                amoadd.d.rl	a1, a2, (a3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOXOR_D_RL                amoxor.d.rl	a2, a3, (a4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOAND_D_RL                amoand.d.rl	a3, a4, (a5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOOR_D_RL                 amoor.d.rl	a4, a5, (a6)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMIN_D_RL                amomin.d.rl	a5, a6, (a7)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAX_D_RL                amomax.d.rl	s7, s6, (s5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMINU_D_RL               amominu.d.rl	s6, s5, (s4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAXU_D_RL               amomaxu.d.rl	s5, s4, (s3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOSWAP_D_AQ_RL            amoswap.d.aqrl	a4, ra, (s0)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOADD_D_AQ_RL             amoadd.d.aqrl	a1, a2, (a3)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOXOR_D_AQ_RL             amoxor.d.aqrl	a2, a3, (a4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOAND_D_AQ_RL             amoand.d.aqrl	a3, a4, (a5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOOR_D_AQ_RL              amoor.d.aqrl	a4, a5, (a6)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMIN_D_AQ_RL             amomin.d.aqrl	a5, a6, (a7)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAX_D_AQ_RL             amomax.d.aqrl	s7, s6, (s5)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMINU_D_AQ_RL            amominu.d.aqrl	s6, s5, (s4)
-# CHECK-NEXT:  1      5     0.50    *      *             5     SMX60_LS                                   AMOMAXU_D_AQ_RL            amomaxu.d.aqrl	s5, s4, (s3)
+# CHECK-NEXT:  1      8     0.50    *                    8     SMX60_LS                                   LR_W                       lr.w	t0, (t1)
+# CHECK-NEXT:  1      8     0.50    *                    8     SMX60_LS                                   LR_W_AQ                    lr.w.aq	t1, (t2)
+# CHECK-NEXT:  1      8     0.50    *                    8     SMX60_LS                                   LR_W_RL                    lr.w.rl	t2, (t3)
+# CHECK-NEXT:  1      8     0.50    *                    8     SMX60_LS                                   LR_W_AQ_RL                 lr.w.aqrl	t3, (t4)
+# CHECK-NEXT:  1      8     0.50           *             8     SMX60_LS                                   SC_W                       sc.w	t6, t5, (t4)
+# CHECK-NEXT:  1      8     0.50           *             8     SMX60_LS                                   SC_W_AQ                    sc.w.aq	t5, t4, (t3)
+# CHECK-NEXT:  1      8     0.50           *             8     SMX60_LS                                   SC_W_RL                    sc.w.rl	t4, t3, (t2)
+# CHECK-NEXT:  1      8     0.50           *             8     SMX60_LS                                   SC_W_AQ_RL                 sc.w.aqrl	t3, t2, (t1)
+# CHECK-NEXT:  1      8     0.50    *                    8     SMX60_LS                                   LR_D                       lr.d	t0, (t1)
+# CHECK-NEXT:  1      8     0.50    *                    8     SMX60_LS                                   LR_D_AQ                    lr.d.aq	t1, (t2)
+# CHECK-NEXT:  1      8     0.50    *                    8     SMX60_LS                                   LR_D_RL                    lr.d.rl	t2, (t3)
+# CHECK-NEXT:  1      8     0.50    *                    8     SMX60_LS                                   LR_D_AQ_RL                 lr.d.aqrl	t3, (t4)
+# CHECK-NEXT:  1      8     0.50           *             8     SMX60_LS                                   SC_D                       sc.d	t6, t5, (t4)
+# CHECK-NEXT:  1      8     0.50           *             8     SMX60_LS                                   SC_D_AQ                    sc.d.aq	t5, t4, (t3)
+# CHECK-NEXT:  1      8     0.50           *             8     SMX60_LS                                   SC_D_RL                    sc.d.rl	t4, t3, (t2)
+# CHECK-NEXT:  1      8     0.50           *             8     SMX60_LS                                   SC_D_AQ_RL                 sc.d.aqrl	t3, t2, (t1)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOSWAP_W                  amoswap.w	a4, ra, (s0)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOADD_W                   amoadd.w	a1, a2, (a3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOXOR_W                   amoxor.w	a2, a3, (a4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOAND_W                   amoand.w	a3, a4, (a5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOOR_W                    amoor.w	a4, a5, (a6)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMIN_W                   amomin.w	a5, a6, (a7)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAX_W                   amomax.w	s7, s6, (s5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMINU_W                  amominu.w	s6, s5, (s4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAXU_W                  amomaxu.w	s5, s4, (s3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOSWAP_W_AQ               amoswap.w.aq	a4, ra, (s0)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOADD_W_AQ                amoadd.w.aq	a1, a2, (a3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOXOR_W_AQ                amoxor.w.aq	a2, a3, (a4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOAND_W_AQ                amoand.w.aq	a3, a4, (a5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOOR_W_AQ                 amoor.w.aq	a4, a5, (a6)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMIN_W_AQ                amomin.w.aq	a5, a6, (a7)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAX_W_AQ                amomax.w.aq	s7, s6, (s5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMINU_W_AQ               amominu.w.aq	s6, s5, (s4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAXU_W_AQ               amomaxu.w.aq	s5, s4, (s3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOSWAP_W_RL               amoswap.w.rl	a4, ra, (s0)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOADD_W_RL                amoadd.w.rl	a1, a2, (a3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOXOR_W_RL                amoxor.w.rl	a2, a3, (a4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOAND_W_RL                amoand.w.rl	a3, a4, (a5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOOR_W_RL                 amoor.w.rl	a4, a5, (a6)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMIN_W_RL                amomin.w.rl	a5, a6, (a7)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAX_W_RL                amomax.w.rl	s7, s6, (s5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMINU_W_RL               amominu.w.rl	s6, s5, (s4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAXU_W_RL               amomaxu.w.rl	s5, s4, (s3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOSWAP_W_AQ_RL            amoswap.w.aqrl	a4, ra, (s0)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOADD_W_AQ_RL             amoadd.w.aqrl	a1, a2, (a3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOXOR_W_AQ_RL             amoxor.w.aqrl	a2, a3, (a4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOAND_W_AQ_RL             amoand.w.aqrl	a3, a4, (a5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOOR_W_AQ_RL              amoor.w.aqrl	a4, a5, (a6)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMIN_W_AQ_RL             amomin.w.aqrl	a5, a6, (a7)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAX_W_AQ_RL             amomax.w.aqrl	s7, s6, (s5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMINU_W_AQ_RL            amominu.w.aqrl	s6, s5, (s4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAXU_W_AQ_RL            amomaxu.w.aqrl	s5, s4, (s3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOSWAP_D                  amoswap.d	a4, ra, (s0)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOADD_D                   amoadd.d	a1, a2, (a3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOXOR_D                   amoxor.d	a2, a3, (a4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOAND_D                   amoand.d	a3, a4, (a5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOOR_D                    amoor.d	a4, a5, (a6)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMIN_D                   amomin.d	a5, a6, (a7)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAX_D                   amomax.d	s7, s6, (s5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMINU_D                  amominu.d	s6, s5, (s4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAXU_D                  amomaxu.d	s5, s4, (s3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOSWAP_D_AQ               amoswap.d.aq	a4, ra, (s0)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOADD_D_AQ                amoadd.d.aq	a1, a2, (a3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOXOR_D_AQ                amoxor.d.aq	a2, a3, (a4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOAND_D_AQ                amoand.d.aq	a3, a4, (a5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOOR_D_AQ                 amoor.d.aq	a4, a5, (a6)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMIN_D_AQ                amomin.d.aq	a5, a6, (a7)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAX_D_AQ                amomax.d.aq	s7, s6, (s5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMINU_D_AQ               amominu.d.aq	s6, s5, (s4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAXU_D_AQ               amomaxu.d.aq	s5, s4, (s3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOSWAP_D_RL               amoswap.d.rl	a4, ra, (s0)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOADD_D_RL                amoadd.d.rl	a1, a2, (a3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOXOR_D_RL                amoxor.d.rl	a2, a3, (a4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOAND_D_RL                amoand.d.rl	a3, a4, (a5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOOR_D_RL                 amoor.d.rl	a4, a5, (a6)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMIN_D_RL                amomin.d.rl	a5, a6, (a7)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAX_D_RL                amomax.d.rl	s7, s6, (s5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMINU_D_RL               amominu.d.rl	s6, s5, (s4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAXU_D_RL               amomaxu.d.rl	s5, s4, (s3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOSWAP_D_AQ_RL            amoswap.d.aqrl	a4, ra, (s0)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOADD_D_AQ_RL             amoadd.d.aqrl	a1, a2, (a3)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOXOR_D_AQ_RL             amoxor.d.aqrl	a2, a3, (a4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOAND_D_AQ_RL             amoand.d.aqrl	a3, a4, (a5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOOR_D_AQ_RL              amoor.d.aqrl	a4, a5, (a6)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMIN_D_AQ_RL             amomin.d.aqrl	a5, a6, (a7)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAX_D_AQ_RL             amomax.d.aqrl	s7, s6, (s5)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMINU_D_AQ_RL            amominu.d.aqrl	s6, s5, (s4)
+# CHECK-NEXT:  1      12    0.50    *      *             12    SMX60_LS                                   AMOMAXU_D_AQ_RL            amomaxu.d.aqrl	s5, s4, (s3)
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - SMX60_FP
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s
index bd3666ef7bb9..b86fcbccbeab 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s
@@ -148,12 +148,12 @@ fclass.d a3, ft10
 # CHECK-NEXT: [9]: LLVM Opcode Name
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   FLH                        flh	ft0, 0(a0)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   FSH                        fsh	ft0, 0(a0)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   FLW                        flw	ft0, 0(a0)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   FSW                        fsw	ft0, 0(a0)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   FLD                        fld	ft0, 0(a0)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   FSD                        fsd	ft0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   FLH                        flh	ft0, 0(a0)
+# CHECK-NEXT:  1      4     0.50           *             4     SMX60_LS                                   FSH                        fsh	ft0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   FLW                        flw	ft0, 0(a0)
+# CHECK-NEXT:  1      4     0.50           *             4     SMX60_LS                                   FSW                        fsw	ft0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   FLD                        fld	ft0, 0(a0)
+# CHECK-NEXT:  1      4     0.50           *             4     SMX60_LS                                   FSD                        fsd	ft0, 0(a0)
 # CHECK-NEXT:  1      4     1.00                         4     SMX60_FP                                   FADD_H                     fadd.h	fs10, fs11, ft8
 # CHECK-NEXT:  1      4     1.00                         4     SMX60_FP                                   FSUB_H                     fsub.h	ft9, ft10, ft11
 # CHECK-NEXT:  1      4     1.00                         4     SMX60_FP                                   FMUL_H                     fmul.h	ft0, ft1, ft2
diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s
index 8b43874499f2..b72540f29f48 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s
@@ -222,17 +222,17 @@ bseti a0, a1, 1
 # CHECK-NEXT:  1      1     1.00                         1     SMX60_IEU,SMX60_IEUA                       BGE                        bge	a0, a0, .Ltmp5
 # CHECK-NEXT:  1      1     1.00                         1     SMX60_IEU,SMX60_IEUA                       BGEU                       bgeu	a0, a0, .Ltmp6
 # CHECK-NEXT:  1      1     0.50                         1     SMX60_IEU                                  C_ADD                      add	a0, a0, a0
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LB                         lb	t0, 0(a0)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LBU                        lbu	t0, 0(a0)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LH                         lh	t0, 0(a0)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LHU                        lhu	t0, 0(a0)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LW                         lw	t0, 0(a0)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LWU                        lwu	t0, 0(a0)
-# CHECK-NEXT:  1      5     0.50    *                    5     SMX60_LS                                   LD                         ld	t0, 0(a0)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SB                         sb	t0, 0(a0)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SH                         sh	t0, 0(a0)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SW                         sw	t0, 0(a0)
-# CHECK-NEXT:  1      3     0.50           *             3     SMX60_LS                                   SD                         sd	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   LB                         lb	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   LBU                        lbu	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   LH                         lh	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   LHU                        lhu	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   LW                         lw	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   LWU                        lwu	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50    *                    4     SMX60_LS                                   LD                         ld	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50           *             4     SMX60_LS                                   SB                         sb	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50           *             4     SMX60_LS                                   SH                         sh	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50           *             4     SMX60_LS                                   SW                         sw	t0, 0(a0)
+# CHECK-NEXT:  1      4     0.50           *             4     SMX60_LS                                   SD                         sd	t0, 0(a0)
 # CHECK-NEXT:  1      6     0.50                         6     SMX60_IEU                                  MUL                        mul	a0, a0, a0
 # CHECK-NEXT:  1      6     0.50                         6     SMX60_IEU                                  MULH                       mulh	a0, a0, a0
 # CHECK-NEXT:  1      6     0.50                         6     SMX60_IEU                                  MULHU                      mulhu	a0, a0, a0

From fdb572681de7e12b7cd03fd33478022272eb1bae Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Thu, 19 Jun 2025 15:50:53 +0100
Subject: [PATCH 0949/1322] Fix bazel build after #144594 (#144904)

---
 .../llvm-project-overlay/llvm/BUILD.bazel     | 41 +++++++++++++++++--
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 31855cd5444c..d5a7fdb747f8 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -418,6 +418,7 @@ cc_library(
         "include/llvm/BinaryFormat/ELFRelocs/*.def",
     ]),
     deps = [
+        ":PPCTargetParser",
         ":Support",
         ":TargetParser",
     ],
@@ -1364,6 +1365,13 @@ td_library(
     ]),
 )
 
+td_library(
+    name = "PPCTargetTdFiles",
+    srcs = glob([
+        "lib/Target/PowerPC/**/*.td",
+    ]),
+)
+
 gentbl_cc_library(
     name = "RISCVTargetParserDefGen",
     tbl_outs = {"include/llvm/TargetParser/RISCVTargetParserDef.inc": ["-gen-riscv-target-def"]},
@@ -1375,11 +1383,38 @@ gentbl_cc_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "PPCGenTargetFeaturesGen",
+    tbl_outs = {"include/llvm/TargetParser/PPCGenTargetFeatures.inc": ["-gen-target-features"]},
+    tblgen = ":llvm-tblgen",
+    td_file = "lib/Target/PowerPC/PPC.td",
+    deps = [
+        ":CommonTargetTdFiles",
+        ":PPCTargetTdFiles",
+    ],
+)
+
+cc_library(
+    name = "PPCTargetParser",
+    srcs = ["lib/TargetParser/PPCTargetParser.cpp"],
+    hdrs = ["include/llvm/TargetParser/PPCTargetParser.h"],
+    copts = llvm_copts,
+    includes = ["include"],
+    deps = [
+        ":PPCGenTargetFeaturesGen",
+        ":Support",
+        ":TargetParser",
+    ],
+)
+
 cc_library(
     name = "TargetParser",
-    srcs = glob([
-        "lib/TargetParser/*.cpp",
-    ]) + select({
+    srcs = glob(
+        [
+            "lib/TargetParser/*.cpp",
+        ],
+        exclude = ["lib/TargetParser/PPCTargetParser.cpp"],
+    ) + select({
         "@platforms//os:windows": glob([
             "lib/TargetParser/Windows/*.inc",
         ]),

From 5eb24fde11cd82a08f208509f80f428da90c89c9 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Thu, 19 Jun 2025 08:06:07 -0700
Subject: [PATCH 0950/1322] [SelectionDAG][RISCV] Preserve nneg flag when
 folding (trunc (zext X))->(zext X). (#144807)

If X is known non-negative, that's still true if we fold the truncate
to create a smaller zext.

In the i128 tests, SelectionDAGBuilder aggressively truncates the
`zext nneg` to i64 to match `getShiftAmountTy`. If we don't preserve
the `nneg` we can't see that the shift amount argument being `signext`
means we don't need to do any extension
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   8 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   8 +-
 llvm/test/CodeGen/RISCV/shifts.ll             | 295 ++++++++++++++++++
 3 files changed, 307 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0e078f9dd88b..a6b9cc81edde 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15740,8 +15740,12 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       N0.getOpcode() == ISD::SIGN_EXTEND ||
       N0.getOpcode() == ISD::ANY_EXTEND) {
     // if the source is smaller than the dest, we still need an extend.
-    if (N0.getOperand(0).getValueType().bitsLT(VT))
-      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
+    if (N0.getOperand(0).getValueType().bitsLT(VT)) {
+      SDNodeFlags Flags;
+      if (N0.getOpcode() == ISD::ZERO_EXTEND)
+        Flags.setNonNeg(N0->getFlags().hasNonNeg());
+      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), Flags);
+    }
     // if the source is larger than the dest, than we just need the truncate.
     if (N0.getOperand(0).getValueType().bitsGT(VT))
       return DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b0e3f534e2aa..5d8db8be9731 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6474,8 +6474,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
         OpOpcode == ISD::ANY_EXTEND) {
       // If the source is smaller than the dest, we still need an extend.
       if (N1.getOperand(0).getValueType().getScalarType().bitsLT(
-              VT.getScalarType()))
-        return getNode(OpOpcode, DL, VT, N1.getOperand(0));
+              VT.getScalarType())) {
+        SDNodeFlags Flags;
+        if (OpOpcode == ISD::ZERO_EXTEND)
+          Flags.setNonNeg(N1->getFlags().hasNonNeg());
+        return getNode(OpOpcode, DL, VT, N1.getOperand(0), Flags);
+      }
       if (N1.getOperand(0).getValueType().bitsGT(VT))
         return getNode(ISD::TRUNCATE, DL, VT, N1.getOperand(0));
       return N1.getOperand(0);
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 249dabba0cc2..32a037918a5a 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -484,3 +484,298 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
   %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %b)
   ret i128 %res
 }
+
+define i64 @lshr64_shamt32(i64 %a, i32 signext %b) nounwind {
+; RV32I-LABEL: lshr64_shamt32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi a4, a2, -32
+; RV32I-NEXT:    srl a3, a1, a2
+; RV32I-NEXT:    bltz a4, .LBB11_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    j .LBB11_3
+; RV32I-NEXT:  .LBB11_2:
+; RV32I-NEXT:    srl a0, a0, a2
+; RV32I-NEXT:    not a2, a2
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    sll a1, a1, a2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:  .LBB11_3:
+; RV32I-NEXT:    srai a1, a4, 31
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: lshr64_shamt32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    ret
+  %zext = zext nneg i32 %b to i64
+  %1 = lshr i64 %a, %zext
+  ret i64 %1
+}
+
+define i64 @ashr64_shamt32(i64 %a, i32 signext %b) nounwind {
+; RV32I-LABEL: ashr64_shamt32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    addi a4, a2, -32
+; RV32I-NEXT:    sra a1, a1, a2
+; RV32I-NEXT:    bltz a4, .LBB12_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    mv a0, a1
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB12_2:
+; RV32I-NEXT:    srl a0, a0, a2
+; RV32I-NEXT:    not a2, a2
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    sll a2, a3, a2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ashr64_shamt32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    ret
+  %zext = zext nneg i32 %b to i64
+  %1 = ashr i64 %a, %zext
+  ret i64 %1
+}
+
+define i64 @shl64_shamt32(i64 %a, i32 signext %b) nounwind {
+; RV32I-LABEL: shl64_shamt32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi a4, a2, -32
+; RV32I-NEXT:    sll a3, a0, a2
+; RV32I-NEXT:    bltz a4, .LBB13_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    j .LBB13_3
+; RV32I-NEXT:  .LBB13_2:
+; RV32I-NEXT:    sll a1, a1, a2
+; RV32I-NEXT:    not a2, a2
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a0, a0, a2
+; RV32I-NEXT:    or a1, a1, a0
+; RV32I-NEXT:  .LBB13_3:
+; RV32I-NEXT:    srai a0, a4, 31
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: shl64_shamt32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    ret
+  %zext = zext nneg i32 %b to i64
+  %1 = shl i64 %a, %zext
+  ret i64 %1
+}
+
+define i128 @lshr128_shamt32(i128 %a, i32 signext %b) nounwind {
+; RV32I-LABEL: lshr128_shamt32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    srli a6, a2, 3
+; RV32I-NEXT:    mv a7, sp
+; RV32I-NEXT:    andi t0, a2, 31
+; RV32I-NEXT:    andi a6, a6, 12
+; RV32I-NEXT:    xori t0, t0, 31
+; RV32I-NEXT:    add a6, a7, a6
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
+; RV32I-NEXT:    lw a1, 0(a6)
+; RV32I-NEXT:    lw a3, 4(a6)
+; RV32I-NEXT:    lw a4, 8(a6)
+; RV32I-NEXT:    lw a5, 12(a6)
+; RV32I-NEXT:    srl a1, a1, a2
+; RV32I-NEXT:    slli a6, a3, 1
+; RV32I-NEXT:    srl a3, a3, a2
+; RV32I-NEXT:    slli a7, a4, 1
+; RV32I-NEXT:    srl a4, a4, a2
+; RV32I-NEXT:    srl a2, a5, a2
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    sll a6, a6, t0
+; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    sll a5, a5, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a3, a7
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a3, 4(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: lshr128_shamt32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a4, a2, -64
+; RV64I-NEXT:    srl a3, a1, a2
+; RV64I-NEXT:    bltz a4, .LBB14_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a3
+; RV64I-NEXT:    j .LBB14_3
+; RV64I-NEXT:  .LBB14_2:
+; RV64I-NEXT:    srl a0, a0, a2
+; RV64I-NEXT:    not a2, a2
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    sll a1, a1, a2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:  .LBB14_3:
+; RV64I-NEXT:    srai a1, a4, 63
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    ret
+  %zext = zext nneg i32 %b to i128
+  %1 = lshr i128 %a, %zext
+  ret i128 %1
+}
+
+define i128 @ashr128_shamt32(i128 %a, i32 signext %b) nounwind {
+; RV32I-LABEL: ashr128_shamt32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    srli a6, a2, 3
+; RV32I-NEXT:    mv a7, sp
+; RV32I-NEXT:    andi t0, a2, 31
+; RV32I-NEXT:    andi a6, a6, 12
+; RV32I-NEXT:    xori t0, t0, 31
+; RV32I-NEXT:    add a6, a7, a6
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
+; RV32I-NEXT:    srai a1, a1, 31
+; RV32I-NEXT:    sw a1, 16(sp)
+; RV32I-NEXT:    sw a1, 20(sp)
+; RV32I-NEXT:    sw a1, 24(sp)
+; RV32I-NEXT:    sw a1, 28(sp)
+; RV32I-NEXT:    lw a1, 0(a6)
+; RV32I-NEXT:    lw a3, 4(a6)
+; RV32I-NEXT:    lw a4, 8(a6)
+; RV32I-NEXT:    lw a5, 12(a6)
+; RV32I-NEXT:    srl a1, a1, a2
+; RV32I-NEXT:    slli a6, a3, 1
+; RV32I-NEXT:    srl a3, a3, a2
+; RV32I-NEXT:    slli a7, a4, 1
+; RV32I-NEXT:    srl a4, a4, a2
+; RV32I-NEXT:    sra a2, a5, a2
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    sll a6, a6, t0
+; RV32I-NEXT:    sll a7, a7, t0
+; RV32I-NEXT:    sll a5, a5, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a3, a7
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a3, 4(a0)
+; RV32I-NEXT:    sw a4, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ashr128_shamt32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    mv a3, a1
+; RV64I-NEXT:    addi a4, a2, -64
+; RV64I-NEXT:    sra a1, a1, a2
+; RV64I-NEXT:    bltz a4, .LBB15_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    srai a3, a3, 63
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB15_2:
+; RV64I-NEXT:    srl a0, a0, a2
+; RV64I-NEXT:    not a2, a2
+; RV64I-NEXT:    slli a3, a3, 1
+; RV64I-NEXT:    sll a2, a3, a2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    ret
+  %zext = zext nneg i32 %b to i128
+  %1 = ashr i128 %a, %zext
+  ret i128 %1
+}
+
+define i128 @shl128_shamt32(i128 %a, i32 signext %b) nounwind {
+; RV32I-LABEL: shl128_shamt32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    srli a6, a2, 3
+; RV32I-NEXT:    addi a7, sp, 16
+; RV32I-NEXT:    andi t0, a2, 31
+; RV32I-NEXT:    andi a6, a6, 12
+; RV32I-NEXT:    sub a6, a7, a6
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a1, 28(sp)
+; RV32I-NEXT:    lw a1, 0(a6)
+; RV32I-NEXT:    lw a3, 4(a6)
+; RV32I-NEXT:    lw a4, 8(a6)
+; RV32I-NEXT:    lw a5, 12(a6)
+; RV32I-NEXT:    xori a6, t0, 31
+; RV32I-NEXT:    sll a7, a3, a2
+; RV32I-NEXT:    srli t0, a1, 1
+; RV32I-NEXT:    sll a5, a5, a2
+; RV32I-NEXT:    sll a1, a1, a2
+; RV32I-NEXT:    sll a2, a4, a2
+; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    srli a4, a4, 1
+; RV32I-NEXT:    srl t0, t0, a6
+; RV32I-NEXT:    srl a3, a3, a6
+; RV32I-NEXT:    srl a4, a4, a6
+; RV32I-NEXT:    or a6, a7, t0
+; RV32I-NEXT:    or a2, a2, a3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a6, 4(a0)
+; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: shl128_shamt32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi a4, a2, -64
+; RV64I-NEXT:    sll a3, a0, a2
+; RV64I-NEXT:    bltz a4, .LBB16_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    j .LBB16_3
+; RV64I-NEXT:  .LBB16_2:
+; RV64I-NEXT:    sll a1, a1, a2
+; RV64I-NEXT:    not a2, a2
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    srl a0, a0, a2
+; RV64I-NEXT:    or a1, a1, a0
+; RV64I-NEXT:  .LBB16_3:
+; RV64I-NEXT:    srai a0, a4, 63
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    ret
+  %zext = zext nneg i32 %b to i128
+  %1 = shl i128 %a, %zext
+  ret i128 %1
+}

From 3de01d07c33c10dfefc753c87c0a926fd512425b Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Thu, 19 Jun 2025 16:16:03 +0100
Subject: [PATCH 0951/1322] Fix bazel build after #144594, mark variable as
 potentially unused (#144910)

---
 llvm/lib/TargetParser/TargetParser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 03f7d3899c2e..90791dfb7b7c 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -56,7 +56,7 @@ std::optional<llvm::StringMap<bool>> llvm::getCPUDefaultTargetFeatures(
   llvm::StringMap<bool> DefaultFeatures;
   setImpliedBits(Bits, CPUEntry->Implies.getAsBitset(), ProcFeatures);
 
-  unsigned BitSize = Bits.size();
+  [[maybe_unused]] unsigned BitSize = Bits.size();
   for (const BasicSubtargetFeatureKV &FE : ProcFeatures) {
     assert(FE.Value < BitSize && "Target Feature is out of range");
     if (Bits[FE.Value])

From 36af7345dfb8e84a1f2971db34089b63321e8467 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Thu, 15 May 2025 11:43:49 +0100
Subject: [PATCH 0952/1322] Reapply "[Clang] Enable -fextend-variable-liveness
 at -Og (#118026)"

Relands this feature after several fixes:

* Force fake uses to be emitted before musttail calls (#136867)
* Added soften-float legalization for fake uses (#142714)
* Treat fake uses as size-less instructions in a SystemZ assert (#144390)

If further issues with fake uses are found then this may be reverted again,
but all currently-known issues are resolved.

This reverts commit 2dc6e98169baeb1f73036da0ea50fd828d8323d0.
---
 clang/docs/CommandGuide/clang.rst            | 7 +++++--
 clang/docs/ReleaseNotes.rst                  | 4 ++++
 clang/lib/Driver/ToolChains/Clang.cpp        | 8 +++++++-
 clang/test/Driver/extend-variable-liveness.c | 3 ++-
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/clang/docs/CommandGuide/clang.rst b/clang/docs/CommandGuide/clang.rst
index 1b8776c5e9ad..7d49f2cc28a1 100644
--- a/clang/docs/CommandGuide/clang.rst
+++ b/clang/docs/CommandGuide/clang.rst
@@ -460,8 +460,11 @@ Code Generation Options
     :option:`-Oz` Like :option:`-Os` (and thus :option:`-O2`), but reduces code
     size further.
 
-    :option:`-Og` Like :option:`-O1`. In future versions, this option might
-    disable different optimizations in order to improve debuggability.
+    :option:`-Og` Similar to :option:`-O1`, but with slightly reduced
+    optimization and better variable visibility. The same optimizations are run
+    as at :option:`-O1`, but the ``-fextend-variable-liveness`` flag is
+    also set, which tries to prevent optimizations from reducing the liveness of
+    user variables, improving their availability when debugging.
 
     :option:`-O` Equivalent to :option:`-O1`.
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index dd748ab06873..96477ef6ddc9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -366,6 +366,10 @@ Modified Compiler Flags
 
 - The ``-fveclib=libmvec`` option now supports AArch64 targets (requires GLIBC 2.40 or newer).
 
+- The ``-Og`` optimization flag now sets ``-fextend-variable-liveness``,
+  reducing performance slightly while reducing the number of optimized-out
+  variables. (#GH118026)
+
 Removed Compiler Flags
 -------------------------
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a78a1c897818..5a3c09e3a343 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7555,7 +7555,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (Args.hasArg(options::OPT_fretain_comments_from_system_headers))
     CmdArgs.push_back("-fretain-comments-from-system-headers");
 
-  Args.AddLastArg(CmdArgs, options::OPT_fextend_variable_liveness_EQ);
+  if (Arg *A = Args.getLastArg(options::OPT_fextend_variable_liveness_EQ)) {
+    A->render(Args, CmdArgs);
+  } else if (Arg *A = Args.getLastArg(options::OPT_O_Group);
+             A && A->containsValue("g")) {
+    // Set -fextend-variable-liveness=all by default at -Og.
+    CmdArgs.push_back("-fextend-variable-liveness=all");
+  }
 
   // Forward -fcomment-block-commands to -cc1.
   Args.AddAllArgs(CmdArgs, options::OPT_fcomment_block_commands);
diff --git a/clang/test/Driver/extend-variable-liveness.c b/clang/test/Driver/extend-variable-liveness.c
index bbfb2ece6f29..99a5409cecce 100644
--- a/clang/test/Driver/extend-variable-liveness.c
+++ b/clang/test/Driver/extend-variable-liveness.c
@@ -1,7 +1,8 @@
 // Tests that -fextend-variable-liveness and its aliases are correctly passed
-// by the driver.
+// by the driver, and are set by default at -Og.
 
 // RUN: %clang -### -c %s 2>&1 | FileCheck %s --check-prefixes=CHECK,DEFAULT
+// RUN: %clang -### -Og -c %s 2>&1 | FileCheck %s --check-prefixes=CHECK,ALL
 // RUN: %clang -fextend-variable-liveness=none -### -c %s 2>&1 | FileCheck %s --check-prefixes=CHECK,NONE
 // RUN: %clang -fextend-variable-liveness=this -### -c %s 2>&1 | FileCheck %s --check-prefixes=CHECK,THIS
 // RUN: %clang -fextend-variable-liveness=all -### -c %s 2>&1 | FileCheck %s --check-prefixes=CHECK,ALL

From f4db14229cd975822c41376afda9d56a29f9396c Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 19 Jun 2025 17:26:54 +0200
Subject: [PATCH 0953/1322] [SCCP] Move logic for removing ssa.copy into Solver
 (NFC)

So it can be reused between IPSCCP and SCCP.

Make the implementation a bit more efficient by only lookup the
PredicateInfo once.
---
 .../llvm/Transforms/Utils/SCCPSolver.h        |  2 ++
 llvm/lib/Transforms/IPO/SCCP.cpp              | 14 +----------
 llvm/lib/Transforms/Utils/SCCPSolver.cpp      | 24 +++++++++++++++++++
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
index c8fe4301b2b4..5aac7c2ac5d3 100644
--- a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
+++ b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h
@@ -77,6 +77,8 @@ public:
   LLVM_ABI void addPredicateInfo(Function &F, DominatorTree &DT,
                                  AssumptionCache &AC);
 
+  LLVM_ABI void removeSSACopies(Function &F);
+
   /// markBlockExecutable - This method can be used by clients to mark all of
   /// the blocks that are known to be intrinsically live in the processed unit.
   /// This returns true if the block was not considered live before.
diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp
index 43c5df357500..d50de34dfa48 100644
--- a/llvm/lib/Transforms/IPO/SCCP.cpp
+++ b/llvm/lib/Transforms/IPO/SCCP.cpp
@@ -250,19 +250,7 @@ static bool runIPSCCP(
       if (!DeadBB->hasAddressTaken())
         DTU.deleteBB(DeadBB);
 
-    for (BasicBlock &BB : F) {
-      for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
-        if (Solver.getPredicateInfoFor(&Inst)) {
-          if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
-            if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
-              Value *Op = II->getOperand(0);
-              Inst.replaceAllUsesWith(Op);
-              Inst.eraseFromParent();
-            }
-          }
-        }
-      }
-    }
+    Solver.removeSSACopies(F);
   }
 
   // If we inferred constant or undef return values for a function, we replaced
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index f4b378b82dae..cc0bb4735c23 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -764,6 +764,26 @@ public:
     FnPredicateInfo.insert({&F, std::make_unique<PredicateInfo>(F, DT, AC)});
   }
 
+  void removeSSACopies(Function &F) {
+    auto It = FnPredicateInfo.find(&F);
+    if (It == FnPredicateInfo.end())
+      return;
+
+    for (BasicBlock &BB : F) {
+      for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
+        if (It->second->getPredicateInfoFor(&Inst)) {
+          if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
+            if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
+              Value *Op = II->getOperand(0);
+              Inst.replaceAllUsesWith(Op);
+              Inst.eraseFromParent();
+            }
+          }
+        }
+      }
+    }
+  }
+
   void visitCallInst(CallInst &I) { visitCallBase(I); }
 
   bool markBlockExecutable(BasicBlock *BB);
@@ -2168,6 +2188,10 @@ void SCCPSolver::addPredicateInfo(Function &F, DominatorTree &DT,
   Visitor->addPredicateInfo(F, DT, AC);
 }
 
+void SCCPSolver::removeSSACopies(Function &F) {
+  Visitor->removeSSACopies(F);
+}
+
 bool SCCPSolver::markBlockExecutable(BasicBlock *BB) {
   return Visitor->markBlockExecutable(BB);
 }

From 01d648a42939c834b6b45677e540882222b01c11 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Thu, 19 Jun 2025 11:52:55 -0400
Subject: [PATCH 0954/1322] [HLSL][SPIRV] Reapply "[HLSL][SPIRV] Add
 vk::constant_id attribute." (#144902)

- **Reapply "[HLSL][SPIRV] Add vk::constant_id attribute." (#144812)**
- **Fix memory leak.**
---
 clang/include/clang/Basic/Attr.td             |   8 +
 clang/include/clang/Basic/AttrDocs.td         |  15 ++
 clang/include/clang/Basic/Builtins.td         |  13 ++
 .../clang/Basic/DiagnosticSemaKinds.td        |   4 +
 clang/include/clang/Sema/SemaHLSL.h           |   5 +-
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          |  75 +++++++
 clang/lib/CodeGen/CodeGenFunction.h           |   6 +
 clang/lib/Sema/SemaDecl.cpp                   |  13 ++
 clang/lib/Sema/SemaDeclAttr.cpp               |   3 +
 clang/lib/Sema/SemaHLSL.cpp                   | 120 +++++++++-
 .../test/AST/HLSL/vk.spec-constant.usage.hlsl | 130 +++++++++++
 .../SpirvType.alignment.hlsl                  |   0
 .../SpirvType.hlsl                            |   0
 .../vk-features/vk.spec-constant.hlsl         | 210 ++++++++++++++++++
 .../test/SemaHLSL/vk.spec-constant.error.hlsl |  37 +++
 15 files changed, 637 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
 rename clang/test/CodeGenHLSL/{inline-spirv => vk-features}/SpirvType.alignment.hlsl (100%)
 rename clang/test/CodeGenHLSL/{inline-spirv => vk-features}/SpirvType.hlsl (100%)
 create mode 100644 clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
 create mode 100644 clang/test/SemaHLSL/vk.spec-constant.error.hlsl

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index f113cd2ba2fb..27fea7dea0a5 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -5023,6 +5023,14 @@ def HLSLVkExtBuiltinInput : InheritableAttr {
   let Documentation = [HLSLVkExtBuiltinInputDocs];
 }
 
+def HLSLVkConstantId : InheritableAttr {
+  let Spellings = [CXX11<"vk", "constant_id">];
+  let Args = [IntArgument<"Id">];
+  let Subjects = SubjectList<[ExternalGlobalVar]>;
+  let LangOpts = [HLSL];
+  let Documentation = [VkConstantIdDocs];
+}
+
 def RandomizeLayout : InheritableAttr {
   let Spellings = [GCC<"randomize_layout">];
   let Subjects = SubjectList<[Record]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 6051e1fc4511..43442f177ab7 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -8252,6 +8252,21 @@ and https://microsoft.github.io/hlsl-specs/proposals/0013-wave-size-range.html
   }];
 }
 
+def VkConstantIdDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``vk::constant_id`` attribute specifies the id for a SPIR-V specialization
+constant. The attribute applies to const global scalar variables. The variable must be initialized with a C++11 constexpr.
+In SPIR-V, the
+variable will be replaced with an `OpSpecConstant` with the given id.
+The syntax is:
+
+.. code-block:: text
+
+  ``[[vk::constant_id(<Id>)]] const T Name = <Init>``
+}];
+}
+
 def RootSignatureDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 68cd3d790e78..d65b3a5d2f44 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -5065,6 +5065,19 @@ def HLSLGroupMemoryBarrierWithGroupSync: LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void()";
 }
 
+class HLSLScalarTemplate
+    : Template<["bool", "char", "short", "int", "long long int",
+                "unsigned short", "unsigned int", "unsigned long long int",
+                "__fp16", "float", "double"],
+               ["_bool", "_char", "_short", "_int", "_longlong", "_ushort",
+                "_uint", "_ulonglong", "_half", "_float", "_double"]>;
+
+def HLSLGetSpirvSpecConstant : LangBuiltin<"HLSL_LANG">, HLSLScalarTemplate {
+  let Spellings = ["__builtin_get_spirv_spec_constant"];
+  let Attributes = [NoThrow, Const, Pure];
+  let Prototype = "T(unsigned int, T)";
+}
+
 // Builtins for XRay.
 def XRayCustomEvent : Builtin {
   let Spellings = ["__xray_customevent"];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 979ff60b73b7..34b798a09c21 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12927,6 +12927,10 @@ def err_spirv_enum_not_int : Error<
 def err_spirv_enum_not_valid : Error<
    "invalid value for %select{storage class}0 argument">;
 
+def err_specialization_const
+    : Error<"variable with 'vk::constant_id' attribute must be a const "
+            "int/float/enum/bool and be initialized with a literal">;
+
 // errors of expect.with.probability
 def err_probability_not_constant_float : Error<
    "probability argument to __builtin_expect_with_probability must be constant "
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index 33c4b8d1568b..97091792ba23 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -98,6 +98,8 @@ public:
   HLSLWaveSizeAttr *mergeWaveSizeAttr(Decl *D, const AttributeCommonInfo &AL,
                                       int Min, int Max, int Preferred,
                                       int SpelledArgsCount);
+  HLSLVkConstantIdAttr *
+  mergeVkConstantIdAttr(Decl *D, const AttributeCommonInfo &AL, int Id);
   HLSLShaderAttr *mergeShaderAttr(Decl *D, const AttributeCommonInfo &AL,
                                   llvm::Triple::EnvironmentType ShaderType);
   HLSLParamModifierAttr *
@@ -135,6 +137,7 @@ public:
   void handleRootSignatureAttr(Decl *D, const ParsedAttr &AL);
   void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL);
   void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL);
+  void handleVkConstantIdAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_GroupThreadIDAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_GroupIDAttr(Decl *D, const ParsedAttr &AL);
@@ -171,7 +174,7 @@ public:
   QualType getInoutParameterType(QualType Ty);
 
   bool transformInitList(const InitializedEntity &Entity, InitListExpr *Init);
-
+  bool handleInitialization(VarDecl *VDecl, Expr *&Init);
   void deduceAddressSpace(VarDecl *Decl);
 
 private:
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index ccf45c0c6ff1..2a60a0909c93 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -12,6 +12,7 @@
 
 #include "CGBuiltin.h"
 #include "CGHLSLRuntime.h"
+#include "CodeGenFunction.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -214,6 +215,44 @@ static Intrinsic::ID getWaveActiveMaxIntrinsic(llvm::Triple::ArchType Arch,
   }
 }
 
+// Returns the mangled name for a builtin function that the SPIR-V backend
+// will expand into a spec Constant.
+static std::string getSpecConstantFunctionName(clang::QualType SpecConstantType,
+                                               ASTContext &Context) {
+  // The parameter types for our conceptual intrinsic function.
+  QualType ClangParamTypes[] = {Context.IntTy, SpecConstantType};
+
+  // Create a temporary FunctionDecl for the builtin fuction. It won't be
+  // added to the AST.
+  FunctionProtoType::ExtProtoInfo EPI;
+  QualType FnType =
+      Context.getFunctionType(SpecConstantType, ClangParamTypes, EPI);
+  DeclarationName FuncName = &Context.Idents.get("__spirv_SpecConstant");
+  FunctionDecl *FnDeclForMangling = FunctionDecl::Create(
+      Context, Context.getTranslationUnitDecl(), SourceLocation(),
+      SourceLocation(), FuncName, FnType, /*TSI=*/nullptr, SC_Extern);
+
+  // Attach the created parameter declarations to the function declaration.
+  SmallVector<ParmVarDecl *, 2> ParamDecls;
+  for (QualType ParamType : ClangParamTypes) {
+    ParmVarDecl *PD = ParmVarDecl::Create(
+        Context, FnDeclForMangling, SourceLocation(), SourceLocation(),
+        /*IdentifierInfo*/ nullptr, ParamType, /*TSI*/ nullptr, SC_None,
+        /*DefaultArg*/ nullptr);
+    ParamDecls.push_back(PD);
+  }
+  FnDeclForMangling->setParams(ParamDecls);
+
+  // Get the mangled name.
+  std::string Name;
+  llvm::raw_string_ostream MangledNameStream(Name);
+  std::unique_ptr<MangleContext> Mangler(Context.createMangleContext());
+  Mangler->mangleName(FnDeclForMangling, MangledNameStream);
+  MangledNameStream.flush();
+
+  return Name;
+}
+
 Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
                                             const CallExpr *E,
                                             ReturnValueSlot ReturnValue) {
@@ -773,6 +812,42 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     return EmitRuntimeCall(
         Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID));
   }
+  case Builtin::BI__builtin_get_spirv_spec_constant_bool:
+  case Builtin::BI__builtin_get_spirv_spec_constant_short:
+  case Builtin::BI__builtin_get_spirv_spec_constant_ushort:
+  case Builtin::BI__builtin_get_spirv_spec_constant_int:
+  case Builtin::BI__builtin_get_spirv_spec_constant_uint:
+  case Builtin::BI__builtin_get_spirv_spec_constant_longlong:
+  case Builtin::BI__builtin_get_spirv_spec_constant_ulonglong:
+  case Builtin::BI__builtin_get_spirv_spec_constant_half:
+  case Builtin::BI__builtin_get_spirv_spec_constant_float:
+  case Builtin::BI__builtin_get_spirv_spec_constant_double: {
+    llvm::Function *SpecConstantFn = getSpecConstantFunction(E->getType());
+    llvm::Value *SpecId = EmitScalarExpr(E->getArg(0));
+    llvm::Value *DefaultVal = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Args[] = {SpecId, DefaultVal};
+    return Builder.CreateCall(SpecConstantFn, Args);
+  }
   }
   return nullptr;
 }
+
+llvm::Function *clang::CodeGen::CodeGenFunction::getSpecConstantFunction(
+    const clang::QualType &SpecConstantType) {
+
+  // Find or create the declaration for the function.
+  llvm::Module *M = &CGM.getModule();
+  std::string MangledName =
+      getSpecConstantFunctionName(SpecConstantType, getContext());
+  llvm::Function *SpecConstantFn = M->getFunction(MangledName);
+
+  if (!SpecConstantFn) {
+    llvm::Type *IntType = ConvertType(getContext().IntTy);
+    llvm::Type *RetTy = ConvertType(SpecConstantType);
+    llvm::Type *ArgTypes[] = {IntType, RetTy};
+    llvm::FunctionType *FnTy = llvm::FunctionType::get(RetTy, ArgTypes, false);
+    SpecConstantFn = llvm::Function::Create(
+        FnTy, llvm::GlobalValue::ExternalLinkage, MangledName, M);
+  }
+  return SpecConstantFn;
+}
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index a5ab9df01dba..59f14b3e35fd 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4850,6 +4850,12 @@ public:
   llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitHLSLBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
                                    ReturnValueSlot ReturnValue);
+
+  // Returns a builtin function that the SPIR-V backend will expand into a spec
+  // constant.
+  llvm::Function *
+  getSpecConstantFunction(const clang::QualType &SpecConstantType);
+
   llvm::Value *EmitDirectXBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitSPIRVBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitScalarOrConstFoldImmArg(unsigned ICEArguments, unsigned Idx,
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 1bf72e5bb7b9..e1cccf068b5a 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -2890,6 +2890,8 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
     NewAttr = S.HLSL().mergeWaveSizeAttr(D, *WS, WS->getMin(), WS->getMax(),
                                          WS->getPreferred(),
                                          WS->getSpelledArgsCount());
+  else if (const auto *CI = dyn_cast<HLSLVkConstantIdAttr>(Attr))
+    NewAttr = S.HLSL().mergeVkConstantIdAttr(D, *CI, CI->getId());
   else if (const auto *SA = dyn_cast<HLSLShaderAttr>(Attr))
     NewAttr = S.HLSL().mergeShaderAttr(D, *SA, SA->getType());
   else if (isa<SuppressAttr>(Attr))
@@ -13757,6 +13759,10 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
     return;
   }
 
+  if (getLangOpts().HLSL)
+    if (!HLSL().handleInitialization(VDecl, Init))
+      return;
+
   // Get the decls type and save a reference for later, since
   // CheckInitializerTypes may change it.
   QualType DclT = VDecl->getType(), SavT = DclT;
@@ -14179,6 +14185,13 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
       }
     }
 
+    // HLSL variable with the `vk::constant_id` attribute must be initialized.
+    if (!Var->isInvalidDecl() && Var->hasAttr<HLSLVkConstantIdAttr>()) {
+      Diag(Var->getLocation(), diag::err_specialization_const);
+      Var->setInvalidDecl();
+      return;
+    }
+
     if (!Var->isInvalidDecl() && RealDecl->hasAttr<LoaderUninitializedAttr>()) {
       if (Var->getStorageClass() == SC_Extern) {
         Diag(Var->getLocation(), diag::err_loader_uninitialized_extern_decl)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 1c2fa80e782d..eba29e609cb0 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7590,6 +7590,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_HLSLVkExtBuiltinInput:
     S.HLSL().handleVkExtBuiltinInputAttr(D, AL);
     break;
+  case ParsedAttr::AT_HLSLVkConstantId:
+    S.HLSL().handleVkConstantIdAttr(D, AL);
+    break;
   case ParsedAttr::AT_HLSLSV_GroupThreadID:
     S.HLSL().handleSV_GroupThreadIDAttr(D, AL);
     break;
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index b55f4fd786b5..9b43ee00810b 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -119,6 +119,40 @@ static ResourceClass getResourceClass(RegisterType RT) {
   llvm_unreachable("unexpected RegisterType value");
 }
 
+static Builtin::ID getSpecConstBuiltinId(QualType Type) {
+  const auto *BT = dyn_cast<BuiltinType>(Type);
+  if (!BT) {
+    if (!Type->isEnumeralType())
+      return Builtin::NotBuiltin;
+    return Builtin::BI__builtin_get_spirv_spec_constant_int;
+  }
+
+  switch (BT->getKind()) {
+  case BuiltinType::Bool:
+    return Builtin::BI__builtin_get_spirv_spec_constant_bool;
+  case BuiltinType::Short:
+    return Builtin::BI__builtin_get_spirv_spec_constant_short;
+  case BuiltinType::Int:
+    return Builtin::BI__builtin_get_spirv_spec_constant_int;
+  case BuiltinType::LongLong:
+    return Builtin::BI__builtin_get_spirv_spec_constant_longlong;
+  case BuiltinType::UShort:
+    return Builtin::BI__builtin_get_spirv_spec_constant_ushort;
+  case BuiltinType::UInt:
+    return Builtin::BI__builtin_get_spirv_spec_constant_uint;
+  case BuiltinType::ULongLong:
+    return Builtin::BI__builtin_get_spirv_spec_constant_ulonglong;
+  case BuiltinType::Half:
+    return Builtin::BI__builtin_get_spirv_spec_constant_half;
+  case BuiltinType::Float:
+    return Builtin::BI__builtin_get_spirv_spec_constant_float;
+  case BuiltinType::Double:
+    return Builtin::BI__builtin_get_spirv_spec_constant_double;
+  default:
+    return Builtin::NotBuiltin;
+  }
+}
+
 DeclBindingInfo *ResourceBindings::addDeclBindingInfo(const VarDecl *VD,
                                                       ResourceClass ResClass) {
   assert(getDeclBindingInfo(VD, ResClass) == nullptr &&
@@ -607,6 +641,41 @@ HLSLWaveSizeAttr *SemaHLSL::mergeWaveSizeAttr(Decl *D,
   return Result;
 }
 
+HLSLVkConstantIdAttr *
+SemaHLSL::mergeVkConstantIdAttr(Decl *D, const AttributeCommonInfo &AL,
+                                int Id) {
+
+  auto &TargetInfo = getASTContext().getTargetInfo();
+  if (TargetInfo.getTriple().getArch() != llvm::Triple::spirv) {
+    Diag(AL.getLoc(), diag::warn_attribute_ignored) << AL;
+    return nullptr;
+  }
+
+  auto *VD = cast<VarDecl>(D);
+
+  if (getSpecConstBuiltinId(VD->getType()) == Builtin::NotBuiltin) {
+    Diag(VD->getLocation(), diag::err_specialization_const);
+    return nullptr;
+  }
+
+  if (!VD->getType().isConstQualified()) {
+    Diag(VD->getLocation(), diag::err_specialization_const);
+    return nullptr;
+  }
+
+  if (HLSLVkConstantIdAttr *CI = D->getAttr<HLSLVkConstantIdAttr>()) {
+    if (CI->getId() != Id) {
+      Diag(CI->getLocation(), diag::err_hlsl_attribute_param_mismatch) << AL;
+      Diag(AL.getLoc(), diag::note_conflicting_attribute);
+    }
+    return nullptr;
+  }
+
+  HLSLVkConstantIdAttr *Result =
+      ::new (getASTContext()) HLSLVkConstantIdAttr(getASTContext(), AL, Id);
+  return Result;
+}
+
 HLSLShaderAttr *
 SemaHLSL::mergeShaderAttr(Decl *D, const AttributeCommonInfo &AL,
                           llvm::Triple::EnvironmentType ShaderType) {
@@ -1157,6 +1226,15 @@ void SemaHLSL::handleVkExtBuiltinInputAttr(Decl *D, const ParsedAttr &AL) {
                  HLSLVkExtBuiltinInputAttr(getASTContext(), AL, ID));
 }
 
+void SemaHLSL::handleVkConstantIdAttr(Decl *D, const ParsedAttr &AL) {
+  uint32_t Id;
+  if (!SemaRef.checkUInt32Argument(AL, AL.getArgAsExpr(0), Id))
+    return;
+  HLSLVkConstantIdAttr *NewAttr = mergeVkConstantIdAttr(D, AL, Id);
+  if (NewAttr)
+    D->addAttr(NewAttr);
+}
+
 bool SemaHLSL::diagnoseInputIDType(QualType T, const ParsedAttr &AL) {
   const auto *VT = T->getAs<VectorType>();
 
@@ -3206,6 +3284,7 @@ static bool IsDefaultBufferConstantDecl(VarDecl *VD) {
   return VD->getDeclContext()->isTranslationUnit() &&
          QT.getAddressSpace() == LangAS::Default &&
          VD->getStorageClass() != SC_Static &&
+         !VD->hasAttr<HLSLVkConstantIdAttr>() &&
          !isInvalidConstantBufferLeafElementType(QT.getTypePtr());
 }
 
@@ -3273,7 +3352,8 @@ void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) {
     const Type *VarType = VD->getType().getTypePtr();
     while (VarType->isArrayType())
       VarType = VarType->getArrayElementTypeNoTypeQual();
-    if (VarType->isHLSLResourceRecord()) {
+    if (VarType->isHLSLResourceRecord() ||
+        VD->hasAttr<HLSLVkConstantIdAttr>()) {
       // Make the variable for resources static. The global externally visible
       // storage is accessed through the handle, which is a member. The variable
       // itself is not externally visible.
@@ -3696,3 +3776,41 @@ bool SemaHLSL::transformInitList(const InitializedEntity &Entity,
     Init->updateInit(Ctx, I, NewInit->getInit(I));
   return true;
 }
+
+bool SemaHLSL::handleInitialization(VarDecl *VDecl, Expr *&Init) {
+  const HLSLVkConstantIdAttr *ConstIdAttr =
+      VDecl->getAttr<HLSLVkConstantIdAttr>();
+  if (!ConstIdAttr)
+    return true;
+
+  ASTContext &Context = SemaRef.getASTContext();
+
+  APValue InitValue;
+  if (!Init->isCXX11ConstantExpr(Context, &InitValue)) {
+    Diag(VDecl->getLocation(), diag::err_specialization_const);
+    VDecl->setInvalidDecl();
+    return false;
+  }
+
+  Builtin::ID BID = getSpecConstBuiltinId(VDecl->getType());
+
+  // Argument 1: The ID from the attribute
+  int ConstantID = ConstIdAttr->getId();
+  llvm::APInt IDVal(Context.getIntWidth(Context.IntTy), ConstantID);
+  Expr *IdExpr = IntegerLiteral::Create(Context, IDVal, Context.IntTy,
+                                        ConstIdAttr->getLocation());
+
+  SmallVector<Expr *, 2> Args = {IdExpr, Init};
+  Expr *C = SemaRef.BuildBuiltinCallExpr(Init->getExprLoc(), BID, Args);
+  if (C->getType()->getCanonicalTypeUnqualified() !=
+      VDecl->getType()->getCanonicalTypeUnqualified()) {
+    C = SemaRef
+            .BuildCStyleCastExpr(SourceLocation(),
+                                 Context.getTrivialTypeSourceInfo(
+                                     Init->getType(), Init->getExprLoc()),
+                                 SourceLocation(), C)
+            .get();
+  }
+  Init = C;
+  return true;
+}
diff --git a/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
new file mode 100644
index 000000000000..c0955c1ea7b4
--- /dev/null
+++ b/clang/test/AST/HLSL/vk.spec-constant.usage.hlsl
@@ -0,0 +1,130 @@
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute -x hlsl -ast-dump -o - %s | FileCheck %s
+
+// CHECK: VarDecl {{.*}} bool_const 'const hlsl_private bool' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'bool'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'bool (*)(unsigned int, bool) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'bool (unsigned int, bool) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_bool' 'bool (unsigned int, bool) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 1
+// CHECK-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+[[vk::constant_id(1)]]
+const bool bool_const = true;
+
+// CHECK: VarDecl {{.*}} short_const 'const hlsl_private short' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'short'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'short (*)(unsigned int, short) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'short (unsigned int, short) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_short' 'short (unsigned int, short) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'short' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4
+[[vk::constant_id(2)]]
+const short short_const = 4;
+
+// CHECK: VarDecl {{.*}} int_const 'const hlsl_private int' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int (*)(unsigned int, int) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int (unsigned int, int) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_int' 'int (unsigned int, int) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 5
+[[vk::constant_id(3)]]
+const int int_const = 5;
+
+// CHECK: VarDecl {{.*}} long_const 'const hlsl_private long long' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'long long'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'long long (*)(unsigned int, long long) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'long long (unsigned int, long long) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_longlong' 'long long (unsigned int, long long) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'long long' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8
+[[vk::constant_id(4)]]
+const long long long_const = 8;
+
+// CHECK: VarDecl {{.*}} ushort_const 'const hlsl_private unsigned short' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'unsigned short'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned short (*)(unsigned int, unsigned short) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned short (unsigned int, unsigned short) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_ushort' 'unsigned short (unsigned int, unsigned short) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 5
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned short' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 10
+[[vk::constant_id(5)]]
+const unsigned short ushort_const = 10;
+
+// CHECK: VarDecl {{.*}} uint_const 'const hlsl_private unsigned int' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'unsigned int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int (*)(unsigned int, unsigned int) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int (unsigned int, unsigned int) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_uint' 'unsigned int (unsigned int, unsigned int) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 12
+[[vk::constant_id(6)]]
+const unsigned int uint_const = 12;
+
+
+// CHECK: VarDecl {{.*}} ulong_const 'const hlsl_private unsigned long long' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'unsigned long long'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned long long (*)(unsigned int, unsigned long long) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned long long (unsigned int, unsigned long long) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_ulonglong' 'unsigned long long (unsigned int, unsigned long long) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 7
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned long long' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 25
+[[vk::constant_id(7)]]
+const unsigned long long ulong_const = 25;
+
+// CHECK: VarDecl {{.*}} half_const 'const hlsl_private half' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'half'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half (*)(unsigned int, half) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'half (unsigned int, half) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_half' 'half (unsigned int, half) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half' <FloatingCast>
+// CHECK-NEXT: FloatingLiteral {{.*}} 'float' 4.040000e+01
+[[vk::constant_id(8)]]
+const half half_const = 40.4;
+
+// CHECK: VarDecl {{.*}} float_const 'const hlsl_private float' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'float'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float (*)(unsigned int, float) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float (unsigned int, float) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_float' 'float (unsigned int, float) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 8
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 50
+[[vk::constant_id(8)]]
+const float float_const = 50;
+
+// CHECK: VarDecl {{.*}} double_const 'const hlsl_private double' static cinit
+// CHECK-NEXT: CallExpr {{.*}} 'double'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'double (*)(unsigned int, double) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'double (unsigned int, double) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_double' 'double (unsigned int, double) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 9
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'double' <IntegralToFloating>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 100
+[[vk::constant_id(9)]]
+const double double_const = 100;
+
+// CHECK: VarDecl {{.*}} enum_const 'const hlsl_private E' static cinit
+// CHECK-NEXT: CStyleCastExpr {{.*}} 'E' <IntegralCast>
+// CHECK-NEXT: CallExpr {{.*}} 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int (*)(unsigned int, int) noexcept' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int (unsigned int, int) noexcept' lvalue Function {{.*}} '__builtin_get_spirv_spec_constant_int' 'int (unsigned int, int) noexcept'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 10 
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <IntegralCast>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'E' EnumConstant {{.*}} 'e2' 'E' 
+enum E {
+    e0 = 10,
+    e1 = 20,
+    e2 = 30
+};
+
+[[vk::constant_id(10)]]
+const E enum_const = e2;
+
+// CHECK-NOT: CXXRecordDecl {{.*}} implicit struct __cblayout_$Globals definition
diff --git a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.alignment.hlsl b/clang/test/CodeGenHLSL/vk-features/SpirvType.alignment.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/inline-spirv/SpirvType.alignment.hlsl
rename to clang/test/CodeGenHLSL/vk-features/SpirvType.alignment.hlsl
diff --git a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl b/clang/test/CodeGenHLSL/vk-features/SpirvType.hlsl
similarity index 100%
rename from clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
rename to clang/test/CodeGenHLSL/vk-features/SpirvType.hlsl
diff --git a/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
new file mode 100644
index 000000000000..cbc1fa61eae2
--- /dev/null
+++ b/clang/test/CodeGenHLSL/vk-features/vk.spec-constant.hlsl
@@ -0,0 +1,210 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s
+
+[[vk::constant_id(1)]]
+const bool bool_const = true;
+
+[[vk::constant_id(1)]]
+const short short_const = 4;
+
+[[vk::constant_id(3)]]
+const int int_const = 5;
+
+[[vk::constant_id(4)]]
+const long long long_const = 8;
+
+[[vk::constant_id(5)]]
+const unsigned short ushort_const = 10;
+
+[[vk::constant_id(6)]]
+const unsigned int uint_const = 12;
+
+[[vk::constant_id(7)]]
+const unsigned long long ulong_const = 25;
+
+[[vk::constant_id(8)]]
+const half half_const = 40.4;
+
+[[vk::constant_id(8)]]
+const float float_const = 50.5;
+
+[[vk::constant_id(9)]]
+const double double_const = 100.2;
+
+enum E {
+    e0 = 10,
+    e1 = 20,
+    e2 = 30
+};
+
+[[vk::constant_id(10)]]
+const E enum_const = e2;
+
+[numthreads(1,1,1)]
+void main() {
+    bool b = bool_const;
+    short s = short_const;
+    int i = int_const;
+    long long l = long_const;
+    unsigned short us = ushort_const;
+    unsigned int ui = uint_const;
+    unsigned long long ul = ulong_const;
+    half h = half_const;
+    float f = float_const;
+    double d = double_const;
+    E e = enum_const;
+}
+//.
+// CHECK: @_ZL10bool_const = internal addrspace(10) global i32 0, align 4
+// CHECK: @_ZL11short_const = internal addrspace(10) global i16 0, align 2
+// CHECK: @_ZL9int_const = internal addrspace(10) global i32 0, align 4
+// CHECK: @_ZL10long_const = internal addrspace(10) global i64 0, align 8
+// CHECK: @_ZL12ushort_const = internal addrspace(10) global i16 0, align 2
+// CHECK: @_ZL10uint_const = internal addrspace(10) global i32 0, align 4
+// CHECK: @_ZL11ulong_const = internal addrspace(10) global i64 0, align 8
+// CHECK: @_ZL10half_const = internal addrspace(10) global float 0.000000e+00, align 4
+// CHECK: @_ZL11float_const = internal addrspace(10) global float 0.000000e+00, align 4
+// CHECK: @_ZL12double_const = internal addrspace(10) global double 0.000000e+00, align 8
+// CHECK: @_ZL10enum_const = internal addrspace(10) global i32 0, align 4
+//.
+// CHECK-LABEL: define internal spir_func void @_Z4mainv(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[S:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[L:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[US:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[UI:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[UL:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[H:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[F:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[D:%.*]] = alloca double, align 8
+// CHECK-NEXT:    [[E:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(10) @_ZL10bool_const, align 4
+// CHECK-NEXT:    [[LOADEDV:%.*]] = trunc i32 [[TMP1]] to i1
+// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[LOADEDV]] to i32
+// CHECK-NEXT:    store i32 [[STOREDV]], ptr [[B]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(10) @_ZL11short_const, align 2
+// CHECK-NEXT:    store i16 [[TMP2]], ptr [[S]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(10) @_ZL9int_const, align 4
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr addrspace(10) @_ZL10long_const, align 8
+// CHECK-NEXT:    store i64 [[TMP4]], ptr [[L]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(10) @_ZL12ushort_const, align 2
+// CHECK-NEXT:    store i16 [[TMP5]], ptr [[US]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(10) @_ZL10uint_const, align 4
+// CHECK-NEXT:    store i32 [[TMP6]], ptr [[UI]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(10) @_ZL11ulong_const, align 8
+// CHECK-NEXT:    store i64 [[TMP7]], ptr [[UL]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr addrspace(10) @_ZL10half_const, align 4
+// CHECK-NEXT:    store float [[TMP8]], ptr [[H]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(10) @_ZL11float_const, align 4
+// CHECK-NEXT:    store float [[TMP9]], ptr [[F]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr addrspace(10) @_ZL12double_const, align 8
+// CHECK-NEXT:    store double [[TMP10]], ptr [[D]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(10) @_ZL10enum_const, align 4
+// CHECK-NEXT:    store i32 [[TMP11]], ptr [[E]], align 4
+// CHECK-NEXT:    ret void
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init(
+// CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i1 @_Z20__spirv_SpecConstantib(i32 1, i1 true)
+// CHECK-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP1]] to i32
+// CHECK-NEXT:    store i32 [[STOREDV]], ptr addrspace(10) @_ZL10bool_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.1(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @_Z20__spirv_SpecConstantis(i32 1, i16 4)
+// CHECK-NEXT:    store i16 [[TMP1]], ptr addrspace(10) @_ZL11short_const, align 2
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.2(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @_Z20__spirv_SpecConstantii(i32 3, i32 5)
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(10) @_ZL9int_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.3(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @_Z20__spirv_SpecConstantix(i32 4, i64 8)
+// CHECK-NEXT:    store i64 [[TMP1]], ptr addrspace(10) @_ZL10long_const, align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.4(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i16 @_Z20__spirv_SpecConstantit(i32 5, i16 10)
+// CHECK-NEXT:    store i16 [[TMP1]], ptr addrspace(10) @_ZL12ushort_const, align 2
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.5(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @_Z20__spirv_SpecConstantij(i32 6, i32 12)
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(10) @_ZL10uint_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.6(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @_Z20__spirv_SpecConstantiy(i32 7, i64 25)
+// CHECK-NEXT:    store i64 [[TMP1]], ptr addrspace(10) @_ZL11ulong_const, align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.7(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz arcp afn float @_Z20__spirv_SpecConstantiDh(i32 8, float 0x4044333340000000)
+// CHECK-NEXT:    store float [[TMP1]], ptr addrspace(10) @_ZL10half_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.8(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz arcp afn float @_Z20__spirv_SpecConstantif(i32 8, float 5.050000e+01)
+// CHECK-NEXT:    store float [[TMP1]], ptr addrspace(10) @_ZL11float_const, align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.9(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz arcp afn double @_Z20__spirv_SpecConstantid(i32 9, double 0x40590CCCC0000000)
+// CHECK-NEXT:    store double [[TMP1]], ptr addrspace(10) @_ZL12double_const, align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define internal spir_func void @__cxx_global_var_init.10(
+// CHECK-SAME: ) #[[ATTR3]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.experimental.convergence.entry()
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @_Z20__spirv_SpecConstantii(i32 10, i32 30)
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(10) @_ZL10enum_const, align 4
+// CHECK-NEXT:    ret void
diff --git a/clang/test/SemaHLSL/vk.spec-constant.error.hlsl b/clang/test/SemaHLSL/vk.spec-constant.error.hlsl
new file mode 100644
index 000000000000..24873d272a54
--- /dev/null
+++ b/clang/test/SemaHLSL/vk.spec-constant.error.hlsl
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan1.3-compute -verify %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.8-compute -verify %s
+
+#ifndef __spirv__
+// expected-warning@+2{{'constant_id' attribute ignored}}
+#endif
+[[vk::constant_id(0)]]
+const bool sc0 = true;
+
+#ifdef __spirv__
+// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
+[[vk::constant_id(1)]]
+const bool sc1 = sc0; // error
+
+// expected-warning@+1{{'constant_id' attribute only applies to external global variables}}
+[[vk::constant_id(2)]]
+static const bool sc2 = false; // error
+
+// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
+[[vk::constant_id(3)]]
+const bool sc3; // error
+
+// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
+[[vk::constant_id(4)]]
+bool sc4 = false; // error
+
+// expected-error@+2{{variable with 'vk::constant_id' attribute must be a const int/float/enum/bool and be initialized with a literal}}
+[[vk::constant_id(5)]]
+const int2 sc5 = {0,0}; // error
+
+[numthreads(1,1,1)]
+void main() {
+  // expected-warning@+1{{'constant_id' attribute only applies to external global variables}}
+  [[vk::constant_id(6)]]
+  const bool sc6 = false; // error
+}
+#endif

From c1ac87b327861a7387c1ab9e1ffb1c002acbcd6a Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Thu, 19 Jun 2025 12:05:58 -0400
Subject: [PATCH 0955/1322] [libc] Fix setjmp build order. (#144917)

Fix build order issue from
https://github.com/llvm/llvm-project/pull/139555.
---
 libc/src/setjmp/CMakeLists.txt | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libc/src/setjmp/CMakeLists.txt b/libc/src/setjmp/CMakeLists.txt
index 50c827254da6..8b8e74f0955e 100644
--- a/libc/src/setjmp/CMakeLists.txt
+++ b/libc/src/setjmp/CMakeLists.txt
@@ -1,7 +1,4 @@
 # Process architecture-specific subdirectory FIRST to avoid missing targets.
-if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
-endif()
 
 # Then process OS-specific subdirectory
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
@@ -14,6 +11,10 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   )
 endif()
 
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
+endif()
+
 add_entrypoint_object(
   setjmp
   ALIAS

From b8337349d9b6143669e8bfa6776926a708cacf99 Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Thu, 19 Jun 2025 12:18:39 -0400
Subject: [PATCH 0956/1322] [libc][math] Skip setting errno and floating point
 exception for math functions when LIBC_MATH flag has LIBC_MATH_NO_ERRNO and
 LIBC_MATH_NO_EXCEPT. (#144920)

---
 libc/src/__support/FPUtil/FEnvImpl.h     | 17 +++++++++++++----
 libc/src/__support/macros/optimization.h |  8 ++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index 50a101f833c5..ba145a3da45c 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -15,6 +15,7 @@
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
 #include "src/__support/macros/properties/architectures.h"
 
 #if defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP)
@@ -71,27 +72,35 @@ LIBC_INLINE int set_env(const fenv_t *) { return 0; }
 namespace LIBC_NAMESPACE_DECL {
 namespace fputil {
 
-LIBC_INLINE int clear_except_if_required(int excepts) {
+LIBC_INLINE static int clear_except_if_required([[maybe_unused]] int excepts) {
+#ifndef LIBC_MATH_HAS_NO_EXCEPT
   if (math_errhandling & MATH_ERREXCEPT)
     return clear_except(excepts);
+#endif // LIBC_MATH_HAS_NO_EXCEPT
   return 0;
 }
 
-LIBC_INLINE int set_except_if_required(int excepts) {
+LIBC_INLINE static int set_except_if_required([[maybe_unused]] int excepts) {
+#ifndef LIBC_MATH_HAS_NO_EXCEPT
   if (math_errhandling & MATH_ERREXCEPT)
     return set_except(excepts);
+#endif // LIBC_MATH_HAS_NO_EXCEPT
   return 0;
 }
 
-LIBC_INLINE int raise_except_if_required(int excepts) {
+LIBC_INLINE static int raise_except_if_required([[maybe_unused]] int excepts) {
+#ifndef LIBC_MATH_HAS_NO_EXCEPT
   if (math_errhandling & MATH_ERREXCEPT)
     return raise_except(excepts);
+#endif // LIBC_MATH_HAS_NO_EXCEPT
   return 0;
 }
 
-LIBC_INLINE void set_errno_if_required(int err) {
+LIBC_INLINE static void set_errno_if_required([[maybe_unused]] int err) {
+#ifndef LIBC_MATH_HAS_NO_ERRNO
   if (math_errhandling & MATH_ERRNO)
     libc_errno = err;
+#endif // LIBC_MATH_HAS_NO_ERRNO
 }
 
 } // namespace fputil
diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h
index 253843e5e37a..db008d323b3a 100644
--- a/libc/src/__support/macros/optimization.h
+++ b/libc/src/__support/macros/optimization.h
@@ -63,4 +63,12 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) {
 #define LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT
 #endif
 
+#if (LIBC_MATH & LIBC_MATH_NO_ERRNO)
+#define LIBC_MATH_HAS_NO_ERRNO
+#endif
+
+#if (LIBC_MATH & LIBC_MATH_NO_EXCEPT)
+#define LIBC_MATH_HAS_NO_EXCEPT
+#endif
+
 #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H

From 5cf7d871b030212d021ffc9356620551f09ad402 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Thu, 19 Jun 2025 09:25:04 -0700
Subject: [PATCH 0957/1322] [NFC][Clang][AST] Adopt simplified
 `getTrailingObjects` in AST (#144432)

Adopt simplified `getTrailingObjects` API in several places in clag/AST
that were missed by earlier changes.
---
 clang/include/clang/AST/DeclTemplate.h  | 16 +++-------
 clang/include/clang/AST/Expr.h          |  7 ++---
 clang/include/clang/AST/OpenACCClause.h |  2 +-
 clang/include/clang/AST/OpenMPClause.h  | 42 +++++++------------------
 clang/include/clang/AST/StmtOpenACC.h   |  2 +-
 clang/include/clang/AST/Type.h          |  6 ++--
 clang/lib/AST/ASTImporter.cpp           |  2 +-
 clang/lib/AST/Decl.cpp                  |  2 +-
 clang/lib/AST/ExprCXX.cpp               |  6 ++--
 clang/lib/AST/OpenMPClause.cpp          | 12 +++----
 clang/lib/AST/Type.cpp                  |  2 +-
 11 files changed, 33 insertions(+), 66 deletions(-)

diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index 8d8b1ca93882..939b14b0351d 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -279,7 +279,7 @@ public:
 
   /// Produce this as an array ref.
   ArrayRef<TemplateArgument> asArray() const {
-    return llvm::ArrayRef(data(), size());
+    return getTrailingObjects(size());
   }
 
   /// Retrieve the number of template arguments in this
@@ -287,9 +287,7 @@ public:
   unsigned size() const { return NumArguments; }
 
   /// Retrieve a pointer to the template argument list.
-  const TemplateArgument *data() const {
-    return getTrailingObjects<TemplateArgument>();
-  }
+  const TemplateArgument *data() const { return getTrailingObjects(); }
 };
 
 void *allocateDefaultArgStorageChain(const ASTContext &C);
@@ -505,12 +503,10 @@ private:
         TemplateArgumentsAsWritten(TemplateArgsAsWritten),
         PointOfInstantiation(POI) {
     if (MSInfo)
-      getTrailingObjects<MemberSpecializationInfo *>()[0] = MSInfo;
+      getTrailingObjects()[0] = MSInfo;
   }
 
-  size_t numTrailingObjects(OverloadToken<MemberSpecializationInfo*>) const {
-    return Function.getInt();
-  }
+  size_t numTrailingObjects() const { return Function.getInt(); }
 
 public:
   friend TrailingObjects;
@@ -597,9 +593,7 @@ public:
   /// function and the function template, and should always be
   /// TSK_ExplicitSpecialization whenever we have MemberSpecializationInfo.
   MemberSpecializationInfo *getMemberSpecializationInfo() const {
-    return numTrailingObjects(OverloadToken<MemberSpecializationInfo *>())
-               ? getTrailingObjects<MemberSpecializationInfo *>()[0]
-               : nullptr;
+    return numTrailingObjects() ? getTrailingObjects()[0] : nullptr;
   }
 
   void Profile(llvm::FoldingSetNodeID &ID) {
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 9fc23d30b733..41e50359962e 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -7364,17 +7364,14 @@ public:
                               ArrayRef<Expr *> SubExprs);
   static RecoveryExpr *CreateEmpty(ASTContext &Ctx, unsigned NumSubExprs);
 
-  ArrayRef<Expr *> subExpressions() {
-    auto *B = getTrailingObjects<Expr *>();
-    return llvm::ArrayRef(B, B + NumExprs);
-  }
+  ArrayRef<Expr *> subExpressions() { return getTrailingObjects(NumExprs); }
 
   ArrayRef<const Expr *> subExpressions() const {
     return const_cast<RecoveryExpr *>(this)->subExpressions();
   }
 
   child_range children() {
-    Stmt **B = reinterpret_cast<Stmt **>(getTrailingObjects<Expr *>());
+    Stmt **B = reinterpret_cast<Stmt **>(getTrailingObjects());
     return child_range(B, B + NumExprs);
   }
 
diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h
index 67fbdfeb0702..a778c7cc3dc9 100644
--- a/clang/include/clang/AST/OpenACCClause.h
+++ b/clang/include/clang/AST/OpenACCClause.h
@@ -307,7 +307,7 @@ public:
   }
 
   ArrayRef<DeviceTypeArgument> getArchitectures() const {
-    return getTrailingObjects<DeviceTypeArgument>(NumArchs);
+    return getTrailingObjects(NumArchs);
   }
 
   static OpenACCDeviceTypeClause *
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 6fd16bc0f03b..2fa8fa529741 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -302,8 +302,7 @@ protected:
   void setVarRefs(ArrayRef<Expr *> VL) {
     assert(VL.size() == NumVars &&
            "Number of variables is not the same as the preallocated buffer");
-    std::copy(VL.begin(), VL.end(),
-              static_cast<T *>(this)->template getTrailingObjects<Expr *>());
+    llvm::copy(VL, getVarRefs().begin());
   }
 
 public:
@@ -388,9 +387,7 @@ public:
     assert(
         DK.size() == NumKinds &&
         "Number of directive kinds is not the same as the preallocated buffer");
-    std::copy(DK.begin(), DK.end(),
-              static_cast<T *>(this)
-                  ->template getTrailingObjects<OpenMPDirectiveKind>());
+    std::copy(DK.begin(), DK.end(), getDirectiveKinds().begin());
   }
 
   SourceLocation getLParenLoc() { return LParenLoc; }
@@ -980,20 +977,14 @@ public:
 
   /// Returns the tile size expressions.
   MutableArrayRef<Expr *> getSizesRefs() {
-    return static_cast<OMPSizesClause *>(this)
-        ->template getTrailingObjects<Expr *>(NumSizes);
-  }
-  ArrayRef<Expr *> getSizesRefs() const {
-    return static_cast<const OMPSizesClause *>(this)
-        ->template getTrailingObjects<Expr *>(NumSizes);
+    return getTrailingObjects(NumSizes);
   }
+  ArrayRef<Expr *> getSizesRefs() const { return getTrailingObjects(NumSizes); }
 
   /// Sets the tile size expressions.
   void setSizesRefs(ArrayRef<Expr *> VL) {
     assert(VL.size() == NumSizes);
-    std::copy(VL.begin(), VL.end(),
-              static_cast<OMPSizesClause *>(this)
-                  ->template getTrailingObjects<Expr *>());
+    llvm::copy(VL, getSizesRefs().begin());
   }
 
   child_range children() {
@@ -1043,8 +1034,7 @@ class OMPPermutationClause final
   /// Sets the permutation index expressions.
   void setArgRefs(ArrayRef<Expr *> VL) {
     assert(VL.size() == NumLoops && "Expecting one expression per loop");
-    llvm::copy(VL, static_cast<OMPPermutationClause *>(this)
-                       ->template getTrailingObjects<Expr *>());
+    llvm::copy(VL, getTrailingObjects());
   }
 
   /// Build an empty clause.
@@ -1083,14 +1073,8 @@ public:
 
   /// Returns the permutation index expressions.
   ///@{
-  MutableArrayRef<Expr *> getArgsRefs() {
-    return static_cast<OMPPermutationClause *>(this)
-        ->template getTrailingObjects<Expr *>(NumLoops);
-  }
-  ArrayRef<Expr *> getArgsRefs() const {
-    return static_cast<const OMPPermutationClause *>(this)
-        ->template getTrailingObjects<Expr *>(NumLoops);
-  }
+  MutableArrayRef<Expr *> getArgsRefs() { return getTrailingObjects(NumLoops); }
+  ArrayRef<Expr *> getArgsRefs() const { return getTrailingObjects(NumLoops); }
   ///@}
 
   child_range children() {
@@ -9239,9 +9223,7 @@ class OMPAffinityClause final
                                             SourceLocation(), N) {}
 
   /// Sets the affinity modifier for the clause, if any.
-  void setModifier(Expr *E) {
-    getTrailingObjects<Expr *>()[varlist_size()] = E;
-  }
+  void setModifier(Expr *E) { getTrailingObjects()[varlist_size()] = E; }
 
   /// Sets the location of ':' symbol.
   void setColonLoc(SourceLocation Loc) { ColonLoc = Loc; }
@@ -9268,10 +9250,8 @@ public:
   static OMPAffinityClause *CreateEmpty(const ASTContext &C, unsigned N);
 
   /// Gets affinity modifier.
-  Expr *getModifier() { return getTrailingObjects<Expr *>()[varlist_size()]; }
-  Expr *getModifier() const {
-    return getTrailingObjects<Expr *>()[varlist_size()];
-  }
+  Expr *getModifier() { return getTrailingObjects()[varlist_size()]; }
+  Expr *getModifier() const { return getTrailingObjects()[varlist_size()]; }
 
   /// Gets the location of ':' symbol.
   SourceLocation getColonLoc() const { return ColonLoc; }
diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h
index c8f8b968b1c8..9ad3d8e00d98 100644
--- a/clang/include/clang/AST/StmtOpenACC.h
+++ b/clang/include/clang/AST/StmtOpenACC.h
@@ -736,7 +736,7 @@ class OpenACCUpdateConstruct final
                              OpenACCDirectiveKind::Update, SourceLocation{},
                              SourceLocation{}, SourceLocation{}) {
     std::uninitialized_value_construct_n(getTrailingObjects(), NumClauses);
-    setClauseList(getTrailingObjects<const OpenACCClause *>(NumClauses));
+    setClauseList(getTrailingObjects(NumClauses));
   }
 
   OpenACCUpdateConstruct(SourceLocation Start, SourceLocation DirectiveLoc,
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 3896cd914bf0..35a8b898d8e1 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -6052,9 +6052,7 @@ public:
                       ArrayRef<QualType> Expansions);
 
 private:
-  const QualType *getExpansionsPtr() const {
-    return getTrailingObjects<QualType>();
-  }
+  const QualType *getExpansionsPtr() const { return getTrailingObjects(); }
 
   static TypeDependence computeDependence(QualType Pattern, Expr *IndexExpr,
                                           ArrayRef<QualType> Expansions = {});
@@ -6494,7 +6492,7 @@ public:
   uint32_t getSize() const { return Size; }
   uint32_t getAlignment() const { return Alignment; }
   ArrayRef<SpirvOperand> getOperands() const {
-    return getTrailingObjects<SpirvOperand>(NumOperands);
+    return getTrailingObjects(NumOperands);
   }
 
   bool isSugared() const { return false; }
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 5c44353d8b98..96a5e2eeaa4d 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -4452,7 +4452,7 @@ ExpectedDecl ASTNodeImporter::VisitFriendDecl(FriendDecl *D) {
   }
 
   SmallVector<TemplateParameterList *, 1> ToTPLists(D->NumTPLists);
-  auto **FromTPLists = D->getTrailingObjects<TemplateParameterList *>();
+  auto **FromTPLists = D->getTrailingObjects();
   for (unsigned I = 0; I < D->NumTPLists; I++) {
     if (auto ListOrErr = import(FromTPLists[I]))
       ToTPLists[I] = *ListOrErr;
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 860968939b4a..c4376aab480c 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -5380,7 +5380,7 @@ PragmaCommentDecl *PragmaCommentDecl::Create(const ASTContext &C,
       new (C, DC, additionalSizeToAlloc<char>(Arg.size() + 1))
           PragmaCommentDecl(DC, CommentLoc, CommentKind);
   memcpy(PCD->getTrailingObjects(), Arg.data(), Arg.size());
-  PCD->getTrailingObjects<char>()[Arg.size()] = '\0';
+  PCD->getTrailingObjects()[Arg.size()] = '\0';
   return PCD;
 }
 
diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp
index 43b1c39d7379..2b66445fe253 100644
--- a/clang/lib/AST/ExprCXX.cpp
+++ b/clang/lib/AST/ExprCXX.cpp
@@ -806,8 +806,7 @@ CXXDynamicCastExpr *CXXDynamicCastExpr::Create(const ASTContext &C, QualType T,
       new (Buffer) CXXDynamicCastExpr(T, VK, K, Op, PathSize, WrittenTy, L,
                                       RParenLoc, AngleBrackets);
   if (PathSize)
-    llvm::uninitialized_copy(*BasePath,
-                             E->getTrailingObjects<CXXBaseSpecifier *>());
+    llvm::uninitialized_copy(*BasePath, E->getTrailingObjects());
   return E;
 }
 
@@ -869,8 +868,7 @@ CXXReinterpretCastExpr::Create(const ASTContext &C, QualType T,
       new (Buffer) CXXReinterpretCastExpr(T, VK, K, Op, PathSize, WrittenTy, L,
                                           RParenLoc, AngleBrackets);
   if (PathSize)
-    llvm::uninitialized_copy(*BasePath,
-                             E->getTrailingObjects<CXXBaseSpecifier *>());
+    llvm::uninitialized_copy(*BasePath, E->getTrailingObjects());
   return E;
 }
 
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index 0e5052b94416..f714974b9476 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -370,26 +370,26 @@ OMPOrderedClause *OMPOrderedClause::CreateEmpty(const ASTContext &C,
 void OMPOrderedClause::setLoopNumIterations(unsigned NumLoop,
                                             Expr *NumIterations) {
   assert(NumLoop < NumberOfLoops && "out of loops number.");
-  getTrailingObjects<Expr *>()[NumLoop] = NumIterations;
+  getTrailingObjects()[NumLoop] = NumIterations;
 }
 
 ArrayRef<Expr *> OMPOrderedClause::getLoopNumIterations() const {
-  return getTrailingObjects<Expr *>(NumberOfLoops);
+  return getTrailingObjects(NumberOfLoops);
 }
 
 void OMPOrderedClause::setLoopCounter(unsigned NumLoop, Expr *Counter) {
   assert(NumLoop < NumberOfLoops && "out of loops number.");
-  getTrailingObjects<Expr *>()[NumberOfLoops + NumLoop] = Counter;
+  getTrailingObjects()[NumberOfLoops + NumLoop] = Counter;
 }
 
 Expr *OMPOrderedClause::getLoopCounter(unsigned NumLoop) {
   assert(NumLoop < NumberOfLoops && "out of loops number.");
-  return getTrailingObjects<Expr *>()[NumberOfLoops + NumLoop];
+  return getTrailingObjects()[NumberOfLoops + NumLoop];
 }
 
 const Expr *OMPOrderedClause::getLoopCounter(unsigned NumLoop) const {
   assert(NumLoop < NumberOfLoops && "out of loops number.");
-  return getTrailingObjects<Expr *>()[NumberOfLoops + NumLoop];
+  return getTrailingObjects()[NumberOfLoops + NumLoop];
 }
 
 OMPUpdateClause *OMPUpdateClause::Create(const ASTContext &C,
@@ -1678,7 +1678,7 @@ OMPInitClause *OMPInitClause::Create(const ASTContext &C, Expr *InteropVar,
       InteropInfo.IsTarget, InteropInfo.IsTargetSync, StartLoc, LParenLoc,
       VarLoc, EndLoc, InteropInfo.PreferTypes.size() + 1);
   Clause->setInteropVar(InteropVar);
-  llvm::copy(InteropInfo.PreferTypes, Clause->getTrailingObjects<Expr *>() + 1);
+  llvm::copy(InteropInfo.PreferTypes, Clause->getTrailingObjects() + 1);
   return Clause;
 }
 
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index a461dbde4093..543f05e4ee7c 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3981,7 +3981,7 @@ CountAttributedType::CountAttributedType(
   CountAttributedTypeBits.NumCoupledDecls = CoupledDecls.size();
   CountAttributedTypeBits.CountInBytes = CountInBytes;
   CountAttributedTypeBits.OrNull = OrNull;
-  auto *DeclSlot = getTrailingObjects<TypeCoupledDeclRefInfo>();
+  auto *DeclSlot = getTrailingObjects();
   Decls = llvm::ArrayRef(DeclSlot, CoupledDecls.size());
   for (unsigned i = 0; i != CoupledDecls.size(); ++i)
     DeclSlot[i] = CoupledDecls[i];

From c0cc81cdc03c97473ba771bbc3a2330bd22396bc Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Thu, 19 Jun 2025 09:25:32 -0700
Subject: [PATCH 0958/1322] [NFC][Clang] Adopt simplified `getTrailingObjects`
 in ASTReader (#144438)

---
 clang/lib/Serialization/ASTReaderDecl.cpp | 24 +++++++++++------------
 clang/lib/Serialization/ASTReaderStmt.cpp | 19 +++++++++---------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 259c772e4222..7f7882654b9d 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -664,22 +664,21 @@ void ASTDeclReader::VisitPragmaCommentDecl(PragmaCommentDecl *D) {
   D->setLocation(readSourceLocation());
   D->CommentKind = (PragmaMSCommentKind)Record.readInt();
   std::string Arg = readString();
-  memcpy(D->getTrailingObjects<char>(), Arg.data(), Arg.size());
-  D->getTrailingObjects<char>()[Arg.size()] = '\0';
+  memcpy(D->getTrailingObjects(), Arg.data(), Arg.size());
+  D->getTrailingObjects()[Arg.size()] = '\0';
 }
 
 void ASTDeclReader::VisitPragmaDetectMismatchDecl(PragmaDetectMismatchDecl *D) {
   VisitDecl(D);
   D->setLocation(readSourceLocation());
   std::string Name = readString();
-  memcpy(D->getTrailingObjects<char>(), Name.data(), Name.size());
-  D->getTrailingObjects<char>()[Name.size()] = '\0';
+  memcpy(D->getTrailingObjects(), Name.data(), Name.size());
+  D->getTrailingObjects()[Name.size()] = '\0';
 
   D->ValueStart = Name.size() + 1;
   std::string Value = readString();
-  memcpy(D->getTrailingObjects<char>() + D->ValueStart, Value.data(),
-         Value.size());
-  D->getTrailingObjects<char>()[D->ValueStart + Value.size()] = '\0';
+  memcpy(D->getTrailingObjects() + D->ValueStart, Value.data(), Value.size());
+  D->getTrailingObjects()[D->ValueStart + Value.size()] = '\0';
 }
 
 void ASTDeclReader::VisitTranslationUnitDecl(TranslationUnitDecl *TU) {
@@ -1746,7 +1745,7 @@ void ASTDeclReader::VisitParmVarDecl(ParmVarDecl *PD) {
 
 void ASTDeclReader::VisitDecompositionDecl(DecompositionDecl *DD) {
   VisitVarDecl(DD);
-  auto **BDs = DD->getTrailingObjects<BindingDecl *>();
+  auto **BDs = DD->getTrailingObjects();
   for (unsigned I = 0; I != DD->NumBindings; ++I) {
     BDs[I] = readDeclAs<BindingDecl>();
     BDs[I]->setDecomposedDecl(DD);
@@ -1917,7 +1916,7 @@ void ASTDeclReader::VisitUsingEnumDecl(UsingEnumDecl *D) {
 void ASTDeclReader::VisitUsingPackDecl(UsingPackDecl *D) {
   VisitNamedDecl(D);
   D->InstantiatedFrom = readDeclAs<NamedDecl>();
-  auto **Expansions = D->getTrailingObjects<NamedDecl *>();
+  auto **Expansions = D->getTrailingObjects();
   for (unsigned I = 0; I != D->NumExpansions; ++I)
     Expansions[I] = readDeclAs<NamedDecl>();
   mergeMergeable(D);
@@ -2358,7 +2357,7 @@ void ASTDeclReader::VisitImportDecl(ImportDecl *D) {
   VisitDecl(D);
   D->ImportedModule = readModule();
   D->setImportComplete(Record.readInt());
-  auto *StoredLocs = D->getTrailingObjects<SourceLocation>();
+  auto *StoredLocs = D->getTrailingObjects();
   for (unsigned I = 0, N = Record.back(); I != N; ++I)
     StoredLocs[I] = readSourceLocation();
   Record.skipInts(1); // The number of stored source locations.
@@ -2376,8 +2375,7 @@ void ASTDeclReader::VisitFriendDecl(FriendDecl *D) {
   else
     D->Friend = readTypeSourceInfo();
   for (unsigned i = 0; i != D->NumTPLists; ++i)
-    D->getTrailingObjects<TemplateParameterList *>()[i] =
-        Record.readTemplateParameterList();
+    D->getTrailingObjects()[i] = Record.readTemplateParameterList();
   D->NextFriend = readDeclID().getRawValue();
   D->UnsupportedFriend = (Record.readInt() != 0);
   D->FriendLoc = readSourceLocation();
@@ -2745,7 +2743,7 @@ void ASTDeclReader::VisitTemplateTemplateParmDecl(TemplateTemplateParmDecl *D) {
   D->setDepth(Record.readInt());
   D->setPosition(Record.readInt());
   if (D->isExpandedParameterPack()) {
-    auto **Data = D->getTrailingObjects<TemplateParameterList *>();
+    auto **Data = D->getTrailingObjects();
     for (unsigned I = 0, N = D->getNumExpansionTemplateParameters();
          I != N; ++I)
       Data[I] = Record.readTemplateParameterList();
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 44cfb83ad2db..8945407cf666 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -719,7 +719,7 @@ void ASTStmtReader::VisitParenListExpr(ParenListExpr *E) {
   unsigned NumExprs = Record.readInt();
   assert((NumExprs == E->getNumExprs()) && "Wrong NumExprs!");
   for (unsigned I = 0; I != NumExprs; ++I)
-    E->getTrailingObjects<Stmt *>()[I] = Record.readSubStmt();
+    E->getTrailingObjects()[I] = Record.readSubStmt();
   E->LParenLoc = readSourceLocation();
   E->RParenLoc = readSourceLocation();
 }
@@ -1892,7 +1892,7 @@ void ASTStmtReader::VisitCXXDefaultArgExpr(CXXDefaultArgExpr *E) {
   E->CXXDefaultArgExprBits.Loc = readSourceLocation();
   E->CXXDefaultArgExprBits.HasRewrittenInit = Record.readInt();
   if (E->CXXDefaultArgExprBits.HasRewrittenInit)
-    *E->getTrailingObjects<Expr *>() = Record.readSubExpr();
+    *E->getTrailingObjects() = Record.readSubExpr();
 }
 
 void ASTStmtReader::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *E) {
@@ -1902,7 +1902,7 @@ void ASTStmtReader::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *E) {
   E->UsedContext = readDeclAs<DeclContext>();
   E->CXXDefaultInitExprBits.Loc = readSourceLocation();
   if (E->CXXDefaultInitExprBits.HasRewrittenInit)
-    *E->getTrailingObjects<Expr *>() = Record.readSubExpr();
+    *E->getTrailingObjects() = Record.readSubExpr();
 }
 
 void ASTStmtReader::VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E) {
@@ -1999,7 +1999,7 @@ void ASTStmtReader::VisitExprWithCleanups(ExprWithCleanups *E) {
       Obj = cast<CompoundLiteralExpr>(Record.readSubExpr());
     else
       llvm_unreachable("unexpected cleanup object type");
-    E->getTrailingObjects<ExprWithCleanups::CleanupObject>()[i] = Obj;
+    E->getTrailingObjects()[i] = Obj;
   }
 
   E->ExprWithCleanupsBits.CleanupsHaveSideEffects = Record.readInt();
@@ -2198,9 +2198,8 @@ void ASTStmtReader::VisitSizeOfPackExpr(SizeOfPackExpr *E) {
   E->Pack = Record.readDeclAs<NamedDecl>();
   if (E->isPartiallySubstituted()) {
     assert(E->Length == NumPartialArgs);
-    for (auto *I = E->getTrailingObjects<TemplateArgument>(),
-              *E = I + NumPartialArgs;
-         I != E; ++I)
+    for (auto *I = E->getTrailingObjects(), *E = I + NumPartialArgs; I != E;
+         ++I)
       new (I) TemplateArgument(Record.readTemplateArgument());
   } else if (!E->isValueDependent()) {
     E->Length = Record.readInt();
@@ -2215,7 +2214,7 @@ void ASTStmtReader::VisitPackIndexingExpr(PackIndexingExpr *E) {
   E->RSquareLoc = readSourceLocation();
   E->SubExprs[0] = Record.readStmt();
   E->SubExprs[1] = Record.readStmt();
-  auto **Exprs = E->getTrailingObjects<Expr *>();
+  auto **Exprs = E->getTrailingObjects();
   for (unsigned I = 0; I < E->PackIndexingExprBits.TransformedExpressions; ++I)
     Exprs[I] = Record.readExpr();
 }
@@ -2252,7 +2251,7 @@ void ASTStmtReader::VisitFunctionParmPackExpr(FunctionParmPackExpr *E) {
   E->NumParameters = Record.readInt();
   E->ParamPack = readDeclAs<ValueDecl>();
   E->NameLoc = readSourceLocation();
-  auto **Parms = E->getTrailingObjects<ValueDecl *>();
+  auto **Parms = E->getTrailingObjects();
   for (unsigned i = 0, n = E->NumParameters; i != n; ++i)
     Parms[i] = readDeclAs<ValueDecl>();
 }
@@ -2289,7 +2288,7 @@ void ASTStmtReader::VisitCXXParenListInitExpr(CXXParenListInitExpr *E) {
   E->LParenLoc = readSourceLocation();
   E->RParenLoc = readSourceLocation();
   for (unsigned I = 0; I < ExpectedNumExprs; I++)
-    E->getTrailingObjects<Expr *>()[I] = Record.readSubExpr();
+    E->getTrailingObjects()[I] = Record.readSubExpr();
 
   bool HasArrayFillerOrUnionDecl = Record.readBool();
   if (HasArrayFillerOrUnionDecl) {

From 3fe62682ef9ca514b899d0cecaebb8f1fd97baef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Thu, 19 Jun 2025 17:34:08 +0100
Subject: [PATCH 0959/1322] [mlir][vector] Use `result` consistently as the
 result argument name (#144739)

This patch updates the following ops to use `result` (instead of `res`)
as the name for their result argument:
  * `vector.scalable.insert`
  * `vector.scalable.extract`
  * `vector.insert_strided_slice`

This change ensures naming consistency with other ops in the `vector`
dialect. It addresses part of:
* https://github.com/llvm/llvm-project/issues/131602
---
 .../mlir/Dialect/Vector/IR/VectorOps.td       | 28 +++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 125cd4645ccc..85cc22ab3964 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -1008,7 +1008,7 @@ def Vector_InsertOp :
 def Vector_ScalableInsertOp :
   Vector_Op<"scalable.insert", [Pure,
        AllElementTypesMatch<["valueToStore", "dest"]>,
-       AllTypesMatch<["dest", "res"]>,
+       AllTypesMatch<["dest", "result"]>,
        PredOpTrait<"position is a multiple of the source length.",
         CPred<
           "(getPos() % getSourceVectorType().getNumElements()) == 0"
@@ -1016,7 +1016,7 @@ def Vector_ScalableInsertOp :
      Arguments<(ins VectorOfRank<[1]>:$valueToStore,
                     ScalableVectorOfRank<[1]>:$dest,
                     I64Attr:$pos)>,
-     Results<(outs ScalableVectorOfRank<[1]>:$res)> {
+     Results<(outs ScalableVectorOfRank<[1]>:$result)> {
   let summary = "insert subvector into scalable vector operation";
   // NOTE: This operation is designed to map to `llvm.vector.insert`, and its
   //       documentation should be kept aligned with LLVM IR:
@@ -1059,19 +1059,23 @@ def Vector_ScalableInsertOp :
     VectorType getDestVectorType() {
       return ::llvm::cast<VectorType>(getDest().getType());
     }
+    /// Wrapper for getResult, which replaced getRes.
+    [[deprecated("Use getResult instead!")]] ::mlir::Value getRes() {
+      return getResult();
+    }
   }];
 }
 
 def Vector_ScalableExtractOp :
   Vector_Op<"scalable.extract", [Pure,
-       AllElementTypesMatch<["source", "res"]>,
+       AllElementTypesMatch<["source", "result"]>,
        PredOpTrait<"position is a multiple of the result length.",
         CPred<
           "(getPos() % getResultVectorType().getNumElements()) == 0"
         >>]>,
      Arguments<(ins ScalableVectorOfRank<[1]>:$source,
                     I64Attr:$pos)>,
-     Results<(outs VectorOfRank<[1]>:$res)> {
+     Results<(outs VectorOfRank<[1]>:$result)> {
   let summary = "extract subvector from scalable vector operation";
   // NOTE: This operation is designed to map to `llvm.vector.extract`, and its
   //       documentation should be kept aligned with LLVM IR:
@@ -1100,7 +1104,7 @@ def Vector_ScalableExtractOp :
   }];
 
   let assemblyFormat = [{
-    $source `[` $pos `]` attr-dict `:` type($res) `from` type($source)
+    $source `[` $pos `]` attr-dict `:` type($result) `from` type($source)
   }];
 
   let extraClassDeclaration = extraPoisonClassDeclaration # [{
@@ -1108,7 +1112,11 @@ def Vector_ScalableExtractOp :
       return ::llvm::cast<VectorType>(getSource().getType());
     }
     VectorType getResultVectorType() {
-      return ::llvm::cast<VectorType>(getRes().getType());
+      return ::llvm::cast<VectorType>(getResult().getType());
+    }
+    /// Wrapper for getResult, which replaced getRes.
+    [[deprecated("Use getResult instead!")]] ::mlir::Value getRes() {
+      return getResult();
     }
   }];
 }
@@ -1117,10 +1125,10 @@ def Vector_InsertStridedSliceOp :
   Vector_Op<"insert_strided_slice", [Pure,
     PredOpTrait<"operand #0 and result have same element type",
                  TCresVTEtIsSameAsOpBase<0, 0>>,
-    AllTypesMatch<["dest", "res"]>]>,
+    AllTypesMatch<["dest", "result"]>]>,
     Arguments<(ins AnyVectorOfNonZeroRank:$valueToStore, AnyVectorOfNonZeroRank:$dest, I64ArrayAttr:$offsets,
                I64ArrayAttr:$strides)>,
-    Results<(outs AnyVectorOfNonZeroRank:$res)> {
+    Results<(outs AnyVectorOfNonZeroRank:$result)> {
   let summary = "strided_slice operation";
   let description = [{
     Takes a k-D valueToStore vector, an n-D destination vector (n >= k), n-sized
@@ -1164,6 +1172,10 @@ def Vector_InsertStridedSliceOp :
         return ::llvm::cast<IntegerAttr>(attr).getInt() != 1;
       });
     }
+    /// Wrapper for getResult, which replaced getRes.
+    [[deprecated("Use getResult instead!")]] ::mlir::Value getRes() {
+      return getResult();
+    }
   }];
 
   let hasFolder = 1;

From 0816bb32ac37b24d2f895f0c0464b7659fffd4fc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 19 Jun 2025 17:49:33 +0100
Subject: [PATCH 0960/1322] [Matrix] Fix heap-use-after-free after
 0fa373c77ded203eddb.

We need to skip instructions in FusedInsts, as they may have been
deleted. Fixes a heap-use-after-free after #141681.
---
 llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index fa9e44617b7c..ccb68700747b 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1140,6 +1140,9 @@ public:
     // Fourth, pre-process all the PHINode's. The incoming values will be
     // assigned later in VisitPHI.
     for (Instruction *Inst : MatrixInsts) {
+      if (FusedInsts.count(Inst))
+        continue;
+
       auto *PHI = dyn_cast<PHINode>(Inst);
       if (!PHI)
         continue;

From 6ce86538c11b3ef93a2a8df3bd4f817a724f42bd Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Thu, 19 Jun 2025 10:09:10 -0700
Subject: [PATCH 0961/1322] [mlir][cf] Preserve branch weights during
 cf.cond_br canonicalization. (#144822)

---
 .../Dialect/ControlFlow/IR/ControlFlowOps.td  | 20 ++++++++++-----
 .../Dialect/ControlFlow/IR/ControlFlowOps.cpp |  6 ++---
 .../Dialect/ControlFlow/canonicalize.mlir     | 25 +++++++++++++++++++
 3 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
index 79da81ba049d..a441fd82546e 100644
--- a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
+++ b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
@@ -153,17 +153,25 @@ def CondBranchOp
   let builders = [OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
                                 "ValueRange":$trueOperands,
                                 "Block *":$falseDest,
-                                "ValueRange":$falseOperands),
+                                "ValueRange":$falseOperands,
+                                CArg<"ArrayRef<int32_t>", "{}">:$branchWeights),
                             [{
-      build($_builder, $_state, condition, trueOperands, falseOperands, /*branch_weights=*/{}, trueDest,
-            falseDest);
+      DenseI32ArrayAttr weights;
+      if (!branchWeights.empty())
+        weights = $_builder.getDenseI32ArrayAttr(branchWeights);
+      build($_builder, $_state, condition, trueOperands, falseOperands,
+            weights, trueDest, falseDest);
     }]>,
                   OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
                                 "Block *":$falseDest,
-                                CArg<"ValueRange", "{}">:$falseOperands),
+                                CArg<"ValueRange", "{}">:$falseOperands,
+                                CArg<"ArrayRef<int32_t>", "{}">:$branchWeights),
                             [{
-      build($_builder, $_state, condition, trueDest, ValueRange(), falseDest,
-            falseOperands);
+      DenseI32ArrayAttr weights;
+      if (!branchWeights.empty())
+        weights = $_builder.getDenseI32ArrayAttr(branchWeights);
+      build($_builder, $_state, condition, ValueRange(), falseOperands,
+            weights, trueDest, falseDest);
     }]>];
 
   let extraClassDeclaration = [{
diff --git a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
index edd7f607f24f..0c11c76cf1f7 100644
--- a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
+++ b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
@@ -265,9 +265,9 @@ struct SimplifyPassThroughCondBranch : public OpRewritePattern<CondBranchOp> {
       return failure();
 
     // Create a new branch with the collapsed successors.
-    rewriter.replaceOpWithNewOp<CondBranchOp>(condbr, condbr.getCondition(),
-                                              trueDest, trueDestOperands,
-                                              falseDest, falseDestOperands);
+    rewriter.replaceOpWithNewOp<CondBranchOp>(
+        condbr, condbr.getCondition(), trueDest, trueDestOperands, falseDest,
+        falseDestOperands, condbr.getWeights());
     return success();
   }
 };
diff --git a/mlir/test/Dialect/ControlFlow/canonicalize.mlir b/mlir/test/Dialect/ControlFlow/canonicalize.mlir
index 0ad6898fce86..bf69935a00bf 100644
--- a/mlir/test/Dialect/ControlFlow/canonicalize.mlir
+++ b/mlir/test/Dialect/ControlFlow/canonicalize.mlir
@@ -102,6 +102,31 @@ func.func @cond_br_and_br_folding(%a : i32) {
 
 /// Test that pass-through successors of CondBranchOp get folded.
 
+// Test that the weights are preserved:
+// CHECK-LABEL:   func.func @cond_br_passthrough_weights(
+// CHECK-SAME:      %[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i1) -> i32 {
+func.func @cond_br_passthrough_weights(%arg0 : i32, %arg1 : i32, %cond : i1) -> i32 {
+// CHECK:           cf.cond_br %[[ARG2]] weights([30, 70]), ^bb1, ^bb2
+// CHECK:         ^bb1:
+// CHECK:           return %[[ARG0]] : i32
+// CHECK:         ^bb2:
+// CHECK:           return %[[ARG1]] : i32
+// CHECK:         }
+  cf.cond_br %cond weights([30,70]), ^bb1, ^bb3
+
+^bb1:
+  cf.br ^bb2
+
+^bb3:
+  cf.br ^bb4
+
+^bb2:
+  return %arg0 : i32
+
+^bb4:
+  return %arg1 : i32
+}
+
 // CHECK-LABEL: func @cond_br_passthrough(
 // CHECK-SAME: %[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32, %[[COND:.*]]: i1
 func.func @cond_br_passthrough(%arg0 : i32, %arg1 : i32, %arg2 : i32, %cond : i1) -> (i32, i32) {

From c0c71463f6bca05eb4540b68cdcbd17c916562c9 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Thu, 19 Jun 2025 10:13:58 -0700
Subject: [PATCH 0962/1322] [InstCombine] Optimize
 sub(sext(add(x,y)),sext(add(x,z))). (#144174)

This pattern can be often met in Flang generated LLVM IR,
for example, for the counts of the loops generated for array
expressions like: `a(x:x+y)` or `a(x+z:x+z)` or their variations.

In order to compute the loop count, Flang needs to subtract
the lower bound of the array slice from the upper bound
of the array slice. To avoid the sign wraps, it sign extends
the original values (that may be of any user data type)
to `i64`.

This peephole is really helpful in CPU2017/548.exchange2,
where we have multiple following statements like this:
```
block(row+1:row+2, 7:9, i7) = block(row+1:row+2, 7:9, i7) - 10
```

While this is just a 2x3 iterations loop nest, LLVM cannot
figure it out, ending up vectorizing the inner loop really
hard (with a vector epilog and scalar remainder). This, in turn,
causes problems for LSR that ends up creating too many loop-carried
values in the loop containing the above statement, which are then
causing too many spills/reloads.

Alive2: https://alive2.llvm.org/ce/z/gLgfYX

Related to #143219.
---
 llvm/include/llvm/IR/PatternMatch.h           |   8 +
 .../InstCombine/InstCombineAddSub.cpp         |  51 ++-
 .../InstCombine/InstCombineMulDivRem.cpp      |   6 +-
 .../InstCombine/InstCombineNegator.cpp        |   8 +-
 .../Transforms/InstCombine/sub-sext-add.ll    | 350 ++++++++++++++++++
 5 files changed, 413 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/sub-sext-add.ll

diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 2eaa7d0faabc..1f86cdfd94e1 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1323,6 +1323,14 @@ m_NSWAdd(const LHS &L, const RHS &R) {
                                                                             R);
 }
 template <typename LHS, typename RHS>
+inline OverflowingBinaryOp_match<LHS, RHS, Instruction::Add,
+                                 OverflowingBinaryOperator::NoSignedWrap, true>
+m_c_NSWAdd(const LHS &L, const RHS &R) {
+  return OverflowingBinaryOp_match<LHS, RHS, Instruction::Add,
+                                   OverflowingBinaryOperator::NoSignedWrap,
+                                   true>(L, R);
+}
+template <typename LHS, typename RHS>
 inline OverflowingBinaryOp_match<LHS, RHS, Instruction::Sub,
                                  OverflowingBinaryOperator::NoSignedWrap>
 m_NSWSub(const LHS &L, const RHS &R) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 0a3837f2c0ce..418302d1edb3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1896,7 +1896,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
                                           {Sub, Builder.getFalse()});
     Value *Ret = Builder.CreateSub(
         ConstantInt::get(A->getType(), A->getType()->getScalarSizeInBits()),
-        Ctlz, "", /*HasNUW*/ true, /*HasNSW*/ true);
+        Ctlz, "", /*HasNUW=*/true, /*HasNSW=*/true);
     return replaceInstUsesWith(I, Builder.CreateZExtOrTrunc(Ret, I.getType()));
   }
 
@@ -2363,8 +2363,8 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
     OverflowingBinaryOperator *LHSSub = cast<OverflowingBinaryOperator>(Op0);
     bool HasNUW = I.hasNoUnsignedWrap() && LHSSub->hasNoUnsignedWrap();
     bool HasNSW = HasNUW && I.hasNoSignedWrap() && LHSSub->hasNoSignedWrap();
-    Value *Add = Builder.CreateAdd(Y, Op1, "", /* HasNUW */ HasNUW,
-                                   /* HasNSW */ HasNSW);
+    Value *Add = Builder.CreateAdd(Y, Op1, "", /*HasNUW=*/HasNUW,
+                                   /*HasNSW=*/HasNSW);
     BinaryOperator *Sub = BinaryOperator::CreateSub(X, Add);
     Sub->setHasNoUnsignedWrap(HasNUW);
     Sub->setHasNoSignedWrap(HasNSW);
@@ -2835,6 +2835,51 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
   if (Instruction *Res = foldBinOpOfSelectAndCastOfSelectCondition(I))
     return Res;
 
+  // (sub (sext (add nsw (X, Y)), sext (X))) --> (sext (Y))
+  if (match(Op1, m_SExtLike(m_Value(X))) &&
+      match(Op0, m_SExtLike(m_c_NSWAdd(m_Specific(X), m_Value(Y))))) {
+    Value *SExtY = Builder.CreateSExt(Y, I.getType());
+    return replaceInstUsesWith(I, SExtY);
+  }
+
+  // (sub[ nsw] (sext (add nsw (X, Y)), sext (add nsw (X, Z)))) -->
+  // --> (sub[ nsw] (sext (Y), sext (Z)))
+  {
+    Value *Z, *Add0, *Add1;
+    if (match(Op0, m_SExtLike(m_Value(Add0))) &&
+        match(Op1, m_SExtLike(m_Value(Add1))) &&
+        ((match(Add0, m_NSWAdd(m_Value(X), m_Value(Y))) &&
+          match(Add1, m_c_NSWAdd(m_Specific(X), m_Value(Z)))) ||
+         (match(Add0, m_NSWAdd(m_Value(Y), m_Value(X))) &&
+          match(Add1, m_c_NSWAdd(m_Specific(X), m_Value(Z)))))) {
+      unsigned NumOfNewInstrs = 0;
+      // Non-constant Y, Z require new SExt.
+      NumOfNewInstrs += !isa<Constant>(Y) ? 1 : 0;
+      NumOfNewInstrs += !isa<Constant>(Z) ? 1 : 0;
+      // Check if we can trade some of the old instructions for the new ones.
+      unsigned NumOfDeadInstrs = 0;
+      if (Op0->hasOneUse()) {
+        // If Op0 (sext) has multiple uses, then we keep it
+        // and the add that it uses, otherwise, we can remove
+        // the sext and probably the add (depending on the number of its uses).
+        ++NumOfDeadInstrs;
+        NumOfDeadInstrs += Add0->hasOneUse() ? 1 : 0;
+      }
+      if (Op1->hasOneUse()) {
+        ++NumOfDeadInstrs;
+        NumOfDeadInstrs += Add1->hasOneUse() ? 1 : 0;
+      }
+      if (NumOfDeadInstrs >= NumOfNewInstrs) {
+        Value *SExtY = Builder.CreateSExt(Y, I.getType());
+        Value *SExtZ = Builder.CreateSExt(Z, I.getType());
+        Value *Sub = Builder.CreateSub(SExtY, SExtZ, "",
+                                       /*HasNUW=*/false,
+                                       /*HasNSW=*/I.hasNoSignedWrap());
+        return replaceInstUsesWith(I, Sub);
+      }
+    }
+  }
+
   return TryToNarrowDeduceFlags();
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index fcf4613b5d13..d7310b1c741c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -290,7 +290,7 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
       auto *Op1C = cast<Constant>(Op1);
       return replaceInstUsesWith(
           I, Builder.CreateMul(NegOp0, ConstantExpr::getNeg(Op1C), "",
-                               /* HasNUW */ false,
+                               /*HasNUW=*/false,
                                HasNSW && Op1C->isNotMinSignedValue()));
     }
 
@@ -1255,8 +1255,8 @@ static Value *foldIDivShl(BinaryOperator &I, InstCombiner::BuilderTy &Builder) {
       // or divisor has nsw and operator is sdiv.
       Value *Dividend = Builder.CreateShl(
           One, Y, "shl.dividend",
-          /*HasNUW*/ true,
-          /*HasNSW*/
+          /*HasNUW=*/true,
+          /*HasNSW=*/
           IsSigned ? (Shl0->hasNoUnsignedWrap() || Shl1->hasNoUnsignedWrap())
                    : Shl0->hasNoSignedWrap());
       return Builder.CreateLShr(Dividend, Z, "", I.isExact());
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
index 2210336d92bf..b0a0bcfbde19 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
@@ -233,7 +233,7 @@ std::array<Value *, 2> Negator::getSortedOperandsOfBinOp(Instruction *I) {
     // However, only do this either if the old `sub` doesn't stick around, or
     // it was subtracting from a constant. Otherwise, this isn't profitable.
     return Builder.CreateSub(I->getOperand(1), I->getOperand(0),
-                             I->getName() + ".neg", /* HasNUW */ false,
+                             I->getName() + ".neg", /*HasNUW=*/false,
                              IsNSW && I->hasNoSignedWrap());
   }
 
@@ -404,7 +404,7 @@ std::array<Value *, 2> Negator::getSortedOperandsOfBinOp(Instruction *I) {
     IsNSW &= I->hasNoSignedWrap();
     if (Value *NegOp0 = negate(I->getOperand(0), IsNSW, Depth + 1))
       return Builder.CreateShl(NegOp0, I->getOperand(1), I->getName() + ".neg",
-                               /* HasNUW */ false, IsNSW);
+                               /*HasNUW=*/false, IsNSW);
     // Otherwise, `shl %x, C` can be interpreted as `mul %x, 1<<C`.
     Constant *Op1C;
     if (!match(I->getOperand(1), m_ImmConstant(Op1C)) || !IsTrulyNegation)
@@ -412,7 +412,7 @@ std::array<Value *, 2> Negator::getSortedOperandsOfBinOp(Instruction *I) {
     return Builder.CreateMul(
         I->getOperand(0),
         Builder.CreateShl(Constant::getAllOnesValue(Op1C->getType()), Op1C),
-        I->getName() + ".neg", /* HasNUW */ false, IsNSW);
+        I->getName() + ".neg", /*HasNUW=*/false, IsNSW);
   }
   case Instruction::Or: {
     if (!cast<PossiblyDisjointInst>(I)->isDisjoint())
@@ -483,7 +483,7 @@ std::array<Value *, 2> Negator::getSortedOperandsOfBinOp(Instruction *I) {
       // Can't negate either of them.
       return nullptr;
     return Builder.CreateMul(NegatedOp, OtherOp, I->getName() + ".neg",
-                             /* HasNUW */ false, IsNSW && I->hasNoSignedWrap());
+                             /*HasNUW=*/false, IsNSW && I->hasNoSignedWrap());
   }
   default:
     return nullptr; // Don't know, likely not negatible for free.
diff --git a/llvm/test/Transforms/InstCombine/sub-sext-add.ll b/llvm/test/Transforms/InstCombine/sub-sext-add.ll
new file mode 100644
index 000000000000..71e91f954420
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/sub-sext-add.ll
@@ -0,0 +1,350 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i64 @src_2add_2sext_sub_1(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z:%.*]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_2(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z:%.*]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %z, %x
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_3(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z:%.*]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %y, %x
+  %add2 = add nsw i32 %z, %x
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_4(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z:%.*]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %y, %x
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sextlike_sub(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sextlike_sub(
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext i32 [[Z:%.*]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SEXT1]], [[SEXT2]]
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = zext nneg i32 %add1 to i64
+  %sext2 = zext nneg i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_nsw(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_nsw(
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext i32 [[Z:%.*]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SEXT1]], [[SEXT2]]
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub nsw i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_nuw(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_nuw(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z:%.*]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub nuw i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+declare void @use_i32(i32, i32)
+declare void @use_i64(i64, i64)
+
+define i64 @src_2add_2sext_sub_multiple_uses_1(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_multiple_uses_1(
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z:%.*]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    call void @use_i64(i64 [[SEXT1]], i64 [[SEXT1]])
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  call void @use_i64(i64 %sext1, i64 %sext1)
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_multiple_uses_2(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_multiple_uses_2(
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext i32 [[ADD2]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    call void @use_i64(i64 [[SEXT2]], i64 [[SEXT2]])
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  call void @use_i64(i64 %sext2, i64 %sext2)
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_multiple_uses_3(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_multiple_uses_3(
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext i32 [[ADD2]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SEXT1]], [[SEXT2]]
+; CHECK-NEXT:    call void @use_i64(i64 [[SEXT1]], i64 [[SEXT2]])
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  call void @use_i64(i64 %sext1, i64 %sext2)
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_multiple_uses_4(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_multiple_uses_4(
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    call void @use_i32(i32 [[ADD1]], i32 [[ADD2]])
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  call void @use_i32(i32 %add1, i32 %add2)
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_multiple_uses_5(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_multiple_uses_5(
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z:%.*]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    call void @use_i32(i32 [[ADD1]], i32 [[ADD1]])
+; CHECK-NEXT:    call void @use_i64(i64 [[SEXT1]], i64 [[SEXT1]])
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  call void @use_i32(i32 %add1, i32 %add1)
+  call void @use_i64(i64 %sext1, i64 %sext1)
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_multiple_uses_6(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_multiple_uses_6(
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext i32 [[ADD2]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[Z]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    call void @use_i32(i32 [[ADD2]], i32 [[ADD2]])
+; CHECK-NEXT:    call void @use_i64(i64 [[SEXT2]], i64 [[SEXT2]])
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  call void @use_i32(i32 %add2, i32 %add2)
+  call void @use_i64(i64 %sext2, i64 %sext2)
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_multiple_uses_7(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @src_2add_2sext_sub_multiple_uses_7(
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext i32 [[ADD2]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SEXT1]], [[SEXT2]]
+; CHECK-NEXT:    call void @use_i32(i32 [[ADD1]], i32 [[ADD2]])
+; CHECK-NEXT:    call void @use_i64(i64 [[SEXT1]], i64 [[SEXT2]])
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, %z
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  call void @use_i32(i32 %add1, i32 %add2)
+  call void @use_i64(i64 %sext1, i64 %sext2)
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_multiple_uses_8(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_2add_2sext_sub_multiple_uses_8(
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[X]], 1
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext i32 [[ADD2]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i64 [[SEXT1]], [[SEXT2]]
+; CHECK-NEXT:    call void @use_i32(i32 [[ADD1]], i32 [[ADD2]])
+; CHECK-NEXT:    call void @use_i64(i64 [[SEXT1]], i64 [[SEXT2]])
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %add2 = add nsw i32 %x, 1
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  call void @use_i32(i32 %add1, i32 %add2)
+  call void @use_i64(i64 %sext1, i64 %sext2)
+  ret i64 %sub
+}
+
+define i64 @src_2add_2sext_sub_multiple_uses_9(i32 %x) {
+; CHECK-LABEL: @src_2add_2sext_sub_multiple_uses_9(
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[X]], 1
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext i32 [[ADD2]] to i64
+; CHECK-NEXT:    call void @use_i32(i32 [[ADD1]], i32 [[ADD2]])
+; CHECK-NEXT:    call void @use_i64(i64 [[SEXT1]], i64 [[SEXT2]])
+; CHECK-NEXT:    ret i64 1
+;
+  %add1 = add nsw i32 %x, 2
+  %add2 = add nsw i32 %x, 1
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %add2 to i64
+  %sub = sub i64 %sext1, %sext2
+  call void @use_i32(i32 %add1, i32 %add2)
+  call void @use_i64(i64 %sext1, i64 %sext2)
+  ret i64 %sub
+}
+
+define i64 @src_x_add_2sext_sub_1(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_x_add_2sext_sub_1(
+; CHECK-NEXT:    [[SUB:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %x to i64
+  %sub = sub i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_x_add_2sext_sub_2(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_x_add_2sext_sub_2(
+; CHECK-NEXT:    [[SUB:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %y, %x
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %x to i64
+  %sub = sub i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_x_add_2sextlike_sub(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_x_add_2sextlike_sub(
+; CHECK-NEXT:    [[SUB:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %sext1 = zext nneg i32 %add1 to i64
+  %sext2 = zext nneg i32 %x to i64
+  %sub = sub i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_x_add_2sext_sub_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_x_add_2sext_sub_nsw(
+; CHECK-NEXT:    [[SUB:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %x to i64
+  %sub = sub nsw i64 %sext1, %sext2
+  ret i64 %sub
+}
+
+define i64 @src_x_add_2sext_sub_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_x_add_2sext_sub_nuw(
+; CHECK-NEXT:    [[SUB:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add1 = add nsw i32 %x, %y
+  %sext1 = sext i32 %add1 to i64
+  %sext2 = sext i32 %x to i64
+  %sub = sub nuw i64 %sext1, %sext2
+  ret i64 %sub
+}

From 0b8179b2adbc821324c425d7cafd269f84e72d5e Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik@intel.com>
Date: Thu, 19 Jun 2025 10:35:40 -0700
Subject: [PATCH 0963/1322] [ValueTracking] Improve `Bitcast` handling to match
 SDAG (#125935)

Closes #125228
---
 llvm/lib/Analysis/ValueTracking.cpp           | 27 +++++++++++++++++--
 .../InstCombine/X86/x86-vector-shifts.ll      |  4 +--
 .../InstCombine/bitcast-known-bits.ll         | 21 +++++----------
 3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index a17417cb5189..73320b556f82 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1346,6 +1346,8 @@ static void computeKnownBitsFromOperator(const Operator *I,
         isa<ScalableVectorType>(I->getType()))
       break;
 
+    unsigned NumElts = DemandedElts.getBitWidth();
+    bool IsLE = Q.DL.isLittleEndian();
     // Look through a cast from narrow vector elements to wider type.
     // Examples: v4i32 -> v2i64, v3i8 -> v24
     unsigned SubBitWidth = SrcVecTy->getScalarSizeInBits();
@@ -1364,7 +1366,6 @@ static void computeKnownBitsFromOperator(const Operator *I,
       //
       // The known bits of each sub-element are then inserted into place
       // (dependent on endian) to form the full result of known bits.
-      unsigned NumElts = DemandedElts.getBitWidth();
       unsigned SubScale = BitWidth / SubBitWidth;
       APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
       for (unsigned i = 0; i != NumElts; ++i) {
@@ -1376,10 +1377,32 @@ static void computeKnownBitsFromOperator(const Operator *I,
       for (unsigned i = 0; i != SubScale; ++i) {
         computeKnownBits(I->getOperand(0), SubDemandedElts.shl(i), KnownSrc, Q,
                          Depth + 1);
-        unsigned ShiftElt = Q.DL.isLittleEndian() ? i : SubScale - 1 - i;
+        unsigned ShiftElt = IsLE ? i : SubScale - 1 - i;
         Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
       }
     }
+    // Look through a cast from wider vector elements to narrow type.
+    // Examples: v2i64 -> v4i32
+    if (SubBitWidth % BitWidth == 0) {
+      unsigned SubScale = SubBitWidth / BitWidth;
+      KnownBits KnownSrc(SubBitWidth);
+      APInt SubDemandedElts =
+          APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
+      computeKnownBits(I->getOperand(0), SubDemandedElts, KnownSrc, Q,
+                       Depth + 1);
+
+      Known.Zero.setAllBits();
+      Known.One.setAllBits();
+      for (unsigned i = 0; i != SubScale; ++i) {
+        if (DemandedElts[i]) {
+          unsigned Shifts = IsLE ? i : NumElts - 1 - i;
+          unsigned Offset = (Shifts % SubScale) * BitWidth;
+          Known = Known.intersectWith(KnownSrc.extractBits(BitWidth, Offset));
+          if (Known.isUnknown())
+            break;
+        }
+      }
+    }
     break;
   }
   case Instruction::SExt: {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index db56080a3ea2..cc252ae53803 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -3732,7 +3732,6 @@ define <4 x i64> @test_avx2_psrl_0() {
   ret <4 x i64> %16
 }
 
-; FIXME: Failure to peek through bitcasts to ensure psllq shift amount is within bounds.
 define <2 x i64> @PR125228(<2 x i64> %v, <2 x i64> %s) {
 ; CHECK-LABEL: @PR125228(
 ; CHECK-NEXT:    [[MASK:%.*]] = and <2 x i64> [[S:%.*]], splat (i64 63)
@@ -3741,7 +3740,8 @@ define <2 x i64> @PR125228(<2 x i64> %v, <2 x i64> %s) {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[MASK]] to <16 x i8>
 ; CHECK-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[CAST3:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
-; CHECK-NEXT:    [[SLL1:%.*]] = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> [[V]], <2 x i64> [[CAST3]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[CAST3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SLL1:%.*]] = shl <2 x i64> [[V]], [[TMP2]]
 ; CHECK-NEXT:    [[SHUFP_UNCASTED:%.*]] = shufflevector <2 x i64> [[SLL0]], <2 x i64> [[SLL1]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    ret <2 x i64> [[SHUFP_UNCASTED]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll b/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
index 3e47e775e3a2..65b43df752f7 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
@@ -12,8 +12,7 @@ define <16 x i8> @knownbits_bitcast_masked_shift(<16 x i8> %arg1, <16 x i8> %arg
 ; CHECK-NEXT:    [[BITCAST4:%.*]] = bitcast <16 x i8> [[OR]] to <8 x i16>
 ; CHECK-NEXT:    [[SHL5:%.*]] = shl nuw <8 x i16> [[BITCAST4]], splat (i16 2)
 ; CHECK-NEXT:    [[BITCAST6:%.*]] = bitcast <8 x i16> [[SHL5]] to <16 x i8>
-; CHECK-NEXT:    [[AND7:%.*]] = and <16 x i8> [[BITCAST6]], splat (i8 -52)
-; CHECK-NEXT:    ret <16 x i8> [[AND7]]
+; CHECK-NEXT:    ret <16 x i8> [[BITCAST6]]
 ;
   %and = and <16 x i8> %arg1, splat (i8 3)
   %and3 = and <16 x i8> %arg2, splat (i8 48)
@@ -33,8 +32,7 @@ define <16 x i8> @knownbits_shuffle_masked_nibble_shift(<16 x i8> %arg)  {
 ; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <16 x i8> [[SHUFFLEVECTOR]] to <8 x i16>
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <8 x i16> [[BITCAST1]], splat (i16 4)
 ; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast <8 x i16> [[SHL]] to <16 x i8>
-; CHECK-NEXT:    [[AND3:%.*]] = and <16 x i8> [[BITCAST2]], splat (i8 -16)
-; CHECK-NEXT:    ret <16 x i8> [[AND3]]
+; CHECK-NEXT:    ret <16 x i8> [[BITCAST2]]
 ;
   %and = and <16 x i8> %arg, splat (i8 15)
   %shufflevector = shufflevector <16 x i8> %and, <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -53,8 +51,7 @@ define <16 x i8> @knownbits_reverse_shuffle_masked_shift(<16 x i8> %arg)  {
 ; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <16 x i8> [[SHUFFLEVECTOR]] to <8 x i16>
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <8 x i16> [[BITCAST1]], splat (i16 4)
 ; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast <8 x i16> [[SHL]] to <16 x i8>
-; CHECK-NEXT:    [[AND3:%.*]] = and <16 x i8> [[BITCAST2]], splat (i8 -16)
-; CHECK-NEXT:    ret <16 x i8> [[AND3]]
+; CHECK-NEXT:    ret <16 x i8> [[BITCAST2]]
 ;
   %and = and <16 x i8> %arg, splat (i8 15)
   %shufflevector = shufflevector <16 x i8> %and, <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -70,8 +67,7 @@ define <16 x i8> @knownbits_extract_bit(<8 x i16> %arg)  {
 ; CHECK-SAME: <8 x i16> [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[LSHR:%.*]] = lshr <8 x i16> [[ARG]], splat (i16 15)
 ; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <8 x i16> [[LSHR]] to <16 x i8>
-; CHECK-NEXT:    [[AND:%.*]] = and <16 x i8> [[BITCAST1]], splat (i8 1)
-; CHECK-NEXT:    ret <16 x i8> [[AND]]
+; CHECK-NEXT:    ret <16 x i8> [[BITCAST1]]
 ;
   %lshr = lshr <8 x i16> %arg, splat (i16 15)
   %bitcast1 = bitcast <8 x i16> %lshr to <16 x i8>
@@ -88,7 +84,8 @@ define { i32, i1 } @knownbits_popcount_add_with_overflow(<2 x i64> %arg1, <2 x i
 ; CHECK-NEXT:    [[CALL9:%.*]] = tail call range(i64 0, 65) <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[ARG2]])
 ; CHECK-NEXT:    [[BITCAST10:%.*]] = bitcast <2 x i64> [[CALL9]] to <4 x i32>
 ; CHECK-NEXT:    [[EXTRACTELEMENT11:%.*]] = extractelement <4 x i32> [[BITCAST10]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[EXTRACTELEMENT]], i32 [[EXTRACTELEMENT11]])
+; CHECK-NEXT:    [[CALL12:%.*]] = add nuw nsw i32 [[EXTRACTELEMENT]], [[EXTRACTELEMENT11]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { i32, i1 } { i32 poison, i1 false }, i32 [[CALL12]], 0
 ; CHECK-NEXT:    ret { i32, i1 } [[TMP1]]
 ;
   %call = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %arg1)
@@ -110,11 +107,7 @@ define <16 x i8> @knownbits_shuffle_add_shift_v32i8(<16 x i8> %arg1, <8 x i16> %
 ; CHECK-NEXT:    [[BITCAST11:%.*]] = bitcast <8 x i16> [[SHL10]] to <16 x i8>
 ; CHECK-NEXT:    [[ADD12:%.*]] = add <16 x i8> [[BITCAST11]], [[BITCAST7]]
 ; CHECK-NEXT:    [[ADD14:%.*]] = add <16 x i8> [[ADD12]], [[ARG1]]
-; CHECK-NEXT:    [[BITCAST14:%.*]] = bitcast <16 x i8> [[ADD12]] to <8 x i16>
-; CHECK-NEXT:    [[SHL15:%.*]] = shl <8 x i16> [[BITCAST14]], splat (i16 8)
-; CHECK-NEXT:    [[BITCAST16:%.*]] = bitcast <8 x i16> [[SHL15]] to <16 x i8>
-; CHECK-NEXT:    [[ADD13:%.*]] = add <16 x i8> [[ADD14]], [[BITCAST16]]
-; CHECK-NEXT:    ret <16 x i8> [[ADD13]]
+; CHECK-NEXT:    ret <16 x i8> [[ADD14]]
 ;
   %shl6 = shl <8 x i16> %arg2, splat (i16 8)
   %bitcast7 = bitcast <8 x i16> %shl6 to <16 x i8>

From f780955e1df9105e9c4e67ebd16efded7dd279e2 Mon Sep 17 00:00:00 2001
From: Justin King <jcking@google.com>
Date: Thu, 19 Jun 2025 10:38:45 -0700
Subject: [PATCH 0964/1322] lsan: fix macos build after #144604 (#144818)

Fixes build failures on macOS, including

https://green.lab.llvm.org/job/llvm.org/job/clang-stage1-RA/

llvm-project/compiler-rt/lib/lsan/lsan_interceptors.cpp:579:3: error: use of undeclared identifier 'LSAN_MAYBE_INTERCEPT_FREE_SIZED'
13:23:58    579 |   LSAN_MAYBE_INTERCEPT_FREE_SIZED;
13:23:58        |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13:23:58  /Users/ec2-user/jenkins/workspace/llvm.org/as-lldb-cmake/llvm-project/compiler-rt/lib/lsan/lsan_interceptors.cpp:580:3: error: use of undeclared identifier 'LSAN_MAYBE_INTERCEPT_FREE_ALIGNED_SIZED'
13:23:58    580 |   LSAN_MAYBE_INTERCEPT_FREE_ALIGNED_SIZED;
13:23:58        |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13:23:58  2 errors generated.
---
 compiler-rt/lib/lsan/lsan_interceptors.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp
index 6da9d0d9d24f..f9f83f6c0cc4 100644
--- a/compiler-rt/lib/lsan/lsan_interceptors.cpp
+++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp
@@ -146,6 +146,9 @@ INTERCEPTOR(void*, valloc, uptr size) {
   GET_STACK_TRACE_MALLOC;
   return lsan_valloc(size, stack);
 }
+#else
+#  define LSAN_MAYBE_INTERCEPT_FREE_SIZED
+#  define LSAN_MAYBE_INTERCEPT_FREE_ALIGNED_SIZED
 #endif  // !SANITIZER_APPLE
 
 #if SANITIZER_INTERCEPT_MEMALIGN

From 836201f1177c38f3ca0457de019bb179a04afe3c Mon Sep 17 00:00:00 2001
From: Umang Yadav <29876643+umangyadav@users.noreply.github.com>
Date: Thu, 19 Jun 2025 13:52:31 -0400
Subject: [PATCH 0965/1322] Allow bf16 operands on new MFMAs  (#144925)

New gfx950 MFMA allows bf16 operands.


https://github.com/llvm/llvm-project/blob/c0cc81cdc03c97473ba771bbc3a2330bd22396bc/llvm/include/llvm/IR/IntrinsicsAMDGPU.td#L3434

When running `amdgpu-to-rocdl`, Current logic converts bf16 to i16
always which fails to compile for newer bf16 MFMA e.g.
`v_mfma_f32_16x16x32bf16`.
Backend expects bf16 type for the operands for those newer MFMAs. This
patch fixes it.

CC: @krzysz00  @dhernandez0  @giuseros  @antiagainst  @kuhar
---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 28 ++++++++++++++-----
 .../Conversion/AMDGPUToROCDL/mfma-gfx950.mlir |  4 +--
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 074404add47f..700563460f52 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -499,7 +499,9 @@ struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
 /// and LLVM AMDGPU intrinsics convention.
 ///
 /// Specifically:
-/// 1. If the element type is bfloat16, bitcast it to i16.
+/// 1. If the element type is bfloat16, bitcast it to i16 unless rocdl intrinsic
+/// allows bf16. Newer MFMAs support bf16 types on operand, check
+/// IntrinsicsAMDGPU.td file for reference.
 /// 2. If instead we have a more than 64-bit quantity, use a <N / 4 x i32>
 /// instead, which is what the f8f6f4 intrinsics use.
 /// 3. If `input` is a vector of N <= 8 bytes, bitcast it to a (N * 8)-bit
@@ -509,10 +511,11 @@ struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
 /// therefore 8-bit and smaller floats are represented as their corresponding
 /// `iN` integers.
 static Value convertMFMAVectorOperand(ConversionPatternRewriter &rewriter,
-                                      Location loc, Value input) {
+                                      Location loc, Value input,
+                                      bool allowBf16 = true) {
   Type inputType = input.getType();
   if (auto vectorType = dyn_cast<VectorType>(inputType)) {
-    if (vectorType.getElementType().isBF16())
+    if (vectorType.getElementType().isBF16() && !allowBf16)
       return rewriter.create<LLVM::BitcastOp>(
           loc, vectorType.clone(rewriter.getI16Type()), input);
     if (vectorType.getElementType().isInteger(8) &&
@@ -958,12 +961,23 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern<MFMAOp> {
 
     StringRef intrinsicName =
         isScaled ? std::get<0>(*maybeScaledIntrinsic) : *maybeIntrinsic;
+    // Determine if we can use bf16 in the intrinsic. Newer MFMAs in gfx950+
+    // allows bf16 as the input. For reference check IntrinsicsAMDGPU.td file.
+    bool allowBf16 = [&]() {
+      if (chipset < kGfx950)
+        return false;
+      if (isScaled)
+        return true;
+      return intrinsicName.contains("16x16x32.bf16") ||
+             intrinsicName.contains("32x32x16.bf16");
+    }();
     OperationState loweredOp(loc, intrinsicName);
     loweredOp.addTypes(intrinsicOutType);
-    loweredOp.addOperands(
-        {convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceA()),
-         convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceB()),
-         adaptor.getDestC()});
+    loweredOp.addOperands({convertMFMAVectorOperand(
+                               rewriter, loc, adaptor.getSourceA(), allowBf16),
+                           convertMFMAVectorOperand(
+                               rewriter, loc, adaptor.getSourceB(), allowBf16),
+                           adaptor.getDestC()});
     if (isScaled) {
       Value zero = createI32Constant(rewriter, loc, 0);
       auto [_scaledName, aTypeCode, bTypeCode] = *maybeScaledIntrinsic;
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
index 52a5d39f668c..39c31d5bf2fa 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
@@ -11,9 +11,9 @@ func.func @mfma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<16xf32>,
   amdgpu.mfma %arg0 * %arg0 + %arg1 { abid = 0 : i32, cbsz = 0 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 }  blgp = none : vector<8xf16>, vector<8xf16>, vector<16xf32>
   // CHECK: rocdl.mfma.f32.16x16x32.f16{{.*}}: (vector<8xf16>, vector<8xf16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
   amdgpu.mfma %arg0 * %arg0 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 }  blgp = none : vector<8xf16>, vector<8xf16>, vector<4xf32>
-  // CHECK: rocdl.mfma.f32.32x32x16.bf16{{.*}}: (vector<8xi16>, vector<8xi16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32>
+  // CHECK: rocdl.mfma.f32.32x32x16.bf16{{.*}}: (vector<8xbf16>, vector<8xbf16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32>
   amdgpu.mfma %arg3 * %arg3 + %arg1 { abid = 0 : i32, cbsz = 0 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 }  blgp = none : vector<8xbf16>, vector<8xbf16>, vector<16xf32>
-  // CHECK: rocdl.mfma.f32.16x16x32.bf16{{.*}}: (vector<8xi16>, vector<8xi16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  // CHECK: rocdl.mfma.f32.16x16x32.bf16{{.*}}: (vector<8xbf16>, vector<8xbf16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
   amdgpu.mfma %arg3 * %arg3 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 }  blgp = none : vector<8xbf16>, vector<8xbf16>, vector<4xf32>
   // CHECK: rocdl.mfma.i32.32x32x32.i8{{.*}}: (vector<4xi32>, vector<4xi32>, vector<16xi32>, i32, i32, i32) -> vector<16xi32>
   amdgpu.mfma %arg4 * %arg4 + %arg5 { abid = 0 : i32, cbsz = 0 : i32, k = 32 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 }  blgp = none : vector<16xi8>, vector<16xi8>, vector<16xi32>

From 74ec1c287a88dffc232c38e0fdd3251f6b167d15 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 19 Jun 2025 18:56:23 +0100
Subject: [PATCH 0966/1322] [LV] Add tests interleaving extended and
 multiply/accumulate reductions.

Add missing test coverage for interleaving with
VPExtendedReduction/VPMulAccumulateReduction recipes.

Adds missing test coverage in preparation for
https://github.com/llvm/llvm-project/pull/144281.
---
 .../ARM/mve-reductions-interleave.ll          | 153 ++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/mve-reductions-interleave.ll

diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions-interleave.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions-interleave.ll
new file mode 100644
index 000000000000..e27b0288b62e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions-interleave.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize,instcombine,simplifycfg -simplifycfg-require-and-preserve-domtree=1 -tail-predication=enabled -force-vector-interleave=2 < %s -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+; 4x to use VADDLV
+define i64 @add_i32_i64(ptr nocapture readonly %x, i32 %n) #0 {
+; CHECK-LABEL: @add_i32_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -8
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP9]])
+; CHECK-NEXT:    [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI1]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add i64 [[TMP7]], [[TMP3]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[R_07:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+  %gep.x = getelementptr inbounds i32, ptr %x, i32 %iv
+  %0 = load i32, ptr %gep.x, align 4
+  %conv = sext i32 %0 to i64
+  %red.next = add nsw i64 %red, %conv
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %red.lcssa = phi i64 [ %red.next, %loop ]
+  ret i64 %red.lcssa
+}
+
+; 4x to use VMLAL.u32
+define i64 @mla_i32_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) #0 {
+; CHECK-LABEL: @mla_i32_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -8
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 16
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i32> [[TMP14]] to <4 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP9]])
+; CHECK-NEXT:    [[TMP11]] = add i64 [[TMP10]], [[VEC_PHI1]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add i64 [[TMP11]], [[TMP5]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I_010:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[R_09:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_010]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[Y]], i32 [[I_010]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[MUL]] to i64
+; CHECK-NEXT:    [[ADD]] = add nsw i64 [[R_09]], [[CONV]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_010]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+  %gep.x = getelementptr inbounds i32, ptr %x, i32 %iv
+  %0 = load i32, ptr %gep.x, align 4
+  %gep.y = getelementptr inbounds i32, ptr %y, i32 %iv
+  %1 = load i32, ptr %gep.y, align 4
+  %mul = mul nsw i32 %1, %0
+  %conv = sext i32 %mul to i64
+  %red.next = add nsw i64 %red, %conv
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %red.lcssa = phi i64 [ %red.next, %loop ]
+  ret i64 %red.lcssa
+}
+
+attributes #0 = { "target-features"="+mve" }

From 3bee9ba0156ee130fa88379a5a89de0812936a3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Thu, 19 Jun 2025 11:22:49 -0700
Subject: [PATCH 0967/1322] AMDGPU/GFX12: Fix s_barrier_signal_isfirst for
 single-wave workgroups (#143634)

Barrier instructions are no-ops in single-wave workgroups. This includes
s_barrier_signal_isfirst, which will leave SCC unmodified.

Model this correctly (via an implicit use of SCC) and ensure SCC==1
before the barrier instruction (if the wave is the only one of the
workgroup, then it is the first).

---------

Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
---
 llvm/docs/AMDGPUUsage.rst                     |  4 ++
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  3 ++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  8 ++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  3 ++
 .../CodeGen/AMDGPU/insert-skips-gfx12.mir     | 10 +++--
 .../llvm.amdgcn.s.barrier.signal.isfirst.ll   | 41 +++++++++++++++++++
 6 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index c052b076c21c..ed3e4c8513e2 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1402,6 +1402,10 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    performs subtraction only if the memory value is greater than or
                                                    equal to the data value.
 
+  llvm.amdgcn.s.barrier.signal.isfirst             Provides access to the s_barrier_signal_first instruction;
+                                                   additionally ensures that the result value is valid even when the
+                                                   intrinsic is used from a wave that is not running in a workgroup.
+
   llvm.amdgcn.s.getpc                              Provides access to the s_getpc_b64 instruction, but with the return value
                                                    sign-extended from the width of the underlying PC hardware register even on
                                                    processors where the s_getpc_b64 instruction returns a zero-extended value.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7e72f6ca478f..672520390c8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5918,6 +5918,9 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
   const DebugLoc &DL = I.getDebugLoc();
   Register CCReg = I.getOperand(0).getReg();
 
+  // Set SCC to true, in case the barrier instruction gets converted to a NOP.
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
+
   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
       .addImm(I.getOperand(2).getImm());
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 586de433ea28..b1e77a282e41 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5423,6 +5423,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
+    // Set SCC to true, in case the barrier instruction gets converted to a NOP.
+    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
+            TII->get(AMDGPU::S_CMP_EQ_U32))
+        .addImm(0)
+        .addImm(0);
+    return BB;
+  }
   case AMDGPU::GET_GROUPSTATICSIZE: {
     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e0a36758534d..90e65a6950c0 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -472,6 +472,7 @@ def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
 def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
   "", []>{
   let Defs = [SCC];
+  let Uses = [M0, SCC];
   let SchedRW = [WriteBarrier];
   let isConvergent = 1;
 }
@@ -487,6 +488,8 @@ def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
 def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs),
   (ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{
   let Defs = [SCC];
+  let Uses = [SCC];
+  let usesCustomInserter = 1;
   let SchedRW = [WriteBarrier];
   let isConvergent = 1;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
index e4b16a3fa004..f437dee253d0 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
@@ -374,7 +374,8 @@ body: |
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   V_NOP_e32 implicit $exec
-  ; CHECK-NEXT:   S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
+  ; CHECK-NEXT:   S_CMP_EQ_U32 0, 0, implicit-def $scc
+  ; CHECK-NEXT:   S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   S_ENDPGM 0
@@ -385,7 +386,8 @@ body: |
   bb.1:
     successors: %bb.2
     V_NOP_e32 implicit $exec
-    S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
+    S_CMP_EQ_U32 0, 0, implicit-def $scc
+    S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc
 
   bb.2:
     S_ENDPGM 0
@@ -437,6 +439,7 @@ body: |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   V_NOP_e32 implicit $exec
   ; CHECK-NEXT:   $m0 = S_MOV_B32 -1
+  ; CHECK-NEXT:   S_CMP_EQ_U32 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
@@ -449,7 +452,8 @@ body: |
     successors: %bb.2
     V_NOP_e32 implicit $exec
     $m0 = S_MOV_B32 -1
-    S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
+    S_CMP_EQ_U32 0, 0, implicit-def $scc
+    S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit $scc
 
   bb.2:
     S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
new file mode 100644
index 000000000000..651d204f65b6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200  < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+define i1 @func1() {
+; GFX12-SDAG-LABEL: func1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_cmp_eq_u32 0, 0
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: func1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_cmp_eq_u32 0, 0
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+  ret i1 %r
+}
+
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)

From 633e740e3453bab06bf535830174c759100257f9 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Thu, 19 Jun 2025 14:32:20 -0400
Subject: [PATCH 0968/1322] [Clang][AMDGPU][Driver] Add
 `avail-extern-gv-in-addrspace-to-local` option when ThinTLO is enabled
 (#144914)

On AMDGPU, we need an extra argument
`-avail-extern-gv-in-addrspace-to-local=3`
to privatize LDS global variables when ThinLTO is enabled.
---
 clang/lib/Driver/ToolChains/Clang.cpp  | 3 +++
 clang/lib/Driver/ToolChains/HIPAMD.cpp | 2 ++
 clang/test/Driver/hip-thinlto.hip      | 1 +
 clang/test/Driver/openmp-offload-gpu.c | 1 +
 4 files changed, 7 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 5a3c09e3a343..e910a2bedeeb 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9192,6 +9192,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
           CmdArgs.push_back(
               Args.MakeArgString("--device-linker=" + TC->getTripleString() +
                                  "=-plugin-opt=-avail-extern-to-local"));
+          CmdArgs.push_back(Args.MakeArgString(
+              "--device-linker=" + TC->getTripleString() +
+              "=-plugin-opt=-avail-extern-gv-in-addrspace-to-local=3"));
           if (Kind == Action::OFK_OpenMP) {
             CmdArgs.push_back(
                 Args.MakeArgString("--device-linker=" + TC->getTripleString() +
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 74ac8306e7cc..b8f3a70ee827 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -102,6 +102,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
   if (IsThinLTO) {
     LldArgs.push_back(Args.MakeArgString("-plugin-opt=-force-import-all"));
     LldArgs.push_back(Args.MakeArgString("-plugin-opt=-avail-extern-to-local"));
+    LldArgs.push_back(Args.MakeArgString(
+        "-plugin-opt=-avail-extern-gv-in-addrspace-to-local=3"));
   }
 
   for (const Arg *A : Args.filtered(options::OPT_mllvm)) {
diff --git a/clang/test/Driver/hip-thinlto.hip b/clang/test/Driver/hip-thinlto.hip
index 4227cd3f2e9f..bcb7d4e6cb52 100644
--- a/clang/test/Driver/hip-thinlto.hip
+++ b/clang/test/Driver/hip-thinlto.hip
@@ -3,6 +3,7 @@
 // CHECK: -plugin-opt=thinlto
 // CHECK-SAME: -plugin-opt=-force-import-all
 // CHECK-SAME: -plugin-opt=-avail-extern-to-local
+// CHECK-SAME: -plugin-opt=-avail-extern-gv-in-addrspace-to-local=3
 int main(int, char *[]) {
   return 0;
 }
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index f67c2173cb14..2af3e2da3b21 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -388,6 +388,7 @@
 // THINLTO-GFX906: --device-compiler=amdgcn-amd-amdhsa=-flto=thin
 // THINLTO-GFX906-SAME: --device-linker=amdgcn-amd-amdhsa=-plugin-opt=-force-import-all
 // THINLTO-GFX906-SAME: --device-linker=amdgcn-amd-amdhsa=-plugin-opt=-avail-extern-to-local
+// THINLTO-GFX906-SAME: --device-linker=amdgcn-amd-amdhsa=-plugin-opt=-avail-extern-gv-in-addrspace-to-local=3
 // THINLTO-GFX906-SAME: --device-linker=amdgcn-amd-amdhsa=-plugin-opt=-amdgpu-internalize-symbols
 //
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \

From baf35d7a829efb9688dc0aef6d1e161ef6bc5983 Mon Sep 17 00:00:00 2001
From: William Huynh <William.Huynh@arm.com>
Date: Thu, 19 Jun 2025 19:50:07 +0100
Subject: [PATCH 0969/1322] [libc] Fix bug in LIBC_CONF_ERRNO_MODE being
 undefined (#144896)

A typo, set() instead of list() would cause the build to not define
LIBC_CONF_ERRNO_MODE, which would cause the wrong configuration to be
used.
---
 libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index a98e7276bef8..82d06e2b9eb5 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -107,7 +107,7 @@ function(_get_compile_options_from_config output_var)
   endif()
 
   if(LIBC_CONF_ERRNO_MODE)
-    set(APPEND config_options "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}")
+    list(APPEND config_options "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}")
   endif()
 
   set(${output_var} ${config_options} PARENT_SCOPE)

From 8631b4f1b4f30edd1f26b20e35b7367517aba359 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Thu, 19 Jun 2025 12:12:04 -0700
Subject: [PATCH 0970/1322] [flang] Set low probability for array repacking
 code. (#144830)

This allows LLVM to place the most probably cold blocks
that do the repacking out of the line of the potentially hot code.
---
 .../include/flang/Optimizer/Dialect/FIROps.td | 10 +++
 .../Optimizer/CodeGen/LowerRepackArrays.cpp   | 32 ++++++----
 .../test/Integration/cold_array_repacking.f90 | 30 +++++++++
 flang/test/Transforms/lower-repack-arrays.fir | 64 +++++++++----------
 4 files changed, 91 insertions(+), 45 deletions(-)
 create mode 100644 flang/test/Integration/cold_array_repacking.f90

diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 27a6ca4ebdb4..8ac847dd7dd0 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2381,6 +2381,16 @@ def fir_IfOp
     static constexpr llvm::StringRef getWeightsAttrAssemblyName() {
       return "weights";
     }
+
+    /// Sets WeightedRegionBranchOpInterface weights to indicate
+    /// that either THEN or ELSE branch is unlikely.
+    /// By default, THEN branch is set to be unlikely.
+    void setUnlikelyIfWeights(bool unlikelyElse = false) {
+      if (unlikelyElse)
+        setWeights({1, 0});
+      else
+        setWeights({0, 1});
+    }
   }];
 }
 
diff --git a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
index de97a0bbc184..2774382c22bf 100644
--- a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
+++ b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
@@ -250,6 +250,8 @@ PackArrayConversion::genRepackedBox(fir::FirOpBuilder &builder,
 
   fir::IfOp ifOp =
       builder.create<fir::IfOp>(loc, boxType, doPack, /*withElseRegion=*/true);
+  // Assume that the repacking is unlikely.
+  ifOp.setUnlikelyIfWeights();
 
   // Return original box.
   builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
@@ -322,20 +324,24 @@ UnpackArrayConversion::matchAndRewrite(fir::UnpackArrayOp op,
 
     auto isNotSame = builder.genPtrCompare(loc, mlir::arith::CmpIPredicate::ne,
                                            tempAddr, originalAddr);
-    builder.genIfThen(loc, isNotSame).genThen([&]() {});
-    // Copy from temporary to the original.
-    if (!op.getNoCopy())
-      fir::runtime::genShallowCopy(builder, loc, originalBox, tempBox,
-                                   /*resultIsAllocated=*/true);
+    builder.genIfThen(loc, isNotSame)
+        .genThen([&]() {
+          // Copy from temporary to the original.
+          if (!op.getNoCopy())
+            fir::runtime::genShallowCopy(builder, loc, originalBox, tempBox,
+                                         /*resultIsAllocated=*/true);
 
-    // Deallocate, if it was allocated in heap.
-    // Note that the stack attribute does not always mean
-    // that the allocation was actually done in stack memory.
-    // There are currently cases where we delegate the allocation
-    // to the runtime that uses heap memory, even when the stack
-    // attribute is set on fir.pack_array.
-    if (!op.getStack() || !canAllocateTempOnStack(originalBox))
-      builder.create<fir::FreeMemOp>(loc, tempAddr);
+          // Deallocate, if it was allocated in heap.
+          // Note that the stack attribute does not always mean
+          // that the allocation was actually done in stack memory.
+          // There are currently cases where we delegate the allocation
+          // to the runtime that uses heap memory, even when the stack
+          // attribute is set on fir.pack_array.
+          if (!op.getStack() || !canAllocateTempOnStack(originalBox))
+            builder.create<fir::FreeMemOp>(loc, tempAddr);
+        })
+        .getIfOp()
+        .setUnlikelyIfWeights();
   });
   rewriter.eraseOp(op);
   return mlir::success();
diff --git a/flang/test/Integration/cold_array_repacking.f90 b/flang/test/Integration/cold_array_repacking.f90
new file mode 100644
index 000000000000..0f25fcae866e
--- /dev/null
+++ b/flang/test/Integration/cold_array_repacking.f90
@@ -0,0 +1,30 @@
+! Check that the branch weights used by the array repacking
+! are propagated all the way to LLVM IR:
+! RUN: %flang_fc1 -frepack-arrays -emit-llvm %s -o - | FileCheck %s
+
+! CHECK-LABEL: define void @test_(
+! CHECK-SAME:      ptr [[TMP0:%.*]]) {
+! CHECK:    [[TMP4:%.*]] = ptrtoint ptr [[TMP0]] to i64
+! CHECK:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+! CHECK:    br i1 [[TMP5]], label %[[BB6:.*]], label %[[BB46:.*]]
+! CHECK:  [[BB6]]:
+! CHECK:    [[TMP7:%.*]] = call i1 @_FortranAIsContiguous(ptr [[TMP0]])
+! CHECK:    [[TMP8:%.*]] = icmp eq i1 [[TMP7]], false
+! CHECK:    [[TMP13:%.*]] = and i1 [[TMP8]], [[TMP12:.*]]
+! CHECK:    br i1 [[TMP13]], label %[[BB14:.*]], label %[[BB46]], !prof [[PROF2:![0-9]+]]
+! CHECK:  [[BB14]]:
+! CHECK:    call void @_FortranAShallowCopyDirect
+! CHECK:    br label %[[BB46]]
+! CHECK:  [[BB46]]:
+! CHECK:    br i1 [[TMP5]], label %[[BB48:.*]], label %[[BB57:.*]]
+! CHECK:  [[BB48]]:
+! CHECK:    br i1 [[TMP55:.*]], label %[[BB56:.*]], label %[[BB57]], !prof [[PROF2]]
+! CHECK:  [[BB56]]:
+! CHECK:    call void @_FortranAShallowCopyDirect
+! CHECK:    br label %[[BB57]]
+! CHECK:  [[BB57]]:
+! CHECK:    ret void
+! CHECK: [[PROF2]] = !{!"branch_weights", i32 0, i32 1}
+subroutine test(x)
+  real :: x(:)
+end subroutine test
diff --git a/flang/test/Transforms/lower-repack-arrays.fir b/flang/test/Transforms/lower-repack-arrays.fir
index 012e957173ac..458869cce45f 100644
--- a/flang/test/Transforms/lower-repack-arrays.fir
+++ b/flang/test/Transforms/lower-repack-arrays.fir
@@ -22,7 +22,7 @@ func.func @_QPtest1(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"})
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.box<!fir.array<?x?xf32>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.box<!fir.array<?x?xf32>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box<!fir.array<?x?xf32>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.box<!fir.array<?x?xf32>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2>
@@ -52,7 +52,7 @@ func.func @_QPtest1(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"})
 // CHECK:             %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.heap<!fir.array<?x?xf32>>) -> index
 // CHECK:             %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (!fir.heap<!fir.array<?x?xf32>>) -> index
 // CHECK:             %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_30]], %[[VAL_32]] : index
-// CHECK:             fir.if %[[VAL_33]] {
+// CHECK:             fir.if %[[VAL_33]] weights([0, 1]) {
 // CHECK:               %[[VAL_34:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
@@ -87,7 +87,7 @@ func.func @_QPtest1_whole(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name =
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.box<!fir.array<?x?xf32>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.box<!fir.array<?x?xf32>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box<!fir.array<?x?xf32>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.box<!fir.array<?x?xf32>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2>
@@ -117,7 +117,7 @@ func.func @_QPtest1_whole(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name =
 // CHECK:             %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.heap<!fir.array<?x?xf32>>) -> index
 // CHECK:             %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (!fir.heap<!fir.array<?x?xf32>>) -> index
 // CHECK:             %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_30]], %[[VAL_32]] : index
-// CHECK:             fir.if %[[VAL_33]] {
+// CHECK:             fir.if %[[VAL_33]] weights([0, 1]) {
 // CHECK:               %[[VAL_34:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
@@ -150,7 +150,7 @@ func.func @_QPtest1_in(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x
 // CHECK:             %[[VAL_10:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.box<!fir.array<?x?xf32>>>
 // CHECK:             %[[VAL_11:.*]] = fir.is_present %[[VAL_10]] : (!fir.ref<!fir.box<!fir.array<?x?xf32>>>) -> i1
 // CHECK:             %[[VAL_12:.*]] = arith.andi %[[VAL_9]], %[[VAL_11]] : i1
-// CHECK:             %[[VAL_13:.*]] = fir.if %[[VAL_12]] -> (!fir.box<!fir.array<?x?xf32>>) {
+// CHECK:             %[[VAL_13:.*]] = fir.if %[[VAL_12]] weights([0, 1]) -> (!fir.box<!fir.array<?x?xf32>>) {
 // CHECK:               %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2>
@@ -180,7 +180,7 @@ func.func @_QPtest1_in(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x
 // CHECK:             %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (!fir.heap<!fir.array<?x?xf32>>) -> index
 // CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap<!fir.array<?x?xf32>>) -> index
 // CHECK:             %[[VAL_32:.*]] = arith.cmpi ne, %[[VAL_29]], %[[VAL_31]] : index
-// CHECK:             fir.if %[[VAL_32]] {
+// CHECK:             fir.if %[[VAL_32]] weights([0, 1]) {
 // CHECK:               fir.freemem %[[VAL_28]] : !fir.heap<!fir.array<?x?xf32>>
 // CHECK:             }
 // CHECK:           }
@@ -209,7 +209,7 @@ func.func @_QPtest1_out(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "
 // CHECK:             %[[VAL_10:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.box<!fir.array<?x?xf32>>>
 // CHECK:             %[[VAL_11:.*]] = fir.is_present %[[VAL_10]] : (!fir.ref<!fir.box<!fir.array<?x?xf32>>>) -> i1
 // CHECK:             %[[VAL_12:.*]] = arith.andi %[[VAL_9]], %[[VAL_11]] : i1
-// CHECK:             %[[VAL_13:.*]] = fir.if %[[VAL_12]] -> (!fir.box<!fir.array<?x?xf32>>) {
+// CHECK:             %[[VAL_13:.*]] = fir.if %[[VAL_12]] weights([0, 1]) -> (!fir.box<!fir.array<?x?xf32>>) {
 // CHECK:               %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2>
@@ -234,7 +234,7 @@ func.func @_QPtest1_out(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "
 // CHECK:             %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (!fir.heap<!fir.array<?x?xf32>>) -> index
 // CHECK:             %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (!fir.heap<!fir.array<?x?xf32>>) -> index
 // CHECK:             %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_25]], %[[VAL_27]] : index
-// CHECK:             fir.if %[[VAL_28]] {
+// CHECK:             fir.if %[[VAL_28]] weights([0, 1]) {
 // CHECK:               %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_30:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
 // CHECK:               %[[VAL_31:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
@@ -280,7 +280,7 @@ func.func @_QPtest2(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.box
 // CHECK:             %[[VAL_17:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,?>>>>
 // CHECK:             %[[VAL_18:.*]] = fir.is_present %[[VAL_17]] : (!fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,?>>>>) -> i1
 // CHECK:             %[[VAL_19:.*]] = arith.andi %[[VAL_16]], %[[VAL_18]] : i1
-// CHECK:             %[[VAL_20:.*]] = fir.if %[[VAL_19]] -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
+// CHECK:             %[[VAL_20:.*]] = fir.if %[[VAL_19]] weights([0, 1]) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
 // CHECK:               %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_5]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_22:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2>
@@ -310,7 +310,7 @@ func.func @_QPtest2(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.box
 // CHECK:             %[[VAL_36:.*]] = fir.convert %[[VAL_35]] : (!fir.heap<!fir.array<?x?x!fir.char<1,?>>>) -> index
 // CHECK:             %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (!fir.heap<!fir.array<?x?x!fir.char<1,?>>>) -> index
 // CHECK:             %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_36]], %[[VAL_38]] : index
-// CHECK:             fir.if %[[VAL_39]] {
+// CHECK:             fir.if %[[VAL_39]] weights([0, 1]) {
 // CHECK:               %[[VAL_40:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_1]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_42:.*]] = fir.convert %[[VAL_14]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.box<none>
@@ -356,7 +356,7 @@ func.func @_QPtest2_stack(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !f
 // CHECK:             %[[VAL_17:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,?>>>>
 // CHECK:             %[[VAL_18:.*]] = fir.is_present %[[VAL_17]] : (!fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,?>>>>) -> i1
 // CHECK:             %[[VAL_19:.*]] = arith.andi %[[VAL_16]], %[[VAL_18]] : i1
-// CHECK:             %[[VAL_20:.*]] = fir.if %[[VAL_19]] -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
+// CHECK:             %[[VAL_20:.*]] = fir.if %[[VAL_19]] weights([0, 1]) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
 // CHECK:               %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_5]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_22:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2>
@@ -386,7 +386,7 @@ func.func @_QPtest2_stack(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !f
 // CHECK:             %[[VAL_36:.*]] = fir.convert %[[VAL_35]] : (!fir.heap<!fir.array<?x?x!fir.char<1,?>>>) -> index
 // CHECK:             %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (!fir.heap<!fir.array<?x?x!fir.char<1,?>>>) -> index
 // CHECK:             %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_36]], %[[VAL_38]] : index
-// CHECK:             fir.if %[[VAL_39]] {
+// CHECK:             fir.if %[[VAL_39]] weights([0, 1]) {
 // CHECK:               %[[VAL_40:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_1]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_42:.*]] = fir.convert %[[VAL_14]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.box<none>
@@ -420,7 +420,7 @@ func.func @_QPtest3(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,?>>> {fir.bindc_n
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,?>>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,?>>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> index
@@ -451,7 +451,7 @@ func.func @_QPtest3(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,?>>> {fir.bindc_n
 // CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap<!fir.array<?x?x!fir.char<1,?>>>) -> index
 // CHECK:             %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.heap<!fir.array<?x?x!fir.char<1,?>>>) -> index
 // CHECK:             %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_31]], %[[VAL_33]] : index
-// CHECK:             fir.if %[[VAL_34]] {
+// CHECK:             fir.if %[[VAL_34]] weights([0, 1]) {
 // CHECK:               %[[VAL_35:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.box<none>
@@ -486,7 +486,7 @@ func.func @_QPtest3_stack(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,?>>> {fir.b
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,?>>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,?>>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> index
@@ -517,7 +517,7 @@ func.func @_QPtest3_stack(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,?>>> {fir.b
 // CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap<!fir.array<?x?x!fir.char<1,?>>>) -> index
 // CHECK:             %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.heap<!fir.array<?x?x!fir.char<1,?>>>) -> index
 // CHECK:             %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_31]], %[[VAL_33]] : index
-// CHECK:             fir.if %[[VAL_34]] {
+// CHECK:             fir.if %[[VAL_34]] weights([0, 1]) {
 // CHECK:               %[[VAL_35:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.box<none>
@@ -553,7 +553,7 @@ func.func @_QPtest4(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,10>>> {fir.bindc_
 // CHECK:             %[[VAL_12:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) -> !fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,10>>>>
 // CHECK:             %[[VAL_13:.*]] = fir.is_present %[[VAL_12]] : (!fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,10>>>>) -> i1
 // CHECK:             %[[VAL_14:.*]] = arith.andi %[[VAL_11]], %[[VAL_13]] : i1
-// CHECK:             %[[VAL_15:.*]] = fir.if %[[VAL_14]] -> (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) {
+// CHECK:             %[[VAL_15:.*]] = fir.if %[[VAL_14]] weights([0, 1]) -> (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) {
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2>
@@ -583,7 +583,7 @@ func.func @_QPtest4(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,10>>> {fir.bindc_
 // CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap<!fir.array<?x?x!fir.char<1,10>>>) -> index
 // CHECK:             %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.heap<!fir.array<?x?x!fir.char<1,10>>>) -> index
 // CHECK:             %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_31]], %[[VAL_33]] : index
-// CHECK:             fir.if %[[VAL_34]] {
+// CHECK:             fir.if %[[VAL_34]] weights([0, 1]) {
 // CHECK:               %[[VAL_35:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_9]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) -> !fir.box<none>
@@ -620,7 +620,7 @@ func.func @_QPtest4_stack(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,10>>> {fir.
 // CHECK:             %[[VAL_12:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) -> !fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,10>>>>
 // CHECK:             %[[VAL_13:.*]] = fir.is_present %[[VAL_12]] : (!fir.ref<!fir.box<!fir.array<?x?x!fir.char<1,10>>>>) -> i1
 // CHECK:             %[[VAL_14:.*]] = arith.andi %[[VAL_11]], %[[VAL_13]] : i1
-// CHECK:             %[[VAL_15:.*]] = fir.if %[[VAL_14]] -> (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) {
+// CHECK:             %[[VAL_15:.*]] = fir.if %[[VAL_14]] weights([0, 1]) -> (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) {
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2>
@@ -650,7 +650,7 @@ func.func @_QPtest4_stack(%arg0: !fir.box<!fir.array<?x?x!fir.char<1,10>>> {fir.
 // CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap<!fir.array<?x?x!fir.char<1,10>>>) -> index
 // CHECK:             %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.heap<!fir.array<?x?x!fir.char<1,10>>>) -> index
 // CHECK:             %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_31]], %[[VAL_33]] : index
-// CHECK:             fir.if %[[VAL_34]] {
+// CHECK:             fir.if %[[VAL_34]] weights([0, 1]) {
 // CHECK:               %[[VAL_35:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_9]] : (!fir.box<!fir.array<?x?x!fir.char<1,10>>>) -> !fir.box<none>
@@ -684,7 +684,7 @@ func.func @_QPtest5(%arg0: !fir.box<!fir.array<?x?x!fir.type<_QMmTt>>> {fir.bind
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.ref<!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2>
@@ -714,7 +714,7 @@ func.func @_QPtest5(%arg0: !fir.box<!fir.array<?x?x!fir.type<_QMmTt>>> {fir.bind
 // CHECK:             %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>) -> index
 // CHECK:             %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>) -> index
 // CHECK:             %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_30]], %[[VAL_32]] : index
-// CHECK:             fir.if %[[VAL_33]] {
+// CHECK:             fir.if %[[VAL_33]] weights([0, 1]) {
 // CHECK:               %[[VAL_34:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.box<none>
@@ -749,7 +749,7 @@ func.func @_QPtest5_stack(%arg0: !fir.box<!fir.array<?x?x!fir.type<_QMmTt>>> {fi
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.ref<!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2>
@@ -779,7 +779,7 @@ func.func @_QPtest5_stack(%arg0: !fir.box<!fir.array<?x?x!fir.type<_QMmTt>>> {fi
 // CHECK:             %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>) -> index
 // CHECK:             %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>) -> index
 // CHECK:             %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_30]], %[[VAL_32]] : index
-// CHECK:             fir.if %[[VAL_33]] {
+// CHECK:             fir.if %[[VAL_33]] weights([0, 1]) {
 // CHECK:               %[[VAL_34:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_8]] : (!fir.box<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.box<none>
@@ -814,7 +814,7 @@ func.func @_QPtest6(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {fir.bi
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[ARG0]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.ref<!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_2]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2>
@@ -853,7 +853,7 @@ func.func @_QPtest6(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {fir.bi
 // CHECK:             %[[VAL_39:.*]] = fir.convert %[[VAL_37]] : (!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>) -> index
 // CHECK:             %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>) -> index
 // CHECK:             %[[VAL_41:.*]] = arith.cmpi ne, %[[VAL_39]], %[[VAL_40]] : index
-// CHECK:             fir.if %[[VAL_41]] {
+// CHECK:             fir.if %[[VAL_41]] weights([0, 1]) {
 // CHECK:               %[[VAL_42:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
 // CHECK:               %[[VAL_43:.*]] = fir.convert %[[ARG0]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_8]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.box<none>
@@ -890,7 +890,7 @@ func.func @_QPtest6_stack(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[ARG0]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.ref<!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_2]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2>
@@ -929,7 +929,7 @@ func.func @_QPtest6_stack(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {
 // CHECK:             %[[VAL_39:.*]] = fir.convert %[[VAL_37]] : (!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>) -> index
 // CHECK:             %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>) -> index
 // CHECK:             %[[VAL_41:.*]] = arith.cmpi ne, %[[VAL_39]], %[[VAL_40]] : index
-// CHECK:             fir.if %[[VAL_41]] {
+// CHECK:             fir.if %[[VAL_41]] weights([0, 1]) {
 // CHECK:               %[[VAL_42:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
 // CHECK:               %[[VAL_43:.*]] = fir.convert %[[ARG0]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.box<none>
 // CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_8]] : (!fir.class<!fir.array<?x?x!fir.type<_QMmTt>>>) -> !fir.box<none>
@@ -965,7 +965,7 @@ func.func @_QPtest7(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_name = "x
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[ARG0]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.ref<!fir.class<!fir.array<?x?xnone>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.class<!fir.array<?x?xnone>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.class<!fir.array<?x?xnone>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.class<!fir.array<?x?xnone>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.class<!fir.array<?x?xnone>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_2]] : (!fir.class<!fir.array<?x?xnone>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2>
@@ -1004,7 +1004,7 @@ func.func @_QPtest7(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_name = "x
 // CHECK:             %[[VAL_39:.*]] = fir.convert %[[VAL_37]] : (!fir.heap<!fir.array<?x?xnone>>) -> index
 // CHECK:             %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.heap<!fir.array<?x?xnone>>) -> index
 // CHECK:             %[[VAL_41:.*]] = arith.cmpi ne, %[[VAL_39]], %[[VAL_40]] : index
-// CHECK:             fir.if %[[VAL_41]] {
+// CHECK:             fir.if %[[VAL_41]] weights([0, 1]) {
 // CHECK:               %[[VAL_42:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
 // CHECK:               %[[VAL_43:.*]] = fir.convert %[[ARG0]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.box<none>
 // CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_8]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.box<none>
@@ -1041,7 +1041,7 @@ func.func @_QPtest7_stack(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_nam
 // CHECK:             %[[VAL_11:.*]] = fir.box_addr %[[ARG0]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.ref<!fir.class<!fir.array<?x?xnone>>>
 // CHECK:             %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref<!fir.class<!fir.array<?x?xnone>>>) -> i1
 // CHECK:             %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
-// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.class<!fir.array<?x?xnone>>) {
+// CHECK:             %[[VAL_14:.*]] = fir.if %[[VAL_13]] weights([0, 1]) -> (!fir.class<!fir.array<?x?xnone>>) {
 // CHECK:               %[[VAL_15:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_3]] : (!fir.class<!fir.array<?x?xnone>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_16:.*]]:3 = fir.box_dims %[[ARG0]], %[[VAL_2]] : (!fir.class<!fir.array<?x?xnone>>, index) -> (index, index, index)
 // CHECK:               %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2>
@@ -1080,7 +1080,7 @@ func.func @_QPtest7_stack(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_nam
 // CHECK:             %[[VAL_39:.*]] = fir.convert %[[VAL_37]] : (!fir.heap<!fir.array<?x?xnone>>) -> index
 // CHECK:             %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.heap<!fir.array<?x?xnone>>) -> index
 // CHECK:             %[[VAL_41:.*]] = arith.cmpi ne, %[[VAL_39]], %[[VAL_40]] : index
-// CHECK:             fir.if %[[VAL_41]] {
+// CHECK:             fir.if %[[VAL_41]] weights([0, 1]) {
 // CHECK:               %[[VAL_42:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
 // CHECK:               %[[VAL_43:.*]] = fir.convert %[[ARG0]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.box<none>
 // CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_8]] : (!fir.class<!fir.array<?x?xnone>>) -> !fir.box<none>

From 28808dda2c53a1dff1076cb83a9b91d0866ebf9a Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Thu, 19 Jun 2025 13:02:30 -0700
Subject: [PATCH 0971/1322] [flang] Fixed test added in #144830.

---
 flang/test/Integration/cold_array_repacking.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Integration/cold_array_repacking.f90 b/flang/test/Integration/cold_array_repacking.f90
index 0f25fcae866e..11b7d8c21b67 100644
--- a/flang/test/Integration/cold_array_repacking.f90
+++ b/flang/test/Integration/cold_array_repacking.f90
@@ -3,7 +3,7 @@
 ! RUN: %flang_fc1 -frepack-arrays -emit-llvm %s -o - | FileCheck %s
 
 ! CHECK-LABEL: define void @test_(
-! CHECK-SAME:      ptr [[TMP0:%.*]]) {
+! CHECK-SAME:      ptr [[TMP0:%.*]])
 ! CHECK:    [[TMP4:%.*]] = ptrtoint ptr [[TMP0]] to i64
 ! CHECK:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
 ! CHECK:    br i1 [[TMP5]], label %[[BB6:.*]], label %[[BB46:.*]]

From dc058a3d84ed1bc4006416023e8b336f3214bdc7 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 19 Jun 2025 13:17:23 -0700
Subject: [PATCH 0972/1322] [TableGen] Use ListSeparator (NFC) (#144936)

Note that an instance of ListSeparator evaluates to the empty string
for the first time and then ", " for subsequent references.
---
 clang/utils/TableGen/ClangAttrEmitter.cpp        | 6 ++----
 clang/utils/TableGen/ClangDiagnosticsEmitter.cpp | 9 +++------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index f892626a447e..dfeb6b1b1ec1 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -5482,14 +5482,12 @@ void EmitTestPragmaAttributeSupportedAttributes(const RecordKeeper &Records,
     }
     const Record *SubjectObj = I.second->getValueAsDef("Subjects");
     OS << " (";
-    bool PrintComma = false;
+    ListSeparator LS;
     for (const auto &Subject :
          enumerate(SubjectObj->getValueAsListOfDefs("Subjects"))) {
       if (!isSupportedPragmaClangAttributeSubject(*Subject.value()))
         continue;
-      if (PrintComma)
-        OS << ", ";
-      PrintComma = true;
+      OS << LS;
       PragmaClangAttributeSupport::RuleOrAggregateRuleSet &RuleSet =
           Support.SubjectsToRules.find(Subject.value())->getSecond();
       if (RuleSet.isRule()) {
diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
index bfc60f485cd3..b28cb2c09ac5 100644
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -2225,13 +2225,10 @@ void clang::EmitClangDiagDocs(const RecordKeeper &Records, raw_ostream &OS) {
       else
         OS << "Also controls ";
 
-      bool First = true;
       sort(GroupInfo.SubGroups);
-      for (StringRef Name : GroupInfo.SubGroups) {
-        if (!First) OS << ", ";
-        OS << "`" << (IsRemarkGroup ? "-R" : "-W") << Name << "`_";
-        First = false;
-      }
+      ListSeparator LS;
+      for (StringRef Name : GroupInfo.SubGroups)
+        OS << LS << "`" << (IsRemarkGroup ? "-R" : "-W") << Name << "`_";
       OS << ".\n\n";
     }
 

From 7349864d2c7c874c17ed546791489a46e896f901 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 19 Jun 2025 13:17:31 -0700
Subject: [PATCH 0973/1322] [ADT] Remove an extraneous variable (NFC) (#144937)

Without this patch, Int and IntWord have the same value and type.
This patch removes the extraneous copy.
---
 llvm/include/llvm/ADT/PointerIntPair.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/ADT/PointerIntPair.h b/llvm/include/llvm/ADT/PointerIntPair.h
index f73f5bcd6ce0..9cfc65846d5b 100644
--- a/llvm/include/llvm/ADT/PointerIntPair.h
+++ b/llvm/include/llvm/ADT/PointerIntPair.h
@@ -206,11 +206,10 @@ struct PointerIntPairInfo {
   }
 
   static intptr_t updateInt(intptr_t OrigValue, intptr_t Int) {
-    intptr_t IntWord = static_cast<intptr_t>(Int);
-    assert((IntWord & ~IntMask) == 0 && "Integer too large for field");
+    assert((Int & ~IntMask) == 0 && "Integer too large for field");
 
     // Preserve all bits other than the ones we are updating.
-    return (OrigValue & ~ShiftedIntMask) | IntWord << IntShift;
+    return (OrigValue & ~ShiftedIntMask) | Int << IntShift;
   }
 };
 

From 3b672e1d7b7375ca2a048cfb252d0e8ff35724e2 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 19 Jun 2025 13:17:39 -0700
Subject: [PATCH 0974/1322] [llvm] Use "= delete" to delete constructors (NFC)
 (#144938)

None of the constructors touched in this patch has a corresponding
definition.  This patch explicitly deletes them with "= delete" while
moving them to the public section of respective classes.  Note that "=
delete" itself serves as documentation.

Identified with modernize-use-equals-delete.
---
 llvm/unittests/ADT/TestGraph.h     | 10 +++++-----
 llvm/unittests/Support/Casting.cpp |  4 +---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/llvm/unittests/ADT/TestGraph.h b/llvm/unittests/ADT/TestGraph.h
index 3e6d4e14d5c1..a59ab504f714 100644
--- a/llvm/unittests/ADT/TestGraph.h
+++ b/llvm/unittests/ADT/TestGraph.h
@@ -24,14 +24,13 @@ namespace llvm {
 template <unsigned N>
 class Graph {
 private:
-  // Disable copying.
-  Graph(const Graph&);
-  Graph& operator=(const Graph&);
-
   static void ValidateIndex(unsigned Idx) {
     assert(Idx < N && "Invalid node index!");
   }
 public:
+  // Disable copying.
+  Graph(const Graph &) = delete;
+  Graph &operator=(const Graph &) = delete;
 
   /// NodeSubset - A subset of the graph's nodes.
   class NodeSubset {
@@ -169,11 +168,12 @@ public:
     /// yet been visited.
     NodeSubset Children;
 
-    ChildIterator(); // Disable default constructor.
   protected:
     ChildIterator(NodeType *F, NodeSubset C) : FirstNode(F), Children(C) {}
 
   public:
+    ChildIterator() = delete; // Disable default constructor.
+
     /// ChildIterator - Copy constructor.
     ChildIterator(const ChildIterator &other) = default;
     ChildIterator &operator=(const ChildIterator &other) = default;
diff --git a/llvm/unittests/Support/Casting.cpp b/llvm/unittests/Support/Casting.cpp
index a128cedaf398..18327f6dd167 100644
--- a/llvm/unittests/Support/Casting.cpp
+++ b/llvm/unittests/Support/Casting.cpp
@@ -24,13 +24,11 @@ template <typename T> IllegalCast *cast(...) { return nullptr; }
 //
 struct bar {
   bar() {}
+  bar(const bar &) = delete;
   struct foo *baz();
   struct foo *caz();
   struct foo *daz();
   struct foo *naz();
-
-private:
-  bar(const bar &);
 };
 struct foo {
   foo(const bar &) {}

From 03692aa40487d0f4090d329ca2904fb888242c94 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 19 Jun 2025 13:17:47 -0700
Subject: [PATCH 0975/1322] [memprof] Use a lambda instead of std::bind (NFC)
 (#144940)

A lambda is a lot shorter than std::bind here.
---
 llvm/include/llvm/ProfileData/MemProfReader.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h
index 4d41d05b1457..25578ecd06f1 100644
--- a/llvm/include/llvm/ProfileData/MemProfReader.h
+++ b/llvm/include/llvm/ProfileData/MemProfReader.h
@@ -62,8 +62,7 @@ public:
       return make_error<InstrProfError>(instrprof_error::eof);
 
     if (Callback == nullptr)
-      Callback =
-          std::bind(&MemProfReader::idToFrame, this, std::placeholders::_1);
+      Callback = [&](FrameId Id) { return idToFrame(Id); };
 
     CallStackIdConverter<decltype(MemProfData.CallStacks)> CSIdConv(
         MemProfData.CallStacks, Callback);

From 9fd22cb56d4c626769afd938e0f9ef6157164394 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Thu, 19 Jun 2025 13:42:46 -0700
Subject: [PATCH 0976/1322] [flang][NFC] Move new code to right place (#144551)

Some new code was added to flang/Semantics that only depends on
facilities in flang/Evaluate. Move it into Evaluate and clean up some
minor stylistic problems.
---
 flang/include/flang/Evaluate/tools.h        | 148 +++++++++
 flang/include/flang/Semantics/tools.h       | 149 ---------
 flang/lib/Evaluate/tools.cpp                | 311 ++++++++++++++++++
 flang/lib/Lower/OpenACC.cpp                 |   2 +-
 flang/lib/Lower/OpenMP/OpenMP.cpp           |   7 +-
 flang/lib/Semantics/check-omp-structure.cpp |   3 +
 flang/lib/Semantics/tools.cpp               | 329 --------------------
 7 files changed, 467 insertions(+), 482 deletions(-)

diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 1959d5f3a589..e04621f71f9a 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1389,6 +1389,154 @@ inline bool HasCUDAImplicitTransfer(const Expr<SomeType> &expr) {
   return (hasConstant || (hostSymbols > 0)) && deviceSymbols > 0;
 }
 
+// Checks whether the symbol on the LHS is present in the RHS expression.
+bool CheckForSymbolMatch(const Expr<SomeType> *lhs, const Expr<SomeType> *rhs);
+
+namespace operation {
+
+enum class Operator {
+  Unknown,
+  Add,
+  And,
+  Associated,
+  Call,
+  Constant,
+  Convert,
+  Div,
+  Eq,
+  Eqv,
+  False,
+  Ge,
+  Gt,
+  Identity,
+  Intrinsic,
+  Le,
+  Lt,
+  Max,
+  Min,
+  Mul,
+  Ne,
+  Neqv,
+  Not,
+  Or,
+  Pow,
+  Resize, // Convert within the same TypeCategory
+  Sub,
+  True,
+};
+
+std::string ToString(Operator op);
+
+template <typename... Ts, int Kind>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::LogicalOperation<Kind>, Ts...> &op) {
+  switch (op.derived().logicalOperator) {
+  case common::LogicalOperator::And:
+    return Operator::And;
+  case common::LogicalOperator::Or:
+    return Operator::Or;
+  case common::LogicalOperator::Eqv:
+    return Operator::Eqv;
+  case common::LogicalOperator::Neqv:
+    return Operator::Neqv;
+  case common::LogicalOperator::Not:
+    return Operator::Not;
+  }
+  return Operator::Unknown;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Relational<T>, Ts...> &op) {
+  switch (op.derived().opr) {
+  case common::RelationalOperator::LT:
+    return Operator::Lt;
+  case common::RelationalOperator::LE:
+    return Operator::Le;
+  case common::RelationalOperator::EQ:
+    return Operator::Eq;
+  case common::RelationalOperator::NE:
+    return Operator::Ne;
+  case common::RelationalOperator::GE:
+    return Operator::Ge;
+  case common::RelationalOperator::GT:
+    return Operator::Gt;
+  }
+  return Operator::Unknown;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(const evaluate::Operation<evaluate::Add<T>, Ts...> &op) {
+  return Operator::Add;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Subtract<T>, Ts...> &op) {
+  return Operator::Sub;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Multiply<T>, Ts...> &op) {
+  return Operator::Mul;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Divide<T>, Ts...> &op) {
+  return Operator::Div;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Power<T>, Ts...> &op) {
+  return Operator::Pow;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::RealToIntPower<T>, Ts...> &op) {
+  return Operator::Pow;
+}
+
+template <typename T, common::TypeCategory C, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Convert<T, C>, Ts...> &op) {
+  if constexpr (C == T::category) {
+    return Operator::Resize;
+  } else {
+    return Operator::Convert;
+  }
+}
+
+template <typename T> Operator OperationCode(const evaluate::Constant<T> &x) {
+  return Operator::Constant;
+}
+
+template <typename T> Operator OperationCode(const T &) {
+  return Operator::Unknown;
+}
+
+Operator OperationCode(const evaluate::ProcedureDesignator &proc);
+
+} // namespace operation
+
+// Return information about the top-level operation (ignoring parentheses):
+// the operation code and the list of arguments.
+std::pair<operation::Operator, std::vector<Expr<SomeType>>>
+GetTopLevelOperation(const Expr<SomeType> &expr);
+
+// Check if expr is same as x, or a sequence of Convert operations on x.
+bool IsSameOrConvertOf(const Expr<SomeType> &expr, const Expr<SomeType> &x);
+
+// Strip away any top-level Convert operations (if any exist) and return
+// the input value. A ComplexConstructor(x, 0) is also considered as a
+// convert operation.
+// If the input is not Operation, Designator, FunctionRef or Constant,
+// it returns std::nullopt.
+std::optional<Expr<SomeType>> GetConvertInput(const Expr<SomeType> &x);
+
 } // namespace Fortran::evaluate
 
 namespace Fortran::semantics {
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 69375a83dec2..f3cfa9b99fb4 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -756,154 +756,5 @@ std::string GetCommonBlockObjectName(const Symbol &, bool underscoring);
 // Check for ambiguous USE associations
 bool HadUseError(SemanticsContext &, SourceName at, const Symbol *);
 
-// Checks whether the symbol on the LHS is present in the RHS expression.
-bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs);
-
-namespace operation {
-
-enum class Operator {
-  Unknown,
-  Add,
-  And,
-  Associated,
-  Call,
-  Constant,
-  Convert,
-  Div,
-  Eq,
-  Eqv,
-  False,
-  Ge,
-  Gt,
-  Identity,
-  Intrinsic,
-  Le,
-  Lt,
-  Max,
-  Min,
-  Mul,
-  Ne,
-  Neqv,
-  Not,
-  Or,
-  Pow,
-  Resize, // Convert within the same TypeCategory
-  Sub,
-  True,
-};
-
-std::string ToString(Operator op);
-
-template <typename... Ts, int Kind>
-Operator OperationCode(
-    const evaluate::Operation<evaluate::LogicalOperation<Kind>, Ts...> &op) {
-  switch (op.derived().logicalOperator) {
-  case common::LogicalOperator::And:
-    return Operator::And;
-  case common::LogicalOperator::Or:
-    return Operator::Or;
-  case common::LogicalOperator::Eqv:
-    return Operator::Eqv;
-  case common::LogicalOperator::Neqv:
-    return Operator::Neqv;
-  case common::LogicalOperator::Not:
-    return Operator::Not;
-  }
-  return Operator::Unknown;
-}
-
-template <typename T, typename... Ts>
-Operator OperationCode(
-    const evaluate::Operation<evaluate::Relational<T>, Ts...> &op) {
-  switch (op.derived().opr) {
-  case common::RelationalOperator::LT:
-    return Operator::Lt;
-  case common::RelationalOperator::LE:
-    return Operator::Le;
-  case common::RelationalOperator::EQ:
-    return Operator::Eq;
-  case common::RelationalOperator::NE:
-    return Operator::Ne;
-  case common::RelationalOperator::GE:
-    return Operator::Ge;
-  case common::RelationalOperator::GT:
-    return Operator::Gt;
-  }
-  return Operator::Unknown;
-}
-
-template <typename T, typename... Ts>
-Operator OperationCode(const evaluate::Operation<evaluate::Add<T>, Ts...> &op) {
-  return Operator::Add;
-}
-
-template <typename T, typename... Ts>
-Operator OperationCode(
-    const evaluate::Operation<evaluate::Subtract<T>, Ts...> &op) {
-  return Operator::Sub;
-}
-
-template <typename T, typename... Ts>
-Operator OperationCode(
-    const evaluate::Operation<evaluate::Multiply<T>, Ts...> &op) {
-  return Operator::Mul;
-}
-
-template <typename T, typename... Ts>
-Operator OperationCode(
-    const evaluate::Operation<evaluate::Divide<T>, Ts...> &op) {
-  return Operator::Div;
-}
-
-template <typename T, typename... Ts>
-Operator OperationCode(
-    const evaluate::Operation<evaluate::Power<T>, Ts...> &op) {
-  return Operator::Pow;
-}
-
-template <typename T, typename... Ts>
-Operator OperationCode(
-    const evaluate::Operation<evaluate::RealToIntPower<T>, Ts...> &op) {
-  return Operator::Pow;
-}
-
-template <typename T, common::TypeCategory C, typename... Ts>
-Operator OperationCode(
-    const evaluate::Operation<evaluate::Convert<T, C>, Ts...> &op) {
-  if constexpr (C == T::category) {
-    return Operator::Resize;
-  } else {
-    return Operator::Convert;
-  }
-}
-
-template <typename T> //
-Operator OperationCode(const evaluate::Constant<T> &x) {
-  return Operator::Constant;
-}
-
-template <typename T> //
-Operator OperationCode(const T &) {
-  return Operator::Unknown;
-}
-
-Operator OperationCode(const evaluate::ProcedureDesignator &proc);
-
-} // namespace operation
-
-/// Return information about the top-level operation (ignoring parentheses):
-/// the operation code and the list of arguments.
-std::pair<operation::Operator, std::vector<SomeExpr>> GetTopLevelOperation(
-    const SomeExpr &expr);
-
-/// Check if expr is same as x, or a sequence of Convert operations on x.
-bool IsSameOrConvertOf(const SomeExpr &expr, const SomeExpr &x);
-
-/// Strip away any top-level Convert operations (if any exist) and return
-/// the input value. A ComplexConstructor(x, 0) is also considered as a
-/// convert operation.
-/// If the input is not Operation, Designator, FunctionRef or Constant,
-/// it returns std::nullopt.
-MaybeExpr GetConvertInput(const SomeExpr &x);
 } // namespace Fortran::semantics
 #endif // FORTRAN_SEMANTICS_TOOLS_H_
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index 222c32a9c332..68838564f87b 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -13,6 +13,7 @@
 #include "flang/Evaluate/traverse.h"
 #include "flang/Parser/message.h"
 #include "flang/Semantics/tools.h"
+#include "llvm/ADT/StringSwitch.h"
 #include <algorithm>
 #include <variant>
 
@@ -1595,6 +1596,316 @@ bool CheckForCoindexedObject(parser::ContextualMessages &messages,
   }
 }
 
+bool CheckForSymbolMatch(const Expr<SomeType> *lhs, const Expr<SomeType> *rhs) {
+  if (lhs && rhs) {
+    if (SymbolVector lhsSymbols{GetSymbolVector(*lhs)}; !lhsSymbols.empty()) {
+      const Symbol &first{*lhsSymbols.front()};
+      for (const Symbol &symbol : GetSymbolVector(*rhs)) {
+        if (first == symbol) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+namespace operation {
+template <typename T> Expr<SomeType> AsSomeExpr(const T &x) {
+  return AsGenericExpr(common::Clone(x));
+}
+
+template <bool IgnoreResizingConverts>
+struct ArgumentExtractor
+    : public Traverse<ArgumentExtractor<IgnoreResizingConverts>,
+          std::pair<operation::Operator, std::vector<Expr<SomeType>>>, false> {
+  using Arguments = std::vector<Expr<SomeType>>;
+  using Result = std::pair<operation::Operator, Arguments>;
+  using Base =
+      Traverse<ArgumentExtractor<IgnoreResizingConverts>, Result, false>;
+  static constexpr auto IgnoreResizes{IgnoreResizingConverts};
+  static constexpr auto Logical{common::TypeCategory::Logical};
+  ArgumentExtractor() : Base(*this) {}
+
+  Result Default() const { return {}; }
+
+  using Base::operator();
+
+  template <int Kind>
+  Result operator()(const Constant<Type<Logical, Kind>> &x) const {
+    if (const auto &val{x.GetScalarValue()}) {
+      return val->IsTrue()
+          ? std::make_pair(operation::Operator::True, Arguments{})
+          : std::make_pair(operation::Operator::False, Arguments{});
+    }
+    return Default();
+  }
+
+  template <typename R> Result operator()(const FunctionRef<R> &x) const {
+    Result result{operation::OperationCode(x.proc()), {}};
+    for (size_t i{0}, e{x.arguments().size()}; i != e; ++i) {
+      if (auto *e{x.UnwrapArgExpr(i)}) {
+        result.second.push_back(*e);
+      }
+    }
+    return result;
+  }
+
+  template <typename D, typename R, typename... Os>
+  Result operator()(const Operation<D, R, Os...> &x) const {
+    if constexpr (std::is_same_v<D, Parentheses<R>>) {
+      // Ignore top-level parentheses.
+      return (*this)(x.template operand<0>());
+    }
+    if constexpr (IgnoreResizes && std::is_same_v<D, Convert<R, R::category>>) {
+      // Ignore conversions within the same category.
+      // Atomic operations on int(kind=1) may be implicitly widened
+      // to int(kind=4) for example.
+      return (*this)(x.template operand<0>());
+    } else {
+      return std::make_pair(operation::OperationCode(x),
+          OperationArgs(x, std::index_sequence_for<Os...>{}));
+    }
+  }
+
+  template <typename T> Result operator()(const Designator<T> &x) const {
+    return {operation::Operator::Identity, {AsSomeExpr(x)}};
+  }
+
+  template <typename T> Result operator()(const Constant<T> &x) const {
+    return {operation::Operator::Identity, {AsSomeExpr(x)}};
+  }
+
+  template <typename... Rs>
+  Result Combine(Result &&result, Rs &&...results) const {
+    // There shouldn't be any combining needed, since we're stopping the
+    // traversal at the top-level operation, but implement one that picks
+    // the first non-empty result.
+    if constexpr (sizeof...(Rs) == 0) {
+      return std::move(result);
+    } else {
+      if (!result.second.empty()) {
+        return std::move(result);
+      } else {
+        return Combine(std::move(results)...);
+      }
+    }
+  }
+
+private:
+  template <typename D, typename R, typename... Os, size_t... Is>
+  Arguments OperationArgs(
+      const Operation<D, R, Os...> &x, std::index_sequence<Is...>) const {
+    return Arguments{Expr<SomeType>(x.template operand<Is>())...};
+  }
+};
+} // namespace operation
+
+std::string operation::ToString(operation::Operator op) {
+  switch (op) {
+  case Operator::Unknown:
+    return "??";
+  case Operator::Add:
+    return "+";
+  case Operator::And:
+    return "AND";
+  case Operator::Associated:
+    return "ASSOCIATED";
+  case Operator::Call:
+    return "function-call";
+  case Operator::Constant:
+    return "constant";
+  case Operator::Convert:
+    return "type-conversion";
+  case Operator::Div:
+    return "/";
+  case Operator::Eq:
+    return "==";
+  case Operator::Eqv:
+    return "EQV";
+  case Operator::False:
+    return ".FALSE.";
+  case Operator::Ge:
+    return ">=";
+  case Operator::Gt:
+    return ">";
+  case Operator::Identity:
+    return "identity";
+  case Operator::Intrinsic:
+    return "intrinsic";
+  case Operator::Le:
+    return "<=";
+  case Operator::Lt:
+    return "<";
+  case Operator::Max:
+    return "MAX";
+  case Operator::Min:
+    return "MIN";
+  case Operator::Mul:
+    return "*";
+  case Operator::Ne:
+    return "/=";
+  case Operator::Neqv:
+    return "NEQV/EOR";
+  case Operator::Not:
+    return "NOT";
+  case Operator::Or:
+    return "OR";
+  case Operator::Pow:
+    return "**";
+  case Operator::Resize:
+    return "resize";
+  case Operator::Sub:
+    return "-";
+  case Operator::True:
+    return ".TRUE.";
+  }
+  llvm_unreachable("Unhandler operator");
+}
+
+operation::Operator operation::OperationCode(const ProcedureDesignator &proc) {
+  Operator code{llvm::StringSwitch<Operator>(proc.GetName())
+          .Case("associated", Operator::Associated)
+          .Case("min", Operator::Min)
+          .Case("max", Operator::Max)
+          .Case("iand", Operator::And)
+          .Case("ior", Operator::Or)
+          .Case("ieor", Operator::Neqv)
+          .Default(Operator::Call)};
+  if (code == Operator::Call && proc.GetSpecificIntrinsic()) {
+    return Operator::Intrinsic;
+  }
+  return code;
+}
+
+std::pair<operation::Operator, std::vector<Expr<SomeType>>>
+GetTopLevelOperation(const Expr<SomeType> &expr) {
+  return operation::ArgumentExtractor<true>{}(expr);
+}
+
+namespace operation {
+struct ConvertCollector
+    : public Traverse<ConvertCollector,
+          std::pair<std::optional<Expr<SomeType>>, std::vector<DynamicType>>,
+          false> {
+  using Result =
+      std::pair<std::optional<Expr<SomeType>>, std::vector<DynamicType>>;
+  using Base = Traverse<ConvertCollector, Result, false>;
+  ConvertCollector() : Base(*this) {}
+
+  Result Default() const { return {}; }
+
+  using Base::operator();
+
+  template <typename T> Result operator()(const Designator<T> &x) const {
+    return {AsSomeExpr(x), {}};
+  }
+
+  template <typename T> Result operator()(const FunctionRef<T> &x) const {
+    return {AsSomeExpr(x), {}};
+  }
+
+  template <typename T> Result operator()(const Constant<T> &x) const {
+    return {AsSomeExpr(x), {}};
+  }
+
+  template <typename D, typename R, typename... Os>
+  Result operator()(const Operation<D, R, Os...> &x) const {
+    if constexpr (std::is_same_v<D, Parentheses<R>>) {
+      // Ignore parentheses.
+      return (*this)(x.template operand<0>());
+    } else if constexpr (is_convert_v<D>) {
+      // Convert should always have a typed result, so it should be safe to
+      // dereference x.GetType().
+      return Combine(
+          {std::nullopt, {*x.GetType()}}, (*this)(x.template operand<0>()));
+    } else if constexpr (is_complex_constructor_v<D>) {
+      // This is a conversion iff the imaginary operand is 0.
+      if (IsZero(x.template operand<1>())) {
+        return Combine(
+            {std::nullopt, {*x.GetType()}}, (*this)(x.template operand<0>()));
+      } else {
+        return {AsSomeExpr(x.derived()), {}};
+      }
+    } else {
+      return {AsSomeExpr(x.derived()), {}};
+    }
+  }
+
+  template <typename... Rs>
+  Result Combine(Result &&result, Rs &&...results) const {
+    Result v(std::move(result));
+    auto setValue{[](std::optional<Expr<SomeType>> &x,
+                      std::optional<Expr<SomeType>> &&y) {
+      assert((!x.has_value() || !y.has_value()) && "Multiple designators");
+      if (!x.has_value()) {
+        x = std::move(y);
+      }
+    }};
+    auto moveAppend{[](auto &accum, auto &&other) {
+      for (auto &&s : other) {
+        accum.push_back(std::move(s));
+      }
+    }};
+    (setValue(v.first, std::move(results).first), ...);
+    (moveAppend(v.second, std::move(results).second), ...);
+    return v;
+  }
+
+private:
+  template <typename A> static bool IsZero(const A &x) { return false; }
+  template <typename T> static bool IsZero(const Expr<T> &x) {
+    return common::visit([](auto &&s) { return IsZero(s); }, x.u);
+  }
+  template <typename T> static bool IsZero(const Constant<T> &x) {
+    if (auto &&maybeScalar{x.GetScalarValue()}) {
+      return maybeScalar->IsZero();
+    } else {
+      return false;
+    }
+  }
+
+  template <typename T> struct is_convert {
+    static constexpr bool value{false};
+  };
+  template <typename T, common::TypeCategory C>
+  struct is_convert<Convert<T, C>> {
+    static constexpr bool value{true};
+  };
+  template <int K> struct is_convert<ComplexComponent<K>> {
+    // Conversion from complex to real.
+    static constexpr bool value{true};
+  };
+  template <typename T>
+  static constexpr bool is_convert_v{is_convert<T>::value};
+
+  template <typename T> struct is_complex_constructor {
+    static constexpr bool value{false};
+  };
+  template <int K> struct is_complex_constructor<ComplexConstructor<K>> {
+    static constexpr bool value{true};
+  };
+  template <typename T>
+  static constexpr bool is_complex_constructor_v{
+      is_complex_constructor<T>::value};
+};
+} // namespace operation
+
+std::optional<Expr<SomeType>> GetConvertInput(const Expr<SomeType> &x) {
+  // This returns Expr<SomeType>{x} when x is a designator/functionref/constant.
+  return operation::ConvertCollector{}(x).first;
+}
+
+bool IsSameOrConvertOf(const Expr<SomeType> &expr, const Expr<SomeType> &x) {
+  // Check if expr is same as x, or a sequence of Convert operations on x.
+  if (expr == x) {
+    return true;
+  } else if (auto maybe{GetConvertInput(expr)}) {
+    return *maybe == x;
+  } else {
+    return false;
+  }
+}
 } // namespace Fortran::evaluate
 
 namespace Fortran::semantics {
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 69e9c53baa74..3ef3330cba2d 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -654,7 +654,7 @@ void genAtomicCapture(Fortran::lower::AbstractConverter &converter,
   mlir::Block &block = atomicCaptureOp->getRegion(0).back();
   firOpBuilder.setInsertionPointToStart(&block);
   if (Fortran::parser::CheckForSingleVariableOnRHS(stmt1)) {
-    if (Fortran::semantics::CheckForSymbolMatch(
+    if (Fortran::evaluate::CheckForSymbolMatch(
             Fortran::semantics::GetExpr(stmt2Var),
             Fortran::semantics::GetExpr(stmt2Expr))) {
       // Atomic capture construct is of the form [capture-stmt, update-stmt]
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 2595a08f626e..1c51fd60d570 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2934,11 +2934,12 @@ genAtomicUpdate(lower::AbstractConverter &converter,
   mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
 
   // This must exist by now.
-  SomeExpr input = *semantics::GetConvertInput(assign.rhs);
-  std::vector<SomeExpr> args{semantics::GetTopLevelOperation(input).second};
+  SomeExpr input = *Fortran::evaluate::GetConvertInput(assign.rhs);
+  std::vector<SomeExpr> args{
+      Fortran::evaluate::GetTopLevelOperation(input).second};
   assert(!args.empty() && "Update operation without arguments");
   for (auto &arg : args) {
-    if (!semantics::IsSameOrConvertOf(arg, atom)) {
+    if (!Fortran::evaluate::IsSameOrConvertOf(arg, atom)) {
       mlir::Value val = fir::getBase(converter.genExprValue(arg, naCtx, &loc));
       overrides.try_emplace(&arg, val);
     }
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 68cea6739830..3abb5a304b00 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -12,6 +12,7 @@
 #include "flang/Evaluate/check-expression.h"
 #include "flang/Evaluate/expression.h"
 #include "flang/Evaluate/shape.h"
+#include "flang/Evaluate/tools.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/expression.h"
@@ -2987,6 +2988,8 @@ static bool IsPointerAssignment(const evaluate::Assignment &x) {
       std::holds_alternative<evaluate::Assignment::BoundsRemapping>(x.u);
 }
 
+namespace operation = Fortran::evaluate::operation;
+
 static bool IsCheckForAssociated(const SomeExpr &cond) {
   return GetTopLevelOperation(cond).first == operation::Operator::Associated;
 }
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index bf520d04a50c..d053179448c0 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -17,7 +17,6 @@
 #include "flang/Semantics/tools.h"
 #include "flang/Semantics/type.h"
 #include "flang/Support/Fortran.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <set>
@@ -1789,332 +1788,4 @@ bool HadUseError(
   }
 }
 
-bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs) {
-  if (lhs && rhs) {
-    if (SymbolVector lhsSymbols{evaluate::GetSymbolVector(*lhs)};
-        !lhsSymbols.empty()) {
-      const Symbol &first{*lhsSymbols.front()};
-      for (const Symbol &symbol : evaluate::GetSymbolVector(*rhs)) {
-        if (first == symbol) {
-          return true;
-        }
-      }
-    }
-  }
-  return false;
-}
-
-namespace operation {
-template <typename T> //
-SomeExpr asSomeExpr(const T &x) {
-  auto copy{x};
-  return AsGenericExpr(std::move(copy));
-}
-
-template <bool IgnoreResizingConverts> //
-struct ArgumentExtractor
-    : public evaluate::Traverse<ArgumentExtractor<IgnoreResizingConverts>,
-          std::pair<operation::Operator, std::vector<SomeExpr>>, false> {
-  using Arguments = std::vector<SomeExpr>;
-  using Result = std::pair<operation::Operator, Arguments>;
-  using Base = evaluate::Traverse<ArgumentExtractor<IgnoreResizingConverts>,
-      Result, false>;
-  static constexpr auto IgnoreResizes = IgnoreResizingConverts;
-  static constexpr auto Logical = common::TypeCategory::Logical;
-  ArgumentExtractor() : Base(*this) {}
-
-  Result Default() const { return {}; }
-
-  using Base::operator();
-
-  template <int Kind> //
-  Result operator()(
-      const evaluate::Constant<evaluate::Type<Logical, Kind>> &x) const {
-    if (const auto &val{x.GetScalarValue()}) {
-      return val->IsTrue()
-          ? std::make_pair(operation::Operator::True, Arguments{})
-          : std::make_pair(operation::Operator::False, Arguments{});
-    }
-    return Default();
-  }
-
-  template <typename R> //
-  Result operator()(const evaluate::FunctionRef<R> &x) const {
-    Result result{operation::OperationCode(x.proc()), {}};
-    for (size_t i{0}, e{x.arguments().size()}; i != e; ++i) {
-      if (auto *e{x.UnwrapArgExpr(i)}) {
-        result.second.push_back(*e);
-      }
-    }
-    return result;
-  }
-
-  template <typename D, typename R, typename... Os>
-  Result operator()(const evaluate::Operation<D, R, Os...> &x) const {
-    if constexpr (std::is_same_v<D, evaluate::Parentheses<R>>) {
-      // Ignore top-level parentheses.
-      return (*this)(x.template operand<0>());
-    }
-    if constexpr (IgnoreResizes &&
-        std::is_same_v<D, evaluate::Convert<R, R::category>>) {
-      // Ignore conversions within the same category.
-      // Atomic operations on int(kind=1) may be implicitly widened
-      // to int(kind=4) for example.
-      return (*this)(x.template operand<0>());
-    } else {
-      return std::make_pair(operation::OperationCode(x),
-          OperationArgs(x, std::index_sequence_for<Os...>{}));
-    }
-  }
-
-  template <typename T> //
-  Result operator()(const evaluate::Designator<T> &x) const {
-    return {operation::Operator::Identity, {asSomeExpr(x)}};
-  }
-
-  template <typename T> //
-  Result operator()(const evaluate::Constant<T> &x) const {
-    return {operation::Operator::Identity, {asSomeExpr(x)}};
-  }
-
-  template <typename... Rs> //
-  Result Combine(Result &&result, Rs &&...results) const {
-    // There shouldn't be any combining needed, since we're stopping the
-    // traversal at the top-level operation, but implement one that picks
-    // the first non-empty result.
-    if constexpr (sizeof...(Rs) == 0) {
-      return std::move(result);
-    } else {
-      if (!result.second.empty()) {
-        return std::move(result);
-      } else {
-        return Combine(std::move(results)...);
-      }
-    }
-  }
-
-private:
-  template <typename D, typename R, typename... Os, size_t... Is>
-  Arguments OperationArgs(const evaluate::Operation<D, R, Os...> &x,
-      std::index_sequence<Is...>) const {
-    return Arguments{SomeExpr(x.template operand<Is>())...};
-  }
-};
-} // namespace operation
-
-std::string operation::ToString(operation::Operator op) {
-  switch (op) {
-  case Operator::Unknown:
-    return "??";
-  case Operator::Add:
-    return "+";
-  case Operator::And:
-    return "AND";
-  case Operator::Associated:
-    return "ASSOCIATED";
-  case Operator::Call:
-    return "function-call";
-  case Operator::Constant:
-    return "constant";
-  case Operator::Convert:
-    return "type-conversion";
-  case Operator::Div:
-    return "/";
-  case Operator::Eq:
-    return "==";
-  case Operator::Eqv:
-    return "EQV";
-  case Operator::False:
-    return ".FALSE.";
-  case Operator::Ge:
-    return ">=";
-  case Operator::Gt:
-    return ">";
-  case Operator::Identity:
-    return "identity";
-  case Operator::Intrinsic:
-    return "intrinsic";
-  case Operator::Le:
-    return "<=";
-  case Operator::Lt:
-    return "<";
-  case Operator::Max:
-    return "MAX";
-  case Operator::Min:
-    return "MIN";
-  case Operator::Mul:
-    return "*";
-  case Operator::Ne:
-    return "/=";
-  case Operator::Neqv:
-    return "NEQV/EOR";
-  case Operator::Not:
-    return "NOT";
-  case Operator::Or:
-    return "OR";
-  case Operator::Pow:
-    return "**";
-  case Operator::Resize:
-    return "resize";
-  case Operator::Sub:
-    return "-";
-  case Operator::True:
-    return ".TRUE.";
-  }
-  llvm_unreachable("Unhandler operator");
-}
-
-operation::Operator operation::OperationCode(
-    const evaluate::ProcedureDesignator &proc) {
-  Operator code = llvm::StringSwitch<Operator>(proc.GetName())
-                      .Case("associated", Operator::Associated)
-                      .Case("min", Operator::Min)
-                      .Case("max", Operator::Max)
-                      .Case("iand", Operator::And)
-                      .Case("ior", Operator::Or)
-                      .Case("ieor", Operator::Neqv)
-                      .Default(Operator::Call);
-  if (code == Operator::Call && proc.GetSpecificIntrinsic()) {
-    return Operator::Intrinsic;
-  }
-  return code;
-}
-
-std::pair<operation::Operator, std::vector<SomeExpr>> GetTopLevelOperation(
-    const SomeExpr &expr) {
-  return operation::ArgumentExtractor<true>{}(expr);
-}
-
-namespace operation {
-struct ConvertCollector
-    : public evaluate::Traverse<ConvertCollector,
-          std::pair<MaybeExpr, std::vector<evaluate::DynamicType>>, false> {
-  using Result = std::pair<MaybeExpr, std::vector<evaluate::DynamicType>>;
-  using Base = evaluate::Traverse<ConvertCollector, Result, false>;
-  ConvertCollector() : Base(*this) {}
-
-  Result Default() const { return {}; }
-
-  using Base::operator();
-
-  template <typename T> //
-  Result operator()(const evaluate::Designator<T> &x) const {
-    return {asSomeExpr(x), {}};
-  }
-
-  template <typename T> //
-  Result operator()(const evaluate::FunctionRef<T> &x) const {
-    return {asSomeExpr(x), {}};
-  }
-
-  template <typename T> //
-  Result operator()(const evaluate::Constant<T> &x) const {
-    return {asSomeExpr(x), {}};
-  }
-
-  template <typename D, typename R, typename... Os>
-  Result operator()(const evaluate::Operation<D, R, Os...> &x) const {
-    if constexpr (std::is_same_v<D, evaluate::Parentheses<R>>) {
-      // Ignore parentheses.
-      return (*this)(x.template operand<0>());
-    } else if constexpr (is_convert_v<D>) {
-      // Convert should always have a typed result, so it should be safe to
-      // dereference x.GetType().
-      return Combine(
-          {std::nullopt, {*x.GetType()}}, (*this)(x.template operand<0>()));
-    } else if constexpr (is_complex_constructor_v<D>) {
-      // This is a conversion iff the imaginary operand is 0.
-      if (IsZero(x.template operand<1>())) {
-        return Combine(
-            {std::nullopt, {*x.GetType()}}, (*this)(x.template operand<0>()));
-      } else {
-        return {asSomeExpr(x.derived()), {}};
-      }
-    } else {
-      return {asSomeExpr(x.derived()), {}};
-    }
-  }
-
-  template <typename... Rs> //
-  Result Combine(Result &&result, Rs &&...results) const {
-    Result v(std::move(result));
-    auto setValue{[](MaybeExpr &x, MaybeExpr &&y) {
-      assert((!x.has_value() || !y.has_value()) && "Multiple designators");
-      if (!x.has_value()) {
-        x = std::move(y);
-      }
-    }};
-    auto moveAppend{[](auto &accum, auto &&other) {
-      for (auto &&s : other) {
-        accum.push_back(std::move(s));
-      }
-    }};
-    (setValue(v.first, std::move(results).first), ...);
-    (moveAppend(v.second, std::move(results).second), ...);
-    return v;
-  }
-
-private:
-  template <typename T> //
-  static bool IsZero(const T &x) {
-    return false;
-  }
-  template <typename T> //
-  static bool IsZero(const evaluate::Expr<T> &x) {
-    return common::visit([](auto &&s) { return IsZero(s); }, x.u);
-  }
-  template <typename T> //
-  static bool IsZero(const evaluate::Constant<T> &x) {
-    if (auto &&maybeScalar{x.GetScalarValue()}) {
-      return maybeScalar->IsZero();
-    } else {
-      return false;
-    }
-  }
-
-  template <typename T> //
-  struct is_convert {
-    static constexpr bool value{false};
-  };
-  template <typename T, common::TypeCategory C> //
-  struct is_convert<evaluate::Convert<T, C>> {
-    static constexpr bool value{true};
-  };
-  template <int K> //
-  struct is_convert<evaluate::ComplexComponent<K>> {
-    // Conversion from complex to real.
-    static constexpr bool value{true};
-  };
-  template <typename T> //
-  static constexpr bool is_convert_v = is_convert<T>::value;
-
-  template <typename T> //
-  struct is_complex_constructor {
-    static constexpr bool value{false};
-  };
-  template <int K> //
-  struct is_complex_constructor<evaluate::ComplexConstructor<K>> {
-    static constexpr bool value{true};
-  };
-  template <typename T> //
-  static constexpr bool is_complex_constructor_v =
-      is_complex_constructor<T>::value;
-};
-} // namespace operation
-
-MaybeExpr GetConvertInput(const SomeExpr &x) {
-  // This returns SomeExpr(x) when x is a designator/functionref/constant.
-  return operation::ConvertCollector{}(x).first;
-}
-
-bool IsSameOrConvertOf(const SomeExpr &expr, const SomeExpr &x) {
-  // Check if expr is same as x, or a sequence of Convert operations on x.
-  if (expr == x) {
-    return true;
-  } else if (auto maybe{GetConvertInput(expr)}) {
-    return *maybe == x;
-  } else {
-    return false;
-  }
-}
 } // namespace Fortran::semantics
\ No newline at end of file

From 53336ad488c953d41e744d42873d712276be980f Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Thu, 19 Jun 2025 22:02:03 +0100
Subject: [PATCH 0977/1322] [Offload] Move (most) global state to an
 `OffloadContext` struct (#144494)

Rather than having a number of static local variables, we now use
a single `OffloadContext` struct to store global state. This is
initialised by `olInit`, but is never deleted (de-initialization of
Offload isn't yet implemented).

The error reporting mechanism has not been moved to the struct, since
that's going to cause issues with teardown (error messages must outlive
liboffload).
---
 offload/liboffload/include/OffloadImpl.hpp    | 12 +--
 offload/liboffload/src/OffloadImpl.cpp        | 82 ++++++++++++-------
 offload/liboffload/src/OffloadLib.cpp         |  5 --
 .../tools/offload-tblgen/EntryPointGen.cpp    | 37 +++++----
 4 files changed, 80 insertions(+), 56 deletions(-)

diff --git a/offload/liboffload/include/OffloadImpl.hpp b/offload/liboffload/include/OffloadImpl.hpp
index 9b0a21cb9ae1..a12d8c47a180 100644
--- a/offload/liboffload/include/OffloadImpl.hpp
+++ b/offload/liboffload/include/OffloadImpl.hpp
@@ -22,12 +22,12 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Error.h"
 
-struct OffloadConfig {
-  bool TracingEnabled = false;
-  bool ValidationEnabled = true;
-};
-
-OffloadConfig &offloadConfig();
+namespace llvm {
+namespace offload {
+bool isTracingEnabled();
+bool isValidationEnabled();
+} // namespace offload
+} // namespace llvm
 
 // Use the StringSet container to efficiently deduplicate repeated error
 // strings (e.g. if the same error is hit constantly in a long running program)
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 770c212d804d..f02497c0a633 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -93,22 +93,36 @@ struct AllocInfo {
   ol_alloc_type_t Type;
 };
 
-using AllocInfoMapT = DenseMap<void *, AllocInfo>;
-AllocInfoMapT &allocInfoMap() {
-  static AllocInfoMapT AllocInfoMap{};
-  return AllocInfoMap;
-}
+// Global shared state for liboffload
+struct OffloadContext;
+static OffloadContext *OffloadContextVal;
+struct OffloadContext {
+  OffloadContext(OffloadContext &) = delete;
+  OffloadContext(OffloadContext &&) = delete;
+  OffloadContext &operator=(OffloadContext &) = delete;
+  OffloadContext &operator=(OffloadContext &&) = delete;
 
-using PlatformVecT = SmallVector<ol_platform_impl_t, 4>;
-PlatformVecT &Platforms() {
-  static PlatformVecT Platforms;
-  return Platforms;
-}
+  bool TracingEnabled = false;
+  bool ValidationEnabled = true;
+  DenseMap<void *, AllocInfo> AllocInfoMap{};
+  SmallVector<ol_platform_impl_t, 4> Platforms{};
 
-ol_device_handle_t HostDevice() {
-  // The host platform is always inserted last
-  return &Platforms().back().Devices[0];
+  ol_device_handle_t HostDevice() {
+    // The host platform is always inserted last
+    return &Platforms.back().Devices[0];
+  }
+
+  static OffloadContext &get() {
+    assert(OffloadContextVal);
+    return *OffloadContextVal;
+  }
+};
+
+// If the context is uninited, then we assume tracing is disabled
+bool isTracingEnabled() {
+  return OffloadContextVal && OffloadContext::get().TracingEnabled;
 }
+bool isValidationEnabled() { return OffloadContext::get().ValidationEnabled; }
 
 template <typename HandleT> Error olDestroy(HandleT Handle) {
   delete Handle;
@@ -130,10 +144,12 @@ constexpr ol_platform_backend_t pluginNameToBackend(StringRef Name) {
 #include "Shared/Targets.def"
 
 void initPlugins() {
+  auto *Context = new OffloadContext{};
+
   // Attempt to create an instance of each supported plugin.
 #define PLUGIN_TARGET(Name)                                                    \
   do {                                                                         \
-    Platforms().emplace_back(ol_platform_impl_t{                               \
+    Context->Platforms.emplace_back(ol_platform_impl_t{                        \
         std::unique_ptr<GenericPluginTy>(createPlugin_##Name()),               \
         {},                                                                    \
         pluginNameToBackend(#Name)});                                          \
@@ -141,7 +157,7 @@ void initPlugins() {
 #include "Shared/Targets.def"
 
   // Preemptively initialize all devices in the plugin
-  for (auto &Platform : Platforms()) {
+  for (auto &Platform : Context->Platforms) {
     // Do not use the host plugin - it isn't supported.
     if (Platform.BackendType == OL_PLATFORM_BACKEND_UNKNOWN)
       continue;
@@ -157,15 +173,16 @@ void initPlugins() {
   }
 
   // Add the special host device
-  auto &HostPlatform = Platforms().emplace_back(
+  auto &HostPlatform = Context->Platforms.emplace_back(
       ol_platform_impl_t{nullptr,
                          {ol_device_impl_t{-1, nullptr, nullptr}},
                          OL_PLATFORM_BACKEND_HOST});
-  HostDevice()->Platform = &HostPlatform;
+  Context->HostDevice()->Platform = &HostPlatform;
 
-  offloadConfig().TracingEnabled = std::getenv("OFFLOAD_TRACE");
-  offloadConfig().ValidationEnabled =
-      !std::getenv("OFFLOAD_DISABLE_VALIDATION");
+  Context->TracingEnabled = std::getenv("OFFLOAD_TRACE");
+  Context->ValidationEnabled = !std::getenv("OFFLOAD_DISABLE_VALIDATION");
+
+  OffloadContextVal = Context;
 }
 
 // TODO: We can properly reference count here and manage the resources in a more
@@ -229,7 +246,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
 
   // Find the info if it exists under any of the given names
   auto GetInfo = [&](std::vector<std::string> Names) {
-    if (Device == HostDevice())
+    if (Device == OffloadContext::get().HostDevice())
       return std::string("Host");
 
     if (!Device->Device)
@@ -251,8 +268,9 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
   case OL_DEVICE_INFO_PLATFORM:
     return ReturnValue(Device->Platform);
   case OL_DEVICE_INFO_TYPE:
-    return Device == HostDevice() ? ReturnValue(OL_DEVICE_TYPE_HOST)
-                                  : ReturnValue(OL_DEVICE_TYPE_GPU);
+    return Device == OffloadContext::get().HostDevice()
+               ? ReturnValue(OL_DEVICE_TYPE_HOST)
+               : ReturnValue(OL_DEVICE_TYPE_GPU);
   case OL_DEVICE_INFO_NAME:
     return ReturnValue(GetInfo({"Device Name"}).c_str());
   case OL_DEVICE_INFO_VENDOR:
@@ -280,7 +298,7 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device,
 }
 
 Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) {
-  for (auto &Platform : Platforms()) {
+  for (auto &Platform : OffloadContext::get().Platforms) {
     for (auto &Device : Platform.Devices) {
       if (!Callback(&Device, UserData)) {
         break;
@@ -311,16 +329,17 @@ Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
     return Alloc.takeError();
 
   *AllocationOut = *Alloc;
-  allocInfoMap().insert_or_assign(*Alloc, AllocInfo{Device, Type});
+  OffloadContext::get().AllocInfoMap.insert_or_assign(*Alloc,
+                                                      AllocInfo{Device, Type});
   return Error::success();
 }
 
 Error olMemFree_impl(void *Address) {
-  if (!allocInfoMap().contains(Address))
+  if (!OffloadContext::get().AllocInfoMap.contains(Address))
     return createOffloadError(ErrorCode::INVALID_ARGUMENT,
                               "address is not a known allocation");
 
-  auto AllocInfo = allocInfoMap().at(Address);
+  auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address);
   auto Device = AllocInfo.Device;
   auto Type = AllocInfo.Type;
 
@@ -328,7 +347,7 @@ Error olMemFree_impl(void *Address) {
           Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type)))
     return Res;
 
-  allocInfoMap().erase(Address);
+  OffloadContext::get().AllocInfoMap.erase(Address);
 
   return Error::success();
 }
@@ -395,7 +414,8 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
                     ol_device_handle_t DstDevice, const void *SrcPtr,
                     ol_device_handle_t SrcDevice, size_t Size,
                     ol_event_handle_t *EventOut) {
-  if (DstDevice == HostDevice() && SrcDevice == HostDevice()) {
+  auto Host = OffloadContext::get().HostDevice();
+  if (DstDevice == Host && SrcDevice == Host) {
     if (!Queue) {
       std::memcpy(DstPtr, SrcPtr, Size);
       return Error::success();
@@ -410,11 +430,11 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
   // If no queue is given the memcpy will be synchronous
   auto QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
 
-  if (DstDevice == HostDevice()) {
+  if (DstDevice == Host) {
     if (auto Res =
             SrcDevice->Device->dataRetrieve(DstPtr, SrcPtr, Size, QueueImpl))
       return Res;
-  } else if (SrcDevice == HostDevice()) {
+  } else if (SrcDevice == Host) {
     if (auto Res =
             DstDevice->Device->dataSubmit(DstPtr, SrcPtr, Size, QueueImpl))
       return Res;
diff --git a/offload/liboffload/src/OffloadLib.cpp b/offload/liboffload/src/OffloadLib.cpp
index 8662d3a44124..0a65815e5969 100644
--- a/offload/liboffload/src/OffloadLib.cpp
+++ b/offload/liboffload/src/OffloadLib.cpp
@@ -30,11 +30,6 @@ ol_code_location_t *&currentCodeLocation() {
   return CodeLoc;
 }
 
-OffloadConfig &offloadConfig() {
-  static OffloadConfig Config{};
-  return Config;
-}
-
 namespace llvm {
 namespace offload {
 // Pull in the declarations for the implementation functions. The actual entry
diff --git a/offload/tools/offload-tblgen/EntryPointGen.cpp b/offload/tools/offload-tblgen/EntryPointGen.cpp
index 85c5c50bf2f2..13aa0d1f6318 100644
--- a/offload/tools/offload-tblgen/EntryPointGen.cpp
+++ b/offload/tools/offload-tblgen/EntryPointGen.cpp
@@ -35,21 +35,30 @@ static void EmitValidationFunc(const FunctionRec &F, raw_ostream &OS) {
   }
   OS << ") {\n";
 
-  OS << TAB_1 "if (offloadConfig().ValidationEnabled) {\n";
-  // Emit validation checks
-  for (const auto &Return : F.getReturns()) {
-    for (auto &Condition : Return.getConditions()) {
-      if (Condition.starts_with("`") && Condition.ends_with("`")) {
-        auto ConditionString = Condition.substr(1, Condition.size() - 2);
-        OS << formatv(TAB_2 "if ({0}) {{\n", ConditionString);
-        OS << formatv(TAB_3 "return createOffloadError(error::ErrorCode::{0}, "
-                            "\"validation failure: {1}\");\n",
-                      Return.getUnprefixedValue(), ConditionString);
-        OS << TAB_2 "}\n\n";
+  bool HasValidation = llvm::any_of(F.getReturns(), [](auto &R) {
+    return llvm::any_of(R.getConditions(), [](auto &C) {
+      return C.starts_with("`") && C.ends_with("`");
+    });
+  });
+
+  if (HasValidation) {
+    OS << TAB_1 "if (llvm::offload::isValidationEnabled()) {\n";
+    // Emit validation checks
+    for (const auto &Return : F.getReturns()) {
+      for (auto &Condition : Return.getConditions()) {
+        if (Condition.starts_with("`") && Condition.ends_with("`")) {
+          auto ConditionString = Condition.substr(1, Condition.size() - 2);
+          OS << formatv(TAB_2 "if ({0}) {{\n", ConditionString);
+          OS << formatv(TAB_3
+                        "return createOffloadError(error::ErrorCode::{0}, "
+                        "\"validation failure: {1}\");\n",
+                        Return.getUnprefixedValue(), ConditionString);
+          OS << TAB_2 "}\n\n";
+        }
       }
     }
+    OS << TAB_1 "}\n\n";
   }
-  OS << TAB_1 "}\n\n";
 
   // Perform actual function call to the implementation
   ParamNameList = ParamNameList.substr(0, ParamNameList.size() - 2);
@@ -74,7 +83,7 @@ static void EmitEntryPointFunc(const FunctionRec &F, raw_ostream &OS) {
   OS << ") {\n";
 
   // Emit pre-call prints
-  OS << TAB_1 "if (offloadConfig().TracingEnabled) {\n";
+  OS << TAB_1 "if (llvm::offload::isTracingEnabled()) {\n";
   OS << formatv(TAB_2 "llvm::errs() << \"---> {0}\";\n", F.getName());
   OS << TAB_1 "}\n\n";
 
@@ -85,7 +94,7 @@ static void EmitEntryPointFunc(const FunctionRec &F, raw_ostream &OS) {
       PrefixLower, F.getName(), ParamNameList);
 
   // Emit post-call prints
-  OS << TAB_1 "if (offloadConfig().TracingEnabled) {\n";
+  OS << TAB_1 "if (llvm::offload::isTracingEnabled()) {\n";
   if (F.getParams().size() > 0) {
     OS << formatv(TAB_2 "{0} Params = {{", F.getParamStructName());
     for (const auto &Param : F.getParams()) {

From 2f3a8fd0b3322baac25e5595313413ed4cd1158f Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Thu, 19 Jun 2025 17:06:52 -0400
Subject: [PATCH 0978/1322] [libc] Skip x87 floating point register and only
 update mxcsr for x86_64 targets when raising exceptions inside math
 functions. (#144951)

Updating x87 floating point register significantly affect the
performance of the functions.
All the floating point exception reads will merge the results from both
mxcsr and x87 registers anyway.
---
 libc/src/__support/FPUtil/FEnvImpl.h        |  5 +++++
 libc/src/__support/FPUtil/x86_64/FEnvImpl.h | 13 ++++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index ba145a3da45c..76910880eb81 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -91,7 +91,12 @@ LIBC_INLINE static int set_except_if_required([[maybe_unused]] int excepts) {
 LIBC_INLINE static int raise_except_if_required([[maybe_unused]] int excepts) {
 #ifndef LIBC_MATH_HAS_NO_EXCEPT
   if (math_errhandling & MATH_ERREXCEPT)
+#ifdef LIBC_TARGET_ARCH_IS_X86_64
+    return raise_except</*SKIP_X87_FPU*/ true>(excepts);
+#else  // !LIBC_TARGET_ARCH_IS_X86
     return raise_except(excepts);
+#endif // LIBC_TARGET_ARCH_IS_X86
+
 #endif // LIBC_MATH_HAS_NO_EXCEPT
   return 0;
 }
diff --git a/libc/src/__support/FPUtil/x86_64/FEnvImpl.h b/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
index b77178ea69ea..560727c22978 100644
--- a/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
@@ -239,7 +239,7 @@ LIBC_INLINE int set_except(int excepts) {
   return 0;
 }
 
-LIBC_INLINE int raise_except(int excepts) {
+template <bool SKIP_X87_FPU = false> LIBC_INLINE int raise_except(int excepts) {
   uint16_t status_value = internal::get_status_value_for_except(excepts);
 
   // We set the status flag for exception one at a time and call the
@@ -256,13 +256,16 @@ LIBC_INLINE int raise_except(int excepts) {
   // when raising the next exception.
 
   auto raise_helper = [](uint16_t singleExceptFlag) {
-    internal::X87StateDescriptor state;
+    if constexpr (!SKIP_X87_FPU) {
+      internal::X87StateDescriptor state;
+      internal::get_x87_state_descriptor(state);
+      state.status_word |= singleExceptFlag;
+      internal::write_x87_state_descriptor(state);
+    }
+
     uint32_t mxcsr = 0;
-    internal::get_x87_state_descriptor(state);
     mxcsr = internal::get_mxcsr();
-    state.status_word |= singleExceptFlag;
     mxcsr |= singleExceptFlag;
-    internal::write_x87_state_descriptor(state);
     internal::write_mxcsr(mxcsr);
     internal::fwait();
   };

From 53ea522d1b87c144a1faeffea62d50a4d9907a38 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 19 Jun 2025 14:12:14 -0700
Subject: [PATCH 0979/1322] [LV] Introduce and use
 VPBuilder::createScalarZExtOrTrunc [nfc] (#144946)

Reduce redundant code, make the flow slightly easier to read.
---
 .../Vectorize/LoopVectorizationPlanner.h      | 11 +++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 40 +++++++------------
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 70f541d64b30..d17c64a778e8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -278,6 +278,17 @@ public:
         new VPInstructionWithType(Opcode, Op, ResultTy, {}, DL));
   }
 
+  VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy,
+                                   DebugLoc DL) {
+    if (ResultTy == SrcTy)
+      return Op;
+    Instruction::CastOps CastOp =
+        ResultTy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits()
+            ? Instruction::Trunc
+            : Instruction::ZExt;
+    return createScalarCast(CastOp, Op, ResultTy, DL);
+  }
+
   VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op,
                                      Type *ResultTy) {
     return tryInsertInstruction(new VPWidenCastRecipe(Opcode, Op, ResultTy));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cc73ae44f9c0..60a8837fb76a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -797,15 +797,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
   VPValue *FirstActiveLane =
       B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
   Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
-  if (CanonicalIVType != FirstActiveLaneType) {
-    Instruction::CastOps CastOp =
-        CanonicalIVType->getScalarSizeInBits() <
-                FirstActiveLaneType->getScalarSizeInBits()
-            ? Instruction::Trunc
-            : Instruction::ZExt;
-    FirstActiveLane =
-        B.createScalarCast(CastOp, FirstActiveLane, CanonicalIVType, DL);
-  }
+  FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
+                                              FirstActiveLaneType, DL);
   EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
 
   // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
@@ -2182,13 +2175,10 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
     VPValue *MaxEVL = &Plan.getVF();
     // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
     VPBuilder Builder(LoopRegion->getPreheaderVPBB());
-    if (unsigned VFSize =
-            TypeInfo.inferScalarType(MaxEVL)->getScalarSizeInBits();
-        VFSize != 32) {
-      MaxEVL = Builder.createScalarCast(
-          VFSize > 32 ? Instruction::Trunc : Instruction::ZExt, MaxEVL,
-          Type::getInt32Ty(Ctx), DebugLoc());
-    }
+    MaxEVL = Builder.createScalarZExtOrTrunc(MaxEVL, Type::getInt32Ty(Ctx),
+                                             TypeInfo.inferScalarType(MaxEVL),
+                                             DebugLoc());
+
     Builder.setInsertPoint(Header, Header->getFirstNonPhi());
     PrevEVL = Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
   }
@@ -2286,6 +2276,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
     return false;
 
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
+  auto *CanIVTy = CanonicalIVPHI->getScalarType();
   VPValue *StartV = CanonicalIVPHI->getStartValue();
 
   // Create the ExplicitVectorLengthPhi recipe in the main loop.
@@ -2297,8 +2288,8 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
       Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
   if (MaxSafeElements) {
     // Support for MaxSafeDist for correct loop emission.
-    VPValue *AVLSafe = Plan.getOrAddLiveIn(
-        ConstantInt::get(CanonicalIVPHI->getScalarType(), *MaxSafeElements));
+    VPValue *AVLSafe =
+        Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
     VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
     AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
   }
@@ -2308,13 +2299,12 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
   auto *CanonicalIVIncrement =
       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
   Builder.setInsertPoint(CanonicalIVIncrement);
-  VPSingleDefRecipe *OpVPEVL = VPEVL;
-  if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
-      IVSize != 32) {
-    OpVPEVL = Builder.createScalarCast(
-        IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL,
-        CanonicalIVPHI->getScalarType(), CanonicalIVIncrement->getDebugLoc());
-  }
+  VPValue *OpVPEVL = VPEVL;
+
+  auto *I32Ty = Type::getInt32Ty(CanIVTy->getContext());
+  OpVPEVL = Builder.createScalarZExtOrTrunc(
+      OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
+
   auto *NextEVLIV = Builder.createOverflowingOp(
       Instruction::Add, {OpVPEVL, EVLPhi},
       {CanonicalIVIncrement->hasNoUnsignedWrap(),

From d3a2931d8af87c20aaede991acda0b5f313075c3 Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Thu, 19 Jun 2025 17:51:01 -0400
Subject: [PATCH 0980/1322] [libc] Use `raise_except_if_required` for log2f.
 (#144961)

---
 libc/src/math/generic/log2f.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp
index b25ec41f277b..cff718eec216 100644
--- a/libc/src/math/generic/log2f.cpp
+++ b/libc/src/math/generic/log2f.cpp
@@ -79,7 +79,7 @@ LLVM_LIBC_FUNCTION(float, log2f, (float x)) {
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
-      fputil::raise_except(FE_INVALID);
+      fputil::raise_except_if_required(FE_INVALID);
       return FPBits::quiet_nan().get_val();
     }
     if (xbits.is_inf_or_nan()) {

From a9d175f1735a508ac05ab48d83a99071ba97c10e Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 19 Jun 2025 14:57:58 -0700
Subject: [PATCH 0981/1322] [CodeGen] Use range-based for loops (NFC) (#144939)

---
 clang/lib/CodeGen/CGBlocks.cpp       |  6 +++---
 clang/lib/CodeGen/CGCleanup.cpp      |  4 ++--
 clang/lib/CodeGen/CGDeclCXX.cpp      |  6 +++---
 clang/lib/CodeGen/CGException.cpp    | 10 +++++-----
 clang/lib/CodeGen/CGExpr.cpp         | 12 ++++++------
 clang/lib/CodeGen/CGExprConstant.cpp |  4 +---
 clang/lib/CodeGen/CGObjC.cpp         |  4 ++--
 clang/lib/CodeGen/CGObjCGNU.cpp      | 19 ++++++++-----------
 clang/lib/CodeGen/CGObjCMac.cpp      | 20 ++++++++------------
 clang/lib/CodeGen/CGObjCRuntime.cpp  |  4 +---
 10 files changed, 39 insertions(+), 50 deletions(-)

diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index 729758ddce56..f3ddf7bf9a46 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -1415,10 +1415,10 @@ llvm::Function *CodeGenFunction::GenerateBlockFunction(
   // Arrange for local static and local extern declarations to appear
   // to be local to this function as well, in case they're directly
   // referenced in a block.
-  for (DeclMapTy::const_iterator i = ldm.begin(), e = ldm.end(); i != e; ++i) {
-    const auto *var = dyn_cast<VarDecl>(i->first);
+  for (const auto &KV : ldm) {
+    const auto *var = dyn_cast<VarDecl>(KV.first);
     if (var && !var->hasLocalStorage())
-      setAddrOfLocalVar(var, i->second);
+      setAddrOfLocalVar(var, KV.second);
   }
 
   // Begin building the function declaration.
diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp
index 4ed2c5183c47..28ac9bf39635 100644
--- a/clang/lib/CodeGen/CGCleanup.cpp
+++ b/clang/lib/CodeGen/CGCleanup.cpp
@@ -962,8 +962,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough,
 
       // Append the prepared cleanup prologue from above.
       llvm::BasicBlock *NormalExit = Builder.GetInsertBlock();
-      for (unsigned I = 0, E = InstsToAppend.size(); I != E; ++I)
-        InstsToAppend[I]->insertInto(NormalExit, NormalExit->end());
+      for (llvm::Instruction *Inst : InstsToAppend)
+        Inst->insertInto(NormalExit, NormalExit->end());
 
       // Optimistically hope that any fixups will continue falling through.
       for (unsigned I = FixupDepth, E = EHStack.getNumBranchFixups();
diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp
index 69d77f283db3..7ae99935c8ad 100644
--- a/clang/lib/CodeGen/CGDeclCXX.cpp
+++ b/clang/lib/CodeGen/CGDeclCXX.cpp
@@ -1121,9 +1121,9 @@ CodeGenFunction::GenerateCXXGlobalInitFunc(llvm::Function *Fn,
       EmitObjCAutoreleasePoolCleanup(token);
     }
 
-    for (unsigned i = 0, e = Decls.size(); i != e; ++i)
-      if (Decls[i])
-        EmitRuntimeCall(Decls[i]);
+    for (llvm::Function *Decl : Decls)
+      if (Decl)
+        EmitRuntimeCall(Decl);
 
     Scope.ForceCleanup();
 
diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp
index e0367282355c..ad138b9876e8 100644
--- a/clang/lib/CodeGen/CGException.cpp
+++ b/clang/lib/CodeGen/CGException.cpp
@@ -319,9 +319,9 @@ static bool PersonalityHasOnlyCXXUses(llvm::Constant *Fn) {
     llvm::Function *F = dyn_cast<llvm::Function>(U);
     if (!F) return false;
 
-    for (auto BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      if (BB->isLandingPad())
-        if (!LandingPadHasOnlyCXXUses(BB->getLandingPadInst()))
+    for (llvm::BasicBlock &BB : *F) {
+      if (BB.isLandingPad())
+        if (!LandingPadHasOnlyCXXUses(BB.getLandingPadInst()))
           return false;
     }
   }
@@ -937,8 +937,8 @@ llvm::BasicBlock *CodeGenFunction::EmitLandingPad() {
                              filterTypes[0]->getType() : Int8PtrTy,
                            filterTypes.size());
 
-    for (unsigned i = 0, e = filterTypes.size(); i != e; ++i)
-      Filters.push_back(cast<llvm::Constant>(filterTypes[i]));
+    for (llvm::Value *filterType : filterTypes)
+      Filters.push_back(cast<llvm::Constant>(filterType));
     llvm::Constant *FilterArray = llvm::ConstantArray::get(AType, Filters);
     LPadInst->addClause(FilterArray);
 
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 6cb348ffdf55..85c768807572 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -3801,8 +3801,8 @@ void CodeGenFunction::EmitCheck(
       ArgTypes.push_back(Args.back()->getType());
     }
 
-    for (size_t i = 0, n = DynamicArgs.size(); i != n; ++i) {
-      Args.push_back(EmitCheckValue(DynamicArgs[i]));
+    for (llvm::Value *DynamicArg : DynamicArgs) {
+      Args.push_back(EmitCheckValue(DynamicArg));
       ArgTypes.push_back(IntPtrTy);
     }
   }
@@ -4932,8 +4932,8 @@ EmitExtVectorElementExpr(const ExtVectorElementExpr *E) {
   llvm::Constant *BaseElts = Base.getExtVectorElts();
   SmallVector<llvm::Constant *, 4> CElts;
 
-  for (unsigned i = 0, e = Indices.size(); i != e; ++i)
-    CElts.push_back(BaseElts->getAggregateElement(Indices[i]));
+  for (unsigned Index : Indices)
+    CElts.push_back(BaseElts->getAggregateElement(Index));
   llvm::Constant *CV = llvm::ConstantVector::get(CElts);
   return LValue::MakeExtVectorElt(Base.getExtVectorAddress(), CV, type,
                                   Base.getBaseInfo(), TBAAAccessInfo());
@@ -6660,8 +6660,8 @@ static LValueOrRValue emitPseudoObjectExpr(CodeGenFunction &CGF,
   }
 
   // Unbind all the opaques now.
-  for (unsigned i = 0, e = opaques.size(); i != e; ++i)
-    opaques[i].unbind(CGF);
+  for (CodeGenFunction::OpaqueValueMappingData &opaque : opaques)
+    opaque.unbind(CGF);
 
   return result;
 }
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index 069cc09cd91d..715bd392f59f 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -871,9 +871,7 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD,
     }
     llvm::stable_sort(Bases);
 
-    for (unsigned I = 0, N = Bases.size(); I != N; ++I) {
-      BaseInfo &Base = Bases[I];
-
+    for (const BaseInfo &Base : Bases) {
       bool IsPrimaryBase = Layout.getPrimaryBase() == Base.Decl;
       Build(Val.getStructBase(Base.Index), Base.Decl, IsPrimaryBase,
             VTableClass, Offset + Base.Offset);
diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp
index 8a1b44a0c157..6f87444d3f67 100644
--- a/clang/lib/CodeGen/CGObjC.cpp
+++ b/clang/lib/CodeGen/CGObjC.cpp
@@ -3173,8 +3173,8 @@ ARCExprEmitter<Impl,Result>::visitPseudoObjectExpr(const PseudoObjectExpr *E) {
   }
 
   // Unbind all the opaques now.
-  for (unsigned i = 0, e = opaques.size(); i != e; ++i)
-    opaques[i].unbind(CGF);
+  for (CodeGenFunction::OpaqueValueMappingData &opaque : opaques)
+    opaque.unbind(CGF);
 
   return result;
 }
diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp
index 3fc837c12a92..d828702cbb87 100644
--- a/clang/lib/CodeGen/CGObjCGNU.cpp
+++ b/clang/lib/CodeGen/CGObjCGNU.cpp
@@ -1103,8 +1103,7 @@ class CGObjCGNUstep2 : public CGObjCGNUstep {
     bool isNamed = !isNonASCII;
     if (isNamed) {
       StringName = ".objc_str_";
-      for (int i=0,e=Str.size() ; i<e ; ++i) {
-        unsigned char c = Str[i];
+      for (unsigned char c : Str) {
         if (isalnum(c))
           StringName += c;
         else if (c == ' ')
@@ -2560,10 +2559,9 @@ llvm::Value *CGObjCGNU::GetTypedSelector(CodeGenFunction &CGF, Selector Sel,
   SmallVectorImpl<TypedSelector> &Types = SelectorTable[Sel];
   llvm::GlobalAlias *SelValue = nullptr;
 
-  for (SmallVectorImpl<TypedSelector>::iterator i = Types.begin(),
-      e = Types.end() ; i!=e ; i++) {
-    if (i->first == TypeEncoding) {
-      SelValue = i->second;
+  for (const TypedSelector &Type : Types) {
+    if (Type.first == TypeEncoding) {
+      SelValue = Type.second;
       break;
     }
   }
@@ -3333,13 +3331,12 @@ CGObjCGNU::GenerateProtocolList(ArrayRef<std::string> Protocols) {
   ProtocolList.addInt(LongTy, Protocols.size());
 
   auto Elements = ProtocolList.beginArray(PtrToInt8Ty);
-  for (const std::string *iter = Protocols.begin(), *endIter = Protocols.end();
-      iter != endIter ; iter++) {
+  for (const std::string &Protocol : Protocols) {
     llvm::Constant *protocol = nullptr;
-    llvm::StringMap<llvm::Constant*>::iterator value =
-      ExistingProtocols.find(*iter);
+    llvm::StringMap<llvm::Constant *>::iterator value =
+        ExistingProtocols.find(Protocol);
     if (value == ExistingProtocols.end()) {
-      protocol = GenerateEmptyProtocol(*iter);
+      protocol = GenerateEmptyProtocol(Protocol);
     } else {
       protocol = value->getValue();
     }
diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp
index 1c23a8b4db91..a52c92cdbc83 100644
--- a/clang/lib/CodeGen/CGObjCMac.cpp
+++ b/clang/lib/CodeGen/CGObjCMac.cpp
@@ -2702,8 +2702,8 @@ llvm::Constant *CGObjCCommonMac::getBitmapBlockLayout(bool ComputeByrefLayout) {
   unsigned char inst = (BLOCK_LAYOUT_OPERATOR << 4) | 0;
   Layout.push_back(inst);
   std::string BitMap;
-  for (unsigned i = 0, e = Layout.size(); i != e; i++)
-    BitMap += Layout[i];
+  for (unsigned char C : Layout)
+    BitMap += C;
 
   if (CGM.getLangOpts().ObjCGCBitmapPrint) {
     if (ComputeByrefLayout)
@@ -4225,9 +4225,8 @@ FragileHazards::FragileHazards(CodeGenFunction &CGF) : CGF(CGF) {
     return;
 
   // Collect all the blocks in the function.
-  for (llvm::Function::iterator I = CGF.CurFn->begin(), E = CGF.CurFn->end();
-       I != E; ++I)
-    BlocksBeforeTry.insert(&*I);
+  for (llvm::BasicBlock &BB : *CGF.CurFn)
+    BlocksBeforeTry.insert(&BB);
 
   llvm::FunctionType *AsmFnTy = GetAsmFnType();
 
@@ -4299,9 +4298,7 @@ void FragileHazards::emitHazardsInNewBlocks() {
   CGBuilderTy Builder(CGF, CGF.getLLVMContext());
 
   // Iterate through all blocks, skipping those prior to the try.
-  for (llvm::Function::iterator FI = CGF.CurFn->begin(), FE = CGF.CurFn->end();
-       FI != FE; ++FI) {
-    llvm::BasicBlock &BB = *FI;
+  for (llvm::BasicBlock &BB : *CGF.CurFn) {
     if (BlocksBeforeTry.count(&BB))
       continue;
 
@@ -4348,10 +4345,9 @@ void FragileHazards::collectLocals() {
   // Collect all the allocas currently in the function.  This is
   // probably way too aggressive.
   llvm::BasicBlock &Entry = CGF.CurFn->getEntryBlock();
-  for (llvm::BasicBlock::iterator I = Entry.begin(), E = Entry.end(); I != E;
-       ++I)
-    if (isa<llvm::AllocaInst>(*I) && !AllocasToIgnore.count(&*I))
-      Locals.push_back(&*I);
+  for (llvm::Instruction &I : Entry)
+    if (isa<llvm::AllocaInst>(I) && !AllocasToIgnore.count(&I))
+      Locals.push_back(&I);
 }
 
 llvm::FunctionType *FragileHazards::GetAsmFnType() {
diff --git a/clang/lib/CodeGen/CGObjCRuntime.cpp b/clang/lib/CodeGen/CGObjCRuntime.cpp
index dfb0fd14d93a..6e2f32022a01 100644
--- a/clang/lib/CodeGen/CGObjCRuntime.cpp
+++ b/clang/lib/CodeGen/CGObjCRuntime.cpp
@@ -220,9 +220,7 @@ void CGObjCRuntime::EmitTryCatchStmt(CodeGenFunction &CGF,
   CGBuilderTy::InsertPoint SavedIP = CGF.Builder.saveAndClearIP();
 
   // Emit the handlers.
-  for (unsigned I = 0, E = Handlers.size(); I != E; ++I) {
-    CatchHandler &Handler = Handlers[I];
-
+  for (CatchHandler &Handler : Handlers) {
     CGF.EmitBlock(Handler.Block);
 
     CodeGenFunction::LexicalScope Cleanups(CGF, Handler.Body->getSourceRange());

From a8edda195c9fe0d48ee7f6f2438c9575ebbad7f2 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 19 Jun 2025 14:58:05 -0700
Subject: [PATCH 0982/1322] [llvm] Remove unused includes (NFC) (#144941)

These are identified by misc-include-cleaner.  I've filtered out those
that break builds.  Also, I'm staying away from llvm-config.h,
config.h, and Compiler.h, which likely cause platform- or
compiler-specific build failures.
---
 llvm/lib/ExecutionEngine/JITLink/COFFDirectiveParser.cpp      | 2 --
 llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp | 2 --
 llvm/lib/Linker/IRMover.cpp                                   | 1 -
 llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp               | 1 -
 4 files changed, 6 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFDirectiveParser.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFDirectiveParser.cpp
index ecf5c0e519ca..4a5d5a8ffab6 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFFDirectiveParser.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/COFFDirectiveParser.cpp
@@ -12,8 +12,6 @@
 
 #include "COFFDirectiveParser.h"
 
-#include <array>
-
 using namespace llvm;
 using namespace jitlink;
 
diff --git a/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp
index 4482eedc0070..178c7ee1c01b 100644
--- a/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp
@@ -8,10 +8,8 @@
 
 #include "llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h"
 
-#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 
 #define DEBUG_TYPE "orc"
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index 2a9709050162..a466ce5bf0d4 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -9,7 +9,6 @@
 #include "llvm/Linker/IRMover.h"
 #include "LinkDiagnosticInfo.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/IR/AutoUpgrade.h"
 #include "llvm/IR/Constants.h"
diff --git a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp
index 222eb19e3eee..ea9155c60d38 100644
--- a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp
+++ b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp
@@ -108,7 +108,6 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"

From dfb5cadf5e816e542e46d3f0551b6a148a93ce3d Mon Sep 17 00:00:00 2001
From: Koakuma <koachan@protonmail.com>
Date: Fri, 20 Jun 2025 05:12:51 +0700
Subject: [PATCH 0983/1322] [SPARC][IAS] Properly set implied feature sets for
 ISA levels/extensions (#143232)

Some SPARC ISA levels and/or extensions are defined in a way such that
the availability of it implies the availability of other, more fundamental
ISA features (for example, targeting 64-bit environment implies that
V9 instructions are available).
Properly set those in the TableGen definitions.

Fixes https://github.com/llvm/llvm-project/issues/142388.
---
 llvm/lib/Target/Sparc/Sparc.td                | 18 +++--
 llvm/lib/Target/Sparc/SparcInstrInfo.td       |  3 +-
 llvm/test/CodeGen/SPARC/ctlz.ll               | 42 ++++-------
 llvm/test/CodeGen/SPARC/cttz.ll               | 46 +++++-------
 llvm/test/CodeGen/SPARC/inlineasm-v9.ll       |  9 +++
 llvm/test/CodeGen/SPARC/inlineasm.ll          |  9 ---
 .../Sparc/Relocations/relocation-specifier.s  | 75 ++++++++++---------
 llvm/test/MC/Sparc/sparcv9-instructions.s     | 21 ++++++
 8 files changed, 120 insertions(+), 103 deletions(-)

diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index 6e6c887e60e1..8588d2d28b71 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -42,22 +42,28 @@ def FeatureV8Deprecated
                      "Enable deprecated V8 instructions in V9 mode">;
 def FeatureVIS
   : SubtargetFeature<"vis", "IsVIS", "true",
-                     "Enable UltraSPARC Visual Instruction Set extensions">;
+                     "Enable UltraSPARC Visual Instruction Set extensions",
+                     [FeatureV9]>;
 def FeatureVIS2
   : SubtargetFeature<"vis2", "IsVIS2", "true",
-                     "Enable Visual Instruction Set extensions II">;
+                     "Enable Visual Instruction Set extensions II",
+                     [FeatureV9]>;
 def FeatureVIS3
   : SubtargetFeature<"vis3", "IsVIS3", "true",
-                     "Enable Visual Instruction Set extensions III">;
+                     "Enable Visual Instruction Set extensions III",
+                     [FeatureV9]>;
 def FeatureUA2005
   : SubtargetFeature<"ua2005", "IsUA2005", "true",
-                     "Enable UltraSPARC Architecture 2005 extensions">;
+                     "Enable UltraSPARC Architecture 2005 extensions",
+                     [FeatureV9, FeatureVIS, FeatureVIS2]>;
 def FeatureUA2007
   : SubtargetFeature<"ua2007", "IsUA2007", "true",
-                     "Enable UltraSPARC Architecture 2007 extensions">;
+                     "Enable UltraSPARC Architecture 2007 extensions",
+                     [FeatureV9, FeatureVIS, FeatureVIS2]>;
 def FeatureOSA2011
   : SubtargetFeature<"osa2011", "IsOSA2011", "true",
-                     "Enable Oracle SPARC Architecture 2011 extensions">;
+                     "Enable Oracle SPARC Architecture 2011 extensions",
+                     [FeatureV9, FeatureVIS, FeatureVIS2, FeatureVIS3]>;
 def FeatureLeon
   : SubtargetFeature<"leon", "IsLeon", "true",
                      "Enable LEON extensions">;
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 074a04a5dd74..1be017be1c64 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -24,7 +24,8 @@ include "SparcInstrFormats.td"
 def Is32Bit : Predicate<"!Subtarget->is64Bit()">;
 
 // True when generating 64-bit code. This also implies HasV9.
-def Is64Bit : Predicate<"Subtarget->is64Bit()">;
+def Is64Bit : Predicate<"Subtarget->is64Bit()">,
+              AssemblerPredicate<(all_of FeatureV9)>;
 
 def UseSoftMulDiv : Predicate<"Subtarget->useSoftMulDiv()">,
               AssemblerPredicate<(all_of FeatureSoftMulDiv)>;
diff --git a/llvm/test/CodeGen/SPARC/ctlz.ll b/llvm/test/CodeGen/SPARC/ctlz.ll
index 75930190f516..f7dc309452b3 100644
--- a/llvm/test/CodeGen/SPARC/ctlz.ll
+++ b/llvm/test/CodeGen/SPARC/ctlz.ll
@@ -207,20 +207,15 @@ define i64 @i64_nopoison(i64 %x) nounwind {
 ;
 ; SPARC-VIS3-LABEL: i64_nopoison:
 ; SPARC-VIS3:       ! %bb.0:
+; SPARC-VIS3-NEXT:    srl %o0, 0, %o2
+; SPARC-VIS3-NEXT:    lzcnt %o2, %o2
+; SPARC-VIS3-NEXT:    add %o2, -32, %o2
+; SPARC-VIS3-NEXT:    srl %o1, 0, %o1
+; SPARC-VIS3-NEXT:    lzcnt %o1, %o1
+; SPARC-VIS3-NEXT:    add %o1, -32, %o1
+; SPARC-VIS3-NEXT:    add %o1, 32, %o1
 ; SPARC-VIS3-NEXT:    cmp %o0, 0
-; SPARC-VIS3-NEXT:    bne .LBB2_2
-; SPARC-VIS3-NEXT:    nop
-; SPARC-VIS3-NEXT:  ! %bb.1:
-; SPARC-VIS3-NEXT:    srl %o1, 0, %o0
-; SPARC-VIS3-NEXT:    lzcnt %o0, %o0
-; SPARC-VIS3-NEXT:    add %o0, -32, %o0
-; SPARC-VIS3-NEXT:    add %o0, 32, %o1
-; SPARC-VIS3-NEXT:    retl
-; SPARC-VIS3-NEXT:    mov %g0, %o0
-; SPARC-VIS3-NEXT:  .LBB2_2:
-; SPARC-VIS3-NEXT:    srl %o0, 0, %o0
-; SPARC-VIS3-NEXT:    lzcnt %o0, %o0
-; SPARC-VIS3-NEXT:    add %o0, -32, %o1
+; SPARC-VIS3-NEXT:    movne %icc, %o2, %o1
 ; SPARC-VIS3-NEXT:    retl
 ; SPARC-VIS3-NEXT:    mov %g0, %o0
 ;
@@ -311,20 +306,15 @@ define i64 @i64_poison(i64 %x) nounwind {
 ;
 ; SPARC-VIS3-LABEL: i64_poison:
 ; SPARC-VIS3:       ! %bb.0:
+; SPARC-VIS3-NEXT:    srl %o0, 0, %o2
+; SPARC-VIS3-NEXT:    lzcnt %o2, %o2
+; SPARC-VIS3-NEXT:    add %o2, -32, %o2
+; SPARC-VIS3-NEXT:    srl %o1, 0, %o1
+; SPARC-VIS3-NEXT:    lzcnt %o1, %o1
+; SPARC-VIS3-NEXT:    add %o1, -32, %o1
+; SPARC-VIS3-NEXT:    add %o1, 32, %o1
 ; SPARC-VIS3-NEXT:    cmp %o0, 0
-; SPARC-VIS3-NEXT:    bne .LBB3_2
-; SPARC-VIS3-NEXT:    nop
-; SPARC-VIS3-NEXT:  ! %bb.1:
-; SPARC-VIS3-NEXT:    srl %o1, 0, %o0
-; SPARC-VIS3-NEXT:    lzcnt %o0, %o0
-; SPARC-VIS3-NEXT:    add %o0, -32, %o0
-; SPARC-VIS3-NEXT:    add %o0, 32, %o1
-; SPARC-VIS3-NEXT:    retl
-; SPARC-VIS3-NEXT:    mov %g0, %o0
-; SPARC-VIS3-NEXT:  .LBB3_2:
-; SPARC-VIS3-NEXT:    srl %o0, 0, %o0
-; SPARC-VIS3-NEXT:    lzcnt %o0, %o0
-; SPARC-VIS3-NEXT:    add %o0, -32, %o1
+; SPARC-VIS3-NEXT:    movne %icc, %o2, %o1
 ; SPARC-VIS3-NEXT:    retl
 ; SPARC-VIS3-NEXT:    mov %g0, %o0
 ;
diff --git a/llvm/test/CodeGen/SPARC/cttz.ll b/llvm/test/CodeGen/SPARC/cttz.ll
index edabd7d560ed..138f67dbf684 100644
--- a/llvm/test/CodeGen/SPARC/cttz.ll
+++ b/llvm/test/CodeGen/SPARC/cttz.ll
@@ -254,28 +254,25 @@ define i64 @i64_nopoison(i64 %x) nounwind {
 ;
 ; SPARC-VIS3-LABEL: i64_nopoison:
 ; SPARC-VIS3:       ! %bb.0:
-; SPARC-VIS3-NEXT:    cmp %o1, 0
-; SPARC-VIS3-NEXT:    bne .LBB2_2
-; SPARC-VIS3-NEXT:    nop
-; SPARC-VIS3-NEXT:  ! %bb.1:
-; SPARC-VIS3-NEXT:    add %o0, -1, %o1
-; SPARC-VIS3-NEXT:    andn %o1, %o0, %o0
+; SPARC-VIS3-NEXT:    add %o0, -1, %o2
+; SPARC-VIS3-NEXT:    andn %o2, %o0, %o0
 ; SPARC-VIS3-NEXT:    srl %o0, 0, %o0
 ; SPARC-VIS3-NEXT:    lzcnt %o0, %o0
 ; SPARC-VIS3-NEXT:    add %o0, -32, %o0
-; SPARC-VIS3-NEXT:    ba .LBB2_3
-; SPARC-VIS3-NEXT:    mov 64, %o1
-; SPARC-VIS3-NEXT:  .LBB2_2:
+; SPARC-VIS3-NEXT:    mov 64, %o2
+; SPARC-VIS3-NEXT:    sub %o2, %o0, %o2
 ; SPARC-VIS3-NEXT:    add %o1, -1, %o0
 ; SPARC-VIS3-NEXT:    andn %o0, %o1, %o0
 ; SPARC-VIS3-NEXT:    srl %o0, 0, %o0
 ; SPARC-VIS3-NEXT:    lzcnt %o0, %o0
 ; SPARC-VIS3-NEXT:    add %o0, -32, %o0
-; SPARC-VIS3-NEXT:    mov 32, %o1
-; SPARC-VIS3-NEXT:  .LBB2_3:
-; SPARC-VIS3-NEXT:    sub %o1, %o0, %o1
-; SPARC-VIS3-NEXT:    retl
+; SPARC-VIS3-NEXT:    mov 32, %o3
+; SPARC-VIS3-NEXT:    sub %o3, %o0, %o0
+; SPARC-VIS3-NEXT:    cmp %o1, 0
+; SPARC-VIS3-NEXT:    movne %icc, %o0, %o2
 ; SPARC-VIS3-NEXT:    mov %g0, %o0
+; SPARC-VIS3-NEXT:    retl
+; SPARC-VIS3-NEXT:    mov %o2, %o1
 ;
 ; SPARC64-LABEL: i64_nopoison:
 ; SPARC64:       ! %bb.0:
@@ -376,28 +373,25 @@ define i64 @i64_poison(i64 %x) nounwind {
 ;
 ; SPARC-VIS3-LABEL: i64_poison:
 ; SPARC-VIS3:       ! %bb.0:
-; SPARC-VIS3-NEXT:    cmp %o1, 0
-; SPARC-VIS3-NEXT:    bne .LBB3_2
-; SPARC-VIS3-NEXT:    nop
-; SPARC-VIS3-NEXT:  ! %bb.1:
-; SPARC-VIS3-NEXT:    add %o0, -1, %o1
-; SPARC-VIS3-NEXT:    andn %o1, %o0, %o0
+; SPARC-VIS3-NEXT:    add %o0, -1, %o2
+; SPARC-VIS3-NEXT:    andn %o2, %o0, %o0
 ; SPARC-VIS3-NEXT:    srl %o0, 0, %o0
 ; SPARC-VIS3-NEXT:    lzcnt %o0, %o0
 ; SPARC-VIS3-NEXT:    add %o0, -32, %o0
-; SPARC-VIS3-NEXT:    ba .LBB3_3
-; SPARC-VIS3-NEXT:    mov 64, %o1
-; SPARC-VIS3-NEXT:  .LBB3_2:
+; SPARC-VIS3-NEXT:    mov 64, %o2
+; SPARC-VIS3-NEXT:    sub %o2, %o0, %o2
 ; SPARC-VIS3-NEXT:    add %o1, -1, %o0
 ; SPARC-VIS3-NEXT:    andn %o0, %o1, %o0
 ; SPARC-VIS3-NEXT:    srl %o0, 0, %o0
 ; SPARC-VIS3-NEXT:    lzcnt %o0, %o0
 ; SPARC-VIS3-NEXT:    add %o0, -32, %o0
-; SPARC-VIS3-NEXT:    mov 32, %o1
-; SPARC-VIS3-NEXT:  .LBB3_3:
-; SPARC-VIS3-NEXT:    sub %o1, %o0, %o1
-; SPARC-VIS3-NEXT:    retl
+; SPARC-VIS3-NEXT:    mov 32, %o3
+; SPARC-VIS3-NEXT:    sub %o3, %o0, %o0
+; SPARC-VIS3-NEXT:    cmp %o1, 0
+; SPARC-VIS3-NEXT:    movne %icc, %o0, %o2
 ; SPARC-VIS3-NEXT:    mov %g0, %o0
+; SPARC-VIS3-NEXT:    retl
+; SPARC-VIS3-NEXT:    mov %o2, %o1
 ;
 ; SPARC64-LABEL: i64_poison:
 ; SPARC64:       ! %bb.0:
diff --git a/llvm/test/CodeGen/SPARC/inlineasm-v9.ll b/llvm/test/CodeGen/SPARC/inlineasm-v9.ll
index 47126d5d64da..289bb33fb864 100644
--- a/llvm/test/CodeGen/SPARC/inlineasm-v9.ll
+++ b/llvm/test/CodeGen/SPARC/inlineasm-v9.ll
@@ -58,3 +58,12 @@ Entry:
   tail call void asm sideeffect "", "{o0}"(i64 %val)
   ret void
 }
+
+; CHECK-LABEL: test_twinword:
+; CHECK: rd  %pc, %i1
+; CHECK: srlx %i1, 32, %i0
+
+define i64 @test_twinword(){
+  %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i0}"()
+  ret i64 %1
+}
diff --git a/llvm/test/CodeGen/SPARC/inlineasm.ll b/llvm/test/CodeGen/SPARC/inlineasm.ll
index 3ca2168efb71..07411385bdf3 100644
--- a/llvm/test/CodeGen/SPARC/inlineasm.ll
+++ b/llvm/test/CodeGen/SPARC/inlineasm.ll
@@ -144,15 +144,6 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: test_twinword:
-; CHECK: rd  %asr5, %i1
-; CHECK: srlx %i1, 32, %i0
-
-define i64 @test_twinword(){
-  %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i0}"()
-  ret i64 %1
-}
-
 ; CHECK-LABEL: test_symbol:
 ; CHECK: ba,a brtarget
 define void @test_symbol() {
diff --git a/llvm/test/MC/Sparc/Relocations/relocation-specifier.s b/llvm/test/MC/Sparc/Relocations/relocation-specifier.s
index 1d89babb5e6c..8a996c99e55a 100644
--- a/llvm/test/MC/Sparc/Relocations/relocation-specifier.s
+++ b/llvm/test/MC/Sparc/Relocations/relocation-specifier.s
@@ -1,20 +1,21 @@
 # RUN: llvm-mc %s -triple=sparc | FileCheck %s --check-prefix=ASM
-# RUN: llvm-mc %s -triple=sparcv9 | FileCheck %s --check-prefix=ASM
+# RUN: llvm-mc %s --defsym V9=1 -triple=sparcv9 | FileCheck %s --check-prefixes=ASM,ASM-V9
 
 # RUN: llvm-mc %s -triple=sparc -filetype=obj -o %t
 # RUN: llvm-objdump -dr %t | FileCheck %s --check-prefix=OBJDUMP
-# RUN: llvm-mc %s -triple=sparcv9 -filetype=obj -o %t
-# RUN: llvm-objdump -dr %t | FileCheck %s --check-prefix=OBJDUMP
 # RUN: llvm-readelf -s - < %t | FileCheck %s --check-prefix=READELF --implicit-check-not=TLS
+# RUN: llvm-mc %s --defsym V9=1 -triple=sparcv9 -filetype=obj -o %t
+# RUN: llvm-objdump -dr %t | FileCheck %s --check-prefixes=OBJDUMP,OBJDUMP-V9
+# RUN: llvm-readelf -s - < %t | FileCheck %s --check-prefixes=READELF,READELF-V9 --implicit-check-not=TLS
 
 # READELF: TLS     LOCAL  DEFAULT [[#]] s_tle_hix22
 # READELF: TLS     LOCAL  DEFAULT [[#]] s_tldo_hix22
 # READELF: TLS     GLOBAL DEFAULT   UND s_tle_lox10
-# READELF: TLS     GLOBAL DEFAULT   UND s_tie_hi22
-# READELF: TLS     GLOBAL DEFAULT   UND s_tie_lo10
-# READELF: TLS     GLOBAL DEFAULT   UND s_tie_ld
-# READELF: TLS     GLOBAL DEFAULT   UND s_tie_ldx
-# READELF: TLS     GLOBAL DEFAULT   UND s_tie_add
+# READELF-V9: TLS     GLOBAL DEFAULT   UND s_tie_hi22
+# READELF-V9: TLS     GLOBAL DEFAULT   UND s_tie_lo10
+# READELF-V9: TLS     GLOBAL DEFAULT   UND s_tie_ld
+# READELF-V9: TLS     GLOBAL DEFAULT   UND s_tie_ldx
+# READELF-V9: TLS     GLOBAL DEFAULT   UND s_tie_add
 # READELF: TLS     GLOBAL DEFAULT   UND s_tldm_hi22
 # READELF: TLS     GLOBAL DEFAULT   UND s_tldm_lo10
 # READELF: TLS     GLOBAL DEFAULT   UND s_tldm_add
@@ -72,23 +73,24 @@ or %g1, %hm(sym), %g3
 or %g1, %ulo(sym), %g3
 sethi %lm(sym), %l0
 
-# ASM:      sethi %hix(sym), %g1
-# ASM-NEXT: xor %g1, %lox(sym), %g1
-# ASM-NEXT: sethi %gdop_hix22(sym), %l1
-# ASM-NEXT: or %l1, %gdop_lox10(sym), %l1
-# ASM-NEXT: ldx [%l7+%l1], %l2, %gdop(sym)
-# OBJDUMP:      sethi 0x3fffff, %g0
-# OBJDUMP-NEXT: xor %g0, -0x400, %g0
-# OBJDUMP-NEXT: sethi 0x0, %g1
-# OBJDUMP-NEXT:   R_SPARC_HIX22 sym
-# OBJDUMP-NEXT: xor %g1, 0x0, %g1
-# OBJDUMP-NEXT:   R_SPARC_LOX10 sym
-# OBJDUMP-NEXT: sethi 0x0, %l1
-# OBJDUMP-NEXT:   R_SPARC_GOTDATA_OP_HIX22 sym
-# OBJDUMP-NEXT: or %l1, 0x0, %l1
-# OBJDUMP-NEXT:   R_SPARC_GOTDATA_OP_LOX10 sym
-# OBJDUMP-NEXT: ldx [%l7+%l1], %l2
-# OBJDUMP-NEXT:   R_SPARC_GOTDATA_OP sym
+.ifdef V9
+# ASM-V9:      sethi %hix(sym), %g1
+# ASM-V9-NEXT: xor %g1, %lox(sym), %g1
+# ASM-V9-NEXT: sethi %gdop_hix22(sym), %l1
+# ASM-V9-NEXT: or %l1, %gdop_lox10(sym), %l1
+# ASM-V9-NEXT: ldx [%l7+%l1], %l2, %gdop(sym)
+# OBJDUMP-V9:      sethi 0x3fffff, %g0
+# OBJDUMP-V9-NEXT: xor %g0, -0x400, %g0
+# OBJDUMP-V9-NEXT: sethi 0x0, %g1
+# OBJDUMP-V9-NEXT:   R_SPARC_HIX22 sym
+# OBJDUMP-V9-NEXT: xor %g1, 0x0, %g1
+# OBJDUMP-V9-NEXT:   R_SPARC_LOX10 sym
+# OBJDUMP-V9-NEXT: sethi 0x0, %l1
+# OBJDUMP-V9-NEXT:   R_SPARC_GOTDATA_OP_HIX22 sym
+# OBJDUMP-V9-NEXT: or %l1, 0x0, %l1
+# OBJDUMP-V9-NEXT:   R_SPARC_GOTDATA_OP_LOX10 sym
+# OBJDUMP-V9-NEXT: ldx [%l7+%l1], %l2
+# OBJDUMP-V9-NEXT:   R_SPARC_GOTDATA_OP sym
 sethi %hix(zero), %g0
 xor %g0, %lox(zero), %g0
 sethi %hix(sym), %g1
@@ -96,6 +98,7 @@ xor %g1, %lox(sym), %g1
 sethi %gdop_hix22(sym), %l1
 or %l1, %gdop_lox10(sym), %l1
 ldx [%l7 + %l1], %l2, %gdop(sym)
+.endif
 
 .set abs, 0xfedcba98
 .set abs48, 0xfedcba987654
@@ -147,23 +150,25 @@ xor %o0, %lox(abs), %o0
         sethi %tle_hix22(s_tle_hix22), %i0
         xor %i0, %tle_lox10(s_tle_lox10), %i0
 
+.ifdef V9
 ## Initial Executable model
-# ASM:      sethi %tie_hi22(s_tie_hi22), %i1
-# ASM-NEXT: add %i1, %tie_lo10(s_tie_lo10), %i1
-# ASM-NEXT: ld [%i0+%i1], %i0, %tie_ld(s_tie_ld)
-# ASM-NEXT: ldx [%i0+%i1], %i0, %tie_ldx(s_tie_ldx)
-# ASM-NEXT: add %g7, %i0, %o0, %tie_add(s_tie_add)
+# ASM-V9:      sethi %tie_hi22(s_tie_hi22), %i1
+# ASM-V9-NEXT: add %i1, %tie_lo10(s_tie_lo10), %i1
+# ASM-V9-NEXT: ld [%i0+%i1], %i0, %tie_ld(s_tie_ld)
+# ASM-V9-NEXT: ldx [%i0+%i1], %i0, %tie_ldx(s_tie_ldx)
+# ASM-V9-NEXT: add %g7, %i0, %o0, %tie_add(s_tie_add)
 
-# OBJDUMP:      R_SPARC_TLS_IE_HI22	s_tie_hi22
-# OBJDUMP:      R_SPARC_TLS_IE_LO10	s_tie_lo10
-# OBJDUMP:      R_SPARC_TLS_IE_LD	s_tie_ld
-# OBJDUMP:      R_SPARC_TLS_IE_LDX	s_tie_ldx
-# OBJDUMP:      R_SPARC_TLS_IE_ADD	s_tie_add
+# OBJDUMP-V9:      R_SPARC_TLS_IE_HI22	s_tie_hi22
+# OBJDUMP-V9:      R_SPARC_TLS_IE_LO10	s_tie_lo10
+# OBJDUMP-V9:      R_SPARC_TLS_IE_LD	s_tie_ld
+# OBJDUMP-V9:      R_SPARC_TLS_IE_LDX	s_tie_ldx
+# OBJDUMP-V9:      R_SPARC_TLS_IE_ADD	s_tie_add
 	sethi %tie_hi22(s_tie_hi22), %i1
         add %i1, %tie_lo10(s_tie_lo10), %i1
         ld [%i0+%i1], %i0, %tie_ld(s_tie_ld)
         ldx [%i0+%i1], %i0, %tie_ldx(s_tie_ldx)
         add %g7, %i0, %o0, %tie_add(s_tie_add)
+.endif
 
 ## Local Dynamic model
 # ASM:      sethi %tldo_hix22(s_tldo_hix22), %i1
diff --git a/llvm/test/MC/Sparc/sparcv9-instructions.s b/llvm/test/MC/Sparc/sparcv9-instructions.s
index de9fe03201b1..6dd0dc3d64e0 100644
--- a/llvm/test/MC/Sparc/sparcv9-instructions.s
+++ b/llvm/test/MC/Sparc/sparcv9-instructions.s
@@ -53,14 +53,19 @@
         ! V9: lda [%i0+%l6] #ASI_SNF, %o2 ! encoding: [0xd4,0x86,0x10,0x76]
         lduwa [%i0 + %l6] (130+1), %o2
 
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldsw [%i0+%l6], %o2    ! encoding: [0xd4,0x46,0x00,0x16]
         ldsw [%i0 + %l6], %o2
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldsw [%i0+32], %o2     ! encoding: [0xd4,0x46,0x20,0x20]
         ldsw [%i0 + 32], %o2
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldsw [%g1], %o2        ! encoding: [0xd4,0x40,0x40,0x00]
         ldsw [%g1], %o2
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldswa [%i0+%l6] #ASI_SNF, %o2 ! encoding: [0xd4,0xc6,0x10,0x76]
         ldswa [%i0 + %l6] 131, %o2
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldswa [%i0+%l6] #ASI_SNF, %o2 ! encoding: [0xd4,0xc6,0x10,0x76]
         ldswa [%i0 + %l6] (130+1), %o2
 
@@ -121,8 +126,10 @@
         ! V9: ldx [%g2+%i5], %fsr   ! encoding: [0xc3,0x08,0x80,0x1d]
         ldx [%g2 + %i5],%fsr
 
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldxa [%g2+%i5] #ASI_SNF, %g0   ! encoding: [0xc0,0xd8,0x90,0x7d]
         ldxa [%g2 + %i5] 131, %g0
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldxa [%g2+%i5] #ASI_SNF, %g0   ! encoding: [0xc0,0xd8,0x90,0x7d]
         ldxa [%g2 + %i5] (130+1), %g0
 
@@ -134,8 +141,10 @@
         ! V9: stx %fsr, [%g2+%i5]   ! encoding: [0xc3,0x28,0x80,0x1d]
         stx %fsr,[%g2 + %i5]
 
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: stxa %g0, [%g2+%i5] #ASI_SNF   ! encoding: [0xc0,0xf0,0x90,0x7d]
         stxa %g0, [%g2 + %i5] 131
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: stxa %g0, [%g2+%i5] #ASI_SNF   ! encoding: [0xc0,0xf0,0x90,0x7d]
         stxa %g0, [%g2 + %i5] (130+1)
 
@@ -398,33 +407,45 @@
         ! V9: wr %i0, 1, %asr21         ! encoding: [0xab,0x86,0x20,0x01]
         wr %i0, 1, %clear_softint
 
+        ! V8:      [[#@LINE+2]]:9: error: invalid instruction mnemonic
         ! V9: st %o1, [%o0]             ! encoding: [0xd2,0x22,0x00,0x00]
         stw %o1, [%o0]
+        ! V8:      [[#@LINE+2]]:9: error: invalid instruction mnemonic
         ! V9: st %o1, [%o0]             ! encoding: [0xd2,0x22,0x00,0x00]
         stuw %o1, [%o0]
+        ! V8:      [[#@LINE+2]]:9: error: invalid instruction mnemonic
         ! V9: st %o1, [%o0]             ! encoding: [0xd2,0x22,0x00,0x00]
         stsw %o1, [%o0]
 
+        ! V8:      [[#@LINE+2]]:9: error: invalid instruction mnemonic
         ! V9: sta %o2, [%i0+%l6] #ASI_SNF ! encoding: [0xd4,0xa6,0x10,0x76]
         stwa %o2, [%i0 + %l6] 131
+        ! V8:      [[#@LINE+2]]:9: error: invalid instruction mnemonic
         ! V9: sta %o2, [%i0+%l6] #ASI_SNF ! encoding: [0xd4,0xa6,0x10,0x76]
         stuwa %o2, [%i0 + %l6] 131
+        ! V8:      [[#@LINE+2]]:9: error: invalid instruction mnemonic
         ! V9: sta %o2, [%i0+%l6] #ASI_SNF ! encoding: [0xd4,0xa6,0x10,0x76]
         stswa %o2, [%i0 + %l6] 131
 
         !! SPARCv9 provides a new variant of ASI-tagged memory accesses.
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldxa [%g2] %asi, %g0    ! encoding: [0xc0,0xd8,0xa0,0x00]
         ldxa [%g2] %asi, %g0
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: stxa %g0, [%g2] %asi    ! encoding: [0xc0,0xf0,0xa0,0x00]
         stxa %g0, [%g2] %asi
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldxa [%g2+5] %asi, %g0    ! encoding: [0xc0,0xd8,0xa0,0x05]
         ldxa [%g2 + 5] %asi, %g0
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: stxa %g0, [%g2+5] %asi    ! encoding: [0xc0,0xf0,0xa0,0x05]
         stxa %g0, [%g2 + 5] %asi
 
         !! Also make sure named ASI tags are parsed properly.
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: ldxa [%g2+%i5] #ASI_SNF, %g0   ! encoding: [0xc0,0xd8,0x90,0x7d]
         ldxa [%g2 + %i5] #ASI_SNF, %g0
+        ! V8:      [[#@LINE+2]]:9: error: instruction requires a CPU feature not currently enabled
         ! V9: stxa %g0, [%g2+%i5] #ASI_SNF   ! encoding: [0xc0,0xf0,0x90,0x7d]
         stxa %g0, [%g2 + %i5] #ASI_SNF
 

From b96370131d1572feb9c51442ac8ba1ccb16d7071 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 19 Jun 2025 15:29:56 -0700
Subject: [PATCH 0984/1322] [TTI] Plumb CostKind through
 getPartialReductionCost (#144953)

Purely for the sake of being idiomatic with other TTI costing routines,
no direct motivation beyond that.
---
 llvm/include/llvm/Analysis/TargetTransformInfo.h      |  4 ++--
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h  | 11 +++++------
 llvm/lib/Analysis/TargetTransformInfo.cpp             |  5 +++--
 .../lib/Target/AArch64/AArch64TargetTransformInfo.cpp |  7 +++++--
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h  | 11 +++++------
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp    |  9 ++++-----
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h      | 11 +++++------
 .../WebAssembly/WebAssemblyTargetTransformInfo.cpp    |  7 +++++--
 .../WebAssembly/WebAssemblyTargetTransformInfo.h      |  4 ++--
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp       |  2 +-
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp        |  6 +++---
 11 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9dc4eca82492..ba47cef274be 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1332,8 +1332,8 @@ public:
   LLVM_ABI InstructionCost getPartialReductionCost(
       unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
       ElementCount VF, PartialReductionExtendKind OpAExtend,
-      PartialReductionExtendKind OpBExtend,
-      std::optional<unsigned> BinOp = std::nullopt) const;
+      PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
+      TTI::TargetCostKind CostKind) const;
 
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d93375218394..640766cf8cd1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -652,12 +652,11 @@ public:
   virtual bool enableWritePrefetching() const { return false; }
   virtual bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; }
 
-  virtual InstructionCost
-  getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
-                          Type *AccumType, ElementCount VF,
-                          TTI::PartialReductionExtendKind OpAExtend,
-                          TTI::PartialReductionExtendKind OpBExtend,
-                          std::optional<unsigned> BinOp = std::nullopt) const {
+  virtual InstructionCost getPartialReductionCost(
+      unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
+      ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
+      TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
+      TTI::TargetCostKind CostKind) const {
     return InstructionCost::getInvalid();
   }
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index d9cb11de9c09..8cc7f8a9d2ab 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -871,10 +871,11 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const {
 InstructionCost TargetTransformInfo::getPartialReductionCost(
     unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
     ElementCount VF, PartialReductionExtendKind OpAExtend,
-    PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp) const {
+    PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
+    TTI::TargetCostKind CostKind) const {
   return TTIImpl->getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
                                           AccumType, VF, OpAExtend, OpBExtend,
-                                          BinOp);
+                                          BinOp, CostKind);
 }
 
 unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ed051f295752..9d5c984fa4f1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5395,11 +5395,14 @@ AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
 InstructionCost AArch64TTIImpl::getPartialReductionCost(
     unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
     ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
-    TTI::PartialReductionExtendKind OpBExtend,
-    std::optional<unsigned> BinOp) const {
+    TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
+    TTI::TargetCostKind CostKind) const {
   InstructionCost Invalid = InstructionCost::getInvalid();
   InstructionCost Cost(TTI::TCC_Basic);
 
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return Invalid;
+
   // Sub opcodes currently only occur in chained cases.
   // Independent partial reduction subtractions are still costed as an add
   if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 0184e748b3d8..470af01be315 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -382,12 +382,11 @@ public:
     return BaseT::isLegalNTLoad(DataType, Alignment);
   }
 
-  InstructionCost
-  getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
-                          Type *AccumType, ElementCount VF,
-                          TTI::PartialReductionExtendKind OpAExtend,
-                          TTI::PartialReductionExtendKind OpBExtend,
-                          std::optional<unsigned> BinOp) const override;
+  InstructionCost getPartialReductionCost(
+      unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
+      ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
+      TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
+      TTI::TargetCostKind CostKind) const override;
 
   bool enableOrderedReductions() const override { return true; }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 63c5f17a8487..1b80b0fcaf10 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -297,8 +297,8 @@ RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
 InstructionCost RISCVTTIImpl::getPartialReductionCost(
     unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
     ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
-    TTI::PartialReductionExtendKind OpBExtend,
-    std::optional<unsigned> BinOp) const {
+    TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
+    TTI::TargetCostKind CostKind) const {
 
   // zve32x is broken for partial_reduce_umla, but let's make sure we
   // don't generate them.
@@ -311,9 +311,8 @@ InstructionCost RISCVTTIImpl::getPartialReductionCost(
   Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
   // Note: Asuming all vqdot* variants are equal cost
-  // TODO: Thread CostKind through this API
-  return LT.first * getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second,
-                                            TTI::TCK_RecipThroughput);
+  return LT.first *
+         getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
 }
 
 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 75d377abb0e7..83ac71ed9da6 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -100,12 +100,11 @@ public:
   TargetTransformInfo::PopcntSupportKind
   getPopcntSupport(unsigned TyWidth) const override;
 
-  InstructionCost
-  getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB,
-                          Type *AccumType, ElementCount VF,
-                          TTI::PartialReductionExtendKind OpAExtend,
-                          TTI::PartialReductionExtendKind OpBExtend,
-                          std::optional<unsigned> BinOp) const override;
+  InstructionCost getPartialReductionCost(
+      unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
+      ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
+      TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
+      TTI::TargetCostKind CostKind) const override;
 
   bool shouldExpandReduction(const IntrinsicInst *II) const override;
   bool supportsScalableVectors() const override {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 978e08bb8955..4f159996e4c6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -198,12 +198,15 @@ InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
 InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
     unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
     ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
-    TTI::PartialReductionExtendKind OpBExtend,
-    std::optional<unsigned> BinOp) const {
+    TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
+    TTI::TargetCostKind CostKind) const {
   InstructionCost Invalid = InstructionCost::getInvalid();
   if (!VF.isFixed() || !ST->hasSIMD128())
     return Invalid;
 
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return Invalid;
+
   InstructionCost Cost(TTI::TCC_Basic);
 
   // Possible options:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 6b6d060076a8..d83b8d1f45db 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -86,8 +86,8 @@ public:
   InstructionCost getPartialReductionCost(
       unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
       ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
-      TTI::PartialReductionExtendKind OpBExtend,
-      std::optional<unsigned> BinOp = std::nullopt) const override;
+      TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
+      TTI::TargetCostKind CostKind) const override;
   TTI::ReductionShuffle
   getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const override;
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e14f985efd96..9a2cd94eda58 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8240,7 +8240,7 @@ bool VPRecipeBuilder::getScaledReductions(
           [&](ElementCount VF) {
             InstructionCost Cost = TTI->getPartialReductionCost(
                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
-                VF, OpAExtend, OpBExtend, BinOp->getOpcode());
+                VF, OpAExtend, OpBExtend, BinOp->getOpcode(), CM.CostKind);
             return Cost.isValid();
           },
           Range)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f3b5c8cfa988..22861eb1c7df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -336,9 +336,9 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
     return TargetTransformInfo::PR_None;
   };
 
-  return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
-                                         PhiType, VF, GetExtendKind(ExtAR),
-                                         GetExtendKind(ExtBR), Opcode);
+  return Ctx.TTI.getPartialReductionCost(
+      getOpcode(), InputTypeA, InputTypeB, PhiType, VF, GetExtendKind(ExtAR),
+      GetExtendKind(ExtBR), Opcode, Ctx.CostKind);
 }
 
 void VPPartialReductionRecipe::execute(VPTransformState &State) {

From d8e6d74c6905b3032a3dc9b686bd80bb3feb9857 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 19 Jun 2025 16:15:23 -0700
Subject: [PATCH 0985/1322] [LV] Consider EVL legality for TTI tail folding
 preference (#144790)

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 43 +++++++++----------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9a2cd94eda58..d9f53c4146c2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1364,36 +1364,33 @@ public:
       return;
     }
 
-    if (!ForceTailFoldingStyle.getNumOccurrences()) {
-      ChosenTailFoldingStyle = {
-          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
-          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
-      return;
-    }
+    // Default to TTI preference, but allow command line override.
+    ChosenTailFoldingStyle = {
+        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
+    if (ForceTailFoldingStyle.getNumOccurrences())
+      ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
+                                ForceTailFoldingStyle.getValue()};
 
-    // Set styles when forced.
-    ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
-                              ForceTailFoldingStyle.getValue()};
     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
       return;
     // Override forced styles if needed.
     // FIXME: Investigate opportunity for fixed vector factor.
     bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
                       TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
-    if (!EVLIsLegal) {
-      // If for some reason EVL mode is unsupported, fallback to
-      // DataWithoutLaneMask to try to vectorize the loop with folded tail
-      // in a generic way.
-      ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
-                                TailFoldingStyle::DataWithoutLaneMask};
-      LLVM_DEBUG(
-          dbgs()
-          << "LV: Preference for VP intrinsics indicated. Will "
-             "not try to generate VP Intrinsics "
-          << (UserIC > 1
-                  ? "since interleave count specified is greater than 1.\n"
-                  : "due to non-interleaving reasons.\n"));
-    }
+    if (EVLIsLegal)
+      return;
+    // If for some reason EVL mode is unsupported, fallback to
+    // DataWithoutLaneMask to try to vectorize the loop with folded tail
+    // in a generic way.
+    ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
+                              TailFoldingStyle::DataWithoutLaneMask};
+    LLVM_DEBUG(
+        dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                  "not try to generate VP Intrinsics "
+               << (UserIC > 1
+                       ? "since interleave count specified is greater than 1.\n"
+                       : "due to non-interleaving reasons.\n"));
   }
 
   /// Returns true if all loop blocks should be masked to fold tail loop.

From dad64877c811dcce7e2c7ebc216161ecf0733fcf Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 19 Jun 2025 16:24:46 -0700
Subject: [PATCH 0986/1322] [llvm] Remove an extraneous cast (NFC) (#144955)

llvm::CallBase::getArgOperand returns Value *, so we do not need
const_cast<Value *>.
---
 llvm/include/llvm/IR/IntrinsicInst.h          | 64 ++++++++-----------
 .../CodeGen/GlobalISel/InlineAsmLowering.cpp  |  2 +-
 llvm/lib/IR/IntrinsicInst.cpp                 |  6 +-
 3 files changed, 29 insertions(+), 43 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 20e667524fa0..d8f8bdeb01d4 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -779,8 +779,8 @@ public:
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
   }
 
-  Value *getLHS() const { return const_cast<Value *>(getArgOperand(0)); }
-  Value *getRHS() const { return const_cast<Value *>(getArgOperand(1)); }
+  Value *getLHS() const { return getArgOperand(0); }
+  Value *getRHS() const { return getArgOperand(1); }
 
   /// Returns the comparison predicate underlying the intrinsic.
   static ICmpInst::Predicate getPredicate(Intrinsic::ID ID) {
@@ -868,8 +868,8 @@ public:
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
   }
 
-  Value *getLHS() const { return const_cast<Value *>(getArgOperand(0)); }
-  Value *getRHS() const { return const_cast<Value *>(getArgOperand(1)); }
+  Value *getLHS() const { return getArgOperand(0); }
+  Value *getRHS() const { return getArgOperand(1); }
 
   static bool isSigned(Intrinsic::ID ID) { return ID == Intrinsic::scmp; }
   bool isSigned() const { return isSigned(getIntrinsicID()); }
@@ -914,8 +914,8 @@ public:
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
   }
 
-  Value *getLHS() const { return const_cast<Value *>(getArgOperand(0)); }
-  Value *getRHS() const { return const_cast<Value *>(getArgOperand(1)); }
+  Value *getLHS() const { return getArgOperand(0); }
+  Value *getRHS() const { return getArgOperand(1); }
 
   /// Returns the binary operation underlying the intrinsic.
   LLVM_ABI Instruction::BinaryOps getBinaryOp() const;
@@ -1118,7 +1118,7 @@ private:
 
 public:
   ConstantInt *getVolatileCst() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(ARG_VOLATILE)));
+    return cast<ConstantInt>(getArgOperand(ARG_VOLATILE));
   }
 
   bool isVolatile() const { return !getVolatileCst()->isZero(); }
@@ -1180,7 +1180,7 @@ private:
 
 public:
   ConstantInt *getVolatileCst() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(ARG_VOLATILE)));
+    return cast<ConstantInt>(getArgOperand(ARG_VOLATILE));
   }
 
   bool isVolatile() const { return !getVolatileCst()->isZero(); }
@@ -1288,7 +1288,7 @@ public:
 
   Value *getRawElementSizeInBytes() const {
     assert(isAtomic());
-    return const_cast<Value *>(getArgOperand(ARG_ELEMENTSIZE));
+    return getArgOperand(ARG_ELEMENTSIZE);
   }
 
   uint32_t getElementSizeInBytes() const {
@@ -1388,7 +1388,7 @@ public:
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
   }
 
-  Value *getArgList() const { return const_cast<Value *>(getArgOperand(0)); }
+  Value *getArgList() const { return getArgOperand(0); }
 };
 
 /// This represents the llvm.va_end intrinsic.
@@ -1401,7 +1401,7 @@ public:
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
   }
 
-  Value *getArgList() const { return const_cast<Value *>(getArgOperand(0)); }
+  Value *getArgList() const { return getArgOperand(0); }
 };
 
 /// This represents the llvm.va_copy intrinsic.
@@ -1414,8 +1414,8 @@ public:
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
   }
 
-  Value *getDest() const { return const_cast<Value *>(getArgOperand(0)); }
-  Value *getSrc() const { return const_cast<Value *>(getArgOperand(1)); }
+  Value *getDest() const { return getArgOperand(0); }
+  Value *getSrc() const { return getArgOperand(1); }
 };
 
 /// A base class for all instrprof intrinsics.
@@ -1457,16 +1457,12 @@ public:
   // The "name" operand of the profile instrumentation instruction - this is the
   // operand that can be used to relate the instruction to the function it
   // belonged to at instrumentation time.
-  Value *getNameValue() const {
-    return const_cast<Value *>(getArgOperand(0))->stripPointerCasts();
-  }
+  Value *getNameValue() const { return getArgOperand(0)->stripPointerCasts(); }
 
   void setNameValue(Value *V) { setArgOperand(0, V); }
 
   // The hash of the CFG for the instrumented function.
-  ConstantInt *getHash() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(1)));
-  }
+  ConstantInt *getHash() const { return cast<ConstantInt>(getArgOperand(1)); }
 };
 
 /// A base class for all instrprof counter intrinsics.
@@ -1563,18 +1559,14 @@ public:
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
   }
 
-  Value *getTargetValue() const {
-    return cast<Value>(const_cast<Value *>(getArgOperand(2)));
-  }
+  Value *getTargetValue() const { return cast<Value>(getArgOperand(2)); }
 
   ConstantInt *getValueKind() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(3)));
+    return cast<ConstantInt>(getArgOperand(3));
   }
 
   // Returns the value site index.
-  ConstantInt *getIndex() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(4)));
-  }
+  ConstantInt *getIndex() const { return cast<ConstantInt>(getArgOperand(4)); }
 };
 
 /// A base class for instrprof mcdc intrinsics that require global bitmap bytes.
@@ -1590,7 +1582,7 @@ public:
   /// \return The number of bits used for the MCDC bitmaps for the instrumented
   /// function.
   ConstantInt *getNumBitmapBits() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(2)));
+    return cast<ConstantInt>(getArgOperand(2));
   }
 
   /// \return The number of bytes used for the MCDC bitmaps for the instrumented
@@ -1624,14 +1616,12 @@ public:
   /// \return The index of the TestVector Bitmap upon which this intrinsic
   /// acts.
   ConstantInt *getBitmapIndex() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(2)));
+    return cast<ConstantInt>(getArgOperand(2));
   }
 
   /// \return The address of the corresponding condition bitmap containing
   /// the index of the TestVector to update within the TestVector Bitmap.
-  Value *getMCDCCondBitmapAddr() const {
-    return cast<Value>(const_cast<Value *>(getArgOperand(3)));
-  }
+  Value *getMCDCCondBitmapAddr() const { return cast<Value>(getArgOperand(3)); }
 };
 
 class PseudoProbeInst : public IntrinsicInst {
@@ -1645,20 +1635,16 @@ public:
   }
 
   ConstantInt *getFuncGuid() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(0)));
+    return cast<ConstantInt>(getArgOperand(0));
   }
 
-  ConstantInt *getIndex() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(1)));
-  }
+  ConstantInt *getIndex() const { return cast<ConstantInt>(getArgOperand(1)); }
 
   ConstantInt *getAttributes() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(2)));
+    return cast<ConstantInt>(getArgOperand(2));
   }
 
-  ConstantInt *getFactor() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(3)));
-  }
+  ConstantInt *getFactor() const { return cast<ConstantInt>(getArgOperand(3)); }
 };
 
 class NoAliasScopeDeclInst : public IntrinsicInst {
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index fbbbea6156fc..b4e64d7416d8 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -240,7 +240,7 @@ bool InlineAsmLowering::lowerInlineAsm(
 
     // Compute the value type for each operand.
     if (OpInfo.hasArg()) {
-      OpInfo.CallOperandVal = const_cast<Value *>(Call.getArgOperand(ArgNo));
+      OpInfo.CallOperandVal = Call.getArgOperand(ArgNo);
 
       if (isa<BasicBlock>(OpInfo.CallOperandVal)) {
         LLVM_DEBUG(dbgs() << "Basic block input operands not supported yet\n");
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 256bce1abe71..b1d3339c5a41 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -239,13 +239,13 @@ void DbgAssignIntrinsic::setValue(Value *V) {
 ConstantInt *InstrProfCntrInstBase::getNumCounters() const {
   if (InstrProfValueProfileInst::classof(this))
     llvm_unreachable("InstrProfValueProfileInst does not have counters!");
-  return cast<ConstantInt>(const_cast<Value *>(getArgOperand(2)));
+  return cast<ConstantInt>(getArgOperand(2));
 }
 
 ConstantInt *InstrProfCntrInstBase::getIndex() const {
   if (InstrProfValueProfileInst::classof(this))
     llvm_unreachable("Please use InstrProfValueProfileInst::getIndex()");
-  return cast<ConstantInt>(const_cast<Value *>(getArgOperand(3)));
+  return cast<ConstantInt>(getArgOperand(3));
 }
 
 void InstrProfCntrInstBase::setIndex(uint32_t Idx) {
@@ -255,7 +255,7 @@ void InstrProfCntrInstBase::setIndex(uint32_t Idx) {
 
 Value *InstrProfIncrementInst::getStep() const {
   if (InstrProfIncrementInstStep::classof(this)) {
-    return const_cast<Value *>(getArgOperand(4));
+    return getArgOperand(4);
   }
   const Module *M = getModule();
   LLVMContext &Context = M->getContext();

From 6001a8bb945762fd128f025bb8d7969d92096772 Mon Sep 17 00:00:00 2001
From: Tianle Liu <tianle.l.liu@intel.com>
Date: Fri, 20 Jun 2025 08:01:32 +0800
Subject: [PATCH 0987/1322] [WholeProgramDevirt] Add check for
 AvailableExternal and give up icall.branch.funnel (#143468)

When a customer class inherits from a libc++ class, and is built with
"-flto  -fwhole-program-vtables -static-libstdc++ \
-Wl,-plugin-opt=-whole-program-visibility", the libc++ class's vtable is
available_externally, meanwhile the customer class vtable is private.
And
both of them are !vcall_visibility == Linkage Unit.
In this case, icall.branch.funnel might be generated.

But the icall.branch.funnel would cause crash in LowerTypeTests because
available_externally Global_Object's GlobalTypeMember would not be
saved and finally leads to a NULL GlobalTypeMember which causes a crash.
Even saving the available_externally GO's GlobalTypeMember so that it is
not NULL to avoid the crash in LowerTypeTests, it still will crash in
SelectionDAGBuilder or Verifier, because operands linkage type
consistency
check of icall.branch.funnel can not pass.

So any one of available externally vtable would stop to generate
icall.branch.funnel.
This patch fixes FullLTO mode and split-LTO-unit ThinLTO mode.
---
 .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 16 ++++++
 .../availableexternal-check.ll                | 56 +++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 llvm/test/Transforms/WholeProgramDevirt/availableexternal-check.ll

diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 30e1dc7167a3..aec484f8a18f 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -1450,6 +1450,22 @@ void DevirtModule::tryICallBranchFunnel(
   if (!HasNonDevirt)
     return;
 
+  // If any GV is AvailableExternally, not to generate branch.funnel.
+  // NOTE: It is to avoid crash in LowerTypeTest.
+  // If the branch.funnel is generated, because GV.isDeclarationForLinker(),
+  // in LowerTypeTestsModule::lower(), its GlobalTypeMember would NOT
+  // be saved in GlobalTypeMembers[&GV]. Then crash happens in
+  // buildBitSetsFromDisjointSet due to GlobalTypeMembers[&GV] is NULL.
+  // Even doing experiment to save it in GlobalTypeMembers[&GV] and
+  // making GlobalTypeMembers[&GV] be not NULL, crash could avoid from
+  // buildBitSetsFromDisjointSet. But still report_fatal_error in Verifier
+  // or SelectionDAGBuilder later, because operands linkage type consistency
+  // check of icall.branch.funnel can not pass.
+  for (auto &T : TargetsForSlot) {
+    if (T.TM->Bits->GV->hasAvailableExternallyLinkage())
+      return;
+  }
+
   FunctionType *FT =
       FunctionType::get(Type::getVoidTy(M.getContext()), {Int8PtrTy}, true);
   Function *JT;
diff --git a/llvm/test/Transforms/WholeProgramDevirt/availableexternal-check.ll b/llvm/test/Transforms/WholeProgramDevirt/availableexternal-check.ll
new file mode 100644
index 000000000000..41809b41d8c7
--- /dev/null
+++ b/llvm/test/Transforms/WholeProgramDevirt/availableexternal-check.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -passes=wholeprogramdevirt -whole-program-visibility %s | FileCheck %s
+
+; This test is reduced from C++ code like this:
+; class A :public std::exception {
+; public:
+;   A() {};
+;   const char* what () const throw () {return "A";}
+; };
+; long test(std::exception *p) {
+;   const char* ch = p->what();
+;   ...;
+; }
+;
+; Build command is "clang++ -O2 -target x86_64-unknown-linux -flto=full \
+; -fwhole-program-vtables -static-libstdc++  -Wl,-plugin-opt=-whole-program-visibility"
+;
+; _ZTVSt9exception's visibility is 1 (Linkage Unit), and available_externally.
+; If any GV is available_externally, icall.branch.funnel should not be generated.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+@_ZTVSt9exception = available_externally constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr @_ZNKSt9exception4whatEv] }, !type !0, !type !1
+@_ZTV1A.0 = constant [5 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr @_ZNK1A4whatEv], !type !3, !type !4, !type !5, !type !6
+
+declare ptr @_ZNKSt9exception4whatEv()
+
+define ptr @_Z4testPSt9exception() {
+  %1 = load ptr, ptr null, align 8
+  %2 = call i1 @llvm.type.test(ptr %1, metadata !"_ZTSSt9exception")
+  tail call void @llvm.assume(i1 %2)
+  %3 = getelementptr i8, ptr %1, i64 16
+  %4 = load ptr, ptr %3, align 8
+  %5 = tail call ptr %4(ptr null)
+  ret ptr %5
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.assume(i1 noundef) #0
+
+declare ptr @_ZNK1A4whatEv()
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i1 @llvm.type.test(ptr, metadata) #1
+
+; CHECK-NOT: call void (...) @llvm.icall.branch.funnel
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!0 = !{i64 16, !"_ZTSSt9exception"}
+!1 = !{i64 32, !"_ZTSMSt9exceptionKDoFPKcvE.virtual"}
+!3 = !{i32 16, !"_ZTS1A"}
+!4 = !{i32 32, !"_ZTSM1AKDoFPKcvE.virtual"}
+!5 = !{i32 16, !"_ZTSSt9exception"}
+!6 = !{i32 32, !"_ZTSMSt9exceptionKDoFPKcvE.virtual"}

From 91439817e8d19613ac6e25ca9abd5e7534a9d33b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 20 Jun 2025 09:34:35 +0900
Subject: [PATCH 0988/1322] ARM: Avoid using isTarget wrappers around Triple
 predicates (#144705)

These are module level properties, and querying them through
a function-level subtarget context is confusing. Plus we don't
need an aliased name. This doesn't avoid all the uses, just the
ones in the TargetLowering constructor.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 29 ++++++++++++-------------
 llvm/lib/Target/ARM/ARMSubtarget.h      |  4 ++++
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 91fb7bc4578b..4567081fe78d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -515,7 +515,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
-  if (Subtarget->isTargetMachO()) {
+  const Triple &TT = TM.getTargetTriple();
+
+  if (TT.isOSBinFormatMachO()) {
     // Uses VFP for Thumb libfuncs if available.
     if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
@@ -588,9 +590,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   }
 
   // RTLIB
-  if (TM.isAAPCS_ABI() &&
-      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
-       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
+  if (TM.isAAPCS_ABI() && (TT.isTargetAEABI() || TT.isTargetGNUAEABI() ||
+                           TT.isTargetMuslAEABI() || TT.isAndroid())) {
     // clang-format off
     static const struct {
       const RTLIB::Libcall Op;
@@ -712,7 +713,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   // The half <-> float conversion functions are always soft-float on
   // non-watchos platforms, but are needed for some targets which use a
   // hard-float calling convention by default.
-  if (!Subtarget->isTargetWatchABI()) {
+  if (!TT.isWatchABI()) {
     if (TM.isAAPCS_ABI()) {
       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
@@ -726,7 +727,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
 
   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
   // a __gnu_ prefix (which is the default).
-  if (Subtarget->isTargetAEABI()) {
+  if (TT.isTargetAEABI()) {
     static const struct {
       const RTLIB::Libcall Op;
       const char * const Name;
@@ -741,7 +742,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       setLibcallName(LC.Op, LC.Name);
       setLibcallCallingConv(LC.Op, LC.CC);
     }
-  } else if (!Subtarget->isTargetMachO()) {
+  } else if (!TT.isOSBinFormatMachO()) {
     setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee");
     setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee");
   }
@@ -1227,7 +1228,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
   }
 
-  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
+  if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
     setOperationAction(ISD::SDIV, MVT::i32, Custom);
     setOperationAction(ISD::UDIV, MVT::i32, Custom);
 
@@ -1239,9 +1240,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   setOperationAction(ISD::UREM,  MVT::i32, Expand);
 
   // Register based DivRem for AEABI (RTABI 4.2)
-  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
-      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
-      Subtarget->isTargetWindows()) {
+  if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
+      TT.isTargetMuslAEABI() || TT.isOSWindows()) {
     setOperationAction(ISD::SREM, MVT::i64, Custom);
     setOperationAction(ISD::UREM, MVT::i64, Custom);
     HasStandaloneRem = false;
@@ -1271,7 +1271,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 
-  if (Subtarget->isTargetWindows())
+  if (TT.isOSWindows())
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
   else
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
@@ -1326,8 +1326,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   }
 
   // Compute supported atomic widths.
-  if (Subtarget->isTargetLinux() ||
-      (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
+  if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
     // For targets where __sync_* routines are reliably available, we use them
     // if necessary.
     //
@@ -1538,7 +1537,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
 
   // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined.  MinGW has
   // it, but it's just a wrapper around ldexp.
-  if (Subtarget->isTargetWindows()) {
+  if (TT.isOSWindows()) {
     for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
       if (isOperationExpand(Op, MVT::f32))
         setOperationAction(Op, MVT::f32, Promote);
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 7893796e313b..3e1314349564 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -334,6 +334,9 @@ public:
 
   const Triple &getTargetTriple() const { return TargetTriple; }
 
+  /// @{
+  /// These properties are per-module, please use the TargetMachine
+  /// TargetTriple.
   bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
   bool isTargetIOS() const { return TargetTriple.isiOS(); }
   bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); }
@@ -359,6 +362,7 @@ public:
   bool isTargetEHABICompatible() const {
     return TargetTriple.isTargetEHABICompatible();
   }
+  /// @}
 
   bool isReadTPSoft() const {
     return !(isReadTPTPIDRURW() || isReadTPTPIDRURO() || isReadTPTPIDRPRW());

From efd42b9b1d655a56abb3e6ce1ed4414e9f882912 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 20 Jun 2025 09:35:02 +0900
Subject: [PATCH 0989/1322] WebAssembly: Stop directly using
 RuntimeLibcalls.def (#143054)

Construct RuntimeLibcallsInfo instead of manually creating a map.
This was repeating the setting of the RETURN_ADDRESS. This removes
an obstacle to generating libcall information with tablegen.

This is also not great, since it's setting a static map which
would be broken if there were ever a triple with a different libcall
configuration.
---
 .../WebAssemblyRuntimeLibcallSignatures.cpp   | 28 ++++++++-----------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index d5c4532824c0..4548a7520b3b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -528,25 +528,19 @@ RuntimeLibcallSignatureTable &getRuntimeLibcallSignatures() {
 // constructor for use with a static variable
 struct StaticLibcallNameMap {
   StringMap<RTLIB::Libcall> Map;
-  StaticLibcallNameMap() {
-    static const std::pair<const char *, RTLIB::Libcall> NameLibcalls[] = {
-#define HANDLE_LIBCALL(code, name) {(const char *)name, RTLIB::code},
-#define LIBCALL_NO_NAME nullptr
-#include "llvm/IR/RuntimeLibcalls.def"
-#undef HANDLE_LIBCALL
-#undef LIBCALL_NO_NAME
-    };
-    for (const auto &NameLibcall : NameLibcalls) {
-      if (NameLibcall.first != nullptr &&
-          getRuntimeLibcallSignatures().Table[NameLibcall.second] !=
-              unsupported) {
-        assert(!Map.contains(NameLibcall.first) &&
+  StaticLibcallNameMap(const Triple &TT) {
+    // FIXME: This is broken if there are ever different triples compiled with
+    // different libcalls.
+    RTLIB::RuntimeLibcallsInfo RTCI(TT);
+    for (RTLIB::Libcall LC : RTLIB::libcalls()) {
+      const char *NameLibcall = RTCI.getLibcallName(LC);
+      if (NameLibcall != nullptr &&
+          getRuntimeLibcallSignatures().Table[LC] != unsupported) {
+        assert(!Map.contains(NameLibcall) &&
                "duplicate libcall names in name map");
-        Map[NameLibcall.first] = NameLibcall.second;
+        Map[NameLibcall] = LC;
       }
     }
-
-    Map["emscripten_return_address"] = RTLIB::RETURN_ADDRESS;
   }
 };
 
@@ -942,7 +936,7 @@ void WebAssembly::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
                                       StringRef Name,
                                       SmallVectorImpl<wasm::ValType> &Rets,
                                       SmallVectorImpl<wasm::ValType> &Params) {
-  static StaticLibcallNameMap LibcallNameMap;
+  static StaticLibcallNameMap LibcallNameMap(Subtarget.getTargetTriple());
   auto &Map = LibcallNameMap.Map;
   auto Val = Map.find(Name);
 #ifndef NDEBUG

From fa7646008ef32b38357189ed5752e1a1b8d6d146 Mon Sep 17 00:00:00 2001
From: joaosaffran <126493771+joaosaffran@users.noreply.github.com>
Date: Thu, 19 Jun 2025 17:39:48 -0700
Subject: [PATCH 0990/1322] [DirectX] Add Root Signature Version Support and
 Update Test IR Format (#144957)

Updates the Root Signature metadata parser to extract version
information. This requirement was added after the initial parser
implementation.

---------

Co-authored-by: joaosaffran <joao.saffran@microsoft.com>
---
 llvm/lib/Target/DirectX/DXILRootSignature.cpp | 10 ++++--
 .../RootSignature-Error-is-not-function.ll    |  4 +--
 .../RootSignature-Error-is-not-value.ll       |  4 +--
 ...ootSignature-Error-no-root-element-list.ll |  4 +--
 ...Signature-Error-root-element-not-mdnode.ll |  4 +--
 .../RootSignature-Flags-Error.ll              |  2 +-
 .../ContainerData/RootSignature-Flags.ll      |  2 +-
 .../RootSignature-MultipleEntryFunctions.ll   |  4 +--
 .../RootSignature-NullFunction-Error.ll       |  4 +--
 ...Parameters-Invalid-ParameterIsNotString.ll |  2 +-
 ...otSignature-Parameters-Validation-Error.ll |  2 +-
 .../ContainerData/RootSignature-Parameters.ll |  2 +-
 ...re-RootConstants-Invalid-Num32BitValues.ll |  2 +-
 ...ure-RootConstants-Invalid-RegisterSpace.ll |  2 +-
 ...re-RootConstants-Invalid-ShaderRegister.ll |  2 +-
 .../RootSignature-RootConstants.ll            |  2 +-
 ...tSignature-RootDescriptor-Invalid-Flags.ll |  2 +-
 ...ure-RootDescriptor-Invalid-RegisterKind.ll |  2 +-
 ...re-RootDescriptor-Invalid-RegisterSpace.ll |  2 +-
 ...re-RootDescriptor-Invalid-RegisterValue.ll |  2 +-
 .../RootSignature-RootDescriptor.ll           |  2 +-
 .../RootSignature-RootDescriptor_V1.ll        | 34 +++++++++++++++++++
 .../RootSignature-RootElement-Error.ll        |  2 +-
 ...ure-RootFlags-VisibilityValidationError.ll |  2 +-
 24 files changed, 70 insertions(+), 30 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor_V1.ll

diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index 3d195acb19e1..88914a31f46e 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -309,7 +309,7 @@ analyzeModule(Module &M) {
     return RSDMap;
 
   for (const auto &RSDefNode : RootSignatureNode->operands()) {
-    if (RSDefNode->getNumOperands() != 2) {
+    if (RSDefNode->getNumOperands() != 3) {
       reportError(Ctx, "Invalid format for Root Signature Definition. Pairs "
                        "of function, root signature expected.");
       continue;
@@ -348,8 +348,14 @@ analyzeModule(Module &M) {
       reportError(Ctx, "Root Element is not a metadata node.");
       continue;
     }
-
     mcdxbc::RootSignatureDesc RSD;
+    if (std::optional<uint32_t> Version = extractMdIntValue(RSDefNode, 2))
+      RSD.Version = *Version;
+    else {
+      reportError(Ctx, "Invalid RSDefNode value, expected constant int");
+      continue;
+    }
+
     // Clang emits the root signature data in dxcontainer following a specific
     // sequence. First the header, then the root parameters. So the header
     // offset will always equal to the header size.
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll
index ad2aa7997eba..fbda7561ceca 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll
@@ -18,9 +18,9 @@ entry:
 attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !4 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
-!5 = !{ i32 -1, !6 } ; function, root signature
+!5 = !{ i32 -1, !6, i32 2 } ; function, root signature
 !6 = !{ !7 } ; list of root signature elements
 !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll
index 4d881f96e4c3..94ab52e1f29c 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll
@@ -18,9 +18,9 @@ entry:
 attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !4 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
-!5 = !{ !3, !6 } ; function, root signature
+!5 = !{ !3, !6, i32 2 } ; function, root signature
 !6 = !{ !7 } ; list of root signature elements
 !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-no-root-element-list.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-no-root-element-list.ll
index b5109022b4b0..dc7a3fd10320 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-no-root-element-list.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-no-root-element-list.ll
@@ -18,9 +18,9 @@ entry:
 attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
-!2 = !{ ptr @main, null } ; function, root signature
+!2 = !{ ptr @main, null, i32 2 } ; function, root signature
 !3 = !{ !4 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
-!5 = !{ i32 -1, !6 } ; function, root signature
+!5 = !{ i32 -1, !6, i32 2 } ; function, root signature
 !6 = !{ !7 } ; list of root signature elements
 !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-root-element-not-mdnode.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-root-element-not-mdnode.ll
index 7e6bcdadd386..3028ca99e4ef 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-root-element-not-mdnode.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-root-element-not-mdnode.ll
@@ -18,9 +18,9 @@ entry:
 attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
-!2 = !{ ptr @main, i32 -1 } ; function, root signature
+!2 = !{ ptr @main, i32 -1, i32 2 } ; function, root signature
 !3 = !{ !4 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
-!5 = !{ i32 -1, !6 } ; function, root signature
+!5 = !{ i32 -1, !6, i32 2 } ; function, root signature
 !6 = !{ !7 } ; list of root signature elements
 !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll
index 4921472d253a..65511160f230 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags-Error.ll
@@ -15,6 +15,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !4 } ; list of root signature elements
 !4 = !{ !"NOTRootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll
index e81679732a5d..10235b7d1796 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Flags.ll
@@ -13,7 +13,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !4 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
 
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll
index d23e1c71d2fc..fec9c226d8bc 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-MultipleEntryFunctions.ll
@@ -16,10 +16,10 @@ entry:
 attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !4 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
-!5 = !{ ptr @anotherMain, !6 } ; function, root signature
+!5 = !{ ptr @anotherMain, !6, i32 2 } ; function, root signature
 !6 = !{ !7 } ; list of root signature elements
 !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout
 
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-NullFunction-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-NullFunction-Error.ll
index f5caa5012478..c6b57ee31c87 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-NullFunction-Error.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-NullFunction-Error.ll
@@ -13,9 +13,9 @@ entry:
 attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !4 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
-!5 = !{ null, !6 } ; function, root signature
+!5 = !{ null, !6, i32 2 } ; function, root signature
 !6 = !{ !7 } ; list of root signature elements
 !7 = !{ !"RootFlags", i32 2 } ; 1 = allow_input_assembler_input_layout
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters-Invalid-ParameterIsNotString.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters-Invalid-ParameterIsNotString.ll
index 04edd00eee64..b4b616f8fd6e 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters-Invalid-ParameterIsNotString.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters-Invalid-ParameterIsNotString.ll
@@ -14,6 +14,6 @@ entry:
 attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !dx.rootsignatures = !{!0}
-!0 = !{ ptr @main, !1 }
+!0 = !{ ptr @main, !1, i32 2 }
 !1 = !{ !2 }
 !2 = !{ i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters-Validation-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters-Validation-Error.ll
index 2b4a075281f8..a61928d0a7fd 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters-Validation-Error.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters-Validation-Error.ll
@@ -15,6 +15,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"RootConstants", i32 255, i32 1, i32 2, i32 3 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters.ll
index 714c76213e1b..80aa757d7e10 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters.ll
@@ -11,7 +11,7 @@ entry:
 attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !4, !5, !6 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
 !5 = !{ !"RootConstants", i32 0, i32 1, i32 2, i32 3 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-Num32BitValues.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-Num32BitValues.ll
index 552c128e5ab5..121bc6e932a4 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-Num32BitValues.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-Num32BitValues.ll
@@ -11,6 +11,6 @@ entry:
 }
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"RootConstants", i32 0, i32 1, i32 2, !"Invalid" }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-RegisterSpace.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-RegisterSpace.ll
index 1087b414942e..3534e5d1c5a2 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-RegisterSpace.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-RegisterSpace.ll
@@ -13,6 +13,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"RootConstants", i32 0, i32 1, !"Invalid", i32 3 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-ShaderRegister.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-ShaderRegister.ll
index 53fd924e8f46..5c3dce2f419e 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-ShaderRegister.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants-Invalid-ShaderRegister.ll
@@ -13,6 +13,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"RootConstants", i32 0, !"Invalid", i32 2, i32 3 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants.ll
index 71511ff52334..964554fe143e 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootConstants.ll
@@ -13,7 +13,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"RootConstants", i32 0, i32 1, i32 2, i32 3 }
 
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags.ll
index 422998124091..6c90bcb09b64 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags.ll
@@ -13,6 +13,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"RootCBV", i32 0, i32 1, i32 2, i32 3  }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterKind.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterKind.ll
index 4aed84efbe2b..579528d8b5e1 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterKind.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterKind.ll
@@ -13,6 +13,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"Invalid", i32 0, i32 1, i32 2, i32 3  }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterSpace.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterSpace.ll
index 020d117ba45d..18582090e761 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterSpace.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterSpace.ll
@@ -13,6 +13,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"RootCBV", i32 0, i32 1, i32 4294967280, i32 0  }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterValue.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterValue.ll
index edb8b943c6e3..8bbfdf00bea2 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterValue.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-RegisterValue.ll
@@ -13,6 +13,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"RootCBV", i32 0, i32 4294967295, i32 2, i32 3  }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll
index 9217945855cd..f77bb96840be 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll
@@ -13,7 +13,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !5 } ; list of root signature elements
 !5 = !{ !"RootCBV", i32 0, i32 1, i32 2, i32 8  }
 
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor_V1.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor_V1.ll
new file mode 100644
index 000000000000..e05c42a22ea4
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor_V1.ll
@@ -0,0 +1,34 @@
+; RUN: opt %s -dxil-embed -dxil-globals -S -o - | FileCheck %s
+; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: @dx.rts0 = private constant [44 x i8]  c"{{.*}}", section "RTS0", align 4
+
+define void @main() #0 {
+entry:
+  ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 1 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"RootCBV", i32 0, i32 1, i32 2, i32 8  }
+
+; DXC:  - Name:            RTS0
+; DXC-NEXT:    Size:            44
+; DXC-NEXT:    RootSignature:
+; DXC-NEXT:      Version:         1
+; DXC-NEXT:      NumRootParameters: 1 
+; DXC-NEXT:      RootParametersOffset: 24 
+; DXC-NEXT:      NumStaticSamplers: 0
+; DXC-NEXT:      StaticSamplersOffset: 0
+; DXC-NEXT:      Parameters:
+; DXC-NEXT:        - ParameterType:   2
+; DXC-NEXT:          ShaderVisibility: 0
+; DXC-NEXT:          Descriptor:
+; DXC-NEXT:            RegisterSpace: 2
+; DXC-NEXT:            ShaderRegister: 1
+; DXC-NOT:            DATA_STATIC: true
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll
index 89e23f6540c5..aa8d46dccbac 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootElement-Error.ll
@@ -15,5 +15,5 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !"NOTRootElements" } ; list of root signature elements
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootFlags-VisibilityValidationError.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootFlags-VisibilityValidationError.ll
index 4b8e6abacd7a..baeb74c2c3ce 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootFlags-VisibilityValidationError.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootFlags-VisibilityValidationError.ll
@@ -15,6 +15,6 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
-!2 = !{ ptr @main, !3 } ; function, root signature
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
 !3 = !{ !4 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 2147487744 } ; 1 = allow_input_assembler_input_layout

From 5cbed34404a3862c2d7f18e4b4b24f5ce1516a8d Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Fri, 20 Jun 2025 08:57:37 +0800
Subject: [PATCH 0991/1322] [X86] Remove CLDEMOTE from Arrowlake and later
 hybrid processors (#144833)

Decouple Arrowlake from Sierraforest because the later has CLDEMOTE
feature.
---
 .../Preprocessor/predefined-arch-macros.c     | 312 +++++++++---------
 llvm/lib/Target/X86/X86.td                    |  22 +-
 llvm/lib/TargetParser/X86TargetParser.cpp     |  16 +-
 3 files changed, 182 insertions(+), 168 deletions(-)

diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index 9dfeddbd4d5a..86d51820ae5b 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -2556,209 +2556,211 @@
 
 // RUN: %clang -march=sierraforest -m32 -E -dM %s -o - 2>&1 \
 // RUN:     --target=i386 \
-// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SRF_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_SRF_M32
 // RUN: %clang -march=grandridge -m32 -E -dM %s -o - 2>&1 \
 // RUN:     --target=i386 \
-// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_SRF_M32
 // RUN: %clang -march=arrowlake -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SRF_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ARL_M32
 // RUN: %clang -march=arrowlake-s -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32
 // RUN: %clang -march=lunarlake -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32
 // RUN: %clang -march=pantherlake -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_PTL_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_PTL_M32
 // RUN: %clang -march=clearwaterforest -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_PTL_M32,CHECK_CWF_M32
-// CHECK_SRF_M32: #define __ADX__ 1
-// CHECK_SRF_M32: #define __AES__ 1
-// CHECK_SRF_M32: #define __AVX2__ 1
-// CHECK_SRF_M32-NOT: AVX512
-// CHECK_SRF_M32: #define __AVXIFMA__ 1
-// CHECK_SRF_M32: #define __AVXNECONVERT__ 1
-// CHECK_SRF_M32-NOT: #define __AVXVNNIINT16__ 1
+// CHECK_ARL_M32: #define __ADX__ 1
+// CHECK_ARL_M32: #define __AES__ 1
+// CHECK_ARL_M32: #define __AVX2__ 1
+// CHECK_ARL_M32-NOT: AVX512
+// CHECK_ARL_M32: #define __AVXIFMA__ 1
+// CHECK_ARL_M32: #define __AVXNECONVERT__ 1
+// CHECK_ARL_M32-NOT: #define __AVXVNNIINT16__ 1
 // CHECK_ARLS_M32: #define __AVXVNNIINT16__ 1
-// CHECK_SRF_M32: #define __AVXVNNIINT8__ 1
-// CHECK_SRF_M32: #define __AVXVNNI__ 1
-// CHECK_SRF_M32: #define __AVX__ 1
-// CHECK_SRF_M32: #define __BMI2__ 1
-// CHECK_SRF_M32: #define __BMI__ 1
+// CHECK_ARL_M32: #define __AVXVNNIINT8__ 1
+// CHECK_ARL_M32: #define __AVXVNNI__ 1
+// CHECK_ARL_M32: #define __AVX__ 1
+// CHECK_ARL_M32: #define __BMI2__ 1
+// CHECK_ARL_M32: #define __BMI__ 1
+// CHECK_ARLS_M32-NOT: __CLDEMOTE__
 // CHECK_SRF_M32: #define __CLDEMOTE__ 1
-// CHECK_SRF_M32: #define __CLFLUSHOPT__ 1
-// CHECK_SRF_M32: #define __CLWB__ 1
-// CHECK_SRF_M32: #define __CMPCCXADD__ 1
-// CHECK_SRF_M32: #define __ENQCMD__ 1
-// CHECK_SRF_M32: #define __F16C__ 1
-// CHECK_SRF_M32: #define __FMA__ 1
-// CHECK_SRF_M32: #define __FSGSBASE__ 1
-// CHECK_SRF_M32: #define __FXSR__ 1
-// CHECK_SRF_M32: #define __GFNI__ 1
-// CHECK_SRF_M32: #define __HRESET__ 1
-// CHECK_SRF_M32: #define __INVPCID__ 1
-// CHECK_SRF_M32: #define __KL__ 1
-// CHECK_SRF_M32: #define __LZCNT__ 1
-// CHECK_SRF_M32: #define __MMX__ 1
-// CHECK_SRF_M32: #define __MOVBE__ 1
-// CHECK_SRF_M32: #define __MOVDIR64B__ 1
-// CHECK_SRF_M32: #define __MOVDIRI__ 1
-// CHECK_SRF_M32: #define __PCLMUL__ 1
-// CHECK_SRF_M32: #define __PCONFIG__ 1
-// CHECK_SRF_M32: #define __PKU__ 1
-// CHECK_SRF_M32: #define __POPCNT__ 1
-// CHECK_SRF_M32-NOT: #define __PREFETCHI__ 1
+// CHECK_ARL_M32: #define __CLFLUSHOPT__ 1
+// CHECK_ARL_M32: #define __CLWB__ 1
+// CHECK_ARL_M32: #define __CMPCCXADD__ 1
+// CHECK_ARL_M32: #define __ENQCMD__ 1
+// CHECK_ARL_M32: #define __F16C__ 1
+// CHECK_ARL_M32: #define __FMA__ 1
+// CHECK_ARL_M32: #define __FSGSBASE__ 1
+// CHECK_ARL_M32: #define __FXSR__ 1
+// CHECK_ARL_M32: #define __GFNI__ 1
+// CHECK_ARL_M32: #define __HRESET__ 1
+// CHECK_ARL_M32: #define __INVPCID__ 1
+// CHECK_ARL_M32: #define __KL__ 1
+// CHECK_ARL_M32: #define __LZCNT__ 1
+// CHECK_ARL_M32: #define __MMX__ 1
+// CHECK_ARL_M32: #define __MOVBE__ 1
+// CHECK_ARL_M32: #define __MOVDIR64B__ 1
+// CHECK_ARL_M32: #define __MOVDIRI__ 1
+// CHECK_ARL_M32: #define __PCLMUL__ 1
+// CHECK_ARL_M32: #define __PCONFIG__ 1
+// CHECK_ARL_M32: #define __PKU__ 1
+// CHECK_ARL_M32: #define __POPCNT__ 1
+// CHECK_ARL_M32-NOT: #define __PREFETCHI__ 1
 // CHECK_ARLS_M32-NOT: #define __PREFETCHI__ 1
 // CHECK_PTL_M32: #define __PREFETCHI__ 1
-// CHECK_SRF_M32: #define __PRFCHW__ 1
-// CHECK_SRF_M32: #define __PTWRITE__ 1
-// CHECK_SRF_M32-NOT: #define __RAOINT__ 1
-// CHECK_SRF_M32: #define __RDPID__ 1
-// CHECK_SRF_M32: #define __RDRND__ 1
-// CHECK_SRF_M32: #define __RDSEED__ 1
-// CHECK_SRF_M32: #define __SERIALIZE__ 1
-// CHECK_SRF_M32: #define __SGX__ 1
-// CHECK_SRF_M32-NOT: #define __SHA512__ 1
+// CHECK_ARL_M32: #define __PRFCHW__ 1
+// CHECK_ARL_M32: #define __PTWRITE__ 1
+// CHECK_ARL_M32-NOT: #define __RAOINT__ 1
+// CHECK_ARL_M32: #define __RDPID__ 1
+// CHECK_ARL_M32: #define __RDRND__ 1
+// CHECK_ARL_M32: #define __RDSEED__ 1
+// CHECK_ARL_M32: #define __SERIALIZE__ 1
+// CHECK_ARL_M32: #define __SGX__ 1
+// CHECK_ARL_M32-NOT: #define __SHA512__ 1
 // CHECK_ARLS_M32: #define __SHA512__ 1
-// CHECK_SRF_M32: #define __SHA__ 1
-// CHECK_SRF_M32: #define __SHSTK__ 1
-// CHECK_SRF_M32-NOT: #define __SM3__ 1
+// CHECK_ARL_M32: #define __SHA__ 1
+// CHECK_ARL_M32: #define __SHSTK__ 1
+// CHECK_ARL_M32-NOT: #define __SM3__ 1
 // CHECK_ARLS_M32: #define __SM3__ 1
-// CHECK_SRF_M32-NOT: #define __SM4__ 1
+// CHECK_ARL_M32-NOT: #define __SM4__ 1
 // CHECK_ARLS_M32: #define __SM4__ 1
-// CHECK_SRF_M32: #define __SSE2__ 1
-// CHECK_SRF_M32: #define __SSE3__ 1
-// CHECK_SRF_M32: #define __SSE4_1__ 1
-// CHECK_SRF_M32: #define __SSE4_2__ 1
-// CHECK_SRF_M32: #define __SSE_MATH__ 1
-// CHECK_SRF_M32: #define __SSE__ 1
-// CHECK_SRF_M32: #define __SSSE3__ 1
-// CHECK_SRF_M32: #define __UINTR__ 1
-// CHECK_SRF_M32-NOT: #define __USERMSR__ 1
+// CHECK_ARL_M32: #define __SSE2__ 1
+// CHECK_ARL_M32: #define __SSE3__ 1
+// CHECK_ARL_M32: #define __SSE4_1__ 1
+// CHECK_ARL_M32: #define __SSE4_2__ 1
+// CHECK_ARL_M32: #define __SSE_MATH__ 1
+// CHECK_ARL_M32: #define __SSE__ 1
+// CHECK_ARL_M32: #define __SSSE3__ 1
+// CHECK_ARL_M32: #define __UINTR__ 1
+// CHECK_ARL_M32-NOT: #define __USERMSR__ 1
 // CHECK_ARLS_M32-NOT: #define __USERMSR__ 1
 // CHECK_PTL_M32-NOT: #define __USERMSR__ 1
 // CHECK_CWF_M32: #define __USERMSR__ 1
-// CHECK_SRF_M32: #define __VAES__ 1
-// CHECK_SRF_M32: #define __VPCLMULQDQ__ 1
-// CHECK_SRF_M32: #define __WAITPKG__ 1
-// CHECK_SRF_M32: #define __WIDEKL__ 1
-// CHECK_SRF_M32: #define __XSAVEC__ 1
-// CHECK_SRF_M32: #define __XSAVEOPT__ 1
-// CHECK_SRF_M32: #define __XSAVES__ 1
-// CHECK_SRF_M32: #define __XSAVE__ 1
-// CHECK_SRF_M32: #define __corei7 1
-// CHECK_SRF_M32: #define __corei7__ 1
-// CHECK_SRF_M32: #define __i386 1
-// CHECK_SRF_M32: #define __i386__ 1
-// CHECK_SRF_M32: #define __tune_corei7__ 1
-// CHECK_SRF_M32: #define i386 1
+// CHECK_ARL_M32: #define __VAES__ 1
+// CHECK_ARL_M32: #define __VPCLMULQDQ__ 1
+// CHECK_ARL_M32: #define __WAITPKG__ 1
+// CHECK_ARL_M32: #define __WIDEKL__ 1
+// CHECK_ARL_M32: #define __XSAVEC__ 1
+// CHECK_ARL_M32: #define __XSAVEOPT__ 1
+// CHECK_ARL_M32: #define __XSAVES__ 1
+// CHECK_ARL_M32: #define __XSAVE__ 1
+// CHECK_ARL_M32: #define __corei7 1
+// CHECK_ARL_M32: #define __corei7__ 1
+// CHECK_ARL_M32: #define __i386 1
+// CHECK_ARL_M32: #define __i386__ 1
+// CHECK_ARL_M32: #define __tune_corei7__ 1
+// CHECK_ARL_M32: #define i386 1
 
 // RUN: %clang -march=sierraforest -m64 -E -dM %s -o - 2>&1 \
 // RUN:     --target=i386 \
-// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SRF_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64
 // RUN: %clang -march=grandridge -m64 -E -dM %s -o - 2>&1 \
 // RUN:     --target=i386 \
-// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64
 // RUN: %clang -march=arrowlake -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SRF_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ARL_M64
 // RUN: %clang -march=arrowlake-s -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M64,CHECK_ARLS_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64
 // RUN: %clang -march=lunarlake -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M64,CHECK_ARLS_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64
 // RUN: %clang -march=pantherlake -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_PTL_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_PTL_M64
 // RUN: %clang -march=clearwaterforest -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_PTL_M64,CHECK_CWF_M64
-// CHECK_SRF_M64: #define __ADX__ 1
-// CHECK_SRF_M64: #define __AES__ 1
-// CHECK_SRF_M64: #define __AVX2__ 1
-// CHECK_SRF_M64-NOT: AVX512
-// CHECK_SRF_M64: #define __AVXIFMA__ 1
-// CHECK_SRF_M64: #define __AVXNECONVERT__ 1
-// CHECK_SRF_M64-NOT: #define __AVXVNNIINT16__ 1
+// RUN:   | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_PTL_M64,CHECK_CWF_M64
+// CHECK_ARL_M64: #define __ADX__ 1
+// CHECK_ARL_M64: #define __AES__ 1
+// CHECK_ARL_M64: #define __AVX2__ 1
+// CHECK_ARL_M64-NOT: AVX512
+// CHECK_ARL_M64: #define __AVXIFMA__ 1
+// CHECK_ARL_M64: #define __AVXNECONVERT__ 1
+// CHECK_ARL_M64-NOT: #define __AVXVNNIINT16__ 1
 // CHECK_ARLS_M64: #define __AVXVNNIINT16__ 1
-// CHECK_SRF_M64: #define __AVXVNNIINT8__ 1
-// CHECK_SRF_M64: #define __AVXVNNI__ 1
-// CHECK_SRF_M64: #define __AVX__ 1
-// CHECK_SRF_M64: #define __BMI2__ 1
-// CHECK_SRF_M64: #define __BMI__ 1
+// CHECK_ARL_M64: #define __AVXVNNIINT8__ 1
+// CHECK_ARL_M64: #define __AVXVNNI__ 1
+// CHECK_ARL_M64: #define __AVX__ 1
+// CHECK_ARL_M64: #define __BMI2__ 1
+// CHECK_ARL_M64: #define __BMI__ 1
+// CHECK_ARLS_M64-NOT: __CLDEMOTE__
 // CHECK_SRF_M64: #define __CLDEMOTE__ 1
-// CHECK_SRF_M64: #define __CLFLUSHOPT__ 1
-// CHECK_SRF_M64: #define __CLWB__ 1
-// CHECK_SRF_M64: #define __CMPCCXADD__ 1
-// CHECK_SRF_M64: #define __ENQCMD__ 1
-// CHECK_SRF_M64: #define __F16C__ 1
-// CHECK_SRF_M64: #define __FMA__ 1
-// CHECK_SRF_M64: #define __FSGSBASE__ 1
-// CHECK_SRF_M64: #define __FXSR__ 1
-// CHECK_SRF_M64: #define __GFNI__ 1
-// CHECK_SRF_M64: #define __HRESET__ 1
-// CHECK_SRF_M64: #define __INVPCID__ 1
-// CHECK_SRF_M64: #define __KL__ 1
-// CHECK_SRF_M64: #define __LZCNT__ 1
-// CHECK_SRF_M64: #define __MMX__ 1
-// CHECK_SRF_M64: #define __MOVBE__ 1
-// CHECK_SRF_M64: #define __MOVDIR64B__ 1
-// CHECK_SRF_M64: #define __MOVDIRI__ 1
-// CHECK_SRF_M64: #define __PCLMUL__ 1
-// CHECK_SRF_M64: #define __PCONFIG__ 1
-// CHECK_SRF_M64: #define __PKU__ 1
-// CHECK_SRF_M64: #define __POPCNT__ 1
-// CHECK_SRF_M64-NOT: #define __PREFETCHI__ 1
+// CHECK_ARL_M64: #define __CLFLUSHOPT__ 1
+// CHECK_ARL_M64: #define __CLWB__ 1
+// CHECK_ARL_M64: #define __CMPCCXADD__ 1
+// CHECK_ARL_M64: #define __ENQCMD__ 1
+// CHECK_ARL_M64: #define __F16C__ 1
+// CHECK_ARL_M64: #define __FMA__ 1
+// CHECK_ARL_M64: #define __FSGSBASE__ 1
+// CHECK_ARL_M64: #define __FXSR__ 1
+// CHECK_ARL_M64: #define __GFNI__ 1
+// CHECK_ARL_M64: #define __HRESET__ 1
+// CHECK_ARL_M64: #define __INVPCID__ 1
+// CHECK_ARL_M64: #define __KL__ 1
+// CHECK_ARL_M64: #define __LZCNT__ 1
+// CHECK_ARL_M64: #define __MMX__ 1
+// CHECK_ARL_M64: #define __MOVBE__ 1
+// CHECK_ARL_M64: #define __MOVDIR64B__ 1
+// CHECK_ARL_M64: #define __MOVDIRI__ 1
+// CHECK_ARL_M64: #define __PCLMUL__ 1
+// CHECK_ARL_M64: #define __PCONFIG__ 1
+// CHECK_ARL_M64: #define __PKU__ 1
+// CHECK_ARL_M64: #define __POPCNT__ 1
+// CHECK_ARL_M64-NOT: #define __PREFETCHI__ 1
 // CHECK_ARLS_M64-NOT: #define __PREFETCHI__ 1
 // CHECK_PTL_M64: #define __PREFETCHI__ 1
-// CHECK_SRF_M64: #define __PRFCHW__ 1
-// CHECK_SRF_M64: #define __PTWRITE__ 1
-// CHECK_SRF_M64-NOT: #define __RAOINT__ 1
-// CHECK_SRF_M64: #define __RDPID__ 1
-// CHECK_SRF_M64: #define __RDRND__ 1
-// CHECK_SRF_M64: #define __RDSEED__ 1
-// CHECK_SRF_M64: #define __SERIALIZE__ 1
-// CHECK_SRF_M64: #define __SGX__ 1
-// CHECK_SRF_M64-NOT: #define __SHA512__ 1
+// CHECK_ARL_M64: #define __PRFCHW__ 1
+// CHECK_ARL_M64: #define __PTWRITE__ 1
+// CHECK_ARL_M64-NOT: #define __RAOINT__ 1
+// CHECK_ARL_M64: #define __RDPID__ 1
+// CHECK_ARL_M64: #define __RDRND__ 1
+// CHECK_ARL_M64: #define __RDSEED__ 1
+// CHECK_ARL_M64: #define __SERIALIZE__ 1
+// CHECK_ARL_M64: #define __SGX__ 1
+// CHECK_ARL_M64-NOT: #define __SHA512__ 1
 // CHECK_ARLS_M64: #define __SHA512__ 1
-// CHECK_SRF_M64: #define __SHA__ 1
-// CHECK_SRF_M64: #define __SHSTK__ 1
-// CHECK_SRF_M64-NOT: #define __SM3__ 1
+// CHECK_ARL_M64: #define __SHA__ 1
+// CHECK_ARL_M64: #define __SHSTK__ 1
+// CHECK_ARL_M64-NOT: #define __SM3__ 1
 // CHECK_ARLS_M64: #define __SM3__ 1
-// CHECK_SRF_M64-NOT: #define __SM4__ 1
+// CHECK_ARL_M64-NOT: #define __SM4__ 1
 // CHECK_ARLS_M64: #define __SM4__ 1
-// CHECK_SRF_M64: #define __SSE2_MATH__ 1
-// CHECK_SRF_M64: #define __SSE2__ 1
-// CHECK_SRF_M64: #define __SSE3__ 1
-// CHECK_SRF_M64: #define __SSE4_1__ 1
-// CHECK_SRF_M64: #define __SSE4_2__ 1
-// CHECK_SRF_M64: #define __SSE_MATH__ 1
-// CHECK_SRF_M64: #define __SSE__ 1
-// CHECK_SRF_M64: #define __SSSE3__ 1
-// CHECK_SRF_M64: #define __UINTR__ 1
-// CHECK_SRF_M64-NOT: #define __USERMSR__ 1
+// CHECK_ARL_M64: #define __SSE2_MATH__ 1
+// CHECK_ARL_M64: #define __SSE2__ 1
+// CHECK_ARL_M64: #define __SSE3__ 1
+// CHECK_ARL_M64: #define __SSE4_1__ 1
+// CHECK_ARL_M64: #define __SSE4_2__ 1
+// CHECK_ARL_M64: #define __SSE_MATH__ 1
+// CHECK_ARL_M64: #define __SSE__ 1
+// CHECK_ARL_M64: #define __SSSE3__ 1
+// CHECK_ARL_M64: #define __UINTR__ 1
+// CHECK_ARL_M64-NOT: #define __USERMSR__ 1
 // CHECK_ARLS_M64-NOT: #define __USERMSR__ 1
 // CHECK_PTL_M64-NOT: #define __USERMSR__ 1
 // CHECK_CWF_M64: #define __USERMSR__ 1
-// CHECK_SRF_M64: #define __VAES__ 1
-// CHECK_SRF_M64: #define __VPCLMULQDQ__ 1
-// CHECK_SRF_M64: #define __WAITPKG__ 1
-// CHECK_SRF_M64: #define __WIDEKL__ 1
-// CHECK_SRF_M64: #define __XSAVEC__ 1
-// CHECK_SRF_M64: #define __XSAVEOPT__ 1
-// CHECK_SRF_M64: #define __XSAVES__ 1
-// CHECK_SRF_M64: #define __XSAVE__ 1
-// CHECK_SRF_M64: #define __amd64 1
-// CHECK_SRF_M64: #define __amd64__ 1
-// CHECK_SRF_M64: #define __corei7 1
-// CHECK_SRF_M64: #define __corei7__ 1
-// CHECK_SRF_M64: #define __tune_corei7__ 1
-// CHECK_SRF_M64: #define __x86_64 1
-// CHECK_SRF_M64: #define __x86_64__ 1
+// CHECK_ARL_M64: #define __VAES__ 1
+// CHECK_ARL_M64: #define __VPCLMULQDQ__ 1
+// CHECK_ARL_M64: #define __WAITPKG__ 1
+// CHECK_ARL_M64: #define __WIDEKL__ 1
+// CHECK_ARL_M64: #define __XSAVEC__ 1
+// CHECK_ARL_M64: #define __XSAVEOPT__ 1
+// CHECK_ARL_M64: #define __XSAVES__ 1
+// CHECK_ARL_M64: #define __XSAVE__ 1
+// CHECK_ARL_M64: #define __amd64 1
+// CHECK_ARL_M64: #define __amd64__ 1
+// CHECK_ARL_M64: #define __corei7 1
+// CHECK_ARL_M64: #define __corei7__ 1
+// CHECK_ARL_M64: #define __tune_corei7__ 1
+// CHECK_ARL_M64: #define __x86_64 1
+// CHECK_ARL_M64: #define __x86_64__ 1
 
 // RUN: %clang -march=geode -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index b09891652ad9..4d0bfbef794c 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1304,16 +1304,20 @@ def ProcessorFeatures {
                                       TuningPOPCNTFalseDeps,
                                       TuningInsertVZEROUPPER];
 
-  // Sierraforest
-  list<SubtargetFeature> SRFAdditionalFeatures = [FeatureCMPCCXADD,
+  // Arrowlake
+  list<SubtargetFeature> ARLAdditionalFeatures = [FeatureCMPCCXADD,
                                                   FeatureAVXIFMA,
                                                   FeatureAVXNECONVERT,
                                                   FeatureENQCMD,
                                                   FeatureUINTR,
-                                                  FeatureCLDEMOTE,
                                                   FeatureAVXVNNIINT8];
+  list<SubtargetFeature> ARLFeatures =
+    !listconcat(ADLFeatures, ARLAdditionalFeatures);
+
+  // Sierraforest
+  list<SubtargetFeature> SRFAdditionalFeatures = [FeatureCLDEMOTE];
   list<SubtargetFeature> SRFFeatures =
-    !listconcat(ADLFeatures, SRFAdditionalFeatures);
+    !listconcat(ARLFeatures, SRFAdditionalFeatures);
 
   // Arrowlake S
   list<SubtargetFeature> ARLSAdditionalFeatures = [FeatureAVXVNNIINT16,
@@ -1321,7 +1325,7 @@ def ProcessorFeatures {
                                                    FeatureSM3,
                                                    FeatureSM4];
   list<SubtargetFeature> ARLSFeatures =
-    !listconcat(SRFFeatures, ARLSAdditionalFeatures);
+    !listconcat(ARLFeatures, ARLSAdditionalFeatures);
 
   // Pantherlake
   list<SubtargetFeature> PTLAdditionalFeatures = [FeaturePREFETCHI];
@@ -1331,9 +1335,13 @@ def ProcessorFeatures {
 
   // Clearwaterforest
   list<SubtargetFeature> CWFAdditionalFeatures = [FeaturePREFETCHI,
+                                                  FeatureAVXVNNIINT16,
+                                                  FeatureSHA512,
+                                                  FeatureSM3,
+                                                  FeatureSM4,
                                                   FeatureUSERMSR];
   list<SubtargetFeature> CWFFeatures =
-    !listconcat(ARLSFeatures, CWFAdditionalFeatures);
+    !listconcat(SRFFeatures, CWFAdditionalFeatures);
 
   // Knights Landing
   list<SubtargetFeature> KNLFeatures = [FeatureX87,
@@ -1861,7 +1869,7 @@ def : ProcModel<"raptorlake", AlderlakePModel,
 def : ProcModel<"meteorlake", AlderlakePModel,
                 ProcessorFeatures.ADLFeatures, ProcessorFeatures.ADLTuning>;
 def : ProcModel<"arrowlake", AlderlakePModel,
-                ProcessorFeatures.SRFFeatures, ProcessorFeatures.ADLTuning>;
+                ProcessorFeatures.ARLFeatures, ProcessorFeatures.ADLTuning>;
 foreach P = ["arrowlake-s", "arrowlake_s", "lunarlake"] in {
 def : ProcModel<P, AlderlakePModel,
                 ProcessorFeatures.ARLSFeatures, ProcessorFeatures.ADLTuning>;
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 4947b05cd037..94812e4e60c3 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -167,15 +167,19 @@ constexpr FeatureBitset FeaturesAlderlake =
     FeatureSERIALIZE | FeatureSHSTK | FeatureVAES | FeatureVPCLMULQDQ |
     FeatureMOVDIR64B | FeatureMOVDIRI | FeatureWAITPKG | FeatureAVXVNNI |
     FeatureHRESET | FeatureWIDEKL;
-constexpr FeatureBitset FeaturesSierraforest =
+constexpr FeatureBitset FeaturesArrowlake =
     FeaturesAlderlake | FeatureCMPCCXADD | FeatureAVXIFMA | FeatureUINTR |
-    FeatureCLDEMOTE | FeatureENQCMD | FeatureAVXNECONVERT | FeatureAVXVNNIINT8;
-constexpr FeatureBitset FeaturesArrowlakeS = FeaturesSierraforest |
-    FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4;
+    FeatureENQCMD | FeatureAVXNECONVERT | FeatureAVXVNNIINT8;
+constexpr FeatureBitset FeaturesSierraforest =
+    FeaturesArrowlake | FeatureCLDEMOTE;
+constexpr FeatureBitset FeaturesArrowlakeS =
+    FeaturesArrowlake | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 |
+    FeatureSM4;
 constexpr FeatureBitset FeaturesPantherlake =
     FeaturesArrowlakeS | FeaturePREFETCHI;
 constexpr FeatureBitset FeaturesClearwaterforest =
-    FeaturesArrowlakeS | FeatureUSERMSR | FeaturePREFETCHI;
+    FeaturesSierraforest | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 |
+    FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR;
 
 // Geode Processor.
 constexpr FeatureBitset FeaturesGeode =
@@ -367,7 +371,7 @@ constexpr ProcInfo Processors[] = {
   // Meteorlake microarchitecture based processors.
   { {"meteorlake"}, CK_Meteorlake, FEATURE_AVX2, FeaturesAlderlake, 'p', false },
   // Arrowlake microarchitecture based processors.
-  { {"arrowlake"}, CK_Arrowlake, FEATURE_AVX2, FeaturesSierraforest, 'p', false },
+  { {"arrowlake"}, CK_Arrowlake, FEATURE_AVX2, FeaturesArrowlake, 'p', false },
   { {"arrowlake-s"}, CK_ArrowlakeS, FEATURE_AVX2, FeaturesArrowlakeS, '\0', false },
   { {"arrowlake_s"}, CK_ArrowlakeS, FEATURE_AVX2, FeaturesArrowlakeS, 'p', true },
   // Lunarlake microarchitecture based processors.

From a05393a879b2950fccca66ff0e1b6c70c39838e4 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Fri, 20 Jun 2025 09:06:57 +0800
Subject: [PATCH 0992/1322] [RISCV] Add symbol parsing support for XAndesPerf
 branch instructions (#137748)

This patch adds support for parsing symbols in the XAndesPerf branch
immediate instructions. The branch immediate instructions use
`R_RISCV_NDS_BRANCH_10` relocation. It uses a 10-bit PC-relative branch
offset.
---
 .../ELFRelocs/RISCV_nonstandard.def           |  4 ++
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp |  6 +-
 .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp    | 21 +++++++
 .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h |  1 +
 .../MCTargetDesc/RISCVELFObjectWriter.cpp     |  2 +
 .../RISCV/MCTargetDesc/RISCVFixupKinds.h      |  4 ++
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp |  2 +
 llvm/lib/Target/RISCV/RISCVInstrFormats.td    | 57 ++++++++++---------
 llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 18 +++---
 llvm/test/MC/RISCV/custom_reloc.s             | 13 +++++
 llvm/test/MC/RISCV/vendor-symbol.s            | 19 ++++++-
 .../MC/RISCV/xandesperf-fixups-diagnostics.s  | 13 +++++
 llvm/test/MC/RISCV/xandesperf-relocation.s    | 36 ++++++++++++
 13 files changed, 153 insertions(+), 43 deletions(-)
 create mode 100644 llvm/test/MC/RISCV/xandesperf-fixups-diagnostics.s
 create mode 100644 llvm/test/MC/RISCV/xandesperf-relocation.s

diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
index b02462ca89fd..037ca6438733 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
@@ -26,3 +26,7 @@ ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_ABS20_U,    192)
 ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_BRANCH,   193)
 ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_32,       194)
 ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_CALL_PLT, 195)
+
+// Andes Nonstandard Relocations
+// Calculation: S + A - P (10-bit PC-relative branch offset)
+ELF_RISCV_NONSTANDARD_RELOC(ANDES, R_RISCV_NDS_BRANCH_10,    241)
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index f1d6f99ba981..45946d3efe32 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -858,10 +858,6 @@ public:
     return SignExtend64<32>(Imm);
   }
 
-  bool isSImm11Lsb0() const {
-    return isSImmPred([](int64_t Imm) { return isShiftedInt<10, 1>(Imm); });
-  }
-
   bool isSImm12() const {
     if (!isImm())
       return false;
@@ -1548,7 +1544,7 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   case Match_InvalidSImm11:
     return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 10),
                                       (1 << 10) - 1);
-  case Match_InvalidSImm11Lsb0:
+  case Match_InvalidBareSImm11Lsb0:
     return generateImmOutOfRangeError(
         Operands, ErrorInfo, -(1 << 10), (1 << 10) - 2,
         "immediate must be a multiple of 2 bytes in the range");
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 9161f23c8a95..186296944efd 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -86,6 +86,9 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
       {"fixup_riscv_qc_e_32", 16, 32, 0},
       {"fixup_riscv_qc_abs20_u", 12, 20, 0},
       {"fixup_riscv_qc_e_call_plt", 0, 48, MCFixupKindInfo::FKF_IsPCRel},
+
+      // Andes fixups
+      {"fixup_riscv_nds_branch_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
   };
   static_assert((std::size(Infos)) == RISCV::NumTargetFixupKinds,
                 "Not all fixup kinds added to Infos array");
@@ -567,6 +570,21 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
             (Bit15_13 << 17) | (Bit4_1 << 8) | (Bit11 << 7);
     return Value;
   }
+  case RISCV::fixup_riscv_nds_branch_10: {
+    if (!isInt<11>(Value))
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0x1)
+      Ctx.reportError(Fixup.getLoc(), "fixup value must be 2-byte aligned");
+    // Need to extract imm[10], imm[9:5], imm[4:1] from the 11-bit Value.
+    unsigned Sbit = (Value >> 10) & 0x1;
+    unsigned Hi5 = (Value >> 5) & 0x1f;
+    unsigned Lo4 = (Value >> 1) & 0xf;
+    // Inst{31} = Sbit;
+    // Inst{29-25} = Hi5;
+    // Inst{11-8} = Lo4;
+    Value = (Sbit << 31) | (Hi5 << 25) | (Lo4 << 8);
+    return Value;
+  }
   }
 }
 
@@ -702,6 +720,9 @@ void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F,
   case RISCV::fixup_riscv_qc_e_call_plt:
     VendorIdentifier = "QUALCOMM";
     break;
+  case RISCV::fixup_riscv_nds_branch_10:
+    VendorIdentifier = "ANDES";
+    break;
   }
 
   // Create a local symbol for the vendor relocation to reference. It's fine if
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 6ef94fb5e93d..3d304842fac1 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -56,6 +56,7 @@ enum {
   InstFormatQC_EB = 24,
   InstFormatQC_EJ = 25,
   InstFormatQC_ES = 26,
+  InstFormatNDS_BRANCH_10 = 27,
   InstFormatOther = 31,
 
   InstFormatMask = 31,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index 3c1f9450a099..8ab2c56ae317 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -103,6 +103,8 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_RISCV_QC_E_BRANCH;
     case RISCV::fixup_riscv_qc_e_call_plt:
       return ELF::R_RISCV_QC_E_CALL_PLT;
+    case RISCV::fixup_riscv_nds_branch_10:
+      return ELF::R_RISCV_NDS_BRANCH_10;
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index 8d869a64cde4..b5c23772e6d8 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -56,6 +56,10 @@ enum Fixups {
   // 32-bit fixup for symbol references in the 48-bit qc.j/qc.jal instructions
   fixup_riscv_qc_e_call_plt,
 
+  // Andes specific fixups
+  // 10-bit fixup for symbol references in the xandesperf branch instruction
+  fixup_riscv_nds_branch_10,
+
   // Used as a sentinel, must be the last
   fixup_riscv_invalid,
   NumTargetFixupKinds = fixup_riscv_invalid - FirstTargetFixupKind
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 2a90552037f9..b50913be9922 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -647,6 +647,8 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
     } else if (MIFrm == RISCVII::InstFormatQC_EJ) {
       FixupKind = RISCV::fixup_riscv_qc_e_call_plt;
       RelaxCandidate = true;
+    } else if (MIFrm == RISCVII::InstFormatNDS_BRANCH_10) {
+      FixupKind = RISCV::fixup_riscv_nds_branch_10;
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 088a6923fadb..b6b64b57b1b3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -30,34 +30,35 @@
 class InstFormat<bits<5> val> {
   bits<5> Value = val;
 }
-def InstFormatPseudo : InstFormat<0>;
-def InstFormatR      : InstFormat<1>;
-def InstFormatR4     : InstFormat<2>;
-def InstFormatI      : InstFormat<3>;
-def InstFormatS      : InstFormat<4>;
-def InstFormatB      : InstFormat<5>;
-def InstFormatU      : InstFormat<6>;
-def InstFormatJ      : InstFormat<7>;
-def InstFormatCR     : InstFormat<8>;
-def InstFormatCI     : InstFormat<9>;
-def InstFormatCSS    : InstFormat<10>;
-def InstFormatCIW    : InstFormat<11>;
-def InstFormatCL     : InstFormat<12>;
-def InstFormatCS     : InstFormat<13>;
-def InstFormatCA     : InstFormat<14>;
-def InstFormatCB     : InstFormat<15>;
-def InstFormatCJ     : InstFormat<16>;
-def InstFormatCU     : InstFormat<17>;
-def InstFormatCLB    : InstFormat<18>;
-def InstFormatCLH    : InstFormat<19>;
-def InstFormatCSB    : InstFormat<20>;
-def InstFormatCSH    : InstFormat<21>;
-def InstFormatQC_EAI : InstFormat<22>;
-def InstFormatQC_EI  : InstFormat<23>;
-def InstFormatQC_EB  : InstFormat<24>;
-def InstFormatQC_EJ  : InstFormat<25>;
-def InstFormatQC_ES  : InstFormat<26>;
-def InstFormatOther  : InstFormat<31>;
+def InstFormatPseudo        : InstFormat<0>;
+def InstFormatR             : InstFormat<1>;
+def InstFormatR4            : InstFormat<2>;
+def InstFormatI             : InstFormat<3>;
+def InstFormatS             : InstFormat<4>;
+def InstFormatB             : InstFormat<5>;
+def InstFormatU             : InstFormat<6>;
+def InstFormatJ             : InstFormat<7>;
+def InstFormatCR            : InstFormat<8>;
+def InstFormatCI            : InstFormat<9>;
+def InstFormatCSS           : InstFormat<10>;
+def InstFormatCIW           : InstFormat<11>;
+def InstFormatCL            : InstFormat<12>;
+def InstFormatCS            : InstFormat<13>;
+def InstFormatCA            : InstFormat<14>;
+def InstFormatCB            : InstFormat<15>;
+def InstFormatCJ            : InstFormat<16>;
+def InstFormatCU            : InstFormat<17>;
+def InstFormatCLB           : InstFormat<18>;
+def InstFormatCLH           : InstFormat<19>;
+def InstFormatCSB           : InstFormat<20>;
+def InstFormatCSH           : InstFormat<21>;
+def InstFormatQC_EAI        : InstFormat<22>;
+def InstFormatQC_EI         : InstFormat<23>;
+def InstFormatQC_EB         : InstFormat<24>;
+def InstFormatQC_EJ         : InstFormat<25>;
+def InstFormatQC_ES         : InstFormat<26>;
+def InstFormatNDS_BRANCH_10 : InstFormat<27>;
+def InstFormatOther         : InstFormat<31>;
 
 
 class RISCVVConstraint<bits<3> val> {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 3ba21e51e7c6..4b8d40d1429a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -15,16 +15,16 @@
 //===----------------------------------------------------------------------===//
 
 // A 11-bit signed immediate where the least significant bit is zero.
-def simm11_lsb0 : Operand<OtherVT> {
-  let ParserMatchClass = SImmAsmOperand<11, "Lsb0">;
+def bare_simm11_lsb0 : Operand<OtherVT> {
+  let ParserMatchClass = BareSImmNLsb0AsmOperand<11>;
   let PrintMethod = "printBranchOperand";
   let EncoderMethod = "getImmOpValueAsrN<1>";
   let DecoderMethod = "decodeSImmOperandAndLslN<11, 1>";
   let MCOperandPredicate = [{
     int64_t Imm;
-    if (!MCOp.evaluateAsConstantImm(Imm))
-      return false;
-    return isShiftedInt<10, 1>(Imm);
+    if (MCOp.evaluateAsConstantImm(Imm))
+      return isShiftedInt<10, 1>(Imm);
+    return MCOp.isBareSymbolRef();
   }];
   let OperandType = "OPERAND_PCREL";
 }
@@ -58,8 +58,8 @@ def simm20_lsb000 : Operand<XLenVT> {
 //===----------------------------------------------------------------------===//
 
 class NDSRVInstBB<bit cs, string opcodestr>
-    : RVInst<(outs), (ins GPR:$rs1, uimmlog2xlen:$cimm, simm11_lsb0:$imm10),
-             opcodestr, "$rs1, $cimm, $imm10", [], InstFormatOther>,
+    : RVInst<(outs), (ins GPR:$rs1, uimmlog2xlen:$cimm, bare_simm11_lsb0:$imm10),
+             opcodestr, "$rs1, $cimm, $imm10", [], InstFormatNDS_BRANCH_10>,
       Sched<[WriteJmp, ReadIALU]> {
   bits<10> imm10;
   bits<5> rs1;
@@ -82,8 +82,8 @@ class NDSRVInstBB<bit cs, string opcodestr>
 }
 
 class NDSRVInstBC<bits<3> funct3, string opcodestr>
-    : RVInst<(outs), (ins GPR:$rs1, uimm7:$cimm, simm11_lsb0:$imm10),
-             opcodestr, "$rs1, $cimm, $imm10", [], InstFormatOther>,
+    : RVInst<(outs), (ins GPR:$rs1, uimm7:$cimm, bare_simm11_lsb0:$imm10),
+             opcodestr, "$rs1, $cimm, $imm10", [], InstFormatNDS_BRANCH_10>,
       Sched<[WriteJmp, ReadIALU]> {
   bits<10> imm10;
   bits<5> rs1;
diff --git a/llvm/test/MC/RISCV/custom_reloc.s b/llvm/test/MC/RISCV/custom_reloc.s
index cdb819467875..a68f71063ea9 100644
--- a/llvm/test/MC/RISCV/custom_reloc.s
+++ b/llvm/test/MC/RISCV/custom_reloc.s
@@ -48,6 +48,19 @@
   # CHECK-OBJ-NEXT: R_RISCV_VENDOR    QUALCOMM
   # CHECK-OBJ-NEXT: R_RISCV_CUSTOM192 my_bar+0x2
 
+  .reloc ., R_RISCV_VENDOR,     ANDES
+  .reloc ., R_RISCV_NDS_BRANCH_10, my_bar + 2
+  addi a1, a1, 0
+  # CHECK-ASM:      [[L3:.L[^:]+]]:
+  # CHECK-ASM-NEXT: .reloc [[L3]], R_RISCV_VENDOR, ANDES
+  # CHECK-ASM-NEXT: [[L4:.L[^:]+]]:
+  # CHECK-ASM-NEXT: .reloc [[L4]], R_RISCV_NDS_BRANCH_10, my_bar+2
+  # CHECK-ASM-NEXT: mv a1, a1
+
+  # CHECK-OBJ:      addi a1, a1, 0
+  # CHECK-OBJ-NEXT: R_RISCV_VENDOR    ANDES
+  # CHECK-OBJ-NEXT: R_RISCV_CUSTOM241 my_bar+0x2
+
   nop
   # CHECK-ASM: nop
   # CHECK-OBJ: addi zero, zero, 0x0
diff --git a/llvm/test/MC/RISCV/vendor-symbol.s b/llvm/test/MC/RISCV/vendor-symbol.s
index 7df3a3efeb64..9595f218d78f 100644
--- a/llvm/test/MC/RISCV/vendor-symbol.s
+++ b/llvm/test/MC/RISCV/vendor-symbol.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple riscv32 -mattr=+experimental-xqcibi %s \
+# RUN: llvm-mc -triple riscv32 -mattr=+experimental-xqcibi,+xandesperf %s \
 # RUN:     -filetype=obj -o - \
 # RUN:     | llvm-readelf -sr - \
 # RUN:     | FileCheck %s
@@ -18,6 +18,14 @@ QUALCOMM:
 
   qc.e.bgeui s0, 20, QUALCOMM
 
+  nds.bbc t0, 7, ANDES
+
+  .global ANDES
+ANDES:
+  nop
+
+  nds.bbs t0, 7, ANDES
+
 
 # CHECK-LABEL: Relocation section '.rela.text'
 ## Note the different values for the "Sym. Value" Field
@@ -25,11 +33,20 @@ QUALCOMM:
 # CHECK: R_RISCV_CUSTOM193 00000006 QUALCOMM + 0
 # CHECK: R_RISCV_VENDOR    00000000 QUALCOMM + 0
 # CHECK: R_RISCV_CUSTOM193 00000006 QUALCOMM + 0
+# CHECK: R_RISCV_VENDOR    00000000 ANDES + 0
+# CHECK: R_RISCV_CUSTOM241 00000014 ANDES + 0
+# CHECK: R_RISCV_VENDOR    00000000 ANDES + 0
+# CHECK: R_RISCV_CUSTOM241 00000014 ANDES + 0
 
 
 # CHECK-LABEL: Symbol table '.symtab'
 # CHECK-NOT: QUALCOMM
+# CHECK-NOT: ANDES
 # CHECK: 00000000 0 NOTYPE  LOCAL  DEFAULT ABS QUALCOMM
+# CHECK: 00000000 0 NOTYPE  LOCAL  DEFAULT ABS ANDES
 # CHECK-NOT: QUALCOMM
+# CHECK-NOT: ANDES
 # CHECK: 00000006 0 NOTYPE  GLOBAL DEFAULT   2 QUALCOMM
+# CHECK: 00000014 0 NOTYPE  GLOBAL DEFAULT   2 ANDES
 # CHECK-NOT: QUALCOMM
+# CHECK-NOT: ANDES
diff --git a/llvm/test/MC/RISCV/xandesperf-fixups-diagnostics.s b/llvm/test/MC/RISCV/xandesperf-fixups-diagnostics.s
new file mode 100644
index 000000000000..e52f8129129d
--- /dev/null
+++ b/llvm/test/MC/RISCV/xandesperf-fixups-diagnostics.s
@@ -0,0 +1,13 @@
+# RUN: not llvm-mc -triple riscv32 -filetype obj -mattr=+xandesperf < %s -o /dev/null 2>&1 | FileCheck %s
+
+  nds.bbc t0, 7, far_distant # CHECK: :[[@LINE]]:3: error: fixup value out of range
+  nds.bbc t0, 7, unaligned # CHECK: :[[@LINE]]:3: error: fixup value must be 2-byte aligned
+
+  .byte 0
+unaligned:
+  .byte 0
+  .byte 0
+  .byte 0
+
+  .space 1<<10
+far_distant:
diff --git a/llvm/test/MC/RISCV/xandesperf-relocation.s b/llvm/test/MC/RISCV/xandesperf-relocation.s
new file mode 100644
index 000000000000..4df75f04dbfe
--- /dev/null
+++ b/llvm/test/MC/RISCV/xandesperf-relocation.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc -triple riscv32 -mattr=+xandesperf -M no-aliases < %s -show-encoding \
+# RUN:     | FileCheck -check-prefix=ASM %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xandesperf < %s \
+# RUN:     | llvm-objdump -dr --mattr=+xandesperf - \
+# RUN:     | FileCheck -check-prefix=OBJ %s
+# RUN: llvm-mc -triple riscv64 -mattr=+xandesperf -M no-aliases < %s -show-encoding \
+# RUN:     | FileCheck -check-prefix=ASM %s
+# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+xandesperf < %s \
+# RUN:     | llvm-objdump -dr --mattr=+xandesperf - \
+# RUN:     | FileCheck -check-prefix=OBJ %s
+
+.long foo
+
+# ASM: nds.bbc t0, 7, foo
+# OBJ: nds.bbc t0, 0x7, 0x4 <.text+0x4>
+# OBJ-NEXT: R_RISCV_VENDOR ANDES{{$}}
+# OBJ-NEXT: R_RISCV_CUSTOM241 foo{{$}}
+nds.bbc t0, 7, foo
+
+# ASM: nds.bbs t0, 7, foo
+# OBJ-NEXT: nds.bbs t0, 0x7, 0x8 <.text+0x8>
+# OBJ-NEXT: R_RISCV_VENDOR ANDES{{$}}
+# OBJ-NEXT: R_RISCV_CUSTOM241 foo{{$}}
+nds.bbs t0, 7, foo
+
+# ASM: nds.beqc t0, 7, foo
+# OBJ-NEXT: nds.beqc t0, 0x7, 0xc <.text+0xc>
+# OBJ-NEXT: R_RISCV_VENDOR ANDES{{$}}
+# OBJ-NEXT: R_RISCV_CUSTOM241 foo{{$}}
+nds.beqc t0, 7, foo
+
+# ASM: nds.bnec t0, 7, foo
+# OBJ-NEXT: nds.bnec t0, 0x7, 0x10 <.text+0x10>
+# OBJ-NEXT: R_RISCV_VENDOR ANDES{{$}}
+# OBJ-NEXT: R_RISCV_CUSTOM241 foo{{$}}
+nds.bnec t0, 7, foo

From 513bcf6d012f7f9483af784de8487ee04cb9971a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 19 Jun 2025 18:38:58 -0700
Subject: [PATCH 0993/1322] [yaml2obj] Report error when the input filename
 does not exist

I invoked yaml2obj with a mistyped filename and received no error
message. I nearly thought the program had succeeded, but the shell's
exit code prompt tipped me off.

Pull Request: https://github.com/llvm/llvm-project/pull/144835
---
 llvm/test/tools/yaml2obj/basic.test | 4 ++++
 llvm/tools/yaml2obj/yaml2obj.cpp    | 9 ++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/tools/yaml2obj/basic.test

diff --git a/llvm/test/tools/yaml2obj/basic.test b/llvm/test/tools/yaml2obj/basic.test
new file mode 100644
index 000000000000..d673accfee88
--- /dev/null
+++ b/llvm/test/tools/yaml2obj/basic.test
@@ -0,0 +1,4 @@
+# Test case when the input file does not exist.
+RUN: not yaml2obj %t.blah 2>&1 | FileCheck -DMSG=%errc_ENOENT --check-prefix=ENOENT %s
+
+ENOENT: yaml2obj: error: {{.*}}.blah: [[MSG]]
diff --git a/llvm/tools/yaml2obj/yaml2obj.cpp b/llvm/tools/yaml2obj/yaml2obj.cpp
index 45aa3828311f..dcd6dfcd3de2 100644
--- a/llvm/tools/yaml2obj/yaml2obj.cpp
+++ b/llvm/tools/yaml2obj/yaml2obj.cpp
@@ -117,8 +117,9 @@ int main(int argc, char **argv) {
       argc, argv, "Create an object file from a YAML description", nullptr,
       nullptr, /*LongOptionsUseDoubleDash=*/true);
 
-  auto ErrHandler = [](const Twine &Msg) {
-    WithColor::error(errs(), "yaml2obj") << Msg << "\n";
+  constexpr StringRef ProgName = "yaml2obj";
+  auto ErrHandler = [&](const Twine &Msg) {
+    WithColor::error(errs(), ProgName) << Msg << "\n";
   };
 
   std::error_code EC;
@@ -131,8 +132,10 @@ int main(int argc, char **argv) {
 
   ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
       MemoryBuffer::getFileOrSTDIN(Input, /*IsText=*/true);
-  if (!Buf)
+  if (std::error_code EC = Buf.getError()) {
+    WithColor::error(errs(), ProgName) << Input << ": " << EC.message() << '\n';
     return 1;
+  }
 
   std::optional<std::string> Buffer =
       preprocess(Buf.get()->getBuffer(), ErrHandler);

From 4f991cc99523e4bb7a0d96cee9f5c3a64bf2bc8e Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 19 Jun 2025 20:48:07 -0500
Subject: [PATCH 0994/1322] [lldb-dap] Make connection URLs match lldb
 (#144770)

Use the same scheme as ConnectionFileDescriptor::Connect and use
"listen" and "accept". Addresses feedback from a Pavel in a different PR
[1].

[1] https://github.com/llvm/llvm-project/pull/143628#discussion_r2152225200
---
 lldb/include/lldb/Host/Socket.h               |  9 ++++++
 lldb/source/Host/common/Socket.cpp            | 32 +++++++++++++++++--
 .../tools/lldb-dap/server/TestDAP_server.py   |  6 ++--
 lldb/tools/lldb-dap/Options.td                |  4 +--
 lldb/tools/lldb-dap/src-ts/lldb-dap-server.ts |  2 +-
 lldb/tools/lldb-dap/tool/lldb-dap.cpp         | 29 +++++++++++------
 6 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/lldb/include/lldb/Host/Socket.h b/lldb/include/lldb/Host/Socket.h
index 4585eac12efb..c313aa4f6d26 100644
--- a/lldb/include/lldb/Host/Socket.h
+++ b/lldb/include/lldb/Host/Socket.h
@@ -74,6 +74,11 @@ public:
     ProtocolUnixAbstract
   };
 
+  enum SocketMode {
+    ModeAccept,
+    ModeConnect,
+  };
+
   struct HostAndPort {
     std::string hostname;
     uint16_t port;
@@ -83,6 +88,10 @@ public:
     }
   };
 
+  using ProtocolModePair = std::pair<SocketProtocol, SocketMode>;
+  static std::optional<ProtocolModePair>
+  GetProtocolAndMode(llvm::StringRef scheme);
+
   static const NativeSocket kInvalidSocketValue;
 
   ~Socket() override;
diff --git a/lldb/source/Host/common/Socket.cpp b/lldb/source/Host/common/Socket.cpp
index 76f74401ac4d..5c5cd653c3d9 100644
--- a/lldb/source/Host/common/Socket.cpp
+++ b/lldb/source/Host/common/Socket.cpp
@@ -271,7 +271,8 @@ Socket::UdpConnect(llvm::StringRef host_and_port) {
   return UDPSocket::CreateConnected(host_and_port);
 }
 
-llvm::Expected<Socket::HostAndPort> Socket::DecodeHostAndPort(llvm::StringRef host_and_port) {
+llvm::Expected<Socket::HostAndPort>
+Socket::DecodeHostAndPort(llvm::StringRef host_and_port) {
   static llvm::Regex g_regex("([^:]+|\\[[0-9a-fA-F:]+.*\\]):([0-9]+)");
   HostAndPort ret;
   llvm::SmallVector<llvm::StringRef, 3> matches;
@@ -347,8 +348,8 @@ Status Socket::Write(const void *buf, size_t &num_bytes) {
               ", src = %p, src_len = %" PRIu64 ", flags = 0) => %" PRIi64
               " (error = %s)",
               static_cast<void *>(this), static_cast<uint64_t>(m_socket), buf,
-              static_cast<uint64_t>(src_len),
-              static_cast<int64_t>(bytes_sent), error.AsCString());
+              static_cast<uint64_t>(src_len), static_cast<int64_t>(bytes_sent),
+              error.AsCString());
   }
 
   return error;
@@ -476,3 +477,28 @@ llvm::raw_ostream &lldb_private::operator<<(llvm::raw_ostream &OS,
                                             const Socket::HostAndPort &HP) {
   return OS << '[' << HP.hostname << ']' << ':' << HP.port;
 }
+
+std::optional<Socket::ProtocolModePair>
+Socket::GetProtocolAndMode(llvm::StringRef scheme) {
+  // Keep in sync with ConnectionFileDescriptor::Connect.
+  return llvm::StringSwitch<std::optional<ProtocolModePair>>(scheme)
+      .Case("listen", ProtocolModePair{SocketProtocol::ProtocolTcp,
+                                       SocketMode::ModeAccept})
+      .Cases("accept", "unix-accept",
+             ProtocolModePair{SocketProtocol::ProtocolUnixDomain,
+                              SocketMode::ModeAccept})
+      .Case("unix-abstract-accept",
+            ProtocolModePair{SocketProtocol::ProtocolUnixAbstract,
+                             SocketMode::ModeAccept})
+      .Cases("connect", "tcp-connect",
+             ProtocolModePair{SocketProtocol::ProtocolTcp,
+                              SocketMode::ModeConnect})
+      .Case("udp", ProtocolModePair{SocketProtocol::ProtocolTcp,
+                                    SocketMode::ModeConnect})
+      .Case("unix-connect", ProtocolModePair{SocketProtocol::ProtocolUnixDomain,
+                                             SocketMode::ModeConnect})
+      .Case("unix-abstract-connect",
+            ProtocolModePair{SocketProtocol::ProtocolUnixAbstract,
+                             SocketMode::ModeConnect})
+      .Default(std::nullopt);
+}
diff --git a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py
index ed17044a220d..592a4cfb0a88 100644
--- a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py
+++ b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py
@@ -54,7 +54,7 @@ class TestDAP_server(lldbdap_testcase.DAPTestCaseBase):
         Test launching a binary with a lldb-dap in server mode on a specific port.
         """
         self.build()
-        (_, connection) = self.start_server(connection="tcp://localhost:0")
+        (_, connection) = self.start_server(connection="listen://localhost:0")
         self.run_debug_session(connection, "Alice")
         self.run_debug_session(connection, "Bob")
 
@@ -72,7 +72,7 @@ class TestDAP_server(lldbdap_testcase.DAPTestCaseBase):
         self.addTearDownHook(cleanup)
 
         self.build()
-        (_, connection) = self.start_server(connection="unix://" + name)
+        (_, connection) = self.start_server(connection="accept://" + name)
         self.run_debug_session(connection, "Alice")
         self.run_debug_session(connection, "Bob")
 
@@ -82,7 +82,7 @@ class TestDAP_server(lldbdap_testcase.DAPTestCaseBase):
         Test launching a binary with lldb-dap in server mode and shutting down the server while the debug session is still active.
         """
         self.build()
-        (process, connection) = self.start_server(connection="tcp://localhost:0")
+        (process, connection) = self.start_server(connection="listen://localhost:0")
         self.dap_server = dap_server.DebugAdapterServer(
             connection=connection,
         )
diff --git a/lldb/tools/lldb-dap/Options.td b/lldb/tools/lldb-dap/Options.td
index aecf91797ac7..867753e9294a 100644
--- a/lldb/tools/lldb-dap/Options.td
+++ b/lldb/tools/lldb-dap/Options.td
@@ -28,8 +28,8 @@ def connection
       MetaVarName<"<connection>">,
       HelpText<
           "Communicate with the lldb-dap tool over the specified connection. "
-          "Connections are specified like 'tcp://[host]:port' or "
-          "'unix:///path'.">;
+          "Connections are specified like 'listen://[host]:port' or "
+          "'accept:///path'.">;
 
 def launch_target: S<"launch-target">,
   MetaVarName<"<target>">,
diff --git a/lldb/tools/lldb-dap/src-ts/lldb-dap-server.ts b/lldb/tools/lldb-dap/src-ts/lldb-dap-server.ts
index f40dbf049a4b..79573ec7342b 100644
--- a/lldb/tools/lldb-dap/src-ts/lldb-dap-server.ts
+++ b/lldb/tools/lldb-dap/src-ts/lldb-dap-server.ts
@@ -26,7 +26,7 @@ export class LLDBDapServer implements vscode.Disposable {
     args: string[],
     options?: child_process.SpawnOptionsWithoutStdio,
   ): Promise<{ host: string; port: number } | undefined> {
-    const dapArgs = [...args, "--connection", "connect://localhost:0"];
+    const dapArgs = [...args, "--connection", "listen://localhost:0" ];
     if (!(await this.shouldContinueStartup(dapPath, dapArgs))) {
       return undefined;
     }
diff --git a/lldb/tools/lldb-dap/tool/lldb-dap.cpp b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
index 9b9de5e21a74..2799d10ae84b 100644
--- a/lldb/tools/lldb-dap/tool/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
@@ -127,7 +127,7 @@ EXAMPLES:
   parent over stdio. Passing a --connection URI will cause lldb-dap to listen
   for a connection in the specified mode.
 
-    lldb-dap --connection connection://localhost:<port>
+    lldb-dap --connection listen://localhost:<port>
 
   Passing --wait-for-debugger will pause the process at startup and wait for a
   debugger to attach to the process.
@@ -226,23 +226,32 @@ static llvm::Expected<std::pair<Socket::SocketProtocol, std::string>>
 validateConnection(llvm::StringRef conn) {
   auto uri = lldb_private::URI::Parse(conn);
 
-  if (uri && (uri->scheme == "tcp" || uri->scheme == "connect" ||
-              !uri->hostname.empty() || uri->port)) {
+  auto make_error = [conn]() -> llvm::Error {
+    return llvm::createStringError(
+        "Unsupported connection specifier, expected 'accept:///path' or "
+        "'listen://[host]:port', got '%s'.",
+        conn.str().c_str());
+  };
+
+  if (!uri)
+    return make_error();
+
+  std::optional<Socket::ProtocolModePair> protocol_and_mode =
+      Socket::GetProtocolAndMode(uri->scheme);
+  if (!protocol_and_mode || protocol_and_mode->second != Socket::ModeAccept)
+    return make_error();
+
+  if (protocol_and_mode->first == Socket::ProtocolTcp) {
     return std::make_pair(
         Socket::ProtocolTcp,
         formatv("[{0}]:{1}", uri->hostname.empty() ? "0.0.0.0" : uri->hostname,
                 uri->port.value_or(0)));
   }
 
-  if (uri && (uri->scheme == "unix" || uri->scheme == "unix-connect" ||
-              uri->path != "/")) {
+  if (protocol_and_mode->first == Socket::ProtocolUnixDomain)
     return std::make_pair(Socket::ProtocolUnixDomain, uri->path.str());
-  }
 
-  return llvm::createStringError(
-      "Unsupported connection specifier, expected 'unix-connect:///path' or "
-      "'connect://[host]:port', got '%s'.",
-      conn.str().c_str());
+  return make_error();
 }
 
 static llvm::Error

From 50c5ecd35402dc734f2a462df5532e77a5ce12b2 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Fri, 20 Jun 2025 10:33:32 +0800
Subject: [PATCH 0995/1322] [NFC] [Serialization] Remove a useless comment

The comments belongs to other WIP patches.
---
 clang/include/clang/Serialization/ASTReader.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 866986dcbf76..be1c6e081759 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -379,8 +379,7 @@ struct VisibleLookupBlockOffsets {
   uint64_t TULocalOffset = 0;
 
   operator bool() const {
-    return VisibleOffset || ModuleLocalOffset || // ModuleUnitLocalOffset ||
-           TULocalOffset;
+    return VisibleOffset || ModuleLocalOffset || TULocalOffset;
   }
 };
 

From bb51c5d4b85a655e8c90247a6678e507402703a4 Mon Sep 17 00:00:00 2001
From: Ming Yan <99472920+NexMing@users.noreply.github.com>
Date: Fri, 20 Jun 2025 10:58:02 +0800
Subject: [PATCH 0996/1322] [SDPatternMatch] Add m_Poison matcher (#144860)

Add SDPatternMatch matcher and unit test coverage for ISD::POISON opcode

e.g.
```
m_InsertElt(m_Poison(), m_Value(), m_Zero())
```
---
 llvm/include/llvm/CodeGen/SDPatternMatch.h              | 2 ++
 llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index d413227c4d96..9eb6dd45f912 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -142,6 +142,8 @@ inline Opcode_match m_Opc(unsigned Opcode) { return Opcode_match(Opcode); }
 
 inline Opcode_match m_Undef() { return Opcode_match(ISD::UNDEF); }
 
+inline Opcode_match m_Poison() { return Opcode_match(ISD::POISON); }
+
 template <unsigned NumUses, typename Pattern> struct NUses_match {
   Pattern P;
 
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 2162588aadfd..2b1fa75a1475 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -493,6 +493,11 @@ TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
   SDValue UndefVInt32VT = DAG->getUNDEF(VInt32VT);
   EXPECT_TRUE(sd_match(UndefInt32VT, m_Undef()));
   EXPECT_TRUE(sd_match(UndefVInt32VT, m_Undef()));
+
+  SDValue PoisonInt32VT = DAG->getPOISON(Int32VT);
+  SDValue PoisonVInt32VT = DAG->getPOISON(VInt32VT);
+  EXPECT_TRUE(sd_match(PoisonInt32VT, m_Poison()));
+  EXPECT_TRUE(sd_match(PoisonVInt32VT, m_Poison()));
 }
 
 TEST_F(SelectionDAGPatternMatchTest, patternCombinators) {

From 64fe3236476a9a85977abf5489414bbb2de2109c Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 19 Jun 2025 21:31:26 -0700
Subject: [PATCH 0997/1322] [llvm] Migrate away from ArrayRef(std::nullopt)
 (NFC) (#144967)

ArrayRef has a constructor that accepts std::nullopt.  This
constructor dates back to the days when we still had llvm::Optional.

Since the use of std::nullopt outside the context of std::optional is
kind of abuse and not intuitive to new comers, I would like to move
away from the constructor and eventually remove it.

This patch takes care of the llvm side of the migration.
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h        |  4 ++--
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 +++++-----
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  |  3 +--
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 3b87978fe3fa..90a75c3d352e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2376,8 +2376,8 @@ public:
                                           CostKind, 1, nullptr, nullptr);
       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
                                           CostKind, 0, nullptr, nullptr);
-      Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt,
-                                      CostKind, 0, nullptr);
+      Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, {}, CostKind,
+                                      0, nullptr);
       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
                                           CmpInst::ICMP_EQ, CostKind);
       Cost +=
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4551a365a696..5eef2497cf90 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5994,7 +5994,7 @@ static bool isMaskedLoadCompress(
       InstructionCost InterleavedCost =
           VectorGEPCost + TTI.getInterleavedMemoryOpCost(
                               Instruction::Load, AlignedLoadVecTy,
-                              CompressMask[1], std::nullopt, CommonAlignment,
+                              CompressMask[1], {}, CommonAlignment,
                               LI->getPointerAddressSpace(), CostKind, IsMasked);
       if (InterleavedCost < GatherCost) {
         InterleaveFactor = CompressMask[1];
@@ -13561,7 +13561,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       case TreeEntry::Vectorize:
         if (unsigned Factor = E->getInterleaveFactor()) {
           VecLdCost = TTI->getInterleavedMemoryOpCost(
-              Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
+              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
               LI0->getPointerAddressSpace(), CostKind);
 
         } else {
@@ -13602,7 +13602,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Align CommonAlignment = LI0->getAlign();
         if (InterleaveFactor) {
           VecLdCost = TTI->getInterleavedMemoryOpCost(
-              Instruction::Load, LoadVecTy, InterleaveFactor, std::nullopt,
+              Instruction::Load, LoadVecTy, InterleaveFactor, {},
               CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
         } else if (IsMasked) {
           VecLdCost = TTI->getMaskedMemoryOpCost(
@@ -13677,8 +13677,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                  "No reused shuffles expected");
           CommonCost = 0;
           VecStCost = TTI->getInterleavedMemoryOpCost(
-              Instruction::Store, VecTy, Factor, std::nullopt,
-              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
+              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
+              BaseSI->getPointerAddressSpace(), CostKind);
         } else {
           TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
           VecStCost = TTI->getMemoryOpCost(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 22861eb1c7df..f45ce46763c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3478,8 +3478,7 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
 
   return Cost + IG->getNumMembers() *
                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                           VectorTy, std::nullopt, Ctx.CostKind,
-                                           0);
+                                           VectorTy, {}, Ctx.CostKind, 0);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

From 69974658f079cec82a9fc13dd4993ab1e072c811 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Thu, 19 Jun 2025 22:52:51 -0700
Subject: [PATCH 0998/1322] [AMDGPU] Initial support for gfx1250 target.
 (#144965)

This is just a stub for now.
---
 clang/include/clang/Basic/OffloadArch.h       |  1 +
 clang/lib/Basic/OffloadArch.cpp               |  1 +
 clang/lib/Basic/Targets/NVPTX.cpp             |  1 +
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |  1 +
 clang/test/CodeGenOpenCL/amdgpu-features.cl   |  2 +
 clang/test/Driver/amdgpu-macros.cl            | 11 +++-
 clang/test/Driver/amdgpu-mcpu.cl              |  2 +
 .../Misc/target-invalid-cpu-note/amdgcn.c     |  1 +
 .../test/Misc/target-invalid-cpu-note/nvptx.c |  1 +
 llvm/docs/AMDGPUUsage.rst                     |  9 +++-
 llvm/include/llvm/BinaryFormat/ELF.h          |  2 +-
 llvm/include/llvm/TargetParser/TargetParser.h |  3 +-
 llvm/lib/Object/ELFObjectFile.cpp             |  2 +
 llvm/lib/ObjectYAML/ELFYAML.cpp               |  1 +
 llvm/lib/Target/AMDGPU/AMDGPU.td              | 53 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/GCNProcessors.td       |  4 ++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  1 +
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp     |  2 +
 llvm/lib/TargetParser/TargetParser.cpp        | 30 +++++++++++
 .../CodeGen/AMDGPU/directive-amdgcn-target.ll |  2 +
 .../CodeGen/AMDGPU/elf-header-flags-mach.ll   |  2 +
 .../Object/AMDGPU/elf-header-flags-mach.yaml  |  7 +++
 .../llvm-objdump/ELF/AMDGPU/subtarget.ll      |  5 ++
 .../llvm-readobj/ELF/AMDGPU/elf-headers.test  |  9 ++++
 llvm/tools/llvm-readobj/ELFDumper.cpp         |  1 +
 25 files changed, 149 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Basic/OffloadArch.h b/clang/include/clang/Basic/OffloadArch.h
index 99b1024b9d0d..4dda3ec2216f 100644
--- a/clang/include/clang/Basic/OffloadArch.h
+++ b/clang/include/clang/Basic/OffloadArch.h
@@ -98,6 +98,7 @@ enum class OffloadArch {
   GFX12_GENERIC,
   GFX1200,
   GFX1201,
+  GFX1250,
   AMDGCNSPIRV,
   Generic, // A processor model named 'generic' if the target backend defines a
            // public one.
diff --git a/clang/lib/Basic/OffloadArch.cpp b/clang/lib/Basic/OffloadArch.cpp
index a019f0ac18c8..dce9ffaedb90 100644
--- a/clang/lib/Basic/OffloadArch.cpp
+++ b/clang/lib/Basic/OffloadArch.cpp
@@ -86,6 +86,7 @@ static const OffloadArchToStringMap ArchNames[] = {
     {OffloadArch::GFX12_GENERIC, "gfx12-generic", "compute_amdgcn"},
     GFX(1200), // gfx1200
     GFX(1201), // gfx1201
+    GFX(1250), // gfx1250
     {OffloadArch::AMDGCNSPIRV, "amdgcnspirv", "compute_amdgcn"},
     // Intel CPUs
     {OffloadArch::GRANITERAPIDS, "graniterapids", ""},
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 3235bf2e710d..54b39fd072a8 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -238,6 +238,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case OffloadArch::GFX12_GENERIC:
       case OffloadArch::GFX1200:
       case OffloadArch::GFX1201:
+      case OffloadArch::GFX1250:
       case OffloadArch::AMDGCNSPIRV:
       case OffloadArch::Generic:
       case OffloadArch::GRANITERAPIDS:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 9e27e634676d..2bc9cd549f01 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2331,6 +2331,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
       case OffloadArch::GFX12_GENERIC:
       case OffloadArch::GFX1200:
       case OffloadArch::GFX1201:
+      case OffloadArch::GFX1250:
       case OffloadArch::AMDGCNSPIRV:
       case OffloadArch::Generic:
       case OffloadArch::GRANITERAPIDS:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index b94e1e76c7a2..730ed47f0b0c 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -52,6 +52,7 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1153 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1153 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1200 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1201 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1250 %s
 
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103-W64 %s
 
@@ -107,6 +108,7 @@
 // GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+wavefrontsize32"
 
 // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
 
diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl
index 35dc190761ca..a60593f2ab9e 100644
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -130,6 +130,7 @@
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1153 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1153 -DFAMILY=GFX11
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1200 -DFAMILY=GFX12
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1201 -DFAMILY=GFX12
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1250 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1250 -DFAMILY=GFX12
 
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx9_generic -DFAMILY=GFX9
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx9-4-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx9_4_generic -DFAMILY=GFX9
@@ -177,13 +178,19 @@
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mcumode \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 -mno-cumode \
-// RUN:   %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// RUN:   %s 2>&1 | FileCheck -DMCPU=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1030 \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1030 -mcumode \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1030 -mno-cumode \
 // RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
-// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1250 \
+// RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1250 -mcumode \
+// RUN:   %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1250 -mno-cumode \
+// RUN:   %s 2>&1 | FileCheck -DMCPU=gfx1250 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[MCPU]]' [-Woption-ignored]
 // CUMODE-ON-DAG: #define __AMDGCN_CUMODE__ 1
 // CUMODE-OFF-DAG: #define __AMDGCN_CUMODE__ 0
diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl
index ad5fd8ebaa6a..6d302e4c59ad 100644
--- a/clang/test/Driver/amdgpu-mcpu.cl
+++ b/clang/test/Driver/amdgpu-mcpu.cl
@@ -115,6 +115,7 @@
 // RUN: %clang -### -target amdgcn -mcpu=gfx1153 %s 2>&1 | FileCheck --check-prefix=GFX1153 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX1200 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefix=GFX1201 %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx1250 %s 2>&1 | FileCheck --check-prefix=GFX1250 %s
 
 // RUN: %clang -### -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefix=GFX9_GENERIC %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx9-4-generic %s 2>&1 | FileCheck --check-prefix=GFX9_4_GENERIC %s
@@ -169,6 +170,7 @@
 // GFX1153:   "-target-cpu" "gfx1153"
 // GFX1200:   "-target-cpu" "gfx1200"
 // GFX1201:   "-target-cpu" "gfx1201"
+// GFX1250:   "-target-cpu" "gfx1250"
 
 // GFX9_GENERIC:      "-target-cpu" "gfx9-generic"
 // GFX9_4_GENERIC:    "-target-cpu" "gfx9-4-generic"
diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
index 9ef44b2bb403..352658b6fb38 100644
--- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
+++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
@@ -68,6 +68,7 @@
 // CHECK-SAME: {{^}}, gfx1153
 // CHECK-SAME: {{^}}, gfx1200
 // CHECK-SAME: {{^}}, gfx1201
+// CHECK-SAME: {{^}}, gfx1250
 // CHECK-SAME: {{^}}, gfx9-generic
 // CHECK-SAME: {{^}}, gfx10-1-generic
 // CHECK-SAME: {{^}}, gfx10-3-generic
diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
index 6675a1ecc34b..b5209ffc5f0a 100644
--- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c
+++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
@@ -83,5 +83,6 @@
 // CHECK-SAME: {{^}}, gfx12-generic
 // CHECK-SAME: {{^}}, gfx1200
 // CHECK-SAME: {{^}}, gfx1201
+// CHECK-SAME: {{^}}, gfx1250
 // CHECK-SAME: {{^}}, amdgcnspirv
 // CHECK-SAME: {{$}}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index ed3e4c8513e2..2fb4f5389fc7 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -531,6 +531,13 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
                                                                         work-item                       Add product
                                                                         IDs                             names.
 
+     ``gfx1250``                 ``amdgcn``   APU                     - Architected                   *TBA*
+                                                                        flat
+                                                                        scratch                       .. TODO::
+                                                                      - Packed
+                                                                        work-item                       Add product
+                                                                        IDs                             names.
+
      =========== =============== ============ ===== ================= =============== =============== ======================
 
 Generic processors allow execution of a single code object on any of the processors that
@@ -2265,7 +2272,7 @@ The AMDGPU backend uses the following ELF header:
      ``EF_AMDGPU_MACH_AMDGCN_GFX1101``          0x046      ``gfx1101``
      ``EF_AMDGPU_MACH_AMDGCN_GFX1102``          0x047      ``gfx1102``
      ``EF_AMDGPU_MACH_AMDGCN_GFX1200``          0x048      ``gfx1200``
-     *reserved*                                 0x049      Reserved.
+     ``EF_AMDGPU_MACH_AMDGCN_GFX1250``          0x049      ``gfx1250``
      ``EF_AMDGPU_MACH_AMDGCN_GFX1151``          0x04a      ``gfx1151``
      *reserved*                                 0x04b      Reserved.
      ``EF_AMDGPU_MACH_AMDGCN_GFX942``           0x04c      ``gfx942``
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index dcf1f1c6d7b2..f5f236cf9806 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -835,7 +835,7 @@ enum : unsigned {
   EF_AMDGPU_MACH_AMDGCN_GFX1101         = 0x046,
   EF_AMDGPU_MACH_AMDGCN_GFX1102         = 0x047,
   EF_AMDGPU_MACH_AMDGCN_GFX1200         = 0x048,
-  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49   = 0x049,
+  EF_AMDGPU_MACH_AMDGCN_GFX1250         = 0x049,
   EF_AMDGPU_MACH_AMDGCN_GFX1151         = 0x04a,
   EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4B   = 0x04b,
   EF_AMDGPU_MACH_AMDGCN_GFX942          = 0x04c,
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index b4a92cc6b6c4..2ea7829d668a 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -112,9 +112,10 @@ enum GPUKind : uint32_t {
 
   GK_GFX1200 = 100,
   GK_GFX1201 = 101,
+  GK_GFX1250 = 102,
 
   GK_AMDGCN_FIRST = GK_GFX600,
-  GK_AMDGCN_LAST = GK_GFX1201,
+  GK_AMDGCN_LAST = GK_GFX1250,
 
   GK_GFX9_GENERIC = 192,
   GK_GFX10_1_GENERIC = 193,
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index dd240550cba0..cce5c74aa4d1 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -597,6 +597,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
     return "gfx1200";
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201:
     return "gfx1201";
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1250:
+    return "gfx1250";
 
   // Generic AMDGCN targets
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC:
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 520e956fdab9..fc485b6656e0 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -632,6 +632,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1153, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1200, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1201, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1250, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, EF_AMDGPU_MACH);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a9b39eebbcdc..559328459141 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -487,6 +487,12 @@ def FeatureGFX12Insts : SubtargetFeature<"gfx12-insts",
   "Additional instructions for GFX12+"
 >;
 
+def FeatureGFX1250Insts : SubtargetFeature<"gfx1250-insts",
+  "GFX1250Insts",
+  "true",
+  "Additional instructions for GFX1250+"
+>;
+
 def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts",
   "GFX10_3Insts",
   "true",
@@ -1882,6 +1888,53 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureBVHDualAndBVH8Insts
    ]>;
 
+def FeatureISAVersion12_50 : FeatureSet<
+  [FeatureGFX12,
+   FeatureGFX1250Insts,
+   FeatureCuMode,
+   FeatureLDSBankCount32,
+   FeatureDLInsts,
+   FeatureFmacF64Inst,
+   FeaturePackedFP32Ops,
+   FeatureDot7Insts,
+   FeatureDot8Insts,
+   FeatureWavefrontSize32,
+   FeatureShaderCyclesHiLoRegisters,
+   FeatureArchitectedFlatScratch,
+   FeatureArchitectedSGPRs,
+   FeatureAtomicFaddRtnInsts,
+   FeatureAtomicFaddNoRtnInsts,
+   FeatureAtomicDsPkAdd16Insts,
+   FeatureAtomicFlatPkAdd16Insts,
+   FeatureAtomicBufferGlobalPkAddF16Insts,
+   FeatureAtomicGlobalPkAddBF16Inst,
+   FeatureAtomicBufferPkAddBF16Inst,
+   FeatureFlatAtomicFaddF32Inst,
+   FeatureFP8ConversionInsts,
+   FeaturePackedTID,
+   FeatureVcmpxPermlaneHazard,
+   FeatureSALUFloatInsts,
+   FeaturePseudoScalarTrans,
+   FeatureHasRestrictedSOffset,
+   FeatureScalarDwordx3Loads,
+   FeatureDPPSrc1SGPR,
+   FeatureBitOp3Insts,
+   FeatureBF16ConversionInsts,
+   FeatureCvtPkF16F32Inst,
+   FeatureMinimum3Maximum3PKF16,
+   FeaturePrngInst,
+   FeaturePermlane16Swap,
+   FeatureAshrPkInsts,
+   FeatureSupportsSRAMECC,
+   FeatureMaxHardClauseLength63,
+   FeatureAtomicFMinFMaxF64GlobalInsts,
+   FeatureAtomicFMinFMaxF64FlatInsts,
+   FeatureFlatBufferGlobalAtomicFaddF64Inst,
+   FeatureMemoryAtomicFAddF32DenormalSupport,
+   FeatureKernargPreload,
+   FeatureLshlAddU64Inst,
+]>;
+
 def FeatureISAVersion12_Generic: FeatureSet<
   !listconcat(FeatureISAVersion12.Features,
     [FeatureRequiresCOV6])>;
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 0b372e29efe6..0b331bd3f3fb 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -325,3 +325,7 @@ def : ProcessorModel<"gfx1201", GFX12SpeedModel,
 def : ProcessorModel<"gfx12-generic", GFX12SpeedModel,
   FeatureISAVersion12_Generic.Features
 >;
+
+def : ProcessorModel<"gfx1250", GFX12SpeedModel,
+  FeatureISAVersion12_50.Features
+>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 202e5b38f0a4..dd57cc96e41c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -113,6 +113,7 @@ protected:
   bool GFX10Insts = false;
   bool GFX11Insts = false;
   bool GFX12Insts = false;
+  bool GFX1250Insts = false;
   bool GFX10_3Insts = false;
   bool GFX7GFX8GFX9Insts = false;
   bool SGPRInitBug = false;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 6d69bb75f293..10f6d3382368 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -117,6 +117,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1153: AK = GK_GFX1153; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1250: AK = GK_GFX1250; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC:     AK = GK_GFX9_GENERIC; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC:   AK = GK_GFX9_4_GENERIC; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC:  AK = GK_GFX10_1_GENERIC; break;
@@ -202,6 +203,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
   case GK_GFX1153: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1153;
   case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200;
   case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201;
+  case GK_GFX1250: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1250;
   case GK_GFX9_GENERIC:     return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC;
   case GK_GFX9_4_GENERIC:   return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC;
   case GK_GFX10_1_GENERIC:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC;
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 90791dfb7b7c..49442c30eb44 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -174,6 +174,7 @@ constexpr GPUInfo AMDGCNGPUs[] = {
     {{"gfx1153"},   {"gfx1153"}, GK_GFX1153, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
     {{"gfx1200"},   {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
     {{"gfx1201"},   {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
+    {{"gfx1250"},   {"gfx1250"}, GK_GFX1250, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
 
     {{"gfx9-generic"},      {"gfx9-generic"},    GK_GFX9_GENERIC,    FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
     {{"gfx10-1-generic"},   {"gfx10-1-generic"}, GK_GFX10_1_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
@@ -328,6 +329,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
   case GK_GFX1153: return {11, 5, 3};
   case GK_GFX1200: return {12, 0, 0};
   case GK_GFX1201: return {12, 0, 1};
+  case GK_GFX1250: return {12, 5, 0};
 
   // Generic targets return the lowest common denominator
   // within their family. That is, the ISA that is the most
@@ -425,6 +427,33 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
   } else if (T.isAMDGCN()) {
     AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU);
     switch (Kind) {
+    case GK_GFX1250:
+      Features["ci-insts"] = true;
+      Features["dot7-insts"] = true;
+      Features["dot8-insts"] = true;
+      Features["dl-insts"] = true;
+      Features["16-bit-insts"] = true;
+      Features["dpp"] = true;
+      Features["gfx8-insts"] = true;
+      Features["gfx9-insts"] = true;
+      Features["gfx10-insts"] = true;
+      Features["gfx10-3-insts"] = true;
+      Features["gfx11-insts"] = true;
+      Features["gfx12-insts"] = true;
+      Features["gfx1250-insts"] = true;
+      Features["bitop3-insts"] = true;
+      Features["prng-inst"] = true;
+      Features["fp8-conversion-insts"] = true;
+      Features["permlane16-swap"] = true;
+      Features["ashr-pk-insts"] = true;
+      Features["atomic-buffer-pk-add-bf16-inst"] = true;
+      Features["atomic-fadd-rtn-insts"] = true;
+      Features["atomic-buffer-global-pk-add-f16-insts"] = true;
+      Features["atomic-flat-pk-add-16-insts"] = true;
+      Features["atomic-global-pk-add-bf16-inst"] = true;
+      Features["atomic-ds-pk-add-16-insts"] = true;
+      Features["setprio-inc-wg-inst"] = true;
+      break;
     case GK_GFX1201:
     case GK_GFX1200:
     case GK_GFX12_GENERIC:
@@ -678,6 +707,7 @@ static bool isWave32Capable(StringRef GPU, const Triple &T) {
   // XXX - What does the member GPU mean if device name string passed here?
   if (T.isAMDGCN()) {
     switch (parseArchAMDGCN(GPU)) {
+    case GK_GFX1250:
     case GK_GFX1201:
     case GK_GFX1200:
     case GK_GFX1153:
diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
index c954e1fe124e..9940cc53e612 100644
--- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
@@ -106,6 +106,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1153 < %s | FileCheck --check-prefixes=GFX1153 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX1200 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 < %s | FileCheck --check-prefixes=GFX1201 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
 
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_NOXNACK %s
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_XNACK %s
@@ -203,6 +204,7 @@
 ; GFX1153: .amdgcn_target "amdgcn-amd-amdhsa--gfx1153"
 ; GFX1200: .amdgcn_target "amdgcn-amd-amdhsa--gfx1200"
 ; GFX1201: .amdgcn_target "amdgcn-amd-amdhsa--gfx1201"
+; GFX1250: .amdgcn_target "amdgcn-amd-amdhsa--gfx1250"
 
 ; GFX9_GENERIC_NOXNACK:     .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack-"
 ; GFX9_GENERIC_XNACK:       .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack+"
diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
index 65039b471694..32cacf0d6dd9 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -77,6 +77,7 @@
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1153 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1153 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1200 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1200 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1201 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1201 %s
+; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1250 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1250 %s
 
 ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx9-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX9_GENERIC %s
 ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx9-4-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX9_4_GENERIC %s
@@ -158,6 +159,7 @@
 ; GFX1153:       EF_AMDGPU_MACH_AMDGCN_GFX1153 (0x58)
 ; GFX1200:       EF_AMDGPU_MACH_AMDGCN_GFX1200 (0x48)
 ; GFX1201:       EF_AMDGPU_MACH_AMDGCN_GFX1201 (0x4E)
+; GFX1250:       EF_AMDGPU_MACH_AMDGCN_GFX1250 (0x49)
 
 ; GFX9_GENERIC:       EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51)
 ; GFX9_4_GENERIC:     EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC (0x5F)
diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
index 8d5ce006e24d..51590819f8b9 100644
--- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
+++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
@@ -242,6 +242,10 @@
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1201 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1201 %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX1201 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1201 %s
 
+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX1250/' %s | yaml2obj -o %t.o.AMDGCN_GFX1250
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1250 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1250 %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX1250 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1250 %s
+
 # RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX9_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX9_GENERIC
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX9_GENERIC %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX9_GENERIC %s
@@ -467,6 +471,9 @@
 # ELF-AMDGCN-GFX1201:   EF_AMDGPU_MACH_AMDGCN_GFX1201 (0x4E)
 # YAML-AMDGCN-GFX1201:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1201 ]
 
+# ELF-AMDGCN-GFX1250:   EF_AMDGPU_MACH_AMDGCN_GFX1250 (0x49)
+# YAML-AMDGCN-GFX1250:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1250 ]
+
 # ELF-AMDGCN-GFX9_GENERIC:   EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51)
 # YAML-AMDGCN-GFX9_GENERIC:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC ]
 
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
index 0e392e228715..155ced379b0c 100644
--- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
@@ -21,6 +21,11 @@ define amdgpu_kernel void @test_kernel() {
 ; RUN: llvm-objdump -D %t.o > %t-detect.txt
 ; RUN: diff %t-specify.txt %t-detect.txt
 
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -filetype=obj -O0 -o %t.o %s
+; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx1250 %t.o > %t-specify.txt
+; RUN: llvm-objdump -D %t.o > %t-detect.txt
+; RUN: diff %t-specify.txt %t-detect.txt
+
 ; ----------------------------------GFX11--------------------------------------
 ;
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -filetype=obj -O0 -o %t.o %s
diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
index dd9eaaeabd8c..8edb887ab005 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
+++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
@@ -454,6 +454,15 @@
 # RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC
 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC -DFLAG_VALUE=0x59
 
+# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1250
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1250 -DFLAG_VALUE=0x49
+
+# RUN: yaml2obj %s -o %t -DABI_VERSION=1 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1250
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=1 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1250 -DFLAG_VALUE=0x49
+
+# RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1250
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1250 -DFLAG_VALUE=0x49
+
 --- !ELF
 FileHeader:
   Class:           ELFCLASS64
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index abaf6077ba9e..7be61dcce841 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -1640,6 +1640,7 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1153, "gfx1153"),                          \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1200, "gfx1200"),                          \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1201, "gfx1201"),                          \
+  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1250, "gfx1250"),                          \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, "gfx9-generic"),                \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, "gfx9-4-generic"),            \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, "gfx10-1-generic"),          \

From b8d3efa189620bfd48dab5fb05b923560fb1e2d5 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks@fb.com>
Date: Thu, 19 Jun 2025 23:09:34 -0700
Subject: [PATCH 0999/1322] [BOLT][Linux] Fix linux_banner lookup (#144962)

While detecting the Linux kernel version, look for `linux_banner` symbol
with local visibility if the global one was not found.

Fixes #144847
---
 bolt/lib/Rewrite/LinuxKernelRewriter.cpp | 40 ++++++++++++++----------
 bolt/test/X86/linux-version.S            | 11 +++++++
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index 5a5e044184d0..174721a3a053 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -432,25 +432,33 @@ public:
 };
 
 Error LinuxKernelRewriter::detectLinuxKernelVersion() {
-  if (BinaryData *BD = BC.getBinaryDataByName("linux_banner")) {
-    const BinarySection &Section = BD->getSection();
-    const std::string S =
-        Section.getContents().substr(BD->getOffset(), BD->getSize()).str();
+  // Check for global and local linux_banner symbol.
+  BinaryData *BD = BC.getBinaryDataByName("linux_banner");
+  if (!BD)
+    BD = BC.getBinaryDataByName("linux_banner/1");
 
-    const std::regex Re(R"---(Linux version ((\d+)\.(\d+)(\.(\d+))?))---");
-    std::smatch Match;
-    if (std::regex_search(S, Match, Re)) {
-      const unsigned Major = std::stoi(Match[2].str());
-      const unsigned Minor = std::stoi(Match[3].str());
-      const unsigned Rev = Match[5].matched ? std::stoi(Match[5].str()) : 0;
-      LinuxKernelVersion = LKVersion(Major, Minor, Rev);
-      BC.outs() << "BOLT-INFO: Linux kernel version is " << Match[1].str()
-                << "\n";
-      return Error::success();
-    }
+  if (!BD)
+    return createStringError(errc::executable_format_error,
+                             "unable to locate linux_banner");
+
+  const BinarySection &Section = BD->getSection();
+  const std::string S =
+      Section.getContents().substr(BD->getOffset(), BD->getSize()).str();
+
+  const std::regex Re(R"---(Linux version ((\d+)\.(\d+)(\.(\d+))?))---");
+  std::smatch Match;
+  if (std::regex_search(S, Match, Re)) {
+    const unsigned Major = std::stoi(Match[2].str());
+    const unsigned Minor = std::stoi(Match[3].str());
+    const unsigned Rev = Match[5].matched ? std::stoi(Match[5].str()) : 0;
+    LinuxKernelVersion = LKVersion(Major, Minor, Rev);
+    BC.outs() << "BOLT-INFO: Linux kernel version is " << Match[1].str()
+              << "\n";
+    return Error::success();
   }
+
   return createStringError(errc::executable_format_error,
-                           "Linux kernel version is unknown");
+                           "Linux kernel version is unknown: " + S);
 }
 
 void LinuxKernelRewriter::processLKSections() {
diff --git a/bolt/test/X86/linux-version.S b/bolt/test/X86/linux-version.S
index e680d0d64a21..a3d7f365304a 100644
--- a/bolt/test/X86/linux-version.S
+++ b/bolt/test/X86/linux-version.S
@@ -17,6 +17,11 @@
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr
 # RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-C %s
 
+# RUN: %clang -DD -target x86_64-unknown-unknown \
+# RUN:   %cflags -nostdlib %s -o %t.exe \
+# RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr
+# RUN: llvm-bolt %t.exe -o %t.out 2>&1 | FileCheck --check-prefix=CHECK-D %s
+
   .text
   .globl foo
   .type foo, %function
@@ -46,6 +51,12 @@ linux_banner:
 #endif
 # CHECK-C: BOLT-INFO: Linux kernel version is 6.6
 
+#ifdef D
+  .hidden linux_banner
+  .string  "Linux version 6.6.15.2-2-xxx\n"
+#endif
+# CHECK-D: BOLT-INFO: Linux kernel version is 6.6
+
   .size  linux_banner, . - linux_banner
 
 ## Fake Linux Kernel sections.

From 7cbb1411550ef6caab18a9360f1549d6029ffe86 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 19 Jun 2025 23:29:50 -0700
Subject: [PATCH 1000/1322] [clang] Migrate away from ArrayRef(std::nullopt)
 (NFC) (#144982)

ArrayRef has a constructor that accepts std::nullopt.  This
constructor dates back to the days when we still had llvm::Optional.

Since the use of std::nullopt outside the context of std::optional is
kind of abuse and not intuitive to new comers, I would like to move
away from the constructor and eventually remove it.

This patch takes care of the clang side of the migration.
---
 clang/include/clang/AST/TypeProperties.td | 2 +-
 clang/lib/AST/ASTContext.cpp              | 4 ++--
 clang/lib/AST/ASTDiagnostic.cpp           | 4 ++--
 clang/lib/AST/ASTImporter.cpp             | 2 +-
 clang/lib/AST/DeclTemplate.cpp            | 2 +-
 clang/lib/AST/QualTypeNames.cpp           | 5 ++---
 clang/lib/Basic/Targets/Xtensa.h          | 2 +-
 clang/lib/CodeGen/CGBuiltin.cpp           | 2 +-
 clang/lib/Driver/ToolChains/Clang.cpp     | 4 ++--
 clang/lib/Sema/SemaConcept.cpp            | 4 ++--
 clang/lib/Sema/SemaExpr.cpp               | 2 +-
 clang/lib/Sema/TreeTransform.h            | 2 +-
 12 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td
index 6e44bce893e7..d7dbf1b43df2 100644
--- a/clang/include/clang/AST/TypeProperties.td
+++ b/clang/include/clang/AST/TypeProperties.td
@@ -753,7 +753,7 @@ let Class = TemplateSpecializationType in {
   }
 
   def : Creator<[{
-    return ctx.getTemplateSpecializationType(templateName, args, std::nullopt, UnderlyingType);
+    return ctx.getTemplateSpecializationType(templateName, args, {}, UnderlyingType);
   }]>;
 }
 
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 189e67e4eed0..74be2871f270 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -14283,7 +14283,7 @@ static QualType getCommonNonSugarTypeNode(ASTContext &Ctx, const Type *X,
         ::getCommonTemplateNameChecked(Ctx, TX->getTemplateName(),
                                        TY->getTemplateName(),
                                        /*IgnoreDeduced=*/true),
-        As, /*CanonicalArgs=*/std::nullopt, X->getCanonicalTypeInternal());
+        As, /*CanonicalArgs=*/{}, X->getCanonicalTypeInternal());
   }
   case Type::Decltype: {
     const auto *DX = cast<DecltypeType>(X);
@@ -14529,7 +14529,7 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X,
                                    TY->template_arguments()))
       return QualType();
     return Ctx.getTemplateSpecializationType(CTN, As,
-                                             /*CanonicalArgs=*/std::nullopt,
+                                             /*CanonicalArgs=*/{},
                                              Ctx.getQualifiedType(Underlying));
   }
   case Type::Typedef: {
diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp
index a00d5801f054..522abd5912db 100644
--- a/clang/lib/AST/ASTDiagnostic.cpp
+++ b/clang/lib/AST/ASTDiagnostic.cpp
@@ -130,7 +130,7 @@ QualType clang::desugarForDiagnostic(ASTContext &Context, QualType QT,
         if (DesugarArgument) {
           ShouldAKA = true;
           QT = Context.getTemplateSpecializationType(
-              TST->getTemplateName(), Args, /*CanonicalArgs=*/std::nullopt, QT);
+              TST->getTemplateName(), Args, /*CanonicalArgs=*/{}, QT);
         }
         break;
       }
@@ -1143,7 +1143,7 @@ class TemplateDiff {
 
     Ty = Context.getTemplateSpecializationType(
         TemplateName(CTSD->getSpecializedTemplate()),
-        CTSD->getTemplateArgs().asArray(), /*CanonicalArgs=*/std::nullopt,
+        CTSD->getTemplateArgs().asArray(), /*CanonicalArgs=*/{},
         Ty.getLocalUnqualifiedType().getCanonicalType());
 
     return Ty->getAs<TemplateSpecializationType>();
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 96a5e2eeaa4d..4621ebb854d8 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -1664,7 +1664,7 @@ ExpectedType ASTNodeImporter::VisitTemplateSpecializationType(
   if (!ToUnderlyingOrErr)
     return ToUnderlyingOrErr.takeError();
   return Importer.getToContext().getTemplateSpecializationType(
-      *ToTemplateOrErr, ToTemplateArgs, std::nullopt, *ToUnderlyingOrErr);
+      *ToTemplateOrErr, ToTemplateArgs, {}, *ToUnderlyingOrErr);
 }
 
 ExpectedType ASTNodeImporter::VisitElaboratedType(const ElaboratedType *T) {
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index e1ef2188dbdb..5035f2d33b0a 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -669,7 +669,7 @@ ClassTemplateDecl::getInjectedClassNameSpecialization() {
   CommonPtr->InjectedClassNameType =
       Context.getTemplateSpecializationType(Name,
                                             /*SpecifiedArgs=*/TemplateArgs,
-                                            /*CanonicalArgs=*/std::nullopt);
+                                            /*CanonicalArgs=*/{});
   return CommonPtr->InjectedClassNameType;
 }
 
diff --git a/clang/lib/AST/QualTypeNames.cpp b/clang/lib/AST/QualTypeNames.cpp
index 4d11a3b62331..39703d6d7b88 100644
--- a/clang/lib/AST/QualTypeNames.cpp
+++ b/clang/lib/AST/QualTypeNames.cpp
@@ -140,7 +140,7 @@ static const Type *getFullyQualifiedTemplateType(const ASTContext &Ctx,
     if (MightHaveChanged) {
       QualType QT = Ctx.getTemplateSpecializationType(
           TST->getTemplateName(), FQArgs,
-          /*CanonicalArgs=*/std::nullopt, TST->desugar());
+          /*CanonicalArgs=*/{}, TST->desugar());
       // getTemplateSpecializationType returns a fully qualified
       // version of the specialization itself, so no need to qualify
       // it.
@@ -172,8 +172,7 @@ static const Type *getFullyQualifiedTemplateType(const ASTContext &Ctx,
         TemplateName TN(TSTDecl->getSpecializedTemplate());
         QualType QT = Ctx.getTemplateSpecializationType(
             TN, FQArgs,
-            /*CanonicalArgs=*/std::nullopt,
-            TSTRecord->getCanonicalTypeInternal());
+            /*CanonicalArgs=*/{}, TSTRecord->getCanonicalTypeInternal());
         // getTemplateSpecializationType returns a fully qualified
         // version of the specialization itself, so no need to qualify
         // it.
diff --git a/clang/lib/Basic/Targets/Xtensa.h b/clang/lib/Basic/Targets/Xtensa.h
index 470835aacff5..f3558ac247be 100644
--- a/clang/lib/Basic/Targets/Xtensa.h
+++ b/clang/lib/Basic/Targets/Xtensa.h
@@ -77,7 +77,7 @@ public:
   }
 
   ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
-    return std::nullopt;
+    return {};
   }
 
   bool validateAsmConstraint(const char *&Name,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 1f6927435167..2c011a951986 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2032,7 +2032,7 @@ Value *CodeGenFunction::EmitCheckedArgForAssume(const Expr *E) {
       std::make_pair(ArgValue, CheckOrdinal), CheckHandler,
       {EmitCheckSourceLocation(E->getExprLoc()),
        llvm::ConstantInt::get(Builder.getInt8Ty(), BCK_AssumePassedFalse)},
-      std::nullopt);
+      {});
   return ArgValue;
 }
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index e910a2bedeeb..2bb42a319ecc 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -8953,7 +8953,7 @@ void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA,
   C.addCommand(std::make_unique<Command>(
       JA, *this, ResponseFileSupport::None(),
       TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())),
-      CmdArgs, std::nullopt, Output));
+      CmdArgs, ArrayRef<InputInfo>(), Output));
 }
 
 void OffloadBundler::ConstructJobMultipleOutputs(
@@ -9040,7 +9040,7 @@ void OffloadBundler::ConstructJobMultipleOutputs(
   C.addCommand(std::make_unique<Command>(
       JA, *this, ResponseFileSupport::None(),
       TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())),
-      CmdArgs, std::nullopt, Outputs));
+      CmdArgs, ArrayRef<InputInfo>(), Outputs));
 }
 
 void OffloadPackager::ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index a4f660c39b4b..1594b4423e4d 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -1081,10 +1081,10 @@ static bool CheckFunctionConstraintsWithoutInstantiation(
   // FIXME: Add TemplateArgs through the 'Innermost' parameter once
   // the refactoring of getTemplateInstantiationArgs() relands.
   MultiLevelTemplateArgumentList MLTAL;
-  MLTAL.addOuterTemplateArguments(Template, std::nullopt, /*Final=*/false);
+  MLTAL.addOuterTemplateArguments(Template, {}, /*Final=*/false);
   SemaRef.getTemplateInstantiationArgs(
       MLTAL, /*D=*/FD, FD,
-      /*Final=*/false, /*Innermost=*/std::nullopt, /*RelativeToPrimary=*/true,
+      /*Final=*/false, /*Innermost=*/{}, /*RelativeToPrimary=*/true,
       /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true);
   MLTAL.replaceInnermostTemplateArguments(Template, TemplateArgs);
 
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index ebc43157d4c2..fc2819458a4f 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -21212,7 +21212,7 @@ ExprResult Sema::CheckPlaceholderExpr(Expr *E) {
     }
     if (TST.isNull())
       TST = Context.getTemplateSpecializationType(
-          TN, ULE->template_arguments(), /*CanonicalArgs=*/std::nullopt,
+          TN, ULE->template_arguments(), /*CanonicalArgs=*/{},
           HasAnyDependentTA ? Context.DependentTy : Context.IntTy);
     QualType ET =
         Context.getElaboratedType(ElaboratedTypeKeyword::None, NNS, TST);
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 3e33fb73e01b..26bee7a96de2 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -16222,7 +16222,7 @@ TreeTransform<Derived>::TransformSizeOfPackExpr(SizeOfPackExpr *E) {
   return getDerived().RebuildSizeOfPackExpr(
       E->getOperatorLoc(), E->getPack(), E->getPackLoc(), E->getRParenLoc(),
       /*Length=*/static_cast<unsigned>(Args.size()),
-      /*PartialArgs=*/std::nullopt);
+      /*PartialArgs=*/{});
 }
 
 template <typename Derived>

From 0f302f38b0014a0018031ffb3cb898fdc7d90880 Mon Sep 17 00:00:00 2001
From: no92 <no92@users.noreply.github.com>
Date: Fri, 20 Jun 2025 08:40:20 +0200
Subject: [PATCH 1001/1322] [clang] Add managarm support (#144791)

This is a repost of the quickly reverted #139271. The failing buildbot
tests have been fixed and pass on my machine now.
---
 clang/lib/Basic/Targets.cpp                   |   9 +
 clang/lib/Basic/Targets/OSTargets.h           |  30 ++
 clang/lib/Driver/CMakeLists.txt               |   1 +
 clang/lib/Driver/Driver.cpp                   |   4 +
 clang/lib/Driver/ToolChains/Gnu.cpp           |   2 +
 clang/lib/Driver/ToolChains/Managarm.cpp      | 218 ++++++++++++++
 clang/lib/Driver/ToolChains/Managarm.h        |  55 ++++
 clang/lib/Lex/InitHeaderSearch.cpp            |   1 +
 .../lib/aarch64-managarm-mlibc/.keep          |   0
 .../lib/riscv64-managarm-mlibc/.keep          |   0
 .../lib/x86_64-managarm-mlibc/.keep           |   0
 .../lib64/aarch64-managarm-mlibc/.keep        |   0
 .../lib64/riscv64-managarm-mlibc/.keep        |   0
 .../lib64/x86_64-managarm-mlibc/.keep         |   0
 .../aarch64-managarm-mlibc/c++/10/.keep       |   0
 .../usr/include/c++/10/.keep                  |   0
 .../usr/include/c++/v1/.keep                  |   0
 .../riscv64-managarm-mlibc/c++/10/.keep       |   0
 .../x86_64-managarm-mlibc/c++/10/.keep        |   0
 .../usr/lib/aarch64-managarm-mlibc/.keep      |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbegin.o   |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginS.o  |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginT.o  |   0
 .../usr/lib/riscv64-managarm-mlibc/.keep      |   0
 .../usr/lib/x86_64-managarm-mlibc/.keep       |   0
 .../basic_managarm_tree/usr/lib64/.keep       |   0
 clang/test/Driver/managarm.cpp                | 267 ++++++++++++++++++
 clang/test/Preprocessor/init.c                |   5 +
 .../predefined-macros-no-warnings.c           |   3 +
 35 files changed, 595 insertions(+)
 create mode 100644 clang/lib/Driver/ToolChains/Managarm.cpp
 create mode 100644 clang/lib/Driver/ToolChains/Managarm.h
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
 create mode 100644 clang/test/Driver/managarm.cpp

diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index 9889141ad208..af1111a86330 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -164,6 +164,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
       }
+    case llvm::Triple::Managarm:
+      return std::make_unique<ManagarmTargetInfo<AArch64leTargetInfo>>(Triple,
+                                                                       Opts);
     case llvm::Triple::NetBSD:
       return std::make_unique<NetBSDTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
@@ -466,6 +469,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<RISCV64TargetInfo>>(Triple,
                                                                    Opts);
       }
+    case llvm::Triple::Managarm:
+      return std::make_unique<ManagarmTargetInfo<RISCV64TargetInfo>>(Triple,
+                                                                     Opts);
     default:
       return std::make_unique<RISCV64TargetInfo>(Triple, Opts);
     }
@@ -654,6 +660,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
       return std::make_unique<PS5OSTargetInfo<X86_64TargetInfo>>(Triple, Opts);
     case llvm::Triple::Hurd:
       return std::make_unique<HurdTargetInfo<X86_64TargetInfo>>(Triple, Opts);
+    case llvm::Triple::Managarm:
+      return std::make_unique<ManagarmTargetInfo<X86_64TargetInfo>>(Triple,
+                                                                    Opts);
     default:
       return std::make_unique<X86_64TargetInfo>(Triple, Opts);
     }
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
index d148b38d03c7..5dac699c2bb4 100644
--- a/clang/lib/Basic/Targets/OSTargets.h
+++ b/clang/lib/Basic/Targets/OSTargets.h
@@ -395,6 +395,36 @@ public:
   }
 };
 
+// Managarm Target
+template <typename Target>
+class LLVM_LIBRARY_VISIBILITY ManagarmTargetInfo : public OSTargetInfo<Target> {
+protected:
+  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
+                    MacroBuilder &Builder) const override {
+    DefineStd(Builder, "unix", Opts);
+    Builder.defineMacro("__managarm__");
+    if (Opts.POSIXThreads)
+      Builder.defineMacro("_REENTRANT");
+    if (Opts.CPlusPlus)
+      Builder.defineMacro("_GNU_SOURCE");
+    if (this->HasFloat128)
+      Builder.defineMacro("__FLOAT128__");
+  }
+
+public:
+  ManagarmTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
+    switch (Triple.getArch()) {
+    default:
+      break;
+    case llvm::Triple::x86:
+    case llvm::Triple::x86_64:
+      this->HasFloat128 = true;
+      break;
+    }
+  }
+};
+
 // NetBSD Target
 template <typename Target>
 class LLVM_LIBRARY_VISIBILITY NetBSDTargetInfo : public OSTargetInfo<Target> {
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 44e16edfb1cc..3cfd671e9d8f 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -66,6 +66,7 @@ add_clang_library(clangDriver
   ToolChains/HLSL.cpp
   ToolChains/Hurd.cpp
   ToolChains/Linux.cpp
+  ToolChains/Managarm.cpp
   ToolChains/MipsLinux.cpp
   ToolChains/MinGW.cpp
   ToolChains/MSP430.cpp
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 061d60c62903..2d055ffa17a8 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -33,6 +33,7 @@
 #include "ToolChains/Linux.h"
 #include "ToolChains/MSP430.h"
 #include "ToolChains/MSVC.h"
+#include "ToolChains/Managarm.h"
 #include "ToolChains/MinGW.h"
 #include "ToolChains/MipsLinux.h"
 #include "ToolChains/NaCl.h"
@@ -6850,6 +6851,9 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
     case llvm::Triple::Fuchsia:
       TC = std::make_unique<toolchains::Fuchsia>(*this, Target, Args);
       break;
+    case llvm::Triple::Managarm:
+      TC = std::make_unique<toolchains::Managarm>(*this, Target, Args);
+      break;
     case llvm::Triple::Solaris:
       TC = std::make_unique<toolchains::Solaris>(*this, Target, Args);
       break;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 9203bbc91b0b..afce4fffe1d5 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -226,6 +226,8 @@ static const char *getLDMOption(const llvm::Triple &T, const ArgList &Args) {
       return "elf_iamcu";
     return "elf_i386";
   case llvm::Triple::aarch64:
+    if (T.isOSManagarm())
+      return "aarch64managarm";
     return "aarch64linux";
   case llvm::Triple::aarch64_be:
     return "aarch64linuxb";
diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp
new file mode 100644
index 000000000000..ff455f2c6ec7
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/Managarm.cpp
@@ -0,0 +1,218 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Managarm.h"
+#include "Arch/ARM.h"
+#include "Arch/RISCV.h"
+#include "clang/Config/config.h"
+#include "clang/Driver/CommonArgs.h"
+#include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
+#include "clang/Driver/SanitizerArgs.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/Path.h"
+
+using namespace clang::driver;
+using namespace clang::driver::toolchains;
+using namespace clang;
+using namespace llvm::opt;
+
+using tools::addPathIfExists;
+
+std::string Managarm::getMultiarchTriple(const Driver &D,
+                                         const llvm::Triple &TargetTriple,
+                                         StringRef SysRoot) const {
+  switch (TargetTriple.getArch()) {
+  default:
+    return TargetTriple.str();
+  case llvm::Triple::x86_64:
+    return "x86_64-managarm-" + TargetTriple.getEnvironmentName().str();
+  case llvm::Triple::aarch64:
+    return "aarch64-managarm-" + TargetTriple.getEnvironmentName().str();
+  case llvm::Triple::riscv64:
+    return "riscv64-managarm-" + TargetTriple.getEnvironmentName().str();
+  }
+}
+
+static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) {
+  // It happens that only x86, PPC and SPARC use the 'lib32' variant of
+  // oslibdir, and using that variant while targeting other architectures causes
+  // problems because the libraries are laid out in shared system roots that
+  // can't cope with a 'lib32' library search path being considered. So we only
+  // enable them when we know we may need it.
+  //
+  // FIXME: This is a bit of a hack. We should really unify this code for
+  // reasoning about oslibdir spellings with the lib dir spellings in the
+  // GCCInstallationDetector, but that is a more significant refactoring.
+  if (Triple.getArch() == llvm::Triple::x86 || Triple.isPPC32() ||
+      Triple.getArch() == llvm::Triple::sparc)
+    return "lib32";
+
+  if (Triple.getArch() == llvm::Triple::x86_64 && Triple.isX32())
+    return "libx32";
+
+  if (Triple.getArch() == llvm::Triple::riscv32)
+    return "lib32";
+
+  return Triple.isArch32Bit() ? "lib" : "lib64";
+}
+
+Managarm::Managarm(const Driver &D, const llvm::Triple &Triple,
+                   const ArgList &Args)
+    : Generic_ELF(D, Triple, Args) {
+  GCCInstallation.init(Triple, Args);
+  Multilibs = GCCInstallation.getMultilibs();
+  SelectedMultilibs.assign({GCCInstallation.getMultilib()});
+  std::string SysRoot = computeSysRoot();
+
+  ToolChain::path_list &PPaths = getProgramPaths();
+
+  Generic_GCC::PushPPaths(PPaths);
+
+#ifdef ENABLE_LINKER_BUILD_ID
+  ExtraOpts.push_back("--build-id");
+#endif
+
+  // The selection of paths to try here is designed to match the patterns which
+  // the GCC driver itself uses, as this is part of the GCC-compatible driver.
+  // This was determined by running GCC in a fake filesystem, creating all
+  // possible permutations of these directories, and seeing which ones it added
+  // to the link paths.
+  path_list &Paths = getFilePaths();
+
+  const std::string OSLibDir = std::string(getOSLibDir(Triple, Args));
+  const std::string MultiarchTriple = getMultiarchTriple(D, Triple, SysRoot);
+
+  Generic_GCC::AddMultilibPaths(D, SysRoot, OSLibDir, MultiarchTriple, Paths);
+
+  addPathIfExists(D, concat(SysRoot, "/lib", MultiarchTriple), Paths);
+  addPathIfExists(D, concat(SysRoot, "/lib/..", OSLibDir), Paths);
+  addPathIfExists(D, concat(SysRoot, "/usr/lib", MultiarchTriple), Paths);
+  addPathIfExists(D, concat(SysRoot, "/usr", OSLibDir), Paths);
+
+  Generic_GCC::AddMultiarchPaths(D, SysRoot, OSLibDir, Paths);
+
+  addPathIfExists(D, concat(SysRoot, "/lib"), Paths);
+  addPathIfExists(D, concat(SysRoot, "/usr/lib"), Paths);
+}
+
+bool Managarm::HasNativeLLVMSupport() const { return true; }
+
+Tool *Managarm::buildLinker() const {
+  return new tools::gnutools::Linker(*this);
+}
+
+Tool *Managarm::buildAssembler() const {
+  return new tools::gnutools::Assembler(*this);
+}
+
+std::string Managarm::computeSysRoot() const {
+  if (!getDriver().SysRoot.empty())
+    return getDriver().SysRoot;
+  return std::string();
+}
+
+std::string Managarm::getDynamicLinker(const ArgList &Args) const {
+  switch (getTriple().getArch()) {
+  case llvm::Triple::aarch64:
+    return "/lib/aarch64-managarm/ld.so";
+  case llvm::Triple::riscv64: {
+    StringRef ABIName = tools::riscv::getRISCVABI(Args, getTriple());
+    return ("/lib/riscv64-managarm/ld-riscv64-" + ABIName + ".so").str();
+  }
+  case llvm::Triple::x86_64:
+    return "/lib/x86_64-managarm/ld.so";
+  default:
+    llvm_unreachable("unsupported architecture");
+  }
+}
+
+void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
+                                         ArgStringList &CC1Args) const {
+  const Driver &D = getDriver();
+  std::string SysRoot = computeSysRoot();
+
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+    return;
+
+  if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
+    addSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/local/include");
+
+  // Add 'include' in the resource directory, which is similar to
+  // GCC_INCLUDE_DIR (private headers) in GCC.
+  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
+    SmallString<128> ResourceDirInclude(D.ResourceDir);
+    llvm::sys::path::append(ResourceDirInclude, "include");
+    addSystemInclude(DriverArgs, CC1Args, ResourceDirInclude);
+  }
+
+  if (DriverArgs.hasArg(options::OPT_nostdlibinc))
+    return;
+
+  // TOOL_INCLUDE_DIR
+  AddMultilibIncludeArgs(DriverArgs, CC1Args);
+
+  // Check for configure-time C include directories.
+  StringRef CIncludeDirs(C_INCLUDE_DIRS);
+  if (CIncludeDirs != "") {
+    SmallVector<StringRef, 5> dirs;
+    CIncludeDirs.split(dirs, ":");
+    for (StringRef dir : dirs) {
+      StringRef Prefix =
+          llvm::sys::path::is_absolute(dir) ? StringRef(SysRoot) : "";
+      addExternCSystemInclude(DriverArgs, CC1Args, Prefix + dir);
+    }
+    return;
+  }
+
+  // On systems using multiarch, add /usr/include/$triple before
+  // /usr/include.
+  std::string MultiarchIncludeDir = getMultiarchTriple(D, getTriple(), SysRoot);
+  if (!MultiarchIncludeDir.empty())
+    addExternCSystemInclude(
+        DriverArgs, CC1Args,
+        concat(SysRoot, "/usr/include", MultiarchIncludeDir));
+
+  // Add an include of '/include' directly. This isn't provided by default by
+  // system GCCs, but is often used with cross-compiling GCCs, and harmless to
+  // add even when Clang is acting as-if it were a system compiler.
+  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/include"));
+
+  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/usr/include"));
+}
+
+void Managarm::addLibStdCxxIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  // We need a detected GCC installation on Managarm to provide libstdc++'s
+  // headers.
+  if (!GCCInstallation.isValid())
+    return;
+
+  StringRef TripleStr = GCCInstallation.getTriple().str();
+
+  // Try generic GCC detection.
+  Generic_GCC::addGCCLibStdCxxIncludePaths(DriverArgs, CC1Args, TripleStr);
+}
+
+SanitizerMask Managarm::getSupportedSanitizers() const {
+  const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64;
+  SanitizerMask Res = ToolChain::getSupportedSanitizers();
+  Res |= SanitizerKind::PointerCompare;
+  Res |= SanitizerKind::PointerSubtract;
+  Res |= SanitizerKind::KernelAddress;
+  Res |= SanitizerKind::Vptr;
+  if (IsX86_64)
+    Res |= SanitizerKind::KernelMemory;
+  return Res;
+}
+
+void Managarm::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
+  for (const auto &Opt : ExtraOpts)
+    CmdArgs.push_back(Opt.c_str());
+}
diff --git a/clang/lib/Driver/ToolChains/Managarm.h b/clang/lib/Driver/ToolChains/Managarm.h
new file mode 100644
index 000000000000..2082e2c615f2
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/Managarm.h
@@ -0,0 +1,55 @@
+//===--- Managarm.h - Managarm ToolChain Implementations --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
+
+#include "Gnu.h"
+#include "clang/Driver/ToolChain.h"
+
+namespace clang {
+namespace driver {
+namespace toolchains {
+
+class LLVM_LIBRARY_VISIBILITY Managarm : public Generic_ELF {
+public:
+  Managarm(const Driver &D, const llvm::Triple &Triple,
+           const llvm::opt::ArgList &Args);
+
+  bool HasNativeLLVMSupport() const override;
+
+  std::string getMultiarchTriple(const Driver &D,
+                                 const llvm::Triple &TargetTriple,
+                                 StringRef SysRoot) const override;
+
+  void
+  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &CC1Args) const override;
+  void
+  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                           llvm::opt::ArgStringList &CC1Args) const override;
+  SanitizerMask getSupportedSanitizers() const override;
+  std::string computeSysRoot() const override;
+
+  std::string getDynamicLinker(const llvm::opt::ArgList &Args) const override;
+
+  void addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const override;
+
+  std::vector<std::string> ExtraOpts;
+
+protected:
+  Tool *buildAssembler() const override;
+  Tool *buildLinker() const override;
+};
+
+} // end namespace toolchains
+} // end namespace driver
+} // end namespace clang
+
+#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp
index 641e3beebc08..3e22b4001bde 100644
--- a/clang/lib/Lex/InitHeaderSearch.cpp
+++ b/clang/lib/Lex/InitHeaderSearch.cpp
@@ -221,6 +221,7 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths(
   case llvm::Triple::Hurd:
   case llvm::Triple::Linux:
   case llvm::Triple::LiteOS:
+  case llvm::Triple::Managarm:
   case llvm::Triple::NaCl:
   case llvm::Triple::NetBSD:
   case llvm::Triple::OpenBSD:
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/managarm.cpp b/clang/test/Driver/managarm.cpp
new file mode 100644
index 000000000000..9c3f2d4d722a
--- /dev/null
+++ b/clang/test/Driver/managarm.cpp
@@ -0,0 +1,267 @@
+// UNSUPPORTED: system-windows
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-X86-64 %s
+// CHECK-X86-64:      "-cc1"
+// CHECK-X86-64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-X86-64-SAME: "-internal-externc-isystem"
+// CHECK-X86-64-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
+// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-X86-64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
+// CHECK-X86-64-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
+// CHECK-X86-64-SAME: "-L
+// CHECK-X86-64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-X86-64-LIBS %s
+// CHECK-X86-64-LIBS:      "-cc1"
+// CHECK-X86-64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT:[^"]+]]/include/c++/v1"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT:[^"]+]]/usr/local/include"
+// CHECK-X86-64-LIBS-SAME: "-internal-externc-isystem"
+// CHECK-X86-64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-X86-64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-LIBS-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
+// CHECK-X86-64-LIBS-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
+// CHECK-X86-64-LIBS-SAME: "-L
+// CHECK-X86-64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-X86-64-STATIC %s
+// CHECK-X86-64-STATIC:      "-cc1"
+// CHECK-X86-64-STATIC-SAME: "-static-define"
+// CHECK-X86-64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-X86-64-STATIC-SAME: "-internal-externc-isystem"
+// CHECK-X86-64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-X86-64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-STATIC-SAME: "-static"
+// CHECK-X86-64-STATIC-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o"
+// CHECK-X86-64-STATIC-SAME: "-L
+// CHECK-X86-64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-X86-64-SHARED %s
+// CHECK-X86-64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-SHARED-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o"
+// CHECK-X86-64-SHARED-SAME: "-L
+// CHECK-X86-64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-AARCH64 %s
+// CHECK-AARCH64:      "-cc1"
+// CHECK-AARCH64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-AARCH64-SAME: "-internal-externc-isystem"
+// CHECK-AARCH64-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-AARCH64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
+// CHECK-AARCH64-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
+// CHECK-AARCH64-SAME: {{^}} "-L
+// CHECK-AARCH64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-LIBS %s
+// CHECK-AARCH64-LIBS:      "-cc1"
+// CHECK-AARCH64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT:[^"]+]]/include/c++/v1"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT:[^"]+]]/usr/local/include"
+// CHECK-AARCH64-LIBS-SAME: "-internal-externc-isystem"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-AARCH64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-LIBS-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
+// CHECK-AARCH64-LIBS-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L
+// CHECK-AARCH64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-STATIC %s
+// CHECK-AARCH64-STATIC:      "-cc1"
+// CHECK-AARCH64-STATIC-SAME: "-static-define"
+// CHECK-AARCH64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-AARCH64-STATIC-SAME: "-internal-externc-isystem"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-AARCH64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-STATIC-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-static"
+// CHECK-AARCH64-STATIC-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L
+// CHECK-AARCH64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-SHARED %s
+// CHECK-AARCH64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-SHARED-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L
+// CHECK-AARCH64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-RISCV64 %s
+// CHECK-RISCV64:      "-cc1"
+// CHECK-RISCV64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-RISCV64-SAME: "-internal-externc-isystem"
+// CHECK-RISCV64-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-RISCV64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
+// CHECK-RISCV64-SAME: "-L
+// CHECK-RISCV64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-LIBS %s
+// CHECK-RISCV64-LIBS:      "-cc1"
+// CHECK-RISCV64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT:[^"]+]]/include/c++/v1"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT:[^"]+]]/usr/local/include"
+// CHECK-RISCV64-LIBS-SAME: "-internal-externc-isystem"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-RISCV64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-LIBS-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
+// CHECK-RISCV64-LIBS-SAME: "-L
+// CHECK-RISCV64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-STATIC %s
+// CHECK-RISCV64-STATIC:      "-cc1"
+// CHECK-RISCV64-STATIC-SAME: "-static-define"
+// CHECK-RISCV64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-RISCV64-STATIC-SAME: "-internal-externc-isystem"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-RISCV64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-STATIC-SAME: "-static"
+// CHECK-RISCV64-STATIC-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o"
+// CHECK-RISCV64-STATIC-SAME: "-L
+// CHECK-RISCV64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-SHARED %s
+// CHECK-RISCV64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-SHARED-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o"
+// CHECK-RISCV64-SHARED-SAME: "-L
+// CHECK-RISCV64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 031a6c1a755b..bed39dc3e34d 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1622,6 +1622,11 @@
 // RUN: %clang_cc1 -x c -std=c99 -E -dM -ffreestanding -triple=amd64-unknown-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD-STDC-N %s
 // OPENBSD-STDC-N-NOT:#define __STDC_NO_THREADS__ 1
 //
+// RUN: %clang_cc1 -triple=aarch64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
+// RUN: %clang_cc1 -triple=riscv64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
+// RUN: %clang_cc1 -triple=x86_64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
+// MANAGARM: #define __managarm__ 1
+
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=xcore-none-none < /dev/null | FileCheck -match-full-lines -check-prefix XCORE %s
 // XCORE:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
 // XCORE:#define __LITTLE_ENDIAN__ 1
diff --git a/clang/test/Preprocessor/predefined-macros-no-warnings.c b/clang/test/Preprocessor/predefined-macros-no-warnings.c
index 4e3e29ccfa8a..fe27ed8814ee 100644
--- a/clang/test/Preprocessor/predefined-macros-no-warnings.c
+++ b/clang/test/Preprocessor/predefined-macros-no-warnings.c
@@ -14,6 +14,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux-openhos
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-netbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-openbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-win32-gnu
@@ -108,6 +109,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux-openhos
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-solaris
@@ -167,6 +169,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-nacl
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps4
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps5
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir64
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv32

From 06e08f38e1ec57bf3cb5e08569b52eb6a3c3e166 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 20 Jun 2025 15:43:39 +0900
Subject: [PATCH 1002/1322] AArch64: Use reportFatalUsageError for unsupported
 calling conv (#144840)

This probably should emit a DiagnosticInfoUnsupported and use the
default calling convention instead, but then we would need to pass
in the context.

Also move where CCAssignFnForCall is called. It was unnecessarily
called for each argument, so the error wouldn't trigger for functions
with 0 arguments.

This only ensures the error occurs for functions defined with the
calling convention. The error is still missed for outgoing calls
with no arguments. The lowering logic here is convoluted, calling
CCAssignFnForCall for each argument and it does not mirror
LowerFormalArguments so I'm not sure what's going on here.
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  | 14 +++++++++-----
 llvm/test/CodeGen/AArch64/unsupported-cc-call.ll |  8 ++++++++
 llvm/test/CodeGen/AArch64/unsupported-cc-func.ll |  6 ++++++
 llvm/test/MC/AArch64/coff-function-type-info.ll  |  6 +++---
 4 files changed, 26 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/unsupported-cc-call.ll
 create mode 100644 llvm/test/CodeGen/AArch64/unsupported-cc-func.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 581f15277602..69f8c6c21a55 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7632,7 +7632,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                      bool IsVarArg) const {
   switch (CC) {
   default:
-    report_fatal_error("Unsupported calling convention.");
+    reportFatalUsageError("unsupported calling convention");
   case CallingConv::GHC:
     return CC_AArch64_GHC;
   case CallingConv::PreserveNone:
@@ -7741,6 +7741,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   unsigned NumArgs = Ins.size();
   Function::const_arg_iterator CurOrigArg = F.arg_begin();
   unsigned CurArgIdx = 0;
+  bool UseVarArgCC = false;
+  if (IsWin64)
+    UseVarArgCC = isVarArg;
+
+  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
+
   for (unsigned i = 0; i != NumArgs; ++i) {
     MVT ValVT = Ins[i].VT;
     if (Ins[i].isOrigArg()) {
@@ -7757,10 +7763,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       else if (ActualMVT == MVT::i16)
         ValVT = MVT::i16;
     }
-    bool UseVarArgCC = false;
-    if (IsWin64)
-      UseVarArgCC = isVarArg;
-    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
     bool Res =
         AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
     assert(!Res && "Call operand has unhandled type");
@@ -8429,6 +8431,8 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
         ArgVT = MVT::i16;
     }
 
+    // FIXME: CCAssignFnForCall should be called once, for the call and not per
+    // argument. This logic should exactly mirror LowerFormalArguments.
     CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
     bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
     assert(!Res && "Call operand has unhandled type");
diff --git a/llvm/test/CodeGen/AArch64/unsupported-cc-call.ll b/llvm/test/CodeGen/AArch64/unsupported-cc-call.ll
new file mode 100644
index 000000000000..3e03ac81baf5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/unsupported-cc-call.ll
@@ -0,0 +1,8 @@
+; FIXME: This should error:
+; RUN: llc -mtriple=aarch64-- -filetype=null %s
+declare amdgpu_gfx void @amdgpu_gfx_func()
+
+define void @call_amdgpu_gfx_func() {
+  call amdgpu_gfx void @amdgpu_gfx_func()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/unsupported-cc-func.ll b/llvm/test/CodeGen/AArch64/unsupported-cc-func.ll
new file mode 100644
index 000000000000..2d0fb12af0d4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/unsupported-cc-func.ll
@@ -0,0 +1,6 @@
+; RUN: not llc -mtriple=aarch64-- -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: unsupported calling convention
+define amdgpu_gfx void @amdgpu_gfx_func_definition() {
+  ret void
+}
diff --git a/llvm/test/MC/AArch64/coff-function-type-info.ll b/llvm/test/MC/AArch64/coff-function-type-info.ll
index 21446c50ab27..63cac59adef1 100644
--- a/llvm/test/MC/AArch64/coff-function-type-info.ll
+++ b/llvm/test/MC/AArch64/coff-function-type-info.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple arm64-windows -filetype asm -o - %s \
+    ; RUN: llc -mtriple arm64-windows -filetype asm -o - %s \
 ; RUN:    | FileCheck %s -check-prefix CHECK-ASM
 
 ; RUN: llc -mtriple arm64-windows -filetype obj -o - %s \
 ; RUN:    | llvm-readobj --symbols - | FileCheck %s -check-prefix CHECK-OBJECT
 
-define arm_aapcs_vfpcc void @external() {
+define aarch64_vector_pcs void @external() {
 entry:
   ret void
 }
@@ -15,7 +15,7 @@ entry:
 ; CHECK-ASM: .endef
 ; CHECK-ASM: .globl external
 
-define internal arm_aapcs_vfpcc void @internal() {
+define internal aarch64_vector_pcs void @internal() {
 entry:
   ret void
 }

From 090f409538d2b426f11ce5aa22af8c243099aecf Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Fri, 20 Jun 2025 06:47:44 +0000
Subject: [PATCH 1003/1322] [gn build] Port 0f302f38b001

---
 llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
index e929e0088384..98c78dc15b32 100644
--- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
@@ -84,6 +84,7 @@ static_library("Driver") {
     "ToolChains/Linux.cpp",
     "ToolChains/MSP430.cpp",
     "ToolChains/MSVC.cpp",
+    "ToolChains/Managarm.cpp",
     "ToolChains/MinGW.cpp",
     "ToolChains/MipsLinux.cpp",
     "ToolChains/NaCl.cpp",

From d196124dd22391f6c967ed569b34632840536c45 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 20 Jun 2025 09:16:39 +0200
Subject: [PATCH 1004/1322] [PredicateInfo] Remove unnecessary EdgeUsesOnly set
 (NFC) (#144912)

As far as I can tell, this set is pointless: It just represents whether
the target block has multiple predecessors, and the way it is
constructed and queried, we're not even reducing the number of
getSinglePredecessor() calls or something like that.
---
 llvm/lib/Transforms/Utils/PredicateInfo.cpp | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 4c87babbfb6f..4a0faab00cca 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -253,10 +253,6 @@ class PredicateInfoBuilder {
   // whether it returned a valid result.
   DenseMap<Value *, unsigned int> ValueInfoNums;
 
-  // The set of edges along which we can only handle phi uses, due to critical
-  // edges.
-  DenseSet<std::pair<BasicBlock *, BasicBlock *>> EdgeUsesOnly;
-
   ValueInfo &getOrCreateValueInfo(Value *);
   const ValueInfo &getValueInfo(Value *) const;
 
@@ -459,8 +455,6 @@ void PredicateInfoBuilder::processBranch(
           PredicateBase *PB =
               new PredicateBranch(V, BranchBB, Succ, Cond, TakenEdge);
           addInfoFor(OpsToRename, V, PB);
-          if (!Succ->getSinglePredecessor())
-            EdgeUsesOnly.insert({BranchBB, Succ});
         }
       }
     }
@@ -487,8 +481,6 @@ void PredicateInfoBuilder::processSwitch(
       PredicateSwitch *PS = new PredicateSwitch(
           Op, SI->getParent(), TargetBlock, C.getCaseValue(), SI);
       addInfoFor(OpsToRename, Op, PS);
-      if (!TargetBlock->getSinglePredecessor())
-        EdgeUsesOnly.insert({BranchBB, TargetBlock});
     }
   }
 }
@@ -637,7 +629,7 @@ void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
         // block, and handle it specially. We know that it goes last, and only
         // dominate phi uses.
         auto BlockEdge = getBlockEdge(PossibleCopy);
-        if (EdgeUsesOnly.count(BlockEdge)) {
+        if (!BlockEdge.second->getSinglePredecessor()) {
           VD.LocalNum = LN_Last;
           auto *DomNode = DT.getNode(BlockEdge.first);
           if (DomNode) {

From cbb5e244f7564091f9169f525fd8456e68bc028a Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 20 Jun 2025 09:35:11 +0200
Subject: [PATCH 1005/1322] [PredicateInfo] Remove redundant EdgeOnly member
 (NFC)

EdgeOnly indicates a phi def, which can already be identified by
LN_Last with non-null PInfo. Most of the code already reasons in
terms of LN_Last instead of EdgeOnly.
---
 llvm/lib/Transforms/Utils/PredicateInfo.cpp | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 4a0faab00cca..778287bb41b6 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -89,9 +89,7 @@ struct ValueDFS {
   // Only one of Def or Use will be set.
   Value *Def = nullptr;
   Use *U = nullptr;
-  // Neither PInfo nor EdgeOnly participate in the ordering
   PredicateBase *PInfo = nullptr;
-  bool EdgeOnly = false;
 };
 
 // Perform a strict weak ordering on instructions and arguments.
@@ -289,10 +287,11 @@ bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
     return false;
   // If it's a phi only use, make sure it's for this phi node edge, and that the
   // use is in a phi node.  If it's anything else, and the top of the stack is
-  // EdgeOnly, we need to pop the stack.  We deliberately sort phi uses next to
-  // the defs they must go with so that we can know it's time to pop the stack
-  // when we hit the end of the phi uses for a given def.
-  if (Stack.back().EdgeOnly) {
+  // a LN_Last def, we need to pop the stack.  We deliberately sort phi uses
+  // next to the defs they must go with so that we can know it's time to pop
+  // the stack when we hit the end of the phi uses for a given def.
+  const ValueDFS &Top = Stack.back();
+  if (Top.LocalNum == LN_Last && Top.PInfo) {
     if (!VDUse.U)
       return false;
     auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
@@ -300,15 +299,14 @@ bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
       return false;
     // Check edge
     BasicBlock *EdgePred = PHI->getIncomingBlock(*VDUse.U);
-    if (EdgePred != getBranchBlock(Stack.back().PInfo))
+    if (EdgePred != getBranchBlock(Top.PInfo))
       return false;
 
     // Use dominates, which knows how to handle edge dominance.
-    return DT.dominates(getBlockEdge(Stack.back().PInfo), *VDUse.U);
+    return DT.dominates(getBlockEdge(Top.PInfo), *VDUse.U);
   }
 
-  return (VDUse.DFSIn >= Stack.back().DFSIn &&
-          VDUse.DFSOut <= Stack.back().DFSOut);
+  return VDUse.DFSIn >= Top.DFSIn && VDUse.DFSOut <= Top.DFSOut;
 }
 
 void PredicateInfoBuilder::popStackUntilDFSScope(ValueDFSStack &Stack,
@@ -636,7 +634,6 @@ void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
             VD.DFSIn = DomNode->getDFSNumIn();
             VD.DFSOut = DomNode->getDFSNumOut();
             VD.PInfo = PossibleCopy;
-            VD.EdgeOnly = true;
             OrderedUses.push_back(VD);
           }
         } else {

From 1cae21da47b1f53c3946534b12507a035fb283d2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 20 Jun 2025 16:43:39 +0900
Subject: [PATCH 1006/1322] AMDGPU: Remove legacy PM version of
 AMDGPUPromoteAllocaToVector (#144986)

This is only run in the middle end with the new pass manager now,
so garbage collect the old PM version.
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  4 --
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 40 -------------------
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  1 -
 3 files changed, 45 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5a917734e9c7..0ad18c38213f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -242,10 +242,6 @@ FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaID;
 
-FunctionPass *createAMDGPUPromoteAllocaToVector();
-void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
-extern char &AMDGPUPromoteAllocaToVectorID;
-
 struct AMDGPUPromoteAllocaPass : PassInfoMixin<AMDGPUPromoteAllocaPass> {
   AMDGPUPromoteAllocaPass(TargetMachine &TM) : TM(TM) {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 700dc87d2f82..e0f3c72890b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -167,34 +167,6 @@ public:
   }
 };
 
-class AMDGPUPromoteAllocaToVector : public FunctionPass {
-public:
-  static char ID;
-
-  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
-
-  bool runOnFunction(Function &F) override {
-    if (skipFunction(F))
-      return false;
-    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
-      return AMDGPUPromoteAllocaImpl(
-                 TPC->getTM<TargetMachine>(),
-                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
-          .run(F, /*PromoteToLDS*/ false);
-    return false;
-  }
-
-  StringRef getPassName() const override {
-    return "AMDGPU Promote Alloca to vector";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    AU.addRequired<LoopInfoWrapperPass>();
-    FunctionPass::getAnalysisUsage(AU);
-  }
-};
-
 static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
                             const Function &F) {
   if (!TM.getTargetTriple().isAMDGCN())
@@ -216,7 +188,6 @@ static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
 } // end anonymous namespace
 
 char AMDGPUPromoteAlloca::ID = 0;
-char AMDGPUPromoteAllocaToVector::ID = 0;
 
 INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                       "AMDGPU promote alloca to vector or LDS", false, false)
@@ -227,14 +198,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                     "AMDGPU promote alloca to vector or LDS", false, false)
 
-INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
-                      "AMDGPU promote alloca to vector", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
-                    "AMDGPU promote alloca to vector", false, false)
-
 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
-char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
 
 PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
@@ -264,10 +228,6 @@ FunctionPass *llvm::createAMDGPUPromoteAlloca() {
   return new AMDGPUPromoteAlloca();
 }
 
-FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
-  return new AMDGPUPromoteAllocaToVector();
-}
-
 static void collectAllocaUses(AllocaInst &Alloca,
                               SmallVectorImpl<Use *> &Uses) {
   SmallVector<Instruction *, 4> WorkList({&Alloca});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f390d39043ed..074beccef33c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -529,7 +529,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
   initializeAMDGPURegBankCombinerPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
-  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
   initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);

From c361bffa50f1ed790c393ffbab39c2e07dfcb242 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 20 Jun 2025 16:44:06 +0900
Subject: [PATCH 1007/1322] AMDGPU: Remove legacy pass manager version of
 AMDGPUUnifyMetadata (#144985)

This is only run in the new pass manager now.
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  4 ---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  1 -
 .../lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 27 +------------------
 3 files changed, 1 insertion(+), 31 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0ad18c38213f..68a3caf59544 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -439,10 +439,6 @@ struct AMDGPUPrintfRuntimeBindingPass
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
 
-ModulePass* createAMDGPUUnifyMetadataPass();
-void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
-extern char &AMDGPUUnifyMetadataID;
-
 struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 074beccef33c..1ea7dd01d15c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -537,7 +537,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
-  initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
   initializeAMDGPUInsertDelayAluLegacyPass(*PR);
   initializeSIInsertHardClausesLegacyPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index cd7866b86d55..e400491c3860 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -32,17 +32,6 @@ namespace {
 
   } // end namespace kOCLMD
 
-  /// Unify multiple OpenCL metadata due to linking.
-  class AMDGPUUnifyMetadata : public ModulePass {
-  public:
-    static char ID;
-
-    explicit AMDGPUUnifyMetadata() : ModulePass(ID) {}
-
-  private:
-    bool runOnModule(Module &M) override;
-  };
-
     /// Unify version metadata.
     /// \return true if changes are made.
     /// Assume the named metadata has operands each of which is a pair of
@@ -104,6 +93,7 @@ namespace {
     return true;
   }
 
+  /// Unify multiple OpenCL metadata due to linking.
   bool unifyMetadataImpl(Module &M) {
     const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer};
     const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat,
@@ -122,21 +112,6 @@ namespace {
 
   } // end anonymous namespace
 
-  char AMDGPUUnifyMetadata::ID = 0;
-
-  char &llvm::AMDGPUUnifyMetadataID = AMDGPUUnifyMetadata::ID;
-
-  INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata",
-                  "Unify multiple OpenCL metadata due to linking", false, false)
-
-  ModulePass *llvm::createAMDGPUUnifyMetadataPass() {
-    return new AMDGPUUnifyMetadata();
-  }
-
-  bool AMDGPUUnifyMetadata::runOnModule(Module &M) {
-    return unifyMetadataImpl(M);
-  }
-
   PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {
     return unifyMetadataImpl(M) ? PreservedAnalyses::none()

From 8973be462c49a7b0a24c61f41e07a721706b1ad8 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 20 Jun 2025 09:46:41 +0200
Subject: [PATCH 1008/1322] [PredicateInfo] Avoid duplicate stack in scope
 check (NFC)

popStackUntilDFSScope() is going to check this itself, there is
no need to do it in advance as well.
---
 llvm/lib/Transforms/Utils/PredicateInfo.cpp | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 778287bb41b6..9b239d9161e7 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -283,8 +283,7 @@ public:
 
 bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
                                           const ValueDFS &VDUse) const {
-  if (Stack.empty())
-    return false;
+  assert(!Stack.empty() && "Should not be called with empty stack");
   // If it's a phi only use, make sure it's for this phi node edge, and that the
   // use is in a phi node.  If it's anything else, and the top of the stack is
   // a LN_Last def, we need to pop the stack.  We deliberately sort phi uses
@@ -677,22 +676,18 @@ void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
       LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
                         << VD.DFSOut << ")\n");
 
-      bool ShouldPush = (VD.Def || PossibleCopy);
-      bool OutOfScope = !stackIsInScope(RenameStack, VD);
-      if (OutOfScope || ShouldPush) {
-        // Sync to our current scope.
-        popStackUntilDFSScope(RenameStack, VD);
-        if (ShouldPush) {
-          RenameStack.push_back(VD);
-        }
+      // Sync to our current scope.
+      popStackUntilDFSScope(RenameStack, VD);
+
+      if (VD.Def || PossibleCopy) {
+        RenameStack.push_back(VD);
+        continue;
       }
+
       // If we get to this point, and the stack is empty we must have a use
       // with no renaming needed, just skip it.
       if (RenameStack.empty())
         continue;
-      // Skip values, only want to rename the uses
-      if (VD.Def || PossibleCopy)
-        continue;
       if (!DebugCounter::shouldExecute(RenameCounter)) {
         LLVM_DEBUG(dbgs() << "Skipping execution due to debug counter\n");
         continue;

From bc14e5e5e9c717a7699cf10b13a7661a9e033594 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Fri, 20 Jun 2025 01:05:43 -0700
Subject: [PATCH 1009/1322] [AMDGPU] Add trivial gfx1250 runlines to MC tests.
 NFC. (#144988)

---
 llvm/test/MC/AMDGPU/gfx12_asm_smem.s                            | 1 +
 llvm/test/MC/AMDGPU/gfx12_asm_sopk.s                            | 1 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s                      | 1 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s                       | 1 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s                      | 2 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s                       | 2 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s            | 1 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s             | 1 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s            | 2 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s             | 2 ++
 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt            | 1 +
 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt       | 2 ++
 .../MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt  | 2 ++
 .../MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt   | 2 ++
 14 files changed, 21 insertions(+)

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
index 2ef027459fa6..f4086e81d5c6 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
@@ -1,4 +1,5 @@
 // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefix=GFX12 %s
 
 //===----------------------------------------------------------------------===//
 // ENC_SMEM.
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s
index 5ce6847b9dca..4e3e725a0055 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s
@@ -1,4 +1,5 @@
 // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefix=GFX12 %s
 
 s_movk_i32 s0, 0x1234
 // GFX12: encoding: [0x34,0x12,0x00,0xb0]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
index 9ccdd2f604cb..06ccf098bd60 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
@@ -1,6 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
 
 v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
index 61ee43dc9d16..d28d00da1910 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
@@ -1,6 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
 
 v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s
index d4378ae0a210..3b10481d5e72 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s
@@ -3,6 +3,8 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-wavefrontsize32,+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 
 v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0]
 // W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s
index a81cd58acf03..235a6ba3dbb9 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s
@@ -3,6 +3,8 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-wavefrontsize32,+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 
 v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
 // W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
index 6d07f299d1d6..292419949aef 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s
@@ -1,5 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
 
 v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX12: v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
index d2fc865cc8b4..c61fa9aff244 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s
@@ -1,5 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
 
 v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
index ee28cc85bc5c..8fea02c9029e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
@@ -3,6 +3,8 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-wavefrontsize32,+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 
 v_add_co_ci_u32_e64_dpp v5, s6, v1, v2, s3 quad_perm:[3,2,1,0]
 // W32: v_add_co_ci_u32_e64_dpp v5, s6, v1, v2, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x20,0xd5,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
index 2bdad0875e00..a8e7676a3cb9 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
@@ -3,6 +3,8 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-wavefrontsize32,+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 
 v_add_co_ci_u32_e64_dpp v5, s6, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0]
 // W32: v_add_co_ci_u32_e64_dpp v5, s6, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x20,0xd5,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt
index 02641e6eb97f..328fdbca10f3 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt
@@ -1,4 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
 
 # GFX12: s_load_i8 s5, s[2:3], s0 offset:0x0            ; encoding: [0x41,0x01,0x01,0xf4,0x00,0x00,0x00,0x00]
 0x41,0x01,0x01,0xf4,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
index 7f9b268440cf..7a7be57e17e7 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt
@@ -3,6 +3,8 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05
 # GFX12: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
index a020b0ae46a3..32eb7711089e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_bfrev_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xb8,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
index ad3c673b4e39..13a58aa6b415 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX12: v_bfrev_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xb8,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]

From b6b8fa3b15d334c51fcf8763ccda0102a01aeb9c Mon Sep 17 00:00:00 2001
From: int-zjt <zhangjiatong.0@bytedance.com>
Date: Fri, 20 Jun 2025 16:24:19 +0800
Subject: [PATCH 1010/1322] [llvm-cov][gcov] Support multi-files coverage in
 one basic block (#144504)

In the current gcov implementation, all lines within a basic block are
attributed to the source file of the block's containing function. This
is inaccurate when a block contains lines from other files (e.g., via
#include "foo.inc").

Commit
[406e81b](https://github.com/llvm/llvm-project/commit/406e81b79d26dae6838cc69d10a3e22635da09ef)
attempted to address this by filtering lines based on debug info types,
but this approach has two limitations:

* **Over-filtering**: Some valid lines belonging to the function are
incorrectly excluded.
* **Under-counting**: Lines not belonging to the function are filtered
out and omitted from coverage statistics.

**GCC Reference Behavior**
GCC's gcov implementation handles this case correctly.This change aligns
the LLVM behavior with GCC.

**Proposed Solution**
1. **GCNO Generation**:

* **Current**: Each block stores a single GCOVLines record (filename +
lines).

* **New**: Dynamically create new GCOVLines records whenever consecutive
lines in a block originate from different source files. Group subsequent
lines from the same file under one record.

2. **GCNO Parsing**:

* **Current**: Lines are directly attributed to the function's source
file.

* **New**: Introduce a GCOVLocation type to track filename/line mappings
within blocks. Statistics will reflect the actual source file for each
line.
---
 .../profile/Posix/gcov-file-change-line.cpp   | 15 +++++++
 .../test/profile/Posix/gcov-file-change.cpp   | 12 +++--
 llvm/include/llvm/ProfileData/GCOV.h          | 20 +++++++--
 llvm/lib/ProfileData/GCOV.cpp                 | 45 +++++++++++--------
 .../Instrumentation/GCOVProfiling.cpp         | 37 +++++++--------
 5 files changed, 80 insertions(+), 49 deletions(-)
 create mode 100644 compiler-rt/test/profile/Posix/gcov-file-change-line.cpp

diff --git a/compiler-rt/test/profile/Posix/gcov-file-change-line.cpp b/compiler-rt/test/profile/Posix/gcov-file-change-line.cpp
new file mode 100644
index 000000000000..a750befb47e5
--- /dev/null
+++ b/compiler-rt/test/profile/Posix/gcov-file-change-line.cpp
@@ -0,0 +1,15 @@
+// RUN: rm -rf %t && split-file %s %t && cd %t
+// RUN: %clangxx --coverage main.cpp -o t
+// RUN: %run ./t
+// RUN: llvm-cov gcov -t t-main. | FileCheck %s
+
+//--- main.cpp
+#include <stdio.h>
+
+int main(int argc, char *argv[]) { // CHECK:      2: [[#]]:int main
+  puts("");                        // CHECK-NEXT: 2: [[#]]:
+#line 3
+  puts(""); // line 3
+  return 0; // line 4
+}
+// CHECK-NOT:  {{^ +[0-9]+:}}
diff --git a/compiler-rt/test/profile/Posix/gcov-file-change.cpp b/compiler-rt/test/profile/Posix/gcov-file-change.cpp
index 9d3bc79591f2..0cef1c3512f8 100644
--- a/compiler-rt/test/profile/Posix/gcov-file-change.cpp
+++ b/compiler-rt/test/profile/Posix/gcov-file-change.cpp
@@ -16,8 +16,8 @@ inline auto *const inl_var_main = // CHECK:      1: [[#]]:inline auto
 void foo(int x) {                 // CHECK-NEXT: 1: [[#]]:
   if (x) {                        // CHECK-NEXT: 1: [[#]]:
 #include "a.inc"
-  }
-}
+  } // CHECK:      1: [[#]]:
+} // CHECK-NEXT: 1: [[#]]:
 // CHECK-NOT:  {{^ +[0-9]+:}}
 
 int main(int argc, char *argv[]) { // CHECK:      1: [[#]]:int main
@@ -32,10 +32,8 @@ int main(int argc, char *argv[]) { // CHECK:      1: [[#]]:int main
 //--- a.h
 /// Apple targets doesn't enable -mconstructor-aliases by default and the count may be 4.
 struct A { A() { } };              // CHECK:      {{[24]}}: [[#]]:struct A
-inline auto *const inl_var_a =
-    new A;
-/// TODO a.inc:1 should have line execution.
-// CHECK-NOT:  {{^ +[0-9]+:}}
+inline auto *const inl_var_a =     // CHECK-NEXT: 1: [[#]]:
+    new A;                         // CHECK-NEXT: 1: [[#]]:
 
 //--- a.inc
-puts("");
+puts(""); // CHECK:      1: [[#]]:puts
diff --git a/llvm/include/llvm/ProfileData/GCOV.h b/llvm/include/llvm/ProfileData/GCOV.h
index 0dc33d062e4f..bc407182a094 100644
--- a/llvm/include/llvm/ProfileData/GCOV.h
+++ b/llvm/include/llvm/ProfileData/GCOV.h
@@ -271,6 +271,14 @@ public:
   DenseSet<const GCOVBlock *> visited;
 };
 
+/// Represent file of lines same with block_location_info in gcc.
+struct GCOVBlockLocation {
+  GCOVBlockLocation(unsigned idx) : srcIdx(idx) {}
+
+  unsigned srcIdx;
+  SmallVector<uint32_t, 4> lines;
+};
+
 /// GCOVBlock - Collects block information.
 class GCOVBlock {
 public:
@@ -281,8 +289,13 @@ public:
 
   GCOVBlock(uint32_t N) : number(N) {}
 
-  void addLine(uint32_t N) { lines.push_back(N); }
-  uint32_t getLastLine() const { return lines.back(); }
+  void addLine(uint32_t N) {
+    locations.back().lines.push_back(N);
+    lastLine = N;
+  }
+  void addFile(unsigned fileIdx) { locations.emplace_back(fileIdx); }
+
+  uint32_t getLastLine() const { return lastLine; }
   uint64_t getCount() const { return count; }
 
   void addSrcEdge(GCOVArc *Edge) { pred.push_back(Edge); }
@@ -311,7 +324,8 @@ public:
   uint64_t count = 0;
   SmallVector<GCOVArc *, 2> pred;
   SmallVector<GCOVArc *, 2> succ;
-  SmallVector<uint32_t, 4> lines;
+  SmallVector<GCOVBlockLocation> locations;
+  uint32_t lastLine = 0;
   bool traversable = false;
   GCOVArc *incoming = nullptr;
 };
diff --git a/llvm/lib/ProfileData/GCOV.cpp b/llvm/lib/ProfileData/GCOV.cpp
index ecb12c045b5b..7d0a243d0238 100644
--- a/llvm/lib/ProfileData/GCOV.cpp
+++ b/llvm/lib/ProfileData/GCOV.cpp
@@ -191,7 +191,7 @@ bool GCOVFile::readGCNO(GCOVBuffer &buf) {
           buf.readString(filename);
           if (filename.empty())
             break;
-          // TODO Unhandled
+          Block.addFile(addNormalizedPathToMap(filename));
         }
       }
     }
@@ -456,11 +456,13 @@ void GCOVBlock::print(raw_ostream &OS) const {
     }
     OS << "\n";
   }
-  if (!lines.empty()) {
-    OS << "\tLines : ";
-    for (uint32_t N : lines)
-      OS << (N) << ",";
-    OS << "\n";
+  if (!locations.empty()) {
+    for (const GCOVBlockLocation &loc : locations) {
+      OS << "\tFile: " << loc.srcIdx << ": ";
+      for (uint32_t N : loc.lines)
+        OS << (N) << ",";
+      OS << "\n";
+    }
   }
 }
 
@@ -701,20 +703,25 @@ void Context::collectFunction(GCOVFunction &f, Summary &summary) {
   SmallSet<uint32_t, 16> lines;
   SmallSet<uint32_t, 16> linesExec;
   for (const GCOVBlock &b : f.blocksRange()) {
-    if (b.lines.empty())
+    if (b.locations.empty())
       continue;
-    uint32_t maxLineNum = *llvm::max_element(b.lines);
-    if (maxLineNum >= si.lines.size())
-      si.lines.resize(maxLineNum + 1);
-    for (uint32_t lineNum : b.lines) {
-      LineInfo &line = si.lines[lineNum];
-      if (lines.insert(lineNum).second)
-        ++summary.lines;
-      if (b.count && linesExec.insert(lineNum).second)
-        ++summary.linesExec;
-      line.exists = true;
-      line.count += b.count;
-      line.blocks.push_back(&b);
+    for (const GCOVBlockLocation &loc : b.locations) {
+      SourceInfo &locSource = sources[loc.srcIdx];
+      uint32_t maxLineNum = *llvm::max_element(loc.lines);
+      if (maxLineNum >= locSource.lines.size())
+        locSource.lines.resize(maxLineNum + 1);
+      for (uint32_t lineNum : loc.lines) {
+        LineInfo &line = locSource.lines[lineNum];
+        line.exists = true;
+        line.count += b.count;
+        line.blocks.push_back(&b);
+        if (f.srcIdx == loc.srcIdx) {
+          if (lines.insert(lineNum).second)
+            ++summary.lines;
+          if (b.count && linesExec.insert(lineNum).second)
+            ++summary.linesExec;
+        }
+      }
     }
   }
 }
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 3dfb36f4f181..4c7b8c69c1bf 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -210,12 +210,12 @@ static StringRef getFunctionName(const DISubprogram *SP) {
   return SP->getName();
 }
 
-/// Extract a filename for a DISubprogram.
+/// Extract a filename for a DIScope.
 ///
 /// Prefer relative paths in the coverage notes. Clang also may split
 /// up absolute paths into a directory and filename component. When
 /// the relative path doesn't exist, reconstruct the absolute path.
-static SmallString<128> getFilename(const DISubprogram *SP) {
+static SmallString<128> getFilename(const DIScope *SP) {
   SmallString<128> Path;
   StringRef RelPath = SP->getFilename();
   if (sys::fs::exists(RelPath))
@@ -244,7 +244,9 @@ namespace {
   // list of line numbers and a single filename, representing lines that belong
   // to the block.
   class GCOVLines : public GCOVRecord {
-   public:
+  public:
+    const StringRef getFilename() { return Filename; }
+
     void addLine(uint32_t Line) {
       assert(Line != 0 && "Line zero is not a valid real line number.");
       Lines.push_back(Line);
@@ -276,7 +278,9 @@ namespace {
   class GCOVBlock : public GCOVRecord {
    public:
     GCOVLines &getFile(StringRef Filename) {
-      return LinesByFile.try_emplace(Filename, P, Filename).first->second;
+      if (Lines.empty() || Lines.back().getFilename() != Filename)
+        Lines.emplace_back(P, Filename);
+      return Lines.back();
     }
 
     void addEdge(GCOVBlock &Successor, uint32_t Flags) {
@@ -285,22 +289,16 @@ namespace {
 
     void writeOut() {
       uint32_t Len = 3;
-      SmallVector<StringMapEntry<GCOVLines> *, 32> SortedLinesByFile;
-      for (auto &I : LinesByFile) {
-        Len += I.second.length();
-        SortedLinesByFile.push_back(&I);
-      }
+
+      for (auto &L : Lines)
+        Len += L.length();
 
       write(GCOV_TAG_LINES);
       write(Len);
       write(Number);
 
-      llvm::sort(SortedLinesByFile, [](StringMapEntry<GCOVLines> *LHS,
-                                       StringMapEntry<GCOVLines> *RHS) {
-        return LHS->getKey() < RHS->getKey();
-      });
-      for (auto &I : SortedLinesByFile)
-        I->getValue().writeOut();
+      for (auto &L : Lines)
+        L.writeOut();
       write(0);
       write(0);
     }
@@ -309,7 +307,7 @@ namespace {
       // Only allow copy before edges and lines have been added. After that,
       // there are inter-block pointers (eg: edges) that won't take kindly to
       // blocks being copied or moved around.
-      assert(LinesByFile.empty());
+      assert(Lines.empty());
       assert(OutEdges.empty());
     }
 
@@ -322,7 +320,7 @@ namespace {
     GCOVBlock(GCOVProfiler *P, uint32_t Number)
         : GCOVRecord(P), Number(Number) {}
 
-    StringMap<GCOVLines> LinesByFile;
+    SmallVector<GCOVLines> Lines;
   };
 
   // A function has a unique identifier, a checksum (we leave as zero) and a
@@ -881,11 +879,10 @@ bool GCOVProfiler::emitProfileNotes(
           if (Line == Loc.getLine()) continue;
           Line = Loc.getLine();
           MDNode *Scope = Loc.getScope();
-          // TODO: Handle blocks from another file due to #line, #include, etc.
-          if (isa<DILexicalBlockFile>(Scope) || SP != getDISubprogram(Scope))
+          if (SP != getDISubprogram(Scope))
             continue;
 
-          GCOVLines &Lines = Block.getFile(Filename);
+          GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope()));
           Lines.addLine(Loc.getLine());
         }
         Line = 0;

From 874773635d31501ab21812c05c44caf281c1acc7 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Fri, 20 Jun 2025 10:00:55 +0100
Subject: [PATCH 1011/1322] [SLP] NFC: Simplify CandidateVFs initialization
 (#144882)

Also adds a comment to clarify the meaning of MaxRegVF.
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5eef2497cf90..1141c1b2babb 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21198,7 +21198,11 @@ bool SLPVectorizerPass::vectorizeStores(
         }
       }
 
+      // MaxRegVF represents the number of instructions (scalar, or vector in
+      // case of revec) that can be vectorized to naturally fit in a vector
+      // register.
       unsigned MaxRegVF = MaxVF;
+
       MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
       if (MaxVF < MinVF) {
         LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
@@ -21207,13 +21211,11 @@ bool SLPVectorizerPass::vectorizeStores(
         continue;
       }
 
-      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
-      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
-      unsigned Size = MinVF;
-      for (unsigned &VF : reverse(CandidateVFs)) {
-        VF = Size > MaxVF ? NonPowerOf2VF : Size;
-        Size *= 2;
-      }
+      SmallVector<unsigned> CandidateVFs;
+      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
+           VF = divideCeil(VF, 2))
+        CandidateVFs.push_back(VF);
+
       unsigned End = Operands.size();
       unsigned Repeat = 0;
       constexpr unsigned MaxAttempts = 4;

From 14e89b061fdecedcec4bb035060a56588610cb5c Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Fri, 20 Jun 2025 17:01:35 +0800
Subject: [PATCH 1012/1322] [C++20] [Modules] Add exported modules as
 transitive imported modules

Close https://github.com/llvm/llvm-project/issues/144230

The root cause of the problem is, when we decide the transitive imports,
we didn't deal with exported imports.
---
 clang/lib/Sema/SemaModule.cpp    | 13 +++++++++----
 clang/test/Modules/pr144230.cppm | 26 ++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Modules/pr144230.cppm

diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index 9fcaad48d305..54ee0486763b 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -136,6 +136,7 @@ makeTransitiveImportsVisible(ASTContext &Ctx, VisibleModuleSet &VisibleModules,
          "modules only.");
 
   llvm::SmallVector<Module *, 4> Worklist;
+  llvm::SmallSet<Module *, 16> Visited;
   Worklist.push_back(Imported);
 
   Module *FoundPrimaryModuleInterface =
@@ -144,18 +145,22 @@ makeTransitiveImportsVisible(ASTContext &Ctx, VisibleModuleSet &VisibleModules,
   while (!Worklist.empty()) {
     Module *Importing = Worklist.pop_back_val();
 
-    if (VisibleModules.isVisible(Importing))
+    if (Visited.count(Importing))
       continue;
+    Visited.insert(Importing);
 
     // FIXME: The ImportLoc here is not meaningful. It may be problematic if we
     // use the sourcelocation loaded from the visible modules.
     VisibleModules.setVisible(Importing, ImportLoc);
 
     if (isImportingModuleUnitFromSameModule(Ctx, Importing, CurrentModule,
-                                            FoundPrimaryModuleInterface))
+                                            FoundPrimaryModuleInterface)) {
       for (Module *TransImported : Importing->Imports)
-        if (!VisibleModules.isVisible(TransImported))
-          Worklist.push_back(TransImported);
+        Worklist.push_back(TransImported);
+
+      for (auto [Exports, _] : Importing->Exports)
+        Worklist.push_back(Exports);
+    }
   }
 }
 
diff --git a/clang/test/Modules/pr144230.cppm b/clang/test/Modules/pr144230.cppm
new file mode 100644
index 000000000000..7de9fc6461ab
--- /dev/null
+++ b/clang/test/Modules/pr144230.cppm
@@ -0,0 +1,26 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/P.cppm -emit-module-interface -o %t/M-P.pcm -fprebuilt-module-path=%t
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-module-interface -o %t/M.pcm -fprebuilt-module-path=%t
+// RUN: %clang_cc1 -std=c++20 %t/M.cpp -fprebuilt-module-path=%t -fsyntax-only -verify
+
+//--- A.cppm
+export module A;
+export using T = int;
+
+//--- P.cppm
+export module M:P;
+import A;
+
+//--- M.cppm
+export module M;
+export import :P;
+
+//--- M.cpp
+// expected-no-diagnostics
+module M;
+
+T x;

From f704782c63ac1d567422f66072bebd49c41723f9 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau@arm.com>
Date: Fri, 20 Jun 2025 10:07:48 +0100
Subject: [PATCH 1013/1322] [AArch64][SelectionDAG] Fix UDOT regression
 (#144907)

Fix broken check in AArch64ISelLowering for bailing from ZExt
optimizations when there is a partial reduction intrinsic.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 16 ++--
 .../neon-partial-reduce-dot-product.ll        | 80 ++++++++++++-------
 2 files changed, 59 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 69f8c6c21a55..8f208c4d006c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16869,14 +16869,14 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
     if (SrcWidth * 4 <= DstWidth) {
       if (all_of(I->users(), [&](auto *U) {
             auto *SingleUser = cast<Instruction>(&*U);
-            return (
-                match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))) ||
-                (match(SingleUser,
-                       m_Intrinsic<
-                           Intrinsic::experimental_vector_partial_reduce_add>(
-                           m_Value(), m_Specific(I))) &&
-                 !shouldExpandPartialReductionIntrinsic(
-                     cast<IntrinsicInst>(SingleUser))));
+            if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
+              return true;
+            if (match(SingleUser,
+                      m_Intrinsic<
+                          Intrinsic::experimental_vector_partial_reduce_add>(
+                          m_Value(), m_Specific(I))))
+              return true;
+            return false;
           }))
         return false;
     }
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 0ea80a075fae..0db4dc07fbfe 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -772,35 +772,57 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
 }
 
 define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
-; CHECK-COMMON-LABEL: udot_no_bin_op_in_loop:
-; CHECK-COMMON:       // %bb.0: // %entry
-; CHECK-COMMON-NEXT:    adrp x8, .LCPI16_0
-; CHECK-COMMON-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-COMMON-NEXT:    adrp x9, .LCPI16_2
-; CHECK-COMMON-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
-; CHECK-COMMON-NEXT:    adrp x8, .LCPI16_1
-; CHECK-COMMON-NEXT:    adrp x10, .LCPI16_3
-; CHECK-COMMON-NEXT:    ldr q3, [x8, :lo12:.LCPI16_1]
-; CHECK-COMMON-NEXT:    ldr q4, [x9, :lo12:.LCPI16_2]
-; CHECK-COMMON-NEXT:    ldr q5, [x10, :lo12:.LCPI16_3]
-; CHECK-COMMON-NEXT:    mov x8, xzr
-; CHECK-COMMON-NEXT:  .LBB16_1: // %vector.body
-; CHECK-COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-COMMON-NEXT:    ldr q6, [x0, x8]
-; CHECK-COMMON-NEXT:    mov v0.16b, v2.16b
-; CHECK-COMMON-NEXT:    add x8, x8, #16
-; CHECK-COMMON-NEXT:    cmp x8, #16
-; CHECK-COMMON-NEXT:    tbl v7.16b, { v6.16b }, v3.16b
-; CHECK-COMMON-NEXT:    tbl v16.16b, { v6.16b }, v4.16b
-; CHECK-COMMON-NEXT:    tbl v17.16b, { v6.16b }, v5.16b
-; CHECK-COMMON-NEXT:    tbl v6.16b, { v6.16b }, v1.16b
-; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v17.4s
-; CHECK-COMMON-NEXT:    add v7.4s, v16.4s, v7.4s
-; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v7.4s
-; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v6.4s
-; CHECK-COMMON-NEXT:    b.ne .LBB16_1
-; CHECK-COMMON-NEXT:  // %bb.2: // %end
-; CHECK-COMMON-NEXT:    ret
+; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB16_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    cmp x8, #16
+; CHECK-NODOT-NEXT:    ushll v3.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v3.4h
+; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v3.8h
+; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v2.4h
+; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB16_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_no_bin_op_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB16_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q3, [x0, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    udot v1.4s, v3.16b, v2.16b
+; CHECK-DOT-NEXT:    b.ne .LBB16_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    movi v2.16b, #1
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB16_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    udot v1.4s, v3.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB16_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 

From b85387dfe8e67ee8a142a1faf25325761c343577 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 20 Jun 2025 10:39:06 +0200
Subject: [PATCH 1014/1322] [SCCP] Check instruction type before querying
 PredicateInfo (NFC)

Do the cheap intrinsic check before the hash lookup for the
PredicateInfo.
---
 llvm/lib/Transforms/Utils/SCCPSolver.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index cc0bb4735c23..e1b1cf68cd9f 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -771,9 +771,9 @@ public:
 
     for (BasicBlock &BB : F) {
       for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
-        if (It->second->getPredicateInfoFor(&Inst)) {
-          if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
-            if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
+        if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
+          if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
+            if (It->second->getPredicateInfoFor(&Inst)) {
               Value *Op = II->getOperand(0);
               Inst.replaceAllUsesWith(Op);
               Inst.eraseFromParent();

From eb0f1dc00e5d0e591fe912c1aaf9dd9d01d94b8d Mon Sep 17 00:00:00 2001
From: Peter Smith <peter.smith@arm.com>
Date: Fri, 20 Jun 2025 10:11:42 +0100
Subject: [PATCH 1015/1322] [LLD][ELF] Include offset when adding Thunk symbols
 (#144995)

Include the offset of a thunk in the ThunkSection when adding symbols.

At Thunk creation time the offset is set to 0 as we don't know where in
the ThunkSection the Thunk will end up. The symbol values are updated by
the setOffset() call in assignOffsets().

When we transform a thunk from a short to a long, we sometimes add a
mapping symbol. At this point the offset of the thunk is non zero and we
need to account for that when defining the symbol, as the setOffset()
call subtracts the offset before adding the new one back in.

To test; added a second thunk that is converted to a long thunk to
aarch64-thunk-bit-multipass. This second thunk is given a non zero
offset from the start of the Thunk Section so we can observe the mapping
symbol being put in the wrong place without accounting for the offset.

fixes: https://github.com/llvm/llvm-project/issues/142326
---
 lld/ELF/Thunks.cpp                         |  3 ++-
 lld/test/ELF/aarch64-thunk-bti-multipass.s | 22 +++++++++++++++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp
index bad1b4b85735..c26ba76bccb7 100644
--- a/lld/ELF/Thunks.cpp
+++ b/lld/ELF/Thunks.cpp
@@ -601,7 +601,8 @@ public:
 
 Defined *Thunk::addSymbol(StringRef name, uint8_t type, uint64_t value,
                           InputSectionBase &section) {
-  Defined *d = addSyntheticLocal(ctx, name, type, value, /*size=*/0, section);
+  Defined *d =
+      addSyntheticLocal(ctx, name, type, value + offset, /*size=*/0, section);
   syms.push_back(d);
   return d;
 }
diff --git a/lld/test/ELF/aarch64-thunk-bti-multipass.s b/lld/test/ELF/aarch64-thunk-bti-multipass.s
index f2ff914fb850..f470d832843b 100644
--- a/lld/test/ELF/aarch64-thunk-bti-multipass.s
+++ b/lld/test/ELF/aarch64-thunk-bti-multipass.s
@@ -29,6 +29,7 @@
 _start:
 /// Call that requires a thunk.
  bl fn1
+ bl fn2
 /// padding so that the thunk for fn1 is placed after this section is
 /// sufficiently close to the target to be within short range, but only
 /// just so that a small displacement will mean a long thunk is needed.
@@ -39,6 +40,7 @@ _start:
 
 // CHECK-LABEL: <_start>:
 // CHECK-NEXT: 10001000: bl  0x10002008 <__AArch64AbsLongThunk_fn1>
+// CHECK-NEXT:           bl  0x10002018 <__AArch64AbsLongThunk_fn2>
 
 // CHECK-LABEL: <__AArch64AbsLongThunk_fn1>:
 // CHECK-NEXT: 10002008: ldr     x16, 0x10002010 <__AArch64AbsLongThunk_fn1+0x8>
@@ -46,6 +48,12 @@ _start:
 // CHECK-NEXT:           00 30 00 18    .word   0x18003000
 // CHECK-NEXT:           00 00 00 00    .word   0x00000000
 
+// CHECK-LABEL: <__AArch64AbsLongThunk_fn2>:
+// CHECK-NEXT: 10002018: ldr     x16, 0x10002020 <__AArch64AbsLongThunk_fn2+0x8>
+// CHECK-NEXT:           br      x16
+// CHECK-NEXT:           04 40 00 18    .word   0x18004004
+// CHECK-NEXT:           00 00 00 00    .word   0x00000000
+
 .section .text.1, "ax", %progbits
 .balign 0x1000
 .global farcall
@@ -75,6 +83,12 @@ farcall:
 fn1:
  ret
 
+.section .text.3, "ax", %progbits
+.global fn2
+.type fn2, %function
+fn2:
+ ret
+
 .section .text.far, "ax", %progbits
 .type far, %function
 .global far
@@ -88,6 +102,12 @@ far:
 // CHECK-LABEL: <fn1>:
 // CHECK-NEXT: 18004000: ret
 
+// CHECK-LABEL: <__AArch64BTIThunk_fn2>:
+// CHECK-NEXT: 18004004:       bti     c
+
+// CHECK-LABEL: <fn2>:
+// CHECK-NEXT: 18004008:       ret
+
 // CHECK-LABEL: <__AArch64BTIThunk_far>:
 // CHECK-NEXT: 30000000: bti     c
 
@@ -104,6 +124,6 @@ SECTIONS {
   .rodata 0x10000000 : { *(.note.gnu.property) } :low
   .text_low : { *(.text.0) } :low
   .text 0x18001000 : { *(.text.1) } :mid
-  .text_aligned : { *(.text.2) } :mid
+  .text_aligned : { *(.text.2) *(.text.3) } :mid
   .text_high 0x30000000 : { *(.text.far) } :high
 }

From 61972054f3fcaf59096799342bac9c93dd9aa432 Mon Sep 17 00:00:00 2001
From: Alexis Engelke <engelke@in.tum.de>
Date: Fri, 20 Jun 2025 11:23:00 +0200
Subject: [PATCH 1016/1322] [CodeGen] Limit number of analyzed predecessors

MachineBlockPlacement has quadratic runtime in the number of
predecessors: in some situation, for an edge, all predecessors of the
successor are considered.

Limit the number of considered predecessors to bound compile time for
large functions.

Pull Request: https://github.com/llvm/llvm-project/pull/142584
---
 llvm/lib/CodeGen/MachineBlockPlacement.cpp | 16 +++++++
 llvm/test/CodeGen/RISCV/branch.ll          | 49 ++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 08fe3d47e2ff..2dbabfe345d5 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -104,6 +104,12 @@ static cl::opt<unsigned> MaxBytesForAlignmentOverride(
              "alignment"),
     cl::init(0), cl::Hidden);
 
+static cl::opt<unsigned> PredecessorLimit(
+    "block-placement-predecessor-limit",
+    cl::desc("For blocks with more predecessors, certain layout optimizations"
+             "will be disabled to prevent quadratic compile time."),
+    cl::init(1000), cl::Hidden);
+
 // FIXME: Find a good default for this flag and remove the flag.
 static cl::opt<unsigned> ExitBlockBias(
     "block-placement-exit-block-bias",
@@ -1030,6 +1036,11 @@ bool MachineBlockPlacement::isTrellis(
   SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
 
   for (MachineBasicBlock *Succ : ViableSuccs) {
+    // Compile-time optimization: runtime is quadratic in the number of
+    // predecessors. For such uncommon cases, exit early.
+    if (Succ->pred_size() > PredecessorLimit)
+      return false;
+
     int PredCount = 0;
     for (auto *SuccPred : Succ->predecessors()) {
       // Allow triangle successors, but don't count them.
@@ -1472,6 +1483,11 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
   if (SuccChain.UnscheduledPredecessors == 0)
     return false;
 
+  // Compile-time optimization: runtime is quadratic in the number of
+  // predecessors. For such uncommon cases, exit early.
+  if (Succ->pred_size() > PredecessorLimit)
+    return false;
+
   // There are two basic scenarios here:
   // -------------------------------------
   // Case 1: triangular shape CFG (if-then):
diff --git a/llvm/test/CodeGen/RISCV/branch.ll b/llvm/test/CodeGen/RISCV/branch.ll
index 578080cd3a24..ed86ca8ca4dd 100644
--- a/llvm/test/CodeGen/RISCV/branch.ll
+++ b/llvm/test/CodeGen/RISCV/branch.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV32I %s
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -block-placement-predecessor-limit=10 < %s \
+; RUN:   | FileCheck -check-prefix=RV32I-MBPLIMIT %s
 
 define void @foo(i32 %a, ptr %b, i1 %c) nounwind {
 ; RV32I-LABEL: foo:
@@ -48,6 +50,53 @@ define void @foo(i32 %a, ptr %b, i1 %c) nounwind {
 ; RV32I-NEXT:    lw zero, 0(a1)
 ; RV32I-NEXT:  .LBB0_14: # %end
 ; RV32I-NEXT:    ret
+;
+; RV32I-MBPLIMIT-LABEL: foo:
+; RV32I-MBPLIMIT:       # %bb.0:
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    bne a3, a0, .LBB0_2
+; RV32I-MBPLIMIT-NEXT:  .LBB0_1: # %end
+; RV32I-MBPLIMIT-NEXT:    ret
+; RV32I-MBPLIMIT-NEXT:  .LBB0_2: # %test2
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    bne a3, a0, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.3: # %test3
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    blt a3, a0, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.4: # %test4
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    bge a3, a0, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.5: # %test5
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    bltu a3, a0, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.6: # %test6
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    bgeu a3, a0, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.7: # %test7
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    blt a0, a3, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.8: # %test8
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    bge a0, a3, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.9: # %test9
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    bltu a0, a3, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.10: # %test10
+; RV32I-MBPLIMIT-NEXT:    lw a3, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    bgeu a0, a3, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.11: # %test11
+; RV32I-MBPLIMIT-NEXT:    lw zero, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    andi a2, a2, 1
+; RV32I-MBPLIMIT-NEXT:    bnez a2, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.12: # %test12
+; RV32I-MBPLIMIT-NEXT:    lw a0, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    bgez a0, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.13: # %test13
+; RV32I-MBPLIMIT-NEXT:    lw a0, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    blez a0, .LBB0_1
+; RV32I-MBPLIMIT-NEXT:  # %bb.14: # %test14
+; RV32I-MBPLIMIT-NEXT:    lw zero, 0(a1)
+; RV32I-MBPLIMIT-NEXT:    ret
   %val1 = load volatile i32, ptr %b
   %tst1 = icmp eq i32 %val1, %a
   br i1 %tst1, label %end, label %test2

From f577516d91dc1ae5b9c8a3bcad81558bc19ccf65 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Fri, 20 Jun 2025 11:23:33 +0200
Subject: [PATCH 1017/1322] [mlir][arith] Add back ElementwiseMappable to
 `arith.trunci` (#145000)

This trait was accidentally dropped in #144863.
---
 mlir/include/mlir/Dialect/Arith/IR/ArithOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index 993f36f556e8..0518cac156eb 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -1271,7 +1271,7 @@ def Arith_ScalingExtFOp
 // TruncIOp
 //===----------------------------------------------------------------------===//
 
-def Arith_TruncIOp : Op<Arith_Dialect, "trunci",
+def Arith_TruncIOp : Arith_Op<"trunci",
     [Pure, SameOperandsAndResultShape, SameInputOutputTensorDims,
      DeclareOpInterfaceMethods<CastOpInterface>,
      DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,

From dd4776d429bd20050c80749f669367a0574520c8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 20 Jun 2025 18:26:56 +0900
Subject: [PATCH 1018/1322] AMDGPU: Remove AMDGPUInstrInfo class (#144984)

This was never constructed and only provided one static helper
function.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp         | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp         | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp            | 7 +------
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h              | 9 ++-------
 llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp     | 2 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp             | 2 +-
 7 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e52c2d7fde43..6e990cb2e160 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3936,7 +3936,7 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
   const auto *Ld = cast<LoadSDNode>(N);
 
   const MachineMemOperand *MMO = Ld->getMemOperand();
-  if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
+  if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO))
     return false;
 
   return MMO->getSize().hasValue() &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c51cc2a2fe52..134adc681215 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -848,7 +848,7 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(
        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
        (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
         MN->isInvariant())) &&
-      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
+      AMDGPU::isUniformMMO(MN->getMemOperand()))
     return false;
 
   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 627ac6b0063e..5085e86d71c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -22,17 +22,12 @@
 
 using namespace llvm;
 
-// Pin the vtable to this file.
-//void AMDGPUInstrInfo::anchor() {}
-
-AMDGPUInstrInfo::AMDGPUInstrInfo(const GCNSubtarget &ST) { }
-
 Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) {
   return I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
 }
 
 // TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
-bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
+bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) {
   const Value *Ptr = MMO->getValue();
   // UndefValue means this is a load of a kernel input.  These are uniform.
   // Sometimes LDS instructions have constant pointers.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index e00c02cfdb86..0eb00cbc2f46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -23,15 +23,10 @@ class GCNSubtarget;
 class MachineMemOperand;
 class MachineInstr;
 
-class AMDGPUInstrInfo {
-public:
-  explicit AMDGPUInstrInfo(const GCNSubtarget &st);
-
-  static bool isUniformMMO(const MachineMemOperand *MMO);
-};
-
 namespace AMDGPU {
 
+bool isUniformMMO(const MachineMemOperand *MMO);
+
 /// Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
 ///
 /// These opcodes have an Intrinsic::ID operand similar to a GIntrinsic. But
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index db3f6bd360b3..a60855cc4f2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -630,7 +630,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
   });
 
   Predicate isUniMMO([](const MachineInstr &MI) -> bool {
-    return AMDGPUInstrInfo::isUniformMMO(*MI.memoperands_begin());
+    return AMDGPU::isUniformMMO(*MI.memoperands_begin());
   });
 
   Predicate isConst([](const MachineInstr &MI) -> bool {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 4391a48ff2b6..dca55dafcc5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -463,7 +463,7 @@ bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
          (IsConst || !MMO->isVolatile()) &&
          // Memory must be known constant, or not written before this load.
          (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
-         AMDGPUInstrInfo::isUniformMMO(MMO);
+         AMDGPU::isUniformMMO(MMO);
 }
 
 RegisterBankInfo::InstructionMappings
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b1e77a282e41..07d79d677104 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10866,7 +10866,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
       (AS == AMDGPUAS::GLOBAL_ADDRESS &&
        Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
        isMemOpHasNoClobberedMemOperand(Load))) {
-    if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
+    if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
         Alignment >= Align(4) && NumElements < 32) {
       if (MemVT.isPow2VectorType() ||
           (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))

From f75973949b0e51eb4b3852c903d08bc72bf5d459 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81d=C3=A1m=20Kallai?= <kadam@inf.u-szeged.hu>
Date: Fri, 20 Jun 2025 11:40:35 +0200
Subject: [PATCH 1019/1322] [BOLT][AArch64] Add support for SPE brstack format
 (#129231)

Since Linux 6.14, Perf gained the ability to report SPE branch events
using the `brstack` format, which matches the layout of LBR/BRBE.

This patch reuses the existing LBR parsing logic to support SPE.

Example SPE brstack format:
```bash
perf script -i perf.data -F pid,brstack --itrace=bl
```
```
  PID       FROM / TO / PREDICTED

16984  0x72e342e5f4/0x72e36192d0/M/-/-/11/RET/-
16984  0x72e7b8b3b4/0x72e7b8b3b8/PN/-/-/11/COND/-
16984  0x72e7b92b48/0x72e7b92b4c/PN/-/-/8/COND/-
16984  0x72eacc6b7c/0x760cc94b00/P/-/-/9/RET/-
16984  0x72e3f210fc/0x72e3f21068/P/-/-/4//-
16984  0x72e39b8c5c/0x72e3627b24/P/-/-/4//-
16984  0x72e7b89d20/0x72e7b92bbc/P/-/-/4/RET/-
```

SPE brstack flags can be two characters long: `PN` or `MN`:
- `P` = predicted branch
- `M` = mispredicted branch
- `N` = optionally appears when the branch is NOT-TAKEN
    - flag is relevant only to  conditional branches


Example of usage with BOLT:

1. Capture SPE branch events:
```bash
perf record -e 'arm_spe_0/branch_filter=1/u' -- binary
```

2. Convert profile for BOLT:
```bash
perf2bolt -p perf.data -o perf.fdata --spe binary
```

3. Run BOLT Optimization:
```bash
llvm-bolt binary -o binary.bolted --data   perf.fdata ...
```

A unit test verifies the parsing of the 'SPE brstack format'.

---------

Co-authored-by: Paschalis Mpeis <paschalis.mpeis@arm.com>
---
 bolt/include/bolt/Profile/DataAggregator.h    |   2 +
 bolt/include/bolt/Utils/CommandLineOpts.h     |   1 +
 bolt/lib/Profile/DataAggregator.cpp           |  61 +++++--
 .../test/perf2bolt/AArch64/perf2bolt-spe.test |  12 ++
 bolt/test/perf2bolt/X86/perf2bolt-spe.test    |   9 +
 bolt/tools/driver/llvm-bolt.cpp               |   7 +
 bolt/unittests/Profile/CMakeLists.txt         |  14 ++
 bolt/unittests/Profile/PerfSpeEvents.cpp      | 164 ++++++++++++++++++
 8 files changed, 258 insertions(+), 12 deletions(-)
 create mode 100644 bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
 create mode 100644 bolt/test/perf2bolt/X86/perf2bolt-spe.test
 create mode 100644 bolt/unittests/Profile/PerfSpeEvents.cpp

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 96969cf53bac..662fe2a49afe 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -85,6 +85,8 @@ private:
   };
   friend raw_ostream &operator<<(raw_ostream &OS, const LBREntry &);
 
+  friend struct PerfSpeEventsTestHelper;
+
   struct PerfBranchSample {
     SmallVector<LBREntry, 32> LBR;
   };
diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h
index 4acce5a3e832..a75b6bf720ec 100644
--- a/bolt/include/bolt/Utils/CommandLineOpts.h
+++ b/bolt/include/bolt/Utils/CommandLineOpts.h
@@ -48,6 +48,7 @@ extern llvm::cl::OptionCategory BinaryAnalysisCategory;
 extern llvm::cl::opt<unsigned> AlignText;
 extern llvm::cl::opt<unsigned> AlignFunctions;
 extern llvm::cl::opt<bool> AggregateOnly;
+extern llvm::cl::opt<bool> ArmSPE;
 extern llvm::cl::opt<unsigned> BucketsPerLine;
 extern llvm::cl::opt<bool> CompactCodeModel;
 extern llvm::cl::opt<bool> DiffOnly;
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 178c9d3a6373..e03fa9bd5322 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -49,6 +49,9 @@ static cl::opt<bool>
                      cl::desc("aggregate basic samples (without LBR info)"),
                      cl::cat(AggregatorCategory));
 
+cl::opt<bool> ArmSPE("spe", cl::desc("Enable Arm SPE mode."),
+                     cl::cat(AggregatorCategory));
+
 static cl::opt<std::string>
     ITraceAggregation("itrace",
                       cl::desc("Generate LBR info with perf itrace argument"),
@@ -181,11 +184,21 @@ void DataAggregator::start() {
 
   findPerfExecutable();
 
+  if (opts::ArmSPE) {
+    // pid    from_ip      to_ip        flags
+    // where flags could be:
+    // P/M: whether branch was Predicted or Mispredicted.
+    // N: optionally appears when the branch was Not-Taken (ie fall-through)
+    // 12345  0x123/0x456/PN/-/-/8/RET/-
+    opts::ITraceAggregation = "bl";
+    opts::ParseMemProfile = true;
+    opts::BasicAggregation = false;
+  }
+
   if (opts::BasicAggregation) {
-    launchPerfProcess("events without LBR",
-                      MainEventsPPI,
+    launchPerfProcess("events without LBR", MainEventsPPI,
                       "script -F pid,event,ip",
-                      /*Wait = */false);
+                      /*Wait = */ false);
   } else if (!opts::ITraceAggregation.empty()) {
     // Disable parsing memory profile from trace data, unless requested by user.
     if (!opts::ParseMemProfile.getNumOccurrences())
@@ -994,9 +1007,22 @@ ErrorOr<DataAggregator::LBREntry> DataAggregator::parseLBREntry() {
   if (std::error_code EC = MispredStrRes.getError())
     return EC;
   StringRef MispredStr = MispredStrRes.get();
-  if (MispredStr.size() != 1 ||
-      (MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) {
-    reportError("expected single char for mispred bit");
+  // SPE brstack mispredicted flags might be up to two characters long:
+  // 'PN' or 'MN'. Where 'N' optionally appears.
+  bool ValidStrSize = opts::ArmSPE
+                          ? MispredStr.size() >= 1 && MispredStr.size() <= 2
+                          : MispredStr.size() == 1;
+  bool SpeTakenBitErr =
+      (opts::ArmSPE && MispredStr.size() == 2 && MispredStr[1] != 'N');
+  bool PredictionBitErr =
+      !ValidStrSize ||
+      (MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-');
+  if (SpeTakenBitErr)
+    reportError("expected 'N' as SPE prediction bit for a not-taken branch");
+  if (PredictionBitErr)
+    reportError("expected 'P', 'M' or '-' char as a prediction bit");
+
+  if (SpeTakenBitErr || PredictionBitErr) {
     Diag << "Found: " << MispredStr << "\n";
     return make_error_code(llvm::errc::io_error);
   }
@@ -1497,7 +1523,9 @@ void DataAggregator::printBranchStacksDiagnostics(
 }
 
 std::error_code DataAggregator::parseBranchEvents() {
-  outs() << "PERF2BOLT: parse branch events...\n";
+  std::string BranchEventTypeStr =
+      opts::ArmSPE ? "SPE branch events in LBR-format" : "branch events";
+  outs() << "PERF2BOLT: parse " << BranchEventTypeStr << "...\n";
   NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
                      TimerGroupDesc, opts::TimeAggregator);
 
@@ -1525,7 +1553,8 @@ std::error_code DataAggregator::parseBranchEvents() {
     }
 
     NumEntries += Sample.LBR.size();
-    if (BAT && Sample.LBR.size() == 32 && !NeedsSkylakeFix) {
+    if (this->BC->isX86() && BAT && Sample.LBR.size() == 32 &&
+        !NeedsSkylakeFix) {
       errs() << "PERF2BOLT-WARNING: using Intel Skylake bug workaround\n";
       NeedsSkylakeFix = true;
     }
@@ -1548,10 +1577,18 @@ std::error_code DataAggregator::parseBranchEvents() {
     if (NumSamples && NumSamplesNoLBR == NumSamples) {
       // Note: we don't know if perf2bolt is being used to parse memory samples
       // at this point. In this case, it is OK to parse zero LBRs.
-      errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
-                "LBR. Record profile with perf record -j any or run perf2bolt "
-                "in no-LBR mode with -nl (the performance improvement in -nl "
-                "mode may be limited)\n";
+      if (!opts::ArmSPE)
+        errs()
+            << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
+               "LBR. Record profile with perf record -j any or run perf2bolt "
+               "in no-LBR mode with -nl (the performance improvement in -nl "
+               "mode may be limited)\n";
+      else
+        errs()
+            << "PERF2BOLT-WARNING: All recorded samples for this binary lack "
+               "SPE brstack entries. Make sure you are running Linux perf 6.14 "
+               "or later, otherwise you get zero samples. Record the profile "
+               "with: perf record -e 'arm_spe_0/branch_filter=1/'.";
     } else {
       printBranchStacksDiagnostics(NumTotalSamples - NumSamples);
     }
diff --git a/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
new file mode 100644
index 000000000000..91f5c857fbab
--- /dev/null
+++ b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
@@ -0,0 +1,12 @@
+## Check that Arm SPE mode is available on AArch64.
+
+REQUIRES: system-linux,perf,target=aarch64{{.*}}
+
+RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
+
+RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe 2> /dev/null
+
+RUN: (perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2> /dev/null; exit 0) | FileCheck %s --check-prefix=CHECK-SPE-LBR
+
+CHECK-SPE-LBR: PERF2BOLT: parse SPE branch events in LBR-format
+
diff --git a/bolt/test/perf2bolt/X86/perf2bolt-spe.test b/bolt/test/perf2bolt/X86/perf2bolt-spe.test
new file mode 100644
index 000000000000..8eed2c859509
--- /dev/null
+++ b/bolt/test/perf2bolt/X86/perf2bolt-spe.test
@@ -0,0 +1,9 @@
+## Check that Arm SPE mode is unavailable on X86.
+
+REQUIRES: system-linux,x86_64-linux
+
+RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
+RUN: touch %t.empty.perf.data
+RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --spe --pa %t.exe 2>&1 | FileCheck %s
+
+CHECK: perf2bolt: -spe is available only on AArch64.
diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp
index b9836c2397b6..cf1b31f8c0c6 100644
--- a/bolt/tools/driver/llvm-bolt.cpp
+++ b/bolt/tools/driver/llvm-bolt.cpp
@@ -237,6 +237,13 @@ int main(int argc, char **argv) {
       if (Error E = RIOrErr.takeError())
         report_error(opts::InputFilename, std::move(E));
       RewriteInstance &RI = *RIOrErr.get();
+
+      if (opts::AggregateOnly && !RI.getBinaryContext().isAArch64() &&
+          opts::ArmSPE) {
+        errs() << ToolName << ": -spe is available only on AArch64.\n";
+        exit(1);
+      }
+
       if (!opts::PerfData.empty()) {
         if (!opts::AggregateOnly) {
           errs() << ToolName
diff --git a/bolt/unittests/Profile/CMakeLists.txt b/bolt/unittests/Profile/CMakeLists.txt
index e0aa0926b49c..ce01c6c4b949 100644
--- a/bolt/unittests/Profile/CMakeLists.txt
+++ b/bolt/unittests/Profile/CMakeLists.txt
@@ -1,11 +1,25 @@
+set(LLVM_LINK_COMPONENTS
+  DebugInfoDWARF
+  Object
+  ${LLVM_TARGETS_TO_BUILD}
+  )
+
 add_bolt_unittest(ProfileTests
   DataAggregator.cpp
+  PerfSpeEvents.cpp
 
   DISABLE_LLVM_LINK_LLVM_DYLIB
   )
 
 target_link_libraries(ProfileTests
   PRIVATE
+  LLVMBOLTCore
   LLVMBOLTProfile
+  LLVMTargetParser
+  LLVMTestingSupport
   )
 
+foreach (tgt ${BOLT_TARGETS_TO_BUILD})
+  string(TOUPPER "${tgt}" upper)
+  target_compile_definitions(ProfileTests PRIVATE "${upper}_AVAILABLE")
+endforeach()
diff --git a/bolt/unittests/Profile/PerfSpeEvents.cpp b/bolt/unittests/Profile/PerfSpeEvents.cpp
new file mode 100644
index 000000000000..3e3e05395246
--- /dev/null
+++ b/bolt/unittests/Profile/PerfSpeEvents.cpp
@@ -0,0 +1,164 @@
+//===- bolt/unittests/Profile/PerfSpeEvents.cpp ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef AARCH64_AVAILABLE
+
+#include "bolt/Core/BinaryContext.h"
+#include "bolt/Profile/DataAggregator.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::bolt;
+using namespace llvm::object;
+using namespace llvm::ELF;
+
+namespace opts {
+extern cl::opt<std::string> ReadPerfEvents;
+extern cl::opt<bool> ArmSPE;
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+/// Perform checks on perf SPE branch events.
+struct PerfSpeEventsTestHelper : public testing::Test {
+  void SetUp() override {
+    initalizeLLVM();
+    prepareElf();
+    initializeBOLT();
+  }
+
+protected:
+  using Trace = DataAggregator::Trace;
+  using TakenBranchInfo = DataAggregator::TakenBranchInfo;
+
+  void initalizeLLVM() {
+    llvm::InitializeAllTargetInfos();
+    llvm::InitializeAllTargetMCs();
+    llvm::InitializeAllAsmParsers();
+    llvm::InitializeAllDisassemblers();
+    llvm::InitializeAllTargets();
+    llvm::InitializeAllAsmPrinters();
+  }
+
+  void prepareElf() {
+    memcpy(ElfBuf, "\177ELF", 4);
+    ELF64LE::Ehdr *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(ElfBuf);
+    EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
+    EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
+    EHdr->e_machine = llvm::ELF::EM_AARCH64;
+    MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF");
+    ObjFile = cantFail(ObjectFile::createObjectFile(Source));
+  }
+
+  void initializeBOLT() {
+    Relocation::Arch = ObjFile->makeTriple().getArch();
+    BC = cantFail(BinaryContext::createBinaryContext(
+        ObjFile->makeTriple(), std::make_shared<orc::SymbolStringPool>(),
+        ObjFile->getFileName(), nullptr, /*IsPIC*/ false,
+        DWARFContext::create(*ObjFile.get()), {llvm::outs(), llvm::errs()}));
+    ASSERT_FALSE(!BC);
+  }
+
+  char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
+  std::unique_ptr<ObjectFile> ObjFile;
+  std::unique_ptr<BinaryContext> BC;
+
+  /// Helper function to export lists to show the mismatch.
+  void reportBrStackEventMismatch(
+      const std::vector<std::pair<Trace, TakenBranchInfo>> &Traces,
+      const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
+    llvm::errs() << "Traces items: \n";
+    for (const auto &[Trace, BI] : Traces)
+      llvm::errs() << "{" << Trace.Branch << ", " << Trace.From << ","
+                   << Trace.To << ", " << BI.TakenCount << ", "
+                   << BI.MispredCount << "}" << "\n";
+
+    llvm::errs() << "Expected items: \n";
+    for (const auto &[Trace, BI] : ExpectedSamples)
+      llvm::errs() << "{" << Trace.Branch << ", " << Trace.From << ", "
+                   << Trace.To << ", " << BI.TakenCount << ", "
+                   << BI.MispredCount << "}" << "\n";
+  }
+
+  /// Parse and check SPE brstack as LBR.
+  void parseAndCheckBrstackEvents(
+      uint64_t PID,
+      const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
+    DataAggregator DA("<pseudo input>");
+    DA.ParsingBuf = opts::ReadPerfEvents;
+    DA.BC = BC.get();
+    DataAggregator::MMapInfo MMap;
+    DA.BinaryMMapInfo.insert(std::make_pair(PID, MMap));
+
+    DA.parseBranchEvents();
+
+    EXPECT_EQ(DA.Traces.size(), ExpectedSamples.size());
+    if (DA.Traces.size() != ExpectedSamples.size())
+      reportBrStackEventMismatch(DA.Traces, ExpectedSamples);
+
+    const auto TracesBegin = DA.Traces.begin();
+    const auto TracesEnd = DA.Traces.end();
+    for (const auto &BI : ExpectedSamples) {
+      auto it = find_if(TracesBegin, TracesEnd,
+                        [&BI](const auto &Tr) { return Tr.first == BI.first; });
+
+      EXPECT_NE(it, TracesEnd);
+      EXPECT_EQ(it->second.MispredCount, BI.second.MispredCount);
+      EXPECT_EQ(it->second.TakenCount, BI.second.TakenCount);
+    }
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstack) {
+  // Check perf input with SPE branch events as brstack format.
+  // Example collection command:
+  // ```
+  // perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
+  // ```
+  // How Bolt extracts the branch events:
+  // ```
+  // perf script -F pid,brstack --itrace=bl
+  // ```
+
+  opts::ArmSPE = true;
+  opts::ReadPerfEvents = "  1234  0xa001/0xa002/PN/-/-/10/COND/-\n"
+                         "  1234  0xb001/0xb002/P/-/-/4/RET/-\n"
+                         "  1234  0xc456/0xc789/P/-/-/13/-/-\n"
+                         "  1234  0xd123/0xd456/M/-/-/7/RET/-\n"
+                         "  1234  0xe001/0xe002/P/-/-/14/RET/-\n"
+                         "  1234  0xd123/0xd456/M/-/-/7/RET/-\n"
+                         "  1234  0xf001/0xf002/MN/-/-/8/COND/-\n"
+                         "  1234  0xc456/0xc789/M/-/-/13/-/-\n";
+
+  // ExpectedSamples contains the aggregated information about
+  // a branch {{Branch From, To}, {TakenCount, MispredCount}}.
+  // Consider this example trace: {{0xd123, 0xd456, Trace::BR_ONLY},
+  // {2,2}}. This entry has a TakenCount = 2, as we have two samples for
+  // (0xd123, 0xd456) in our input. It also has MispredsCount = 2,
+  // as 'M' misprediction flag appears in both cases. BR_ONLY means
+  // the trace only contains branch data.
+  std::vector<std::pair<Trace, TakenBranchInfo>> ExpectedSamples = {
+      {{0xa001, 0xa002, Trace::BR_ONLY}, {1, 0}},
+      {{0xb001, 0xb002, Trace::BR_ONLY}, {1, 0}},
+      {{0xc456, 0xc789, Trace::BR_ONLY}, {2, 1}},
+      {{0xd123, 0xd456, Trace::BR_ONLY}, {2, 2}},
+      {{0xe001, 0xe002, Trace::BR_ONLY}, {1, 0}},
+      {{0xf001, 0xf002, Trace::BR_ONLY}, {1, 1}}};
+
+  parseAndCheckBrstackEvents(1234, ExpectedSamples);
+}
+
+#endif

From 238abf8ba8233ec0f2dab57a3bacbd192e78f8b6 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Fri, 20 Jun 2025 09:40:52 +0000
Subject: [PATCH 1020/1322] [gn build] Port f75973949b0e

---
 llvm/utils/gn/secondary/bolt/unittests/Profile/BUILD.gn | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/bolt/unittests/Profile/BUILD.gn b/llvm/utils/gn/secondary/bolt/unittests/Profile/BUILD.gn
index b3f7216af059..8ac57eea6a46 100644
--- a/llvm/utils/gn/secondary/bolt/unittests/Profile/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/unittests/Profile/BUILD.gn
@@ -3,5 +3,8 @@ import("//third-party/unittest/unittest.gni")
 unittest("ProfileTests") {
   configs += [ "//llvm/utils/gn/build:bolt_code" ]
   deps = [ "//bolt/lib/Profile" ]
-  sources = [ "DataAggregator.cpp" ]
+  sources = [
+    "DataAggregator.cpp",
+    "PerfSpeEvents.cpp",
+  ]
 }

From 95bd05d7cae327e431ccdaf0a452a0573ade5357 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Fri, 20 Jun 2025 12:04:04 +0200
Subject: [PATCH 1021/1322] [mlir][Func][NFC] Simplify implementation after
 #144706 (#145006)

---
 mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index c6bcb593eaad..538016927256 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -273,7 +273,7 @@ static void wrapExternalFunction(OpBuilder &builder, Location loc,
 static void restoreByValRefArgumentType(
     ConversionPatternRewriter &rewriter, const LLVMTypeConverter &typeConverter,
     ArrayRef<std::optional<NamedAttribute>> byValRefNonPtrAttrs,
-    ArrayRef<BlockArgument> oldBlockArgs, LLVM::LLVMFuncOp funcOp) {
+    LLVM::LLVMFuncOp funcOp) {
   // Nothing to do for function declarations.
   if (funcOp.isExternal())
     return;
@@ -281,8 +281,8 @@ static void restoreByValRefArgumentType(
   ConversionPatternRewriter::InsertionGuard guard(rewriter);
   rewriter.setInsertionPointToStart(&funcOp.getFunctionBody().front());
 
-  for (const auto &[arg, oldArg, byValRefAttr] :
-       llvm::zip(funcOp.getArguments(), oldBlockArgs, byValRefNonPtrAttrs)) {
+  for (const auto &[arg, byValRefAttr] :
+       llvm::zip(funcOp.getArguments(), byValRefNonPtrAttrs)) {
     // Skip argument if no `llvm.byval` or `llvm.byref` attribute.
     if (!byValRefAttr)
       continue;
@@ -309,10 +309,6 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp,
     return rewriter.notifyMatchFailure(
         funcOp, "Only support FunctionOpInterface with FunctionType");
 
-  // Keep track of the entry block arguments. They will be needed later.
-  SmallVector<BlockArgument> oldBlockArgs =
-      llvm::to_vector(funcOp.getArguments());
-
   // Convert the original function arguments. They are converted using the
   // LLVMTypeConverter provided to this legalization pattern.
   auto varargsAttr = funcOp->getAttrOfType<BoolAttr>(varargsAttrName);
@@ -455,7 +451,7 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp,
   // pointee type in the function body when converting `llvm.byval`/`llvm.byref`
   // function arguments.
   restoreByValRefArgumentType(rewriter, converter, byValRefNonPtrAttrs,
-                              oldBlockArgs, newFuncOp);
+                              newFuncOp);
 
   if (!shouldUseBarePtrCallConv(funcOp, &converter)) {
     if (funcOp->getAttrOfType<UnitAttr>(

From a5fa5bd2a890c588b99b07fc0f3fcef236888609 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 20 Jun 2025 11:07:20 +0100
Subject: [PATCH 1022/1322] [X86] ptest.ll - add test coverage for #144861 load
 chains

---
 llvm/test/CodeGen/X86/ptest.ll | 168 ++++++++++++++++++++++++++++++++-
 1 file changed, 164 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index d3da7524eaf1..6e43b897caef 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2   | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2   | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 define i32 @veccond128(<4 x i32> %input) {
 ; SSE2-LABEL: veccond128:
@@ -388,3 +388,163 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
   %t2 = select i1 %t1, i32 %a, i32 %b
   ret i32 %t2
 }
+
+define i1 @vecmp_load64x2(ptr %p0) {
+; CHECK-LABEL: vecmp_load64x2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    orq 8(%rdi), %rax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %p1 = getelementptr i8, ptr %p0, i64 8
+  %i0 = load i64, ptr %p0, align 1
+  %i1 = load i64, ptr %p1, align 1
+  %or = or i64 %i0, %i1
+  %ne = icmp ne i64 %or, 0
+  %zx = zext i1 %ne to i32
+  %eq = icmp eq i32 %zx, 0
+  ret i1 %eq
+}
+
+define i1 @vecmp_load64x4(ptr %p0) {
+; CHECK-LABEL: vecmp_load64x4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rcx
+; CHECK-NEXT:    orq 16(%rdi), %rax
+; CHECK-NEXT:    orq 24(%rdi), %rcx
+; CHECK-NEXT:    orq %rax, %rcx
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %p1 = getelementptr i8, ptr %p0, i64 8
+  %p2 = getelementptr i8, ptr %p0, i64 16
+  %p3 = getelementptr i8, ptr %p0, i64 24
+  %i0 = load i64, ptr %p0, align 1
+  %i1 = load i64, ptr %p1, align 1
+  %i2 = load i64, ptr %p2, align 1
+  %i3 = load i64, ptr %p3, align 1
+  %or02 = or i64 %i0, %i2
+  %or13 = or i64 %i1, %i3
+  %or = or i64 %or02, %or13
+  %ne = icmp ne i64 %or, 0
+  %zx = zext i1 %ne to i32
+  %eq = icmp eq i32 %zx, 0
+  ret i1 %eq
+}
+
+define i1 @vecmp_load128x2(ptr %p0) {
+; CHECK-LABEL: vecmp_load128x2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rcx
+; CHECK-NEXT:    orq 24(%rdi), %rcx
+; CHECK-NEXT:    orq 16(%rdi), %rax
+; CHECK-NEXT:    orq %rcx, %rax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %p1 = getelementptr i8, ptr %p0, i64 16
+  %i0 = load i128, ptr %p0, align 1
+  %i1 = load i128, ptr %p1, align 1
+  %or = or i128 %i0, %i1
+  %ne = icmp ne i128 %or, 0
+  %zx = zext i1 %ne to i32
+  %eq = icmp eq i32 %zx, 0
+  ret i1 %eq
+}
+
+define i1 @vecmp_load128x4(ptr %p0) {
+; CHECK-LABEL: vecmp_load128x4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rcx
+; CHECK-NEXT:    movq 24(%rdi), %rdx
+; CHECK-NEXT:    movq 16(%rdi), %rsi
+; CHECK-NEXT:    orq 32(%rdi), %rax
+; CHECK-NEXT:    orq 40(%rdi), %rcx
+; CHECK-NEXT:    orq 48(%rdi), %rsi
+; CHECK-NEXT:    orq %rax, %rsi
+; CHECK-NEXT:    orq 56(%rdi), %rdx
+; CHECK-NEXT:    orq %rcx, %rdx
+; CHECK-NEXT:    orq %rsi, %rdx
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %p1 = getelementptr i8, ptr %p0, i64 16
+  %p2 = getelementptr i8, ptr %p0, i64 32
+  %p3 = getelementptr i8, ptr %p0, i64 48
+  %i0 = load i128, ptr %p0, align 1
+  %i1 = load i128, ptr %p1, align 1
+  %i2 = load i128, ptr %p2, align 1
+  %i3 = load i128, ptr %p3, align 1
+  %or02 = or i128 %i0, %i2
+  %or13 = or i128 %i1, %i3
+  %or = or i128 %or02, %or13
+  %ne = icmp ne i128 %or, 0
+  %zx = zext i1 %ne to i32
+  %eq = icmp eq i32 %zx, 0
+  ret i1 %eq
+}
+
+; PR144861
+define i1 @vecmp_load256x2(ptr %p0) {
+; CHECK-LABEL: vecmp_load256x2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq 24(%rdi), %rax
+; CHECK-NEXT:    movq (%rdi), %rcx
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    movq 16(%rdi), %rsi
+; CHECK-NEXT:    orq 48(%rdi), %rsi
+; CHECK-NEXT:    orq 32(%rdi), %rcx
+; CHECK-NEXT:    orq %rsi, %rcx
+; CHECK-NEXT:    orq 56(%rdi), %rax
+; CHECK-NEXT:    orq 40(%rdi), %rdx
+; CHECK-NEXT:    orq %rax, %rdx
+; CHECK-NEXT:    orq %rcx, %rdx
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %p1 = getelementptr i8, ptr %p0, i64 32
+  %i0 = load i256, ptr %p0, align 1
+  %i1 = load i256, ptr %p1, align 1
+  %or = or i256 %i0, %i1
+  %ne = icmp ne i256 %or, 0
+  %zx = zext i1 %ne to i32
+  %eq = icmp eq i32 %zx, 0
+  ret i1 %eq
+}
+
+define i1 @vecmp_load512x2(ptr %p0) {
+; CHECK-LABEL: vecmp_load512x2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq 24(%rdi), %rax
+; CHECK-NEXT:    movq 56(%rdi), %rdx
+; CHECK-NEXT:    movq 40(%rdi), %rsi
+; CHECK-NEXT:    movq 16(%rdi), %rcx
+; CHECK-NEXT:    movq 48(%rdi), %r8
+; CHECK-NEXT:    movq (%rdi), %r9
+; CHECK-NEXT:    movq 8(%rdi), %r10
+; CHECK-NEXT:    movq 32(%rdi), %r11
+; CHECK-NEXT:    orq 96(%rdi), %r11
+; CHECK-NEXT:    orq 64(%rdi), %r9
+; CHECK-NEXT:    orq %r11, %r9
+; CHECK-NEXT:    orq 112(%rdi), %r8
+; CHECK-NEXT:    orq 80(%rdi), %rcx
+; CHECK-NEXT:    orq %r8, %rcx
+; CHECK-NEXT:    orq %r9, %rcx
+; CHECK-NEXT:    orq 104(%rdi), %rsi
+; CHECK-NEXT:    orq 72(%rdi), %r10
+; CHECK-NEXT:    orq %rsi, %r10
+; CHECK-NEXT:    orq 120(%rdi), %rdx
+; CHECK-NEXT:    orq 88(%rdi), %rax
+; CHECK-NEXT:    orq %rdx, %rax
+; CHECK-NEXT:    orq %r10, %rax
+; CHECK-NEXT:    orq %rcx, %rax
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %p1 = getelementptr i8, ptr %p0, i64 64
+  %i0 = load i512, ptr %p0, align 1
+  %i1 = load i512, ptr %p1, align 1
+  %or = or i512 %i0, %i1
+  %ne = icmp ne i512 %or, 0
+  %zx = zext i1 %ne to i32
+  %eq = icmp eq i32 %zx, 0
+  ret i1 %eq
+}

From 7085065c02da6091dca91be201160912e43a63ec Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Fri, 20 Jun 2025 03:17:08 -0700
Subject: [PATCH 1023/1322] [BOLT] Support pre-aggregated returns (#143296)

Intel's Architectural LBR supports capturing branch type information
as part of LBR stack (SDM Vol 3B, part 2, October 2024):
```
20.1.3.2 Branch Types
The IA32_LBR_x_INFO.BR_TYPE and IA32_LER_INFO.BR_TYPE fields encode
the branch types as shown in Table 20-3.

Table 20-3. IA32_LBR_x_INFO and IA32_LER_INFO Branch Type Encodings

Encoding | Branch Type
  0000B  | COND
  0001B  | NEAR_IND_JMP
  0010B  | NEAR_REL_JMP
  0011B  | NEAR_IND_CALL
  0100B  | NEAR_REL_CALL
  0101B  | NEAR_RET
  011xB  | Reserved
  1xxxB  | OTHER_BRANCH

For a list of branch operations that fall into the categories above,
see Table 20-2.

Table 20-2. Branch Type Filtering Details
Branch Type   | Operations Recorded
COND          | Jcc, J*CXZ, and LOOP*
NEAR_IND_JMP  | JMP r/m*
NEAR_REL_JMP  | JMP rel*
NEAR_IND_CALL | CALL r/m*
NEAR_REL_CALL | CALL rel* (excluding CALLs to the next sequential IP)
NEAR_RET      | RET (0C3H)
OTHER_BRANCH  | JMP/CALL ptr*, JMP/CALL m*, RET (0C8H), SYS*,
interrupts, exceptions (other than debug exceptions), IRET, INT3,
INTn, INTO, TSX Abort, EENTER, ERESUME, EEXIT, AEX, INIT, SIPI, RSM
```

Linux kernel can preserve branch type when `save_type` is enabled,
even if CPU does not support Architectural LBR:

https://github.com/torvalds/linux/blob/f09079bd04a924c72d555cd97942d5f8d7eca98c/tools/perf/Documentation/perf-record.txt#L457-L460

> - save_type: save branch type during sampling in case binary is not
available later.
For the platforms with Intel Arch LBR support (12th-Gen+ client or
4th-Gen Xeon+ server), the save branch type is unconditionally enabled
when the taken branch stack sampling is enabled.

Kernel-reported branch type values:

https://github.com/torvalds/linux/blob/8c6bc74c7f8910ed4c969ccec52e98716f98700a/include/uapi/linux/perf_event.h#L251-L269

This information is needed to disambiguate external returns (from
DSO/JIT) to an entry point or a landing pad, when BOLT can't
disassemble the branch source.

This patch adds new pre-aggregated types:
- return trace (R),
- external return fall-through (r).

For such types, the checks for fall-through start (not an entry or
a landing pad) are relaxed.

Depends on #143295.

Test Plan: updated callcont-fallthru.s
---
 bolt/include/bolt/Profile/DataAggregator.h | 23 ++++++---
 bolt/lib/Profile/DataAggregator.cpp        | 59 +++++++++++++---------
 bolt/test/X86/callcont-fallthru.s          | 23 +++++++++
 bolt/test/link_fdata.py                    |  4 +-
 4 files changed, 77 insertions(+), 32 deletions(-)

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 662fe2a49afe..cc28a06c151e 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -101,16 +101,17 @@ private:
     uint64_t Addr;
   };
 
-  /// Container for the unit of branch data.
-  /// Backwards compatible with legacy use for branches and fall-throughs:
-  /// - if \p Branch is FT_ONLY or FT_EXTERNAL_ORIGIN, the trace only
-  ///   contains fall-through data,
-  /// - if \p To is BR_ONLY, the trace only contains branch data.
+  /// Container for the unit of branch data, matching pre-aggregated trace type.
+  /// Backwards compatible with branch and fall-through types:
+  /// - if \p To is < 0, the trace only contains branch data (BR_ONLY),
+  /// - if \p Branch is < 0, the trace only contains fall-through data
+  ///   (FT_ONLY, FT_EXTERNAL_ORIGIN, or FT_EXTERNAL_RETURN).
   struct Trace {
     static constexpr const uint64_t EXTERNAL = 0ULL;
     static constexpr const uint64_t BR_ONLY = -1ULL;
     static constexpr const uint64_t FT_ONLY = -1ULL;
     static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL;
+    static constexpr const uint64_t FT_EXTERNAL_RETURN = -3ULL;
 
     uint64_t Branch;
     uint64_t From;
@@ -390,9 +391,9 @@ private:
   /// File format syntax:
   /// E <event>
   /// S <start> <count>
-  /// T <start> <end> <ft_end> <count>
+  /// [TR] <start> <end> <ft_end> <count>
   /// B <start> <end> <count> <mispred_count>
-  /// [Ff] <start> <end> <count>
+  /// [Ffr] <start> <end> <count>
   ///
   /// where <start>, <end>, <ft_end> have the format [<id>:]<offset>
   ///
@@ -403,8 +404,11 @@ private:
   /// f - an aggregated fall-through with external origin - used to disambiguate
   ///       between a return hitting a basic block head and a regular internal
   ///       jump to the block
+  /// r - an aggregated fall-through originating at an external return, no
+  ///       checks are performed for a fallthrough start
   /// T - an aggregated trace: branch from <start> to <end> with a fall-through
   ///       to <ft_end>
+  /// R - an aggregated trace originating at a return
   ///
   /// <id> - build id of the object containing the address. We can skip it for
   /// the main binary and use "X" for an unknown object. This will save some
@@ -532,7 +536,12 @@ inline raw_ostream &operator<<(raw_ostream &OS,
                                const DataAggregator::Trace &T) {
   switch (T.Branch) {
   case DataAggregator::Trace::FT_ONLY:
+    break;
   case DataAggregator::Trace::FT_EXTERNAL_ORIGIN:
+    OS << "X:0 -> ";
+    break;
+  case DataAggregator::Trace::FT_EXTERNAL_RETURN:
+    OS << "X:R -> ";
     break;
   default:
     OS << Twine::utohexstr(T.Branch) << " -> ";
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index e03fa9bd5322..c067b2f5b73b 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -537,8 +537,7 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
 
 heatmap:
   // Sort parsed traces for faster processing.
-  if (!opts::BasicAggregation)
-    llvm::sort(Traces, llvm::less_first());
+  llvm::sort(Traces, llvm::less_first());
 
   if (!opts::HeatmapMode)
     return Error::success();
@@ -883,13 +882,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
 
   // Adjust FromBB if the first LBR is a return from the last instruction in
   // the previous block (that instruction should be a call).
-  if (IsReturn) {
-    if (From)
-      FromBB = BF.getBasicBlockContainingOffset(From - 1);
-    else
-      LLVM_DEBUG(dbgs() << "return to the function start: " << Trace << '\n');
-  } else if (Trace.Branch == Trace::EXTERNAL && From == FromBB->getOffset() &&
-             !FromBB->isEntryPoint() && !FromBB->isLandingPad()) {
+  if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) &&
+      From == FromBB->getOffset() &&
+      (IsReturn ? From : !(FromBB->isEntryPoint() || FromBB->isLandingPad()))) {
     const BinaryBasicBlock *PrevBB =
         BF.getLayout().getBlock(FromBB->getIndex() - 1);
     if (PrevBB->getSuccessor(FromBB->getLabel())) {
@@ -1228,12 +1223,14 @@ ErrorOr<Location> DataAggregator::parseLocationOrOffset() {
 std::error_code DataAggregator::parseAggregatedLBREntry() {
   enum AggregatedLBREntry : char {
     INVALID = 0,
-    EVENT_NAME,        // E
-    TRACE,             // T
-    SAMPLE,            // S
-    BRANCH,            // B
-    FT,                // F
-    FT_EXTERNAL_ORIGIN // f
+    EVENT_NAME,         // E
+    TRACE,              // T
+    RETURN,             // R
+    SAMPLE,             // S
+    BRANCH,             // B
+    FT,                 // F
+    FT_EXTERNAL_ORIGIN, // f
+    FT_EXTERNAL_RETURN  // r
   } Type = INVALID;
 
   /// The number of fields to parse, set based on \p Type.
@@ -1261,20 +1258,22 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
 
     Type = StringSwitch<AggregatedLBREntry>(Str)
                .Case("T", TRACE)
+               .Case("R", RETURN)
                .Case("S", SAMPLE)
                .Case("E", EVENT_NAME)
                .Case("B", BRANCH)
                .Case("F", FT)
                .Case("f", FT_EXTERNAL_ORIGIN)
+               .Case("r", FT_EXTERNAL_RETURN)
                .Default(INVALID);
 
     if (Type == INVALID) {
-      reportError("expected T, S, E, B, F or f");
+      reportError("expected T, R, S, E, B, F, f or r");
       return make_error_code(llvm::errc::io_error);
     }
 
     using SSI = StringSwitch<int>;
-    AddrNum = SSI(Str).Case("T", 3).Case("S", 1).Case("E", 0).Default(2);
+    AddrNum = SSI(Str).Cases("T", "R", 3).Case("S", 1).Case("E", 0).Default(2);
     CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1);
   }
 
@@ -1331,17 +1330,30 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
   if (ToFunc)
     ToFunc->setHasProfileAvailable();
 
-  /// For legacy fall-through types, adjust locations to match Trace container.
-  if (Type == FT || Type == FT_EXTERNAL_ORIGIN) {
+  /// For fall-through types, adjust locations to match Trace container.
+  if (Type == FT || Type == FT_EXTERNAL_ORIGIN || Type == FT_EXTERNAL_RETURN) {
     Addr[2] = Location(Addr[1]->Offset); // Trace To
     Addr[1] = Location(Addr[0]->Offset); // Trace From
-    // Put a magic value into Trace Branch to differentiate from a full trace.
-    Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN);
+    // Put a magic value into Trace Branch to differentiate from a full trace:
+    if (Type == FT)
+      Addr[0] = Location(Trace::FT_ONLY);
+    else if (Type == FT_EXTERNAL_ORIGIN)
+      Addr[0] = Location(Trace::FT_EXTERNAL_ORIGIN);
+    else if (Type == FT_EXTERNAL_RETURN)
+      Addr[0] = Location(Trace::FT_EXTERNAL_RETURN);
+    else
+      llvm_unreachable("Unexpected fall-through type");
   }
 
-  /// For legacy branch type, mark Trace To to differentite from a full trace.
-  if (Type == BRANCH) {
+  /// For branch type, mark Trace To to differentiate from a full trace.
+  if (Type == BRANCH)
     Addr[2] = Location(Trace::BR_ONLY);
+
+  if (Type == RETURN) {
+    if (!Addr[0]->Offset)
+      Addr[0]->Offset = Trace::FT_EXTERNAL_RETURN;
+    else
+      Returns.emplace(Addr[0]->Offset);
   }
 
   /// Record a trace.
@@ -1602,6 +1614,7 @@ void DataAggregator::processBranchEvents() {
   NamedRegionTimer T("processBranch", "Processing branch events",
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
 
+  Returns.emplace(Trace::FT_EXTERNAL_RETURN);
   for (const auto &[Trace, Info] : Traces) {
     bool IsReturn = checkReturn(Trace.Branch);
     // Ignore returns.
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index c2ef024db947..8c05491e7bca 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -10,6 +10,10 @@
 # RUN: link_fdata %s %t %t.pa-ret PREAGG-RET
 # Trace from an external location to a landing pad/entry point call continuation
 # RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT
+# Return trace to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-pret PREAGG-PRET
+# External return to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-eret PREAGG-ERET
 # RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT
 
 # RUN: llvm-strip --strip-unneeded %t -o %t.strip
@@ -38,6 +42,21 @@
 # RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \
 # RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
 
+## Check pre-aggregated return traces from external location attach call
+## continuation fallthrough count to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pa-pret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+## Check pre-aggregated return traces from external location attach call
+## continuation fallthrough count to landing pad (stripped, landing pad)
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-pret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+
+## Same for external return type
+# RUN: llvm-bolt %t --pa -p %t.pa-eret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-eret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+
 ## Check pre-aggregated traces don't report zero-sized PLT fall-through as
 ## invalid trace
 # RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \
@@ -92,6 +111,10 @@ Ltmp4_br:
 # PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
 ## Target is a secondary entry point (unstripped) or a landing pad (stripped)
 # PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1
+## Pre-aggregated return trace
+# PREAGG-PRET: R X:0 #Ltmp3# #Ltmp3_br# 1
+## External return
+# PREAGG-ERET: r #Ltmp3# #Ltmp3_br# 1
 
 # CHECK-ATTACH:      callq foo
 # CHECK-ATTACH-NEXT: count: 1
diff --git a/bolt/test/link_fdata.py b/bolt/test/link_fdata.py
index 5a9752068bb9..898dce8e3fb5 100755
--- a/bolt/test/link_fdata.py
+++ b/bolt/test/link_fdata.py
@@ -36,9 +36,9 @@ prefix_pat = re.compile(f"^# {args.prefix}: (.*)")
 fdata_pat = re.compile(r"([01].*) (?P<mispred>\d+) (?P<exec>\d+)")
 
 # Pre-aggregated profile:
-# {T|S|E|B|F|f} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
+# {T|R|S|E|B|F|f|r} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
 # <loc>: [<id>:]<offset>
-preagg_pat = re.compile(r"(?P<type>[TSBFf]) (?P<offsets_count>.*)")
+preagg_pat = re.compile(r"(?P<type>[TRSBFfr]) (?P<offsets_count>.*)")
 
 # No-LBR profile:
 # <is symbol?> <closest elf symbol or DSO name> <relative address> <count>

From 00c18d04ab6341022867d3b6674ec3ab30e5de2c Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <Nico.Vasilache@amd.com>
Date: Fri, 20 Jun 2025 12:31:46 +0200
Subject: [PATCH 1024/1322] =?UTF-8?q?[mlir][Transforms]=20Add=20a=20PadTil?=
 =?UTF-8?q?ingInterface=20transformation=20and=20hook=20i=E2=80=A6=20(#144?=
 =?UTF-8?q?991)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…t up to the transform dialect

This revision revisits the padding transformation from first principles
and prepares it to work more generally with TilingInterface.

Compared to structured.transform.pad it has the following differences:
- no support for nofold, copy-back, transpose and hoisting: these have
been carried by the padding op in the very early days of StructuredOps
and have since then been separated out as independent transformations
that compose.
- no conflated static bounding box derivation attempts:
pad_tiling_interface composes more naturally with or without tiling.
- properly derives padding size on outputs where multiple dimensions
contribute: this is not supported in structured.transform.pad
- geared towards supporting TilingInterface once the proper control
mechanisms are supported through a PadSizeComputationFunction (supports
LinalgOp by default)

This will gradually replace structured.transform.pad as it is fleshed
out and tested more comprehensively.

In the future this should be moved out of a specific Linalg
implementation file and into a more general "Structured" file.
---
 .../Linalg/TransformOps/LinalgTransformOps.td |  79 +++++
 .../Dialect/Linalg/Transforms/Transforms.h    |  79 ++++-
 .../TransformOps/LinalgTransformOps.cpp       | 161 +++++++++
 .../Dialect/Linalg/Transforms/CMakeLists.txt  |   1 +
 .../Linalg/Transforms/PadTilingInterface.cpp  | 322 ++++++++++++++++++
 ...m-op-pad-tiling-interface-multiple-of.mlir | 138 ++++++++
 .../transform-op-pad-tiling-interface.mlir    | 129 +++++++
 7 files changed, 906 insertions(+), 3 deletions(-)
 create mode 100644 mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
 create mode 100644 mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
 create mode 100644 mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 6f6df350f1ba..cf3f2b70580d 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1186,6 +1186,85 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// PadTilingInterfaceOp
+//===----------------------------------------------------------------------===//
+
+def PadTilingInterfaceOp : Op<Transform_Dialect, "structured.pad_tiling_interface",
+    [FunctionalStyleTransformOpTrait, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+     TransformOpInterface,
+     ReportTrackingListenerFailuresOpTrait]> {
+  let description = [{
+    Pads the operations pointed to by the target handle using the options
+    provided as operation attributes. The operation returns a handle to the
+    padded operation and to the padding operation ("tensor.pad").
+
+    TODO: in the future this should be moved out of a specific Linalg
+    implementation file and into a more general "Structured" file.
+
+    #### Return modes
+
+    This operation ignores non-Linalg ops and drops them in the return.
+    In the future, this operation will support all TilingInterfaceOps.
+
+    This operation may produce a definite failure if the padding fails for any
+    reason.
+
+    If all the operations referred to by the `target` handle pad properly, the
+    transform succeeds. Otherwise the transform produces a silenceable failure.
+    The return handle points to only the subset of successfully produced
+    padded operations, which can be empty.
+  }];
+
+  let arguments =
+    (ins TransformHandleTypeInterface:$target,
+         DefaultValuedAttr<ArrayAttr, "{}">:$padding_values,
+         DefaultValuedAttr<I64ArrayAttr, "{}">:$padding_dimensions,
+         Variadic<TransformAnyParamTypeOrAnyHandle>:$padding_sizes,
+         DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:
+            $static_padding_sizes,
+         DefaultValuedAttr<UnitAttr, "false">:$pad_to_multiple_of);
+  let results = (outs TransformHandleTypeInterface:$padded,
+                      TransformHandleTypeInterface:$pad);
+
+  let assemblyFormat = [{
+    $target
+    `to`
+    (`padding_sizes` custom<DynamicIndexList>($padding_sizes, $static_padding_sizes)^)?
+    (`pad_to_multiple_of` $pad_to_multiple_of^)?
+    attr-dict
+    `:` functional-type(operands, results)
+  }];
+
+  let hasVerifier = 1;
+
+  let builders = [
+    // Builder for a transform::PadOp with automatic inference of padding
+    // value. Warning: this will set the value 0 for the inferred elemental
+    // type without taking the op into account and thus only work for the
+    // add/mul ring at the moment.
+    // TODO: support other operations (e.g. min, max etc).
+    OpBuilder<(ins "Value":$target,
+                   "ArrayRef<int64_t>":$paddingDimensions,
+                   CArg<"ArrayRef<int64_t>", "{}">:$staticPaddingSizes,
+                   CArg<"bool", "false">:$padToMultipleOf)>,
+    OpBuilder<(ins "Value":$target,
+                   "ArrayRef<int64_t>":$paddingDimensions,
+                   "ArrayRef<OpFoldResult>":$mixedPadPaddingSizes,
+                   CArg<"bool", "false">:$usePrescribedTensorShapes)>
+  ];
+
+  let extraClassDeclaration = [{
+    /// Returns a mix of dynamic `padding_sizes` and static `static_padding_sizes`.
+    SmallVector<OpFoldResult> getMixedPaddingSizes();
+
+    ::mlir::DiagnosedSilenceableFailure apply(
+      ::mlir::transform::TransformRewriter &rewriter,
+      ::mlir::transform::TransformResults &results,
+      ::mlir::transform::TransformState &state);
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // HoistPadOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 147a2907f52e..59b7fdeef10b 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -20,6 +20,7 @@
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Dialect/X86Vector/Transforms.h"
+#include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/TilingInterface.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -347,6 +348,34 @@ struct LinalgPaddingOptions {
   }
 };
 
+struct PadTilingInterfaceOptions {
+  /// A padding value for every operand.
+  SmallVector<Attribute> paddingValues;
+  PadTilingInterfaceOptions &setPaddingValues(ArrayRef<Attribute> pv) {
+    paddingValues.assign(pv.begin(), pv.end());
+    return *this;
+  }
+  /// A list of iterator dimensions to pad.
+  SmallVector<int64_t> paddingDimensions;
+  PadTilingInterfaceOptions &setPaddingDimensions(ArrayRef<int64_t> pd) {
+    paddingDimensions.assign(pd.begin(), pd.end());
+    return *this;
+  }
+  /// A list of iterator dimensions sizes to pad to.
+  SmallVector<OpFoldResult> paddingSizes;
+  PadTilingInterfaceOptions &setPaddingSizes(ArrayRef<OpFoldResult> m) {
+    paddingSizes.assign(m.begin(), m.end());
+    return *this;
+  }
+  /// Pad iterator `paddingDimension[i]` to next multiple of `paddingSizes[i]`
+  /// if true. Otherwise pad to `paddingSizes[i]`.
+  bool padToMultipleOf;
+  PadTilingInterfaceOptions &setPadToMultipleOf(bool b) {
+    padToMultipleOf = b;
+    return *this;
+  }
+};
+
 /// Callback function type used to perform the allocation for the promoted
 /// `subView`. In `boundingSubViewsize` a best attempt is made to find the
 /// smallest constant value for the size of the buffer needed for each
@@ -542,9 +571,9 @@ SmallVector<Value> peelLoop(RewriterBase &rewriter, Operation *op);
 /// where relevant.
 void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops);
 
-/// Pad the iterator dimensions `paddingDimensions` of all `opToPad` operands
-/// to a static bounding box. The original `opToPad` is cloned and operates on
-/// the padded tensors.
+/// Pad the iterator dimensions `options.paddingDimensions` of all `opToPad`
+/// operands to a static bounding box. The original `opToPad` is cloned and
+/// operates on the padded tensors.
 ///
 /// * "options.padToMultipleOf" indicates that each padding dimension should be
 ///   padded to the specified multiple.
@@ -561,6 +590,50 @@ LogicalResult rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad,
                                 SmallVector<Value> &replacements,
                                 SmallVector<tensor::PadOp> &padOps);
 
+/// Helper function to compute the padded shape of the given value `v` of
+/// `RankedTensorType` given:
+///   - the `indexingSizes` as a list of OpFoldResult.
+///   - an `indexingMap` that encodes how the padded shape varies with
+///     increases in `indexingSizes`.
+/// The implementation iteratively combines increases from contributing using
+/// affine.apply operations.
+/// The `indexingMap` + `indexingSizes` encoding suits StructuredOps and
+/// provides a gentle portability path for Linalg-like ops with affine maps.
+/// In the future, more general interfaces can be devised to encode similar
+/// shape evolutions and map between an op and its operands.
+SmallVector<OpFoldResult>
+computePaddedShape(RewriterBase &rewriter, TypedValue<RankedTensorType> v,
+                   AffineMap indexingMap, ArrayRef<OpFoldResult> indexingSizes,
+                   const PadTilingInterfaceOptions &options);
+
+using PadSizeComputationFunction =
+    std::function<FailureOr<SmallVector<OpFoldResult>>(
+        RewriterBase &, OpOperand &, ArrayRef<Range>,
+        const PadTilingInterfaceOptions &)>;
+
+/// Specific helper for Linalg ops.
+FailureOr<SmallVector<OpFoldResult>>
+computeLinalgPaddedShape(RewriterBase &rewriter, OpOperand &operandToPad,
+                         ArrayRef<Range> iterationDomain,
+                         const PadTilingInterfaceOptions &options);
+
+/// Pad the iterator dimensions `options.paddingDimensions` of `opToPad`.
+///
+/// * "options.paddingSizes" indicates that each padding dimension should be
+///   padded to the specified padding size.
+/// * "options.padToMultipleOf" indicates that the paddingSizes should be
+//    interpreted as the bounding box (dynamic) value to pad to.
+/// * Use "options.paddingValues" to set the padding value of the created
+//    tensor::PadOp.
+/// * The tensor::PadOp is returned on success.
+
+FailureOr<TilingInterface>
+rewriteAsPaddedOp(RewriterBase &rewriter, TilingInterface opToPad,
+                  const PadTilingInterfaceOptions &constOptions,
+                  SmallVector<tensor::PadOp> &padOps,
+                  PadSizeComputationFunction computePaddingSizeFun =
+                      &computeLinalgPaddedShape);
+
 namespace detail {
 
 /// Helper struct to hold the results of building a packing loop nest.
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index d78c8847f884..e627fc83f2ba 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -45,6 +45,7 @@
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/LogicalResult.h"
 #include <type_traits>
 
 using namespace mlir;
@@ -2155,6 +2156,166 @@ LogicalResult transform::PadOp::verify() {
   return success();
 }
 
+//===---------------------------------------------------------------------===//
+// PadTilingInterfaceOp
+//===---------------------------------------------------------------------===//
+
+void transform::PadTilingInterfaceOp::build(OpBuilder &b,
+                                            OperationState &result,
+                                            Value target,
+                                            ArrayRef<int64_t> paddingDimensions,
+                                            ArrayRef<int64_t> paddingSizes,
+                                            bool padToMultipleOf) {
+  auto resultType = transform::AnyOpType::get(b.getContext());
+  return build(/*builder=*/b,
+               /*result=*/result,
+               /*types=*/TypeRange{resultType, resultType},
+               /*target=*/target,
+               /*paddingValues=*/ArrayAttr(), // let inference handle this
+               /*paddingDimensions=*/b.getI64ArrayAttr(paddingDimensions),
+               /*paddingSizes=*/ValueRange{},
+               /*paddingSizes=*/
+               (paddingSizes.empty() ? DenseI64ArrayAttr()
+                                     : b.getDenseI64ArrayAttr(paddingSizes)),
+               /*padToMultipleOf=*/
+               padToMultipleOf ? b.getUnitAttr() : nullptr);
+}
+
+void transform::PadTilingInterfaceOp::build(
+    OpBuilder &b, OperationState &result, Value target,
+    ArrayRef<int64_t> paddingDimensions,
+    ArrayRef<OpFoldResult> mixedPaddingSizes, bool padToMultipleOf) {
+  auto resultType = transform::AnyOpType::get(b.getContext());
+  SmallVector<int64_t> staticPaddingSizes;
+  SmallVector<Value> dynamicPaddingSizes;
+  dispatchIndexOpFoldResults(mixedPaddingSizes, dynamicPaddingSizes,
+                             staticPaddingSizes);
+  return build(/*builder=*/b,
+               /*result=*/result,
+               /*types=*/TypeRange{resultType, resultType},
+               /*target=*/target,
+               /*paddingValues=*/ArrayAttr(), // let inference handle this
+               /*paddingDimensions=*/b.getI64ArrayAttr(paddingDimensions),
+               /*paddingSizes=*/dynamicPaddingSizes,
+               /*paddingSizes=*/staticPaddingSizes,
+               /*usePrescribedTensorShapes=*/padToMultipleOf);
+}
+
+void transform::PadTilingInterfaceOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  consumesHandle(getTargetMutable(), effects);
+  onlyReadsHandle(getPaddingSizesMutable(), effects);
+  producesHandle(getOperation()->getOpResults(), effects);
+  modifiesPayload(effects);
+}
+
+SmallVector<OpFoldResult>
+transform::PadTilingInterfaceOp::getMixedPaddingSizes() {
+  Builder b(getContext());
+  return getMixedValues(getStaticPaddingSizes(), getPaddingSizes(), b);
+}
+
+DiagnosedSilenceableFailure
+transform::PadTilingInterfaceOp::apply(transform::TransformRewriter &rewriter,
+                                       transform::TransformResults &results,
+                                       transform::TransformState &state) {
+  SmallVector<Operation *> paddedOps, padOps;
+
+  for (Operation *target : state.getPayloadOps(getTarget())) {
+    auto targetOp = dyn_cast<TilingInterface>(target);
+    if (!targetOp) {
+      auto diag = emitSilenceableError() << "expected TilingInterface target";
+      diag.attachNote(target->getLoc()) << "target op";
+      return diag;
+    }
+
+    // Only Linalg ops for now, until TilingInterface exposes a loopsToOperand
+    // map / C++ APIs to compute the effect of padding on operands.
+    if (!isa<LinalgOp>(targetOp.getOperation())) {
+      auto diag = emitSilenceableError() << "only LinalgOp supported atm";
+      diag.attachNote(target->getLoc()) << "target op";
+      return diag;
+    }
+
+    // Convert the padding values to attributes.
+    SmallVector<Attribute> paddingValues;
+    for (auto const &[untypedAttr, elementOrTensorType] :
+         llvm::zip(getPaddingValues(), targetOp->getOperandTypes())) {
+      auto attr = dyn_cast<TypedAttr>(untypedAttr);
+      Type elementType = getElementTypeOrSelf(elementOrTensorType);
+      if (!attr) {
+        emitOpError("expects padding values to be typed attributes");
+        return DiagnosedSilenceableFailure::definiteFailure();
+      }
+      // Try to parse string attributes to obtain an attribute of element type.
+      if (auto stringAttr = dyn_cast<StringAttr>(attr)) {
+        auto parsedAttr = dyn_cast_if_present<TypedAttr>(parseAttribute(
+            stringAttr, getContext(), elementType,
+            /*numRead=*/nullptr, /*isKnownNullTerminated=*/true));
+        if (!parsedAttr || parsedAttr.getType() != elementType) {
+          auto diag = this->emitOpError("expects a padding that parses to ")
+                      << elementType << ", got " << attr;
+          diag.attachNote(targetOp.getLoc()) << "when applied to this op";
+          return DiagnosedSilenceableFailure::definiteFailure();
+        }
+        paddingValues.push_back(parsedAttr);
+        continue;
+      }
+      // Otherwise, add the attribute directly.
+      if (attr.getType() != elementType) {
+        auto diag = this->emitOpError("expects a padding value of type ")
+                    << elementType << ", got " << attr;
+        diag.attachNote(targetOp.getLoc()) << "when applied to this op";
+        return DiagnosedSilenceableFailure::definiteFailure();
+      }
+      paddingValues.push_back(attr);
+    }
+
+    // Set options.
+    TilingInterface paddedOp;
+    PadTilingInterfaceOptions options;
+    options.setPaddingValues(paddingValues)
+        .setPaddingDimensions(
+            extractFromIntegerArrayAttr<int64_t>(getPaddingDimensions()))
+        .setPaddingSizes(getMixedPaddingSizes())
+        .setPadToMultipleOf(getPadToMultipleOf());
+
+    // Apply padding.
+    SmallVector<tensor::PadOp> newPadOps;
+    FailureOr<TilingInterface> maybePaddedOp = rewriteAsPaddedOp(
+        rewriter, cast<TilingInterface>(targetOp.getOperation()), options,
+        newPadOps);
+    if (failed(maybePaddedOp)) {
+      auto diag = emitSilenceableError() << "failed to pad op";
+      diag.attachNote(target->getLoc()) << "target op";
+      return diag;
+    }
+
+    // Set transform results.
+    paddedOps.push_back(cast<TilingInterface>(maybePaddedOp->getOperation()));
+    padOps.append(newPadOps.begin(), newPadOps.end());
+  }
+
+  results.set(cast<OpResult>(getPadded()), paddedOps);
+  results.set(cast<OpResult>(getPad()), padOps);
+  return DiagnosedSilenceableFailure::success();
+}
+
+LogicalResult transform::PadTilingInterfaceOp::verify() {
+  SmallVector<int64_t> paddingDimensions =
+      extractFromIntegerArrayAttr<int64_t>(getPaddingDimensions());
+  if (any_of(paddingDimensions,
+             [](int64_t paddingDimension) { return paddingDimension < 0; })) {
+    return emitOpError() << "expects padding_dimensions to contain positive "
+                            "integers, found "
+                         << getPaddingDimensions();
+  }
+  if (getMixedPaddingSizes().size() != paddingDimensions.size()) {
+    return emitOpError() << "expects as many multiples as padding_dimensions";
+  }
+  return success();
+}
+
 //===---------------------------------------------------------------------===//
 // HoistPadOp
 //===---------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
index 881d9fcb4f52..69e6fdabf9a5 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
@@ -29,6 +29,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms
   BlockPackMatmul.cpp
   PackAndUnpackPatterns.cpp
   Padding.cpp
+  PadTilingInterface.cpp
   Promotion.cpp
   RuntimeOpVerification.cpp
   Specialize.cpp
diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
new file mode 100644
index 000000000000..a9d7bc64f2a6
--- /dev/null
+++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
@@ -0,0 +1,322 @@
+//===- PaddingTilingInterface.cpp - Padding of TilingInterface ops --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/TilingInterface.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "pad-tiling-interface"
+
+using namespace mlir;
+using namespace mlir::linalg;
+using namespace mlir::tensor;
+
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")
+#define DBGSNL() (llvm::dbgs() << "\n")
+
+/// Form a "full-rank" padding specification so that the application is easy.
+static llvm::SmallDenseMap<int64_t, OpFoldResult>
+getDimsToSize(Builder &b, ArrayRef<OpFoldResult> indexingSizes,
+              const PadTilingInterfaceOptions &options) {
+  llvm::SmallDenseMap<int64_t, OpFoldResult> dimsToSize;
+  for (const auto &[paddingDim, paddingSize] :
+       llvm::zip_equal(options.paddingDimensions, options.paddingSizes)) {
+    dimsToSize[paddingDim] = paddingSize;
+  }
+  // Complete the padding specification to specify all dimensions.
+  for (int64_t idx = 0, e = indexingSizes.size(); idx != e; ++idx) {
+    if (dimsToSize.find(idx) != dimsToSize.end())
+      continue;
+    // If a dimension is not specified, either complete with:
+    //   - 1 if we are padding to the next multiple of.
+    //   - indexingSizes[idx] otherwise
+    dimsToSize[idx] =
+        options.padToMultipleOf ? b.getIndexAttr(1) : indexingSizes[idx];
+  }
+  for (int64_t idx = 0, e = indexingSizes.size(); idx != e; ++idx) {
+    LLVM_DEBUG(DBGS() << "----idx: " << idx << " : " << dimsToSize[idx]
+                      << "\n");
+  }
+  return dimsToSize;
+}
+
+/// Compute the padded shape of the given value `v` of `RankedTensorType` given
+///   - `indexingSizes` a list of OpFoldResult.
+///   - an `indexingMap` that encodes how the shape of varies with increases
+///     in `indexingSizes`.
+/// The `indexingMap` encodes how the shape of varies with `indexingSizes`.
+/// The `indexingMap` + `indexingSizes` encoding suits StructuredOps.
+/// The implementaiton below iteratively combines increases from contributing
+/// dimensions using affine.apply operations.
+/// In the future, more general interfaces can be devised to encode similar
+/// shape evolutions and map between an op and its operands.
+SmallVector<OpFoldResult> linalg::computePaddedShape(
+    RewriterBase &rewriter, TypedValue<RankedTensorType> v,
+    AffineMap indexingMap, ArrayRef<OpFoldResult> indexingSizes,
+    const PadTilingInterfaceOptions &options) {
+  Location loc = v.getLoc();
+  SmallVector<OpFoldResult> paddedShape;
+  auto tensorType = cast<RankedTensorType>(v.getType());
+  paddedShape.resize_for_overwrite(tensorType.getRank());
+  assert(tensorType.getRank() == indexingMap.getNumResults() &&
+         "expect the number of results of the affine map to match the tensor "
+         "rank");
+
+  // "Full-rank" padding specification.
+  llvm::SmallDenseMap<int64_t, OpFoldResult> dimsToSize =
+      getDimsToSize(rewriter, indexingSizes, options);
+
+  // For each dimension in the operand's shape, iterate over indexingSizes and
+  // add
+  for (const auto &enResults : enumerate(indexingMap.getResults())) {
+    int64_t resultIndex = enResults.index();
+    AffineMap partialIndexingMap = indexingMap.getSubMap(
+        ArrayRef<unsigned>{static_cast<unsigned>(resultIndex)});
+
+    LLVM_DEBUG(DBGS() << "----resultIndex: " << resultIndex
+                      << " with partialIndexingMap: " << partialIndexingMap
+                      << "\n");
+
+    // Find all padding dimensions that contribute to this operand dimension
+    // and compute the padded term contribution to the final padded shape.
+    SmallVector<OpFoldResult> terms;
+    for (const auto &[paddingDim, paddingSize] : dimsToSize) {
+      LLVM_DEBUG(DBGS() << "------try apply padding of dim: " << paddingDim
+                        << " to: " << paddingSize << "\n");
+      if (!enResults.value().isFunctionOfDim(paddingDim))
+        continue;
+
+      LLVM_DEBUG(DBGS() << "------apply padding of dim: " << paddingDim
+                        << " to: " << paddingSize << "\n");
+
+      // Project non-'paddingDim' dimensions and compress the result.
+      llvm::SmallBitVector projectedDims(partialIndexingMap.getNumDims(), true);
+      projectedDims.flip(paddingDim);
+      AffineMap projectedMap =
+          mlir::projectDims(partialIndexingMap, projectedDims,
+                            /*compressDims=*/true);
+
+      // If we are padding to the next multiple of, compose with ceil(sz) * sz.
+      if (options.padToMultipleOf) {
+        AffineExpr d0, s0;
+        bindDims(rewriter.getContext(), d0);
+        bindSymbols(rewriter.getContext(), s0);
+        AffineMap ceilMap = AffineMap::get(1, 1, d0.ceilDiv(s0) * s0);
+        AffineMap composedMap = projectedMap.compose(ceilMap);
+        OpFoldResult paddingDimOfr = affine::makeComposedFoldedAffineApply(
+            rewriter, loc, composedMap,
+            {indexingSizes[paddingDim], paddingSize});
+        terms.push_back(paddingDimOfr);
+      } else {
+        // Otherwise just set to paddingSize.
+        OpFoldResult paddingDimOfr = affine::makeComposedFoldedAffineApply(
+            rewriter, loc, projectedMap, paddingSize);
+        terms.push_back(paddingDimOfr);
+      }
+
+      LLVM_DEBUG(DBGS() << "------new term: " << terms.back() << "\n");
+    }
+
+    // If there are no terms, just return the dim.
+    if (terms.empty()) {
+      paddedShape[resultIndex] =
+          createFoldedDimOp(rewriter, loc, v, resultIndex);
+      continue;
+    }
+
+    // Sum individual terms' contributions.
+    SmallVector<AffineExpr> dims(terms.size());
+    bindDimsList(rewriter.getContext(), MutableArrayRef{dims});
+    AffineExpr sumExpr = dims.front();
+    for (unsigned i = 1; i < dims.size(); ++i)
+      sumExpr = sumExpr + dims[i];
+    OpFoldResult paddedDimOfr =
+        affine::makeComposedFoldedAffineApply(rewriter, loc, sumExpr, terms);
+    paddedShape[resultIndex] = paddedDimOfr;
+  }
+
+  return paddedShape;
+}
+
+FailureOr<SmallVector<OpFoldResult>> linalg::computeLinalgPaddedShape(
+    RewriterBase &rewriter, OpOperand &operandToPad,
+    ArrayRef<Range> iterationDomain, const PadTilingInterfaceOptions &options) {
+  auto linalgOp = llvm::dyn_cast<LinalgOp>(operandToPad.getOwner());
+  if (!linalgOp)
+    return failure();
+
+  // clang-format off
+  assert(llvm::all_of(iterationDomain, [&rewriter](Range r) {
+    return r.offset == OpFoldResult(rewriter.getIndexAttr(0)) &&
+    r.stride == OpFoldResult(rewriter.getIndexAttr(1));
+  }) && "expected 0-offset 1-stride loop ranges");
+  // clang-format on
+  SmallVector<OpFoldResult> loopUpperBounds;
+  loopUpperBounds.reserve(iterationDomain.size());
+  for (const Range &range : iterationDomain)
+    loopUpperBounds.push_back(range.size);
+
+  AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&operandToPad);
+  return computePaddedShape(
+      rewriter, cast<TypedValue<RankedTensorType>>(operandToPad.get()),
+      indexingMap, loopUpperBounds, options);
+}
+
+/// Pad a single operand to `paddedShape` using `paddingValueAttr` as padding
+/// Value.
+static Value padOperand(RewriterBase &rewriter, TilingInterface opToPad,
+                        TypedValue<RankedTensorType> v,
+                        ArrayRef<OpFoldResult> paddedShape,
+                        Attribute paddingValueAttr) {
+  Value paddingValue;
+  if (auto complexTy =
+          dyn_cast<ComplexType>(getElementTypeOrSelf(v.getType()))) {
+    auto complexAttr = cast<ArrayAttr>(paddingValueAttr);
+    paddingValue = rewriter.create<complex::ConstantOp>(opToPad.getLoc(),
+                                                        complexTy, complexAttr);
+  } else {
+    paddingValue = rewriter.create<arith::ConstantOp>(
+        opToPad.getLoc(), cast<TypedAttr>(paddingValueAttr));
+  }
+
+  // Pad the operand to the bounding box defined by `paddedShape`.
+  SmallVector<int64_t> tensorShape;
+  SmallVector<Value> dynDims;
+  for (OpFoldResult ofr : paddedShape) {
+    std::optional<int64_t> cst = getConstantIntValue(ofr);
+    tensorShape.push_back(cst.has_value() ? *cst : ShapedType::kDynamic);
+    if (!cst.has_value())
+      dynDims.push_back(ofr.dyn_cast<Value>());
+  }
+  // TODO: use dispatchIndexOpFoldResults(paddedShape, dynDims, paddedShape);
+
+  auto paddedTensorType =
+      RankedTensorType::get(tensorShape, getElementTypeOrSelf(v));
+  LLVM_DEBUG(DBGS() << "--SUCCESS, makeComposedPadHighOp with type: "
+                    << paddedTensorType);
+  return makeComposedPadHighOp(rewriter, opToPad.getLoc(), paddedTensorType, v,
+                               paddingValue, /*nofold=*/false, dynDims);
+}
+
+FailureOr<TilingInterface>
+linalg::rewriteAsPaddedOp(RewriterBase &rewriter, TilingInterface opToPad,
+                          const PadTilingInterfaceOptions &constOptions,
+                          SmallVector<tensor::PadOp> &padOps,
+                          PadSizeComputationFunction computePaddingSizeFun) {
+  LLVM_DEBUG(DBGS() << "Start rewriteAsPaddedOp : " << opToPad << "\n");
+  assert(constOptions.paddingSizes.size() ==
+             constOptions.paddingDimensions.size() &&
+         "invalid number of elements in padToMultipleOf");
+
+  Location loc = opToPad.getLoc();
+  PadTilingInterfaceOptions options(constOptions);
+  // Allow inference of pad values if they are not explicitly specified.
+  // TODO: be mindful about the value depending on the actual operation.
+  if (options.paddingValues.empty()) {
+    SmallVector<Type> types(opToPad->getOperandTypes());
+    llvm::append_range(types, opToPad->getResultTypes());
+    for (Type t : types) {
+      options.paddingValues.push_back(
+          rewriter.getZeroAttr(getElementTypeOrSelf(t)));
+    }
+  }
+
+  if (llvm::any_of(opToPad->getOperands(),
+                   [](Value v) { return isa<MemRefType>(v.getType()); })) {
+    return rewriter.notifyMatchFailure(opToPad,
+                                       "expected operation on tensors");
+  }
+
+  OpBuilder::InsertionGuard g(rewriter);
+  // Set IP after opToPad because we also take the dims of opToPad's output.
+  rewriter.setInsertionPointAfter(opToPad);
+
+  // 1. Get the loopUpperBounds from the TilingInterface.
+  SmallVector<Range> iterationDomain = opToPad.getIterationDomain(rewriter);
+
+  // 2. For each operand.
+  SmallVector<Value> newOperands;
+  newOperands.reserve(opToPad->getNumOperands());
+  for (OpOperand &opOperand : opToPad->getOpOperands()) {
+    LLVM_DEBUG(DBGS() << "--start padding oprd: " << opOperand.get() << "\n");
+    // 2.a. Compute padded shape.
+    FailureOr<SmallVector<OpFoldResult>> maybePaddedShape =
+        computePaddingSizeFun(rewriter, opOperand, iterationDomain, options);
+    if (failed(maybePaddedShape)) {
+      return rewriter.notifyMatchFailure(opToPad, "could not pad op");
+    }
+
+    // 2.b. Expect proper `paddingValues`.
+    // TODO: we may want to allow garbage padding in the future, in which case
+    // we would just not assert.
+    assert(opOperand.getOperandNumber() < options.paddingValues.size() &&
+           "--no padding value specified");
+    Attribute paddingValueAttr =
+        options.paddingValues[opOperand.getOperandNumber()];
+
+    // 2.c. Perform actual padding.
+    Value paddedOperand = padOperand(
+        rewriter, opToPad, cast<TypedValue<RankedTensorType>>(opOperand.get()),
+        *maybePaddedShape, paddingValueAttr);
+    LLVM_DEBUG(DBGS() << "--done padding operand: " << paddedOperand << "\n");
+
+    // 2.d. Perform actual padding.
+    newOperands.push_back(paddedOperand);
+    if (auto padOp = paddedOperand.getDefiningOp<tensor::PadOp>())
+      padOps.push_back(padOp);
+  }
+
+  // 3. Form the resulting tensor::ExtractSliceOp.
+  ReifiedRankedShapedTypeDims reifiedResultShapes;
+  if (failed(reifyResultShapes(rewriter, opToPad, reifiedResultShapes))) {
+    LLVM_DEBUG(DBGS() << "--failed to reify result shapes -> FAIL\n");
+    return rewriter.notifyMatchFailure(opToPad,
+                                       "failed to reify result shapes");
+  }
+  assert(reifiedResultShapes.size() == opToPad->getNumResults() &&
+         "expected same number of results");
+
+  // Clone `opToPad` to operate on the statically padded shapes.
+  auto resultTensorTypes =
+      ValueRange(newOperands).take_back(opToPad->getNumResults()).getTypes();
+  // clone **should** properly notify the rewriter.
+  TilingInterface paddedOp =
+      clone(rewriter, opToPad, resultTensorTypes, newOperands);
+  LLVM_DEBUG(DBGS() << "--cloned padded op: " << paddedOp << "\n");
+
+  // Recover the slice out of the new static results. This keeps the original
+  // opToPad around because it uses the dims of the original results.
+  SmallVector<Value> paddedSubtensorResults;
+  paddedSubtensorResults.reserve(opToPad->getNumResults());
+  for (const auto &en : llvm::enumerate(paddedOp->getResults())) {
+    Value paddedResult = en.value();
+    int64_t resultNumber = en.index();
+    int64_t rank = cast<RankedTensorType>(paddedResult.getType()).getRank();
+    SmallVector<OpFoldResult> offsets(rank, rewriter.getIndexAttr(0));
+    SmallVector<OpFoldResult> strides(rank, rewriter.getIndexAttr(1));
+    paddedSubtensorResults.push_back(rewriter.create<tensor::ExtractSliceOp>(
+        loc, paddedResult, offsets, reifiedResultShapes[resultNumber],
+        strides));
+  }
+
+  rewriter.replaceOp(opToPad, paddedSubtensorResults);
+
+  return paddedOp;
+}
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
new file mode 100644
index 000000000000..d502150e9e34
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
@@ -0,0 +1,138 @@
+// RUN: mlir-opt --transform-interpreter -canonicalize -split-input-file --verify-diagnostics %s | FileCheck %s
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0) -> (-d0 + (d0 ceildiv 8) * 8)>
+
+//     CHECK-LABEL: pad_lhs
+func.func @pad_lhs(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  //      CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>)
+  //      CHECK:   %[[MIN:.*]] = affine.min #[[$MAP0]](%{{.*}})
+  //      CHECK:   %[[H0:.*]] = affine.apply #[[$MAP1]](%[[MIN]])
+  //      CHECK:   tensor.pad %{{.*}} low[0, 0] high[%[[H0]], 0]
+  //      CHECK:     : tensor<?x12xf32> to tensor<?x12xf32>
+
+  //      CHECK:   %[[H1:.*]] = affine.apply #[[$MAP1]](%[[MIN]])
+  //      CHECK:   tensor.pad %{{.*}} low[0, 0] high[%[[H1]], 0]
+  //      CHECK:     : tensor<?x25xf32> to tensor<?x25xf32>
+
+  //      CHECK:   linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<?x12xf32>, tensor<12x25xf32>) outs(%{{.*}} : tensor<?x25xf32>) -> tensor<?x25xf32>
+  
+  //      CHECK:   tensor.extract_slice %{{.*}}[0, 0] [%{{.*}}, 25] [1, 1]
+  //      CHECK:     : tensor<?x25xf32> to tensor<?x25xf32>
+  //      CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [%{{.*}}, 25] [1, 1]
+  // CHECK-SAME:     : tensor<?x25xf32> into tensor<24x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+      : (!transform.any_op) -> !transform.any_op
+
+    // Tile to 5 then pad to 8 (supposedly to better hit vector ops).
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %matmul_l1 to padding_sizes [8] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    transform.yield
+  }
+}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2, d0 + d1)>
+
+module {
+
+// CHECK-LABEL: @generic
+// CHECK-SAME:      %[[T0:.*]]: tensor<7x5xf32>,
+// CHECK-SAME:      %[[T1:.*]]: tensor<7x11x12xf32>)
+  func.func @generic(%arg0: tensor<7x5xf32>, %arg1: tensor<7x11x12xf32>) -> tensor<7x11x12xf32> {
+
+  //  CHECK-DAG: %[[CST:.*]] = arith.constant 0.
+
+  //      CHECK: %[[PAD0:.*]] = tensor.pad %[[T0]] low[0, 0] high[2, 0]
+  //      CHECK:   : tensor<7x5xf32> to tensor<9x5xf32>
+  //      CHECK: %[[PAD1:.*]] = tensor.pad %[[T1]] low[0, 0, 0] high[2, 4, 2] {
+  //      CHECK:   : tensor<7x11x12xf32> to tensor<9x15x14xf32>
+  // CHECK-NEXT: linalg.generic
+  //      CHECK: tensor.extract_slice %{{.*}}[0, 0, 0] [7, 11, 12] [1, 1, 1] : tensor<9x15x14xf32> to tensor<7x11x12xf32>
+  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0 : tensor<7x5xf32>) outs(%arg1 : tensor<7x11x12xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<7x11x12xf32>
+    return %0 : tensor<7x11x12xf32>
+  }
+  module attributes {transform.with_named_sequence} {
+    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+      %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 5] pad_to_multiple_of {
+        padding_dimensions = [0, 2], 
+        padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
+      } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      transform.yield 
+    }
+  }
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1] -> (-s1 + (s0 ceildiv 3) * 3)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0, s1] -> (-s1 + (s0 ceildiv 3) * 3 + 5)>
+// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 5)>
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2, d0 + d1)>
+module {
+
+// CHECK-LABEL: @generic
+// CHECK-SAME:      %[[T0:.*]]: tensor<?x5xf32>,
+// CHECK-SAME:      %[[T1:.*]]: tensor<?x11x?xf32>)
+  func.func @generic(%arg0: tensor<?x5xf32>, %arg1: tensor<?x11x?xf32>) -> tensor<?x11x?xf32> {
+
+  //  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  //  CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+  //
+  //      CHECK: %[[D0_0:.*]] = tensor.dim %{{.*}}, %[[C0]] : tensor<?x5xf32>
+  //      CHECK: %[[D0_1:.*]] = tensor.dim %{{.*}}, %[[C0]] : tensor<?x5xf32>
+  //      CHECK: %[[H0:.*]] = affine.apply #[[$MAP0]]()[%[[D0_0]], %[[D0_1]]]
+  //      CHECK: tensor.pad %{{.*}} low[0, 0] high[%[[H0]], 0] {
+  //      CHECK:   : tensor<?x5xf32> to tensor<?x5xf32>
+  //
+  //      CHECK: %[[D0_2:.*]] = tensor.dim %{{.*}}, %[[C0]] : tensor<?x11x?xf32>
+  //      CHECK: %[[H1:.*]] = affine.apply #[[$MAP0]]()[%[[D0_0]], %[[D0_2]]]
+  //      CHECK: %[[D2_0:.*]] = tensor.dim %{{.*}}, %[[C2]] : tensor<?x11x?xf32>
+  //      CHECK: %[[H2:.*]] = affine.apply #[[$MAP1]]()[%[[D0_0]], %[[D2_0]]]
+  //      CHECK: tensor.pad %{{.*}} low[0, 0, 0] high[%[[H1]], 4, %[[H2]]] {
+  //      CHECK:   : tensor<?x11x?xf32> to tensor<?x15x?xf32>
+  //
+  //      CHECK: %[[D0_3:.*]] = tensor.dim %{{.*}}, %[[C0]] : tensor<?x5xf32>
+  //      CHECK: %[[D2_1:.*]] = affine.apply #[[$MAP2]]()[%[[D0_3]]]
+  //      CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<?x5xf32>) outs(%{{.*}} : tensor<?x15x?xf32>) {
+  //      CHECK: } -> tensor<?x15x?xf32>
+  //      CHECK: tensor.extract_slice %{{.*}}[0, 0, 0] [%[[D0_3]], 11, %[[D2_1]]] [1, 1, 1] : tensor<?x15x?xf32> to tensor<?x11x?xf32>
+  //
+  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0 : tensor<?x5xf32>) outs(%arg1 : tensor<?x11x?xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<?x11x?xf32>
+    return %0 : tensor<?x11x?xf32>
+  }
+  module attributes {transform.with_named_sequence} {
+    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+      %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 5] pad_to_multiple_of {
+        padding_dimensions = [0, 2], 
+        padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
+      } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      transform.yield 
+    }
+  }
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
new file mode 100644
index 000000000000..e2b97cf908a8
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
@@ -0,0 +1,129 @@
+// RUN: mlir-opt --transform-interpreter -canonicalize -split-input-file --verify-diagnostics %s | FileCheck %s
+
+//     CHECK-LABEL: pad_lhs
+func.func @pad_lhs(
+  %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  //      CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>)
+  //      CHECK:   tensor.pad %{{.*}} 
+  //      CHECK:     : tensor<?x12xf32> to tensor<8x12xf32>
+  //      CHECK:   tensor.pad %{{.*}} 
+  //      CHECK:     : tensor<?x25xf32> to tensor<8x25xf32>
+  //      CHECK:   linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<8x12xf32>, tensor<12x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
+  //      CHECK:   tensor.extract_slice %{{.*}}[0, 0] [%{{.*}}, 25] [1, 1]
+  //      CHECK:     : tensor<8x25xf32> to tensor<?x25xf32>
+  //      CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [%{{.*}}, 25] [1, 1]
+  // CHECK-SAME:     : tensor<?x25xf32> into tensor<24x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+      : (!transform.any_op) -> !transform.any_op
+
+    // Tile to 5 then pad to 8 (supposedly to better hit vector ops).
+    %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %matmul_l1 to padding_sizes [8] {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    transform.yield
+  }
+}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2, d0 + d1)>
+module {
+
+// CHECK-LABEL: @generic
+// CHECK-SAME:      %[[T0:.*]]: tensor<7x5xf32>,
+// CHECK-SAME:      %[[T1:.*]]: tensor<7x11x12xf32>)
+  func.func @generic(%arg0: tensor<7x5xf32>, %arg1: tensor<7x11x12xf32>) -> tensor<7x11x12xf32> {
+
+  //  CHECK-DAG: %[[CST:.*]] = arith.constant 0.
+
+  //      CHECK: %[[PAD0:.*]] = tensor.pad %[[T0]] low[0, 0] high[1, 0]
+  //      CHECK:   : tensor<7x5xf32> to tensor<8x5xf32>
+  //      CHECK: %[[PAD1:.*]] = tensor.pad %[[T1]] low[0, 0, 0] high[1, 3, 1] {
+  //      CHECK:   : tensor<7x11x12xf32> to tensor<8x14x13xf32>
+  // CHECK-NEXT: linalg.generic
+  //      CHECK: tensor.extract_slice %{{.*}}[0, 0, 0] [7, 11, 12] [1, 1, 1] : tensor<8x14x13xf32> to tensor<7x11x12xf32>
+  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0 : tensor<7x5xf32>) outs(%arg1 : tensor<7x11x12xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<7x11x12xf32>
+    return %0 : tensor<7x11x12xf32>
+  }
+  module attributes {transform.with_named_sequence} {
+    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+      %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [8, 14] {
+        padding_dimensions = [0, 2], 
+        padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
+      } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      transform.yield 
+    }
+  }
+}
+
+
+// -----
+
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0] -> (-s0 + 8)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (-s0 + 13)>
+// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 5)>
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2, d0 + d1)>
+module {
+
+// CHECK-LABEL: @generic
+// CHECK-SAME:      %[[T0:.*]]: tensor<?x5xf32>,
+// CHECK-SAME:      %[[T1:.*]]: tensor<?x11x?xf32>)
+  func.func @generic(%arg0: tensor<?x5xf32>, %arg1: tensor<?x11x?xf32>) -> tensor<?x11x?xf32> {
+
+  //  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  //  CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+  //
+  //      CHECK: %[[D0_0:.*]] = tensor.dim %{{.*}}, %[[C0]] : tensor<?x5xf32>
+  //      CHECK: %[[H0:.*]] = affine.apply #[[$MAP0]]()[%[[D0_0]]]
+  //      CHECK: tensor.pad %{{.*}} low[0, 0] high[%[[H0]], 0] {
+  //      CHECK:   : tensor<?x5xf32> to tensor<8x5xf32>
+  //
+  //      CHECK: %[[D0_1:.*]] = tensor.dim %{{.*}}, %[[C0]] : tensor<?x11x?xf32>
+  //      CHECK: %[[H1:.*]] = affine.apply #[[$MAP0]]()[%[[D0_1]]]
+  //      CHECK: %[[D2_0:.*]] = tensor.dim %{{.*}}, %[[C2]] : tensor<?x11x?xf32>
+  //      CHECK: %[[H2:.*]] = affine.apply #[[$MAP1]]()[%[[D2_0]]]
+  //      CHECK: tensor.pad %{{.*}} low[0, 0, 0] high[%[[H1]], 3, %[[H2]]] {
+  //      CHECK:   : tensor<?x11x?xf32> to tensor<8x14x13xf32>
+  //
+  //      CHECK: %[[D0_2:.*]] = tensor.dim %{{.*}}, %[[C0]] : tensor<?x5xf32>
+  //      CHECK: %[[D2_1:.*]] = affine.apply #[[$MAP2]]()[%[[D0_2]]]
+  //      CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<8x5xf32>) outs(%{{.*}} : tensor<8x14x13xf32>) {
+  //      CHECK: } -> tensor<8x14x13xf32>
+  //      CHECK: tensor.extract_slice %{{.*}}[0, 0, 0] [%[[D0_2]], 11, %[[D2_1]]] [1, 1, 1] : tensor<8x14x13xf32> to tensor<?x11x?xf32>
+  //
+  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0 : tensor<?x5xf32>) outs(%arg1 : tensor<?x11x?xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<?x11x?xf32>
+    return %0 : tensor<?x11x?xf32>
+  }
+  module attributes {transform.with_named_sequence} {
+    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+      %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [8, 14] {
+        padding_dimensions = [0, 2], 
+        padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
+      } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      transform.yield 
+    }
+  }
+}
\ No newline at end of file

From 7af545237f8509d40b9ab1dd6526210e09bc76bf Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <Nico.Vasilache@amd.com>
Date: Fri, 20 Jun 2025 12:33:14 +0200
Subject: [PATCH 1025/1322] [NFC] Update
 transform-op-pad-tiling-interface-multiple-of.mlir

Missing NL
---
 .../Linalg/transform-op-pad-tiling-interface-multiple-of.mlir   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
index d502150e9e34..5ac35c14be3f 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
@@ -135,4 +135,4 @@ module {
       transform.yield 
     }
   }
-}
\ No newline at end of file
+}

From 227cd56e1330e51bc48f4fd71fabfc203b7ca31d Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <Nico.Vasilache@amd.com>
Date: Fri, 20 Jun 2025 12:33:56 +0200
Subject: [PATCH 1026/1322] [NFC] Update transform-op-pad-tiling-interface.mlir

Missing NL
---
 mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
index e2b97cf908a8..c361885693cb 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
@@ -126,4 +126,4 @@ module {
       transform.yield 
     }
   }
-}
\ No newline at end of file
+}

From 68732ce8e01938227378b4e6f7850ba85c978726 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Fri, 20 Jun 2025 11:46:25 +0100
Subject: [PATCH 1027/1322] [LLVM][CodeGen][SVE] Add isel for bfloat unordered
 reductions. (#143540)

The omissions are VECREDUCE_SEQ_* and MUL. The former goes down a
different code path and the latter is unsupported across all element types.
---
 .../SelectionDAG/LegalizeVectorOps.cpp        |  47 ++-
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  15 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   4 +-
 .../CodeGen/AArch64/sve-bf16-reductions.ll    | 279 ++++++++++++++++++
 4 files changed, 329 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-bf16-reductions.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 4a1cd642233e..f908a66128ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -189,6 +189,12 @@ class VectorLegalizer {
 
   void PromoteSTRICT(SDNode *Node, SmallVectorImpl<SDValue> &Results);
 
+  /// Calculate the reduction using a type of higher precision and round the
+  /// result to match the original type. Setting NonArithmetic signifies the
+  /// rounding of the result does not affect its value.
+  void PromoteFloatVECREDUCE(SDNode *Node, SmallVectorImpl<SDValue> &Results,
+                             bool NonArithmetic);
+
 public:
   VectorLegalizer(SelectionDAG& dag) :
       DAG(dag), TLI(dag.getTargetLoweringInfo()) {}
@@ -500,21 +506,15 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::VECREDUCE_UMAX:
   case ISD::VECREDUCE_UMIN:
   case ISD::VECREDUCE_FADD:
+  case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMIN:
+  case ISD::VECREDUCE_FMINIMUM:
   case ISD::VECREDUCE_FMUL:
   case ISD::VECTOR_FIND_LAST_ACTIVE:
     Action = TLI.getOperationAction(Node->getOpcode(),
                                     Node->getOperand(0).getValueType());
     break;
-  case ISD::VECREDUCE_FMAX:
-  case ISD::VECREDUCE_FMIN:
-  case ISD::VECREDUCE_FMAXIMUM:
-  case ISD::VECREDUCE_FMINIMUM:
-    Action = TLI.getOperationAction(Node->getOpcode(),
-                                    Node->getOperand(0).getValueType());
-    // Defer non-vector results to LegalizeDAG.
-    if (Action == TargetLowering::Promote)
-      Action = TargetLowering::Legal;
-    break;
   case ISD::VECREDUCE_SEQ_FADD:
   case ISD::VECREDUCE_SEQ_FMUL:
     Action = TLI.getOperationAction(Node->getOpcode(),
@@ -688,6 +688,24 @@ void VectorLegalizer::PromoteSTRICT(SDNode *Node,
   Results.push_back(Round.getValue(1));
 }
 
+void VectorLegalizer::PromoteFloatVECREDUCE(SDNode *Node,
+                                            SmallVectorImpl<SDValue> &Results,
+                                            bool NonArithmetic) {
+  MVT OpVT = Node->getOperand(0).getSimpleValueType();
+  assert(OpVT.isFloatingPoint() && "Expected floating point reduction!");
+  MVT NewOpVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OpVT);
+
+  SDLoc DL(Node);
+  SDValue NewOp = DAG.getNode(ISD::FP_EXTEND, DL, NewOpVT, Node->getOperand(0));
+  SDValue Rdx =
+      DAG.getNode(Node->getOpcode(), DL, NewOpVT.getVectorElementType(), NewOp,
+                  Node->getFlags());
+  SDValue Res =
+      DAG.getNode(ISD::FP_ROUND, DL, Node->getValueType(0), Rdx,
+                  DAG.getIntPtrConstant(NonArithmetic, DL, /*isTarget=*/true));
+  Results.push_back(Res);
+}
+
 void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   // For a few operations there is a specific concept for promotion based on
   // the operand's type.
@@ -719,6 +737,15 @@ void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   case ISD::STRICT_FMA:
     PromoteSTRICT(Node, Results);
     return;
+  case ISD::VECREDUCE_FADD:
+    PromoteFloatVECREDUCE(Node, Results, /*NonArithmetic=*/false);
+    return;
+  case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMIN:
+  case ISD::VECREDUCE_FMINIMUM:
+    PromoteFloatVECREDUCE(Node, Results, /*NonArithmetic=*/true);
+    return;
   case ISD::FP_ROUND:
   case ISD::FP_EXTEND:
     // These operations are used to do promotion so they can't be promoted
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 52f19cc6e1ab..1e470318ced0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11413,13 +11413,9 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
   SDValue Op = Node->getOperand(0);
   EVT VT = Op.getValueType();
 
-  if (VT.isScalableVector())
-    report_fatal_error(
-        "Expanding reductions for scalable vectors is undefined.");
-
   // Try to use a shuffle reduction for power of two vectors.
   if (VT.isPow2VectorType()) {
-    while (VT.getVectorNumElements() > 1) {
+    while (VT.getVectorElementCount().isKnownMultipleOf(2)) {
       EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
       if (!isOperationLegalOrCustom(BaseOpcode, HalfVT))
         break;
@@ -11428,9 +11424,18 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
       std::tie(Lo, Hi) = DAG.SplitVector(Op, dl);
       Op = DAG.getNode(BaseOpcode, dl, HalfVT, Lo, Hi, Node->getFlags());
       VT = HalfVT;
+
+      // Stop if splitting is enough to make the reduction legal.
+      if (isOperationLegalOrCustom(Node->getOpcode(), HalfVT))
+        return DAG.getNode(Node->getOpcode(), dl, Node->getValueType(0), Op,
+                           Node->getFlags());
     }
   }
 
+  if (VT.isScalableVector())
+    reportFatalInternalError(
+        "Expanding reductions for scalable vectors is undefined.");
+
   EVT EltVT = VT.getVectorElementType();
   unsigned NumElts = VT.getVectorNumElements();
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8f208c4d006c..eaaff0529cbd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1762,7 +1762,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
     for (auto Opcode :
          {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
-          ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC}) {
+          ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
+          ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
+          ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
       setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
       setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
       setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-reductions.ll b/llvm/test/CodeGen/AArch64/sve-bf16-reductions.ll
new file mode 100644
index 000000000000..7f79c9c5431e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-reductions.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve,+bf16            < %s | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,SME
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; FADDV
+
+define bfloat @faddv_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: faddv_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call fast bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat zeroinitializer, <vscale x 2 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @faddv_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: faddv_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat zeroinitializer, <vscale x 4 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @faddv_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: faddv_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fadd z0.s, z0.s, z1.s
+; CHECK-NEXT:    faddv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call fast bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat zeroinitializer, <vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
+; FMAXNMV
+
+define bfloat @fmaxv_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fmaxv_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fmax.nxv2bf16(<vscale x 2 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @fmaxv_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fmaxv_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @fmaxv_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fmaxv_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
+; FMINNMV
+
+define bfloat @fminv_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fminv_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminnmv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fmin.nxv2bf16(<vscale x 2 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @fminv_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fminv_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fminnmv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fmin.nxv4bf16(<vscale x 4 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @fminv_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fminv_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fminnmv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
+; FMAXV
+
+define bfloat @fmaximumv_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fmaximumv_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmaxv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fmaximum.nxv2bf16(<vscale x 2 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @fmaximumv_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fmaximumv_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmaxv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fmaximum.nxv4bf16(<vscale x 4 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @fmaximumv_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fmaximumv_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fmaxv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fmaximum.nxv8bf16(<vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
+; FMINV
+
+define bfloat @fminimumv_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fminimumv_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fminv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fminimum.nxv2bf16(<vscale x 2 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @fminimumv_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fminimumv_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fminv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fminimum.nxv4bf16(<vscale x 4 x bfloat> %a)
+  ret bfloat %res
+}
+
+define bfloat @fminimumv_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fminimumv_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fminv s0, p0, z0.s
+; CHECK-NEXT:    bfcvt h0, s0
+; CHECK-NEXT:    ret
+  %res = call bfloat @llvm.vector.reduce.fminimum.nxv8bf16(<vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
+; The reduction is performed at a higher precision. Because add operations
+; can utilise that precision, its result must be rounded even if it's then
+; promoted.
+define float @promoted_fadd(<vscale x 4 x bfloat> %a) {
+; SVE-LABEL: promoted_fadd:
+; SVE:       // %bb.0:
+; SVE-NEXT:    lsl z0.s, z0.s, #16
+; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    faddv s0, p0, z0.s
+; SVE-NEXT:    bfcvt h0, s0
+; SVE-NEXT:    shll v0.4s, v0.4h, #16
+; SVE-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; SVE-NEXT:    ret
+;
+; SME-LABEL: promoted_fadd:
+; SME:       // %bb.0:
+; SME-NEXT:    lsl z0.s, z0.s, #16
+; SME-NEXT:    ptrue p0.s
+; SME-NEXT:    faddv s0, p0, z0.s
+; SME-NEXT:    bfcvt h0, s0
+; SME-NEXT:    fmov w8, s0
+; SME-NEXT:    lsl w8, w8, #16
+; SME-NEXT:    fmov s0, w8
+; SME-NEXT:    ret
+  %rdx = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat zeroinitializer, <vscale x 4 x bfloat> %a)
+  %res = fpext bfloat %rdx to float
+  ret float %res
+}
+
+; The reduction is performed at a higher precision. Because min/max operations
+; don't utilise that precision, its result can be used directly.
+define float @promoted_fmax(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: promoted_fmax:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+  %rdx = call bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat> %a)
+  %res = fpext bfloat %rdx to float
+  ret float %res
+}
+
+declare bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat, <vscale x 2 x bfloat>)
+declare bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat, <vscale x 4 x bfloat>)
+declare bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat, <vscale x 8 x bfloat>)
+
+declare bfloat @llvm.vector.reduce.fmax.nxv2bf16(<vscale x 2 x bfloat>)
+declare bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat>)
+declare bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat>)
+
+declare bfloat @llvm.vector.reduce.fmin.nxv2bf16(<vscale x 2 x bfloat>)
+declare bfloat @llvm.vector.reduce.fmin.nxv4bf16(<vscale x 4 x bfloat>)
+declare bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat>)
+
+declare bfloat @llvm.vector.reduce.fmaximum.nxv2bf16(<vscale x 2 x bfloat>)
+declare bfloat @llvm.vector.reduce.fmaximum.nxv4bf16(<vscale x 4 x bfloat>)
+declare bfloat @llvm.vector.reduce.fmaximum.nxv8bf16(<vscale x 8 x bfloat>)
+
+declare bfloat @llvm.vector.reduce.fminimum.nxv2bf16(<vscale x 2 x bfloat>)
+declare bfloat @llvm.vector.reduce.fminimum.nxv4bf16(<vscale x 4 x bfloat>)
+declare bfloat @llvm.vector.reduce.fminimum.nxv8bf16(<vscale x 8 x bfloat>)

From 4ec6d127c1857e77d70236b15b03d23ba1283a3d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 20 Jun 2025 11:43:01 +0100
Subject: [PATCH 1028/1322] [X86] movmsk-cmp.ll - regenerate VPTERNLOG asm
 comments

---
 llvm/test/CodeGen/X86/movmsk-cmp.ll | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 9b624a935bad..2f8cd4d41af5 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -4098,7 +4098,7 @@ define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) {
 ; KNL-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vpmovdw %zmm0, %ymm0
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    andl $7, %edi
@@ -4143,7 +4143,7 @@ define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) {
 ; KNL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    andl $3, %edi
 ; KNL-NEXT:    movzbl -24(%rsp,%rdi,4), %eax
@@ -4200,7 +4200,7 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
 ; KNL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    movzbl -24(%rsp,%rdi,8), %eax
@@ -4247,7 +4247,7 @@ define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) {
 ; KNL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vcmpeq_uqps %zmm1, %zmm0, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    andl $3, %edi
 ; KNL-NEXT:    movzbl -24(%rsp,%rdi,4), %eax
@@ -4291,7 +4291,7 @@ define i1 @movmsk_v2f64_var(<2 x double> %x, <2 x double> %y, i32 %z) {
 ; KNL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vcmplepd %zmm0, %zmm1, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    movzbl -24(%rsp,%rdi,8), %eax

From 3e99aa6c0a36ec4d6f126882b1a06436767cbf73 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 20 Jun 2025 13:02:22 +0200
Subject: [PATCH 1029/1322] [PredicateInfo] Clean up DFS sorting (NFC)
 (#144943)

The comparison function for ValueDFS was confused in a number of ways.
Most significantly, it had a bunch of logic based on Def -- however, Def
is always null during sorting, it only gets set later. At this point
defs only have PInfo set.

Clean up the implementation to remove various dead code.
---
 llvm/lib/Transforms/Utils/PredicateInfo.cpp | 105 +++++++-------------
 1 file changed, 35 insertions(+), 70 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 9b239d9161e7..97f13e3b2674 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -92,19 +92,6 @@ struct ValueDFS {
   PredicateBase *PInfo = nullptr;
 };
 
-// Perform a strict weak ordering on instructions and arguments.
-static bool valueComesBefore(const Value *A, const Value *B) {
-  auto *ArgA = dyn_cast_or_null<Argument>(A);
-  auto *ArgB = dyn_cast_or_null<Argument>(B);
-  if (ArgA && !ArgB)
-    return true;
-  if (ArgB && !ArgA)
-    return false;
-  if (ArgA && ArgB)
-    return ArgA->getArgNo() < ArgB->getArgNo();
-  return cast<Instruction>(A)->comesBefore(cast<Instruction>(B));
-}
-
 // This compares ValueDFS structures. Doing so allows us to walk the minimum
 // number of instructions necessary to compute our def/use ordering.
 struct ValueDFS_Compare {
@@ -114,28 +101,34 @@ struct ValueDFS_Compare {
   bool operator()(const ValueDFS &A, const ValueDFS &B) const {
     if (&A == &B)
       return false;
-    // The only case we can't directly compare them is when they in the same
-    // block, and both have localnum == middle.  In that case, we have to use
-    // comesbefore to see what the real ordering is, because they are in the
-    // same basic block.
+    assert(!A.Def && !B.Def && "Should not have Def during comparison");
 
-    assert((A.DFSIn != B.DFSIn || A.DFSOut == B.DFSOut) &&
+    // Order by block first.
+    if (A.DFSIn != B.DFSIn)
+      return A.DFSIn < B.DFSIn;
+    assert(A.DFSOut == B.DFSOut &&
            "Equal DFS-in numbers imply equal out numbers");
-    bool SameBlock = A.DFSIn == B.DFSIn;
+
+    // Then order by first/middle/last.
+    if (A.LocalNum != B.LocalNum)
+      return A.LocalNum < B.LocalNum;
 
     // We want to put the def that will get used for a given set of phi uses,
     // before those phi uses.
     // So we sort by edge, then by def.
     // Note that only phi nodes uses and defs can come last.
-    if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last)
+    if (A.LocalNum == LN_Last)
       return comparePHIRelated(A, B);
 
-    bool isADef = A.Def;
-    bool isBDef = B.Def;
-    if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle)
-      return std::tie(A.DFSIn, A.LocalNum, isADef) <
-             std::tie(B.DFSIn, B.LocalNum, isBDef);
-    return localComesBefore(A, B);
+    // Use block-local ordering for instructions in the middle.
+    if (A.LocalNum == LN_Middle)
+      return localComesBefore(A, B);
+
+    // The order of PredicateInfo definitions at the start of the block does not
+    // matter.
+    assert(A.LocalNum == LN_First);
+    assert(A.PInfo && B.PInfo && "Must be predicate info def");
+    return false;
   }
 
   // For a phi use, or a non-materialized def, return the edge it represents.
@@ -173,60 +166,32 @@ struct ValueDFS_Compare {
     DomTreeNode *DomBDest = DT.getNode(BDest);
     unsigned AIn = DomADest->getDFSNumIn();
     unsigned BIn = DomBDest->getDFSNumIn();
-    bool isADef = A.Def;
-    bool isBDef = B.Def;
-    assert((!A.Def || !A.U) && (!B.Def || !B.U) &&
+    bool isAUse = A.U;
+    bool isBUse = B.U;
+    assert((!A.PInfo || !A.U) && (!B.PInfo || !B.U) &&
            "Def and U cannot be set at the same time");
     // Now sort by edge destination and then defs before uses.
-    return std::tie(AIn, isADef) < std::tie(BIn, isBDef);
+    return std::tie(AIn, isAUse) < std::tie(BIn, isBUse);
   }
 
-  // Get the definition of an instruction that occurs in the middle of a block.
-  Value *getMiddleDef(const ValueDFS &VD) const {
-    if (VD.Def)
-      return VD.Def;
-    // It's possible for the defs and uses to be null.  For branches, the local
-    // numbering will say the placed predicaeinfos should go first (IE
-    // LN_beginning), so we won't be in this function. For assumes, we will end
-    // up here, beause we need to order the def we will place relative to the
-    // assume.  So for the purpose of ordering, we pretend the def is right
-    // after the assume, because that is where we will insert the info.
-    if (!VD.U) {
-      assert(VD.PInfo &&
-             "No def, no use, and no predicateinfo should not occur");
-      assert(isa<PredicateAssume>(VD.PInfo) &&
-             "Middle of block should only occur for assumes");
-      return cast<PredicateAssume>(VD.PInfo)->AssumeInst->getNextNode();
-    }
-    return nullptr;
-  }
+  const Instruction *getDefOrUser(const ValueDFS &VD) const {
+    if (VD.U)
+      return cast<Instruction>(VD.U->getUser());
 
-  // Return either the Def, if it's not null, or the user of the Use, if the def
-  // is null.
-  const Instruction *getDefOrUser(const Value *Def, const Use *U) const {
-    if (Def)
-      return cast<Instruction>(Def);
-    return cast<Instruction>(U->getUser());
+    // For the purpose of ordering, we pretend the def is right after the
+    // assume, because that is where we will insert the info.
+    assert(VD.PInfo && "No use, and no predicateinfo should not occur");
+    assert(isa<PredicateAssume>(VD.PInfo) &&
+           "Middle of block should only occur for assumes");
+    return cast<PredicateAssume>(VD.PInfo)->AssumeInst->getNextNode();
   }
 
   // This performs the necessary local basic block ordering checks to tell
   // whether A comes before B, where both are in the same basic block.
   bool localComesBefore(const ValueDFS &A, const ValueDFS &B) const {
-    auto *ADef = getMiddleDef(A);
-    auto *BDef = getMiddleDef(B);
-
-    // See if we have real values or uses. If we have real values, we are
-    // guaranteed they are instructions or arguments. No matter what, we are
-    // guaranteed they are in the same block if they are instructions.
-    auto *ArgA = dyn_cast_or_null<Argument>(ADef);
-    auto *ArgB = dyn_cast_or_null<Argument>(BDef);
-
-    if (ArgA || ArgB)
-      return valueComesBefore(ArgA, ArgB);
-
-    auto *AInst = getDefOrUser(ADef, A.U);
-    auto *BInst = getDefOrUser(BDef, B.U);
-    return valueComesBefore(AInst, BInst);
+    const Instruction *AInst = getDefOrUser(A);
+    const Instruction *BInst = getDefOrUser(B);
+    return AInst->comesBefore(BInst);
   }
 };
 

From cbd496581fb6953a9a8d8387a010cc3a67d4654b Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson@amd.com>
Date: Fri, 20 Jun 2025 12:17:01 +0100
Subject: [PATCH 1030/1322] [NFC][AMDGPU] Automate any_extend_vector_inreg.ll
 check line generation (#145013)

Convert the test to use update_llc_test_checks.py.
---
 .../CodeGen/AMDGPU/any_extend_vector_inreg.ll | 167 +++++++++++++++---
 1 file changed, 143 insertions(+), 24 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
index 8bcef24c8e23..cc9f595f9d0b 100644
--- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
@@ -1,30 +1,149 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
 
-; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32:
-; GCN: s_load_dwordx8
-; GCN-DAG: s_load_dword
 
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
-; GCN: {{buffer|flat}}_store_byte
 define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) %arg1) local_unnamed_addr #0 {
+; GFX6-LABEL: any_extend_vector_inreg_v16i8_to_v4i32:
+; GFX6:       ; %bb.0: ; %bb
+; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x9
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s14
+; GFX6-NEXT:    s_mov_b32 s1, s15
+; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[12:13], 0x0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_load_dword s4, s[12:13], 0x8
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:13
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:15
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:14
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:8
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:11
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:10
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:4
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:6
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:3
+; GFX6-NEXT:    s_lshr_b32 s8, s9, 16
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshl_b64 s[6:7], s[4:5], 8
+; GFX6-NEXT:    v_mov_b32_e32 v1, s11
+; GFX6-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:9
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
+; GFX6-NEXT:    v_alignbit_b32 v0, s8, v0, 16
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s7
+; GFX6-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:12
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX6-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:5
+; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:7
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: any_extend_vector_inreg_v16i8_to_v4i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[8:9], 0x20
+; GFX8-NEXT:    s_lshr_b32 s6, s5, 24
+; GFX8-NEXT:    s_lshr_b32 s8, s2, 24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[0:1], 8
+; GFX8-NEXT:    s_add_u32 s4, s10, 13
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_add_u32 s4, s10, 15
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_add_u32 s4, s10, 14
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_add_u32 s4, s10, 8
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_add_u32 s4, s10, 11
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_add_u32 s4, s10, 10
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_add_u32 s4, s10, 4
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_add_u32 s4, s10, 6
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_add_u32 s4, s10, 1
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    s_add_u32 s4, s10, 3
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_add_u32 s4, s10, 9
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    s_add_u32 s4, s10, 2
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s5, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    s_add_u32 s0, s10, 5
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s1, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8-NEXT:    s_add_u32 s0, s10, 12
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s1, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    s_add_u32 s0, s10, 7
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_addc_u32 s1, s11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    flat_store_byte v[0:1], v2
+; GFX8-NEXT:    s_endpgm
 bb:
   %tmp2 = load <16 x i8>, ptr addrspace(1) %arg, align 16
   %tmp3 = extractelement <16 x i8> %tmp2, i64 4

From e5559ca45f211f2cdd9c81e46935afe1cc2e22ab Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Fri, 20 Jun 2025 06:44:14 -0500
Subject: [PATCH 1031/1322] [flang][OpenMP] Move lowering of ATOMIC to separate
 file, NFC (#144960)

---
 flang/lib/Lower/CMakeLists.txt    |   1 +
 flang/lib/Lower/OpenMP/Atomic.cpp | 510 ++++++++++++++++++++++++++++++
 flang/lib/Lower/OpenMP/Atomic.h   |  36 +++
 flang/lib/Lower/OpenMP/OpenMP.cpp | 470 +--------------------------
 4 files changed, 549 insertions(+), 468 deletions(-)
 create mode 100644 flang/lib/Lower/OpenMP/Atomic.cpp
 create mode 100644 flang/lib/Lower/OpenMP/Atomic.h

diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt
index 9c5db2b12651..8049cdf33317 100644
--- a/flang/lib/Lower/CMakeLists.txt
+++ b/flang/lib/Lower/CMakeLists.txt
@@ -23,6 +23,7 @@ add_flang_library(FortranLower
   LoweringOptions.cpp
   Mangler.cpp
   OpenACC.cpp
+  OpenMP/Atomic.cpp
   OpenMP/ClauseProcessor.cpp
   OpenMP/Clauses.cpp
   OpenMP/DataSharingProcessor.cpp
diff --git a/flang/lib/Lower/OpenMP/Atomic.cpp b/flang/lib/Lower/OpenMP/Atomic.cpp
new file mode 100644
index 000000000000..33a743f8f9dd
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/Atomic.cpp
@@ -0,0 +1,510 @@
+//===-- Atomic.cpp -- Lowering of atomic constructs -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Atomic.h"
+#include "Clauses.h"
+#include "flang/Evaluate/expression.h"
+#include "flang/Evaluate/fold.h"
+#include "flang/Evaluate/tools.h"
+#include "flang/Lower/AbstractConverter.h"
+#include "flang/Lower/PFTBuilder.h"
+#include "flang/Lower/StatementContext.h"
+#include "flang/Lower/SymbolMap.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/semantics.h"
+#include "flang/Semantics/type.h"
+#include "flang/Support/Fortran.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <optional>
+#include <string>
+#include <type_traits>
+#include <variant>
+#include <vector>
+
+static llvm::cl::opt<bool> DumpAtomicAnalysis("fdebug-dump-atomic-analysis");
+
+using namespace Fortran;
+
+// Don't import the entire Fortran::lower.
+namespace omp {
+using namespace Fortran::lower::omp;
+}
+
+[[maybe_unused]] static void
+dumpAtomicAnalysis(const parser::OpenMPAtomicConstruct::Analysis &analysis) {
+  auto whatStr = [](int k) {
+    std::string txt = "?";
+    switch (k & parser::OpenMPAtomicConstruct::Analysis::Action) {
+    case parser::OpenMPAtomicConstruct::Analysis::None:
+      txt = "None";
+      break;
+    case parser::OpenMPAtomicConstruct::Analysis::Read:
+      txt = "Read";
+      break;
+    case parser::OpenMPAtomicConstruct::Analysis::Write:
+      txt = "Write";
+      break;
+    case parser::OpenMPAtomicConstruct::Analysis::Update:
+      txt = "Update";
+      break;
+    }
+    switch (k & parser::OpenMPAtomicConstruct::Analysis::Condition) {
+    case parser::OpenMPAtomicConstruct::Analysis::IfTrue:
+      txt += " | IfTrue";
+      break;
+    case parser::OpenMPAtomicConstruct::Analysis::IfFalse:
+      txt += " | IfFalse";
+      break;
+    }
+    return txt;
+  };
+
+  auto exprStr = [&](const parser::TypedExpr &expr) {
+    if (auto *maybe = expr.get()) {
+      if (maybe->v)
+        return maybe->v->AsFortran();
+    }
+    return "<null>"s;
+  };
+  auto assignStr = [&](const parser::AssignmentStmt::TypedAssignment &assign) {
+    if (auto *maybe = assign.get(); maybe && maybe->v) {
+      std::string str;
+      llvm::raw_string_ostream os(str);
+      maybe->v->AsFortran(os);
+      return str;
+    }
+    return "<null>"s;
+  };
+
+  const semantics::SomeExpr &atom = *analysis.atom.get()->v;
+
+  llvm::errs() << "Analysis {\n";
+  llvm::errs() << "  atom: " << atom.AsFortran() << "\n";
+  llvm::errs() << "  cond: " << exprStr(analysis.cond) << "\n";
+  llvm::errs() << "  op0 {\n";
+  llvm::errs() << "    what: " << whatStr(analysis.op0.what) << "\n";
+  llvm::errs() << "    assign: " << assignStr(analysis.op0.assign) << "\n";
+  llvm::errs() << "  }\n";
+  llvm::errs() << "  op1 {\n";
+  llvm::errs() << "    what: " << whatStr(analysis.op1.what) << "\n";
+  llvm::errs() << "    assign: " << assignStr(analysis.op1.assign) << "\n";
+  llvm::errs() << "  }\n";
+  llvm::errs() << "}\n";
+}
+
+static bool isPointerAssignment(const evaluate::Assignment &assign) {
+  return common::visit(
+      common::visitors{
+          [](const evaluate::Assignment::BoundsSpec &) { return true; },
+          [](const evaluate::Assignment::BoundsRemapping &) { return true; },
+          [](const auto &) { return false; },
+      },
+      assign.u);
+}
+
+static fir::FirOpBuilder::InsertPoint
+getInsertionPointBefore(mlir::Operation *op) {
+  return fir::FirOpBuilder::InsertPoint(op->getBlock(),
+                                        mlir::Block::iterator(op));
+}
+
+static fir::FirOpBuilder::InsertPoint
+getInsertionPointAfter(mlir::Operation *op) {
+  return fir::FirOpBuilder::InsertPoint(op->getBlock(),
+                                        ++mlir::Block::iterator(op));
+}
+
+static mlir::IntegerAttr getAtomicHint(lower::AbstractConverter &converter,
+                                       const omp::List<omp::Clause> &clauses) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  for (const omp::Clause &clause : clauses) {
+    if (clause.id != llvm::omp::Clause::OMPC_hint)
+      continue;
+    auto &hint = std::get<omp::clause::Hint>(clause.u);
+    auto maybeVal = evaluate::ToInt64(hint.v);
+    CHECK(maybeVal);
+    return builder.getI64IntegerAttr(*maybeVal);
+  }
+  return nullptr;
+}
+
+static mlir::omp::ClauseMemoryOrderKind
+getMemoryOrderKind(common::OmpMemoryOrderType kind) {
+  switch (kind) {
+  case common::OmpMemoryOrderType::Acq_Rel:
+    return mlir::omp::ClauseMemoryOrderKind::Acq_rel;
+  case common::OmpMemoryOrderType::Acquire:
+    return mlir::omp::ClauseMemoryOrderKind::Acquire;
+  case common::OmpMemoryOrderType::Relaxed:
+    return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+  case common::OmpMemoryOrderType::Release:
+    return mlir::omp::ClauseMemoryOrderKind::Release;
+  case common::OmpMemoryOrderType::Seq_Cst:
+    return mlir::omp::ClauseMemoryOrderKind::Seq_cst;
+  }
+  llvm_unreachable("Unexpected kind");
+}
+
+static std::optional<mlir::omp::ClauseMemoryOrderKind>
+getMemoryOrderKind(llvm::omp::Clause clauseId) {
+  switch (clauseId) {
+  case llvm::omp::Clause::OMPC_acq_rel:
+    return mlir::omp::ClauseMemoryOrderKind::Acq_rel;
+  case llvm::omp::Clause::OMPC_acquire:
+    return mlir::omp::ClauseMemoryOrderKind::Acquire;
+  case llvm::omp::Clause::OMPC_relaxed:
+    return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+  case llvm::omp::Clause::OMPC_release:
+    return mlir::omp::ClauseMemoryOrderKind::Release;
+  case llvm::omp::Clause::OMPC_seq_cst:
+    return mlir::omp::ClauseMemoryOrderKind::Seq_cst;
+  default:
+    return std::nullopt;
+  }
+}
+
+static std::optional<mlir::omp::ClauseMemoryOrderKind>
+getMemoryOrderFromRequires(const semantics::Scope &scope) {
+  // The REQUIRES construct is only allowed in the main program scope
+  // and module scope, but seems like we also accept it in a subprogram
+  // scope.
+  // For safety, traverse all enclosing scopes and check if their symbol
+  // contains REQUIRES.
+  for (const auto *sc{&scope}; sc->kind() != semantics::Scope::Kind::Global;
+       sc = &sc->parent()) {
+    const semantics::Symbol *sym = sc->symbol();
+    if (!sym)
+      continue;
+
+    const common::OmpMemoryOrderType *admo = common::visit(
+        [](auto &&s) {
+          using WithOmpDeclarative = semantics::WithOmpDeclarative;
+          if constexpr (std::is_convertible_v<decltype(s),
+                                              const WithOmpDeclarative &>) {
+            return s.ompAtomicDefaultMemOrder();
+          }
+          return static_cast<const common::OmpMemoryOrderType *>(nullptr);
+        },
+        sym->details());
+    if (admo)
+      return getMemoryOrderKind(*admo);
+  }
+
+  return std::nullopt;
+}
+
+static std::optional<mlir::omp::ClauseMemoryOrderKind>
+getDefaultAtomicMemOrder(semantics::SemanticsContext &semaCtx) {
+  unsigned version = semaCtx.langOptions().OpenMPVersion;
+  if (version > 50)
+    return mlir::omp::ClauseMemoryOrderKind::Relaxed;
+  return std::nullopt;
+}
+
+static std::optional<mlir::omp::ClauseMemoryOrderKind>
+getAtomicMemoryOrder(semantics::SemanticsContext &semaCtx,
+                     const omp::List<omp::Clause> &clauses,
+                     const semantics::Scope &scope) {
+  for (const omp::Clause &clause : clauses) {
+    if (auto maybeKind = getMemoryOrderKind(clause.id))
+      return *maybeKind;
+  }
+
+  if (auto maybeKind = getMemoryOrderFromRequires(scope))
+    return *maybeKind;
+
+  return getDefaultAtomicMemOrder(semaCtx);
+}
+
+static mlir::omp::ClauseMemoryOrderKindAttr
+makeMemOrderAttr(lower::AbstractConverter &converter,
+                 std::optional<mlir::omp::ClauseMemoryOrderKind> maybeKind) {
+  if (maybeKind) {
+    return mlir::omp::ClauseMemoryOrderKindAttr::get(
+        converter.getFirOpBuilder().getContext(), *maybeKind);
+  }
+  return nullptr;
+}
+
+static mlir::Operation * //
+genAtomicRead(lower::AbstractConverter &converter,
+              semantics::SemanticsContext &semaCtx, mlir::Location loc,
+              lower::StatementContext &stmtCtx, mlir::Value atomAddr,
+              const semantics::SomeExpr &atom,
+              const evaluate::Assignment &assign, mlir::IntegerAttr hint,
+              std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
+              fir::FirOpBuilder::InsertPoint preAt,
+              fir::FirOpBuilder::InsertPoint atomicAt,
+              fir::FirOpBuilder::InsertPoint postAt) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  builder.restoreInsertionPoint(preAt);
+
+  // If the atomic clause is read then the memory-order clause must
+  // not be release.
+  if (memOrder) {
+    if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Release) {
+      // Reset it back to the default.
+      memOrder = getDefaultAtomicMemOrder(semaCtx);
+    } else if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acq_rel) {
+      // The MLIR verifier doesn't like acq_rel either.
+      memOrder = mlir::omp::ClauseMemoryOrderKind::Acquire;
+    }
+  }
+
+  mlir::Value storeAddr =
+      fir::getBase(converter.genExprAddr(assign.lhs, stmtCtx, &loc));
+  mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
+  mlir::Type storeType = fir::unwrapRefType(storeAddr.getType());
+
+  mlir::Value toAddr = [&]() {
+    if (atomType == storeType)
+      return storeAddr;
+    return builder.createTemporary(loc, atomType, ".tmp.atomval");
+  }();
+
+  builder.restoreInsertionPoint(atomicAt);
+  mlir::Operation *op = builder.create<mlir::omp::AtomicReadOp>(
+      loc, atomAddr, toAddr, mlir::TypeAttr::get(atomType), hint,
+      makeMemOrderAttr(converter, memOrder));
+
+  if (atomType != storeType) {
+    lower::ExprToValueMap overrides;
+    // The READ operation could be a part of UPDATE CAPTURE, so make sure
+    // we don't emit extra code into the body of the atomic op.
+    builder.restoreInsertionPoint(postAt);
+    mlir::Value load = builder.create<fir::LoadOp>(loc, toAddr);
+    overrides.try_emplace(&atom, load);
+
+    converter.overrideExprValues(&overrides);
+    mlir::Value value =
+        fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
+    converter.resetExprOverrides();
+
+    builder.create<fir::StoreOp>(loc, value, storeAddr);
+  }
+  return op;
+}
+
+static mlir::Operation * //
+genAtomicWrite(lower::AbstractConverter &converter,
+               semantics::SemanticsContext &semaCtx, mlir::Location loc,
+               lower::StatementContext &stmtCtx, mlir::Value atomAddr,
+               const semantics::SomeExpr &atom,
+               const evaluate::Assignment &assign, mlir::IntegerAttr hint,
+               std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
+               fir::FirOpBuilder::InsertPoint preAt,
+               fir::FirOpBuilder::InsertPoint atomicAt,
+               fir::FirOpBuilder::InsertPoint postAt) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  builder.restoreInsertionPoint(preAt);
+
+  // If the atomic clause is write then the memory-order clause must
+  // not be acquire.
+  if (memOrder) {
+    if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acquire) {
+      // Reset it back to the default.
+      memOrder = getDefaultAtomicMemOrder(semaCtx);
+    } else if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acq_rel) {
+      // The MLIR verifier doesn't like acq_rel either.
+      memOrder = mlir::omp::ClauseMemoryOrderKind::Release;
+    }
+  }
+
+  mlir::Value value =
+      fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
+  mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
+  mlir::Value converted = builder.createConvert(loc, atomType, value);
+
+  builder.restoreInsertionPoint(atomicAt);
+  mlir::Operation *op = builder.create<mlir::omp::AtomicWriteOp>(
+      loc, atomAddr, converted, hint, makeMemOrderAttr(converter, memOrder));
+  return op;
+}
+
+static mlir::Operation *
+genAtomicUpdate(lower::AbstractConverter &converter,
+                semantics::SemanticsContext &semaCtx, mlir::Location loc,
+                lower::StatementContext &stmtCtx, mlir::Value atomAddr,
+                const semantics::SomeExpr &atom,
+                const evaluate::Assignment &assign, mlir::IntegerAttr hint,
+                std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
+                fir::FirOpBuilder::InsertPoint preAt,
+                fir::FirOpBuilder::InsertPoint atomicAt,
+                fir::FirOpBuilder::InsertPoint postAt) {
+  lower::ExprToValueMap overrides;
+  lower::StatementContext naCtx;
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  builder.restoreInsertionPoint(preAt);
+
+  mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
+
+  // This must exist by now.
+  semantics::SomeExpr input = *evaluate::GetConvertInput(assign.rhs);
+  std::vector<semantics::SomeExpr> args =
+      evaluate::GetTopLevelOperation(input).second;
+  assert(!args.empty() && "Update operation without arguments");
+  for (auto &arg : args) {
+    if (!evaluate::IsSameOrConvertOf(arg, atom)) {
+      mlir::Value val = fir::getBase(converter.genExprValue(arg, naCtx, &loc));
+      overrides.try_emplace(&arg, val);
+    }
+  }
+
+  builder.restoreInsertionPoint(atomicAt);
+  auto updateOp = builder.create<mlir::omp::AtomicUpdateOp>(
+      loc, atomAddr, hint, makeMemOrderAttr(converter, memOrder));
+
+  mlir::Region &region = updateOp->getRegion(0);
+  mlir::Block *block = builder.createBlock(&region, {}, {atomType}, {loc});
+  mlir::Value localAtom = fir::getBase(block->getArgument(0));
+  overrides.try_emplace(&atom, localAtom);
+
+  converter.overrideExprValues(&overrides);
+  mlir::Value updated =
+      fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
+  mlir::Value converted = builder.createConvert(loc, atomType, updated);
+  builder.create<mlir::omp::YieldOp>(loc, converted);
+  converter.resetExprOverrides();
+
+  builder.restoreInsertionPoint(postAt); // For naCtx cleanups
+  return updateOp;
+}
+
+static mlir::Operation *
+genAtomicOperation(lower::AbstractConverter &converter,
+                   semantics::SemanticsContext &semaCtx, mlir::Location loc,
+                   lower::StatementContext &stmtCtx, int action,
+                   mlir::Value atomAddr, const semantics::SomeExpr &atom,
+                   const evaluate::Assignment &assign, mlir::IntegerAttr hint,
+                   std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
+                   fir::FirOpBuilder::InsertPoint preAt,
+                   fir::FirOpBuilder::InsertPoint atomicAt,
+                   fir::FirOpBuilder::InsertPoint postAt) {
+  if (isPointerAssignment(assign)) {
+    TODO(loc, "Code generation for pointer assignment is not implemented yet");
+  }
+
+  // This function and the functions called here do not preserve the
+  // builder's insertion point, or set it to anything specific.
+  switch (action) {
+  case parser::OpenMPAtomicConstruct::Analysis::Read:
+    return genAtomicRead(converter, semaCtx, loc, stmtCtx, atomAddr, atom,
+                         assign, hint, memOrder, preAt, atomicAt, postAt);
+  case parser::OpenMPAtomicConstruct::Analysis::Write:
+    return genAtomicWrite(converter, semaCtx, loc, stmtCtx, atomAddr, atom,
+                          assign, hint, memOrder, preAt, atomicAt, postAt);
+  case parser::OpenMPAtomicConstruct::Analysis::Update:
+    return genAtomicUpdate(converter, semaCtx, loc, stmtCtx, atomAddr, atom,
+                           assign, hint, memOrder, preAt, atomicAt, postAt);
+  default:
+    return nullptr;
+  }
+}
+
+void Fortran::lower::omp::lowerAtomic(
+    AbstractConverter &converter, SymMap &symTable,
+    semantics::SemanticsContext &semaCtx, pft::Evaluation &eval,
+    const parser::OpenMPAtomicConstruct &construct) {
+  auto get = [](auto &&typedWrapper) -> decltype(&*typedWrapper.get()->v) {
+    if (auto *maybe = typedWrapper.get(); maybe && maybe->v) {
+      return &*maybe->v;
+    } else {
+      return nullptr;
+    }
+  };
+
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  auto &dirSpec = std::get<parser::OmpDirectiveSpecification>(construct.t);
+  omp::List<omp::Clause> clauses = makeClauses(dirSpec.Clauses(), semaCtx);
+  lower::StatementContext stmtCtx;
+
+  const parser::OpenMPAtomicConstruct::Analysis &analysis = construct.analysis;
+  if (DumpAtomicAnalysis)
+    dumpAtomicAnalysis(analysis);
+
+  const semantics::SomeExpr &atom = *get(analysis.atom);
+  mlir::Location loc = converter.genLocation(construct.source);
+  mlir::Value atomAddr =
+      fir::getBase(converter.genExprAddr(atom, stmtCtx, &loc));
+  mlir::IntegerAttr hint = getAtomicHint(converter, clauses);
+  std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder =
+      getAtomicMemoryOrder(semaCtx, clauses,
+                           semaCtx.FindScope(construct.source));
+
+  if (auto *cond = get(analysis.cond)) {
+    (void)cond;
+    TODO(loc, "OpenMP ATOMIC COMPARE");
+  } else {
+    int action0 = analysis.op0.what & analysis.Action;
+    int action1 = analysis.op1.what & analysis.Action;
+    mlir::Operation *captureOp = nullptr;
+    fir::FirOpBuilder::InsertPoint preAt = builder.saveInsertionPoint();
+    fir::FirOpBuilder::InsertPoint atomicAt, postAt;
+
+    if (construct.IsCapture()) {
+      // Capturing operation.
+      assert(action0 != analysis.None && action1 != analysis.None &&
+             "Expexcing two actions");
+      (void)action0;
+      (void)action1;
+      captureOp = builder.create<mlir::omp::AtomicCaptureOp>(
+          loc, hint, makeMemOrderAttr(converter, memOrder));
+      // Set the non-atomic insertion point to before the atomic.capture.
+      preAt = getInsertionPointBefore(captureOp);
+
+      mlir::Block *block = builder.createBlock(&captureOp->getRegion(0));
+      builder.setInsertionPointToEnd(block);
+      // Set the atomic insertion point to before the terminator inside
+      // atomic.capture.
+      mlir::Operation *term = builder.create<mlir::omp::TerminatorOp>(loc);
+      atomicAt = getInsertionPointBefore(term);
+      postAt = getInsertionPointAfter(captureOp);
+      hint = nullptr;
+      memOrder = std::nullopt;
+    } else {
+      // Non-capturing operation.
+      assert(action0 != analysis.None && action1 == analysis.None &&
+             "Expexcing single action");
+      assert(!(analysis.op0.what & analysis.Condition));
+      postAt = atomicAt = preAt;
+    }
+
+    // The builder's insertion point needs to be specifically set before
+    // each call to `genAtomicOperation`.
+    mlir::Operation *firstOp = genAtomicOperation(
+        converter, semaCtx, loc, stmtCtx, analysis.op0.what, atomAddr, atom,
+        *get(analysis.op0.assign), hint, memOrder, preAt, atomicAt, postAt);
+    assert(firstOp && "Should have created an atomic operation");
+    atomicAt = getInsertionPointAfter(firstOp);
+
+    mlir::Operation *secondOp = nullptr;
+    if (analysis.op1.what != analysis.None) {
+      secondOp = genAtomicOperation(
+          converter, semaCtx, loc, stmtCtx, analysis.op1.what, atomAddr, atom,
+          *get(analysis.op1.assign), hint, memOrder, preAt, atomicAt, postAt);
+    }
+
+    if (construct.IsCapture()) {
+      // If this is a capture operation, the first/second ops will be inside
+      // of it. Set the insertion point to past the capture op itself.
+      builder.restoreInsertionPoint(postAt);
+    } else {
+      if (secondOp) {
+        builder.setInsertionPointAfter(secondOp);
+      } else {
+        builder.setInsertionPointAfter(firstOp);
+      }
+    }
+  }
+}
diff --git a/flang/lib/Lower/OpenMP/Atomic.h b/flang/lib/Lower/OpenMP/Atomic.h
new file mode 100644
index 000000000000..b83773b11300
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/Atomic.h
@@ -0,0 +1,36 @@
+//===-- Atomic.h -- Lowering of atomic constructs -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_LOWER_OPENMP_ATOMIC_H
+#define FORTRAN_LOWER_OPENMP_ATOMIC_H
+
+namespace Fortran {
+namespace lower {
+class AbstractConverter;
+class SymMap;
+} // namespace lower
+
+namespace parser {
+struct OpenMPAtomicConstruct;
+}
+
+namespace pft {
+struct Evaluation;
+}
+
+namespace semantics {
+class SemanticsContext;
+}
+} // namespace Fortran
+
+namespace Fortran::lower::omp {
+void lowerAtomic(AbstractConverter &converter, SymMap &symTable,
+                 semantics::SemanticsContext &semaCtx, pft::Evaluation &eval,
+                 const parser::OpenMPAtomicConstruct &construct);
+}
+
+#endif // FORTRAN_LOWER_OPENMP_ATOMIC_H
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 1c51fd60d570..ddf58fd87444 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -12,6 +12,7 @@
 
 #include "flang/Lower/OpenMP.h"
 
+#include "Atomic.h"
 #include "ClauseProcessor.h"
 #include "Clauses.h"
 #include "DataSharingProcessor.h"
@@ -41,13 +42,10 @@
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Support/CommandLine.h"
 
 using namespace Fortran::lower::omp;
 using namespace Fortran::common::openmp;
 
-static llvm::cl::opt<bool> DumpAtomicAnalysis("fdebug-dump-atomic-analysis");
-
 //===----------------------------------------------------------------------===//
 // Code generation helper functions
 //===----------------------------------------------------------------------===//
@@ -1127,16 +1125,6 @@ markDeclareTarget(mlir::Operation *op, lower::AbstractConverter &converter,
   declareTargetOp.setDeclareTarget(deviceType, captureClause);
 }
 
-static bool isPointerAssignment(const evaluate::Assignment &assign) {
-  return common::visit(
-      common::visitors{
-          [](const evaluate::Assignment::BoundsSpec &) { return true; },
-          [](const evaluate::Assignment::BoundsRemapping &) { return true; },
-          [](const auto &) { return false; },
-      },
-      assign.u);
-}
-
 //===----------------------------------------------------------------------===//
 // Op body generation helper structures and functions
 //===----------------------------------------------------------------------===//
@@ -2694,308 +2682,6 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
       queue, item, clauseOps);
 }
 
-//===----------------------------------------------------------------------===//
-// Code generation for atomic operations
-//===----------------------------------------------------------------------===//
-static fir::FirOpBuilder::InsertPoint
-getInsertionPointBefore(mlir::Operation *op) {
-  return fir::FirOpBuilder::InsertPoint(op->getBlock(),
-                                        mlir::Block::iterator(op));
-}
-
-static fir::FirOpBuilder::InsertPoint
-getInsertionPointAfter(mlir::Operation *op) {
-  return fir::FirOpBuilder::InsertPoint(op->getBlock(),
-                                        ++mlir::Block::iterator(op));
-}
-
-static mlir::IntegerAttr getAtomicHint(lower::AbstractConverter &converter,
-                                       const List<Clause> &clauses) {
-  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  for (const Clause &clause : clauses) {
-    if (clause.id != llvm::omp::Clause::OMPC_hint)
-      continue;
-    auto &hint = std::get<clause::Hint>(clause.u);
-    auto maybeVal = evaluate::ToInt64(hint.v);
-    CHECK(maybeVal);
-    return builder.getI64IntegerAttr(*maybeVal);
-  }
-  return nullptr;
-}
-
-static mlir::omp::ClauseMemoryOrderKind
-getMemoryOrderKind(common::OmpMemoryOrderType kind) {
-  switch (kind) {
-  case common::OmpMemoryOrderType::Acq_Rel:
-    return mlir::omp::ClauseMemoryOrderKind::Acq_rel;
-  case common::OmpMemoryOrderType::Acquire:
-    return mlir::omp::ClauseMemoryOrderKind::Acquire;
-  case common::OmpMemoryOrderType::Relaxed:
-    return mlir::omp::ClauseMemoryOrderKind::Relaxed;
-  case common::OmpMemoryOrderType::Release:
-    return mlir::omp::ClauseMemoryOrderKind::Release;
-  case common::OmpMemoryOrderType::Seq_Cst:
-    return mlir::omp::ClauseMemoryOrderKind::Seq_cst;
-  }
-  llvm_unreachable("Unexpected kind");
-}
-
-static std::optional<mlir::omp::ClauseMemoryOrderKind>
-getMemoryOrderKind(llvm::omp::Clause clauseId) {
-  switch (clauseId) {
-  case llvm::omp::Clause::OMPC_acq_rel:
-    return mlir::omp::ClauseMemoryOrderKind::Acq_rel;
-  case llvm::omp::Clause::OMPC_acquire:
-    return mlir::omp::ClauseMemoryOrderKind::Acquire;
-  case llvm::omp::Clause::OMPC_relaxed:
-    return mlir::omp::ClauseMemoryOrderKind::Relaxed;
-  case llvm::omp::Clause::OMPC_release:
-    return mlir::omp::ClauseMemoryOrderKind::Release;
-  case llvm::omp::Clause::OMPC_seq_cst:
-    return mlir::omp::ClauseMemoryOrderKind::Seq_cst;
-  default:
-    return std::nullopt;
-  }
-}
-
-static std::optional<mlir::omp::ClauseMemoryOrderKind>
-getMemoryOrderFromRequires(const semantics::Scope &scope) {
-  // The REQUIRES construct is only allowed in the main program scope
-  // and module scope, but seems like we also accept it in a subprogram
-  // scope.
-  // For safety, traverse all enclosing scopes and check if their symbol
-  // contains REQUIRES.
-  for (const auto *sc{&scope}; sc->kind() != semantics::Scope::Kind::Global;
-       sc = &sc->parent()) {
-    const semantics::Symbol *sym = sc->symbol();
-    if (!sym)
-      continue;
-
-    const common::OmpMemoryOrderType *admo = common::visit(
-        [](auto &&s) {
-          using WithOmpDeclarative = semantics::WithOmpDeclarative;
-          if constexpr (std::is_convertible_v<decltype(s),
-                                              const WithOmpDeclarative &>) {
-            return s.ompAtomicDefaultMemOrder();
-          }
-          return static_cast<const common::OmpMemoryOrderType *>(nullptr);
-        },
-        sym->details());
-    if (admo)
-      return getMemoryOrderKind(*admo);
-  }
-
-  return std::nullopt;
-}
-
-static std::optional<mlir::omp::ClauseMemoryOrderKind>
-getDefaultAtomicMemOrder(semantics::SemanticsContext &semaCtx) {
-  unsigned version = semaCtx.langOptions().OpenMPVersion;
-  if (version > 50)
-    return mlir::omp::ClauseMemoryOrderKind::Relaxed;
-  return std::nullopt;
-}
-
-static std::optional<mlir::omp::ClauseMemoryOrderKind>
-getAtomicMemoryOrder(semantics::SemanticsContext &semaCtx,
-                     const List<Clause> &clauses,
-                     const semantics::Scope &scope) {
-  for (const Clause &clause : clauses) {
-    if (auto maybeKind = getMemoryOrderKind(clause.id))
-      return *maybeKind;
-  }
-
-  if (auto maybeKind = getMemoryOrderFromRequires(scope))
-    return *maybeKind;
-
-  return getDefaultAtomicMemOrder(semaCtx);
-}
-
-static mlir::omp::ClauseMemoryOrderKindAttr
-makeMemOrderAttr(lower::AbstractConverter &converter,
-                 std::optional<mlir::omp::ClauseMemoryOrderKind> maybeKind) {
-  if (maybeKind) {
-    return mlir::omp::ClauseMemoryOrderKindAttr::get(
-        converter.getFirOpBuilder().getContext(), *maybeKind);
-  }
-  return nullptr;
-}
-
-static mlir::Operation * //
-genAtomicRead(lower::AbstractConverter &converter,
-              semantics::SemanticsContext &semaCtx, mlir::Location loc,
-              lower::StatementContext &stmtCtx, mlir::Value atomAddr,
-              const semantics::SomeExpr &atom,
-              const evaluate::Assignment &assign, mlir::IntegerAttr hint,
-              std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
-              fir::FirOpBuilder::InsertPoint preAt,
-              fir::FirOpBuilder::InsertPoint atomicAt,
-              fir::FirOpBuilder::InsertPoint postAt) {
-  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  builder.restoreInsertionPoint(preAt);
-
-  // If the atomic clause is read then the memory-order clause must
-  // not be release.
-  if (memOrder) {
-    if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Release) {
-      // Reset it back to the default.
-      memOrder = getDefaultAtomicMemOrder(semaCtx);
-    } else if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acq_rel) {
-      // The MLIR verifier doesn't like acq_rel either.
-      memOrder = mlir::omp::ClauseMemoryOrderKind::Acquire;
-    }
-  }
-
-  mlir::Value storeAddr =
-      fir::getBase(converter.genExprAddr(assign.lhs, stmtCtx, &loc));
-  mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
-  mlir::Type storeType = fir::unwrapRefType(storeAddr.getType());
-
-  mlir::Value toAddr = [&]() {
-    if (atomType == storeType)
-      return storeAddr;
-    return builder.createTemporary(loc, atomType, ".tmp.atomval");
-  }();
-
-  builder.restoreInsertionPoint(atomicAt);
-  mlir::Operation *op = builder.create<mlir::omp::AtomicReadOp>(
-      loc, atomAddr, toAddr, mlir::TypeAttr::get(atomType), hint,
-      makeMemOrderAttr(converter, memOrder));
-
-  if (atomType != storeType) {
-    lower::ExprToValueMap overrides;
-    // The READ operation could be a part of UPDATE CAPTURE, so make sure
-    // we don't emit extra code into the body of the atomic op.
-    builder.restoreInsertionPoint(postAt);
-    mlir::Value load = builder.create<fir::LoadOp>(loc, toAddr);
-    overrides.try_emplace(&atom, load);
-
-    converter.overrideExprValues(&overrides);
-    mlir::Value value =
-        fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
-    converter.resetExprOverrides();
-
-    builder.create<fir::StoreOp>(loc, value, storeAddr);
-  }
-  return op;
-}
-
-static mlir::Operation * //
-genAtomicWrite(lower::AbstractConverter &converter,
-               semantics::SemanticsContext &semaCtx, mlir::Location loc,
-               lower::StatementContext &stmtCtx, mlir::Value atomAddr,
-               const semantics::SomeExpr &atom,
-               const evaluate::Assignment &assign, mlir::IntegerAttr hint,
-               std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
-               fir::FirOpBuilder::InsertPoint preAt,
-               fir::FirOpBuilder::InsertPoint atomicAt,
-               fir::FirOpBuilder::InsertPoint postAt) {
-  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  builder.restoreInsertionPoint(preAt);
-
-  // If the atomic clause is write then the memory-order clause must
-  // not be acquire.
-  if (memOrder) {
-    if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acquire) {
-      // Reset it back to the default.
-      memOrder = getDefaultAtomicMemOrder(semaCtx);
-    } else if (*memOrder == mlir::omp::ClauseMemoryOrderKind::Acq_rel) {
-      // The MLIR verifier doesn't like acq_rel either.
-      memOrder = mlir::omp::ClauseMemoryOrderKind::Release;
-    }
-  }
-
-  mlir::Value value =
-      fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
-  mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
-  mlir::Value converted = builder.createConvert(loc, atomType, value);
-
-  builder.restoreInsertionPoint(atomicAt);
-  mlir::Operation *op = builder.create<mlir::omp::AtomicWriteOp>(
-      loc, atomAddr, converted, hint, makeMemOrderAttr(converter, memOrder));
-  return op;
-}
-
-static mlir::Operation *
-genAtomicUpdate(lower::AbstractConverter &converter,
-                semantics::SemanticsContext &semaCtx, mlir::Location loc,
-                lower::StatementContext &stmtCtx, mlir::Value atomAddr,
-                const semantics::SomeExpr &atom,
-                const evaluate::Assignment &assign, mlir::IntegerAttr hint,
-                std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
-                fir::FirOpBuilder::InsertPoint preAt,
-                fir::FirOpBuilder::InsertPoint atomicAt,
-                fir::FirOpBuilder::InsertPoint postAt) {
-  lower::ExprToValueMap overrides;
-  lower::StatementContext naCtx;
-  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  builder.restoreInsertionPoint(preAt);
-
-  mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
-
-  // This must exist by now.
-  SomeExpr input = *Fortran::evaluate::GetConvertInput(assign.rhs);
-  std::vector<SomeExpr> args{
-      Fortran::evaluate::GetTopLevelOperation(input).second};
-  assert(!args.empty() && "Update operation without arguments");
-  for (auto &arg : args) {
-    if (!Fortran::evaluate::IsSameOrConvertOf(arg, atom)) {
-      mlir::Value val = fir::getBase(converter.genExprValue(arg, naCtx, &loc));
-      overrides.try_emplace(&arg, val);
-    }
-  }
-
-  builder.restoreInsertionPoint(atomicAt);
-  auto updateOp = builder.create<mlir::omp::AtomicUpdateOp>(
-      loc, atomAddr, hint, makeMemOrderAttr(converter, memOrder));
-
-  mlir::Region &region = updateOp->getRegion(0);
-  mlir::Block *block = builder.createBlock(&region, {}, {atomType}, {loc});
-  mlir::Value localAtom = fir::getBase(block->getArgument(0));
-  overrides.try_emplace(&atom, localAtom);
-
-  converter.overrideExprValues(&overrides);
-  mlir::Value updated =
-      fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
-  mlir::Value converted = builder.createConvert(loc, atomType, updated);
-  builder.create<mlir::omp::YieldOp>(loc, converted);
-  converter.resetExprOverrides();
-
-  builder.restoreInsertionPoint(postAt); // For naCtx cleanups
-  return updateOp;
-}
-
-static mlir::Operation *
-genAtomicOperation(lower::AbstractConverter &converter,
-                   semantics::SemanticsContext &semaCtx, mlir::Location loc,
-                   lower::StatementContext &stmtCtx, int action,
-                   mlir::Value atomAddr, const semantics::SomeExpr &atom,
-                   const evaluate::Assignment &assign, mlir::IntegerAttr hint,
-                   std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder,
-                   fir::FirOpBuilder::InsertPoint preAt,
-                   fir::FirOpBuilder::InsertPoint atomicAt,
-                   fir::FirOpBuilder::InsertPoint postAt) {
-  if (isPointerAssignment(assign)) {
-    TODO(loc, "Code generation for pointer assignment is not implemented yet");
-  }
-
-  // This function and the functions called here do not preserve the
-  // builder's insertion point, or set it to anything specific.
-  switch (action) {
-  case parser::OpenMPAtomicConstruct::Analysis::Read:
-    return genAtomicRead(converter, semaCtx, loc, stmtCtx, atomAddr, atom,
-                         assign, hint, memOrder, preAt, atomicAt, postAt);
-  case parser::OpenMPAtomicConstruct::Analysis::Write:
-    return genAtomicWrite(converter, semaCtx, loc, stmtCtx, atomAddr, atom,
-                          assign, hint, memOrder, preAt, atomicAt, postAt);
-  case parser::OpenMPAtomicConstruct::Analysis::Update:
-    return genAtomicUpdate(converter, semaCtx, loc, stmtCtx, atomAddr, atom,
-                           assign, hint, memOrder, preAt, atomicAt, postAt);
-  default:
-    return nullptr;
-  }
-}
-
 //===----------------------------------------------------------------------===//
 // Code generation functions for the standalone version of constructs that can
 // also be a leaf of a composite construct
@@ -3900,163 +3586,11 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
 // OpenMPConstruct visitors
 //===----------------------------------------------------------------------===//
 
-[[maybe_unused]] static void
-dumpAtomicAnalysis(const parser::OpenMPAtomicConstruct::Analysis &analysis) {
-  auto whatStr = [](int k) {
-    std::string txt = "?";
-    switch (k & parser::OpenMPAtomicConstruct::Analysis::Action) {
-    case parser::OpenMPAtomicConstruct::Analysis::None:
-      txt = "None";
-      break;
-    case parser::OpenMPAtomicConstruct::Analysis::Read:
-      txt = "Read";
-      break;
-    case parser::OpenMPAtomicConstruct::Analysis::Write:
-      txt = "Write";
-      break;
-    case parser::OpenMPAtomicConstruct::Analysis::Update:
-      txt = "Update";
-      break;
-    }
-    switch (k & parser::OpenMPAtomicConstruct::Analysis::Condition) {
-    case parser::OpenMPAtomicConstruct::Analysis::IfTrue:
-      txt += " | IfTrue";
-      break;
-    case parser::OpenMPAtomicConstruct::Analysis::IfFalse:
-      txt += " | IfFalse";
-      break;
-    }
-    return txt;
-  };
-
-  auto exprStr = [&](const parser::TypedExpr &expr) {
-    if (auto *maybe = expr.get()) {
-      if (maybe->v)
-        return maybe->v->AsFortran();
-    }
-    return "<null>"s;
-  };
-  auto assignStr = [&](const parser::AssignmentStmt::TypedAssignment &assign) {
-    if (auto *maybe = assign.get(); maybe && maybe->v) {
-      std::string str;
-      llvm::raw_string_ostream os(str);
-      maybe->v->AsFortran(os);
-      return str;
-    }
-    return "<null>"s;
-  };
-
-  const SomeExpr &atom = *analysis.atom.get()->v;
-
-  llvm::errs() << "Analysis {\n";
-  llvm::errs() << "  atom: " << atom.AsFortran() << "\n";
-  llvm::errs() << "  cond: " << exprStr(analysis.cond) << "\n";
-  llvm::errs() << "  op0 {\n";
-  llvm::errs() << "    what: " << whatStr(analysis.op0.what) << "\n";
-  llvm::errs() << "    assign: " << assignStr(analysis.op0.assign) << "\n";
-  llvm::errs() << "  }\n";
-  llvm::errs() << "  op1 {\n";
-  llvm::errs() << "    what: " << whatStr(analysis.op1.what) << "\n";
-  llvm::errs() << "    assign: " << assignStr(analysis.op1.assign) << "\n";
-  llvm::errs() << "  }\n";
-  llvm::errs() << "}\n";
-}
-
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPAtomicConstruct &construct) {
-  auto get = [](auto &&typedWrapper) -> decltype(&*typedWrapper.get()->v) {
-    if (auto *maybe = typedWrapper.get(); maybe && maybe->v) {
-      return &*maybe->v;
-    } else {
-      return nullptr;
-    }
-  };
-
-  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  auto &dirSpec = std::get<parser::OmpDirectiveSpecification>(construct.t);
-  List<Clause> clauses = makeClauses(dirSpec.Clauses(), semaCtx);
-  lower::StatementContext stmtCtx;
-
-  const parser::OpenMPAtomicConstruct::Analysis &analysis = construct.analysis;
-  if (DumpAtomicAnalysis)
-    dumpAtomicAnalysis(analysis);
-
-  const semantics::SomeExpr &atom = *get(analysis.atom);
-  mlir::Location loc = converter.genLocation(construct.source);
-  mlir::Value atomAddr =
-      fir::getBase(converter.genExprAddr(atom, stmtCtx, &loc));
-  mlir::IntegerAttr hint = getAtomicHint(converter, clauses);
-  std::optional<mlir::omp::ClauseMemoryOrderKind> memOrder =
-      getAtomicMemoryOrder(semaCtx, clauses,
-                           semaCtx.FindScope(construct.source));
-
-  if (auto *cond = get(analysis.cond)) {
-    (void)cond;
-    TODO(loc, "OpenMP ATOMIC COMPARE");
-  } else {
-    int action0 = analysis.op0.what & analysis.Action;
-    int action1 = analysis.op1.what & analysis.Action;
-    mlir::Operation *captureOp = nullptr;
-    fir::FirOpBuilder::InsertPoint preAt = builder.saveInsertionPoint();
-    fir::FirOpBuilder::InsertPoint atomicAt, postAt;
-
-    if (construct.IsCapture()) {
-      // Capturing operation.
-      assert(action0 != analysis.None && action1 != analysis.None &&
-             "Expexcing two actions");
-      (void)action0;
-      (void)action1;
-      captureOp = builder.create<mlir::omp::AtomicCaptureOp>(
-          loc, hint, makeMemOrderAttr(converter, memOrder));
-      // Set the non-atomic insertion point to before the atomic.capture.
-      preAt = getInsertionPointBefore(captureOp);
-
-      mlir::Block *block = builder.createBlock(&captureOp->getRegion(0));
-      builder.setInsertionPointToEnd(block);
-      // Set the atomic insertion point to before the terminator inside
-      // atomic.capture.
-      mlir::Operation *term = builder.create<mlir::omp::TerminatorOp>(loc);
-      atomicAt = getInsertionPointBefore(term);
-      postAt = getInsertionPointAfter(captureOp);
-      hint = nullptr;
-      memOrder = std::nullopt;
-    } else {
-      // Non-capturing operation.
-      assert(action0 != analysis.None && action1 == analysis.None &&
-             "Expexcing single action");
-      assert(!(analysis.op0.what & analysis.Condition));
-      postAt = atomicAt = preAt;
-    }
-
-    // The builder's insertion point needs to be specifically set before
-    // each call to `genAtomicOperation`.
-    mlir::Operation *firstOp = genAtomicOperation(
-        converter, semaCtx, loc, stmtCtx, analysis.op0.what, atomAddr, atom,
-        *get(analysis.op0.assign), hint, memOrder, preAt, atomicAt, postAt);
-    assert(firstOp && "Should have created an atomic operation");
-    atomicAt = getInsertionPointAfter(firstOp);
-
-    mlir::Operation *secondOp = nullptr;
-    if (analysis.op1.what != analysis.None) {
-      secondOp = genAtomicOperation(
-          converter, semaCtx, loc, stmtCtx, analysis.op1.what, atomAddr, atom,
-          *get(analysis.op1.assign), hint, memOrder, preAt, atomicAt, postAt);
-    }
-
-    if (construct.IsCapture()) {
-      // If this is a capture operation, the first/second ops will be inside
-      // of it. Set the insertion point to past the capture op itself.
-      builder.restoreInsertionPoint(postAt);
-    } else {
-      if (secondOp) {
-        builder.setInsertionPointAfter(secondOp);
-      } else {
-        builder.setInsertionPointAfter(firstOp);
-      }
-    }
-  }
+  lowerAtomic(converter, symTable, semaCtx, eval, construct);
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,

From 1b83f10072b322a206ffcaf737b42fe5c2d95b89 Mon Sep 17 00:00:00 2001
From: Vigneshwar Jayakumar <vigneshwar.jayakumar@amd.com>
Date: Fri, 20 Jun 2025 07:44:23 -0400
Subject: [PATCH 1032/1322] [AMDGPU] Fix to prevent sinking of PERMLANE_SWAP
 instruction (#144423)

Permlane_swap instruction depends on exec mask, added isConvergent flag
to prevent sinking of instruction.

Fixes: SWDEV-537232
---
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |  6 +-
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |  1 +
 .../machine-sink-ignorable-exec-use.mir       | 67 +++++++++++++++++++
 3 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 926df955881e..02b912bcfb9e 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -774,7 +774,8 @@ defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>;
 
 let Constraints = "$vdst = $vdst_in, $src0_out = $src0",
      DisableEncoding="$vdst_in,$src0_out",
-     SchedRW = [Write32Bit, Write32Bit] in {
+     SchedRW = [Write32Bit, Write32Bit],
+     isConvergent = 1 in {
 let SubtargetPredicate = HasPermlane16Swap in {
 defm V_PERMLANE16_SWAP_B32 : VOP1Inst<"v_permlane16_swap_b32", VOP_PERMLANE_SWAP>;
 }
@@ -1549,8 +1550,11 @@ defm V_CVT_PK_F32_FP8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
 defm V_CVT_PK_F32_BF8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
 
 defm V_PRNG_B32            : VOP1_Real_gfx9 <0x58>;
+
+let isConvergent = 1 in {
 defm V_PERMLANE16_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x059>;
 defm V_PERMLANE32_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x05a>;
+}
 
 class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
   (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)),
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 6045f59d1f04..19d490465f16 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -15,6 +15,7 @@ class LetDummies {
   bit isConvertibleToThreeAddress;
   bit isMoveImm;
   bit isReMaterializable;
+  bit isConvergent;
   bit isAsCheapAsAMove;
   bit FPDPRounding;
   Predicate SubtargetPredicate;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index 0fc31ea9d643..ed22b353b066 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -733,3 +733,70 @@ body:             |
     liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
     S_ENDPGM 0
 ...
+---
+name: test_no_sink_permlane_swap
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  ; GFX9-LABEL: name: test_no_sink_permlane_swap
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   [[V_PERMLANE32_SWAP_B32_e64_:%[0-9]+]]:vgpr_32, [[V_PERMLANE32_SWAP_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], 0, 0, implicit $exec
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1:
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_PERMLANE32_SWAP_B32_e64_]], [[V_PERMLANE32_SWAP_B32_e64_1]], implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2:
+  ; GFX9-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_MAX_I32_e64_]], %bb.1
+  ; GFX9-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.3:
+  ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]]
+  bb.0:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $vgpr0
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %2:sreg_64 = S_MOV_B64 0
+    %3:vreg_64 = COPY %2
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD killed %3, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %5:vgpr_32, %6:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 %4, %4, 0, 0, implicit $exec
+    %7:vgpr_32 = COPY $vgpr0
+    %8:sreg_32 = S_MOV_B32 1
+    %9:sreg_64 = V_CMP_LT_I32_e64 %7, %8, implicit $exec
+    %10:sreg_64 = COPY %9
+    %11:sreg_64 = SI_IF %10, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %12:vgpr_32 = V_MAX_I32_e64 %5, %6, implicit $exec
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+
+    %13:vgpr_32 = PHI %1, %bb.0, %12, %bb.1
+    SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.3:
+    S_ENDPGM 0, implicit %13
+...

From e8be733a3c3347207c162fc83e8dbe02dad2a952 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 20 Jun 2025 12:45:20 +0100
Subject: [PATCH 1033/1322] [VPlan] Remove redundant ExtractLastElement from
 vector-to-scalar VPI.

Recipes that are vector-to-scalar are guaranteed to generate a scalar
value, so the extract is redundant after VPlan unrolling. Remove it.

This removes unneeded ExtractLastElement VPInstruction of reduction
result computations.
---
 llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h         | 3 +++
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp         | 8 ++++++++
 .../Transforms/LoopVectorize/AArch64/vplan-printing.ll    | 3 +--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index b2535fe3aa57..ae91e310f759 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -192,6 +192,9 @@ inline match_combine_and<LTy, RTy> m_CombineAnd(const LTy &L, const RTy &R) {
 /// Match a VPValue, capturing it if we match.
 inline bind_ty<VPValue> m_VPValue(VPValue *&V) { return V; }
 
+/// Match a VPInstruction, capturing if we match.
+inline bind_ty<VPInstruction> m_VPInstruction(VPInstruction *&V) { return V; }
+
 template <typename Ops_t, unsigned Opcode, bool Commutative,
           typename... RecipeTys>
 struct Recipe_match {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 60a8837fb76a..b6ca50549fa3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1163,6 +1163,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     });
     return;
   }
+
+  VPInstruction *OpVPI;
+  if (match(Def, m_VPInstruction<VPInstruction::ExtractLastElement>(
+                     m_VPInstruction(OpVPI))) &&
+      OpVPI->isVectorToScalar()) {
+    Def->replaceAllUsesWith(OpVPI);
+    return;
+  }
 }
 
 void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 8095f258ea18..307ab3a87e9d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -106,13 +106,12 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
-; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-last-element vp<[[RED_RESULT]]>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, ir<1024>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block)
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RED_RESULT]]> from middle.block)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:

From 8650c801381d5640018a2fab32932ee41ad27797 Mon Sep 17 00:00:00 2001
From: NimishMishra <42909663+NimishMishra@users.noreply.github.com>
Date: Fri, 20 Jun 2025 17:22:40 +0530
Subject: [PATCH 1034/1322] [flang][OpenMP] Do not skip privatization of linear
 variable if it is OmpPreDetermined (#144315)

Current implementation of linear clause skips privatisation of all
linear variables during the FIR generation phase, since linear variables
are handled in their entirety by the OpenMP IRBuilder. However,
"implicit" linear variables (like OmpPreDetermined) cannot be skipped,
since FIR generation requires privatized symbols. This patch adds checks
to skip the same.


Fixes https://github.com/llvm/llvm-project/issues/142935
---
 .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 40 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 1b8670b379f8..3fae3f3a0ddf 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -204,6 +204,42 @@ void DataSharingProcessor::collectOmpObjectListSymbol(
 }
 
 void DataSharingProcessor::collectSymbolsForPrivatization() {
+  // Add checks here for exceptional cases where privatization is not
+  // needed and be deferred to a later phase (like OpenMP IRBuilder).
+  // Such cases are suggested to be clearly documented and explained
+  // instead of being silently skipped
+  auto isException = [&](const Fortran::semantics::Symbol *sym) -> bool {
+    // `OmpPreDetermined` symbols cannot be exceptions since
+    // their privatized symbols are heavily used in FIR.
+    if (sym->test(Fortran::semantics::Symbol::Flag::OmpPreDetermined))
+      return false;
+
+    // The handling of linear clause is deferred to the OpenMP
+    // IRBuilder which is responsible for all its aspects,
+    // including privatization. Privatizing linear variables at this point would
+    // cause the following structure:
+    //
+    // omp.op linear(%linear = %step : !fir.ref<type>) {
+    //	Use %linear in this BB
+    // }
+    //
+    // to be changed to the following:
+    //
+    // omp. op linear(%linear = %step : !fir.ref<type>)
+    // 	private(%linear -> %arg0 : !fir.ref<i32>) {
+    //	Declare and use %arg0 in this BB
+    // }
+    //
+    // The OpenMP IRBuilder needs to map the linear MLIR value
+    // (i.e. %linear) to its `uses` in the BB to correctly
+    // implement the functionalities of linear clause. However,
+    // privatizing here disallows the IRBuilder to
+    // draw a relation between %linear and %arg0. Hence skip.
+    if (sym->test(Fortran::semantics::Symbol::Flag::OmpLinear))
+      return true;
+    return false;
+  };
+
   for (const omp::Clause &clause : clauses) {
     if (const auto &privateClause =
             std::get_if<omp::clause::Private>(&clause.u)) {
@@ -222,10 +258,10 @@ void DataSharingProcessor::collectSymbolsForPrivatization() {
   }
 
   // TODO For common blocks, add the underlying objects within the block. Doing
-  // so, we won't need to explicitely handle block objects (or forget to do
+  // so, we won't need to explicitly handle block objects (or forget to do
   // so).
   for (auto *sym : explicitlyPrivatizedSymbols)
-    if (!sym->test(Fortran::semantics::Symbol::Flag::OmpLinear))
+    if (!isException(sym))
       allPrivatizedSymbols.insert(sym);
 }
 

From e970f59e6b20dddc4369735affb79ca9be240c1c Mon Sep 17 00:00:00 2001
From: NimishMishra <42909663+NimishMishra@users.noreply.github.com>
Date: Fri, 20 Jun 2025 17:23:00 +0530
Subject: [PATCH 1035/1322] [flang][OpenMP] Reintroduce TODO for FIR lowering
 of linear clause (#144883)

Current design of the linear clause lowering and translation shifts all
responsibility for handling the clause (like privatisation, linear
stepping, finalisation, and emission of synchronisation barriers) to the
IRBuilder. However in certain corner cases (like associated loops in or
before OpenMP version 4.5), variables are are implicitly linear. This
currently causes a problem with the existing linear clause
implementation. Hence, re-introduce TODO on the linear clause until the
linear clause lowering/translation are robust enough to handle such
cases as well.

Fixes https://github.com/llvm/llvm-project/issues/142935
---
 flang/lib/Lower/OpenMP/OpenMP.cpp         |  4 +-
 flang/test/Lower/OpenMP/wsloop-linear.f90 | 57 -----------------------
 2 files changed, 2 insertions(+), 59 deletions(-)
 delete mode 100644 flang/test/Lower/OpenMP/wsloop-linear.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index ddf58fd87444..ebd1d038716e 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1968,13 +1968,13 @@ static void genWsloopClauses(
     llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processNowait(clauseOps);
-  cp.processLinear(clauseOps);
   cp.processOrder(clauseOps);
   cp.processOrdered(clauseOps);
   cp.processReduction(loc, clauseOps, reductionSyms);
   cp.processSchedule(stmtCtx, clauseOps);
 
-  cp.processTODO<clause::Allocate>(loc, llvm::omp::Directive::OMPD_do);
+  cp.processTODO<clause::Allocate, clause::Linear>(
+      loc, llvm::omp::Directive::OMPD_do);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/flang/test/Lower/OpenMP/wsloop-linear.f90 b/flang/test/Lower/OpenMP/wsloop-linear.f90
deleted file mode 100644
index b99677108be2..000000000000
--- a/flang/test/Lower/OpenMP/wsloop-linear.f90
+++ /dev/null
@@ -1,57 +0,0 @@
-! This test checks lowering of OpenMP DO Directive (Worksharing)
-! with linear clause
-
-! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - 2>&1 | FileCheck %s
-
-!CHECK: %[[X_alloca:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_linearEx"}
-!CHECK: %[[X:.*]]:2 = hlfir.declare %[[X_alloca]] {uniq_name = "_QFsimple_linearEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-subroutine simple_linear
-    implicit none
-    integer :: x, y, i
-    !CHECK: omp.wsloop linear(%[[X]]#0 = %[[const]] : !fir.ref<i32>) {{.*}}
-    !$omp do linear(x)
-    !CHECK: %[[LOAD:.*]] = fir.load %[[X]]#0 : !fir.ref<i32>
-    !CHECK: %[[const:.*]] = arith.constant 2 : i32
-    !CHECK: %[[RESULT:.*]] = arith.addi %[[LOAD]], %[[const]] : i32
-    do i = 1, 10
-        y = x + 2
-    end do
-    !$omp end do
-end subroutine
-
-
-!CHECK: %[[X_alloca:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFlinear_stepEx"}
-!CHECK: %[[X:.*]]:2 = hlfir.declare %[[X_alloca]] {uniq_name = "_QFlinear_stepEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-subroutine linear_step
-    implicit none
-    integer :: x, y, i
-    !CHECK: %[[const:.*]] = arith.constant 4 : i32
-    !CHECK: omp.wsloop linear(%[[X]]#0 = %[[const]] : !fir.ref<i32>) {{.*}}
-    !$omp do linear(x:4)
-    !CHECK: %[[LOAD:.*]] = fir.load %[[X]]#0 : !fir.ref<i32>
-    !CHECK: %[[const:.*]] = arith.constant 2 : i32
-    !CHECK: %[[RESULT:.*]] = arith.addi %[[LOAD]], %[[const]] : i32   
-    do i = 1, 10
-        y = x + 2
-    end do
-    !$omp end do
-end subroutine
-
-!CHECK: %[[A_alloca:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFlinear_exprEa"}
-!CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_alloca]] {uniq_name = "_QFlinear_exprEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[X_alloca:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFlinear_exprEx"}
-!CHECK: %[[X:.*]]:2 = hlfir.declare %[[X_alloca]] {uniq_name = "_QFlinear_exprEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-subroutine linear_expr
-    implicit none
-    integer :: x, y, i, a
-    !CHECK: %[[LOAD_A:.*]] = fir.load %[[A]]#0 : !fir.ref<i32>
-    !CHECK: %[[const:.*]] = arith.constant 4 : i32
-    !CHECK: %[[LINEAR_EXPR:.*]] = arith.addi %[[LOAD_A]], %[[const]] : i32
-    !CHECK: omp.wsloop linear(%[[X]]#0 = %[[LINEAR_EXPR]] : !fir.ref<i32>) {{.*}}
-    !$omp do linear(x:a+4)
-    do i = 1, 10
-        y = x + 2
-    end do
-    !$omp end do
-end subroutine

From a5b1093f782729014604f3208550de7400c518ac Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Fri, 20 Jun 2025 13:55:03 +0200
Subject: [PATCH 1036/1322] [MLIR] Add ReturnLike trait to memref.atomic_yield
 (#144932)

Without this, the yield isn't considered as the region terminator and
the dataflow framework does not consider it live.
---
 mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td |  3 +--
 mlir/test/Transforms/remove-dead-values.mlir     | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index 481b14cdb462..b0fb5b078514 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -1078,8 +1078,7 @@ def GenericAtomicRMWOp : MemRef_Op<"generic_atomic_rmw", [
 
 def AtomicYieldOp : MemRef_Op<"atomic_yield", [
       HasParent<"GenericAtomicRMWOp">,
-      Pure,
-      Terminator
+      Pure, Terminator, ReturnLike
     ]> {
   let summary = "yield operation for GenericAtomicRMWOp";
   let description = [{
diff --git a/mlir/test/Transforms/remove-dead-values.mlir b/mlir/test/Transforms/remove-dead-values.mlir
index 8c2a1cf7546f..e55a9160f5b3 100644
--- a/mlir/test/Transforms/remove-dead-values.mlir
+++ b/mlir/test/Transforms/remove-dead-values.mlir
@@ -510,3 +510,18 @@ module {
 // CHECK: %[[yield:.*]] = arith.addf %{{.*}}, %{{.*}} : f32
 // CHECK: linalg.yield %[[yield]] : f32
 // CHECK-NOT: arith.subf
+
+// -----
+
+// CHECK-LABEL: func.func @test_atomic_yield
+func.func @test_atomic_yield(%I: memref<10xf32>, %idx : index) {
+  // CHECK: memref.generic_atomic_rmw
+  %x = memref.generic_atomic_rmw %I[%idx] : memref<10xf32> {
+  ^bb0(%current_value : f32):
+    // CHECK: arith.constant
+    %c1 = arith.constant 1.0 : f32
+    // CHECK: memref.atomic_yield
+    memref.atomic_yield %c1 : f32
+  }
+  func.return
+}

From 5835f1e0a33afcae46a6ca4854373785eb3e7fd6 Mon Sep 17 00:00:00 2001
From: Tobias Stadler <mail@stadler-tobias.de>
Date: Fri, 20 Jun 2025 12:55:11 +0100
Subject: [PATCH 1037/1322] [AsmPrinter] Fix crash when remarks section is
 unsupported (#144724)

Emit a warning and bail out instead of segfault-ing when the current
object file format does not have support for emitting a remarks section.
---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 12 +++++++++---
 llvm/test/CodeGen/X86/remarks-section.ll   |  5 +++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index a2c3b50b2467..403963f33b65 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2383,6 +2383,15 @@ void AsmPrinter::emitRemarksSection(remarks::RemarkStreamer &RS) {
   if (!RS.needsSection())
     return;
 
+  MCSection *RemarksSection =
+      OutContext.getObjectFileInfo()->getRemarksSection();
+  if (!RemarksSection) {
+    OutContext.reportWarning(SMLoc(), "Current object file format does not "
+                                      "support remarks sections. Use the yaml "
+                                      "remark format instead.");
+    return;
+  }
+
   remarks::RemarkSerializer &RemarkSerializer = RS.getSerializer();
 
   std::optional<SmallString<128>> Filename;
@@ -2400,10 +2409,7 @@ void AsmPrinter::emitRemarksSection(remarks::RemarkStreamer &RS) {
   MetaSerializer->emit();
 
   // Switch to the remarks section.
-  MCSection *RemarksSection =
-      OutContext.getObjectFileInfo()->getRemarksSection();
   OutStreamer->switchSection(RemarksSection);
-
   OutStreamer->emitBinaryData(Buf);
 }
 
diff --git a/llvm/test/CodeGen/X86/remarks-section.ll b/llvm/test/CodeGen/X86/remarks-section.ll
index e67c3579b759..2611e525aecf 100644
--- a/llvm/test/CodeGen/X86/remarks-section.ll
+++ b/llvm/test/CodeGen/X86/remarks-section.ll
@@ -5,6 +5,8 @@
 ; RUN: llc < %s -mtriple=x86_64-darwin --pass-remarks-format=bitstream -remarks-section=false -pass-remarks-output=%/t.yaml | FileCheck --check-prefix=CHECK-DARWIN-OVERRIDE-BITSTREAM %s
 ; RUN: llc < %s -mtriple=x86_64-darwin --pass-remarks-format=yaml -remarks-section=true -pass-remarks-output=%/t.yaml | FileCheck --check-prefix=CHECK-DARWIN-OVERRIDE-YAML %s
 
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu --pass-remarks-format=bitstream -pass-remarks-output=%/t.yaml 2>&1 | FileCheck --check-prefix=CHECK-LINUX-DEFAULT-BITSTREAM %s
+
 ; CHECK-DARWIN: .section __LLVM,__remarks,regular,debug
 ; CHECK-DARWIN-NEXT: .byte
 
@@ -22,3 +24,6 @@
 define void @func1() {
   ret void
 }
+
+; Currently no ELF support for bitstream remarks
+; CHECK-LINUX-DEFAULT-BITSTREAM: warning: Current object file format does not support remarks sections. Use the yaml remark format instead.

From fd97dfbb78e3c9aea16873617b6d61b5b8a64474 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 20 Jun 2025 13:05:19 +0100
Subject: [PATCH 1038/1322] [LV] Don't mark ptrs as safe to speculate if fed by
 UB/poison op. (#143204)

Add additional checks before marking pointers safe to load
speculatively. If some computations feeding the pointer may trigger UB,
we cannot load the pointer speculatively, because we cannot compute the
address speculatively. The UB triggering instructions will be
predicated, but if the predicated block does not execute the result is
poison.

Similarly, we also cannot load the pointer speculatively if it may be
poison. The patch also checks if any of the operands defined outside the
loop may be poison when entering the loop. We *don't* need to check if
any operation inside the loop may produce poison due to flags, as those
will be dropped if needed.

There are some types of instructions inside the loop that can produce
poison independent of flags. Currently loads are also checked, not sure
if there's a convenient API to check for all such operands.

Fixes https://github.com/llvm/llvm-project/issues/142957.

PR: https://github.com/llvm/llvm-project/pull/143204
---
 .../Vectorize/LoopVectorizationLegality.cpp   |  41 ++++++
 .../X86/drop-poison-generating-flags.ll       | 124 ++++++++++++------
 ...able-info-from-assumption-constant-size.ll |  66 +++++-----
 ...able-info-from-assumption-variable-size.ll |  16 +--
 ...eref-pred-poison-ub-ops-feeding-pointer.ll |  66 ++++++----
 5 files changed, 202 insertions(+), 111 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 0c4e5bb3d472..969d225c6ef2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1491,10 +1491,51 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
     SmallVector<const SCEVPredicate *, 4> Predicates;
     for (Instruction &I : *BB) {
       LoadInst *LI = dyn_cast<LoadInst>(&I);
+
+      // Make sure we can execute all computations feeding into Ptr in the loop
+      // w/o triggering UB and that none of the out-of-loop operands are poison.
+      // We do not need to check if operations inside the loop can produce
+      // poison due to flags (e.g. due to an inbounds GEP going out of bounds),
+      // because flags will be dropped when executing them unconditionally.
+      // TODO: Results could be improved by considering poison-propagation
+      // properties of visited ops.
+      auto CanSpeculatePointerOp = [this](Value *Ptr) {
+        SmallVector<Value *> Worklist = {Ptr};
+        SmallPtrSet<Value *, 4> Visited;
+        while (!Worklist.empty()) {
+          Value *CurrV = Worklist.pop_back_val();
+          if (!Visited.insert(CurrV).second)
+            continue;
+
+          auto *CurrI = dyn_cast<Instruction>(CurrV);
+          if (!CurrI || !TheLoop->contains(CurrI)) {
+            // If operands from outside the loop may be poison then Ptr may also
+            // be poison.
+            if (!isGuaranteedNotToBePoison(CurrV, AC,
+                                           TheLoop->getLoopPredecessor()
+                                               ->getTerminator()
+                                               ->getIterator()))
+              return false;
+            continue;
+          }
+
+          // A loaded value may be poison, independent of any flags.
+          if (isa<LoadInst>(CurrI) && !isGuaranteedNotToBePoison(CurrV, AC))
+            return false;
+
+          // For other ops, assume poison can only be introduced via flags,
+          // which can be dropped.
+          if (!isa<PHINode>(CurrI) && !isSafeToSpeculativelyExecute(CurrI))
+            return false;
+          append_range(Worklist, CurrI->operands());
+        }
+        return true;
+      };
       // Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
       // that it will consider loops that need guarding by SCEV checks. The
       // vectoriser will generate these checks if we decide to vectorise.
       if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
+          CanSpeculatePointerOp(LI->getPointerOperand()) &&
           isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC,
                                             &Predicates))
         SafePointers.insert(LI->getPointerOperand());
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index 53fd2ed43972..ff9cf682b6e9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -666,34 +666,54 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SREM_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_SREM_CONTINUE6]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6:.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_SREM_IF:.*]], label %[[PRED_SREM_CONTINUE:.*]]
-; CHECK:       [[PRED_SREM_IF]]:
-; CHECK-NEXT:    br label %[[PRED_SREM_CONTINUE]]
-; CHECK:       [[PRED_SREM_CONTINUE]]:
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK:       [[PRED_LOAD_IF]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], poison
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i8> poison, i8 [[TMP6]], i32 0
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
+; CHECK:       [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x i8> [ poison, %[[VECTOR_BODY]] ], [ [[TMP24]], %[[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SREM_IF1:.*]], label %[[PRED_SREM_CONTINUE2:.*]]
-; CHECK:       [[PRED_SREM_IF1]]:
-; CHECK-NEXT:    br label %[[PRED_SREM_CONTINUE2]]
-; CHECK:       [[PRED_SREM_CONTINUE2]]:
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]]
+; CHECK:       [[PRED_LOAD_IF1]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], poison
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP26]], i32 1
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
+; CHECK:       [[PRED_LOAD_CONTINUE2]]:
+; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i8> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[PRED_SREM_IF3:.*]], label %[[PRED_SREM_CONTINUE4:.*]]
-; CHECK:       [[PRED_SREM_IF3]]:
-; CHECK-NEXT:    br label %[[PRED_SREM_CONTINUE4]]
-; CHECK:       [[PRED_SREM_CONTINUE4]]:
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK:       [[PRED_LOAD_IF3]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], poison
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP29]], i8 [[TMP20]], i32 2
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE4]]
+; CHECK:       [[PRED_LOAD_CONTINUE4]]:
+; CHECK-NEXT:    [[TMP22:%.*]] = phi <4 x i8> [ [[TMP29]], %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP21]], %[[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_SREM_IF5:.*]], label %[[PRED_SREM_CONTINUE6]]
-; CHECK:       [[PRED_SREM_IF5]]:
-; CHECK-NEXT:    br label %[[PRED_SREM_CONTINUE6]]
-; CHECK:       [[PRED_SREM_CONTINUE6]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], poison
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6]]
+; CHECK:       [[PRED_LOAD_IF5]]:
+; CHECK-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP30]], poison
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x i8> [[TMP22]], i8 [[TMP27]], i32 3
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE6]]
+; CHECK:       [[PRED_LOAD_CONTINUE6]]:
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = phi <4 x i8> [ [[TMP22]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP28]], %[[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i8> zeroinitializer, <4 x i8> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
@@ -743,34 +763,54 @@ define void @recipe_without_underlying_instr_lanes_used(i64 %n, ptr noalias %dst
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SREM_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_SREM_CONTINUE6]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6:.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_SREM_IF:.*]], label %[[PRED_SREM_CONTINUE:.*]]
-; CHECK:       [[PRED_SREM_IF]]:
-; CHECK-NEXT:    br label %[[PRED_SREM_CONTINUE]]
-; CHECK:       [[PRED_SREM_CONTINUE]]:
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK:       [[PRED_LOAD_IF]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[TMP9]], poison
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i8> poison, i8 [[TMP6]], i32 0
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE]]
+; CHECK:       [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x i8> [ poison, %[[VECTOR_BODY]] ], [ [[TMP24]], %[[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_SREM_IF1:.*]], label %[[PRED_SREM_CONTINUE2:.*]]
-; CHECK:       [[PRED_SREM_IF1]]:
-; CHECK-NEXT:    br label %[[PRED_SREM_CONTINUE2]]
-; CHECK:       [[PRED_SREM_CONTINUE2]]:
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]]
+; CHECK:       [[PRED_LOAD_IF1]]:
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP29:%.*]] = add i64 [[TMP26]], poison
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i8> [[TMP25]], i8 [[TMP13]], i32 1
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE2]]
+; CHECK:       [[PRED_LOAD_CONTINUE2]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i8> [ [[TMP25]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_SREM_IF3:.*]], label %[[PRED_SREM_CONTINUE4:.*]]
-; CHECK:       [[PRED_SREM_IF3]]:
-; CHECK-NEXT:    br label %[[PRED_SREM_CONTINUE4]]
-; CHECK:       [[PRED_SREM_CONTINUE4]]:
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK:       [[PRED_LOAD_IF3]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], poison
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP15]], i8 [[TMP20]], i32 2
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE4]]
+; CHECK:       [[PRED_LOAD_CONTINUE4]]:
+; CHECK-NEXT:    [[TMP22:%.*]] = phi <4 x i8> [ [[TMP15]], %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP21]], %[[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SREM_IF5:.*]], label %[[PRED_SREM_CONTINUE6]]
-; CHECK:       [[PRED_SREM_IF5]]:
-; CHECK-NEXT:    br label %[[PRED_SREM_CONTINUE6]]
-; CHECK:       [[PRED_SREM_CONTINUE6]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], poison
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6]]
+; CHECK:       [[PRED_LOAD_IF5]]:
+; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP31]], poison
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x i8> [[TMP22]], i8 [[TMP27]], i32 3
+; CHECK-NEXT:    br label %[[PRED_LOAD_CONTINUE6]]
+; CHECK:       [[PRED_LOAD_CONTINUE6]]:
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = phi <4 x i8> [ [[TMP22]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP28]], %[[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i8> zeroinitializer, <4 x i8> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> zeroinitializer, <4 x i64> poison
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[PREDPHI7]], i32 3
diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
index dfae2d3f41d4..7a54519c7cdf 100644
--- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
@@ -3,9 +3,9 @@
 
 declare void @llvm.assume(i1)
 
-define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
@@ -103,9 +103,9 @@ exit:
   ret void
 }
 
-define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ]
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -179,9 +179,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_too_small_in_header_constant_trip_count(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
@@ -279,9 +279,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_1(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
@@ -479,9 +479,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_not_known(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
@@ -579,9 +579,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_in_then_constant_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_in_then_constant_trip_count(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
@@ -675,9 +675,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_in_latch_constant_trip_count(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
@@ -777,9 +777,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree nosync{
+define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_in_header_variable_trip_count(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -881,9 +881,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_1(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4000) ]
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -958,9 +958,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 3999) ]
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -1053,9 +1053,9 @@ exit:
   ret void
 }
 
-define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ]
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -1131,9 +1131,9 @@ exit:
 }
 
 
-define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias noundef align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(
-; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4000) ]
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -1145,9 +1145,9 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
@@ -1208,9 +1208,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4000) ]
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -1303,9 +1303,9 @@ exit:
   ret void
 }
 
-define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{
+define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{
 ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 3999) ]
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -1399,9 +1399,9 @@ exit:
 }
 
 ; %a may be freed between the dereferenceable assumption and accesses.
-define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) {
 ; CHECK-LABEL: define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ]
 ; CHECK-NEXT:    call void @may_free()
diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll
index 6b46197295e3..ee65dc8cdcb1 100644
--- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll
@@ -4,9 +4,9 @@
 declare void @llvm.assume(i1)
 
 ; %a is known dereferenceable via assume for the whole loop.
-define void @deref_assumption_in_preheader_non_constant_trip_count_access_i8(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync {
+define void @deref_assumption_in_preheader_non_constant_trip_count_access_i8(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync {
 ; CHECK-LABEL: define void @deref_assumption_in_preheader_non_constant_trip_count_access_i8(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 [[N]]) ]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
@@ -86,9 +86,9 @@ exit:
 }
 
 ; %a is known dereferenceable via assume for the whole loop.
-define void @deref_assumption_in_preheader_non_constant_trip_count_access_i32(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync {
+define void @deref_assumption_in_preheader_non_constant_trip_count_access_i32(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync {
 ; CHECK-LABEL: define void @deref_assumption_in_preheader_non_constant_trip_count_access_i32(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[N]], 4
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 [[MUL]]) ]
@@ -171,9 +171,9 @@ exit:
 
 
 ; %a is NOT known dereferenceable via assume for the whole loop.
-define void @deref_assumption_in_preheader_too_small_non_constant_trip_count_access_i32(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync {
+define void @deref_assumption_in_preheader_too_small_non_constant_trip_count_access_i32(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync {
 ; CHECK-LABEL: define void @deref_assumption_in_preheader_too_small_non_constant_trip_count_access_i32(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 [[N]]) ]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
@@ -253,9 +253,9 @@ exit:
 }
 
 ; %a is NOT known dereferenceable via assume for the whole loop.
-define void @deref_assumption_in_preheader_too_small2_non_constant_trip_count_access_i32(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync {
+define void @deref_assumption_in_preheader_too_small2_non_constant_trip_count_access_i32(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %n) nofree nosync {
 ; CHECK-LABEL: define void @deref_assumption_in_preheader_too_small2_non_constant_trip_count_access_i32(
-; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 100) ]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
index e27734755dfb..3373c6d5cb81 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
@@ -7,7 +7,7 @@ target datalayout="p:16:16"
 
 ; Test cases for https://github.com/llvm/llvm-project/issues/142957
 
-; FIXME: Cannot speculatively execute %l, because %div may trigger UB and must be
+; Cannot speculatively execute %l, because %div may trigger UB and must be
 ; predicated.
 define void @ptr_depends_on_sdiv(ptr noalias %dst, i16 noundef %off) {
 ; CHECK-LABEL: define void @ptr_depends_on_sdiv(
@@ -26,36 +26,40 @@ define void @ptr_depends_on_sdiv(ptr noalias %dst, i16 noundef %off) {
 ; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
 ; CHECK:       [[PRED_SDIV_IF]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i16 24316, [[OFF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE]]
 ; CHECK:       [[PRED_SDIV_CONTINUE]]:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi i16 [ poison, %[[VECTOR_BODY]] ], [ [[TMP2]], %[[PRED_SDIV_IF]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x i16> [ poison, %[[VECTOR_BODY]] ], [ [[TMP3]], %[[PRED_SDIV_IF]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_SDIV_IF1:.*]], label %[[PRED_SDIV_CONTINUE2:.*]]
 ; CHECK:       [[PRED_SDIV_IF1]]:
 ; CHECK-NEXT:    [[TMP18:%.*]] = sdiv i16 24316, [[OFF]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i16> [[TMP4]], i16 [[TMP18]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE2]]
 ; CHECK:       [[PRED_SDIV_CONTINUE2]]:
-; CHECK-NEXT:    [[TMP13:%.*]] = add i16 [[OFFSET_IDX]], 16383
-; CHECK-NEXT:    [[TMP7:%.*]] = shl i16 [[TMP3]], 14
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i16 [[TMP13]], [[TMP7]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr @src, i16 [[TMP8]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i64, ptr [[TMP20]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i16> [ [[TMP4]], %[[PRED_SDIV_CONTINUE]] ], [ [[TMP7]], %[[PRED_SDIV_IF1]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = add <2 x i16> [[VEC_IND]], splat (i16 16383)
+; CHECK-NEXT:    [[TMP22:%.*]] = shl <2 x i16> [[TMP8]], splat (i16 14)
+; CHECK-NEXT:    [[TMP23:%.*]] = sub <2 x i16> [[TMP21]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i16> [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr @src, i16 [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP24]], align 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i16 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DST]], i16 [[TMP10]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP9]], ptr [[TMP11]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]]
 ; CHECK:       [[PRED_STORE_IF3]]:
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x i16> [[TMP23]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr @src, i16 [[TMP25]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP20]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[DST]], i16 [[TMP16]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
 ; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK:       [[PRED_STORE_CONTINUE4]]:
@@ -94,15 +98,17 @@ exit:
   ret void
 }
 
-; FIXME: Cannot speculatively execute %l, because %off may be poison.
+; Cannot speculatively execute %l, because %off may be poison.
 define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) {
 ; CHECK-LABEL: define void @ptr_depends_on_possibly_poison_value(
 ; CHECK-SAME: ptr noalias [[DST:%.*]], i16 [[OFF:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i16 1, [[OFF]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], [[OFF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[OFF]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = sub <2 x i16> splat (i16 1), [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i16> [[TMP0]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ]
@@ -110,25 +116,26 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) {
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 9, [[DOTCAST]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i16> [[VEC_IND]], splat (i16 10)
-; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[OFFSET_IDX]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr @src, i16 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i16> [[VEC_IND]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr @src, i16 [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP14]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i16 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i16 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP9]], ptr [[TMP8]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_IF1]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr @src, i16 [[TMP15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP16]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[DST]], i16 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
 ; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
@@ -237,7 +244,7 @@ exit:
   ret void
 }
 
-; FIXME: Cannot speculatively execute %l, because %off may be poison.
+; Cannot speculatively execute %l, because %off may be poison.
 define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) {
 ; CHECK-LABEL: define void @ptr_depends_on_possibly_poison_value_from_load(
 ; CHECK-SAME: ptr noalias [[DST:%.*]]) {
@@ -251,28 +258,31 @@ define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) {
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 9, [[DOTCAST]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @src, align 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i16> [[VEC_IND]], splat (i16 10)
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i16 1, [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr @src, i16 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i16> splat (i16 1), [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i16> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i16> [[VEC_IND]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr @src, i16 [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP15]], align 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i16 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DST]], i16 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT:    store i64 [[TMP10]], ptr [[TMP9]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP11]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_IF1]]:
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr @src, i16 [[TMP16]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[DST]], i16 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
 ; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP13]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:

From 6c0ac888c59ca34a7f4e1dc2702b30e0db5cbac3 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Fri, 20 Jun 2025 14:32:56 +0200
Subject: [PATCH 1039/1322] [mlir][arith][NFC] Remove redundant trait
 declaration (#145007)

`Arith_Op` already declares the `ElementwiseMappable` traits, so they
don't have to be declared for `arith.select`.
---
 mlir/include/mlir/Dialect/Arith/IR/ArithOps.td | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index 0518cac156eb..ef9ccb7e8794 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -1688,8 +1688,7 @@ def SelectOp : Arith_Op<"select", [Pure,
     AllTypesMatch<["true_value", "false_value", "result"]>,
     BooleanConditionOrMatchingShape<"condition", "result">,
     DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRangesFromOptional"]>,
-    DeclareOpInterfaceMethods<SelectLikeOpInterface>,
-  ] # ElementwiseMappable.traits> {
+    DeclareOpInterfaceMethods<SelectLikeOpInterface>]> {
   let summary = "select operation";
   let description = [{
     The `arith.select` operation chooses one value based on a binary condition

From 6edf2eb36470b623597668a7a97153544f568a40 Mon Sep 17 00:00:00 2001
From: Artemiy Bulavin <artemiyb@graphcore.ai>
Date: Fri, 20 Jun 2025 13:45:17 +0100
Subject: [PATCH 1040/1322] [MLIR] Print more user-friendly error message when
 generating local reproducer and threading is enabled (#144905)

---
 mlir/lib/Pass/PassManagerOptions.cpp                    | 8 ++++++++
 mlir/test/mlir-opt/local-reproducer-with-threading.mlir | 7 +++++++
 2 files changed, 15 insertions(+)
 create mode 100644 mlir/test/mlir-opt/local-reproducer-with-threading.mlir

diff --git a/mlir/lib/Pass/PassManagerOptions.cpp b/mlir/lib/Pass/PassManagerOptions.cpp
index dd119a75f406..305bf72bb479 100644
--- a/mlir/lib/Pass/PassManagerOptions.cpp
+++ b/mlir/lib/Pass/PassManagerOptions.cpp
@@ -146,6 +146,14 @@ LogicalResult mlir::applyPassManagerCLOptions(PassManager &pm) {
   if (!options.isConstructed())
     return failure();
 
+  if (options->reproducerFile.getNumOccurrences() && options->localReproducer &&
+      pm.getContext()->isMultithreadingEnabled()) {
+    emitError(UnknownLoc::get(pm.getContext()))
+        << "Local crash reproduction may not be used without disabling "
+           "mutli-threading first.";
+    return failure();
+  }
+
   // Generate a reproducer on crash/failure.
   if (options->reproducerFile.getNumOccurrences())
     pm.enableCrashReproducerGeneration(options->reproducerFile,
diff --git a/mlir/test/mlir-opt/local-reproducer-with-threading.mlir b/mlir/test/mlir-opt/local-reproducer-with-threading.mlir
new file mode 100644
index 000000000000..8e94f4edb91b
--- /dev/null
+++ b/mlir/test/mlir-opt/local-reproducer-with-threading.mlir
@@ -0,0 +1,7 @@
+// Test that attempting to create a local crash reproducer without disabling threading
+// prints an error from the pass manager (as opposed to crashing with a stack trace).
+
+// RUN: mlir-opt --verify-diagnostics --mlir-pass-pipeline-local-reproducer \
+// RUN:          --mlir-pass-pipeline-crash-reproducer=%t %s
+
+// expected-error@unknown {{Local crash reproduction may not be used without disabling mutli-threading first.}}

From b334ffd4f4e70a404a8572b132184583a9d8623a Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Fri, 20 Jun 2025 14:01:15 +0100
Subject: [PATCH 1041/1322] [VPlan] Refine return types in VPBuilder (NFC)
 (#108858)

---
 .../Vectorize/LoopVectorizationPlanner.h      | 47 ++++++++++---------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index d17c64a778e8..144f35e10132 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -190,37 +190,38 @@ public:
         new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
   }
 
-  VPValue *createNot(VPValue *Operand, DebugLoc DL = DebugLoc::getUnknown(),
-                     const Twine &Name = "") {
+  VPInstruction *createNot(VPValue *Operand,
+                           DebugLoc DL = DebugLoc::getUnknown(),
+                           const Twine &Name = "") {
     return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
   }
 
-  VPValue *createAnd(VPValue *LHS, VPValue *RHS,
-                     DebugLoc DL = DebugLoc::getUnknown(),
-                     const Twine &Name = "") {
+  VPInstruction *createAnd(VPValue *LHS, VPValue *RHS,
+                           DebugLoc DL = DebugLoc::getUnknown(),
+                           const Twine &Name = "") {
     return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name);
   }
 
-  VPValue *createOr(VPValue *LHS, VPValue *RHS,
-                    DebugLoc DL = DebugLoc::getUnknown(),
-                    const Twine &Name = "") {
+  VPInstruction *createOr(VPValue *LHS, VPValue *RHS,
+                          DebugLoc DL = DebugLoc::getUnknown(),
+                          const Twine &Name = "") {
 
     return tryInsertInstruction(new VPInstruction(
         Instruction::BinaryOps::Or, {LHS, RHS},
         VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name));
   }
 
-  VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS,
-                            DebugLoc DL = DebugLoc::getUnknown(),
-                            const Twine &Name = "") {
+  VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS,
+                                  DebugLoc DL = DebugLoc::getUnknown(),
+                                  const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name));
   }
 
-  VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
-                        DebugLoc DL = DebugLoc::getUnknown(),
-                        const Twine &Name = "",
-                        std::optional<FastMathFlags> FMFs = std::nullopt) {
+  VPInstruction *
+  createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
+               DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "",
+               std::optional<FastMathFlags> FMFs = std::nullopt) {
     auto *Select =
         FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
                                  *FMFs, DL, Name)
@@ -232,9 +233,9 @@ public:
   /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
   /// and \p B.
   /// TODO: add createFCmp when needed.
-  VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
-                      DebugLoc DL = DebugLoc::getUnknown(),
-                      const Twine &Name = "") {
+  VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
+                            DebugLoc DL = DebugLoc::getUnknown(),
+                            const Twine &Name = "") {
     assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
            Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
     return tryInsertInstruction(
@@ -248,16 +249,16 @@ public:
         new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
                           GEPNoWrapFlags::none(), DL, Name));
   }
-  VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset,
-                                DebugLoc DL = DebugLoc::getUnknown(),
-                                const Twine &Name = "") {
+  VPInstruction *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset,
+                                      DebugLoc DL = DebugLoc::getUnknown(),
+                                      const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
                           GEPNoWrapFlags::inBounds(), DL, Name));
   }
 
-  VPInstruction *createScalarPhi(ArrayRef<VPValue *> IncomingValues,
-                                 DebugLoc DL, const Twine &Name = "") {
+  VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL,
+                         const Twine &Name = "") {
     return tryInsertInstruction(new VPPhi(IncomingValues, DL, Name));
   }
 

From c8c4bd1ebc6e4451dc835a77bacdbe6a0467f219 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Fri, 20 Jun 2025 14:01:48 +0100
Subject: [PATCH 1042/1322] [LV] Stengthen loop-invariance checks in
 isPredicatedInst (#140744)

Check loop-invariance against SCEV as well.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |   4 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +-
 .../Transforms/LoopVectorize/RISCV/pr88802.ll | 114 +-------
 .../predicatedinst-loop-invariant.ll          | 263 ++++++++++++++++++
 4 files changed, 269 insertions(+), 116 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 00cdb66d8b77..94b9fe958126 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2764,8 +2764,8 @@ LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
 
 bool LoopAccessInfo::isInvariant(Value *V) const {
   auto *SE = PSE->getSE();
-  // TODO: Is this really what we want? Even without FP SCEV, we may want some
-  // trivially loop-invariant FP values to be considered invariant.
+  if (TheLoop->isLoopInvariant(V))
+    return true;
   if (!SE->isSCEVable(V->getType()))
     return false;
   const SCEV *S = SE->getSCEV(V);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d9f53c4146c2..88b2ffba1b79 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3104,14 +3104,14 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
     // is correct.  The easiest form of the later is to require that all values
     // stored are the same.
     return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
-             TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
+             Legal->isInvariant(cast<StoreInst>(I)->getValueOperand()));
   }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
     // If the divisor is loop-invariant no predication is needed.
-    return !TheLoop->isLoopInvariant(I->getOperand(1));
+    return !Legal->isInvariant(I->getOperand(1));
   }
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index 51a8b451dffd..a1201dcfbdf5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -17,126 +17,16 @@ define void @test(ptr %p, i64 %a, i8 %b) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32>
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 9)
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp sge <16 x i32> [[VEC_IND]], splat (i32 2)
 ; CHECK-NEXT:    [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i1> [[TMP4]], <16 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl <16 x i32> [[PREDPHI]], splat (i32 8)
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[VECTOR_BODY:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <16 x i8> [[TMP8]], i32 0
-; CHECK-NEXT:    store i8 [[TMP19]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[VECTOR_BODY]]
-; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[CMP_N:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK:       pred.store.if3:
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x i8> [[TMP8]], i32 1
-; CHECK-NEXT:    store i8 [[TMP12]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[SCALAR_PH]]
-; CHECK:       pred.store.continue4:
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
-; CHECK:       pred.store.if5:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i8> [[TMP8]], i32 2
-; CHECK-NEXT:    store i8 [[TMP14]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
-; CHECK:       pred.store.continue6:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
-; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x i8> [[TMP8]], i32 3
-; CHECK-NEXT:    store i8 [[TMP16]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE9]]
-; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 4
-; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <16 x i8> [[TMP8]], i32 4
-; CHECK-NEXT:    store i8 [[TMP18]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 5
-; CHECK-NEXT:    br i1 [[TMP41]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
-; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i8> [[TMP8]], i32 5
-; CHECK-NEXT:    store i8 [[TMP20]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 6
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
-; CHECK:       pred.store.if13:
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i8> [[TMP8]], i32 6
-; CHECK-NEXT:    store i8 [[TMP22]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
-; CHECK:       pred.store.continue14:
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 7
-; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
-; CHECK:       pred.store.if15:
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <16 x i8> [[TMP8]], i32 7
-; CHECK-NEXT:    store i8 [[TMP24]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE16]]
-; CHECK:       pred.store.continue16:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 8
-; CHECK-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
-; CHECK:       pred.store.if17:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i8> [[TMP8]], i32 8
-; CHECK-NEXT:    store i8 [[TMP26]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
-; CHECK:       pred.store.continue18:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 9
-; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
-; CHECK:       pred.store.if19:
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <16 x i8> [[TMP8]], i32 9
-; CHECK-NEXT:    store i8 [[TMP28]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
-; CHECK:       pred.store.continue20:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 10
-; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
-; CHECK:       pred.store.if21:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <16 x i8> [[TMP8]], i32 10
-; CHECK-NEXT:    store i8 [[TMP30]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
-; CHECK:       pred.store.continue22:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 11
-; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; CHECK:       pred.store.if23:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i8> [[TMP8]], i32 11
-; CHECK-NEXT:    store i8 [[TMP32]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; CHECK:       pred.store.continue24:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 12
-; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; CHECK:       pred.store.if25:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x i8> [[TMP8]], i32 12
-; CHECK-NEXT:    store i8 [[TMP34]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; CHECK:       pred.store.continue26:
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 13
-; CHECK-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
-; CHECK:       pred.store.if27:
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <16 x i8> [[TMP8]], i32 13
-; CHECK-NEXT:    store i8 [[TMP36]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE28]]
-; CHECK:       pred.store.continue28:
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 14
-; CHECK-NEXT:    br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
-; CHECK:       pred.store.if29:
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i8> [[TMP8]], i32 14
-; CHECK-NEXT:    store i8 [[TMP38]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30]]
-; CHECK:       pred.store.continue30:
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 15
-; CHECK-NEXT:    br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE8]]
-; CHECK:       pred.store.if31:
 ; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <16 x i8> [[TMP8]], i32 15
 ; CHECK-NEXT:    store i8 [[TMP40]], ptr [[P]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; CHECK:       pred.store.continue32:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
new file mode 100644
index 000000000000..0a975108edee
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define void @loop_invariant_store(ptr %p, i64 %a, i8 %b) {
+; CHECK-LABEL: define void @loop_invariant_store(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = shl <4 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i64> [[TMP0]], splat (i64 52)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], splat (i32 8)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sge <4 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i8> [[TMP8]], i32 3
+; CHECK-NEXT:    store i8 [[TMP9]], ptr [[P]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[ADD]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP_SLT:%.*]] = icmp slt i32 [[IV]], 2
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[A]], 48
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i64 [[SHL]], 52
+; CHECK-NEXT:    [[TRUNC_I32:%.*]] = trunc i64 [[ASHR]] to i32
+; CHECK-NEXT:    br i1 [[CMP_SLT]], label %[[COND_FALSE:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[COND_FALSE]]:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[B]] to i32
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[TRUNC_I32]], %[[LOOP_HEADER]] ], [ [[ZEXT]], %[[COND_FALSE]] ]
+; CHECK-NEXT:    [[SHL_I32:%.*]] = shl i32 [[COND]], 8
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8
+; CHECK-NEXT:    store i8 [[TRUNC]], ptr [[P]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 8
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:                                      ; preds = %loop.latch, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i32 %iv, 1
+  %cmp.slt = icmp slt i32 %iv, 2
+  %shl = shl i64 %a, 48
+  %ashr = ashr i64 %shl, 52
+  %trunc.i32 = trunc i64 %ashr to i32
+  br i1 %cmp.slt, label %cond.false, label %loop.latch
+
+cond.false:                                       ; preds = %loop.header
+  %zext = zext i8 %b to i32
+  br label %loop.latch
+
+loop.latch:                                       ; preds = %cond.false, %loop.header
+  %cond = phi i32 [ %trunc.i32, %loop.header ], [ %zext, %cond.false ]
+  %shl.i32 = shl i32 %cond, 8
+  %trunc = trunc i32 %shl.i32 to i8
+  store i8 %trunc, ptr %p, align 1
+  %exitcond = icmp slt i32 %iv, 8
+  br i1 %exitcond, label %loop.header, label %exit
+
+exit:                                             ; preds = %loop.latch
+  ret void
+}
+
+define void @loop_invariant_srem(ptr %p, i64 %a, i8 %b) {
+; CHECK-LABEL: define void @loop_invariant_srem(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = shl <4 x i64> [[BROADCAST_SPLAT2]], splat (i64 48)
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i64> [[TMP0]], splat (i64 52)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE10:.*]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE10]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IND:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], splat (i32 8)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sge <4 x i8> [[VEC_IND1]], splat (i8 2)
+; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = srem <4 x i8> [[VEC_IND1]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i8> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[P]], i8 [[TMP13]]
+; CHECK-NEXT:    store i32 4, ptr [[TMP12]], align 4
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; CHECK:       [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK:       [[PRED_STORE_IF5]]:
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i8> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[P]], i8 [[TMP16]]
+; CHECK-NEXT:    store i32 4, ptr [[TMP15]], align 4
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; CHECK:       [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK:       [[PRED_STORE_IF7]]:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i8> [[TMP11]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[P]], i8 [[TMP20]]
+; CHECK-NEXT:    store i32 4, ptr [[TMP19]], align 4
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; CHECK:       [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
+; CHECK-NEXT:    br i1 [[TMP22]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10]]
+; CHECK:       [[PRED_STORE_IF9]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i8> [[TMP11]], i32 3
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[P]], i8 [[TMP21]]
+; CHECK-NEXT:    store i32 4, ptr [[TMP23]], align 4
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; CHECK:       [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND1]], splat (i8 4)
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i8 [[IV]], 1
+; CHECK-NEXT:    [[CMP_SLT:%.*]] = icmp slt i8 [[IV]], 2
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[A]], 48
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i64 [[SHL]], 52
+; CHECK-NEXT:    [[TRUNC_I32:%.*]] = trunc i64 [[ASHR]] to i32
+; CHECK-NEXT:    br i1 [[CMP_SLT]], label %[[COND_FALSE:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[COND_FALSE]]:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[B]] to i32
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[TRUNC_I32]], %[[LOOP_HEADER]] ], [ [[ZEXT]], %[[COND_FALSE]] ]
+; CHECK-NEXT:    [[SHL_I32:%.*]] = shl i32 [[COND]], 8
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8
+; CHECK-NEXT:    [[REM:%.*]] = srem i8 [[IV]], [[TRUNC]]
+; CHECK-NEXT:    [[GEP_P_REM:%.*]] = getelementptr i32, ptr [[P]], i8 [[REM]]
+; CHECK-NEXT:    store i32 4, ptr [[GEP_P_REM]], align 4
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i8 [[IV]], 8
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:                                      ; preds = %loop.latch, %entry
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i8 %iv, 1
+  %cmp.slt = icmp slt i8 %iv, 2
+  %shl = shl i64 %a, 48
+  %ashr = ashr i64 %shl, 52
+  %trunc.i32 = trunc i64 %ashr to i32
+  br i1 %cmp.slt, label %cond.false, label %loop.latch
+
+cond.false:                                       ; preds = %loop.header
+  %zext = zext i8 %b to i32
+  br label %loop.latch
+
+loop.latch:                                       ; preds = %cond.false, %loop.header
+  %cond = phi i32 [ %trunc.i32, %loop.header ], [ %zext, %cond.false ]
+  %shl.i32 = shl i32 %cond, 8
+  %trunc = trunc i32 %shl.i32 to i8
+  %rem = srem i8 %iv, %trunc
+  %gep.p.rem = getelementptr i32, ptr %p, i8 %rem
+  store i32 4, ptr %gep.p.rem
+  %ec = icmp eq i8 %iv, 8
+  br i1 %ec, label %exit, label %loop.header
+
+exit:                                             ; preds = %loop.latch
+  ret void
+}
+
+define void @loop_invariant_float_store(ptr %p, i32 %a) {
+; CHECK-LABEL: define void @loop_invariant_float_store(
+; CHECK-SAME: ptr [[P:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = sitofp i32 [[A]] to float
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    store float [[TMP10]], ptr [[P]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP_SLT:%.*]] = icmp slt i32 [[IV]], 2
+; CHECK-NEXT:    br i1 [[CMP_SLT]], label %[[COND_FALSE:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[COND_FALSE]]:
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    store float [[TMP10]], ptr [[P]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[IV]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a.conv = sitofp i32 %a to float
+  br label %loop.header
+
+loop.header:                                      ; preds = %loop.latch, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i32 %iv, 1
+  %cmp.slt = icmp slt i32 %iv, 2
+  br i1 %cmp.slt, label %cond.false, label %loop.latch
+
+cond.false:                                       ; preds = %loop.header
+  br label %loop.latch
+
+loop.latch:                                       ; preds = %cond.false, %loop.header
+  store float %a.conv, ptr %p
+  %exitcond = icmp slt i32 %iv, 8
+  br i1 %exitcond, label %loop.header, label %exit
+
+exit:                                             ; preds = %loop.latch
+  ret void
+}

From 349f8d67d4ee2e7a6045b02f6aea0a72165404b1 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Fri, 20 Jun 2025 08:09:36 -0500
Subject: [PATCH 1043/1322] [flang][OpenMP] Skip runtime mapping with no
 offload targets (#144534)

When no offload targets are specified flang will ignore "target"
constructs, but not "target data" constructs. This patch makes the
behavior consistent across all offload-related operations.

While ignoring "target" may produce semantically incorrect code, it may
still be a useful debugging tool.
---
 flang/test/Lower/ignore-target-data.f90       |  30 ++
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |   6 +
 mlir/test/Target/LLVMIR/omptarget-llvm.mlir   | 352 +++++++++---------
 .../LLVMIR/omptargetdata-nowait-llvm.mlir     |  42 ++-
 .../LLVMIR/openmp-data-target-device.mlir     |   2 +-
 5 files changed, 247 insertions(+), 185 deletions(-)
 create mode 100644 flang/test/Lower/ignore-target-data.f90

diff --git a/flang/test/Lower/ignore-target-data.f90 b/flang/test/Lower/ignore-target-data.f90
new file mode 100644
index 000000000000..08947c137090
--- /dev/null
+++ b/flang/test/Lower/ignore-target-data.f90
@@ -0,0 +1,30 @@
+!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefix=NORT
+!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefix=LLVM
+
+!Make sure that there are no calls to the mapper.
+!NORT-NOT: call{{.*}}__tgt_target_data_begin_mapper
+!NORT-NOT: call{{.*}}__tgt_target_data_end_mapper
+
+!Make sure we generate the body
+!LLVM: define internal void @_QFPf(ptr %[[A0:[0-9]+]], ptr %[[A1:[0-9]+]]) {
+!LLVM:   %[[V0:[0-9]+]] = load i32, ptr %[[A0]], align 4
+!LLVM:   %[[V1:[0-9]+]] = load i32, ptr %[[A1]], align 4
+!LLVM:   %[[V2:[0-9]+]] = add i32 %[[V0]], %[[V1]]
+!LLVM:   store i32 %[[V2]], ptr %[[A0]], align 4
+!LLVM:   ret void
+!LLVM: }
+
+
+program test
+
+call f(1, 2)
+
+contains
+
+subroutine f(x, y)
+  integer :: x, y
+  !$omp target data map(tofrom: x, y)
+  x = x + y
+  !$omp end target data
+end subroutine
+end
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 90ce06a0345c..eece8573f00e 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -4378,6 +4378,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
   llvm::OpenMPIRBuilder::TargetDataInfo info(/*RequiresDevicePointerInfo=*/true,
                                              /*SeparateBeginEndCalls=*/true);
+  bool isTargetDevice = ompBuilder->Config.isTargetDevice();
+  bool isOffloadEntry =
+      isTargetDevice || !ompBuilder->Config.TargetTriples.empty();
 
   LogicalResult result =
       llvm::TypeSwitch<Operation *, LogicalResult>(op)
@@ -4467,6 +4470,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
 
   if (failed(result))
     return failure();
+  // Pretend we have IF(false) if we're not doing offload.
+  if (!isOffloadEntry)
+    ifCond = builder.getFalse();
 
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   MapInfoData mapData;
diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
index 971bea206854..e6ea3aaeec65 100644
--- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
@@ -1,15 +1,17 @@
 // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
 
-llvm.func @_QPopenmp_target_data() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
-  %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
-  omp.target_data map_entries(%2 : !llvm.ptr) {
-    %3 = llvm.mlir.constant(99 : i32) : i32
-    llvm.store %3, %1 : i32, !llvm.ptr
-    omp.terminator
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_data() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
+    %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target_data map_entries(%2 : !llvm.ptr) {
+      %3 = llvm.mlir.constant(99 : i32) : i32
+      llvm.store %3, %1 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
   }
-  llvm.return
 }
 
 // CHECK:         @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4]
@@ -38,23 +40,25 @@ llvm.func @_QPopenmp_target_data() {
 
 // -----
 
-llvm.func @_QPopenmp_target_data_region(%0 : !llvm.ptr) {
-  %1 = llvm.mlir.constant(1023 : index) : i64
-  %2 = llvm.mlir.constant(0 : index) : i64
-  %3 = llvm.mlir.constant(1024 : index) : i64
-  %4 = llvm.mlir.constant(1 : index) : i64
-  %5 = omp.map.bounds   lower_bound(%2 : i64) upper_bound(%1 : i64) extent(%3 : i64) stride(%4 : i64) start_idx(%4 : i64)
-  %6 = omp.map.info var_ptr(%0 : !llvm.ptr, !llvm.array<1024 x i32>)   map_clauses(from) capture(ByRef) bounds(%5)  -> !llvm.ptr {name = ""}
-  omp.target_data map_entries(%6 : !llvm.ptr) {
-    %7 = llvm.mlir.constant(99 : i32) : i32
-    %8 = llvm.mlir.constant(1 : i64) : i64
-    %9 = llvm.mlir.constant(1 : i64) : i64
-    %10 = llvm.mlir.constant(0 : i64) : i64
-    %11 = llvm.getelementptr %0[0, %10] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<1024 x i32>
-    llvm.store %7, %11 : i32, !llvm.ptr
-    omp.terminator
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_data_region(%0 : !llvm.ptr) {
+    %1 = llvm.mlir.constant(1023 : index) : i64
+    %2 = llvm.mlir.constant(0 : index) : i64
+    %3 = llvm.mlir.constant(1024 : index) : i64
+    %4 = llvm.mlir.constant(1 : index) : i64
+    %5 = omp.map.bounds   lower_bound(%2 : i64) upper_bound(%1 : i64) extent(%3 : i64) stride(%4 : i64) start_idx(%4 : i64)
+    %6 = omp.map.info var_ptr(%0 : !llvm.ptr, !llvm.array<1024 x i32>)   map_clauses(from) capture(ByRef) bounds(%5)  -> !llvm.ptr {name = ""}
+    omp.target_data map_entries(%6 : !llvm.ptr) {
+      %7 = llvm.mlir.constant(99 : i32) : i32
+      %8 = llvm.mlir.constant(1 : i64) : i64
+      %9 = llvm.mlir.constant(1 : i64) : i64
+      %10 = llvm.mlir.constant(0 : i64) : i64
+      %11 = llvm.getelementptr %0[0, %10] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<1024 x i32>
+      llvm.store %7, %11 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
   }
-  llvm.return
 }
 
 // CHECK:         @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4096]
@@ -85,50 +89,52 @@ llvm.func @_QPopenmp_target_data_region(%0 : !llvm.ptr) {
 
 // -----
 
-llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr, %3 : !llvm.ptr) {
-  %4 = llvm.mlir.constant(1 : i64) : i64
-  %5 = llvm.alloca %4 x i32 {bindc_name = "dvc", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_enter_exitEdvc"} : (i64) -> !llvm.ptr
-  %6 = llvm.mlir.constant(1 : i64) : i64
-  %7 = llvm.alloca %6 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_enter_exitEi"} : (i64) -> !llvm.ptr
-  %8 = llvm.mlir.constant(5 : i32) : i32
-  llvm.store %8, %7 : i32, !llvm.ptr
-  %9 = llvm.mlir.constant(2 : i32) : i32
-  llvm.store %9, %5 : i32, !llvm.ptr
-  %10 = llvm.load %7 : !llvm.ptr -> i32
-  %11 = llvm.mlir.constant(10 : i32) : i32
-  %12 = llvm.icmp "slt" %10, %11 : i32
-  %13 = llvm.load %5 : !llvm.ptr -> i32
-  %14 = llvm.mlir.constant(1023 : index) : i64
-  %15 = llvm.mlir.constant(0 : index) : i64
-  %16 = llvm.mlir.constant(1024 : index) : i64
-  %17 = llvm.mlir.constant(1 : index) : i64
-  %18 = omp.map.bounds   lower_bound(%15 : i64) upper_bound(%14 : i64) extent(%16 : i64) stride(%17 : i64) start_idx(%17 : i64)
-  %map1 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.array<1024 x i32>)   map_clauses(to) capture(ByRef) bounds(%18) -> !llvm.ptr {name = ""}
-  %19 = llvm.mlir.constant(511 : index) : i64
-  %20 = llvm.mlir.constant(0 : index) : i64
-  %21 = llvm.mlir.constant(512 : index) : i64
-  %22 = llvm.mlir.constant(1 : index) : i64
-  %23 = omp.map.bounds   lower_bound(%20 : i64) upper_bound(%19 : i64) extent(%21 : i64) stride(%22 : i64) start_idx(%22 : i64)
-  %map2 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.array<512 x i32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%23) -> !llvm.ptr {name = ""}
-  omp.target_enter_data   if(%12) device(%13 : i32) map_entries(%map1, %map2 : !llvm.ptr, !llvm.ptr)
-  %24 = llvm.load %7 : !llvm.ptr -> i32
-  %25 = llvm.mlir.constant(10 : i32) : i32
-  %26 = llvm.icmp "sgt" %24, %25 : i32
-  %27 = llvm.load %5 : !llvm.ptr -> i32
-  %28 = llvm.mlir.constant(1023 : index) : i64
-  %29 = llvm.mlir.constant(0 : index) : i64
-  %30 = llvm.mlir.constant(1024 : index) : i64
-  %31 = llvm.mlir.constant(1 : index) : i64
-  %32 = omp.map.bounds   lower_bound(%29 : i64) upper_bound(%28 : i64) extent(%30 : i64) stride(%31 : i64) start_idx(%31 : i64)
-  %map3 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.array<1024 x i32>)   map_clauses(from) capture(ByRef) bounds(%32) -> !llvm.ptr {name = ""}
-  %33 = llvm.mlir.constant(511 : index) : i64
-  %34 = llvm.mlir.constant(0 : index) : i64
-  %35 = llvm.mlir.constant(512 : index) : i64
-  %36 = llvm.mlir.constant(1 : index) : i64
-  %37 = omp.map.bounds   lower_bound(%34 : i64) upper_bound(%33 : i64) extent(%35 : i64) stride(%36 : i64) start_idx(%36 : i64)
-  %map4 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.array<512 x i32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%37) -> !llvm.ptr {name = ""}
-  omp.target_exit_data   if(%26) device(%27 : i32) map_entries(%map3, %map4 : !llvm.ptr, !llvm.ptr)
-  llvm.return
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr, %3 : !llvm.ptr) {
+    %4 = llvm.mlir.constant(1 : i64) : i64
+    %5 = llvm.alloca %4 x i32 {bindc_name = "dvc", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_enter_exitEdvc"} : (i64) -> !llvm.ptr
+    %6 = llvm.mlir.constant(1 : i64) : i64
+    %7 = llvm.alloca %6 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_enter_exitEi"} : (i64) -> !llvm.ptr
+    %8 = llvm.mlir.constant(5 : i32) : i32
+    llvm.store %8, %7 : i32, !llvm.ptr
+    %9 = llvm.mlir.constant(2 : i32) : i32
+    llvm.store %9, %5 : i32, !llvm.ptr
+    %10 = llvm.load %7 : !llvm.ptr -> i32
+    %11 = llvm.mlir.constant(10 : i32) : i32
+    %12 = llvm.icmp "slt" %10, %11 : i32
+    %13 = llvm.load %5 : !llvm.ptr -> i32
+    %14 = llvm.mlir.constant(1023 : index) : i64
+    %15 = llvm.mlir.constant(0 : index) : i64
+    %16 = llvm.mlir.constant(1024 : index) : i64
+    %17 = llvm.mlir.constant(1 : index) : i64
+    %18 = omp.map.bounds   lower_bound(%15 : i64) upper_bound(%14 : i64) extent(%16 : i64) stride(%17 : i64) start_idx(%17 : i64)
+    %map1 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.array<1024 x i32>)   map_clauses(to) capture(ByRef) bounds(%18) -> !llvm.ptr {name = ""}
+    %19 = llvm.mlir.constant(511 : index) : i64
+    %20 = llvm.mlir.constant(0 : index) : i64
+    %21 = llvm.mlir.constant(512 : index) : i64
+    %22 = llvm.mlir.constant(1 : index) : i64
+    %23 = omp.map.bounds   lower_bound(%20 : i64) upper_bound(%19 : i64) extent(%21 : i64) stride(%22 : i64) start_idx(%22 : i64)
+    %map2 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.array<512 x i32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%23) -> !llvm.ptr {name = ""}
+    omp.target_enter_data   if(%12) device(%13 : i32) map_entries(%map1, %map2 : !llvm.ptr, !llvm.ptr)
+    %24 = llvm.load %7 : !llvm.ptr -> i32
+    %25 = llvm.mlir.constant(10 : i32) : i32
+    %26 = llvm.icmp "sgt" %24, %25 : i32
+    %27 = llvm.load %5 : !llvm.ptr -> i32
+    %28 = llvm.mlir.constant(1023 : index) : i64
+    %29 = llvm.mlir.constant(0 : index) : i64
+    %30 = llvm.mlir.constant(1024 : index) : i64
+    %31 = llvm.mlir.constant(1 : index) : i64
+    %32 = omp.map.bounds   lower_bound(%29 : i64) upper_bound(%28 : i64) extent(%30 : i64) stride(%31 : i64) start_idx(%31 : i64)
+    %map3 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.array<1024 x i32>)   map_clauses(from) capture(ByRef) bounds(%32) -> !llvm.ptr {name = ""}
+    %33 = llvm.mlir.constant(511 : index) : i64
+    %34 = llvm.mlir.constant(0 : index) : i64
+    %35 = llvm.mlir.constant(512 : index) : i64
+    %36 = llvm.mlir.constant(1 : index) : i64
+    %37 = omp.map.bounds   lower_bound(%34 : i64) upper_bound(%33 : i64) extent(%35 : i64) stride(%36 : i64) start_idx(%36 : i64)
+    %map4 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.array<512 x i32>)   map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%37) -> !llvm.ptr {name = ""}
+    omp.target_exit_data   if(%26) device(%27 : i32) map_entries(%map3, %map4 : !llvm.ptr, !llvm.ptr)
+    llvm.return
+  }
 }
 
 // CHECK:         @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4096, i64 2048]
@@ -205,18 +211,20 @@ llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr, %3 : !llvm.ptr) {
 
 // -----
 
-llvm.func @_QPopenmp_target_use_dev_ptr() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
-  %map1 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
-  %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
-  omp.target_data  map_entries(%map1 : !llvm.ptr) use_device_ptr(%map2 -> %arg0 : !llvm.ptr)  {
-    %1 = llvm.mlir.constant(10 : i32) : i32
-    %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr
-    llvm.store %1, %2 : i32, !llvm.ptr
-    omp.terminator
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_use_dev_ptr() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
+    %map1 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target_data  map_entries(%map1 : !llvm.ptr) use_device_ptr(%map2 -> %arg0 : !llvm.ptr)  {
+      %1 = llvm.mlir.constant(10 : i32) : i32
+      %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr
+      llvm.store %1, %2 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
   }
-  llvm.return
 }
 
 // CHECK:         @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 8]
@@ -249,18 +257,20 @@ llvm.func @_QPopenmp_target_use_dev_ptr() {
 
 // -----
 
-llvm.func @_QPopenmp_target_use_dev_addr() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
-  %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
-  %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
-  omp.target_data  map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr)  {
-    %1 = llvm.mlir.constant(10 : i32) : i32
-    %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr
-    llvm.store %1, %2 : i32, !llvm.ptr
-    omp.terminator
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_use_dev_addr() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
+    %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target_data  map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr)  {
+      %1 = llvm.mlir.constant(10 : i32) : i32
+      %2 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr
+      llvm.store %1, %2 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
   }
-  llvm.return
 }
 
 // CHECK:         @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 8]
@@ -291,17 +301,19 @@ llvm.func @_QPopenmp_target_use_dev_addr() {
 
 // -----
 
-llvm.func @_QPopenmp_target_use_dev_addr_no_ptr() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %a = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr
-  %map = omp.map.info var_ptr(%a : !llvm.ptr, i32)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
-  %map2 = omp.map.info var_ptr(%a : !llvm.ptr, i32)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
-  omp.target_data  map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr)  {
-    %1 = llvm.mlir.constant(10 : i32) : i32
-    llvm.store %1, %arg0 : i32, !llvm.ptr
-    omp.terminator
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_use_dev_addr_no_ptr() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %a = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr
+    %map = omp.map.info var_ptr(%a : !llvm.ptr, i32)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map2 = omp.map.info var_ptr(%a : !llvm.ptr, i32)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target_data  map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr)  {
+      %1 = llvm.mlir.constant(10 : i32) : i32
+      llvm.store %1, %arg0 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
   }
-  llvm.return
 }
 
 // CHECK:         @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4]
@@ -331,23 +343,25 @@ llvm.func @_QPopenmp_target_use_dev_addr_no_ptr() {
 
 // -----
 
-llvm.func @_QPopenmp_target_use_dev_addr_nomap() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
-  %1 = llvm.mlir.constant(1 : i64) : i64
-  %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
-  %map = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
-  %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
-  omp.target_data  map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr)  {
-    %2 = llvm.mlir.constant(10 : i32) : i32
-    %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr
-    llvm.store %2, %3 : i32, !llvm.ptr
-    %4 = llvm.mlir.constant(20 : i32) : i32
-    %5 = llvm.load %b : !llvm.ptr -> !llvm.ptr
-    llvm.store %4, %5 : i32, !llvm.ptr
-    omp.terminator
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_use_dev_addr_nomap() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
+    %1 = llvm.mlir.constant(1 : i64) : i64
+    %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
+    %map = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target_data  map_entries(%map : !llvm.ptr) use_device_addr(%map2 -> %arg0 : !llvm.ptr)  {
+      %2 = llvm.mlir.constant(10 : i32) : i32
+      %3 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr
+      llvm.store %2, %3 : i32, !llvm.ptr
+      %4 = llvm.mlir.constant(20 : i32) : i32
+      %5 = llvm.load %b : !llvm.ptr -> !llvm.ptr
+      llvm.store %4, %5 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
   }
-  llvm.return
 }
 
 // CHECK:         @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 8, i64 0]
@@ -387,25 +401,27 @@ llvm.func @_QPopenmp_target_use_dev_addr_nomap() {
 
 // -----
 
-llvm.func @_QPopenmp_target_use_dev_both() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
-  %1 = llvm.mlir.constant(1 : i64) : i64
-  %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
-  %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
-  %map1 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
-  %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
-  %map3 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
-  omp.target_data  map_entries(%map, %map1 : !llvm.ptr, !llvm.ptr) use_device_addr(%map3 -> %arg0 : !llvm.ptr) use_device_ptr(%map2 -> %arg1 : !llvm.ptr)  {
-    %2 = llvm.mlir.constant(10 : i32) : i32
-    %3 = llvm.load %arg1 : !llvm.ptr -> !llvm.ptr
-    llvm.store %2, %3 : i32, !llvm.ptr
-    %4 = llvm.mlir.constant(20 : i32) : i32
-    %5 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr
-    llvm.store %4, %5 : i32, !llvm.ptr
-    omp.terminator
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_use_dev_both() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
+    %1 = llvm.mlir.constant(1 : i64) : i64
+    %b = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
+    %map = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map1 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map2 = omp.map.info var_ptr(%a : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map3 = omp.map.info var_ptr(%b : !llvm.ptr, !llvm.ptr)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target_data  map_entries(%map, %map1 : !llvm.ptr, !llvm.ptr) use_device_addr(%map3 -> %arg0 : !llvm.ptr) use_device_ptr(%map2 -> %arg1 : !llvm.ptr)  {
+      %2 = llvm.mlir.constant(10 : i32) : i32
+      %3 = llvm.load %arg1 : !llvm.ptr -> !llvm.ptr
+      llvm.store %2, %3 : i32, !llvm.ptr
+      %4 = llvm.mlir.constant(20 : i32) : i32
+      %5 = llvm.load %arg0 : !llvm.ptr -> !llvm.ptr
+      llvm.store %4, %5 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
   }
-  llvm.return
 }
 
 // CHECK:         @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 8, i64 8]
@@ -448,19 +464,21 @@ llvm.func @_QPopenmp_target_use_dev_both() {
 
 // -----
 
-llvm.func @_QPopenmp_target_data_update() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
-  %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}
-  omp.target_data map_entries(%2 : !llvm.ptr) {
-    %3 = llvm.mlir.constant(99 : i32) : i32
-    llvm.store %3, %1 : i32, !llvm.ptr
-    omp.terminator
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_data_update() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
+    %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target_data map_entries(%2 : !llvm.ptr) {
+      %3 = llvm.mlir.constant(99 : i32) : i32
+      llvm.store %3, %1 : i32, !llvm.ptr
+      omp.terminator
+    }
+
+    omp.target_update map_entries(%2 : !llvm.ptr)
+
+    llvm.return
   }
-
-  omp.target_update map_entries(%2 : !llvm.ptr)
-
-  llvm.return
 }
 
 // CHECK-LABEL: define void @_QPopenmp_target_data_update
@@ -488,26 +506,28 @@ llvm.func @_QPopenmp_target_data_update() {
 
 // -----
 
-omp.declare_mapper @_QQFmy_testmy_mapper : !llvm.struct<"_QFmy_testTmy_type", (i32)> {
-^bb0(%arg0: !llvm.ptr):
-  %0 = llvm.mlir.constant(0 : i32) : i32
-  %1 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>
-  %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var%data"}
-  %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) members(%2 : [0] : !llvm.ptr) -> !llvm.ptr {name = "var", partial_map = true}
-  omp.declare_mapper.info map_entries(%3, %2 : !llvm.ptr, !llvm.ptr)
-}
-
-llvm.func @_QPopenmp_target_data_mapper() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %1 = llvm.alloca %0 x !llvm.struct<"_QFmy_testTmy_type", (i32)> {bindc_name = "a"} : (i64) -> !llvm.ptr
-  %2 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) mapper(@_QQFmy_testmy_mapper) -> !llvm.ptr {name = "a"}
-  omp.target_data map_entries(%2 : !llvm.ptr) {
-    %3 = llvm.mlir.constant(10 : i32) : i32
-    %4 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>
-    llvm.store %3, %4 : i32, !llvm.ptr
-    omp.terminator
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  omp.declare_mapper @_QQFmy_testmy_mapper : !llvm.struct<"_QFmy_testTmy_type", (i32)> {
+  ^bb0(%arg0: !llvm.ptr):
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>
+    %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var%data"}
+    %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) members(%2 : [0] : !llvm.ptr) -> !llvm.ptr {name = "var", partial_map = true}
+    omp.declare_mapper.info map_entries(%3, %2 : !llvm.ptr, !llvm.ptr)
+  }
+
+  llvm.func @_QPopenmp_target_data_mapper() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x !llvm.struct<"_QFmy_testTmy_type", (i32)> {bindc_name = "a"} : (i64) -> !llvm.ptr
+    %2 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) mapper(@_QQFmy_testmy_mapper) -> !llvm.ptr {name = "a"}
+    omp.target_data map_entries(%2 : !llvm.ptr) {
+      %3 = llvm.mlir.constant(10 : i32) : i32
+      %4 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>
+      llvm.store %3, %4 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
   }
-  llvm.return
 }
 
 // CHECK:         @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4]
diff --git a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
index dba8c553aaca..f5c620a8942d 100644
--- a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
@@ -1,13 +1,15 @@
 // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s 2>&1 | FileCheck %s
 
-llvm.func @_QPopenmp_target_data_enter() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
-  %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_data_enter() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
+    %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}
 
-  omp.target_enter_data map_entries(%2 : !llvm.ptr) nowait
+    omp.target_enter_data map_entries(%2 : !llvm.ptr) nowait
 
-  llvm.return
+    llvm.return
+  }
 }
 
 // CHECK: define void @_QPopenmp_target_data_enter() {
@@ -32,14 +34,16 @@ llvm.func @_QPopenmp_target_data_enter() {
 
 // -----
 
-llvm.func @_QPopenmp_target_data_update() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
-  %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_data_update() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
+    %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""}
 
-  omp.target_update map_entries(%2 : !llvm.ptr) nowait
+    omp.target_update map_entries(%2 : !llvm.ptr) nowait
 
-  llvm.return
+    llvm.return
+  }
 }
 
 // CHECK: define void @_QPopenmp_target_data_update() {
@@ -64,14 +68,16 @@ llvm.func @_QPopenmp_target_data_update() {
 
 // -----
 
-llvm.func @_QPopenmp_target_data_exit() {
-  %0 = llvm.mlir.constant(1 : i64) : i64
-  %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
-  %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @_QPopenmp_target_data_exit() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array<i32: 0, 0>, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
+    %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32)   map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""}
 
-  omp.target_exit_data map_entries(%2 : !llvm.ptr) nowait
+    omp.target_exit_data map_entries(%2 : !llvm.ptr) nowait
 
-  llvm.return
+    llvm.return
+  }
 }
 
 // CHECK: define void @_QPopenmp_target_data_exit() {
diff --git a/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir b/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir
index 717a77e61b9a..53c9b4f55964 100644
--- a/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-data-target-device.mlir
@@ -3,7 +3,7 @@
 // This tests checks that a target op inside a data op
 // We are only interested in ensuring that the -mlir-to-llmvir pass doesn't crash.
 // CHECK: {{.*}} = add i32 {{.*}}, 1
-module attributes { } {
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
   llvm.mlir.global weak_odr hidden local_unnamed_addr constant @__oclc_ABI_version(400 : i32) {addr_space = 4 : i32} : i32
   llvm.func @_QQmain() attributes {fir.bindc_name = "main", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>} {
     %0 = llvm.mlir.constant(99 : index) : i64

From 152d4b8a01e8671a676e7cfaf71c70b1edeee7e8 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Fri, 20 Jun 2025 14:23:26 +0100
Subject: [PATCH 1044/1322] [AArch64] Use indexed dup for 128b segmented splat
 (#144688)

Matches a splat of 128b segments into a wider z register expressed as a
concat_vectors sdnode and generate a dup zn.q, zd.q[0] instruction.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  10 ++
 .../AArch64/sve-fixed-length-splat-segment.ll | 142 ++++++++++++++++++
 2 files changed, 152 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-splat-segment.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eaaff0529cbd..e9d05710cbc4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29358,6 +29358,16 @@ SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
   EVT VT = Op.getValueType();
   EVT SrcVT = SrcOp1.getValueType();
 
+  // Match a splat of 128b segments that fit in a single register.
+  if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+    SDValue Splat =
+        DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
+                    convertToScalableVector(DAG, ContainerVT, SrcOp1),
+                    DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
+    return convertFromScalableVector(DAG, VT, Splat);
+  }
+
   if (NumOperands > 2) {
     SmallVector<SDValue, 4> Ops;
     EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-segment.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-segment.ll
new file mode 100644
index 000000000000..a5d213c658c6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-segment.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+;; Patterns that lower to concat_vectors where all incoming operands are the same.
+
+define void @concat_i8q_256(<16 x i8> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i8q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <16 x i8> %data, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+                                                                        i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <32 x i8> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_i16q_256(<8 x i16> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i16q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <8 x i16> %data, <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                        i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <16 x i16> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_i32q_256(<4 x i32> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i32q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x i32> %data, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                                                       i32 0, i32 1, i32 2, i32 3>
+  store <8 x i32> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_i64q_256(<2 x i64> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i64q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <2 x i64> %data, <2 x i64> poison, <4 x i32> <i32 0, i32 1,
+                                                                       i32 0, i32 1>
+  store <4 x i64> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_f16q_256(<8 x half> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_f16q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <8 x half> %data, <8 x half> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                          i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <16 x half> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_bf16q_256(<8 x bfloat> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_bf16q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp q0, q0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <8 x bfloat> %data, <8 x bfloat> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                              i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <16 x bfloat> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_f32q_256(<4 x float> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_f32q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x float> %data, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                                                           i32 0, i32 1, i32 2, i32 3>
+  store <8 x float> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_f64q_256(<2 x double> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_f64q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <2 x double> %data, <2 x double> poison, <4 x i32> <i32 0, i32 1,
+                                                                             i32 0, i32 1>
+  store <4 x double> %splat, ptr %addr, align 1
+  ret void
+}
+
+;; Test a wider vector
+
+define void @concat_i32q_512_with_256_vectors(<4 x i32> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i32q_512_with_256_vectors:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0, #1, mul vl]
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3>
+  store <16 x i32> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_i32q_512_with_512_vectors(<4 x i32> %data, ptr %addr) #1 {
+; CHECK-LABEL: concat_i32q_512_with_512_vectors:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3>
+  store <16 x i32> %splat, ptr %addr, align 1
+  ret void
+}
+
+attributes #0 = { vscale_range(2,2) "target-features"="+sve,+bf16" }
+attributes #1 = { vscale_range(4,4) "target-features"="+sve,+bf16" }

From 8db272ffcf9ad97fe5614b34cd978eac30b53cd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Fri, 20 Jun 2025 14:25:36 +0100
Subject: [PATCH 1045/1322] [mlir][SparseTensor] Re-enable tests on AArch64
 (#143387)

These tests were disabled in https://reviews.llvm.org/D136273, due to:
* https://github.com/llvm/llvm-project/issues/58465

That issue has now been resolved, so we should be able to re-enable
these tests.
---
 .../Integration/Dialect/SparseTensor/CPU/dense_output_bf16.mlir | 2 --
 .../Integration/Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir   | 2 --
 2 files changed, 4 deletions(-)

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_bf16.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_bf16.mlir
index bb7dcd2f35e3..40a0351a83ee 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_bf16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output_bf16.mlir
@@ -28,8 +28,6 @@
 // REDEFINE: %{sparsifier_opts} = enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
 // RUN: %{compile} | %{run} | FileCheck %s
 
-// UNSUPPORTED: target=aarch64{{.*}}
-
 #SparseVector = #sparse_tensor.encoding<{map = (d0) -> (d0 : compressed)}>
 #DenseVector = #sparse_tensor.encoding<{map = (d0) -> (d0 : dense)}>
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir
index a0711d5b577a..440c1b63c8c2 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir
@@ -30,8 +30,6 @@
 // Do the same run, but now with  VLA vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
 
-// UNSUPPORTED: target=aarch64{{.*}}, mlir_arm_emulator
-
 !Filename = !llvm.ptr
 
 #SparseMatrix = #sparse_tensor.encoding<{

From c3929fbf0ef2d0fac05c54237bd9eac4fd57b1d8 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Fri, 20 Jun 2025 13:16:52 +0000
Subject: [PATCH 1046/1322] [NFC][LLVM] Reorder SVE_SME codegen predicate
 classes.

Also removes unused HasNonStreamingSVE2p1_or_SSVE_AES predicate.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 45 ++++++++++++---------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 400ffff5d567..efe6cc1aa8ae 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -245,7 +245,7 @@ def HasOCCMO         : Predicate<"Subtarget->hasOCCMO()">,
 // A subset of SVE(2) instructions are legal in Streaming SVE execution mode,
 // they should be enabled if either has been specified.
 def HasSVE_or_SME
-    : Predicate<"Subtarget->hasSVE() || (Subtarget->isStreaming() && Subtarget->hasSME())">,
+    : Predicate<"Subtarget->isSVEorStreamingSVEAvailable()">,
                 AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME),
                 "sve or sme">;
 def HasNonStreamingSVE_or_SME2p2
@@ -253,6 +253,12 @@ def HasNonStreamingSVE_or_SME2p2
                 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
                 AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME2p2),
                 "sve or sme2p2">;
+def HasNonStreamingSVE_or_SSVE_FEXPA
+    : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE()) ||"
+                "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_FEXPA())">,
+                AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSSVE_FEXPA),
+                "sve or ssve-fexpa">;
+
 def HasSVE2_or_SME
     : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME())">,
                 AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME),
@@ -264,40 +270,41 @@ def HasSVE2_or_SME2
 def HasNonStreamingSVE2_or_SSVE_AES
     : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||"
                 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">,
-                AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_AES), "sve2 or ssve-aes">;
+                AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_AES),
+                "sve2 or ssve-aes">;
+
 def HasSVE2p1_or_SME
     : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME())">,
-                 AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1), "sme or sve2p1">;
+                AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1),
+                "sme or sve2p1">;
 def HasSVE2p1_or_SME2
     : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2())">,
-                 AssemblerPredicateWithAll<(any_of FeatureSME2, FeatureSVE2p1), "sme2 or sve2p1">;
+                AssemblerPredicateWithAll<(any_of FeatureSME2, FeatureSVE2p1),
+                "sme2 or sve2p1">;
 def HasSVE2p1_or_SME2p1
     : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2p1())">,
-                 AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">;
+                AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1),
+                "sme2p1 or sve2p1">;
+
 def HasSVE2p2_or_SME2p2
     : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2())">,
-                 AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2), "sme2p2 or sve2p2">;
-def HasNonStreamingSVE2p1_or_SSVE_AES
-    : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()) ||"
-                "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">,
-                AssemblerPredicateWithAll<(any_of FeatureSVE2p1, FeatureSSVE_AES), "sve2p1 or ssve-aes">;
-def HasSMEF16F16_or_SMEF8F16
-    : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">,
-                AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
-                "sme-f16f16 or sme-f8f16">;
+                AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2),
+                "sme2p2 or sve2p2">;
 def HasNonStreamingSVE2p2_or_SME2p2
     : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p2()) ||"
                 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
                 AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2),
                 "sme2p2 or sve2p2">;
+
+def HasSMEF16F16_or_SMEF8F16
+    : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">,
+                AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
+                "sme-f16f16 or sme-f8f16">;
 def HasNonStreamingSVE2_or_SSVE_BitPerm
     : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||"
                 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_BitPerm())">,
-                AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_BitPerm), "sve2 or ssve-bitperm">;
-def HasNonStreamingSVE_or_SSVE_FEXPA
-    : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE()) ||"
-                "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_FEXPA())">,
-                AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSSVE_FEXPA), "sve or ssve-fexpa">;
+                AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_BitPerm),
+                "sve2 or ssve-bitperm">;
 
 // A subset of NEON instructions are legal in Streaming SVE execution mode,
 // so don't need the additional check for 'isNeonAvailable'.

From 376b71442d03bcc8ec6e2244002e3d62916dcea4 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Fri, 20 Jun 2025 06:41:48 -0700
Subject: [PATCH 1047/1322] [NFC][TableGen][DecoderEmitter] Use structured
 binding in range for loop (#144890)

Also assign variable names to different elements of `OpMap` for better
readibility, and eliminate `NumberedEncodingsRef` as `std::vector` will
automatically get converted to an `ArrayRef`.
---
 llvm/utils/TableGen/DecoderEmitter.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 37814113b467..2e8ff2aa47d9 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2631,12 +2631,12 @@ namespace {
 
   DecoderTableInfo TableInfo;
   unsigned OpcodeMask = 0;
-  for (const auto &Opc : OpcMap) {
+  for (const auto &[NSAndByteSize, EncodingIDs] : OpcMap) {
+    const std::string &DecoderNamespace = NSAndByteSize.first;
+    const unsigned BitWidth = 8 * NSAndByteSize.second;
     // Emit the decoder for this namespace+width combination.
-    ArrayRef<EncodingAndInst> NumberedEncodingsRef(NumberedEncodings.data(),
-                                                   NumberedEncodings.size());
-    FilterChooser FC(NumberedEncodingsRef, Opc.second, Operands,
-                     IsVarLenInst ? MaxInstLen : 8 * Opc.first.second, this);
+    FilterChooser FC(NumberedEncodings, EncodingIDs, Operands,
+                     IsVarLenInst ? MaxInstLen : BitWidth, this);
 
     // The decode table is cleared for each top level decoder function. The
     // predicates and decoders themselves, however, are shared across all
@@ -2657,7 +2657,7 @@ namespace {
 
     // Print the table to the output stream.
     OpcodeMask |= emitTable(OS, TableInfo.Table, indent(0), FC.getBitWidth(),
-                            Opc.first.first, Opc.second);
+                            DecoderNamespace, EncodingIDs);
   }
 
   // For variable instruction, we emit a instruction length table

From 269cb22ae82fd83ecc7a7ef7f7a4110e4c7a43ec Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <Nico.Vasilache@amd.com>
Date: Fri, 20 Jun 2025 15:45:21 +0200
Subject: [PATCH 1048/1322] =?UTF-8?q?[mlir][transform]=20extract=20a=20min?=
 =?UTF-8?q?imal=20DomainAndOperandsAffineMapT=E2=80=A6=20(#145034)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ransferInterface out of LinalgStructuredInterface and use that for
PadTilingInterface

Along the way, a bug was found on the handling of scalar values, fix it
and add a test.
---
 .../Dialect/Linalg/IR/LinalgInterfaces.td     | 88 +++++++++++--------
 .../Dialect/Linalg/Transforms/Transforms.h    |  9 +-
 .../TransformOps/LinalgTransformOps.cpp       | 10 ++-
 .../Linalg/Transforms/PadTilingInterface.cpp  | 31 +++++--
 .../transform-op-pad-tiling-interface.mlir    | 28 ++++++
 5 files changed, 112 insertions(+), 54 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
index dbc1ac60e097..74c4c0a8835f 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
@@ -222,9 +222,59 @@ def LinalgFillOpInterface : OpInterface<"FillOpInterface"> {
   ];
 }
 
+def IndexingMapOpInterface : OpInterface<"IndexingMapOpInterface"> {
+  let description = [{
+    Interface for operations that connect an iteration domain to operands via
+    affine maps. Provides methods to access indexing maps between iteration
+    domain and operand index spaces.
+  }];
+  let cppNamespace = "::mlir::linalg";
+  let methods = [
+    InterfaceMethod<
+      /*desc=*/[{
+        Return the indexing maps attribute within the current operation.
+      }],
+      /*retTy=*/"ArrayAttr",
+      /*methodName=*/"getIndexingMaps"
+    >,
+    InterfaceMethod<
+      /*desc=*/[{
+        Return the indexing maps within the current operation.
+      }],
+      /*retTy=*/"SmallVector<AffineMap>",
+      /*methodName=*/"getIndexingMapsArray",
+      /*args=*/(ins),
+      /*methodBody=*/"",
+      /*defaultImplementation=*/[{
+        auto range = $_op.getIndexingMaps()
+          .template getAsValueRange<AffineMapAttr>();
+        return {range.begin(), range.end()};
+      }]
+    >,
+    InterfaceMethod<
+      /*desc=*/[{
+        Return the input or output indexing map for `opOperand`.
+      }],
+      /*retTy=*/"AffineMap",
+      /*methodName=*/"getMatchingIndexingMap",
+      /*args=*/(ins "OpOperand*":$opOperand),
+      /*methodBody=*/"",
+      /*defaultImplementation=*/[{
+        assert(opOperand->getOwner() == this->getOperation());
+        auto indexingMaps =
+          $_op.getIndexingMaps().template getAsValueRange<AffineMapAttr>();
+        return *(indexingMaps.begin() + opOperand->getOperandNumber());
+      }]
+    >,
+  ];
+}
+
 // The 'LinalgStructuredInterface' provides access to the 'LinalgOp' interface.
 def LinalgStructuredInterface
-    : OpInterface<"LinalgOp", [DestinationStyleOpInterface]> {
+    : OpInterface<"LinalgOp", [
+      DestinationStyleOpInterface,
+      IndexingMapOpInterface
+  ]> {
   let cppNamespace = "::mlir::linalg";
   let methods = [
     //===------------------------------------------------------------------===//
@@ -465,21 +515,6 @@ def LinalgStructuredInterface
             blockArgument.getArgNumber());
       }]
     >,
-    InterfaceMethod<
-      /*desc=*/[{
-        Return the input or output indexing map for `opOperand`.
-      }],
-      /*retTy=*/"AffineMap",
-      /*methodName=*/"getMatchingIndexingMap",
-      /*args=*/(ins "OpOperand*":$opOperand),
-      /*methodBody=*/"",
-      /*defaultImplementation=*/[{
-        assert(opOperand->getOwner() == this->getOperation());
-        auto indexingMaps =
-          $_op.getIndexingMaps().template getAsValueRange<AffineMapAttr>();
-        return *(indexingMaps.begin() + opOperand->getOperandNumber());
-      }]
-    >,
     InterfaceMethod<
       /*desc=*/[{
         Return the indexing map for a `result`.
@@ -576,27 +611,6 @@ def LinalgStructuredInterface
       /*methodBody=*/"",
       /*defaultImplementation=*/[{ return success(); }]
     >,
-    InterfaceMethod<
-      /*desc=*/[{
-        Return the indexing maps attribute within the current operation.
-      }],
-      /*retTy=*/"ArrayAttr",
-      /*methodName=*/"getIndexingMaps"
-    >,
-    InterfaceMethod<
-      /*desc=*/[{
-        Return the indexing maps within the current operation.
-      }],
-      /*retTy=*/"SmallVector<AffineMap>",
-      /*methodName=*/"getIndexingMapsArray",
-      /*args=*/(ins),
-      /*methodBody=*/"",
-      /*defaultImplementation=*/[{
-        auto range = $_op.getIndexingMaps()
-          .template getAsValueRange<AffineMapAttr>();
-        return {range.begin(), range.end()};
-      }]
-    >,
     InterfaceMethod<
       /*desc=*/[{
         Return true if any of the operands has a dynamic shape.
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 59b7fdeef10b..a6dab03d6473 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -612,10 +612,9 @@ using PadSizeComputationFunction =
         const PadTilingInterfaceOptions &)>;
 
 /// Specific helper for Linalg ops.
-FailureOr<SmallVector<OpFoldResult>>
-computeLinalgPaddedShape(RewriterBase &rewriter, OpOperand &operandToPad,
-                         ArrayRef<Range> iterationDomain,
-                         const PadTilingInterfaceOptions &options);
+FailureOr<SmallVector<OpFoldResult>> computeIndexingMapOpInterfacePaddedShape(
+    RewriterBase &rewriter, OpOperand &operandToPad,
+    ArrayRef<Range> iterationDomain, const PadTilingInterfaceOptions &options);
 
 /// Pad the iterator dimensions `options.paddingDimensions` of `opToPad`.
 ///
@@ -632,7 +631,7 @@ rewriteAsPaddedOp(RewriterBase &rewriter, TilingInterface opToPad,
                   const PadTilingInterfaceOptions &constOptions,
                   SmallVector<tensor::PadOp> &padOps,
                   PadSizeComputationFunction computePaddingSizeFun =
-                      &computeLinalgPaddedShape);
+                      &computeIndexingMapOpInterfacePaddedShape);
 
 namespace detail {
 
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index e627fc83f2ba..5d55adbf46f3 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -2229,10 +2229,12 @@ transform::PadTilingInterfaceOp::apply(transform::TransformRewriter &rewriter,
       return diag;
     }
 
-    // Only Linalg ops for now, until TilingInterface exposes a loopsToOperand
-    // map / C++ APIs to compute the effect of padding on operands.
-    if (!isa<LinalgOp>(targetOp.getOperation())) {
-      auto diag = emitSilenceableError() << "only LinalgOp supported atm";
+    // Only IndexingMapOpInterface ops for now, until TilingInterface exposes a
+    // loopsToOperand map / C++ APIs to compute the effect of padding on
+    // operands.
+    if (!isa<IndexingMapOpInterface>(targetOp.getOperation())) {
+      auto diag = emitSilenceableError() << "only IndexingMapOpInterface ops "
+                                            "supported atm";
       diag.attachNote(target->getLoc()) << "target op";
       return diag;
     }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
index a9d7bc64f2a6..5383ae48aeb3 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
@@ -155,11 +155,13 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
   return paddedShape;
 }
 
-FailureOr<SmallVector<OpFoldResult>> linalg::computeLinalgPaddedShape(
+FailureOr<SmallVector<OpFoldResult>>
+linalg::computeIndexingMapOpInterfacePaddedShape(
     RewriterBase &rewriter, OpOperand &operandToPad,
     ArrayRef<Range> iterationDomain, const PadTilingInterfaceOptions &options) {
-  auto linalgOp = llvm::dyn_cast<LinalgOp>(operandToPad.getOwner());
-  if (!linalgOp)
+  auto transferOp =
+      llvm::dyn_cast<IndexingMapOpInterface>(operandToPad.getOwner());
+  if (!transferOp)
     return failure();
 
   // clang-format off
@@ -173,7 +175,7 @@ FailureOr<SmallVector<OpFoldResult>> linalg::computeLinalgPaddedShape(
   for (const Range &range : iterationDomain)
     loopUpperBounds.push_back(range.size);
 
-  AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&operandToPad);
+  AffineMap indexingMap = transferOp.getMatchingIndexingMap(&operandToPad);
   return computePaddedShape(
       rewriter, cast<TypedValue<RankedTensorType>>(operandToPad.get()),
       indexingMap, loopUpperBounds, options);
@@ -255,7 +257,18 @@ linalg::rewriteAsPaddedOp(RewriterBase &rewriter, TilingInterface opToPad,
   SmallVector<Value> newOperands;
   newOperands.reserve(opToPad->getNumOperands());
   for (OpOperand &opOperand : opToPad->getOpOperands()) {
-    LLVM_DEBUG(DBGS() << "--start padding oprd: " << opOperand.get() << "\n");
+    Value operand = opOperand.get();
+    LLVM_DEBUG(DBGS() << "--start padding oprd: " << operand << "\n");
+
+    // 2.a. Skip scalar-like operands.
+    Type operandType = operand.getType();
+    if (!isa<RankedTensorType>(operandType)) {
+      assert(!isa<ShapedType>(operandType) ||
+             isa<VectorType>(operandType) &&
+                 "Unexpected non-vector ShapedType");
+      newOperands.push_back(operand);
+      continue;
+    }
     // 2.a. Compute padded shape.
     FailureOr<SmallVector<OpFoldResult>> maybePaddedShape =
         computePaddingSizeFun(rewriter, opOperand, iterationDomain, options);
@@ -266,14 +279,16 @@ linalg::rewriteAsPaddedOp(RewriterBase &rewriter, TilingInterface opToPad,
     // 2.b. Expect proper `paddingValues`.
     // TODO: we may want to allow garbage padding in the future, in which case
     // we would just not assert.
-    assert(opOperand.getOperandNumber() < options.paddingValues.size() &&
-           "--no padding value specified");
+    if (opOperand.getOperandNumber() >= options.paddingValues.size()) {
+      return rewriter.notifyMatchFailure(opToPad,
+                                         "--no padding value specified");
+    }
     Attribute paddingValueAttr =
         options.paddingValues[opOperand.getOperandNumber()];
 
     // 2.c. Perform actual padding.
     Value paddedOperand = padOperand(
-        rewriter, opToPad, cast<TypedValue<RankedTensorType>>(opOperand.get()),
+        rewriter, opToPad, cast<TypedValue<RankedTensorType>>(operand),
         *maybePaddedShape, paddingValueAttr);
     LLVM_DEBUG(DBGS() << "--done padding operand: " << paddedOperand << "\n");
 
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
index c361885693cb..f0a410fa4015 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
@@ -1,5 +1,33 @@
 // RUN: mlir-opt --transform-interpreter -canonicalize -split-input-file --verify-diagnostics %s | FileCheck %s
 
+//     CHECK-LABEL: pad_fill
+//           CHECK:   linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
+func.func @pad_fill(%value: f32, %output: tensor<24x25xf32>) -> tensor<24x25xf32>
+{
+  %0 = linalg.fill ins(%value : f32) outs(%output : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %fill = transform.structured.match ops{["linalg.fill"]} in %arg1
+      : (!transform.any_op) -> !transform.any_op
+
+    // Tile to 5 then pad to 8
+    %fill_l1, %loops_l1 = transform.structured.tile_using_for %fill tile_sizes [5] 
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    %fill_padded, %_ = transform.structured.pad_tiling_interface %fill_l1 to padding_sizes [8] {
+      padding_values=[0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    transform.yield
+  }
+}
+
+// -----
+
 //     CHECK-LABEL: pad_lhs
 func.func @pad_lhs(
   %arg0: tensor<24x12xf32>, %arg1: tensor<12x25xf32>, %arg2: tensor<24x25xf32>)

From 3ff69c80786e4c2e37fd40a48ee675e33a5b6f76 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Fri, 20 Jun 2025 14:49:50 +0100
Subject: [PATCH 1049/1322] [LV] Add early-exit-with-store tests (#140899)

Adds some additional LoopVectorizeLegality tests for early exit loops
with a store that we don't vectorize.

Test precommit split from #137774
---
 .../LoopVectorize/early_exit_legality.ll      |  43 +-
 .../early_exit_store_legality.ll              | 409 ++++++++++++++++++
 2 files changed, 428 insertions(+), 24 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll

diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index de455c81d363..22b34079755c 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -442,35 +442,30 @@ loop.end:
   ret i64 %retval
 }
 
-
-define i64 @loop_contains_store(ptr %dest) {
-; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store'
-; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops
+define void @exit_conditions_combined_in_single_branch(ptr noalias dereferenceable(40) %array, ptr readonly align 2 dereferenceable(40) %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'exit_conditions_combined_in_single_branch'
+; CHECK:       LV: Not vectorizing: Cannot vectorize uncountable loop.
 entry:
-  %p1 = alloca [1024 x i8]
-  call void @init_mem(ptr %p1, i64 1024)
-  br label %loop
+  br label %for.body
 
-loop:
-  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
-  %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index
-  %ld1 = load i32, ptr %arrayidx, align 1
-  %arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %index
-  store i32 %ld1, ptr %arrayidx2, align 4
-  %cmp = icmp eq i32 %ld1, 1
-  br i1 %cmp, label %loop.inc, label %loop.end
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  %or.cond = select i1 %ee.cond, i1 true, i1 %counted.cond
+  br i1 %or.cond, label %exit, label %for.body
 
-loop.inc:
-  %index.next = add i64 %index, 1
-  %exitcond = icmp ne i64 %index.next, 67
-  br i1 %exitcond, label %loop, label %loop.end
-
-loop.end:
-  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
-  ret i64 %retval
+exit:                                             ; preds = %for.body
+  ret void
 }
 
-
 define i64 @uncountable_exit_in_conditional_block(ptr %mask) {
 ; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_in_conditional_block'
 ; CHECK:       LV: Not vectorizing: Early exit is not the latch predecessor.
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
new file mode 100644
index 000000000000..3a08681a2fb9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
@@ -0,0 +1,409 @@
+; REQUIRES: asserts
+; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 -disable-output 2>&1 | FileCheck %s
+
+define i64 @loop_contains_store(ptr %dest) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops
+entry:
+  %p1 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index
+  %ld1 = load i32, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %index
+  store i32 %ld1, ptr %arrayidx2, align 4
+  %cmp = icmp eq i32 %ld1, 1
+  br i1 %cmp, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+define void @loop_contains_store_condition_load_has_single_user(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_condition_load_has_single_user'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_ee_condition_is_invariant(ptr dereferenceable(40) noalias %array, i16 %ee.val) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_ee_condition_is_invariant'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_fcmp_condition(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_fcmp_condition'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw half, ptr %pred, i64 %iv
+  %ee.val = load half, ptr %ee.addr, align 2
+  %ee.cond = fcmp ugt half %ee.val, 500.0
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_safe_dependency(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(96) %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_safe_dependency'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  %pred.plus.8 = getelementptr inbounds nuw i16, ptr %pred, i64 8
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred.plus.8, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  %some.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  store i16 42, ptr %some.addr, align 2
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_unsafe_dependency(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(80) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_unsafe_dependency'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  %unknown.offset = call i64 @get_an_unknown_offset()
+  %unknown.cmp = icmp ult i64 %unknown.offset, 20
+  %clamped.offset = select i1 %unknown.cmp, i64 %unknown.offset, i64 20
+  %unknown.base = getelementptr i16, ptr %pred, i64 %clamped.offset
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  %some.addr = getelementptr inbounds nuw i16, ptr %unknown.base, i64 %iv
+  store i16 42, ptr %some.addr, align 2
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_assumed_bounds(ptr noalias %array, ptr readonly %pred, i32 %n) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_assumed_bounds'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  %n_bytes = mul nuw nsw i32 %n, 2
+  call void @llvm.assume(i1 true) [ "align"(ptr %pred, i64 2), "dereferenceable"(ptr %pred, i32 %n_bytes) ]
+  %tc = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, %tc
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_to_pointer_with_no_deref_info(ptr align 2 dereferenceable(40) readonly %load.array, ptr align 2 noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_to_pointer_with_no_deref_info'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %ld.addr = getelementptr inbounds nuw i16, ptr %load.array, i64 %iv
+  %data = load i16, ptr %ld.addr, align 2
+  %inc = add nsw i16 %data, 1
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_unknown_bounds(ptr align 2 dereferenceable(100) noalias %array, ptr align 2 dereferenceable(100) readonly %pred, i64 %n) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_unknown_bounds'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, %n
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_volatile(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_volatile'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store volatile i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_to_invariant_location(ptr dereferenceable(40) readonly %array, ptr align 2 dereferenceable(40) readonly %pred, ptr noalias %store_addr) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_to_invariant_location'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %store_addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_in_latch_block(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_in_latch_block'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  store i16 %inc, ptr %st.addr, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_requiring_alias_check(ptr dereferenceable(40) %array, ptr align 2 dereferenceable(40) %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_requiring_alias_check'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_condition_load_is_chained(ptr dereferenceable(40) noalias %array, ptr align 8 dereferenceable(160) readonly %offsets, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_condition_load_is_chained'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %gather.addr = getelementptr inbounds nuw i64, ptr %offsets, i64 %iv
+  %ee.offset = load i64, ptr %gather.addr, align 8
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %ee.offset
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_decrementing_iv(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_decrementing_iv'
+; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 19, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = sub nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 0
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+declare void @init_mem(ptr, i64);
+declare i64 @get_an_unknown_offset();

From 225768d1f9f2e2ccff7dc79b4a4aaeab4c6aafc1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 20 Jun 2025 14:51:01 +0100
Subject: [PATCH 1050/1322] [X86] combineConcatVectorOps - add tests showing
 v4i64 shift-by-32 with unnecessary concatenation

On AVX1-only targets, we concat SHL/SRL AVX1 v4i64 by 32-bits as a shuffle. But this is only worth while if the shift source value is free to concatenate.
---
 .../test/CodeGen/X86/vector-shift-lshr-256.ll | 67 +++++++++++++++++++
 llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 67 +++++++++++++++++++
 2 files changed, 134 insertions(+)

diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index d8852956c66f..b45525b6e20f 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -1971,6 +1971,73 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
   ret <4 x i64> %shift
 }
 
+define <4 x i64> @shift32_v4i64_concat(<2 x i64> %lo, <2 x i64> %hi) nounwind {
+; AVX1-LABEL: shift32_v4i64_concat:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shift32_v4i64_concat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: shift32_v4i64_concat:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: shift32_v4i64_concat:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: shift32_v4i64_concat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: shift32_v4i64_concat:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; X86-AVX1-LABEL: shift32_v4i64_concat:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: shift32_v4i64_concat:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; X86-AVX2-NEXT:    retl
+  %a = shufflevector <2 x i64> %lo, <2 x i64> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shift = lshr <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
+  ret <4 x i64> %shift
+}
+
 define <4 x i32> @sh_trunc_sh_vec(<4 x i64> %x) {
 ; AVX1-LABEL: sh_trunc_sh_vec:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 3f238b5739f0..2248ee997d52 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1823,3 +1823,70 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
   %shift = shl <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
   ret <4 x i64> %shift
 }
+
+define <4 x i64> @shift32_v4i64_concat(<2 x i64> %lo, <2 x i64> %hi) nounwind {
+; AVX1-LABEL: shift32_v4i64_concat:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shift32_v4i64_concat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; XOPAVX1-LABEL: shift32_v4i64_concat:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: shift32_v4i64_concat:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
+; XOPAVX2-NEXT:    retq
+;
+; AVX512-LABEL: shift32_v4i64_concat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: shift32_v4i64_concat:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; X86-AVX1-LABEL: shift32_v4i64_concat:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: shift32_v4i64_concat:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
+; X86-AVX2-NEXT:    retl
+  %a = shufflevector <2 x i64> %lo, <2 x i64> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shift = shl <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
+  ret <4 x i64> %shift
+}

From bd36f7331a9f575272aebb9e0163194541110912 Mon Sep 17 00:00:00 2001
From: Andres-Salamanca <andrealebarbaritos@gmail.com>
Date: Fri, 20 Jun 2025 09:03:02 -0500
Subject: [PATCH 1051/1322] [CIR] Add initial support for bitfields in structs
 (#142041)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds support for bitfields CIR records can now contain bit
fields.

I’ve updated the `CIRGenBitFieldInfo` comment, which originally came
from the incubator and was identical to the one in OGCodeGen, to better
reflect the current implementation.

Support for bitfields in unions big-endian architectures and `get` and
`set` operations remains unimplemented and will be addressed in a future
patch.
---
 clang/include/clang/CIR/MissingFeatures.h     |   3 +
 clang/lib/CIR/CodeGen/CIRGenRecordLayout.h    | 114 ++++++++
 .../CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp | 268 +++++++++++++++++-
 clang/lib/CIR/CodeGen/TargetInfo.cpp          |  37 +++
 clang/lib/CIR/CodeGen/TargetInfo.h            |  10 +
 clang/test/CIR/CodeGen/bitfields.c            |  78 +++++
 clang/test/CIR/CodeGen/bitfields.cpp          |  32 +++
 7 files changed, 535 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/bitfields.c
 create mode 100644 clang/test/CIR/CodeGen/bitfields.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 45452c5929a3..e0b2959f374f 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -149,6 +149,8 @@ struct MissingFeatures {
   static bool cxxabiUseARMGuardVarABI() { return false; }
   static bool cxxabiAppleARM64CXXABI() { return false; }
   static bool cxxabiStructorImplicitParam() { return false; }
+  static bool isDiscreteBitFieldABI() { return false; }
+  static bool isBigEndian() { return false; }
 
   // Address class
   static bool addressOffset() { return false; }
@@ -239,6 +241,7 @@ struct MissingFeatures {
   static bool builtinCall() { return false; }
   static bool builtinCallF128() { return false; }
   static bool builtinCallMathErrno() { return false; }
+  static bool nonFineGrainedBitfields() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h
index ac8832b8c9b2..3b51ab784d37 100644
--- a/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h
+++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h
@@ -14,6 +14,106 @@
 
 namespace clang::CIRGen {
 
+/// Record with information about how a bitfield should be accessed. This is
+/// very similar to what LLVM codegen does, once CIR evolves it's possible we
+/// can use a more higher level representation.
+///
+/// Often we lay out a sequence of bitfields as a contiguous sequence of bits.
+/// When the AST record layout does this, we represent it in CIR as a
+/// `!cir.record` type, which directly reflects the structure's layout,
+/// including bitfield packing and padding, using CIR types such as
+/// `!cir.bool`, `!s8i`, `!u16i`.
+///
+/// To access a particular bitfield in CIR, we use the operations
+/// `cir.get_bitfield` (`GetBitfieldOp`) or `cir.set_bitfield`
+/// (`SetBitfieldOp`). These operations rely on the `bitfield_info`
+/// attribute, which provides detailed metadata required for access,
+/// such as the size and offset of the bitfield, the type and size of
+/// the underlying storage, and whether the value is signed.
+/// The CIRGenRecordLayout also has a bitFields map which encodes which
+/// byte-sequence this bitfield falls within. Let's assume the following C
+/// struct:
+///
+///   struct S {
+///     char a, b, c;
+///     unsigned bits : 3;
+///     unsigned more_bits : 4;
+///     unsigned still_more_bits : 7;
+///   };
+///
+/// This will end up as the following cir.record. The bitfield members are
+/// represented by one !u16i value, and the array provides padding to align the
+/// struct to a 4-byte alignment.
+///
+///   !rec_S = !cir.record<struct "S" padded {!s8i, !s8i, !s8i, !u16i,
+///   !cir.array<!u8i x 3>}>
+///
+/// When generating code to access more_bits, we'll generate something
+/// essentially like this:
+///
+///   #bfi_more_bits = #cir.bitfield_info<name = "more_bits", storage_type =
+///   !u16i, size = 4, offset = 3, is_signed = false>
+///
+///   cir.func @store_field() {
+///     %0 = cir.alloca !rec_S, !cir.ptr<!rec_S>, ["s"] {alignment = 4 : i64}
+///     %1 = cir.const #cir.int<2> : !s32i
+///     %2 = cir.cast(integral, %1 : !s32i), !u32i
+///     %3 = cir.get_member %0[3] {name = "more_bits"} : !cir.ptr<!rec_S> ->
+///     !cir.ptr<!u16i>
+///     %4 = cir.set_bitfield(#bfi_more_bits, %3 :
+///     !cir.ptr<!u16i>, %2 : !u32i) -> !u32i
+///     cir.return
+///   }
+///
+struct CIRGenBitFieldInfo {
+  /// The offset within a contiguous run of bitfields that are represented as
+  /// a single "field" within the cir.record type. This offset is in bits.
+  unsigned offset : 16;
+
+  /// The total size of the bit-field, in bits.
+  unsigned size : 15;
+
+  /// Whether the bit-field is signed.
+  unsigned isSigned : 1;
+
+  /// The storage size in bits which should be used when accessing this
+  /// bitfield.
+  unsigned storageSize;
+
+  /// The offset of the bitfield storage from the start of the record.
+  clang::CharUnits storageOffset;
+
+  /// The offset within a contiguous run of bitfields that are represented as a
+  /// single "field" within the cir.record type, taking into account the AAPCS
+  /// rules for volatile bitfields. This offset is in bits.
+  unsigned volatileOffset : 16;
+
+  /// The storage size in bits which should be used when accessing this
+  /// bitfield.
+  unsigned volatileStorageSize;
+
+  /// The offset of the bitfield storage from the start of the record.
+  clang::CharUnits volatileStorageOffset;
+
+  /// The name of a bitfield
+  llvm::StringRef name;
+
+  // The actual storage type for the bitfield
+  mlir::Type storageType;
+
+  CIRGenBitFieldInfo()
+      : offset(), size(), isSigned(), storageSize(), volatileOffset(),
+        volatileStorageSize() {}
+
+  CIRGenBitFieldInfo(unsigned offset, unsigned size, bool isSigned,
+                     unsigned storageSize, clang::CharUnits storageOffset)
+      : offset(offset), size(size), isSigned(isSigned),
+        storageSize(storageSize), storageOffset(storageOffset) {}
+
+  void print(llvm::raw_ostream &os) const;
+  LLVM_DUMP_METHOD void dump() const;
+};
+
 /// This class handles record and union layout info while lowering AST types
 /// to CIR types.
 ///
@@ -41,6 +141,10 @@ private:
   // for both virtual and non-virtual bases.
   llvm::DenseMap<const clang::CXXRecordDecl *, unsigned> nonVirtualBases;
 
+  /// Map from (bit-field) record field to the corresponding CIR record type
+  /// field no. This info is populated by record builder.
+  llvm::DenseMap<const clang::FieldDecl *, CIRGenBitFieldInfo> bitFields;
+
   /// False if any direct or indirect subobject of this class, when considered
   /// as a complete object, requires a non-zero bitpattern when
   /// zero-initialized.
@@ -83,6 +187,16 @@ public:
   /// Check whether this struct can be C++ zero-initialized
   /// with a zeroinitializer when considered as a base subobject.
   bool isZeroInitializableAsBase() const { return zeroInitializableAsBase; }
+
+  /// Return the BitFieldInfo that corresponds to the field FD.
+  const CIRGenBitFieldInfo &getBitFieldInfo(const clang::FieldDecl *fd) const {
+    fd = fd->getCanonicalDecl();
+    assert(fd->isBitField() && "Invalid call for non-bit-field decl!");
+    llvm::DenseMap<const clang::FieldDecl *, CIRGenBitFieldInfo>::const_iterator
+        it = bitFields.find(fd);
+    assert(it != bitFields.end() && "Unable to find bitfield info");
+    return it->second;
+  }
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp
index 0aeef7fd89ae..8dbf1b36a93b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp
@@ -20,6 +20,7 @@
 #include "clang/AST/RecordLayout.h"
 #include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIRDataLayout.h"
+#include "clang/CIR/MissingFeatures.h"
 #include "llvm/Support/Casting.h"
 
 #include <memory>
@@ -66,6 +67,10 @@ struct CIRRecordLowering final {
     return MemberInfo(offset, MemberInfo::InfoKind::Field, data);
   }
 
+  // Layout routines.
+  void setBitFieldInfo(const FieldDecl *fd, CharUnits startOffset,
+                       mlir::Type storageType);
+
   void lower();
   void lowerUnion();
 
@@ -77,6 +82,9 @@ struct CIRRecordLowering final {
   void accumulateBases(const CXXRecordDecl *cxxRecordDecl);
   void accumulateVPtrs();
   void accumulateFields();
+  RecordDecl::field_iterator
+  accumulateBitFields(RecordDecl::field_iterator field,
+                      RecordDecl::field_iterator fieldEnd);
 
   CharUnits bitsToCharUnits(uint64_t bitOffset) {
     return astContext.toCharUnitsFromBits(bitOffset);
@@ -87,6 +95,9 @@ struct CIRRecordLowering final {
   CharUnits getSize(mlir::Type Ty) {
     return CharUnits::fromQuantity(dataLayout.layout.getTypeSize(Ty));
   }
+  CharUnits getSizeInBits(mlir::Type ty) {
+    return CharUnits::fromQuantity(dataLayout.layout.getTypeSizeInBits(ty));
+  }
   CharUnits getAlignment(mlir::Type Ty) {
     return CharUnits::fromQuantity(dataLayout.layout.getTypeABIAlignment(Ty));
   }
@@ -124,6 +135,17 @@ struct CIRRecordLowering final {
   mlir::Type getStorageType(const CXXRecordDecl *RD) {
     return cirGenTypes.getCIRGenRecordLayout(RD).getBaseSubobjectCIRType();
   }
+  // This is different from LLVM traditional codegen because CIRGen uses arrays
+  // of bytes instead of arbitrary-sized integers. This is important for packed
+  // structures support.
+  mlir::Type getBitfieldStorageType(unsigned numBits) {
+    unsigned alignedBits = llvm::alignTo(numBits, astContext.getCharWidth());
+    if (cir::isValidFundamentalIntWidth(alignedBits))
+      return builder.getUIntNTy(alignedBits);
+
+    mlir::Type type = getCharType();
+    return cir::ArrayType::get(type, alignedBits / astContext.getCharWidth());
+  }
 
   mlir::Type getStorageType(const FieldDecl *fieldDecl) {
     mlir::Type type = cirGenTypes.convertTypeForMem(fieldDecl->getType());
@@ -157,6 +179,7 @@ struct CIRRecordLowering final {
   std::vector<MemberInfo> members;
   // Output fields, consumed by CIRGenTypes::computeRecordLayout
   llvm::SmallVector<mlir::Type, 16> fieldTypes;
+  llvm::DenseMap<const FieldDecl *, CIRGenBitFieldInfo> bitFields;
   llvm::DenseMap<const FieldDecl *, unsigned> fieldIdxMap;
   llvm::DenseMap<const CXXRecordDecl *, unsigned> nonVirtualBases;
   cir::CIRDataLayout dataLayout;
@@ -186,6 +209,32 @@ CIRRecordLowering::CIRRecordLowering(CIRGenTypes &cirGenTypes,
       zeroInitializable(true), zeroInitializableAsBase(true), packed(packed),
       padded(false) {}
 
+void CIRRecordLowering::setBitFieldInfo(const FieldDecl *fd,
+                                        CharUnits startOffset,
+                                        mlir::Type storageType) {
+  CIRGenBitFieldInfo &info = bitFields[fd->getCanonicalDecl()];
+  info.isSigned = fd->getType()->isSignedIntegerOrEnumerationType();
+  info.offset =
+      (unsigned)(getFieldBitOffset(fd) - astContext.toBits(startOffset));
+  info.size = fd->getBitWidthValue();
+  info.storageSize = getSizeInBits(storageType).getQuantity();
+  info.storageOffset = startOffset;
+  info.storageType = storageType;
+  info.name = fd->getName();
+
+  if (info.size > info.storageSize)
+    info.size = info.storageSize;
+  // Reverse the bit offsets for big endian machines. Since bitfields are laid
+  // out as packed bits within an integer-sized unit, we can imagine the bits
+  // counting from the most-significant-bit instead of the
+  // least-significant-bit.
+  assert(!cir::MissingFeatures::isBigEndian());
+
+  info.volatileStorageSize = 0;
+  info.volatileOffset = 0;
+  info.volatileStorageOffset = CharUnits::Zero();
+}
+
 void CIRRecordLowering::lower() {
   if (recordDecl->isUnion()) {
     lowerUnion();
@@ -233,6 +282,8 @@ void CIRRecordLowering::fillOutputFields() {
             fieldTypes.size() - 1;
       // A field without storage must be a bitfield.
       assert(!cir::MissingFeatures::bitfields());
+      if (!member.data)
+        setBitFieldInfo(member.fieldDecl, member.offset, fieldTypes.back());
     } else if (member.kind == MemberInfo::InfoKind::Base) {
       nonVirtualBases[member.cxxRecordDecl] = fieldTypes.size() - 1;
     }
@@ -240,16 +291,217 @@ void CIRRecordLowering::fillOutputFields() {
   }
 }
 
-void CIRRecordLowering::accumulateFields() {
-  for (const FieldDecl *field : recordDecl->fields()) {
-    if (field->isBitField()) {
-      cirGenTypes.getCGModule().errorNYI(recordDecl->getSourceRange(),
-                                         "accumulate bitfields");
+RecordDecl::field_iterator
+CIRRecordLowering::accumulateBitFields(RecordDecl::field_iterator field,
+                                       RecordDecl::field_iterator fieldEnd) {
+  assert(!cir::MissingFeatures::isDiscreteBitFieldABI());
+
+  CharUnits regSize =
+      bitsToCharUnits(astContext.getTargetInfo().getRegisterWidth());
+  unsigned charBits = astContext.getCharWidth();
+
+  // Data about the start of the span we're accumulating to create an access
+  // unit from. 'Begin' is the first bitfield of the span. If 'begin' is
+  // 'fieldEnd', we've not got a current span. The span starts at the
+  // 'beginOffset' character boundary. 'bitSizeSinceBegin' is the size (in bits)
+  // of the span -- this might include padding when we've advanced to a
+  // subsequent bitfield run.
+  RecordDecl::field_iterator begin = fieldEnd;
+  CharUnits beginOffset;
+  uint64_t bitSizeSinceBegin;
+
+  // The (non-inclusive) end of the largest acceptable access unit we've found
+  // since 'begin'. If this is 'begin', we're gathering the initial set of
+  // bitfields of a new span. 'bestEndOffset' is the end of that acceptable
+  // access unit -- it might extend beyond the last character of the bitfield
+  // run, using available padding characters.
+  RecordDecl::field_iterator bestEnd = begin;
+  CharUnits bestEndOffset;
+  bool bestClipped; // Whether the representation must be in a byte array.
+
+  for (;;) {
+    // atAlignedBoundary is true if 'field' is the (potential) start of a new
+    // span (or the end of the bitfields). When true, limitOffset is the
+    // character offset of that span and barrier indicates whether the new
+    // span cannot be merged into the current one.
+    bool atAlignedBoundary = false;
+    bool barrier = false; // a barrier can be a zero Bit Width or non bit member
+    if (field != fieldEnd && field->isBitField()) {
+      uint64_t bitOffset = getFieldBitOffset(*field);
+      if (begin == fieldEnd) {
+        // Beginning a new span.
+        begin = field;
+        bestEnd = begin;
+
+        assert((bitOffset % charBits) == 0 && "Not at start of char");
+        beginOffset = bitsToCharUnits(bitOffset);
+        bitSizeSinceBegin = 0;
+      } else if ((bitOffset % charBits) != 0) {
+        // Bitfield occupies the same character as previous bitfield, it must be
+        // part of the same span. This can include zero-length bitfields, should
+        // the target not align them to character boundaries. Such non-alignment
+        // is at variance with the standards, which require zero-length
+        // bitfields be a barrier between access units. But of course we can't
+        // achieve that in the middle of a character.
+        assert(bitOffset ==
+                   astContext.toBits(beginOffset) + bitSizeSinceBegin &&
+               "Concatenating non-contiguous bitfields");
+      } else {
+        // Bitfield potentially begins a new span. This includes zero-length
+        // bitfields on non-aligning targets that lie at character boundaries
+        // (those are barriers to merging).
+        if (field->isZeroLengthBitField())
+          barrier = true;
+        atAlignedBoundary = true;
+      }
+    } else {
+      // We've reached the end of the bitfield run. Either we're done, or this
+      // is a barrier for the current span.
+      if (begin == fieldEnd)
+        break;
+
+      barrier = true;
+      atAlignedBoundary = true;
+    }
+
+    // 'installBest' indicates whether we should create an access unit for the
+    // current best span: fields ['begin', 'bestEnd') occupying characters
+    // ['beginOffset', 'bestEndOffset').
+    bool installBest = false;
+    if (atAlignedBoundary) {
+      // 'field' is the start of a new span or the end of the bitfields. The
+      // just-seen span now extends to 'bitSizeSinceBegin'.
+
+      // Determine if we can accumulate that just-seen span into the current
+      // accumulation.
+      CharUnits accessSize = bitsToCharUnits(bitSizeSinceBegin + charBits - 1);
+      if (bestEnd == begin) {
+        // This is the initial run at the start of a new span. By definition,
+        // this is the best seen so far.
+        bestEnd = field;
+        bestEndOffset = beginOffset + accessSize;
+        // Assume clipped until proven not below.
+        bestClipped = true;
+        if (!bitSizeSinceBegin)
+          // A zero-sized initial span -- this will install nothing and reset
+          // for another.
+          installBest = true;
+      } else if (accessSize > regSize) {
+        // Accumulating the just-seen span would create a multi-register access
+        // unit, which would increase register pressure.
+        installBest = true;
+      }
+
+      if (!installBest) {
+        // Determine if accumulating the just-seen span will create an expensive
+        // access unit or not.
+        mlir::Type type = getUIntNType(astContext.toBits(accessSize));
+        if (!astContext.getTargetInfo().hasCheapUnalignedBitFieldAccess())
+          cirGenTypes.getCGModule().errorNYI(
+              field->getSourceRange(), "NYI CheapUnalignedBitFieldAccess");
+
+        if (!installBest) {
+          // Find the next used storage offset to determine what the limit of
+          // the current span is. That's either the offset of the next field
+          // with storage (which might be field itself) or the end of the
+          // non-reusable tail padding.
+          CharUnits limitOffset;
+          for (auto probe = field; probe != fieldEnd; ++probe)
+            if (!isEmptyFieldForLayout(astContext, *probe)) {
+              // A member with storage sets the limit.
+              assert((getFieldBitOffset(*probe) % charBits) == 0 &&
+                     "Next storage is not byte-aligned");
+              limitOffset = bitsToCharUnits(getFieldBitOffset(*probe));
+              goto FoundLimit;
+            }
+          assert(!cir::MissingFeatures::cxxSupport());
+          limitOffset = astRecordLayout.getDataSize();
+        FoundLimit:
+          CharUnits typeSize = getSize(type);
+          if (beginOffset + typeSize <= limitOffset) {
+            // There is space before limitOffset to create a naturally-sized
+            // access unit.
+            bestEndOffset = beginOffset + typeSize;
+            bestEnd = field;
+            bestClipped = false;
+          }
+          if (barrier) {
+            // The next field is a barrier that we cannot merge across.
+            installBest = true;
+          } else if (cirGenTypes.getCGModule()
+                         .getCodeGenOpts()
+                         .FineGrainedBitfieldAccesses) {
+            assert(!cir::MissingFeatures::nonFineGrainedBitfields());
+            cirGenTypes.getCGModule().errorNYI(field->getSourceRange(),
+                                               "NYI FineGrainedBitfield");
+          } else {
+            // Otherwise, we're not installing. Update the bit size
+            // of the current span to go all the way to limitOffset, which is
+            // the (aligned) offset of next bitfield to consider.
+            bitSizeSinceBegin = astContext.toBits(limitOffset - beginOffset);
+          }
+        }
+      }
+    }
+
+    if (installBest) {
+      assert((field == fieldEnd || !field->isBitField() ||
+              (getFieldBitOffset(*field) % charBits) == 0) &&
+             "Installing but not at an aligned bitfield or limit");
+      CharUnits accessSize = bestEndOffset - beginOffset;
+      if (!accessSize.isZero()) {
+        // Add the storage member for the access unit to the record. The
+        // bitfields get the offset of their storage but come afterward and
+        // remain there after a stable sort.
+        mlir::Type type;
+        if (bestClipped) {
+          assert(getSize(getUIntNType(astContext.toBits(accessSize))) >
+                     accessSize &&
+                 "Clipped access need not be clipped");
+          type = getByteArrayType(accessSize);
+        } else {
+          type = getUIntNType(astContext.toBits(accessSize));
+          assert(getSize(type) == accessSize &&
+                 "Unclipped access must be clipped");
+        }
+        members.push_back(makeStorageInfo(beginOffset, type));
+        for (; begin != bestEnd; ++begin)
+          if (!begin->isZeroLengthBitField())
+            members.push_back(MemberInfo(
+                beginOffset, MemberInfo::InfoKind::Field, nullptr, *begin));
+      }
+      // Reset to start a new span.
+      field = bestEnd;
+      begin = fieldEnd;
+    } else {
+      assert(field != fieldEnd && field->isBitField() &&
+             "Accumulating past end of bitfields");
+      assert(!barrier && "Accumulating across barrier");
+      // Accumulate this bitfield into the current (potential) span.
+      bitSizeSinceBegin += field->getBitWidthValue();
       ++field;
+    }
+  }
+
+  return field;
+}
+
+void CIRRecordLowering::accumulateFields() {
+  for (RecordDecl::field_iterator field = recordDecl->field_begin(),
+                                  fieldEnd = recordDecl->field_end();
+       field != fieldEnd;) {
+    if (field->isBitField()) {
+      RecordDecl::field_iterator start = field;
+      // Iterate to gather the list of bitfields.
+      for (++field; field != fieldEnd && field->isBitField(); ++field)
+        ;
+      field = accumulateBitFields(start, field);
+      assert((field == fieldEnd || !field->isBitField()) &&
+             "Failed to accumulate all the bitfields");
     } else if (!field->isZeroSize(astContext)) {
-      members.push_back(MemberInfo(bitsToCharUnits(getFieldBitOffset(field)),
+      members.push_back(MemberInfo(bitsToCharUnits(getFieldBitOffset(*field)),
                                    MemberInfo::InfoKind::Field,
-                                   getStorageType(field), field));
+                                   getStorageType(*field), *field));
       ++field;
     } else {
       // TODO(cir): do we want to do anything special about zero size members?
@@ -383,6 +635,8 @@ CIRGenTypes::computeRecordLayout(const RecordDecl *rd, cir::RecordType *ty) {
   // Add all the field numbers.
   rl->fieldIdxMap.swap(lowering.fieldIdxMap);
 
+  rl->bitFields.swap(lowering.bitFields);
+
   // Dump the layout, if requested.
   if (getASTContext().getLangOpts().DumpRecordLayouts) {
     cgm.errorNYI(rd->getSourceRange(), "computeRecordLayout: dump layout");
diff --git a/clang/lib/CIR/CodeGen/TargetInfo.cpp b/clang/lib/CIR/CodeGen/TargetInfo.cpp
index 551341ff20c0..d2d32bbd9403 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CIR/CodeGen/TargetInfo.cpp
@@ -3,6 +3,43 @@
 
 using namespace clang;
 using namespace clang::CIRGen;
+
+bool clang::CIRGen::isEmptyRecordForLayout(const ASTContext &context,
+                                           QualType t) {
+  const RecordType *rt = t->getAs<RecordType>();
+  if (!rt)
+    return false;
+
+  const RecordDecl *rd = rt->getDecl();
+
+  // If this is a C++ record, check the bases first.
+  if (const CXXRecordDecl *cxxrd = dyn_cast<CXXRecordDecl>(rd)) {
+    if (cxxrd->isDynamicClass())
+      return false;
+
+    for (const auto &I : cxxrd->bases())
+      if (!isEmptyRecordForLayout(context, I.getType()))
+        return false;
+  }
+
+  for (const auto *I : rd->fields())
+    if (!isEmptyFieldForLayout(context, I))
+      return false;
+
+  return true;
+}
+
+bool clang::CIRGen::isEmptyFieldForLayout(const ASTContext &context,
+                                          const FieldDecl *fd) {
+  if (fd->isZeroLengthBitField())
+    return true;
+
+  if (fd->isUnnamedBitField())
+    return false;
+
+  return isEmptyRecordForLayout(context, fd->getType());
+}
+
 namespace {
 
 class X8664ABIInfo : public ABIInfo {
diff --git a/clang/lib/CIR/CodeGen/TargetInfo.h b/clang/lib/CIR/CodeGen/TargetInfo.h
index d31d1ee82d90..a5c548aa2c7c 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.h
+++ b/clang/lib/CIR/CodeGen/TargetInfo.h
@@ -22,6 +22,16 @@
 
 namespace clang::CIRGen {
 
+/// isEmptyFieldForLayout - Return true if the field is "empty", that is,
+/// either a zero-width bit-field or an isEmptyRecordForLayout.
+bool isEmptyFieldForLayout(const ASTContext &context, const FieldDecl *fd);
+
+/// isEmptyRecordForLayout - Return true if a structure contains only empty
+/// base classes (per  isEmptyRecordForLayout) and fields (per
+/// isEmptyFieldForLayout). Note, C++ record fields are considered empty
+/// if the [[no_unique_address]] attribute would have made them empty.
+bool isEmptyRecordForLayout(const ASTContext &context, QualType t);
+
 class TargetCIRGenInfo {
   std::unique_ptr<ABIInfo> info;
 
diff --git a/clang/test/CIR/CodeGen/bitfields.c b/clang/test/CIR/CodeGen/bitfields.c
new file mode 100644
index 000000000000..ff5c6bc1787b
--- /dev/null
+++ b/clang/test/CIR/CodeGen/bitfields.c
@@ -0,0 +1,78 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+typedef struct {
+  char a, b, c;
+  unsigned bits : 3;
+  unsigned more_bits : 4;
+  unsigned still_more_bits : 7;
+} A;
+
+// CIR-DAG:  !rec_A = !cir.record<struct "A" packed padded {!s8i, !s8i, !s8i, !u16i, !cir.array<!u8i x 3>}>
+// LLVM-DAG: %struct.A = type <{ i8, i8, i8, i16, [3 x i8] }>
+// OGCG-DAG: %struct.A = type <{ i8, i8, i8, i16, [3 x i8] }>
+
+typedef struct {
+  int a : 4;
+  int b : 5;
+  int c;
+} D;
+
+// CIR-DAG:  !rec_D = !cir.record<struct "D" {!u16i, !s32i}>
+// LLVM-DAG: %struct.D = type { i16, i32 }
+// OGCG-DAG: %struct.D = type { i16, i32 }
+
+typedef struct {
+  int a : 4;
+  int b : 27;
+  int c : 17;
+  int d : 2;
+  int e : 15;
+  unsigned f; // type other than int above, not a bitfield
+} S;
+// CIR-DAG:  !rec_S = !cir.record<struct "S" {!u64i, !u16i, !u32i}>
+// LLVM-DAG: %struct.S = type { i64, i16, i32 }
+// OGCG-DAG: %struct.S = type { i64, i16, i32 }
+
+typedef struct {
+  int a : 3;  // one bitfield with size < 8
+  unsigned b;
+} T;
+
+// CIR-DAG:  !rec_T = !cir.record<struct "T" {!u8i, !u32i}>
+// LLVM-DAG: %struct.T = type { i8, i32 }
+// OGCG-DAG: %struct.T = type { i8, i32 }
+
+typedef struct {
+    char a;
+    char b;
+    char c;
+
+    // startOffset 24 bits, new storage from here
+    int d: 2;
+    int e: 2;
+    int f: 4;
+    int g: 25;
+    int h: 3;
+    int i: 4;
+    int j: 3;
+    int k: 8;
+
+    int l: 14;
+} U;
+
+// CIR-DAG:  !rec_U = !cir.record<struct "U" packed {!s8i, !s8i, !s8i, !u8i, !u64i}>
+// LLVM-DAG: %struct.U = type <{ i8, i8, i8, i8, i64 }>
+// OGCG-DAG: %struct.U = type <{ i8, i8, i8, i8, i64 }>
+
+void def() {
+  A a;
+  D d;
+  S s;
+  T t;
+  U u;
+}
diff --git a/clang/test/CIR/CodeGen/bitfields.cpp b/clang/test/CIR/CodeGen/bitfields.cpp
new file mode 100644
index 000000000000..762d24988474
--- /dev/null
+++ b/clang/test/CIR/CodeGen/bitfields.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+typedef struct {
+  int a : 4;
+  int b : 27;
+  int c : 17;
+  int d : 2;
+  int e : 15;
+  unsigned f; // type other than int above, not a bitfield
+} S;
+// CIR-DAG:  !rec_S = !cir.record<struct "S" {!u64i, !u16i, !u32i}>
+// LLVM-DAG: %struct.S = type { i64, i16, i32 }
+// OGCG-DAG: %struct.S = type { i64, i16, i32 }
+
+typedef struct {
+  int a : 3;  // one bitfield with size < 8
+  unsigned b;
+} T;
+
+// CIR-DAG:  !rec_T = !cir.record<struct "T" {!u8i, !u32i}>
+// LLVM-DAG: %struct.T = type { i8, i32 }
+// OGCG-DAG: %struct.T = type { i8, i32 }
+
+void def() {
+  S s;
+  T t;
+}

From e0633d59b9d0f931d3917e51a51b5aa7d7e775ac Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Fri, 20 Jun 2025 15:04:50 +0100
Subject: [PATCH 1052/1322] [Offload] Check for initialization (#144370)

All entry points (except olInit) now check that offload has been
initialized. If not, a new `OL_ERRC_UNINITIALIZED` error is returned.
---
 offload/liboffload/API/Common.td              |  1 +
 offload/liboffload/include/OffloadImpl.hpp    |  1 +
 offload/liboffload/src/OffloadImpl.cpp        |  3 ++-
 .../tools/offload-tblgen/EntryPointGen.cpp    | 12 ++++++++++
 offload/unittests/OffloadAPI/CMakeLists.txt   |  4 ++++
 .../OffloadAPI/common/Environment.cpp         |  2 ++
 offload/unittests/OffloadAPI/init/olInit.cpp  | 22 +++++++++++++++++++
 7 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 offload/unittests/OffloadAPI/init/olInit.cpp

diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index 8a2ecd6c6e8f..cd8c3c63fde8 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -106,6 +106,7 @@ def ErrorCode : Enum {
     Etor<"ASSEMBLE_FAILURE", "assembler failure while processing binary image">,
     Etor<"LINK_FAILURE", "linker failure while processing binary image">,
     Etor<"BACKEND_FAILURE", "the plugin backend is in an invalid or unsupported state">,
+    Etor<"UNINITIALIZED", "not initialized">,
 
     // Handle related errors - only makes sense for liboffload
     Etor<"INVALID_NULL_HANDLE", "a handle argument is null when it should not be">,
diff --git a/offload/liboffload/include/OffloadImpl.hpp b/offload/liboffload/include/OffloadImpl.hpp
index a12d8c47a180..f98164d5e178 100644
--- a/offload/liboffload/include/OffloadImpl.hpp
+++ b/offload/liboffload/include/OffloadImpl.hpp
@@ -26,6 +26,7 @@ namespace llvm {
 namespace offload {
 bool isTracingEnabled();
 bool isValidationEnabled();
+bool isOffloadInitialized();
 } // namespace offload
 } // namespace llvm
 
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index f02497c0a633..8a487563f610 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -120,9 +120,10 @@ struct OffloadContext {
 
 // If the context is uninited, then we assume tracing is disabled
 bool isTracingEnabled() {
-  return OffloadContextVal && OffloadContext::get().TracingEnabled;
+  return isOffloadInitialized() && OffloadContext::get().TracingEnabled;
 }
 bool isValidationEnabled() { return OffloadContext::get().ValidationEnabled; }
+bool isOffloadInitialized() { return OffloadContextVal != nullptr; }
 
 template <typename HandleT> Error olDestroy(HandleT Handle) {
   delete Handle;
diff --git a/offload/tools/offload-tblgen/EntryPointGen.cpp b/offload/tools/offload-tblgen/EntryPointGen.cpp
index 13aa0d1f6318..4e42e4905b99 100644
--- a/offload/tools/offload-tblgen/EntryPointGen.cpp
+++ b/offload/tools/offload-tblgen/EntryPointGen.cpp
@@ -82,6 +82,10 @@ static void EmitEntryPointFunc(const FunctionRec &F, raw_ostream &OS) {
   }
   OS << ") {\n";
 
+  // Check offload is initialized
+  if (F.getName() != "olInit")
+    OS << "if (!llvm::offload::isOffloadInitialized()) return &UninitError;";
+
   // Emit pre-call prints
   OS << TAB_1 "if (llvm::offload::isTracingEnabled()) {\n";
   OS << formatv(TAB_2 "llvm::errs() << \"---> {0}\";\n", F.getName());
@@ -143,6 +147,14 @@ static void EmitCodeLocWrapper(const FunctionRec &F, raw_ostream &OS) {
 
 void EmitOffloadEntryPoints(const RecordKeeper &Records, raw_ostream &OS) {
   OS << GenericHeader;
+
+  constexpr const char *UninitMessage =
+      "liboffload has not been initialized - please call olInit before using "
+      "this API";
+  OS << formatv("static {0}_error_struct_t UninitError = "
+                "{{{1}_ERRC_UNINITIALIZED, \"{2}\"};",
+                PrefixLower, PrefixUpper, UninitMessage);
+
   for (auto *R : Records.getAllDerivedDefinitions("Function")) {
     EmitValidationFunc(FunctionRec{R}, OS);
     EmitEntryPointFunc(FunctionRec{R}, OS);
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 2844b675e5de..05e862865ed3 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -12,6 +12,10 @@ add_offload_unittest("event"
     event/olDestroyEvent.cpp
     event/olWaitEvent.cpp)
 
+add_offload_unittest("init"
+    init/olInit.cpp)
+target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
+
 add_offload_unittest("kernel"
     kernel/olGetKernel.cpp
     kernel/olLaunchKernel.cpp)
diff --git a/offload/unittests/OffloadAPI/common/Environment.cpp b/offload/unittests/OffloadAPI/common/Environment.cpp
index 943347246b6d..ef092cd4187d 100644
--- a/offload/unittests/OffloadAPI/common/Environment.cpp
+++ b/offload/unittests/OffloadAPI/common/Environment.cpp
@@ -17,11 +17,13 @@ using namespace llvm;
 
 // Wrapper so we don't have to constantly init and shutdown Offload in every
 // test, while having sensible lifetime for the platform environment
+#ifndef DISABLE_WRAPPER
 struct OffloadInitWrapper {
   OffloadInitWrapper() { olInit(); }
   ~OffloadInitWrapper() { olShutDown(); }
 };
 static OffloadInitWrapper Wrapper{};
+#endif
 
 static cl::opt<std::string>
     SelectedPlatform("platform", cl::desc("Only test the specified platform"),
diff --git a/offload/unittests/OffloadAPI/init/olInit.cpp b/offload/unittests/OffloadAPI/init/olInit.cpp
new file mode 100644
index 000000000000..8e27e77cd0fb
--- /dev/null
+++ b/offload/unittests/OffloadAPI/init/olInit.cpp
@@ -0,0 +1,22 @@
+//===------- Offload API tests - olInit -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: For this test suite, the implicit olInit/olShutDown doesn't happen, so
+// tests have to do it themselves
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+struct olInitTest : ::testing::Test {};
+
+TEST_F(olInitTest, Uninitialized) {
+  ASSERT_ERROR(OL_ERRC_UNINITIALIZED,
+               olIterateDevices(
+                   [](ol_device_handle_t, void *) { return false; }, nullptr));
+}

From f242360e156b407902829d694c036b2d22211894 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Fri, 20 Jun 2025 15:05:05 +0100
Subject: [PATCH 1053/1322] [Offload] Add type information to device info nodes
 (#144535)

Rather than being "stringly typed", store values as a std::variant that
can hold various types. This means that liboffload doesn't have to do
any string parsing for integer/bool device info keys.
---
 offload/liboffload/src/OffloadImpl.cpp        | 18 ++++----
 .../common/include/PluginInterface.h          | 41 +++++++++++++------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 8a487563f610..eba8e91ed688 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -246,23 +246,23 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
   ReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet);
 
   // Find the info if it exists under any of the given names
-  auto GetInfo = [&](std::vector<std::string> Names) {
+  auto GetInfoString = [&](std::vector<std::string> Names) {
     if (Device == OffloadContext::get().HostDevice())
-      return std::string("Host");
+      return "Host";
 
     if (!Device->Device)
-      return std::string("");
+      return "";
 
     auto Info = Device->Device->obtainInfoImpl();
     if (auto Err = Info.takeError())
-      return std::string("");
+      return "";
 
     for (auto Name : Names) {
       if (auto Entry = Info->get(Name))
-        return (*Entry)->Value;
+        return std::get<std::string>((*Entry)->Value).c_str();
     }
 
-    return std::string("");
+    return "";
   };
 
   switch (PropName) {
@@ -273,12 +273,12 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
                ? ReturnValue(OL_DEVICE_TYPE_HOST)
                : ReturnValue(OL_DEVICE_TYPE_GPU);
   case OL_DEVICE_INFO_NAME:
-    return ReturnValue(GetInfo({"Device Name"}).c_str());
+    return ReturnValue(GetInfoString({"Device Name"}));
   case OL_DEVICE_INFO_VENDOR:
-    return ReturnValue(GetInfo({"Vendor Name"}).c_str());
+    return ReturnValue(GetInfoString({"Vendor Name"}));
   case OL_DEVICE_INFO_DRIVER_VERSION:
     return ReturnValue(
-        GetInfo({"CUDA Driver Version", "HSA Runtime Version"}).c_str());
+        GetInfoString({"CUDA Driver Version", "HSA Runtime Version"}));
   default:
     return createOffloadError(ErrorCode::INVALID_ENUMERATION,
                               "getDeviceInfo enum '%i' is invalid", PropName);
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index f5d995532b7a..91df80030437 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -17,6 +17,7 @@
 #include <list>
 #include <map>
 #include <shared_mutex>
+#include <variant>
 #include <vector>
 
 #include "ExclusiveAccess.h"
@@ -122,7 +123,8 @@ struct InfoTreeNode {
   static constexpr uint64_t IndentSize = 4;
 
   std::string Key;
-  std::string Value;
+  using VariantType = std::variant<uint64_t, std::string, bool, std::monostate>;
+  VariantType Value;
   std::string Units;
   // Need to specify a default value number of elements here as `InfoTreeNode`'s
   // size is unknown. This is a vector (rather than a Key->Value map) since:
@@ -131,15 +133,15 @@ struct InfoTreeNode {
   // * The same key can appear multiple times
   std::unique_ptr<llvm::SmallVector<InfoTreeNode, 8>> Children;
 
-  InfoTreeNode() : InfoTreeNode("", "", "") {}
-  InfoTreeNode(std::string Key, std::string Value, std::string Units)
+  InfoTreeNode() : InfoTreeNode("", std::monostate{}, "") {}
+  InfoTreeNode(std::string Key, VariantType Value, std::string Units)
       : Key(Key), Value(Value), Units(Units) {}
 
   /// Add a new info entry as a child of this node. The entry requires at least
   /// a key string in \p Key. The value in \p Value is optional and can be any
   /// type that is representable as a string. The units in \p Units is optional
   /// and must be a string.
-  template <typename T = std::string>
+  template <typename T = std::monostate>
   InfoTreeNode *add(std::string Key, T Value = T(),
                     const std::string &Units = std::string()) {
     assert(!Key.empty() && "Invalid info key");
@@ -147,15 +149,15 @@ struct InfoTreeNode {
     if (!Children)
       Children = std::make_unique<llvm::SmallVector<InfoTreeNode, 8>>();
 
-    std::string ValueStr;
-    if constexpr (std::is_same_v<T, bool>)
-      ValueStr = Value ? "Yes" : "No";
+    VariantType ValueVariant;
+    if constexpr (std::is_same_v<T, bool> || std::is_same_v<T, std::monostate>)
+      ValueVariant = Value;
     else if constexpr (std::is_arithmetic_v<T>)
-      ValueStr = std::to_string(Value);
+      ValueVariant = static_cast<uint64_t>(Value);
     else
-      ValueStr = Value;
+      ValueVariant = std::string{Value};
 
-    return &Children->emplace_back(Key, ValueStr, Units);
+    return &Children->emplace_back(Key, ValueVariant, Units);
   }
 
   std::optional<InfoTreeNode *> get(StringRef Key) {
@@ -184,8 +186,23 @@ private:
           MaxKeySize - (Key.size() + KeyIndentSize) + IndentSize;
 
       llvm::outs() << std::string(KeyIndentSize, ' ') << Key
-                   << std::string(ValIndentSize, ' ') << Value
-                   << (Units.empty() ? "" : " ") << Units << "\n";
+                   << std::string(ValIndentSize, ' ');
+      std::visit(
+          [](auto &&V) {
+            using T = std::decay_t<decltype(V)>;
+            if constexpr (std::is_same_v<T, std::string>)
+              llvm::outs() << V;
+            else if constexpr (std::is_same_v<T, bool>)
+              llvm::outs() << (V ? "Yes" : "No");
+            else if constexpr (std::is_same_v<T, uint64_t>)
+              llvm::outs() << V;
+            else if constexpr (std::is_same_v<T, std::monostate>) {
+              // Do nothing
+            } else
+              static_assert(false, "doPrint visit not exhaustive");
+          },
+          Value);
+      llvm::outs() << (Units.empty() ? "" : " ") << Units << "\n";
     }
 
     // Print children

From 96ab74bf175f46de4b6fbfc68deecd3567e42a52 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Fri, 20 Jun 2025 10:32:31 -0400
Subject: [PATCH 1054/1322] [InstCombine] remove undef loads, such as memcpy
 from undef (#143958)

Extend `isAllocSiteRemovable` to be able to check if the ModRef info
indicates the alloca is only Ref or only Mod, and be able to remove it
accordingly. It seemed that there were a surprising number of
benchmarks with this pattern which weren't getting optimized previously
(due to MemorySSA walk limits). There were somewhat more existing tests
than I'd like to have modified which were simply doing exactly this
pattern (and thus relying on undef memory). Claude code contributed the
new tests (and found an important typo that I'd made).

This implements the discussion in
https://github.com/llvm/llvm-project/pull/143782#discussion_r2142720376.
---
 clang/test/Misc/loop-opt-setup.c              |   2 +-
 .../InstCombine/InstructionCombining.cpp      | 109 +++++++++++---
 .../Transforms/InstCombine/and-or-icmps.ll    |  74 +++++++--
 .../Transforms/InstCombine/apint-shift.ll     |   3 +-
 .../InstCombine/call-cast-target.ll           |  14 +-
 .../Transforms/InstCombine/dead-alloc-elim.ll | 140 ++++++++++++++++++
 .../Transforms/InstCombine/fp-ret-bitcast.ll  |   8 +-
 .../Transforms/InstCombine/getelementptr.ll   |  17 +--
 .../Transforms/InstCombine/malloc-free.ll     |   6 +-
 .../multiple-uses-load-bitcast-select.ll      |  20 +--
 llvm/test/Transforms/InstCombine/objsize.ll   |   8 +-
 .../Transforms/InstCombine/select-load.ll     |  22 +--
 .../InstCombine/shift-amount-reassociation.ll |   5 +-
 .../test/Transforms/InstCombine/vscale_gep.ll |  12 +-
 14 files changed, 329 insertions(+), 111 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/dead-alloc-elim.ll

diff --git a/clang/test/Misc/loop-opt-setup.c b/clang/test/Misc/loop-opt-setup.c
index 01643e6073b5..c1c620e52200 100644
--- a/clang/test/Misc/loop-opt-setup.c
+++ b/clang/test/Misc/loop-opt-setup.c
@@ -15,7 +15,7 @@ int foo(void) {
 // CHECK-NOT: br i1
 
 void Helper(void) {
-  const int *nodes[5];
+  const int *nodes[5] = {0};
   int num_active = 5;
 
   while (num_active)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index afd3359e22ff..bcc73090277a 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3277,12 +3277,13 @@ static bool isRemovableWrite(CallBase &CB, Value *UsedV,
   return Dest && Dest->Ptr == UsedV;
 }
 
-static bool isAllocSiteRemovable(Instruction *AI,
-                                 SmallVectorImpl<WeakTrackingVH> &Users,
-                                 const TargetLibraryInfo &TLI) {
+static std::optional<ModRefInfo>
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users,
+                     const TargetLibraryInfo &TLI, bool KnowInit) {
   SmallVector<Instruction*, 4> Worklist;
   const std::optional<StringRef> Family = getAllocationFamily(AI, &TLI);
   Worklist.push_back(AI);
+  ModRefInfo Access = KnowInit ? ModRefInfo::NoModRef : ModRefInfo::Mod;
 
   do {
     Instruction *PI = Worklist.pop_back_val();
@@ -3291,7 +3292,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
       switch (I->getOpcode()) {
       default:
         // Give up the moment we see something we can't handle.
-        return false;
+        return std::nullopt;
 
       case Instruction::AddrSpaceCast:
       case Instruction::BitCast:
@@ -3306,10 +3307,10 @@ static bool isAllocSiteRemovable(Instruction *AI,
         // We also fold comparisons in some conditions provided the alloc has
         // not escaped (see isNeverEqualToUnescapedAlloc).
         if (!ICI->isEquality())
-          return false;
+          return std::nullopt;
         unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
         if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
-          return false;
+          return std::nullopt;
 
         // Do not fold compares to aligned_alloc calls, as they may have to
         // return null in case the required alignment cannot be satisfied,
@@ -3329,7 +3330,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
         if (CB && TLI.getLibFunc(*CB->getCalledFunction(), TheLibFunc) &&
             TLI.has(TheLibFunc) && TheLibFunc == LibFunc_aligned_alloc &&
             !AlignmentAndSizeKnownValid(CB))
-          return false;
+          return std::nullopt;
         Users.emplace_back(I);
         continue;
       }
@@ -3339,14 +3340,21 @@ static bool isAllocSiteRemovable(Instruction *AI,
         if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
           switch (II->getIntrinsicID()) {
           default:
-            return false;
+            return std::nullopt;
 
           case Intrinsic::memmove:
           case Intrinsic::memcpy:
           case Intrinsic::memset: {
             MemIntrinsic *MI = cast<MemIntrinsic>(II);
-            if (MI->isVolatile() || MI->getRawDest() != PI)
-              return false;
+            if (MI->isVolatile())
+              return std::nullopt;
+            // Note: this could also be ModRef, but we can still interpret that
+            // as just Mod in that case.
+            ModRefInfo NewAccess =
+                MI->getRawDest() == PI ? ModRefInfo::Mod : ModRefInfo::Ref;
+            if ((Access & ~NewAccess) != ModRefInfo::NoModRef)
+              return std::nullopt;
+            Access |= NewAccess;
             [[fallthrough]];
           }
           case Intrinsic::assume:
@@ -3365,11 +3373,6 @@ static bool isAllocSiteRemovable(Instruction *AI,
           }
         }
 
-        if (isRemovableWrite(*cast<CallBase>(I), PI, TLI)) {
-          Users.emplace_back(I);
-          continue;
-        }
-
         if (Family && getFreedOperand(cast<CallBase>(I), &TLI) == PI &&
             getAllocationFamily(I, &TLI) == Family) {
           Users.emplace_back(I);
@@ -3383,12 +3386,33 @@ static bool isAllocSiteRemovable(Instruction *AI,
           continue;
         }
 
-        return false;
+        if (!isRefSet(Access) &&
+            isRemovableWrite(*cast<CallBase>(I), PI, TLI)) {
+          Access |= ModRefInfo::Mod;
+          Users.emplace_back(I);
+          continue;
+        }
+
+        return std::nullopt;
 
       case Instruction::Store: {
         StoreInst *SI = cast<StoreInst>(I);
         if (SI->isVolatile() || SI->getPointerOperand() != PI)
-          return false;
+          return std::nullopt;
+        if (isRefSet(Access))
+          return std::nullopt;
+        Access |= ModRefInfo::Mod;
+        Users.emplace_back(I);
+        continue;
+      }
+
+      case Instruction::Load: {
+        LoadInst *LI = cast<LoadInst>(I);
+        if (LI->isVolatile() || LI->getPointerOperand() != PI)
+          return std::nullopt;
+        if (isModSet(Access))
+          return std::nullopt;
+        Access |= ModRefInfo::Ref;
         Users.emplace_back(I);
         continue;
       }
@@ -3396,7 +3420,9 @@ static bool isAllocSiteRemovable(Instruction *AI,
       llvm_unreachable("missing a return?");
     }
   } while (!Worklist.empty());
-  return true;
+
+  assert(Access != ModRefInfo::ModRef);
+  return Access;
 }
 
 Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
@@ -3424,10 +3450,31 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
     DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
   }
 
-  if (isAllocSiteRemovable(&MI, Users, TLI)) {
+  // Determine what getInitialValueOfAllocation would return without actually
+  // allocating the result.
+  bool KnowInitUndef = false;
+  bool KnowInitZero = false;
+  Constant *Init =
+      getInitialValueOfAllocation(&MI, &TLI, Type::getInt8Ty(MI.getContext()));
+  if (Init) {
+    if (isa<UndefValue>(Init))
+      KnowInitUndef = true;
+    else if (Init->isNullValue())
+      KnowInitZero = true;
+  }
+  // The various sanitizers don't actually return undef memory, but rather
+  // memory initialized with special forms of runtime poison
+  auto &F = *MI.getFunction();
+  if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
+      F.hasFnAttribute(Attribute::SanitizeAddress))
+    KnowInitUndef = false;
+
+  auto Removable =
+      isAllocSiteRemovable(&MI, Users, TLI, KnowInitZero | KnowInitUndef);
+  if (Removable) {
     for (unsigned i = 0, e = Users.size(); i != e; ++i) {
-      // Lowering all @llvm.objectsize calls first because they may
-      // use a bitcast/GEP of the alloca we are removing.
+      // Lowering all @llvm.objectsize and MTI calls first because they may use
+      // a bitcast/GEP of the alloca we are removing.
       if (!Users[i])
        continue;
 
@@ -3444,6 +3491,17 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
           eraseInstFromFunction(*I);
           Users[i] = nullptr; // Skip examining in the next loop.
         }
+        if (auto *MTI = dyn_cast<MemTransferInst>(I)) {
+          if (KnowInitZero && isRefSet(*Removable)) {
+            IRBuilderBase::InsertPointGuard Guard(Builder);
+            Builder.SetInsertPoint(MTI);
+            auto *M = Builder.CreateMemSet(
+                MTI->getRawDest(),
+                ConstantInt::get(Type::getInt8Ty(MI.getContext()), 0),
+                MTI->getLength(), MTI->getDestAlign());
+            M->copyMetadata(*MTI);
+          }
+        }
       }
     }
     for (unsigned i = 0, e = Users.size(); i != e; ++i) {
@@ -3466,7 +3524,14 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
       } else {
         // Casts, GEP, or anything else: we're about to delete this instruction,
         // so it can not have any valid uses.
-        replaceInstUsesWith(*I, PoisonValue::get(I->getType()));
+        Constant *Replace;
+        if (isa<LoadInst>(I)) {
+          assert(KnowInitZero || KnowInitUndef);
+          Replace = KnowInitUndef ? UndefValue::get(I->getType())
+                                  : Constant::getNullValue(I->getType());
+        } else
+          Replace = PoisonValue::get(I->getType());
+        replaceInstUsesWith(*I, Replace);
       }
       eraseInstFromFunction(*I);
     }
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
index 8824ae48417b..42e502074812 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
@@ -364,23 +364,77 @@ define <2 x i1> @and_ne_with_diff_one_splatvec(<2 x i32> %x) {
 
 define void @simplify_before_foldAndOfICmps(ptr %p) {
 ; CHECK-LABEL: @simplify_before_foldAndOfICmps(
-; CHECK-NEXT:    [[A8:%.*]] = alloca i16, align 2
-; CHECK-NEXT:    [[L7:%.*]] = load i16, ptr [[A8]], align 2
+; CHECK-NEXT:    store i1 true, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    store ptr null, ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %A8 = alloca i16
+  %L7 = load i16, ptr %A8
+  %G21 = getelementptr i16, ptr %A8, i8 -1
+  %B11 = udiv i16 %L7, -1
+  %G4 = getelementptr i16, ptr %A8, i16 %B11
+  %L2 = load i16, ptr %G4
+  %L = load i16, ptr %G4
+  %B23 = mul i16 %B11, %B11
+  %L4 = load i16, ptr %A8
+  %B21 = sdiv i16 %L7, %L4
+  %B7 = sub i16 0, %B21
+  %B18 = mul i16 %B23, %B7
+  %C10 = icmp ugt i16 %L, %B11
+  %B20 = and i16 %L7, %L2
+  %B1 = mul i1 %C10, true
+  %C5 = icmp sle i16 %B21, %L
+  %C11 = icmp ule i16 %B21, %L
+  %C7 = icmp slt i16 %B20, 0
+  %B29 = srem i16 %L4, %B18
+  %B15 = add i1 %C7, %C10
+  %B19 = add i1 %C11, %B15
+  %C6 = icmp sge i1 %C11, %B19
+  %B33 = or i16 %B29, %L4
+  %C13 = icmp uge i1 %C5, %B1
+  %C3 = icmp ult i1 %C13, %C6
+  store i16 undef, ptr %G21
+  %C18 = icmp ule i1 %C10, %C7
+  %G26 = getelementptr i1, ptr null, i1 %C3
+  store i16 %B33, ptr %p
+  store i1 %C18, ptr %p
+  store ptr %G26, ptr %p
+  ret void
+}
+
+define void @simplify_before_foldAndOfICmps2(ptr %p, ptr %A8) "instcombine-no-verify-fixpoint" {
+; CHECK-LABEL: @simplify_before_foldAndOfICmps2(
+; CHECK-NEXT:    [[L7:%.*]] = load i16, ptr [[A8:%.*]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i16 [[L7]], -1
 ; CHECK-NEXT:    [[B11:%.*]] = zext i1 [[TMP1]] to i16
-; CHECK-NEXT:    [[C10:%.*]] = icmp ugt i16 [[L7]], [[B11]]
-; CHECK-NEXT:    [[C7:%.*]] = icmp slt i16 [[L7]], 0
-; CHECK-NEXT:    [[C3:%.*]] = and i1 [[C7]], [[C10]]
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i1 [[C10]], true
-; CHECK-NEXT:    [[C18:%.*]] = or i1 [[C7]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i1 [[C3]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[TMP1]] to i64
+; CHECK-NEXT:    [[G4:%.*]] = getelementptr i16, ptr [[A8]], i64 [[TMP2]]
+; CHECK-NEXT:    [[L2:%.*]] = load i16, ptr [[G4]], align 2
+; CHECK-NEXT:    [[L4:%.*]] = load i16, ptr [[A8]], align 2
+; CHECK-NEXT:    [[B21:%.*]] = sdiv i16 [[L7]], [[L4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP1]], i16 [[B21]], i16 0
+; CHECK-NEXT:    [[B18:%.*]] = sub i16 0, [[TMP5]]
+; CHECK-NEXT:    [[C11:%.*]] = icmp ugt i16 [[L2]], [[B11]]
+; CHECK-NEXT:    [[B20:%.*]] = and i16 [[L7]], [[L2]]
+; CHECK-NEXT:    [[C5:%.*]] = icmp sgt i16 [[B21]], [[L2]]
+; CHECK-NEXT:    [[C12:%.*]] = icmp ule i16 [[B21]], [[L2]]
+; CHECK-NEXT:    [[C10:%.*]] = icmp slt i16 [[B20]], 0
+; CHECK-NEXT:    [[B29:%.*]] = srem i16 [[L4]], [[B18]]
+; CHECK-NEXT:    [[B15:%.*]] = xor i1 [[C10]], [[C11]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and i1 [[C12]], [[B15]]
+; CHECK-NEXT:    [[C6:%.*]] = xor i1 [[TMP6]], true
+; CHECK-NEXT:    [[B33:%.*]] = or i16 [[B29]], [[L4]]
+; CHECK-NEXT:    [[C3:%.*]] = and i1 [[C5]], [[C6]]
+; CHECK-NEXT:    [[C4:%.*]] = and i1 [[C3]], [[C11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[C11]], true
+; CHECK-NEXT:    [[C18:%.*]] = or i1 [[C10]], [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i1 [[C4]] to i64
 ; CHECK-NEXT:    [[G26:%.*]] = getelementptr i1, ptr null, i64 [[TMP3]]
-; CHECK-NEXT:    store i16 [[L7]], ptr [[P:%.*]], align 2
+; CHECK-NEXT:    store i16 [[B33]], ptr [[P:%.*]], align 2
 ; CHECK-NEXT:    store i1 [[C18]], ptr [[P]], align 1
 ; CHECK-NEXT:    store ptr [[G26]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
-  %A8 = alloca i16
   %L7 = load i16, ptr %A8
   %G21 = getelementptr i16, ptr %A8, i8 -1
   %B11 = udiv i16 %L7, -1
diff --git a/llvm/test/Transforms/InstCombine/apint-shift.ll b/llvm/test/Transforms/InstCombine/apint-shift.ll
index 3cc530bdbd02..4dd0811bb7ec 100644
--- a/llvm/test/Transforms/InstCombine/apint-shift.ll
+++ b/llvm/test/Transforms/InstCombine/apint-shift.ll
@@ -562,11 +562,10 @@ define i40 @test26(i40 %A) {
 
 ; OSS-Fuzz #9880
 ; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9880
-define i177 @ossfuzz_9880(i177 %X) {
+define i177 @ossfuzz_9880(i177 %X, ptr %A) {
 ; CHECK-LABEL: @ossfuzz_9880(
 ; CHECK-NEXT:    ret i177 0
 ;
-  %A = alloca i177
   %L1 = load i177, ptr %A
   %B = or i177 0, -1
   %B5 = udiv i177 %L1, %B
diff --git a/llvm/test/Transforms/InstCombine/call-cast-target.ll b/llvm/test/Transforms/InstCombine/call-cast-target.ll
index 2cedc6c81d73..2f4b4ad2409e 100644
--- a/llvm/test/Transforms/InstCombine/call-cast-target.ll
+++ b/llvm/test/Transforms/InstCombine/call-cast-target.ll
@@ -110,19 +110,17 @@ entry:
 
 declare i1 @fn5(ptr byval({ i32, i32 }) align 4 %r)
 
-define i1 @test5() {
-; CHECK-LABEL: define i1 @test5() {
-; CHECK-NEXT:    [[TMP1:%.*]] = alloca { i32, i32 }, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 4
+define i1 @test5(ptr %ptr) {
+; CHECK-LABEL: define i1 @test5(ptr %ptr) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i32 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @fn5(i32 [[TMP2]], i32 [[TMP4]])
 ; CHECK-NEXT:    ret i1 [[TMP5]]
 ;
-  %1 = alloca { i32, i32 }, align 4
-  %2 = getelementptr inbounds { i32, i32 }, ptr %1, i32 0, i32 0
+  %2 = getelementptr inbounds { i32, i32 }, ptr %ptr, i32 0, i32 0
   %3 = load i32, ptr %2, align 4
-  %4 = getelementptr inbounds { i32, i32 }, ptr %1, i32 0, i32 1
+  %4 = getelementptr inbounds { i32, i32 }, ptr %ptr, i32 0, i32 1
   %5 = load i32, ptr %4, align 4
   %6 = call i1 @fn5(i32 %3, i32 %5)
   ret i1 %6
diff --git a/llvm/test/Transforms/InstCombine/dead-alloc-elim.ll b/llvm/test/Transforms/InstCombine/dead-alloc-elim.ll
new file mode 100644
index 000000000000..b135f76f709f
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/dead-alloc-elim.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+declare noalias ptr @calloc(i32, i32) nounwind allockind("alloc,zeroed") allocsize(0,1) "alloc-family"="malloc"
+declare void @free(ptr) allockind("free") "alloc-family"="malloc"
+
+; Test load from uninitialized alloca - should be removed and replaced with undef
+define i32 @test_load_uninitialized_alloca() {
+; CHECK-LABEL: @test_load_uninitialized_alloca(
+; CHECK-NEXT:    ret i32 undef
+;
+  %a = alloca i32
+  %v = load i32, ptr %a
+  ret i32 %v
+}
+
+; Test load from zero-initialized malloc - should be removed and replaced with zero
+define i32 @test_load_zero_initialized_malloc() {
+; CHECK-LABEL: @test_load_zero_initialized_malloc(
+; CHECK-NEXT:    ret i32 0
+;
+  %a = call ptr @calloc(i32 1, i32 4)
+  %v = load i32, ptr %a
+  call void @free(ptr %a)
+  ret i32 %v
+}
+
+; Test memcpy from uninitialized source - should be removed
+define void @test_memcpy_from_uninitialized_alloca(ptr %dest) {
+; CHECK-LABEL: @test_memcpy_from_uninitialized_alloca(
+; CHECK-NEXT:    ret void
+;
+  %src = alloca i32, align 1
+  call void @llvm.memcpy.p0.p0.i32(ptr %src, ptr %src, i32 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 4, i1 false)
+  ret void
+}
+
+; Test memcpy from zeroed source - should transform to memset with zero
+define void @test_memcpy_from_uninitialized_calloc(ptr %dest) {
+; CHECK-LABEL: @test_memcpy_from_uninitialized_calloc(
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr noundef nonnull align 1 dereferenceable(16) [[DEST:%.*]], i8 0, i32 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %src = call ptr @calloc(i32 1, i32 16)
+  call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 16, i1 false)
+  call void @free(ptr %src)
+  ret void
+}
+
+; Test mixed read/write pattern - should not be removable due to write before read
+define i32 @test_write_then_read_alloca() {
+; CHECK-LABEL: @test_write_then_read_alloca(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i8 42, ptr [[A]], align 1
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %a = alloca i32
+  store i8 42, ptr %a
+  %v = load i32, ptr %a
+  ret i32 %v
+}
+
+; Test read then write pattern - should not be removable due to conflicting access
+define void @test_read_then_write_alloca() {
+; CHECK-LABEL: @test_read_then_write_alloca(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[V8:%.*]] = trunc i32 [[V]] to i8
+; CHECK-NEXT:    store i8 [[V8]], ptr [[A]], align 1
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32
+  %v = load i32, ptr %a
+  %v8 = trunc i32 %v to i8
+  store i8 %v8, ptr %a
+  ret void
+}
+
+; Test load through GEP from uninitialized alloca
+define i8 @test_load_gep_uninitialized_alloca() {
+; CHECK-LABEL: @test_load_gep_uninitialized_alloca(
+; CHECK-NEXT:    ret i8 undef
+;
+  %a = alloca [4 x i8]
+  %gep = getelementptr [4 x i8], ptr %a, i32 0, i32 2
+  %v = load i8, ptr %gep
+  ret i8 %v
+}
+
+; Test load through bitcast from uninitialized alloca
+define i16 @test_load_bitcast_uninitialized_alloca() {
+; CHECK-LABEL: @test_load_bitcast_uninitialized_alloca(
+; CHECK-NEXT:    ret i16 undef
+;
+  %a = alloca i32
+  %bc = bitcast ptr %a to ptr
+  %v = load i16, ptr %bc
+  ret i16 %v
+}
+
+; Test memmove from zero-initialized malloc
+define void @test_memmove_from_zero_initialized_malloc(ptr %dest) {
+; CHECK-LABEL: @test_memmove_from_zero_initialized_malloc(
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr noundef nonnull align 1 dereferenceable(32) [[DEST:%.*]], i8 0, i32 32, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %src = call ptr @calloc(i32 8, i32 4)
+  call void @llvm.memmove.p0.p0.i32(ptr %dest, ptr %src, i32 32, i1 false)
+  call void @free(ptr %src)
+  ret void
+}
+
+; Test multiple loads from same uninitialized alloca
+define { i32, i32 } @test_multiple_loads_uninitialized_alloca() {
+; CHECK-LABEL: @test_multiple_loads_uninitialized_alloca(
+; CHECK-NEXT:    ret { i32, i32 } undef
+;
+  %a = alloca [2 x i32]
+  %gep1 = getelementptr [2 x i32], ptr %a, i32 0, i32 0
+  %gep2 = getelementptr [2 x i32], ptr %a, i32 0, i32 1
+  %v1 = load i32, ptr %gep1
+  %v2 = load i32, ptr %gep2
+  %ret = insertvalue { i32, i32 } { i32 undef, i32 poison }, i32 %v1, 0
+  %ret2 = insertvalue { i32, i32 } %ret, i32 %v2, 1
+  ret { i32, i32 } %ret2
+}
+
+; Test that volatile operations prevent removal
+define i32 @test_volatile_load_prevents_removal() {
+; CHECK-LABEL: @test_volatile_load_prevents_removal(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[V:%.*]] = load volatile i32, ptr [[A]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %a = alloca i32
+  %v = load volatile i32, ptr %a
+  ret i32 %v
+}
diff --git a/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll b/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll
index 15eb3e15ea44..af18a427ee37 100644
--- a/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/fp-ret-bitcast.ll
@@ -12,11 +12,11 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1
 @"\01L_OBJC_METH_VAR_NAME_112" = internal global [15 x i8] c"whiteComponent\00", section "__TEXT,__cstring,cstring_literals"
 @"\01L_OBJC_SELECTOR_REFERENCES_81" = internal global ptr @"\01L_OBJC_METH_VAR_NAME_112", section "__OBJC,__message_refs,literal_pointers,no_dead_strip"
 
-define void @bork() nounwind  {
+define void @bork(ptr %color, ptr %color.466) nounwind  {
 ; CHECK-LABEL: @bork(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COLOR:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    [[TMP103:%.*]] = load ptr, ptr [[COLOR]], align 4
+; CHECK-NEXT:    [[TMP103:%.*]] = load ptr, ptr [[COLOR:%.*]], align 4
+; CHECK-NEXT:    store ptr [[TMP103]], ptr [[COLOR_466:%.*]], align 4
 ; CHECK-NEXT:    [[TMP105:%.*]] = load ptr, ptr @"\01L_OBJC_SELECTOR_REFERENCES_81", align 4
 ; CHECK-NEXT:    [[TMP107:%.*]] = call float @objc_msgSend_fpret(ptr [[TMP103]], ptr [[TMP105]]) #[[ATTR0:[0-9]+]]
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
@@ -24,8 +24,6 @@ define void @bork() nounwind  {
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %color = alloca ptr
-  %color.466 = alloca ptr
   %tmp103 = load ptr, ptr %color, align 4
   store ptr %tmp103, ptr %color.466, align 4
   %tmp105 = load ptr, ptr @"\01L_OBJC_SELECTOR_REFERENCES_81", align 4
diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll
index e78d70058c14..7568c6edc429 100644
--- a/llvm/test/Transforms/InstCombine/getelementptr.ll
+++ b/llvm/test/Transforms/InstCombine/getelementptr.ll
@@ -580,13 +580,11 @@ define i32 @test20_as1(ptr addrspace(1) %P, i32 %A, i32 %B) {
 }
 
 
-define i32 @test21() {
+define i32 @test21(ptr %pbob1) {
 ; CHECK-LABEL: @test21(
-; CHECK-NEXT:    [[PBOB1:%.*]] = alloca [[INTSTRUCT:%.*]], align 8
-; CHECK-NEXT:    [[RVAL:%.*]] = load i32, ptr [[PBOB1]], align 4
+; CHECK-NEXT:    [[RVAL:%.*]] = load i32, ptr [[PBOB1:%.*]], align 4
 ; CHECK-NEXT:    ret i32 [[RVAL]]
 ;
-  %pbob1 = alloca %intstruct
   %pbob2 = getelementptr %intstruct, ptr %pbob1
   %rval = load i32, ptr %pbob2
   ret i32 %rval
@@ -654,18 +652,16 @@ define i1 @test26(ptr %arr) {
   %struct.siginfo_t = type { i32, i32, i32, { { i32, i32, [0 x i8], %struct.sigval_t, i32 }, [88 x i8] } }
   %struct.sigval_t = type { ptr }
 
-define i32 @test27(ptr %to, ptr %from) {
+define i32 @test27(ptr %to, ptr %from, ptr %from_addr) {
 ; CHECK-LABEL: @test27(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[FROM_ADDR:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    [[T344:%.*]] = load ptr, ptr [[FROM_ADDR]], align 8
+; CHECK-NEXT:    [[T344:%.*]] = load ptr, ptr [[FROM_ADDR:%.*]], align 8
 ; CHECK-NEXT:    [[T348:%.*]] = getelementptr i8, ptr [[T344]], i64 24
 ; CHECK-NEXT:    [[T351:%.*]] = load i32, ptr [[T348]], align 8
 ; CHECK-NEXT:    [[T360:%.*]] = call i32 asm sideeffect "...", "=r,ir,*m,i,0,~{dirflag},~{fpsr},~{flags}"(i32 [[T351]], ptr elementtype([[STRUCT___LARGE_STRUCT:%.*]]) null, i32 -14, i32 0) #[[ATTR0:[0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 entry:
-  %from_addr = alloca ptr
   %t344 = load ptr, ptr %from_addr, align 8
   %t345 = getelementptr %struct.siginfo_t, ptr %t344, i32 0, i32 3
   %t346 = getelementptr { { i32, i32, [0 x i8], %struct.sigval_t, i32 }, [88 x i8] }, ptr %t345, i32 0, i32 0
@@ -1345,10 +1341,7 @@ declare noalias ptr @malloc(i64) nounwind allockind("alloc,uninitialized") alloc
 define i32 @test_gep_bitcast_malloc(ptr %a) {
 ; CHECK-LABEL: @test_gep_bitcast_malloc(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = call noalias dereferenceable_or_null(16) ptr @malloc(i64 16)
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr i8, ptr [[CALL]], i64 12
-; CHECK-NEXT:    [[A_C:%.*]] = load i32, ptr [[G3]], align 4
-; CHECK-NEXT:    ret i32 [[A_C]]
+; CHECK-NEXT:    ret i32 undef
 ;
 entry:
   %call = call noalias ptr @malloc(i64 16) #2
diff --git a/llvm/test/Transforms/InstCombine/malloc-free.ll b/llvm/test/Transforms/InstCombine/malloc-free.ll
index 1f556821270a..989074f97aaf 100644
--- a/llvm/test/Transforms/InstCombine/malloc-free.ll
+++ b/llvm/test/Transforms/InstCombine/malloc-free.ll
@@ -133,17 +133,13 @@ define void @test4() {
 
 define void @test5(ptr %ptr, ptr %esc) {
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT:    [[A:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
-; CHECK-NEXT:    [[B:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
 ; CHECK-NEXT:    [[C:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
 ; CHECK-NEXT:    [[D:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
 ; CHECK-NEXT:    [[E:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
 ; CHECK-NEXT:    [[F:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
 ; CHECK-NEXT:    [[G:%.*]] = call dereferenceable_or_null(700) ptr @malloc(i32 700)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(32) [[PTR:%.*]], ptr noundef nonnull align 1 dereferenceable(32) [[A]], i32 32, i1 false)
-; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(32) [[PTR]], ptr noundef nonnull align 1 dereferenceable(32) [[B]], i32 32, i1 false)
 ; CHECK-NEXT:    store ptr [[C]], ptr [[ESC:%.*]], align 4
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[D]], ptr [[PTR]], i32 32, i1 true)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[D]], ptr [[PTR:%.*]], i32 32, i1 true)
 ; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i32(ptr [[E]], ptr [[PTR]], i32 32, i1 true)
 ; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr [[F]], i8 5, i32 32, i1 true)
 ; CHECK-NEXT:    store volatile i8 4, ptr [[G]], align 1
diff --git a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
index 38fca0314ae1..5d3512e10f41 100644
--- a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
+++ b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
@@ -1,20 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=instcombine -S -data-layout="E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" | FileCheck %s
 
-define void @PR35618(ptr %st1, ptr %st2) {
+define void @PR35618(ptr %st1, ptr %st2, ptr %y1, ptr %z1) {
 ; CHECK-LABEL: @PR35618(
-; CHECK-NEXT:    [[Y1:%.*]] = alloca double, align 8
-; CHECK-NEXT:    [[Z1:%.*]] = alloca double, align 8
-; CHECK-NEXT:    [[LD1:%.*]] = load double, ptr [[Y1]], align 8
-; CHECK-NEXT:    [[LD2:%.*]] = load double, ptr [[Z1]], align 8
+; CHECK-NEXT:    [[LD1:%.*]] = load double, ptr [[Y1:%.*]], align 8
+; CHECK-NEXT:    [[LD2:%.*]] = load double, ptr [[Z1:%.*]], align 8
 ; CHECK-NEXT:    [[TMP:%.*]] = fcmp olt double [[LD1]], [[LD2]]
 ; CHECK-NEXT:    [[TMP12_V:%.*]] = select i1 [[TMP]], double [[LD1]], double [[LD2]]
 ; CHECK-NEXT:    store double [[TMP12_V]], ptr [[ST1:%.*]], align 8
 ; CHECK-NEXT:    store double [[TMP12_V]], ptr [[ST2:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
-  %y1 = alloca double
-  %z1 = alloca double
   %ld1 = load double, ptr %y1
   %ld2 = load double, ptr %z1
   %tmp = fcmp olt double %ld1, %ld2
@@ -25,20 +21,16 @@ define void @PR35618(ptr %st1, ptr %st2) {
   ret void
 }
 
-define void @PR35618_asan(ptr %st1, ptr %st2) sanitize_address {
+define void @PR35618_asan(ptr %st1, ptr %st2, ptr %y1, ptr %z1) sanitize_address {
 ; CHECK-LABEL: @PR35618_asan(
-; CHECK-NEXT:    [[Y1:%.*]] = alloca double, align 8
-; CHECK-NEXT:    [[Z1:%.*]] = alloca double, align 8
-; CHECK-NEXT:    [[LD1:%.*]] = load double, ptr [[Y1]], align 8
-; CHECK-NEXT:    [[LD2:%.*]] = load double, ptr [[Z1]], align 8
+; CHECK-NEXT:    [[LD1:%.*]] = load double, ptr [[Y1:%.*]], align 8
+; CHECK-NEXT:    [[LD2:%.*]] = load double, ptr [[Z1:%.*]], align 8
 ; CHECK-NEXT:    [[TMP:%.*]] = fcmp olt double [[LD1]], [[LD2]]
 ; CHECK-NEXT:    [[TMP12_V:%.*]] = select i1 [[TMP]], double [[LD1]], double [[LD2]]
 ; CHECK-NEXT:    store double [[TMP12_V]], ptr [[ST1:%.*]], align 8
 ; CHECK-NEXT:    store double [[TMP12_V]], ptr [[ST2:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
-  %y1 = alloca double
-  %z1 = alloca double
   %ld1 = load double, ptr %y1
   %ld2 = load double, ptr %z1
   %tmp = fcmp olt double %ld1, %ld2
diff --git a/llvm/test/Transforms/InstCombine/objsize.ll b/llvm/test/Transforms/InstCombine/objsize.ll
index 1c33412303c1..39f6f493782d 100644
--- a/llvm/test/Transforms/InstCombine/objsize.ll
+++ b/llvm/test/Transforms/InstCombine/objsize.ll
@@ -14,19 +14,17 @@ define i32 @foo() nounwind {
   ret i32 %1
 }
 
-define ptr @bar() nounwind {
+define ptr @bar(ptr %retval) nounwind {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[RETVAL:%.*]] = alloca ptr, align 4
 ; CHECK-NEXT:    br i1 true, label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 ; CHECK:       cond.true:
-; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    ret ptr [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[RETVAL:%.*]], align 4
+; CHECK-NEXT:    ret ptr [[TMP1]]
 ; CHECK:       cond.false:
 ; CHECK-NEXT:    ret ptr poison
 ;
 entry:
-  %retval = alloca ptr
   %0 = call i32 @llvm.objectsize.i32.p0(ptr @a, i1 false, i1 false, i1 false)
   %cmp = icmp ne i32 %0, -1
   br i1 %cmp, label %cond.true, label %cond.false
diff --git a/llvm/test/Transforms/InstCombine/select-load.ll b/llvm/test/Transforms/InstCombine/select-load.ll
index 36883423aea3..308dc25bf780 100644
--- a/llvm/test/Transforms/InstCombine/select-load.ll
+++ b/llvm/test/Transforms/InstCombine/select-load.ll
@@ -4,19 +4,14 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-grtev4-linux-gnu"
 
-define i32 @test_plain(i1 %f) {
+define i32 @test_plain(i1 %f, ptr %a, ptr %b) {
 ; CHECK-LABEL: @test_plain(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 8
-; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 8
-; CHECK-NEXT:    [[A_VAL:%.*]] = load i32, ptr [[A]], align 8
+; CHECK-NEXT:    [[B:%.*]] = select i1 [[F:%.*]], ptr [[A:%.*]], ptr [[B1:%.*]]
 ; CHECK-NEXT:    [[B_VAL:%.*]] = load i32, ptr [[B]], align 8
-; CHECK-NEXT:    [[L:%.*]] = select i1 [[F:%.*]], i32 [[A_VAL]], i32 [[B_VAL]]
-; CHECK-NEXT:    ret i32 [[L]]
+; CHECK-NEXT:    ret i32 [[B_VAL]]
 ;
 entry:
-  %a = alloca i32, align 8
-  %b = alloca i32, align 8
   %sel = select i1 %f, ptr %a, ptr %b
   %l = load i32, ptr %sel, align 8
   ret i32 %l
@@ -82,19 +77,14 @@ entry:
 
 ; Msan just propagates shadow, even if speculated load accesses uninitialized
 ; value, instrumentation will select shadow of the desired value anyway.
-define i32 @test_msan(i1 %f) sanitize_memory {
+define i32 @test_msan(i1 %f, ptr %a, ptr %b) sanitize_memory {
 ; CHECK-LABEL: @test_msan(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 8
-; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 8
-; CHECK-NEXT:    [[A_VAL:%.*]] = load i32, ptr [[A]], align 8
+; CHECK-NEXT:    [[B:%.*]] = select i1 [[F:%.*]], ptr [[A:%.*]], ptr [[B1:%.*]]
 ; CHECK-NEXT:    [[B_VAL:%.*]] = load i32, ptr [[B]], align 8
-; CHECK-NEXT:    [[L:%.*]] = select i1 [[F:%.*]], i32 [[A_VAL]], i32 [[B_VAL]]
-; CHECK-NEXT:    ret i32 [[L]]
+; CHECK-NEXT:    ret i32 [[B_VAL]]
 ;
 entry:
-  %a = alloca i32, align 8
-  %b = alloca i32, align 8
   %sel = select i1 %f, ptr %a, ptr %b
   %l = load i32, ptr %sel, align 8
   ret i32 %l
diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll
index b4c606f037d5..abae2c6b9ab8 100644
--- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll
+++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll
@@ -156,11 +156,10 @@ define i32 @t11_shl_nsw_flag_preservation(i32 %x, i32 %y) {
 
 ; Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15587
 @X = external global i32
-define i64 @constantexpr() {
-; CHECK-LABEL: @constantexpr(
+define i64 @constantexpr(ptr %A) {
+; CHECK-LABEL: @constantexpr(ptr %A) {
 ; CHECK-NEXT:    ret i64 0
 ;
-  %A = alloca i64
   %L = load i64, ptr %A
   %V = add i64 ptrtoint (ptr @X to i64), 0
   %B2 = shl i64 %V, 0
diff --git a/llvm/test/Transforms/InstCombine/vscale_gep.ll b/llvm/test/Transforms/InstCombine/vscale_gep.ll
index 84019e613d23..5d39ad4e01af 100644
--- a/llvm/test/Transforms/InstCombine/vscale_gep.ll
+++ b/llvm/test/Transforms/InstCombine/vscale_gep.ll
@@ -41,31 +41,27 @@ define void @gep_bitcast(ptr %p) {
 
 ; These tests serve to verify code changes when underlying gep ptr is alloca.
 ; This test is to verify 'inbounds' is added when it's valid to accumulate constant offset.
-define i32 @gep_alloca_inbounds_vscale_zero() {
+define i32 @gep_alloca_inbounds_vscale_zero(ptr %a) {
 ; CHECK-LABEL: @gep_alloca_inbounds_vscale_zero(
-; CHECK-NEXT:    [[A:%.*]] = alloca <vscale x 4 x i32>, align 16
-; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 8
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[TMP]], align 4
 ; CHECK-NEXT:    ret i32 [[LOAD]]
 ;
-  %a = alloca <vscale x 4 x i32>
   %tmp = getelementptr <vscale x 4 x i32>, ptr %a, i32 0, i32 2
   %load = load i32, ptr %tmp
   ret i32 %load
 }
 
 ; This test is to verify 'inbounds' is not added when a constant offset can not be determined at compile-time.
-define i32 @gep_alloca_inbounds_vscale_nonzero() {
+define i32 @gep_alloca_inbounds_vscale_nonzero(ptr %a) {
 ; CHECK-LABEL: @gep_alloca_inbounds_vscale_nonzero(
-; CHECK-NEXT:    [[A:%.*]] = alloca <vscale x 4 x i32>, align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[TMP]], align 4
 ; CHECK-NEXT:    ret i32 [[LOAD]]
 ;
-  %a = alloca <vscale x 4 x i32>
   %tmp = getelementptr <vscale x 4 x i32>, ptr %a, i32 1, i32 2
   %load = load i32, ptr %tmp
   ret i32 %load

From a2b8a93ff9cfdae4a1578c60fe5efc8ebd8c5571 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Fri, 20 Jun 2025 15:46:52 +0100
Subject: [PATCH 1055/1322] [VPlan] Pass NumUnrolledElems as operand to
 VPWidenPointerInductionRecipe. NFC (#119859)

Similarly to VPWidenIntOrFpInductionRecipe, if we want to support it in
EVL tail folding we need to increment the induction by EVL steps instead
of VF*UF steps, but currently this is hard-wired in
VPWidenPointerInductionRecipe.

This adds an operand for the number of elements unrolled and plumbs it
through, so that we can swap it out in
VPlanTransforms::tryAddExplicitVectorLength further down the line.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |  2 +-
 llvm/lib/Transforms/Vectorize/VPlan.h           | 15 ++++++++++-----
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  | 13 +++++++------
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 88b2ffba1b79..ca1e4cbc6b29 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7826,7 +7826,7 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
                                                            *PSE.getSE());
     return new VPWidenPointerInductionRecipe(
-        Phi, Operands[0], Step, *II,
+        Phi, Operands[0], Step, &Plan.getVFxUF(), *II,
         LoopVectorizationPlanner::getDecisionAndClampRange(
             [&](ElementCount VF) {
               return CM.isScalarAfterVectorization(Phi, VF);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ab5ff82a7720..f4163b0743a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2034,25 +2034,30 @@ public:
 };
 
 class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
-                                      public VPUnrollPartAccessor<3> {
+                                      public VPUnrollPartAccessor<4> {
   bool IsScalarAfterVectorization;
 
 public:
   /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p
-  /// Start.
+  /// Start and the number of elements unrolled \p NumUnrolledElems, typically
+  /// VF*UF.
   VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step,
+                                VPValue *NumUnrolledElems,
                                 const InductionDescriptor &IndDesc,
                                 bool IsScalarAfterVectorization, DebugLoc DL)
       : VPWidenInductionRecipe(VPDef::VPWidenPointerInductionSC, Phi, Start,
                                Step, IndDesc, DL),
-        IsScalarAfterVectorization(IsScalarAfterVectorization) {}
+        IsScalarAfterVectorization(IsScalarAfterVectorization) {
+    addOperand(NumUnrolledElems);
+  }
 
   ~VPWidenPointerInductionRecipe() override = default;
 
   VPWidenPointerInductionRecipe *clone() override {
     return new VPWidenPointerInductionRecipe(
         cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1),
-        getInductionDescriptor(), IsScalarAfterVectorization, getDebugLoc());
+        getOperand(2), getInductionDescriptor(), IsScalarAfterVectorization,
+        getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)
@@ -2067,7 +2072,7 @@ public:
   /// the first unrolled part, if it exists. Returns itself if unrolling did not
   /// take place.
   VPValue *getFirstUnrolledPartOperand() {
-    return getUnrollPart(*this) == 0 ? this : getOperand(2);
+    return getUnrollPart(*this) == 0 ? this : getOperand(3);
   }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f45ce46763c5..39def05b2eac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3544,8 +3544,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
   if (CurrentPart == 0) {
     // The recipe represents the first part of the pointer induction. Create the
     // GEP to increment the phi across all unrolled parts.
-    Value *NumUnrolledElems =
-        State.get(&getParent()->getPlan()->getVFxUF(), true);
+    Value *NumUnrolledElems = State.get(getOperand(2), true);
 
     Value *InductionGEP = GetElementPtrInst::Create(
         State.Builder.getInt8Ty(), NewPointerPhi,
@@ -3581,7 +3580,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
-  assert((getNumOperands() == 2 || getNumOperands() == 4) &&
+  assert((getNumOperands() == 3 || getNumOperands() == 5) &&
          "unexpected number of operands");
   O << Indent << "EMIT ";
   printAsOperand(O, SlotTracker);
@@ -3589,11 +3588,13 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
   getStartValue()->printAsOperand(O, SlotTracker);
   O << ", ";
   getStepValue()->printAsOperand(O, SlotTracker);
-  if (getNumOperands() == 4) {
-    O << ", ";
-    getOperand(2)->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getOperand(2)->printAsOperand(O, SlotTracker);
+  if (getNumOperands() == 5) {
     O << ", ";
     getOperand(3)->printAsOperand(O, SlotTracker);
+    O << ", ";
+    getOperand(4)->printAsOperand(O, SlotTracker);
   }
 }
 #endif

From 20d57e77f6709ef32791391bc064d3ed5663272a Mon Sep 17 00:00:00 2001
From: Hemang Gadhavi <hemang.gadhavi@ibm.com>
Date: Fri, 20 Jun 2025 20:18:39 +0530
Subject: [PATCH 1056/1322] [lldb][AIX] Added base file for AIX Register
 Context (#144645)

This PR is in reference to porting LLDB on AIX.
Link to discussions on llvm discourse and github:

1. https://discourse.llvm.org/t/port-lldb-to-ibm-aix/80640
2. https://github.com/llvm/llvm-project/issues/101657
The complete changes for porting are present in this draft PR:
https://github.com/llvm/llvm-project/pull/102601

- Added skeleton for Registercontext file for AIX. (Later we will add
implementation respectively)
---
 .../source/Plugins/Process/AIX/CMakeLists.txt |  1 +
 .../Process/AIX/NativeRegisterContextAIX.cpp  | 54 ++++++++++++++++
 .../Process/AIX/NativeRegisterContextAIX.h    | 62 +++++++++++++++++++
 3 files changed, 117 insertions(+)
 create mode 100644 lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp
 create mode 100644 lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.h

diff --git a/lldb/source/Plugins/Process/AIX/CMakeLists.txt b/lldb/source/Plugins/Process/AIX/CMakeLists.txt
index 6b3151edbd1e..3a6d9ec118e6 100644
--- a/lldb/source/Plugins/Process/AIX/CMakeLists.txt
+++ b/lldb/source/Plugins/Process/AIX/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_lldb_library(lldbPluginProcessAIX
   NativeProcessAIX.cpp
   NativeThreadAIX.cpp
+  NativeRegisterContextAIX.cpp
 
   LINK_COMPONENTS
     Support
diff --git a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp
new file mode 100644
index 000000000000..e44cd7b5a30f
--- /dev/null
+++ b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp
@@ -0,0 +1,54 @@
+//===---- NativeRegisterContextAIX.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NativeRegisterContextAIX.h"
+#include "Plugins/Process/AIX/NativeProcessAIX.h"
+
+using namespace lldb_private;
+using namespace lldb_private::process_aix;
+
+lldb::ByteOrder NativeRegisterContextAIX::GetByteOrder() const {
+  return lldb::eByteOrderInvalid;
+}
+
+Status NativeRegisterContextAIX::ReadRegisterRaw(uint32_t reg_index,
+                                                 RegisterValue &reg_value) {
+  return Status("unimplemented");
+}
+
+Status
+NativeRegisterContextAIX::WriteRegisterRaw(uint32_t reg_index,
+                                           const RegisterValue &reg_value) {
+  return Status("unimplemented");
+}
+
+Status NativeRegisterContextAIX::ReadGPR() { return Status("unimplemented"); }
+
+Status NativeRegisterContextAIX::WriteGPR() { return Status("unimplemented"); }
+
+Status NativeRegisterContextAIX::ReadFPR() { return Status("unimplemented"); }
+
+Status NativeRegisterContextAIX::WriteFPR() { return Status("unimplemented"); }
+
+Status NativeRegisterContextAIX::ReadVMX() { return Status("unimplemented"); }
+
+Status NativeRegisterContextAIX::WriteVMX() { return Status("unimplemented"); }
+
+Status NativeRegisterContextAIX::ReadVSX() { return Status("unimplemented"); }
+
+Status NativeRegisterContextAIX::WriteVSX() { return Status("unimplemented"); }
+
+Status NativeRegisterContextAIX::ReadRegisterSet(void *buf, size_t buf_size,
+                                                 unsigned int regset) {
+  return Status("unimplemented");
+}
+
+Status NativeRegisterContextAIX::WriteRegisterSet(void *buf, size_t buf_size,
+                                                  unsigned int regset) {
+  return Status("unimplemented");
+}
diff --git a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.h b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.h
new file mode 100644
index 000000000000..e78483a7670f
--- /dev/null
+++ b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.h
@@ -0,0 +1,62 @@
+//===---- NativeRegisterContextAIX.h ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_SOURCE_PLUGINS_PROCESS_AIX_NATIVEREGISTERCONTEXTAIX_H
+#define LLDB_SOURCE_PLUGINS_PROCESS_AIX_NATIVEREGISTERCONTEXTAIX_H
+
+#include "Plugins/Process/Utility/NativeRegisterContextRegisterInfo.h"
+
+namespace lldb_private::process_aix {
+
+class NativeRegisterContextAIX
+    : public virtual NativeRegisterContextRegisterInfo {
+protected:
+  NativeRegisterContextAIX(NativeThreadProtocol &thread)
+      : NativeRegisterContextRegisterInfo(thread, nullptr) {}
+
+  lldb::ByteOrder GetByteOrder() const;
+
+  virtual Status ReadRegisterRaw(uint32_t reg_index, RegisterValue &reg_value);
+
+  virtual Status WriteRegisterRaw(uint32_t reg_index,
+                                  const RegisterValue &reg_value);
+
+  virtual Status ReadRegisterSet(void *buf, size_t buf_size,
+                                 unsigned int regset);
+
+  virtual Status WriteRegisterSet(void *buf, size_t buf_size,
+                                  unsigned int regset);
+
+  virtual Status ReadGPR();
+
+  virtual Status WriteGPR();
+
+  virtual Status ReadFPR();
+
+  virtual Status WriteFPR();
+
+  virtual Status ReadVMX();
+
+  virtual Status WriteVMX();
+
+  virtual Status ReadVSX();
+
+  virtual Status WriteVSX();
+
+  virtual void *GetGPRBuffer() = 0;
+
+  virtual size_t GetGPRSize() = 0;
+
+  virtual void *GetFPRBuffer() = 0;
+
+  virtual size_t GetFPRSize() = 0;
+};
+
+} // namespace lldb_private::process_aix
+
+#endif // #ifndef LLDB_SOURCE_PLUGINS_PROCESS_AIX_NATIVEREGISTERCONTEXTAIX_H

From 95c6c11c747dee61133cff56f1a7ea7445c7ae79 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 20 Jun 2025 15:51:04 +0100
Subject: [PATCH 1057/1322] [X86] combineConcatVectorOps - only always concat
 logic ops on AVX512 targets (#145036)

We should only concat logic ops if at least one operand will freely
concatenate. We've now addressed the remaining regressions on AVX2
targets, but still have a number on AVX512 targets which can
aggressively use VPTERNLOG in many cases.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp  |  4 ++--
 llvm/test/CodeGen/X86/vector-fshl-256.ll | 26 ++++++++++++------------
 llvm/test/CodeGen/X86/vector-fshr-256.ll | 26 ++++++++++++------------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index defb7730b4c7..085f44e72476 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58885,7 +58885,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     case ISD::OR:
     case ISD::XOR:
     case X86ISD::ANDNP:
-      // TODO: AVX2+ targets should only use CombineSubOperand like AVX1.
+      // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
       if (!IsSplat && (VT.is256BitVector() ||
                        (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
         // Don't concatenate root AVX1 NOT patterns.
@@ -58897,7 +58897,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
           break;
         SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
         SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
-        if (Concat0 || Concat1 || Subtarget.hasInt256())
+        if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
           return DAG.getNode(Opcode, DL, VT,
                              Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
                              Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index c6e1aa9cd90c..6fbc10307e0b 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -739,15 +739,15 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ; XOPAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
 ; XOPAVX2-NEXT:    vpaddb %xmm6, %xmm5, %xmm7
 ; XOPAVX2-NEXT:    vpshlb %xmm7, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm7, %xmm5
+; XOPAVX2-NEXT:    vpor %xmm3, %xmm5, %xmm3
 ; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpaddb %xmm6, %xmm2, %xmm4
 ; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm3, %xmm3
 ; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
   %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
   ret <32 x i8> %res
@@ -1992,17 +1992,17 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
 ; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; XOPAVX2-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250]
+; XOPAVX2-NEXT:    vpshlb %xmm6, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpor %xmm4, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; XOPAVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250]
-; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm2, %xmm2
-; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
   %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
   ret <32 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 9479174d964c..b0a1a91bdccc 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -766,18 +766,18 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ; XOPAVX2-NEXT:    vpsubb %xmm4, %xmm5, %xmm6
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
 ; XOPAVX2-NEXT:    vpshlb %xmm6, %xmm7, %xmm6
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm7
+; XOPAVX2-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm7, %xmm4
+; XOPAVX2-NEXT:    vpor %xmm6, %xmm4, %xmm4
 ; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm5, %xmm5
 ; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm1, %xmm1
-; XOPAVX2-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; XOPAVX2-NEXT:    vpxor %xmm3, %xmm4, %xmm4
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; XOPAVX2-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm5, %xmm4
 ; XOPAVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
-; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
   %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
   ret <32 x i8> %res
@@ -1793,16 +1793,16 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,255,254,253,252,251,250,249,0,249,250,251,252,253,254,255]
 ; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; XOPAVX2-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpor %xmm2, %xmm4, %xmm2
 ; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; XOPAVX2-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
-; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshlb %xmm5, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
   %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
   ret <32 x i8> %res

From b017b4ce9a45d4c5a339e24142da5d4a7e4c5db1 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Fri, 20 Jun 2025 16:01:55 +0100
Subject: [PATCH 1058/1322] [lldb][DWARF] Remove object_pointer from
 ParsedDWARFAttributes (#144880)

We can just always use `GetCXXObjectParameter` instead. We've only used
this attribute to set the object parameter name on ClangASTMetadata,
which doesn't seem like good enough justification to keep it around.

Depends on https://github.com/llvm/llvm-project/pull/144879
---
 .../SymbolFile/DWARF/DWARFASTParserClang.cpp  | 28 ++++++-------------
 .../SymbolFile/DWARF/DWARFASTParserClang.h    |  7 +++--
 2 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 4f79c8aa3f81..3bec89cdf746 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -445,15 +445,6 @@ ParsedDWARFTypeAttributes::ParsedDWARFTypeAttributes(const DWARFDIE &die) {
       name.SetCString(form_value.AsCString());
       break;
 
-    case DW_AT_object_pointer:
-      // GetAttributes follows DW_AT_specification.
-      // DW_TAG_subprogram definitions and declarations may both
-      // have a DW_AT_object_pointer. Don't overwrite the one
-      // we parsed for the definition with the one from the declaration.
-      if (!object_pointer.IsValid())
-        object_pointer = form_value.Reference();
-      break;
-
     case DW_AT_signature:
       signature = form_value;
       break;
@@ -1116,7 +1107,7 @@ bool DWARFASTParserClang::ParseObjCMethod(
 std::pair<bool, TypeSP> DWARFASTParserClang::ParseCXXMethod(
     const DWARFDIE &die, CompilerType clang_type,
     const ParsedDWARFTypeAttributes &attrs, const DWARFDIE &decl_ctx_die,
-    bool is_static, bool &ignore_containing_context) {
+    const DWARFDIE &object_parameter, bool &ignore_containing_context) {
   Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups);
   SymbolFileDWARF *dwarf = die.GetDWARF();
   assert(dwarf);
@@ -1200,6 +1191,9 @@ std::pair<bool, TypeSP> DWARFASTParserClang::ParseCXXMethod(
       TypeSystemClang::GetDeclContextForType(class_opaque_type), die,
       attrs.name.GetCString());
 
+  // In DWARF, a C++ method is static if it has no object parameter child.
+  const bool is_static = !object_parameter.IsValid();
+
   // We have a C++ member function with no children (this pointer!) and clang
   // will get mad if we try and make a function that isn't well formed in the
   // DWARF, so we will just skip it...
@@ -1225,9 +1219,7 @@ std::pair<bool, TypeSP> DWARFASTParserClang::ParseCXXMethod(
     ClangASTMetadata metadata;
     metadata.SetUserID(die.GetID());
 
-    char const *object_pointer_name =
-        attrs.object_pointer ? attrs.object_pointer.GetName() : nullptr;
-    if (object_pointer_name) {
+    if (char const *object_pointer_name = object_parameter.GetName()) {
       metadata.SetObjectPtrName(object_pointer_name);
       LLDB_LOGF(log, "Setting object pointer name: %s on method object %p.\n",
                 object_pointer_name, static_cast<void *>(cxx_method_decl));
@@ -1323,11 +1315,9 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
         type_handled =
             ParseObjCMethod(*objc_method, die, clang_type, attrs, is_variadic);
       } else if (is_cxx_method) {
-        // In DWARF, a C++ method is static if it has no object parameter child.
-        const bool is_static = !object_parameter.IsValid();
         auto [handled, type_sp] =
-            ParseCXXMethod(die, clang_type, attrs, decl_ctx_die, is_static,
-                           ignore_containing_context);
+            ParseCXXMethod(die, clang_type, attrs, decl_ctx_die,
+                           object_parameter, ignore_containing_context);
         if (type_sp)
           return type_sp;
 
@@ -1422,9 +1412,7 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
           ClangASTMetadata metadata;
           metadata.SetUserID(die.GetID());
 
-          char const *object_pointer_name =
-              attrs.object_pointer ? attrs.object_pointer.GetName() : nullptr;
-          if (object_pointer_name) {
+          if (char const *object_pointer_name = object_parameter.GetName()) {
             metadata.SetObjectPtrName(object_pointer_name);
             LLDB_LOGF(log,
                       "Setting object pointer name: %s on function "
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index 111604ce4068..a90f55bcff94 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -470,7 +470,8 @@ private:
   /// \param[in] decl_ctx_die The DIE representing the DeclContext of the C++
   ///                         method being parsed.
   ///
-  /// \param[in] is_static Is true iff we're parsing a static method.
+  /// \param[in] object_parameter The DIE of this subprogram's object parameter.
+  ///                             May be an invalid DIE for C++ static methods.
   ///
   /// \param[out] ignore_containing_context Will get set to true if the caller
   ///             should treat this C++ method as-if it was not a C++ method.
@@ -485,7 +486,8 @@ private:
                  lldb_private::CompilerType clang_type,
                  const ParsedDWARFTypeAttributes &attrs,
                  const lldb_private::plugin::dwarf::DWARFDIE &decl_ctx_die,
-                 bool is_static, bool &ignore_containing_context);
+                 const lldb_private::plugin::dwarf::DWARFDIE &object_parameter,
+                 bool &ignore_containing_context);
 
   lldb::TypeSP ParseArrayType(const lldb_private::plugin::dwarf::DWARFDIE &die,
                               const ParsedDWARFTypeAttributes &attrs);
@@ -555,7 +557,6 @@ struct ParsedDWARFTypeAttributes {
   const char *mangled_name = nullptr;
   lldb_private::ConstString name;
   lldb_private::Declaration decl;
-  lldb_private::plugin::dwarf::DWARFDIE object_pointer;
   lldb_private::plugin::dwarf::DWARFFormValue abstract_origin;
   lldb_private::plugin::dwarf::DWARFFormValue containing_type;
   lldb_private::plugin::dwarf::DWARFFormValue signature;

From 9dc59cc95b1766510ab43ec62bb087aa9273341a Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Fri, 20 Jun 2025 10:02:09 -0500
Subject: [PATCH 1059/1322] [MLIR] Incorrect track of usedKey in
 setPropertiesFromParsedAttr (#144789)

co-authored by @chencha3 and @joker-eph
---
 mlir/test/lib/Dialect/Test/TestOps.td  | 2 +-
 mlir/test/mlir-tblgen/op-format.mlir   | 4 ++--
 mlir/tools/mlir-tblgen/OpFormatGen.cpp | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 79bcd9c2e0a9..30234698bc8d 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -3179,7 +3179,7 @@ def TestOpWithPropertiesAndInferredType
   ]> {
   let assemblyFormat = "$lhs prop-dict attr-dict";
 
-  let arguments = (ins I32Attr:$lhs, IntProp<"int64_t">:$rhs);
+  let arguments = (ins I32Attr:$lhs, IntProp<"int64_t">:$rhs, OptionalAttr<UnitAttr>: $packed);
   let results = (outs AnyType:$result);
 }
 
diff --git a/mlir/test/mlir-tblgen/op-format.mlir b/mlir/test/mlir-tblgen/op-format.mlir
index 08b0c52413a7..981fb5aff2aa 100644
--- a/mlir/test/mlir-tblgen/op-format.mlir
+++ b/mlir/test/mlir-tblgen/op-format.mlir
@@ -516,8 +516,8 @@ test.format_infer_variadic_type_from_non_variadic %i64, %i64 : i64
 // CHECK: test.with_properties_and_attr 16 < {rhs = 16 : i64}>
 test.with_properties_and_attr 16 <{rhs = 16 : i64}>
 
-// CHECK: test.with_properties_and_inferred_type 16 < {rhs = 16 : i64}>
-%should_be_i32 = test.with_properties_and_inferred_type 16 <{rhs = 16 : i64}>
+// CHECK: test.with_properties_and_inferred_type 16 < {packed, rhs = 16 : i64}>
+%should_be_i32 = test.with_properties_and_inferred_type 16 <{packed, rhs = 16 : i64}>
 // Assert through the verifier that its inferred as i32.
 test.format_all_types_match_var %should_be_i32, %i32 : i32
 
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index ef3a18ba7df2..11edf2523f1a 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -1365,7 +1365,7 @@ if (attr && ::mlir::failed(setFromAttr(prop.{1}, attr, emitError)))
 auto &propStorage = prop.{0};
 auto {0}AttrName = StringAttr::get(ctx, "{0}");
 auto attr = dict.get({0}AttrName);
-usedKeys.insert(StringAttr::get(ctx, "{1}"));
+usedKeys.insert({0}AttrName);
 if (attr || /*isRequired=*/{1}) {{
   if (!attr) {{
     emitError() << "expected key entry for {0} in DictionaryAttr to set "

From 05b4bfe19eaba13b4fdf39fc5541077c255b8e3f Mon Sep 17 00:00:00 2001
From: Prajwal Nadig <pnadig@apple.com>
Date: Fri, 20 Jun 2025 16:05:16 +0100
Subject: [PATCH 1060/1322] [ExtractAPI] Include +/- symbols for ObjC methods
 (#145035)

ObjC methods include a +/- prefix to indicate if they are a class or
instance method. This information is valuable, and must be included in
the navigator generated by ExtractAPI.

rdar://150870936
---
 .../Serialization/SymbolGraphSerializer.cpp      | 16 ++++++++++++++++
 clang/test/ExtractAPI/objc_instancetype.m        |  8 ++++++++
 2 files changed, 24 insertions(+)

diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
index 139023f32e8d..d3df9eb604f2 100644
--- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
+++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
@@ -341,6 +341,22 @@ Object serializeNames(const APIRecord *Record) {
   serializeArray(Names, "subHeading",
                  serializeDeclarationFragments(Record->SubHeading));
   DeclarationFragments NavigatorFragments;
+  // The +/- prefix for Objective-C methods is important information, and
+  // should be included in the navigator fragment. The entire subheading is
+  // not included as it can contain too much information for other records.
+  switch (Record->getKind()) {
+  case APIRecord::RK_ObjCClassMethod:
+    NavigatorFragments.append("+ ", DeclarationFragments::FragmentKind::Text,
+                              /*PreciseIdentifier*/ "");
+    break;
+  case APIRecord::RK_ObjCInstanceMethod:
+    NavigatorFragments.append("- ", DeclarationFragments::FragmentKind::Text,
+                              /*PreciseIdentifier*/ "");
+    break;
+  default:
+    break;
+  }
+
   NavigatorFragments.append(Record->Name,
                             DeclarationFragments::FragmentKind::Identifier,
                             /*PreciseIdentifier*/ "");
diff --git a/clang/test/ExtractAPI/objc_instancetype.m b/clang/test/ExtractAPI/objc_instancetype.m
index 071ebe440918..dbd47a1f746f 100644
--- a/clang/test/ExtractAPI/objc_instancetype.m
+++ b/clang/test/ExtractAPI/objc_instancetype.m
@@ -157,6 +157,10 @@
       },
       "names": {
         "navigator": [
+          {
+            "kind": "text",
+            "spelling": "- "
+          },
           {
             "kind": "identifier",
             "spelling": "init"
@@ -228,6 +232,10 @@
       },
       "names": {
         "navigator": [
+          {
+            "kind": "text",
+            "spelling": "- "
+          },
           {
             "kind": "identifier",
             "spelling": "reset"

From 71e20c6c86e04863df80e286a004a20070a5a610 Mon Sep 17 00:00:00 2001
From: Douglas <Douglas.Gliner@sony.com>
Date: Fri, 20 Jun 2025 08:13:48 -0700
Subject: [PATCH 1061/1322] Fix references to required libraries when building
 LLVM with ASAN and MultiThreaded[Debug] on Windows (#139657)

After https://github.com/llvm/llvm-project/pull/81677, statically
linking ASAN under Windows is no longer supported. Therefore, when using
Clang built past
https://github.com/llvm/llvm-project/commit/53a81d4d26f0409de8a0655d7af90f2bea222a12
to build LLVM / Clang with
`-DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded[Debug]
-DLLVM_USE_SANITIZER=Address`, a different set of dependent libraries
must be linked. This is mentioned in the description of
https://github.com/llvm/llvm-project/pull/81677 and also in
https://devblogs.microsoft.com/cppblog/msvc-address-sanitizer-one-dll-for-all-runtime-configurations/.
---
 llvm/cmake/modules/HandleLLVMOptions.cmake | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 8004d3571fc8..5aa047a33ba6 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -1038,13 +1038,14 @@ if(LLVM_USE_SANITIZER)
         # lld string tail merging interacts badly with ASAN on Windows, turn it off here
         # See https://github.com/llvm/llvm-project/issues/62078
         append("/opt:nolldtailmerge" CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+        # Static and dynamic C runtimes all load ASAN as a DLL
+        # See https://devblogs.microsoft.com/cppblog/msvc-address-sanitizer-one-dll-for-all-runtime-configurations/
+        append("clang_rt.asan_dynamic-${arch}.lib" CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
         if (${CMAKE_MSVC_RUNTIME_LIBRARY} MATCHES "^(MultiThreaded|MultiThreadedDebug)$")
-          append("/wholearchive:clang_rt.asan-${arch}.lib /wholearchive:clang_rt.asan_cxx-${arch}.lib"
-            CMAKE_EXE_LINKER_FLAGS)
-          append("/wholearchive:clang_rt.asan_dll_thunk-${arch}.lib"
-            CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+          append("/wholearchive:clang_rt.asan_static_runtime_thunk-${arch}.lib"
+            CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
         else()
-          append("clang_rt.asan_dynamic-${arch}.lib /wholearchive:clang_rt.asan_dynamic_runtime_thunk-${arch}.lib"
+          append("/wholearchive:clang_rt.asan_dynamic_runtime_thunk-${arch}.lib"
             CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
         endif()
       endif()

From b533b0ec34ac36d8a6af406d1fb046e07f95f717 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <jkun@google.com>
Date: Fri, 20 Jun 2025 08:16:52 -0700
Subject: [PATCH 1062/1322] Define a DataFlowSolver helper that loads sensible
 default analyses (#143415)

Cf. https://discourse.llvm.org/t/mlir-dead-code-analysis/67568/10

Custom analysis passes will not work properly unless both
DeadCodeAnalysis and SparseConstantPropagation are loaded to the
DataFlowSolver. This is intended behavior, but surprising to many users
as shown in the thread. In lieu of a longer-term fix (which I am not
knowledgeable enough to implement myself, yet), this commit adds a
helper function that loads these two analyses, as well as providing
breadcrumbs for an explanation of the problem. The existing places in
the codebase where these two analyses are loaded for the purpose of
running other unrelated analyses are replaced by the use of the helper.

---------

Co-authored-by: Jeremy Kun <j2kun@users.noreply.github.com>
Co-authored-by: Oleksandr "Alex" Zinenko <azinenko@amd.com>
---
 mlir/include/mlir/Analysis/DataFlow/Utils.h   | 37 +++++++++++++++++++
 .../Analysis/DataFlow/LivenessAnalysis.cpp    |  6 +--
 .../Transforms/XeGPUSubgroupDistribute.cpp    |  6 +--
 .../TestDenseBackwardDataFlowAnalysis.cpp     |  6 +--
 .../TestDenseForwardDataFlowAnalysis.cpp      |  6 +--
 .../TestSparseBackwardDataFlowAnalysis.cpp    |  6 +--
 6 files changed, 47 insertions(+), 20 deletions(-)
 create mode 100644 mlir/include/mlir/Analysis/DataFlow/Utils.h

diff --git a/mlir/include/mlir/Analysis/DataFlow/Utils.h b/mlir/include/mlir/Analysis/DataFlow/Utils.h
new file mode 100644
index 000000000000..e97f2f70f609
--- /dev/null
+++ b/mlir/include/mlir/Analysis/DataFlow/Utils.h
@@ -0,0 +1,37 @@
+//===-Utils.h - DataFlow utility functions ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines utility functions for dataflow analyses.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_ANALYSIS_DATAFLOW_UTILS_H
+#define MLIR_ANALYSIS_DATAFLOW_UTILS_H
+
+#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlowFramework.h"
+
+namespace mlir {
+namespace dataflow {
+
+/// Populates a DataFlowSolver with analyses that are required to ensure
+/// user-defined analyses are run properly.
+///
+/// This helper is intended to be an interim fix until a more robust solution
+/// can be implemented in the DataFlow framework directly. Cf.
+/// https://discourse.llvm.org/t/mlir-dead-code-analysis/67568
+inline void loadBaselineAnalyses(DataFlowSolver &solver) {
+  solver.load<dataflow::DeadCodeAnalysis>();
+  solver.load<dataflow::SparseConstantPropagation>();
+}
+
+} // end namespace dataflow
+} // end namespace mlir
+
+#endif // MLIR_ANALYSIS_DATAFLOW_INTEGERANGEANALYSIS_H
diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
index d61cdb143e7d..24a78400eb84 100644
--- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
@@ -10,9 +10,8 @@
 #include <cassert>
 #include <mlir/Analysis/DataFlow/LivenessAnalysis.h>
 
-#include <mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h>
-#include <mlir/Analysis/DataFlow/DeadCodeAnalysis.h>
 #include <mlir/Analysis/DataFlow/SparseAnalysis.h>
+#include <mlir/Analysis/DataFlow/Utils.h>
 #include <mlir/Analysis/DataFlowFramework.h>
 #include <mlir/IR/Operation.h>
 #include <mlir/IR/Value.h>
@@ -249,8 +248,7 @@ void LivenessAnalysis::setToExitState(Liveness *lattice) {
 RunLivenessAnalysis::RunLivenessAnalysis(Operation *op) {
   SymbolTableCollection symbolTable;
 
-  solver.load<DeadCodeAnalysis>();
-  solver.load<SparseConstantPropagation>();
+  loadBaselineAnalyses(solver);
   solver.load<LivenessAnalysis>(symbolTable);
   (void)solver.initializeAndRun(op);
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index c84906cc4556..66d21dbdaf06 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -5,9 +5,8 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
-#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
 #include "mlir/Analysis/DataFlow/SparseAnalysis.h"
+#include "mlir/Analysis/DataFlow/Utils.h"
 #include "mlir/Analysis/DataFlowFramework.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
@@ -620,8 +619,7 @@ public:
 
   RunLayoutInfoPropagation(Operation *op) : target(op) {
     SymbolTableCollection symbolTable;
-    solver.load<DeadCodeAnalysis>();
-    solver.load<SparseConstantPropagation>();
+    loadBaselineAnalyses(solver);
     solver.load<LayoutInfoPropagation>(symbolTable);
     (void)solver.initializeAndRun(op);
   }
diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
index da543f4f04f9..d57b41c41de6 100644
--- a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
+++ b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
@@ -13,9 +13,8 @@
 #include "TestDenseDataFlowAnalysis.h"
 #include "TestDialect.h"
 #include "TestOps.h"
-#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
-#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
 #include "mlir/Analysis/DataFlow/DenseAnalysis.h"
+#include "mlir/Analysis/DataFlow/Utils.h"
 #include "mlir/Analysis/DataFlowFramework.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/SymbolTable.h"
@@ -283,9 +282,8 @@ struct TestNextAccessPass
 
     auto config = DataFlowConfig().setInterprocedural(interprocedural);
     DataFlowSolver solver(config);
-    solver.load<DeadCodeAnalysis>();
+    loadBaselineAnalyses(solver);
     solver.load<NextAccessAnalysis>(symbolTable, assumeFuncReads);
-    solver.load<SparseConstantPropagation>();
     solver.load<UnderlyingValueAnalysis>();
     if (failed(solver.initializeAndRun(op))) {
       emitError(op->getLoc(), "dataflow solver failed");
diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseForwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseForwardDataFlowAnalysis.cpp
index f4f8e9115a3f..a88ed7f8dea8 100644
--- a/mlir/test/lib/Analysis/DataFlow/TestDenseForwardDataFlowAnalysis.cpp
+++ b/mlir/test/lib/Analysis/DataFlow/TestDenseForwardDataFlowAnalysis.cpp
@@ -13,9 +13,8 @@
 #include "TestDenseDataFlowAnalysis.h"
 #include "TestDialect.h"
 #include "TestOps.h"
-#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
-#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
 #include "mlir/Analysis/DataFlow/DenseAnalysis.h"
+#include "mlir/Analysis/DataFlow/Utils.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
@@ -240,8 +239,7 @@ struct TestLastModifiedPass
     Operation *op = getOperation();
 
     DataFlowSolver solver(DataFlowConfig().setInterprocedural(interprocedural));
-    solver.load<DeadCodeAnalysis>();
-    solver.load<SparseConstantPropagation>();
+    loadBaselineAnalyses(solver);
     solver.load<LastModifiedAnalysis>(assumeFuncWrites);
     solver.load<UnderlyingValueAnalysis>();
     if (failed(solver.initializeAndRun(op)))
diff --git a/mlir/test/lib/Analysis/DataFlow/TestSparseBackwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestSparseBackwardDataFlowAnalysis.cpp
index 4b02865b6ae1..0bdb7c25c3b5 100644
--- a/mlir/test/lib/Analysis/DataFlow/TestSparseBackwardDataFlowAnalysis.cpp
+++ b/mlir/test/lib/Analysis/DataFlow/TestSparseBackwardDataFlowAnalysis.cpp
@@ -6,9 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
-#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
 #include "mlir/Analysis/DataFlow/SparseAnalysis.h"
+#include "mlir/Analysis/DataFlow/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Pass/Pass.h"
@@ -182,8 +181,7 @@ struct TestWrittenToPass
     SymbolTableCollection symbolTable;
 
     DataFlowSolver solver(DataFlowConfig().setInterprocedural(interprocedural));
-    solver.load<DeadCodeAnalysis>();
-    solver.load<SparseConstantPropagation>();
+    loadBaselineAnalyses(solver);
     solver.load<WrittenToAnalysis>(symbolTable, assumeFuncWrites);
     if (failed(solver.initializeAndRun(op)))
       return signalPassFailure();

From c734377544fc5a854c539fafc9b9b658f12230a3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 20 Jun 2025 16:20:03 +0100
Subject: [PATCH 1063/1322] [DAG] foldMaskedMerge - fix Wparentheses operator
 precedence warning. NFC.

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a6b9cc81edde..701a76c4cc6b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7214,9 +7214,8 @@ static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
                                const TargetLowering &TLI, const SDLoc &DL) {
   // Note that masked-merge variants using XOR or ADD expressions are
   // normalized to OR by InstCombine so we only check for OR or AND.
-  assert(Node->getOpcode() == ISD::OR ||
-         Node->getOpcode() == ISD::AND &&
-             "Must be called with ISD::OR or ISD::AND node");
+  assert((Node->getOpcode() == ISD::OR || Node->getOpcode() == ISD::AND) &&
+         "Must be called with ISD::OR or ISD::AND node");
 
   // If the target supports and-not, don't fold this.
   if (TLI.hasAndNot(SDValue(Node, 0)))

From 3f1de197b1c339b311329c02bb739860b32c073f Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Fri, 20 Jun 2025 10:31:54 -0500
Subject: [PATCH 1064/1322] [Offload] Rework compiling device code for unit
 test suites (#144776)

Summary:
I'll probably want to use this as a more generic utility in the future.
This patch reworks it to make it a top level function. I also tried to
decouple this from the OpenMP utilities to make that easier in the
future. Instead, I just use `-march=native` functionality which is the
same thing. Needed a small hack to skip the linker stage for checking if
that works.

This should still create the same output as far as I'm aware.
---
 offload/unittests/CMakeLists.txt              | 73 ++++++++++++++++++-
 .../OffloadAPI/device_code/CMakeLists.txt     | 69 +-----------------
 2 files changed, 74 insertions(+), 68 deletions(-)

diff --git a/offload/unittests/CMakeLists.txt b/offload/unittests/CMakeLists.txt
index 985dd892d804..7cd41e1dcdaf 100644
--- a/offload/unittests/CMakeLists.txt
+++ b/offload/unittests/CMakeLists.txt
@@ -1,6 +1,72 @@
 add_custom_target(OffloadUnitTests)
 set_target_properties(OffloadUnitTests PROPERTIES FOLDER "Tests/UnitTests")
 
+function(add_offload_test_device_code test_filename test_name)
+  set(SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${test_filename})
+  set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+
+  # Try to build with support for NVPTX devices.
+  if("cuda" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+    find_package(CUDAToolkit QUIET)
+    if(CUDAToolkit_FOUND)
+      get_filename_component(cuda_path "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
+    endif()
+    check_cxx_compiler_flag(
+      "--target=nvptx64-nvidia-cuda -march=native --cuda-path=${cuda_path}" PLATFORM_HAS_NVPTX)
+
+    if(OFFLOAD_TESTS_FORCE_NVPTX_ARCH)
+      set(nvptx_arch "${OFFLOAD_TESTS_FORCE_NVPTX_ARCH}")
+    elseif(PLATFORM_HAS_NVPTX)
+      set(nvptx_arch "native")
+    endif()
+
+    if(nvptx_arch AND CUDAToolkit_FOUND)
+      set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.nvptx64.bin")
+      add_custom_command(
+        OUTPUT ${output_file}
+        COMMAND ${CMAKE_C_COMPILER}
+        --target=nvptx64-nvidia-cuda -march=${nvptx_arch}
+        -nogpulib --cuda-path=${CUDA_ROOT} -flto ${ARGN}
+        -c ${SRC_PATH} -o ${output_file}
+        DEPENDS ${SRC_PATH}
+      )
+      add_custom_target(${test_name}.nvptx64 DEPENDS ${output_file})
+    endif()
+  endif()
+
+  # Try to build with support for AMDGPU devices.
+  if("amdgpu" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+    check_cxx_compiler_flag("--target=amdgcn-amd-amdhsa -mcpu=native" PLATFORM_HAS_AMDGPU)
+
+    if(OFFLOAD_TESTS_FORCE_AMDGPU_ARCH)
+      set(amdgpu_arch "${OFFLOAD_TESTS_FORCE_AMDGPU_ARCH}")
+    elseif(PLATFORM_HAS_AMDGPU)
+      set(amdgpu_arch "native")
+    endif()
+
+    if(amdgpu_arch)
+      set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${test_name}.amdgpu.bin")
+      add_custom_command(
+        OUTPUT ${output_file}
+        COMMAND ${CMAKE_C_COMPILER}
+        --target=amdgcn-amd-amdhsa -mcpu=${amdgpu_arch}
+        -nogpulib -flto ${ARGN} -c ${SRC_PATH} -o ${output_file}
+        DEPENDS ${SRC_PATH}
+      )
+      add_custom_target(${test_name}.amdgpu DEPENDS ${output_file})
+    endif()
+  endif()
+
+  # Create a single dependency target for the device code.
+  add_custom_target(${test_name}.bin)
+  if(TARGET ${test_name}.amdgpu)
+    add_dependencies(${test_name}.bin ${test_name}.amdgpu)
+  endif()
+  if(TARGET ${test_name}.nvptx64)
+    add_dependencies(${test_name}.bin ${test_name}.nvptx64)
+  endif()
+endfunction()
+
 function(add_offload_unittest test_dirname)
   set(target_name "${test_dirname}.unittests")
 
@@ -9,10 +75,15 @@ function(add_offload_unittest test_dirname)
   add_unittest(OffloadUnitTests "${target_name}"
     ${CMAKE_CURRENT_SOURCE_DIR}/common/Environment.cpp
     ${files})
-  add_dependencies(${target_name} ${PLUGINS_TEST_COMMON} OffloadUnitTestsDeviceBins)
+  add_dependencies(${target_name} ${PLUGINS_TEST_COMMON} offload_device_binaries)
   target_compile_definitions(${target_name} PRIVATE DEVICE_CODE_PATH="${OFFLOAD_TEST_DEVICE_CODE_PATH}")
   target_link_libraries(${target_name} PRIVATE ${PLUGINS_TEST_COMMON})
   target_include_directories(${target_name} PRIVATE ${PLUGINS_TEST_INCLUDE})
 endfunction()
 
+set(OFFLOAD_TESTS_FORCE_NVPTX_ARCH "" CACHE STRING
+  "Force building of NVPTX device code for Offload unit tests with the given arch, e.g. sm_61")
+set(OFFLOAD_TESTS_FORCE_AMDGPU_ARCH "" CACHE STRING
+  "Force building of AMDGPU device code for Offload unit tests with the given arch, e.g. gfx1030")
+
 add_subdirectory(OffloadAPI)
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index c2e4d0cb24e6..132c7a7c51fb 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -1,72 +1,7 @@
-macro(add_offload_test_device_code test_filename test_name)
-    set(SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${test_filename})
-
-    # Build for NVPTX
-    if(OFFLOAD_TEST_TARGET_NVIDIA)
-        set(BIN_PATH ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.nvptx64.bin)
-        add_custom_command(OUTPUT ${BIN_PATH}
-            COMMAND
-            ${CMAKE_C_COMPILER} --target=nvptx64-nvidia-cuda
-            ${ARGN}
-            -march=${LIBOMPTARGET_DEP_CUDA_ARCH}
-            --cuda-path=${CUDA_ROOT}
-            ${SRC_PATH} -o ${BIN_PATH}
-            DEPENDS ${SRC_PATH}
-        )
-        list(APPEND BIN_PATHS ${BIN_PATH})
-    endif()
-
-    # Build for AMDGPU
-    if(OFFLOAD_TEST_TARGET_AMDGPU)
-        set(BIN_PATH ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.amdgpu.bin)
-        add_custom_command(OUTPUT ${BIN_PATH}
-            COMMAND
-            ${CMAKE_C_COMPILER} --target=amdgcn-amd-amdhsa -nogpulib
-            ${ARGN}
-            -mcpu=${LIBOMPTARGET_DEP_AMDGPU_ARCH}
-            ${SRC_PATH} -o ${BIN_PATH}
-            DEPENDS ${SRC_PATH}
-        )
-        list(APPEND BIN_PATHS ${BIN_PATH})
-    endif()
-
-    # TODO: Build for host CPU
-endmacro()
-
-
-# Decide what device targets to build for. LibomptargetGetDependencies is
-# included at the top-level so the GPUs present on the system are already
-# detected.
-set(OFFLOAD_TESTS_FORCE_NVIDIA_ARCH "" CACHE STRING
-    "Force building of NVPTX device code for Offload unit tests with the given arch, e.g. sm_61")
-set(OFFLOAD_TESTS_FORCE_AMDGPU_ARCH "" CACHE STRING
-    "Force building of AMDGPU device code for Offload unit tests with the given arch, e.g. gfx1030")
-
-find_package(CUDAToolkit QUIET)
-if(CUDAToolkit_FOUND)
-  get_filename_component(CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
-endif()
-if (OFFLOAD_TESTS_FORCE_NVIDIA_ARCH)
-    set(LIBOMPTARGET_DEP_CUDA_ARCH ${OFFLOAD_TESTS_FORCE_NVIDIA_ARCH})
-    set(OFFLOAD_TEST_TARGET_NVIDIA ON)
-elseif (LIBOMPTARGET_FOUND_NVIDIA_GPU AND CUDA_ROOT AND "cuda" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
-    set(OFFLOAD_TEST_TARGET_NVIDIA ON)
-endif()
-
-if (OFFLOAD_TESTS_FORCE_AMDGPU_ARCH)
-    set(LIBOMPTARGET_DEP_AMDGPU_ARCH ${OFFLOAD_TESTS_FORCE_AMDGPU_ARCH})
-    set(OFFLOAD_TEST_TARGET_AMDGPU ON)
-elseif (LIBOMPTARGET_FOUND_AMDGPU_GPU AND "amdgpu" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
-    list(GET LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST 0 LIBOMPTARGET_DEP_AMDGPU_ARCH)
-    set(OFFLOAD_TEST_TARGET_AMDGPU ON)
-endif()
-
 add_offload_test_device_code(foo.c foo)
 add_offload_test_device_code(bar.c bar)
-# By default, amdhsa will add a number of "hidden" arguments to the kernel defintion
-# O3 disables this, and results in a kernel function with actually no arguments as seen by liboffload
+# Compile with optimizations to eliminate AMDGPU implicit arguments.
 add_offload_test_device_code(noargs.c noargs -O3)
 
-add_custom_target(OffloadUnitTestsDeviceBins DEPENDS ${BIN_PATHS})
-
+add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin)
 set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)

From 887222e3526fbe08e748a33f740296ac22bf1ab1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 20 Jun 2025 08:33:59 -0700
Subject: [PATCH 1065/1322] [mlir] Migrate away from ArrayRef(std::nullopt)
 (NFC) (#144989)

ArrayRef has a constructor that accepts std::nullopt.  This
constructor dates back to the days when we still had llvm::Optional.

Since the use of std::nullopt outside the context of std::optional is
kind of abuse and not intuitive to new comers, I would like to move
away from the constructor and eventually remove it.

This patch takes care of the mlir side of the migration, starting with
straightforward places where I see ArrayRef or ValueRange nearby.
Note that ValueRange has a constructor that forwards arguments to an
ArrayRef constructor.
---
 mlir/include/mlir/AsmParser/AsmParserState.h         | 2 +-
 mlir/include/mlir/Dialect/Linalg/Utils/Utils.h       | 2 +-
 mlir/include/mlir/Dialect/Tensor/Utils/Utils.h       | 2 +-
 mlir/include/mlir/ExecutionEngine/ExecutionEngine.h  | 3 +--
 mlir/include/mlir/IR/BlockSupport.h                  | 2 +-
 mlir/include/mlir/IR/Builders.h                      | 7 +++----
 mlir/include/mlir/IR/BuiltinTypes.td                 | 4 ++--
 mlir/include/mlir/IR/PatternMatch.h                  | 7 +++----
 mlir/include/mlir/IR/Region.h                        | 2 +-
 mlir/include/mlir/IR/SymbolTable.h                   | 3 ++-
 mlir/include/mlir/IR/TypeRange.h                     | 2 +-
 mlir/include/mlir/IR/ValueRange.h                    | 4 ++--
 mlir/include/mlir/Rewrite/FrozenRewritePatternSet.h  | 7 +++----
 mlir/include/mlir/Tools/PDLL/AST/Types.h             | 3 +--
 mlir/include/mlir/Transforms/DialectConversion.h     | 2 +-
 mlir/include/mlir/Transforms/Passes.h                | 4 ++--
 mlir/lib/Dialect/GPU/Transforms/DecomposeMemRefs.cpp | 2 +-
 mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h              | 2 +-
 mlir/lib/Pass/PassStatistics.cpp                     | 2 +-
 mlir/unittests/IR/OperationSupportTest.cpp           | 5 ++---
 mlir/unittests/IR/ValueTest.cpp                      | 5 ++---
 21 files changed, 33 insertions(+), 39 deletions(-)

diff --git a/mlir/include/mlir/AsmParser/AsmParserState.h b/mlir/include/mlir/AsmParser/AsmParserState.h
index 98bdc4696b84..e47cb32ee402 100644
--- a/mlir/include/mlir/AsmParser/AsmParserState.h
+++ b/mlir/include/mlir/AsmParser/AsmParserState.h
@@ -195,7 +195,7 @@ public:
   /// Finalize the most recently started operation definition.
   void finalizeOperationDefinition(
       Operation *op, SMRange nameLoc, SMLoc endLoc,
-      ArrayRef<std::pair<unsigned, SMLoc>> resultGroups = std::nullopt);
+      ArrayRef<std::pair<unsigned, SMLoc>> resultGroups = {});
 
   /// Start a definition for a region nested under the current operation.
   void startRegionDefinition();
diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
index fc151d02ceef..48978eb7663d 100644
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -94,7 +94,7 @@ bool isReductionIterator(utils::IteratorType iteratorType);
 /// ```
 Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
                             Value source, Value padding, bool nofold,
-                            ValueRange typeDynDims = std::nullopt);
+                            ValueRange typeDynDims = {});
 
 /// Returns GenericOp that copies an n-D memref. Unlike the current
 /// implementation of memref::CopyOp, this op can further tile, lower to loops
diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
index a1ce4e252c2f..da4c4ffb6d1b 100644
--- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
@@ -30,7 +30,7 @@ namespace tensor {
 // for _static_ dimensions.
 PadOp createPadHighOp(RankedTensorType resType, Value source, Value pad,
                       bool nofold, Location loc, OpBuilder &builder,
-                      ValueRange dynOutDims = std::nullopt);
+                      ValueRange dynOutDims = {});
 
 // Creates dim ops for each dynamic dimension of the ranked tensor argument and
 // returns these as values.
diff --git a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
index 66f49c787c19..96ccebcd5685 100644
--- a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
+++ b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
@@ -157,8 +157,7 @@ public:
 
   /// Invokes the function with the given name passing it the list of opaque
   /// pointers to the actual arguments.
-  llvm::Error invokePacked(StringRef name,
-                           MutableArrayRef<void *> args = std::nullopt);
+  llvm::Error invokePacked(StringRef name, MutableArrayRef<void *> args = {});
 
   /// Trait that defines how a given type is passed to the JIT code. This
   /// defaults to passing the address but can be specialized.
diff --git a/mlir/include/mlir/IR/BlockSupport.h b/mlir/include/mlir/IR/BlockSupport.h
index 41434269d555..f9fbef2f3753 100644
--- a/mlir/include/mlir/IR/BlockSupport.h
+++ b/mlir/include/mlir/IR/BlockSupport.h
@@ -106,7 +106,7 @@ class BlockRange final
           Block *, Block *, Block *> {
 public:
   using RangeBaseT::RangeBaseT;
-  BlockRange(ArrayRef<Block *> blocks = std::nullopt);
+  BlockRange(ArrayRef<Block *> blocks = {});
   BlockRange(SuccessorRange successors);
   template <typename Arg, typename = std::enable_if_t<std::is_constructible<
                               ArrayRef<Block *>, Arg>::value>>
diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h
index d68dbdb1efee..ad59ea63a690 100644
--- a/mlir/include/mlir/IR/Builders.h
+++ b/mlir/include/mlir/IR/Builders.h
@@ -454,15 +454,14 @@ public:
   /// 'parent'. `locs` contains the locations of the inserted arguments, and
   /// should match the size of `argTypes`.
   Block *createBlock(Region *parent, Region::iterator insertPt = {},
-                     TypeRange argTypes = std::nullopt,
-                     ArrayRef<Location> locs = std::nullopt);
+                     TypeRange argTypes = {}, ArrayRef<Location> locs = {});
 
   /// Add new block with 'argTypes' arguments and set the insertion point to the
   /// end of it. The block is placed before 'insertBefore'. `locs` contains the
   /// locations of the inserted arguments, and should match the size of
   /// `argTypes`.
-  Block *createBlock(Block *insertBefore, TypeRange argTypes = std::nullopt,
-                     ArrayRef<Location> locs = std::nullopt);
+  Block *createBlock(Block *insertBefore, TypeRange argTypes = {},
+                     ArrayRef<Location> locs = {});
 
   //===--------------------------------------------------------------------===//
   // Operation Creation
diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td
index 771de01fc8d5..d5663bcbf6a5 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.td
+++ b/mlir/include/mlir/IR/BuiltinTypes.td
@@ -1200,7 +1200,7 @@ def Builtin_UnrankedMemRef : Builtin_Type<"UnrankedMemRef", "unranked_memref", [
     using ShapedType::Trait<UnrankedMemRefType>::getDimSize;
     using ShapedType::Trait<UnrankedMemRefType>::getDynamicDimIndex;
 
-    ArrayRef<int64_t> getShape() const { return std::nullopt; }
+    ArrayRef<int64_t> getShape() const { return {}; }
 
     /// [deprecated] Returns the memory space in old raw integer representation.
     /// New `Attribute getMemorySpace()` method should be used instead.
@@ -1259,7 +1259,7 @@ def Builtin_UnrankedTensor : Builtin_Type<"UnrankedTensor", "unranked_tensor", [
     using ShapedType::Trait<UnrankedTensorType>::getDimSize;
     using ShapedType::Trait<UnrankedTensorType>::getDynamicDimIndex;
 
-    ArrayRef<int64_t> getShape() const { return std::nullopt; }
+    ArrayRef<int64_t> getShape() const { return {}; }
   }];
   let skipDefaultBuilders = 1;
   let genVerifyDecl = 1;
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index 10cfe851765d..b6659a7f915c 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -520,7 +520,7 @@ public:
   /// unreachable operations.
   virtual void inlineBlockBefore(Block *source, Block *dest,
                                  Block::iterator before,
-                                 ValueRange argValues = std::nullopt);
+                                 ValueRange argValues = {});
 
   /// Inline the operations of block 'source' before the operation 'op'. The
   /// source block will be deleted and must have no uses. 'argValues' is used to
@@ -529,7 +529,7 @@ public:
   /// The source block must have no successors. Otherwise, the resulting IR
   /// would have unreachable operations.
   void inlineBlockBefore(Block *source, Operation *op,
-                         ValueRange argValues = std::nullopt);
+                         ValueRange argValues = {});
 
   /// Inline the operations of block 'source' into the end of block 'dest'. The
   /// source block will be deleted and must have no uses. 'argValues' is used to
@@ -537,8 +537,7 @@ public:
   ///
   /// The dest block must have no successors. Otherwise, the resulting IR would
   /// have unreachable operation.
-  void mergeBlocks(Block *source, Block *dest,
-                   ValueRange argValues = std::nullopt);
+  void mergeBlocks(Block *source, Block *dest, ValueRange argValues = {});
 
   /// Split the operations starting at "before" (inclusive) out of the given
   /// block into a new block, and return it.
diff --git a/mlir/include/mlir/IR/Region.h b/mlir/include/mlir/IR/Region.h
index 22cb7037772d..1fcb31675023 100644
--- a/mlir/include/mlir/IR/Region.h
+++ b/mlir/include/mlir/IR/Region.h
@@ -353,7 +353,7 @@ class RegionRange
 public:
   using RangeBaseT::RangeBaseT;
 
-  RegionRange(MutableArrayRef<Region> regions = std::nullopt);
+  RegionRange(MutableArrayRef<Region> regions = {});
 
   template <typename Arg, typename = std::enable_if_t<std::is_constructible<
                               ArrayRef<std::unique_ptr<Region>>, Arg>::value>>
diff --git a/mlir/include/mlir/IR/SymbolTable.h b/mlir/include/mlir/IR/SymbolTable.h
index 557a2ba85cd6..e4622354b898 100644
--- a/mlir/include/mlir/IR/SymbolTable.h
+++ b/mlir/include/mlir/IR/SymbolTable.h
@@ -411,7 +411,8 @@ public:
   /// Return the users of the provided symbol operation.
   ArrayRef<Operation *> getUsers(Operation *symbol) const {
     auto it = symbolToUsers.find(symbol);
-    return it != symbolToUsers.end() ? it->second.getArrayRef() : std::nullopt;
+    return it != symbolToUsers.end() ? it->second.getArrayRef()
+                                     : ArrayRef<Operation *>();
   }
 
   /// Return true if the given symbol has no uses.
diff --git a/mlir/include/mlir/IR/TypeRange.h b/mlir/include/mlir/IR/TypeRange.h
index e098370ae6e5..c6cbf3461bcd 100644
--- a/mlir/include/mlir/IR/TypeRange.h
+++ b/mlir/include/mlir/IR/TypeRange.h
@@ -37,7 +37,7 @@ class TypeRange : public llvm::detail::indexed_accessor_range_base<
                       Type, Type, Type> {
 public:
   using RangeBaseT::RangeBaseT;
-  TypeRange(ArrayRef<Type> types = std::nullopt);
+  TypeRange(ArrayRef<Type> types = {});
   explicit TypeRange(OperandRange values);
   explicit TypeRange(ResultRange values);
   explicit TypeRange(ValueRange values);
diff --git a/mlir/include/mlir/IR/ValueRange.h b/mlir/include/mlir/IR/ValueRange.h
index 0c33e2b596b9..f04ed0544c0f 100644
--- a/mlir/include/mlir/IR/ValueRange.h
+++ b/mlir/include/mlir/IR/ValueRange.h
@@ -126,7 +126,7 @@ public:
   /// and range length. `operandSegments` is an optional set of operand segments
   /// to be updated when mutating the operand list.
   MutableOperandRange(Operation *owner, unsigned start, unsigned length,
-                      ArrayRef<OperandSegment> operandSegments = std::nullopt);
+                      ArrayRef<OperandSegment> operandSegments = {});
   MutableOperandRange(Operation *owner);
 
   /// Construct a new mutable range for the given OpOperand.
@@ -409,7 +409,7 @@ public:
       : ValueRange(ResultRange(values)) {}
   ValueRange(ArrayRef<BlockArgument> values)
       : ValueRange(ArrayRef<Value>(values.data(), values.size())) {}
-  ValueRange(ArrayRef<Value> values = std::nullopt);
+  ValueRange(ArrayRef<Value> values = {});
   ValueRange(OperandRange values);
   ValueRange(ResultRange values);
 
diff --git a/mlir/include/mlir/Rewrite/FrozenRewritePatternSet.h b/mlir/include/mlir/Rewrite/FrozenRewritePatternSet.h
index 4c6e3cd9ce6f..d6c431206e06 100644
--- a/mlir/include/mlir/Rewrite/FrozenRewritePatternSet.h
+++ b/mlir/include/mlir/Rewrite/FrozenRewritePatternSet.h
@@ -47,10 +47,9 @@ public:
   /// `RewritePatternSet::addWithLabel`. Debug names may be empty, but patterns
   /// created with `RewritePattern::create` have their default debug name set to
   /// their type name.
-  FrozenRewritePatternSet(
-      RewritePatternSet &&patterns,
-      ArrayRef<std::string> disabledPatternLabels = std::nullopt,
-      ArrayRef<std::string> enabledPatternLabels = std::nullopt);
+  FrozenRewritePatternSet(RewritePatternSet &&patterns,
+                          ArrayRef<std::string> disabledPatternLabels = {},
+                          ArrayRef<std::string> enabledPatternLabels = {});
 
   /// Return the op specific native patterns held by this list.
   const OpSpecificNativePatternListT &getOpSpecificNativePatterns() const {
diff --git a/mlir/include/mlir/Tools/PDLL/AST/Types.h b/mlir/include/mlir/Tools/PDLL/AST/Types.h
index 57161db5fdba..538ea7c61b44 100644
--- a/mlir/include/mlir/Tools/PDLL/AST/Types.h
+++ b/mlir/include/mlir/Tools/PDLL/AST/Types.h
@@ -226,8 +226,7 @@ public:
   /// Return an instance of the Tuple type.
   static TupleType get(Context &context, ArrayRef<Type> elementTypes,
                        ArrayRef<StringRef> elementNames);
-  static TupleType get(Context &context,
-                       ArrayRef<Type> elementTypes = std::nullopt);
+  static TupleType get(Context &context, ArrayRef<Type> elementTypes = {});
 
   /// Return the element types of this tuple.
   ArrayRef<Type> getElementTypes() const;
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index e7d05c3ce1ad..5a5f116073a9 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -826,7 +826,7 @@ public:
 
   /// PatternRewriter hook for inlining the ops of a block into another block.
   void inlineBlockBefore(Block *source, Block *dest, Block::iterator before,
-                         ValueRange argValues = std::nullopt) override;
+                         ValueRange argValues = {}) override;
   using PatternRewriter::inlineBlockBefore;
 
   /// PatternRewriter hook for updating the given operation in-place.
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index 41f208216374..9cd2ef34e15e 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -62,8 +62,8 @@ std::unique_ptr<Pass> createCanonicalizerPass();
 /// set to their type name.
 std::unique_ptr<Pass>
 createCanonicalizerPass(const GreedyRewriteConfig &config,
-                        ArrayRef<std::string> disabledPatterns = std::nullopt,
-                        ArrayRef<std::string> enabledPatterns = std::nullopt);
+                        ArrayRef<std::string> disabledPatterns = {},
+                        ArrayRef<std::string> enabledPatterns = {});
 
 /// Creates a pass to perform control-flow sinking.
 std::unique_ptr<Pass> createControlFlowSinkPass();
diff --git a/mlir/lib/Dialect/GPU/Transforms/DecomposeMemRefs.cpp b/mlir/lib/Dialect/GPU/Transforms/DecomposeMemRefs.cpp
index a64dc7f74a19..695d43b04cff 100644
--- a/mlir/lib/Dialect/GPU/Transforms/DecomposeMemRefs.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/DecomposeMemRefs.cpp
@@ -55,7 +55,7 @@ static bool isInsideLaunch(Operation *op) {
 static std::tuple<Value, OpFoldResult, SmallVector<OpFoldResult>>
 getFlatOffsetAndStrides(OpBuilder &rewriter, Location loc, Value source,
                         ArrayRef<OpFoldResult> subOffsets,
-                        ArrayRef<OpFoldResult> subStrides = std::nullopt) {
+                        ArrayRef<OpFoldResult> subStrides = {}) {
   auto sourceType = cast<MemRefType>(source.getType());
   auto sourceRank = static_cast<unsigned>(sourceType.getRank());
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h b/mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h
index 8767b1c3ffc5..081a89d0b5dc 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h
+++ b/mlir/lib/Dialect/LLVMIR/IR/TypeDetail.h
@@ -69,7 +69,7 @@ public:
   class Key {
   public:
     /// Constructs a key for an identified struct.
-    Key(StringRef name, bool opaque, ArrayRef<Type> types = std::nullopt)
+    Key(StringRef name, bool opaque, ArrayRef<Type> types = {})
         : types(types), name(name), identified(true), packed(false),
           opaque(opaque) {}
     /// Constructs a key for a literal struct.
diff --git a/mlir/lib/Pass/PassStatistics.cpp b/mlir/lib/Pass/PassStatistics.cpp
index 779dcfe7b666..01191aa82444 100644
--- a/mlir/lib/Pass/PassStatistics.cpp
+++ b/mlir/lib/Pass/PassStatistics.cpp
@@ -27,7 +27,7 @@ struct Statistic {
 
 /// Utility to print a pass entry in the statistics output.
 static void printPassEntry(raw_ostream &os, unsigned indent, StringRef pass,
-                           MutableArrayRef<Statistic> stats = std::nullopt) {
+                           MutableArrayRef<Statistic> stats = {}) {
   os.indent(indent) << pass << "\n";
   if (stats.empty())
     return;
diff --git a/mlir/unittests/IR/OperationSupportTest.cpp b/mlir/unittests/IR/OperationSupportTest.cpp
index 18ee9d71cb9f..6ea74a988ca8 100644
--- a/mlir/unittests/IR/OperationSupportTest.cpp
+++ b/mlir/unittests/IR/OperationSupportTest.cpp
@@ -18,9 +18,8 @@
 using namespace mlir;
 using namespace mlir::detail;
 
-static Operation *createOp(MLIRContext *context,
-                           ArrayRef<Value> operands = std::nullopt,
-                           ArrayRef<Type> resultTypes = std::nullopt,
+static Operation *createOp(MLIRContext *context, ArrayRef<Value> operands = {},
+                           ArrayRef<Type> resultTypes = {},
                            unsigned int numRegions = 0) {
   context->allowUnregisteredDialects();
   return Operation::create(
diff --git a/mlir/unittests/IR/ValueTest.cpp b/mlir/unittests/IR/ValueTest.cpp
index 1a84b7ca13df..58678224780b 100644
--- a/mlir/unittests/IR/ValueTest.cpp
+++ b/mlir/unittests/IR/ValueTest.cpp
@@ -16,9 +16,8 @@
 
 using namespace mlir;
 
-static Operation *createOp(MLIRContext *context,
-                           ArrayRef<Value> operands = std::nullopt,
-                           ArrayRef<Type> resultTypes = std::nullopt,
+static Operation *createOp(MLIRContext *context, ArrayRef<Value> operands = {},
+                           ArrayRef<Type> resultTypes = {},
                            unsigned int numRegions = 0) {
   context->allowUnregisteredDialects();
   return Operation::create(

From 9524bfb27020d31b9474f595b7c0e5d2e1ac65f5 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 20 Jun 2025 10:48:04 -0500
Subject: [PATCH 1066/1322] [lldb] Add Model Context Protocol (MCP) support to
 LLDB (#143628)

This PR adds an MCP (Model Context Protocol ) server to LLDB. For
motivation and background, please refer to the corresponding RFC:
https://discourse.llvm.org/t/rfc-adding-mcp-support-to-lldb/86798

I implemented this as a new kind of plugin. The idea is that we could
support multiple protocol servers (e.g. if we want to support DAP from
within LLDB). This also introduces a corresponding top-level command
(`protocol-server`) with two subcommands to `start` and `stop` the
server.

```
(lldb) protocol-server start MCP tcp://localhost:1234
MCP server started with connection listeners: connection://[::1]:1234, connection://[127.0.0.1]:1234
```

The MCP sever supports one tool (`lldb_command`) which executes a
command, but can easily be extended with more commands.
---
 lldb/cmake/modules/LLDBConfig.cmake           |   1 +
 lldb/include/lldb/Core/Debugger.h             |   6 +
 lldb/include/lldb/Core/PluginManager.h        |  11 +
 lldb/include/lldb/Core/ProtocolServer.h       |  39 +++
 .../Interpreter/CommandOptionArgumentTable.h  |   1 +
 lldb/include/lldb/lldb-enumerations.h         |   1 +
 lldb/include/lldb/lldb-forward.h              |   3 +-
 lldb/include/lldb/lldb-private-interfaces.h   |   2 +
 lldb/source/Commands/CMakeLists.txt           |   1 +
 .../Commands/CommandObjectProtocolServer.cpp  | 176 ++++++++++
 .../Commands/CommandObjectProtocolServer.h    |  25 ++
 lldb/source/Core/CMakeLists.txt               |   1 +
 lldb/source/Core/Debugger.cpp                 |  24 ++
 lldb/source/Core/PluginManager.cpp            |  32 ++
 lldb/source/Core/ProtocolServer.cpp           |  21 ++
 .../source/Interpreter/CommandInterpreter.cpp |   2 +
 lldb/source/Plugins/CMakeLists.txt            |   4 +
 lldb/source/Plugins/Protocol/CMakeLists.txt   |   1 +
 .../Plugins/Protocol/MCP/CMakeLists.txt       |  13 +
 lldb/source/Plugins/Protocol/MCP/MCPError.cpp |  34 ++
 lldb/source/Plugins/Protocol/MCP/MCPError.h   |  33 ++
 lldb/source/Plugins/Protocol/MCP/Protocol.cpp | 214 ++++++++++++
 lldb/source/Plugins/Protocol/MCP/Protocol.h   | 128 +++++++
 .../Protocol/MCP/ProtocolServerMCP.cpp        | 327 ++++++++++++++++++
 .../Plugins/Protocol/MCP/ProtocolServerMCP.h  | 100 ++++++
 lldb/source/Plugins/Protocol/MCP/Tool.cpp     |  81 +++++
 lldb/source/Plugins/Protocol/MCP/Tool.h       |  56 +++
 lldb/unittests/CMakeLists.txt                 |   4 +
 lldb/unittests/DAP/ProtocolTypesTest.cpp      |  38 +-
 lldb/unittests/Protocol/CMakeLists.txt        |  12 +
 .../Protocol/ProtocolMCPServerTest.cpp        | 291 ++++++++++++++++
 lldb/unittests/Protocol/ProtocolMCPTest.cpp   | 135 ++++++++
 lldb/unittests/TestingSupport/TestUtilities.h |   9 +
 33 files changed, 1803 insertions(+), 23 deletions(-)
 create mode 100644 lldb/include/lldb/Core/ProtocolServer.h
 create mode 100644 lldb/source/Commands/CommandObjectProtocolServer.cpp
 create mode 100644 lldb/source/Commands/CommandObjectProtocolServer.h
 create mode 100644 lldb/source/Core/ProtocolServer.cpp
 create mode 100644 lldb/source/Plugins/Protocol/CMakeLists.txt
 create mode 100644 lldb/source/Plugins/Protocol/MCP/CMakeLists.txt
 create mode 100644 lldb/source/Plugins/Protocol/MCP/MCPError.cpp
 create mode 100644 lldb/source/Plugins/Protocol/MCP/MCPError.h
 create mode 100644 lldb/source/Plugins/Protocol/MCP/Protocol.cpp
 create mode 100644 lldb/source/Plugins/Protocol/MCP/Protocol.h
 create mode 100644 lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp
 create mode 100644 lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h
 create mode 100644 lldb/source/Plugins/Protocol/MCP/Tool.cpp
 create mode 100644 lldb/source/Plugins/Protocol/MCP/Tool.h
 create mode 100644 lldb/unittests/Protocol/CMakeLists.txt
 create mode 100644 lldb/unittests/Protocol/ProtocolMCPServerTest.cpp
 create mode 100644 lldb/unittests/Protocol/ProtocolMCPTest.cpp

diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index 37b823feb584..8c30b6e09d2c 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -67,6 +67,7 @@ add_optional_dependency(LLDB_ENABLE_FBSDVMCORE "Enable libfbsdvmcore support in
 
 option(LLDB_USE_ENTITLEMENTS "When codesigning, use entitlements if available" ON)
 option(LLDB_BUILD_FRAMEWORK "Build LLDB.framework (Darwin only)" OFF)
+option(LLDB_ENABLE_PROTOCOL_SERVERS "Enable protocol servers (e.g. MCP) in LLDB" ON)
 option(LLDB_NO_INSTALL_DEFAULT_RPATH "Disable default RPATH settings in binaries" OFF)
 option(LLDB_USE_SYSTEM_DEBUGSERVER "Use the system's debugserver for testing (Darwin only)." OFF)
 option(LLDB_SKIP_STRIP "Whether to skip stripping of binaries when installing lldb." OFF)
diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h
index 2087ef2a1156..9f82466a8341 100644
--- a/lldb/include/lldb/Core/Debugger.h
+++ b/lldb/include/lldb/Core/Debugger.h
@@ -602,6 +602,10 @@ public:
   void FlushProcessOutput(Process &process, bool flush_stdout,
                           bool flush_stderr);
 
+  void AddProtocolServer(lldb::ProtocolServerSP protocol_server_sp);
+  void RemoveProtocolServer(lldb::ProtocolServerSP protocol_server_sp);
+  lldb::ProtocolServerSP GetProtocolServer(llvm::StringRef protocol) const;
+
   SourceManager::SourceFileCache &GetSourceFileCache() {
     return m_source_file_cache;
   }
@@ -772,6 +776,8 @@ protected:
   mutable std::mutex m_progress_reports_mutex;
   /// @}
 
+  llvm::SmallVector<lldb::ProtocolServerSP> m_protocol_servers;
+
   std::mutex m_destroy_callback_mutex;
   lldb::callback_token_t m_destroy_callback_next_token = 0;
   struct DestroyCallbackInfo {
diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index 1d7c976f3c38..d1af25988e50 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -321,6 +321,17 @@ public:
   static void AutoCompleteProcessName(llvm::StringRef partial_name,
                                       CompletionRequest &request);
 
+  // Protocol
+  static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
+                             ProtocolServerCreateInstance create_callback);
+
+  static bool UnregisterPlugin(ProtocolServerCreateInstance create_callback);
+
+  static llvm::StringRef GetProtocolServerPluginNameAtIndex(uint32_t idx);
+
+  static ProtocolServerCreateInstance
+  GetProtocolCreateCallbackForPluginName(llvm::StringRef name);
+
   // Register Type Provider
   static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
                              RegisterTypeBuilderCreateInstance create_callback);
diff --git a/lldb/include/lldb/Core/ProtocolServer.h b/lldb/include/lldb/Core/ProtocolServer.h
new file mode 100644
index 000000000000..fafe46090432
--- /dev/null
+++ b/lldb/include/lldb/Core/ProtocolServer.h
@@ -0,0 +1,39 @@
+//===-- ProtocolServer.h --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_CORE_PROTOCOLSERVER_H
+#define LLDB_CORE_PROTOCOLSERVER_H
+
+#include "lldb/Core/PluginInterface.h"
+#include "lldb/Host/Socket.h"
+#include "lldb/lldb-private-interfaces.h"
+
+namespace lldb_private {
+
+class ProtocolServer : public PluginInterface {
+public:
+  ProtocolServer() = default;
+  virtual ~ProtocolServer() = default;
+
+  static lldb::ProtocolServerSP Create(llvm::StringRef name,
+                                       Debugger &debugger);
+
+  struct Connection {
+    Socket::SocketProtocol protocol;
+    std::string name;
+  };
+
+  virtual llvm::Error Start(Connection connection) = 0;
+  virtual llvm::Error Stop() = 0;
+
+  virtual Socket *GetSocket() const = 0;
+};
+
+} // namespace lldb_private
+
+#endif
diff --git a/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h b/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h
index 8535dfcf46da..4face717531b 100644
--- a/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h
+++ b/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h
@@ -315,6 +315,7 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = {
     { lldb::eArgTypeCPUName, "cpu-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a CPU." },
     { lldb::eArgTypeCPUFeatures, "cpu-features", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The CPU feature string." },
     { lldb::eArgTypeManagedPlugin, "managed-plugin", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Plugins managed by the PluginManager" },
+    { lldb::eArgTypeProtocol, "protocol", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of the protocol." },
     // clang-format on
 };
 
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index eeb7299a354e..69e8671b6e21 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -664,6 +664,7 @@ enum CommandArgumentType {
   eArgTypeCPUName,
   eArgTypeCPUFeatures,
   eArgTypeManagedPlugin,
+  eArgTypeProtocol,
   eArgTypeLastArg // Always keep this entry as the last entry in this
                   // enumeration!!
 };
diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h
index c664d1398f74..558818e8e230 100644
--- a/lldb/include/lldb/lldb-forward.h
+++ b/lldb/include/lldb/lldb-forward.h
@@ -164,13 +164,13 @@ class PersistentExpressionState;
 class Platform;
 class Process;
 class ProcessAttachInfo;
-class ProcessLaunchInfo;
 class ProcessInfo;
 class ProcessInstanceInfo;
 class ProcessInstanceInfoMatch;
 class ProcessLaunchInfo;
 class ProcessModID;
 class Property;
+class ProtocolServer;
 class Queue;
 class QueueImpl;
 class QueueItem;
@@ -391,6 +391,7 @@ typedef std::shared_ptr<lldb_private::Platform> PlatformSP;
 typedef std::shared_ptr<lldb_private::Process> ProcessSP;
 typedef std::shared_ptr<lldb_private::ProcessAttachInfo> ProcessAttachInfoSP;
 typedef std::shared_ptr<lldb_private::ProcessLaunchInfo> ProcessLaunchInfoSP;
+typedef std::shared_ptr<lldb_private::ProtocolServer> ProtocolServerSP;
 typedef std::weak_ptr<lldb_private::Process> ProcessWP;
 typedef std::shared_ptr<lldb_private::RegisterCheckpoint> RegisterCheckpointSP;
 typedef std::shared_ptr<lldb_private::RegisterContext> RegisterContextSP;
diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h
index d366dbd1d783..34eaaa8e581e 100644
--- a/lldb/include/lldb/lldb-private-interfaces.h
+++ b/lldb/include/lldb/lldb-private-interfaces.h
@@ -81,6 +81,8 @@ typedef lldb::PlatformSP (*PlatformCreateInstance)(bool force,
 typedef lldb::ProcessSP (*ProcessCreateInstance)(
     lldb::TargetSP target_sp, lldb::ListenerSP listener_sp,
     const FileSpec *crash_file_path, bool can_connect);
+typedef lldb::ProtocolServerSP (*ProtocolServerCreateInstance)(
+    Debugger &debugger);
 typedef lldb::RegisterTypeBuilderSP (*RegisterTypeBuilderCreateInstance)(
     Target &target);
 typedef lldb::ScriptInterpreterSP (*ScriptInterpreterCreateInstance)(
diff --git a/lldb/source/Commands/CMakeLists.txt b/lldb/source/Commands/CMakeLists.txt
index 1ea51acec5f1..69e4c45f0b8e 100644
--- a/lldb/source/Commands/CMakeLists.txt
+++ b/lldb/source/Commands/CMakeLists.txt
@@ -23,6 +23,7 @@ add_lldb_library(lldbCommands NO_PLUGIN_DEPENDENCIES
   CommandObjectPlatform.cpp
   CommandObjectPlugin.cpp
   CommandObjectProcess.cpp
+  CommandObjectProtocolServer.cpp
   CommandObjectQuit.cpp
   CommandObjectRegexCommand.cpp
   CommandObjectRegister.cpp
diff --git a/lldb/source/Commands/CommandObjectProtocolServer.cpp b/lldb/source/Commands/CommandObjectProtocolServer.cpp
new file mode 100644
index 000000000000..420fc5fdddad
--- /dev/null
+++ b/lldb/source/Commands/CommandObjectProtocolServer.cpp
@@ -0,0 +1,176 @@
+//===-- CommandObjectProtocolServer.cpp
+//----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CommandObjectProtocolServer.h"
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Core/ProtocolServer.h"
+#include "lldb/Host/Socket.h"
+#include "lldb/Interpreter/CommandInterpreter.h"
+#include "lldb/Interpreter/CommandReturnObject.h"
+#include "lldb/Utility/UriParser.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/FormatAdapters.h"
+
+using namespace llvm;
+using namespace lldb;
+using namespace lldb_private;
+
+#define LLDB_OPTIONS_mcp
+#include "CommandOptions.inc"
+
+static std::vector<llvm::StringRef> GetSupportedProtocols() {
+  std::vector<llvm::StringRef> supported_protocols;
+  size_t i = 0;
+
+  for (llvm::StringRef protocol_name =
+           PluginManager::GetProtocolServerPluginNameAtIndex(i++);
+       !protocol_name.empty();
+       protocol_name = PluginManager::GetProtocolServerPluginNameAtIndex(i++)) {
+    supported_protocols.push_back(protocol_name);
+  }
+
+  return supported_protocols;
+}
+
+class CommandObjectProtocolServerStart : public CommandObjectParsed {
+public:
+  CommandObjectProtocolServerStart(CommandInterpreter &interpreter)
+      : CommandObjectParsed(interpreter, "protocol-server start",
+                            "start protocol server",
+                            "protocol-server start <protocol> <connection>") {
+    AddSimpleArgumentList(lldb::eArgTypeProtocol, eArgRepeatPlain);
+    AddSimpleArgumentList(lldb::eArgTypeConnectURL, eArgRepeatPlain);
+  }
+
+  ~CommandObjectProtocolServerStart() override = default;
+
+protected:
+  void DoExecute(Args &args, CommandReturnObject &result) override {
+    if (args.GetArgumentCount() < 1) {
+      result.AppendError("no protocol specified");
+      return;
+    }
+
+    llvm::StringRef protocol = args.GetArgumentAtIndex(0);
+    std::vector<llvm::StringRef> supported_protocols = GetSupportedProtocols();
+    if (llvm::find(supported_protocols, protocol) ==
+        supported_protocols.end()) {
+      result.AppendErrorWithFormatv(
+          "unsupported protocol: {0}. Supported protocols are: {1}", protocol,
+          llvm::join(GetSupportedProtocols(), ", "));
+      return;
+    }
+
+    if (args.GetArgumentCount() < 2) {
+      result.AppendError("no connection specified");
+      return;
+    }
+    llvm::StringRef connection_uri = args.GetArgumentAtIndex(1);
+
+    ProtocolServerSP server_sp = GetDebugger().GetProtocolServer(protocol);
+    if (!server_sp)
+      server_sp = ProtocolServer::Create(protocol, GetDebugger());
+
+    const char *connection_error =
+        "unsupported connection specifier, expected 'accept:///path' or "
+        "'listen://[host]:port', got '{0}'.";
+    auto uri = lldb_private::URI::Parse(connection_uri);
+    if (!uri) {
+      result.AppendErrorWithFormatv(connection_error, connection_uri);
+      return;
+    }
+
+    std::optional<Socket::ProtocolModePair> protocol_and_mode =
+        Socket::GetProtocolAndMode(uri->scheme);
+    if (!protocol_and_mode || protocol_and_mode->second != Socket::ModeAccept) {
+      result.AppendErrorWithFormatv(connection_error, connection_uri);
+      return;
+    }
+
+    ProtocolServer::Connection connection;
+    connection.protocol = protocol_and_mode->first;
+    connection.name =
+        formatv("[{0}]:{1}", uri->hostname.empty() ? "0.0.0.0" : uri->hostname,
+                uri->port.value_or(0));
+
+    if (llvm::Error error = server_sp->Start(connection)) {
+      result.AppendErrorWithFormatv("{0}", llvm::fmt_consume(std::move(error)));
+      return;
+    }
+
+    GetDebugger().AddProtocolServer(server_sp);
+
+    if (Socket *socket = server_sp->GetSocket()) {
+      std::string address =
+          llvm::join(socket->GetListeningConnectionURI(), ", ");
+      result.AppendMessageWithFormatv(
+          "{0} server started with connection listeners: {1}", protocol,
+          address);
+    }
+  }
+};
+
+class CommandObjectProtocolServerStop : public CommandObjectParsed {
+public:
+  CommandObjectProtocolServerStop(CommandInterpreter &interpreter)
+      : CommandObjectParsed(interpreter, "protocol-server stop",
+                            "stop protocol server",
+                            "protocol-server stop <protocol>") {
+    AddSimpleArgumentList(lldb::eArgTypeProtocol, eArgRepeatPlain);
+  }
+
+  ~CommandObjectProtocolServerStop() override = default;
+
+protected:
+  void DoExecute(Args &args, CommandReturnObject &result) override {
+    if (args.GetArgumentCount() < 1) {
+      result.AppendError("no protocol specified");
+      return;
+    }
+
+    llvm::StringRef protocol = args.GetArgumentAtIndex(0);
+    std::vector<llvm::StringRef> supported_protocols = GetSupportedProtocols();
+    if (llvm::find(supported_protocols, protocol) ==
+        supported_protocols.end()) {
+      result.AppendErrorWithFormatv(
+          "unsupported protocol: {0}. Supported protocols are: {1}", protocol,
+          llvm::join(GetSupportedProtocols(), ", "));
+      return;
+    }
+
+    Debugger &debugger = GetDebugger();
+
+    ProtocolServerSP server_sp = debugger.GetProtocolServer(protocol);
+    if (!server_sp) {
+      result.AppendError(
+          llvm::formatv("no {0} protocol server running", protocol).str());
+      return;
+    }
+
+    if (llvm::Error error = server_sp->Stop()) {
+      result.AppendErrorWithFormatv("{0}", llvm::fmt_consume(std::move(error)));
+      return;
+    }
+
+    debugger.RemoveProtocolServer(server_sp);
+  }
+};
+
+CommandObjectProtocolServer::CommandObjectProtocolServer(
+    CommandInterpreter &interpreter)
+    : CommandObjectMultiword(interpreter, "protocol-server",
+                             "Start and stop a protocol server.",
+                             "protocol-server") {
+  LoadSubCommand("start", CommandObjectSP(new CommandObjectProtocolServerStart(
+                              interpreter)));
+  LoadSubCommand("stop", CommandObjectSP(
+                             new CommandObjectProtocolServerStop(interpreter)));
+}
+
+CommandObjectProtocolServer::~CommandObjectProtocolServer() = default;
diff --git a/lldb/source/Commands/CommandObjectProtocolServer.h b/lldb/source/Commands/CommandObjectProtocolServer.h
new file mode 100644
index 000000000000..3591216b014c
--- /dev/null
+++ b/lldb/source/Commands/CommandObjectProtocolServer.h
@@ -0,0 +1,25 @@
+//===-- CommandObjectProtocolServer.h
+//------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_SOURCE_COMMANDS_COMMANDOBJECTPROTOCOLSERVER_H
+#define LLDB_SOURCE_COMMANDS_COMMANDOBJECTPROTOCOLSERVER_H
+
+#include "lldb/Interpreter/CommandObjectMultiword.h"
+
+namespace lldb_private {
+
+class CommandObjectProtocolServer : public CommandObjectMultiword {
+public:
+  CommandObjectProtocolServer(CommandInterpreter &interpreter);
+  ~CommandObjectProtocolServer() override;
+};
+
+} // namespace lldb_private
+
+#endif // LLDB_SOURCE_COMMANDS_COMMANDOBJECTMCP_H
diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt
index d6b75bca7f2d..df35bd5c025f 100644
--- a/lldb/source/Core/CMakeLists.txt
+++ b/lldb/source/Core/CMakeLists.txt
@@ -46,6 +46,7 @@ add_lldb_library(lldbCore NO_PLUGIN_DEPENDENCIES
   Opcode.cpp
   PluginManager.cpp
   Progress.cpp
+  ProtocolServer.cpp
   Statusline.cpp
   RichManglingContext.cpp
   SearchFilter.cpp
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index c9935f2d745f..33d1053fd8a6 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -16,6 +16,7 @@
 #include "lldb/Core/ModuleSpec.h"
 #include "lldb/Core/PluginManager.h"
 #include "lldb/Core/Progress.h"
+#include "lldb/Core/ProtocolServer.h"
 #include "lldb/Core/StreamAsynchronousIO.h"
 #include "lldb/Core/Telemetry.h"
 #include "lldb/DataFormatters/DataVisualization.h"
@@ -2375,3 +2376,26 @@ llvm::ThreadPoolInterface &Debugger::GetThreadPool() {
          "Debugger::GetThreadPool called before Debugger::Initialize");
   return *g_thread_pool;
 }
+
+void Debugger::AddProtocolServer(lldb::ProtocolServerSP protocol_server_sp) {
+  assert(protocol_server_sp &&
+         GetProtocolServer(protocol_server_sp->GetPluginName()) == nullptr);
+  m_protocol_servers.push_back(protocol_server_sp);
+}
+
+void Debugger::RemoveProtocolServer(lldb::ProtocolServerSP protocol_server_sp) {
+  auto it = llvm::find(m_protocol_servers, protocol_server_sp);
+  if (it != m_protocol_servers.end())
+    m_protocol_servers.erase(it);
+}
+
+lldb::ProtocolServerSP
+Debugger::GetProtocolServer(llvm::StringRef protocol) const {
+  for (ProtocolServerSP protocol_server_sp : m_protocol_servers) {
+    if (!protocol_server_sp)
+      continue;
+    if (protocol_server_sp->GetPluginName() == protocol)
+      return protocol_server_sp;
+  }
+  return nullptr;
+}
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index dfa865929b64..dc0731c04eef 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -1152,6 +1152,38 @@ void PluginManager::AutoCompleteProcessName(llvm::StringRef name,
   }
 }
 
+#pragma mark ProtocolServer
+
+typedef PluginInstance<ProtocolServerCreateInstance> ProtocolServerInstance;
+typedef PluginInstances<ProtocolServerInstance> ProtocolServerInstances;
+
+static ProtocolServerInstances &GetProtocolServerInstances() {
+  static ProtocolServerInstances g_instances;
+  return g_instances;
+}
+
+bool PluginManager::RegisterPlugin(
+    llvm::StringRef name, llvm::StringRef description,
+    ProtocolServerCreateInstance create_callback) {
+  return GetProtocolServerInstances().RegisterPlugin(name, description,
+                                                     create_callback);
+}
+
+bool PluginManager::UnregisterPlugin(
+    ProtocolServerCreateInstance create_callback) {
+  return GetProtocolServerInstances().UnregisterPlugin(create_callback);
+}
+
+llvm::StringRef
+PluginManager::GetProtocolServerPluginNameAtIndex(uint32_t idx) {
+  return GetProtocolServerInstances().GetNameAtIndex(idx);
+}
+
+ProtocolServerCreateInstance
+PluginManager::GetProtocolCreateCallbackForPluginName(llvm::StringRef name) {
+  return GetProtocolServerInstances().GetCallbackForName(name);
+}
+
 #pragma mark RegisterTypeBuilder
 
 struct RegisterTypeBuilderInstance
diff --git a/lldb/source/Core/ProtocolServer.cpp b/lldb/source/Core/ProtocolServer.cpp
new file mode 100644
index 000000000000..d57a047afa7b
--- /dev/null
+++ b/lldb/source/Core/ProtocolServer.cpp
@@ -0,0 +1,21 @@
+//===-- ProtocolServer.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Core/ProtocolServer.h"
+#include "lldb/Core/PluginManager.h"
+
+using namespace lldb_private;
+using namespace lldb;
+
+ProtocolServerSP ProtocolServer::Create(llvm::StringRef name,
+                                        Debugger &debugger) {
+  if (ProtocolServerCreateInstance create_callback =
+          PluginManager::GetProtocolCreateCallbackForPluginName(name))
+    return create_callback(debugger);
+  return nullptr;
+}
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index 4f9ae104dede..00c3472444d2 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -30,6 +30,7 @@
 #include "Commands/CommandObjectPlatform.h"
 #include "Commands/CommandObjectPlugin.h"
 #include "Commands/CommandObjectProcess.h"
+#include "Commands/CommandObjectProtocolServer.h"
 #include "Commands/CommandObjectQuit.h"
 #include "Commands/CommandObjectRegexCommand.h"
 #include "Commands/CommandObjectRegister.h"
@@ -574,6 +575,7 @@ void CommandInterpreter::LoadCommandDictionary() {
   REGISTER_COMMAND_OBJECT("platform", CommandObjectPlatform);
   REGISTER_COMMAND_OBJECT("plugin", CommandObjectPlugin);
   REGISTER_COMMAND_OBJECT("process", CommandObjectMultiwordProcess);
+  REGISTER_COMMAND_OBJECT("protocol-server", CommandObjectProtocolServer);
   REGISTER_COMMAND_OBJECT("quit", CommandObjectQuit);
   REGISTER_COMMAND_OBJECT("register", CommandObjectRegister);
   REGISTER_COMMAND_OBJECT("scripting", CommandObjectMultiwordScripting);
diff --git a/lldb/source/Plugins/CMakeLists.txt b/lldb/source/Plugins/CMakeLists.txt
index 854f589f45ae..08f444e7b15e 100644
--- a/lldb/source/Plugins/CMakeLists.txt
+++ b/lldb/source/Plugins/CMakeLists.txt
@@ -27,6 +27,10 @@ add_subdirectory(TraceExporter)
 add_subdirectory(TypeSystem)
 add_subdirectory(UnwindAssembly)
 
+if(LLDB_ENABLE_PROTOCOL_SERVERS)
+  add_subdirectory(Protocol)
+endif()
+
 set(LLDB_STRIPPED_PLUGINS)
 get_property(LLDB_ALL_PLUGINS GLOBAL PROPERTY LLDB_PLUGINS)
 
diff --git a/lldb/source/Plugins/Protocol/CMakeLists.txt b/lldb/source/Plugins/Protocol/CMakeLists.txt
new file mode 100644
index 000000000000..93b347d4cc9d
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(MCP)
diff --git a/lldb/source/Plugins/Protocol/MCP/CMakeLists.txt b/lldb/source/Plugins/Protocol/MCP/CMakeLists.txt
new file mode 100644
index 000000000000..db31a7a69cb3
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/MCP/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_lldb_library(lldbPluginProtocolServerMCP PLUGIN
+  MCPError.cpp
+  Protocol.cpp
+  ProtocolServerMCP.cpp
+  Tool.cpp
+
+  LINK_COMPONENTS
+    Support
+
+  LINK_LIBS
+    lldbHost
+    lldbUtility
+)
diff --git a/lldb/source/Plugins/Protocol/MCP/MCPError.cpp b/lldb/source/Plugins/Protocol/MCP/MCPError.cpp
new file mode 100644
index 000000000000..5ed850066b65
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/MCP/MCPError.cpp
@@ -0,0 +1,34 @@
+//===-- MCPError.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCPError.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include <system_error>
+
+namespace lldb_private::mcp {
+
+char MCPError::ID;
+
+MCPError::MCPError(std::string message, int64_t error_code)
+    : m_message(message), m_error_code(error_code) {}
+
+void MCPError::log(llvm::raw_ostream &OS) const { OS << m_message; }
+
+std::error_code MCPError::convertToErrorCode() const {
+  return llvm::inconvertibleErrorCode();
+}
+
+protocol::Error MCPError::toProtcolError() const {
+  protocol::Error error;
+  error.error.code = m_error_code;
+  error.error.message = m_message;
+  return error;
+}
+
+} // namespace lldb_private::mcp
diff --git a/lldb/source/Plugins/Protocol/MCP/MCPError.h b/lldb/source/Plugins/Protocol/MCP/MCPError.h
new file mode 100644
index 000000000000..2a76a7b087e2
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/MCP/MCPError.h
@@ -0,0 +1,33 @@
+//===-- MCPError.h --------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Protocol.h"
+#include "llvm/Support/Error.h"
+#include <string>
+
+namespace lldb_private::mcp {
+
+class MCPError : public llvm::ErrorInfo<MCPError> {
+public:
+  static char ID;
+
+  MCPError(std::string message, int64_t error_code);
+
+  void log(llvm::raw_ostream &OS) const override;
+  std::error_code convertToErrorCode() const override;
+
+  const std::string &getMessage() const { return m_message; }
+
+  protocol::Error toProtcolError() const;
+
+private:
+  std::string m_message;
+  int64_t m_error_code;
+};
+
+} // namespace lldb_private::mcp
diff --git a/lldb/source/Plugins/Protocol/MCP/Protocol.cpp b/lldb/source/Plugins/Protocol/MCP/Protocol.cpp
new file mode 100644
index 000000000000..d66c931a0b28
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/MCP/Protocol.cpp
@@ -0,0 +1,214 @@
+//===- Protocol.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Protocol.h"
+#include "llvm/Support/JSON.h"
+
+using namespace llvm;
+
+namespace lldb_private::mcp::protocol {
+
+static bool mapRaw(const json::Value &Params, StringLiteral Prop,
+                   std::optional<json::Value> &V, json::Path P) {
+  const auto *O = Params.getAsObject();
+  if (!O) {
+    P.report("expected object");
+    return false;
+  }
+  const json::Value *E = O->get(Prop);
+  if (E)
+    V = std::move(*E);
+  return true;
+}
+
+llvm::json::Value toJSON(const Request &R) {
+  json::Object Result{{"jsonrpc", "2.0"}, {"id", R.id}, {"method", R.method}};
+  if (R.params)
+    Result.insert({"params", R.params});
+  return Result;
+}
+
+bool fromJSON(const llvm::json::Value &V, Request &R, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  if (!O || !O.map("id", R.id) || !O.map("method", R.method))
+    return false;
+  return mapRaw(V, "params", R.params, P);
+}
+
+llvm::json::Value toJSON(const ErrorInfo &EI) {
+  llvm::json::Object Result{{"code", EI.code}, {"message", EI.message}};
+  if (EI.data)
+    Result.insert({"data", EI.data});
+  return Result;
+}
+
+bool fromJSON(const llvm::json::Value &V, ErrorInfo &EI, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  return O && O.map("code", EI.code) && O.map("message", EI.message) &&
+         O.mapOptional("data", EI.data);
+}
+
+llvm::json::Value toJSON(const Error &E) {
+  return json::Object{{"jsonrpc", "2.0"}, {"id", E.id}, {"error", E.error}};
+}
+
+bool fromJSON(const llvm::json::Value &V, Error &E, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  return O && O.map("id", E.id) && O.map("error", E.error);
+}
+
+llvm::json::Value toJSON(const Response &R) {
+  llvm::json::Object Result{{"jsonrpc", "2.0"}, {"id", R.id}};
+  if (R.result)
+    Result.insert({"result", R.result});
+  if (R.error)
+    Result.insert({"error", R.error});
+  return Result;
+}
+
+bool fromJSON(const llvm::json::Value &V, Response &R, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  if (!O || !O.map("id", R.id) || !O.map("error", R.error))
+    return false;
+  return mapRaw(V, "result", R.result, P);
+}
+
+llvm::json::Value toJSON(const Notification &N) {
+  llvm::json::Object Result{{"jsonrpc", "2.0"}, {"method", N.method}};
+  if (N.params)
+    Result.insert({"params", N.params});
+  return Result;
+}
+
+bool fromJSON(const llvm::json::Value &V, Notification &N, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  if (!O || !O.map("method", N.method))
+    return false;
+  auto *Obj = V.getAsObject();
+  if (!Obj)
+    return false;
+  if (auto *Params = Obj->get("params"))
+    N.params = *Params;
+  return true;
+}
+
+llvm::json::Value toJSON(const ToolCapability &TC) {
+  return llvm::json::Object{{"listChanged", TC.listChanged}};
+}
+
+bool fromJSON(const llvm::json::Value &V, ToolCapability &TC,
+              llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  return O && O.map("listChanged", TC.listChanged);
+}
+
+llvm::json::Value toJSON(const Capabilities &C) {
+  return llvm::json::Object{{"tools", C.tools}};
+}
+
+bool fromJSON(const llvm::json::Value &V, Capabilities &C, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  return O && O.map("tools", C.tools);
+}
+
+llvm::json::Value toJSON(const TextContent &TC) {
+  return llvm::json::Object{{"type", "text"}, {"text", TC.text}};
+}
+
+bool fromJSON(const llvm::json::Value &V, TextContent &TC, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  return O && O.map("text", TC.text);
+}
+
+llvm::json::Value toJSON(const TextResult &TR) {
+  return llvm::json::Object{{"content", TR.content}, {"isError", TR.isError}};
+}
+
+bool fromJSON(const llvm::json::Value &V, TextResult &TR, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  return O && O.map("content", TR.content) && O.map("isError", TR.isError);
+}
+
+llvm::json::Value toJSON(const ToolDefinition &TD) {
+  llvm::json::Object Result{{"name", TD.name}};
+  if (TD.description)
+    Result.insert({"description", TD.description});
+  if (TD.inputSchema)
+    Result.insert({"inputSchema", TD.inputSchema});
+  return Result;
+}
+
+bool fromJSON(const llvm::json::Value &V, ToolDefinition &TD,
+              llvm::json::Path P) {
+
+  llvm::json::ObjectMapper O(V, P);
+  if (!O || !O.map("name", TD.name) ||
+      !O.mapOptional("description", TD.description))
+    return false;
+  return mapRaw(V, "inputSchema", TD.inputSchema, P);
+}
+
+llvm::json::Value toJSON(const Message &M) {
+  return std::visit([](auto &M) { return toJSON(M); }, M);
+}
+
+bool fromJSON(const llvm::json::Value &V, Message &M, llvm::json::Path P) {
+  const auto *O = V.getAsObject();
+  if (!O) {
+    P.report("expected object");
+    return false;
+  }
+
+  if (const json::Value *V = O->get("jsonrpc")) {
+    if (V->getAsString().value_or("") != "2.0") {
+      P.report("unsupported JSON RPC version");
+      return false;
+    }
+  } else {
+    P.report("not a valid JSON RPC message");
+    return false;
+  }
+
+  // A message without an ID is a Notification.
+  if (!O->get("id")) {
+    protocol::Notification N;
+    if (!fromJSON(V, N, P))
+      return false;
+    M = std::move(N);
+    return true;
+  }
+
+  if (O->get("error")) {
+    protocol::Error E;
+    if (!fromJSON(V, E, P))
+      return false;
+    M = std::move(E);
+    return true;
+  }
+
+  if (O->get("result")) {
+    protocol::Response R;
+    if (!fromJSON(V, R, P))
+      return false;
+    M = std::move(R);
+    return true;
+  }
+
+  if (O->get("method")) {
+    protocol::Request R;
+    if (!fromJSON(V, R, P))
+      return false;
+    M = std::move(R);
+    return true;
+  }
+
+  P.report("unrecognized message type");
+  return false;
+}
+
+} // namespace lldb_private::mcp::protocol
diff --git a/lldb/source/Plugins/Protocol/MCP/Protocol.h b/lldb/source/Plugins/Protocol/MCP/Protocol.h
new file mode 100644
index 000000000000..e31589940657
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/MCP/Protocol.h
@@ -0,0 +1,128 @@
+//===- Protocol.h ---------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains POD structs based on the MCP specification at
+// https://github.com/modelcontextprotocol/modelcontextprotocol/blob/main/schema/2024-11-05/schema.json
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_PLUGINS_PROTOCOL_MCP_PROTOCOL_H
+#define LLDB_PLUGINS_PROTOCOL_MCP_PROTOCOL_H
+
+#include "llvm/Support/JSON.h"
+#include <optional>
+#include <string>
+#include <variant>
+
+namespace lldb_private::mcp::protocol {
+
+static llvm::StringLiteral kVersion = "2024-11-05";
+
+/// A request that expects a response.
+struct Request {
+  uint64_t id = 0;
+  std::string method;
+  std::optional<llvm::json::Value> params;
+};
+
+llvm::json::Value toJSON(const Request &);
+bool fromJSON(const llvm::json::Value &, Request &, llvm::json::Path);
+
+struct ErrorInfo {
+  int64_t code = 0;
+  std::string message;
+  std::optional<std::string> data;
+};
+
+llvm::json::Value toJSON(const ErrorInfo &);
+bool fromJSON(const llvm::json::Value &, ErrorInfo &, llvm::json::Path);
+
+struct Error {
+  uint64_t id = 0;
+  ErrorInfo error;
+};
+
+llvm::json::Value toJSON(const Error &);
+bool fromJSON(const llvm::json::Value &, Error &, llvm::json::Path);
+
+struct Response {
+  uint64_t id = 0;
+  std::optional<llvm::json::Value> result;
+  std::optional<ErrorInfo> error;
+};
+
+llvm::json::Value toJSON(const Response &);
+bool fromJSON(const llvm::json::Value &, Response &, llvm::json::Path);
+
+/// A notification which does not expect a response.
+struct Notification {
+  std::string method;
+  std::optional<llvm::json::Value> params;
+};
+
+llvm::json::Value toJSON(const Notification &);
+bool fromJSON(const llvm::json::Value &, Notification &, llvm::json::Path);
+
+struct ToolCapability {
+  /// Whether this server supports notifications for changes to the tool list.
+  bool listChanged = false;
+};
+
+llvm::json::Value toJSON(const ToolCapability &);
+bool fromJSON(const llvm::json::Value &, ToolCapability &, llvm::json::Path);
+
+/// Capabilities that a server may support. Known capabilities are defined here,
+/// in this schema, but this is not a closed set: any server can define its own,
+/// additional capabilities.
+struct Capabilities {
+  /// Present if the server offers any tools to call.
+  ToolCapability tools;
+};
+
+llvm::json::Value toJSON(const Capabilities &);
+bool fromJSON(const llvm::json::Value &, Capabilities &, llvm::json::Path);
+
+/// Text provided to or from an LLM.
+struct TextContent {
+  /// The text content of the message.
+  std::string text;
+};
+
+llvm::json::Value toJSON(const TextContent &);
+bool fromJSON(const llvm::json::Value &, TextContent &, llvm::json::Path);
+
+struct TextResult {
+  std::vector<TextContent> content;
+  bool isError = false;
+};
+
+llvm::json::Value toJSON(const TextResult &);
+bool fromJSON(const llvm::json::Value &, TextResult &, llvm::json::Path);
+
+struct ToolDefinition {
+  /// Unique identifier for the tool.
+  std::string name;
+
+  /// Human-readable description.
+  std::optional<std::string> description;
+
+  // JSON Schema for the tool's parameters.
+  std::optional<llvm::json::Value> inputSchema;
+};
+
+llvm::json::Value toJSON(const ToolDefinition &);
+bool fromJSON(const llvm::json::Value &, ToolDefinition &, llvm::json::Path);
+
+using Message = std::variant<Request, Response, Notification, Error>;
+
+bool fromJSON(const llvm::json::Value &, Message &, llvm::json::Path);
+llvm::json::Value toJSON(const Message &);
+
+} // namespace lldb_private::mcp::protocol
+
+#endif
diff --git a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp
new file mode 100644
index 000000000000..029d4a887b0c
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp
@@ -0,0 +1,327 @@
+//===- ProtocolServerMCP.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProtocolServerMCP.h"
+#include "MCPError.h"
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Threading.h"
+#include <thread>
+#include <variant>
+
+using namespace lldb_private;
+using namespace lldb_private::mcp;
+using namespace llvm;
+
+LLDB_PLUGIN_DEFINE(ProtocolServerMCP)
+
+static constexpr size_t kChunkSize = 1024;
+
+ProtocolServerMCP::ProtocolServerMCP(Debugger &debugger)
+    : ProtocolServer(), m_debugger(debugger) {
+  AddRequestHandler("initialize",
+                    std::bind(&ProtocolServerMCP::InitializeHandler, this,
+                              std::placeholders::_1));
+  AddRequestHandler("tools/list",
+                    std::bind(&ProtocolServerMCP::ToolsListHandler, this,
+                              std::placeholders::_1));
+  AddRequestHandler("tools/call",
+                    std::bind(&ProtocolServerMCP::ToolsCallHandler, this,
+                              std::placeholders::_1));
+  AddNotificationHandler(
+      "notifications/initialized", [](const protocol::Notification &) {
+        LLDB_LOG(GetLog(LLDBLog::Host), "MCP initialization complete");
+      });
+  AddTool(std::make_unique<LLDBCommandTool>(
+      "lldb_command", "Run an lldb command.", m_debugger));
+}
+
+ProtocolServerMCP::~ProtocolServerMCP() { llvm::consumeError(Stop()); }
+
+void ProtocolServerMCP::Initialize() {
+  PluginManager::RegisterPlugin(GetPluginNameStatic(),
+                                GetPluginDescriptionStatic(), CreateInstance);
+}
+
+void ProtocolServerMCP::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
+lldb::ProtocolServerSP ProtocolServerMCP::CreateInstance(Debugger &debugger) {
+  return std::make_shared<ProtocolServerMCP>(debugger);
+}
+
+llvm::StringRef ProtocolServerMCP::GetPluginDescriptionStatic() {
+  return "MCP Server.";
+}
+
+llvm::Expected<protocol::Response>
+ProtocolServerMCP::Handle(protocol::Request request) {
+  auto it = m_request_handlers.find(request.method);
+  if (it != m_request_handlers.end()) {
+    llvm::Expected<protocol::Response> response = it->second(request);
+    if (!response)
+      return response;
+    response->id = request.id;
+    return *response;
+  }
+
+  return make_error<MCPError>(
+      llvm::formatv("no handler for request: {0}", request.method).str(), 1);
+}
+
+void ProtocolServerMCP::Handle(protocol::Notification notification) {
+  auto it = m_notification_handlers.find(notification.method);
+  if (it != m_notification_handlers.end()) {
+    it->second(notification);
+    return;
+  }
+
+  LLDB_LOG(GetLog(LLDBLog::Host), "MPC notification: {0} ({1})",
+           notification.method, notification.params);
+}
+
+void ProtocolServerMCP::AcceptCallback(std::unique_ptr<Socket> socket) {
+  LLDB_LOG(GetLog(LLDBLog::Host), "New MCP client ({0}) connected",
+           m_clients.size() + 1);
+
+  lldb::IOObjectSP io_sp = std::move(socket);
+  auto client_up = std::make_unique<Client>();
+  client_up->io_sp = io_sp;
+  Client *client = client_up.get();
+
+  Status status;
+  auto read_handle_up = m_loop.RegisterReadObject(
+      io_sp,
+      [this, client](MainLoopBase &loop) {
+        if (Error error = ReadCallback(*client)) {
+          LLDB_LOG_ERROR(GetLog(LLDBLog::Host), std::move(error), "{0}");
+          client->read_handle_up.reset();
+        }
+      },
+      status);
+  if (status.Fail())
+    return;
+
+  client_up->read_handle_up = std::move(read_handle_up);
+  m_clients.emplace_back(std::move(client_up));
+}
+
+llvm::Error ProtocolServerMCP::ReadCallback(Client &client) {
+  char chunk[kChunkSize];
+  size_t bytes_read = sizeof(chunk);
+  if (Status status = client.io_sp->Read(chunk, bytes_read); status.Fail())
+    return status.takeError();
+  client.buffer.append(chunk, bytes_read);
+
+  for (std::string::size_type pos;
+       (pos = client.buffer.find('\n')) != std::string::npos;) {
+    llvm::Expected<std::optional<protocol::Message>> message =
+        HandleData(StringRef(client.buffer.data(), pos));
+    client.buffer = client.buffer.erase(0, pos + 1);
+    if (!message)
+      return message.takeError();
+
+    if (*message) {
+      std::string Output;
+      llvm::raw_string_ostream OS(Output);
+      OS << llvm::formatv("{0}", toJSON(**message)) << '\n';
+      size_t num_bytes = Output.size();
+      return client.io_sp->Write(Output.data(), num_bytes).takeError();
+    }
+  }
+
+  return llvm::Error::success();
+}
+
+llvm::Error ProtocolServerMCP::Start(ProtocolServer::Connection connection) {
+  std::lock_guard<std::mutex> guard(m_server_mutex);
+
+  if (m_running)
+    return llvm::createStringError("server already running");
+
+  Status status;
+  m_listener = Socket::Create(connection.protocol, status);
+  if (status.Fail())
+    return status.takeError();
+
+  status = m_listener->Listen(connection.name, /*backlog=*/5);
+  if (status.Fail())
+    return status.takeError();
+
+  std::string address =
+      llvm::join(m_listener->GetListeningConnectionURI(), ", ");
+  auto handles =
+      m_listener->Accept(m_loop, std::bind(&ProtocolServerMCP::AcceptCallback,
+                                           this, std::placeholders::_1));
+  if (llvm::Error error = handles.takeError())
+    return error;
+
+  m_listen_handlers = std::move(*handles);
+  m_loop_thread = std::thread([=] {
+    llvm::set_thread_name(
+        llvm::formatv("debugger-{0}.mcp.runloop", m_debugger.GetID()));
+    m_loop.Run();
+  });
+
+  return llvm::Error::success();
+}
+
+llvm::Error ProtocolServerMCP::Stop() {
+  {
+    std::lock_guard<std::mutex> guard(m_server_mutex);
+    m_running = false;
+  }
+
+  // Stop the main loop.
+  m_loop.AddPendingCallback(
+      [](MainLoopBase &loop) { loop.RequestTermination(); });
+
+  // Wait for the main loop to exit.
+  if (m_loop_thread.joinable())
+    m_loop_thread.join();
+
+  {
+    std::lock_guard<std::mutex> guard(m_server_mutex);
+    m_listener.reset();
+    m_listen_handlers.clear();
+    m_clients.clear();
+  }
+
+  return llvm::Error::success();
+}
+
+llvm::Expected<std::optional<protocol::Message>>
+ProtocolServerMCP::HandleData(llvm::StringRef data) {
+  auto message = llvm::json::parse<protocol::Message>(/*JSON=*/data);
+  if (!message)
+    return message.takeError();
+
+  if (const protocol::Request *request =
+          std::get_if<protocol::Request>(&(*message))) {
+    llvm::Expected<protocol::Response> response = Handle(*request);
+
+    // Handle failures by converting them into an Error message.
+    if (!response) {
+      protocol::Error protocol_error;
+      llvm::handleAllErrors(
+          response.takeError(),
+          [&](const MCPError &err) { protocol_error = err.toProtcolError(); },
+          [&](const llvm::ErrorInfoBase &err) {
+            protocol_error.error.code = -1;
+            protocol_error.error.message = err.message();
+          });
+      protocol_error.id = request->id;
+      return protocol_error;
+    }
+
+    return *response;
+  }
+
+  if (const protocol::Notification *notification =
+          std::get_if<protocol::Notification>(&(*message))) {
+    Handle(*notification);
+    return std::nullopt;
+  }
+
+  if (std::get_if<protocol::Error>(&(*message)))
+    return llvm::createStringError("unexpected MCP message: error");
+
+  if (std::get_if<protocol::Response>(&(*message)))
+    return llvm::createStringError("unexpected MCP message: response");
+
+  llvm_unreachable("all message types handled");
+}
+
+protocol::Capabilities ProtocolServerMCP::GetCapabilities() {
+  protocol::Capabilities capabilities;
+  capabilities.tools.listChanged = true;
+  return capabilities;
+}
+
+void ProtocolServerMCP::AddTool(std::unique_ptr<Tool> tool) {
+  std::lock_guard<std::mutex> guard(m_server_mutex);
+
+  if (!tool)
+    return;
+  m_tools[tool->GetName()] = std::move(tool);
+}
+
+void ProtocolServerMCP::AddRequestHandler(llvm::StringRef method,
+                                          RequestHandler handler) {
+  std::lock_guard<std::mutex> guard(m_server_mutex);
+  m_request_handlers[method] = std::move(handler);
+}
+
+void ProtocolServerMCP::AddNotificationHandler(llvm::StringRef method,
+                                               NotificationHandler handler) {
+  std::lock_guard<std::mutex> guard(m_server_mutex);
+  m_notification_handlers[method] = std::move(handler);
+}
+
+llvm::Expected<protocol::Response>
+ProtocolServerMCP::InitializeHandler(const protocol::Request &request) {
+  protocol::Response response;
+  response.result.emplace(llvm::json::Object{
+      {"protocolVersion", protocol::kVersion},
+      {"capabilities", GetCapabilities()},
+      {"serverInfo",
+       llvm::json::Object{{"name", kName}, {"version", kVersion}}}});
+  return response;
+}
+
+llvm::Expected<protocol::Response>
+ProtocolServerMCP::ToolsListHandler(const protocol::Request &request) {
+  protocol::Response response;
+
+  llvm::json::Array tools;
+  for (const auto &tool : m_tools)
+    tools.emplace_back(toJSON(tool.second->GetDefinition()));
+
+  response.result.emplace(llvm::json::Object{{"tools", std::move(tools)}});
+
+  return response;
+}
+
+llvm::Expected<protocol::Response>
+ProtocolServerMCP::ToolsCallHandler(const protocol::Request &request) {
+  protocol::Response response;
+
+  if (!request.params)
+    return llvm::createStringError("no tool parameters");
+
+  const json::Object *param_obj = request.params->getAsObject();
+  if (!param_obj)
+    return llvm::createStringError("no tool parameters");
+
+  const json::Value *name = param_obj->get("name");
+  if (!name)
+    return llvm::createStringError("no tool name");
+
+  llvm::StringRef tool_name = name->getAsString().value_or("");
+  if (tool_name.empty())
+    return llvm::createStringError("no tool name");
+
+  auto it = m_tools.find(tool_name);
+  if (it == m_tools.end())
+    return llvm::createStringError(llvm::formatv("no tool \"{0}\"", tool_name));
+
+  const json::Value *args = param_obj->get("arguments");
+  if (!args)
+    return llvm::createStringError("no tool arguments");
+
+  llvm::Expected<protocol::TextResult> text_result = it->second->Call(*args);
+  if (!text_result)
+    return text_result.takeError();
+
+  response.result.emplace(toJSON(*text_result));
+
+  return response;
+}
diff --git a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h
new file mode 100644
index 000000000000..52bb92a04a80
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.h
@@ -0,0 +1,100 @@
+//===- ProtocolServerMCP.h ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_PLUGINS_PROTOCOL_MCP_PROTOCOLSERVERMCP_H
+#define LLDB_PLUGINS_PROTOCOL_MCP_PROTOCOLSERVERMCP_H
+
+#include "Protocol.h"
+#include "Tool.h"
+#include "lldb/Core/ProtocolServer.h"
+#include "lldb/Host/MainLoop.h"
+#include "lldb/Host/Socket.h"
+#include "llvm/ADT/StringMap.h"
+#include <thread>
+
+namespace lldb_private::mcp {
+
+class ProtocolServerMCP : public ProtocolServer {
+public:
+  ProtocolServerMCP(Debugger &debugger);
+  virtual ~ProtocolServerMCP() override;
+
+  virtual llvm::Error Start(ProtocolServer::Connection connection) override;
+  virtual llvm::Error Stop() override;
+
+  static void Initialize();
+  static void Terminate();
+
+  static llvm::StringRef GetPluginNameStatic() { return "MCP"; }
+  static llvm::StringRef GetPluginDescriptionStatic();
+
+  static lldb::ProtocolServerSP CreateInstance(Debugger &debugger);
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
+
+  Socket *GetSocket() const override { return m_listener.get(); }
+
+protected:
+  using RequestHandler = std::function<llvm::Expected<protocol::Response>(
+      const protocol::Request &)>;
+  using NotificationHandler =
+      std::function<void(const protocol::Notification &)>;
+
+  void AddTool(std::unique_ptr<Tool> tool);
+  void AddRequestHandler(llvm::StringRef method, RequestHandler handler);
+  void AddNotificationHandler(llvm::StringRef method,
+                              NotificationHandler handler);
+
+private:
+  void AcceptCallback(std::unique_ptr<Socket> socket);
+
+  llvm::Expected<std::optional<protocol::Message>>
+  HandleData(llvm::StringRef data);
+
+  llvm::Expected<protocol::Response> Handle(protocol::Request request);
+  void Handle(protocol::Notification notification);
+
+  llvm::Expected<protocol::Response>
+  InitializeHandler(const protocol::Request &);
+  llvm::Expected<protocol::Response>
+  ToolsListHandler(const protocol::Request &);
+  llvm::Expected<protocol::Response>
+  ToolsCallHandler(const protocol::Request &);
+
+  protocol::Capabilities GetCapabilities();
+
+  llvm::StringLiteral kName = "lldb-mcp";
+  llvm::StringLiteral kVersion = "0.1.0";
+
+  Debugger &m_debugger;
+
+  bool m_running = false;
+
+  MainLoop m_loop;
+  std::thread m_loop_thread;
+
+  std::unique_ptr<Socket> m_listener;
+  std::vector<MainLoopBase::ReadHandleUP> m_listen_handlers;
+
+  struct Client {
+    lldb::IOObjectSP io_sp;
+    MainLoopBase::ReadHandleUP read_handle_up;
+    std::string buffer;
+  };
+  llvm::Error ReadCallback(Client &client);
+  std::vector<std::unique_ptr<Client>> m_clients;
+
+  std::mutex m_server_mutex;
+  llvm::StringMap<std::unique_ptr<Tool>> m_tools;
+
+  llvm::StringMap<RequestHandler> m_request_handlers;
+  llvm::StringMap<NotificationHandler> m_notification_handlers;
+};
+} // namespace lldb_private::mcp
+
+#endif
diff --git a/lldb/source/Plugins/Protocol/MCP/Tool.cpp b/lldb/source/Plugins/Protocol/MCP/Tool.cpp
new file mode 100644
index 000000000000..de8fcc8f3cb4
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/MCP/Tool.cpp
@@ -0,0 +1,81 @@
+//===- Tool.cpp -----------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Tool.h"
+#include "lldb/Interpreter/CommandInterpreter.h"
+#include "lldb/Interpreter/CommandReturnObject.h"
+
+using namespace lldb_private::mcp;
+using namespace llvm;
+
+struct LLDBCommandToolArguments {
+  std::string arguments;
+};
+
+bool fromJSON(const llvm::json::Value &V, LLDBCommandToolArguments &A,
+              llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  return O && O.map("arguments", A.arguments);
+}
+
+Tool::Tool(std::string name, std::string description)
+    : m_name(std::move(name)), m_description(std::move(description)) {}
+
+protocol::ToolDefinition Tool::GetDefinition() const {
+  protocol::ToolDefinition definition;
+  definition.name = m_name;
+  definition.description.emplace(m_description);
+
+  if (std::optional<llvm::json::Value> input_schema = GetSchema())
+    definition.inputSchema = *input_schema;
+
+  return definition;
+}
+
+LLDBCommandTool::LLDBCommandTool(std::string name, std::string description,
+                                 Debugger &debugger)
+    : Tool(std::move(name), std::move(description)), m_debugger(debugger) {}
+
+llvm::Expected<protocol::TextResult>
+LLDBCommandTool::Call(const llvm::json::Value &args) {
+  llvm::json::Path::Root root;
+
+  LLDBCommandToolArguments arguments;
+  if (!fromJSON(args, arguments, root))
+    return root.getError();
+
+  // FIXME: Disallow certain commands and their aliases.
+  CommandReturnObject result(/*colors=*/false);
+  m_debugger.GetCommandInterpreter().HandleCommand(arguments.arguments.c_str(),
+                                                   eLazyBoolYes, result);
+
+  std::string output;
+  llvm::StringRef output_str = result.GetOutputString();
+  if (!output_str.empty())
+    output += output_str.str();
+
+  std::string err_str = result.GetErrorString();
+  if (!err_str.empty()) {
+    if (!output.empty())
+      output += '\n';
+    output += err_str;
+  }
+
+  mcp::protocol::TextResult text_result;
+  text_result.content.emplace_back(mcp::protocol::TextContent{{output}});
+  text_result.isError = !result.Succeeded();
+  return text_result;
+}
+
+std::optional<llvm::json::Value> LLDBCommandTool::GetSchema() const {
+  llvm::json::Object str_type{{"type", "string"}};
+  llvm::json::Object properties{{"arguments", std::move(str_type)}};
+  llvm::json::Object schema{{"type", "object"},
+                            {"properties", std::move(properties)}};
+  return schema;
+}
diff --git a/lldb/source/Plugins/Protocol/MCP/Tool.h b/lldb/source/Plugins/Protocol/MCP/Tool.h
new file mode 100644
index 000000000000..57a5125813b7
--- /dev/null
+++ b/lldb/source/Plugins/Protocol/MCP/Tool.h
@@ -0,0 +1,56 @@
+//===- Tool.h -------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_PLUGINS_PROTOCOL_MCP_TOOL_H
+#define LLDB_PLUGINS_PROTOCOL_MCP_TOOL_H
+
+#include "Protocol.h"
+#include "lldb/Core/Debugger.h"
+#include "llvm/Support/JSON.h"
+#include <string>
+
+namespace lldb_private::mcp {
+
+class Tool {
+public:
+  Tool(std::string name, std::string description);
+  virtual ~Tool() = default;
+
+  virtual llvm::Expected<protocol::TextResult>
+  Call(const llvm::json::Value &args) = 0;
+
+  virtual std::optional<llvm::json::Value> GetSchema() const {
+    return std::nullopt;
+  }
+
+  protocol::ToolDefinition GetDefinition() const;
+
+  const std::string &GetName() { return m_name; }
+
+private:
+  std::string m_name;
+  std::string m_description;
+};
+
+class LLDBCommandTool : public mcp::Tool {
+public:
+  LLDBCommandTool(std::string name, std::string description,
+                  Debugger &debugger);
+  ~LLDBCommandTool() = default;
+
+  virtual llvm::Expected<protocol::TextResult>
+  Call(const llvm::json::Value &args) override;
+
+  virtual std::optional<llvm::json::Value> GetSchema() const override;
+
+private:
+  Debugger &m_debugger;
+};
+} // namespace lldb_private::mcp
+
+#endif
diff --git a/lldb/unittests/CMakeLists.txt b/lldb/unittests/CMakeLists.txt
index 6eaaa4f4c8c9..b48b9bafe3bc 100644
--- a/lldb/unittests/CMakeLists.txt
+++ b/lldb/unittests/CMakeLists.txt
@@ -78,6 +78,10 @@ add_subdirectory(Utility)
 add_subdirectory(Thread)
 add_subdirectory(ValueObject)
 
+if(LLDB_ENABLE_PROTOCOL_SERVERS)
+  add_subdirectory(Protocol)
+endif()
+
 if(LLDB_CAN_USE_DEBUGSERVER AND LLDB_TOOL_DEBUGSERVER_BUILD AND NOT LLDB_USE_SYSTEM_DEBUGSERVER)
   add_subdirectory(debugserver)
 endif()
diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp
index 9c93eb8c94b0..085348ffc519 100644
--- a/lldb/unittests/DAP/ProtocolTypesTest.cpp
+++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp
@@ -9,6 +9,7 @@
 #include "Protocol/ProtocolTypes.h"
 #include "Protocol/ProtocolEvents.h"
 #include "Protocol/ProtocolRequests.h"
+#include "TestingSupport/TestUtilities.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/JSON.h"
 #include "llvm/Testing/Support/Error.h"
@@ -20,6 +21,7 @@ using namespace llvm;
 using namespace lldb;
 using namespace lldb_dap;
 using namespace lldb_dap::protocol;
+using lldb_private::roundtripJSON;
 using llvm::json::parse;
 using llvm::json::Value;
 
@@ -28,15 +30,6 @@ static std::string pp(const json::Value &E) {
   return formatv("{0:2}", E).str();
 }
 
-template <typename T> static llvm::Expected<T> roundtrip(const T &input) {
-  llvm::json::Value value = toJSON(input);
-  llvm::json::Path::Root root;
-  T output;
-  if (!fromJSON(value, output, root))
-    return root.getError();
-  return output;
-}
-
 TEST(ProtocolTypesTest, ExceptionBreakpointsFilter) {
   ExceptionBreakpointsFilter filter;
   filter.filter = "testFilter";
@@ -47,7 +40,7 @@ TEST(ProtocolTypesTest, ExceptionBreakpointsFilter) {
   filter.conditionDescription = "Condition for test filter";
 
   llvm::Expected<ExceptionBreakpointsFilter> deserialized_filter =
-      roundtrip(filter);
+      roundtripJSON(filter);
   ASSERT_THAT_EXPECTED(deserialized_filter, llvm::Succeeded());
 
   EXPECT_EQ(filter.filter, deserialized_filter->filter);
@@ -66,7 +59,7 @@ TEST(ProtocolTypesTest, Source) {
   source.sourceReference = 12345;
   source.presentationHint = Source::eSourcePresentationHintEmphasize;
 
-  llvm::Expected<Source> deserialized_source = roundtrip(source);
+  llvm::Expected<Source> deserialized_source = roundtripJSON(source);
   ASSERT_THAT_EXPECTED(deserialized_source, llvm::Succeeded());
 
   EXPECT_EQ(source.name, deserialized_source->name);
@@ -83,7 +76,7 @@ TEST(ProtocolTypesTest, ColumnDescriptor) {
   column.type = eColumnTypeString;
   column.width = 20;
 
-  llvm::Expected<ColumnDescriptor> deserialized_column = roundtrip(column);
+  llvm::Expected<ColumnDescriptor> deserialized_column = roundtripJSON(column);
   ASSERT_THAT_EXPECTED(deserialized_column, llvm::Succeeded());
 
   EXPECT_EQ(column.attributeName, deserialized_column->attributeName);
@@ -101,7 +94,7 @@ TEST(ProtocolTypesTest, BreakpointMode) {
   mode.appliesTo = {eBreakpointModeApplicabilitySource,
                     eBreakpointModeApplicabilityException};
 
-  llvm::Expected<BreakpointMode> deserialized_mode = roundtrip(mode);
+  llvm::Expected<BreakpointMode> deserialized_mode = roundtripJSON(mode);
   ASSERT_THAT_EXPECTED(deserialized_mode, llvm::Succeeded());
 
   EXPECT_EQ(mode.mode, deserialized_mode->mode);
@@ -125,7 +118,8 @@ TEST(ProtocolTypesTest, Breakpoint) {
   breakpoint.offset = 4;
   breakpoint.reason = BreakpointReason::eBreakpointReasonPending;
 
-  llvm::Expected<Breakpoint> deserialized_breakpoint = roundtrip(breakpoint);
+  llvm::Expected<Breakpoint> deserialized_breakpoint =
+      roundtripJSON(breakpoint);
   ASSERT_THAT_EXPECTED(deserialized_breakpoint, llvm::Succeeded());
 
   EXPECT_EQ(breakpoint.id, deserialized_breakpoint->id);
@@ -157,7 +151,7 @@ TEST(ProtocolTypesTest, SourceBreakpoint) {
   source_breakpoint.mode = "hardware";
 
   llvm::Expected<SourceBreakpoint> deserialized_source_breakpoint =
-      roundtrip(source_breakpoint);
+      roundtripJSON(source_breakpoint);
   ASSERT_THAT_EXPECTED(deserialized_source_breakpoint, llvm::Succeeded());
 
   EXPECT_EQ(source_breakpoint.line, deserialized_source_breakpoint->line);
@@ -178,7 +172,7 @@ TEST(ProtocolTypesTest, FunctionBreakpoint) {
   function_breakpoint.hitCondition = "3";
 
   llvm::Expected<FunctionBreakpoint> deserialized_function_breakpoint =
-      roundtrip(function_breakpoint);
+      roundtripJSON(function_breakpoint);
   ASSERT_THAT_EXPECTED(deserialized_function_breakpoint, llvm::Succeeded());
 
   EXPECT_EQ(function_breakpoint.name, deserialized_function_breakpoint->name);
@@ -196,7 +190,7 @@ TEST(ProtocolTypesTest, DataBreakpoint) {
   data_breakpoint_info.hitCondition = "10";
 
   llvm::Expected<DataBreakpoint> deserialized_data_breakpoint_info =
-      roundtrip(data_breakpoint_info);
+      roundtripJSON(data_breakpoint_info);
   ASSERT_THAT_EXPECTED(deserialized_data_breakpoint_info, llvm::Succeeded());
 
   EXPECT_EQ(data_breakpoint_info.dataId,
@@ -233,9 +227,9 @@ TEST(ProtocolTypesTest, Capabilities) {
                                    {eBreakpointModeApplicabilitySource}}};
   capabilities.lldbExtVersion = "1.0.0";
 
-  // Perform roundtrip serialization and deserialization.
+  // Perform roundtripJSON serialization and deserialization.
   llvm::Expected<Capabilities> deserialized_capabilities =
-      roundtrip(capabilities);
+      roundtripJSON(capabilities);
   ASSERT_THAT_EXPECTED(deserialized_capabilities, llvm::Succeeded());
 
   // Verify supported features.
@@ -316,7 +310,7 @@ TEST(ProtocolTypesTest, Scope) {
   source.presentationHint = Source::eSourcePresentationHintNormal;
   scope.source = source;
 
-  llvm::Expected<Scope> deserialized_scope = roundtrip(scope);
+  llvm::Expected<Scope> deserialized_scope = roundtripJSON(scope);
   ASSERT_THAT_EXPECTED(deserialized_scope, llvm::Succeeded());
   EXPECT_EQ(scope.name, deserialized_scope->name);
   EXPECT_EQ(scope.presentationHint, deserialized_scope->presentationHint);
@@ -755,7 +749,7 @@ TEST(ProtocolTypesTest, StepInTarget) {
   target.endLine = 32;
   target.endColumn = 23;
 
-  llvm::Expected<StepInTarget> deserialized_target = roundtrip(target);
+  llvm::Expected<StepInTarget> deserialized_target = roundtripJSON(target);
   ASSERT_THAT_EXPECTED(deserialized_target, llvm::Succeeded());
 
   EXPECT_EQ(target.id, deserialized_target->id);
@@ -797,4 +791,4 @@ TEST(ProtocolTypesTest, ReadMemoryResponseBody) {
       R"({ "address": "0xdeadbeef", "data": "aGVsbG8gd29ybGQh", "unreadableBytes": 1})");
   ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
   EXPECT_EQ(pp(*expected), pp(response));
-}
\ No newline at end of file
+}
diff --git a/lldb/unittests/Protocol/CMakeLists.txt b/lldb/unittests/Protocol/CMakeLists.txt
new file mode 100644
index 000000000000..801662b0544d
--- /dev/null
+++ b/lldb/unittests/Protocol/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_lldb_unittest(ProtocolTests
+  ProtocolMCPTest.cpp
+  ProtocolMCPServerTest.cpp
+
+  LINK_LIBS
+    lldbCore
+    lldbUtility
+    lldbHost
+    lldbPluginPlatformMacOSX
+    lldbPluginProtocolServerMCP
+    LLVMTestingSupport
+  )
diff --git a/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp b/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp
new file mode 100644
index 000000000000..72b8c7b1fd82
--- /dev/null
+++ b/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp
@@ -0,0 +1,291 @@
+//===-- ProtocolServerMCPTest.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Plugins/Platform/MacOSX/PlatformRemoteMacOSX.h"
+#include "Plugins/Protocol/MCP/ProtocolServerMCP.h"
+#include "TestingSupport/Host/SocketTestUtilities.h"
+#include "TestingSupport/SubsystemRAII.h"
+#include "lldb/Core/ProtocolServer.h"
+#include "lldb/Host/FileSystem.h"
+#include "lldb/Host/HostInfo.h"
+#include "lldb/Host/JSONTransport.h"
+#include "lldb/Host/Socket.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace lldb;
+using namespace lldb_private;
+using namespace lldb_private::mcp::protocol;
+
+namespace {
+class TestProtocolServerMCP : public lldb_private::mcp::ProtocolServerMCP {
+public:
+  using ProtocolServerMCP::AddNotificationHandler;
+  using ProtocolServerMCP::AddRequestHandler;
+  using ProtocolServerMCP::AddTool;
+  using ProtocolServerMCP::GetSocket;
+  using ProtocolServerMCP::ProtocolServerMCP;
+};
+
+class TestJSONTransport : public lldb_private::JSONRPCTransport {
+public:
+  using JSONRPCTransport::JSONRPCTransport;
+  using JSONRPCTransport::ReadImpl;
+  using JSONRPCTransport::WriteImpl;
+};
+
+/// Test tool that returns it argument as text.
+class TestTool : public mcp::Tool {
+public:
+  using mcp::Tool::Tool;
+
+  virtual llvm::Expected<mcp::protocol::TextResult>
+  Call(const llvm::json::Value &args) override {
+    std::string argument;
+    if (const json::Object *args_obj = args.getAsObject()) {
+      if (const json::Value *s = args_obj->get("arguments")) {
+        argument = s->getAsString().value_or("");
+      }
+    }
+
+    mcp::protocol::TextResult text_result;
+    text_result.content.emplace_back(mcp::protocol::TextContent{{argument}});
+    return text_result;
+  }
+};
+
+/// Test tool that returns an error.
+class ErrorTool : public mcp::Tool {
+public:
+  using mcp::Tool::Tool;
+
+  virtual llvm::Expected<mcp::protocol::TextResult>
+  Call(const llvm::json::Value &args) override {
+    return llvm::createStringError("error");
+  }
+};
+
+/// Test tool that fails but doesn't return an error.
+class FailTool : public mcp::Tool {
+public:
+  using mcp::Tool::Tool;
+
+  virtual llvm::Expected<mcp::protocol::TextResult>
+  Call(const llvm::json::Value &args) override {
+    mcp::protocol::TextResult text_result;
+    text_result.content.emplace_back(mcp::protocol::TextContent{{"failed"}});
+    text_result.isError = true;
+    return text_result;
+  }
+};
+
+class ProtocolServerMCPTest : public ::testing::Test {
+public:
+  SubsystemRAII<FileSystem, HostInfo, PlatformRemoteMacOSX, Socket> subsystems;
+  DebuggerSP m_debugger_sp;
+
+  lldb::IOObjectSP m_io_sp;
+  std::unique_ptr<TestJSONTransport> m_transport_up;
+  std::unique_ptr<TestProtocolServerMCP> m_server_up;
+
+  static constexpr llvm::StringLiteral k_localhost = "localhost";
+
+  llvm::Error Write(llvm::StringRef message) {
+    return m_transport_up->WriteImpl(llvm::formatv("{0}\n", message).str());
+  }
+
+  llvm::Expected<std::string> Read() {
+    return m_transport_up->ReadImpl(std::chrono::milliseconds(100));
+  }
+
+  void SetUp() {
+    // Create a debugger.
+    ArchSpec arch("arm64-apple-macosx-");
+    Platform::SetHostPlatform(
+        PlatformRemoteMacOSX::CreateInstance(true, &arch));
+    m_debugger_sp = Debugger::CreateInstance();
+
+    // Create & start the server.
+    ProtocolServer::Connection connection;
+    connection.protocol = Socket::SocketProtocol::ProtocolTcp;
+    connection.name = llvm::formatv("{0}:0", k_localhost).str();
+    m_server_up = std::make_unique<TestProtocolServerMCP>(*m_debugger_sp);
+    m_server_up->AddTool(std::make_unique<TestTool>("test", "test tool"));
+    ASSERT_THAT_ERROR(m_server_up->Start(connection), llvm::Succeeded());
+
+    // Connect to the server over a TCP socket.
+    auto connect_socket_up = std::make_unique<TCPSocket>(true);
+    ASSERT_THAT_ERROR(connect_socket_up
+                          ->Connect(llvm::formatv("{0}:{1}", k_localhost,
+                                                  static_cast<TCPSocket *>(
+                                                      m_server_up->GetSocket())
+                                                      ->GetLocalPortNumber())
+                                        .str())
+                          .ToError(),
+                      llvm::Succeeded());
+
+    // Set up JSON transport for the client.
+    m_io_sp = std::move(connect_socket_up);
+    m_transport_up = std::make_unique<TestJSONTransport>(m_io_sp, m_io_sp);
+  }
+
+  void TearDown() {
+    // Stop the server.
+    ASSERT_THAT_ERROR(m_server_up->Stop(), llvm::Succeeded());
+  }
+};
+
+} // namespace
+
+TEST_F(ProtocolServerMCPTest, Intialization) {
+  llvm::StringLiteral request =
+      R"json({"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"claude-ai","version":"0.1.0"}},"jsonrpc":"2.0","id":0})json";
+  llvm::StringLiteral response =
+      R"json({"jsonrpc":"2.0","id":0,"result":{"capabilities":{"tools":{"listChanged":true}},"protocolVersion":"2024-11-05","serverInfo":{"name":"lldb-mcp","version":"0.1.0"}}})json";
+
+  ASSERT_THAT_ERROR(Write(request), llvm::Succeeded());
+
+  llvm::Expected<std::string> response_str = Read();
+  ASSERT_THAT_EXPECTED(response_str, llvm::Succeeded());
+
+  llvm::Expected<json::Value> response_json = json::parse(*response_str);
+  ASSERT_THAT_EXPECTED(response_json, llvm::Succeeded());
+
+  llvm::Expected<json::Value> expected_json = json::parse(response);
+  ASSERT_THAT_EXPECTED(expected_json, llvm::Succeeded());
+
+  EXPECT_EQ(*response_json, *expected_json);
+}
+
+TEST_F(ProtocolServerMCPTest, ToolsList) {
+  llvm::StringLiteral request =
+      R"json({"method":"tools/list","params":{},"jsonrpc":"2.0","id":1})json";
+  llvm::StringLiteral response =
+      R"json({"id":1,"jsonrpc":"2.0","result":{"tools":[{"description":"test tool","name":"test"},{"description":"Run an lldb command.","inputSchema":{"properties":{"arguments":{"type":"string"}},"type":"object"},"name":"lldb_command"}]}})json";
+
+  ASSERT_THAT_ERROR(Write(request), llvm::Succeeded());
+
+  llvm::Expected<std::string> response_str = Read();
+  ASSERT_THAT_EXPECTED(response_str, llvm::Succeeded());
+
+  llvm::Expected<json::Value> response_json = json::parse(*response_str);
+  ASSERT_THAT_EXPECTED(response_json, llvm::Succeeded());
+
+  llvm::Expected<json::Value> expected_json = json::parse(response);
+  ASSERT_THAT_EXPECTED(expected_json, llvm::Succeeded());
+
+  EXPECT_EQ(*response_json, *expected_json);
+}
+
+TEST_F(ProtocolServerMCPTest, ResourcesList) {
+  llvm::StringLiteral request =
+      R"json({"method":"resources/list","params":{},"jsonrpc":"2.0","id":2})json";
+  llvm::StringLiteral response =
+      R"json({"error":{"code":1,"message":"no handler for request: resources/list"},"id":2,"jsonrpc":"2.0"})json";
+
+  ASSERT_THAT_ERROR(Write(request), llvm::Succeeded());
+
+  llvm::Expected<std::string> response_str = Read();
+  ASSERT_THAT_EXPECTED(response_str, llvm::Succeeded());
+
+  llvm::Expected<json::Value> response_json = json::parse(*response_str);
+  ASSERT_THAT_EXPECTED(response_json, llvm::Succeeded());
+
+  llvm::Expected<json::Value> expected_json = json::parse(response);
+  ASSERT_THAT_EXPECTED(expected_json, llvm::Succeeded());
+
+  EXPECT_EQ(*response_json, *expected_json);
+}
+
+TEST_F(ProtocolServerMCPTest, ToolsCall) {
+  llvm::StringLiteral request =
+      R"json({"method":"tools/call","params":{"name":"test","arguments":{"arguments":"foo"}},"jsonrpc":"2.0","id":11})json";
+  llvm::StringLiteral response =
+      R"json({"id":11,"jsonrpc":"2.0","result":{"content":[{"text":"foo","type":"text"}],"isError":false}})json";
+
+  ASSERT_THAT_ERROR(Write(request), llvm::Succeeded());
+
+  llvm::Expected<std::string> response_str = Read();
+  ASSERT_THAT_EXPECTED(response_str, llvm::Succeeded());
+
+  llvm::Expected<json::Value> response_json = json::parse(*response_str);
+  ASSERT_THAT_EXPECTED(response_json, llvm::Succeeded());
+
+  llvm::Expected<json::Value> expected_json = json::parse(response);
+  ASSERT_THAT_EXPECTED(expected_json, llvm::Succeeded());
+
+  EXPECT_EQ(*response_json, *expected_json);
+}
+
+TEST_F(ProtocolServerMCPTest, ToolsCallError) {
+  m_server_up->AddTool(std::make_unique<ErrorTool>("error", "error tool"));
+
+  llvm::StringLiteral request =
+      R"json({"method":"tools/call","params":{"name":"error","arguments":{"arguments":"foo"}},"jsonrpc":"2.0","id":11})json";
+  llvm::StringLiteral response =
+      R"json({"error":{"code":-1,"message":"error"},"id":11,"jsonrpc":"2.0"})json";
+
+  ASSERT_THAT_ERROR(Write(request), llvm::Succeeded());
+
+  llvm::Expected<std::string> response_str = Read();
+  ASSERT_THAT_EXPECTED(response_str, llvm::Succeeded());
+
+  llvm::Expected<json::Value> response_json = json::parse(*response_str);
+  ASSERT_THAT_EXPECTED(response_json, llvm::Succeeded());
+
+  llvm::Expected<json::Value> expected_json = json::parse(response);
+  ASSERT_THAT_EXPECTED(expected_json, llvm::Succeeded());
+
+  EXPECT_EQ(*response_json, *expected_json);
+}
+
+TEST_F(ProtocolServerMCPTest, ToolsCallFail) {
+  m_server_up->AddTool(std::make_unique<FailTool>("fail", "fail tool"));
+
+  llvm::StringLiteral request =
+      R"json({"method":"tools/call","params":{"name":"fail","arguments":{"arguments":"foo"}},"jsonrpc":"2.0","id":11})json";
+  llvm::StringLiteral response =
+      R"json({"id":11,"jsonrpc":"2.0","result":{"content":[{"text":"failed","type":"text"}],"isError":true}})json";
+
+  ASSERT_THAT_ERROR(Write(request), llvm::Succeeded());
+
+  llvm::Expected<std::string> response_str = Read();
+  ASSERT_THAT_EXPECTED(response_str, llvm::Succeeded());
+
+  llvm::Expected<json::Value> response_json = json::parse(*response_str);
+  ASSERT_THAT_EXPECTED(response_json, llvm::Succeeded());
+
+  llvm::Expected<json::Value> expected_json = json::parse(response);
+  ASSERT_THAT_EXPECTED(expected_json, llvm::Succeeded());
+
+  EXPECT_EQ(*response_json, *expected_json);
+}
+
+TEST_F(ProtocolServerMCPTest, NotificationInitialized) {
+  bool handler_called = false;
+  std::condition_variable cv;
+  std::mutex mutex;
+
+  m_server_up->AddNotificationHandler(
+      "notifications/initialized",
+      [&](const mcp::protocol::Notification &notification) {
+        {
+          std::lock_guard<std::mutex> lock(mutex);
+          handler_called = true;
+        }
+        cv.notify_all();
+      });
+  llvm::StringLiteral request =
+      R"json({"method":"notifications/initialized","jsonrpc":"2.0"})json";
+
+  ASSERT_THAT_ERROR(Write(request), llvm::Succeeded());
+
+  std::unique_lock<std::mutex> lock(mutex);
+  cv.wait(lock, [&] { return handler_called; });
+}
diff --git a/lldb/unittests/Protocol/ProtocolMCPTest.cpp b/lldb/unittests/Protocol/ProtocolMCPTest.cpp
new file mode 100644
index 000000000000..00959f3ce20b
--- /dev/null
+++ b/lldb/unittests/Protocol/ProtocolMCPTest.cpp
@@ -0,0 +1,135 @@
+//===-- ProtocolMCPTest.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Plugins/Protocol/MCP/Protocol.h"
+#include "TestingSupport/TestUtilities.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+using namespace lldb;
+using namespace lldb_private;
+using namespace lldb_private::mcp::protocol;
+
+TEST(ProtocolMCPTest, Request) {
+  Request request;
+  request.id = 1;
+  request.method = "foo";
+  request.params = llvm::json::Object{{"key", "value"}};
+
+  llvm::Expected<Request> deserialized_request = roundtripJSON(request);
+  ASSERT_THAT_EXPECTED(deserialized_request, llvm::Succeeded());
+
+  EXPECT_EQ(request.id, deserialized_request->id);
+  EXPECT_EQ(request.method, deserialized_request->method);
+  EXPECT_EQ(request.params, deserialized_request->params);
+}
+
+TEST(ProtocolMCPTest, Response) {
+  Response response;
+  response.id = 1;
+  response.result = llvm::json::Object{{"key", "value"}};
+
+  llvm::Expected<Response> deserialized_response = roundtripJSON(response);
+  ASSERT_THAT_EXPECTED(deserialized_response, llvm::Succeeded());
+
+  EXPECT_EQ(response.id, deserialized_response->id);
+  EXPECT_EQ(response.result, deserialized_response->result);
+}
+
+TEST(ProtocolMCPTest, Notification) {
+  Notification notification;
+  notification.method = "notifyMethod";
+  notification.params = llvm::json::Object{{"key", "value"}};
+
+  llvm::Expected<Notification> deserialized_notification =
+      roundtripJSON(notification);
+  ASSERT_THAT_EXPECTED(deserialized_notification, llvm::Succeeded());
+
+  EXPECT_EQ(notification.method, deserialized_notification->method);
+  EXPECT_EQ(notification.params, deserialized_notification->params);
+}
+
+TEST(ProtocolMCPTest, ToolCapability) {
+  ToolCapability tool_capability;
+  tool_capability.listChanged = true;
+
+  llvm::Expected<ToolCapability> deserialized_tool_capability =
+      roundtripJSON(tool_capability);
+  ASSERT_THAT_EXPECTED(deserialized_tool_capability, llvm::Succeeded());
+
+  EXPECT_EQ(tool_capability.listChanged,
+            deserialized_tool_capability->listChanged);
+}
+
+TEST(ProtocolMCPTest, Capabilities) {
+  ToolCapability tool_capability;
+  tool_capability.listChanged = true;
+
+  Capabilities capabilities;
+  capabilities.tools = tool_capability;
+
+  llvm::Expected<Capabilities> deserialized_capabilities =
+      roundtripJSON(capabilities);
+  ASSERT_THAT_EXPECTED(deserialized_capabilities, llvm::Succeeded());
+
+  EXPECT_EQ(capabilities.tools.listChanged,
+            deserialized_capabilities->tools.listChanged);
+}
+
+TEST(ProtocolMCPTest, TextContent) {
+  TextContent text_content;
+  text_content.text = "Sample text";
+
+  llvm::Expected<TextContent> deserialized_text_content =
+      roundtripJSON(text_content);
+  ASSERT_THAT_EXPECTED(deserialized_text_content, llvm::Succeeded());
+
+  EXPECT_EQ(text_content.text, deserialized_text_content->text);
+}
+
+TEST(ProtocolMCPTest, TextResult) {
+  TextContent text_content1;
+  text_content1.text = "Text 1";
+
+  TextContent text_content2;
+  text_content2.text = "Text 2";
+
+  TextResult text_result;
+  text_result.content = {text_content1, text_content2};
+  text_result.isError = true;
+
+  llvm::Expected<TextResult> deserialized_text_result =
+      roundtripJSON(text_result);
+  ASSERT_THAT_EXPECTED(deserialized_text_result, llvm::Succeeded());
+
+  EXPECT_EQ(text_result.isError, deserialized_text_result->isError);
+  ASSERT_EQ(text_result.content.size(),
+            deserialized_text_result->content.size());
+  EXPECT_EQ(text_result.content[0].text,
+            deserialized_text_result->content[0].text);
+  EXPECT_EQ(text_result.content[1].text,
+            deserialized_text_result->content[1].text);
+}
+
+TEST(ProtocolMCPTest, ToolDefinition) {
+  ToolDefinition tool_definition;
+  tool_definition.name = "ToolName";
+  tool_definition.description = "Tool Description";
+  tool_definition.inputSchema =
+      llvm::json::Object{{"schemaKey", "schemaValue"}};
+
+  llvm::Expected<ToolDefinition> deserialized_tool_definition =
+      roundtripJSON(tool_definition);
+  ASSERT_THAT_EXPECTED(deserialized_tool_definition, llvm::Succeeded());
+
+  EXPECT_EQ(tool_definition.name, deserialized_tool_definition->name);
+  EXPECT_EQ(tool_definition.description,
+            deserialized_tool_definition->description);
+  EXPECT_EQ(tool_definition.inputSchema,
+            deserialized_tool_definition->inputSchema);
+}
diff --git a/lldb/unittests/TestingSupport/TestUtilities.h b/lldb/unittests/TestingSupport/TestUtilities.h
index 65994384059f..db62881872fe 100644
--- a/lldb/unittests/TestingSupport/TestUtilities.h
+++ b/lldb/unittests/TestingSupport/TestUtilities.h
@@ -59,6 +59,15 @@ private:
 
   std::string Buffer;
 };
+
+template <typename T> static llvm::Expected<T> roundtripJSON(const T &input) {
+  llvm::json::Value value = toJSON(input);
+  llvm::json::Path::Root root;
+  T output;
+  if (!fromJSON(value, output, root))
+    return root.getError();
+  return output;
+}
 } // namespace lldb_private
 
 #endif

From 78971916da04895838ff043f4fc71760dcca5bac Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Fri, 20 Jun 2025 15:48:30 +0000
Subject: [PATCH 1067/1322] [gn build] Port 9524bfb27020

---
 llvm/utils/gn/secondary/lldb/source/Commands/BUILD.gn | 1 +
 llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/llvm/utils/gn/secondary/lldb/source/Commands/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Commands/BUILD.gn
index 1fc119c07244..f92438977045 100644
--- a/llvm/utils/gn/secondary/lldb/source/Commands/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Commands/BUILD.gn
@@ -50,6 +50,7 @@ static_library("Commands") {
     "CommandObjectPlatform.cpp",
     "CommandObjectPlugin.cpp",
     "CommandObjectProcess.cpp",
+    "CommandObjectProtocolServer.cpp",
     "CommandObjectQuit.cpp",
     "CommandObjectRegexCommand.cpp",
     "CommandObjectRegister.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn
index 36d6f270d7e5..5ae699a33ea4 100644
--- a/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn
@@ -74,6 +74,7 @@ static_library("Core") {
     "Opcode.cpp",
     "PluginManager.cpp",
     "Progress.cpp",
+    "ProtocolServer.cpp",
     "RichManglingContext.cpp",
     "SearchFilter.cpp",
     "Section.cpp",

From 6e86b7e34b9494a01bf7164825c3d72ff21a4c7f Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Fri, 20 Jun 2025 16:52:43 +0100
Subject: [PATCH 1068/1322] [AMDGPU] Do not replace SALU floating point
 multiply with VALU-only ldexp (#145048)

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  6 ++
 .../AMDGPU/pseudo-scalar-transcendental.ll    | 92 +++++++++++--------
 2 files changed, 62 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 07d79d677104..3281eabcd4ad 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15189,6 +15189,12 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
   EVT ScalarVT = VT.getScalarType();
   EVT IntVT = VT.changeElementType(MVT::i32);
 
+  if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
+      (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
+    // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
+    return SDValue();
+  }
+
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index 733fe9317ddc..a2e7f2e62f5d 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -2,19 +2,34 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
+; TODO: GlobalISel should avoid generating v_ldexp_f32.
 define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
-; GFX12-LABEL: v_s_exp_f32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
-; GFX12-NEXT:    s_cselect_b32 s1, 0x42800000, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT:    s_add_f32 s0, s0, s1
-; GFX12-NEXT:    s_cselect_b32 s1, 0xffffffc0, 0
-; GFX12-NEXT:    v_s_exp_f32 s0, s0
-; GFX12-NEXT:    s_wait_alu 0xf1ff
-; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
-; GFX12-NEXT:    v_ldexp_f32 v0, s0, s1
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: v_s_exp_f32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x42800000, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_add_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x1f800000, 1.0
+; GFX12-SDAG-NEXT:    v_s_exp_f32 s0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: v_s_exp_f32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42800000, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT:    s_add_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0xffffffc0, 0
+; GFX12-GISEL-NEXT:    v_s_exp_f32 s0, s0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_ldexp_f32 v0, s0, s1
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %result = call float @llvm.exp2.f32(float %src)
   ret float %result
 }
@@ -59,14 +74,15 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) {
 ; GFX12-SDAG-LABEL: v_s_log_f32:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 32, 0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_ldexp_f32 v0, s0, s1
-; GFX12-SDAG-NEXT:    s_cselect_b32 s0, 0x42000000, 0
-; GFX12-SDAG-NEXT:    v_log_f32_e32 v0, v0
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x42000000, 0
+; GFX12-SDAG-NEXT:    v_s_log_f32 s0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_sub_f32 s0, s0, s1
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
-; GFX12-SDAG-NEXT:    v_subrev_f32_e32 v0, s0, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: v_s_log_f32:
@@ -147,7 +163,7 @@ define amdgpu_cs half @v_s_rcp_f16(half inreg %src) {
   ret half %result
 }
 
-; TODO-GFX12: GlobalISel should generate v_s_rsq.
+; TODO: GlobalISel should generate v_s_rsq.
 define amdgpu_cs float @v_s_rsq_f32(float inreg %src) {
 ; GFX12-SDAG-LABEL: v_s_rsq_f32:
 ; GFX12-SDAG:       ; %bb.0:
@@ -184,7 +200,7 @@ define amdgpu_cs half @v_s_rsq_f16(half inreg %src) {
   ret half %result
 }
 
-; TODO-GFX12: Should not use any VALU instructions.
+; TODO: Should avoid generating v_cmp_class_f32.
 define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
 ; GFX12-SDAG-LABEL: v_s_sqrt_f32:
 ; GFX12-SDAG:       ; %bb.0:
@@ -298,16 +314,18 @@ define amdgpu_cs half @v_amdgcn_sqrt_f16(half inreg %src)  {
 define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
 ; GFX12-SDAG-LABEL: srcmods_abs_f32:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_and_b32 s1, s0, 0x7fffffff
+; GFX12-SDAG-NEXT:    s_bitset0_b32 s0, 31
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    s_cmp_lt_f32 s1, 0x800000
-; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 32, 0
-; GFX12-SDAG-NEXT:    v_ldexp_f32 v0, |s0|, s1
-; GFX12-SDAG-NEXT:    s_cselect_b32 s0, 0x42000000, 0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX12-SDAG-NEXT:    v_log_f32_e32 v0, v0
+; GFX12-SDAG-NEXT:    s_cmp_lt_f32 s0, 0x800000
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-SDAG-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x42000000, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-SDAG-NEXT:    v_s_log_f32 s0, s0
+; GFX12-SDAG-NEXT:    s_sub_f32 s0, s0, s1
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    v_subrev_f32_e32 v0, s0, v0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: srcmods_abs_f32:
@@ -333,15 +351,17 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
 define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
 ; GFX12-SDAG-LABEL: srcmods_neg_f32:
 ; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_xor_b32 s1, s0, 0x80000000
 ; GFX12-SDAG-NEXT:    s_cmp_gt_f32 s0, 0x80800000
-; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 32, 0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_ldexp_f32 v0, -s0, s1
-; GFX12-SDAG-NEXT:    s_cselect_b32 s0, 0x42000000, 0
-; GFX12-SDAG-NEXT:    v_log_f32_e32 v0, v0
+; GFX12-SDAG-NEXT:    s_cselect_b32 s0, 0x4f800000, 1.0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_mul_f32 s0, s1, s0
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x42000000, 0
+; GFX12-SDAG-NEXT:    v_s_log_f32 s0, s0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_sub_f32 s0, s0, s1
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
-; GFX12-SDAG-NEXT:    v_subrev_f32_e32 v0, s0, v0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-GISEL-LABEL: srcmods_neg_f32:

From 749e4a53d252e23e870d4a1638ff9d846af58e7f Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 20 Jun 2025 10:53:52 -0500
Subject: [PATCH 1069/1322] [lldb] Fix ASCII art in CommandObjectProtocolServer
 (NFC)

---
 lldb/source/Commands/CommandObjectProtocolServer.cpp | 3 +--
 lldb/source/Commands/CommandObjectProtocolServer.h   | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectProtocolServer.cpp b/lldb/source/Commands/CommandObjectProtocolServer.cpp
index 420fc5fdddad..115754769f3e 100644
--- a/lldb/source/Commands/CommandObjectProtocolServer.cpp
+++ b/lldb/source/Commands/CommandObjectProtocolServer.cpp
@@ -1,5 +1,4 @@
-//===-- CommandObjectProtocolServer.cpp
-//----------------------------------------------===//
+//===-- CommandObjectProtocolServer.cpp -----------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/lldb/source/Commands/CommandObjectProtocolServer.h b/lldb/source/Commands/CommandObjectProtocolServer.h
index 3591216b014c..791b3a78aaf0 100644
--- a/lldb/source/Commands/CommandObjectProtocolServer.h
+++ b/lldb/source/Commands/CommandObjectProtocolServer.h
@@ -1,5 +1,4 @@
-//===-- CommandObjectProtocolServer.h
-//------------------------------------------------===//
+//===-- CommandObjectProtocolServer.h -------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From 151ee0faad427651304b51b8af77704be26bb485 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 20 Jun 2025 17:03:23 +0100
Subject: [PATCH 1070/1322] [X86] SimplifyDemandedVectorEltsForTargetNode -
 ensure X86ISD::VPERMILPV node use v2f64/v4f32 types

When reducing v4f64/v8f32 non-lane crossing X86ISD::VPERMV nodes, we use X86ISD::VPERMILPV nodes for 128-bits, but these are only available for fp types.

Fixes #145046
---
 llvm/lib/Target/X86/X86ISelLowering.cpp               |  9 +++++++--
 .../test/CodeGen/X86/vector-shuffle-combining-avx2.ll | 11 +++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 085f44e72476..c32737010560 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44110,8 +44110,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
           // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
           if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
             Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
-          else
-            Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M);
+          else {
+            MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
+            MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
+            Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
+                                  TLO.DAG.getBitcast(ShufVT, V), M);
+            Ext = TLO.DAG.getBitcast(HalfVT, Ext);
+          }
           SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
                                           Subtarget, TLO.DAG, DL, SizeInBits);
           return TLO.CombineTo(Op, Insert);
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index f4f4842e4c69..f7764b1593b5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -775,6 +775,17 @@ define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
   ret <32 x i8> %4
 }
 
+define <4 x i32> @extract_vpermd(<8 x i32> %a0) {
+; CHECK-LABEL: extract_vpermd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,3,0]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
+  %1 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 0, i32 1, i32 0, i32 7, i32 6>)
+  %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
 ; Not beneficial to concatenate both inputs just to create a 256-bit vpaddb
 define <32 x i8> @concat_add_unnecessary(<16 x i8> %a0, <16 x i8> noundef %a1, <16 x i8> %a2) nounwind {
 ; CHECK-LABEL: concat_add_unnecessary:

From 32fc625a3fa27fa325c75b0fc841db4ce8e06805 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Fri, 20 Jun 2025 18:06:01 +0200
Subject: [PATCH 1071/1322] =?UTF-8?q?Reapply=20"Reapply=20"[clang][bytecod?=
 =?UTF-8?q?e]=20Allocate=20IntegralAP=20and=20Floating=20=E2=80=A6=20(#145?=
 =?UTF-8?q?014)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…types usi… (#144676)"

This reverts commit 68471d29eed2c49f9b439e505b3f24d387d54f97.

IntegralAP contains a union:
  union {
    uint64_t *Memory = nullptr;
    uint64_t Val;
  };

On 64bit systems, both Memory and Val have the same size. However, on 32
bit system, Val is 64bit and Memory only 32bit. Which means the default
initializer for Memory will only zero half of Val. We fixed this by
zero-initializing Val explicitly in the IntegralAP(unsigned BitWidth)
constructor.


See also the discussion in
https://github.com/llvm/llvm-project/pull/144246
---
 clang/lib/AST/ByteCode/ByteCodeEmitter.cpp    |   3 +-
 clang/lib/AST/ByteCode/Compiler.cpp           | 123 ++++---
 clang/lib/AST/ByteCode/Compiler.h             |   1 +
 clang/lib/AST/ByteCode/Descriptor.cpp         |   2 +-
 clang/lib/AST/ByteCode/Disasm.cpp             |  59 ++-
 clang/lib/AST/ByteCode/Floating.h             | 252 ++++++++-----
 clang/lib/AST/ByteCode/Integral.h             |   3 +
 clang/lib/AST/ByteCode/IntegralAP.h           | 240 ++++++------
 clang/lib/AST/ByteCode/Interp.cpp             | 106 +++++-
 clang/lib/AST/ByteCode/Interp.h               | 341 ++++++++++++++----
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  55 ++-
 .../lib/AST/ByteCode/InterpBuiltinBitCast.cpp |   4 +-
 clang/lib/AST/ByteCode/InterpState.h          |  30 ++
 clang/lib/AST/ByteCode/Opcodes.td             |  14 +-
 clang/lib/AST/ByteCode/PrimType.h             |  17 +
 clang/lib/AST/ByteCode/Program.h              |  24 +-
 .../ByteCode/builtin-bit-cast-long-double.cpp |  10 +-
 clang/test/AST/ByteCode/builtin-functions.cpp |  12 +-
 18 files changed, 934 insertions(+), 362 deletions(-)

diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp
index d91d5f16fc7a..965e23503603 100644
--- a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp
+++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp
@@ -176,7 +176,8 @@ static void emitSerialized(std::vector<std::byte> &Code, const T &Val,
   }
 
   // Access must be aligned!
-  size_t ValPos = align(Code.size());
+  assert(aligned(Code.size()));
+  size_t ValPos = Code.size();
   Size = align(Size);
   assert(aligned(ValPos + Size));
   Code.resize(ValPos + Size);
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 9fe4803ce98e..c5ac40210e47 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -748,7 +748,8 @@ bool Compiler<Emitter>::VisitFloatingLiteral(const FloatingLiteral *E) {
   if (DiscardResult)
     return true;
 
-  return this->emitConstFloat(E->getValue(), E);
+  APFloat F = E->getValue();
+  return this->emitFloat(F, E);
 }
 
 template <class Emitter>
@@ -4185,13 +4186,14 @@ bool Compiler<Emitter>::visitZeroInitializer(PrimType T, QualType QT,
                              nullptr, E);
   case PT_MemberPtr:
     return this->emitNullMemberPtr(0, nullptr, E);
-  case PT_Float:
-    return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E);
+  case PT_Float: {
+    APFloat F = APFloat::getZero(Ctx.getFloatSemantics(QT));
+    return this->emitFloat(F, E);
+  }
   case PT_FixedPoint: {
     auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType());
     return this->emitConstFixedPoint(FixedPoint::zero(Sem), E);
   }
-    llvm_unreachable("Implement");
   }
   llvm_unreachable("unknown primitive type");
 }
@@ -4674,10 +4676,7 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       if (!visitInitializer(Init))
         return false;
 
-      if (!this->emitFinishInit(Init))
-        return false;
-
-      return this->emitPopPtr(Init);
+      return this->emitFinishInitGlobal(Init);
     };
 
     DeclScope<Emitter> LocalScope(this, VD);
@@ -4698,51 +4697,45 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       return false;
 
     return !Init || (checkDecl() && initGlobal(*GlobalIndex));
-  } else {
-    InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
-
-    if (VarT) {
-      unsigned Offset = this->allocateLocalPrimitive(
-          VD, *VarT, VD->getType().isConstQualified(), nullptr,
-          ScopeKind::Block, IsConstexprUnknown);
-      if (Init) {
-        // If this is a toplevel declaration, create a scope for the
-        // initializer.
-        if (Toplevel) {
-          LocalScope<Emitter> Scope(this);
-          if (!this->visit(Init))
-            return false;
-          return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
-        } else {
-          if (!this->visit(Init))
-            return false;
-          return this->emitSetLocal(*VarT, Offset, VD);
-        }
-      }
-    } else {
-      if (std::optional<unsigned> Offset =
-              this->allocateLocal(VD, VD->getType(), nullptr, ScopeKind::Block,
-                                  IsConstexprUnknown)) {
-        if (!Init)
-          return true;
-
-        if (!this->emitGetPtrLocal(*Offset, Init))
-          return false;
-
-        if (!visitInitializer(Init))
-          return false;
-
-        if (!this->emitFinishInit(Init))
-          return false;
-
-        return this->emitPopPtr(Init);
-      }
-      return false;
-    }
-    return true;
   }
+  // Local variables.
+  InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
 
-  return false;
+  if (VarT) {
+    unsigned Offset = this->allocateLocalPrimitive(
+        VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block,
+        IsConstexprUnknown);
+    if (Init) {
+      // If this is a toplevel declaration, create a scope for the
+      // initializer.
+      if (Toplevel) {
+        LocalScope<Emitter> Scope(this);
+        if (!this->visit(Init))
+          return false;
+        return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
+      } else {
+        if (!this->visit(Init))
+          return false;
+        return this->emitSetLocal(*VarT, Offset, VD);
+      }
+    }
+  } else {
+    if (std::optional<unsigned> Offset = this->allocateLocal(
+            VD, VD->getType(), nullptr, ScopeKind::Block, IsConstexprUnknown)) {
+      if (!Init)
+        return true;
+
+      if (!this->emitGetPtrLocal(*Offset, Init))
+        return false;
+
+      if (!visitInitializer(Init))
+        return false;
+
+      return this->emitFinishInitPop(Init);
+    }
+    return false;
+  }
+  return true;
 }
 
 template <class Emitter>
@@ -4751,8 +4744,10 @@ bool Compiler<Emitter>::visitAPValue(const APValue &Val, PrimType ValType,
   assert(!DiscardResult);
   if (Val.isInt())
     return this->emitConst(Val.getInt(), ValType, E);
-  else if (Val.isFloat())
-    return this->emitConstFloat(Val.getFloat(), E);
+  else if (Val.isFloat()) {
+    APFloat F = Val.getFloat();
+    return this->emitFloat(F, E);
+  }
 
   if (Val.isLValue()) {
     if (Val.isNullPointer())
@@ -6133,8 +6128,10 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
+      APFloat F(TargetSemantics, 1);
+      if (!this->emitFloat(F, E))
         return false;
+
       if (!this->emitAddf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6176,8 +6173,10 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
+      APFloat F(TargetSemantics, 1);
+      if (!this->emitFloat(F, E))
         return false;
+
       if (!this->emitSubf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6953,6 +6952,20 @@ bool Compiler<Emitter>::emitDummyPtr(const DeclTy &D, const Expr *E) {
   return true;
 }
 
+template <class Emitter>
+bool Compiler<Emitter>::emitFloat(const APFloat &F, const Expr *E) {
+  assert(!DiscardResult && "Should've been checked before");
+
+  if (Floating::singleWord(F.getSemantics()))
+    return this->emitConstFloat(Floating(F), E);
+
+  APInt I = F.bitcastToAPInt();
+  return this->emitConstFloat(
+      Floating(const_cast<uint64_t *>(I.getRawData()),
+               llvm::APFloatBase::SemanticsToEnum(F.getSemantics())),
+      E);
+}
+
 //  This function is constexpr if and only if To, From, and the types of
 //  all subobjects of To and From are types T such that...
 //  (3.1) - is_union_v<T> is false;
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index ac3ad84766dc..a1d068cc7e0a 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -391,6 +391,7 @@ private:
   bool emitRecordDestruction(const Record *R, SourceInfo Loc);
   bool emitDestruction(const Descriptor *Desc, SourceInfo Loc);
   bool emitDummyPtr(const DeclTy &D, const Expr *E);
+  bool emitFloat(const APFloat &F, const Expr *E);
   unsigned collectBaseOffset(const QualType BaseType,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp
index 5531295dfa2f..46e4d0d940b3 100644
--- a/clang/lib/AST/ByteCode/Descriptor.cpp
+++ b/clang/lib/AST/ByteCode/Descriptor.cpp
@@ -368,7 +368,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsTemporary, bool IsConst, UnknownSize)
     : Source(D), ElemSize(primSize(Type)), Size(UnknownSizeMark),
       MDSize(MD.value_or(0)),
-      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)),
+      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)), PrimT(Type),
       IsConst(IsConst), IsMutable(false), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
       DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 846dc2fe92a7..a3eecd06369b 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -50,34 +50,57 @@ inline static std::string printArg(Program &P, CodePtr &OpPC) {
 }
 
 template <> inline std::string printArg<Floating>(Program &P, CodePtr &OpPC) {
-  auto F = Floating::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  auto Sem = Floating::deserializeSemantics(*OpPC);
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  unsigned BitWidth = llvm::APFloatBase::semanticsSizeInBits(
+      llvm::APFloatBase::EnumToSemantics(Sem));
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
+  Floating Result(Memory.get(), Sem);
+  Floating::deserialize(*OpPC, &Result);
+
+  OpPC += align(Result.bytesToSerialize());
+
+  std::string S;
+  llvm::raw_string_ostream SS(S);
+  SS << Result;
+  return S;
 }
 
 template <>
 inline std::string printArg<IntegralAP<false>>(Program &P, CodePtr &OpPC) {
-  auto F = IntegralAP<false>::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  using T = IntegralAP<false>;
+  uint32_t BitWidth = T::deserializeSize(*OpPC);
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  T Result(Memory.get(), BitWidth);
+  T::deserialize(*OpPC, &Result);
+
+  OpPC += align(Result.bytesToSerialize());
+
+  std::string Str;
+  llvm::raw_string_ostream SS(Str);
+  SS << Result;
+  return Str;
 }
+
 template <>
 inline std::string printArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
-  auto F = IntegralAP<true>::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  using T = IntegralAP<true>;
+  uint32_t BitWidth = T::deserializeSize(*OpPC);
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  T Result(Memory.get(), BitWidth);
+  T::deserialize(*OpPC, &Result);
+
+  OpPC += align(Result.bytesToSerialize());
+
+  std::string Str;
+  llvm::raw_string_ostream SS(Str);
+  SS << Result;
+  return Str;
 }
 
 template <> inline std::string printArg<FixedPoint>(Program &P, CodePtr &OpPC) {
diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
index 3750568fc23c..659892e720ab 100644
--- a/clang/lib/AST/ByteCode/Floating.h
+++ b/clang/lib/AST/ByteCode/Floating.h
@@ -17,63 +17,79 @@
 #include "clang/AST/APValue.h"
 #include "llvm/ADT/APFloat.h"
 
+// XXX This is just a debugging help. Setting this to 1 will heap-allocate ALL
+// floating values.
+#define ALLOCATE_ALL 0
+
 namespace clang {
 namespace interp {
 
 using APFloat = llvm::APFloat;
 using APSInt = llvm::APSInt;
+using APInt = llvm::APInt;
 
+/// If a Floating is constructed from Memory, it DOES NOT OWN THAT MEMORY.
+/// It will NOT copy the memory (unless, of course, copy() is called) and it
+/// won't alllocate anything. The allocation should happen via InterpState or
+/// Program.
 class Floating final {
 private:
-  // The underlying value storage.
-  APFloat F;
+  union {
+    uint64_t Val = 0;
+    uint64_t *Memory;
+  };
+  llvm::APFloatBase::Semantics Semantics;
+
+  APFloat getValue() const {
+    unsigned BitWidth = bitWidth();
+    if (singleWord())
+      return APFloat(getSemantics(), APInt(BitWidth, Val));
+    unsigned NumWords = numWords();
+    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
+  }
 
 public:
-  /// Zero-initializes a Floating.
-  Floating() : F(0.0f) {}
-  Floating(const APFloat &F) : F(F) {}
+  Floating() = default;
+  Floating(llvm::APFloatBase::Semantics Semantics)
+      : Val(0), Semantics(Semantics) {}
+  Floating(const APFloat &F) {
 
-  // Static constructors for special floating point values.
-  static Floating getInf(const llvm::fltSemantics &Sem) {
-    return Floating(APFloat::getInf(Sem));
+    Semantics = llvm::APFloatBase::SemanticsToEnum(F.getSemantics());
+    this->copy(F);
   }
-  const APFloat &getAPFloat() const { return F; }
+  Floating(uint64_t *Memory, llvm::APFloatBase::Semantics Semantics)
+      : Memory(Memory), Semantics(Semantics) {}
 
-  bool operator<(Floating RHS) const { return F < RHS.F; }
-  bool operator>(Floating RHS) const { return F > RHS.F; }
-  bool operator<=(Floating RHS) const { return F <= RHS.F; }
-  bool operator>=(Floating RHS) const { return F >= RHS.F; }
-  bool operator==(Floating RHS) const { return F == RHS.F; }
-  bool operator!=(Floating RHS) const { return F != RHS.F; }
-  Floating operator-() const { return Floating(-F); }
+  APFloat getAPFloat() const { return getValue(); }
+
+  bool operator<(Floating RHS) const { return getValue() < RHS.getValue(); }
+  bool operator>(Floating RHS) const { return getValue() > RHS.getValue(); }
+  bool operator<=(Floating RHS) const { return getValue() <= RHS.getValue(); }
+  bool operator>=(Floating RHS) const { return getValue() >= RHS.getValue(); }
 
   APFloat::opStatus convertToInteger(APSInt &Result) const {
     bool IsExact;
-    return F.convertToInteger(Result, llvm::APFloat::rmTowardZero, &IsExact);
+    return getValue().convertToInteger(Result, llvm::APFloat::rmTowardZero,
+                                       &IsExact);
   }
 
-  Floating toSemantics(const llvm::fltSemantics *Sem,
-                       llvm::RoundingMode RM) const {
-    APFloat Copy = F;
+  void toSemantics(const llvm::fltSemantics *Sem, llvm::RoundingMode RM,
+                   Floating *Result) const {
+    APFloat Copy = getValue();
     bool LosesInfo;
     Copy.convert(*Sem, RM, &LosesInfo);
     (void)LosesInfo;
-    return Floating(Copy);
-  }
-
-  /// Convert this Floating to one with the same semantics as \Other.
-  Floating toSemantics(const Floating &Other, llvm::RoundingMode RM) const {
-    return toSemantics(&Other.F.getSemantics(), RM);
+    Result->copy(Copy);
   }
 
   APSInt toAPSInt(unsigned NumBits = 0) const {
-    return APSInt(F.bitcastToAPInt());
+    return APSInt(getValue().bitcastToAPInt());
   }
-  APValue toAPValue(const ASTContext &) const { return APValue(F); }
+  APValue toAPValue(const ASTContext &) const { return APValue(getValue()); }
   void print(llvm::raw_ostream &OS) const {
     // Can't use APFloat::print() since it appends a newline.
     SmallVector<char, 16> Buffer;
-    F.toString(Buffer);
+    getValue().toString(Buffer);
     OS << Buffer;
   }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
@@ -83,25 +99,62 @@ public:
     return NameStr;
   }
 
-  unsigned bitWidth() const { return F.semanticsSizeInBits(F.getSemantics()); }
+  unsigned bitWidth() const {
+    return llvm::APFloatBase::semanticsSizeInBits(getSemantics());
+  }
+  unsigned numWords() const { return llvm::APInt::getNumWords(bitWidth()); }
+  bool singleWord() const {
+#if ALLOCATE_ALL
+    return false;
+#endif
+    return numWords() == 1;
+  }
+  static bool singleWord(const llvm::fltSemantics &Sem) {
+#if ALLOCATE_ALL
+    return false;
+#endif
+    return APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem)) == 1;
+  }
+  const llvm::fltSemantics &getSemantics() const {
+    return llvm::APFloatBase::EnumToSemantics(Semantics);
+  }
+
+  void copy(const APFloat &F) {
+    if (singleWord()) {
+      Val = F.bitcastToAPInt().getZExtValue();
+    } else {
+      assert(Memory);
+      std::memcpy(Memory, F.bitcastToAPInt().getRawData(),
+                  numWords() * sizeof(uint64_t));
+    }
+  }
+
+  void take(uint64_t *NewMemory) {
+    if (singleWord())
+      return;
+
+    if (Memory)
+      std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
+    Memory = NewMemory;
+  }
 
   bool isSigned() const { return true; }
-  bool isNegative() const { return F.isNegative(); }
-  bool isZero() const { return F.isZero(); }
-  bool isNonZero() const { return F.isNonZero(); }
-  bool isMin() const { return F.isSmallest(); }
-  bool isMinusOne() const { return F.isExactlyValue(-1.0); }
-  bool isNan() const { return F.isNaN(); }
-  bool isSignaling() const { return F.isSignaling(); }
-  bool isInf() const { return F.isInfinity(); }
-  bool isFinite() const { return F.isFinite(); }
-  bool isNormal() const { return F.isNormal(); }
-  bool isDenormal() const { return F.isDenormal(); }
-  llvm::FPClassTest classify() const { return F.classify(); }
-  APFloat::fltCategory getCategory() const { return F.getCategory(); }
+  bool isNegative() const { return getValue().isNegative(); }
+  bool isZero() const { return getValue().isZero(); }
+  bool isNonZero() const { return getValue().isNonZero(); }
+  bool isMin() const { return getValue().isSmallest(); }
+  bool isMinusOne() const { return getValue().isExactlyValue(-1.0); }
+  bool isNan() const { return getValue().isNaN(); }
+  bool isSignaling() const { return getValue().isSignaling(); }
+  bool isInf() const { return getValue().isInfinity(); }
+  bool isFinite() const { return getValue().isFinite(); }
+  bool isNormal() const { return getValue().isNormal(); }
+  bool isDenormal() const { return getValue().isDenormal(); }
+  llvm::FPClassTest classify() const { return getValue().classify(); }
+  APFloat::fltCategory getCategory() const { return getValue().getCategory(); }
 
   ComparisonCategoryResult compare(const Floating &RHS) const {
-    llvm::APFloatBase::cmpResult CmpRes = F.compare(RHS.F);
+    llvm::APFloatBase::cmpResult CmpRes = getValue().compare(RHS.getValue());
     switch (CmpRes) {
     case llvm::APFloatBase::cmpLessThan:
       return ComparisonCategoryResult::Less;
@@ -118,97 +171,130 @@ public:
   static APFloat::opStatus fromIntegral(APSInt Val,
                                         const llvm::fltSemantics &Sem,
                                         llvm::RoundingMode RM,
-                                        Floating &Result) {
+                                        Floating *Result) {
     APFloat F = APFloat(Sem);
     APFloat::opStatus Status = F.convertFromAPInt(Val, Val.isSigned(), RM);
-    Result = Floating(F);
+    Result->copy(F);
     return Status;
   }
 
-  static Floating bitcastFromMemory(const std::byte *Buff,
-                                    const llvm::fltSemantics &Sem) {
+  static void bitcastFromMemory(const std::byte *Buff,
+                                const llvm::fltSemantics &Sem,
+                                Floating *Result) {
     size_t Size = APFloat::semanticsSizeInBits(Sem);
     llvm::APInt API(Size, true);
     llvm::LoadIntFromMemory(API, (const uint8_t *)Buff, Size / 8);
-
-    return Floating(APFloat(Sem, API));
+    Result->copy(APFloat(Sem, API));
   }
 
   void bitcastToMemory(std::byte *Buff) const {
-    llvm::APInt API = F.bitcastToAPInt();
+    llvm::APInt API = getValue().bitcastToAPInt();
     llvm::StoreIntToMemory(API, (uint8_t *)Buff, bitWidth() / 8);
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    return sizeof(llvm::fltSemantics *) +
-           (APFloat::semanticsSizeInBits(F.getSemantics()) / 8);
+    return sizeof(Semantics) + (numWords() * sizeof(uint64_t));
   }
 
   void serialize(std::byte *Buff) const {
-    // Semantics followed by an APInt.
-    *reinterpret_cast<const llvm::fltSemantics **>(Buff) = &F.getSemantics();
-
-    llvm::APInt API = F.bitcastToAPInt();
-    llvm::StoreIntToMemory(API, (uint8_t *)(Buff + sizeof(void *)),
-                           bitWidth() / 8);
+    std::memcpy(Buff, &Semantics, sizeof(Semantics));
+    if (singleWord()) {
+      std::memcpy(Buff + sizeof(Semantics), &Val, sizeof(uint64_t));
+    } else {
+      std::memcpy(Buff + sizeof(Semantics), Memory,
+                  numWords() * sizeof(uint64_t));
+    }
   }
 
-  static Floating deserialize(const std::byte *Buff) {
-    const llvm::fltSemantics *Sem;
-    std::memcpy((void *)&Sem, Buff, sizeof(void *));
-    return bitcastFromMemory(Buff + sizeof(void *), *Sem);
+  static llvm::APFloatBase::Semantics
+  deserializeSemantics(const std::byte *Buff) {
+    return *reinterpret_cast<const llvm::APFloatBase::Semantics *>(Buff);
   }
 
-  static Floating abs(const Floating &F) {
-    APFloat V = F.F;
-    if (V.isNegative())
-      V.changeSign();
-    return Floating(V);
+  static void deserialize(const std::byte *Buff, Floating *Result) {
+    llvm::APFloatBase::Semantics Semantics;
+    std::memcpy(&Semantics, Buff, sizeof(Semantics));
+
+    unsigned BitWidth = llvm::APFloat::semanticsSizeInBits(
+        llvm::APFloatBase::EnumToSemantics(Semantics));
+    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
+
+    Result->Semantics = Semantics;
+    if (NumWords == 1 && !ALLOCATE_ALL) {
+      std::memcpy(&Result->Val, Buff + sizeof(Semantics), sizeof(uint64_t));
+    } else {
+      assert(Result->Memory);
+      std::memcpy(Result->Memory, Buff + sizeof(Semantics),
+                  NumWords * sizeof(uint64_t));
+    }
   }
 
   // -------
 
   static APFloat::opStatus add(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.add(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.add(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus increment(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.F.getSemantics(), 1);
-    *R = Floating(A.F);
-    return R->F.add(One, RM);
+    APFloat One(A.getSemantics(), 1);
+    APFloat LHS = A.getValue();
+
+    auto Status = LHS.add(One, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus sub(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.subtract(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.subtract(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus decrement(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.F.getSemantics(), 1);
-    *R = Floating(A.F);
-    return R->F.subtract(One, RM);
+    APFloat One(A.getSemantics(), 1);
+    APFloat LHS = A.getValue();
+
+    auto Status = LHS.subtract(One, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus mul(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.multiply(B.F, RM);
+
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.multiply(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus div(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.divide(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.divide(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static bool neg(const Floating &A, Floating *R) {
-    *R = -A;
+    R->copy(-A.getValue());
     return false;
   }
 };
diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h
index 13fdb5369f2b..af5cd2d13ecc 100644
--- a/clang/lib/AST/ByteCode/Integral.h
+++ b/clang/lib/AST/ByteCode/Integral.h
@@ -99,6 +99,9 @@ public:
   bool operator>=(Integral RHS) const { return V >= RHS.V; }
   bool operator==(Integral RHS) const { return V == RHS.V; }
   bool operator!=(Integral RHS) const { return V != RHS.V; }
+  bool operator>=(unsigned RHS) const {
+    return static_cast<unsigned>(V) >= RHS;
+  }
 
   bool operator>(unsigned RHS) const {
     return V >= 0 && static_cast<unsigned>(V) > RHS;
diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 8ee08dfb5cfe..316c003e0e50 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -28,12 +28,19 @@ namespace interp {
 
 using APInt = llvm::APInt;
 using APSInt = llvm::APSInt;
-template <unsigned Bits, bool Signed> class Integral;
 
+/// If an IntegralAP is constructed from Memory, it DOES NOT OWN THAT MEMORY.
+/// It will NOT copy the memory (unless, of course, copy() is called) and it
+/// won't alllocate anything. The allocation should happen via InterpState or
+/// Program.
 template <bool Signed> class IntegralAP final {
-private:
+public:
+  union {
+    uint64_t *Memory = nullptr;
+    uint64_t Val;
+  };
+  uint32_t BitWidth = 0;
   friend IntegralAP<!Signed>;
-  APInt V;
 
   template <typename T, bool InputSigned>
   static T truncateCast(const APInt &V) {
@@ -52,106 +59,118 @@ private:
                                : V.trunc(BitSize).getZExtValue();
   }
 
+  APInt getValue() const {
+    if (singleWord())
+      return APInt(BitWidth, Val, Signed);
+    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
+    return llvm::APInt(BitWidth, NumWords, Memory);
+  }
+
 public:
   using AsUnsigned = IntegralAP<false>;
 
-  template <typename T>
-  IntegralAP(T Value, unsigned BitWidth)
-      : V(APInt(BitWidth, static_cast<uint64_t>(Value), Signed)) {}
-
-  IntegralAP(APInt V) : V(V) {}
-  /// Arbitrary value for uninitialized variables.
-  IntegralAP() : IntegralAP(Signed ? -1 : 7, 3) {}
-
-  IntegralAP operator-() const { return IntegralAP(-V); }
-  IntegralAP operator-(const IntegralAP &Other) const {
-    return IntegralAP(V - Other.V);
+  void take(uint64_t *NewMemory) {
+    assert(!singleWord());
+    std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
+    Memory = NewMemory;
   }
+
+  void copy(const APInt &V) {
+    assert(BitWidth == V.getBitWidth());
+    assert(numWords() == V.getNumWords());
+
+    if (V.isSingleWord()) {
+      if constexpr (Signed)
+        Val = V.getSExtValue();
+      else
+        Val = V.getZExtValue();
+      return;
+    }
+    assert(Memory);
+    std::memcpy(Memory, V.getRawData(), V.getNumWords() * sizeof(uint64_t));
+  }
+
+  IntegralAP() = default;
+  /// Zeroed, single-word IntegralAP of the given bitwidth.
+  IntegralAP(unsigned BitWidth) : Val(0), BitWidth(BitWidth) {
+    assert(singleWord());
+  }
+  IntegralAP(uint64_t *Memory, unsigned BitWidth)
+      : Memory(Memory), BitWidth(BitWidth) {}
+  IntegralAP(const APInt &V) : BitWidth(V.getBitWidth()) {
+    if (V.isSingleWord()) {
+      Val = Signed ? V.getSExtValue() : V.getZExtValue();
+    } else {
+      Memory = const_cast<uint64_t *>(V.getRawData());
+    }
+  }
+
+  IntegralAP operator-() const { return IntegralAP(-getValue()); }
   bool operator>(const IntegralAP &RHS) const {
     if constexpr (Signed)
-      return V.ugt(RHS.V);
-    return V.sgt(RHS.V);
+      return getValue().sgt(RHS.getValue());
+    return getValue().ugt(RHS.getValue());
   }
-  bool operator>=(IntegralAP RHS) const {
+  bool operator>=(unsigned RHS) const {
     if constexpr (Signed)
-      return V.uge(RHS.V);
-    return V.sge(RHS.V);
+      return getValue().sge(RHS);
+    return getValue().uge(RHS);
   }
   bool operator<(IntegralAP RHS) const {
     if constexpr (Signed)
-      return V.slt(RHS.V);
-    return V.slt(RHS.V);
-  }
-  bool operator<=(IntegralAP RHS) const {
-    if constexpr (Signed)
-      return V.ult(RHS.V);
-    return V.ult(RHS.V);
+      return getValue().slt(RHS.getValue());
+    return getValue().ult(RHS.getValue());
   }
 
   template <typename Ty, typename = std::enable_if_t<std::is_integral_v<Ty>>>
   explicit operator Ty() const {
-    return truncateCast<Ty, Signed>(V);
+    return truncateCast<Ty, Signed>(getValue());
   }
 
   template <typename T> static IntegralAP from(T Value, unsigned NumBits = 0) {
+    if (NumBits == 0)
+      NumBits = sizeof(T) * 8;
     assert(NumBits > 0);
     APInt Copy = APInt(NumBits, static_cast<uint64_t>(Value), Signed);
-
+    assert(false);
     return IntegralAP<Signed>(Copy);
   }
 
-  template <bool InputSigned>
-  static IntegralAP from(IntegralAP<InputSigned> V, unsigned NumBits = 0) {
-    if (NumBits == 0)
-      NumBits = V.bitWidth();
-
-    if constexpr (InputSigned)
-      return IntegralAP<Signed>(V.V.sextOrTrunc(NumBits));
-    return IntegralAP<Signed>(V.V.zextOrTrunc(NumBits));
-  }
-
-  template <unsigned Bits, bool InputSigned>
-  static IntegralAP from(Integral<Bits, InputSigned> I, unsigned BitWidth) {
-    return IntegralAP<Signed>(I.toAPInt(BitWidth));
-  }
-
-  static IntegralAP zero(int32_t BitWidth) {
-    APInt V = APInt(BitWidth, 0LL, Signed);
-    return IntegralAP(V);
-  }
-
-  constexpr unsigned bitWidth() const { return V.getBitWidth(); }
+  constexpr uint32_t bitWidth() const { return BitWidth; }
+  constexpr unsigned numWords() const { return APInt::getNumWords(BitWidth); }
+  constexpr bool singleWord() const { return numWords() == 1; }
 
   APSInt toAPSInt(unsigned Bits = 0) const {
     if (Bits == 0)
       Bits = bitWidth();
 
+    APInt V = getValue();
     if constexpr (Signed)
-      return APSInt(V.sext(Bits), !Signed);
+      return APSInt(getValue().sext(Bits), !Signed);
     else
-      return APSInt(V.zext(Bits), !Signed);
+      return APSInt(getValue().zext(Bits), !Signed);
   }
   APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
-  bool isZero() const { return V.isZero(); }
+  bool isZero() const { return getValue().isZero(); }
   bool isPositive() const {
     if constexpr (Signed)
-      return V.isNonNegative();
+      return getValue().isNonNegative();
     return true;
   }
   bool isNegative() const {
     if constexpr (Signed)
-      return !V.isNonNegative();
+      return !getValue().isNonNegative();
     return false;
   }
-  bool isMin() const { return V.isMinValue(); }
-  bool isMax() const { return V.isMaxValue(); }
+  bool isMin() const { return getValue().isMinValue(); }
+  bool isMax() const { return getValue().isMaxValue(); }
   static constexpr bool isSigned() { return Signed; }
-  bool isMinusOne() const { return Signed && V == -1; }
+  bool isMinusOne() const { return Signed && getValue().isAllOnes(); }
 
-  unsigned countLeadingZeros() const { return V.countl_zero(); }
+  unsigned countLeadingZeros() const { return getValue().countl_zero(); }
 
-  void print(llvm::raw_ostream &OS) const { V.print(OS, Signed);}
+  void print(llvm::raw_ostream &OS) const { getValue().print(OS, Signed); }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
     std::string NameStr;
     llvm::raw_string_ostream OS(NameStr);
@@ -161,53 +180,57 @@ public:
 
   IntegralAP truncate(unsigned BitWidth) const {
     if constexpr (Signed)
-      return IntegralAP(V.trunc(BitWidth).sextOrTrunc(this->bitWidth()));
+      return IntegralAP(
+          getValue().trunc(BitWidth).sextOrTrunc(this->bitWidth()));
     else
-      return IntegralAP(V.trunc(BitWidth).zextOrTrunc(this->bitWidth()));
+      return IntegralAP(
+          getValue().trunc(BitWidth).zextOrTrunc(this->bitWidth()));
   }
 
   IntegralAP<false> toUnsigned() const {
-    APInt Copy = V;
-    return IntegralAP<false>(Copy);
+    return IntegralAP<false>(Memory, BitWidth);
   }
 
   void bitcastToMemory(std::byte *Dest) const {
-    llvm::StoreIntToMemory(V, (uint8_t *)Dest, bitWidth() / 8);
+    llvm::StoreIntToMemory(getValue(), (uint8_t *)Dest, bitWidth() / 8);
   }
 
-  static IntegralAP bitcastFromMemory(const std::byte *Src, unsigned BitWidth) {
+  static void bitcastFromMemory(const std::byte *Src, unsigned BitWidth,
+                                IntegralAP *Result) {
     APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
     llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
-    return IntegralAP(V);
+    Result->copy(V);
   }
 
   ComparisonCategoryResult compare(const IntegralAP &RHS) const {
     assert(Signed == RHS.isSigned());
     assert(bitWidth() == RHS.bitWidth());
+    APInt V1 = getValue();
+    APInt V2 = RHS.getValue();
     if constexpr (Signed) {
-      if (V.slt(RHS.V))
+      if (V1.slt(V2))
         return ComparisonCategoryResult::Less;
-      if (V.sgt(RHS.V))
+      if (V1.sgt(V2))
         return ComparisonCategoryResult::Greater;
       return ComparisonCategoryResult::Equal;
     }
 
     assert(!Signed);
-    if (V.ult(RHS.V))
+    if (V1.ult(V2))
       return ComparisonCategoryResult::Less;
-    if (V.ugt(RHS.V))
+    if (V1.ugt(V2))
       return ComparisonCategoryResult::Greater;
     return ComparisonCategoryResult::Equal;
   }
 
   static bool increment(IntegralAP A, IntegralAP *R) {
-    IntegralAP<Signed> One(1, A.bitWidth());
-    return add(A, One, A.bitWidth() + 1, R);
+    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
+    return add(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
   }
 
   static bool decrement(IntegralAP A, IntegralAP *R) {
-    IntegralAP<Signed> One(1, A.bitWidth());
-    return sub(A, One, A.bitWidth() + 1, R);
+    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
+    return sub(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
   }
 
   static bool add(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
@@ -224,87 +247,96 @@ public:
 
   static bool rem(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      *R = IntegralAP(A.V.srem(B.V));
+      R->copy(A.getValue().srem(B.getValue()));
     else
-      *R = IntegralAP(A.V.urem(B.V));
+      R->copy(A.getValue().urem(B.getValue()));
     return false;
   }
 
   static bool div(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      *R = IntegralAP(A.V.sdiv(B.V));
+      R->copy(A.getValue().sdiv(B.getValue()));
     else
-      *R = IntegralAP(A.V.udiv(B.V));
+      R->copy(A.getValue().udiv(B.getValue()));
     return false;
   }
 
   static bool bitAnd(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    *R = IntegralAP(A.V & B.V);
+    R->copy(A.getValue() & B.getValue());
     return false;
   }
 
   static bool bitOr(IntegralAP A, IntegralAP B, unsigned OpBits,
                     IntegralAP *R) {
-    *R = IntegralAP(A.V | B.V);
+    R->copy(A.getValue() | B.getValue());
     return false;
   }
 
   static bool bitXor(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    *R = IntegralAP(A.V ^ B.V);
+    R->copy(A.getValue() ^ B.getValue());
     return false;
   }
 
   static bool neg(const IntegralAP &A, IntegralAP *R) {
-    APInt AI = A.V;
+    APInt AI = A.getValue();
     AI.negate();
-    *R = IntegralAP(AI);
+    R->copy(AI);
     return false;
   }
 
   static bool comp(IntegralAP A, IntegralAP *R) {
-    *R = IntegralAP(~A.V);
+    R->copy(~A.getValue());
     return false;
   }
 
   static void shiftLeft(const IntegralAP A, const IntegralAP B, unsigned OpBits,
                         IntegralAP *R) {
-    *R = IntegralAP(A.V.shl(B.V.getZExtValue()));
+    *R = IntegralAP(A.getValue().shl(B.getValue().getZExtValue()));
   }
 
   static void shiftRight(const IntegralAP A, const IntegralAP B,
                          unsigned OpBits, IntegralAP *R) {
-    unsigned ShiftAmount = B.V.getZExtValue();
+    unsigned ShiftAmount = B.getValue().getZExtValue();
     if constexpr (Signed)
-      *R = IntegralAP(A.V.ashr(ShiftAmount));
+      R->copy(A.getValue().ashr(ShiftAmount));
     else
-      *R = IntegralAP(A.V.lshr(ShiftAmount));
+      R->copy(A.getValue().lshr(ShiftAmount));
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    // 4 bytes for the BitWidth followed by N bytes for the actual APInt.
-    return sizeof(uint32_t) + (V.getBitWidth() / CHAR_BIT);
+    assert(BitWidth != 0);
+    return sizeof(uint32_t) + (numWords() * sizeof(uint64_t));
   }
 
   void serialize(std::byte *Buff) const {
-    assert(V.getBitWidth() < std::numeric_limits<uint8_t>::max());
-    uint32_t BitWidth = V.getBitWidth();
-
     std::memcpy(Buff, &BitWidth, sizeof(uint32_t));
-    llvm::StoreIntToMemory(V, (uint8_t *)(Buff + sizeof(uint32_t)),
-                           BitWidth / CHAR_BIT);
+    if (singleWord())
+      std::memcpy(Buff + sizeof(uint32_t), &Val, sizeof(uint64_t));
+    else {
+      std::memcpy(Buff + sizeof(uint32_t), Memory,
+                  numWords() * sizeof(uint64_t));
+    }
   }
 
-  static IntegralAP<Signed> deserialize(const std::byte *Buff) {
-    uint32_t BitWidth;
-    std::memcpy(&BitWidth, Buff, sizeof(uint32_t));
-    IntegralAP<Signed> Val(APInt(BitWidth, 0ull, !Signed));
+  static uint32_t deserializeSize(const std::byte *Buff) {
+    return *reinterpret_cast<const uint32_t *>(Buff);
+  }
 
-    llvm::LoadIntFromMemory(Val.V, (const uint8_t *)Buff + sizeof(uint32_t),
-                            BitWidth / CHAR_BIT);
-    return Val;
+  static void deserialize(const std::byte *Buff, IntegralAP<Signed> *Result) {
+    uint32_t BitWidth = Result->BitWidth;
+    assert(BitWidth != 0);
+    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
+
+    if (NumWords == 1)
+      std::memcpy(&Result->Val, Buff + sizeof(uint32_t), sizeof(uint64_t));
+    else {
+      assert(Result->Memory);
+      std::memcpy(Result->Memory, Buff + sizeof(uint32_t),
+                  NumWords * sizeof(uint64_t));
+    }
   }
 
 private:
@@ -312,7 +344,7 @@ private:
   static bool CheckAddSubMulUB(const IntegralAP &A, const IntegralAP &B,
                                unsigned BitWidth, IntegralAP *R) {
     if constexpr (!Signed) {
-      R->V = Op<APInt>{}(A.V, B.V);
+      R->copy(Op<APInt>{}(A.getValue(), B.getValue()));
       return false;
     }
 
@@ -320,7 +352,7 @@ private:
     const APSInt &RHS = B.toAPSInt();
     APSInt Value = Op<APSInt>{}(LHS.extend(BitWidth), RHS.extend(BitWidth));
     APSInt Result = Value.trunc(LHS.getBitWidth());
-    R->V = Result;
+    R->copy(Result);
 
     return Result.extend(BitWidth) != Value;
   }
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 5c8abffb3a99..1e2032feabb6 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1935,8 +1935,10 @@ bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  S.Stk.push<IntegralAP<false>>(
-      IntegralAP<false>::from(Ptr.getIntegerRepresentation(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
+
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
@@ -1946,8 +1948,10 @@ bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  S.Stk.push<IntegralAP<true>>(
-      IntegralAP<true>::from(Ptr.getIntegerRepresentation(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
+
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2053,6 +2057,100 @@ bool arePotentiallyOverlappingStringLiterals(const Pointer &LHS,
   return Shorter == Longer.take_front(Shorter.size());
 }
 
+static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr,
+                                PrimType T) {
+
+  if (T == PT_IntAPS) {
+    auto &Val = Ptr.deref<IntegralAP<true>>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  } else if (T == PT_IntAP) {
+    auto &Val = Ptr.deref<IntegralAP<false>>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  } else if (T == PT_Float) {
+    auto &Val = Ptr.deref<Floating>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  }
+}
+
+template <typename T>
+static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr) {
+  assert(needsAlloc<T>());
+  auto &Val = Ptr.deref<T>();
+  if (!Val.singleWord()) {
+    uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+    Val.take(NewMemory);
+  }
+}
+
+static void finishGlobalRecurse(InterpState &S, const Pointer &Ptr) {
+  if (const Record *R = Ptr.getRecord()) {
+    for (const Record::Field &Fi : R->fields()) {
+      if (Fi.Desc->isPrimitive()) {
+        TYPE_SWITCH_ALLOC(Fi.Desc->getPrimType(), {
+          copyPrimitiveMemory<T>(S, Ptr.atField(Fi.Offset));
+        });
+        copyPrimitiveMemory(S, Ptr.atField(Fi.Offset), Fi.Desc->getPrimType());
+      } else
+        finishGlobalRecurse(S, Ptr.atField(Fi.Offset));
+    }
+    return;
+  }
+
+  if (const Descriptor *D = Ptr.getFieldDesc(); D && D->isArray()) {
+    unsigned NumElems = D->getNumElems();
+    if (NumElems == 0)
+      return;
+
+    if (D->isPrimitiveArray()) {
+      PrimType PT = D->getPrimType();
+      if (!needsAlloc(PT))
+        return;
+      assert(NumElems >= 1);
+      const Pointer EP = Ptr.atIndex(0);
+      bool AllSingleWord = true;
+      TYPE_SWITCH_ALLOC(PT, {
+        if (!EP.deref<T>().singleWord()) {
+          copyPrimitiveMemory<T>(S, EP);
+          AllSingleWord = false;
+        }
+      });
+      if (AllSingleWord)
+        return;
+      for (unsigned I = 1; I != D->getNumElems(); ++I) {
+        const Pointer EP = Ptr.atIndex(I);
+        copyPrimitiveMemory(S, EP, PT);
+      }
+    } else {
+      assert(D->isCompositeArray());
+      for (unsigned I = 0; I != D->getNumElems(); ++I) {
+        const Pointer EP = Ptr.atIndex(I).narrow();
+        finishGlobalRecurse(S, EP);
+      }
+    }
+  }
+}
+
+bool FinishInitGlobal(InterpState &S, CodePtr OpPC) {
+  const Pointer &Ptr = S.Stk.pop<Pointer>();
+
+  finishGlobalRecurse(S, Ptr);
+  if (Ptr.canBeInitialized()) {
+    Ptr.initialize();
+    Ptr.activate();
+  }
+
+  return true;
+}
+
 // https://github.com/llvm/llvm-project/issues/102513
 #if defined(_MSC_VER) && !defined(__clang__) && !defined(NDEBUG)
 #pragma optimize("", off)
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index ae3d4a441a79..66d3e6d79e8b 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -189,7 +189,7 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
 
   // C++11 [expr.shift]p1: Shift width must be less than the bit width of
   // the shifted type.
-  if (Bits > 1 && RHS >= RT::from(Bits, RHS.bitWidth())) {
+  if (Bits > 1 && RHS >= Bits) {
     const Expr *E = S.Current->getExpr(OpPC);
     const APSInt Val = RHS.toAPSInt();
     QualType Ty = E->getType();
@@ -370,6 +370,9 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS,
                      const T &RHS) {
   // Fast path - add the numbers with fixed width.
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!OpFW(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -408,6 +411,7 @@ bool Add(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
+
   return AddSubMulHelper<T, T::add, std::plus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -423,7 +427,7 @@ inline bool Addf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::add(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -434,6 +438,7 @@ bool Sub(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
+
   return AddSubMulHelper<T, T::sub, std::minus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -442,7 +447,7 @@ inline bool Subf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::sub(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -453,6 +458,7 @@ bool Mul(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() * 2;
+
   return AddSubMulHelper<T, T::mul, std::multiplies>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -461,8 +467,10 @@ inline bool Mulf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
+
   auto Status = Floating::mul(LHS, RHS, getRoundingMode(FPO), &Result);
+
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -484,9 +492,14 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexMul(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    Floating RA = S.allocFloat(A.getSemantics());
+    RA.copy(ResR);
+    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
     Result.atIndex(0).initialize();
-    Result.atIndex(1).deref<Floating>() = Floating(ResI);
+
+    Floating RI = S.allocFloat(A.getSemantics());
+    RI.copy(ResI);
+    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
     Result.atIndex(1).initialize();
     Result.initialize();
   } else {
@@ -539,10 +552,20 @@ inline bool Divc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexDiv(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    // Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    // Result.atIndex(0).initialize();
+    // Result.atIndex(1).deref<Floating>() = Floating(ResI);
+    // Result.atIndex(1).initialize();
+
+    Floating RA = S.allocFloat(A.getSemantics());
+    RA.copy(ResR);
+    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
     Result.atIndex(0).initialize();
-    Result.atIndex(1).deref<Floating>() = Floating(ResI);
-    Result.atIndex(1).initialize();
+
+    Floating RI = S.allocFloat(A.getSemantics());
+    RI.copy(ResI);
+    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
+
     Result.initialize();
   } else {
     // Integer element type.
@@ -608,9 +631,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitAnd(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitAnd(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -625,9 +651,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitOr(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitOr(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -644,7 +673,11 @@ bool BitXor(InterpState &S, CodePtr OpPC) {
   const T &LHS = S.Stk.pop<T>();
 
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitXor(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -659,12 +692,15 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Rem(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
-  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!T::rem(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -679,12 +715,15 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Div(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
-  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!T::div(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -707,8 +746,10 @@ inline bool Divf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
     return false;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::div(LHS, RHS, getRoundingMode(FPO), &Result);
+
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -730,31 +771,44 @@ inline bool Inv(InterpState &S, CodePtr OpPC) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Neg(InterpState &S, CodePtr OpPC) {
   const T &Value = S.Stk.pop<T>();
-  T Result;
 
-  if (!T::neg(Value, &Result)) {
+  if constexpr (std::is_same_v<T, Floating>) {
+    T Result = S.allocFloat(Value.getSemantics());
+
+    if (!T::neg(Value, &Result)) {
+      S.Stk.push<T>(Result);
+      return true;
+    }
+    return false;
+  } else {
+    T Result;
+    if constexpr (needsAlloc<T>())
+      Result = S.allocAP<T>(Value.bitWidth());
+
+    if (!T::neg(Value, &Result)) {
+      S.Stk.push<T>(Result);
+      return true;
+    }
+
+    assert(isIntegralType(Name) &&
+           "don't expect other types to fail at constexpr negation");
     S.Stk.push<T>(Result);
-    return true;
+
+    APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
+    if (S.checkingForUndefinedBehavior()) {
+      const Expr *E = S.Current->getExpr(OpPC);
+      QualType Type = E->getType();
+      SmallString<32> Trunc;
+      NegatedValue.trunc(Result.bitWidth())
+          .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
+                    /*UpperCase=*/true, /*InsertSeparators=*/true);
+      S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
+          << Trunc << Type << E->getSourceRange();
+      return true;
+    }
+
+    return handleOverflow(S, OpPC, NegatedValue);
   }
-
-  assert(isIntegralType(Name) &&
-         "don't expect other types to fail at constexpr negation");
-  S.Stk.push<T>(Result);
-
-  APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
-  if (S.checkingForUndefinedBehavior()) {
-    const Expr *E = S.Current->getExpr(OpPC);
-    QualType Type = E->getType();
-    SmallString<32> Trunc;
-    NegatedValue.trunc(Result.bitWidth())
-        .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
-                  /*UpperCase=*/true, /*InsertSeparators=*/true);
-    S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
-        << Trunc << Type << E->getSourceRange();
-    return true;
-  }
-
-  return handleOverflow(S, OpPC, NegatedValue);
 }
 
 enum class PushVal : bool {
@@ -783,6 +837,8 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 
   const T &Value = Ptr.deref<T>();
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Value.bitWidth());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<T>(Value);
@@ -890,7 +946,6 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
   if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
     return false;
-
   return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow);
 }
 
@@ -898,7 +953,7 @@ template <IncDecOp Op, PushVal DoPush>
 bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                        uint32_t FPOI) {
   Floating Value = Ptr.deref<Floating>();
-  Floating Result;
+  Floating Result = S.allocFloat(Value.getSemantics());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<Floating>(Value);
@@ -952,12 +1007,15 @@ inline bool DecfPop(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Comp(InterpState &S, CodePtr OpPC) {
   const T &Val = S.Stk.pop<T>();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Val.bitWidth());
+
   if (!T::comp(Val, &Result)) {
     S.Stk.push<T>(Result);
     return true;
   }
-
   return false;
 }
 
@@ -1325,10 +1383,23 @@ bool Flip(InterpState &S, CodePtr OpPC) {
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Const(InterpState &S, CodePtr OpPC, const T &Arg) {
+  if constexpr (needsAlloc<T>()) {
+    T Result = S.allocAP<T>(Arg.bitWidth());
+    Result.copy(Arg.toAPSInt());
+    S.Stk.push<T>(Result);
+    return true;
+  }
   S.Stk.push<T>(Arg);
   return true;
 }
 
+inline bool ConstFloat(InterpState &S, CodePtr OpPC, const Floating &F) {
+  Floating Result = S.allocFloat(F.getSemantics());
+  Result.copy(F.getAPFloat());
+  S.Stk.push<Floating>(Result);
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Get/Set Local/Param/Global/This
 //===----------------------------------------------------------------------===//
@@ -1483,7 +1554,24 @@ bool SetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool InitGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
   const Pointer &P = S.P.getGlobal(I);
+
   P.deref<T>() = S.Stk.pop<T>();
+
+  if constexpr (std::is_same_v<T, Floating>) {
+    auto &Val = P.deref<Floating>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+
+  } else if constexpr (needsAlloc<T>()) {
+    auto &Val = P.deref<T>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  }
+
   P.initialize();
   return true;
 }
@@ -1585,7 +1673,22 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) {
   assert(F->isBitField());
   const T &Value = S.Stk.pop<T>();
   const Pointer &Field = S.Stk.peek<Pointer>().atField(F->Offset);
-  Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
+
+  if constexpr (needsAlloc<T>()) {
+    T Result = S.allocAP<T>(Value.bitWidth());
+    if (T::isSigned())
+      Result.copy(Value.toAPSInt()
+                      .trunc(F->Decl->getBitWidthValue())
+                      .sextOrTrunc(Value.bitWidth()));
+    else
+      Result.copy(Value.toAPSInt()
+                      .trunc(F->Decl->getBitWidthValue())
+                      .zextOrTrunc(Value.bitWidth()));
+
+    Field.deref<T>() = Result;
+  } else {
+    Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
+  }
   Field.activate();
   Field.initialize();
   return true;
@@ -1765,6 +1868,8 @@ inline bool FinishInit(InterpState &S, CodePtr OpPC) {
   return true;
 }
 
+bool FinishInitGlobal(InterpState &S, CodePtr OpPC);
+
 inline bool Dump(InterpState &S, CodePtr OpPC) {
   S.Stk.dump();
   return true;
@@ -2271,7 +2376,8 @@ template <PrimType TIn, PrimType TOut> bool Cast(InterpState &S, CodePtr OpPC) {
 inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem,
                    llvm::RoundingMode RM) {
   Floating F = S.Stk.pop<Floating>();
-  Floating Result = F.toSemantics(Sem, RM);
+  Floating Result = S.allocFloat(*Sem);
+  F.toSemantics(Sem, RM, &Result);
   S.Stk.push<Floating>(Result);
   return true;
 }
@@ -2295,15 +2401,25 @@ inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) {
 /// to know what bitwidth the result should be.
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<false>>(
-      IntegralAP<false>::from(S.Stk.pop<T>(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  // Copy data.
+  {
+    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
+    Result.copy(Source);
+  }
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<true>>(
-      IntegralAP<true>::from(S.Stk.pop<T>(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  // Copy data.
+  {
+    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
+    Result.copy(Source);
+  }
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2312,11 +2428,11 @@ bool CastIntegralFloating(InterpState &S, CodePtr OpPC,
                           const llvm::fltSemantics *Sem, uint32_t FPOI) {
   const T &From = S.Stk.pop<T>();
   APSInt FromAP = From.toAPSInt();
-  Floating Result;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
+  Floating Result = S.allocFloat(*Sem);
   auto Status =
-      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), Result);
+      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
 
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -2365,7 +2481,12 @@ static inline bool CastFloatingIntegralAP(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
+
+  auto ResultAP = S.allocAP<IntegralAP<false>>(BitWidth);
+  ResultAP.copy(Result);
+
+  S.Stk.push<IntegralAP<false>>(ResultAP);
+
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2381,7 +2502,12 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
+
+  auto ResultAP = S.allocAP<IntegralAP<true>>(BitWidth);
+  ResultAP.copy(Result);
+
+  S.Stk.push<IntegralAP<true>>(ResultAP);
+
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2441,8 +2567,9 @@ static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC,
 static inline bool CastFixedPointFloating(InterpState &S, CodePtr OpPC,
                                           const llvm::fltSemantics *Sem) {
   const auto &Fixed = S.Stk.pop<FixedPoint>();
-
-  S.Stk.push<Floating>(Fixed.toFloat(Sem));
+  Floating Result = S.allocFloat(*Sem);
+  Result.copy(Fixed.toFloat(Sem));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -2506,12 +2633,18 @@ bool Zero(InterpState &S, CodePtr OpPC) {
 }
 
 static inline bool ZeroIntAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<false>>(IntegralAP<false>::zero(BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  if (!Result.singleWord())
+    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
 static inline bool ZeroIntAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>::zero(BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  if (!Result.singleWord())
+    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2578,7 +2711,9 @@ inline bool RVOPtr(InterpState &S, CodePtr OpPC) {
 //===----------------------------------------------------------------------===//
 
 template <class LT, class RT, ShiftDir Dir>
-inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
+inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
+                    LT *Result) {
+
   const unsigned Bits = LHS.bitWidth();
 
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
@@ -2596,7 +2731,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
     RHS = -RHS;
     return DoShift<LT, RT,
                    Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS);
+        S, OpPC, LHS, RHS, Result);
   }
 
   if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
@@ -2644,6 +2779,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
       // Do the shift on potentially signed LT, then convert to unsigned type.
       LT A;
       LT::shiftRight(LHS, LT::from(RHS, Bits), Bits, &A);
+      // LT::shiftRight(LHS, LT(RHSTemp), Bits, &A);
       R = LT::AsUnsigned::from(A);
     }
   }
@@ -2652,6 +2788,48 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
   return true;
 }
 
+/// A version of DoShift that works on IntegralAP.
+template <class LT, class RT, ShiftDir Dir>
+inline bool DoShiftAP(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
+                      LT *Result) {
+  const unsigned Bits = LHS.bitWidth();
+  const APSInt &LHSAP = LHS.toAPSInt();
+  APSInt RHSAP = RHS.toAPSInt();
+
+  // OpenCL 6.3j: shift values are effectively % word size of LHS.
+  if (S.getLangOpts().OpenCL)
+    RHSAP &= APSInt(llvm::APInt(RHSAP.getBitWidth(),
+                                static_cast<uint64_t>(LHSAP.getBitWidth() - 1)),
+                    RHSAP.isUnsigned());
+
+  if (RHS.isNegative()) {
+    // During constant-folding, a negative shift is an opposite shift. Such a
+    // shift is not a constant expression.
+    const SourceInfo &Loc = S.Current->getSource(OpPC);
+    S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
+    if (!S.noteUndefinedBehavior())
+      return false;
+    RHS = -RHS;
+    return DoShiftAP<LT, RT,
+                     Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
+        S, OpPC, LHS, RHS, Result);
+  }
+
+  if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
+    return false;
+
+  if constexpr (Dir == ShiftDir::Left) {
+    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
+    Result->copy(LHSAP << SA);
+  } else {
+    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
+    Result->copy(LHSAP >> SA);
+  }
+
+  S.Stk.push<LT>(*Result);
+  return true;
+}
+
 template <PrimType NameL, PrimType NameR>
 inline bool Shr(InterpState &S, CodePtr OpPC) {
   using LT = typename PrimConv<NameL>::T;
@@ -2659,7 +2837,13 @@ inline bool Shr(InterpState &S, CodePtr OpPC) {
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
 
-  return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS);
+  if constexpr (needsAlloc<LT>()) {
+    LT Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
+  } else {
+    LT Result;
+    return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
+  }
 }
 
 template <PrimType NameL, PrimType NameR>
@@ -2668,8 +2852,13 @@ inline bool Shl(InterpState &S, CodePtr OpPC) {
   using RT = typename PrimConv<NameR>::T;
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
-
-  return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS);
+  if constexpr (needsAlloc<LT>()) {
+    LT Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
+  } else {
+    LT Result;
+    return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
+  }
 }
 
 static inline bool ShiftFixedPoint(InterpState &S, CodePtr OpPC, bool Left) {
@@ -3252,7 +3441,15 @@ inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte,
 
     if constexpr (std::is_same_v<T, Floating>) {
       assert(Sem);
-      S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
+      Floating Result = S.allocFloat(*Sem);
+      Floating::bitcastFromMemory(Buff.data(), *Sem, &Result);
+      S.Stk.push<Floating>(Result);
+
+      // S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
+    } else if constexpr (needsAlloc<T>()) {
+      T Result = S.allocAP<T>(ResultBitWidth);
+      T::bitcastFromMemory(Buff.data(), ResultBitWidth, &Result);
+      S.Stk.push<T>(Result);
     } else {
       assert(!Sem);
       S.Stk.push<T>(T::bitcastFromMemory(Buff.data(), ResultBitWidth));
@@ -3310,7 +3507,11 @@ template <typename T> inline T ReadArg(InterpState &S, CodePtr &OpPC) {
 }
 
 template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
-  Floating F = Floating::deserialize(*OpPC);
+  auto &Semantics =
+      llvm::APFloatBase::EnumToSemantics(Floating::deserializeSemantics(*OpPC));
+
+  auto F = S.allocFloat(Semantics);
+  Floating::deserialize(*OpPC, &F);
   OpPC += align(F.bytesToSerialize());
   return F;
 }
@@ -3318,17 +3519,25 @@ template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
 template <>
 inline IntegralAP<false> ReadArg<IntegralAP<false>>(InterpState &S,
                                                     CodePtr &OpPC) {
-  IntegralAP<false> I = IntegralAP<false>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+  uint32_t BitWidth = IntegralAP<false>::deserializeSize(*OpPC);
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  assert(Result.bitWidth() == BitWidth);
+
+  IntegralAP<false>::deserialize(*OpPC, &Result);
+  OpPC += align(Result.bytesToSerialize());
+  return Result;
 }
 
 template <>
 inline IntegralAP<true> ReadArg<IntegralAP<true>>(InterpState &S,
                                                   CodePtr &OpPC) {
-  IntegralAP<true> I = IntegralAP<true>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+  uint32_t BitWidth = IntegralAP<true>::deserializeSize(*OpPC);
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  assert(Result.bitWidth() == BitWidth);
+
+  IntegralAP<true>::deserialize(*OpPC, &Result);
+  OpPC += align(Result.bytesToSerialize());
+  return Result;
 }
 
 template <>
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d01e3d042a8b..5304bd77f2c0 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -57,6 +57,21 @@ static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) {
   assert(T);
 
   unsigned BitWidth = S.getASTContext().getTypeSize(QT);
+
+  if (T == PT_IntAPS) {
+    auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+    Result.copy(Val);
+    S.Stk.push<IntegralAP<true>>(Result);
+    return;
+  }
+
+  if (T == PT_IntAP) {
+    auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+    Result.copy(Val);
+    S.Stk.push<IntegralAP<false>>(Result);
+    return;
+  }
+
   if (QT->isSignedIntegerOrEnumerationType()) {
     int64_t V = Val.getSExtValue();
     INT_TYPE_SWITCH(*T, { S.Stk.push<T>(T::from(V, BitWidth)); });
@@ -327,13 +342,13 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result;
+  Floating Result = S.allocFloat(TargetSemantics);
   if (S.getASTContext().getTargetInfo().isNan2008()) {
     if (Signaling)
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
   } else {
     // Prior to IEEE 754-2008, architectures were allowed to choose whether
@@ -342,10 +357,10 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
     // 2008 revisions, MIPS interpreted sNaN-2008 as qNan and qNaN-2008 as
     // sNaN. This is now known as "legacy NaN" encoding.
     if (Signaling)
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
   }
 
@@ -360,7 +375,9 @@ static bool interp__builtin_inf(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  S.Stk.push<Floating>(Floating::getInf(TargetSemantics));
+  Floating Result = S.allocFloat(TargetSemantics);
+  Result.copy(APFloat::getInf(TargetSemantics));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -368,10 +385,12 @@ static bool interp__builtin_copysign(InterpState &S, CodePtr OpPC,
                                      const InterpFrame *Frame) {
   const Floating &Arg2 = S.Stk.pop<Floating>();
   const Floating &Arg1 = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(Arg1.getSemantics());
 
   APFloat Copy = Arg1.getAPFloat();
   Copy.copySign(Arg2.getAPFloat());
-  S.Stk.push<Floating>(Floating(Copy));
+  Result.copy(Copy);
+  S.Stk.push<Floating>(Result);
 
   return true;
 }
@@ -380,11 +399,13 @@ static bool interp__builtin_fmin(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    S.Stk.push<Floating>(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    S.Stk.push<Floating>(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -392,11 +413,13 @@ static bool interp__builtin_fmax(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    S.Stk.push<Floating>(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    S.Stk.push<Floating>(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -571,8 +594,16 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC,
 static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame) {
   const Floating &Val = S.Stk.pop<Floating>();
+  APFloat F = Val.getAPFloat();
+  if (!F.isNegative()) {
+    S.Stk.push<Floating>(Val);
+    return true;
+  }
 
-  S.Stk.push<Floating>(Floating::abs(Val));
+  Floating Result = S.allocFloat(Val.getSemantics());
+  F.changeSign();
+  Result.copy(F);
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index 239b3104e89f..2569cac018b3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -402,7 +402,9 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC,
           if (llvm::sys::IsBigEndianHost)
             swapBytes(M.get(), NumBits.roundToBytes());
 
-          P.deref<Floating>() = Floating::bitcastFromMemory(M.get(), Semantics);
+          Floating R = S.allocFloat(Semantics);
+          Floating::bitcastFromMemory(M.get(), Semantics, &R);
+          P.deref<Floating>() = R;
           P.initialize();
           return true;
         }
diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h
index e8dc6f0483d6..08765561985e 100644
--- a/clang/lib/AST/ByteCode/InterpState.h
+++ b/clang/lib/AST/ByteCode/InterpState.h
@@ -15,6 +15,7 @@
 
 #include "Context.h"
 #include "DynamicAllocator.h"
+#include "Floating.h"
 #include "Function.h"
 #include "InterpFrame.h"
 #include "InterpStack.h"
@@ -126,6 +127,33 @@ public:
 
   StdAllocatorCaller getStdAllocatorCaller(StringRef Name) const;
 
+  void *allocate(size_t Size, unsigned Align = 8) const {
+    return Allocator.Allocate(Size, Align);
+  }
+  template <typename T> T *allocate(size_t Num = 1) const {
+    return static_cast<T *>(allocate(Num * sizeof(T), alignof(T)));
+  }
+
+  template <typename T> T allocAP(unsigned BitWidth) {
+    unsigned NumWords = APInt::getNumWords(BitWidth);
+    if (NumWords == 1)
+      return T(BitWidth);
+    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
+    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
+    return T(Mem, BitWidth);
+  }
+
+  Floating allocFloat(const llvm::fltSemantics &Sem) {
+    if (Floating::singleWord(Sem))
+      return Floating(llvm::APFloatBase::SemanticsToEnum(Sem));
+
+    unsigned NumWords =
+        APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem));
+    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
+    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
+    return Floating(Mem, llvm::APFloatBase::SemanticsToEnum(Sem));
+  }
+
 private:
   friend class EvaluationResult;
   friend class InterpStateCCOverride;
@@ -161,6 +189,8 @@ public:
   llvm::SmallVector<
       std::pair<const Expr *, const LifetimeExtendedTemporaryDecl *>>
       SeenGlobalTemporaries;
+
+  mutable llvm::BumpPtrAllocator Allocator;
 };
 
 class InterpStateCCOverride final {
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index c76ac5f8ae86..57e01f7bd9da 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -48,6 +48,7 @@ def ArgUint64 : ArgType { let Name = "uint64_t"; }
 def ArgIntAP : ArgType { let Name = "IntegralAP<false>"; let AsRef = true; }
 def ArgIntAPS : ArgType { let Name = "IntegralAP<true>"; let AsRef = true; }
 def ArgFloat : ArgType { let Name = "Floating"; let AsRef = true; }
+
 def ArgBool : ArgType { let Name = "bool"; }
 def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
 
@@ -88,6 +89,9 @@ def IntegerAndFixedTypeClass : TypeClass {
                Uint32, Sint64, Uint64, IntAP, IntAPS, FixedPoint];
 }
 
+def IntegralTypeClass : TypeClass {
+  let Types = !listconcat(IntegerTypeClass.Types, [Bool]);
+}
 def FixedSizeIntegralTypeClass : TypeClass {
   let Types = [Sint8, Uint8, Sint16, Uint16, Sint32,
                Uint32, Sint64, Uint64, Bool];
@@ -265,12 +269,13 @@ def ConstSint32 : ConstOpcode<Sint32, ArgSint32>;
 def ConstUint32 : ConstOpcode<Uint32, ArgUint32>;
 def ConstSint64 : ConstOpcode<Sint64, ArgSint64>;
 def ConstUint64 : ConstOpcode<Uint64, ArgUint64>;
-def ConstFloat : ConstOpcode<Float, ArgFloat>;
-def constIntAP : ConstOpcode<IntAP, ArgIntAP>;
-def constIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
+def ConstIntAP : ConstOpcode<IntAP, ArgIntAP>;
+def ConstIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
 def ConstBool : ConstOpcode<Bool, ArgBool>;
 def ConstFixedPoint : ConstOpcode<FixedPoint, ArgFixedPoint>;
 
+def ConstFloat : Opcode { let Args = [ArgFloat]; }
+
 // [] -> [Integer]
 def Zero : Opcode {
   let Types = [FixedSizeIntegralTypeClass];
@@ -328,6 +333,7 @@ def GetMemberPtrBasePop : Opcode {
 
 def FinishInitPop : Opcode;
 def FinishInit    : Opcode;
+def FinishInitGlobal : Opcode;
 
 def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool, ArgTypePtr]; }
 
@@ -389,7 +395,7 @@ class AccessOpcode : Opcode {
 }
 
 class BitFieldOpcode : Opcode {
-  let Types = [AluTypeClass];
+  let Types = [IntegralTypeClass];
   let Args = [ArgRecordField];
   let HasGroup = 1;
 }
diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
index 6152fbfbe3a7..a156cccbb3c1 100644
--- a/clang/lib/AST/ByteCode/PrimType.h
+++ b/clang/lib/AST/ByteCode/PrimType.h
@@ -76,6 +76,13 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
 }
 
 constexpr bool isIntegralType(PrimType T) { return T <= PT_FixedPoint; }
+template <typename T> constexpr bool needsAlloc() {
+  return std::is_same_v<T, IntegralAP<false>> ||
+         std::is_same_v<T, IntegralAP<true>> || std::is_same_v<T, Floating>;
+}
+constexpr bool needsAlloc(PrimType T) {
+  return T == PT_IntAP || T == PT_IntAPS || T == PT_Float;
+}
 
 /// Mapping from primitive types to their representation.
 template <PrimType T> struct PrimConv;
@@ -209,6 +216,16 @@ static inline bool aligned(const void *P) {
     }                                                                          \
   } while (0)
 
+#define TYPE_SWITCH_ALLOC(Expr, B)                                             \
+  do {                                                                         \
+    switch (Expr) {                                                            \
+      TYPE_SWITCH_CASE(PT_Float, B)                                            \
+      TYPE_SWITCH_CASE(PT_IntAP, B)                                            \
+      TYPE_SWITCH_CASE(PT_IntAPS, B)                                           \
+    default:;                                                                  \
+    }                                                                          \
+  } while (0)
+
 #define COMPOSITE_TYPE_SWITCH(Expr, B, D)                                      \
   do {                                                                         \
     switch (Expr) {                                                            \
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
index 23ba1bbd193b..5d9c42244749 100644
--- a/clang/lib/AST/ByteCode/Program.h
+++ b/clang/lib/AST/ByteCode/Program.h
@@ -132,6 +132,14 @@ public:
                                bool IsMutable = false, bool IsVolatile = false,
                                const Expr *Init = nullptr);
 
+  void *Allocate(size_t Size, unsigned Align = 8) const {
+    return Allocator.Allocate(Size, Align);
+  }
+  template <typename T> T *Allocate(size_t Num = 1) const {
+    return static_cast<T *>(Allocate(Num * sizeof(T), alignof(T)));
+  }
+  void Deallocate(void *Ptr) const {}
+
   /// Context to manage declaration lifetimes.
   class DeclScope {
   public:
@@ -204,7 +212,7 @@ private:
   };
 
   /// Allocator for globals.
-  PoolAllocTy Allocator;
+  mutable PoolAllocTy Allocator;
 
   /// Global objects.
   std::vector<Global *> Globals;
@@ -238,4 +246,18 @@ public:
 } // namespace interp
 } // namespace clang
 
+inline void *operator new(size_t Bytes, const clang::interp::Program &C,
+                          size_t Alignment = 8) {
+  return C.Allocate(Bytes, Alignment);
+}
+
+inline void operator delete(void *Ptr, const clang::interp::Program &C,
+                            size_t) {
+  C.Deallocate(Ptr);
+}
+inline void *operator new[](size_t Bytes, const clang::interp::Program &C,
+                            size_t Alignment = 8) {
+  return C.Allocate(Bytes, Alignment);
+}
+
 #endif
diff --git a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
index 710612bef8fd..1013a771d13b 100644
--- a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
+++ b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
@@ -21,6 +21,9 @@ template <class To, class From>
 constexpr To bit_cast(const From &from) {
   static_assert(sizeof(To) == sizeof(From));
   return __builtin_bit_cast(To, from);
+#if __x86_64
+  // both-note@-2 {{indeterminate value can only initialize an object of type}}
+#endif
 }
 
 template <class Intermediate, class Init>
@@ -38,11 +41,8 @@ constexpr Init round_trip(const Init &init) {
 
 namespace test_long_double {
 #if __x86_64
-/// FIXME: We could enable this, but since it aborts, it causes the usual mempory leak.
-#if 0
-constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // expected-error{{must be initialized by a constant expression}}\
-                                                                                 // expected-note{{in call}}
-#endif
+constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // both-error{{must be initialized by a constant expression}}\
+                                                                                 // both-note{{in call}}
 constexpr long double ld = 3.1425926539;
 
 struct bytes {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 21dca15a4577..174c1ffa79a4 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -208,7 +208,7 @@ namespace nan {
 
   constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}}
   constexpr float NaN4 = __builtin_nanf("");
-  //constexpr long double NaN5 = __builtin_nanf128("");
+  constexpr long double NaN5 = __builtin_nanf128("");
 
   /// FIXME: This should be accepted by the current interpreter as well.
   constexpr char f[] = {'0', 'x', 'A', 'E', '\0'};
@@ -655,8 +655,6 @@ void test_noexcept(int *i) {
 } // end namespace test_launder
 
 
-/// FIXME: The commented out tests here use a IntAP value and fail.
-/// This currently means we will leak the IntAP value since nothing cleans it up.
 namespace clz {
   char clz1[__builtin_clz(1) == BITSIZE(int) - 1 ? 1 : -1];
   char clz2[__builtin_clz(7) == BITSIZE(int) - 3 ? 1 : -1];
@@ -709,7 +707,7 @@ namespace clz {
   char clz48[__builtin_clzg(1ULL << (BITSIZE(long long) - 1)) == 0 ? 1 : -1];
   char clz49[__builtin_clzg(1ULL << (BITSIZE(long long) - 1), 42) == 0 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  // int clz50 = __builtin_clzg((unsigned __int128)0);
+  int clz50 = __builtin_clzg((unsigned __int128)0);
   char clz51[__builtin_clzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char clz52[__builtin_clzg((unsigned __int128)0x1) == BITSIZE(__int128) - 1 ? 1 : -1];
   char clz53[__builtin_clzg((unsigned __int128)0x1, 42) == BITSIZE(__int128) - 1 ? 1 : -1];
@@ -717,7 +715,7 @@ namespace clz {
   char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  // int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
+  int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
   char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
   char clz61[__builtin_clzg((unsigned _BitInt(128))0x1, 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
@@ -775,7 +773,7 @@ namespace ctz {
   char ctz46[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1)) == BITSIZE(long long) - 1 ? 1 : -1];
   char ctz47[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1), 42) == BITSIZE(long long) - 1 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  // int ctz48 = __builtin_ctzg((unsigned __int128)0);
+  int ctz48 = __builtin_ctzg((unsigned __int128)0);
   char ctz49[__builtin_ctzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char ctz50[__builtin_ctzg((unsigned __int128)0x1) == 0 ? 1 : -1];
   char ctz51[__builtin_ctzg((unsigned __int128)0x1, 42) == 0 ? 1 : -1];
@@ -785,7 +783,7 @@ namespace ctz {
   char ctz55[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1), 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  // int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
+  int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
   char ctz57[__builtin_ctzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char ctz58[__builtin_ctzg((unsigned _BitInt(128))0x1) == 0 ? 1 : -1];
   char ctz59[__builtin_ctzg((unsigned _BitInt(128))0x1, 42) == 0 ? 1 : -1];

From 770b16cd4939054e30f5bba40765cfe1f5895f74 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Fri, 20 Jun 2025 09:07:08 -0700
Subject: [PATCH 1072/1322] [BOLT][test] Update X86/perf2bolt-spe.test
 (#145061)

Address NFC mismatches caused by running perf2bolt from under the
wrapper script:
https://lab.llvm.org/buildbot/#/builders/92/builds/20938
> <stdin>:2:64: note: possible intended match here
>
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/llvm-bolt.old:
-spe is available only on AArch64.

Test Plan:
ninja check-bolt
---
 bolt/test/perf2bolt/X86/perf2bolt-spe.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bolt/test/perf2bolt/X86/perf2bolt-spe.test b/bolt/test/perf2bolt/X86/perf2bolt-spe.test
index 8eed2c859509..101bd3789a18 100644
--- a/bolt/test/perf2bolt/X86/perf2bolt-spe.test
+++ b/bolt/test/perf2bolt/X86/perf2bolt-spe.test
@@ -6,4 +6,4 @@ RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.e
 RUN: touch %t.empty.perf.data
 RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --spe --pa %t.exe 2>&1 | FileCheck %s
 
-CHECK: perf2bolt: -spe is available only on AArch64.
+CHECK: perf2bolt{{.*}} -spe is available only on AArch64.

From 72de0e45846bab293a0b07e5aeda65a5e6590a87 Mon Sep 17 00:00:00 2001
From: Chenguang Wang <w3cing@gmail.com>
Date: Fri, 20 Jun 2025 09:07:35 -0700
Subject: [PATCH 1073/1322] [TableGen][Docs] Fix empty list syntax in TableGen
 doc. (#145041)

`[]<list<int>>` actually produces `list<list<int>>`.
---
 llvm/docs/TableGen/ProgRef.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst
index fe0564d2990d..7b30698ce4fd 100644
--- a/llvm/docs/TableGen/ProgRef.rst
+++ b/llvm/docs/TableGen/ProgRef.rst
@@ -1915,7 +1915,7 @@ and non-0 as true.
     ``list<int>``. *start* is ``0`` and *step* is ``1`` by default. *step* can
     be negative and cannot be 0. If *start* ``<`` *end* and *step* is negative,
     or *start* ``>`` *end* and *step* is positive, the result is an empty list
-    ``[]<list<int>>``.
+    ``[]<int>``.
 
     For example:
 

From 6ba1955ba22c0bea4d2fb76c6dfcad0f5354be72 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Fri, 20 Jun 2025 10:45:38 -0500
Subject: [PATCH 1074/1322] [flang][OpenMP] Fix ignore-target-data.f90 test

Allow the function definition line to match with and without attrbute
set number.

This fixes build break after PR144534:
https://lab.llvm.org/buildbot/#/builders/157/builds/31331

Also move the test to the OpenMP subdirectory where it should have
been from the beginning.
---
 flang/test/Lower/{ => OpenMP}/ignore-target-data.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename flang/test/Lower/{ => OpenMP}/ignore-target-data.f90 (98%)

diff --git a/flang/test/Lower/ignore-target-data.f90 b/flang/test/Lower/OpenMP/ignore-target-data.f90
similarity index 98%
rename from flang/test/Lower/ignore-target-data.f90
rename to flang/test/Lower/OpenMP/ignore-target-data.f90
index 08947c137090..f1a150d5dfab 100644
--- a/flang/test/Lower/ignore-target-data.f90
+++ b/flang/test/Lower/OpenMP/ignore-target-data.f90
@@ -6,7 +6,7 @@
 !NORT-NOT: call{{.*}}__tgt_target_data_end_mapper
 
 !Make sure we generate the body
-!LLVM: define internal void @_QFPf(ptr %[[A0:[0-9]+]], ptr %[[A1:[0-9]+]]) {
+!LLVM: define internal void @_QFPf(ptr %[[A0:[0-9]+]], ptr %[[A1:[0-9]+]])
 !LLVM:   %[[V0:[0-9]+]] = load i32, ptr %[[A0]], align 4
 !LLVM:   %[[V1:[0-9]+]] = load i32, ptr %[[A1]], align 4
 !LLVM:   %[[V2:[0-9]+]] = add i32 %[[V0]], %[[V1]]

From bfef8732be1b7b3a7ba7b3ccd93d043fd044293e Mon Sep 17 00:00:00 2001
From: Justin King <jcking@google.com>
Date: Fri, 20 Jun 2025 09:16:40 -0700
Subject: [PATCH 1075/1322] msan: Support free_sized and free_aligned_sized
 from C23 (#144529)

Adds support to MSan for `free_sized` and `free_aligned_sized` from C23.

Other sanitizers will be handled with their own separate PRs.

For https://github.com/llvm/llvm-project/issues/144435

Signed-off-by: Justin King <jcking@google.com>
---
 compiler-rt/lib/msan/msan_interceptors.cpp    | 32 +++++++++++++++++++
 .../TestCases/Linux/free_aligned_sized.c      |  2 +-
 .../TestCases/Linux/free_sized.c              |  2 +-
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp
index f94d3cb79aa0..bc28434d1d49 100644
--- a/compiler-rt/lib/msan/msan_interceptors.cpp
+++ b/compiler-rt/lib/msan/msan_interceptors.cpp
@@ -34,6 +34,7 @@
 #include "sanitizer_common/sanitizer_glibc_version.h"
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_linux.h"
+#include "sanitizer_common/sanitizer_platform_interceptors.h"
 #include "sanitizer_common/sanitizer_platform_limits_netbsd.h"
 #include "sanitizer_common/sanitizer_platform_limits_posix.h"
 #include "sanitizer_common/sanitizer_stackdepot.h"
@@ -215,6 +216,35 @@ INTERCEPTOR(void, free, void *ptr) {
   MsanDeallocate(&stack, ptr);
 }
 
+#if SANITIZER_INTERCEPT_FREE_SIZED
+INTERCEPTOR(void, free_sized, void *ptr, uptr size) {
+  if (UNLIKELY(!ptr))
+    return;
+  if (DlsymAlloc::PointerIsMine(ptr))
+    return DlsymAlloc::Free(ptr);
+  GET_MALLOC_STACK_TRACE;
+  MsanDeallocate(&stack, ptr);
+}
+#  define MSAN_MAYBE_INTERCEPT_FREE_SIZED INTERCEPT_FUNCTION(free_sized)
+#else
+#  define MSAN_MAYBE_INTERCEPT_FREE_SIZED
+#endif
+
+#if SANITIZER_INTERCEPT_FREE_ALIGNED_SIZED
+INTERCEPTOR(void, free_aligned_sized, void *ptr, uptr alignment, uptr size) {
+  if (UNLIKELY(!ptr))
+    return;
+  if (DlsymAlloc::PointerIsMine(ptr))
+    return DlsymAlloc::Free(ptr);
+  GET_MALLOC_STACK_TRACE;
+  MsanDeallocate(&stack, ptr);
+}
+#  define MSAN_MAYBE_INTERCEPT_FREE_ALIGNED_SIZED \
+    INTERCEPT_FUNCTION(free_aligned_sized)
+#else
+#  define MSAN_MAYBE_INTERCEPT_FREE_ALIGNED_SIZED
+#endif
+
 #if !SANITIZER_FREEBSD && !SANITIZER_NETBSD
 INTERCEPTOR(void, cfree, void *ptr) {
   if (UNLIKELY(!ptr))
@@ -1775,6 +1805,8 @@ void InitializeInterceptors() {
   INTERCEPT_FUNCTION(realloc);
   INTERCEPT_FUNCTION(reallocarray);
   INTERCEPT_FUNCTION(free);
+  MSAN_MAYBE_INTERCEPT_FREE_SIZED;
+  MSAN_MAYBE_INTERCEPT_FREE_ALIGNED_SIZED;
   MSAN_MAYBE_INTERCEPT_CFREE;
   MSAN_MAYBE_INTERCEPT_MALLOC_USABLE_SIZE;
   MSAN_MAYBE_INTERCEPT_MALLINFO;
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c
index e9cb6f20c5ea..7710c6236819 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_aligned_sized.c
@@ -1,5 +1,5 @@
 // RUN: %clang -std=c23 -O0 %s -o %t && %run %t
-// UNSUPPORTED: asan, hwasan, rtsan, tsan, msan, ubsan
+// UNSUPPORTED: asan, hwasan, rtsan, tsan, ubsan
 
 #include <stddef.h>
 #include <stdlib.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c
index 8cdf3216e528..9eac562fecb0 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/free_sized.c
@@ -1,5 +1,5 @@
 // RUN: %clang -std=c23 -O0 %s -o %t && %run %t
-// UNSUPPORTED: asan, hwasan, rtsan, tsan, msan, ubsan
+// UNSUPPORTED: asan, hwasan, rtsan, tsan, ubsan
 
 #include <stddef.h>
 #include <stdlib.h>

From 877511920dcf36463e06746d626e8876583a6abd Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Fri, 20 Jun 2025 17:20:58 +0100
Subject: [PATCH 1076/1322] Revert "[lldb][DWARF] Remove object_pointer from
 ParsedDWARFAttributes" (#145065)

Reverts llvm/llvm-project#144880

Caused `TestObjCIvarsInBlocks.py` to fail on macOS CI.
---
 .../SymbolFile/DWARF/DWARFASTParserClang.cpp  | 28 +++++++++++++------
 .../SymbolFile/DWARF/DWARFASTParserClang.h    |  7 ++---
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 3bec89cdf746..4f79c8aa3f81 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -445,6 +445,15 @@ ParsedDWARFTypeAttributes::ParsedDWARFTypeAttributes(const DWARFDIE &die) {
       name.SetCString(form_value.AsCString());
       break;
 
+    case DW_AT_object_pointer:
+      // GetAttributes follows DW_AT_specification.
+      // DW_TAG_subprogram definitions and declarations may both
+      // have a DW_AT_object_pointer. Don't overwrite the one
+      // we parsed for the definition with the one from the declaration.
+      if (!object_pointer.IsValid())
+        object_pointer = form_value.Reference();
+      break;
+
     case DW_AT_signature:
       signature = form_value;
       break;
@@ -1107,7 +1116,7 @@ bool DWARFASTParserClang::ParseObjCMethod(
 std::pair<bool, TypeSP> DWARFASTParserClang::ParseCXXMethod(
     const DWARFDIE &die, CompilerType clang_type,
     const ParsedDWARFTypeAttributes &attrs, const DWARFDIE &decl_ctx_die,
-    const DWARFDIE &object_parameter, bool &ignore_containing_context) {
+    bool is_static, bool &ignore_containing_context) {
   Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups);
   SymbolFileDWARF *dwarf = die.GetDWARF();
   assert(dwarf);
@@ -1191,9 +1200,6 @@ std::pair<bool, TypeSP> DWARFASTParserClang::ParseCXXMethod(
       TypeSystemClang::GetDeclContextForType(class_opaque_type), die,
       attrs.name.GetCString());
 
-  // In DWARF, a C++ method is static if it has no object parameter child.
-  const bool is_static = !object_parameter.IsValid();
-
   // We have a C++ member function with no children (this pointer!) and clang
   // will get mad if we try and make a function that isn't well formed in the
   // DWARF, so we will just skip it...
@@ -1219,7 +1225,9 @@ std::pair<bool, TypeSP> DWARFASTParserClang::ParseCXXMethod(
     ClangASTMetadata metadata;
     metadata.SetUserID(die.GetID());
 
-    if (char const *object_pointer_name = object_parameter.GetName()) {
+    char const *object_pointer_name =
+        attrs.object_pointer ? attrs.object_pointer.GetName() : nullptr;
+    if (object_pointer_name) {
       metadata.SetObjectPtrName(object_pointer_name);
       LLDB_LOGF(log, "Setting object pointer name: %s on method object %p.\n",
                 object_pointer_name, static_cast<void *>(cxx_method_decl));
@@ -1315,9 +1323,11 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
         type_handled =
             ParseObjCMethod(*objc_method, die, clang_type, attrs, is_variadic);
       } else if (is_cxx_method) {
+        // In DWARF, a C++ method is static if it has no object parameter child.
+        const bool is_static = !object_parameter.IsValid();
         auto [handled, type_sp] =
-            ParseCXXMethod(die, clang_type, attrs, decl_ctx_die,
-                           object_parameter, ignore_containing_context);
+            ParseCXXMethod(die, clang_type, attrs, decl_ctx_die, is_static,
+                           ignore_containing_context);
         if (type_sp)
           return type_sp;
 
@@ -1412,7 +1422,9 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
           ClangASTMetadata metadata;
           metadata.SetUserID(die.GetID());
 
-          if (char const *object_pointer_name = object_parameter.GetName()) {
+          char const *object_pointer_name =
+              attrs.object_pointer ? attrs.object_pointer.GetName() : nullptr;
+          if (object_pointer_name) {
             metadata.SetObjectPtrName(object_pointer_name);
             LLDB_LOGF(log,
                       "Setting object pointer name: %s on function "
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index a90f55bcff94..111604ce4068 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -470,8 +470,7 @@ private:
   /// \param[in] decl_ctx_die The DIE representing the DeclContext of the C++
   ///                         method being parsed.
   ///
-  /// \param[in] object_parameter The DIE of this subprogram's object parameter.
-  ///                             May be an invalid DIE for C++ static methods.
+  /// \param[in] is_static Is true iff we're parsing a static method.
   ///
   /// \param[out] ignore_containing_context Will get set to true if the caller
   ///             should treat this C++ method as-if it was not a C++ method.
@@ -486,8 +485,7 @@ private:
                  lldb_private::CompilerType clang_type,
                  const ParsedDWARFTypeAttributes &attrs,
                  const lldb_private::plugin::dwarf::DWARFDIE &decl_ctx_die,
-                 const lldb_private::plugin::dwarf::DWARFDIE &object_parameter,
-                 bool &ignore_containing_context);
+                 bool is_static, bool &ignore_containing_context);
 
   lldb::TypeSP ParseArrayType(const lldb_private::plugin::dwarf::DWARFDIE &die,
                               const ParsedDWARFTypeAttributes &attrs);
@@ -557,6 +555,7 @@ struct ParsedDWARFTypeAttributes {
   const char *mangled_name = nullptr;
   lldb_private::ConstString name;
   lldb_private::Declaration decl;
+  lldb_private::plugin::dwarf::DWARFDIE object_pointer;
   lldb_private::plugin::dwarf::DWARFFormValue abstract_origin;
   lldb_private::plugin::dwarf::DWARFFormValue containing_type;
   lldb_private::plugin::dwarf::DWARFFormValue signature;

From 940ff110d78ba1eea7e0004275e098024cfd87c4 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Fri, 20 Jun 2025 12:22:59 -0400
Subject: [PATCH 1077/1322] [InstCombine] fix hwasan mistake in "remove dead
 loads" (#145057)

Detected by CI after #143958.
---
 llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index bcc73090277a..ce4202926135 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3490,6 +3490,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
           replaceInstUsesWith(*I, Result);
           eraseInstFromFunction(*I);
           Users[i] = nullptr; // Skip examining in the next loop.
+          continue;
         }
         if (auto *MTI = dyn_cast<MemTransferInst>(I)) {
           if (KnowInitZero && isRefSet(*Removable)) {

From 379a609dadc1733c2b62d2bf3bab6e8032236836 Mon Sep 17 00:00:00 2001
From: Muzammil <55665739+Muzammiluddin-Syed-ECE@users.noreply.github.com>
Date: Fri, 20 Jun 2025 12:27:35 -0400
Subject: [PATCH 1078/1322] [mlir][arith][transforms] Adds f4E2M1FN support to
 truncf and extf (#144157)

See work detail: https://github.com/iree-org/iree/issues/20920

Add support for f4E2M1FN in `arith.truncf` and `arith.extf` ops though a software emulation

---------

Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
---
 .../mlir/Dialect/Arith/Transforms/Passes.h    |   3 +
 .../mlir/Dialect/Arith/Transforms/Passes.td   |   2 +
 .../Dialect/Arith/Transforms/ExpandOps.cpp    | 258 +++++++++++++++++-
 mlir/test/Dialect/Arith/expand-ops.mlir       |  42 ++-
 .../CPU/test-arith-expand-truncf-extf.mlir    |  73 +++++
 5 files changed, 370 insertions(+), 8 deletions(-)
 create mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir

diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
index e0a4567d6f40..b03cf2db7804 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
@@ -59,6 +59,9 @@ void populateCeilFloorDivExpandOpsPatterns(RewritePatternSet &patterns);
 /// Add patterns to expand Arith bf16 patterns to lower level bitcasts/shifts.
 void populateExpandBFloat16Patterns(RewritePatternSet &patterns);
 
+/// Add patterns to expand Arith f4e2m1 patterns to lower level bitcasts/shifts.
+void populateExpandF4E2M1Patterns(RewritePatternSet &patterns);
+
 /// Add patterns to expand Arith f8e8m0 patterns to lower level bitcasts/shifts.
 void populateExpandF8E8M0Patterns(RewritePatternSet &patterns);
 
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
index e14b2aeee1c6..c7370b83fdb6 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
@@ -19,6 +19,8 @@ def ArithExpandOpsPass : Pass<"arith-expand"> {
               "Enable the BF16 expansion patterns">,
        Option<"includeF8E8M0", "include-f8e8m0", "bool", /*default=*/"false",
               "Enable the F8E8M0 expansion patterns">,
+       Option<"includeF4E2M1", "include-f4e2m1", "bool", /*default=*/"false",
+              "Enable the F4E2M1 expansion patterns">,
   ];
 }
 
diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
index 534aff9562b7..dfa01844737c 100644
--- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
@@ -11,8 +11,11 @@
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/Location.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "llvm/ADT/SmallVectorExtras.h"
+#include <cstdint>
 
 namespace mlir {
 namespace arith {
@@ -34,6 +37,18 @@ static Value createConst(Location loc, Type type, int value,
   return rewriter.create<arith::ConstantOp>(loc, attr);
 }
 
+/// Create a float constant.
+static Value createFloatConst(Location loc, Type type, APFloat value,
+                              PatternRewriter &rewriter) {
+  auto attr = rewriter.getFloatAttr(getElementTypeOrSelf(type), value);
+  if (auto shapedTy = dyn_cast<ShapedType>(type)) {
+    return rewriter.create<arith::ConstantOp>(
+        loc, DenseElementsAttr::get(shapedTy, attr));
+  }
+
+  return rewriter.create<arith::ConstantOp>(loc, attr);
+}
+
 /// Creates shapedType using shape from cloneFrom and base type from cloneTo
 static Type cloneToShapedType(Type cloneFrom, Type cloneTo) {
   if (auto shapedTy = dyn_cast<ShapedType>(cloneFrom)) {
@@ -322,6 +337,100 @@ struct BFloat16TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
   }
 };
 
+/// In this implementation of extf we take advantage of some key patterns we
+/// notice between the binary representation of an F4E2M1 value and its
+/// corresponding value in F32.
+///
+/// Note: x is sign bit
+/// | Binary | F4E2M1 | f32[23:32]
+/// | x000   | 0.0    | x000 0000 00
+/// | x001   | 0.5    | x011 1111 00
+/// | x010   | 1.0    | x011 1111 10
+/// | x011   | 1.5    | x011 1111 11
+/// | x100   | 2.0    | x010 0000 00
+/// | x101   | 3.0    | x010 0000 01
+/// | x110   | 4.0    | x010 0000 10
+/// | x111   | 6.0    | x010 0000 11
+///
+/// 1) There are only two versions of bits [25:31] in the f32 result
+///     F4E2M1 bits[2:3] decide whether:
+///       - F32 bits[25:31] = 0011 1111
+///       - F32 bits[25:31] = 0010 0000
+///     Exception is zero where
+///       - F32 bits[25:31] = 0000 0000
+///
+/// 2) F4E2M1 bits[1:2] = F32 bits[23:24]
+///     Exception is 0.5 where
+///       - F4E2M1 bits[1:2] = 01, F32 bits[23:24] = 00
+///
+/// 3) F4E2M1 bits[4] = F32 bits[32] (sign bits are equal)
+///
+/// 4) F32 bits[1:22] = 0
+struct F4E2M1ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(arith::ExtFOp op,
+                                PatternRewriter &rewriter) const final {
+    Location loc = op.getLoc();
+    ImplicitLocOpBuilder b(loc, rewriter);
+    Value operand = op.getOperand();
+    Type operandTy = operand.getType();
+    Type resultTy = op.getType();
+    Type operandETy = getElementTypeOrSelf(operandTy);
+    Type resultETy = getElementTypeOrSelf(resultTy);
+
+    if (!isa<Float4E2M1FNType>(operandETy))
+      return rewriter.notifyMatchFailure(op, "not a ext of F4E2M1FN");
+
+    Type f32Ty = cloneToShapedType(operandTy, b.getF32Type());
+    Type i4Ty = cloneToShapedType(operandTy, b.getI4Type());
+    Type i32Ty = cloneToShapedType(operandTy, b.getI32Type());
+    Value i4Bits = b.create<arith::BitcastOp>(i4Ty, operand);
+
+    Value c0x0 = createConst(loc, i4Ty, 0x0, rewriter);
+    Value c0x1 = createConst(loc, i4Ty, 0x1, rewriter);
+    Value c0x2 = createConst(loc, i4Ty, 0x2, rewriter);
+    Value c0x4 = createConst(loc, i4Ty, 0x4, rewriter);
+
+    // Set last Exponent bit and Mantissa.
+    Value c0x00000014 = createConst(loc, i32Ty, 0x14, rewriter);
+    Value bits1To24 = b.create<arith::ShLIOp>(i4Bits, c0x2);
+    Value isHalf =
+        b.create<arith::CmpIOp>(arith::CmpIPredicate::eq, i4Bits, c0x1);
+    bits1To24 = b.create<arith::SelectOp>(isHalf, c0x0, bits1To24);
+    bits1To24 = b.create<arith::ExtUIOp>(i32Ty, bits1To24);
+    bits1To24 = b.create<arith::ShLIOp>(bits1To24, c0x00000014);
+
+    // Set first 7 bits of Exponent.
+    Value zeroExpBits = createConst(loc, i32Ty, 0x00000000, rewriter);
+    Value highExpBits = createConst(loc, i32Ty, 0x40000000, rewriter);
+    Value lowExpBits = createConst(loc, i32Ty, 0x3f000000, rewriter);
+    Value useLargerExp =
+        b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, i4Bits, c0x4);
+    Value bits25To31 =
+        b.create<arith::SelectOp>(useLargerExp, highExpBits, lowExpBits);
+    Value zeroExp =
+        b.create<arith::CmpIOp>(arith::CmpIPredicate::eq, i4Bits, c0x0);
+    bits25To31 = b.create<arith::SelectOp>(zeroExp, zeroExpBits, bits25To31);
+
+    // Set sign.
+    Value c0x80000000 = createConst(loc, i32Ty, 0x80000000, rewriter);
+    Value c0x8 = createConst(loc, i4Ty, 0x8, rewriter);
+    Value negative =
+        b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, i4Bits, c0x8);
+    Value bit32 = b.create<arith::SelectOp>(negative, c0x80000000, zeroExpBits);
+
+    // Add segments together.
+    Value bits1To31 = b.create<arith::AddIOp>(bits1To24, bits25To31);
+    Value bits1To32 = b.create<arith::AddIOp>(bits1To31, bit32);
+    Value result = b.create<arith::BitcastOp>(f32Ty, bits1To32);
+    if (!isa<Float32Type>(resultETy))
+      result = b.create<arith::TruncFOp>(resultTy, result);
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+
 struct F8E8M0ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(arith::ExtFOp op,
@@ -366,6 +475,130 @@ struct F8E8M0ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
   }
 };
 
+/// Conversion from F32 to F4E2M1 according to the OCP Spec:
+/// www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+///
+/// The spec requiers us to perform Round to Nearest, Ties to Even.
+///
+/// This means that after rounding, we should break ties by choosing the option
+/// which results in a mantissa of 0 in the least significant digit.
+///
+/// Table of representable values in F4E2M1:
+///
+/// Note: x is sign bit
+/// | Binary | F4E2M1 | F32[23:32]
+/// | x000   | 0.0    | x000 0000 00
+/// | x001   | 0.5    | x011 1111 00
+/// | x010   | 1.0    | x011 1111 10
+/// | x011   | 1.5    | x011 1111 11
+/// | x100   | 2.0    | x010 0000 00
+/// | x101   | 3.0    | x010 0000 01
+/// | x110   | 4.0    | x010 0000 10
+/// | x111   | 6.0    | x010 0000 11
+///
+/// Conversion procedure:
+///   Step 1: Clamp to representable bounds.
+///   Step 2: Convert exponent by adjusting bias.
+///   Step 3: Set mantissa to first bit.
+///   Step 4: Special consideration for subnormal and zero exponent.
+///   Step 5: Round up if necessary, if mantissa[1:] greater than 1000000 or
+///   subnormal.
+struct F4E2M1TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(arith::TruncFOp op,
+                                PatternRewriter &rewriter) const final {
+    Location loc = op.getLoc();
+    ImplicitLocOpBuilder b(loc, rewriter);
+    Value operand = op.getOperand();
+    Type operandTy = operand.getType();
+    Type resultTy = op.getType();
+    Type operandETy = getElementTypeOrSelf(operandTy);
+    Type resultETy = getElementTypeOrSelf(resultTy);
+
+    Type i4Ty = cloneToShapedType(operandTy, b.getI4Type());
+    Type i8Ty = cloneToShapedType(operandTy, b.getI8Type());
+    Type i32Ty = cloneToShapedType(operandTy, b.getI32Type());
+    Type f32Ty = cloneToShapedType(operandTy, b.getF32Type());
+
+    if (!isa<Float32Type>(operandETy))
+      operand = b.create<arith::ExtFOp>(f32Ty, operand);
+    if (!isa<Float4E2M1FNType>(resultETy))
+      return rewriter.notifyMatchFailure(op, "not a trunc of F4E2M1FN");
+
+    Value c0x1 = createConst(loc, i4Ty, 1, rewriter);
+    Value c0x3 = createConst(loc, i4Ty, 3, rewriter);
+    Value c0x00000016 = createConst(loc, i32Ty, 22, rewriter);
+    Value c0x00 = createConst(loc, i8Ty, 0x00, rewriter);
+    Value c0xff = createConst(loc, i8Ty, 0xff, rewriter);
+    Value zeroExpBits = createConst(loc, i32Ty, 0, rewriter);
+
+    // Step 0: Clamp to bounds.
+    Value cHigherBound = createFloatConst(loc, f32Ty, APFloat(6.0f), rewriter);
+    Value cLowerBound = createFloatConst(loc, f32Ty, APFloat(-6.0f), rewriter);
+    Value operandClamped = b.create<arith::MinNumFOp>(cHigherBound, operand);
+    operandClamped = b.create<arith::MaxNumFOp>(cLowerBound, operandClamped);
+    Value f32Bits = b.create<arith::BitcastOp>(i32Ty, operandClamped);
+
+    // Step 1: Set sign bit.
+    Value cF32ExpManWidth = createConst(loc, i32Ty, 31, rewriter); // 23
+    Value f32Sign = b.create<arith::ShRUIOp>(f32Bits, cF32ExpManWidth);
+    Value f4Sign = b.create<arith::TruncIOp>(i4Ty, f32Sign);
+    Value f4Bits = b.create<arith::ShLIOp>(f4Sign, c0x3);
+
+    // Step 2: Convert exponent by adjusting bias.
+    Value biasAdjustment = createConst(loc, i32Ty, 0x7e, rewriter);
+    Value cF4MantissaWidth = c0x1;                                   // 1
+    Value cF32MantissaWidth = createConst(loc, i32Ty, 23, rewriter); // 23
+    Value f32SignExp = b.create<arith::ShRUIOp>(f32Bits, cF32MantissaWidth);
+    Value biasAdjustedSignExp =
+        b.create<arith::SubIOp>(f32SignExp, biasAdjustment);
+    Value f4Exp = b.create<arith::TruncIOp>(i4Ty, biasAdjustedSignExp);
+    f4Exp = b.create<arith::ShLIOp>(f4Exp, cF4MantissaWidth);
+    f4Bits = b.create<arith::AddIOp>(f4Bits, f4Exp);
+
+    // Step 3: Set mantissa to first bit.
+    Value cF32FirstBitMask = createConst(loc, i32Ty, 0x400000, rewriter);
+    Value man1Bit = b.create<arith::AndIOp>(f32Bits, cF32FirstBitMask);
+    man1Bit = b.create<arith::ShRUIOp>(man1Bit, c0x00000016);
+    Value f4Man = b.create<arith::TruncIOp>(i4Ty, man1Bit);
+    f4Bits = b.create<arith::AddIOp>(f4Bits, f4Man);
+
+    // Step 4: Special consideration for conversion to 0.5.
+    Value cF32MantissaMask = createConst(loc, i32Ty, 0x7fffff, rewriter);
+    Value f8Exp = b.create<arith::TruncIOp>(i8Ty, biasAdjustedSignExp);
+    Value isSubnormal =
+        b.create<arith::CmpIOp>(arith::CmpIPredicate::sle, f8Exp, c0x00);
+    Value isNegOneExp =
+        b.create<arith::CmpIOp>(arith::CmpIPredicate::eq, f8Exp, c0xff);
+    Value man23Bits = b.create<arith::AndIOp>(f32Bits, cF32MantissaMask);
+    Value isNonZeroMan = b.create<arith::CmpIOp>(arith::CmpIPredicate::ugt,
+                                                 man23Bits, zeroExpBits);
+    Value roundToHalf = b.create<arith::AndIOp>(isNegOneExp, isNonZeroMan);
+    Value isZeroExp =
+        b.create<arith::CmpIOp>(arith::CmpIPredicate::eq, f8Exp, c0x00);
+    Value subnormalF4Bits = createConst(loc, i4Ty, 0xf, rewriter);
+    Value halfF4Bits = createConst(loc, i4Ty, 0x0, rewriter);
+    Value subResult =
+        b.create<arith::SelectOp>(isSubnormal, subnormalF4Bits, f4Bits);
+    subResult = b.create<arith::SelectOp>(roundToHalf, halfF4Bits, subResult);
+    f4Bits = b.create<arith::SelectOp>(isZeroExp, f4Bits, subResult);
+
+    // Step 5: Round up if necessary.
+    Value cF32Last22BitMask = createConst(loc, i32Ty, 0x3fffff, rewriter);
+    Value cRound = createConst(loc, i32Ty, 0x200000, rewriter); // 010 0000...
+    Value man22Bits = b.create<arith::AndIOp>(f32Bits, cF32Last22BitMask);
+    Value shouldRound =
+        b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, man22Bits, cRound);
+    shouldRound = b.create<arith::OrIOp>(shouldRound, isSubnormal);
+    Value roundedF4Bits = b.create<arith::AddIOp>(f4Bits, c0x1);
+    f4Bits = b.create<arith::SelectOp>(shouldRound, roundedF4Bits, f4Bits);
+
+    Value result = b.create<arith::BitcastOp>(resultTy, f4Bits);
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+
 /*
 TruncF to F8E8M0 is expected to extract exponent bits out of F32 type
 Since All kinds of Infs and NaNs are mapped to same exponent bits in F32 type,
@@ -498,6 +731,8 @@ struct ArithExpandOpsPass
     arith::populateArithExpandOpsPatterns(patterns);
 
     target.addLegalDialect<arith::ArithDialect>();
+    target.addLegalDialect<vector::VectorDialect>();
+
     // clang-format off
     target.addIllegalOp<
       arith::CeilDivSIOp,
@@ -515,22 +750,24 @@ struct ArithExpandOpsPass
       arith::ScalingTruncFOp
     >();
 
-    if (includeBf16) {
+    if (includeBf16)
       arith::populateExpandBFloat16Patterns(patterns);
-    }
-    if (includeF8E8M0) {
+    if (includeF8E8M0)
       arith::populateExpandF8E8M0Patterns(patterns);
-    }
+    if (includeF4E2M1)
+      arith::populateExpandF4E2M1Patterns(patterns);
 
     target.addDynamicallyLegalOp<arith::ExtFOp>(
       [=](arith::ExtFOp op) {
         Type inETy = getElementTypeOrSelf(op.getOperand().getType());
         Type outETy = getElementTypeOrSelf(op.getType());
         bool legalTypes = true;
-        if (includeBf16) 
+        if (includeBf16)
           legalTypes &= !(inETy.isBF16() && outETy.isF32());
         if (includeF8E8M0)
           legalTypes &= !llvm::isa<Float8E8M0FNUType>(inETy);
+        if (includeF4E2M1)
+          legalTypes &= !llvm::isa<Float4E2M1FNType>(inETy);
         return legalTypes;
       });
 
@@ -539,10 +776,12 @@ struct ArithExpandOpsPass
         Type inETy = getElementTypeOrSelf(op.getOperand().getType());
         Type outETy = getElementTypeOrSelf(op.getType());
         bool legalTypes = true;
-        if (includeBf16) 
+        if (includeBf16)
           legalTypes &= !(inETy.isF32() && outETy.isBF16());
-        if (includeF8E8M0) 
+        if (includeF8E8M0)
           legalTypes &= !(llvm::isa<Float8E8M0FNUType>(outETy)); 
+        if (includeF4E2M1)
+          legalTypes &= !llvm::isa<Float4E2M1FNType>(outETy);
         return legalTypes;
       });
 
@@ -567,6 +806,11 @@ void mlir::arith::populateExpandBFloat16Patterns(RewritePatternSet &patterns) {
       patterns.getContext());
 }
 
+void mlir::arith::populateExpandF4E2M1Patterns(RewritePatternSet &patterns) {
+  patterns.add<F4E2M1ExtFOpConverter, F4E2M1TruncFOpConverter>(
+      patterns.getContext());
+}
+
 void mlir::arith::populateExpandF8E8M0Patterns(RewritePatternSet &patterns) {
   patterns.add<F8E8M0ExtFOpConverter, F8E8M0TruncFOpConverter>(
       patterns.getContext());
diff --git a/mlir/test/Dialect/Arith/expand-ops.mlir b/mlir/test/Dialect/Arith/expand-ops.mlir
index db1349feaff3..8f9b0feba442 100644
--- a/mlir/test/Dialect/Arith/expand-ops.mlir
+++ b/mlir/test/Dialect/Arith/expand-ops.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -arith-expand="include-bf16=true include-f8e8m0=true" -verify-diagnostics -split-input-file | FileCheck %s 
+// RUN: mlir-opt %s -arith-expand="include-bf16=true include-f8e8m0=true include-f4e2m1=true" -verify-diagnostics -split-input-file | FileCheck %s
 // RUN: mlir-opt %s -arith-expand -split-input-file -verify-diagnostics | FileCheck %s --check-prefix=SCHECK
 
 // Test ceil divide with signed integer
@@ -593,3 +593,43 @@ func.func @minui(%a: i32, %b: i32) -> i32 {
 // CHECK-NEXT: %[[CMP:.*]] = arith.cmpi ult, %[[LHS]], %[[RHS]] : i32
 // CHECK-NEXT: %[[RESULT:.*]] = arith.select %[[CMP]], %[[LHS]], %[[RHS]] : i32
 // CHECK-NEXT: return %[[RESULT]] : i32
+
+// -----
+
+func.func @truncf_f32_to_f4E2M1FN(%arg0 : f32) -> f4E2M1FN {
+    %0 = arith.truncf %arg0 : f32 to f4E2M1FN
+    return %0 : f4E2M1FN
+}
+
+// CHECK-LABEL: @truncf_f32_to_f4E2M1FN
+// CHECK-NOT: arith.truncf
+
+// -----
+
+func.func @truncf_vector_f32_to_f4E2M1FN(%arg0 : vector<4xf32>) -> vector<4xf4E2M1FN> {
+    %0 = arith.truncf %arg0 : vector<4xf32> to vector<4xf4E2M1FN>
+    return %0 : vector<4xf4E2M1FN>
+}
+
+// CHECK-LABEL: @truncf_vector_f32_to_f4E2M1FN
+// CHECK-NOT: arith.truncf
+
+// -----
+
+func.func @extf_f4E2M1FN_to_f32(%arg0 : f4E2M1FN) -> f32 {
+    %0 = arith.extf %arg0 : f4E2M1FN to f32
+    return %0 : f32
+}
+
+// CHECK-LABEL: @extf_f4E2M1FN_to_f32
+// CHECK-NOT: arith.extf
+
+// -----
+
+func.func @extf_vector_f4E2M1FN_to_f32(%arg0 : vector<4xf4E2M1FN>) -> vector<4xf32> {
+    %0 = arith.extf %arg0 : vector<4xf4E2M1FN> to vector<4xf32>
+    return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: @extf_vector_f4E2M1FN_to_f32
+// CHECK-NOT: arith.extf
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir
new file mode 100644
index 000000000000..9c310d80d4c2
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir
@@ -0,0 +1,73 @@
+// Check various edge cases for truncf/extf ops involving f32 and f4e2m1 types.
+
+// RUN: mlir-opt %s --convert-func-to-llvm \
+// RUN:             --arith-expand="include-f4e2m1=true" \
+// RUN:             --convert-arith-to-llvm --convert-vector-to-llvm \
+// RUN:             --reconcile-unrealized-casts | \
+// RUN:   mlir-runner -e entry --entry-point-result=void \
+// RUN:               --shared-libs=%mlir_c_runner_utils | \
+// RUN:   FileCheck %s --match-full-lines
+
+func.func @check_extf(%in : f4E2M1FN) -> () {
+  %res = arith.extf %in : f4E2M1FN to f32
+  vector.print %res : f32
+  return
+}
+
+// See https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+// for details on F4E2M1 representation 
+func.func @check_truncf(%in : f32) -> () {
+  %trunc = arith.truncf %in : f32 to f4E2M1FN
+  %bitcast = arith.bitcast %trunc : f4E2M1FN to i4
+  %res = arith.extui %bitcast : i4 to i64
+  vector.print %res : i64
+  return
+}
+
+func.func @entry() {
+  %zero = arith.constant 0.0 : f32
+  %half = arith.constant 0.5 : f32
+  %one = arith.constant 1.0 : f32
+  %max = arith.constant 6.0 : f32
+  %min = arith.constant -6.0 : f32
+  %lowerThanMin = arith.constant -1000000.0 : f32
+  %higherThanMax = arith.constant 1000000.0 : f32
+  %mustRound = arith.constant -3.14 : f32
+  %nan = arith.constant 0x7f80000 : f32
+
+  // CHECK: 0
+  func.call @check_truncf(%zero) : (f32) -> ()
+  // CHECK: 1
+  func.call @check_truncf(%half) : (f32) -> ()
+  // CHECK: 2
+  func.call @check_truncf(%one) : (f32) -> ()
+  // CHECK: 7
+  func.call @check_truncf(%max) : (f32) -> ()
+  // CHECK: 15
+  func.call @check_truncf(%min) : (f32) -> ()
+  // CHECK: 7
+  func.call @check_truncf(%higherThanMax) : (f32) -> ()
+  // CHECK: 15
+  func.call @check_truncf(%lowerThanMin) : (f32) -> ()
+  // CHECK: 13
+  func.call @check_truncf(%mustRound) : (f32) -> ()
+  // CHECK: 0
+  func.call @check_truncf(%nan) : (f32) -> ()
+
+  // CHECK: 0
+  %zeroF4 = arith.truncf %zero : f32 to f4E2M1FN
+  func.call @check_extf(%zeroF4) : (f4E2M1FN) -> ()
+  // CHECK: 0.5
+  %halfF4 = arith.truncf %half : f32 to f4E2M1FN
+  func.call @check_extf(%halfF4) : (f4E2M1FN) -> ()
+  // CHECK: 6
+  %higherThanMaxF4 = arith.truncf %higherThanMax : f32 to f4E2M1FN
+  func.call @check_extf(%higherThanMaxF4) : (f4E2M1FN) -> ()
+  // CHECK: -6
+  %lowerThanMinF4 = arith.truncf %lowerThanMin : f32 to f4E2M1FN
+  func.call @check_extf(%lowerThanMinF4) : (f4E2M1FN) -> ()
+  // CHECK: -3
+  %mustRoundF4 = arith.truncf %mustRound : f32 to f4E2M1FN
+  func.call @check_extf(%mustRoundF4) : (f4E2M1FN) -> ()
+  return
+}

From edbaf19c46c678e080d5a43e719bc19a02b44ca9 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Fri, 20 Jun 2025 12:29:32 -0400
Subject: [PATCH 1079/1322] [AMDGPU] Fix a potential integer overflow in
 GCNRegPressure when true16 is enabled (#144968)

Fixes SWDEV-537014.
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     | 28 +++++++++--
 ...n-reg-pressure-true16-integer-overflow.mir | 48 +++++++++++++++++++
 2 files changed, 72 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/gcn-reg-pressure-true16-integer-overflow.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index ce213b91b1f7..5724ce9cc5d1 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -44,16 +44,19 @@ void GCNRegPressure::inc(unsigned Reg,
                          LaneBitmask PrevMask,
                          LaneBitmask NewMask,
                          const MachineRegisterInfo &MRI) {
-  if (SIRegisterInfo::getNumCoveredRegs(NewMask) ==
-      SIRegisterInfo::getNumCoveredRegs(PrevMask))
+  unsigned NewNumCoveredRegs = SIRegisterInfo::getNumCoveredRegs(NewMask);
+  unsigned PrevNumCoveredRegs = SIRegisterInfo::getNumCoveredRegs(PrevMask);
+  if (NewNumCoveredRegs == PrevNumCoveredRegs)
     return;
 
   int Sign = 1;
   if (NewMask < PrevMask) {
     std::swap(NewMask, PrevMask);
+    std::swap(NewNumCoveredRegs, PrevNumCoveredRegs);
     Sign = -1;
   }
-  assert(PrevMask < NewMask && "prev mask should always be lesser than new");
+  assert(PrevMask < NewMask && PrevNumCoveredRegs < NewNumCoveredRegs &&
+         "prev mask should always be lesser than new");
 
   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
   const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
@@ -66,7 +69,24 @@ void GCNRegPressure::inc(unsigned Reg,
       Value[TupleIdx] += Sign * TRI->getRegClassWeight(RC).RegWeight;
     }
     // Pressure scales with number of new registers covered by the new mask.
-    Sign *= SIRegisterInfo::getNumCoveredRegs(~PrevMask & NewMask);
+    // Note when true16 is enabled, we can no longer safely use the following
+    // approach to calculate the difference in the number of 32-bit registers
+    // between two masks:
+    //
+    // Sign *= SIRegisterInfo::getNumCoveredRegs(~PrevMask & NewMask);
+    //
+    // The issue is that the mask calculation `~PrevMask & NewMask` doesn't
+    // properly account for partial usage of a 32-bit register when dealing with
+    // 16-bit registers.
+    //
+    // Consider this example:
+    // Assume PrevMask = 0b0010 and NewMask = 0b1111. Here, the correct register
+    // usage difference should be 1, because even though PrevMask uses only half
+    // of a 32-bit register, it should still be counted as a full register use.
+    // However, the mask calculation yields `~PrevMask & NewMask = 0b1101`, and
+    // calling `getNumCoveredRegs` returns 2 instead of 1. This incorrect
+    // calculation can lead to integer overflow when Sign = -1.
+    Sign *= NewNumCoveredRegs - PrevNumCoveredRegs;
   }
   Value[RegKind] += Sign;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/gcn-reg-pressure-true16-integer-overflow.mir b/llvm/test/CodeGen/AMDGPU/gcn-reg-pressure-true16-integer-overflow.mir
new file mode 100644
index 000000000000..7f0654746e13
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gcn-reg-pressure-true16-integer-overflow.mir
@@ -0,0 +1,48 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -x mir -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -run-pass=machine-scheduler %s -o - | FileCheck %s
+
+---
+name:            foo
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%0' }
+body:             |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+
+    ; CHECK-LABEL: name: foo
+    ; CHECK: liveins: $sgpr4_sgpr5
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:sgpr_128 = COPY [[S_MOV_B32_]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]]
+    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[COPY1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8)
+    ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+    ; CHECK-NEXT: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64_e64 24, [[BUFFER_LOAD_DWORDX2_OFFSET]], implicit $exec
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].lo16:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].lo16
+    ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY2]], implicit $exec
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+    ; CHECK-NEXT: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 0, 8, 8, [[V_LSHLREV_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_PK_LSHLREV_B16_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    ; CHECK-NEXT: S_WAITCNT 0
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:sgpr_64(p4) = COPY killed $sgpr4_sgpr5
+    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed %0(p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+    %2:sreg_32 = S_MOV_B32 0
+    undef %3.sub0:sgpr_128 = COPY %2
+    %3.sub1:sgpr_128 = COPY %2
+    %3.sub2:sgpr_128 = COPY %2
+    %3.sub3:sgpr_128 = COPY killed %2
+    %4:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET killed %3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8)
+    %5:vreg_64 = V_LSHRREV_B64_e64 24, killed %4, implicit $exec
+    undef %6.lo16:vgpr_32 = COPY killed %5.lo16
+    %7:vgpr_32 = V_LSHLREV_B32_e64 16, killed %6, implicit $exec
+    %8:vgpr_32 = V_PK_LSHLREV_B16 0, 8, 8, killed %7, 0, 0, 0, 0, 0, implicit $exec
+    %9:vreg_64 = COPY killed %1
+    FLAT_STORE_DWORD killed %9, killed %8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    S_WAITCNT 0
+    S_ENDPGM 0
+...

From 945ce1aa3d29e24c49720ae9e0bcfbac88f2defd Mon Sep 17 00:00:00 2001
From: Hristo Hristov <hghristov.rmm@gmail.com>
Date: Fri, 20 Jun 2025 19:36:46 +0300
Subject: [PATCH 1080/1322] [libc++] Update the value of
 __cpp_lib_constrained_equality after P3379R0 (#144553)

https://wg21.link/P3379R0 updated the value of __cpp_lib_constrained_equality,
but we forgot to update it when we implemented the paper.
---
 libcxx/include/version                        |  6 ++--
 .../expected.version.compile.pass.cpp         | 33 +++++++++++++++++++
 .../optional.version.compile.pass.cpp         |  4 +--
 .../tuple.version.compile.pass.cpp            |  4 +--
 .../utility.version.compile.pass.cpp          |  4 +--
 .../variant.version.compile.pass.cpp          |  4 +--
 .../version.version.compile.pass.cpp          |  4 +--
 .../generate_feature_test_macro_components.py |  7 ++--
 8 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/libcxx/include/version b/libcxx/include/version
index 7154cab92335..f430c7edff2b 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -83,8 +83,8 @@ __cpp_lib_constexpr_tuple                               201811L <tuple>
 __cpp_lib_constexpr_typeinfo                            202106L <typeinfo>
 __cpp_lib_constexpr_utility                             201811L <utility>
 __cpp_lib_constexpr_vector                              201907L <vector>
-__cpp_lib_constrained_equality                          202403L <optional> <tuple> <utility>
-                                                                <variant>
+__cpp_lib_constrained_equality                          202411L <expected> <optional> <tuple>
+                                                                <utility> <variant>
 __cpp_lib_containers_ranges                             202202L <deque> <forward_list> <list>
                                                                 <map> <queue> <set>
                                                                 <stack> <string> <unordered_map>
@@ -551,7 +551,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 #   define __cpp_lib_constexpr_new                      202406L
 # endif
 # define __cpp_lib_constexpr_queue                      202502L
-// # define __cpp_lib_constrained_equality                 202403L
+// # define __cpp_lib_constrained_equality                 202411L
 // # define __cpp_lib_copyable_function                    202306L
 // # define __cpp_lib_debugging                            202311L
 // # define __cpp_lib_default_template_type_for_algorithm_values 202403L
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp
index d58f726f66e2..9c7a84f145dd 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp
@@ -20,6 +20,10 @@
 
 #if TEST_STD_VER < 14
 
+#  ifdef __cpp_lib_constrained_equality
+#    error "__cpp_lib_constrained_equality should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_expected
 #    error "__cpp_lib_expected should not be defined before c++23"
 #  endif
@@ -30,6 +34,10 @@
 
 #elif TEST_STD_VER == 14
 
+#  ifdef __cpp_lib_constrained_equality
+#    error "__cpp_lib_constrained_equality should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_expected
 #    error "__cpp_lib_expected should not be defined before c++23"
 #  endif
@@ -40,6 +48,10 @@
 
 #elif TEST_STD_VER == 17
 
+#  ifdef __cpp_lib_constrained_equality
+#    error "__cpp_lib_constrained_equality should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_expected
 #    error "__cpp_lib_expected should not be defined before c++23"
 #  endif
@@ -50,6 +62,10 @@
 
 #elif TEST_STD_VER == 20
 
+#  ifdef __cpp_lib_constrained_equality
+#    error "__cpp_lib_constrained_equality should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_expected
 #    error "__cpp_lib_expected should not be defined before c++23"
 #  endif
@@ -60,6 +76,10 @@
 
 #elif TEST_STD_VER == 23
 
+#  ifdef __cpp_lib_constrained_equality
+#    error "__cpp_lib_constrained_equality should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_expected
 #    error "__cpp_lib_expected should be defined in c++23"
 #  endif
@@ -73,6 +93,19 @@
 
 #elif TEST_STD_VER > 23
 
+#  if !defined(_LIBCPP_VERSION)
+#    ifndef __cpp_lib_constrained_equality
+#      error "__cpp_lib_constrained_equality should be defined in c++26"
+#    endif
+#    if __cpp_lib_constrained_equality != 202411L
+#      error "__cpp_lib_constrained_equality should have the value 202411L in c++26"
+#    endif
+#  else
+#    ifdef __cpp_lib_constrained_equality
+#      error "__cpp_lib_constrained_equality should not be defined because it is unimplemented in libc++!"
+#    endif
+#  endif
+
 #  ifndef __cpp_lib_expected
 #    error "__cpp_lib_expected should be defined in c++26"
 #  endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp
index f8ff69f61877..32685972d601 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp
@@ -123,8 +123,8 @@
 #    ifndef __cpp_lib_constrained_equality
 #      error "__cpp_lib_constrained_equality should be defined in c++26"
 #    endif
-#    if __cpp_lib_constrained_equality != 202403L
-#      error "__cpp_lib_constrained_equality should have the value 202403L in c++26"
+#    if __cpp_lib_constrained_equality != 202411L
+#      error "__cpp_lib_constrained_equality should have the value 202411L in c++26"
 #    endif
 #  else
 #    ifdef __cpp_lib_constrained_equality
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp
index f4a2fd8b29f4..b583edfc43ad 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp
@@ -274,8 +274,8 @@
 #    ifndef __cpp_lib_constrained_equality
 #      error "__cpp_lib_constrained_equality should be defined in c++26"
 #    endif
-#    if __cpp_lib_constrained_equality != 202403L
-#      error "__cpp_lib_constrained_equality should have the value 202403L in c++26"
+#    if __cpp_lib_constrained_equality != 202411L
+#      error "__cpp_lib_constrained_equality should have the value 202411L in c++26"
 #    endif
 #  else
 #    ifdef __cpp_lib_constrained_equality
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp
index e0a83c7813b2..7dd347857633 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp
@@ -405,8 +405,8 @@
 #    ifndef __cpp_lib_constrained_equality
 #      error "__cpp_lib_constrained_equality should be defined in c++26"
 #    endif
-#    if __cpp_lib_constrained_equality != 202403L
-#      error "__cpp_lib_constrained_equality should have the value 202403L in c++26"
+#    if __cpp_lib_constrained_equality != 202411L
+#      error "__cpp_lib_constrained_equality should have the value 202411L in c++26"
 #    endif
 #  else
 #    ifdef __cpp_lib_constrained_equality
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp
index dc4af4d09f9e..4a7b9f7431a8 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp
@@ -103,8 +103,8 @@
 #    ifndef __cpp_lib_constrained_equality
 #      error "__cpp_lib_constrained_equality should be defined in c++26"
 #    endif
-#    if __cpp_lib_constrained_equality != 202403L
-#      error "__cpp_lib_constrained_equality should have the value 202403L in c++26"
+#    if __cpp_lib_constrained_equality != 202411L
+#      error "__cpp_lib_constrained_equality should have the value 202411L in c++26"
 #    endif
 #  else
 #    ifdef __cpp_lib_constrained_equality
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index e4fe9f994e2e..cef334f70c07 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -6575,8 +6575,8 @@
 #    ifndef __cpp_lib_constrained_equality
 #      error "__cpp_lib_constrained_equality should be defined in c++26"
 #    endif
-#    if __cpp_lib_constrained_equality != 202403L
-#      error "__cpp_lib_constrained_equality should have the value 202403L in c++26"
+#    if __cpp_lib_constrained_equality != 202411L
+#      error "__cpp_lib_constrained_equality should have the value 202411L in c++26"
 #    endif
 #  else
 #    ifdef __cpp_lib_constrained_equality
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index de06b9dd1bee..849a96b9585b 100644
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -431,8 +431,11 @@ feature_test_macros = [
         },
         {
             "name": "__cpp_lib_constrained_equality",
-            "values": {"c++26": 202403}, # P2944R3: Comparisons for reference_wrapper
-            "headers": ["optional", "tuple", "utility", "variant"],
+            "values": {
+                # "c++26": 202403,  # P2944R3: Comparisons for reference_wrapper
+                "c++26": 202411,  # P3379R0: Constrain std::expected equality operators
+            },
+            "headers": ["expected", "optional", "tuple", "utility", "variant"],
             "unimplemented": True,
         },
         {

From 6ddb3a69c15963b7cf1cbb0323dd3e467b71cefc Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Fri, 20 Jun 2025 17:54:44 +0100
Subject: [PATCH 1081/1322] [AMDGPU] Add another test showing unwanted VALU
 codegen (#145062)

---
 .../AMDGPU/pseudo-scalar-transcendental.ll    | 53 +++++++++++++------
 1 file changed, 37 insertions(+), 16 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index a2e7f2e62f5d..d957aaa8a48a 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -410,19 +410,40 @@ define amdgpu_cs half @srcmods_neg_f16(half inreg %src) {
   ret half %result
 }
 
-declare half @llvm.exp2.f16(half)
-declare float @llvm.exp2.f32(float)
-declare half @llvm.amdgcn.exp2.f16(half)
-declare float @llvm.amdgcn.exp2.f32(float)
-declare half @llvm.log2.f16(half)
-declare float @llvm.log2.f32(float)
-declare half @llvm.amdgcn.log.f16(half)
-declare float @llvm.amdgcn.log.f32(float)
-declare half @llvm.amdgcn.rcp.f16(half)
-declare float @llvm.amdgcn.rcp.f32(float)
-declare half @llvm.sqrt.f16(half)
-declare float @llvm.sqrt.f32(float)
-declare half @llvm.amdgcn.sqrt.f16(half)
-declare float @llvm.amdgcn.sqrt.f32(float)
-declare half @llvm.fabs.f16(half)
-declare float @llvm.fabs.f32(float)
+; TODO: SelectionDAG should avoid generating v_rcp_iflag_f32.
+define amdgpu_cs float @fdiv_f32_i32(float inreg %a, i32 inreg %b) {
+; GFX12-SDAG-LABEL: fdiv_f32_i32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_cvt_f32_u32 s1, s1
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-SDAG-NEXT:    v_rcp_iflag_f32_e32 v0, s1
+; GFX12-SDAG-NEXT:    v_mul_f32_e32 v0, s0, v0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: fdiv_f32_i32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_cvt_f32_u32 s1, s1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_s_rcp_f32 s1, s1
+; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
+  %uint = uitofp i32 %b to float
+  %result = fdiv afn float %a, %uint
+  ret float %result
+}
+
+define amdgpu_cs half @fdiv_f16_i16(half inreg %a, i16 inreg %b) {
+; GFX12-LABEL: fdiv_f16_i16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_cvt_f16_u16_e32 v0, s1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX12-NEXT:    v_mul_f16_e32 v0, s0, v0
+; GFX12-NEXT:    ; return to shader part epilog
+  %uint = uitofp i16 %b to half
+  %result = fdiv afn half %a, %uint
+  ret half %result
+}

From 7157f33c6c88143acb8bc02bc26a0d6786136419 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Fri, 20 Jun 2025 09:58:59 -0700
Subject: [PATCH 1082/1322] [libc++] Disable a std::unexpected test in modules
 build (#144466)

This patch disables unexpected_disabled_cpp17.verify.cpp under clang
modules builds because it changes diagnostics criteria post #143423,
causing the test to fail.

This patch follows a similar style to 853059a15011fd8b57dd0.
This was found when working on trying to land #144033.
---
 .../unexpected_disabled_cpp17.verify.cpp                     | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libcxx/test/libcxx/depr/exception.unexpected/unexpected_disabled_cpp17.verify.cpp b/libcxx/test/libcxx/depr/exception.unexpected/unexpected_disabled_cpp17.verify.cpp
index 299cb123fbda..b92f2baa02e4 100644
--- a/libcxx/test/libcxx/depr/exception.unexpected/unexpected_disabled_cpp17.verify.cpp
+++ b/libcxx/test/libcxx/depr/exception.unexpected/unexpected_disabled_cpp17.verify.cpp
@@ -8,6 +8,11 @@
 
 // UNSUPPORTED: c++03, c++11, c++14
 
+// When built with modules, this test gives diagnostics like declaration of
+// 'unexpected' must be imported from module 'std.expected.unexpected' before
+// it is required. Therefore disable it in this configuration.
+// UNSUPPORTED: clang-modules-build
+
 // test unexpected
 
 #include <exception>

From 8d6e29d0d3e3acb67adaa61648f35a38a9d196b3 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Fri, 20 Jun 2025 16:59:30 +0000
Subject: [PATCH 1083/1322] [libc] Reworked CharacterConverter isComplete into
 isFull and isEmpty (#144799)

isComplete previously meant different things for different conversion
directions.
Refactored bytes_processed to bytes_stored which now consistently
increments on every push and decrements on pop making both directions
more consistent with each other
---
 .../__support/wchar/character_converter.cpp   | 33 ++++++-------
 .../src/__support/wchar/character_converter.h |  3 +-
 libc/src/__support/wchar/mbstate.h            |  6 +--
 .../src/__support/wchar/utf32_to_8_test.cpp   | 48 +++++++++++--------
 .../src/__support/wchar/utf8_to_32_test.cpp   | 20 ++++----
 5 files changed, 60 insertions(+), 50 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 5ab0447bb08b..1f81de4248ff 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -30,18 +30,20 @@ CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
 void CharacterConverter::clear() {
   state->partial = 0;
-  state->bytes_processed = 0;
+  state->bytes_stored = 0;
   state->total_bytes = 0;
 }
 
-bool CharacterConverter::isComplete() {
-  return state->bytes_processed == state->total_bytes;
+bool CharacterConverter::isFull() {
+  return state->bytes_stored == state->total_bytes && state->total_bytes != 0;
 }
 
+bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
+
 int CharacterConverter::push(char8_t utf8_byte) {
   uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
   // Checking the first byte if first push
-  if (state->bytes_processed == 0) {
+  if (isEmpty()) {
     // UTF-8 char has 1 byte total
     if (num_ones == 0) {
       state->total_bytes = 1;
@@ -58,21 +60,21 @@ int CharacterConverter::push(char8_t utf8_byte) {
     }
     // Invalid first byte
     else {
-      // bytes_processed and total_bytes will always be 0 here
+      // bytes_stored and total_bytes will always be 0 here
       state->partial = static_cast<char32_t>(0);
       return -1;
     }
     state->partial = static_cast<char32_t>(utf8_byte);
-    state->bytes_processed++;
+    state->bytes_stored++;
     return 0;
   }
   // Any subsequent push
   // Adding 6 more bits so need to left shift
-  if (num_ones == 1 && !isComplete()) {
+  if (num_ones == 1 && !isFull()) {
     char32_t byte = utf8_byte & MASK_ENCODED_BITS;
     state->partial = state->partial << ENCODED_BITS_PER_UTF8;
     state->partial |= byte;
-    state->bytes_processed++;
+    state->bytes_stored++;
     return 0;
   }
   // Invalid byte -> reset the state
@@ -82,11 +84,10 @@ int CharacterConverter::push(char8_t utf8_byte) {
 
 int CharacterConverter::push(char32_t utf32) {
   // we can't be partially through a conversion when pushing a utf32 value
-  if (!isComplete())
+  if (!isEmpty())
     return -1;
 
   state->partial = utf32;
-  state->bytes_processed = 0;
 
   // determine number of utf-8 bytes needed to represent this utf32 value
   constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
@@ -94,6 +95,7 @@ int CharacterConverter::push(char32_t utf32) {
   for (uint8_t i = 0; i < NUM_RANGES; i++) {
     if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
       state->total_bytes = i + 1;
+      state->bytes_stored = i + 1;
       return 0;
     }
   }
@@ -107,7 +109,7 @@ int CharacterConverter::push(char32_t utf32) {
 ErrorOr<char32_t> CharacterConverter::pop_utf32() {
   // If pop is called too early, do not reset the state, use error to determine
   // whether enough bytes have been pushed
-  if (!isComplete() || state->bytes_processed == 0)
+  if (!isFull())
     return Error(-1);
   char32_t utf32 = state->partial;
   // reset if successful pop
@@ -116,7 +118,7 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
 }
 
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
-  if (isComplete())
+  if (isEmpty())
     return Error(-1);
 
   constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
@@ -125,9 +127,8 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   char32_t output;
 
   // Shift to get the next 6 bits from the utf32 encoding
-  const size_t shift_amount =
-      (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
-  if (state->bytes_processed == 0) {
+  const size_t shift_amount = (state->bytes_stored - 1) * ENCODED_BITS_PER_UTF8;
+  if (isFull()) {
     /*
       Choose the correct set of most significant bits to encode the length
       of the utf8 sequence. The remaining bits contain the most significant
@@ -141,7 +142,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
              ((state->partial >> shift_amount) & MASK_ENCODED_BITS);
   }
 
-  state->bytes_processed++;
+  state->bytes_stored--;
   return static_cast<char8_t>(output);
 }
 
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index c4ba7cf6b689..be0e6129df23 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -26,7 +26,8 @@ public:
   CharacterConverter(mbstate *mbstate);
 
   void clear();
-  bool isComplete();
+  bool isFull();
+  bool isEmpty();
 
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index fb08fb4eaa18..fea693f73c3b 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -22,10 +22,10 @@ struct mbstate {
 
   /*
   Progress towards a conversion
-    For utf8  -> utf32, increases with each CharacterConverter::push(utf8_byte)
-    For utf32 ->  utf8, increases with each CharacterConverter::pop_utf8()
+    Increases with each push(...) until it reaches total_bytes
+    Decreases with each pop(...) until it reaches 0
   */
-  uint8_t bytes_processed;
+  uint8_t bytes_stored;
 
   // Total number of bytes that will be needed to represent this character
   uint8_t total_bytes;
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index f4c5cb863ff3..a6a7bc4aa6f4 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -20,17 +20,19 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
   // utf8 1-byte encodings are identical to their utf32 representations
   char32_t utf32_A = 0x41; // 'A'
   cr.push(utf32_A);
+  ASSERT_TRUE(cr.isFull());
   auto popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<char>(popped.value()), 'A');
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   char32_t utf32_B = 0x42; // 'B'
   cr.push(utf32_B);
+  ASSERT_TRUE(cr.isFull());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<char>(popped.value()), 'B');
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
@@ -45,26 +47,28 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
   // testing utf32: 0xff -> utf8: 0xc3 0xbf
   char32_t utf32 = 0xff;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   auto popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // testing utf32: 0x58e -> utf8: 0xd6 0x8e
   utf32 = 0x58e;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
@@ -79,34 +83,36 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
   // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
   char32_t utf32 = 0xac15;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   auto popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
   utf32 = 0x267b;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
@@ -121,42 +127,44 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
   // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
   char32_t utf32 = 0x1f921;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   auto popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
   utf32 = 0x12121;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index 9cb059faa937..36ae7d689cc0 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -13,7 +13,7 @@
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   char ch = 'A';
 
@@ -28,7 +28,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[2] = {static_cast<char>(0xC2),
                       static_cast<char>(0x8E)}; //  car symbol
@@ -44,7 +44,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
                       static_cast<char>(0x91)}; // ∑ sigma symbol
@@ -61,7 +61,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
                       static_cast<char>(0xA4),
@@ -80,7 +80,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch = static_cast<char>(0x80); // invalid starting bit sequence
 
@@ -92,7 +92,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[4] = {
       static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
@@ -112,7 +112,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   // Last byte is invalid since it does not have correct starting sequence.
   // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
@@ -132,7 +132,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
                       static_cast<char>(0x80)};
@@ -153,7 +153,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
                       static_cast<char>(0xC7), static_cast<char>(0x8C)};
@@ -179,7 +179,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
   const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};

From 65cb3bcf327da8f9740e56897bc9954364e59eb6 Mon Sep 17 00:00:00 2001
From: Maryam Moghadas <maryammo@ca.ibm.com>
Date: Fri, 20 Jun 2025 13:03:14 -0400
Subject: [PATCH 1084/1322] [Clang][PowerPC] Add __dmr1024 type and DMF integer
 calculation builtins (#142480)

Define the __dmr1024 type used to manipulate the new DMR registers
introduced by the Dense Math Facility (DMF) on PowerPC, and add six
Clang builtins that correspond to the integer outer-product accumulate
to ACC PowerPC instructions:
*  __builtin_mma_dmxvi8gerx4
* __builtin_mma_pmdmxvi8gerx4
*  __builtin_mma_dmxvi8gerx4pp
*  __builtin_mma_pmdmxvi8gerx4pp
*  __builtin_mma_dmxvi8gerx4spp
* __builtin_mma_pmdmxvi8gerx4spp.
---
 clang/include/clang/Basic/BuiltinsPPC.def     |  12 ++
 clang/include/clang/Basic/PPCTypes.def        |   1 +
 clang/lib/AST/ASTContext.cpp                  |   1 +
 clang/test/AST/ast-dump-ppc-types.c           |  13 +-
 clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c |  94 ++++++++++
 .../ppc-dmf-paired-vec-memops-builtin-err.c   |  20 ++
 clang/test/CodeGen/PowerPC/ppc-dmf-types.c    | 177 ++++++++++++++++++
 .../PowerPC/ppc-future-mma-builtin-err.c      |  21 +++
 .../test/CodeGenCXX/ppc-mangle-mma-types.cpp  |   5 +
 clang/test/Sema/ppc-dmf-types.c               | 103 ++++++++++
 .../TypeSystem/Clang/TypeSystemClang.cpp      |   1 +
 11 files changed, 445 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
 create mode 100644 clang/test/CodeGen/PowerPC/ppc-dmf-paired-vec-memops-builtin-err.c
 create mode 100644 clang/test/CodeGen/PowerPC/ppc-dmf-types.c
 create mode 100644 clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c
 create mode 100644 clang/test/Sema/ppc-dmf-types.c

diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
index bb7d54bbb793..099500754a0e 100644
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -1134,6 +1134,18 @@ UNALIASED_CUSTOM_BUILTIN(mma_pmxvbf16ger2np, "vW512*VVi15i15i3", true,
                          "mma,paired-vector-memops")
 UNALIASED_CUSTOM_BUILTIN(mma_pmxvbf16ger2nn, "vW512*VVi15i15i3", true,
                          "mma,paired-vector-memops")
+UNALIASED_CUSTOM_BUILTIN(mma_dmxvi8gerx4, "vW1024*W256V", false,
+                         "mma,paired-vector-memops")
+UNALIASED_CUSTOM_BUILTIN(mma_pmdmxvi8gerx4, "vW1024*W256Vi255i15i15", false,
+                         "mma,paired-vector-memops")
+UNALIASED_CUSTOM_BUILTIN(mma_dmxvi8gerx4pp, "vW1024*W256V", true,
+                         "mma,paired-vector-memops")
+UNALIASED_CUSTOM_BUILTIN(mma_pmdmxvi8gerx4pp, "vW1024*W256Vi255i15i15", true,
+                         "mma,paired-vector-memops")
+UNALIASED_CUSTOM_BUILTIN(mma_dmxvi8gerx4spp,  "vW1024*W256V", true,
+                         "mma,paired-vector-memops")
+UNALIASED_CUSTOM_BUILTIN(mma_pmdmxvi8gerx4spp, "vW1024*W256Vi255i15i15", true,
+                         "mma,paired-vector-memops")
 
 // FIXME: Obviously incomplete.
 
diff --git a/clang/include/clang/Basic/PPCTypes.def b/clang/include/clang/Basic/PPCTypes.def
index 9e2cb2aedc9f..fc4155ca98b2 100644
--- a/clang/include/clang/Basic/PPCTypes.def
+++ b/clang/include/clang/Basic/PPCTypes.def
@@ -30,6 +30,7 @@
 #endif
 
 
+PPC_VECTOR_MMA_TYPE(__dmr1024, DMR1024, 1024)
 PPC_VECTOR_MMA_TYPE(__vector_quad, VectorQuad, 512)
 PPC_VECTOR_VSX_TYPE(__vector_pair, VectorPair, 256)
 
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 74be2871f270..02d6570d0ea0 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -3522,6 +3522,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx,
     case BuiltinType::BFloat16:
     case BuiltinType::VectorQuad:
     case BuiltinType::VectorPair:
+    case BuiltinType::DMR1024:
       OS << "?";
       return;
 
diff --git a/clang/test/AST/ast-dump-ppc-types.c b/clang/test/AST/ast-dump-ppc-types.c
index 26ae5441f20d..1c860c268e0e 100644
--- a/clang/test/AST/ast-dump-ppc-types.c
+++ b/clang/test/AST/ast-dump-ppc-types.c
@@ -1,9 +1,11 @@
+// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -target-cpu future \
+// RUN:   -ast-dump  %s | FileCheck %s
 // RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 \
-// RUN:   -ast-dump -ast-dump-filter __vector %s | FileCheck %s
+// RUN:   -ast-dump  %s | FileCheck %s
 // RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -target-cpu pwr9 \
-// RUN:   -ast-dump -ast-dump-filter __vector %s | FileCheck %s
+// RUN:   -ast-dump  %s | FileCheck %s
 // RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -target-cpu pwr8 \
-// RUN:   -ast-dump -ast-dump-filter __vector %s | FileCheck %s
+// RUN:   -ast-dump  %s | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -ast-dump %s | FileCheck %s \
 // RUN:   --check-prefix=CHECK-X86_64
 // RUN: %clang_cc1 -triple arm-unknown-unknown -ast-dump %s | FileCheck %s \
@@ -15,16 +17,21 @@
 // are correctly defined. We also added checks on a couple of other targets to
 // ensure the types are target-dependent.
 
+// CHECK: TypedefDecl {{.*}} implicit __dmr1024 '__dmr1024'
+// CHECK: `-BuiltinType {{.*}} '__dmr1024'
 // CHECK: TypedefDecl {{.*}} implicit __vector_quad '__vector_quad'
 // CHECK-NEXT: -BuiltinType {{.*}} '__vector_quad'
 // CHECK: TypedefDecl {{.*}} implicit __vector_pair '__vector_pair'
 // CHECK-NEXT: -BuiltinType {{.*}} '__vector_pair'
 
+// CHECK-X86_64-NOT: __dmr1024
 // CHECK-X86_64-NOT: __vector_quad
 // CHECK-X86_64-NOT: __vector_pair
 
+// CHECK-ARM-NOT: __dmr1024
 // CHECK-ARM-NOT: __vector_quad
 // CHECK-ARM-NOT: __vector_pair
 
+// CHECK-RISCV64-NOT: __dmr1024
 // CHECK-RISCV64-NOT: __vector_quad
 // CHECK-RISCV64-NOT: __vector_pair
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
new file mode 100644
index 000000000000..41f13155847b
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
@@ -0,0 +1,94 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -O3 -triple powerpc64le-unknown-unknown -target-cpu future \
+// RUN:  -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O3 -triple powerpc64-ibm-aix -target-cpu future \
+// RUN: -emit-llvm %s -o - | FileCheck %s
+
+
+// CHECK-LABEL: @test_dmxvi8gerx4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]])
+// CHECK-NEXT:    store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6:![0-9]+]]
+// CHECK-NEXT:    ret void
+//
+void test_dmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __dmr1024 vdmr = *((__dmr1024 *)vdmrp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_dmxvi8gerx4(&vdmr, vp, vc);
+  *((__dmr1024 *)resp) = vdmr;
+}
+
+// CHECK-LABEL: @test_pmdmxvi8gerx4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT:    ret void
+//
+void test_pmdmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __dmr1024 vdmr = *((__dmr1024 *)vdmrp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmdmxvi8gerx4(&vdmr, vp, vc, 0, 0, 0);
+  *((__dmr1024 *)resp) = vdmr;
+}
+
+// CHECK-LABEL: @test_dmxvi8gerx4pp(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
+// CHECK-NEXT:    store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT:    ret void
+//
+void test_dmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __dmr1024 vdmr = *((__dmr1024 *)vdmrp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_dmxvi8gerx4pp(&vdmr, vp, vc);
+  *((__dmr1024 *)resp) = vdmr;
+}
+
+// CHECK-LABEL: @test_pmdmxvi8gerx4pp(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT:    ret void
+//
+void test_pmdmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __dmr1024 vdmr = *((__dmr1024 *)vdmrp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmdmxvi8gerx4pp(&vdmr, vp, vc, 0, 0, 0);
+  *((__dmr1024 *)resp) = vdmr;
+}
+
+// CHECK-LABEL: @test_dmxvi8gerx4spp(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
+// CHECK-NEXT:    store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT:    ret void
+//
+void test_dmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __dmr1024 vdmr = *((__dmr1024 *)vdmrp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_dmxvi8gerx4spp(&vdmr, vp, vc);
+  *((__dmr1024 *)resp) = vdmr;
+}
+
+// CHECK-LABEL: @test_pmdmxvi8gerx4spp(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
+// CHECK-NEXT:    store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT:    ret void
+//
+void test_pmdmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
+  __dmr1024 vdmr = *((__dmr1024 *)vdmrp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_pmdmxvi8gerx4spp(&vdmr, vp, vc, 0, 0, 0);
+  *((__dmr1024 *)resp) = vdmr;
+}
diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-paired-vec-memops-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-paired-vec-memops-builtin-err.c
new file mode 100644
index 000000000000..eef1abca2241
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/ppc-dmf-paired-vec-memops-builtin-err.c
@@ -0,0 +1,20 @@
+// RUN: not %clang_cc1 -triple powerpc64le-unknown-linux-gnu -target-cpu future \
+// RUN:   %s -emit-llvm-only 2>&1 | FileCheck %s
+
+__attribute__((target("no-paired-vector-memops")))
+void test_pair(unsigned char *vdmr, unsigned char *vpp, vector unsigned char vc) {
+   __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_dmxvi8gerx4((__dmr1024 *)vdmr, vp, vc);
+  __builtin_mma_pmdmxvi8gerx4((__dmr1024 *)vdmr, vp, vc, 0, 0, 0);
+  __builtin_mma_dmxvi8gerx4pp((__dmr1024 *)vdmr, vp, vc);
+  __builtin_mma_pmdmxvi8gerx4pp((__dmr1024 *)vdmr, vp, vc, 0, 0, 0);
+  __builtin_mma_dmxvi8gerx4spp((__dmr1024 *)vdmr, vp, vc);
+  __builtin_mma_pmdmxvi8gerx4spp((__dmr1024 *)vdmr, vp, vc, 0, 0, 0);
+
+// CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_dmxvi8gerx4pp' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_pmdmxvi8gerx4pp' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_dmxvi8gerx4spp' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_pmdmxvi8gerx4spp' needs target feature mma,paired-vector-memops
+}
diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-types.c b/clang/test/CodeGen/PowerPC/ppc-dmf-types.c
new file mode 100644
index 000000000000..9dff289370eb
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/ppc-dmf-types.c
@@ -0,0 +1,177 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple powerpc64le-linux-unknown -target-cpu future \
+// RUN:   -emit-llvm -o - %s | FileCheck %s
+
+
+// CHECK-LABEL: @test_dmr_copy(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR1_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[PTR2_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[PTR1:%.*]], ptr [[PTR1_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[PTR2:%.*]], ptr [[PTR2_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR1_ADDR]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <1024 x i1>, ptr [[TMP0]], i64 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <1024 x i1>, ptr [[ADD_PTR]], align 128
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PTR2_ADDR]], align 8
+// CHECK-NEXT:    [[ADD_PTR1:%.*]] = getelementptr inbounds <1024 x i1>, ptr [[TMP2]], i64 1
+// CHECK-NEXT:    store <1024 x i1> [[TMP1]], ptr [[ADD_PTR1]], align 128
+// CHECK-NEXT:    ret void
+//
+void test_dmr_copy(__dmr1024 *ptr1, __dmr1024 *ptr2) {
+  *(ptr2 + 1) = *(ptr1 + 2);
+}
+
+// CHECK-LABEL: @test_dmr_typedef(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INP_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[OUTP_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[VDMRIN:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[VDMROUT:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[INP:%.*]], ptr [[INP_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[OUTP:%.*]], ptr [[OUTP_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[INP_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[VDMRIN]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[OUTP_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[VDMROUT]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[VDMRIN]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1024 x i1>, ptr [[TMP2]], align 128
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[VDMROUT]], align 8
+// CHECK-NEXT:    store <1024 x i1> [[TMP3]], ptr [[TMP4]], align 128
+// CHECK-NEXT:    ret void
+//
+void test_dmr_typedef(int *inp, int *outp) {
+  __dmr1024 *vdmrin = (__dmr1024 *)inp;
+  __dmr1024 *vdmrout = (__dmr1024 *)outp;
+  *vdmrout = *vdmrin;
+}
+
+// CHECK-LABEL: @test_dmr_arg(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VDMR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[VDMRP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[VDMR:%.*]], ptr [[VDMR_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[VDMRP]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VDMR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <1024 x i1>, ptr [[TMP1]], align 128
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[VDMRP]], align 8
+// CHECK-NEXT:    store <1024 x i1> [[TMP2]], ptr [[TMP3]], align 128
+// CHECK-NEXT:    ret void
+//
+void test_dmr_arg(__dmr1024 *vdmr, int *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  *vdmrp = *vdmr;
+}
+
+// CHECK-LABEL: @test_dmr_const_arg(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VDMR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[VDMRP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[VDMR:%.*]], ptr [[VDMR_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[VDMRP]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VDMR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <1024 x i1>, ptr [[TMP1]], align 128
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[VDMRP]], align 8
+// CHECK-NEXT:    store <1024 x i1> [[TMP2]], ptr [[TMP3]], align 128
+// CHECK-NEXT:    ret void
+//
+void test_dmr_const_arg(const __dmr1024 *const vdmr, int *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  *vdmrp = *vdmr;
+}
+
+// CHECK-LABEL: @test_dmr_array_arg(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VDMRA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[VDMRP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[VDMRA:%.*]], ptr [[VDMRA_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[VDMRP]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VDMRA_ADDR]], align 8
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <1024 x i1>, ptr [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <1024 x i1>, ptr [[ARRAYIDX]], align 128
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[VDMRP]], align 8
+// CHECK-NEXT:    store <1024 x i1> [[TMP2]], ptr [[TMP3]], align 128
+// CHECK-NEXT:    ret void
+//
+void test_dmr_array_arg(__dmr1024 vdmra[], int *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  *vdmrp = vdmra[0];
+}
+
+// CHECK-LABEL: @test_dmr_ret(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[VDMRP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[VDMRP]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VDMRP]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <1024 x i1>, ptr [[TMP1]], i64 2
+// CHECK-NEXT:    ret ptr [[ADD_PTR]]
+//
+__dmr1024 *test_dmr_ret(int *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  return vdmrp + 2;
+}
+
+// CHECK-LABEL: @test_dmr_ret_const(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[VDMRP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[VDMRP]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VDMRP]], align 8
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds <1024 x i1>, ptr [[TMP1]], i64 2
+// CHECK-NEXT:    ret ptr [[ADD_PTR]]
+//
+const __dmr1024 *test_dmr_ret_const(int *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  return vdmrp + 2;
+}
+
+// CHECK-LABEL: @test_dmr_sizeof_alignof(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[VDMRP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[VDMR:%.*]] = alloca <1024 x i1>, align 128
+// CHECK-NEXT:    [[SIZET:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ALIGNT:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SIZEV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ALIGNV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[VDMRP]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VDMRP]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <1024 x i1>, ptr [[TMP1]], align 128
+// CHECK-NEXT:    store <1024 x i1> [[TMP2]], ptr [[VDMR]], align 128
+// CHECK-NEXT:    store i32 128, ptr [[SIZET]], align 4
+// CHECK-NEXT:    store i32 128, ptr [[ALIGNT]], align 4
+// CHECK-NEXT:    store i32 128, ptr [[SIZEV]], align 4
+// CHECK-NEXT:    store i32 128, ptr [[ALIGNV]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[SIZET]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ALIGNT]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[SIZEV]], align 4
+// CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[ADD]], [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ALIGNV]], align 4
+// CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP6]]
+// CHECK-NEXT:    ret i32 [[ADD2]]
+//
+int test_dmr_sizeof_alignof(int *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  __dmr1024 vdmr = *vdmrp;
+  unsigned sizet = sizeof(__dmr1024);
+  unsigned alignt = __alignof__(__dmr1024);
+   unsigned sizev = sizeof(vdmr);
+  unsigned alignv = __alignof__(vdmr);
+  return sizet + alignt + sizev + alignv;
+}
diff --git a/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c
new file mode 100644
index 000000000000..1b8d345ac7ec
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c
@@ -0,0 +1,21 @@
+// RUN: not %clang_cc1 -triple powerpc64le-unknown-linux-gnu -target-cpu future \
+// RUN:   %s -emit-llvm-only 2>&1 | FileCheck %s
+
+__attribute__((target("no-mma")))
+void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) {
+  __dmr1024 vdmr = *((__dmr1024 *)vdmrp);
+  __vector_pair vp = *((__vector_pair *)vpp);
+  __builtin_mma_dmxvi8gerx4(&vdmr, vp, vc);
+  __builtin_mma_pmdmxvi8gerx4(&vdmr, vp, vc, 0, 0, 0);
+  __builtin_mma_dmxvi8gerx4pp(&vdmr, vp, vc);
+  __builtin_mma_pmdmxvi8gerx4pp(&vdmr, vp, vc, 0, 0, 0);
+  __builtin_mma_dmxvi8gerx4spp(&vdmr, vp, vc);
+  __builtin_mma_pmdmxvi8gerx4spp(&vdmr, vp, vc, 0, 0, 0);
+
+// CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_dmxvi8gerx4pp' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_pmdmxvi8gerx4pp' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_dmxvi8gerx4spp' needs target feature mma,paired-vector-memops
+// CHECK: error: '__builtin_mma_pmdmxvi8gerx4spp' needs target feature mma,paired-vector-memops
+}
diff --git a/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp b/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp
index 74e50ceea386..1e213e7f7512 100644
--- a/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp
+++ b/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp
@@ -1,3 +1,5 @@
+// RUN: %clang_cc1 -triple powerpc64le-linux-unknown -target-cpu future %s \
+// RUN:   -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple powerpc64le-linux-unknown -target-cpu pwr10 %s \
 // RUN:   -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple powerpc64le-linux-unknown -target-cpu pwr9 %s \
@@ -5,6 +7,9 @@
 // RUN: %clang_cc1 -triple powerpc64le-linux-unknown -target-cpu pwr8 %s \
 // RUN:   -emit-llvm -o - | FileCheck %s
 
+// CHECK: _Z2f0Pu9__dmr1024
+void f0(__dmr1024 *vdmr) {}
+
 // CHECK: _Z2f1Pu13__vector_quad
 void f1(__vector_quad *vq) {}
 
diff --git a/clang/test/Sema/ppc-dmf-types.c b/clang/test/Sema/ppc-dmf-types.c
new file mode 100644
index 000000000000..b3da72df2508
--- /dev/null
+++ b/clang/test/Sema/ppc-dmf-types.c
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -fsyntax-only \
+// RUN:   -target-cpu future %s -verify
+// RUN: %clang_cc1 -triple powerpc64-unknown-unknown -fsyntax-only \
+// RUN:   -target-cpu future %s -verify
+
+// The use of PPC MMA types is strongly restricted. Non-pointer MMA variables
+// can only be declared in functions and a limited number of operations are
+// supported on these types. This test case checks that invalid uses of MMA
+// types are correctly prevented.
+
+// vector dmr
+
+// typedef
+typedef __dmr1024 dmr_t;
+
+// function argument
+void testDmrArg1(__dmr1024 vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  *vdmrp = vdmr;
+}
+
+void testDmrArg2(const __dmr1024 vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  *vdmrp = vdmr;
+}
+
+void testDmrArg3(const dmr_t vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  *vdmrp = vdmr;
+}
+
+// function return
+__dmr1024 testDmrRet1(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  return *vdmrp; // expected-error {{invalid use of PPC MMA type}}
+}
+
+const dmr_t testDmrRet4(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  return *vdmrp; // expected-error {{invalid use of PPC MMA type}}
+}
+
+// global
+__dmr1024 globalvdmr;        // expected-error {{invalid use of PPC MMA type}}
+const __dmr1024 globalvdmr2; // expected-error {{invalid use of PPC MMA type}}
+__dmr1024 *globalvdmrp;
+const __dmr1024 *const globalvdmrp2;
+dmr_t globalvdmr_t; // expected-error {{invalid use of PPC MMA type}}
+
+// struct field
+struct TestDmrStruct {
+  int a;
+  float b;
+  __dmr1024 c; // expected-error {{invalid use of PPC MMA type}}
+  __dmr1024 *vq;
+};
+
+// operators
+int testDmrOperators1(int *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  __dmr1024 vdmr1 = *(vdmrp + 0);
+  __dmr1024 vdmr2 = *(vdmrp + 1);
+  __dmr1024 vdmr3 = *(vdmrp + 2);
+  if (vdmr1) // expected-error {{statement requires expression of scalar type ('__dmr1024' invalid)}}
+    *(vdmrp + 10) = vdmr1;
+  if (!vdmr2) // expected-error {{invalid argument type '__dmr1024' to unary expression}}
+    *(vdmrp + 11) = vdmr3;
+  int c1 = vdmr1 && vdmr2; // expected-error {{invalid operands to binary expression ('__dmr1024' and '__dmr1024')}}
+  int c2 = vdmr2 == vdmr3; // expected-error {{invalid operands to binary expression ('__dmr1024' and '__dmr1024')}}
+  int c3 = vdmr2 < vdmr1;  // expected-error {{invalid operands to binary expression ('__dmr1024' and '__dmr1024')}}
+  return c1 || c2 || c3;
+}
+
+void testDmrOperators2(int *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  __dmr1024 vdmr1 = *(vdmrp + 0);
+  __dmr1024 vdmr2 = *(vdmrp + 1);
+  __dmr1024 vdmr3 = *(vdmrp + 2);
+  vdmr1 = -vdmr1;        // expected-error {{invalid argument type '__dmr1024' to unary expression}}
+  vdmr2 = vdmr1 + vdmr3; // expected-error {{invalid operands to binary expression ('__dmr1024' and '__dmr1024')}}
+  vdmr2 = vdmr2 * vdmr3; // expected-error {{invalid operands to binary expression ('__dmr1024' and '__dmr1024')}}
+  vdmr3 = vdmr3 | vdmr3; // expected-error {{invalid operands to binary expression ('__dmr1024' and '__dmr1024')}}
+  vdmr3 = vdmr3 << 2;    // expected-error {{invalid operands to binary expression ('__dmr1024' and 'int')}}
+  *(vdmrp + 10) = vdmr1;
+  *(vdmrp + 11) = vdmr2;
+  *(vdmrp + 12) = vdmr3;
+}
+
+vector unsigned char testDmrOperators3(int *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  __dmr1024 vdmr1 = *(vdmrp + 0);
+  __dmr1024 vdmr2 = *(vdmrp + 1);
+  __dmr1024 vdmr3 = *(vdmrp + 2);
+  vdmr1 ? *(vdmrp + 10) = vdmr2 : *(vdmrp + 11) = vdmr3; // expected-error {{used type '__dmr1024' where arithmetic or pointer type is required}}
+  vdmr2 = vdmr3;
+  return vdmr2[1]; // expected-error {{subscripted value is not an array, pointer, or vector}}
+}
+
+void testDmrOperators4(int v, void *ptr) {
+  __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+  __dmr1024 vdmr1 = (__dmr1024)v;   // expected-error {{used type '__dmr1024' where arithmetic or pointer type is required}}
+  __dmr1024 vdmr2 = (__dmr1024)vdmrp; // expected-error {{used type '__dmr1024' where arithmetic or pointer type is required}}
+}
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 29302413cf8f..82e07bb8e0ff 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -5024,6 +5024,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
     // PowerPC -- Matrix Multiply Assist
     case clang::BuiltinType::VectorPair:
     case clang::BuiltinType::VectorQuad:
+    case clang::BuiltinType::DMR1024:
       break;
 
     // ARM -- Scalable Vector Extension

From f8ee5774b612c425051adfc9108d1c52820b193d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 20 Jun 2025 18:09:07 +0100
Subject: [PATCH 1085/1322] [X86] combineConcatVectorOps - only concat AVX1
 v4i64 shift-by-32 to a shuffle if the concat is free (#145043)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 20 ++++++++++---------
 .../test/CodeGen/X86/vector-shift-lshr-256.ll | 18 ++++++-----------
 llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 18 ++++++-----------
 3 files changed, 23 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c32737010560..35d7b8084405 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58844,16 +58844,18 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
           llvm::all_of(Ops, [](SDValue Op) {
             return Op.getConstantOperandAPInt(1) == 32;
           })) {
-        SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
-        SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
-        if (Opcode == X86ISD::VSHLI) {
-          Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
-                                     {8, 0, 8, 2, 8, 4, 8, 6});
-        } else {
-          Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
-                                     {1, 8, 3, 8, 5, 8, 7, 8});
+        if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
+          SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
+          Res = DAG.getBitcast(MVT::v8i32, Res);
+          if (Opcode == X86ISD::VSHLI) {
+            Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+                                       {8, 0, 8, 2, 8, 4, 8, 6});
+          } else {
+            Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+                                       {1, 8, 3, 8, 5, 8, 7, 8});
+          }
+          return DAG.getBitcast(VT, Res);
         }
-        return DAG.getBitcast(VT, Res);
       }
       [[fallthrough]];
     case X86ISD::VSRAI:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index b45525b6e20f..3a4bb223618d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -1974,11 +1974,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
 define <4 x i64> @shift32_v4i64_concat(<2 x i64> %lo, <2 x i64> %hi) nounwind {
 ; AVX1-LABEL: shift32_v4i64_concat:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shift32_v4i64_concat:
@@ -1990,11 +1988,9 @@ define <4 x i64> @shift32_v4i64_concat(<2 x i64> %lo, <2 x i64> %hi) nounwind {
 ;
 ; XOPAVX1-LABEL: shift32_v4i64_concat:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; XOPAVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: shift32_v4i64_concat:
@@ -2020,11 +2016,9 @@ define <4 x i64> @shift32_v4i64_concat(<2 x i64> %lo, <2 x i64> %hi) nounwind {
 ;
 ; X86-AVX1-LABEL: shift32_v4i64_concat:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: shift32_v4i64_concat:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 2248ee997d52..b56a8b5b2294 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1827,11 +1827,9 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
 define <4 x i64> @shift32_v4i64_concat(<2 x i64> %lo, <2 x i64> %hi) nounwind {
 ; AVX1-LABEL: shift32_v4i64_concat:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shift32_v4i64_concat:
@@ -1843,11 +1841,9 @@ define <4 x i64> @shift32_v4i64_concat(<2 x i64> %lo, <2 x i64> %hi) nounwind {
 ;
 ; XOPAVX1-LABEL: shift32_v4i64_concat:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; XOPAVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; XOPAVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: shift32_v4i64_concat:
@@ -1873,11 +1869,9 @@ define <4 x i64> @shift32_v4i64_concat(<2 x i64> %lo, <2 x i64> %hi) nounwind {
 ;
 ; X86-AVX1-LABEL: shift32_v4i64_concat:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: shift32_v4i64_concat:

From 33a92af1b2260506356eb838125b356703bf02bb Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston@google.com>
Date: Fri, 20 Jun 2025 10:11:12 -0700
Subject: [PATCH 1086/1322] [msan] Add off-by-default flag to fix false
 negatives from partially undefined constant fixed-length vectors (#143837)

This patch adds an off-by-default flag which, when enabled via `-mllvm -msan-poison-undef-vectors=true`, fixes a false negative in MSan (partially-undefined constant fixed-length vectors). It is currently off by default since, by fixing the false positive, code/tests that previously passed MSan may start failing. The default will be changed in a future patch.

Prior to this patch, MSan computes that partially-undefined constant fixed-length vectors are fully initialized, which leads to false negatives; moreover, benign vector rewriting could theoretically flip MSan's shadow computation from initialized to uninitialized or vice-versa (*). `-msan-poison-undef-vectors=true` calculates the shadow precisely: for each element of the vector, the corresponding shadow is fully uninitialized if the element is undefined/poisoned, otherwise it is fully initialized.

Updates the test from https://github.com/llvm/llvm-project/pull/143823

(*) For example:
  ```
  %x = insertelement <2 x i64> <i64 0, i64 poison>, i64 42, i64 0
  %y = insertelement <2 x i64> <i64 poison, i64 poison>, i64 42, i64 0
  ```
%x and %y are equivalent but, prior to this patch, MSan incorrectly computes the shadow of %x as <0, 0> rather than <0, -1>.
---
 .../Instrumentation/MemorySanitizer.cpp       | 46 +++++++++++++++++--
 .../MemorySanitizer/partial-poison.ll         | 39 +++++++++++++---
 2 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index fb55bd7bfe56..5aeca60b08d3 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -265,9 +265,22 @@ static cl::opt<bool>
                       cl::desc("Print name of local stack variable"),
                       cl::Hidden, cl::init(true));
 
-static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
-                                   cl::desc("poison undef temps"), cl::Hidden,
-                                   cl::init(true));
+static cl::opt<bool>
+    ClPoisonUndef("msan-poison-undef",
+                  cl::desc("Poison fully undef temporary values. "
+                           "Partially undefined constant vectors "
+                           "are unaffected by this flag (see "
+                           "-msan-poison-undef-vectors)."),
+                  cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClPoisonUndefVectors(
+    "msan-poison-undef-vectors",
+    cl::desc("Precisely poison partially undefined constant vectors. "
+             "If false (legacy behavior), the entire vector is "
+             "considered fully initialized, which may lead to false "
+             "negatives. Fully undefined constant vectors are "
+             "unaffected by this flag (see -msan-poison-undef)."),
+    cl::Hidden, cl::init(false));
 
 static cl::opt<bool>
     ClHandleICmp("msan-handle-icmp",
@@ -1181,6 +1194,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   bool PropagateShadow;
   bool PoisonStack;
   bool PoisonUndef;
+  bool PoisonUndefVectors;
 
   struct ShadowOriginAndInsertPoint {
     Value *Shadow;
@@ -1207,6 +1221,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     PropagateShadow = SanitizeFunction;
     PoisonStack = SanitizeFunction && ClPoisonStack;
     PoisonUndef = SanitizeFunction && ClPoisonUndef;
+    PoisonUndefVectors = SanitizeFunction && ClPoisonUndefVectors;
 
     // In the presence of unreachable blocks, we may see Phi nodes with
     // incoming nodes from such blocks. Since InstVisitor skips unreachable
@@ -1989,6 +2004,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       }
       return Shadow;
     }
+    // Handle fully undefined values
+    // (partially undefined constant vectors are handled later)
     if (UndefValue *U = dyn_cast<UndefValue>(V)) {
       Value *AllOnes = (PropagateShadow && PoisonUndef) ? getPoisonedShadow(V)
                                                         : getCleanShadow(V);
@@ -2086,8 +2103,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       return ShadowPtr;
     }
 
-    // TODO: Partially undefined vectors are handled by the fall-through case
-    //       below (see partial-poison.ll); this causes false negatives.
+    // Check for partially-undefined constant vectors
+    // TODO: scalable vectors (this is hard because we do not have IRBuilder)
+    if (isa<FixedVectorType>(V->getType()) && isa<Constant>(V) &&
+        cast<Constant>(V)->containsUndefOrPoisonElement() && PropagateShadow &&
+        PoisonUndefVectors) {
+      unsigned NumElems = cast<FixedVectorType>(V->getType())->getNumElements();
+      SmallVector<Constant *, 32> ShadowVector(NumElems);
+      for (unsigned i = 0; i != NumElems; ++i) {
+        Constant *Elem = cast<Constant>(V)->getAggregateElement(i);
+        ShadowVector[i] = isa<UndefValue>(Elem) ? getPoisonedShadow(Elem)
+                                                : getCleanShadow(Elem);
+      }
+
+      Value *ShadowConstant = ConstantVector::get(ShadowVector);
+      LLVM_DEBUG(dbgs() << "Partial undef constant vector: " << *V << " ==> "
+                        << *ShadowConstant << "\n");
+
+      return ShadowConstant;
+    }
+
+    // TODO: partially-undefined constant arrays, structures, and nested types
 
     // For everything else the shadow is zero.
     return getCleanShadow(V);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll
index 5164441c17e1..025317a53c8d 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll
@@ -1,8 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -S -passes='msan' 2>&1 | FileCheck %s
+; RUN: opt < %s -S -passes='msan' -msan-poison-undef-vectors=true  2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PRECISE
+; RUN: opt < %s -S -passes='msan' -msan-poison-undef-vectors=false 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-IMPRECISE
 ;
-; Test case to show that MSan computes shadows for partially poisoned vectors
-; as fully initialized, resulting in false negatives.
+; Regression test case for computing shadows of partially poisoned vectors.
+; Partially poisoned structs and arrays are not correctly implemented.
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -11,7 +12,8 @@ define <2 x i64> @left_poison(ptr %add.ptr) sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @left_poison(
 ; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-PRECISE:   store <2 x i64> <i64 -1, i64 0>, ptr @__msan_retval_tls, align 8
+; CHECK-IMPRECISE: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> <i64 poison, i64 42>
 ;
   ret <2 x i64> <i64 poison, i64 42>
@@ -21,7 +23,8 @@ define <2 x i64> @right_poison(ptr %add.ptr) sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @right_poison(
 ; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-PRECISE:   store <2 x i64> <i64 0, i64 -1>, ptr @__msan_retval_tls, align 8
+; CHECK-IMPRECISE: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> <i64 42, i64 poison>
 ;
   ret <2 x i64> <i64 42, i64 poison>
@@ -51,7 +54,8 @@ define <2 x i64> @left_undef(ptr %add.ptr) sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @left_undef(
 ; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-PRECISE:   store <2 x i64> <i64 -1, i64 0>, ptr @__msan_retval_tls, align 8
+; CHECK-IMPRECISE: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> <i64 undef, i64 42>
 ;
   ret <2 x i64> <i64 undef, i64 42>
@@ -61,7 +65,8 @@ define <2 x i64> @right_undef(ptr %add.ptr) sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @right_undef(
 ; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-PRECISE:   store <2 x i64> <i64 0, i64 -1>, ptr @__msan_retval_tls, align 8
+; CHECK-IMPRECISE: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> <i64 42, i64 undef>
 ;
   ret <2 x i64> <i64 42, i64 undef>
@@ -76,3 +81,23 @@ define <2 x i64> @full_undef(ptr %add.ptr) sanitize_memory {
 ;
   ret <2 x i64> <i64 undef, i64 undef>
 }
+
+define {i64, i64}  @struct_left_undef() sanitize_memory {
+; CHECK-LABEL: define { i64, i64 } @struct_left_undef(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store { i64, i64 } zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret { i64, i64 } { i64 undef, i64 42 }
+;
+  ret {i64, i64} { i64 undef, i64 42 }
+}
+
+define [2x i64]  @array_right_undef() sanitize_memory {
+; CHECK-LABEL: define [2 x i64] @array_right_undef(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store [2 x i64] zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret [2 x i64] [i64 42, i64 undef]
+; 
+  ret [2x i64] [ i64 42, i64 undef ]
+}

From 0d21c956a5c1640c0f9588b307963bf32b09f6ab Mon Sep 17 00:00:00 2001
From: Michal Rostecki <vadorovsky@disroot.org>
Date: Fri, 20 Jun 2025 19:17:36 +0200
Subject: [PATCH 1087/1322] [BPF] Handle nested wrapper structs in BPF map
 definition traversal (#144097)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In Aya/Rust, BPF map definitions are nested in two nested types:

* A struct representing the map type (e.g., `HashMap`, `RingBuf`) that
provides methods for interacting with the map type (e.g. `HashMap::get`,
`RingBuf::reserve`).
* An `UnsafeCell`, which informs the Rust compiler that the type is
thread-safe and can be safely mutated even as a global variable. The
kernel guarantees map operation safety.

This leads to a type hierarchy like:

```rust
    pub struct HashMap<K, V, const M: usize, const F: usize = 0>(
        core::cell::UnsafeCell<HashMapDef<K, V, M, F>>,
    );
    const BPF_MAP_TYPE_HASH: usize = 1;
    pub struct HashMapDef<K, V, const M: usize, const F: usize = 0> {
        r#type: *const [i32; BPF_MAP_TYPE_HASH],
        key: *const K,
        value: *const V,
        max_entries: *const [i32; M],
        map_flags: *const [i32; F],
    }
```

Then used in the BPF program code as a global variable:

```rust
    #[link_section = ".maps"]
    static HASH_MAP: HashMap<u32, u32, 1337> = HashMap::new();
```

Which is an equivalent of the following BPF map definition in C:

```c
    #define BPF_MAP_TYPE_HASH 1
    struct {
        int (*type)[BPF_MAP_TYPE_HASH];
        typeof(int) *key;
        typeof(int) *value;
        int (*max_entries)[1337];
    } map_1 __attribute__((section(".maps")));
```

Accessing the actual map definition requires traversing:

```
  HASH_MAP -> __0 -> value
```

Previously, the BPF backend only visited the pointee types of the
outermost struct, and didn’t descend into inner wrappers. This caused
issues when the key/value types were custom structs:

```rust
    // Define custom structs for key and values.
    pub struct MyKey(u32);
    pub struct MyValue(u32);

    #[link_section = ".maps"]
    #[export_name = "HASH_MAP"]
    pub static HASH_MAP: HashMap<MyKey, MyValue, 10> = HashMap::new();
```

These types weren’t fully visited and appeared in BTF as forward
declarations:

```
    #30: <FWD> 'MyKey' kind:struct
    #31: <FWD> 'MyValue' kind:struct
```

The fix is to enhance `visitMapDefType` to recursively visit inner
composite members. If a member is a composite type (likely a wrapper),
it is now also visited using `visitMapDefType`, ensuring that the
pointee types of the innermost stuct members, like `MyKey` and
`MyValue`, are fully resolved in BTF.

With this fix, the correct BTF entries are emitted:

```
    #6: <STRUCT> 'MyKey' sz:4 n:1
            #00 '__0' off:0 --> [7]
    #7: <INT> 'u32' bits:32 off:0
    #8: <PTR> --> [9]
    #9: <STRUCT> 'MyValue' sz:4 n:1
            #00 '__0' off:0 --> [7]
```

Fixes: #143361
---
 llvm/lib/Target/BPF/BTFDebug.cpp            |  17 ++-
 llvm/test/CodeGen/BPF/BTF/map-def-nested.ll | 117 ++++++++++++++++++++
 2 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/BPF/BTF/map-def-nested.ll

diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 1d7c6fc799ff..1e29a0f1e85a 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -976,11 +976,24 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
   if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl())
     return;
 
-  // Visit all struct members to ensure pointee type is visited
+  // Visit all struct members to ensure their types are visited.
   const DINodeArray Elements = CTy->getElements();
   for (const auto *Element : Elements) {
     const auto *MemberType = cast<DIDerivedType>(Element);
-    visitTypeEntry(MemberType->getBaseType());
+    const DIType *MemberBaseType = MemberType->getBaseType();
+
+    // If the member is a composite type, that may indicate the currently
+    // visited composite type is a wrapper, and the member represents the
+    // actual map definition.
+    // In that case, visit the member with `visitMapDefType` instead of
+    // `visitTypeEntry`, treating it specifically as a map definition rather
+    // than as a regular composite type.
+    const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType);
+    if (MemberCTy) {
+      visitMapDefType(MemberBaseType, TypeId);
+    } else {
+      visitTypeEntry(MemberBaseType);
+    }
   }
 
   // Visit this type, struct or a const/typedef/volatile/restrict type
diff --git a/llvm/test/CodeGen/BPF/BTF/map-def-nested.ll b/llvm/test/CodeGen/BPF/BTF/map-def-nested.ll
new file mode 100644
index 000000000000..ac04be371529
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/map-def-nested.ll
@@ -0,0 +1,117 @@
+; RUN: llc -mtriple=bpfel -mcpu=v3 -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF-SHORT %s
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+
+; Source code:
+;   struct key { int i; };
+;   struct val { int j; };
+;   
+;   #define __uint(name, val) int (*name)[val]
+;   #define __type(name, val) typeof(val) *name
+;   
+;   struct {
+;      struct {
+;           __uint(type, 1);
+;           __uint(max_entries, 1337);
+;           __type(key, struct key);
+;           __type(value, struct val);
+;       } map_def;
+;   } map __attribute__((section(".maps")));
+; Compilation flag:
+;   clang -target bpf -O2 -g -S -emit-llvm t.c
+
+; ModuleID = 'bpf.c'
+source_filename = "bpf.c"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpf"
+
+%struct.anon = type { %struct.anon.0 }
+%struct.anon.0 = type { ptr, ptr, ptr, ptr }
+
+@map = dso_local local_unnamed_addr global %struct.anon zeroinitializer, section ".maps", align 8, !dbg !0
+
+; We expect exactly 4 structs:
+; * key
+; * val
+; * inner map type (the actual definition)
+; * outer map type (the wrapper)
+;
+; CHECK-BTF-SHORT-COUNT-4: STRUCT
+; CHECK-BTF-SHORT-NOT:     STRUCT
+
+; We expect no forward declarations.
+;
+; CHECK-BTF-SHORT-NOT: FWD
+
+; Assert the whole BTF.
+;
+; CHECK-BTF:      [1] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [2] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [3] ARRAY '(anon)' type_id=2 index_type_id=4 nr_elems=1
+; CHECK-BTF-NEXT: [4] INT '__ARRAY_SIZE_TYPE__' size=4 bits_offset=0 nr_bits=32 encoding=(none)
+; CHECK-BTF-NEXT: [5] PTR '(anon)' type_id=6
+; CHECK-BTF-NEXT: [6] ARRAY '(anon)' type_id=2 index_type_id=4 nr_elems=1337
+; CHECK-BTF-NEXT: [7] PTR '(anon)' type_id=8
+;
+; Before bug https://github.com/llvm/llvm-project/issues/143361 was fixed, the
+; BTF kind of MyKey (#6) and MyValue (#9) would be FWD instead of STRUCT. The
+; main goal of this test is making sure that the full STRUCT BTF is generated
+; for these types.
+;
+; CHECK-BTF-NEXT: [8] STRUCT 'key' size=4 vlen=1
+; CHECK-BTF-NEXT:         'i' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: [9] PTR '(anon)' type_id=10
+; CHECK-BTF-NEXT: [10] STRUCT 'val' size=4 vlen=1
+; CHECK-BTF-NEXT:         'j' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: [11] STRUCT '(anon)' size=32 vlen=4
+; CHECK-BTF-NEXT:         'type' type_id=1 bits_offset=0
+; CHECK-BTF-NEXT:         'max_entries' type_id=5 bits_offset=64
+; CHECK-BTF-NEXT:         'key' type_id=7 bits_offset=128
+; CHECK-BTF-NEXT:         'value' type_id=9 bits_offset=192
+; CHECK-BTF-NEXT: [12] STRUCT '(anon)' size=32 vlen=1
+; CHECK-BTF-NEXT:         'map_def' type_id=11 bits_offset=0
+; CHECK-BTF-NEXT: [13] VAR 'map' type_id=12, linkage=global
+; CHECK-BTF-NEXT: [14] DATASEC '.maps' size=0 vlen=1
+; CHECK-BTF-NEXT:         type_id=13 offset=0 size=32
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!31, !32, !33, !34}
+!llvm.ident = !{!35}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "map", scope: !2, file: !3, line: 14, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 21.0.0git (git@github.com:llvm/llvm-project.git c935bd3798b39330aab2c9ca29a519457d5e5245)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "bpf.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "2330cce6d83c72ef5335abc3016de28e")
+!4 = !{!0}
+!5 = distinct !DICompositeType(tag: DW_TAG_structure_type, file: !3, line: 7, size: 256, elements: !6)
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_member, name: "map_def", scope: !5, file: !3, line: 13, baseType: !8, size: 256)
+!8 = distinct !DICompositeType(tag: DW_TAG_structure_type, scope: !5, file: !3, line: 8, size: 256, elements: !9)
+!9 = !{!10, !16, !21, !26}
+!10 = !DIDerivedType(tag: DW_TAG_member, name: "type", scope: !8, file: !3, line: 9, baseType: !11, size: 64)
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64)
+!12 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, size: 32, elements: !14)
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!14 = !{!15}
+!15 = !DISubrange(count: 1)
+!16 = !DIDerivedType(tag: DW_TAG_member, name: "max_entries", scope: !8, file: !3, line: 10, baseType: !17, size: 64, offset: 64)
+!17 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !18, size: 64)
+!18 = !DICompositeType(tag: DW_TAG_array_type, baseType: !13, size: 42784, elements: !19)
+!19 = !{!20}
+!20 = !DISubrange(count: 1337)
+!21 = !DIDerivedType(tag: DW_TAG_member, name: "key", scope: !8, file: !3, line: 11, baseType: !22, size: 64, offset: 128)
+!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
+!23 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "key", file: !3, line: 1, size: 32, elements: !24)
+!24 = !{!25}
+!25 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !23, file: !3, line: 1, baseType: !13, size: 32)
+!26 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !8, file: !3, line: 12, baseType: !27, size: 64, offset: 192)
+!27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64)
+!28 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "val", file: !3, line: 2, size: 32, elements: !29)
+!29 = !{!30}
+!30 = !DIDerivedType(tag: DW_TAG_member, name: "j", scope: !28, file: !3, line: 2, baseType: !13, size: 32)
+!31 = !{i32 7, !"Dwarf Version", i32 5}
+!32 = !{i32 2, !"Debug Info Version", i32 3}
+!33 = !{i32 1, !"wchar_size", i32 4}
+!34 = !{i32 7, !"frame-pointer", i32 2}
+!35 = !{!"clang version 21.0.0git (git@github.com:llvm/llvm-project.git c935bd3798b39330aab2c9ca29a519457d5e5245)"}

From adc6228ea07eba401481e218c3e0536a4aa6b8ec Mon Sep 17 00:00:00 2001
From: Charitha Saumya <136391709+charithaintc@users.noreply.github.com>
Date: Fri, 20 Jun 2025 10:43:19 -0700
Subject: [PATCH 1088/1322] [mlir][xegpu] Refine layout assignment in XeGPU
 SIMT distribution. (#142687)

Changes:
* Decouple layout propagation from subgroup distribution and move it to
an independent pass.
* Refine layout assignment to handle control-flow ops correctly (scf.for, scf.while).
* Refine test cases.
---
 .../mlir/Dialect/XeGPU/Transforms/Passes.td   |  19 +-
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     |  14 +
 .../Dialect/XeGPU/Transforms/CMakeLists.txt   |   1 +
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 889 +++++++++++++++++
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 921 ++----------------
 mlir/test/Dialect/XeGPU/propagate-layout.mlir | 430 ++++++++
 .../Dialect/XeGPU/subgroup-distribute.mlir    | 280 ++++++
 .../Dialect/XeGPU/subgroup-distribution.mlir  | 275 ------
 .../XeGPU/subgroup-map-propagation.mlir       | 622 ------------
 9 files changed, 1725 insertions(+), 1726 deletions(-)
 create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
 create mode 100644 mlir/test/Dialect/XeGPU/propagate-layout.mlir
 create mode 100644 mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
 delete mode 100644 mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
 delete mode 100644 mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 8bdf19ac0e47..3a88dae041dd 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -27,10 +27,23 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
   }];
   let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
                            "vector::VectorDialect"];
+}
+
+def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
+  let summary = "Propagate and assign XeGPU layout information";
+  let description = [{
+    This pass propagates the XeGPU layout information accross ops. Starting
+    from a set of anchor operations (e.g. `dpas`, `store_nd`), this will
+    propagate the layouts required for their operands to the producers. With
+    this propagated layout information, pass will then update op result type
+    with the layout information.
+  }];
+  let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
+                           "vector::VectorDialect"];
   let options = [Option<
-      "printOnly", "print-analysis-only", "bool",
-      /*default=*/"false",
-      "Print the result of the subgroup map propagation analysis and exit.">];
+    "printOnly", "print-analysis-only", "bool",
+    /*default=*/"false",
+    "Print the result of layout propagation analysis and exit.">];
 }
 
 def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 6fea10185402..772cf7364964 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -24,6 +24,20 @@ class LayoutAttr;
 class TensorDescType;
 } // namespace xegpu
 
+namespace xegpu {
+/// HW dependent constants.
+/// TODO: These constants should be queried from the target information.
+namespace targetinfo {
+constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
+/// If DPAS A or B operands have low precision element types they must be packed
+/// according to the following sizes.
+constexpr unsigned packedSizeInBitsForDefault =
+    16; // Minimum packing size per register for DPAS A.
+constexpr unsigned packedSizeInBitsForDpasB =
+    32; // Minimum packing size per register for DPAS B.
+} // namespace targetinfo
+} // namespace xegpu
+
 namespace xegpu {
 
 /// Flatten a set of ValueRange into a single SmallVector<Value>
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index af0d7f6bd907..9c178d1d8564 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -4,6 +4,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUSubgroupDistribute.cpp
   XeGPUUnroll.cpp
   XeGPUWgToSgDistribute.cpp
+  XeGPUPropagateLayout.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
new file mode 100644
index 000000000000..cc22d2bbd8c3
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -0,0 +1,889 @@
+//===- XeGPUPropagateLayout.cpp - XeGPU Layout Propagation ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
+#include "mlir/Analysis/DataFlow/Utils.h"
+#include "mlir/Analysis/DataFlowFramework.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InterleavedRange.h"
+#include "llvm/Support/LogicalResult.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mlir {
+namespace xegpu {
+#define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+} // namespace xegpu
+} // namespace mlir
+
+#define DEBUG_TYPE "xegpu-propagate-layout"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
+using namespace mlir;
+using namespace mlir::dataflow;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Layout
+//===----------------------------------------------------------------------===//
+
+/// Helper class to store the ND layout of lanes within a subgroup and data
+/// owned by each lane.
+struct Layout {
+  SmallVector<int64_t, 3> layout;
+  Layout() = default;
+  Layout(std::initializer_list<int64_t> list) : layout(list) {}
+  void print(llvm::raw_ostream &os) const;
+  size_t size() const { return layout.size(); }
+};
+
+void Layout::print(llvm::raw_ostream &os) const {
+  os << llvm::interleaved_array(layout);
+}
+
+/// LaneLayout represents the logical layout of lanes within a subgroup when it
+/// accesses some value. LaneData represents the logical layout of data owned by
+/// each work item.
+using LaneLayout = Layout;
+using LaneData = Layout;
+
+//===----------------------------------------------------------------------===//
+// LayoutInfo
+//===----------------------------------------------------------------------===//
+
+/// Helper class for tracking the analysis state of an mlir value. For layout
+/// propagation, the analysis state is simply the lane_layout and lane_data of
+/// each value. Purpose of this analysis to propagate some unique layout for
+/// each value in the program starting from a set of anchor operations (like
+/// DPAS, StoreNd, etc.).
+///
+/// Given this, LayoutInfo  satisifies the following properties:
+///  1) A LayoutInfo value can be in one of two states - `assigned` or `not
+///  assigned`.
+///  2) Two LayoutInfo values are equal if they are both assigned or
+///  both not assigned. The concrete value of assigned state does not matter.
+///  3) The meet operator works as follows:
+///     - If current state is assigned, return the current state. (already
+///     a unique layout is assigned. don't change it)
+///     - Otherwise, return the other state.
+
+struct LayoutInfo {
+private:
+  LaneLayout laneLayout;
+  LaneData laneData;
+  xegpu::LayoutAttr layoutAttr;
+
+public:
+  LayoutInfo() = default;
+  LayoutInfo(const LaneLayout &layout, const LaneData &data)
+      : laneLayout(layout), laneData(data) {}
+
+  // Two lattice values are equal if they have `some` layout. The actual
+  // content of the layout does not matter.
+  bool operator==(const LayoutInfo &other) const {
+    return this->isAssigned() == other.isAssigned();
+  }
+
+  static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs);
+
+  static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs);
+
+  void print(raw_ostream &os) const;
+
+  bool isAssigned() const {
+    return laneLayout.size() > 0 && laneData.size() > 0;
+  }
+
+  LayoutInfo getTransposedLayout(ArrayRef<int64_t> permutation) const;
+
+  const LaneLayout &getLayout() const { return laneLayout; }
+  const LaneData &getData() const { return laneData; }
+  ArrayRef<int64_t> getLayoutAsArrayRef() const { return laneLayout.layout; }
+  ArrayRef<int64_t> getDataAsArrayRef() const { return laneData.layout; }
+};
+
+void LayoutInfo::print(raw_ostream &os) const {
+  if (isAssigned()) {
+    os << "lane_layout: ";
+    laneLayout.print(os);
+    os << ", lane_data: ";
+    laneData.print(os);
+  } else {
+    os << "Not assigned.";
+  }
+}
+
+LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
+  if (!lhs.isAssigned())
+    return rhs;
+  return lhs;
+}
+
+/// Since this is a backward analysis, join method is not used.
+LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
+  llvm_unreachable("Join should not be triggered by layout propagation.");
+}
+
+/// Get the transposed layout according to the given permutation.
+LayoutInfo
+LayoutInfo::getTransposedLayout(ArrayRef<int64_t> permutation) const {
+  if (!isAssigned())
+    return {};
+  LaneLayout newLayout;
+  LaneData newData;
+  for (int64_t idx : permutation) {
+    newLayout.layout.push_back(laneLayout.layout[idx]);
+    newData.layout.push_back(laneData.layout[idx]);
+  }
+  return LayoutInfo(newLayout, newData);
+}
+
+//===----------------------------------------------------------------------===//
+// LayoutInfoLattice
+//===----------------------------------------------------------------------===//
+
+/// Lattice holding the LayoutInfo for each value.
+struct LayoutInfoLattice : public Lattice<LayoutInfo> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice)
+  using Lattice::Lattice;
+};
+
+/// Helper Functions to get default layouts. A `default layout` is a layout that
+/// is assigned to a value when the layout is not fixed by some anchor operation
+/// (like DPAS).
+
+/// Helper Function to get the default layout for uniform values like constants.
+/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
+/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
+static LayoutInfo getDefaultSIMTLayoutInfo(unsigned rank) {
+  assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
+  if (rank == 1)
+    return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize}),
+                      LaneData({1}));
+  return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}),
+                    LaneData({1, 1}));
+}
+
+/// Helper to get the default layout for a vector type.
+static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy) {
+  // Expecting a 1D or 2D vector.
+  assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
+         "Expected 1D or 2D vector.");
+  // Expecting int or float element type.
+  assert(vectorTy.getElementType().isIntOrFloat() &&
+         "Expected int or float element type.");
+  // If the rank is 1, then return default layout for 1D vector.
+  if (vectorTy.getRank() == 1)
+    return getDefaultSIMTLayoutInfo(1);
+  // Packing factor is determined by the element type bitwidth.
+  int packingFactor = 1;
+  unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
+  if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
+    packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth;
+  return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}),
+                    LaneData({1, packingFactor}));
+}
+
+/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
+/// is set according to the following criteria:
+/// * For A operand, the data must be packed in minimum
+/// `packedSizeInBitsForDefault`
+/// * For B operand, the data must be packed in minimum
+/// `packedSizeInBitsForDpasB`
+static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy,
+                                                  unsigned operandNum) {
+  Type elementTy = vectorTy.getElementType();
+  assert(elementTy.isIntOrFloat() &&
+         "Expected int or float type in DPAS operands");
+  LaneLayout layout({1, xegpu::targetinfo::subgroupSize});
+  // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
+  // must have the VNNI format.
+  if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() <
+                             xegpu::targetinfo::packedSizeInBitsForDpasB) {
+    LaneData data({xegpu::targetinfo::packedSizeInBitsForDpasB /
+                       elementTy.getIntOrFloatBitWidth(),
+                   1});
+    return LayoutInfo(layout, data);
+  }
+  // Otherwise, return the default layout for the vector type.
+  return getDefaultSIMTLayoutInfo(vectorTy);
+}
+
+//===----------------------------------------------------------------------===//
+// LayoutInfoPropagation
+//===----------------------------------------------------------------------===//
+
+/// Backward data flow analysis to propagate the lane_layout and lane_data of
+/// each value in the program. Currently, the layouts for operands DPAS,
+/// StoreNd, and StoreScatter are fixed (known before propagation). Purpose of
+/// this analysis is to propagate those known layouts to all their producers and
+/// (other) consumers.
+class LayoutInfoPropagation
+    : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
+private:
+  void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+                   ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitStoreNdOp(xegpu::StoreNdOp store,
+                      ArrayRef<LayoutInfoLattice *> operands,
+                      ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter,
+                           ArrayRef<LayoutInfoLattice *> operands,
+                           ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitLoadNdOp(xegpu::LoadNdOp load,
+                     ArrayRef<LayoutInfoLattice *> operands,
+                     ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitLoadGatherOp(xegpu::LoadGatherOp load,
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitTransposeOp(vector::TransposeOp transpose,
+                        ArrayRef<LayoutInfoLattice *> operands,
+                        ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitVectorBitcastOp(vector::BitCastOp bitcast,
+                            ArrayRef<LayoutInfoLattice *> operands,
+                            ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitCreateDescOp(xegpu::CreateDescOp createDesc,
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
+                             ArrayRef<LayoutInfoLattice *> operands,
+                             ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch,
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
+                                   ArrayRef<LayoutInfoLattice *> operands,
+                                   ArrayRef<const LayoutInfoLattice *> results);
+
+public:
+  LayoutInfoPropagation(DataFlowSolver &solver,
+                        SymbolTableCollection &symbolTable)
+      : SparseBackwardDataFlowAnalysis(solver, symbolTable) {}
+  using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;
+
+  LogicalResult
+  visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+                 ArrayRef<const LayoutInfoLattice *> results) override;
+
+  void visitBranchOperand(OpOperand &operand) override {};
+
+  void visitCallOperand(OpOperand &operand) override {};
+
+  void visitExternalCall(CallOpInterface call,
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results) override {
+  };
+
+  void setToExitState(LayoutInfoLattice *lattice) override {
+    (void)lattice->meet(LayoutInfo());
+  }
+};
+} // namespace
+
+LogicalResult LayoutInfoPropagation::visitOperation(
+    Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  TypeSwitch<Operation *>(op)
+      .Case<xegpu::DpasOp>(
+          [&](auto dpasOp) { visitDpasOp(dpasOp, operands, results); })
+      .Case<xegpu::StoreNdOp>(
+          [&](auto storeNdOp) { visitStoreNdOp(storeNdOp, operands, results); })
+      .Case<xegpu::StoreScatterOp>([&](auto storeScatterOp) {
+        visitStoreScatterOp(storeScatterOp, operands, results);
+      })
+      .Case<xegpu::LoadNdOp>(
+          [&](auto loadNdOp) { visitLoadNdOp(loadNdOp, operands, results); })
+      .Case<xegpu::LoadGatherOp>([&](auto loadGatherOp) {
+        visitLoadGatherOp(loadGatherOp, operands, results);
+      })
+      .Case<xegpu::CreateDescOp>([&](auto createDescOp) {
+        visitCreateDescOp(createDescOp, operands, results);
+      })
+      .Case<xegpu::UpdateNdOffsetOp>([&](auto updateNdOffsetOp) {
+        visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
+      })
+      .Case<xegpu::PrefetchNdOp>([&](auto prefetchNdOp) {
+        visitPrefetchNdOp(prefetchNdOp, operands, results);
+      })
+      .Case<vector::TransposeOp>([&](auto transposeOp) {
+        visitTransposeOp(transposeOp, operands, results);
+      })
+      .Case<vector::BitCastOp>([&](auto bitcastOp) {
+        visitVectorBitcastOp(bitcastOp, operands, results);
+      })
+      .Case<vector::MultiDimReductionOp>([&](auto reductionOp) {
+        visitVectorMultiReductionOp(reductionOp, operands, results);
+      })
+      // All other ops.
+      .Default([&](Operation *op) {
+        for (const LayoutInfoLattice *resultInfo : results) {
+          if (!resultInfo->getValue().isAssigned())
+            continue;
+          for (auto [operandInfo, operand] :
+               llvm::zip(operands, op->getOpOperands())) {
+            // If the operand type is not a vector or tensor descriptor, skip
+            // it.
+            if (!isa<xegpu::TensorDescType, VectorType>(
+                    operand.get().getType()))
+              continue;
+            // Propagate the result layout to the operand.
+            meet(operandInfo, *resultInfo);
+          }
+        }
+      });
+
+  return success();
+}
+
+void LayoutInfoPropagation::visitPrefetchNdOp(
+    xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Here we assign the default layout to the tensor descriptor operand of
+  // prefetch.
+  auto tdescTy = prefetch.getTensorDescType();
+  auto prefetchLayout = getDefaultSIMTLayoutInfo(
+      VectorType::get(tdescTy.getShape(), tdescTy.getElementType()));
+  // Propagate the layout to the source tensor descriptor.
+  propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
+}
+
+void LayoutInfoPropagation::visitVectorMultiReductionOp(
+    vector::MultiDimReductionOp reduction,
+    ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // The layout of the result must be present.
+  LayoutInfo resultLayout = results[0]->getValue();
+  if (!resultLayout.isAssigned())
+    return;
+  // We only consider 2D -> 1D reductions at this point.
+  VectorType resultTy = llvm::dyn_cast<VectorType>(reduction.getDestType());
+  if (!resultTy || resultTy.getRank() != 1) {
+    reduction.emitWarning("Expecting output type to be 1D vector.");
+    return;
+  }
+  // Given that the result is 1D, the layout of the operand should be 2D with
+  // default layout.
+  LayoutInfo operandLayout = getDefaultSIMTLayoutInfo(2);
+  propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
+  // Accumulator should have the same layout as the result.
+  propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
+}
+
+/// Propagate the layout of the result tensor to the source tensor descriptor in
+/// UpdateNdOffsetOp.
+void LayoutInfoPropagation::visitUpdateNdOffsetOp(
+    xegpu::UpdateNdOffsetOp updateNdOffset,
+    ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // The layout of the result must be present.
+  LayoutInfo resultLayout = results[0]->getValue();
+  if (!resultLayout.isAssigned())
+    return;
+  // Propagate the layout to the source operand.
+  propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
+}
+
+/// Set the layouts for DPAS A, B, and C operands.
+void LayoutInfoPropagation::visitDpasOp(
+    xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  VectorType aTy = dpas.getLhsType();
+  VectorType bTy = dpas.getRhsType();
+  propagateIfChanged(
+      operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0)));
+  propagateIfChanged(
+      operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1)));
+  if (operands.size() > 2) {
+    VectorType cTy = dpas.getAccType();
+    propagateIfChanged(
+        operands[2],
+        operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2)));
+  }
+}
+
+/// Set the layout for the value and tensor descriptor operands in StoreNdOp.
+void LayoutInfoPropagation::visitStoreNdOp(
+    xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType());
+  // Both operands should have the same layout
+  for (LayoutInfoLattice *operand : operands)
+    propagateIfChanged(operand, operand->meet(storeLayout));
+}
+
+/// Propagate the layout of the value to the tensor descriptor operand in
+/// LoadNdOp.
+void LayoutInfoPropagation::visitLoadNdOp(
+    xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo valueLayout = results[0]->getValue();
+  // Need the layout of the value to propagate to the tensor descriptor.
+  if (!valueLayout.isAssigned())
+    return;
+  LayoutInfo tensorDescLayout = valueLayout;
+  // LoadNdOp has the transpose effect. However, at the stage of this analysis
+  // this effect is not expected and should be abstracted away. Emit a
+  // warning.
+  if (auto transpose = load.getTranspose()) {
+    load.emitWarning("Transpose effect is not expected for LoadNdOp at "
+                     "LayoutInfoPropagation stage.");
+    tensorDescLayout = valueLayout.getTransposedLayout(transpose.value());
+  }
+  // Propagate the new layout to the tensor descriptor operand.
+  propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
+}
+
+/// For vector::TransposeOp, the layout of the result is transposed and
+/// propagated to the operand.
+void LayoutInfoPropagation::visitTransposeOp(
+    vector::TransposeOp transpose, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Need the layout of transpose result to propagate to the operands.
+  LayoutInfo resultLayout = results[0]->getValue();
+  if (!resultLayout.isAssigned())
+    return;
+  LayoutInfo newLayout =
+      resultLayout.getTransposedLayout(transpose.getPermutation());
+  // Propagate the new layout to the vector operand.
+  propagateIfChanged(operands[0], operands[0]->meet(newLayout));
+}
+
+/// For vector::BitCastOp, the lane_data of the source layout is changed based
+/// on the bit width of the source and result types.
+void LayoutInfoPropagation::visitVectorBitcastOp(
+    vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Need the layout of bitcast result to propagate to the operands.
+  LayoutInfo resultLayout = results[0]->getValue();
+  if (!resultLayout.isAssigned())
+    return;
+  int inElemTyBitWidth =
+      bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
+  int outElemTyBitWidth =
+      bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
+
+  // NOTE: We do not expect widening or narrowing bitcasts at this stage. Emit
+  // a warning and return.
+  if (inElemTyBitWidth != outElemTyBitWidth) {
+    bitcast.emitWarning("Widening or narrowing bitcasts are not expected at "
+                        "layout propagation stage.");
+    return;
+  }
+
+  propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
+}
+
+/// Propagate the layout of the result to the tensor descriptor and mask
+/// operands in LoadGatherOp.
+void LayoutInfoPropagation::visitLoadGatherOp(
+    xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo valueLayout = results[0]->getValue();
+  // Need the layout of the value to propagate to the tensor descriptor.
+  if (!valueLayout.isAssigned())
+    return;
+
+  LayoutInfo tensorDescLayout = valueLayout;
+  if (load.getTranspose()) {
+    // LoadGatherOp has the transpose effect. However, at the stage of this
+    // analyis this effect is not expected and should be abstracted away. Emit
+    // a warning.
+    load.emitWarning("Transpose effect is not expected for LoadGatherOp at "
+                     "LayoutInfoPropagation stage.");
+    tensorDescLayout = valueLayout.getTransposedLayout({1, 0});
+  }
+  // Mask operand should have 1D default layout.
+  LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1);
+  // Propagate the new layout to the tensor descriptor operand.
+  propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
+  // Propagate the new layout to the mask operand.
+  propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
+}
+
+/// Propagate the layout of the descriptor to the vector offset operand in
+/// CreateDescOp.
+void LayoutInfoPropagation::visitCreateDescOp(
+    xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo descLayout = results[0]->getValue();
+  // Need the layout of the descriptor to propagate to the operands.
+  if (!descLayout.isAssigned())
+    return;
+  // For offset operand propagate 1D default layout.
+  LayoutInfo layout = getDefaultSIMTLayoutInfo(1);
+  propagateIfChanged(operands[1], operands[1]->meet(layout));
+}
+
+/// Set the layout for the value, tensor descriptor, and mask operands in the
+/// StoreScatterOp.
+void LayoutInfoPropagation::visitStoreScatterOp(
+    xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Currently, for 2D StoreScatterOp we expect that the height dimension of
+  // the tensor descriptor is equal to the subgroup size. This is ensured by
+  // the op verifier.
+  ArrayRef<int64_t> tdescShape = storeScatter.getTensorDescType().getShape();
+  if (tdescShape.size() > 1)
+    assert(
+        tdescShape[0] == xegpu::targetinfo::subgroupSize &&
+        "Expected the first dimension of 2D tensor descriptor to be equal to "
+        "subgroup size.");
+
+  LayoutInfo valueLayout =
+      getDefaultSIMTLayoutInfo(storeScatter.getValueType());
+  LayoutInfo storeScatterLayout = valueLayout;
+  if (storeScatter.getTranspose()) {
+    // StoreScatteOp allows transpose effect. However, at the stage of this
+    // analyis this effect is not expected and should be abstracted away. Emit
+    // a warning.
+    storeScatter.emitWarning("Transpose effect is not expected for "
+                             "StoreScatterOp at LayoutInfoPropagation stage.");
+    storeScatterLayout = valueLayout.getTransposedLayout({1, 0});
+  }
+  // Propagate the value layout.
+  propagateIfChanged(operands[0], operands[0]->meet(valueLayout));
+  // Propagate the tensor descriptor layout.
+  propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout));
+  // Use default 1D layout for mask operand.
+  LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1);
+  propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
+}
+
+namespace {
+//===----------------------------------------------------------------------===//
+// RunLayoutInfoPropagation
+//===----------------------------------------------------------------------===//
+
+/// Driver class for running the LayoutInfoPropagation analysis.
+class RunLayoutInfoPropagation {
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)
+
+  RunLayoutInfoPropagation(Operation *op) : target(op) {
+    SymbolTableCollection symbolTable;
+    loadBaselineAnalyses(solver);
+    solver.load<LayoutInfoPropagation>(symbolTable);
+    (void)solver.initializeAndRun(op);
+  }
+
+  LayoutInfo getLayoutInfo(Value val);
+
+  void printAnalysisResult(llvm::raw_ostream &os);
+
+private:
+  DataFlowSolver solver;
+  const Operation *target;
+};
+} // namespace
+
+LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
+  auto *state = solver.lookupState<LayoutInfoLattice>(val);
+  if (!state)
+    return {};
+  return state->getValue();
+}
+
+// Print the analysis result for debugging purposes.
+void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
+  auto printFunctionResult = [&](FunctionOpInterface funcOp) {
+    os << "function: " << funcOp.getName() << ":\n";
+    // Function arguments
+    for (BlockArgument arg : funcOp.getArguments()) {
+      LayoutInfo layout = getLayoutInfo(arg);
+      os << "argument: " << arg << "\n";
+      os << "layout  : ";
+      layout.print(os);
+      os << "\n";
+    }
+    // Function ops
+    funcOp.walk([&](Operation *op) {
+      // Skip ops that do not have results
+      if (op->getResults().empty())
+        return;
+      os << "op    : ";
+      // For control-flow ops, print the op name only.
+      if (isa<BranchOpInterface>(op) || isa<RegionBranchOpInterface>(op))
+        os << op->getName();
+      else
+        op->print(os);
+      os << "\n";
+      // Print the layout for each result.
+      for (auto [i, r] : llvm::enumerate(op->getResults())) {
+        LayoutInfo layout = getLayoutInfo(r);
+        os << "layout for result #" << i << ": ";
+        layout.print(os);
+        os << "\n";
+      }
+    });
+  };
+
+  SmallVector<FunctionOpInterface> funcOps;
+  if (auto modOp = dyn_cast<ModuleOp>(target)) {
+    for (auto funcOp : modOp.getOps<FunctionOpInterface>())
+      funcOps.push_back(funcOp);
+
+    // Collect all GpuFuncOps in the module.
+    for (auto gpuModOp : modOp.getOps<gpu::GPUModuleOp>()) {
+      for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>())
+        funcOps.push_back(gpuFuncOp);
+    }
+  }
+  // Print the analysis result for each function.
+  for (FunctionOpInterface funcOp : funcOps)
+    printFunctionResult(funcOp);
+}
+
+using GetLayoutFnTy = function_ref<xegpu::LayoutAttr(Value)>;
+/// Update an operation with the layout of its results. If the result type is a
+/// vector type, a temporary layout attribute is added to the operation. If the
+/// result type is a tensor descriptor type, the type is updated with the layout
+/// attribute. The users of the result are also updated with the layout
+/// attribute.
+static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
+                              GetLayoutFnTy getLayoutOfValue) {
+  // Region ops (like scf.for) are already handled by the updateControlFlowOps.
+  if (mlir::isa<mlir::RegionBranchOpInterface>(op))
+    return success();
+
+  // Iterate over all the results.
+  for (OpResult result : op->getResults()) {
+    Type resultType = result.getType();
+    // Layouts are needed only for vector and tensor descriptor types.
+    if (!isa<VectorType, xegpu::TensorDescType>(resultType))
+      continue;
+    // If the result has no layout but has users, emit a warning and continue.
+    xegpu::LayoutAttr layout = getLayoutOfValue(result);
+    if (!layout && result.getNumUses() > 0) {
+      op->emitWarning("op has users but no layout assigned for its result");
+      continue;
+    }
+    // If the result is a tensor descriptor type, update the tensor desc type
+    // with layout.
+    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
+      auto typeWithLayout = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+      result.setType(typeWithLayout);
+      continue;
+    }
+    // If the result is a vector type, add a temporary layout attribute to the
+    // op.
+    xegpu::setLayoutAttr(result, layout);
+  }
+  return success();
+}
+
+/// Region ops like scf.for need special handling because they have blocks
+/// inside. If the blocks have tensor descriptor type as block arguments, thier
+/// types must be updated. Also region op can have results that may not have any
+/// users (e.g. A and B tiles). They are not assigned a layout by layout
+/// analysis because they have no users. However inside the region op
+/// corresponding block arguments for these results do have layouts. Therefore,
+/// in this case we still need to update the result types with the layout
+/// attribute. This function function updates the internal block arguments and
+/// the result types of the region op with the assigned layouts.
+/// clang-format off
+/// Example: scf.for ... iter_args(...) -> (out types) {
+///   ^bb0(block types):
+///     ...
+///   scf.yield ... : (yield types)
+/// }
+/// clang-format on
+/// In this example, at scf.yield, control-flow can transfer to two successor
+/// regions. One is the ^bb0 (for loop body) and the other is the scf.for op
+/// itself (yield the results). So we update both the block arguments of the
+/// successor region (i.e. block types) and the result types of the scf.for op
+/// (i.e. out types). Note that yield types are updated by respective producers
+/// inside bb0.
+static LogicalResult
+updateControlFlowOps(mlir::OpBuilder &builder,
+                     mlir::RegionBranchTerminatorOpInterface terminator,
+                     GetLayoutFnTy getLayoutOfValue) {
+  // Only process if the terminator is inside a region branch op.
+  if (!mlir::isa<mlir::RegionBranchOpInterface>(terminator->getParentOp()))
+    return success();
+
+  llvm::SmallVector<mlir::RegionSuccessor> successors;
+  llvm::SmallVector<mlir::Attribute> operands(terminator->getNumOperands(),
+                                              nullptr);
+  terminator.getSuccessorRegions(operands, successors);
+
+  for (mlir::RegionSuccessor &successor : successors) {
+    mlir::OperandRange successorOperands =
+        terminator.getSuccessorOperands(successor);
+    mlir::ValueRange successorInputs = successor.getSuccessorInputs();
+    for (auto [successorOperand, successorInput] :
+         llvm::zip(successorOperands, successorInputs)) {
+      Type inputType = successorInput.getType();
+      // We only need to operate on tensor descriptor or vector types.
+      if (!isa<xegpu::TensorDescType, VectorType>(inputType))
+        continue;
+      xegpu::LayoutAttr successorInputLayout = getLayoutOfValue(successorInput);
+      xegpu::LayoutAttr successorOperandLayout =
+          getLayoutOfValue(successorOperand);
+
+      // If either of the layouts is not assigned, we cannot proceed.
+      if (!successorOperandLayout) {
+        LLVM_DEBUG(
+            DBGS()
+            << "No layout assigned for forwarded operand in branch terminator: "
+            << successorOperand << "\n");
+        return failure();
+      }
+      // We expect the layouts to match.
+      if (successorInputLayout &&
+          successorInputLayout != successorOperandLayout) {
+        LLVM_DEBUG(DBGS() << "Conflicting layouts for region argument and "
+                             "operand forwarded as the argument: "
+                          << successorInputLayout << " vs "
+                          << successorOperandLayout << "\n");
+        return failure();
+      }
+      // Get tensor descriptor type with the layout.
+      if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(inputType)) {
+        auto newTdescTy = xegpu::TensorDescType::get(
+            tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
+            tdescTy.getEncoding(), successorOperandLayout);
+        successorInput.setType(newTdescTy);
+        continue;
+      }
+      // If the type is a vector type and this region argument is an OpResult,
+      // set the layout attribute on the OpResult.
+      if (auto result = dyn_cast<OpResult>(successorInput))
+        xegpu::setLayoutAttr(result, successorOperandLayout);
+    }
+  }
+  return success();
+}
+
+/// Update the function arguments and results with the layouts.
+static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder,
+                                               mlir::FunctionOpInterface funcOp,
+                                               GetLayoutFnTy getLayoutOfValue) {
+  SmallVector<Type> newArgTypes;
+  // Update the function arguments.
+  for (BlockArgument arg : funcOp.getArguments()) {
+    Type argType = arg.getType();
+    newArgTypes.push_back(argType);
+    if (!isa<VectorType, xegpu::TensorDescType>(argType))
+      continue;
+    xegpu::LayoutAttr layout = getLayoutOfValue(arg);
+    if (!layout) {
+      LLVM_DEBUG(DBGS() << "Expecting layout for function argument: " << arg
+                        << " but got none.\n");
+      return failure();
+    }
+    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(argType)) {
+      auto newTdescTy = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+      arg.setType(newTdescTy);
+      newArgTypes.back() = newTdescTy;
+    }
+  }
+  // Update the function type with the new argument types.
+  // NOTE: We assume that function results are not expected to have layouts.
+  funcOp.setType(FunctionType::get(funcOp.getContext(), newArgTypes,
+                                   funcOp.getResultTypes()));
+  return success();
+}
+
+namespace {
+struct XeGPUPropagateLayoutPass final
+    : public xegpu::impl::XeGPUPropagateLayoutBase<XeGPUPropagateLayoutPass> {
+  XeGPUPropagateLayoutPass() = default;
+  XeGPUPropagateLayoutPass(const XeGPUPropagateLayoutPass &other) = default;
+  XeGPUPropagateLayoutPass(xegpu::XeGPUPropagateLayoutOptions options)
+      : XeGPUPropagateLayoutBase(options) {}
+  void runOnOperation() override;
+};
+
+} // namespace
+
+void XeGPUPropagateLayoutPass::runOnOperation() {
+  auto &analysis = getAnalysis<RunLayoutInfoPropagation>();
+  // Print the analysis result and exit. (for debugging purposes)
+  if (printOnly) {
+    auto &os = llvm::outs();
+    analysis.printAnalysisResult(os);
+    return;
+  }
+  // Helper to convert LayoutInfo to xegpu::LayoutAttr.
+  auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
+    LayoutInfo layout = analysis.getLayoutInfo(val);
+    if (!layout.isAssigned())
+      return {};
+    return xegpu::LayoutAttr::get(
+        val.getContext(), llvm::to_vector_of<int>(layout.getLayoutAsArrayRef()),
+        llvm::to_vector_of<int>(layout.getDataAsArrayRef()));
+  };
+
+  mlir::OpBuilder builder(&getContext());
+  Operation *op = getOperation();
+  auto walkResult = op->walk([&](mlir::Block *block) -> WalkResult {
+    for (mlir::Operation &op : llvm::reverse(block->getOperations())) {
+      LogicalResult r = success();
+      TypeSwitch<Operation *>(&op)
+          .Case<mlir::RegionBranchTerminatorOpInterface>(
+              [&](mlir::RegionBranchTerminatorOpInterface branchTermOp) {
+                r = updateControlFlowOps(builder, branchTermOp,
+                                         getXeGPULayoutForValue);
+              })
+          .Case<mlir::FunctionOpInterface>(
+              [&](mlir::FunctionOpInterface funcOp) {
+                r = updateFunctionOpInterface(builder, funcOp,
+                                              getXeGPULayoutForValue);
+              })
+          .Default([&](Operation *op) {
+            r = updateOp(builder, op, getXeGPULayoutForValue);
+          });
+      if (failed(r)) {
+        op.emitError("Failed to update operation with the layout.");
+        return WalkResult::interrupt();
+      }
+    }
+    return WalkResult::advance();
+  });
+  if (walkResult.wasInterrupted()) {
+    signalPassFailure();
+    return;
+  }
+}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 66d21dbdaf06..dabcae0bfe4b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -5,9 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
-#include "mlir/Analysis/DataFlow/Utils.h"
-#include "mlir/Analysis/DataFlowFramework.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -29,15 +26,13 @@
 #include "mlir/IR/Value.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/TypeSwitch.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/InterleavedRange.h"
-#include "llvm/Support/raw_ostream.h"
 
 namespace mlir {
 namespace xegpu {
@@ -50,788 +45,11 @@ namespace xegpu {
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 
 using namespace mlir;
-using namespace mlir::dataflow;
 
-/// HW dependent constants.
-/// TODO: These constants should be queried from the target information.
-constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
-/// If DPAS A or B operands have low precision element types they must be packed
-/// according to the following sizes.
-constexpr unsigned packedSizeInBitsForDefault =
-    16; // Minimum packing size per register for DPAS A.
-constexpr unsigned packedSizeInBitsForDpasB =
-    32; // Minimum packing size per register for DPAS B.
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-// Layout
-//===----------------------------------------------------------------------===//
-
-/// Helper class to store the ND layout of lanes within a subgroup and data
-/// owned by each lane.
-struct Layout {
-  SmallVector<int64_t, 3> layout;
-  Layout() = default;
-  Layout(std::initializer_list<int64_t> list) : layout(list) {}
-  void print(llvm::raw_ostream &os) const;
-  size_t size() const { return layout.size(); }
-  int64_t operator[](size_t idx) const;
-};
-
-void Layout::print(llvm::raw_ostream &os) const {
-  os << llvm::interleaved_array(layout);
-}
-
-int64_t Layout::operator[](size_t idx) const {
-  assert(idx < layout.size() && "Index out of bounds.");
-  return layout[idx];
-}
-
-/// LaneLayout represents the logical layout of lanes within a subgroup when it
-/// accesses some value. LaneData represents the logical layout of data owned by
-/// each work item.
-using LaneLayout = Layout;
-using LaneData = Layout;
-
-//===----------------------------------------------------------------------===//
-// LayoutInfo
-//===----------------------------------------------------------------------===//
-
-/// Helper class for tracking the analysis state of an mlir value. For layout
-/// propagation, the analysis state is simply the lane_layout and lane_data of
-/// each value. Purpose of this analysis to propagate some unique layout for
-/// each value in the program starting from a set of anchor operations (like
-/// DPAS, StoreNd, etc.).
-///
-/// Given this, LayoutInfo  satisifies the following properties:
-///  1) A LayoutInfo value can be in one of two states - `assigned` or `not
-///  assigned`.
-///  2) Two LayoutInfo values are equal if they are both assigned or
-///  both not assigned. The concrete value of assigned state does not matter.
-///  3) The meet operator works as follows:
-///     - If current state is assigned, return the current state. (already
-///     a unique layout is assigned. don't change it)
-///     - Otherwise, return the other state.
-
-struct LayoutInfo {
-private:
-  LaneLayout laneLayout;
-  LaneData laneData;
-
-public:
-  LayoutInfo() = default;
-  LayoutInfo(const LaneLayout &layout, const LaneData &data)
-      : laneLayout(layout), laneData(data) {}
-
-  // Two lattice values are equal if they have `some` layout. The actual
-  // content of the layout does not matter.
-  bool operator==(const LayoutInfo &other) const {
-    return this->isAssigned() == other.isAssigned();
-  }
-
-  static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs);
-
-  static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs);
-
-  void print(raw_ostream &os) const;
-
-  bool isAssigned() const {
-    return laneLayout.size() > 0 && laneData.size() > 0;
-  }
-
-  LayoutInfo getTransposedLayout(ArrayRef<int64_t> permutation) const;
-
-  const LaneLayout &getLayout() const { return laneLayout; }
-  const LaneData &getData() const { return laneData; }
-  ArrayRef<int64_t> getLayoutAsArrayRef() const { return laneLayout.layout; }
-  ArrayRef<int64_t> getDataAsArrayRef() const { return laneData.layout; }
-};
-
-void LayoutInfo::print(raw_ostream &os) const {
-  if (isAssigned()) {
-    os << "lane_layout: ";
-    laneLayout.print(os);
-    os << ", lane_data: ";
-    laneData.print(os);
-  } else {
-    os << "Not assigned.";
-  }
-}
-
-LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
-  if (!lhs.isAssigned())
-    return rhs;
-  return lhs;
-}
-
-/// Since this is a backward analysis, join method is not used.
-LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
-  llvm_unreachable("Join should not be triggered by layout propagation.");
-}
-
-/// Get the transposed layout according to the given permutation.
-LayoutInfo
-LayoutInfo::getTransposedLayout(ArrayRef<int64_t> permutation) const {
-  if (!isAssigned())
-    return {};
-  LaneLayout newLayout;
-  LaneData newData;
-  for (int64_t idx : permutation) {
-    newLayout.layout.push_back(laneLayout.layout[idx]);
-    newData.layout.push_back(laneData.layout[idx]);
-  }
-  return LayoutInfo(newLayout, newData);
-}
-
-//===----------------------------------------------------------------------===//
-// LayoutInfoLattice
-//===----------------------------------------------------------------------===//
-
-/// Lattice holding the LayoutInfo for each value.
-struct LayoutInfoLattice : public Lattice<LayoutInfo> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice)
-  using Lattice::Lattice;
-};
-
-/// Helper Functions to get default layouts. A `default layout` is a layout that
-/// is assigned to a value when the layout is not fixed by some anchor operation
-/// (like DPAS).
-
-/// Helper Function to get the default layout for uniform values like constants.
-/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
-/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
-static LayoutInfo getDefaultLayoutInfo(unsigned rank) {
-  assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
-  if (rank == 1)
-    return LayoutInfo(LaneLayout({subgroupSize}), LaneData({1}));
-  return LayoutInfo(LaneLayout({1, subgroupSize}), LaneData({1, 1}));
-}
-
-/// Helper to get the default layout for a vector type.
-static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) {
-  // Expecting a 1D or 2D vector.
-  assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
-         "Expected 1D or 2D vector.");
-  // Expecting int or float element type.
-  assert(vectorTy.getElementType().isIntOrFloat() &&
-         "Expected int or float element type.");
-  // If the rank is 1, then return default layout for 1D vector.
-  if (vectorTy.getRank() == 1)
-    return getDefaultLayoutInfo(1);
-  // Packing factor is determined by the element type bitwidth.
-  int packingFactor = 1;
-  unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
-  if (bitwidth < packedSizeInBitsForDefault)
-    packingFactor = packedSizeInBitsForDefault / bitwidth;
-  return LayoutInfo(LaneLayout({1, subgroupSize}),
-                    LaneData({1, packingFactor}));
-}
-
-/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
-/// is set according to the following criteria:
-/// * For A operand, the data must be packed in minimum
-/// `packedSizeInBitsForDefault`
-/// * For B operand, the data must be packed in minimum
-/// `packedSizeInBitsForDpasB`
-static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy,
-                                              unsigned operandNum) {
-  Type elementTy = vectorTy.getElementType();
-  assert(elementTy.isIntOrFloat() &&
-         "Expected int or float type in DPAS operands");
-  LaneLayout layout({1, subgroupSize});
-  // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
-  // must have the VNNI format.
-  if (operandNum == 1 &&
-      elementTy.getIntOrFloatBitWidth() < packedSizeInBitsForDpasB) {
-    LaneData data(
-        {packedSizeInBitsForDpasB / elementTy.getIntOrFloatBitWidth(), 1});
-    return LayoutInfo(layout, data);
-  }
-  // Otherwise, return the default layout for the vector type.
-  return getDefaultLayoutInfo(vectorTy);
-}
-
-//===----------------------------------------------------------------------===//
-// LayoutInfoPropagation
-//===----------------------------------------------------------------------===//
-
-/// Backward data flow analysis to propagate the lane_layout and lane_data of
-/// each value in the program. Currently, the layouts for operands DPAS,
-/// StoreNd, and StoreScatter are fixed (known before propagation). Purpose of
-/// this analysis is to propagate those known layouts to all their producers and
-/// (other) consumers.
-class LayoutInfoPropagation
-    : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
-private:
-  void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
-                   ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitStoreNdOp(xegpu::StoreNdOp store,
-                      ArrayRef<LayoutInfoLattice *> operands,
-                      ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter,
-                           ArrayRef<LayoutInfoLattice *> operands,
-                           ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitLoadNdOp(xegpu::LoadNdOp load,
-                     ArrayRef<LayoutInfoLattice *> operands,
-                     ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitLoadGatherOp(xegpu::LoadGatherOp load,
-                         ArrayRef<LayoutInfoLattice *> operands,
-                         ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitTransposeOp(vector::TransposeOp transpose,
-                        ArrayRef<LayoutInfoLattice *> operands,
-                        ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitVectorBitcastOp(vector::BitCastOp bitcast,
-                            ArrayRef<LayoutInfoLattice *> operands,
-                            ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitCreateDescOp(xegpu::CreateDescOp createDesc,
-                         ArrayRef<LayoutInfoLattice *> operands,
-                         ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
-                             ArrayRef<LayoutInfoLattice *> operands,
-                             ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch,
-                         ArrayRef<LayoutInfoLattice *> operands,
-                         ArrayRef<const LayoutInfoLattice *> results);
-
-  void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
-                                   ArrayRef<LayoutInfoLattice *> operands,
-                                   ArrayRef<const LayoutInfoLattice *> results);
-
-public:
-  LayoutInfoPropagation(DataFlowSolver &solver,
-                        SymbolTableCollection &symbolTable)
-      : SparseBackwardDataFlowAnalysis(solver, symbolTable) {}
-  using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;
-
-  LogicalResult
-  visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
-                 ArrayRef<const LayoutInfoLattice *> results) override;
-
-  void visitBranchOperand(OpOperand &operand) override {};
-
-  void visitCallOperand(OpOperand &operand) override {};
-
-  void visitExternalCall(CallOpInterface call,
-                         ArrayRef<LayoutInfoLattice *> operands,
-                         ArrayRef<const LayoutInfoLattice *> results) override {
-  };
-
-  void setToExitState(LayoutInfoLattice *lattice) override {
-    (void)lattice->meet(LayoutInfo());
-  }
-};
-} // namespace
-
-LogicalResult LayoutInfoPropagation::visitOperation(
-    Operation *op, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  TypeSwitch<Operation *>(op)
-      .Case<xegpu::DpasOp>(
-          [&](auto dpasOp) { visitDpasOp(dpasOp, operands, results); })
-      .Case<xegpu::StoreNdOp>(
-          [&](auto storeNdOp) { visitStoreNdOp(storeNdOp, operands, results); })
-      .Case<xegpu::StoreScatterOp>([&](auto storeScatterOp) {
-        visitStoreScatterOp(storeScatterOp, operands, results);
-      })
-      .Case<xegpu::LoadNdOp>(
-          [&](auto loadNdOp) { visitLoadNdOp(loadNdOp, operands, results); })
-      .Case<xegpu::LoadGatherOp>([&](auto loadGatherOp) {
-        visitLoadGatherOp(loadGatherOp, operands, results);
-      })
-      .Case<xegpu::CreateDescOp>([&](auto createDescOp) {
-        visitCreateDescOp(createDescOp, operands, results);
-      })
-      .Case<xegpu::UpdateNdOffsetOp>([&](auto updateNdOffsetOp) {
-        visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
-      })
-      .Case<xegpu::PrefetchNdOp>([&](auto prefetchNdOp) {
-        visitPrefetchNdOp(prefetchNdOp, operands, results);
-      })
-      // No need to propagate the layout to operands in CreateNdDescOp because
-      // they are scalars (offsets, sizes, etc.).
-      .Case<xegpu::CreateNdDescOp>([&](auto createNdDescOp) {})
-      .Case<vector::TransposeOp>([&](auto transposeOp) {
-        visitTransposeOp(transposeOp, operands, results);
-      })
-      .Case<vector::BitCastOp>([&](auto bitcastOp) {
-        visitVectorBitcastOp(bitcastOp, operands, results);
-      })
-      .Case<vector::MultiDimReductionOp>([&](auto reductionOp) {
-        visitVectorMultiReductionOp(reductionOp, operands, results);
-      })
-      // All other ops.
-      .Default([&](Operation *op) {
-        for (const LayoutInfoLattice *r : results) {
-          for (LayoutInfoLattice *operand : operands) {
-            // Propagate the layout of the result to the operand.
-            if (r->getValue().isAssigned())
-              meet(operand, *r);
-          }
-        }
-      });
-  // Add a dependency from each result to program point after the operation.
-  for (const LayoutInfoLattice *r : results) {
-    addDependency(const_cast<LayoutInfoLattice *>(r), getProgramPointAfter(op));
-  }
-  return success();
-}
-
-void LayoutInfoPropagation::visitPrefetchNdOp(
-    xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  // Here we assign the default layout to the tensor descriptor operand of
-  // prefetch.
-  auto tdescTy = prefetch.getTensorDescType();
-  auto prefetchLayout = getDefaultLayoutInfo(
-      VectorType::get(tdescTy.getShape(), tdescTy.getElementType()));
-  // Propagate the layout to the source tensor descriptor.
-  propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
-}
-
-void LayoutInfoPropagation::visitVectorMultiReductionOp(
-    vector::MultiDimReductionOp reduction,
-    ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  // The layout of the result must be present.
-  LayoutInfo resultLayout = results[0]->getValue();
-  if (!resultLayout.isAssigned())
-    return;
-  // We only consider 2D -> 1D reductions at this point.
-  assert(resultLayout.getLayout().size() == 1 &&
-         "Expected 1D layout for reduction result.");
-  // Given that the result is 1D, the layout of the operand should be 2D with
-  // default layout.
-  LayoutInfo operandLayout = getDefaultLayoutInfo(2);
-  propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
-  // Accumulator should have the same layout as the result.
-  propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
-}
-
-/// Propagate the layout of the result tensor to the source tensor descriptor in
-/// UpdateNdOffsetOp.
-void LayoutInfoPropagation::visitUpdateNdOffsetOp(
-    xegpu::UpdateNdOffsetOp updateNdOffset,
-    ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  // The layout of the result must be present.
-  LayoutInfo resultLayout = results[0]->getValue();
-  if (!resultLayout.isAssigned())
-    return;
-  // Propagate the layout to the source operand.
-  propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
-}
-
-/// Set the layouts for DPAS A, B, and C operands.
-void LayoutInfoPropagation::visitDpasOp(
-    xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  VectorType aTy = dpas.getLhsType();
-  VectorType bTy = dpas.getRhsType();
-  propagateIfChanged(operands[0],
-                     operands[0]->meet(getLayoutInfoForDPASOperand(aTy, 0)));
-  propagateIfChanged(operands[1],
-                     operands[1]->meet(getLayoutInfoForDPASOperand(bTy, 1)));
-  if (operands.size() > 2) {
-    VectorType cTy = dpas.getAccType();
-    propagateIfChanged(operands[2],
-                       operands[2]->meet(getLayoutInfoForDPASOperand(cTy, 2)));
-  }
-}
-
-/// Set the layout for the value and tensor descriptor operands in StoreNdOp.
-void LayoutInfoPropagation::visitStoreNdOp(
-    xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  LayoutInfo storeLayout = getDefaultLayoutInfo(store.getValueType());
-  // Both operands should have the same layout
-  for (LayoutInfoLattice *operand : operands) {
-    propagateIfChanged(operand, operand->meet(storeLayout));
-  }
-}
-
-/// Propagate the layout of the value to the tensor descriptor operand in
-/// LoadNdOp.
-void LayoutInfoPropagation::visitLoadNdOp(
-    xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  LayoutInfo valueLayout = results[0]->getValue();
-  // Need the layout of the value to propagate to the tensor descriptor.
-  if (!valueLayout.isAssigned())
-    return;
-  LayoutInfo tensorDescLayout = valueLayout;
-  // LoadNdOp has the transpose effect. However, at the stage of this analysis
-  // this effect is not expected and should be abstracted away. Emit a warning.
-  if (auto transpose = load.getTranspose()) {
-    load.emitWarning("Transpose effect is not expected for LoadNdOp at "
-                     "LayoutInfoPropagation stage.");
-    tensorDescLayout = valueLayout.getTransposedLayout(transpose.value());
-  }
-  // Propagate the new layout to the tensor descriptor operand.
-  propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
-}
-
-/// For vector::TransposeOp, the layout of the result is transposed and
-/// propagated to the operand.
-void LayoutInfoPropagation::visitTransposeOp(
-    vector::TransposeOp transpose, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  // Need the layout of transpose result to propagate to the operands.
-  LayoutInfo resultLayout = results[0]->getValue();
-  if (!resultLayout.isAssigned())
-    return;
-  LayoutInfo newLayout =
-      resultLayout.getTransposedLayout(transpose.getPermutation());
-  // Propagate the new layout to the vector operand.
-  propagateIfChanged(operands[0], operands[0]->meet(newLayout));
-}
-
-/// For vector::BitCastOp, the lane_data of the source layout is changed based
-/// on the bit width of the source and result types.
-void LayoutInfoPropagation::visitVectorBitcastOp(
-    vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  // Need the layout of bitcast result to propagate to the operands.
-  LayoutInfo resultLayout = results[0]->getValue();
-  if (!resultLayout.isAssigned())
-    return;
-  int inElemTyBitWidth =
-      bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
-  int outElemTyBitWidth =
-      bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
-
-  // LaneLayout does not change.
-  const LaneLayout &newLaneLayout = resultLayout.getLayout();
-  const LaneData &currData = resultLayout.getData();
-  LaneData newLaneData;
-  // It's a widening bitcast
-  if (inElemTyBitWidth < outElemTyBitWidth) {
-    int ratio = outElemTyBitWidth / inElemTyBitWidth;
-    newLaneData = resultLayout.getData()[0] == 1
-                      ? LaneData({1, currData[1] * ratio})
-                      : LaneData({currData[0] * ratio, 1});
-  } else {
-    // It's a narrowing bitcast
-    int ratio = inElemTyBitWidth / outElemTyBitWidth;
-    newLaneData = resultLayout.getData()[0] == 1
-                      ? LaneData({1, currData[1] / ratio})
-                      : LaneData({currData[0] / ratio, 1});
-  }
-
-  propagateIfChanged(operands[0],
-                     operands[0]->meet(LayoutInfo(newLaneLayout, newLaneData)));
-}
-
-/// Propagate the layout of the result to the tensor descriptor and mask
-/// operands in LoadGatherOp.
-void LayoutInfoPropagation::visitLoadGatherOp(
-    xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  LayoutInfo valueLayout = results[0]->getValue();
-  // Need the layout of the value to propagate to the tensor descriptor.
-  if (!valueLayout.isAssigned())
-    return;
-
-  LayoutInfo tensorDescLayout = valueLayout;
-  if (load.getTranspose()) {
-    // LoadGatherOp has the transpose effect. However, at the stage of this
-    // analyis this effect is not expected and should be abstracted away. Emit
-    // a warning.
-    load.emitWarning("Transpose effect is not expected for LoadGatherOp at "
-                     "LayoutInfoPropagation stage.");
-    tensorDescLayout = valueLayout.getTransposedLayout({1, 0});
-  }
-  // Mask operand should have 1D default layout.
-  LayoutInfo maskLayout = getDefaultLayoutInfo(1);
-  // Propagate the new layout to the tensor descriptor operand.
-  propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
-  // Propagate the new layout to the mask operand.
-  propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
-}
-
-/// Propagate the layout of the descriptor to the vector offset operand in
-/// CreateDescOp.
-void LayoutInfoPropagation::visitCreateDescOp(
-    xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  LayoutInfo descLayout = results[0]->getValue();
-  // Need the layout of the descriptor to propagate to the operands.
-  if (!descLayout.isAssigned())
-    return;
-  // For offset operand propagate 1D default layout.
-  LayoutInfo layout = getDefaultLayoutInfo(1);
-  propagateIfChanged(operands[1], operands[1]->meet(layout));
-}
-
-/// Set the layout for the value, tensor descriptor, and mask operands in the
-/// StoreScatterOp.
-void LayoutInfoPropagation::visitStoreScatterOp(
-    xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  // Currently, for 2D StoreScatterOp we expect that the height dimension of
-  // the tensor descriptor is equal to the subgroup size. This is ensured by
-  // the op verifier.
-  ArrayRef<int64_t> tdescShape = storeScatter.getTensorDescType().getShape();
-  if (tdescShape.size() > 1)
-    assert(
-        tdescShape[0] == subgroupSize &&
-        "Expected the first dimension of 2D tensor descriptor to be equal to "
-        "subgroup size.");
-
-  LayoutInfo valueLayout = getDefaultLayoutInfo(storeScatter.getValueType());
-  LayoutInfo storeScatterLayout = valueLayout;
-  if (storeScatter.getTranspose()) {
-    // StoreScatteOp allows transpose effect. However, at the stage of this
-    // analyis this effect is not expected and should be abstracted away. Emit
-    // a warning.
-    storeScatter.emitWarning("Transpose effect is not expected for "
-                             "StoreScatterOp at LayoutInfoPropagation stage.");
-    storeScatterLayout = valueLayout.getTransposedLayout({1, 0});
-  }
-  // Propagate the value layout.
-  propagateIfChanged(operands[0], operands[0]->meet(valueLayout));
-  // Propagate the tensor descriptor layout.
-  propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout));
-  // Use default 1D layout for mask operand.
-  LayoutInfo maskLayout = getDefaultLayoutInfo(1);
-  propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
-}
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-// RunLayoutInfoPropagation
-//===----------------------------------------------------------------------===//
-
-/// Driver class for running the LayoutInfoPropagation analysis.
-class RunLayoutInfoPropagation {
-public:
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)
-
-  RunLayoutInfoPropagation(Operation *op) : target(op) {
-    SymbolTableCollection symbolTable;
-    loadBaselineAnalyses(solver);
-    solver.load<LayoutInfoPropagation>(symbolTable);
-    (void)solver.initializeAndRun(op);
-  }
-
-  LayoutInfo getLayoutInfo(Value val);
-
-  void printAnalysisResult(llvm::raw_ostream &os);
-
-private:
-  DataFlowSolver solver;
-  const Operation *target;
-};
-} // namespace
-
-LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
-  auto *state = solver.lookupState<LayoutInfoLattice>(val);
-  if (!state)
-    return {};
-  return state->getValue();
-}
-
-void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
-  auto printFunctionResult = [&](FunctionOpInterface funcOp) {
-    os << "function: " << funcOp.getName() << ":\n";
-    // Function arguments
-    for (BlockArgument arg : funcOp.getArguments()) {
-      LayoutInfo layout = getLayoutInfo(arg);
-      os << "argument: " << arg << "\n";
-      os << "layout  : ";
-      layout.print(os);
-      os << "\n";
-    }
-    // Function ops
-    funcOp.walk([&](Operation *op) {
-      // Skip ops that do not have results
-      if (op->getResults().empty())
-        return;
-      os << "op    : ";
-      // For control-flow ops, print the op name only.
-      if (isa<BranchOpInterface>(op) || isa<RegionBranchOpInterface>(op))
-        os << op->getName();
-      else
-        op->print(os);
-      os << "\n";
-      // Print the layout for each result.
-      for (auto [i, r] : llvm::enumerate(op->getResults())) {
-        LayoutInfo layout = getLayoutInfo(r);
-        os << "layout for result #" << i << ": ";
-        layout.print(os);
-        os << "\n";
-      }
-    });
-  };
-
-  SmallVector<FunctionOpInterface> funcOps;
-  if (auto modOp = dyn_cast<ModuleOp>(target)) {
-    for (auto funcOp : modOp.getOps<FunctionOpInterface>()) {
-      funcOps.push_back(funcOp);
-    }
-    // Collect all GpuFuncOps in the module.
-    for (auto gpuModOp : modOp.getOps<gpu::GPUModuleOp>()) {
-      for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>()) {
-        funcOps.push_back(gpuFuncOp);
-      }
-    }
-  }
-  // Print the analysis result for each function.
-  for (FunctionOpInterface funcOp : funcOps) {
-    printFunctionResult(funcOp);
-  }
-}
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-// LayoutAttrAssignment
-//===----------------------------------------------------------------------===//
-
-/// This class is responsible for assigning the layout attributes to the ops and
-/// their users based on the layout propagation analysis result.
-class LayoutAttrAssignment {
-public:
-  LayoutAttrAssignment(Operation *top,
-                       function_ref<LayoutInfo(Value)> getLayout)
-      : getAnalysisResult(getLayout), top(top) {}
-
-  LogicalResult run();
-
-private:
-  LogicalResult assign(Operation *op);
-  void assignToUsers(Value v, xegpu::LayoutAttr layout);
-  xegpu::LayoutAttr getLayoutAttrForValue(Value v);
-  LogicalResult resolveConflicts();
-  // Callable to get the layout of a value based on the layout propagation
-  // analysis.
-  function_ref<LayoutInfo(Value)> getAnalysisResult;
-  Operation *top;
-};
-
-} // namespace
-
-/// Helper to assign the layout attribute to the users of the value.
-void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
-  for (OpOperand &user : v.getUses()) {
-    Operation *owner = user.getOwner();
-    std::string attrName = xegpu::getLayoutName(user);
-    owner->setAttr(attrName, layout);
-  }
-}
-
-/// Convert the layout assigned to a value to xegpu::LayoutAttr.
-xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
-  LayoutInfo layout = getAnalysisResult(v);
-  if (!layout.isAssigned())
-    return {};
-  SmallVector<int, 2> laneLayout, laneData;
-  for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
-                                             layout.getDataAsArrayRef())) {
-    laneLayout.push_back(static_cast<int>(layout));
-    laneData.push_back(static_cast<int>(data));
-  }
-  return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
-}
-
-/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
-/// based on the layout propagation analysis result.
-LogicalResult LayoutAttrAssignment::assign(Operation *op) {
-  // For function ops, propagate the function argument layout to the users.
-  if (auto func = dyn_cast<FunctionOpInterface>(op)) {
-    for (BlockArgument arg : func.getArguments()) {
-      xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(arg);
-      if (layoutInfo) {
-        assignToUsers(arg, layoutInfo);
-      }
-    }
-    return success();
-  }
-  // If no results, move on.
-  if (op->getNumResults() == 0)
-    return success();
-  // If all the results are scalars, move on.
-  if (llvm::all_of(op->getResultTypes(),
-                   [](Type t) { return t.isIntOrIndexOrFloat(); }))
-    return success();
-  // If the op has more than one result and at least one result is a tensor
-  // descriptor, exit. This case is not supported yet.
-  // TODO: Support this case.
-  if (op->getNumResults() > 1 && llvm::any_of(op->getResultTypes(), [](Type t) {
-        return isa<xegpu::TensorDescType>(t);
-      })) {
-    LLVM_DEBUG(
-        DBGS() << op->getName()
-               << " op has more than one result and at least one is a tensor "
-                  "descriptor. This case is not handled.\n");
-    return failure();
-  }
-  // If the result is a tensor descriptor, attach the layout to the tensor
-  // descriptor itself.
-  if (auto tensorDescTy =
-          dyn_cast<xegpu::TensorDescType>(op->getResultTypes()[0])) {
-    xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(op->getResult(0));
-    if (!layoutInfo) {
-      LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
-      return failure();
-    }
-
-    // Clone the op, attach the layout to the result tensor descriptor, and
-    // remove the original op.
-    OpBuilder builder(op);
-    Operation *newOp = builder.clone(*op);
-    auto newTensorDescTy = xegpu::TensorDescType::get(
-        tensorDescTy.getContext(), tensorDescTy.getShape(),
-        tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
-    newOp->getResult(0).setType(newTensorDescTy);
-    op->replaceAllUsesWith(newOp->getResults());
-    op->erase();
-    return success();
-  }
-  // Otherwise simply attach the layout to the op itself.
-  for (auto r : op->getOpResults()) {
-    xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r);
-    if (layoutInfo) {
-      std::string attrName = xegpu::getLayoutName(r);
-      op->setAttr(attrName, layoutInfo);
-      // Attach the layout attribute to the users of the result.
-      assignToUsers(r, layoutInfo);
-    }
-  }
-  return success();
-}
-
-/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
-LogicalResult LayoutAttrAssignment::run() {
-  auto walkResult = top->walk([&](Operation *op) {
-    if (failed(assign(op)))
-      return WalkResult::interrupt();
-    return WalkResult::advance();
-  });
-
-  if (walkResult.wasInterrupted())
-    return failure();
-
-  return resolveConflicts();
-}
-
-/// TODO: Implement the layout conflict resolution. This must ensure mainly two
-/// things:
-/// 1) Is a given layout supported by the op? (need to query the target
-///    HW info). Otherwise can we achieve this layout using a layout conversion?
-/// 2) Do all the operands have the required layout? If not, can it
-///    be resolved using a layout conversion?
-LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
+static const char *const resolveSIMTTypeMismatch =
+    "resolve_simt_type_mismatch"; // Attribute name for identifying
+                                  // UnrelizedConversionCastOp added to resolve
+                                  // SIMT type mismatches.
 
 namespace {
 
@@ -867,9 +85,9 @@ getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
   // dimensions are not distributed.
   unsigned distributionStart = originalType.getRank() - laneLayout.size();
   for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
-    if (i < distributionStart) {
+    if (i < distributionStart)
       continue;
-    }
+
     // Check if the dimension can be distributed evenly.
     if (dim % laneLayout[i - distributionStart] != 0)
       return failure();
@@ -909,6 +127,7 @@ static Value resolveDistributedTy(Value orig, T expected,
   if (isa<xegpu::TensorDescType>(orig.getType())) {
     auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
                                                               expected, orig);
+    castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
     return castOp.getResult(0);
   }
   llvm_unreachable("Unsupported type for reconciliation");
@@ -988,8 +207,9 @@ struct MoveFuncBodyToWarpExecuteOnLane0
         /** upperBound = **/ mlir::IntegerAttr());
     ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
     auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
-        laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
-        newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+        laneId.getLoc(), gpuFuncResultType, laneId,
+        xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(),
+        newGpuFunc.getArgumentTypes());
     Block &warpBodyBlock = warpOp.getBodyRegion().front();
     // Replace the ReturnOp of the original gpu function with a YieldOp.
     auto origRetunOp =
@@ -1080,11 +300,14 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
     xegpu::TensorDescType distributedTensorDescTy =
         descOp.getType().dropLayouts(); // Distributed tensor descriptor type
                                         // does not contain layout info.
-    auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+    Value newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
         newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
         descOp->getAttrs());
 
     Value distributedVal = newWarpOp.getResult(operandIdx);
+    // Resolve the distributed type to the expected type.
+    newDescOp =
+        resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
     rewriter.replaceAllUsesWith(distributedVal, newDescOp);
     return success();
   }
@@ -1485,10 +708,13 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
       }
     }
     // Create a new update op outside the warp op.
-    auto newUpdateOp = rewriter.create<xegpu::UpdateNdOffsetOp>(
+    Value newUpdateOp = rewriter.create<xegpu::UpdateNdOffsetOp>(
         newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
         removeTemporaryLayoutAttributes(updateOp->getAttrs()));
     Value distributedVal = newWarpOp.getResult(operandIdx);
+    // Resolve the distributed type with the original type.
+    newUpdateOp =
+        resolveDistributedTy(newUpdateOp, distributedVal.getType(), rewriter);
     rewriter.replaceAllUsesWith(distributedVal, newUpdateOp);
     return success();
   }
@@ -1562,11 +788,6 @@ namespace {
 struct XeGPUSubgroupDistributePass final
     : public xegpu::impl::XeGPUSubgroupDistributeBase<
           XeGPUSubgroupDistributePass> {
-  XeGPUSubgroupDistributePass() = default;
-  XeGPUSubgroupDistributePass(const XeGPUSubgroupDistributePass &other) =
-      default;
-  XeGPUSubgroupDistributePass(xegpu::XeGPUSubgroupDistributeOptions options)
-      : XeGPUSubgroupDistributeBase(options) {}
   void runOnOperation() override;
 };
 } // namespace
@@ -1579,27 +800,29 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
 }
 
 void XeGPUSubgroupDistributePass::runOnOperation() {
-  auto &analyis = getAnalysis<RunLayoutInfoPropagation>();
-  // Print the analysis result and exit. (for testing purposes)
-  if (printOnly) {
-    auto &os = llvm::outs();
-    analyis.printAnalysisResult(os);
-    return;
-  }
-  auto getPropagatedLayout = [&](Value val) {
-    return analyis.getLayoutInfo(val);
-  };
+  // Step 1: Attach layouts to op operands.
+  // TODO: Following assumptions are made:
+  // 1) It is assumed that there are no layout conflicts.
+  // 2) Any existing layout attributes attached to the operands are ignored.
+  Operation *op = getOperation();
+  op->walk([&](Operation *op) {
+    for (OpOperand &operand : op->getOpOperands()) {
+      // Layouts are needed for vector type only.
+      if (!isa<VectorType>(operand.get().getType()))
+        continue;
 
-  // Assign xegpu::LayoutAttr to all ops and their users based on the layout
-  // propagation analysis result.
-  LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout);
-  if (failed(layoutAssignment.run())) {
-    signalPassFailure();
-    return;
-  }
-
-  // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0
-  // operation.
+      xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand);
+      if (!layout) {
+        op->emitError("Could not find layout attribute for operand ")
+            << operand.getOperandNumber() << " of operation " << op->getName();
+        signalPassFailure();
+        return;
+      }
+      xegpu::setLayoutAttr(operand, layout);
+    }
+  });
+  // Step 2: Move all operations of a GPU function inside
+  // gpu.warp_execute_on_lane_0 operation.
   {
     RewritePatternSet patterns(&getContext());
     patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
@@ -1608,17 +831,16 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
       signalPassFailure();
       return;
     }
-    // At this point, we have moved the entire function body inside the warpOp.
-    // Now move any scalar uniform code outside of the warpOp (like GPU index
-    // ops, scalar constants, etc.). This will simplify the later lowering and
-    // avoid custom patterns for these ops.
+    // At this point, we have moved the entire function body inside the
+    // warpOp. Now move any scalar uniform code outside of the warpOp (like
+    // GPU index ops, scalar constants, etc.). This will simplify the
+    // later lowering and avoid custom patterns for these ops.
     getOperation()->walk([&](Operation *op) {
-      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
+      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
         vector::moveScalarUniformCode(warpOp);
-      }
     });
   }
-  // Finally, do the SIMD to SIMT distribution.
+  // Step 3: Apply subgroup to workitem distribution patterns.
   RewritePatternSet patterns(&getContext());
   xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
   // TODO: distributionFn and shuffleFn are not used at this point.
@@ -1638,4 +860,51 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     signalPassFailure();
     return;
   }
+
+  // Step 4: Finllay, clean up UnrealizedConversionCastOps that were inserted
+  // due to tensor desc type mismatches created by using upstream distribution
+  // patterns (scf.for)
+  getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
+    // We are only interested in UnrealizedConversionCastOps there were added
+    // for resolving SIMT type mismatches.
+    if (!op->getAttr(resolveSIMTTypeMismatch))
+      return WalkResult::skip();
+
+    Value input = op.getOperand(0);
+    Value output = op.getResult(0);
+
+    // Both input and output must have tensor descriptor types.
+    xegpu::TensorDescType inputDescType =
+        mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
+    xegpu::TensorDescType outputDescType =
+        mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
+    assert(inputDescType && outputDescType &&
+           "Unrealized conversion cast must have tensor descriptor types");
+
+    // tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
+    // This occurs iside scf.for body to resolve the block argument type to
+    // SIMT type.
+    if (inputDescType.getLayout()) {
+      auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
+      if (argument) {
+        argument.setType(output.getType());
+        output.replaceAllUsesWith(argument);
+        if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
+                argument.getOwner()->getParentOp())) {
+          auto result = loopOp.getTiedLoopResult(argument);
+          result.setType(output.getType());
+        }
+      }
+    }
+
+    // tensor_desc<shape> -> tensor_desc<shape, layout> Type of
+    // conversions. This occurs at the yield op of scf.for body to go back
+    // from SIMT type to original type.
+    if (outputDescType.getLayout())
+      output.replaceAllUsesWith(input);
+
+    if (op->use_empty())
+      op->erase();
+    return WalkResult::advance();
+  });
 }
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
new file mode 100644
index 000000000000..429081079de1
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -0,0 +1,430 @@
+// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func.func @dpas_f16(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %3 = xegpu.load_nd %1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @dpas_i8(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
+// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16],
+func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
+  %1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
+  xegpu.store_nd %0, %1  : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @load_with_transpose_effect(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>> -> vector<16x16xf16>
+func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %3 = xegpu.load_nd %1 <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @vector_transpose(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> to vector<16x16xf16>
+func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %3 = xegpu.load_nd %1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = vector.transpose %3, [1, 0] : vector<16x16xf16> to vector<16x16xf16>
+  %5 = xegpu.dpas %2, %4, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+  %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %5, %6  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @extf_truncf(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG1:[0-9a-zA-Z]+]]:
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>) -> vector<8x16xf32> {
+// CHECK: %[[T2:.*]] = arith.extf %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> to vector<16x16xf32>
+// CHECK-NEXT: %{{.*}} = arith.truncf %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf32> to vector<16x16xf16>
+func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
+  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32>
+  %3 = arith.truncf %2 : vector<16x16xf32> to vector<16x16xf16>
+  %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  return %4 : vector<8x16xf32>
+}
+
+// -----
+// CHECK-LABEL: func.func @load_gather_with_transpose_effect(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+// CHECK-SAME:  dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
+// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{transpose}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
+func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
+  %cst_0 = arith.constant dense<true> : vector<16xi1>
+  %2 = xegpu.create_tdesc %arg1, %cst : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>
+  %3 = xegpu.load %2, %cst_0 <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>, vector<16xi1> -> vector<16x16xf16>
+  %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @load_gather_1d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+// CHECK-SAME: dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
+// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
+// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]]  {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
+func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+  %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
+  %cst_0 = arith.constant dense<true> : vector<16xi1>
+  %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  %1 = xegpu.load %0, %cst_0  : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
+  xegpu.store_nd %1, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @store_scatter_with_transpose_effect(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> ->
+// CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store %{{.*}}, %[[T0]], %{{.*}} <{transpose}> : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>,
+// CHECK-SAME: #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, vector<16xi1>
+func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
+  %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32>
+  %cst_0 = arith.constant dense<true> : vector<16xi1>
+  %cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
+  %0 = xegpu.create_tdesc %arg0, %cst_1 : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
+  xegpu.store %cst, %0, %cst_0 <{transpose}> : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @store_scatter_1d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
+// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}}  : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>,
+// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1>
+func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
+  %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
+  %cst_0 = arith.constant dense<true> : vector<16xi1>
+  %0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  xegpu.store %arg0, %0, %cst_0  : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xi16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xi16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xi16> to vector<8x16xf16>
+// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xi16> to vector<16x16xf16>
+func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x16xi16>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
+  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xi16> -> !xegpu.tensor_desc<16x16xi16>
+  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16>
+  %3 = xegpu.load_nd %1  : !xegpu.tensor_desc<16x16xi16> -> vector<16x16xi16>
+  %4 = vector.bitcast %2 : vector<8x16xi16> to vector<8x16xf16>
+  %5 = vector.bitcast %3 : vector<16x16xi16> to vector<16x16xf16>
+  %6 = xegpu.dpas %4, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %6, %7  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @binary_op_one_use(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16>
+func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
+  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %2 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %3 = arith.addf %1, %2 : vector<16x16xf16>
+  %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  xegpu.store_nd %4, %arg2  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @binary_op_multiple_uses(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]]  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]]  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
+  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %cst = arith.constant dense<1.000000e+00> : vector<16x16xf16>
+  %2 = arith.addf %1, %cst : vector<16x16xf16>
+  %3 = xegpu.dpas %0, %2 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  xegpu.store_nd %3, %arg2  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %2, %arg3  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @for_op(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
+// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) ->
+// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>) {
+// CHECK-NEXT:   %[[T4:.*]] = xegpu.load_nd %[[ARG4]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+// CHECK-NEXT:   %[[T5:.*]] = xegpu.load_nd %[[ARG5]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT:   %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+// CHECK-NEXT:   %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT:   %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+// CHECK-NEXT:   scf.yield %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>
+// CHECK-NEXT: } {layout_result_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  %c16 = arith.constant 16 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+  %2:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %0, %arg5 = %1, %arg6 = %cst) -> (!xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>) {
+    %4 = xegpu.load_nd %arg4  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+    %5 = xegpu.load_nd %arg5  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+    %6 = xegpu.dpas %4, %5, %arg6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+    %7 = xegpu.update_nd_offset %arg4, [%c0, %c16] : !xegpu.tensor_desc<8x16xf16>
+    %8 = xegpu.update_nd_offset %arg5, [%c16, %c0] : !xegpu.tensor_desc<16x16xf16>
+    scf.yield %7, %8, %6 : !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>
+  }
+  %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %2#2, %3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @if_single_use(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK:  %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
+// CHECK-NEXT:    %[[T3:.*]] = xegpu.load_nd %[[ARG1]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT:    scf.yield %[[T3]] : vector<16x16xf16>
+// CHECK-NEXT:  } else {
+// CHECK-NEXT:    %[[T4:.*]] = xegpu.load_nd %[[ARG1]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT:    scf.yield %[[T4]] : vector<16x16xf16>
+// CHECK-NEXT:  } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
+  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %1 = scf.if %arg2 -> (vector<16x16xf16>) {
+    %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+    scf.yield %3 : vector<16x16xf16>
+  } else {
+    %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+    scf.yield %3 : vector<16x16xf16>
+  }
+  %2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  xegpu.store_nd %2, %arg3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @if_multiple_uses(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
+// CHECK-NEXT:       %[[T3:.*]] = xegpu.load_nd %[[ARG1]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT:       scf.yield %[[T3]] : vector<16x16xf16>
+// CHECK-NEXT:     } else {
+// CHECK-NEXT:       %[[T4:.*]] = xegpu.load_nd %[[ARG1]]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT:       scf.yield %[[T4]] : vector<16x16xf16>
+// CHECK-NEXT:     } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
+  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %1 = scf.if %arg2 -> (vector<16x16xf16>) {
+    %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+    scf.yield %3 : vector<16x16xf16>
+  } else {
+    %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+    scf.yield %3 : vector<16x16xf16>
+  }
+  %2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  xegpu.store_nd %2, %arg3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %1, %arg4  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @vector_outer_reduction(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
+// CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [0] : vector<16x16xf32> to vector<16xf32>
+func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+  %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
+  %0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
+  xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @vector_inner_reduction(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
+// CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [1] : vector<16x16xf32> to vector<16xf32>
+func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+  %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
+  %0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>
+  xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @update_nd_offset_1d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+func.func @update_nd_offset_1d(%arg0: memref<256xf32>){
+  %c0 = arith.constant 0 : index
+  %c32 = arith.constant 32 : index
+  %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+  %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32>
+  xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @update_nd_offset_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
+  %c0 = arith.constant 0 : index
+  %c32 = arith.constant 32 : index
+  %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
+  %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
+  xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @prefetch_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+func.func @prefetch_2d(%arg0: memref<256x256xf16>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
+  xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @prefetch_1d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+func.func @prefetch_1d(%arg0: memref<256xf16>){
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+  xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @test_scf_while_and_condition(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
+// CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>)
+// CHECK-SAME: -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
+// CHECK:       scf.condition(%{{.*}}) {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: } do {
+// CHECK-NEXT: ^bb0(%{{.*}}: vector<16xf32>, %{{.*}}: i32, %{{.*}}: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>):
+// CHECK:     scf.yield {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: } attributes {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+func.func @test_scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) {
+  %c0 = arith.constant 0 : i32
+  %c16 = arith.constant 16 : i32
+  %c256 = arith.constant 256 : i32
+  %0 = xegpu.create_nd_tdesc %arg0[0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+  %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+  %2 = xegpu.create_nd_tdesc %arg1[0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+
+  %3:3 = scf.while (%arg2 = %1, %arg3 = %c0, %arg4 = %0) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>)
+    -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>) {
+    %4 = arith.cmpi slt, %arg3, %c256 : i32
+    scf.condition(%4) %arg2, %arg3, %arg4 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>
+  } do {
+  ^bb0(%arg2: vector<16xf32>, %arg3: i32, %arg4: !xegpu.tensor_desc<16xf32>):
+    xegpu.store_nd %arg2, %2  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+    %4 = arith.addi %arg3, %c16 : i32
+    %5 = xegpu.update_nd_offset %arg4, [16] : !xegpu.tensor_desc<16xf32>
+    %6 = xegpu.load_nd %5  : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+    scf.yield %6, %4, %5 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>
+  }
+  return
+}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
new file mode 100644
index 000000000000..a59633b0cbd9
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -0,0 +1,280 @@
+// RUN: mlir-opt -xegpu-subgroup-distribute -canonicalize -cse -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: gpu.func @store_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
+// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[T0]]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+// CHECK: gpu.return
+gpu.module @test {
+  gpu.func @store_nd_1d(%arg0: memref<16xf32>) {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf32>
+    %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    xegpu.store_nd %cst, %0  : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @store_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16>
+// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[CST]], %[[T0]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+  gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf16>
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %cst, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
+// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+gpu.module @test {
+  gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    %1 = xegpu.load_nd %0  {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32>
+    %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+  gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.load_nd %0  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+    %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_array_length
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
+// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16>
+// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16>
+// CHECK: xegpu.store_nd %[[T5]], %[[T4]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+  gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.load_nd %0  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16>
+    %2 = vector.extract %1[%c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16> from vector<2x16x16xf16>
+    %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_dpas_store
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]]  : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.module @test {
+  gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.load_nd %0  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+    %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+    %3 = xegpu.load_nd %2  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+    %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+    %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+
+// -----
+// CHECK-LABEL: gpu.func @load_dpas_postop_store
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32>
+// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T8]], %[[T7]]  : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.module @test {
+  gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.load_nd %0  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+    %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+    %3 = xegpu.load_nd %2  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+    %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+    %5 = math.exp %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>
+    %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index,
+// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index,
+// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+  gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.load_nd %0  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+    %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// TODO: gemm does not use update_nd_offset because of an issue in scf-for distribution.
+// CHECK-LABEL: gpu.func @gemm
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
+// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x
+// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
+// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
+// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%[[X_COORD]], %[[Y_COORD]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) {
+// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[K]], %[[Y_COORD]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[X_COORD]], %[[K]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.module @test {
+gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %c8 = arith.constant 8 : index
+  %c1024 = arith.constant 1024 : index
+  %block_id_x = gpu.block_id  x
+  %block_id_y = gpu.block_id  y
+  %0 = arith.muli %block_id_x, %c8 : index
+  %1 = arith.muli %block_id_y, %c16 : index
+  %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %3 = xegpu.load_nd %2  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+  %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
+    %5 = xegpu.create_nd_tdesc %arg0[%0, %arg3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %6 = xegpu.create_nd_tdesc %arg1[%arg3, %1] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+    %7 = xegpu.load_nd %5  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+    %8 = xegpu.load_nd %6  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+    %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+    scf.yield %9 : vector<8x16xf32>
+  } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+  xegpu.store_nd %4, %2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @update_nd_offset_1d(
+// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[T1]]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+gpu.module @test {
+  gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>) {
+    %c0 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf32>
+    %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    %1 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    xegpu.store_nd %cst, %1  : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @update_nd_offset_2d
+// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
+// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[T1]]  : vector<16xf32>, !xegpu.tensor_desc<16x16xf32>
+gpu.module @test {
+  gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>) {
+    %c0 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf32>
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %cst, %1  : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @prefetch_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+  gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @prefetch_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
+gpu.module @test {
+  gpu.func @prefetch_1d(%arg0: memref<256xf16>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+    gpu.return
+  }
+}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
deleted file mode 100644
index e5606c564250..000000000000
--- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
+++ /dev/null
@@ -1,275 +0,0 @@
-// RUN: mlir-opt -xegpu-subgroup-distribute -cse -split-input-file %s | FileCheck %s
-
-// CHECK-LABEL: gpu.func @store_nd_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
-// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
-// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK: xegpu.store_nd %[[CST]], %[[T0]]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-// CHECK: gpu.return
-gpu.module @test {
-gpu.func @store_nd_1d(%arg0: memref<16xf32>){
-  %c0 = arith.constant 0 : index
-  %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-  xegpu.store_nd %1, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @store_nd_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
-// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16>
-// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.store_nd %[[CST]], %[[T0]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.module @test {
-gpu.func @store_nd_2d(%arg0: memref<16x16xf16>){
-  %c0 = arith.constant 0 : index
-  %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-  gpu.return
-}
-}
-
-
-
-// -----
-// CHECK-LABEL: gpu.func @load_nd_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
-// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK: xegpu.store_nd %[[T1]], %[[T2]]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-gpu.module @test {
-gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
-  %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-  xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_nd_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.store_nd %[[T1]], %[[T2]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.module @test {
-gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_nd_array_length
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
-// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16>
-// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16>
-// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16>
-// CHECK: xegpu.store_nd %[[T5]], %[[T4]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.module @test {
-gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
-  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
-  %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
-  %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @dpas
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG3:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[T1:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]
-// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>, memref<8x16xf32>) -> (vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) {
-// CHECK: ^bb0(%[[ARG4:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG5:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG6:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG7:[0-9a-zA-Z]+]]: memref<8x16xf32>):
-// CHECK:  gpu.yield %[[ARG4]], %[[ARG5]], %[[ARG6]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
-// CHECK: }
-// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[T1]]#0 : vector<8x1xf16> to vector<8xf16>
-// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[T1]]#1 : vector<16x1xf16> to vector<16xf16>
-// CHECK-DAG: %[[T4:.*]] = vector.shape_cast %[[T1]]#2 : vector<8x1xf32> to vector<8xf32>
-// CHECK: %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[T4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
-// CHECK: %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG3]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK: xegpu.store_nd %[[T5]], %[[T6]]  : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @test {
-gpu.func @dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-  %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_dpas_store
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
-// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]]  : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @test {
-gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg3: memref<8x16xf32>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-  %5 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  gpu.return
-}
-}
-
-// -----
-gpu.module @test {
-// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index,
-// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index,
-// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.store_nd %[[T1]], %[[T2]]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64,
-  %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) {
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0 [%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16>
-  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @gemm_loop
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
-// CHECK: %[[BLOCK_ID_X:.*]] = gpu.block_id x
-// CHECK: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
-// CHECK: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
-// CHECK: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%[[X_COORD]], %[[Y_COORD]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) {
-// CHECK: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[K]], %[[Y_COORD]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
-// CHECK: %[[T11:.*]] = xegpu.load_nd %[[T10]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[X_COORD]], %[[K]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK: %[[T13:.*]] = xegpu.load_nd %[[T12]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
-// CHECK: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
-// CHECK: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: scf.yield %[[T16]] : vector<8x1xf32>
-// CHECK: }
-// CHECK: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
-// CHECK: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @test {
-gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
-  %c0 = arith.constant 0 : index
-  %c16 = arith.constant 16 : index
-  %c8 = arith.constant 8 : index
-  %c1024 = arith.constant 1024 : index
-  %0 = gpu.block_id x
-  %1 = gpu.block_id y
-  %2 = arith.muli %0, %c8 : index
-  %3 = arith.muli %1, %c16 : index
-  %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
-  %5 = xegpu.load_nd %4 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
-  %6 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %5) -> (vector<8x16xf32>) {
-    %7 = xegpu.create_nd_tdesc %arg0[%2, %arg3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-    %8 = xegpu.create_nd_tdesc %arg1[%arg3, %3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
-    %9 = xegpu.load_nd %7 : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
-    %10 = xegpu.load_nd %8 : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16>
-    %11 = xegpu.dpas %9, %10, %arg4 : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
-    scf.yield %11 : vector<8x16xf32>
-  }
-  xegpu.store_nd %6, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @update_nd_offset_1d(
-// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
-// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32>
-// CHECK: xegpu.store_nd %[[CST]], %[[T1]]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-gpu.module @test {
-gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>){
-  %c0 = arith.constant 0 : index
-  %c32 = arith.constant 32 : index
-  %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
-  %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32>
-  xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @update_nd_offset_2d
-// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
-// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
-// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
-// CHECK: xegpu.store_nd %[[CST]], %[[T1]]  : vector<16xf32>, !xegpu.tensor_desc<16x16xf32>
-gpu.module @test {
-gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
-  %c0 = arith.constant 0 : index
-  %c32 = arith.constant 32 : index
-  %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
-  %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
-  xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @prefetch_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
-gpu.module @test {
-gpu.func @prefetch_2d(%arg0: memref<256x256xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @prefetch_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
-// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
-gpu.module @test {
-gpu.func @prefetch_1d(%arg0: memref<256xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
-  xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
-  gpu.return
-}
-}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
deleted file mode 100644
index 35ac39d074c7..000000000000
--- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
+++ /dev/null
@@ -1,622 +0,0 @@
-// RUN: mlir-opt -xegpu-subgroup-distribute='print-analysis-only=true' -split-input-file %s | FileCheck %s
-
-// CHECK: function: dpas_f16:
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %3 = xegpu.load_nd %1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-  %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  return
-}
-
-
-// -----
-// CHECK: function: dpas_i8:
-// CHECK-NEXT: argument: <block argument> of type 'vector<8x32xi8>' at index: 0
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 2]
-// CHECK-NEXT: argument: <block argument> of type 'vector<32x16xi8>' at index: 1
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [4, 1]
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi32>' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.dpas %{{.*}} : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-  %1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-  xegpu.store_nd %0, %1  : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
-  return
-}
-
-// -----
-// CHECK: function: load_with_transpose_effect:
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
-// CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %3 = xegpu.load_nd %1 <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-  %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  return
-}
-
-// -----
-// CHECK: function: vector_transpose:
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
-// CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]]  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
-// CHECK-NEXT: op    : %[[T4:.*]] = vector.transpose %[[T3]], [1, 0] : vector<16x16xf16> to vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T4]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %3 = xegpu.load_nd %1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %4 = vector.transpose %3, [1, 0] : vector<16x16xf16> to vector<16x16xf16>
-  %5 = xegpu.dpas %2, %4, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-  %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %5, %6  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  return
-}
-
-// -----
-// CHECK: function: extf_truncf:
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T2:.*]] = arith.extf %[[T1]] : vector<16x16xf16> to vector<16x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = arith.truncf %[[T2]] : vector<16x16xf32> to vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: Not assigned.
-func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
-  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32>
-  %3 = arith.truncf %2 : vector<16x16xf32> to vector<16x16xf16>
-  %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-  return %4 : vector<8x16xf32>
-}
-
-// -----
-// CHECK: function: load_gather_with_transpose_effect:
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<256xf16>' at index: 1
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[T2:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>
-// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load %[[T2]], %[[CST0]] <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>, vector<16xi1> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-  %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-  %cst_0 = arith.constant dense<true> : vector<16xi1>
-  %2 = xegpu.create_tdesc %arg1, %cst : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>
-  %3 = xegpu.load %2, %cst_0 <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>, vector<16xi1> -> vector<16x16xf16>
-  %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-  %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  return
-}
-
-// -----
-// CHECK: function: load_gather_1d:
-// CHECK: argument: <block argument> of type 'memref<256xf32>' at index: 0
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[T1]] = xegpu.load %[[T0]], %[[CST0]]  : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
-  %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-  %cst_0 = arith.constant dense<true> : vector<16xi1>
-  %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  %1 = xegpu.load %0, %cst_0  : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
-  xegpu.store_nd %1, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  return
-}
-
-// -----
-// CHECK: function: store_scatter_with_transpose_effect:
-// CHECK-NEXT: argument: <block argument> of type 'memref<128xf32>' at index: 0
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[CST1:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
-// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1]
-func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
-  %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32>
-  %cst_0 = arith.constant dense<true> : vector<16xi1>
-  %cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-  %0 = xegpu.create_tdesc %arg0, %cst_1 : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
-  xegpu.store %cst, %0, %cst_0 <{transpose}> : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1>
-  return
-}
-
-// -----
-// CHECK: function: store_scatter_1d:
-// CHECK-NEXT: argument: <block argument> of type 'vector<16xf32>' at index: 0
-// CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: argument: <block argument> of type 'memref<256xf32>' at index: 1
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[CST1:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
-  %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-  %cst_0 = arith.constant dense<true> : vector<16xi1>
-  %0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  xegpu.store %arg0, %0, %cst_0  : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
-  return
-}
-
-// -----
-// CHECK: function: vector_bitcast_i16_to_i8:
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi16>' at index: 0
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<32x16xi8>' at index: 1
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi32>' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
-// CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]]  : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x16xi16> to vector<8x32xi8>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
-// CHECK-NEXT: op    : %[[T5:.*]] = xegpu.dpas %[[T4]], %[[T3]] : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
-  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8>
-  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16>
-  %3 = xegpu.load_nd %1  : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8>
-  %4 = vector.bitcast %2 : vector<8x16xi16> to vector<8x32xi8>
-  %5 = xegpu.dpas %4, %3 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-  %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-  xegpu.store_nd %5, %6  : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
-  return
-}
-
-// -----
-// CHECK: function: vector_bitcast_i8_to_f16:
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x32xi8>' at index: 0
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<16x32xi8>' at index: 1
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
-// CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %[[T0]]  : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %[[T1]]  : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x32xi8> to vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T5:.*]] = vector.bitcast %[[T3]] : vector<16x32xi8> to vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
-  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8>
-  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8>
-  %3 = xegpu.load_nd %1  : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8>
-  %4 = vector.bitcast %2 : vector<8x32xi8> to vector<8x16xf16>
-  %5 = vector.bitcast %3 : vector<16x32xi8> to vector<16x16xf16>
-  %6 = xegpu.dpas %4, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-  %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %6, %7  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  return
-}
-
-// -----
-// CHECK: function: binary_op_one_use:
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T2:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = arith.addf %[[T1]], %[[T2]] : vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
-  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %2 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %3 = arith.addf %1, %2 : vector<16x16xf16>
-  %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-  xegpu.store_nd %4, %arg2  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  return
-}
-
-// -----
-// CHECK: function: binary_op_multiple_uses:
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 3
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T2:.*]] = arith.addf %[[T1]], %[[CST]] : vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
-  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %cst = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-  %2 = arith.addf %1, %cst : vector<16x16xf16>
-  %3 = xegpu.dpas %0, %2 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-  xegpu.store_nd %3, %arg2  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %2, %arg3  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-  return
-}
-
-// -----
-// CHECK: function: for_op:
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x128xf16>' at index: 0
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<128x16xf16>' at index: 1
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 128 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %{{.*}} = arith.constant 16 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T5:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T7:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T8:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : scf.for
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: layout for result #1: Not assigned.
-// CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
-  %c0 = arith.constant 0 : index
-  %c128 = arith.constant 128 : index
-  %c16 = arith.constant 16 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
-  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-  %2:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %0, %arg5 = %1, %arg6 = %cst) -> (!xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>) {
-    %4 = xegpu.load_nd %arg4  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-    %5 = xegpu.load_nd %arg5  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-    %6 = xegpu.dpas %4, %5, %arg6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-    %7 = xegpu.update_nd_offset %arg4, [%c0, %c16] : !xegpu.tensor_desc<8x16xf16>
-    %8 = xegpu.update_nd_offset %arg5, [%c16, %c0] : !xegpu.tensor_desc<16x16xf16>
-    scf.yield %7, %8, %6 : !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>
-  }
-  %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %2#2, %3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  return
-}
-
-// -----
-// CHECK: function: if_single_use:
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: argument: <block argument> of type 'i1' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : scf.if
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op    : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
-  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %1 = scf.if %arg2 -> (vector<16x16xf16>) {
-    %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-    scf.yield %3 : vector<16x16xf16>
-  } else {
-    %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-    scf.yield %3 : vector<16x16xf16>
-  }
-  %2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-  xegpu.store_nd %2, %arg3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  return
-}
-
-// -----
-// CHECK: function: if_multiple_uses:
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type 'i1' at index: 2
-// CHECK-NEXT: layout  : Not assigned.
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 4
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T3:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T4:.*]] = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : scf.if
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
-  %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %1 = scf.if %arg2 -> (vector<16x16xf16>) {
-    %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-    scf.yield %3 : vector<16x16xf16>
-  } else {
-    %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-    scf.yield %3 : vector<16x16xf16>
-  }
-  %2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-  xegpu.store_nd %2, %arg3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %1, %arg4  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-  return
-}
-
-// -----
-// CHECK: function: vector_outer_reduction:
-// CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
-  %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
-  %0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
-  xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  return
-}
-
-// -----
-// CHECK: function: vector_inner_reduction:
-// CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
-  %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
-  %0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>
-  xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  return
-}
-
-// -----
-// CHECK: function: update_nd_offset_1d:
-// CHECK: op    : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @update_nd_offset_1d(%arg0: memref<256xf32>){
-  %c0 = arith.constant 0 : index
-  %c32 = arith.constant 32 : index
-  %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
-  %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32>
-  xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  return
-}
-
-// -----
-// CHECK: function: update_nd_offset_2d:
-// CHECK: op    : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op    : %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
-  %c0 = arith.constant 0 : index
-  %c32 = arith.constant 32 : index
-  %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
-  %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
-  xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
-  return
-}
-
-// -----
-// CHECK: function: prefetch_2d:
-// CHECK: layout for result #0: Not assigned.
-// CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @prefetch_2d(%arg0: memref<256x256xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
-  return
-}
-
-// -----
-// CHECK: function: prefetch_1d:
-// CHECK: op    : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @prefetch_1d(%arg0: memref<256xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
-  xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
-  return
-}

From 04e2e581ac000934782398e05853338040bf7c46 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 20 Jun 2025 10:44:51 -0700
Subject: [PATCH 1089/1322] [RISCV] Treat bf16->f32 as separate ExtKind in
 combineOp_VLToVWOp_VL. (#144653)

This allows us to better track the narrow type we need and to fix
miscompiles if f16->f32 and bf16->f32 extends are mixed.

Fixes #144651.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 78 ++++++++++++-------
 .../RISCV/rvv/fixed-vectors-vfwmaccbf16.ll    | 58 +++++++++++++-
 2 files changed, 103 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index b8ef221742a2..139fa7ba3562 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16316,7 +16316,12 @@ namespace {
 // apply a combine.
 struct CombineResult;
 
-enum ExtKind : uint8_t { ZExt = 1 << 0, SExt = 1 << 1, FPExt = 1 << 2 };
+enum ExtKind : uint8_t {
+  ZExt = 1 << 0,
+  SExt = 1 << 1,
+  FPExt = 1 << 2,
+  BF16Ext = 1 << 3
+};
 /// Helper class for folding sign/zero extensions.
 /// In particular, this class is used for the following combines:
 /// add | add_vl | or disjoint -> vwadd(u) | vwadd(u)_w
@@ -16351,8 +16356,10 @@ struct NodeExtensionHelper {
   /// instance, a splat constant (e.g., 3), would support being both sign and
   /// zero extended.
   bool SupportsSExt;
-  /// Records if this operand is like being floating-Point extended.
+  /// Records if this operand is like being floating point extended.
   bool SupportsFPExt;
+  /// Records if this operand is extended from bf16.
+  bool SupportsBF16Ext;
   /// This boolean captures whether we care if this operand would still be
   /// around after the folding happens.
   bool EnforceOneUse;
@@ -16388,6 +16395,7 @@ struct NodeExtensionHelper {
     case ExtKind::ZExt:
       return RISCVISD::VZEXT_VL;
     case ExtKind::FPExt:
+    case ExtKind::BF16Ext:
       return RISCVISD::FP_EXTEND_VL;
     }
     llvm_unreachable("Unknown ExtKind enum");
@@ -16409,13 +16417,6 @@ struct NodeExtensionHelper {
     if (Source.getValueType() == NarrowVT)
       return Source;
 
-    // vfmadd_vl -> vfwmadd_vl can take bf16 operands
-    if (Source.getValueType().getVectorElementType() == MVT::bf16) {
-      assert(Root->getSimpleValueType(0).getVectorElementType() == MVT::f32 &&
-             Root->getOpcode() == RISCVISD::VFMADD_VL);
-      return Source;
-    }
-
     unsigned ExtOpc = getExtOpc(*SupportsExt);
 
     // If we need an extension, we should be changing the type.
@@ -16458,7 +16459,8 @@ struct NodeExtensionHelper {
     // Determine the narrow size.
     unsigned NarrowSize = VT.getScalarSizeInBits() / 2;
 
-    MVT EltVT = SupportsExt == ExtKind::FPExt
+    MVT EltVT = SupportsExt == ExtKind::BF16Ext ? MVT::bf16
+                : SupportsExt == ExtKind::FPExt
                     ? MVT::getFloatingPointVT(NarrowSize)
                     : MVT::getIntegerVT(NarrowSize);
 
@@ -16635,17 +16637,13 @@ struct NodeExtensionHelper {
     EnforceOneUse = false;
   }
 
-  bool isSupportedFPExtend(SDNode *Root, MVT NarrowEltVT,
-                           const RISCVSubtarget &Subtarget) {
-    // Any f16 extension will need zvfh
-    if (NarrowEltVT == MVT::f16 && !Subtarget.hasVInstructionsF16())
-      return false;
-    // The only bf16 extension we can do is vfmadd_vl -> vfwmadd_vl with
-    // zvfbfwma
-    if (NarrowEltVT == MVT::bf16 && (!Subtarget.hasStdExtZvfbfwma() ||
-                                     Root->getOpcode() != RISCVISD::VFMADD_VL))
-      return false;
-    return true;
+  bool isSupportedFPExtend(MVT NarrowEltVT, const RISCVSubtarget &Subtarget) {
+    return (NarrowEltVT == MVT::f32 ||
+            (NarrowEltVT == MVT::f16 && Subtarget.hasVInstructionsF16()));
+  }
+
+  bool isSupportedBF16Extend(MVT NarrowEltVT, const RISCVSubtarget &Subtarget) {
+    return NarrowEltVT == MVT::bf16 && Subtarget.hasStdExtZvfbfwma();
   }
 
   /// Helper method to set the various fields of this struct based on the
@@ -16655,6 +16653,7 @@ struct NodeExtensionHelper {
     SupportsZExt = false;
     SupportsSExt = false;
     SupportsFPExt = false;
+    SupportsBF16Ext = false;
     EnforceOneUse = true;
     unsigned Opc = OrigOperand.getOpcode();
     // For the nodes we handle below, we end up using their inputs directly: see
@@ -16686,9 +16685,11 @@ struct NodeExtensionHelper {
     case RISCVISD::FP_EXTEND_VL: {
       MVT NarrowEltVT =
           OrigOperand.getOperand(0).getSimpleValueType().getVectorElementType();
-      if (!isSupportedFPExtend(Root, NarrowEltVT, Subtarget))
-        break;
-      SupportsFPExt = true;
+      if (isSupportedFPExtend(NarrowEltVT, Subtarget))
+        SupportsFPExt = true;
+      if (isSupportedBF16Extend(NarrowEltVT, Subtarget))
+        SupportsBF16Ext = true;
+
       break;
     }
     case ISD::SPLAT_VECTOR:
@@ -16705,16 +16706,16 @@ struct NodeExtensionHelper {
       if (Op.getOpcode() != ISD::FP_EXTEND)
         break;
 
-      if (!isSupportedFPExtend(Root, Op.getOperand(0).getSimpleValueType(),
-                               Subtarget))
-        break;
-
       unsigned NarrowSize = VT.getScalarSizeInBits() / 2;
       unsigned ScalarBits = Op.getOperand(0).getValueSizeInBits();
       if (NarrowSize != ScalarBits)
         break;
 
-      SupportsFPExt = true;
+      if (isSupportedFPExtend(Op.getOperand(0).getSimpleValueType(), Subtarget))
+        SupportsFPExt = true;
+      if (isSupportedBF16Extend(Op.getOperand(0).getSimpleValueType(),
+                                Subtarget))
+        SupportsBF16Ext = true;
       break;
     }
     default:
@@ -16947,6 +16948,11 @@ canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS,
     return CombineResult(NodeExtensionHelper::getFPExtOpcode(Root->getOpcode()),
                          Root, LHS, /*LHSExt=*/{ExtKind::FPExt}, RHS,
                          /*RHSExt=*/{ExtKind::FPExt});
+  if ((AllowExtMask & ExtKind::BF16Ext) && LHS.SupportsBF16Ext &&
+      RHS.SupportsBF16Ext)
+    return CombineResult(NodeExtensionHelper::getFPExtOpcode(Root->getOpcode()),
+                         Root, LHS, /*LHSExt=*/{ExtKind::BF16Ext}, RHS,
+                         /*RHSExt=*/{ExtKind::BF16Ext});
   return std::nullopt;
 }
 
@@ -17029,6 +17035,18 @@ canFoldToVWWithFPEXT(SDNode *Root, const NodeExtensionHelper &LHS,
                                           Subtarget);
 }
 
+/// Check if \p Root follows a pattern Root(bf16ext(LHS), bf16ext(RHS))
+///
+/// \returns std::nullopt if the pattern doesn't match or a CombineResult that
+/// can be used to apply the pattern.
+static std::optional<CombineResult>
+canFoldToVWWithBF16EXT(SDNode *Root, const NodeExtensionHelper &LHS,
+                       const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+                       const RISCVSubtarget &Subtarget) {
+  return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, ExtKind::BF16Ext, DAG,
+                                          Subtarget);
+}
+
 /// Check if \p Root follows a pattern Root(sext(LHS), zext(RHS))
 ///
 /// \returns std::nullopt if the pattern doesn't match or a CombineResult that
@@ -17068,6 +17086,8 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
   case RISCVISD::VFNMADD_VL:
   case RISCVISD::VFNMSUB_VL:
     Strategies.push_back(canFoldToVWWithSameExtension);
+    if (Root->getOpcode() == RISCVISD::VFMADD_VL)
+      Strategies.push_back(canFoldToVWWithBF16EXT);
     break;
   case ISD::MUL:
   case RISCVISD::MUL_VL:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
index 1639f21f243d..aec970adff51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN
 
 define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x bfloat> %c) {
 ; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v1f32:
@@ -295,3 +295,53 @@ define <32 x float> @vfwmaccbf32_vf_v32f32(<32 x float> %a, bfloat %b, <32 x bfl
   %res = call <32 x float> @llvm.fma.v32f32(<32 x float> %b.ext, <32 x float> %c.ext, <32 x float> %a)
   ret <32 x float> %res
 }
+
+define <4 x float> @vfwmaccbf16_vf_v4f32_scalar_extend(<4 x float> %rd, bfloat %a, <4 x bfloat> %b) local_unnamed_addr #0 {
+; ZVFBFWMA-LABEL: vfwmaccbf16_vf_v4f32_scalar_extend:
+; ZVFBFWMA:       # %bb.0:
+; ZVFBFWMA-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFBFWMA-NEXT:    vfwmaccbf16.vf v8, fa0, v9
+; ZVFBFWMA-NEXT:    ret
+;
+; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v4f32_scalar_extend:
+; ZVFBFMIN:       # %bb.0:
+; ZVFBFMIN-NEXT:    fmv.x.w a0, fa0
+; ZVFBFMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFBFMIN-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; ZVFBFMIN-NEXT:    slli a0, a0, 16
+; ZVFBFMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFBFMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFMIN-NEXT:    vfmacc.vf v8, fa5, v10
+; ZVFBFMIN-NEXT:    ret
+  %b_ext = fpext <4 x bfloat> %b to <4 x float>
+  %a_extend = fpext bfloat %a to float
+  %a_insert = insertelement <4 x float> poison, float %a_extend, i64 0
+  %a_shuffle = shufflevector <4 x float> %a_insert, <4 x float> poison, <4 x i32> zeroinitializer
+  %fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a_shuffle, <4 x float> %b_ext, <4 x float> %rd)
+  ret <4 x float> %fma
+}
+
+; Negative test with a mix of bfloat and half fpext.
+define <4 x float> @mix(<4 x float> %rd, <4 x half> %a, <4 x bfloat> %b) {
+; ZVFBFWMA-LABEL: mix:
+; ZVFBFWMA:       # %bb.0:
+; ZVFBFWMA-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFBFWMA-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFBFWMA-NEXT:    vfwcvtbf16.f.f.v v9, v10
+; ZVFBFWMA-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFWMA-NEXT:    vfmacc.vv v8, v11, v9
+; ZVFBFWMA-NEXT:    ret
+;
+; ZVFBFMIN-LABEL: mix:
+; ZVFBFMIN:       # %bb.0:
+; ZVFBFMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFBFMIN-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFBFMIN-NEXT:    vfwcvtbf16.f.f.v v9, v10
+; ZVFBFMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFBFMIN-NEXT:    vfmacc.vv v8, v11, v9
+; ZVFBFMIN-NEXT:    ret
+  %a_ext = fpext <4 x half> %a to <4 x float>
+  %b_ext = fpext <4 x bfloat> %b to <4 x float>
+  %fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a_ext, <4 x float> %b_ext, <4 x float> %rd)
+  ret <4 x float> %fma
+}

From ab8b8c1e138ae705f251626b63ad2cf4d7937003 Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Fri, 20 Jun 2025 11:05:25 -0700
Subject: [PATCH 1090/1322] [TargetParser][cmake] Be Smarter about TableGen
 Deps (#144848)

This tries to be a bit smarter for the OLD behaviour of CMP0116, to glob
more relevant directories looking for possible dependencies.

The changes are:
- Remove some duplication of lines in the `tablegen` function.
- Put CURRENT_SOURCE_DIR into `tblgen_includes` (at the front)
- Glob all directories in `tblgen_includes`
- Give up on `local_tds` which was wrong when using tablegen to compile
a file in a different directory (as TargetParser does)
- Use `EXTRA_INCLUDES` in TargetParser `tablegen` calls.

This is still an under-approximation of what might be included, at least
comparing the RISCVTargetParserDef.inc.d (after building
`target_parser_gen`), and the list of deps in the ninja file when
explicitly setting CMP0116 to OLD.

Fixes #144639
---
 llvm/cmake/modules/TableGen.cmake             | 71 ++++++++-----------
 llvm/include/llvm/TargetParser/CMakeLists.txt |  6 +-
 2 files changed, 34 insertions(+), 43 deletions(-)

diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake
index b26fc62d4cc0..67a628d4953c 100644
--- a/llvm/cmake/modules/TableGen.cmake
+++ b/llvm/cmake/modules/TableGen.cmake
@@ -21,6 +21,13 @@ function(tablegen project ofn)
     message(FATAL_ERROR "${project}_TABLEGEN_EXE not set")
   endif()
 
+  # Set the include directories
+  get_directory_property(tblgen_includes INCLUDE_DIRECTORIES)
+  list(PREPEND tblgen_includes ${ARG_EXTRA_INCLUDES})
+  list(PREPEND tblgen_includes ${CMAKE_CURRENT_SOURCE_DIR})
+  # Filter out any empty include items.
+  list(REMOVE_ITEM tblgen_includes "")
+
   # Use depfile instead of globbing arbitrary *.td(s) for Ninja. We force
   # CMake versions older than v3.30 on Windows to use the fallback behavior
   # due to a depfile parsing bug on Windows paths in versions prior to 3.30.
@@ -42,22 +49,16 @@ function(tablegen project ofn)
       -d ${ofn}.d
       DEPFILE ${ofn}.d
       )
-    set(local_tds)
     set(global_tds)
   else()
-    file(GLOB local_tds "*.td")
-    file(GLOB_RECURSE global_tds "${LLVM_MAIN_INCLUDE_DIR}/llvm/*.td")
+    set(include_td_dirs "${tblgen_includes}")
+    list(TRANSFORM include_td_dirs APPEND "/*.td")
+    file(GLOB global_tds ${include_td_dirs})
     set(additional_cmdline
       -o ${CMAKE_CURRENT_BINARY_DIR}/${ofn}
       )
   endif()
 
-  if (IS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
-    set(LLVM_TARGET_DEFINITIONS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
-  else()
-    set(LLVM_TARGET_DEFINITIONS_ABSOLUTE
-      ${CMAKE_CURRENT_SOURCE_DIR}/${LLVM_TARGET_DEFINITIONS})
-  endif()
   if (LLVM_ENABLE_DAGISEL_COV AND "-gen-dag-isel" IN_LIST ARGN)
     list(APPEND LLVM_TABLEGEN_FLAGS "-instrument-coverage")
   endif()
@@ -92,6 +93,25 @@ function(tablegen project ofn)
     list(APPEND LLVM_TABLEGEN_FLAGS "-no-warn-on-unused-template-args")
   endif()
 
+  # Build the absolute path for the current input file.
+  if (IS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
+    set(LLVM_TARGET_DEFINITIONS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
+  else()
+    set(LLVM_TARGET_DEFINITIONS_ABSOLUTE
+      ${CMAKE_CURRENT_SOURCE_DIR}/${LLVM_TARGET_DEFINITIONS})
+  endif()
+
+  # Append this file and its includes to the compile commands file.
+  # This file is used by the TableGen LSP Language Server (tblgen-lsp-server).
+  file(APPEND ${CMAKE_BINARY_DIR}/tablegen_compile_commands.yml
+      "--- !FileInfo:\n"
+      "  filepath: \"${LLVM_TARGET_DEFINITIONS_ABSOLUTE}\"\n"
+      "  includes: \"${tblgen_includes}\"\n"
+  )
+
+  # Prepend each include entry with -I for arguments.
+  list(TRANSFORM tblgen_includes PREPEND -I)
+
   # We need both _TABLEGEN_TARGET and _TABLEGEN_EXE in the  DEPENDS list
   # (both the target and the file) to have .inc files rebuilt on
   # a tablegen change, as cmake does not propagate file-level dependencies
@@ -101,35 +121,6 @@ function(tablegen project ofn)
   # dependency twice in the result file when
   # ("${${project}_TABLEGEN_TARGET}" STREQUAL "${${project}_TABLEGEN_EXE}")
   # but lets us having smaller and cleaner code here.
-  get_directory_property(tblgen_includes INCLUDE_DIRECTORIES)
-  list(APPEND tblgen_includes ${ARG_EXTRA_INCLUDES})
-
-  # Get the current set of include paths for this td file.
-  cmake_parse_arguments(ARG "" "" "DEPENDS;EXTRA_INCLUDES" ${ARGN})
-  get_directory_property(tblgen_includes INCLUDE_DIRECTORIES)
-  list(APPEND tblgen_includes ${ARG_EXTRA_INCLUDES})
-  # Filter out any empty include items.
-  list(REMOVE_ITEM tblgen_includes "")
-
-  # Build the absolute path for the current input file.
-  if (IS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
-    set(LLVM_TARGET_DEFINITIONS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
-  else()
-    set(LLVM_TARGET_DEFINITIONS_ABSOLUTE ${CMAKE_CURRENT_SOURCE_DIR}/${LLVM_TARGET_DEFINITIONS})
-  endif()
-
-  # Append this file and its includes to the compile commands file.
-  # This file is used by the TableGen LSP Language Server (tblgen-lsp-server).
-  file(APPEND ${CMAKE_BINARY_DIR}/tablegen_compile_commands.yml
-      "--- !FileInfo:\n"
-      "  filepath: \"${LLVM_TARGET_DEFINITIONS_ABSOLUTE}\"\n"
-      "  includes: \"${CMAKE_CURRENT_SOURCE_DIR};${tblgen_includes}\"\n"
-  )
-
-  # Filter out empty items before prepending each entry with -I
-  list(REMOVE_ITEM tblgen_includes "")
-  list(TRANSFORM tblgen_includes PREPEND -I)
-
   set(tablegen_exe ${${project}_TABLEGEN_EXE})
   set(tablegen_depends ${${project}_TABLEGEN_TARGET} ${tablegen_exe})
 
@@ -140,7 +131,7 @@ function(tablegen project ofn)
   endif()
 
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn}
-    COMMAND ${tablegen_exe} ${ARG_UNPARSED_ARGUMENTS} -I ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMAND ${tablegen_exe} ${ARG_UNPARSED_ARGUMENTS}
     ${tblgen_includes}
     ${LLVM_TABLEGEN_FLAGS}
     ${LLVM_TARGET_DEFINITIONS_ABSOLUTE}
@@ -150,7 +141,7 @@ function(tablegen project ofn)
     # directory and local_tds may not contain it, so we must
     # explicitly list it here:
     DEPENDS ${ARG_DEPENDS} ${tablegen_depends}
-      ${local_tds} ${global_tds}
+      ${global_tds}
     ${LLVM_TARGET_DEFINITIONS_ABSOLUTE}
     ${LLVM_TARGET_DEPENDS}
     ${LLVM_TABLEGEN_JOB_POOL}
diff --git a/llvm/include/llvm/TargetParser/CMakeLists.txt b/llvm/include/llvm/TargetParser/CMakeLists.txt
index bb6d58d74a35..3811367be614 100644
--- a/llvm/include/llvm/TargetParser/CMakeLists.txt
+++ b/llvm/include/llvm/TargetParser/CMakeLists.txt
@@ -1,11 +1,11 @@
 set(LLVM_TARGET_DEFINITIONS ${PROJECT_SOURCE_DIR}/lib/Target/ARM/ARM.td)
-tablegen(LLVM ARMTargetParserDef.inc -gen-arm-target-def -I ${PROJECT_SOURCE_DIR}/lib/Target/ARM/)
+tablegen(LLVM ARMTargetParserDef.inc -gen-arm-target-def EXTRA_INCLUDES ${PROJECT_SOURCE_DIR}/lib/Target/ARM)
 
 set(LLVM_TARGET_DEFINITIONS ${PROJECT_SOURCE_DIR}/lib/Target/AArch64/AArch64.td)
-tablegen(LLVM AArch64TargetParserDef.inc -gen-arm-target-def -I ${PROJECT_SOURCE_DIR}/lib/Target/AArch64/)
+tablegen(LLVM AArch64TargetParserDef.inc -gen-arm-target-def EXTRA_INCLUDES ${PROJECT_SOURCE_DIR}/lib/Target/AArch64)
 
 set(LLVM_TARGET_DEFINITIONS ${PROJECT_SOURCE_DIR}/lib/Target/RISCV/RISCV.td)
-tablegen(LLVM RISCVTargetParserDef.inc -gen-riscv-target-def -I ${PROJECT_SOURCE_DIR}/lib/Target/RISCV/)
+tablegen(LLVM RISCVTargetParserDef.inc -gen-riscv-target-def EXTRA_INCLUDES ${PROJECT_SOURCE_DIR}/lib/Target/RISCV)
 
 set(LLVM_TARGET_DEFINITIONS ${PROJECT_SOURCE_DIR}/lib/Target/PowerPC/PPC.td)
 tablegen(LLVM PPCGenTargetFeatures.inc -gen-target-features -I${PROJECT_SOURCE_DIR}/lib/Target/PowerPC)

From 7f74a377d094c34eba1adde1f1edc382d01d2e5e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 20 Jun 2025 19:10:25 +0100
Subject: [PATCH 1091/1322] [LV] Regenerate uniform_across_vf* check lines.

Re-generate check lines to reduce diff in upcoming changes.

Also filters out the code after scalar.ph:, which is dead.
---
 .../uniform_across_vf_induction1.ll           |  288 +---
 .../uniform_across_vf_induction1_and.ll       |  271 +---
 .../uniform_across_vf_induction1_div_urem.ll  |  321 ++--
 .../uniform_across_vf_induction1_lshr.ll      |  494 +-----
 .../uniform_across_vf_induction2.ll           | 1404 ++++-------------
 5 files changed, 607 insertions(+), 2171 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
index d9f0e180bdaa..3aad626554ce 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "scalar.ph:" --version 2
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 %s -S | FileCheck %s
 
 ; Tests for checking uniformity within a VF.
@@ -27,21 +27,6 @@ define void @ld_div1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -86,21 +71,6 @@ define void @ld_div2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -150,21 +120,6 @@ define void @ld_div3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -220,21 +175,6 @@ define void @ld_div1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -283,21 +223,6 @@ define void @ld_div2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -353,21 +278,6 @@ define void @ld_div3_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -423,21 +333,6 @@ define void @ld_div1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -493,21 +388,6 @@ define void @ld_div2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -556,21 +436,6 @@ define void @ld_div3_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -614,21 +479,6 @@ define void @ld_div1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 999, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -679,21 +529,6 @@ define void @ld_div2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 999, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -744,21 +579,6 @@ define void @ld_div3_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 999, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -815,21 +635,6 @@ define void @ld_div1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -879,21 +684,6 @@ define void @ld_div2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -950,21 +740,6 @@ define void @ld_div3_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1021,21 +796,6 @@ define void @ld_div1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1092,21 +852,6 @@ define void @ld_div2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1156,21 +901,6 @@ define void @ld_div3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1219,22 +949,6 @@ define void @test_step_is_not_invariant(ptr %A) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 56, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[IV]], [[IV]]
-; CHECK-NEXT:    [[DIV_LHS_TRUNC:%.*]] = trunc i32 [[MUL]] to i16
-; CHECK-NEXT:    [[DIV5:%.*]] = udiv i16 [[DIV_LHS_TRUNC]], 6
-; CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[IV]] to i16
-; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i16 [[DIV5]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IDXPROM]]
-; CHECK-NEXT:    store i16 [[CONV]], ptr [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 56
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP39:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
index e85cf2471144..1f331a4bf973 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "scalar.ph:" --version 2
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 %s -S | FileCheck %s
 
 ; Tests for checking uniformity within a VF.
@@ -13,35 +13,20 @@ define void @ld_and_neg1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[INDEX]], -1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = and i64 [[INDEX]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -71,36 +56,21 @@ define void @ld_and_neg2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[INDEX]], -2
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], 42
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = and i64 [[INDEX]], -2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -130,41 +100,26 @@ define void @ld_and_neg3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i64> [[TMP9]], splat (i64 42)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    store <2 x i64> [[TMP10]], ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -220,21 +175,6 @@ define void @ld_and_neg1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -275,26 +215,11 @@ define void @ld_and_neg2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr [[TMP6]], align 8
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -350,21 +275,6 @@ define void @ld_and_neg1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -1
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -420,21 +330,6 @@ define void @ld_and_neg2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -465,41 +360,26 @@ define void @ld_and_neg2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i64> [[TMP9]], splat (i64 42)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    store <2 x i64> [[TMP10]], ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 999, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -556,21 +436,6 @@ define void @ld_and_neg2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -627,21 +492,6 @@ define void @ld_and_neg2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -2
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -698,21 +548,6 @@ define void @ld_and_neg3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], -3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AND]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index 12b7353a7803..ea8831c8ab7e 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "scalar.ph:" --version 2
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=8 %s -S | FileCheck %s
 
 ; Tests for checking uniformity within a VF.
@@ -15,7 +15,83 @@ define void @ld_div2_urem3_1(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT:    [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT:    [[TMP1:%.*]] = urem <8 x i64> [[TMP0]], splat (i64 3)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP15]], align 8
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x i64> poison, i64 [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 2
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i64> [[TMP28]], i64 [[TMP21]], i32 3
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i64> [[TMP29]], i64 [[TMP22]], i32 4
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 5
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i64> [[TMP31]], i64 [[TMP24]], i32 6
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x i64> [[TMP32]], i64 [[TMP25]], i32 7
+; CHECK-NEXT:    [[TMP34:%.*]] = add nsw <8 x i64> [[TMP33]], splat (i64 42)
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i64, ptr [[TMP35]], i32 0
+; CHECK-NEXT:    store <8 x i64> [[TMP34]], ptr [[TMP36]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+;
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv_next, %loop ]
+  %div = udiv i64 %iv, 2
+  %rem = urem i64 %div, 3
+  %gep_ld = getelementptr inbounds i64, ptr %A, i64 %rem
+  %ld = load i64, ptr %gep_ld, align 8
+  %calc = add nsw i64 %ld, 42
+  %gep_st = getelementptr inbounds i64, ptr %B, i64 %iv
+  store i64 %calc, ptr %gep_st, align 8
+  %iv_next = add nsw i64 %iv, 1
+  %cond = icmp eq i64 %iv_next, 1000
+  br i1 %cond, label %exit, label %loop
+exit:
+  ret void
+}
+
+; for (iv = 0 ; ; iv += 1) B[iv] = A[((iv++)/2)%3];
+define void @ld_div2_urem3_2(ptr noalias %A, ptr noalias %B) {
+; CHECK-LABEL: define void @ld_div2_urem3_2
+; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw <8 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = udiv <8 x i64> [[TMP0]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP2:%.*]] = urem <8 x i64> [[TMP1]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i64> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
@@ -56,118 +132,10 @@ define void @ld_div2_urem3_1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
 ; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 2
-; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[DIV]], 3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[REM]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %loop
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv_next, %loop ]
-  %div = udiv i64 %iv, 2
-  %rem = urem i64 %div, 3
-  %gep_ld = getelementptr inbounds i64, ptr %A, i64 %rem
-  %ld = load i64, ptr %gep_ld, align 8
-  %calc = add nsw i64 %ld, 42
-  %gep_st = getelementptr inbounds i64, ptr %B, i64 %iv
-  store i64 %calc, ptr %gep_st, align 8
-  %iv_next = add nsw i64 %iv, 1
-  %cond = icmp eq i64 %iv_next, 1000
-  br i1 %cond, label %exit, label %loop
-exit:
-  ret void
-}
-
-; for (iv = 0 ; ; iv += 1) B[iv] = A[((iv++)/2)%3];
-define void @ld_div2_urem3_2(ptr noalias %A, ptr noalias %B) {
-; CHECK-LABEL: define void @ld_div2_urem3_2
-; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <8 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT:    [[TMP2:%.*]] = udiv <8 x i64> [[TMP1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP3:%.*]] = urem <8 x i64> [[TMP2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i64> [[TMP3]], i32 3
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i64> [[TMP3]], i32 4
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i64> [[TMP3]], i32 5
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i64> [[TMP3]], i32 6
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i64> [[TMP3]], i32 7
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP15]], align 8
-; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP17]], align 8
-; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP19]], align 8
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i64> poison, i64 [[TMP20]], i32 0
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i64> [[TMP28]], i64 [[TMP21]], i32 1
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i64> [[TMP29]], i64 [[TMP22]], i32 2
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 3
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i64> [[TMP31]], i64 [[TMP24]], i32 4
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x i64> [[TMP32]], i64 [[TMP25]], i32 5
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <8 x i64> [[TMP33]], i64 [[TMP26]], i32 6
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <8 x i64> [[TMP34]], i64 [[TMP27]], i32 7
-; CHECK-NEXT:    [[TMP36:%.*]] = add nsw <8 x i64> [[TMP35]], splat (i64 42)
-; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i64, ptr [[TMP37]], i32 0
-; CHECK-NEXT:    store <8 x i64> [[TMP36]], ptr [[TMP38]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
-; CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV_NEXT]], 2
-; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[DIV]], 3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[REM]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -198,65 +166,50 @@ define void @ld_div4(ptr noalias %A, ptr noalias %B) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP15]], align 8
-; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x i64> poison, i64 [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 1
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 2
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i64> [[TMP28]], i64 [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i64> [[TMP29]], i64 [[TMP22]], i32 4
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 5
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i64> [[TMP31]], i64 [[TMP24]], i32 6
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x i64> [[TMP32]], i64 [[TMP25]], i32 7
-; CHECK-NEXT:    [[TMP34:%.*]] = add nsw <8 x i64> [[TMP33]], splat (i64 42)
-; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i64, ptr [[TMP35]], i32 0
-; CHECK-NEXT:    store <8 x i64> [[TMP34]], ptr [[TMP36]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x i64> poison, i64 [[TMP17]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 2
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 3
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i64> [[TMP28]], i64 [[TMP21]], i32 4
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i64> [[TMP29]], i64 [[TMP22]], i32 5
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 6
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x i64> [[TMP31]], i64 [[TMP24]], i32 7
+; CHECK-NEXT:    [[TMP33:%.*]] = add nsw <8 x i64> [[TMP32]], splat (i64 42)
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[TMP34]], i32 0
+; CHECK-NEXT:    store <8 x i64> [[TMP33]], ptr [[TMP35]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
-; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 4
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[DIV]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -286,38 +239,22 @@ define void @ld_div8_urem3(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 42
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = urem i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], 42
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TMP4]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    store <8 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <8 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[IV]], 8
-; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[DIV]], 3
-; CHECK-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[REM]]
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; CHECK-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; CHECK-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
index 6b501905c33d..1f33f7a15edd 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "scalar.ph:" --version 2
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 %s -S | FileCheck --check-prefixes=VF2 %s
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 %s -S | FileCheck --check-prefixes=VF4 %s
 
@@ -28,21 +28,6 @@ define void @ld_lshr0_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 0
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr0_step1_start0_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -66,21 +51,6 @@ define void @ld_lshr0_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 0
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -125,21 +95,6 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr1_step1_start0_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -150,49 +105,34 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[TMP1:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1)
-; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
-; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1
-; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP3]], align 8
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0
-; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 1
-; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 2
-; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 3
-; VF4-NEXT:    [[TMP18:%.*]] = add nsw <4 x i64> [[TMP17]], splat (i64 42)
-; VF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i32 0
-; VF4-NEXT:    store <4 x i64> [[TMP18]], ptr [[TMP20]], align 8
+; VF4-NEXT:    [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1)
+; VF4-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0
+; VF4-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 2
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 3
+; VF4-NEXT:    [[TMP17:%.*]] = add nsw <4 x i64> [[TMP16]], splat (i64 42)
+; VF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP18]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP17]], ptr [[TMP19]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -222,36 +162,21 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[TMP1:%.*]] = lshr i64 [[INDEX]], 2
-; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; VF2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], 42
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
+; VF2-NEXT:    [[TMP0:%.*]] = lshr i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
+; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
 ; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; VF2-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; VF2-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 2
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr2_step1_start0_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -261,36 +186,21 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[TMP1:%.*]] = lshr i64 [[INDEX]], 2
-; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
-; VF4-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], 42
-; VF4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; VF4-NEXT:    [[TMP0:%.*]] = lshr i64 [[INDEX]], 2
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
+; VF4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0
 ; VF4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; VF4-NEXT:    store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
+; VF4-NEXT:    store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 2
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -346,21 +256,6 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 0
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr0_step2_start0_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -413,21 +308,6 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 0
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -476,21 +356,6 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr1_step2_start0_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -528,21 +393,6 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -598,21 +448,6 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 0
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr0_step3_start0_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -665,21 +500,6 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 0
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -735,21 +555,6 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr1_step3_start0_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -802,21 +607,6 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -848,41 +638,26 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; VF2-NEXT:    [[TMP1:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1)
-; VF2-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
-; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP3]], align 8
-; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
-; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP7]], i32 1
-; VF2-NEXT:    [[TMP10:%.*]] = add nsw <2 x i64> [[TMP9]], splat (i64 42)
-; VF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
-; VF2-NEXT:    store <2 x i64> [[TMP10]], ptr [[TMP12]], align 8
+; VF2-NEXT:    [[TMP0:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1)
+; VF2-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
+; VF2-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
+; VF2-NEXT:    [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
+; VF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; VF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; VF2-NEXT:    store <2 x i64> [[TMP9]], ptr [[TMP11]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; VF2-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF2-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; VF2-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 999, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr1_step1_start1_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -894,49 +669,34 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; VF4-NEXT:    [[TMP1:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1)
-; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
-; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1
-; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP3]], align 8
-; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0
-; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 1
-; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 2
-; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 3
-; VF4-NEXT:    [[TMP18:%.*]] = add nsw <4 x i64> [[TMP17]], splat (i64 42)
-; VF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i32 0
-; VF4-NEXT:    store <4 x i64> [[TMP18]], ptr [[TMP20]], align 8
+; VF4-NEXT:    [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1)
+; VF4-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8
+; VF4-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0
+; VF4-NEXT:    [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 2
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 3
+; VF4-NEXT:    [[TMP17:%.*]] = add nsw <4 x i64> [[TMP16]], splat (i64 42)
+; VF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; VF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP18]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP17]], ptr [[TMP19]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; VF4-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF4-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; VF4-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -986,21 +746,6 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr1_step2_start1_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1039,21 +784,6 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 993, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1110,21 +840,6 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr1_step3_start1_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1178,21 +893,6 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 1
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1249,21 +949,6 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 2
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_lshr2_step3_start1_ind1
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1317,21 +1002,6 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[LSHR:%.*]] = lshr i64 [[IV]], 2
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[LSHR]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
index b3af23c49637..ef6255720d73 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "scalar.ph:" --version 2
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 %s -S | FileCheck --check-prefix=VF2 %s
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 %s -S | FileCheck --check-prefix=VF4 %s
 
@@ -13,50 +13,30 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
-; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 1)
-; VF2-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
-; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
-; VF2-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; VF2-NEXT:    store <2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
+; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
+; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
+; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
+; VF2-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
+; VF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
+; VF2-NEXT:    store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
-; VF2-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
+; VF2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div1_step1_start0_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -67,58 +47,38 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 1)
-; VF4-NEXT:    [[TMP2:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 1)
-; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
-; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
-; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
-; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
-; VF4-NEXT:    [[TMP20:%.*]] = add nsw <4 x i64> [[TMP19]], splat (i64 42)
-; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
-; VF4-NEXT:    store <4 x i64> [[TMP20]], ptr [[TMP22]], align 8
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 1)
+; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 2
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
+; VF4-NEXT:    [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
+; VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
-; VF4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
+; VF4-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -152,43 +112,23 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[TMP2:%.*]] = udiv i64 [[INDEX]], 2
-; VF2-NEXT:    [[TMP3:%.*]] = udiv i64 [[INDEX]], 2
-; VF2-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], [[TMP3]]
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP6]], 42
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i64 0
+; VF2-NEXT:    [[TMP0:%.*]] = udiv i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP1:%.*]] = udiv i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP2:%.*]] = add i64 [[TMP0]], [[TMP1]]
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; VF2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
+; VF2-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 42
+; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
 ; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; VF2-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], align 8
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; VF2-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div2_step1_start0_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -199,58 +139,38 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 2)
-; VF4-NEXT:    [[TMP2:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 2)
-; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
-; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
-; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
-; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
-; VF4-NEXT:    [[TMP20:%.*]] = add nsw <4 x i64> [[TMP19]], splat (i64 42)
-; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
-; VF4-NEXT:    store <4 x i64> [[TMP20]], ptr [[TMP22]], align 8
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 2)
+; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 2
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
+; VF4-NEXT:    [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
+; VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
-; VF4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
+; VF4-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -284,50 +204,30 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
-; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 3)
-; VF2-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
-; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
-; VF2-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; VF2-NEXT:    store <2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
+; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
+; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
+; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
+; VF2-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
+; VF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
+; VF2-NEXT:    store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
-; VF2-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
+; VF2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div3_step1_start0_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -338,58 +238,38 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 3)
-; VF4-NEXT:    [[TMP2:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 3)
-; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
-; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
-; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
-; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
-; VF4-NEXT:    [[TMP20:%.*]] = add nsw <4 x i64> [[TMP19]], splat (i64 42)
-; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
-; VF4-NEXT:    store <4 x i64> [[TMP20]], ptr [[TMP22]], align 8
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 3)
+; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 2
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
+; VF4-NEXT:    [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
+; VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
-; VF4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
+; VF4-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -423,12 +303,12 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
-; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 1)
+; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
 ; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
@@ -447,32 +327,12 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; VF2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div1_step2_start0_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -483,14 +343,14 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 2, i64 4, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF4-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP4:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 1)
-; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 1)
+; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
 ; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
@@ -523,32 +383,12 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; VF4-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -582,12 +422,12 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
 ; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
@@ -606,32 +446,12 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; VF2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div2_step2_start0_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -642,14 +462,14 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 2, i64 4, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF4-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP4:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 2)
-; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 2)
+; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
 ; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
@@ -682,32 +502,12 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; VF4-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -741,12 +541,12 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
-; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 3)
+; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
 ; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
@@ -765,32 +565,12 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; VF2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div3_step2_start0_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -801,14 +581,14 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 2, i64 4, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF4-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP4:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 3)
-; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 3)
+; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
 ; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
@@ -841,32 +621,12 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; VF4-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -900,12 +660,12 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
-; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 1)
+; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
 ; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
@@ -924,32 +684,12 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div1_step3_start0_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -960,14 +700,14 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 3, i64 6, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3
 ; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF4-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP4:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 1)
-; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 1)
+; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
 ; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
@@ -1000,32 +740,12 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF4-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1059,12 +779,12 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
 ; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
@@ -1083,32 +803,12 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div2_step3_start0_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1119,14 +819,14 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 3, i64 6, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3
 ; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF4-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP4:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 2)
-; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 2)
+; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
 ; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
@@ -1159,32 +859,12 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF4-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1218,12 +898,12 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3
 ; VF2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
-; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 3)
+; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
 ; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
@@ -1242,32 +922,12 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP17]], ptr [[TMP15]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF2-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div3_step3_start0_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1278,14 +938,14 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 3, i64 6, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3
 ; VF4-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF4-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP4:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 3)
-; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 3)
+; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
 ; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
 ; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
@@ -1318,32 +978,12 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP31]], ptr [[TMP27]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF4-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1377,51 +1017,31 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
-; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 1)
-; VF2-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
-; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
-; VF2-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; VF2-NEXT:    store <2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; VF2-NEXT:    [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
+; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
+; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
+; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
+; VF2-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
+; VF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
+; VF2-NEXT:    store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
-; VF2-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; VF2-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
+; VF2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; VF2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 999, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 998, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div1_step1_start1_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1432,59 +1052,39 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 1)
-; VF4-NEXT:    [[TMP2:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 1)
-; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
-; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
-; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
-; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
-; VF4-NEXT:    [[TMP20:%.*]] = add nsw <4 x i64> [[TMP19]], splat (i64 42)
-; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
-; VF4-NEXT:    store <4 x i64> [[TMP20]], ptr [[TMP22]], align 8
+; VF4-NEXT:    [[TMP0:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 1)
+; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 2
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
+; VF4-NEXT:    [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
+; VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
-; VF4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; VF4-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
+; VF4-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; VF4-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1518,51 +1118,31 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 2)
-; VF2-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
-; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
-; VF2-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; VF2-NEXT:    store <2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; VF2-NEXT:    [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2)
+; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
+; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
+; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
+; VF2-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
+; VF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
+; VF2-NEXT:    store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
-; VF2-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; VF2-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
+; VF2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; VF2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 999, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 998, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div2_step1_start1_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1573,59 +1153,39 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 2)
-; VF4-NEXT:    [[TMP2:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 2)
-; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
-; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
-; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
-; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
-; VF4-NEXT:    [[TMP20:%.*]] = add nsw <4 x i64> [[TMP19]], splat (i64 42)
-; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
-; VF4-NEXT:    store <4 x i64> [[TMP20]], ptr [[TMP22]], align 8
+; VF4-NEXT:    [[TMP0:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 2)
+; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 2
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
+; VF4-NEXT:    [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
+; VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
-; VF4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; VF4-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
+; VF4-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; VF4-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1659,51 +1219,31 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
-; VF2-NEXT:    [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 3)
-; VF2-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; VF2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
-; VF2-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
-; VF2-NEXT:    [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42)
-; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; VF2-NEXT:    store <2 x i64> [[TMP12]], ptr [[TMP14]], align 8
+; VF2-NEXT:    [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
+; VF2-NEXT:    [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]]
+; VF2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
+; VF2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
+; VF2-NEXT:    [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
+; VF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; VF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
+; VF2-NEXT:    store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
-; VF2-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; VF2-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
+; VF2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; VF2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 999, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 998, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div3_step1_start1_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1714,59 +1254,39 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 3)
-; VF4-NEXT:    [[TMP2:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 3)
-; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
-; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
-; VF4-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP7]], align 8
-; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP9]], align 8
-; VF4-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
-; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
-; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
-; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
-; VF4-NEXT:    [[TMP20:%.*]] = add nsw <4 x i64> [[TMP19]], splat (i64 42)
-; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
-; VF4-NEXT:    store <4 x i64> [[TMP20]], ptr [[TMP22]], align 8
+; VF4-NEXT:    [[TMP0:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 3)
+; VF4-NEXT:    [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]]
+; VF4-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; VF4-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; VF4-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
+; VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
+; VF4-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
+; VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
+; VF4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8
+; VF4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF4-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1
+; VF4-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 2
+; VF4-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
+; VF4-NEXT:    [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
+; VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
-; VF4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; VF4-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
+; VF4-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; VF4-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1800,13 +1320,13 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 2
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
-; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 1)
+; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
@@ -1825,32 +1345,12 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
 ; VF2-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 498, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div1_step2_start1_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -1861,7 +1361,7 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 3, i64 5, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 2
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
@@ -1869,7 +1369,7 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 4
 ; VF4-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 1)
-; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 1)
+; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
 ; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
@@ -1902,32 +1402,12 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496
 ; VF4-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 993, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 496, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -1961,13 +1441,13 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 2
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
@@ -1986,32 +1466,12 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
 ; VF2-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 498, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div2_step2_start1_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -2022,7 +1482,7 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 3, i64 5, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 2
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
@@ -2030,7 +1490,7 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 4
 ; VF4-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 2)
-; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 2)
+; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
 ; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
@@ -2063,32 +1523,12 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496
 ; VF4-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 993, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 496, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -2122,13 +1562,13 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 2
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
-; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 3)
+; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
@@ -2147,32 +1587,12 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
 ; VF2-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 498, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div3_step2_start1_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -2183,7 +1603,7 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 3, i64 5, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 2
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
@@ -2191,7 +1611,7 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 4
 ; VF4-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 3)
-; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 3)
+; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
 ; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
@@ -2224,32 +1644,12 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496
 ; VF4-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 993, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 496, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -2283,13 +1683,13 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1)
-; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 1)
+; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
@@ -2308,32 +1708,12 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF2-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div1_step3_start1_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -2344,7 +1724,7 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 4, i64 7, i64 10>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
@@ -2352,7 +1732,7 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 1)
-; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 1)
+; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
 ; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
@@ -2385,32 +1765,12 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF4-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 1
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 1
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -2444,13 +1804,13 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
@@ -2469,32 +1829,12 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF2-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div2_step3_start1_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -2505,7 +1845,7 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 4, i64 7, i64 10>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
@@ -2513,7 +1853,7 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 2)
-; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 2)
+; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
 ; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
@@ -2546,32 +1886,12 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF4-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 2
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 2
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -2605,13 +1925,13 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2:       vector.body:
 ; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF2-NEXT:    [[VEC_IND2:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF2-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; VF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; VF2-NEXT:    [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3)
-; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND2]], splat (i64 3)
+; VF2-NEXT:    [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF2-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
 ; VF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]]
@@ -2630,32 +1950,12 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    store i64 [[TMP18]], ptr [[TMP16]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6)
-; VF2-NEXT:    [[VEC_IND_NEXT3]] = add <2 x i64> [[VEC_IND2]], splat (i64 2)
+; VF2-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
 ; VF2-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF2-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; VF2:       middle.block:
 ; VF2-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2:       scalar.ph:
-; VF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF2-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF2-NEXT:    br label [[LOOP:%.*]]
-; VF2:       loop:
-; VF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF2-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF2-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF2-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF2-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF2-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF2-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF2-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF2-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF2-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF2-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]]
-; VF2:       exit:
-; VF2-NEXT:    ret void
 ;
 ; VF4-LABEL: define void @ld_div3_step3_start1_ind2
 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
@@ -2666,7 +1966,7 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4:       vector.body:
 ; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 4, i64 7, i64 10>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VF4-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; VF4-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; VF4-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; VF4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
 ; VF4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
@@ -2674,7 +1974,7 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; VF4-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9
 ; VF4-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND]], splat (i64 3)
-; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND2]], splat (i64 3)
+; VF4-NEXT:    [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3)
 ; VF4-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
 ; VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]]
@@ -2707,32 +2007,12 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP28]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12)
-; VF4-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 4)
+; VF4-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; VF4-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
 ; VF4-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; VF4:       middle.block:
 ; VF4-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF4:       scalar.ph:
-; VF4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; VF4-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 332, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; VF4-NEXT:    br label [[LOOP:%.*]]
-; VF4:       loop:
-; VF4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
-; VF4-NEXT:    [[DIV1:%.*]] = udiv i64 [[IV]], 3
-; VF4-NEXT:    [[DIV2:%.*]] = udiv i64 [[IV2]], 3
-; VF4-NEXT:    [[ADD:%.*]] = add i64 [[DIV1]], [[DIV2]]
-; VF4-NEXT:    [[GEP_LD:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[ADD]]
-; VF4-NEXT:    [[LD:%.*]] = load i64, ptr [[GEP_LD]], align 8
-; VF4-NEXT:    [[CALC:%.*]] = add nsw i64 [[LD]], 42
-; VF4-NEXT:    [[GEP_ST:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; VF4-NEXT:    store i64 [[CALC]], ptr [[GEP_ST]], align 8
-; VF4-NEXT:    [[IV2_NEXT]] = add nsw i64 [[IV2]], 1
-; VF4-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; VF4-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; VF4-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]]
-; VF4:       exit:
-; VF4-NEXT:    ret void
 ;
 entry:
   br label %loop

From bae48ac3c0e6f406038833199b185493a67ee08b Mon Sep 17 00:00:00 2001
From: Yijia Gu <yijiagu@google.com>
Date: Fri, 20 Jun 2025 11:14:14 -0700
Subject: [PATCH 1092/1322] [mlir][bazel] add missing deps for XeGPUTransforms

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 761a93ea7dfa..175d2d57d116 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3526,6 +3526,7 @@ cc_library(
         ":Analysis",
         ":ArithDialect",
         ":ArithUtils",
+        ":ControlFlowInterfaces",
         ":DialectUtils",
         ":FunctionInterfaces",
         ":GPUDialect",
@@ -3538,6 +3539,7 @@ cc_library(
         ":MemRefDialect",
         ":Pass",
         ":SCFTransforms",
+        ":Support",
         ":TransformUtils",
         ":VectorDialect",
         ":VectorTransforms",

From ff6367b47071a7d80e773127e2a798c087e81ff5 Mon Sep 17 00:00:00 2001
From: Diego Caballero <dieg0ca6aller0@gmail.com>
Date: Fri, 20 Jun 2025 11:16:46 -0700
Subject: [PATCH 1093/1322] [[mlir][Vector] Add simple folders for
 `vector.from_element`/`vector.to_elements` (#144444)

This PR adds simple folders to remove no-op sequences of
`vector.from_elements` and `vector.to_elements`.
---
 .../mlir/Dialect/Vector/IR/VectorOps.td       |  2 +
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp      | 85 +++++++++++++++++++
 mlir/test/Dialect/Vector/canonicalize.mlir    | 52 ++++++++++++
 3 files changed, 139 insertions(+)

diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 85cc22ab3964..aef156c5f1d0 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -836,6 +836,7 @@ def Vector_ToElementsOp : Vector_Op<"to_elements", [
   let arguments = (ins AnyVectorOfAnyRank:$source);
   let results = (outs Variadic<AnyType>:$elements);
   let assemblyFormat = "$source attr-dict `:` type($source)";
+  let hasFolder = 1;
 }
 
 def Vector_FromElementsOp : Vector_Op<"from_elements", [
@@ -873,6 +874,7 @@ def Vector_FromElementsOp : Vector_Op<"from_elements", [
   let arguments = (ins Variadic<AnyType>:$elements);
   let results = (outs AnyFixedVectorOfAnyRank:$dest);
   let assemblyFormat = "$elements attr-dict `:` type($dest)";
+  let hasFolder = 1;
   let hasCanonicalizer = 1;
 }
 
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index e576eeac2365..6f0ac6bb5828 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -2373,10 +2373,95 @@ std::optional<SmallVector<int64_t, 4>> FMAOp::getShapeForUnroll() {
   return llvm::to_vector<4>(getVectorType().getShape());
 }
 
+//===----------------------------------------------------------------------===//
+// ToElementsOp
+//===----------------------------------------------------------------------===//
+
+/// Returns true if all the `operands` are defined by `defOp`.
+/// Otherwise, returns false.
+static bool haveSameDefiningOp(OperandRange operands, Operation *defOp) {
+  if (operands.empty())
+    return false;
+
+  return llvm::all_of(operands, [&](Value operand) {
+    Operation *currentDef = operand.getDefiningOp();
+    return currentDef == defOp;
+  });
+}
+
+/// Folds vector.to_elements(vector.from_elements(%e0, %e1, ...)) into
+/// (%e0, %e1, ...). For example:
+///
+///   %0 = vector.from_elements %a, %b, %c : vector<3xf32>
+///   %1:3 = vector.to_elements %0 : vector<3xf32>
+///   user_op %1#0, %1#1, %1#2
+///
+/// becomes:
+///
+///   user_op %a, %b, %c
+///
+static LogicalResult
+foldToElementsFromElements(ToElementsOp toElementsOp,
+                           SmallVectorImpl<OpFoldResult> &results) {
+  auto fromElementsOp =
+      toElementsOp.getSource().getDefiningOp<FromElementsOp>();
+  if (!fromElementsOp)
+    return failure();
+
+  llvm::append_range(results, fromElementsOp.getElements());
+  return success();
+}
+
+LogicalResult ToElementsOp::fold(FoldAdaptor adaptor,
+                                 SmallVectorImpl<OpFoldResult> &results) {
+  return foldToElementsFromElements(*this, results);
+}
+
 //===----------------------------------------------------------------------===//
 // FromElementsOp
 //===----------------------------------------------------------------------===//
 
+/// Folds vector.from_elements(vector.to_elements(%vector)) into %vector.
+///
+/// Case #1: Input and output vectors are the same.
+///
+///   %0:3 = vector.to_elements %a : vector<3xf32>
+///   %1 = vector.from_elements %0#0, %0#1, %0#2 : vector<3xf32>
+///   user_op %1
+///
+/// becomes:
+///
+///   user_op %a
+///
+static OpFoldResult foldFromElementsToElements(FromElementsOp fromElementsOp) {
+  OperandRange fromElemsOperands = fromElementsOp.getElements();
+  if (fromElemsOperands.empty())
+    return {};
+
+  auto toElementsOp = fromElemsOperands[0].getDefiningOp<ToElementsOp>();
+  if (!toElementsOp)
+    return {};
+
+  if (!haveSameDefiningOp(fromElemsOperands, toElementsOp))
+    return {};
+
+  // Case #1: Input and output vectors are the same. Forward the input vector.
+  Value toElementsInput = toElementsOp.getSource();
+  if (fromElementsOp.getType() == toElementsInput.getType() &&
+      llvm::equal(fromElemsOperands, toElementsOp.getResults())) {
+    return toElementsInput;
+  }
+
+  // TODO: Support cases with different input and output shapes and different
+  // number of elements.
+
+  return {};
+}
+
+OpFoldResult FromElementsOp::fold(FoldAdaptor adaptor) {
+  return foldFromElementsToElements(*this);
+}
+
 /// Rewrite a vector.from_elements into a vector.splat if all elements are the
 /// same SSA value. E.g.:
 ///
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index 6691cb52acdc..65b73375831d 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -3023,6 +3023,58 @@ func.func @extract_from_0d_splat_broadcast_regression(%a: f32, %b: vector<f32>,
 
 // -----
 
+// CHECK-LABEL: func @to_elements_from_elements_no_op(
+// CHECK-SAME:     %[[A:.*]]: f32, %[[B:.*]]: f32
+func.func @to_elements_from_elements_no_op(%a: f32, %b: f32) -> (f32, f32) {
+  // CHECK-NOT: vector.from_elements
+  // CHECK-NOT: vector.to_elements
+  %0 = vector.from_elements %b, %a : vector<2xf32>
+  %1:2 = vector.to_elements %0 : vector<2xf32>
+  // CHECK: return %[[B]], %[[A]]
+  return %1#0, %1#1 : f32, f32
+}
+
+// -----
+
+// CHECK-LABEL: func @from_elements_to_elements_no_op(
+// CHECK-SAME:     %[[A:.*]]: vector<4x2xf32>
+func.func @from_elements_to_elements_no_op(%a: vector<4x2xf32>) -> vector<4x2xf32> {
+  // CHECK-NOT: vector.from_elements
+  // CHECK-NOT: vector.to_elements
+  %0:8 = vector.to_elements %a : vector<4x2xf32>
+  %1 = vector.from_elements %0#0, %0#1, %0#2, %0#3, %0#4, %0#5, %0#6, %0#7 : vector<4x2xf32>
+  // CHECK: return %[[A]]
+  return %1 : vector<4x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @from_elements_to_elements_dup_elems(
+// CHECK-SAME:     %[[A:.*]]: vector<4xf32>
+func.func @from_elements_to_elements_dup_elems(%a: vector<4xf32>) -> vector<4x2xf32> {
+  // CHECK: %[[TO_EL:.*]]:4 = vector.to_elements %[[A]]
+  // CHECK: %[[FROM_EL:.*]] = vector.from_elements %[[TO_EL]]#0, %[[TO_EL]]#1, %[[TO_EL]]#2
+  %0:4 = vector.to_elements %a : vector<4xf32> // 4 elements
+  %1 = vector.from_elements %0#0, %0#1, %0#2, %0#3, %0#0, %0#1, %0#2, %0#3 : vector<4x2xf32>
+  // CHECK: return %[[FROM_EL]]
+  return %1 : vector<4x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @from_elements_to_elements_shuffle(
+// CHECK-SAME:     %[[A:.*]]: vector<4x2xf32>
+func.func @from_elements_to_elements_shuffle(%a: vector<4x2xf32>) -> vector<4x2xf32> {
+  // CHECK: %[[TO_EL:.*]]:8 = vector.to_elements %[[A]]
+  // CHECK: %[[FROM_EL:.*]] = vector.from_elements %[[TO_EL]]#7, %[[TO_EL]]#0, %[[TO_EL]]#6
+  %0:8 = vector.to_elements %a : vector<4x2xf32>
+  %1 = vector.from_elements %0#7, %0#0, %0#6, %0#1, %0#5, %0#2, %0#4, %0#3 : vector<4x2xf32>
+  // CHECK: return %[[FROM_EL]]
+  return %1 : vector<4x2xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @vector_insert_const_regression(
 //       CHECK:   llvm.mlir.undef
 //       CHECK:   vector.insert

From 521adc9fa270c1524f125f155327bf8f3743bb87 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Fri, 20 Jun 2025 19:17:58 +0100
Subject: [PATCH 1094/1322] [VPlan] Use createScalarZExtOrTrunc when expanding
 expandVPWidenIntOrFpInduction

Split off from #144666
---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp         | 4 ++--
 llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b6ca50549fa3..fbfc527ff7db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2608,8 +2608,8 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
       VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
                                     DL);
     else
-      VF =
-          Builder.createScalarCast(Instruction::CastOps::Trunc, VF, StepTy, DL);
+      VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
+                                           TypeInfo.inferScalarType(VF), DL);
 
     Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
     Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 7654bc9a141e..49a7fb734ade 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -96,8 +96,7 @@ define void @iv_expand(ptr %p, i64 %n) {
 ; CHECK-NEXT:     EMIT vp<[[BROADCAST_1:%.+]]> = broadcast ir<1>
 ; CHECK-NEXT:     EMIT vp<[[MUL:%.+]]> = mul vp<[[STEP_VECTOR]]>, vp<[[BROADCAST_1]]>
 ; CHECK-NEXT:     EMIT vp<[[INDUCTION:%.+]]> = add vp<[[BROADCAST_0]]>, vp<[[MUL]]>
-; CHECK-NEXT:     EMIT vp<[[TRUNC:%.+]]> = trunc ir<8> to i64
-; CHECK-NEXT:     EMIT vp<[[INC:%.+]]> = mul ir<1>, vp<[[TRUNC]]>
+; CHECK-NEXT:     EMIT vp<[[INC:%.+]]> = mul ir<1>, ir<8>
 ; CHECK-NEXT:     EMIT vp<[[BROADCAST_INC:%.+]]> = broadcast vp<[[INC]]>
 ; CHECK-NEXT: Successor(s): vector.body
 ; CHECK-EMPTY:

From 3f42c6bddd2495710331c82ce117ee7d5a58856d Mon Sep 17 00:00:00 2001
From: "Deric C." <cheung.deric@gmail.com>
Date: Fri, 20 Jun 2025 11:20:30 -0700
Subject: [PATCH 1095/1322] [DirectX] Scalarize `extractelement` and
 `insertelement` with dynamic indices (#141676)

Fixes #141136

- Implement `visitExtractElementInst` and `visitInsertElementInst` in
`DXILDataScalarizerVisitor` to scalarize `extractelement` and
`insertelement` instructions whose index operand is not a `ConstantInt`
by converting the vector to an array and then loading from the array
- Rename the `replaceVectorWithArray` helper function to
`equivalentArrayTypeFromVector`, relocate the function toward the top of
the file, and remove the unused `Ctx` parameter
---
 .../Target/DirectX/DXILDataScalarization.cpp  | 170 ++++++++++++++--
 .../DirectX/scalarize-dynamic-vector-index.ll | 182 ++++++++++++++++++
 2 files changed, 331 insertions(+), 21 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index 06708cec00ce..286fd0c5bb97 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -27,6 +27,20 @@ static const int MaxVecSize = 4;
 
 using namespace llvm;
 
+// Recursively creates an array-like version of a given vector type.
+static Type *equivalentArrayTypeFromVector(Type *T) {
+  if (auto *VecTy = dyn_cast<VectorType>(T))
+    return ArrayType::get(VecTy->getElementType(),
+                          dyn_cast<FixedVectorType>(VecTy)->getNumElements());
+  if (auto *ArrayTy = dyn_cast<ArrayType>(T)) {
+    Type *NewElementType =
+        equivalentArrayTypeFromVector(ArrayTy->getElementType());
+    return ArrayType::get(NewElementType, ArrayTy->getNumElements());
+  }
+  // If it's not a vector or array, return the original type.
+  return T;
+}
+
 class DXILDataScalarizationLegacy : public ModulePass {
 
 public:
@@ -54,8 +68,8 @@ public:
   bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
   bool visitCastInst(CastInst &CI) { return false; }
   bool visitBitCastInst(BitCastInst &BCI) { return false; }
-  bool visitInsertElementInst(InsertElementInst &IEI) { return false; }
-  bool visitExtractElementInst(ExtractElementInst &EEI) { return false; }
+  bool visitInsertElementInst(InsertElementInst &IEI);
+  bool visitExtractElementInst(ExtractElementInst &EEI);
   bool visitShuffleVectorInst(ShuffleVectorInst &SVI) { return false; }
   bool visitPHINode(PHINode &PHI) { return false; }
   bool visitLoadInst(LoadInst &LI);
@@ -65,6 +79,16 @@ public:
   friend bool findAndReplaceVectors(llvm::Module &M);
 
 private:
+  typedef std::pair<AllocaInst *, SmallVector<Value *, 4>> AllocaAndGEPs;
+  typedef SmallDenseMap<Value *, AllocaAndGEPs>
+      VectorToArrayMap; // A map from a vector-typed Value to its corresponding
+                        // AllocaInst and GEPs to each element of an array
+  VectorToArrayMap VectorAllocaMap;
+  AllocaAndGEPs createArrayFromVector(IRBuilder<> &Builder, Value *Vec,
+                                      const Twine &Name);
+  bool replaceDynamicInsertElementInst(InsertElementInst &IEI);
+  bool replaceDynamicExtractElementInst(ExtractElementInst &EEI);
+
   GlobalVariable *lookupReplacementGlobal(Value *CurrOperand);
   DenseMap<GlobalVariable *, GlobalVariable *> GlobalMap;
 };
@@ -76,6 +100,7 @@ bool DataScalarizerVisitor::visit(Function &F) {
     for (Instruction &I : make_early_inc_range(*BB))
       MadeChange |= InstVisitor::visit(I);
   }
+  VectorAllocaMap.clear();
   return MadeChange;
 }
 
@@ -90,20 +115,6 @@ DataScalarizerVisitor::lookupReplacementGlobal(Value *CurrOperand) {
   return nullptr; // Not found
 }
 
-// Recursively creates an array version of the given vector type.
-static Type *replaceVectorWithArray(Type *T, LLVMContext &Ctx) {
-  if (auto *VecTy = dyn_cast<VectorType>(T))
-    return ArrayType::get(VecTy->getElementType(),
-                          dyn_cast<FixedVectorType>(VecTy)->getNumElements());
-  if (auto *ArrayTy = dyn_cast<ArrayType>(T)) {
-    Type *NewElementType =
-        replaceVectorWithArray(ArrayTy->getElementType(), Ctx);
-    return ArrayType::get(NewElementType, ArrayTy->getNumElements());
-  }
-  // If it's not a vector or array, return the original type.
-  return T;
-}
-
 static bool isArrayOfVectors(Type *T) {
   if (ArrayType *ArrType = dyn_cast<ArrayType>(T))
     return isa<VectorType>(ArrType->getElementType());
@@ -116,8 +127,7 @@ bool DataScalarizerVisitor::visitAllocaInst(AllocaInst &AI) {
 
   ArrayType *ArrType = cast<ArrayType>(AI.getAllocatedType());
   IRBuilder<> Builder(&AI);
-  LLVMContext &Ctx = AI.getContext();
-  Type *NewType = replaceVectorWithArray(ArrType, Ctx);
+  Type *NewType = equivalentArrayTypeFromVector(ArrType);
   AllocaInst *ArrAlloca =
       Builder.CreateAlloca(NewType, nullptr, AI.getName() + ".scalarize");
   ArrAlloca->setAlignment(AI.getAlign());
@@ -173,6 +183,124 @@ bool DataScalarizerVisitor::visitStoreInst(StoreInst &SI) {
   return false;
 }
 
+DataScalarizerVisitor::AllocaAndGEPs
+DataScalarizerVisitor::createArrayFromVector(IRBuilder<> &Builder, Value *Vec,
+                                             const Twine &Name = "") {
+  // If there is already an alloca for this vector, return it
+  if (VectorAllocaMap.contains(Vec))
+    return VectorAllocaMap[Vec];
+
+  auto InsertPoint = Builder.GetInsertPoint();
+
+  // Allocate the array to hold the vector elements
+  Builder.SetInsertPointPastAllocas(Builder.GetInsertBlock()->getParent());
+  Type *ArrTy = equivalentArrayTypeFromVector(Vec->getType());
+  AllocaInst *ArrAlloca =
+      Builder.CreateAlloca(ArrTy, nullptr, Name + ".alloca");
+  const uint64_t ArrNumElems = ArrTy->getArrayNumElements();
+
+  // Create loads and stores to populate the array immediately after the
+  // original vector's defining instruction if available, else immediately after
+  // the alloca
+  if (auto *Instr = dyn_cast<Instruction>(Vec))
+    Builder.SetInsertPoint(Instr->getNextNonDebugInstruction());
+  SmallVector<Value *, 4> GEPs(ArrNumElems);
+  for (unsigned I = 0; I < ArrNumElems; ++I) {
+    Value *EE = Builder.CreateExtractElement(Vec, I, Name + ".extract");
+    GEPs[I] = Builder.CreateInBoundsGEP(
+        ArrTy, ArrAlloca, {Builder.getInt32(0), Builder.getInt32(I)},
+        Name + ".index");
+    Builder.CreateStore(EE, GEPs[I]);
+  }
+
+  VectorAllocaMap.insert({Vec, {ArrAlloca, GEPs}});
+  Builder.SetInsertPoint(InsertPoint);
+  return {ArrAlloca, GEPs};
+}
+
+/// Returns a pair of Value* with the first being a GEP into ArrAlloca using
+/// indices {0, Index}, and the second Value* being a Load of the GEP
+static std::pair<Value *, Value *>
+dynamicallyLoadArray(IRBuilder<> &Builder, AllocaInst *ArrAlloca, Value *Index,
+                     const Twine &Name = "") {
+  Type *ArrTy = ArrAlloca->getAllocatedType();
+  Value *GEP = Builder.CreateInBoundsGEP(
+      ArrTy, ArrAlloca, {Builder.getInt32(0), Index}, Name + ".index");
+  Value *Load =
+      Builder.CreateLoad(ArrTy->getArrayElementType(), GEP, Name + ".load");
+  return std::make_pair(GEP, Load);
+}
+
+bool DataScalarizerVisitor::replaceDynamicInsertElementInst(
+    InsertElementInst &IEI) {
+  IRBuilder<> Builder(&IEI);
+
+  Value *Vec = IEI.getOperand(0);
+  Value *Val = IEI.getOperand(1);
+  Value *Index = IEI.getOperand(2);
+
+  AllocaAndGEPs ArrAllocaAndGEPs =
+      createArrayFromVector(Builder, Vec, IEI.getName());
+  AllocaInst *ArrAlloca = ArrAllocaAndGEPs.first;
+  Type *ArrTy = ArrAlloca->getAllocatedType();
+  SmallVector<Value *, 4> &ArrGEPs = ArrAllocaAndGEPs.second;
+
+  auto GEPAndLoad =
+      dynamicallyLoadArray(Builder, ArrAlloca, Index, IEI.getName());
+  Value *GEP = GEPAndLoad.first;
+  Value *Load = GEPAndLoad.second;
+
+  Builder.CreateStore(Val, GEP);
+  Value *NewIEI = PoisonValue::get(Vec->getType());
+  for (unsigned I = 0; I < ArrTy->getArrayNumElements(); ++I) {
+    Value *Load = Builder.CreateLoad(ArrTy->getArrayElementType(), ArrGEPs[I],
+                                     IEI.getName() + ".load");
+    NewIEI = Builder.CreateInsertElement(NewIEI, Load, Builder.getInt32(I),
+                                         IEI.getName() + ".insert");
+  }
+
+  // Store back the original value so the Alloca can be reused for subsequent
+  // insertelement instructions on the same vector
+  Builder.CreateStore(Load, GEP);
+
+  IEI.replaceAllUsesWith(NewIEI);
+  IEI.eraseFromParent();
+  return true;
+}
+
+bool DataScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
+  // If the index is a constant then we don't need to scalarize it
+  Value *Index = IEI.getOperand(2);
+  if (isa<ConstantInt>(Index))
+    return false;
+  return replaceDynamicInsertElementInst(IEI);
+}
+
+bool DataScalarizerVisitor::replaceDynamicExtractElementInst(
+    ExtractElementInst &EEI) {
+  IRBuilder<> Builder(&EEI);
+
+  AllocaAndGEPs ArrAllocaAndGEPs =
+      createArrayFromVector(Builder, EEI.getVectorOperand(), EEI.getName());
+  AllocaInst *ArrAlloca = ArrAllocaAndGEPs.first;
+
+  auto GEPAndLoad = dynamicallyLoadArray(Builder, ArrAlloca,
+                                         EEI.getIndexOperand(), EEI.getName());
+  Value *Load = GEPAndLoad.second;
+
+  EEI.replaceAllUsesWith(Load);
+  EEI.eraseFromParent();
+  return true;
+}
+
+bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
+  // If the index is a constant then we don't need to scalarize it
+  Value *Index = EEI.getIndexOperand();
+  if (isa<ConstantInt>(Index))
+    return false;
+  return replaceDynamicExtractElementInst(EEI);
+}
+
 bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
 
   unsigned NumOperands = GEPI.getNumOperands();
@@ -197,8 +325,8 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
   return true;
 }
 
-Constant *transformInitializer(Constant *Init, Type *OrigType, Type *NewType,
-                               LLVMContext &Ctx) {
+static Constant *transformInitializer(Constant *Init, Type *OrigType,
+                                      Type *NewType, LLVMContext &Ctx) {
   // Handle ConstantAggregateZero (zero-initialized constants)
   if (isa<ConstantAggregateZero>(Init)) {
     return ConstantAggregateZero::get(NewType);
@@ -257,7 +385,7 @@ static bool findAndReplaceVectors(Module &M) {
   for (GlobalVariable &G : M.globals()) {
     Type *OrigType = G.getValueType();
 
-    Type *NewType = replaceVectorWithArray(OrigType, Ctx);
+    Type *NewType = equivalentArrayTypeFromVector(OrigType);
     if (OrigType != NewType) {
       // Create a new global variable with the updated type
       // Note: Initializer is set via transformInitializer
diff --git a/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
new file mode 100644
index 000000000000..0eb65bd4fc75
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/scalarize-dynamic-vector-index.ll
@@ -0,0 +1,182 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+; Ensure that insertelement instructions have no side effects on each other
+; even in the presence of control flow
+define void @test_multiple_insert(i32 %c, i32 %i, i32 %j) {
+; CHECK-LABEL: define void @test_multiple_insert(
+; CHECK-SAME: i32 [[C:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) {
+; CHECK-NEXT:    [[V0_ALLOCA:%.*]] = alloca [2 x i32], align 4
+; CHECK-NEXT:    [[V_ALLOCA:%.*]] = alloca [2 x i32], align 4
+; CHECK-NEXT:    [[V0_0:%.*]] = insertelement <2 x i32> poison, i32 0, i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <2 x i32> [[V0_0]], i32 0, i32 1
+; CHECK-NEXT:    [[V0_EXTRACT0:%.*]] = extractelement <2 x i32> [[V0]], i64 0
+; CHECK-NEXT:    [[V0_INDEX0:%.*]] = getelementptr inbounds [2 x i32], ptr [[V0_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[V0_EXTRACT0]], ptr [[V0_INDEX0]], align 4
+; CHECK-NEXT:    [[V0_EXTRACT1:%.*]] = extractelement <2 x i32> [[V0]], i64 1
+; CHECK-NEXT:    [[V0_INDEX1:%.*]] = getelementptr inbounds [2 x i32], ptr [[V0_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[V0_EXTRACT1]], ptr [[V0_INDEX1]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[C]], 1
+; CHECK-NEXT:    br i1 [[COND]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK:       [[IF]]:
+; CHECK-NEXT:    [[V1_INDEX:%.*]] = getelementptr inbounds [2 x i32], ptr [[V0_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[V1_LOAD:%.*]] = load i32, ptr [[V1_INDEX]], align 4
+; CHECK-NEXT:    store i32 1, ptr [[V1_INDEX]], align 4
+; CHECK-NEXT:    [[V1_LOAD0:%.*]] = load i32, ptr [[V0_INDEX0]], align 4
+; CHECK-NEXT:    [[V1_INSERT0:%.*]] = insertelement <2 x i32> poison, i32 [[V1_LOAD0]], i32 0
+; CHECK-NEXT:    [[V1_LOAD1:%.*]] = load i32, ptr [[V0_INDEX1]], align 4
+; CHECK-NEXT:    [[V1_INSERT1:%.*]] = insertelement <2 x i32> [[V1_INSERT0]], i32 [[V1_LOAD1]], i32 1
+; CHECK-NEXT:    store i32 [[V1_LOAD]], ptr [[V1_INDEX]], align 4
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[V2_INDEX:%.*]] = getelementptr inbounds [2 x i32], ptr [[V0_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[V2_LOAD:%.*]] = load i32, ptr [[V2_INDEX]], align 4
+; CHECK-NEXT:    store i32 2, ptr [[V2_INDEX]], align 4
+; CHECK-NEXT:    [[V2_LOAD0:%.*]] = load i32, ptr [[V0_INDEX0]], align 4
+; CHECK-NEXT:    [[V2_INSERT0:%.*]] = insertelement <2 x i32> poison, i32 [[V2_LOAD0]], i32 0
+; CHECK-NEXT:    [[V2_LOAD1:%.*]] = load i32, ptr [[V0_INDEX1]], align 4
+; CHECK-NEXT:    [[V2_INSERT1:%.*]] = insertelement <2 x i32> [[V2_INSERT0]], i32 [[V2_LOAD1]], i32 1
+; CHECK-NEXT:    store i32 [[V2_LOAD]], ptr [[V2_INDEX]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[V:%.*]] = phi <2 x i32> [ [[V1_INSERT1]], %[[IF]] ], [ [[V2_INSERT1]], %[[ELSE]] ]
+; CHECK-NEXT:    [[V_EXTRACT:%.*]] = extractelement <2 x i32> [[V]], i64 0
+; CHECK-NEXT:    [[V_INDEX:%.*]] = getelementptr inbounds [2 x i32], ptr [[V_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[V_EXTRACT]], ptr [[V_INDEX]], align 4
+; CHECK-NEXT:    [[V_EXTRACT10:%.*]] = extractelement <2 x i32> [[V]], i64 1
+; CHECK-NEXT:    [[V_INDEX1:%.*]] = getelementptr inbounds [2 x i32], ptr [[V_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[V_EXTRACT10]], ptr [[V_INDEX1]], align 4
+; CHECK-NEXT:    [[V3_INDEXJ:%.*]] = getelementptr inbounds [2 x i32], ptr [[V_ALLOCA]], i32 0, i32 [[J]]
+; CHECK-NEXT:    [[V3_LOAD:%.*]] = load i32, ptr [[V3_INDEXJ]], align 4
+; CHECK-NEXT:    store i32 3, ptr [[V3_INDEXJ]], align 4
+; CHECK-NEXT:    [[V3_LOAD0:%.*]] = load i32, ptr [[V_INDEX]], align 4
+; CHECK-NEXT:    [[V3_INSERT0:%.*]] = insertelement <2 x i32> poison, i32 [[V3_LOAD0]], i32 0
+; CHECK-NEXT:    [[V3_LOAD1:%.*]] = load i32, ptr [[V_INDEX1]], align 4
+; CHECK-NEXT:    [[V3_INSERT1:%.*]] = insertelement <2 x i32> [[V3_INSERT0]], i32 [[V3_LOAD1]], i32 1
+; CHECK-NEXT:    store i32 [[V3_LOAD]], ptr [[V3_INDEXJ]], align 4
+; CHECK-NEXT:    ret void
+;
+  %v0_0 = insertelement <2 x i32> poison, i32 0, i32 0
+  %v0 = insertelement <2 x i32> %v0_0, i32 0, i32 1
+  %cond = icmp eq i32 %c, 1
+  br i1 %cond, label %if, label %else
+if:
+  %v1 = insertelement <2 x i32> %v0, i32 1, i32 %i
+  br label %exit
+else:
+  %v2 = insertelement <2 x i32> %v0, i32 2, i32 %i
+  br label %exit
+exit:
+  %v = phi <2 x i32> [ %v1, %if ], [ %v2, %else ]
+  %v3 = insertelement <2 x i32> %v, i32 3, i32 %j
+  ret void
+}
+
+; Allocas can be reused across insert/extractelement instructions on the same vector
+define void @test_alloca_reuse(<3 x i32> %v, i32 %a, i32 %i) {
+; CHECK-LABEL: define void @test_alloca_reuse(
+; CHECK-SAME: <3 x i32> [[V:%.*]], i32 [[A:%.*]], i32 [[I:%.*]]) {
+; CHECK-NEXT:    [[EE1_ALLOCA:%.*]] = alloca [3 x i32], align 4
+; CHECK-NEXT:    [[EE1_EXTRACT:%.*]] = extractelement <3 x i32> [[V]], i64 0
+; CHECK-NEXT:    [[EE1_INDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[EE1_EXTRACT]], ptr [[EE1_INDEX]], align 4
+; CHECK-NEXT:    [[EE1_EXTRACT1:%.*]] = extractelement <3 x i32> [[V]], i64 1
+; CHECK-NEXT:    [[EE1_INDEX2:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[EE1_EXTRACT1]], ptr [[EE1_INDEX2]], align 4
+; CHECK-NEXT:    [[EE1_EXTRACT3:%.*]] = extractelement <3 x i32> [[V]], i64 2
+; CHECK-NEXT:    [[EE1_INDEX4:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 2
+; CHECK-NEXT:    store i32 [[EE1_EXTRACT3]], ptr [[EE1_INDEX4]], align 4
+; CHECK-NEXT:    [[EE1_INDEX5:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[EE1_LOAD:%.*]] = load i32, ptr [[EE1_INDEX5]], align 4
+; CHECK-NEXT:    [[IE1_DYNINDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[EE1_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[IE1_LOAD1:%.*]] = load i32, ptr [[IE1_DYNINDEX]], align 4
+; CHECK-NEXT:    store i32 [[A]], ptr [[IE1_DYNINDEX]], align 4
+; CHECK-NEXT:    [[IE1_LOAD:%.*]] = load i32, ptr [[EE1_INDEX]], align 4
+; CHECK-NEXT:    [[IE1_INSERT:%.*]] = insertelement <3 x i32> poison, i32 [[IE1_LOAD]], i32 0
+; CHECK-NEXT:    [[IE1_LOAD6:%.*]] = load i32, ptr [[EE1_INDEX2]], align 4
+; CHECK-NEXT:    [[IE1_INSERT7:%.*]] = insertelement <3 x i32> [[IE1_INSERT]], i32 [[IE1_LOAD6]], i32 1
+; CHECK-NEXT:    [[IE1_LOAD8:%.*]] = load i32, ptr [[EE1_INDEX4]], align 4
+; CHECK-NEXT:    [[IE1_INSERT9:%.*]] = insertelement <3 x i32> [[IE1_INSERT7]], i32 [[IE1_LOAD8]], i32 2
+; CHECK-NEXT:    store i32 [[IE1_LOAD1]], ptr [[IE1_DYNINDEX]], align 4
+; CHECK-NEXT:    ret void
+;
+  %ee1 = extractelement <3 x i32> %v, i32 %i
+  %ie1 = insertelement <3 x i32> %v, i32 %a, i32 %i
+  ret void
+}
+
+define float @extract_float_vec_dynamic(<4 x float> %v, i32 %i) {
+; CHECK-LABEL: define float @extract_float_vec_dynamic(
+; CHECK-SAME: <4 x float> [[V:%.*]], i32 [[I:%.*]]) {
+; CHECK-NEXT:    [[EE_ALLOCA:%.*]] = alloca [4 x float], align 4
+; CHECK-NEXT:    [[EE_EXTRACT:%.*]] = extractelement <4 x float> [[V]], i64 0
+; CHECK-NEXT:    [[EE_INDEX:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store float [[EE_EXTRACT]], ptr [[EE_INDEX]], align 4
+; CHECK-NEXT:    [[EE_EXTRACT1:%.*]] = extractelement <4 x float> [[V]], i64 1
+; CHECK-NEXT:    [[EE_INDEX2:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store float [[EE_EXTRACT1]], ptr [[EE_INDEX2]], align 4
+; CHECK-NEXT:    [[EE_EXTRACT3:%.*]] = extractelement <4 x float> [[V]], i64 2
+; CHECK-NEXT:    [[EE_INDEX4:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 2
+; CHECK-NEXT:    store float [[EE_EXTRACT3]], ptr [[EE_INDEX4]], align 4
+; CHECK-NEXT:    [[EE_EXTRACT5:%.*]] = extractelement <4 x float> [[V]], i64 3
+; CHECK-NEXT:    [[EE_INDEX6:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 3
+; CHECK-NEXT:    store float [[EE_EXTRACT5]], ptr [[EE_INDEX6]], align 4
+; CHECK-NEXT:    [[EE_INDEX7:%.*]] = getelementptr inbounds [4 x float], ptr [[EE_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[EE_LOAD:%.*]] = load float, ptr [[EE_INDEX7]], align 4
+; CHECK-NEXT:    ret float [[EE_LOAD]]
+;
+  %ee = extractelement <4 x float> %v, i32 %i
+  ret float %ee
+}
+
+define <3 x i32> @insert_i32_vec_dynamic(<3 x i32> %v, i32 %a, i32 %i) {
+; CHECK-LABEL: define <3 x i32> @insert_i32_vec_dynamic(
+; CHECK-SAME: <3 x i32> [[V:%.*]], i32 [[A:%.*]], i32 [[I:%.*]]) {
+; CHECK-NEXT:    [[IE_ALLOCA:%.*]] = alloca [3 x i32], align 4
+; CHECK-NEXT:    [[IE_EXTRACT:%.*]] = extractelement <3 x i32> [[V]], i64 0
+; CHECK-NEXT:    [[IE_INDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    store i32 [[IE_EXTRACT]], ptr [[IE_INDEX]], align 4
+; CHECK-NEXT:    [[IE_EXTRACT1:%.*]] = extractelement <3 x i32> [[V]], i64 1
+; CHECK-NEXT:    [[IE_INDEX2:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[IE_EXTRACT1]], ptr [[IE_INDEX2]], align 4
+; CHECK-NEXT:    [[IE_EXTRACT3:%.*]] = extractelement <3 x i32> [[V]], i64 2
+; CHECK-NEXT:    [[IE_INDEX4:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 2
+; CHECK-NEXT:    store i32 [[IE_EXTRACT3]], ptr [[IE_INDEX4]], align 4
+; CHECK-NEXT:    [[IE_DYNINDEX:%.*]] = getelementptr inbounds [3 x i32], ptr [[IE_ALLOCA]], i32 0, i32 [[I]]
+; CHECK-NEXT:    [[IE_LOAD1:%.*]] = load i32, ptr [[IE_DYNINDEX]], align 4
+; CHECK-NEXT:    store i32 [[A]], ptr [[IE_DYNINDEX]], align 4
+; CHECK-NEXT:    [[IE_LOAD:%.*]] = load i32, ptr [[IE_INDEX]], align 4
+; CHECK-NEXT:    [[IE_INSERT:%.*]] = insertelement <3 x i32> poison, i32 [[IE_LOAD]], i32 0
+; CHECK-NEXT:    [[IE_LOAD5:%.*]] = load i32, ptr [[IE_INDEX2]], align 4
+; CHECK-NEXT:    [[IE_INSERT6:%.*]] = insertelement <3 x i32> [[IE_INSERT]], i32 [[IE_LOAD5]], i32 1
+; CHECK-NEXT:    [[IE_LOAD7:%.*]] = load i32, ptr [[IE_INDEX4]], align 4
+; CHECK-NEXT:    [[IE_INSERT8:%.*]] = insertelement <3 x i32> [[IE_INSERT6]], i32 [[IE_LOAD7]], i32 2
+; CHECK-NEXT:    store i32 [[IE_LOAD1]], ptr [[IE_DYNINDEX]], align 4
+; CHECK-NEXT:    ret <3 x i32> [[IE_INSERT8]]
+;
+  %ie = insertelement <3 x i32> %v, i32 %a, i32 %i
+  ret <3 x i32> %ie
+}
+
+; An extractelement with a constant index should not be converted to array form
+define i16 @extract_i16_vec_constant(<4 x i16> %v) {
+; CHECK-LABEL: define i16 @extract_i16_vec_constant(
+; CHECK-SAME: <4 x i16> [[V:%.*]]) {
+; CHECK-NEXT:    [[EE:%.*]] = extractelement <4 x i16> [[V]], i32 1
+; CHECK-NEXT:    ret i16 [[EE]]
+;
+  %ee = extractelement <4 x i16> %v, i32 1
+  ret i16 %ee
+}
+
+; An insertelement with a constant index should not be converted to array form
+define <2 x half> @insert_half_vec_constant(<2 x half> %v, half %a) {
+; CHECK-LABEL: define <2 x half> @insert_half_vec_constant(
+; CHECK-SAME: <2 x half> [[V:%.*]], half [[A:%.*]]) {
+; CHECK-NEXT:    [[IE:%.*]] = insertelement <2 x half> [[V]], half [[A]], i32 1
+; CHECK-NEXT:    ret <2 x half> [[IE]]
+;
+  %ie = insertelement <2 x half> %v, half %a, i32 1
+  ret <2 x half> %ie
+}
+

From 925dbc798828d78e5300972dfdabb89955216229 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Fri, 20 Jun 2025 13:18:38 -0500
Subject: [PATCH 1096/1322] [flang][OpenMP] Fix namespace nesting after
 PR144960

Newly introduced Atomic.cpp fails to compile on its own, but somehow
compiles fine in the build. Maybe it's because PCH, but it needs to be
fixed nevertheless.
---
 flang/lib/Lower/OpenMP/Atomic.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/Atomic.h b/flang/lib/Lower/OpenMP/Atomic.h
index b83773b11300..96db4d7e90c8 100644
--- a/flang/lib/Lower/OpenMP/Atomic.h
+++ b/flang/lib/Lower/OpenMP/Atomic.h
@@ -12,16 +12,16 @@ namespace Fortran {
 namespace lower {
 class AbstractConverter;
 class SymMap;
+
+namespace pft {
+struct Evaluation;
+}
 } // namespace lower
 
 namespace parser {
 struct OpenMPAtomicConstruct;
 }
 
-namespace pft {
-struct Evaluation;
-}
-
 namespace semantics {
 class SemanticsContext;
 }

From f159774352c37a75829c04febb89f141175fc2bf Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev@gmail.com>
Date: Fri, 20 Jun 2025 14:23:39 -0400
Subject: [PATCH 1097/1322] [mlir][core|ptr] Add `PtrLikeTypeInterface` and
 casting ops to the `ptr` dialect (#137469)

This patch adds the `PtrLikeTypeInterface` type interface to identify
pointer-like types. This interface is defined as:

```
A ptr-like type represents an object storing a memory address. This object
is constituted by:
- A memory address called the base pointer. This pointer is treated as a
  bag of bits without any assumed structure. The bit-width of the base
  pointer must be a compile-time constant. However, the bit-width may remain
  opaque or unavailable during transformations that do not depend on the
  base pointer. Finally, it is considered indivisible in the sense that as
  a `PtrLikeTypeInterface` value, it has no metadata.
- Optional metadata about the pointer. For example, the size of the  memory
  region associated with the pointer.

Furthermore, all ptr-like types have two properties:
- The memory space associated with the address held by the pointer.
- An optional element type. If the element type is not specified, the
  pointer is considered opaque.
```

This patch adds this interface to `!ptr.ptr` and the `memref` type.

Furthermore, this patch adds necessary ops and type to handle casting
between `!ptr.ptr` and ptr-like types.

First, it defines the `!ptr.ptr_metadata` type. An opaque type to
represent the metadata of a ptr-like type. The rationale behind adding
this type, is that at high-level the metadata of a type like `memref`
cannot be specified, as its structure is tied to its lowering.

The `ptr.get_metadata` operation was added to extract the opaque pointer
metadata. The concrete structure of the metadata is only known when the
op is lowered.

Finally, this patch adds the `ptr.from_ptr` and `ptr.to_ptr` operations.
Allowing to cast back and forth between `!ptr.ptr` and ptr-like types.

```mlir
func.func @func(%mr: memref<f32, #ptr.generic_space>) -> memref<f32, #ptr.generic_space> {
  %ptr = ptr.to_ptr %mr : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
  %mda = ptr.get_metadata %mr : memref<f32, #ptr.generic_space>
  %res = ptr.from_ptr %ptr metadata %mda : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
  return %res : memref<f32, #ptr.generic_space>
}
```

It's future work to replace and remove the `bare-ptr-convention` through
the use of these ops.

---------

Co-authored-by: Mehdi Amini <joker.eph@gmail.com>
---
 .../include/mlir/Dialect/Ptr/IR/PtrDialect.td |  50 +++++++++
 mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td    | 100 ++++++++++++++++-
 mlir/include/mlir/IR/BuiltinTypeInterfaces.td |  53 +++++++++
 mlir/include/mlir/IR/BuiltinTypes.h           |  18 ++-
 mlir/include/mlir/IR/BuiltinTypes.td          |   2 +
 mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp        |  80 +++++++++++++
 mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp          |  12 ++
 mlir/lib/IR/BuiltinTypes.cpp                  |  14 +++
 mlir/test/Dialect/Ptr/canonicalize.mlir       | 106 ++++++++++++++++++
 mlir/test/Dialect/Ptr/invalid.mlir            |  16 +++
 mlir/test/Dialect/Ptr/ops.mlir                |  10 ++
 11 files changed, 458 insertions(+), 3 deletions(-)
 create mode 100644 mlir/test/Dialect/Ptr/invalid.mlir

diff --git a/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td b/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td
index 73b2a0857cef..7407d74ce3a8 100644
--- a/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td
+++ b/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td
@@ -37,6 +37,7 @@ class Ptr_Type<string name, string typeMnemonic, list<Trait> traits = []>
 
 def Ptr_PtrType : Ptr_Type<"Ptr", "ptr", [
     MemRefElementTypeInterface,
+    PtrLikeTypeInterface,
     VectorElementTypeInterface,
     DeclareTypeInterfaceMethods<DataLayoutTypeInterface, [
       "areCompatible", "getIndexBitwidth", "verifyEntries",
@@ -63,6 +64,55 @@ def Ptr_PtrType : Ptr_Type<"Ptr", "ptr", [
       return $_get(memorySpace.getContext(), memorySpace);
     }]>
   ];
+  let extraClassDeclaration = [{
+    // `PtrLikeTypeInterface` interface methods.
+    /// Returns `Type()` as this pointer type is opaque.
+    Type getElementType() const {
+      return Type();
+    }
+    /// Clones the pointer with specified memory space or returns failure
+    /// if an `elementType` was specified or if the memory space doesn't
+    /// implement `MemorySpaceAttrInterface`.
+    FailureOr<PtrLikeTypeInterface> clonePtrWith(Attribute memorySpace,
+      std::optional<Type> elementType) const {
+      if (elementType)
+        return failure();
+      if (auto ms = dyn_cast<MemorySpaceAttrInterface>(memorySpace))
+        return cast<PtrLikeTypeInterface>(get(ms));
+      return failure();
+    }
+    /// `!ptr.ptr` types are seen as ptr-like objects with no metadata.
+    bool hasPtrMetadata() const {
+      return false;
+    }
+  }];
+}
+
+def Ptr_PtrMetadata : Ptr_Type<"PtrMetadata", "ptr_metadata"> {
+  let summary = "Pointer metadata type";
+  let description = [{
+    The `ptr_metadata` type represents an opaque-view of the metadata associated
+    with a `ptr-like` object type.
+
+    Note: It's a verification error to construct a `ptr_metadata` type using a
+    `ptr-like` type with no metadata.
+
+    Example:
+
+    ```mlir
+    // The metadata associated with a `memref` type.
+    !ptr.ptr_metadata<memref<f32>>
+    ```
+  }];
+  let parameters = (ins "PtrLikeTypeInterface":$type);
+  let assemblyFormat = "`<` $type `>`";
+  let builders = [
+    TypeBuilderWithInferredContext<(ins
+      "PtrLikeTypeInterface":$ptrLike), [{
+      return $_get(ptrLike.getContext(), ptrLike);
+    }]>
+  ];
+  let genVerifyDecl = 1;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td b/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td
index 791b95ad3559..1523762efc18 100644
--- a/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td
+++ b/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td
@@ -17,6 +17,72 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
 include "mlir/IR/OpAsmInterface.td"
 
+//===----------------------------------------------------------------------===//
+// FromPtrOp
+//===----------------------------------------------------------------------===//
+
+def Ptr_FromPtrOp : Pointer_Op<"from_ptr", [
+    Pure, OptionalTypesMatchWith<"metadata type", "result", "metadata",
+            "PtrMetadataType::get(cast<PtrLikeTypeInterface>($_self))">
+  ]> {
+  let summary = "Casts a `!ptr.ptr` value to a ptr-like value.";
+  let description = [{
+    The `from_ptr` operation casts a `ptr` value to a ptr-like object. It's
+    important to note that:
+    - The ptr-like object cannot be a `!ptr.ptr`.
+    - The memory-space of both the `ptr` and ptr-like object must match.
+    - The cast is Pure (no UB and side-effect free).
+
+    The optional `metadata` operand exists to provide any ptr-like metadata
+    that might be required to perform the cast.
+
+    Example:
+
+    ```mlir
+    %typed_ptr = ptr.from_ptr %ptr : !ptr.ptr<#ptr.generic_space> -> !my.ptr<f32, #ptr.generic_space>
+    %memref = ptr.from_ptr %ptr metadata %md : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  
+    // Cast the `%ptr` to a memref without utilizing metadata.
+    %memref = ptr.from_ptr %ptr : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+    ```
+  }];
+
+  let arguments = (ins Ptr_PtrType:$ptr, Optional<Ptr_PtrMetadata>:$metadata);
+  let results = (outs PtrLikeTypeInterface:$result);
+  let assemblyFormat = [{
+    $ptr (`metadata` $metadata^)? attr-dict `:` type($ptr) `->` type($result)
+  }];
+  let hasFolder = 1;
+  let hasVerifier = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// GetMetadataOp
+//===----------------------------------------------------------------------===//
+
+def Ptr_GetMetadataOp : Pointer_Op<"get_metadata", [
+    Pure, TypesMatchWith<"metadata type", "ptr", "result",
+            "PtrMetadataType::get(cast<PtrLikeTypeInterface>($_self))">
+  ]> {
+  let summary = "SSA value representing pointer metadata.";
+  let description = [{
+    The `get_metadata` operation produces an opaque value that encodes the
+    metadata of the ptr-like type.
+
+    Example:
+
+    ```mlir
+    %metadata = ptr.get_metadata %memref : memref<?x?xf32>
+    ```
+  }];
+
+  let arguments = (ins PtrLikeTypeInterface:$ptr);
+  let results = (outs Ptr_PtrMetadata:$result);
+  let assemblyFormat = [{
+    $ptr attr-dict `:` type($ptr)
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // PtrAddOp
 //===----------------------------------------------------------------------===//
@@ -32,8 +98,8 @@ def Ptr_PtrAddOp : Pointer_Op<"ptr_add", [
     Example:
 
     ```mlir
-    %x_off  = ptr.ptr_add %x, %off : !ptr.ptr<0>, i32
-    %x_off0 = ptr.ptr_add nusw %x, %off : !ptr.ptr<0>, i32
+    %x_off  = ptr.ptr_add %x, %off : !ptr.ptr<#ptr.generic_space>, i32
+    %x_off0 = ptr.ptr_add nusw %x, %off : !ptr.ptr<#ptr.generic_space>, i32
     ```
   }];
 
@@ -52,6 +118,36 @@ def Ptr_PtrAddOp : Pointer_Op<"ptr_add", [
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// ToPtrOp
+//===----------------------------------------------------------------------===//
+
+def Ptr_ToPtrOp : Pointer_Op<"to_ptr", [Pure]> {
+  let summary = "Casts a ptr-like value to a `!ptr.ptr` value.";
+  let description = [{
+    The `to_ptr` operation casts a ptr-like object to a `!ptr.ptr`. It's
+    important to note that:
+    - The ptr-like object cannot be a `!ptr.ptr`.
+    - The memory-space of both the `ptr` and ptr-like object must match.
+    - The cast is side-effect free.
+
+    Example:
+
+    ```mlir
+    %ptr0 = ptr.to_ptr %my_ptr : !my.ptr<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+    %ptr1 = ptr.to_ptr %memref : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+    ```
+  }];
+
+  let arguments = (ins PtrLikeTypeInterface:$ptr);
+  let results = (outs Ptr_PtrType:$result);
+  let assemblyFormat = [{
+    $ptr attr-dict `:` type($ptr) `->` type($result)
+  }];
+  let hasFolder = 1;
+  let hasVerifier = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // TypeOffsetOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td
index 4a4f818b46c5..367aeb6ac512 100644
--- a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td
+++ b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td
@@ -110,6 +110,59 @@ def MemRefElementTypeInterface : TypeInterface<"MemRefElementTypeInterface"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// PtrLikeTypeInterface
+//===----------------------------------------------------------------------===//
+
+def PtrLikeTypeInterface : TypeInterface<"PtrLikeTypeInterface"> {
+  let cppNamespace = "::mlir";
+  let description = [{
+    A ptr-like type represents an object storing a memory address. This object
+    is constituted by:
+    - A memory address called the base pointer. This pointer is treated as a
+      bag of bits without any assumed structure. The bit-width of the base
+      pointer must be a compile-time constant. However, the bit-width may remain
+      opaque or unavailable during transformations that do not depend on the
+      base pointer. Finally, it is considered indivisible in the sense that as
+      a `PtrLikeTypeInterface` value, it has no metadata.
+    - Optional metadata about the pointer. For example, the size of the  memory
+      region associated with the pointer.
+
+    Furthermore, all ptr-like types have two properties:
+    - The memory space associated with the address held by the pointer.
+    - An optional element type. If the element type is not specified, the
+      pointer is considered opaque.
+  }];
+  let methods = [
+    InterfaceMethod<[{
+      Returns the memory space of this ptr-like type.
+    }],
+    "::mlir::Attribute", "getMemorySpace">,
+    InterfaceMethod<[{
+      Returns the element type of this ptr-like type. Note: this method can
+      return `::mlir::Type()`, in which case the pointer is considered opaque.
+    }],
+    "::mlir::Type", "getElementType">,
+    InterfaceMethod<[{
+      Returns whether this ptr-like type has non-empty metadata.
+    }],
+    "bool", "hasPtrMetadata">,
+    InterfaceMethod<[{
+      Returns a clone of this type with the given memory space and element type,
+      or `failure` if the type cannot be cloned with the specified arguments.
+      If the pointer is opaque and `elementType` is not `std::nullopt` the
+      method will return `failure`.
+
+      If no `elementType` is provided and ptr is not opaque, the `elementType`
+      of this type is used.
+    }],
+    "::llvm::FailureOr<::mlir::PtrLikeTypeInterface>", "clonePtrWith", (ins
+      "::mlir::Attribute":$memorySpace,
+      "::std::optional<::mlir::Type>":$elementType
+    )>
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // ShapedType
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h
index df1e02732617..86ec5c43970b 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.h
+++ b/mlir/include/mlir/IR/BuiltinTypes.h
@@ -99,7 +99,9 @@ public:
 /// Note: This class attaches the ShapedType trait to act as a mixin to
 ///       provide many useful utility functions. This inheritance has no effect
 ///       on derived memref types.
-class BaseMemRefType : public Type, public ShapedType::Trait<BaseMemRefType> {
+class BaseMemRefType : public Type,
+                       public PtrLikeTypeInterface::Trait<BaseMemRefType>,
+                       public ShapedType::Trait<BaseMemRefType> {
 public:
   using Type::Type;
 
@@ -117,6 +119,12 @@ public:
   BaseMemRefType cloneWith(std::optional<ArrayRef<int64_t>> shape,
                            Type elementType) const;
 
+  /// Clone this type with the given memory space and element type. If the
+  /// provided element type is `std::nullopt`, the current element type of the
+  /// type is used.
+  FailureOr<PtrLikeTypeInterface>
+  clonePtrWith(Attribute memorySpace, std::optional<Type> elementType) const;
+
   // Make sure that base class overloads are visible.
   using ShapedType::Trait<BaseMemRefType>::clone;
 
@@ -141,8 +149,16 @@ public:
   /// New `Attribute getMemorySpace()` method should be used instead.
   unsigned getMemorySpaceAsInt() const;
 
+  /// Returns that this ptr-like object has non-empty ptr metadata.
+  bool hasPtrMetadata() const { return true; }
+
   /// Allow implicit conversion to ShapedType.
   operator ShapedType() const { return llvm::cast<ShapedType>(*this); }
+
+  /// Allow implicit conversion to PtrLikeTypeInterface.
+  operator PtrLikeTypeInterface() const {
+    return llvm::cast<PtrLikeTypeInterface>(*this);
+  }
 };
 
 } // namespace mlir
diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td
index d5663bcbf6a5..55d64d663f7e 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.td
+++ b/mlir/include/mlir/IR/BuiltinTypes.td
@@ -562,6 +562,7 @@ def Builtin_Integer : Builtin_Type<"Integer", "integer",
 //===----------------------------------------------------------------------===//
 
 def Builtin_MemRef : Builtin_Type<"MemRef", "memref", [
+    PtrLikeTypeInterface,
     ShapedTypeInterface
   ], "BaseMemRefType"> {
   let summary = "Shaped reference to a region of memory";
@@ -1143,6 +1144,7 @@ def Builtin_Tuple : Builtin_Type<"Tuple", "tuple"> {
 //===----------------------------------------------------------------------===//
 
 def Builtin_UnrankedMemRef : Builtin_Type<"UnrankedMemRef", "unranked_memref", [
+    PtrLikeTypeInterface,
     ShapedTypeInterface
   ], "BaseMemRefType"> {
   let summary = "Shaped reference, with unknown rank, to a region of memory";
diff --git a/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp b/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp
index c21783011452..c48814450812 100644
--- a/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp
+++ b/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp
@@ -41,6 +41,52 @@ void PtrDialect::initialize() {
       >();
 }
 
+//===----------------------------------------------------------------------===//
+// FromPtrOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult FromPtrOp::fold(FoldAdaptor adaptor) {
+  // Fold the pattern:
+  // %ptr = ptr.to_ptr %v : type -> ptr
+  // (%mda = ptr.get_metadata %v : type)?
+  // %val = ptr.from_ptr %ptr (metadata %mda)? : ptr -> type
+  // To:
+  // %val -> %v
+  Value ptrLike;
+  FromPtrOp fromPtr = *this;
+  while (fromPtr != nullptr) {
+    auto toPtr = dyn_cast_or_null<ToPtrOp>(fromPtr.getPtr().getDefiningOp());
+    // Cannot fold if it's not a `to_ptr` op or the initial and final types are
+    // different.
+    if (!toPtr || toPtr.getPtr().getType() != fromPtr.getType())
+      return ptrLike;
+    Value md = fromPtr.getMetadata();
+    // If the type has trivial metadata fold.
+    if (!fromPtr.getType().hasPtrMetadata()) {
+      ptrLike = toPtr.getPtr();
+    } else if (md) {
+      // Fold if the metadata can be verified to be equal.
+      if (auto mdOp = dyn_cast_or_null<GetMetadataOp>(md.getDefiningOp());
+          mdOp && mdOp.getPtr() == toPtr.getPtr())
+        ptrLike = toPtr.getPtr();
+    }
+    // Check for a sequence of casts.
+    fromPtr = dyn_cast_or_null<FromPtrOp>(ptrLike ? ptrLike.getDefiningOp()
+                                                  : nullptr);
+  }
+  return ptrLike;
+}
+
+LogicalResult FromPtrOp::verify() {
+  if (isa<PtrType>(getType()))
+    return emitError() << "the result type cannot be `!ptr.ptr`";
+  if (getType().getMemorySpace() != getPtr().getType().getMemorySpace()) {
+    return emitError()
+           << "expected the input and output to have the same memory space";
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // PtrAddOp
 //===----------------------------------------------------------------------===//
@@ -55,6 +101,40 @@ OpFoldResult PtrAddOp::fold(FoldAdaptor adaptor) {
   return nullptr;
 }
 
+//===----------------------------------------------------------------------===//
+// ToPtrOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult ToPtrOp::fold(FoldAdaptor adaptor) {
+  // Fold the pattern:
+  // %val = ptr.from_ptr %p (metadata ...)? : ptr -> type
+  // %ptr = ptr.to_ptr %val : type -> ptr
+  // To:
+  // %ptr -> %p
+  Value ptr;
+  ToPtrOp toPtr = *this;
+  while (toPtr != nullptr) {
+    auto fromPtr = dyn_cast_or_null<FromPtrOp>(toPtr.getPtr().getDefiningOp());
+    // Cannot fold if it's not a `from_ptr` op.
+    if (!fromPtr)
+      return ptr;
+    ptr = fromPtr.getPtr();
+    // Check for chains of casts.
+    toPtr = dyn_cast_or_null<ToPtrOp>(ptr.getDefiningOp());
+  }
+  return ptr;
+}
+
+LogicalResult ToPtrOp::verify() {
+  if (isa<PtrType>(getPtr().getType()))
+    return emitError() << "the input value cannot be of type `!ptr.ptr`";
+  if (getType().getMemorySpace() != getPtr().getType().getMemorySpace()) {
+    return emitError()
+           << "expected the input and output to have the same memory space";
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // TypeOffsetOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp b/mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp
index cab9ca11e679..7ad2a6bc4c80 100644
--- a/mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp
+++ b/mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp
@@ -151,3 +151,15 @@ LogicalResult PtrType::verifyEntries(DataLayoutEntryListRef entries,
   }
   return success();
 }
+
+//===----------------------------------------------------------------------===//
+// Pointer metadata
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+PtrMetadataType::verify(function_ref<InFlightDiagnostic()> emitError,
+                        PtrLikeTypeInterface type) {
+  if (!type.hasPtrMetadata())
+    return emitError() << "the ptr-like type has no metadata";
+  return success();
+}
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index d47e360e9dc1..97bab479c79b 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -376,6 +376,20 @@ BaseMemRefType BaseMemRefType::cloneWith(std::optional<ArrayRef<int64_t>> shape,
   return builder;
 }
 
+FailureOr<PtrLikeTypeInterface>
+BaseMemRefType::clonePtrWith(Attribute memorySpace,
+                             std::optional<Type> elementType) const {
+  Type eTy = elementType ? *elementType : getElementType();
+  if (llvm::dyn_cast<UnrankedMemRefType>(*this))
+    return cast<PtrLikeTypeInterface>(
+        UnrankedMemRefType::get(eTy, memorySpace));
+
+  MemRefType::Builder builder(llvm::cast<MemRefType>(*this));
+  builder.setElementType(eTy);
+  builder.setMemorySpace(memorySpace);
+  return cast<PtrLikeTypeInterface>(static_cast<MemRefType>(builder));
+}
+
 MemRefType BaseMemRefType::clone(::llvm::ArrayRef<int64_t> shape,
                                  Type elementType) const {
   return ::llvm::cast<MemRefType>(cloneWith(shape, elementType));
diff --git a/mlir/test/Dialect/Ptr/canonicalize.mlir b/mlir/test/Dialect/Ptr/canonicalize.mlir
index ad363d554f24..e50cd1b76caf 100644
--- a/mlir/test/Dialect/Ptr/canonicalize.mlir
+++ b/mlir/test/Dialect/Ptr/canonicalize.mlir
@@ -13,3 +13,109 @@ func.func @zero_offset(%ptr: !ptr.ptr<#ptr.generic_space>) -> !ptr.ptr<#ptr.gene
   %res0 = ptr.ptr_add %ptr, %off : !ptr.ptr<#ptr.generic_space>, index
   return %res0 : !ptr.ptr<#ptr.generic_space>
 }
+
+/// Tests the the `from_ptr` folder.
+// CHECK-LABEL: @test_from_ptr_0
+// CHECK-SAME: (%[[MEM_REF:.*]]: memref<f32, #ptr.generic_space>)
+func.func @test_from_ptr_0(%mr: memref<f32, #ptr.generic_space>) -> memref<f32, #ptr.generic_space> {
+  // CHECK-NOT: ptr.to_ptr
+  // CHECK-NOT: ptr.get_metadata
+  // CHECK-NOT: ptr.from_ptr
+  // CHECK: return %[[MEM_REF]]
+  %ptr = ptr.to_ptr %mr : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  %mda = ptr.get_metadata %mr : memref<f32, #ptr.generic_space>
+  %res = ptr.from_ptr %ptr metadata %mda : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  return %res : memref<f32, #ptr.generic_space>
+}
+
+/// Check the op doesn't fold because folding a ptr-type with metadata requires knowing the origin of the metadata.
+// CHECK-LABEL: @test_from_ptr_1
+// CHECK-SAME: (%[[MEM_REF:.*]]: memref<f32, #ptr.generic_space>)
+func.func @test_from_ptr_1(%mr: memref<f32, #ptr.generic_space>) -> memref<f32, #ptr.generic_space> {
+  // CHECK: ptr.to_ptr
+  // CHECK: ptr.from_ptr
+  %ptr = ptr.to_ptr %mr : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  %res = ptr.from_ptr %ptr : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  return %res : memref<f32, #ptr.generic_space>
+}
+
+/// Check that the ops cannot be folded because the metadata cannot be guaranteed to be the same.
+// CHECK-LABEL: @test_from_ptr_2
+func.func @test_from_ptr_2(%mr: memref<f32, #ptr.generic_space>, %md: !ptr.ptr_metadata<memref<f32, #ptr.generic_space>>) -> memref<f32, #ptr.generic_space> {
+  // CHECK: ptr.to_ptr
+  // CHECK: ptr.from_ptr
+  %ptr = ptr.to_ptr %mr : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  %res = ptr.from_ptr %ptr metadata %md : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  return %res : memref<f32, #ptr.generic_space>
+}
+
+// Check the folding of `to_ptr -> from_ptr` chains.
+// CHECK-LABEL: @test_from_ptr_3
+// CHECK-SAME: (%[[MEM_REF:.*]]: memref<f32, #ptr.generic_space>)
+func.func @test_from_ptr_3(%mr0: memref<f32, #ptr.generic_space>) -> memref<f32, #ptr.generic_space> {
+  // CHECK-NOT: ptr.to_ptr
+  // CHECK-NOT: ptr.from_ptr
+  // CHECK: return %[[MEM_REF]]
+  %mda = ptr.get_metadata %mr0 : memref<f32, #ptr.generic_space>
+  %ptr0 = ptr.to_ptr %mr0 : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  %mrf0 = ptr.from_ptr %ptr0 metadata %mda : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  %ptr1 = ptr.to_ptr %mrf0 : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  %mrf1 = ptr.from_ptr %ptr1 metadata %mda : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  return %mrf1 : memref<f32, #ptr.generic_space>
+}
+
+/// Tests the the `to_ptr` folder.
+// CHECK-LABEL: @test_to_ptr_0
+// CHECK-SAME: (%[[PTR:.*]]: !ptr.ptr<#ptr.generic_space>
+func.func @test_to_ptr_0(%ptr: !ptr.ptr<#ptr.generic_space>, %md: !ptr.ptr_metadata<memref<f32, #ptr.generic_space>>) -> !ptr.ptr<#ptr.generic_space> {
+  // CHECK: return %[[PTR]]
+  // CHECK-NOT: ptr.from_ptr
+  // CHECK-NOT: ptr.to_ptr
+  %mrf = ptr.from_ptr %ptr metadata %md : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  %res = ptr.to_ptr %mrf : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  return %res : !ptr.ptr<#ptr.generic_space>
+}
+
+// CHECK-LABEL: @test_to_ptr_1
+// CHECK-SAME: (%[[PTR:.*]]: !ptr.ptr<#ptr.generic_space>)
+func.func @test_to_ptr_1(%ptr: !ptr.ptr<#ptr.generic_space>) -> !ptr.ptr<#ptr.generic_space> {
+  // CHECK-NOT: ptr.from_ptr
+  // CHECK-NOT: ptr.to_ptr
+  // CHECK: return %[[PTR]]
+  %mrf = ptr.from_ptr %ptr : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  %res = ptr.to_ptr %mrf : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  return %res : !ptr.ptr<#ptr.generic_space>
+}
+
+// Check the folding of `from_ptr -> to_ptr` chains.
+// CHECK-LABEL: @test_to_ptr_2
+// CHECK-SAME: (%[[PTR:.*]]: !ptr.ptr<#ptr.generic_space>
+func.func @test_to_ptr_2(%ptr0: !ptr.ptr<#ptr.generic_space>) -> !ptr.ptr<#ptr.generic_space> {
+  // CHECK-NOT: ptr.from_ptr
+  // CHECK-NOT: ptr.to_ptr
+  // CHECK: return %[[PTR]]
+  %mrf0 = ptr.from_ptr %ptr0 : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  %ptr1 = ptr.to_ptr %mrf0 : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  %mrf1 = ptr.from_ptr %ptr1 : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  %ptr2 = ptr.to_ptr %mrf1 : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  %mrf2 = ptr.from_ptr %ptr2 : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  %res = ptr.to_ptr %mrf2 : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  return %res : !ptr.ptr<#ptr.generic_space>
+}
+
+// Check the folding of chains with different metadata.
+// CHECK-LABEL: @test_cast_chain_folding
+// CHECK-SAME: (%[[MEM_REF:.*]]: memref<f32, #ptr.generic_space>
+func.func @test_cast_chain_folding(%mr: memref<f32, #ptr.generic_space>, %md: !ptr.ptr_metadata<memref<f32, #ptr.generic_space>>) -> memref<f32, #ptr.generic_space> {
+  // CHECK-NOT: ptr.to_ptr
+  // CHECK-NOT: ptr.from_ptr
+  // CHECK: return %[[MEM_REF]]
+   %ptr1 = ptr.to_ptr %mr : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+   %memrefWithOtherMd = ptr.from_ptr %ptr1 metadata %md : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+   %ptr = ptr.to_ptr %memrefWithOtherMd : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+   %mda = ptr.get_metadata %mr : memref<f32, #ptr.generic_space>
+   // The chain can be folded because: the ptr always has the same value because
+   // `to_ptr` is a loss-less cast and %mda comes from the original memref.
+   %res = ptr.from_ptr %ptr metadata %mda : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+   return %res : memref<f32, #ptr.generic_space>
+}
diff --git a/mlir/test/Dialect/Ptr/invalid.mlir b/mlir/test/Dialect/Ptr/invalid.mlir
new file mode 100644
index 000000000000..19fd715e5bba
--- /dev/null
+++ b/mlir/test/Dialect/Ptr/invalid.mlir
@@ -0,0 +1,16 @@
+// RUN: mlir-opt -split-input-file -verify-diagnostics %s
+
+/// Test `to_ptr` verifiers.
+func.func @invalid_to_ptr(%v: memref<f32, 0>) {
+  // expected-error@+1 {{expected the input and output to have the same memory space}}
+  %r = ptr.to_ptr %v : memref<f32, 0> -> !ptr.ptr<#ptr.generic_space>
+  return
+}
+
+// -----
+
+func.func @invalid_to_ptr(%v: !ptr.ptr<#ptr.generic_space>) {
+  // expected-error@+1 {{the input value cannot be of type `!ptr.ptr`}}
+  %r = ptr.to_ptr %v : !ptr.ptr<#ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  return
+}
diff --git a/mlir/test/Dialect/Ptr/ops.mlir b/mlir/test/Dialect/Ptr/ops.mlir
index d763ea221944..eed3272d98da 100644
--- a/mlir/test/Dialect/Ptr/ops.mlir
+++ b/mlir/test/Dialect/Ptr/ops.mlir
@@ -17,3 +17,13 @@ func.func @ptr_add_type_offset(%ptr: !ptr.ptr<#ptr.generic_space>) -> !ptr.ptr<#
   %res3 = ptr.ptr_add inbounds %ptr, %off : !ptr.ptr<#ptr.generic_space>, index
   return %res : !ptr.ptr<#ptr.generic_space>
 }
+
+/// Check cast ops assembly.
+// CHECK-LABEL: @cast_ops
+func.func @cast_ops(%mr: memref<f32, #ptr.generic_space>) -> memref<f32, #ptr.generic_space> {
+  %ptr = ptr.to_ptr %mr : memref<f32, #ptr.generic_space> -> !ptr.ptr<#ptr.generic_space>
+  %mda = ptr.get_metadata %mr : memref<f32, #ptr.generic_space>
+  %res = ptr.from_ptr %ptr metadata %mda : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  %mr0 = ptr.from_ptr %ptr : !ptr.ptr<#ptr.generic_space> -> memref<f32, #ptr.generic_space>
+  return %res : memref<f32, #ptr.generic_space>
+}

From c103bbc836691d637c249ca19d38bdb0034437c6 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Fri, 20 Jun 2025 11:37:27 -0700
Subject: [PATCH 1098/1322] [LV] Consider whether vscale is a known power of
 two for iteration check (#144963)

Going mostly by the comment here - but it says "vscale is not
necessarily a power-of-2". Both in tree targets have vscale as a power
of two, and we have an existing TTI hook for that.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   2 +-
 .../AArch64/tail-folding-styles.ll            |  18 +--
 .../LoopVectorize/RISCV/inloop-reduction.ll   |  16 +--
 .../RISCV/type-info-cache-evl-crash.ll        |   6 +-
 ...ize-force-tail-with-evl-call-intrinsics.ll |  61 ++--------
 ...ize-force-tail-with-evl-cast-intrinsics.ll |  70 ++---------
 ...rize-force-tail-with-evl-cond-reduction.ll |  24 +---
 ...ce-tail-with-evl-fixed-order-recurrence.ll |  18 +--
 ...ze-force-tail-with-evl-inloop-reduction.ll | 112 +++++-------------
 ...ectorize-force-tail-with-evl-interleave.ll |   8 +-
 ...-force-tail-with-evl-intermediate-store.ll |  12 +-
 .../vectorize-force-tail-with-evl-iv32.ll     |   8 +-
 ...e-force-tail-with-evl-known-no-overflow.ll |  12 +-
 ...ze-force-tail-with-evl-masked-loadstore.ll |   8 +-
 ...e-force-tail-with-evl-ordered-reduction.ll |   8 +-
 ...vectorize-force-tail-with-evl-reduction.ll | 112 +++++-------------
 ...orize-force-tail-with-evl-uniform-store.ll |   6 +-
 .../RISCV/vectorize-vp-intrinsics.ll          |   8 +-
 .../RISCV/vplan-vp-select-intrinsics.ll       |  14 +--
 .../LoopVectorize/scalable-predication.ll     |  65 ++++++++++
 20 files changed, 178 insertions(+), 410 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/scalable-predication.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca1e4cbc6b29..f28c2ce0acc9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2429,7 +2429,7 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
       // check is known to be true, or known to be false.
       CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
     } // else step known to be < trip count, use CheckMinIters preset to false.
-  } else if (VF.isScalable() &&
+  } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
              Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
     // vscale is not necessarily a power-of-2, which means we cannot guarantee
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 1cdc290757dd..0326fe523e67 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -54,11 +54,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA-LABEL: @simple_memset_tailfold(
 ; DATA-NEXT:  entry:
 ; DATA-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; DATA-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; DATA-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; DATA-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; DATA-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DATA:       vector.ph:
 ; DATA-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; DATA-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -98,11 +94,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_NO_LANEMASK-LABEL: @simple_memset_tailfold(
 ; DATA_NO_LANEMASK-NEXT:  entry:
 ; DATA_NO_LANEMASK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; DATA_NO_LANEMASK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; DATA_NO_LANEMASK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; DATA_NO_LANEMASK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; DATA_NO_LANEMASK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA_NO_LANEMASK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DATA_NO_LANEMASK:       vector.ph:
 ; DATA_NO_LANEMASK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; DATA_NO_LANEMASK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -150,11 +142,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL-LABEL: @simple_memset_tailfold(
 ; DATA_AND_CONTROL-NEXT:  entry:
 ; DATA_AND_CONTROL-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; DATA_AND_CONTROL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; DATA_AND_CONTROL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; DATA_AND_CONTROL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; DATA_AND_CONTROL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA_AND_CONTROL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DATA_AND_CONTROL:       vector.ph:
 ; DATA_AND_CONTROL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; DATA_AND_CONTROL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index a8e6ef7ebfec..c063e6ff3f48 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -344,16 +344,12 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ;
 ; IF-EVL-OUTLOOP-LABEL: @smin(
 ; IF-EVL-OUTLOOP-NEXT:  entry:
-; IF-EVL-OUTLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-OUTLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-OUTLOOP:       vector.ph:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-OUTLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-OUTLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-OUTLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -401,16 +397,12 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ;
 ; IF-EVL-INLOOP-LABEL: @smin(
 ; IF-EVL-INLOOP-NEXT:  entry:
-; IF-EVL-INLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-INLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-INLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-INLOOP:       vector.ph:
 ; IF-EVL-INLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-INLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-INLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-INLOOP-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-INLOOP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-INLOOP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
index a256e92c823e..f059730a245b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
@@ -10,11 +10,7 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count)
 ; CHECK-SAME: ptr [[DSTV:%.*]], ptr [[SRC:%.*]], i64 [[WIDE_TRIP_COUNT:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 -1, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DSTV]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
index 325f3fd2b968..7a5415a61fc7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
@@ -16,12 +16,7 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -130,12 +125,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -244,12 +234,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -358,12 +343,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:    [[C3:%.*]] = ptrtoint ptr [[C]] to i64
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -472,11 +452,7 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -571,11 +547,7 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -670,12 +642,7 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 9, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -778,12 +745,7 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 9, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP22]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -886,12 +848,7 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP19]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll
index 107ca54c002f..38ed3f24ab23 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll
@@ -13,12 +13,7 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-LABEL: define void @vp_sext(
 ; IF-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 18, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = shl i64 [[N]], 3
 ; IF-EVL-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
@@ -112,12 +107,7 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-LABEL: define void @vp_zext(
 ; IF-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 18, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = shl i64 [[N]], 3
 ; IF-EVL-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
@@ -211,12 +201,7 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-LABEL: define void @vp_trunc(
 ; IF-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 18, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = shl i64 [[N]], 2
 ; IF-EVL-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
@@ -310,12 +295,7 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-LABEL: define void @vp_fpext(
 ; IF-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 14, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = shl i64 [[N]], 3
 ; IF-EVL-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
@@ -409,12 +389,7 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-LABEL: define void @vp_fptrunc(
 ; IF-EVL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 14, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = shl i64 [[N]], 2
 ; IF-EVL-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
@@ -510,12 +485,7 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
@@ -609,12 +579,7 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
@@ -708,12 +673,7 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
@@ -807,12 +767,7 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
@@ -906,12 +861,7 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
 ; IF-EVL-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B]] to i64
 ; IF-EVL-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A]] to i64
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
-; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 13, i64 [[TMP2]])
-; IF-EVL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
-; IF-EVL-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; IF-EVL:       [[VECTOR_MEMCHECK]]:
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index 8faec471cf5a..e5364136135d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -25,11 +25,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-LABEL: define i32 @cond_add(
 ; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
 ; IF-EVL-OUTLOOP-NEXT:  entry:
-; IF-EVL-OUTLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-OUTLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-OUTLOOP:       vector.ph:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -84,11 +80,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-LABEL: define i32 @cond_add(
 ; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
 ; IF-EVL-INLOOP-NEXT:  entry:
-; IF-EVL-INLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-INLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-INLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-INLOOP:       vector.ph:
 ; IF-EVL-INLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-INLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
@@ -262,11 +254,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-LABEL: define i32 @cond_add_pred(
 ; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
 ; IF-EVL-OUTLOOP-NEXT:  entry:
-; IF-EVL-OUTLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-OUTLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-OUTLOOP:       vector.ph:
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -335,11 +323,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-LABEL: define i32 @cond_add_pred(
 ; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
 ; IF-EVL-INLOOP-NEXT:  entry:
-; IF-EVL-INLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-INLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-INLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-INLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL-INLOOP:       vector.ph:
 ; IF-EVL-INLOOP-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-INLOOP-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
index f651f2295b3b..172335d0e447 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
@@ -15,11 +15,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-LABEL: define void @first_order_recurrence(
 ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[TC]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -156,11 +152,7 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-LABEL: define void @second_order_recurrence(
 ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[TC]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
@@ -318,11 +310,7 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-LABEL: define void @third_order_recurrence(
 ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
 ; IF-EVL-NEXT:  [[ENTRY:.*]]:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[TC]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IF-EVL-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; IF-EVL:       [[VECTOR_PH]]:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
index c8d52efde4ac..87ac697bf202 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
@@ -14,16 +14,12 @@
 define i32 @add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @add(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -239,16 +235,12 @@ for.end:
 define i32 @or(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @or(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -354,16 +346,12 @@ for.end:
 define i32 @and(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @and(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -469,16 +457,12 @@ for.end:
 define i32 @xor(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @xor(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -584,16 +568,12 @@ for.end:
 define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @smin(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -702,16 +682,12 @@ for.end:
 define i32 @smax(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @smax(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -820,16 +796,12 @@ for.end:
 define i32 @umin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @umin(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -938,16 +910,12 @@ for.end:
 define i32 @umax(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @umax(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1056,16 +1024,12 @@ for.end:
 define float @fadd(ptr %a, i64 %n, float %start) {
 ; IF-EVL-LABEL: @fadd(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1281,16 +1245,12 @@ for.end:
 define float @fmin(ptr %a, i64 %n, float %start) #0 {
 ; IF-EVL-LABEL: @fmin(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1401,16 +1361,12 @@ for.end:
 define float @fmax(ptr %a, i64 %n, float %start) #0 {
 ; IF-EVL-LABEL: @fmax(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1745,16 +1701,12 @@ for.end:
 define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
 ; IF-EVL-LABEL: @fmuladd(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1874,16 +1826,12 @@ for.end:
 define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-LABEL: @anyof_icmp(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1998,16 +1946,12 @@ for.end:
 define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-LABEL: @anyof_fcmp(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index 38ef88457c6c..fe6a693e8381 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -13,16 +13,12 @@
 define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-LABEL: @interleave(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
index 6d54b219a346..3e804c0ea219 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
@@ -25,11 +25,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; IF-EVL-OUTLOOP-LABEL: define void @reduction_intermediate_store(
 ; IF-EVL-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; IF-EVL-OUTLOOP-NEXT:  entry:
-; IF-EVL-OUTLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-OUTLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-OUTLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-OUTLOOP-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-OUTLOOP-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; IF-EVL-OUTLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; IF-EVL-OUTLOOP:       vector.memcheck:
 ; IF-EVL-OUTLOOP-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ADDR]], i64 4
 ; IF-EVL-OUTLOOP-NEXT:    [[TMP4:%.*]] = shl i64 [[N]], 2
@@ -89,11 +85,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
 ; IF-EVL-INLOOP-LABEL: define void @reduction_intermediate_store(
 ; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; IF-EVL-INLOOP-NEXT:  entry:
-; IF-EVL-INLOOP-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N]]
-; IF-EVL-INLOOP-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-INLOOP-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-INLOOP-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-INLOOP-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; IF-EVL-INLOOP-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; IF-EVL-INLOOP:       vector.memcheck:
 ; IF-EVL-INLOOP-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ADDR]], i64 4
 ; IF-EVL-INLOOP-NEXT:    [[TMP5:%.*]] = shl i64 [[N]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
index a05aabc063b1..3d91738d7a0d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
@@ -12,16 +12,12 @@
 define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
 ; IF-EVL-LABEL: @iv32(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP19:%.*]] = sub i32 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i32 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP19]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i32 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N:%.*]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll
index dcd15087ce3d..258b7ce0e240 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll
@@ -15,11 +15,7 @@ define void @trip_count_max_1024(ptr %p, i64 %tc) vscale_range(2, 1024) {
 ; CHECK-NEXT:    br i1 [[GUARD]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
 ; CHECK:       [[LOOP_PREHEADER]]:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TC]], i64 1)
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
@@ -90,11 +86,7 @@ define void @overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) {
 ; CHECK-NEXT:    [[GUARD:%.*]] = icmp ugt i64 [[TC]], 1024
 ; CHECK-NEXT:    br i1 [[GUARD]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
 ; CHECK:       [[LOOP_PREHEADER]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[TC]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
index 7609182c7690..63acbdfcaf2b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
@@ -12,16 +12,12 @@
 define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; IF-EVL-LABEL: @masked_loadstore(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
index bc28918dac68..a97c4b303f9f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
@@ -14,16 +14,12 @@
 define float @fadd(ptr noalias nocapture readonly %a, i64 %n) {
 ; IF-EVL-LABEL: @fadd(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
index 53e0b2f45aaa..83deebe42d38 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
@@ -12,16 +12,12 @@
 define i32 @add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @add(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -241,16 +237,12 @@ for.end:
 define i32 @or(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @or(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -359,16 +351,12 @@ for.end:
 define i32 @and(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @and(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -477,16 +465,12 @@ for.end:
 define i32 @xor(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @xor(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -595,16 +579,12 @@ for.end:
 define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @smin(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -720,16 +700,12 @@ for.end:
 define i32 @smax(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @smax(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -845,16 +821,12 @@ for.end:
 define i32 @umin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @umin(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -970,16 +942,12 @@ for.end:
 define i32 @umax(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-LABEL: @umax(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1095,16 +1063,12 @@ for.end:
 define float @fadd(ptr %a, i64 %n, float %start) {
 ; IF-EVL-LABEL: @fadd(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1324,16 +1288,12 @@ for.end:
 define float @fmin(ptr %a, i64 %n, float %start) #0 {
 ; IF-EVL-LABEL: @fmin(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1449,16 +1409,12 @@ for.end:
 define float @fmax(ptr %a, i64 %n, float %start) #0 {
 ; IF-EVL-LABEL: @fmax(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1798,16 +1754,12 @@ for.end:
 define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
 ; IF-EVL-LABEL: @fmuladd(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -1928,16 +1880,12 @@ for.end:
 define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-LABEL: @anyof_icmp(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
@@ -2052,16 +2000,12 @@ for.end:
 define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-LABEL: @anyof_fcmp(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index 81f52627b179..5c94ce180578 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -13,11 +13,7 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[CMP1_PEEL:%.*]] = icmp eq i32 [[N]], 0
 ; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP1_PEEL]], i64 2, i64 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 3, [[SPEC_SELECT]]
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 -1, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
index f8f397212e0e..7b91f10fa6a0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -12,16 +12,12 @@
 define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-LABEL: @foo(
 ; IF-EVL-NEXT:  entry:
-; IF-EVL-NEXT:    [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
-; IF-EVL-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP8]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index 69d4f96f7ba2..543c706250da 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -20,14 +20,14 @@
  ; IF-EVL-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
 
  ; IF-EVL:      ir-bb<vector.ph>:
- ; IF-EVL-NEXT:   IR   %4 = call i64 @llvm.vscale.i64()
- ; IF-EVL-NEXT:   IR   %5 = mul nuw i64 %4, 4
- ; IF-EVL-NEXT:   IR   %6 = sub i64 %5, 1
- ; IF-EVL-NEXT:   IR   %n.rnd.up = add i64 %N, %6
- ; IF-EVL-NEXT:   IR   %n.mod.vf = urem i64 %n.rnd.up, %5
+ ; IF-EVL-NEXT:   IR   %0 = call i64 @llvm.vscale.i64()
+ ; IF-EVL-NEXT:   IR   %1 = mul nuw i64 %0, 4
+ ; IF-EVL-NEXT:   IR   %2 = sub i64 %1, 1
+ ; IF-EVL-NEXT:   IR   %n.rnd.up = add i64 %N, %2
+ ; IF-EVL-NEXT:   IR   %n.mod.vf = urem i64 %n.rnd.up, %1
  ; IF-EVL-NEXT:   IR   %n.vec = sub i64 %n.rnd.up, %n.mod.vf
- ; IF-EVL-NEXT:   IR   %7 = call i64 @llvm.vscale.i64()
- ; IF-EVL-NEXT:   IR   %8 = mul nuw i64 %7, 4
+ ; IF-EVL-NEXT:   IR   %3 = call i64 @llvm.vscale.i64()
+ ; IF-EVL-NEXT:   IR   %4 = mul nuw i64 %3, 4
  ; IF-EVL-NEXT: Successor(s): vector.body
  ; IF-EVL-EMPTY:
  ; IF-EVL-NEXT: vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-predication.ll b/llvm/test/Transforms/LoopVectorize/scalable-predication.ll
new file mode 100644
index 000000000000..8e272debb299
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-predication.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-target-supports-scalable-vectors -S < %s | FileCheck %s
+
+; vscale is not guaranteed to be a power of two, so this test (which
+; deliberately doesn't correspond to an in-tree backend since those
+; *do* have vscale as power-of-two) exercises the code required for the
+; minimum iteration check in the non-power-of-two case.
+define void @foo(i32 %val, ptr dereferenceable(1024) %ptr) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 -257, [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 256, [[TMP2]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 256)
+; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[WHILE_END_LOOPEXIT:%.*]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %gep = getelementptr i32, ptr %ptr, i64 %index
+  %ld1 = load i32, ptr %gep, align 4
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, 256
+  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
+
+while.end.loopexit:                               ; preds = %while.body
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.width", i32 4}

From 8d2eea96b391c5346543eceae5c8d24efe4f4497 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Fri, 20 Jun 2025 12:06:55 -0700
Subject: [PATCH 1099/1322] [AMDGPU] gfx1250 SOPP MC tests. NFC. (#145082)

---
 llvm/test/MC/AMDGPU/gfx12_asm_sopp.s          | 71 ++++++++++---------
 .../Disassembler/AMDGPU/gfx12_dasm_sopp.txt   | 25 +++----
 2 files changed, 49 insertions(+), 47 deletions(-)

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
index 97b6e3ba22b0..a58d68cb30cb 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
@@ -1,4 +1,5 @@
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefixes=GFX12,GFX1200 %s
+// RUN: not llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefix=GFX12 %s
 
 s_wait_loadcnt 0x1234
 // GFX12: encoding: [0x34,0x12,0xc0,0xbf]
@@ -13,22 +14,22 @@ s_wait_storecnt 0xc1d1
 // GFX12: encoding: [0xd1,0xc1,0xc1,0xbf]
 
 s_wait_samplecnt 0x1234
-// GFX12: encoding: [0x34,0x12,0xc2,0xbf]
+// GFX1200: encoding: [0x34,0x12,0xc2,0xbf]
 
 s_wait_samplecnt 0xc1d1
-// GFX12: encoding: [0xd1,0xc1,0xc2,0xbf]
+// GFX1200: encoding: [0xd1,0xc1,0xc2,0xbf]
 
 s_wait_bvhcnt 0x1234
-// GFX12: encoding: [0x34,0x12,0xc3,0xbf]
+// GFX1200: encoding: [0x34,0x12,0xc3,0xbf]
 
 s_wait_bvhcnt 0xc1d1
-// GFX12: encoding: [0xd1,0xc1,0xc3,0xbf]
+// GFX1200: encoding: [0xd1,0xc1,0xc3,0xbf]
 
 s_wait_expcnt 0x1234
-// GFX12: encoding: [0x34,0x12,0xc4,0xbf]
+// GFX1200: encoding: [0x34,0x12,0xc4,0xbf]
 
 s_wait_expcnt 0xc1d1
-// GFX12: encoding: [0xd1,0xc1,0xc4,0xbf]
+// GFX1200: encoding: [0xd1,0xc1,0xc4,0xbf]
 
 s_wait_dscnt 0x1234
 // GFX12: encoding: [0x34,0x12,0xc6,0xbf]
@@ -80,73 +81,73 @@ s_barrier_wait 1
 //===----------------------------------------------------------------------===//
 
 s_waitcnt 0
-// GFX12: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 
 s_waitcnt 0x1234
-// GFX12: s_waitcnt vmcnt(4) expcnt(4) lgkmcnt(35) ; encoding: [0x34,0x12,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(4) expcnt(4) lgkmcnt(35) ; encoding: [0x34,0x12,0x89,0xbf]
 
 s_waitcnt vmcnt(0) & expcnt(0) & lgkmcnt(0)
-// GFX12: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 
 s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-// GFX12: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 
 s_waitcnt vmcnt(0), expcnt(0), lgkmcnt(0)
-// GFX12: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 
 s_waitcnt vmcnt(1)
-// GFX12: s_waitcnt vmcnt(1) ; encoding: [0xf7,0x07,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(1) ; encoding: [0xf7,0x07,0x89,0xbf]
 
 s_waitcnt vmcnt(9)
-// GFX12: s_waitcnt vmcnt(9) ; encoding: [0xf7,0x27,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(9) ; encoding: [0xf7,0x27,0x89,0xbf]
 
 s_waitcnt expcnt(2)
-// GFX12: s_waitcnt expcnt(2) ; encoding: [0xf2,0xff,0x89,0xbf]
+// GFX1200: s_waitcnt expcnt(2) ; encoding: [0xf2,0xff,0x89,0xbf]
 
 s_waitcnt lgkmcnt(3)
-// GFX12: s_waitcnt lgkmcnt(3) ; encoding: [0x37,0xfc,0x89,0xbf]
+// GFX1200: s_waitcnt lgkmcnt(3) ; encoding: [0x37,0xfc,0x89,0xbf]
 
 s_waitcnt lgkmcnt(9)
-// GFX12: s_waitcnt lgkmcnt(9) ; encoding: [0x97,0xfc,0x89,0xbf]
+// GFX1200: s_waitcnt lgkmcnt(9) ; encoding: [0x97,0xfc,0x89,0xbf]
 
 s_waitcnt vmcnt(0), expcnt(0)
-// GFX12: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0xf0,0x03,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0xf0,0x03,0x89,0xbf]
 
 s_waitcnt vmcnt(15)
-// GFX12: s_waitcnt vmcnt(15) ; encoding: [0xf7,0x3f,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(15) ; encoding: [0xf7,0x3f,0x89,0xbf]
 
 s_waitcnt vmcnt(15) expcnt(6)
-// GFX12: s_waitcnt vmcnt(15) expcnt(6) ; encoding: [0xf6,0x3f,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(15) expcnt(6) ; encoding: [0xf6,0x3f,0x89,0xbf]
 
 s_waitcnt vmcnt(15) lgkmcnt(14)
-// GFX12: s_waitcnt vmcnt(15) lgkmcnt(14) ; encoding: [0xe7,0x3c,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(15) lgkmcnt(14) ; encoding: [0xe7,0x3c,0x89,0xbf]
 
 s_waitcnt vmcnt(15) expcnt(6) lgkmcnt(14)
-// GFX12: s_waitcnt vmcnt(15) expcnt(6) lgkmcnt(14) ; encoding: [0xe6,0x3c,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(15) expcnt(6) lgkmcnt(14) ; encoding: [0xe6,0x3c,0x89,0xbf]
 
 s_waitcnt vmcnt(31)
-// GFX12: s_waitcnt vmcnt(31) ; encoding: [0xf7,0x7f,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(31) ; encoding: [0xf7,0x7f,0x89,0xbf]
 
 s_waitcnt vmcnt(31) expcnt(6)
-// GFX12: s_waitcnt vmcnt(31) expcnt(6) ; encoding: [0xf6,0x7f,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(31) expcnt(6) ; encoding: [0xf6,0x7f,0x89,0xbf]
 
 s_waitcnt vmcnt(31) lgkmcnt(14)
-// GFX12: s_waitcnt vmcnt(31) lgkmcnt(14) ; encoding: [0xe7,0x7c,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(31) lgkmcnt(14) ; encoding: [0xe7,0x7c,0x89,0xbf]
 
 s_waitcnt vmcnt(31) expcnt(6) lgkmcnt(14)
-// GFX12: s_waitcnt vmcnt(31) expcnt(6) lgkmcnt(14) ; encoding: [0xe6,0x7c,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(31) expcnt(6) lgkmcnt(14) ; encoding: [0xe6,0x7c,0x89,0xbf]
 
 s_waitcnt vmcnt(62)
-// GFX12: s_waitcnt vmcnt(62) ; encoding: [0xf7,0xfb,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(62) ; encoding: [0xf7,0xfb,0x89,0xbf]
 
 s_waitcnt vmcnt(62) expcnt(6)
-// GFX12: s_waitcnt vmcnt(62) expcnt(6) ; encoding: [0xf6,0xfb,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(62) expcnt(6) ; encoding: [0xf6,0xfb,0x89,0xbf]
 
 s_waitcnt vmcnt(62) lgkmcnt(14)
-// GFX12: s_waitcnt vmcnt(62) lgkmcnt(14) ; encoding: [0xe7,0xf8,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(62) lgkmcnt(14) ; encoding: [0xe7,0xf8,0x89,0xbf]
 
 s_waitcnt vmcnt(62) expcnt(6) lgkmcnt(14)
-// GFX12: s_waitcnt vmcnt(62) expcnt(6) lgkmcnt(14) ; encoding: [0xe6,0xf8,0x89,0xbf]
+// GFX1200: s_waitcnt vmcnt(62) expcnt(6) lgkmcnt(14) ; encoding: [0xe6,0xf8,0x89,0xbf]
 
 //===----------------------------------------------------------------------===//
 // s_sendmsg
@@ -259,13 +260,13 @@ s_cbranch_execnz 0x1234
 // GFX12: s_cbranch_execnz 4660 ; encoding: [0x34,0x12,0xa6,0xbf]
 
 s_setkill 0x0
-// GFX12: s_setkill 0 ; encoding: [0x00,0x00,0x81,0xbf]
+// GFX1200: s_setkill 0 ; encoding: [0x00,0x00,0x81,0xbf]
 
 s_setkill 0x1234
-// GFX12: s_setkill 0x1234 ; encoding: [0x34,0x12,0x81,0xbf]
+// GFX1200: s_setkill 0x1234 ; encoding: [0x34,0x12,0x81,0xbf]
 
 s_setkill 0xc1d1
-// GFX12: s_setkill 0xc1d1 ; encoding: [0xd1,0xc1,0x81,0xbf]
+// GFX1200: s_setkill 0xc1d1 ; encoding: [0xd1,0xc1,0x81,0xbf]
 
 s_sethalt 0x0
 // GFX12: s_sethalt 0 ; encoding: [0x00,0x00,0x82,0xbf]
@@ -379,7 +380,7 @@ s_ttracedata_imm 0xc1d1
 // GFX12: s_ttracedata_imm 0xc1d1 ; encoding: [0xd1,0xc1,0xbb,0xbf]
 
 s_wait_event 0x3141
-// GFX12: s_wait_event 0x3141 ; encoding: [0x41,0x31,0x8b,0xbf]
+// GFX1200: s_wait_event 0x3141 ; encoding: [0x41,0x31,0x8b,0xbf]
 
 s_wait_event 0xc1d1
-// GFX12: s_wait_event 0xc1d1 ; encoding: [0xd1,0xc1,0x8b,0xbf]
+// GFX1200: s_wait_event 0xc1d1 ; encoding: [0xd1,0xc1,0x8b,0xbf]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
index 266ebf3822d3..f8aef7267862 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
@@ -1,4 +1,5 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX12,GFX1200 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX12 %s
 
 # GFX12: s_wait_alu depctr_hold_cnt(0) depctr_sa_sdst(0) depctr_va_vdst(0) depctr_va_sdst(0) depctr_va_ssrc(0) depctr_va_vcc(0) depctr_vm_vsrc(0) ; encoding: [0x00,0x00,0x88,0xbf]
 0x00,0x00,0x88,0xbf
@@ -18,22 +19,22 @@
 # GFX12: s_wait_storecnt 0xc1d1                  ; encoding: [0xd1,0xc1,0xc1,0xbf]
 0xd1,0xc1,0xc1,0xbf
 
-# GFX12: s_wait_samplecnt 0x1234                 ; encoding: [0x34,0x12,0xc2,0xbf]
+# GFX1200: s_wait_samplecnt 0x1234                 ; encoding: [0x34,0x12,0xc2,0xbf]
 0x34,0x12,0xc2,0xbf
 
-# GFX12: s_wait_samplecnt 0xc1d1                 ; encoding: [0xd1,0xc1,0xc2,0xbf]
+# GFX1200: s_wait_samplecnt 0xc1d1                 ; encoding: [0xd1,0xc1,0xc2,0xbf]
 0xd1,0xc1,0xc2,0xbf
 
-# GFX12: s_wait_bvhcnt 0x1234                    ; encoding: [0x34,0x12,0xc3,0xbf]
+# GFX1200: s_wait_bvhcnt 0x1234                    ; encoding: [0x34,0x12,0xc3,0xbf]
 0x34,0x12,0xc3,0xbf
 
-# GFX12: s_wait_bvhcnt 0xc1d1                    ; encoding: [0xd1,0xc1,0xc3,0xbf]
+# GFX1200: s_wait_bvhcnt 0xc1d1                    ; encoding: [0xd1,0xc1,0xc3,0xbf]
 0xd1,0xc1,0xc3,0xbf
 
-# GFX12: s_wait_expcnt 0x1234                    ; encoding: [0x34,0x12,0xc4,0xbf]
+# GFX1200: s_wait_expcnt 0x1234                    ; encoding: [0x34,0x12,0xc4,0xbf]
 0x34,0x12,0xc4,0xbf
 
-# GFX12: s_wait_expcnt 0xc1d1                    ; encoding: [0xd1,0xc1,0xc4,0xbf]
+# GFX1200: s_wait_expcnt 0xc1d1                    ; encoding: [0xd1,0xc1,0xc4,0xbf]
 0xd1,0xc1,0xc4,0xbf
 
 # GFX12: s_wait_dscnt 0x1234                     ; encoding: [0x34,0x12,0xc6,0xbf]
@@ -250,13 +251,13 @@
 # GFX12: s_sethalt 0xc1d1                        ; encoding: [0xd1,0xc1,0x82,0xbf]
 0xd1,0xc1,0x82,0xbf
 
-# GFX12: s_setkill 0                             ; encoding: [0x00,0x00,0x81,0xbf]
+# GFX1200: s_setkill 0                             ; encoding: [0x00,0x00,0x81,0xbf]
 0x00,0x00,0x81,0xbf
 
-# GFX12: s_setkill 0x1234                        ; encoding: [0x34,0x12,0x81,0xbf]
+# GFX1200: s_setkill 0x1234                        ; encoding: [0x34,0x12,0x81,0xbf]
 0x34,0x12,0x81,0xbf
 
-# GFX12: s_setkill 0xc1d1                        ; encoding: [0xd1,0xc1,0x81,0xbf]
+# GFX1200: s_setkill 0xc1d1                        ; encoding: [0xd1,0xc1,0x81,0xbf]
 0xd1,0xc1,0x81,0xbf
 
 # GFX12: s_setprio 0                             ; encoding: [0x00,0x00,0xb5,0xbf]
@@ -304,8 +305,8 @@
 # GFX12: s_wakeup                                ; encoding: [0x00,0x00,0xb4,0xbf]
 0x00,0x00,0xb4,0xbf
 
-# GFX12: s_wait_event 0x3141                     ; encoding: [0x41,0x31,0x8b,0xbf]
+# GFX1200: s_wait_event 0x3141                     ; encoding: [0x41,0x31,0x8b,0xbf]
 0x41,0x31,0x8b,0xbf
 
-# GFX12: s_wait_event 0xc1d1                     ; encoding: [0xd1,0xc1,0x8b,0xbf]
+# GFX1200: s_wait_event 0xc1d1                     ; encoding: [0xd1,0xc1,0x8b,0xbf]
 0xd1,0xc1,0x8b,0xbf

From b5d5708128e99f69add50c322bfbed5f4905c23d Mon Sep 17 00:00:00 2001
From: joaosaffran <126493771+joaosaffran@users.noreply.github.com>
Date: Fri, 20 Jun 2025 12:12:02 -0700
Subject: [PATCH 1100/1322] [HLSL] Add descriptor table metadata parsing
 (#142492)

Implements descriptor table parsing from root signature metadata. This
is required to support root signatures in hlsl.
Closes: #[126640](https://github.com/llvm/llvm-project/issues/126640)

---------

Co-authored-by: joaosaffran <joao.saffran@microsoft.com>
---
 llvm/include/llvm/BinaryFormat/DXContainer.h  |   5 +
 .../BinaryFormat/DXContainerConstants.def     |  10 +
 llvm/lib/Target/DirectX/DXILRootSignature.cpp | 245 +++++++++++++++---
 llvm/lib/Target/DirectX/DXILRootSignature.h   |   1 +
 ...escriptorTable-AllValidFlagCombinations.ll | 157 +++++++++++
 ...criptorTable-AllValidFlagCombinationsV1.ll |  44 ++++
 ...tSignature-DescriptorTable-Invalid-Flag.ll |  20 ++
 ...ature-DescriptorTable-Invalid-RangeType.ll |  20 ++
 ...e-DescriptorTable-Invalid-RegisterSpace.ll |  20 ++
 .../RootSignature-DescriptorTable.ll          |  48 ++++
 .../ContainerData/RootSignature-Parameters.ll |  22 +-
 ...tSignature-RootDescriptor-Invalid-Flags.ll |   2 +-
 12 files changed, 559 insertions(+), 35 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-Flag.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-RangeType.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-RegisterSpace.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable.ll

diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index 6d625dad5853..56c9e5330867 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_BINARYFORMAT_DXCONTAINER_H
 #define LLVM_BINARYFORMAT_DXCONTAINER_H
 
+#include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
@@ -40,6 +41,8 @@ template <typename T> struct EnumEntry;
 
 namespace dxbc {
 
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
 inline Triple::EnvironmentType getShaderStage(uint32_t Kind) {
   assert(Kind <= Triple::Amplification - Triple::Pixel &&
          "Shader kind out of expected range.");
@@ -167,6 +170,8 @@ enum class RootDescriptorFlag : uint32_t {
 #define DESCRIPTOR_RANGE_FLAG(Num, Val) Val = Num,
 enum class DescriptorRangeFlag : uint32_t {
 #include "DXContainerConstants.def"
+
+  LLVM_MARK_AS_BITMASK_ENUM(DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS)
 };
 
 #define ROOT_PARAMETER(Val, Enum) Enum = Val,
diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
index 18e79e6fa65a..a281256a44ae 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
+++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
@@ -99,6 +99,16 @@ DESCRIPTOR_RANGE_FLAG(0x10000, DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS)
 #undef DESCRIPTOR_RANGE_FLAG
 #endif // DESCRIPTOR_RANGE_FLAG
 
+// DESCRIPTOR_RANGE(value, name).
+#ifdef DESCRIPTOR_RANGE
+
+DESCRIPTOR_RANGE(0, SRV)
+DESCRIPTOR_RANGE(1, UAV)
+DESCRIPTOR_RANGE(2, CBV)
+DESCRIPTOR_RANGE(3, Sampler)
+#undef DESCRIPTOR_RANGE
+#endif // DESCRIPTOR_RANGE
+
 #ifdef ROOT_PARAMETER
 
 ROOT_PARAMETER(0, DescriptorTable)
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index 88914a31f46e..1f175fd4ecd9 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -174,6 +174,93 @@ static bool parseRootDescriptors(LLVMContext *Ctx,
   return false;
 }
 
+static bool parseDescriptorRange(LLVMContext *Ctx,
+                                 mcdxbc::DescriptorTable &Table,
+                                 MDNode *RangeDescriptorNode) {
+
+  if (RangeDescriptorNode->getNumOperands() != 6)
+    return reportError(Ctx, "Invalid format for Descriptor Range");
+
+  dxbc::RTS0::v2::DescriptorRange Range;
+
+  std::optional<StringRef> ElementText =
+      extractMdStringValue(RangeDescriptorNode, 0);
+
+  if (!ElementText.has_value())
+    return reportError(Ctx, "Descriptor Range, first element is not a string.");
+
+  Range.RangeType =
+      StringSwitch<uint32_t>(*ElementText)
+          .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV))
+          .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV))
+          .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV))
+          .Case("Sampler",
+                llvm::to_underlying(dxbc::DescriptorRangeType::Sampler))
+          .Default(~0U);
+
+  if (Range.RangeType == ~0U)
+    return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText);
+
+  if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1))
+    Range.NumDescriptors = *Val;
+  else
+    return reportError(Ctx, "Invalid value for Number of Descriptor in Range");
+
+  if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2))
+    Range.BaseShaderRegister = *Val;
+  else
+    return reportError(Ctx, "Invalid value for BaseShaderRegister");
+
+  if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3))
+    Range.RegisterSpace = *Val;
+  else
+    return reportError(Ctx, "Invalid value for RegisterSpace");
+
+  if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4))
+    Range.OffsetInDescriptorsFromTableStart = *Val;
+  else
+    return reportError(Ctx,
+                       "Invalid value for OffsetInDescriptorsFromTableStart");
+
+  if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5))
+    Range.Flags = *Val;
+  else
+    return reportError(Ctx, "Invalid value for Descriptor Range Flags");
+
+  Table.Ranges.push_back(Range);
+  return false;
+}
+
+static bool parseDescriptorTable(LLVMContext *Ctx,
+                                 mcdxbc::RootSignatureDesc &RSD,
+                                 MDNode *DescriptorTableNode) {
+  const unsigned int NumOperands = DescriptorTableNode->getNumOperands();
+  if (NumOperands < 2)
+    return reportError(Ctx, "Invalid format for Descriptor Table");
+
+  dxbc::RTS0::v1::RootParameterHeader Header;
+  if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1))
+    Header.ShaderVisibility = *Val;
+  else
+    return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+  mcdxbc::DescriptorTable Table;
+  Header.ParameterType =
+      llvm::to_underlying(dxbc::RootParameterType::DescriptorTable);
+
+  for (unsigned int I = 2; I < NumOperands; I++) {
+    MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I));
+    if (Element == nullptr)
+      return reportError(Ctx, "Missing Root Element Metadata Node.");
+
+    if (parseDescriptorRange(Ctx, Table, Element))
+      return true;
+  }
+
+  RSD.ParametersContainer.addParameter(Header, Table);
+  return false;
+}
+
 static bool parseRootSignatureElement(LLVMContext *Ctx,
                                       mcdxbc::RootSignatureDesc &RSD,
                                       MDNode *Element) {
@@ -188,6 +275,7 @@ static bool parseRootSignatureElement(LLVMContext *Ctx,
           .Case("RootCBV", RootSignatureElementKind::CBV)
           .Case("RootSRV", RootSignatureElementKind::SRV)
           .Case("RootUAV", RootSignatureElementKind::UAV)
+          .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable)
           .Default(RootSignatureElementKind::Error);
 
   switch (ElementKind) {
@@ -200,6 +288,8 @@ static bool parseRootSignatureElement(LLVMContext *Ctx,
   case RootSignatureElementKind::SRV:
   case RootSignatureElementKind::UAV:
     return parseRootDescriptors(Ctx, RSD, Element, ElementKind);
+  case RootSignatureElementKind::DescriptorTable:
+    return parseDescriptorTable(Ctx, RSD, Element);
   case RootSignatureElementKind::Error:
     return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText);
   }
@@ -241,6 +331,81 @@ static bool verifyRegisterSpace(uint32_t RegisterSpace) {
 
 static bool verifyDescriptorFlag(uint32_t Flags) { return (Flags & ~0xE) == 0; }
 
+static bool verifyRangeType(uint32_t Type) {
+  switch (Type) {
+  case llvm::to_underlying(dxbc::DescriptorRangeType::CBV):
+  case llvm::to_underlying(dxbc::DescriptorRangeType::SRV):
+  case llvm::to_underlying(dxbc::DescriptorRangeType::UAV):
+  case llvm::to_underlying(dxbc::DescriptorRangeType::Sampler):
+    return true;
+  };
+
+  return false;
+}
+
+static bool verifyDescriptorRangeFlag(uint32_t Version, uint32_t Type,
+                                      uint32_t FlagsVal) {
+  using FlagT = dxbc::DescriptorRangeFlag;
+  FlagT Flags = FlagT(FlagsVal);
+
+  const bool IsSampler =
+      (Type == llvm::to_underlying(dxbc::DescriptorRangeType::Sampler));
+
+  if (Version == 1) {
+    // Since the metadata is unversioned, we expect to explicitly see the values
+    // that map to the version 1 behaviour here.
+    if (IsSampler)
+      return Flags == FlagT::DESCRIPTORS_VOLATILE;
+    return Flags == (FlagT::DATA_VOLATILE | FlagT::DESCRIPTORS_VOLATILE);
+  }
+
+  // The data-specific flags are mutually exclusive.
+  FlagT DataFlags = FlagT::DATA_VOLATILE | FlagT::DATA_STATIC |
+                    FlagT::DATA_STATIC_WHILE_SET_AT_EXECUTE;
+
+  if (popcount(llvm::to_underlying(Flags & DataFlags)) > 1)
+    return false;
+
+  // The descriptor-specific flags are mutually exclusive.
+  FlagT DescriptorFlags =
+      FlagT::DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS |
+      FlagT::DESCRIPTORS_VOLATILE;
+  if (popcount(llvm::to_underlying(Flags & DescriptorFlags)) > 1)
+    return false;
+
+  // For volatile descriptors, DATA_STATIC is never valid.
+  if ((Flags & FlagT::DESCRIPTORS_VOLATILE) == FlagT::DESCRIPTORS_VOLATILE) {
+    FlagT Mask = FlagT::DESCRIPTORS_VOLATILE;
+    if (!IsSampler) {
+      Mask |= FlagT::DATA_VOLATILE;
+      Mask |= FlagT::DATA_STATIC_WHILE_SET_AT_EXECUTE;
+    }
+    return (Flags & ~Mask) == FlagT::NONE;
+  }
+
+  // For "STATIC_KEEPING_BUFFER_BOUNDS_CHECKS" descriptors,
+  // the other data-specific flags may all be set.
+  if ((Flags & FlagT::DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS) ==
+      FlagT::DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS) {
+    FlagT Mask = FlagT::DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS;
+    if (!IsSampler) {
+      Mask |= FlagT::DATA_VOLATILE;
+      Mask |= FlagT::DATA_STATIC;
+      Mask |= FlagT::DATA_STATIC_WHILE_SET_AT_EXECUTE;
+    }
+    return (Flags & ~Mask) == FlagT::NONE;
+  }
+
+  // When no descriptor flag is set, any data flag is allowed.
+  FlagT Mask = FlagT::NONE;
+  if (!IsSampler) {
+    Mask |= FlagT::DATA_VOLATILE;
+    Mask |= FlagT::DATA_STATIC;
+    Mask |= FlagT::DATA_STATIC_WHILE_SET_AT_EXECUTE;
+  }
+  return (Flags & ~Mask) == FlagT::NONE;
+}
+
 static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) {
 
   if (!verifyVersion(RSD.Version)) {
@@ -275,7 +440,23 @@ static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) {
 
       if (RSD.Version > 1) {
         if (!verifyDescriptorFlag(Descriptor.Flags))
-          return reportValueError(Ctx, "DescriptorFlag", Descriptor.Flags);
+          return reportValueError(Ctx, "DescriptorRangeFlag", Descriptor.Flags);
+      }
+      break;
+    }
+    case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
+      const mcdxbc::DescriptorTable &Table =
+          RSD.ParametersContainer.getDescriptorTable(Info.Location);
+      for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) {
+        if (!verifyRangeType(Range.RangeType))
+          return reportValueError(Ctx, "RangeType", Range.RangeType);
+
+        if (!verifyRegisterSpace(Range.RegisterSpace))
+          return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace);
+
+        if (!verifyDescriptorRangeFlag(RSD.Version, Range.RangeType,
+                                       Range.Flags))
+          return reportValueError(Ctx, "DescriptorFlag", Range.Flags);
       }
       break;
     }
@@ -388,43 +569,33 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M,
 
   OS << "Root Signature Definitions"
      << "\n";
-  uint8_t Space = 0;
   for (const Function &F : M) {
     auto It = RSDMap.find(&F);
     if (It == RSDMap.end())
       continue;
     const auto &RS = It->second;
     OS << "Definition for '" << F.getName() << "':\n";
-
     // start root signature header
-    Space++;
-    OS << indent(Space) << "Flags: " << format_hex(RS.Flags, 8) << "\n";
-    OS << indent(Space) << "Version: " << RS.Version << "\n";
-    OS << indent(Space) << "RootParametersOffset: " << RS.RootParameterOffset
-       << "\n";
-    OS << indent(Space) << "NumParameters: " << RS.ParametersContainer.size()
-       << "\n";
-    Space++;
+    OS << "Flags: " << format_hex(RS.Flags, 8) << "\n"
+       << "Version: " << RS.Version << "\n"
+       << "RootParametersOffset: " << RS.RootParameterOffset << "\n"
+       << "NumParameters: " << RS.ParametersContainer.size() << "\n";
     for (size_t I = 0; I < RS.ParametersContainer.size(); I++) {
       const auto &[Type, Loc] =
           RS.ParametersContainer.getTypeAndLocForParameter(I);
       const dxbc::RTS0::v1::RootParameterHeader Header =
           RS.ParametersContainer.getHeader(I);
 
-      OS << indent(Space) << "- Parameter Type: " << Type << "\n";
-      OS << indent(Space + 2)
-         << "Shader Visibility: " << Header.ShaderVisibility << "\n";
+      OS << "- Parameter Type: " << Type << "\n"
+         << "  Shader Visibility: " << Header.ShaderVisibility << "\n";
 
       switch (Type) {
       case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): {
         const dxbc::RTS0::v1::RootConstants &Constants =
             RS.ParametersContainer.getConstant(Loc);
-        OS << indent(Space + 2) << "Register Space: " << Constants.RegisterSpace
-           << "\n";
-        OS << indent(Space + 2)
-           << "Shader Register: " << Constants.ShaderRegister << "\n";
-        OS << indent(Space + 2)
-           << "Num 32 Bit Values: " << Constants.Num32BitValues << "\n";
+        OS << "  Register Space: " << Constants.RegisterSpace << "\n"
+           << "  Shader Register: " << Constants.ShaderRegister << "\n"
+           << "  Num 32 Bit Values: " << Constants.Num32BitValues << "\n";
         break;
       }
       case llvm::to_underlying(dxbc::RootParameterType::CBV):
@@ -432,23 +603,33 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M,
       case llvm::to_underlying(dxbc::RootParameterType::SRV): {
         const dxbc::RTS0::v2::RootDescriptor &Descriptor =
             RS.ParametersContainer.getRootDescriptor(Loc);
-        OS << indent(Space + 2)
-           << "Register Space: " << Descriptor.RegisterSpace << "\n";
-        OS << indent(Space + 2)
-           << "Shader Register: " << Descriptor.ShaderRegister << "\n";
+        OS << "  Register Space: " << Descriptor.RegisterSpace << "\n"
+           << "  Shader Register: " << Descriptor.ShaderRegister << "\n";
         if (RS.Version > 1)
-          OS << indent(Space + 2) << "Flags: " << Descriptor.Flags << "\n";
+          OS << "  Flags: " << Descriptor.Flags << "\n";
+        break;
+      }
+      case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): {
+        const mcdxbc::DescriptorTable &Table =
+            RS.ParametersContainer.getDescriptorTable(Loc);
+        OS << "  NumRanges: " << Table.Ranges.size() << "\n";
+
+        for (const dxbc::RTS0::v2::DescriptorRange Range : Table) {
+          OS << "  - Range Type: " << Range.RangeType << "\n"
+             << "    Register Space: " << Range.RegisterSpace << "\n"
+             << "    Base Shader Register: " << Range.BaseShaderRegister << "\n"
+             << "    Num Descriptors: " << Range.NumDescriptors << "\n"
+             << "    Offset In Descriptors From Table Start: "
+             << Range.OffsetInDescriptorsFromTableStart << "\n";
+          if (RS.Version > 1)
+            OS << "    Flags: " << Range.Flags << "\n";
+        }
         break;
       }
       }
-      Space--;
     }
-    OS << indent(Space) << "NumStaticSamplers: " << 0 << "\n";
-    OS << indent(Space) << "StaticSamplersOffset: " << RS.StaticSamplersOffset
-       << "\n";
-
-    Space--;
-    // end root signature header
+    OS << "NumStaticSamplers: " << 0 << "\n";
+    OS << "StaticSamplersOffset: " << RS.StaticSamplersOffset << "\n";
   }
   return PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index 3f25551b2b5e..b45cebc15fd3 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -31,6 +31,7 @@ enum class RootSignatureElementKind {
   SRV = 3,
   UAV = 4,
   CBV = 5,
+  DescriptorTable = 6,
 };
 class RootSignatureAnalysis : public AnalysisInfoMixin<RootSignatureAnalysis> {
   friend AnalysisInfoMixin<RootSignatureAnalysis>;
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll
new file mode 100644
index 000000000000..8eb7f90c6b75
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll
@@ -0,0 +1,157 @@
+; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"DescriptorTable", i32 0, !6, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20 }
+
+; typedef enum D3D12_DESCRIPTOR_RANGE_FLAGS {
+;   NONE = 0,
+;   DESCRIPTORS_VOLATILE = 0x1,
+;   DATA_VOLATILE = 0x2,
+;   DATA_STATIC_WHILE_SET_AT_EXECUTE = 0x4,
+;   DATA_STATIC = 0x8,
+;   DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS = 0x10000
+; } ;
+
+;0
+!6 = !{ !"Sampler", i32 1, i32 0, i32 1, i32 -1, i32 0 }
+;DESCRIPTORS_VOLATILE
+!8 = !{ !"Sampler", i32 1, i32 0, i32 3, i32 -1, i32 1 }
+;DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS
+!9 = !{ !"Sampler", i32 1, i32 0, i32 4, i32 -1, i32 65536 }
+;0
+!10 = !{ !"SRV", i32 1, i32 0, i32 5, i32 -1, i32 1 }
+;DESCRIPTORS_VOLATILE
+!11 = !{ !"UAV", i32 5, i32 1, i32 6, i32 5, i32 1 }
+;DATA_VOLATILE
+!12 = !{ !"CBV", i32 5, i32 1, i32 7, i32 5, i32 2 }
+;DATA_STATIC
+!13 = !{ !"SRV", i32 5, i32 1, i32 8, i32 5, i32 8 }
+;DATA_STATIC_WHILE_SET_AT_EXECUTE
+!14 = !{ !"UAV", i32 5, i32 1, i32 9, i32 5, i32 4 }
+;DESCRIPTORS_VOLATILE | DATA_VOLATILE
+!15 = !{ !"CBV", i32 5, i32 1, i32 10, i32 5, i32 3 }
+;DESCRIPTORS_VOLATILE | DATA_STATIC_WHILE_SET_AT_EXECUTE
+!16 = !{ !"SRV", i32 5, i32 1, i32 11, i32 5, i32 5 }
+;DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS
+!17 = !{ !"UAV", i32 5, i32 1, i32 12, i32 5, i32 65536 }
+;DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS | DATA_VOLATILE
+!18 = !{ !"CBV", i32 5, i32 1, i32 13, i32 5, i32 65538 }
+;DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS | DATA_STATIC
+!19 = !{ !"SRV", i32 5, i32 1, i32 14, i32 5, i32 65544 }
+;DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS | DATA_STATIC_WHILE_SET_AT_EXECUTE
+!20 = !{ !"UAV", i32 5, i32 1, i32 15, i32 5, i32 65540 }
+
+;DXC:- Name:            RTS0
+;DXC-NEXT:    Size:            380
+;DXC-NEXT:    RootSignature:
+;DXC-NEXT:      Version:         2
+;DXC-NEXT:      NumRootParameters: 1
+;DXC-NEXT:      RootParametersOffset: 24
+;DXC-NEXT:      NumStaticSamplers: 0
+;DXC-NEXT:      StaticSamplersOffset: 0
+;DXC-NEXT:      Parameters:
+;DXC-NEXT:        - ParameterType:   0
+;DXC-NEXT:          ShaderVisibility: 0
+;DXC-NEXT:          Table:
+;DXC-NEXT:            NumRanges:       14
+;DXC-NEXT:            RangesOffset:    44
+;DXC-NEXT:            Ranges:
+;DXC-NEXT:              - RangeType:       3
+;DXC-NEXT:                NumDescriptors:  1
+;DXC-NEXT:                BaseShaderRegister: 0
+;DXC-NEXT:                RegisterSpace:   1
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 4294967295
+;DXC-NEXT:              - RangeType:       3
+;DXC-NEXT:                NumDescriptors:  1
+;DXC-NEXT:                BaseShaderRegister: 0
+;DXC-NEXT:                RegisterSpace:   3
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 4294967295
+;DXC-NEXT:                DESCRIPTORS_VOLATILE: true
+;DXC-NEXT:              - RangeType:       3
+;DXC-NEXT:                NumDescriptors:  1
+;DXC-NEXT:                BaseShaderRegister: 0
+;DXC-NEXT:                RegisterSpace:   4
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 4294967295
+;DXC-NEXT:                DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true
+;DXC-NEXT:              - RangeType:       0
+;DXC-NEXT:                NumDescriptors:  1
+;DXC-NEXT:                BaseShaderRegister: 0
+;DXC-NEXT:                RegisterSpace:   5
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 4294967295
+;DXC-NEXT:                DESCRIPTORS_VOLATILE: true
+;DXC-NEXT:              - RangeType:       1
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   6
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DESCRIPTORS_VOLATILE: true
+;DXC-NEXT:              - RangeType:       2
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   7
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DATA_VOLATILE:   true
+;DXC-NEXT:              - RangeType:       0
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   8
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DATA_STATIC:     true
+;DXC-NEXT:              - RangeType:       1
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   9
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DATA_STATIC_WHILE_SET_AT_EXECUTE: true
+;DXC-NEXT:              - RangeType:       2
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   10
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DESCRIPTORS_VOLATILE: true
+;DXC-NEXT:                DATA_VOLATILE:   true
+;DXC-NEXT:              - RangeType:       0
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   11
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DESCRIPTORS_VOLATILE: true
+;DXC-NEXT:                DATA_STATIC_WHILE_SET_AT_EXECUTE: true
+;DXC-NEXT:              - RangeType:       1
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   12
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true
+;DXC-NEXT:              - RangeType:       2
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   13
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DATA_VOLATILE:   true
+;DXC-NEXT:                DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true
+;DXC-NEXT:              - RangeType:       0
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   14
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DATA_STATIC:     true
+;DXC-NEXT:                DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true
+;DXC-NEXT:              - RangeType:       1
+;DXC-NEXT:                NumDescriptors:  5
+;DXC-NEXT:                BaseShaderRegister: 1
+;DXC-NEXT:                RegisterSpace:   15
+;DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+;DXC-NEXT:                DATA_STATIC_WHILE_SET_AT_EXECUTE: true
+;DXC-NEXT:                DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll
new file mode 100644
index 000000000000..9d89dbdd9107
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll
@@ -0,0 +1,44 @@
+; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 1 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"DescriptorTable", i32 0, !6, !7 }
+!6 = !{ !"Sampler", i32 0, i32 1, i32 0, i32 -1, i32 1 }
+!7 = !{ !"UAV", i32 5, i32 1, i32 10, i32 5, i32 3 }
+
+
+; DXC:        - Name:            RTS0
+; DXC-NEXT:     Size:            84
+; DXC-NEXT:     RootSignature:
+; DXC-NEXT:       Version:         1
+; DXC-NEXT:       NumRootParameters: 1
+; DXC-NEXT:       RootParametersOffset: 24
+; DXC-NEXT:       NumStaticSamplers: 0
+; DXC-NEXT:       StaticSamplersOffset: 0
+; DXC-NEXT:       Parameters:
+; DXC-NEXT:         - ParameterType:   0
+; DXC-NEXT:           ShaderVisibility: 0
+; DXC-NEXT:           Table:
+; DXC-NEXT:             NumRanges:       2
+; DXC-NEXT:             RangesOffset:    44
+; DXC-NEXT:             Ranges:
+; DXC-NEXT:               - RangeType:       3
+; DXC-NEXT:                 NumDescriptors:  0
+; DXC-NEXT:                 BaseShaderRegister: 1
+; DXC-NEXT:                 RegisterSpace:   0
+; DXC-NEXT:                 OffsetInDescriptorsFromTableStart: 4294967295
+; DXC-NEXT:               - RangeType:       1
+; DXC-NEXT:                 NumDescriptors:  5
+; DXC-NEXT:                 BaseShaderRegister: 1
+; DXC-NEXT:                 RegisterSpace:   10
+; DXC-NEXT:                 OffsetInDescriptorsFromTableStart: 5
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-Flag.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-Flag.ll
new file mode 100644
index 000000000000..41101c1f1fe8
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-Flag.ll
@@ -0,0 +1,20 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: error: Invalid value for DescriptorFlag: 22
+; CHECK-NOT: Root Signature Definitions
+
+define void @main() #0 {
+entry:
+  ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"DescriptorTable", i32 0, !6, !7 }
+!6 = !{ !"SRV", i32 0, i32 1, i32 0, i32 -1, i32 22 }
+!7 = !{ !"UAV", i32 5, i32 1, i32 10, i32 5, i32 2 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-RangeType.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-RangeType.ll
new file mode 100644
index 000000000000..b7e99ae7cd27
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-RangeType.ll
@@ -0,0 +1,20 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: error: Invalid Descriptor Range type: Invalid 
+; CHECK-NOT: Root Signature Definitions
+
+define void @main() #0 {
+entry:
+  ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"DescriptorTable", i32 0, !6, !7 }
+!6 = !{ !"Invalid", i32 0, i32 0, i32 -1, i32 -1, i32 4 }
+!7 = !{ !"UAV", i32 5, i32 1, i32 10, i32 5, i32 2 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-RegisterSpace.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-RegisterSpace.ll
new file mode 100644
index 000000000000..4cef5d86a980
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-Invalid-RegisterSpace.ll
@@ -0,0 +1,20 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: error: Invalid value for RegisterSpace: 4294967280
+; CHECK-NOT: Root Signature Definitions
+
+define void @main() #0 {
+entry:
+  ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"DescriptorTable", i32 0, !6, !7 }
+!6 = !{ !"SRV", i32 0, i32 0, i32 10, i32 -1, i32 4 }
+!7 = !{ !"UAV", i32 5, i32 1, i32 4294967280, i32 5, i32 2 }
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable.ll
new file mode 100644
index 000000000000..b516d6618024
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable.ll
@@ -0,0 +1,48 @@
+; RUN: opt %s -dxil-embed -dxil-globals -S -o - | FileCheck %s
+; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: @dx.rts0 = private constant [92 x i8]  c"{{.*}}", section "RTS0", align 4
+
+define void @main() #0 {
+entry:
+  ret void
+}
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+
+!dx.rootsignatures = !{!2} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3, i32 2 } ; function, root signature
+!3 = !{ !5 } ; list of root signature elements
+!5 = !{ !"DescriptorTable", i32 0, !6, !7 }
+!6 = !{ !"SRV", i32 0, i32 1, i32 0, i32 -1, i32 4 }
+!7 = !{ !"UAV", i32 5, i32 1, i32 10, i32 5, i32 2 }
+
+; DXC:  - Name:            RTS0
+; DXC-NEXT:    Size:            92
+; DXC-NEXT:    RootSignature:
+; DXC-NEXT:      Version:         2
+; DXC-NEXT:      NumRootParameters: 1 
+; DXC-NEXT:      RootParametersOffset: 24 
+; DXC-NEXT:      NumStaticSamplers: 0
+; DXC-NEXT:      StaticSamplersOffset: 0
+; DXC-NEXT:      Parameters:
+; DXC-NEXT:        - ParameterType:   0
+; DXC-NEXT:          ShaderVisibility: 0
+; DXC-NEXT:          Table:
+; DXC-NEXT:            NumRanges:       2
+; DXC-NEXT:            RangesOffset:    44
+; DXC-NEXT:            Ranges:
+; DXC-NEXT:              - RangeType:       0
+; DXC-NEXT:                NumDescriptors:  0
+; DXC-NEXT:                BaseShaderRegister: 1
+; DXC-NEXT:                RegisterSpace:   0
+; DXC-NEXT:                OffsetInDescriptorsFromTableStart: 4294967295
+; DXC-NEXT:                DATA_STATIC_WHILE_SET_AT_EXECUTE:   true
+; DXC-NEXT:              - RangeType:       1
+; DXC-NEXT:                NumDescriptors:  5
+; DXC-NEXT:                BaseShaderRegister: 1
+; DXC-NEXT:                RegisterSpace:   10
+; DXC-NEXT:                OffsetInDescriptorsFromTableStart: 5
+; DXC-NEXT:                DATA_VOLATILE: true
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters.ll
index 80aa757d7e10..d0a58bc34ffa 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Parameters.ll
@@ -12,16 +12,19 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 
 !dx.rootsignatures = !{!2} ; list of function/root signature pairs
 !2 = !{ ptr @main, !3, i32 2 } ; function, root signature
-!3 = !{ !4, !5, !6 } ; list of root signature elements
+!3 = !{ !4, !5, !6, !7 } ; list of root signature elements
 !4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
 !5 = !{ !"RootConstants", i32 0, i32 1, i32 2, i32 3 }
 !6 = !{ !"RootSRV", i32 1, i32 4, i32 5, i32 6 }
+!7 = !{ !"DescriptorTable", i32 0, !8, !9 }
+!8 = !{ !"SRV", i32 0, i32 1, i32 0, i32 -1, i32 4 }
+!9 = !{ !"UAV", i32 5, i32 1, i32 10, i32 5, i32 2 }
 
 ;CHECK-LABEL: Definition for 'main':
 ;CHECK-NEXT:  Flags: 0x000001
 ;CHECK-NEXT:  Version: 2
 ;CHECK-NEXT:  RootParametersOffset: 24
-;CHECK-NEXT:  NumParameters: 2
+;CHECK-NEXT:  NumParameters: 3
 ;CHECK-NEXT:   - Parameter Type: 1
 ;CHECK-NEXT:     Shader Visibility: 0
 ;CHECK-NEXT:     Register Space: 2
@@ -32,5 +35,20 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 ;CHECK-NEXT:     Register Space: 5
 ;CHECK-NEXT:     Shader Register: 4
 ;CHECK-NEXT:     Flags: 6
+;CHECK-NEXT:   - Parameter Type: 0
+;CHECK-NEXT:     Shader Visibility: 0
+;CHECK-NEXT:     NumRanges: 2
+;CHECK-NEXT:     - Range Type: 0
+;CHECK-NEXT:       Register Space: 0
+;CHECK-NEXT:       Base Shader Register: 1
+;CHECK-NEXT:       Num Descriptors: 0
+;CHECK-NEXT:       Offset In Descriptors From Table Start: 4294967295
+;CHECK-NEXT:       Flags: 4
+;CHECK-NEXT:     - Range Type: 1
+;CHECK-NEXT:       Register Space: 10
+;CHECK-NEXT:       Base Shader Register: 1
+;CHECK-NEXT:       Num Descriptors: 5
+;CHECK-NEXT:       Offset In Descriptors From Table Start: 5
+;CHECK-NEXT:       Flags: 2
 ;CHECK-NEXT:  NumStaticSamplers: 0
 ;CHECK-NEXT:  StaticSamplersOffset: 0
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags.ll
index 6c90bcb09b64..7ee04710be0a 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor-Invalid-Flags.ll
@@ -3,7 +3,7 @@
 target triple = "dxil-unknown-shadermodel6.0-compute"
 
 
-; CHECK: error: Invalid value for DescriptorFlag: 3
+; CHECK: error: Invalid value for DescriptorRangeFlag: 3
 ; CHECK-NOT: Root Signature Definitions
 define void @main() #0 {
 entry:

From 958dc8602651261f8285b59d352a1c4b4da2e90c Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Fri, 20 Jun 2025 12:21:45 -0700
Subject: [PATCH 1101/1322] [AMDGPU] Don't insert wait instructions that are
 not supported by gfx1250 (#145084)

No tests yet, but it will allow further tests not to be
polluted with these waits.
---
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index d2de494a23ef..3212060f303a 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2272,8 +2272,10 @@ bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
   const DebugLoc &DL = MI->getDebugLoc();
 
   BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
-  BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
-  BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
+  if (ST.hasImageInsts()) {
+    BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
+    BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
+  }
   BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
   BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
 

From 2a4207e7322c90176814b17870051f9692f9994f Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Fri, 20 Jun 2025 15:23:20 -0400
Subject: [PATCH 1102/1322] [DirectX] Don't limit visitGetElementPtrInst to
 global ptrs (#144959)

fixes #144608
- there is a getPointerOperandIndex function so we don't need to iterate
the operands trying to find the pointer. This resulted in a small
cleanup to visitStoreInst and visitLoadInst.

- The meat of this change was in visitGetElementPtrInst to account for
allocas and not bail when we don't find a global.
---
 .../Target/DirectX/DXILDataScalarization.cpp  | 102 +++++++++---------
 llvm/test/CodeGen/DirectX/scalarize-alloca.ll |  19 +++-
 2 files changed, 71 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index 286fd0c5bb97..0a9b2bb99f7e 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -14,11 +14,13 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/ReplaceConstant.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 
@@ -137,49 +139,42 @@ bool DataScalarizerVisitor::visitAllocaInst(AllocaInst &AI) {
 }
 
 bool DataScalarizerVisitor::visitLoadInst(LoadInst &LI) {
-  unsigned NumOperands = LI.getNumOperands();
-  for (unsigned I = 0; I < NumOperands; ++I) {
-    Value *CurrOpperand = LI.getOperand(I);
-    ConstantExpr *CE = dyn_cast<ConstantExpr>(CurrOpperand);
-    if (CE && CE->getOpcode() == Instruction::GetElementPtr) {
-      GetElementPtrInst *OldGEP =
-          cast<GetElementPtrInst>(CE->getAsInstruction());
-      OldGEP->insertBefore(LI.getIterator());
-      IRBuilder<> Builder(&LI);
-      LoadInst *NewLoad =
-          Builder.CreateLoad(LI.getType(), OldGEP, LI.getName());
-      NewLoad->setAlignment(LI.getAlign());
-      LI.replaceAllUsesWith(NewLoad);
-      LI.eraseFromParent();
-      visitGetElementPtrInst(*OldGEP);
-      return true;
-    }
-    if (GlobalVariable *NewGlobal = lookupReplacementGlobal(CurrOpperand))
-      LI.setOperand(I, NewGlobal);
+  Value *PtrOperand = LI.getPointerOperand();
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(PtrOperand);
+  if (CE && CE->getOpcode() == Instruction::GetElementPtr) {
+    GetElementPtrInst *OldGEP = cast<GetElementPtrInst>(CE->getAsInstruction());
+    OldGEP->insertBefore(LI.getIterator());
+    IRBuilder<> Builder(&LI);
+    LoadInst *NewLoad = Builder.CreateLoad(LI.getType(), OldGEP, LI.getName());
+    NewLoad->setAlignment(LI.getAlign());
+    LI.replaceAllUsesWith(NewLoad);
+    LI.eraseFromParent();
+    visitGetElementPtrInst(*OldGEP);
+    return true;
   }
+  if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand))
+    LI.setOperand(LI.getPointerOperandIndex(), NewGlobal);
   return false;
 }
 
 bool DataScalarizerVisitor::visitStoreInst(StoreInst &SI) {
-  unsigned NumOperands = SI.getNumOperands();
-  for (unsigned I = 0; I < NumOperands; ++I) {
-    Value *CurrOpperand = SI.getOperand(I);
-    ConstantExpr *CE = dyn_cast<ConstantExpr>(CurrOpperand);
-    if (CE && CE->getOpcode() == Instruction::GetElementPtr) {
-      GetElementPtrInst *OldGEP =
-          cast<GetElementPtrInst>(CE->getAsInstruction());
-      OldGEP->insertBefore(SI.getIterator());
-      IRBuilder<> Builder(&SI);
-      StoreInst *NewStore = Builder.CreateStore(SI.getValueOperand(), OldGEP);
-      NewStore->setAlignment(SI.getAlign());
-      SI.replaceAllUsesWith(NewStore);
-      SI.eraseFromParent();
-      visitGetElementPtrInst(*OldGEP);
-      return true;
-    }
-    if (GlobalVariable *NewGlobal = lookupReplacementGlobal(CurrOpperand))
-      SI.setOperand(I, NewGlobal);
+
+  Value *PtrOperand = SI.getPointerOperand();
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(PtrOperand);
+  if (CE && CE->getOpcode() == Instruction::GetElementPtr) {
+    GetElementPtrInst *OldGEP = cast<GetElementPtrInst>(CE->getAsInstruction());
+    OldGEP->insertBefore(SI.getIterator());
+    IRBuilder<> Builder(&SI);
+    StoreInst *NewStore = Builder.CreateStore(SI.getValueOperand(), OldGEP);
+    NewStore->setAlignment(SI.getAlign());
+    SI.replaceAllUsesWith(NewStore);
+    SI.eraseFromParent();
+    visitGetElementPtrInst(*OldGEP);
+    return true;
   }
+  if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand))
+    SI.setOperand(SI.getPointerOperandIndex(), NewGlobal);
+
   return false;
 }
 
@@ -302,24 +297,35 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
 }
 
 bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+  Value *PtrOperand = GEPI.getPointerOperand();
+  Type *OrigGEPType = GEPI.getPointerOperandType();
+  Type *NewGEPType = OrigGEPType;
+  bool NeedsTransform = false;
 
-  unsigned NumOperands = GEPI.getNumOperands();
-  GlobalVariable *NewGlobal = nullptr;
-  for (unsigned I = 0; I < NumOperands; ++I) {
-    Value *CurrOpperand = GEPI.getOperand(I);
-    NewGlobal = lookupReplacementGlobal(CurrOpperand);
-    if (NewGlobal)
-      break;
+  if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) {
+    NewGEPType = NewGlobal->getValueType();
+    PtrOperand = NewGlobal;
+    NeedsTransform = true;
+  } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) {
+    Type *AllocatedType = Alloca->getAllocatedType();
+    // OrigGEPType might just be a pointer lets make sure
+    // to add the allocated type so we have a size
+    if (AllocatedType != OrigGEPType) {
+      NewGEPType = AllocatedType;
+      NeedsTransform = true;
+    }
   }
-  if (!NewGlobal)
+
+  // Note: We bail if this isn't a gep touched via alloca or global
+  // transformations
+  if (!NeedsTransform)
     return false;
 
   IRBuilder<> Builder(&GEPI);
   SmallVector<Value *, MaxVecSize> Indices(GEPI.indices());
 
-  Value *NewGEP =
-      Builder.CreateGEP(NewGlobal->getValueType(), NewGlobal, Indices,
-                        GEPI.getName(), GEPI.getNoWrapFlags());
+  Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices,
+                                    GEPI.getName(), GEPI.getNoWrapFlags());
   GEPI.replaceAllUsesWith(NewGEP);
   GEPI.eraseFromParent();
   return true;
diff --git a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
index 4829f3a31791..b589136d6965 100644
--- a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
+++ b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
@@ -1,10 +1,25 @@
-; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=SCHECK
-; RUN: opt -S -passes='dxil-data-scalarization,dxil-flatten-arrays' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=FCHECK
+; RUN: opt -S -passes='dxil-data-scalarization' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=SCHECK,CHECK
+; RUN: opt -S -passes='dxil-data-scalarization,dxil-flatten-arrays' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=FCHECK,CHECK
 
 ; CHECK-LABEL: alloca_2d__vec_test
 define void @alloca_2d__vec_test() local_unnamed_addr #2 {
   ; SCHECK:  alloca [2 x [4 x i32]], align 16
   ; FCHECK:  alloca [8 x i32], align 16
+  ; CHECK: ret void
   %1 = alloca [2 x <4 x i32>], align 16
   ret void
 }
+
+; CHECK-LABEL: alloca_2d_gep_test
+define void @alloca_2d_gep_test() {
+  ; SCHECK:  [[alloca_val:%.*]] = alloca [2 x [2 x i32]], align 16
+  ; FCHECK:  [[alloca_val:%.*]] = alloca [4 x i32], align 16
+  ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
+  ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [2 x [2 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]]
+  ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[alloca_val]], i32 0, i32 [[tid]]
+  ; CHECK: ret void
+  %1 = alloca [2 x <2 x i32>], align 16
+  %2 = tail call i32 @llvm.dx.thread.id(i32 0)
+  %3 = getelementptr inbounds nuw [2 x <2 x i32>], ptr %1, i32 0, i32 %2
+  ret void
+}

From affcc5e728c86260590ae398c136d43ac6cfbfb0 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Fri, 20 Jun 2025 12:28:18 -0700
Subject: [PATCH 1103/1322] [AMDGPU] Add s_wait_xcnt gfx1250 instruction
 (#145086)

---
 llvm/lib/Target/AMDGPU/AMDGPU.td                   | 10 ++++++++++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h              |  5 +++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td          | 10 ++++++++++
 llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s             | 14 ++++++++++++++
 .../MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt   | 10 ++++++++++
 5 files changed, 49 insertions(+)
 create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
 create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 559328459141..4b17e1c808b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1112,6 +1112,12 @@ def FeaturePointSampleAccel : SubtargetFeature<"point-sample-accel",
   "Has point sample acceleration feature"
 >;
 
+def FeatureWaitXcnt : SubtargetFeature<"wait-xcnt",
+  "HasWaitXcnt",
+  "true",
+  "Has s_wait_xcnt instruction"
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -1927,6 +1933,7 @@ def FeatureISAVersion12_50 : FeatureSet<
    FeatureAshrPkInsts,
    FeatureSupportsSRAMECC,
    FeatureMaxHardClauseLength63,
+   FeatureWaitXcnt,
    FeatureAtomicFMinFMaxF64GlobalInsts,
    FeatureAtomicFMinFMaxF64FlatInsts,
    FeatureFlatBufferGlobalAtomicFaddF64Inst,
@@ -2591,6 +2598,9 @@ def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">,
 def HasBVHDualAndBVH8Insts : Predicate<"Subtarget->hasBVHDualAndBVH8Insts()">,
   AssemblerPredicate<(all_of FeatureBVHDualAndBVH8Insts)>;
 
+def HasWaitXcnt : Predicate<"Subtarget->hasWaitXcnt()">,
+  AssemblerPredicate<(all_of FeatureWaitXcnt)>;
+
 def HasFP8ConversionScaleInsts : Predicate<"Subtarget->hasFP8ConversionScaleInsts()">,
   AssemblerPredicate<(all_of FeatureFP8ConversionScaleInsts)>;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index dd57cc96e41c..4ec60dc2752e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -202,6 +202,7 @@ protected:
 
   bool HasNoSdstCMPX = false;
   bool HasVscnt = false;
+  bool HasWaitXcnt = false;
   bool HasGetWaveIdInst = false;
   bool HasSMemTimeInst = false;
   bool HasShaderCyclesRegister = false;
@@ -1368,6 +1369,10 @@ public:
     return HasMinimum3Maximum3PKF16;
   }
 
+  /// \returns true if the target has s_wait_xcnt insertion. Supported for
+  /// GFX1250.
+  bool hasWaitXCnt() const { return HasWaitXcnt; }
+
   bool hasPointSampleAccel() const { return HasPointSampleAccel; }
 
   /// \returns The maximum number of instructions that can be enclosed in an
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 90e65a6950c0..3f2e764f2926 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1751,6 +1751,11 @@ let OtherPredicates = [HasExportInsts] in
                 [(int_amdgcn_s_wait_kmcnt timm:$simm16)]>;
 } // End SubtargetPredicate = isGFX12Plus, hasSideEffects = 1
 
+let SubtargetPredicate = HasWaitXcnt, hasSideEffects = 1 in {
+  def S_WAIT_XCNT :
+    SOPP_Pseudo<"s_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
+} // End SubtargetPredicate = hasWaitXcnt, hasSideEffects = 1
+
 //===----------------------------------------------------------------------===//
 // SOP1 Patterns
 //===----------------------------------------------------------------------===//
@@ -2560,6 +2565,11 @@ defm S_WAIT_KMCNT           : SOPP_Real_32_gfx12<0x047>;
 defm S_WAIT_LOADCNT_DSCNT   : SOPP_Real_32_gfx12<0x048>;
 defm S_WAIT_STORECNT_DSCNT  : SOPP_Real_32_gfx12<0x049>;
 
+//===----------------------------------------------------------------------===//
+// SOPP - GFX1250 only.
+//===----------------------------------------------------------------------===//
+defm S_WAIT_XCNT      : SOPP_Real_32_gfx12<0x045>;
+
 //===----------------------------------------------------------------------===//
 // SOPP - GFX11, GFX12.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
new file mode 100644
index 000000000000..1aca88771c1f
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
@@ -0,0 +1,14 @@
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefix=GFX1250 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR --implicit-check-not=error: -strict-whitespace %s
+
+s_wait_xcnt 0x0
+// GFX1250: [0x00,0x00,0xc5,0xbf]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_wait_xcnt 0x7
+// GFX1250: [0x07,0x00,0xc5,0xbf]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_wait_xcnt 0xf
+// GFX1250: [0x0f,0x00,0xc5,0xbf]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
new file mode 100644
index 000000000000..e785fe9cc6d5
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+
+# GFX1250: s_wait_xcnt 0x0 ; encoding: [0x00,0x00,0xc5,0xbf]
+0x00,0x00,0xc5,0xbf
+
+# GFX1250: s_wait_xcnt 0x7 ; encoding: [0x07,0x00,0xc5,0xbf]
+0x07,0x00,0xc5,0xbf
+
+# GFX1250: s_wait_xcnt 0xf ; encoding: [0x0f,0x00,0xc5,0xbf]
+0x0f,0x00,0xc5,0xbf

From 227f759644bbc208045178c54633df241f27da7f Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Fri, 20 Jun 2025 14:34:43 -0500
Subject: [PATCH 1104/1322] [mlir][python] expose operation.block (#145088)

Expose `operation-getBlock()` in python.
---
 mlir/lib/Bindings/Python/IRCore.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index b5720b7ad8b2..cbd35f2974ae 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -3385,6 +3385,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
       .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyOperation::createFromCapsule)
       .def_prop_ro("operation", [](nb::object self) { return self; })
       .def_prop_ro("opview", &PyOperation::createOpView)
+      .def_prop_ro("block", &PyOperation::getBlock)
       .def_prop_ro(
           "successors",
           [](PyOperationBase &self) {

From 6d8c6ef90c1a4d17d764c4479d5165251bf07c95 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Fri, 20 Jun 2025 12:45:21 -0700
Subject: [PATCH 1105/1322] [BOLT][NFC] Simplify doTrace in BAT mode (#143233)

`BoltAddressTranslation::getFallthroughsInTrace` iterates over address
translation map entries and therefore has direct access to both original
and translated offsets. Return the translated offsets in fall-throughs
list to avoid duplicate address translation inside `doTrace`.

Test Plan: NFC
---
 bolt/lib/Profile/BoltAddressTranslation.cpp | 4 ++--
 bolt/lib/Profile/DataAggregator.cpp         | 7 +------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index a253522e4fb1..7ad4e6a2e141 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -546,7 +546,7 @@ BoltAddressTranslation::getFallthroughsInTrace(uint64_t FuncAddress,
     return Res;
 
   for (auto Iter = FromIter; Iter != ToIter;) {
-    const uint32_t Src = Iter->first;
+    const uint32_t Src = Iter->second >> 1;
     if (Iter->second & BRANCHENTRY) {
       ++Iter;
       continue;
@@ -557,7 +557,7 @@ BoltAddressTranslation::getFallthroughsInTrace(uint64_t FuncAddress,
       ++Iter;
     if (Iter->second & BRANCHENTRY)
       break;
-    Res.emplace_back(Src, Iter->first);
+    Res.emplace_back(Src, Iter->second >> 1);
   }
 
   return Res;
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index c067b2f5b73b..5c8af3710720 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -835,13 +835,8 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count,
 
   LLVM_DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for "
                     << FromFunc->getPrintName() << ":" << Trace << '\n');
-  for (auto [From, To] : *FTs) {
-    if (BAT) {
-      From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true);
-      To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false);
-    }
+  for (const auto &[From, To] : *FTs)
     doIntraBranch(*ParentFunc, From, To, Count, false);
-  }
 
   return true;
 }

From 4959e8a1dadd96499d701bcf02cd9b25dba98c98 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Fri, 20 Jun 2025 12:46:56 -0700
Subject: [PATCH 1106/1322] [BOLT][NFCI] Use heuristic for matching split
 global functions (#90429)

This change speeds up fragment matching for large BOLTed binaries where
all fragments of global parent functions are put under `bolt-pseudo.o`
file symbol:
- before: iterating over symbols under `bolt-pseudo.o` only to fail
  to find a parent,
- after: bail out immediately and use a global parent by name.

Test Plan: NFC, updated register-fragments-bolt-symbols.s
---
 bolt/lib/Rewrite/RewriteInstance.cpp            | 5 +++++
 bolt/test/X86/register-fragments-bolt-symbols.s | 7 +++++++
 2 files changed, 12 insertions(+)

diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index e1aa00a3d749..d650e5db54bf 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -1567,6 +1567,11 @@ void RewriteInstance::registerFragments() {
 
     uint64_t ParentAddress{0};
 
+    // Check if containing FILE symbol is BOLT emitted synthetic symbol marking
+    // local fragments of global parents.
+    if (cantFail(FSI[-1].getName()) == getBOLTFileSymbolName())
+      goto registerParent;
+
     // BOLT split fragment symbols are emitted just before the main function
     // symbol.
     for (ELFSymbolRef NextSymbol = Symbol; NextSymbol < StopSymbol;
diff --git a/bolt/test/X86/register-fragments-bolt-symbols.s b/bolt/test/X86/register-fragments-bolt-symbols.s
index c9f1859c4e8a..20e7345541d9 100644
--- a/bolt/test/X86/register-fragments-bolt-symbols.s
+++ b/bolt/test/X86/register-fragments-bolt-symbols.s
@@ -29,6 +29,7 @@
 
 # RUN: link_fdata %s %t.bolt %t.preagg PREAGG
 # PREAGG: B X:0 #chain.cold.0# 1 0
+# PREAGG: B X:0 #dummy# 1 0
 # RUN: perf2bolt %t.bolt -p %t.preagg --pa -o %t.bat.fdata -w %t.bat.yaml -v=1 \
 # RUN:   | FileCheck %s --check-prefix=CHECK-REGISTER
 # RUN: FileCheck --input-file %t.bat.fdata --check-prefix=CHECK-FDATA %s
@@ -44,7 +45,13 @@
 # CHECK-SYMS: l  F .text.cold     [[#]] chain.cold.0
 # CHECK-SYMS: l  F .text          [[#]] chain
 # CHECK-SYMS: l df *ABS*          [[#]] bolt-pseudo.o
+# CHECK-SYMS: l  F .text.cold     [[#]] dummy.cold.0
+# CHECK-SYMS: l  F .text.cold.1   [[#]] dummy.cold.1
+# CHECK-SYMS: l  F .text.cold.2   [[#]] dummy.cold.2
 
+# CHECK-REGISTER: BOLT-INFO: marking dummy.cold.0/1(*2) as a fragment of dummy
+# CHECK-REGISTER: BOLT-INFO: marking dummy.cold.1/1(*2) as a fragment of dummy
+# CHECK-REGISTER: BOLT-INFO: marking dummy.cold.2/1(*2) as a fragment of dummy
 # CHECK-REGISTER: BOLT-INFO: marking chain.cold.0/1(*2) as a fragment of chain/2(*2)
 
 # CHECK-FDATA: 0 [unknown] 0 1 chain/chain.s/2 10 0 1

From d8924d4da78fc980b720b328897b1bd5efba348a Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Fri, 20 Jun 2025 21:54:06 +0200
Subject: [PATCH 1107/1322] [LLDB] Explicitly use python for version fixup
 (#144217)

On Windows, the post build command would open the script in the default
editor, since it doesn't know about shebangs. This effectively adds
`python3` in front of the command.

Amends https://github.com/llvm/llvm-project/pull/142871 /
https://github.com/llvm/llvm-project/pull/141116
---
 lldb/source/API/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index 2772aa920411..f4a323aaf267 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -326,7 +326,7 @@ foreach(header
 endforeach()
 
 add_custom_command(TARGET liblldb POST_BUILD
-  COMMAND ${LLDB_SOURCE_DIR}/scripts/version-header-fix.py ${LLDB_SOURCE_DIR}/include/lldb/lldb-defines.h ${lldb_header_staging_dir}/lldb-defines.h ${LLDB_VERSION_MAJOR} ${LLDB_VERSION_MINOR} ${LLDB_VERSION_PATCH}
+  COMMAND "${Python3_EXECUTABLE}" ${LLDB_SOURCE_DIR}/scripts/version-header-fix.py ${LLDB_SOURCE_DIR}/include/lldb/lldb-defines.h ${lldb_header_staging_dir}/lldb-defines.h ${LLDB_VERSION_MAJOR} ${LLDB_VERSION_MINOR} ${LLDB_VERSION_PATCH}
 )
 add_custom_target(liblldb-header-staging DEPENDS ${lldb_staged_headers})
 add_dependencies(liblldb liblldb-header-staging)

From 3a66e2065296b3e0b27f0a14431eba1d74e7f8c4 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Fri, 20 Jun 2025 12:57:36 -0700
Subject: [PATCH 1108/1322] [AMDGPU] Add gfx1250 runlines to vop3 dpp tests.
 NFC. (#145089)

dpp8 disasm test does not work yet.
---
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s    | 104 ++++++++++------
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s     |  86 +++++++++----
 .../AMDGPU/gfx12_dasm_vop3_dpp16.txt          | 113 ++++++++++--------
 3 files changed, 195 insertions(+), 108 deletions(-)

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index cf497c058b7e..90a449173320 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -1,8 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1200,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1200,W64 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W32-ERR --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W64-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 
 v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -1237,37 +1239,37 @@ v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_m
 // GFX12: v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 
 v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 
 v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0xff,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0xff,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 
 v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3]
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff]
 
 v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 
 v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed]
 
 v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed]
 
 v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed]
 
 v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d]
 
 v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5]
 
 v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 
 v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| quad_perm:[0,1,2,3]
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v255.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff]
 
 v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
 // GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
@@ -5592,34 +5594,67 @@ v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] row_xmask:15 ro
 // GFX12: v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 
 v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12: v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+
+v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX12-ERR: :[[@LINE-1]]:39: error: invalid op_sel operand
+
+v_dot2_f16_f16_e64_dpp v0, s1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX12-ERR: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX1200: v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0c,0x04,0x01,0xe4,0x04,0x00]
+
+v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX1200: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+
+v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX1200: v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
+
+v_dot2_f16_f16_e64_dpp v5, v1, v2, 0.5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX1200: v_dot2_f16_f16_e64_dpp v5, v1, v2, 0.5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x1b,0x00,0xff]
+
+v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+
+v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX12-ERR: :[[@LINE-1]]:41: error: invalid op_sel operand
+
+v_dot2_bf16_bf16_e64_dpp v0, s1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX12-ERR: :[[@LINE-1]]:30: error: invalid operand for instruction
 
 v_dot2_f16_f16_e64_dpp v0.l, s1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
 // GFX12-ERR: :[[@LINE-1]]:30: error: invalid operand for instruction
 
-v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12-ERR: :[[@LINE-1]]:43: error: invalid op_sel operand
+v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0c,0x04,0x01,0xe4,0x00,0x00]
 
-v_dot2_f16_f16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12: v_dot2_f16_f16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0c,0x04,0x01,0xe4,0x04,0x00]
+v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX12-ERR: :[[@LINE-1]]:45: error: op_sel operand conflicts with 16-bit operand suffix
+
+v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
 
 v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+
+v_dot2_bf16_bf16_e64_dpp v5, v1, v2, 0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v5, v1, v2, 0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd6,0xfa,0x04,0x02,0x02,0x01,0x1b,0x00,0xff]
 
 v_dot2_f16_f16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12: v_dot2_f16_f16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
 
 v_dot2_f16_f16_e64_dpp v5.l, v1, v2, 0.5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
-// GFX12: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, 0.5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x1b,0x00,0xff]
+// GFX1200: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, 0.5 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x66,0xd6,0xfa,0x04,0xc2,0x03,0x01,0x1b,0x00,0xff]
 
 v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h quad_perm:[0,1,2,3]
-// GFX12: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+// GFX1200: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+// GFX1200: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12: v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
 v_dot2_bf16_bf16_e64_dpp v0.l, s1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 // GFX12-ERR: :[[@LINE-1]]:32: error: invalid operand for instruction
@@ -5627,35 +5662,32 @@ v_dot2_bf16_bf16_e64_dpp v0.l, s1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 ban
 v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
 // GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand
 
-v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// GFX12: v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0c,0x04,0x01,0xe4,0x00,0x00]
-
 v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
 v_dot2_bf16_bf16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12: v_dot2_bf16_bf16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
 
 v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, 0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
-// GFX12: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, 0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd6,0xfa,0x04,0x02,0x02,0x01,0x1b,0x00,0xff]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, 0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x67,0xd6,0xfa,0x04,0x02,0x02,0x01,0x1b,0x00,0xff]
 
 v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h row_mirror
-// GFX12: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
 
 v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
 
 v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x40,0x1a,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff]
-// GFX12: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x40,0x1a,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff]
+// GFX1200: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x40,0x1a,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff]
 
 v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x1a,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe]
-// GFX12: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x1a,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe]
+// GFX1200: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x1a,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe]
 
 v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x1a,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff]
-// GFX12: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x1a,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff]
+// GFX1200: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x1a,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff]
 
 v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0x0 bank_mask:0x0 ; encoding: [0x00,0x40,0x1a,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0x00]
-// GFX12: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0x0 bank_mask:0x0 ; encoding: [0x00,0x40,0x1a,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0x00]
+// GFX1200: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0x0 bank_mask:0x0 ; encoding: [0x00,0x40,0x1a,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0x00]
 
 v_minimum_f32 v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX12: v_minimum_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index f43a1128f521..a73aa40a2751 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -1,8 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1200,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1200,W64 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W32-ERR --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR,W64-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s
 
 v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -761,19 +763,19 @@ v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
 // GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,2,3,0,1]
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21]
 
 v_cvt_pk_fp8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v5.l, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
 
 v_cvt_pk_fp8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v5.l, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
 
 v_cvt_pk_fp8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v255.l, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 
 v_cvt_pk_fp8_f32_e64_dpp v255.h, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_cvt_pk_fp8_f32_e64_dpp v255.h, -|v255|, -|v255| op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+// GFX1200: v_cvt_pk_fp8_f32_e64_dpp v255.h, -|v255|, -|v255| op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 
 v_cvt_pk_bf8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cvt_pk_bf8_f32_e64_dpp v5.l, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -3849,34 +3851,70 @@ v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] dpp8:[0,0,0,0,0
 // GFX12: v_pack_b32_f16_e64_dpp v255, -|v255.l|, -|v255.h| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
 
 v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4]
-// GFX12: v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
 
-v_dot2_f16_f16_e64_dpp v0.l, s1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4]
+v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] dpp8:[0,1,2,3,4,4,4,4]
+// GFX12-ERR: :[[@LINE-1]]:39: error: invalid op_sel operand
+
+v_dot2_f16_f16_e64_dpp v0, s1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
+// GFX12-ERR: :[[@LINE-1]]:28: error: invalid operand for instruction
+
+v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 dpp8:[0,1,2,3,4,4,4,4]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
+
+v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x60,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
+
+v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x65,0x66,0xd6,0xe9,0x04,0x0e,0xc0,0x01,0x88,0x46,0x92]
+
+v_dot2_f16_f16_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1200: v_dot2_f16_f16_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x66,0xd6,0xe9,0x04,0xc2,0x03,0x01,0x77,0x39,0x05]
+
+v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
+
+v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] dpp8:[0,1,2,3,4,4,4,4]
+// GFX12-ERR: :[[@LINE-1]]:41: error: invalid op_sel operand
+
+v_dot2_bf16_bf16_e64_dpp v0, s1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
 // GFX12-ERR: :[[@LINE-1]]:30: error: invalid operand for instruction
 
+v_dot2_bf16_bf16_e64_dpp v0, v1, s2, v3 dpp8:[0,1,2,3,4,4,4,4]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0, v1, s2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
+
 v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l op_sel:[1,1,0,0] dpp8:[0,1,2,3,4,4,4,4]
 // GFX12-ERR: :[[@LINE-1]]:43: error: invalid op_sel operand
 
+v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x60,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
+
 v_dot2_f16_f16_e64_dpp v0.l, v1, s2, v3.l dpp8:[0,1,2,3,4,4,4,4]
-// GFX12: v_dot2_f16_f16_e64_dpp v0.l, v1, s2, v3.l dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0.l, v1, s2, v3.l dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
+
+v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x65,0x67,0xd6,0xe9,0x04,0x0e,0xc0,0x01,0x88,0x46,0x92]
 
 v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
-// GFX12: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x60,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x60,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
+
+v_dot2_bf16_bf16_e64_dpp v5, v1, v2, 0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v5, v1, v2, 0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x67,0xd6,0xe9,0x04,0x02,0x02,0x01,0x77,0x39,0x05]
 
 v_dot2_f16_f16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
-// GFX12: v_dot2_f16_f16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x65,0x66,0xd6,0xe9,0x04,0x0e,0xc0,0x01,0x88,0x46,0x92]
+// GFX1200: v_dot2_f16_f16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x65,0x66,0xd6,0xe9,0x04,0x0e,0xc0,0x01,0x88,0x46,0x92]
 
 v_dot2_f16_f16_e64_dpp v5.l, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x66,0xd6,0xe9,0x04,0xc2,0x03,0x01,0x77,0x39,0x05]
+// GFX1200: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x66,0xd6,0xe9,0x04,0xc2,0x03,0x01,0x77,0x39,0x05]
 
 v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x66,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+// GFX1200: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x66,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x47,0x66,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+// GFX1200: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x47,0x66,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4]
-// GFX12: v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
 
 v_dot2_bf16_bf16_e64_dpp v0.l, s1, v2, v3.l dpp8:[0,1,2,3,4,4,4,4]
 // GFX12-ERR: :[[@LINE-1]]:32: error: invalid operand for instruction
@@ -3885,28 +3923,28 @@ v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l op_sel:[1,1,0,0] dpp8:[0,1,2,3,4,4,4
 // GFX12-ERR: :[[@LINE-1]]:45: error: invalid op_sel operand
 
 v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l dpp8:[0,1,2,3,4,4,4,4]
-// GFX12: v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
 
 v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
-// GFX12: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x60,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x60,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
 
 v_dot2_bf16_bf16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
-// GFX12: v_dot2_bf16_bf16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x65,0x67,0xd6,0xe9,0x04,0x0e,0xc0,0x01,0x88,0x46,0x92]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x65,0x67,0xd6,0xe9,0x04,0x0e,0xc0,0x01,0x88,0x46,0x92]
 
 v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, 0 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, 0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x67,0xd6,0xe9,0x04,0x02,0x02,0x01,0x77,0x39,0x05]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, 0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x67,0xd6,0xe9,0x04,0x02,0x02,0x01,0x77,0x39,0x05]
 
 v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x67,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x67,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x47,0x67,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
+// GFX1200: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x47,0x67,0xd6,0xe9,0xfe,0xf7,0xe3,0xff,0x00,0x00,0x00]
 
 v_dot2_f32_bf16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x40,0x1a,0xcc,0xe9,0x04,0x0e,0x18,0x01,0x77,0x39,0x05]
-// GFX12: v_dot2_f32_bf16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x40,0x1a,0xcc,0xe9,0x04,0x0e,0x18,0x01,0x77,0x39,0x05]
+// GFX1200: v_dot2_f32_bf16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x40,0x1a,0xcc,0xe9,0x04,0x0e,0x18,0x01,0x77,0x39,0x05]
 
 v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x1a,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05]
-// GFX12: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x1a,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05]
+// GFX1200: v_dot2_f32_bf16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x1a,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05]
 
 v_minimum_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_minimum_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 397852da6ef3..19915341a4be 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -1,8 +1,10 @@
 # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,W32-REAL16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,W32-FAKE16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,W64-REAL16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,W64-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,W32-REAL16,GFX1200,GFX1200-W32,GFX1200-W32-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,W32-FAKE16,GFX1200,GFX1200-W32,GFX1200-W32-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,W64-REAL16,GFX1200,GFX1200-W64,GFX1200-W64-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,W64-FAKE16,GFX1200,GFX1200-W64,GFX1200-W64-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX1250,W32,GFX1250-W32,W32-REAL16,GFX1250-W32-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX1250,W32,GFX1250-W32,W32-FAKE16,GFX1250-W32-FAKE16 %s
 
 0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -1179,62 +1181,62 @@
 # W64-FAKE16: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x42,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 
 0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
-# W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 # W32-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 # W64-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 # W64-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 
 0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20
-# W32-REAL16: v_cvt_pk_fp8_f32 v1.l, -v2, |v3|        ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
 # W32-FAKE16: v_cvt_pk_fp8_f32 v1, -v2, |v3|          ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32 v1.l, -v2, |v3|        ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
 # W64-REAL16: v_cvt_pk_fp8_f32 v1.l, -v2, |v3|        ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
 # W64-FAKE16: v_cvt_pk_fp8_f32 v1, -v2, |v3|          ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
 
 0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
-# W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 # W32-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 # W64-REAL16: v_cvt_pk_fp8_f32_e64_dpp v6.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 # W64-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 
 0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed
-# W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed]
 # W32-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed]
 # W64-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed]
 # W64-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed]
 
 0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed
-# W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed]
 # W32-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed]
 # W64-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed]
 # W64-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed]
 
 0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed
-# W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed]
 # W32-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed]
 # W64-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed]
 # W64-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed]
 
 0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d
-# W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d]
 # W32-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d]
 # W64-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d]
 # W64-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d]
 
 0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5
-# W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5]
 # W32-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5]
 # W64-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5]
 # W64-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5]
 
 0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed
-# W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 # W32-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 # W64-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 # W64-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 
 0x01,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed
-# W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 # W32-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
+# GFX1200-W32-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 # W64-REAL16: v_cvt_pk_fp8_f32_e64_dpp v1.h, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 # W64-FAKE16: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| op_sel:[0,0,1] quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x42,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 
@@ -5793,7 +5795,6 @@
 # W64-REAL16: v_max3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
 # W64-FAKE16: v_max3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2c,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
 
-
 0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_max3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 # W32-FAKE16: v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -6628,7 +6629,6 @@
 # W64-REAL16: v_min3_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
 # W64-FAKE16: v_min3_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x2b,0xd6,0xfa,0x04,0xc2,0x63,0x01,0x60,0x01,0x13]
 
-
 0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # W32-REAL16: v_min3_i16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 # W32-FAKE16: v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -7290,100 +7290,108 @@
 # W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00
-# W32-REAL16: v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
-# W32-FAKE16: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-REAL16: v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-FAKE16: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-REAL16: v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-FAKE16: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
 # op_sel[1:0] are ignored
 0x00,0x78,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00
-# W32-REAL16: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
-# W32-FAKE16: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-REAL16: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-FAKE16: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-REAL16: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-FAKE16: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
 0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00
-# W32-REAL16: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
-# W32-FAKE16: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-REAL16: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-FAKE16: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-REAL16: v_dot2_f16_f16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-FAKE16: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
 0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00
-# W32-REAL16: v_dot2_f16_f16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
-# W32-FAKE16: v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-REAL16: v_dot2_f16_f16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-FAKE16: v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
 # W64-REAL16: v_dot2_f16_f16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
 # W64-FAKE16: v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
 
 0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# W32-REAL16: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
-# W32-FAKE16: v_dot2_f16_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# GFX1250: v_add_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# GFX1200-W32-REAL16: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# GFX1200-W32-FAKE16: v_dot2_f16_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 # W64-REAL16: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 # W64-FAKE16: v_dot2_f16_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# W32-REAL16: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
-# W32-FAKE16: v_dot2_f16_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# GFX1250: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
+# GFX1200-W32-REAL16: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# GFX1200-W32-FAKE16: v_dot2_f16_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 # W64-REAL16: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 # W64-FAKE16: v_dot2_f16_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
-# W32-REAL16: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
-# W32-FAKE16: v_dot2_f16_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# GFX1250: v_add_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# GFX1200-W32-REAL16: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# GFX1200-W32-FAKE16: v_dot2_f16_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 # W64-REAL16: v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 # W64-FAKE16: v_dot2_f16_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
 0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# W32-REAL16: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
-# W32-FAKE16: v_dot2_f16_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# GFX1250: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
+# GFX1200-W32-REAL16: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# GFX1200-W32-FAKE16: v_dot2_f16_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 # W64-REAL16: v_dot2_f16_f16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 # W64-FAKE16: v_dot2_f16_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x66,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00
-# W32-REAL16: v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
-# W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-REAL16: v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-REAL16: v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
 # op_sel[1:0] are ignored
 0x00,0x78,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00
-# W32-REAL16: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
-# W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-REAL16: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-REAL16: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
 0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00
-# W32-REAL16: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
-# W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-REAL16: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-REAL16: v_dot2_bf16_bf16_e64_dpp v0.h, v1, v2, v3.h op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 # W64-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
 
 0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00
-# W32-REAL16: v_dot2_bf16_bf16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
-# W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-REAL16: v_dot2_bf16_bf16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
+# GFX1200-W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
 # W64-REAL16: v_dot2_bf16_bf16_e64_dpp v0.h, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
 # W64-FAKE16: v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
 
 0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff
-# W32-REAL16: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
-# W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
+# GFX1250: v_add_f32_dpp v255, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
+# GFX1200-W32-REAL16: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
+# GFX1200-W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
 # W64-REAL16: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
 # W64-FAKE16: v_dot2_bf16_bf16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
 
 0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# W32-REAL16: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
-# W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# GFX1250: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
+# GFX1200-W32-REAL16: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# GFX1200-W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 # W64-REAL16: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 # W64-FAKE16: v_dot2_bf16_bf16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
 0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff
-# W32-REAL16: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
-# W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
+# GFX1250: v_add_f32_dpp v255, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
+# GFX1200-W32-REAL16: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
+# GFX1200-W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
 # W64-REAL16: v_dot2_bf16_bf16_e64_dpp v5.l, v1, v2, v255.h op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
 # W64-FAKE16: v_dot2_bf16_bf16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x67,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x40,0x01,0xff]
 
 0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30
-# W32-REAL16: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
-# W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# GFX1250: v_lshlrev_b32_e32 v6, v255, v183        ; encoding: [0xff,0x6f,0x0d,0x30]
+# GFX1200-W32-REAL16: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
+# GFX1200-W32-FAKE16: v_dot2_bf16_bf16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 # W64-REAL16: v_dot2_bf16_bf16_e64_dpp v255.h, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 # W64-FAKE16: v_dot2_bf16_bf16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x47,0x67,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x0d,0x30]
 
@@ -8287,3 +8295,12 @@
 # W64-REAL16: v_minimummaximum_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x6e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 # W64-FAKE16: v_minimummaximum_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x6e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# GFX1200: {{.*}}
+# GFX1200-W32: {{.*}}
+# GFX1200-W64: {{.*}}
+# GFX1200-W64-FAKE16: {{.*}}
+# GFX1200-W64-REAL16: {{.*}}
+# GFX1250-W32: {{.*}}
+# GFX1250-W32-FAKE16: {{.*}}
+# GFX1250-W32-REAL16: {{.*}}

From d078ce7c98a3f9371d01d526e20f671ca2231667 Mon Sep 17 00:00:00 2001
From: sribee8 <sriya.pratipati@gmail.com>
Date: Fri, 20 Jun 2025 13:00:59 -0700
Subject: [PATCH 1109/1322] [libc] mbrtowc implementation (#144760)

implemented the internal and public mbrtowc as well as tests for the
public function.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 libc/config/linux/x86_64/entrypoints.txt |   3 +
 libc/hdr/types/CMakeLists.txt            |   8 ++
 libc/hdr/types/mbstate_t.h               |  22 +++
 libc/include/llvm-libc-types/mbstate_t.h |   6 +-
 libc/include/wchar.yaml                  |   9 ++
 libc/src/__support/wchar/CMakeLists.txt  |  16 +++
 libc/src/__support/wchar/mbrtowc.cpp     |  49 +++++++
 libc/src/__support/wchar/mbrtowc.h       |  29 ++++
 libc/src/__support/wchar/mbstate.h       |   6 +-
 libc/src/wchar/CMakeLists.txt            |  17 +++
 libc/src/wchar/mbrtowc.cpp               |  38 +++++
 libc/src/wchar/mbrtowc.h                 |  24 ++++
 libc/test/src/wchar/CMakeLists.txt       |  14 ++
 libc/test/src/wchar/mbrtowc_test.cpp     | 172 +++++++++++++++++++++++
 14 files changed, 409 insertions(+), 4 deletions(-)
 create mode 100644 libc/hdr/types/mbstate_t.h
 create mode 100644 libc/src/__support/wchar/mbrtowc.cpp
 create mode 100644 libc/src/__support/wchar/mbrtowc.h
 create mode 100644 libc/src/wchar/mbrtowc.cpp
 create mode 100644 libc/src/wchar/mbrtowc.h
 create mode 100644 libc/test/src/wchar/mbrtowc_test.cpp

diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index aa2079faed40..8bf6c402b039 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1244,6 +1244,9 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.sys.socket.recv
     libc.src.sys.socket.recvfrom
     libc.src.sys.socket.recvmsg
+
+    # wchar.h entrypoints
+    libc.src.wchar.mbrtowc
   )
 endif()
 
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index c88c35700907..e4b3cb0faa82 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -20,6 +20,14 @@ add_proxy_header_library(
     libc.include.uchar
 )
 
+add_proxy_header_library(
+  mbstate_t
+  HDRS
+    mbstate_t.h
+  DEPENDS
+    libc.include.llvm-libc-types.mbstate_t
+)
+
 add_proxy_header_library(
   div_t
   HDRS
diff --git a/libc/hdr/types/mbstate_t.h b/libc/hdr/types/mbstate_t.h
new file mode 100644
index 000000000000..367c6af7a3ff
--- /dev/null
+++ b/libc/hdr/types/mbstate_t.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from mbstate_t.h -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_MBSTATE_T_H
+#define LLVM_LIBC_HDR_TYPES_MBSTATE_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/mbstate_t.h"
+
+#else // Overlay mode
+
+#error "Cannot overlay mbstate_t
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_MBSTATE_T_H
diff --git a/libc/include/llvm-libc-types/mbstate_t.h b/libc/include/llvm-libc-types/mbstate_t.h
index 540d50975a26..009fe57da50e 100644
--- a/libc/include/llvm-libc-types/mbstate_t.h
+++ b/libc/include/llvm-libc-types/mbstate_t.h
@@ -9,8 +9,12 @@
 #ifndef LLVM_LIBC_TYPES_MBSTATE_T_H
 #define LLVM_LIBC_TYPES_MBSTATE_T_H
 
-// TODO: Complete this once we implement functions that operate on this type.
+#include "../llvm-libc-macros/stdint-macros.h"
+
 typedef struct {
+  uint32_t __field1;
+  uint8_t __field2;
+  uint8_t __field3;
 } mbstate_t;
 
 #endif // LLVM_LIBC_TYPES_MBSTATE_T_H
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 84db73d8f01e..c036636e12c3 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -29,6 +29,15 @@ functions:
     return_type: wint_t
     arguments:
       - type: int
+  - name: mbrtowc
+    standards:
+      - stdc
+    return_type: size_t
+    arguments:
+      - type: wchar_t *__restrict
+      - type: const char *__restrict
+      - type: size_t
+      - type: mbstate_t *__restrict
   - name: wmemset
     standards:
       - stdc
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 6715e354e23e..479c1dff2c6e 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -19,3 +19,19 @@ add_object_library(
     libc.src.__support.math_extras
     .mbstate
 )
+
+add_object_library(
+  mbrtowc
+  HDRS
+    mbrtowc.h
+  SRCS
+    mbrtowc.cpp
+  DEPENDS
+  libc.hdr.types.wchar_t
+  libc.hdr.types.size_t
+  libc.src.__support.common
+  libc.src.__support.error_or
+  libc.src.__support.macros.config
+  .character_converter
+  .mbstate
+)
diff --git a/libc/src/__support/wchar/mbrtowc.cpp b/libc/src/__support/wchar/mbrtowc.cpp
new file mode 100644
index 000000000000..954c7458f4df
--- /dev/null
+++ b/libc/src/__support/wchar/mbrtowc.cpp
@@ -0,0 +1,49 @@
+//===-- Implementation for mbrtowc function ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/wchar/mbrtowc.h"
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+ErrorOr<size_t> mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
+                        size_t n, mbstate *__restrict ps) {
+  CharacterConverter char_conv(ps);
+  if (s == nullptr)
+    return 0;
+  size_t i = 0;
+  // Reading in bytes until we have a complete wc or error
+  for (; i < n && !char_conv.isFull(); ++i) {
+    int err = char_conv.push(static_cast<char8_t>(s[i]));
+    // Encoding error
+    if (err == -1)
+      return Error(-1);
+  }
+  auto wc = char_conv.pop_utf32();
+  if (wc.has_value()) {
+    *pwc = wc.value();
+    // null terminator -> return 0
+    if (wc.value() == L'\0')
+      return 0;
+    return i;
+  }
+  // Incomplete but potentially valid
+  return -2;
+}
+
+} // namespace internal
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/mbrtowc.h b/libc/src/__support/wchar/mbrtowc.h
new file mode 100644
index 000000000000..37329ee61bea
--- /dev/null
+++ b/libc/src/__support/wchar/mbrtowc.h
@@ -0,0 +1,29 @@
+//===-- Implementation header for mbrtowc function --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC
+#define LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+ErrorOr<size_t> mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
+                        size_t n, mbstate *__restrict ps);
+
+} // namespace internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index fea693f73c3b..32304a521524 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -18,17 +18,17 @@ namespace internal {
 
 struct mbstate {
   // store a partial codepoint (in UTF-32)
-  char32_t partial;
+  char32_t partial = 0;
 
   /*
   Progress towards a conversion
     Increases with each push(...) until it reaches total_bytes
     Decreases with each pop(...) until it reaches 0
   */
-  uint8_t bytes_stored;
+  uint8_t bytes_stored = 0;
 
   // Total number of bytes that will be needed to represent this character
-  uint8_t total_bytes;
+  uint8_t total_bytes = 0;
 };
 
 } // namespace internal
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 491dd5b34340..163c29847e6a 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -34,6 +34,23 @@ add_entrypoint_object(
     libc.src.__support.wctype_utils
 )
 
+add_entrypoint_object(
+  mbrtowc
+  SRCS
+    mbrtowc.cpp
+  HDRS
+    mbrtowc.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.mbstate_t
+    libc.hdr.types.wchar_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.wchar.mbrtowc
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbstate
+)
+
 add_entrypoint_object(
   wmemset
   SRCS
diff --git a/libc/src/wchar/mbrtowc.cpp b/libc/src/wchar/mbrtowc.cpp
new file mode 100644
index 000000000000..cd429ab8d30e
--- /dev/null
+++ b/libc/src/wchar/mbrtowc.cpp
@@ -0,0 +1,38 @@
+//===-- Implementation of mbrtowc -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbrtowc.h"
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(size_t, mbrtowc,
+                   (wchar_t *__restrict pwc, const char *__restrict s, size_t n,
+                    mbstate_t *__restrict ps)) {
+  static internal::mbstate internal_mbstate;
+  auto ret = internal::mbrtowc(pwc, s, n,
+                               ps == nullptr
+                                   ? &internal_mbstate
+                                   : reinterpret_cast<internal::mbstate *>(ps));
+  if (!ret.has_value()) {
+    // Encoding failure
+    libc_errno = EILSEQ;
+    return -1;
+  }
+  return ret.value();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbrtowc.h b/libc/src/wchar/mbrtowc.h
new file mode 100644
index 000000000000..e2e3d3ebd285
--- /dev/null
+++ b/libc/src/wchar/mbrtowc.h
@@ -0,0 +1,24 @@
+//===-- Implementation header for mbrtowc ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBRTOWC_H
+#define LLVM_LIBC_SRC_WCHAR_MBRTOWC_H
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t n,
+               mbstate_t *__restrict ps);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBRTOWC_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index 4990b6953348..d4cae1f6228b 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -23,6 +23,20 @@ add_libc_test(
     libc.src.wchar.btowc
 )
 
+add_libc_test(
+  mbrtowc_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    mbrtowc_test.cpp
+  DEPENDS
+    libc.src.__support.libc_errno
+    libc.src.string.memset
+    libc.src.wchar.mbrtowc
+    libc.hdr.types.mbstate_t
+    libc.hdr.types.wchar_t
+)
+
 add_libc_test(
   wctob_test
   SUITE
diff --git a/libc/test/src/wchar/mbrtowc_test.cpp b/libc/test/src/wchar/mbrtowc_test.cpp
new file mode 100644
index 000000000000..69dcf00fde20
--- /dev/null
+++ b/libc/test/src/wchar/mbrtowc_test.cpp
@@ -0,0 +1,172 @@
+//===-- Unittests for mbrtowc ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/libc_errno.h"
+#include "src/string/memset.h"
+#include "src/wchar/mbrtowc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcMBRToWC, OneByte) {
+  const char *ch = "A";
+  wchar_t dest[2];
+  // Testing if it works with nullptr mbstate_t
+  mbstate_t *mb = nullptr;
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb);
+  ASSERT_EQ(static_cast<char>(*dest), 'A');
+  ASSERT_EQ(static_cast<int>(n), 1);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 0, mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+}
+
+TEST(LlvmLibcMBRToWC, TwoByte) {
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; //  car symbol
+  wchar_t dest[2];
+  mbstate_t *mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, mb);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  ASSERT_EQ(static_cast<int>(n), 2);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  // Should pass after reading one more byte
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 1, mb);
+  ASSERT_EQ(static_cast<int>(n), 1);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+}
+
+TEST(LlvmLibcMBRToWC, ThreeByte) {
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ sigma symbol
+  wchar_t dest[2];
+  mbstate_t *mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 3, mb);
+  ASSERT_EQ(static_cast<int>(*dest), 8721);
+  ASSERT_EQ(static_cast<int>(n), 3);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  // Should pass after reading two more bytes
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 2, mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 8721);
+}
+
+TEST(LlvmLibcMBRToWC, FourByte) {
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+  wchar_t dest[2];
+  mbstate_t *mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, mb);
+  ASSERT_EQ(static_cast<int>(*dest), 129313);
+  ASSERT_EQ(static_cast<int>(n), 4);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  // Should pass after reading two more bytes
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 2, 2, mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 129313);
+}
+
+TEST(LlvmLibcMBRToWC, InvalidByte) {
+  const char ch[1] = {static_cast<char>(0x80)};
+  wchar_t dest[2];
+  mbstate_t *mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+}
+
+TEST(LlvmLibcMBRToWC, InvalidMultiByte) {
+  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+                      static_cast<char>(0x80),
+                      static_cast<char>(0x00)}; // invalid sequence of bytes
+  wchar_t dest[2];
+  mbstate_t *mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 4 should error
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+  // Trying to push just the first one should error
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+  // Trying to push the second and third should correspond to null wc
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 2, mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+  ASSERT_TRUE(*dest == L'\0');
+}
+
+TEST(LlvmLibcMBRToWC, InvalidLastByte) {
+  // Last byte is invalid since it does not have correct starting sequence.
+  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                      static_cast<char>(0x80), static_cast<char>(0xC0)};
+  wchar_t dest[2];
+  mbstate_t *mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 4 should error
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+}
+
+TEST(LlvmLibcMBRToWC, ValidTwoByteWithExtraRead) {
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+  wchar_t dest[2];
+  mbstate_t *mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 3 should return valid 2 byte
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 3, mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+}
+
+TEST(LlvmLibcMBRToWC, TwoValidTwoBytes) {
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+  wchar_t dest[2];
+  mbstate_t *mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // mbstate should reset after reading first one
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  n = LIBC_NAMESPACE::mbrtowc(dest + 1, ch + 2, 2, mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*(dest + 1)), 460);
+}
+
+TEST(LlvmLibcMBRToWC, NullString) {
+  wchar_t dest[2] = {L'O', L'K'};
+  mbstate_t *mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // reading on nullptr should return 0
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, nullptr, 2, mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+  ASSERT_TRUE(dest[0] == L'O');
+  // reading a null terminator should return 0
+  const char *ch = "\0";
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+}

From f8ffb4e7cd94b661c3edd323f3dd85dc77892c16 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 20 Jun 2025 21:08:14 +0100
Subject: [PATCH 1110/1322] [VPlan] Simplify ExtractLastElement(Broadcast(A))
 -> A.

Remove trivial ExtractLastElement VPInstructions.
---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp          | 7 +++++++
 .../first-order-recurrence-dead-instructions.ll            | 6 ++----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fbfc527ff7db..c0bdbb1f4f88 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1164,6 +1164,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  if (match(Def,
+            m_VPInstruction<VPInstruction::ExtractLastElement>(
+                m_VPInstruction<VPInstruction::Broadcast>(m_VPValue(A))))) {
+    Def->replaceAllUsesWith(A);
+    return;
+  }
+
   VPInstruction *OpVPI;
   if (match(Def, m_VPInstruction<VPInstruction::ExtractLastElement>(
                      m_VPInstruction(OpVPI))) &&
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
index d98cd45cb634..3bcb832b3fe3 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
@@ -26,12 +26,10 @@ define i8 @recurrence_phi_with_same_incoming_values_after_simplifications(i8 %fo
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], -8
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 3
 ; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ -7, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ], [ 1, %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[FOR_START]], %[[ENTRY]] ], [ [[FOR_START]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[FOR_START]], %[[MIDDLE_BLOCK]] ], [ [[FOR_START]], %[[ENTRY]] ], [ [[FOR_START]], %[[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -43,7 +41,7 @@ define i8 @recurrence_phi_with_same_incoming_values_after_simplifications(i8 %fo
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[FOR_NEXT_LCSSA:%.*]] = phi i8 [ [[FOR_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_NEXT_LCSSA:%.*]] = phi i8 [ [[FOR_NEXT]], %[[LOOP]] ], [ [[FOR_START]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i8 [[FOR_NEXT_LCSSA]]
 ;
 entry:

From 3b9795b3d3b249a5a3052a44f2c1ad7268ed34c6 Mon Sep 17 00:00:00 2001
From: Rodolfo Wottrich <rgwott@users.noreply.github.com>
Date: Fri, 20 Jun 2025 21:14:52 +0100
Subject: [PATCH 1111/1322] [AArch64] Add CodeGen support for scalar FEAT_CPA
 (#105669)

CPA stands for Checked Pointer Arithmetic and is part of the 2023 MTE
architecture extensions for A-profile.
The new CPA instructions perform regular pointer arithmetic (such as
base register + offset) but check for overflow in the most significant
bits of the result, enhancing security by detecting address tampering.

In this patch we intend to capture the semantics of pointer arithmetic
when it is not folded into loads/stores, then generate the appropriate
scalar CPA instructions. In order to preserve pointer arithmetic
semantics through the backend, we use the PTRADD SelectionDAG node type.

Use backend option `-aarch64-use-featcpa-codegen=true` to enable CPA
CodeGen (for a target with CPA enabled).

The story of this PR is that initially it introduced the PTRADD
SelectionDAG node and the respective visitPTRADD() function, adapted
from the CHERI/Morello LLVM tree. The original authors are
@davidchisnall, @jrtc27, @arichardson.
After a while, @ritter-x2a took the part of the code that was
target-independent and merged it separately in #140017. This PR thus
remains as the AArch64-part only.

Mode details about the CPA extension can be found at:

-
https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-a-profile-architecture-developments-2023
- https://developer.arm.com/documentation/ddi0602/2023-09/ (e.g ADDPT
instruction)

This PR follows #79569.
It does not address vector FEAT_CPA instructions.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  13 +
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   4 +
 .../lib/Target/AArch64/AArch64InstrFormats.td |   2 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  15 +
 .../GISel/AArch64InstructionSelector.cpp      |   8 +-
 llvm/test/CodeGen/AArch64/cpa-globalisel.ll   | 776 +++++++++++++++++
 llvm/test/CodeGen/AArch64/cpa-selectiondag.ll | 796 ++++++++++++++++++
 7 files changed, 1612 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/cpa-globalisel.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cpa-selectiondag.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e9d05710cbc4..a2c914c6e09c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -153,6 +153,14 @@ cl::opt<bool> EnableSVEGISel(
     cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
     cl::init(false));
 
+// TODO: This option should be removed once we switch to always using PTRADD in
+// the SelectionDAG.
+static cl::opt<bool> UseFEATCPACodegen(
+    "aarch64-use-featcpa-codegen", cl::Hidden,
+    cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
+             "SelectionDAG for FEAT_CPA"),
+    cl::init(false));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -30480,3 +30488,8 @@ bool AArch64TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
 
   return TargetLowering::isTypeDesirableForOp(Opc, VT);
 }
+
+bool AArch64TargetLowering::shouldPreservePtrArith(const Function &F,
+                                                   EVT VT) const {
+  return Subtarget->hasCPA() && UseFEATCPACodegen;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e0b6c1b8c0ba..89f90ee2b770 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -539,6 +539,10 @@ public:
   /// True if stack clash protection is enabled for this functions.
   bool hasInlineStackProbe(const MachineFunction &MF) const override;
 
+  /// In AArch64, true if FEAT_CPA is present. Allows pointer arithmetic
+  /// semantics to be preserved for instruction selection.
+  bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 9078748c1483..ba7cbccc0bcd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -13124,7 +13124,7 @@ def LSLImm3ShiftOperand : AsmOperandClass {
   let DiagnosticType = "AddSubLSLImm3ShiftLarge";
 }
 
-def lsl_imm3_shift_operand : Operand<i32> {
+def lsl_imm3_shift_operand : Operand<i64> {
   let PrintMethod = "printShifter";
   let ParserMatchClass = LSLImm3ShiftOperand;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index efe6cc1aa8ae..0f3f24f0853c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10741,6 +10741,21 @@ let Predicates = [HasCPA] in {
   // Scalar multiply-add/subtract
   def MADDPT : MulAccumCPA<0, "maddpt">;
   def MSUBPT : MulAccumCPA<1, "msubpt">;
+
+  def : Pat<(ptradd GPR64sp:$Rn, GPR64sp:$Rm),
+            (ADDPT_shift GPR64sp:$Rn, GPR64sp:$Rm, (i64 0))>;
+  def : Pat<(ptradd GPR64sp:$Rn, (shl GPR64sp:$Rm, (i64 imm0_7:$imm))),
+            (ADDPT_shift GPR64sp:$Rn, GPR64sp:$Rm,
+                         (i64 imm0_7:$imm))>;
+  def : Pat<(ptradd GPR64sp:$Rn, (ineg GPR64sp:$Rm)),
+            (SUBPT_shift GPR64sp:$Rn, GPR64sp:$Rm, (i64 0))>;
+  def : Pat<(ptradd GPR64sp:$Rn, (ineg (shl GPR64sp:$Rm, (i64 imm0_7:$imm)))),
+            (SUBPT_shift GPR64sp:$Rn, GPR64sp:$Rm,
+                         (i64 imm0_7:$imm))>;
+  def : Pat<(ptradd GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)),
+            (MADDPT GPR64:$Rn, GPR64:$Rm, GPR64:$Ra)>;
+  def : Pat<(ptradd GPR64:$Ra, (mul GPR64:$Rn, (ineg GPR64:$Rm))),
+            (MSUBPT GPR64:$Rn, GPR64:$Rm, GPR64:$Ra)>;
 }
 
 def round_v4fp32_to_v4bf16 :
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 51b42325ef84..5081cc4bba14 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2197,8 +2197,14 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
     }
     return Changed;
   }
-  case TargetOpcode::G_PTR_ADD:
+  case TargetOpcode::G_PTR_ADD: {
+    // If Checked Pointer Arithmetic (FEAT_CPA) is present, preserve the pointer
+    // arithmetic semantics instead of falling back to regular arithmetic.
+    const auto &TL = STI.getTargetLowering();
+    if (TL->shouldPreservePtrArith(MF.getFunction(), EVT()))
+      return false;
     return convertPtrAddToAdd(I, MRI);
+  }
   case TargetOpcode::G_LOAD: {
     // For scalar loads of pointers, we try to convert the dest type from p0
     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
diff --git a/llvm/test/CodeGen/AArch64/cpa-globalisel.ll b/llvm/test/CodeGen/AArch64/cpa-globalisel.ll
new file mode 100644
index 000000000000..c9b48b9685df
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cpa-globalisel.ll
@@ -0,0 +1,776 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs --mattr=+cpa -aarch64-use-featcpa-codegen=true -O0 -global-isel=1 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-CPA-O0
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs --mattr=+cpa -aarch64-use-featcpa-codegen=true -O3 -global-isel=1 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-CPA-O3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs --mattr=-cpa -O0 -global-isel=1 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-NOCPA-O0
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs --mattr=-cpa -O3 -global-isel=1 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-NOCPA-O3
+
+%struct.my_type = type { i64, i64 }
+%struct.my_type2 = type { i64, i64, i64, i64, i64, i64 }
+
+@array = external dso_local global [10 x %struct.my_type], align 8
+@array2 = external dso_local global [10 x %struct.my_type2], align 8
+
+define void @addpt1(i64 %index, i64 %arg) {
+; CHECK-CPA-O0-LABEL: addpt1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    adrp x8, array
+; CHECK-CPA-O0-NEXT:    add x8, x8, :lo12:array
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x0, lsl #4
+; CHECK-CPA-O0-NEXT:    str x1, [x8, #8]
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: addpt1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    adrp x8, array
+; CHECK-CPA-O3-NEXT:    add x8, x8, :lo12:array
+; CHECK-CPA-O3-NEXT:    addpt x8, x8, x0, lsl #4
+; CHECK-CPA-O3-NEXT:    str x1, [x8, #8]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: addpt1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    adrp x8, array
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, :lo12:array
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, x0, lsl #4
+; CHECK-NOCPA-O0-NEXT:    str x1, [x8, #8]
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: addpt1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    adrp x8, array
+; CHECK-NOCPA-O3-NEXT:    add x8, x8, :lo12:array
+; CHECK-NOCPA-O3-NEXT:    add x8, x8, x0, lsl #4
+; CHECK-NOCPA-O3-NEXT:    str x1, [x8, #8]
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %e2 = getelementptr inbounds %struct.my_type, ptr @array, i64 %index, i32 1
+  store i64 %arg, ptr %e2, align 8
+  ret void
+}
+
+define void @maddpt1(i32 %pos, ptr %val) {
+; CHECK-CPA-O0-LABEL: maddpt1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    // implicit-def: $x8
+; CHECK-CPA-O0-NEXT:    mov w8, w0
+; CHECK-CPA-O0-NEXT:    sxtw x8, w8
+; CHECK-CPA-O0-NEXT:    mov w9, #48 // =0x30
+; CHECK-CPA-O0-NEXT:    // kill: def $x9 killed $w9
+; CHECK-CPA-O0-NEXT:    adrp x10, array2
+; CHECK-CPA-O0-NEXT:    add x10, x10, :lo12:array2
+; CHECK-CPA-O0-NEXT:    maddpt x0, x8, x9, x10
+; CHECK-CPA-O0-NEXT:    mov w8, #48 // =0x30
+; CHECK-CPA-O0-NEXT:    mov w2, w8
+; CHECK-CPA-O0-NEXT:    b memcpy
+;
+; CHECK-CPA-O3-LABEL: maddpt1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-CPA-O3-NEXT:    sxtw x9, w0
+; CHECK-CPA-O3-NEXT:    mov w8, #48 // =0x30
+; CHECK-CPA-O3-NEXT:    ldr q0, [x1]
+; CHECK-CPA-O3-NEXT:    adrp x10, array2
+; CHECK-CPA-O3-NEXT:    add x10, x10, :lo12:array2
+; CHECK-CPA-O3-NEXT:    maddpt x8, x9, x8, x10
+; CHECK-CPA-O3-NEXT:    str q0, [x8]
+; CHECK-CPA-O3-NEXT:    ldr q0, [x1, #16]
+; CHECK-CPA-O3-NEXT:    str q0, [x8, #16]
+; CHECK-CPA-O3-NEXT:    ldr q0, [x1, #32]
+; CHECK-CPA-O3-NEXT:    str q0, [x8, #32]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: maddpt1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    adrp x9, array2
+; CHECK-NOCPA-O0-NEXT:    add x9, x9, :lo12:array2
+; CHECK-NOCPA-O0-NEXT:    mov w8, #48 // =0x30
+; CHECK-NOCPA-O0-NEXT:    smaddl x0, w0, w8, x9
+; CHECK-NOCPA-O0-NEXT:    mov w8, #48 // =0x30
+; CHECK-NOCPA-O0-NEXT:    mov w2, w8
+; CHECK-NOCPA-O0-NEXT:    b memcpy
+;
+; CHECK-NOCPA-O3-LABEL: maddpt1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    mov w8, #48 // =0x30
+; CHECK-NOCPA-O3-NEXT:    adrp x9, array2
+; CHECK-NOCPA-O3-NEXT:    add x9, x9, :lo12:array2
+; CHECK-NOCPA-O3-NEXT:    smaddl x8, w0, w8, x9
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x1]
+; CHECK-NOCPA-O3-NEXT:    str q0, [x8]
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x1, #16]
+; CHECK-NOCPA-O3-NEXT:    str q0, [x8, #16]
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x1, #32]
+; CHECK-NOCPA-O3-NEXT:    str q0, [x8, #32]
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %idxprom = sext i32 %pos to i64
+  %arrayidx = getelementptr inbounds [10 x %struct.my_type2], ptr @array2, i64 0, i64 %idxprom
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 dereferenceable(48) %arrayidx, ptr align 8 dereferenceable(48) %val, i64 48, i1 false)
+  ret void
+}
+
+define void @msubpt1(i32 %index, i32 %elem) {
+; CHECK-CPA-O0-LABEL: msubpt1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    // implicit-def: $x8
+; CHECK-CPA-O0-NEXT:    mov w8, w0
+; CHECK-CPA-O0-NEXT:    sxtw x10, w8
+; CHECK-CPA-O0-NEXT:    mov w8, #48 // =0x30
+; CHECK-CPA-O0-NEXT:    mov w9, w8
+; CHECK-CPA-O0-NEXT:    mov w8, #288 // =0x120
+; CHECK-CPA-O0-NEXT:    mov w11, w8
+; CHECK-CPA-O0-NEXT:    adrp x8, array2
+; CHECK-CPA-O0-NEXT:    add x8, x8, :lo12:array2
+; CHECK-CPA-O0-NEXT:    addpt x11, x8, x11
+; CHECK-CPA-O0-NEXT:    msubpt x0, x9, x10, x11
+; CHECK-CPA-O0-NEXT:    mov w9, #48 // =0x30
+; CHECK-CPA-O0-NEXT:    mov w2, w9
+; CHECK-CPA-O0-NEXT:    mov w9, #96 // =0x60
+; CHECK-CPA-O0-NEXT:    // kill: def $x9 killed $w9
+; CHECK-CPA-O0-NEXT:    addpt x1, x8, x9
+; CHECK-CPA-O0-NEXT:    b memcpy
+;
+; CHECK-CPA-O3-LABEL: msubpt1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-CPA-O3-NEXT:    sxtw x9, w0
+; CHECK-CPA-O3-NEXT:    adrp x10, array2+96
+; CHECK-CPA-O3-NEXT:    add x10, x10, :lo12:array2+96
+; CHECK-CPA-O3-NEXT:    mov w8, #48 // =0x30
+; CHECK-CPA-O3-NEXT:    ldr q0, [x10]
+; CHECK-CPA-O3-NEXT:    msubpt x8, x8, x9, x10
+; CHECK-CPA-O3-NEXT:    str q0, [x8, #192]
+; CHECK-CPA-O3-NEXT:    ldr q0, [x10, #16]
+; CHECK-CPA-O3-NEXT:    str q0, [x8, #208]
+; CHECK-CPA-O3-NEXT:    ldr q0, [x10, #32]
+; CHECK-CPA-O3-NEXT:    str q0, [x8, #224]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: msubpt1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    // implicit-def: $x8
+; CHECK-NOCPA-O0-NEXT:    mov w8, w0
+; CHECK-NOCPA-O0-NEXT:    sxtw x8, w8
+; CHECK-NOCPA-O0-NEXT:    mov w9, #48 // =0x30
+; CHECK-NOCPA-O0-NEXT:    // kill: def $x9 killed $w9
+; CHECK-NOCPA-O0-NEXT:    mneg x10, x8, x9
+; CHECK-NOCPA-O0-NEXT:    adrp x8, array2
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, :lo12:array2
+; CHECK-NOCPA-O0-NEXT:    add x9, x8, #288
+; CHECK-NOCPA-O0-NEXT:    add x0, x9, x10
+; CHECK-NOCPA-O0-NEXT:    mov w9, #48 // =0x30
+; CHECK-NOCPA-O0-NEXT:    mov w2, w9
+; CHECK-NOCPA-O0-NEXT:    add x1, x8, #96
+; CHECK-NOCPA-O0-NEXT:    b memcpy
+;
+; CHECK-NOCPA-O3-LABEL: msubpt1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NOCPA-O3-NEXT:    sxtw x8, w0
+; CHECK-NOCPA-O3-NEXT:    mov w9, #48 // =0x30
+; CHECK-NOCPA-O3-NEXT:    mneg x8, x8, x9
+; CHECK-NOCPA-O3-NEXT:    adrp x9, array2+96
+; CHECK-NOCPA-O3-NEXT:    add x9, x9, :lo12:array2+96
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x9]
+; CHECK-NOCPA-O3-NEXT:    add x8, x9, x8
+; CHECK-NOCPA-O3-NEXT:    str q0, [x8, #192]
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x9, #16]
+; CHECK-NOCPA-O3-NEXT:    str q0, [x8, #208]
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x9, #32]
+; CHECK-NOCPA-O3-NEXT:    str q0, [x8, #224]
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %idx.ext = sext i32 %index to i64
+  %idx.neg = sub nsw i64 0, %idx.ext
+  %add.ptr = getelementptr inbounds %struct.my_type2, ptr getelementptr inbounds ([10 x %struct.my_type2], ptr @array2, i64 0, i64 6), i64 %idx.neg
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 dereferenceable(48) %add.ptr, ptr align 8 dereferenceable(48) getelementptr inbounds ([10 x %struct.my_type2], ptr @array2, i64 0, i64 2), i64 48, i1 false), !tbaa.struct !6
+  ret void
+}
+
+define void @subpt1(i32 %index, i32 %elem) {
+; CHECK-CPA-O0-LABEL: subpt1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    mov w8, #-16 // =0xfffffff0
+; CHECK-CPA-O0-NEXT:    smull x9, w0, w8
+; CHECK-CPA-O0-NEXT:    adrp x8, array
+; CHECK-CPA-O0-NEXT:    add x8, x8, :lo12:array
+; CHECK-CPA-O0-NEXT:    ldr q0, [x8, #32]
+; CHECK-CPA-O0-NEXT:    mov w10, #96 // =0x60
+; CHECK-CPA-O0-NEXT:    // kill: def $x10 killed $w10
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x10
+; CHECK-CPA-O0-NEXT:    str q0, [x8, x9, lsl #4]
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subpt1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    mov w8, #-16 // =0xfffffff0
+; CHECK-CPA-O3-NEXT:    adrp x9, array+32
+; CHECK-CPA-O3-NEXT:    add x9, x9, :lo12:array+32
+; CHECK-CPA-O3-NEXT:    smull x8, w0, w8
+; CHECK-CPA-O3-NEXT:    ldr q0, [x9]
+; CHECK-CPA-O3-NEXT:    addpt x8, x9, x8, lsl #4
+; CHECK-CPA-O3-NEXT:    str q0, [x8, #64]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subpt1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    mov w8, #-16 // =0xfffffff0
+; CHECK-NOCPA-O0-NEXT:    smull x9, w0, w8
+; CHECK-NOCPA-O0-NEXT:    adrp x8, array
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, :lo12:array
+; CHECK-NOCPA-O0-NEXT:    ldr q0, [x8, #32]
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, #96
+; CHECK-NOCPA-O0-NEXT:    str q0, [x8, x9, lsl #4]
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subpt1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    mov w8, #-16 // =0xfffffff0
+; CHECK-NOCPA-O3-NEXT:    adrp x9, array+32
+; CHECK-NOCPA-O3-NEXT:    add x9, x9, :lo12:array+32
+; CHECK-NOCPA-O3-NEXT:    smull x8, w0, w8
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x9]
+; CHECK-NOCPA-O3-NEXT:    add x8, x9, x8, lsl #4
+; CHECK-NOCPA-O3-NEXT:    str q0, [x8, #64]
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %conv = sext i32 %index to i64
+  %mul.neg = mul nsw i64 %conv, -16
+  %add.ptr = getelementptr inbounds %struct.my_type, ptr getelementptr inbounds ([10 x %struct.my_type], ptr @array, i64 0, i64 6), i64 %mul.neg
+  tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %add.ptr, ptr noundef nonnull align 8 dereferenceable(16) getelementptr inbounds ([10 x %struct.my_type], ptr @array, i64 0, i64 2), i64 16, i1 false), !tbaa.struct !6
+  ret void
+}
+
+define void @subpt2(i32 %index, i32 %elem) {
+; CHECK-CPA-O0-LABEL: subpt2:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    mov x8, xzr
+; CHECK-CPA-O0-NEXT:    subs x9, x8, w0, sxtw
+; CHECK-CPA-O0-NEXT:    adrp x8, array
+; CHECK-CPA-O0-NEXT:    add x8, x8, :lo12:array
+; CHECK-CPA-O0-NEXT:    ldr q0, [x8, #32]
+; CHECK-CPA-O0-NEXT:    mov w10, #96 // =0x60
+; CHECK-CPA-O0-NEXT:    // kill: def $x10 killed $w10
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x10
+; CHECK-CPA-O0-NEXT:    str q0, [x8, x9, lsl #4]
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subpt2:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    mov x8, xzr
+; CHECK-CPA-O3-NEXT:    adrp x9, array+32
+; CHECK-CPA-O3-NEXT:    add x9, x9, :lo12:array+32
+; CHECK-CPA-O3-NEXT:    sub x8, x8, w0, sxtw
+; CHECK-CPA-O3-NEXT:    ldr q0, [x9]
+; CHECK-CPA-O3-NEXT:    addpt x8, x9, x8, lsl #4
+; CHECK-CPA-O3-NEXT:    str q0, [x8, #64]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subpt2:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    mov x8, xzr
+; CHECK-NOCPA-O0-NEXT:    subs x9, x8, w0, sxtw
+; CHECK-NOCPA-O0-NEXT:    adrp x8, array
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, :lo12:array
+; CHECK-NOCPA-O0-NEXT:    ldr q0, [x8, #32]
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, #96
+; CHECK-NOCPA-O0-NEXT:    str q0, [x8, x9, lsl #4]
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subpt2:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    mov x8, xzr
+; CHECK-NOCPA-O3-NEXT:    adrp x9, array+32
+; CHECK-NOCPA-O3-NEXT:    add x9, x9, :lo12:array+32
+; CHECK-NOCPA-O3-NEXT:    sub x8, x8, w0, sxtw
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x9]
+; CHECK-NOCPA-O3-NEXT:    add x8, x9, x8, lsl #4
+; CHECK-NOCPA-O3-NEXT:    str q0, [x8, #64]
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %idx.ext = sext i32 %index to i64
+  %idx.neg = sub nsw i64 0, %idx.ext
+  %add.ptr = getelementptr inbounds %struct.my_type, ptr getelementptr inbounds ([10 x %struct.my_type], ptr @array, i64 0, i64 6), i64 %idx.neg
+  tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %add.ptr, ptr noundef nonnull align 8 dereferenceable(16) getelementptr inbounds ([10 x %struct.my_type], ptr @array, i64 0, i64 2), i64 16, i1 false), !tbaa.struct !11
+  ret void
+}
+
+define ptr @subpt3(ptr %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: subpt3:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    mov x8, #-8 // =0xfffffffffffffff8
+; CHECK-CPA-O0-NEXT:    addpt x0, x0, x8
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subpt3:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    mov x8, #-8 // =0xfffffffffffffff8
+; CHECK-CPA-O3-NEXT:    addpt x0, x0, x8
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subpt3:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    subs x0, x0, #8
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subpt3:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub x0, x0, #8
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = getelementptr inbounds i64, ptr %ptr, i64 -1
+  ret ptr %incdec.ptr.i.i.i
+}
+
+define i64 @subi64(i64 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: subi64:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    subs x0, x0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subi64:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub x0, x0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subi64:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    subs x0, x0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subi64:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub x0, x0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i64 %ptr, -1
+  ret i64 %incdec.ptr.i.i.i
+}
+
+define i32 @subi32(i32 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: subi32:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    subs w0, w0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subi32:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub w0, w0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subi32:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    subs w0, w0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subi32:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub w0, w0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i32 %ptr, -1
+  ret i32 %incdec.ptr.i.i.i
+}
+
+define i16 @subi16(i16 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: subi16:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    subs w0, w0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subi16:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub w0, w0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subi16:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    subs w0, w0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subi16:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub w0, w0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i16 %ptr, -1
+  ret i16 %incdec.ptr.i.i.i
+}
+
+define i64 @addi64(i64 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: addi64:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    add x0, x0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: addi64:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    add x0, x0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: addi64:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    add x0, x0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: addi64:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    add x0, x0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i64 %ptr, 1
+  ret i64 %incdec.ptr.i.i.i
+}
+
+define i32 @addi32(i32 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: addi32:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    add w0, w0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: addi32:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    add w0, w0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: addi32:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    add w0, w0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: addi32:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    add w0, w0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i32 %ptr, 1
+  ret i32 %incdec.ptr.i.i.i
+}
+
+define i16 @addi16(i16 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: addi16:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    add w0, w0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: addi16:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    add w0, w0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: addi16:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    add w0, w0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: addi16:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    add w0, w0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i16 %ptr, 1
+  ret i16 %incdec.ptr.i.i.i
+}
+
+define i64 @arith1(i64 noundef %0, i64 noundef %1, i64 noundef %2) {
+; CHECK-CPA-O0-LABEL: arith1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    sub sp, sp, #32
+; CHECK-CPA-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-CPA-O0-NEXT:    str x0, [sp, #24]
+; CHECK-CPA-O0-NEXT:    str x1, [sp, #16]
+; CHECK-CPA-O0-NEXT:    str x2, [sp, #8]
+; CHECK-CPA-O0-NEXT:    ldr x8, [sp, #24]
+; CHECK-CPA-O0-NEXT:    ldr x9, [sp, #16]
+; CHECK-CPA-O0-NEXT:    ldr x10, [sp, #8]
+; CHECK-CPA-O0-NEXT:    mul x9, x9, x10
+; CHECK-CPA-O0-NEXT:    add x0, x8, x9
+; CHECK-CPA-O0-NEXT:    add sp, sp, #32
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: arith1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub sp, sp, #32
+; CHECK-CPA-O3-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-CPA-O3-NEXT:    stp x2, x1, [sp, #8]
+; CHECK-CPA-O3-NEXT:    str x0, [sp, #24]
+; CHECK-CPA-O3-NEXT:    madd x0, x1, x2, x0
+; CHECK-CPA-O3-NEXT:    add sp, sp, #32
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: arith1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOCPA-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOCPA-O0-NEXT:    str x0, [sp, #24]
+; CHECK-NOCPA-O0-NEXT:    str x1, [sp, #16]
+; CHECK-NOCPA-O0-NEXT:    str x2, [sp, #8]
+; CHECK-NOCPA-O0-NEXT:    ldr x8, [sp, #24]
+; CHECK-NOCPA-O0-NEXT:    ldr x9, [sp, #16]
+; CHECK-NOCPA-O0-NEXT:    ldr x10, [sp, #8]
+; CHECK-NOCPA-O0-NEXT:    mul x9, x9, x10
+; CHECK-NOCPA-O0-NEXT:    add x0, x8, x9
+; CHECK-NOCPA-O0-NEXT:    add sp, sp, #32
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: arith1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub sp, sp, #32
+; CHECK-NOCPA-O3-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOCPA-O3-NEXT:    stp x2, x1, [sp, #8]
+; CHECK-NOCPA-O3-NEXT:    str x0, [sp, #24]
+; CHECK-NOCPA-O3-NEXT:    madd x0, x1, x2, x0
+; CHECK-NOCPA-O3-NEXT:    add sp, sp, #32
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %4 = alloca i64, align 8
+  %5 = alloca i64, align 8
+  %6 = alloca i64, align 8
+  store i64 %0, ptr %4, align 8
+  store i64 %1, ptr %5, align 8
+  store i64 %2, ptr %6, align 8
+  %7 = load i64, ptr %4, align 8
+  %8 = load i64, ptr %5, align 8
+  %9 = load i64, ptr %6, align 8
+  %10 = mul nsw i64 %8, %9
+  %11 = add nsw i64 %7, %10
+  ret i64 %11
+}
+
+define i64 @arith2(ptr noundef %0, i64 noundef %1, i64 noundef %2, i32 noundef %3) {
+; CHECK-CPA-O0-LABEL: arith2:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    sub sp, sp, #32
+; CHECK-CPA-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-CPA-O0-NEXT:    str x0, [sp, #24]
+; CHECK-CPA-O0-NEXT:    str x1, [sp, #16]
+; CHECK-CPA-O0-NEXT:    str x2, [sp, #8]
+; CHECK-CPA-O0-NEXT:    str w3, [sp, #4]
+; CHECK-CPA-O0-NEXT:    ldr x10, [sp, #24]
+; CHECK-CPA-O0-NEXT:    ldrsw x8, [sp, #4]
+; CHECK-CPA-O0-NEXT:    mov w9, #48 // =0x30
+; CHECK-CPA-O0-NEXT:    // kill: def $x9 killed $w9
+; CHECK-CPA-O0-NEXT:    maddpt x8, x8, x9, x10
+; CHECK-CPA-O0-NEXT:    ldr x8, [x8, #24]
+; CHECK-CPA-O0-NEXT:    ldr x10, [sp, #16]
+; CHECK-CPA-O0-NEXT:    ldr x9, [sp, #8]
+; CHECK-CPA-O0-NEXT:    mul x10, x10, x9
+; CHECK-CPA-O0-NEXT:    add x8, x8, x10
+; CHECK-CPA-O0-NEXT:    subs x0, x8, x9
+; CHECK-CPA-O0-NEXT:    add sp, sp, #32
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: arith2:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub sp, sp, #32
+; CHECK-CPA-O3-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-CPA-O3-NEXT:    str w3, [sp, #4]
+; CHECK-CPA-O3-NEXT:    mov w10, #48 // =0x30
+; CHECK-CPA-O3-NEXT:    ldrsw x9, [sp, #4]
+; CHECK-CPA-O3-NEXT:    stp x1, x0, [sp, #16]
+; CHECK-CPA-O3-NEXT:    maddpt x8, x9, x10, x0
+; CHECK-CPA-O3-NEXT:    str x2, [sp, #8]
+; CHECK-CPA-O3-NEXT:    ldr x8, [x8, #24]
+; CHECK-CPA-O3-NEXT:    madd x8, x1, x2, x8
+; CHECK-CPA-O3-NEXT:    sub x0, x8, x2
+; CHECK-CPA-O3-NEXT:    add sp, sp, #32
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: arith2:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOCPA-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOCPA-O0-NEXT:    str x0, [sp, #24]
+; CHECK-NOCPA-O0-NEXT:    str x1, [sp, #16]
+; CHECK-NOCPA-O0-NEXT:    str x2, [sp, #8]
+; CHECK-NOCPA-O0-NEXT:    str w3, [sp, #4]
+; CHECK-NOCPA-O0-NEXT:    ldr x8, [sp, #24]
+; CHECK-NOCPA-O0-NEXT:    ldrsw x9, [sp, #4]
+; CHECK-NOCPA-O0-NEXT:    mov w10, #48 // =0x30
+; CHECK-NOCPA-O0-NEXT:    // kill: def $x10 killed $w10
+; CHECK-NOCPA-O0-NEXT:    mul x9, x9, x10
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, x9
+; CHECK-NOCPA-O0-NEXT:    ldr x8, [x8, #24]
+; CHECK-NOCPA-O0-NEXT:    ldr x10, [sp, #16]
+; CHECK-NOCPA-O0-NEXT:    ldr x9, [sp, #8]
+; CHECK-NOCPA-O0-NEXT:    mul x10, x10, x9
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, x10
+; CHECK-NOCPA-O0-NEXT:    subs x0, x8, x9
+; CHECK-NOCPA-O0-NEXT:    add sp, sp, #32
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: arith2:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub sp, sp, #32
+; CHECK-NOCPA-O3-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOCPA-O3-NEXT:    str w3, [sp, #4]
+; CHECK-NOCPA-O3-NEXT:    mov w9, #48 // =0x30
+; CHECK-NOCPA-O3-NEXT:    ldrsw x8, [sp, #4]
+; CHECK-NOCPA-O3-NEXT:    stp x1, x0, [sp, #16]
+; CHECK-NOCPA-O3-NEXT:    str x2, [sp, #8]
+; CHECK-NOCPA-O3-NEXT:    madd x8, x8, x9, x0
+; CHECK-NOCPA-O3-NEXT:    ldr x8, [x8, #24]
+; CHECK-NOCPA-O3-NEXT:    madd x8, x1, x2, x8
+; CHECK-NOCPA-O3-NEXT:    sub x0, x8, x2
+; CHECK-NOCPA-O3-NEXT:    add sp, sp, #32
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %5 = alloca ptr, align 8
+  %6 = alloca i64, align 8
+  %7 = alloca i64, align 8
+  %8 = alloca i32, align 4
+  store ptr %0, ptr %5, align 8
+  store i64 %1, ptr %6, align 8
+  store i64 %2, ptr %7, align 8
+  store i32 %3, ptr %8, align 4
+  %9 = load ptr, ptr %5, align 8
+  %10 = load i32, ptr %8, align 4
+  %11 = sext i32 %10 to i64
+  %12 = getelementptr inbounds %struct.my_type2, ptr %9, i64 %11
+  %13 = getelementptr inbounds %struct.my_type2, ptr %12, i32 0, i32 3
+  %14 = load i64, ptr %13, align 8
+  %15 = load i64, ptr %6, align 8
+  %16 = load i64, ptr %7, align 8
+  %17 = mul nsw i64 %15, %16
+  %18 = add nsw i64 %14, %17
+  %19 = sub nsw i64 %18, %16
+  ret i64 %19
+}
+
+@a = hidden global [2 x [1 x [2 x i8]]] [[1 x [2 x i8]] [[2 x i8] c"\01\01"], [1 x [2 x i8]] [[2 x i8] c"\01\01"]], align 1
+@b = hidden global i16 0, align 2
+@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1
+
+define hidden void @multidim() {
+; CHECK-CPA-O0-LABEL: multidim:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CPA-O0-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-CPA-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-CPA-O0-NEXT:    adrp x8, b
+; CHECK-CPA-O0-NEXT:    ldrh w9, [x8, :lo12:b]
+; CHECK-CPA-O0-NEXT:    mov w10, w9
+; CHECK-CPA-O0-NEXT:    ldrh w8, [x8, :lo12:b]
+; CHECK-CPA-O0-NEXT:    add w9, w8, #1
+; CHECK-CPA-O0-NEXT:    // implicit-def: $x8
+; CHECK-CPA-O0-NEXT:    mov w8, w9
+; CHECK-CPA-O0-NEXT:    sxtw x9, w8
+; CHECK-CPA-O0-NEXT:    mov w8, #2 // =0x2
+; CHECK-CPA-O0-NEXT:    mov w11, w8
+; CHECK-CPA-O0-NEXT:    adrp x8, a
+; CHECK-CPA-O0-NEXT:    add x8, x8, :lo12:a
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x11
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x10, lsl #1
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x9
+; CHECK-CPA-O0-NEXT:    ldrb w8, [x8]
+; CHECK-CPA-O0-NEXT:    cbz w8, .LBB14_2
+; CHECK-CPA-O0-NEXT:    b .LBB14_1
+; CHECK-CPA-O0-NEXT:  .LBB14_1:
+; CHECK-CPA-O0-NEXT:    adrp x0, .L.str
+; CHECK-CPA-O0-NEXT:    add x0, x0, :lo12:.L.str
+; CHECK-CPA-O0-NEXT:    bl printf
+; CHECK-CPA-O0-NEXT:    b .LBB14_2
+; CHECK-CPA-O0-NEXT:  .LBB14_2:
+; CHECK-CPA-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: multidim:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    adrp x8, b
+; CHECK-CPA-O3-NEXT:    adrp x10, a+2
+; CHECK-CPA-O3-NEXT:    add x10, x10, :lo12:a+2
+; CHECK-CPA-O3-NEXT:    ldrh w9, [x8, :lo12:b]
+; CHECK-CPA-O3-NEXT:    ldrh w8, [x8, :lo12:b]
+; CHECK-CPA-O3-NEXT:    addpt x9, x10, x9, lsl #1
+; CHECK-CPA-O3-NEXT:    addpt x8, x9, x8
+; CHECK-CPA-O3-NEXT:    ldrb w8, [x8, #1]
+; CHECK-CPA-O3-NEXT:    cbz w8, .LBB14_2
+; CHECK-CPA-O3-NEXT:  // %bb.1:
+; CHECK-CPA-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CPA-O3-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-CPA-O3-NEXT:    .cfi_offset w30, -16
+; CHECK-CPA-O3-NEXT:    adrp x0, .L.str
+; CHECK-CPA-O3-NEXT:    add x0, x0, :lo12:.L.str
+; CHECK-CPA-O3-NEXT:    bl printf
+; CHECK-CPA-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CPA-O3-NEXT:  .LBB14_2:
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: multidim:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NOCPA-O0-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NOCPA-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-NOCPA-O0-NEXT:    adrp x8, b
+; CHECK-NOCPA-O0-NEXT:    ldrh w9, [x8, :lo12:b]
+; CHECK-NOCPA-O0-NEXT:    mov w10, w9
+; CHECK-NOCPA-O0-NEXT:    ldrh w8, [x8, :lo12:b]
+; CHECK-NOCPA-O0-NEXT:    add w9, w8, #1
+; CHECK-NOCPA-O0-NEXT:    adrp x8, a
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, :lo12:a
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, #2
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, x10, lsl #1
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, w9, sxtw
+; CHECK-NOCPA-O0-NEXT:    ldrb w8, [x8]
+; CHECK-NOCPA-O0-NEXT:    cbz w8, .LBB14_2
+; CHECK-NOCPA-O0-NEXT:    b .LBB14_1
+; CHECK-NOCPA-O0-NEXT:  .LBB14_1:
+; CHECK-NOCPA-O0-NEXT:    adrp x0, .L.str
+; CHECK-NOCPA-O0-NEXT:    add x0, x0, :lo12:.L.str
+; CHECK-NOCPA-O0-NEXT:    bl printf
+; CHECK-NOCPA-O0-NEXT:    b .LBB14_2
+; CHECK-NOCPA-O0-NEXT:  .LBB14_2:
+; CHECK-NOCPA-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: multidim:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    adrp x8, b
+; CHECK-NOCPA-O3-NEXT:    adrp x10, a+2
+; CHECK-NOCPA-O3-NEXT:    add x10, x10, :lo12:a+2
+; CHECK-NOCPA-O3-NEXT:    ldrh w9, [x8, :lo12:b]
+; CHECK-NOCPA-O3-NEXT:    ldrh w8, [x8, :lo12:b]
+; CHECK-NOCPA-O3-NEXT:    add x9, x10, x9, lsl #1
+; CHECK-NOCPA-O3-NEXT:    add x8, x9, x8
+; CHECK-NOCPA-O3-NEXT:    ldrb w8, [x8, #1]
+; CHECK-NOCPA-O3-NEXT:    cbz w8, .LBB14_2
+; CHECK-NOCPA-O3-NEXT:  // %bb.1:
+; CHECK-NOCPA-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NOCPA-O3-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NOCPA-O3-NEXT:    .cfi_offset w30, -16
+; CHECK-NOCPA-O3-NEXT:    adrp x0, .L.str
+; CHECK-NOCPA-O3-NEXT:    add x0, x0, :lo12:.L.str
+; CHECK-NOCPA-O3-NEXT:    bl printf
+; CHECK-NOCPA-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NOCPA-O3-NEXT:  .LBB14_2:
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %1 = load i16, ptr @b, align 2
+  %2 = zext i16 %1 to i64
+  %3 = getelementptr inbounds nuw [1 x [2 x i8]], ptr getelementptr inbounds ([2 x [1 x [2 x i8]]], ptr @a, i64 0, i64 1), i64 0, i64 %2
+  %4 = load i16, ptr @b, align 2
+  %5 = zext i16 %4 to i32
+  %6 = add nsw i32 %5, 1
+  %7 = sext i32 %6 to i64
+  %8 = getelementptr inbounds [2 x i8], ptr %3, i64 0, i64 %7
+  %9 = load i8, ptr %8, align 1
+  %10 = icmp ne i8 %9, 0
+  br i1 %10, label %11, label %13
+
+11:
+  %12 = call i32 (ptr, ...) @printf(ptr noundef @.str)
+  br label %13
+
+13:
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare i32 @printf(ptr noundef, ...) #1
+
+!6 = !{i64 0, i64 8, !7, i64 8, i64 8, !7, i64 16, i64 8, !7, i64 24, i64 8, !7, i64 32, i64 8, !7, i64 40, i64 8, !7}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"long", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C++ TBAA"}
+!11 = !{i64 0, i64 8, !7, i64 8, i64 8, !7}
diff --git a/llvm/test/CodeGen/AArch64/cpa-selectiondag.ll b/llvm/test/CodeGen/AArch64/cpa-selectiondag.ll
new file mode 100644
index 000000000000..69fd6c4de78b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cpa-selectiondag.ll
@@ -0,0 +1,796 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs --mattr=+cpa -aarch64-use-featcpa-codegen=true -O0 -global-isel=0 -fast-isel=0 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-CPA-O0
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs --mattr=+cpa -aarch64-use-featcpa-codegen=true -O3 -global-isel=0 -fast-isel=0 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-CPA-O3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs --mattr=-cpa -O0 -global-isel=0 -fast-isel=0 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-NOCPA-O0
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs --mattr=-cpa -O3 -global-isel=0 -fast-isel=0 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-NOCPA-O3
+
+%struct.my_type = type { i64, i64 }
+%struct.my_type2 = type { i64, i64, i64, i64, i64, i64 }
+
+@array = external dso_local global [10 x %struct.my_type], align 8
+@array2 = external dso_local global [10 x %struct.my_type2], align 8
+
+define void @addpt1(i64 %index, i64 %arg) {
+; CHECK-CPA-O0-LABEL: addpt1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    adrp x8, array
+; CHECK-CPA-O0-NEXT:    add x8, x8, :lo12:array
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x0, lsl #4
+; CHECK-CPA-O0-NEXT:    str x1, [x8, #8]
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: addpt1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    adrp x8, array
+; CHECK-CPA-O3-NEXT:    add x8, x8, :lo12:array
+; CHECK-CPA-O3-NEXT:    addpt x8, x8, x0, lsl #4
+; CHECK-CPA-O3-NEXT:    str x1, [x8, #8]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: addpt1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    adrp x8, array
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, :lo12:array
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, x0, lsl #4
+; CHECK-NOCPA-O0-NEXT:    str x1, [x8, #8]
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: addpt1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    adrp x8, array
+; CHECK-NOCPA-O3-NEXT:    add x8, x8, :lo12:array
+; CHECK-NOCPA-O3-NEXT:    add x8, x8, x0, lsl #4
+; CHECK-NOCPA-O3-NEXT:    str x1, [x8, #8]
+; CHECK-NOCPA-O3-NEXT:    ret
+
+entry:
+  %e2 = getelementptr inbounds %struct.my_type, ptr @array, i64 %index, i32 1
+  store i64 %arg, ptr %e2, align 8
+  ret void
+}
+
+define void @maddpt1(i32 %pos, ptr %val) {
+; CHECK-CPA-O0-LABEL: maddpt1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    // implicit-def: $x8
+; CHECK-CPA-O0-NEXT:    mov w8, w0
+; CHECK-CPA-O0-NEXT:    sxtw x8, w8
+; CHECK-CPA-O0-NEXT:    adrp x10, array2
+; CHECK-CPA-O0-NEXT:    add x10, x10, :lo12:array2
+; CHECK-CPA-O0-NEXT:    mov w9, #48 // =0x30
+; CHECK-CPA-O0-NEXT:    // kill: def $x9 killed $w9
+; CHECK-CPA-O0-NEXT:    maddpt x8, x8, x9, x10
+; CHECK-CPA-O0-NEXT:    ldr q0, [x1]
+; CHECK-CPA-O0-NEXT:    ldr q1, [x1, #16]
+; CHECK-CPA-O0-NEXT:    ldr q2, [x1, #32]
+; CHECK-CPA-O0-NEXT:    str q2, [x8, #32]
+; CHECK-CPA-O0-NEXT:    str q1, [x8, #16]
+; CHECK-CPA-O0-NEXT:    str q0, [x8]
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: maddpt1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-CPA-O3-NEXT:    sxtw x9, w0
+; CHECK-CPA-O3-NEXT:    ldp q1, q0, [x1, #16]
+; CHECK-CPA-O3-NEXT:    mov w8, #48 // =0x30
+; CHECK-CPA-O3-NEXT:    adrp x10, array2
+; CHECK-CPA-O3-NEXT:    add x10, x10, :lo12:array2
+; CHECK-CPA-O3-NEXT:    ldr q2, [x1]
+; CHECK-CPA-O3-NEXT:    maddpt x8, x9, x8, x10
+; CHECK-CPA-O3-NEXT:    stp q1, q0, [x8, #16]
+; CHECK-CPA-O3-NEXT:    str q2, [x8]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: maddpt1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    adrp x9, array2
+; CHECK-NOCPA-O0-NEXT:    add x9, x9, :lo12:array2
+; CHECK-NOCPA-O0-NEXT:    mov w8, #48 // =0x30
+; CHECK-NOCPA-O0-NEXT:    smaddl x8, w0, w8, x9
+; CHECK-NOCPA-O0-NEXT:    ldr q0, [x1]
+; CHECK-NOCPA-O0-NEXT:    ldr q1, [x1, #16]
+; CHECK-NOCPA-O0-NEXT:    ldr q2, [x1, #32]
+; CHECK-NOCPA-O0-NEXT:    str q2, [x8, #32]
+; CHECK-NOCPA-O0-NEXT:    str q1, [x8, #16]
+; CHECK-NOCPA-O0-NEXT:    str q0, [x8]
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: maddpt1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    mov w8, #48 // =0x30
+; CHECK-NOCPA-O3-NEXT:    adrp x9, array2
+; CHECK-NOCPA-O3-NEXT:    add x9, x9, :lo12:array2
+; CHECK-NOCPA-O3-NEXT:    smaddl x8, w0, w8, x9
+; CHECK-NOCPA-O3-NEXT:    ldp q1, q0, [x1, #16]
+; CHECK-NOCPA-O3-NEXT:    ldr q2, [x1]
+; CHECK-NOCPA-O3-NEXT:    stp q1, q0, [x8, #16]
+; CHECK-NOCPA-O3-NEXT:    str q2, [x8]
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %idxprom = sext i32 %pos to i64
+  %arrayidx = getelementptr inbounds [10 x %struct.my_type2], ptr @array2, i64 0, i64 %idxprom
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 dereferenceable(48) %arrayidx, ptr align 8 dereferenceable(48) %val, i64 48, i1 false)
+  ret void
+}
+
+define void @msubpt1(i32 %index, i32 %elem) {
+; CHECK-CPA-O0-LABEL: msubpt1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    // implicit-def: $x8
+; CHECK-CPA-O0-NEXT:    mov w8, w0
+; CHECK-CPA-O0-NEXT:    sxtw x9, w8
+; CHECK-CPA-O0-NEXT:    mov x8, xzr
+; CHECK-CPA-O0-NEXT:    subs x8, x8, x9
+; CHECK-CPA-O0-NEXT:    lsl x8, x8, #1
+; CHECK-CPA-O0-NEXT:    subs x10, x8, x9
+; CHECK-CPA-O0-NEXT:    adrp x9, array2
+; CHECK-CPA-O0-NEXT:    add x9, x9, :lo12:array2
+; CHECK-CPA-O0-NEXT:    mov w8, #288 // =0x120
+; CHECK-CPA-O0-NEXT:    // kill: def $x8 killed $w8
+; CHECK-CPA-O0-NEXT:    addpt x8, x9, x8
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x10, lsl #4
+; CHECK-CPA-O0-NEXT:    mov w10, #96 // =0x60
+; CHECK-CPA-O0-NEXT:    // kill: def $x10 killed $w10
+; CHECK-CPA-O0-NEXT:    addpt x10, x9, x10
+; CHECK-CPA-O0-NEXT:    ldr q1, [x10, #16]
+; CHECK-CPA-O0-NEXT:    ldr q2, [x10, #32]
+; CHECK-CPA-O0-NEXT:    ldr q0, [x9, #96]
+; CHECK-CPA-O0-NEXT:    str q2, [x8, #32]
+; CHECK-CPA-O0-NEXT:    str q1, [x8, #16]
+; CHECK-CPA-O0-NEXT:    str q0, [x8]
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: msubpt1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-CPA-O3-NEXT:    sxtw x9, w0
+; CHECK-CPA-O3-NEXT:    adrp x8, array2
+; CHECK-CPA-O3-NEXT:    add x8, x8, :lo12:array2
+; CHECK-CPA-O3-NEXT:    mov w11, #96 // =0x60
+; CHECK-CPA-O3-NEXT:    mov w12, #288 // =0x120
+; CHECK-CPA-O3-NEXT:    ldr q2, [x8, #96]
+; CHECK-CPA-O3-NEXT:    neg x10, x9
+; CHECK-CPA-O3-NEXT:    addpt x11, x8, x11
+; CHECK-CPA-O3-NEXT:    lsl x10, x10, #1
+; CHECK-CPA-O3-NEXT:    ldp q1, q0, [x11, #16]
+; CHECK-CPA-O3-NEXT:    sub x9, x10, x9
+; CHECK-CPA-O3-NEXT:    addpt x10, x8, x12
+; CHECK-CPA-O3-NEXT:    addpt x9, x10, x9, lsl #4
+; CHECK-CPA-O3-NEXT:    stp q1, q0, [x9, #16]
+; CHECK-CPA-O3-NEXT:    str q2, [x9]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: msubpt1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    // implicit-def: $x8
+; CHECK-NOCPA-O0-NEXT:    mov w8, w0
+; CHECK-NOCPA-O0-NEXT:    sxtw x8, w8
+; CHECK-NOCPA-O0-NEXT:    mov w9, #48 // =0x30
+; CHECK-NOCPA-O0-NEXT:    // kill: def $x9 killed $w9
+; CHECK-NOCPA-O0-NEXT:    mneg x8, x8, x9
+; CHECK-NOCPA-O0-NEXT:    adrp x9, array2
+; CHECK-NOCPA-O0-NEXT:    add x9, x9, :lo12:array2
+; CHECK-NOCPA-O0-NEXT:    add x8, x9, x8
+; CHECK-NOCPA-O0-NEXT:    ldr q0, [x9, #96]
+; CHECK-NOCPA-O0-NEXT:    ldr q1, [x9, #112]
+; CHECK-NOCPA-O0-NEXT:    ldr q2, [x9, #128]
+; CHECK-NOCPA-O0-NEXT:    str q2, [x8, #320]
+; CHECK-NOCPA-O0-NEXT:    str q1, [x8, #304]
+; CHECK-NOCPA-O0-NEXT:    str q0, [x8, #288]
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: msubpt1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NOCPA-O3-NEXT:    sxtw x8, w0
+; CHECK-NOCPA-O3-NEXT:    mov w9, #48 // =0x30
+; CHECK-NOCPA-O3-NEXT:    mneg x8, x8, x9
+; CHECK-NOCPA-O3-NEXT:    adrp x9, array2
+; CHECK-NOCPA-O3-NEXT:    add x9, x9, :lo12:array2
+; CHECK-NOCPA-O3-NEXT:    ldp q1, q0, [x9, #112]
+; CHECK-NOCPA-O3-NEXT:    ldr q2, [x9, #96]
+; CHECK-NOCPA-O3-NEXT:    add x8, x9, x8
+; CHECK-NOCPA-O3-NEXT:    stp q1, q0, [x8, #304]
+; CHECK-NOCPA-O3-NEXT:    str q2, [x8, #288]
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %idx.ext = sext i32 %index to i64
+  %idx.neg = sub nsw i64 0, %idx.ext
+  %add.ptr = getelementptr inbounds %struct.my_type2, ptr getelementptr inbounds ([10 x %struct.my_type2], ptr @array2, i64 0, i64 6), i64 %idx.neg
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 dereferenceable(48) %add.ptr, ptr align 8 dereferenceable(48) getelementptr inbounds ([10 x %struct.my_type2], ptr @array2, i64 0, i64 2), i64 48, i1 false), !tbaa.struct !6
+  ret void
+}
+
+define void @subpt1(i32 %index, i32 %elem) {
+; CHECK-CPA-O0-LABEL: subpt1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    adrp x9, array
+; CHECK-CPA-O0-NEXT:    add x9, x9, :lo12:array
+; CHECK-CPA-O0-NEXT:    mov w8, #96 // =0x60
+; CHECK-CPA-O0-NEXT:    // kill: def $x8 killed $w8
+; CHECK-CPA-O0-NEXT:    addpt x8, x9, x8
+; CHECK-CPA-O0-NEXT:    // implicit-def: $x10
+; CHECK-CPA-O0-NEXT:    mov w10, w0
+; CHECK-CPA-O0-NEXT:    sbfiz x10, x10, #8, #32
+; CHECK-CPA-O0-NEXT:    subpt x8, x8, x10
+; CHECK-CPA-O0-NEXT:    ldr q0, [x9, #32]
+; CHECK-CPA-O0-NEXT:    str q0, [x8]
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subpt1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-CPA-O3-NEXT:    adrp x8, array
+; CHECK-CPA-O3-NEXT:    add x8, x8, :lo12:array
+; CHECK-CPA-O3-NEXT:    mov w9, #96 // =0x60
+; CHECK-CPA-O3-NEXT:    sbfiz x10, x0, #8, #32
+; CHECK-CPA-O3-NEXT:    addpt x9, x8, x9
+; CHECK-CPA-O3-NEXT:    ldr q0, [x8, #32]
+; CHECK-CPA-O3-NEXT:    subpt x8, x9, x10
+; CHECK-CPA-O3-NEXT:    str q0, [x8]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subpt1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    // implicit-def: $x8
+; CHECK-NOCPA-O0-NEXT:    mov w8, w0
+; CHECK-NOCPA-O0-NEXT:    sxtw x8, w8
+; CHECK-NOCPA-O0-NEXT:    adrp x9, array
+; CHECK-NOCPA-O0-NEXT:    add x9, x9, :lo12:array
+; CHECK-NOCPA-O0-NEXT:    subs x8, x9, x8, lsl #8
+; CHECK-NOCPA-O0-NEXT:    ldr q0, [x9, #32]
+; CHECK-NOCPA-O0-NEXT:    str q0, [x8, #96]
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subpt1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NOCPA-O3-NEXT:    sxtw x9, w0
+; CHECK-NOCPA-O3-NEXT:    adrp x8, array
+; CHECK-NOCPA-O3-NEXT:    add x8, x8, :lo12:array
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x8, #32]
+; CHECK-NOCPA-O3-NEXT:    sub x9, x8, x9, lsl #8
+; CHECK-NOCPA-O3-NEXT:    str q0, [x9, #96]
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %conv = sext i32 %index to i64
+  %mul.neg = mul nsw i64 %conv, -16
+  %add.ptr = getelementptr inbounds %struct.my_type, ptr getelementptr inbounds ([10 x %struct.my_type], ptr @array, i64 0, i64 6), i64 %mul.neg
+  tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %add.ptr, ptr noundef nonnull align 8 dereferenceable(16) getelementptr inbounds ([10 x %struct.my_type], ptr @array, i64 0, i64 2), i64 16, i1 false), !tbaa.struct !6
+  ret void
+}
+
+define void @subpt2(i32 %index, i32 %elem) {
+; CHECK-CPA-O0-LABEL: subpt2:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    mov x8, xzr
+; CHECK-CPA-O0-NEXT:    subs x10, x8, w0, sxtw
+; CHECK-CPA-O0-NEXT:    adrp x9, array
+; CHECK-CPA-O0-NEXT:    add x9, x9, :lo12:array
+; CHECK-CPA-O0-NEXT:    mov w8, #96 // =0x60
+; CHECK-CPA-O0-NEXT:    // kill: def $x8 killed $w8
+; CHECK-CPA-O0-NEXT:    addpt x8, x9, x8
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x10, lsl #4
+; CHECK-CPA-O0-NEXT:    ldr q0, [x9, #32]
+; CHECK-CPA-O0-NEXT:    str q0, [x8]
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subpt2:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    mov x8, xzr
+; CHECK-CPA-O3-NEXT:    mov w9, #96 // =0x60
+; CHECK-CPA-O3-NEXT:    adrp x10, array
+; CHECK-CPA-O3-NEXT:    add x10, x10, :lo12:array
+; CHECK-CPA-O3-NEXT:    sub x8, x8, w0, sxtw
+; CHECK-CPA-O3-NEXT:    addpt x9, x10, x9
+; CHECK-CPA-O3-NEXT:    ldr q0, [x10, #32]
+; CHECK-CPA-O3-NEXT:    addpt x8, x9, x8, lsl #4
+; CHECK-CPA-O3-NEXT:    str q0, [x8]
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subpt2:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    adrp x9, array
+; CHECK-NOCPA-O0-NEXT:    add x9, x9, :lo12:array
+; CHECK-NOCPA-O0-NEXT:    subs x8, x9, w0, sxtw #4
+; CHECK-NOCPA-O0-NEXT:    ldr q0, [x9, #32]
+; CHECK-NOCPA-O0-NEXT:    str q0, [x8, #96]
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subpt2:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    adrp x8, array
+; CHECK-NOCPA-O3-NEXT:    add x8, x8, :lo12:array
+; CHECK-NOCPA-O3-NEXT:    sub x9, x8, w0, sxtw #4
+; CHECK-NOCPA-O3-NEXT:    ldr q0, [x8, #32]
+; CHECK-NOCPA-O3-NEXT:    str q0, [x9, #96]
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %idx.ext = sext i32 %index to i64
+  %idx.neg = sub nsw i64 0, %idx.ext
+  %add.ptr = getelementptr inbounds %struct.my_type, ptr getelementptr inbounds ([10 x %struct.my_type], ptr @array, i64 0, i64 6), i64 %idx.neg
+  tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %add.ptr, ptr noundef nonnull align 8 dereferenceable(16) getelementptr inbounds ([10 x %struct.my_type], ptr @array, i64 0, i64 2), i64 16, i1 false), !tbaa.struct !11
+  ret void
+}
+
+define ptr @subpt3(ptr %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: subpt3:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    mov x8, #-8 // =0xfffffffffffffff8
+; CHECK-CPA-O0-NEXT:    addpt x0, x0, x8
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subpt3:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    mov x8, #-8 // =0xfffffffffffffff8
+; CHECK-CPA-O3-NEXT:    addpt x0, x0, x8
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subpt3:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    subs x0, x0, #8
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subpt3:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub x0, x0, #8
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = getelementptr inbounds i64, ptr %ptr, i64 -1
+  ret ptr %incdec.ptr.i.i.i
+}
+
+define i64 @subi64(i64 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: subi64:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    subs x0, x0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subi64:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub x0, x0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subi64:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    subs x0, x0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subi64:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub x0, x0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i64 %ptr, -1
+  ret i64 %incdec.ptr.i.i.i
+}
+
+define i32 @subi32(i32 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: subi32:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    subs w0, w0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subi32:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub w0, w0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subi32:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    subs w0, w0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subi32:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub w0, w0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i32 %ptr, -1
+  ret i32 %incdec.ptr.i.i.i
+}
+
+define i16 @subi16(i16 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: subi16:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    subs w0, w0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: subi16:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub w0, w0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: subi16:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    subs w0, w0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: subi16:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub w0, w0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i16 %ptr, -1
+  ret i16 %incdec.ptr.i.i.i
+}
+
+define i64 @addi64(i64 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: addi64:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    add x0, x0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: addi64:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    add x0, x0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: addi64:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    add x0, x0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: addi64:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    add x0, x0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i64 %ptr, 1
+  ret i64 %incdec.ptr.i.i.i
+}
+
+define i32 @addi32(i32 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: addi32:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    add w0, w0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: addi32:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    add w0, w0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: addi32:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    add w0, w0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: addi32:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    add w0, w0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i32 %ptr, 1
+  ret i32 %incdec.ptr.i.i.i
+}
+
+define i16 @addi16(i16 %ptr, i32 %index) {
+; CHECK-CPA-O0-LABEL: addi16:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    add w0, w0, #1
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: addi16:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    add w0, w0, #1
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: addi16:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    add w0, w0, #1
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: addi16:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    add w0, w0, #1
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %incdec.ptr.i.i.i = add i16 %ptr, 1
+  ret i16 %incdec.ptr.i.i.i
+}
+
+define i64 @arith1(i64 noundef %0, i64 noundef %1, i64 noundef %2) {
+; CHECK-CPA-O0-LABEL: arith1:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    sub sp, sp, #32
+; CHECK-CPA-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-CPA-O0-NEXT:    str x0, [sp, #24]
+; CHECK-CPA-O0-NEXT:    str x1, [sp, #16]
+; CHECK-CPA-O0-NEXT:    str x2, [sp, #8]
+; CHECK-CPA-O0-NEXT:    ldr x8, [sp, #24]
+; CHECK-CPA-O0-NEXT:    ldr x9, [sp, #16]
+; CHECK-CPA-O0-NEXT:    ldr x10, [sp, #8]
+; CHECK-CPA-O0-NEXT:    mul x9, x9, x10
+; CHECK-CPA-O0-NEXT:    add x0, x8, x9
+; CHECK-CPA-O0-NEXT:    add sp, sp, #32
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: arith1:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub sp, sp, #32
+; CHECK-CPA-O3-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-CPA-O3-NEXT:    madd x8, x1, x2, x0
+; CHECK-CPA-O3-NEXT:    stp x1, x0, [sp, #16]
+; CHECK-CPA-O3-NEXT:    str x2, [sp, #8]
+; CHECK-CPA-O3-NEXT:    mov x0, x8
+; CHECK-CPA-O3-NEXT:    add sp, sp, #32
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: arith1:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOCPA-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOCPA-O0-NEXT:    str x0, [sp, #24]
+; CHECK-NOCPA-O0-NEXT:    str x1, [sp, #16]
+; CHECK-NOCPA-O0-NEXT:    str x2, [sp, #8]
+; CHECK-NOCPA-O0-NEXT:    ldr x8, [sp, #24]
+; CHECK-NOCPA-O0-NEXT:    ldr x9, [sp, #16]
+; CHECK-NOCPA-O0-NEXT:    ldr x10, [sp, #8]
+; CHECK-NOCPA-O0-NEXT:    mul x9, x9, x10
+; CHECK-NOCPA-O0-NEXT:    add x0, x8, x9
+; CHECK-NOCPA-O0-NEXT:    add sp, sp, #32
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: arith1:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub sp, sp, #32
+; CHECK-NOCPA-O3-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOCPA-O3-NEXT:    madd x8, x1, x2, x0
+; CHECK-NOCPA-O3-NEXT:    stp x1, x0, [sp, #16]
+; CHECK-NOCPA-O3-NEXT:    str x2, [sp, #8]
+; CHECK-NOCPA-O3-NEXT:    mov x0, x8
+; CHECK-NOCPA-O3-NEXT:    add sp, sp, #32
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %4 = alloca i64, align 8
+  %5 = alloca i64, align 8
+  %6 = alloca i64, align 8
+  store i64 %0, ptr %4, align 8
+  store i64 %1, ptr %5, align 8
+  store i64 %2, ptr %6, align 8
+  %7 = load i64, ptr %4, align 8
+  %8 = load i64, ptr %5, align 8
+  %9 = load i64, ptr %6, align 8
+  %10 = mul nsw i64 %8, %9
+  %11 = add nsw i64 %7, %10
+  ret i64 %11
+}
+
+define i64 @arith2(ptr noundef %0, i64 noundef %1, i64 noundef %2, i32 noundef %3) {
+; CHECK-CPA-O0-LABEL: arith2:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    sub sp, sp, #32
+; CHECK-CPA-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-CPA-O0-NEXT:    str x0, [sp, #24]
+; CHECK-CPA-O0-NEXT:    str x1, [sp, #16]
+; CHECK-CPA-O0-NEXT:    str x2, [sp, #8]
+; CHECK-CPA-O0-NEXT:    str w3, [sp, #4]
+; CHECK-CPA-O0-NEXT:    ldr x8, [sp, #24]
+; CHECK-CPA-O0-NEXT:    ldrsw x9, [sp, #4]
+; CHECK-CPA-O0-NEXT:    add x9, x9, x9, lsl #1
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x9, lsl #4
+; CHECK-CPA-O0-NEXT:    ldr x8, [x8, #24]
+; CHECK-CPA-O0-NEXT:    ldr x10, [sp, #16]
+; CHECK-CPA-O0-NEXT:    ldr x9, [sp, #8]
+; CHECK-CPA-O0-NEXT:    mul x10, x10, x9
+; CHECK-CPA-O0-NEXT:    add x8, x8, x10
+; CHECK-CPA-O0-NEXT:    subs x0, x8, x9
+; CHECK-CPA-O0-NEXT:    add sp, sp, #32
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: arith2:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    sub sp, sp, #32
+; CHECK-CPA-O3-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-CPA-O3-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-CPA-O3-NEXT:    sxtw x8, w3
+; CHECK-CPA-O3-NEXT:    mov w9, #48 // =0x30
+; CHECK-CPA-O3-NEXT:    stp x1, x0, [sp, #16]
+; CHECK-CPA-O3-NEXT:    str x2, [sp, #8]
+; CHECK-CPA-O3-NEXT:    maddpt x8, x8, x9, x0
+; CHECK-CPA-O3-NEXT:    str w3, [sp, #4]
+; CHECK-CPA-O3-NEXT:    ldr x8, [x8, #24]
+; CHECK-CPA-O3-NEXT:    madd x8, x1, x2, x8
+; CHECK-CPA-O3-NEXT:    sub x0, x8, x2
+; CHECK-CPA-O3-NEXT:    add sp, sp, #32
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: arith2:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    sub sp, sp, #32
+; CHECK-NOCPA-O0-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOCPA-O0-NEXT:    str x0, [sp, #24]
+; CHECK-NOCPA-O0-NEXT:    str x1, [sp, #16]
+; CHECK-NOCPA-O0-NEXT:    str x2, [sp, #8]
+; CHECK-NOCPA-O0-NEXT:    str w3, [sp, #4]
+; CHECK-NOCPA-O0-NEXT:    ldr x10, [sp, #24]
+; CHECK-NOCPA-O0-NEXT:    ldrsw x8, [sp, #4]
+; CHECK-NOCPA-O0-NEXT:    mov w9, #48 // =0x30
+; CHECK-NOCPA-O0-NEXT:    mov w0, w9
+; CHECK-NOCPA-O0-NEXT:    mov w9, w0
+; CHECK-NOCPA-O0-NEXT:    // kill: def $w8 killed $w8 killed $x8
+; CHECK-NOCPA-O0-NEXT:    smaddl x8, w8, w9, x10
+; CHECK-NOCPA-O0-NEXT:    ldr x8, [x8, #24]
+; CHECK-NOCPA-O0-NEXT:    ldr x10, [sp, #16]
+; CHECK-NOCPA-O0-NEXT:    ldr x9, [sp, #8]
+; CHECK-NOCPA-O0-NEXT:    mul x10, x10, x9
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, x10
+; CHECK-NOCPA-O0-NEXT:    subs x0, x8, x9
+; CHECK-NOCPA-O0-NEXT:    add sp, sp, #32
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: arith2:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    sub sp, sp, #32
+; CHECK-NOCPA-O3-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NOCPA-O3-NEXT:    mov w8, #48 // =0x30
+; CHECK-NOCPA-O3-NEXT:    stp x1, x0, [sp, #16]
+; CHECK-NOCPA-O3-NEXT:    smaddl x8, w3, w8, x0
+; CHECK-NOCPA-O3-NEXT:    str x2, [sp, #8]
+; CHECK-NOCPA-O3-NEXT:    str w3, [sp, #4]
+; CHECK-NOCPA-O3-NEXT:    ldr x8, [x8, #24]
+; CHECK-NOCPA-O3-NEXT:    madd x8, x1, x2, x8
+; CHECK-NOCPA-O3-NEXT:    sub x0, x8, x2
+; CHECK-NOCPA-O3-NEXT:    add sp, sp, #32
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %5 = alloca ptr, align 8
+  %6 = alloca i64, align 8
+  %7 = alloca i64, align 8
+  %8 = alloca i32, align 4
+  store ptr %0, ptr %5, align 8
+  store i64 %1, ptr %6, align 8
+  store i64 %2, ptr %7, align 8
+  store i32 %3, ptr %8, align 4
+  %9 = load ptr, ptr %5, align 8
+  %10 = load i32, ptr %8, align 4
+  %11 = sext i32 %10 to i64
+  %12 = getelementptr inbounds %struct.my_type2, ptr %9, i64 %11
+  %13 = getelementptr inbounds %struct.my_type2, ptr %12, i32 0, i32 3
+  %14 = load i64, ptr %13, align 8
+  %15 = load i64, ptr %6, align 8
+  %16 = load i64, ptr %7, align 8
+  %17 = mul nsw i64 %15, %16
+  %18 = add nsw i64 %14, %17
+  %19 = sub nsw i64 %18, %16
+  ret i64 %19
+}
+
+@a = hidden global [2 x [1 x [2 x i8]]] [[1 x [2 x i8]] [[2 x i8] c"\01\01"], [1 x [2 x i8]] [[2 x i8] c"\01\01"]], align 1
+@b = hidden global i16 0, align 2
+@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1
+
+define hidden void @multidim() {
+; CHECK-CPA-O0-LABEL: multidim:
+; CHECK-CPA-O0:       // %bb.0: // %entry
+; CHECK-CPA-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CPA-O0-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-CPA-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-CPA-O0-NEXT:    adrp x8, b
+; CHECK-CPA-O0-NEXT:    ldrh w9, [x8, :lo12:b]
+; CHECK-CPA-O0-NEXT:    mov w8, w9
+; CHECK-CPA-O0-NEXT:    mov w10, w8
+; CHECK-CPA-O0-NEXT:    adrp x8, a
+; CHECK-CPA-O0-NEXT:    add x8, x8, :lo12:a
+; CHECK-CPA-O0-NEXT:    mov w11, #2 // =0x2
+; CHECK-CPA-O0-NEXT:    // kill: def $x11 killed $w11
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x11
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x10, lsl #1
+; CHECK-CPA-O0-NEXT:    add w9, w9, #1
+; CHECK-CPA-O0-NEXT:    mov w9, w9
+; CHECK-CPA-O0-NEXT:    // kill: def $x9 killed $w9
+; CHECK-CPA-O0-NEXT:    addpt x8, x8, x9
+; CHECK-CPA-O0-NEXT:    ldrb w8, [x8]
+; CHECK-CPA-O0-NEXT:    cbz w8, .LBB14_2
+; CHECK-CPA-O0-NEXT:    b .LBB14_1
+; CHECK-CPA-O0-NEXT:  .LBB14_1:
+; CHECK-CPA-O0-NEXT:    adrp x0, .L.str
+; CHECK-CPA-O0-NEXT:    add x0, x0, :lo12:.L.str
+; CHECK-CPA-O0-NEXT:    bl printf
+; CHECK-CPA-O0-NEXT:    b .LBB14_2
+; CHECK-CPA-O0-NEXT:  .LBB14_2:
+; CHECK-CPA-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CPA-O0-NEXT:    ret
+;
+; CHECK-CPA-O3-LABEL: multidim:
+; CHECK-CPA-O3:       // %bb.0: // %entry
+; CHECK-CPA-O3-NEXT:    adrp x8, b
+; CHECK-CPA-O3-NEXT:    mov w9, #2 // =0x2
+; CHECK-CPA-O3-NEXT:    adrp x10, a
+; CHECK-CPA-O3-NEXT:    add x10, x10, :lo12:a
+; CHECK-CPA-O3-NEXT:    ldrh w8, [x8, :lo12:b]
+; CHECK-CPA-O3-NEXT:    addpt x9, x10, x9
+; CHECK-CPA-O3-NEXT:    addpt x9, x9, x8, lsl #1
+; CHECK-CPA-O3-NEXT:    add x8, x8, #1
+; CHECK-CPA-O3-NEXT:    addpt x8, x9, x8
+; CHECK-CPA-O3-NEXT:    ldrb w8, [x8]
+; CHECK-CPA-O3-NEXT:    cbz w8, .LBB14_2
+; CHECK-CPA-O3-NEXT:  // %bb.1:
+; CHECK-CPA-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CPA-O3-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-CPA-O3-NEXT:    .cfi_offset w30, -16
+; CHECK-CPA-O3-NEXT:    adrp x0, .L.str
+; CHECK-CPA-O3-NEXT:    add x0, x0, :lo12:.L.str
+; CHECK-CPA-O3-NEXT:    bl printf
+; CHECK-CPA-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CPA-O3-NEXT:  .LBB14_2:
+; CHECK-CPA-O3-NEXT:    ret
+;
+; CHECK-NOCPA-O0-LABEL: multidim:
+; CHECK-NOCPA-O0:       // %bb.0: // %entry
+; CHECK-NOCPA-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NOCPA-O0-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NOCPA-O0-NEXT:    .cfi_offset w30, -16
+; CHECK-NOCPA-O0-NEXT:    adrp x8, b
+; CHECK-NOCPA-O0-NEXT:    ldrh w9, [x8, :lo12:b]
+; CHECK-NOCPA-O0-NEXT:    adrp x8, a
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, :lo12:a
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, w9, uxtw #1
+; CHECK-NOCPA-O0-NEXT:    add w9, w9, #1
+; CHECK-NOCPA-O0-NEXT:    mov w9, w9
+; CHECK-NOCPA-O0-NEXT:    // kill: def $x9 killed $w9
+; CHECK-NOCPA-O0-NEXT:    add x8, x8, x9
+; CHECK-NOCPA-O0-NEXT:    ldrb w8, [x8, #2]
+; CHECK-NOCPA-O0-NEXT:    cbz w8, .LBB14_2
+; CHECK-NOCPA-O0-NEXT:    b .LBB14_1
+; CHECK-NOCPA-O0-NEXT:  .LBB14_1:
+; CHECK-NOCPA-O0-NEXT:    adrp x0, .L.str
+; CHECK-NOCPA-O0-NEXT:    add x0, x0, :lo12:.L.str
+; CHECK-NOCPA-O0-NEXT:    bl printf
+; CHECK-NOCPA-O0-NEXT:    b .LBB14_2
+; CHECK-NOCPA-O0-NEXT:  .LBB14_2:
+; CHECK-NOCPA-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NOCPA-O0-NEXT:    ret
+;
+; CHECK-NOCPA-O3-LABEL: multidim:
+; CHECK-NOCPA-O3:       // %bb.0: // %entry
+; CHECK-NOCPA-O3-NEXT:    adrp x8, b
+; CHECK-NOCPA-O3-NEXT:    adrp x9, a
+; CHECK-NOCPA-O3-NEXT:    add x9, x9, :lo12:a
+; CHECK-NOCPA-O3-NEXT:    ldrh w8, [x8, :lo12:b]
+; CHECK-NOCPA-O3-NEXT:    add x9, x9, x8, lsl #1
+; CHECK-NOCPA-O3-NEXT:    add x8, x9, x8
+; CHECK-NOCPA-O3-NEXT:    ldrb w8, [x8, #3]
+; CHECK-NOCPA-O3-NEXT:    cbz w8, .LBB14_2
+; CHECK-NOCPA-O3-NEXT:  // %bb.1:
+; CHECK-NOCPA-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NOCPA-O3-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NOCPA-O3-NEXT:    .cfi_offset w30, -16
+; CHECK-NOCPA-O3-NEXT:    adrp x0, .L.str
+; CHECK-NOCPA-O3-NEXT:    add x0, x0, :lo12:.L.str
+; CHECK-NOCPA-O3-NEXT:    bl printf
+; CHECK-NOCPA-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NOCPA-O3-NEXT:  .LBB14_2:
+; CHECK-NOCPA-O3-NEXT:    ret
+entry:
+  %1 = load i16, ptr @b, align 2
+  %2 = zext i16 %1 to i64
+  %3 = getelementptr inbounds nuw [1 x [2 x i8]], ptr getelementptr inbounds ([2 x [1 x [2 x i8]]], ptr @a, i64 0, i64 1), i64 0, i64 %2
+  %4 = load i16, ptr @b, align 2
+  %5 = zext i16 %4 to i32
+  %6 = add nsw i32 %5, 1
+  %7 = sext i32 %6 to i64
+  %8 = getelementptr inbounds [2 x i8], ptr %3, i64 0, i64 %7
+  %9 = load i8, ptr %8, align 1
+  %10 = icmp ne i8 %9, 0
+  br i1 %10, label %11, label %13
+
+11:
+  %12 = call i32 (ptr, ...) @printf(ptr noundef @.str)
+  br label %13
+
+13:
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare i32 @printf(ptr noundef, ...) #1
+
+!6 = !{i64 0, i64 8, !7, i64 8, i64 8, !7, i64 16, i64 8, !7, i64 24, i64 8, !7, i64 32, i64 8, !7, i64 40, i64 8, !7}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"long", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C++ TBAA"}
+!11 = !{i64 0, i64 8, !7, i64 8, i64 8, !7}

From 491b82a5ec1add78d2c93370580a2f1897b6a364 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Wed, 30 Apr 2025 18:25:54 -0700
Subject: [PATCH 1112/1322] ELF: Add branch-to-branch optimization.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When code calls a function which then immediately tail calls another
function there is no need to go via the intermediate function. By
branching directly to the target function we reduce the program's working
set for a slight increase in runtime performance.

Normally it is relatively uncommon to have functions that just tail call
another function, but with LLVM control flow integrity we have jump tables
that replace the function itself as the canonical address. As a result,
when a function address is taken and called directly, for example after
a compiler optimization resolves the indirect call, or if code built
without control flow integrity calls the function, the call will go via
the jump table.

The impact of this optimization was measured using a large internal
Google benchmark. The results were as follows:

CFI enabled:  +0.1% ± 0.05% queries per second
CFI disabled: +0.01% queries per second [not statistically significant]

The optimization is enabled by default at -O2 but may also be enabled
or disabled individually with --{,no-}branch-to-branch.

This optimization is implemented for AArch64 and X86_64 only.

lld's runtime performance (real execution time) after adding this
optimization was measured using firefox-x64 from lld-speed-test [1]
with ldflags "-O2 -S" on an Apple M2 Ultra. The results are as follows:

```
    N           Min           Max        Median           Avg        Stddev
x 512     1.2264546     1.3481076     1.2970261     1.2965788   0.018620888
+ 512     1.2561196     1.3839965     1.3214632     1.3209327   0.019443971
Difference at 95.0% confidence
	0.0243538 +/- 0.00233202
	1.87831% +/- 0.179859%
	(Student's t, pooled s = 0.0190369)
```

[1] https://discourse.llvm.org/t/improving-the-reproducibility-of-linker-benchmarking/86057

Pull Request: https://github.com/llvm/llvm-project/pull/138366
---
 lld/ELF/Arch/AArch64.cpp                |  59 +++++++++++
 lld/ELF/Arch/TargetImpl.h               |  93 +++++++++++++++++
 lld/ELF/Arch/X86_64.cpp                 |  69 ++++++++++++
 lld/ELF/Config.h                        |   1 +
 lld/ELF/Driver.cpp                      |   2 +
 lld/ELF/InputSection.cpp                |   5 +-
 lld/ELF/Options.td                      |   4 +
 lld/ELF/Relocations.cpp                 |   8 +-
 lld/ELF/Target.h                        |   1 +
 lld/docs/ReleaseNotes.rst               |   4 +
 lld/docs/ld.lld.1                       |   9 +-
 lld/test/ELF/aarch64-branch-to-branch.s |  82 +++++++++++++++
 lld/test/ELF/x86-64-branch-to-branch.s  | 133 ++++++++++++++++++++++++
 13 files changed, 464 insertions(+), 6 deletions(-)
 create mode 100644 lld/ELF/Arch/TargetImpl.h
 create mode 100644 lld/test/ELF/aarch64-branch-to-branch.s
 create mode 100644 lld/test/ELF/x86-64-branch-to-branch.s

diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 8a225ed103ee..f00c91b5886f 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -11,6 +11,7 @@
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
+#include "TargetImpl.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Support/Endian.h"
 
@@ -82,6 +83,7 @@ public:
                 uint64_t val) const override;
   RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override;
   void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override;
+  void applyBranchToBranchOpt() const override;
 
 private:
   void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
@@ -974,6 +976,63 @@ void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   }
 }
 
+static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
+                                                        Relocation &r) {
+  // Identify a control transfer relocation for the branch-to-branch
+  // optimization. A "control transfer relocation" means a B or BL
+  // target but it also includes relative vtable relocations for example.
+  //
+  // We require the relocation type to be JUMP26, CALL26 or PLT32. With a
+  // relocation type of PLT32 the value may be assumed to be used for branching
+  // directly to the symbol and the addend is only used to produce the relocated
+  // value (hence the effective addend is always 0). This is because if a PLT is
+  // needed the addend will be added to the address of the PLT, and it doesn't
+  // make sense to branch into the middle of a PLT. For example, relative vtable
+  // relocations use PLT32 and 0 or a positive value as the addend but still are
+  // used to branch to the symbol.
+  //
+  // With JUMP26 or CALL26 the only reasonable interpretation of a non-zero
+  // addend is that we are branching to symbol+addend so that becomes the
+  // effective addend.
+  if (r.type == R_AARCH64_PLT32)
+    return 0;
+  if (r.type == R_AARCH64_JUMP26 || r.type == R_AARCH64_CALL26)
+    return r.addend;
+  return std::nullopt;
+}
+
+static std::pair<Relocation *, uint64_t>
+getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
+  auto *i =
+      std::partition_point(is.relocations.begin(), is.relocations.end(),
+                           [&](Relocation &r) { return r.offset < offset; });
+  if (i != is.relocations.end() && i->offset == offset &&
+      i->type == R_AARCH64_JUMP26) {
+    return {i, i->addend};
+  }
+  return {nullptr, 0};
+}
+
+static void redirectControlTransferRelocations(Relocation &r1,
+                                               const Relocation &r2) {
+  r1.expr = r2.expr;
+  r1.sym = r2.sym;
+  // With PLT32 we must respect the original addend as that affects the value's
+  // interpretation. With the other relocation types the original addend is
+  // irrelevant because it referred to an offset within the original target
+  // section so we overwrite it.
+  if (r1.type == R_AARCH64_PLT32)
+    r1.addend += r2.addend;
+  else
+    r1.addend = r2.addend;
+}
+
+void AArch64::applyBranchToBranchOpt() const {
+  applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
+                             getBranchInfoAtTarget,
+                             redirectControlTransferRelocations);
+}
+
 // AArch64 may use security features in variant PLT sequences. These are:
 // Pointer Authentication (PAC), introduced in armv8.3-a and Branch Target
 // Indicator (BTI) introduced in armv8.5-a. The additional instructions used
diff --git a/lld/ELF/Arch/TargetImpl.h b/lld/ELF/Arch/TargetImpl.h
new file mode 100644
index 000000000000..f1206570d3e3
--- /dev/null
+++ b/lld/ELF/Arch/TargetImpl.h
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_ELF_ARCH_TARGETIMPL_H
+#define LLD_ELF_ARCH_TARGETIMPL_H
+
+#include "InputFiles.h"
+#include "InputSection.h"
+#include "Relocations.h"
+#include "Symbols.h"
+#include "llvm/BinaryFormat/ELF.h"
+
+namespace lld::elf {
+
+// getControlTransferAddend: If this relocation is used for control transfer
+// instructions (e.g. branch, branch-link or call) or code references (e.g.
+// virtual function pointers) and indicates an address-insignificant reference,
+// return the effective addend for the relocation, otherwise return
+// std::nullopt. The effective addend for a relocation is the addend that is
+// used to determine its branch destination.
+//
+// getBranchInfoAtTarget: If a control transfer relocation referring to
+// is+offset directly transfers control to a relocated branch instruction in the
+// specified section, return the relocation for the branch target as well as its
+// effective addend (see above). Otherwise return {nullptr, 0}.
+//
+// redirectControlTransferRelocations: Given r1, a relocation for which
+// getControlTransferAddend() returned a value, and r2, a relocation returned by
+// getBranchInfo(), modify r1 so that it branches directly to the target of r2.
+template <typename GetControlTransferAddend, typename GetBranchInfoAtTarget,
+          typename RedirectControlTransferRelocations>
+inline void applyBranchToBranchOptImpl(
+    Ctx &ctx, GetControlTransferAddend getControlTransferAddend,
+    GetBranchInfoAtTarget getBranchInfoAtTarget,
+    RedirectControlTransferRelocations redirectControlTransferRelocations) {
+  // Needs to run serially because it writes to the relocations array as well as
+  // reading relocations of other sections.
+  for (ELFFileBase *f : ctx.objectFiles) {
+    auto getRelocBranchInfo =
+        [&getBranchInfoAtTarget](
+            Relocation &r,
+            uint64_t addend) -> std::pair<Relocation *, uint64_t> {
+      auto *target = dyn_cast_or_null<Defined>(r.sym);
+      // We don't allow preemptible symbols or ifuncs (may go somewhere else),
+      // absolute symbols (runtime behavior unknown), non-executable or writable
+      // memory (ditto) or non-regular sections (no section data).
+      if (!target || target->isPreemptible || target->isGnuIFunc() ||
+          !target->section ||
+          !(target->section->flags & llvm::ELF::SHF_EXECINSTR) ||
+          (target->section->flags & llvm::ELF::SHF_WRITE) ||
+          target->section->kind() != SectionBase::Regular)
+        return {nullptr, 0};
+      return getBranchInfoAtTarget(*cast<InputSection>(target->section),
+                                   target->value + addend);
+    };
+    for (InputSectionBase *s : f->getSections()) {
+      if (!s)
+        continue;
+      for (Relocation &r : s->relocations) {
+        std::optional<uint64_t> addend =
+            getControlTransferAddend(*cast<InputSection>(s), r);
+        if (!addend)
+          continue;
+        std::pair<Relocation *, uint64_t> targetAndAddend =
+            getRelocBranchInfo(r, *addend);
+        if (!targetAndAddend.first)
+          continue;
+        // Avoid getting stuck in an infinite loop if we encounter a branch
+        // that (possibly indirectly) branches to itself. It is unlikely
+        // that more than 5 iterations will ever be needed in practice.
+        size_t iterations = 5;
+        while (iterations--) {
+          std::pair<Relocation *, uint64_t> nextTargetAndAddend =
+              getRelocBranchInfo(*targetAndAddend.first,
+                                 targetAndAddend.second);
+          if (!nextTargetAndAddend.first)
+            break;
+          targetAndAddend = nextTargetAndAddend;
+        }
+        redirectControlTransferRelocations(r, *targetAndAddend.first);
+      }
+    }
+  }
+}
+
+} // namespace lld::elf
+
+#endif
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index 974da4d96320..b991b6f905b9 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -11,6 +11,7 @@
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
+#include "TargetImpl.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/MathExtras.h"
@@ -49,6 +50,7 @@ public:
   bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
                              InputSection *nextIS) const override;
   bool relaxOnce(int pass) const override;
+  void applyBranchToBranchOpt() const override;
 
 private:
   void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
@@ -1161,6 +1163,73 @@ void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   }
 }
 
+static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
+                                                        Relocation &r) {
+  // Identify a control transfer relocation for the branch-to-branch
+  // optimization. A "control transfer relocation" usually means a CALL or JMP
+  // target but it also includes relative vtable relocations for example.
+  //
+  // We require the relocation type to be PLT32. With a relocation type of PLT32
+  // the value may be assumed to be used for branching directly to the symbol
+  // and the addend is only used to produce the relocated value (hence the
+  // effective addend is always 0). This is because if a PLT is needed the
+  // addend will be added to the address of the PLT, and it doesn't make sense
+  // to branch into the middle of a PLT. For example, relative vtable
+  // relocations use PLT32 and 0 or a positive value as the addend but still are
+  // used to branch to the symbol.
+  //
+  // STT_SECTION symbols are a special case on x86 because the LLVM assembler
+  // uses them for branches to local symbols which are assembled as referring to
+  // the section symbol with the addend equal to the symbol value - 4.
+  if (r.type == R_X86_64_PLT32) {
+    if (r.sym->isSection())
+      return r.addend + 4;
+    return 0;
+  }
+  return std::nullopt;
+}
+
+static std::pair<Relocation *, uint64_t>
+getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
+  auto content = is.contentMaybeDecompress();
+  if (content.size() > offset && content[offset] == 0xe9) { // JMP immediate
+    auto *i = std::partition_point(
+        is.relocations.begin(), is.relocations.end(),
+        [&](Relocation &r) { return r.offset < offset + 1; });
+    // Unlike with getControlTransferAddend() it is valid to accept a PC32
+    // relocation here because we know that this is actually a JMP and not some
+    // other reference, so the interpretation is that we add 4 to the addend and
+    // use that as the effective addend.
+    if (i != is.relocations.end() && i->offset == offset + 1 &&
+        (i->type == R_X86_64_PC32 || i->type == R_X86_64_PLT32)) {
+      return {i, i->addend + 4};
+    }
+  }
+  return {nullptr, 0};
+}
+
+static void redirectControlTransferRelocations(Relocation &r1,
+                                               const Relocation &r2) {
+  // The isSection() check handles the STT_SECTION case described above.
+  // In that case the original addend is irrelevant because it referred to an
+  // offset within the original target section so we overwrite it.
+  //
+  // The +4 is here to compensate for r2.addend which will likely be -4,
+  // but may also be addend-4 in case of a PC32 branch to symbol+addend.
+  if (r1.sym->isSection())
+    r1.addend = r2.addend;
+  else
+    r1.addend += r2.addend + 4;
+  r1.expr = r2.expr;
+  r1.sym = r2.sym;
+}
+
+void X86_64::applyBranchToBranchOpt() const {
+  applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
+                             getBranchInfoAtTarget,
+                             redirectControlTransferRelocations);
+}
+
 // If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
 // entries containing endbr64 instructions. A PLT entry will be split into two
 // parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 2b72d54ba410..88bda41d3648 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -302,6 +302,7 @@ struct Config {
   bool bpFunctionOrderForCompression = false;
   bool bpDataOrderForCompression = false;
   bool bpVerboseSectionOrderer = false;
+  bool branchToBranch = false;
   bool checkSections;
   bool checkDynamicRelocs;
   std::optional<llvm::DebugCompressionType> compressDebugSections;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 7e132a387a04..1e0b5988343a 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1644,6 +1644,8 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) {
   ctx.arg.zWxneeded = hasZOption(args, "wxneeded");
   setUnresolvedSymbolPolicy(ctx, args);
   ctx.arg.power10Stubs = args.getLastArgValue(OPT_power10_stubs_eq) != "no";
+  ctx.arg.branchToBranch = args.hasFlag(
+      OPT_branch_to_branch, OPT_no_branch_to_branch, ctx.arg.optimize >= 2);
 
   if (opt::Arg *arg = args.getLastArg(OPT_eb, OPT_el)) {
     if (arg->getOption().matches(OPT_eb))
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 0ce0f08d0387..f8786265029e 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -430,8 +430,9 @@ InputSectionBase *InputSection::getRelocatedSection() const {
 
 template <class ELFT, class RelTy>
 void InputSection::copyRelocations(Ctx &ctx, uint8_t *buf) {
-  if (ctx.arg.relax && !ctx.arg.relocatable &&
-      (ctx.arg.emachine == EM_RISCV || ctx.arg.emachine == EM_LOONGARCH)) {
+  bool linkerRelax =
+      ctx.arg.relax && is_contained({EM_RISCV, EM_LOONGARCH}, ctx.arg.emachine);
+  if (!ctx.arg.relocatable && (linkerRelax || ctx.arg.branchToBranch)) {
     // On LoongArch and RISC-V, relaxation might change relocations: copy
     // from internal ones that are updated by relaxation.
     InputSectionBase *sec = getRelocatedSection();
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index c795147eb966..d7e331316700 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -59,6 +59,10 @@ def build_id: J<"build-id=">, HelpText<"Generate build ID note">,
   MetaVarName<"[fast,md5,sha1,uuid,0x<hexstring>]">;
 def : F<"build-id">, Alias<build_id>, AliasArgs<["sha1"]>, HelpText<"Alias for --build-id=sha1">;
 
+defm branch_to_branch: BB<"branch-to-branch",
+    "Enable branch-to-branch optimization (default at -O2)",
+    "Disable branch-to-branch optimization (default at -O0 and -O1)">;
+
 defm check_sections: B<"check-sections",
     "Check section addresses for overlaps (default)",
     "Do not check section addresses for overlaps">;
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 6c4209a2b81e..43f19186f098 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1665,9 +1665,10 @@ void RelocationScanner::scan(Relocs<RelTy> rels) {
   }
 
   // Sort relocations by offset for more efficient searching for
-  // R_RISCV_PCREL_HI20 and R_PPC64_ADDR64.
+  // R_RISCV_PCREL_HI20, R_PPC64_ADDR64 and the branch-to-branch optimization.
   if (ctx.arg.emachine == EM_RISCV ||
-      (ctx.arg.emachine == EM_PPC64 && sec->name == ".toc"))
+      (ctx.arg.emachine == EM_PPC64 && sec->name == ".toc") ||
+      ctx.arg.branchToBranch)
     llvm::stable_sort(sec->relocs(),
                       [](const Relocation &lhs, const Relocation &rhs) {
                         return lhs.offset < rhs.offset;
@@ -1958,6 +1959,9 @@ void elf::postScanRelocations(Ctx &ctx) {
   for (ELFFileBase *file : ctx.objectFiles)
     for (Symbol *sym : file->getLocalSymbols())
       fn(*sym);
+
+  if (ctx.arg.branchToBranch)
+    ctx.target->applyBranchToBranchOpt();
 }
 
 static bool mergeCmp(const InputSection *a, const InputSection *b) {
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index fd1e5d33c438..6dd20b2f0cba 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -101,6 +101,7 @@ public:
 
   virtual void applyJumpInstrMod(uint8_t *loc, JumpModType type,
                                  JumpModType val) const {}
+  virtual void applyBranchToBranchOpt() const {}
 
   virtual ~TargetInfo();
 
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 064ed0828c31..dabfc961dd5b 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -62,6 +62,10 @@ ELF Improvements
   on executable sections.
   (`#128883 <https://github.com/llvm/llvm-project/pull/128883>`_)
 
+* For AArch64 and X86_64, added ``--branch-to-branch``, which rewrites branches
+  that point to another branch instruction to instead branch directly to the
+  target of the second instruction. Enabled by default at ``-O2``.
+  
 Breaking changes
 ----------------
 * Executable-only and readable-executable sections are now allowed to be placed
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index cfacdb081a80..7edc522b4f6a 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -93,6 +93,11 @@ Bind default visibility defined STB_GLOBAL function symbols locally for
 .Fl shared.
 .It Fl -be8
 Write a Big Endian ELF File using BE8 format(AArch32 only)
+.It Fl -branch-to-branch
+Enable the branch-to-branch optimizations: a branch whose target is
+another branch instruction is rewritten to point to the latter branch
+target (AArch64 and X86_64 only). Enabled by default at
+.Fl O2 Ns .
 .It Fl -build-id Ns = Ns Ar value
 Generate a build ID note.
 .Ar value
@@ -414,7 +419,7 @@ If not specified,
 .Dv a.out
 is used as a default.
 .It Fl O Ns Ar value
-Optimize output file size.
+Optimize output file.
 .Ar value
 may be:
 .Pp
@@ -424,7 +429,7 @@ Disable string merging.
 .It Cm 1
 Enable string merging.
 .It Cm 2
-Enable string tail merging.
+Enable string tail merging and branch-to-branch optimization.
 .El
 .Pp
 .Fl O Ns Cm 1
diff --git a/lld/test/ELF/aarch64-branch-to-branch.s b/lld/test/ELF/aarch64-branch-to-branch.s
new file mode 100644
index 000000000000..7dc485aef853
--- /dev/null
+++ b/lld/test/ELF/aarch64-branch-to-branch.s
@@ -0,0 +1,82 @@
+# REQUIRES: aarch64
+
+## Test that the branch-to-branch optimization follows the links
+## from f1 -> f2 -> f3 and updates all references to point to f3.
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64-pc-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t --branch-to-branch --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
+# RUN: ld.lld %t.o -o %t -O2 --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
+
+## Test that branch-to-branch is disabled by default.
+
+# RUN: ld.lld %t.o -o %t --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+# RUN: ld.lld %t.o -o %t -O2 --no-branch-to-branch --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+
+## Test that branch-to-branch is disabled for preemptible symbols.
+
+# RUN: ld.lld %t.o -o %t --branch-to-branch -shared --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+
+.section .rodata.vtable,"a"
+.globl vtable
+vtable:
+# B2B: Contents of section .rodata:
+# RELOC: RELOCATION RECORDS FOR [.rodata]:
+# RELOC-NEXT: OFFSET
+# B2B-NEXT: [[VF:[0-9a-f]{8}]]
+# B2B-RELOC-NEXT: R_AARCH64_PLT32 f3
+# NOB2B-RELOC-NEXT: R_AARCH64_PLT32 f1
+.4byte f1@PLT - vtable
+# B2B-SAME: [[VF]]
+# B2B-RELOC-NEXT: R_AARCH64_PLT32 f3+0x4
+# NOB2B-RELOC-NEXT: R_AARCH64_PLT32 f2+0x4
+.4byte f2@PLT - vtable
+# B2B-SAME: [[VF]]
+# RELOC-NEXT: R_AARCH64_PLT32 f3+0x8
+.4byte f3@PLT - vtable
+
+.section .text._start,"ax"
+.globl _start
+# CHECK: <_start>:
+# RELOC: RELOCATION RECORDS FOR [.text]:
+# RELOC-NEXT: OFFSET
+_start:
+# B2B: bl {{.*}} <f3>
+# B2B-RELOC-NEXT: R_AARCH64_CALL26 f3
+# NOB2B: bl {{.*}} <f1{{.*}}>
+# NOB2B-RELOC-NEXT: R_AARCH64_CALL26 f1
+bl f1
+# B2B: b {{.*}} <f3>
+# B2B-RELOC-NEXT: R_AARCH64_JUMP26 f3
+# NOB2B: b {{.*}} <f2{{.*}}>
+# NOB2B-RELOC-NEXT: R_AARCH64_JUMP26 f2
+b f2
+
+.section .text.f1,"ax"
+.globl f1
+f1:
+# B2B-RELOC-NEXT: R_AARCH64_JUMP26 f3
+# NOB2B-RELOC-NEXT: R_AARCH64_JUMP26 f2
+b f2
+
+.section .text.f2,"ax"
+.globl f2
+# CHECK: <f2>:
+f2:
+# CHECK-NEXT: b {{.*}} <f3{{.*}}>
+# RELOC-NEXT: R_AARCH64_JUMP26 f3
+b f3
+
+.section .text.f3,"ax"
+.globl f3
+f3:
+ret
diff --git a/lld/test/ELF/x86-64-branch-to-branch.s b/lld/test/ELF/x86-64-branch-to-branch.s
new file mode 100644
index 000000000000..dabf5be571ec
--- /dev/null
+++ b/lld/test/ELF/x86-64-branch-to-branch.s
@@ -0,0 +1,133 @@
+# REQUIRES: x86
+
+## Test that the branch-to-branch optimization follows the links
+## from f1 -> f2 -> f3 and updates all references to point to f3.
+ 
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t --branch-to-branch --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
+# RUN: ld.lld %t.o -o %t -O2 --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
+
+## Test that branch-to-branch is disabled by default.
+
+# RUN: ld.lld %t.o -o %t --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+# RUN: ld.lld %t.o -o %t -O2 --no-branch-to-branch --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+
+## Test that branch-to-branch is disabled for preemptible symbols.
+
+# RUN: ld.lld %t.o -o %t --branch-to-branch -shared --emit-relocs
+# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
+# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
+
+.section .rodata.vtable,"a"
+.globl vtable
+vtable:
+# B2B: Contents of section .rodata:
+# RELOC: RELOCATION RECORDS FOR [.rodata]:
+# RELOC-NEXT: OFFSET
+# B2B-NEXT: [[VF:[0-9a-f]{8}]]
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f1
+.4byte f1@PLT - vtable
+# B2B-SAME: [[VF]]
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3+0x4
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2+0x4
+.4byte f2@PLT - vtable
+# B2B-SAME: [[VF]]
+# RELOC-NEXT: R_X86_64_PLT32 f3+0x8
+.4byte f3@PLT - vtable
+
+# For .rodata.f6
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+ 
+.section .text._start,"ax"
+.globl _start
+# CHECK: <_start>:
+# RELOC: RELOCATION RECORDS FOR [.text]:
+# RELOC-NEXT: OFFSET
+_start:
+# B2B-NEXT: jmp {{.*}} <f3>
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+# NOB2B-NEXT: jmp {{.*}} <f1{{.*}}>
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f1-0x4
+jmp f1
+# B2B-NEXT: jmp {{.*}} <f3>
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+# NOB2B-NEXT: jmp {{.*}} <f2{{.*}}>
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2-0x4
+jmp f2
+# This will assemble to a relocation pointing to an STT_SECTION for .text.f4
+# with an addend, which looks similar to the relative vtable cases above but
+# requires different handling of the addend so that we don't think this is
+# branching to the `jmp f3` at the start of the target section.
+# CHECK-NEXT: jmp {{.*}} <f4{{.*}}>
+# RELOC-NEXT: R_X86_64_PLT32 .text+0x2e
+jmp f4
+# B2B-NEXT: jmp 0x[[IPLT:[0-9a-f]*]]
+# RELOC-NEXT: R_X86_64_PLT32 f5-0x4
+jmp f5
+# B2B-NEXT: jmp {{.*}} <f6>
+# RELOC-NEXT: R_X86_64_PLT32 f6-0x4
+jmp f6
+# B2B-NEXT: jmp {{.*}} <f7>
+# RELOC-NEXT: R_X86_64_PLT32 f7-0x4
+jmp f7
+
+.section .text.f1,"ax"
+.globl f1
+f1:
+# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2-0x4
+jmp f2
+
+.section .text.f2,"ax"
+.globl f2
+# CHECK: <f2>:
+f2:
+# CHECK-NEXT: jmp {{.*}} <f3{{.*}}>
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+.section .text.f3,"ax"
+.globl f3
+f3:
+# Test that a self-branch doesn't trigger an infinite loop.
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+.section .text.f4,"ax"
+jmp f3
+f4:
+ret
+
+.section .text.f5,"ax"
+.type f5, @gnu_indirect_function
+.globl f5
+f5:
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+.section .rodata.f6,"a"
+.globl f6
+f6:
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+# RELOC: RELOCATION RECORDS FOR [.wtext.f7]:
+# RELOC-NEXT: OFFSET
+
+.section .wtext.f7,"awx"
+.globl f7
+f7:
+# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
+jmp f3
+
+# B2B: <.iplt>:
+# B2B-NEXT: [[IPLT]]:

From 6110dead894bec37d6373eda4cba2d5dc426f824 Mon Sep 17 00:00:00 2001
From: Michael Spencer <bigcheesegs@gmail.com>
Date: Fri, 20 Jun 2025 13:28:05 -0700
Subject: [PATCH 1113/1322] [clang][scan-deps] Add option to disable caching
 stat failures (#144000)

While the source code isn't supposed to change during a build, in some
environments it does. This adds an option that disables caching of stat
failures, meaning that source files can be added to the build during
scanning.

This adds a `-no-cache-negative-stats` option to clang-scan-deps to
enable this behavior. There are no tests for clang-scan-deps as there's
no reliable way to do so from it. A unit test has been added that
modifies the filesystem between scans to test it.
---
 .../DependencyScanningFilesystem.h            | 24 ++++-----
 .../DependencyScanningService.h               |  6 ++-
 .../DependencyScanningFilesystem.cpp          | 40 +++++++++++---
 .../DependencyScanningService.cpp             |  3 +-
 .../DependencyScanningWorker.cpp              |  3 +-
 clang/tools/clang-scan-deps/ClangScanDeps.cpp |  8 ++-
 clang/tools/clang-scan-deps/Opts.td           |  1 +
 .../DependencyScannerTest.cpp                 | 52 +++++++++++++++++++
 .../DependencyScanningFilesystemTest.cpp      | 37 +++++++------
 9 files changed, 132 insertions(+), 42 deletions(-)

diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
index a20a89a4c2b7..da83220babea 100644
--- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
+++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
@@ -23,6 +23,8 @@ namespace clang {
 namespace tooling {
 namespace dependencies {
 
+class DependencyScanningService;
+
 using DependencyDirectivesTy =
     SmallVector<dependency_directives_scan::Directive, 20>;
 
@@ -349,7 +351,7 @@ public:
   static const char ID;
 
   DependencyScanningWorkerFilesystem(
-      DependencyScanningFilesystemSharedCache &SharedCache,
+      DependencyScanningService &Service,
       IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS);
 
   llvm::ErrorOr<llvm::vfs::Status> status(const Twine &Path) override;
@@ -435,10 +437,7 @@ private:
   /// Returns entry associated with the unique ID in the shared cache or nullptr
   /// if none is found.
   const CachedFileSystemEntry *
-  findSharedEntryByUID(llvm::vfs::Status Stat) const {
-    return SharedCache.getShardForUID(Stat.getUniqueID())
-        .findEntryByUID(Stat.getUniqueID());
-  }
+  findSharedEntryByUID(llvm::vfs::Status Stat) const;
 
   /// Associates the given entry with the filename in the local cache and
   /// returns it.
@@ -452,20 +451,14 @@ private:
   /// some. Otherwise, constructs new one with the given error code, associates
   /// it with the filename and returns the result.
   const CachedFileSystemEntry &
-  getOrEmplaceSharedEntryForFilename(StringRef Filename, std::error_code EC) {
-    return SharedCache.getShardForFilename(Filename)
-        .getOrEmplaceEntryForFilename(Filename, EC);
-  }
+  getOrEmplaceSharedEntryForFilename(StringRef Filename, std::error_code EC);
 
   /// Returns entry associated with the filename in the shared cache if there is
   /// some. Otherwise, associates the given entry with the filename and returns
   /// it.
   const CachedFileSystemEntry &
   getOrInsertSharedEntryForFilename(StringRef Filename,
-                                    const CachedFileSystemEntry &Entry) {
-    return SharedCache.getShardForFilename(Filename)
-        .getOrInsertEntryForFilename(Filename, Entry);
-  }
+                                    const CachedFileSystemEntry &Entry);
 
   void printImpl(raw_ostream &OS, PrintType Type,
                  unsigned IndentLevel) const override {
@@ -478,8 +471,9 @@ private:
   /// VFS.
   bool shouldBypass(StringRef Path) const;
 
-  /// The global cache shared between worker threads.
-  DependencyScanningFilesystemSharedCache &SharedCache;
+  /// The service associated with this VFS.
+  DependencyScanningService &Service;
+
   /// The local cache is used by the worker thread to cache file system queries
   /// locally instead of querying the global cache every time.
   DependencyScanningFilesystemLocalCache LocalCache;
diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
index 4e97c7bc9f36..ceaf3c2279e7 100644
--- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
+++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h
@@ -87,7 +87,8 @@ public:
       ScanningOptimizations OptimizeArgs = ScanningOptimizations::Default,
       bool EagerLoadModules = false, bool TraceVFS = false,
       std::time_t BuildSessionTimestamp =
-          llvm::sys::toTimeT(std::chrono::system_clock::now()));
+          llvm::sys::toTimeT(std::chrono::system_clock::now()),
+      bool CacheNegativeStats = true);
 
   ScanningMode getMode() const { return Mode; }
 
@@ -99,6 +100,8 @@ public:
 
   bool shouldTraceVFS() const { return TraceVFS; }
 
+  bool shouldCacheNegativeStats() const { return CacheNegativeStats; }
+
   DependencyScanningFilesystemSharedCache &getSharedCache() {
     return SharedCache;
   }
@@ -116,6 +119,7 @@ private:
   const bool EagerLoadModules;
   /// Whether to trace VFS accesses.
   const bool TraceVFS;
+  const bool CacheNegativeStats;
   /// The global file system cache.
   DependencyScanningFilesystemSharedCache SharedCache;
   /// The global module cache entries.
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
index 140833050f4e..2868522f8001 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h"
+#include "clang/Tooling/DependencyScanning/DependencyScanningService.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Threading.h"
 #include <optional>
@@ -232,19 +233,19 @@ bool DependencyScanningWorkerFilesystem::shouldBypass(StringRef Path) const {
 }
 
 DependencyScanningWorkerFilesystem::DependencyScanningWorkerFilesystem(
-    DependencyScanningFilesystemSharedCache &SharedCache,
+    DependencyScanningService &Service,
     IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS)
     : llvm::RTTIExtends<DependencyScanningWorkerFilesystem,
                         llvm::vfs::ProxyFileSystem>(std::move(FS)),
-      SharedCache(SharedCache),
-      WorkingDirForCacheLookup(llvm::errc::invalid_argument) {
+      Service(Service), WorkingDirForCacheLookup(llvm::errc::invalid_argument) {
   updateWorkingDirForCacheLookup();
 }
 
 const CachedFileSystemEntry &
 DependencyScanningWorkerFilesystem::getOrEmplaceSharedEntryForUID(
     TentativeEntry TEntry) {
-  auto &Shard = SharedCache.getShardForUID(TEntry.Status.getUniqueID());
+  auto &Shard =
+      Service.getSharedCache().getShardForUID(TEntry.Status.getUniqueID());
   return Shard.getOrEmplaceEntryForUID(TEntry.Status.getUniqueID(),
                                        std::move(TEntry.Status),
                                        std::move(TEntry.Contents));
@@ -255,18 +256,44 @@ DependencyScanningWorkerFilesystem::findEntryByFilenameWithWriteThrough(
     StringRef Filename) {
   if (const auto *Entry = LocalCache.findEntryByFilename(Filename))
     return Entry;
-  auto &Shard = SharedCache.getShardForFilename(Filename);
+  auto &Shard = Service.getSharedCache().getShardForFilename(Filename);
   if (const auto *Entry = Shard.findEntryByFilename(Filename))
     return &LocalCache.insertEntryForFilename(Filename, *Entry);
   return nullptr;
 }
 
+const CachedFileSystemEntry *
+DependencyScanningWorkerFilesystem::findSharedEntryByUID(
+    llvm::vfs::Status Stat) const {
+  return Service.getSharedCache()
+      .getShardForUID(Stat.getUniqueID())
+      .findEntryByUID(Stat.getUniqueID());
+}
+
+const CachedFileSystemEntry &
+DependencyScanningWorkerFilesystem::getOrEmplaceSharedEntryForFilename(
+    StringRef Filename, std::error_code EC) {
+  return Service.getSharedCache()
+      .getShardForFilename(Filename)
+      .getOrEmplaceEntryForFilename(Filename, EC);
+}
+
+const CachedFileSystemEntry &
+DependencyScanningWorkerFilesystem::getOrInsertSharedEntryForFilename(
+    StringRef Filename, const CachedFileSystemEntry &Entry) {
+  return Service.getSharedCache()
+      .getShardForFilename(Filename)
+      .getOrInsertEntryForFilename(Filename, Entry);
+}
+
 llvm::ErrorOr<const CachedFileSystemEntry &>
 DependencyScanningWorkerFilesystem::computeAndStoreResult(
     StringRef OriginalFilename, StringRef FilenameForLookup) {
   llvm::ErrorOr<llvm::vfs::Status> Stat =
       getUnderlyingFS().status(OriginalFilename);
   if (!Stat) {
+    if (!Service.shouldCacheNegativeStats())
+      return Stat.getError();
     const auto &Entry =
         getOrEmplaceSharedEntryForFilename(FilenameForLookup, Stat.getError());
     return insertLocalEntryForFilename(FilenameForLookup, Entry);
@@ -420,7 +447,8 @@ DependencyScanningWorkerFilesystem::getRealPath(const Twine &Path,
     return HandleCachedRealPath(*RealPath);
 
   // If we have the result in the shared cache, cache it locally.
-  auto &Shard = SharedCache.getShardForFilename(*FilenameForLookup);
+  auto &Shard =
+      Service.getSharedCache().getShardForFilename(*FilenameForLookup);
   if (const auto *ShardRealPath =
           Shard.findRealPathByFilename(*FilenameForLookup)) {
     const auto &RealPath = LocalCache.insertRealPathForFilename(
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp
index 7f40c99f0728..c2f3cdbb02e3 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp
@@ -15,7 +15,8 @@ using namespace dependencies;
 DependencyScanningService::DependencyScanningService(
     ScanningMode Mode, ScanningOutputFormat Format,
     ScanningOptimizations OptimizeArgs, bool EagerLoadModules, bool TraceVFS,
-    std::time_t BuildSessionTimestamp)
+    std::time_t BuildSessionTimestamp, bool CacheNegativeStats)
     : Mode(Mode), Format(Format), OptimizeArgs(OptimizeArgs),
       EagerLoadModules(EagerLoadModules), TraceVFS(TraceVFS),
+      CacheNegativeStats(CacheNegativeStats),
       BuildSessionTimestamp(BuildSessionTimestamp) {}
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
index 9bd85479d981..d9820ca3c584 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
@@ -611,8 +611,7 @@ DependencyScanningWorker::DependencyScanningWorker(
 
   switch (Service.getMode()) {
   case ScanningMode::DependencyDirectivesScan:
-    DepFS =
-        new DependencyScanningWorkerFilesystem(Service.getSharedCache(), FS);
+    DepFS = new DependencyScanningWorkerFilesystem(Service, FS);
     BaseFS = DepFS;
     break;
   case ScanningMode::CanonicalPreprocessing:
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index 921ba7aadd67..ce0770a51d65 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -85,6 +85,7 @@ static ScanningOutputFormat Format = ScanningOutputFormat::Make;
 static ScanningOptimizations OptimizeArgs;
 static std::string ModuleFilesDir;
 static bool EagerLoadModules;
+static bool CacheNegativeStats = true;
 static unsigned NumThreads = 0;
 static std::string CompilationDB;
 static std::optional<std::string> ModuleName;
@@ -191,6 +192,8 @@ static void ParseArgs(int argc, char **argv) {
 
   EagerLoadModules = Args.hasArg(OPT_eager_load_pcm);
 
+  CacheNegativeStats = !Args.hasArg(OPT_no_cache_negative_stats);
+
   if (const llvm::opt::Arg *A = Args.getLastArg(OPT_j)) {
     StringRef S{A->getValue()};
     if (!llvm::to_integer(S, NumThreads, 0)) {
@@ -1080,8 +1083,9 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
     });
   };
 
-  DependencyScanningService Service(ScanMode, Format, OptimizeArgs,
-                                    EagerLoadModules, /*TraceVFS=*/Verbose);
+  DependencyScanningService Service(
+      ScanMode, Format, OptimizeArgs, EagerLoadModules, /*TraceVFS=*/Verbose,
+      llvm::sys::toTimeT(std::chrono::system_clock::now()), CacheNegativeStats);
 
   llvm::Timer T;
   T.startTimer();
diff --git a/clang/tools/clang-scan-deps/Opts.td b/clang/tools/clang-scan-deps/Opts.td
index 9cccbb3aaf0c..582ae60851e1 100644
--- a/clang/tools/clang-scan-deps/Opts.td
+++ b/clang/tools/clang-scan-deps/Opts.td
@@ -22,6 +22,7 @@ defm module_files_dir : Eq<"module-files-dir",
 
 def optimize_args_EQ : CommaJoined<["-", "--"], "optimize-args=">, HelpText<"Which command-line arguments of modules to optimize">;
 def eager_load_pcm : F<"eager-load-pcm", "Load PCM files eagerly (instead of lazily on import)">;
+def no_cache_negative_stats : F<"no-cache-negative-stats", "Don't cache stat failures">;
 
 def j : Arg<"j", "Number of worker threads to use (default: use all concurrent threads)">;
 
diff --git a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp
index 683d9070b1dc..d194b2877ad8 100644
--- a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp
+++ b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp
@@ -384,3 +384,55 @@ TEST(DependencyScanner, ScanDepsWithDiagConsumer) {
     EXPECT_TRUE(DiagConsumer.Finished);
   }
 }
+
+TEST(DependencyScanner, NoNegativeCache) {
+  StringRef CWD = "/root";
+
+  auto VFS = new llvm::vfs::InMemoryFileSystem();
+  VFS->setCurrentWorkingDirectory(CWD);
+  auto Sept = llvm::sys::path::get_separator();
+  std::string HeaderPath =
+      std::string(llvm::formatv("{0}root{0}header.h", Sept));
+  std::string Test0Path =
+      std::string(llvm::formatv("{0}root{0}test0.cpp", Sept));
+  std::string Test1Path =
+      std::string(llvm::formatv("{0}root{0}test1.cpp", Sept));
+
+  VFS->addFile(Test0Path, 0,
+               llvm::MemoryBuffer::getMemBuffer(
+                   "#if __has_include(\"header.h\")\n#endif"));
+  VFS->addFile(Test1Path, 0,
+               llvm::MemoryBuffer::getMemBuffer("#include \"header.h\""));
+
+  DependencyScanningService Service(
+      ScanningMode::DependencyDirectivesScan, ScanningOutputFormat::Make,
+      ScanningOptimizations::All, false, false,
+      llvm::sys::toTimeT(std::chrono::system_clock::now()), false);
+  DependencyScanningTool ScanTool(Service, VFS);
+
+  std::vector<std::string> CommandLine0 = {"clang",
+                                           "-target",
+                                           "x86_64-apple-macosx10.7",
+                                           "-c",
+                                           "test0.cpp",
+                                           "-o"
+                                           "test0.cpp.o"};
+  std::vector<std::string> CommandLine1 = {"clang",
+                                           "-target",
+                                           "x86_64-apple-macosx10.7",
+                                           "-c",
+                                           "test1.cpp",
+                                           "-o"
+                                           "test1.cpp.o"};
+
+  std::string Result;
+  ASSERT_THAT_ERROR(
+      ScanTool.getDependencyFile(CommandLine0, CWD).moveInto(Result),
+      llvm::Succeeded());
+
+  VFS->addFile(HeaderPath, 0, llvm::MemoryBuffer::getMemBuffer(""));
+
+  ASSERT_THAT_ERROR(
+      ScanTool.getDependencyFile(CommandLine1, CWD).moveInto(Result),
+      llvm::Succeeded());
+}
diff --git a/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp b/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp
index 7420743c97a2..a68ea72d3816 100644
--- a/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp
+++ b/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h"
+#include "clang/Tooling/DependencyScanning/DependencyScanningService.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "gtest/gtest.h"
@@ -19,9 +20,10 @@ TEST(DependencyScanningWorkerFilesystem, CacheStatusFailures) {
   auto InstrumentingFS =
       llvm::makeIntrusiveRefCnt<llvm::vfs::TracingFileSystem>(InMemoryFS);
 
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, InstrumentingFS);
-  DependencyScanningWorkerFilesystem DepFS2(SharedCache, InstrumentingFS);
+  DependencyScanningService Service(ScanningMode::DependencyDirectivesScan,
+                                    ScanningOutputFormat::Make);
+  DependencyScanningWorkerFilesystem DepFS(Service, InstrumentingFS);
+  DependencyScanningWorkerFilesystem DepFS2(Service, InstrumentingFS);
 
   DepFS.status("/foo.c");
   EXPECT_EQ(InstrumentingFS->NumStatusCalls, 1u);
@@ -45,9 +47,10 @@ TEST(DependencyScanningFilesystem, CacheGetRealPath) {
   auto InstrumentingFS =
       llvm::makeIntrusiveRefCnt<llvm::vfs::TracingFileSystem>(InMemoryFS);
 
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, InstrumentingFS);
-  DependencyScanningWorkerFilesystem DepFS2(SharedCache, InstrumentingFS);
+  DependencyScanningService Service(ScanningMode::DependencyDirectivesScan,
+                                    ScanningOutputFormat::Make);
+  DependencyScanningWorkerFilesystem DepFS(Service, InstrumentingFS);
+  DependencyScanningWorkerFilesystem DepFS2(Service, InstrumentingFS);
 
   {
     llvm::SmallString<128> Result;
@@ -80,8 +83,9 @@ TEST(DependencyScanningFilesystem, RealPathAndStatusInvariants) {
   InMemoryFS->addFile("/foo.c", 0, llvm::MemoryBuffer::getMemBuffer(""));
   InMemoryFS->addFile("/bar.c", 0, llvm::MemoryBuffer::getMemBuffer(""));
 
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, InMemoryFS);
+  DependencyScanningService Service(ScanningMode::DependencyDirectivesScan,
+                                    ScanningOutputFormat::Make);
+  DependencyScanningWorkerFilesystem DepFS(Service, InMemoryFS);
 
   // Success.
   {
@@ -133,8 +137,9 @@ TEST(DependencyScanningFilesystem, CacheStatOnExists) {
   InMemoryFS->setCurrentWorkingDirectory("/");
   InMemoryFS->addFile("/foo", 0, llvm::MemoryBuffer::getMemBuffer(""));
   InMemoryFS->addFile("/bar", 0, llvm::MemoryBuffer::getMemBuffer(""));
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, InstrumentingFS);
+  DependencyScanningService Service(ScanningMode::DependencyDirectivesScan,
+                                    ScanningOutputFormat::Make);
+  DependencyScanningWorkerFilesystem DepFS(Service, InstrumentingFS);
 
   DepFS.status("/foo");
   DepFS.status("/foo");
@@ -156,8 +161,9 @@ TEST(DependencyScanningFilesystem, CacheStatFailures) {
   auto InstrumentingFS =
       llvm::makeIntrusiveRefCnt<llvm::vfs::TracingFileSystem>(InMemoryFS);
 
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, InstrumentingFS);
+  DependencyScanningService Service(ScanningMode::DependencyDirectivesScan,
+                                    ScanningOutputFormat::Make);
+  DependencyScanningWorkerFilesystem DepFS(Service, InstrumentingFS);
 
   DepFS.status("/dir");
   DepFS.status("/dir");
@@ -183,8 +189,9 @@ TEST(DependencyScanningFilesystem, DiagnoseStaleStatFailures) {
   auto InMemoryFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
   InMemoryFS->setCurrentWorkingDirectory("/");
 
-  DependencyScanningFilesystemSharedCache SharedCache;
-  DependencyScanningWorkerFilesystem DepFS(SharedCache, InMemoryFS);
+  DependencyScanningService Service(ScanningMode::DependencyDirectivesScan,
+                                    ScanningOutputFormat::Make);
+  DependencyScanningWorkerFilesystem DepFS(Service, InMemoryFS);
 
   bool Path1Exists = DepFS.exists("/path1.suffix");
   EXPECT_EQ(Path1Exists, false);
@@ -197,7 +204,7 @@ TEST(DependencyScanningFilesystem, DiagnoseStaleStatFailures) {
   EXPECT_EQ(Path1Exists, false);
 
   std::vector<llvm::StringRef> InvalidPaths =
-      SharedCache.getInvalidNegativeStatCachedPaths(*InMemoryFS);
+      Service.getSharedCache().getInvalidNegativeStatCachedPaths(*InMemoryFS);
 
   EXPECT_EQ(InvalidPaths.size(), 1u);
   ASSERT_STREQ("/path1.suffix", InvalidPaths[0].str().c_str());

From 58f48011b3229b568d3a23a6f4853128712d5f8b Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail@bennani.ma>
Date: Fri, 20 Jun 2025 13:28:21 -0700
Subject: [PATCH 1114/1322] [lldb] Add support for x86_64h to scripted process
 (#145099)

This patch adds support to the haswell sub-architecture (x86_64h) to
scripted processes.

rdar://147208252

Signed-off-by: Med Ismail Bennani <ismail@bennani.ma>
---
 lldb/examples/python/templates/scripted_process.py              | 2 +-
 .../Shell/ScriptInterpreter/Python/Crashlog/Inputs/a.out.ips    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/examples/python/templates/scripted_process.py b/lldb/examples/python/templates/scripted_process.py
index c7d5b28b52c0..b6360b851907 100644
--- a/lldb/examples/python/templates/scripted_process.py
+++ b/lldb/examples/python/templates/scripted_process.py
@@ -352,7 +352,7 @@ class ScriptedThread(metaclass=ABCMeta):
     def get_register_info(self):
         if self.register_info is None:
             self.register_info = dict()
-            if self.originating_process.arch == "x86_64":
+            if "x86_64" in self.originating_process.arch:
                 self.register_info["sets"] = ["General Purpose Registers"]
                 self.register_info["registers"] = INTEL64_GPR
             elif (
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/Inputs/a.out.ips b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/Inputs/a.out.ips
index cc2f16c01614..a11074f13e90 100644
--- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/Inputs/a.out.ips
+++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/Inputs/a.out.ips
@@ -169,7 +169,7 @@
   },
   {
     "source" : "P",
-    "arch" : "x86_64",
+    "arch" : "x86_64h",
     "base" : 140733734899712,
     "size" : 245760,
     "uuid" : "c5caf30b-0617-3b07-88c7-6319cd06f30a",

From b6445ac0c54992993b154875d6afb04eeaa13910 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Fri, 20 Jun 2025 21:37:02 +0100
Subject: [PATCH 1115/1322] [GlobalISel] Create a common
 register_vector_matchinfo (#144306)

Several combiner use MatchInfo that are just SmallVector<Register>. This
creates a common register_vector_matchinfo that they can all use.
---
 llvm/include/llvm/Target/GlobalISel/Combine.td | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index efd88524a159..4a92dc16c1bf 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -211,6 +211,7 @@ def constantfp_matchinfo : GIDefMatchData<"ConstantFP*">;
 def build_fn_matchinfo :
 GIDefMatchData<"std::function<void(MachineIRBuilder &)>">;
 def unsigned_matchinfo: GIDefMatchData<"unsigned">;
+def register_vector_matchinfo : GIDefMatchData<"SmallVector<Register>">;
 
 def copy_prop : GICombineRule<
   (defs root:$d),
@@ -811,9 +812,8 @@ def zext_trunc_fold: GICombineRule <
   (apply [{ Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])
 >;
 
-def not_cmp_fold_matchinfo : GIDefMatchData<"SmallVector<Register, 4>">;
 def not_cmp_fold : GICombineRule<
-  (defs root:$d, not_cmp_fold_matchinfo:$info),
+  (defs root:$d, register_vector_matchinfo:$info),
   (match (wip_match_opcode G_XOR): $d,
   [{ return Helper.matchNotCmp(*${d}, ${info}); }]),
   (apply [{ Helper.applyNotCmp(*${d}, ${info}); }])
@@ -828,9 +828,8 @@ def fneg_fneg_fold: GICombineRule <
 >;
 
 // Fold (unmerge(merge x, y, z)) -> z, y, z.
-def unmerge_merge_matchinfo : GIDefMatchData<"SmallVector<Register, 8>">;
 def unmerge_merge : GICombineRule<
-  (defs root:$d, unmerge_merge_matchinfo:$info),
+  (defs root:$d, register_vector_matchinfo:$info),
   (match (wip_match_opcode G_UNMERGE_VALUES): $d,
   [{ return Helper.matchCombineUnmergeMergeToPlainValues(*${d}, ${info}); }]),
   (apply [{ Helper.applyCombineUnmergeMergeToPlainValues(*${d}, ${info}); }])
@@ -955,9 +954,8 @@ def ptr_add_with_zero: GICombineRule<
          [{ return Helper.matchPtrAddZero(*${root}); }]),
   (apply [{ Helper.applyPtrAddZero(*${root}); }])>;
 
-def regs_small_vec : GIDefMatchData<"SmallVector<Register, 4>">;
 def combine_insert_vec_elts_build_vector : GICombineRule<
-  (defs root:$root, regs_small_vec:$info),
+  (defs root:$root, register_vector_matchinfo:$info),
   (match (wip_match_opcode G_INSERT_VECTOR_ELT):$root,
     [{ return Helper.matchCombineInsertVecElts(*${root}, ${info}); }]),
   (apply [{ Helper.applyCombineInsertVecElts(*${root}, ${info}); }])>;
@@ -1553,9 +1551,8 @@ def nneg_zext : GICombineRule<
    (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
 
 // Combines concat operations
-def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
 def combine_concat_vector : GICombineRule<
-  (defs root:$root, concat_matchinfo:$matchinfo),
+  (defs root:$root, register_vector_matchinfo:$matchinfo),
   (match (wip_match_opcode G_CONCAT_VECTORS):$root,
         [{ return Helper.matchCombineConcatVectors(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineConcatVectors(*${root}, ${matchinfo}); }])>;
@@ -1567,7 +1564,7 @@ def combine_concat_vector : GICombineRule<
 // ===>
 // c = G_CONCAT_VECTORS x, y, z, undef
 def combine_shuffle_concat : GICombineRule<
-  (defs root:$root, concat_matchinfo:$matchinfo),
+  (defs root:$root, register_vector_matchinfo:$matchinfo),
   (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
         [{ return Helper.matchCombineShuffleConcat(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineShuffleConcat(*${root}, ${matchinfo}); }])>;

From 9c1ce31f546368d6296bc881e9f576ad25c20c73 Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.patel@intel.com>
Date: Fri, 20 Jun 2025 13:50:25 -0700
Subject: [PATCH 1116/1322] [mlir][vector] Add unroll patterns for vector.load
 and vector.store (#143420)

This PR adds unroll patterns for vector.load and vector.store. This PR is follow up of #137558
---
 .../mlir/Dialect/Vector/IR/VectorOps.td       |   8 +-
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp      |   8 ++
 .../Vector/Transforms/VectorUnroll.cpp        | 118 +++++++++++++++++-
 .../Dialect/Vector/vector-unroll-options.mlir |  42 +++++++
 .../Dialect/Vector/TestVectorTransforms.cpp   |   3 +-
 5 files changed, 170 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index aef156c5f1d0..926a92eff2eb 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -1736,7 +1736,9 @@ def Vector_TransferWriteOp :
   let hasVerifier = 1;
 }
 
-def Vector_LoadOp : Vector_Op<"load"> {
+def Vector_LoadOp : Vector_Op<"load", [
+    DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
+  ]> {
   let summary = "reads an n-D slice of memory into an n-D vector";
   let description = [{
     The 'vector.load' operation reads an n-D slice of memory into an n-D
@@ -1822,7 +1824,9 @@ def Vector_LoadOp : Vector_Op<"load"> {
       "$base `[` $indices `]` attr-dict `:` type($base) `,` type($result)";
 }
 
-def Vector_StoreOp : Vector_Op<"store"> {
+def Vector_StoreOp : Vector_Op<"store", [
+    DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
+  ]> {
   let summary = "writes an n-D vector to an n-D slice of memory";
   let description = [{
     The 'vector.store' operation writes an n-D vector to an n-D slice of memory.
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 6f0ac6bb5828..ee9ab61b670c 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -5371,6 +5371,10 @@ OpFoldResult LoadOp::fold(FoldAdaptor) {
   return OpFoldResult();
 }
 
+std::optional<SmallVector<int64_t, 4>> LoadOp::getShapeForUnroll() {
+  return llvm::to_vector<4>(getVectorType().getShape());
+}
+
 //===----------------------------------------------------------------------===//
 // StoreOp
 //===----------------------------------------------------------------------===//
@@ -5406,6 +5410,10 @@ LogicalResult StoreOp::fold(FoldAdaptor adaptor,
   return memref::foldMemRefCast(*this);
 }
 
+std::optional<SmallVector<int64_t, 4>> StoreOp::getShapeForUnroll() {
+  return llvm::to_vector<4>(getVectorType().getShape());
+}
+
 //===----------------------------------------------------------------------===//
 // MaskedLoadOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorUnroll.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorUnroll.cpp
index fc443ab0d138..693f4f955994 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorUnroll.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorUnroll.cpp
@@ -54,6 +54,28 @@ static SmallVector<Value> sliceTransferIndices(ArrayRef<int64_t> elementOffsets,
   return slicedIndices;
 }
 
+// Compute the new indices by adding `offsets` to `originalIndices`.
+// If m < n (m = offsets.size(), n = originalIndices.size()),
+// then only the trailing m values in `originalIndices` are updated.
+static SmallVector<Value> sliceLoadStoreIndices(PatternRewriter &rewriter,
+                                                Location loc,
+                                                OperandRange originalIndices,
+                                                ArrayRef<int64_t> offsets) {
+  assert(offsets.size() <= originalIndices.size() &&
+         "Offsets should not exceed the number of original indices");
+  SmallVector<Value> indices(originalIndices);
+
+  auto start = indices.size() - offsets.size();
+  for (auto [i, offset] : llvm::enumerate(offsets)) {
+    if (offset != 0) {
+      indices[start + i] = rewriter.create<arith::AddIOp>(
+          loc, originalIndices[start + i],
+          rewriter.create<arith::ConstantIndexOp>(loc, offset));
+    }
+  }
+  return indices;
+}
+
 // Clones `op` into a new operations that takes `operands` and returns
 // `resultTypes`.
 static Operation *cloneOpWithOperandsAndTypes(OpBuilder &builder, Location loc,
@@ -631,6 +653,90 @@ private:
   vector::UnrollVectorOptions options;
 };
 
+struct UnrollLoadPattern : public OpRewritePattern<vector::LoadOp> {
+  UnrollLoadPattern(MLIRContext *context,
+                    const vector::UnrollVectorOptions &options,
+                    PatternBenefit benefit = 1)
+      : OpRewritePattern<vector::LoadOp>(context, benefit), options(options) {}
+
+  LogicalResult matchAndRewrite(vector::LoadOp loadOp,
+                                PatternRewriter &rewriter) const override {
+    VectorType vecType = loadOp.getVectorType();
+
+    auto targetShape = getTargetShape(options, loadOp);
+    if (!targetShape)
+      return failure();
+
+    Location loc = loadOp.getLoc();
+    ArrayRef<int64_t> originalShape = vecType.getShape();
+    SmallVector<int64_t> strides(targetShape->size(), 1);
+
+    Value result = rewriter.create<arith::ConstantOp>(
+        loc, vecType, rewriter.getZeroAttr(vecType));
+
+    SmallVector<int64_t> loopOrder =
+        getUnrollOrder(originalShape.size(), loadOp, options);
+
+    auto targetVecType =
+        VectorType::get(*targetShape, vecType.getElementType());
+
+    for (SmallVector<int64_t> offsets :
+         StaticTileOffsetRange(originalShape, *targetShape, loopOrder)) {
+      SmallVector<Value> indices =
+          sliceLoadStoreIndices(rewriter, loc, loadOp.getIndices(), offsets);
+      Value slicedLoad = rewriter.create<vector::LoadOp>(
+          loc, targetVecType, loadOp.getBase(), indices);
+      result = rewriter.createOrFold<vector::InsertStridedSliceOp>(
+          loc, slicedLoad, result, offsets, strides);
+    }
+    rewriter.replaceOp(loadOp, result);
+    return success();
+  }
+
+private:
+  vector::UnrollVectorOptions options;
+};
+
+struct UnrollStorePattern : public OpRewritePattern<vector::StoreOp> {
+  UnrollStorePattern(MLIRContext *context,
+                     const vector::UnrollVectorOptions &options,
+                     PatternBenefit benefit = 1)
+      : OpRewritePattern<vector::StoreOp>(context, benefit), options(options) {}
+
+  LogicalResult matchAndRewrite(vector::StoreOp storeOp,
+                                PatternRewriter &rewriter) const override {
+    VectorType vecType = storeOp.getVectorType();
+
+    auto targetShape = getTargetShape(options, storeOp);
+    if (!targetShape)
+      return failure();
+
+    Location loc = storeOp.getLoc();
+    ArrayRef<int64_t> originalShape = vecType.getShape();
+    SmallVector<int64_t> strides(targetShape->size(), 1);
+
+    Value base = storeOp.getBase();
+    Value vector = storeOp.getValueToStore();
+
+    SmallVector<int64_t> loopOrder =
+        getUnrollOrder(originalShape.size(), storeOp, options);
+
+    for (SmallVector<int64_t> offsets :
+         StaticTileOffsetRange(originalShape, *targetShape, loopOrder)) {
+      SmallVector<Value> indices =
+          sliceLoadStoreIndices(rewriter, loc, storeOp.getIndices(), offsets);
+      Value slice = rewriter.createOrFold<vector::ExtractStridedSliceOp>(
+          loc, vector, offsets, *targetShape, strides);
+      rewriter.create<vector::StoreOp>(loc, slice, base, indices);
+    }
+    rewriter.eraseOp(storeOp);
+    return success();
+  }
+
+private:
+  vector::UnrollVectorOptions options;
+};
+
 struct UnrollBroadcastPattern : public OpRewritePattern<vector::BroadcastOp> {
   UnrollBroadcastPattern(MLIRContext *context,
                          const vector::UnrollVectorOptions &options,
@@ -699,10 +805,10 @@ private:
 void mlir::vector::populateVectorUnrollPatterns(
     RewritePatternSet &patterns, const UnrollVectorOptions &options,
     PatternBenefit benefit) {
-  patterns
-      .add<UnrollTransferReadPattern, UnrollTransferWritePattern,
-           UnrollContractionPattern, UnrollElementwisePattern,
-           UnrollReductionPattern, UnrollMultiReductionPattern,
-           UnrollTransposePattern, UnrollGatherPattern, UnrollBroadcastPattern>(
-          patterns.getContext(), options, benefit);
+  patterns.add<UnrollTransferReadPattern, UnrollTransferWritePattern,
+               UnrollContractionPattern, UnrollElementwisePattern,
+               UnrollReductionPattern, UnrollMultiReductionPattern,
+               UnrollTransposePattern, UnrollGatherPattern, UnrollLoadPattern,
+               UnrollStorePattern, UnrollBroadcastPattern>(
+      patterns.getContext(), options, benefit);
 }
diff --git a/mlir/test/Dialect/Vector/vector-unroll-options.mlir b/mlir/test/Dialect/Vector/vector-unroll-options.mlir
index fbb178fb49d8..e129cd5c40b9 100644
--- a/mlir/test/Dialect/Vector/vector-unroll-options.mlir
+++ b/mlir/test/Dialect/Vector/vector-unroll-options.mlir
@@ -378,3 +378,45 @@ func.func @vector_broadcast_with_tailing_unit_dim(%v: vector<4x1xf32>) -> vector
 //       CHECK: [[b3:%.+]] = vector.broadcast [[s3]] : vector<2x1xf32> to vector<2x2xf32>
 //       CHECK: [[r3:%.+]] = vector.insert_strided_slice [[b3]], [[r2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
 //       CHECK: return [[r3]] : vector<4x4xf32>
+
+
+func.func @vector_load_2D(%mem: memref<4x4xf16>) -> vector<4x4xf16> {
+  %c0 = arith.constant 0 : index
+  %0 = vector.load %mem[%c0, %c0] : memref<4x4xf16>, vector<4x4xf16>
+  return %0 : vector<4x4xf16>
+}
+
+// CHECK-LABEL: func.func @vector_load_2D(
+// CHECK-SAME:  %[[ARG:.*]]: memref<4x4xf16>) -> vector<4x4xf16> {
+  // CHECK: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<4x4xf16>
+  // CHECK: %[[V0:.*]] = vector.load %[[ARG]][%[[C0]], %[[C0]]] : memref<4x4xf16>, vector<2x2xf16>
+  // CHECK: %[[V1:.*]] = vector.insert_strided_slice %[[V0]], %[[CST]] {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf16> into vector<4x4xf16>
+  // CHECK: %[[V2:.*]] = vector.load %[[ARG]][%[[C0]], %[[C2]]] : memref<4x4xf16>, vector<2x2xf16>
+  // CHECK: %[[V3:.*]] = vector.insert_strided_slice %[[V2]], %[[V1]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf16> into vector<4x4xf16>
+  // CHECK: %[[V4:.*]] = vector.load %[[ARG]][%[[C2]], %[[C0]]] : memref<4x4xf16>, vector<2x2xf16>
+  // CHECK: %[[V5:.*]] = vector.insert_strided_slice %[[V4]], %[[V3]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf16> into vector<4x4xf16>
+  // CHECK: %[[V6:.*]] = vector.load %[[ARG]][%[[C2]], %[[C2]]] : memref<4x4xf16>, vector<2x2xf16>
+  // CHECK: %[[V7:.*]] = vector.insert_strided_slice %[[V6]], %[[V5]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf16> into vector<4x4xf16>
+  // CHECK: return %[[V7]] : vector<4x4xf16>
+
+
+func.func @vector_store_2D(%mem: memref<4x4xf16>, %v: vector<4x4xf16>) {
+  %c0 = arith.constant 0 : index
+  vector.store %v, %mem[%c0, %c0] : memref<4x4xf16>, vector<4x4xf16>
+  return
+}
+
+// CHECK-LABEL: func.func @vector_store_2D(
+// CHECK-SAME:  %[[ARG0:.*]]: memref<4x4xf16>, %[[ARG1:.*]]: vector<4x4xf16>) {
+  // CHECK: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[V0:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf16> to vector<2x2xf16>
+  // CHECK: vector.store %[[V0]], %[[ARG0]][%[[C0]], %[[C0]]] : memref<4x4xf16>, vector<2x2xf16>
+  // CHECK: %[[V1:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf16> to vector<2x2xf16>
+  // CHECK: vector.store %[[V1]], %[[ARG0]][%[[C0]], %[[C2]]] : memref<4x4xf16>, vector<2x2xf16>
+  // CHECK: %[[V2:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf16> to vector<2x2xf16>
+  // CHECK: vector.store %[[V2]], %[[ARG0]][%[[C2]], %[[C0]]] : memref<4x4xf16>, vector<2x2xf16>
+  // CHECK: %[[V3:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf16> to vector<2x2xf16>
+  // CHECK: vector.store %[[V3]], %[[ARG0]][%[[C2]], %[[C2]]] : memref<4x4xf16>, vector<2x2xf16>
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index 54aa96ba89a0..71000b98fb8f 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -163,7 +163,8 @@ struct TestVectorUnrollingPatterns
             .setFilterConstraint([](Operation *op) {
               return success(
                   isa<arith::AddFOp, vector::FMAOp, vector::MultiDimReductionOp,
-                      vector::BroadcastOp>(op));
+                      vector::BroadcastOp, vector::LoadOp, vector::StoreOp>(
+                      op));
             }));
     populateVectorUnrollPatterns(
         patterns, UnrollVectorOptions()

From 4c97a91dc085fba40e5a86c4da8feeffd15b1f8a Mon Sep 17 00:00:00 2001
From: sribee8 <sriya.pratipati@gmail.com>
Date: Fri, 20 Jun 2025 14:00:56 -0700
Subject: [PATCH 1117/1322] [libc] Added closing quote (#145101)

Error message was missing a closing quote, added it.

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 libc/hdr/types/mbstate_t.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/hdr/types/mbstate_t.h b/libc/hdr/types/mbstate_t.h
index 367c6af7a3ff..d8fadceaaac8 100644
--- a/libc/hdr/types/mbstate_t.h
+++ b/libc/hdr/types/mbstate_t.h
@@ -15,7 +15,7 @@
 
 #else // Overlay mode
 
-#error "Cannot overlay mbstate_t
+#error "Cannot overlay mbstate_t"
 
 #endif // LLVM_LIBC_FULL_BUILD
 

From 0c2191b3a79d91d1115b1dbc1a9bd39daed1d9c5 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Fri, 20 Jun 2025 14:11:29 -0700
Subject: [PATCH 1118/1322] [AMDGPU] Omit image waits in function prologue on
 gfx1250 (#145097)

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index f7b88bf2d5eb..a60e2102d4e8 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2681,6 +2681,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
         if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
           continue;
 
+        if (!ST->hasImageInsts() &&
+            (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
+          continue;
+
         BuildMI(EntryBB, I, DebugLoc(),
                 TII->get(instrsForExtendedCounterTypes[CT]))
             .addImm(0);

From 5886f0a18342457f142871ef73f40fc27f936d9b Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Fri, 20 Jun 2025 14:20:17 -0700
Subject: [PATCH 1119/1322] [RISCV] Allow larger offset when matching
 build_vector as vid sequence (#144756)

I happened to notice that when legalizing get.active.lane.mask with
large vectors we were materializing via constant pool instead of just
shifting by a constant.

We should probably be doing a full cost comparison for the different
lowering strategies as opposed to our current adhoc heuristics, but the
few cases this regresses seem pretty minor. (Given the reduction in vset
toggles, they might not be regressions at all.)

---------

Co-authored-by: Craig Topper <craig.topper@sifive.com>
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  10 +-
 .../CodeGen/RISCV/rvv/active_lane_mask.ll     | 130 ++++++++----------
 .../rvv/fixed-vectors-fp-buildvec-bf16.ll     |  16 ++-
 .../RISCV/rvv/fixed-vectors-fp-buildvec.ll    |  40 +++---
 .../RISCV/rvv/fixed-vectors-int-buildvec.ll   |   6 +-
 .../rvv/fixed-vectors-reduction-int-vp.ll     |  30 ++--
 llvm/test/CodeGen/RISCV/rvv/vle_vid-vfcvt.ll  |  17 ++-
 7 files changed, 119 insertions(+), 130 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 139fa7ba3562..0c54101a1156 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3724,14 +3724,14 @@ static SDValue lowerBuildVectorViaVID(SDValue Op, SelectionDAG &DAG,
       SplatStepVal = Log2_64(std::abs(StepNumerator));
     }
 
-    // Only emit VIDs with suitably-small steps/addends. We use imm5 is a
-    // threshold since it's the immediate value many RVV instructions accept.
-    // There is no vmul.vi instruction so ensure multiply constant can fit in
-    // a single addi instruction.
+    // Only emit VIDs with suitably-small steps. We use imm5 as a threshold
+    // since it's the immediate value many RVV instructions accept. There is
+    // no vmul.vi instruction so ensure multiply constant can fit in a
+    // single addi instruction.  For the addend, we allow up to 32 bits..
     if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) ||
          (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) &&
         isPowerOf2_32(StepDenominator) &&
-        (SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) {
+        (SplatStepVal >= 0 || StepDenominator == 1) && isInt<32>(Addend)) {
       MVT VIDVT =
           VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
       MVT VIDContainerVT =
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index ec422a8fbb92..1acc830347de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -103,18 +103,16 @@ define <8 x i1> @fv8(ptr %p, i64 %index, i64 %tc) {
 define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI8_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI8_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vadd.vx v16, v8, a0
 ; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v24, v16, a2
 ; CHECK-NEXT:    vmsltu.vx v0, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v16
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 2
+; CHECK-NEXT:    vslideup.vi v0, v24, 2
 ; CHECK-NEXT:    ret
   %mask = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 %index, i64 %tc)
   ret <32 x i1> %mask
@@ -125,30 +123,24 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_0)
-; CHECK-NEXT:    vle8.v v16, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI9_1)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT:    vle8.v v17, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI9_2)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_2)
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vsaddu.vx v16, v8, a1
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vadd.vx v16, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v24, v16, a2
+; CHECK-NEXT:    vadd.vx v16, v8, a0
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v25, v16, a2
+; CHECK-NEXT:    li a0, 48
+; CHECK-NEXT:    vadd.vx v8, v8, a0
 ; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vle8.v v18, (a0)
-; CHECK-NEXT:    vmsltu.vx v0, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v16
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v17
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v17, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 2
+; CHECK-NEXT:    vslideup.vi v0, v24, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v17, 4
+; CHECK-NEXT:    vslideup.vi v0, v25, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v8, v18
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 6
@@ -160,63 +152,49 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT:    vle8.v v17, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
-; CHECK-NEXT:    vle8.v v18, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vle8.v v19, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
-; CHECK-NEXT:    vle8.v v20, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
-; CHECK-NEXT:    vle8.v v21, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vle8.v v22, (a0)
-; CHECK-NEXT:    vmsltu.vx v0, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v16
+; CHECK-NEXT:    li a0, 80
+; CHECK-NEXT:    vsaddu.vx v16, v8, a1
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vadd.vx v16, v8, a0
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v24, v16, a2
+; CHECK-NEXT:    vadd.vx v16, v8, a0
+; CHECK-NEXT:    li a0, 96
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v25, v16, a2
+; CHECK-NEXT:    vadd.vx v16, v8, a0
+; CHECK-NEXT:    li a0, 112
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v26, v16, a2
+; CHECK-NEXT:    vadd.vx v16, v8, a0
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v27, v16, a2
+; CHECK-NEXT:    vadd.vx v16, v8, a0
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v28, v16, a2
+; CHECK-NEXT:    vadd.vx v16, v8, a0
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v29, v16, a2
+; CHECK-NEXT:    li a0, 48
+; CHECK-NEXT:    vadd.vx v8, v8, a0
 ; CHECK-NEXT:    vsaddu.vx v8, v8, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v17
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v17, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v18
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v18, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v19
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v19, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v20
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v20, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v21
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v21, v8, a2
-; CHECK-NEXT:    vsext.vf8 v8, v22
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v22, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v17, v16, 2
-; CHECK-NEXT:    vslideup.vi v0, v20, 2
+; CHECK-NEXT:    vslideup.vi v25, v24, 2
+; CHECK-NEXT:    vslideup.vi v0, v28, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v17, v18, 4
-; CHECK-NEXT:    vslideup.vi v0, v21, 4
+; CHECK-NEXT:    vslideup.vi v25, v26, 4
+; CHECK-NEXT:    vslideup.vi v0, v29, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v17, v19, 6
-; CHECK-NEXT:    vslideup.vi v0, v22, 6
+; CHECK-NEXT:    vslideup.vi v25, v27, 6
+; CHECK-NEXT:    vslideup.vi v0, v16, 6
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v17, 8
+; CHECK-NEXT:    vslideup.vi v0, v25, 8
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll
index 3f7cd91737f4..9bb3f4a976ab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll
@@ -94,10 +94,12 @@ define <2 x bfloat> @vid_v2bf16() {
 define <2 x bfloat> @vid_addend1_v2bf16() {
 ; CHECK-LABEL: vid_addend1_v2bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 262148
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    lui a0, 4
+; CHECK-NEXT:    vsll.vi v8, v8, 7
 ; CHECK-NEXT:    addi a0, a0, -128
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
+; CHECK-NEXT:    vadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   ret <2 x bfloat> <bfloat 1.0, bfloat 2.0>
 }
@@ -105,10 +107,12 @@ define <2 x bfloat> @vid_addend1_v2bf16() {
 define <2 x bfloat> @vid_denominator2_v2bf16() {
 ; CHECK-LABEL: vid_denominator2_v2bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 260100
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    lui a0, 4
+; CHECK-NEXT:    vsll.vi v8, v8, 7
 ; CHECK-NEXT:    addi a0, a0, -256
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
+; CHECK-NEXT:    vadd.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   ret <2 x bfloat> <bfloat 0.5, bfloat 1.0>
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index eb40c133514f..564e95c43f68 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1573,18 +1573,22 @@ define <2 x half> @vid_addend1_v2f16() {
 ;
 ; RV32ZVFHMIN-LABEL: vid_addend1_v2f16:
 ; RV32ZVFHMIN:       # %bb.0:
-; RV32ZVFHMIN-NEXT:    lui a0, 262148
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1024
-; RV32ZVFHMIN-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; RV32ZVFHMIN-NEXT:    vmv.s.x v8, a0
+; RV32ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; RV32ZVFHMIN-NEXT:    vid.v v8
+; RV32ZVFHMIN-NEXT:    li a0, 15
+; RV32ZVFHMIN-NEXT:    vsll.vi v8, v8, 10
+; RV32ZVFHMIN-NEXT:    slli a0, a0, 10
+; RV32ZVFHMIN-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVFHMIN-NEXT:    ret
 ;
 ; RV64ZVFHMIN-LABEL: vid_addend1_v2f16:
 ; RV64ZVFHMIN:       # %bb.0:
-; RV64ZVFHMIN-NEXT:    lui a0, 262148
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1024
-; RV64ZVFHMIN-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; RV64ZVFHMIN-NEXT:    vmv.s.x v8, a0
+; RV64ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; RV64ZVFHMIN-NEXT:    vid.v v8
+; RV64ZVFHMIN-NEXT:    li a0, 15
+; RV64ZVFHMIN-NEXT:    vsll.vi v8, v8, 10
+; RV64ZVFHMIN-NEXT:    slli a0, a0, 10
+; RV64ZVFHMIN-NEXT:    vadd.vx v8, v8, a0
 ; RV64ZVFHMIN-NEXT:    ret
   ret <2 x half> <half 1.0, half 2.0>
 }
@@ -1608,18 +1612,22 @@ define <2 x half> @vid_denominator2_v2f16() {
 ;
 ; RV32ZVFHMIN-LABEL: vid_denominator2_v2f16:
 ; RV32ZVFHMIN:       # %bb.0:
-; RV32ZVFHMIN-NEXT:    lui a0, 245764
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -2048
-; RV32ZVFHMIN-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; RV32ZVFHMIN-NEXT:    vmv.s.x v8, a0
+; RV32ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; RV32ZVFHMIN-NEXT:    vid.v v8
+; RV32ZVFHMIN-NEXT:    li a0, 7
+; RV32ZVFHMIN-NEXT:    vsll.vi v8, v8, 10
+; RV32ZVFHMIN-NEXT:    slli a0, a0, 11
+; RV32ZVFHMIN-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVFHMIN-NEXT:    ret
 ;
 ; RV64ZVFHMIN-LABEL: vid_denominator2_v2f16:
 ; RV64ZVFHMIN:       # %bb.0:
-; RV64ZVFHMIN-NEXT:    lui a0, 245764
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -2048
-; RV64ZVFHMIN-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; RV64ZVFHMIN-NEXT:    vmv.s.x v8, a0
+; RV64ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; RV64ZVFHMIN-NEXT:    vid.v v8
+; RV64ZVFHMIN-NEXT:    li a0, 7
+; RV64ZVFHMIN-NEXT:    vsll.vi v8, v8, 10
+; RV64ZVFHMIN-NEXT:    slli a0, a0, 11
+; RV64ZVFHMIN-NEXT:    vadd.vx v8, v8, a0
 ; RV64ZVFHMIN-NEXT:    ret
   ret <2 x half> <half 0.5, half 1.0>
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index 1fa96d3c07ca..f235540cc8ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -58,10 +58,10 @@ define void @buildvec_vid_plus_imm_v16i8(ptr %x) {
 define void @buildvec_vid_plus_nonimm_v16i8(ptr %x) {
 ; CHECK-LABEL: buildvec_vid_plus_nonimm_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI4_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI4_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    li a1, 100
+; CHECK-NEXT:    vadd.vx v8, v8, a1
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   store <16 x i8> <i8 100, i8 101, i8 102, i8 103, i8 104, i8 105, i8 106, i8 107, i8 108, i8 109, i8 110, i8 111, i8 112, i8 113, i8 114, i8 115>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 722a1186faca..dfe8f358b778 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -1734,19 +1734,16 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    lui a3, %hi(.LCPI72_0)
-; RV32-NEXT:    addi a3, a3, %lo(.LCPI72_0)
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT:    vle8.v v12, (a3)
 ; RV32-NEXT:    vid.v v16
-; RV32-NEXT:    vmsltu.vx v14, v16, a1
-; RV32-NEXT:    li a3, 64
-; RV32-NEXT:    vsext.vf4 v16, v12
 ; RV32-NEXT:    vmsltu.vx v12, v16, a1
+; RV32-NEXT:    vadd.vx v16, v16, a0
+; RV32-NEXT:    vmsltu.vx v13, v16, a1
+; RV32-NEXT:    li a1, 64
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vi v14, v12, 4
-; RV32-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
-; RV32-NEXT:    vmand.mm v0, v14, v0
+; RV32-NEXT:    vslideup.vi v12, v13, 4
+; RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT:    vmand.mm v0, v12, v0
 ; RV32-NEXT:    vmv.v.i v12, 1
 ; RV32-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; RV32-NEXT:    vslidedown.vx v12, v8, a0
@@ -1780,19 +1777,16 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV64-NEXT:    .cfi_offset ra, -8
 ; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    lui a3, %hi(.LCPI72_0)
-; RV64-NEXT:    addi a3, a3, %lo(.LCPI72_0)
 ; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT:    vle8.v v12, (a3)
 ; RV64-NEXT:    vid.v v16
-; RV64-NEXT:    vmsltu.vx v14, v16, a1
-; RV64-NEXT:    li a3, 64
-; RV64-NEXT:    vsext.vf4 v16, v12
 ; RV64-NEXT:    vmsltu.vx v12, v16, a1
+; RV64-NEXT:    vadd.vx v16, v16, a0
+; RV64-NEXT:    vmsltu.vx v13, v16, a1
+; RV64-NEXT:    li a1, 64
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vi v14, v12, 4
-; RV64-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
-; RV64-NEXT:    vmand.mm v0, v14, v0
+; RV64-NEXT:    vslideup.vi v12, v13, 4
+; RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT:    vmand.mm v0, v12, v0
 ; RV64-NEXT:    vmv.v.i v12, 1
 ; RV64-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; RV64-NEXT:    vslidedown.vx v12, v8, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vle_vid-vfcvt.ll b/llvm/test/CodeGen/RISCV/rvv/vle_vid-vfcvt.ll
index bf330ea38a00..c837063231e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vle_vid-vfcvt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vle_vid-vfcvt.ll
@@ -4,10 +4,12 @@
 define void @foo_1(ptr nocapture noundef writeonly %t) {
 ; CHECK-LABEL: foo_1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a1, %hi(.LCPI0_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI0_0)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vsll.vi v8, v8, 7
+; CHECK-NEXT:    lui a1, 524288
+; CHECK-NEXT:    vadd.vx v8, v8, a1
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
@@ -18,10 +20,13 @@ entry:
 define void @foo_2(ptr nocapture noundef writeonly %t) {
 ; CHECK-LABEL: foo_2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI1_0)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    lui a1, 524288
+; CHECK-NEXT:    vsll.vi v8, v8, 7
+; CHECK-NEXT:    addi a1, a1, -512
+; CHECK-NEXT:    vadd.vx v8, v8, a1
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:

From f0d32575a1193741bc9ca90e5beced693cba28b0 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Fri, 20 Jun 2025 14:29:32 -0700
Subject: [PATCH 1120/1322] [BOLT][NFCI] Use FileSymbols for local symbol
 disambiguation (#89088)

Remove SymbolToFileName mapping from every local symbol to its
containing FILE symbol name, and reuse FileSymbols to disambiguate
local symbols instead.

Also removes the check for `ld-temp.o` file symbol which was added to
prevent LTO build mode from affecting the disambiguated name. This may
cause incompatibility when using the profile collected on a binary built
in a different mode than the input binary.

Addresses #90661.

Speeds up discover file objects by 5-10% for large binaries:
- binary with ~1.2M symbols: 12.6422s -> 12.0297s
- binary with ~4.5M symbols: 48.8851s -> 43.7315s
---
 bolt/lib/Rewrite/RewriteInstance.cpp | 37 ++++++----------------------
 1 file changed, 8 insertions(+), 29 deletions(-)

diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index d650e5db54bf..93bd93b6cb98 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -780,14 +780,6 @@ void RewriteInstance::discoverFileObjects() {
 
   // For local symbols we want to keep track of associated FILE symbol name for
   // disambiguation by combined name.
-  StringRef FileSymbolName;
-  bool SeenFileName = false;
-  struct SymbolRefHash {
-    size_t operator()(SymbolRef const &S) const {
-      return std::hash<decltype(DataRefImpl::p)>{}(S.getRawDataRefImpl().p);
-    }
-  };
-  std::unordered_map<SymbolRef, StringRef, SymbolRefHash> SymbolToFileName;
   for (const ELFSymbolRef &Symbol : InputFile->symbols()) {
     Expected<StringRef> NameOrError = Symbol.getName();
     if (NameOrError && NameOrError->starts_with("__asan_init")) {
@@ -806,21 +798,8 @@ void RewriteInstance::discoverFileObjects() {
     if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined)
       continue;
 
-    if (cantFail(Symbol.getType()) == SymbolRef::ST_File) {
+    if (cantFail(Symbol.getType()) == SymbolRef::ST_File)
       FileSymbols.emplace_back(Symbol);
-      StringRef Name =
-          cantFail(std::move(NameOrError), "cannot get symbol name for file");
-      // Ignore Clang LTO artificial FILE symbol as it is not always generated,
-      // and this uncertainty is causing havoc in function name matching.
-      if (Name == "ld-temp.o")
-        continue;
-      FileSymbolName = Name;
-      SeenFileName = true;
-      continue;
-    }
-    if (!FileSymbolName.empty() &&
-        !(cantFail(Symbol.getFlags()) & SymbolRef::SF_Global))
-      SymbolToFileName[Symbol] = FileSymbolName;
   }
 
   // Sort symbols in the file by value. Ignore symbols from non-allocatable
@@ -1028,14 +1007,14 @@ void RewriteInstance::discoverFileObjects() {
       // The <id> field is used for disambiguation of local symbols since there
       // could be identical function names coming from identical file names
       // (e.g. from different directories).
-      std::string AltPrefix;
-      auto SFI = SymbolToFileName.find(Symbol);
-      if (SymbolType == SymbolRef::ST_Function && SFI != SymbolToFileName.end())
-        AltPrefix = Name + "/" + std::string(SFI->second);
+      auto SFI = llvm::upper_bound(FileSymbols, ELFSymbolRef(Symbol));
+      if (SymbolType == SymbolRef::ST_Function && SFI != FileSymbols.begin()) {
+        StringRef FileSymbolName = cantFail(SFI[-1].getName());
+        if (!FileSymbolName.empty())
+          AlternativeName = NR.uniquify(Name + "/" + FileSymbolName.str());
+      }
 
       UniqueName = NR.uniquify(Name);
-      if (!AltPrefix.empty())
-        AlternativeName = NR.uniquify(AltPrefix);
     }
 
     uint64_t SymbolSize = ELFSymbolRef(Symbol).getSize();
@@ -1294,7 +1273,7 @@ void RewriteInstance::discoverFileObjects() {
                              FDE->getAddressRange());
   }
 
-  BC->setHasSymbolsWithFileName(SeenFileName);
+  BC->setHasSymbolsWithFileName(FileSymbols.size());
 
   // Now that all the functions were created - adjust their boundaries.
   adjustFunctionBoundaries();

From a91154343780dae022bb314aa76f0b0affc28b62 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Fri, 20 Jun 2025 21:43:00 +0000
Subject: [PATCH 1121/1322] [libc] Implemented wcrtomb internal function and
 public libc function (#144596)

Implemented internal wcrtomb function using the CharacterConverter class
public libc function calls this internal function to perform the
conversion
---
 libc/config/linux/x86_64/entrypoints.txt |  1 +
 libc/include/wchar.yaml                  |  8 ++
 libc/src/__support/wchar/CMakeLists.txt  | 16 ++++
 libc/src/__support/wchar/wcrtomb.cpp     | 49 +++++++++++++
 libc/src/__support/wchar/wcrtomb.h       | 26 +++++++
 libc/src/wchar/CMakeLists.txt            | 14 ++++
 libc/src/wchar/wcrtomb.cpp               | 45 ++++++++++++
 libc/src/wchar/wcrtomb.h                 | 23 ++++++
 libc/test/src/wchar/CMakeLists.txt       | 14 ++++
 libc/test/src/wchar/wcrtomb_test.cpp     | 93 ++++++++++++++++++++++++
 10 files changed, 289 insertions(+)
 create mode 100644 libc/src/__support/wchar/wcrtomb.cpp
 create mode 100644 libc/src/__support/wchar/wcrtomb.h
 create mode 100644 libc/src/wchar/wcrtomb.cpp
 create mode 100644 libc/src/wchar/wcrtomb.h
 create mode 100644 libc/test/src/wchar/wcrtomb_test.cpp

diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 8bf6c402b039..4d94f10196fd 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1247,6 +1247,7 @@ if(LLVM_LIBC_FULL_BUILD)
 
     # wchar.h entrypoints
     libc.src.wchar.mbrtowc
+    libc.src.wchar.wcrtomb
   )
 endif()
 
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index c036636e12c3..64eb38171066 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -159,6 +159,14 @@ functions:
       - type: wchar_t *__restrict
       - type: const wchar_t *__restrict
       - type: size_t
+  - name: wcrtomb
+    standards:
+      - stdc
+    return_type: size_t
+    arguments:
+      - type: char *__restrict
+      - type: wchar_t
+      - type: mbstate_t *__restrict
   - name: wcscpy
     standards:
       - stdc
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 479c1dff2c6e..6aade4ccc84a 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -20,6 +20,22 @@ add_object_library(
     .mbstate
 )
 
+add_object_library(
+  wcrtomb
+  HDRS
+    wcrtomb.h
+  SRCS 
+    wcrtomb.cpp
+  DEPENDS
+    libc.hdr.types.char32_t
+    libc.hdr.types.size_t
+    libc.hdr.types.wchar_t
+    libc.src.__support.error_or
+    libc.src.__support.common
+    .character_converter
+    .mbstate
+)
+
 add_object_library(
   mbrtowc
   HDRS
diff --git a/libc/src/__support/wchar/wcrtomb.cpp b/libc/src/__support/wchar/wcrtomb.cpp
new file mode 100644
index 000000000000..8ca3d17ad6ce
--- /dev/null
+++ b/libc/src/__support/wchar/wcrtomb.cpp
@@ -0,0 +1,49 @@
+//===-- Implementation of wcrtomb -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/wchar/wcrtomb.h"
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+#include "hdr/types/char32_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_assert.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc,
+                        mbstate *__restrict ps) {
+  static_assert(sizeof(wchar_t) == 4);
+
+  CharacterConverter cr(ps);
+
+  if (s == nullptr)
+    return Error(-1);
+
+  int status = cr.push(static_cast<char32_t>(wc));
+  if (status != 0)
+    return Error(status);
+
+  size_t count = 0;
+  while (!cr.isEmpty()) {
+    auto utf8 = cr.pop_utf8(); // can never fail as long as the push succeeded
+    LIBC_ASSERT(utf8.has_value());
+
+    *s = utf8.value();
+    s++;
+    count++;
+  }
+  return count;
+}
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/wcrtomb.h b/libc/src/__support/wchar/wcrtomb.h
new file mode 100644
index 000000000000..bcd39a92a3b7
--- /dev/null
+++ b/libc/src/__support/wchar/wcrtomb.h
@@ -0,0 +1,26 @@
+//===-- Implementation header for wcrtomb ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_H
+#define LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_H
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc, mbstate *__restrict ps);
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_H
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 163c29847e6a..ec33caccb16d 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -34,6 +34,20 @@ add_entrypoint_object(
     libc.src.__support.wctype_utils
 )
 
+add_entrypoint_object(
+  wcrtomb
+  SRCS
+    wcrtomb.cpp
+  HDRS
+    wcrtomb.h
+  DEPENDS
+    libc.hdr.types.wchar_t
+    libc.hdr.types.mbstate_t
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.wcrtomb
+    libc.src.__support.wchar.mbstate
+)
+
 add_entrypoint_object(
   mbrtowc
   SRCS
diff --git a/libc/src/wchar/wcrtomb.cpp b/libc/src/wchar/wcrtomb.cpp
new file mode 100644
index 000000000000..6d604a00599e
--- /dev/null
+++ b/libc/src/wchar/wcrtomb.cpp
@@ -0,0 +1,45 @@
+//===-- Implementation of wcrtomb -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/wcrtomb.h"
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/wcrtomb.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(size_t, wcrtomb,
+                   (char *__restrict s, wchar_t wc, mbstate_t *__restrict ps)) {
+  static internal::mbstate internal_mbstate;
+
+  // when s is nullptr, this is equivalent to wcrtomb(buf, L'\0', ps)
+  char buf[sizeof(wchar_t) / sizeof(char)];
+  if (s == nullptr) {
+    s = buf;
+    wc = L'\0';
+  }
+
+  auto result = internal::wcrtomb(
+      s, wc,
+      ps == nullptr ? &internal_mbstate
+                    : reinterpret_cast<internal::mbstate *>(ps));
+
+  if (!result.has_value()) {
+    libc_errno = EILSEQ;
+    return -1;
+  }
+
+  return result.value();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/wcrtomb.h b/libc/src/wchar/wcrtomb.h
new file mode 100644
index 000000000000..06c42f158122
--- /dev/null
+++ b/libc/src/wchar/wcrtomb.h
@@ -0,0 +1,23 @@
+//===-- Implementation header for wcrtomb -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_WCRTOMB_H
+#define LLVM_LIBC_SRC_WCHAR_WCRTOMB_H
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+size_t wcrtomb(char *__restrict s, wchar_t wc, mbstate_t *__restrict ps);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_WCRTOMB_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index d4cae1f6228b..184e482c895b 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -47,6 +47,20 @@ add_libc_test(
     libc.src.wchar.wctob
 )
 
+add_libc_test(
+  wcrtomb_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    wcrtomb_test.cpp
+  DEPENDS
+    libc.src.wchar.wcrtomb
+    libc.src.string.memset
+    libc.hdr.types.wchar_t
+    libc.hdr.types.mbstate_t
+    libc.src.__support.libc_errno
+)
+
 add_libc_test(
   wmemset_test 
   SUITE
diff --git a/libc/test/src/wchar/wcrtomb_test.cpp b/libc/test/src/wchar/wcrtomb_test.cpp
new file mode 100644
index 000000000000..c06b39ae0143
--- /dev/null
+++ b/libc/test/src/wchar/wcrtomb_test.cpp
@@ -0,0 +1,93 @@
+//===-- Unittests for wcrtomb --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/libc_errno.h"
+#include "src/string/memset.h"
+#include "src/wchar/wcrtomb.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcWCRToMBTest, OneByte) {
+  mbstate_t state;
+  LIBC_NAMESPACE::memset(&state, 0, sizeof(mbstate_t));
+  wchar_t wc = L'U';
+  char mb[4];
+  size_t cnt = LIBC_NAMESPACE::wcrtomb(mb, wc, &state);
+  ASSERT_EQ(cnt, static_cast<size_t>(1));
+  ASSERT_EQ(mb[0], 'U');
+}
+
+TEST(LlvmLibcWCRToMBTest, TwoByte) {
+  mbstate_t state;
+  LIBC_NAMESPACE::memset(&state, 0, sizeof(mbstate_t));
+  // testing utf32: 0xff -> utf8: 0xc3 0xbf
+  wchar_t wc = 0xff;
+  char mb[4];
+  size_t cnt = LIBC_NAMESPACE::wcrtomb(mb, wc, &state);
+  ASSERT_EQ(cnt, static_cast<size_t>(2));
+  ASSERT_EQ(mb[0], static_cast<char>(0xc3));
+  ASSERT_EQ(mb[1], static_cast<char>(0xbf));
+}
+
+TEST(LlvmLibcWCRToMBTest, ThreeByte) {
+  mbstate_t state;
+  LIBC_NAMESPACE::memset(&state, 0, sizeof(mbstate_t));
+  // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
+  wchar_t wc = 0xac15;
+  char mb[4];
+  size_t cnt = LIBC_NAMESPACE::wcrtomb(mb, wc, &state);
+  ASSERT_EQ(cnt, static_cast<size_t>(3));
+  ASSERT_EQ(mb[0], static_cast<char>(0xea));
+  ASSERT_EQ(mb[1], static_cast<char>(0xb0));
+  ASSERT_EQ(mb[2], static_cast<char>(0x95));
+}
+
+TEST(LlvmLibcWCRToMBTest, FourByte) {
+  mbstate_t state;
+  LIBC_NAMESPACE::memset(&state, 0, sizeof(mbstate_t));
+  // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+  wchar_t wc = 0x1f921;
+  char mb[4];
+  size_t cnt = LIBC_NAMESPACE::wcrtomb(mb, wc, &state);
+  ASSERT_EQ(cnt, static_cast<size_t>(4));
+  ASSERT_EQ(mb[0], static_cast<char>(0xf0));
+  ASSERT_EQ(mb[1], static_cast<char>(0x9f));
+  ASSERT_EQ(mb[2], static_cast<char>(0xa4));
+  ASSERT_EQ(mb[3], static_cast<char>(0xa1));
+}
+
+TEST(LlvmLibcWCRToMBTest, NullString) {
+  mbstate_t state;
+  LIBC_NAMESPACE::memset(&state, 0, sizeof(mbstate_t));
+  wchar_t wc = L'A';
+  char mb[4];
+
+  // should be equivalent to the call wcrtomb(buf, L'\0', state)
+  size_t cnt1 = LIBC_NAMESPACE::wcrtomb(nullptr, wc, &state);
+  size_t cnt2 = LIBC_NAMESPACE::wcrtomb(mb, L'\0', &state);
+
+  ASSERT_EQ(cnt1, cnt2);
+}
+
+TEST(LlvmLibcWCRToMBTest, NullState) {
+  wchar_t wc = L'A';
+  char mb[4];
+  size_t cnt = LIBC_NAMESPACE::wcrtomb(mb, wc, nullptr);
+  ASSERT_EQ(cnt, static_cast<size_t>(1));
+}
+
+TEST(LlvmLibcWCRToMBTest, InvalidWchar) {
+  mbstate_t state;
+  LIBC_NAMESPACE::memset(&state, 0, sizeof(mbstate_t));
+  wchar_t wc = 0x12ffff;
+  char mb[4];
+  size_t cnt = LIBC_NAMESPACE::wcrtomb(mb, wc, &state);
+  ASSERT_EQ(cnt, static_cast<size_t>(-1));
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+}

From e6ee2c7c7b36825331b39e221725780167457e6d Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Fri, 20 Jun 2025 14:54:58 -0700
Subject: [PATCH 1122/1322] [HLSL][RootSignature] Implement validation of
 resource ranges for `RootDescriptors` (#140962)

As was established
[previously](https://github.com/llvm/llvm-project/pull/140957), we
created a structure to model a resource range and to detect an overlap
in a given set of these.

However, a resource range only overlaps with another resource range if
they have:
- equivalent ResourceClass (SRV, UAV, CBuffer, Sampler)
- equivalent resource name-space
- overlapping shader visibility

For instance, the following don't overlap even though they have the same
register range:
- `CBV(b0)` and `SRV(t0)` (different resource class)
- `CBV(b0, space = 0)` and `CBV(b0, space = 1)` (different space)
- `CBV(b0, visibility = Pixel)` and `CBV(b0, visibility = Domain)`
(non-overlapping visibility)

The first two clauses are naturally modelled by grouping all the
`RangeInfo`s that have the equivalent `ResourceClass` and `Space` values
together and check if there is any overlap on a `ResourceRange` for all
these `RangeInfo`s. However, `Visibility` is not quite as easily mapped
(`Visibility = All` would overlap with any other visibility). So we will
instead need to track a `ResourceRange` for each of the `Visibility`
types in a group. Then we can determine when inserting a range of the
same group if it would overlap with any overlapping visibilities.

The collection of `RangeInfo` for `RootDescriptor`s, sorting of the
`RangeInfo`s into the groups and finally the insertion of each point
into their respective `ResourceRange`s are implemented. Furthermore, we
integrate this into `SemaHLSL` to provide a diagnostic for each entry
function that uses the invalid root signature.

- Implements collection of `RangeInfo` for `RootDescriptors`
- Implements resource range validation in `SemaHLSL`
- Add diagnostic testing of error production in
`RootSignature-resource-ranges-err.hlsl`
- Add testing to ensure no errors are raised in valid root signatures
`RootSignature-resource-ranges.hlsl`

Part 2 of https://github.com/llvm/llvm-project/issues/129942

A final pr will be produced to integrate the analysis of
`DescriptorTable`, `StaticSampler` and `RootConstants` by defining how
to construct the `RangeInfo` from their elements respectively.
---
 .../clang/Basic/DiagnosticSemaKinds.td        |   5 +
 clang/include/clang/Sema/SemaHLSL.h           |   2 +
 clang/lib/Sema/SemaHLSL.cpp                   | 130 +++++++++++++++++-
 .../RootSignature-resource-ranges-err.hlsl    |  21 +++
 .../RootSignature-resource-ranges.hlsl        |  15 ++
 .../Frontend/HLSL/HLSLRootSignatureUtils.h    |  11 +-
 .../Frontend/HLSL/HLSLRootSignatureUtils.cpp  |   2 +
 7 files changed, 183 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/SemaHLSL/RootSignature-resource-ranges-err.hlsl
 create mode 100644 clang/test/SemaHLSL/RootSignature-resource-ranges.hlsl

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 34b798a09c21..f2f2152b8bbb 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13054,6 +13054,11 @@ def err_invalid_hlsl_resource_type: Error<
 def err_hlsl_spirv_only: Error<"%0 is only available for the SPIR-V target">;
 def err_hlsl_vk_literal_must_contain_constant: Error<"the argument to vk::Literal must be a vk::integral_constant">;
 
+def err_hlsl_resource_range_overlap: Error<
+  "resource ranges %select{t|u|b|s}0[%1;%2] and %select{t|u|b|s}3[%4;%5] "
+  "overlap within space = %6 and visibility = "
+  "%select{All|Vertex|Hull|Domain|Geometry|Pixel|Amplification|Mesh}7">;
+
 // Layout randomization diagnostics.
 def err_non_designated_init_used : Error<
   "a randomized struct can only be initialized with a designated initializer">;
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index 97091792ba23..7d7eae4db532 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -134,6 +134,8 @@ public:
       SourceLocation Loc, IdentifierInfo *DeclIdent,
       SmallVector<llvm::hlsl::rootsig::RootElement> &Elements);
 
+  // Returns true when D is invalid and a diagnostic was produced
+  bool handleRootSignatureDecl(HLSLRootSignatureDecl *D, SourceLocation Loc);
   void handleRootSignatureAttr(Decl *D, const ParsedAttr &AL);
   void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL);
   void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL);
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 9b43ee00810b..d003967a522a 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -39,6 +39,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Frontend/HLSL/HLSLRootSignatureUtils.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DXILABI.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -1068,10 +1069,138 @@ void SemaHLSL::ActOnFinishRootSignatureDecl(
       SemaRef.getASTContext(), /*DeclContext=*/SemaRef.CurContext, Loc,
       DeclIdent, Elements);
 
+  if (handleRootSignatureDecl(SignatureDecl, Loc))
+    return;
+
   SignatureDecl->setImplicit();
   SemaRef.PushOnScopeChains(SignatureDecl, SemaRef.getCurScope());
 }
 
+bool SemaHLSL::handleRootSignatureDecl(HLSLRootSignatureDecl *D,
+                                       SourceLocation Loc) {
+  // The following conducts analysis on resource ranges to detect and report
+  // any overlaps in resource ranges.
+  //
+  // A resource range overlaps with another resource range if they have:
+  // - equivalent ResourceClass (SRV, UAV, CBuffer, Sampler)
+  // - equivalent resource space
+  // - overlapping visbility
+  //
+  // The following algorithm is implemented in the following steps:
+  //
+  // 1. Collect RangeInfo from relevant RootElements:
+  //   - RangeInfo will retain the interval, ResourceClass, Space and Visibility
+  // 2. Sort the RangeInfo's such that they are grouped together by
+  //  ResourceClass and Space (GroupT defined below)
+  // 3. Iterate through the collected RangeInfos by their groups
+  //   - For each group we will have a ResourceRange for each visibility
+  //   - As we iterate through we will:
+  //      A: Insert the current RangeInfo into the corresponding Visibility
+  //   ResourceRange
+  //      B: Check for overlap with any overlapping Visibility ResourceRange
+  using RangeInfo = llvm::hlsl::rootsig::RangeInfo;
+  using ResourceRange = llvm::hlsl::rootsig::ResourceRange;
+  using GroupT = std::pair<ResourceClass, /*Space*/ uint32_t>;
+
+  // 1. Collect RangeInfos
+  llvm::SmallVector<RangeInfo> Infos;
+  for (const llvm::hlsl::rootsig::RootElement &Elem : D->getRootElements()) {
+    if (const auto *Descriptor =
+            std::get_if<llvm::hlsl::rootsig::RootDescriptor>(&Elem)) {
+      RangeInfo Info;
+      Info.LowerBound = Descriptor->Reg.Number;
+      Info.UpperBound = Info.LowerBound; // use inclusive ranges []
+
+      Info.Class =
+          llvm::dxil::ResourceClass(llvm::to_underlying(Descriptor->Type));
+      Info.Space = Descriptor->Space;
+      Info.Visibility = Descriptor->Visibility;
+      Infos.push_back(Info);
+    }
+  }
+
+  // 2. Sort the RangeInfo's by their GroupT to form groupings
+  std::sort(Infos.begin(), Infos.end(), [](RangeInfo A, RangeInfo B) {
+    return std::tie(A.Class, A.Space) < std::tie(B.Class, B.Space);
+  });
+
+  // 3. First we will init our state to track:
+  if (Infos.size() == 0)
+    return false; // No ranges to overlap
+  GroupT CurGroup = {Infos[0].Class, Infos[0].Space};
+  bool HadOverlap = false;
+
+  // Create a ResourceRange for each Visibility
+  ResourceRange::MapT::Allocator Allocator;
+  std::array<ResourceRange, 8> Ranges = {
+      ResourceRange(Allocator), // All
+      ResourceRange(Allocator), // Vertex
+      ResourceRange(Allocator), // Hull
+      ResourceRange(Allocator), // Domain
+      ResourceRange(Allocator), // Geometry
+      ResourceRange(Allocator), // Pixel
+      ResourceRange(Allocator), // Amplification
+      ResourceRange(Allocator), // Mesh
+  };
+
+  // Reset the ResourceRanges for when we iterate through a new group
+  auto ClearRanges = [&Ranges]() {
+    for (ResourceRange &Range : Ranges)
+      Range.clear();
+  };
+
+  // Helper to report diagnostics
+  auto ReportOverlap = [this, Loc, &HadOverlap](const RangeInfo *Info,
+                                                const RangeInfo *OInfo) {
+    HadOverlap = true;
+    auto CommonVis =
+        Info->Visibility == llvm::hlsl::rootsig::ShaderVisibility::All
+            ? OInfo->Visibility
+            : Info->Visibility;
+    this->Diag(Loc, diag::err_hlsl_resource_range_overlap)
+        << llvm::to_underlying(Info->Class) << Info->LowerBound
+        << Info->UpperBound << llvm::to_underlying(OInfo->Class)
+        << OInfo->LowerBound << OInfo->UpperBound << Info->Space << CommonVis;
+  };
+
+  // 3: Iterate through collected RangeInfos
+  for (const RangeInfo &Info : Infos) {
+    GroupT InfoGroup = {Info.Class, Info.Space};
+    // Reset our ResourceRanges when we enter a new group
+    if (CurGroup != InfoGroup) {
+      ClearRanges();
+      CurGroup = InfoGroup;
+    }
+
+    // 3A: Insert range info into corresponding Visibility ResourceRange
+    ResourceRange &VisRange = Ranges[llvm::to_underlying(Info.Visibility)];
+    if (std::optional<const RangeInfo *> Overlapping = VisRange.insert(Info))
+      ReportOverlap(&Info, Overlapping.value());
+
+    // 3B: Check for overlap in all overlapping Visibility ResourceRanges
+    //
+    // If the range that we are inserting has ShaderVisiblity::All it needs to
+    // check for an overlap in all other visibility types as well.
+    // Otherwise, the range that is inserted needs to check that it does not
+    // overlap with ShaderVisibility::All.
+    //
+    // OverlapRanges will be an ArrayRef to all non-all visibility
+    // ResourceRanges in the former case and it will be an ArrayRef to just the
+    // all visiblity ResourceRange in the latter case.
+    ArrayRef<ResourceRange> OverlapRanges =
+        Info.Visibility == llvm::hlsl::rootsig::ShaderVisibility::All
+            ? ArrayRef<ResourceRange>{Ranges}.drop_front()
+            : ArrayRef<ResourceRange>{Ranges}.take_front();
+
+    for (const ResourceRange &Range : OverlapRanges)
+      if (std::optional<const RangeInfo *> Overlapping =
+              Range.getOverlapping(Info))
+        ReportOverlap(&Info, Overlapping.value());
+  }
+
+  return HadOverlap;
+}
+
 void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) {
   if (AL.getNumArgs() != 1) {
     Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 1;
@@ -1093,7 +1222,6 @@ void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) {
   if (SemaRef.LookupQualifiedName(R, D->getDeclContext()))
     if (auto *SignatureDecl =
             dyn_cast<HLSLRootSignatureDecl>(R.getFoundDecl())) {
-      // Perform validation of constructs here
       D->addAttr(::new (getASTContext()) RootSignatureAttr(
           getASTContext(), AL, Ident, SignatureDecl));
     }
diff --git a/clang/test/SemaHLSL/RootSignature-resource-ranges-err.hlsl b/clang/test/SemaHLSL/RootSignature-resource-ranges-err.hlsl
new file mode 100644
index 000000000000..e5152e72d480
--- /dev/null
+++ b/clang/test/SemaHLSL/RootSignature-resource-ranges-err.hlsl
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - %s -verify
+
+// expected-error@+1 {{resource ranges b[42;42] and b[42;42] overlap within space = 0 and visibility = All}}
+[RootSignature("CBV(b42), CBV(b42)")]
+void bad_root_signature_0() {}
+
+// expected-error@+1 {{resource ranges t[0;0] and t[0;0] overlap within space = 3 and visibility = All}}
+[RootSignature("SRV(t0, space = 3), SRV(t0, space = 3)")]
+void bad_root_signature_1() {}
+
+// expected-error@+1 {{resource ranges u[0;0] and u[0;0] overlap within space = 0 and visibility = Pixel}}
+[RootSignature("UAV(u0, visibility = SHADER_VISIBILITY_PIXEL), UAV(u0, visibility = SHADER_VISIBILITY_PIXEL)")]
+void bad_root_signature_2() {}
+
+// expected-error@+1 {{resource ranges u[0;0] and u[0;0] overlap within space = 0 and visibility = Pixel}}
+[RootSignature("UAV(u0, visibility = SHADER_VISIBILITY_ALL), UAV(u0, visibility = SHADER_VISIBILITY_PIXEL)")]
+void bad_root_signature_3() {}
+
+// expected-error@+1 {{resource ranges u[0;0] and u[0;0] overlap within space = 0 and visibility = Pixel}}
+[RootSignature("UAV(u0, visibility = SHADER_VISIBILITY_PIXEL), UAV(u0, visibility = SHADER_VISIBILITY_ALL)")]
+void bad_root_signature_4() {}
diff --git a/clang/test/SemaHLSL/RootSignature-resource-ranges.hlsl b/clang/test/SemaHLSL/RootSignature-resource-ranges.hlsl
new file mode 100644
index 000000000000..5778fb2ae4eb
--- /dev/null
+++ b/clang/test/SemaHLSL/RootSignature-resource-ranges.hlsl
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - %s -verify
+
+// expected-no-diagnostics
+
+[RootSignature("CBV(b0), CBV(b1)")]
+void valid_root_signature_0() {}
+
+[RootSignature("CBV(b0, visibility = SHADER_VISIBILITY_DOMAIN), CBV(b0, visibility = SHADER_VISIBILITY_PIXEL)")]
+void valid_root_signature_1() {}
+
+[RootSignature("CBV(b0, space = 1), CBV(b0, space = 2)")]
+void valid_root_signature_2() {}
+
+[RootSignature("CBV(b0), SRV(t0)")]
+void valid_root_signature_3() {}
diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
index 25c2a9f0cc80..4769fd055996 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
@@ -71,13 +71,17 @@ private:
   SmallVector<Metadata *> GeneratedMetadata;
 };
 
-// RangeInfo holds the information to correctly construct a ResourceRange
-// and retains this information to be used for displaying a better diagnostic
 struct RangeInfo {
   const static uint32_t Unbounded = ~0u;
 
+  // Interval information
   uint32_t LowerBound;
   uint32_t UpperBound;
+
+  // Information retained for diagnostics
+  llvm::dxil::ResourceClass Class;
+  uint32_t Space;
+  ShaderVisibility Visibility;
 };
 
 class ResourceRange {
@@ -98,6 +102,9 @@ public:
   // Return the mapped RangeInfo at X or nullptr if no mapping exists
   const RangeInfo *lookup(uint32_t X) const;
 
+  // Removes all entries of the ResourceRange
+  void clear();
+
   // Insert the required (sub-)intervals such that the interval of [a;b] =
   // [Info.LowerBound, Info.UpperBound] is covered and points to a valid
   // RangeInfo &.
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
index a1ddb318055b..f95c141c54d8 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
@@ -509,6 +509,8 @@ const RangeInfo *ResourceRange::lookup(uint32_t X) const {
   return Intervals.lookup(X, nullptr);
 }
 
+void ResourceRange::clear() { return Intervals.clear(); }
+
 std::optional<const RangeInfo *> ResourceRange::insert(const RangeInfo &Info) {
   uint32_t LowerBound = Info.LowerBound;
   uint32_t UpperBound = Info.UpperBound;

From 94865edfa85a61dd4ad985d2fb86990a1bba357b Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <95053726+gandhi56@users.noreply.github.com>
Date: Fri, 20 Jun 2025 18:03:54 -0400
Subject: [PATCH 1123/1322] [Reland][InstCombine] Iterative replacement in
 PtrReplacer (#144626)

This patch enhances the PtrReplacer as follows:
1. Users are now collected iteratively to be generous on the stack. In the case of PHIs with incoming values which have not yet been visited, they are pushed back into the stack for reconsideration.
2. Replace users of the pointer root in a reverse-postorder traversal, instead of a simpletraversal over the collected users. This reordering ensures that the uses of an instruction are replaced before replacing the instruction itself.
3. During the replacement of PHI, use the same incoming value if it does not have a replacement.

This patch specifically fixes the case when an incoming value of a PHI
is addrspacecasted.

This is a reland of https://github.com/llvm/llvm-project/pull/137215.
---
 .../InstCombineLoadStoreAlloca.cpp            | 163 ++++++++++--------
 .../InstCombine/AMDGPU/ptr-replace-alloca.ll  |  79 +++++++++
 2 files changed, 174 insertions(+), 68 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index a9751ab03e20..9aec90120d8b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -243,11 +243,10 @@ public:
   void replacePointer(Value *V);
 
 private:
-  bool collectUsersRecursive(Instruction &I);
   void replace(Instruction *I);
-  Value *getReplacement(Value *I);
+  Value *getReplacement(Value *V) const { return WorkMap.lookup(V); }
   bool isAvailable(Instruction *I) const {
-    return I == &Root || Worklist.contains(I);
+    return I == &Root || UsersToReplace.contains(I);
   }
 
   bool isEqualOrValidAddrSpaceCast(const Instruction *I,
@@ -259,8 +258,7 @@ private:
     return (FromAS == ToAS) || IC.isValidAddrSpaceCast(FromAS, ToAS);
   }
 
-  SmallPtrSet<Instruction *, 32> ValuesToRevisit;
-  SmallSetVector<Instruction *, 4> Worklist;
+  SmallSetVector<Instruction *, 32> UsersToReplace;
   MapVector<Value *, Value *> WorkMap;
   InstCombinerImpl &IC;
   Instruction &Root;
@@ -269,72 +267,79 @@ private:
 } // end anonymous namespace
 
 bool PointerReplacer::collectUsers() {
-  if (!collectUsersRecursive(Root))
-    return false;
+  SmallVector<Instruction *> Worklist;
+  SmallSetVector<Instruction *, 32> ValuesToRevisit;
 
-  // Ensure that all outstanding (indirect) users of I
-  // are inserted into the Worklist. Return false
-  // otherwise.
-  return llvm::set_is_subset(ValuesToRevisit, Worklist);
-}
+  auto PushUsersToWorklist = [&](Instruction *Inst) {
+    for (auto *U : Inst->users())
+      if (auto *I = dyn_cast<Instruction>(U))
+        if (!isAvailable(I) && !ValuesToRevisit.contains(I))
+          Worklist.emplace_back(I);
+  };
 
-bool PointerReplacer::collectUsersRecursive(Instruction &I) {
-  for (auto *U : I.users()) {
-    auto *Inst = cast<Instruction>(&*U);
+  PushUsersToWorklist(&Root);
+  while (!Worklist.empty()) {
+    Instruction *Inst = Worklist.pop_back_val();
     if (auto *Load = dyn_cast<LoadInst>(Inst)) {
       if (Load->isVolatile())
         return false;
-      Worklist.insert(Load);
+      UsersToReplace.insert(Load);
     } else if (auto *PHI = dyn_cast<PHINode>(Inst)) {
-      // All incoming values must be instructions for replacability
-      if (any_of(PHI->incoming_values(),
-                 [](Value *V) { return !isa<Instruction>(V); }))
-        return false;
-
-      // If at least one incoming value of the PHI is not in Worklist,
-      // store the PHI for revisiting and skip this iteration of the
-      // loop.
-      if (any_of(PHI->incoming_values(), [this](Value *V) {
-            return !isAvailable(cast<Instruction>(V));
+      /// TODO: Handle poison and null pointers for PHI and select.
+      // If all incoming values are available, mark this PHI as
+      // replacable and push it's users into the worklist.
+      bool IsReplacable = true;
+      if (all_of(PHI->incoming_values(), [&](Value *V) {
+            if (!isa<Instruction>(V))
+              return IsReplacable = false;
+            return isAvailable(cast<Instruction>(V));
           })) {
-        ValuesToRevisit.insert(Inst);
+        UsersToReplace.insert(PHI);
+        PushUsersToWorklist(PHI);
         continue;
       }
 
-      Worklist.insert(PHI);
-      if (!collectUsersRecursive(*PHI))
+      // Either an incoming value is not an instruction or not all
+      // incoming values are available. If this PHI was already
+      // visited prior to this iteration, return false.
+      if (!IsReplacable || !ValuesToRevisit.insert(PHI))
         return false;
+
+      // Push PHI back into the stack, followed by unavailable
+      // incoming values.
+      Worklist.emplace_back(PHI);
+      for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); ++Idx) {
+        auto *IncomingValue = cast<Instruction>(PHI->getIncomingValue(Idx));
+        if (UsersToReplace.contains(IncomingValue))
+          continue;
+        if (!ValuesToRevisit.insert(IncomingValue))
+          return false;
+        Worklist.emplace_back(IncomingValue);
+      }
     } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
-      if (!isa<Instruction>(SI->getTrueValue()) ||
-          !isa<Instruction>(SI->getFalseValue()))
+      auto *TrueInst = dyn_cast<Instruction>(SI->getTrueValue());
+      auto *FalseInst = dyn_cast<Instruction>(SI->getFalseValue());
+      if (!TrueInst || !FalseInst)
         return false;
 
-      if (!isAvailable(cast<Instruction>(SI->getTrueValue())) ||
-          !isAvailable(cast<Instruction>(SI->getFalseValue()))) {
-        ValuesToRevisit.insert(Inst);
-        continue;
-      }
-      Worklist.insert(SI);
-      if (!collectUsersRecursive(*SI))
-        return false;
-    } else if (isa<GetElementPtrInst>(Inst)) {
-      Worklist.insert(Inst);
-      if (!collectUsersRecursive(*Inst))
-        return false;
+      UsersToReplace.insert(SI);
+      PushUsersToWorklist(SI);
+    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+      UsersToReplace.insert(GEP);
+      PushUsersToWorklist(GEP);
     } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
       if (MI->isVolatile())
         return false;
-      Worklist.insert(Inst);
+      UsersToReplace.insert(Inst);
     } else if (isEqualOrValidAddrSpaceCast(Inst, FromAS)) {
-      Worklist.insert(Inst);
-      if (!collectUsersRecursive(*Inst))
-        return false;
+      UsersToReplace.insert(Inst);
+      PushUsersToWorklist(Inst);
     } else if (Inst->isLifetimeStartOrEnd()) {
       continue;
     } else {
       // TODO: For arbitrary uses with address space mismatches, should we check
       // if we can introduce a valid addrspacecast?
-      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
+      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *Inst << '\n');
       return false;
     }
   }
@@ -342,7 +347,39 @@ bool PointerReplacer::collectUsersRecursive(Instruction &I) {
   return true;
 }
 
-Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
+void PointerReplacer::replacePointer(Value *V) {
+  assert(cast<PointerType>(Root.getType()) != cast<PointerType>(V->getType()) &&
+         "Invalid usage");
+  WorkMap[&Root] = V;
+  SmallVector<Instruction *> Worklist;
+  SetVector<Instruction *> PostOrderWorklist;
+  SmallPtrSet<Instruction *, 32> Visited;
+
+  // Perform a postorder traversal of the users of Root.
+  Worklist.push_back(&Root);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+
+    // If I has not been processed before, push each of its
+    // replacable users into the worklist.
+    if (Visited.insert(I).second) {
+      for (auto *U : I->users()) {
+        auto *UserInst = cast<Instruction>(U);
+        if (UsersToReplace.contains(UserInst))
+          Worklist.push_back(UserInst);
+      }
+      // Otherwise, users of I have already been pushed into
+      // the PostOrderWorklist. Push I as well.
+    } else {
+      PostOrderWorklist.insert(I);
+      Worklist.pop_back();
+    }
+  }
+
+  // Replace pointers in reverse-postorder.
+  for (Instruction *I : reverse(PostOrderWorklist))
+    replace(I);
+}
 
 void PointerReplacer::replace(Instruction *I) {
   if (getReplacement(I))
@@ -364,13 +401,15 @@ void PointerReplacer::replace(Instruction *I) {
     // replacement (new value).
     WorkMap[NewI] = NewI;
   } else if (auto *PHI = dyn_cast<PHINode>(I)) {
-    Type *NewTy = getReplacement(PHI->getIncomingValue(0))->getType();
-    auto *NewPHI = PHINode::Create(NewTy, PHI->getNumIncomingValues(),
-                                   PHI->getName(), PHI->getIterator());
-    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I)
-      NewPHI->addIncoming(getReplacement(PHI->getIncomingValue(I)),
-                          PHI->getIncomingBlock(I));
-    WorkMap[PHI] = NewPHI;
+    // Create a new PHI by replacing any incoming value that is a user of the
+    // root pointer and has a replacement.
+    Value *V = WorkMap.lookup(PHI->getIncomingValue(0));
+    PHI->mutateType(V ? V->getType() : PHI->getIncomingValue(0)->getType());
+    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I) {
+      Value *V = WorkMap.lookup(PHI->getIncomingValue(I));
+      PHI->setIncomingValue(I, V ? V : PHI->getIncomingValue(I));
+    }
+    WorkMap[PHI] = PHI;
   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
     auto *V = getReplacement(GEP->getPointerOperand());
     assert(V && "Operand not replaced");
@@ -434,18 +473,6 @@ void PointerReplacer::replace(Instruction *I) {
   }
 }
 
-void PointerReplacer::replacePointer(Value *V) {
-#ifndef NDEBUG
-  auto *PT = cast<PointerType>(Root.getType());
-  auto *NT = cast<PointerType>(V->getType());
-  assert(PT != NT && "Invalid usage");
-#endif
-  WorkMap[&Root] = V;
-
-  for (Instruction *Workitem : Worklist)
-    replace(Workitem);
-}
-
 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
   if (auto *I = simplifyAllocaArraySize(*this, AI, DT))
     return I;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
new file mode 100644
index 000000000000..538cc19f9722
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S < %s | FileCheck %s
+
+%struct.type = type { [256 x <2 x i64>] }
+@g1 = external hidden addrspace(3) global %struct.type, align 16
+
+; This test requires the PtrReplacer to replace users in an RPO traversal.
+; Furthermore, %ptr.else need not to be replaced so it must be retained in
+; %ptr.sink.
+define <2 x i64> @func(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
+; CHECK-LABEL: define <2 x i64> @func(
+; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[CMP_0]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[VAL_THEN:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+; CHECK-NEXT:    br label %[[SINK:.*]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    [[PTR_ELSE:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+; CHECK-NEXT:    br label %[[SINK]]
+; CHECK:       [[SINK]]:
+; CHECK-NEXT:    [[PTR_SINK:%.*]] = phi ptr [ [[PTR_ELSE]], %[[IF_ELSE]] ], [ [[VAL_THEN]], %[[IF_THEN]] ]
+; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_SINK]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
+;
+entry:
+  %coerce = alloca %struct.type, align 16, addrspace(5)
+  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
+  br i1 %cmp.0, label %if.then, label %if.else
+
+if.then:                                    ; preds = %entry
+  %ptr.then = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
+  %val.then = addrspacecast ptr addrspace(5) %ptr.then to ptr
+  br label %sink
+
+if.else:                                      ; preds = %entry
+  %ptr.else = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+  %val.else = getelementptr inbounds nuw i8, ptr %ptr.else, i64 0
+  br label %sink
+
+sink:
+  %ptr.sink = phi ptr [ %val.else, %if.else ], [ %val.then, %if.then ]
+  %val.sink = load <2 x i64>, ptr %ptr.sink, align 16
+  ret <2 x i64> %val.sink
+}
+
+define <2 x i64> @func_phi_loop(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
+; CHECK-LABEL: define <2 x i64> @func_phi_loop(
+; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[VAL_0:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[PTR_PHI_R:%.*]] = phi ptr [ [[PTR_1:%.*]], %[[LOOP]] ], [ [[VAL_0]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[PTR_1]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+; CHECK-NEXT:    br i1 [[CMP_0]], label %[[LOOP]], label %[[SINK:.*]]
+; CHECK:       [[SINK]]:
+; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_PHI_R]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
+;
+entry:
+  %coerce = alloca %struct.type, align 16, addrspace(5)
+  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
+  %ptr.0 = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
+  %val.0 = addrspacecast ptr addrspace(5) %ptr.0 to ptr
+  br label %loop
+
+loop:
+  %ptr.phi = phi ptr [ %val.1, %loop ], [ %val.0, %entry ]
+  %ptr.1 = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+  %val.1 = getelementptr inbounds nuw i8, ptr %ptr.1, i64 0
+  br i1 %cmp.0, label %loop, label %sink
+
+sink:
+  %val.sink = load <2 x i64>, ptr %ptr.phi, align 16
+  ret <2 x i64> %val.sink
+}
+
+declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0

From b7be8786af42d131974ec9cfc3ba79b264511b7b Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Fri, 20 Jun 2025 15:07:00 -0700
Subject: [PATCH 1124/1322] Reapply "[CI] Migrate to runtimes build" (#143612)

This reverts commit 6f62979a5a5bcf70d65f23e0991a274e6df5955b.

The reapplies commit 80ea5f46df3e365a0a2112889bb91732167b6214.

That commit was reverted because it was causing compiler-rt test
failures due to tysan not having its dependencies set up properly within
CMake. That situation has since been rectified in
3cef099ceddccefca8e11268624397cde9e04af6.

Reviewers: lnihlen, rnk, gburgessiv, cmtice

Reviewed By: rnk, cmtice

Pull Request: https://github.com/llvm/llvm-project/pull/144033
---
 .ci/compute_projects.py         | 119 ++++++++++++++++++++------------
 .ci/compute_projects_test.py    |  55 +++++++++++++--
 .ci/monolithic-linux.sh         |  13 +++-
 .github/workflows/premerge.yaml |   3 +-
 4 files changed, 139 insertions(+), 51 deletions(-)

diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index 40dd0507a9ea..e61b8dc5021f 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -49,8 +49,7 @@ DEPENDENTS_TO_TEST = {
     },
     "lld": {"bolt", "cross-project-tests"},
     # TODO(issues/132795): LLDB should be enabled on clang changes.
-    "clang": {"clang-tools-extra", "compiler-rt", "cross-project-tests"},
-    "clang-tools-extra": {"libc"},
+    "clang": {"clang-tools-extra", "cross-project-tests"},
     "mlir": {"flang"},
     # Test everything if ci scripts are changed.
     # FIXME: Figure out what is missing and add here.
@@ -64,7 +63,15 @@ DEPENDENT_RUNTIMES_TO_BUILD = {"lldb": {"libcxx", "libcxxabi", "libunwind"}}
 
 # This mapping describes runtimes that should be tested when the key project is
 # touched.
-DEPENDENT_RUNTIMES_TO_TEST = {"clang": {"libcxx", "libcxxabi", "libunwind"}}
+DEPENDENT_RUNTIMES_TO_TEST = {
+    "clang": {"compiler-rt"},
+    "clang-tools-extra": {"libc"},
+}
+DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG = {
+    "llvm": {"libcxx", "libcxxabi", "libunwind"},
+    "clang": {"libcxx", "libcxxabi", "libunwind"},
+    ".ci": {"libcxx", "libcxxabi", "libunwind"},
+}
 
 EXCLUDE_LINUX = {
     "cross-project-tests",  # TODO(issues/132796): Tests are failing.
@@ -93,9 +100,6 @@ EXCLUDE_MAC = {
     "cross-project-tests",
     "flang",
     "libc",
-    "libcxx",
-    "libcxxabi",
-    "libunwind",
     "lldb",
     "openmp",
     "polly",
@@ -122,10 +126,10 @@ PROJECT_CHECK_TARGETS = {
     "polly": "check-polly",
 }
 
-RUNTIMES = {"libcxx", "libcxxabi", "libunwind"}
+RUNTIMES = {"libcxx", "libcxxabi", "libunwind", "compiler-rt", "libc"}
 
 
-def _add_dependencies(projects: Set[str]) -> Set[str]:
+def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]:
     projects_with_dependents = set(projects)
     current_projects_count = 0
     while current_projects_count != len(projects_with_dependents):
@@ -134,9 +138,25 @@ def _add_dependencies(projects: Set[str]) -> Set[str]:
             if project not in PROJECT_DEPENDENCIES:
                 continue
             projects_with_dependents.update(PROJECT_DEPENDENCIES[project])
+    for runtime in runtimes:
+        if runtime not in PROJECT_DEPENDENCIES:
+            continue
+        projects_with_dependents.update(PROJECT_DEPENDENCIES[runtime])
     return projects_with_dependents
 
 
+def _exclude_projects(current_projects: Set[str], platform: str) -> Set[str]:
+    if platform == "Linux":
+        to_exclude = EXCLUDE_LINUX
+    elif platform == "Windows":
+        to_exclude = EXCLUDE_WINDOWS
+    elif platform == "Darwin":
+        to_exclude = EXCLUDE_MAC
+    else:
+        raise ValueError(f"Unexpected platform: {platform}")
+    return current_projects.difference(to_exclude)
+
+
 def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set[str]:
     projects_to_test = set()
     for modified_project in modified_projects:
@@ -154,25 +174,14 @@ def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set
             ):
                 continue
             projects_to_test.add(dependent_project)
-    if platform == "Linux":
-        for to_exclude in EXCLUDE_LINUX:
-            if to_exclude in projects_to_test:
-                projects_to_test.remove(to_exclude)
-    elif platform == "Windows":
-        for to_exclude in EXCLUDE_WINDOWS:
-            if to_exclude in projects_to_test:
-                projects_to_test.remove(to_exclude)
-    elif platform == "Darwin":
-        for to_exclude in EXCLUDE_MAC:
-            if to_exclude in projects_to_test:
-                projects_to_test.remove(to_exclude)
-    else:
-        raise ValueError("Unexpected platform.")
+    projects_to_test = _exclude_projects(projects_to_test, platform)
     return projects_to_test
 
 
-def _compute_projects_to_build(projects_to_test: Set[str]) -> Set[str]:
-    return _add_dependencies(projects_to_test)
+def _compute_projects_to_build(
+    projects_to_test: Set[str], runtimes: Set[str]
+) -> Set[str]:
+    return _add_dependencies(projects_to_test, runtimes)
 
 
 def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]:
@@ -184,24 +193,36 @@ def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]:
     return check_targets
 
 
-def _compute_runtimes_to_test(projects_to_test: Set[str]) -> Set[str]:
+def _compute_runtimes_to_test(modified_projects: Set[str], platform: str) -> Set[str]:
     runtimes_to_test = set()
-    for project_to_test in projects_to_test:
-        if project_to_test in DEPENDENT_RUNTIMES_TO_TEST:
-            runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[project_to_test])
-        if project_to_test in DEPENDENT_RUNTIMES_TO_BUILD:
-            runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_BUILD[project_to_test])
-    return runtimes_to_test
-
-
-def _compute_runtime_check_targets(projects_to_test: Set[str]) -> Set[str]:
-    check_targets = set()
-    for project_to_test in projects_to_test:
-        if project_to_test not in DEPENDENT_RUNTIMES_TO_TEST:
+    for modified_project in modified_projects:
+        if modified_project not in DEPENDENT_RUNTIMES_TO_TEST:
             continue
-        for runtime_to_test in DEPENDENT_RUNTIMES_TO_TEST[project_to_test]:
-            check_targets.add(PROJECT_CHECK_TARGETS[runtime_to_test])
-    return check_targets
+        runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[modified_project])
+    return _exclude_projects(runtimes_to_test, platform)
+
+
+def _compute_runtimes_to_test_needs_reconfig(
+    modified_projects: Set[str], platform: str
+) -> Set[str]:
+    runtimes_to_test = set()
+    for modified_project in modified_projects:
+        if modified_project not in DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG:
+            continue
+        runtimes_to_test.update(
+            DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG[modified_project]
+        )
+    return _exclude_projects(runtimes_to_test, platform)
+
+
+def _compute_runtimes_to_build(
+    runtimes_to_test: Set[str], modified_projects: Set[str], platform: str
+) -> Set[str]:
+    runtimes_to_build = set(runtimes_to_test)
+    for modified_project in modified_projects:
+        if modified_project in DEPENDENT_RUNTIMES_TO_BUILD:
+            runtimes_to_build.update(DEPENDENT_RUNTIMES_TO_BUILD[modified_project])
+    return _exclude_projects(runtimes_to_build, platform)
 
 
 def _get_modified_projects(modified_files: list[str]) -> Set[str]:
@@ -225,10 +246,19 @@ def _get_modified_projects(modified_files: list[str]) -> Set[str]:
 def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
     modified_projects = _get_modified_projects(modified_files)
     projects_to_test = _compute_projects_to_test(modified_projects, platform)
-    projects_to_build = _compute_projects_to_build(projects_to_test)
+    runtimes_to_test = _compute_runtimes_to_test(modified_projects, platform)
+    runtimes_to_test_needs_reconfig = _compute_runtimes_to_test_needs_reconfig(
+        modified_projects, platform
+    )
+    runtimes_to_build = _compute_runtimes_to_build(
+        runtimes_to_test | runtimes_to_test_needs_reconfig, modified_projects, platform
+    )
+    projects_to_build = _compute_projects_to_build(projects_to_test, runtimes_to_build)
     projects_check_targets = _compute_project_check_targets(projects_to_test)
-    runtimes_to_build = _compute_runtimes_to_test(projects_to_test)
-    runtimes_check_targets = _compute_runtime_check_targets(projects_to_test)
+    runtimes_check_targets = _compute_project_check_targets(runtimes_to_test)
+    runtimes_check_targets_needs_reconfig = _compute_project_check_targets(
+        runtimes_to_test_needs_reconfig
+    )
     # We use a semicolon to separate the projects/runtimes as they get passed
     # to the CMake invocation and thus we need to use the CMake list separator
     # (;). We use spaces to separate the check targets as they end up getting
@@ -238,6 +268,9 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
         "project_check_targets": " ".join(sorted(projects_check_targets)),
         "runtimes_to_build": ";".join(sorted(runtimes_to_build)),
         "runtimes_check_targets": " ".join(sorted(runtimes_check_targets)),
+        "runtimes_check_targets_needs_reconfig": " ".join(
+            sorted(runtimes_check_targets_needs_reconfig)
+        ),
     }
 
 
diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py
index ae376ea6a43c..6bc2e34a1cbe 100644
--- a/.ci/compute_projects_test.py
+++ b/.ci/compute_projects_test.py
@@ -26,6 +26,10 @@ class TestComputeProjects(unittest.TestCase):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
+            "",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -46,6 +50,10 @@ class TestComputeProjects(unittest.TestCase):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
+            "",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -66,6 +74,10 @@ class TestComputeProjects(unittest.TestCase):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
+            "",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -75,17 +87,21 @@ class TestComputeProjects(unittest.TestCase):
         )
         self.assertEqual(
             env_variables["projects_to_build"],
-            "clang;clang-tools-extra;compiler-rt;lld;llvm",
+            "clang;clang-tools-extra;lld;llvm",
         )
         self.assertEqual(
             env_variables["project_check_targets"],
-            "check-clang check-clang-tools check-compiler-rt",
+            "check-clang check-clang-tools",
         )
         self.assertEqual(
-            env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind"
+            env_variables["runtimes_to_build"], "compiler-rt;libcxx;libcxxabi;libunwind"
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
+            "check-compiler-rt",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -104,6 +120,10 @@ class TestComputeProjects(unittest.TestCase):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
+            "",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -115,6 +135,7 @@ class TestComputeProjects(unittest.TestCase):
         self.assertEqual(env_variables["project_check_targets"], "check-bolt")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_lldb(self):
         env_variables = compute_projects.get_env_variables(
@@ -124,6 +145,7 @@ class TestComputeProjects(unittest.TestCase):
         self.assertEqual(env_variables["project_check_targets"], "check-lldb")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_mlir(self):
         env_variables = compute_projects.get_env_variables(
@@ -135,6 +157,7 @@ class TestComputeProjects(unittest.TestCase):
         )
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_flang(self):
         env_variables = compute_projects.get_env_variables(
@@ -144,6 +167,7 @@ class TestComputeProjects(unittest.TestCase):
         self.assertEqual(env_variables["project_check_targets"], "check-flang")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_invalid_subproject(self):
         env_variables = compute_projects.get_env_variables(
@@ -153,6 +177,7 @@ class TestComputeProjects(unittest.TestCase):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_top_level_file(self):
         env_variables = compute_projects.get_env_variables(["README.md"], "Linux")
@@ -160,6 +185,7 @@ class TestComputeProjects(unittest.TestCase):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_exclude_runtiems_in_projects(self):
         env_variables = compute_projects.get_env_variables(
@@ -169,6 +195,7 @@ class TestComputeProjects(unittest.TestCase):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_exclude_docs(self):
         env_variables = compute_projects.get_env_variables(
@@ -178,6 +205,7 @@ class TestComputeProjects(unittest.TestCase):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_exclude_gn(self):
         env_variables = compute_projects.get_env_variables(
@@ -187,6 +215,7 @@ class TestComputeProjects(unittest.TestCase):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_ci(self):
         env_variables = compute_projects.get_env_variables(
@@ -198,10 +227,15 @@ class TestComputeProjects(unittest.TestCase):
             "check-clang check-lld check-lldb check-llvm",
         )
         self.assertEqual(
-            env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind"
+            env_variables["runtimes_to_build"],
+            "libcxx;libcxxabi;libunwind",
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
+            "",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -215,6 +249,19 @@ class TestComputeProjects(unittest.TestCase):
             env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind"
         )
         self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+
+    def test_clang_tools_extra(self):
+        env_variables = compute_projects.get_env_variables(
+            ["clang-tools-extra/CMakeLists.txt"], "Linux"
+        )
+        self.assertEqual(
+            env_variables["projects_to_build"], "clang;clang-tools-extra;lld;llvm"
+        )
+        self.assertEqual(env_variables["project_check_targets"], "check-clang-tools")
+        self.assertEqual(env_variables["runtimes_to_build"], "libc")
+        self.assertEqual(env_variables["runtimes_check_targets"], "check-libc")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
 
 if __name__ == "__main__":
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index 7503ea4e6a99..c350a5867914 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -57,6 +57,7 @@ projects="${1}"
 targets="${2}"
 runtimes="${3}"
 runtime_targets="${4}"
+runtime_targets_needs_reconfig="${5}"
 
 lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests"
 
@@ -93,9 +94,15 @@ echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
 ninja -C "${BUILD_DIR}" -k 0 ${targets}
 
+if [[ "${runtime_targets}" != "" ]]; then
+  echo "--- ninja runtimes"
+
+  ninja -C "${BUILD_DIR}" ${runtime_targets}
+fi
+
 # Compiling runtimes with just-built Clang and running their tests
 # as an additional testing for Clang.
-if [[ "${runtimes_targets}" != "" ]]; then
+if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then
   echo "--- cmake runtimes C++26"
 
   cmake \
@@ -105,7 +112,7 @@ if [[ "${runtimes_targets}" != "" ]]; then
 
   echo "--- ninja runtimes C++26"
 
-  ninja -C "${BUILD_DIR}" ${runtime_targets}
+  ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig}
 
   echo "--- cmake runtimes clang modules"
 
@@ -116,5 +123,5 @@ if [[ "${runtimes_targets}" != "" ]]; then
 
   echo "--- ninja runtimes clang modules"
 
-  ninja -C "${BUILD_DIR}" ${runtime_targets}
+  ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig}
 fi
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 709b6d03d94c..4435a3e90576 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -56,11 +56,12 @@ jobs:
           echo "Running project checks targets: ${project_check_targets}"
           echo "Building runtimes: ${runtimes_to_build}"
           echo "Running runtimes checks targets: ${runtimes_check_targets}"
+          echo "Running runtimes checks requiring reconfiguring targets: ${runtimes_check_targets_needs_reconfig}"
 
           export CC=/opt/llvm/bin/clang
           export CXX=/opt/llvm/bin/clang++
 
-          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}"
+          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}"
       - name: Upload Artifacts
         uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
         with:

From 1db9afb102acbb447ee374851d11fcdbea7fe5ec Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail@bennani.ma>
Date: Fri, 20 Jun 2025 15:26:14 -0700
Subject: [PATCH 1125/1322] [lldb/crashlog] Make registers always available &
 fix x29/x30 parsing (#145104)

This patch addresses 2 issues:
1. It makes registers available on non-crashed threads all the time
2. It fixes arm64 registers parsing for registers that don't use the `x`
prefix (`fp` -> `x29` / `lr` -> `x30`)

---------

Signed-off-by: Med Ismail Bennani <ismail@bennani.ma>
---
 lldb/examples/python/crashlog.py              |  8 +-
 .../python/crashlog_scripted_process.py       | 12 +--
 .../interactive_crashlog_arm64_register.test  | 80 +++++++++++++++++++
 3 files changed, 91 insertions(+), 9 deletions(-)
 create mode 100644 lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_arm64_register.test

diff --git a/lldb/examples/python/crashlog.py b/lldb/examples/python/crashlog.py
index 5f07cda2892a..bb20f3a25c1c 100755
--- a/lldb/examples/python/crashlog.py
+++ b/lldb/examples/python/crashlog.py
@@ -777,10 +777,10 @@ class JSONCrashLogParser(CrashLogParser):
             if json_thread.get("triggered", False):
                 self.crashlog.crashed_thread_idx = idx
                 thread.crashed = True
-                if "threadState" in json_thread:
-                    thread.registers = self.parse_thread_registers(
-                        json_thread["threadState"]
-                    )
+            if "threadState" in json_thread:
+                thread.registers = self.parse_thread_registers(
+                    json_thread["threadState"]
+                )
             if "queue" in json_thread:
                 thread.queue = json_thread.get("queue")
             self.parse_frames(thread, json_thread.get("frames", []))
diff --git a/lldb/examples/python/crashlog_scripted_process.py b/lldb/examples/python/crashlog_scripted_process.py
index be0ed49d3590..f54a8df0479e 100644
--- a/lldb/examples/python/crashlog_scripted_process.py
+++ b/lldb/examples/python/crashlog_scripted_process.py
@@ -123,11 +123,6 @@ class CrashLogScriptedProcess(ScriptedProcess):
 
 class CrashLogScriptedThread(ScriptedThread):
     def create_register_ctx(self):
-        if not self.has_crashed:
-            return dict.fromkeys(
-                [*map(lambda reg: reg["name"], self.register_info["registers"])], 0
-            )
-
         if not self.backing_thread or not len(self.backing_thread.registers):
             return dict.fromkeys(
                 [*map(lambda reg: reg["name"], self.register_info["registers"])], 0
@@ -135,8 +130,15 @@ class CrashLogScriptedThread(ScriptedThread):
 
         for reg in self.register_info["registers"]:
             reg_name = reg["name"]
+            reg_alt_name = None
+            if "alt-name" in reg:
+                reg_alt_name = reg["alt-name"]
             if reg_name in self.backing_thread.registers:
                 self.register_ctx[reg_name] = self.backing_thread.registers[reg_name]
+            elif reg_alt_name and reg_alt_name in self.backing_thread.registers:
+                self.register_ctx[reg_name] = self.backing_thread.registers[
+                    reg_alt_name
+                ]
             else:
                 self.register_ctx[reg_name] = 0
 
diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_arm64_register.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_arm64_register.test
new file mode 100644
index 000000000000..3f572c3300c0
--- /dev/null
+++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_arm64_register.test
@@ -0,0 +1,80 @@
+# REQUIRES: python, native && system-darwin
+
+# RUN: mkdir -p %t.dir
+# RUN: yaml2obj %S/Inputs/interactive_crashlog/multithread-test.yaml > %t.dir/multithread-test
+# RUN: %lldb -o 'command script import lldb.macosx.crashlog' \
+# RUN: -o 'crashlog -a -t %t.dir/multithread-test %S/Inputs/interactive_crashlog/multithread-test.ips' \
+# RUN: -o "thread list" -o "bt all" -o "register read" 2>&1 | FileCheck %s
+
+# CHECK: "crashlog" {{.*}} commands have been installed, use the "--help" options on these commands
+
+# CHECK: (lldb) process status
+# CHECK-NEXT: Process 22511 stopped
+# CHECK-NEXT: * thread #3, stop reason = EXC_BAD_ACCESS (code=1, address=0x0)
+# CHECK-NEXT:     frame #0: 0x0000000100ec58f4 multithread-test`bar
+
+# CHECK: (lldb) thread backtrace
+# CHECK-NEXT: * thread #3, stop reason = EXC_BAD_ACCESS (code=1, address=0x0)
+# CHECK-NEXT:   * frame #0: 0x0000000100ec58f4 multithread-test`bar{{.*}} [artificial]
+# CHECK-NEXT:     frame #1: 0x0000000100ec591b multithread-test`foo{{.*}} [artificial]
+# CHECK-NEXT:     frame #2: 0x0000000100ec5a87 multithread-test`compute_pow{{.*}} [artificial]
+
+# CHECK: (lldb) thread list
+# CHECK-NEXT: Process 22511 stopped
+# CHECK-NEXT:   thread #1: tid = 0x23c7fe, 0x000000019cc40b84{{.*}}, queue = 'com.apple.main-thread'
+# CHECK-NEXT:   thread #2: tid = 0x23c800, 0x000000019cc42c9c{{.*}}
+# CHECK-NEXT: * thread #3: tid = 0x23c801, 0x0000000100ec58f4 multithread-test`bar{{.*}}, stop reason = EXC_BAD_ACCESS (code=1, address=0x0)
+
+# CHECK: (lldb) bt all
+# CHECK:  thread #1, queue = 'com.apple.main-thread'
+# CHECK:    frame #{{[0-9]+}}: 0x000000019cc40b84{{.*}} [artificial]
+# CHECK:    frame #{{[0-9]+}}: 0x0000000100ec5b3b multithread-test`main{{.*}} [artificial]
+# CHECK:    frame #{{[0-9]+}}: 0x00000002230f8da7{{.*}} [artificial]
+# CHECK-NEXT:  thread #2
+# CHECK-NEXT:    frame #0: 0x000000019cc42c9c{{.*}} [artificial]
+# CHECK:    frame #{{[0-9]+}}: 0x0000000100ec5957 multithread-test`call_and_wait{{.*}} [artificial]
+# CHECK:    frame #{{[0-9]+}}: 0x000000019cc7e06b{{.*}} [artificial]
+# CHECK:    frame #{{[0-9]+}}: 0x000000019cc78e2b{{.*}} [artificial]
+# CHECK-NEXT:* thread #3, stop reason = EXC_BAD_ACCESS (code=1, address=0x0)
+# CHECK-NEXT:  * frame #0: 0x0000000100ec58f4 multithread-test`bar{{.*}} [artificial]
+# CHECK-NEXT:    frame #1: 0x0000000100ec591b multithread-test`foo{{.*}} [artificial]
+# CHECK-NEXT:    frame #2: 0x0000000100ec5a87 multithread-test`compute_pow{{.*}} [artificial]
+# CHECK:    frame #{{[0-9]+}}: 0x000000019cc7e06b{{.*}} [artificial]
+# CHECK:    frame #{{[0-9]+}}: 0x000000019cc78e2b{{.*}} [artificial]
+
+# CHECK: (lldb) register read
+# CHECK: General Purpose Registers:
+# CHECK:         x0 = 0x000000000000002a
+# CHECK:         x1 = 0x0000600001d291b0
+# CHECK:         x2 = 0x000000019cbbf000
+# CHECK:         x3 = 0x0000000000000000
+# CHECK:         x4 = 0x00000000000030a0
+# CHECK:         x5 = 0x00000000190008ff
+# CHECK:         x6 = 0x0000000000000000
+# CHECK:         x7 = 0x0000000000000000
+# CHECK:         x8 = 0x0000000000000001
+# CHECK:         x9 = 0x0000000000000000
+# CHECK:        x10 = 0xfffffffe634277cf
+# CHECK:        x11 = 0x0000010000000102
+# CHECK:        x12 = 0x0000010000000102
+# CHECK:        x13 = 0x0000010000000100
+# CHECK:        x14 = 0x0000010000000000
+# CHECK:        x15 = 0x0000000000000001
+# CHECK:        x16 = 0x000000019cc78ea8
+# CHECK:        x17 = 0x00000001fd0a7698
+# CHECK:        x18 = 0x0000000000000000
+# CHECK:        x19 = 0x000000016f04f000
+# CHECK:        x20 = 0x0000000000000000
+# CHECK:        x21 = 0x0000000000000000
+# CHECK:        x22 = 0x0000000000000000
+# CHECK:        x23 = 0x0000000000000000
+# CHECK:        x24 = 0x0000000000000000
+# CHECK:        x25 = 0x0000000000000000
+# CHECK:        x26 = 0x0000000000000000
+# CHECK:        x27 = 0x0000000000000000
+# CHECK:        x28 = 0x0000000000000000
+# CHECK:        x29 = 0x000000016f04ef00
+# CHECK:        x30 = 0x0000000100ec591c
+# CHECK:         sp = 0x000000016f04eee0
+# CHECK:         pc = 0x0000000100ec58f4
+# CHECK:       cpsr = 0x80001000

From 1753aba0342bd286d65d1c8465b9fa2659424766 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 20 Jun 2025 23:38:30 +0100
Subject: [PATCH 1126/1322] [X86] combineINSERT_SUBVECTOR - directly fold to
 X86ISD::SUBV_BROADCAST_LOAD to prevent vector split infinite loop (#145077)

This reverts #140919 / f1d03dedfbe87119cfcafb07e0e0f90ec291cb97 - which
could result in another fold trying to split the concatenation apart
again before it was folded to a SUBV_BROADCAST_LOAD
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  10 +-
 llvm/test/CodeGen/X86/oddshuffles.ll          |  47 ++
 .../vector-interleaved-store-i8-stride-7.ll   | 556 +++++++++---------
 3 files changed, 332 insertions(+), 281 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 35d7b8084405..33083c0eba69 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59465,8 +59465,7 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
   }
 
   // If we're splatting the lower half subvector of a full vector load into the
-  // upper half, just splat the subvector directly, potentially creating a
-  // subvector broadcast.
+  // upper half, attempt to create a subvector broadcast.
   if ((int)IdxVal == (VecNumElts / 2) &&
       Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
     auto *VecLd = dyn_cast<LoadSDNode>(Vec);
@@ -59474,7 +59473,12 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
     if (VecLd && SubLd &&
         DAG.areNonVolatileConsecutiveLoads(
             SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
-      return concatSubVectors(SubVec, SubVec, DAG, dl);
+      SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
+                                          SubVecVT, SubLd, 0, DAG);
+      SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
+                                      BcastLd, DAG.getVectorIdxConstant(0, dl));
+      DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
+      return BcastLd;
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 6b9a86343ea1..4b0f75df83a7 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2508,3 +2508,50 @@ define void @D107009(ptr %input, ptr %output) {
   store <64 x i32> %i7, ptr %output, align 16
   ret void
 }
+
+; Ensure concatenation of repeated subvector loads before vector can be split apart.
+define void @split_v2i64_subvector_broadcast(ptr readonly align 8 captures(none) dereferenceable(64) %arg) {
+; SSE-LABEL: split_v2i64_subvector_broadcast:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups 8(%rdi), %xmm0
+; SSE-NEXT:    movups 40(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    movups %xmm0, (%rax)
+; SSE-NEXT:    movups %xmm2, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: split_v2i64_subvector_broadcast:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; AVX1-NEXT:    vmovupd %ymm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: split_v2i64_subvector_broadcast:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovups 40(%rdi), %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,1,1,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vmovups %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: split_v2i64_subvector_broadcast:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; XOP-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; XOP-NEXT:    vmovupd %ymm0, (%rax)
+; XOP-NEXT:    vzeroupper
+; XOP-NEXT:    retq
+  %gep = getelementptr inbounds nuw i8, ptr %arg, i64 8
+  %load = load <6 x i64>, ptr %gep, align 8
+  %shuffle = shufflevector <6 x i64> %load, <6 x i64> poison, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  store <4 x i64> %shuffle, ptr poison, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 25e489eef9d1..61bfee133d84 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -10105,14 +10105,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm15
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm3
-; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm4
+; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm4, %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
 ; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
@@ -10122,105 +10122,106 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm6
-; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
-; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm5
+; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm6
+; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm6, %ymm1
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm5
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
 ; AVX512BW-FCP-NEXT:    kmovq %r10, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm6
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm6
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm7
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm8
-; AVX512BW-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm10
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm10, %ymm3
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm10
+; AVX512BW-FCP-NEXT:    vpor %ymm6, %ymm10, %ymm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT:    vpor %ymm11, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm10, %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rax), %ymm10
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512BW-FCP-NEXT:    vpermw %ymm10, %ymm4, %ymm11
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10]
+; AVX512BW-FCP-NEXT:    vpermw %ymm10, %ymm11, %ymm11
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
 ; AVX512BW-FCP-NEXT:    movabsq $145249953336295682, %rax # imm = 0x204081020408102
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm3 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm9 {%k2}
 ; AVX512BW-FCP-NEXT:    movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm5 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm3
-; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm18
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm18[8],xmm3[8],xmm18[9],xmm3[9],xmm18[10],xmm3[10],xmm18[11],xmm3[11],xmm18[12],xmm3[12],xmm18[13],xmm3[13],xmm18[14],xmm3[14],xmm18[15],xmm3[15]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
+; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm8, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm16
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm9, %zmm9
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm8, %zmm8
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm2
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm2
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm14
-; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm14, %ymm2
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm13
+; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm13, %ymm2
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm14, %zmm2
+; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm13, %zmm2
 ; AVX512BW-FCP-NEXT:    movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
 ; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
 ; AVX512BW-FCP-NEXT:    movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm8 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm18, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm23, %xmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm3, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm18, %xmm1
 ; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm18[0],xmm3[1],xmm18[1],xmm3[2],xmm18[2],xmm3[3],xmm18[3],xmm3[4],xmm18[4],xmm3[5],xmm18[5],xmm3[6],xmm18[6],xmm3[7],xmm18[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm23[0],xmm18[1],xmm23[1],xmm18[2],xmm23[2],xmm18[3],xmm23[3],xmm18[4],xmm23[4],xmm18[5],xmm23[5],xmm18[6],xmm23[6],xmm18[7],xmm23[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,0,1],zmm0[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm13, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm12, %xmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm27 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm12, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm11, %xmm1
 ; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm23 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
@@ -10228,48 +10229,47 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm18 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm24 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm14, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm13, %xmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm15, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm14, %xmm1
 ; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm29 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm28 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm28, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm28
 ; AVX512BW-FCP-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm18 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm28[0],xmm8[1],xmm28[1],xmm8[2],xmm28[2],xmm8[3],xmm28[3],xmm8[4],xmm28[4],xmm8[5],xmm28[5],xmm8[6],xmm28[6],xmm8[7],xmm28[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm16[0],xmm7[1],xmm16[1],xmm7[2],xmm16[2],xmm7[3],xmm16[3],xmm7[4],xmm16[4],xmm7[5],xmm16[5],xmm7[6],xmm16[6],xmm7[7],xmm16[7]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[28,29,30],zero,ymm10[28],zero,ymm10[30,31,30,31],zero,ymm10[29],zero,ymm10[31,28,29]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[28,29,30],zero,ymm9[28],zero,ymm9[30,31,30,31],zero,ymm9[29],zero,ymm9[31,28,29]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero,zero
 ; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm31
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm31[0],xmm3[1],xmm31[1],xmm3[2],xmm31[2],xmm3[3],xmm31[3],xmm3[4],xmm31[4],xmm3[5],xmm31[5],xmm3[6],xmm31[6],xmm3[7],xmm31[7]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm23 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm30 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %ymm23, %ymm30, %ymm23
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm23 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm29 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %ymm23, %ymm29, %ymm23
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm23[2,3,2,3],zmm1[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm23 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm30
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm31
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm30[0],xmm31[1],xmm30[1],xmm31[2],xmm30[2],xmm31[3],xmm30[3],xmm31[4],xmm30[4],xmm31[5],xmm30[5],xmm31[6],xmm30[6],xmm31[7],xmm30[7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm29 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[27],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29]
-; AVX512BW-FCP-NEXT:    vporq %ymm1, %ymm29, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm29
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm30
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm28, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm28 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29]
+; AVX512BW-FCP-NEXT:    vporq %ymm1, %ymm28, %ymm1
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
@@ -10278,82 +10278,82 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm23 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm1
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[4,5,6,7],zmm1[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm28 = zmm0[4,5,6,7],zmm1[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm29[23],zero,zmm29[21,22,23,26],zero,zmm29[24],zero,zmm29[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm29[62],zero,zmm29[60],zero,zero,zero,zero,zmm29[63],zero,zmm29[61],zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm28[23],zero,zmm28[21,22,23,26],zero,zmm28[24],zero,zmm28[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm28[62],zero,zmm28[60],zero,zero,zero,zero,zmm28[63],zero,zmm28[61],zero
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm0[25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zero,zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero,zmm0[61],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %zmm1, %zmm0, %zmm29
+; AVX512BW-FCP-NEXT:    vporq %zmm1, %zmm0, %zmm28
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm2
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm2[4,5,6,7],zmm1[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm6[23],zero,zero,zero,zero,zmm6[26],zero,zmm6[24],zero,zero,zero,zero,zmm6[27],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm6[60],zero,zmm6[62,63,62,63],zero,zmm6[61],zero,zmm6[63,60,61]
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm1[4,5,6,7],zmm2[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm7[23],zero,zero,zero,zero,zmm7[26],zero,zmm7[24],zero,zero,zero,zero,zmm7[27],zero,zmm7[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm7[62],zero,zmm7[60],zero,zero,zero,zero,zmm7[63],zero,zmm7[61],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vporq %zmm6, %zmm7, %zmm6
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm29[2,3,2,3,6,7,6,7]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm29 = zmm6[2,3,2,3,6,7,6,7]
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm2[4,5,6,7],zmm1[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm5[23],zero,zero,zero,zero,zmm5[26],zero,zmm5[24],zero,zero,zero,zero,zmm5[27],zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm5[60],zero,zmm5[62,63,62,63],zero,zmm5[61],zero,zmm5[63,60,61]
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm1[4,5,6,7],zmm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[23],zero,zero,zero,zero,zmm6[26],zero,zmm6[24],zero,zero,zero,zero,zmm6[27],zero,zmm6[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm6[62],zero,zmm6[60],zero,zero,zero,zero,zmm6[63],zero,zmm6[61],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vporq %zmm5, %zmm6, %zmm5
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm28[2,3,2,3,6,7,6,7]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm28 = zmm5[2,3,2,3,6,7,6,7]
 ; AVX512BW-FCP-NEXT:    movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm29 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm7
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm6[4,5,6,7],zmm7[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm7[4,5,6,7],zmm6[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm28 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm6
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[4,5,6,7],zmm6[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7],zmm5[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm6[25],zero,zmm6[23],zero,zero,zero,zero,zmm6[26],zero,zmm6[24],zero,zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[59],zero,zero,zero,zero,zmm6[62],zero,zmm6[60],zero,zero,zero,zero,zmm6[63],zero,zmm6[61]
-; AVX512BW-FCP-NEXT:    vporq %zmm0, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[25],zero,zmm5[23],zero,zero,zero,zero,zmm5[26],zero,zmm5[24],zero,zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[59],zero,zero,zero,zero,zmm5[62],zero,zmm5[60],zero,zero,zero,zero,zmm5[63],zero,zmm5[61]
+; AVX512BW-FCP-NEXT:    vporq %zmm0, %zmm5, %zmm0
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
 ; AVX512BW-FCP-NEXT:    movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm29 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm28 {%k1}
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm0, %zmm0
 ; AVX512BW-FCP-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm29 {%k2}
-; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm4, %xmm0
-; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm3, %xmm6
-; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm6, %xmm0
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm28 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm31, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm3, %xmm5
+; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm31[8],xmm3[8],xmm31[9],xmm3[9],xmm31[10],xmm3[10],xmm31[11],xmm3[11],xmm31[12],xmm3[12],xmm31[13],xmm3[13],xmm31[14],xmm3[14],xmm31[15],xmm3[15]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm3, %xmm3
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm3[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm28, %xmm3
-; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm8, %xmm4
-; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm28[8],xmm8[8],xmm28[9],xmm8[9],xmm28[10],xmm8[10],xmm28[11],xmm8[11],xmm28[12],xmm8[12],xmm28[13],xmm8[13],xmm28[14],xmm8[14],xmm28[15],xmm8[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm4[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm16, %xmm3
+; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm7, %xmm5
+; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm16[8],xmm7[8],xmm16[9],xmm7[9],xmm16[10],xmm7[10],xmm16[11],xmm7[11],xmm16[12],xmm7[12],xmm16[13],xmm7[13],xmm16[14],xmm7[14],xmm16[15],xmm7[15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm5[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm30, %xmm0
-; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm31, %xmm4
-; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm4, %xmm0
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm31[8],xmm30[8],xmm31[9],xmm30[9],xmm31[10],xmm30[10],xmm31[11],xmm30[11],xmm31[12],xmm30[12],xmm31[13],xmm30[13],xmm31[14],xmm30[14],xmm31[15],xmm30[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm4[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm29, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm30, %xmm5
+; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm5[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22]
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm5, %zmm5
 ; AVX512BW-FCP-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k2}
 ; AVX512BW-FCP-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k2}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm12[18],zero,zmm12[18,19,20,21],zero,zmm12[19],zero,zmm12[25,26,27,22],zero,zmm12[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm12[56,57],zero,zmm12[55],zero,zmm12[53,54,55,58],zero,zmm12[56],zero,zmm12[60,61,58,59]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm13[18],zero,zero,zero,zero,zmm13[21],zero,zmm13[19],zero,zero,zero,zero,zmm13[22],zero,zmm13[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm13[57],zero,zmm13[55],zero,zero,zero,zero,zmm13[58],zero,zmm13[56],zero,zero,zero,zero
-; AVX512BW-FCP-NEXT:    vporq %zmm0, %zmm4, %zmm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm11[18],zero,zmm11[18,19,20,21],zero,zmm11[19],zero,zmm11[25,26,27,22],zero,zmm11[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm11[56,57],zero,zmm11[55],zero,zmm11[53,54,55,58],zero,zmm11[56],zero,zmm11[60,61,58,59]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm12[18],zero,zero,zero,zero,zmm12[21],zero,zmm12[19],zero,zero,zero,zero,zmm12[22],zero,zmm12[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm12[57],zero,zmm12[55],zero,zero,zero,zero,zmm12[58],zero,zmm12[56],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT:    vporq %zmm0, %zmm5, %zmm0
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[21],zero,zmm2[19],zero,zero,zero,zero,zmm2[22],zero,zmm2[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zmm2[58],zero,zmm2[56],zero,zero,zero,zero,zmm2[59],zero,zmm2[57]
 ; AVX512BW-FCP-NEXT:    vporq %zmm1, %zmm2, %zmm1
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm15[20],zero,zmm15[18],zero,zmm15[20,21,20,21],zero,zmm15[19],zero,zmm15[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm15[56,57,56,57],zero,zmm15[55],zero,zmm15[55,56,57,58],zero,zmm15[56],zero,zmm15[62,63]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm14[20],zero,zmm14[18],zero,zero,zero,zero,zmm14[21],zero,zmm14[19],zero,zero,zero,zero,zmm14[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm14[57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm14[20],zero,zmm14[18],zero,zmm14[20,21,20,21],zero,zmm14[19],zero,zmm14[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm14[56,57,56,57],zero,zmm14[55],zero,zmm14[55,56,57,58],zero,zmm14[56],zero,zmm14[62,63]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm13[20],zero,zmm13[18],zero,zero,zero,zero,zmm13[21],zero,zmm13[19],zero,zero,zero,zero,zmm13[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm13[57],zero,zmm13[55],zero,zero,zero,zero,zmm13[58],zero,zmm13[56],zero,zero
 ; AVX512BW-FCP-NEXT:    vporq %zmm1, %zmm2, %zmm1
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm2, %zmm2
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
 ; AVX512BW-FCP-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
@@ -10363,12 +10363,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 256(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, 384(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, 384(%rax)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, 192(%rax)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -10653,14 +10653,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm15
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm4, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
@@ -10670,105 +10670,106 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm6, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm6
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm10, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm6, %ymm10, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm11, %ymm9, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm10, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rax), %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm10, %ymm4, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm10, %ymm11, %ymm11
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $145249953336295682, %rax # imm = 0x204081020408102
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm3 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm9 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm5 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm18[8],xmm3[8],xmm18[9],xmm3[9],xmm18[10],xmm3[10],xmm18[11],xmm3[11],xmm18[12],xmm3[12],xmm18[13],xmm3[13],xmm18[14],xmm3[14],xmm18[15],xmm3[15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm8, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm23
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm9, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm8, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm14, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm13, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm14, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm13, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm9 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm9 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm8 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm18, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm23, %xmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm3, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm18, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm18[0],xmm3[1],xmm18[1],xmm3[2],xmm18[2],xmm3[3],xmm18[3],xmm3[4],xmm18[4],xmm3[5],xmm18[5],xmm3[6],xmm18[6],xmm3[7],xmm18[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm23[0],xmm18[1],xmm23[1],xmm18[2],xmm23[2],xmm18[3],xmm23[3],xmm18[4],xmm23[4],xmm18[5],xmm23[5],xmm18[6],xmm23[6],xmm18[7],xmm23[7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,0,1],zmm0[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm13, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm12, %xmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm27 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm12, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm11, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm23 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
@@ -10776,48 +10777,47 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm18 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm24 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm14, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm13, %xmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm15, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm14, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm29 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm28 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm28, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm28
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm18 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm28[0],xmm8[1],xmm28[1],xmm8[2],xmm28[2],xmm8[3],xmm28[3],xmm8[4],xmm28[4],xmm8[5],xmm28[5],xmm8[6],xmm28[6],xmm8[7],xmm28[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm16[0],xmm7[1],xmm16[1],xmm7[2],xmm16[2],xmm7[3],xmm16[3],xmm7[4],xmm16[4],xmm7[5],xmm16[5],xmm7[6],xmm16[6],xmm7[7],xmm16[7]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[28,29,30],zero,ymm10[28],zero,ymm10[30,31,30,31],zero,ymm10[29],zero,ymm10[31,28,29]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[28,29,30],zero,ymm9[28],zero,ymm9[30,31,30,31],zero,ymm9[29],zero,ymm9[31,28,29]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm31
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm31[0],xmm3[1],xmm31[1],xmm3[2],xmm31[2],xmm3[3],xmm31[3],xmm3[4],xmm31[4],xmm3[5],xmm31[5],xmm3[6],xmm31[6],xmm3[7],xmm31[7]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm23 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm30 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm23, %ymm30, %ymm23
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm23 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm29 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm23, %ymm29, %ymm23
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm23[2,3,2,3],zmm1[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm23 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm30
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm31
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm30[0],xmm31[1],xmm30[1],xmm31[2],xmm30[2],xmm31[3],xmm30[3],xmm31[4],xmm30[4],xmm31[5],xmm30[5],xmm31[6],xmm30[6],xmm31[7],xmm30[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm29 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[27],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29]
-; AVX512DQ-BW-FCP-NEXT:    vporq %ymm1, %ymm29, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm29
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm30
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm28, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm28 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29]
+; AVX512DQ-BW-FCP-NEXT:    vporq %ymm1, %ymm28, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
@@ -10826,82 +10826,82 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm23 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm0[4,5,6,7],zmm1[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm28 = zmm0[4,5,6,7],zmm1[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm29[23],zero,zmm29[21,22,23,26],zero,zmm29[24],zero,zmm29[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm29[62],zero,zmm29[60],zero,zero,zero,zero,zmm29[63],zero,zmm29[61],zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm28[23],zero,zmm28[21,22,23,26],zero,zmm28[24],zero,zmm28[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm28[62],zero,zmm28[60],zero,zero,zero,zero,zmm28[63],zero,zmm28[61],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm0[25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zero,zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero,zmm0[61],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %zmm1, %zmm0, %zmm29
+; AVX512DQ-BW-FCP-NEXT:    vporq %zmm1, %zmm0, %zmm28
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm2[4,5,6,7],zmm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm6[23],zero,zero,zero,zero,zmm6[26],zero,zmm6[24],zero,zero,zero,zero,zmm6[27],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm6[60],zero,zmm6[62,63,62,63],zero,zmm6[61],zero,zmm6[63,60,61]
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm7 = zmm1[4,5,6,7],zmm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm7[23],zero,zero,zero,zero,zmm7[26],zero,zmm7[24],zero,zero,zero,zero,zmm7[27],zero,zmm7[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm7[62],zero,zmm7[60],zero,zero,zero,zero,zmm7[63],zero,zmm7[61],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %zmm6, %zmm7, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm29[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm29 = zmm6[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm2[4,5,6,7],zmm1[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm5[23],zero,zero,zero,zero,zmm5[26],zero,zmm5[24],zero,zero,zero,zero,zmm5[27],zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm5[60],zero,zmm5[62,63,62,63],zero,zmm5[61],zero,zmm5[63,60,61]
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm1[4,5,6,7],zmm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[23],zero,zero,zero,zero,zmm6[26],zero,zmm6[24],zero,zero,zero,zero,zmm6[27],zero,zmm6[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm6[62],zero,zmm6[60],zero,zero,zero,zero,zmm6[63],zero,zmm6[61],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %zmm5, %zmm6, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm28[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm28 = zmm5[2,3,2,3,6,7,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm29 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm6[4,5,6,7],zmm7[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm7[4,5,6,7],zmm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm28 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm5[4,5,6,7],zmm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7],zmm5[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm6[25],zero,zmm6[23],zero,zero,zero,zero,zmm6[26],zero,zmm6[24],zero,zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[59],zero,zero,zero,zero,zmm6[62],zero,zmm6[60],zero,zero,zero,zero,zmm6[63],zero,zmm6[61]
-; AVX512DQ-BW-FCP-NEXT:    vporq %zmm0, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[25],zero,zmm5[23],zero,zero,zero,zero,zmm5[26],zero,zmm5[24],zero,zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[59],zero,zero,zero,zero,zmm5[62],zero,zmm5[60],zero,zero,zero,zero,zmm5[63],zero,zmm5[61]
+; AVX512DQ-BW-FCP-NEXT:    vporq %zmm0, %zmm5, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm29 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm28 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm0, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm29 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm4, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm3, %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm6, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm28 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm26, %xmm31, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm3, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm31[8],xmm3[8],xmm31[9],xmm3[9],xmm31[10],xmm3[10],xmm31[11],xmm3[11],xmm31[12],xmm3[12],xmm31[13],xmm3[13],xmm31[14],xmm3[14],xmm31[15],xmm3[15]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm3, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm3[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm28, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm8, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm28[8],xmm8[8],xmm28[9],xmm8[9],xmm28[10],xmm8[10],xmm28[11],xmm8[11],xmm28[12],xmm8[12],xmm28[13],xmm8[13],xmm28[14],xmm8[14],xmm28[15],xmm8[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm4[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm16, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm7, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm16[8],xmm7[8],xmm16[9],xmm7[9],xmm16[10],xmm7[10],xmm16[11],xmm7[11],xmm16[12],xmm7[12],xmm16[13],xmm7[13],xmm16[14],xmm7[14],xmm16[15],xmm7[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm5[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm30, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm31, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm4, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm31[8],xmm30[8],xmm31[9],xmm30[9],xmm31[10],xmm30[10],xmm31[11],xmm30[11],xmm31[12],xmm30[12],xmm31[13],xmm30[13],xmm31[14],xmm30[14],xmm31[15],xmm30[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm4[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm24, %xmm29, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm30, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm5[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm5, %zmm5
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm0 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm12[18],zero,zmm12[18,19,20,21],zero,zmm12[19],zero,zmm12[25,26,27,22],zero,zmm12[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm12[56,57],zero,zmm12[55],zero,zmm12[53,54,55,58],zero,zmm12[56],zero,zmm12[60,61,58,59]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm13[18],zero,zero,zero,zero,zmm13[21],zero,zmm13[19],zero,zero,zero,zero,zmm13[22],zero,zmm13[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm13[57],zero,zmm13[55],zero,zero,zero,zero,zmm13[58],zero,zmm13[56],zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %zmm0, %zmm4, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm11[18],zero,zmm11[18,19,20,21],zero,zmm11[19],zero,zmm11[25,26,27,22],zero,zmm11[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm11[56,57],zero,zmm11[55],zero,zmm11[53,54,55,58],zero,zmm11[56],zero,zmm11[60,61,58,59]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm5 = zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm12[18],zero,zero,zero,zero,zmm12[21],zero,zmm12[19],zero,zero,zero,zero,zmm12[22],zero,zmm12[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm12[57],zero,zmm12[55],zero,zero,zero,zero,zmm12[58],zero,zmm12[56],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %zmm0, %zmm5, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm2, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[21],zero,zmm2[19],zero,zero,zero,zero,zmm2[22],zero,zmm2[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zmm2[58],zero,zmm2[56],zero,zero,zero,zero,zmm2[59],zero,zmm2[57]
 ; AVX512DQ-BW-FCP-NEXT:    vporq %zmm1, %zmm2, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm15[20],zero,zmm15[18],zero,zmm15[20,21,20,21],zero,zmm15[19],zero,zmm15[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm15[56,57,56,57],zero,zmm15[55],zero,zmm15[55,56,57,58],zero,zmm15[56],zero,zmm15[62,63]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm14[20],zero,zmm14[18],zero,zero,zero,zero,zmm14[21],zero,zmm14[19],zero,zero,zero,zero,zmm14[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm14[57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm14[20],zero,zmm14[18],zero,zmm14[20,21,20,21],zero,zmm14[19],zero,zmm14[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm14[56,57,56,57],zero,zmm14[55],zero,zmm14[55,56,57,58],zero,zmm14[56],zero,zmm14[62,63]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm13[20],zero,zmm13[18],zero,zero,zero,zero,zmm13[21],zero,zmm13[19],zero,zero,zero,zero,zmm13[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm13[57],zero,zmm13[55],zero,zero,zero,zero,zmm13[58],zero,zmm13[56],zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vporq %zmm1, %zmm2, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm2, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
@@ -10911,12 +10911,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 256(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, 384(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, 384(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, 192(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64

From ad36800d65250618b1c1446ae0f97d250bf09df8 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Sat, 21 Jun 2025 07:43:57 +0900
Subject: [PATCH 1127/1322] [bazel][lld] Follow up 491b82a5ec1a
 (llvmorg-21-init-16192-g491b82a5ec1a)

---
 utils/bazel/llvm-project-overlay/lld/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
index 450157758d75..6533d783c2b3 100644
--- a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
@@ -74,6 +74,7 @@ cc_library(
         "ELF/*.cpp",
         "ELF/*.h",
         "ELF/Arch/*.cpp",
+        "ELF/Arch/*.h",
     ]),
     includes = ["ELF"],
     textual_hdrs = [

From be7e4113c84317feda055cac68a855429e46e381 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Fri, 20 Jun 2025 18:55:41 -0400
Subject: [PATCH 1128/1322] [NFC] Add comment to describe the intention use of
 newly added `avail-extern-gv-in-addrspace-to-local` (#144911)

---
 llvm/lib/Transforms/IPO/ElimAvailExtern.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
index bc98f994f490..538755e66f9c 100644
--- a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
+++ b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -35,6 +35,10 @@ static cl::opt<bool> ConvertToLocal(
     cl::desc("Convert available_externally into locals, renaming them "
              "to avoid link-time clashes."));
 
+// This option was originally introduced to correctly support the lowering of
+// LDS variables for AMDGPU when ThinLTO is enabled. It can be utilized for
+// other purposes, but make sure it is safe to do so, as privatizing global
+// variables is generally not safe.
 static cl::opt<unsigned> ConvertGlobalVariableInAddrSpace(
     "avail-extern-gv-in-addrspace-to-local", cl::Hidden,
     cl::desc(

From 6b9fe9e0bc7421d89f38ade857c91f0ae153ad9c Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Fri, 20 Jun 2025 16:21:47 -0700
Subject: [PATCH 1129/1322] [HLSL] Emit a version in the dx.rootsignatures
 metadata (#145113)

In #144957 the backend was updated to expect a version in the metadata,
but since the frontend wasn't updated this breaks compilation. This is a
somewhat temporary fix to that until #144813 lands.
---
 clang/lib/CodeGen/CGHLSLRuntime.cpp                  | 11 ++++++++---
 clang/test/CodeGenHLSL/RootSignature.hlsl            | 12 ++++++------
 llvm/lib/Target/DirectX/DXILRootSignature.cpp        |  4 ++--
 .../DirectX/ContainerData/RootSignature-Error.ll     |  2 +-
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 34960c34e109..3103f1798e14 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -72,12 +72,17 @@ void addRootSignature(ArrayRef<llvm::hlsl::rootsig::RootElement> Elements,
 
   llvm::hlsl::rootsig::MetadataBuilder Builder(Ctx, Elements);
   MDNode *RootSignature = Builder.BuildRootSignature();
-  MDNode *FnPairing =
-      MDNode::get(Ctx, {ValueAsMetadata::get(Fn), RootSignature});
+
+  // TODO: We need to wire the root signature version up through the frontend
+  // rather than hardcoding it.
+  ConstantAsMetadata *Version =
+      ConstantAsMetadata::get(ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 2));
+  MDNode *MDVals =
+      MDNode::get(Ctx, {ValueAsMetadata::get(Fn), RootSignature, Version});
 
   StringRef RootSignatureValKey = "dx.rootsignatures";
   auto *RootSignatureValMD = M.getOrInsertNamedMetadata(RootSignatureValKey);
-  RootSignatureValMD->addOperand(FnPairing);
+  RootSignatureValMD->addOperand(MDVals);
 }
 
 } // namespace
diff --git a/clang/test/CodeGenHLSL/RootSignature.hlsl b/clang/test/CodeGenHLSL/RootSignature.hlsl
index ca843ffbb1ce..6618ca741aa9 100644
--- a/clang/test/CodeGenHLSL/RootSignature.hlsl
+++ b/clang/test/CodeGenHLSL/RootSignature.hlsl
@@ -3,14 +3,14 @@
 // CHECK: !dx.rootsignatures = !{![[#EMPTY_ENTRY:]], ![[#DT_ENTRY:]],
 // CHECK-SAME: ![[#RF_ENTRY:]], ![[#RC_ENTRY:]], ![[#RD_ENTRY:]], ![[#SS_ENTRY:]]}
 
-// CHECK: ![[#EMPTY_ENTRY]] = !{ptr @EmptyEntry, ![[#EMPTY:]]}
+// CHECK: ![[#EMPTY_ENTRY]] = !{ptr @EmptyEntry, ![[#EMPTY:]], i32 2}
 // CHECK: ![[#EMPTY]] = !{}
 
 [shader("compute"), RootSignature("")]
 [numthreads(1,1,1)]
 void EmptyEntry() {}
 
-// CHECK: ![[#DT_ENTRY]] = !{ptr @DescriptorTableEntry, ![[#DT_RS:]]}
+// CHECK: ![[#DT_ENTRY]] = !{ptr @DescriptorTableEntry, ![[#DT_RS:]], i32 2}
 // CHECK: ![[#DT_RS]] = !{![[#TABLE:]]}
 // CHECK: ![[#TABLE]] = !{!"DescriptorTable", i32 0, ![[#CBV:]], ![[#SRV:]]}
 // CHECK: ![[#CBV]] = !{!"CBV", i32 1, i32 0, i32 0, i32 -1, i32 4}
@@ -25,7 +25,7 @@ void EmptyEntry() {}
 [numthreads(1,1,1)]
 void DescriptorTableEntry() {}
 
-// CHECK: ![[#RF_ENTRY]] = !{ptr @RootFlagsEntry, ![[#RF_RS:]]}
+// CHECK: ![[#RF_ENTRY]] = !{ptr @RootFlagsEntry, ![[#RF_RS:]], i32 2}
 // CHECK: ![[#RF_RS]] = !{![[#ROOT_FLAGS:]]}
 // CHECK: ![[#ROOT_FLAGS]] = !{!"RootFlags", i32 2114}
 
@@ -38,7 +38,7 @@ void DescriptorTableEntry() {}
 [numthreads(1,1,1)]
 void RootFlagsEntry() {}
 
-// CHECK: ![[#RC_ENTRY]] = !{ptr @RootConstantsEntry, ![[#RC_RS:]]}
+// CHECK: ![[#RC_ENTRY]] = !{ptr @RootConstantsEntry, ![[#RC_RS:]], i32 2}
 // CHECK: ![[#RC_RS]] = !{![[#ROOT_CONSTANTS:]]}
 // CHECK: ![[#ROOT_CONSTANTS]] = !{!"RootConstants", i32 5, i32 1, i32 2, i32 1}
 
@@ -52,7 +52,7 @@ void RootFlagsEntry() {}
 [numthreads(1,1,1)]
 void RootConstantsEntry() {}
 
-// CHECK: ![[#RD_ENTRY]] = !{ptr @RootDescriptorsEntry, ![[#RD_RS:]]}
+// CHECK: ![[#RD_ENTRY]] = !{ptr @RootDescriptorsEntry, ![[#RD_RS:]], i32 2}
 // CHECK: ![[#RD_RS]] = !{![[#ROOT_CBV:]], ![[#ROOT_UAV:]], ![[#ROOT_SRV:]]}
 // CHECK: ![[#ROOT_CBV]] = !{!"RootCBV", i32 0, i32 0, i32 0, i32 4}
 // CHECK: ![[#ROOT_UAV]] = !{!"RootUAV", i32 0, i32 42, i32 3, i32 2}
@@ -66,7 +66,7 @@ void RootConstantsEntry() {}
 [numthreads(1,1,1)]
 void RootDescriptorsEntry() {}
 
-// CHECK: ![[#SS_ENTRY]] = !{ptr @StaticSamplerEntry, ![[#SS_RS:]]}
+// CHECK: ![[#SS_ENTRY]] = !{ptr @StaticSamplerEntry, ![[#SS_RS:]], i32 2}
 // CHECK: ![[#SS_RS]] = !{![[#STATIC_SAMPLER:]]}
 
 // checking filter = 0x4
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index 1f175fd4ecd9..29e78fcce526 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -491,8 +491,8 @@ analyzeModule(Module &M) {
 
   for (const auto &RSDefNode : RootSignatureNode->operands()) {
     if (RSDefNode->getNumOperands() != 3) {
-      reportError(Ctx, "Invalid format for Root Signature Definition. Pairs "
-                       "of function, root signature expected.");
+      reportError(Ctx, "Invalid Root Signature metadata - expected function, "
+                       "signature, and version.");
       continue;
     }
 
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll
index 2a2188b1a13b..039206584211 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error.ll
@@ -2,7 +2,7 @@
 
 target triple = "dxil-unknown-shadermodel6.0-compute"
 
-; CHECK: error: Invalid format for Root Signature Definition. Pairs of function, root signature expected.
+; CHECK: error: Invalid Root Signature metadata - expected function, signature, and version.
 ; CHECK-NOT: Root Signature Definitions
 
 
From 2dfcc4375faa5e3692bd82a022d33bdd6fe55f10 Mon Sep 17 00:00:00 2001
From: Erick Velez <erickvelez7@gmail.com>
Date: Fri, 20 Jun 2025 16:38:20 -0700
Subject: [PATCH 1130/1322] [clang-doc] Precommit concept tests (#144160)

---
 .../test/clang-doc/json/class-requires.cpp    | 34 ++++++++
 .../test/clang-doc/json/concept.cpp           | 37 +++++++++
 .../test/clang-doc/json/function-requires.cpp | 79 +++++++++++++++++++
 3 files changed, 150 insertions(+)
 create mode 100644 clang-tools-extra/test/clang-doc/json/class-requires.cpp
 create mode 100644 clang-tools-extra/test/clang-doc/json/concept.cpp
 create mode 100644 clang-tools-extra/test/clang-doc/json/function-requires.cpp

diff --git a/clang-tools-extra/test/clang-doc/json/class-requires.cpp b/clang-tools-extra/test/clang-doc/json/class-requires.cpp
new file mode 100644
index 000000000000..af108a402b40
--- /dev/null
+++ b/clang-tools-extra/test/clang-doc/json/class-requires.cpp
@@ -0,0 +1,34 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s
+// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.json
+
+template<typename T>
+concept Addable = requires(T a, T b) {
+  { a + b };
+};
+
+template<typename T>
+requires Addable<T>
+struct MyClass;
+
+// CHECK:       "Name": "MyClass",
+// CHECK-NEXT:  "Namespace": [
+// CHECK-NEXT:    "GlobalNamespace"
+// CHECK-NEXT:  ],
+// CHECK-NEXT:  "Path": "GlobalNamespace",
+// CHECK-NEXT:  "TagType": "struct",
+// CHECK-NEXT:  "Template": {
+// CHECK-NOT:     "Constraints": [
+// CHECK-NOT:       {
+// CHECK-NOT:         "Expression": "Addable<T>",
+// CHECK-NOT:         "Name": "Addable",
+// CHECK-NOT:         "Path": "",
+// CHECK-NOT:         "QualName": "Addable",
+// CHECK-NOT:         "USR": "{{[0-9A-F]*}}"
+// CHECK-NOT:       }
+// CHECK-NOT:     ],
+// CHECK-NEXT:    "Parameters": [
+// CHECK-NEXT:      "typename T"
+// CHECK-NEXT:    ]
+// CHECK-NEXT:  },
+// CHECK-NEXT:  "USR": "{{[0-9A-F]*}}"
diff --git a/clang-tools-extra/test/clang-doc/json/concept.cpp b/clang-tools-extra/test/clang-doc/json/concept.cpp
new file mode 100644
index 000000000000..624f71c6bf9b
--- /dev/null
+++ b/clang-tools-extra/test/clang-doc/json/concept.cpp
@@ -0,0 +1,37 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s
+
+// Requires that T suports post and pre-incrementing.
+template<typename T>
+concept Incrementable = requires(T x) {
+  ++x;
+  x++;
+};
+
+// CHECK:      {
+// CHECK-NOT:    "Concepts": [
+// CHECK-NOT:      {
+// CHECK-NOT:        "ConstraintExpression": "requires (T x) { ++x; x++; }",
+// CHECK-NOT:        "Description": [
+// CHECK-NOT:          {
+// CHECK-NOT:            "FullComment": {
+// CHECK-NOT:              "Children": [
+// CHECK-NOT:                {
+// CHECK-NOT:                  "ParagraphComment": {
+// CHECK-NOT:                    "Children": [
+// CHECK-NOT:                      {
+// CHECK-NOT:                        "TextComment": " Requires that T suports post and pre-incrementing."
+// CHECK-NOT:        ],
+// CHECK-NOT:        "IsType": true,
+// CHECK-NOT:        "Name": "Incrementable",
+// CHECK-NOT:        "Template": {
+// CHECK-NOT:          "Parameters": [
+// CHECK-NOT:            "typename T"
+// CHECK-NOT:          ]
+// CHECK-NOT:        },
+// CHECK-NOT:        "USR": "{{[0-9A-F]*}}"
+// CHECK-NOT:      }
+// CHECK-NOT:    ],
+// CHECK:        "Name": "",
+// CHECK:        "USR": "0000000000000000000000000000000000000000"
+// CHECK:      }
diff --git a/clang-tools-extra/test/clang-doc/json/function-requires.cpp b/clang-tools-extra/test/clang-doc/json/function-requires.cpp
new file mode 100644
index 000000000000..aa62464d07b4
--- /dev/null
+++ b/clang-tools-extra/test/clang-doc/json/function-requires.cpp
@@ -0,0 +1,79 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s
+// RUN: FileCheck %s < %t/GlobalNamespace/index.json
+
+template<typename T>
+concept Incrementable = requires(T x) {
+  ++x;
+  x++;
+};
+
+template<typename T> void increment(T t) requires Incrementable<T>;
+
+template<Incrementable T> Incrementable auto incrementTwo(T t);
+
+// CHECK:       "Functions": [
+// CHECK-NEXT:    {
+// CHECK-NEXT:      "IsStatic": false,
+// CHECK-NEXT:      "Name": "increment",
+// CHECK-NEXT:      "Params": [
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Name": "t",
+// CHECK-NEXT:          "Type": "T"
+// CHECK-NEXT:        }
+// CHECK-NEXT:      ],
+// CHECK-NEXT:      "ReturnType": {
+// CHECK-NEXT:        "IsBuiltIn": false,
+// CHECK-NEXT:        "IsTemplate": false,
+// CHECK-NEXT:        "Name": "void",
+// CHECK-NEXT:        "QualName": "void",
+// CHECK-NEXT:        "USR": "0000000000000000000000000000000000000000"
+// CHECK-NEXT:      },
+// CHECK-NEXT:      "Template": {
+// CHECK-NOT:         "Constraints": [
+// CHECK-NOT:           {
+// CHECK-NOT:             "Expression": "Incrementable<T>",
+// CHECK-NOT:             "Name": "Incrementable",
+// CHECK-NOT:             "Path": "",
+// CHECK-NOT:             "QualName": "Incrementable",
+// CHECK-NOT:             "USR": "{{[0-9A-F]*}}"
+// CHECK-NOT:           }
+// CHECK-NOT:         ],
+// CHECK-NEXT:        "Parameters": [
+// CHECK-NEXT:          "typename T"
+// CHECK-NEXT:        ]
+// CHECK-NEXT:      },
+// CHECK-NEXT:      "USR": "{{[0-9A-F]*}}" 
+// CHECK-NEXT:    },
+// CHECK-NEXT:    {
+// CHECK-NEXT:      "IsStatic": false,
+// CHECK-NEXT:      "Name": "incrementTwo",
+// CHECK-NEXT:      "Params": [
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Name": "t",
+// CHECK-NEXT:          "Type": "T"
+// CHECK-NEXT:        }
+// CHECK-NEXT:      ],
+// CHECK-NEXT:      "ReturnType": {
+// CHECK-NEXT:        "IsBuiltIn": false,
+// CHECK-NEXT:        "IsTemplate": false,
+// CHECK-NEXT:        "Name": "Incrementable auto",
+// CHECK-NEXT:        "QualName": "Incrementable auto",
+// CHECK-NEXT:        "USR": "0000000000000000000000000000000000000000"
+// CHECK-NEXT:      },
+// CHECK-NEXT:      "Template": {
+// CHECK-NOT:         "Constraints": [
+// CHECK-NOT:           {
+// CHECK-NOT:             "Expression": "Incrementable<T>",
+// CHECK-NOT:             "Name": "Incrementable",
+// CHECK-NOT:             "Path": "",
+// CHECK-NOT:             "QualName": "Incrementable",
+// CHECK-NOT:             "USR": "{{[0-9A-F]*}}"
+// CHECK-NOT:           }
+// CHECK-NOT:         ],
+// CHECK-NEXT:        "Parameters": [
+// CHECK-NEXT:          "Incrementable T"
+// CHECK-NEXT:        ]
+// CHECK-NEXT:      },
+// CHECK-NEXT:      "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:    }

From 8050a6e0732c6614ce3e5296fdeb5a3c36bde26d Mon Sep 17 00:00:00 2001
From: Erick Velez <erickvelez7@gmail.com>
Date: Fri, 20 Jun 2025 17:39:31 -0700
Subject: [PATCH 1131/1322] [clang-doc] add support for concepts (#144430)

Add support for documenting concepts. This handles concepts and constraints on function and class templates.

Atomic constraints are not considered yet. We don't order constraints based on their conjunctive or disjunctive properties.
---
 clang-tools-extra/clang-doc/BitcodeReader.cpp |  73 +++++++++++
 clang-tools-extra/clang-doc/BitcodeWriter.cpp |  44 ++++++-
 clang-tools-extra/clang-doc/BitcodeWriter.h   |  12 +-
 clang-tools-extra/clang-doc/HTMLGenerator.cpp |   4 +
 .../clang-doc/HTMLMustacheGenerator.cpp       |   2 +
 clang-tools-extra/clang-doc/JSONGenerator.cpp |  50 ++++++++
 clang-tools-extra/clang-doc/MDGenerator.cpp   |   5 +
 clang-tools-extra/clang-doc/Mapper.cpp        |   4 +
 clang-tools-extra/clang-doc/Mapper.h          |   1 +
 .../clang-doc/Representation.cpp              |  20 +++
 clang-tools-extra/clang-doc/Representation.h  |  26 +++-
 clang-tools-extra/clang-doc/Serialize.cpp     |  90 +++++++++++++
 clang-tools-extra/clang-doc/Serialize.h       |   4 +
 clang-tools-extra/clang-doc/YAMLGenerator.cpp |   2 +
 .../test/clang-doc/json/class-requires.cpp    |  18 +--
 .../clang-doc/json/compound-constraints.cpp   | 121 ++++++++++++++++++
 .../test/clang-doc/json/concept.cpp           |  49 +++----
 .../test/clang-doc/json/function-requires.cpp |  36 +++---
 .../unittests/clang-doc/BitcodeTest.cpp       |   2 +
 19 files changed, 507 insertions(+), 56 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-doc/json/compound-constraints.cpp

diff --git a/clang-tools-extra/clang-doc/BitcodeReader.cpp b/clang-tools-extra/clang-doc/BitcodeReader.cpp
index 35058abab066..66852931226b 100644
--- a/clang-tools-extra/clang-doc/BitcodeReader.cpp
+++ b/clang-tools-extra/clang-doc/BitcodeReader.cpp
@@ -92,6 +92,7 @@ static llvm::Error decodeRecord(const Record &R, InfoType &Field,
   case InfoType::IT_default:
   case InfoType::IT_enum:
   case InfoType::IT_typedef:
+  case InfoType::IT_concept:
     Field = IT;
     return llvm::Error::success();
   }
@@ -108,6 +109,7 @@ static llvm::Error decodeRecord(const Record &R, FieldId &Field,
   case FieldId::F_type:
   case FieldId::F_child_namespace:
   case FieldId::F_child_record:
+  case FieldId::F_concept:
   case FieldId::F_default:
     Field = F;
     return llvm::Error::success();
@@ -391,6 +393,29 @@ static llvm::Error parseRecord(const Record &R, unsigned ID,
                                  "invalid field for TemplateParamInfo");
 }
 
+static llvm::Error parseRecord(const Record &R, unsigned ID,
+                               llvm::StringRef Blob, ConceptInfo *I) {
+  switch (ID) {
+  case CONCEPT_USR:
+    return decodeRecord(R, I->USR, Blob);
+  case CONCEPT_NAME:
+    return decodeRecord(R, I->Name, Blob);
+  case CONCEPT_IS_TYPE:
+    return decodeRecord(R, I->IsType, Blob);
+  case CONCEPT_CONSTRAINT_EXPRESSION:
+    return decodeRecord(R, I->ConstraintExpression, Blob);
+  }
+  llvm_unreachable("invalid field for ConceptInfo");
+}
+
+static llvm::Error parseRecord(const Record &R, unsigned ID,
+                               llvm::StringRef Blob, ConstraintInfo *I) {
+  if (ID == CONSTRAINT_EXPRESSION)
+    return decodeRecord(R, I->ConstraintExpr, Blob);
+  return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                 "invalid field for ConstraintInfo");
+}
+
 template <typename T> static llvm::Expected<CommentInfo *> getCommentInfo(T I) {
   return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                  "invalid type cannot contain CommentInfo");
@@ -429,6 +454,10 @@ template <> llvm::Expected<CommentInfo *> getCommentInfo(CommentInfo *I) {
   return I->Children.back().get();
 }
 
+template <> llvm::Expected<CommentInfo *> getCommentInfo(ConceptInfo *I) {
+  return &I->Description.emplace_back();
+}
+
 // When readSubBlock encounters a TypeInfo sub-block, it calls addTypeInfo on
 // the parent block to set it. The template specializations define what to do
 // for each supported parent block.
@@ -584,6 +613,17 @@ template <> llvm::Error addReference(RecordInfo *I, Reference &&R, FieldId F) {
   }
 }
 
+template <>
+llvm::Error addReference(ConstraintInfo *I, Reference &&R, FieldId F) {
+  if (F == FieldId::F_concept) {
+    I->ConceptRef = std::move(R);
+    return llvm::Error::success();
+  }
+  return llvm::createStringError(
+      llvm::inconvertibleErrorCode(),
+      "ConstraintInfo cannot contain this Reference");
+}
+
 template <typename T, typename ChildInfoType>
 static void addChild(T I, ChildInfoType &&R) {
   llvm::errs() << "invalid child type for info";
@@ -600,6 +640,9 @@ template <> void addChild(NamespaceInfo *I, EnumInfo &&R) {
 template <> void addChild(NamespaceInfo *I, TypedefInfo &&R) {
   I->Children.Typedefs.emplace_back(std::move(R));
 }
+template <> void addChild(NamespaceInfo *I, ConceptInfo &&R) {
+  I->Children.Concepts.emplace_back(std::move(R));
+}
 
 // Record children:
 template <> void addChild(RecordInfo *I, FunctionInfo &&R) {
@@ -649,6 +692,9 @@ template <> void addTemplate(RecordInfo *I, TemplateInfo &&P) {
 template <> void addTemplate(FunctionInfo *I, TemplateInfo &&P) {
   I->Template.emplace(std::move(P));
 }
+template <> void addTemplate(ConceptInfo *I, TemplateInfo &&P) {
+  I->Template = std::move(P);
+}
 
 // Template specializations go only into template records.
 template <typename T>
@@ -662,6 +708,14 @@ void addTemplateSpecialization(TemplateInfo *I,
   I->Specialization.emplace(std::move(TSI));
 }
 
+template <typename T> static void addConstraint(T I, ConstraintInfo &&C) {
+  llvm::errs() << "invalid container for constraint info";
+  exit(1);
+}
+template <> void addConstraint(TemplateInfo *I, ConstraintInfo &&C) {
+  I->Constraints.emplace_back(std::move(C));
+}
+
 // Read records from bitcode into a given info.
 template <typename T>
 llvm::Error ClangDocBitcodeReader::readRecord(unsigned ID, T I) {
@@ -716,6 +770,8 @@ llvm::Error ClangDocBitcodeReader::readBlock(unsigned ID, T I) {
   }
 }
 
+// TODO: Create a helper that can receive a function to reduce repetition for
+// most blocks.
 template <typename T>
 llvm::Error ClangDocBitcodeReader::readSubBlock(unsigned ID, T I) {
   llvm::TimeTraceScope("Reducing infos", "readSubBlock");
@@ -817,6 +873,20 @@ llvm::Error ClangDocBitcodeReader::readSubBlock(unsigned ID, T I) {
     addChild(I, std::move(TI));
     return llvm::Error::success();
   }
+  case BI_CONSTRAINT_BLOCK_ID: {
+    ConstraintInfo CI;
+    if (auto Err = readBlock(ID, &CI))
+      return Err;
+    addConstraint(I, std::move(CI));
+    return llvm::Error::success();
+  }
+  case BI_CONCEPT_BLOCK_ID: {
+    ConceptInfo CI;
+    if (auto Err = readBlock(ID, &CI))
+      return Err;
+    addChild(I, std::move(CI));
+    return llvm::Error::success();
+  }
   default:
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "invalid subblock type");
@@ -922,6 +992,8 @@ ClangDocBitcodeReader::readBlockToInfo(unsigned ID) {
     return createInfo<EnumInfo>(ID);
   case BI_TYPEDEF_BLOCK_ID:
     return createInfo<TypedefInfo>(ID);
+  case BI_CONCEPT_BLOCK_ID:
+    return createInfo<ConceptInfo>(ID);
   case BI_FUNCTION_BLOCK_ID:
     return createInfo<FunctionInfo>(ID);
   default:
@@ -962,6 +1034,7 @@ ClangDocBitcodeReader::readBitcode() {
     case BI_RECORD_BLOCK_ID:
     case BI_ENUM_BLOCK_ID:
     case BI_TYPEDEF_BLOCK_ID:
+    case BI_CONCEPT_BLOCK_ID:
     case BI_FUNCTION_BLOCK_ID: {
       auto InfoOrErr = readBlockToInfo(ID);
       if (!InfoOrErr)
diff --git a/clang-tools-extra/clang-doc/BitcodeWriter.cpp b/clang-tools-extra/clang-doc/BitcodeWriter.cpp
index f8a6859169b0..b7308c012786 100644
--- a/clang-tools-extra/clang-doc/BitcodeWriter.cpp
+++ b/clang-tools-extra/clang-doc/BitcodeWriter.cpp
@@ -128,7 +128,9 @@ static const llvm::IndexedMap<llvm::StringRef, BlockIdToIndexFunctor>
           {BI_REFERENCE_BLOCK_ID, "ReferenceBlock"},
           {BI_TEMPLATE_BLOCK_ID, "TemplateBlock"},
           {BI_TEMPLATE_SPECIALIZATION_BLOCK_ID, "TemplateSpecializationBlock"},
-          {BI_TEMPLATE_PARAM_BLOCK_ID, "TemplateParamBlock"}};
+          {BI_TEMPLATE_PARAM_BLOCK_ID, "TemplateParamBlock"},
+          {BI_CONSTRAINT_BLOCK_ID, "ConstraintBlock"},
+          {BI_CONCEPT_BLOCK_ID, "ConceptBlock"}};
       assert(Inits.size() == BlockIdCount);
       for (const auto &Init : Inits)
         BlockIdNameMap[Init.first] = Init.second;
@@ -205,7 +207,13 @@ static const llvm::IndexedMap<RecordIdDsc, RecordIdToIndexFunctor>
           {TYPEDEF_USR, {"USR", &genSymbolIdAbbrev}},
           {TYPEDEF_NAME, {"Name", &genStringAbbrev}},
           {TYPEDEF_DEFLOCATION, {"DefLocation", &genLocationAbbrev}},
-          {TYPEDEF_IS_USING, {"IsUsing", &genBoolAbbrev}}};
+          {TYPEDEF_IS_USING, {"IsUsing", &genBoolAbbrev}},
+          {CONCEPT_USR, {"USR", &genSymbolIdAbbrev}},
+          {CONCEPT_NAME, {"Name", &genStringAbbrev}},
+          {CONCEPT_IS_TYPE, {"IsType", &genBoolAbbrev}},
+          {CONCEPT_CONSTRAINT_EXPRESSION,
+           {"ConstraintExpression", &genStringAbbrev}},
+          {CONSTRAINT_EXPRESSION, {"Expression", &genStringAbbrev}}};
       assert(Inits.size() == RecordIdCount);
       for (const auto &Init : Inits) {
         RecordIdNameMap[Init.first] = Init.second;
@@ -263,7 +271,13 @@ static const std::vector<std::pair<BlockId, std::vector<RecordId>>>
         // Template Blocks.
         {BI_TEMPLATE_BLOCK_ID, {}},
         {BI_TEMPLATE_PARAM_BLOCK_ID, {TEMPLATE_PARAM_CONTENTS}},
-        {BI_TEMPLATE_SPECIALIZATION_BLOCK_ID, {TEMPLATE_SPECIALIZATION_OF}}};
+        {BI_TEMPLATE_SPECIALIZATION_BLOCK_ID, {TEMPLATE_SPECIALIZATION_OF}},
+        // Concept Block
+        {BI_CONCEPT_BLOCK_ID,
+         {CONCEPT_USR, CONCEPT_NAME, CONCEPT_IS_TYPE,
+          CONCEPT_CONSTRAINT_EXPRESSION}},
+        // Constraint Block
+        {BI_CONSTRAINT_BLOCK_ID, {CONSTRAINT_EXPRESSION}}};
 
 // AbbreviationMap
 
@@ -524,6 +538,8 @@ void ClangDocBitcodeWriter::emitBlock(const NamespaceInfo &I) {
     emitBlock(C);
   for (const auto &C : I.Children.Typedefs)
     emitBlock(C);
+  for (const auto &C : I.Children.Concepts)
+    emitBlock(C);
 }
 
 void ClangDocBitcodeWriter::emitBlock(const EnumInfo &I) {
@@ -627,12 +643,25 @@ void ClangDocBitcodeWriter::emitBlock(const FunctionInfo &I) {
     emitBlock(*I.Template);
 }
 
+void ClangDocBitcodeWriter::emitBlock(const ConceptInfo &I) {
+  StreamSubBlockGuard Block(Stream, BI_CONCEPT_BLOCK_ID);
+  emitRecord(I.USR, CONCEPT_USR);
+  emitRecord(I.Name, CONCEPT_NAME);
+  for (const auto &CI : I.Description)
+    emitBlock(CI);
+  emitRecord(I.IsType, CONCEPT_IS_TYPE);
+  emitRecord(I.ConstraintExpression, CONCEPT_CONSTRAINT_EXPRESSION);
+  emitBlock(I.Template);
+}
+
 void ClangDocBitcodeWriter::emitBlock(const TemplateInfo &T) {
   StreamSubBlockGuard Block(Stream, BI_TEMPLATE_BLOCK_ID);
   for (const auto &P : T.Params)
     emitBlock(P);
   if (T.Specialization)
     emitBlock(*T.Specialization);
+  for (const auto &C : T.Constraints)
+    emitBlock(C);
 }
 
 void ClangDocBitcodeWriter::emitBlock(const TemplateSpecializationInfo &T) {
@@ -647,6 +676,12 @@ void ClangDocBitcodeWriter::emitBlock(const TemplateParamInfo &T) {
   emitRecord(T.Contents, TEMPLATE_PARAM_CONTENTS);
 }
 
+void ClangDocBitcodeWriter::emitBlock(const ConstraintInfo &C) {
+  StreamSubBlockGuard Block(Stream, BI_CONSTRAINT_BLOCK_ID);
+  emitRecord(C.ConstraintExpr, CONSTRAINT_EXPRESSION);
+  emitBlock(C.ConceptRef, FieldId::F_concept);
+}
+
 bool ClangDocBitcodeWriter::dispatchInfoForWrite(Info *I) {
   switch (I->IT) {
   case InfoType::IT_namespace:
@@ -664,6 +699,9 @@ bool ClangDocBitcodeWriter::dispatchInfoForWrite(Info *I) {
   case InfoType::IT_typedef:
     emitBlock(*static_cast<clang::doc::TypedefInfo *>(I));
     break;
+  case InfoType::IT_concept:
+    emitBlock(*static_cast<clang::doc::ConceptInfo *>(I));
+    break;
   case InfoType::IT_default:
     llvm::errs() << "Unexpected info, unable to write.\n";
     return true;
diff --git a/clang-tools-extra/clang-doc/BitcodeWriter.h b/clang-tools-extra/clang-doc/BitcodeWriter.h
index e33a1aece883..4d0c0c07805e 100644
--- a/clang-tools-extra/clang-doc/BitcodeWriter.h
+++ b/clang-tools-extra/clang-doc/BitcodeWriter.h
@@ -66,7 +66,9 @@ enum BlockId {
   BI_TEMPLATE_BLOCK_ID,
   BI_TEMPLATE_SPECIALIZATION_BLOCK_ID,
   BI_TEMPLATE_PARAM_BLOCK_ID,
+  BI_CONSTRAINT_BLOCK_ID,
   BI_TYPEDEF_BLOCK_ID,
+  BI_CONCEPT_BLOCK_ID,
   BI_LAST,
   BI_FIRST = BI_VERSION_BLOCK_ID
 };
@@ -135,6 +137,11 @@ enum RecordId {
   TYPEDEF_NAME,
   TYPEDEF_DEFLOCATION,
   TYPEDEF_IS_USING,
+  CONCEPT_USR,
+  CONCEPT_NAME,
+  CONCEPT_IS_TYPE,
+  CONCEPT_CONSTRAINT_EXPRESSION,
+  CONSTRAINT_EXPRESSION,
   RI_LAST,
   RI_FIRST = VERSION
 };
@@ -150,7 +157,8 @@ enum class FieldId {
   F_vparent,
   F_type,
   F_child_namespace,
-  F_child_record
+  F_child_record,
+  F_concept
 };
 
 class ClangDocBitcodeWriter {
@@ -179,6 +187,8 @@ public:
   void emitBlock(const TemplateInfo &T);
   void emitBlock(const TemplateSpecializationInfo &T);
   void emitBlock(const TemplateParamInfo &T);
+  void emitBlock(const ConceptInfo &T);
+  void emitBlock(const ConstraintInfo &T);
   void emitBlock(const Reference &B, FieldId F);
 
 private:
diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
index 7293a129177c..935bbfee7a9b 100644
--- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp
+++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
@@ -985,6 +985,8 @@ llvm::Error HTMLGenerator::generateDocForInfo(Info *I, llvm::raw_ostream &OS,
     MainContentNodes =
         genHTML(*static_cast<clang::doc::TypedefInfo *>(I), CDCtx, InfoTitle);
     break;
+  case InfoType::IT_concept:
+    break;
   case InfoType::IT_default:
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "unexpected info type");
@@ -1011,6 +1013,8 @@ static std::string getRefType(InfoType IT) {
     return "enum";
   case InfoType::IT_typedef:
     return "typedef";
+  case InfoType::IT_concept:
+    return "concept";
   }
   llvm_unreachable("Unknown InfoType");
 }
diff --git a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
index 69c670b20844..81ba99c21e37 100644
--- a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
+++ b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
@@ -585,6 +585,8 @@ Error MustacheHTMLGenerator::generateDocForInfo(Info *I, raw_ostream &OS,
   case InfoType::IT_typedef:
     OS << "IT_typedef\n";
     break;
+  case InfoType::IT_concept:
+    break;
   case InfoType::IT_default:
     return createStringError(inconvertibleErrorCode(), "unexpected InfoType");
   }
diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp
index 0f7cbafcf513..8a37621597c6 100644
--- a/clang-tools-extra/clang-doc/JSONGenerator.cpp
+++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp
@@ -26,6 +26,15 @@ static void serializeInfo(const TypedefInfo &I, json::Object &Obj,
                           std::optional<StringRef> RepositoryUrl);
 static void serializeInfo(const EnumInfo &I, json::Object &Obj,
                           std::optional<StringRef> RepositoryUrl);
+static void serializeInfo(const ConstraintInfo &I, Object &Obj);
+
+// Convenience lambda to pass to serializeArray.
+// If a serializeInfo needs a RepositoryUrl, create a local lambda that captures
+// the optional.
+static auto SerializeInfoLambda = [](const ConstraintInfo &Info,
+                                     Object &Object) {
+  serializeInfo(Info, Object);
+};
 
 static json::Object serializeLocation(const Location &Loc,
                                       std::optional<StringRef> RepositoryUrl) {
@@ -248,6 +257,27 @@ static void serializeCommonChildren(const ScopeChildren &Children,
   }
 }
 
+template <typename T, typename SerializationFunc>
+static void serializeArray(const std::vector<T> &Records, Object &Obj,
+                           const std::string &Key,
+                           SerializationFunc SerializeInfo) {
+  json::Value RecordsArray = Array();
+  auto &RecordsArrayRef = *RecordsArray.getAsArray();
+  RecordsArrayRef.reserve(Records.size());
+  for (const auto &Item : Records) {
+    json::Value ItemVal = Object();
+    auto &ItemObj = *ItemVal.getAsObject();
+    SerializeInfo(Item, ItemObj);
+    RecordsArrayRef.push_back(ItemVal);
+  }
+  Obj[Key] = RecordsArray;
+}
+
+static void serializeInfo(const ConstraintInfo &I, Object &Obj) {
+  serializeReference(I.ConceptRef, Obj);
+  Obj["Expression"] = I.ConstraintExpr;
+}
+
 static void serializeInfo(const TemplateInfo &Template, Object &Obj) {
   json::Value TemplateVal = Object();
   auto &TemplateObj = *TemplateVal.getAsObject();
@@ -277,9 +307,21 @@ static void serializeInfo(const TemplateInfo &Template, Object &Obj) {
     TemplateObj["Parameters"] = ParamsArray;
   }
 
+  if (!Template.Constraints.empty())
+    serializeArray(Template.Constraints, TemplateObj, "Constraints",
+                   SerializeInfoLambda);
+
   Obj["Template"] = TemplateVal;
 }
 
+static void serializeInfo(const ConceptInfo &I, Object &Obj,
+                          std::optional<StringRef> RepositoryUrl) {
+  serializeCommonAttributes(I, Obj, RepositoryUrl);
+  Obj["IsType"] = I.IsType;
+  Obj["ConstraintExpression"] = I.ConstraintExpression;
+  serializeInfo(I.Template, Obj);
+}
+
 static void serializeInfo(const TypeInfo &I, Object &Obj) {
   Obj["Name"] = I.Type.Name;
   Obj["QualName"] = I.Type.QualName;
@@ -457,6 +499,10 @@ static void serializeInfo(const NamespaceInfo &I, json::Object &Obj,
     Obj["Namespaces"] = NamespacesArray;
   }
 
+  auto SerializeInfo = [RepositoryUrl](const auto &Info, Object &Object) {
+    serializeInfo(Info, Object, RepositoryUrl);
+  };
+
   if (!I.Children.Functions.empty()) {
     json::Value FunctionsArray = Array();
     auto &FunctionsArrayRef = *FunctionsArray.getAsArray();
@@ -470,6 +516,9 @@ static void serializeInfo(const NamespaceInfo &I, json::Object &Obj,
     Obj["Functions"] = FunctionsArray;
   }
 
+  if (!I.Children.Concepts.empty())
+    serializeArray(I.Children.Concepts, Obj, "Concepts", SerializeInfo);
+
   serializeCommonChildren(I.Children, Obj, RepositoryUrl);
 }
 
@@ -520,6 +569,7 @@ Error JSONGenerator::generateDocForInfo(Info *I, raw_ostream &OS,
   case InfoType::IT_record:
     serializeInfo(*static_cast<RecordInfo *>(I), Obj, CDCtx.RepositoryUrl);
     break;
+  case InfoType::IT_concept:
   case InfoType::IT_enum:
   case InfoType::IT_function:
   case InfoType::IT_typedef:
diff --git a/clang-tools-extra/clang-doc/MDGenerator.cpp b/clang-tools-extra/clang-doc/MDGenerator.cpp
index 2becccf8b07d..6e68e09cfa2a 100644
--- a/clang-tools-extra/clang-doc/MDGenerator.cpp
+++ b/clang-tools-extra/clang-doc/MDGenerator.cpp
@@ -372,6 +372,9 @@ static llvm::Error genIndex(ClangDocContext &CDCtx) {
       case InfoType::IT_typedef:
         Type = "Typedef";
         break;
+      case InfoType::IT_concept:
+        Type = "Concept";
+        break;
       case InfoType::IT_default:
         Type = "Other";
       }
@@ -464,6 +467,8 @@ llvm::Error MDGenerator::generateDocForInfo(Info *I, llvm::raw_ostream &OS,
   case InfoType::IT_typedef:
     genMarkdown(CDCtx, *static_cast<clang::doc::TypedefInfo *>(I), OS);
     break;
+  case InfoType::IT_concept:
+    break;
   case InfoType::IT_default:
     return createStringError(llvm::inconvertibleErrorCode(),
                              "unexpected InfoType");
diff --git a/clang-tools-extra/clang-doc/Mapper.cpp b/clang-tools-extra/clang-doc/Mapper.cpp
index 9f640b5325da..6021e17b4696 100644
--- a/clang-tools-extra/clang-doc/Mapper.cpp
+++ b/clang-tools-extra/clang-doc/Mapper.cpp
@@ -134,6 +134,10 @@ bool MapASTVisitor::VisitTypeAliasDecl(const TypeAliasDecl *D) {
   return mapDecl(D, /*isDefinition=*/true);
 }
 
+bool MapASTVisitor::VisitConceptDecl(const ConceptDecl *D) {
+  return mapDecl(D, true);
+}
+
 comments::FullComment *
 MapASTVisitor::getComment(const NamedDecl *D, const ASTContext &Context) const {
   RawComment *Comment = Context.getRawCommentForDeclNoCache(D);
diff --git a/clang-tools-extra/clang-doc/Mapper.h b/clang-tools-extra/clang-doc/Mapper.h
index 36322ea2bfb7..04dc5450c8ba 100644
--- a/clang-tools-extra/clang-doc/Mapper.h
+++ b/clang-tools-extra/clang-doc/Mapper.h
@@ -41,6 +41,7 @@ public:
   bool VisitFunctionDecl(const FunctionDecl *D);
   bool VisitTypedefDecl(const TypedefDecl *D);
   bool VisitTypeAliasDecl(const TypeAliasDecl *D);
+  bool VisitConceptDecl(const ConceptDecl *D);
 
 private:
   template <typename T> bool mapDecl(const T *D, bool IsDefinition);
diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp
index 71a926f1c73e..286aeeea1001 100644
--- a/clang-tools-extra/clang-doc/Representation.cpp
+++ b/clang-tools-extra/clang-doc/Representation.cpp
@@ -143,6 +143,8 @@ mergeInfos(std::vector<std::unique_ptr<Info>> &Values) {
     return reduce<FunctionInfo>(Values);
   case InfoType::IT_typedef:
     return reduce<TypedefInfo>(Values);
+  case InfoType::IT_concept:
+    return reduce<ConceptInfo>(Values);
   case InfoType::IT_default:
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "unexpected info type");
@@ -288,6 +290,7 @@ void NamespaceInfo::merge(NamespaceInfo &&Other) {
   reduceChildren(Children.Functions, std::move(Other.Children.Functions));
   reduceChildren(Children.Enums, std::move(Other.Children.Enums));
   reduceChildren(Children.Typedefs, std::move(Other.Children.Typedefs));
+  reduceChildren(Children.Concepts, std::move(Other.Children.Concepts));
   mergeBase(std::move(Other));
 }
 
@@ -352,6 +355,19 @@ void TypedefInfo::merge(TypedefInfo &&Other) {
   SymbolInfo::merge(std::move(Other));
 }
 
+void ConceptInfo::merge(ConceptInfo &&Other) {
+  assert(mergeable(Other));
+  if (!IsType)
+    IsType = Other.IsType;
+  if (ConstraintExpression.empty())
+    ConstraintExpression = std::move(Other.ConstraintExpression);
+  if (Template.Constraints.empty())
+    Template.Constraints = std::move(Other.Template.Constraints);
+  if (Template.Params.empty())
+    Template.Params = std::move(Other.Template.Params);
+  SymbolInfo::merge(std::move(Other));
+}
+
 BaseRecordInfo::BaseRecordInfo() : RecordInfo() {}
 
 BaseRecordInfo::BaseRecordInfo(SymbolID USR, StringRef Name, StringRef Path,
@@ -388,6 +404,9 @@ llvm::SmallString<16> Info::extractName() const {
   case InfoType::IT_function:
     return llvm::SmallString<16>("@nonymous_function_" +
                                  toHex(llvm::toStringRef(USR)));
+  case InfoType::IT_concept:
+    return llvm::SmallString<16>("@nonymous_concept_" +
+                                 toHex(llvm::toStringRef(USR)));
   case InfoType::IT_default:
     return llvm::SmallString<16>("@nonymous_" + toHex(llvm::toStringRef(USR)));
   }
@@ -453,6 +472,7 @@ void ScopeChildren::sort() {
   llvm::sort(Functions.begin(), Functions.end());
   llvm::sort(Enums.begin(), Enums.end());
   llvm::sort(Typedefs.begin(), Typedefs.end());
+  llvm::sort(Concepts.begin(), Concepts.end());
 }
 } // namespace doc
 } // namespace clang
diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h
index 75da50064581..b23069f2bd32 100644
--- a/clang-tools-extra/clang-doc/Representation.h
+++ b/clang-tools-extra/clang-doc/Representation.h
@@ -35,6 +35,7 @@ struct EnumInfo;
 struct FunctionInfo;
 struct Info;
 struct TypedefInfo;
+struct ConceptInfo;
 
 enum class InfoType {
   IT_default,
@@ -42,7 +43,8 @@ enum class InfoType {
   IT_record,
   IT_function,
   IT_enum,
-  IT_typedef
+  IT_typedef,
+  IT_concept
 };
 
 enum class CommentKind {
@@ -166,6 +168,7 @@ struct ScopeChildren {
   std::vector<FunctionInfo> Functions;
   std::vector<EnumInfo> Enums;
   std::vector<TypedefInfo> Typedefs;
+  std::vector<ConceptInfo> Concepts;
 
   void sort();
 };
@@ -211,6 +214,15 @@ struct TemplateSpecializationInfo {
   std::vector<TemplateParamInfo> Params;
 };
 
+struct ConstraintInfo {
+  ConstraintInfo() = default;
+  ConstraintInfo(SymbolID USR, StringRef Name)
+      : ConceptRef(USR, Name, InfoType::IT_concept) {}
+  Reference ConceptRef;
+
+  SmallString<16> ConstraintExpr;
+};
+
 // Records the template information for a struct or function that is a template
 // or an explicit template specialization.
 struct TemplateInfo {
@@ -219,6 +231,7 @@ struct TemplateInfo {
 
   // Set when this is a specialization of another record/function.
   std::optional<TemplateSpecializationInfo> Specialization;
+  std::vector<ConstraintInfo> Constraints;
 };
 
 // Info for field types.
@@ -513,6 +526,17 @@ struct EnumInfo : public SymbolInfo {
   llvm::SmallVector<EnumValueInfo, 4> Members; // List of enum members.
 };
 
+struct ConceptInfo : public SymbolInfo {
+  ConceptInfo() : SymbolInfo(InfoType::IT_concept) {}
+  ConceptInfo(SymbolID USR) : SymbolInfo(InfoType::IT_concept, USR) {}
+
+  void merge(ConceptInfo &&I);
+
+  bool IsType;
+  TemplateInfo Template;
+  SmallString<16> ConstraintExpression;
+};
+
 struct Index : public Reference {
   Index() = default;
   Index(StringRef Name) : Reference(SymbolID(), Name) {}
diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp
index 820e8bfd8e64..5f3e5c37fa34 100644
--- a/clang-tools-extra/clang-doc/Serialize.cpp
+++ b/clang-tools-extra/clang-doc/Serialize.cpp
@@ -21,6 +21,17 @@ namespace clang {
 namespace doc {
 namespace serialize {
 
+namespace {
+static SmallString<16> exprToString(const clang::Expr *E) {
+  clang::LangOptions Opts;
+  clang::PrintingPolicy Policy(Opts);
+  SmallString<16> Result;
+  llvm::raw_svector_ostream OS(Result);
+  E->printPretty(OS, nullptr, Policy);
+  return Result;
+}
+} // namespace
+
 SymbolID hashUSR(llvm::StringRef USR) {
   return llvm::SHA1::hash(arrayRefFromStringRef(USR));
 }
@@ -388,6 +399,8 @@ std::string serialize(std::unique_ptr<Info> &I) {
     return serialize(*static_cast<EnumInfo *>(I.get()));
   case InfoType::IT_function:
     return serialize(*static_cast<FunctionInfo *>(I.get()));
+  case InfoType::IT_concept:
+    return serialize(*static_cast<ConceptInfo *>(I.get()));
   case InfoType::IT_typedef:
   case InfoType::IT_default:
     return "";
@@ -491,6 +504,10 @@ static void InsertChild(ScopeChildren &Scope, TypedefInfo Info) {
   Scope.Typedefs.push_back(std::move(Info));
 }
 
+static void InsertChild(ScopeChildren &Scope, ConceptInfo Info) {
+  Scope.Concepts.push_back(std::move(Info));
+}
+
 // Creates a parent of the correct type for the given child and inserts it into
 // that parent.
 //
@@ -531,6 +548,7 @@ static std::unique_ptr<Info> makeAndInsertIntoParent(ChildType Child) {
   case InfoType::IT_enum:
   case InfoType::IT_function:
   case InfoType::IT_typedef:
+  case InfoType::IT_concept:
     break;
   }
   llvm_unreachable("Invalid reference type for parent namespace");
@@ -740,6 +758,50 @@ static void populateSymbolInfo(SymbolInfo &I, const T *D, const FullComment *C,
     I.Loc.emplace_back(Loc);
 }
 
+static void
+handleCompoundConstraints(const Expr *Constraint,
+                          std::vector<ConstraintInfo> &ConstraintInfos) {
+  if (Constraint->getStmtClass() == Stmt::ParenExprClass) {
+    handleCompoundConstraints(dyn_cast<ParenExpr>(Constraint)->getSubExpr(),
+                              ConstraintInfos);
+  } else if (Constraint->getStmtClass() == Stmt::BinaryOperatorClass) {
+    auto *BinaryOpExpr = dyn_cast<BinaryOperator>(Constraint);
+    handleCompoundConstraints(BinaryOpExpr->getLHS(), ConstraintInfos);
+    handleCompoundConstraints(BinaryOpExpr->getRHS(), ConstraintInfos);
+  } else if (Constraint->getStmtClass() ==
+             Stmt::ConceptSpecializationExprClass) {
+    auto *Concept = dyn_cast<ConceptSpecializationExpr>(Constraint);
+    ConstraintInfo CI(getUSRForDecl(Concept->getNamedConcept()),
+                      Concept->getNamedConcept()->getNameAsString());
+    CI.ConstraintExpr = exprToString(Concept);
+    ConstraintInfos.push_back(CI);
+  }
+}
+
+static void populateConstraints(TemplateInfo &I, const TemplateDecl *D) {
+  if (!D || !D->hasAssociatedConstraints())
+    return;
+
+  SmallVector<AssociatedConstraint> AssociatedConstraints;
+  D->getAssociatedConstraints(AssociatedConstraints);
+  for (const auto &Constraint : AssociatedConstraints) {
+    if (!Constraint)
+      continue;
+
+    // TODO: Investigate if atomic constraints need to be handled specifically.
+    if (const auto *ConstraintExpr =
+            dyn_cast_or_null<ConceptSpecializationExpr>(
+                Constraint.ConstraintExpr)) {
+      ConstraintInfo CI(getUSRForDecl(ConstraintExpr->getNamedConcept()),
+                        ConstraintExpr->getNamedConcept()->getNameAsString());
+      CI.ConstraintExpr = exprToString(ConstraintExpr);
+      I.Constraints.push_back(std::move(CI));
+    } else {
+      handleCompoundConstraints(Constraint.ConstraintExpr, I.Constraints);
+    }
+  }
+}
+
 static void populateFunctionInfo(FunctionInfo &I, const FunctionDecl *D,
                                  const FullComment *FC, Location Loc,
                                  bool &IsInAnonymousNamespace) {
@@ -751,6 +813,8 @@ static void populateFunctionInfo(FunctionInfo &I, const FunctionDecl *D,
   I.IsStatic = D->isStatic();
 
   populateTemplateParameters(I.Template, D);
+  if (I.Template)
+    populateConstraints(I.Template.value(), D->getDescribedFunctionTemplate());
 
   // Handle function template specializations.
   if (const FunctionTemplateSpecializationInfo *FTSI =
@@ -903,6 +967,8 @@ emitInfo(const RecordDecl *D, const FullComment *FC, Location Loc,
   RI->Path = getInfoRelativePath(RI->Namespace);
 
   populateTemplateParameters(RI->Template, D);
+  if (RI->Template)
+    populateConstraints(RI->Template.value(), D->getDescribedTemplate());
 
   // Full and partial specializations.
   if (auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
@@ -1074,6 +1140,30 @@ emitInfo(const EnumDecl *D, const FullComment *FC, Location Loc,
   return {nullptr, makeAndInsertIntoParent<EnumInfo &&>(std::move(Enum))};
 }
 
+std::pair<std::unique_ptr<Info>, std::unique_ptr<Info>>
+emitInfo(const ConceptDecl *D, const FullComment *FC, const Location &Loc,
+         bool PublicOnly) {
+  ConceptInfo Concept;
+
+  bool IsInAnonymousNamespace = false;
+  populateInfo(Concept, D, FC, IsInAnonymousNamespace);
+  Concept.IsType = D->isTypeConcept();
+  Concept.DefLoc = Loc;
+  Concept.ConstraintExpression = exprToString(D->getConstraintExpr());
+
+  if (auto *ConceptParams = D->getTemplateParameters()) {
+    for (const auto *Param : ConceptParams->asArray()) {
+      Concept.Template.Params.emplace_back(
+          getSourceCode(Param, Param->getSourceRange()));
+    }
+  }
+
+  if (!shouldSerializeInfo(PublicOnly, IsInAnonymousNamespace, D))
+    return {};
+
+  return {nullptr, makeAndInsertIntoParent<ConceptInfo &&>(std::move(Concept))};
+}
+
 } // namespace serialize
 } // namespace doc
 } // namespace clang
diff --git a/clang-tools-extra/clang-doc/Serialize.h b/clang-tools-extra/clang-doc/Serialize.h
index 7e6cbb70721e..497b09bb339f 100644
--- a/clang-tools-extra/clang-doc/Serialize.h
+++ b/clang-tools-extra/clang-doc/Serialize.h
@@ -68,6 +68,10 @@ std::pair<std::unique_ptr<Info>, std::unique_ptr<Info>>
 emitInfo(const TypeAliasDecl *D, const FullComment *FC, Location Loc,
          bool PublicOnly);
 
+std::pair<std::unique_ptr<Info>, std::unique_ptr<Info>>
+emitInfo(const ConceptDecl *D, const FullComment *FC, const Location &Loc,
+         bool PublicOnly);
+
 // Function to hash a given USR value for storage.
 // As USRs (Unified Symbol Resolution) could be large, especially for functions
 // with long type arguments, we use 160-bits SHA1(USR) values to
diff --git a/clang-tools-extra/clang-doc/YAMLGenerator.cpp b/clang-tools-extra/clang-doc/YAMLGenerator.cpp
index 897b5d5ae4c9..f95887104698 100644
--- a/clang-tools-extra/clang-doc/YAMLGenerator.cpp
+++ b/clang-tools-extra/clang-doc/YAMLGenerator.cpp
@@ -408,6 +408,8 @@ llvm::Error YAMLGenerator::generateDocForInfo(Info *I, llvm::raw_ostream &OS,
   case InfoType::IT_typedef:
     InfoYAML << *static_cast<clang::doc::TypedefInfo *>(I);
     break;
+  case InfoType::IT_concept:
+    break;
   case InfoType::IT_default:
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "unexpected InfoType");
diff --git a/clang-tools-extra/test/clang-doc/json/class-requires.cpp b/clang-tools-extra/test/clang-doc/json/class-requires.cpp
index af108a402b40..2dd25771d6d8 100644
--- a/clang-tools-extra/test/clang-doc/json/class-requires.cpp
+++ b/clang-tools-extra/test/clang-doc/json/class-requires.cpp
@@ -18,15 +18,15 @@ struct MyClass;
 // CHECK-NEXT:  "Path": "GlobalNamespace",
 // CHECK-NEXT:  "TagType": "struct",
 // CHECK-NEXT:  "Template": {
-// CHECK-NOT:     "Constraints": [
-// CHECK-NOT:       {
-// CHECK-NOT:         "Expression": "Addable<T>",
-// CHECK-NOT:         "Name": "Addable",
-// CHECK-NOT:         "Path": "",
-// CHECK-NOT:         "QualName": "Addable",
-// CHECK-NOT:         "USR": "{{[0-9A-F]*}}"
-// CHECK-NOT:       }
-// CHECK-NOT:     ],
+// CHECK-NEXT:    "Constraints": [
+// CHECK-NEXT:      {
+// CHECK-NEXT:        "Expression": "Addable<T>",
+// CHECK-NEXT:        "Name": "Addable",
+// CHECK-NEXT:        "Path": "",
+// CHECK-NEXT:        "QualName": "Addable",
+// CHECK-NEXT:        "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:      }
+// CHECK-NEXT:    ],
 // CHECK-NEXT:    "Parameters": [
 // CHECK-NEXT:      "typename T"
 // CHECK-NEXT:    ]
diff --git a/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp
new file mode 100644
index 000000000000..b49dec5cc78c
--- /dev/null
+++ b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp
@@ -0,0 +1,121 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s
+// RUN: FileCheck %s < %t/GlobalNamespace/index.json
+
+template<typename T> concept Incrementable = requires (T a) {
+  a++;
+};
+
+template<typename T> concept Decrementable = requires (T a) {
+  a--;
+};
+
+template<typename T> concept PreIncrementable = requires (T a) {
+  ++a;
+};
+
+template<typename T> concept PreDecrementable = requires (T a) {
+  --a;
+};
+
+template<typename T> requires Incrementable<T> && Decrementable<T> void One();
+
+template<typename T> requires (Incrementable<T> && Decrementable<T>) void Two();
+
+template<typename T> requires (Incrementable<T> && Decrementable<T>) || (PreIncrementable<T> && PreDecrementable<T>) void Three();
+
+template<typename T> requires (Incrementable<T> && Decrementable<T>) || PreIncrementable<T> void Four();
+
+// CHECK:         "Name": "One",
+// CHECK:         "Template": {
+// CHECK-NEXT:      "Constraints": [
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "Incrementable<T>",
+// CHECK-NEXT:          "Name": "Incrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "Incrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        },
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "Decrementable<T>",
+// CHECK-NEXT:          "Name": "Decrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "Decrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        }
+// CHECK-NEXT:      ],
+// CHECK:         "Name": "Two",
+// CHECK:         "Template": {
+// CHECK-NEXT:      "Constraints": [
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "Incrementable<T>",
+// CHECK-NEXT:          "Name": "Incrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "Incrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        },
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "Decrementable<T>",
+// CHECK-NEXT:          "Name": "Decrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "Decrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        }
+// CHECK-NEXT:      ],
+// CHECK:         "Name": "Three",
+// CHECK:         "Template": {
+// CHECK-NEXT:      "Constraints": [
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "Incrementable<T>",
+// CHECK-NEXT:          "Name": "Incrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "Incrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        },
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "Decrementable<T>",
+// CHECK-NEXT:          "Name": "Decrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "Decrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        },
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "PreIncrementable<T>",
+// CHECK-NEXT:          "Name": "PreIncrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "PreIncrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        },
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "PreDecrementable<T>",
+// CHECK-NEXT:          "Name": "PreDecrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "PreDecrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        }
+// CHECK-NEXT:      ],
+// CHECK:         "Name": "Four",
+// CHECK:         "Template": {
+// CHECK-NEXT:      "Constraints": [
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "Incrementable<T>",
+// CHECK-NEXT:          "Name": "Incrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "Incrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        },
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "Decrementable<T>",
+// CHECK-NEXT:          "Name": "Decrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "Decrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        },
+// CHECK-NEXT:        {
+// CHECK-NEXT:          "Expression": "PreIncrementable<T>",
+// CHECK-NEXT:          "Name": "PreIncrementable",
+// CHECK-NEXT:          "Path": "",
+// CHECK-NEXT:          "QualName": "PreIncrementable",
+// CHECK-NEXT:          "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:        }
+// CHECK-NEXT:      ],
diff --git a/clang-tools-extra/test/clang-doc/json/concept.cpp b/clang-tools-extra/test/clang-doc/json/concept.cpp
index 624f71c6bf9b..887c9d79146a 100644
--- a/clang-tools-extra/test/clang-doc/json/concept.cpp
+++ b/clang-tools-extra/test/clang-doc/json/concept.cpp
@@ -1,5 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s
+// RUN: FileCheck %s < %t/GlobalNamespace/index.json
 
 // Requires that T suports post and pre-incrementing.
 template<typename T>
@@ -8,30 +9,30 @@ concept Incrementable = requires(T x) {
   x++;
 };
 
-// CHECK:      {
-// CHECK-NOT:    "Concepts": [
-// CHECK-NOT:      {
-// CHECK-NOT:        "ConstraintExpression": "requires (T x) { ++x; x++; }",
-// CHECK-NOT:        "Description": [
-// CHECK-NOT:          {
-// CHECK-NOT:            "FullComment": {
-// CHECK-NOT:              "Children": [
-// CHECK-NOT:                {
-// CHECK-NOT:                  "ParagraphComment": {
-// CHECK-NOT:                    "Children": [
-// CHECK-NOT:                      {
-// CHECK-NOT:                        "TextComment": " Requires that T suports post and pre-incrementing."
-// CHECK-NOT:        ],
-// CHECK-NOT:        "IsType": true,
-// CHECK-NOT:        "Name": "Incrementable",
-// CHECK-NOT:        "Template": {
-// CHECK-NOT:          "Parameters": [
-// CHECK-NOT:            "typename T"
-// CHECK-NOT:          ]
-// CHECK-NOT:        },
-// CHECK-NOT:        "USR": "{{[0-9A-F]*}}"
-// CHECK-NOT:      }
-// CHECK-NOT:    ],
+// CHECK:       {
+// CHECK-NEXT:    "Concepts": [
+// CHECK-NEXT:      {
+// CHECK-NEXT:        "ConstraintExpression": "requires (T x) { ++x; x++; }",
+// CHECK-NEXT:        "Description": [
+// CHECK-NEXT:          {
+// CHECK-NEXT:            "FullComment": {
+// CHECK-NEXT:              "Children": [
+// CHECK-NEXT:                {
+// CHECK-NEXT:                  "ParagraphComment": {
+// CHECK-NEXT:                    "Children": [
+// CHECK-NEXT:                      {
+// CHECK-NEXT:                        "TextComment": " Requires that T suports post and pre-incrementing."
+// CHECK:             ],
+// CHECK-NEXT:        "IsType": true,
+// CHECK-NEXT:        "Name": "Incrementable",
+// CHECK-NEXT:        "Template": {
+// CHECK-NEXT:          "Parameters": [
+// CHECK-NEXT:            "typename T"
+// CHECK-NEXT:          ]
+// CHECK-NEXT:        },
+// CHECK-NEXT:        "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:      }
+// CHECK-NEXT:    ],
 // CHECK:        "Name": "",
 // CHECK:        "USR": "0000000000000000000000000000000000000000"
 // CHECK:      }
diff --git a/clang-tools-extra/test/clang-doc/json/function-requires.cpp b/clang-tools-extra/test/clang-doc/json/function-requires.cpp
index aa62464d07b4..99eb2bdb898f 100644
--- a/clang-tools-extra/test/clang-doc/json/function-requires.cpp
+++ b/clang-tools-extra/test/clang-doc/json/function-requires.cpp
@@ -30,15 +30,15 @@ template<Incrementable T> Incrementable auto incrementTwo(T t);
 // CHECK-NEXT:        "USR": "0000000000000000000000000000000000000000"
 // CHECK-NEXT:      },
 // CHECK-NEXT:      "Template": {
-// CHECK-NOT:         "Constraints": [
-// CHECK-NOT:           {
-// CHECK-NOT:             "Expression": "Incrementable<T>",
-// CHECK-NOT:             "Name": "Incrementable",
-// CHECK-NOT:             "Path": "",
-// CHECK-NOT:             "QualName": "Incrementable",
-// CHECK-NOT:             "USR": "{{[0-9A-F]*}}"
-// CHECK-NOT:           }
-// CHECK-NOT:         ],
+// CHECK-NEXT:        "Constraints": [
+// CHECK-NEXT:          {
+// CHECK-NEXT:            "Expression": "Incrementable<T>",
+// CHECK-NEXT:            "Name": "Incrementable",
+// CHECK-NEXT:            "Path": "",
+// CHECK-NEXT:            "QualName": "Incrementable",
+// CHECK-NEXT:            "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:          }
+// CHECK-NEXT:        ],
 // CHECK-NEXT:        "Parameters": [
 // CHECK-NEXT:          "typename T"
 // CHECK-NEXT:        ]
@@ -62,15 +62,15 @@ template<Incrementable T> Incrementable auto incrementTwo(T t);
 // CHECK-NEXT:        "USR": "0000000000000000000000000000000000000000"
 // CHECK-NEXT:      },
 // CHECK-NEXT:      "Template": {
-// CHECK-NOT:         "Constraints": [
-// CHECK-NOT:           {
-// CHECK-NOT:             "Expression": "Incrementable<T>",
-// CHECK-NOT:             "Name": "Incrementable",
-// CHECK-NOT:             "Path": "",
-// CHECK-NOT:             "QualName": "Incrementable",
-// CHECK-NOT:             "USR": "{{[0-9A-F]*}}"
-// CHECK-NOT:           }
-// CHECK-NOT:         ],
+// CHECK-NEXT:        "Constraints": [
+// CHECK-NEXT:          {
+// CHECK-NEXT:            "Expression": "Incrementable<T>",
+// CHECK-NEXT:            "Name": "Incrementable",
+// CHECK-NEXT:            "Path": "",
+// CHECK-NEXT:            "QualName": "Incrementable",
+// CHECK-NEXT:            "USR": "{{[0-9A-F]*}}"
+// CHECK-NEXT:          }
+// CHECK-NEXT:        ],
 // CHECK-NEXT:        "Parameters": [
 // CHECK-NEXT:          "Incrementable T"
 // CHECK-NEXT:        ]
diff --git a/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp b/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
index 659870d2a5c0..a38dfd303660 100644
--- a/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
@@ -37,6 +37,8 @@ static std::string writeInfo(Info *I) {
     return writeInfo(*static_cast<FunctionInfo *>(I));
   case InfoType::IT_typedef:
     return writeInfo(*static_cast<TypedefInfo *>(I));
+  case InfoType::IT_concept:
+    return writeInfo(*static_cast<ConceptInfo *>(I));
   case InfoType::IT_default:
     return "";
   }

From 72979093e79ca905eac7ce951423f8b0f81c28a8 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <95053726+gandhi56@users.noreply.github.com>
Date: Fri, 20 Jun 2025 22:51:23 -0400
Subject: [PATCH 1132/1322] Revert "[Reland][InstCombine] Iterative replacement
 in PtrReplacer" (#145137)

Reverts llvm/llvm-project#144626
---
 .../InstCombineLoadStoreAlloca.cpp            | 163 ++++++++----------
 .../InstCombine/AMDGPU/ptr-replace-alloca.ll  |  79 ---------
 2 files changed, 68 insertions(+), 174 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 9aec90120d8b..a9751ab03e20 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -243,10 +243,11 @@ public:
   void replacePointer(Value *V);
 
 private:
+  bool collectUsersRecursive(Instruction &I);
   void replace(Instruction *I);
-  Value *getReplacement(Value *V) const { return WorkMap.lookup(V); }
+  Value *getReplacement(Value *I);
   bool isAvailable(Instruction *I) const {
-    return I == &Root || UsersToReplace.contains(I);
+    return I == &Root || Worklist.contains(I);
   }
 
   bool isEqualOrValidAddrSpaceCast(const Instruction *I,
@@ -258,7 +259,8 @@ private:
     return (FromAS == ToAS) || IC.isValidAddrSpaceCast(FromAS, ToAS);
   }
 
-  SmallSetVector<Instruction *, 32> UsersToReplace;
+  SmallPtrSet<Instruction *, 32> ValuesToRevisit;
+  SmallSetVector<Instruction *, 4> Worklist;
   MapVector<Value *, Value *> WorkMap;
   InstCombinerImpl &IC;
   Instruction &Root;
@@ -267,79 +269,72 @@ private:
 } // end anonymous namespace
 
 bool PointerReplacer::collectUsers() {
-  SmallVector<Instruction *> Worklist;
-  SmallSetVector<Instruction *, 32> ValuesToRevisit;
+  if (!collectUsersRecursive(Root))
+    return false;
 
-  auto PushUsersToWorklist = [&](Instruction *Inst) {
-    for (auto *U : Inst->users())
-      if (auto *I = dyn_cast<Instruction>(U))
-        if (!isAvailable(I) && !ValuesToRevisit.contains(I))
-          Worklist.emplace_back(I);
-  };
+  // Ensure that all outstanding (indirect) users of I
+  // are inserted into the Worklist. Return false
+  // otherwise.
+  return llvm::set_is_subset(ValuesToRevisit, Worklist);
+}
 
-  PushUsersToWorklist(&Root);
-  while (!Worklist.empty()) {
-    Instruction *Inst = Worklist.pop_back_val();
+bool PointerReplacer::collectUsersRecursive(Instruction &I) {
+  for (auto *U : I.users()) {
+    auto *Inst = cast<Instruction>(&*U);
     if (auto *Load = dyn_cast<LoadInst>(Inst)) {
       if (Load->isVolatile())
         return false;
-      UsersToReplace.insert(Load);
+      Worklist.insert(Load);
     } else if (auto *PHI = dyn_cast<PHINode>(Inst)) {
-      /// TODO: Handle poison and null pointers for PHI and select.
-      // If all incoming values are available, mark this PHI as
-      // replacable and push it's users into the worklist.
-      bool IsReplacable = true;
-      if (all_of(PHI->incoming_values(), [&](Value *V) {
-            if (!isa<Instruction>(V))
-              return IsReplacable = false;
-            return isAvailable(cast<Instruction>(V));
+      // All incoming values must be instructions for replacability
+      if (any_of(PHI->incoming_values(),
+                 [](Value *V) { return !isa<Instruction>(V); }))
+        return false;
+
+      // If at least one incoming value of the PHI is not in Worklist,
+      // store the PHI for revisiting and skip this iteration of the
+      // loop.
+      if (any_of(PHI->incoming_values(), [this](Value *V) {
+            return !isAvailable(cast<Instruction>(V));
           })) {
-        UsersToReplace.insert(PHI);
-        PushUsersToWorklist(PHI);
+        ValuesToRevisit.insert(Inst);
         continue;
       }
 
-      // Either an incoming value is not an instruction or not all
-      // incoming values are available. If this PHI was already
-      // visited prior to this iteration, return false.
-      if (!IsReplacable || !ValuesToRevisit.insert(PHI))
+      Worklist.insert(PHI);
+      if (!collectUsersRecursive(*PHI))
         return false;
-
-      // Push PHI back into the stack, followed by unavailable
-      // incoming values.
-      Worklist.emplace_back(PHI);
-      for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); ++Idx) {
-        auto *IncomingValue = cast<Instruction>(PHI->getIncomingValue(Idx));
-        if (UsersToReplace.contains(IncomingValue))
-          continue;
-        if (!ValuesToRevisit.insert(IncomingValue))
-          return false;
-        Worklist.emplace_back(IncomingValue);
-      }
     } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
-      auto *TrueInst = dyn_cast<Instruction>(SI->getTrueValue());
-      auto *FalseInst = dyn_cast<Instruction>(SI->getFalseValue());
-      if (!TrueInst || !FalseInst)
+      if (!isa<Instruction>(SI->getTrueValue()) ||
+          !isa<Instruction>(SI->getFalseValue()))
         return false;
 
-      UsersToReplace.insert(SI);
-      PushUsersToWorklist(SI);
-    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
-      UsersToReplace.insert(GEP);
-      PushUsersToWorklist(GEP);
+      if (!isAvailable(cast<Instruction>(SI->getTrueValue())) ||
+          !isAvailable(cast<Instruction>(SI->getFalseValue()))) {
+        ValuesToRevisit.insert(Inst);
+        continue;
+      }
+      Worklist.insert(SI);
+      if (!collectUsersRecursive(*SI))
+        return false;
+    } else if (isa<GetElementPtrInst>(Inst)) {
+      Worklist.insert(Inst);
+      if (!collectUsersRecursive(*Inst))
+        return false;
     } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
       if (MI->isVolatile())
         return false;
-      UsersToReplace.insert(Inst);
+      Worklist.insert(Inst);
     } else if (isEqualOrValidAddrSpaceCast(Inst, FromAS)) {
-      UsersToReplace.insert(Inst);
-      PushUsersToWorklist(Inst);
+      Worklist.insert(Inst);
+      if (!collectUsersRecursive(*Inst))
+        return false;
     } else if (Inst->isLifetimeStartOrEnd()) {
       continue;
     } else {
       // TODO: For arbitrary uses with address space mismatches, should we check
       // if we can introduce a valid addrspacecast?
-      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *Inst << '\n');
+      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
       return false;
     }
   }
@@ -347,39 +342,7 @@ bool PointerReplacer::collectUsers() {
   return true;
 }
 
-void PointerReplacer::replacePointer(Value *V) {
-  assert(cast<PointerType>(Root.getType()) != cast<PointerType>(V->getType()) &&
-         "Invalid usage");
-  WorkMap[&Root] = V;
-  SmallVector<Instruction *> Worklist;
-  SetVector<Instruction *> PostOrderWorklist;
-  SmallPtrSet<Instruction *, 32> Visited;
-
-  // Perform a postorder traversal of the users of Root.
-  Worklist.push_back(&Root);
-  while (!Worklist.empty()) {
-    Instruction *I = Worklist.back();
-
-    // If I has not been processed before, push each of its
-    // replacable users into the worklist.
-    if (Visited.insert(I).second) {
-      for (auto *U : I->users()) {
-        auto *UserInst = cast<Instruction>(U);
-        if (UsersToReplace.contains(UserInst))
-          Worklist.push_back(UserInst);
-      }
-      // Otherwise, users of I have already been pushed into
-      // the PostOrderWorklist. Push I as well.
-    } else {
-      PostOrderWorklist.insert(I);
-      Worklist.pop_back();
-    }
-  }
-
-  // Replace pointers in reverse-postorder.
-  for (Instruction *I : reverse(PostOrderWorklist))
-    replace(I);
-}
+Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
 
 void PointerReplacer::replace(Instruction *I) {
   if (getReplacement(I))
@@ -401,15 +364,13 @@ void PointerReplacer::replace(Instruction *I) {
     // replacement (new value).
     WorkMap[NewI] = NewI;
   } else if (auto *PHI = dyn_cast<PHINode>(I)) {
-    // Create a new PHI by replacing any incoming value that is a user of the
-    // root pointer and has a replacement.
-    Value *V = WorkMap.lookup(PHI->getIncomingValue(0));
-    PHI->mutateType(V ? V->getType() : PHI->getIncomingValue(0)->getType());
-    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I) {
-      Value *V = WorkMap.lookup(PHI->getIncomingValue(I));
-      PHI->setIncomingValue(I, V ? V : PHI->getIncomingValue(I));
-    }
-    WorkMap[PHI] = PHI;
+    Type *NewTy = getReplacement(PHI->getIncomingValue(0))->getType();
+    auto *NewPHI = PHINode::Create(NewTy, PHI->getNumIncomingValues(),
+                                   PHI->getName(), PHI->getIterator());
+    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I)
+      NewPHI->addIncoming(getReplacement(PHI->getIncomingValue(I)),
+                          PHI->getIncomingBlock(I));
+    WorkMap[PHI] = NewPHI;
   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
     auto *V = getReplacement(GEP->getPointerOperand());
     assert(V && "Operand not replaced");
@@ -473,6 +434,18 @@ void PointerReplacer::replace(Instruction *I) {
   }
 }
 
+void PointerReplacer::replacePointer(Value *V) {
+#ifndef NDEBUG
+  auto *PT = cast<PointerType>(Root.getType());
+  auto *NT = cast<PointerType>(V->getType());
+  assert(PT != NT && "Invalid usage");
+#endif
+  WorkMap[&Root] = V;
+
+  for (Instruction *Workitem : Worklist)
+    replace(Workitem);
+}
+
 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
   if (auto *I = simplifyAllocaArraySize(*this, AI, DT))
     return I;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
deleted file mode 100644
index 538cc19f9722..000000000000
--- a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
+++ /dev/null
@@ -1,79 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S < %s | FileCheck %s
-
-%struct.type = type { [256 x <2 x i64>] }
-@g1 = external hidden addrspace(3) global %struct.type, align 16
-
-; This test requires the PtrReplacer to replace users in an RPO traversal.
-; Furthermore, %ptr.else need not to be replaced so it must be retained in
-; %ptr.sink.
-define <2 x i64> @func(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
-; CHECK-LABEL: define <2 x i64> @func(
-; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br i1 [[CMP_0]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
-; CHECK:       [[IF_THEN]]:
-; CHECK-NEXT:    [[VAL_THEN:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
-; CHECK-NEXT:    br label %[[SINK:.*]]
-; CHECK:       [[IF_ELSE]]:
-; CHECK-NEXT:    [[PTR_ELSE:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-; CHECK-NEXT:    br label %[[SINK]]
-; CHECK:       [[SINK]]:
-; CHECK-NEXT:    [[PTR_SINK:%.*]] = phi ptr [ [[PTR_ELSE]], %[[IF_ELSE]] ], [ [[VAL_THEN]], %[[IF_THEN]] ]
-; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_SINK]], align 16
-; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
-;
-entry:
-  %coerce = alloca %struct.type, align 16, addrspace(5)
-  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
-  br i1 %cmp.0, label %if.then, label %if.else
-
-if.then:                                    ; preds = %entry
-  %ptr.then = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
-  %val.then = addrspacecast ptr addrspace(5) %ptr.then to ptr
-  br label %sink
-
-if.else:                                      ; preds = %entry
-  %ptr.else = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-  %val.else = getelementptr inbounds nuw i8, ptr %ptr.else, i64 0
-  br label %sink
-
-sink:
-  %ptr.sink = phi ptr [ %val.else, %if.else ], [ %val.then, %if.then ]
-  %val.sink = load <2 x i64>, ptr %ptr.sink, align 16
-  ret <2 x i64> %val.sink
-}
-
-define <2 x i64> @func_phi_loop(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
-; CHECK-LABEL: define <2 x i64> @func_phi_loop(
-; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[VAL_0:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[PTR_PHI_R:%.*]] = phi ptr [ [[PTR_1:%.*]], %[[LOOP]] ], [ [[VAL_0]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[PTR_1]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-; CHECK-NEXT:    br i1 [[CMP_0]], label %[[LOOP]], label %[[SINK:.*]]
-; CHECK:       [[SINK]]:
-; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_PHI_R]], align 16
-; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
-;
-entry:
-  %coerce = alloca %struct.type, align 16, addrspace(5)
-  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
-  %ptr.0 = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
-  %val.0 = addrspacecast ptr addrspace(5) %ptr.0 to ptr
-  br label %loop
-
-loop:
-  %ptr.phi = phi ptr [ %val.1, %loop ], [ %val.0, %entry ]
-  %ptr.1 = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-  %val.1 = getelementptr inbounds nuw i8, ptr %ptr.1, i64 0
-  br i1 %cmp.0, label %loop, label %sink
-
-sink:
-  %val.sink = load <2 x i64>, ptr %ptr.phi, align 16
-  ret <2 x i64> %val.sink
-}
-
-declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0

From 17e8465a3eb0cae48b9f62d27fd26f2b070f1f9b Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 20 Jun 2025 20:06:32 -0700
Subject: [PATCH 1133/1322] AArch64: Replace AArch64MCExpr with MCSpecifierExpr

Replace AArch64MCExpr, which encodes expressions with relocation
specifiers, with the new generic MCSpecifierExpr interface, aligning
with other targets by phasing out target-specific XXXMCExpr classes.

Temporarily convert AArch64MCExpr to a namespace to avoid renaming
`AArch64MCExpr::VK_` constants in this PR. A follow-up patch will rename
these to `AArch64::S_` to match the convention used by other targets.

Move helper functions to AArch64MCAsmInfo.h, with the goal of eventually
removing AArch64MCExpr.h.

Pull Request: https://github.com/llvm/llvm-project/pull/144632
---
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 47 ++++++++----------
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp |  4 +-
 .../lib/Target/AArch64/AArch64MCInstLower.cpp |  6 +--
 .../AArch64/AsmParser/AArch64AsmParser.cpp    | 12 ++---
 .../MCTargetDesc/AArch64AsmBackend.cpp        | 26 +++++-----
 .../MCTargetDesc/AArch64ELFObjectWriter.cpp   |  9 ++--
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.h   | 12 +++++
 .../MCTargetDesc/AArch64MCCodeEmitter.cpp     |  4 +-
 .../AArch64/MCTargetDesc/AArch64MCExpr.cpp    | 10 ----
 .../AArch64/MCTargetDesc/AArch64MCExpr.h      | 48 ++++++-------------
 .../AArch64WinCOFFObjectWriter.cpp            | 10 ++--
 11 files changed, 81 insertions(+), 107 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index eb1d9d8a1951..e6e0aeba3457 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -14,7 +14,7 @@
 #include "AArch64MCSymbolizer.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCAsmInfo.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "bolt/Core/BinaryBasicBlock.h"
@@ -179,13 +179,10 @@ public:
 
   bool equals(const MCSpecifierExpr &A, const MCSpecifierExpr &B,
               CompFuncTy Comp) const override {
-    const auto &AArch64ExprA = cast<AArch64MCExpr>(A);
-    const auto &AArch64ExprB = cast<AArch64MCExpr>(B);
-    if (AArch64ExprA.getKind() != AArch64ExprB.getKind())
+    if (A.getSpecifier() != B.getSpecifier())
       return false;
 
-    return MCPlusBuilder::equals(*AArch64ExprA.getSubExpr(),
-                                 *AArch64ExprB.getSubExpr(), Comp);
+    return MCPlusBuilder::equals(*A.getSubExpr(), *B.getSubExpr(), Comp);
   }
 
   bool shortenInstruction(MCInst &, const MCSubtargetInfo &) const override {
@@ -1084,7 +1081,7 @@ public:
 
     if (isADR(Inst) || RelType == ELF::R_AARCH64_ADR_PREL_LO21 ||
         RelType == ELF::R_AARCH64_TLSDESC_ADR_PREL21) {
-      return AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS, Ctx);
+      return MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_ABS, Ctx);
     } else if (isADRP(Inst) || RelType == ELF::R_AARCH64_ADR_PREL_PG_HI21 ||
                RelType == ELF::R_AARCH64_ADR_PREL_PG_HI21_NC ||
                RelType == ELF::R_AARCH64_TLSDESC_ADR_PAGE21 ||
@@ -1092,7 +1089,7 @@ public:
                RelType == ELF::R_AARCH64_ADR_GOT_PAGE) {
       // Never emit a GOT reloc, we handled this in
       // RewriteInstance::readRelocations().
-      return AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE, Ctx);
+      return MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE, Ctx);
     } else {
       switch (RelType) {
       case ELF::R_AARCH64_ADD_ABS_LO12_NC:
@@ -1106,18 +1103,18 @@ public:
       case ELF::R_AARCH64_TLSDESC_LD64_LO12:
       case ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
       case ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
-        return AArch64MCExpr::create(Expr, AArch64MCExpr::VK_LO12, Ctx);
+        return MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_LO12, Ctx);
       case ELF::R_AARCH64_MOVW_UABS_G3:
-        return AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_G3, Ctx);
+        return MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_ABS_G3, Ctx);
       case ELF::R_AARCH64_MOVW_UABS_G2:
       case ELF::R_AARCH64_MOVW_UABS_G2_NC:
-        return AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_G2_NC, Ctx);
+        return MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_ABS_G2_NC, Ctx);
       case ELF::R_AARCH64_MOVW_UABS_G1:
       case ELF::R_AARCH64_MOVW_UABS_G1_NC:
-        return AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_G1_NC, Ctx);
+        return MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_ABS_G1_NC, Ctx);
       case ELF::R_AARCH64_MOVW_UABS_G0:
       case ELF::R_AARCH64_MOVW_UABS_G0_NC:
-        return AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_G0_NC, Ctx);
+        return MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_ABS_G0_NC, Ctx);
       default:
         break;
       }
@@ -1142,7 +1139,7 @@ public:
   }
 
   const MCSymbol *getTargetSymbol(const MCExpr *Expr) const override {
-    auto *AArchExpr = dyn_cast<AArch64MCExpr>(Expr);
+    auto *AArchExpr = dyn_cast<MCSpecifierExpr>(Expr);
     if (AArchExpr && AArchExpr->getSubExpr())
       return getTargetSymbol(AArchExpr->getSubExpr());
 
@@ -1162,7 +1159,7 @@ public:
   }
 
   int64_t getTargetAddend(const MCExpr *Expr) const override {
-    auto *AArchExpr = dyn_cast<AArch64MCExpr>(Expr);
+    auto *AArchExpr = dyn_cast<MCSpecifierExpr>(Expr);
     if (AArchExpr && AArchExpr->getSubExpr())
       return getTargetAddend(AArchExpr->getSubExpr());
 
@@ -2030,9 +2027,8 @@ public:
     MCInst Inst;
     Inst.setOpcode(AArch64::MOVZXi);
     Inst.addOperand(MCOperand::createReg(AArch64::X16));
-    Inst.addOperand(MCOperand::createExpr(AArch64MCExpr::create(
-        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
-        AArch64MCExpr::VK_ABS_G3, *Ctx)));
+    Inst.addOperand(MCOperand::createExpr(
+        MCSpecifierExpr::create(Target, AArch64MCExpr::VK_ABS_G3, *Ctx)));
     Inst.addOperand(MCOperand::createImm(0x30));
     Seq.emplace_back(Inst);
 
@@ -2040,9 +2036,8 @@ public:
     Inst.setOpcode(AArch64::MOVKXi);
     Inst.addOperand(MCOperand::createReg(AArch64::X16));
     Inst.addOperand(MCOperand::createReg(AArch64::X16));
-    Inst.addOperand(MCOperand::createExpr(AArch64MCExpr::create(
-        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
-        AArch64MCExpr::VK_ABS_G2_NC, *Ctx)));
+    Inst.addOperand(MCOperand::createExpr(
+        MCSpecifierExpr::create(Target, AArch64MCExpr::VK_ABS_G2_NC, *Ctx)));
     Inst.addOperand(MCOperand::createImm(0x20));
     Seq.emplace_back(Inst);
 
@@ -2050,9 +2045,8 @@ public:
     Inst.setOpcode(AArch64::MOVKXi);
     Inst.addOperand(MCOperand::createReg(AArch64::X16));
     Inst.addOperand(MCOperand::createReg(AArch64::X16));
-    Inst.addOperand(MCOperand::createExpr(AArch64MCExpr::create(
-        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
-        AArch64MCExpr::VK_ABS_G1_NC, *Ctx)));
+    Inst.addOperand(MCOperand::createExpr(
+        MCSpecifierExpr::create(Target, AArch64MCExpr::VK_ABS_G1_NC, *Ctx)));
     Inst.addOperand(MCOperand::createImm(0x10));
     Seq.emplace_back(Inst);
 
@@ -2060,9 +2054,8 @@ public:
     Inst.setOpcode(AArch64::MOVKXi);
     Inst.addOperand(MCOperand::createReg(AArch64::X16));
     Inst.addOperand(MCOperand::createReg(AArch64::X16));
-    Inst.addOperand(MCOperand::createExpr(AArch64MCExpr::create(
-        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
-        AArch64MCExpr::VK_ABS_G0_NC, *Ctx)));
+    Inst.addOperand(MCOperand::createExpr(
+        MCSpecifierExpr::create(Target, AArch64MCExpr::VK_ABS_G0_NC, *Ctx)));
     Inst.addOperand(MCOperand::createImm(0));
     Seq.emplace_back(Inst);
 
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 4099f40ea07f..a16c104d8bef 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -910,13 +910,13 @@ void AArch64AsmPrinter::emitHwasanMemaccessSymbols(Module &M) {
       // have a chance to save them.
       EmitToStreamer(MCInstBuilder(AArch64::ADRP)
                          .addReg(AArch64::X16)
-                         .addExpr(AArch64MCExpr::create(
+                         .addExpr(MCSpecifierExpr::create(
                              HwasanTagMismatchRef, AArch64MCExpr::VK_GOT_PAGE,
                              OutContext)));
       EmitToStreamer(MCInstBuilder(AArch64::LDRXui)
                          .addReg(AArch64::X16)
                          .addReg(AArch64::X16)
-                         .addExpr(AArch64MCExpr::create(
+                         .addExpr(MCSpecifierExpr::create(
                              HwasanTagMismatchRef, AArch64MCExpr::VK_GOT_LO12,
                              OutContext)));
       EmitToStreamer(MCInstBuilder(AArch64::BR).addReg(AArch64::X16));
diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index eb699c75cf10..fd3ce6c72e50 100644
--- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -171,7 +171,7 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandMachO(const MachineOperand &MO,
              AArch64II::MO_PAGEOFF)
       Spec = AArch64MCExpr::M_PAGEOFF;
   }
-  // TODO: Migrate to AArch64MCExpr::create like ELF.
+  // TODO: Migrate to MCSpecifierExpr::create like ELF.
   const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Spec, Ctx);
   if (!MO.isJTI() && MO.getOffset())
     Expr = MCBinaryExpr::createAdd(
@@ -265,7 +265,7 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
 
   AArch64MCExpr::Specifier RefKind;
   RefKind = static_cast<AArch64MCExpr::Specifier>(RefFlags);
-  Expr = AArch64MCExpr::create(Expr, RefKind, Ctx);
+  Expr = MCSpecifierExpr::create(Expr, RefKind, Ctx);
 
   return MCOperand::createExpr(Expr);
 }
@@ -320,7 +320,7 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO,
   auto RefKind = static_cast<AArch64MCExpr::Specifier>(RefFlags);
   assert(RefKind != AArch64MCExpr::VK_INVALID &&
          "Invalid relocation requested");
-  Expr = AArch64MCExpr::create(Expr, RefKind, Ctx);
+  Expr = MCSpecifierExpr::create(Expr, RefKind, Ctx);
 
   return MCOperand::createExpr(Expr);
 }
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index d8bdc01a3454..faa82abbd898 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -3307,8 +3307,8 @@ ParseStatus AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
         ELFSpec == AArch64MCExpr::VK_INVALID) {
       // No modifier was specified at all; this is the syntax for an ELF basic
       // ADRP relocation (unfortunately).
-      Expr =
-          AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
+      Expr = MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE,
+                                     getContext());
     } else if ((DarwinSpec == AArch64MCExpr::M_GOTPAGE ||
                 DarwinSpec == AArch64MCExpr::M_TLVPPAGE) &&
                Addend != 0) {
@@ -3361,7 +3361,7 @@ ParseStatus AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
         ELFSpec == AArch64MCExpr::VK_INVALID) {
       // No modifier was specified at all; this is the syntax for an ELF basic
       // ADR relocation (unfortunately).
-      Expr = AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS, getContext());
+      Expr = MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_ABS, getContext());
     } else if (ELFSpec != AArch64MCExpr::VK_GOT_AUTH_PAGE) {
       // For tiny code model, we use :got_auth: operator to fill 21-bit imm of
       // adr. It's not actually GOT entry page address but the GOT address
@@ -4478,7 +4478,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
     return true;
 
   if (HasELFModifier)
-    ImmVal = AArch64MCExpr::create(ImmVal, RefKind, getContext());
+    ImmVal = MCSpecifierExpr::create(ImmVal, RefKind, getContext());
 
   SMLoc EndLoc;
   if (getContext().getAsmInfo()->hasSubsectionsViaSymbols()) {
@@ -7360,7 +7360,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
 
   MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
   const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
-  Expr = AArch64MCExpr::create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
+  Expr = MCSpecifierExpr::create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
 
   MCInst Inst;
   Inst.setOpcode(AArch64::TLSDESCCALL);
@@ -8288,7 +8288,7 @@ bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
   DarwinSpec = AArch64MCExpr::None;
   Addend = 0;
 
-  if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) {
+  if (auto *AE = dyn_cast<MCSpecifierExpr>(Expr)) {
     ELFSpec = AE->getSpecifier();
     Expr = AE->getSubExpr();
   }
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 0d29316d843e..88ba2ef3fe1f 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCAsmInfo.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/BinaryFormat/MachO.h"
@@ -221,8 +221,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
   case AArch64::fixup_aarch64_movw: {
     AArch64MCExpr::Specifier RefKind =
         static_cast<AArch64MCExpr::Specifier>(Target.getSpecifier());
-    if (AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_ABS &&
-        AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_SABS) {
+    if (AArch64::getSymbolLoc(RefKind) != AArch64MCExpr::VK_ABS &&
+        AArch64::getSymbolLoc(RefKind) != AArch64MCExpr::VK_SABS) {
       if (!RefKind) {
         // The fixup is an expression
         if (SignedValue > 0xFFFF || SignedValue < -0xFFFF)
@@ -250,8 +250,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
       return Value;
     }
 
-    if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) {
-      switch (AArch64MCExpr::getAddressFrag(RefKind)) {
+    if (AArch64::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) {
+      switch (AArch64::getAddressFrag(RefKind)) {
       case AArch64MCExpr::VK_G0:
         break;
       case AArch64MCExpr::VK_G1:
@@ -268,7 +268,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
       }
 
     } else {
-      switch (AArch64MCExpr::getAddressFrag(RefKind)) {
+      switch (AArch64::getAddressFrag(RefKind)) {
       case AArch64MCExpr::VK_G0:
         break;
       case AArch64MCExpr::VK_G1:
@@ -287,8 +287,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
 
     if (RefKind & AArch64MCExpr::VK_NC) {
       Value &= 0xFFFF;
-    }
-    else if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) {
+    } else if (AArch64::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) {
       if (SignedValue > 0xFFFF || SignedValue < -0xFFFF)
         Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
 
@@ -296,8 +295,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
       if (SignedValue < 0)
         SignedValue = ~SignedValue;
       Value = static_cast<uint64_t>(SignedValue);
-    }
-    else if (Value > 0xFFFF) {
+    } else if (Value > 0xFFFF) {
       Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     }
     return Value;
@@ -424,9 +422,9 @@ void AArch64AsmBackend::applyFixup(const MCFragment &, const MCFixup &Fixup,
 
   if (Fixup.getTargetKind() == FK_Data_8 && TheTriple.isOSBinFormatELF()) {
     auto RefKind = static_cast<AArch64MCExpr::Specifier>(Target.getSpecifier());
-    AArch64MCExpr::Specifier SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
-    if (SymLoc == AArch64AuthMCExpr::VK_AUTH ||
-        SymLoc == AArch64AuthMCExpr::VK_AUTHADDR) {
+    AArch64MCExpr::Specifier SymLoc = AArch64::getSymbolLoc(RefKind);
+    if (SymLoc == AArch64MCExpr::VK_AUTH ||
+        SymLoc == AArch64MCExpr::VK_AUTHADDR) {
       const auto *Expr = dyn_cast<AArch64AuthMCExpr>(Fixup.getValue());
       if (!Expr) {
         getContext().reportError(Fixup.getValue()->getLoc(),
@@ -479,7 +477,7 @@ void AArch64AsmBackend::applyFixup(const MCFragment &, const MCFixup &Fixup,
   // handle this more cleanly. This may affect the output of -show-mc-encoding.
   AArch64MCExpr::Specifier RefKind =
       static_cast<AArch64MCExpr::Specifier>(Target.getSpecifier());
-  if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS ||
+  if (AArch64::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS ||
       (!RefKind && Fixup.getTargetKind() == AArch64::fixup_aarch64_movw)) {
     // If the immediate is negative, generate MOVN else MOVZ.
     // (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index df2e13a35dcb..28aa4644a55f 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCAsmInfo.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCContext.h"
@@ -88,8 +88,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup,
   unsigned Kind = Fixup.getTargetKind();
   AArch64MCExpr::Specifier RefKind =
       static_cast<AArch64MCExpr::Specifier>(Target.getSpecifier());
-  AArch64MCExpr::Specifier SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
-  bool IsNC = AArch64MCExpr::isNotChecked(RefKind);
+  AArch64MCExpr::Specifier SymLoc = AArch64::getSymbolLoc(RefKind);
+  bool IsNC = AArch64::isNotChecked(RefKind);
 
   switch (SymLoc) {
   case AArch64MCExpr::VK_DTPREL:
@@ -356,8 +356,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup,
       if ((SymLoc == AArch64MCExpr::VK_GOT ||
            SymLoc == AArch64MCExpr::VK_GOT_AUTH) &&
           IsNC) {
-        AArch64MCExpr::Specifier AddressLoc =
-            AArch64MCExpr::getAddressFrag(RefKind);
+        AArch64MCExpr::Specifier AddressLoc = AArch64::getAddressFrag(RefKind);
         bool IsAuth = (SymLoc == AArch64MCExpr::VK_GOT_AUTH);
         if (!IsILP32) {
           if (AddressLoc == AArch64MCExpr::VK_LO15)
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index bc02586d7388..58e87f8a208f 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -58,9 +58,21 @@ struct AArch64MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
 };
 
 namespace AArch64 {
+using Specifier = uint16_t;
+
 /// Return the string representation of the ELF relocation specifier
 /// (e.g. ":got:", ":lo12:").
 StringRef getSpecifierName(const MCSpecifierExpr &Expr);
+
+inline Specifier getSymbolLoc(Specifier S) {
+  return static_cast<Specifier>(S & AArch64MCExpr::VK_SymLocBits);
+}
+
+inline Specifier getAddressFrag(Specifier S) {
+  return static_cast<Specifier>(S & AArch64MCExpr::VK_AddressFragBits);
+}
+
+inline bool isNotChecked(Specifier S) { return S & AArch64MCExpr::VK_NC; }
 } // namespace AArch64
 
 } // namespace llvm
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 4dc30b48c902..6db0d7de45e5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -308,7 +308,7 @@ AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
 
   // Set the shift bit of the add instruction for relocation types
   // R_AARCH64_TLSLE_ADD_TPREL_HI12 and R_AARCH64_TLSLD_ADD_DTPREL_HI12.
-  if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+  if (auto *A64E = dyn_cast<MCSpecifierExpr>(Expr)) {
     AArch64MCExpr::Specifier RefKind = A64E->getSpecifier();
     if (RefKind == AArch64MCExpr::VK_TPREL_HI12 ||
         RefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
@@ -718,7 +718,7 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
     return EncodedValue;
 
   const MCExpr *E = UImm16MO.getExpr();
-  if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(E)) {
+  if (auto *A64E = dyn_cast<MCSpecifierExpr>(E)) {
     switch (A64E->getSpecifier()) {
     case AArch64MCExpr::VK_DTPREL_G2:
     case AArch64MCExpr::VK_DTPREL_G1:
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 7a7c6f7effd9..a3f58ca4ee14 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -5,11 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the relocation specifiers
-// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
-//
-//===----------------------------------------------------------------------===//
 
 #include "AArch64MCExpr.h"
 #include "AArch64MCAsmInfo.h"
@@ -20,11 +15,6 @@
 
 using namespace llvm;
 
-const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, Specifier S,
-                                           MCContext &Ctx) {
-  return new (Ctx) AArch64MCExpr(Expr, S);
-}
-
 const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
                                                    uint16_t Discriminator,
                                                    AArch64PACKey::ID Key,
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 541f24c943a1..8ffd14e338e1 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -21,11 +21,10 @@
 
 namespace llvm {
 
-class AArch64MCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint16_t;
-  enum {
-    // clang-format off
+namespace AArch64MCExpr {
+using Specifier = uint16_t;
+enum {
+  // clang-format off
     None          = 0,
     // Symbol locations specifying (roughly speaking) what calculation should be
     // performed to construct the final address for the relocated
@@ -137,35 +136,18 @@ public:
     M_TLVPPAGEOFF,
 
     VK_INVALID  = 0xfff
-    // clang-format on
-  };
-
-protected:
-  explicit AArch64MCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const AArch64MCExpr *create(const MCExpr *Expr, Specifier,
-                                     MCContext &Ctx);
-
-  static Specifier getSymbolLoc(Specifier S) {
-    return static_cast<Specifier>(S & VK_SymLocBits);
-  }
-
-  static Specifier getAddressFrag(Specifier S) {
-    return static_cast<Specifier>(S & VK_AddressFragBits);
-  }
-
-  static bool isNotChecked(Specifier S) { return S & VK_NC; }
+  // clang-format on
 };
+} // namespace AArch64MCExpr
 
-class AArch64AuthMCExpr final : public AArch64MCExpr {
+class AArch64AuthMCExpr final : public MCSpecifierExpr {
   uint16_t Discriminator;
   AArch64PACKey::ID Key;
 
   explicit AArch64AuthMCExpr(const MCExpr *Expr, uint16_t Discriminator,
                              AArch64PACKey::ID Key, bool HasAddressDiversity)
-      : AArch64MCExpr(Expr, HasAddressDiversity ? VK_AUTHADDR : VK_AUTH),
+      : MCSpecifierExpr(Expr, HasAddressDiversity ? AArch64MCExpr::VK_AUTHADDR
+                                                  : AArch64MCExpr::VK_AUTH),
         Discriminator(Discriminator), Key(Key) {}
 
 public:
@@ -175,16 +157,16 @@ public:
 
   AArch64PACKey::ID getKey() const { return Key; }
   uint16_t getDiscriminator() const { return Discriminator; }
-  bool hasAddressDiversity() const { return getSpecifier() == VK_AUTHADDR; }
+  bool hasAddressDiversity() const {
+    return getSpecifier() == AArch64MCExpr::VK_AUTHADDR;
+  }
 
   void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
 
   static bool classof(const MCExpr *E) {
-    return isa<AArch64MCExpr>(E) && classof(cast<AArch64MCExpr>(E));
-  }
-
-  static bool classof(const AArch64MCExpr *E) {
-    return E->getSpecifier() == VK_AUTH || E->getSpecifier() == VK_AUTHADDR;
+    auto *SE = dyn_cast<MCSpecifierExpr>(E);
+    return SE && (SE->getSpecifier() == AArch64MCExpr::VK_AUTH ||
+                  SE->getSpecifier() == AArch64MCExpr::VK_AUTHADDR);
   }
 };
 } // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 2e997631655e..70df95029c33 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -64,9 +64,9 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
   auto Spec = Target.getSpecifier();
   const MCExpr *Expr = Fixup.getValue();
 
-  if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+  if (auto *A64E = dyn_cast<MCSpecifierExpr>(Expr)) {
     AArch64MCExpr::Specifier Spec = A64E->getSpecifier();
-    switch (AArch64MCExpr::getSymbolLoc(Spec)) {
+    switch (AArch64::getSymbolLoc(Spec)) {
     case AArch64MCExpr::VK_ABS:
     case AArch64MCExpr::VK_SECREL:
       // Supported
@@ -81,7 +81,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
 
   switch (FixupKind) {
   default: {
-    if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+    if (auto *A64E = dyn_cast<MCSpecifierExpr>(Expr)) {
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
                                           AArch64::getSpecifierName(*A64E) +
                                           " unsupported on COFF targets");
@@ -116,7 +116,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
     return COFF::IMAGE_REL_ARM64_SECREL;
 
   case AArch64::fixup_aarch64_add_imm12:
-    if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+    if (auto *A64E = dyn_cast<MCSpecifierExpr>(Expr)) {
       AArch64MCExpr::Specifier Spec = A64E->getSpecifier();
       if (Spec == AArch64MCExpr::VK_SECREL_LO12)
         return COFF::IMAGE_REL_ARM64_SECREL_LOW12A;
@@ -130,7 +130,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
   case AArch64::fixup_aarch64_ldst_imm12_scale4:
   case AArch64::fixup_aarch64_ldst_imm12_scale8:
   case AArch64::fixup_aarch64_ldst_imm12_scale16:
-    if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+    if (auto *A64E = dyn_cast<MCSpecifierExpr>(Expr)) {
       AArch64MCExpr::Specifier Spec = A64E->getSpecifier();
       if (Spec == AArch64MCExpr::VK_SECREL_LO12)
         return COFF::IMAGE_REL_ARM64_SECREL_LOW12L;

From f4661310550d33dcb6942427ed32a3cefc0efaf2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 21 Jun 2025 12:18:25 +0900
Subject: [PATCH 1134/1322] AMDGPU: Use reportFatalUsageError in
 AMDGPULowerModuleLDS (#145130)

---
 llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp      | 9 +++++----
 llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll | 6 +++---
 .../AMDGPU/lower-module-lds-single-var-ambiguous.ll      | 2 +-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index e0444da2c461..d443f4ea7d5c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -572,7 +572,7 @@ public:
 
       if (OrderedKernels.size() > UINT32_MAX) {
         // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
-        report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
+        reportFatalUsageError("unimplemented LDS lowering for > 2**32 kernels");
       }
 
       for (size_t i = 0; i < OrderedKernels.size(); i++) {
@@ -632,7 +632,8 @@ public:
         if (K.second.size() == 1) {
           KernelAccessVariables.insert(GV);
         } else {
-          report_fatal_error(
+          // FIXME: This should use DiagnosticInfo
+          reportFatalUsageError(
               "cannot lower LDS '" + GV->getName() +
               "' to kernel access as it is reachable from multiple kernels");
         }
@@ -781,7 +782,7 @@ public:
       // backend) difficult to use. This does mean that llvm test cases need
       // to name the kernels.
       if (!Func.hasName()) {
-        report_fatal_error("Anonymous kernels cannot use LDS variables");
+        reportFatalUsageError("anonymous kernels cannot use LDS variables");
       }
 
       std::string VarName =
@@ -877,7 +878,7 @@ public:
         if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
           assert(isKernelLDS(func));
           if (!func->hasName()) {
-            report_fatal_error("Anonymous kernels cannot use LDS variables");
+            reportFatalUsageError("anonymous kernels cannot use LDS variables");
           }
 
           GlobalVariable *N =
diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll
index 9648cb12186b..2a9d18add0cd 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll
@@ -1,9 +1,9 @@
-; RUN: not --crash opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
-; RUN: not --crash opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: not opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: not opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
 
 @var1 = addrspace(3) global i32 poison, align 8
 
-; CHECK: LLVM ERROR: Anonymous kernels cannot use LDS variables
+; CHECK: LLVM ERROR: anonymous kernels cannot use LDS variables
 define amdgpu_kernel void @0() {
   %val0 = load i32, ptr addrspace(3) @var1
   %val1 = add i32 %val0, 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll
index bd95cdddd461..3aeaa1ddbef2 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -check-prefixes=CHECK,M_OR_HY %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefixes=CHECK,TABLE %s
-; RUN: not --crash opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=kernel 2>&1 | FileCheck -check-prefixes=KERNEL %s
+; RUN: not opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=kernel 2>&1 | FileCheck -check-prefixes=KERNEL %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefixes=CHECK,M_OR_HY %s
 
 ;; Two kernels access the same variable, specialisation gives them each their own copy of it

From fa117715ca3645603859c8474ea3312639f3ff66 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0@owo.li>
Date: Sat, 21 Jun 2025 11:32:28 +0800
Subject: [PATCH 1135/1322] [RISCV] Implement Feature Bit for Q (#145001)

---
 compiler-rt/lib/builtins/cpu_model/riscv.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/compiler-rt/lib/builtins/cpu_model/riscv.c b/compiler-rt/lib/builtins/cpu_model/riscv.c
index 16d55fcfffe7..c02f6e9961ca 100644
--- a/compiler-rt/lib/builtins/cpu_model/riscv.c
+++ b/compiler-rt/lib/builtins/cpu_model/riscv.c
@@ -40,6 +40,8 @@ struct {
 #define I_BITMASK (1ULL << 8)
 #define M_GROUPID 0
 #define M_BITMASK (1ULL << 12)
+#define Q_GROUPID 0
+#define Q_BITMASK (1ULL << 16)
 #define V_GROUPID 0
 #define V_BITMASK (1ULL << 21)
 #define ZACAS_GROUPID 0

From 70312802184f4000b286e8830d9e1342505939ee Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 21 Jun 2025 13:01:18 +0900
Subject: [PATCH 1136/1322] AMDGPU: Use reportFatalUsageError for unsupported
 code object version (#145133)

---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp            |  2 +-
 .../CodeGen/AMDGPU/unsupported-code-object-version.ll  | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 84b0f9855409..9513b7b2aef2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -350,7 +350,7 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
       HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
       break;
     default:
-      report_fatal_error("Unexpected code object version");
+      reportFatalUsageError("unsupported code object version");
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-code-object-version.ll b/llvm/test/CodeGen/AMDGPU/unsupported-code-object-version.ll
index da7bc3a85d73..f38925418366 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-code-object-version.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-code-object-version.ll
@@ -1,8 +1,10 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/0/g' %s | not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 2>&1 | FileCheck --check-prefix=HSA-ERROR %s
-; RUN: sed 's/CODE_OBJECT_VERSION/100/g' %s | not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 2>&1 | FileCheck --check-prefix=HSA-ERROR %s
-; RUN: sed 's/CODE_OBJECT_VERSION/9900/g' %s | not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 2>&1 | FileCheck --check-prefix=HSA-ERROR %s
+; RUN: sed 's/CODE_OBJECT_VERSION/0/g' %s | not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 2>&1 | FileCheck --check-prefix=HSA-ERROR %s
+; RUN: sed 's/CODE_OBJECT_VERSION/100/g' %s | not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 2>&1 | FileCheck --check-prefix=HSA-ERROR %s
+; RUN: sed 's/CODE_OBJECT_VERSION/9900/g' %s | not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 2>&1 | FileCheck --check-prefix=HSA-ERROR %s
+; RUN: sed 's/CODE_OBJECT_VERSION/0/g' %s | not llc -filetype=obj -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 2>&1 | FileCheck --check-prefix=HSA-ERROR %s
+; RUN: sed 's/CODE_OBJECT_VERSION/0/g' %s | not llc -filetype=asm -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 2>&1 | FileCheck --check-prefix=HSA-ERROR %s
 
-; HSA-ERROR: Unexpected code object version
+; HSA-ERROR: unsupported code object version
 
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION}

From 4c2b9317f8fc08e2f6ef369a5018ee5b5626cf01 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Sat, 21 Jun 2025 06:43:25 +0200
Subject: [PATCH 1137/1322] [mlir][ods] Document InferTypeOpInterface behavior.
 (#145060)

Confused folks again recently and I couldn't find where we documented
it, figured this is more discoverable.
---
 .../mlir/Interfaces/InferTypeOpInterface.td   | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Interfaces/InferTypeOpInterface.td b/mlir/include/mlir/Interfaces/InferTypeOpInterface.td
index 92d4a99ecb7f..49925fe7701c 100644
--- a/mlir/include/mlir/Interfaces/InferTypeOpInterface.td
+++ b/mlir/include/mlir/Interfaces/InferTypeOpInterface.td
@@ -9,6 +9,11 @@
 // This file contains a set of interfaces that can be used to define information
 // related to type inference.
 //
+// This interface is also used by ODS to create builders for operations that
+// do not require result type to be specified. Including this interface in
+// dialect op definitions is sufficient to result in such builders being
+// automatically generated for trivially buildable result types.
+//
 //===----------------------------------------------------------------------===//
 
 #ifndef MLIR_INFERTYPEOPINTERFACE
@@ -37,6 +42,10 @@ def InferTypeOpInterface : OpInterface<"InferTypeOpInterface"> {
       and the regions of the op. Be aware that this method is supposed to be
       called with valid arguments, e.g., operands are verified, or it may result
       in an undefined behavior.
+
+      The inferred result types may be less precise than what may be specified
+      directly or produced by refinement, but are required to be compatible
+      (as defined by the op's compatibility function).
       }],
       /*retTy=*/"::llvm::LogicalResult",
       /*methodName=*/"inferReturnTypes",
@@ -67,10 +76,10 @@ def InferTypeOpInterface : OpInterface<"InferTypeOpInterface"> {
       The return types may be elided or specific elements be null for elements
       that should just be returned but not verified.
 
-      Because this method can be called from within different stages of IR
-      verification, implementations should not assume the arguments to
-      represent fully valid IR and are responsible for checking inputs for
-      validity to the degree necessary to perform the return type inference.
+      This method may be called from within different stages of IR verification,
+      implementations should not assume the arguments to represent fully valid
+      IR and are responsible for checking inputs for validity to the degree
+      necessary to perform the return type inference.
       }],
       /*retTy=*/"::llvm::LogicalResult",
       /*methodName=*/"refineReturnTypes",
@@ -100,8 +109,7 @@ def InferTypeOpInterface : OpInterface<"InferTypeOpInterface"> {
       }]
     >,
     StaticInterfaceMethod<
-      /*desc=*/"Returns whether two array of types are compatible result types"
-               " for an op.",
+      /*desc=*/"Returns whether two type ranges are compatible result types.",
       /*retTy=*/"bool",
       /*methodName=*/"isCompatibleReturnTypes",
       /*args=*/(ins "::mlir::TypeRange":$lhs, "::mlir::TypeRange":$rhs),

From 981f8e1380b63b5fc08ca71dc05615b439cb1bfe Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 20 Jun 2025 21:55:14 -0700
Subject: [PATCH 1138/1322] [TableGen] Remove redundant control flow statements
 (NFC) (#145143)

---
 llvm/utils/TableGen/Basic/TargetFeaturesEmitter.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.cpp b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.cpp
index f016cc43c0e3..6b723bc0fb02 100644
--- a/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.cpp
@@ -118,8 +118,6 @@ void TargetFeaturesEmitter::printFeatureKeyValues(
 
   // End feature table.
   OS << "};\n";
-
-  return;
 }
 
 void TargetFeaturesEmitter::printCPUKeyValues(raw_ostream &OS,
@@ -146,8 +144,6 @@ void TargetFeaturesEmitter::printCPUKeyValues(raw_ostream &OS,
 
   // End processor table.
   OS << "};\n";
-
-  return;
 }
 
 void TargetFeaturesEmitter::run(raw_ostream &OS) {

From 0fa0c3c2333c65035d5f4d54719d803596329d30 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 21 Jun 2025 14:24:30 +0900
Subject: [PATCH 1139/1322] AMDGPU: Use reportFatalUsageError in
 AMDGPULowerBufferFatPointers (#145132)

---
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   | 47 ++++++++++---------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 67db961e60fa..fa8af68817df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -624,7 +624,7 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst(
   if (MMI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
       MMI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
     return false;
-  report_fatal_error(
+  reportFatalUsageError(
       "memmove() on buffer descriptors is not implemented because pointer "
       "comparison on buffer descriptors isn't implemented\n");
 }
@@ -738,10 +738,10 @@ Type *LegalizeBufferContentTypesVisitor::scalarArrayTypeAsVector(Type *T) {
     return T;
   Type *ET = AT->getElementType();
   if (!ET->isSingleValueType() || isa<VectorType>(ET))
-    report_fatal_error("loading non-scalar arrays from buffer fat pointers "
-                       "should have recursed");
+    reportFatalUsageError("loading non-scalar arrays from buffer fat pointers "
+                          "should have recursed");
   if (!DL.typeSizeEqualsStoreSize(AT))
-    report_fatal_error(
+    reportFatalUsageError(
         "loading padded arrays from buffer fat pinters should have recursed");
   return FixedVectorType::get(ET, AT->getNumElements());
 }
@@ -1259,12 +1259,13 @@ Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) {
   }
 
   if (isa<GlobalValue>(C))
-    report_fatal_error("Global values containing ptr addrspace(7) (buffer "
-                       "fat pointer) values are not supported");
+    reportFatalUsageError("global values containing ptr addrspace(7) (buffer "
+                          "fat pointer) values are not supported");
 
   if (isa<ConstantExpr>(C))
-    report_fatal_error("Constant exprs containing ptr addrspace(7) (buffer "
-                       "fat pointer) values should have been expanded earlier");
+    reportFatalUsageError(
+        "constant exprs containing ptr addrspace(7) (buffer "
+        "fat pointer) values should have been expanded earlier");
 
   return nullptr;
 }
@@ -1744,28 +1745,32 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
       IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
       break;
     case AtomicRMWInst::FSub: {
-      report_fatal_error("atomic floating point subtraction not supported for "
-                         "buffer resources and should've been expanded away");
+      reportFatalUsageError(
+          "atomic floating point subtraction not supported for "
+          "buffer resources and should've been expanded away");
       break;
     }
     case AtomicRMWInst::FMaximum: {
-      report_fatal_error("atomic floating point fmaximum not supported for "
-                         "buffer resources and should've been expanded away");
+      reportFatalUsageError(
+          "atomic floating point fmaximum not supported for "
+          "buffer resources and should've been expanded away");
       break;
     }
     case AtomicRMWInst::FMinimum: {
-      report_fatal_error("atomic floating point fminimum not supported for "
-                         "buffer resources and should've been expanded away");
+      reportFatalUsageError(
+          "atomic floating point fminimum not supported for "
+          "buffer resources and should've been expanded away");
       break;
     }
     case AtomicRMWInst::Nand:
-      report_fatal_error("atomic nand not supported for buffer resources and "
-                         "should've been expanded away");
+      reportFatalUsageError(
+          "atomic nand not supported for buffer resources and "
+          "should've been expanded away");
       break;
     case AtomicRMWInst::UIncWrap:
     case AtomicRMWInst::UDecWrap:
-      report_fatal_error("wrapping increment/decrement not supported for "
-                         "buffer resources and should've ben expanded away");
+      reportFatalUsageError("wrapping increment/decrement not supported for "
+                            "buffer resources and should've ben expanded away");
       break;
     case AtomicRMWInst::BAD_BINOP:
       llvm_unreachable("Not sure how we got a bad binop");
@@ -2019,7 +2024,7 @@ PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
   }
 
   if (I.getSrcAddressSpace() != AMDGPUAS::BUFFER_RESOURCE)
-    report_fatal_error(
+    reportFatalUsageError(
         "only buffer resources (addrspace 8) and null/poison pointers can be "
         "cast to buffer fat pointers (addrspace 7)");
   SplitUsers.insert(&I);
@@ -2225,8 +2230,8 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
     IRB.SetInsertPoint(&I);
     auto [Rsrc, Off] = getPtrParts(Ptr);
     if (Mask->getType() != Off->getType())
-      report_fatal_error("offset width is not equal to index width of fat "
-                         "pointer (data layout not set up correctly?)");
+      reportFatalUsageError("offset width is not equal to index width of fat "
+                            "pointer (data layout not set up correctly?)");
     Value *OffRes = IRB.CreateAnd(Off, Mask, I.getName() + ".off");
     copyMetadata(OffRes, &I);
     SplitUsers.insert(&I);

From 9cc9efc483339ece1d52923569bb755db42b69f3 Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen@intel.com>
Date: Sat, 21 Jun 2025 13:44:10 +0800
Subject: [PATCH 1140/1322] [lld][COFF] Remove duplicate strtab entries
 (#141197)

String table size is too big for large binary when symbol table is
enabled. Some strings in strtab is same so it can be reused.

This patch revives 9ffeaaa authored by mstorsjo with the prioritized
string table builder to fix debug section name issue (see 4d2eda2
for more details).

---------

Co-authored-by: Wen Haohai <whh108@live.com>
Co-authored-by: James Henderson <James.Henderson@sony.com>
---
 lld/COFF/Writer.cpp                       | 49 ++++++++++++++---------
 lld/test/COFF/strtab.s                    | 29 ++++++++++++++
 llvm/include/llvm/MC/StringTableBuilder.h | 18 ++++++---
 llvm/lib/MC/StringTableBuilder.cpp        | 27 +++++++++++--
 4 files changed, 97 insertions(+), 26 deletions(-)
 create mode 100644 lld/test/COFF/strtab.s

diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 5f1da5e79dac..076561807af4 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/Parallel.h"
@@ -201,7 +202,8 @@ struct ChunkRange {
 class Writer {
 public:
   Writer(COFFLinkerContext &c)
-      : buffer(c.e.outputBuffer), delayIdata(c), ctx(c) {}
+      : buffer(c.e.outputBuffer), strtab(StringTableBuilder::WinCOFF),
+        delayIdata(c), ctx(c) {}
   void run();
 
 private:
@@ -281,7 +283,7 @@ private:
 
   std::unique_ptr<FileOutputBuffer> &buffer;
   std::map<PartialSectionKey, PartialSection *> partialSections;
-  std::vector<char> strtab;
+  StringTableBuilder strtab;
   std::vector<llvm::object::coff_symbol16> outputSymtab;
   std::vector<ECCodeMapEntry> codeMap;
   IdataContents idata;
@@ -1434,14 +1436,6 @@ void Writer::assignOutputSectionIndices() {
           sc->setOutputSectionIdx(mc->getOutputSectionIdx());
 }
 
-size_t Writer::addEntryToStringTable(StringRef str) {
-  assert(str.size() > COFF::NameSize);
-  size_t offsetOfEntry = strtab.size() + 4; // +4 for the size field
-  strtab.insert(strtab.end(), str.begin(), str.end());
-  strtab.push_back('\0');
-  return offsetOfEntry;
-}
-
 std::optional<coff_symbol16> Writer::createSymbol(Defined *def) {
   coff_symbol16 sym;
   switch (def->kind()) {
@@ -1482,7 +1476,8 @@ std::optional<coff_symbol16> Writer::createSymbol(Defined *def) {
   StringRef name = def->getName();
   if (name.size() > COFF::NameSize) {
     sym.Name.Offset.Zeroes = 0;
-    sym.Name.Offset.Offset = addEntryToStringTable(name);
+    sym.Name.Offset.Offset = 0; // Filled in later.
+    strtab.add(name);
   } else {
     memset(sym.Name.ShortName, 0, COFF::NameSize);
     memcpy(sym.Name.ShortName, name.data(), name.size());
@@ -1514,6 +1509,7 @@ void Writer::createSymbolAndStringTable() {
   // solution where discardable sections have long names preserved and
   // non-discardable sections have their names truncated, to ensure that any
   // section which is mapped at runtime also has its name mapped at runtime.
+  SmallVector<OutputSection *> longNameSections;
   for (OutputSection *sec : ctx.outputSections) {
     if (sec->name.size() <= COFF::NameSize)
       continue;
@@ -1525,9 +1521,13 @@ void Writer::createSymbolAndStringTable() {
           << " is longer than 8 characters and will use a non-standard string "
              "table";
     }
-    sec->setStringTableOff(addEntryToStringTable(sec->name));
+    // Put the section name in the begin of strtab so that its offset is less
+    // than Max7DecimalOffset otherwise lldb/gdb will not read it.
+    strtab.add(sec->name, /*Priority=*/UINT8_MAX);
+    longNameSections.push_back(sec);
   }
 
+  std::vector<std::pair<size_t, StringRef>> longNameSymbols;
   if (ctx.config.writeSymtab) {
     for (ObjFile *file : ctx.objFileInstances) {
       for (Symbol *b : file->getSymbols()) {
@@ -1542,15 +1542,22 @@ void Writer::createSymbolAndStringTable() {
             continue;
         }
 
-        if (std::optional<coff_symbol16> sym = createSymbol(d))
+        if (std::optional<coff_symbol16> sym = createSymbol(d)) {
+          if (d->getName().size() > COFF::NameSize)
+            longNameSymbols.emplace_back(outputSymtab.size(), d->getName());
           outputSymtab.push_back(*sym);
+        }
 
         if (auto *dthunk = dyn_cast<DefinedImportThunk>(d)) {
           if (!dthunk->wrappedSym->writtenToSymtab) {
             dthunk->wrappedSym->writtenToSymtab = true;
             if (std::optional<coff_symbol16> sym =
-                    createSymbol(dthunk->wrappedSym))
+                    createSymbol(dthunk->wrappedSym)) {
+              if (d->getName().size() > COFF::NameSize)
+                longNameSymbols.emplace_back(outputSymtab.size(),
+                                             dthunk->wrappedSym->getName());
               outputSymtab.push_back(*sym);
+            }
           }
         }
       }
@@ -1560,11 +1567,19 @@ void Writer::createSymbolAndStringTable() {
   if (outputSymtab.empty() && strtab.empty())
     return;
 
+  strtab.finalize();
+  for (OutputSection *sec : longNameSections)
+    sec->setStringTableOff(strtab.getOffset(sec->name));
+  for (auto P : longNameSymbols) {
+    coff_symbol16 &sym = outputSymtab[P.first];
+    sym.Name.Offset.Offset = strtab.getOffset(P.second);
+  }
+
   // We position the symbol table to be adjacent to the end of the last section.
   uint64_t fileOff = fileSize;
   pointerToSymbolTable = fileOff;
   fileOff += outputSymtab.size() * sizeof(coff_symbol16);
-  fileOff += 4 + strtab.size();
+  fileOff += strtab.getSize();
   fileSize = alignTo(fileOff, ctx.config.fileAlign);
 }
 
@@ -1945,9 +1960,7 @@ template <typename PEHeaderTy> void Writer::writeHeader() {
   // Create the string table, it follows immediately after the symbol table.
   // The first 4 bytes is length including itself.
   buf = reinterpret_cast<uint8_t *>(&symbolTable[numberOfSymbols]);
-  write32le(buf, strtab.size() + 4);
-  if (!strtab.empty())
-    memcpy(buf + 4, strtab.data(), strtab.size());
+  strtab.write(buf);
 }
 
 void Writer::openFile(StringRef path) {
diff --git a/lld/test/COFF/strtab.s b/lld/test/COFF/strtab.s
new file mode 100644
index 000000000000..4d8fa39f56db
--- /dev/null
+++ b/lld/test/COFF/strtab.s
@@ -0,0 +1,29 @@
+# RUN: llvm-mc -triple=x86_64-windows-msvc %s -filetype=obj -o %t.obj
+# RUN: lld-link -out:%t.exe -entry:main %t.obj -debug:dwarf
+# RUN: llvm-readobj --string-table %t.exe | FileCheck %s
+
+# CHECK:      StringTable {
+# CHECK-NEXT:   Length: 87
+# CHECK-NEXT:   [     4] .debug_abbrev
+# CHECK-NEXT:   [    12] .debug_line
+# CHECK-NEXT:   [    1e] long_name_symbolz
+# CHECK-NEXT:   [    30] .debug_abbrez
+# CHECK-NEXT:   [    3e] __impl_long_name_symbolA
+# CHECK-NEXT: }
+
+
+.global main
+.text
+main:
+long_name_symbolz:
+long_name_symbolA:
+__impl_long_name_symbolA:
+name_symbolA:
+.debug_abbrez:
+  ret
+
+.section        .debug_abbrev,"dr"
+.byte 0
+
+.section        .debug_line,"dr"
+.byte 0
diff --git a/llvm/include/llvm/MC/StringTableBuilder.h b/llvm/include/llvm/MC/StringTableBuilder.h
index 83d55ef8512f..3f1c045fc0bd 100644
--- a/llvm/include/llvm/MC/StringTableBuilder.h
+++ b/llvm/include/llvm/MC/StringTableBuilder.h
@@ -38,6 +38,8 @@ public:
   };
 
 private:
+  // Only non-zero priority will be recorded.
+  DenseMap<CachedHashStringRef, uint8_t> StringPriorityMap;
   DenseMap<CachedHashStringRef, size_t> StringIndexMap;
   size_t Size = 0;
   Kind K;
@@ -51,11 +53,16 @@ public:
   LLVM_ABI StringTableBuilder(Kind K, Align Alignment = Align(1));
   LLVM_ABI ~StringTableBuilder();
 
-  /// Add a string to the builder. Returns the position of S in the
-  /// table. The position will be changed if finalize is used.
-  /// Can only be used before the table is finalized.
-  LLVM_ABI size_t add(CachedHashStringRef S);
-  size_t add(StringRef S) { return add(CachedHashStringRef(S)); }
+  /// Add a string to the builder. Returns the position of S in the table. The
+  /// position will be changed if finalize is used. Can only be used before the
+  /// table is finalized. Priority is only useful with reordering. Strings with
+  /// the same priority will be put together. Strings with higher priority are
+  /// placed closer to the begin of string table. When adding same string with
+  /// different priority, the maximum priority win.
+  LLVM_ABI size_t add(CachedHashStringRef S, uint8_t Priority = 0);
+  size_t add(StringRef S, uint8_t Priority = 0) {
+    return add(CachedHashStringRef(S), Priority);
+  }
 
   /// Analyze the strings and build the final table. No more strings can
   /// be added after this point.
@@ -78,6 +85,7 @@ public:
   bool contains(StringRef S) const { return contains(CachedHashStringRef(S)); }
   bool contains(CachedHashStringRef S) const { return StringIndexMap.count(S); }
 
+  bool empty() const { return StringIndexMap.empty(); }
   size_t getSize() const { return Size; }
   LLVM_ABI void clear();
 
diff --git a/llvm/lib/MC/StringTableBuilder.cpp b/llvm/lib/MC/StringTableBuilder.cpp
index 7accdc2a9e77..f2b82998f245 100644
--- a/llvm/lib/MC/StringTableBuilder.cpp
+++ b/llvm/lib/MC/StringTableBuilder.cpp
@@ -138,13 +138,31 @@ void StringTableBuilder::finalizeInOrder() {
 void StringTableBuilder::finalizeStringTable(bool Optimize) {
   Finalized = true;
 
-  if (Optimize) {
+  if (Optimize && StringIndexMap.size()) {
     std::vector<StringPair *> Strings;
     Strings.reserve(StringIndexMap.size());
     for (StringPair &P : StringIndexMap)
       Strings.push_back(&P);
 
-    multikeySort(Strings, 0);
+    size_t RangeBegin = 0;
+    MutableArrayRef<StringPair *> StringsRef(Strings);
+    if (StringPriorityMap.size()) {
+      llvm::sort(Strings,
+                 [&](const StringPair *LHS, const StringPair *RHS) -> bool {
+                   return StringPriorityMap.lookup(LHS->first) >
+                          StringPriorityMap.lookup(RHS->first);
+                 });
+      uint8_t RangePriority = StringPriorityMap.lookup(Strings[0]->first);
+      for (size_t I = 1, E = Strings.size(); I != E && RangePriority; ++I) {
+        uint8_t Priority = StringPriorityMap.lookup(Strings[I]->first);
+        if (Priority != RangePriority) {
+          multikeySort(StringsRef.slice(RangeBegin, I - RangeBegin), 0);
+          RangePriority = Priority;
+          RangeBegin = I;
+        }
+      }
+    }
+    multikeySort(StringsRef.slice(RangeBegin), 0);
     initSize();
 
     StringRef Previous;
@@ -199,11 +217,14 @@ size_t StringTableBuilder::getOffset(CachedHashStringRef S) const {
   return I->second;
 }
 
-size_t StringTableBuilder::add(CachedHashStringRef S) {
+size_t StringTableBuilder::add(CachedHashStringRef S, uint8_t Priority) {
   if (K == WinCOFF)
     assert(S.size() > COFF::NameSize && "Short string in COFF string table!");
 
   assert(!isFinalized());
+  if (Priority)
+    StringPriorityMap[S] = std::max(Priority, StringPriorityMap[S]);
+
   auto P = StringIndexMap.try_emplace(S);
   if (P.second) {
     size_t Start = alignTo(Size, Alignment);

From cb4f329004b8fc346bbd44ae8f9b94ff2e41998b Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Fri, 20 Jun 2025 22:44:55 -0700
Subject: [PATCH 1141/1322] [RISCV] Fix HasStdExtCOrZcfOrZce Syntax (#145141)

---
 llvm/lib/Target/RISCV/RISCVFeatures.td   | 2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 6df6368929da..4c761fb6a7ab 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -459,7 +459,7 @@ def FeatureStdExtZce
                      [FeatureStdExtZcb, FeatureStdExtZcmp, FeatureStdExtZcmt]>;
 
 def HasStdExtCOrZcfOrZce
-    : Predicate<"Subtarget->hasStdExtC() || Subtarget->hasStdExtZcf() "
+    : Predicate<"Subtarget->hasStdExtC() || Subtarget->hasStdExtZcf() ||"
                 "Subtarget->hasStdExtZce()">,
       AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZcf,
                                  FeatureStdExtZce),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index fd8591f5ab2d..17d73e6d6d0b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -870,7 +870,7 @@ def : CompressPat<(LW_INX GPRF32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
 let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
 def : CompressPat<(FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm),
                   (C_FLW FPR32C:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
-} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
 
 let Predicates = [HasStdExtCOrZca, IsRV64] in {
 def : CompressPat<(LD GPRC:$rd, GPRCMem:$rs1, uimm8_lsb000:$imm),
@@ -894,7 +894,7 @@ def : CompressPat<(SW_INX GPRF32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
 let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
 def : CompressPat<(FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm),
                   (C_FSW FPR32C:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>;
-} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
 
 let Predicates = [HasStdExtCOrZca, IsRV64] in {
 def : CompressPat<(SD GPRC:$rs2, GPRCMem:$rs1, uimm8_lsb000:$imm),
@@ -1001,7 +1001,7 @@ def : CompressPat<(LW_INX GPRF32NoX0:$rd, SPMem:$rs1,  uimm8_lsb00:$imm),
 let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
 def : CompressPat<(FLW FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm),
                   (C_FLWSP FPR32:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>;
-} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
 
 let Predicates = [HasStdExtCOrZca, IsRV64] in {
 def : CompressPat<(LD GPRNoX0:$rd, SPMem:$rs1, uimm9_lsb000:$imm),
@@ -1047,7 +1047,7 @@ def : CompressPat<(SW_INX GPRF32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
 let Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in {
 def : CompressPat<(FSW FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm),
                   (C_FSWSP FPR32:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>;
-} // Predicates = [HasStdExtC, HasStdExtF, IsRV32]
+} // Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32]
 
 let Predicates = [HasStdExtCOrZca, IsRV64] in {
 def : CompressPat<(SD GPR:$rs2, SPMem:$rs1, uimm9_lsb000:$imm),

From 1fec092fd74abc6fa7399da5bcf165d6249883f5 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sat, 21 Jun 2025 07:01:35 +0100
Subject: [PATCH 1142/1322] [AArch64][GlobalISel] Allow selecting FPR index
 loads. (#143835)

We can, through legalization of certain operations, end up generating
G_INDEXED_LOAD into FPR registers that require entensions. SExt and ZExt
will always opt for GPR, but anyext/noext can curently be set to FPR
registers in regbankselect. As writing a subregister will set higher
bits in the same register to 0, we can successfully handle zext and
anyext on FPR registers, which is what this patch attempts to add.
---
 .../GISel/AArch64InstructionSelector.cpp      |  42 ++-
 .../GlobalISel/select-fp-index-load.mir       | 328 ++++++++++++++++++
 .../CodeGen/AArch64/arm64-indexed-memory.ll   |  43 +++
 3 files changed, 401 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/select-fp-index-load.mir

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 5081cc4bba14..d55ff5acb3dc 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -5553,9 +5553,15 @@ bool AArch64InstructionSelector::selectIndexedExtLoad(
   unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
   bool IsPre = ExtLd.isPre();
   bool IsSExt = isa<GIndexedSExtLoad>(ExtLd);
-  bool InsertIntoXReg = false;
+  unsigned InsertIntoSubReg = 0;
   bool IsDst64 = Ty.getSizeInBits() == 64;
 
+  // ZExt/SExt should be on gpr but can handle extload and zextload of fpr, so
+  // long as they are scalar.
+  bool IsFPR = RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID;
+  if ((IsSExt && IsFPR) || Ty.isVector())
+    return false;
+
   unsigned Opc = 0;
   LLT NewLdDstTy;
   LLT s32 = LLT::scalar(32);
@@ -5568,9 +5574,13 @@ bool AArch64InstructionSelector::selectIndexedExtLoad(
       else
         Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
       NewLdDstTy = IsDst64 ? s64 : s32;
+    } else if (IsFPR) {
+      Opc = IsPre ? AArch64::LDRBpre : AArch64::LDRBpost;
+      InsertIntoSubReg = AArch64::bsub;
+      NewLdDstTy = LLT::scalar(MemSizeBits);
     } else {
       Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
-      InsertIntoXReg = IsDst64;
+      InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
       NewLdDstTy = s32;
     }
   } else if (MemSizeBits == 16) {
@@ -5580,27 +5590,32 @@ bool AArch64InstructionSelector::selectIndexedExtLoad(
       else
         Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
       NewLdDstTy = IsDst64 ? s64 : s32;
+    } else if (IsFPR) {
+      Opc = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
+      InsertIntoSubReg = AArch64::hsub;
+      NewLdDstTy = LLT::scalar(MemSizeBits);
     } else {
       Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
-      InsertIntoXReg = IsDst64;
+      InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
       NewLdDstTy = s32;
     }
   } else if (MemSizeBits == 32) {
     if (IsSExt) {
       Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
       NewLdDstTy = s64;
+    } else if (IsFPR) {
+      Opc = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
+      InsertIntoSubReg = AArch64::ssub;
+      NewLdDstTy = LLT::scalar(MemSizeBits);
     } else {
       Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
-      InsertIntoXReg = IsDst64;
+      InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
       NewLdDstTy = s32;
     }
   } else {
     llvm_unreachable("Unexpected size for indexed load");
   }
 
-  if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
-    return false; // We should be on gpr.
-
   auto Cst = getIConstantVRegVal(Offset, MRI);
   if (!Cst)
     return false; // Shouldn't happen, but just in case.
@@ -5610,15 +5625,18 @@ bool AArch64InstructionSelector::selectIndexedExtLoad(
   LdMI.cloneMemRefs(ExtLd);
   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
   // Make sure to select the load with the MemTy as the dest type, and then
-  // insert into X reg if needed.
-  if (InsertIntoXReg) {
+  // insert into a larger reg if needed.
+  if (InsertIntoSubReg) {
     // Generate a SUBREG_TO_REG.
     auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {})
                         .addImm(0)
                         .addUse(LdMI.getReg(1))
-                        .addImm(AArch64::sub_32);
-    RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass,
-                                 MRI);
+                        .addImm(InsertIntoSubReg);
+    RBI.constrainGenericRegister(
+        SubToReg.getReg(0),
+        *getRegClassForTypeOnBank(MRI.getType(Dst),
+                                  *RBI.getRegBank(Dst, MRI, TRI)),
+        MRI);
   } else {
     auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1));
     selectCopy(*Copy, TII, MRI, TRI, RBI);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-index-load.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-index-load.mir
new file mode 100644
index 000000000000..80c2f8ca0860
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-fp-index-load.mir
@@ -0,0 +1,328 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -verify-machineinstrs -mtriple=aarch64-unknown-unknown -run-pass=instruction-select -mattr=+fullfp16 -o - | FileCheck %s
+
+...
+---
+name:            load_s8_s16
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s8_s16
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr8 = LDRBpost [[COPY]], 4 :: (load (s8))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr16 = SUBREG_TO_REG 0, %4, %subreg.bsub
+    ; CHECK-NEXT: $h0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s16), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 0 :: (load (s8))
+    $h0 = COPY %2(s16)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s8_s32
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s8_s32
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr8 = LDRBpost [[COPY]], 4 :: (load (s8))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, %4, %subreg.bsub
+    ; CHECK-NEXT: $s0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s32), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 0 :: (load (s8))
+    $s0 = COPY %2(s32)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s8_s64
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s8_s64
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr8 = LDRBpost [[COPY]], 4 :: (load (s8))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, %4, %subreg.bsub
+    ; CHECK-NEXT: $d0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s64), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 0 :: (load (s8))
+    $d0 = COPY %2(s64)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s16_s32
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s16_s32
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr16 = LDRHpost [[COPY]], 4 :: (load (s16))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, %4, %subreg.hsub
+    ; CHECK-NEXT: $s0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s32), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 0 :: (load (s16))
+    $s0 = COPY %2(s32)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s16_s64
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s16_s64
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr16 = LDRHpost [[COPY]], 4 :: (load (s16))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, %4, %subreg.hsub
+    ; CHECK-NEXT: $d0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s64), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 0 :: (load (s16))
+    $d0 = COPY %2(s64)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s32_s64
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s32_s64
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr32 = LDRSpost [[COPY]], 4 :: (load (s32))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, %4, %subreg.ssub
+    ; CHECK-NEXT: $d0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s64), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 0 :: (load (s32))
+    $d0 = COPY %2(s64)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s8_s16_pre
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s8_s16_pre
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr8 = LDRBpre [[COPY]], 4 :: (load (s8))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr16 = SUBREG_TO_REG 0, %4, %subreg.bsub
+    ; CHECK-NEXT: $h0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s16), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 1 :: (load (s8))
+    $h0 = COPY %2(s16)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s8_s32_pre
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s8_s32_pre
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr8 = LDRBpre [[COPY]], 4 :: (load (s8))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, %4, %subreg.bsub
+    ; CHECK-NEXT: $s0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s32), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 1 :: (load (s8))
+    $s0 = COPY %2(s32)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s8_s64_pre
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s8_s64_pre
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr8 = LDRBpre [[COPY]], 4 :: (load (s8))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, %4, %subreg.bsub
+    ; CHECK-NEXT: $d0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s64), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 1 :: (load (s8))
+    $d0 = COPY %2(s64)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s16_s32_pre
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s16_s32_pre
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr16 = LDRHpre [[COPY]], 4 :: (load (s16))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, %4, %subreg.hsub
+    ; CHECK-NEXT: $s0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s32), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 1 :: (load (s16))
+    $s0 = COPY %2(s32)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s16_s64_pre
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s16_s64_pre
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr16 = LDRHpre [[COPY]], 4 :: (load (s16))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, %4, %subreg.hsub
+    ; CHECK-NEXT: $d0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s64), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 1 :: (load (s16))
+    $d0 = COPY %2(s64)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
+---
+name:            load_s32_s64_pre
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $d0, $x0
+    ; CHECK-LABEL: name: load_s32_s64_pre
+    ; CHECK: liveins: $d0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0
+    ; CHECK-NEXT: early-clobber %3:gpr64sp, %4:fpr32 = LDRSpre [[COPY]], 4 :: (load (s32))
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, %4, %subreg.ssub
+    ; CHECK-NEXT: $d0 = COPY [[SUBREG_TO_REG]]
+    ; CHECK-NEXT: $x0 = COPY %3
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:gpr(p0) = COPY $x0
+    %1:gpr(s64) = G_CONSTANT i64 4
+    %2:fpr(s64), %3:gpr(p0) = G_INDEXED_LOAD %0:gpr, %1:gpr, 1 :: (load (s32))
+    $d0 = COPY %2(s64)
+    $x0 = COPY %3(p0)
+    RET_ReallyLR implicit $d0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
index d949f9520957..cb5df07c7ede 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -725,3 +725,46 @@ define ptr @postidx64_sw(ptr %src, ptr %out) {
   store i64 %sext, ptr %out, align 8
   ret ptr %ptr
 }
+
+define ptr @postidx32_shalf(ptr %src, ptr %out, half %a) {
+; CHECK64-LABEL: postidx32_shalf:
+; CHECK64:       ; %bb.0:
+; CHECK64-NEXT:    ldr h1, [x0], #4
+; CHECK64-NEXT:    ; kill: def $h0 killed $h0 def $s0
+; CHECK64-NEXT:    fcvt s2, h1
+; CHECK64-NEXT:    fcmp s2, #0.0
+; CHECK64-NEXT:    fcsel s0, s1, s0, mi
+; CHECK64-NEXT:    str h0, [x1]
+; CHECK64-NEXT:    ret
+;
+; GISEL-LABEL: postidx32_shalf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    mov w8, #0 ; =0x0
+; GISEL-NEXT:    ldr h1, [x0], #4
+; GISEL-NEXT:    fmov s2, w8
+; GISEL-NEXT:    ; kill: def $h0 killed $h0 def $s0
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    fcvt s3, h1
+; GISEL-NEXT:    fmov w8, s1
+; GISEL-NEXT:    fcvt s2, h2
+; GISEL-NEXT:    fcmp s3, s2
+; GISEL-NEXT:    csel w8, w8, w9, mi
+; GISEL-NEXT:    strh w8, [x1]
+; GISEL-NEXT:    ret
+;
+; CHECK32-LABEL: postidx32_shalf:
+; CHECK32:       ; %bb.0:
+; CHECK32-NEXT:    ldr h1, [x0], #4
+; CHECK32-NEXT:    ; kill: def $h0 killed $h0 def $s0
+; CHECK32-NEXT:    fcvt s2, h1
+; CHECK32-NEXT:    fcmp s2, #0.0
+; CHECK32-NEXT:    fcsel s0, s1, s0, mi
+; CHECK32-NEXT:    str h0, [x1]
+; CHECK32-NEXT:    ret
+  %tmp = load half, ptr %src, align 2
+  %ptr = getelementptr inbounds i32, ptr %src, i64 1
+  %c = fcmp olt half %tmp, 0.0
+  %s = select i1 %c, half %tmp, half %a
+  store half %s, ptr %out, align 8
+  ret ptr %ptr
+}

From 437346378fd4d40af30e6969621a605cbd6215d1 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sat, 21 Jun 2025 07:42:54 +0100
Subject: [PATCH 1143/1322] [GlobalISel] Widen vector loads from aligned ptrs
 (#144309)

If the pointer is aligned to more than the size of the vector, we can
widen the load up to next power of 2 size, as SDAG performs.

Some of the v3 tests are currently worse - those should be addressed in
other issues.
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  15 +
 .../GlobalISel/legalize-load-range.mir        |  46 ++
 llvm/test/CodeGen/AArch64/add.ll              |  30 +-
 llvm/test/CodeGen/AArch64/andorxor.ll         |  90 +--
 llvm/test/CodeGen/AArch64/ctlz.ll             |  18 +-
 llvm/test/CodeGen/AArch64/ctpop.ll            |  18 +-
 llvm/test/CodeGen/AArch64/cttz.ll             |  38 +-
 llvm/test/CodeGen/AArch64/load.ll             |  98 +--
 llvm/test/CodeGen/AArch64/mul.ll              |  30 +-
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll   | 743 +++++++++---------
 llvm/test/CodeGen/AArch64/sub.ll              |  30 +-
 11 files changed, 600 insertions(+), 556 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 028bffd1bf5a..a28361051b41 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4072,6 +4072,21 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
     if (MemTy != DstTy)
       return UnableToLegalize;
 
+    Align Alignment = LoadMI.getAlign();
+    // Given an alignment larger than the size of the memory, we can increase
+    // the size of the load without needing to scalarize it.
+    if (Alignment.value() * 8 > MemSizeInBits &&
+        isPowerOf2_64(DstTy.getScalarSizeInBits())) {
+      LLT MoreTy = LLT::fixed_vector(NextPowerOf2(DstTy.getNumElements()),
+                                     DstTy.getElementType());
+      MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, 0, MoreTy);
+      auto NewLoad = MIRBuilder.buildLoad(MoreTy, PtrReg, *NewMMO);
+      MIRBuilder.buildDeleteTrailingVectorElements(LoadMI.getReg(0),
+                                                   NewLoad.getReg(0));
+      LoadMI.eraseFromParent();
+      return Legalized;
+    }
+
     // TODO: We can do better than scalarizing the vector and at least split it
     // in half.
     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir
new file mode 100644
index 000000000000..5611642a1364
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir
@@ -0,0 +1,46 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel -o - %s | FileCheck %s
+
+--- |
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+  target triple = "aarch64"
+
+  define <3 x i16> @range_v3i16(ptr %a_ptr, ptr %b_ptr) {
+    %a = load <3 x i16>, ptr %a_ptr, align 8, !range !0, !noundef !1
+    %b = load <3 x i16>, ptr %b_ptr, align 8, !range !2, !noundef !1
+    %result = add <3 x i16> %a, %b
+    ret <3 x i16> %result
+  }
+
+  !0 = !{i16 16, i16 17}
+  !1 = !{}
+  !2 = !{i16 32, i16 33}
+...
+---
+name:            range_v3i16
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x0, $x1
+    ; Make sure we drop the range metadata when widening an aligned load.
+
+    ; CHECK-LABEL: name: range_v3i16
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>) from %ir.a_ptr)
+    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY1]](p0) :: (load (<4 x s16>) from %ir.b_ptr)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s16>) = G_ADD [[LOAD]], [[LOAD1]]
+    ; CHECK-NEXT: $d0 = COPY [[ADD]](<4 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:_(p0) = COPY $x0
+    %1:_(p0) = COPY $x1
+    %2:_(<3 x s16>) = G_LOAD %0(p0) :: (load (<3 x s16>) from %ir.a_ptr, align 8, !range !0)
+    %3:_(<3 x s16>) = G_LOAD %1(p0) :: (load (<3 x s16>) from %ir.b_ptr, align 8, !range !2)
+    %4:_(<3 x s16>) = G_ADD %2, %3
+    %5:_(s16), %6:_(s16), %7:_(s16) = G_UNMERGE_VALUES %4(<3 x s16>)
+    %8:_(s16) = G_IMPLICIT_DEF
+    %9:_(<4 x s16>) = G_BUILD_VECTOR %5(s16), %6(s16), %7(s16), %8(s16)
+    $d0 = COPY %9(<4 x s16>)
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index d5bd1b712a2a..96168cb80196 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
@@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index f7df1092287b..a7875dbebd0e 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -302,16 +302,20 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: and_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
@@ -350,16 +354,20 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: or_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
@@ -398,16 +406,20 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: xor_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v1.8b
@@ -805,16 +817,10 @@ define void @and_v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: and_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
@@ -842,16 +848,10 @@ define void @or_v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: or_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
@@ -879,16 +879,10 @@ define void @xor_v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: xor_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index b1c6e24c30a7..04124609eec7 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -56,12 +56,16 @@ define void @v3i8(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    ldr w8, [x0]
 ; CHECK-GI-NEXT:    add x9, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-GI-NEXT:    clz v0.8b, v0.8b
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v0.b[2]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v0.b[0]
+; CHECK-GI-NEXT:    clz v0.8b, v2.8b
 ; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
 ; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.b }[2], [x9]
@@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr d0, [x0]
 ; CHECK-GI-NEXT:    add x8, x0, #2
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
 ; CHECK-GI-NEXT:    clz v0.4h, v0.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index 55f75b6bc3f2..c739be95cd24 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -55,12 +55,16 @@ define void @v3i8(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    ldr w8, [x0]
 ; CHECK-GI-NEXT:    add x9, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v0.b[2]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v0.b[0]
+; CHECK-GI-NEXT:    cnt v0.8b, v2.8b
 ; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
 ; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.b }[2], [x9]
@@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr d0, [x0]
 ; CHECK-GI-NEXT:    add x8, x0, #2
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
 ; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index 93ac97e20dab..fc9bf2c0aca6 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -68,21 +68,23 @@ define void @v3i8(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w9, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x0]
 ; CHECK-GI-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    mov v1.h[1], w10
-; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.h[2], w8
 ; CHECK-GI-NEXT:    add x8, x0, #1
-; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    mov b1, v0.b[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    add x9, x0, #2
-; CHECK-GI-NEXT:    eor v2.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT:    add v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    and v0.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    eor v1.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
@@ -275,22 +277,20 @@ define void @v3i16(ptr %p1) {
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-GI-NEXT:    ldr h1, [x0]
-; CHECK-GI-NEXT:    add x9, x0, #2
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    add x9, x0, #4
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    add x10, x0, #4
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    add x8, x0, #2
 ; CHECK-GI-NEXT:    eor v2.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-GI-NEXT:    and v0.8b, v2.8b, v0.8b
 ; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
-; CHECK-GI-NEXT:    st1 { v0.h }[1], [x9]
-; CHECK-GI-NEXT:    st1 { v0.h }[2], [x10]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = load <3 x i16>, ptr %p1
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 6b26ae98a4ed..c4bb6e37d6ea 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -335,102 +335,50 @@ define <3 x i8> @load_v3i8(ptr %ptr) {
 ;
 ; CHECK-GI-LABEL: load_v3i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w1, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w2, [x0, #2]
-; CHECK-GI-NEXT:    mov w0, w8
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
   %a = load <3 x i8>, ptr %ptr
   ret <3 x i8> %a
 }
 
 define <7 x i8> @load_v7i8(ptr %ptr) {
-; CHECK-SD-LABEL: load_v7i8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: load_v7i8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #1
-; CHECK-GI-NEXT:    mov v0.b[0], v0.b[0]
-; CHECK-GI-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.b }[2], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #3
-; CHECK-GI-NEXT:    ld1 { v0.b }[3], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #5
-; CHECK-GI-NEXT:    ld1 { v0.b }[5], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #6
-; CHECK-GI-NEXT:    ld1 { v0.b }[6], [x8]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: load_v7i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
   %a = load <7 x i8>, ptr %ptr
   ret <7 x i8> %a
 }
 
 define <3 x i16> @load_v3i16(ptr %ptr) {
-; CHECK-SD-LABEL: load_v3i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: load_v3i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: load_v3i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
   %a = load <3 x i16>, ptr %ptr
   ret <3 x i16> %a
 }
 
 define <7 x i16> @load_v7i16(ptr %ptr) {
-; CHECK-SD-LABEL: load_v7i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: load_v7i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #6
-; CHECK-GI-NEXT:    ld1 { v0.h }[3], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #8
-; CHECK-GI-NEXT:    ld1 { v0.h }[4], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #10
-; CHECK-GI-NEXT:    ld1 { v0.h }[5], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #12
-; CHECK-GI-NEXT:    ld1 { v0.h }[6], [x8]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: load_v7i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
   %a = load <7 x i16>, ptr %ptr
   ret <7 x i16> %a
 }
 
 define <3 x i32> @load_v3i32(ptr %ptr) {
-; CHECK-SD-LABEL: load_v3i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: load_v3i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr s0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.s }[1], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #8
-; CHECK-GI-NEXT:    ld1 { v0.s }[2], [x8]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: load_v3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
   %a = load <3 x i32>, ptr %ptr
   ret <3 x i32> %a
 }
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 1558043f7f40..9c69a6f03b85 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -122,16 +122,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
@@ -282,16 +286,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index a534112b7c55..4f0c4080aa0c 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -412,31 +412,33 @@ define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ;
 ; CHECK-GI-LABEL: test_udot_v5i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0, #4]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #4]
-; CHECK-GI-NEXT:    ldrb w10, [x1]
-; CHECK-GI-NEXT:    ldrb w11, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w12, [x1, #1]
-; CHECK-GI-NEXT:    mul w8, w9, w8
-; CHECK-GI-NEXT:    ldrb w9, [x0]
-; CHECK-GI-NEXT:    fmov s0, w10
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w12
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    mov v1.s[1], w11
-; CHECK-GI-NEXT:    mov v2.s[1], wzr
-; CHECK-GI-NEXT:    mov v0.s[2], w9
-; CHECK-GI-NEXT:    ldrb w9, [x1, #3]
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    ldrb w8, [x0, #3]
-; CHECK-GI-NEXT:    mov v2.s[2], wzr
-; CHECK-GI-NEXT:    mov v0.s[3], w9
-; CHECK-GI-NEXT:    mov v1.s[3], w8
-; CHECK-GI-NEXT:    mov v2.s[3], wzr
-; CHECK-GI-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    addv s0, v2.4s
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    umov w8, v1.b[4]
+; CHECK-GI-NEXT:    umov w9, v0.b[4]
+; CHECK-GI-NEXT:    umov w10, v1.b[0]
+; CHECK-GI-NEXT:    umov w12, v0.b[0]
+; CHECK-GI-NEXT:    umov w11, v1.b[1]
+; CHECK-GI-NEXT:    umov w13, v0.b[1]
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    umov w9, v1.b[2]
+; CHECK-GI-NEXT:    fmov s3, w12
+; CHECK-GI-NEXT:    umov w10, v1.b[3]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    umov w8, v0.b[2]
+; CHECK-GI-NEXT:    mov v3.s[1], w13
+; CHECK-GI-NEXT:    umov w11, v0.b[3]
+; CHECK-GI-NEXT:    mov v4.s[1], wzr
+; CHECK-GI-NEXT:    mov v2.s[2], w9
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    mov v4.s[2], wzr
+; CHECK-GI-NEXT:    mov v2.s[3], w10
+; CHECK-GI-NEXT:    mov v3.s[3], w11
+; CHECK-GI-NEXT:    mov v4.s[3], wzr
+; CHECK-GI-NEXT:    mla v4.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    addv s0, v4.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w2
 ; CHECK-GI-NEXT:    ret
@@ -466,20 +468,21 @@ define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
 ;
 ; CHECK-GI-LABEL: test_udot_v5i8_nomla:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #4]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.s[1], w10
-; CHECK-GI-NEXT:    mov v1.s[1], wzr
-; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[2], wzr
-; CHECK-GI-NEXT:    ldrb w8, [x0, #3]
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v1.s[3], wzr
-; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[4]
+; CHECK-GI-NEXT:    umov w10, v0.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    umov w8, v0.b[2]
+; CHECK-GI-NEXT:    umov w9, v0.b[3]
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov v2.s[1], wzr
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], wzr
+; CHECK-GI-NEXT:    mov v1.s[3], w9
+; CHECK-GI-NEXT:    mov v2.s[3], wzr
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v2.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
@@ -506,31 +509,33 @@ define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ;
 ; CHECK-GI-LABEL: test_sdot_v5i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrsb w8, [x0, #4]
-; CHECK-GI-NEXT:    ldrsb w9, [x1, #4]
-; CHECK-GI-NEXT:    ldrsb w10, [x1]
-; CHECK-GI-NEXT:    ldrsb w11, [x0, #1]
-; CHECK-GI-NEXT:    ldrsb w12, [x1, #1]
-; CHECK-GI-NEXT:    mul w8, w9, w8
-; CHECK-GI-NEXT:    ldrsb w9, [x0]
-; CHECK-GI-NEXT:    fmov s0, w10
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrsb w9, [x1, #2]
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w12
-; CHECK-GI-NEXT:    ldrsb w8, [x0, #2]
-; CHECK-GI-NEXT:    mov v1.s[1], w11
-; CHECK-GI-NEXT:    mov v2.s[1], wzr
-; CHECK-GI-NEXT:    mov v0.s[2], w9
-; CHECK-GI-NEXT:    ldrsb w9, [x1, #3]
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    ldrsb w8, [x0, #3]
-; CHECK-GI-NEXT:    mov v2.s[2], wzr
-; CHECK-GI-NEXT:    mov v0.s[3], w9
-; CHECK-GI-NEXT:    mov v1.s[3], w8
-; CHECK-GI-NEXT:    mov v2.s[3], wzr
-; CHECK-GI-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    addv s0, v2.4s
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    smov w8, v1.b[4]
+; CHECK-GI-NEXT:    smov w9, v0.b[4]
+; CHECK-GI-NEXT:    smov w10, v1.b[0]
+; CHECK-GI-NEXT:    smov w12, v0.b[0]
+; CHECK-GI-NEXT:    smov w11, v1.b[1]
+; CHECK-GI-NEXT:    smov w13, v0.b[1]
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    smov w9, v1.b[2]
+; CHECK-GI-NEXT:    fmov s3, w12
+; CHECK-GI-NEXT:    smov w10, v1.b[3]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    smov w8, v0.b[2]
+; CHECK-GI-NEXT:    mov v3.s[1], w13
+; CHECK-GI-NEXT:    smov w11, v0.b[3]
+; CHECK-GI-NEXT:    mov v4.s[1], wzr
+; CHECK-GI-NEXT:    mov v2.s[2], w9
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    mov v4.s[2], wzr
+; CHECK-GI-NEXT:    mov v2.s[3], w10
+; CHECK-GI-NEXT:    mov v3.s[3], w11
+; CHECK-GI-NEXT:    mov v4.s[3], wzr
+; CHECK-GI-NEXT:    mla v4.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    addv s0, v4.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w2
 ; CHECK-GI-NEXT:    ret
@@ -2298,128 +2303,145 @@ define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ;
 ; CHECK-GI-LABEL: test_udot_v25i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #16]!
-; CHECK-GI-NEXT:    ldrb w11, [x1, #4]
-; CHECK-GI-NEXT:    ldrb w12, [x1, #5]
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    ldp q1, q7, [x1]
 ; CHECK-GI-NEXT:    fmov s0, wzr
-; CHECK-GI-NEXT:    umov w13, v2.b[4]
-; CHECK-GI-NEXT:    umov w14, v2.b[5]
-; CHECK-GI-NEXT:    umov w10, v2.b[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    umov w9, v2.b[8]
-; CHECK-GI-NEXT:    fmov s5, w11
-; CHECK-GI-NEXT:    umov w11, v2.b[12]
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldrb w8, [x1, #1]
-; CHECK-GI-NEXT:    mov v0.s[1], wzr
-; CHECK-GI-NEXT:    fmov s7, w13
-; CHECK-GI-NEXT:    fmov s4, w10
-; CHECK-GI-NEXT:    umov w10, v2.b[13]
-; CHECK-GI-NEXT:    mov v5.s[1], w12
-; CHECK-GI-NEXT:    umov w13, v2.b[9]
-; CHECK-GI-NEXT:    fmov s6, w9
-; CHECK-GI-NEXT:    fmov s16, w11
-; CHECK-GI-NEXT:    umov w9, v1.b[0]
-; CHECK-GI-NEXT:    mov v3.s[1], w8
-; CHECK-GI-NEXT:    mov v7.s[1], w14
-; CHECK-GI-NEXT:    umov w14, v2.b[6]
-; CHECK-GI-NEXT:    ldrb w12, [x1, #6]
-; CHECK-GI-NEXT:    umov w8, v2.b[1]
-; CHECK-GI-NEXT:    umov w11, v2.b[2]
-; CHECK-GI-NEXT:    mov v0.s[2], wzr
-; CHECK-GI-NEXT:    mov v16.s[1], w10
-; CHECK-GI-NEXT:    umov w10, v2.b[14]
-; CHECK-GI-NEXT:    mov v5.s[2], w12
-; CHECK-GI-NEXT:    umov w12, v1.b[5]
-; CHECK-GI-NEXT:    mov v6.s[1], w13
-; CHECK-GI-NEXT:    fmov s17, w9
-; CHECK-GI-NEXT:    mov v7.s[2], w14
-; CHECK-GI-NEXT:    umov w14, v1.b[4]
-; CHECK-GI-NEXT:    umov w9, v2.b[10]
-; CHECK-GI-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NEXT:    umov w8, v1.b[1]
-; CHECK-GI-NEXT:    umov w13, v2.b[7]
-; CHECK-GI-NEXT:    mov v16.s[2], w10
-; CHECK-GI-NEXT:    umov w10, v2.b[15]
-; CHECK-GI-NEXT:    mov v0.s[3], wzr
-; CHECK-GI-NEXT:    fmov s18, w14
-; CHECK-GI-NEXT:    mov v6.s[2], w9
-; CHECK-GI-NEXT:    umov w9, v1.b[12]
-; CHECK-GI-NEXT:    mov v4.s[2], w11
-; CHECK-GI-NEXT:    ldrb w11, [x1, #7]
-; CHECK-GI-NEXT:    mov v17.s[1], w8
-; CHECK-GI-NEXT:    ldrb w8, [x1, #2]
-; CHECK-GI-NEXT:    mov v16.s[3], w10
-; CHECK-GI-NEXT:    umov w10, v1.b[13]
-; CHECK-GI-NEXT:    mov v18.s[1], w12
-; CHECK-GI-NEXT:    umov w12, v1.b[6]
-; CHECK-GI-NEXT:    mov v5.s[3], w11
-; CHECK-GI-NEXT:    ldrb w11, [x0, #16]!
-; CHECK-GI-NEXT:    mov v7.s[3], w13
-; CHECK-GI-NEXT:    umov w13, v1.b[2]
-; CHECK-GI-NEXT:    fmov s20, w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #5]
-; CHECK-GI-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NEXT:    umov w8, v1.b[8]
-; CHECK-GI-NEXT:    fmov s22, w11
-; CHECK-GI-NEXT:    mov v18.s[2], w12
-; CHECK-GI-NEXT:    ldrb w12, [x0, #4]
-; CHECK-GI-NEXT:    umov w11, v2.b[3]
-; CHECK-GI-NEXT:    mov v20.s[1], w10
-; CHECK-GI-NEXT:    ldrb w10, [x0, #8]
-; CHECK-GI-NEXT:    fmov s21, w12
-; CHECK-GI-NEXT:    ldrb w12, [x1, #8]
-; CHECK-GI-NEXT:    mov v17.s[2], w13
-; CHECK-GI-NEXT:    umov w13, v1.b[9]
-; CHECK-GI-NEXT:    fmov s19, w8
-; CHECK-GI-NEXT:    umov w8, v1.b[14]
-; CHECK-GI-NEXT:    mul w10, w12, w10
-; CHECK-GI-NEXT:    umov w12, v1.b[7]
-; CHECK-GI-NEXT:    mov v4.s[3], w11
-; CHECK-GI-NEXT:    mov v21.s[1], w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #6]
-; CHECK-GI-NEXT:    mov v19.s[1], w13
-; CHECK-GI-NEXT:    ldrb w13, [x0, #1]
-; CHECK-GI-NEXT:    mov v20.s[2], w8
-; CHECK-GI-NEXT:    umov w8, v1.b[10]
-; CHECK-GI-NEXT:    mov v18.s[3], w12
-; CHECK-GI-NEXT:    ldrb w12, [x0, #7]
-; CHECK-GI-NEXT:    mov v21.s[2], w9
-; CHECK-GI-NEXT:    umov w9, v2.b[11]
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    ldrb w10, [x0, #2]
-; CHECK-GI-NEXT:    mov v22.s[1], w13
-; CHECK-GI-NEXT:    umov w13, v1.b[15]
-; CHECK-GI-NEXT:    mov v2.s[1], wzr
-; CHECK-GI-NEXT:    mov v19.s[2], w8
+; CHECK-GI-NEXT:    ldp q16, q3, [x0]
+; CHECK-GI-NEXT:    umov w9, v1.b[4]
+; CHECK-GI-NEXT:    umov w11, v1.b[5]
+; CHECK-GI-NEXT:    umov w18, v1.b[0]
+; CHECK-GI-NEXT:    umov w0, v1.b[12]
+; CHECK-GI-NEXT:    umov w3, v7.b[4]
+; CHECK-GI-NEXT:    umov w12, v1.b[1]
+; CHECK-GI-NEXT:    umov w13, v1.b[6]
+; CHECK-GI-NEXT:    umov w1, v1.b[13]
+; CHECK-GI-NEXT:    umov w4, v7.b[5]
+; CHECK-GI-NEXT:    umov w15, v1.b[2]
 ; CHECK-GI-NEXT:    umov w8, v1.b[3]
-; CHECK-GI-NEXT:    mov v21.s[3], w12
+; CHECK-GI-NEXT:    umov w16, v1.b[7]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    umov w14, v1.b[8]
+; CHECK-GI-NEXT:    umov w17, v1.b[9]
+; CHECK-GI-NEXT:    umov w10, v1.b[10]
+; CHECK-GI-NEXT:    umov w9, v1.b[11]
+; CHECK-GI-NEXT:    umov w5, v1.b[14]
+; CHECK-GI-NEXT:    umov w6, v7.b[0]
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    fmov s5, w3
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    umov w11, v1.b[15]
+; CHECK-GI-NEXT:    fmov s1, w18
+; CHECK-GI-NEXT:    umov w7, v7.b[1]
+; CHECK-GI-NEXT:    umov w18, v7.b[6]
+; CHECK-GI-NEXT:    umov w21, v16.b[4]
+; CHECK-GI-NEXT:    mov v4.s[1], w1
+; CHECK-GI-NEXT:    mov v5.s[1], w4
+; CHECK-GI-NEXT:    fmov s6, w14
+; CHECK-GI-NEXT:    mov v1.s[1], w12
+; CHECK-GI-NEXT:    umov w12, v7.b[3]
+; CHECK-GI-NEXT:    umov w14, v7.b[7]
+; CHECK-GI-NEXT:    mov v2.s[2], w13
+; CHECK-GI-NEXT:    umov w13, v7.b[2]
+; CHECK-GI-NEXT:    umov w0, v7.b[8]
+; CHECK-GI-NEXT:    fmov s7, w6
+; CHECK-GI-NEXT:    umov w23, v16.b[12]
+; CHECK-GI-NEXT:    umov w25, v3.b[4]
+; CHECK-GI-NEXT:    mov v6.s[1], w17
+; CHECK-GI-NEXT:    mov v4.s[2], w5
+; CHECK-GI-NEXT:    mov v5.s[2], w18
+; CHECK-GI-NEXT:    mov v1.s[2], w15
+; CHECK-GI-NEXT:    umov w6, v16.b[0]
+; CHECK-GI-NEXT:    umov w3, v16.b[1]
+; CHECK-GI-NEXT:    mov v2.s[3], w16
+; CHECK-GI-NEXT:    mov v7.s[1], w7
+; CHECK-GI-NEXT:    umov w16, v16.b[2]
+; CHECK-GI-NEXT:    umov w15, v16.b[3]
+; CHECK-GI-NEXT:    umov w22, v16.b[5]
+; CHECK-GI-NEXT:    umov w5, v16.b[6]
+; CHECK-GI-NEXT:    umov w18, v16.b[7]
+; CHECK-GI-NEXT:    umov w19, v16.b[8]
+; CHECK-GI-NEXT:    umov w7, v16.b[9]
+; CHECK-GI-NEXT:    umov w24, v16.b[13]
+; CHECK-GI-NEXT:    umov w1, v16.b[10]
+; CHECK-GI-NEXT:    umov w17, v16.b[11]
+; CHECK-GI-NEXT:    umov w20, v16.b[14]
+; CHECK-GI-NEXT:    umov w4, v16.b[15]
+; CHECK-GI-NEXT:    fmov s16, w21
+; CHECK-GI-NEXT:    umov w21, v3.b[8]
+; CHECK-GI-NEXT:    umov w26, v3.b[5]
+; CHECK-GI-NEXT:    fmov s17, w23
+; CHECK-GI-NEXT:    umov w23, v3.b[0]
+; CHECK-GI-NEXT:    fmov s18, w25
+; CHECK-GI-NEXT:    umov w25, v3.b[3]
+; CHECK-GI-NEXT:    mov v16.s[1], w22
+; CHECK-GI-NEXT:    umov w22, v3.b[1]
+; CHECK-GI-NEXT:    fmov s19, w6
+; CHECK-GI-NEXT:    mov v17.s[1], w24
+; CHECK-GI-NEXT:    umov w24, v3.b[2]
+; CHECK-GI-NEXT:    umov w6, v3.b[7]
+; CHECK-GI-NEXT:    mul w0, w0, w21
+; CHECK-GI-NEXT:    mov v18.s[1], w26
+; CHECK-GI-NEXT:    umov w26, v3.b[6]
+; CHECK-GI-NEXT:    fmov s3, w19
+; CHECK-GI-NEXT:    fmov s20, w23
+; CHECK-GI-NEXT:    mov v19.s[1], w3
+; CHECK-GI-NEXT:    mov v16.s[2], w5
+; CHECK-GI-NEXT:    mov v0.s[1], wzr
+; CHECK-GI-NEXT:    mov v6.s[2], w10
+; CHECK-GI-NEXT:    fmov s21, w0
+; CHECK-GI-NEXT:    mov v17.s[2], w20
+; CHECK-GI-NEXT:    mov v4.s[3], w11
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.s[1], w7
+; CHECK-GI-NEXT:    mov v20.s[1], w22
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v18.s[2], w26
+; CHECK-GI-NEXT:    mov v21.s[1], wzr
+; CHECK-GI-NEXT:    mov v16.s[3], w18
+; CHECK-GI-NEXT:    mov v17.s[3], w4
+; CHECK-GI-NEXT:    mov v7.s[2], w13
+; CHECK-GI-NEXT:    mov v5.s[3], w14
+; CHECK-GI-NEXT:    mov v19.s[2], w16
+; CHECK-GI-NEXT:    mov v3.s[2], w1
+; CHECK-GI-NEXT:    mov v0.s[2], wzr
+; CHECK-GI-NEXT:    mov v20.s[2], w24
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v18.s[3], w6
+; CHECK-GI-NEXT:    mov v21.s[2], wzr
+; CHECK-GI-NEXT:    mul v2.4s, v2.4s, v16.4s
+; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    mov v6.s[3], w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #3]
-; CHECK-GI-NEXT:    mov v20.s[3], w13
-; CHECK-GI-NEXT:    umov w13, v1.b[11]
-; CHECK-GI-NEXT:    mov v22.s[2], w10
-; CHECK-GI-NEXT:    ldrb w10, [x1, #3]
-; CHECK-GI-NEXT:    mul v1.4s, v7.4s, v18.4s
-; CHECK-GI-NEXT:    mov v2.s[2], wzr
-; CHECK-GI-NEXT:    mov v17.s[3], w8
-; CHECK-GI-NEXT:    mov v3.s[3], w10
-; CHECK-GI-NEXT:    mul v5.4s, v5.4s, v21.4s
-; CHECK-GI-NEXT:    mov v19.s[3], w13
-; CHECK-GI-NEXT:    mul v7.4s, v16.4s, v20.4s
-; CHECK-GI-NEXT:    mov v22.s[3], w9
-; CHECK-GI-NEXT:    mov v2.s[3], wzr
-; CHECK-GI-NEXT:    mla v1.4s, v4.4s, v17.4s
-; CHECK-GI-NEXT:    mla v7.4s, v6.4s, v19.4s
-; CHECK-GI-NEXT:    mla v5.4s, v3.4s, v22.4s
-; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v1.4s, v7.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w12
+; CHECK-GI-NEXT:    mov v19.s[3], w15
+; CHECK-GI-NEXT:    mov v3.s[3], w17
+; CHECK-GI-NEXT:    mov v20.s[3], w25
+; CHECK-GI-NEXT:    mov v0.s[3], wzr
+; CHECK-GI-NEXT:    mul v5.4s, v5.4s, v18.4s
+; CHECK-GI-NEXT:    mov v21.s[3], wzr
+; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v19.4s
+; CHECK-GI-NEXT:    mla v4.4s, v6.4s, v3.4s
+; CHECK-GI-NEXT:    mla v5.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT:    add v0.4s, v21.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v4.4s
 ; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w2
+; CHECK-GI-NEXT:    ldp x26, x25, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %0 = load <25 x i8>, ptr %a
@@ -2455,73 +2477,77 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
 ;
 ; CHECK-GI-LABEL: test_udot_v25i8_nomla:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldrb w17, [x0, #16]!
-; CHECK-GI-NEXT:    ldrb w16, [x0, #4]
-; CHECK-GI-NEXT:    ldrb w14, [x0, #8]
+; CHECK-GI-NEXT:    str x19, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    .cfi_offset w19, -16
+; CHECK-GI-NEXT:    ldp q2, q1, [x0]
 ; CHECK-GI-NEXT:    fmov s0, wzr
-; CHECK-GI-NEXT:    umov w15, v1.b[0]
-; CHECK-GI-NEXT:    umov w2, v1.b[4]
-; CHECK-GI-NEXT:    umov w4, v1.b[8]
-; CHECK-GI-NEXT:    umov w5, v1.b[12]
-; CHECK-GI-NEXT:    umov w1, v1.b[1]
-; CHECK-GI-NEXT:    umov w3, v1.b[5]
-; CHECK-GI-NEXT:    umov w6, v1.b[9]
-; CHECK-GI-NEXT:    umov w7, v1.b[13]
-; CHECK-GI-NEXT:    fmov s6, w17
-; CHECK-GI-NEXT:    fmov s7, w16
-; CHECK-GI-NEXT:    fmov s16, w14
-; CHECK-GI-NEXT:    ldrb w18, [x0, #1]
-; CHECK-GI-NEXT:    fmov s2, w15
-; CHECK-GI-NEXT:    fmov s3, w2
-; CHECK-GI-NEXT:    ldrb w11, [x0, #5]
-; CHECK-GI-NEXT:    fmov s4, w4
-; CHECK-GI-NEXT:    fmov s5, w5
-; CHECK-GI-NEXT:    ldrb w16, [x0, #2]
-; CHECK-GI-NEXT:    umov w9, v1.b[2]
-; CHECK-GI-NEXT:    umov w12, v1.b[6]
-; CHECK-GI-NEXT:    ldrb w17, [x0, #6]
-; CHECK-GI-NEXT:    umov w13, v1.b[10]
-; CHECK-GI-NEXT:    umov w15, v1.b[14]
-; CHECK-GI-NEXT:    mov v2.s[1], w1
-; CHECK-GI-NEXT:    mov v3.s[1], w3
-; CHECK-GI-NEXT:    mov v4.s[1], w6
-; CHECK-GI-NEXT:    mov v5.s[1], w7
-; CHECK-GI-NEXT:    mov v6.s[1], w18
-; CHECK-GI-NEXT:    mov v7.s[1], w11
-; CHECK-GI-NEXT:    mov v16.s[1], wzr
+; CHECK-GI-NEXT:    umov w15, v2.b[0]
+; CHECK-GI-NEXT:    umov w17, v2.b[4]
+; CHECK-GI-NEXT:    umov w0, v2.b[8]
+; CHECK-GI-NEXT:    umov w2, v2.b[12]
+; CHECK-GI-NEXT:    umov w4, v1.b[0]
+; CHECK-GI-NEXT:    umov w6, v1.b[4]
+; CHECK-GI-NEXT:    umov w19, v1.b[8]
+; CHECK-GI-NEXT:    umov w16, v2.b[1]
+; CHECK-GI-NEXT:    umov w18, v2.b[5]
+; CHECK-GI-NEXT:    umov w1, v2.b[9]
+; CHECK-GI-NEXT:    umov w3, v2.b[13]
+; CHECK-GI-NEXT:    umov w5, v1.b[1]
+; CHECK-GI-NEXT:    umov w7, v1.b[5]
+; CHECK-GI-NEXT:    fmov s3, w15
+; CHECK-GI-NEXT:    fmov s4, w17
+; CHECK-GI-NEXT:    fmov s5, w0
+; CHECK-GI-NEXT:    fmov s6, w2
+; CHECK-GI-NEXT:    fmov s7, w4
+; CHECK-GI-NEXT:    fmov s16, w6
+; CHECK-GI-NEXT:    fmov s17, w19
+; CHECK-GI-NEXT:    umov w10, v2.b[2]
+; CHECK-GI-NEXT:    umov w11, v2.b[6]
+; CHECK-GI-NEXT:    umov w12, v2.b[10]
+; CHECK-GI-NEXT:    umov w13, v2.b[14]
+; CHECK-GI-NEXT:    umov w14, v1.b[2]
+; CHECK-GI-NEXT:    umov w15, v1.b[6]
+; CHECK-GI-NEXT:    mov v3.s[1], w16
+; CHECK-GI-NEXT:    mov v4.s[1], w18
+; CHECK-GI-NEXT:    mov v5.s[1], w1
+; CHECK-GI-NEXT:    mov v6.s[1], w3
+; CHECK-GI-NEXT:    mov v7.s[1], w5
+; CHECK-GI-NEXT:    mov v16.s[1], w7
+; CHECK-GI-NEXT:    mov v17.s[1], wzr
 ; CHECK-GI-NEXT:    mov v0.s[1], wzr
-; CHECK-GI-NEXT:    umov w8, v1.b[3]
-; CHECK-GI-NEXT:    umov w10, v1.b[7]
-; CHECK-GI-NEXT:    umov w11, v1.b[11]
-; CHECK-GI-NEXT:    umov w14, v1.b[15]
-; CHECK-GI-NEXT:    mov v2.s[2], w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #3]
-; CHECK-GI-NEXT:    mov v3.s[2], w12
-; CHECK-GI-NEXT:    ldrb w12, [x0, #7]
-; CHECK-GI-NEXT:    mov v4.s[2], w13
-; CHECK-GI-NEXT:    mov v5.s[2], w15
-; CHECK-GI-NEXT:    mov v6.s[2], w16
-; CHECK-GI-NEXT:    mov v7.s[2], w17
-; CHECK-GI-NEXT:    mov v16.s[2], wzr
+; CHECK-GI-NEXT:    umov w8, v2.b[3]
+; CHECK-GI-NEXT:    umov w9, v2.b[7]
+; CHECK-GI-NEXT:    umov w16, v2.b[11]
+; CHECK-GI-NEXT:    umov w17, v2.b[15]
+; CHECK-GI-NEXT:    umov w18, v1.b[3]
+; CHECK-GI-NEXT:    umov w0, v1.b[7]
+; CHECK-GI-NEXT:    mov v3.s[2], w10
+; CHECK-GI-NEXT:    mov v4.s[2], w11
+; CHECK-GI-NEXT:    mov v5.s[2], w12
+; CHECK-GI-NEXT:    mov v6.s[2], w13
+; CHECK-GI-NEXT:    mov v7.s[2], w14
+; CHECK-GI-NEXT:    mov v16.s[2], w15
+; CHECK-GI-NEXT:    mov v17.s[2], wzr
 ; CHECK-GI-NEXT:    mov v0.s[2], wzr
-; CHECK-GI-NEXT:    mov v2.s[3], w8
-; CHECK-GI-NEXT:    mov v3.s[3], w10
-; CHECK-GI-NEXT:    mov v4.s[3], w11
-; CHECK-GI-NEXT:    mov v5.s[3], w14
-; CHECK-GI-NEXT:    mov v6.s[3], w9
-; CHECK-GI-NEXT:    mov v7.s[3], w12
-; CHECK-GI-NEXT:    mov v16.s[3], wzr
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mov v6.s[3], w17
+; CHECK-GI-NEXT:    mov v7.s[3], w18
+; CHECK-GI-NEXT:    mov v16.s[3], w0
+; CHECK-GI-NEXT:    mov v17.s[3], wzr
 ; CHECK-GI-NEXT:    mov v0.s[3], wzr
-; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
-; CHECK-GI-NEXT:    add v2.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT:    add v3.4s, v6.4s, v7.4s
-; CHECK-GI-NEXT:    add v0.4s, v16.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    add v2.4s, v5.4s, v6.4s
+; CHECK-GI-NEXT:    add v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    add v0.4s, v17.4s, v0.4s
 ; CHECK-GI-NEXT:    add v1.4s, v1.4s, v2.4s
 ; CHECK-GI-NEXT:    add v0.4s, v3.4s, v0.4s
 ; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ldr x19, [sp], #16 // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %0 = load <25 x i8>, ptr %a1
@@ -2554,128 +2580,145 @@ define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ;
 ; CHECK-GI-LABEL: test_sdot_v25i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldrsb w9, [x1, #16]!
-; CHECK-GI-NEXT:    ldrsb w11, [x1, #4]
-; CHECK-GI-NEXT:    ldrsb w12, [x1, #5]
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    ldp q1, q7, [x1]
 ; CHECK-GI-NEXT:    fmov s0, wzr
-; CHECK-GI-NEXT:    smov w13, v2.b[4]
-; CHECK-GI-NEXT:    smov w14, v2.b[5]
-; CHECK-GI-NEXT:    smov w10, v2.b[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    smov w9, v2.b[8]
-; CHECK-GI-NEXT:    fmov s5, w11
-; CHECK-GI-NEXT:    smov w11, v2.b[12]
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldrsb w8, [x1, #1]
-; CHECK-GI-NEXT:    mov v0.s[1], wzr
-; CHECK-GI-NEXT:    fmov s7, w13
-; CHECK-GI-NEXT:    fmov s4, w10
-; CHECK-GI-NEXT:    smov w10, v2.b[13]
-; CHECK-GI-NEXT:    mov v5.s[1], w12
-; CHECK-GI-NEXT:    smov w13, v2.b[9]
-; CHECK-GI-NEXT:    fmov s6, w9
-; CHECK-GI-NEXT:    fmov s16, w11
-; CHECK-GI-NEXT:    smov w9, v1.b[0]
-; CHECK-GI-NEXT:    mov v3.s[1], w8
-; CHECK-GI-NEXT:    mov v7.s[1], w14
-; CHECK-GI-NEXT:    smov w14, v2.b[6]
-; CHECK-GI-NEXT:    ldrsb w12, [x1, #6]
-; CHECK-GI-NEXT:    smov w8, v2.b[1]
-; CHECK-GI-NEXT:    smov w11, v2.b[2]
-; CHECK-GI-NEXT:    mov v0.s[2], wzr
-; CHECK-GI-NEXT:    mov v16.s[1], w10
-; CHECK-GI-NEXT:    smov w10, v2.b[14]
-; CHECK-GI-NEXT:    mov v5.s[2], w12
-; CHECK-GI-NEXT:    smov w12, v1.b[5]
-; CHECK-GI-NEXT:    mov v6.s[1], w13
-; CHECK-GI-NEXT:    fmov s17, w9
-; CHECK-GI-NEXT:    mov v7.s[2], w14
-; CHECK-GI-NEXT:    smov w14, v1.b[4]
-; CHECK-GI-NEXT:    smov w9, v2.b[10]
-; CHECK-GI-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NEXT:    smov w8, v1.b[1]
-; CHECK-GI-NEXT:    smov w13, v2.b[7]
-; CHECK-GI-NEXT:    mov v16.s[2], w10
-; CHECK-GI-NEXT:    smov w10, v2.b[15]
-; CHECK-GI-NEXT:    mov v0.s[3], wzr
-; CHECK-GI-NEXT:    fmov s18, w14
-; CHECK-GI-NEXT:    mov v6.s[2], w9
-; CHECK-GI-NEXT:    smov w9, v1.b[12]
-; CHECK-GI-NEXT:    mov v4.s[2], w11
-; CHECK-GI-NEXT:    ldrsb w11, [x1, #7]
-; CHECK-GI-NEXT:    mov v17.s[1], w8
-; CHECK-GI-NEXT:    ldrsb w8, [x1, #2]
-; CHECK-GI-NEXT:    mov v16.s[3], w10
-; CHECK-GI-NEXT:    smov w10, v1.b[13]
-; CHECK-GI-NEXT:    mov v18.s[1], w12
-; CHECK-GI-NEXT:    smov w12, v1.b[6]
-; CHECK-GI-NEXT:    mov v5.s[3], w11
-; CHECK-GI-NEXT:    ldrsb w11, [x0, #16]!
-; CHECK-GI-NEXT:    mov v7.s[3], w13
-; CHECK-GI-NEXT:    smov w13, v1.b[2]
-; CHECK-GI-NEXT:    fmov s20, w9
-; CHECK-GI-NEXT:    ldrsb w9, [x0, #5]
-; CHECK-GI-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NEXT:    smov w8, v1.b[8]
-; CHECK-GI-NEXT:    fmov s22, w11
-; CHECK-GI-NEXT:    mov v18.s[2], w12
-; CHECK-GI-NEXT:    ldrsb w12, [x0, #4]
-; CHECK-GI-NEXT:    smov w11, v2.b[3]
-; CHECK-GI-NEXT:    mov v20.s[1], w10
-; CHECK-GI-NEXT:    ldrsb w10, [x0, #8]
-; CHECK-GI-NEXT:    fmov s21, w12
-; CHECK-GI-NEXT:    ldrsb w12, [x1, #8]
-; CHECK-GI-NEXT:    mov v17.s[2], w13
-; CHECK-GI-NEXT:    smov w13, v1.b[9]
-; CHECK-GI-NEXT:    fmov s19, w8
-; CHECK-GI-NEXT:    smov w8, v1.b[14]
-; CHECK-GI-NEXT:    mul w10, w12, w10
-; CHECK-GI-NEXT:    smov w12, v1.b[7]
-; CHECK-GI-NEXT:    mov v4.s[3], w11
-; CHECK-GI-NEXT:    mov v21.s[1], w9
-; CHECK-GI-NEXT:    ldrsb w9, [x0, #6]
-; CHECK-GI-NEXT:    mov v19.s[1], w13
-; CHECK-GI-NEXT:    ldrsb w13, [x0, #1]
-; CHECK-GI-NEXT:    mov v20.s[2], w8
-; CHECK-GI-NEXT:    smov w8, v1.b[10]
-; CHECK-GI-NEXT:    mov v18.s[3], w12
-; CHECK-GI-NEXT:    ldrsb w12, [x0, #7]
-; CHECK-GI-NEXT:    mov v21.s[2], w9
-; CHECK-GI-NEXT:    smov w9, v2.b[11]
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    ldrsb w10, [x0, #2]
-; CHECK-GI-NEXT:    mov v22.s[1], w13
-; CHECK-GI-NEXT:    smov w13, v1.b[15]
-; CHECK-GI-NEXT:    mov v2.s[1], wzr
-; CHECK-GI-NEXT:    mov v19.s[2], w8
+; CHECK-GI-NEXT:    ldp q16, q3, [x0]
+; CHECK-GI-NEXT:    smov w9, v1.b[4]
+; CHECK-GI-NEXT:    smov w11, v1.b[5]
+; CHECK-GI-NEXT:    smov w18, v1.b[0]
+; CHECK-GI-NEXT:    smov w0, v1.b[12]
+; CHECK-GI-NEXT:    smov w3, v7.b[4]
+; CHECK-GI-NEXT:    smov w12, v1.b[1]
+; CHECK-GI-NEXT:    smov w13, v1.b[6]
+; CHECK-GI-NEXT:    smov w1, v1.b[13]
+; CHECK-GI-NEXT:    smov w4, v7.b[5]
+; CHECK-GI-NEXT:    smov w15, v1.b[2]
 ; CHECK-GI-NEXT:    smov w8, v1.b[3]
-; CHECK-GI-NEXT:    mov v21.s[3], w12
+; CHECK-GI-NEXT:    smov w16, v1.b[7]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    smov w14, v1.b[8]
+; CHECK-GI-NEXT:    smov w17, v1.b[9]
+; CHECK-GI-NEXT:    smov w10, v1.b[10]
+; CHECK-GI-NEXT:    smov w9, v1.b[11]
+; CHECK-GI-NEXT:    smov w5, v1.b[14]
+; CHECK-GI-NEXT:    smov w6, v7.b[0]
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    fmov s5, w3
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    smov w11, v1.b[15]
+; CHECK-GI-NEXT:    fmov s1, w18
+; CHECK-GI-NEXT:    smov w7, v7.b[1]
+; CHECK-GI-NEXT:    smov w18, v7.b[6]
+; CHECK-GI-NEXT:    smov w21, v16.b[4]
+; CHECK-GI-NEXT:    mov v4.s[1], w1
+; CHECK-GI-NEXT:    mov v5.s[1], w4
+; CHECK-GI-NEXT:    fmov s6, w14
+; CHECK-GI-NEXT:    mov v1.s[1], w12
+; CHECK-GI-NEXT:    smov w12, v7.b[3]
+; CHECK-GI-NEXT:    smov w14, v7.b[7]
+; CHECK-GI-NEXT:    mov v2.s[2], w13
+; CHECK-GI-NEXT:    smov w13, v7.b[2]
+; CHECK-GI-NEXT:    smov w0, v7.b[8]
+; CHECK-GI-NEXT:    fmov s7, w6
+; CHECK-GI-NEXT:    smov w23, v16.b[12]
+; CHECK-GI-NEXT:    smov w25, v3.b[4]
+; CHECK-GI-NEXT:    mov v6.s[1], w17
+; CHECK-GI-NEXT:    mov v4.s[2], w5
+; CHECK-GI-NEXT:    mov v5.s[2], w18
+; CHECK-GI-NEXT:    mov v1.s[2], w15
+; CHECK-GI-NEXT:    smov w6, v16.b[0]
+; CHECK-GI-NEXT:    smov w3, v16.b[1]
+; CHECK-GI-NEXT:    mov v2.s[3], w16
+; CHECK-GI-NEXT:    mov v7.s[1], w7
+; CHECK-GI-NEXT:    smov w16, v16.b[2]
+; CHECK-GI-NEXT:    smov w15, v16.b[3]
+; CHECK-GI-NEXT:    smov w22, v16.b[5]
+; CHECK-GI-NEXT:    smov w5, v16.b[6]
+; CHECK-GI-NEXT:    smov w18, v16.b[7]
+; CHECK-GI-NEXT:    smov w19, v16.b[8]
+; CHECK-GI-NEXT:    smov w7, v16.b[9]
+; CHECK-GI-NEXT:    smov w24, v16.b[13]
+; CHECK-GI-NEXT:    smov w1, v16.b[10]
+; CHECK-GI-NEXT:    smov w17, v16.b[11]
+; CHECK-GI-NEXT:    smov w20, v16.b[14]
+; CHECK-GI-NEXT:    smov w4, v16.b[15]
+; CHECK-GI-NEXT:    fmov s16, w21
+; CHECK-GI-NEXT:    smov w21, v3.b[8]
+; CHECK-GI-NEXT:    smov w26, v3.b[5]
+; CHECK-GI-NEXT:    fmov s17, w23
+; CHECK-GI-NEXT:    smov w23, v3.b[0]
+; CHECK-GI-NEXT:    fmov s18, w25
+; CHECK-GI-NEXT:    smov w25, v3.b[3]
+; CHECK-GI-NEXT:    mov v16.s[1], w22
+; CHECK-GI-NEXT:    smov w22, v3.b[1]
+; CHECK-GI-NEXT:    fmov s19, w6
+; CHECK-GI-NEXT:    mov v17.s[1], w24
+; CHECK-GI-NEXT:    smov w24, v3.b[2]
+; CHECK-GI-NEXT:    smov w6, v3.b[7]
+; CHECK-GI-NEXT:    mul w0, w0, w21
+; CHECK-GI-NEXT:    mov v18.s[1], w26
+; CHECK-GI-NEXT:    smov w26, v3.b[6]
+; CHECK-GI-NEXT:    fmov s3, w19
+; CHECK-GI-NEXT:    fmov s20, w23
+; CHECK-GI-NEXT:    mov v19.s[1], w3
+; CHECK-GI-NEXT:    mov v16.s[2], w5
+; CHECK-GI-NEXT:    mov v0.s[1], wzr
+; CHECK-GI-NEXT:    mov v6.s[2], w10
+; CHECK-GI-NEXT:    fmov s21, w0
+; CHECK-GI-NEXT:    mov v17.s[2], w20
+; CHECK-GI-NEXT:    mov v4.s[3], w11
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.s[1], w7
+; CHECK-GI-NEXT:    mov v20.s[1], w22
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v18.s[2], w26
+; CHECK-GI-NEXT:    mov v21.s[1], wzr
+; CHECK-GI-NEXT:    mov v16.s[3], w18
+; CHECK-GI-NEXT:    mov v17.s[3], w4
+; CHECK-GI-NEXT:    mov v7.s[2], w13
+; CHECK-GI-NEXT:    mov v5.s[3], w14
+; CHECK-GI-NEXT:    mov v19.s[2], w16
+; CHECK-GI-NEXT:    mov v3.s[2], w1
+; CHECK-GI-NEXT:    mov v0.s[2], wzr
+; CHECK-GI-NEXT:    mov v20.s[2], w24
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v18.s[3], w6
+; CHECK-GI-NEXT:    mov v21.s[2], wzr
+; CHECK-GI-NEXT:    mul v2.4s, v2.4s, v16.4s
+; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    mov v6.s[3], w9
-; CHECK-GI-NEXT:    ldrsb w9, [x0, #3]
-; CHECK-GI-NEXT:    mov v20.s[3], w13
-; CHECK-GI-NEXT:    smov w13, v1.b[11]
-; CHECK-GI-NEXT:    mov v22.s[2], w10
-; CHECK-GI-NEXT:    ldrsb w10, [x1, #3]
-; CHECK-GI-NEXT:    mul v1.4s, v7.4s, v18.4s
-; CHECK-GI-NEXT:    mov v2.s[2], wzr
-; CHECK-GI-NEXT:    mov v17.s[3], w8
-; CHECK-GI-NEXT:    mov v3.s[3], w10
-; CHECK-GI-NEXT:    mul v5.4s, v5.4s, v21.4s
-; CHECK-GI-NEXT:    mov v19.s[3], w13
-; CHECK-GI-NEXT:    mul v7.4s, v16.4s, v20.4s
-; CHECK-GI-NEXT:    mov v22.s[3], w9
-; CHECK-GI-NEXT:    mov v2.s[3], wzr
-; CHECK-GI-NEXT:    mla v1.4s, v4.4s, v17.4s
-; CHECK-GI-NEXT:    mla v7.4s, v6.4s, v19.4s
-; CHECK-GI-NEXT:    mla v5.4s, v3.4s, v22.4s
-; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v1.4s, v7.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w12
+; CHECK-GI-NEXT:    mov v19.s[3], w15
+; CHECK-GI-NEXT:    mov v3.s[3], w17
+; CHECK-GI-NEXT:    mov v20.s[3], w25
+; CHECK-GI-NEXT:    mov v0.s[3], wzr
+; CHECK-GI-NEXT:    mul v5.4s, v5.4s, v18.4s
+; CHECK-GI-NEXT:    mov v21.s[3], wzr
+; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v19.4s
+; CHECK-GI-NEXT:    mla v4.4s, v6.4s, v3.4s
+; CHECK-GI-NEXT:    mla v5.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT:    add v0.4s, v21.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v4.4s
 ; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w2
+; CHECK-GI-NEXT:    ldp x26, x25, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %0 = load <25 x i8>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 7a436eddb23a..5e278d59b659 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    sub v0.4h, v0.4h, v1.4h
@@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    sub v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]

From 32dbaf12be54b028c9b9e6cf0fde27fa9a81a335 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Sat, 21 Jun 2025 09:43:34 +0200
Subject: [PATCH 1144/1322] [mlir][Transforms][NFC] Dialect conversion: Rename
 internal functions (#145018)

Rename a few internal functions: drop the `notify` prefix, which
incorrectly suggests that the function is a listener callback function.
---
 .../Transforms/Utils/DialectConversion.cpp    | 61 +++++++++++--------
 1 file changed, 34 insertions(+), 27 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 3b669f51a615..ff48647f4330 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -896,7 +896,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   bool wasOpReplaced(Operation *op) const;
 
   //===--------------------------------------------------------------------===//
-  // Type Conversion
+  // IR Rewrites / Type Conversion
   //===--------------------------------------------------------------------===//
 
   /// Convert the types of block arguments within the given region.
@@ -916,6 +916,22 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
       const TypeConverter *converter,
       TypeConverter::SignatureConversion &signatureConversion);
 
+  /// Replace the results of the given operation with the given values and
+  /// erase the operation.
+  ///
+  /// There can be multiple replacement values for each result (1:N
+  /// replacement). If the replacement values are empty, the respective result
+  /// is dropped and a source materialization is built if the result still has
+  /// uses.
+  void replaceOp(Operation *op, SmallVector<SmallVector<Value>> &&newValues);
+
+  /// Erase the given block and its contents.
+  void eraseBlock(Block *block);
+
+  /// Inline the source block into the destination block before the given
+  /// iterator.
+  void inlineBlockBefore(Block *source, Block *dest, Block::iterator before);
+
   //===--------------------------------------------------------------------===//
   // Materializations
   //===--------------------------------------------------------------------===//
@@ -952,21 +968,10 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   void notifyOperationInserted(Operation *op,
                                OpBuilder::InsertPoint previous) override;
 
-  /// Notifies that an op is about to be replaced with the given values.
-  void notifyOpReplaced(Operation *op,
-                        SmallVector<SmallVector<Value>> &&newValues);
-
-  /// Notifies that a block is about to be erased.
-  void notifyBlockIsBeingErased(Block *block);
-
   /// Notifies that a block was inserted.
   void notifyBlockInserted(Block *block, Region *previous,
                            Region::iterator previousIt) override;
 
-  /// Notifies that a block is being inlined into another block.
-  void notifyBlockBeingInlined(Block *block, Block *srcBlock,
-                               Block::iterator before);
-
   /// Notifies that a pattern match failed for the given reason.
   void
   notifyMatchFailure(Location loc,
@@ -1548,7 +1553,7 @@ void ConversionPatternRewriterImpl::notifyOperationInserted(
   appendRewrite<MoveOperationRewrite>(op, previous.getBlock(), prevOp);
 }
 
-void ConversionPatternRewriterImpl::notifyOpReplaced(
+void ConversionPatternRewriterImpl::replaceOp(
     Operation *op, SmallVector<SmallVector<Value>> &&newValues) {
   assert(newValues.size() == op->getNumResults());
   assert(!ignoredOps.contains(op) && "operation was already replaced");
@@ -1599,8 +1604,14 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(
   op->walk([&](Operation *op) { replacedOps.insert(op); });
 }
 
-void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) {
+void ConversionPatternRewriterImpl::eraseBlock(Block *block) {
   appendRewrite<EraseBlockRewrite>(block);
+
+  // Unlink the block from its parent region. The block is kept in the rewrite
+  // object and will be actually destroyed when rewrites are applied. This
+  // allows us to keep the operations in the block live and undo the removal by
+  // re-inserting the block.
+  block->getParent()->getBlocks().remove(block);
 }
 
 void ConversionPatternRewriterImpl::notifyBlockInserted(
@@ -1628,9 +1639,10 @@ void ConversionPatternRewriterImpl::notifyBlockInserted(
   appendRewrite<MoveBlockRewrite>(block, previous, prevBlock);
 }
 
-void ConversionPatternRewriterImpl::notifyBlockBeingInlined(
-    Block *block, Block *srcBlock, Block::iterator before) {
-  appendRewrite<InlineBlockRewrite>(block, srcBlock, before);
+void ConversionPatternRewriterImpl::inlineBlockBefore(Block *source,
+                                                      Block *dest,
+                                                      Block::iterator before) {
+  appendRewrite<InlineBlockRewrite>(dest, source, before);
 }
 
 void ConversionPatternRewriterImpl::notifyMatchFailure(
@@ -1673,7 +1685,7 @@ void ConversionPatternRewriter::replaceOp(Operation *op, ValueRange newValues) {
       llvm::map_to_vector(newValues, [](Value v) -> SmallVector<Value> {
         return v ? SmallVector<Value>{v} : SmallVector<Value>();
       });
-  impl->notifyOpReplaced(op, std::move(newVals));
+  impl->replaceOp(op, std::move(newVals));
 }
 
 void ConversionPatternRewriter::replaceOpWithMultiple(
@@ -1684,7 +1696,7 @@ void ConversionPatternRewriter::replaceOpWithMultiple(
     impl->logger.startLine()
         << "** Replace : '" << op->getName() << "'(" << op << ")\n";
   });
-  impl->notifyOpReplaced(op, std::move(newValues));
+  impl->replaceOp(op, std::move(newValues));
 }
 
 void ConversionPatternRewriter::eraseOp(Operation *op) {
@@ -1693,7 +1705,7 @@ void ConversionPatternRewriter::eraseOp(Operation *op) {
         << "** Erase   : '" << op->getName() << "'(" << op << ")\n";
   });
   SmallVector<SmallVector<Value>> nullRepls(op->getNumResults(), {});
-  impl->notifyOpReplaced(op, std::move(nullRepls));
+  impl->replaceOp(op, std::move(nullRepls));
 }
 
 void ConversionPatternRewriter::eraseBlock(Block *block) {
@@ -1704,12 +1716,7 @@ void ConversionPatternRewriter::eraseBlock(Block *block) {
   for (Operation &op : *block)
     eraseOp(&op);
 
-  // Unlink the block from its parent region. The block is kept in the rewrite
-  // object and will be actually destroyed when rewrites are applied. This
-  // allows us to keep the operations in the block live and undo the removal by
-  // re-inserting the block.
-  impl->notifyBlockIsBeingErased(block);
-  block->getParent()->getBlocks().remove(block);
+  impl->eraseBlock(block);
 }
 
 Block *ConversionPatternRewriter::applySignatureConversion(
@@ -1797,7 +1804,7 @@ void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest,
   bool fastPath = !impl->config.listener;
 
   if (fastPath)
-    impl->notifyBlockBeingInlined(dest, source, before);
+    impl->inlineBlockBefore(source, dest, before);
 
   // Replace all uses of block arguments.
   for (auto it : llvm::zip(source->getArguments(), argValues))

From 2050d2e1815b4b306f4b3842d6ac0fa73e1d4085 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Sat, 21 Jun 2025 09:47:54 +0200
Subject: [PATCH 1145/1322] [libc++] Simplify std::function further after
 removing allocator support (#144443)

Since we've removed allocator support, we can remove a few support
structures. This only affects the policy implementation, so this
shouldn't even be ABI sensitive.
---
 libcxx/include/__functional/function.h | 127 +++++++------------------
 1 file changed, 37 insertions(+), 90 deletions(-)

diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index e71c778386fd..dc112ebfd0fa 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -125,31 +125,6 @@ _LIBCPP_HIDE_FROM_ABI bool __not_null(_Rp (^__p)(_Args...)) {
 
 namespace __function {
 
-template <class _Fp, class _FB>
-class __default_alloc_func;
-
-template <class _Fp, class _Rp, class... _ArgTypes>
-class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> {
-  _Fp __f_;
-
-public:
-  using _Target _LIBCPP_NODEBUG = _Fp;
-
-  _LIBCPP_HIDE_FROM_ABI const _Target& __target() const { return __f_; }
-
-  _LIBCPP_HIDE_FROM_ABI explicit __default_alloc_func(_Target&& __f) : __f_(std::move(__f)) {}
-
-  _LIBCPP_HIDE_FROM_ABI explicit __default_alloc_func(const _Target& __f) : __f_(__f) {}
-
-  _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... __arg) {
-    return std::__invoke_r<_Rp>(__f_, std::forward<_ArgTypes>(__arg)...);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI __default_alloc_func* __clone() const { return new __default_alloc_func(__f_); }
-
-  _LIBCPP_HIDE_FROM_ABI void destroy() _NOEXCEPT { __f_.~_Target(); }
-};
-
 // __base provides an abstract interface for copyable functors.
 
 template <class _Fp>
@@ -402,7 +377,7 @@ private:
   template <typename _Fun>
   _LIBCPP_HIDE_FROM_ABI static void* __large_clone(const void* __s) {
     const _Fun* __f = static_cast<const _Fun*>(__s);
-    return __f->__clone();
+    return new _Fun(*__f);
   }
 
   template <typename _Fun>
@@ -417,7 +392,7 @@ private:
         std::addressof(__large_destroy<_Fun>),
         false,
 #  if _LIBCPP_HAS_RTTI
-        &typeid(typename _Fun::_Target)
+        &typeid(_Fun)
 #  else
         nullptr
 #  endif
@@ -432,7 +407,7 @@ private:
         nullptr,
         false,
 #  if _LIBCPP_HAS_RTTI
-        &typeid(typename _Fun::_Target)
+        &typeid(_Fun)
 #  else
         nullptr
 #  endif
@@ -446,42 +421,7 @@ private:
 template <typename _Tp>
 using __fast_forward _LIBCPP_NODEBUG = __conditional_t<is_scalar<_Tp>::value, _Tp, _Tp&&>;
 
-// __policy_invoker calls an instance of __default_alloc_func held in __policy_storage.
-
-template <class _Fp>
-struct __policy_invoker;
-
-template <class _Rp, class... _ArgTypes>
-struct __policy_invoker<_Rp(_ArgTypes...)> {
-  typedef _Rp (*__Call)(const __policy_storage*, __fast_forward<_ArgTypes>...);
-
-  __Call __call_;
-
-  // Creates an invoker that throws bad_function_call.
-  _LIBCPP_HIDE_FROM_ABI __policy_invoker() : __call_(&__call_empty) {}
-
-  // Creates an invoker that calls the given instance of __func.
-  template <typename _Fun>
-  _LIBCPP_HIDE_FROM_ABI static __policy_invoker __create() {
-    return __policy_invoker(std::addressof(__call_impl<_Fun>));
-  }
-
-private:
-  _LIBCPP_HIDE_FROM_ABI explicit __policy_invoker(__Call __c) : __call_(__c) {}
-
-  _LIBCPP_HIDE_FROM_ABI static _Rp __call_empty(const __policy_storage*, __fast_forward<_ArgTypes>...) {
-    std::__throw_bad_function_call();
-  }
-
-  template <typename _Fun>
-  _LIBCPP_HIDE_FROM_ABI static _Rp __call_impl(const __policy_storage* __buf, __fast_forward<_ArgTypes>... __args) {
-    _Fun* __f = reinterpret_cast<_Fun*>(__use_small_storage<_Fun>::value ? &__buf->__small : __buf->__large);
-    return (*__f)(std::forward<_ArgTypes>(__args)...);
-  }
-};
-
-// __policy_func uses a __policy and __policy_invoker to create a type-erased,
-// copyable functor.
+// __policy_func uses a __policy to create a type-erased, copyable functor.
 
 template <class _Fp>
 class __policy_func;
@@ -491,45 +431,52 @@ class __policy_func<_Rp(_ArgTypes...)> {
   // Inline storage for small objects.
   __policy_storage __buf_;
 
-  // Calls the value stored in __buf_. This could technically be part of
-  // policy, but storing it here eliminates a level of indirection inside
-  // operator().
-  typedef __function::__policy_invoker<_Rp(_ArgTypes...)> __invoker;
-  __invoker __invoker_;
+  using _ErasedFunc _LIBCPP_NODEBUG = _Rp(const __policy_storage*, __fast_forward<_ArgTypes>...);
+
+  _ErasedFunc* __func_;
 
   // The policy that describes how to move / copy / destroy __buf_. Never
   // null, even if the function is empty.
   const __policy* __policy_;
 
+  _LIBCPP_HIDE_FROM_ABI static _Rp __empty_func(const __policy_storage*, __fast_forward<_ArgTypes>...) {
+    std::__throw_bad_function_call();
+  }
+
+  template <class _Fun>
+  _LIBCPP_HIDE_FROM_ABI static _Rp __call_func(const __policy_storage* __buf, __fast_forward<_ArgTypes>... __args) {
+    _Fun* __func = reinterpret_cast<_Fun*>(__use_small_storage<_Fun>::value ? &__buf->__small : __buf->__large);
+
+    return std::__invoke_r<_Rp>(*__func, std::forward<_ArgTypes>(__args)...);
+  }
+
 public:
-  _LIBCPP_HIDE_FROM_ABI __policy_func() : __policy_(__policy::__create_empty()) {}
+  _LIBCPP_HIDE_FROM_ABI __policy_func() : __func_(__empty_func), __policy_(__policy::__create_empty()) {}
 
   template <class _Fp, __enable_if_t<!is_same<__decay_t<_Fp>, __policy_func>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI explicit __policy_func(_Fp&& __f) : __policy_(__policy::__create_empty()) {
-    typedef __default_alloc_func<_Fp, _Rp(_ArgTypes...)> _Fun;
-
     if (__function::__not_null(__f)) {
-      __invoker_ = __invoker::template __create<_Fun>();
-      __policy_  = __policy::__create<_Fun>();
-      if (__use_small_storage<_Fun>()) {
-        ::new ((void*)&__buf_.__small) _Fun(std::move(__f));
+      __func_   = __call_func<_Fp>;
+      __policy_ = __policy::__create<_Fp>();
+      if (__use_small_storage<_Fp>()) {
+        ::new ((void*)&__buf_.__small) _Fp(std::move(__f));
       } else {
-        __buf_.__large = ::new _Fun(std::move(__f));
+        __buf_.__large = ::new _Fp(std::move(__f));
       }
     }
   }
 
   _LIBCPP_HIDE_FROM_ABI __policy_func(const __policy_func& __f)
-      : __buf_(__f.__buf_), __invoker_(__f.__invoker_), __policy_(__f.__policy_) {
+      : __buf_(__f.__buf_), __func_(__f.__func_), __policy_(__f.__policy_) {
     if (__policy_->__clone)
       __buf_.__large = __policy_->__clone(__f.__buf_.__large);
   }
 
   _LIBCPP_HIDE_FROM_ABI __policy_func(__policy_func&& __f)
-      : __buf_(__f.__buf_), __invoker_(__f.__invoker_), __policy_(__f.__policy_) {
+      : __buf_(__f.__buf_), __func_(__f.__func_), __policy_(__f.__policy_) {
     if (__policy_->__destroy) {
-      __f.__policy_  = __policy::__create_empty();
-      __f.__invoker_ = __invoker();
+      __f.__policy_ = __policy::__create_empty();
+      __f.__func_   = {};
     }
   }
 
@@ -539,30 +486,30 @@ public:
   }
 
   _LIBCPP_HIDE_FROM_ABI __policy_func& operator=(__policy_func&& __f) {
-    *this          = nullptr;
-    __buf_         = __f.__buf_;
-    __invoker_     = __f.__invoker_;
-    __policy_      = __f.__policy_;
-    __f.__policy_  = __policy::__create_empty();
-    __f.__invoker_ = __invoker();
+    *this         = nullptr;
+    __buf_        = __f.__buf_;
+    __func_       = __f.__func_;
+    __policy_     = __f.__policy_;
+    __f.__policy_ = __policy::__create_empty();
+    __f.__func_   = {};
     return *this;
   }
 
   _LIBCPP_HIDE_FROM_ABI __policy_func& operator=(nullptr_t) {
     const __policy* __p = __policy_;
     __policy_           = __policy::__create_empty();
-    __invoker_          = __invoker();
+    __func_             = {};
     if (__p->__destroy)
       __p->__destroy(__buf_.__large);
     return *this;
   }
 
   _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... __args) const {
-    return __invoker_.__call_(std::addressof(__buf_), std::forward<_ArgTypes>(__args)...);
+    return __func_(std::addressof(__buf_), std::forward<_ArgTypes>(__args)...);
   }
 
   _LIBCPP_HIDE_FROM_ABI void swap(__policy_func& __f) {
-    std::swap(__invoker_, __f.__invoker_);
+    std::swap(__func_, __f.__func_);
     std::swap(__policy_, __f.__policy_);
     std::swap(__buf_, __f.__buf_);
   }

From 685af55fe004b0d904c3de1c28fdebbeee15d0a4 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Sat, 21 Jun 2025 09:58:45 +0200
Subject: [PATCH 1146/1322] [libc++] Simplify <limits> a bit (#140021)

This does a few small things:
- inline `__libcpp_compute_min`, since we can don't have to put the
arithmetic behind a constraint. Simple arithmetic also tends to be
faster to compile than instantiating a type.
- Remove an unused include (and add missing includes elsewhere)
- Remove `__min` and `__max` from the `bool` specialization

Co-authored-by: Louis Dionne <ldionne.2@gmail.com>
---
 libcxx/include/__numeric/gcd_lcm.h |  2 +-
 libcxx/include/forward_list        |  1 +
 libcxx/include/limits              | 19 +++----------------
 3 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/libcxx/include/__numeric/gcd_lcm.h b/libcxx/include/__numeric/gcd_lcm.h
index ce58f8698f72..95df54dc066d 100644
--- a/libcxx/include/__numeric/gcd_lcm.h
+++ b/libcxx/include/__numeric/gcd_lcm.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___NUMERIC_GCD_LCM_H
 #define _LIBCPP___NUMERIC_GCD_LCM_H
 
-#include <__algorithm/min.h>
 #include <__assert>
 #include <__bit/countr.h>
 #include <__config>
@@ -20,6 +19,7 @@
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_signed.h>
 #include <__type_traits/make_unsigned.h>
+#include <__type_traits/remove_cv.h>
 #include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index bad0c11b7c7e..6daa7fbbc03c 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -233,6 +233,7 @@ template <class T, class Allocator, class Predicate>
 #  include <__type_traits/is_pointer.h>
 #  include <__type_traits/is_same.h>
 #  include <__type_traits/is_swappable.h>
+#  include <__type_traits/remove_cv.h>
 #  include <__type_traits/type_identity.h>
 #  include <__utility/forward.h>
 #  include <__utility/move.h>
diff --git a/libcxx/include/limits b/libcxx/include/limits
index f5d16523763b..1205e6a0c278 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -108,7 +108,6 @@ template<> class numeric_limits<cv long double>;
 #  include <__config>
 #  include <__type_traits/is_arithmetic.h>
 #  include <__type_traits/is_signed.h>
-#  include <__type_traits/remove_cv.h>
 
 #  if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #    pragma GCC system_header
@@ -178,16 +177,6 @@ protected:
   static _LIBCPP_CONSTEXPR const float_round_style round_style = round_toward_zero;
 };
 
-template <class _Tp, int __digits, bool _IsSigned>
-struct __libcpp_compute_min {
-  static _LIBCPP_CONSTEXPR const _Tp value = _Tp(_Tp(1) << __digits);
-};
-
-template <class _Tp, int __digits>
-struct __libcpp_compute_min<_Tp, __digits, false> {
-  static _LIBCPP_CONSTEXPR const _Tp value = _Tp(0);
-};
-
 template <class _Tp>
 class __libcpp_numeric_limits<_Tp, true> {
 protected:
@@ -199,7 +188,7 @@ protected:
   static _LIBCPP_CONSTEXPR const int digits       = static_cast<int>(sizeof(type) * __CHAR_BIT__ - is_signed);
   static _LIBCPP_CONSTEXPR const int digits10     = digits * 3 / 10;
   static _LIBCPP_CONSTEXPR const int max_digits10 = 0;
-  static _LIBCPP_CONSTEXPR const type __min       = __libcpp_compute_min<type, digits, is_signed>::value;
+  static _LIBCPP_CONSTEXPR const type __min       = is_signed ? _Tp(_Tp(1) << digits) : 0;
   static _LIBCPP_CONSTEXPR const type __max       = is_signed ? type(type(~0) ^ __min) : type(~0);
   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
@@ -250,10 +239,8 @@ protected:
   static _LIBCPP_CONSTEXPR const int digits       = 1;
   static _LIBCPP_CONSTEXPR const int digits10     = 0;
   static _LIBCPP_CONSTEXPR const int max_digits10 = 0;
-  static _LIBCPP_CONSTEXPR const type __min       = false;
-  static _LIBCPP_CONSTEXPR const type __max       = true;
-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
+  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return false; }
+  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return true; }
   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = true;

From c69f97cb95a2579fc561af30624d744811d4d287 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot@gmail.com>
Date: Sat, 21 Jun 2025 08:51:07 +0300
Subject: [PATCH 1147/1322] [Clang] Add WG21 papers from the Sofia meeting to
 the status page

---
 clang/www/cxx_status.html | 42 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index dff57689e84b..a70e65e35d5e 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -292,6 +292,48 @@ C++23, informally referred to as C++26.</p>
   <td><a href="https://wg21.link/P1967">P1967R14</a></td>
   <td class="none" align="center">No</td>
  </tr>
+ <!-- Bulgaria, Spring 2025 -->
+ <tr>
+  <td rowspan="5">Reflection</td>
+  <td><a href="https://wg21.link/P2996">P2996R13</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td><a href="https://wg21.link/P3394">P3394R4</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td><a href="https://wg21.link/P3293">P3293R3</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td><a href="https://wg21.link/P3491">P3491R3</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td><a href="https://wg21.link/P3096">P3096R12</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td>Attaching main to the global module</td>
+  <td><a href="https://wg21.link/P3618">P3618R0</a> (<a href="#dr">DR</a>)</td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td>Expansion Statements</td>
+  <td><a href="https://wg21.link/P1306">P1306R5</a></td>
+  <td class="none" align="center">No</td>
+  </tr>
+  <tr>
+   <td>constexpr virtual inheritance</td>
+   <td><a href="https://wg21.link/P3533">P3533R2</a></td>
+   <td class="none" align="center">No</td>
+  </tr>
+  <tr>
+   <td>Preprocessing is never undefined</td>
+   <td><a href="https://wg21.link/P2843">P2843R3</a></td>
+   <td class="none" align="center">No</td>
+  </tr>
 </table>
 </details>
 

From 5f2135df171e96217dc9581d358f467eb2fc067b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 21 Jun 2025 17:25:30 +0900
Subject: [PATCH 1148/1322] AMDGPU: Really delete AMDGPUAnnotateKernelFeatures
 (#145136)

---
 .../AMDGPU/AMDGPUAnnotateKernelFeatures.cpp   | 128 ------------------
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   3 -
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 -
 .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn |   1 -
 4 files changed, 133 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
deleted file mode 100644
index 9c9fa5c6e2f0..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file This pass propagates the uniform-work-group-size attribute from
-/// kernels to leaf functions when possible. It also adds additional attributes
-/// to hint ABI lowering optimizations later.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/Target/TargetMachine.h"
-
-#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
-
-using namespace llvm;
-
-namespace {
-class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
-private:
-  const TargetMachine *TM = nullptr;
-
-  bool addFeatureAttributes(Function &F);
-
-public:
-  static char ID;
-
-  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
-
-  bool doInitialization(CallGraph &CG) override;
-  bool runOnSCC(CallGraphSCC &SCC) override;
-
-  StringRef getPassName() const override {
-    return "AMDGPU Annotate Kernel Features";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-    CallGraphSCCPass::getAnalysisUsage(AU);
-  }
-};
-
-} // end anonymous namespace
-
-char AMDGPUAnnotateKernelFeatures::ID = 0;
-
-bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
-  bool HaveStackObjects = false;
-  bool Changed = false;
-  bool HaveCall = false;
-  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
-
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (isa<AllocaInst>(I)) {
-        HaveStackObjects = true;
-        continue;
-      }
-
-      if (auto *CB = dyn_cast<CallBase>(&I)) {
-        const Function *Callee =
-            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
-
-        // Note the occurrence of indirect call.
-        if (!Callee) {
-          if (!CB->isInlineAsm())
-            HaveCall = true;
-
-          continue;
-        }
-
-        Intrinsic::ID IID = Callee->getIntrinsicID();
-        if (IID == Intrinsic::not_intrinsic) {
-          HaveCall = true;
-          Changed = true;
-        }
-      }
-    }
-  }
-
-  // TODO: We could refine this to captured pointers that could possibly be
-  // accessed by flat instructions. For now this is mostly a poor way of
-  // estimating whether there are calls before argument lowering.
-  if (!IsFunc && HaveCall) {
-    F.addFnAttr("amdgpu-calls");
-    Changed = true;
-  }
-
-  if (HaveStackObjects) {
-    F.addFnAttr("amdgpu-stack-objects");
-    Changed = true;
-  }
-
-  return Changed;
-}
-
-bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
-  bool Changed = false;
-
-  for (CallGraphNode *I : SCC) {
-    Function *F = I->getFunction();
-    // Ignore functions with graphics calling conventions, these are currently
-    // not allowed to have kernel arguments.
-    if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
-      continue;
-    // Add feature attributes
-    Changed |= addFeatureAttributes(*F);
-  }
-
-  return Changed;
-}
-
-bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
-  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-  if (!TPC)
-    report_fatal_error("TargetMachine is required");
-
-  TM = &TPC->getTM<TargetMachine>();
-  return false;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 1ea7dd01d15c..c28f407391c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -2073,9 +2073,6 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
 }
 
 void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
-  // AMDGPUAnnotateKernelFeaturesPass is missing here, but it will hopefully be
-  // deleted soon.
-
   if (TM.getOptLevel() > CodeGenOptLevel::None)
     addPass(AMDGPUPreloadKernelArgumentsPass(TM));
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c6d70ee39202..928a5001e0c9 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -43,7 +43,6 @@ add_public_tablegen_target(InstCombineTableGen)
 add_llvm_target(AMDGPUCodeGen
   AMDGPUAliasAnalysis.cpp
   AMDGPUAlwaysInlinePass.cpp
-  AMDGPUAnnotateKernelFeatures.cpp
   AMDGPUAnnotateUniformValues.cpp
   AMDGPUArgumentUsageInfo.cpp
   AMDGPUAsanInstrumentation.cpp
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index facdde20454d..628e544a687e 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -131,7 +131,6 @@ static_library("LLVMAMDGPUCodeGen") {
   sources = [
     "AMDGPUAliasAnalysis.cpp",
     "AMDGPUAlwaysInlinePass.cpp",
-    "AMDGPUAnnotateKernelFeatures.cpp",
     "AMDGPUAnnotateUniformValues.cpp",
     "AMDGPUArgumentUsageInfo.cpp",
     "AMDGPUAsanInstrumentation.cpp",

From 4a4582dd788b8bb7dab919fe4b76807ad9c7ed48 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Sat, 21 Jun 2025 10:30:59 +0200
Subject: [PATCH 1149/1322] [CIR] Upstream BinAssign for ComplexType (#144868)

This change adds support for the BinAssign op and LValueToRValue for
ComplexType

https://github.com/llvm/llvm-project/issues/141365
---
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp        |  5 +-
 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 98 +++++++++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenFunction.h      |  2 +
 clang/lib/CIR/CodeGen/CIRGenValue.h         |  4 +-
 clang/test/CIR/CodeGen/complex.cpp          | 26 ++++++
 5 files changed, 124 insertions(+), 11 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 4f2046ad26d7..c31754dc11d6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -997,10 +997,9 @@ LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) {
   }
 
   case cir::TEK_Complex: {
-    assert(!cir::MissingFeatures::complexType());
-    cgm.errorNYI(e->getSourceRange(), "complex l-values");
-    return {};
+    return emitComplexAssignmentLValue(e);
   }
+
   case cir::TEK_Aggregate:
     cgm.errorNYI(e->getSourceRange(), "aggregate lvalues");
     return {};
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
index 12e8e27948cf..eaa199abc165 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -19,6 +19,13 @@ public:
   //                               Utilities
   //===--------------------------------------------------------------------===//
 
+  LValue emitBinAssignLValue(const BinaryOperator *e, mlir::Value &val);
+
+  mlir::Value emitCast(CastKind ck, Expr *op, QualType destTy);
+
+  mlir::Value emitConstant(const CIRGenFunction::ConstantEmission &constant,
+                           Expr *e);
+
   /// Given an expression with complex type that represents a value l-value,
   /// this method emits the address of the l-value, then loads and returns the
   /// result.
@@ -27,18 +34,18 @@ public:
   }
 
   mlir::Value emitLoadOfLValue(LValue lv, SourceLocation loc);
-
   /// Store the specified real/imag parts into the
   /// specified value pointer.
   void emitStoreOfComplex(mlir::Location loc, mlir::Value val, LValue lv,
                           bool isInit);
 
+  mlir::Value VisitBinAssign(const BinaryOperator *e);
   mlir::Value VisitCallExpr(const CallExpr *e);
-  mlir::Value VisitInitListExpr(InitListExpr *e);
-
+  mlir::Value VisitDeclRefExpr(DeclRefExpr *e);
+  mlir::Value VisitImplicitCastExpr(ImplicitCastExpr *e);
+  mlir::Value VisitInitListExpr(const InitListExpr *e);
   mlir::Value VisitImaginaryLiteral(const ImaginaryLiteral *il);
 };
-
 } // namespace
 
 static const ComplexType *getComplexType(QualType type) {
@@ -48,6 +55,46 @@ static const ComplexType *getComplexType(QualType type) {
   return cast<ComplexType>(cast<AtomicType>(type)->getValueType());
 }
 
+LValue ComplexExprEmitter::emitBinAssignLValue(const BinaryOperator *e,
+                                               mlir::Value &value) {
+  assert(cgf.getContext().hasSameUnqualifiedType(e->getLHS()->getType(),
+                                                 e->getRHS()->getType()) &&
+         "Invalid assignment");
+
+  // Emit the RHS.  __block variables need the RHS evaluated first.
+  value = Visit(e->getRHS());
+
+  // Compute the address to store into.
+  LValue lhs = cgf.emitLValue(e->getLHS());
+
+  // Store the result value into the LHS lvalue.
+  emitStoreOfComplex(cgf.getLoc(e->getExprLoc()), value, lhs, /*isInit*/ false);
+  return lhs;
+}
+
+mlir::Value ComplexExprEmitter::emitCast(CastKind ck, Expr *op,
+                                         QualType destTy) {
+  switch (ck) {
+  case CK_LValueToRValue:
+    return Visit(op);
+  default:
+    cgf.cgm.errorNYI("ComplexType Cast");
+    break;
+  }
+  return {};
+}
+
+mlir::Value ComplexExprEmitter::emitConstant(
+    const CIRGenFunction::ConstantEmission &constant, Expr *e) {
+  assert(constant && "not a constant");
+  if (constant.isReference())
+    return emitLoadOfLValue(constant.getReferenceLValue(cgf, e),
+                            e->getExprLoc());
+
+  mlir::TypedAttr valueAttr = constant.getValue();
+  return builder.getConstant(cgf.getLoc(e->getSourceRange()), valueAttr);
+}
+
 mlir::Value ComplexExprEmitter::emitLoadOfLValue(LValue lv,
                                                  SourceLocation loc) {
   assert(lv.isSimple() && "non-simple complex l-value?");
@@ -70,6 +117,22 @@ void ComplexExprEmitter::emitStoreOfComplex(mlir::Location loc, mlir::Value val,
   builder.createStore(loc, val, destAddr);
 }
 
+mlir::Value ComplexExprEmitter::VisitBinAssign(const BinaryOperator *e) {
+  mlir::Value value;
+  LValue lv = emitBinAssignLValue(e, value);
+
+  // The result of an assignment in C is the assigned r-value.
+  if (!cgf.getLangOpts().CPlusPlus)
+    return value;
+
+  // If the lvalue is non-volatile, return the computed value of the
+  // assignment.
+  if (!lv.isVolatile())
+    return value;
+
+  return emitLoadOfLValue(lv, e->getExprLoc());
+}
+
 mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) {
   if (e->getCallReturnType(cgf.getContext())->isReferenceType())
     return emitLoadOfLValue(e);
@@ -77,7 +140,21 @@ mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) {
   return cgf.emitCallExpr(e).getValue();
 }
 
-mlir::Value ComplexExprEmitter::VisitInitListExpr(InitListExpr *e) {
+mlir::Value ComplexExprEmitter::VisitDeclRefExpr(DeclRefExpr *e) {
+  if (CIRGenFunction::ConstantEmission constant = cgf.tryEmitAsConstant(e))
+    return emitConstant(constant, e);
+  return emitLoadOfLValue(e);
+}
+
+mlir::Value ComplexExprEmitter::VisitImplicitCastExpr(ImplicitCastExpr *e) {
+  // Unlike for scalars, we don't have to worry about function->ptr demotion
+  // here.
+  if (e->changesVolatileQualification())
+    return emitLoadOfLValue(e);
+  return emitCast(e->getCastKind(), e->getSubExpr(), e->getType());
+}
+
+mlir::Value ComplexExprEmitter::VisitInitListExpr(const InitListExpr *e) {
   mlir::Location loc = cgf.getLoc(e->getExprLoc());
   if (e->getNumInits() == 2) {
     mlir::Value real = cgf.emitScalarExpr(e->getInit(0));
@@ -127,6 +204,17 @@ ComplexExprEmitter::VisitImaginaryLiteral(const ImaginaryLiteral *il) {
   return builder.create<cir::ConstantOp>(loc, complexAttr);
 }
 
+LValue CIRGenFunction::emitComplexAssignmentLValue(const BinaryOperator *e) {
+  assert(e->getOpcode() == BO_Assign && "Expected assign op");
+
+  mlir::Value value; // ignored
+  LValue lvalue = ComplexExprEmitter(*this).emitBinAssignLValue(e, value);
+  if (getLangOpts().OpenMP)
+    cgm.errorNYI("emitComplexAssignmentLValue OpenMP");
+
+  return lvalue;
+}
+
 mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) {
   assert(e && getComplexType(e->getType()) &&
          "Invalid complex expression to emit");
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 6c490a72b2e9..82aa7ec9cb22 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -870,6 +870,8 @@ public:
   /// returning the result.
   mlir::Value emitComplexExpr(const Expr *e);
 
+  LValue emitComplexAssignmentLValue(const BinaryOperator *e);
+
   void emitCompoundStmt(const clang::CompoundStmt &s);
 
   void emitCompoundStmtWithoutScope(const clang::CompoundStmt &s);
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 7180d92f8c31..a5a457ddafa9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -182,9 +182,7 @@ public:
   bool isSimple() const { return lvType == Simple; }
   bool isVectorElt() const { return lvType == VectorElt; }
   bool isBitField() const { return lvType == BitField; }
-
-  // TODO: Add support for volatile
-  bool isVolatile() const { return false; }
+  bool isVolatile() const { return quals.hasVolatile(); }
 
   unsigned getVRQualifiers() const {
     return quals.getCVRQualifiers() & ~clang::Qualifiers::Const;
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index 721db235b37d..cfeed345b4f1 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -230,3 +230,29 @@ void foo14() {
 // OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
 // OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4
 // OGCG: store i32 2, ptr %[[C_IMAG_PTR]], align 4
+
+void foo15() {
+  int _Complex a;
+  int _Complex b = a;
+}
+
+// CIR: %[[COMPLEX_A:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["a"]
+// CIR: %[[COMPLEX_B:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["b", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[COMPLEX_A]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[TMP_A]], %[[COMPLEX_B]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[COMPLEX_A:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: %[[COMPLEX_B:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: %[[TMP_A:.*]] = load { i32, i32 }, ptr %[[COMPLEX_A]], align 4
+// LLVM: store { i32, i32 } %[[TMP_A]], ptr %[[COMPLEX_B]], align 4
+
+// OGCG: %[[COMPLEX_A:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[COMPLEX_B:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_A]], i32 0, i32 0
+// OGCG: %[[A_REAL:.*]] = load i32, ptr %[[A_REAL_PTR]], align 4
+// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_A]], i32 0, i32 1
+// OGCG: %[[A_IMAG:.*]] = load i32, ptr %[[A_IMAG_PTR]], align 4
+// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_B]], i32 0, i32 0
+// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX_B]], i32 0, i32 1
+// OGCG: store i32 %[[A_REAL]], ptr %[[B_REAL_PTR]], align 4
+// OGCG: store i32 %[[A_IMAG]], ptr %[[B_IMAG_PTR]], align 4

From 0921bfd81d57eb652e1cb1e12ea4dd3df037d222 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Sat, 21 Jun 2025 10:44:54 +0200
Subject: [PATCH 1150/1322] [mlir][Transforms] Dialect conversion: Add missing
 erasure notifications (#145030)

Add missing listener notifications when erasing nested
blocks/operations.

This commit also moves some of the functionality from
`ConversionPatternRewriter` to `ConversionPatternRewriterImpl`. This is
in preparation of the One-Shot Dialect Conversion refactoring: The
implementations in `ConversionPatternRewriter` should be as simple as
possible, so that a switch between "rollback allowed" and "rollback not
allowed" can be inserted at that level. (In the latter case,
`ConversionPatternRewriterImpl` can be bypassed to some degree, and
`PatternRewriter::eraseBlock` etc. can be used.)

Depends on #145018.
---
 .../Transforms/Utils/DialectConversion.cpp    | 52 +++++++++++++------
 mlir/test/Transforms/test-legalizer.mlir      | 18 ++++++-
 2 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index ff48647f4330..ad82a007b799 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -274,6 +274,26 @@ struct RewriterState {
 // IR rewrites
 //===----------------------------------------------------------------------===//
 
+static void notifyIRErased(RewriterBase::Listener *listener, Operation &op);
+
+/// Notify the listener that the given block and its contents are being erased.
+static void notifyIRErased(RewriterBase::Listener *listener, Block &b) {
+  for (Operation &op : b)
+    notifyIRErased(listener, op);
+  listener->notifyBlockErased(&b);
+}
+
+/// Notify the listener that the given operation and its contents are being
+/// erased.
+static void notifyIRErased(RewriterBase::Listener *listener, Operation &op) {
+  for (Region &r : op.getRegions()) {
+    for (Block &b : r) {
+      notifyIRErased(listener, b);
+    }
+  }
+  listener->notifyOperationErased(&op);
+}
+
 /// An IR rewrite that can be committed (upon success) or rolled back (upon
 /// failure).
 ///
@@ -422,17 +442,20 @@ public:
   }
 
   void commit(RewriterBase &rewriter) override {
-    // Erase the block.
     assert(block && "expected block");
-    assert(block->empty() && "expected empty block");
 
-    // Notify the listener that the block is about to be erased.
+    // Notify the listener that the block and its contents are being erased.
     if (auto *listener =
             dyn_cast_or_null<RewriterBase::Listener>(rewriter.getListener()))
-      listener->notifyBlockErased(block);
+      notifyIRErased(listener, *block);
   }
 
   void cleanup(RewriterBase &rewriter) override {
+    // Erase the contents of the block.
+    for (auto &op : llvm::make_early_inc_range(llvm::reverse(*block)))
+      rewriter.eraseOp(&op);
+    assert(block->empty() && "expected empty block");
+
     // Erase the block.
     block->dropAllDefinedValueUses();
     delete block;
@@ -1147,12 +1170,9 @@ void ReplaceOperationRewrite::commit(RewriterBase &rewriter) {
   if (getConfig().unlegalizedOps)
     getConfig().unlegalizedOps->erase(op);
 
-  // Notify the listener that the operation (and its nested operations) was
-  // erased.
-  if (listener) {
-    op->walk<WalkOrder::PostOrder>(
-        [&](Operation *op) { listener->notifyOperationErased(op); });
-  }
+  // Notify the listener that the operation and its contents are being erased.
+  if (listener)
+    notifyIRErased(listener, *op);
 
   // Do not erase the operation yet. It may still be referenced in `mapping`.
   // Just unlink it for now and erase it during cleanup.
@@ -1605,6 +1625,8 @@ void ConversionPatternRewriterImpl::replaceOp(
 }
 
 void ConversionPatternRewriterImpl::eraseBlock(Block *block) {
+  assert(!wasOpReplaced(block->getParentOp()) &&
+         "attempting to erase a block within a replaced/erased op");
   appendRewrite<EraseBlockRewrite>(block);
 
   // Unlink the block from its parent region. The block is kept in the rewrite
@@ -1612,6 +1634,9 @@ void ConversionPatternRewriterImpl::eraseBlock(Block *block) {
   // allows us to keep the operations in the block live and undo the removal by
   // re-inserting the block.
   block->getParent()->getBlocks().remove(block);
+
+  // Mark all nested ops as erased.
+  block->walk([&](Operation *op) { replacedOps.insert(op); });
 }
 
 void ConversionPatternRewriterImpl::notifyBlockInserted(
@@ -1709,13 +1734,6 @@ void ConversionPatternRewriter::eraseOp(Operation *op) {
 }
 
 void ConversionPatternRewriter::eraseBlock(Block *block) {
-  assert(!impl->wasOpReplaced(block->getParentOp()) &&
-         "attempting to erase a block within a replaced/erased op");
-
-  // Mark all ops for erasure.
-  for (Operation &op : *block)
-    eraseOp(&op);
-
   impl->eraseBlock(block);
 }
 
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 34948ae685f0..204c8c145682 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -461,12 +461,26 @@ func.func @convert_detached_signature() {
 
 // -----
 
+// CHECK: notifyOperationReplaced: test.erase_op
+// CHECK: notifyOperationErased: test.dummy_op_lvl_2
+// CHECK: notifyBlockErased
+// CHECK: notifyOperationErased: test.dummy_op_lvl_1
+// CHECK: notifyBlockErased
+// CHECK: notifyOperationErased: test.erase_op
+// CHECK: notifyOperationInserted: test.valid, was unlinked
+// CHECK: notifyOperationReplaced: test.drop_operands_and_replace_with_valid
+// CHECK: notifyOperationErased: test.drop_operands_and_replace_with_valid
+
 // CHECK-LABEL: func @circular_mapping()
 //  CHECK-NEXT:   "test.valid"() : () -> ()
 func.func @circular_mapping() {
   // Regression test that used to crash due to circular
-  // unrealized_conversion_cast ops.
-  %0 = "test.erase_op"() : () -> (i64)
+  // unrealized_conversion_cast ops. 
+  %0 = "test.erase_op"() ({
+    "test.dummy_op_lvl_1"() ({
+      "test.dummy_op_lvl_2"() : () -> ()
+    }) : () -> ()
+  }): () -> (i64)
   "test.drop_operands_and_replace_with_valid"(%0) : (i64) -> ()
 }
 

From bb372963dfcef9722d5aeb4f65ddb5c50be24e01 Mon Sep 17 00:00:00 2001
From: Michele Scuttari <michele.scuttari@outlook.com>
Date: Sat, 21 Jun 2025 10:55:44 +0200
Subject: [PATCH 1151/1322] [MLIR] Add optional cached symbol tables to LLVM
 conversion patterns (#144032)

This PR allows to optionally speed up the lookup of symbols by providing a `SymbolTableCollection` instance to the interested conversion patterns. It is follow-up on the discussion about symbol / symbol table management carried on [Discourse](https://discourse.llvm.org/t/symbol-table-as-first-class-citizen-in-builders/86813).
---
 .../ControlFlowToLLVM/ControlFlowToLLVM.h     |   7 +-
 .../Conversion/FuncToLLVM/ConvertFuncToLLVM.h |   9 +-
 .../Conversion/LLVMCommon/PrintCallHelper.h   |   4 +-
 .../Conversion/MemRefToLLVM/MemRefToLLVM.h    |   4 +-
 .../mlir/Dialect/LLVMIR/FunctionCallUtils.h   |  83 +++++++----
 .../ControlFlowToLLVM/ControlFlowToLLVM.cpp   |  13 +-
 mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp |  71 +++++----
 .../Conversion/LLVMCommon/PrintCallHelper.cpp |  27 +++-
 .../Conversion/MemRefToLLVM/MemRefToLLVM.cpp  | 140 ++++++++++++-----
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      |  36 +++--
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   | 141 ++++++++++++------
 11 files changed, 359 insertions(+), 176 deletions(-)

diff --git a/mlir/include/mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h b/mlir/include/mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h
index 88f18022da9b..2dfb6b03bcfc 100644
--- a/mlir/include/mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h
+++ b/mlir/include/mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h
@@ -20,6 +20,7 @@ class DialectRegistry;
 class LLVMTypeConverter;
 class RewritePatternSet;
 class Pass;
+class SymbolTableCollection;
 
 #define GEN_PASS_DECL_CONVERTCONTROLFLOWTOLLVMPASS
 #include "mlir/Conversion/Passes.h.inc"
@@ -39,9 +40,9 @@ void populateControlFlowToLLVMConversionPatterns(
 /// Populate the cf.assert to LLVM conversion pattern. If `abortOnFailure` is
 /// set to false, the program execution continues when a condition is
 /// unsatisfied.
-void populateAssertToLLVMConversionPattern(const LLVMTypeConverter &converter,
-                                           RewritePatternSet &patterns,
-                                           bool abortOnFailure = true);
+void populateAssertToLLVMConversionPattern(
+    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    bool abortOnFailure = true, SymbolTableCollection *symbolTables = nullptr);
 
 void registerConvertControlFlowToLLVMInterface(DialectRegistry &registry);
 
diff --git a/mlir/include/mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h b/mlir/include/mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h
index b1ea2740c060..e530b0a43b8e 100644
--- a/mlir/include/mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h
+++ b/mlir/include/mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h
@@ -27,20 +27,23 @@ class DialectRegistry;
 class LLVMTypeConverter;
 class RewritePatternSet;
 class SymbolTable;
+class SymbolTableCollection;
 
 /// Convert input FunctionOpInterface operation to LLVMFuncOp by using the
 /// provided LLVMTypeConverter. Return failure if failed to so.
 FailureOr<LLVM::LLVMFuncOp>
 convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp,
                           ConversionPatternRewriter &rewriter,
-                          const LLVMTypeConverter &converter);
+                          const LLVMTypeConverter &converter,
+                          SymbolTableCollection *symbolTables = nullptr);
 
 /// Collect the default pattern to convert a FuncOp to the LLVM dialect. If
 /// `emitCWrappers` is set, the pattern will also produce functions
 /// that pass memref descriptors by pointer-to-structure in addition to the
 /// default unpacked form.
 void populateFuncToLLVMFuncOpConversionPattern(
-    const LLVMTypeConverter &converter, RewritePatternSet &patterns);
+    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    SymbolTableCollection *symbolTables = nullptr);
 
 /// Collect the patterns to convert from the Func dialect to LLVM. The
 /// conversion patterns capture the LLVMTypeConverter and the LowerToLLVMOptions
@@ -57,7 +60,7 @@ void populateFuncToLLVMFuncOpConversionPattern(
 /// not an error to provide it anyway.
 void populateFuncToLLVMConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns,
-    const SymbolTable *symbolTable = nullptr);
+    SymbolTableCollection *symbolTables = nullptr);
 
 void registerConvertFuncToLLVMInterface(DialectRegistry &registry);
 
diff --git a/mlir/include/mlir/Conversion/LLVMCommon/PrintCallHelper.h b/mlir/include/mlir/Conversion/LLVMCommon/PrintCallHelper.h
index 33402301115b..d7de40555bb6 100644
--- a/mlir/include/mlir/Conversion/LLVMCommon/PrintCallHelper.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/PrintCallHelper.h
@@ -17,6 +17,7 @@ namespace mlir {
 
 class OpBuilder;
 class LLVMTypeConverter;
+class SymbolTableCollection;
 
 namespace LLVM {
 
@@ -26,7 +27,8 @@ namespace LLVM {
 LogicalResult createPrintStrCall(
     OpBuilder &builder, Location loc, ModuleOp moduleOp, StringRef symbolName,
     StringRef string, const LLVMTypeConverter &typeConverter,
-    bool addNewline = true, std::optional<StringRef> runtimeFunctionName = {});
+    bool addNewline = true, std::optional<StringRef> runtimeFunctionName = {},
+    SymbolTableCollection *symbolTables = nullptr);
 } // namespace LLVM
 
 } // namespace mlir
diff --git a/mlir/include/mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h b/mlir/include/mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h
index 996a64baf9dd..e93d5bdce7bf 100644
--- a/mlir/include/mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h
+++ b/mlir/include/mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h
@@ -16,6 +16,7 @@ class DialectRegistry;
 class Pass;
 class LLVMTypeConverter;
 class RewritePatternSet;
+class SymbolTableCollection;
 
 #define GEN_PASS_DECL_FINALIZEMEMREFTOLLVMCONVERSIONPASS
 #include "mlir/Conversion/Passes.h.inc"
@@ -23,7 +24,8 @@ class RewritePatternSet;
 /// Collect a set of patterns to convert memory-related operations from the
 /// MemRef dialect to the LLVM dialect.
 void populateFinalizeMemRefToLLVMConversionPatterns(
-    const LLVMTypeConverter &converter, RewritePatternSet &patterns);
+    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    SymbolTableCollection *symbolTables = nullptr);
 
 void registerConvertMemRefToLLVMInterface(DialectRegistry &registry);
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
index 4a7ec6f2efe6..8ad9ed18aceb 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
@@ -24,6 +24,7 @@ class OpBuilder;
 class Operation;
 class Type;
 class ValueRange;
+class SymbolTableCollection;
 
 namespace LLVM {
 class LLVMFuncOp;
@@ -33,55 +34,73 @@ class LLVMFuncOp;
 /// implemented separately (e.g. as part of a support runtime library or as part
 /// of the libc).
 /// Failure if an unexpected version of function is found.
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintI64Fn(OpBuilder &b,
-                                                     Operation *moduleOp);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintU64Fn(OpBuilder &b,
-                                                     Operation *moduleOp);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintF16Fn(OpBuilder &b,
-                                                     Operation *moduleOp);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintBF16Fn(OpBuilder &b,
-                                                      Operation *moduleOp);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintF32Fn(OpBuilder &b,
-                                                     Operation *moduleOp);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintF64Fn(OpBuilder &b,
-                                                     Operation *moduleOp);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreatePrintI64Fn(OpBuilder &b, Operation *moduleOp,
+                         SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreatePrintU64Fn(OpBuilder &b, Operation *moduleOp,
+                         SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreatePrintF16Fn(OpBuilder &b, Operation *moduleOp,
+                         SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreatePrintBF16Fn(OpBuilder &b, Operation *moduleOp,
+                          SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp,
+                         SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
+                         SymbolTableCollection *symbolTables = nullptr);
 /// Declares a function to print a C-string.
 /// If a custom runtime function is defined via `runtimeFunctionName`, it must
 /// have the signature void(char const*). The default function is `printString`.
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreatePrintStringFn(OpBuilder &b, Operation *moduleOp,
-                            std::optional<StringRef> runtimeFunctionName = {});
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintOpenFn(OpBuilder &b,
-                                                      Operation *moduleOp);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintCloseFn(OpBuilder &b,
-                                                       Operation *moduleOp);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintCommaFn(OpBuilder &b,
-                                                       Operation *moduleOp);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreatePrintNewlineFn(OpBuilder &b,
-                                                         Operation *moduleOp);
+                            std::optional<StringRef> runtimeFunctionName = {},
+                            SymbolTableCollection *symbolTables = nullptr);
 FailureOr<LLVM::LLVMFuncOp>
-lookupOrCreateMallocFn(OpBuilder &b, Operation *moduleOp, Type indexType);
+lookupOrCreatePrintOpenFn(OpBuilder &b, Operation *moduleOp,
+                          SymbolTableCollection *symbolTables = nullptr);
 FailureOr<LLVM::LLVMFuncOp>
-lookupOrCreateAlignedAllocFn(OpBuilder &b, Operation *moduleOp, Type indexType);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreateFreeFn(OpBuilder &b,
-                                                 Operation *moduleOp);
+lookupOrCreatePrintCloseFn(OpBuilder &b, Operation *moduleOp,
+                           SymbolTableCollection *symbolTables = nullptr);
 FailureOr<LLVM::LLVMFuncOp>
-lookupOrCreateGenericAllocFn(OpBuilder &b, Operation *moduleOp, Type indexType);
+lookupOrCreatePrintCommaFn(OpBuilder &b, Operation *moduleOp,
+                           SymbolTableCollection *symbolTables = nullptr);
 FailureOr<LLVM::LLVMFuncOp>
-lookupOrCreateGenericAlignedAllocFn(OpBuilder &b, Operation *moduleOp,
-                                    Type indexType);
-FailureOr<LLVM::LLVMFuncOp> lookupOrCreateGenericFreeFn(OpBuilder &b,
-                                                        Operation *moduleOp);
+lookupOrCreatePrintNewlineFn(OpBuilder &b, Operation *moduleOp,
+                             SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreateMallocFn(OpBuilder &b, Operation *moduleOp, Type indexType,
+                       SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreateAlignedAllocFn(OpBuilder &b, Operation *moduleOp, Type indexType,
+                             SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreateFreeFn(OpBuilder &b, Operation *moduleOp,
+                     SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreateGenericAllocFn(OpBuilder &b, Operation *moduleOp, Type indexType,
+                             SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp> lookupOrCreateGenericAlignedAllocFn(
+    OpBuilder &b, Operation *moduleOp, Type indexType,
+    SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreateGenericFreeFn(OpBuilder &b, Operation *moduleOp,
+                            SymbolTableCollection *symbolTables = nullptr);
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreateMemRefCopyFn(OpBuilder &b, Operation *moduleOp, Type indexType,
-                           Type unrankedDescriptorType);
+                           Type unrankedDescriptorType,
+                           SymbolTableCollection *symbolTables = nullptr);
 
 /// Create a FuncOp with signature `resultType`(`paramTypes`)` and name `name`.
 /// Return a failure if the FuncOp found has unexpected signature.
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreateFn(OpBuilder &b, Operation *moduleOp, StringRef name,
                  ArrayRef<Type> paramTypes = {}, Type resultType = {},
-                 bool isVarArg = false, bool isReserved = false);
+                 bool isVarArg = false, bool isReserved = false,
+                 SymbolTableCollection *symbolTables = nullptr);
 
 } // namespace LLVM
 } // namespace mlir
diff --git a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
index d31d7d801e14..3d0804fd11b6 100644
--- a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
+++ b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
@@ -44,9 +44,10 @@ namespace {
 /// lowering.
 struct AssertOpLowering : public ConvertOpToLLVMPattern<cf::AssertOp> {
   explicit AssertOpLowering(const LLVMTypeConverter &typeConverter,
-                            bool abortOnFailedAssert = true)
+                            bool abortOnFailedAssert = true,
+                            SymbolTableCollection *symbolTables = nullptr)
       : ConvertOpToLLVMPattern<cf::AssertOp>(typeConverter, /*benefit=*/1),
-        abortOnFailedAssert(abortOnFailedAssert) {}
+        abortOnFailedAssert(abortOnFailedAssert), symbolTables(symbolTables) {}
 
   LogicalResult
   matchAndRewrite(cf::AssertOp op, OpAdaptor adaptor,
@@ -64,7 +65,7 @@ struct AssertOpLowering : public ConvertOpToLLVMPattern<cf::AssertOp> {
     auto createResult = LLVM::createPrintStrCall(
         rewriter, loc, module, "assert_msg", op.getMsg(), *getTypeConverter(),
         /*addNewLine=*/false,
-        /*runtimeFunctionName=*/"puts");
+        /*runtimeFunctionName=*/"puts", symbolTables);
     if (createResult.failed())
       return failure();
 
@@ -96,6 +97,8 @@ private:
   /// If set to `false`, messages are printed but program execution continues.
   /// This is useful for testing asserts.
   bool abortOnFailedAssert = true;
+
+  SymbolTableCollection *symbolTables = nullptr;
 };
 
 /// Helper function for converting branch ops. This function converts the
@@ -232,8 +235,8 @@ void mlir::cf::populateControlFlowToLLVMConversionPatterns(
 
 void mlir::cf::populateAssertToLLVMConversionPattern(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns,
-    bool abortOnFailure) {
-  patterns.add<AssertOpLowering>(converter, abortOnFailure);
+    bool abortOnFailure, SymbolTableCollection *symbolTables) {
+  patterns.add<AssertOpLowering>(converter, abortOnFailure, symbolTables);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index 538016927256..4499cbd4d1a2 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -299,10 +299,9 @@ static void restoreByValRefArgumentType(
   }
 }
 
-FailureOr<LLVM::LLVMFuncOp>
-mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp,
-                                ConversionPatternRewriter &rewriter,
-                                const LLVMTypeConverter &converter) {
+FailureOr<LLVM::LLVMFuncOp> mlir::convertFuncOpToLLVMFuncOp(
+    FunctionOpInterface funcOp, ConversionPatternRewriter &rewriter,
+    const LLVMTypeConverter &converter, SymbolTableCollection *symbolTables) {
   // Check the funcOp has `FunctionType`.
   auto funcTy = dyn_cast<FunctionType>(funcOp.getFunctionType());
   if (!funcTy)
@@ -361,10 +360,25 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp,
 
   SmallVector<NamedAttribute, 4> attributes;
   filterFuncAttributes(funcOp, attributes);
+
+  Operation *symbolTableOp = funcOp->getParentWithTrait<OpTrait::SymbolTable>();
+
+  if (symbolTables && symbolTableOp) {
+    SymbolTable &symbolTable = symbolTables->getSymbolTable(symbolTableOp);
+    symbolTable.remove(funcOp);
+  }
+
   auto newFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
       funcOp.getLoc(), funcOp.getName(), llvmType, linkage,
       /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C, /*comdat=*/nullptr,
       attributes);
+
+  if (symbolTables && symbolTableOp) {
+    auto ip = rewriter.getInsertionPoint();
+    SymbolTable &symbolTable = symbolTables->getSymbolTable(symbolTableOp);
+    symbolTable.insert(newFuncOp, ip);
+  }
+
   cast<FunctionOpInterface>(newFuncOp.getOperation())
       .setVisibility(funcOp.getVisibility());
 
@@ -473,16 +487,20 @@ namespace {
 /// FuncOp legalization pattern that converts MemRef arguments to pointers to
 /// MemRef descriptors (LLVM struct data types) containing all the MemRef type
 /// information.
-struct FuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
-  FuncOpConversion(const LLVMTypeConverter &converter)
-      : ConvertOpToLLVMPattern(converter) {}
+class FuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
+  SymbolTableCollection *symbolTables = nullptr;
+
+public:
+  explicit FuncOpConversion(const LLVMTypeConverter &converter,
+                            SymbolTableCollection *symbolTables = nullptr)
+      : ConvertOpToLLVMPattern(converter), symbolTables(symbolTables) {}
 
   LogicalResult
   matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     FailureOr<LLVM::LLVMFuncOp> newFuncOp = mlir::convertFuncOpToLLVMFuncOp(
         cast<FunctionOpInterface>(funcOp.getOperation()), rewriter,
-        *getTypeConverter());
+        *getTypeConverter(), symbolTables);
     if (failed(newFuncOp))
       return rewriter.notifyMatchFailure(funcOp, "Could not convert funcop");
 
@@ -591,11 +609,11 @@ struct CallOpInterfaceLowering : public ConvertOpToLLVMPattern<CallOpType> {
 
 class CallOpLowering : public CallOpInterfaceLowering<func::CallOp> {
 public:
-  CallOpLowering(const LLVMTypeConverter &typeConverter,
-                 // Can be nullptr.
-                 const SymbolTable *symbolTable, PatternBenefit benefit = 1)
+  explicit CallOpLowering(const LLVMTypeConverter &typeConverter,
+                          SymbolTableCollection *symbolTables = nullptr,
+                          PatternBenefit benefit = 1)
       : CallOpInterfaceLowering<func::CallOp>(typeConverter, benefit),
-        symbolTable(symbolTable) {}
+        symbolTables(symbolTables) {}
 
   LogicalResult
   matchAndRewrite(func::CallOp callOp, OpAdaptor adaptor,
@@ -603,10 +621,10 @@ public:
     bool useBarePtrCallConv = false;
     if (getTypeConverter()->getOptions().useBarePtrCallConv) {
       useBarePtrCallConv = true;
-    } else if (symbolTable != nullptr) {
+    } else if (symbolTables != nullptr) {
       // Fast lookup.
       Operation *callee =
-          symbolTable->lookup(callOp.getCalleeAttr().getValue());
+          symbolTables->lookupNearestSymbolFrom(callOp, callOp.getCalleeAttr());
       useBarePtrCallConv =
           callee != nullptr && callee->hasAttr(barePtrAttrName);
     } else {
@@ -620,7 +638,7 @@ public:
   }
 
 private:
-  const SymbolTable *symbolTable = nullptr;
+  SymbolTableCollection *symbolTables = nullptr;
 };
 
 struct CallIndirectOpLowering
@@ -731,16 +749,17 @@ struct ReturnOpLowering : public ConvertOpToLLVMPattern<func::ReturnOp> {
 } // namespace
 
 void mlir::populateFuncToLLVMFuncOpConversionPattern(
-    const LLVMTypeConverter &converter, RewritePatternSet &patterns) {
-  patterns.add<FuncOpConversion>(converter);
+    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    SymbolTableCollection *symbolTables) {
+  patterns.add<FuncOpConversion>(converter, symbolTables);
 }
 
 void mlir::populateFuncToLLVMConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns,
-    const SymbolTable *symbolTable) {
-  populateFuncToLLVMFuncOpConversionPattern(converter, patterns);
+    SymbolTableCollection *symbolTables) {
+  populateFuncToLLVMFuncOpConversionPattern(converter, patterns, symbolTables);
   patterns.add<CallIndirectOpLowering>(converter);
-  patterns.add<CallOpLowering>(converter, symbolTable);
+  patterns.add<CallOpLowering>(converter, symbolTables);
   patterns.add<ConstantOpLowering>(converter);
   patterns.add<ReturnOpLowering>(converter);
 }
@@ -780,15 +799,11 @@ struct ConvertFuncToLLVMPass
     LLVMTypeConverter typeConverter(&getContext(), options,
                                     &dataLayoutAnalysis);
 
-    std::optional<SymbolTable> optSymbolTable = std::nullopt;
-    const SymbolTable *symbolTable = nullptr;
-    if (!options.useBarePtrCallConv) {
-      optSymbolTable.emplace(m);
-      symbolTable = &optSymbolTable.value();
-    }
-
     RewritePatternSet patterns(&getContext());
-    populateFuncToLLVMConversionPatterns(typeConverter, patterns, symbolTable);
+    SymbolTableCollection symbolTables;
+
+    populateFuncToLLVMConversionPatterns(typeConverter, patterns,
+                                         &symbolTables);
 
     LLVMConversionTarget target(getContext());
     if (failed(applyPartialConversion(m, target, std::move(patterns))))
diff --git a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
index 2815e05b3e11..49c73fbc9dd7 100644
--- a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
@@ -17,8 +17,26 @@
 using namespace mlir;
 using namespace llvm;
 
-static std::string ensureSymbolNameIsUnique(ModuleOp moduleOp,
-                                            StringRef symbolName) {
+/// Check if a given symbol name is already in use within the module operation.
+/// If no symbol with such name is present, then the same identifier is
+/// returned. Otherwise, a unique and yet unused identifier is computed starting
+/// from the requested one.
+static std::string
+ensureSymbolNameIsUnique(ModuleOp moduleOp, StringRef symbolName,
+                         SymbolTableCollection *symbolTables = nullptr) {
+  if (symbolTables) {
+    SymbolTable &symbolTable = symbolTables->getSymbolTable(moduleOp);
+    unsigned counter = 0;
+    SmallString<128> uniqueName = symbolTable.generateSymbolName<128>(
+        symbolName,
+        [&](const SmallString<128> &tentativeName) {
+          return symbolTable.lookupSymbolIn(moduleOp, tentativeName) != nullptr;
+        },
+        counter);
+
+    return static_cast<std::string>(uniqueName);
+  }
+
   static int counter = 0;
   std::string uniqueName = std::string(symbolName);
   while (moduleOp.lookupSymbol(uniqueName)) {
@@ -30,7 +48,8 @@ static std::string ensureSymbolNameIsUnique(ModuleOp moduleOp,
 LogicalResult mlir::LLVM::createPrintStrCall(
     OpBuilder &builder, Location loc, ModuleOp moduleOp, StringRef symbolName,
     StringRef string, const LLVMTypeConverter &typeConverter, bool addNewline,
-    std::optional<StringRef> runtimeFunctionName) {
+    std::optional<StringRef> runtimeFunctionName,
+    SymbolTableCollection *symbolTables) {
   auto ip = builder.saveInsertionPoint();
   builder.setInsertionPointToStart(moduleOp.getBody());
   MLIRContext *ctx = builder.getContext();
@@ -49,7 +68,7 @@ LogicalResult mlir::LLVM::createPrintStrCall(
       LLVM::LLVMArrayType::get(IntegerType::get(ctx, 8), elementVals.size());
   auto globalOp = builder.create<LLVM::GlobalOp>(
       loc, arrayTy, /*constant=*/true, LLVM::Linkage::Private,
-      ensureSymbolNameIsUnique(moduleOp, symbolName), dataAttr);
+      ensureSymbolNameIsUnique(moduleOp, symbolName, symbolTables), dataAttr);
 
   auto ptrTy = LLVM::LLVMPointerType::get(builder.getContext());
   // Emit call to `printStr` in runtime library.
diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
index 8ccf1bfc292d..e8294a5234c4 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
+++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
@@ -48,35 +48,39 @@ static bool isStaticStrideOrOffset(int64_t strideOrOffset) {
 }
 
 static FailureOr<LLVM::LLVMFuncOp>
-getFreeFn(OpBuilder &b, const LLVMTypeConverter *typeConverter,
-          ModuleOp module) {
+getFreeFn(OpBuilder &b, const LLVMTypeConverter *typeConverter, ModuleOp module,
+          SymbolTableCollection *symbolTables) {
   bool useGenericFn = typeConverter->getOptions().useGenericFunctions;
 
   if (useGenericFn)
-    return LLVM::lookupOrCreateGenericFreeFn(b, module);
+    return LLVM::lookupOrCreateGenericFreeFn(b, module, symbolTables);
 
-  return LLVM::lookupOrCreateFreeFn(b, module);
+  return LLVM::lookupOrCreateFreeFn(b, module, symbolTables);
 }
 
 static FailureOr<LLVM::LLVMFuncOp>
 getNotalignedAllocFn(OpBuilder &b, const LLVMTypeConverter *typeConverter,
-                     Operation *module, Type indexType) {
+                     Operation *module, Type indexType,
+                     SymbolTableCollection *symbolTables) {
   bool useGenericFn = typeConverter->getOptions().useGenericFunctions;
   if (useGenericFn)
-    return LLVM::lookupOrCreateGenericAllocFn(b, module, indexType);
+    return LLVM::lookupOrCreateGenericAllocFn(b, module, indexType,
+                                              symbolTables);
 
-  return LLVM::lookupOrCreateMallocFn(b, module, indexType);
+  return LLVM::lookupOrCreateMallocFn(b, module, indexType, symbolTables);
 }
 
 static FailureOr<LLVM::LLVMFuncOp>
 getAlignedAllocFn(OpBuilder &b, const LLVMTypeConverter *typeConverter,
-                  Operation *module, Type indexType) {
+                  Operation *module, Type indexType,
+                  SymbolTableCollection *symbolTables) {
   bool useGenericFn = typeConverter->getOptions().useGenericFunctions;
 
   if (useGenericFn)
-    return LLVM::lookupOrCreateGenericAlignedAllocFn(b, module, indexType);
+    return LLVM::lookupOrCreateGenericAlignedAllocFn(b, module, indexType,
+                                                     symbolTables);
 
-  return LLVM::lookupOrCreateAlignedAllocFn(b, module, indexType);
+  return LLVM::lookupOrCreateAlignedAllocFn(b, module, indexType, symbolTables);
 }
 
 /// Computes the aligned value for 'input' as follows:
@@ -126,8 +130,15 @@ static Value castAllocFuncResult(ConversionPatternRewriter &rewriter,
   return allocatedPtr;
 }
 
-struct AllocOpLowering : public ConvertOpToLLVMPattern<memref::AllocOp> {
-  using ConvertOpToLLVMPattern<memref::AllocOp>::ConvertOpToLLVMPattern;
+class AllocOpLowering : public ConvertOpToLLVMPattern<memref::AllocOp> {
+  SymbolTableCollection *symbolTables = nullptr;
+
+public:
+  explicit AllocOpLowering(const LLVMTypeConverter &typeConverter,
+                           SymbolTableCollection *symbolTables = nullptr,
+                           PatternBenefit benefit = 1)
+      : ConvertOpToLLVMPattern<memref::AllocOp>(typeConverter, benefit),
+        symbolTables(symbolTables) {}
 
   LogicalResult
   matchAndRewrite(memref::AllocOp op, OpAdaptor adaptor,
@@ -138,9 +149,10 @@ struct AllocOpLowering : public ConvertOpToLLVMPattern<memref::AllocOp> {
       return rewriter.notifyMatchFailure(op, "incompatible memref type");
 
     // Get or insert alloc function into the module.
-    FailureOr<LLVM::LLVMFuncOp> allocFuncOp = getNotalignedAllocFn(
-        rewriter, getTypeConverter(),
-        op->getParentWithTrait<OpTrait::SymbolTable>(), getIndexType());
+    FailureOr<LLVM::LLVMFuncOp> allocFuncOp =
+        getNotalignedAllocFn(rewriter, getTypeConverter(),
+                             op->getParentWithTrait<OpTrait::SymbolTable>(),
+                             getIndexType(), symbolTables);
     if (failed(allocFuncOp))
       return failure();
 
@@ -210,8 +222,15 @@ struct AllocOpLowering : public ConvertOpToLLVMPattern<memref::AllocOp> {
   }
 };
 
-struct AlignedAllocOpLowering : public ConvertOpToLLVMPattern<memref::AllocOp> {
-  using ConvertOpToLLVMPattern<memref::AllocOp>::ConvertOpToLLVMPattern;
+class AlignedAllocOpLowering : public ConvertOpToLLVMPattern<memref::AllocOp> {
+  SymbolTableCollection *symbolTables = nullptr;
+
+public:
+  explicit AlignedAllocOpLowering(const LLVMTypeConverter &typeConverter,
+                                  SymbolTableCollection *symbolTables = nullptr,
+                                  PatternBenefit benefit = 1)
+      : ConvertOpToLLVMPattern<memref::AllocOp>(typeConverter, benefit),
+        symbolTables(symbolTables) {}
 
   LogicalResult
   matchAndRewrite(memref::AllocOp op, OpAdaptor adaptor,
@@ -222,9 +241,10 @@ struct AlignedAllocOpLowering : public ConvertOpToLLVMPattern<memref::AllocOp> {
       return rewriter.notifyMatchFailure(op, "incompatible memref type");
 
     // Get or insert alloc function into module.
-    FailureOr<LLVM::LLVMFuncOp> allocFuncOp = getAlignedAllocFn(
-        rewriter, getTypeConverter(),
-        op->getParentWithTrait<OpTrait::SymbolTable>(), getIndexType());
+    FailureOr<LLVM::LLVMFuncOp> allocFuncOp =
+        getAlignedAllocFn(rewriter, getTypeConverter(),
+                          op->getParentWithTrait<OpTrait::SymbolTable>(),
+                          getIndexType(), symbolTables);
     if (failed(allocFuncOp))
       return failure();
 
@@ -446,18 +466,23 @@ struct AssumeAlignmentOpLowering
 // A `dealloc` is converted into a call to `free` on the underlying data buffer.
 // The memref descriptor being an SSA value, there is no need to clean it up
 // in any way.
-struct DeallocOpLowering : public ConvertOpToLLVMPattern<memref::DeallocOp> {
-  using ConvertOpToLLVMPattern<memref::DeallocOp>::ConvertOpToLLVMPattern;
+class DeallocOpLowering : public ConvertOpToLLVMPattern<memref::DeallocOp> {
+  SymbolTableCollection *symbolTables = nullptr;
 
-  explicit DeallocOpLowering(const LLVMTypeConverter &converter)
-      : ConvertOpToLLVMPattern<memref::DeallocOp>(converter) {}
+public:
+  explicit DeallocOpLowering(const LLVMTypeConverter &typeConverter,
+                             SymbolTableCollection *symbolTables = nullptr,
+                             PatternBenefit benefit = 1)
+      : ConvertOpToLLVMPattern<memref::DeallocOp>(typeConverter, benefit),
+        symbolTables(symbolTables) {}
 
   LogicalResult
   matchAndRewrite(memref::DeallocOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     // Insert the `free` declaration if it is not already present.
-    FailureOr<LLVM::LLVMFuncOp> freeFunc = getFreeFn(
-        rewriter, getTypeConverter(), op->getParentOfType<ModuleOp>());
+    FailureOr<LLVM::LLVMFuncOp> freeFunc =
+        getFreeFn(rewriter, getTypeConverter(), op->getParentOfType<ModuleOp>(),
+                  symbolTables);
     if (failed(freeFunc))
       return failure();
     Value allocatedPtr;
@@ -710,9 +735,15 @@ convertGlobalMemrefTypeToLLVM(MemRefType type,
 }
 
 /// GlobalMemrefOp is lowered to a LLVM Global Variable.
-struct GlobalMemrefOpLowering
-    : public ConvertOpToLLVMPattern<memref::GlobalOp> {
-  using ConvertOpToLLVMPattern<memref::GlobalOp>::ConvertOpToLLVMPattern;
+class GlobalMemrefOpLowering : public ConvertOpToLLVMPattern<memref::GlobalOp> {
+  SymbolTableCollection *symbolTables = nullptr;
+
+public:
+  explicit GlobalMemrefOpLowering(const LLVMTypeConverter &typeConverter,
+                                  SymbolTableCollection *symbolTables = nullptr,
+                                  PatternBenefit benefit = 1)
+      : ConvertOpToLLVMPattern<memref::GlobalOp>(typeConverter, benefit),
+        symbolTables(symbolTables) {}
 
   LogicalResult
   matchAndRewrite(memref::GlobalOp global, OpAdaptor adaptor,
@@ -743,9 +774,31 @@ struct GlobalMemrefOpLowering
     if (failed(addressSpace))
       return global.emitOpError(
           "memory space cannot be converted to an integer address space");
+
+    if (symbolTables) {
+      Operation *symbolTableOp =
+          global->getParentWithTrait<OpTrait::SymbolTable>();
+
+      if (symbolTableOp) {
+        SymbolTable &symbolTable = symbolTables->getSymbolTable(symbolTableOp);
+        symbolTable.remove(global);
+      }
+    }
+
     auto newGlobal = rewriter.replaceOpWithNewOp<LLVM::GlobalOp>(
         global, arrayTy, global.getConstant(), linkage, global.getSymName(),
         initialValue, alignment, *addressSpace);
+
+    if (symbolTables) {
+      Operation *symbolTableOp =
+          global->getParentWithTrait<OpTrait::SymbolTable>();
+
+      if (symbolTableOp) {
+        SymbolTable &symbolTable = symbolTables->getSymbolTable(symbolTableOp);
+        symbolTable.insert(newGlobal, rewriter.getInsertionPoint());
+      }
+    }
+
     if (!global.isExternal() && global.isUninitialized()) {
       rewriter.createBlock(&newGlobal.getInitializerRegion());
       Value undef[] = {
@@ -997,8 +1050,15 @@ struct MemRefCastOpLowering : public ConvertOpToLLVMPattern<memref::CastOp> {
 /// For memrefs with identity layouts, the copy is lowered to the llvm
 /// `memcpy` intrinsic. For non-identity layouts, the copy is lowered to a call
 /// to the generic `MemrefCopyFn`.
-struct MemRefCopyOpLowering : public ConvertOpToLLVMPattern<memref::CopyOp> {
-  using ConvertOpToLLVMPattern<memref::CopyOp>::ConvertOpToLLVMPattern;
+class MemRefCopyOpLowering : public ConvertOpToLLVMPattern<memref::CopyOp> {
+  SymbolTableCollection *symbolTables = nullptr;
+
+public:
+  explicit MemRefCopyOpLowering(const LLVMTypeConverter &typeConverter,
+                                SymbolTableCollection *symbolTables = nullptr,
+                                PatternBenefit benefit = 1)
+      : ConvertOpToLLVMPattern<memref::CopyOp>(typeConverter, benefit),
+        symbolTables(symbolTables) {}
 
   LogicalResult
   lowerToMemCopyIntrinsic(memref::CopyOp op, OpAdaptor adaptor,
@@ -1093,7 +1153,7 @@ struct MemRefCopyOpLowering : public ConvertOpToLLVMPattern<memref::CopyOp> {
     auto elemSize = getSizeInBytes(loc, srcType.getElementType(), rewriter);
     auto copyFn = LLVM::lookupOrCreateMemRefCopyFn(
         rewriter, op->getParentOfType<ModuleOp>(), getIndexType(),
-        sourcePtr.getType());
+        sourcePtr.getType(), symbolTables);
     if (failed(copyFn))
       return failure();
     rewriter.create<LLVM::CallOp>(loc, copyFn.value(),
@@ -1928,7 +1988,8 @@ public:
 } // namespace
 
 void mlir::populateFinalizeMemRefToLLVMConversionPatterns(
-    const LLVMTypeConverter &converter, RewritePatternSet &patterns) {
+    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    SymbolTableCollection *symbolTables) {
   // clang-format off
   patterns.add<
       AllocaOpLowering,
@@ -1939,11 +2000,9 @@ void mlir::populateFinalizeMemRefToLLVMConversionPatterns(
       DimOpLowering,
       ExtractStridedMetadataOpLowering,
       GenericAtomicRMWOpLowering,
-      GlobalMemrefOpLowering,
       GetGlobalMemrefOpLowering,
       LoadOpLowering,
       MemRefCastOpLowering,
-      MemRefCopyOpLowering,
       MemorySpaceCastOpLowering,
       MemRefReinterpretCastOpLowering,
       MemRefReshapeOpLowering,
@@ -1956,11 +2015,14 @@ void mlir::populateFinalizeMemRefToLLVMConversionPatterns(
       TransposeOpLowering,
       ViewOpLowering>(converter);
   // clang-format on
+  patterns.add<GlobalMemrefOpLowering, MemRefCopyOpLowering>(converter,
+                                                             symbolTables);
   auto allocLowering = converter.getOptions().allocLowering;
   if (allocLowering == LowerToLLVMOptions::AllocLowering::AlignedAlloc)
-    patterns.add<AlignedAllocOpLowering, DeallocOpLowering>(converter);
+    patterns.add<AlignedAllocOpLowering, DeallocOpLowering>(converter,
+                                                            symbolTables);
   else if (allocLowering == LowerToLLVMOptions::AllocLowering::Malloc)
-    patterns.add<AllocOpLowering, DeallocOpLowering>(converter);
+    patterns.add<AllocOpLowering, DeallocOpLowering>(converter, symbolTables);
 }
 
 namespace {
@@ -1987,7 +2049,9 @@ struct FinalizeMemRefToLLVMConversionPass
     LLVMTypeConverter typeConverter(&getContext(), options,
                                     &dataLayoutAnalysis);
     RewritePatternSet patterns(&getContext());
-    populateFinalizeMemRefToLLVMConversionPatterns(typeConverter, patterns);
+    SymbolTableCollection symbolTables;
+    populateFinalizeMemRefToLLVMConversionPatterns(typeConverter, patterns,
+                                                   &symbolTables);
     LLVMConversionTarget target(getContext());
     target.addLegalOp<func::FuncOp>();
     if (failed(applyPartialConversion(op, target, std::move(patterns))))
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index f72599363567..d53d11f87efe 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1595,8 +1595,14 @@ private:
 };
 
 class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
+  SymbolTableCollection *symbolTables = nullptr;
+
 public:
-  using ConvertOpToLLVMPattern<vector::PrintOp>::ConvertOpToLLVMPattern;
+  explicit VectorPrintOpConversion(
+      const LLVMTypeConverter &typeConverter,
+      SymbolTableCollection *symbolTables = nullptr)
+      : ConvertOpToLLVMPattern<vector::PrintOp>(typeConverter),
+        symbolTables(symbolTables) {}
 
   // Lowering implementation that relies on a small runtime support library,
   // which only needs to provide a few printing methods (single value for all
@@ -1643,13 +1649,17 @@ public:
       FailureOr<LLVM::LLVMFuncOp> op = [&]() {
         switch (punct) {
         case PrintPunctuation::Close:
-          return LLVM::lookupOrCreatePrintCloseFn(rewriter, parent);
+          return LLVM::lookupOrCreatePrintCloseFn(rewriter, parent,
+                                                  symbolTables);
         case PrintPunctuation::Open:
-          return LLVM::lookupOrCreatePrintOpenFn(rewriter, parent);
+          return LLVM::lookupOrCreatePrintOpenFn(rewriter, parent,
+                                                 symbolTables);
         case PrintPunctuation::Comma:
-          return LLVM::lookupOrCreatePrintCommaFn(rewriter, parent);
+          return LLVM::lookupOrCreatePrintCommaFn(rewriter, parent,
+                                                  symbolTables);
         case PrintPunctuation::NewLine:
-          return LLVM::lookupOrCreatePrintNewlineFn(rewriter, parent);
+          return LLVM::lookupOrCreatePrintNewlineFn(rewriter, parent,
+                                                    symbolTables);
         default:
           llvm_unreachable("unexpected punctuation");
         }
@@ -1683,17 +1693,17 @@ private:
     PrintConversion conversion = PrintConversion::None;
     FailureOr<Operation *> printer;
     if (printType.isF32()) {
-      printer = LLVM::lookupOrCreatePrintF32Fn(rewriter, parent);
+      printer = LLVM::lookupOrCreatePrintF32Fn(rewriter, parent, symbolTables);
     } else if (printType.isF64()) {
-      printer = LLVM::lookupOrCreatePrintF64Fn(rewriter, parent);
+      printer = LLVM::lookupOrCreatePrintF64Fn(rewriter, parent, symbolTables);
     } else if (printType.isF16()) {
       conversion = PrintConversion::Bitcast16; // bits!
-      printer = LLVM::lookupOrCreatePrintF16Fn(rewriter, parent);
+      printer = LLVM::lookupOrCreatePrintF16Fn(rewriter, parent, symbolTables);
     } else if (printType.isBF16()) {
       conversion = PrintConversion::Bitcast16; // bits!
-      printer = LLVM::lookupOrCreatePrintBF16Fn(rewriter, parent);
+      printer = LLVM::lookupOrCreatePrintBF16Fn(rewriter, parent, symbolTables);
     } else if (printType.isIndex()) {
-      printer = LLVM::lookupOrCreatePrintU64Fn(rewriter, parent);
+      printer = LLVM::lookupOrCreatePrintU64Fn(rewriter, parent, symbolTables);
     } else if (auto intTy = dyn_cast<IntegerType>(printType)) {
       // Integers need a zero or sign extension on the operand
       // (depending on the source type) as well as a signed or
@@ -1703,7 +1713,8 @@ private:
         if (width <= 64) {
           if (width < 64)
             conversion = PrintConversion::ZeroExt64;
-          printer = LLVM::lookupOrCreatePrintU64Fn(rewriter, parent);
+          printer =
+              LLVM::lookupOrCreatePrintU64Fn(rewriter, parent, symbolTables);
         } else {
           return failure();
         }
@@ -1716,7 +1727,8 @@ private:
             conversion = PrintConversion::ZeroExt64;
           else if (width < 64)
             conversion = PrintConversion::SignExt64;
-          printer = LLVM::lookupOrCreatePrintI64Fn(rewriter, parent);
+          printer =
+              LLVM::lookupOrCreatePrintI64Fn(rewriter, parent, symbolTables);
         } else {
           return failure();
         }
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index 1b4a8f496d3d..89f765dacda3 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -44,15 +44,31 @@ static constexpr llvm::StringRef kGenericAlignedAlloc =
 static constexpr llvm::StringRef kGenericFree = "_mlir_memref_to_llvm_free";
 static constexpr llvm::StringRef kMemRefCopy = "memrefCopy";
 
+namespace {
+/// Search for an LLVMFuncOp with a given name within an operation with the
+/// SymbolTable trait. An optional collection of cached symbol tables can be
+/// given to avoid a linear scan of the symbol table operation.
+LLVM::LLVMFuncOp lookupFuncOp(StringRef name, Operation *symbolTableOp,
+                              SymbolTableCollection *symbolTables = nullptr) {
+  if (symbolTables) {
+    return symbolTables->lookupSymbolIn<LLVM::LLVMFuncOp>(
+        symbolTableOp, StringAttr::get(symbolTableOp->getContext(), name));
+  }
+
+  return llvm::dyn_cast_or_null<LLVM::LLVMFuncOp>(
+      SymbolTable::lookupSymbolIn(symbolTableOp, name));
+}
+} // namespace
+
 /// Generic print function lookupOrCreate helper.
 FailureOr<LLVM::LLVMFuncOp>
 mlir::LLVM::lookupOrCreateFn(OpBuilder &b, Operation *moduleOp, StringRef name,
                              ArrayRef<Type> paramTypes, Type resultType,
-                             bool isVarArg, bool isReserved) {
+                             bool isVarArg, bool isReserved,
+                             SymbolTableCollection *symbolTables) {
   assert(moduleOp->hasTrait<OpTrait::SymbolTable>() &&
          "expected SymbolTable operation");
-  auto func = llvm::dyn_cast_or_null<LLVM::LLVMFuncOp>(
-      SymbolTable::lookupSymbolIn(moduleOp, name));
+  auto func = lookupFuncOp(name, moduleOp, symbolTables);
   auto funcT = LLVMFunctionType::get(resultType, paramTypes, isVarArg);
   // Assert the signature of the found function is same as expected
   if (func) {
@@ -73,60 +89,75 @@ mlir::LLVM::lookupOrCreateFn(OpBuilder &b, Operation *moduleOp, StringRef name,
   OpBuilder::InsertionGuard g(b);
   assert(!moduleOp->getRegion(0).empty() && "expected non-empty region");
   b.setInsertionPointToStart(&moduleOp->getRegion(0).front());
-  return b.create<LLVM::LLVMFuncOp>(
+  auto funcOp = b.create<LLVM::LLVMFuncOp>(
       moduleOp->getLoc(), name,
       LLVM::LLVMFunctionType::get(resultType, paramTypes, isVarArg));
+
+  if (symbolTables) {
+    SymbolTable &symbolTable = symbolTables->getSymbolTable(moduleOp);
+    symbolTable.insert(funcOp, moduleOp->getRegion(0).front().begin());
+  }
+
+  return funcOp;
 }
 
 static FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreateReservedFn(OpBuilder &b, Operation *moduleOp, StringRef name,
-                         ArrayRef<Type> paramTypes, Type resultType) {
+                         ArrayRef<Type> paramTypes, Type resultType,
+                         SymbolTableCollection *symbolTables) {
   return lookupOrCreateFn(b, moduleOp, name, paramTypes, resultType,
-                          /*isVarArg=*/false, /*isReserved=*/true);
+                          /*isVarArg=*/false, /*isReserved=*/true,
+                          symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintI64Fn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintI64Fn(OpBuilder &b, Operation *moduleOp,
+                                     SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintI64, IntegerType::get(moduleOp->getContext(), 64),
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintU64Fn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintU64Fn(OpBuilder &b, Operation *moduleOp,
+                                     SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintU64, IntegerType::get(moduleOp->getContext(), 64),
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintF16Fn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintF16Fn(OpBuilder &b, Operation *moduleOp,
+                                     SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintF16,
       IntegerType::get(moduleOp->getContext(), 16), // bits!
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintBF16Fn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintBF16Fn(OpBuilder &b, Operation *moduleOp,
+                                      SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintBF16,
       IntegerType::get(moduleOp->getContext(), 16), // bits!
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp,
+                                     SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintF32, Float32Type::get(moduleOp->getContext()),
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
+                                     SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintF64, Float64Type::get(moduleOp->getContext()),
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
@@ -140,90 +171,102 @@ static LLVM::LLVMPointerType getVoidPtr(MLIRContext *context) {
 
 FailureOr<LLVM::LLVMFuncOp> mlir::LLVM::lookupOrCreatePrintStringFn(
     OpBuilder &b, Operation *moduleOp,
-    std::optional<StringRef> runtimeFunctionName) {
+    std::optional<StringRef> runtimeFunctionName,
+    SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, runtimeFunctionName.value_or(kPrintString),
       getCharPtr(moduleOp->getContext()),
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintOpenFn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintOpenFn(OpBuilder &b, Operation *moduleOp,
+                                      SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintOpen, {},
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintCloseFn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintCloseFn(OpBuilder &b, Operation *moduleOp,
+                                       SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintClose, {},
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintCommaFn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintCommaFn(OpBuilder &b, Operation *moduleOp,
+                                       SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintComma, {},
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreatePrintNewlineFn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreatePrintNewlineFn(OpBuilder &b, Operation *moduleOp,
+                                         SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kPrintNewline, {},
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
 mlir::LLVM::lookupOrCreateMallocFn(OpBuilder &b, Operation *moduleOp,
-                                   Type indexType) {
+                                   Type indexType,
+                                   SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(b, moduleOp, kMalloc, indexType,
-                                  getVoidPtr(moduleOp->getContext()));
+                                  getVoidPtr(moduleOp->getContext()),
+                                  symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
 mlir::LLVM::lookupOrCreateAlignedAllocFn(OpBuilder &b, Operation *moduleOp,
-                                         Type indexType) {
-  return lookupOrCreateReservedFn(b, moduleOp, kAlignedAlloc,
-                                  {indexType, indexType},
-                                  getVoidPtr(moduleOp->getContext()));
+                                         Type indexType,
+                                         SymbolTableCollection *symbolTables) {
+  return lookupOrCreateReservedFn(
+      b, moduleOp, kAlignedAlloc, {indexType, indexType},
+      getVoidPtr(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreateFreeFn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreateFreeFn(OpBuilder &b, Operation *moduleOp,
+                                 SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kFree, getVoidPtr(moduleOp->getContext()),
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
 mlir::LLVM::lookupOrCreateGenericAllocFn(OpBuilder &b, Operation *moduleOp,
-                                         Type indexType) {
+                                         Type indexType,
+                                         SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(b, moduleOp, kGenericAlloc, indexType,
-                                  getVoidPtr(moduleOp->getContext()));
+                                  getVoidPtr(moduleOp->getContext()),
+                                  symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp> mlir::LLVM::lookupOrCreateGenericAlignedAllocFn(
-    OpBuilder &b, Operation *moduleOp, Type indexType) {
-  return lookupOrCreateReservedFn(b, moduleOp, kGenericAlignedAlloc,
-                                  {indexType, indexType},
-                                  getVoidPtr(moduleOp->getContext()));
+    OpBuilder &b, Operation *moduleOp, Type indexType,
+    SymbolTableCollection *symbolTables) {
+  return lookupOrCreateReservedFn(
+      b, moduleOp, kGenericAlignedAlloc, {indexType, indexType},
+      getVoidPtr(moduleOp->getContext()), symbolTables);
 }
 
 FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreateGenericFreeFn(OpBuilder &b, Operation *moduleOp) {
+mlir::LLVM::lookupOrCreateGenericFreeFn(OpBuilder &b, Operation *moduleOp,
+                                        SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kGenericFree, getVoidPtr(moduleOp->getContext()),
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
-FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreateMemRefCopyFn(OpBuilder &b, Operation *moduleOp,
-                                       Type indexType,
-                                       Type unrankedDescriptorType) {
+FailureOr<LLVM::LLVMFuncOp> mlir::LLVM::lookupOrCreateMemRefCopyFn(
+    OpBuilder &b, Operation *moduleOp, Type indexType,
+    Type unrankedDescriptorType, SymbolTableCollection *symbolTables) {
   return lookupOrCreateReservedFn(
       b, moduleOp, kMemRefCopy,
       ArrayRef<Type>{indexType, unrankedDescriptorType, unrankedDescriptorType},
-      LLVM::LLVMVoidType::get(moduleOp->getContext()));
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }

From 2f5d965bb59879c1604d0308385b30565785d412 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 21 Jun 2025 10:23:53 +0100
Subject: [PATCH 1152/1322] [VPlan] Use EMIT-SCALAR when printing casts.

Split off EMIT-SCALAR printing changes from already approved
https://github.com/llvm/llvm-project/pull/140623.

Currently all casts are single scalars, this brings printing in line
with printing for other VPInstructions.
---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  4 ++--
 .../RISCV/riscv-vector-reverse.ll             |  4 ++--
 .../RISCV/vplan-vp-call-intrinsics.ll         | 18 ++++++++---------
 .../RISCV/vplan-vp-cast-intrinsics.ll         | 20 +++++++++----------
 ...an-vp-intrinsics-fixed-order-recurrence.ll |  4 ++--
 .../RISCV/vplan-vp-intrinsics-reduction.ll    |  4 ++--
 .../RISCV/vplan-vp-intrinsics.ll              |  2 +-
 .../RISCV/vplan-vp-select-intrinsics.ll       |  2 +-
 .../interleave-and-scalarize-only.ll          |  2 +-
 9 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 39def05b2eac..a48ff168efcc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -865,7 +865,7 @@ bool VPInstruction::isVectorToScalar() const {
 }
 
 bool VPInstruction::isSingleScalar() const {
-  return getOpcode() == Instruction::PHI;
+  return getOpcode() == Instruction::PHI || isScalarCast();
 }
 
 void VPInstruction::execute(VPTransformState &State) {
@@ -1100,7 +1100,7 @@ void VPInstructionWithType::execute(VPTransformState &State) {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
-  O << Indent << "EMIT ";
+  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
   printAsOperand(O, SlotTracker);
   O << " = ";
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 41252f519155..b23b0ce759d4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -58,7 +58,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ]
 ; CHECK-NEXT:    WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ]
 ; CHECK-NEXT:    EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT:    EMIT ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:    EMIT-SCALAR ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:    EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
 ; CHECK-NEXT:    EMIT ir<%1> = load ir<%arrayidx>
 ; CHECK-NEXT:    EMIT ir<%add9> = add ir<%1>, ir<1>
@@ -467,7 +467,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:    WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ]
 ; CHECK-NEXT:    WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ]
 ; CHECK-NEXT:    EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
-; CHECK-NEXT:    EMIT ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:    EMIT-SCALAR ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:    EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
 ; CHECK-NEXT:    EMIT ir<%1> = load ir<%arrayidx>
 ; CHECK-NEXT:    EMIT ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
index 2dd12f70f128..83ae1f2d704e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
@@ -35,7 +35,7 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[SMAX]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -92,7 +92,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[SMIN]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -149,7 +149,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[UMAX]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -206,7 +206,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[UMIN]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -260,7 +260,7 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[CTLZ]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -312,7 +312,7 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[CTTZ]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -366,7 +366,7 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -422,7 +422,7 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -476,7 +476,7 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[ABS]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
index ada84d4ef833..5c5492eec743 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
@@ -31,7 +31,7 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[SEXT]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -85,7 +85,7 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[ZEXT]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -137,7 +137,7 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[TRUNC]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -189,7 +189,7 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[FPEXT]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -241,7 +241,7 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTRUNC]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -293,7 +293,7 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[SITOFP]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -345,7 +345,7 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[UITOFP]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -397,7 +397,7 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTOSI]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -449,7 +449,7 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[FPTOUI]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
@@ -501,7 +501,7 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[INTTOPTR]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
index 0f024bb89bab..7540b77d0d50 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
@@ -19,7 +19,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-NEXT: Successor(s): scalar.ph, vector.ph
 ; IF-EVL-EMPTY:
 ; IF-EVL: vector.ph:
-; IF-EVL-NEXT:  EMIT vp<[[VF32:%[0-9]+]]> = trunc vp<[[VF]]> to i32
+; IF-EVL-NEXT:  EMIT-SCALAR vp<[[VF32:%[0-9]+]]> = trunc vp<[[VF]]> to i32
 ; IF-EVL-NEXT: Successor(s): vector loop
 ; IF-EVL-EMPTY:
 ; IF-EVL: <x1> vector loop: {
@@ -39,7 +39,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
 ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds nuw ir<%B>, vp<[[ST]]>
 ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR2]]>, ir<[[ADD]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:     EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 79f490aa16a9..0fd27f335b64 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -53,7 +53,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT:    WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
 ; IF-EVL-OUTLOOP-NEXT:    WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]>
 ; IF-EVL-OUTLOOP-NEXT:    WIDEN-INTRINSIC vp<[[RDX_SELECT]]> = call llvm.vp.merge(ir<true>, ir<[[ADD]]>, ir<[[RDX_PHI]]>, vp<[[EVL]]>)
-; IF-EVL-OUTLOOP-NEXT:    EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-OUTLOOP-NEXT:    EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-OUTLOOP-NEXT:    EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-OUTLOOP-NEXT:    EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-OUTLOOP-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
@@ -93,7 +93,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:    vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
 ; IF-EVL-INLOOP-NEXT:    WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
 ; IF-EVL-INLOOP-NEXT:    REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + vp.reduce.add (ir<[[LD1]]>, vp<[[EVL]]>)
-; IF-EVL-INLOOP-NEXT:    EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-INLOOP-NEXT:    EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-INLOOP-NEXT:    EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-INLOOP-NEXT:    EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-INLOOP-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
index b0b69c74a229..fc81bba4e9c2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -40,7 +40,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT:    CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT:    vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:    WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
-; IF-EVL-NEXT:    EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:    EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
 ; IF-EVL-NEXT:    EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:    EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:    EMIT branch-on-count  vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index 543c706250da..8c0e4da51f72 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -48,7 +48,7 @@
  ; IF-EVL-NEXT:   CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[EVL_PHI]]>
  ; IF-EVL-NEXT:   vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
  ; IF-EVL-NEXT:   WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
- ; IF-EVL-NEXT:   EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+ ; IF-EVL-NEXT:   EMIT-SCALAR vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
  ; IF-EVL-NEXT:   EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
  ; IF-EVL-NEXT:   EMIT vp<[[IV_NEXT_EXIT]]> = add vp<[[IV]]>, ir<[[VFUF]]>
  ; IF-EVL-NEXT:   EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  ir<[[VTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index db25e7bede5c..f03870096ca9 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -208,7 +208,7 @@ exit:
 ; DBG-NEXT:   vector.body:
 ; DBG-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
 ; DBG-NEXT:     FIRST-ORDER-RECURRENCE-PHI ir<%for> = phi ir<0>, vp<[[SCALAR_STEPS:.+]]>
-; DBG-NEXT:     EMIT vp<[[TRUNC_IV:%.+]]> = trunc vp<[[CAN_IV]]> to i32
+; DBG-NEXT:     EMIT-SCALAR vp<[[TRUNC_IV:%.+]]> = trunc vp<[[CAN_IV]]> to i32
 ; DBG-NEXT:     vp<[[SCALAR_STEPS]]> = SCALAR-STEPS vp<[[TRUNC_IV]]>, ir<1>, vp<[[VF]]
 ; DBG-NEXT:     EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%for>, vp<[[SCALAR_STEPS]]>
 ; DBG-NEXT:     CLONE store vp<[[SPLICE]]>, ir<%dst>

From d6a486c221c1a2d18e88ca39279bcf1675fe7723 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Sat, 21 Jun 2025 19:31:51 +1000
Subject: [PATCH 1153/1322] =?UTF-8?q?[ORC]=20Apply=20MachO::CPU=5FSUBTYPE?=
 =?UTF-8?q?=5FMASK=20to=20comparison=20in=20getDylibInterfac=E2=80=A6=20(#?=
 =?UTF-8?q?145154)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…eFromDylib.

When comparing CPU subtypes from slices in a MachO universal binary we
need to apply the MachO::CPU_SUBTYPE_MASK to mask out any flags in the
high bits, otherwise we might fail to correctly match a slice and return
a spurious "does not contain slice" error.

rdar://153913779
---
 .../ExecutionEngine/Orc/GetDylibInterface.cpp |   3 +-
 .../Inputs/MachO_Universal_libFoo_dylib.yaml  | 350 ++++++++++++++++++
 .../MachO_universal_binaries_weak_library.ll  |  19 +
 3 files changed, 371 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/ExecutionEngine/JITLink/Generic/Inputs/MachO_Universal_libFoo_dylib.yaml
 create mode 100644 llvm/test/ExecutionEngine/JITLink/Generic/MachO_universal_binaries_weak_library.ll

diff --git a/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp b/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp
index c80ec7686890..9ccb211931a5 100644
--- a/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp
@@ -41,7 +41,8 @@ Expected<SymbolNameSet> getDylibInterfaceFromDylib(ExecutionSession &ES,
   else if (auto *MachOUni =
                dyn_cast<object::MachOUniversalBinary>(BinFile->get())) {
     for (auto &O : MachOUni->objects()) {
-      if (O.getCPUType() == *CPUType && O.getCPUSubType() == *CPUSubType) {
+      if (O.getCPUType() == *CPUType &&
+          (O.getCPUSubType() & ~MachO::CPU_SUBTYPE_MASK) == *CPUSubType) {
         if (auto Obj = O.getAsObjectFile())
           MachOFile = std::move(*Obj);
         else
diff --git a/llvm/test/ExecutionEngine/JITLink/Generic/Inputs/MachO_Universal_libFoo_dylib.yaml b/llvm/test/ExecutionEngine/JITLink/Generic/Inputs/MachO_Universal_libFoo_dylib.yaml
new file mode 100644
index 000000000000..56e976a3c3bb
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/Generic/Inputs/MachO_Universal_libFoo_dylib.yaml
@@ -0,0 +1,350 @@
+--- !fat-mach-o
+
+# Contains two slices (arm64e and x86-64) for libfoo.dylib, each containing the
+# following:
+#
+# int foo(void) { return 0; }
+
+FatHeader:
+  magic:           0xCAFEBABE
+  nfat_arch:       2
+FatArchs:
+  - cputype:         0x1000007
+    cpusubtype:      0x3
+    offset:          0x1000
+    size:            4200
+    align:           12
+  - cputype:         0x100000C
+    cpusubtype:      0x80000002
+    offset:          0x4000
+    size:            16784
+    align:           14
+Slices:
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x1000007
+      cpusubtype:      0x3
+      filetype:        0x6
+      ncmds:           13
+      sizeofcmds:      568
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          4096
+        fileoff:         0
+        filesize:        4096
+        maxprot:         5
+        initprot:        5
+        nsects:          1
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x2A0
+            size:            3
+            offset:          0x2A0
+            align:           4
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         31C0C3
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          4096
+        vmsize:          4096
+        fileoff:         4096
+        filesize:        104
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         libfoo.x86-64.dylib
+        ZeroPadBytes:    5
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         4096
+        datasize:        48
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         4144
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          4176
+        nsyms:           1
+        stroff:          4192
+        strsize:         8
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       0
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  0
+        nindirectsyms:   0
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            A29E87C8-EF8B-3721-B0D3-9749DEBEEBBB
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             984320
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         76481792
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         4168
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         4176
+        datasize:        0
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      12
+            Name:            _foo
+            Flags:           0x0
+            Address:         0x2A0
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         672
+      StringTable:
+        - ' '
+        - _foo
+        - ''
+      FunctionStarts:  [ 0x2A0 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x2C, 
+                         0x0, 0x0, 0x0, 0x2C, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x80000002
+      filetype:        0x6
+      ncmds:           14
+      sizeofcmds:      664
+      flags:           0x100085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         232
+        segname:         __TEXT
+        vmaddr:          0
+        vmsize:          16384
+        fileoff:         0
+        filesize:        16384
+        maxprot:         5
+        initprot:        5
+        nsects:          2
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x2D8
+            size:            8
+            offset:          0x2D8
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         00008052C0035FD6
+          - sectname:        __unwind_info
+            segname:         __TEXT
+            addr:            0x2E0
+            size:            88
+            offset:          0x2E0
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x0
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         010000001C000000000000001C000000000000001C00000002000000D80200004000000040000000E00200000000000040000000000000000000000000000000030000000C00010010000100000000000000000200000000
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          16384
+        vmsize:          16384
+        fileoff:         16384
+        filesize:        400
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+      - cmd:             LC_ID_DYLIB
+        cmdsize:         48
+        dylib:
+          name:            24
+          timestamp:       1
+          current_version: 0
+          compatibility_version: 0
+        Content:         libfoo.arm64e.dylib
+        ZeroPadBytes:    5
+      - cmd:             LC_DYLD_CHAINED_FIXUPS
+        cmdsize:         16
+        dataoff:         16384
+        datasize:        48
+      - cmd:             LC_DYLD_EXPORTS_TRIE
+        cmdsize:         16
+        dataoff:         16432
+        datasize:        24
+      - cmd:             LC_SYMTAB
+        cmdsize:         24
+        symoff:          16464
+        nsyms:           1
+        stroff:          16480
+        strsize:         8
+      - cmd:             LC_DYSYMTAB
+        cmdsize:         80
+        ilocalsym:       0
+        nlocalsym:       0
+        iextdefsym:      0
+        nextdefsym:      1
+        iundefsym:       1
+        nundefsym:       0
+        tocoff:          0
+        ntoc:            0
+        modtaboff:       0
+        nmodtab:         0
+        extrefsymoff:    0
+        nextrefsyms:     0
+        indirectsymoff:  0
+        nindirectsyms:   0
+        extreloff:       0
+        nextrel:         0
+        locreloff:       0
+        nlocrel:         0
+      - cmd:             LC_UUID
+        cmdsize:         24
+        uuid:            0220C592-2D73-3642-B893-B7DA8FCD396C
+      - cmd:             LC_BUILD_VERSION
+        cmdsize:         32
+        platform:        1
+        minos:           983040
+        sdk:             984320
+        ntools:          1
+        Tools:
+          - tool:            3
+            version:         76481792
+      - cmd:             LC_SOURCE_VERSION
+        cmdsize:         16
+        version:         0
+      - cmd:             LC_LOAD_DYLIB
+        cmdsize:         56
+        dylib:
+          name:            24
+          timestamp:       2
+          current_version: 88539136
+          compatibility_version: 65536
+        Content:         '/usr/lib/libSystem.B.dylib'
+        ZeroPadBytes:    6
+      - cmd:             LC_FUNCTION_STARTS
+        cmdsize:         16
+        dataoff:         16456
+        datasize:        8
+      - cmd:             LC_DATA_IN_CODE
+        cmdsize:         16
+        dataoff:         16464
+        datasize:        0
+      - cmd:             LC_CODE_SIGNATURE
+        cmdsize:         16
+        dataoff:         16496
+        datasize:        288
+    LinkEditData:
+      ExportTrie:
+        TerminalSize:    0
+        NodeOffset:      0
+        Name:            ''
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      12
+            Name:            _foo
+            Flags:           0x0
+            Address:         0x2D8
+            Other:           0x0
+            ImportName:      ''
+      NameList:
+        - n_strx:          2
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         728
+      StringTable:
+        - ' '
+        - _foo
+        - ''
+      FunctionStarts:  [ 0x2D8 ]
+      ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x2C, 
+                         0x0, 0x0, 0x0, 0x2C, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                         0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/test/ExecutionEngine/JITLink/Generic/MachO_universal_binaries_weak_library.ll b/llvm/test/ExecutionEngine/JITLink/Generic/MachO_universal_binaries_weak_library.ll
new file mode 100644
index 000000000000..1df0cf4a6bd0
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/Generic/MachO_universal_binaries_weak_library.ll
@@ -0,0 +1,19 @@
+; RUN: rm -rf %t && mkdir %t
+; RUN: yaml2obj -o %t/libFoo.dylib %S/Inputs/MachO_Universal_libFoo_dylib.yaml
+; RUN: llc -filetype=obj -mtriple arm64e-apple-macosx -o %t/main.o %s
+; RUN: llvm-jitlink -noexec -triple arm64e-apple-macosx %t/main.o -weak_library \
+; RUN:     %t/libFoo.dylib
+;
+; REQUIRES: x86-registered-target && aarch64-registered-target
+;
+; Check MachO universal binary handling in the orc::getDylibInterfaceFromDylib
+; function, including that the cpusubtype field is masked correctly (for arm64e
+; slices this field will have the MachO::CPU_SUBTYPE_LIB64 flag set in the high
+; bits -- the subtype will fail to match unless it's masked out).
+
+declare i32 @foo()
+
+define i32 @main(i32 %argc, ptr %argv) {
+entry:
+  ret i32 ptrtoint (ptr @foo to i32)
+}

From 1bb2328fd3adf137cb32af4e3722a1b3e8a53a8e Mon Sep 17 00:00:00 2001
From: Jakub Mazurkiewicz <mazkuba3@gmail.com>
Date: Sat, 21 Jun 2025 11:54:50 +0200
Subject: [PATCH 1154/1322] [libc++] Implement `views::join_with` (#65536)

* Implement "P2441R2 `views::join_with`" (https://wg21.link/P2441R2),
closes #105185
* Implement LWG4074 (https://wg21.link/LWG4074), closes #105346
* Complete implementation of "P2711R1 Making multi-param constructors of
views explicit" (https://wg21.link/P2711R1), closes #105252
* Complete implementation of "P2770R0 Stashing stashing iterators for
proper flattening" (https://wg21.link/P2770R0), closes #105250
---
 libcxx/docs/FeatureTestMacroTable.rst         |   2 +-
 libcxx/docs/ReleaseNotes/21.rst               |   3 +
 libcxx/docs/Status/Cxx23Papers.csv            |   6 +-
 libcxx/docs/Status/Cxx2cIssues.csv            |   2 +-
 libcxx/include/CMakeLists.txt                 |   1 +
 libcxx/include/__ranges/concepts.h            |  40 ++
 libcxx/include/__ranges/join_with_view.h      | 460 ++++++++++++++++++
 libcxx/include/module.modulemap.in            |   1 +
 libcxx/include/ranges                         |  10 +
 libcxx/include/version                        |   2 +-
 libcxx/modules/std/ranges.inc                 |   7 +-
 .../deref.nodiscard.verify.cpp                |  28 ++
 .../eq.nodiscard.verify.cpp                   |  30 ++
 .../iter_move.nodiscard.verify.cpp            |  28 ++
 .../no_unique_address.compile.pass.cpp        |  56 +++
 .../adaptor.nodiscard.verify.cpp              |  33 ++
 .../eq.nodiscard.verify.cpp                   |  35 ++
 .../no_unique_address.compile.pass.cpp        |  48 ++
 .../base.nodiscard.verify.cpp                 |  30 ++
 .../begin.nodiscard.verify.cpp                |  28 ++
 .../end.nodiscard.verify.cpp                  |  28 ++
 .../no_unique_address.compile.pass.cpp        |  47 ++
 .../ranges.version.compile.pass.cpp           |  32 +-
 .../version.version.compile.pass.cpp          |  32 +-
 .../ctor.default.pass.cpp                     |  79 +++
 .../ctor.not_const.pass.cpp                   | 111 +++++
 .../decrement.pass.cpp                        | 283 +++++++++++
 .../range.join.with.iterator/deref.pass.cpp   | 225 +++++++++
 .../range.join.with.iterator/eq.pass.cpp      | 259 ++++++++++
 .../increment.pass.cpp                        | 372 ++++++++++++++
 .../iter_move.pass.cpp                        | 420 ++++++++++++++++
 .../iter_swap.pass.cpp                        | 186 +++++++
 .../types.compile.pass.cpp                    | 456 +++++++++++++++++
 .../range.join.with.overview/adaptor.pass.cpp | 360 ++++++++++++++
 .../range.join.with.overview/example.pass.cpp |  42 ++
 .../ctor.default.pass.cpp                     |  37 ++
 .../ctor.non_const.pass.cpp                   |  74 +++
 .../range.join.with.sentinel/eq.pass.cpp      | 109 +++++
 .../range.join.with.view/base.pass.cpp        | 132 +++++
 .../range.join.with.view/begin.pass.cpp       | 221 +++++++++
 .../constraints.compile.pass.cpp              | 289 +++++++++++
 .../ctad.compile.pass.cpp                     | 230 +++++++++
 .../ctor.default.pass.cpp                     |  77 +++
 .../ctor.range.element.pass.cpp               | 244 ++++++++++
 .../ctor.range.pattern.pass.cpp               | 111 +++++
 .../range.join.with.view/end.pass.cpp         | 232 +++++++++
 .../inheritance.compile.pass.cpp              |  38 ++
 .../range.adaptors/range.join.with/types.h    | 319 ++++++++++++
 .../generate_feature_test_macro_components.py |   1 -
 49 files changed, 5843 insertions(+), 53 deletions(-)
 create mode 100644 libcxx/include/__ranges/join_with_view.h
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/deref.nodiscard.verify.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/eq.nodiscard.verify.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_move.nodiscard.verify.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/no_unique_address.compile.pass.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.overview/adaptor.nodiscard.verify.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.sentinel/eq.nodiscard.verify.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.sentinel/no_unique_address.compile.pass.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/base.nodiscard.verify.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/begin.nodiscard.verify.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/end.nodiscard.verify.cpp
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/no_unique_address.compile.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/ctor.default.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/ctor.not_const.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/decrement.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/deref.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/eq.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/increment.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_move.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_swap.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/types.compile.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.overview/adaptor.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.overview/example.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/ctor.default.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/ctor.non_const.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/eq.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/base.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/begin.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/constraints.compile.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctad.compile.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.default.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.range.element.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.range.pattern.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/end.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/inheritance.compile.pass.cpp
 create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.with/types.h

diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index 5ebc9bb7dcda..3c635e5e46bb 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -376,7 +376,7 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_ranges_iota``                                  ``202202L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_ranges_join_with``                             *unimplemented*
+    ``__cpp_lib_ranges_join_with``                             ``202202L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_ranges_repeat``                                ``202207L``
     ---------------------------------------------------------- -----------------
diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 8661e5898fbc..7e8570691200 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -47,6 +47,9 @@ Implemented Papers
 - P1222R4: A Standard ``flat_set`` (`Github <https://github.com/llvm/llvm-project/issues/105193>`__)
 - P2897R7: ``aligned_accessor``: An mdspan accessor expressing pointer over-alignment (`Github <https://github.com/llvm/llvm-project/issues/118372>`__)
 - P3247R2: Deprecate the notion of trivial types (`Github <https://github.com/llvm/llvm-project/issues/118387>`__)
+- P2441R2: ``views::join_with`` (`Github <https://github.com/llvm/llvm-project/issues/105185>`__)
+- P2711R1: Making multi-param constructors of ``views`` ``explicit`` (`Github <https://github.com/llvm/llvm-project/issues/105252>`__)
+- P2770R0: Stashing stashing ``iterators`` for proper flattening (`Github <https://github.com/llvm/llvm-project/issues/105250>`__)
 
 Improvements and New Features
 -----------------------------
diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv
index c26363bcda79..574675175a4c 100644
--- a/libcxx/docs/Status/Cxx23Papers.csv
+++ b/libcxx/docs/Status/Cxx23Papers.csv
@@ -47,7 +47,7 @@
 "`P2273R3 <https://wg21.link/P2273R3>`__","Making ``std::unique_ptr`` constexpr","2022-02 (Virtual)","|Complete|","16",""
 "`P2387R3 <https://wg21.link/P2387R3>`__","Pipe support for user-defined range adaptors","2022-02 (Virtual)","|Complete|","19",""
 "`P2440R1 <https://wg21.link/P2440R1>`__","``ranges::iota``, ``ranges::shift_left`` and ``ranges::shift_right``","2022-02 (Virtual)","|Partial|","","Only ``ranges::iota`` is implemented."
-"`P2441R2 <https://wg21.link/P2441R2>`__","``views::join_with``","2022-02 (Virtual)","|In Progress|","",""
+"`P2441R2 <https://wg21.link/P2441R2>`__","``views::join_with``","2022-02 (Virtual)","|Complete|","21",""
 "`P2442R1 <https://wg21.link/P2442R1>`__","Windowing range adaptors: ``views::chunk`` and ``views::slide``","2022-02 (Virtual)","","",""
 "`P2443R1 <https://wg21.link/P2443R1>`__","``views::chunk_by``","2022-02 (Virtual)","|Complete|","18",""
 "","","","","",""
@@ -103,9 +103,9 @@
 "`P2708R1 <https://wg21.link/P2708R1>`__","No Further Fundamentals TSes","2022-11 (Kona)","|Nothing To Do|","",""
 "","","","","",""
 "`P0290R4 <https://wg21.link/P0290R4>`__","``apply()`` for ``synchronized_value<T>``","2023-02 (Issaquah)","","",""
-"`P2770R0 <https://wg21.link/P2770R0>`__","Stashing stashing ``iterators`` for proper flattening","2023-02 (Issaquah)","|Partial|","","``join_with_view`` hasn't been done yet since this type isn't implemented yet"
+"`P2770R0 <https://wg21.link/P2770R0>`__","Stashing stashing ``iterators`` for proper flattening","2023-02 (Issaquah)","|Complete|","21",""
 "`P2164R9 <https://wg21.link/P2164R9>`__","``views::enumerate``","2023-02 (Issaquah)","","",""
-"`P2711R1 <https://wg21.link/P2711R1>`__","Making multi-param constructors of ``views`` ``explicit``","2023-02 (Issaquah)","|In Progress|","","``join_with_view`` hasn't been done yet since this type isn't implemented yet"
+"`P2711R1 <https://wg21.link/P2711R1>`__","Making multi-param constructors of ``views`` ``explicit``","2023-02 (Issaquah)","|Complete|","21",""
 "`P2609R3 <https://wg21.link/P2609R3>`__","Relaxing Ranges Just A Smidge","2023-02 (Issaquah)","|Complete|","20","Implemented as a DR in C++20. Other implementations will do the same."
 "`P2713R1 <https://wg21.link/P2713R1>`__","Escaping improvements in ``std::format``","2023-02 (Issaquah)","|Complete|","19",""
 "`P2675R1 <https://wg21.link/P2675R1>`__","``format``'s width estimation is too approximate and not forward compatible","2023-02 (Issaquah)","|Complete|","17",""
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index fdf381862d87..d3feecf6513e 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -66,7 +66,7 @@
 "`LWG4060 <https://wg21.link/LWG4060>`__","``submdspan`` preconditions do not forbid creating invalid pointer","2024-06 (St. Louis)","","",""
 "`LWG4061 <https://wg21.link/LWG4061>`__","Should ``std::basic_format_context`` be default-constructible/copyable/movable?","2024-06 (St. Louis)","|Complete|","19",""
 "`LWG4071 <https://wg21.link/LWG4071>`__","``reference_wrapper`` comparisons are not SFINAE-friendly","2024-06 (St. Louis)","|Complete|","19",""
-"`LWG4074 <https://wg21.link/LWG4074>`__","``compatible-joinable-ranges`` is underconstrained","2024-06 (St. Louis)","","",""
+"`LWG4074 <https://wg21.link/LWG4074>`__","``compatible-joinable-ranges`` is underconstrained","2024-06 (St. Louis)","|Complete|","21",""
 "`LWG4076 <https://wg21.link/LWG4076>`__","``concat_view`` should be freestanding","2024-06 (St. Louis)","","",""
 "`LWG4079 <https://wg21.link/LWG4079>`__","Missing Preconditions in ``concat_view::iterator``\`s conversion constructor","2024-06 (St. Louis)","","",""
 "`LWG4082 <https://wg21.link/LWG4082>`__","``views::concat(r)`` is well-formed when ``r`` is an ``output_range``","2024-06 (St. Louis)","","",""
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 8931a1b35f6d..e386f31386b6 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -706,6 +706,7 @@ set(files
   __ranges/iota_view.h
   __ranges/istream_view.h
   __ranges/join_view.h
+  __ranges/join_with_view.h
   __ranges/lazy_split_view.h
   __ranges/movable_box.h
   __ranges/non_propagating_cache.h
diff --git a/libcxx/include/__ranges/concepts.h b/libcxx/include/__ranges/concepts.h
index 674a3f359ff9..bf75fe8a6fef 100644
--- a/libcxx/include/__ranges/concepts.h
+++ b/libcxx/include/__ranges/concepts.h
@@ -10,7 +10,9 @@
 #ifndef _LIBCPP___RANGES_CONCEPTS_H
 #define _LIBCPP___RANGES_CONCEPTS_H
 
+#include <__concepts/common_reference_with.h>
 #include <__concepts/constructible.h>
+#include <__concepts/convertible_to.h>
 #include <__concepts/movable.h>
 #include <__concepts/same_as.h>
 #include <__config>
@@ -25,6 +27,8 @@
 #include <__ranges/enable_view.h>
 #include <__ranges/size.h>
 #include <__type_traits/add_pointer.h>
+#include <__type_traits/common_reference.h>
+#include <__type_traits/common_type.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/remove_cvref.h>
 #include <__type_traits/remove_reference.h>
@@ -133,6 +137,42 @@ concept viewable_range =
       (is_lvalue_reference_v<_Tp> ||
        (movable<remove_reference_t<_Tp>> && !__is_std_initializer_list<remove_cvref_t<_Tp>>))));
 
+#  if _LIBCPP_STD_VER >= 23
+
+template <class... _Rs>
+using __concat_reference_t _LIBCPP_NODEBUG = common_reference_t<range_reference_t<_Rs>...>;
+
+template <class... _Rs>
+using __concat_value_t _LIBCPP_NODEBUG = common_type_t<range_value_t<_Rs>...>;
+
+template <class... _Rs>
+using __concat_rvalue_reference_t _LIBCPP_NODEBUG = common_reference_t<range_rvalue_reference_t<_Rs>...>;
+
+template <class _Ref, class _RRef, class _It>
+concept __concat_indirectly_readable_impl = requires(const _It __it) {
+  { *__it } -> convertible_to<_Ref>;
+  { ranges::iter_move(__it) } -> convertible_to<_RRef>;
+};
+
+template <class... _Rs>
+concept __concat_indirectly_readable =
+    common_reference_with<__concat_reference_t<_Rs...>&&, __concat_value_t<_Rs...>&> &&
+    common_reference_with<__concat_reference_t<_Rs...>&&, __concat_rvalue_reference_t<_Rs...>&&> &&
+    common_reference_with<__concat_rvalue_reference_t<_Rs...>&&, const __concat_value_t<_Rs...>&> &&
+    (__concat_indirectly_readable_impl<__concat_reference_t<_Rs...>,
+                                       __concat_rvalue_reference_t<_Rs...>,
+                                       iterator_t<_Rs>> &&
+     ...);
+
+template <class... _Rs>
+concept __concatable = requires {
+  typename __concat_reference_t<_Rs...>;
+  typename __concat_value_t<_Rs...>;
+  typename __concat_rvalue_reference_t<_Rs...>;
+} && __concat_indirectly_readable<_Rs...>;
+
+#  endif // _LIBCPP_STD_VER >= 23
+
 } // namespace ranges
 
 #endif // _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__ranges/join_with_view.h b/libcxx/include/__ranges/join_with_view.h
new file mode 100644
index 000000000000..8ed989a66468
--- /dev/null
+++ b/libcxx/include/__ranges/join_with_view.h
@@ -0,0 +1,460 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___RANGES_JOIN_WITH_VIEW_H
+#define _LIBCPP___RANGES_JOIN_WITH_VIEW_H
+
+#include <__concepts/common_reference_with.h>
+#include <__concepts/common_with.h>
+#include <__concepts/constructible.h>
+#include <__concepts/convertible_to.h>
+#include <__concepts/derived_from.h>
+#include <__concepts/equality_comparable.h>
+#include <__config>
+#include <__functional/bind_back.h>
+#include <__iterator/concepts.h>
+#include <__iterator/incrementable_traits.h>
+#include <__iterator/iter_move.h>
+#include <__iterator/iter_swap.h>
+#include <__iterator/iterator_traits.h>
+#include <__memory/addressof.h>
+#include <__ranges/access.h>
+#include <__ranges/all.h>
+#include <__ranges/concepts.h>
+#include <__ranges/non_propagating_cache.h>
+#include <__ranges/range_adaptor.h>
+#include <__ranges/single_view.h>
+#include <__ranges/view_interface.h>
+#include <__type_traits/conditional.h>
+#include <__type_traits/decay.h>
+#include <__type_traits/is_reference.h>
+#include <__type_traits/maybe_const.h>
+#include <__utility/as_const.h>
+#include <__utility/as_lvalue.h>
+#include <__utility/empty.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+#include <variant>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 23
+
+namespace ranges {
+template <class _Range>
+concept __bidirectional_common = bidirectional_range<_Range> && common_range<_Range>;
+
+template <input_range _View, forward_range _Pattern>
+  requires view<_View> && input_range<range_reference_t<_View>> && view<_Pattern> &&
+           __concatable<range_reference_t<_View>, _Pattern>
+class join_with_view : public view_interface<join_with_view<_View, _Pattern>> {
+  using _InnerRng _LIBCPP_NODEBUG = range_reference_t<_View>;
+
+  _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View();
+
+  static constexpr bool _UseOuterItCache = !forward_range<_View>;
+  using _OuterItCache _LIBCPP_NODEBUG =
+      _If<_UseOuterItCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  _LIBCPP_NO_UNIQUE_ADDRESS _OuterItCache __outer_it_;
+
+  static constexpr bool _UseInnerCache = !is_reference_v<_InnerRng>;
+  using _InnerCache _LIBCPP_NODEBUG =
+      _If<_UseInnerCache, __non_propagating_cache<remove_cvref_t<_InnerRng>>, __empty_cache>;
+  _LIBCPP_NO_UNIQUE_ADDRESS _InnerCache __inner_;
+
+  _LIBCPP_NO_UNIQUE_ADDRESS _Pattern __pattern_ = _Pattern();
+
+  template <bool _Const>
+  struct __iterator;
+
+  template <bool _Const>
+  struct __sentinel;
+
+public:
+  _LIBCPP_HIDE_FROM_ABI join_with_view()
+    requires default_initializable<_View> && default_initializable<_Pattern>
+  = default;
+
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit join_with_view(_View __base, _Pattern __pattern)
+      : __base_(std::move(__base)), __pattern_(std::move(__pattern)) {}
+
+  template <input_range _Range>
+    requires constructible_from<_View, views::all_t<_Range>> &&
+                 constructible_from<_Pattern, single_view<range_value_t<_InnerRng>>>
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit join_with_view(_Range&& __r, range_value_t<_InnerRng> __e)
+      : __base_(views::all(std::forward<_Range>(__r))), __pattern_(views::single(std::move(__e))) {}
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _View base() const&
+    requires copy_constructible<_View>
+  {
+    return __base_;
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _View base() && { return std::move(__base_); }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto begin() {
+    if constexpr (forward_range<_View>) {
+      constexpr bool __use_const = __simple_view<_View> && is_reference_v<_InnerRng> && __simple_view<_Pattern>;
+      return __iterator<__use_const>{*this, ranges::begin(__base_)};
+    } else {
+      __outer_it_.__emplace(ranges::begin(__base_));
+      return __iterator<false>{*this};
+    }
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto begin() const
+    requires forward_range<const _View> && forward_range<const _Pattern> &&
+             is_reference_v<range_reference_t<const _View>> && input_range<range_reference_t<const _View>> &&
+             __concatable<range_reference_t<const _View>, const _Pattern>
+  {
+    return __iterator<true>{*this, ranges::begin(__base_)};
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto end() {
+    constexpr bool __use_const = __simple_view<_View> && __simple_view<_Pattern>;
+    if constexpr (forward_range<_View> && is_reference_v<_InnerRng> && forward_range<_InnerRng> &&
+                  common_range<_View> && common_range<_InnerRng>)
+      return __iterator<__use_const>{*this, ranges::end(__base_)};
+    else
+      return __sentinel<__use_const>{*this};
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto end() const
+    requires forward_range<const _View> && forward_range<const _Pattern> &&
+             is_reference_v<range_reference_t<const _View>> && input_range<range_reference_t<const _View>> &&
+             __concatable<range_reference_t<const _View>, const _Pattern>
+  {
+    using _InnerConstRng = range_reference_t<const _View>;
+    if constexpr (forward_range<_InnerConstRng> && common_range<const _View> && common_range<_InnerConstRng>)
+      return __iterator<true>{*this, ranges::end(__base_)};
+    else
+      return __sentinel<true>{*this};
+  }
+};
+
+template <class _Range, class _Pattern>
+join_with_view(_Range&&, _Pattern&&) -> join_with_view<views::all_t<_Range>, views::all_t<_Pattern>>;
+
+template <input_range _Range>
+join_with_view(_Range&&, range_value_t<range_reference_t<_Range>>)
+    -> join_with_view<views::all_t<_Range>, single_view<range_value_t<range_reference_t<_Range>>>>;
+
+template <class _Base, class _PatternBase, class _InnerBase = range_reference_t<_Base>>
+struct __join_with_view_iterator_category {};
+
+template <class _Base, class _PatternBase, class _InnerBase>
+  requires is_reference_v<_InnerBase> && forward_range<_Base> && forward_range<_InnerBase>
+struct __join_with_view_iterator_category<_Base, _PatternBase, _InnerBase> {
+private:
+  static consteval auto __get_iterator_category() noexcept {
+    using _OuterC   = iterator_traits<iterator_t<_Base>>::iterator_category;
+    using _InnerC   = iterator_traits<iterator_t<_InnerBase>>::iterator_category;
+    using _PatternC = iterator_traits<iterator_t<_PatternBase>>::iterator_category;
+
+    if constexpr (!is_reference_v<common_reference_t<iter_reference_t<iterator_t<_InnerBase>>,
+                                                     iter_reference_t<iterator_t<_PatternBase>>>>)
+      return input_iterator_tag{};
+    else if constexpr (derived_from<_OuterC, bidirectional_iterator_tag> &&
+                       derived_from<_InnerC, bidirectional_iterator_tag> &&
+                       derived_from<_PatternC, bidirectional_iterator_tag> && common_range<_InnerBase> &&
+                       common_range<_PatternBase>)
+      return bidirectional_iterator_tag{};
+    else if constexpr (derived_from<_OuterC, forward_iterator_tag> && derived_from<_InnerC, forward_iterator_tag> &&
+                       derived_from<_PatternC, forward_iterator_tag>)
+      return forward_iterator_tag{};
+    else
+      return input_iterator_tag{};
+  }
+
+public:
+  using iterator_category = decltype(__get_iterator_category());
+};
+
+template <input_range _View, forward_range _Pattern>
+  requires view<_View> && input_range<range_reference_t<_View>> && view<_Pattern> &&
+           __concatable<range_reference_t<_View>, _Pattern>
+template <bool _Const>
+struct join_with_view<_View, _Pattern>::__iterator
+    : public __join_with_view_iterator_category<__maybe_const<_Const, _View>, __maybe_const<_Const, _Pattern>> {
+private:
+  friend join_with_view;
+
+  using _Parent _LIBCPP_NODEBUG      = __maybe_const<_Const, join_with_view>;
+  using _Base _LIBCPP_NODEBUG        = __maybe_const<_Const, _View>;
+  using _InnerBase _LIBCPP_NODEBUG   = range_reference_t<_Base>;
+  using _PatternBase _LIBCPP_NODEBUG = __maybe_const<_Const, _Pattern>;
+
+  using _OuterIter _LIBCPP_NODEBUG   = iterator_t<_Base>;
+  using _InnerIter _LIBCPP_NODEBUG   = iterator_t<_InnerBase>;
+  using _PatternIter _LIBCPP_NODEBUG = iterator_t<_PatternBase>;
+
+  static_assert(!_Const || forward_range<_Base>, "Const can only be true when Base models forward_range.");
+
+  static constexpr bool __ref_is_glvalue = is_reference_v<_InnerBase>;
+
+  _Parent* __parent_ = nullptr;
+
+  static constexpr bool _OuterIterPresent              = forward_range<_Base>;
+  using _OuterIterType _LIBCPP_NODEBUG                 = _If<_OuterIterPresent, _OuterIter, std::__empty>;
+  _LIBCPP_NO_UNIQUE_ADDRESS _OuterIterType __outer_it_ = _OuterIterType();
+
+  variant<_PatternIter, _InnerIter> __inner_it_;
+
+  _LIBCPP_HIDE_FROM_ABI constexpr __iterator(_Parent& __parent, _OuterIter __outer)
+    requires forward_range<_Base>
+      : __parent_(std::addressof(__parent)), __outer_it_(std::move(__outer)) {
+    if (__get_outer() != ranges::end(__parent_->__base_)) {
+      __inner_it_.template emplace<1>(ranges::begin(__update_inner()));
+      __satisfy();
+    }
+  }
+
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(_Parent& __parent)
+    requires(!forward_range<_Base>)
+      : __parent_(std::addressof(__parent)) {
+    if (__get_outer() != ranges::end(__parent_->__base_)) {
+      __inner_it_.template emplace<1>(ranges::begin(__update_inner()));
+      __satisfy();
+    }
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _OuterIter& __get_outer() {
+    if constexpr (forward_range<_Base>)
+      return __outer_it_;
+    else
+      return *__parent_->__outer_it_;
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr const _OuterIter& __get_outer() const {
+    if constexpr (forward_range<_Base>)
+      return __outer_it_;
+    else
+      return *__parent_->__outer_it_;
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto& __update_inner() {
+    if constexpr (__ref_is_glvalue)
+      return std::__as_lvalue(*__get_outer());
+    else
+      return __parent_->__inner_.__emplace_from([this]() -> decltype(auto) { return *__get_outer(); });
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto& __get_inner() {
+    if constexpr (__ref_is_glvalue)
+      return std::__as_lvalue(*__get_outer());
+    else
+      return *__parent_->__inner_;
+  }
+
+  _LIBCPP_HIDE_FROM_ABI constexpr void __satisfy() {
+    while (true) {
+      if (__inner_it_.index() == 0) {
+        if (std::get<0>(__inner_it_) != ranges::end(__parent_->__pattern_))
+          break;
+
+        __inner_it_.template emplace<1>(ranges::begin(__update_inner()));
+      } else {
+        if (std::get<1>(__inner_it_) != ranges::end(__get_inner()))
+          break;
+
+        if (++__get_outer() == ranges::end(__parent_->__base_)) {
+          if constexpr (__ref_is_glvalue)
+            __inner_it_.template emplace<0>();
+
+          break;
+        }
+
+        __inner_it_.template emplace<0>(ranges::begin(__parent_->__pattern_));
+      }
+    }
+  }
+
+  [[nodiscard]] static consteval auto __get_iterator_concept() noexcept {
+    if constexpr (__ref_is_glvalue && bidirectional_range<_Base> && __bidirectional_common<_InnerBase> &&
+                  __bidirectional_common<_PatternBase>)
+      return bidirectional_iterator_tag{};
+    else if constexpr (__ref_is_glvalue && forward_range<_Base> && forward_range<_InnerBase>)
+      return forward_iterator_tag{};
+    else
+      return input_iterator_tag{};
+  }
+
+public:
+  using iterator_concept = decltype(__get_iterator_concept());
+  using value_type       = common_type_t<iter_value_t<_InnerIter>, iter_value_t<_PatternIter>>;
+  using difference_type =
+      common_type_t<iter_difference_t<_OuterIter>, iter_difference_t<_InnerIter>, iter_difference_t<_PatternIter>>;
+
+  _LIBCPP_HIDE_FROM_ABI __iterator() = default;
+
+  _LIBCPP_HIDE_FROM_ABI constexpr __iterator(__iterator<!_Const> __i)
+    requires _Const && convertible_to<iterator_t<_View>, _OuterIter> &&
+                 convertible_to<iterator_t<_InnerRng>, _InnerIter> && convertible_to<iterator_t<_Pattern>, _PatternIter>
+      : __parent_(__i.__parent_), __outer_it_(std::move(__i.__outer_it_)) {
+    if (__i.__inner_it_.index() == 0) {
+      __inner_it_.template emplace<0>(std::get<0>(std::move(__i.__inner_it_)));
+    } else {
+      __inner_it_.template emplace<1>(std::get<1>(std::move(__i.__inner_it_)));
+    }
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() const {
+    using __reference = common_reference_t<iter_reference_t<_InnerIter>, iter_reference_t<_PatternIter>>;
+    return std::visit([](auto& __it) -> __reference { return *__it; }, __inner_it_);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI constexpr __iterator& operator++() {
+    std::visit([](auto& __it) { ++__it; }, __inner_it_);
+    __satisfy();
+    return *this;
+  }
+
+  _LIBCPP_HIDE_FROM_ABI constexpr void operator++(int) { ++*this; }
+
+  _LIBCPP_HIDE_FROM_ABI constexpr __iterator operator++(int)
+    requires __ref_is_glvalue && forward_iterator<_OuterIter> && forward_iterator<_InnerIter>
+  {
+    __iterator __tmp = *this;
+    ++*this;
+    return __tmp;
+  }
+
+  _LIBCPP_HIDE_FROM_ABI constexpr __iterator& operator--()
+    requires __ref_is_glvalue
+          && bidirectional_range<_Base> && __bidirectional_common<_InnerBase> && __bidirectional_common<_PatternBase>
+  {
+    if (__outer_it_ == ranges::end(__parent_->__base_)) {
+      auto&& __inner = *--__outer_it_;
+      __inner_it_.template emplace<1>(ranges::end(__inner));
+    }
+
+    while (true) {
+      if (__inner_it_.index() == 0) {
+        auto& __it = std::get<0>(__inner_it_);
+        if (__it == ranges::begin(__parent_->__pattern_)) {
+          auto&& __inner = *--__outer_it_;
+          __inner_it_.template emplace<1>(ranges::end(__inner));
+        } else
+          break;
+      } else {
+        auto& __it     = std::get<1>(__inner_it_);
+        auto&& __inner = *__outer_it_;
+        if (__it == ranges::begin(__inner))
+          __inner_it_.template emplace<0>(ranges::end(__parent_->__pattern_));
+        else
+          break;
+      }
+    }
+
+    std::visit([](auto& __it) { --__it; }, __inner_it_);
+    return *this;
+  }
+
+  _LIBCPP_HIDE_FROM_ABI constexpr __iterator operator--(int)
+    requires __ref_is_glvalue
+          && bidirectional_range<_Base> && __bidirectional_common<_InnerBase> && __bidirectional_common<_PatternBase>
+  {
+    __iterator __tmp = *this;
+    --*this;
+    return __tmp;
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const __iterator& __x, const __iterator& __y)
+    requires __ref_is_glvalue && forward_range<_Base> && equality_comparable<_InnerIter>
+  {
+    return __x.__outer_it_ == __y.__outer_it_ && __x.__inner_it_ == __y.__inner_it_;
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI friend constexpr decltype(auto) iter_move(const __iterator& __x) {
+    using __rvalue_reference =
+        common_reference_t<iter_rvalue_reference_t<_InnerIter>, iter_rvalue_reference_t<_PatternIter>>;
+    return std::visit<__rvalue_reference>(ranges::iter_move, __x.__inner_it_);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI friend constexpr void iter_swap(const __iterator& __x, const __iterator& __y)
+    requires indirectly_swappable<_InnerIter, _PatternIter>
+  {
+    std::visit(ranges::iter_swap, __x.__inner_it_, __y.__inner_it_);
+  }
+};
+
+template <input_range _View, forward_range _Pattern>
+  requires view<_View> && input_range<range_reference_t<_View>> && view<_Pattern> &&
+           __concatable<range_reference_t<_View>, _Pattern>
+template <bool _Const>
+struct join_with_view<_View, _Pattern>::__sentinel {
+private:
+  friend join_with_view;
+
+  using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, join_with_view>;
+  using _Base _LIBCPP_NODEBUG   = __maybe_const<_Const, _View>;
+
+  _LIBCPP_NO_UNIQUE_ADDRESS sentinel_t<_Base> __end_ = sentinel_t<_Base>();
+
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel(_Parent& __parent) : __end_(ranges::end(__parent.__base_)) {}
+
+  template <bool _OtherConst>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr auto& __get_outer_of(const __iterator<_OtherConst>& __x) {
+    return __x.__get_outer();
+  }
+
+public:
+  _LIBCPP_HIDE_FROM_ABI __sentinel() = default;
+
+  _LIBCPP_HIDE_FROM_ABI constexpr __sentinel(__sentinel<!_Const> __s)
+    requires _Const && convertible_to<sentinel_t<_View>, sentinel_t<_Base>>
+      : __end_(std::move(__s.__end_)) {}
+
+  template <bool _OtherConst>
+    requires sentinel_for<sentinel_t<_Base>, iterator_t<__maybe_const<_OtherConst, _View>>>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI friend constexpr bool
+  operator==(const __iterator<_OtherConst>& __x, const __sentinel& __y) {
+    return __get_outer_of(__x) == __y.__end_;
+  }
+};
+
+namespace views {
+namespace __join_with_view {
+struct __fn {
+  template <class _Range, class _Pattern>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Range&& __range, _Pattern&& __pattern) const
+      noexcept(noexcept(/**/ join_with_view(std::forward<_Range>(__range), std::forward<_Pattern>(__pattern))))
+          -> decltype(/*--*/ join_with_view(std::forward<_Range>(__range), std::forward<_Pattern>(__pattern))) {
+    return /*-------------*/ join_with_view(std::forward<_Range>(__range), std::forward<_Pattern>(__pattern));
+  }
+
+  template <class _Pattern>
+    requires constructible_from<decay_t<_Pattern>, _Pattern>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Pattern&& __pattern) const
+      noexcept(is_nothrow_constructible_v<decay_t<_Pattern>, _Pattern>) {
+    return __pipeable(std::__bind_back(*this, std::forward<_Pattern>(__pattern)));
+  }
+};
+} // namespace __join_with_view
+
+inline namespace __cpo {
+inline constexpr auto join_with = __join_with_view::__fn{};
+} // namespace __cpo
+} // namespace views
+} // namespace ranges
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___RANGES_JOIN_WITH_VIEW_H
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index f5fd970934e9..4a081e65cb7f 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -1865,6 +1865,7 @@ module std [system] {
     module iota_view                      { header "__ranges/iota_view.h" }
     module istream_view                   { header "__ranges/istream_view.h" }
     module join_view                      { header "__ranges/join_view.h" }
+    module join_with_view                 { header "__ranges/join_with_view.h" }
     module lazy_split_view {
       header "__ranges/lazy_split_view.h"
       export std.functional.bind_back
diff --git a/libcxx/include/ranges b/libcxx/include/ranges
index 49fea7c3f84e..2a6321bd2c5d 100644
--- a/libcxx/include/ranges
+++ b/libcxx/include/ranges
@@ -285,6 +285,15 @@ namespace std::ranges {
     requires view<V> && input_range<range_reference_t<V>>
   class join_view;
 
+  // [range.join.with], join with view
+  template<input_range V, forward_range Pattern>
+    requires view<V> && input_range<range_reference_t<V>>
+          && view<Pattern>
+          && concatable<range_reference_t<V>, Pattern>
+  class join_with_view;                                                     // since C++23
+
+  namespace views { inline constexpr unspecified join_with = unspecified; } // since C++23
+
   // [range.lazy.split], lazy split view
   template<class R>
     concept tiny-range = see below;   // exposition only
@@ -427,6 +436,7 @@ namespace std {
 #    include <__ranges/as_rvalue_view.h>
 #    include <__ranges/chunk_by_view.h>
 #    include <__ranges/from_range.h>
+#    include <__ranges/join_with_view.h>
 #    include <__ranges/repeat_view.h>
 #    include <__ranges/to.h>
 #    include <__ranges/zip_view.h>
diff --git a/libcxx/include/version b/libcxx/include/version
index f430c7edff2b..91fe48351e16 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -519,7 +519,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_ranges_contains                      202207L
 # define __cpp_lib_ranges_find_last                     202207L
 # define __cpp_lib_ranges_iota                          202202L
-// # define __cpp_lib_ranges_join_with                     202202L
+# define __cpp_lib_ranges_join_with                     202202L
 # define __cpp_lib_ranges_repeat                        202207L
 // # define __cpp_lib_ranges_slide                         202202L
 # define __cpp_lib_ranges_starts_ends_with              202106L
diff --git a/libcxx/modules/std/ranges.inc b/libcxx/modules/std/ranges.inc
index a5e2a2b4583c..adabeeb22d55 100644
--- a/libcxx/modules/std/ranges.inc
+++ b/libcxx/modules/std/ranges.inc
@@ -223,13 +223,16 @@ export namespace std {
     namespace views {
       using std::ranges::views::join;
     } // namespace views
-#if 0
+
+#if _LIBCPP_STD_VER >= 23
+    // [range.join.with]
     using std::ranges::join_with_view;
 
     namespace views {
       using std::ranges::views::join_with;
     } // namespace views
-#endif
+#endif // _LIBCPP_STD_VER >= 23
+
     using std::ranges::lazy_split_view;
 
     // [range.split], split view
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/deref.nodiscard.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/deref.nodiscard.verify.cpp
new file mode 100644
index 000000000000..97133613bf58
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/deref.nodiscard.verify.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// Test the libc++ extension that std::ranges::join_with_view::iterator<Const>::operator* is marked as [[nodiscard]].
+
+#include <ranges>
+#include <utility>
+
+void test() {
+  char range[3][2] = {{'x', 'x'}, {'y', 'y'}, {'z', 'z'}};
+  char pattern[2]  = {',', ' '};
+
+  std::ranges::join_with_view view(range, pattern);
+
+  // clang-format off
+  *view.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  *std::as_const(view).begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/eq.nodiscard.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/eq.nodiscard.verify.cpp
new file mode 100644
index 000000000000..823d8def9808
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/eq.nodiscard.verify.cpp
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// Test the libc++ extension that std::ranges::join_with_view::iterator<Const>::operator== is marked as [[nodiscard]].
+
+#include <ranges>
+#include <utility>
+
+void test() {
+  char16_t range[3][1] = {{u'x'}, {u'y'}, {u'z'}};
+  char16_t pattern[1]  = {u'-'};
+
+  std::ranges::join_with_view view(range, pattern);
+
+  // clang-format off
+  (view.begin() == view.end()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  (std::as_const(view).begin() == view.end()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  (view.begin() == std::as_const(view).end()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  (std::as_const(view).begin() == std::as_const(view).end()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_move.nodiscard.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_move.nodiscard.verify.cpp
new file mode 100644
index 000000000000..9e046ef43fda
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_move.nodiscard.verify.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// Test the libc++ extension that std::ranges::join_with_view::iterator<Const>::iter_move is marked as [[nodiscard]].
+
+#include <ranges>
+#include <utility>
+
+void test() {
+  long range[2][1] = {{0L}, {2L}};
+  long pattern[1]  = {1L};
+
+  std::ranges::join_with_view view(range, pattern);
+
+  // clang-format off
+  iter_move(view.begin()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  iter_move(std::as_const(view).begin()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/no_unique_address.compile.pass.cpp
new file mode 100644
index 000000000000..6b2abb5c8053
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.iterator/no_unique_address.compile.pass.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// This test ensures that we use `[[no_unique_address]]` in `join_with_view::iterator`.
+
+#include <cstddef>
+#include <ranges>
+#include <variant>
+
+struct IntRange : std::ranges::view_base {
+  int* begin();
+  int* end();
+};
+
+class Iter {
+public:
+  using value_type      = IntRange;
+  using difference_type = ptrdiff_t;
+
+  Iter& operator++();
+  void operator++(int);
+  value_type& operator*() const;
+  bool operator==(std::default_sentinel_t) const;
+
+private:
+  int* ptr_;
+};
+
+static_assert(std::input_iterator<Iter>);
+static_assert(!std::forward_iterator<Iter>);
+
+struct View : std::ranges::view_base {
+  Iter begin();
+  std::default_sentinel_t end();
+};
+
+static_assert(std::ranges::input_range<View>);
+static_assert(!std::ranges::forward_range<View>);
+
+using JWV = std::ranges::join_with_view<View, IntRange>;
+
+// Expected JWV::iterator layout:
+// _Parent* __parent_;                           // offset: 0
+// [[no_unique_address]] __empty __outer_it;     //         0
+// variant<_PatternIter, _InnerIter> __pattern_; //         sizeof(pointer)
+static_assert(sizeof(std::ranges::iterator_t<JWV>) ==
+              sizeof(void*) + sizeof(std::variant<int*, int*>)); // sizeof(__parent_) + sizeof(__inner_it_)
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.overview/adaptor.nodiscard.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.overview/adaptor.nodiscard.verify.cpp
new file mode 100644
index 000000000000..3efe77a3765d
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.overview/adaptor.nodiscard.verify.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// Test the libc++ extension that std::views::join_with is marked as [[nodiscard]].
+
+#include <ranges>
+
+void test() {
+  int range[3][3]     = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+  int pattern_base[2] = {-1, -1};
+  auto pattern        = std::views::all(pattern_base);
+
+  // clang-format off
+  std::views::join_with(pattern); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::views::join_with(range, pattern); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  range | std::views::join_with(pattern); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::views::reverse | std::views::join_with(pattern); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+  std::views::join_with(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::views::join_with(range, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  range | std::views::join_with(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::views::reverse | std::views::join_with(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.sentinel/eq.nodiscard.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.sentinel/eq.nodiscard.verify.cpp
new file mode 100644
index 000000000000..e7e4d262eedb
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.sentinel/eq.nodiscard.verify.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// Test the libc++ extension that std::ranges::join_with_view::sentinel<Const>::operator== is marked as [[nodiscard]].
+
+#include <array>
+#include <ranges>
+#include <utility>
+
+#include "test_iterators.h"
+#include "test_range.h"
+
+void test() {
+  std::array<test_range<cpp20_input_iterator>, 0> range;
+  std::array<int, 0> pattern;
+
+  std::ranges::join_with_view view(range, pattern);
+  static_assert(!std::ranges::common_range<decltype(view)>);
+
+  // clang-format off
+  (view.begin() == view.end()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  (std::as_const(view).begin() == view.end()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  (view.begin() == std::as_const(view).end()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  (std::as_const(view).begin() == std::as_const(view).end()); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.sentinel/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.sentinel/no_unique_address.compile.pass.cpp
new file mode 100644
index 000000000000..3badac162ce7
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.sentinel/no_unique_address.compile.pass.cpp
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// XFAIL: msvc
+
+// <ranges>
+
+// This test ensures that we use `[[no_unique_address]]` in `join_with_view::sentinel`.
+
+#include <cstddef>
+#include <ranges>
+#include <string_view>
+
+template <bool Const>
+struct Iter {
+  using value_type      = std::string_view;
+  using difference_type = std::ptrdiff_t;
+
+  Iter& operator++();
+  Iter operator++(int);
+  value_type& operator*() const;
+  bool operator==(const Iter&) const;
+  bool operator==(std::default_sentinel_t) const;
+};
+
+struct View : std::ranges::view_base {
+  Iter<false> begin();
+  Iter<true> begin() const;
+  std::default_sentinel_t end() const;
+};
+
+using JWV = std::ranges::join_with_view<View, std::string_view>;
+
+template <class View>
+struct Test {
+  [[no_unique_address]] std::ranges::sentinel_t<View> se;
+  unsigned char pad;
+};
+
+static_assert(sizeof(Test<JWV>) == 1);
+static_assert(sizeof(Test<const JWV>) == 1);
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/base.nodiscard.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/base.nodiscard.verify.cpp
new file mode 100644
index 000000000000..ddf1ebdc5e46
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/base.nodiscard.verify.cpp
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// Test the libc++ extension that std::ranges::join_with_view::base is marked as [[nodiscard]].
+
+#include <ranges>
+#include <utility>
+
+void test() {
+  int range[3][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+  int pattern[2]  = {-1, -1};
+
+  std::ranges::join_with_view view(range, pattern);
+
+  // clang-format off
+  view.base(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::as_const(view).base(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::move(std::as_const(view)).base(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::move(view).base(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/begin.nodiscard.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/begin.nodiscard.verify.cpp
new file mode 100644
index 000000000000..858490a82c75
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/begin.nodiscard.verify.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// Test the libc++ extension that std::ranges::join_with_view::begin is marked as [[nodiscard]].
+
+#include <ranges>
+#include <utility>
+
+void test() {
+  int range[3][2] = {{1, 3}, {4, 6}, {7, 9}};
+  int pattern[1]  = {-2};
+
+  std::ranges::join_with_view view(range, pattern);
+
+  // clang-format off
+  view.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::as_const(view).begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/end.nodiscard.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/end.nodiscard.verify.cpp
new file mode 100644
index 000000000000..e57e0ee3f0d0
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/end.nodiscard.verify.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// Test the libc++ extension that std::ranges::join_with_view::end is marked as [[nodiscard]].
+
+#include <ranges>
+#include <utility>
+
+void test() {
+  int range[3][2] = {{1, 2}, {4, 5}, {7, 8}};
+  int pattern[1]  = {-3};
+
+  std::ranges::join_with_view view(range, pattern);
+
+  // clang-format off
+  view.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::as_const(view).end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/no_unique_address.compile.pass.cpp
new file mode 100644
index 000000000000..aa6eeafe4be1
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.join.with/range.join.with.view/no_unique_address.compile.pass.cpp
@@ -0,0 +1,47 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// XFAIL: msvc
+
+// <ranges>
+
+// This test ensures that we use `[[no_unique_address]]` in `join_with_view`.
+
+#include <ranges>
+#include <string>
+
+struct ForwardView : std::ranges::view_base {
+  std::string* begin() const;
+  std::string* end() const;
+};
+
+static_assert(std::ranges::forward_range<ForwardView>);
+static_assert(std::is_reference_v<std::ranges::range_reference_t<ForwardView>>);
+
+struct Pattern : std::ranges::view_base {
+  char* begin() const;
+  char* end() const;
+};
+
+template <class View>
+struct Test {
+  [[no_unique_address]] View view;
+  unsigned char pad;
+};
+
+using JWV = std::ranges::join_with_view<ForwardView, Pattern>;
+
+// Expected JWV layout:
+// [[no_unique_address]] _View __base_             // offset: 0
+// [[no_unique_address]] __empty_cache __outer_it; //         0
+// [[no_unique_address]] __empty_cache __inner_;   //         1
+// [[no_unique_address]] _Patter __pattern_        //         0
+static_assert(sizeof(JWV) == 2);
+static_assert(sizeof(Test<JWV>) == 2);
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp
index c7c8112e123c..4cf5178dd7b8 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp
@@ -278,17 +278,11 @@
 #    error "__cpp_lib_ranges_concat should not be defined before c++26"
 #  endif
 
-#  if !defined(_LIBCPP_VERSION)
-#    ifndef __cpp_lib_ranges_join_with
-#      error "__cpp_lib_ranges_join_with should be defined in c++23"
-#    endif
-#    if __cpp_lib_ranges_join_with != 202202L
-#      error "__cpp_lib_ranges_join_with should have the value 202202L in c++23"
-#    endif
-#  else
-#    ifdef __cpp_lib_ranges_join_with
-#      error "__cpp_lib_ranges_join_with should not be defined because it is unimplemented in libc++!"
-#    endif
+#  ifndef __cpp_lib_ranges_join_with
+#    error "__cpp_lib_ranges_join_with should be defined in c++23"
+#  endif
+#  if __cpp_lib_ranges_join_with != 202202L
+#    error "__cpp_lib_ranges_join_with should have the value 202202L in c++23"
 #  endif
 
 #  ifndef __cpp_lib_ranges_repeat
@@ -406,17 +400,11 @@
 #    endif
 #  endif
 
-#  if !defined(_LIBCPP_VERSION)
-#    ifndef __cpp_lib_ranges_join_with
-#      error "__cpp_lib_ranges_join_with should be defined in c++26"
-#    endif
-#    if __cpp_lib_ranges_join_with != 202202L
-#      error "__cpp_lib_ranges_join_with should have the value 202202L in c++26"
-#    endif
-#  else
-#    ifdef __cpp_lib_ranges_join_with
-#      error "__cpp_lib_ranges_join_with should not be defined because it is unimplemented in libc++!"
-#    endif
+#  ifndef __cpp_lib_ranges_join_with
+#    error "__cpp_lib_ranges_join_with should be defined in c++26"
+#  endif
+#  if __cpp_lib_ranges_join_with != 202202L
+#    error "__cpp_lib_ranges_join_with should have the value 202202L in c++26"
 #  endif
 
 #  ifndef __cpp_lib_ranges_repeat
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index cef334f70c07..e54671914223 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -5633,17 +5633,11 @@
 #    error "__cpp_lib_ranges_iota should have the value 202202L in c++23"
 #  endif
 
-#  if !defined(_LIBCPP_VERSION)
-#    ifndef __cpp_lib_ranges_join_with
-#      error "__cpp_lib_ranges_join_with should be defined in c++23"
-#    endif
-#    if __cpp_lib_ranges_join_with != 202202L
-#      error "__cpp_lib_ranges_join_with should have the value 202202L in c++23"
-#    endif
-#  else
-#    ifdef __cpp_lib_ranges_join_with
-#      error "__cpp_lib_ranges_join_with should not be defined because it is unimplemented in libc++!"
-#    endif
+#  ifndef __cpp_lib_ranges_join_with
+#    error "__cpp_lib_ranges_join_with should be defined in c++23"
+#  endif
+#  if __cpp_lib_ranges_join_with != 202202L
+#    error "__cpp_lib_ranges_join_with should have the value 202202L in c++23"
 #  endif
 
 #  ifndef __cpp_lib_ranges_repeat
@@ -7549,17 +7543,11 @@
 #    error "__cpp_lib_ranges_iota should have the value 202202L in c++26"
 #  endif
 
-#  if !defined(_LIBCPP_VERSION)
-#    ifndef __cpp_lib_ranges_join_with
-#      error "__cpp_lib_ranges_join_with should be defined in c++26"
-#    endif
-#    if __cpp_lib_ranges_join_with != 202202L
-#      error "__cpp_lib_ranges_join_with should have the value 202202L in c++26"
-#    endif
-#  else
-#    ifdef __cpp_lib_ranges_join_with
-#      error "__cpp_lib_ranges_join_with should not be defined because it is unimplemented in libc++!"
-#    endif
+#  ifndef __cpp_lib_ranges_join_with
+#    error "__cpp_lib_ranges_join_with should be defined in c++26"
+#  endif
+#  if __cpp_lib_ranges_join_with != 202202L
+#    error "__cpp_lib_ranges_join_with should have the value 202202L in c++26"
 #  endif
 
 #  ifndef __cpp_lib_ranges_repeat
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/ctor.default.pass.cpp
new file mode 100644
index 000000000000..0d9df493305d
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/ctor.default.pass.cpp
@@ -0,0 +1,79 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// iterator() = default;
+
+#include <ranges>
+
+#include <cassert>
+#include <initializer_list>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+#include "../types.h"
+#include "test_comparisons.h"
+#include "test_iterators.h"
+
+constexpr bool test() {
+  { // `V` and `Pattern` model forward range
+    using Inner     = BasicVectorView<int, ViewProperties{}, forward_iterator>;
+    using V         = BasicVectorView<Inner, ViewProperties{}, forward_iterator>;
+    using Pattern   = Inner;
+    using JWV       = std::ranges::join_with_view<V, Pattern>;
+    using Iter      = std::ranges::iterator_t<JWV>;
+    using ConstIter = std::ranges::iterator_t<const JWV>;
+
+    // Default constructor of iterator<false> should not be explicit
+    Iter iter = {};
+    assert(testEquality(iter, Iter{}, true));
+
+    // Default constructor of iterator<true> should not be explicit
+    ConstIter citer = {};
+    assert(testEquality(citer, ConstIter{}, true));
+    assert(testEquality(iter, citer, true));
+
+    std::ranges::join_with_view<V, Pattern> jwv(V{Inner{1, 2}, Inner{2, 1}}, Pattern{3, 3});
+    Iter jwv_iter       = jwv.begin();
+    ConstIter jwv_citer = std::as_const(jwv).begin();
+    assert(testEquality(jwv_iter, jwv_citer, true));
+
+    assert(testEquality(jwv_iter, iter, false));
+    assert(testEquality(jwv_iter, citer, false));
+    assert(testEquality(jwv_citer, iter, false));
+    assert(testEquality(jwv_citer, citer, false));
+  }
+
+  { // `InnerIter` is not default constructible (does not model forward iterator, JWV cannot be const-accessed)
+    using Inner   = BasicVectorView<char, ViewProperties{.common = false}, EqComparableInputIter>;
+    using V       = BasicVectorView<Inner, ViewProperties{.common = false}, forward_iterator>;
+    using Pattern = BasicVectorView<char, ViewProperties{}, forward_iterator>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+
+    Iter iter;
+    assert(testEquality(iter, Iter{}, true));
+
+    std::ranges::join_with_view<V, Pattern> jwv(V{Inner{'a', 'b'}, Inner{'c', 'd'}}, Pattern{',', ' '});
+    Iter jwv_iter = jwv.begin();
+    assert(testEquality(jwv_iter, iter, false));
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/ctor.not_const.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/ctor.not_const.pass.cpp
new file mode 100644
index 000000000000..0ca31392ed8f
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/ctor.not_const.pass.cpp
@@ -0,0 +1,111 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// constexpr iterator(iterator<!Const> i)
+//   requires Const && convertible_to<iterator_t<V>, OuterIter> &&
+//            convertible_to<iterator_t<InnerRng>, InnerIter> &&
+//            convertible_to<iterator_t<Pattern>, PatternIter>;
+
+#include <ranges>
+
+#include <cassert>
+#include <vector>
+
+#include "../types.h"
+
+constexpr bool test() {
+  { // Regular conversion from `!Const` to `Const` iterator
+    std::vector<std::vector<int>> vec = {{1, 2}, {3, 4}, {5, 6}};
+    int pattern                       = 0;
+    std::ranges::join_with_view jwv(vec, pattern);
+
+    using JWV   = decltype(jwv);
+    using Iter  = std::ranges::iterator_t<JWV>;
+    using CIter = std::ranges::iterator_t<const JWV>;
+    static_assert(!std::same_as<Iter, CIter>);
+    static_assert(std::convertible_to<Iter, CIter>);
+    static_assert(std::constructible_from<CIter, Iter>);
+
+    Iter it = jwv.begin();
+    assert(*it == 1);
+
+    const CIter cit1 = it; // `cit1` points to element of `V`; this constructor should not be explicit
+    assert(*cit1 == 1);
+    assert(cit1 == it);
+
+    std::ranges::advance(it, 2);
+    assert(*it == 0);
+    CIter cit2 = it; // `cit2` points to element of `Pattern`
+    assert(*cit2 == 0);
+    assert(cit2 == it);
+
+    ++it;
+    assert(*it == 3);
+    CIter cit3 = it;
+    assert(*cit3 == 3);
+    assert(cit3 == it);
+
+    --cit3;
+    assert(cit2 == cit3);
+  }
+
+  { // Test conversion from `Const` to `!Const` (should be invalid)
+    using V       = std::vector<std::vector<int>>;
+    using Pattern = std::ranges::single_view<int>;
+    using JWV     = std::ranges::join_with_view<std::views::all_t<V>, Pattern>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+    using CIter   = std::ranges::iterator_t<const JWV>;
+    static_assert(!std::convertible_to<CIter, Iter>);
+    static_assert(!std::constructible_from<Iter, CIter>);
+  }
+
+  { // When `convertible_to<iterator_t<V>, OuterIter>` is not modeled
+    using Inner   = std::vector<short>;
+    using V       = ConstOppositeView<Inner>;
+    using Pattern = std::ranges::single_view<short>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+    using CIter   = std::ranges::iterator_t<const JWV>;
+    static_assert(!std::convertible_to<CIter, Iter>);
+    static_assert(!std::constructible_from<Iter, CIter>);
+  }
+
+  { // When `convertible_to<iterator_t<InnerRng>, InnerIter>` is not modeled
+    using Inner   = ConstOppositeView<long>;
+    using V       = std::vector<Inner>;
+    using Pattern = std::ranges::single_view<long>;
+    using JWV     = std::ranges::join_with_view<std::views::all_t<V>, Pattern>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+    using CIter   = std::ranges::iterator_t<const JWV>;
+    static_assert(!std::convertible_to<CIter, Iter>);
+    static_assert(!std::constructible_from<Iter, CIter>);
+  }
+
+  { // When `convertible_to<iterator_t<Pattern>, PatternIter>` is not modeled
+    using V       = std::vector<std::vector<long long>>;
+    using Pattern = ConstOppositeView<long long>;
+    using JWV     = std::ranges::join_with_view<std::views::all_t<V>, Pattern>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+    using CIter   = std::ranges::iterator_t<const JWV>;
+    static_assert(!std::convertible_to<CIter, Iter>);
+    static_assert(!std::constructible_from<Iter, CIter>);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/decrement.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/decrement.pass.cpp
new file mode 100644
index 000000000000..207d60a7296f
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/decrement.pass.cpp
@@ -0,0 +1,283 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// constexpr iterator& operator--()
+//   requires ref-is-glvalue && bidirectional_range<Base> &&
+//            bidirectional-common<InnerBase> && bidirectional-common<PatternBase>;
+// constexpr iterator operator--(int)
+//   requires ref-is-glvalue && bidirectional_range<Base> &&
+//            bidirectional-common<InnerBase> && bidirectional-common<PatternBase>;
+
+#include <ranges>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <forward_list>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "../types.h"
+
+template <class I>
+concept CanPreDecrement = requires(I& i) {
+  { --i } -> std::same_as<I&>;
+};
+
+template <class I>
+concept CanPostDecrement = requires(I& i) {
+  { i-- } -> std::same_as<I>;
+};
+
+template <class I>
+concept CanDecrement = CanPreDecrement<I> && CanPostDecrement<I>;
+
+constexpr bool test() {
+  { // `V` and `Pattern` are not empty. Test return type too.
+    using V       = std::ranges::owning_view<std::vector<std::string>>;
+    using Pattern = std::ranges::single_view<char>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    using Iter  = std::ranges::iterator_t<JWV>;
+    using CIter = std::ranges::iterator_t<const JWV>;
+    static_assert(CanDecrement<Iter>);
+    static_assert(CanDecrement<CIter>);
+
+    JWV jwv(V{{"01", "23", "45"}}, Pattern{'_'});
+
+    {
+      auto it                                   = jwv.end();
+      std::same_as<Iter&> decltype(auto) it_ref = --it;
+      assert(it_ref == it);
+      assert(*it == '5');
+      std::same_as<Iter> decltype(auto) it_copy = it--;
+      assert(--it_copy == it);
+      --it;
+      assert(*it == '_');
+      it--;
+      assert(*it == '3');
+      --it;
+      it--;
+      assert(*it == '_');
+    }
+
+    {
+      auto cit                                    = std::as_const(jwv).end();
+      std::same_as<CIter&> decltype(auto) cit_ref = --cit;
+      assert(cit_ref == cit);
+      assert(*cit == '5');
+      std::same_as<CIter> decltype(auto) cit_copy = cit--;
+      assert(--cit_copy == cit);
+      --cit;
+      assert(*cit == '_');
+      cit--;
+      assert(*cit == '3');
+      --cit;
+      cit--;
+      assert(*cit == '_');
+    }
+
+    assert(std::ranges::equal(std::views::reverse(std::move(jwv)), std::string_view{"54_32_10"}));
+  }
+
+  { // `Pattern` is empty, `V` is not.
+    using Inner   = std::array<int, 1>;
+    using V       = std::ranges::owning_view<std::array<Inner, 3>>;
+    using Pattern = std::ranges::owning_view<std::array<int, 0>>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    JWV jwv(V{{Inner{-9}, Inner{-99}, Inner{-999}}}, Pattern{});
+
+    {
+      auto it = jwv.end();
+      --it;
+      assert(*it == -999);
+      it--;
+      assert(*it == -99);
+      --it;
+      assert(*it == -9);
+      assert(it == jwv.begin());
+    }
+
+    {
+      auto cit = std::as_const(jwv).end();
+      --cit;
+      assert(*cit == -999);
+      cit--;
+      assert(*cit == -99);
+      --cit;
+      assert(*cit == -9);
+      assert(cit == std::as_const(jwv).begin());
+    }
+  }
+
+#if !defined(TEST_COMPILER_GCC) // GCC c++/101777
+  { // `V` has empty subrange in the middle, `Pattern` is not empty. Try to go back and forth.
+    using V       = std::array<std::vector<int>, 3>;
+    using Pattern = std::ranges::single_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, Pattern>;
+
+    JWV jwv(V{{{5}, {}, {125}}}, Pattern{1});
+
+    {
+      auto it = jwv.end();
+      --it;
+      assert(*it == 125);
+      it--;
+      assert(*it == 1);
+      --it;
+      assert(*it == 1);
+      it--;
+      assert(*it == 5);
+      ++it;
+      assert(*it == 1);
+      --it;
+      assert(*it == 5);
+      std::ranges::advance(it, 4);
+      it--;
+      assert(*it == 125);
+    }
+
+    {
+      auto cit = std::as_const(jwv).end();
+      --cit;
+      assert(*cit == 125);
+      cit--;
+      assert(*cit == 1);
+      --cit;
+      assert(*cit == 1);
+      cit--;
+      assert(*cit == 5);
+      ++cit;
+      assert(*cit == 1);
+      --cit;
+      assert(*cit == 5);
+      std::ranges::advance(cit, 4);
+      cit--;
+      assert(*cit == 125);
+    }
+  }
+
+  { // Only first element of `V` is not empty. `Pattern` is empty. Try to go back and forth.
+    using Inner   = std::vector<int>;
+    using V       = std::ranges::owning_view<std::array<Inner, 3>>;
+    using Pattern = std::ranges::empty_view<int>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    JWV jwv(V{{Inner{999}, {}, {}}}, Pattern{});
+
+    {
+      auto it = jwv.end();
+      --it;
+      assert(*it == 999);
+      ++it;
+      assert(it == jwv.end());
+      it--;
+      assert(*it == 999);
+    }
+
+    {
+      auto cit = std::as_const(jwv).end();
+      --cit;
+      assert(*cit == 999);
+      ++cit;
+      assert(cit == std::as_const(jwv).end());
+      cit--;
+      assert(*cit == 999);
+    }
+  }
+#endif // !defined(TEST_COMPILER_GCC)
+
+  { // `ref-is-glvalue` is false
+    using V       = RvalueVector<std::vector<int>>;
+    using Pattern = std::ranges::empty_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+    static_assert(!CanPreDecrement<Iter>);
+    static_assert(!CanPostDecrement<Iter>);
+  }
+
+  { // `Base` does not model bidirectional range
+    using V       = std::ranges::owning_view<std::forward_list<std::vector<int>>>;
+    using Pattern = std::ranges::single_view<int>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+    using CIter   = std::ranges::iterator_t<const JWV>;
+    static_assert(!CanPreDecrement<Iter>);
+    static_assert(!CanPostDecrement<Iter>);
+    static_assert(!CanPreDecrement<CIter>);
+    static_assert(!CanPostDecrement<CIter>);
+  }
+
+  {   // InnerBase does not model bidirectional-common
+    { // InnerBase does not model bidirectional range
+      using V       = std::ranges::owning_view<std::vector<std::forward_list<int>>>;
+      using Pattern = std::ranges::single_view<int>;
+      using JWV     = std::ranges::join_with_view<V, Pattern>;
+      using Iter    = std::ranges::iterator_t<JWV>;
+      using CIter   = std::ranges::iterator_t<const JWV>;
+      static_assert(!CanPreDecrement<Iter>);
+      static_assert(!CanPostDecrement<Iter>);
+      static_assert(!CanPreDecrement<CIter>);
+      static_assert(!CanPostDecrement<CIter>);
+    }
+
+    { // InnerBase does not model common range
+      using InnerBase = BasicVectorView<int, ViewProperties{.common = false}, bidirectional_iterator>;
+      using V         = std::ranges::owning_view<std::vector<InnerBase>>;
+      using Pattern   = std::ranges::single_view<int>;
+      using JWV       = std::ranges::join_with_view<V, Pattern>;
+      using Iter      = std::ranges::iterator_t<JWV>;
+      using CIter     = std::ranges::iterator_t<const JWV>;
+      static_assert(!CanPreDecrement<Iter>);
+      static_assert(!CanPostDecrement<Iter>);
+      static_assert(!CanPreDecrement<CIter>);
+      static_assert(!CanPostDecrement<CIter>);
+    }
+  }
+
+  {   // PatternBase does not model bidirectional-common
+    { // PatternBase does not model bidirectional range
+      using V       = std::ranges::owning_view<std::vector<std::vector<int>>>;
+      using Pattern = std::ranges::owning_view<std::forward_list<int>>;
+      using JWV     = std::ranges::join_with_view<V, Pattern>;
+      using Iter    = std::ranges::iterator_t<JWV>;
+      using CIter   = std::ranges::iterator_t<const JWV>;
+      static_assert(!CanPreDecrement<Iter>);
+      static_assert(!CanPostDecrement<Iter>);
+      static_assert(!CanPreDecrement<CIter>);
+      static_assert(!CanPostDecrement<CIter>);
+    }
+
+    { // PatternBase does not model common range
+      using V       = std::ranges::owning_view<std::vector<std::vector<int>>>;
+      using Pattern = BasicVectorView<int, ViewProperties{.common = false}, bidirectional_iterator>;
+      using JWV     = std::ranges::join_with_view<V, Pattern>;
+      using Iter    = std::ranges::iterator_t<JWV>;
+      using CIter   = std::ranges::iterator_t<const JWV>;
+      static_assert(!CanPreDecrement<Iter>);
+      static_assert(!CanPostDecrement<Iter>);
+      static_assert(!CanPreDecrement<CIter>);
+      static_assert(!CanPostDecrement<CIter>);
+    }
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/deref.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/deref.pass.cpp
new file mode 100644
index 000000000000..b2eeddd2941d
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/deref.pass.cpp
@@ -0,0 +1,225 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// constexpr decltype(auto) operator*() const;
+
+#include <ranges>
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "../types.h"
+
+struct ProxyRef {
+  int& val;
+};
+
+class CommonProxyRef {
+public:
+  constexpr CommonProxyRef(ProxyRef i) : val(i.val) {}
+  constexpr CommonProxyRef(int i) : val(i) {}
+
+  constexpr int get() const { return val; }
+
+private:
+  int val;
+};
+
+template <template <class> class TQual, template <class> class UQual>
+struct std::basic_common_reference<ProxyRef, int, TQual, UQual> {
+  using type = CommonProxyRef;
+};
+
+template <template <class> class TQual, template <class> class UQual>
+struct std::basic_common_reference<int, ProxyRef, TQual, UQual> {
+  using type = CommonProxyRef;
+};
+
+static_assert(std::common_reference_with<int&, ProxyRef>);
+static_assert(std::common_reference_with<int&, CommonProxyRef>);
+
+class ProxyIter {
+public:
+  using value_type      = int;
+  using difference_type = std::ptrdiff_t;
+
+  constexpr ProxyIter() : ptr_(nullptr) {}
+  constexpr explicit ProxyIter(int* p) : ptr_(p) {}
+
+  constexpr ProxyRef operator*() const { return ProxyRef{*ptr_}; }
+
+  constexpr ProxyIter& operator++() {
+    ++ptr_;
+    return *this;
+  }
+
+  constexpr ProxyIter operator++(int) {
+    ProxyIter tmp = *this;
+    ++ptr_;
+    return tmp;
+  }
+
+  constexpr bool operator==(const ProxyIter& other) const { return ptr_ == other.ptr_; }
+
+private:
+  int* ptr_;
+};
+
+static_assert(std::forward_iterator<ProxyIter>);
+
+constexpr bool test() {
+  { // Result of `operator*` is (maybe const) lvalue reference
+    using V       = std::ranges::owning_view<std::vector<std::string>>;
+    using Pattern = std::ranges::owning_view<std::string>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    JWV jwv(V{{"ab", "cd", "ef"}}, Pattern{"><"});
+
+    {
+      auto it                                  = jwv.begin();
+      std::same_as<char&> decltype(auto) v_ref = *std::as_const(it);
+      assert(v_ref == 'a');
+      std::ranges::advance(it, 2);
+      std::same_as<char&> decltype(auto) pattern_ref = *it;
+      assert(pattern_ref == '>');
+    }
+
+    {
+      auto cit                                        = std::as_const(jwv).begin();
+      std::same_as<const char&> decltype(auto) cv_ref = *cit;
+      assert(cv_ref == 'a');
+      std::ranges::advance(cit, 3);
+      std::same_as<const char&> decltype(auto) cpattern_ref = *std::as_const(cit);
+      assert(cpattern_ref == '<');
+    }
+  }
+
+  { // Result of `operator*` is const lvalue reference
+    using V       = std::ranges::owning_view<std::vector<std::string_view>>;
+    using Pattern = std::string_view;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    JWV jwv(V{{"123", "456", "789"}}, Pattern{"._."});
+
+    {
+      auto it                                        = jwv.begin();
+      std::same_as<const char&> decltype(auto) v_ref = *it;
+      assert(v_ref == '1');
+      std::ranges::advance(it, 3);
+      std::same_as<const char&> decltype(auto) pattern_ref = *std::as_const(it);
+      assert(pattern_ref == '.');
+    }
+
+    {
+      auto cit                                        = std::as_const(jwv).begin();
+      std::same_as<const char&> decltype(auto) cv_ref = *std::as_const(cit);
+      assert(cv_ref == '1');
+      std::ranges::advance(cit, 4);
+      std::same_as<const char&> decltype(auto) cpattern_ref = *cit;
+      assert(cpattern_ref == '_');
+    }
+  }
+
+  { // Result of `operator*` is prvalue
+    using V       = std::vector<std::string_view>;
+    using Pattern = RvalueVector<char>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    JWV jwv(V{"x^2", "y^2", "z^2"}, Pattern{{' ', '+', ' '}});
+
+    {
+      auto it                                 = jwv.begin();
+      std::same_as<char> decltype(auto) v_ref = *std::as_const(it);
+      assert(v_ref == 'x');
+      std::ranges::advance(it, 3);
+      std::same_as<char> decltype(auto) pattern_ref = *it;
+      assert(pattern_ref == ' ');
+    }
+
+    {
+      auto cit                                 = std::as_const(jwv).begin();
+      std::same_as<char> decltype(auto) cv_ref = *cit;
+      assert(cv_ref == 'x');
+      std::ranges::advance(cit, 4);
+      std::same_as<char> decltype(auto) cpattern_ref = *std::as_const(cit);
+      assert(cpattern_ref == '+');
+    }
+  }
+
+  { // Result of `operator*` is (maybe const) rvalue reference
+    using Inner   = std::ranges::as_rvalue_view<std::ranges::owning_view<std::string>>;
+    using V       = std::ranges::owning_view<std::vector<Inner>>;
+    using Pattern = std::ranges::as_rvalue_view<std::ranges::owning_view<std::array<char, 2>>>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    std::vector<Inner> vec;
+    vec.emplace_back(Inner{{"x*y"}});
+    vec.emplace_back(Inner{{"y*z"}});
+    vec.emplace_back(Inner{{"z*x"}});
+    JWV jwv(V(std::move(vec)), Pattern(std::array{',', ' '}));
+
+    {
+      auto it                                   = jwv.begin();
+      std::same_as<char&&> decltype(auto) v_ref = *it;
+      assert(v_ref == 'x');
+      std::ranges::advance(it, 3);
+      std::same_as<char&&> decltype(auto) pattern_ref = *std::as_const(it);
+      assert(pattern_ref == ',');
+    }
+
+    {
+      auto cit                                         = std::as_const(jwv).begin();
+      std::same_as<const char&&> decltype(auto) cv_ref = *std::as_const(cit);
+      assert(cv_ref == 'x');
+      std::ranges::advance(cit, 4);
+      std::same_as<const char&&> decltype(auto) cpattern_ref = *cit;
+      assert(cpattern_ref == ' ');
+    }
+  }
+
+  { // Result of `operator*` is type different from range_reference_t<InnerRng> and range_reference_t<Pattern>
+    using Inner   = std::vector<int>;
+    using V       = std::vector<Inner>;
+    using Pattern = std::ranges::subrange<ProxyIter, ProxyIter>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, Pattern>;
+
+    static_assert(!std::same_as<std::ranges::range_reference_t<V>, std::ranges::range_reference_t<JWV>>);
+    static_assert(!std::same_as<std::ranges::range_reference_t<Pattern>, std::ranges::range_reference_t<JWV>>);
+
+    std::array<int, 2> pattern = {-1, -1};
+    Pattern pattern_as_subrange(ProxyIter{pattern.data()}, ProxyIter{pattern.data() + pattern.size()});
+
+    JWV jwv(V{Inner{1, 1}, Inner{2, 2}, Inner{3, 3}}, pattern_as_subrange);
+
+    auto it                                           = jwv.begin();
+    std::same_as<CommonProxyRef> decltype(auto) v_ref = *it;
+    assert(v_ref.get() == 1);
+    std::ranges::advance(it, 7);
+    std::same_as<CommonProxyRef> decltype(auto) pattern_ref = *std::as_const(it);
+    assert(pattern_ref.get() == -1);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/eq.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/eq.pass.cpp
new file mode 100644
index 000000000000..daf07c854933
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/eq.pass.cpp
@@ -0,0 +1,259 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// friend constexpr bool operator==(const iterator& x, const iterator& y)
+//   requires ref-is-glvalue && forward_range<Base> &&
+//            equality_comparable<InnerIter>;
+
+#include <ranges>
+
+#include <array>
+#include <cassert>
+#include <utility>
+
+#include "../types.h"
+#include "test_comparisons.h"
+
+template <class I1, class I2 = I1>
+concept CanEq = requires(const I1& i1, const I2& i2) {
+  { i1 == i2 } -> std::same_as<bool>;
+  { i2 == i1 } -> std::same_as<bool>;
+  { i1 != i2 } -> std::same_as<bool>;
+  { i2 != i1 } -> std::same_as<bool>;
+};
+
+constexpr bool test() {
+  { // `V` and `Pattern` are not empty. Test return types too.
+    using V       = std::array<std::array<int, 2>, 3>;
+    using Pattern = std::array<long, 1>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    using Iter  = std::ranges::iterator_t<JWV>;
+    using CIter = std::ranges::iterator_t<const JWV>;
+    static_assert(!std::same_as<Iter, CIter>);
+    static_assert(CanEq<Iter>);
+    static_assert(CanEq<CIter>);
+    static_assert(CanEq<Iter, CIter>);
+
+    JWV jwv(V{{{9, 8}, {7, 6}, {5, 4}}}, Pattern{0L});
+
+    Iter it1 = jwv.begin();
+    assert(*it1 == 9);
+    assert(testEquality(it1, it1, true));
+
+    Iter it2 = std::ranges::prev(jwv.end());
+    assert(*it2 == 4);
+    assert(testEquality(it2, it2, true));
+    assert(testEquality(it1, it2, false));
+
+    CIter cit1 = std::as_const(jwv).begin();
+    assert(*cit1 == 9);
+    assert(testEquality(cit1, cit1, true));
+    assert(testEquality(it1, cit1, true));
+    assert(testEquality(it2, cit1, false));
+
+    CIter cit2 = std::ranges::prev(std::as_const(jwv).end());
+    assert(*cit2 == 4);
+    assert(testEquality(cit2, cit2, true));
+    assert(testEquality(cit1, cit2, false));
+    assert(testEquality(it1, cit2, false));
+    assert(testEquality(it2, cit2, true));
+
+    // `it1.inner_it_` and `it2.inner_it_` are equal, but `it1.outer_it_` and `it2.outer_it_` are not.
+    std::ranges::advance(it1, 2);
+    assert(*it1 == 0);
+    std::ranges::advance(it2, -2);
+    assert(*it2 == 0);
+    assert(testEquality(it1, it2, false));
+
+    // `cit1.inner_it_` and `cit2.inner_it_` are equal, but `cit1.outer_it_` and `cit2.outer_it_` are not.
+    std::ranges::advance(cit1, 2);
+    assert(*cit1 == 0);
+    assert(testEquality(it1, cit1, true));
+    std::ranges::advance(cit2, -2);
+    assert(*cit2 == 0);
+    assert(testEquality(it2, cit2, true));
+    assert(testEquality(cit1, cit2, false));
+
+    // `it1.inner_it_` and `it2.inner_it_` are equal, `it1.outer_it_` and `it2.outer_it_` are equal too.
+    // `it1.inner_it_index()` and `it2.inner_it_index()` are equal to 1.
+    ++it1;
+    assert(*it1 == 7);
+    std::ranges::advance(it2, -2);
+    assert(*it2 == 7);
+    assert(testEquality(it1, it2, true));
+
+    // `cit1.inner_it_` and `cit2.inner_it_` are equal, `cit1.outer_it_` and `cit2.outer_it_` are equal too.
+    // `cit1.inner_it_index()` and `cit2.inner_it_index()` are equal to 1.
+    ++cit1;
+    assert(*cit1 == 7);
+    assert(testEquality(it1, cit1, true));
+    std::ranges::advance(cit2, -2);
+    assert(*cit2 == 7);
+    assert(testEquality(it2, cit2, true));
+    assert(testEquality(cit1, cit2, true));
+
+    // `it1.inner_it_` and `it2.inner_it_` are equal, `it1.outer_it_` and `it2.outer_it_` are equal too.
+    // `it1.inner_it_index()` and `it2.inner_it_index()` are equal to 0.
+    --it1;
+    assert(*it1 == 0);
+    --it2;
+    assert(*it2 == 0);
+    assert(testEquality(it1, it2, true));
+
+    // `cit1.inner_it_` and `cit2.inner_it_` are equal, `cit1.outer_it_` and `cit2.outer_it_` are equal too.
+    // `cit1.inner_it_index()` and `cit2.inner_it_index()` are equal to 0.
+    --cit1;
+    assert(*cit1 == 0);
+    assert(testEquality(it1, cit1, true));
+    --cit2;
+    assert(*cit2 == 0);
+    assert(testEquality(it2, cit2, true));
+    assert(testEquality(cit2, cit2, true));
+  }
+
+  { // `InnerIter` models input iterator and equality comparable. `Pattern` is empty.
+    using Inner   = BasicVectorView<int, ViewProperties{.common = false}, EqComparableInputIter>;
+    using V       = std::vector<Inner>;
+    using Pattern = std::ranges::empty_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    using Iter  = std::ranges::iterator_t<JWV>;
+    using CIter = std::ranges::iterator_t<const JWV>;
+    static_assert(!std::same_as<Iter, CIter>);
+    static_assert(CanEq<Iter>);
+    static_assert(CanEq<CIter>);
+    static_assert(!CanEq<CIter, Iter>);
+
+    JWV jwv(V{Inner{1, 2}, Inner{5, 6}, Inner{9, 0}}, Pattern{});
+
+    {
+      Iter it1 = jwv.begin();
+      assert(*it1 == 1);
+      Iter it2 = std::ranges::next(jwv.begin(), 2);
+      assert(*it2 == 5);
+      assert(testEquality(it1, it2, false));
+      ++it1;
+      ++it1;
+      assert(testEquality(it1, it2, true));
+      ++it1;
+      assert(testEquality(it1, it2, false));
+    }
+
+    {
+      CIter cit1 = std::as_const(jwv).begin();
+      assert(*cit1 == 1);
+      CIter cit2 = std::ranges::next(std::as_const(jwv).begin(), 2);
+      assert(*cit2 == 5);
+      assert(testEquality(cit1, cit2, false));
+      ++cit1;
+      ++cit1;
+      assert(testEquality(cit1, cit2, true));
+      ++cit1;
+      assert(testEquality(cit1, cit2, false));
+    }
+  }
+
+  { // `Pattern` is not empty. Some elements of `V` are.
+    using Inner   = BasicVectorView<int, ViewProperties{.common = false}, EqComparableInputIter>;
+    using V       = BasicVectorView<Inner, ViewProperties{}, forward_iterator>;
+    using Pattern = BasicVectorView<int, ViewProperties{.common = false}, forward_iterator>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    using Iter  = std::ranges::iterator_t<JWV>;
+    using CIter = std::ranges::iterator_t<const JWV>;
+    static_assert(!std::same_as<Iter, CIter>);
+    static_assert(CanEq<Iter>);
+    static_assert(CanEq<CIter>);
+    static_assert(!CanEq<CIter, Iter>);
+
+    JWV jwv(V{Inner{1}, Inner{}, Inner{27}}, Pattern{0});
+
+    {
+      Iter it1 = jwv.begin();
+      assert(*it1 == 1);
+      ++it1;
+      assert(*it1 == 0);
+      Iter it2 = jwv.begin();
+      assert(testEquality(it1, it2, false));
+      ++it2;
+      assert(testEquality(it1, it2, true));
+
+      ++it2;
+      assert(*it1 == *it2);
+      assert(testEquality(it1, it2, false));
+
+      std::ranges::advance(it1, 2);
+      ++it2;
+      assert(*it1 == *it2);
+      assert(testEquality(it1, it2, true));
+    }
+
+    {
+      CIter cit1 = std::as_const(jwv).begin();
+      assert(*cit1 == 1);
+      ++cit1;
+      assert(*cit1 == 0);
+      CIter cit2 = std::as_const(jwv).begin();
+      assert(testEquality(cit1, cit2, false));
+      ++cit2;
+      assert(testEquality(cit1, cit2, true));
+
+      ++cit2;
+      assert(*cit1 == *cit2);
+      assert(testEquality(cit1, cit2, false));
+
+      std::ranges::advance(cit1, 2);
+      ++cit2;
+      assert(*cit1 == *cit2);
+      assert(testEquality(cit1, cit2, true));
+    }
+  }
+
+  { // `ref-is-glvalue` is false
+    using Inner   = std::vector<int>;
+    using V       = RvalueVector<Inner>;
+    using Pattern = std::ranges::empty_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+    static_assert(!CanEq<Iter>);
+  }
+
+  { // `Base` does not model forward range
+    using Inner   = std::vector<int>;
+    using V       = BasicVectorView<Inner, ViewProperties{}, DefaultCtorInputIter>;
+    using Pattern = std::ranges::empty_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+    static_assert(!CanEq<Iter>);
+  }
+
+  { // `InnerIter` does not model equality comparable
+    using Inner   = BasicVectorView<int, ViewProperties{.common = false}, cpp20_input_iterator>;
+    using V       = std::vector<Inner>;
+    using Pattern = std::ranges::empty_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+    using Iter    = std::ranges::iterator_t<JWV>;
+    using CIter   = std::ranges::iterator_t<const JWV>;
+    static_assert(!CanEq<Iter>);
+    static_assert(!CanEq<CIter>);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/increment.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/increment.pass.cpp
new file mode 100644
index 000000000000..af14b516b2de
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/increment.pass.cpp
@@ -0,0 +1,372 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// constexpr iterator& operator++();
+// constexpr void operator++(int);
+// constexpr iterator operator++(int)
+//   requires ref-is-glvalue && forward_iterator<OuterIter> &&
+//            forward_iterator<InnerIter>;
+
+#include <ranges>
+
+#include <array>
+#include <cassert>
+#include <type_traits>
+#include <vector>
+
+#include "../types.h"
+
+template <class I>
+concept CanPreIncrement = requires(I& i) { ++i; };
+
+template <class I>
+concept CanPostIncrement = requires(I& i) { i++; };
+
+template <bool RefIsGlvalue, class Inner>
+using VRange = std::conditional_t<RefIsGlvalue, std::vector<Inner>, RvalueVector<Inner>>;
+
+template <bool RefIsGlvalue>
+constexpr void test_pre_increment() {
+  { // `V` and `Pattern` are not empty. Test return type too.
+    using V       = VRange<RefIsGlvalue, std::array<int, 2>>;
+    using Pattern = std::array<int, 2>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    JWV jwv(V{{1, 1}, {2, 2}, {3, 3}}, Pattern{0, 0});
+
+    {
+      using Iter = std::ranges::iterator_t<JWV>;
+      static_assert(CanPreIncrement<Iter>);
+      static_assert(!CanPreIncrement<const Iter>);
+
+      auto it = jwv.begin();
+      assert(*it == 1);
+      std::same_as<Iter&> decltype(auto) it_ref = ++it;
+      if constexpr (RefIsGlvalue) {
+        assert(it_ref == it);
+      }
+
+      ++it;
+      assert(*it == 0);
+      ++it_ref;
+      ++it_ref;
+      assert(*it_ref == 2);
+      ++it;
+      ++it_ref;
+      assert(*it == 0);
+    }
+
+    if constexpr (RefIsGlvalue) {
+      using CIter = std::ranges::iterator_t<const JWV>;
+      static_assert(CanPreIncrement<CIter>);
+      static_assert(!CanPreIncrement<const CIter>);
+
+      auto cit = std::as_const(jwv).begin();
+      assert(*cit == 1);
+      std::same_as<CIter&> decltype(auto) cit_ref = ++cit;
+      assert(cit_ref == cit);
+      ++cit;
+      assert(*cit == 0);
+      ++cit_ref;
+      ++cit_ref;
+      assert(*cit_ref == 2);
+      ++cit;
+      ++cit_ref;
+      assert(*cit == 0);
+    }
+  }
+
+  { // `V` and `Pattern` are empty.
+    using V       = VRange<RefIsGlvalue, std::ranges::empty_view<int>>;
+    using Pattern = std::ranges::empty_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    JWV jwv = {};
+
+    {
+      auto it = jwv.begin();
+      assert(it == jwv.end());
+    }
+
+    if constexpr (RefIsGlvalue) {
+      auto cit = std::as_const(jwv).begin();
+      assert(cit == std::as_const(jwv).end());
+    }
+  }
+
+  { // `Pattern` is empty, `V` is not.
+    using V       = VRange<RefIsGlvalue, std::vector<int>>;
+    using Pattern = std::vector<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    JWV jwv(V{{{-1}, {-2}, {-3}}}, Pattern{});
+
+    {
+      auto it = jwv.begin();
+      assert(*it == -1);
+      ++it;
+      assert(*it == -2);
+      ++it;
+      assert(*it == -3);
+      ++it;
+      assert(it == jwv.end());
+    }
+
+    if constexpr (RefIsGlvalue) {
+      auto cit = std::as_const(jwv).begin();
+      assert(*cit == -1);
+      ++cit;
+      assert(*cit == -2);
+      ++cit;
+      assert(*cit == -3);
+      ++cit;
+      assert(cit == std::as_const(jwv).end());
+    }
+  }
+
+  { // `V` has empty subrange in the middle, `Pattern` is not empty.
+    using V       = VRange<RefIsGlvalue, std::vector<int>>;
+    using Pattern = std::ranges::single_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, Pattern>;
+
+    JWV jwv(V{{1}, {}, {3}}, Pattern{0});
+
+    {
+      auto it = jwv.begin();
+      assert(*it == 1);
+      ++it;
+      assert(*it == 0);
+      ++it;
+      assert(*it == 0);
+      ++it;
+      assert(*it == 3);
+    }
+
+    if constexpr (RefIsGlvalue) {
+      auto cit = std::as_const(jwv).begin();
+      assert(*cit == 1);
+      ++cit;
+      assert(*cit == 0);
+      ++cit;
+      assert(*cit == 0);
+      ++cit;
+      assert(*cit == 3);
+    }
+  }
+
+  { // Only last element of `V` is not empty. `Pattern` is not empty.
+    using V       = VRange<RefIsGlvalue, std::vector<int>>;
+    using Pattern = std::ranges::single_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, Pattern>;
+
+    JWV jwv(V{{}, {}, {555}}, Pattern{1});
+
+    {
+      auto it = jwv.begin();
+      assert(*it == 1);
+      ++it;
+      assert(*it == 1);
+      ++it;
+      assert(*it == 555);
+      ++it;
+      assert(it == jwv.end());
+    }
+
+    if constexpr (RefIsGlvalue) {
+      auto cit = std::as_const(jwv).begin();
+      assert(*cit == 1);
+      ++cit;
+      assert(*cit == 1);
+      ++cit;
+      assert(*cit == 555);
+      ++cit;
+      assert(cit == std::as_const(jwv).end());
+    }
+  }
+
+  { // Only first element of `V` is not empty. `Pattern` is empty.
+    using V       = VRange<RefIsGlvalue, std::vector<int>>;
+    using Pattern = std::ranges::empty_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, Pattern>;
+
+    JWV jwv(V{{777}, {}, {}}, Pattern{});
+
+    {
+      auto it = jwv.begin();
+      assert(*it == 777);
+      ++it;
+      assert(it == jwv.end());
+    }
+
+    if constexpr (RefIsGlvalue) {
+      auto cit = std::as_const(jwv).begin();
+      assert(*cit == 777);
+      ++cit;
+      assert(cit == std::as_const(jwv).end());
+    }
+  }
+
+  { // Only last element of `V` is not empty. `Pattern` is empty. `V` models input range.
+    using V       = BasicView<VRange<RefIsGlvalue, std::string>, ViewProperties{}, DefaultCtorInputIter>;
+    using Pattern = std::ranges::empty_view<char>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    JWV jwv(V{{}, {}, {'a'}}, Pattern{});
+
+    auto it = jwv.begin();
+    assert(*it == 'a');
+    ++it;
+    assert(it == jwv.end());
+  }
+
+  { // Only first element of `V` is not empty. `Pattern` is not empty. `V` models input range.
+    using V       = BasicView<VRange<RefIsGlvalue, std::string>, ViewProperties{}, DefaultCtorInputIter>;
+    using Pattern = std::ranges::single_view<char>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    JWV jwv(V{{'b'}, {}, {}}, Pattern{'.'});
+
+    auto it = jwv.begin();
+    assert(*it == 'b');
+    ++it;
+    assert(*it == '.');
+    ++it;
+    assert(*it == '.');
+    ++it;
+    assert(it == jwv.end());
+  }
+}
+
+constexpr void test_post_increment() {
+  { // `V` and `Pattern` are not empty. Return type should be `iterator`.
+    using V       = std::array<std::array<int, 3>, 2>;
+    using Pattern = std::array<int, 1>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    using Iter  = std::ranges::iterator_t<JWV>;
+    using CIter = std::ranges::iterator_t<const JWV>;
+    static_assert(CanPostIncrement<Iter>);
+    static_assert(!CanPostIncrement<const Iter>);
+    static_assert(CanPostIncrement<CIter>);
+    static_assert(!CanPostIncrement<const CIter>);
+
+    JWV jwv(V{{{6, 5, 4}, {3, 2, 1}}}, Pattern{-5});
+
+    {
+      auto it = jwv.begin();
+      assert(*it == 6);
+      std::same_as<Iter> decltype(auto) it_copy = it++;
+      assert(++it_copy == it);
+      it++;
+      it++;
+      assert(*it == -5);
+      it_copy++;
+      it_copy++;
+      assert(*it_copy == -5);
+      it++;
+      it_copy++;
+      assert(*it == 3);
+      assert(*it_copy == 3);
+    }
+
+    {
+      auto cit = std::as_const(jwv).begin();
+      assert(*cit == 6);
+      std::same_as<CIter> decltype(auto) cit_copy = cit++;
+      assert(++cit_copy == cit);
+      cit++;
+      cit++;
+      assert(*cit == -5);
+      cit_copy++;
+      cit_copy++;
+      assert(*cit_copy == -5);
+      cit++;
+      cit_copy++;
+      assert(*cit == 3);
+      assert(*cit_copy == 3);
+    }
+  }
+
+  { // `Pattern` is empty, `V` is not. Value of `ref-is-glvalue` is false (return type should be `void`).
+    using Inner   = std::vector<int>;
+    using V       = RvalueVector<Inner>;
+    using Pattern = std::ranges::empty_view<int>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    JWV jwv(V{Inner{-3}, Inner{-2}, Inner{-1}}, Pattern{});
+
+    auto it = jwv.begin();
+    assert(*it == -3);
+    it++;
+    assert(*it == -2);
+    it++;
+    assert(*it == -1);
+    it++;
+    assert(it == jwv.end());
+    static_assert(std::is_void_v<decltype(it++)>);
+  }
+
+  { // `V` has empty subrange in the middle, `Pattern` is not empty.
+    // OuterIter does not model forward iterator (return type should be `void`).
+    using Inner   = std::vector<int>;
+    using V       = BasicVectorView<Inner, ViewProperties{.common = false}, cpp20_input_iterator>;
+    using Pattern = std::ranges::single_view<int>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+
+    JWV jwv(V{Inner{7}, {}, Inner{9}}, Pattern{8});
+
+    auto it = jwv.begin();
+    assert(*it == 7);
+    it++;
+    assert(*it == 8);
+    it++;
+    assert(*it == 8);
+    it++;
+    assert(*it == 9);
+    it++;
+    assert(it == jwv.end());
+    static_assert(std::is_void_v<decltype(it++)>);
+  }
+
+#if !defined(TEST_COMPILER_GCC) // GCC c++/101777
+  { // Only first element of `V` is not empty. `Pattern` is empty. InnerIter does not model forward
+    // iterator (return type should be `void`).
+    using Inner   = BasicVectorView<char32_t, ViewProperties{.common = false}, cpp17_input_iterator>;
+    using V       = std::array<Inner, 3>;
+    using Pattern = std::ranges::empty_view<char32_t>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, Pattern>;
+
+    JWV jwv(V{Inner{U'?'}, Inner{}, Inner{}}, Pattern{});
+
+    auto it = jwv.begin();
+    assert(*it == U'?');
+    it++;
+    assert(it == jwv.end());
+    static_assert(std::is_void_v<decltype(it++)>);
+  }
+#endif // !defined(TEST_COMPILER_GCC)
+}
+
+constexpr bool test() {
+  test_pre_increment<false>();
+  test_pre_increment<true>();
+  test_post_increment();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_move.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_move.pass.cpp
new file mode 100644
index 000000000000..4a68f10b11c3
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_move.pass.cpp
@@ -0,0 +1,420 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// friend constexpr decltype(auto) iter_move(const iterator& x);
+
+#include <ranges>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include "../types.h"
+
+class MoveOnlyInt {
+public:
+  enum Status { constructed, move_constructed, moved_from_this };
+
+  MoveOnlyInt() = default;
+  constexpr MoveOnlyInt(int val) : val_(val) {}
+
+  constexpr MoveOnlyInt(MoveOnlyInt&& other) noexcept : val_(other.val_), status_(move_constructed) {
+    other.val_    = -1;
+    other.status_ = moved_from_this;
+  }
+
+  constexpr MoveOnlyInt(const MoveOnlyInt&& other) noexcept : val_(other.val_), status_(move_constructed) {
+    other.val_    = -1;
+    other.status_ = moved_from_this;
+  }
+
+  MoveOnlyInt(const MoveOnlyInt&) { assert(false); } // Should never be called in this test.
+
+  MoveOnlyInt& operator=(MoveOnlyInt&&) { // Should never be called in this test.
+    assert(false);
+    return *this;
+  }
+
+  constexpr bool was_normally_constructed() const { return status_ == constructed; }
+  constexpr bool was_move_constructed() const { return status_ == move_constructed; }
+  constexpr bool was_moved_from() const { return status_ == moved_from_this; }
+
+  friend constexpr bool operator==(const MoveOnlyInt& left, int right) { return left.val_ == right; }
+  friend constexpr bool operator==(const MoveOnlyInt& left, const MoveOnlyInt& right) {
+    return left.val_ == right.val_;
+  }
+
+private:
+  mutable int val_       = -1;
+  mutable Status status_ = constructed;
+};
+
+static_assert(std::movable<MoveOnlyInt>);
+
+struct ProxyRvalueRef {
+  MoveOnlyInt&& val;
+};
+
+class CommonProxyRvalueRef {
+public:
+  constexpr CommonProxyRvalueRef(ProxyRvalueRef i) : val_(std::move(i.val)) {}
+  constexpr CommonProxyRvalueRef(MoveOnlyInt i) : val_(std::move(i)) {}
+
+  constexpr MoveOnlyInt&& get() { return std::move(val_); }
+
+private:
+  MoveOnlyInt val_;
+};
+
+template <template <class> class TQual, template <class> class UQual>
+struct std::basic_common_reference<ProxyRvalueRef, MoveOnlyInt, TQual, UQual> {
+  using type = CommonProxyRvalueRef;
+};
+
+template <template <class> class TQual, template <class> class UQual>
+struct std::basic_common_reference<MoveOnlyInt, ProxyRvalueRef, TQual, UQual> {
+  using type = CommonProxyRvalueRef;
+};
+
+static_assert(std::common_reference_with<MoveOnlyInt&&, ProxyRvalueRef>);
+static_assert(std::common_reference_with<MoveOnlyInt&&, CommonProxyRvalueRef>);
+
+class ProxyIter {
+public:
+  using value_type      = MoveOnlyInt;
+  using difference_type = std::ptrdiff_t;
+
+  constexpr ProxyIter() : ptr_(nullptr) {}
+  constexpr explicit ProxyIter(MoveOnlyInt* it) : ptr_(std::move(it)) {}
+
+  constexpr decltype(auto) operator*() const { return *ptr_; }
+
+  constexpr ProxyIter& operator++() {
+    ++ptr_;
+    return *this;
+  }
+
+  constexpr ProxyIter operator++(int) {
+    ProxyIter copy = *this;
+    ++ptr_;
+    return copy;
+  }
+
+  constexpr ProxyIter& operator--() {
+    --ptr_;
+    return *this;
+  }
+
+  constexpr ProxyIter operator--(int) {
+    ProxyIter copy = *this;
+    --ptr_;
+    return copy;
+  }
+
+  friend bool operator==(const ProxyIter&, const ProxyIter&) = default;
+
+  friend constexpr ProxyRvalueRef iter_move(const ProxyIter iter) {
+    return ProxyRvalueRef{std::ranges::iter_move(iter.ptr_)};
+  }
+
+private:
+  MoveOnlyInt* ptr_;
+};
+
+static_assert(std::forward_iterator<ProxyIter>);
+
+template <std::forward_iterator Iter>
+class IterMoveTrackingIterator {
+public:
+  using value_type      = std::iter_value_t<Iter>;
+  using difference_type = std::iter_difference_t<Iter>;
+
+  IterMoveTrackingIterator() = default;
+  constexpr explicit IterMoveTrackingIterator(Iter iter, bool* flag = nullptr) : iter_(std::move(iter)), flag_(flag) {}
+
+  constexpr IterMoveTrackingIterator& operator++() {
+    ++iter_;
+    return *this;
+  }
+
+  constexpr IterMoveTrackingIterator operator++(int) {
+    auto tmp = *this;
+    ++*this;
+    return tmp;
+  }
+
+  constexpr decltype(auto) operator*() const { return *iter_; }
+
+  constexpr bool operator==(const IterMoveTrackingIterator& other) const { return iter_ == other.iter_; }
+
+  friend constexpr decltype(auto) iter_move(const IterMoveTrackingIterator& iter) {
+    assert(iter.flag_ != nullptr);
+    *iter.flag_ = true;
+    return std::ranges::iter_move(iter.iter_);
+  }
+
+private:
+  Iter iter_  = Iter();
+  bool* flag_ = nullptr;
+};
+
+static_assert(std::forward_iterator<IterMoveTrackingIterator<int*>> &&
+              !std::bidirectional_iterator<IterMoveTrackingIterator<int*>>);
+
+constexpr bool test() {
+  { // Test `iter_move` when result is true rvalue reference. Test return types.
+    using V       = std::array<std::array<char, 1>, 2>;
+    using Pattern = std::array<char, 1>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    JWV jwv(V{{{'0'}, {'1'}}}, Pattern{','});
+
+    {
+      auto it                                     = jwv.begin();
+      std::same_as<char&&> decltype(auto) v_rref1 = iter_move(it);
+      std::same_as<char&&> decltype(auto) v_rref2 = iter_move(std::as_const(it));
+      std::same_as<char&&> decltype(auto) v_rref3 = std::ranges::iter_move(it);
+      std::same_as<char&&> decltype(auto) v_rref4 = std::ranges::iter_move(std::as_const(it));
+      assert(std::ranges::equal(std::array{v_rref1, v_rref2, v_rref3, v_rref4}, std::views::repeat('0', 4)));
+
+      ++it; // `it` points to element of `Pattern` from here
+      std::same_as<char&&> decltype(auto) pattern_rref1 = iter_move(it);
+      std::same_as<char&&> decltype(auto) pattern_rref2 = iter_move(std::as_const(it));
+      std::same_as<char&&> decltype(auto) pattern_rref3 = std::ranges::iter_move(it);
+      std::same_as<char&&> decltype(auto) pattern_rref4 = std::ranges::iter_move(std::as_const(it));
+      assert(std::ranges::equal(
+          std::array{pattern_rref1, pattern_rref2, pattern_rref3, pattern_rref4}, std::views::repeat(',', 4)));
+    }
+
+    {
+      auto cit                                           = std::prev(std::as_const(jwv).end());
+      std::same_as<const char&&> decltype(auto) cv_rref1 = iter_move(cit);
+      std::same_as<const char&&> decltype(auto) cv_rref2 = iter_move(std::as_const(cit));
+      std::same_as<const char&&> decltype(auto) cv_rref3 = std::ranges::iter_move(cit);
+      std::same_as<const char&&> decltype(auto) cv_rref4 = std::ranges::iter_move(std::as_const(cit));
+      assert(std::ranges::equal(std::array{cv_rref1, cv_rref2, cv_rref3, cv_rref4}, std::views::repeat('1', 4)));
+
+      cit--; // `cit` points to element of `Pattern` from here
+      std::same_as<const char&&> decltype(auto) cpattern_rref1 = iter_move(cit);
+      std::same_as<const char&&> decltype(auto) cpattern_rref2 = iter_move(std::as_const(cit));
+      std::same_as<const char&&> decltype(auto) cpattern_rref3 = std::ranges::iter_move(cit);
+      std::same_as<const char&&> decltype(auto) cpattern_rref4 = std::ranges::iter_move(std::as_const(cit));
+      assert(std::ranges::equal(
+          std::array{cpattern_rref1, cpattern_rref2, cpattern_rref3, cpattern_rref4}, std::views::repeat(',', 4)));
+    }
+  }
+
+  { // Test `iter_move` when result is true rvalue reference. Test moving.
+    using Inner   = std::vector<MoveOnlyInt>;
+    using V       = std::vector<Inner>;
+    using Pattern = std::vector<MoveOnlyInt>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+
+    V v;
+    v.reserve(2);
+    v.emplace_back(std::ranges::to<Inner>(std::views::iota(0, 4)));
+    v.emplace_back(std::ranges::to<Inner>(std::views::iota(12, 16)));
+    JWV jwv(std::move(v), std::ranges::to<Pattern>(std::views::iota(4, 12)));
+    assert(std::ranges::all_of(jwv, &MoveOnlyInt::was_normally_constructed));
+
+    {
+      std::vector<MoveOnlyInt> values;
+      values.reserve(8);
+
+      auto it = jwv.begin();
+      values.emplace_back(iter_move(it));
+      ++it;
+      values.emplace_back(iter_move(std::as_const(it)));
+      it++;
+      values.emplace_back(std::ranges::iter_move(it));
+      ++it;
+      values.emplace_back(std::ranges::iter_move(std::as_const(it)));
+      it++; // `it` points to element of `Pattern` from here
+      values.emplace_back(iter_move(it));
+      ++it;
+      values.emplace_back(iter_move(std::as_const(it)));
+      it++;
+      values.emplace_back(std::ranges::iter_move(it));
+      ++it;
+      values.emplace_back(std::ranges::iter_move(std::as_const(it)));
+
+      assert(std::ranges::equal(values, std::views::iota(0, 8)));
+      assert(std::ranges::all_of(values, &MoveOnlyInt::was_move_constructed));
+    }
+
+    {
+      std::vector<MoveOnlyInt> values;
+      values.reserve(8);
+
+      auto cit = std::prev(std::as_const(jwv).end());
+      values.emplace_back(iter_move(cit));
+      cit--;
+      values.emplace_back(iter_move(std::as_const(cit)));
+      --cit;
+      values.emplace_back(std::ranges::iter_move(cit));
+      cit--;
+      values.emplace_back(std::ranges::iter_move(std::as_const(cit)));
+      --cit; // `it` points to element of `Pattern` from here
+      values.emplace_back(iter_move(cit));
+      cit--;
+      values.emplace_back(iter_move(std::as_const(cit)));
+      --cit;
+      values.emplace_back(std::ranges::iter_move(cit));
+      cit--;
+      values.emplace_back(std::ranges::iter_move(std::as_const(cit)));
+
+      assert(std::ranges::equal(std::views::reverse(values), std::views::iota(8, 16)));
+      assert(std::ranges::all_of(values, &MoveOnlyInt::was_move_constructed));
+    }
+
+    assert(std::ranges::all_of(jwv, &MoveOnlyInt::was_moved_from));
+  }
+
+  { // Test `iter_move` when result is proxy rvalue reference type, which is different from
+    // range_rvalue_reference_t<InnerRng> and range_rvalue_reference_t<Pattern>.
+    using Inner   = std::vector<MoveOnlyInt>;
+    using V       = std::vector<Inner>;
+    using Pattern = std::ranges::subrange<ProxyIter, ProxyIter>;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, Pattern>;
+
+    static_assert(!std::same_as<std::ranges::range_rvalue_reference_t<V>, std::ranges::range_rvalue_reference_t<JWV>>);
+    static_assert(
+        !std::same_as<std::ranges::range_rvalue_reference_t<Pattern>, std::ranges::range_rvalue_reference_t<JWV>>);
+    static_assert(std::same_as<CommonProxyRvalueRef, std::ranges::range_rvalue_reference_t<JWV>>);
+
+    V v;
+    v.reserve(2);
+    v.emplace_back(std::ranges::to<Inner>(std::views::iota(0, 4)));
+    v.emplace_back(std::ranges::to<Inner>(std::views::iota(12, 16)));
+
+    auto pattern = std::ranges::to<std::vector<MoveOnlyInt>>(std::views::iota(4, 12));
+    Pattern pattern_as_subrange(ProxyIter{pattern.data()}, ProxyIter{pattern.data() + pattern.size()});
+
+    JWV jwv(std::move(v), pattern_as_subrange);
+    assert(std::ranges::all_of(jwv, &MoveOnlyInt::was_normally_constructed));
+
+    {
+      std::vector<MoveOnlyInt> values;
+      values.reserve(8);
+
+      auto it                                                 = jwv.begin();
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref1 = iter_move(it);
+      values.emplace_back(rref1.get());
+      ++it;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref2 = iter_move(std::as_const(it));
+      values.emplace_back(rref2.get());
+      it++;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref3 = std::ranges::iter_move(it);
+      values.emplace_back(rref3.get());
+      ++it;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref4 = std::ranges::iter_move(std::as_const(it));
+      values.emplace_back(rref4.get());
+      it++; // `it` points to element of `Pattern` from here
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref5 = iter_move(it);
+      values.emplace_back(rref5.get());
+      ++it;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref6 = iter_move(std::as_const(it));
+      values.emplace_back(rref6.get());
+      it++;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref7 = std::ranges::iter_move(it);
+      values.emplace_back(rref7.get());
+      ++it;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref8 = std::ranges::iter_move(std::as_const(it));
+      values.emplace_back(rref8.get());
+
+      assert(std::ranges::equal(values, std::views::iota(0, 8)));
+      assert(std::ranges::all_of(values, &MoveOnlyInt::was_move_constructed));
+    }
+
+    {
+      std::vector<MoveOnlyInt> values;
+      values.reserve(8);
+
+      auto cit                                                = std::prev(std::as_const(jwv).end());
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref1 = iter_move(cit);
+      values.emplace_back(rref1.get());
+      cit--;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref2 = iter_move(std::as_const(cit));
+      values.emplace_back(rref2.get());
+      --cit;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref3 = std::ranges::iter_move(cit);
+      values.emplace_back(rref3.get());
+      cit--;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref4 = std::ranges::iter_move(std::as_const(cit));
+      values.emplace_back(rref4.get());
+      --cit; // `it` points to element of `Pattern` from here
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref5 = iter_move(cit);
+      values.emplace_back(rref5.get());
+      cit--;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref6 = iter_move(std::as_const(cit));
+      values.emplace_back(rref6.get());
+      --cit;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref7 = std::ranges::iter_move(cit);
+      values.emplace_back(rref7.get());
+      cit--;
+      std::same_as<CommonProxyRvalueRef> decltype(auto) rref8 = std::ranges::iter_move(std::as_const(cit));
+      values.emplace_back(rref8.get());
+
+      assert(std::ranges::equal(std::views::reverse(values), std::views::iota(8, 16)));
+      assert(std::ranges::all_of(values, &MoveOnlyInt::was_move_constructed));
+    }
+
+    assert(std::ranges::all_of(jwv, &MoveOnlyInt::was_moved_from));
+  }
+
+  { // Make sure `iter_move` calls underlying's iterator `iter_move` (not `std::move(*i)`).
+    using Inner               = std::vector<int>;
+    using InnerTrackingIter   = IterMoveTrackingIterator<Inner::iterator>;
+    using TrackingInner       = std::ranges::subrange<InnerTrackingIter>;
+    using Pattern             = std::array<int, 1>;
+    using PatternTrackingIter = IterMoveTrackingIterator<Pattern::iterator>;
+    using TrackingPattern     = std::ranges::subrange<PatternTrackingIter>;
+    using JWV                 = std::ranges::join_with_view<std::span<TrackingInner>, TrackingPattern>;
+
+    std::array<Inner, 2> v{{{1}, {2}}};
+    Pattern pat{-1};
+
+    bool v_moved = false;
+    std::array<TrackingInner, 2> tracking_v{
+        TrackingInner(InnerTrackingIter(v[0].begin(), &v_moved), InnerTrackingIter(v[0].end())),
+        TrackingInner(InnerTrackingIter(v[1].begin()), InnerTrackingIter(v[1].end()))};
+
+    bool pat_moved = false;
+    TrackingPattern tracking_pat(PatternTrackingIter(pat.begin(), &pat_moved), PatternTrackingIter(pat.end()));
+
+    JWV jwv(tracking_v, tracking_pat);
+    auto it = jwv.begin();
+
+    // Test calling `iter_move` when `it` points to element of `v`
+    assert(!v_moved);
+    assert(iter_move(it) == 1);
+    assert(v_moved);
+
+    // Test calling `iter_move` when `it` points to element of `pat`
+    ++it;
+    assert(!pat_moved);
+    assert(iter_move(it) == -1);
+    assert(pat_moved);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_swap.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_swap.pass.cpp
new file mode 100644
index 000000000000..738592175228
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/iter_swap.pass.cpp
@@ -0,0 +1,186 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// friend constexpr void iter_swap(const iterator& x, const iterator& y);
+
+#include <ranges>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <span>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+template <class I>
+concept CanIterSwap = requires(I i) { iter_swap(i); };
+
+enum class SwapKind { no_swap, with_same_type, with_different_type };
+enum class IterKind { inner_view, pattern };
+
+template <std::forward_iterator Iter, IterKind Kind>
+class IterSwapTrackingIterator {
+public:
+  using value_type      = std::iter_value_t<Iter>;
+  using difference_type = std::iter_difference_t<Iter>;
+
+  constexpr Iter get_iter() const { return iter_; }
+
+  constexpr SwapKind* get_flag() const { return flag_; }
+
+  IterSwapTrackingIterator() = default;
+  constexpr explicit IterSwapTrackingIterator(Iter iter, SwapKind* flag = nullptr)
+      : iter_(std::move(iter)), flag_(flag) {}
+
+  constexpr IterSwapTrackingIterator& operator++() {
+    ++iter_;
+    return *this;
+  }
+
+  constexpr IterSwapTrackingIterator operator++(int) {
+    auto tmp = *this;
+    ++*this;
+    return tmp;
+  }
+
+  constexpr decltype(auto) operator*() const { return *iter_; }
+
+  constexpr bool operator==(const IterSwapTrackingIterator& other) const { return iter_ == other.iter_; }
+
+  friend constexpr decltype(auto) iter_swap(const IterSwapTrackingIterator& lhs, const IterSwapTrackingIterator& rhs) {
+    assert(lhs.flag_ != nullptr && rhs.flag_ != nullptr);
+    *lhs.flag_ = *rhs.flag_ = SwapKind::with_same_type;
+    return std::ranges::iter_swap(lhs.iter_, rhs.iter_);
+  }
+
+  template <std::indirectly_swappable<Iter> OtherIter, IterKind OtherKind>
+  friend constexpr decltype(auto)
+  iter_swap(const IterSwapTrackingIterator& lhs, const IterSwapTrackingIterator<OtherIter, OtherKind>& rhs) {
+    assert(lhs.flag_ != nullptr && rhs.get_flag() != nullptr);
+    *lhs.flag_ = *rhs.get_flag() = SwapKind::with_different_type;
+    return std::ranges::iter_swap(lhs.iter_, rhs.get_iter());
+  }
+
+private:
+  Iter iter_      = Iter();
+  SwapKind* flag_ = nullptr;
+};
+
+static_assert(std::forward_iterator<IterSwapTrackingIterator<int*, IterKind::inner_view>> &&
+              !std::bidirectional_iterator<IterSwapTrackingIterator<int*, IterKind::inner_view>>);
+
+constexpr bool test() {
+  { // Test common usage
+    using V       = std::vector<std::string>;
+    using Pattern = std::string;
+    using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, std::ranges::owning_view<Pattern>>;
+    using namespace std::string_view_literals;
+
+    JWV jwv(V{"std", "ranges", "views", "join_with_view"}, Pattern{":: "});
+    assert(std::ranges::equal(jwv, "std:: ranges:: views:: join_with_view"sv));
+
+    auto it = jwv.begin();
+    iter_swap(it, std::ranges::next(it, 2)); // Swap elements of the same inner range.
+    assert(std::ranges::equal(jwv, "dts:: ranges:: views:: join_with_view"sv));
+
+    std::ranges::advance(it, 3);
+    iter_swap(std::as_const(it), std::ranges::next(it, 2)); // Swap elements of the pattern.
+    assert(std::ranges::equal(jwv, "dts ::ranges ::views ::join_with_view"sv));
+
+    std::ranges::advance(it, 3);
+    const auto it2 = jwv.begin();
+    iter_swap(std::as_const(it), it2); // Swap elements of different inner ranges.
+    assert(std::ranges::equal(jwv, "rts ::danges ::views ::join_with_view"sv));
+
+    std::ranges::advance(it, 6);
+    iter_swap(std::as_const(it), it2); // Swap element from inner range with element from the pattern.
+    assert(std::ranges::equal(jwv, " tsr::dangesr::viewsr::join_with_view"sv));
+
+    static_assert(std::is_void_v<decltype(iter_swap(it, it))>);
+    static_assert(std::is_void_v<decltype(iter_swap(it2, it2))>);
+    static_assert(!CanIterSwap<std::ranges::iterator_t<const JWV>>);
+    static_assert(!CanIterSwap<const std::ranges::iterator_t<const JWV>>);
+  }
+
+  { // Make sure `iter_swap` calls underlying's iterator `iter_swap` (not `ranges::swap(*i1, *i2)`).
+    using Inner               = std::vector<int>;
+    using InnerTrackingIter   = IterSwapTrackingIterator<Inner::iterator, IterKind::inner_view>;
+    using TrackingInner       = std::ranges::subrange<InnerTrackingIter>;
+    using Pattern             = std::array<int, 2>;
+    using PatternTrackingIter = IterSwapTrackingIterator<Pattern::iterator, IterKind::pattern>;
+    using TrackingPattern     = std::ranges::subrange<PatternTrackingIter>;
+    using JWV                 = std::ranges::join_with_view<std::span<TrackingInner>, TrackingPattern>;
+
+    std::array<Inner, 3> v{{{1, 2, 3}, {4, 5}}};
+    Pattern pat{-1, -2};
+
+    SwapKind v_swap_kind = SwapKind::no_swap;
+    std::array<TrackingInner, 2> tracking_v{
+        TrackingInner(InnerTrackingIter(v[0].begin(), &v_swap_kind), InnerTrackingIter(v[0].end())),
+        TrackingInner(InnerTrackingIter(v[1].begin(), &v_swap_kind), InnerTrackingIter(v[1].end()))};
+
+    SwapKind pat_swap_kind = SwapKind::no_swap;
+    TrackingPattern tracking_pat(PatternTrackingIter(pat.begin(), &pat_swap_kind), PatternTrackingIter(pat.end()));
+
+    JWV jwv(tracking_v, tracking_pat);
+    auto it1 = jwv.begin();
+    auto it2 = std::ranges::next(it1);
+
+    // Test calling `iter_swap` when both `it1` and `it2` point to elements of `v`.
+    assert(v_swap_kind == SwapKind::no_swap);
+    iter_swap(it1, it2);
+    assert(*it1 == 2 && *it2 == 1);
+    assert(v_swap_kind == SwapKind::with_same_type && pat_swap_kind == SwapKind::no_swap);
+
+    // Test calling `iter_swap` when `it1` points to element of `v` and `it2` points to element of `pat`.
+    std::ranges::advance(it2, 2);
+    v_swap_kind = SwapKind::no_swap;
+    assert(pat_swap_kind == SwapKind::no_swap);
+    iter_swap(it1, it2);
+    assert(*it1 == -1 && *it2 == 2);
+    assert(v_swap_kind == SwapKind::with_different_type && pat_swap_kind == SwapKind::with_different_type);
+
+    // Test calling `iter_swap` when `it1` and `it2` point to elements of `pat`.
+    std::ranges::advance(it1, 4);
+    v_swap_kind = pat_swap_kind = SwapKind::no_swap;
+    iter_swap(it1, it2);
+    assert(*it1 == 2 && *it2 == -2);
+    assert(v_swap_kind == SwapKind::no_swap && pat_swap_kind == SwapKind::with_same_type);
+
+    // Test calling `iter_swap` when `it1` points to element of `pat` and `it2` points to element of `v`.
+    std::ranges::advance(it2, 3);
+    v_swap_kind = pat_swap_kind = SwapKind::no_swap;
+    iter_swap(it1, it2);
+    assert(*it1 == 5 && *it2 == 2);
+    assert(v_swap_kind == SwapKind::with_different_type && pat_swap_kind == SwapKind::with_different_type);
+  }
+
+  { // InnerIter and PatternIter don't model indirectly swappable
+    using JWV = std::ranges::join_with_view<std::span<std::string>, std::string_view>;
+    static_assert(!CanIterSwap<std::ranges::iterator_t<JWV>>);
+    static_assert(!CanIterSwap<const std::ranges::iterator_t<JWV>>);
+    static_assert(!CanIterSwap<std::ranges::iterator_t<const JWV>>);
+    static_assert(!CanIterSwap<const std::ranges::iterator_t<const JWV>>);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/types.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/types.compile.pass.cpp
new file mode 100644
index 000000000000..a373e17dddbf
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.iterator/types.compile.pass.cpp
@@ -0,0 +1,456 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// using iterator_concept = see below;
+// using iterator_category = see below; // not always present
+// using value_type = see below;
+// using difference_type = see below;
+
+#include <ranges>
+
+#include <iterator>
+#include <vector>
+
+#include "../types.h"
+#include "test_iterators.h"
+
+namespace test_iterator_concept {
+template <template <class> class InnerIt>
+using InnerRange = BasicView<std::vector<int>, ViewProperties{}, InnerIt>;
+
+template <template <class> class It, template <class> class InnerIt>
+using View = BasicView<std::vector<InnerRange<InnerIt>>, ViewProperties{}, It>;
+
+template <template <class> class It, template <class> class InnerIt>
+using RvalueView = BasicView<RvalueVector<InnerRange<InnerIt>>, ViewProperties{}, It>;
+
+template <template <class> class It>
+using Pattern = BasicView<std::vector<int>, ViewProperties{}, It>;
+
+template <class V, class Pat>
+using IteratorConcept = std::ranges::iterator_t<std::ranges::join_with_view<V, Pat>>::iterator_concept;
+
+template <class V, class Pat, class Concept>
+concept IteratorConceptIs = std::same_as<IteratorConcept<V, Pat>, Concept>;
+
+// When `iterator<false>::iterator_concept` is `bidirectional_iterator_tag`
+static_assert(IteratorConceptIs<View<bidirectional_iterator, bidirectional_iterator>,
+                                Pattern<bidirectional_iterator>,
+                                std::bidirectional_iterator_tag>);
+
+// When `iterator<false>::iterator_concept` is `forward_iterator_tag`
+static_assert(IteratorConceptIs<View<forward_iterator, bidirectional_iterator>,
+                                Pattern<bidirectional_iterator>,
+                                std::forward_iterator_tag>);
+static_assert(IteratorConceptIs<View<bidirectional_iterator, forward_iterator>,
+                                Pattern<bidirectional_iterator>,
+                                std::forward_iterator_tag>);
+static_assert(IteratorConceptIs<View<bidirectional_iterator, bidirectional_iterator>,
+                                Pattern<forward_iterator>,
+                                std::forward_iterator_tag>);
+static_assert(IteratorConceptIs<View<forward_iterator, forward_iterator>,
+                                Pattern<bidirectional_iterator>,
+                                std::forward_iterator_tag>);
+static_assert(IteratorConceptIs<View<forward_iterator, bidirectional_iterator>,
+                                Pattern<forward_iterator>,
+                                std::forward_iterator_tag>);
+static_assert(IteratorConceptIs<View<bidirectional_iterator, forward_iterator>,
+                                Pattern<forward_iterator>,
+                                std::forward_iterator_tag>);
+static_assert(IteratorConceptIs<View<forward_iterator, forward_iterator>, //
+                                Pattern<forward_iterator>,
+                                std::forward_iterator_tag>);
+
+// When `iterator<false>::iterator_concept` is `input_iterator_tag`
+static_assert(IteratorConceptIs<View<DefaultCtorInputIter, forward_iterator>,
+                                Pattern<forward_iterator>,
+                                std::input_iterator_tag>);
+static_assert(IteratorConceptIs<View<forward_iterator, DefaultCtorInputIter>,
+                                Pattern<forward_iterator>,
+                                std::input_iterator_tag>);
+static_assert(IteratorConceptIs<View<DefaultCtorInputIter, DefaultCtorInputIter>,
+                                Pattern<forward_iterator>,
+                                std::input_iterator_tag>);
+static_assert(IteratorConceptIs<RvalueView<bidirectional_iterator, bidirectional_iterator>,
+                                Pattern<bidirectional_iterator>,
+                                std::input_iterator_tag>);
+static_assert(IteratorConceptIs<RvalueView<forward_iterator, forward_iterator>,
+                                Pattern<forward_iterator>,
+                                std::input_iterator_tag>);
+
+template <class V, class Pat>
+using ConstIteratorConcept = std::ranges::iterator_t<const std::ranges::join_with_view<V, Pat>>::iterator_concept;
+
+template <class V, class Pat, class Concept>
+concept ConstIteratorConceptIs = std::same_as<ConstIteratorConcept<V, Pat>, Concept>;
+
+// When `iterator<true>::iterator_concept` is `bidirectional_iterator_tag`
+static_assert(ConstIteratorConceptIs<View<bidirectional_iterator, bidirectional_iterator>,
+                                     Pattern<bidirectional_iterator>,
+                                     std::bidirectional_iterator_tag>);
+
+// When `iterator<true>::iterator_concept` is `forward_iterator_tag`
+static_assert(ConstIteratorConceptIs<View<forward_iterator, bidirectional_iterator>,
+                                     Pattern<bidirectional_iterator>,
+                                     std::forward_iterator_tag>);
+static_assert(ConstIteratorConceptIs<View<bidirectional_iterator, forward_iterator>,
+                                     Pattern<bidirectional_iterator>,
+                                     std::forward_iterator_tag>);
+static_assert(ConstIteratorConceptIs<View<bidirectional_iterator, bidirectional_iterator>,
+                                     Pattern<forward_iterator>,
+                                     std::forward_iterator_tag>);
+static_assert(ConstIteratorConceptIs<View<forward_iterator, forward_iterator>,
+                                     Pattern<bidirectional_iterator>,
+                                     std::forward_iterator_tag>);
+static_assert(ConstIteratorConceptIs<View<forward_iterator, bidirectional_iterator>,
+                                     Pattern<forward_iterator>,
+                                     std::forward_iterator_tag>);
+static_assert(ConstIteratorConceptIs<View<bidirectional_iterator, forward_iterator>,
+                                     Pattern<forward_iterator>,
+                                     std::forward_iterator_tag>);
+static_assert(ConstIteratorConceptIs<View<forward_iterator, forward_iterator>,
+                                     Pattern<forward_iterator>,
+                                     std::forward_iterator_tag>);
+
+// `iterator<true>::iterator_concept` cannot be `input_iterator_tag`
+} // namespace test_iterator_concept
+
+namespace test_iterator_category {
+template <template <class> class InnerIt>
+using InnerRange = BasicView<std::vector<float>, ViewProperties{}, InnerIt>;
+
+template <bool Common, template <class> class InnerIt>
+using MaybeCommonInnerRange = BasicView<std::vector<float>, ViewProperties{.common = Common}, InnerIt>;
+
+template <template <class> class It, template <class> class InnerIt>
+using View = BasicView<std::vector<InnerRange<InnerIt>>, ViewProperties{}, It>;
+
+template <template <class> class It, template <class> class InnerIt>
+using RvalueView = BasicView<RvalueVector<InnerRange<InnerIt>>, ViewProperties{}, It>;
+
+template <bool Common, template <class> class It, bool CommonInner, template <class> class InnerIt>
+using MaybeCommonView =
+    BasicView<std::vector<MaybeCommonInnerRange<CommonInner, InnerIt>>, ViewProperties{.common = Common}, It>;
+
+template <template <class> class It>
+using Pattern = BasicView<std::vector<float>, ViewProperties{}, It>;
+
+template <template <class> class It>
+using RvaluePattern = BasicView<RvalueVector<float>, ViewProperties{}, It>;
+
+template <bool Common, template <class> class It>
+using MaybeCommonPattern = BasicView<std::vector<float>, ViewProperties{.common = Common}, It>;
+
+template <class V, class Pattern>
+using IteratorCategory = std::ranges::iterator_t<std::ranges::join_with_view<V, Pattern>>::iterator_category;
+
+template <class V, class Pattern>
+concept HasIteratorCategory = requires { typename IteratorCategory<V, Pattern>; };
+
+template <class V, class Pat, class Category>
+concept IteratorCategoryIs = std::same_as<IteratorCategory<V, Pat>, Category>;
+
+// When `iterator<false>::iterator_category` is not defined
+static_assert(!HasIteratorCategory<View<cpp20_input_iterator, forward_iterator>, Pattern<forward_iterator>>);
+static_assert(!HasIteratorCategory<View<forward_iterator, cpp20_input_iterator>, Pattern<forward_iterator>>);
+static_assert(!HasIteratorCategory<View<forward_iterator, forward_iterator>, Pattern<cpp20_input_iterator>>);
+static_assert(!HasIteratorCategory<RvalueView<forward_iterator, forward_iterator>, Pattern<forward_iterator>>);
+static_assert(HasIteratorCategory<View<forward_iterator, forward_iterator>, Pattern<forward_iterator>>);
+
+// When
+//   is_reference_v<common_reference_t<iter_reference_t<InnerIter>,
+//                                     iter_reference_t<PatternIter>>>
+// has different values for `iterator<false>`
+static_assert(IteratorCategoryIs<View<forward_iterator, forward_iterator>,
+                                 RvaluePattern<forward_iterator>,
+                                 std::input_iterator_tag>);
+
+// When `iterator<false>::iterator_category` is `bidirectional_iterator_tag`
+static_assert(IteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, true, bidirectional_iterator>,
+                                 MaybeCommonPattern<true, bidirectional_iterator>,
+                                 std::bidirectional_iterator_tag>);
+static_assert(IteratorCategoryIs<MaybeCommonView<false, bidirectional_iterator, true, bidirectional_iterator>,
+                                 MaybeCommonPattern<true, bidirectional_iterator>,
+                                 std::bidirectional_iterator_tag>);
+
+// When `iterator<false>::iterator_category` is `forward_iterator_tag`
+static_assert(IteratorCategoryIs<MaybeCommonView<true, forward_iterator, true, bidirectional_iterator>,
+                                 MaybeCommonPattern<true, bidirectional_iterator>,
+                                 std::forward_iterator_tag>);
+static_assert(IteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, true, forward_iterator>,
+                                 MaybeCommonPattern<true, bidirectional_iterator>,
+                                 std::forward_iterator_tag>);
+static_assert(IteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, true, bidirectional_iterator>,
+                                 MaybeCommonPattern<true, forward_iterator>,
+                                 std::forward_iterator_tag>);
+static_assert(IteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, false, bidirectional_iterator>,
+                                 MaybeCommonPattern<true, bidirectional_iterator>,
+                                 std::forward_iterator_tag>);
+static_assert(IteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, true, bidirectional_iterator>,
+                                 MaybeCommonPattern<false, bidirectional_iterator>,
+                                 std::forward_iterator_tag>);
+static_assert(IteratorCategoryIs<MaybeCommonView<false, forward_iterator, false, forward_iterator>,
+                                 MaybeCommonPattern<false, forward_iterator>,
+                                 std::forward_iterator_tag>);
+
+// When `iterator<false>::iterator_category` is `input_iterator_tag`
+static_assert(IteratorCategoryIs<View<ForwardIteratorWithInputCategory, forward_iterator>,
+                                 Pattern<forward_iterator>,
+                                 std::input_iterator_tag>);
+static_assert(IteratorCategoryIs<View<forward_iterator, ForwardIteratorWithInputCategory>,
+                                 Pattern<forward_iterator>,
+                                 std::input_iterator_tag>);
+static_assert(IteratorCategoryIs<View<forward_iterator, forward_iterator>,
+                                 Pattern<ForwardIteratorWithInputCategory>,
+                                 std::input_iterator_tag>);
+static_assert(IteratorCategoryIs<View<ForwardIteratorWithInputCategory, ForwardIteratorWithInputCategory>,
+                                 Pattern<ForwardIteratorWithInputCategory>,
+                                 std::input_iterator_tag>);
+
+template <class V, class Pattern>
+using ConstIteratorCategory = std::ranges::iterator_t<const std::ranges::join_with_view<V, Pattern>>::iterator_category;
+
+template <class V, class Pattern>
+concept HasConstIteratorCategory = requires { typename ConstIteratorCategory<V, Pattern>; };
+
+template <class V, class Pat, class Category>
+concept ConstIteratorCategoryIs = std::same_as<ConstIteratorCategory<V, Pat>, Category>;
+
+// `iterator<true>::iterator_category` is not defined in those
+// cases because `join_with_view<V, Pattern>` cannot const-accessed
+static_assert(!HasConstIteratorCategory<View<cpp20_input_iterator, forward_iterator>, Pattern<forward_iterator>>);
+static_assert(!HasConstIteratorCategory<View<forward_iterator, cpp20_input_iterator>, Pattern<forward_iterator>>);
+static_assert(!HasConstIteratorCategory<View<forward_iterator, forward_iterator>, Pattern<cpp20_input_iterator>>);
+static_assert(!HasConstIteratorCategory<RvalueView<forward_iterator, forward_iterator>, Pattern<forward_iterator>>);
+static_assert(HasConstIteratorCategory<View<forward_iterator, forward_iterator>, Pattern<forward_iterator>>);
+
+// When
+//   is_reference_v<common_reference_t<iter_reference_t<InnerIter>,
+//                                     iter_reference_t<PatternIter>>>
+// has different values for `iterator<true>`
+static_assert(ConstIteratorCategoryIs<View<forward_iterator, forward_iterator>,
+                                      RvaluePattern<forward_iterator>,
+                                      std::input_iterator_tag>);
+
+// When `iterator<true>::iterator_category` is `bidirectional_iterator_tag`
+static_assert(ConstIteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, true, bidirectional_iterator>,
+                                      MaybeCommonPattern<true, bidirectional_iterator>,
+                                      std::bidirectional_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<MaybeCommonView<false, bidirectional_iterator, true, bidirectional_iterator>,
+                                      MaybeCommonPattern<true, bidirectional_iterator>,
+                                      std::bidirectional_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<
+              BasicVectorView<
+                  BasicVectorView<float, ViewProperties{.common = true}, forward_iterator, bidirectional_iterator>,
+                  ViewProperties{.common = true},
+                  forward_iterator,
+                  bidirectional_iterator>,
+              BasicVectorView<float, ViewProperties{.common = true}, forward_iterator, bidirectional_iterator>,
+              std::bidirectional_iterator_tag>);
+
+// When `iterator<true>::iterator_category` is `forward_iterator_tag`
+static_assert(ConstIteratorCategoryIs<MaybeCommonView<true, forward_iterator, true, bidirectional_iterator>,
+                                      MaybeCommonPattern<true, bidirectional_iterator>,
+                                      std::forward_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, true, forward_iterator>,
+                                      MaybeCommonPattern<true, bidirectional_iterator>,
+                                      std::forward_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, true, bidirectional_iterator>,
+                                      MaybeCommonPattern<true, forward_iterator>,
+                                      std::forward_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, false, bidirectional_iterator>,
+                                      MaybeCommonPattern<true, bidirectional_iterator>,
+                                      std::forward_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<MaybeCommonView<true, bidirectional_iterator, true, bidirectional_iterator>,
+                                      MaybeCommonPattern<false, bidirectional_iterator>,
+                                      std::forward_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<MaybeCommonView<false, forward_iterator, false, forward_iterator>,
+                                      MaybeCommonPattern<false, forward_iterator>,
+                                      std::forward_iterator_tag>);
+static_assert(
+    ConstIteratorCategoryIs<
+        BasicVectorView<BasicVectorView<float, ViewProperties{}, ForwardIteratorWithInputCategory, forward_iterator>,
+                        ViewProperties{},
+                        ForwardIteratorWithInputCategory,
+                        forward_iterator>,
+        BasicVectorView<float, ViewProperties{}, ForwardIteratorWithInputCategory, forward_iterator>,
+        std::forward_iterator_tag>);
+
+// When `iterator<true>::iterator_category` is `input_iterator_tag`
+static_assert(ConstIteratorCategoryIs<View<ForwardIteratorWithInputCategory, forward_iterator>,
+                                      Pattern<forward_iterator>,
+                                      std::input_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<View<forward_iterator, ForwardIteratorWithInputCategory>,
+                                      Pattern<forward_iterator>,
+                                      std::input_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<View<forward_iterator, forward_iterator>,
+                                      Pattern<ForwardIteratorWithInputCategory>,
+                                      std::input_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<View<ForwardIteratorWithInputCategory, ForwardIteratorWithInputCategory>,
+                                      Pattern<ForwardIteratorWithInputCategory>,
+                                      std::input_iterator_tag>);
+static_assert(ConstIteratorCategoryIs<
+              BasicVectorView<
+                  BasicVectorView<float, ViewProperties{}, DefaultCtorInputIter, ForwardIteratorWithInputCategory>,
+                  ViewProperties{},
+                  DefaultCtorInputIter,
+                  ForwardIteratorWithInputCategory>,
+              BasicVectorView<float, ViewProperties{}, ForwardIteratorWithInputCategory>,
+              std::input_iterator_tag>);
+} // namespace test_iterator_category
+
+namespace test_value_type {
+template <class ValueType, class ConstValueType = ValueType>
+struct View : std::ranges::view_base {
+  struct InnerRange : std::ranges::view_base {
+    ValueType* begin();
+    ValueType* end();
+    ConstValueType* begin() const;
+    ConstValueType* end() const;
+  };
+
+  InnerRange* begin();
+  InnerRange* end();
+  const InnerRange* begin() const;
+  const InnerRange* end() const;
+};
+
+template <class ValueType, class ConstValueType = ValueType>
+using Pattern = View<ValueType, ConstValueType>::InnerRange;
+
+template <class V, class Pat>
+using IteratorValueType = std::ranges::iterator_t<std::ranges::join_with_view<V, Pat>>::value_type;
+
+template <class V, class Pat, class ValueType>
+concept IteratorValueTypeIs = std::same_as<IteratorValueType<V, Pat>, ValueType>;
+
+// Test that `iterator<false>::value_type` is equal to
+//   common_type_t<iter_value_t<InnerIter>, iter_value_t<PatternIter>>
+static_assert(IteratorValueTypeIs<View<int>, Pattern<int>, int>);
+static_assert(IteratorValueTypeIs<View<int>, Pattern<long>, long>);
+static_assert(IteratorValueTypeIs<View<long>, Pattern<int>, long>);
+static_assert(IteratorValueTypeIs<View<std::nullptr_t>, Pattern<void*>, void*>);
+static_assert(IteratorValueTypeIs<View<std::tuple<long, int>>, Pattern<std::tuple<int, long>>, std::tuple<long, long>>);
+
+template <class V, class Pat>
+using ConstIteratorValueType = std::ranges::iterator_t<const std::ranges::join_with_view<V, Pat>>::value_type;
+
+template <class V, class Pat, class ValueType>
+concept ConstIteratorValueTypeIs = std::same_as<ConstIteratorValueType<V, Pat>, ValueType>;
+
+// Test that `iterator<true>::value_type` is equal to
+//   common_type_t<iter_value_t<InnerIter>, iter_value_t<PatternIter>>
+static_assert(ConstIteratorValueTypeIs<View<int>, Pattern<int>, int>);
+static_assert(ConstIteratorValueTypeIs<View<int>, Pattern<long>, long>);
+static_assert(ConstIteratorValueTypeIs<View<long>, Pattern<int>, long>);
+static_assert(ConstIteratorValueTypeIs<View<std::nullptr_t>, Pattern<void*>, void*>);
+static_assert(
+    ConstIteratorValueTypeIs<View<std::tuple<long, int>>, Pattern<std::tuple<int, long>>, std::tuple<long, long>>);
+
+// Test value types of non-simple const ranges
+static_assert(ConstIteratorValueTypeIs<View<short, int>, Pattern<short, int>, int>);
+static_assert(ConstIteratorValueTypeIs<View<short, int>, Pattern<int, long>, long>);
+static_assert(ConstIteratorValueTypeIs<View<int, long>, Pattern<short, int>, long>);
+static_assert(ConstIteratorValueTypeIs<View<int, std::nullptr_t>, Pattern<int, void*>, void*>);
+static_assert(ConstIteratorValueTypeIs<View<std::tuple<long, int>, std::pair<long, int>>,
+                                       Pattern<std::tuple<int, long>, std::pair<int, long>>,
+                                       std::pair<long, long>>);
+} // namespace test_value_type
+
+namespace test_difference_type {
+template <class DifferenceType, class ValueType>
+struct Iter {
+  using value_type      = std::remove_const_t<ValueType>;
+  using difference_type = DifferenceType;
+
+  ValueType& operator*() const;
+  Iter& operator++();
+  Iter operator++(int);
+  bool operator==(const Iter&) const;
+};
+
+static_assert(std::forward_iterator<Iter<int, void*>>);
+
+template <class DifferenceType,
+          class InnerDifferenceType,
+          class ConstDifferenceType      = DifferenceType,
+          class InnerConstDifferenceType = InnerDifferenceType>
+struct View : std::ranges::view_base {
+  struct InnerRange : std::ranges::view_base {
+    Iter<InnerDifferenceType, float> begin();
+    Iter<InnerDifferenceType, float> end();
+    Iter<InnerConstDifferenceType, double> begin() const;
+    Iter<InnerConstDifferenceType, double> end() const;
+  };
+
+  Iter<DifferenceType, InnerRange> begin();
+  Iter<DifferenceType, InnerRange> end();
+  Iter<ConstDifferenceType, const InnerRange> begin() const;
+  Iter<ConstDifferenceType, const InnerRange> end() const;
+};
+
+template <class DifferenceType, class ConstDifferenceType = DifferenceType>
+struct Pattern : std::ranges::view_base {
+  Iter<DifferenceType, float> begin();
+  Iter<DifferenceType, float> end();
+  Iter<ConstDifferenceType, double> begin() const;
+  Iter<ConstDifferenceType, double> end() const;
+};
+
+template <class V, class Pat>
+using IteratorDifferenceType = std::ranges::iterator_t<std::ranges::join_with_view<V, Pat>>::difference_type;
+
+template <class V, class Pat, class DifferenceType>
+concept IteratorDifferenceTypeIs = std::same_as<IteratorDifferenceType<V, Pat>, DifferenceType>;
+
+// Test that `iterator<false>::difference_type` is equal to
+//   common_type_t<
+//       iter_difference_t<OuterIter>,
+//       iter_difference_t<InnerIter>,
+//       iter_difference_t<PatternIter>>
+static_assert(IteratorDifferenceTypeIs<View<int, int>, Pattern<int>, int>);
+static_assert(IteratorDifferenceTypeIs<View<signed char, signed char>, Pattern<signed char>, signed char>);
+static_assert(IteratorDifferenceTypeIs<View<short, short>, Pattern<short>, short>);
+static_assert(IteratorDifferenceTypeIs<View<signed char, short>, Pattern<short>, int>);
+static_assert(IteratorDifferenceTypeIs<View<signed char, short>, Pattern<int>, int>);
+static_assert(IteratorDifferenceTypeIs<View<long long, long>, Pattern<int>, long long>);
+static_assert(IteratorDifferenceTypeIs<View<long, long long>, Pattern<int>, long long>);
+
+template <class V, class Pat>
+using ConstIteratorDifferenceType = std::ranges::iterator_t<const std::ranges::join_with_view<V, Pat>>::difference_type;
+
+template <class V, class Pat, class DifferenceType>
+concept ConstIteratorDifferenceTypeIs = std::same_as<ConstIteratorDifferenceType<V, Pat>, DifferenceType>;
+
+// Test that `iterator<true>::difference_type` is equal to
+//   common_type_t<
+//       iter_difference_t<OuterIter>,
+//       iter_difference_t<InnerIter>,
+//       iter_difference_t<PatternIter>>
+static_assert(ConstIteratorDifferenceTypeIs<View<int, int>, Pattern<int>, int>);
+static_assert(ConstIteratorDifferenceTypeIs<View<signed char, signed char>, Pattern<signed char>, signed char>);
+static_assert(ConstIteratorDifferenceTypeIs<View<short, short>, Pattern<short>, short>);
+static_assert(ConstIteratorDifferenceTypeIs<View<signed char, short>, Pattern<short>, int>);
+static_assert(ConstIteratorDifferenceTypeIs<View<signed char, short>, Pattern<int>, int>);
+static_assert(ConstIteratorDifferenceTypeIs<View<long long, long>, Pattern<int>, long long>);
+static_assert(ConstIteratorDifferenceTypeIs<View<long, long long>, Pattern<int>, long long>);
+
+// Test difference types of non-simple const ranges
+static_assert(ConstIteratorDifferenceTypeIs<View<short, short, int, int>, Pattern<short, int>, int>);
+static_assert(
+    ConstIteratorDifferenceTypeIs<View<int, short, signed char, signed char>, Pattern<long, signed char>, signed char>);
+static_assert(ConstIteratorDifferenceTypeIs<View<long, long long, signed char, short>, Pattern<long, short>, int>);
+static_assert(ConstIteratorDifferenceTypeIs<View<short, short, long long, long>, Pattern<short, int>, long long>);
+static_assert(ConstIteratorDifferenceTypeIs<View<signed char, signed char, long, long long>,
+                                            Pattern<signed char, int>,
+                                            long long>);
+} // namespace test_difference_type
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.overview/adaptor.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.overview/adaptor.pass.cpp
new file mode 100644
index 000000000000..da53bfaaa5a5
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.overview/adaptor.pass.cpp
@@ -0,0 +1,360 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// std::views::join_with_view
+
+#include <ranges>
+
+#include <memory>
+#include <span>
+#include <string_view>
+#include <utility>
+
+#include "test_iterators.h"
+
+template <class View, class T>
+concept CanBePiped = requires(View&& view, T&& t) {
+  { std::forward<View>(view) | std::forward<T>(t) };
+};
+
+struct Range : std::ranges::view_base {
+  using Iterator = forward_iterator<std::string_view*>;
+  using Sentinel = sentinel_wrapper<Iterator>;
+  constexpr explicit Range(std::string_view* b, std::string_view* e) : begin_(b), end_(e) {}
+  constexpr Iterator begin() const { return Iterator(begin_); }
+  constexpr Sentinel end() const { return Sentinel(Iterator(end_)); }
+
+private:
+  std::string_view* begin_;
+  std::string_view* end_;
+};
+
+struct Pattern : std::ranges::view_base {
+  using Iterator = forward_iterator<const char*>;
+  using Sentinel = sentinel_wrapper<Iterator>;
+  static constexpr std::string_view pat{", "};
+
+  constexpr Pattern() = default;
+  constexpr Iterator begin() const { return Iterator(pat.data()); }
+  constexpr Sentinel end() const { return Sentinel(Iterator(pat.data() + pat.size())); }
+};
+
+struct NonCopyablePattern : Pattern {
+  NonCopyablePattern(const NonCopyablePattern&) = delete;
+};
+
+template <typename View>
+constexpr void compareViews(View v, std::string_view list) {
+  auto b1 = v.begin();
+  auto e1 = v.end();
+  auto b2 = list.begin();
+  auto e2 = list.end();
+  for (; b1 != e1 && b2 != e2; ++b1, ++b2) {
+    assert(*b1 == *b2);
+  }
+  assert(b1 == e1);
+  assert(b2 == e2);
+}
+
+constexpr int absoluteValue(int x) { return x < 0 ? -x : x; }
+
+template <class T>
+constexpr const T&& asConstRvalue(T&& t) {
+  return static_cast<const T&&>(t);
+}
+
+constexpr void test_adaptor_with_pattern(std::span<std::string_view> buff) {
+  // Test `views::join_with(pattern)(v)`
+  {
+    using Result = std::ranges::join_with_view<Range, Pattern>;
+    const Range range(buff.data(), buff.data() + buff.size());
+    Pattern pattern;
+
+    {
+      // 'views::join_with(pattern)' - &&
+      std::same_as<Result> decltype(auto) result = std::views::join_with(pattern)(range);
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+    {
+      // 'views::join_with(pattern)' - const&&
+      std::same_as<Result> decltype(auto) result = asConstRvalue(std::views::join_with(pattern))(range);
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+    {
+      // 'views::join_with(pattern)' - &
+      auto partial                               = std::views::join_with(pattern);
+      std::same_as<Result> decltype(auto) result = partial(range);
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+    {
+      // 'views::join_with(pattern)' - const&
+      auto const partial                         = std::views::join_with(pattern);
+      std::same_as<Result> decltype(auto) result = partial(range);
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+  }
+
+  // Test `v | views::join_with(pattern)`
+  {
+    using Result = std::ranges::join_with_view<Range, Pattern>;
+    const Range range(buff.data(), buff.data() + buff.size());
+    Pattern pattern;
+
+    {
+      // 'views::join_with(pattern)' - &&
+      std::same_as<Result> decltype(auto) result = range | std::views::join_with(pattern);
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+    {
+      // 'views::join_with(pattern)' - const&&
+      std::same_as<Result> decltype(auto) result = range | asConstRvalue(std::views::join_with(pattern));
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+    {
+      // 'views::join_with(pattern)' - &
+      auto partial                               = std::views::join_with(pattern);
+      std::same_as<Result> decltype(auto) result = range | partial;
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+    {
+      // 'views::join_with(pattern)' - const&
+      auto const partial                         = std::views::join_with(pattern);
+      std::same_as<Result> decltype(auto) result = range | partial;
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+  }
+
+  // Test `views::join_with(v, pattern)` range adaptor object
+  {
+    using Result = std::ranges::join_with_view<Range, Pattern>;
+    const Range range(buff.data(), buff.data() + buff.size());
+    Pattern pattern;
+
+    {
+      // 'views::join_with' - &&
+      auto range_adaptor                         = std::views::join_with;
+      std::same_as<Result> decltype(auto) result = std::move(range_adaptor)(range, pattern);
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+    {
+      // 'views::join_with' - const&&
+      const auto range_adaptor                   = std::views::join_with;
+      std::same_as<Result> decltype(auto) result = std::move(range_adaptor)(range, pattern);
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+    {
+      // 'views::join_with' - &
+      auto range_adaptor                         = std::views::join_with;
+      std::same_as<Result> decltype(auto) result = range_adaptor(range, pattern);
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+    {
+      // 'views::join_with' - const&
+      const auto range_adaptor                   = std::views::join_with;
+      std::same_as<Result> decltype(auto) result = range_adaptor(range, pattern);
+      compareViews(result, "abcd, ef, ghij, kl");
+    }
+  }
+
+  // Test `adaptor | views::join_with(pattern)`
+  {
+    auto pred    = [](std::string_view s) { return s.size() >= 3; };
+    using Result = std::ranges::join_with_view<std::ranges::filter_view<Range, decltype(pred)>, Pattern>;
+    const Range range(buff.data(), buff.data() + buff.size());
+    Pattern pattern;
+
+    {
+      std::same_as<Result> decltype(auto) result = range | std::views::filter(pred) | std::views::join_with(pattern);
+      compareViews(result, "abcd, ghij");
+    }
+    {
+      const auto partial                         = std::views::filter(pred) | std::views::join_with(pattern);
+      std::same_as<Result> decltype(auto) result = range | partial;
+      compareViews(result, "abcd, ghij");
+    }
+  }
+}
+
+constexpr void test_adaptor_with_single_element(std::span<std::string_view> buff) {
+  // Test `views::join_with(element)(v)`
+  {
+    using Result = std::ranges::join_with_view<Range, std::ranges::single_view<char>>;
+    const Range range(buff.data(), buff.data() + buff.size());
+    const char element = '.';
+
+    {
+      // 'views::join_with(element)' - &&
+      std::same_as<Result> decltype(auto) result = std::views::join_with(element)(range);
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+    {
+      // 'views::join_with(element)' - const&&
+      std::same_as<Result> decltype(auto) result = asConstRvalue(std::views::join_with(element))(range);
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+    {
+      // 'views::join_with(element)' - &
+      auto partial                               = std::views::join_with(element);
+      std::same_as<Result> decltype(auto) result = partial(range);
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+    {
+      // 'views::join_with(element)' - const&
+      const auto partial                         = std::views::join_with(element);
+      std::same_as<Result> decltype(auto) result = partial(range);
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+  }
+
+  // Test `v | views::join_with(element)`
+  {
+    using Result = std::ranges::join_with_view<Range, std::ranges::single_view<char>>;
+    const Range range(buff.data(), buff.data() + buff.size());
+    const char element = '.';
+
+    {
+      // 'views::join_with(element)' - &&
+      std::same_as<Result> decltype(auto) result = range | std::views::join_with(element);
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+    {
+      // 'views::join_with(element)' - const&&
+      std::same_as<Result> decltype(auto) result = range | asConstRvalue(std::views::join_with(element));
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+    {
+      // 'views::join_with(element)' - &
+      auto partial                               = std::views::join_with(element);
+      std::same_as<Result> decltype(auto) result = range | partial;
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+    {
+      // 'views::join_with(element)' - const&
+      const auto partial                         = std::views::join_with(element);
+      std::same_as<Result> decltype(auto) result = range | partial;
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+  }
+
+  // Test `views::join_with(v, element)` range adaptor object
+  {
+    using Result = std::ranges::join_with_view<Range, std::ranges::single_view<char>>;
+    const Range range(buff.data(), buff.data() + buff.size());
+    const char element = '.';
+
+    {
+      // 'views::join_with' - &&
+      auto range_adaptor                         = std::views::join_with;
+      std::same_as<Result> decltype(auto) result = std::move(range_adaptor)(range, element);
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+    {
+      // 'views::join_with' - const&&
+      const auto range_adaptor                   = std::views::join_with;
+      std::same_as<Result> decltype(auto) result = std::move(range_adaptor)(range, element);
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+    {
+      // 'views::join_with' - &
+      auto range_adaptor                         = std::views::join_with;
+      std::same_as<Result> decltype(auto) result = range_adaptor(range, element);
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+    {
+      // 'views::join_with' - const&
+      const auto range_adaptor                   = std::views::join_with;
+      std::same_as<Result> decltype(auto) result = range_adaptor(range, element);
+      compareViews(result, "abcd.ef.ghij.kl");
+    }
+  }
+
+  // Test `adaptor | views::join_with(element)`
+  {
+    auto pred = [](std::string_view s) { return s.size() >= 3; };
+    using Result =
+        std::ranges::join_with_view<std::ranges::filter_view<Range, decltype(pred)>, std::ranges::single_view<char>>;
+    const Range range(buff.data(), buff.data() + buff.size());
+    const char element = '.';
+
+    {
+      std::same_as<Result> decltype(auto) result = range | std::views::filter(pred) | std::views::join_with(element);
+      compareViews(result, "abcd.ghij");
+    }
+    {
+      const auto partial                         = std::views::filter(pred) | std::views::join_with(element);
+      std::same_as<Result> decltype(auto) result = range | partial;
+      compareViews(result, "abcd.ghij");
+    }
+  }
+}
+
+constexpr bool test() {
+  std::string_view buff[] = {"abcd", "ef", "ghij", "kl"};
+
+  // Test range adaptor object
+  {
+    using RangeAdaptorObject = decltype(std::views::join_with);
+    static_assert(std::is_const_v<RangeAdaptorObject>);
+
+    // The type of a customization point object, ignoring cv-qualifiers, shall model semiregular
+    static_assert(std::semiregular<std::remove_const<RangeAdaptorObject>>);
+  }
+
+  test_adaptor_with_pattern(buff);
+  test_adaptor_with_single_element(buff);
+
+  // Test that one can call std::views::join_with with arbitrary stuff, as long as we
+  // don't try to actually complete the call by passing it a range.
+  //
+  // That makes no sense and we can't do anything with the result, but it's valid.
+  {
+    long array[3]                 = {1, 2, 3};
+    [[maybe_unused]] auto partial = std::views::join_with(std::move(array));
+  }
+
+  // Test SFINAE friendliness
+  {
+    struct NotAView {};
+
+    static_assert(!CanBePiped<Range, decltype(std::views::join_with)>);
+    static_assert(CanBePiped<Range, decltype(std::views::join_with(Pattern{}))>);
+    static_assert(CanBePiped<Range, decltype(std::views::join_with('.'))>);
+    static_assert(!CanBePiped<NotAView, decltype(std::views::join_with(Pattern{}))>);
+    static_assert(!CanBePiped<NotAView, decltype(std::views::join_with('.'))>);
+    static_assert(!CanBePiped<std::initializer_list<char>, decltype(std::views::join_with(Pattern{}))>);
+    static_assert(!CanBePiped<std::initializer_list<char>, decltype(std::views::join_with('.'))>);
+    static_assert(!CanBePiped<Range, decltype(std::views::join_with(NotAView{}))>);
+
+    static_assert(!std::is_invocable_v<decltype(std::views::join_with)>);
+    static_assert(!std::is_invocable_v<decltype(std::views::join_with), Pattern, Range>);
+    static_assert(!std::is_invocable_v<decltype(std::views::join_with), char, Range>);
+    static_assert(std::is_invocable_v<decltype(std::views::join_with), Range, Pattern>);
+    static_assert(std::is_invocable_v<decltype(std::views::join_with), Range, char>);
+    static_assert(!std::is_invocable_v<decltype(std::views::join_with), Range, Pattern, Pattern>);
+    static_assert(!std::is_invocable_v<decltype(std::views::join_with), Range, char, char>);
+    static_assert(!std::is_invocable_v<decltype(std::views::join_with), NonCopyablePattern>);
+  }
+
+  {
+    static_assert(std::is_same_v<decltype(std::ranges::views::join_with), decltype(std::views::join_with)>);
+    assert(std::addressof(std::ranges::views::join_with) == std::addressof(std::views::join_with));
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.overview/example.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.overview/example.pass.cpp
new file mode 100644
index 000000000000..1f7b889e2602
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.overview/example.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// [Example 1:
+//   vector<string> vs = {"the", "quick", "brown", "fox"};
+//   for (char c : vs | views::join_with('-')) {
+//     cout << c;
+//   }
+//   // The above prints the-quick-brown-fox
+// - end example]
+
+#include <ranges>
+
+#include <cassert>
+#include <string>
+#include <vector>
+
+constexpr bool test() {
+  std::vector<std::string> vs = {"the", "quick", "brown", "fox"};
+  std::string result;
+  for (char c : vs | std::views::join_with('-')) {
+    result += c;
+  }
+
+  return result == "the-quick-brown-fox";
+}
+
+int main(int, char**) {
+  assert(test());
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/ctor.default.pass.cpp
new file mode 100644
index 000000000000..634db24e02f2
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/ctor.default.pass.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// sentinel() = default;
+
+#include <ranges>
+
+#include "../types.h"
+
+constexpr bool test() {
+  using Inner   = BasicVectorView<char, ViewProperties{.common = false}, forward_iterator>;
+  using V       = BasicVectorView<Inner, ViewProperties{}, forward_iterator>;
+  using Pattern = Inner;
+  using JWV     = std::ranges::join_with_view<V, Pattern>;
+  static_assert(!std::ranges::common_range<JWV>);
+
+  [[maybe_unused]] std::ranges::sentinel_t<JWV> se;
+  [[maybe_unused]] std::ranges::sentinel_t<const JWV> cse;
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/ctor.non_const.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/ctor.non_const.pass.cpp
new file mode 100644
index 000000000000..14688ba7c538
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/ctor.non_const.pass.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// constexpr sentinel(sentinel<!Const> s)
+//   requires Const && convertible_to<sentinel_t<V>, sentinel_t<Base>>;
+
+#include <ranges>
+
+#include <type_traits>
+#include <vector>
+
+#include "../types.h"
+#include "test_iterators.h"
+
+constexpr bool test() {
+  { // Regular conversion from `!Const` to `Const` sentinel
+    using Inner            = BasicVectorView<int, ViewProperties{.common = false}, forward_iterator>;
+    std::vector<Inner> vec = {Inner{11, 12}, Inner{13, 14}};
+
+    std::ranges::join_with_view jwv(vec, 0);
+    using JWV = decltype(jwv);
+    static_assert(!std::ranges::common_range<JWV>);
+
+    using Sent  = std::ranges::sentinel_t<JWV>;
+    using CSent = std::ranges::sentinel_t<const JWV>;
+    static_assert(!std::same_as<Sent, CSent>);
+
+    Sent se                    = jwv.end();
+    [[maybe_unused]] CSent cse = se;
+  }
+
+  { // Test conversion from `Const` to `!Const` (should be invalid)
+    using Inner   = BasicVectorView<int, ViewProperties{.common = false}, forward_iterator>;
+    using V       = std::vector<Inner>;
+    using Pattern = std::ranges::single_view<int>;
+    using JWV     = std::ranges::join_with_view<std::views::all_t<V>, Pattern>;
+    static_assert(!std::ranges::common_range<JWV>);
+
+    using Sent  = std::ranges::sentinel_t<JWV>;
+    using CSent = std::ranges::sentinel_t<const JWV>;
+    static_assert(!std::convertible_to<CSent, Sent>);
+    static_assert(!std::constructible_from<Sent, CSent>);
+  }
+
+  { // When `convertible_to<sentinel_t<V>, sentinel_t<Base>>` is not modeled
+    using V       = ConstOppositeView<std::vector<long>>;
+    using Pattern = std::ranges::single_view<long>;
+    using JWV     = std::ranges::join_with_view<V, Pattern>;
+    static_assert(!std::ranges::common_range<JWV>);
+
+    using Sent  = std::ranges::sentinel_t<JWV>;
+    using CSent = std::ranges::sentinel_t<const JWV>;
+    static_assert(!std::convertible_to<CSent, Sent>);
+    static_assert(!std::constructible_from<Sent, CSent>);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/eq.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/eq.pass.cpp
new file mode 100644
index 000000000000..44fb25b403ba
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.sentinel/eq.pass.cpp
@@ -0,0 +1,109 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// template<bool OtherConst>
+//   requires sentinel_for<sentinel_t<Base>, iterator_t<maybe-const<OtherConst, V>>>
+// friend constexpr bool operator==(const iterator<OtherConst>& x, const sentinel& y);
+
+#include <ranges>
+
+#include <cassert>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "../types.h"
+#include "test_comparisons.h"
+#include "test_iterators.h"
+
+struct NonCrossConstComparableView : std::ranges::view_base {
+  using NonConstRange = std::vector<int>;
+  NonConstRange* begin();
+  sentinel_wrapper<NonConstRange*> end();
+
+  using ConstRange = BasicVectorView<int, ViewProperties{}, forward_iterator>;
+  ConstRange* begin() const;
+  sentinel_wrapper<ConstRange*> end() const;
+};
+
+static_assert(std::ranges::range<NonCrossConstComparableView>);
+static_assert(std::ranges::range<const NonCrossConstComparableView>);
+
+constexpr bool test() {
+  using Inner   = BasicVectorView<int, ViewProperties{.common = false}, cpp20_input_iterator>;
+  using V       = std::vector<Inner>;
+  using Pattern = std::ranges::single_view<int>;
+  using JWV     = std::ranges::join_with_view<std::ranges::owning_view<V>, Pattern>;
+  static_assert(!std::ranges::common_range<JWV>);
+
+  using Iter  = std::ranges::iterator_t<JWV>;
+  using CIter = std::ranges::iterator_t<const JWV>;
+  static_assert(!std::same_as<Iter, CIter>);
+
+  using Sent  = std::ranges::sentinel_t<JWV>;
+  using CSent = std::ranges::sentinel_t<const JWV>;
+  static_assert(!std::same_as<Sent, CSent>);
+
+  {   // Compare iterator<Const> with sentinel<Const>
+    { // Const == true
+      AssertEqualityReturnBool<CIter, CSent>();
+      const JWV jwv(V{Inner{1, 2}, Inner{4}}, 3);
+      assert(testEquality(std::ranges::next(jwv.begin(), 4), jwv.end(), true));
+      assert(testEquality(jwv.begin(), jwv.end(), false));
+    }
+
+    { // Const == false
+      AssertEqualityReturnBool<Iter, Sent>();
+      JWV jwv(V{Inner{5}, Inner{7, 8}}, 6);
+      assert(testEquality(std::ranges::next(jwv.begin(), 4), jwv.end(), true));
+      assert(testEquality(std::ranges::next(jwv.begin(), 2), jwv.end(), false));
+    }
+  }
+
+  {   // Compare iterator<Const> with sentinel<!Const>
+    { // Const == true
+      AssertEqualityReturnBool<CIter, Sent>();
+      JWV jwv(V{Inner{9, 10}, Inner{12}}, 11);
+      assert(testEquality(std::ranges::next(std::as_const(jwv).begin(), 4), jwv.end(), true));
+      assert(testEquality(std::ranges::next(std::as_const(jwv).begin(), 2), jwv.end(), false));
+    }
+
+    { // Const == false
+      AssertEqualityReturnBool<Iter, CSent>();
+      JWV jwv(V{Inner{13}, Inner{15, 16}}, 14);
+      assert(testEquality(std::ranges::next(jwv.begin(), 4), std::as_const(jwv).end(), true));
+      assert(testEquality(std::ranges::next(jwv.begin(), 3), std::as_const(jwv).end(), false));
+    }
+  }
+
+  { // Check invalid comparisons between iterator<Const> and sentinel<!Const>
+    using JWV2 = std::ranges::join_with_view<NonCrossConstComparableView, Pattern>;
+    static_assert(!std::ranges::common_range<JWV2>);
+
+    static_assert(!weakly_equality_comparable_with<std::ranges::iterator_t<const JWV2>, std::ranges::sentinel_t<JWV2>>);
+    static_assert(!weakly_equality_comparable_with<std::ranges::iterator_t<JWV2>, std::ranges::sentinel_t<const JWV2>>);
+
+    // Those should be valid
+    static_assert(weakly_equality_comparable_with<std::ranges::iterator_t<JWV2>, std::ranges::sentinel_t<JWV2>>);
+    static_assert(
+        weakly_equality_comparable_with<std::ranges::iterator_t<const JWV2>, std::ranges::sentinel_t<const JWV2>>);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/base.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/base.pass.cpp
new file mode 100644
index 000000000000..4e79b0fadfce
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/base.pass.cpp
@@ -0,0 +1,132 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// constexpr View base() const& requires copy_constructible<View>;
+// constexpr View base() &&;
+
+#include <ranges>
+
+#include <cassert>
+#include <utility>
+#include <vector>
+
+using InnerRange = std::vector<int>;
+
+struct Range : std::ranges::view_base {
+  constexpr explicit Range(InnerRange* b, InnerRange* e) : begin_(b), end_(e) {}
+  constexpr Range(const Range& other) : begin_(other.begin_), end_(other.end_), was_copy_initialized_(true) {}
+  constexpr Range(Range&& other) : begin_(other.begin_), end_(other.end_), was_move_initialized_(true) {}
+  Range& operator=(const Range&) = default;
+  Range& operator=(Range&&)      = default;
+  constexpr InnerRange* begin() const { return begin_; }
+  constexpr InnerRange* end() const { return end_; }
+
+  InnerRange* begin_;
+  InnerRange* end_;
+  bool was_copy_initialized_ = false;
+  bool was_move_initialized_ = false;
+};
+
+static_assert(std::ranges::view<Range>);
+static_assert(std::ranges::input_range<Range>);
+
+struct Pattern : std::ranges::view_base {
+  static constexpr int pat[2] = {0, 0};
+  constexpr const int* begin() const { return pat; }
+  constexpr const int* end() const { return pat + 2; }
+};
+
+static_assert(std::ranges::view<Pattern>);
+static_assert(std::ranges::forward_range<Pattern>);
+
+template <class Tp>
+struct NonCopyableRange : std::ranges::view_base {
+  NonCopyableRange(const NonCopyableRange&)            = delete;
+  NonCopyableRange(NonCopyableRange&&)                 = default;
+  NonCopyableRange& operator=(const NonCopyableRange&) = default;
+  NonCopyableRange& operator=(NonCopyableRange&&)      = default;
+  Tp* begin() const;
+  Tp* end() const;
+};
+
+static_assert(!std::copy_constructible<NonCopyableRange<InnerRange>>);
+static_assert(!std::copy_constructible<NonCopyableRange<int>>);
+
+template <typename T>
+concept CanCallBaseOn = requires(T&& t) { std::forward<T>(t).base(); };
+
+constexpr bool test() {
+  InnerRange buff[3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
+  Pattern pattern;
+
+  { // Check the const& overload
+    Range range(buff, buff + 3);
+    std::ranges::join_with_view<Range, Pattern> view(range, pattern);
+    std::same_as<Range> decltype(auto) result = view.base();
+    assert(result.was_copy_initialized_);
+    assert(result.begin() == buff);
+    assert(result.end() == buff + 3);
+  }
+
+  { // Check the const& overload on const `view`
+    Range range(buff, buff + 3);
+    const std::ranges::join_with_view<Range, Pattern> view(range, pattern);
+    std::same_as<Range> decltype(auto) result = view.base();
+    assert(result.was_copy_initialized_);
+    assert(result.begin() == buff);
+    assert(result.end() == buff + 3);
+  }
+
+  { // Check the && overload
+    Range range(buff, buff + 3);
+    std::ranges::join_with_view<Range, Pattern> view(range, pattern);
+    std::same_as<Range> decltype(auto) result = std::move(view).base();
+    assert(result.was_move_initialized_);
+    assert(result.begin() == buff);
+    assert(result.end() == buff + 3);
+  }
+
+  { // Ensure the const& overload is not considered when the base is not copy-constructible
+    static_assert(!CanCallBaseOn<const std::ranges::join_with_view<NonCopyableRange<InnerRange>, Pattern>&>);
+    static_assert(!CanCallBaseOn<std::ranges::join_with_view<NonCopyableRange<InnerRange>, Pattern>&>);
+    static_assert(!CanCallBaseOn<const std::ranges::join_with_view<NonCopyableRange<InnerRange>, Pattern>&&>);
+    static_assert(CanCallBaseOn<std::ranges::join_with_view<NonCopyableRange<InnerRange>, Pattern>&&>);
+    static_assert(CanCallBaseOn<std::ranges::join_with_view<NonCopyableRange<InnerRange>, Pattern>>);
+  }
+
+  { // Ensure the const& overload does not depend on Pattern's copy-constructability
+    static_assert(CanCallBaseOn<const std::ranges::join_with_view<Range, NonCopyableRange<int>>&>);
+    static_assert(CanCallBaseOn<std::ranges::join_with_view<Range, NonCopyableRange<int>>&>);
+    static_assert(CanCallBaseOn<const std::ranges::join_with_view<Range, NonCopyableRange<int>>&&>);
+    static_assert(CanCallBaseOn<std::ranges::join_with_view<Range, NonCopyableRange<int>>&&>);
+    static_assert(CanCallBaseOn<std::ranges::join_with_view<Range, NonCopyableRange<int>>>);
+  }
+
+  { // Check above two at the same time
+    static_assert(
+        !CanCallBaseOn<const std::ranges::join_with_view<NonCopyableRange<InnerRange>, NonCopyableRange<int>>&>);
+    static_assert(!CanCallBaseOn<std::ranges::join_with_view<NonCopyableRange<InnerRange>, NonCopyableRange<int>>&>);
+    static_assert(
+        !CanCallBaseOn<const std::ranges::join_with_view< NonCopyableRange<InnerRange>, NonCopyableRange<int>>&&>);
+    static_assert(CanCallBaseOn<std::ranges::join_with_view<NonCopyableRange<InnerRange>, NonCopyableRange<int>>&&>);
+    static_assert(CanCallBaseOn<std::ranges::join_with_view<NonCopyableRange<InnerRange>, NonCopyableRange<int>>>);
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/begin.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/begin.pass.cpp
new file mode 100644
index 000000000000..22872c20773a
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/begin.pass.cpp
@@ -0,0 +1,221 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// constexpr auto begin();
+// constexpr auto begin() const
+//   requires forward_range<const V> &&
+//            forward_range<const Pattern> &&
+//            is_reference_v<range_reference_t<const V>> &&
+//            input_range<range_reference_t<const V>>;
+
+#include <ranges>
+
+#include <algorithm>
+#include <array>
+#include <string>
+#include <vector>
+
+#include "../types.h"
+#include "test_iterators.h"
+
+template <bool Simple>
+using MaybeSimpleForwardView = BasicView<std::vector<std::string>, ViewProperties{.simple = Simple}, forward_iterator>;
+
+template <bool Simple>
+using MaybeSimpleForwardRvalueView =
+    BasicView<RvalueVector<std::string>, ViewProperties{.simple = Simple}, forward_iterator>;
+
+template <bool Simple>
+using MaybeSimplePattern = BasicView<std::string, ViewProperties{.simple = Simple}, forward_iterator>;
+
+template <class V, class Pattern>
+concept JoinWithViewHasConstBegin = requires(const std::ranges::join_with_view<V, Pattern> jwv) {
+  { jwv.begin() } -> std::input_iterator;
+};
+
+constexpr void test_begin() {
+  using Str = std::string;
+  using Vec = std::vector<Str>;
+
+  { // `V` models simple-view
+    // `is_reference_v<InnerRng>` is true
+    // `Pattern` models simple-view
+    // `V` and `Pattern` contain some elements
+    using V       = MaybeSimpleForwardView<true>;
+    using Pattern = MaybeSimplePattern<true>;
+    std::ranges::join_with_view<V, Pattern> jwv(V(Vec{"A", "B", "C"}), Pattern(Str{">>"}));
+    auto it = jwv.begin();
+    assert(std::ranges::equal(std::views::counted(it, 7), Str{"A>>B>>C"}));
+  }
+
+  { // `V` does not model simple-view
+    // `is_reference_v<InnerRng>` is true
+    // `Pattern` models simple-view
+    // `V` and `Pattern` are empty
+    using V       = MaybeSimpleForwardView<false>;
+    using Pattern = MaybeSimplePattern<true>;
+    std::ranges::join_with_view<V, Pattern> jwv(V(Vec{}), Pattern(Str{}));
+    auto it = jwv.begin();
+    assert(it == jwv.end());
+  }
+
+  { // `V` models simple-view
+    // `is_reference_v<InnerRng>` is false
+    // `Pattern` models simple-view
+    // `V` contains two elements, `Pattern` is empty
+    using V       = MaybeSimpleForwardRvalueView<true>;
+    using Pattern = MaybeSimplePattern<true>;
+    std::ranges::join_with_view<V, Pattern> jwv(V(Vec{"1", "2"}), Pattern(Str{""}));
+    auto it = jwv.begin();
+    assert(*it == '1');
+    assert(*++it == '2');
+  }
+
+  { // `V` models simple-view
+    // `is_reference_v<InnerRng>` is true
+    // `Pattern` does not model simple-view
+    // `V` contains one element, `Pattern` is empty
+    using V       = MaybeSimpleForwardView<true>;
+    using Pattern = MaybeSimplePattern<false>;
+    std::ranges::join_with_view<V, Pattern> jwv(V(Vec{"07"}), Pattern(Str{}));
+    auto it = jwv.begin();
+    assert(*it++ == '0');
+    assert(*it == '7');
+  }
+
+  { // `V` does not model simple-view
+    // `is_reference_v<InnerRng>` is false
+    // `Pattern` models simple-view
+    // `V` contains three elements (2nd is empty), `Pattern` is not empty
+    using V       = MaybeSimpleForwardRvalueView<false>;
+    using Pattern = MaybeSimplePattern<true>;
+    std::ranges::join_with_view<V, Pattern> jwv(V(Vec{"A", "", "C"}), Pattern(Str{"--"}));
+    auto it = jwv.begin();
+    assert(std::ranges::equal(std::views::counted(it, 6), Str("A----C")));
+  }
+
+  { // `V` does not model simple-view
+    // `is_reference_v<InnerRng>` is true
+    // `Pattern` does not model simple-view
+    // `V` contains some empty elements, `Pattern` is not empty
+    using V       = MaybeSimpleForwardView<false>;
+    using Pattern = MaybeSimplePattern<false>;
+    std::ranges::join_with_view<V, Pattern> jwv(V(Vec{"", "", ""}), Pattern(Str{"-"}));
+    auto it = jwv.begin();
+    assert(*it++ == '-');
+    assert(*it == '-');
+  }
+
+  { // `V` models simple-view
+    // `is_reference_v<InnerRng>` is false
+    // `Pattern` does not model simple-view
+    // `V` contains two elements, `Pattern` is not empty
+    using V       = MaybeSimpleForwardRvalueView<true>;
+    using Pattern = MaybeSimplePattern<false>;
+    std::ranges::join_with_view<V, Pattern> jwv(V(Vec{"X", "Z"}), Pattern(Str{"Y"}));
+    auto it = jwv.begin();
+    assert(*it == 'X');
+    assert(*++it == 'Y');
+    assert(*++it == 'Z');
+  }
+
+  { // `V` does not model simple-view
+    // `is_reference_v<InnerRng>` is false
+    // `Pattern` does not model simple-view
+    // `V` contains two empty elements, `Pattern` is not empty
+    using V       = MaybeSimpleForwardRvalueView<false>;
+    using Pattern = MaybeSimplePattern<false>;
+    std::ranges::join_with_view<V, Pattern> jwv(V(Vec{"", ""}), Pattern(Str{"?"}));
+    auto it = jwv.begin();
+    assert(*it == '?');
+    assert(++it == jwv.end());
+  }
+
+  { // `V` does not model forward range
+    // `V` contains some empty elements, `Pattern` is empty
+    using V       = BasicView<Vec, ViewProperties{.common = false}, cpp20_input_iterator>;
+    using Pattern = MaybeSimplePattern<false>;
+    std::ranges::join_with_view<V, Pattern> jwv(V(Vec{"", "", ""}), Pattern(Str{""}));
+    auto it = jwv.begin();
+    assert(it == jwv.end());
+  }
+}
+
+constexpr void test_const_begin() {
+  using Vec = std::vector<std::array<int, 2>>;
+  using Pat = std::array<int, 2>;
+
+  { // `const V` models forward range
+    // `const Pattern` models forward range
+    // `is_reference_v<range_reference_t<const V>>` is true
+    // `range_reference_t<const V>` models input range
+    using V       = BasicView<Vec, ViewProperties{}, forward_iterator>;
+    using Pattern = BasicView<Pat, ViewProperties{}, forward_iterator>;
+
+    const std::ranges::join_with_view<V, Pattern> jwv{V{Vec{std::array{1, 2}, std::array{3, 4}}}, Pattern{Pat{0, 0}}};
+    auto it = jwv.begin();
+    assert(std::ranges::equal(std::views::counted(it, 6), std::array{1, 2, 0, 0, 3, 4}));
+  }
+
+  // `const V` does not model forward range
+  // `const Pattern` models forward range
+  // `is_reference_v<range_reference_t<const V>>` is true
+  // `range_reference_t<const V>` models input range
+  static_assert(!JoinWithViewHasConstBegin<BasicView<Vec, ViewProperties{.common = false}, cpp20_input_iterator>,
+                                           BasicView<Pat, ViewProperties{}, forward_iterator>>);
+
+  // `const V` models forward range
+  // `const Pattern` does not model forward range
+  // `is_reference_v<range_reference_t<const V>>` is true
+  // `range_reference_t<const V>` models input range
+  static_assert(!JoinWithViewHasConstBegin<BasicView<Vec, ViewProperties{}, forward_iterator>,
+                                           BasicView<Pat, ViewProperties{.common = false}, cpp20_input_iterator>>);
+
+  // `const V` models forward range
+  // `const Pattern` models forward range
+  // `is_reference_v<range_reference_t<const V>>` is false
+  // `range_reference_t<const V>` models input range
+  static_assert(
+      !JoinWithViewHasConstBegin<BasicView<RvalueVector<std::vector<int>>, ViewProperties{}, forward_iterator>,
+                                 BasicView<Pat, ViewProperties{}, forward_iterator>>);
+
+  // `const V` models forward range
+  // `const Pattern` models forward range
+  // `is_reference_v<range_reference_t<const V>>` is true
+  // `range_reference_t<const V>` does not model input range
+  static_assert(!JoinWithViewHasConstBegin<
+                BasicView<std::vector<InputRangeButOutputWhenConst<int>>, ViewProperties{}, forward_iterator>,
+                BasicView<Pat, ViewProperties{}, forward_iterator>>);
+
+  // `concatable<range_reference_t<const V>, const Pattern>` is not satisfied
+  // See also LWG-4074: compatible-joinable-ranges is underconstrained
+  static_assert(!JoinWithViewHasConstBegin<BasicVectorView<int, ViewProperties{}, forward_iterator>,
+                                           lwg4074::PatternWithProxyConstAccess>);
+
+  // Check situation when iterators returned by `begin()` and `begin() const` are the same
+  using JWV = std::ranges::join_with_view<MaybeSimpleForwardView<true>, MaybeSimplePattern<true>>;
+  static_assert(std::same_as<std::ranges::iterator_t<JWV&>, std::ranges::iterator_t<const JWV&>>);
+}
+
+constexpr bool test() {
+  test_begin();
+  test_const_begin();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/constraints.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/constraints.compile.pass.cpp
new file mode 100644
index 000000000000..86d624941d22
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/constraints.compile.pass.cpp
@@ -0,0 +1,289 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// template <input_range V, forward_range Pattern>
+//  requires view<V> && input_range<range_reference_t<V>> && view<Pattern> &&
+//           compatible-joinable-ranges<range_reference_t<V>, Pattern>
+// class join_with_view;
+
+#include <ranges>
+
+#include <cstddef>
+#include <vector>
+
+#include "test_iterators.h"
+#include "../types.h"
+
+template <class View, class Pattern>
+concept CanFormJoinWithView = requires { typename std::ranges::join_with_view<View, Pattern>; };
+
+// join_with_view is not valid when `V` is not an input_range
+namespace test_when_view_is_not_an_input_range {
+struct View : std::ranges::view_base {
+  using It = cpp20_output_iterator<std::vector<int>*>;
+  It begin();
+  sentinel_wrapper<It> end();
+};
+
+struct Pattern : std::ranges::view_base {
+  int* begin();
+  int* end();
+};
+
+static_assert(std::ranges::range<View>);
+static_assert(!std::ranges::input_range<View>);
+static_assert(std::ranges::view<View>);
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::view<Pattern>);
+static_assert(!CanFormJoinWithView<View, Pattern>);
+} // namespace test_when_view_is_not_an_input_range
+
+// join_with_view is not valid when `Pattern` is not a forward_range
+namespace test_when_pattern_is_not_a_forward_range {
+struct View : std::ranges::view_base {
+  std::vector<float>* begin();
+  std::vector<float>* end();
+};
+
+struct Pattern : std::ranges::view_base {
+  using It = cpp20_input_iterator<float*>;
+  It begin();
+  sentinel_wrapper<It> end();
+};
+
+static_assert(std::ranges::input_range<View>);
+static_assert(std::ranges::view<View>);
+static_assert(!std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::view<Pattern>);
+static_assert(!CanFormJoinWithView<View, Pattern>);
+} // namespace test_when_pattern_is_not_a_forward_range
+
+// join_with_view is not valid when `V` does not model std::ranges::view
+namespace test_when_view_does_not_model_view {
+struct View {
+  std::vector<double>* begin();
+  std::vector<double>* end();
+};
+
+struct Pattern : std::ranges::view_base {
+  double* begin();
+  double* end();
+};
+
+static_assert(std::ranges::input_range<View>);
+static_assert(!std::ranges::view<View>);
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::view<Pattern>);
+static_assert(!CanFormJoinWithView<View, Pattern>);
+} // namespace test_when_view_does_not_model_view
+
+// join_with_view is not valid when `range_reference_t` of `V` is not an input_range
+namespace test_when_range_reference_t_of_view_is_not_an_input_range {
+struct InnerRange {
+  using It = cpp20_output_iterator<long*>;
+  It begin();
+  sentinel_wrapper<It> end();
+};
+
+struct View : std::ranges::view_base {
+  InnerRange* begin();
+  InnerRange* end();
+};
+
+struct Pattern : std::ranges::view_base {
+  long* begin();
+  long* end();
+};
+
+static_assert(std::ranges::range<InnerRange>);
+static_assert(!std::ranges::input_range<InnerRange>);
+static_assert(std::ranges::input_range<View>);
+static_assert(std::ranges::view<View>);
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::view<Pattern>);
+static_assert(!CanFormJoinWithView<View, Pattern>);
+} // namespace test_when_range_reference_t_of_view_is_not_an_input_range
+
+// join_with_view is not valid when `Pattern` does not model std::ranges::view
+namespace test_when_pattern_does_not_model_view {
+struct View : std::ranges::view_base {
+  std::vector<short>* begin();
+  std::vector<short>* end();
+};
+
+struct Pattern {
+  short* begin();
+  short* end();
+};
+
+static_assert(std::ranges::input_range<View>);
+static_assert(std::ranges::view<View>);
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(!std::ranges::view<Pattern>);
+static_assert(!CanFormJoinWithView<View, Pattern>);
+} // namespace test_when_pattern_does_not_model_view
+
+// join_with_view is not valid when `range_reference_t<View>` and pattern
+// does not model together compatible-joinable-ranges
+namespace test_when_used_ranges_are_not_concatable {
+using std::ranges::range_reference_t;
+using std::ranges::range_rvalue_reference_t;
+using std::ranges::range_value_t;
+
+template <class InnerRange>
+struct View : std::ranges::view_base {
+  InnerRange* begin();
+  InnerRange* end();
+};
+
+namespace no_concat_reference_t {
+struct ValueType {};
+
+struct InnerRange {
+  struct It {
+    using difference_type = std::ptrdiff_t;
+    using value_type      = ValueType;
+    struct reference {
+      operator value_type();
+    };
+
+    It& operator++();
+    void operator++(int);
+    reference operator*() const;
+  };
+
+  It begin();
+  sentinel_wrapper<It> end();
+};
+
+struct Pattern : std::ranges::view_base {
+  struct It {
+    using difference_type = std::ptrdiff_t;
+    using value_type      = ValueType;
+    struct reference {
+      operator value_type();
+    };
+
+    It& operator++();
+    It operator++(int);
+    reference operator*() const;
+    bool operator==(const It&) const;
+    friend value_type&& iter_move(const It&);
+  };
+
+  It begin();
+  It end();
+};
+
+static_assert(std::ranges::input_range<InnerRange>);
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::view<Pattern>);
+static_assert(!std::common_reference_with<range_reference_t<InnerRange>, range_reference_t<Pattern>>);
+static_assert(std::common_with<range_value_t<InnerRange>, range_value_t<Pattern>>);
+static_assert(std::common_reference_with<range_rvalue_reference_t<InnerRange>, range_rvalue_reference_t<Pattern>>);
+static_assert(!CanFormJoinWithView<View<InnerRange>, Pattern>);
+} // namespace no_concat_reference_t
+
+namespace no_concat_value_t {
+struct InnerRange {
+  struct It {
+    using difference_type = std::ptrdiff_t;
+    struct value_type {};
+
+    struct reference {
+      operator value_type();
+      operator float();
+    };
+
+    It& operator++();
+    void operator++(int);
+    reference operator*() const;
+  };
+
+  It begin();
+  sentinel_wrapper<It> end();
+};
+
+struct Pattern : std::ranges::view_base {
+  const float* begin();
+  const float* end();
+};
+
+static_assert(std::ranges::input_range<InnerRange>);
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::view<Pattern>);
+static_assert(std::common_reference_with<range_reference_t<InnerRange>, range_reference_t<Pattern>>);
+static_assert(!std::common_with<range_value_t<InnerRange>, range_value_t<Pattern>>);
+static_assert(std::common_reference_with<range_rvalue_reference_t<InnerRange>, range_rvalue_reference_t<Pattern>>);
+static_assert(!CanFormJoinWithView<View<InnerRange>, Pattern>);
+} // namespace no_concat_value_t
+
+namespace no_concat_rvalue_reference_t {
+struct InnerRange {
+  using It = cpp20_input_iterator<int*>;
+  It begin();
+  sentinel_wrapper<It> end();
+};
+
+struct Pattern : std::ranges::view_base {
+  struct It {
+    using difference_type = std::ptrdiff_t;
+    struct value_type {
+      operator int() const;
+    };
+
+    struct rvalue_reference {
+      operator value_type();
+    };
+
+    It& operator++();
+    It operator++(int);
+    value_type& operator*() const;
+    bool operator==(const It&) const;
+    friend rvalue_reference iter_move(const It&);
+  };
+
+  It begin();
+  It end();
+};
+
+static_assert(std::ranges::input_range<InnerRange>);
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::view<Pattern>);
+static_assert(std::common_reference_with<range_reference_t<InnerRange>, range_reference_t<Pattern>>);
+static_assert(std::common_with<range_value_t<InnerRange>, range_value_t<Pattern>>);
+static_assert(!std::common_reference_with<range_rvalue_reference_t<InnerRange>, range_rvalue_reference_t<Pattern>>);
+static_assert(!CanFormJoinWithView<View<InnerRange>, Pattern>);
+} // namespace no_concat_rvalue_reference_t
+
+namespace not_concat_indirectly_readable { // Required after LWG-4074 ("compatible-joinable-ranges is underconstrained")
+struct InnerRange {
+  using It = cpp20_input_iterator<int*>;
+  It begin();
+  sentinel_wrapper<It> end();
+};
+
+struct Pattern : std::ranges::view_base {
+  lwg4074::Iter begin();
+  lwg4074::Iter end();
+};
+
+static_assert(std::ranges::input_range<InnerRange>);
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::view<Pattern>);
+static_assert(std::common_reference_with<range_reference_t<InnerRange>, range_reference_t<Pattern>>);
+static_assert(std::common_with<range_value_t<InnerRange>, range_value_t<Pattern>>);
+static_assert(std::common_reference_with<range_rvalue_reference_t<InnerRange>, range_rvalue_reference_t<Pattern>>);
+LIBCPP_STATIC_ASSERT(!std::ranges::__concat_indirectly_readable<InnerRange, Pattern>);
+static_assert(!CanFormJoinWithView<View<InnerRange>, Pattern>);
+} // namespace not_concat_indirectly_readable
+} // namespace test_when_used_ranges_are_not_concatable
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctad.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctad.compile.pass.cpp
new file mode 100644
index 000000000000..07b70bc39cda
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctad.compile.pass.cpp
@@ -0,0 +1,230 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// template<class R, class P>
+//   join_with_view(R&&, P&&) -> join_with_view<views::all_t<R>, views::all_t<P>>;
+//
+// template<input_range R>
+//   join_with_view(R&&, range_value_t<range_reference_t<R>>)
+//     -> join_with_view<views::all_t<R>, single_view<range_value_t<range_reference_t<R>>>>;
+
+#include <ranges>
+
+#include <deque>
+#include <type_traits>
+
+#include "test_iterators.h"
+
+struct View : std::ranges::view_base {
+  using It = cpp20_input_iterator<std::deque<int>*>;
+
+  View() = default;
+  It begin() const;
+  sentinel_wrapper<It> end() const;
+};
+
+static_assert(std::ranges::input_range<View>);
+static_assert(std::ranges::view<View>);
+
+struct Pattern : std::ranges::view_base {
+  Pattern() = default;
+  forward_iterator<int*> begin();
+  forward_iterator<int*> end();
+};
+
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::view<Pattern>);
+
+// A range that is not a view
+struct Range {
+  using It = cpp20_input_iterator<std::deque<int>*>;
+
+  Range() = default;
+  It begin() const;
+  sentinel_wrapper<It> end() const;
+};
+
+static_assert(std::ranges::input_range<Range>);
+static_assert(!std::ranges::view<Range>);
+
+// A pattern that is not a view
+struct RangePattern {
+  RangePattern() = default;
+  forward_iterator<int*> begin();
+  forward_iterator<int*> end();
+};
+
+static_assert(std::ranges::forward_range<RangePattern>);
+static_assert(!std::ranges::view<RangePattern>);
+
+void test_range_and_pattern_deduction_guide() {
+  { // Both `v` and `pat` model `std::ranges::view`.
+    {
+      View v;
+      Pattern pat;
+      std::ranges::join_with_view view(v, pat);
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<View, Pattern>>);
+    }
+    {
+      View v;
+      std::ranges::join_with_view view(v, Pattern{});
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<View, Pattern>>);
+    }
+    {
+      Pattern pat;
+      std::ranges::join_with_view view(View{}, pat);
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<View, Pattern>>);
+    }
+    {
+      std::ranges::join_with_view view(View{}, Pattern{});
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<View, Pattern>>);
+    }
+  }
+
+  { // Only `pat` models `std::ranges::view`.
+    {
+      Range v;
+      Pattern pat;
+      std::ranges::join_with_view view(v, pat);
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<std::ranges::ref_view<Range>, Pattern>>);
+    }
+    {
+      Range v;
+      std::ranges::join_with_view view(v, Pattern{});
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<std::ranges::ref_view<Range>, Pattern>>);
+    }
+    {
+      Pattern pat;
+      std::ranges::join_with_view view(Range{}, pat);
+      static_assert(
+          std::is_same_v<decltype(view), std::ranges::join_with_view<std::ranges::owning_view<Range>, Pattern>>);
+    }
+    {
+      std::ranges::join_with_view view(Range{}, Pattern{});
+      static_assert(
+          std::is_same_v<decltype(view), std::ranges::join_with_view<std::ranges::owning_view<Range>, Pattern>>);
+    }
+  }
+
+  { // Only `v` models `std::ranges::view`.
+    {
+      View v;
+      RangePattern pat;
+      std::ranges::join_with_view view(v, pat);
+      static_assert(
+          std::is_same_v<decltype(view), std::ranges::join_with_view<View, std::ranges::ref_view<RangePattern>>>);
+    }
+    {
+      View v;
+      std::ranges::join_with_view view(v, RangePattern{});
+      static_assert(
+          std::is_same_v<decltype(view), std::ranges::join_with_view<View, std::ranges::owning_view<RangePattern>>>);
+    }
+    {
+      RangePattern pat;
+      std::ranges::join_with_view view(View{}, pat);
+      static_assert(
+          std::is_same_v<decltype(view), std::ranges::join_with_view<View, std::ranges::ref_view<RangePattern>>>);
+    }
+    {
+      std::ranges::join_with_view view(View{}, RangePattern{});
+      static_assert(
+          std::is_same_v<decltype(view), std::ranges::join_with_view<View, std::ranges::owning_view<RangePattern>>>);
+    }
+  }
+
+  { // Both `v` and `pat` don't model `std::ranges::view`.
+    {
+      Range r;
+      RangePattern pat;
+      std::ranges::join_with_view view(r, pat);
+      static_assert(std::is_same_v<
+                    decltype(view),
+                    std::ranges::join_with_view<std::ranges::ref_view<Range>, std::ranges::ref_view<RangePattern>>>);
+    }
+    {
+      Range r;
+      std::ranges::join_with_view view(r, RangePattern{});
+      static_assert(std::is_same_v<
+                    decltype(view),
+                    std::ranges::join_with_view<std::ranges::ref_view<Range>, std::ranges::owning_view<RangePattern>>>);
+    }
+    {
+      RangePattern pat;
+      std::ranges::join_with_view view(Range{}, pat);
+      static_assert(std::is_same_v<
+                    decltype(view),
+                    std::ranges::join_with_view<std::ranges::owning_view<Range>, std::ranges::ref_view<RangePattern>>>);
+    }
+    {
+      std::ranges::join_with_view view(Range{}, RangePattern{});
+      static_assert(
+          std::is_same_v<
+              decltype(view),
+              std::ranges::join_with_view<std::ranges::owning_view<Range>, std::ranges::owning_view<RangePattern>>>);
+    }
+  }
+}
+
+void test_range_and_element_deduction_guide() {
+  { // Element is lvalue
+    int elem = 0;
+
+    {
+      View v;
+      std::ranges::join_with_view view(v, elem);
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<View, std::ranges::single_view<int>>>);
+    }
+    {
+      std::ranges::join_with_view view(View{}, elem);
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<View, std::ranges::single_view<int>>>);
+    }
+    {
+      Range r;
+      std::ranges::join_with_view view(r, elem);
+      static_assert(
+          std::is_same_v<decltype(view),
+                         std::ranges::join_with_view<std::ranges::ref_view<Range>, std::ranges::single_view<int>>>);
+    }
+    {
+      std::ranges::join_with_view view(Range{}, elem);
+      static_assert(
+          std::is_same_v<decltype(view),
+                         std::ranges::join_with_view<std::ranges::owning_view<Range>, std::ranges::single_view<int>>>);
+    }
+  }
+
+  { // Element is rvalue
+    {
+      View v;
+      std::ranges::join_with_view view(v, 1);
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<View, std::ranges::single_view<int>>>);
+    }
+    {
+      std::ranges::join_with_view view(View{}, 1);
+      static_assert(std::is_same_v<decltype(view), std::ranges::join_with_view<View, std::ranges::single_view<int>>>);
+    }
+    {
+      Range r;
+      std::ranges::join_with_view view(r, 1);
+      static_assert(
+          std::is_same_v<decltype(view),
+                         std::ranges::join_with_view<std::ranges::ref_view<Range>, std::ranges::single_view<int>>>);
+    }
+    {
+      std::ranges::join_with_view view(Range{}, 1);
+      static_assert(
+          std::is_same_v<decltype(view),
+                         std::ranges::join_with_view<std::ranges::owning_view<Range>, std::ranges::single_view<int>>>);
+    }
+  }
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.default.pass.cpp
new file mode 100644
index 000000000000..2177ea48ce85
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.default.pass.cpp
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// join_with_view()
+//   requires default_initializable<V> && default_initializable<Pattern> = default;
+
+#include <ranges>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <type_traits>
+
+static constexpr auto view = std::to_array<std::array<int, 2>>({{1, 2}, {3, 4}, {5, 6}});
+
+struct TrivialView : std::ranges::view_base {
+  int val_; // intentionally uninitialized
+
+  constexpr auto begin() { return view.data(); }
+  constexpr auto end() { return view.data() + view.size(); }
+};
+
+static_assert(std::is_trivially_copyable_v<TrivialView> && std::is_trivially_default_constructible_v<TrivialView>);
+
+struct NonDefaultConstructibleView : TrivialView {
+  NonDefaultConstructibleView(int);
+};
+
+struct TrivialPattern : std::ranges::view_base {
+  int val_; // intentionally uninitialized
+
+  constexpr int* begin() { return &val_; }
+  constexpr int* end() { return &val_ + 1; }
+};
+
+static_assert(std::is_trivially_copyable_v<TrivialPattern> &&
+              std::is_trivially_default_constructible_v<TrivialPattern>);
+
+struct NonDefaultConstructiblePattern : TrivialPattern {
+  NonDefaultConstructiblePattern(int);
+};
+
+constexpr bool test() {
+  { // Check if `base_` and `pattern_` are value initialised
+    std::ranges::join_with_view<TrivialView, TrivialPattern> v;
+    assert(std::move(v).base().val_ == 0);
+    assert(std::ranges::equal(v, std::array{1, 2, 0, 3, 4, 0, 5, 6}));
+  }
+
+  { // Default constructor should not be explicit
+    [[maybe_unused]] std::ranges::join_with_view<TrivialView, TrivialPattern> v = {};
+  }
+
+  static_assert(std::default_initializable<std::ranges::join_with_view<TrivialView, TrivialPattern>>);
+  static_assert(!std::default_initializable<std::ranges::join_with_view<TrivialView, NonDefaultConstructiblePattern>>);
+  static_assert(!std::default_initializable<std::ranges::join_with_view<NonDefaultConstructibleView, TrivialPattern>>);
+  static_assert(!std::default_initializable<
+                std::ranges::join_with_view<NonDefaultConstructibleView, NonDefaultConstructiblePattern>>);
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.range.element.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.range.element.pass.cpp
new file mode 100644
index 000000000000..7266912e41fd
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.range.element.pass.cpp
@@ -0,0 +1,244 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// template<input_range R>
+//   requires constructible_from<V, views::all_t<R>> &&
+//   constructible_from<Pattern, single_view<range_value_t<InnerRng>>>
+// constexpr explicit join_with_view(R&& r, range_value_t<InnerRng> e);
+
+#include <ranges>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <type_traits>
+#include <utility>
+
+#include "../types.h"
+#include "test_iterators.h"
+#include "test_range.h"
+
+struct MoveOnlyInt {
+  MoveOnlyInt()                         = default;
+  MoveOnlyInt(MoveOnlyInt&&)            = default;
+  MoveOnlyInt& operator=(MoveOnlyInt&&) = default;
+
+  constexpr MoveOnlyInt(int val) : val_(val) {}
+  constexpr operator int() const { return val_; }
+
+  int val_ = 0;
+};
+
+template <>
+struct std::common_type<MoveOnlyInt, int> {
+  using type = int;
+};
+
+template <>
+struct std::common_type<int, MoveOnlyInt> {
+  using type = int;
+};
+
+struct OutputView : std::ranges::view_base {
+  using It = cpp20_output_iterator<int*>;
+  It begin() const;
+  sentinel_wrapper<It> end() const;
+};
+
+static_assert(std::ranges::output_range<OutputView, int>);
+static_assert(std::ranges::view<OutputView>);
+
+struct InputRange {
+  using It = cpp20_input_iterator<int*>;
+  It begin() const;
+  sentinel_wrapper<It> end() const;
+};
+
+struct InputView : InputRange, std::ranges::view_base {};
+
+static_assert(std::ranges::input_range<InputRange>);
+static_assert(std::ranges::input_range<const InputRange>);
+static_assert(std::ranges::view<InputView>);
+static_assert(std::ranges::input_range<InputView>);
+static_assert(std::ranges::input_range<const InputView>);
+
+class View : public std::ranges::view_base {
+  using OuterRange = std::array<std::array<MoveOnlyInt, 2>, 3>;
+
+  static constexpr OuterRange range_on_input_view            = {{{1, 1}, {1, 1}, {1, 1}}};
+  static constexpr OuterRange range_on_ref_input_range       = {{{2, 2}, {2, 2}, {2, 2}}};
+  static constexpr OuterRange range_on_const_ref_input_range = {{{3, 3}, {3, 3}, {3, 3}}};
+  static constexpr OuterRange range_on_owning_input_range    = {{{4, 4}, {4, 4}, {4, 4}}};
+
+  const OuterRange* r_;
+
+public:
+  // Those functions should never be called in this test.
+  View(View&&) { assert(false); }
+  View(OutputView) { assert(false); }
+  View& operator=(View&&) {
+    assert(false);
+    return *this;
+  }
+
+  constexpr explicit View(InputView) : r_(&range_on_input_view) {}
+  constexpr explicit View(InputRange) = delete;
+  constexpr explicit View(std::ranges::ref_view<InputRange>) : r_(&range_on_ref_input_range) {}
+  constexpr explicit View(std::ranges::ref_view<const InputRange>) : r_(&range_on_const_ref_input_range) {}
+  constexpr explicit View(std::ranges::owning_view<InputRange>) : r_(&range_on_owning_input_range) {}
+
+  constexpr auto begin() const { return r_->begin(); }
+  constexpr auto end() const { return r_->end(); }
+};
+
+static_assert(std::ranges::input_range<View>);
+static_assert(std::ranges::input_range<const View>);
+
+class Pattern : public std::ranges::view_base {
+  int val_;
+
+public:
+  // Those functions should never be called in this test.
+  Pattern(Pattern&&) { assert(false); }
+  template <class T>
+  Pattern(const std::ranges::single_view<T>&) {
+    assert(false);
+  }
+  Pattern& operator=(Pattern&&) {
+    assert(false);
+    return *this;
+  }
+
+  template <class T>
+  constexpr explicit Pattern(std::ranges::single_view<T>&& v) : val_(v[0]) {}
+
+  constexpr const int* begin() const { return &val_; }
+  constexpr const int* end() const { return &val_ + 1; }
+};
+
+static_assert(std::ranges::forward_range<Pattern>);
+static_assert(std::ranges::forward_range<const Pattern>);
+
+constexpr void test_ctor_with_view_and_element() {
+  // Check construction from `r` and `e`, when `r` models `std::ranges::view`
+
+  { // `r` and `e` are glvalues
+    InputView r;
+    int e = 0;
+    std::ranges::join_with_view<View, Pattern> jwv(r, e);
+    assert(std::ranges::equal(jwv, std::array{1, 1, 0, 1, 1, 0, 1, 1}));
+  }
+
+  { // `r` and `e` are const glvalues
+    const InputView r;
+    const int e = 1;
+    std::ranges::join_with_view<View, Pattern> jwv(r, e);
+    assert(std::ranges::equal(jwv, std::array{1, 1, 1, 1, 1, 1, 1, 1}));
+  }
+
+  { // `r` and `e` are prvalues
+    std::ranges::join_with_view<View, Pattern> jwv(InputView{}, MoveOnlyInt{2});
+    assert(std::ranges::equal(jwv, std::array{1, 1, 2, 1, 1, 2, 1, 1}));
+  }
+
+  { // `r` and `e` are xvalues
+    InputView r;
+    MoveOnlyInt e = 3;
+    std::ranges::join_with_view<View, Pattern> jwv(std::move(r), std::move(e));
+    assert(std::ranges::equal(jwv, std::array{1, 1, 3, 1, 1, 3, 1, 1}));
+  }
+
+  // Check explicitness
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, InputView, MoveOnlyInt>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, InputView, int>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, InputView&, int&>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, const InputView, const int>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, const InputView&, const int&>);
+}
+
+constexpr void test_ctor_with_non_view_and_element() {
+  // Check construction from `r` and `e`, when `r` does not model `std::ranges::view`
+
+  { // `r` and `e` are glvalues
+    InputRange r;
+    int e = 0;
+    std::ranges::join_with_view<View, Pattern> jwv(r, e);
+    assert(std::ranges::equal(jwv, std::array{2, 2, 0, 2, 2, 0, 2, 2}));
+  }
+
+  { // `r` and `e` are const glvalues
+    const InputRange r;
+    const int e = 1;
+    std::ranges::join_with_view<View, Pattern> jwv(r, e);
+    assert(std::ranges::equal(jwv, std::array{3, 3, 1, 3, 3, 1, 3, 3}));
+  }
+
+  { // `r` and `e` are prvalues
+    std::ranges::join_with_view<View, Pattern> jwv(InputRange{}, MoveOnlyInt{2});
+    assert(std::ranges::equal(jwv, std::array{4, 4, 2, 4, 4, 2, 4, 4}));
+  }
+
+  { // `r` and `e` are xvalues
+    InputRange r;
+    MoveOnlyInt e = 3;
+    std::ranges::join_with_view<View, Pattern> jwv(std::move(r), std::move(e));
+    assert(std::ranges::equal(jwv, std::array{4, 4, 3, 4, 4, 3, 4, 4}));
+  }
+
+  // Check explicitness
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, InputRange, MoveOnlyInt>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, InputRange, int>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, InputRange&, int&>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, const InputRange&, const int&>);
+}
+
+constexpr void test_constraints() {
+  { // `R` is not an input range
+    using R = OutputView;
+    static_assert(!std::ranges::input_range<R>);
+    static_assert(std::constructible_from<View, std::views::all_t<R>>);
+    static_assert(std::constructible_from<Pattern, std::ranges::single_view<int>>);
+    static_assert(!std::constructible_from<std::ranges::join_with_view<View, Pattern>, R, int>);
+  }
+
+  { // `V` is not constructible from `views::all_t<R>`
+    using R = test_range<cpp20_input_iterator>;
+    static_assert(std::ranges::input_range<R>);
+    static_assert(!std::constructible_from<View, std::views::all_t<R>>);
+    static_assert(std::constructible_from<Pattern, std::ranges::single_view<int>>);
+    static_assert(!std::constructible_from<std::ranges::join_with_view<View, Pattern>, R, int>);
+  }
+
+  { // `Pattern` is not constructible from `single_view<range_value_t<InnerRng>>`
+    using R   = InputView;
+    using Pat = test_view<forward_iterator>;
+    static_assert(std::ranges::input_range<R>);
+    static_assert(std::constructible_from<View, std::views::all_t<R>>);
+    static_assert(!std::constructible_from<Pat, std::ranges::single_view<int>>);
+    static_assert(!std::constructible_from<std::ranges::join_with_view<View, Pat>, R, int>);
+  }
+}
+
+constexpr bool test() {
+  test_ctor_with_view_and_element();
+  test_ctor_with_non_view_and_element();
+  test_constraints();
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.range.pattern.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.range.pattern.pass.cpp
new file mode 100644
index 000000000000..d3ee228ab0eb
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/ctor.range.pattern.pass.cpp
@@ -0,0 +1,111 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// constexpr explicit join_with_view(V base, Pattern pattern);
+
+#include <ranges>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <utility>
+
+#include "../types.h"
+
+class View : public std::ranges::view_base {
+  using OuterRange = std::array<std::array<int, 2>, 3>;
+
+  static constexpr OuterRange default_range = {{{1, 2}, {3, 4}, {5, 6}}};
+  static constexpr OuterRange range_on_move = {{{6, 5}, {4, 3}, {2, 1}}};
+
+  const OuterRange* r_ = &default_range;
+
+public:
+  View() = default;
+  constexpr View(const View&) : r_(&default_range) {}
+  constexpr View(View&&) : r_(&range_on_move) {}
+
+  constexpr View& operator=(View) {
+    r_ = &default_range;
+    return *this;
+  }
+
+  constexpr auto begin() { return r_->begin(); }
+  constexpr auto end() { return r_->end(); }
+};
+
+class Pattern : public std::ranges::view_base {
+  using PatternRange = std::array<int, 2>;
+
+  static constexpr PatternRange default_range = {0, 0};
+  static constexpr PatternRange range_on_move = {7, 7};
+
+  const PatternRange* val_ = &default_range;
+
+public:
+  Pattern() = default;
+  constexpr Pattern(const Pattern&) : val_(&default_range) {}
+  constexpr Pattern(Pattern&&) : val_(&range_on_move) {}
+
+  constexpr Pattern& operator=(Pattern) {
+    val_ = &default_range;
+    return *this;
+  }
+
+  constexpr auto begin() { return val_->begin(); }
+  constexpr auto end() { return val_->end(); }
+};
+
+constexpr bool test() {
+  {   // Check construction from `view` and `pattern`
+    { // `view` and `pattern` are glvalues
+      View v;
+      Pattern p;
+      std::ranges::join_with_view<View, Pattern> jwv(v, p);
+      assert(std::ranges::equal(jwv, std::array{6, 5, 7, 7, 4, 3, 7, 7, 2, 1}));
+    }
+
+    { // `view` and `pattern` are const glvalues
+      const View v;
+      const Pattern p;
+      std::ranges::join_with_view<View, Pattern> jwv(v, p);
+      assert(std::ranges::equal(jwv, std::array{6, 5, 7, 7, 4, 3, 7, 7, 2, 1}));
+    }
+
+    { // `view` and `pattern` are prvalues
+      std::ranges::join_with_view<View, Pattern> jwv(View{}, Pattern{});
+      assert(std::ranges::equal(jwv, std::array{6, 5, 7, 7, 4, 3, 7, 7, 2, 1}));
+    }
+
+    { // `view` and `pattern` are xvalues
+      View v;
+      Pattern p;
+      std::ranges::join_with_view<View, Pattern> jwv(std::move(v), std::move(p));
+      assert(std::ranges::equal(jwv, std::array{6, 5, 7, 7, 4, 3, 7, 7, 2, 1}));
+    }
+  }
+
+  // Check explicitness
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, View, Pattern>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, View&, Pattern&>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, const View, const Pattern>);
+  static_assert(ConstructionIsExplicit<std::ranges::join_with_view<View, Pattern>, const View&, const Pattern&>);
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/end.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/end.pass.cpp
new file mode 100644
index 000000000000..c6e973abffac
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/end.pass.cpp
@@ -0,0 +1,232 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// constexpr auto end();
+// constexpr auto end() const
+//   requires forward_range<const V> && forward_range<const Pattern> &&
+//            is_reference_v<range_reference_t<const V>> &&
+//            input_range<range_reference_t<const V>>;
+
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=10000000
+
+#include <ranges>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "../types.h"
+#include "test_iterators.h"
+
+template <class V, class Pattern>
+concept JoinWithViewHasConstEnd = requires(const std::ranges::join_with_view<V, Pattern> jwv) { jwv.end(); };
+
+template <size_t Bits>
+  requires(Bits < (1 << 7))
+constexpr void test_end() {
+  constexpr bool v_models_forward_range           = static_cast<bool>(Bits & (1 << 0));
+  constexpr bool inner_range_is_reference         = static_cast<bool>(Bits & (1 << 1));
+  constexpr bool inner_range_models_forward_range = static_cast<bool>(Bits & (1 << 2));
+  constexpr bool v_models_common_range            = static_cast<bool>(Bits & (1 << 3));
+  constexpr bool inner_range_models_common_range  = static_cast<bool>(Bits & (1 << 4));
+  constexpr bool v_models_simple_range            = static_cast<bool>(Bits & (1 << 5));
+  constexpr bool pattern_models_simple_range      = static_cast<bool>(Bits & (1 << 6));
+
+  constexpr ViewProperties inner_range_props{.common = inner_range_models_common_range};
+  using InnerRange =
+      std::conditional_t<inner_range_models_forward_range,
+                         BasicView<std::vector<int>, inner_range_props, forward_iterator>,
+                         BasicView<std::vector<int>, inner_range_props, DefaultCtorInputIter>>;
+
+  constexpr ViewProperties v_props{.simple = v_models_simple_range, .common = v_models_common_range};
+  using UnderlyingV = std::conditional_t<inner_range_is_reference, std::vector<InnerRange>, RvalueVector<InnerRange>>;
+  using V           = std::conditional_t<v_models_forward_range,
+                                         BasicView<UnderlyingV, v_props, forward_iterator>,
+                                         BasicView<UnderlyingV, v_props, DefaultCtorInputIter>>;
+
+  using UnderlyingPattern = std::vector<int>;
+  using Pattern = BasicView<UnderlyingPattern, ViewProperties{.simple = pattern_models_simple_range}, forward_iterator>;
+
+  using JWV  = std::ranges::join_with_view<V, Pattern>;
+  using Iter = std::ranges::iterator_t<JWV>;
+
+  // Test when `JWV` models common range
+  static_assert(std::same_as<Iter, std::ranges::sentinel_t<JWV>> ==
+                (v_models_forward_range && inner_range_is_reference && inner_range_models_forward_range &&
+                 v_models_common_range && inner_range_models_common_range));
+
+  { // `V` and `Pattern` are empty
+    V v{};
+    Pattern pattern{};
+    JWV jwv(std::move(v), std::move(pattern));
+    Iter it                                   = jwv.begin();
+    std::sentinel_for<Iter> decltype(auto) se = jwv.end();
+    assert(it == se);
+  }
+
+  { // `V` is empty, `Pattern` contains some elements
+    V v{};
+    Pattern pattern{std::vector<int>{0}};
+    JWV jwv(std::move(v), std::move(pattern));
+    Iter it                                   = jwv.begin();
+    std::sentinel_for<Iter> decltype(auto) se = jwv.end();
+    assert(it == se);
+  }
+
+  { // `V` is not empty, `Pattern is empty`
+    V v{UnderlyingV{
+        std::vector<InnerRange>{InnerRange(std::vector<int>{1, 2, 3}), InnerRange(std::vector<int>{4, 5, 6})}}};
+    Pattern pattern{};
+    JWV jwv(std::move(v), std::move(pattern));
+    Iter it                                   = jwv.begin();
+    std::sentinel_for<Iter> decltype(auto) se = jwv.end();
+    assert(std::ranges::next(it, 6) == se);
+  }
+
+  { // `V` and `Pattern` are not empty
+    V v{UnderlyingV{std::vector<InnerRange>{
+        InnerRange(std::vector<int>{6, 5}),
+        InnerRange(std::vector<int>{4, 3}),
+        InnerRange(std::vector<int>{2, 1, 0})}}};
+    Pattern pattern{std::vector<int>{-1, -1}};
+    JWV jwv(std::move(v), std::move(pattern));
+    Iter it                                   = jwv.begin();
+    std::sentinel_for<Iter> decltype(auto) se = jwv.end();
+    assert(std::ranges::next(it, 11) == se);
+  }
+}
+
+template <std::size_t Bits>
+  requires(Bits < (1 << 7))
+constexpr void test_const_end() {
+  constexpr bool const_v_models_forward_range           = static_cast<bool>(Bits & (1 << 0));
+  constexpr bool const_pattern_models_forward_range     = static_cast<bool>(Bits & (1 << 1));
+  constexpr bool inner_const_range_is_reference         = static_cast<bool>(Bits & (1 << 2));
+  constexpr bool inner_const_range_models_input_range   = static_cast<bool>(Bits & (1 << 3));
+  constexpr bool inner_const_range_models_forward_range = static_cast<bool>(Bits & (1 << 4));
+  constexpr bool const_v_models_common_range            = static_cast<bool>(Bits & (1 << 5));
+  constexpr bool inner_const_range_models_common_range  = static_cast<bool>(Bits & (1 << 6));
+
+  constexpr ViewProperties inner_range_props{.common = inner_const_range_models_common_range};
+  using InnerRange =
+      std::conditional_t<inner_const_range_models_forward_range,
+                         BasicView<std::vector<int>, inner_range_props, forward_iterator>,
+                         std::conditional_t<inner_const_range_models_input_range,
+                                            BasicView<std::vector<int>, inner_range_props, DefaultCtorInputIter>,
+                                            InputRangeButOutputWhenConst<int>>>;
+
+  constexpr ViewProperties v_props{.common = const_v_models_common_range};
+  using UnderlyingV =
+      std::conditional_t<inner_const_range_is_reference, std::vector<InnerRange>, RvalueVector<InnerRange>>;
+  using V = std::conditional_t<const_v_models_forward_range,
+                               BasicView<UnderlyingV, v_props, forward_iterator>,
+                               BasicView<UnderlyingV, v_props, DefaultCtorInputIter>>;
+  using Pattern =
+      std::conditional_t<const_pattern_models_forward_range,
+                         BasicView<std::vector<int>, ViewProperties{}, forward_iterator>,
+                         ForwardViewButInputWhenConst<int>>;
+
+  using JWV = std::ranges::join_with_view<V, Pattern>;
+  static_assert(JoinWithViewHasConstEnd<V, Pattern> ==
+                (const_v_models_forward_range && const_pattern_models_forward_range && inner_const_range_is_reference &&
+                 (inner_const_range_models_input_range || inner_const_range_models_forward_range)));
+  static_assert(JoinWithViewHasConstEnd<V, Pattern> == std::ranges::range<const JWV>);
+
+  if constexpr (std::ranges::range<const JWV>) {
+    using ConstIter = std::ranges::iterator_t<const JWV>;
+
+    // Test when `const JWV` models common range
+    static_assert(std::same_as<ConstIter, std::ranges::sentinel_t<const JWV>> ==
+                  (inner_const_range_models_forward_range && const_v_models_common_range &&
+                   inner_const_range_models_common_range));
+
+    { // `const V` and `const Pattern` are empty
+      V v{};
+      Pattern pattern{};
+      const JWV jwv(std::move(v), std::move(pattern));
+      ConstIter it                                   = jwv.begin();
+      std::sentinel_for<ConstIter> decltype(auto) se = jwv.end();
+      assert(it == se);
+    }
+
+    { // `const V` is empty, `const Pattern` contains some elements
+      V v{};
+      Pattern pattern{std::vector<int>{1}};
+      const JWV jwv(std::move(v), std::move(pattern));
+      ConstIter it                                   = jwv.begin();
+      std::sentinel_for<ConstIter> decltype(auto) se = jwv.end();
+      assert(it == se);
+    }
+
+    { // `const V` is not empty, `const Pattern is empty`
+      V v{UnderlyingV{
+          std::vector<InnerRange>{InnerRange(std::vector<int>{1, 2, 3}), InnerRange(std::vector<int>{4, 5, 6})}}};
+      Pattern pattern{};
+      const JWV jwv(std::move(v), std::move(pattern));
+      ConstIter it                                   = jwv.begin();
+      std::sentinel_for<ConstIter> decltype(auto) se = jwv.end();
+      assert(std::ranges::next(it, 6) == se);
+    }
+
+    { // `const V` and `const Pattern` are not empty
+      V v{UnderlyingV{std::vector<InnerRange>{
+          InnerRange(std::vector<int>{1}), InnerRange(std::vector<int>{2, 2}), InnerRange(std::vector<int>{3, 3, 3})}}};
+      Pattern pattern{std::vector<int>{0}};
+      const JWV jwv(std::move(v), std::move(pattern));
+      ConstIter it                                   = jwv.begin();
+      std::sentinel_for<ConstIter> decltype(auto) se = jwv.end();
+      assert(std::ranges::next(it, 8) == se);
+    }
+  }
+}
+
+constexpr bool test() {
+  []<std::size_t... Bits>(std::index_sequence<Bits...>) {
+    (test_end<Bits>(), ...);
+    (test_const_end<Bits>(), ...);
+  }(std::make_index_sequence<(1 << 7)>{});
+
+  { // Check situation when iterators returned by `end()` and `end() const` are of the same type
+    using V             = BasicView<std::vector<std::string>, ViewProperties{.simple = true}, forward_iterator>;
+    using Pattern       = BasicView<std::string, ViewProperties{.simple = true}, forward_iterator>;
+    using JWV           = std::ranges::join_with_view<V, Pattern>;
+    using Sentinel      = std::ranges::sentinel_t<JWV&>;
+    using ConstSentinel = std::ranges::sentinel_t<const JWV&>;
+    static_assert(std::input_iterator<Sentinel>);
+    static_assert(std::input_iterator<ConstSentinel>);
+    static_assert(std::same_as<Sentinel, ConstSentinel>);
+  }
+
+  { // Check situation when sentinels returned by `end()` and `end() const` are of the same type
+    using V = BasicView<std::vector<std::string>, ViewProperties{.simple = true, .common = false}, forward_iterator>;
+    using Pattern       = BasicView<std::string, ViewProperties{.simple = true}, forward_iterator>;
+    using JWV           = std::ranges::join_with_view<V, Pattern>;
+    using Sentinel      = std::ranges::sentinel_t<JWV&>;
+    using ConstSentinel = std::ranges::sentinel_t<const JWV&>;
+    static_assert(!std::input_iterator<Sentinel>);
+    static_assert(!std::input_iterator<ConstSentinel>);
+    static_assert(std::same_as<Sentinel, ConstSentinel>);
+  }
+
+  // Check LWG-4074: compatible-joinable-ranges is underconstrained
+  static_assert(!JoinWithViewHasConstEnd<BasicVectorView<int, ViewProperties{}, forward_iterator>,
+                                         lwg4074::PatternWithProxyConstAccess>);
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/inheritance.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/inheritance.compile.pass.cpp
new file mode 100644
index 000000000000..dff881766653
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/range.join.with.view/inheritance.compile.pass.cpp
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <ranges>
+
+// class join_with_view : public view_interface<join_with_view<V, Pattern>>
+
+#include <ranges>
+
+#include <concepts>
+#include <string>
+#include <vector>
+
+template <class T>
+struct View : std::ranges::view_base {
+  std::vector<T>* begin();
+  std::vector<T>* end();
+};
+
+template <class T>
+struct Pattern : std::ranges::view_base {
+  T* begin();
+  T* end();
+};
+
+template <class T>
+using JoinWithView = std::ranges::join_with_view<View<T>, Pattern<T>>;
+
+static_assert(std::derived_from<JoinWithView<int>, std::ranges::view_interface<JoinWithView<int>>>);
+static_assert(std::derived_from<JoinWithView<void*>, std::ranges::view_interface<JoinWithView<void*>>>);
+static_assert(std::derived_from<JoinWithView<std::string>, std::ranges::view_interface<JoinWithView<std::string>>>);
diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.with/types.h b/libcxx/test/std/ranges/range.adaptors/range.join.with/types.h
new file mode 100644
index 000000000000..588c647cef23
--- /dev/null
+++ b/libcxx/test/std/ranges/range.adaptors/range.join.with/types.h
@@ -0,0 +1,319 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_RANGES_RANGE_ADAPTORS_RANGE_JOIN_WITH_TYPES_H
+#define TEST_STD_RANGES_RANGE_ADAPTORS_RANGE_JOIN_WITH_TYPES_H
+
+#include <cstddef>
+#include <initializer_list>
+#include <ranges>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "test_range.h"
+
+template <class Tp>
+void pass_value(Tp);
+
+template <class Tp, class... Args>
+concept ConstructionIsExplicit = std::constructible_from<Tp, Args...> && !requires(Args&&... args) {
+  pass_value<Tp>({std::forward<Args>(args)...});
+};
+
+struct ViewProperties {
+  bool simple = false;
+  bool common = true;
+};
+
+template <std::ranges::input_range Data,
+          ViewProperties Prop,
+          template <class...> class It,
+          template <class...> class ConstIt = It>
+class BasicView : public std::ranges::view_base {
+  Data data_;
+
+public:
+  constexpr BasicView()
+    requires std::default_initializable<Data>
+  = default;
+
+  template <class R>
+  constexpr explicit BasicView(R&& r)
+    requires requires { std::ranges::to<Data>(std::forward<R>(r)); }
+      /*******/ : data_(std::ranges::to<Data>(std::forward<R>(r))) {}
+
+  constexpr explicit BasicView(std::initializer_list<std::ranges::range_value_t<Data>> il)
+      : data_(std::ranges::to<Data>(il)) {}
+
+  constexpr auto begin()
+    requires(!Prop.simple)
+  {
+    return It(data_.begin());
+  }
+
+  constexpr auto end()
+    requires(!Prop.simple)
+  {
+    if constexpr (Prop.common)
+      return It(data_.end());
+    else
+      return sentinel_wrapper(It(data_.end()));
+  }
+
+  constexpr auto begin() const { return ConstIt(data_.begin()); }
+
+  constexpr auto end() const {
+    if constexpr (Prop.common)
+      return ConstIt(data_.end());
+    else
+      return sentinel_wrapper(ConstIt(data_.end()));
+  }
+};
+
+template <class Tp, ViewProperties Prop, template <class...> class It, template <class...> class ConstIt = It>
+using BasicVectorView = BasicView<std::vector<Tp>, Prop, It, ConstIt>;
+
+struct AsPrvalue {
+  template <class Tp>
+  constexpr auto operator()(Tp&& t) const {
+    return std::forward<Tp>(t);
+  }
+};
+
+template <class Tp>
+class RvalueVector {
+  using Vec = std::vector<Tp>;
+  std::ranges::transform_view<std::ranges::owning_view<Vec>, AsPrvalue> range_;
+
+public:
+  constexpr RvalueVector() = default;
+  constexpr explicit RvalueVector(Vec vec) : range_(std::move(vec), AsPrvalue{}) {}
+  constexpr explicit RvalueVector(std::initializer_list<Tp> il) : RvalueVector(Vec(il)) {}
+
+  constexpr auto begin() { return range_.begin(); }
+  constexpr auto end() { return range_.end(); }
+  constexpr auto begin() const { return range_.begin(); }
+  constexpr auto end() const { return range_.end(); }
+};
+
+template <class It>
+class DefaultCtorInputIter {
+  It it_ = It();
+
+public:
+  using value_type      = std::iter_value_t<It>;
+  using difference_type = std::iter_difference_t<It>;
+
+  DefaultCtorInputIter() = default;
+  constexpr explicit DefaultCtorInputIter(It it) : it_(it) {}
+
+  constexpr DefaultCtorInputIter& operator++() {
+    ++it_;
+    return *this;
+  }
+
+  constexpr void operator++(int) { ++*this; }
+  constexpr decltype(auto) operator*() const { return *it_; }
+  constexpr bool operator==(const DefaultCtorInputIter&) const = default;
+};
+
+template <class It>
+DefaultCtorInputIter(It) -> DefaultCtorInputIter<It>;
+
+template <class Tp>
+class InputRangeButOutputWhenConst {
+  using Vec = std::vector<Tp>;
+  std::ranges::ref_view<Vec> range_;
+
+public:
+  constexpr explicit InputRangeButOutputWhenConst(Vec& vec) : range_(vec) {}
+
+  constexpr auto begin() { return cpp20_input_iterator(range_.begin()); }
+  constexpr auto end() { return sentinel_wrapper(cpp20_input_iterator(range_.end())); }
+  constexpr auto begin() const { return cpp20_output_iterator(range_.begin()); }
+  constexpr auto end() const { return sentinel_wrapper(cpp20_output_iterator(range_.end())); }
+};
+
+template <class Tp>
+using ForwardViewButInputWhenConst =
+    BasicVectorView<Tp, ViewProperties{.common = false}, forward_iterator, cpp20_input_iterator>;
+
+template <class It>
+class ForwardIteratorWithInputCategory {
+  It it_ = It();
+
+public:
+  using value_type        = std::iter_value_t<It>;
+  using difference_type   = std::iter_difference_t<It>;
+  using iterator_concept  = std::forward_iterator_tag;
+  using iterator_category = std::input_iterator_tag;
+
+  ForwardIteratorWithInputCategory() = default;
+  explicit ForwardIteratorWithInputCategory(It it);
+
+  std::iter_reference_t<It> operator*() const;
+  ForwardIteratorWithInputCategory& operator++();
+  ForwardIteratorWithInputCategory operator++(int);
+  bool operator==(const ForwardIteratorWithInputCategory&) const;
+};
+
+template <class It>
+explicit ForwardIteratorWithInputCategory(It) -> ForwardIteratorWithInputCategory<It>;
+
+template <class It>
+class EqComparableInputIter {
+  It it_;
+
+public:
+  using value_type      = std::iter_value_t<It>;
+  using difference_type = std::iter_difference_t<It>;
+
+  constexpr explicit EqComparableInputIter(It it) : it_(it) {}
+
+  constexpr decltype(auto) operator*() const { return *it_; }
+  constexpr EqComparableInputIter& operator++() {
+    ++it_;
+    return *this;
+  }
+  constexpr void operator++(int) { ++it_; }
+
+  friend constexpr It base(const EqComparableInputIter& i) { return i.it_; }
+
+  friend constexpr bool operator==(const EqComparableInputIter& left, const EqComparableInputIter& right) {
+    return left.it_ == right.it_;
+  }
+};
+
+template <class It>
+EqComparableInputIter(It) -> EqComparableInputIter<It>;
+
+template <class Val>
+struct ConstOppositeView : std::ranges::view_base {
+  const Val* begin();
+  sentinel_wrapper<const Val*> end();
+  Val* begin() const;
+  sentinel_wrapper<Val*> end() const;
+};
+
+namespace lwg4074 { // Helpers for LWG-4074 ("compatible-joinable-ranges is underconstrained")
+struct CommonReference;
+
+struct Value {
+  Value(int);
+};
+
+struct Reference {
+  operator Value() const;
+  operator CommonReference() const;
+};
+
+struct CommonReference {
+  CommonReference(int);
+};
+
+struct Iter {
+  using value_type      = Value;
+  using difference_type = std::ptrdiff_t;
+
+  Iter& operator++();
+  Iter operator++(int);
+  Reference operator*() const;
+  bool operator==(const Iter&) const;
+};
+
+struct PatternWithProxyConstAccess {
+  int* begin();
+  int* end();
+  Iter begin() const;
+  Iter end() const;
+};
+} // namespace lwg4074
+
+template <template <class> class TQual, template <class> class UQual>
+struct std::basic_common_reference<lwg4074::Reference, int, TQual, UQual> {
+  using type = lwg4074::CommonReference;
+};
+
+template <template <class> class TQual, template <class> class UQual>
+struct std::basic_common_reference<int, lwg4074::Reference, TQual, UQual> {
+  using type = lwg4074::CommonReference;
+};
+
+namespace selftest {
+using BV1 = BasicView<std::string, ViewProperties{.simple = true}, forward_iterator>;
+static_assert(std::ranges::forward_range<BV1>);
+static_assert(!std::ranges::bidirectional_range<BV1>);
+static_assert(std::ranges::common_range<BV1>);
+static_assert(simple_view<BV1>);
+
+using BV2 =
+    BasicView<RvalueVector<std::string>, ViewProperties{.simple = false, .common = false}, cpp20_input_iterator>;
+static_assert(std::ranges::input_range<BV2>);
+static_assert(!std::ranges::forward_range<BV2>);
+static_assert(!std::ranges::common_range<BV2>);
+static_assert(!std::is_reference_v<std::ranges::range_reference_t<BV2>>);
+static_assert(!simple_view<BV2>);
+
+using RV = RvalueVector<int>;
+static_assert(std::movable<RV>);
+static_assert(std::ranges::random_access_range<RV>);
+static_assert(std::ranges::random_access_range<const RV>);
+static_assert(!std::is_reference_v<std::ranges::range_reference_t<RV>>);
+static_assert(!std::is_reference_v<std::ranges::range_reference_t<const RV>>);
+
+using DCII = DefaultCtorInputIter<int*>;
+static_assert(std::default_initializable<DCII>);
+static_assert(std::sentinel_for<DCII, DCII>);
+static_assert(std::input_iterator<DCII>);
+static_assert(!std::forward_iterator<DCII>);
+
+using IRBOWC = InputRangeButOutputWhenConst<int>;
+static_assert(std::ranges::input_range<IRBOWC>);
+static_assert(std::ranges::output_range<const IRBOWC&, int>);
+
+using FVBIWC = ForwardViewButInputWhenConst<int>;
+static_assert(std::default_initializable<FVBIWC>);
+static_assert(std::ranges::view<FVBIWC>);
+static_assert(std::ranges::forward_range<FVBIWC>);
+static_assert(!std::ranges::common_range<FVBIWC>);
+static_assert(std::ranges::input_range<const FVBIWC&>);
+static_assert(!std::ranges::forward_range<const FVBIWC&>);
+static_assert(!std::ranges::common_range<const FVBIWC&>);
+
+using FIWIC = ForwardIteratorWithInputCategory<long*>;
+static_assert(std::forward_iterator<FIWIC>);
+static_assert(std::same_as<FIWIC::iterator_category, std::input_iterator_tag>);
+static_assert(std::same_as<FIWIC::iterator_category, std::iterator_traits<FIWIC>::iterator_category>);
+
+using ECII = EqComparableInputIter<int*>;
+static_assert(std::input_iterator<ECII>);
+static_assert(!std::forward_iterator<ECII>);
+static_assert(std::equality_comparable<ECII>);
+
+using COV = ConstOppositeView<int>;
+static_assert(std::ranges::view<COV>);
+static_assert(std::ranges::range<const COV>);
+static_assert(!std::ranges::common_range<COV>);
+static_assert(!std::ranges::common_range<const COV>);
+static_assert(std::convertible_to<std::ranges::iterator_t<const COV>, std::ranges::iterator_t<COV>>);
+static_assert(!std::convertible_to<std::ranges::iterator_t<COV>, std::ranges::iterator_t<const COV>>);
+
+static_assert(std::common_with<lwg4074::Value, int>);
+static_assert(std::common_with<lwg4074::Value, lwg4074::Reference>);
+static_assert(std::common_reference_with<lwg4074::Reference, int&>);
+static_assert(std::common_reference_with<lwg4074::Reference, lwg4074::CommonReference>);
+static_assert(std::forward_iterator<lwg4074::Iter>);
+static_assert(std::ranges::forward_range<lwg4074::PatternWithProxyConstAccess>);
+static_assert(std::ranges::forward_range<const lwg4074::PatternWithProxyConstAccess>);
+} // namespace selftest
+
+#endif // TEST_STD_RANGES_RANGE_ADAPTORS_RANGE_JOIN_WITH_TYPES_H
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 849a96b9585b..edd7b124a1fb 100644
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -1115,7 +1115,6 @@ feature_test_macros = [
             "name": "__cpp_lib_ranges_join_with",
             "values": {"c++23": 202202},
             "headers": ["ranges"],
-            "unimplemented": True,
         },
         {
             "name": "__cpp_lib_ranges_repeat",

From 369e8403b616a168cdebbde9813f4a64d33618b7 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Sat, 21 Jun 2025 09:55:19 +0000
Subject: [PATCH 1155/1322] [gn build] Port 1bb2328fd3ad

---
 llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 41516d677c45..7b9966d9bfc5 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1363,6 +1363,7 @@ if (current_toolchain == default_toolchain) {
       "__ranges/iota_view.h",
       "__ranges/istream_view.h",
       "__ranges/join_view.h",
+      "__ranges/join_with_view.h",
       "__ranges/lazy_split_view.h",
       "__ranges/movable_box.h",
       "__ranges/non_propagating_cache.h",

From 1b5d6ec6855369d109fcb740ecd3812231b7a279 Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002@gmail.com>
Date: Sat, 21 Jun 2025 13:14:19 +0300
Subject: [PATCH 1156/1322] [clang-tidy] count class member initializers as
 statements in 'readability-function-size' (#131669)

Improve `readability-function-size` by counting class member
initializers as statements.
Relates to https://github.com/llvm/llvm-project/issues/131126
---
 .../readability/FunctionSizeCheck.cpp         | 15 +++-
 .../readability/FunctionSizeCheck.h           |  2 +
 clang-tools-extra/docs/ReleaseNotes.rst       |  5 ++
 .../checks/readability/function-size.rst      |  5 ++
 .../function-size-no-member-init-as-stmts.cpp | 73 +++++++++++++++++++
 .../checkers/readability/function-size.cpp    | 57 +++++++++++++++
 6 files changed, 156 insertions(+), 1 deletion(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/function-size-no-member-init-as-stmts.cpp

diff --git a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp
index 3313bcb39b7f..8e3a2e306dbf 100644
--- a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.cpp
@@ -108,6 +108,14 @@ public:
     return true;
   }
 
+  bool TraverseConstructorInitializer(CXXCtorInitializer *Init) {
+    if (CountMemberInitAsStmt)
+      ++Info.Statements;
+
+    Base::TraverseConstructorInitializer(Init);
+    return true;
+  }
+
   struct FunctionInfo {
     unsigned Lines = 0;
     unsigned Statements = 0;
@@ -120,6 +128,7 @@ public:
   llvm::BitVector TrackedParent;
   unsigned StructNesting = 0;
   unsigned CurrentNestingLevel = 0;
+  bool CountMemberInitAsStmt;
 };
 
 } // namespace
@@ -135,7 +144,9 @@ FunctionSizeCheck::FunctionSizeCheck(StringRef Name, ClangTidyContext *Context)
       NestingThreshold(
           Options.get("NestingThreshold", DefaultNestingThreshold)),
       VariableThreshold(
-          Options.get("VariableThreshold", DefaultVariableThreshold)) {}
+          Options.get("VariableThreshold", DefaultVariableThreshold)),
+      CountMemberInitAsStmt(
+          Options.get("CountMemberInitAsStmt", DefaultCountMemberInitAsStmt)) {}
 
 void FunctionSizeCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
   Options.store(Opts, "LineThreshold", LineThreshold);
@@ -144,6 +155,7 @@ void FunctionSizeCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
   Options.store(Opts, "ParameterThreshold", ParameterThreshold);
   Options.store(Opts, "NestingThreshold", NestingThreshold);
   Options.store(Opts, "VariableThreshold", VariableThreshold);
+  Options.store(Opts, "CountMemberInitAsStmt", CountMemberInitAsStmt);
 }
 
 void FunctionSizeCheck::registerMatchers(MatchFinder *Finder) {
@@ -160,6 +172,7 @@ void FunctionSizeCheck::check(const MatchFinder::MatchResult &Result) {
 
   FunctionASTVisitor Visitor;
   Visitor.Info.NestingThreshold = NestingThreshold.value_or(-1);
+  Visitor.CountMemberInitAsStmt = CountMemberInitAsStmt;
   Visitor.TraverseDecl(const_cast<FunctionDecl *>(Func));
   auto &FI = Visitor.Info;
 
diff --git a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.h b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.h
index 106c69ff0739..f668ab18fea5 100644
--- a/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/FunctionSizeCheck.h
@@ -47,6 +47,7 @@ private:
   const std::optional<unsigned> ParameterThreshold;
   const std::optional<unsigned> NestingThreshold;
   const std::optional<unsigned> VariableThreshold;
+  const bool CountMemberInitAsStmt;
 
   static constexpr std::optional<unsigned> DefaultLineThreshold = std::nullopt;
   static constexpr std::optional<unsigned> DefaultStatementThreshold = 800U;
@@ -58,6 +59,7 @@ private:
       std::nullopt;
   static constexpr std::optional<unsigned> DefaultVariableThreshold =
       std::nullopt;
+  static constexpr bool DefaultCountMemberInitAsStmt = true;
 };
 
 } // namespace clang::tidy::readability
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 7c0c534dbc73..4801dab8c1bd 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -278,6 +278,11 @@ Changes in existing checks
   <clang-tidy/checks/readability/convert-member-functions-to-static>` check by
   fixing false positives on member functions with an explicit object parameter.
 
+- Improved :doc:`readability-function-size
+  <clang-tidy/checks/readability/function-size>` check by adding new option
+  `CountMemberInitAsStmt` that allows counting class member initializers in
+  constructors as statements.
+
 - Improved :doc:`readability-math-missing-parentheses
   <clang-tidy/checks/readability/math-missing-parentheses>` check by fixing
   false negatives where math expressions are the operand of assignment operators
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/function-size.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/function-size.rst
index 133bd3e9c8cb..253e7c483cb8 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/function-size.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/function-size.rst
@@ -43,3 +43,8 @@ Options
    The default is `none` (ignore the number of variables).
    Please note that function parameters and variables declared in lambdas,
    GNU Statement Expressions, and nested class inline functions are not counted.
+
+.. option:: CountMemberInitAsStmt
+
+   When `true`, count class member initializers in constructors as statements.
+   Default is `true`.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/function-size-no-member-init-as-stmts.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/function-size-no-member-init-as-stmts.cpp
new file mode 100644
index 000000000000..d335988e5e03
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/function-size-no-member-init-as-stmts.cpp
@@ -0,0 +1,73 @@
+// RUN: %check_clang_tidy %s readability-function-size %t -- \
+// RUN:     -config='{CheckOptions: { \
+// RUN:         readability-function-size.LineThreshold: 0, \
+// RUN:         readability-function-size.StatementThreshold: 0, \
+// RUN:         readability-function-size.BranchThreshold: 0, \
+// RUN:         readability-function-size.ParameterThreshold: 5, \
+// RUN:         readability-function-size.NestingThreshold: 2, \
+// RUN:         readability-function-size.VariableThreshold: 1, \
+// RUN:         readability-function-size.CountMemberInitAsStmt: false \
+// RUN:     }}'
+
+// Bad formatting is intentional, don't run clang-format over the whole file!
+
+void foo1() {
+}
+
+void foo2() {;}
+// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'foo2' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-2]]:6: note: 1 statements (threshold 0)
+
+struct A {
+  A(int c, int d) : a(0), b(c) { ; }
+  int a;
+  int b;
+};
+// CHECK-MESSAGES: :[[@LINE-4]]:3: warning: function 'A' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-5]]:3: note: 1 statements (threshold 0)
+
+struct B {
+  B(int x, int y, int z) : a(x + y * z), b(), c_a(y, z) {
+    ;
+  }
+  int a;
+  int b;
+  A c_a;
+};
+// CHECK-MESSAGES: :[[@LINE-7]]:3: warning: function 'B' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-8]]:3: note: 2 lines including whitespace and comments (threshold 0)
+// CHECK-MESSAGES: :[[@LINE-9]]:3: note: 1 statements (threshold 0)
+
+struct C : A, B {
+  // 0 statements
+  C() : A(0, 4), B(1, 2, 3) {}
+};
+
+template<typename T>
+struct TemplateC {
+  // 0 statements
+  TemplateC() : a(3) {}
+  T a;
+};
+
+template<typename T>
+struct TemplateD {
+  template<typename U>
+  TemplateD(U&& val) : member(val) { 
+    ;
+  }
+  
+  T member;
+};
+// CHECK-MESSAGES: :[[@LINE-6]]:3: warning: function 'TemplateD<T>' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-7]]:3: note: 2 lines including whitespace and comments (threshold 0)
+// CHECK-MESSAGES: :[[@LINE-8]]:3: note: 1 statements (threshold 0)
+
+void instantiate() {
+  TemplateC<int> c;
+  TemplateD<int> d(5);
+}
+// CHECK-MESSAGES: :[[@LINE-4]]:6: warning: function 'instantiate' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-5]]:6: note: 3 lines including whitespace and comments (threshold 0)
+// CHECK-MESSAGES: :[[@LINE-6]]:6: note: 2 statements (threshold 0)
+// CHECK-MESSAGES: :[[@LINE-7]]:6: note: 2 variables (threshold 1)
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/function-size.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/function-size.cpp
index 45b2604b43d0..9364fa3077da 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/function-size.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/function-size.cpp
@@ -319,3 +319,60 @@ void variables_16() {
 // CHECK-MESSAGES: :[[@LINE-5]]:6: note: 3 lines including whitespace and comments (threshold 0)
 // CHECK-MESSAGES: :[[@LINE-6]]:6: note: 4 statements (threshold 0)
 // CHECK-MESSAGES: :[[@LINE-7]]:6: note: 2 variables (threshold 1)
+
+struct A {
+  A(int c, int d) : a(0), b(c) { ; }
+  int a;
+  int b;
+};
+// CHECK-MESSAGES: :[[@LINE-4]]:3: warning: function 'A' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-5]]:3: note: 3 statements (threshold 0)
+
+struct B {
+  B(int x, int y, int z) : a(x + y * z), b(), c_a(y, z) {
+    ;
+  }
+  int a;
+  int b;
+  A c_a;
+};
+// CHECK-MESSAGES: :[[@LINE-7]]:3: warning: function 'B' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-8]]:3: note: 2 lines including whitespace and comments (threshold 0)
+// CHECK-MESSAGES: :[[@LINE-9]]:3: note: 4 statements (threshold 0)
+
+struct C : A, B {
+  C() : A(0, 4), B(1, 2, 3) {}
+};
+// CHECK-MESSAGES: :[[@LINE-2]]:3: warning: function 'C' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-3]]:3: note: 2 statements (threshold 0)
+
+template<typename T>
+struct TemplateC {
+  // 0 statements
+  TemplateC() : a(3) {}
+  T a;
+};
+// CHECK-MESSAGES: :[[@LINE-3]]:3: warning: function 'TemplateC<T>' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-4]]:3: note: 1 statements (threshold 0)
+
+template<typename T>
+struct TemplateD {
+  template<typename U>
+  TemplateD(U&& val) : member(val) { 
+    ;
+  }
+  
+  T member;
+};
+// CHECK-MESSAGES: :[[@LINE-6]]:3: warning: function 'TemplateD<T>' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-7]]:3: note: 2 lines including whitespace and comments (threshold 0)
+// CHECK-MESSAGES: :[[@LINE-8]]:3: note: 2 statements (threshold 0)
+
+void instantiate() {
+  TemplateC<int> c;
+  TemplateD<int> d(5);
+}
+// CHECK-MESSAGES: :[[@LINE-4]]:6: warning: function 'instantiate' exceeds recommended size/complexity thresholds [readability-function-size]
+// CHECK-MESSAGES: :[[@LINE-5]]:6: note: 3 lines including whitespace and comments (threshold 0)
+// CHECK-MESSAGES: :[[@LINE-6]]:6: note: 2 statements (threshold 0)
+// CHECK-MESSAGES: :[[@LINE-7]]:6: note: 2 variables (threshold 1)

From ea321392ebc487c1000e43576f44af99edf28a5f Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777@gmail.com>
Date: Sat, 21 Jun 2025 18:58:56 +0800
Subject: [PATCH 1157/1322] [C++][Modules] A module directive may only appear
 as the first preprocessing tokens in a file (#144233)

This PR is 2nd part of
[P1857R3](https://github.com/llvm/llvm-project/pull/107168)
implementation, and mainly implement the restriction `A module directive
may only appear as the first preprocessing tokens in a file (excluding
the global module fragment.)`:
[cpp.pre](https://eel.is/c++draft/cpp.pre):
```
module-file:
    pp-global-module-fragment[opt] pp-module group[opt] pp-private-module-fragment[opt]
```

We also refine tests use `split-file` instead of conditional macro.

Signed-off-by: yronglin <yronglin777@gmail.com>
---
 clang/include/clang/Lex/Lexer.h               |   3 +
 clang/include/clang/Lex/Preprocessor.h        |  17 ++
 clang/include/clang/Lex/Token.h               |  12 +-
 clang/include/clang/Sema/Sema.h               |   3 +-
 clang/lib/Lex/Lexer.cpp                       |  13 ++
 clang/lib/Lex/PPDirectives.cpp                |   3 +
 clang/lib/Lex/PPMacroExpansion.cpp            |   3 +
 clang/lib/Lex/Preprocessor.cpp                |   2 +
 clang/lib/Parse/Parser.cpp                    |   7 +-
 clang/lib/Sema/SemaModule.cpp                 |  15 +-
 clang/test/CXX/basic/basic.link/p1.cpp        | 149 +++++++++----
 clang/test/CXX/basic/basic.link/p2.cpp        |  26 +--
 .../basic.scope/basic.scope.namespace/p2.cpp  |  80 ++++---
 .../CXX/module/basic/basic.def.odr/p6.cppm    | 202 +++++++++++++-----
 .../basic/basic.link/module-declaration.cpp   |  62 +++---
 clang/test/CXX/module/cpp.pre/module_decl.cpp |   8 +
 .../dcl.module/dcl.module.import/p1.cppm      |  34 ++-
 .../dcl.module/dcl.module.interface/p1.cppm   |  47 ++--
 .../test/CXX/module/dcl.dcl/dcl.module/p1.cpp |  40 ++--
 .../test/CXX/module/dcl.dcl/dcl.module/p5.cpp |  69 ++++--
 clang/test/CXX/module/module.interface/p2.cpp |  26 ++-
 clang/test/CXX/module/module.unit/p8.cpp      |  46 ++--
 clang/test/Driver/modules.cpp                 |  31 +--
 clang/test/Modules/named-modules-adl-3.cppm   |   1 +
 clang/test/Modules/reserved-names-1.cppm      |  10 +
 .../reserved-names-system-header-1.cpp        |   1 +
 .../reserved-names-system-header-2.cpp        |   1 +
 clang/test/SemaCXX/modules.cppm               |  89 ++++----
 ...-aware-new-delete-transparent-contexts.cpp |  20 +-
 clang/unittests/Lex/LexerTest.cpp             |  47 +++-
 30 files changed, 738 insertions(+), 329 deletions(-)
 create mode 100644 clang/test/CXX/module/cpp.pre/module_decl.cpp

diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index bb65ae010cff..ca812ba1583f 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -143,6 +143,9 @@ class Lexer : public PreprocessorLexer {
   /// True if this is the first time we're lexing the input file.
   bool IsFirstTimeLexingFile;
 
+  /// True if current lexing token is the first pp-token.
+  bool IsFirstPPToken;
+
   // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n',
   // it also points to '\n.'
   const char *NewLinePtr;
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 78be2bd64d61..47830b428c8a 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -350,6 +350,9 @@ private:
   /// Whether the last token we lexed was an '@'.
   bool LastTokenWasAt = false;
 
+  /// First pp-token in current translation unit.
+  std::optional<Token> FirstPPToken;
+
   /// A position within a C++20 import-seq.
   class StdCXXImportSeq {
   public:
@@ -1766,6 +1769,20 @@ public:
   std::optional<LexEmbedParametersResult> LexEmbedParameters(Token &Current,
                                                              bool ForHasEmbed);
 
+  /// Whether the preprocessor already seen the first pp-token in main file.
+  bool hasSeenMainFileFirstPPToken() const { return FirstPPToken.has_value(); }
+
+  /// Record first pp-token and check if it has a Token::FirstPPToken flag.
+  void HandleMainFileFirstPPToken(const Token &Tok) {
+    if (!hasSeenMainFileFirstPPToken() && Tok.isFirstPPToken() &&
+        SourceMgr.isWrittenInMainFile(Tok.getLocation()))
+      FirstPPToken = Tok;
+  }
+
+  Token getMainFileFirstPPToken() const {
+    assert(FirstPPToken && "First main file pp-token doesn't exists");
+    return *FirstPPToken;
+  }
   bool LexAfterModuleImport(Token &Result);
   void CollectPpImportSuffix(SmallVectorImpl<Token> &Toks);
 
diff --git a/clang/include/clang/Lex/Token.h b/clang/include/clang/Lex/Token.h
index 4f29fb7d1141..d4dfd7b44d9a 100644
--- a/clang/include/clang/Lex/Token.h
+++ b/clang/include/clang/Lex/Token.h
@@ -86,9 +86,12 @@ public:
                                 // macro stringizing or charizing operator.
     CommaAfterElided = 0x200, // The comma following this token was elided (MS).
     IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
-    IsReinjected = 0x800, // A phase 4 token that was produced before and
-                          // re-added, e.g. via EnterTokenStream. Annotation
-                          // tokens are *not* reinjected.
+
+    IsReinjected = 0x800,  // A phase 4 token that was produced before and
+                           // re-added, e.g. via EnterTokenStream. Annotation
+                           // tokens are *not* reinjected.
+    FirstPPToken = 0x1000, // This token is the first pp token in the
+                           // translation unit.
   };
 
   tok::TokenKind getKind() const { return Kind; }
@@ -318,6 +321,9 @@ public:
   /// represented as characters between '<#' and '#>' in the source code. The
   /// lexer uses identifier tokens to represent placeholders.
   bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
+
+  /// Returns true if this token is the first pp-token.
+  bool isFirstPPToken() const { return getFlag(FirstPPToken); }
 };
 
 /// Information about the conditional stack (\#if directives)
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 29452bb37260..9397546c8fc5 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -9822,7 +9822,8 @@ public:
   DeclGroupPtrTy ActOnModuleDecl(SourceLocation StartLoc,
                                  SourceLocation ModuleLoc, ModuleDeclKind MDK,
                                  ModuleIdPath Path, ModuleIdPath Partition,
-                                 ModuleImportState &ImportState);
+                                 ModuleImportState &ImportState,
+                                 bool IntroducerIsFirstPPToken);
 
   /// The parser has processed a global-module-fragment declaration that begins
   /// the definition of the global module fragment of the current module unit.
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 93200458f04b..b61ea3b1614c 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -174,6 +174,8 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
   ExtendedTokenMode = 0;
 
   NewLinePtr = nullptr;
+
+  IsFirstPPToken = true;
 }
 
 /// Lexer constructor - Create a new lexer object for the specified buffer
@@ -3725,6 +3727,11 @@ bool Lexer::Lex(Token &Result) {
     HasLeadingEmptyMacro = false;
   }
 
+  if (IsFirstPPToken) {
+    Result.setFlag(Token::FirstPPToken);
+    IsFirstPPToken = false;
+  }
+
   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
   IsAtPhysicalStartOfLine = false;
   bool isRawLex = isLexingRawMode();
@@ -3732,6 +3739,10 @@ bool Lexer::Lex(Token &Result) {
   bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
   // (After the LexTokenInternal call, the lexer might be destroyed.)
   assert((returnedToken || !isRawLex) && "Raw lex must succeed");
+
+  if (returnedToken && Result.isFirstPPToken() && PP &&
+      !PP->hasSeenMainFileFirstPPToken())
+    PP->HandleMainFileFirstPPToken(Result);
   return returnedToken;
 }
 
@@ -4535,6 +4546,8 @@ const char *Lexer::convertDependencyDirectiveToken(
   Result.setFlag((Token::TokenFlags)DDTok.Flags);
   Result.setLength(DDTok.Length);
   BufferPtr = TokPtr + DDTok.Length;
+  if (PP && !PP->hasSeenMainFileFirstPPToken() && Result.isFirstPPToken())
+    PP->HandleMainFileFirstPPToken(Result);
   return TokPtr;
 }
 
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index 04a30f66fb73..70934b9b1dec 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -1242,6 +1242,9 @@ void Preprocessor::HandleDirective(Token &Result) {
   // pp-directive.
   bool ReadAnyTokensBeforeDirective =CurPPLexer->MIOpt.getHasReadAnyTokensVal();
 
+  if (!hasSeenMainFileFirstPPToken())
+    HandleMainFileFirstPPToken(Result);
+
   // Save the '#' token in case we need to return it later.
   Token SavedHash = Result;
 
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 37ac1bf07e9c..97bdeb873d69 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -469,6 +469,9 @@ bool Preprocessor::HandleMacroExpandedIdentifier(Token &Identifier,
   // to disable the optimization in this case.
   if (CurPPLexer) CurPPLexer->MIOpt.ExpandedMacro();
 
+  if (!hasSeenMainFileFirstPPToken())
+    HandleMainFileFirstPPToken(Identifier);
+
   // If this is a builtin macro, like __LINE__ or _Pragma, handle it specially.
   if (MI->isBuiltinMacro()) {
     if (Callbacks)
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 21fc7a2b6fae..18b2f5f02d6c 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -247,6 +247,8 @@ void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
     llvm::errs() << " [LeadingSpace]";
   if (Tok.isExpandDisabled())
     llvm::errs() << " [ExpandDisabled]";
+  if (Tok.isFirstPPToken())
+    llvm::errs() << " [First pp-token]";
   if (Tok.needsCleaning()) {
     const char *Start = SourceMgr.getCharacterData(Tok.getLocation());
     llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength())
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index 788ed79e0c1f..18f399aca59e 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -2340,7 +2340,8 @@ void Parser::ParseMicrosoftIfExistsExternalDeclaration() {
 
 Parser::DeclGroupPtrTy
 Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) {
-  SourceLocation StartLoc = Tok.getLocation();
+  Token Introducer = Tok;
+  SourceLocation StartLoc = Introducer.getLocation();
 
   Sema::ModuleDeclKind MDK = TryConsumeToken(tok::kw_export)
                                  ? Sema::ModuleDeclKind::Interface
@@ -2359,7 +2360,7 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) {
   // Parse a global-module-fragment, if present.
   if (getLangOpts().CPlusPlusModules && Tok.is(tok::semi)) {
     SourceLocation SemiLoc = ConsumeToken();
-    if (ImportState != Sema::ModuleImportState::FirstDecl) {
+    if (!Introducer.isFirstPPToken()) {
       Diag(StartLoc, diag::err_global_module_introducer_not_at_start)
         << SourceRange(StartLoc, SemiLoc);
       return nullptr;
@@ -2416,7 +2417,7 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) {
   ExpectAndConsumeSemi(diag::err_module_expected_semi);
 
   return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, Partition,
-                                 ImportState);
+                                 ImportState, Introducer.isFirstPPToken());
 }
 
 Decl *Parser::ParseModuleImport(SourceLocation AtLoc,
diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index 54ee0486763b..fe70ce3fba6a 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -263,11 +263,11 @@ static bool DiagReservedModuleName(Sema &S, const IdentifierInfo *II,
 Sema::DeclGroupPtrTy
 Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc,
                       ModuleDeclKind MDK, ModuleIdPath Path,
-                      ModuleIdPath Partition, ModuleImportState &ImportState) {
+                      ModuleIdPath Partition, ModuleImportState &ImportState,
+                      bool IntroducerIsFirstPPToken) {
   assert(getLangOpts().CPlusPlusModules &&
          "should only have module decl in standard C++ modules");
 
-  bool IsFirstDecl = ImportState == ModuleImportState::FirstDecl;
   bool SeenGMF = ImportState == ModuleImportState::GlobalFragment;
   // If any of the steps here fail, we count that as invalidating C++20
   // module state;
@@ -333,14 +333,11 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc,
           SeenGMF == (bool)this->TheGlobalModuleFragment) &&
          "mismatched global module state");
 
-  // In C++20, the module-declaration must be the first declaration if there
-  // is no global module fragment.
-  if (getLangOpts().CPlusPlusModules && !IsFirstDecl && !SeenGMF) {
+  // In C++20, A module directive may only appear as the first preprocessing
+  // tokens in a file (excluding the global module fragment.).
+  if (getLangOpts().CPlusPlusModules && !IntroducerIsFirstPPToken && !SeenGMF) {
     Diag(ModuleLoc, diag::err_module_decl_not_at_start);
-    SourceLocation BeginLoc =
-        ModuleScopes.empty()
-            ? SourceMgr.getLocForStartOfFile(SourceMgr.getMainFileID())
-            : ModuleScopes.back().BeginLoc;
+    SourceLocation BeginLoc = PP.getMainFileFirstPPToken().getLocation();
     if (BeginLoc.isValid()) {
       Diag(BeginLoc, diag::note_global_module_introducer_missing)
           << FixItHint::CreateInsertion(BeginLoc, "module;\n");
diff --git a/clang/test/CXX/basic/basic.link/p1.cpp b/clang/test/CXX/basic/basic.link/p1.cpp
index c6a119aa7f47..26a5f025f42f 100644
--- a/clang/test/CXX/basic/basic.link/p1.cpp
+++ b/clang/test/CXX/basic/basic.link/p1.cpp
@@ -1,57 +1,128 @@
-// RUN: %clang_cc1 -std=c++2a -verify %s
-// RUN: %clang_cc1 -std=c++2a -verify -DNO_GLOBAL_FRAG %s
-// RUN: %clang_cc1 -std=c++2a -verify -DNO_MODULE_DECL %s
-// RUN: %clang_cc1 -std=c++2a -verify -DNO_PRIVATE_FRAG %s
-// RUN: %clang_cc1 -std=c++2a -verify -DNO_MODULE_DECL -DNO_PRIVATE_FRAG %s
-// RUN: %clang_cc1 -std=c++2a -verify -DNO_GLOBAL_FRAG -DNO_PRIVATE_FRAG %s
-// RUN: %clang_cc1 -std=c++2a -verify -DNO_GLOBAL_FRAG -DNO_MODULE_DECL %s
-// RUN: %clang_cc1 -std=c++2a -verify -DNO_GLOBAL_FRAG -DNO_MODULE_DECL -DNO_PRIVATE_FRAG %s
-// RUN: %clang_cc1 -std=c++2a -verify -DEXPORT_FRAGS %s
+// RUN: rm -rf %t
+// RUN: split-file %s %t
 
-#ifndef NO_GLOBAL_FRAG
-#ifdef EXPORT_FRAGS
-export // expected-error {{global module fragment cannot be exported}}
-#endif
+// RUN: %clang_cc1 -std=c++2a -verify %t/M.cppm
+// RUN: %clang_cc1 -std=c++2a -verify %t/NoGlobalFrag.cppm
+// RUN: %clang_cc1 -std=c++2a -verify %t/NoModuleDecl.cppm
+// RUN: %clang_cc1 -std=c++2a -verify %t/NoPrivateFrag.cppm
+// RUN: %clang_cc1 -std=c++2a -verify %t/NoModuleDeclAndNoPrivateFrag.cppm
+// RUN: %clang_cc1 -std=c++2a -verify %t/NoGlobalFragAndNoPrivateFrag.cppm
+// RUN: %clang_cc1 -std=c++2a -verify %t/NoGlobalFragAndNoModuleDecl.cppm
+// RUN: %clang_cc1 -std=c++2a -verify %t/NoGlobalFragAndNoModuleDeclAndNoPrivateFrag.cppm
+// RUN: %clang_cc1 -std=c++2a -verify %t/ExportFrags.cppm
+
+//--- M.cppm
 module;
-#ifdef NO_MODULE_DECL
-// expected-error@-2 {{missing 'module' declaration at end of global module fragment introduced here}}
-#endif
-#endif
+extern int a; // #a1
+export module Foo;
+
+int a; // expected-error {{declaration of 'a' in module Foo follows declaration in the global module}}
+       // expected-note@#a1 {{previous decl}}
+extern int b;
+
+module; // expected-error {{'module;' introducing a global module fragment can appear only at the start of the translation unit}}
+module :private; // #priv-frag
+int b; // ok
+module :private; // expected-error {{private module fragment redefined}}
+                 // expected-note@#priv-frag {{previous definition is here}}
+
+//--- NoGlobalFrag.cppm
 
 extern int a; // #a1
-
-#ifndef NO_MODULE_DECL
-export module Foo;
-#ifdef NO_GLOBAL_FRAG
-// expected-error@-2 {{module declaration must occur at the start of the translation unit}}
-// expected-note@1 {{add 'module;' to the start of the file to introduce a global module fragment}}
-#endif
+export module Foo; // expected-error {{module declaration must occur at the start of the translation unit}}
+                   // expected-note@-2 {{add 'module;' to the start of the file to introduce a global module fragment}}
+
+// expected-error@#a2 {{declaration of 'a' in module Foo follows declaration in the global module}}
+// expected-note@#a1 {{previous decl}}
+
+int a; // #a2
+extern int b;
+module; // expected-error {{'module;' introducing a global module fragment can appear only at the start of the translation unit}}
+module :private; // #priv-frag
+int b; // ok
+module :private; // expected-error {{private module fragment redefined}}
+// expected-note@#priv-frag {{previous definition is here}}
+
+//--- NoModuleDecl.cppm
+module; // expected-error {{missing 'module' declaration at end of global module fragment introduced here}}
+extern int a; // #a1
+int a; // #a2
+extern int b;
+module; // expected-error {{'module;' introducing a global module fragment can appear only at the start of the translation unit}}
+module :private; // expected-error {{private module fragment declaration with no preceding module declaration}}
+int b; // ok
+
+//--- NoPrivateFrag.cppm
+module;
+extern int a; // #a1
+export module Foo;
+
+// expected-error@#a2 {{declaration of 'a' in module Foo follows declaration in the global module}}
+// expected-note@#a1 {{previous decl}}
+int a; // #a2
+extern int b;
+
+module; // expected-error {{'module;' introducing a global module fragment can appear only at the start of the translation unit}}
+int b; // ok
+
+
+//--- NoModuleDeclAndNoPrivateFrag.cppm
+module; // expected-error {{missing 'module' declaration at end of global module fragment introduced here}}
+extern int a; // #a1
+int a; // #a2
+extern int b;
+
+module; // expected-error {{'module;' introducing a global module fragment can appear only at the start of the translation unit}}
+
+int b; // ok
+
+//--- NoGlobalFragAndNoPrivateFrag.cppm
+extern int a; // #a1
+export module Foo; // expected-error {{module declaration must occur at the start of the translation unit}}
+// expected-note@1 {{add 'module;' to the start of the file to introduce a global module fragment}}
 
 // expected-error@#a2 {{declaration of 'a' in module Foo follows declaration in the global module}}
 // expected-note@#a1 {{previous decl}}
-#endif
 
 int a; // #a2
 extern int b;
 
 module; // expected-error {{'module;' introducing a global module fragment can appear only at the start of the translation unit}}
 
-#ifndef NO_PRIVATE_FRAG
-#ifdef EXPORT_FRAGS
-export // expected-error {{private module fragment cannot be exported}}
-#endif
-module :private; // #priv-frag
-#ifdef NO_MODULE_DECL
-// expected-error@-2 {{private module fragment declaration with no preceding module declaration}}
-#endif
-#endif
+int b; // ok
 
+//--- NoGlobalFragAndNoModuleDecl.cppm
+extern int a; // #a1
+int a; // #a2
+extern int b;
+module; // expected-error {{'module;' introducing a global module fragment can appear only at the start of the translation unit}}
+module :private; // #priv-frag
+// expected-error@-1 {{private module fragment declaration with no preceding module declaration}}
 int b; // ok
 
 
-#ifndef NO_PRIVATE_FRAG
-#ifndef NO_MODULE_DECL
+//--- NoGlobalFragAndNoModuleDeclAndNoPrivateFrag.cppm
+extern int a; // #a1
+int a; // #a2
+extern int b;
+
+module; // expected-error {{'module;' introducing a global module fragment can appear only at the start of the translation unit}}
+int b; // ok
+
+//--- ExportFrags.cppm
+export module; // expected-error {{global module fragment cannot be exported}}
+extern int a; // #a1
+export module Foo;
+// expected-error@#a2 {{declaration of 'a' in module Foo follows declaration in the global module}}
+// expected-note@#a1 {{previous decl}}
+
+int a; // #a2
+extern int b;
+
+module; // expected-error {{'module;' introducing a global module fragment can appear only at the start of the translation unit}}
+
+module :private; // #priv-frag
+
+int b; // ok
 module :private; // expected-error {{private module fragment redefined}}
-// expected-note@#priv-frag {{previous definition is here}}
-#endif
-#endif
+                 // expected-note@#priv-frag {{previous definition is here}}
diff --git a/clang/test/CXX/basic/basic.link/p2.cpp b/clang/test/CXX/basic/basic.link/p2.cpp
index ccad42022ee8..94cbc62490b2 100644
--- a/clang/test/CXX/basic/basic.link/p2.cpp
+++ b/clang/test/CXX/basic/basic.link/p2.cpp
@@ -1,16 +1,16 @@
-// RUN: %clang_cc1 -std=c++2a -DEXPORT %s -verify
-// RUN: %clang_cc1 -std=c++2a -DEXPORT %s -emit-module-interface -o %t.pcm
-// RUN: %clang_cc1 -std=c++2a -UEXPORT %s -verify -fmodule-file=M=%t.pcm
+// RUN: rm -rf %t
+// RUN: split-file %s %t
 
-#ifdef EXPORT
+// RUN: %clang_cc1 -std=c++2a %t/pmf_in_interface.cpp -verify
+// RUN: %clang_cc1 -std=c++2a %t/pmf_in_interface.cpp -emit-module-interface -o %t.pcm
+// RUN: %clang_cc1 -std=c++2a %t/pmf_in_implementation.cpp -verify -fmodule-file=M=%t.pcm
+
+
+//--- pmf_in_interface.cpp
 // expected-no-diagnostics
-export
-#else
-// expected-note@+2 {{add 'export' here}}
-#endif
-module M;
-
-#ifndef EXPORT
-// expected-error@+2 {{private module fragment in module implementation unit}}
-#endif
+export module M;
 module :private;
+
+//--- pmf_in_implementation.cpp
+module M; // expected-note {{add 'export' here}}
+module :private; // expected-error {{private module fragment in module implementation unit}}
diff --git a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp
index d70eb7de22c6..fd0038b3f774 100644
--- a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp
+++ b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp
@@ -1,14 +1,16 @@
 // RUN: rm -rf %t
-// RUN: mkdir -p %t
-// RUN: echo '#ifndef FOO_H' > %t/foo.h
-// RUN: echo '#define FOO_H' >> %t/foo.h
-// RUN: echo 'extern int in_header;' >> %t/foo.h
-// RUN: echo '#endif' >> %t/foo.h
-// RUN: %clang_cc1 -std=c++2a -I%t -emit-module-interface -DINTERFACE %s -o %t.pcm
-// RUN: %clang_cc1 -std=c++2a -I%t -fmodule-file=A=%t.pcm -DIMPLEMENTATION %s -verify -fno-modules-error-recovery
-// RUN: %clang_cc1 -std=c++2a -I%t -fmodule-file=A=%t.pcm %s -verify -fno-modules-error-recovery
+// RUN: split-file %s %t
+// RUN: %clang_cc1 -std=c++2a -I%t -emit-module-interface %t/interface.cppm -o %t.pcm
+// RUN: %clang_cc1 -std=c++2a -I%t -fmodule-file=A=%t.pcm %t/implA.cppm -verify -fno-modules-error-recovery
+// RUN: %clang_cc1 -std=c++2a -I%t -fmodule-file=A=%t.pcm %t/implB.cppm -verify -fno-modules-error-recovery
 
-#ifdef INTERFACE
+//--- foo.h
+#ifndef FOO_H
+#define FOO_H
+extern int in_header;
+#endif
+
+//--- interface.cppm
 module;
 #include "foo.h"
 // FIXME: The following need to be moved to a header file. The global module
@@ -22,11 +24,9 @@ static int internal;
 module :private;
 int not_exported_private;
 static int internal_private;
-#else
 
-#ifdef IMPLEMENTATION
+//--- implA.cppm
 module;
-#endif
 
 void test_early() {
   in_header = 1; // expected-error {{use of undeclared identifier 'in_header'}}
@@ -46,11 +46,7 @@ void test_early() {
   internal_private = 1; // expected-error {{undeclared identifier}}
 }
 
-#ifdef IMPLEMENTATION
 module A;
-#else
-import A;
-#endif
 
 void test_late() {
   in_header = 1; // expected-error {{missing '#include "foo.h"'; 'in_header' must be declared before it is used}}
@@ -61,20 +57,54 @@ void test_late() {
   exported = 1;
 
   not_exported = 1;
-#ifndef IMPLEMENTATION
-  // expected-error@-2 {{use of undeclared identifier 'not_exported'; did you mean 'exported'?}}
-  // expected-note@p2.cpp:18 {{'exported' declared here}}
-#endif
 
   internal = 1; // expected-error {{use of undeclared identifier 'internal'}}
 
   not_exported_private = 1;
-#ifndef IMPLEMENTATION
-  // FIXME: should not be visible here
-  // expected-error@-3 {{undeclared identifier}}
-#endif
 
   internal_private = 1; // expected-error {{use of undeclared identifier 'internal_private'}}
 }
 
-#endif
+//--- implB.cppm
+module;
+
+void test_early() {
+  in_header = 1; // expected-error {{use of undeclared identifier 'in_header'}}
+  // expected-note@* {{not visible}}
+
+  global_module_fragment = 1; // expected-error {{use of undeclared identifier 'global_module_fragment'}}
+
+  exported = 1; // expected-error {{use of undeclared identifier 'exported'}}
+
+  not_exported = 1; // expected-error {{use of undeclared identifier 'not_exported'}}
+
+  // FIXME: We need better diagnostic message for static variable.
+  internal = 1; // expected-error {{use of undeclared identifier 'internal'}}
+
+  not_exported_private = 1; // expected-error {{undeclared identifier}}
+
+  internal_private = 1; // expected-error {{undeclared identifier}}
+}
+
+export module B;
+import A;
+
+void test_late() {
+  in_header = 1; // expected-error {{missing '#include "foo.h"'; 'in_header' must be declared before it is used}}
+  // expected-note@* {{not visible}}
+
+  global_module_fragment = 1; // expected-error {{missing '#include'; 'global_module_fragment' must be declared before it is used}}
+
+  exported = 1;
+
+  not_exported = 1; // expected-error {{use of undeclared identifier 'not_exported'; did you mean 'exported'?}}
+  // expected-note@* {{'exported' declared here}}
+
+  internal = 1; // expected-error {{use of undeclared identifier 'internal'}}
+
+  not_exported_private = 1;
+  // FIXME: should not be visible here
+  // expected-error@-2 {{undeclared identifier}}
+
+  internal_private = 1; // expected-error {{use of undeclared identifier 'internal_private'}}
+}
\ No newline at end of file
diff --git a/clang/test/CXX/module/basic/basic.def.odr/p6.cppm b/clang/test/CXX/module/basic/basic.def.odr/p6.cppm
index 8e7917dc63ea..c532e7ad40a1 100644
--- a/clang/test/CXX/module/basic/basic.def.odr/p6.cppm
+++ b/clang/test/CXX/module/basic/basic.def.odr/p6.cppm
@@ -3,29 +3,28 @@
 // RUN: split-file %s %t
 //
 // RUN: %clang_cc1 -std=c++20 -verify %t/global-vs-module.cppm 
-// RUN: %clang_cc1 -std=c++20 -verify %t/global-vs-module.cppm -DEXPORT
-// RUN: %clang_cc1 -std=c++20 -verify %t/global-vs-module.cppm -DUSING
+// RUN: %clang_cc1 -std=c++20 -verify %t/global-vs-module-export.cppm
+// RUN: %clang_cc1 -std=c++20 -verify %t/global-vs-module-using.cppm
 //
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/global-vs-module.cppm -o %t/M.pcm -DNO_GLOBAL -DEXPORT
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/M.cppm -o %t/M.pcm
 // RUN: %clang_cc1 -std=c++20 -verify %t/module-vs-global.cpp -fmodule-file=M=%t/M.pcm
 //
 // Some of the following tests intentionally have no -verify in their RUN
 // lines; we are testing that those cases do not produce errors.
 //
-// RUN: %clang_cc1 -std=c++20 %t/module-vs-module.cpp -fmodule-file=M=%t/M.pcm -DMODULE_INTERFACE -verify
-// RUN: %clang_cc1 -std=c++20 %t/module-vs-module.cpp -fmodule-file=M=%t/M.pcm -DMODULE_INTERFACE -DNO_IMPORT
+// RUN: %clang_cc1 -std=c++20 %t/module-vs-module-interface.cpp -fmodule-file=M=%t/M.pcm -verify
+// RUN: %clang_cc1 -std=c++20 %t/module-vs-module-interface.cpp -fmodule-file=M=%t/M.pcm -DNO_IMPORT
 //
-// RUN: %clang_cc1 -std=c++20 %t/module-vs-module.cpp -fmodule-file=M=%t/M.pcm -emit-module-interface -o %t/N.pcm -DMODULE_INTERFACE -DNO_ERRORS
-// RUN: %clang_cc1 -std=c++20 %t/module-vs-module.cpp -fmodule-file=M=%t/M.pcm -fmodule-file=N=%t/N.pcm -verify
+// RUN: %clang_cc1 -std=c++20 %t/module-vs-module-interface.cpp -fmodule-file=M=%t/M.pcm -emit-module-interface -o %t/N.pcm -DNO_ERRORS
+// RUN: %clang_cc1 -std=c++20 %t/module-vs-module-impl.cpp -fmodule-file=M=%t/M.pcm -fmodule-file=N=%t/N.pcm -verify
 //
-// RUN: %clang_cc1 -std=c++20 %t/module-vs-module.cpp -fmodule-file=M=%t/M.pcm -fmodule-file=N=%t/N.pcm -DNO_IMPORT -verify
+// RUN: %clang_cc1 -std=c++20 %t/module-vs-module-impl.cpp -fmodule-file=M=%t/M.pcm -fmodule-file=N=%t/N.pcm -DNO_IMPORT -verify
 //
-// RUN: %clang_cc1 -std=c++20 %t/module-vs-module.cpp -fmodule-file=M=%t/M.pcm -emit-module-interface -o %t/N-no-M.pcm -DMODULE_INTERFACE -DNO_ERRORS -DNO_IMPORT
-// RUN: %clang_cc1 -std=c++20 %t/module-vs-module.cpp -fmodule-file=M=%t/M.pcm -fmodule-file=N=%t/N-no-M.pcm -verify
-// RUN: %clang_cc1 -std=c++20 %t/module-vs-module.cpp -fmodule-file=N=%t/N-no-M.pcm -DNO_IMPORT
+// RUN: %clang_cc1 -std=c++20 %t/module-vs-module-interface.cpp -fmodule-file=M=%t/M.pcm -emit-module-interface -o %t/N-no-M.pcm -DNO_ERRORS -DNO_IMPORT
+// RUN: %clang_cc1 -std=c++20 %t/module-vs-module-impl.cpp -fmodule-file=M=%t/M.pcm -fmodule-file=N=%t/N-no-M.pcm -verify
+// RUN: %clang_cc1 -std=c++20 %t/module-vs-module-impl.cpp -fmodule-file=N=%t/N-no-M.pcm -DNO_IMPORT
 
 //--- global-vs-module.cppm
-#ifndef NO_GLOBAL
 module;
 extern int var; // expected-note {{previous declaration is here}}
 int func(); // expected-note {{previous declaration is here}}
@@ -40,25 +39,9 @@ template<typename> using type_tpl = int; // expected-note {{previous declaration
 typedef int type;
 namespace ns { using ::func; }
 namespace ns_alias = ns;
-#endif
 
 export module M;
 
-#ifdef USING
-using ::var;
-using ::func;
-using ::str;
-using ::type;
-using ::var_tpl;
-using ::func_tpl;
-using ::str_tpl;
-using ::type_tpl;
-#endif
-
-#ifdef EXPORT
-export {
-#endif
-
 extern int var; // expected-error {{declaration of 'var' in module M follows declaration in the global module}}
 int func(); // expected-error {{declaration of 'func' in module M follows declaration in the global module}}
 struct str; // expected-error {{declaration of 'str' in module M follows declaration in the global module}}
@@ -73,51 +56,162 @@ typedef int type;
 namespace ns { using ::func; }
 namespace ns_alias = ns;
 
-#ifdef EXPORT
-}
-#endif
-
-//--- module-vs-global.cpp
-import M;
-
-extern int var; // expected-error {{declaration of 'var' in the global module follows declaration in module M}} expected-note@global-vs-module.cppm:35 {{previous}}
-int func(); // expected-error {{declaration of 'func' in the global module follows declaration in module M}} expected-note@global-vs-module.cppm:36 {{previous}}
-struct str; // expected-error {{declaration of 'str' in the global module follows declaration in module M}} expected-note@global-vs-module.cppm:37 {{previous}}
+//--- global-vs-module-export.cppm
+module;
+extern int var; // expected-note {{previous declaration is here}}
+int func(); // expected-note {{previous declaration is here}}
+struct str; // expected-note {{previous declaration is here}}
 using type = int;
 
-template<typename> extern int var_tpl; // expected-error {{declaration of 'var_tpl' in the global module follows declaration in module M}} expected-note@global-vs-module.cppm:40 {{previous}}
-template<typename> int func_tpl(); // expected-error {{declaration of 'func_tpl' in the global module follows declaration in module M}} expected-note@global-vs-module.cppm:41 {{previous}}
-template<typename> struct str_tpl; // expected-error {{declaration of 'str_tpl' in the global module follows declaration in module M}} expected-note@global-vs-module.cppm:42 {{previous}}
-template<typename> using type_tpl = int; // expected-error {{declaration of 'type_tpl' in the global module follows declaration in module M}} expected-note@global-vs-module.cppm:43 {{previous}}
+template<typename> extern int var_tpl; // expected-note {{previous declaration is here}}
+template<typename> int func_tpl(); // expected-note {{previous declaration is here}}
+template<typename> struct str_tpl; // expected-note {{previous declaration is here}}
+template<typename> using type_tpl = int; // expected-note {{previous declaration is here}}
+
+typedef int type;
+namespace ns { using ::func; }
+namespace ns_alias = ns;
+
+export module M;
+
+export {
+extern int var; // expected-error {{declaration of 'var' in module M follows declaration in the global module}}
+int func(); // expected-error {{declaration of 'func' in module M follows declaration in the global module}}
+struct str; // expected-error {{declaration of 'str' in module M follows declaration in the global module}}
+using type = int;
+
+template<typename> extern int var_tpl; // expected-error {{declaration of 'var_tpl' in module M follows declaration in the global module}}
+template<typename> int func_tpl(); // expected-error {{declaration of 'func_tpl' in module M follows declaration in the global module}}
+template<typename> struct str_tpl; // expected-error {{declaration of 'str_tpl' in module M follows declaration in the global module}}
+template<typename> using type_tpl = int; // expected-error {{declaration of 'type_tpl' in module M follows declaration in the global module}}
+
+typedef int type;
+namespace ns { using ::func; }
+namespace ns_alias = ns;
+}
+
+//--- global-vs-module-using.cppm
+module;
+extern int var; // expected-note {{previous declaration is here}}
+int func(); // expected-note {{previous declaration is here}}
+struct str; // expected-note {{previous declaration is here}}
+using type = int;
+
+template<typename> extern int var_tpl; // expected-note {{previous declaration is here}}
+template<typename> int func_tpl(); // expected-note {{previous declaration is here}}
+template<typename> struct str_tpl; // expected-note {{previous declaration is here}}
+template<typename> using type_tpl = int; // expected-note {{previous declaration is here}}
+
+typedef int type;
+namespace ns { using ::func; }
+namespace ns_alias = ns;
+
+export module M;
+
+using ::var;
+using ::func;
+using ::str;
+using ::type;
+using ::var_tpl;
+using ::func_tpl;
+using ::str_tpl;
+using ::type_tpl;
+
+extern int var; // expected-error {{declaration of 'var' in module M follows declaration in the global module}}
+int func(); // expected-error {{declaration of 'func' in module M follows declaration in the global module}}
+struct str; // expected-error {{declaration of 'str' in module M follows declaration in the global module}}
+using type = int;
+
+template<typename> extern int var_tpl; // expected-error {{declaration of 'var_tpl' in module M follows declaration in the global module}}
+template<typename> int func_tpl(); // expected-error {{declaration of 'func_tpl' in module M follows declaration in the global module}}
+template<typename> struct str_tpl; // expected-error {{declaration of 'str_tpl' in module M follows declaration in the global module}}
+template<typename> using type_tpl = int; // expected-error {{declaration of 'type_tpl' in module M follows declaration in the global module}}
+
+typedef int type;
+namespace ns { using ::func; }
+namespace ns_alias = ns;
+
+//--- M.cppm
+export module M;
+
+export {
+extern int var; // expected-error {{declaration of 'var' in module M follows declaration in the global module}}
+int func(); // expected-error {{declaration of 'func' in module M follows declaration in the global module}}
+struct str; // expected-error {{declaration of 'str' in module M follows declaration in the global module}}
+using type = int;
+
+template<typename> extern int var_tpl; // expected-error {{declaration of 'var_tpl' in module M follows declaration in the global module}}
+template<typename> int func_tpl(); // expected-error {{declaration of 'func_tpl' in module M follows declaration in the global module}}
+template<typename> struct str_tpl; // expected-error {{declaration of 'str_tpl' in module M follows declaration in the global module}}
+template<typename> using type_tpl = int; // expected-error {{declaration of 'type_tpl' in module M follows declaration in the global module}}
+
+typedef int type;
+namespace ns { using ::func; }
+namespace ns_alias = ns;
+}
+
+//--- module-vs-global.cpp
+module;
+import M;
+
+extern int var; // expected-error {{declaration of 'var' in the global module follows declaration in module M}} expected-note@M.cppm:4 {{previous}}
+int func(); // expected-error {{declaration of 'func' in the global module follows declaration in module M}} expected-note@M.cppm:5 {{previous}}
+struct str; // expected-error {{declaration of 'str' in the global module follows declaration in module M}} expected-note@M.cppm:6 {{previous}}
+using type = int;
+
+template<typename> extern int var_tpl; // expected-error {{declaration of 'var_tpl' in the global module follows declaration in module M}} expected-note@M.cppm:9 {{previous}}
+template<typename> int func_tpl(); // expected-error {{declaration of 'func_tpl' in the global module follows declaration in module M}} expected-note@M.cppm:10 {{previous}}
+template<typename> struct str_tpl; // expected-error {{declaration of 'str_tpl' in the global module follows declaration in module M}} expected-note@M.cppm:11 {{previous}}
+template<typename> using type_tpl = int; // expected-error {{declaration of 'type_tpl' in the global module follows declaration in module M}} expected-note@M.cppm:12 {{previous}}
 
 typedef int type;
 namespace ns { using ::func; }
 namespace ns_alias = ns;
 
-//--- module-vs-module.cpp
-#ifdef MODULE_INTERFACE
 export module N;
-#else
-module N;
-#endif
+
+//--- module-vs-module-interface.cpp
+export module N;
 
 #ifndef NO_IMPORT
 import M;
 #endif
 
 #ifndef NO_ERRORS
-extern int var; // expected-error {{declaration of 'var' in module N follows declaration in module M}} expected-note@global-vs-module.cppm:35 {{previous}}
-int func(); // expected-error {{declaration of 'func' in module N follows declaration in module M}} expected-note@global-vs-module.cppm:36 {{previous}}
-struct str; // expected-error {{declaration of 'str' in module N follows declaration in module M}} expected-note@global-vs-module.cppm:37 {{previous}}
+extern int var; // expected-error {{declaration of 'var' in module N follows declaration in module M}} expected-note@M.cppm:4 {{previous}}
+int func(); // expected-error {{declaration of 'func' in module N follows declaration in module M}} expected-note@M.cppm:5 {{previous}}
+struct str; // expected-error {{declaration of 'str' in module N follows declaration in module M}} expected-note@M.cppm:6 {{previous}}
 using type = int;
 
-template<typename> extern int var_tpl; // expected-error {{declaration of 'var_tpl' in module N follows declaration in module M}} expected-note@global-vs-module.cppm:40 {{previous}}
-template<typename> int func_tpl(); // expected-error {{declaration of 'func_tpl' in module N follows declaration in module M}} expected-note@global-vs-module.cppm:41 {{previous}}
-template<typename> struct str_tpl; // expected-error {{declaration of 'str_tpl' in module N follows declaration in module M}} expected-note@global-vs-module.cppm:42 {{previous}}
-template<typename> using type_tpl = int; // expected-error {{declaration of 'type_tpl' in module N follows declaration in module M}} expected-note@global-vs-module.cppm:43 {{previous}}
+template<typename> extern int var_tpl; // expected-error {{declaration of 'var_tpl' in module N follows declaration in module M}} expected-note@M.cppm:9 {{previous}}
+template<typename> int func_tpl(); // expected-error {{declaration of 'func_tpl' in module N follows declaration in module M}} expected-note@M.cppm:10 {{previous}}
+template<typename> struct str_tpl; // expected-error {{declaration of 'str_tpl' in module N follows declaration in module M}} expected-note@M.cppm:11 {{previous}}
+template<typename> using type_tpl = int; // expected-error {{declaration of 'type_tpl' in module N follows declaration in module M}} expected-note@M.cppm:12 {{previous}}
 
 typedef int type;
 namespace ns { using ::func; }
 namespace ns_alias = ns;
 #endif
 
+//--- module-vs-module-impl.cpp
+module N;
+
+#ifndef NO_IMPORT
+import M;
+#endif
+
+#ifndef NO_ERRORS
+extern int var; // expected-error {{declaration of 'var' in module N follows declaration in module M}} expected-note@M.cppm:4 {{previous}}
+int func(); // expected-error {{declaration of 'func' in module N follows declaration in module M}} expected-note@M.cppm:5 {{previous}}
+struct str; // expected-error {{declaration of 'str' in module N follows declaration in module M}} expected-note@M.cppm:6 {{previous}}
+using type = int;
+
+template<typename> extern int var_tpl; // expected-error {{declaration of 'var_tpl' in module N follows declaration in module M}} expected-note@M.cppm:9 {{previous}}
+template<typename> int func_tpl(); // expected-error {{declaration of 'func_tpl' in module N follows declaration in module M}} expected-note@M.cppm:10 {{previous}}
+template<typename> struct str_tpl; // expected-error {{declaration of 'str_tpl' in module N follows declaration in module M}} expected-note@M.cppm:11 {{previous}}
+template<typename> using type_tpl = int; // expected-error {{declaration of 'type_tpl' in module N follows declaration in module M}} expected-note@M.cppm:12 {{previous}}
+
+typedef int type;
+namespace ns { using ::func; }
+namespace ns_alias = ns;
+#endif
diff --git a/clang/test/CXX/module/basic/basic.link/module-declaration.cpp b/clang/test/CXX/module/basic/basic.link/module-declaration.cpp
index d71358cc7a57..4bdcc9e5f278 100644
--- a/clang/test/CXX/module/basic/basic.link/module-declaration.cpp
+++ b/clang/test/CXX/module/basic/basic.link/module-declaration.cpp
@@ -8,27 +8,19 @@
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface -fmodule-file=x=%t/x.pcm %t/x.y.cppm -o %t/x.y.pcm
 //
 // Module implementation for unknown and known module. (The former is ill-formed.)
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M.cpp \
-// RUN:            -DTEST=1 -DEXPORT= -DMODULE_NAME=z
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x=%t/x.pcm -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M.cpp \
-// RUN:            -DTEST=2 -DEXPORT= -DMODULE_NAME=x
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/z_impl.cppm
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x=%t/x.pcm -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/x_impl.cppm
 //
 // Module interface for unknown and known module. (The latter is ill-formed due to
 // redefinition.)
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=3 -DEXPORT=export -DMODULE_NAME=z
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=4 -DEXPORT=export -DMODULE_NAME=x
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/z_interface.cppm
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/x_interface.cppm
 //
 // Miscellaneous syntax.
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=7 -DEXPORT=export -DMODULE_NAME='z elderberry'
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=8 -DEXPORT=export -DMODULE_NAME='z [[]]'
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=9 -DEXPORT=export -DMODULE_NAME='z [[fancy]]'
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \
-// RUN:            -DTEST=10 -DEXPORT=export -DMODULE_NAME='z [[maybe_unused]]'
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/invalid_module_name.cppm
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/empty_attribute.cppm
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/fancy_attribute.cppm
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/maybe_unused_attribute.cppm
 
 //--- x.cppm
 export module x;
@@ -38,17 +30,31 @@ int a, b;
 export module x.y;
 int c;
 
-//--- M.cpp
+//--- z_impl.cppm
+module z; // expected-error {{module 'z' not found}}
 
-EXPORT module MODULE_NAME;
-#if TEST == 7
-// expected-error@-2 {{expected ';'}} expected-error@-2 {{a type specifier is required}}
-#elif TEST == 9
-// expected-warning@-4 {{unknown attribute 'fancy' ignored}}
-#elif TEST == 10
-// expected-error-re@-6 {{'maybe_unused' attribute cannot be applied to a module{{$}}}}
-#elif TEST == 1
-// expected-error@-8 {{module 'z' not found}}
-#else
+//--- x_impl.cppm
 // expected-no-diagnostics
-#endif
+module x;
+
+//--- z_interface.cppm
+// expected-no-diagnostics
+export module z;
+
+//--- x_interface.cppm
+// expected-no-diagnostics
+export module x;
+
+//--- invalid_module_name.cppm
+export module z elderberry; // expected-error {{expected ';'}} \
+                            // expected-error {{a type specifier is required}}
+
+//--- empty_attribute.cppm
+// expected-no-diagnostics
+export module z [[]];
+
+//--- fancy_attribute.cppm
+export module z [[fancy]]; // expected-warning {{unknown attribute 'fancy' ignored}}
+
+//--- maybe_unused_attribute.cppm
+export module z [[maybe_unused]]; // expected-error-re {{'maybe_unused' attribute cannot be applied to a module{{$}}}}
diff --git a/clang/test/CXX/module/cpp.pre/module_decl.cpp b/clang/test/CXX/module/cpp.pre/module_decl.cpp
new file mode 100644
index 000000000000..6238347c167a
--- /dev/null
+++ b/clang/test/CXX/module/cpp.pre/module_decl.cpp
@@ -0,0 +1,8 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %s -verify -o %t/M.pcm
+
+// This is a comment
+#define I32 int // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+export I32 i32;
diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
index 3670f9430ed4..f65f050a3c7b 100644
--- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
+++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
@@ -6,10 +6,10 @@
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface -fmodule-file=x=%t/x.pcm %t/x.y.cppm -o %t/x.y.pcm
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/a.b.cppm -o %t/a.b.pcm
 //
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm -verify %t/test.cpp \
-// RUN:            -DMODULE_NAME=z -DINTERFACE
 // RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm \
-// RUN:            -fmodule-file=a.b=%t/a.b.pcm -verify %t/test.cpp -DMODULE_NAME=a.b
+// RUN:             -verify %t/test.interface.cpp
+// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm \
+// RUN:            -fmodule-file=a.b=%t/a.b.pcm -verify %t/test.implementation.cpp
 // RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm -verify %t/test.x.cpp
 
 //--- x.cppm
@@ -33,19 +33,33 @@ int use_2 = b; // ok
 // There is no relation between module x and module x.y.
 int use_3 = c; // expected-error {{use of undeclared identifier 'c'}}
 
-//--- test.cpp
-#ifdef INTERFACE
-export module MODULE_NAME;
-#else
-module MODULE_NAME;
-#endif
+//--- test.interface.cpp
+export module z;
 
 import x;
 
 import x [[]];
 import x [[foo]]; // expected-warning {{unknown attribute 'foo' ignored}}
 import x [[noreturn]]; // expected-error {{'noreturn' attribute cannot be applied to a module import}}
-import x [[blarg::noreturn]]; // expected-warning {{unknown attribute 'blarg::noreturn' ignored}}
+import x [[blarg::noreturn]]; // expected-warning-re {{unknown attribute 'blarg::noreturn' ignored{{.*}}}}
+
+import x.y;
+import x.; // expected-error {{expected a module name after 'import'}}
+import .x; // expected-error {{expected a module name after 'import'}}
+
+import blarg; // expected-error {{module 'blarg' not found}}
+
+int use_4 = c; // ok
+
+//--- test.implementation.cpp
+module a.b;
+
+import x;
+
+import x [[]];
+import x [[foo]]; // expected-warning {{unknown attribute 'foo' ignored}}
+import x [[noreturn]]; // expected-error {{'noreturn' attribute cannot be applied to a module import}}
+import x [[blarg::noreturn]]; // expected-warning-re {{unknown attribute 'blarg::noreturn' ignored{{.*}}}}
 
 import x.y;
 import x.; // expected-error {{expected a module name after 'import'}}
diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm
index 84ef85126c36..2158d7fa84b8 100644
--- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm
+++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm
@@ -1,29 +1,26 @@
-// RUN: %clang_cc1 -std=c++20 %s -verify -emit-module-interface -o /dev/null
-// RUN: %clang_cc1 -std=c++20 %s -DINTERFACE -verify -emit-module-interface -o %t
-// RUN: %clang_cc1 -std=c++20 %s -DIMPLEMENTATION -verify -fmodule-file=A=%t -o /dev/null
-//
-// RUN: %clang_cc1 -std=c++20 %s -DBUILT_AS_INTERFACE -emit-module-interface -verify -o /dev/null
-// RUN: %clang_cc1 -std=c++20 %s -DINTERFACE -DBUILT_AS_INTERFACE -emit-module-interface -verify -o /dev/null
-// RUN: %clang_cc1 -std=c++20 %s -DIMPLEMENTATION -DBUILT_AS_INTERFACE -emit-module-interface -verify -o /dev/null
+// RUN: rm -rf %t
+// RUN: split-file %s %t
 
-#if INTERFACE
+// RUN: %clang_cc1 -std=c++20 %t/ExportDeclNotInModulePurview.cppm -verify -emit-module-interface -o /dev/null
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -verify -emit-module-interface -o %t/A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/AddExport.cppm -verify -fmodule-file=A=%t/A.pcm -o /dev/null
+//
+// RUN: %clang_cc1 -std=c++20 %t/AddExport2.cppm -emit-module-interface -verify -o /dev/null
+
+//--- ExportDeclNotInModulePurview.cppm
+// expected-error@* {{missing 'export module' declaration in module interface unit}}
+export int b; // expected-error {{export declaration can only be used within a module purview}}
+
+//--- A.cppm
 // expected-no-diagnostics
 export module A;
-#elif IMPLEMENTATION
-module A; // #module-decl
- #ifdef BUILT_AS_INTERFACE
-  // expected-error@-2 {{missing 'export' specifier in module declaration while building module interface}}
-  #define INTERFACE
- #endif
-#else // Not in a module
-// expected-error@* {{missing 'export module' declaration in module interface unit}}
-#endif
-
-#ifndef INTERFACE
-export int b; // expected-error {{export declaration can only be used within a module purview}}
-#ifdef IMPLEMENTATION
-// expected-note@#module-decl {{add 'export' here}}
-#endif
-#else
 export int a;
-#endif
+
+//--- AddExport.cppm
+module A; // #module-decl
+export int b; // expected-error {{export declaration can only be used within a module purview}}
+// expected-note@#module-decl {{add 'export' here}}
+
+//--- AddExport2.cppm
+module A; // expected-error {{missing 'export' specifier in module declaration while building module interface}}
+export int a;
diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/p1.cpp b/clang/test/CXX/module/dcl.dcl/dcl.module/p1.cpp
index db86b5dd34c3..95d087e0f6c7 100644
--- a/clang/test/CXX/module/dcl.dcl/dcl.module/p1.cpp
+++ b/clang/test/CXX/module/dcl.dcl/dcl.module/p1.cpp
@@ -1,14 +1,30 @@
-// RUN: %clang_cc1 -std=c++20 -verify %s -DFOO=export -DBAR=export
-// RUN: %clang_cc1 -std=c++20 -verify %s -DFOO=export -DBAR=
-// RUN: %clang_cc1 -std=c++20 %s -DFOO=export -emit-module-interface -o %t
-// RUN: %clang_cc1 -std=c++20 %s -fmodule-file=foo=%t -DFOO=
-// RUN: %clang_cc1 -std=c++20 %s -fmodule-file=foo=%t -DBAR=export
-// RUN: %clang_cc1 -std=c++20 -verify %s -fmodule-file=foo=%t -DFOO= -DBAR=export
+// RUN: rm -rf %t
+// RUN: split-file %s %t
 
-#ifdef FOO
-FOO module foo; // expected-note {{previous module declaration is here}}
-#endif
+// RUN: %clang_cc1 -std=c++20 -verify %t/A.cppm
+// RUN: %clang_cc1 -std=c++20 -verify %t/B.cppm
+// RUN: %clang_cc1 -std=c++20 %t/C.cppm -emit-module-interface -o %t/C.pcm
+// RUN: %clang_cc1 -std=c++20 %t/D.cppm -fmodule-file=foo=%t/C.pcm
+// RUN: %clang_cc1 -std=c++20 %t/E.cppm -fmodule-file=foo=%t/C.pcm
+// RUN: %clang_cc1 -std=c++20 -verify %t/F.cppm -fmodule-file=foo=%t/C.pcm
 
-#ifdef BAR
-BAR module bar; // expected-error {{translation unit contains multiple module declarations}}
-#endif
+//--- A.cppm
+export module foo; // expected-note {{previous module declaration is here}}
+export module bar; // expected-error {{translation unit contains multiple module declarations}}
+
+//--- B.cppm
+export module foo; // expected-note {{previous module declaration is here}}
+module bar; // expected-error {{translation unit contains multiple module declarations}}
+
+//--- C.cppm
+export module foo;
+
+//--- D.cppm
+module foo;
+
+//--- E.cppm
+export module bar;
+
+//--- F.cppm
+module foo; // expected-note {{previous module declaration is here}}
+export module bar; // expected-error {{translation unit contains multiple module declarations}}
diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/p5.cpp b/clang/test/CXX/module/dcl.dcl/dcl.module/p5.cpp
index ca100443a4c6..a0d30233809f 100644
--- a/clang/test/CXX/module/dcl.dcl/dcl.module/p5.cpp
+++ b/clang/test/CXX/module/dcl.dcl/dcl.module/p5.cpp
@@ -1,22 +1,14 @@
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface %s -o %t -DINTERFACE
-// RUN: %clang_cc1 -std=c++20 -fmodule-file=Foo=%t %s -verify -DIMPLEMENTATION
-// RUN: %clang_cc1 -std=c++20 -fmodule-file=Foo=%t %s -verify -DEARLY_IMPLEMENTATION
-// RUN: %clang_cc1 -std=c++20 -fmodule-file=Foo=%t %s -verify -DUSER
+// RUN: split-file %s %t
 
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/interface.cppm -o %t/interface.pcm
+// RUN: %clang_cc1 -std=c++20 -fmodule-file=Foo=%t/interface.pcm %t/implementation.cppm -verify -DIMPLEMENTATION
+// RUN: %clang_cc1 -std=c++20 -fmodule-file=Foo=%t/interface.pcm %t/early_impl.cppm -verify -DEARLY_IMPLEMENTATION
+// RUN: %clang_cc1 -std=c++20 -fmodule-file=Foo=%t/interface.pcm %t/user.cppm -verify -DUSER
+
+//--- interface.cppm
 // expected-no-diagnostics
-
-#if defined(INTERFACE) || defined(EARLY_IMPLEMENTATION) || defined(IMPLEMENTATION)
 module;
-#endif
-
-#ifdef USER
-import Foo;
-#endif
-
-#ifdef EARLY_IMPLEMENTATION
-module Foo;
-#endif
 
 template<typename T> struct type_template {
   typedef T type;
@@ -28,10 +20,49 @@ template<typename T> void type_template<T>::f(type) {}
 template<int = 0, typename = int, template<typename> class = type_template>
 struct default_template_args {};
 
-#ifdef INTERFACE
 export module Foo;
-#endif
 
-#ifdef IMPLEMENTATION
+//--- implementation.cppm
+// expected-no-diagnostics
+module;
+
+template<typename T> struct type_template {
+  typedef T type;
+  void f(type);
+};
+
+template<typename T> void type_template<T>::f(type) {}
+
+template<int = 0, typename = int, template<typename> class = type_template>
+struct default_template_args {};
+
 module Foo;
-#endif
+
+//--- early_impl.cppm
+// expected-no-diagnostics
+module;
+module Foo;
+
+template<typename T> struct type_template {
+  typedef T type;
+  void f(type);
+};
+
+template<typename T> void type_template<T>::f(type) {}
+
+template<int = 0, typename = int, template<typename> class = type_template>
+struct default_template_args {};
+
+//--- user.cppm
+// expected-no-diagnostics
+import Foo;
+
+template<typename T> struct type_template {
+  typedef T type;
+  void f(type);
+};
+
+template<typename T> void type_template<T>::f(type) {}
+
+template<int = 0, typename = int, template<typename> class = type_template>
+struct default_template_args {};
diff --git a/clang/test/CXX/module/module.interface/p2.cpp b/clang/test/CXX/module/module.interface/p2.cpp
index 4f06b9f38686..8221c400ecd6 100644
--- a/clang/test/CXX/module/module.interface/p2.cpp
+++ b/clang/test/CXX/module/module.interface/p2.cpp
@@ -1,24 +1,26 @@
 // RUN: rm -rf %t
 // RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
 // RUN: %clang_cc1 -std=c++20 -x c++-header %S/Inputs/header.h -emit-header-unit -o %t/h.pcm
-// RUN: %clang_cc1 -std=c++20 %s -DX_INTERFACE -emit-module-interface -o %t/x.pcm
-// RUN: %clang_cc1 -std=c++20 %s -DY_INTERFACE -emit-module-interface -o %t/y.pcm
-// RUN: %clang_cc1 -std=c++20 %s -DINTERFACE -fmodule-file=X=%t/x.pcm -fmodule-file=Y=%t/y.pcm -emit-module-interface -o %t/m.pcm
-// RUN: %clang_cc1 -std=c++20 %s -DIMPLEMENTATION -I%S/Inputs -fmodule-file=%t/h.pcm \
+// RUN: %clang_cc1 -std=c++20 %t/x.cppm -emit-module-interface -o %t/x.pcm
+// RUN: %clang_cc1 -std=c++20 %t/y.cppm -emit-module-interface -o %t/y.pcm
+// RUN: %clang_cc1 -std=c++20 %t/interface.cppm -fmodule-file=X=%t/x.pcm -fmodule-file=Y=%t/y.pcm -emit-module-interface -o %t/m.pcm
+// RUN: %clang_cc1 -std=c++20 %t/impl.cppm -I%S/Inputs -fmodule-file=%t/h.pcm \
 // RUN:   -fmodule-file=X=%t/x.pcm -fmodule-file=Y=%t/y.pcm -fmodule-file=p2=%t/m.pcm -verify \
 // RUN:   -Wno-experimental-header-units
-// RUN: %clang_cc1 -std=c++20 %s -DUSER -I%S/Inputs -fmodule-file=%t/h.pcm -fmodule-file=p2=%t/m.pcm \
+// RUN: %clang_cc1 -std=c++20 %t/user.cppm -I%S/Inputs -fmodule-file=%t/h.pcm -fmodule-file=p2=%t/m.pcm \
 // RUN:   -fmodule-file=X=%t/x.pcm -fmodule-file=Y=%t/y.pcm -Wno-experimental-header-units -verify
 
-#if defined(X_INTERFACE)
+//--- x.cppm
 export module X;
 export int x;
 
-#elif defined(Y_INTERFACE)
+//--- y.cppm
 export module Y;
 export int y;
 
-#elif defined(INTERFACE)
+//--- interface.cppm
 export module p2;
 export import X;
 import Y; // not exported
@@ -39,7 +41,7 @@ namespace C {}
 namespace D { int f(); }
 export namespace D {}
 
-#elif defined(IMPLEMENTATION)
+//--- impl.cppm
 module p2;
 import "header.h";
 
@@ -66,7 +68,7 @@ void use() {
 
 int use_header() { return foo + bar::baz(); }
 
-#elif defined(USER)
+//--- user.cppm
 import p2;
 import "header.h";
 
@@ -96,7 +98,3 @@ void use() {
 }
 
 int use_header() { return foo + bar::baz(); }
-
-#else
-#error unknown mode
-#endif
diff --git a/clang/test/CXX/module/module.unit/p8.cpp b/clang/test/CXX/module/module.unit/p8.cpp
index a5c01c493558..fb190257d3a2 100644
--- a/clang/test/CXX/module/module.unit/p8.cpp
+++ b/clang/test/CXX/module/module.unit/p8.cpp
@@ -1,37 +1,45 @@
-// RUN: echo 'export module foo; export int n;' > %t.cppm
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: split-file %s %t
+// RUN: echo 'export module foo;' > %t.cppm
+// RUN: echo 'export int n;' >> %t.cppm
 // RUN: %clang_cc1 -std=c++2a %t.cppm -emit-module-interface -o %t.pcm
-// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=0 %s
-// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=1 %s
-// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=2 %s
-// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=3 %s
-// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=4 %s
-// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=5 %s
+// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=0 %t/A.cppm
+// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=1 %t/B.cppm
+// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=2 %t/C.cppm
+// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=3 %t/D.cppm
+// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=4 %t/E.cppm
+// RUN: %clang_cc1 -std=c++2a -fmodule-file=foo=%t.pcm -verify -DMODE=5 %t/F.cppm
 
-#if MODE == 0
+//--- A.cppm
 // no module declaration
+// expected-no-diagnostics
 
-#elif MODE == 1
+//--- B.cppm
 // expected-no-diagnostics
 module foo; // Implementation, implicitly imports foo.
 #define IMPORTED
 
-#elif MODE == 2
+int k = n;
+
+//--- C.cppm
 export module foo;
 
-#elif MODE == 3
+int k = n; // expected-error {{use of undeclared identifier 'n'}}
+
+//--- D.cppm
 export module bar; // A different module
 
-#elif MODE == 4
+int k = n; // expected-error {{use of undeclared identifier 'n'}}
+
+//--- E.cppm
 module foo:bar; // Partition implementation
 //#define IMPORTED (we don't import foo here)
 
-#elif MODE == 5
+int k = n; // expected-error {{use of undeclared identifier 'n'}}
+
+//--- F.cppm
 export module foo:bar; // Partition interface
 //#define IMPORTED  (we don't import foo here)
 
-#endif
-
-int k = n;
-#ifndef IMPORTED
-// expected-error@-2 {{use of undeclared identifier 'n'}}
-#endif
+int k = n; // expected-error {{use of undeclared identifier 'n'}}
diff --git a/clang/test/Driver/modules.cpp b/clang/test/Driver/modules.cpp
index b0d1f2280d25..088a73230f81 100644
--- a/clang/test/Driver/modules.cpp
+++ b/clang/test/Driver/modules.cpp
@@ -1,43 +1,48 @@
 // RUN: rm -rf %t
 // RUN: mkdir %t
+// RUN: split-file %s %t
 
 // Check compiling a module interface to a .pcm file.
 //
-// RUN: %clang -std=c++2a -x c++-module --precompile %s -o %t/module.pcm -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE
-// RUN: %clang -std=gnu++2a -x c++-module --precompile %s -o %t/module-gnu.pcm -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE
+// RUN: %clang -std=c++2a -x c++-module --precompile %t/foo.cpp -o %t/foo.pcm -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE
+// RUN: %clang -std=gnu++2a -x c++-module --precompile %t/foo.cpp -o %t/foo-gnu.pcm -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE
 //
 // CHECK-PRECOMPILE: -cc1 {{.*}} -emit-module-interface
 // CHECK-PRECOMPILE-SAME: -o {{.*}}.pcm
 // CHECK-PRECOMPILE-SAME: -x c++
-// CHECK-PRECOMPILE-SAME: modules.cpp
+// CHECK-PRECOMPILE-SAME: foo.cpp
 
 // Check compiling a .pcm file to a .o file.
 //
-// RUN: %clang -std=c++2a %t/module.pcm -S -o %t/module.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-COMPILE
+// RUN: %clang -std=c++2a %t/foo.pcm -S -o %t/foo.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-COMPILE
 //
 // CHECK-COMPILE: -cc1 {{.*}} {{-emit-obj|-S}}
-// CHECK-COMPILE-SAME: -o {{.*}}module{{2*}}.pcm.o
+// CHECK-COMPILE-SAME: -o {{.*}}foo{{2*}}.pcm.o
 // CHECK-COMPILE-SAME: -x pcm
 // CHECK-COMPILE-SAME: {{.*}}.pcm
 
 // Check use of a .pcm file in another compilation.
 //
-// RUN: %clang -std=c++2a -fmodule-file=%t/module.pcm -Dexport= %s -S -o %t/module.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE
-// RUN: %clang -std=c++20 -fmodule-file=%t/module.pcm -Dexport= %s -S -o %t/module.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE
-// RUN: %clang -std=gnu++20 -fmodule-file=%t/module-gnu.pcm -Dexport= %s -S -o %t/module.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE
+// RUN: %clang -std=c++2a -fmodule-file=foo=%t/foo.pcm %t/foo_impl.cpp -S -o %t/module.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE
+// RUN: %clang -std=c++20 -fmodule-file=foo=%t/foo.pcm %t/foo_impl.cpp -S -o %t/module.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE
+// RUN: %clang -std=gnu++20 -fmodule-file=foo=%t/foo-gnu.pcm %t/foo_impl.cpp -S -o %t/module.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-USE
 //
 // CHECK-USE: -cc1 {{.*}} {{-emit-obj|-S}}
-// CHECK-USE-SAME: -fmodule-file={{.*}}.pcm
+// CHECK-USE-SAME: -fmodule-file=foo={{.*}}.pcm
 // CHECK-USE-SAME: -o {{.*}}.{{o|s}}{{"?}} {{.*}}-x c++
-// CHECK-USE-SAME: modules.cpp
+// CHECK-USE-SAME: foo_impl.cpp
 
 // Check combining precompile and compile steps works.
 //
-// RUN: %clang -std=c++2a -x c++-module %s -S -o %t/module2.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE --check-prefix=CHECK-COMPILE
+// RUN: %clang -std=c++2a -x c++-module %t/foo.cpp -S -o %t/foo2.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE --check-prefix=CHECK-COMPILE
 
 // Check that .cppm is treated as a module implicitly.
 //
-// RUN: cp %s %t/module.cppm
-// RUN: %clang -std=c++2a --precompile %t/module.cppm -o %t/module.pcm -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE
+// RUN: cp %t/foo.cpp %t/foo.cppm
+// RUN: %clang -std=c++2a --precompile %t/foo.cppm -o %t/foo.pcm -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE
 
+//--- foo.cpp
 export module foo;
+
+//--- foo_impl.cpp
+module foo;
diff --git a/clang/test/Modules/named-modules-adl-3.cppm b/clang/test/Modules/named-modules-adl-3.cppm
index d70946fa068b..a3644b45a534 100644
--- a/clang/test/Modules/named-modules-adl-3.cppm
+++ b/clang/test/Modules/named-modules-adl-3.cppm
@@ -58,6 +58,7 @@ void b(T x) {
 }
 
 //--- c.cppm
+module;
 #ifdef EXPORT_OPERATOR
 // expected-no-diagnostics
 #endif
diff --git a/clang/test/Modules/reserved-names-1.cppm b/clang/test/Modules/reserved-names-1.cppm
index e780f1e35b3b..35b264bcb573 100644
--- a/clang/test/Modules/reserved-names-1.cppm
+++ b/clang/test/Modules/reserved-names-1.cppm
@@ -88,12 +88,14 @@ export module module; // expected-error {{'module' is an invalid name for a modu
 export module import; // expected-error {{'import' is an invalid name for a module}}
 
 //--- _Test.cppm
+module;
 #ifdef NODIAGNOSTICS
 // expected-no-diagnostics
 #endif
 export module _Test;  // loud-warning {{'_Test' is a reserved name for a module}}
 
 //--- __test.cppm
+module;
 #ifdef NODIAGNOSTICS
 // expected-no-diagnostics
 #endif
@@ -101,6 +103,7 @@ export module __test; // loud-warning {{'__test' is a reserved name for a module
 export int a = 43;
 
 //--- te__st.cppm
+module;
 #ifdef NODIAGNOSTICS
 // expected-no-diagnostics
 #endif
@@ -108,6 +111,7 @@ export module te__st; // loud-warning {{'te__st' is a reserved name for a module
 export int a = 43;
 
 //--- std.cppm
+module;
 #ifdef NODIAGNOSTICS
 // expected-no-diagnostics
 #endif
@@ -116,36 +120,42 @@ export module std;    // loud-warning {{'std' is a reserved name for a module}}
 export int a = 43;
 
 //--- std.foo.cppm
+module;
 #ifdef NODIAGNOSTICS
 // expected-no-diagnostics
 #endif
 export module std.foo;// loud-warning {{'std' is a reserved name for a module}}
 
 //--- std0.cppm
+module;
 #ifdef NODIAGNOSTICS
 // expected-no-diagnostics
 #endif
 export module std0;   // loud-warning {{'std0' is a reserved name for a module}}
 
 //--- std1000000.cppm
+module;
 #ifdef NODIAGNOSTICS
 // expected-no-diagnostics
 #endif
 export module std1000000; // loud-warning {{'std1000000' is a reserved name for a module}}
 
 //--- should_diag._Test.cppm
+module;
 #ifdef NODIAGNOSTICS
 // expected-no-diagnostics
 #endif
 export module should_diag._Test; // loud-warning {{'_Test' is a reserved name for a module}}
 
 //--- system-module.cppm
+module; // expected-error {{missing 'module' declaration at end of global module fragment introduced here}}
 // Show that being in a system header doesn't save you from diagnostics about
 // use of an invalid module-name identifier.
 # 34 "reserved-names-1.cpp" 1 3
 export module module;       // expected-error {{'module' is an invalid name for a module}}
 
 //--- system._Test.import.cppm
+module; // expected-error {{missing 'module' declaration at end of global module fragment introduced here}}
 # 34 "reserved-names-1.cpp" 1 3
 export module _Test.import; // expected-error {{'import' is an invalid name for a module}}
 
diff --git a/clang/test/Modules/reserved-names-system-header-1.cpp b/clang/test/Modules/reserved-names-system-header-1.cpp
index 2db4c08add1d..fa869483980f 100644
--- a/clang/test/Modules/reserved-names-system-header-1.cpp
+++ b/clang/test/Modules/reserved-names-system-header-1.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %s
 // expected-no-diagnostics
 
+module;
 // Show that we suppress the reserved identifier diagnostic in a system header.
 # 100 "file.cpp" 1 3  // Enter a system header
 export module std;
diff --git a/clang/test/Modules/reserved-names-system-header-2.cpp b/clang/test/Modules/reserved-names-system-header-2.cpp
index 2087f487721c..d429e58dc171 100644
--- a/clang/test/Modules/reserved-names-system-header-2.cpp
+++ b/clang/test/Modules/reserved-names-system-header-2.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %s
 // expected-no-diagnostics
 
+module;
 // Show that we suppress the reserved identifier diagnostic in a system header.
 # 100 "file.cpp" 1 3  // Enter a system header
 export module __test;
diff --git a/clang/test/SemaCXX/modules.cppm b/clang/test/SemaCXX/modules.cppm
index 41204be76eaf..5d0d6da44a2e 100644
--- a/clang/test/SemaCXX/modules.cppm
+++ b/clang/test/SemaCXX/modules.cppm
@@ -1,19 +1,20 @@
-// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %s -o %t.0.pcm -verify -DTEST=0
-// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %s -o %t.1.pcm -verify -DTEST=1
-// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %s -fmodule-file=foo=%t.0.pcm -o %t.2.pcm -verify -DTEST=2
-// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %s -fmodule-file=foo=%t.0.pcm -o %t.3.pcm -verify -Dfoo=bar -DTEST=3
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
 
-#if TEST == 0 || TEST == 2
+// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %t/test0.cpp -o %t/test0.pcm -verify
+// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %t/test1.cpp -o %t/test1.pcm -verify
+// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %t/test2.cpp -fmodule-file=foo=%t/test0.pcm -o %t/test2.pcm -verify
+// RUN:     %clang_cc1 -std=c++20 -emit-module-interface %t/test3.cpp -fmodule-file=foo=%t/test0.pcm -o %t/test3.pcm -verify
+
+//--- test0.cpp
 // expected-no-diagnostics
-#endif
-
 export module foo;
 
 static int m;
 
 int n;
 
-#if TEST == 0
 export {
   int a;
   int b;
@@ -27,10 +28,52 @@ export void f() {}
 
 export struct T {
 } t;
-#elif TEST == 3
+
+//--- test1.cpp
+export module foo;
+
+static int m;
+
+int n;
+
+struct S {
+  export int n;        // expected-error {{expected member name or ';'}}
+  export static int n; // expected-error {{expected member name or ';'}}
+};
+
+// FIXME: Exports of declarations without external linkage are disallowed.
+// Exports of declarations with non-external-linkage types are disallowed.
+
+// Cannot export within another export. This isn't precisely covered by the
+// language rules right now, but (per personal correspondence between zygoloid
+// and gdr) is the intent.
+export { // expected-note {{export block begins here}}
+  extern "C++" {
+  namespace NestedExport {
+  export { // expected-error {{export declaration appears within another export declaration}}
+    int q;
+  }
+  } // namespace NestedExport
+  }
+}
+
+//--- test2.cpp
+// expected-no-diagnostics
+export module foo;
+
+static int m;
+
+int n;
+
+//--- test3.cpp
+export module bar;
+
+static int m;
+
+int n;
+
 int use_a = a; // expected-error {{use of undeclared identifier 'a'}}
 
-#undef foo
 import foo; // expected-error {{imports must immediately follow the module declaration}}
 
 export {}
@@ -46,29 +89,3 @@ int use_n = n; // FIXME: this should not be visible, because it is not exported
 
 extern int n;
 static_assert(&n != p); // expected-error{{use of undeclared identifier 'p'}}
-#endif
-
-#if TEST == 1
-struct S {
-  export int n;        // expected-error {{expected member name or ';'}}
-  export static int n; // expected-error {{expected member name or ';'}}
-};
-#endif
-
-// FIXME: Exports of declarations without external linkage are disallowed.
-// Exports of declarations with non-external-linkage types are disallowed.
-
-// Cannot export within another export. This isn't precisely covered by the
-// language rules right now, but (per personal correspondence between zygoloid
-// and gdr) is the intent.
-#if TEST == 1
-export { // expected-note {{export block begins here}}
-  extern "C++" {
-  namespace NestedExport {
-  export { // expected-error {{export declaration appears within another export declaration}}
-    int q;
-  }
-  } // namespace NestedExport
-  }
-}
-#endif
diff --git a/clang/test/SemaCXX/type-aware-new-delete-transparent-contexts.cpp b/clang/test/SemaCXX/type-aware-new-delete-transparent-contexts.cpp
index 7c0b967a3c03..30fea464a8dc 100644
--- a/clang/test/SemaCXX/type-aware-new-delete-transparent-contexts.cpp
+++ b/clang/test/SemaCXX/type-aware-new-delete-transparent-contexts.cpp
@@ -1,12 +1,22 @@
-// RUN: %clang_cc1  -fsyntax-only -verify %s -std=c++26 -fexceptions -DTRANSPARENT_DECL=0
-// RUN: %clang_cc1  -fsyntax-only -verify %s -std=c++26 -fexceptions -DTRANSPARENT_DECL=1
-// RUN: %clang_cc1  -fsyntax-only -verify %s -std=c++26 -fexceptions -DTRANSPARENT_DECL=2
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: split-file %s %t
 
+// RUN: %clang_cc1  -fsyntax-only -verify %t/testing.cpp -std=c++26 -fexceptions -DTRANSPARENT_DECL=0
+// RUN: %clang_cc1  -fsyntax-only -verify %t/testing.cpp -std=c++26 -fexceptions -DTRANSPARENT_DECL=1
+// RUN: %clang_cc1  -fsyntax-only -verify %t/module_testing.cppm -std=c++26 -fexceptions -DTRANSPARENT_DECL=2
+
+//--- module_testing.cppm
 // expected-no-diagnostics
-#if TRANSPARENT_DECL==2
 export module Testing;
-#endif
 
+#include "testing.inc"
+
+//--- testing.cpp
+// expected-no-diagnostics
+#include "testing.inc"
+
+//--- testing.inc
 namespace std {
   template <class T> struct type_identity {};
   using size_t = __SIZE_TYPE__;
diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp
index 381755d4d1b6..33c8abbec35a 100644
--- a/clang/unittests/Lex/LexerTest.cpp
+++ b/clang/unittests/Lex/LexerTest.cpp
@@ -49,7 +49,8 @@ protected:
   }
 
   std::unique_ptr<Preprocessor> CreatePP(StringRef Source,
-                                         TrivialModuleLoader &ModLoader) {
+                                         TrivialModuleLoader &ModLoader,
+                                         StringRef PreDefines = {}) {
     std::unique_ptr<llvm::MemoryBuffer> Buf =
         llvm::MemoryBuffer::getMemBuffer(Source);
     SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
@@ -61,6 +62,8 @@ protected:
         PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader,
         /*IILookup =*/nullptr,
         /*OwnsHeaderSearch =*/false);
+    if (!PreDefines.empty())
+      PP->setPredefines(PreDefines.str());
     PP->Initialize(*Target);
     PP->EnterMainSourceFile();
     return PP;
@@ -769,4 +772,46 @@ TEST(LexerPreambleTest, PreambleBounds) {
   }
 }
 
+TEST_F(LexerTest, CheckFirstPPToken) {
+  {
+    TrivialModuleLoader ModLoader;
+    auto PP = CreatePP("// This is a comment\n"
+                       "int a;",
+                       ModLoader);
+    Token Tok;
+    PP->Lex(Tok);
+    EXPECT_TRUE(Tok.is(tok::kw_int));
+    EXPECT_TRUE(PP->hasSeenMainFileFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPToken().isFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPToken().is(tok::kw_int));
+  }
+  {
+    TrivialModuleLoader ModLoader;
+    auto PP = CreatePP("// This is a comment\n"
+                       "#define FOO int\n"
+                       "FOO a;",
+                       ModLoader);
+    Token Tok;
+    PP->Lex(Tok);
+    EXPECT_TRUE(Tok.is(tok::kw_int));
+    EXPECT_TRUE(PP->hasSeenMainFileFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPToken().isFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPToken().is(tok::hash));
+  }
+
+  {
+    TrivialModuleLoader ModLoader;
+    auto PP = CreatePP("// This is a comment\n"
+                       "FOO a;",
+                       ModLoader, "#define FOO int\n");
+    Token Tok;
+    PP->Lex(Tok);
+    EXPECT_TRUE(Tok.is(tok::kw_int));
+    EXPECT_TRUE(PP->hasSeenMainFileFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPToken().isFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPToken().is(tok::identifier));
+    EXPECT_TRUE(
+        PP->getMainFileFirstPPToken().getIdentifierInfo()->isStr("FOO"));
+  }
+}
 } // anonymous namespace

From 77941eba7f01fc6576b3e060a3fb9cad1a64f9ea Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sat, 21 Jun 2025 12:29:29 +0100
Subject: [PATCH 1158/1322] [CostModel] Add a DstTy to getShuffleCost (#141634)

A shuffle will take two input vectors and a mask, to produce a new
vector of size <MaskElts x SrcEltTy>. Historically it has been assumed
that the SrcTy and the DstTy are the same for getShuffleCost, with that
being relaxed in recent years. If the Tp passed to getShuffleCost is the
SrcTy, then the DstTy can be calculated from the Mask elts and the src
elt size, but the Mask is not always provided and the Tp is not reliably
always the SrcTy. This has led to situations notably in the SLP
vectorizer but also in the generic cost routines where assumption about
how vectors will be legalized are built into the generic cost routines -
for example whether they will widen or promote, with the cost modelling
assuming they will widen but the default lowering to promote for integer
vectors.

This patch attempts to start improving that - it originally tried to
alter more of the cost model but that too quickly became too many
changes at once, so this patch just plumbs in a DstTy to getShuffleCost
so that DstTy and SrcTy can be reliably distinguished. The callers of
getShuffleCost have been updated to try and include a DstTy that is more
accurate. Otherwise it tries to be fairly non-functional, keeping the
SrcTy used as the primary type used in shuffle cost routines, only using
DstTy where it was in the past (for InsertSubVector for example).

Some asserts have been added that help to check for consistent values
when a Mask and a DstTy are provided to getShuffleCost. Some of them
took a while to get right, and some non-mask calls might still be
incorrect. Hopefully this will provide a useful base to build more
shuffles that alter size.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  18 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  60 ++++---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  60 +++----
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  11 +-
 .../AArch64/AArch64TargetTransformInfo.cpp    |  59 ++++---
 .../AArch64/AArch64TargetTransformInfo.h      |   6 +-
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  17 +-
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |   6 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |  27 +--
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h  |   6 +-
 .../Hexagon/HexagonTargetTransformInfo.cpp    |  10 +-
 .../Hexagon/HexagonTargetTransformInfo.h      |   6 +-
 .../Target/PowerPC/PPCTargetTransformInfo.cpp |   7 +-
 .../Target/PowerPC/PPCTargetTransformInfo.h   |   6 +-
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  80 +++++----
 .../Target/RISCV/RISCVTargetTransformInfo.h   |   6 +-
 .../SystemZ/SystemZTargetTransformInfo.cpp    |  19 ++-
 .../SystemZ/SystemZTargetTransformInfo.h      |   6 +-
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 151 +++++++++--------
 llvm/lib/Target/X86/X86TargetTransformInfo.h  |   6 +-
 .../Scalar/LowerMatrixIntrinsics.cpp          |  12 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  13 +-
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  23 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  22 +--
 .../Transforms/Vectorize/VectorCombine.cpp    | 155 +++++++++---------
 25 files changed, 444 insertions(+), 348 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index ba47cef274be..90d92e0fcf55 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1381,16 +1381,16 @@ public:
       const SmallBitVector &OpcodeMask,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
-  /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
-  /// The exact mask may be passed as Mask, or else the array will be empty.
-  /// The index and subtype parameters are used by the subvector insertion and
-  /// extraction shuffle kinds to show the insert/extract point and the type of
-  /// the subvector being inserted/extracted. The operands of the shuffle can be
-  /// passed through \p Args, which helps improve the cost estimation in some
-  /// cases, like in broadcast loads.
-  /// NOTE: For subvector extractions Tp represents the source type.
+  /// \return The cost of a shuffle instruction of kind Kind with inputs of type
+  /// SrcTy, producing a vector of type DstTy. The exact mask may be passed as
+  /// Mask, or else the array will be empty. The Index and SubTp parameters
+  /// are used by the subvector insertions shuffle kinds to show the insert
+  /// point and the type of the subvector being inserted. The operands of the
+  /// shuffle can be passed through \p Args, which helps improve the cost
+  /// estimation in some cases, like in broadcast loads.
   LLVM_ABI InstructionCost getShuffleCost(
-      ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask = {},
+      ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+      ArrayRef<int> Mask = {},
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, int Index = 0,
       VectorType *SubTp = nullptr, ArrayRef<const Value *> Args = {},
       const Instruction *CxtI = nullptr) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 640766cf8cd1..c22928c9bcd9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -710,9 +710,9 @@ public:
   }
 
   virtual InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const {
     return 1;
   }
@@ -1541,13 +1541,14 @@ public:
           return 0;
 
         if (Shuffle->isExtractSubvectorMask(SubIndex))
-          return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy,
-                                           Mask, CostKind, SubIndex, VecTy,
-                                           Operands, Shuffle);
+          return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
+                                           VecSrcTy, Mask, CostKind, SubIndex,
+                                           VecTy, Operands, Shuffle);
 
         if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
           return TargetTTI->getShuffleCost(
-              TTI::SK_InsertSubvector, VecTy, Mask, CostKind, SubIndex,
+              TTI::SK_InsertSubvector, VecTy, VecSrcTy, Mask, CostKind,
+              SubIndex,
               FixedVectorType::get(VecTy->getScalarType(), NumSubElts),
               Operands, Shuffle);
 
@@ -1576,21 +1577,24 @@ public:
 
           return TargetTTI->getShuffleCost(
               IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, VecTy,
-              AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
+              VecTy, AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
         }
 
         // Narrowing shuffle - perform shuffle at original wider width and
         // then extract the lower elements.
+        // FIXME: This can assume widening, which is not true of all vector
+        // architectures (and is not even the default).
         AdjustMask.append(NumSubElts - Mask.size(), PoisonMaskElem);
 
         InstructionCost ShuffleCost = TargetTTI->getShuffleCost(
             IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc,
-            VecSrcTy, AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
+            VecSrcTy, VecSrcTy, AdjustMask, CostKind, 0, nullptr, Operands,
+            Shuffle);
 
         SmallVector<int, 16> ExtractMask(Mask.size());
         std::iota(ExtractMask.begin(), ExtractMask.end(), 0);
         return ShuffleCost + TargetTTI->getShuffleCost(
-                                 TTI::SK_ExtractSubvector, VecSrcTy,
+                                 TTI::SK_ExtractSubvector, VecTy, VecSrcTy,
                                  ExtractMask, CostKind, 0, VecTy, {}, Shuffle);
       }
 
@@ -1598,40 +1602,44 @@ public:
         return 0;
 
       if (Shuffle->isReverse())
-        return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, Mask, CostKind,
-                                         0, nullptr, Operands, Shuffle);
+        return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, VecSrcTy, Mask,
+                                         CostKind, 0, nullptr, Operands,
+                                         Shuffle);
 
       if (Shuffle->isSelect())
-        return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, Mask, CostKind,
-                                         0, nullptr, Operands, Shuffle);
+        return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, VecSrcTy, Mask,
+                                         CostKind, 0, nullptr, Operands,
+                                         Shuffle);
 
       if (Shuffle->isTranspose())
-        return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, Mask,
-                                         CostKind, 0, nullptr, Operands,
+        return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, VecSrcTy,
+                                         Mask, CostKind, 0, nullptr, Operands,
                                          Shuffle);
 
       if (Shuffle->isZeroEltSplat())
-        return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, Mask,
-                                         CostKind, 0, nullptr, Operands,
+        return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, VecSrcTy,
+                                         Mask, CostKind, 0, nullptr, Operands,
                                          Shuffle);
 
       if (Shuffle->isSingleSource())
-        return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask,
-                                         CostKind, 0, nullptr, Operands,
-                                         Shuffle);
+        return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy,
+                                         VecSrcTy, Mask, CostKind, 0, nullptr,
+                                         Operands, Shuffle);
 
       if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
         return TargetTTI->getShuffleCost(
-            TTI::SK_InsertSubvector, VecTy, Mask, CostKind, SubIndex,
+            TTI::SK_InsertSubvector, VecTy, VecSrcTy, Mask, CostKind, SubIndex,
             FixedVectorType::get(VecTy->getScalarType(), NumSubElts), Operands,
             Shuffle);
 
       if (Shuffle->isSplice(SubIndex))
-        return TargetTTI->getShuffleCost(TTI::SK_Splice, VecTy, Mask, CostKind,
-                                         SubIndex, nullptr, Operands, Shuffle);
+        return TargetTTI->getShuffleCost(TTI::SK_Splice, VecTy, VecSrcTy, Mask,
+                                         CostKind, SubIndex, nullptr, Operands,
+                                         Shuffle);
 
-      return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, Mask,
-                                       CostKind, 0, nullptr, Operands, Shuffle);
+      return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, VecSrcTy,
+                                       Mask, CostKind, 0, nullptr, Operands,
+                                       Shuffle);
     }
     case Instruction::ExtractElement: {
       auto *EEI = dyn_cast<ExtractElementInst>(U);
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 90a75c3d352e..0477c1b6f1a6 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -329,11 +329,11 @@ private:
     // Cost the call + mask.
     auto Cost =
         thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
-    if (VD->isMasked())
-      Cost += thisT()->getShuffleCost(
-          TargetTransformInfo::SK_Broadcast,
-          VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
-          nullptr, {});
+    if (VD->isMasked()) {
+      auto VecTy = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
+      Cost += thisT()->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
+                                      VecTy, {}, CostKind, 0, nullptr, {});
+    }
 
     // Lowering to a library call (with output pointers) may require us to emit
     // reloads for the results.
@@ -1101,11 +1101,11 @@ public:
 
   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                               ArrayRef<int> Mask,
-                                              VectorType *Ty, int &Index,
+                                              VectorType *SrcTy, int &Index,
                                               VectorType *&SubTy) const {
     if (Mask.empty())
       return Kind;
-    int NumSrcElts = Ty->getElementCount().getKnownMinValue();
+    int NumSrcElts = SrcTy->getElementCount().getKnownMinValue();
     switch (Kind) {
     case TTI::SK_PermuteSingleSrc: {
       if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
@@ -1116,7 +1116,7 @@ public:
         return TTI::SK_Broadcast;
       if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
           (Index + Mask.size()) <= (size_t)NumSrcElts) {
-        SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
+        SubTy = FixedVectorType::get(SrcTy->getElementType(), Mask.size());
         return TTI::SK_ExtractSubvector;
       }
       break;
@@ -1127,7 +1127,7 @@ public:
                                  Mask, NumSrcElts, NumSubElts, Index)) {
         if (Index + NumSubElts > NumSrcElts)
           return Kind;
-        SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
+        SubTy = FixedVectorType::get(SrcTy->getElementType(), NumSubElts);
         return TTI::SK_InsertSubvector;
       }
       if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
@@ -1151,13 +1151,13 @@ public:
   }
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override {
-    switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
+    switch (improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp)) {
     case TTI::SK_Broadcast:
-      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
+      if (auto *FVT = dyn_cast<FixedVectorType>(SrcTy))
         return getBroadcastShuffleOverhead(FVT, CostKind);
       return InstructionCost::getInvalid();
     case TTI::SK_Select:
@@ -1166,14 +1166,14 @@ public:
     case TTI::SK_Transpose:
     case TTI::SK_PermuteSingleSrc:
     case TTI::SK_PermuteTwoSrc:
-      if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
+      if (auto *FVT = dyn_cast<FixedVectorType>(SrcTy))
         return getPermuteShuffleOverhead(FVT, CostKind);
       return InstructionCost::getInvalid();
     case TTI::SK_ExtractSubvector:
-      return getExtractSubvectorOverhead(Tp, CostKind, Index,
+      return getExtractSubvectorOverhead(SrcTy, CostKind, Index,
                                          cast<FixedVectorType>(SubTp));
     case TTI::SK_InsertSubvector:
-      return getInsertSubvectorOverhead(Tp, CostKind, Index,
+      return getInsertSubvectorOverhead(DstTy, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
     }
     llvm_unreachable("Unknown TTI::ShuffleKind");
@@ -1910,6 +1910,7 @@ public:
         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
       return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
+                                     cast<VectorType>(RetTy),
                                      cast<VectorType>(Args[0]->getType()), {},
                                      CostKind, Index, cast<VectorType>(RetTy));
     }
@@ -1920,17 +1921,18 @@ public:
         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
       return thisT()->getShuffleCost(
-          TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), {},
-          CostKind, Index, cast<VectorType>(Args[1]->getType()));
+          TTI::SK_InsertSubvector, cast<VectorType>(RetTy),
+          cast<VectorType>(Args[0]->getType()), {}, CostKind, Index,
+          cast<VectorType>(Args[1]->getType()));
     }
     case Intrinsic::vector_reverse: {
-      return thisT()->getShuffleCost(TTI::SK_Reverse,
+      return thisT()->getShuffleCost(TTI::SK_Reverse, cast<VectorType>(RetTy),
                                      cast<VectorType>(Args[0]->getType()), {},
                                      CostKind, 0, cast<VectorType>(RetTy));
     }
     case Intrinsic::vector_splice: {
       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
-      return thisT()->getShuffleCost(TTI::SK_Splice,
+      return thisT()->getShuffleCost(TTI::SK_Splice, cast<VectorType>(RetTy),
                                      cast<VectorType>(Args[0]->getType()), {},
                                      CostKind, Index, cast<VectorType>(RetTy));
     }
@@ -2376,8 +2378,8 @@ public:
                                           CostKind, 1, nullptr, nullptr);
       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
                                           CostKind, 0, nullptr, nullptr);
-      Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, {}, CostKind,
-                                      0, nullptr);
+      Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, SearchTy, {},
+                                      CostKind, 0, nullptr);
       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
                                           CmpInst::ICMP_EQ, CostKind);
       Cost +=
@@ -2961,8 +2963,8 @@ public:
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
-      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
-                                             CostKind, NumVecElts, SubTy);
+      ShuffleCost += thisT()->getShuffleCost(
+          TTI::SK_ExtractSubvector, SubTy, Ty, {}, CostKind, NumVecElts, SubTy);
       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
       Ty = SubTy;
       ++LongVectorCount;
@@ -2978,7 +2980,7 @@ public:
     // By default reductions need one shuffle per reduction level.
     ShuffleCost +=
         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
-                                                 {}, CostKind, 0, Ty);
+                                                 Ty, {}, CostKind, 0, Ty);
     ArithCost +=
         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
     return ShuffleCost + ArithCost +
@@ -3052,8 +3054,8 @@ public:
       NumVecElts /= 2;
       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
 
-      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
-                                             CostKind, NumVecElts, SubTy);
+      ShuffleCost += thisT()->getShuffleCost(
+          TTI::SK_ExtractSubvector, SubTy, Ty, {}, CostKind, NumVecElts, SubTy);
 
       IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
       MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
@@ -3069,7 +3071,7 @@ public:
     // architecture-dependent length.
     ShuffleCost +=
         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
-                                                 {}, CostKind, 0, Ty);
+                                                 Ty, {}, CostKind, 0, Ty);
     IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
     MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
     // The last min/max should be in vector registers and we counted it above.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 8cc7f8a9d2ab..3ebd9d487ba0 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -985,11 +985,16 @@ InstructionCost TargetTransformInfo::getAltInstrCost(
 }
 
 InstructionCost TargetTransformInfo::getShuffleCost(
-    ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
+    ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef<int> Mask,
     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
     ArrayRef<const Value *> Args, const Instruction *CxtI) const {
-  InstructionCost Cost = TTIImpl->getShuffleCost(Kind, Ty, Mask, CostKind,
-                                                 Index, SubTp, Args, CxtI);
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+  InstructionCost Cost = TTIImpl->getShuffleCost(
+      Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp, Args, CxtI);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9d5c984fa4f1..8c6f272a8c8d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5468,19 +5468,25 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   return Cost;
 }
 
-InstructionCost AArch64TTIImpl::getShuffleCost(
-    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
-  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+InstructionCost
+AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
 
   // If we have a Mask, and the LT is being legalized somehow, split the Mask
   // into smaller vectors and sum the cost of each shuffle.
-  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
+  if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
       LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
-      Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
+      SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
       Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
-
     // Check for LD3/LD4 instructions, which are represented in llvm IR as
     // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
     // but we model it with a cost of LT.first so that LD3/LD4 have a higher
@@ -5496,16 +5502,16 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
     // cost than just the store.
     if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
         (ShuffleVectorInst::isInterleaveMask(
-             Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
+             Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
          ShuffleVectorInst::isInterleaveMask(
-             Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
+             Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
       return LT.first;
 
     unsigned TpNumElts = Mask.size();
     unsigned LTNumElts = LT.second.getVectorNumElements();
     unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
-    VectorType *NTp =
-        VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
+    VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
+                                      LT.second.getVectorElementCount());
     InstructionCost Cost;
     std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
         PreviousCosts;
@@ -5513,7 +5519,7 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
       SmallVector<int> NMask;
       // Split the existing mask into chunks of size LTNumElts. Track the source
       // sub-vectors to ensure the result has at most 2 inputs.
-      unsigned Source1 = 0, Source2 = 0;
+      unsigned Source1 = -1U, Source2 = -1U;
       unsigned NumSources = 0;
       for (unsigned E = 0; E < LTNumElts; E++) {
         int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
@@ -5561,7 +5567,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
           NumSources <= 2
               ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
-                               NTp, NMask, CostKind, 0, nullptr, Args, CxtI)
+                               NTp, NTp, NMask, CostKind, 0, nullptr, Args,
+                               CxtI)
               : LTNumElts;
       Result.first->second = NCost;
       Cost += NCost;
@@ -5569,7 +5576,7 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
     return Cost;
   }
 
-  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
   // A subvector extract can be implemented with an ext (or trivial extract, if
   // from lane 0). This currently only handles low or high extracts to prevent
@@ -5585,6 +5592,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
     }
     Kind = TTI::SK_PermuteSingleSrc;
   }
+  // FIXME: This was added to keep the costs equal when adding DstTys. Update
+  // the code to handle length-changing shuffles.
+  if (Kind == TTI::SK_InsertSubvector) {
+    LT = getTypeLegalizationCost(DstTy);
+    SrcTy = DstTy;
+  }
 
   // Check for broadcast loads, which are supported by the LD1R instruction.
   // In terms of code-size, the shuffle vector is free when a load + dup get
@@ -5596,15 +5609,17 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
   if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
     bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
     if (IsLoad && LT.second.isVector() &&
-        isLegalBroadcastLoad(Tp->getElementType(),
+        isLegalBroadcastLoad(SrcTy->getElementType(),
                              LT.second.getVectorElementCount()))
       return 0;
   }
 
   // If we have 4 elements for the shuffle and a Mask, get the cost straight
   // from the perfect shuffle tables.
-  if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
-      (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
+  if (Mask.size() == 4 &&
+      SrcTy->getElementCount() == ElementCount::getFixed(4) &&
+      (SrcTy->getScalarSizeInBits() == 16 ||
+       SrcTy->getScalarSizeInBits() == 32) &&
       all_of(Mask, [](int E) { return E < 8; }))
     return getPerfectShuffleCost(Mask);
 
@@ -5764,8 +5779,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
       return LT.first * Entry->Cost;
   }
 
-  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
-    return getSpliceCost(Tp, Index, CostKind);
+  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
+    return getSpliceCost(SrcTy, Index, CostKind);
 
   // Inserting a subvector can often be done with either a D, S or H register
   // move, so long as the inserted vector is "aligned".
@@ -5783,8 +5798,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
   // Restore optimal kind.
   if (IsExtractSubvector)
     Kind = TTI::SK_ExtractSubvector;
-  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
-                               CxtI);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
+                               Args, CxtI);
 }
 
 static bool containsDecreasingPointers(Loop *TheLoop,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 470af01be315..9ada70bd7086 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -451,9 +451,9 @@ public:
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
 
   InstructionCost getScalarizationOverhead(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d5a1aaef4ad6..5e41273556d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1183,21 +1183,23 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
 }
 
 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *VT, ArrayRef<int> Mask,
+                                           VectorType *DstTy, VectorType *SrcTy,
+                                           ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
                                            int Index, VectorType *SubTp,
                                            ArrayRef<const Value *> Args,
                                            const Instruction *CxtI) const {
-  if (!isa<FixedVectorType>(VT))
-    return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
+  if (!isa<FixedVectorType>(SrcTy))
+    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                 SubTp);
 
-  Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
 
   // Larger vector widths may require additional instructions, but are
   // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+      DL.getTypeSizeInBits(SrcTy->getElementType()) == 16) {
     bool HasVOP3P = ST->hasVOP3PInsts();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
@@ -1239,7 +1241,8 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     }
   }
 
-  return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                               SubTp);
 }
 
 /// Whether it is profitable to sink the operands of an
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 0fae301abf53..64a244e33f18 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -236,9 +236,9 @@ public:
   InstructionCost getVectorSplitCost() const { return 0; }
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
 
   bool isProfitableToSinkOperands(Instruction *I,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6c3a1ae7e177..203fb76d7be8 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1233,12 +1233,19 @@ InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
 }
 
 InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *Tp, ArrayRef<int> Mask,
+                                           VectorType *DstTy, VectorType *SrcTy,
+                                           ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
                                            int Index, VectorType *SubTp,
                                            ArrayRef<const Value *> Args,
                                            const Instruction *CxtI) const {
-  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
   // Treat extractsubvector as single op permutation.
   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
   if (IsExtractSubvector)
@@ -1259,7 +1266,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
 
-      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
       if (const auto *Entry =
               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
         return LT.first * Entry->Cost;
@@ -1280,7 +1287,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
 
-      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
       if (const auto *Entry =
               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
         return LT.first * Entry->Cost;
@@ -1304,7 +1311,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
 
-      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
                                               ISD::VECTOR_SHUFFLE, LT.second))
         return LT.first * Entry->Cost;
@@ -1320,7 +1327,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
 
-      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
                                               LT.second))
         return LT.first * Entry->Cost *
@@ -1328,7 +1335,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     }
 
     if (!Mask.empty()) {
-      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+      std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
       if (LT.second.isVector() &&
           Mask.size() <= LT.second.getVectorNumElements() &&
           (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
@@ -1340,11 +1347,11 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   // Restore optimal kind.
   if (IsExtractSubvector)
     Kind = TTI::SK_ExtractSubvector;
-  int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
+  int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
                      ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                      : 1;
-  return BaseCost *
-         BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
+  return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
+                                          Index, SubTp);
 }
 
 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index c1af4e3dc5da..ca06b9e3cb66 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -223,9 +223,9 @@ public:
   int getNumMemOps(const IntrinsicInst *I) const;
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
 
   bool preferInLoopReduction(RecurKind Kind, Type *Ty) const override;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index a4cc472fdbf2..9fb7d471fd22 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -226,10 +226,12 @@ HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                       CostKind);
 }
 
-InstructionCost HexagonTTIImpl::getShuffleCost(
-    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
+InstructionCost
+HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
   return 1;
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index c03cad4713e4..af8dede72308 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,9 +123,9 @@ public:
                         unsigned AddressSpace,
                         TTI::TargetCostKind CostKind) const override;
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index cd9b226ca82d..2fba090f2d50 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -604,19 +604,20 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost(
 }
 
 InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *Tp, ArrayRef<int> Mask,
+                                           VectorType *DstTy, VectorType *SrcTy,
+                                           ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
                                            int Index, VectorType *SubTp,
                                            ArrayRef<const Value *> Args,
                                            const Instruction *CxtI) const {
 
   InstructionCost CostFactor =
-      vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
+      vectorCostAdjustmentFactor(Instruction::ShuffleVector, SrcTy, nullptr);
   if (!CostFactor.isValid())
     return InstructionCost::getMax();
 
   // Legalize the type.
-  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
 
   // PPC, for both Altivec/VSX, support cheap arbitrary permutations
   // (at least in the sense that there need only be one non-loop-invariant
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index bc5f7a4d06de..475472ac3720 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -111,9 +111,9 @@ public:
       ArrayRef<const Value *> Args = {},
       const Instruction *CxtI = nullptr) const override;
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
   InstructionCost
   getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 1b80b0fcaf10..67a51c12b508 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -452,12 +452,16 @@ static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI,
         if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
                  .second)
           return;
-        Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
-                                   RegMask, CostKind, 0, nullptr);
+        Cost += TTI.getShuffleCost(
+            TTI::SK_PermuteSingleSrc,
+            FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
+            SingleOpTy, RegMask, CostKind, 0, nullptr);
       },
       [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
-        Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
-                                   CostKind, 0, nullptr);
+        Cost += TTI.getShuffleCost(
+            TTI::SK_PermuteTwoSrc,
+            FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
+            SingleOpTy, RegMask, CostKind, 0, nullptr);
       });
   return Cost;
 }
@@ -526,11 +530,11 @@ costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT,
           return;
         ++NumShuffles;
         Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
-                                   RegMask, CostKind, 0, nullptr);
+                                   SingleOpTy, RegMask, CostKind, 0, nullptr);
       },
       [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
-        Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
-                                   CostKind, 0, nullptr);
+        Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+                                   SingleOpTy, RegMask, CostKind, 0, nullptr);
         NumShuffles += 2;
       });
   // Note: check that we do not emit too many shuffles here to prevent code
@@ -601,22 +605,29 @@ InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
   return FirstSlideCost + SecondSlideCost + MaskCost;
 }
 
-InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                             VectorType *Tp, ArrayRef<int> Mask,
-                                             TTI::TargetCostKind CostKind,
-                                             int Index, VectorType *SubTp,
-                                             ArrayRef<const Value *> Args,
-                                             const Instruction *CxtI) const {
-  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
-  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+InstructionCost
+RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
+                             VectorType *SrcTy, ArrayRef<int> Mask,
+                             TTI::TargetCostKind CostKind, int Index,
+                             VectorType *SubTp, ArrayRef<const Value *> Args,
+                             const Instruction *CxtI) const {
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
 
   // First, handle cases where having a fixed length vector enables us to
   // give a more accurate cost than falling back to generic scalable codegen.
   // TODO: Each of these cases hints at a modeling gap around scalable vectors.
-  if (auto *FVTp = dyn_cast<FixedVectorType>(Tp);
+  if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
       FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
     InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting(
-        *this, LT.second, ST->getRealVLen(), Tp, Mask, CostKind);
+        *this, LT.second, ST->getRealVLen(),
+        Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
     if (VRegSplittingCost.isValid())
       return VRegSplittingCost;
     switch (Kind) {
@@ -655,7 +666,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
           for (unsigned I = 0; I != NumSlides; ++I) {
             unsigned InsertIndex = SubVectorSize * (1 << I);
             FixedVectorType *SubTp =
-                FixedVectorType::get(Tp->getElementType(), InsertIndex);
+                FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
             FixedVectorType *DestTp =
                 FixedVectorType::getDoubleElementsVectorType(SubTp);
             std::pair<InstructionCost, MVT> DestLT =
@@ -664,7 +675,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
             // destination vector register group for vslideup cannot overlap the
             // source.
             Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
-            Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, {},
+            Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
                                    CostKind, InsertIndex, SubTp);
           }
           return Cost;
@@ -680,7 +691,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
                             LT.second.getVectorNumElements() <= 256)) {
         VectorType *IdxTy =
-            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
+            getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
         InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
         return IndexCost +
                getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
@@ -699,8 +710,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       // (shuffle) mask.
       if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
                             LT.second.getVectorNumElements() <= 256)) {
-        auto &C = Tp->getContext();
-        auto EC = Tp->getElementCount();
+        auto &C = SrcTy->getContext();
+        auto EC = SrcTy->getElementCount();
         VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
         VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
         InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
@@ -769,6 +780,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     // Example sequence:
     // vsetivli     zero, 4, e8, mf2, tu, ma (ignored)
     // vslideup.vi  v8, v9, 2
+    LT = getTypeLegalizationCost(DstTy);
     return LT.first *
            getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
   case TTI::SK_Select: {
@@ -846,14 +858,15 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     // TODO: Cases to improve here:
     // * Illegal vector types
     // * i64 on RV32
-    if (Tp->getElementType()->isIntegerTy(1)) {
+    if (SrcTy->getElementType()->isIntegerTy(1)) {
       VectorType *WideTy =
-          VectorType::get(IntegerType::get(Tp->getContext(), 8),
-                          cast<VectorType>(Tp)->getElementCount());
-      return getCastInstrCost(Instruction::ZExt, WideTy, Tp,
+          VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
+                          cast<VectorType>(SrcTy)->getElementCount());
+      return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
                               TTI::CastContextHint::None, CostKind) +
-             getShuffleCost(TTI::SK_Reverse, WideTy, {}, CostKind, 0, nullptr) +
-             getCastInstrCost(Instruction::Trunc, Tp, WideTy,
+             getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
+                            nullptr) +
+             getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
                               TTI::CastContextHint::None, CostKind);
     }
 
@@ -899,7 +912,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     return FixedCost + LT.first * (GatherCost + SlideCost);
   }
   }
-  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                               SubTp);
 }
 
 static unsigned isM1OrSmaller(MVT VT) {
@@ -1025,8 +1039,8 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
       auto Mask = createStrideMask(Index, Factor, VF);
       Mask.resize(VF * Factor, -1);
       InstructionCost ShuffleCost =
-          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, VecTy, Mask,
-                         CostKind, 0, nullptr, {});
+          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, VecTy, VecTy,
+                         Mask, CostKind, 0, nullptr, {});
       Cost += ShuffleCost;
     }
     return Cost;
@@ -1052,7 +1066,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
   // shuffle that goes into the wide store
   auto Mask = createInterleaveMask(VF, Factor);
   InstructionCost ShuffleCost =
-      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
+      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, FVTy, Mask,
                      CostKind, 0, nullptr, {});
   return MemCost + ShuffleCost;
 }
@@ -1523,7 +1537,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     // To support type-based query from vectorizer, set the index to 0.
     // Note that index only change the cost from vslide.vx to vslide.vi and in
     // current implementations they have same costs.
-    return getShuffleCost(TTI::SK_Splice,
+    return getShuffleCost(TTI::SK_Splice, cast<VectorType>(ICA.getReturnType()),
                           cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
                           0, cast<VectorType>(ICA.getReturnType()));
   }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 83ac71ed9da6..12bf8c1b4de7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -160,9 +160,9 @@ public:
   }
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
 
   InstructionCost getScalarizationOverhead(
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 68ba7498d586..f32c9bd2bdea 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -738,20 +738,22 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
                                        Args, CxtI);
 }
 
-InstructionCost SystemZTTIImpl::getShuffleCost(
-    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
-  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
+InstructionCost
+SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
   if (ST->hasVector()) {
-    unsigned NumVectors = getNumVectorRegs(Tp);
+    unsigned NumVectors = getNumVectorRegs(SrcTy);
 
     // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
 
     // FP128 values are always in scalar registers, so there is no work
     // involved with a shuffle, except for broadcast. In that case register
     // moves are done with a single instruction per element.
-    if (Tp->getScalarType()->isFP128Ty())
+    if (SrcTy->getScalarType()->isFP128Ty())
       return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
 
     switch (Kind) {
@@ -775,7 +777,8 @@ InstructionCost SystemZTTIImpl::getShuffleCost(
     }
   }
 
-  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                               SubTp);
 }
 
 // Return the log2 difference of the element sizes of the two vector types.
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 368a4af768b3..dc5736e8af00 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -102,9 +102,9 @@ public:
       ArrayRef<const Value *> Args = {},
       const Instruction *CxtI = nullptr) const override;
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
   unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy) const;
   unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) const;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index a1a177528eb2..6a05a1700f0c 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1522,15 +1522,24 @@ X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
   return InstructionCost::getInvalid();
 }
 
-InstructionCost X86TTIImpl::getShuffleCost(
-    TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask,
-    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
+InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+                                           VectorType *DstTy, VectorType *SrcTy,
+                                           ArrayRef<int> Mask,
+                                           TTI::TargetCostKind CostKind,
+                                           int Index, VectorType *SubTp,
+                                           ArrayRef<const Value *> Args,
+                                           const Instruction *CxtI) const {
+  assert((Mask.empty() || DstTy->isScalableTy() ||
+          Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
+         "Expected the Mask to match the return size if given");
+  assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
+         "Expected the same scalar types");
+
   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
-  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
 
-  Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
+  Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
 
   // If all args are constant than this will be constant folded away.
   if (!Args.empty() &&
@@ -1539,11 +1548,12 @@ InstructionCost X86TTIImpl::getShuffleCost(
 
   // Recognize a basic concat_vector shuffle.
   if (Kind == TTI::SK_PermuteTwoSrc &&
-      Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
+      Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
       ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
     return getShuffleCost(TTI::SK_InsertSubvector,
-                          VectorType::getDoubleElementsVectorType(BaseTp), Mask,
-                          CostKind, Mask.size() / 2, BaseTp);
+                          VectorType::getDoubleElementsVectorType(SrcTy),
+                          VectorType::getDoubleElementsVectorType(SrcTy), Mask,
+                          CostKind, Mask.size() / 2, SrcTy);
 
   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
   if (Kind == TTI::SK_Transpose)
@@ -1568,11 +1578,11 @@ InstructionCost X86TTIImpl::getShuffleCost(
   // Attempt to detect a shuffle mask with a single defined element.
   bool IsInLaneShuffle = false;
   bool IsSingleElementMask = false;
-  if (BaseTp->getPrimitiveSizeInBits() > 0 &&
-      (BaseTp->getPrimitiveSizeInBits() % 128) == 0 &&
-      BaseTp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
-      Mask.size() == BaseTp->getElementCount().getKnownMinValue()) {
-    unsigned NumLanes = BaseTp->getPrimitiveSizeInBits() / 128;
+  if (SrcTy->getPrimitiveSizeInBits() > 0 &&
+      (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
+      SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
+      Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
+    unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
     unsigned NumEltsPerLane = Mask.size() / NumLanes;
     if ((Mask.size() % NumLanes) == 0) {
       IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
@@ -1614,16 +1624,17 @@ InstructionCost X86TTIImpl::getShuffleCost(
           LT.second.getVectorElementType() ==
               SubLT.second.getVectorElementType() &&
           LT.second.getVectorElementType().getSizeInBits() ==
-              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
+              SrcTy->getElementType()->getPrimitiveSizeInBits()) {
         assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
                "Unexpected number of elements!");
-        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
+        auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
                                            LT.second.getVectorNumElements());
-        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
+        auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
                                            SubLT.second.getVectorNumElements());
         int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
-        InstructionCost ExtractCost = getShuffleCost(
-            TTI::SK_ExtractSubvector, VecTy, {}, CostKind, ExtractIndex, SubTy);
+        InstructionCost ExtractCost =
+            getShuffleCost(TTI::SK_ExtractSubvector, VecTy, VecTy, {}, CostKind,
+                           ExtractIndex, SubTy);
 
         // If the original size is 32-bits or more, we can use pshufd. Otherwise
         // if we have SSSE3 we can use pshufb.
@@ -1646,7 +1657,8 @@ InstructionCost X86TTIImpl::getShuffleCost(
   // but if the destination vector legalizes to the same width as the subvector
   // then the insertion will simplify to a (free) register copy.
   if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
-    int NumElts = LT.second.getVectorNumElements();
+    std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
+    int NumElts = DstLT.second.getVectorNumElements();
     std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
     if (SubLT.second.isVector()) {
       int NumSubElts = SubLT.second.getVectorNumElements();
@@ -1670,7 +1682,7 @@ InstructionCost X86TTIImpl::getShuffleCost(
 
   // Handle some common (illegal) sub-vector types as they are often very cheap
   // to shuffle even on targets without PSHUFB.
-  EVT VT = TLI->getValueType(DL, BaseTp);
+  EVT VT = TLI->getValueType(DL, SrcTy);
   if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
       !ST->hasSSSE3()) {
      static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
@@ -1717,17 +1729,17 @@ InstructionCost X86TTIImpl::getShuffleCost(
     MVT LegalVT = LT.second;
     if (LegalVT.isVector() &&
         LegalVT.getVectorElementType().getSizeInBits() ==
-            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
+            SrcTy->getElementType()->getPrimitiveSizeInBits() &&
         LegalVT.getVectorNumElements() <
-            cast<FixedVectorType>(BaseTp)->getNumElements()) {
-      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
+            cast<FixedVectorType>(SrcTy)->getNumElements()) {
+      unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
       unsigned LegalVTSize = LegalVT.getStoreSize();
       // Number of source vectors after legalization:
       unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
       // Number of destination vectors after legalization:
       InstructionCost NumOfDests = LT.first;
 
-      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
+      auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
                                               LegalVT.getVectorNumElements());
 
       if (!Mask.empty() && NumOfDests.isValid()) {
@@ -1746,7 +1758,7 @@ InstructionCost X86TTIImpl::getShuffleCost(
         // this operation is TTI::TCC_Free.
         NumOfDests =
             getTypeLegalizationCost(
-                FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
+                FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
                 .first;
         unsigned E = NumOfDests.getValue();
         unsigned NormalizedVF =
@@ -1767,8 +1779,9 @@ InstructionCost X86TTIImpl::getShuffleCost(
                 // one.
                 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
                     PrevRegMask != RegMask)
-                  Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
-                                         RegMask, CostKind, 0, nullptr);
+                  Cost +=
+                      getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
+                                     SingleOpTy, RegMask, CostKind, 0, nullptr);
                 else
                   // Just a copy of previous destination register.
                   Cost += TTI::TCC_Basic;
@@ -1785,18 +1798,20 @@ InstructionCost X86TTIImpl::getShuffleCost(
             [this, SingleOpTy, CostKind,
              &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
                     unsigned /*Unused*/, bool /*Unused*/) {
-              Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
-                                     CostKind, 0, nullptr);
+              Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+                                     SingleOpTy, RegMask, CostKind, 0, nullptr);
             });
         return Cost;
       }
 
       InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
       return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
-                                            {}, CostKind, 0, nullptr);
+                                            SingleOpTy, {}, CostKind, 0,
+                                            nullptr);
     }
 
-    return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
+    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                 SubTp);
   }
 
   // If we're just moving a single element around (probably as an alternative to
@@ -2229,7 +2244,7 @@ InstructionCost X86TTIImpl::getShuffleCost(
     if (ST->hasSSE3() && IsLoad)
       if (const auto *Entry =
               CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
-        assert(isLegalBroadcastLoad(BaseTp->getElementType(),
+        assert(isLegalBroadcastLoad(SrcTy->getElementType(),
                                     LT.second.getVectorElementCount()) &&
                "Table entry missing from isLegalBroadcastLoad()");
         return LT.first * Entry->Cost;
@@ -2263,7 +2278,8 @@ InstructionCost X86TTIImpl::getShuffleCost(
         return LT.first * *KindCost;
   }
 
-  return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                               SubTp);
 }
 
 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
@@ -4903,8 +4919,8 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
       EVT VT = TLI->getValueType(DL, Val);
       if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
         SubTy = FixedVectorType::get(ScalarType, SubNumElts);
-      ShuffleCost =
-          getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, {}, CostKind, 0, SubTy);
+      ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
+                                   CostKind, 0, SubTy);
     }
     int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
     return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
@@ -4999,8 +5015,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(
           // FIXME: we don't need to extract if all non-demanded elements
           //        are legalization-inserted padding.
           if (!LaneEltMask.isAllOnes())
-            Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
-                                   I * NumEltsPerLane, LaneTy);
+            Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
+                                   CostKind, I * NumEltsPerLane, LaneTy);
           Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
                                                   /*Extract*/ false, CostKind);
         }
@@ -5017,8 +5033,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(
             if (!AffectedLanes[I] ||
                 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
               continue;
-            Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, {}, CostKind,
-                                   I * NumEltsPerLane, LaneTy);
+            Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, Ty, {},
+                                   CostKind, I * NumEltsPerLane, LaneTy);
           }
         }
       }
@@ -5077,7 +5093,7 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(
               NumEltsPerLane, I * NumEltsPerLane);
           if (LaneEltMask.isZero())
             continue;
-          Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
+          Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {}, CostKind,
                                  I * NumEltsPerLane, LaneTy);
           Cost += BaseT::getScalarizationOverhead(
               LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
@@ -5195,9 +5211,10 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
       DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
   unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
 
-  InstructionCost SingleShuffleCost = getShuffleCost(
-      TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/{}, CostKind,
-      /*Index=*/0, /*SubTp=*/nullptr);
+  InstructionCost SingleShuffleCost =
+      getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
+                     /*Mask=*/{}, CostKind,
+                     /*Index=*/0, /*SubTp=*/nullptr);
   return NumDstVectorsDemanded * SingleShuffleCost;
 }
 
@@ -5338,9 +5355,10 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
         SubVecEltsLeft += CurrVecTy->getNumElements();
         // And that's free only for the 0'th subvector of a legalized vector.
         if (!Is0thSubVec)
-          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
-                                        : TTI::ShuffleKind::SK_ExtractSubvector,
-                                 VTy, {}, CostKind, NumEltDone(), CurrVecTy);
+          Cost +=
+              getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+                                    : TTI::ShuffleKind::SK_ExtractSubvector,
+                             VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
       }
 
       // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
@@ -5416,17 +5434,17 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
   if (VT.isSimple() && Ty != VT.getSimpleVT() &&
       LT.second.getVectorNumElements() == NumElem)
     // Promotion requires extend/truncate for data and a shuffle for mask.
-    Cost +=
-        getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, {}, CostKind, 0,
-                       nullptr) +
-        getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, {}, CostKind, 0, nullptr);
+    Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
+                           0, nullptr) +
+            getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
+                           0, nullptr);
 
   else if (LT.first * Ty.getVectorNumElements() > NumElem) {
     auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
                                            Ty.getVectorNumElements());
     // Expanding requires fill mask with zeroes
-    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, {}, CostKind, 0,
-                           MaskTy);
+    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
+                           CostKind, 0, MaskTy);
   }
 
   // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
@@ -5690,7 +5708,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
     // If we're reducing from 256/512 bits, use an extract_subvector.
     if (Size > 128) {
       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
-      ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
+      ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
                                       CostKind, NumVecElts, SubTy);
       Ty = SubTy;
     } else if (Size == 128) {
@@ -5702,8 +5720,8 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
       else
         ShufTy =
             FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
-      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
-                                      CostKind, 0, nullptr);
+      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
+                                      {}, CostKind, 0, nullptr);
     } else if (Size == 64) {
       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
       FixedVectorType *ShufTy;
@@ -5713,8 +5731,8 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
       else
         ShufTy =
             FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
-      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
-                                      CostKind, 0, nullptr);
+      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
+                                      {}, CostKind, 0, nullptr);
     } else {
       // Reducing from smaller size is a shift by immediate.
       auto *ShiftTy = FixedVectorType::get(
@@ -5872,8 +5890,8 @@ X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
     // If we're reducing from 256/512 bits, use an extract_subvector.
     if (Size > 128) {
       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
-      MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, {}, CostKind,
-                                   NumVecElts, SubTy);
+      MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
+                                   CostKind, NumVecElts, SubTy);
       Ty = SubTy;
     } else if (Size == 128) {
       // Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -5883,7 +5901,7 @@ X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
             FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
       else
         ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
-      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
+      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
                                    CostKind, 0, nullptr);
     } else if (Size == 64) {
       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
@@ -5892,7 +5910,7 @@ X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
         ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
       else
         ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
-      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, {},
+      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
                                    CostKind, 0, nullptr);
     } else {
       // Reducing from smaller size is a shift by immediate.
@@ -6678,8 +6696,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
     TTI::ShuffleKind ShuffleKind =
         (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
 
-    InstructionCost ShuffleCost =
-        getShuffleCost(ShuffleKind, SingleMemOpTy, {}, CostKind, 0, nullptr);
+    InstructionCost ShuffleCost = getShuffleCost(
+        ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
 
     unsigned NumOfLoadsInInterleaveGrp =
         Indices.size() ? Indices.size() : Factor;
@@ -6735,8 +6753,9 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
   // There is no strided stores meanwhile. And store can't be folded in
   // shuffle.
   unsigned NumOfSources = Factor; // The number of values to be merged.
-  InstructionCost ShuffleCost = getShuffleCost(
-      TTI::SK_PermuteTwoSrc, SingleMemOpTy, {}, CostKind, 0, nullptr);
+  InstructionCost ShuffleCost =
+      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
+                     CostKind, 0, nullptr);
   unsigned NumOfShufflesPerStore = NumOfSources - 1;
 
   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 8045f1b1d663..bc06c4746c3c 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -149,9 +149,9 @@ public:
                                   TTI::TargetCostKind CostKind) const override;
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
-                 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
-                 ArrayRef<const Value *> Args = {},
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                  const Instruction *CxtI = nullptr) const override;
   InstructionCost
   getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index ccb68700747b..a712b4632e9a 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1560,9 +1560,9 @@ public:
         InstructionCost EmbedCost(0);
         // Roughly estimate the cost for embedding the columns into a vector.
         for (unsigned I = 1; I < N; ++I)
-          EmbedCost +=
-              TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
-                                 {}, TTI::TCK_RecipThroughput);
+          EmbedCost += TTI.getShuffleCost(
+              TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
+              FixedVectorType::get(EltTy, 1), {}, TTI::TCK_RecipThroughput);
         return EmbedCost;
       }
 
@@ -1582,9 +1582,9 @@ public:
         // vector.
         InstructionCost EmbedCost(0);
         for (unsigned I = 1; I < N; ++I)
-          EmbedCost -=
-              TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
-                                 {}, TTI::TCK_RecipThroughput);
+          EmbedCost -= TTI.getShuffleCost(
+              TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
+              FixedVectorType::get(EltTy, 1), {}, TTI::TCK_RecipThroughput);
         return EmbedCost;
       }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f28c2ce0acc9..f4259d3d6988 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5343,8 +5343,8 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
 
   bool Reverse = ConsecutiveStride < 0;
   if (Reverse)
-    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
-                               CostKind, 0);
+    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+                               VectorTy, {}, CostKind, 0);
   return Cost;
 }
 
@@ -5361,8 +5361,8 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
     return TTI.getAddressComputationCost(ValTy) +
            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                                CostKind) +
-           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
-                              CostKind);
+           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy,
+                              VectorTy, {}, CostKind);
   }
   StoreInst *SI = cast<StoreInst>(I);
 
@@ -5428,8 +5428,8 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
     assert(!Legal->isMaskRequired(I) &&
            "Reverse masked interleaved access not supported.");
     Cost += Group->getNumMembers() *
-            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
-                               CostKind, 0);
+            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+                               VectorTy, {}, CostKind, 0);
   }
   return Cost;
 }
@@ -6171,6 +6171,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       SmallVector<int> Mask(VF.getKnownMinValue());
       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
+                                cast<VectorType>(VectorTy),
                                 cast<VectorType>(VectorTy), Mask, CostKind,
                                 VF.getKnownMinValue() - 1);
     }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1141c1b2babb..cb65c225dcdb 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5717,20 +5717,24 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
                int Index = 0, VectorType *SubTp = nullptr,
                ArrayRef<const Value *> Args = {}) {
+  VectorType *DstTy = Tp;
+  if (!Mask.empty())
+    DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
+
   if (Kind != TTI::SK_PermuteTwoSrc)
-    return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
+    return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
+                              Args);
   int NumSrcElts = Tp->getElementCount().getKnownMinValue();
   int NumSubElts;
   if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                              Mask, NumSrcElts, NumSubElts, Index)) {
     if (Index + NumSubElts > NumSrcElts &&
         Index + NumSrcElts <= static_cast<int>(Mask.size()))
-      return TTI.getShuffleCost(
-          TTI::SK_InsertSubvector,
-          getWidenedType(Tp->getElementType(), Mask.size()), Mask,
-          TTI::TCK_RecipThroughput, Index, Tp);
+      return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
+                                TTI::TCK_RecipThroughput, Index, Tp);
   }
-  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
+  return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
+                            Args);
 }
 
 /// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
@@ -12036,7 +12040,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         if (isa<FixedVectorType>(ScalarTy)) {
           assert(SLPReVec && "FixedVectorType is not expected.");
           return TTI.getShuffleCost(
-              TTI::SK_InsertSubvector, VecTy, {}, CostKind,
+              TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
               std::distance(VL.begin(), It) * getNumElements(ScalarTy),
               cast<FixedVectorType>(ScalarTy));
         }
@@ -22995,7 +22999,10 @@ private:
             unsigned ScalarTyNumElements = VecTy->getNumElements();
             for (unsigned I : seq<unsigned>(ReducedVals.size())) {
               VectorCost += TTI->getShuffleCost(
-                  TTI::SK_PermuteSingleSrc, VectorTy,
+                  TTI::SK_PermuteSingleSrc,
+                  FixedVectorType::get(VecTy->getScalarType(),
+                                       ReducedVals.size()),
+                  VectorTy,
                   createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
               VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
                                                             FMF, CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a48ff168efcc..3d237de5fa8d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -822,6 +822,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
 
     return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
+                                  cast<VectorType>(VectorTy),
                                   cast<VectorType>(VectorTy), Mask,
                                   Ctx.CostKind, VF.getKnownMinValue() - 1);
   }
@@ -2869,9 +2870,9 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
   if (!Reverse)
     return Cost;
 
-  return Cost +=
-         Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
+  return Cost += Ctx.TTI.getShuffleCost(
+             TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
+             cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
 }
 
 void VPWidenLoadRecipe::execute(VPTransformState &State) {
@@ -2985,9 +2986,9 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
   if (!Reverse)
     return Cost;
 
-  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
-                                       0);
+  return Cost + Ctx.TTI.getShuffleCost(
+                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
+                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3098,9 +3099,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
   if (!Reverse)
     return Cost;
 
-  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
-                                       0);
+  return Cost + Ctx.TTI.getShuffleCost(
+                    TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
+                    cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3478,7 +3479,8 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
 
   return Cost + IG->getNumMembers() *
                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                           VectorTy, {}, Ctx.CostKind, 0);
+                                           VectorTy, VectorTy, {}, Ctx.CostKind,
+                                           0);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52cb1dbb33b8..95e1f96c71b4 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -289,8 +289,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
   Mask[0] = OffsetEltIndex;
   if (OffsetEltIndex)
-    NewCost +=
-        TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind);
+    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
+                                  CostKind);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -510,12 +510,12 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                    PoisonMaskElem);
       ShuffleMask[BestInsIndex] = BestExtIndex;
       NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                    VecTy, ShuffleMask, CostKind, 0, nullptr,
-                                    {ConvertToShuffle});
+                                    VecTy, VecTy, ShuffleMask, CostKind, 0,
+                                    nullptr, {ConvertToShuffle});
     } else {
-      NewCost +=
-          TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
-                             {}, CostKind, 0, nullptr, {ConvertToShuffle});
+      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    VecTy, VecTy, {}, CostKind, 0, nullptr,
+                                    {ConvertToShuffle});
     }
   }
 
@@ -712,8 +712,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
 
   InstructionCost NewCost =
       TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask,
-                         CostKind);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy,
+                         Mask, CostKind);
 
   bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
   // If the lengths of the two vectors are not equal,
@@ -723,7 +723,7 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
     SrcMask.assign(NumElts, PoisonMaskElem);
     SrcMask[Index] = Index;
     NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                  SrcVecTy, SrcMask, CostKind);
+                                  VecTy, SrcVecTy, SrcMask, CostKind);
   }
 
   if (NewCost > OldCost)
@@ -871,12 +871,12 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
               : TargetTransformInfo::SK_PermuteTwoSrc;
 
   InstructionCost NewCost =
-      TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CostKind) +
+      TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
       (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
                                      TargetTransformInfo::CastContextHint::None,
                                      CostKind));
   InstructionCost OldCost =
-      TTI.getShuffleCost(SK, SrcTy, Mask, CostKind) +
+      TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
       TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind);
@@ -943,7 +943,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
     Mask.resize(FVTy->getNumElements(), 0);
   InstructionCost SplatCost =
       TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask,
+      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
                          CostKind);
 
   // Calculate the cost of the VP Intrinsic
@@ -1260,14 +1260,13 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
   int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
   int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
-  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
+  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
   InstructionCost NewCost = TTI.getCmpSelInstrCost(
-      CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred,
-      CostKind);
+      CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
   SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
   ShufMask[CheapIndex] = ExpensiveIndex;
   NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
-                                ShufMask, CostKind);
+                                CmpTy, ShufMask, CostKind);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
   NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
   NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
@@ -1783,8 +1782,8 @@ bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
                                       TTI::CastContextHint::None, CostKind);
 
   InstructionCost NewCost = 0;
-  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, MaskTy,
-                                ConcatMask, CostKind);
+  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
+                                MaskTy, ConcatMask, CostKind);
   NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
                                   TTI::CastContextHint::None, CostKind);
   if (Ty != ConcatIntTy)
@@ -1889,26 +1888,28 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   // Try to merge shuffles across the binop if the new shuffles are not costly.
   InstructionCost OldCost =
       TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
-                         OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
+                         BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
   if (Match0)
-    OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
-                                  Mask0, CostKind, 0, nullptr, {Op00, Op01},
-                                  cast<Instruction>(BinOp->getOperand(0)));
+    OldCost += TTI.getShuffleCost(
+        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
+        0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
   if (Match1)
-    OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
-                                  Mask1, CostKind, 0, nullptr, {Op10, Op11},
-                                  cast<Instruction>(BinOp->getOperand(1)));
+    OldCost += TTI.getShuffleCost(
+        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
+        0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
 
   InstructionCost NewCost =
       TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
 
   if (!IsIdentity0)
-    NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
-                                  NewMask0, CostKind, 0, nullptr, {Op00, Op01});
+    NewCost +=
+        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                           Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
   if (!IsIdentity1)
-    NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
-                                  NewMask1, CostKind, 0, nullptr, {Op10, Op11});
+    NewCost +=
+        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                           Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -2002,8 +2003,9 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
   InstructionCost OldCost =
       TTI.getInstructionCost(LHS, CostKind) +
       TTI.getInstructionCost(RHS, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
-                         OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                         BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
+                         &I);
 
   // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
   // where one use shuffles have gotten split across the binop/cmp. These
@@ -2035,16 +2037,18 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
   ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
   ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
 
+  auto *ShuffleCmpTy =
+      FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
   InstructionCost NewCost =
-      TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
-      TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
+      TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
+                         nullptr, {X, Z}) +
+      TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
+                         nullptr, {Y, W});
 
   if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
     NewCost +=
         TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
   } else {
-    auto *ShuffleCmpTy =
-        FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
     NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
                                       ShuffleDstTy, PredLHS, CostKind);
   }
@@ -2112,15 +2116,17 @@ bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
       SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
   OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
-  OldCost += TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr,
-                                {I.getOperand(0), I.getOperand(1)}, &I);
+  OldCost +=
+      TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
+                         {I.getOperand(0), I.getOperand(1)}, &I);
 
-  InstructionCost NewCost =
-      TTI.getShuffleCost(SK, C1VecTy, Mask, CostKind, 0, nullptr, {C1, C2});
-  NewCost +=
-      TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr, {T1, T2});
-  NewCost +=
-      TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr, {F1, F2});
+  InstructionCost NewCost = TTI.getShuffleCost(
+      SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
+      Mask, CostKind, 0, nullptr, {C1, C2});
+  NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
+                                nullptr, {T1, T2});
+  NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
+                                nullptr, {F1, F2});
   auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
       toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
   NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
@@ -2220,11 +2226,12 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
                            TTI::CastContextHint::None, CostKind);
   InstructionCost OldCost = CostC0 + CostC1;
   OldCost +=
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy,
-                         OldMask, CostKind, 0, nullptr, {}, &I);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                         CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
 
-  InstructionCost NewCost = TTI.getShuffleCost(
-      TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, NewMask, CostKind);
+  InstructionCost NewCost =
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy,
+                         CastSrcTy, NewMask, CostKind);
   NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
                                   TTI::CastContextHint::None, CostKind);
   if (!C0->hasOneUse())
@@ -2363,8 +2370,9 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
   TargetTransformInfo::ShuffleKind SK =
       IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
               : TargetTransformInfo::SK_PermuteTwoSrc;
-  InstructionCost NewCost = TTI.getShuffleCost(
-      SK, ShuffleSrcTy, NewMask, CostKind, 0, nullptr, {NewX, NewY});
+  InstructionCost NewCost =
+      TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
+                         nullptr, {NewX, NewY});
   if (!OuterV0->hasOneUse())
     NewCost += InnerCost0;
   if (!OuterV1->hasOneUse())
@@ -2415,21 +2423,23 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
   InstructionCost OldCost =
       TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
       TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask,
-                         CostKind, 0, nullptr, {II0, II1}, &I);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                         II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
 
   SmallVector<Type *> NewArgsTy;
   InstructionCost NewCost = 0;
-  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
+  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
       NewArgsTy.push_back(II0->getArgOperand(I)->getType());
     } else {
       auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
-      NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
-                                               ShuffleDstTy->getNumElements()));
+      auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
+                                         ShuffleDstTy->getNumElements());
+      NewArgsTy.push_back(ArgTy);
       NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
-                                    VecTy, OldMask, CostKind);
+                                    ArgTy, VecTy, OldMask, CostKind);
     }
+  }
   IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
   NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
 
@@ -2508,7 +2518,9 @@ static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
   // during legalization.
   SmallVector<int, 16> ConcatMask(NumElts * 2);
   std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
-  if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, ConcatMask, CostKind) != 0)
+  if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
+                         FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
+                         Ty, ConcatMask, CostKind) != 0)
     return false;
 
   unsigned NumSlices = Item.size() / NumElts;
@@ -2877,21 +2889,15 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   SmallVector<int> ConcatMask;
   Shuffle->getShuffleMask(ConcatMask);
   sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
-  // In the case of a truncating shuffle it's possible for the mask
-  // to have an index greater than the size of the resulting vector.
-  // This requires special handling.
-  bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
   bool UsesSecondVec =
       any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
 
-  FixedVectorType *VecTyForCost =
-      (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
   InstructionCost OldCost = TTI.getShuffleCost(
-      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
-      VecTyForCost, Shuffle->getShuffleMask(), CostKind);
+      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+      ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
   InstructionCost NewCost = TTI.getShuffleCost(
-      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
-      VecTyForCost, ConcatMask, CostKind);
+      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+      ShuffleInputType, ConcatMask, CostKind);
 
   LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
                     << "\n");
@@ -3205,10 +3211,11 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
     return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
                                       ? TTI::SK_PermuteSingleSrc
                                       : TTI::SK_PermuteTwoSrc,
-                                  VT, SV->getShuffleMask(), CostKind);
+                                  VT, VT, SV->getShuffleMask(), CostKind);
   };
   auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
-    return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask, CostKind);
+    return C +
+           TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
   };
 
   // Get the costs of the shuffles + binops before and after with the new
@@ -3446,8 +3453,8 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
     // Ignore 'free' identity insertion shuffle.
     // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
     if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
-      NewCost += TTI.getShuffleCost(SK, DstVecTy, Mask, CostKind, 0, nullptr,
-                                    {DstVec, SrcVec});
+      NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
+                                    nullptr, {DstVec, SrcVec});
   } else {
     // When creating length-changing-vector, always create with a Mask whose
     // first element has an ExtIdx, so that the first element of the vector
@@ -3459,8 +3466,8 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
       ExtToVecMask[0] = ExtIdx;
     // Add cost for expanding or narrowing
     NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                 SrcVecTy, ExtToVecMask, CostKind);
-    NewCost += TTI.getShuffleCost(SK, DstVecTy, Mask, CostKind);
+                                 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
+    NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
   }
 
   if (!Ext->hasOneUse())

From 831fcb5e91a6dce6260bb177eb15a5036f6e6804 Mon Sep 17 00:00:00 2001
From: Hui <hui.xie1990@gmail.com>
Date: Sat, 21 Jun 2025 13:41:32 +0100
Subject: [PATCH 1159/1322] [libc++] constexpr flat_map (#137453)

Fixes #128673
---
 libcxx/docs/ReleaseNotes/21.rst               |   1 +
 libcxx/docs/Status/Cxx2cPapers.csv            |   2 +-
 libcxx/include/__flat_map/flat_map.h          | 401 +++++++++++-------
 .../include/__flat_map/key_value_iterator.h   |  58 ++-
 libcxx/include/__flat_map/utils.h             |   4 +-
 libcxx/include/module.modulemap.in            |   3 +
 libcxx/test/std/containers/Emplaceable.h      |  16 +-
 .../flat.map/flat.map.access/at.pass.cpp      |  41 +-
 .../flat.map.access/at_transparent.pass.cpp   |  40 +-
 .../flat.map.access/index_key.pass.cpp        |  24 +-
 .../flat.map.access/index_rv_key.pass.cpp     |  23 +-
 .../index_transparent.pass.cpp                |  25 +-
 .../flat.map/flat.map.capacity/empty.pass.cpp |  22 +-
 .../flat.map.capacity/max_size.pass.cpp       |  12 +-
 .../flat.map/flat.map.capacity/size.pass.cpp  |  23 +-
 .../flat.map/flat.map.cons/alloc.pass.cpp     |  46 +-
 .../assign_initializer_list.pass.cpp          |  22 +-
 .../flat.map/flat.map.cons/compare.pass.cpp   | 130 ++++--
 .../flat.map.cons/containers.pass.cpp         | 269 +++++++-----
 .../flat.map/flat.map.cons/copy.pass.cpp      |  32 +-
 .../flat.map.cons/copy_alloc.pass.cpp         |  60 ++-
 .../flat.map.cons/copy_assign.pass.cpp        |  35 +-
 .../flat.map/flat.map.cons/default.pass.cpp   |  40 +-
 .../flat.map.cons/default_noexcept.pass.cpp   |  16 +-
 .../flat.map.cons/dtor_noexcept.pass.cpp      |  41 +-
 .../flat.map.cons/initializer_list.pass.cpp   | 159 ++++---
 .../flat.map/flat.map.cons/iter_iter.pass.cpp | 259 +++++++----
 .../flat.map/flat.map.cons/move.pass.cpp      |  35 +-
 .../flat.map.cons/move_alloc.pass.cpp         |  64 ++-
 .../flat.map.cons/move_assign.pass.cpp        |  31 +-
 .../flat.map.cons/move_assign_clears.pass.cpp |  43 +-
 ... => move_assign_noexcept.compile.pass.cpp} |   4 +-
 .../flat.map/flat.map.cons/range.pass.cpp     | 287 ++++++++-----
 .../flat.map.cons/sorted_container.pass.cpp   | 220 +++++-----
 .../sorted_initializer_list.pass.cpp          | 164 +++----
 .../flat.map.cons/sorted_iter_iter.pass.cpp   | 255 ++++++-----
 .../flat.map.erasure/erase_if.pass.cpp        |  28 +-
 .../flat.map.iterators/iterator.pass.cpp      |  20 +-
 .../iterator_comparison.pass.cpp              |  20 +-
 .../reverse_iterator.pass.cpp                 | 102 +++--
 .../flat.map.modifiers/clear.pass.cpp         |  20 +-
 .../flat.map.modifiers/emplace.pass.cpp       |  26 +-
 .../flat.map.modifiers/emplace_hint.pass.cpp  |  27 +-
 .../flat.map.modifiers/erase_iter.pass.cpp    |  23 +-
 .../erase_iter_iter.pass.cpp                  |  24 +-
 .../flat.map.modifiers/erase_key.pass.cpp     |  23 +-
 .../erase_key_transparent.pass.cpp            |  33 +-
 .../flat.map.modifiers/extract.pass.cpp       |  23 +-
 .../flat.map.modifiers/insert_cv.pass.cpp     |  23 +-
 .../insert_initializer_list.pass.cpp          |  40 +-
 .../insert_iter_cv.pass.cpp                   |  23 +-
 .../insert_iter_iter.pass.cpp                 |  52 ++-
 .../insert_iter_rv.pass.cpp                   |  26 +-
 .../insert_or_assign.pass.cpp                 |  24 +-
 .../insert_or_assign_transparent.pass.cpp     |  52 ++-
 .../flat.map.modifiers/insert_range.pass.cpp  |  27 +-
 .../flat.map.modifiers/insert_rv.pass.cpp     |  24 +-
 .../insert_sorted_initializer_list.pass.cpp   |  39 +-
 .../insert_sorted_iter_iter.pass.cpp          |  22 +-
 .../insert_transparent.pass.cpp               | 161 +++----
 .../flat.map.modifiers/replace.pass.cpp       |  23 +-
 .../flat.map.modifiers/swap_free.pass.cpp     |  20 +-
 .../flat.map.modifiers/swap_member.pass.cpp   |  20 +-
 .../flat.map.modifiers/try_emplace.pass.cpp   |  96 +++--
 .../try_emplace_transparent.pass.cpp          |  56 ++-
 .../flat.map/flat.map.observers/comp.pass.cpp |  17 +-
 .../flat.map.observers/keys_values.pass.cpp   |  20 +-
 .../flat.map.operations/contains.pass.cpp     |  20 +-
 .../contains_transparent.pass.cpp             |  21 +-
 .../flat.map.operations/count.pass.cpp        |  20 +-
 .../count_transparent.pass.cpp                |  20 +-
 .../flat.map.operations/equal_range.pass.cpp  |  20 +-
 .../equal_range_transparent.pass.cpp          |  20 +-
 .../flat.map.operations/find.pass.cpp         |  20 +-
 .../find_transparent.pass.cpp                 |  20 +-
 .../flat.map.operations/lower_bound.pass.cpp  |  20 +-
 .../lower_bound_transparent.pass.cpp          |  20 +-
 .../flat.map.operations/upper_bound.pass.cpp  |  20 +-
 .../upper_bound_transparent.pass.cpp          |  20 +-
 .../container.adaptors/flat.map/helpers.h     |  11 +-
 .../flat.map/incomplete_type.pass.cpp         |  16 +-
 .../flat.map/op_compare.pass.cpp              |  23 +-
 .../container.adaptors/flat_helpers.h         |  37 +-
 libcxx/test/std/containers/test_compare.h     |  18 +-
 libcxx/test/support/MinSequenceContainer.h    |  49 ++-
 85 files changed, 2954 insertions(+), 1482 deletions(-)
 rename libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/{move_assign_noexcept.pass.cpp => move_assign_noexcept.compile.pass.cpp} (99%)

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 7e8570691200..c52bc54f412b 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -47,6 +47,7 @@ Implemented Papers
 - P1222R4: A Standard ``flat_set`` (`Github <https://github.com/llvm/llvm-project/issues/105193>`__)
 - P2897R7: ``aligned_accessor``: An mdspan accessor expressing pointer over-alignment (`Github <https://github.com/llvm/llvm-project/issues/118372>`__)
 - P3247R2: Deprecate the notion of trivial types (`Github <https://github.com/llvm/llvm-project/issues/118387>`__)
+- P3372R3: ``constexpr`` containers and adaptors (`Github <https://github.com/llvm/llvm-project/issues/128673>`__) (Only ``constexpr flat_map`` is implemented)
 - P2441R2: ``views::join_with`` (`Github <https://github.com/llvm/llvm-project/issues/105185>`__)
 - P2711R1: Making multi-param constructors of ``views`` ``explicit`` (`Github <https://github.com/llvm/llvm-project/issues/105252>`__)
 - P2770R0: Stashing stashing ``iterators`` for proper flattening (`Github <https://github.com/llvm/llvm-project/issues/105250>`__)
diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index 8a0417e120d7..2eb192106977 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -104,7 +104,7 @@
 "`P3137R3 <https://wg21.link/P3137R3>`__","``views::to_input``","2025-02 (Hagenberg)","","",""
 "`P0472R3 <https://wg21.link/P0472R3>`__","Put ``std::monostate`` in ``<utility>``","2025-02 (Hagenberg)","|Complete|","21",""
 "`P3349R1 <https://wg21.link/P3349R1>`__","Converting contiguous iterators to pointers","2025-02 (Hagenberg)","","",""
-"`P3372R3 <https://wg21.link/P3372R3>`__","constexpr containers and adaptors","2025-02 (Hagenberg)","","",""
+"`P3372R3 <https://wg21.link/P3372R3>`__","constexpr containers and adaptors","2025-02 (Hagenberg)","|In Progress|","",""
 "`P3378R2 <https://wg21.link/P3378R2>`__","constexpr exception types","2025-02 (Hagenberg)","","",""
 "`P3441R2 <https://wg21.link/P3441R2>`__","Rename ``simd_split`` to ``simd_chunk``","2025-02 (Hagenberg)","","",""
 "`P3287R3 <https://wg21.link/P3287R3>`__","Exploration of namespaces for ``std::simd``","2025-02 (Hagenberg)","","",""
diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h
index 8f01882934b7..bf193f6d3c62 100644
--- a/libcxx/include/__flat_map/flat_map.h
+++ b/libcxx/include/__flat_map/flat_map.h
@@ -114,11 +114,12 @@ public:
   class value_compare {
   private:
     _LIBCPP_NO_UNIQUE_ADDRESS key_compare __comp_;
-    _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : __comp_(__c) {}
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare(key_compare __c) : __comp_(__c) {}
     friend flat_map;
 
   public:
-    _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const {
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool
+    operator()(const_reference __x, const_reference __y) const {
       return __comp_(__x.first, __y.first);
     }
   };
@@ -137,14 +138,14 @@ private:
 
 public:
   // [flat.map.cons], construct/copy/destroy
-  _LIBCPP_HIDE_FROM_ABI flat_map() noexcept(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map() noexcept(
       is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_MappedContainer> &&
       is_nothrow_default_constructible_v<_Compare>)
       : __containers_(), __compare_() {}
 
   _LIBCPP_HIDE_FROM_ABI flat_map(const flat_map&) = default;
 
-  _LIBCPP_HIDE_FROM_ABI flat_map(flat_map&& __other) noexcept(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(flat_map&& __other) noexcept(
       is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_MappedContainer> &&
       is_nothrow_move_constructible_v<_Compare>)
 #  if _LIBCPP_HAS_EXCEPTIONS
@@ -165,7 +166,7 @@ public:
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI flat_map(const flat_map& __other, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(const flat_map& __other, const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_tag{},
                  __alloc,
                  __other.__containers_.keys,
@@ -174,7 +175,7 @@ public:
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI flat_map(flat_map&& __other, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(flat_map&& __other, const _Allocator& __alloc)
 #  if _LIBCPP_HAS_EXCEPTIONS
       try
 #  endif // _LIBCPP_HAS_EXCEPTIONS
@@ -191,7 +192,7 @@ public:
 #  endif // _LIBCPP_HAS_EXCEPTIONS
   }
 
-  _LIBCPP_HIDE_FROM_ABI flat_map(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(
       key_container_type __key_cont, mapped_container_type __mapped_cont, const key_compare& __comp = key_compare())
       : __containers_{.keys = std::move(__key_cont), .values = std::move(__mapped_cont)}, __compare_(__comp) {
     _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(),
@@ -201,7 +202,7 @@ public:
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(const key_container_type& __key_cont, const mapped_container_type& __mapped_cont, const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont) {
     _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(),
@@ -211,7 +212,7 @@ public:
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(const key_container_type& __key_cont,
            const mapped_container_type& __mapped_cont,
            const key_compare& __comp,
@@ -222,7 +223,7 @@ public:
     __sort_and_unique();
   }
 
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(sorted_unique_t,
            key_container_type __key_cont,
            mapped_container_type __mapped_cont,
@@ -236,7 +237,7 @@ public:
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(sorted_unique_t,
            const key_container_type& __key_cont,
            const mapped_container_type& __mapped_cont,
@@ -250,12 +251,12 @@ public:
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI
-  flat_map(sorted_unique_t,
-           const key_container_type& __key_cont,
-           const mapped_container_type& __mapped_cont,
-           const key_compare& __comp,
-           const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(
+      sorted_unique_t,
+      const key_container_type& __key_cont,
+      const mapped_container_type& __mapped_cont,
+      const key_compare& __comp,
+      const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont, __comp) {
     _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(),
                                      "flat_map keys and mapped containers have different size");
@@ -263,21 +264,22 @@ public:
         __is_sorted_and_unique(__containers_.keys), "Either the key container is not sorted or it contains duplicates");
   }
 
-  _LIBCPP_HIDE_FROM_ABI explicit flat_map(const key_compare& __comp) : __containers_(), __compare_(__comp) {}
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_map(const key_compare& __comp)
+      : __containers_(), __compare_(__comp) {}
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI flat_map(const key_compare& __comp, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(const key_compare& __comp, const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) {}
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI explicit flat_map(const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_map(const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_empty_tag{}, __alloc) {}
 
   template <class _InputIterator>
     requires __has_input_iterator_category<_InputIterator>::value
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare())
       : __containers_(), __compare_(__comp) {
     insert(__first, __last);
@@ -285,7 +287,7 @@ public:
 
   template <class _InputIterator, class _Allocator>
     requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>)
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) {
     insert(__first, __last);
@@ -293,99 +295,105 @@ public:
 
   template <class _InputIterator, class _Allocator>
     requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>)
-  _LIBCPP_HIDE_FROM_ABI flat_map(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_map(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_empty_tag{}, __alloc) {
     insert(__first, __last);
   }
 
   template <_ContainerCompatibleRange<value_type> _Range>
-  _LIBCPP_HIDE_FROM_ABI flat_map(from_range_t __fr, _Range&& __rg)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(from_range_t __fr, _Range&& __rg)
       : flat_map(__fr, std::forward<_Range>(__rg), key_compare()) {}
 
   template <_ContainerCompatibleRange<value_type> _Range, class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI flat_map(from_range_t, _Range&& __rg, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(from_range_t, _Range&& __rg, const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_empty_tag{}, __alloc) {
     insert_range(std::forward<_Range>(__rg));
   }
 
   template <_ContainerCompatibleRange<value_type> _Range>
-  _LIBCPP_HIDE_FROM_ABI flat_map(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_map(__comp) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(from_range_t, _Range&& __rg, const key_compare& __comp)
+      : flat_map(__comp) {
     insert_range(std::forward<_Range>(__rg));
   }
 
   template <_ContainerCompatibleRange<value_type> _Range, class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI flat_map(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_map(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) {
     insert_range(std::forward<_Range>(__rg));
   }
 
   template <class _InputIterator>
     requires __has_input_iterator_category<_InputIterator>::value
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(sorted_unique_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare())
       : __containers_(), __compare_(__comp) {
     insert(sorted_unique, __first, __last);
   }
   template <class _InputIterator, class _Allocator>
     requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>)
-  _LIBCPP_HIDE_FROM_ABI
-  flat_map(sorted_unique_t,
-           _InputIterator __first,
-           _InputIterator __last,
-           const key_compare& __comp,
-           const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(
+      sorted_unique_t,
+      _InputIterator __first,
+      _InputIterator __last,
+      const key_compare& __comp,
+      const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) {
     insert(sorted_unique, __first, __last);
   }
 
   template <class _InputIterator, class _Allocator>
     requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>)
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(sorted_unique_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc)
       : flat_map(__ctor_uses_allocator_empty_tag{}, __alloc) {
     insert(sorted_unique, __first, __last);
   }
 
-  _LIBCPP_HIDE_FROM_ABI flat_map(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_map(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
       : flat_map(__il.begin(), __il.end(), __comp) {}
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc)
       : flat_map(__il.begin(), __il.end(), __comp, __alloc) {}
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI flat_map(initializer_list<value_type> __il, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_map(initializer_list<value_type> __il, const _Allocator& __alloc)
       : flat_map(__il.begin(), __il.end(), __alloc) {}
 
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(sorted_unique_t, initializer_list<value_type> __il, const key_compare& __comp = key_compare())
       : flat_map(sorted_unique, __il.begin(), __il.end(), __comp) {}
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
   flat_map(sorted_unique_t, initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc)
       : flat_map(sorted_unique, __il.begin(), __il.end(), __comp, __alloc) {}
 
   template <class _Allocator>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI flat_map(sorted_unique_t, initializer_list<value_type> __il, const _Allocator& __alloc)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_map(sorted_unique_t, initializer_list<value_type> __il, const _Allocator& __alloc)
       : flat_map(sorted_unique, __il.begin(), __il.end(), __alloc) {}
 
-  _LIBCPP_HIDE_FROM_ABI flat_map& operator=(initializer_list<value_type> __il) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map& operator=(initializer_list<value_type> __il) {
     clear();
     insert(__il);
     return *this;
   }
 
-  _LIBCPP_HIDE_FROM_ABI flat_map& operator=(const flat_map&) = default;
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map& operator=(const flat_map&) = default;
 
-  _LIBCPP_HIDE_FROM_ABI flat_map& operator=(flat_map&& __other) noexcept(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map& operator=(flat_map&& __other) noexcept(
       is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_MappedContainer> &&
       is_nothrow_move_assignable_v<_Compare>) {
     // No matter what happens, we always want to clear the other container before returning
@@ -402,49 +410,65 @@ public:
   }
 
   // iterators
-  _LIBCPP_HIDE_FROM_ABI iterator begin() noexcept {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator begin() noexcept {
     return iterator(__containers_.keys.begin(), __containers_.values.begin());
   }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator begin() const noexcept {
     return const_iterator(__containers_.keys.begin(), __containers_.values.begin());
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator end() noexcept {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator end() noexcept {
     return iterator(__containers_.keys.end(), __containers_.values.end());
   }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator end() const noexcept {
     return const_iterator(__containers_.keys.end(), __containers_.values.end());
   }
 
-  _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); }
-  _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rbegin() noexcept {
+    return reverse_iterator(end());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rbegin() const noexcept {
+    return const_reverse_iterator(end());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rend() noexcept {
+    return reverse_iterator(begin());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rend() const noexcept {
+    return const_reverse_iterator(begin());
+  }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); }
-  _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cbegin() const noexcept { return begin(); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cend() const noexcept { return end(); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crbegin() const noexcept {
+    return const_reverse_iterator(end());
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crend() const noexcept {
+    return const_reverse_iterator(begin());
+  }
 
   // [flat.map.capacity], capacity
-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __containers_.keys.empty(); }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool empty() const noexcept {
+    return __containers_.keys.empty();
+  }
 
-  _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __containers_.keys.size(); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type size() const noexcept {
+    return __containers_.keys.size();
+  }
 
-  _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type max_size() const noexcept {
     return std::min<size_type>(__containers_.keys.max_size(), __containers_.values.max_size());
   }
 
   // [flat.map.access], element access
-  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __x)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](const key_type& __x)
     requires is_constructible_v<mapped_type>
   {
     return try_emplace(__x).first->second;
   }
 
-  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __x)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](key_type&& __x)
     requires is_constructible_v<mapped_type>
   {
     return try_emplace(std::move(__x)).first->second;
@@ -453,11 +477,11 @@ public:
   template <class _Kp>
     requires(__is_compare_transparent && is_constructible_v<key_type, _Kp> && is_constructible_v<mapped_type> &&
              !is_convertible_v<_Kp &&, const_iterator> && !is_convertible_v<_Kp &&, iterator>)
-  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](_Kp&& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](_Kp&& __x) {
     return try_emplace(std::forward<_Kp>(__x)).first->second;
   }
 
-  _LIBCPP_HIDE_FROM_ABI mapped_type& at(const key_type& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& at(const key_type& __x) {
     auto __it = find(__x);
     if (__it == end()) {
       std::__throw_out_of_range("flat_map::at(const key_type&): Key does not exist");
@@ -465,7 +489,7 @@ public:
     return __it->second;
   }
 
-  _LIBCPP_HIDE_FROM_ABI const mapped_type& at(const key_type& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const mapped_type& at(const key_type& __x) const {
     auto __it = find(__x);
     if (__it == end()) {
       std::__throw_out_of_range("flat_map::at(const key_type&) const: Key does not exist");
@@ -475,7 +499,7 @@ public:
 
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI mapped_type& at(const _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& at(const _Kp& __x) {
     auto __it = find(__x);
     if (__it == end()) {
       std::__throw_out_of_range("flat_map::at(const K&): Key does not exist");
@@ -485,7 +509,7 @@ public:
 
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI const mapped_type& at(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const mapped_type& at(const _Kp& __x) const {
     auto __it = find(__x);
     if (__it == end()) {
       std::__throw_out_of_range("flat_map::at(const K&) const: Key does not exist");
@@ -496,45 +520,49 @@ public:
   // [flat.map.modifiers], modifiers
   template <class... _Args>
     requires is_constructible_v<pair<key_type, mapped_type>, _Args...>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> emplace(_Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool> emplace(_Args&&... __args) {
     std::pair<key_type, mapped_type> __pair(std::forward<_Args>(__args)...);
     return __try_emplace(std::move(__pair.first), std::move(__pair.second));
   }
 
   template <class... _Args>
     requires is_constructible_v<pair<key_type, mapped_type>, _Args...>
-  _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace_hint(const_iterator __hint, _Args&&... __args) {
     std::pair<key_type, mapped_type> __pair(std::forward<_Args>(__args)...);
     return __try_emplace_hint(__hint, std::move(__pair.first), std::move(__pair.second)).first;
   }
 
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> insert(const value_type& __x) { return emplace(__x); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool> insert(const value_type& __x) {
+    return emplace(__x);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> insert(value_type&& __x) { return emplace(std::move(__x)); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool> insert(value_type&& __x) {
+    return emplace(std::move(__x));
+  }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, const value_type& __x) {
     return emplace_hint(__hint, __x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, value_type&& __x) {
     return emplace_hint(__hint, std::move(__x));
   }
 
   template <class _PairLike>
     requires is_constructible_v<pair<key_type, mapped_type>, _PairLike>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> insert(_PairLike&& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool> insert(_PairLike&& __x) {
     return emplace(std::forward<_PairLike>(__x));
   }
 
   template <class _PairLike>
     requires is_constructible_v<pair<key_type, mapped_type>, _PairLike>
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, _PairLike&& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, _PairLike&& __x) {
     return emplace_hint(__hint, std::forward<_PairLike>(__x));
   }
 
   template <class _InputIterator>
     requires __has_input_iterator_category<_InputIterator>::value
-  _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(_InputIterator __first, _InputIterator __last) {
     if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) {
       __reserve(__last - __first);
     }
@@ -543,7 +571,8 @@ public:
 
   template <class _InputIterator>
     requires __has_input_iterator_category<_InputIterator>::value
-  _LIBCPP_HIDE_FROM_ABI void insert(sorted_unique_t, _InputIterator __first, _InputIterator __last) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void
+  insert(sorted_unique_t, _InputIterator __first, _InputIterator __last) {
     if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) {
       __reserve(__last - __first);
     }
@@ -552,7 +581,7 @@ public:
   }
 
   template <_ContainerCompatibleRange<value_type> _Range>
-  _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert_range(_Range&& __range) {
     if constexpr (ranges::sized_range<_Range>) {
       __reserve(ranges::size(__range));
     }
@@ -560,19 +589,22 @@ public:
     __append_sort_merge_unique</*WasSorted = */ false>(ranges::begin(__range), ranges::end(__range));
   }
 
-  _LIBCPP_HIDE_FROM_ABI void insert(initializer_list<value_type> __il) { insert(__il.begin(), __il.end()); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(initializer_list<value_type> __il) {
+    insert(__il.begin(), __il.end());
+  }
 
-  _LIBCPP_HIDE_FROM_ABI void insert(sorted_unique_t, initializer_list<value_type> __il) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(sorted_unique_t, initializer_list<value_type> __il) {
     insert(sorted_unique, __il.begin(), __il.end());
   }
 
-  _LIBCPP_HIDE_FROM_ABI containers extract() && {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 containers extract() && {
     auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; });
     auto __ret   = std::move(__containers_);
     return __ret;
   }
 
-  _LIBCPP_HIDE_FROM_ABI void replace(key_container_type&& __key_cont, mapped_container_type&& __mapped_cont) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void
+  replace(key_container_type&& __key_cont, mapped_container_type&& __mapped_cont) {
     _LIBCPP_ASSERT_VALID_INPUT_RANGE(
         __key_cont.size() == __mapped_cont.size(), "flat_map keys and mapped containers have different size");
 
@@ -586,13 +618,15 @@ public:
 
   template <class... _Args>
     requires is_constructible_v<mapped_type, _Args...>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> try_emplace(const key_type& __key, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool>
+  try_emplace(const key_type& __key, _Args&&... __args) {
     return __try_emplace(__key, std::forward<_Args>(__args)...);
   }
 
   template <class... _Args>
     requires is_constructible_v<mapped_type, _Args...>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> try_emplace(key_type&& __key, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool>
+  try_emplace(key_type&& __key, _Args&&... __args) {
     return __try_emplace(std::move(__key), std::forward<_Args>(__args)...);
   }
 
@@ -600,75 +634,84 @@ public:
     requires(__is_compare_transparent && is_constructible_v<key_type, _Kp> &&
              is_constructible_v<mapped_type, _Args...> && !is_convertible_v<_Kp &&, const_iterator> &&
              !is_convertible_v<_Kp &&, iterator>)
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> try_emplace(_Kp&& __key, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool> try_emplace(_Kp&& __key, _Args&&... __args) {
     return __try_emplace(std::forward<_Kp>(__key), std::forward<_Args>(__args)...);
   }
 
   template <class... _Args>
     requires is_constructible_v<mapped_type, _Args...>
-  _LIBCPP_HIDE_FROM_ABI iterator try_emplace(const_iterator __hint, const key_type& __key, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator
+  try_emplace(const_iterator __hint, const key_type& __key, _Args&&... __args) {
     return __try_emplace_hint(__hint, __key, std::forward<_Args>(__args)...).first;
   }
 
   template <class... _Args>
     requires is_constructible_v<mapped_type, _Args...>
-  _LIBCPP_HIDE_FROM_ABI iterator try_emplace(const_iterator __hint, key_type&& __key, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator
+  try_emplace(const_iterator __hint, key_type&& __key, _Args&&... __args) {
     return __try_emplace_hint(__hint, std::move(__key), std::forward<_Args>(__args)...).first;
   }
 
   template <class _Kp, class... _Args>
     requires __is_compare_transparent && is_constructible_v<key_type, _Kp> && is_constructible_v<mapped_type, _Args...>
-  _LIBCPP_HIDE_FROM_ABI iterator try_emplace(const_iterator __hint, _Kp&& __key, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator
+  try_emplace(const_iterator __hint, _Kp&& __key, _Args&&... __args) {
     return __try_emplace_hint(__hint, std::forward<_Kp>(__key), std::forward<_Args>(__args)...).first;
   }
 
   template <class _Mapped>
     requires is_assignable_v<mapped_type&, _Mapped> && is_constructible_v<mapped_type, _Mapped>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> insert_or_assign(const key_type& __key, _Mapped&& __obj) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool>
+  insert_or_assign(const key_type& __key, _Mapped&& __obj) {
     return __insert_or_assign(__key, std::forward<_Mapped>(__obj));
   }
 
   template <class _Mapped>
     requires is_assignable_v<mapped_type&, _Mapped> && is_constructible_v<mapped_type, _Mapped>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> insert_or_assign(key_type&& __key, _Mapped&& __obj) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool>
+  insert_or_assign(key_type&& __key, _Mapped&& __obj) {
     return __insert_or_assign(std::move(__key), std::forward<_Mapped>(__obj));
   }
 
   template <class _Kp, class _Mapped>
     requires __is_compare_transparent && is_constructible_v<key_type, _Kp> && is_assignable_v<mapped_type&, _Mapped> &&
              is_constructible_v<mapped_type, _Mapped>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> insert_or_assign(_Kp&& __key, _Mapped&& __obj) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool>
+  insert_or_assign(_Kp&& __key, _Mapped&& __obj) {
     return __insert_or_assign(std::forward<_Kp>(__key), std::forward<_Mapped>(__obj));
   }
 
   template <class _Mapped>
     requires is_assignable_v<mapped_type&, _Mapped> && is_constructible_v<mapped_type, _Mapped>
-  _LIBCPP_HIDE_FROM_ABI iterator insert_or_assign(const_iterator __hint, const key_type& __key, _Mapped&& __obj) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator
+  insert_or_assign(const_iterator __hint, const key_type& __key, _Mapped&& __obj) {
     return __insert_or_assign(__hint, __key, std::forward<_Mapped>(__obj));
   }
 
   template <class _Mapped>
     requires is_assignable_v<mapped_type&, _Mapped> && is_constructible_v<mapped_type, _Mapped>
-  _LIBCPP_HIDE_FROM_ABI iterator insert_or_assign(const_iterator __hint, key_type&& __key, _Mapped&& __obj) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator
+  insert_or_assign(const_iterator __hint, key_type&& __key, _Mapped&& __obj) {
     return __insert_or_assign(__hint, std::move(__key), std::forward<_Mapped>(__obj));
   }
 
   template <class _Kp, class _Mapped>
     requires __is_compare_transparent && is_constructible_v<key_type, _Kp> && is_assignable_v<mapped_type&, _Mapped> &&
              is_constructible_v<mapped_type, _Mapped>
-  _LIBCPP_HIDE_FROM_ABI iterator insert_or_assign(const_iterator __hint, _Kp&& __key, _Mapped&& __obj) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator
+  insert_or_assign(const_iterator __hint, _Kp&& __key, _Mapped&& __obj) {
     return __insert_or_assign(__hint, std::forward<_Kp>(__key), std::forward<_Mapped>(__obj));
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(iterator __position) {
     return __erase(__position.__key_iter_, __position.__mapped_iter_);
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __position) {
     return __erase(__position.__key_iter_, __position.__mapped_iter_);
   }
 
-  _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(const key_type& __x) {
     auto __iter = find(__x);
     if (__iter != end()) {
       erase(__iter);
@@ -680,14 +723,14 @@ public:
   template <class _Kp>
     requires(__is_compare_transparent && !is_convertible_v<_Kp &&, iterator> &&
              !is_convertible_v<_Kp &&, const_iterator>)
-  _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(_Kp&& __x) {
     auto [__first, __last] = equal_range(__x);
     auto __res             = __last - __first;
     erase(__first, __last);
     return __res;
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __first, const_iterator __last) {
     auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
     auto __key_it     = __containers_.keys.erase(__first.__key_iter_, __last.__key_iter_);
     auto __mapped_it  = __containers_.values.erase(__first.__mapped_iter_, __last.__mapped_iter_);
@@ -695,7 +738,7 @@ public:
     return iterator(std::move(__key_it), std::move(__mapped_it));
   }
 
-  _LIBCPP_HIDE_FROM_ABI void swap(flat_map& __y) noexcept {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(flat_map& __y) noexcept {
     // warning: The spec has unconditional noexcept, which means that
     // if any of the following functions throw an exception,
     // std::terminate will be called.
@@ -705,133 +748,156 @@ public:
     ranges::swap(__containers_.values, __y.__containers_.values);
   }
 
-  _LIBCPP_HIDE_FROM_ABI void clear() noexcept {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void clear() noexcept {
     __containers_.keys.clear();
     __containers_.values.clear();
   }
 
   // observers
-  _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; }
-  _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return value_compare(__compare_); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 key_compare key_comp() const { return __compare_; }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare value_comp() const {
+    return value_compare(__compare_);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI const key_container_type& keys() const noexcept { return __containers_.keys; }
-  _LIBCPP_HIDE_FROM_ABI const mapped_container_type& values() const noexcept { return __containers_.values; }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const key_container_type& keys() const noexcept {
+    return __containers_.keys;
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const mapped_container_type& values() const noexcept {
+    return __containers_.values;
+  }
 
   // map operations
-  _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const key_type& __x) {
+    return __find_impl(*this, __x);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); }
-
-  template <class _Kp>
-    requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const key_type& __x) const {
     return __find_impl(*this, __x);
   }
 
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const _Kp& __x) {
     return __find_impl(*this, __x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const { return contains(__x) ? 1 : 0; }
-
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const _Kp& __x) const {
+    return __find_impl(*this, __x);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const key_type& __x) const {
     return contains(__x) ? 1 : 0;
   }
 
-  _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); }
-
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const _Kp& __x) const {
+    return contains(__x) ? 1 : 0;
+  }
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const key_type& __x) const {
     return find(__x) != end();
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) { return __lower_bound<iterator>(*this, __x); }
+  template <class _Kp>
+    requires __is_compare_transparent
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const _Kp& __x) const {
+    return find(__x) != end();
+  }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const key_type& __x) {
+    return __lower_bound<iterator>(*this, __x);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const key_type& __x) const {
     return __lower_bound<const_iterator>(*this, __x);
   }
 
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const _Kp& __x) {
     return __lower_bound<iterator>(*this, __x);
   }
 
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const _Kp& __x) const {
     return __lower_bound<const_iterator>(*this, __x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) { return __upper_bound<iterator>(*this, __x); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const key_type& __x) {
+    return __upper_bound<iterator>(*this, __x);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const key_type& __x) const {
     return __upper_bound<const_iterator>(*this, __x);
   }
 
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const _Kp& __x) {
     return __upper_bound<iterator>(*this, __x);
   }
 
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const _Kp& __x) const {
     return __upper_bound<const_iterator>(*this, __x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const key_type& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const key_type& __x) {
     return __equal_range_impl(*this, __x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const key_type& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator>
+  equal_range(const key_type& __x) const {
     return __equal_range_impl(*this, __x);
   }
 
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const _Kp& __x) {
     return __equal_range_impl(*this, __x);
   }
   template <class _Kp>
     requires __is_compare_transparent
-  _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _Kp& __x) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator>
+  equal_range(const _Kp& __x) const {
     return __equal_range_impl(*this, __x);
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_map& __x, const flat_map& __y) {
+  friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool operator==(const flat_map& __x, const flat_map& __y) {
     return ranges::equal(__x, __y);
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_map& __x, const flat_map& __y) {
+  friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 auto
+  operator<=>(const flat_map& __x, const flat_map& __y) {
     return std::lexicographical_compare_three_way(
         __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI void swap(flat_map& __x, flat_map& __y) noexcept { __x.swap(__y); }
+  friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(flat_map& __x, flat_map& __y) noexcept {
+    __x.swap(__y);
+  }
 
 private:
   struct __ctor_uses_allocator_tag {
-    explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_tag() = default;
+    explicit _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __ctor_uses_allocator_tag() = default;
   };
   struct __ctor_uses_allocator_empty_tag {
-    explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_empty_tag() = default;
+    explicit _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __ctor_uses_allocator_empty_tag() = default;
   };
 
   template <class _Allocator, class _KeyCont, class _MappedCont, class... _CompArg>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI
-  flat_map(__ctor_uses_allocator_tag,
-           const _Allocator& __alloc,
-           _KeyCont&& __key_cont,
-           _MappedCont&& __mapped_cont,
-           _CompArg&&... __comp)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_map(
+      __ctor_uses_allocator_tag,
+      const _Allocator& __alloc,
+      _KeyCont&& __key_cont,
+      _MappedCont&& __mapped_cont,
+      _CompArg&&... __comp)
       : __containers_{.keys = std::make_obj_using_allocator<key_container_type>(
                           __alloc, std::forward<_KeyCont>(__key_cont)),
                       .values = std::make_obj_using_allocator<mapped_container_type>(
@@ -840,12 +906,13 @@ private:
 
   template <class _Allocator, class... _CompArg>
     requires __allocator_ctor_constraint<_Allocator>
-  _LIBCPP_HIDE_FROM_ABI flat_map(__ctor_uses_allocator_empty_tag, const _Allocator& __alloc, _CompArg&&... __comp)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  flat_map(__ctor_uses_allocator_empty_tag, const _Allocator& __alloc, _CompArg&&... __comp)
       : __containers_{.keys   = std::make_obj_using_allocator<key_container_type>(__alloc),
                       .values = std::make_obj_using_allocator<mapped_container_type>(__alloc)},
         __compare_(std::forward<_CompArg>(__comp)...) {}
 
-  _LIBCPP_HIDE_FROM_ABI bool __is_sorted_and_unique(auto&& __key_container) const {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool __is_sorted_and_unique(auto&& __key_container) const {
     auto __greater_or_equal_to = [this](const auto& __x, const auto& __y) { return !__compare_(__x, __y); };
     return ranges::adjacent_find(__key_container, __greater_or_equal_to) == ranges::end(__key_container);
   }
@@ -853,7 +920,7 @@ private:
   // This function is only used in constructors. So there is not exception handling in this function.
   // If the function exits via an exception, there will be no flat_map object constructed, thus, there
   // is no invariant state to preserve
-  _LIBCPP_HIDE_FROM_ABI void __sort_and_unique() {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __sort_and_unique() {
     auto __zv = ranges::views::zip(__containers_.keys, __containers_.values);
     ranges::sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); });
     auto __dup_start = ranges::unique(__zv, __key_equiv(__compare_)).begin();
@@ -863,14 +930,16 @@ private:
   }
 
   template <class _Self, class _KeyIter>
-  _LIBCPP_HIDE_FROM_ABI static auto __corresponding_mapped_it(_Self&& __self, _KeyIter&& __key_iter) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto
+  __corresponding_mapped_it(_Self&& __self, _KeyIter&& __key_iter) {
     return __self.__containers_.values.begin() +
            static_cast<ranges::range_difference_t<mapped_container_type>>(
                ranges::distance(__self.__containers_.keys.begin(), __key_iter));
   }
 
   template <bool _WasSorted, class _InputIterator, class _Sentinel>
-  _LIBCPP_HIDE_FROM_ABI void __append_sort_merge_unique(_InputIterator __first, _Sentinel __last) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void
+  __append_sort_merge_unique(_InputIterator __first, _Sentinel __last) {
     auto __on_failure        = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
     size_t __num_of_appended = __flat_map_utils::__append(*this, std::move(__first), std::move(__last));
     if (__num_of_appended != 0) {
@@ -898,7 +967,7 @@ private:
   }
 
   template <class _Self, class _Kp>
-  _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __find_impl(_Self&& __self, const _Kp& __key) {
     auto __it   = __self.lower_bound(__key);
     auto __last = __self.end();
     if (__it == __last || __self.__compare_(__key, __it->first)) {
@@ -908,7 +977,7 @@ private:
   }
 
   template <class _Self, class _Kp>
-  _LIBCPP_HIDE_FROM_ABI static auto __key_equal_range(_Self&& __self, const _Kp& __key) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __key_equal_range(_Self&& __self, const _Kp& __key) {
     auto __it =
         std::lower_bound(__self.__containers_.keys.begin(), __self.__containers_.keys.end(), __key, __self.__compare_);
     auto __last = __self.__containers_.keys.end();
@@ -919,7 +988,7 @@ private:
   }
 
   template <class _Self, class _Kp>
-  _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __equal_range_impl(_Self&& __self, const _Kp& __key) {
     auto [__key_first, __key_last] = __key_equal_range(__self, __key);
     using __iterator_type          = ranges::iterator_t<decltype(__self)>;
     return std::make_pair(__iterator_type(__key_first, __corresponding_mapped_it(__self, __key_first)),
@@ -927,7 +996,7 @@ private:
   }
 
   template <class _Res, class _Self, class _Kp>
-  _LIBCPP_HIDE_FROM_ABI static _Res __lower_bound(_Self&& __self, _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static _Res __lower_bound(_Self&& __self, _Kp& __x) {
     auto __key_iter =
         std::lower_bound(__self.__containers_.keys.begin(), __self.__containers_.keys.end(), __x, __self.__compare_);
     auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter);
@@ -935,7 +1004,7 @@ private:
   }
 
   template <class _Res, class _Self, class _Kp>
-  _LIBCPP_HIDE_FROM_ABI static _Res __upper_bound(_Self&& __self, _Kp& __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static _Res __upper_bound(_Self&& __self, _Kp& __x) {
     auto __key_iter =
         std::upper_bound(__self.__containers_.keys.begin(), __self.__containers_.keys.end(), __x, __self.__compare_);
     auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter);
@@ -943,7 +1012,8 @@ private:
   }
 
   template <class _KeyArg, class... _MArgs>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __try_emplace(_KeyArg&& __key, _MArgs&&... __mapped_args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool>
+  __try_emplace(_KeyArg&& __key, _MArgs&&... __mapped_args) {
     auto __key_it    = std::lower_bound(__containers_.keys.begin(), __containers_.keys.end(), __key, __compare_);
     auto __mapped_it = __containers_.values.begin() + ranges::distance(__containers_.keys.begin(), __key_it);
 
@@ -962,7 +1032,7 @@ private:
   }
 
   template <class _Kp>
-  _LIBCPP_HIDE_FROM_ABI bool __is_hint_correct(const_iterator __hint, _Kp&& __key) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool __is_hint_correct(const_iterator __hint, _Kp&& __key) {
     if (__hint != cbegin() && !__compare_((__hint - 1)->first, __key)) {
       return false;
     }
@@ -973,7 +1043,8 @@ private:
   }
 
   template <class _Kp, class... _Args>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __try_emplace_hint(const_iterator __hint, _Kp&& __key, _Args&&... __args) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool>
+  __try_emplace_hint(const_iterator __hint, _Kp&& __key, _Args&&... __args) {
     if (__is_hint_correct(__hint, __key)) {
       if (__hint == cend() || __compare_(__key, __hint->first)) {
         return {__flat_map_utils::__emplace_exact_pos(
@@ -994,7 +1065,8 @@ private:
   }
 
   template <class _Kp, class _Mapped>
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __insert_or_assign(_Kp&& __key, _Mapped&& __mapped) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, bool>
+  __insert_or_assign(_Kp&& __key, _Mapped&& __mapped) {
     auto __r = try_emplace(std::forward<_Kp>(__key), std::forward<_Mapped>(__mapped));
     if (!__r.second) {
       __r.first->second = std::forward<_Mapped>(__mapped);
@@ -1003,7 +1075,8 @@ private:
   }
 
   template <class _Kp, class _Mapped>
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_or_assign(const_iterator __hint, _Kp&& __key, _Mapped&& __mapped) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator
+  __insert_or_assign(const_iterator __hint, _Kp&& __key, _Mapped&& __mapped) {
     auto __r = __try_emplace_hint(__hint, std::forward<_Kp>(__key), std::forward<_Mapped>(__mapped));
     if (!__r.second) {
       __r.first->second = std::forward<_Mapped>(__mapped);
@@ -1011,7 +1084,7 @@ private:
     return __r.first;
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __reserve(size_t __size) {
     if constexpr (__container_traits<_KeyContainer>::__reservable) {
       __containers_.keys.reserve(__size);
     }
@@ -1022,7 +1095,8 @@ private:
   }
 
   template <class _KIter, class _MIter>
-  _LIBCPP_HIDE_FROM_ABI iterator __erase(_KIter __key_iter_to_remove, _MIter __mapped_iter_to_remove) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator
+  __erase(_KIter __key_iter_to_remove, _MIter __mapped_iter_to_remove) {
     auto __on_failure  = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
     auto __key_iter    = __containers_.keys.erase(__key_iter_to_remove);
     auto __mapped_iter = __containers_.values.erase(__mapped_iter_to_remove);
@@ -1032,7 +1106,8 @@ private:
 
   template <class _Key2, class _Tp2, class _Compare2, class _KeyContainer2, class _MappedContainer2, class _Predicate>
   friend typename flat_map<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>::size_type
-  erase_if(flat_map<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate);
+      _LIBCPP_CONSTEXPR_SINCE_CXX26
+      erase_if(flat_map<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate);
 
   friend __flat_map_utils;
 
@@ -1040,8 +1115,9 @@ private:
   _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_;
 
   struct __key_equiv {
-    _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {}
-    _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const {
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_equiv(key_compare __c) : __comp_(__c) {}
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool
+    operator()(const_reference __x, const_reference __y) const {
       return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x));
     }
     key_compare __comp_;
@@ -1164,8 +1240,9 @@ struct uses_allocator<flat_map<_Key, _Tp, _Compare, _KeyContainer, _MappedContai
     : bool_constant<uses_allocator_v<_KeyContainer, _Allocator> && uses_allocator_v<_MappedContainer, _Allocator>> {};
 
 template <class _Key, class _Tp, class _Compare, class _KeyContainer, class _MappedContainer, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI typename flat_map<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::size_type
-erase_if(flat_map<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>& __flat_map, _Predicate __pred) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+    typename flat_map<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::size_type
+    erase_if(flat_map<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>& __flat_map, _Predicate __pred) {
   auto __zv     = ranges::views::zip(__flat_map.__containers_.keys, __flat_map.__containers_.values);
   auto __first  = __zv.begin();
   auto __last   = __zv.end();
diff --git a/libcxx/include/__flat_map/key_value_iterator.h b/libcxx/include/__flat_map/key_value_iterator.h
index 3ebb653deb19..f163dfc28706 100644
--- a/libcxx/include/__flat_map/key_value_iterator.h
+++ b/libcxx/include/__flat_map/key_value_iterator.h
@@ -46,7 +46,7 @@ private:
 
   struct __arrow_proxy {
     __reference __ref_;
-    _LIBCPP_HIDE_FROM_ABI __reference* operator->() { return std::addressof(__ref_); }
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __reference* operator->() { return std::addressof(__ref_); }
   };
 
   __key_iterator __key_iter_;
@@ -69,99 +69,113 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI __key_value_iterator() = default;
 
-  _LIBCPP_HIDE_FROM_ABI __key_value_iterator(__key_value_iterator<_Owner, _KeyContainer, _MappedContainer, !_Const> __i)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  __key_value_iterator(__key_value_iterator<_Owner, _KeyContainer, _MappedContainer, !_Const> __i)
     requires _Const && convertible_to<typename _KeyContainer::iterator, __key_iterator> &&
                  convertible_to<typename _MappedContainer::iterator, __mapped_iterator>
       : __key_iter_(std::move(__i.__key_iter_)), __mapped_iter_(std::move(__i.__mapped_iter_)) {}
 
-  _LIBCPP_HIDE_FROM_ABI __key_value_iterator(__key_iterator __key_iter, __mapped_iterator __mapped_iter)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+  __key_value_iterator(__key_iterator __key_iter, __mapped_iterator __mapped_iter)
       : __key_iter_(std::move(__key_iter)), __mapped_iter_(std::move(__mapped_iter)) {}
 
-  _LIBCPP_HIDE_FROM_ABI __reference operator*() const { return __reference(*__key_iter_, *__mapped_iter_); }
-  _LIBCPP_HIDE_FROM_ABI __arrow_proxy operator->() const { return __arrow_proxy{**this}; }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __reference operator*() const {
+    return __reference(*__key_iter_, *__mapped_iter_);
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __arrow_proxy operator->() const { return __arrow_proxy{**this}; }
 
-  _LIBCPP_HIDE_FROM_ABI __key_value_iterator& operator++() {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_value_iterator& operator++() {
     ++__key_iter_;
     ++__mapped_iter_;
     return *this;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __key_value_iterator operator++(int) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_value_iterator operator++(int) {
     __key_value_iterator __tmp(*this);
     ++*this;
     return __tmp;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __key_value_iterator& operator--() {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_value_iterator& operator--() {
     --__key_iter_;
     --__mapped_iter_;
     return *this;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __key_value_iterator operator--(int) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_value_iterator operator--(int) {
     __key_value_iterator __tmp(*this);
     --*this;
     return __tmp;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __key_value_iterator& operator+=(difference_type __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_value_iterator& operator+=(difference_type __x) {
     __key_iter_ += __x;
     __mapped_iter_ += __x;
     return *this;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __key_value_iterator& operator-=(difference_type __x) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_value_iterator& operator-=(difference_type __x) {
     __key_iter_ -= __x;
     __mapped_iter_ -= __x;
     return *this;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __reference operator[](difference_type __n) const { return *(*this + __n); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __reference operator[](difference_type __n) const {
+    return *(*this + __n);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend bool
   operator==(const __key_value_iterator& __x, const __key_value_iterator& __y) {
     return __x.__key_iter_ == __y.__key_iter_;
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend bool operator<(const __key_value_iterator& __x, const __key_value_iterator& __y) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend bool
+  operator<(const __key_value_iterator& __x, const __key_value_iterator& __y) {
     return __x.__key_iter_ < __y.__key_iter_;
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend bool operator>(const __key_value_iterator& __x, const __key_value_iterator& __y) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend bool
+  operator>(const __key_value_iterator& __x, const __key_value_iterator& __y) {
     return __y < __x;
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend bool operator<=(const __key_value_iterator& __x, const __key_value_iterator& __y) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend bool
+  operator<=(const __key_value_iterator& __x, const __key_value_iterator& __y) {
     return !(__y < __x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend bool operator>=(const __key_value_iterator& __x, const __key_value_iterator& __y) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend bool
+  operator>=(const __key_value_iterator& __x, const __key_value_iterator& __y) {
     return !(__x < __y);
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend auto operator<=>(const __key_value_iterator& __x, const __key_value_iterator& __y)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend auto
+  operator<=>(const __key_value_iterator& __x, const __key_value_iterator& __y)
     requires three_way_comparable<__key_iterator>
   {
     return __x.__key_iter_ <=> __y.__key_iter_;
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend __key_value_iterator operator+(const __key_value_iterator& __i, difference_type __n) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend __key_value_iterator
+  operator+(const __key_value_iterator& __i, difference_type __n) {
     auto __tmp = __i;
     __tmp += __n;
     return __tmp;
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend __key_value_iterator operator+(difference_type __n, const __key_value_iterator& __i) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend __key_value_iterator
+  operator+(difference_type __n, const __key_value_iterator& __i) {
     return __i + __n;
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend __key_value_iterator operator-(const __key_value_iterator& __i, difference_type __n) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend __key_value_iterator
+  operator-(const __key_value_iterator& __i, difference_type __n) {
     auto __tmp = __i;
     __tmp -= __n;
     return __tmp;
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend difference_type
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 friend difference_type
   operator-(const __key_value_iterator& __x, const __key_value_iterator& __y) {
     return difference_type(__x.__key_iter_ - __y.__key_iter_);
   }
diff --git a/libcxx/include/__flat_map/utils.h b/libcxx/include/__flat_map/utils.h
index acb7dca7ffe9..27687ae8de3b 100644
--- a/libcxx/include/__flat_map/utils.h
+++ b/libcxx/include/__flat_map/utils.h
@@ -35,7 +35,7 @@ struct __flat_map_utils {
   // roll back the changes it made to the map. If it cannot roll back the changes, it will
   // clear the map.
   template <class _Map, class _IterK, class _IterM, class _KeyArg, class... _MArgs>
-  _LIBCPP_HIDE_FROM_ABI static typename _Map::iterator __emplace_exact_pos(
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static typename _Map::iterator __emplace_exact_pos(
       _Map& __map, _IterK&& __it_key, _IterM&& __it_mapped, _KeyArg&& __key, _MArgs&&... __mapped_args) {
     auto __on_key_failed = std::__make_exception_guard([&]() noexcept {
       using _KeyContainer = typename _Map::key_container_type;
@@ -82,7 +82,7 @@ struct __flat_map_utils {
   // TODO: We could optimize this, see
   // https://github.com/llvm/llvm-project/issues/108624
   template <class _Map, class _InputIterator, class _Sentinel>
-  _LIBCPP_HIDE_FROM_ABI static typename _Map::size_type
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static typename _Map::size_type
   __append(_Map& __map, _InputIterator __first, _Sentinel __last) {
     typename _Map::size_type __num_appended = 0;
     for (; __first != __last; ++__first) {
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 4a081e65cb7f..adf80f2ac9ac 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -1298,6 +1298,9 @@ module std [system] {
 
     header "flat_map"
     export *
+    export std.algorithm.ranges_sort
+    export std.ranges.zip_view
+    export std.tuple
   }
 
   module flat_set {
diff --git a/libcxx/test/std/containers/Emplaceable.h b/libcxx/test/std/containers/Emplaceable.h
index 246d5b255d6b..d8d340779173 100644
--- a/libcxx/test/std/containers/Emplaceable.h
+++ b/libcxx/test/std/containers/Emplaceable.h
@@ -22,13 +22,13 @@ class Emplaceable {
   double double_;
 
 public:
-  TEST_CONSTEXPR Emplaceable() : int_(0), double_(0) {}
-  TEST_CONSTEXPR Emplaceable(int i, double d) : int_(i), double_(d) {}
-  TEST_CONSTEXPR_CXX14 Emplaceable(Emplaceable&& x) : int_(x.int_), double_(x.double_) {
+  TEST_CONSTEXPR_CXX20 Emplaceable() : int_(0), double_(0) {}
+  TEST_CONSTEXPR_CXX20 Emplaceable(int i, double d) : int_(i), double_(d) {}
+  TEST_CONSTEXPR_CXX20 Emplaceable(Emplaceable&& x) : int_(x.int_), double_(x.double_) {
     x.int_    = 0;
     x.double_ = 0;
   }
-  TEST_CONSTEXPR_CXX14 Emplaceable& operator=(Emplaceable&& x) {
+  TEST_CONSTEXPR_CXX20 Emplaceable& operator=(Emplaceable&& x) {
     int_      = x.int_;
     x.int_    = 0;
     double_   = x.double_;
@@ -36,12 +36,12 @@ public:
     return *this;
   }
 
-  TEST_CONSTEXPR bool operator==(const Emplaceable& x) const { return int_ == x.int_ && double_ == x.double_; }
-  TEST_CONSTEXPR bool operator<(const Emplaceable& x) const {
+  TEST_CONSTEXPR_CXX20 bool operator==(const Emplaceable& x) const { return int_ == x.int_ && double_ == x.double_; }
+  TEST_CONSTEXPR_CXX20 bool operator<(const Emplaceable& x) const {
     return int_ < x.int_ || (int_ == x.int_ && double_ < x.double_);
   }
 
-  TEST_CONSTEXPR int get() const { return int_; }
+  TEST_CONSTEXPR_CXX20 int get() const { return int_; }
 };
 
 template <>
@@ -49,7 +49,7 @@ struct std::hash<Emplaceable> {
   typedef Emplaceable argument_type;
   typedef std::size_t result_type;
 
-  TEST_CONSTEXPR std::size_t operator()(const Emplaceable& x) const { return static_cast<std::size_t>(x.get()); }
+  TEST_CONSTEXPR_CXX20 std::size_t operator()(const Emplaceable& x) const { return static_cast<std::size_t>(x.get()); }
 };
 
 #endif // TEST_STD_VER >= 11
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/at.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/at.pass.cpp
index d30055bf1701..083a9dd9f76c 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/at.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/at.pass.cpp
@@ -18,6 +18,7 @@
 #include <flat_map>
 #include <functional>
 #include <stdexcept>
+#include <type_traits>
 #include <vector>
 
 #include "MinSequenceContainer.h"
@@ -25,7 +26,7 @@
 #include "test_macros.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using P = std::pair<int, double>;
   P ar[]  = {
       P(1, 1.5),
@@ -49,10 +50,12 @@ void test() {
     assert(m.at(4) == 4.5);
     assert(m.at(5) == 5.5);
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try {
-      TEST_IGNORE_NODISCARD m.at(6);
-      assert(false);
-    } catch (std::out_of_range&) {
+    if (!TEST_IS_CONSTANT_EVALUATED) {
+      try {
+        TEST_IGNORE_NODISCARD m.at(6);
+        assert(false);
+      } catch (std::out_of_range&) {
+      }
     }
 #endif
     assert(m.at(7) == 7.5);
@@ -70,10 +73,12 @@ void test() {
     assert(m.at(4) == 4.5);
     assert(m.at(5) == 5.5);
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try {
-      TEST_IGNORE_NODISCARD m.at(6);
-      assert(false);
-    } catch (std::out_of_range&) {
+    if (!TEST_IS_CONSTANT_EVALUATED) {
+      try {
+        TEST_IGNORE_NODISCARD m.at(6);
+        assert(false);
+      } catch (std::out_of_range&) {
+      }
     }
 #endif
     assert(m.at(7) == 7.5);
@@ -82,11 +87,25 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/at_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/at_transparent.pass.cpp
index bc3fbfca5762..7be6fd7cba0d 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/at_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/at_transparent.pass.cpp
@@ -35,7 +35,7 @@ static_assert(!CanAt<NonTransparentMap>);
 static_assert(!CanAt<const NonTransparentMap>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using P = std::pair<int, double>;
   P ar[]  = {
       P(1, 1.5),
@@ -60,10 +60,12 @@ void test() {
     assert(m.at(Transparent<int>{4}) == 4.5);
     assert(m.at(Transparent<int>{5}) == 5.5);
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try {
-      TEST_IGNORE_NODISCARD m.at(Transparent<int>{6});
-      assert(false);
-    } catch (std::out_of_range&) {
+    if (!TEST_IS_CONSTANT_EVALUATED) {
+      try {
+        TEST_IGNORE_NODISCARD m.at(Transparent<int>{6});
+        assert(false);
+      } catch (std::out_of_range&) {
+      }
     }
 #endif
     assert(m.at(Transparent<int>{7}) == 7.5);
@@ -81,10 +83,12 @@ void test() {
     assert(m.at(Transparent<int>{4}) == 4.5);
     assert(m.at(Transparent<int>{5}) == 5.5);
 #ifndef TEST_HAS_NO_EXCEPTIONS
-    try {
-      TEST_IGNORE_NODISCARD m.at(Transparent<int>{6});
-      assert(false);
-    } catch (std::out_of_range&) {
+    if (!TEST_IS_CONSTANT_EVALUATED) {
+      try {
+        TEST_IGNORE_NODISCARD m.at(Transparent<int>{6});
+        assert(false);
+      } catch (std::out_of_range&) {
+      }
     }
 #endif
     assert(m.at(Transparent<int>{7}) == 7.5);
@@ -93,9 +97,14 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
   {
@@ -114,5 +123,14 @@ int main(int, char**) {
     assert(x == 1);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_key.pass.cpp
index ea2f5d800878..a83aa4f0d6b1 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_key.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_key.pass.cpp
@@ -16,6 +16,7 @@
 #include <deque>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "MinSequenceContainer.h"
@@ -31,7 +32,7 @@ static_assert(CanIndex<std::flat_map<int, double>, const int&>);
 static_assert(!CanIndex<std::flat_map<int, NoDefaultCtr>, const int&>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using P = std::pair<int, double>;
   P ar[]  = {
       P(1, 1.5),
@@ -58,13 +59,18 @@ void test() {
   assert(m.size() == 8);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto index_func = [](auto& m, auto key_arg, auto value_arg) {
       using FlatMap                             = std::decay_t<decltype(m)>;
       const typename FlatMap::key_type key      = key_arg;
@@ -73,5 +79,15 @@ int main(int, char**) {
     };
     test_emplace_exception_guarantee(index_func);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_rv_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_rv_key.pass.cpp
index faacc3cfe8f9..778288fd13d2 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_rv_key.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_rv_key.pass.cpp
@@ -16,6 +16,7 @@
 #include <deque>
 #include <functional>
 #include <cassert>
+#include <type_traits>
 
 #include "MinSequenceContainer.h"
 #include "../helpers.h"
@@ -31,7 +32,7 @@ static_assert(CanIndex<std::flat_map<int, double>, int&&>);
 static_assert(!CanIndex<std::flat_map<int, NoDefaultCtr>, int&&>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   {
     std::flat_map<MoveOnly, double, std::less<MoveOnly>, KeyContainer, ValueContainer> m;
     ASSERT_SAME_TYPE(decltype(m[MoveOnly{}]), double&);
@@ -49,13 +50,18 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<MoveOnly>, std::vector<double>>();
-  test<std::deque<MoveOnly>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<MoveOnly>, std::vector<double>>();
+  }
   test<MinSequenceContainer<MoveOnly>, MinSequenceContainer<double>>();
   test<std::vector<MoveOnly, min_allocator<MoveOnly>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto index_func = [](auto& m, auto key_arg, auto value_arg) {
       using FlatMap                             = std::decay_t<decltype(m)>;
       typename FlatMap::key_type key            = key_arg;
@@ -64,5 +70,14 @@ int main(int, char**) {
     };
     test_emplace_exception_guarantee(index_func);
   }
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_transparent.pass.cpp
index 760ec69ae878..e8ea20b345e3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.access/index_transparent.pass.cpp
@@ -50,7 +50,7 @@ static_assert(!CanIndex<TransparentMap, TransparentMap::iterator>);
 static_assert(!CanIndex<TransparentMap, TransparentMap::const_iterator>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using P = std::pair<int, double>;
   P ar[]  = {
       P(1, 1.5),
@@ -81,11 +81,17 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
+
   {
     bool transparent_used = false;
     TransparentComparator c(transparent_used);
@@ -101,7 +107,8 @@ int main(int, char**) {
     int& x = m["alpha"];
     assert(x == 1);
   }
-  {
+
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto index_func = [](auto& m, auto key_arg, auto value_arg) {
       using FlatMap                             = std::decay_t<decltype(m)>;
       using Key                                 = typename FlatMap::key_type;
@@ -110,5 +117,15 @@ int main(int, char**) {
     };
     test_emplace_exception_guarantee(index_func);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp
index 05efe063c1e1..c1f7b7c5cb1f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp
@@ -24,10 +24,10 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
-  using M     = std::flat_multimap<Key, Value, std::less<int>, KeyContainer, ValueContainer>;
+  using M     = std::flat_map<Key, Value, std::less<int>, KeyContainer, ValueContainer>;
   M m;
   ASSERT_SAME_TYPE(decltype(m.empty()), bool);
   ASSERT_NOEXCEPT(m.empty());
@@ -39,11 +39,25 @@ void test() {
   assert(m.empty());
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/max_size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/max_size.pass.cpp
index 87acdfd2cf62..ee01f8eb1dfe 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/max_size.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/max_size.pass.cpp
@@ -24,7 +24,7 @@
 #include "test_allocator.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+constexpr bool test() {
   {
     using A1 = limited_allocator<int, 10>;
     using A2 = limited_allocator<int, 20>;
@@ -72,5 +72,15 @@ int main(int, char**) {
     assert(c.max_size() <= max_dist);
     assert(c.max_size() <= alloc_max_size(std::allocator<char>()));
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/size.pass.cpp
index 957a86045009..94f2793293f1 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/size.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/size.pass.cpp
@@ -16,6 +16,7 @@
 #include <deque>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "MinSequenceContainer.h"
@@ -23,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using M = std::flat_map<int, char, std::less<int>, KeyContainer, ValueContainer>;
   {
     const M m = {{1, 'a'}, {1, 'b'}, {4, 'd'}, {5, 'e'}, {5, 'h'}};
@@ -45,7 +46,7 @@ void test() {
   }
   {
     M m;
-    std::size_t s = 1000000;
+    std::size_t s = TEST_IS_CONSTANT_EVALUATED ? 100 : 1000000;
     for (auto i = 0u; i < s; ++i) {
       m.emplace(i, 'a');
     }
@@ -55,11 +56,25 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<char>>();
-  test<std::deque<int>, std::vector<char>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
   test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/alloc.pass.cpp
index 3f8d2ed332d6..b7a033454a35 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/alloc.pass.cpp
@@ -14,6 +14,7 @@
 //   explicit flat_map(const Allocator& a);
 
 #include <cassert>
+#include <deque>
 #include <flat_map>
 #include <functional>
 #include <vector>
@@ -22,7 +23,23 @@
 #include "test_allocator.h"
 #include "../../../test_compare.h"
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
+  using A = test_allocator<short>;
+  using M =
+      std::flat_map<int,
+                    long,
+                    std::less<int>,
+                    KeyContainer<int, test_allocator<int>>,
+                    ValueContainer<long, test_allocator<long>>>;
+  M m(A(0, 5));
+  assert(m.empty());
+  assert(m.begin() == m.end());
+  assert(m.keys().get_allocator().get_id() == 5);
+  assert(m.values().get_allocator().get_id() == 5);
+}
+
+constexpr bool test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -53,20 +70,23 @@ int main(int, char**) {
     static_assert(std::is_constructible_v<M, test_allocator<int>>);
     static_assert(!std::is_convertible_v<test_allocator<int>, M>);
   }
+
+  test<std::vector, std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    using A = test_allocator<short>;
-    using M =
-        std::flat_map<int,
-                      long,
-                      std::less<int>,
-                      std::vector<int, test_allocator<int>>,
-                      std::vector<long, test_allocator<long>>>;
-    M m(A(0, 5));
-    assert(m.empty());
-    assert(m.begin() == m.end());
-    assert(m.keys().get_allocator().get_id() == 5);
-    assert(m.values().get_allocator().get_id() == 5);
+    test<std::deque, std::deque>();
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/assign_initializer_list.pass.cpp
index 06bde71e7994..8e3079302719 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/assign_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/assign_initializer_list.pass.cpp
@@ -26,7 +26,7 @@
 #include "test_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -36,7 +36,6 @@ void test() {
     m                              = {{3, 0}, {1, 0}, {2, 0}, {2, 1}, {3, 1}, {4, 0}, {3, 2}, {5, 0}, {6, 0}, {5, 1}};
     std::pair<int, int> expected[] = {{1, 0}, {2, 0}, {3, 0}, {4, 0}, {5, 0}, {6, 0}};
     assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
   }
   {
     M m = {{10, 1}, {8, 1}};
@@ -47,13 +46,28 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<int>>();
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
+
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
   test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/compare.pass.cpp
index 40a1710f55e4..aa57c8f648bb 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/compare.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/compare.pass.cpp
@@ -20,11 +20,73 @@
 #include <type_traits>
 #include <vector>
 
+#include "MinSequenceContainer.h"
+#include "min_allocator.h"
 #include "test_macros.h"
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-int main(int, char**) {
+// explicit flat_map(const key_compare& comp);
+template <class KeyContainer, class ValueContainer>
+constexpr void test_compare() {
+  using Key   = typename KeyContainer::value_type;
+  using Value = typename ValueContainer::value_type;
+  {
+    // The one-argument ctor is explicit.
+    using C = test_less<Key>;
+    static_assert(std::is_constructible_v<std::flat_map<Key, Value, C>, C>);
+    static_assert(!std::is_convertible_v<C, std::flat_map<Key, Value, C>>);
+
+    static_assert(std::is_constructible_v<std::flat_map<Key, Value>, std::less<Key>>);
+    static_assert(!std::is_convertible_v<std::less<Key>, std::flat_map<Key, Value>>);
+  }
+  {
+    using C = test_less<Key>;
+    auto m  = std::flat_map<Key, Value, C>(C(3));
+    assert(m.empty());
+    assert(m.begin() == m.end());
+    assert(m.key_comp() == C(3));
+  }
+}
+
+// template <class Alloc>
+//   flat_map(const key_compare& comp, const Alloc& a);
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test_compare_alloc() {
+  {
+    // If an allocator is given, it must be usable by both containers.
+    using A = test_allocator<int>;
+    using M = std::flat_map<int, int, std::less<>, KeyContainer<int>, ValueContainer<int, A>>;
+    static_assert(std::is_constructible_v<M, std::less<>>);
+    static_assert(!std::is_constructible_v<M, std::less<>, std::allocator<int>>);
+    static_assert(!std::is_constructible_v<M, std::less<>, A>);
+  }
+  {
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    auto m   = std::flat_map<int, short, C, KeyContainer<int, A1>, ValueContainer<short, A2>>(C(4), A1(5));
+    assert(m.empty());
+    assert(m.begin() == m.end());
+    assert(m.key_comp() == C(4));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // explicit(false)
+    using C                                                                          = test_less<int>;
+    using A1                                                                         = test_allocator<int>;
+    using A2                                                                         = test_allocator<short>;
+    std::flat_map<int, short, C, KeyContainer<int, A1>, ValueContainer<short, A2>> m = {C(4), A1(5)};
+    assert(m.empty());
+    assert(m.begin() == m.end());
+    assert(m.key_comp() == C(4));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+}
+
+constexpr bool test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -41,53 +103,31 @@ int main(int, char**) {
     static_assert(!std::is_constructible_v<M2, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M3, const C&, const A2&>);
   }
-  {
-    using C = test_less<int>;
-    auto m  = std::flat_map<int, char*, C>(C(3));
-    assert(m.empty());
-    assert(m.begin() == m.end());
-    assert(m.key_comp() == C(3));
-  }
-  {
-    // The one-argument ctor is explicit.
-    using C = test_less<int>;
-    static_assert(std::is_constructible_v<std::flat_map<int, char*, C>, C>);
-    static_assert(!std::is_convertible_v<C, std::flat_map<int, char*, C>>);
 
-    static_assert(std::is_constructible_v<std::flat_map<int, char*>, std::less<int>>);
-    static_assert(!std::is_convertible_v<std::less<int>, std::flat_map<int, char*>>);
-  }
+  test_compare<std::vector<int>, std::vector<int>>();
+  test_compare<std::vector<int>, std::vector<double>>();
+  test_compare<MinSequenceContainer<int>, MinSequenceContainer<double>>();
+  test_compare<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
+  test_compare<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
+
+  test_compare_alloc<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    using C  = test_less<int>;
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    auto m   = std::flat_map<int, short, C, std::vector<int, A1>, std::vector<short, A2>>(C(4), A1(5));
-    assert(m.empty());
-    assert(m.begin() == m.end());
-    assert(m.key_comp() == C(4));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // explicit(false)
-    using C                                                                    = test_less<int>;
-    using A1                                                                   = test_allocator<int>;
-    using A2                                                                   = test_allocator<short>;
-    std::flat_map<int, short, C, std::deque<int, A1>, std::deque<short, A2>> m = {C(4), A1(5)};
-    assert(m.empty());
-    assert(m.begin() == m.end());
-    assert(m.key_comp() == C(4));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // If an allocator is given, it must be usable by both containers.
-    using A = test_allocator<int>;
-    using M = std::flat_map<int, int, std::less<>, std::vector<int>, std::vector<int, A>>;
-    static_assert(std::is_constructible_v<M, std::less<>>);
-    static_assert(!std::is_constructible_v<M, std::less<>, std::allocator<int>>);
-    static_assert(!std::is_constructible_v<M, std::less<>, A>);
+    test_compare<std::deque<int>, std::vector<double>>();
+    test_compare_alloc<std::deque, std::deque>();
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/containers.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/containers.pass.cpp
index 812e2c3e4f02..c7503bb8aa77 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/containers.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/containers.pass.cpp
@@ -30,18 +30,162 @@
 #include "test_allocator.h"
 #include "test_iterators.h"
 #include "test_macros.h"
+#include "../helpers.h"
 #include "../../../test_compare.h"
 
 struct P {
   int first;
   int second;
   template <class T, class U>
-  bool operator==(const std::pair<T, U>& rhs) const {
+  constexpr bool operator==(const std::pair<T, U>& rhs) const {
     return MoveOnly(first) == rhs.first && MoveOnly(second) == rhs.second;
   }
 };
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
+  {
+    // flat_map(key_container_type , mapped_container_type)
+    using M                  = std::flat_map<int, short, std::less<int>, KeyContainer<int>, ValueContainer<short>>;
+    KeyContainer<int> ks     = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    ValueContainer<short> vs = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    auto m                   = M(ks, vs);
+    assert((m.keys() == KeyContainer<int>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+
+    // explicit(false)
+    M m2 = {ks, vs};
+    assert(m2 == m);
+
+    m = M(std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m.keys() == KeyContainer<int>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+  }
+  {
+    // flat_map(key_container_type , mapped_container_type)
+    // move-only
+    P expected[] = {{3, 2}, {2, 1}, {1, 3}};
+    using Ks     = KeyContainer<int, min_allocator<int>>;
+    using Vs     = ValueContainer<MoveOnly, min_allocator<MoveOnly>>;
+    using M      = std::flat_map<int, MoveOnly, std::greater<int>, Ks, Vs>;
+    Ks ks        = {1, 3, 2};
+    Vs vs;
+    vs.push_back(3);
+    vs.push_back(2);
+    vs.push_back(1);
+    auto m = M(std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert(std::ranges::equal(m, expected, std::equal_to<>()));
+  }
+  {
+    // flat_map(key_container_type , mapped_container_type)
+    // container's allocators are used
+    using A = test_allocator<int>;
+    using M = std::flat_map<int, int, std::less<int>, KeyContainer<int, A>, ValueContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
+    auto vs = ValueContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6));
+    auto m  = M(std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{1, 1}, {2, 2}, {3, 3}}));
+    assert(m.keys().get_allocator() == A(5));
+    assert(m.values().get_allocator() == A(6));
+  }
+  {
+    // flat_map(key_container_type , mapped_container_type, key_compare)
+    using C                 = test_less<int>;
+    using M                 = std::flat_map<int, char, C, KeyContainer<int>, ValueContainer<char>>;
+    KeyContainer<int> ks    = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    ValueContainer<char> vs = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    auto m                  = M(ks, vs, C(4));
+    assert((m.keys() == KeyContainer<int>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<char>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.key_comp() == C(4));
+
+    // explicit(false)
+    M m2 = {ks, vs, C(4)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(4));
+  }
+  {
+    // flat_map(key_container_type , mapped_container_type, const Allocator&)
+    using A = test_allocator<int>;
+    using M = std::flat_map<int, int, std::less<int>, KeyContainer<int, A>, ValueContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
+    auto vs = ValueContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6));
+    auto m  = M(ks, vs, A(4)); // replaces the allocators
+    assert(!ks.empty());       // it was an lvalue above
+    assert(!vs.empty());       // it was an lvalue above
+    assert((m == M{{1, 1}, {2, 2}, {3, 3}}));
+    assert(m.keys().get_allocator() == A(4));
+    assert(m.values().get_allocator() == A(4));
+  }
+  {
+    // flat_map(key_container_type , mapped_container_type, const Allocator&)
+    // explicit(false)
+    using A = test_allocator<int>;
+    using M = std::flat_map<int, int, std::less<int>, KeyContainer<int, A>, ValueContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
+    auto vs = ValueContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6));
+    M m     = {ks, vs, A(4)}; // implicit ctor
+    assert(!ks.empty());      // it was an lvalue above
+    assert(!vs.empty());      // it was an lvalue above
+    assert((m == M{{1, 1}, {2, 2}, {3, 3}}));
+    assert(m.keys().get_allocator() == A(4));
+    assert(m.values().get_allocator() == A(4));
+  }
+
+  {
+    // flat_map(key_container_type , mapped_container_type, key_compare, const Allocator&)
+    using C                = test_less<int>;
+    using A                = test_allocator<int>;
+    using M                = std::flat_map<int, int, C, std::vector<int, A>, std::vector<int, A>>;
+    std::vector<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    std::vector<int, A> vs = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    auto m                 = M(ks, vs, C(4), A(5));
+    assert((m.keys() == std::vector<int, A>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<int>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.key_comp() == C(4));
+    assert(m.keys().get_allocator() == A(5));
+    assert(m.values().get_allocator() == A(5));
+
+    // explicit(false)
+    M m2 = {ks, vs, C(4), A(5)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(4));
+    assert(m2.keys().get_allocator() == A(5));
+    assert(m2.values().get_allocator() == A(5));
+  }
+}
+
+bool constexpr test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -65,120 +209,25 @@ int main(int, char**) {
     static_assert(!std::is_constructible_v<M2, const V1&, const V2&, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M3, const V2&, const V1&, const C&, const A2&>);
   }
-  {
-    // flat_map(key_container_type , mapped_container_type)
-    using M              = std::flat_map<int, char>;
-    std::vector<int> ks  = {1, 1, 1, 2, 2, 3, 2, 3, 3};
-    std::vector<char> vs = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    auto m               = M(ks, vs);
-    assert((m.keys() == std::vector<int>{1, 2, 3}));
-    LIBCPP_ASSERT((m.values() == std::vector<char>{1, 4, 6}));
 
-    // explicit(false)
-    M m2 = {ks, vs};
-    assert(m2 == m);
+  test<std::vector, std::vector>();
 
-    m = M(std::move(ks), std::move(vs));
-    assert(ks.empty()); // it was moved-from
-    assert(vs.empty()); // it was moved-from
-    assert((m.keys() == std::vector<int>{1, 2, 3}));
-    LIBCPP_ASSERT((m.values() == std::vector<char>{1, 4, 6}));
-  }
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    // flat_map(key_container_type , mapped_container_type)
-    // move-only
-    P expected[] = {{3, 2}, {2, 1}, {1, 3}};
-    using Ks     = std::deque<int, min_allocator<int>>;
-    using Vs     = std::vector<MoveOnly, min_allocator<MoveOnly>>;
-    using M      = std::flat_map<int, MoveOnly, std::greater<int>, Ks, Vs>;
-    Ks ks        = {1, 3, 2};
-    Vs vs;
-    vs.push_back(3);
-    vs.push_back(2);
-    vs.push_back(1);
-    auto m = M(std::move(ks), std::move(vs));
-    assert(ks.empty()); // it was moved-from
-    assert(vs.empty()); // it was moved-from
-    assert(std::ranges::equal(m, expected, std::equal_to<>()));
+    test<std::deque, std::vector>();
+    test<std::deque, std::deque>();
   }
-  {
-    // flat_map(key_container_type , mapped_container_type)
-    // container's allocators are used
-    using A = test_allocator<int>;
-    using M = std::flat_map<int, int, std::less<int>, std::vector<int, A>, std::deque<int, A>>;
-    auto ks = std::vector<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
-    auto vs = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6));
-    auto m  = M(std::move(ks), std::move(vs));
-    assert(ks.empty()); // it was moved-from
-    assert(vs.empty()); // it was moved-from
-    assert((m == M{{1, 1}, {2, 2}, {3, 3}}));
-    assert(m.keys().get_allocator() == A(5));
-    assert(m.values().get_allocator() == A(6));
-  }
-  {
-    // flat_map(key_container_type , mapped_container_type, key_compare)
-    using C              = test_less<int>;
-    using M              = std::flat_map<int, char, C>;
-    std::vector<int> ks  = {1, 1, 1, 2, 2, 3, 2, 3, 3};
-    std::vector<char> vs = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    auto m               = M(ks, vs, C(4));
-    assert((m.keys() == std::vector<int>{1, 2, 3}));
-    LIBCPP_ASSERT((m.values() == std::vector<char>{1, 4, 6}));
-    assert(m.key_comp() == C(4));
 
-    // explicit(false)
-    M m2 = {ks, vs, C(4)};
-    assert(m2 == m);
-    assert(m2.key_comp() == C(4));
-  }
-  {
-    // flat_map(key_container_type , mapped_container_type, const Allocator&)
-    using A = test_allocator<int>;
-    using M = std::flat_map<int, int, std::less<int>, std::vector<int, A>, std::deque<int, A>>;
-    auto ks = std::vector<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
-    auto vs = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6));
-    auto m  = M(ks, vs, A(4)); // replaces the allocators
-    assert(!ks.empty());       // it was an lvalue above
-    assert(!vs.empty());       // it was an lvalue above
-    assert((m == M{{1, 1}, {2, 2}, {3, 3}}));
-    assert(m.keys().get_allocator() == A(4));
-    assert(m.values().get_allocator() == A(4));
-  }
-  {
-    // flat_map(key_container_type , mapped_container_type, const Allocator&)
-    // explicit(false)
-    using A = test_allocator<int>;
-    using M = std::flat_map<int, int, std::less<int>, std::vector<int, A>, std::deque<int, A>>;
-    auto ks = std::vector<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
-    auto vs = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6));
-    M m     = {ks, vs, A(4)}; // implicit ctor
-    assert(!ks.empty());      // it was an lvalue above
-    assert(!vs.empty());      // it was an lvalue above
-    assert((m == M{{1, 1}, {2, 2}, {3, 3}}));
-    assert(m.keys().get_allocator() == A(4));
-    assert(m.values().get_allocator() == A(4));
-  }
-  {
-    // flat_map(key_container_type , mapped_container_type, key_compare, const Allocator&)
-    using C                = test_less<int>;
-    using A                = test_allocator<int>;
-    using M                = std::flat_map<int, int, C, std::vector<int, A>, std::vector<int, A>>;
-    std::vector<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
-    std::vector<int, A> vs = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    auto m                 = M(ks, vs, C(4), A(5));
-    assert((m.keys() == std::vector<int, A>{1, 2, 3}));
-    LIBCPP_ASSERT((m.values() == std::vector<int, A>{1, 4, 6}));
-    assert(m.key_comp() == C(4));
-    assert(m.keys().get_allocator() == A(5));
-    assert(m.values().get_allocator() == A(5));
+  return true;
+}
 
-    // explicit(false)
-    M m2 = {ks, vs, C(4), A(5)};
-    assert(m2 == m);
-    assert(m2.key_comp() == C(4));
-    assert(m2.keys().get_allocator() == A(5));
-    assert(m2.values().get_allocator() == A(5));
-  }
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy.pass.cpp
index fcd0415088c1..856886718999 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy.pass.cpp
@@ -13,6 +13,7 @@
 // flat_map(const flat_map& m);
 
 #include <cassert>
+#include <deque>
 #include <flat_map>
 #include <vector>
 
@@ -20,11 +21,12 @@
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
   {
     using C = test_less<int>;
-    std::vector<int, test_allocator<int>> ks({1, 3, 5}, test_allocator<int>(6));
-    std::vector<char, test_allocator<char>> vs({2, 2, 1}, test_allocator<char>(7));
+    KeyContainer<int, test_allocator<int>> ks({1, 3, 5}, test_allocator<int>(6));
+    ValueContainer<char, test_allocator<char>> vs({2, 2, 1}, test_allocator<char>(7));
     using M = std::flat_map<int, char, C, decltype(ks), decltype(vs)>;
     auto mo = M(ks, vs, C(5));
     auto m  = mo;
@@ -44,8 +46,8 @@ int main(int, char**) {
   }
   {
     using C  = test_less<int>;
-    using Ks = std::vector<int, other_allocator<int>>;
-    using Vs = std::vector<char, other_allocator<char>>;
+    using Ks = KeyContainer<int, other_allocator<int>>;
+    using Vs = ValueContainer<char, other_allocator<char>>;
     auto ks  = Ks({1, 3, 5}, other_allocator<int>(6));
     auto vs  = Vs({2, 2, 1}, other_allocator<char>(7));
     using M  = std::flat_map<int, char, C, Ks, Vs>;
@@ -65,6 +67,26 @@ int main(int, char**) {
     assert(mo.keys().get_allocator() == other_allocator<int>(6));
     assert(mo.values().get_allocator() == other_allocator<char>(7));
   }
+}
+
+constexpr bool test() {
+  test<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque, std::deque>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_alloc.pass.cpp
index cbda6ea85326..f8b5774f8619 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_alloc.pass.cpp
@@ -22,7 +22,30 @@
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
+  using C = test_less<int>;
+  KeyContainer<int, test_allocator<int>> ks({1, 3, 5}, test_allocator<int>(6));
+  ValueContainer<char, test_allocator<char>> vs({2, 2, 1}, test_allocator<char>(7));
+  using M = std::flat_map<int, char, C, decltype(ks), decltype(vs)>;
+  auto mo = M(ks, vs, C(5));
+  auto m  = M(mo, test_allocator<int>(3));
+
+  assert(m.key_comp() == C(5));
+  assert(m.keys() == ks);
+  assert(m.values() == vs);
+  assert(m.keys().get_allocator() == test_allocator<int>(3));
+  assert(m.values().get_allocator() == test_allocator<char>(3));
+
+  // mo is unchanged
+  assert(mo.key_comp() == C(5));
+  assert(mo.keys() == ks);
+  assert(mo.values() == vs);
+  assert(mo.keys().get_allocator() == test_allocator<int>(6));
+  assert(mo.values().get_allocator() == test_allocator<char>(7));
+}
+
+constexpr bool test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -41,27 +64,24 @@ int main(int, char**) {
     static_assert(!std::is_constructible_v<M2, const M2&, const A2&>);
     static_assert(!std::is_constructible_v<M3, const M3&, const A2&>);
   }
+
+  test<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    using C = test_less<int>;
-    std::vector<int, test_allocator<int>> ks({1, 3, 5}, test_allocator<int>(6));
-    std::vector<char, test_allocator<char>> vs({2, 2, 1}, test_allocator<char>(7));
-    using M = std::flat_map<int, char, C, decltype(ks), decltype(vs)>;
-    auto mo = M(ks, vs, C(5));
-    auto m  = M(mo, test_allocator<int>(3));
-
-    assert(m.key_comp() == C(5));
-    assert(m.keys() == ks);
-    assert(m.values() == vs);
-    assert(m.keys().get_allocator() == test_allocator<int>(3));
-    assert(m.values().get_allocator() == test_allocator<char>(3));
-
-    // mo is unchanged
-    assert(mo.key_comp() == C(5));
-    assert(mo.keys() == ks);
-    assert(mo.values() == vs);
-    assert(mo.keys().get_allocator() == test_allocator<int>(6));
-    assert(mo.values().get_allocator() == test_allocator<char>(7));
+    test<std::deque, std::deque>();
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_assign.pass.cpp
index 4f9797d5bf81..8aa2e7bc539f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_assign.pass.cpp
@@ -15,18 +15,20 @@
 #include <deque>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "test_macros.h"
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
   {
     // test_allocator is not propagated
     using C = test_less<int>;
-    std::vector<int, test_allocator<int>> ks({1, 3, 5}, test_allocator<int>(6));
-    std::vector<char, test_allocator<char>> vs({2, 2, 1}, test_allocator<char>(7));
+    KeyContainer<int, test_allocator<int>> ks({1, 3, 5}, test_allocator<int>(6));
+    ValueContainer<char, test_allocator<char>> vs({2, 2, 1}, test_allocator<char>(7));
     using M = std::flat_map<int, char, C, decltype(ks), decltype(vs)>;
     auto mo = M(ks, vs, C(5));
     auto m  = M({{3, 3}, {4, 4}, {5, 5}}, C(3), test_allocator<int>(2));
@@ -48,8 +50,8 @@ int main(int, char**) {
   {
     // other_allocator is propagated
     using C  = test_less<int>;
-    using Ks = std::vector<int, other_allocator<int>>;
-    using Vs = std::vector<char, other_allocator<char>>;
+    using Ks = KeyContainer<int, other_allocator<int>>;
+    using Vs = ValueContainer<char, other_allocator<char>>;
     auto ks  = Ks({1, 3, 5}, other_allocator<int>(6));
     auto vs  = Vs({2, 2, 1}, other_allocator<char>(7));
     using M  = std::flat_map<int, char, C, Ks, Vs>;
@@ -70,7 +72,7 @@ int main(int, char**) {
     assert(mo.keys().get_allocator() == other_allocator<int>(6));
     assert(mo.values().get_allocator() == other_allocator<char>(7));
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // comparator is copied and invariant is preserved
     using M = std::flat_map<int, int, std::function<bool(int, int)>>;
     M mo    = M({{1, 2}, {3, 4}}, std::less<int>());
@@ -88,5 +90,26 @@ int main(int, char**) {
     m       = static_cast<const M&>(m);
     assert((m == M{{1, 2}, {3, 4}}));
   }
+}
+
+constexpr bool test() {
+  test<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque, std::deque>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default.pass.cpp
index c5b94896b929..5ef31ef82384 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default.pass.cpp
@@ -24,23 +24,24 @@
 #include "test_allocator.h"
 
 struct DefaultCtableComp {
-  explicit DefaultCtableComp() { default_constructed_ = true; }
-  bool operator()(int, int) const { return false; }
+  constexpr explicit DefaultCtableComp() { default_constructed_ = true; }
+  constexpr bool operator()(int, int) const { return false; }
   bool default_constructed_ = false;
 };
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
   {
-    std::flat_map<int, char*> m;
+    std::flat_map<int, char*, std::less<int>, KeyContainer<int>, ValueContainer<char*>> m;
     assert(m.empty());
   }
   {
     // explicit(false)
-    std::flat_map<int, char*> m = {};
+    std::flat_map<int, char*, std::less<int>, KeyContainer<int>, ValueContainer<char*>> m = {};
     assert(m.empty());
   }
   {
-    std::flat_map<int, char*, DefaultCtableComp, std::deque<int, min_allocator<int>>> m;
+    std::flat_map<int, char*, DefaultCtableComp, KeyContainer<int, min_allocator<int>>> m;
     assert(m.empty());
     assert(m.begin() == m.end());
     assert(m.key_comp().default_constructed_);
@@ -49,13 +50,13 @@ int main(int, char**) {
     using A1 = explicit_allocator<int>;
     using A2 = explicit_allocator<char*>;
     {
-      std::flat_map<int, char*, DefaultCtableComp, std::vector<int, A1>, std::vector<char*, A2>> m;
+      std::flat_map<int, char*, DefaultCtableComp, KeyContainer<int, A1>, ValueContainer<char*, A2>> m;
       assert(m.empty());
       assert(m.key_comp().default_constructed_);
     }
     {
       A1 a1;
-      std::flat_map<int, int, DefaultCtableComp, std::vector<int, A1>, std::vector<int, A1>> m(a1);
+      std::flat_map<int, int, DefaultCtableComp, KeyContainer<int, A1>, ValueContainer<int, A1>> m(a1);
       assert(m.empty());
       assert(m.key_comp().default_constructed_);
     }
@@ -63,10 +64,31 @@ int main(int, char**) {
   {
     // If an allocator is given, it must be usable by both containers.
     using A = test_allocator<int>;
-    using M = std::flat_map<int, int, std::less<>, std::vector<int>, std::vector<int, A>>;
+    using M = std::flat_map<int, int, std::less<>, KeyContainer<int>, ValueContainer<int, A>>;
     static_assert(std::is_constructible_v<M>);
     static_assert(!std::is_constructible_v<M, std::allocator<int>>);
     static_assert(!std::is_constructible_v<M, A>);
   }
+}
+
+constexpr bool test() {
+  test<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque, std::deque>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp
index 790dfa4a02ed..9fb6785a7cf1 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp
@@ -28,11 +28,11 @@
 #include "test_allocator.h"
 
 struct ThrowingCtorComp {
-  ThrowingCtorComp() noexcept(false) {}
-  bool operator()(const auto&, const auto&) const { return false; }
+  constexpr ThrowingCtorComp() noexcept(false) {}
+  constexpr bool operator()(const auto&, const auto&) const { return false; }
 };
 
-int main(int, char**) {
+constexpr bool test() {
 #if defined(_LIBCPP_VERSION)
   {
     using C = std::flat_map<MoveOnly, MoveOnly>;
@@ -55,5 +55,15 @@ int main(int, char**) {
     static_assert(!std::is_nothrow_default_constructible_v<C>);
     C c;
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp
index 1570b0fa1488..4562b01bc8c4 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp
@@ -23,35 +23,58 @@
 #include "test_allocator.h"
 
 struct ThrowingDtorComp {
-  bool operator()(const auto&, const auto&) const;
-  ~ThrowingDtorComp() noexcept(false) {}
+  constexpr bool operator()(const auto&, const auto&) const;
+  constexpr ~ThrowingDtorComp() noexcept(false) {}
 };
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
   {
-    using C = std::flat_map<MoveOnly, MoveOnly>;
+    using C = std::flat_map<MoveOnly, MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly>, ValueContainer<MoveOnly>>;
     static_assert(std::is_nothrow_destructible_v<C>);
     C c;
   }
   {
-    using V = std::vector<MoveOnly, test_allocator<MoveOnly>>;
-    using C = std::flat_map<MoveOnly, MoveOnly, std::less<MoveOnly>, V, V>;
+    using V  = KeyContainer<MoveOnly, test_allocator<MoveOnly>>;
+    using V2 = ValueContainer<MoveOnly, test_allocator<MoveOnly>>;
+    using C  = std::flat_map<MoveOnly, MoveOnly, std::less<MoveOnly>, V, V2>;
     static_assert(std::is_nothrow_destructible_v<C>);
     C c;
   }
   {
-    using V = std::deque<MoveOnly, other_allocator<MoveOnly>>;
-    using C = std::flat_map<MoveOnly, MoveOnly, std::greater<MoveOnly>, V, V>;
+    using V  = KeyContainer<MoveOnly, test_allocator<MoveOnly>>;
+    using V2 = ValueContainer<MoveOnly, test_allocator<MoveOnly>>;
+    using C  = std::flat_map<MoveOnly, MoveOnly, std::greater<MoveOnly>, V, V2>;
     static_assert(std::is_nothrow_destructible_v<C>);
     C c;
   }
 #if defined(_LIBCPP_VERSION)
   {
-    using C = std::flat_map<MoveOnly, MoveOnly, ThrowingDtorComp>;
+    using C = std::flat_map<MoveOnly, MoveOnly, ThrowingDtorComp, KeyContainer<MoveOnly>, ValueContainer<MoveOnly>>;
     static_assert(!std::is_nothrow_destructible_v<C>);
     C c;
   }
 #endif // _LIBCPP_VERSION
+}
+
+constexpr bool test() {
+  test<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque, std::deque>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/initializer_list.pass.cpp
index 7a22746845d0..aea2002ba8d9 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/initializer_list.pass.cpp
@@ -30,12 +30,84 @@
 #include "../../../test_compare.h"
 
 struct DefaultCtableComp {
-  explicit DefaultCtableComp() { default_constructed_ = true; }
-  bool operator()(int, int) const { return false; }
+  constexpr explicit DefaultCtableComp() { default_constructed_ = true; }
+  constexpr bool operator()(int, int) const { return false; }
   bool default_constructed_ = false;
 };
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
+  std::pair<int, short> expected[] = {{1, 1}, {2, 2}, {3, 3}, {5, 2}};
+  {
+    // flat_map(initializer_list<value_type>);
+    using M                                         = std::flat_map<int, short>;
+    std::initializer_list<std::pair<int, short>> il = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}};
+    M m(il);
+    assert(std::equal(m.begin(), m.end(), expected, expected + 4));
+  }
+  {
+    // flat_map(initializer_list<value_type>);
+    // explicit(false)
+    using M = std::flat_map<int, short>;
+    M m     = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}};
+    assert(std::equal(m.begin(), m.end(), expected, expected + 4));
+  }
+  {
+    // flat_map(initializer_list<value_type>);
+    using M = std::flat_map<int, short, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
+    M m     = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}};
+    assert(std::equal(m.rbegin(), m.rend(), expected, expected + 4));
+  }
+  {
+    // flat_map(initializer_list<value_type>);
+    // different comparator
+    using A = explicit_allocator<int>;
+    using M = std::flat_map<int, int, DefaultCtableComp, KeyContainer<int, A>, ValueContainer<int, A>>;
+    M m     = {{1, 1}, {2, 2}, {3, 3}};
+    assert(m.size() == 1);
+    assert(m.begin()->first == m.begin()->second);
+    assert(m.key_comp().default_constructed_);
+  }
+  {
+    // flat_map(initializer_list<value_type>, const Allocator&);
+    using A = explicit_allocator<int>;
+    using M = std::flat_map<int, int, std::greater<int>, KeyContainer<int, A>, ValueContainer<int, A>>;
+    A a;
+    M m({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, a);
+    assert(std::equal(m.rbegin(), m.rend(), expected, expected + 4));
+  }
+  {
+    // flat_map(initializer_list<value_type>, const key_compare&);
+    using C = test_less<int>;
+    using M = std::flat_map<int, short, C>;
+    auto m  = M({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, C(10));
+    assert(std::equal(m.begin(), m.end(), expected, expected + 4));
+    assert(m.key_comp() == C(10));
+
+    // explicit(false)
+    M m2 = {{{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, C(10)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(10));
+  }
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    // flat_map(initializer_list<value_type>, const key_compare&);
+    // Sorting uses the comparator that was passed in
+    using M = std::flat_map<int, short, std::function<bool(int, int)>, KeyContainer<int, min_allocator<int>>>;
+    auto m  = M({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, std::greater<int>());
+    assert(std::equal(m.rbegin(), m.rend(), expected, expected + 4));
+    assert(m.key_comp()(2, 1) == true);
+  }
+  {
+    // flat_map(initializer_list<value_type> il, const key_compare& comp, const Alloc& a);
+    using A = explicit_allocator<int>;
+    using M = std::flat_map<int, int, std::greater<int>, KeyContainer<int, A>, ValueContainer<int, A>>;
+    A a;
+    M m({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, {}, a);
+    assert(std::equal(m.rbegin(), m.rend(), expected, expected + 4));
+  }
+}
+
+constexpr bool test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -82,76 +154,23 @@ int main(int, char**) {
         !std::is_constructible_v<M, std::initializer_list<std::pair<const int, const short>>, std::allocator<int>>);
   }
 
-  std::pair<int, short> expected[] = {{1, 1}, {2, 2}, {3, 3}, {5, 2}};
-  {
-    // flat_map(initializer_list<value_type>);
-    using M                                         = std::flat_map<int, short>;
-    std::initializer_list<std::pair<int, short>> il = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}};
-    M m(il);
-    assert(std::equal(m.begin(), m.end(), expected, expected + 4));
-  }
-  {
-    // flat_map(initializer_list<value_type>);
-    // explicit(false)
-    using M = std::flat_map<int, short>;
-    M m     = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}};
-    assert(std::equal(m.begin(), m.end(), expected, expected + 4));
-  }
-  {
-    // flat_map(initializer_list<value_type>);
-    using M = std::flat_map<int, short, std::greater<int>, std::deque<int, min_allocator<int>>>;
-    M m     = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}};
-    assert(std::equal(m.rbegin(), m.rend(), expected, expected + 4));
-  }
-  {
-    using A = explicit_allocator<int>;
-    {
-      // flat_map(initializer_list<value_type>);
-      // different comparator
-      using M = std::flat_map<int, int, DefaultCtableComp, std::vector<int, A>, std::deque<int, A>>;
-      M m     = {{1, 1}, {2, 2}, {3, 3}};
-      assert(m.size() == 1);
-      assert(m.begin()->first == m.begin()->second);
-      LIBCPP_ASSERT(*m.begin() == std::make_pair(1, 1));
-      assert(m.key_comp().default_constructed_);
-    }
-    {
-      // flat_map(initializer_list<value_type>, const Allocator&);
-      using M = std::flat_map<int, int, std::greater<int>, std::deque<int, A>, std::vector<int, A>>;
-      A a;
-      M m({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, a);
-      assert(std::equal(m.rbegin(), m.rend(), expected, expected + 4));
-    }
-  }
-  {
-    // flat_map(initializer_list<value_type>, const key_compare&);
-    using C = test_less<int>;
-    using M = std::flat_map<int, short, C>;
-    auto m  = M({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, C(10));
-    assert(std::equal(m.begin(), m.end(), expected, expected + 4));
-    assert(m.key_comp() == C(10));
+  test<std::vector, std::vector>();
 
-    // explicit(false)
-    M m2 = {{{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, C(10)};
-    assert(m2 == m);
-    assert(m2.key_comp() == C(10));
-  }
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    // flat_map(initializer_list<value_type>, const key_compare&);
-    // Sorting uses the comparator that was passed in
-    using M = std::flat_map<int, short, std::function<bool(int, int)>, std::deque<int, min_allocator<int>>>;
-    auto m  = M({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, std::greater<int>());
-    assert(std::equal(m.rbegin(), m.rend(), expected, expected + 4));
-    assert(m.key_comp()(2, 1) == true);
-  }
-  {
-    // flat_map(initializer_list<value_type> il, const key_compare& comp, const Alloc& a);
-    using A = explicit_allocator<int>;
-    using M = std::flat_map<int, int, std::greater<int>, std::deque<int, A>, std::vector<int, A>>;
-    A a;
-    M m({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, {}, a);
-    assert(std::equal(m.rbegin(), m.rend(), expected, expected + 4));
+    test<std::deque, std::deque>();
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter.pass.cpp
index 7c0c48796994..0dce4f1993c6 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter.pass.cpp
@@ -22,14 +22,163 @@
 #include <flat_map>
 #include <functional>
 #include <vector>
+#include <ranges>
 
+#include "MinSequenceContainer.h"
 #include "min_allocator.h"
 #include "test_allocator.h"
 #include "test_iterators.h"
 #include "test_macros.h"
+#include "../helpers.h"
 #include "../../../test_compare.h"
 
-int main(int, char**) {
+template <class KeyContainer, class ValueContainer>
+constexpr void test() {
+  using Key   = typename KeyContainer::value_type;
+  using Value = typename ValueContainer::value_type;
+  using P     = std::pair<Key, Value>;
+  P ar[]      = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+  {
+    // flat_map(InputIterator , InputIterator)
+    // cpp17_input_iterator
+    using M = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
+    auto m  = M(cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 9));
+    assert(std::ranges::equal(m.keys(), KeyContainer{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<Value>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+
+    // explicit(false)
+    M m2 = {cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 9)};
+    assert(m2 == m);
+  }
+  {
+    // flat_map(InputIterator , InputIterator)
+    // greater
+    using M = std::flat_map<Key, Value, std::greater<Key>, KeyContainer, ValueContainer>;
+    auto m  = M(cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 9));
+    assert(std::ranges::equal(m.keys(), KeyContainer{3, 2, 1}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<Value>>{
+            {6, 8, 9},
+            {4, 5, 7},
+            {1, 2, 3},
+        });
+  }
+  {
+    // flat_map(InputIterator , InputIterator)
+    // Test when the operands are of array type (also contiguous iterator type)
+    using M = std::flat_map<Key, Value, std::greater<Key>, KeyContainer, ValueContainer>;
+    auto m  = M(ar, ar);
+    assert(m.empty());
+  }
+  {
+    // flat_map(InputIterator , InputIterator, const key_compare&)
+    using C = test_less<Key>;
+    using M = std::flat_map<Key, Value, C, KeyContainer, ValueContainer>;
+    auto m  = M(ar, ar + 9, C(3));
+    assert(std::ranges::equal(m.keys(), KeyContainer{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<Value>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.key_comp() == C(3));
+
+    // explicit(false)
+    M m2 = {ar, ar + 9, C(3)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(3));
+  }
+}
+
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test_alloc() {
+  using P = std::pair<int, short>;
+  P ar[]  = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+
+  {
+    // flat_map(InputIterator , InputIterator, const Allocator&)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, std::less<int>, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    auto m   = M(ar, ar + 9, A1(5));
+    assert(std::ranges::equal(m.keys(), KeyContainer<int, A1>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(InputIterator , InputIterator, const Allocator&)
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, std::less<int>, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    M m      = {ar, ar + 9, A1(5)}; // implicit ctor
+    assert(std::ranges::equal(m.keys(), KeyContainer<int, A1>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(InputIterator , InputIterator, const key_compare&, const Allocator&)
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, C, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    auto m   = M(ar, ar + 9, C(3), A1(5));
+    assert(std::ranges::equal(m.keys(), KeyContainer<int, A1>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(InputIterator , InputIterator, const key_compare&, const Allocator&)
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, std::less<int>, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    M m      = {ar, ar + 9, {}, A2(5)}; // implicit ctor
+    assert(std::ranges::equal(m.keys(), KeyContainer<int, A1>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+}
+
+constexpr bool test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -57,98 +206,30 @@ int main(int, char**) {
     static_assert(!std::is_constructible_v<M3, Iter3, Iter3, const C&, const A2&>);
   }
 
-  using P      = std::pair<int, short>;
-  P ar[]       = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
-  P expected[] = {{1, 1}, {2, 4}, {3, 6}};
-  {
-    // flat_map(InputIterator , InputIterator)
-    // cpp17_input_iterator
-    using M = std::flat_map<int, short>;
-    auto m  = M(cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 9));
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+  test<std::vector<int>, std::vector<int>>();
+  test<std::vector<int>, std::vector<double>>();
+  test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
+  test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
+  test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
 
-    // explicit(false)
-    M m2 = {cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 9)};
-    assert(m2 == m);
-  }
-  {
-    // flat_map(InputIterator , InputIterator)
-    // greater
-    using M = std::flat_map<int, short, std::greater<int>, std::deque<int, min_allocator<int>>, std::deque<short>>;
-    auto m  = M(cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 9));
-    assert((m.keys() == std::deque<int, min_allocator<int>>{3, 2, 1}));
-    LIBCPP_ASSERT((m.values() == std::deque<short>{6, 4, 1}));
-  }
-  {
-    // flat_map(InputIterator , InputIterator)
-    // Test when the operands are of array type (also contiguous iterator type)
-    using M = std::flat_map<int, short, std::greater<int>, std::vector<int, min_allocator<int>>>;
-    auto m  = M(ar, ar);
-    assert(m.empty());
-  }
-  {
-    // flat_map(InputIterator , InputIterator, const key_compare&)
-    using C = test_less<int>;
-    using M = std::flat_map<int, short, C, std::vector<int>, std::deque<short>>;
-    auto m  = M(ar, ar + 9, C(3));
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.key_comp() == C(3));
+  test_alloc<std::vector, std::vector>();
 
-    // explicit(false)
-    M m2 = {ar, ar + 9, C(3)};
-    assert(m2 == m);
-    assert(m2.key_comp() == C(3));
-  }
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    // flat_map(InputIterator , InputIterator, const Allocator&)
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, std::less<int>, std::vector<int, A1>, std::deque<short, A2>>;
-    auto m   = M(ar, ar + 9, A1(5));
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // flat_map(InputIterator , InputIterator, const Allocator&)
-    // explicit(false)
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, std::less<int>, std::vector<int, A1>, std::deque<short, A2>>;
-    M m      = {ar, ar + 9, A1(5)}; // implicit ctor
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // flat_map(InputIterator , InputIterator, const key_compare&, const Allocator&)
-    using C  = test_less<int>;
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, C, std::vector<int, A1>, std::deque<short, A2>>;
-    auto m   = M(ar, ar + 9, C(3), A1(5));
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.key_comp() == C(3));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // flat_map(InputIterator , InputIterator, const key_compare&, const Allocator&)
-    // explicit(false)
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, std::less<int>, std::deque<int, A1>, std::vector<short, A2>>;
-    M m      = {ar, ar + 9, {}, A2(5)}; // implicit ctor
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
+    test<std::deque<int>, std::vector<double>>();
+    test_alloc<std::deque, std::deque>();
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move.pass.cpp
index 955d3156064a..de4cef1409ff 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move.pass.cpp
@@ -16,6 +16,7 @@
 #include <deque>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -25,11 +26,12 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
   {
     using C = test_less<int>;
     using A = test_allocator<int>;
-    using M = std::flat_map<int, int, C, std::vector<int, A>, std::deque<int, A>>;
+    using M = std::flat_map<int, int, C, KeyContainer<int, A>, ValueContainer<int, A>>;
     M mo    = M({{1, 1}, {2, 2}, {3, 1}}, C(5), A(7));
     M m     = std::move(mo);
     assert((m == M{{1, 1}, {2, 2}, {3, 1}}));
@@ -45,7 +47,7 @@ int main(int, char**) {
   {
     using C = test_less<int>;
     using A = min_allocator<int>;
-    using M = std::flat_map<int, int, C, std::vector<int, A>, std::deque<int, A>>;
+    using M = std::flat_map<int, int, C, KeyContainer<int, A>, ValueContainer<int, A>>;
     M mo    = M({{1, 1}, {2, 2}, {3, 1}}, C(5), A());
     M m     = std::move(mo);
     assert((m == M{{1, 1}, {2, 2}, {3, 1}}));
@@ -58,9 +60,9 @@ int main(int, char**) {
     assert(m.keys().get_allocator() == A());
     assert(m.values().get_allocator() == A());
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // A moved-from flat_map maintains its class invariant in the presence of moved-from comparators.
-    using M = std::flat_map<int, int, std::function<bool(int, int)>>;
+    using M = std::flat_map<int, int, std::function<bool(int, int)>, KeyContainer<int>, ValueContainer<int>>;
     M mo    = M({{1, 1}, {2, 2}, {3, 1}}, std::less<int>());
     M m     = std::move(mo);
     assert(m.size() == 3);
@@ -75,7 +77,7 @@ int main(int, char**) {
   }
   {
     // moved-from object maintains invariant if one of underlying container does not clear after move
-    using M = std::flat_map<int, int, std::less<>, std::vector<int>, CopyOnlyVector<int>>;
+    using M = std::flat_map<int, int, std::less<>, KeyContainer<int>, CopyOnlyVector<int>>;
     M m1    = M({1, 2, 3}, {1, 2, 3});
     M m2    = std::move(m1);
     assert(m2.size() == 3);
@@ -84,5 +86,26 @@ int main(int, char**) {
     LIBCPP_ASSERT(m1.keys().size() == 0);
     LIBCPP_ASSERT(m1.values().size() == 0);
   }
+}
+
+constexpr bool test() {
+  test<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque, std::deque>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_alloc.pass.cpp
index 93a397642252..94121123b6a3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_alloc.pass.cpp
@@ -24,30 +24,13 @@
 #include "../../../test_compare.h"
 #include "test_allocator.h"
 
-int main(int, char**) {
-  {
-    // The constructors in this subclause shall not participate in overload
-    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
-    // and uses_allocator_v<mapped_container_type, Alloc> is true.
-
-    using C  = test_less<int>;
-    using A1 = test_allocator<int>;
-    using A2 = other_allocator<int>;
-    using V1 = std::vector<int, A1>;
-    using V2 = std::vector<int, A2>;
-    using M1 = std::flat_map<int, int, C, V1, V1>;
-    using M2 = std::flat_map<int, int, C, V1, V2>;
-    using M3 = std::flat_map<int, int, C, V2, V1>;
-    static_assert(std::is_constructible_v<M1, M1&&, const A1&>);
-    static_assert(!std::is_constructible_v<M1, M1&&, const A2&>);
-    static_assert(!std::is_constructible_v<M2, M2&&, const A2&>);
-    static_assert(!std::is_constructible_v<M3, M3&&, const A2&>);
-  }
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
   {
     std::pair<int, int> expected[] = {{1, 1}, {2, 2}, {3, 1}};
     using C                        = test_less<int>;
     using A                        = test_allocator<int>;
-    using M                        = std::flat_map<int, int, C, std::vector<int, A>, std::deque<int, A>>;
+    using M                        = std::flat_map<int, int, C, KeyContainer<int, A>, ValueContainer<int, A>>;
     auto mo                        = M(expected, expected + 3, C(5), A(7));
     auto m                         = M(std::move(mo), A(3));
 
@@ -68,7 +51,7 @@ int main(int, char**) {
   }
   {
     // moved-from object maintains invariant if one of underlying container does not clear after move
-    using M = std::flat_map<int, int, std::less<>, std::vector<int>, CopyOnlyVector<int>>;
+    using M = std::flat_map<int, int, std::less<>, KeyContainer<int>, CopyOnlyVector<int>>;
     M m1    = M({1, 2, 3}, {1, 2, 3});
     M m2(std::move(m1), std::allocator<int>{});
     assert(m2.size() == 3);
@@ -77,6 +60,45 @@ int main(int, char**) {
     LIBCPP_ASSERT(m1.keys().size() == 0);
     LIBCPP_ASSERT(m1.values().size() == 0);
   }
+}
+
+constexpr bool test() {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
+    // and uses_allocator_v<mapped_container_type, Alloc> is true.
+
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = other_allocator<int>;
+    using V1 = std::vector<int, A1>;
+    using V2 = std::vector<int, A2>;
+    using M1 = std::flat_map<int, int, C, V1, V1>;
+    using M2 = std::flat_map<int, int, C, V1, V2>;
+    using M3 = std::flat_map<int, int, C, V2, V1>;
+    static_assert(std::is_constructible_v<M1, M1&&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, M1&&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, M2&&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, M3&&, const A2&>);
+  }
+
+  test<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque, std::deque>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign.pass.cpp
index a94c442c695d..633c73e167a8 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign.pass.cpp
@@ -26,12 +26,13 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
   {
     using C  = test_less<int>;
     using A1 = test_allocator<int>;
     using A2 = test_allocator<char>;
-    using M  = std::flat_map<int, char, C, std::vector<int, A1>, std::vector<char, A2>>;
+    using M  = std::flat_map<int, char, C, KeyContainer<int, A1>, ValueContainer<char, A2>>;
     M mo     = M({{1, 1}, {2, 3}, {3, 2}}, C(5), A1(7));
     M m      = M({}, C(3), A1(7));
     m        = std::move(mo);
@@ -46,7 +47,7 @@ int main(int, char**) {
     using C  = test_less<int>;
     using A1 = other_allocator<int>;
     using A2 = other_allocator<char>;
-    using M  = std::flat_map<int, char, C, std::deque<int, A1>, std::deque<char, A2>>;
+    using M  = std::flat_map<int, char, C, KeyContainer<int, A1>, ValueContainer<char, A2>>;
     M mo     = M({{4, 5}, {5, 4}}, C(5), A1(7));
     M m      = M({{1, 1}, {2, 2}, {3, 3}, {4, 4}}, C(3), A1(7));
     m        = std::move(mo);
@@ -59,7 +60,7 @@ int main(int, char**) {
   }
   {
     using A = min_allocator<int>;
-    using M = std::flat_map<int, int, std::greater<int>, std::vector<int, A>, std::vector<int, A>>;
+    using M = std::flat_map<int, int, std::greater<int>, KeyContainer<int, A>, ValueContainer<int, A>>;
     M mo    = M({{5, 1}, {4, 2}, {3, 3}}, A());
     M m     = M({{4, 4}, {3, 3}, {2, 2}, {1, 1}}, A());
     m       = std::move(mo);
@@ -69,6 +70,26 @@ int main(int, char**) {
     assert(vs.get_allocator() == A());
     assert(mo.empty());
   }
+}
+
+constexpr bool test() {
+  test<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque, std::deque>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
-}
+}
\ No newline at end of file
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_clears.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_clears.pass.cpp
index f28d52dd4e46..0b52d67b0a9b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_clears.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_clears.pass.cpp
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <cassert>
 #include <compare>
+#include <deque>
 #include <flat_map>
 #include <functional>
 #include <utility>
@@ -27,9 +28,9 @@
 struct MoveNegates {
   int value_    = 0;
   MoveNegates() = default;
-  MoveNegates(int v) : value_(v) {}
-  MoveNegates(MoveNegates&& rhs) : value_(rhs.value_) { rhs.value_ = -rhs.value_; }
-  MoveNegates& operator=(MoveNegates&& rhs) {
+  constexpr MoveNegates(int v) : value_(v) {}
+  constexpr MoveNegates(MoveNegates&& rhs) : value_(rhs.value_) { rhs.value_ = -rhs.value_; }
+  constexpr MoveNegates& operator=(MoveNegates&& rhs) {
     value_     = rhs.value_;
     rhs.value_ = -rhs.value_;
     return *this;
@@ -41,9 +42,9 @@ struct MoveNegates {
 struct MoveClears {
   int value_   = 0;
   MoveClears() = default;
-  MoveClears(int v) : value_(v) {}
-  MoveClears(MoveClears&& rhs) : value_(rhs.value_) { rhs.value_ = 0; }
-  MoveClears& operator=(MoveClears&& rhs) {
+  constexpr MoveClears(int v) : value_(v) {}
+  constexpr MoveClears(MoveClears&& rhs) : value_(rhs.value_) { rhs.value_ = 0; }
+  constexpr MoveClears& operator=(MoveClears&& rhs) {
     value_     = rhs.value_;
     rhs.value_ = 0;
     return *this;
@@ -52,11 +53,12 @@ struct MoveClears {
   auto operator<=>(const MoveClears&) const = default;
 };
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
   auto value_eq = [](auto&& p, auto&& q) { return p.first == q.first; };
   {
     const std::pair<int, int> expected[] = {{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}};
-    using M = std::flat_map<MoveNegates, int, std::less<MoveNegates>, std::vector<MoveNegates>>;
+    using M = std::flat_map<MoveNegates, int, std::less<MoveNegates>, KeyContainer<MoveNegates>, ValueContainer<int>>;
     M m     = M(expected, expected + 8);
     M m2    = M(expected, expected + 3);
 
@@ -73,7 +75,7 @@ int main(int, char**) {
   }
   {
     const std::pair<int, int> expected[] = {{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}};
-    using M = std::flat_map<MoveClears, int, std::less<MoveClears>, std::vector<MoveClears>>;
+    using M = std::flat_map<MoveClears, int, std::less<MoveClears>, KeyContainer<MoveClears>, ValueContainer<int>>;
     M m     = M(expected, expected + 8);
     M m2    = M(expected, expected + 3);
 
@@ -90,7 +92,7 @@ int main(int, char**) {
   }
   {
     // moved-from object maintains invariant if one of underlying container does not clear after move
-    using M = std::flat_map<int, int, std::less<>, std::vector<int>, CopyOnlyVector<int>>;
+    using M = std::flat_map<int, int, std::less<>, KeyContainer<int>, CopyOnlyVector<int>>;
     M m1    = M({1, 2, 3}, {1, 2, 3});
     M m2    = M({1, 2}, {1, 2});
     m2      = std::move(m1);
@@ -100,5 +102,26 @@ int main(int, char**) {
     LIBCPP_ASSERT(m1.keys().size() == 0);
     LIBCPP_ASSERT(m1.values().size() == 0);
   }
+}
+
+constexpr bool test() {
+  test<std::vector, std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque, std::deque>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_noexcept.compile.pass.cpp
similarity index 99%
rename from libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_noexcept.pass.cpp
rename to libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_noexcept.compile.pass.cpp
index 665b763e6c4f..18d332cf27b5 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/move_assign_noexcept.compile.pass.cpp
@@ -49,7 +49,7 @@ struct MoveThrowsComp {
   bool operator()(const auto&, const auto&) const;
 };
 
-int main(int, char**) {
+void test() {
   {
     using C = std::flat_map<int, int>;
     LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v<C>);
@@ -105,6 +105,4 @@ int main(int, char**) {
     using C = std::flat_map<int, int, std::less<int>, std::vector<int>, std::pmr::vector<int>>;
     static_assert(!std::is_nothrow_move_assignable_v<C>);
   }
-
-  return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/range.pass.cpp
index 282cc71f3199..9f748738a545 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/range.pass.cpp
@@ -27,9 +27,11 @@
 #include <vector>
 
 #include "min_allocator.h"
+#include "MinSequenceContainer.h"
 #include "test_allocator.h"
 #include "test_iterators.h"
 #include "test_macros.h"
+#include "../helpers.h"
 #include "../../../test_compare.h"
 
 // test constraint container-compatible-range
@@ -66,7 +68,171 @@ static_assert(std::is_constructible_v<Map,
 static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<int>, std::less<int>, std::allocator<int>>);
 static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<double>, std::less<int>, std::allocator<int>>);
 
-int main(int, char**) {
+template <class KeyContainer, class ValueContainer>
+constexpr void test() {
+  using Key   = typename KeyContainer::value_type;
+  using Value = typename ValueContainer::value_type;
+  using P     = std::pair<Key, Value>;
+  P ar[]      = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+  {
+    // flat_map(from_range_t, R&&)
+    // input_range && !common
+    using M    = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
+    using Iter = cpp20_input_iterator<const P*>;
+    using Sent = sentinel_wrapper<Iter>;
+    using R    = std::ranges::subrange<Iter, Sent>;
+    auto m     = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
+    assert(std::ranges::equal(m.keys(), KeyContainer{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<Value>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+
+    // explicit(false)
+    M m2 = {std::from_range, R(Iter(ar), Sent(Iter(ar + 9)))};
+    assert(m2 == m);
+  }
+  {
+    // flat_map(from_range_t, R&&)
+    // greater
+    using M    = std::flat_map<Key, Value, std::greater<int>, KeyContainer, ValueContainer>;
+    using Iter = cpp20_input_iterator<const P*>;
+    using Sent = sentinel_wrapper<Iter>;
+    using R    = std::ranges::subrange<Iter, Sent>;
+    auto m     = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
+    assert(std::ranges::equal(m.keys(), KeyContainer{3, 2, 1}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<Value>>{
+            {6, 8, 9},
+            {4, 5, 7},
+            {1, 2, 3},
+        });
+  }
+  {
+    // flat_map(from_range_t, R&&)
+    // contiguous range
+    using M = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
+    using R = std::ranges::subrange<const P*>;
+    auto m  = M(std::from_range, R(ar, ar + 9));
+    assert(std::ranges::equal(m.keys(), KeyContainer{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<Value>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+  }
+  {
+    // flat_map(from_range_t, R&&, const key_compare&)
+    using C = test_less<int>;
+    using M = std::flat_map<Key, Value, C, KeyContainer, ValueContainer>;
+    using R = std::ranges::subrange<const P*>;
+    auto m  = M(std::from_range, R(ar, ar + 9), C(3));
+    assert(std::ranges::equal(m.keys(), KeyContainer{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<Value>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.key_comp() == C(3));
+
+    // explicit(false)
+    M m2 = {std::from_range, R(ar, ar + 9), C(3)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(3));
+  }
+}
+
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test_alloc() {
+  using P = std::pair<int, short>;
+  P ar[]  = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+  {
+    // flat_map(from_range_t, R&&, const Allocator&)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, std::less<int>, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    auto m   = M(std::from_range, R(ar, ar + 9), A1(5));
+    assert(std::ranges::equal(m.keys(), KeyContainer<int, A1>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(from_range_t, R&&, const Allocator&)
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, std::less<int>, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    M m      = {std::from_range, R(ar, ar + 9), A1(5)}; // implicit ctor
+    assert(std::ranges::equal(m.keys(), KeyContainer<int, A1>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(from_range_t, R&&, const key_compare&, const Allocator&)
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, C, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    auto m   = M(std::from_range, R(ar, ar + 9), C(3), A1(5));
+    assert(std::ranges::equal(m.keys(), KeyContainer<int, A1>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(from_range_t, R&&, const key_compare&, const Allocator&)
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, std::less<int>, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    M m      = {std::from_range, R(ar, ar + 9), {}, A2(5)}; // implicit ctor
+    assert(std::ranges::equal(m.keys(), KeyContainer<int, A1>{1, 2, 3}));
+    check_possible_values(
+        m.values(),
+        std::vector<std::vector<short>>{
+            {1, 2, 3},
+            {4, 5, 7},
+            {6, 8, 9},
+        });
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+}
+
+constexpr bool test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -117,111 +283,28 @@ int main(int, char**) {
     static_assert(!std::is_constructible_v<M, std::from_range_t, std::vector<NonPairLike>&, const C&, const A1&>);
   }
 
-  using P      = std::pair<int, short>;
-  P ar[]       = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
-  P expected[] = {{1, 1}, {2, 4}, {3, 6}};
-  {
-    // flat_map(from_range_t, R&&)
-    // input_range && !common
-    using M    = std::flat_map<int, short>;
-    using Iter = cpp20_input_iterator<const P*>;
-    using Sent = sentinel_wrapper<Iter>;
-    using R    = std::ranges::subrange<Iter, Sent>;
-    auto m     = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+  test<std::vector<int>, std::vector<int>>();
+  test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
+  test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-    // explicit(false)
-    M m2 = {std::from_range, R(Iter(ar), Sent(Iter(ar + 9)))};
-    assert(m2 == m);
-  }
-  {
-    // flat_map(from_range_t, R&&)
-    // greater
-    using M    = std::flat_map<int, short, std::greater<int>, std::deque<int, min_allocator<int>>, std::deque<short>>;
-    using Iter = cpp20_input_iterator<const P*>;
-    using Sent = sentinel_wrapper<Iter>;
-    using R    = std::ranges::subrange<Iter, Sent>;
-    auto m     = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
-    assert((m.keys() == std::deque<int, min_allocator<int>>{3, 2, 1}));
-    LIBCPP_ASSERT((m.values() == std::deque<short>{6, 4, 1}));
-  }
-  {
-    // flat_map(from_range_t, R&&)
-    // contiguous range
-    using M = std::flat_map<int, short>;
-    using R = std::ranges::subrange<const P*>;
-    auto m  = M(std::from_range, R(ar, ar + 9));
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-  }
-  {
-    // flat_map(from_range_t, R&&, const key_compare&)
-    using C = test_less<int>;
-    using M = std::flat_map<int, short, C, std::vector<int>, std::deque<short>>;
-    using R = std::ranges::subrange<const P*>;
-    auto m  = M(std::from_range, R(ar, ar + 9), C(3));
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.key_comp() == C(3));
+  test_alloc<std::vector, std::vector>();
 
-    // explicit(false)
-    M m2 = {std::from_range, R(ar, ar + 9), C(3)};
-    assert(m2 == m);
-    assert(m2.key_comp() == C(3));
-  }
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    // flat_map(from_range_t, R&&, const Allocator&)
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, std::less<int>, std::vector<int, A1>, std::deque<short, A2>>;
-    using R  = std::ranges::subrange<const P*>;
-    auto m   = M(std::from_range, R(ar, ar + 9), A1(5));
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // flat_map(from_range_t, R&&, const Allocator&)
-    // explicit(false)
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, std::less<int>, std::vector<int, A1>, std::deque<short, A2>>;
-    using R  = std::ranges::subrange<const P*>;
-    M m      = {std::from_range, R(ar, ar + 9), A1(5)}; // implicit ctor
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // flat_map(from_range_t, R&&, const key_compare&, const Allocator&)
-    using C  = test_less<int>;
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, C, std::vector<int, A1>, std::deque<short, A2>>;
-    using R  = std::ranges::subrange<const P*>;
-    auto m   = M(std::from_range, R(ar, ar + 9), C(3), A1(5));
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.key_comp() == C(3));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // flat_map(from_range_t, R&&, const key_compare&, const Allocator&)
-    // explicit(false)
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, std::less<int>, std::deque<int, A1>, std::vector<short, A2>>;
-    using R  = std::ranges::subrange<const P*>;
-    M m      = {std::from_range, R(ar, ar + 9), {}, A2(5)}; // implicit ctor
-    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
-    LIBCPP_ASSERT(std::ranges::equal(m, expected));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
+    test<std::deque<int>, std::vector<double>>();
+    test_alloc<std::deque, std::deque>();
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_container.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_container.pass.cpp
index 3c8868f2ff42..4c583594b250 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_container.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_container.pass.cpp
@@ -31,9 +31,117 @@
 #include "test_allocator.h"
 #include "test_iterators.h"
 #include "test_macros.h"
+#include "../helpers.h"
 #include "../../../test_compare.h"
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
+  {
+    // flat_map(sorted_unique_t, key_container_type , mapped_container_type)
+    using M                 = std::flat_map<int, char, std::less<int>, KeyContainer<int>, ValueContainer<char>>;
+    KeyContainer<int> ks    = {1, 2, 4, 10};
+    ValueContainer<char> vs = {4, 3, 2, 1};
+    auto ks2                = ks;
+    auto vs2                = vs;
+
+    auto m = M(std::sorted_unique, ks, vs);
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    m = M(std::sorted_unique, std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+
+    // explicit(false)
+    M m2 = {std::sorted_unique, std::move(ks2), std::move(vs2)};
+    assert(m == m2);
+  }
+  {
+    // flat_map(sorted_unique_t, key_container_type , mapped_container_type)
+    // non-default container, comparator and allocator type
+    using Ks = KeyContainer<int, min_allocator<int>>;
+    using Vs = ValueContainer<char, min_allocator<char>>;
+    using M  = std::flat_map<int, char, std::greater<int>, Ks, Vs>;
+    Ks ks    = {10, 4, 2, 1};
+    Vs vs    = {1, 2, 3, 4};
+    auto m   = M(std::sorted_unique, ks, vs);
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    m = M(std::sorted_unique, std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+  }
+  {
+    // flat_map(sorted_unique_t, key_container_type , mapped_container_type)
+    // allocator copied into the containers
+    using A = test_allocator<int>;
+    using M = std::flat_map<int, int, std::less<int>, KeyContainer<int, A>, ValueContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 2, 4, 10}, A(4));
+    auto vs = ValueContainer<int, A>({4, 3, 2, 1}, A(5));
+    auto m  = M(std::sorted_unique, std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    assert(m.keys().get_allocator() == A(4));
+    assert(m.values().get_allocator() == A(5));
+  }
+  {
+    // flat_map(sorted_unique_t, key_container_type , mapped_container_type, key_compare)
+    using C                 = test_less<int>;
+    using M                 = std::flat_map<int, char, C, KeyContainer<int>, ValueContainer<char>>;
+    KeyContainer<int> ks    = {1, 2, 4, 10};
+    ValueContainer<char> vs = {4, 3, 2, 1};
+
+    auto m = M(std::sorted_unique, ks, vs, C(4));
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    assert(m.key_comp() == C(4));
+
+    // explicit(false)
+    M m2 = {std::sorted_unique, ks, vs, C(4)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(4));
+  }
+  {
+    // flat_map(sorted_unique_t, key_container_type , mapped_container_type, key_compare, const Allocator&)
+    using C                   = test_less<int>;
+    using A                   = test_allocator<int>;
+    using M                   = std::flat_map<int, int, C, KeyContainer<int, A>, ValueContainer<int, A>>;
+    KeyContainer<int, A> ks   = {1, 2, 4, 10};
+    ValueContainer<int, A> vs = {4, 3, 2, 1};
+    auto m                    = M(std::sorted_unique, ks, vs, C(4), A(5));
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    assert(m.key_comp() == C(4));
+    assert(m.keys().get_allocator() == A(5));
+    assert(m.values().get_allocator() == A(5));
+
+    // explicit(false)
+    M m2 = {ks, vs, C(4), A(5)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(4));
+    assert(m2.keys().get_allocator() == A(5));
+    assert(m2.values().get_allocator() == A(5));
+  }
+  {
+    // flat_map(sorted_unique_t, key_container_type , mapped_container_type, const Allocator&)
+    using A = test_allocator<int>;
+    using M = std::flat_map<int, int, std::less<int>, KeyContainer<int, A>, ValueContainer<int, A>>;
+    auto ks = KeyContainer<int, A>({1, 2, 4, 10}, A(4));
+    auto vs = ValueContainer<int, A>({4, 3, 2, 1}, A(5));
+    auto m  = M(std::sorted_unique, ks, vs, A(6)); // replaces the allocators
+    assert(!ks.empty());                           // it was an lvalue above
+    assert(!vs.empty());                           // it was an lvalue above
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    assert(m.keys().get_allocator() == A(6));
+    assert(m.values().get_allocator() == A(6));
+
+    // explicit(false)
+    M m2 = {std::sorted_unique, ks, vs, A(6)};
+    assert(m2 == m);
+    assert(m2.keys().get_allocator() == A(6));
+    assert(m2.values().get_allocator() == A(6));
+  }
+}
+
+constexpr bool test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -57,109 +165,25 @@ int main(int, char**) {
     static_assert(!std::is_constructible_v<M2, std::sorted_unique_t, const V1&, const V2&, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M3, std::sorted_unique_t, const V2&, const V1&, const C&, const A2&>);
   }
-  {
-    // flat_map(sorted_unique_t, key_container_type , mapped_container_type)
-    using M              = std::flat_map<int, char>;
-    std::vector<int> ks  = {1, 2, 4, 10};
-    std::vector<char> vs = {4, 3, 2, 1};
-    auto ks2             = ks;
-    auto vs2             = vs;
 
-    auto m = M(std::sorted_unique, ks, vs);
-    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
-    m = M(std::sorted_unique, std::move(ks), std::move(vs));
-    assert(ks.empty()); // it was moved-from
-    assert(vs.empty()); // it was moved-from
-    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+  test<std::vector, std::vector>();
 
-    // explicit(false)
-    M m2 = {std::sorted_unique, std::move(ks2), std::move(vs2)};
-    assert(m == m2);
-  }
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    // flat_map(sorted_unique_t, key_container_type , mapped_container_type)
-    // non-default container, comparator and allocator type
-    using Ks = std::deque<int, min_allocator<int>>;
-    using Vs = std::deque<char, min_allocator<char>>;
-    using M  = std::flat_map<int, char, std::greater<int>, Ks, Vs>;
-    Ks ks    = {10, 4, 2, 1};
-    Vs vs    = {1, 2, 3, 4};
-    auto m   = M(std::sorted_unique, ks, vs);
-    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
-    m = M(std::sorted_unique, std::move(ks), std::move(vs));
-    assert(ks.empty()); // it was moved-from
-    assert(vs.empty()); // it was moved-from
-    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    test<std::deque, std::vector>();
+    test<std::deque, std::deque>();
   }
-  {
-    // flat_map(sorted_unique_t, key_container_type , mapped_container_type)
-    // allocator copied into the containers
-    using A = test_allocator<int>;
-    using M = std::flat_map<int, int, std::less<int>, std::vector<int, A>, std::deque<int, A>>;
-    auto ks = std::vector<int, A>({1, 2, 4, 10}, A(4));
-    auto vs = std::deque<int, A>({4, 3, 2, 1}, A(5));
-    auto m  = M(std::sorted_unique, std::move(ks), std::move(vs));
-    assert(ks.empty()); // it was moved-from
-    assert(vs.empty()); // it was moved-from
-    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
-    assert(m.keys().get_allocator() == A(4));
-    assert(m.values().get_allocator() == A(5));
-  }
-  {
-    // flat_map(sorted_unique_t, key_container_type , mapped_container_type, key_compare)
-    using C              = test_less<int>;
-    using M              = std::flat_map<int, char, C>;
-    std::vector<int> ks  = {1, 2, 4, 10};
-    std::vector<char> vs = {4, 3, 2, 1};
 
-    auto m = M(std::sorted_unique, ks, vs, C(4));
-    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
-    assert(m.key_comp() == C(4));
+  return true;
+}
 
-    // explicit(false)
-    M m2 = {std::sorted_unique, ks, vs, C(4)};
-    assert(m2 == m);
-    assert(m2.key_comp() == C(4));
-  }
-  {
-    // flat_map(sorted_unique_t, key_container_type , mapped_container_type, key_compare, const Allocator&)
-    using C                = test_less<int>;
-    using A                = test_allocator<int>;
-    using M                = std::flat_map<int, int, C, std::vector<int, A>, std::vector<int, A>>;
-    std::vector<int, A> ks = {1, 2, 4, 10};
-    std::vector<int, A> vs = {4, 3, 2, 1};
-    auto m                 = M(std::sorted_unique, ks, vs, C(4), A(5));
-    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
-    assert(m.key_comp() == C(4));
-    assert(m.keys().get_allocator() == A(5));
-    assert(m.values().get_allocator() == A(5));
-
-    // explicit(false)
-    M m2 = {ks, vs, C(4), A(5)};
-    assert(m2 == m);
-    assert(m2.key_comp() == C(4));
-    assert(m2.keys().get_allocator() == A(5));
-    assert(m2.values().get_allocator() == A(5));
-  }
-  {
-    // flat_map(sorted_unique_t, key_container_type , mapped_container_type, const Allocator&)
-    using A = test_allocator<int>;
-    using M = std::flat_map<int, int, std::less<int>, std::vector<int, A>, std::deque<int, A>>;
-    auto ks = std::vector<int, A>({1, 2, 4, 10}, A(4));
-    auto vs = std::deque<int, A>({4, 3, 2, 1}, A(5));
-    auto m  = M(std::sorted_unique, ks, vs, A(6)); // replaces the allocators
-    assert(!ks.empty());                           // it was an lvalue above
-    assert(!vs.empty());                           // it was an lvalue above
-    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
-    assert(m.keys().get_allocator() == A(6));
-    assert(m.values().get_allocator() == A(6));
-
-    // explicit(false)
-    M m2 = {std::sorted_unique, ks, vs, A(6)};
-    assert(m2 == m);
-    assert(m2.keys().get_allocator() == A(6));
-    assert(m2.values().get_allocator() == A(6));
-  }
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_initializer_list.pass.cpp
index 26452472ba20..e8ac5d3961f7 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_initializer_list.pass.cpp
@@ -22,6 +22,7 @@
 #include <deque>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "min_allocator.h"
@@ -31,13 +32,87 @@
 #include "../../../test_compare.h"
 
 template <class T, class U>
-std::initializer_list<std::pair<T, U>> il = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+constexpr std::initializer_list<std::pair<T, U>> il = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
 
-const auto il1 = il<int, int>;
-const auto il2 = il<int, short>;
-const auto il3 = il<short, int>;
+constexpr auto il1 = il<int, int>;
+constexpr auto il2 = il<int, short>;
+constexpr auto il3 = il<short, int>;
 
-int main(int, char**) {
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test() {
+  {
+    // flat_map(sorted_unique_t, initializer_list<value_type>);
+    using M       = std::flat_map<int, int, std::less<int>, KeyContainer<int>, ValueContainer<int>>;
+    auto m        = M(std::sorted_unique, il1);
+    auto expected = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+
+    // explicit(false)
+    M m2 = {std::sorted_unique, il1};
+    assert(m2 == m);
+  }
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    // flat_map(sorted_unique_t, initializer_list<value_type>, const key_compare&);
+    using M = std::flat_map<int, int, std::function<bool(int, int)>, KeyContainer<int>, ValueContainer<int>>;
+    auto m  = M(std::sorted_unique, il1, std::less<int>());
+    assert(m == M({{1, 1}, {2, 2}, {4, 4}, {5, 5}}, std::less<>()));
+    assert(m.key_comp()(1, 2) == true);
+
+    // explicit(false)
+    M m2 = {std::sorted_unique, il1, std::less<int>()};
+    assert(m2 == m);
+  }
+  {
+    // flat_map(sorted_unique_t, initializer_list<value_type>, const key_compare&);
+    // greater
+    using M = std::flat_map<int, int, std::greater<int>, KeyContainer<int, min_allocator<int>>, ValueContainer<int>>;
+    std::initializer_list<std::pair<int, int>> il4{{5, 5}, {4, 4}, {2, 2}, {1, 1}};
+    auto m = M(std::sorted_unique, il4, std::greater<int>());
+    assert((m == M{{5, 5}, {4, 4}, {2, 2}, {1, 1}}));
+  }
+  {
+    // flat_map(sorted_unique_t, initializer_list<value_type>,  const Allocator&)
+    using A1      = test_allocator<int>;
+    using A2      = test_allocator<short>;
+    using M       = std::flat_map<int, short, std::less<int>, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    auto m        = M(std::sorted_unique, il2, A1(5));
+    auto expected = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+
+    // explicit(false)
+    M m2 = {std::sorted_unique, il2, A1(5)};
+    assert(m2 == m);
+    assert(m2.keys().get_allocator() == A1(5));
+    assert(m2.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(sorted_unique_t, initializer_list<value_type>, const key_compare&, const Allocator&);
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, C, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    auto m   = M(std::sorted_unique, il2, C(3), A1(5));
+    assert((m == M{{1, 1}, {2, 2}, {4, 4}, {5, 5}}));
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(sorted_unique_t, initializer_list<value_type>, const key_compare&, const Allocator&);
+    // explicit(false)
+    using A1 = test_allocator<short>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_map<short, int, std::less<int>, KeyContainer<short, A1>, ValueContainer<int, A2>>;
+    M m      = {std::sorted_unique, il3, {}, A1(5)}; // implicit ctor
+    assert((m == M{{1, 1}, {2, 2}, {4, 4}, {5, 5}}));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+}
+
+constexpr bool test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -104,76 +179,23 @@ int main(int, char**) {
                                            std::allocator<int>>);
   }
 
-  {
-    // flat_map(sorted_unique_t, initializer_list<value_type>);
-    using M       = std::flat_map<int, int>;
-    auto m        = M(std::sorted_unique, il1);
-    auto expected = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    assert(m == expected);
+  test<std::vector, std::vector>();
 
-    // explicit(false)
-    M m2 = {std::sorted_unique, il1};
-    assert(m2 == m);
-  }
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    // flat_map(sorted_unique_t, initializer_list<value_type>, const key_compare&);
-    using M = std::flat_map<int, int, std::function<bool(int, int)>>;
-    auto m  = M(std::sorted_unique, il1, std::less<int>());
-    assert(m == M({{1, 1}, {2, 2}, {4, 4}, {5, 5}}, std::less<>()));
-    assert(m.key_comp()(1, 2) == true);
+    test<std::deque, std::deque>();
+  }
 
-    // explicit(false)
-    M m2 = {std::sorted_unique, il1, std::less<int>()};
-    assert(m2 == m);
-  }
-  {
-    // flat_map(sorted_unique_t, initializer_list<value_type>, const key_compare&);
-    // greater
-    using M = std::flat_map<int, int, std::greater<int>, std::deque<int, min_allocator<int>>, std::vector<int>>;
-    std::initializer_list<std::pair<int, int>> il4{{5, 5}, {4, 4}, {2, 2}, {1, 1}};
-    auto m = M(std::sorted_unique, il4, std::greater<int>());
-    assert((m == M{{5, 5}, {4, 4}, {2, 2}, {1, 1}}));
-  }
-  {
-    // flat_map(sorted_unique_t, initializer_list<value_type>,  const Allocator&)
-    using A1      = test_allocator<int>;
-    using A2      = test_allocator<short>;
-    using M       = std::flat_map<int, short, std::less<int>, std::vector<int, A1>, std::deque<short, A2>>;
-    auto m        = M(std::sorted_unique, il2, A1(5));
-    auto expected = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    assert(m == expected);
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
+  return true;
+}
 
-    // explicit(false)
-    M m2 = {std::sorted_unique, il2, A1(5)};
-    assert(m2 == m);
-    assert(m2.keys().get_allocator() == A1(5));
-    assert(m2.values().get_allocator() == A2(5));
-  }
-  {
-    // flat_map(sorted_unique_t, initializer_list<value_type>, const key_compare&, const Allocator&);
-    using C  = test_less<int>;
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, C, std::vector<int, A1>, std::deque<short, A2>>;
-    auto m   = M(std::sorted_unique, il2, C(3), A1(5));
-    assert((m == M{{1, 1}, {2, 2}, {4, 4}, {5, 5}}));
-    assert(m.key_comp() == C(3));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // flat_map(sorted_unique_t, initializer_list<value_type>, const key_compare&, const Allocator&);
-    // explicit(false)
-    using A1 = test_allocator<short>;
-    using A2 = test_allocator<int>;
-    using M  = std::flat_map<short, int, std::less<int>, std::deque<short, A1>, std::vector<int, A2>>;
-    M m      = {std::sorted_unique, il3, {}, A1(5)}; // implicit ctor
-    assert((m == M{{1, 1}, {2, 2}, {4, 4}, {5, 5}}));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_iter_iter.pass.cpp
index 8eb7547e917c..f853a083e477 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/sorted_iter_iter.pass.cpp
@@ -20,15 +20,141 @@
 #include <deque>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
+#include "MinSequenceContainer.h"
 #include "min_allocator.h"
 #include "test_allocator.h"
 #include "test_iterators.h"
 #include "test_macros.h"
 #include "../../../test_compare.h"
 
-int main(int, char**) {
+template <class KeyContainer, class ValueContainer>
+constexpr void test() {
+  using Key   = typename KeyContainer::value_type;
+  using Value = typename ValueContainer::value_type;
+
+  {
+    // flat_map(sorted_unique_t, InputIterator, InputIterator);
+    // cpp17_input_iterator
+    using M       = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
+    using P       = std::pair<Key, Value>;
+    P ar[]        = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    auto m        = M(std::sorted_unique, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4));
+    auto expected = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+
+    // explicit(false)
+    M m2 = {std::sorted_unique, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4)};
+    assert(m2 == m);
+  }
+  {
+    // flat_map(sorted_unique_t, InputIterator, InputIterator);
+    // contiguous iterator
+    using C       = test_less<Key>;
+    using P       = std::pair<Key, Value>;
+    using M       = std::flat_map<Key, Value, C, KeyContainer, ValueContainer>;
+    P ar[]        = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    auto m        = M(std::sorted_unique, ar, ar + 4);
+    auto expected = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+  }
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&);
+    // cpp_17_input_iterator
+    using M = std::flat_map<Key, Value, std::function<bool(Key, Value)>>;
+    using P = std::pair<Key, Value>;
+    P ar[]  = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    auto m  = M(std::sorted_unique,
+               cpp17_input_iterator<const P*>(ar),
+               cpp17_input_iterator<const P*>(ar + 4),
+               std::less<int>());
+    assert(m == M({{1, 1}, {2, 2}, {4, 4}, {5, 5}}, std::less<>()));
+    assert(m.key_comp()(1, 2) == true);
+
+    // explicit(false)
+    M m2 = {std::sorted_unique,
+            cpp17_input_iterator<const P*>(ar),
+            cpp17_input_iterator<const P*>(ar + 4),
+            std::less<int>()};
+    assert(m2 == m);
+  }
+  {
+    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&);
+    // greater
+    using M = std::flat_map<Key, Value, std::greater<int>, KeyContainer, ValueContainer>;
+    using P = std::pair<Key, Value>;
+    P ar[]  = {{5, 5}, {4, 4}, {2, 2}, {1, 1}};
+    auto m  = M(std::sorted_unique,
+               cpp17_input_iterator<const P*>(ar),
+               cpp17_input_iterator<const P*>(ar + 4),
+               std::greater<int>());
+    assert((m == M{{5, 5}, {4, 4}, {2, 2}, {1, 1}}));
+  }
+  {
+    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&);
+    // contiguous iterator
+    using C                     = test_less<Key>;
+    using M                     = std::flat_map<Key, Value, C, KeyContainer, ValueContainer>;
+    std::pair<Key, Value> ar[1] = {{42, 42}};
+    auto m                      = M(std::sorted_unique, ar, ar, C(5));
+    assert(m.empty());
+    assert(m.key_comp() == C(5));
+  }
+}
+
+template <template <class...> class KeyContainer, template <class...> class ValueContainer>
+constexpr void test_alloc() {
+  {
+    // flat_map(sorted_unique_t, InputIterator , InputIterator, const Allocator&)
+    using A1      = test_allocator<int>;
+    using A2      = test_allocator<short>;
+    using M       = std::flat_map<int, short, std::less<int>, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    using P       = std::pair<int, int>;
+    P ar[]        = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    auto m        = M(std::sorted_unique, ar, ar + 4, A1(5));
+    auto expected = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+
+    // explicit(false)
+    M m2 = {std::sorted_unique, ar, ar + 4, A1(5)};
+    assert(m2 == m);
+    assert(m2.keys().get_allocator() == A1(5));
+    assert(m2.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_map<int, short, C, KeyContainer<int, A1>, ValueContainer<short, A2>>;
+    using P  = std::pair<int, int>;
+    P ar[]   = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    auto m   = M(std::sorted_unique, ar, ar + 4, C(3), A1(5));
+    assert((m == M{{1, 1}, {2, 2}, {4, 4}, {5, 5}}));
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
+    // explicit(false)
+    using A1 = test_allocator<short>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_map<short, int, std::less<int>, KeyContainer<short, A1>, ValueContainer<int, A2>>;
+    using P  = std::pair<int, int>;
+    P ar[]   = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
+    M m      = {std::sorted_unique, ar, ar + 4, {}, A1(5)}; // implicit ctor
+    assert((m == M{{1, 1}, {2, 2}, {4, 4}, {5, 5}}));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+}
+
+constexpr bool test() {
   {
     // The constructors in this subclause shall not participate in overload
     // resolution unless uses_allocator_v<key_container_type, Alloc> is true
@@ -54,118 +180,31 @@ int main(int, char**) {
     static_assert(!std::is_constructible_v<M2, std::sorted_unique_t, Iter2, Iter2, const C&, const A2&>);
     static_assert(!std::is_constructible_v<M3, std::sorted_unique_t, Iter3, Iter3, const C&, const A2&>);
   }
-  {
-    // flat_map(sorted_unique_t, InputIterator, InputIterator);
-    // cpp17_input_iterator
-    using M       = std::flat_map<int, int>;
-    using P       = std::pair<int, int>;
-    P ar[]        = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    auto m        = M(std::sorted_unique, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4));
-    auto expected = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    assert(m == expected);
 
-    // explicit(false)
-    M m2 = {std::sorted_unique, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4)};
-    assert(m2 == m);
-  }
-  {
-    // flat_map(sorted_unique_t, InputIterator, InputIterator);
-    // contiguous iterator
-    using C = test_less<int>;
-    using M = std::flat_map<int, int, C, std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>;
-    std::pair<int, int> ar[] = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    auto m                   = M(std::sorted_unique, ar, ar + 4);
-    auto expected            = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    assert(m == expected);
-  }
-  {
-    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&);
-    // cpp_17_input_iterator
-    using M = std::flat_map<int, int, std::function<bool(int, int)>>;
-    using P = std::pair<int, int>;
-    P ar[]  = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    auto m  = M(std::sorted_unique,
-               cpp17_input_iterator<const P*>(ar),
-               cpp17_input_iterator<const P*>(ar + 4),
-               std::less<int>());
-    assert(m == M({{1, 1}, {2, 2}, {4, 4}, {5, 5}}, std::less<>()));
-    assert(m.key_comp()(1, 2) == true);
+  test<std::vector<int>, std::vector<int>>();
+  test<std::vector<int>, std::vector<double>>();
+  test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
+  test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
+  test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
 
-    // explicit(false)
-    M m2 = {std::sorted_unique,
-            cpp17_input_iterator<const P*>(ar),
-            cpp17_input_iterator<const P*>(ar + 4),
-            std::less<int>()};
-    assert(m2 == m);
-  }
-  {
-    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&);
-    // greater
-    using M = std::flat_map<int, int, std::greater<int>, std::deque<int, min_allocator<int>>, std::vector<int>>;
-    using P = std::pair<int, int>;
-    P ar[]  = {{5, 5}, {4, 4}, {2, 2}, {1, 1}};
-    auto m  = M(std::sorted_unique,
-               cpp17_input_iterator<const P*>(ar),
-               cpp17_input_iterator<const P*>(ar + 4),
-               std::greater<int>());
-    assert((m == M{{5, 5}, {4, 4}, {2, 2}, {1, 1}}));
-  }
-  {
-    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&);
-    // contiguous iterator
-    using C = test_less<int>;
-    using M = std::flat_map<int, int, C, std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>;
-    std::pair<int, int> ar[1] = {{42, 42}};
-    auto m                    = M(std::sorted_unique, ar, ar, C(5));
-    assert(m.empty());
-    assert(m.key_comp() == C(5));
-  }
-  {
-    // flat_map(sorted_unique_t, InputIterator , InputIterator, const Allocator&)
-    using A1      = test_allocator<int>;
-    using A2      = test_allocator<short>;
-    using M       = std::flat_map<int, short, std::less<int>, std::vector<int, A1>, std::deque<short, A2>>;
-    using P       = std::pair<int, int>;
-    P ar[]        = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    auto m        = M(std::sorted_unique, ar, ar + 4, A1(5));
-    auto expected = M{{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    assert(m == expected);
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
+  test_alloc<std::vector, std::vector>();
 
-    // explicit(false)
-    M m2 = {std::sorted_unique, ar, ar + 4, A1(5)};
-    assert(m2 == m);
-    assert(m2.keys().get_allocator() == A1(5));
-    assert(m2.values().get_allocator() == A2(5));
-  }
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
-    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
-    using C  = test_less<int>;
-    using A1 = test_allocator<int>;
-    using A2 = test_allocator<short>;
-    using M  = std::flat_map<int, short, C, std::vector<int, A1>, std::deque<short, A2>>;
-    using P  = std::pair<int, int>;
-    P ar[]   = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    auto m   = M(std::sorted_unique, ar, ar + 4, C(3), A1(5));
-    assert((m == M{{1, 1}, {2, 2}, {4, 4}, {5, 5}}));
-    assert(m.key_comp() == C(3));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
-  }
-  {
-    // flat_map(sorted_unique_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
-    // explicit(false)
-    using A1 = test_allocator<short>;
-    using A2 = test_allocator<int>;
-    using M  = std::flat_map<short, int, std::less<int>, std::deque<short, A1>, std::vector<int, A2>>;
-    using P  = std::pair<int, int>;
-    P ar[]   = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    M m      = {std::sorted_unique, ar, ar + 4, {}, A1(5)}; // implicit ctor
-    assert((m == M{{1, 1}, {2, 2}, {4, 4}, {5, 5}}));
-    assert(m.keys().get_allocator() == A1(5));
-    assert(m.values().get_allocator() == A2(5));
+    test<std::deque<int>, std::vector<double>>();
+    test_alloc<std::deque, std::deque>();
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.erasure/erase_if.pass.cpp
index fb0563eec537..7d5a6ecf02f6 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.erasure/erase_if.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.erasure/erase_if.pass.cpp
@@ -32,7 +32,7 @@ static_assert(HasStdErase<std::vector<int>>);
 static_assert(!HasStdErase<std::flat_map<int, int>>);
 
 template <class M>
-M make(std::initializer_list<int> vals) {
+constexpr M make(std::initializer_list<int> vals) {
   M ret;
   for (int v : vals)
     ret[static_cast<typename M::key_type>(v)] = static_cast<typename M::mapped_type>(v + 10);
@@ -40,8 +40,8 @@ M make(std::initializer_list<int> vals) {
 }
 
 template <class M, class Pred>
-void test0(
-    std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
+constexpr void
+test0(std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
   M s = make<M>(vals);
   ASSERT_SAME_TYPE(typename M::size_type, decltype(std::erase_if(s, p)));
   assert(expected_erased_count == std::erase_if(s, p));
@@ -49,7 +49,7 @@ void test0(
 }
 
 template <class S>
-void test() {
+constexpr void test() {
   // Test all the plausible signatures for this predicate.
   auto is1   = [](typename S::const_reference v) { return v.first == 1; };
   auto is2   = [](typename S::value_type v) { return v.first == 2; };
@@ -76,7 +76,7 @@ void test() {
   test0<S>({1, 2, 3}, False, {1, 2, 3}, 0);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::flat_map<int, char>>();
   test<std::flat_map<int,
                      char,
@@ -84,10 +84,24 @@ int main(int, char**) {
                      std::vector<int, min_allocator<int>>,
                      std::vector<char, min_allocator<char>>>>();
   test<std::flat_map<int, char, std::greater<int>, std::vector<int, test_allocator<int>>>>();
-  test<std::flat_map<int, char, std::less<int>, std::deque<int, min_allocator<int>>>>();
-  test<std::flat_map<int, char, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::flat_map<int, char, std::less<int>, std::deque<int, min_allocator<int>>>>();
+    test<std::flat_map<int, char, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+  }
   test<std::flat_map<long, int>>();
   test<std::flat_map<double, int>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/iterator.pass.cpp
index b63ce6b19ee1..eef90f4ac6bc 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/iterator.pass.cpp
@@ -30,7 +30,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -70,9 +70,14 @@ void test() {
   assert(i == m.begin());
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<char>>();
-  test<std::deque<int>, std::vector<char>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
   test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
 
@@ -92,5 +97,14 @@ int main(int, char**) {
     assert(!(cii != ii1));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/iterator_comparison.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/iterator_comparison.pass.cpp
index d31daa84924f..e8aac7fb7c9e 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/iterator_comparison.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/iterator_comparison.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -142,11 +142,25 @@ void test() {
   assert(cri2 <=> cri1 == std::strong_ordering::greater);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<char>>();
-  test<std::deque<int>, std::vector<char>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
   test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/reverse_iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/reverse_iterator.pass.cpp
index b4d76aac119a..2de992e07ba4 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/reverse_iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.iterators/reverse_iterator.pass.cpp
@@ -25,48 +25,64 @@
 
 #include <iterator>
 
+#include "MinSequenceContainer.h"
 #include "test_macros.h"
+#include "min_allocator.h"
 
-int main(int, char**) {
-  {
-    using M     = std::flat_map<int, char, std::less<int>, std::deque<int>, std::deque<char>>;
-    M m         = {{1, 'a'}, {2, 'b'}, {3, 'c'}, {4, 'd'}};
-    const M& cm = m;
-    ASSERT_SAME_TYPE(decltype(m.rbegin()), M::reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.crbegin()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(cm.rbegin()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.rend()), M::reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(m.crend()), M::const_reverse_iterator);
-    ASSERT_SAME_TYPE(decltype(cm.rend()), M::const_reverse_iterator);
-    static_assert(noexcept(m.rbegin()));
-    static_assert(noexcept(cm.rbegin()));
-    static_assert(noexcept(m.crbegin()));
-    static_assert(noexcept(m.rend()));
-    static_assert(noexcept(cm.rend()));
-    static_assert(noexcept(m.crend()));
-    assert(m.size() == 4);
-    assert(std::distance(m.rbegin(), m.rend()) == 4);
-    assert(std::distance(cm.rbegin(), cm.rend()) == 4);
-    assert(std::distance(m.crbegin(), m.crend()) == 4);
-    assert(std::distance(cm.crbegin(), cm.crend()) == 4);
-    M::reverse_iterator i; // default-construct
-    ASSERT_SAME_TYPE(decltype(i->first), const int&);
-    ASSERT_SAME_TYPE(decltype(i->second), char&);
-    i                           = m.rbegin(); // move-assignment
-    M::const_reverse_iterator k = i;          // converting constructor
-    assert(i == k);                           // comparison
-    for (int j = 4; j >= 1; --j, ++i) {       // pre-increment
-      assert(i->first == j);                  // operator->
-      assert(i->second == 'a' + j - 1);
-    }
-    assert(i == m.rend());
-    for (int j = 1; j <= 4; ++j) {
-      --i; // pre-decrement
-      assert((*i).first == j);
-      assert((*i).second == 'a' + j - 1);
-    }
-    assert(i == m.rbegin());
+template <class KeyContainer, class ValueContainer>
+constexpr void test() {
+  using Key   = typename KeyContainer::value_type;
+  using Value = typename ValueContainer::value_type;
+  using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
+  M m         = {{1, 'a'}, {2, 'b'}, {3, 'c'}, {4, 'd'}};
+  const M& cm = m;
+  ASSERT_SAME_TYPE(decltype(m.rbegin()), typename M::reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.crbegin()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.rbegin()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.rend()), typename M::reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(m.crend()), typename M::const_reverse_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.rend()), typename M::const_reverse_iterator);
+  static_assert(noexcept(m.rbegin()));
+  static_assert(noexcept(cm.rbegin()));
+  static_assert(noexcept(m.crbegin()));
+  static_assert(noexcept(m.rend()));
+  static_assert(noexcept(cm.rend()));
+  static_assert(noexcept(m.crend()));
+  assert(m.size() == 4);
+  assert(std::distance(m.rbegin(), m.rend()) == 4);
+  assert(std::distance(cm.rbegin(), cm.rend()) == 4);
+  assert(std::distance(m.crbegin(), m.crend()) == 4);
+  assert(std::distance(cm.crbegin(), cm.crend()) == 4);
+  typename M::reverse_iterator i; // default-construct
+  ASSERT_SAME_TYPE(decltype(i->first), const int&);
+  ASSERT_SAME_TYPE(decltype(i->second), char&);
+  i                                    = m.rbegin(); // move-assignment
+  typename M::const_reverse_iterator k = i;          // converting constructor
+  assert(i == k);                                    // comparison
+  for (int j = 4; j >= 1; --j, ++i) {                // pre-increment
+    assert(i->first == j);                           // operator->
+    assert(i->second == 'a' + j - 1);
   }
+  assert(i == m.rend());
+  for (int j = 1; j <= 4; ++j) {
+    --i; // pre-decrement
+    assert((*i).first == j);
+    assert((*i).second == 'a' + j - 1);
+  }
+  assert(i == m.rbegin());
+}
+
+constexpr bool test() {
+  test<std::vector<int>, std::vector<char>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
+  test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
+  test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
+
   {
     // N3644 testing
     using C = std::flat_map<int, char>;
@@ -82,6 +98,14 @@ int main(int, char**) {
     assert(!(ii1 != cii));
     assert(!(cii != ii1));
   }
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/clear.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/clear.pass.cpp
index 30271eb55660..85ef573f47da 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/clear.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/clear.pass.cpp
@@ -39,7 +39,7 @@ static_assert(
 #endif
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -52,13 +52,27 @@ void test() {
   assert(m.size() == 0);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<int>>();
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
   test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/emplace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/emplace.pass.cpp
index a54fcad63928..03192588373a 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/emplace.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/emplace.pass.cpp
@@ -18,6 +18,7 @@
 #include <deque>
 #include <tuple>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "MinSequenceContainer.h"
@@ -39,7 +40,7 @@ static_assert(!CanEmplace<Map, Emplaceable>);
 static_assert(!CanEmplace<Map, int, double>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -117,7 +118,7 @@ void test() {
 }
 
 template <class KeyContainer, class ValueContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
   using M = std::flat_map<int, Emplaceable, std::less<int>, KeyContainer, ValueContainer>;
   using R = std::pair<typename M::iterator, bool>;
 
@@ -143,23 +144,38 @@ void test_emplaceable() {
   assert(m.begin()->second == Emplaceable(2, 3.5));
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
   test_emplaceable<std::vector<int>, std::vector<Emplaceable>>();
-  test_emplaceable<std::deque<int>, std::vector<Emplaceable>>();
   test_emplaceable<MinSequenceContainer<int>, MinSequenceContainer<Emplaceable>>();
   test_emplaceable<std::vector<int, min_allocator<int>>, std::vector<Emplaceable, min_allocator<Emplaceable>>>();
 
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
+    test<std::deque<int>, std::vector<double>>();
+    test_emplaceable<std::deque<int>, std::vector<Emplaceable>>();
+  }
+
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto emplace_func = [](auto& m, auto key_arg, auto value_arg) {
       m.emplace(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg));
     };
     test_emplace_exception_guarantee(emplace_func);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/emplace_hint.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/emplace_hint.pass.cpp
index 77c022a6de92..02af400cec9d 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/emplace_hint.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/emplace_hint.pass.cpp
@@ -17,6 +17,7 @@
 #include <cassert>
 #include <deque>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "MinSequenceContainer.h"
@@ -42,7 +43,7 @@ static_assert(!CanEmplaceHint<Map, int, double>);
 #endif
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -139,7 +140,7 @@ void test() {
 }
 
 template <class KeyContainer, class ValueContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
   using M = std::flat_map<int, Emplaceable, std::less<int>, KeyContainer, ValueContainer>;
   using R = M::iterator;
 
@@ -162,23 +163,37 @@ void test_emplaceable() {
   assert(m.begin()->second == Emplaceable(2, 3.5));
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+    test_emplaceable<std::deque<int>, std::vector<Emplaceable>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
   test_emplaceable<std::vector<int>, std::vector<Emplaceable>>();
-  test_emplaceable<std::deque<int>, std::vector<Emplaceable>>();
   test_emplaceable<MinSequenceContainer<int>, MinSequenceContainer<Emplaceable>>();
   test_emplaceable<std::vector<int, min_allocator<int>>, std::vector<Emplaceable, min_allocator<Emplaceable>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto emplace_func = [](auto& m, auto key_arg, auto value_arg) {
       m.emplace_hint(m.begin(), std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg));
     };
     test_emplace_exception_guarantee(emplace_func);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_iter.pass.cpp
index 914e8b676a65..841a7938aa26 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_iter.pass.cpp
@@ -18,6 +18,7 @@
 #include <deque>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -27,7 +28,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -136,16 +137,30 @@ void test() {
   assert(i8 == m.end());
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto erase_function = [](auto& m, auto) { m.erase(m.begin() + 2); };
     test_erase_exception_guarantee(erase_function);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_iter_iter.pass.cpp
index 0bc920829402..76f184c01372 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_iter_iter.pass.cpp
@@ -17,6 +17,7 @@
 #include <deque>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -26,7 +27,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -95,15 +96,30 @@ void test() {
   assert(i4 == m.end());
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto erase_function = [](auto& m, auto) { m.erase(m.begin(), m.begin() + 2); };
     test_erase_exception_guarantee(erase_function);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_key.pass.cpp
index ef57b1cb5512..e4cce76b9091 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_key.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_key.pass.cpp
@@ -26,7 +26,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer, class Compare = std::less<>>
-void test() {
+constexpr void test() {
   using M = std::flat_map<int, char, Compare, KeyContainer, ValueContainer>;
 
   auto make = [](std::initializer_list<int> il) {
@@ -70,14 +70,20 @@ void test() {
   assert(m.empty());
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<char>>();
   test<std::vector<int>, std::vector<char>, std::greater<>>();
-  test<std::deque<int>, std::vector<char>>();
+
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
   test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto erase_function = [](auto& m, auto key_arg) {
       using Map = std::decay_t<decltype(m)>;
       using Key = typename Map::key_type;
@@ -87,5 +93,14 @@ int main(int, char**) {
     test_erase_exception_guarantee(erase_function);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_key_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_key_transparent.pass.cpp
index ecc71a65caa8..be660b75101d 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_key_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/erase_key_transparent.pass.cpp
@@ -38,9 +38,9 @@ static_assert(!CanErase<const NonTransparentMap>);
 
 template <class Key, class It>
 struct HeterogeneousKey {
-  explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
-  operator It() && { return it_; }
-  auto operator<=>(Key key) const { return key_ <=> key; }
+  constexpr explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
+  constexpr operator It() && { return it_; }
+  constexpr auto operator<=>(Key key) const { return key_ <=> key; }
   friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) {
     assert(false);
     return false;
@@ -50,7 +50,7 @@ struct HeterogeneousKey {
 };
 
 template <class KeyContainer, class ValueContainer>
-void test_simple() {
+constexpr void test_simple() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -71,7 +71,7 @@ void test_simple() {
 }
 
 template <class KeyContainer, class ValueContainer>
-void test_transparent_comparator() {
+constexpr void test_transparent_comparator() {
   using M = std::flat_map<std::string, int, TransparentComparator, KeyContainer, ValueContainer>;
   M m     = {{"alpha", 1}, {"beta", 2}, {"epsilon", 3}, {"eta", 4}, {"gamma", 5}};
   ASSERT_SAME_TYPE(decltype(m.erase(Transparent<std::string>{"abc"})), typename M::size_type);
@@ -87,18 +87,24 @@ void test_transparent_comparator() {
   assert(m == expected);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test_simple<std::vector<int>, std::vector<double>>();
-  test_simple<std::deque<int>, std::vector<double>>();
   test_simple<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test_simple<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
   test_transparent_comparator<std::vector<std::string>, std::vector<int>>();
-  test_transparent_comparator<std::deque<std::string>, std::vector<int>>();
   test_transparent_comparator<MinSequenceContainer<std::string>, MinSequenceContainer<int>>();
   test_transparent_comparator<std::vector<std::string, min_allocator<std::string>>,
                               std::vector<int, min_allocator<int>>>();
 
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test_simple<std::deque<int>, std::vector<double>>();
+    test_transparent_comparator<std::deque<std::string>, std::vector<int>>();
+  }
+
   {
     // P2077's HeterogeneousKey example
     using M                           = std::flat_map<int, int, std::less<>>;
@@ -131,7 +137,7 @@ int main(int, char**) {
     assert(n == 1);
     assert(transparent_used);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto erase_transparent = [](auto& m, auto key_arg) {
       using Map = std::decay_t<decltype(m)>;
       using Key = typename Map::key_type;
@@ -147,5 +153,14 @@ int main(int, char**) {
     assert(n == 1);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/extract.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/extract.pass.cpp
index d8e4ce94efb9..996e16aee4a6 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/extract.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/extract.pass.cpp
@@ -33,7 +33,7 @@ static_assert(!CanExtract<std::flat_map<int, int> const&>);
 static_assert(!CanExtract<std::flat_map<int, int> const&&>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using M = std::flat_map<int, int, std::less<int>, KeyContainer, ValueContainer>;
   M m     = M({1, 2, 3}, {4, 5, 6});
 
@@ -49,9 +49,14 @@ void test() {
   LIBCPP_ASSERT(m.values().size() == 0);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<int>>();
-  test<std::deque<int>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<int>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<int>>();
   test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
   {
@@ -67,7 +72,7 @@ int main(int, char**) {
     LIBCPP_ASSERT(m.values().size() == 0);
   }
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
 #ifndef TEST_HAS_NO_EXCEPTIONS
     using KeyContainer   = std::vector<int>;
     using ValueContainer = ThrowOnMoveContainer<int>;
@@ -87,5 +92,15 @@ int main(int, char**) {
     }
 #endif
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_cv.pass.cpp
index 7e667c4e4877..fd4b93d3d223 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_cv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_cv.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -64,13 +64,18 @@ void test() {
   assert(r.first->second == 3.5);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto insert_func = [](auto& m, auto key_arg, auto value_arg) {
       using FlatMap    = std::decay_t<decltype(m)>;
       using value_type = typename FlatMap::value_type;
@@ -79,5 +84,15 @@ int main(int, char**) {
     };
     test_emplace_exception_guarantee(insert_func);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_initializer_list.pass.cpp
index 32be3ab8a95b..a111837d8d6e 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_initializer_list.pass.cpp
@@ -12,10 +12,12 @@
 
 // void insert(initializer_list<value_type> il);
 
+#include <algorithm>
 #include <flat_map>
 #include <cassert>
 #include <functional>
 #include <deque>
+#include <type_traits>
 
 #include "MinSequenceContainer.h"
 #include "../helpers.h"
@@ -23,11 +25,10 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
-  using V     = std::pair<const int, double>;
 
   M m = {{1, 1}, {1, 1.5}, {1, 2}, {3, 1}, {3, 1.5}, {3, 2}};
   m.insert({
@@ -42,20 +43,29 @@ void test() {
       {2, 2},
   });
   assert(m.size() == 4);
-  assert(std::distance(m.begin(), m.end()) == 4);
-  assert(*m.begin() == V(1, 1));
-  assert(*std::next(m.begin()) == V(2, 1));
-  assert(*std::next(m.begin(), 2) == V(3, 1));
-  assert(*std::next(m.begin(), 3) == V(4, 1));
+  assert(std::ranges::equal(m.keys(), KeyContainer{1, 2, 3, 4}));
+  check_possible_values(
+      m.values(),
+      std::vector<std::vector<Value>>{
+          {1, 1.5, 2},
+          {1, 1.5, 2},
+          {1, 1.5, 2},
+          {1, 1.5, 2},
+      });
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto insert_func = [](auto& m, const auto& newValues) {
       using FlatMap                        = std::decay_t<decltype(m)>;
       using value_type                     = typename FlatMap::value_type;
@@ -64,5 +74,15 @@ int main(int, char**) {
     };
     test_insert_range_exception_guarantee(insert_func);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_cv.pass.cpp
index 4bbe0628317d..a65c5cde6886 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_cv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_cv.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -60,13 +60,18 @@ void test() {
   assert(r->second == 3.5);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto insert_func = [](auto& m, auto key_arg, auto value_arg) {
       using FlatMap    = std::decay_t<decltype(m)>;
       using value_type = typename FlatMap::value_type;
@@ -75,5 +80,15 @@ int main(int, char**) {
     };
     test_emplace_exception_guarantee(insert_func);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_iter.pass.cpp
index ccce117c90fc..66bd4022f321 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_iter.pass.cpp
@@ -38,7 +38,7 @@ static_assert(!CanInsert<Map, int, int>);
 static_assert(!CanInsert<Map, cpp20_input_iterator<Pair*>, cpp20_input_iterator<Pair*>>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using P = std::pair<int, double>;
   using M = std::flat_map<int, double, std::less<int>, KeyContainer, ValueContainer>;
 
@@ -68,22 +68,45 @@ void test() {
   M m;
   m.insert(cpp17_input_iterator<P*>(ar1), cpp17_input_iterator<P*>(ar1 + sizeof(ar1) / sizeof(ar1[0])));
   assert(m.size() == 3);
-  M expected{{1, 1}, {2, 1}, {3, 1}};
-  assert(m == expected);
 
-  m.insert(cpp17_input_iterator<P*>(ar2), cpp17_input_iterator<P*>(ar2 + sizeof(ar2) / sizeof(ar2[0])));
-  assert(m.size() == 5);
-  M expected2{{0, 1}, {1, 1}, {2, 1}, {3, 1}, {4, 1}};
-  assert(m == expected2);
+  assert(std::ranges::equal(m.keys(), KeyContainer{1, 2, 3}));
+  check_possible_values(
+      m.values(),
+      std::vector<std::vector<double>>{
+          {1, 1.5, 2},
+          {1, 1.5, 2},
+          {1, 1.5, 2},
+      });
+
+  auto m2 = m;
+
+  m2.insert(cpp17_input_iterator<P*>(ar2), cpp17_input_iterator<P*>(ar2 + sizeof(ar2) / sizeof(ar2[0])));
+  assert(m2.size() == 5);
+
+  assert(std::ranges::equal(m2.keys(), KeyContainer{0, 1, 2, 3, 4}));
+  check_possible_values(
+      m2.values(),
+      std::vector<std::vector<double>>{
+          {1, 1.5, 2},
+          {m[1]},
+          {m[2]},
+          {m[3]},
+          {1, 1.5, 2},
+      });
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto insert_func = [](auto& m, const auto& newValues) { m.insert(newValues.begin(), newValues.end()); };
     test_insert_range_exception_guarantee(insert_func);
   }
@@ -94,5 +117,14 @@ int main(int, char**) {
     assert(std::ranges::equal(m, std::vector<std::pair<int, int>>{{1, 1}, {2, 2}, {3, 3}, {4, 4}}));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_rv.pass.cpp
index 034941b55eb8..8a5d6481bd0d 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_rv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_iter_rv.pass.cpp
@@ -22,7 +22,7 @@
 #include "test_macros.h"
 
 template <class Container, class Pair>
-void do_insert_iter_rv_test() {
+constexpr void do_insert_iter_rv_test() {
   using M = Container;
   using P = Pair;
   using R = typename M::iterator;
@@ -53,7 +53,7 @@ void do_insert_iter_rv_test() {
 }
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -64,17 +64,22 @@ void test() {
   do_insert_iter_rv_test<M, CP>();
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
   test<std::vector<int>, std::vector<MoveOnly>>();
-  test<std::deque<int>, std::deque<double>>();
-  test<std::deque<int>, std::deque<MoveOnly>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::deque<double>>();
+    test<std::deque<int>, std::deque<MoveOnly>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<MinSequenceContainer<int>, MinSequenceContainer<MoveOnly>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
   test<std::vector<int, min_allocator<int>>, std::vector<MoveOnly, min_allocator<MoveOnly>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto insert_func = [](auto& m, auto key_arg, auto value_arg) {
       using FlatMap    = std::decay_t<decltype(m)>;
       using value_type = typename FlatMap::value_type;
@@ -84,5 +89,14 @@ int main(int, char**) {
     test_emplace_exception_guarantee(insert_func);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_or_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_or_assign.pass.cpp
index 398a7a1a4052..daa4674db6c1 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_or_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_or_assign.pass.cpp
@@ -78,7 +78,7 @@ static_assert(!CanInsertOrAssignIter<std::flat_map<int, ConstructFrom<V>>, int&&
 static_assert(!CanInsertOrAssignIter<std::flat_map<int, AssignFrom<V>>, int&&, V>);
 
 template <class KeyContainer, class ValueContainer>
-void test_cv_key() {
+constexpr void test_cv_key() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -194,7 +194,7 @@ void test_cv_key() {
 }
 
 template <class KeyContainer, class ValueContainer>
-void test_rv_key() {
+constexpr void test_rv_key() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -311,16 +311,30 @@ void test_rv_key() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test_cv_key<std::vector<int>, std::vector<Moveable>>();
-  test_cv_key<std::deque<int>, std::vector<Moveable>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test_cv_key<std::deque<int>, std::vector<Moveable>>();
+    test_rv_key<std::deque<Moveable>, std::vector<Moveable>>();
+  }
   test_cv_key<MinSequenceContainer<int>, MinSequenceContainer<Moveable>>();
   test_cv_key<std::vector<int, min_allocator<int>>, std::vector<Moveable, min_allocator<Moveable>>>();
 
   test_rv_key<std::vector<Moveable>, std::vector<Moveable>>();
-  test_rv_key<std::deque<Moveable>, std::vector<Moveable>>();
   test_rv_key<MinSequenceContainer<Moveable>, MinSequenceContainer<Moveable>>();
   test_rv_key<std::vector<Moveable, min_allocator<Moveable>>, std::vector<Moveable, min_allocator<Moveable>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_or_assign_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_or_assign_transparent.pass.cpp
index bd6d176e26e2..d036b45b958a 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_or_assign_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_or_assign_transparent.pass.cpp
@@ -94,7 +94,7 @@ static_assert(
     !CanInsertOrAssignIter<std::flat_map<int, AssignFrom<V>, TransparentComparator>, ConvertibleTransparent<int>, V>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -212,9 +212,14 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<Moveable>>();
-  test<std::deque<int>, std::vector<Moveable>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<Moveable>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<Moveable>>();
   test<std::vector<int, min_allocator<int>>, std::vector<Moveable, min_allocator<Moveable>>>();
 
@@ -237,22 +242,24 @@ int main(int, char**) {
     assert(transparent_used);
   }
 
-  {
-    auto insert_or_assign = [](auto& m, auto key_arg, auto value_arg) {
-      using M   = std::decay_t<decltype(m)>;
-      using Key = typename M::key_type;
-      m.insert_or_assign(ConvertibleTransparent<Key>{key_arg}, value_arg);
-    };
-    test_emplace_exception_guarantee(insert_or_assign);
-  }
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    {
+      auto insert_or_assign = [](auto& m, auto key_arg, auto value_arg) {
+        using M   = std::decay_t<decltype(m)>;
+        using Key = typename M::key_type;
+        m.insert_or_assign(ConvertibleTransparent<Key>{key_arg}, value_arg);
+      };
+      test_emplace_exception_guarantee(insert_or_assign);
+    }
 
-  {
-    auto insert_or_assign_iter = [](auto& m, auto key_arg, auto value_arg) {
-      using M   = std::decay_t<decltype(m)>;
-      using Key = typename M::key_type;
-      m.insert_or_assign(m.begin(), ConvertibleTransparent<Key>{key_arg}, value_arg);
-    };
-    test_emplace_exception_guarantee(insert_or_assign_iter);
+    {
+      auto insert_or_assign_iter = [](auto& m, auto key_arg, auto value_arg) {
+        using M   = std::decay_t<decltype(m)>;
+        using Key = typename M::key_type;
+        m.insert_or_assign(m.begin(), ConvertibleTransparent<Key>{key_arg}, value_arg);
+      };
+      test_emplace_exception_guarantee(insert_or_assign_iter);
+    }
   }
   {
     // LWG4239 std::string and C string literal
@@ -267,5 +274,14 @@ int main(int, char**) {
     assert(it2->second == 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range.pass.cpp
index a2e64431a3c2..88be747df679 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range.pass.cpp
@@ -39,7 +39,7 @@ static_assert(!CanInsertRange<Map, std::ranges::subrange<int*>>);
 static_assert(!CanInsertRange<Map, std::ranges::subrange<double*>>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
 
@@ -68,7 +68,7 @@ void test() {
   {
     // The "uniquing" part uses the comparator, not operator==.
     struct ModTen {
-      bool operator()(int a, int b) const { return (a % 10) < (b % 10); }
+      constexpr bool operator()(int a, int b) const { return (a % 10) < (b % 10); }
     };
     using P = std::pair<int, int>;
     using M = std::flat_map<Key, Value, ModTen, KeyContainer, ValueContainer>;
@@ -79,9 +79,14 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<int>>();
-  test<std::deque<int>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<int>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<int>>();
   test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
   {
@@ -95,15 +100,25 @@ int main(int, char**) {
   {
     // The element type of the range doesn't need to be std::pair (P2767).
     std::pair<int, int> pa[] = {{3, 3}, {1, 1}, {4, 4}, {1, 1}, {5, 5}};
-    std::deque<std::reference_wrapper<std::pair<int, int>>> a(pa, pa + 5);
+    std::vector<std::reference_wrapper<std::pair<int, int>>> a(pa, pa + 5);
     std::flat_map<int, int> m;
     m.insert_range(a);
     std::pair<int, int> expected[] = {{1, 1}, {3, 3}, {4, 4}, {5, 5}};
     assert(std::ranges::equal(m, expected));
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto insert_func = [](auto& m, const auto& newValues) { m.insert_range(newValues); };
     test_insert_range_exception_guarantee(insert_func);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_rv.pass.cpp
index 9ea7a6a63666..76deaa43b3e4 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_rv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_rv.pass.cpp
@@ -25,7 +25,7 @@
 #include "../helpers.h"
 
 template <class Container, class Pair>
-void do_insert_rv_test() {
+constexpr void do_insert_rv_test() {
   using M = Container;
   using P = Pair;
   using R = std::pair<typename M::iterator, bool>;
@@ -60,7 +60,7 @@ void do_insert_rv_test() {
 }
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -72,9 +72,14 @@ void test() {
   do_insert_rv_test<M, CP>();
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<MoveOnly>>();
-  test<std::deque<int>, std::vector<MoveOnly>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<MoveOnly>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<MoveOnly>>();
   test<std::vector<int, min_allocator<int>>, std::vector<MoveOnly, min_allocator<MoveOnly>>>();
 
@@ -110,7 +115,7 @@ int main(int, char**) {
     assert(r.first->first == 3);
     assert(r.first->second == 3);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto insert_func = [](auto& m, auto key_arg, auto value_arg) {
       using FlatMap    = std::decay_t<decltype(m)>;
       using value_type = typename FlatMap::value_type;
@@ -120,5 +125,14 @@ int main(int, char**) {
     test_emplace_exception_guarantee(insert_func);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_sorted_initializer_list.pass.cpp
index 08d2caf34987..f4469cd46892 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_sorted_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_sorted_initializer_list.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -39,20 +39,34 @@ void test() {
            });
   assert(m.size() == 5);
   assert(std::distance(m.begin(), m.end()) == 5);
+
   assert(*m.begin() == V(0, 1));
-  assert(*std::next(m.begin()) == V(1, 1));
-  assert(*std::next(m.begin(), 2) == V(2, 1));
-  assert(*std::next(m.begin(), 3) == V(3, 1));
-  assert(*std::next(m.begin(), 4) == V(4, 1));
+  auto v1 = *std::next(m.begin());
+  assert(v1.first == 1);
+  assert(v1.second == 1 || v1.second == 1.5 || v1.second == 2);
+  auto v2 = *std::next(m.begin(), 2);
+  assert(v2.first == 2);
+  assert(v2.second == 1);
+  auto v3 = *std::next(m.begin(), 3);
+  assert(v3.first == 3);
+  assert(v3.second == 1 || v3.second == 1.5 || v3.second == 2);
+  auto v4 = *std::next(m.begin(), 4);
+  assert(v4.first == 4);
+  assert(v4.second == 1);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto insert_func = [](auto& m, const auto& newValues) {
       using FlatMap                        = std::decay_t<decltype(m)>;
       using value_type                     = typename FlatMap::value_type;
@@ -62,5 +76,14 @@ int main(int, char**) {
     test_insert_range_exception_guarantee(insert_func);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_sorted_iter_iter.pass.cpp
index 18a3b571a419..e943dc6408e0 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_sorted_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_sorted_iter_iter.pass.cpp
@@ -37,7 +37,7 @@ static_assert(!CanInsert<Map, std::sorted_unique_t, int, int>);
 static_assert(!CanInsert<Map, std::sorted_unique_t, cpp20_input_iterator<Pair*>, cpp20_input_iterator<Pair*>>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -69,18 +69,32 @@ void test() {
   assert(m == expected2);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     auto insert_func = [](auto& m, const auto& newValues) {
       m.insert(std::sorted_unique, newValues.begin(), newValues.end());
     };
     test_insert_range_exception_guarantee(insert_func);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_transparent.pass.cpp
index c24c8373f51e..75ddc49c1380 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_transparent.pass.cpp
@@ -41,98 +41,42 @@ static_assert(CanInsert<Map, Iter, std::tuple<short, double>&&>);
 static_assert(!CanInsert<Map, int>);
 static_assert(!CanInsert<Map, Iter, int>);
 
-static int expensive_comparisons = 0;
-static int cheap_comparisons     = 0;
-
-struct CompareCounter {
-  int i_ = 0;
-  CompareCounter(int i) : i_(i) {}
-  friend auto operator<=>(const CompareCounter& x, const CompareCounter& y) {
-    expensive_comparisons += 1;
-    return x.i_ <=> y.i_;
-  }
-  bool operator==(const CompareCounter&) const = default;
-  friend auto operator<=>(const CompareCounter& x, int y) {
-    cheap_comparisons += 1;
-    return x.i_ <=> y;
-  }
-};
-
-template <class KeyContainer, class ValueContainer>
-void test() {
-  using Key   = typename KeyContainer::value_type;
-  using Value = typename ValueContainer::value_type;
-  using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
-
-  const std::pair<int, int> expected[] = {{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}};
+constexpr bool test() {
   {
-    // insert(P&&)
+    // template<class K> pair<iterator, bool> insert(P&& x);
+    bool transparent_used = false;
+    TransparentComparator c(transparent_used);
+    using M = std::flat_map<int, int, TransparentComparator>;
+    M m(std::sorted_unique, {{1, 1}, {2, 2}, {4, 4}}, c);
+    assert(!transparent_used);
+
+    std::same_as<std::pair<typename M::iterator, bool>> decltype(auto) res =
+        m.insert(std::pair(ConvertibleTransparent<int>{3}, 3));
+
+    assert(res.second);
+    assert(res.first->first == 3);
+    assert(res.first->second == 3);
     //   Unlike flat_set, here we can't use key_compare to compare value_type versus P,
     //   so we must eagerly convert to value_type.
-    M m                   = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    expensive_comparisons = 0;
-    cheap_comparisons     = 0;
-    std::same_as<std::pair<typename M::iterator, bool>> auto p =
-        m.insert(std::make_pair(3, 3)); // conversion happens first
-    assert(expensive_comparisons >= 2);
-    assert(cheap_comparisons == 0);
-    assert(p == std::make_pair(m.begin() + 2, true));
-    assert(std::ranges::equal(m, expected));
+    assert(!transparent_used);
   }
   {
-    // insert(const_iterator, P&&)
-    M m                                        = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    expensive_comparisons                      = 0;
-    cheap_comparisons                          = 0;
-    std::same_as<typename M::iterator> auto it = m.insert(m.begin(), std::make_pair(3, 3));
-    assert(expensive_comparisons >= 2);
-    assert(cheap_comparisons == 0);
-    assert(it == m.begin() + 2);
-    assert(std::ranges::equal(m, expected));
-  }
-  {
-    // insert(value_type&&)
-    M m                   = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    expensive_comparisons = 0;
-    cheap_comparisons     = 0;
-    std::same_as<std::pair<typename M::iterator, bool>> auto p =
-        m.insert(std::make_pair(3, 3)); // conversion happens last
-    assert(expensive_comparisons >= 2);
-    assert(cheap_comparisons == 0);
-    assert(p == std::make_pair(m.begin() + 2, true));
-    assert(std::ranges::equal(m, expected));
-  }
-  {
-    // insert(const_iterator, value_type&&)
-    M m                                        = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    expensive_comparisons                      = 0;
-    cheap_comparisons                          = 0;
-    std::same_as<typename M::iterator> auto it = m.insert(m.begin(), std::make_pair(3, 3));
-    assert(expensive_comparisons >= 2);
-    assert(cheap_comparisons == 0);
-    assert(it == m.begin() + 2);
-    assert(std::ranges::equal(m, expected));
-  }
-  {
-    // emplace(Args&&...)
-    M m                   = {{1, 1}, {2, 2}, {4, 4}, {5, 5}};
-    expensive_comparisons = 0;
-    cheap_comparisons     = 0;
-    std::same_as<std::pair<typename M::iterator, bool>> auto p =
-        m.emplace(std::make_pair(3, 3)); // conversion happens first
-    assert(expensive_comparisons >= 2);
-    assert(cheap_comparisons == 0);
-    assert(p == std::make_pair(m.begin() + 2, true));
-    assert(std::ranges::equal(m, expected));
-  }
-}
+    // template<class K> iterator insert(const_iterator hint, P&& x);
+    bool transparent_used = false;
+    TransparentComparator c(transparent_used);
+    using M = std::flat_map<int, int, TransparentComparator>;
+    M m(std::sorted_unique, {{1, 1}, {2, 2}, {4, 4}}, c);
+    assert(!transparent_used);
 
-int main(int, char**) {
-  test<std::vector<CompareCounter>, std::vector<double>>();
-  test<std::deque<CompareCounter>, std::vector<double>>();
-  test<MinSequenceContainer<CompareCounter>, MinSequenceContainer<double>>();
-  test<std::vector<CompareCounter, min_allocator<CompareCounter>>, std::vector<double, min_allocator<double>>>();
+    std::same_as<typename M::iterator> decltype(auto) res =
+        m.insert(m.begin(), std::pair(ConvertibleTransparent<int>{3}, 3));
 
+    assert(res->first == 3);
+    assert(res->second == 3);
+    //   Unlike flat_set, here we can't use key_compare to compare value_type versus P,
+    //   so we must eagerly convert to value_type.
+    assert(!transparent_used);
+  }
   {
     // no ambiguity between insert(pos, P&&) and insert(first, last)
     using M = std::flat_map<int, int>;
@@ -145,23 +89,26 @@ int main(int, char**) {
     ASSERT_SAME_TYPE(decltype(m.insert(m.begin(), Evil())), M::iterator);
     ASSERT_SAME_TYPE(decltype(m.insert(m.begin(), m.end())), void);
   }
-  {
-    auto insert_func = [](auto& m, auto key_arg, auto value_arg) {
-      using FlatMap    = std::decay_t<decltype(m)>;
-      using tuple_type = std::tuple<typename FlatMap::key_type, typename FlatMap::mapped_type>;
-      tuple_type t(key_arg, value_arg);
-      m.insert(t);
-    };
-    test_emplace_exception_guarantee(insert_func);
-  }
-  {
-    auto insert_func_iter = [](auto& m, auto key_arg, auto value_arg) {
-      using FlatMap    = std::decay_t<decltype(m)>;
-      using tuple_type = std::tuple<typename FlatMap::key_type, typename FlatMap::mapped_type>;
-      tuple_type t(key_arg, value_arg);
-      m.insert(m.begin(), t);
-    };
-    test_emplace_exception_guarantee(insert_func_iter);
+
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    {
+      auto insert_func = [](auto& m, auto key_arg, auto value_arg) {
+        using FlatMap    = std::decay_t<decltype(m)>;
+        using tuple_type = std::tuple<typename FlatMap::key_type, typename FlatMap::mapped_type>;
+        tuple_type t(key_arg, value_arg);
+        m.insert(t);
+      };
+      test_emplace_exception_guarantee(insert_func);
+    }
+    {
+      auto insert_func_iter = [](auto& m, auto key_arg, auto value_arg) {
+        using FlatMap    = std::decay_t<decltype(m)>;
+        using tuple_type = std::tuple<typename FlatMap::key_type, typename FlatMap::mapped_type>;
+        tuple_type t(key_arg, value_arg);
+        m.insert(m.begin(), t);
+      };
+      test_emplace_exception_guarantee(insert_func_iter);
+    }
   }
   {
     // LWG4239 std::string and C string literal
@@ -173,5 +120,15 @@ int main(int, char**) {
     auto it2 = m.insert(m.begin(), {"beta2", 2});
     assert(it2 == m.begin() + 2);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/replace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/replace.pass.cpp
index 5ca811d76152..6582d475708f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/replace.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/replace.pass.cpp
@@ -33,7 +33,7 @@ static_assert(!CanReplace<Map, std::vector<int>, const std::vector<int>&>);
 static_assert(!CanReplace<Map, const std::vector<int>&, const std::vector<int>&>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -49,13 +49,18 @@ void test() {
   assert(std::ranges::equal(m.values(), expected_values));
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
 #ifndef TEST_HAS_NO_EXCEPTIONS
     using KeyContainer   = std::vector<int>;
     using ValueContainer = ThrowOnMoveContainer<int>;
@@ -76,5 +81,15 @@ int main(int, char**) {
     }
 #endif
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/swap_free.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/swap_free.pass.cpp
index 98c60c1488cf..37f2914ebfdb 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/swap_free.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/swap_free.pass.cpp
@@ -39,7 +39,7 @@ static_assert(
 #endif
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -87,11 +87,25 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/swap_member.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/swap_member.pass.cpp
index d2d8f5673ede..771bcd3de2c0 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/swap_member.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/swap_member.pass.cpp
@@ -38,7 +38,7 @@ static_assert(
 #endif
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -85,11 +85,25 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<double>>();
-  test<std::deque<int>, std::vector<double>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<double>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<double>>();
   test<std::vector<int, min_allocator<int>>, std::vector<double, min_allocator<double>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/try_emplace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/try_emplace.pass.cpp
index 4be2fe1c4333..7fdddc4d19dc 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/try_emplace.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/try_emplace.pass.cpp
@@ -63,7 +63,7 @@ static_assert(!CanTryEmplace<Map, Iter, Emplaceable, const Emplaceable&>);
 static_assert(!CanTryEmplace<Map, Iter, Emplaceable, int>);
 
 template <class KeyContainer, class ValueContainer>
-void test_ck() {
+constexpr void test_ck() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -101,7 +101,7 @@ void test_ck() {
     assert(r3.first->second.get() == 5); // value
 
     Moveable mv3(-1, 3.0);
-    std::same_as<R> decltype(auto) r4 = m.try_emplace(117, std::move(mv2));
+    std::same_as<R> decltype(auto) r4 = m.try_emplace(117, std::move(mv3));
     assert(m.size() == 13);
     assert(r4.second);                    // was inserted
     assert(mv2.moved());                  // was moved from
@@ -135,7 +135,7 @@ void test_ck() {
 }
 
 template <class KeyContainer, class ValueContainer>
-void test_rk() {
+constexpr void test_rk() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -193,54 +193,70 @@ void test_rk() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test_ck<std::vector<int>, std::vector<Moveable>>();
-  test_ck<std::deque<int>, std::vector<Moveable>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test_ck<std::deque<int>, std::vector<Moveable>>();
+    test_rk<std::deque<Moveable>, std::vector<Moveable>>();
+  }
   test_ck<MinSequenceContainer<int>, MinSequenceContainer<Moveable>>();
   test_ck<std::vector<int, min_allocator<int>>, std::vector<Moveable, min_allocator<Moveable>>>();
 
   test_rk<std::vector<Moveable>, std::vector<Moveable>>();
-  test_rk<std::deque<Moveable>, std::vector<Moveable>>();
   test_rk<MinSequenceContainer<Moveable>, MinSequenceContainer<Moveable>>();
   test_rk<std::vector<Moveable, min_allocator<Moveable>>, std::vector<Moveable, min_allocator<Moveable>>>();
 
-  {
-    auto try_emplace_ck = [](auto& m, auto key_arg, auto value_arg) {
-      using M   = std::decay_t<decltype(m)>;
-      using Key = typename M::key_type;
-      const Key key{key_arg};
-      m.try_emplace(key, value_arg);
-    };
-    test_emplace_exception_guarantee(try_emplace_ck);
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    {
+      auto try_emplace_ck = [](auto& m, auto key_arg, auto value_arg) {
+        using M   = std::decay_t<decltype(m)>;
+        using Key = typename M::key_type;
+        const Key key{key_arg};
+        m.try_emplace(key, value_arg);
+      };
+      test_emplace_exception_guarantee(try_emplace_ck);
+    }
+
+    {
+      auto try_emplace_rk = [](auto& m, auto key_arg, auto value_arg) {
+        using M   = std::decay_t<decltype(m)>;
+        using Key = typename M::key_type;
+        m.try_emplace(Key{key_arg}, value_arg);
+      };
+      test_emplace_exception_guarantee(try_emplace_rk);
+    }
+
+    {
+      auto try_emplace_iter_ck = [](auto& m, auto key_arg, auto value_arg) {
+        using M   = std::decay_t<decltype(m)>;
+        using Key = typename M::key_type;
+        const Key key{key_arg};
+        m.try_emplace(m.begin(), key, value_arg);
+      };
+      test_emplace_exception_guarantee(try_emplace_iter_ck);
+    }
+
+    {
+      auto try_emplace_iter_rk = [](auto& m, auto key_arg, auto value_arg) {
+        using M   = std::decay_t<decltype(m)>;
+        using Key = typename M::key_type;
+        m.try_emplace(m.begin(), Key{key_arg}, value_arg);
+      };
+      test_emplace_exception_guarantee(try_emplace_iter_rk);
+    }
   }
 
-  {
-    auto try_emplace_rk = [](auto& m, auto key_arg, auto value_arg) {
-      using M   = std::decay_t<decltype(m)>;
-      using Key = typename M::key_type;
-      m.try_emplace(Key{key_arg}, value_arg);
-    };
-    test_emplace_exception_guarantee(try_emplace_rk);
-  }
+  return true;
+}
 
-  {
-    auto try_emplace_iter_ck = [](auto& m, auto key_arg, auto value_arg) {
-      using M   = std::decay_t<decltype(m)>;
-      using Key = typename M::key_type;
-      const Key key{key_arg};
-      m.try_emplace(m.begin(), key, value_arg);
-    };
-    test_emplace_exception_guarantee(try_emplace_iter_ck);
-  }
-
-  {
-    auto try_emplace_iter_rk = [](auto& m, auto key_arg, auto value_arg) {
-      using M   = std::decay_t<decltype(m)>;
-      using Key = typename M::key_type;
-      m.try_emplace(m.begin(), Key{key_arg}, value_arg);
-    };
-    test_emplace_exception_guarantee(try_emplace_iter_rk);
-  }
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/try_emplace_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/try_emplace_transparent.pass.cpp
index c301f07937ea..6a536d4d8282 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/try_emplace_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/try_emplace_transparent.pass.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// gcc 15 ICE on this test
+// UNSUPPORTED: gcc
 
 // <flat_map>
 
@@ -65,7 +67,7 @@ static_assert(!CanTryEmplace<NonTransparentMap, TransparentMapConstIter, NonConv
 static_assert(!CanTryEmplace<TransparentMap, TransparentMapConstIter, ConvertibleTransparent<int>, int>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -103,7 +105,7 @@ void test() {
     assert(r3.first->second.get() == 5); // value
 
     Moveable mv3(-1, 3.0);
-    std::same_as<R> decltype(auto) r4 = m.try_emplace(ConvertibleTransparent<int>{117}, std::move(mv2));
+    std::same_as<R> decltype(auto) r4 = m.try_emplace(ConvertibleTransparent<int>{117}, std::move(mv3));
     assert(m.size() == 13);
     assert(r4.second);                    // was inserted
     assert(mv2.moved());                  // was moved from
@@ -136,9 +138,14 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<Moveable>>();
-  test<std::deque<int>, std::vector<Moveable>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<Moveable>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<Moveable>>();
   test<std::vector<int, min_allocator<int>>, std::vector<Moveable, min_allocator<Moveable>>>();
 
@@ -171,23 +178,34 @@ int main(int, char**) {
     auto it2 = m.try_emplace(m.begin(), "beta2", 2);
     assert(it2 == m.begin() + 2);
   }
-  {
-    auto try_emplace = [](auto& m, auto key_arg, auto value_arg) {
-      using M   = std::decay_t<decltype(m)>;
-      using Key = typename M::key_type;
-      m.try_emplace(ConvertibleTransparent<Key>{key_arg}, value_arg);
-    };
-    test_emplace_exception_guarantee(try_emplace);
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    {
+      auto try_emplace = [](auto& m, auto key_arg, auto value_arg) {
+        using M   = std::decay_t<decltype(m)>;
+        using Key = typename M::key_type;
+        m.try_emplace(ConvertibleTransparent<Key>{key_arg}, value_arg);
+      };
+      test_emplace_exception_guarantee(try_emplace);
+    }
+
+    {
+      auto try_emplace_iter = [](auto& m, auto key_arg, auto value_arg) {
+        using M   = std::decay_t<decltype(m)>;
+        using Key = typename M::key_type;
+        m.try_emplace(m.begin(), ConvertibleTransparent<Key>{key_arg}, value_arg);
+      };
+      test_emplace_exception_guarantee(try_emplace_iter);
+    }
   }
 
-  {
-    auto try_emplace_iter = [](auto& m, auto key_arg, auto value_arg) {
-      using M   = std::decay_t<decltype(m)>;
-      using Key = typename M::key_type;
-      m.try_emplace(m.begin(), ConvertibleTransparent<Key>{key_arg}, value_arg);
-    };
-    test_emplace_exception_guarantee(try_emplace_iter);
-  }
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.observers/comp.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.observers/comp.pass.cpp
index d86224952dee..5712493740bc 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.observers/comp.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.observers/comp.pass.cpp
@@ -16,12 +16,13 @@
 #include <cassert>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
 #include "test_macros.h"
 
-int main(int, char**) {
+constexpr bool test() {
   {
     using M    = std::flat_map<int, char>;
     using Comp = std::less<int>; // the default
@@ -38,7 +39,7 @@ int main(int, char**) {
     assert(vc({1, '2'}, {2, '1'}));
     assert(!vc({2, '1'}, {1, '2'}));
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     using Comp = std::function<bool(int, int)>;
     using M    = std::flat_map<int, int, Comp>;
     Comp comp  = std::greater<int>();
@@ -72,7 +73,7 @@ int main(int, char**) {
     assert(vc({1, 2}, {2, 1}));
     assert(!vc({2, 1}, {1, 2}));
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     using Comp = std::function<bool(const std::vector<int>&, const std::vector<int>&)>;
     using M    = std::flat_map<std::vector<int>, int, Comp>;
     Comp comp  = [i = 1](const auto& x, const auto& y) { return x[i] < y[i]; };
@@ -92,5 +93,15 @@ int main(int, char**) {
     assert(!vc(b, a));
     assert(!vc(c, b));
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.observers/keys_values.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.observers/keys_values.pass.cpp
index 84d8f8344aaa..6b98c116f287 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.observers/keys_values.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.observers/keys_values.pass.cpp
@@ -28,7 +28,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -47,11 +47,25 @@ void test() {
   assert(std::ranges::equal(values, expected_values));
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<char>>();
-  test<std::deque<int>, std::vector<char>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
   test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/contains.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/contains.pass.cpp
index 208d6138fa68..051c8952bd1d 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/contains.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/contains.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   {
@@ -60,11 +60,25 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<int>>();
-  test<std::deque<int>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<int>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<int>>();
   test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/contains_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/contains_transparent.pass.cpp
index e28e94c3d8fb..ecbd42366dad 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/contains_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/contains_transparent.pass.cpp
@@ -35,7 +35,7 @@ static_assert(!CanContains<NonTransparentMap>);
 static_assert(!CanContains<const NonTransparentMap>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -53,9 +53,14 @@ void test() {
   assert(m.contains(Transparent<std::string>{"g"}) == false);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<std::string>, std::vector<int>>();
-  test<std::deque<std::string>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<std::string>, std::vector<int>>();
+  }
   test<MinSequenceContainer<std::string>, MinSequenceContainer<int>>();
   test<std::vector<std::string, min_allocator<std::string>>, std::vector<int, min_allocator<int>>>();
 
@@ -75,5 +80,15 @@ int main(int, char**) {
     assert(m.contains("beta") == true);
     assert(m.contains("charlie") == false);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/count.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/count.pass.cpp
index db675854d5e9..acfc8acb2746 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/count.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/count.pass.cpp
@@ -23,7 +23,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
 
@@ -59,11 +59,25 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<int>>();
-  test<std::deque<int>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<int>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<int>>();
   test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/count_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/count_transparent.pass.cpp
index 37d815b2ce98..660b2249471c 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/count_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/count_transparent.pass.cpp
@@ -35,7 +35,7 @@ static_assert(!CanCount<NonTransparentMap>);
 static_assert(!CanCount<const NonTransparentMap>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -53,9 +53,14 @@ void test() {
   assert(m.count(Transparent<std::string>{"g"}) == 0);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<std::string>, std::vector<int>>();
-  test<std::deque<std::string>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<std::string>, std::vector<int>>();
+  }
   test<MinSequenceContainer<std::string>, MinSequenceContainer<int>>();
   test<std::vector<std::string, min_allocator<std::string>>, std::vector<int, min_allocator<int>>>();
 
@@ -76,5 +81,14 @@ int main(int, char**) {
     assert(m.count("charlie") == 0);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/equal_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/equal_range.pass.cpp
index 8fa73d2a2eb5..fabce9cdd5a8 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/equal_range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/equal_range.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   {
@@ -68,11 +68,25 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<char>>();
-  test<std::deque<int>, std::vector<char>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
   test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/equal_range_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/equal_range_transparent.pass.cpp
index bff70a320928..fbaf81417b34 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/equal_range_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/equal_range_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanEqualRange<NonTransparentMap>);
 static_assert(!CanEqualRange<const NonTransparentMap>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -81,9 +81,14 @@ void test() {
   test_not_found(cm, "zzz", 5);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<std::string>, std::vector<int>>();
-  test<std::deque<std::string>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<std::string>, std::vector<int>>();
+  }
   test<MinSequenceContainer<std::string>, MinSequenceContainer<int>>();
   test<std::vector<std::string, min_allocator<std::string>>, std::vector<int, min_allocator<int>>>();
 
@@ -105,5 +110,14 @@ int main(int, char**) {
     assert(last == m.begin() + 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/find.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/find.pass.cpp
index 9fae407c7d8f..c5bf7478d001 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/find.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/find.pass.cpp
@@ -25,7 +25,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
@@ -45,11 +45,25 @@ void test() {
   assert(std::as_const(m).find(9) == m.end());
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<char>>();
-  test<std::deque<int>, std::vector<char>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
   test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/find_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/find_transparent.pass.cpp
index b1d1301fce63..a0f9a15924b4 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/find_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/find_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanFind<NonTransparentMap>);
 static_assert(!CanFind<const NonTransparentMap>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -69,9 +69,14 @@ void test() {
   test_find(cm, "zzz", 5);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<std::string>, std::vector<int>>();
-  test<std::deque<std::string>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<std::string>, std::vector<int>>();
+  }
   test<MinSequenceContainer<std::string>, MinSequenceContainer<int>>();
   test<std::vector<std::string, min_allocator<std::string>>, std::vector<int, min_allocator<int>>>();
 
@@ -92,5 +97,14 @@ int main(int, char**) {
     assert(it == m.begin() + 1);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/lower_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/lower_bound.pass.cpp
index b5491f3b2267..36f7c56e70f3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/lower_bound.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/lower_bound.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   {
@@ -61,11 +61,25 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<char>>();
-  test<std::deque<int>, std::vector<char>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
   test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/lower_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/lower_bound_transparent.pass.cpp
index 9e2561631196..e3dfa47eb2b0 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/lower_bound_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/lower_bound_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanLowerBound<NonTransparentMap>);
 static_assert(!CanLowerBound<const NonTransparentMap>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -76,9 +76,14 @@ void test() {
   test_lower_bound(cm, "zzz", 5);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<std::string>, std::vector<int>>();
-  test<std::deque<std::string>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<std::string>, std::vector<int>>();
+  }
   test<MinSequenceContainer<std::string>, MinSequenceContainer<int>>();
   test<std::vector<std::string, min_allocator<std::string>>, std::vector<int, min_allocator<int>>>();
 
@@ -99,5 +104,14 @@ int main(int, char**) {
     assert(it == m.begin() + 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/upper_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/upper_bound.pass.cpp
index 775e53286d62..45a8b339026e 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/upper_bound.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/upper_bound.pass.cpp
@@ -24,7 +24,7 @@
 #include "min_allocator.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   {
@@ -62,11 +62,25 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<char>>();
-  test<std::deque<int>, std::vector<char>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::vector<char>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
   test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/upper_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/upper_bound_transparent.pass.cpp
index c87113f67018..54223777c4df 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/upper_bound_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.operations/upper_bound_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanUpperBound<NonTransparentMap>);
 static_assert(!CanUpperBound<const NonTransparentMap>);
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
   using M     = std::flat_map<Key, Value, TransparentComparator, KeyContainer, ValueContainer>;
@@ -76,9 +76,14 @@ void test() {
   test_upper_bound(cm, "zzz", 5);
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<std::string>, std::vector<int>>();
-  test<std::deque<std::string>, std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<std::string>, std::vector<int>>();
+  }
   test<MinSequenceContainer<std::string>, MinSequenceContainer<int>>();
   test<std::vector<std::string, min_allocator<std::string>>, std::vector<int, min_allocator<int>>>();
   {
@@ -98,5 +103,14 @@ int main(int, char**) {
     assert(it == m.begin() + 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h
index b6b8fa061c84..932f330db829 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h
@@ -14,13 +14,14 @@
 #include <string>
 #include <vector>
 #include <flat_map>
+#include <ranges>
 
 #include "../flat_helpers.h"
 #include "test_allocator.h"
 #include "test_macros.h"
 
 template <class... Args>
-void check_invariant(const std::flat_map<Args...>& m) {
+constexpr void check_invariant(const std::flat_map<Args...>& m) {
   assert(m.keys().size() == m.values().size());
   const auto& keys = m.keys();
   assert(std::is_sorted(keys.begin(), keys.end(), m.key_comp()));
@@ -31,6 +32,14 @@ void check_invariant(const std::flat_map<Args...>& m) {
   assert(std::adjacent_find(keys.begin(), keys.end(), key_equal) == keys.end());
 }
 
+constexpr void check_possible_values(const auto& actual, const auto& expected) {
+  assert(std::ranges::size(actual) == std::ranges::size(expected));
+
+  for (const auto& [actual_value, possible_values] : std::views::zip(actual, expected)) {
+    assert(std::ranges::find(possible_values, actual_value) != std::ranges::end(possible_values));
+  }
+}
+
 template <class F>
 void test_emplace_exception_guarantee([[maybe_unused]] F&& emplace_function) {
 #ifndef TEST_HAS_NO_EXCEPTIONS
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/incomplete_type.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/incomplete_type.pass.cpp
index 76461951f0d3..d89e549b5fc2 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/incomplete_type.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/incomplete_type.pass.cpp
@@ -16,6 +16,8 @@
 #include <flat_map>
 #include <vector>
 
+#include "test_macros.h"
+
 struct A {
   using Map = std::flat_map<A, A>;
   int data;
@@ -25,9 +27,19 @@ struct A {
 };
 
 // Implement the operator< required in order to instantiate flat_map<A, X>
-bool operator<(A const& L, A const& R) { return L.data < R.data; }
+constexpr bool operator<(A const& L, A const& R) { return L.data < R.data; }
+
+constexpr bool test() {
+  A a;
+
+  return true;
+}
 
 int main(int, char**) {
-  A a;
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/op_compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/op_compare.pass.cpp
index fffe71158070..07f2486cc41e 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/op_compare.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/op_compare.pass.cpp
@@ -31,7 +31,7 @@
 #include "test_container_comparisons.h"
 
 template <class KeyContainer, class ValueContainer>
-void test() {
+constexpr void test() {
   using Key   = typename KeyContainer::value_type;
   using Value = typename ValueContainer::value_type;
 
@@ -70,9 +70,14 @@ void test() {
   }
 }
 
-int main(int, char**) {
+constexpr bool test() {
   test<std::vector<int>, std::vector<int>>();
-  test<std::deque<int>, std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+  {
+    test<std::deque<int>, std::deque<int>>();
+  }
   test<MinSequenceContainer<int>, MinSequenceContainer<int>>();
   test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
   test<std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>();
@@ -98,7 +103,7 @@ int main(int, char**) {
   {
     // Comparisons use value_type's native operators, not the comparator
     struct StrongComp {
-      bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; }
+      constexpr bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; }
     };
     using C = std::flat_map<double, double, StrongComp>;
     C s1    = {{1, 1}};
@@ -114,5 +119,15 @@ int main(int, char**) {
     assert(s1 != s2);
     assert((s1 <=> s2) == std::partial_ordering::unordered);
   }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat_helpers.h b/libcxx/test/std/containers/container.adaptors/flat_helpers.h
index 19d637052203..57c4595ccfeb 100644
--- a/libcxx/test/std/containers/container.adaptors/flat_helpers.h
+++ b/libcxx/test/std/containers/container.adaptors/flat_helpers.h
@@ -18,11 +18,12 @@ struct CopyOnlyVector : std::vector<T> {
   using std::vector<T>::vector;
 
   CopyOnlyVector(const CopyOnlyVector&) = default;
-  CopyOnlyVector(CopyOnlyVector&& other) : CopyOnlyVector(other) {}
-  CopyOnlyVector(CopyOnlyVector&& other, std::vector<T>::allocator_type alloc) : CopyOnlyVector(other, alloc) {}
+  constexpr CopyOnlyVector(CopyOnlyVector&& other) : CopyOnlyVector(other) {}
+  constexpr CopyOnlyVector(CopyOnlyVector&& other, std::vector<T>::allocator_type alloc)
+      : CopyOnlyVector(other, alloc) {}
 
-  CopyOnlyVector& operator=(const CopyOnlyVector&) = default;
-  CopyOnlyVector& operator=(CopyOnlyVector& other) { return this->operator=(other); }
+  constexpr CopyOnlyVector& operator=(const CopyOnlyVector&) = default;
+  constexpr CopyOnlyVector& operator=(CopyOnlyVector& other) { return this->operator=(other); }
 };
 
 template <class T>
@@ -36,7 +37,7 @@ template <class T, bool ConvertibleToT = false>
 struct Transparent {
   T t;
 
-  explicit operator T() const
+  TEST_CONSTEXPR explicit operator T() const
     requires ConvertibleToT
   {
     return t;
@@ -57,10 +58,10 @@ struct TransparentComparator {
 
   bool* transparent_used  = nullptr;
   TransparentComparator() = default;
-  TransparentComparator(bool& used) : transparent_used(&used) {}
+  TEST_CONSTEXPR TransparentComparator(bool& used) : transparent_used(&used) {}
 
   template <class T, bool Convertible>
-  bool operator()(const T& t, const Transparent<T, Convertible>& transparent) const {
+  TEST_CONSTEXPR bool operator()(const T& t, const Transparent<T, Convertible>& transparent) const {
     if (transparent_used != nullptr) {
       *transparent_used = true;
     }
@@ -68,7 +69,7 @@ struct TransparentComparator {
   }
 
   template <class T, bool Convertible>
-  bool operator()(const Transparent<T, Convertible>& transparent, const T& t) const {
+  TEST_CONSTEXPR bool operator()(const Transparent<T, Convertible>& transparent, const T& t) const {
     if (transparent_used != nullptr) {
       *transparent_used = true;
     }
@@ -76,7 +77,7 @@ struct TransparentComparator {
   }
 
   template <class T>
-  bool operator()(const T& t1, const T& t2) const {
+  TEST_CONSTEXPR bool operator()(const T& t1, const T& t2) const {
     return t1 < t2;
   }
 };
@@ -101,13 +102,13 @@ class Moveable {
   double double_;
 
 public:
-  Moveable() : int_(0), double_(0) {}
-  Moveable(int i, double d) : int_(i), double_(d) {}
-  Moveable(Moveable&& x) : int_(x.int_), double_(x.double_) {
+  TEST_CONSTEXPR Moveable() : int_(0), double_(0) {}
+  TEST_CONSTEXPR Moveable(int i, double d) : int_(i), double_(d) {}
+  TEST_CONSTEXPR Moveable(Moveable&& x) : int_(x.int_), double_(x.double_) {
     x.int_    = -1;
     x.double_ = -1;
   }
-  Moveable& operator=(Moveable&& x) {
+  TEST_CONSTEXPR Moveable& operator=(Moveable&& x) {
     int_      = x.int_;
     x.int_    = -1;
     double_   = x.double_;
@@ -117,11 +118,13 @@ public:
 
   Moveable(const Moveable&)            = delete;
   Moveable& operator=(const Moveable&) = delete;
-  bool operator==(const Moveable& x) const { return int_ == x.int_ && double_ == x.double_; }
-  bool operator<(const Moveable& x) const { return int_ < x.int_ || (int_ == x.int_ && double_ < x.double_); }
+  TEST_CONSTEXPR bool operator==(const Moveable& x) const { return int_ == x.int_ && double_ == x.double_; }
+  TEST_CONSTEXPR bool operator<(const Moveable& x) const {
+    return int_ < x.int_ || (int_ == x.int_ && double_ < x.double_);
+  }
 
-  int get() const { return int_; }
-  bool moved() const { return int_ == -1; }
+  TEST_CONSTEXPR int get() const { return int_; }
+  TEST_CONSTEXPR bool moved() const { return int_ == -1; }
 };
 
 #ifndef TEST_HAS_NO_EXCEPTIONS
diff --git a/libcxx/test/std/containers/test_compare.h b/libcxx/test/std/containers/test_compare.h
index 18e0b27dc954..070f15869a4a 100644
--- a/libcxx/test/std/containers/test_compare.h
+++ b/libcxx/test/std/containers/test_compare.h
@@ -9,22 +9,24 @@
 #ifndef TEST_COMPARE_H
 #define TEST_COMPARE_H
 
+#include "test_macros.h"
+
 template <class T>
 struct test_equal_to {
   int data_;
-  explicit test_equal_to() : data_(0) {}
-  explicit test_equal_to(int data) : data_(data) {}
-  bool operator()(const T& a, const T& b) const { return a == b; }
-  friend bool operator==(const test_equal_to& a, const test_equal_to& b) { return a.data_ == b.data_; }
+  TEST_CONSTEXPR explicit test_equal_to() : data_(0) {}
+  TEST_CONSTEXPR explicit test_equal_to(int data) : data_(data) {}
+  TEST_CONSTEXPR bool operator()(const T& a, const T& b) const { return a == b; }
+  TEST_CONSTEXPR friend bool operator==(const test_equal_to& a, const test_equal_to& b) { return a.data_ == b.data_; }
 };
 
 template <class T>
 struct test_less {
   int data_;
-  explicit test_less() : data_(0) {}
-  explicit test_less(int data) : data_(data) {}
-  bool operator()(const T& a, const T& b) const { return a < b; }
-  friend bool operator==(const test_less& a, const test_less& b) { return a.data_ == b.data_; }
+  TEST_CONSTEXPR explicit test_less() : data_(0) {}
+  TEST_CONSTEXPR explicit test_less(int data) : data_(data) {}
+  TEST_CONSTEXPR bool operator()(const T& a, const T& b) const { return a < b; }
+  TEST_CONSTEXPR friend bool operator==(const test_less& a, const test_less& b) { return a.data_ == b.data_; }
 };
 
 #endif // TEST_COMPARE_H
diff --git a/libcxx/test/support/MinSequenceContainer.h b/libcxx/test/support/MinSequenceContainer.h
index ccc17b79288b..9af5847f0887 100644
--- a/libcxx/test/support/MinSequenceContainer.h
+++ b/libcxx/test/support/MinSequenceContainer.h
@@ -13,6 +13,7 @@
 #include <vector>
 
 #include "test_iterators.h"
+#include "test_macros.h"
 
 template <class T,
           class Iterator      = three_way_random_access_iterator<T*>,
@@ -26,54 +27,60 @@ struct MinSequenceContainer {
 
   explicit MinSequenceContainer() = default;
   template <class It>
-  explicit MinSequenceContainer(It first, It last) : data_(first, last) {}
-  MinSequenceContainer(std::initializer_list<T> il) : data_(il) {}
+  explicit TEST_CONSTEXPR_CXX20 MinSequenceContainer(It first, It last) : data_(first, last) {}
+  TEST_CONSTEXPR_CXX20 MinSequenceContainer(std::initializer_list<T> il) : data_(il) {}
 
   template <class It>
-  void assign(It first, It last) {
+  TEST_CONSTEXPR_CXX20 void assign(It first, It last) {
     data_.assign(first, last);
   }
-  void assign(std::initializer_list<T> il) { data_.assign(il); }
-  void assign(size_type n, value_type t) { data_.assign(n, t); }
-  iterator begin() { return iterator(data_.data()); }
-  const_iterator begin() const { return const_iterator(data_.data()); }
-  const_iterator cbegin() const { return const_iterator(data_.data()); }
-  iterator end() { return begin() + size(); }
-  const_iterator end() const { return begin() + size(); }
-  size_type size() const { return static_cast<size_type>(data_.size()); }
-  bool empty() const { return data_.empty(); }
+  TEST_CONSTEXPR_CXX20 void assign(std::initializer_list<T> il) { data_.assign(il); }
+  TEST_CONSTEXPR_CXX20 void assign(size_type n, value_type t) { data_.assign(n, t); }
+  TEST_CONSTEXPR_CXX20 iterator begin() { return iterator(data_.data()); }
+  TEST_CONSTEXPR_CXX20 const_iterator begin() const { return const_iterator(data_.data()); }
+  TEST_CONSTEXPR_CXX20 const_iterator cbegin() const { return const_iterator(data_.data()); }
+  TEST_CONSTEXPR_CXX20 iterator end() { return begin() + size(); }
+  TEST_CONSTEXPR_CXX20 const_iterator end() const { return begin() + size(); }
+  TEST_CONSTEXPR_CXX20 size_type size() const { return static_cast<size_type>(data_.size()); }
+  TEST_CONSTEXPR_CXX20 bool empty() const { return data_.empty(); }
 
-  void clear() { data_.clear(); }
+  TEST_CONSTEXPR_CXX20 void clear() { data_.clear(); }
 
   template <class It>
-  iterator insert(const_iterator p, It first, It last) {
+  TEST_CONSTEXPR_CXX20 iterator insert(const_iterator p, It first, It last) {
     return from_vector_iterator(data_.insert(to_vector_iterator(p), first, last));
   }
 
-  iterator insert(const_iterator p, T value) {
+  TEST_CONSTEXPR_CXX20 iterator insert(const_iterator p, T value) {
     return from_vector_iterator(data_.insert(to_vector_iterator(p), std::move(value)));
   }
 
   template <class Range>
-  iterator insert_range(const_iterator p, Range&& rg) {
+  TEST_CONSTEXPR_CXX20 iterator insert_range(const_iterator p, Range&& rg) {
     return from_vector_iterator(data_.insert_range(to_vector_iterator(p), std::forward<Range>(rg)));
   }
 
-  iterator erase(const_iterator first, const_iterator last) {
+  TEST_CONSTEXPR_CXX20 iterator erase(const_iterator first, const_iterator last) {
     return from_vector_iterator(data_.erase(to_vector_iterator(first), to_vector_iterator(last)));
   }
 
-  iterator erase(const_iterator iter) { return from_vector_iterator(data_.erase(to_vector_iterator(iter))); }
+  TEST_CONSTEXPR_CXX20 iterator erase(const_iterator iter) {
+    return from_vector_iterator(data_.erase(to_vector_iterator(iter)));
+  }
 
   template <class... Args>
-  iterator emplace(const_iterator pos, Args&&... args) {
+  TEST_CONSTEXPR_CXX20 iterator emplace(const_iterator pos, Args&&... args) {
     return from_vector_iterator(data_.emplace(to_vector_iterator(pos), std::forward<Args>(args)...));
   }
 
 private:
-  std::vector<T>::const_iterator to_vector_iterator(const_iterator cit) const { return cit - cbegin() + data_.begin(); }
+  TEST_CONSTEXPR_CXX20 std::vector<T>::const_iterator to_vector_iterator(const_iterator cit) const {
+    return cit - cbegin() + data_.begin();
+  }
 
-  iterator from_vector_iterator(std::vector<T>::iterator it) { return it - data_.begin() + begin(); }
+  TEST_CONSTEXPR_CXX20 iterator from_vector_iterator(std::vector<T>::iterator it) {
+    return it - data_.begin() + begin();
+  }
 
   std::vector<T> data_;
 };

From 60d1276b0e0dcd287af3ea1e48d0070a5a9c752a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 21 Jun 2025 15:41:25 +0100
Subject: [PATCH 1160/1322] [VPlan] Pass operand index to canNarrowLoad. (NFC)

Explicitly pass the operand we are checking to canNarrowLoad. This
simplifies the check if the operands match across recipes and enables
future optimizations.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 30 ++++++++-----------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c0bdbb1f4f88..d66733cac4d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3107,28 +3107,22 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
 /// that feeds a store interleave group at index \p Idx, \p WideMember0 is the
 /// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can
 /// be narrowed to an index-independent load if it feeds all wide ops at all
-/// indices (checked by via the operands of the wide recipe at lane0, \p
-/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
-/// is defined at \p Idx of a load interleave group.
+/// indices (\p OpV must be the operand at index \p OpIdx for both the recipe at
+/// lane 0, \p WideMember0, and \p WideMember). A VPInterleaveRecipe can be
+/// narrowed to a wide load, if \p V is defined at \p Idx of a load interleave
+/// group.
 static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
-                          VPValue *V, unsigned Idx) {
-  auto *DefR = V->getDefiningRecipe();
+                          unsigned OpIdx, VPValue *OpV, unsigned Idx) {
+  auto *DefR = OpV->getDefiningRecipe();
   if (!DefR)
     return false;
   if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
-    return !W->getMask() &&
-           all_of(zip(WideMember0->operands(), WideMember->operands()),
-                  [V](const auto P) {
-                    // V must be as at the same places in both WideMember0 and
-                    // WideMember.
-                    const auto &[WideMember0Op, WideMemberOp] = P;
-                    return (WideMember0Op == V) == (WideMemberOp == V);
-                  });
+    return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
 
   if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
     return IR->getInterleaveGroup()->getFactor() ==
                IR->getInterleaveGroup()->getNumMembers() &&
-           IR->getVPValue(Idx) == V;
+           IR->getVPValue(Idx) == OpV;
   return false;
 }
 
@@ -3243,9 +3237,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       if (!R || R->getOpcode() != WideMember0->getOpcode() ||
           R->getNumOperands() > 2)
         return;
-      if (any_of(R->operands(), [WideMember0, Idx = I, R](VPValue *V) {
-            return !canNarrowLoad(WideMember0, R, V, Idx);
-          }))
+      if (any_of(enumerate(R->operands()),
+                 [WideMember0, Idx = I, R](const auto &P) {
+                   const auto &[OpIdx, OpV] = P;
+                   return !canNarrowLoad(WideMember0, R, OpIdx, OpV, Idx);
+                 }))
         return;
     }
     StoreGroups.push_back(InterleaveR);

From a961ba88e1d48b8df1531d778e904efb2839662c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 21 Jun 2025 23:54:27 +0900
Subject: [PATCH 1161/1322] AMDGPU: Use reportFatalUsageError for LDS mixed
 absolute addresses (#145135)

---
 llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp                | 4 ++--
 .../CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index 241dbd63eb5c..aa72c3e61f68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -278,8 +278,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
         }
         if (HasAbsoluteGVs.has_value()) {
           if (*HasAbsoluteGVs != IsAbsolute) {
-            report_fatal_error(
-                "Module cannot mix absolute and non-absolute LDS GVs");
+            reportFatalUsageError(
+                "module cannot mix absolute and non-absolute LDS GVs");
           }
         } else
           HasAbsoluteGVs = IsAbsolute;
diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
index e1ee9b2df9fa..eacf6759b039 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
@@ -1,10 +1,10 @@
-; RUN: not --crash opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
-; RUN: not --crash opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: not opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: not opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
 
 @var1 = addrspace(3) global i32 poison, !absolute_symbol !0
 @var2 = addrspace(3) global i32 poison
 
-; CHECK: Module cannot mix absolute and non-absolute LDS GVs
+; CHECK: LLVM ERROR: module cannot mix absolute and non-absolute LDS GVs
 define amdgpu_kernel void @kern() {
   %val0 = load i32, ptr addrspace(3) @var1
   %val1 = add i32 %val0, 4

From 99af99c6657db0ad76bc348fe075c873a10da031 Mon Sep 17 00:00:00 2001
From: Kunqiu Chen <camsyn@foxmail.com>
Date: Sat, 21 Jun 2025 23:02:41 +0800
Subject: [PATCH 1162/1322] [TSan] Fix p == end == ShadowMem::end in ShadowSet
 (#144994)

In `ShadowSet`, when `p == end == ShadowMem::end`, it triggered an
assertion fail previously.

Now we do not allow `p == end` anymore in `ShadowSet`.
---
 compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp
index dbdc6359d92a..bd8deefefa1b 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp
@@ -523,9 +523,9 @@ SECOND:
 }
 
 void ShadowSet(RawShadow* p, RawShadow* end, RawShadow v) {
-  DCHECK_LE(p, end);
+  DCHECK_LT(p, end);
   DCHECK(IsShadowMem(p));
-  DCHECK(p == end || IsShadowMem(end - 1));
+  DCHECK(IsShadowMem(end - 1));
   UNUSED const uptr kAlign = kShadowCnt * kShadowSize;
   DCHECK_EQ(reinterpret_cast<uptr>(p) % kAlign, 0);
   DCHECK_EQ(reinterpret_cast<uptr>(end) % kAlign, 0);
@@ -569,6 +569,7 @@ static void MemoryRangeSet(uptr addr, uptr size, RawShadow val) {
   RawShadow* mid1 =
       Min(end, reinterpret_cast<RawShadow*>(RoundUp(
                    reinterpret_cast<uptr>(begin) + kPageSize / 2, kPageSize)));
+  // begin must < mid1
   ShadowSet(begin, mid1, val);
   // Reset middle part.
   RawShadow* mid2 = RoundDown(end, kPageSize);
@@ -577,7 +578,10 @@ static void MemoryRangeSet(uptr addr, uptr size, RawShadow val) {
       Die();
   }
   // Set the ending.
-  ShadowSet(mid2, end, val);
+  if (mid2 < end)
+    ShadowSet(mid2, end, val);
+  else
+    DCHECK_EQ(mid2, end);
 }
 
 void MemoryResetRange(ThreadState* thr, uptr pc, uptr addr, uptr size) {

From e6ebf8f99ba27d60026c7473b0cd5e24c855b018 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 21 Jun 2025 08:20:49 -0700
Subject: [PATCH 1163/1322] [mlir] Migrate away from ArrayRef(std::nullopt)
 (NFC) (#145140)

ArrayRef has a constructor that accepts std::nullopt.  This
constructor dates back to the days when we still had llvm::Optional.

Since the use of std::nullopt outside the context of std::optional is
kind of abuse and not intuitive to new comers, I would like to move
away from the constructor and eventually remove it.

This patch takes care of the mlir side of the migration, starting with
straightforward places like "return std::nullopt;" and ternally
expressions involving std::nullopt.
---
 mlir/include/mlir/CAPI/Wrap.h                      |  2 +-
 mlir/include/mlir/Dialect/PDL/IR/PDLOps.td         | 13 +++++++------
 mlir/include/mlir/IR/BuiltinAttributes.td          |  5 ++---
 mlir/include/mlir/Support/StorageUniquer.h         |  2 +-
 mlir/lib/Interfaces/FunctionInterfaces.cpp         |  4 ++--
 mlir/lib/Tools/PDLL/Parser/Parser.cpp              |  9 ++++++---
 mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp |  6 ++++--
 mlir/test/lib/Dialect/Test/TestAttributes.cpp      |  2 +-
 8 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/mlir/include/mlir/CAPI/Wrap.h b/mlir/include/mlir/CAPI/Wrap.h
index 5b68f417a3df..fd5b6e18d495 100644
--- a/mlir/include/mlir/CAPI/Wrap.h
+++ b/mlir/include/mlir/CAPI/Wrap.h
@@ -44,7 +44,7 @@ static llvm::ArrayRef<CppTy> unwrapList(size_t size, CTy *first,
       "incompatible C and C++ types");
 
   if (size == 0)
-    return std::nullopt;
+    return {};
 
   assert(storage.empty() && "expected to populate storage");
   storage.reserve(size);
diff --git a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td
index 1e108c3d8ac7..6ee638c19d1a 100644
--- a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td
+++ b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td
@@ -360,12 +360,13 @@ def PDL_OperationOp : PDL_Op<"operation", [AttrSizedOperandSegments]> {
     (`->` `(` $typeValues^ `:` type($typeValues) `)`)? attr-dict
   }];
 
-  let builders = [
-    OpBuilder<(ins CArg<"std::optional<StringRef>", "std::nullopt">:$name,
-      CArg<"ValueRange", "std::nullopt">:$operandValues,
-      CArg<"ArrayRef<StringRef>", "std::nullopt">:$attrNames,
-      CArg<"ValueRange", "std::nullopt">:$attrValues,
-      CArg<"ValueRange", "std::nullopt">:$resultTypes), [{
+  let builders =
+      [OpBuilder<(ins CArg<"std::optional<StringRef>", "std::nullopt">:$name,
+                     CArg<"ValueRange", "{}">:$operandValues,
+                     CArg<"ArrayRef<StringRef>", "{}">:$attrNames,
+                     CArg<"ValueRange", "{}">:$attrValues,
+                     CArg<"ValueRange", "{}">:$resultTypes),
+                 [{
       auto nameAttr = name ? $_builder.getStringAttr(*name) : StringAttr();
       build($_builder, $_state, $_builder.getType<OperationType>(), nameAttr,
             operandValues, attrValues, $_builder.getStrArrayAttr(attrNames),
diff --git a/mlir/include/mlir/IR/BuiltinAttributes.td b/mlir/include/mlir/IR/BuiltinAttributes.td
index 885590827650..b67b8f939b9e 100644
--- a/mlir/include/mlir/IR/BuiltinAttributes.td
+++ b/mlir/include/mlir/IR/BuiltinAttributes.td
@@ -535,9 +535,8 @@ def Builtin_DictionaryAttr : Builtin_Attr<"Dictionary", "dictionary"> {
     ```
   }];
   let parameters = (ins ArrayRefParameter<"NamedAttribute", "">:$value);
-  let builders = [
-    AttrBuilder<(ins CArg<"ArrayRef<NamedAttribute>", "std::nullopt">:$value)>
-  ];
+  let builders = [AttrBuilder<(
+      ins CArg<"ArrayRef<NamedAttribute>", "{}">:$value)>];
   let extraClassDeclaration = [{
     using ValueType = ArrayRef<NamedAttribute>;
 
diff --git a/mlir/include/mlir/Support/StorageUniquer.h b/mlir/include/mlir/Support/StorageUniquer.h
index 6756c4390276..b5f4df680ac6 100644
--- a/mlir/include/mlir/Support/StorageUniquer.h
+++ b/mlir/include/mlir/Support/StorageUniquer.h
@@ -97,7 +97,7 @@ public:
     template <typename T>
     ArrayRef<T> copyInto(ArrayRef<T> elements) {
       if (elements.empty())
-        return std::nullopt;
+        return {};
       auto result = allocator.Allocate<T>(elements.size());
       llvm::uninitialized_copy(elements, result);
       return ArrayRef<T>(result, elements.size());
diff --git a/mlir/lib/Interfaces/FunctionInterfaces.cpp b/mlir/lib/Interfaces/FunctionInterfaces.cpp
index 57a8668117c6..e0f1135e992a 100644
--- a/mlir/lib/Interfaces/FunctionInterfaces.cpp
+++ b/mlir/lib/Interfaces/FunctionInterfaces.cpp
@@ -44,14 +44,14 @@ function_interface_impl::getResultAttrDict(FunctionOpInterface op,
 ArrayRef<NamedAttribute>
 function_interface_impl::getArgAttrs(FunctionOpInterface op, unsigned index) {
   auto argDict = getArgAttrDict(op, index);
-  return argDict ? argDict.getValue() : std::nullopt;
+  return argDict ? argDict.getValue() : ArrayRef<NamedAttribute>();
 }
 
 ArrayRef<NamedAttribute>
 function_interface_impl::getResultAttrs(FunctionOpInterface op,
                                         unsigned index) {
   auto resultDict = getResultAttrDict(op, index);
-  return resultDict ? resultDict.getValue() : std::nullopt;
+  return resultDict ? resultDict.getValue() : ArrayRef<NamedAttribute>();
 }
 
 /// Get either the argument or result attributes array.
diff --git a/mlir/lib/Tools/PDLL/Parser/Parser.cpp b/mlir/lib/Tools/PDLL/Parser/Parser.cpp
index c0e2252bdebc..ad7d71b0bdd2 100644
--- a/mlir/lib/Tools/PDLL/Parser/Parser.cpp
+++ b/mlir/lib/Tools/PDLL/Parser/Parser.cpp
@@ -2883,8 +2883,9 @@ Parser::validateOperationOperands(SMRange loc, std::optional<StringRef> name,
                                   SmallVectorImpl<ast::Expr *> &operands) {
   return validateOperationOperandsOrResults(
       "operand", loc, odsOp ? odsOp->getLoc() : std::optional<SMRange>(), name,
-      operands, odsOp ? odsOp->getOperands() : std::nullopt, valueTy,
-      valueRangeTy);
+      operands,
+      odsOp ? odsOp->getOperands() : ArrayRef<pdll::ods::OperandOrResult>(),
+      valueTy, valueRangeTy);
 }
 
 LogicalResult
@@ -2893,7 +2894,9 @@ Parser::validateOperationResults(SMRange loc, std::optional<StringRef> name,
                                  SmallVectorImpl<ast::Expr *> &results) {
   return validateOperationOperandsOrResults(
       "result", loc, odsOp ? odsOp->getLoc() : std::optional<SMRange>(), name,
-      results, odsOp ? odsOp->getResults() : std::nullopt, typeTy, typeRangeTy);
+      results,
+      odsOp ? odsOp->getResults() : ArrayRef<pdll::ods::OperandOrResult>(),
+      typeTy, typeRangeTy);
 }
 
 void Parser::checkOperationResultTypeInferrence(SMRange loc, StringRef opName,
diff --git a/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp b/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
index c39540da396b..84f529ae1640 100644
--- a/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
+++ b/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
@@ -1044,7 +1044,8 @@ public:
     const ods::Operation *odsOp =
         opName ? odsContext.lookupOperation(*opName) : nullptr;
     codeCompleteOperationOperandOrResultSignature(
-        opName, odsOp, odsOp ? odsOp->getOperands() : std::nullopt,
+        opName, odsOp,
+        odsOp ? odsOp->getOperands() : ArrayRef<ods::OperandOrResult>(),
         currentNumOperands, "operand", "Value");
   }
 
@@ -1053,7 +1054,8 @@ public:
     const ods::Operation *odsOp =
         opName ? odsContext.lookupOperation(*opName) : nullptr;
     codeCompleteOperationOperandOrResultSignature(
-        opName, odsOp, odsOp ? odsOp->getResults() : std::nullopt,
+        opName, odsOp,
+        odsOp ? odsOp->getResults() : ArrayRef<ods::OperandOrResult>(),
         currentNumResults, "result", "Type");
   }
 
diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
index 80661e68754c..4f6655d0b297 100644
--- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp
+++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
@@ -194,7 +194,7 @@ void TestSubElementsAccessAttr::print(::mlir::AsmPrinter &printer) const {
 ArrayRef<uint64_t> TestExtern1DI64ElementsAttr::getElements() const {
   if (auto *blob = getHandle().getBlob())
     return blob->getDataAs<uint64_t>();
-  return std::nullopt;
+  return {};
 }
 
 //===----------------------------------------------------------------------===//

From ae372bfca890cc7a67553b3cb19134359b66c0e1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 21 Jun 2025 08:20:57 -0700
Subject: [PATCH 1164/1322] [CodeGen] Use range-based for loops (NFC) (#145142)

---
 clang/include/clang/CodeGen/CGFunctionInfo.h |  6 +-
 clang/lib/CodeGen/CGOpenMPRuntime.cpp        | 26 +++----
 clang/lib/CodeGen/CGRecordLayoutBuilder.cpp  | 74 +++++++++-----------
 clang/lib/CodeGen/CGStmt.cpp                 | 11 ++-
 clang/lib/CodeGen/MicrosoftCXXABI.cpp        |  4 +-
 clang/lib/CodeGen/ModuleBuilder.cpp          |  4 +-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp     | 16 ++---
 clang/lib/CodeGen/Targets/X86.cpp            |  5 +-
 8 files changed, 63 insertions(+), 83 deletions(-)

diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h
index 040ee025afaa..50be51769f1a 100644
--- a/clang/include/clang/CodeGen/CGFunctionInfo.h
+++ b/clang/include/clang/CodeGen/CGFunctionInfo.h
@@ -828,10 +828,8 @@ public:
         ID.AddInteger(paramInfo.getOpaqueValue());
     }
     resultType.Profile(ID);
-    for (ArrayRef<CanQualType>::iterator
-           i = argTypes.begin(), e = argTypes.end(); i != e; ++i) {
-      i->Profile(ID);
-    }
+    for (const CanQualType &argType : argTypes)
+      argType.Profile(ID);
   }
 };
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 4173355491fd..8ccc37ef98a7 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4168,8 +4168,7 @@ void CGOpenMPRuntime::emitDepobjElements(CodeGenFunction &CGF,
         CGF, cast_or_null<OMPIteratorExpr>(
                  Data.IteratorExpr ? Data.IteratorExpr->IgnoreParenImpCasts()
                                    : nullptr));
-    for (unsigned I = 0, End = Data.DepExprs.size(); I < End; ++I) {
-      const Expr *E = Data.DepExprs[I];
+    for (const Expr *E : Data.DepExprs) {
       llvm::Value *NumDeps;
       LValue Base;
       LValue DepobjLVal = CGF.EmitLValue(E->IgnoreParenImpCasts());
@@ -4289,31 +4288,26 @@ std::pair<llvm::Value *, Address> CGOpenMPRuntime::emitDependClause(
                                            /*isSigned=*/false);
   }
   unsigned Pos = 0;
-  for (unsigned I = 0, End = Dependencies.size(); I < End; ++I) {
-    if (Dependencies[I].DepKind == OMPC_DEPEND_depobj ||
-        Dependencies[I].IteratorExpr)
+  for (const OMPTaskDataTy::DependData &Dep : Dependencies) {
+    if (Dep.DepKind == OMPC_DEPEND_depobj || Dep.IteratorExpr)
       continue;
-    emitDependData(CGF, KmpDependInfoTy, &Pos, Dependencies[I],
-                   DependenciesArray);
+    emitDependData(CGF, KmpDependInfoTy, &Pos, Dep, DependenciesArray);
   }
   // Copy regular dependencies with iterators.
   LValue PosLVal = CGF.MakeAddrLValue(
       CGF.CreateMemTemp(C.getSizeType(), "dep.counter.addr"), C.getSizeType());
   CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.SizeTy, Pos), PosLVal);
-  for (unsigned I = 0, End = Dependencies.size(); I < End; ++I) {
-    if (Dependencies[I].DepKind == OMPC_DEPEND_depobj ||
-        !Dependencies[I].IteratorExpr)
+  for (const OMPTaskDataTy::DependData &Dep : Dependencies) {
+    if (Dep.DepKind == OMPC_DEPEND_depobj || !Dep.IteratorExpr)
       continue;
-    emitDependData(CGF, KmpDependInfoTy, &PosLVal, Dependencies[I],
-                   DependenciesArray);
+    emitDependData(CGF, KmpDependInfoTy, &PosLVal, Dep, DependenciesArray);
   }
   // Copy final depobj arrays without iterators.
   if (HasDepobjDeps) {
-    for (unsigned I = 0, End = Dependencies.size(); I < End; ++I) {
-      if (Dependencies[I].DepKind != OMPC_DEPEND_depobj)
+    for (const OMPTaskDataTy::DependData &Dep : Dependencies) {
+      if (Dep.DepKind != OMPC_DEPEND_depobj)
         continue;
-      emitDepobjElements(CGF, KmpDependInfoTy, PosLVal, Dependencies[I],
-                         DependenciesArray);
+      emitDepobjElements(CGF, KmpDependInfoTy, PosLVal, Dep, DependenciesArray);
     }
   }
   DependenciesArray = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp
index 232e2d8b43ca..e1310aed818a 100644
--- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp
+++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp
@@ -972,18 +972,16 @@ void CGRecordLowering::determinePacked(bool NVBaseType) {
   CharUnits NVAlignment = CharUnits::One();
   CharUnits NVSize =
       !NVBaseType && RD ? Layout.getNonVirtualSize() : CharUnits::Zero();
-  for (std::vector<MemberInfo>::const_iterator Member = Members.begin(),
-                                               MemberEnd = Members.end();
-       Member != MemberEnd; ++Member) {
-    if (!Member->Data)
+  for (const MemberInfo &Member : Members) {
+    if (!Member.Data)
       continue;
     // If any member falls at an offset that it not a multiple of its alignment,
     // then the entire record must be packed.
-    if (Member->Offset % getAlignment(Member->Data))
+    if (Member.Offset % getAlignment(Member.Data))
       Packed = true;
-    if (Member->Offset < NVSize)
-      NVAlignment = std::max(NVAlignment, getAlignment(Member->Data));
-    Alignment = std::max(Alignment, getAlignment(Member->Data));
+    if (Member.Offset < NVSize)
+      NVAlignment = std::max(NVAlignment, getAlignment(Member.Data));
+    Alignment = std::max(Alignment, getAlignment(Member.Data));
   }
   // If the size of the record (the capstone's offset) is not a multiple of the
   // record's alignment, it must be packed.
@@ -1002,45 +1000,39 @@ void CGRecordLowering::determinePacked(bool NVBaseType) {
 void CGRecordLowering::insertPadding() {
   std::vector<std::pair<CharUnits, CharUnits> > Padding;
   CharUnits Size = CharUnits::Zero();
-  for (std::vector<MemberInfo>::const_iterator Member = Members.begin(),
-                                               MemberEnd = Members.end();
-       Member != MemberEnd; ++Member) {
-    if (!Member->Data)
+  for (const MemberInfo &Member : Members) {
+    if (!Member.Data)
       continue;
-    CharUnits Offset = Member->Offset;
+    CharUnits Offset = Member.Offset;
     assert(Offset >= Size);
     // Insert padding if we need to.
     if (Offset !=
-        Size.alignTo(Packed ? CharUnits::One() : getAlignment(Member->Data)))
+        Size.alignTo(Packed ? CharUnits::One() : getAlignment(Member.Data)))
       Padding.push_back(std::make_pair(Size, Offset - Size));
-    Size = Offset + getSize(Member->Data);
+    Size = Offset + getSize(Member.Data);
   }
   if (Padding.empty())
     return;
   // Add the padding to the Members list and sort it.
-  for (std::vector<std::pair<CharUnits, CharUnits> >::const_iterator
-        Pad = Padding.begin(), PadEnd = Padding.end();
-        Pad != PadEnd; ++Pad)
-    Members.push_back(StorageInfo(Pad->first, getByteArrayType(Pad->second)));
+  for (const auto &Pad : Padding)
+    Members.push_back(StorageInfo(Pad.first, getByteArrayType(Pad.second)));
   llvm::stable_sort(Members);
 }
 
 void CGRecordLowering::fillOutputFields() {
-  for (std::vector<MemberInfo>::const_iterator Member = Members.begin(),
-                                               MemberEnd = Members.end();
-       Member != MemberEnd; ++Member) {
-    if (Member->Data)
-      FieldTypes.push_back(Member->Data);
-    if (Member->Kind == MemberInfo::Field) {
-      if (Member->FD)
-        Fields[Member->FD->getCanonicalDecl()] = FieldTypes.size() - 1;
+  for (const MemberInfo &Member : Members) {
+    if (Member.Data)
+      FieldTypes.push_back(Member.Data);
+    if (Member.Kind == MemberInfo::Field) {
+      if (Member.FD)
+        Fields[Member.FD->getCanonicalDecl()] = FieldTypes.size() - 1;
       // A field without storage must be a bitfield.
-      if (!Member->Data)
-        setBitFieldInfo(Member->FD, Member->Offset, FieldTypes.back());
-    } else if (Member->Kind == MemberInfo::Base)
-      NonVirtualBases[Member->RD] = FieldTypes.size() - 1;
-    else if (Member->Kind == MemberInfo::VBase)
-      VirtualBases[Member->RD] = FieldTypes.size() - 1;
+      if (!Member.Data)
+        setBitFieldInfo(Member.FD, Member.Offset, FieldTypes.back());
+    } else if (Member.Kind == MemberInfo::Base)
+      NonVirtualBases[Member.RD] = FieldTypes.size() - 1;
+    else if (Member.Kind == MemberInfo::VBase)
+      VirtualBases[Member.RD] = FieldTypes.size() - 1;
   }
 }
 
@@ -1224,20 +1216,18 @@ void CGRecordLayout::print(raw_ostream &OS) const {
 
   // Print bit-field infos in declaration order.
   std::vector<std::pair<unsigned, const CGBitFieldInfo*> > BFIs;
-  for (llvm::DenseMap<const FieldDecl*, CGBitFieldInfo>::const_iterator
-         it = BitFields.begin(), ie = BitFields.end();
-       it != ie; ++it) {
-    const RecordDecl *RD = it->first->getParent();
+  for (const auto &BitField : BitFields) {
+    const RecordDecl *RD = BitField.first->getParent();
     unsigned Index = 0;
-    for (RecordDecl::field_iterator
-           it2 = RD->field_begin(); *it2 != it->first; ++it2)
+    for (RecordDecl::field_iterator it2 = RD->field_begin();
+         *it2 != BitField.first; ++it2)
       ++Index;
-    BFIs.push_back(std::make_pair(Index, &it->second));
+    BFIs.push_back(std::make_pair(Index, &BitField.second));
   }
   llvm::array_pod_sort(BFIs.begin(), BFIs.end());
-  for (unsigned i = 0, e = BFIs.size(); i != e; ++i) {
+  for (auto &BFI : BFIs) {
     OS.indent(4);
-    BFIs[i].second->print(OS);
+    BFI.second->print(OS);
     OS << "\n";
   }
 
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 8742f8e0fc04..e0650067b954 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -757,10 +757,9 @@ void CodeGenFunction::LexicalScope::rescopeLabels() {
     = CGF.EHStack.getInnermostNormalCleanup();
 
   // Change the scope depth of all the labels.
-  for (SmallVectorImpl<const LabelDecl*>::const_iterator
-         i = Labels.begin(), e = Labels.end(); i != e; ++i) {
-    assert(CGF.LabelMap.count(*i));
-    JumpDest &dest = CGF.LabelMap.find(*i)->second;
+  for (const LabelDecl *Label : Labels) {
+    assert(CGF.LabelMap.count(Label));
+    JumpDest &dest = CGF.LabelMap.find(Label)->second;
     assert(dest.getScopeDepth().isValid());
     assert(innermostScope.encloses(dest.getScopeDepth()));
     dest.setScopeDepth(innermostScope);
@@ -2302,8 +2301,8 @@ void CodeGenFunction::EmitSwitchStmt(const SwitchStmt &S) {
 
       // Okay, we can dead code eliminate everything except this case.  Emit the
       // specified series of statements and we're good.
-      for (unsigned i = 0, e = CaseStmts.size(); i != e; ++i)
-        EmitStmt(CaseStmts[i]);
+      for (const Stmt *CaseStmt : CaseStmts)
+        EmitStmt(CaseStmt);
       incrementProfileCounter(&S);
       PGO->markStmtMaybeUsed(S.getBody());
 
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index 514cc1d9015e..a18155983429 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -1836,9 +1836,9 @@ llvm::GlobalVariable *MicrosoftCXXABI::getAddrOfVTable(const CXXRecordDecl *RD,
     // Create all the vftables at once in order to make sure each vftable has
     // a unique mangled name.
     llvm::StringSet<> ObservedMangledNames;
-    for (size_t J = 0, F = VFPtrs.size(); J != F; ++J) {
+    for (const auto &VFPtr : VFPtrs) {
       SmallString<256> Name;
-      mangleVFTableName(getMangleContext(), RD, *VFPtrs[J], Name);
+      mangleVFTableName(getMangleContext(), RD, *VFPtr, Name);
       if (!ObservedMangledNames.insert(Name.str()).second)
         llvm_unreachable("Already saw this mangling before?");
     }
diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp
index 09a7d79ae4af..8c1fee8c974f 100644
--- a/clang/lib/CodeGen/ModuleBuilder.cpp
+++ b/clang/lib/CodeGen/ModuleBuilder.cpp
@@ -186,8 +186,8 @@ namespace {
       HandlingTopLevelDeclRAII HandlingDecl(*this);
 
       // Make sure to emit all elements of a Decl.
-      for (DeclGroupRef::iterator I = DG.begin(), E = DG.end(); I != E; ++I)
-        Builder->EmitTopLevelDecl(*I);
+      for (auto &I : DG)
+        Builder->EmitTopLevelDecl(I);
 
       return true;
     }
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index dab311903f6d..6738d4be6dd2 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -4560,10 +4560,10 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
       Ops.insert(&Ops[1], Builder.getInt32(/*SV_ALL*/ 31));
 
     // Predicates must match the main datatype.
-    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
-      if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
+    for (Value *&Op : Ops)
+      if (auto PredTy = dyn_cast<llvm::VectorType>(Op->getType()))
         if (PredTy->getElementType()->isIntegerTy(1))
-          Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
+          Op = EmitSVEPredicateCast(Op, getSVEType(TypeFlags));
 
     // Splat scalar operand to vector (intrinsics with _n infix)
     if (TypeFlags.hasSplatOperand()) {
@@ -4936,10 +4936,10 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
   }
 
   // Predicates must match the main datatype.
-  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
-    if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
+  for (Value *&Op : Ops)
+    if (auto PredTy = dyn_cast<llvm::VectorType>(Op->getType()))
       if (PredTy->getElementType()->isIntegerTy(1))
-        Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
+        Op = EmitSVEPredicateCast(Op, getSVEType(TypeFlags));
 
   Function *F =
       TypeFlags.isOverloadNone()
@@ -8036,8 +8036,8 @@ BuildVector(ArrayRef<llvm::Value*> Ops) {
   // If this is a constant vector, create a ConstantVector.
   if (AllConstants) {
     SmallVector<llvm::Constant*, 16> CstOps;
-    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
-      CstOps.push_back(cast<Constant>(Ops[i]));
+    for (llvm::Value *Op : Ops)
+      CstOps.push_back(cast<Constant>(Op));
     return llvm::ConstantVector::get(CstOps);
   }
 
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index b36a6e139665..0f59caac2323 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -1462,9 +1462,8 @@ public:
     // defines varargs anyway.
     if (fnType->getCallConv() == CC_C) {
       bool HasAVXType = false;
-      for (CallArgList::const_iterator
-             it = args.begin(), ie = args.end(); it != ie; ++it) {
-        if (getABIInfo<X86_64ABIInfo>().isPassedUsingAVXType(it->Ty)) {
+      for (const CallArg &arg : args) {
+        if (getABIInfo<X86_64ABIInfo>().isPassedUsingAVXType(arg.Ty)) {
           HasAVXType = true;
           break;
         }

From 463ce0131047457fdd320c852f187fd70532de9a Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 21 Jun 2025 08:21:04 -0700
Subject: [PATCH 1165/1322] [CodeGen] Remove extraneous casts to VariableID
 (NFC) (#145144)

We do not need these casts because values being cast here are already
of type VariableID.
---
 llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index ffdf08eec996..7da01e331b5a 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -1312,7 +1312,7 @@ private:
   DenseSet<DebugAggregate> NotAlwaysStackHomed;
 
   VariableID getVariableID(const DebugVariable &Var) {
-    return static_cast<VariableID>(FnVarLocs->insertVariable(Var));
+    return FnVarLocs->insertVariable(Var);
   }
 
   /// Join the LiveOut values of preds that are contained in \p Visited into
@@ -1556,7 +1556,7 @@ void AssignmentTrackingLowering::emitDbgValue(
 
     VariableID Var = getVariableID(DebugVariable(Source));
     VarLocInfo VarLoc;
-    VarLoc.VariableID = static_cast<VariableID>(Var);
+    VarLoc.VariableID = Var;
     VarLoc.Expr = Expr;
     VarLoc.Values = RawLocationWrapper(Val);
     VarLoc.DL = DL;
@@ -1642,7 +1642,7 @@ void AssignmentTrackingLowering::processUnknownStoreToVariable(
       Fn.getContext(), 0, 0, V.getVariable()->getScope(), InlinedAt);
 
   VarLocInfo VarLoc;
-  VarLoc.VariableID = static_cast<VariableID>(Var);
+  VarLoc.VariableID = Var;
   VarLoc.Expr = DIExpression::get(I.getContext(), {});
   VarLoc.Values = RawLocationWrapper(
       ValueAsMetadata::get(PoisonValue::get(Type::getInt1Ty(I.getContext()))));

From 4c1a1009ad8a346068ad9428966b008ac793c170 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik@intel.com>
Date: Sat, 21 Jun 2025 09:34:41 -0700
Subject: [PATCH 1166/1322] [X86][NFC] Use std::move to avoid copy (#141455)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 33083c0eba69..53c0da45f2f6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41266,7 +41266,7 @@ static SDValue combineX86ShufflesRecursively(
     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
                                       ResolveKnownZeros);
 
-    Mask = OpMask;
+    Mask = std::move(OpMask);
     Ops.append(OpInputs.begin(), OpInputs.end());
   } else {
     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);

From 9f7a15539441cc589c4116de43e752c9c62cafef Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 21 Jun 2025 18:01:32 +0100
Subject: [PATCH 1167/1322] [VPlan] Update packScalarIntoVector to take and
 return wide value (NFC)

Make the function more flexible in preparation for new users.
---
 llvm/lib/Transforms/Vectorize/VPlan.cpp        | 17 ++++++++---------
 llvm/lib/Transforms/Vectorize/VPlanHelpers.h   |  7 ++++---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 ++++++----
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 773a5a4a829c..a3f39f5ad7a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -364,13 +364,12 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     VectorValue = GetBroadcastInstrs(ScalarValue);
     set(Def, VectorValue);
   } else {
-    // Initialize packing with insertelements to start from undef.
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
-    Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
-    set(Def, Undef);
+    // Initialize packing with insertelements to start from poison.
+    VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
-      packScalarIntoVectorizedValue(Def, Lane);
-    VectorValue = get(Def);
+      VectorValue = packScalarIntoVectorizedValue(Def, VectorValue, Lane);
+    set(Def, VectorValue);
   }
   Builder.restoreIP(OldIP);
   return VectorValue;
@@ -398,10 +397,10 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
     Builder.SetCurrentDebugLocation(DL);
 }
 
-void VPTransformState::packScalarIntoVectorizedValue(const VPValue *Def,
-                                                     const VPLane &Lane) {
+Value *VPTransformState::packScalarIntoVectorizedValue(const VPValue *Def,
+                                                       Value *WideValue,
+                                                       const VPLane &Lane) {
   Value *ScalarInst = get(Def, Lane);
-  Value *WideValue = get(Def);
   Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
   if (auto *StructTy = dyn_cast<StructType>(WideValue->getType())) {
     // We must handle each element of a vectorized struct type.
@@ -415,7 +414,7 @@ void VPTransformState::packScalarIntoVectorizedValue(const VPValue *Def,
   } else {
     WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, LaneExpr);
   }
-  set(Def, WideValue);
+  return WideValue;
 }
 
 BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 0446991ebfff..f33f94b7162c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -286,9 +286,10 @@ struct VPTransformState {
   /// Set the debug location in the builder using the debug location \p DL.
   void setDebugLocFrom(DebugLoc DL);
 
-  /// Construct the vectorized value of a scalarized value \p V one lane at a
-  /// time.
-  void packScalarIntoVectorizedValue(const VPValue *Def, const VPLane &Lane);
+  /// Insert the scalar value of \p Def at \p Lane into \p Lane of \p WideValue
+  /// and return the resulting value.
+  Value *packScalarIntoVectorizedValue(const VPValue *Def, Value *WideValue,
+                                       const VPLane &Lane);
 
   /// Hold state information used when constructing the CFG of the output IR,
   /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3d237de5fa8d..3e12fdf9163e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2634,14 +2634,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
     scalarizeInstruction(UI, this, *State.Lane, State);
     // Insert scalar instance packing it into a vector.
     if (State.VF.isVector() && shouldPack()) {
+      Value *WideValue;
       // If we're constructing lane 0, initialize to start from poison.
       if (State.Lane->isFirstLane()) {
         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
-        Value *Poison =
-            PoisonValue::get(VectorType::get(UI->getType(), State.VF));
-        State.set(this, Poison);
+        WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF));
+      } else {
+        WideValue = State.get(this);
       }
-      State.packScalarIntoVectorizedValue(this, *State.Lane);
+      State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
+                                                          *State.Lane));
     }
     return;
   }

From 1da864b574f699d5c9be68dca9b3969ad50f4803 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sat, 21 Jun 2025 10:17:30 -0700
Subject: [PATCH 1168/1322] [RISCV] Properly support RISCVISD::LLA in
 getTargetConstantFromLoad. (#145112)

We need to pass the operand of LLA to GetSupportedConstantPool.

This replaces #142292 with test from there added as a pre-commit
for both medlow and pic.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  2 +-
 .../CodeGen/RISCV/constpool-known-bits.ll     | 69 +++++++++++++++++++
 2 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/RISCV/constpool-known-bits.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0c54101a1156..9e568052079c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21062,7 +21062,7 @@ RISCVTargetLowering::getTargetConstantFromLoad(LoadSDNode *Ld) const {
 
   // Simple case, LLA.
   if (Ptr.getOpcode() == RISCVISD::LLA) {
-    auto *CNode = GetSupportedConstantPool(Ptr);
+    auto *CNode = GetSupportedConstantPool(Ptr.getOperand(0));
     if (!CNode || CNode->getTargetFlags() != 0)
       return nullptr;
 
diff --git a/llvm/test/CodeGen/RISCV/constpool-known-bits.ll b/llvm/test/CodeGen/RISCV/constpool-known-bits.ll
new file mode 100644
index 000000000000..85a6de1095a0
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/constpool-known-bits.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O0 -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefix=NOPIC
+; RUN: llc < %s -O0 -mtriple=riscv64 -mattr=+m -relocation-model=pic | FileCheck %s --check-prefix=PIC
+
+define i64 @test(i32 noundef signext %c, i32 noundef signext %d) {
+; NOPIC-LABEL: test:
+; NOPIC:       # %bb.0: # %entry
+; NOPIC-NEXT:    # kill: def $x11 killed $x10
+; NOPIC-NEXT:    slli a0, a0, 32
+; NOPIC-NEXT:    srli a1, a0, 32
+; NOPIC-NEXT:    lui a0, %hi(.LCPI0_0)
+; NOPIC-NEXT:    ld a0, %lo(.LCPI0_0)(a0)
+; NOPIC-NEXT:    mul a0, a1, a0
+; NOPIC-NEXT:    addi a0, a0, 127
+; NOPIC-NEXT:    mul a0, a1, a0
+; NOPIC-NEXT:    lui a2, %hi(.LCPI0_1)
+; NOPIC-NEXT:    ld a2, %lo(.LCPI0_1)(a2)
+; NOPIC-NEXT:    mul a0, a0, a2
+; NOPIC-NEXT:    add a0, a0, a1
+; NOPIC-NEXT:    lui a1, 1015920
+; NOPIC-NEXT:    addi a1, a1, 1541
+; NOPIC-NEXT:    slli a1, a1, 16
+; NOPIC-NEXT:    addi a1, a1, 1027
+; NOPIC-NEXT:    slli a1, a1, 20
+; NOPIC-NEXT:    add a0, a0, a1
+; NOPIC-NEXT:    ret
+;
+; PIC-LABEL: test:
+; PIC:       # %bb.0: # %entry
+; PIC-NEXT:    # kill: def $x11 killed $x10
+; PIC-NEXT:    slli a0, a0, 32
+; PIC-NEXT:    srli a1, a0, 32
+; PIC-NEXT:  .Lpcrel_hi0:
+; PIC-NEXT:    auipc a0, %pcrel_hi(.LCPI0_0)
+; PIC-NEXT:    addi a0, a0, %pcrel_lo(.Lpcrel_hi0)
+; PIC-NEXT:    ld a0, 0(a0)
+; PIC-NEXT:    mul a0, a1, a0
+; PIC-NEXT:    addi a0, a0, 127
+; PIC-NEXT:    mul a0, a1, a0
+; PIC-NEXT:  .Lpcrel_hi1:
+; PIC-NEXT:    auipc a2, %pcrel_hi(.LCPI0_1)
+; PIC-NEXT:    addi a2, a2, %pcrel_lo(.Lpcrel_hi1)
+; PIC-NEXT:    ld a2, 0(a2)
+; PIC-NEXT:    mul a0, a0, a2
+; PIC-NEXT:    add a0, a0, a1
+; PIC-NEXT:    lui a1, 1015920
+; PIC-NEXT:    addi a1, a1, 1541
+; PIC-NEXT:    slli a1, a1, 16
+; PIC-NEXT:    addi a1, a1, 1027
+; PIC-NEXT:    slli a1, a1, 20
+; PIC-NEXT:    add a0, a0, a1
+; PIC-NEXT:    ret
+entry:
+  %or1 = or i64 -9191740941672644608, 4096
+  %or2 = or i64 -9191740941672644608, 8192
+  %or3 = or i64 -9191740941672644608, 16384
+  %conv = zext i32 %c to i64
+  %donv = zext i32 %d to i64
+  %3 = mul i64 %or1, %conv
+  %4 = mul i64 %or2, %donv
+  %5 = mul i64 %or3, %conv
+  %6 = add i64 %3, %4
+  %7 = add i64 %6, %5
+  %8 = or i64 %5, 127
+  %9 = mul i64 %3, %8
+  %add = add i64 -9191740941672644608, %9
+  %add2 = add i64 %add, %conv
+  ret i64 %add2
+}

From fc36e47a495941910d48f29c7d305e8bbd9115fa Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sat, 21 Jun 2025 10:18:22 -0700
Subject: [PATCH 1169/1322] Revert "[RISCV] Properly support RISCVISD::LLA in
 getTargetConstantFromLoad. (#145112)"

I missed the Co-authored-by that I tried to add.

This reverts commit 1da864b574f699d5c9be68dca9b3969ad50f4803.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  2 +-
 .../CodeGen/RISCV/constpool-known-bits.ll     | 69 -------------------
 2 files changed, 1 insertion(+), 70 deletions(-)
 delete mode 100644 llvm/test/CodeGen/RISCV/constpool-known-bits.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9e568052079c..0c54101a1156 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21062,7 +21062,7 @@ RISCVTargetLowering::getTargetConstantFromLoad(LoadSDNode *Ld) const {
 
   // Simple case, LLA.
   if (Ptr.getOpcode() == RISCVISD::LLA) {
-    auto *CNode = GetSupportedConstantPool(Ptr.getOperand(0));
+    auto *CNode = GetSupportedConstantPool(Ptr);
     if (!CNode || CNode->getTargetFlags() != 0)
       return nullptr;
 
diff --git a/llvm/test/CodeGen/RISCV/constpool-known-bits.ll b/llvm/test/CodeGen/RISCV/constpool-known-bits.ll
deleted file mode 100644
index 85a6de1095a0..000000000000
--- a/llvm/test/CodeGen/RISCV/constpool-known-bits.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -O0 -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefix=NOPIC
-; RUN: llc < %s -O0 -mtriple=riscv64 -mattr=+m -relocation-model=pic | FileCheck %s --check-prefix=PIC
-
-define i64 @test(i32 noundef signext %c, i32 noundef signext %d) {
-; NOPIC-LABEL: test:
-; NOPIC:       # %bb.0: # %entry
-; NOPIC-NEXT:    # kill: def $x11 killed $x10
-; NOPIC-NEXT:    slli a0, a0, 32
-; NOPIC-NEXT:    srli a1, a0, 32
-; NOPIC-NEXT:    lui a0, %hi(.LCPI0_0)
-; NOPIC-NEXT:    ld a0, %lo(.LCPI0_0)(a0)
-; NOPIC-NEXT:    mul a0, a1, a0
-; NOPIC-NEXT:    addi a0, a0, 127
-; NOPIC-NEXT:    mul a0, a1, a0
-; NOPIC-NEXT:    lui a2, %hi(.LCPI0_1)
-; NOPIC-NEXT:    ld a2, %lo(.LCPI0_1)(a2)
-; NOPIC-NEXT:    mul a0, a0, a2
-; NOPIC-NEXT:    add a0, a0, a1
-; NOPIC-NEXT:    lui a1, 1015920
-; NOPIC-NEXT:    addi a1, a1, 1541
-; NOPIC-NEXT:    slli a1, a1, 16
-; NOPIC-NEXT:    addi a1, a1, 1027
-; NOPIC-NEXT:    slli a1, a1, 20
-; NOPIC-NEXT:    add a0, a0, a1
-; NOPIC-NEXT:    ret
-;
-; PIC-LABEL: test:
-; PIC:       # %bb.0: # %entry
-; PIC-NEXT:    # kill: def $x11 killed $x10
-; PIC-NEXT:    slli a0, a0, 32
-; PIC-NEXT:    srli a1, a0, 32
-; PIC-NEXT:  .Lpcrel_hi0:
-; PIC-NEXT:    auipc a0, %pcrel_hi(.LCPI0_0)
-; PIC-NEXT:    addi a0, a0, %pcrel_lo(.Lpcrel_hi0)
-; PIC-NEXT:    ld a0, 0(a0)
-; PIC-NEXT:    mul a0, a1, a0
-; PIC-NEXT:    addi a0, a0, 127
-; PIC-NEXT:    mul a0, a1, a0
-; PIC-NEXT:  .Lpcrel_hi1:
-; PIC-NEXT:    auipc a2, %pcrel_hi(.LCPI0_1)
-; PIC-NEXT:    addi a2, a2, %pcrel_lo(.Lpcrel_hi1)
-; PIC-NEXT:    ld a2, 0(a2)
-; PIC-NEXT:    mul a0, a0, a2
-; PIC-NEXT:    add a0, a0, a1
-; PIC-NEXT:    lui a1, 1015920
-; PIC-NEXT:    addi a1, a1, 1541
-; PIC-NEXT:    slli a1, a1, 16
-; PIC-NEXT:    addi a1, a1, 1027
-; PIC-NEXT:    slli a1, a1, 20
-; PIC-NEXT:    add a0, a0, a1
-; PIC-NEXT:    ret
-entry:
-  %or1 = or i64 -9191740941672644608, 4096
-  %or2 = or i64 -9191740941672644608, 8192
-  %or3 = or i64 -9191740941672644608, 16384
-  %conv = zext i32 %c to i64
-  %donv = zext i32 %d to i64
-  %3 = mul i64 %or1, %conv
-  %4 = mul i64 %or2, %donv
-  %5 = mul i64 %or3, %conv
-  %6 = add i64 %3, %4
-  %7 = add i64 %6, %5
-  %8 = or i64 %5, 127
-  %9 = mul i64 %3, %8
-  %add = add i64 -9191740941672644608, %9
-  %add2 = add i64 %add, %conv
-  ret i64 %add2
-}

From 0c47628515dc80bd50599f936614da07943572a4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sat, 21 Jun 2025 10:18:49 -0700
Subject: [PATCH 1170/1322] Re-commit "[RISCV] Properly support RISCVISD::LLA
 in getTargetConstantFromLoad. (#145112)"

With proper co-author.

Original message:

We need to pass the operand of LLA to GetSupportedConstantPool.

This replaces #142292 with test from there added as a pre-commit
for both medlow and pic.

Co-authored-by: Carl Nettelblad carl.nettelblad@rapidity-space.com
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  2 +-
 .../CodeGen/RISCV/constpool-known-bits.ll     | 69 +++++++++++++++++++
 2 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/RISCV/constpool-known-bits.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0c54101a1156..9e568052079c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21062,7 +21062,7 @@ RISCVTargetLowering::getTargetConstantFromLoad(LoadSDNode *Ld) const {
 
   // Simple case, LLA.
   if (Ptr.getOpcode() == RISCVISD::LLA) {
-    auto *CNode = GetSupportedConstantPool(Ptr);
+    auto *CNode = GetSupportedConstantPool(Ptr.getOperand(0));
     if (!CNode || CNode->getTargetFlags() != 0)
       return nullptr;
 
diff --git a/llvm/test/CodeGen/RISCV/constpool-known-bits.ll b/llvm/test/CodeGen/RISCV/constpool-known-bits.ll
new file mode 100644
index 000000000000..85a6de1095a0
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/constpool-known-bits.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O0 -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefix=NOPIC
+; RUN: llc < %s -O0 -mtriple=riscv64 -mattr=+m -relocation-model=pic | FileCheck %s --check-prefix=PIC
+
+define i64 @test(i32 noundef signext %c, i32 noundef signext %d) {
+; NOPIC-LABEL: test:
+; NOPIC:       # %bb.0: # %entry
+; NOPIC-NEXT:    # kill: def $x11 killed $x10
+; NOPIC-NEXT:    slli a0, a0, 32
+; NOPIC-NEXT:    srli a1, a0, 32
+; NOPIC-NEXT:    lui a0, %hi(.LCPI0_0)
+; NOPIC-NEXT:    ld a0, %lo(.LCPI0_0)(a0)
+; NOPIC-NEXT:    mul a0, a1, a0
+; NOPIC-NEXT:    addi a0, a0, 127
+; NOPIC-NEXT:    mul a0, a1, a0
+; NOPIC-NEXT:    lui a2, %hi(.LCPI0_1)
+; NOPIC-NEXT:    ld a2, %lo(.LCPI0_1)(a2)
+; NOPIC-NEXT:    mul a0, a0, a2
+; NOPIC-NEXT:    add a0, a0, a1
+; NOPIC-NEXT:    lui a1, 1015920
+; NOPIC-NEXT:    addi a1, a1, 1541
+; NOPIC-NEXT:    slli a1, a1, 16
+; NOPIC-NEXT:    addi a1, a1, 1027
+; NOPIC-NEXT:    slli a1, a1, 20
+; NOPIC-NEXT:    add a0, a0, a1
+; NOPIC-NEXT:    ret
+;
+; PIC-LABEL: test:
+; PIC:       # %bb.0: # %entry
+; PIC-NEXT:    # kill: def $x11 killed $x10
+; PIC-NEXT:    slli a0, a0, 32
+; PIC-NEXT:    srli a1, a0, 32
+; PIC-NEXT:  .Lpcrel_hi0:
+; PIC-NEXT:    auipc a0, %pcrel_hi(.LCPI0_0)
+; PIC-NEXT:    addi a0, a0, %pcrel_lo(.Lpcrel_hi0)
+; PIC-NEXT:    ld a0, 0(a0)
+; PIC-NEXT:    mul a0, a1, a0
+; PIC-NEXT:    addi a0, a0, 127
+; PIC-NEXT:    mul a0, a1, a0
+; PIC-NEXT:  .Lpcrel_hi1:
+; PIC-NEXT:    auipc a2, %pcrel_hi(.LCPI0_1)
+; PIC-NEXT:    addi a2, a2, %pcrel_lo(.Lpcrel_hi1)
+; PIC-NEXT:    ld a2, 0(a2)
+; PIC-NEXT:    mul a0, a0, a2
+; PIC-NEXT:    add a0, a0, a1
+; PIC-NEXT:    lui a1, 1015920
+; PIC-NEXT:    addi a1, a1, 1541
+; PIC-NEXT:    slli a1, a1, 16
+; PIC-NEXT:    addi a1, a1, 1027
+; PIC-NEXT:    slli a1, a1, 20
+; PIC-NEXT:    add a0, a0, a1
+; PIC-NEXT:    ret
+entry:
+  %or1 = or i64 -9191740941672644608, 4096
+  %or2 = or i64 -9191740941672644608, 8192
+  %or3 = or i64 -9191740941672644608, 16384
+  %conv = zext i32 %c to i64
+  %donv = zext i32 %d to i64
+  %3 = mul i64 %or1, %conv
+  %4 = mul i64 %or2, %donv
+  %5 = mul i64 %or3, %conv
+  %6 = add i64 %3, %4
+  %7 = add i64 %6, %5
+  %8 = or i64 %5, 127
+  %9 = mul i64 %3, %8
+  %add = add i64 -9191740941672644608, %9
+  %add2 = add i64 %add, %conv
+  ret i64 %add2
+}

From 6c8c816b175b3b14f47f35619cade4eced1586a2 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail@igalia.com>
Date: Sat, 21 Jun 2025 14:42:45 -0300
Subject: [PATCH 1171/1322] [libc] Fix feature check for riscv (#145169)

This PR fixes the feature detection for RISC-V floating-point support in
LLVM's libc implementation.

The `__riscv_flen` macro represents the floating-point register width in
bits (32, 64, or 128). Since Extension D is specifically documented as
implying F, we can use simple >= comparisons to detect them.

For half-precision support, the implementation checks for the Zfhmin
extension as RVA22 and RVA23 profiles only require Zfhmin rather than
the full Zfh extension. Zfh also implies Zfhmin, so checking for Zfhmin
should cover all cases.
---
 libc/src/__support/macros/properties/cpu_features.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index 3677e1fc3275..457a2b7869d4 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -61,15 +61,15 @@
 
 #if defined(__riscv_flen)
 // https://github.com/riscv-non-isa/riscv-c-api-doc/blob/main/src/c-api.adoc
-#if (__riscv_flen & 0x10)
+#if (__riscv_arch_test && __riscv_zfhmin)
 #define LIBC_TARGET_CPU_HAS_RISCV_FPU_HALF
 #define LIBC_TARGET_CPU_HAS_FPU_HALF
 #endif // LIBC_TARGET_CPU_HAS_RISCV_FPU_HALF
-#if (__riscv_flen & 0x20)
+#if (__riscv_flen >= 32)
 #define LIBC_TARGET_CPU_HAS_RISCV_FPU_FLOAT
 #define LIBC_TARGET_CPU_HAS_FPU_FLOAT
 #endif // LIBC_TARGET_CPU_HAS_RISCV_FPU_FLOAT
-#if (__riscv_flen & 0x40)
+#if (__riscv_flen >= 64)
 #define LIBC_TARGET_CPU_HAS_RISCV_FPU_DOUBLE
 #define LIBC_TARGET_CPU_HAS_FPU_DOUBLE
 #endif // LIBC_TARGET_CPU_HAS_RISCV_FPU_DOUBLE

From 2ed089fb18b92ad668509076b9830f55d96d27fe Mon Sep 17 00:00:00 2001
From: Tal Kedar <tal.kedar@k2dq.com>
Date: Sat, 21 Jun 2025 14:00:22 -0400
Subject: [PATCH 1172/1322] [clang/docs] add a missing brace in
 ClangFormatStyleOptions.rst (#145145)

... in the example for `WrapNamespaceBodyWithEmptyLines: Never`
---
 clang/docs/ClangFormatStyleOptions.rst | 2 +-
 clang/include/clang/Format/Format.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 83716cc049ee..548c73af6587 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -6992,7 +6992,7 @@ the configuration (without a prefix: ``Auto``).
     .. code-block:: c++
 
       namespace N1 {
-      namespace N2
+      namespace N2 {
       function();
       }
       }
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 127b1d08919d..2a5cf5fb50db 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -5275,7 +5275,7 @@ struct FormatStyle {
     /// Remove all empty lines at the beginning and the end of namespace body.
     /// \code
     ///   namespace N1 {
-    ///   namespace N2
+    ///   namespace N2 {
     ///   function();
     ///   }
     ///   }

From e7dd223ec451d4e8e522aa4f2c2baaa3d027f347 Mon Sep 17 00:00:00 2001
From: Katherine Whitlock <kate@skylinesynths.nyc>
Date: Sat, 21 Jun 2025 14:10:20 -0400
Subject: [PATCH 1173/1322] [clang-tidy] Add new check
 `readability-use-numeric-limits` (#127430)

The adds a check that replaces specific numeric literals like `32767`
with the equivalent call to `std::numeric_limits` (such as
`std::numeric_limits<int16_t>::max())`.

Partially addresses #34434, but notably does not handle cases listed in
the title post such as `~0` and `-1`.
---
 .../clang-tidy/readability/CMakeLists.txt     |   1 +
 .../readability/ReadabilityTidyModule.cpp     |   3 +
 .../readability/UseNumericLimitsCheck.cpp     | 160 ++++++++++++++++++
 .../readability/UseNumericLimitsCheck.h       |  38 +++++
 clang-tools-extra/docs/ReleaseNotes.rst       |   6 +
 .../docs/clang-tidy/checks/list.rst           |   1 +
 .../checks/readability/use-numeric-limits.rst |  31 ++++
 .../readability/use-numeric-limits.cpp        | 100 +++++++++++
 8 files changed, 340 insertions(+)
 create mode 100644 clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.cpp
 create mode 100644 clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.h
 create mode 100644 clang-tools-extra/docs/clang-tidy/checks/readability/use-numeric-limits.rst
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/use-numeric-limits.cpp

diff --git a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
index 4be1a8f83133..2c40a863c5b7 100644
--- a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
@@ -58,6 +58,7 @@ add_clang_library(clangTidyReadabilityModule STATIC
   UniqueptrDeleteReleaseCheck.cpp
   UppercaseLiteralSuffixCheck.cpp
   UseAnyOfAllOfCheck.cpp
+  UseNumericLimitsCheck.cpp
   UseStdMinMaxCheck.cpp
 
   LINK_LIBS
diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
index d59b0312673b..dc47c2fb3193 100644
--- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
@@ -61,6 +61,7 @@
 #include "UniqueptrDeleteReleaseCheck.h"
 #include "UppercaseLiteralSuffixCheck.h"
 #include "UseAnyOfAllOfCheck.h"
+#include "UseNumericLimitsCheck.h"
 #include "UseStdMinMaxCheck.h"
 
 namespace clang::tidy {
@@ -173,6 +174,8 @@ public:
         "readability-uppercase-literal-suffix");
     CheckFactories.registerCheck<UseAnyOfAllOfCheck>(
         "readability-use-anyofallof");
+    CheckFactories.registerCheck<UseNumericLimitsCheck>(
+        "readability-use-numeric-limits");
     CheckFactories.registerCheck<UseStdMinMaxCheck>(
         "readability-use-std-min-max");
   }
diff --git a/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.cpp
new file mode 100644
index 000000000000..334b69755db2
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.cpp
@@ -0,0 +1,160 @@
+//===--- UseNumericLimitsCheck.cpp - clang-tidy ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "UseNumericLimitsCheck.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/Lex/Preprocessor.h"
+#include <cmath>
+#include <limits>
+
+using namespace clang::ast_matchers;
+
+namespace clang::tidy::readability {
+
+UseNumericLimitsCheck::UseNumericLimitsCheck(StringRef Name,
+                                             ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      SignedConstants{
+          {std::numeric_limits<int8_t>::min(),
+           "std::numeric_limits<int8_t>::min()"},
+          {std::numeric_limits<int8_t>::max(),
+           "std::numeric_limits<int8_t>::max()"},
+          {std::numeric_limits<int16_t>::min(),
+           "std::numeric_limits<int16_t>::min()"},
+          {std::numeric_limits<int16_t>::max(),
+           "std::numeric_limits<int16_t>::max()"},
+          {std::numeric_limits<int32_t>::min(),
+           "std::numeric_limits<int32_t>::min()"},
+          {std::numeric_limits<int32_t>::max(),
+           "std::numeric_limits<int32_t>::max()"},
+          {std::numeric_limits<int64_t>::min(),
+           "std::numeric_limits<int64_t>::min()"},
+          {std::numeric_limits<int64_t>::max(),
+           "std::numeric_limits<int64_t>::max()"},
+      },
+      UnsignedConstants{
+          {std::numeric_limits<uint8_t>::max(),
+           "std::numeric_limits<uint8_t>::max()"},
+          {std::numeric_limits<uint16_t>::max(),
+           "std::numeric_limits<uint16_t>::max()"},
+          {std::numeric_limits<uint32_t>::max(),
+           "std::numeric_limits<uint32_t>::max()"},
+          {std::numeric_limits<uint64_t>::max(),
+           "std::numeric_limits<uint64_t>::max()"},
+      },
+      Inserter(Options.getLocalOrGlobal("IncludeStyle",
+                                        utils::IncludeSorter::IS_LLVM),
+               areDiagsSelfContained()) {}
+
+void UseNumericLimitsCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "IncludeStyle", Inserter.getStyle());
+}
+
+void UseNumericLimitsCheck::registerMatchers(MatchFinder *Finder) {
+  auto PositiveIntegerMatcher = [](auto Value) {
+    return unaryOperator(hasOperatorName("+"),
+                         hasUnaryOperand(integerLiteral(equals(Value))
+                                             .bind("positive-integer-literal")))
+        .bind("unary-op");
+  };
+
+  auto NegativeIntegerMatcher = [](auto Value) {
+    return unaryOperator(hasOperatorName("-"),
+                         hasUnaryOperand(integerLiteral(equals(-Value))
+                                             .bind("negative-integer-literal")))
+        .bind("unary-op");
+  };
+
+  auto BareIntegerMatcher = [](auto Value) {
+    return integerLiteral(allOf(unless(hasParent(unaryOperator(
+                                    hasAnyOperatorName("-", "+")))),
+                                equals(Value)))
+        .bind("bare-integer-literal");
+  };
+
+  for (const auto &[Value, _] : SignedConstants) {
+    if (Value < 0) {
+      Finder->addMatcher(NegativeIntegerMatcher(Value), this);
+    } else {
+      Finder->addMatcher(
+          expr(anyOf(PositiveIntegerMatcher(Value), BareIntegerMatcher(Value))),
+          this);
+    }
+  }
+
+  for (const auto &[Value, _] : UnsignedConstants) {
+    Finder->addMatcher(
+        expr(anyOf(PositiveIntegerMatcher(Value), BareIntegerMatcher(Value))),
+        this);
+  }
+}
+
+void UseNumericLimitsCheck::registerPPCallbacks(
+    const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
+  Inserter.registerPreprocessor(PP);
+}
+
+void UseNumericLimitsCheck::check(const MatchFinder::MatchResult &Result) {
+  const IntegerLiteral *MatchedDecl = nullptr;
+
+  const IntegerLiteral *NegativeMatchedDecl =
+      Result.Nodes.getNodeAs<IntegerLiteral>("negative-integer-literal");
+  const IntegerLiteral *PositiveMatchedDecl =
+      Result.Nodes.getNodeAs<IntegerLiteral>("positive-integer-literal");
+  const IntegerLiteral *BareMatchedDecl =
+      Result.Nodes.getNodeAs<IntegerLiteral>("bare-integer-literal");
+
+  if (NegativeMatchedDecl != nullptr)
+    MatchedDecl = NegativeMatchedDecl;
+  else if (PositiveMatchedDecl != nullptr)
+    MatchedDecl = PositiveMatchedDecl;
+  else if (BareMatchedDecl != nullptr)
+    MatchedDecl = BareMatchedDecl;
+
+  const llvm::APInt MatchedIntegerConstant = MatchedDecl->getValue();
+
+  auto Fixer = [&](auto SourceValue, auto Value,
+                   const std::string &Replacement) {
+    static_assert(std::is_same_v<decltype(SourceValue), decltype(Value)>,
+                  "The types of SourceValue and Value must match");
+
+    SourceLocation Location = MatchedDecl->getExprLoc();
+    SourceRange Range{MatchedDecl->getBeginLoc(), MatchedDecl->getEndLoc()};
+
+    // Only valid if unary operator is present
+    const UnaryOperator *UnaryOpExpr =
+        Result.Nodes.getNodeAs<UnaryOperator>("unary-op");
+
+    if (MatchedDecl == NegativeMatchedDecl && -SourceValue == Value) {
+      Range = SourceRange(UnaryOpExpr->getBeginLoc(), UnaryOpExpr->getEndLoc());
+      Location = UnaryOpExpr->getExprLoc();
+      SourceValue = -SourceValue;
+    } else if (MatchedDecl == PositiveMatchedDecl && SourceValue == Value) {
+      Range = SourceRange(UnaryOpExpr->getBeginLoc(), UnaryOpExpr->getEndLoc());
+      Location = UnaryOpExpr->getExprLoc();
+    } else if (MatchedDecl != BareMatchedDecl || SourceValue != Value) {
+      return;
+    }
+
+    diag(Location,
+         "the constant '%0' is being utilized; consider using '%1' instead")
+        << SourceValue << Replacement
+        << FixItHint::CreateReplacement(Range, Replacement)
+        << Inserter.createIncludeInsertion(
+               Result.SourceManager->getFileID(Location), "<limits>");
+  };
+
+  for (const auto &[Value, Replacement] : SignedConstants)
+    Fixer(MatchedIntegerConstant.getSExtValue(), Value, Replacement);
+
+  for (const auto &[Value, Replacement] : UnsignedConstants)
+    Fixer(MatchedIntegerConstant.getZExtValue(), Value, Replacement);
+}
+
+} // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.h b/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.h
new file mode 100644
index 000000000000..0e7e9abb8562
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.h
@@ -0,0 +1,38 @@
+//===--- UseNumericLimitsCheck.h - clang-tidy -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USENUMERICLIMITSCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USENUMERICLIMITSCHECK_H
+
+#include "../ClangTidyCheck.h"
+#include "../utils/IncludeInserter.h"
+
+namespace clang::tidy::readability {
+
+/// Finds certain integer literals and suggests replacing them with equivalent
+/// ``std::numeric_limits`` calls.
+/// For the user-facing documentation see:
+/// http://clang.llvm.org/extra/clang-tidy/checks/readability/use-numeric-limits.html
+class UseNumericLimitsCheck : public ClangTidyCheck {
+public:
+  UseNumericLimitsCheck(StringRef Name, ClangTidyContext *Context);
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+  void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP,
+                           Preprocessor *ModuleExpanderPP) override;
+  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+
+private:
+  const llvm::SmallVector<std::pair<int64_t, std::string>> SignedConstants;
+  const llvm::SmallVector<std::pair<uint64_t, std::string>> UnsignedConstants;
+  utils::IncludeInserter Inserter;
+};
+
+} // namespace clang::tidy::readability
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USENUMERICLIMITSCHECK_H
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 4801dab8c1bd..9dede347b8c9 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -154,6 +154,12 @@ New checks
   Finds potentially erroneous calls to ``reset`` method on smart pointers when
   the pointee type also has a ``reset`` method.
 
+- New :doc:`readability-use-numeric-limits
+  <clang-tidy/checks/readability/use-numeric-limits>` check.
+
+  Finds certain integer literals and suggests replacing them with equivalent
+  ``std::numeric_limits`` calls.
+
 New check aliases
 ^^^^^^^^^^^^^^^^^
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index ccb78ee45e9c..57ae7d330a3c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -409,6 +409,7 @@ Clang-Tidy Checks
    :doc:`readability-uniqueptr-delete-release <readability/uniqueptr-delete-release>`, "Yes"
    :doc:`readability-uppercase-literal-suffix <readability/uppercase-literal-suffix>`, "Yes"
    :doc:`readability-use-anyofallof <readability/use-anyofallof>`,
+   :doc:`readability-use-numeric-limits <readability/use-numeric-limits>`, "Yes"
    :doc:`readability-use-std-min-max <readability/use-std-min-max>`, "Yes"
    :doc:`zircon-temporary-objects <zircon/temporary-objects>`,
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/use-numeric-limits.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/use-numeric-limits.rst
new file mode 100644
index 000000000000..0f6ca9f0cf2c
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/use-numeric-limits.rst
@@ -0,0 +1,31 @@
+.. title:: clang-tidy - readability-use-numeric-limits
+
+readability-use-numeric-limits
+==============================
+
+Finds certain integer literals and suggests replacing them with equivalent
+``std::numeric_limits`` calls.
+
+Before:
+
+.. code-block:: c++
+
+  void foo() {
+    int32_t a = 2147483647;
+  }
+
+After:
+
+.. code-block:: c++
+
+  void foo() {
+    int32_t a = std::numeric_limits<int32_t>::max();
+  }
+
+Options
+-------
+
+.. option:: IncludeStyle
+
+   A string specifying which include-style is used, `llvm` or `google`. Default
+   is `llvm`.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/use-numeric-limits.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/use-numeric-limits.cpp
new file mode 100644
index 000000000000..e02d6f1b7126
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/use-numeric-limits.cpp
@@ -0,0 +1,100 @@
+// RUN: %check_clang_tidy %s readability-use-numeric-limits %t
+// CHECK-FIXES: #include <limits>
+
+using int8_t = signed char;
+using int16_t = short;
+using int32_t = int;
+using int64_t = long long;
+using uint8_t = unsigned char;
+using uint16_t = unsigned short;
+using uint32_t = unsigned int;
+using uint64_t = unsigned long long;
+
+
+void Invalid() {
+  // CHECK-MESSAGES: :[[@LINE+2]]:14: warning: the constant '-128' is being utilized; consider using 'std::numeric_limits<int8_t>::min()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int8_t a = std::numeric_limits<int8_t>::min();
+  int8_t a = -128;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:14: warning: the constant '127' is being utilized; consider using 'std::numeric_limits<int8_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int8_t b = std::numeric_limits<int8_t>::max();
+  int8_t b = +127;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:14: warning: the constant '127' is being utilized; consider using 'std::numeric_limits<int8_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int8_t c = std::numeric_limits<int8_t>::max();
+  int8_t c = 127;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '-32768' is being utilized; consider using 'std::numeric_limits<int16_t>::min()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int16_t d = std::numeric_limits<int16_t>::min();
+  int16_t d = -32768;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '32767' is being utilized; consider using 'std::numeric_limits<int16_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int16_t e = std::numeric_limits<int16_t>::max();
+  int16_t e = +32767;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '32767' is being utilized; consider using 'std::numeric_limits<int16_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int16_t f = std::numeric_limits<int16_t>::max();
+  int16_t f = 32767;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '-2147483648' is being utilized; consider using 'std::numeric_limits<int32_t>::min()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int32_t g = std::numeric_limits<int32_t>::min();
+  int32_t g = -2147483648;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '2147483647' is being utilized; consider using 'std::numeric_limits<int32_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int32_t h = std::numeric_limits<int32_t>::max();
+  int32_t h = +2147483647;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '2147483647' is being utilized; consider using 'std::numeric_limits<int32_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int32_t i = std::numeric_limits<int32_t>::max();
+  int32_t i = 2147483647;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '-9223372036854775808' is being utilized; consider using 'std::numeric_limits<int64_t>::min()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int64_t j = std::numeric_limits<int64_t>::min();
+  int64_t j = -9223372036854775808;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '9223372036854775807' is being utilized; consider using 'std::numeric_limits<int64_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int64_t k = std::numeric_limits<int64_t>::max();
+  int64_t k = +9223372036854775807;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '9223372036854775807' is being utilized; consider using 'std::numeric_limits<int64_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: int64_t l = std::numeric_limits<int64_t>::max();
+  int64_t l = 9223372036854775807;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '255' is being utilized; consider using 'std::numeric_limits<uint8_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: uint8_t m = std::numeric_limits<uint8_t>::max();
+  uint8_t m = 255;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '255' is being utilized; consider using 'std::numeric_limits<uint8_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: uint8_t n = std::numeric_limits<uint8_t>::max();
+  uint8_t n = +255;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '65535' is being utilized; consider using 'std::numeric_limits<uint16_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: uint16_t o = std::numeric_limits<uint16_t>::max();
+  uint16_t o = 65535;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '65535' is being utilized; consider using 'std::numeric_limits<uint16_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: uint16_t p = std::numeric_limits<uint16_t>::max();
+  uint16_t p = +65535;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '4294967295' is being utilized; consider using 'std::numeric_limits<uint32_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: uint32_t q = std::numeric_limits<uint32_t>::max();
+  uint32_t q = 4294967295;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '4294967295' is being utilized; consider using 'std::numeric_limits<uint32_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: uint32_t r = std::numeric_limits<uint32_t>::max();
+  uint32_t r = +4294967295;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '18446744073709551615' is being utilized; consider using 'std::numeric_limits<uint64_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: uint64_t s = std::numeric_limits<uint64_t>::max();
+  uint64_t s = 18446744073709551615;
+
+  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '18446744073709551615' is being utilized; consider using 'std::numeric_limits<uint64_t>::max()' instead [readability-use-numeric-limits]
+  // CHECK-FIXES: uint64_t t = std::numeric_limits<uint64_t>::max();
+  uint64_t t = +18446744073709551615;
+}
+
+void Valid(){
+  int16_t a = +128;
+
+  int16_t b = -127;
+}

From 550ed9ef198e530fb66e22198165503cc9c9de80 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Sat, 21 Jun 2025 18:10:42 +0000
Subject: [PATCH 1174/1322] [gn build] Port e7dd223ec451

---
 .../secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
index cad2c4912b2c..f6dd75008f19 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
@@ -66,6 +66,7 @@ static_library("readability") {
     "UniqueptrDeleteReleaseCheck.cpp",
     "UppercaseLiteralSuffixCheck.cpp",
     "UseAnyOfAllOfCheck.cpp",
+    "UseNumericLimitsCheck.cpp",
     "UseStdMinMaxCheck.cpp",
   ]
 }

From 056b52df344f688fd3831a07bc477f77f883a696 Mon Sep 17 00:00:00 2001
From: Erick Velez <erickvelez7@gmail.com>
Date: Sat, 21 Jun 2025 11:56:35 -0700
Subject: [PATCH 1175/1322] [clang-doc] Precommit test for global variables
 (#145069)

---
 .../test/clang-doc/json/namespace.cpp         | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/clang-tools-extra/test/clang-doc/json/namespace.cpp b/clang-tools-extra/test/clang-doc/json/namespace.cpp
index 928864be1feb..248d47351bd3 100644
--- a/clang-tools-extra/test/clang-doc/json/namespace.cpp
+++ b/clang-tools-extra/test/clang-doc/json/namespace.cpp
@@ -103,5 +103,23 @@ typedef int MyTypedef;
 // CHECK-NEXT:      }
 // CHECK-NEXT:    ],
 // CHECK-NEXT:    "USR": "0000000000000000000000000000000000000000"
-// CHECK-NOT:     "Variables": [
+// CHECK-NOT:    "Variables": [
+// CHECK-NOT:      {
+// CHECK-NOT:        "IsStatic": true,
+// CHECK-NOT:        "Location": {
+// CHECK-NOT:          "Filename": "{{.*}}namespace.cpp",
+// CHECK-NOT:          "LineNumber": 13
+// CHECK-NOT:        },
+// CHECK-NOT:        "Name": "Global",
+// CHECK-NOT:        "Type": {
+// COM:                FIXME: IsBuiltIn emits as its default value
+// CHECK-NOT:          "IsBuiltIn": false,
+// CHECK-NOT:          "IsTemplate": false,
+// CHECK-NOT:          "Name": "int",
+// CHECK-NOT:          "QualName": "int",
+// CHECK-NOT:          "USR": "0000000000000000000000000000000000000000"
+// CHECK-NOT:        },
+// CHECK-NOT:        "USR": "{{[0-9A-F]*}}"
+// CHECK-NOT:      }
+// CHECK-NOT:    ]
 // CHECK-NEXT:  }

From e066f35c6981c720e3a7e5883efc40c861b3b7ee Mon Sep 17 00:00:00 2001
From: eleviant <56861949+eleviant@users.noreply.github.com>
Date: Sat, 21 Jun 2025 22:48:08 +0200
Subject: [PATCH 1176/1322] [lldb] Fix qEcho message handling (#145072)

Patch fixes the sync-on-timeout logic in lldb and switches to qEcho
based ping, instead of qC. This fixes vRun message case, when there is
no process yet and qC returns an error.
---
 .../Python/lldbsuite/test/gdbclientutils.py   | 10 +++
 .../gdb-remote/GDBRemoteCommunication.cpp     |  3 +-
 .../GDBRemoteCommunicationClient.cpp          |  2 +-
 .../gdb_remote_client/TestGDBRemoteClient.py  | 72 +++++++++++++++++++
 4 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
index 753de22b9cfe..b603c35c8df0 100644
--- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
+++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
@@ -92,6 +92,9 @@ class MockGDBServerResponder:
     class RESPONSE_DISCONNECT:
         pass
 
+    class RESPONSE_NONE:
+        pass
+
     def __init__(self):
         self.packetLog = []
 
@@ -181,6 +184,8 @@ class MockGDBServerResponder:
             return self.qQueryGDBServer()
         if packet == "qHostInfo":
             return self.qHostInfo()
+        if packet.startswith("qEcho"):
+            return self.qEcho(int(packet.split(":")[1]))
         if packet == "qGetWorkingDir":
             return self.qGetWorkingDir()
         if packet == "qOffsets":
@@ -237,6 +242,9 @@ class MockGDBServerResponder:
     def qHostInfo(self):
         return "ptrsize:8;endian:little;"
 
+    def qEcho(self):
+        return "E04"
+
     def qQueryGDBServer(self):
         return "E04"
 
@@ -655,6 +663,8 @@ class MockGDBServer:
         if not isinstance(response, list):
             response = [response]
         for part in response:
+            if part is MockGDBServerResponder.RESPONSE_NONE:
+                continue
             if part is MockGDBServerResponder.RESPONSE_DISCONNECT:
                 raise self.TerminateConnectionException()
             self._sendPacket(part)
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
index 2aea7c6b781d..f244f7abe45e 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
@@ -354,8 +354,9 @@ GDBRemoteCommunication::WaitForPacketNoLock(StringExtractorGDBRemote &packet,
             disconnected = true;
             Disconnect();
           }
+        } else {
+          timed_out = true;
         }
-        timed_out = true;
         break;
       case eConnectionStatusSuccess:
         // printf ("status = success but error = %s\n",
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
index adbf06b9a19a..d8130cae71ce 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
@@ -406,7 +406,7 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() {
         m_supports_qXfer_memory_map_read = eLazyBoolYes;
       else if (x == "qXfer:siginfo:read+")
         m_supports_qXfer_siginfo_read = eLazyBoolYes;
-      else if (x == "qEcho")
+      else if (x == "qEcho+")
         m_supports_qEcho = eLazyBoolYes;
       else if (x == "QPassSignals+")
         m_supports_QPassSignals = eLazyBoolYes;
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py
index 08ac9290ee85..12b464d3397e 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py
@@ -356,6 +356,78 @@ class TestGDBRemoteClient(GDBRemoteTestBase):
             ["vRun;%s;61726731;61726732;61726733" % (exe_hex,)]
         )
 
+    def test_launch_lengthy_vRun(self):
+        class MyResponder(MockGDBServerResponder):
+            def __init__(self, *args, **kwargs):
+                self.started = False
+                return super().__init__(*args, **kwargs)
+
+            def qC(self):
+                if self.started:
+                    return "QCp10.10"
+                else:
+                    return "E42"
+
+            def qfThreadInfo(self):
+                if self.started:
+                    return "mp10.10"
+                else:
+                    return "E42"
+
+            def qsThreadInfo(self):
+                return "l"
+
+            def qEcho(self, num):
+                resp = "qEcho:" + str(num)
+                if num >= 2:
+                    # We have launched our program
+                    self.started = True
+                    return [resp, "T13"]
+
+                return resp
+
+            def qSupported(self, client_supported):
+                return "PacketSize=3fff;QStartNoAckMode+;qEcho+;"
+
+            def qHostInfo(self):
+                return "default_packet_timeout:1;"
+
+            def vRun(self, packet):
+                return [self.RESPONSE_NONE]
+
+            def A(self, packet):
+                return "E28"
+
+        self.server.responder = MyResponder()
+
+        target = self.createTarget("a.yaml")
+        # NB: apparently GDB packets are using "/" on Windows too
+        exe_path = self.getBuildArtifact("a").replace(os.path.sep, "/")
+        exe_hex = binascii.b2a_hex(exe_path.encode()).decode()
+        process = self.connect(target)
+        lldbutil.expect_state_changes(
+            self, self.dbg.GetListener(), process, [lldb.eStateConnected]
+        )
+
+        process = target.Launch(
+            lldb.SBListener(),
+            ["arg1", "arg2", "arg3"],  # argv
+            [],  # envp
+            None,  # stdin_path
+            None,  # stdout_path
+            None,  # stderr_path
+            None,  # working_directory
+            0,  # launch_flags
+            True,  # stop_at_entry
+            lldb.SBError(),
+        )  # error
+        self.assertTrue(process, PROCESS_IS_VALID)
+        self.assertEqual(process.GetProcessID(), 16)
+
+        self.assertPacketLogContains(
+            ["vRun;%s;61726731;61726732;61726733" % (exe_hex,)]
+        )
+
     def test_launch_QEnvironment(self):
         class MyResponder(MockGDBServerResponder):
             def qC(self):

From c455f4a32d91436c131a751fc9587d7fa3ded614 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 21 Jun 2025 22:03:18 +0100
Subject: [PATCH 1177/1322] [LV] Add more tests for narrowing interleave groups
 with live-ins.

---
 ...interleave-to-widen-memory-constant-ops.ll | 314 ++++++++++++++++++
 1 file changed, 314 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
index 5f6a372db2e9..7d3b3d86b90d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
@@ -183,3 +183,317 @@ loop:
 exit:
   ret void
 }
+
+define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noalias %B, double %x) {
+; CHECK-LABEL: define void @test_add_double_same_var_args_1(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], double [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { double, double }, ptr %A, i64 %iv
+  %l.A.0 = load double, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 4
+  %add.0 = fadd double %l.A.0, %x
+  %add.1 = fadd double %l.A.1, %x
+  %gep.res.0 = getelementptr inbounds nuw { double, double }, ptr %res, i64 %iv
+  store double %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 8
+  store double %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noalias %B, double %x) {
+; CHECK-LABEL: define void @test_add_double_same_var_args_2(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], double [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { double, double }, ptr %A, i64 %iv
+  %l.A.0 = load double, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 4
+  %add.0 = fadd double %x, %l.A.0
+  %add.1 = fadd double %x, %l.A.1
+  %gep.res.0 = getelementptr inbounds nuw { double, double }, ptr %res, i64 %iv
+  store double %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 8
+  store double %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_add_double_same_var_args_at_different_positions(ptr %res, ptr noalias %A, ptr noalias %B, double %x) {
+; CHECK-LABEL: define void @test_add_double_same_var_args_at_different_positions(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], double [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { double, double }, ptr %A, i64 %iv
+  %l.A.0 = load double, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 4
+  %add.0 = fadd double %l.A.0, %x
+  %add.1 = fadd double %x, %l.A.1
+  %gep.res.0 = getelementptr inbounds nuw { double, double }, ptr %res, i64 %iv
+  store double %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 8
+  store double %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_add_double_different_var_args_1(ptr %res, ptr noalias %A, ptr noalias %B, double %x, double %y) {
+; CHECK-LABEL: define void @test_add_double_different_var_args_1(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[Y]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC4:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <4 x double> [[WIDE_VEC4]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC4]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC7:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC7]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { double, double }, ptr %A, i64 %iv
+  %l.A.0 = load double, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 4
+  %add.0 = fadd double %l.A.0, %x
+  %add.1 = fadd double %l.A.1, %y
+  %gep.res.0 = getelementptr inbounds nuw { double, double }, ptr %res, i64 %iv
+  store double %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 8
+  store double %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_add_double_different_var_args_2(ptr %res, ptr noalias %A, ptr noalias %B, double %x, double %y) {
+; CHECK-LABEL: define void @test_add_double_different_var_args_2(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[Y]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[WIDE_VEC4:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <4 x double> [[WIDE_VEC4]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC4]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], [[STRIDED_VEC5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT2]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT2]], [[STRIDED_VEC6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[INTERLEAVED_VEC7:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC7]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { double, double }, ptr %A, i64 %iv
+  %l.A.0 = load double, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 4
+  %add.0 = fadd double %y, %l.A.0
+  %add.1 = fadd double %x, %l.A.1
+  %gep.res.0 = getelementptr inbounds nuw { double, double }, ptr %res, i64 %iv
+  store double %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 8
+  store double %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}

From 757c80d88a6aeb6c4d61d76c492a033bb25ab028 Mon Sep 17 00:00:00 2001
From: Douglas Yung <douglas.yung@sony.com>
Date: Sat, 21 Jun 2025 22:37:02 +0000
Subject: [PATCH 1178/1322] Add `REQUIRES: x86` to test added in 141197 to skip
 when x86 target is not present.

---
 lld/test/COFF/strtab.s | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lld/test/COFF/strtab.s b/lld/test/COFF/strtab.s
index 4d8fa39f56db..fbdd8df52d54 100644
--- a/lld/test/COFF/strtab.s
+++ b/lld/test/COFF/strtab.s
@@ -1,3 +1,4 @@
+# REQUIRES: x86
 # RUN: llvm-mc -triple=x86_64-windows-msvc %s -filetype=obj -o %t.obj
 # RUN: lld-link -out:%t.exe -entry:main %t.obj -debug:dwarf
 # RUN: llvm-readobj --string-table %t.exe | FileCheck %s

From 570885128351868c1308bb22e8ca351d318bc4a1 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Sat, 21 Jun 2025 16:02:26 -0700
Subject: [PATCH 1179/1322] Revert "[ValueTracking] Improve `Bitcast` handling
 to match SDAG" (#145191)

Reverts llvm/llvm-project#125935

Causes miscompiles, see comments in #125935
---
 llvm/lib/Analysis/ValueTracking.cpp           | 27 ++-----------------
 .../InstCombine/X86/x86-vector-shifts.ll      |  4 +--
 .../InstCombine/bitcast-known-bits.ll         | 21 ++++++++++-----
 3 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 73320b556f82..a17417cb5189 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1346,8 +1346,6 @@ static void computeKnownBitsFromOperator(const Operator *I,
         isa<ScalableVectorType>(I->getType()))
       break;
 
-    unsigned NumElts = DemandedElts.getBitWidth();
-    bool IsLE = Q.DL.isLittleEndian();
     // Look through a cast from narrow vector elements to wider type.
     // Examples: v4i32 -> v2i64, v3i8 -> v24
     unsigned SubBitWidth = SrcVecTy->getScalarSizeInBits();
@@ -1366,6 +1364,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
       //
       // The known bits of each sub-element are then inserted into place
       // (dependent on endian) to form the full result of known bits.
+      unsigned NumElts = DemandedElts.getBitWidth();
       unsigned SubScale = BitWidth / SubBitWidth;
       APInt SubDemandedElts = APInt::getZero(NumElts * SubScale);
       for (unsigned i = 0; i != NumElts; ++i) {
@@ -1377,32 +1376,10 @@ static void computeKnownBitsFromOperator(const Operator *I,
       for (unsigned i = 0; i != SubScale; ++i) {
         computeKnownBits(I->getOperand(0), SubDemandedElts.shl(i), KnownSrc, Q,
                          Depth + 1);
-        unsigned ShiftElt = IsLE ? i : SubScale - 1 - i;
+        unsigned ShiftElt = Q.DL.isLittleEndian() ? i : SubScale - 1 - i;
         Known.insertBits(KnownSrc, ShiftElt * SubBitWidth);
       }
     }
-    // Look through a cast from wider vector elements to narrow type.
-    // Examples: v2i64 -> v4i32
-    if (SubBitWidth % BitWidth == 0) {
-      unsigned SubScale = SubBitWidth / BitWidth;
-      KnownBits KnownSrc(SubBitWidth);
-      APInt SubDemandedElts =
-          APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
-      computeKnownBits(I->getOperand(0), SubDemandedElts, KnownSrc, Q,
-                       Depth + 1);
-
-      Known.Zero.setAllBits();
-      Known.One.setAllBits();
-      for (unsigned i = 0; i != SubScale; ++i) {
-        if (DemandedElts[i]) {
-          unsigned Shifts = IsLE ? i : NumElts - 1 - i;
-          unsigned Offset = (Shifts % SubScale) * BitWidth;
-          Known = Known.intersectWith(KnownSrc.extractBits(BitWidth, Offset));
-          if (Known.isUnknown())
-            break;
-        }
-      }
-    }
     break;
   }
   case Instruction::SExt: {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index cc252ae53803..db56080a3ea2 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -3732,6 +3732,7 @@ define <4 x i64> @test_avx2_psrl_0() {
   ret <4 x i64> %16
 }
 
+; FIXME: Failure to peek through bitcasts to ensure psllq shift amount is within bounds.
 define <2 x i64> @PR125228(<2 x i64> %v, <2 x i64> %s) {
 ; CHECK-LABEL: @PR125228(
 ; CHECK-NEXT:    [[MASK:%.*]] = and <2 x i64> [[S:%.*]], splat (i64 63)
@@ -3740,8 +3741,7 @@ define <2 x i64> @PR125228(<2 x i64> %v, <2 x i64> %s) {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[MASK]] to <16 x i8>
 ; CHECK-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[CAST3:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[CAST3]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[SLL1:%.*]] = shl <2 x i64> [[V]], [[TMP2]]
+; CHECK-NEXT:    [[SLL1:%.*]] = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> [[V]], <2 x i64> [[CAST3]])
 ; CHECK-NEXT:    [[SHUFP_UNCASTED:%.*]] = shufflevector <2 x i64> [[SLL0]], <2 x i64> [[SLL1]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    ret <2 x i64> [[SHUFP_UNCASTED]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll b/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
index 65b43df752f7..3e47e775e3a2 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-known-bits.ll
@@ -12,7 +12,8 @@ define <16 x i8> @knownbits_bitcast_masked_shift(<16 x i8> %arg1, <16 x i8> %arg
 ; CHECK-NEXT:    [[BITCAST4:%.*]] = bitcast <16 x i8> [[OR]] to <8 x i16>
 ; CHECK-NEXT:    [[SHL5:%.*]] = shl nuw <8 x i16> [[BITCAST4]], splat (i16 2)
 ; CHECK-NEXT:    [[BITCAST6:%.*]] = bitcast <8 x i16> [[SHL5]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[BITCAST6]]
+; CHECK-NEXT:    [[AND7:%.*]] = and <16 x i8> [[BITCAST6]], splat (i8 -52)
+; CHECK-NEXT:    ret <16 x i8> [[AND7]]
 ;
   %and = and <16 x i8> %arg1, splat (i8 3)
   %and3 = and <16 x i8> %arg2, splat (i8 48)
@@ -32,7 +33,8 @@ define <16 x i8> @knownbits_shuffle_masked_nibble_shift(<16 x i8> %arg)  {
 ; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <16 x i8> [[SHUFFLEVECTOR]] to <8 x i16>
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <8 x i16> [[BITCAST1]], splat (i16 4)
 ; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast <8 x i16> [[SHL]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[BITCAST2]]
+; CHECK-NEXT:    [[AND3:%.*]] = and <16 x i8> [[BITCAST2]], splat (i8 -16)
+; CHECK-NEXT:    ret <16 x i8> [[AND3]]
 ;
   %and = and <16 x i8> %arg, splat (i8 15)
   %shufflevector = shufflevector <16 x i8> %and, <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -51,7 +53,8 @@ define <16 x i8> @knownbits_reverse_shuffle_masked_shift(<16 x i8> %arg)  {
 ; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <16 x i8> [[SHUFFLEVECTOR]] to <8 x i16>
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <8 x i16> [[BITCAST1]], splat (i16 4)
 ; CHECK-NEXT:    [[BITCAST2:%.*]] = bitcast <8 x i16> [[SHL]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[BITCAST2]]
+; CHECK-NEXT:    [[AND3:%.*]] = and <16 x i8> [[BITCAST2]], splat (i8 -16)
+; CHECK-NEXT:    ret <16 x i8> [[AND3]]
 ;
   %and = and <16 x i8> %arg, splat (i8 15)
   %shufflevector = shufflevector <16 x i8> %and, <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -67,7 +70,8 @@ define <16 x i8> @knownbits_extract_bit(<8 x i16> %arg)  {
 ; CHECK-SAME: <8 x i16> [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[LSHR:%.*]] = lshr <8 x i16> [[ARG]], splat (i16 15)
 ; CHECK-NEXT:    [[BITCAST1:%.*]] = bitcast <8 x i16> [[LSHR]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[BITCAST1]]
+; CHECK-NEXT:    [[AND:%.*]] = and <16 x i8> [[BITCAST1]], splat (i8 1)
+; CHECK-NEXT:    ret <16 x i8> [[AND]]
 ;
   %lshr = lshr <8 x i16> %arg, splat (i16 15)
   %bitcast1 = bitcast <8 x i16> %lshr to <16 x i8>
@@ -84,8 +88,7 @@ define { i32, i1 } @knownbits_popcount_add_with_overflow(<2 x i64> %arg1, <2 x i
 ; CHECK-NEXT:    [[CALL9:%.*]] = tail call range(i64 0, 65) <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[ARG2]])
 ; CHECK-NEXT:    [[BITCAST10:%.*]] = bitcast <2 x i64> [[CALL9]] to <4 x i32>
 ; CHECK-NEXT:    [[EXTRACTELEMENT11:%.*]] = extractelement <4 x i32> [[BITCAST10]], i64 0
-; CHECK-NEXT:    [[CALL12:%.*]] = add nuw nsw i32 [[EXTRACTELEMENT]], [[EXTRACTELEMENT11]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { i32, i1 } { i32 poison, i1 false }, i32 [[CALL12]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[EXTRACTELEMENT]], i32 [[EXTRACTELEMENT11]])
 ; CHECK-NEXT:    ret { i32, i1 } [[TMP1]]
 ;
   %call = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %arg1)
@@ -107,7 +110,11 @@ define <16 x i8> @knownbits_shuffle_add_shift_v32i8(<16 x i8> %arg1, <8 x i16> %
 ; CHECK-NEXT:    [[BITCAST11:%.*]] = bitcast <8 x i16> [[SHL10]] to <16 x i8>
 ; CHECK-NEXT:    [[ADD12:%.*]] = add <16 x i8> [[BITCAST11]], [[BITCAST7]]
 ; CHECK-NEXT:    [[ADD14:%.*]] = add <16 x i8> [[ADD12]], [[ARG1]]
-; CHECK-NEXT:    ret <16 x i8> [[ADD14]]
+; CHECK-NEXT:    [[BITCAST14:%.*]] = bitcast <16 x i8> [[ADD12]] to <8 x i16>
+; CHECK-NEXT:    [[SHL15:%.*]] = shl <8 x i16> [[BITCAST14]], splat (i16 8)
+; CHECK-NEXT:    [[BITCAST16:%.*]] = bitcast <8 x i16> [[SHL15]] to <16 x i8>
+; CHECK-NEXT:    [[ADD13:%.*]] = add <16 x i8> [[ADD14]], [[BITCAST16]]
+; CHECK-NEXT:    ret <16 x i8> [[ADD13]]
 ;
   %shl6 = shl <8 x i16> %arg2, splat (i16 8)
   %bitcast7 = bitcast <8 x i16> %shl6 to <16 x i8>

From f280d3b705de7f94ef9756e3ef2842b415a7c038 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 22 Jun 2025 08:19:19 +0900
Subject: [PATCH 1180/1322] AMDGPU: Avoid report_fatal_error for
 getRegisterByName subtarget case (#145173)

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp                   | 6 ++++--
 llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3281eabcd4ad..b9023b6d7a3a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4481,6 +4481,8 @@ SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
 
 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                              const MachineFunction &MF) const {
+  const Function &Fn = MF.getFunction();
+
   Register Reg = StringSwitch<Register>(RegName)
                      .Case("m0", AMDGPU::M0)
                      .Case("exec", AMDGPU::EXEC)
@@ -4498,8 +4500,8 @@ Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
 
   if (!Subtarget->hasFlatScrRegister() &&
       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
-    report_fatal_error(Twine("invalid register \"" + StringRef(RegName) +
-                             "\" for subtarget."));
+    Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
+                                    "\" for subtarget."));
   }
 
   switch (Reg) {
diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
index 0e9ea0c341cd..a91bba41bed4 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
@@ -1,6 +1,6 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck %s
 
-; CHECK: invalid register "flat_scratch_lo" for subtarget.
+; CHECK: error: invalid register "flat_scratch_lo" for subtarget.
 
 declare i32 @llvm.read_register.i32(metadata) #0
 

From b7d0c9b9d8e2b5c5d6677e368e3cdaf438df294e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sat, 21 Jun 2025 18:56:00 -0700
Subject: [PATCH 1181/1322] [SelectionDAG][RISCV] Treat zext nneg as sext in
 PromoteIntOp_ZERO_EXTEND if the promoted input is sign extended. (#145120)

If the zext has the nneg flag and we can prove the promoted input
is sign extended, we can avoid generating an AND that we might not
be able to remove. RISC-V emits a lot of sext_inreg operations during
i32->i64 promotion that makes this likely.

I've restricted this to the case where the promoted type is the same
as the result type so we don't need to create an additional extend.

I've also restricted it to cases where the target has stated a
preference for sext like i32->i64 on RV64. This is largely to avoid
wasting time in computeNumSignBits until we have a test case that
benefits.
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 19 ++++-
 llvm/test/CodeGen/RISCV/shifts.ll             | 75 +++++++++++++++++++
 2 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index dd6467622205..dd0412460f4e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2605,9 +2605,22 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STRICT_UINT_TO_FP(SDNode *N) {
 
 SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
   SDLoc dl(N);
-  SDValue Op = GetPromotedInteger(N->getOperand(0));
-  Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
-  return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType());
+  SDValue Src = N->getOperand(0);
+  SDValue Op = GetPromotedInteger(Src);
+  EVT VT = N->getValueType(0);
+
+  // If this zext has the nneg flag and the target prefers sext, see if the
+  // promoted input is already sign extended.
+  // TODO: Should we have some way to set nneg on ISD::AND instead?
+  if (N->getFlags().hasNonNeg() && Op.getValueType() == VT &&
+      TLI.isSExtCheaperThanZExt(Src.getValueType(), VT)) {
+    unsigned OpEffectiveBits = DAG.ComputeMaxSignificantBits(Op);
+    if (OpEffectiveBits <= Src.getScalarValueSizeInBits())
+      return Op;
+  }
+
+  Op = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op);
+  return DAG.getZeroExtendInReg(Op, dl, Src.getValueType());
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_VP_ZERO_EXTEND(SDNode *N) {
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 32a037918a5a..7ca1ee1cba2f 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -779,3 +779,78 @@ define i128 @shl128_shamt32(i128 %a, i32 signext %b) nounwind {
   %1 = shl i128 %a, %zext
   ret i128 %1
 }
+
+; Do some arithmetic on the i32 shift amount before the zext nneg. This
+; arithmetic will be promoted using a W instruction RV64. Make sure we can use
+; this to avoid an unncessary zext of the shift amount.
+define i128 @shl128_shamt32_arith(i128 %a, i32 signext %b) nounwind {
+; RV32I-LABEL: shl128_shamt32_arith:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    addi a2, a2, 1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    addi a6, sp, 16
+; RV32I-NEXT:    srli a7, a2, 3
+; RV32I-NEXT:    andi t0, a2, 31
+; RV32I-NEXT:    andi a7, a7, 12
+; RV32I-NEXT:    sub a6, a6, a7
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a1, 28(sp)
+; RV32I-NEXT:    lw a1, 0(a6)
+; RV32I-NEXT:    lw a3, 4(a6)
+; RV32I-NEXT:    lw a4, 8(a6)
+; RV32I-NEXT:    lw a5, 12(a6)
+; RV32I-NEXT:    xori a6, t0, 31
+; RV32I-NEXT:    sll a7, a3, a2
+; RV32I-NEXT:    srli t0, a1, 1
+; RV32I-NEXT:    sll a5, a5, a2
+; RV32I-NEXT:    sll a1, a1, a2
+; RV32I-NEXT:    sll a2, a4, a2
+; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    srli a4, a4, 1
+; RV32I-NEXT:    srl t0, t0, a6
+; RV32I-NEXT:    srl a3, a3, a6
+; RV32I-NEXT:    srl a4, a4, a6
+; RV32I-NEXT:    or a6, a7, t0
+; RV32I-NEXT:    or a2, a2, a3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    sw a6, 4(a0)
+; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: shl128_shamt32_arith:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addiw a4, a2, 1
+; RV64I-NEXT:    addi a3, a4, -64
+; RV64I-NEXT:    sll a2, a0, a4
+; RV64I-NEXT:    bltz a3, .LBB17_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a1, a2
+; RV64I-NEXT:    j .LBB17_3
+; RV64I-NEXT:  .LBB17_2:
+; RV64I-NEXT:    sll a1, a1, a4
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    not a4, a4
+; RV64I-NEXT:    srl a0, a0, a4
+; RV64I-NEXT:    or a1, a1, a0
+; RV64I-NEXT:  .LBB17_3:
+; RV64I-NEXT:    srai a0, a3, 63
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    ret
+  %c = add i32 %b, 1
+  %zext = zext nneg i32 %c to i128
+  %1 = shl i128 %a, %zext
+  ret i128 %1
+}

From fa0b84f23c08994ce4e780f7c406e981d37599b1 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Sat, 21 Jun 2025 21:42:09 -0700
Subject: [PATCH 1182/1322] [AMDGPU] Rename call instructions from b64 to i64
 (#145103)

These get renamed in gfx1250 and on from B64 to I64:

  S_CALL_I64
  S_GET_PC_I64
  S_RFE_I64
  S_SET_PC_I64
  S_SWAP_PC_I64
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              | 20 +++++++
 .../Disassembler/AMDGPUDisassembler.cpp       |  8 +++
 .../AMDGPU/Disassembler/AMDGPUDisassembler.h  |  1 +
 llvm/lib/Target/AMDGPU/SIDefines.h            |  1 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  3 ++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     | 26 +++++++++
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  4 ++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  1 +
 llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s        | 30 +++++++++++
 llvm/test/MC/AMDGPU/gfx1250_asm_sopk.s        |  9 ++++
 .../Disassembler/AMDGPU/gfx12_dasm_sop1.txt   | 54 ++++++++++++-------
 .../Disassembler/AMDGPU/gfx12_dasm_sopk.txt   | 20 ++++---
 12 files changed, 152 insertions(+), 25 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s
 create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_sopk.s

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 4b17e1c808b5..ab83cf9e7395 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2244,10 +2244,30 @@ def isGFX12Only :
   Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12">,
   AssemblerPredicate<(all_of FeatureGFX12Insts)>;
 
+def isGFX12Not12_50 :
+  Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">,
+  AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX1250Insts))>;
+
 def isGFX12Plus :
   Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">,
   AssemblerPredicate<(all_of FeatureGFX12Insts)>;
 
+def isGFX12PlusNot12_50 :
+  Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">,
+  AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX1250Insts))>;
+
+def isGFX125xOnly :
+  Predicate<"Subtarget->hasGFX1250Insts()">,
+  AssemblerPredicate<(all_of FeatureGFX1250Insts)>;
+
+def isGFX1250Plus :
+  Predicate<"Subtarget->hasGFX1250Insts()">,
+  AssemblerPredicate<(all_of FeatureGFX1250Insts)>;
+
+def isNotGFX1250Plus :
+  Predicate<"!Subtarget->hasGFX1250Insts()">,
+  AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>;
+
 def HasMinimum3Maximum3F32 :
   Predicate<"Subtarget->hasMinimum3Maximum3F32()">,
   AssemblerPredicate<(all_of FeatureMinimum3Maximum3F32)>;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 349e408b7965..2e891419f0e3 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -717,6 +717,12 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                         Address, CS))
         break;
 
+      // FIXME: Should use DecoderTableGFX1250_FAKE1632, but it is not generated
+      //        yet.
+      if (isGFX1250() &&
+          tryDecodeInst(DecoderTableGFX125032, MI, DW, Address, CS))
+        break;
+
       if (isGFX12() &&
           tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
                         Address, CS))
@@ -2022,6 +2028,8 @@ bool AMDGPUDisassembler::isGFX12Plus() const {
   return AMDGPU::isGFX12Plus(STI);
 }
 
+bool AMDGPUDisassembler::isGFX1250() const { return AMDGPU::isGFX1250(STI); }
+
 bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
   return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
 }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index a82dee430e01..67156b4a3a18 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -230,6 +230,7 @@ public:
   bool isGFX11Plus() const;
   bool isGFX12() const;
   bool isGFX12Plus() const;
+  bool isGFX1250() const;
 
   bool hasArchitectedFlatScratch() const;
   bool hasKernargPreload() const;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 0f603a43fd62..baf74dbdde20 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -45,6 +45,7 @@ enum {
   GFX940 = 9,
   GFX11 = 10,
   GFX12 = 11,
+  GFX1250 = 12,
 };
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 768f57c469d6..53c0635f02bf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -31,6 +31,7 @@ def SIEncodingFamily {
   int GFX940 = 9;
   int GFX11 = 10;
   int GFX12 = 11;
+  int GFX1250 = 12;
 }
 
 //===----------------------------------------------------------------------===//
@@ -44,6 +45,8 @@ class GFXGen<Predicate pred, string dn, string suffix, int sub> {
   int Subtarget = sub;
 }
 
+def GFX1250Gen       : GFXGen<isGFX125xOnly, "GFX1250", "_gfx1250", SIEncodingFamily.GFX1250>;
+def GFX12Not12_50Gen : GFXGen<isGFX12Not12_50, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
 def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
 def GFX11Gen : GFXGen<isGFX11Only, "GFX11", "_gfx11", SIEncodingFamily.GFX11>;
 def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 3f2e764f2926..8c739c2760b1 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -2008,6 +2008,15 @@ multiclass SOP1_IMM_Real_gfx12<bits<8> op> {
 multiclass SOP1_Real_gfx11_gfx12<bits<8> op, string name = !tolower(NAME)> :
   SOP1_Real_gfx11<op, name>, SOP1_Real_gfx12<op, name>;
 
+multiclass SOP1_Real_gfx1250<bits<8> op, string name = !tolower(NAME)> {
+  defvar ps = !cast<SOP1_Pseudo>(NAME);
+  def _gfx1250 : SOP1_Real<op, ps, name>,
+                 Select<GFX1250Gen, ps.PseudoInstr>;
+  if !ne(ps.Mnemonic, name) then
+    let AssemblerPredicate = isGFX1250Plus in
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name>;
+}
+
 defm S_MOV_B32                    : SOP1_Real_gfx11_gfx12<0x000>;
 defm S_MOV_B64                    : SOP1_Real_gfx11_gfx12<0x001>;
 defm S_CMOV_B32                   : SOP1_Real_gfx11_gfx12<0x002>;
@@ -2066,10 +2075,16 @@ defm S_MOVRELS_B64                : SOP1_Real_gfx11_gfx12<0x041>;
 defm S_MOVRELD_B32                : SOP1_Real_gfx11_gfx12<0x042>;
 defm S_MOVRELD_B64                : SOP1_Real_gfx11_gfx12<0x043>;
 defm S_MOVRELSD_2_B32             : SOP1_Real_gfx11_gfx12<0x044>;
+let OtherPredicates = [isNotGFX1250Plus] in {
 defm S_GETPC_B64                  : SOP1_Real_gfx11_gfx12<0x047>;
 defm S_SETPC_B64                  : SOP1_Real_gfx11_gfx12<0x048>;
 defm S_SWAPPC_B64                 : SOP1_Real_gfx11_gfx12<0x049>;
 defm S_RFE_B64                    : SOP1_Real_gfx11_gfx12<0x04a>;
+}
+defm S_GETPC_B64                  : SOP1_Real_gfx1250<0x047, "s_get_pc_i64">;
+defm S_SETPC_B64                  : SOP1_Real_gfx1250<0x048, "s_set_pc_i64">;
+defm S_SWAPPC_B64                 : SOP1_Real_gfx1250<0x049, "s_swap_pc_i64">;
+defm S_RFE_B64                    : SOP1_Real_gfx1250<0x04a, "s_rfe_i64">;
 defm S_SENDMSG_RTN_B32            : SOP1_Real_gfx11_gfx12<0x04c>;
 defm S_SENDMSG_RTN_B64            : SOP1_Real_gfx11_gfx12<0x04d>;
 defm S_BARRIER_SIGNAL_M0          : SOP1_M0_Real_gfx12<0x04e>;
@@ -2444,10 +2459,21 @@ multiclass SOPK_Real32_gfx11_gfx12<bits<5> op> :
 multiclass SOPK_Real64_gfx11_gfx12<bits<5> op> :
   SOPK_Real64_gfx11<op>, SOPK_Real64_gfx12<op>;
 
+multiclass SOPK_Real32_gfx1250<bits<5> op, string name = !tolower(NAME)> {
+  defvar ps = !cast<SOPK_Pseudo>(NAME);
+  def _gfx1250 : SOPK_Real32<op, ps, name>,
+                 Select<GFX1250Gen, ps.PseudoInstr>;
+  if !ne(ps.Mnemonic, name) then
+    let AssemblerPredicate = isGFX1250Plus in
+    def : AMDGPUMnemonicAlias<ps.Mnemonic, name>;
+}
+
 defm S_GETREG_B32           : SOPK_Real32_gfx11_gfx12<0x011>;
 defm S_SETREG_B32           : SOPK_Real32_gfx11_gfx12<0x012>;
 defm S_SETREG_IMM32_B32     : SOPK_Real64_gfx11_gfx12<0x013>;
+let OtherPredicates = [isNotGFX1250Plus] in
 defm S_CALL_B64             : SOPK_Real32_gfx11_gfx12<0x014>;
+defm S_CALL_B64             : SOPK_Real32_gfx1250<0x014, "s_call_i64">;
 defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>;
 defm S_SUBVECTOR_LOOP_END   : SOPK_Real32_gfx11<0x017>;
 defm S_WAITCNT_VSCNT        : SOPK_Real32_gfx11<0x018>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c0cd43a9c35d..47d213d28ff7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2233,6 +2233,10 @@ bool isGFX12Plus(const MCSubtargetInfo &STI) { return isGFX12(STI); }
 
 bool isNotGFX12Plus(const MCSubtargetInfo &STI) { return !isGFX12Plus(STI); }
 
+bool isGFX1250(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
+}
+
 bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); }
 
 bool isNotGFX10Plus(const MCSubtargetInfo &STI) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 975a8908059c..aa5406370d84 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1435,6 +1435,7 @@ bool isGFX11(const MCSubtargetInfo &STI);
 bool isGFX11Plus(const MCSubtargetInfo &STI);
 bool isGFX12(const MCSubtargetInfo &STI);
 bool isGFX12Plus(const MCSubtargetInfo &STI);
+bool isGFX1250(const MCSubtargetInfo &STI);
 bool isNotGFX12Plus(const MCSubtargetInfo &STI);
 bool isNotGFX11Plus(const MCSubtargetInfo &STI);
 bool isGCN3Encoding(const MCSubtargetInfo &STI);
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s
new file mode 100644
index 000000000000..95a926811292
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s
@@ -0,0 +1,30 @@
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefix=GFX1250 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
+
+s_get_pc_i64 s[2:3]
+// GFX1250: s_get_pc_i64 s[2:3]                     ; encoding: [0x00,0x47,0x82,0xbe]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_getpc_b64 s[2:3]
+// GFX1250: s_get_pc_i64 s[2:3]                     ; encoding: [0x00,0x47,0x82,0xbe]
+
+s_set_pc_i64 s[2:3]
+// GFX1250: s_set_pc_i64 s[2:3]                     ; encoding: [0x02,0x48,0x80,0xbe]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_setpc_b64 s[2:3]
+// GFX1250: s_set_pc_i64 s[2:3]                     ; encoding: [0x02,0x48,0x80,0xbe]
+
+s_swap_pc_i64 s[2:3], 10
+// GFX1250: s_swap_pc_i64 s[2:3], 10                ; encoding: [0x8a,0x49,0x82,0xbe]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_swappc_b64 s[2:3], 10
+// GFX1250: s_swap_pc_i64 s[2:3], 10                ; encoding: [0x8a,0x49,0x82,0xbe]
+
+s_rfe_i64 s[2:3]
+// GFX1250: s_rfe_i64 s[2:3]                        ; encoding: [0x02,0x4a,0x80,0xbe]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_rfe_b64 s[2:3]
+// GFX1250: s_rfe_i64 s[2:3]                        ; encoding: [0x02,0x4a,0x80,0xbe]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sopk.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sopk.s
new file mode 100644
index 000000000000..0945e0dcc9b8
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sopk.s
@@ -0,0 +1,9 @@
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefix=GFX1250 %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR --implicit-check-not=error: -strict-whitespace %s
+
+s_call_i64 s[0:1], 4660
+// GFX1250: s_call_i64 s[0:1], 4660                 ; encoding: [0x34,0x12,0x00,0xba]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_call_b64 s[0:1], 4660
+// GFX1250: s_call_i64 s[0:1], 4660                 ; encoding: [0x34,0x12,0x00,0xba]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
index 31f61e48126c..dd3588d5da01 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX1200 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,GFX1250 %s
 
 0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00
 # GFX12: s_alloc_vgpr 0x1235                     ; encoding: [0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00]
@@ -2270,16 +2271,20 @@
 # GFX12: s_ctz_i32_b64 vcc_lo, s[2:3]            ; encoding: [0x02,0x09,0xea,0xbe]
 
 0x00,0x47,0xfe,0xbe
-# GFX12: s_getpc_b64 exec                        ; encoding: [0x00,0x47,0xfe,0xbe]
+# GFX1200: s_getpc_b64 exec                        ; encoding: [0x00,0x47,0xfe,0xbe]
+# GFX1250: s_get_pc_i64 exec                       ; encoding: [0x00,0x47,0xfe,0xbe]
 
 0x00,0x47,0x80,0xbe
-# GFX12: s_getpc_b64 s[0:1]                      ; encoding: [0x00,0x47,0x80,0xbe]
+# GFX1200: s_getpc_b64 s[0:1]                      ; encoding: [0x00,0x47,0x80,0xbe]
+# GFX1250: s_get_pc_i64 s[0:1]                     ; encoding: [0x00,0x47,0x80,0xbe]
 
 0x00,0x47,0xe8,0xbe
-# GFX12: s_getpc_b64 s[104:105]                  ; encoding: [0x00,0x47,0xe8,0xbe]
+# GFX1200: s_getpc_b64 s[104:105]                  ; encoding: [0x00,0x47,0xe8,0xbe]
+# GFX1250: s_get_pc_i64 s[104:105]                 ; encoding: [0x00,0x47,0xe8,0xbe]
 
 0x00,0x47,0xea,0xbe
-# GFX12: s_getpc_b64 vcc                         ; encoding: [0x00,0x47,0xea,0xbe]
+# GFX1200: s_getpc_b64 vcc                         ; encoding: [0x00,0x47,0xea,0xbe]
+# GFX1250: s_get_pc_i64 vcc                        ; encoding: [0x00,0x47,0xea,0xbe]
 
 0x01,0x00,0xff,0xbe
 # GFX12: s_mov_b32 exec_hi, s1                   ; encoding: [0x01,0x00,0xff,0xbe]
@@ -3218,13 +3223,16 @@
 # GFX12: s_quadmask_b64 vcc, s[2:3]              ; encoding: [0x02,0x1b,0xea,0xbe]
 
 0x00,0x4a,0x80,0xbe
-# GFX12: s_rfe_b64 s[0:1]                        ; encoding: [0x00,0x4a,0x80,0xbe]
+# GFX1200: s_rfe_b64 s[0:1]                        ; encoding: [0x00,0x4a,0x80,0xbe]
+# GFX1250: s_rfe_i64 s[0:1]                        ; encoding: [0x00,0x4a,0x80,0xbe]
 
 0x68,0x4a,0x80,0xbe
-# GFX12: s_rfe_b64 s[104:105]                    ; encoding: [0x68,0x4a,0x80,0xbe]
+# GFX1200: s_rfe_b64 s[104:105]                    ; encoding: [0x68,0x4a,0x80,0xbe]
+# GFX1250: s_rfe_i64 s[104:105]                    ; encoding: [0x68,0x4a,0x80,0xbe]
 
 0x6a,0x4a,0x80,0xbe
-# GFX12: s_rfe_b64 vcc                           ; encoding: [0x6a,0x4a,0x80,0xbe]
+# GFX1200: s_rfe_b64 vcc                           ; encoding: [0x6a,0x4a,0x80,0xbe]
+# GFX1250: s_rfe_i64 vcc                           ; encoding: [0x6a,0x4a,0x80,0xbe]
 
 0x00,0x4c,0x81,0xbe
 # GFX12: s_sendmsg_rtn_b32 s1, sendmsg(0, 0, 0)  ; encoding: [0x00,0x4c,0x81,0xbe]
@@ -3269,16 +3277,20 @@
 # GFX12: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_SE_AID_ID) ; encoding: [0x87,0x4c,0x80,0xbe]
 
 0x00,0x48,0x80,0xbe
-# GFX12: s_setpc_b64 s[0:1]                      ; encoding: [0x00,0x48,0x80,0xbe]
+# GFX1200: s_setpc_b64 s[0:1]                      ; encoding: [0x00,0x48,0x80,0xbe]
+# GFX1250: s_set_pc_i64 s[0:1]                     ; encoding: [0x00,0x48,0x80,0xbe]
 
 0x68,0x48,0x80,0xbe
-# GFX12: s_setpc_b64 s[104:105]                  ; encoding: [0x68,0x48,0x80,0xbe]
+# GFX1200: s_setpc_b64 s[104:105]                  ; encoding: [0x68,0x48,0x80,0xbe]
+# GFX1250: s_set_pc_i64 s[104:105]                 ; encoding: [0x68,0x48,0x80,0xbe]
 
 0x6a,0x48,0x80,0xbe
-# GFX12: s_setpc_b64 vcc                         ; encoding: [0x6a,0x48,0x80,0xbe]
+# GFX1200: s_setpc_b64 vcc                         ; encoding: [0x6a,0x48,0x80,0xbe]
+# GFX1250: s_set_pc_i64 vcc                        ; encoding: [0x6a,0x48,0x80,0xbe]
 
 0xcb,0x48,0xf5,0xbe
-# GFX12: s_setpc_b64 -11/*Invalid immediate*/    ; encoding: [0xf5,0x48,0x80,0xbe]
+# GFX1200: s_setpc_b64 -11/*Invalid immediate*/    ; encoding: [0xf5,0x48,0x80,0xbe]
+# GFX1250: s_set_pc_i64 -11/*Invalid immediate*/   ; encoding: [0xf5,0x48,0x80,0xbe]
 
 0x01,0x0f,0xff,0xbe
 # GFX12: s_sext_i32_i16 exec_hi, s1              ; encoding: [0x01,0x0f,0xff,0xbe]
@@ -3401,22 +3413,28 @@
 # GFX12: s_sext_i32_i8 vcc_lo, s1                ; encoding: [0x01,0x0e,0xea,0xbe]
 
 0x66,0x49,0x80,0xbe
-# GFX12: s_swappc_b64 s[0:1], s[102:103]         ; encoding: [0x66,0x49,0x80,0xbe]
+# GFX1200: s_swappc_b64 s[0:1], s[102:103]         ; encoding: [0x66,0x49,0x80,0xbe]
+# GFX1250: s_swap_pc_i64 s[0:1], s[102:103]        ; encoding: [0x66,0x49,0x80,0xbe]
 
 0x02,0x49,0x80,0xbe
-# GFX12: s_swappc_b64 s[0:1], s[2:3]             ; encoding: [0x02,0x49,0x80,0xbe]
+# GFX1200: s_swappc_b64 s[0:1], s[2:3]             ; encoding: [0x02,0x49,0x80,0xbe]
+# GFX1250: s_swap_pc_i64 s[0:1], s[2:3]            ; encoding: [0x02,0x49,0x80,0xbe]
 
 0x6a,0x49,0x80,0xbe
-# GFX12: s_swappc_b64 s[0:1], vcc                ; encoding: [0x6a,0x49,0x80,0xbe]
+# GFX1200: s_swappc_b64 s[0:1], vcc                ; encoding: [0x6a,0x49,0x80,0xbe]
+# GFX1250: s_swap_pc_i64 s[0:1], vcc               ; encoding: [0x6a,0x49,0x80,0xbe]
 
 0x66,0x49,0xe8,0xbe
-# GFX12: s_swappc_b64 s[104:105], s[102:103]     ; encoding: [0x66,0x49,0xe8,0xbe]
+# GFX1200: s_swappc_b64 s[104:105], s[102:103]     ; encoding: [0x66,0x49,0xe8,0xbe]
+# GFX1250: s_swap_pc_i64 s[104:105], s[102:103]    ; encoding: [0x66,0x49,0xe8,0xbe]
 
 0x02,0x49,0xe8,0xbe
-# GFX12: s_swappc_b64 s[104:105], s[2:3]         ; encoding: [0x02,0x49,0xe8,0xbe]
+# GFX1200: s_swappc_b64 s[104:105], s[2:3]         ; encoding: [0x02,0x49,0xe8,0xbe]
+# GFX1250: s_swap_pc_i64 s[104:105], s[2:3]        ; encoding: [0x02,0x49,0xe8,0xbe]
 
 0x02,0x49,0xea,0xbe
-# GFX12: s_swappc_b64 vcc, s[2:3]                ; encoding: [0x02,0x49,0xea,0xbe]
+# GFX1200: s_swappc_b64 vcc, s[2:3]                ; encoding: [0x02,0x49,0xea,0xbe]
+# GFX1250: s_swap_pc_i64 vcc, s[2:3]               ; encoding: [0x02,0x49,0xea,0xbe]
 
 0x01,0x1c,0xff,0xbe
 # GFX12: s_wqm_b32 exec_hi, s1                   ; encoding: [0x01,0x1c,0xff,0xbe]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt
index 3e323ed69216..49fa263f6bbf 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt
@@ -1,5 +1,6 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX12 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=GFX12 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX12,GFX1200 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX1200 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX12,GFX1250 %s
 
 # GFX12: s_addk_co_i32 exec_hi, 0x1234           ; encoding: [0x34,0x12,0xff,0xb7]
 0x34,0x12,0xff,0xb7
@@ -25,19 +26,24 @@
 # GFX12: s_addk_co_i32 vcc_lo, 0x1234            ; encoding: [0x34,0x12,0xea,0xb7]
 0x34,0x12,0xea,0xb7
 
-# GFX12: s_call_b64 exec, 4660                   ; encoding: [0x34,0x12,0x7e,0xba]
+# GFX1200: s_call_b64 exec, 4660                   ; encoding: [0x34,0x12,0x7e,0xba]
+# GFX1250: s_call_i64 exec, 4660                   ; encoding: [0x34,0x12,0x7e,0xba]
 0x34,0x12,0x7e,0xba
 
-# GFX12: s_call_b64 s[0:1], 4660                 ; encoding: [0x34,0x12,0x00,0xba]
+# GFX1200: s_call_b64 s[0:1], 4660                 ; encoding: [0x34,0x12,0x00,0xba]
+# GFX1250: s_call_i64 s[0:1], 4660                 ; encoding: [0x34,0x12,0x00,0xba]
 0x34,0x12,0x00,0xba
 
-# GFX12: s_call_b64 s[104:105], 4660             ; encoding: [0x34,0x12,0x68,0xba]
+# GFX1200: s_call_b64 s[104:105], 4660             ; encoding: [0x34,0x12,0x68,0xba]
+# GFX1250: s_call_i64 s[104:105], 4660             ; encoding: [0x34,0x12,0x68,0xba]
 0x34,0x12,0x68,0xba
 
-# GFX12: s_call_b64 vcc, 4660                    ; encoding: [0x34,0x12,0x6a,0xba]
+# GFX1200: s_call_b64 vcc, 4660                    ; encoding: [0x34,0x12,0x6a,0xba]
+# GFX1250: s_call_i64 vcc, 4660                    ; encoding: [0x34,0x12,0x6a,0xba]
 0x34,0x12,0x6a,0xba
 
-# GFX12: s_call_b64 null, 4660                   ; encoding: [0x34,0x12,0x7c,0xba]
+# GFX1200: s_call_b64 null, 4660                   ; encoding: [0x34,0x12,0x7c,0xba]
+# GFX1250: s_call_i64 null, 4660                   ; encoding: [0x34,0x12,0x7c,0xba]
 0x34,0x12,0x7c,0xba
 
 # GFX12: s_cmovk_i32 exec_hi, 0x1234             ; encoding: [0x34,0x12,0x7f,0xb1]

From f51d8730b309c14a78764e1b9a2e112e038ed3a0 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0@owo.li>
Date: Sun, 22 Jun 2025 13:32:19 +0800
Subject: [PATCH 1183/1322] [InstSimplify] Simplify 'x u>= 1' to true when x is
 known non-zero (#145204)

---
 llvm/lib/Analysis/InstructionSimplify.cpp   |  11 +-
 llvm/test/Transforms/InstSimplify/umax-1.ll | 120 ++++++++++++++++++++
 2 files changed, 127 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/InstSimplify/umax-1.ll

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index d1ac8d9fbdfd..cb1dae92faf9 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -2981,7 +2981,7 @@ static Value *simplifyICmpWithZero(CmpPredicate Pred, Value *LHS, Value *RHS,
 }
 
 static Value *simplifyICmpWithConstant(CmpPredicate Pred, Value *LHS,
-                                       Value *RHS, const InstrInfoQuery &IIQ) {
+                                       Value *RHS, const SimplifyQuery &Q) {
   Type *ITy = getCompareTy(RHS); // The return type.
 
   Value *X;
@@ -3007,7 +3007,7 @@ static Value *simplifyICmpWithConstant(CmpPredicate Pred, Value *LHS,
     return ConstantInt::getTrue(ITy);
 
   ConstantRange LHS_CR =
-      computeConstantRange(LHS, CmpInst::isSigned(Pred), IIQ.UseInstrInfo);
+      computeConstantRange(LHS, CmpInst::isSigned(Pred), Q.IIQ.UseInstrInfo);
   if (!LHS_CR.isFullSet()) {
     if (RHS_CR.contains(LHS_CR))
       return ConstantInt::getTrue(ITy);
@@ -3018,13 +3018,16 @@ static Value *simplifyICmpWithConstant(CmpPredicate Pred, Value *LHS,
   // (mul nuw/nsw X, MulC) != C --> true  (if C is not a multiple of MulC)
   // (mul nuw/nsw X, MulC) == C --> false (if C is not a multiple of MulC)
   const APInt *MulC;
-  if (IIQ.UseInstrInfo && ICmpInst::isEquality(Pred) &&
+  if (Q.IIQ.UseInstrInfo && ICmpInst::isEquality(Pred) &&
       ((match(LHS, m_NUWMul(m_Value(), m_APIntAllowPoison(MulC))) &&
         *MulC != 0 && C->urem(*MulC) != 0) ||
        (match(LHS, m_NSWMul(m_Value(), m_APIntAllowPoison(MulC))) &&
         *MulC != 0 && C->srem(*MulC) != 0)))
     return ConstantInt::get(ITy, Pred == ICmpInst::ICMP_NE);
 
+  if (Pred == ICmpInst::ICMP_UGE && C->isOne() && isKnownNonZero(LHS, Q))
+    return ConstantInt::getTrue(ITy);
+
   return nullptr;
 }
 
@@ -3776,7 +3779,7 @@ static Value *simplifyICmpInst(CmpPredicate Pred, Value *LHS, Value *RHS,
   if (Value *V = simplifyICmpWithZero(Pred, LHS, RHS, Q))
     return V;
 
-  if (Value *V = simplifyICmpWithConstant(Pred, LHS, RHS, Q.IIQ))
+  if (Value *V = simplifyICmpWithConstant(Pred, LHS, RHS, Q))
     return V;
 
   // If both operands have range metadata, use the metadata
diff --git a/llvm/test/Transforms/InstSimplify/umax-1.ll b/llvm/test/Transforms/InstSimplify/umax-1.ll
new file mode 100644
index 000000000000..77863acb0232
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/umax-1.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+
+define i32 @known_non_zero_by_or(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @known_non_zero_by_or(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[VAL:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[VAL]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %val = or i32 %x, %y
+  %max = call i32 @llvm.umax.i32(i32 %val, i32 1)
+  ret i32 %max
+
+if.else:
+  ret i32 0
+}
+
+define i32 @known_non_zero_by_mul(i32 %x) {
+; CHECK-LABEL: define i32 @known_non_zero_by_mul(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[NONZERO1:%.*]] = mul nuw i32 [[X]], 3
+; CHECK-NEXT:    ret i32 [[NONZERO1]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %nonzero = mul nuw i32 %x, 3
+  %max = call i32 @llvm.umax.i32(i32 %nonzero, i32 1)
+  ret i32 %max
+
+if.else:
+  ret i32 0
+}
+
+define i32 @known_non_zero_commute(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @known_non_zero_commute(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[VAL:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    ret i32 [[VAL]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %val = or i32 %x, %y
+  %max = call i32 @llvm.umax.i32(i32 1, i32 %val)
+  ret i32 %max
+
+if.else:
+  ret i32 0
+}
+
+; Negative
+define i32 @umax_ge_2(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @umax_ge_2(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[VAL:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[MAX:%.*]] = call i32 @llvm.umax.i32(i32 [[VAL]], i32 2)
+; CHECK-NEXT:    ret i32 [[MAX]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %val = or i32 %x, %y
+  %max = call i32 @llvm.umax.i32(i32 %val, i32 2)
+  ret i32 %max
+
+if.else:
+  ret i32 0
+}
+
+define i32 @unknown_by_and(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @unknown_by_and(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[VAL:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[MAX:%.*]] = call i32 @llvm.umax.i32(i32 [[VAL]], i32 1)
+; CHECK-NEXT:    ret i32 [[MAX]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %val = and i32 %x, %y
+  %max = call i32 @llvm.umax.i32(i32 %val, i32 1)
+  ret i32 %max
+
+if.else:
+  ret i32 0
+}

From cbfec48697adeb5e6f5f35acba73a4a1408aea21 Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik <abhishek.kaushik@intel.com>
Date: Sun, 22 Jun 2025 00:22:57 -0700
Subject: [PATCH 1184/1322] Revert "[X86][NFC] Use std::move to avoid copy"
 (#145215)

Reverts llvm/llvm-project#141455
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 53c0da45f2f6..33083c0eba69 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41266,7 +41266,7 @@ static SDValue combineX86ShufflesRecursively(
     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
                                       ResolveKnownZeros);
 
-    Mask = std::move(OpMask);
+    Mask = OpMask;
     Ops.append(OpInputs.begin(), OpInputs.end());
   } else {
     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);

From 58b939abe5085a750be844cc3c681b40afe98454 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 22 Jun 2025 08:45:40 +0100
Subject: [PATCH 1185/1322] [VPlan] Support matching constants in
 narrowInterleaveGroups.

Matching constants can trivially be broadcasted, allow them if the same
constant is used for all recipes in a bundle.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 12 ++---
 ...interleave-to-widen-memory-constant-ops.ll | 44 +++++--------------
 2 files changed, 19 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d66733cac4d6..ac6be09ef271 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3115,7 +3115,7 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
                           unsigned OpIdx, VPValue *OpV, unsigned Idx) {
   auto *DefR = OpV->getDefiningRecipe();
   if (!DefR)
-    return false;
+    return WideMember0->getOperand(OpIdx) == OpV;
   if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
     return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
 
@@ -3251,7 +3251,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     return;
 
   // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
-  auto NarrowOp = [](VPRecipeBase *R) -> VPValue * {
+  auto NarrowOp = [](VPValue *V) -> VPValue * {
+    auto *R = V->getDefiningRecipe();
+    if (!R)
+      return V;
     if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
       // Narrow interleave group to wide load, as transformed VPlan will only
       // process one original iteration.
@@ -3280,11 +3283,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(
             StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
       for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
-        WideMember0->setOperand(
-            Idx, NarrowOp(WideMember0->getOperand(Idx)->getDefiningRecipe()));
+        WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
       Res = WideMember0;
     } else {
-      Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+      Res = NarrowOp(StoreGroup->getStoredValues()[0]);
     }
 
     auto *S = new VPWidenStoreRecipe(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
index 7d3b3d86b90d..94f46bfe3973 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
@@ -13,28 +13,18 @@ define void @test_add_double_same_const_args_1(ptr %res, ptr noalias %A, ptr noa
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[STRIDED_VEC]], splat (double 1.000000e+00)
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], splat (double 1.000000e+00)
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], splat (double 1.000000e+00)
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], splat (double 1.000000e+00)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
@@ -73,28 +63,18 @@ define void @test_add_double_same_const_args_2(ptr %res, ptr noalias %A, ptr noa
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> splat (double 1.000000e+00), [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> splat (double 1.000000e+00), [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> splat (double 1.000000e+00), [[STRIDED_VEC1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> splat (double 1.000000e+00), [[STRIDED_VEC4]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:

From 6ae5b89553a9393cc9ab68c0d776a506381a3009 Mon Sep 17 00:00:00 2001
From: Evan Liu <liuyievan@gmail.com>
Date: Sun, 22 Jun 2025 02:21:47 -0700
Subject: [PATCH 1186/1322] Make getStridesAndOffset const (#145148)

Make getStridesAndOffset const.
---
 mlir/include/mlir/IR/BuiltinTypes.td | 4 ++--
 mlir/lib/IR/BuiltinTypes.cpp         | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td
index 55d64d663f7e..89ade79a3ac0 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.td
+++ b/mlir/include/mlir/IR/BuiltinTypes.td
@@ -863,11 +863,11 @@ def Builtin_MemRef : Builtin_Type<"MemRef", "memref", [
     /// the distance in the number of elements between successive entries along
     /// a particular dimension.
     LogicalResult getStridesAndOffset(SmallVectorImpl<int64_t> &strides,
-                                      int64_t &offset);
+                                      int64_t &offset) const;
 
     /// Wrapper around getStridesAndOffset(SmallVectorImpl<int64_t>, int64_t)
     /// that will assert if the logical result is not succeeded.
-    std::pair<SmallVector<int64_t>, int64_t> getStridesAndOffset();
+    std::pair<SmallVector<int64_t>, int64_t> getStridesAndOffset() const;
 
     /// Return "true" if the layout is compatible with strided semantics.
     bool isStrided();
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index 97bab479c79b..e3a00ac5a14b 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -730,11 +730,12 @@ MemRefType MemRefType::canonicalizeStridedLayout() {
 }
 
 LogicalResult MemRefType::getStridesAndOffset(SmallVectorImpl<int64_t> &strides,
-                                              int64_t &offset) {
+                                              int64_t &offset) const {
   return getLayout().getStridesAndOffset(getShape(), strides, offset);
 }
 
-std::pair<SmallVector<int64_t>, int64_t> MemRefType::getStridesAndOffset() {
+std::pair<SmallVector<int64_t>, int64_t>
+MemRefType::getStridesAndOffset() const {
   SmallVector<int64_t> strides;
   int64_t offset;
   LogicalResult status = getStridesAndOffset(strides, offset);

From 075cb691a5e810f7114369c67b475dfd9127d4af Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Sun, 22 Jun 2025 11:40:01 +0200
Subject: [PATCH 1187/1322] [MLIR] Add logging/tracing to DataFlow analysis and
 RemoveDeadValues (NFC) (#144695)

Debugging issues with this pass is quite difficult at the moment, this
should help.
---
 .../Analysis/DataFlow/DeadCodeAnalysis.cpp    | 81 +++++++++++++++++--
 .../Analysis/DataFlow/LivenessAnalysis.cpp    | 51 +++++++++++-
 mlir/lib/Transforms/RemoveDeadValues.cpp      | 56 +++++++++++--
 3 files changed, 176 insertions(+), 12 deletions(-)

diff --git a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp
index e805e21d878b..1abdfcbf3496 100644
--- a/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/DeadCodeAnalysis.cpp
@@ -22,9 +22,14 @@
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include <cassert>
 #include <optional>
 
+#define DEBUG_TYPE "dead-code-analysis"
+#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
 using namespace mlir;
 using namespace mlir::dataflow;
 
@@ -122,6 +127,7 @@ DeadCodeAnalysis::DeadCodeAnalysis(DataFlowSolver &solver)
 }
 
 LogicalResult DeadCodeAnalysis::initialize(Operation *top) {
+  LDBG("Initializing DeadCodeAnalysis for top-level op: " << top->getName());
   // Mark the top-level blocks as executable.
   for (Region &region : top->getRegions()) {
     if (region.empty())
@@ -129,6 +135,7 @@ LogicalResult DeadCodeAnalysis::initialize(Operation *top) {
     auto *state =
         getOrCreate<Executable>(getProgramPointBefore(&region.front()));
     propagateIfChanged(state, state->setToLive());
+    LDBG("Marked entry block live for region in op: " << top->getName());
   }
 
   // Mark as overdefined the predecessors of symbol callables with potentially
@@ -139,13 +146,18 @@ LogicalResult DeadCodeAnalysis::initialize(Operation *top) {
 }
 
 void DeadCodeAnalysis::initializeSymbolCallables(Operation *top) {
+  LDBG("[init] Entering initializeSymbolCallables for top-level op: "
+       << top->getName());
   analysisScope = top;
   auto walkFn = [&](Operation *symTable, bool allUsesVisible) {
+    LDBG("[init] Processing symbol table op: " << symTable->getName());
     Region &symbolTableRegion = symTable->getRegion(0);
     Block *symbolTableBlock = &symbolTableRegion.front();
 
     bool foundSymbolCallable = false;
     for (auto callable : symbolTableBlock->getOps<CallableOpInterface>()) {
+      LDBG("[init] Found CallableOpInterface: "
+           << callable.getOperation()->getName());
       Region *callableRegion = callable.getCallableRegion();
       if (!callableRegion)
         continue;
@@ -159,6 +171,8 @@ void DeadCodeAnalysis::initializeSymbolCallables(Operation *top) {
         auto *state =
             getOrCreate<PredecessorState>(getProgramPointAfter(callable));
         propagateIfChanged(state, state->setHasUnknownPredecessors());
+        LDBG("[init] Marked callable as having unknown predecessors: "
+             << callable.getOperation()->getName());
       }
       foundSymbolCallable = true;
     }
@@ -173,10 +187,15 @@ void DeadCodeAnalysis::initializeSymbolCallables(Operation *top) {
     if (!uses) {
       // If we couldn't gather the symbol uses, conservatively assume that
       // we can't track information for any nested symbols.
+      LDBG("[init] Could not gather symbol uses, conservatively marking "
+           "all nested callables as having unknown predecessors");
       return top->walk([&](CallableOpInterface callable) {
         auto *state =
             getOrCreate<PredecessorState>(getProgramPointAfter(callable));
         propagateIfChanged(state, state->setHasUnknownPredecessors());
+        LDBG("[init] Marked nested callable as "
+             "having unknown predecessors: "
+             << callable.getOperation()->getName());
       });
     }
 
@@ -190,10 +209,15 @@ void DeadCodeAnalysis::initializeSymbolCallables(Operation *top) {
         continue;
       auto *state = getOrCreate<PredecessorState>(getProgramPointAfter(symbol));
       propagateIfChanged(state, state->setHasUnknownPredecessors());
+      LDBG("[init] Found non-call use for symbol, "
+           "marked as having unknown predecessors: "
+           << symbol->getName());
     }
   };
   SymbolTable::walkSymbolTables(top, /*allSymUsesVisible=*/!top->getBlock(),
                                 walkFn);
+  LDBG("[init] Finished initializeSymbolCallables for top-level op: "
+       << top->getName());
 }
 
 /// Returns true if the operation is a returning terminator in region
@@ -205,9 +229,12 @@ static bool isRegionOrCallableReturn(Operation *op) {
 }
 
 LogicalResult DeadCodeAnalysis::initializeRecursively(Operation *op) {
+  LDBG("[init] Entering initializeRecursively for op: " << op->getName()
+                                                        << " at " << op);
   // Initialize the analysis by visiting every op with control-flow semantics.
   if (op->getNumRegions() || op->getNumSuccessors() ||
       isRegionOrCallableReturn(op) || isa<CallOpInterface>(op)) {
+    LDBG("[init] Visiting op with control-flow semantics: " << *op);
     // When the liveness of the parent block changes, make sure to re-invoke the
     // analysis on the op.
     if (op->getBlock())
@@ -218,14 +245,22 @@ LogicalResult DeadCodeAnalysis::initializeRecursively(Operation *op) {
       return failure();
   }
   // Recurse on nested operations.
-  for (Region &region : op->getRegions())
-    for (Operation &op : region.getOps())
-      if (failed(initializeRecursively(&op)))
+  for (Region &region : op->getRegions()) {
+    LDBG("[init] Recursing into region of op: " << op->getName());
+    for (Operation &nestedOp : region.getOps()) {
+      LDBG("[init] Recursing into nested op: " << nestedOp.getName() << " at "
+                                               << &nestedOp);
+      if (failed(initializeRecursively(&nestedOp)))
         return failure();
+    }
+  }
+  LDBG("[init] Finished initializeRecursively for op: " << op->getName()
+                                                        << " at " << op);
   return success();
 }
 
 void DeadCodeAnalysis::markEdgeLive(Block *from, Block *to) {
+  LDBG("Marking edge live from block " << from << " to block " << to);
   auto *state = getOrCreate<Executable>(getProgramPointBefore(to));
   propagateIfChanged(state, state->setToLive());
   auto *edgeState =
@@ -234,37 +269,48 @@ void DeadCodeAnalysis::markEdgeLive(Block *from, Block *to) {
 }
 
 void DeadCodeAnalysis::markEntryBlocksLive(Operation *op) {
+  LDBG("Marking entry blocks live for op: " << op->getName());
   for (Region &region : op->getRegions()) {
     if (region.empty())
       continue;
     auto *state =
         getOrCreate<Executable>(getProgramPointBefore(&region.front()));
     propagateIfChanged(state, state->setToLive());
+    LDBG("Marked entry block live for region in op: " << op->getName());
   }
 }
 
 LogicalResult DeadCodeAnalysis::visit(ProgramPoint *point) {
+  LDBG("Visiting program point: " << point << " " << *point);
   if (point->isBlockStart())
     return success();
   Operation *op = point->getPrevOp();
+  LDBG("Visiting operation: " << *op);
 
   // If the parent block is not executable, there is nothing to do.
   if (op->getBlock() != nullptr &&
-      !getOrCreate<Executable>(getProgramPointBefore(op->getBlock()))->isLive())
+      !getOrCreate<Executable>(getProgramPointBefore(op->getBlock()))
+           ->isLive()) {
+    LDBG("Parent block not live, skipping op: " << *op);
     return success();
+  }
 
   // We have a live call op. Add this as a live predecessor of the callee.
-  if (auto call = dyn_cast<CallOpInterface>(op))
+  if (auto call = dyn_cast<CallOpInterface>(op)) {
+    LDBG("Visiting call operation: " << *op);
     visitCallOperation(call);
+  }
 
   // Visit the regions.
   if (op->getNumRegions()) {
     // Check if we can reason about the region control-flow.
     if (auto branch = dyn_cast<RegionBranchOpInterface>(op)) {
+      LDBG("Visiting region branch operation: " << *op);
       visitRegionBranchOperation(branch);
 
       // Check if this is a callable operation.
     } else if (auto callable = dyn_cast<CallableOpInterface>(op)) {
+      LDBG("Visiting callable operation: " << *op);
       const auto *callsites = getOrCreateFor<PredecessorState>(
           getProgramPointAfter(op), getProgramPointAfter(callable));
 
@@ -276,16 +322,19 @@ LogicalResult DeadCodeAnalysis::visit(ProgramPoint *point) {
 
       // Otherwise, conservatively mark all entry blocks as executable.
     } else {
+      LDBG("Marking all entry blocks live for op: " << *op);
       markEntryBlocksLive(op);
     }
   }
 
   if (isRegionOrCallableReturn(op)) {
     if (auto branch = dyn_cast<RegionBranchOpInterface>(op->getParentOp())) {
+      LDBG("Visiting region terminator: " << *op);
       // Visit the exiting terminator of a region.
       visitRegionTerminator(op, branch);
     } else if (auto callable =
                    dyn_cast<CallableOpInterface>(op->getParentOp())) {
+      LDBG("Visiting callable terminator: " << *op);
       // Visit the exiting terminator of a callable.
       visitCallableTerminator(op, callable);
     }
@@ -294,10 +343,12 @@ LogicalResult DeadCodeAnalysis::visit(ProgramPoint *point) {
   if (op->getNumSuccessors()) {
     // Check if we can reason about the control-flow.
     if (auto branch = dyn_cast<BranchOpInterface>(op)) {
+      LDBG("Visiting branch operation: " << *op);
       visitBranchOperation(branch);
 
       // Otherwise, conservatively mark all successors as exectuable.
     } else {
+      LDBG("Marking all successors live for op: " << *op);
       for (Block *successor : op->getSuccessors())
         markEdgeLive(op->getBlock(), successor);
     }
@@ -307,6 +358,7 @@ LogicalResult DeadCodeAnalysis::visit(ProgramPoint *point) {
 }
 
 void DeadCodeAnalysis::visitCallOperation(CallOpInterface call) {
+  LDBG("visitCallOperation: " << call.getOperation()->getName());
   Operation *callableOp = call.resolveCallableInTable(&symbolTable);
 
   // A call to a externally-defined callable has unknown predecessors.
@@ -329,11 +381,15 @@ void DeadCodeAnalysis::visitCallOperation(CallOpInterface call) {
     auto *callsites =
         getOrCreate<PredecessorState>(getProgramPointAfter(callableOp));
     propagateIfChanged(callsites, callsites->join(call));
+    LDBG("Added callsite as predecessor for callable: "
+         << callableOp->getName());
   } else {
     // Mark this call op's predecessors as overdefined.
     auto *predecessors =
         getOrCreate<PredecessorState>(getProgramPointAfter(call));
     propagateIfChanged(predecessors, predecessors->setHasUnknownPredecessors());
+    LDBG("Marked call op's predecessors as unknown for: "
+         << call.getOperation()->getName());
   }
 }
 
@@ -365,6 +421,7 @@ DeadCodeAnalysis::getOperandValues(Operation *op) {
 }
 
 void DeadCodeAnalysis::visitBranchOperation(BranchOpInterface branch) {
+  LDBG("visitBranchOperation: " << branch.getOperation()->getName());
   // Try to deduce a single successor for the branch.
   std::optional<SmallVector<Attribute>> operands = getOperandValues(branch);
   if (!operands)
@@ -372,15 +429,18 @@ void DeadCodeAnalysis::visitBranchOperation(BranchOpInterface branch) {
 
   if (Block *successor = branch.getSuccessorForOperands(*operands)) {
     markEdgeLive(branch->getBlock(), successor);
+    LDBG("Branch has single successor: " << successor);
   } else {
     // Otherwise, mark all successors as executable and outgoing edges.
     for (Block *successor : branch->getSuccessors())
       markEdgeLive(branch->getBlock(), successor);
+    LDBG("Branch has multiple/all successors live");
   }
 }
 
 void DeadCodeAnalysis::visitRegionBranchOperation(
     RegionBranchOpInterface branch) {
+  LDBG("visitRegionBranchOperation: " << branch.getOperation()->getName());
   // Try to deduce which regions are executable.
   std::optional<SmallVector<Attribute>> operands = getOperandValues(branch);
   if (!operands)
@@ -397,16 +457,19 @@ void DeadCodeAnalysis::visitRegionBranchOperation(
     // Mark the entry block as executable.
     auto *state = getOrCreate<Executable>(point);
     propagateIfChanged(state, state->setToLive());
+    LDBG("Marked region successor live: " << point);
     // Add the parent op as a predecessor.
     auto *predecessors = getOrCreate<PredecessorState>(point);
     propagateIfChanged(
         predecessors,
         predecessors->join(branch, successor.getSuccessorInputs()));
+    LDBG("Added region branch as predecessor for successor: " << point);
   }
 }
 
 void DeadCodeAnalysis::visitRegionTerminator(Operation *op,
                                              RegionBranchOpInterface branch) {
+  LDBG("visitRegionTerminator: " << *op);
   std::optional<SmallVector<Attribute>> operands = getOperandValues(op);
   if (!operands)
     return;
@@ -425,6 +488,7 @@ void DeadCodeAnalysis::visitRegionTerminator(Operation *op,
       auto *state =
           getOrCreate<Executable>(getProgramPointBefore(&region->front()));
       propagateIfChanged(state, state->setToLive());
+      LDBG("Marked region entry block live for region: " << region);
       predecessors = getOrCreate<PredecessorState>(
           getProgramPointBefore(&region->front()));
     } else {
@@ -434,11 +498,14 @@ void DeadCodeAnalysis::visitRegionTerminator(Operation *op,
     }
     propagateIfChanged(predecessors,
                        predecessors->join(op, successor.getSuccessorInputs()));
+    LDBG("Added region terminator as predecessor for successor: "
+         << (successor.getSuccessor() ? "region entry" : "parent op"));
   }
 }
 
 void DeadCodeAnalysis::visitCallableTerminator(Operation *op,
                                                CallableOpInterface callable) {
+  LDBG("visitCallableTerminator: " << *op);
   // Add as predecessors to all callsites this return op.
   auto *callsites = getOrCreateFor<PredecessorState>(
       getProgramPointAfter(op), getProgramPointAfter(callable));
@@ -449,11 +516,15 @@ void DeadCodeAnalysis::visitCallableTerminator(Operation *op,
         getOrCreate<PredecessorState>(getProgramPointAfter(predecessor));
     if (canResolve) {
       propagateIfChanged(predecessors, predecessors->join(op));
+      LDBG("Added callable terminator as predecessor for callsite: "
+           << predecessor->getName());
     } else {
       // If the terminator is not a return-like, then conservatively assume we
       // can't resolve the predecessor.
       propagateIfChanged(predecessors,
                          predecessors->setHasUnknownPredecessors());
+      LDBG("Could not resolve callable terminator for callsite: "
+           << predecessor->getName());
     }
   }
 }
diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
index 24a78400eb84..6a12fe3acc2c 100644
--- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
@@ -10,6 +10,7 @@
 #include <cassert>
 #include <mlir/Analysis/DataFlow/LivenessAnalysis.h>
 
+#include <llvm/Support/Debug.h>
 #include <mlir/Analysis/DataFlow/SparseAnalysis.h>
 #include <mlir/Analysis/DataFlow/Utils.h>
 #include <mlir/Analysis/DataFlowFramework.h>
@@ -19,6 +20,10 @@
 #include <mlir/Interfaces/SideEffectInterfaces.h>
 #include <mlir/Support/LLVM.h>
 
+#define DEBUG_TYPE "liveness-analysis"
+#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
 using namespace mlir;
 using namespace mlir::dataflow;
 
@@ -76,28 +81,46 @@ ChangeResult Liveness::meet(const AbstractSparseLattice &other) {
 LogicalResult
 LivenessAnalysis::visitOperation(Operation *op, ArrayRef<Liveness *> operands,
                                  ArrayRef<const Liveness *> results) {
+  LLVM_DEBUG(DBGS() << "[visitOperation] Enter: ";
+             op->print(llvm::dbgs(), OpPrintingFlags().skipRegions());
+             llvm::dbgs() << "\n");
   // This marks values of type (1.a) and (4) liveness as "live".
   if (!isMemoryEffectFree(op) || op->hasTrait<OpTrait::ReturnLike>()) {
-    for (auto *operand : operands)
+    LDBG("[visitOperation] Operation has memory effects or is "
+         "return-like, marking operands live");
+    for (auto *operand : operands) {
+      LDBG(" [visitOperation] Marking operand live: "
+           << operand << " (" << operand->isLive << ")");
       propagateIfChanged(operand, operand->markLive());
+    }
   }
 
   // This marks values of type (3) liveness as "live".
   bool foundLiveResult = false;
   for (const Liveness *r : results) {
     if (r->isLive && !foundLiveResult) {
+      LDBG("[visitOperation] Found live result, "
+           "meeting all operands with result: "
+           << r);
       // It is assumed that each operand is used to compute each result of an
       // op. Thus, if at least one result is live, each operand is live.
-      for (Liveness *operand : operands)
+      for (Liveness *operand : operands) {
+        LDBG(" [visitOperation] Meeting operand: " << operand
+                                                   << " with result: " << r);
         meet(operand, *r);
+      }
       foundLiveResult = true;
     }
+    LDBG("[visitOperation] Adding dependency for result: " << r << " after op: "
+                                                           << *op);
     addDependency(const_cast<Liveness *>(r), getProgramPointAfter(op));
   }
   return success();
 }
 
 void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
+  LDBG("Visiting branch operand: " << operand.get()
+                                   << " in op: " << *operand.getOwner());
   // We know (at the moment) and assume (for the future) that `operand` is a
   // non-forwarded branch operand of a `RegionBranchOpInterface`,
   // `BranchOpInterface`, `RegionBranchTerminatorOpInterface` or return-like op.
@@ -129,6 +152,9 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
       for (Value result : op->getResults()) {
         if (getLatticeElement(result)->isLive) {
           mayLive = true;
+          LDBG("[visitBranchOperand] Non-forwarded branch "
+               "operand may be live due to live result: "
+               << result);
           break;
         }
       }
@@ -148,6 +174,8 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
     // Therefore, we conservatively consider the non-forwarded operand of the
     // branch operation may live.
     mayLive = true;
+    LDBG("[visitBranchOperand] Non-forwarded branch operand may "
+         "be live due to branch op interface");
   } else {
     Operation *parentOp = op->getParentOp();
     assert(isa<RegionBranchOpInterface>(parentOp) &&
@@ -163,6 +191,9 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
       for (Value result : parentOp->getResults()) {
         if (getLatticeElement(result)->isLive) {
           mayLive = true;
+          LDBG("[visitBranchOperand] Non-forwarded branch "
+               "operand may be live due to parent live result: "
+               << result);
           break;
         }
       }
@@ -183,6 +214,9 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
     for (Operation &nestedOp : *block) {
       if (!isMemoryEffectFree(&nestedOp)) {
         mayLive = true;
+        LDBG("Non-forwarded branch operand may be "
+             "live due to memory effect in block: "
+             << block);
         break;
       }
     }
@@ -190,6 +224,7 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
 
   if (mayLive) {
     Liveness *operandLiveness = getLatticeElement(operand.get());
+    LDBG("Marking branch operand live: " << operand.get());
     propagateIfChanged(operandLiveness, operandLiveness->markLive());
   }
 
@@ -201,6 +236,7 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
   SmallVector<const Liveness *, 4> resultsLiveness;
   for (const Value result : op->getResults())
     resultsLiveness.push_back(getLatticeElement(result));
+  LDBG("Visiting operation for non-forwarded branch operand: " << *op);
   (void)visitOperation(op, operandLiveness, resultsLiveness);
 
   // We also visit the parent op with the parent's results and this operand if
@@ -213,10 +249,14 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
   SmallVector<const Liveness *, 4> parentResultsLiveness;
   for (const Value parentResult : parentOp->getResults())
     parentResultsLiveness.push_back(getLatticeElement(parentResult));
+  LDBG("Visiting parent operation for non-forwarded branch operand: "
+       << *parentOp);
   (void)visitOperation(parentOp, operandLiveness, parentResultsLiveness);
 }
 
 void LivenessAnalysis::visitCallOperand(OpOperand &operand) {
+  LDBG("Visiting call operand: " << operand.get()
+                                 << " in op: " << *operand.getOwner());
   // We know (at the moment) and assume (for the future) that `operand` is a
   // non-forwarded call operand of an op implementing `CallOpInterface`.
   assert(isa<CallOpInterface>(operand.getOwner()) &&
@@ -229,14 +269,18 @@ void LivenessAnalysis::visitCallOperand(OpOperand &operand) {
   // This marks values of type (1.c) liveness as "live". A non-forwarded
   // call operand is live.
   Liveness *operandLiveness = getLatticeElement(operand.get());
+  LDBG("Marking call operand live: " << operand.get());
   propagateIfChanged(operandLiveness, operandLiveness->markLive());
 }
 
 void LivenessAnalysis::setToExitState(Liveness *lattice) {
+  LDBG("setToExitState for lattice: " << lattice);
   if (lattice->isLive) {
+    LDBG("Lattice already live, nothing to do");
     return;
   }
   // This marks values of type (2) liveness as "live".
+  LDBG("Marking lattice live due to exit state");
   (void)lattice->markLive();
   propagateIfChanged(lattice, ChangeResult::Change);
 }
@@ -246,11 +290,14 @@ void LivenessAnalysis::setToExitState(Liveness *lattice) {
 //===----------------------------------------------------------------------===//
 
 RunLivenessAnalysis::RunLivenessAnalysis(Operation *op) {
+  LDBG("Constructing RunLivenessAnalysis for op: " << op->getName());
   SymbolTableCollection symbolTable;
 
   loadBaselineAnalyses(solver);
   solver.load<LivenessAnalysis>(symbolTable);
+  LDBG("Initializing and running solver");
   (void)solver.initializeAndRun(op);
+  LDBG("Dumping liveness state for op");
 }
 
 const Liveness *RunLivenessAnalysis::getLiveness(Value val) {
diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp
index 08dfea8eb264..ad21ce8f1870 100644
--- a/mlir/lib/Transforms/RemoveDeadValues.cpp
+++ b/mlir/lib/Transforms/RemoveDeadValues.cpp
@@ -52,12 +52,17 @@
 #include "mlir/Transforms/FoldUtils.h"
 #include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
 #include <cassert>
 #include <cstddef>
 #include <memory>
 #include <optional>
 #include <vector>
 
+#define DEBUG_TYPE "remove-dead-values"
+#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
 namespace mlir {
 #define GEN_PASS_DEF_REMOVEDEADVALUES
 #include "mlir/Transforms/Passes.h.inc"
@@ -115,12 +120,23 @@ struct RDVFinalCleanupList {
 static bool hasLive(ValueRange values, const DenseSet<Value> &nonLiveSet,
                     RunLivenessAnalysis &la) {
   for (Value value : values) {
-    if (nonLiveSet.contains(value))
+    if (nonLiveSet.contains(value)) {
+      LDBG("Value " << value << " is already marked non-live (dead)");
       continue;
+    }
 
     const Liveness *liveness = la.getLiveness(value);
-    if (!liveness || liveness->isLive)
+    if (!liveness) {
+      LDBG("Value " << value
+                    << " has no liveness info, conservatively considered live");
       return true;
+    }
+    if (liveness->isLive) {
+      LDBG("Value " << value << " is live according to liveness analysis");
+      return true;
+    } else {
+      LDBG("Value " << value << " is dead according to liveness analysis");
+    }
   }
   return false;
 }
@@ -134,6 +150,8 @@ static BitVector markLives(ValueRange values, const DenseSet<Value> &nonLiveSet,
   for (auto [index, value] : llvm::enumerate(values)) {
     if (nonLiveSet.contains(value)) {
       lives.reset(index);
+      LDBG("Value " << value << " is already marked non-live (dead) at index "
+                    << index);
       continue;
     }
 
@@ -144,8 +162,19 @@ static BitVector markLives(ValueRange values, const DenseSet<Value> &nonLiveSet,
     // of the results of an op and we know that these new values are live
     // (because they weren't erased) and also their liveness is null because
     // liveness analysis ran before their creation.
-    if (liveness && !liveness->isLive)
+    if (!liveness) {
+      LDBG("Value " << value << " at index " << index
+                    << " has no liveness info, conservatively considered live");
+      continue;
+    }
+    if (!liveness->isLive) {
       lives.reset(index);
+      LDBG("Value " << value << " at index " << index
+                    << " is dead according to liveness analysis");
+    } else {
+      LDBG("Value " << value << " at index " << index
+                    << " is live according to liveness analysis");
+    }
   }
 
   return lives;
@@ -160,6 +189,8 @@ static void collectNonLiveValues(DenseSet<Value> &nonLiveSet, ValueRange range,
     if (!nonLive[index])
       continue;
     nonLiveSet.insert(result);
+    LDBG("Marking value " << result << " as non-live (dead) at index "
+                          << index);
   }
 }
 
@@ -229,9 +260,16 @@ static SmallVector<OpOperand *> operandsToOpOperands(OperandRange operands) {
 static void processSimpleOp(Operation *op, RunLivenessAnalysis &la,
                             DenseSet<Value> &nonLiveSet,
                             RDVFinalCleanupList &cl) {
-  if (!isMemoryEffectFree(op) || hasLive(op->getResults(), nonLiveSet, la))
+  LDBG("Processing simple op: " << *op);
+  if (!isMemoryEffectFree(op) || hasLive(op->getResults(), nonLiveSet, la)) {
+    LDBG("Simple op is not memory effect free or has live results, skipping: "
+         << *op);
     return;
+  }
 
+  LDBG("Simple op has all dead results and is memory effect free, scheduling "
+       "for removal: "
+       << *op);
   cl.operations.push_back(op);
   collectNonLiveValues(nonLiveSet, op->getResults(),
                        BitVector(op->getNumResults(), true));
@@ -250,8 +288,12 @@ static void processSimpleOp(Operation *op, RunLivenessAnalysis &la,
 static void processFuncOp(FunctionOpInterface funcOp, Operation *module,
                           RunLivenessAnalysis &la, DenseSet<Value> &nonLiveSet,
                           RDVFinalCleanupList &cl) {
-  if (funcOp.isPublic() || funcOp.isExternal())
+  LDBG("Processing function op: " << funcOp.getOperation()->getName());
+  if (funcOp.isPublic() || funcOp.isExternal()) {
+    LDBG("Function is public or external, skipping: "
+         << funcOp.getOperation()->getName());
     return;
+  }
 
   // Get the list of unnecessary (non-live) arguments in `nonLiveArgs`.
   SmallVector<Value> arguments(funcOp.getArguments());
@@ -369,6 +411,9 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp,
                                   RunLivenessAnalysis &la,
                                   DenseSet<Value> &nonLiveSet,
                                   RDVFinalCleanupList &cl) {
+  LLVM_DEBUG(DBGS() << "Processing region branch op: "; regionBranchOp->print(
+      llvm::dbgs(), OpPrintingFlags().skipRegions());
+             llvm::dbgs() << "\n");
   // Mark live results of `regionBranchOp` in `liveResults`.
   auto markLiveResults = [&](BitVector &liveResults) {
     liveResults = markLives(regionBranchOp->getResults(), nonLiveSet, la);
@@ -654,6 +699,7 @@ static void processRegionBranchOp(RegionBranchOpInterface regionBranchOp,
 static void processBranchOp(BranchOpInterface branchOp, RunLivenessAnalysis &la,
                             DenseSet<Value> &nonLiveSet,
                             RDVFinalCleanupList &cl) {
+  LDBG("Processing branch op: " << *branchOp);
   unsigned numSuccessors = branchOp->getNumSuccessors();
 
   for (unsigned succIdx = 0; succIdx < numSuccessors; ++succIdx) {

From 8583882bdcabc37982e76154d65d07a9aecd2a16 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sun, 22 Jun 2025 10:52:13 +0100
Subject: [PATCH 1188/1322] [AArch64] Remove unnecessary DL variable. NFC

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8c6f272a8c8d..15e38e6cb240 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6369,7 +6369,6 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
         // the backend to generate a umull.
         unsigned Bitwidth = I->getType()->getScalarSizeInBits();
         APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
-        const DataLayout &DL = I->getDataLayout();
         if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
           continue;
         NumZExts++;

From f78819aeef32e50ac3fec9a175b70a971b7c10e5 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Sun, 22 Jun 2025 17:52:30 +0800
Subject: [PATCH 1189/1322] Revert "Revert "[RISCV] Remove B and Zbc extension
 from Andes series cpus." (#144402)"

Since the fix https://github.com/llvm/llvm-project/pull/144848 for post-commit CI failure
has landed.

This reverts commit f83d09a1f60aee28a8ed9020cd72971ec2885f24.
---
 .../Driver/print-enabled-extensions/riscv-andes-a25.c     | 7 +------
 .../Driver/print-enabled-extensions/riscv-andes-a45.c     | 6 +-----
 .../Driver/print-enabled-extensions/riscv-andes-ax25.c    | 7 +------
 .../Driver/print-enabled-extensions/riscv-andes-ax45.c    | 6 +-----
 .../Driver/print-enabled-extensions/riscv-andes-n45.c     | 6 +-----
 .../Driver/print-enabled-extensions/riscv-andes-nx45.c    | 6 +-----
 llvm/lib/Target/RISCV/RISCVProcessors.td                  | 8 --------
 llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s              | 2 +-
 8 files changed, 7 insertions(+), 41 deletions(-)

diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
index d8b3848d8452..cfb4d0ed58d1 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -19,12 +18,8 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
index a0a1c3591140..3c3c554dffc5 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -19,11 +18,8 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
index 3f933ecd8ac8..70100a0a8df1 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,12 +17,8 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
index 6460d701411b..d2b1a32e321e 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,11 +17,8 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
index 4d9c514b756e..1a2c30bfc7a2 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -19,11 +18,8 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
index 5eaada3f9e16..50c38da3bd03 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,11 +17,8 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 32f4ab607a34..d7e6c71ea062 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -703,8 +703,6 @@ def ANDES_A25 : RISCVProcessorModel<"andes-a25",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
-                                     FeatureStdExtB,
-                                     FeatureStdExtZbc,
                                      FeatureVendorXAndesPerf]>;
 
 def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
@@ -718,8 +716,6 @@ def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
-                                      FeatureStdExtB,
-                                      FeatureStdExtZbc,
                                       FeatureVendorXAndesPerf]>;
 
 defvar Andes45TuneFeatures = [TuneAndes45,
@@ -741,7 +737,6 @@ def ANDES_N45 : RISCVProcessorModel<"andes-n45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
-                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -756,7 +751,6 @@ def ANDES_NX45 : RISCVProcessorModel<"andes-nx45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
-                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
 
@@ -771,7 +765,6 @@ def ANDES_A45 : RISCVProcessorModel<"andes-a45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
-                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -786,6 +779,5 @@ def ANDES_AX45 : RISCVProcessorModel<"andes-ax45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
-                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
diff --git a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
index f6dc6eef3f0f..d90dce8c5c3f 100644
--- a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
+++ b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+zbc -timeline -iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+b,+zbc -timeline -iterations=1 < %s | FileCheck %s
 
 # Two ALUs without dependency can be dispatched in the same cycle.
 add a0, a0, a0

From c7d9eabf4a9c1c8c70b5976ea775fd3d143e93f7 Mon Sep 17 00:00:00 2001
From: Patryk Wychowaniec <pwychowaniec@pm.me>
Date: Sun, 22 Jun 2025 12:18:00 +0200
Subject: [PATCH 1190/1322] [AVR] Don't apply post-indexing on mismatched
 pointers (#145224)

fixes https://github.com/llvm/llvm-project/issues/143247
---
 llvm/lib/Target/AVR/AVRISelLowering.cpp |  9 +++++++
 llvm/test/CodeGen/AVR/bug-143247.ll     | 36 +++++++++++++++++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 llvm/test/CodeGen/AVR/bug-143247.ll

diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 9747ad0c5cd5..3955f2a252e7 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -1071,14 +1071,17 @@ bool AVRTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                    ISD::MemIndexedMode &AM,
                                                    SelectionDAG &DAG) const {
   EVT VT;
+  SDValue Ptr;
   SDLoc DL(N);
 
   if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
     VT = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
     if (LD->getExtensionType() != ISD::NON_EXTLOAD)
       return false;
   } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
     VT = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
     // We can not store to program memory.
     if (AVR::isProgramMemoryAccess(ST))
       return false;
@@ -1115,6 +1118,12 @@ bool AVRTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
         return false;
 
     Base = Op->getOperand(0);
+
+    // Post-indexing updates the base, so it's not a valid transform
+    // if that's not the same as the load's pointer.
+    if (Ptr != Base)
+      return false;
+
     Offset = DAG.getConstant(RHSC, DL, MVT::i8);
     AM = ISD::POST_INC;
 
diff --git a/llvm/test/CodeGen/AVR/bug-143247.ll b/llvm/test/CodeGen/AVR/bug-143247.ll
new file mode 100644
index 000000000000..07c4c6562c95
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/bug-143247.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O=2 -mtriple=avr-none --mcpu=avr128db28 -verify-machineinstrs | FileCheck %s
+
+declare dso_local void @nil(i16 noundef) addrspace(1)
+
+define void @complex_sbi() {
+; CHECK-LABEL: complex_sbi:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    push r16
+; CHECK-NEXT:    push r17
+; CHECK-NEXT:    ldi r24, 0
+; CHECK-NEXT:    ldi r25, 0
+; CHECK-NEXT:  .LBB0_1: ; %while.cond
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sbi 1, 7
+; CHECK-NEXT:    adiw r24, 1
+; CHECK-NEXT:    movw r16, r24
+; CHECK-NEXT:    andi r24, 15
+; CHECK-NEXT:    andi r25, 0
+; CHECK-NEXT:    adiw r24, 1
+; CHECK-NEXT:    call nil
+; CHECK-NEXT:    movw r24, r16
+; CHECK-NEXT:    rjmp .LBB0_1
+entry:
+  br label %while.cond
+while.cond:
+  %s.0 = phi i16 [ 0, %entry ], [ %inc, %while.cond ]
+  %inc = add nuw nsw i16 %s.0, 1
+  %0 = load volatile i8, ptr inttoptr (i16 1 to ptr), align 1
+  %or = or i8 %0, -128
+  store volatile i8 %or, ptr inttoptr (i16 1 to ptr), align 1
+  %and = and i16 %inc, 15
+  %add = add nuw nsw i16 %and, 1
+  tail call addrspace(1) void @nil(i16 noundef %add)
+  br label %while.cond
+}

From d2c0451d05d95c98727d2447abd1cb4bfed90890 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Sun, 22 Jun 2025 11:40:58 +0100
Subject: [PATCH 1191/1322] [lldb][DWAFASTParserClang][NFC] Rename
 GetCXXObjectParameter to GetObjectParameter

Since this is used for Objective-C too.
---
 .../SymbolFile/DWARF/DWARFASTParserClang.cpp     | 12 ++++--------
 .../SymbolFile/DWARF/DWARFASTParserClang.h       | 16 +++++++++++++---
 .../DWARF/DWARFASTParserClangTests.cpp           |  7 +++----
 3 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 4f79c8aa3f81..a4cb608edd8b 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -159,13 +159,9 @@ static bool TagIsRecordType(dw_tag_t tag) {
   }
 }
 
-/// Get the object parameter DIE if one exists, otherwise returns
-/// a default DWARFDIE. If \c containing_decl_ctx is not a valid
-/// C++ declaration context for class methods, assume no object
-/// parameter exists for the given \c subprogram.
 DWARFDIE
-DWARFASTParserClang::GetCXXObjectParameter(const DWARFDIE &subprogram,
-                                           const DWARFDIE &decl_ctx_die) {
+DWARFASTParserClang::GetObjectParameter(const DWARFDIE &subprogram,
+                                        const DWARFDIE &decl_ctx_die) {
   assert(subprogram);
   assert(subprogram.Tag() == DW_TAG_subprogram ||
          subprogram.Tag() == DW_TAG_inlined_subroutine ||
@@ -1305,7 +1301,7 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
   clang::CallingConv calling_convention =
       ConvertDWARFCallingConventionToClang(attrs);
 
-  const DWARFDIE object_parameter = GetCXXObjectParameter(die, decl_ctx_die);
+  const DWARFDIE object_parameter = GetObjectParameter(die, decl_ctx_die);
 
   // clang_type will get the function prototype clang type after this
   // call
@@ -2417,7 +2413,7 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) {
   assert(containing_decl_ctx);
 
   const unsigned cv_quals =
-      GetCXXMethodCVQuals(die, GetCXXObjectParameter(die, decl_ctx_die));
+      GetCXXMethodCVQuals(die, GetObjectParameter(die, decl_ctx_die));
 
   ParseChildParameters(containing_decl_ctx, die, is_variadic,
                        has_template_params, param_types, param_names);
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index 111604ce4068..e57fc503d34c 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -112,9 +112,19 @@ public:
   void MapDeclDIEToDefDIE(const lldb_private::plugin::dwarf::DWARFDIE &decl_die,
                           const lldb_private::plugin::dwarf::DWARFDIE &def_die);
 
-  lldb_private::plugin::dwarf::DWARFDIE GetCXXObjectParameter(
-      const lldb_private::plugin::dwarf::DWARFDIE &subprogram,
-      const lldb_private::plugin::dwarf::DWARFDIE &decl_ctx_die);
+  /// Get the object parameter DIE if one exists, otherwise returns
+  /// a default DWARFDIE.
+  ///
+  /// \param[in] subprogram DIE of function for which to get the object
+  /// parameter. \param[in] containing_decl_ctx DIE representing declaration
+  /// context of \a subprogram. If this DIE isn't a valid declaration context
+  /// for class methods, assume no object parameter exists.
+  ///
+  /// \returns DIE of object parameter if one exists.
+  ///
+  lldb_private::plugin::dwarf::DWARFDIE
+  GetObjectParameter(const lldb_private::plugin::dwarf::DWARFDIE &subprogram,
+                     const lldb_private::plugin::dwarf::DWARFDIE &decl_ctx_die);
 
 protected:
   /// Protected typedefs and members.
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
index 2d4b79fed4a5..f18e938dbc4c 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
@@ -898,8 +898,7 @@ DWARF:
     auto param_die = decl_die.GetFirstChild();
     ASSERT_TRUE(param_die.IsValid());
 
-    EXPECT_EQ(param_die,
-              ast_parser.GetCXXObjectParameter(decl_die, context_die));
+    EXPECT_EQ(param_die, ast_parser.GetObjectParameter(decl_die, context_die));
   }
 
   {
@@ -912,8 +911,8 @@ DWARF:
     auto param_die = subprogram_definition.GetFirstChild();
     ASSERT_TRUE(param_die.IsValid());
 
-    EXPECT_EQ(param_die, ast_parser.GetCXXObjectParameter(subprogram_definition,
-                                                          context_die));
+    EXPECT_EQ(param_die, ast_parser.GetObjectParameter(subprogram_definition,
+                                                       context_die));
   }
 }
 

From 26390f22b8aa90c422b9e39a8295a7a0a6ef33ba Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 22 Jun 2025 13:30:45 +0100
Subject: [PATCH 1192/1322] [VectorCombine] foldShuffleOfShuffles - fold
 shuffle(shuffle(x,y),poison) length changing masks (#144690)

The shuffle merging code assumes that the shuffle sources are all the
same type, which fails if we've changed length and don't have 2 inner
shuffles. We already handle length-changing shuffles if we do have 2
inner shuffles.

This patch creates a fake "all poison" shuffle mask and reuses the other
shuffle's sources, which can be safely used with the existing merge
code.

The alternative was a considerable refactor of the merge code to account
for different vector widths......

Fixes #144656
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 15 ++++++--
 .../AArch64/shuffletoidentity.ll              |  6 ++--
 .../X86/extract-insert-poison.ll              | 36 +++++++++----------
 .../VectorCombine/X86/load-inseltpoison.ll    |  3 +-
 4 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 95e1f96c71b4..3e459f5ea4ce 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2282,6 +2282,17 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
   if (!Match0 && !Match1)
     return false;
 
+  // If the outer shuffle is a permute, then create a fake inner all-poison
+  // shuffle. This is easier than accounting for length-changing shuffles below.
+  SmallVector<int, 16> PoisonMask1;
+  if (!Match1 && isa<PoisonValue>(OuterV1)) {
+    X1 = X0;
+    Y1 = Y0;
+    PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
+    InnerMask1 = PoisonMask1;
+    Match1 = true; // fake match
+  }
+
   X0 = Match0 ? X0 : OuterV0;
   Y0 = Match0 ? Y0 : OuterV0;
   X1 = Match1 ? X1 : OuterV1;
@@ -2356,11 +2367,11 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
   // Try to merge the shuffles if the new shuffle is not costly.
   InstructionCost InnerCost0 = 0;
   if (Match0)
-    InnerCost0 = TTI.getInstructionCost(cast<Instruction>(OuterV0), CostKind);
+    InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
 
   InstructionCost InnerCost1 = 0;
   if (Match1)
-    InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind);
+    InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
 
   InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
 
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 1c9bc77ac3be..1c128c8f56a0 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -262,10 +262,8 @@ define <8 x half> @splatandidentity(<8 x half> %a, <8 x half> %b) {
 
 define <8 x half> @splattwice(<8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: @splattwice(
-; CHECK-NEXT:    [[AS:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BS:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[AS]], <4 x half> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x half> [[BS]], <4 x half> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[R:%.*]] = fadd <8 x half> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <8 x half> [[R]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
index 642d07a8f325..e85c092b1b21 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
@@ -32,8 +32,7 @@ define <4 x double> @src_ins2_v4f64_ext0_v2f64(<4 x double> %a, <2 x double> %b)
 ; SSE-NEXT:    ret <4 x double> [[INS]]
 ;
 ; AVX-LABEL: @src_ins2_v4f64_ext0_v2f64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
+; AVX-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
 ; AVX-NEXT:    ret <4 x double> [[INS]]
 ;
   %ext = extractelement <2 x double> %b, i32 0
@@ -48,8 +47,7 @@ define <4 x double> @src_ins3_v4f64_ext0_v2f64(<4 x double> %a, <2 x double> %b)
 ; SSE-NEXT:    ret <4 x double> [[INS]]
 ;
 ; AVX-LABEL: @src_ins3_v4f64_ext0_v2f64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>
+; AVX-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>
 ; AVX-NEXT:    ret <4 x double> [[INS]]
 ;
   %ext = extractelement <2 x double> %b, i32 0
@@ -86,8 +84,7 @@ define <4 x double> @src_ins2_v4f64_ext1_v2f64(<4 x double> %a, <2 x double> %b)
 ; SSE-NEXT:    ret <4 x double> [[INS]]
 ;
 ; AVX-LABEL: @src_ins2_v4f64_ext1_v2f64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
-; AVX-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 poison>
+; AVX-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 poison>
 ; AVX-NEXT:    ret <4 x double> [[INS]]
 ;
   %ext = extractelement <2 x double> %b, i32 1
@@ -96,10 +93,14 @@ define <4 x double> @src_ins2_v4f64_ext1_v2f64(<4 x double> %a, <2 x double> %b)
 }
 
 define <4 x double> @src_ins3_v4f64_ext1_v2f64(<4 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: @src_ins3_v4f64_ext1_v2f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT:    ret <4 x double> [[INS]]
+; SSE-LABEL: @src_ins3_v4f64_ext1_v2f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
+; SSE-NEXT:    ret <4 x double> [[INS]]
+;
+; AVX-LABEL: @src_ins3_v4f64_ext1_v2f64(
+; AVX-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
+; AVX-NEXT:    ret <4 x double> [[INS]]
 ;
   %ext = extractelement <2 x double> %b, i32 1
   %ins = insertelement <4 x double> poison, double %ext, i32 3
@@ -119,8 +120,7 @@ define <2 x double> @src_ins0_v2f64_ext0_v4f64(<2 x double> %a, <4 x double> %b)
 
 define <2 x double> @src_ins0_v2f64_ext1_v4f64(<2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @src_ins0_v2f64_ext1_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 1>
-; CHECK-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 poison>
 ; CHECK-NEXT:    ret <2 x double> [[INS]]
 ;
   %ext = extractelement <4 x double> %b, i32 1
@@ -152,8 +152,7 @@ define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b)
 
 define <2 x double> @src_ins1_v2f64_ext0_v4f64(<2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @src_ins1_v2f64_ext0_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    ret <2 x double> [[INS]]
 ;
   %ext = extractelement <4 x double> %b, i32 0
@@ -164,8 +163,7 @@ define <2 x double> @src_ins1_v2f64_ext0_v4f64(<2 x double> %a, <4 x double> %b)
 define <2 x double> @src_ins1_v2f64_ext1_v4f64(<2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @src_ins1_v2f64_ext1_v4f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 1>
-; CHECK-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
-; CHECK-NEXT:    ret <2 x double> [[INS]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %ext = extractelement <4 x double> %b, i32 1
   %ins = insertelement <2 x double> poison, double %ext, i32 1
@@ -174,8 +172,7 @@ define <2 x double> @src_ins1_v2f64_ext1_v4f64(<2 x double> %a, <4 x double> %b)
 
 define <2 x double> @src_ins1_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @src_ins1_v2f64_ext2_v4f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 2, i32 poison>
-; SSE-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; SSE-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 2>
 ; SSE-NEXT:    ret <2 x double> [[INS]]
 ;
 ; AVX-LABEL: @src_ins1_v2f64_ext2_v4f64(
@@ -190,8 +187,7 @@ define <2 x double> @src_ins1_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
 
 define <2 x double> @src_ins1_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @src_ins1_v2f64_ext3_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
-; CHECK-NEXT:    [[INS:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
 ; CHECK-NEXT:    ret <2 x double> [[INS]]
 ;
   %ext = extractelement <4 x double> %b, i32 3
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 73476308916f..40437ca34522 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -578,8 +578,7 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer
 ; CHECK-NEXT:    [[L:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    store <1 x i32> [[L]], ptr [[STORE_PTR:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i32> [[L]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    ret <8 x i32> [[R]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
 ;
   %l = load <1 x i32>, ptr %p, align 4
   store <1 x i32> %l, ptr %store_ptr

From 0dc0aeb14f1e38b84f4abca9d170e971e28d2ec3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 22 Jun 2025 13:48:24 +0100
Subject: [PATCH 1193/1322] [LV] Add additional tests for replicating calls
 returning structs.

Add additional test coverage for replicating calls return structs, in
particular cases where the number of struct elements does not match the
VF.

Extra test coverage for
https://github.com/llvm/llvm-project/pull/142433.
---
 .../LoopVectorize/struct-return-replicate.ll  | 484 ++++++++++++++++++
 .../Transforms/LoopVectorize/struct-return.ll |  53 --
 2 files changed, 484 insertions(+), 53 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll

diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
new file mode 100644
index 000000000000..fe53334cb25a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
@@ -0,0 +1,484 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "middle.block:" --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefix=VF4 %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefix=VF2IC2 %s
+
+define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonly %out_a) {
+; VF4-LABEL: define void @struct_return_1xi64_replicate(
+; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
+; VF4-NEXT:    [[TMP3:%.*]] = tail call { i64 } @fn1(float [[TMP2]]) #[[ATTR0:[0-9]+]]
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = tail call { i64 } @fn1(float [[TMP4]]) #[[ATTR0]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
+; VF4-NEXT:    [[TMP7:%.*]] = tail call { i64 } @fn1(float [[TMP6]]) #[[ATTR0]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
+; VF4-NEXT:    [[TMP9:%.*]] = tail call { i64 } @fn1(float [[TMP8]]) #[[ATTR0]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractvalue { i64 } [[TMP3]], 0
+; VF4-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0
+; VF4-NEXT:    [[TMP12:%.*]] = insertvalue { <4 x i64> } poison, <4 x i64> [[TMP11]], 0
+; VF4-NEXT:    [[TMP13:%.*]] = extractvalue { i64 } [[TMP5]], 0
+; VF4-NEXT:    [[TMP14:%.*]] = extractvalue { <4 x i64> } [[TMP12]], 0
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP13]], i32 1
+; VF4-NEXT:    [[TMP16:%.*]] = insertvalue { <4 x i64> } [[TMP12]], <4 x i64> [[TMP15]], 0
+; VF4-NEXT:    [[TMP17:%.*]] = extractvalue { i64 } [[TMP7]], 0
+; VF4-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x i64> } [[TMP16]], 0
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP17]], i32 2
+; VF4-NEXT:    [[TMP20:%.*]] = insertvalue { <4 x i64> } [[TMP16]], <4 x i64> [[TMP19]], 0
+; VF4-NEXT:    [[TMP21:%.*]] = extractvalue { i64 } [[TMP9]], 0
+; VF4-NEXT:    [[TMP22:%.*]] = extractvalue { <4 x i64> } [[TMP20]], 0
+; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP21]], i32 3
+; VF4-NEXT:    [[TMP24:%.*]] = insertvalue { <4 x i64> } [[TMP20]], <4 x i64> [[TMP23]], 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractvalue { <4 x i64> } [[TMP24]], 0
+; VF4-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[TMP26]], i32 0
+; VF4-NEXT:    store <4 x i64> [[TMP25]], ptr [[TMP27]], align 4
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_1xi64_replicate(
+; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) {
+; VF2IC2-NEXT:  [[ENTRY:.*:]]
+; VF2IC2-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF2IC2:       [[VECTOR_PH]]:
+; VF2IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC2:       [[VECTOR_BODY]]:
+; VF2IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF2IC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; VF2IC2-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT:    [[TMP4:%.*]] = tail call { i64 } @fn1(float [[TMP3]]) #[[ATTR0:[0-9]+]]
+; VF2IC2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT:    [[TMP6:%.*]] = tail call { i64 } @fn1(float [[TMP5]]) #[[ATTR0]]
+; VF2IC2-NEXT:    [[TMP7:%.*]] = extractvalue { i64 } [[TMP4]], 0
+; VF2IC2-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
+; VF2IC2-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP8]], 0
+; VF2IC2-NEXT:    [[TMP10:%.*]] = extractvalue { i64 } [[TMP6]], 0
+; VF2IC2-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i64> } [[TMP9]], 0
+; VF2IC2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1
+; VF2IC2-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x i64> } [[TMP9]], <2 x i64> [[TMP12]], 0
+; VF2IC2-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT:    [[TMP15:%.*]] = tail call { i64 } @fn1(float [[TMP14]]) #[[ATTR0]]
+; VF2IC2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT:    [[TMP17:%.*]] = tail call { i64 } @fn1(float [[TMP16]]) #[[ATTR0]]
+; VF2IC2-NEXT:    [[TMP18:%.*]] = extractvalue { i64 } [[TMP15]], 0
+; VF2IC2-NEXT:    [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0
+; VF2IC2-NEXT:    [[TMP20:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP19]], 0
+; VF2IC2-NEXT:    [[TMP21:%.*]] = extractvalue { i64 } [[TMP17]], 0
+; VF2IC2-NEXT:    [[TMP22:%.*]] = extractvalue { <2 x i64> } [[TMP20]], 0
+; VF2IC2-NEXT:    [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i32 1
+; VF2IC2-NEXT:    [[TMP24:%.*]] = insertvalue { <2 x i64> } [[TMP20]], <2 x i64> [[TMP23]], 0
+; VF2IC2-NEXT:    [[TMP25:%.*]] = extractvalue { <2 x i64> } [[TMP13]], 0
+; VF2IC2-NEXT:    [[TMP26:%.*]] = extractvalue { <2 x i64> } [[TMP24]], 0
+; VF2IC2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 0
+; VF2IC2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 2
+; VF2IC2-NEXT:    store <2 x i64> [[TMP25]], ptr [[TMP28]], align 4
+; VF2IC2-NEXT:    store <2 x i64> [[TMP26]], ptr [[TMP29]], align 4
+; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT:    br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2IC2:       [[MIDDLE_BLOCK]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { i64 } @fn1(float %in_val) #0
+  %extract_a = extractvalue { i64 } %call, 0
+  %arrayidx2 = getelementptr inbounds i64, ptr %out_a, i64 %iv
+  store i64 %extract_a, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; VF4-LABEL: define void @struct_return_2xf32_replicate(
+; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
+; VF4-NEXT:    [[TMP3:%.*]] = tail call { float, float } @fn2(float [[TMP2]]) #[[ATTR1:[0-9]+]]
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = tail call { float, float } @fn2(float [[TMP4]]) #[[ATTR1]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2
+; VF4-NEXT:    [[TMP7:%.*]] = tail call { float, float } @fn2(float [[TMP6]]) #[[ATTR1]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3
+; VF4-NEXT:    [[TMP9:%.*]] = tail call { float, float } @fn2(float [[TMP8]]) #[[ATTR1]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractvalue { float, float } [[TMP3]], 0
+; VF4-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0
+; VF4-NEXT:    [[TMP12:%.*]] = insertvalue { <4 x float>, <4 x float> } poison, <4 x float> [[TMP11]], 0
+; VF4-NEXT:    [[TMP13:%.*]] = extractvalue { float, float } [[TMP3]], 1
+; VF4-NEXT:    [[TMP14:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP12]], 1
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP13]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP12]], <4 x float> [[TMP15]], 1
+; VF4-NEXT:    [[TMP17:%.*]] = extractvalue { float, float } [[TMP5]], 0
+; VF4-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP16]], 0
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP17]], i32 1
+; VF4-NEXT:    [[TMP20:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP16]], <4 x float> [[TMP19]], 0
+; VF4-NEXT:    [[TMP21:%.*]] = extractvalue { float, float } [[TMP5]], 1
+; VF4-NEXT:    [[TMP22:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP20]], 1
+; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP24:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP20]], <4 x float> [[TMP23]], 1
+; VF4-NEXT:    [[TMP25:%.*]] = extractvalue { float, float } [[TMP7]], 0
+; VF4-NEXT:    [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP24]], 0
+; VF4-NEXT:    [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP25]], i32 2
+; VF4-NEXT:    [[TMP28:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP24]], <4 x float> [[TMP27]], 0
+; VF4-NEXT:    [[TMP29:%.*]] = extractvalue { float, float } [[TMP7]], 1
+; VF4-NEXT:    [[TMP30:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP28]], 1
+; VF4-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP29]], i32 2
+; VF4-NEXT:    [[TMP32:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP28]], <4 x float> [[TMP31]], 1
+; VF4-NEXT:    [[TMP33:%.*]] = extractvalue { float, float } [[TMP9]], 0
+; VF4-NEXT:    [[TMP34:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP32]], 0
+; VF4-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP33]], i32 3
+; VF4-NEXT:    [[TMP36:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP32]], <4 x float> [[TMP35]], 0
+; VF4-NEXT:    [[TMP37:%.*]] = extractvalue { float, float } [[TMP9]], 1
+; VF4-NEXT:    [[TMP38:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP36]], 1
+; VF4-NEXT:    [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP37]], i32 3
+; VF4-NEXT:    [[TMP40:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP36]], <4 x float> [[TMP39]], 1
+; VF4-NEXT:    [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 0
+; VF4-NEXT:    [[TMP42:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 1
+; VF4-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i32 0
+; VF4-NEXT:    store <4 x float> [[TMP41]], ptr [[TMP44]], align 4
+; VF4-NEXT:    [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0
+; VF4-NEXT:    store <4 x float> [[TMP42]], ptr [[TMP46]], align 4
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT:    br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_2xf32_replicate(
+; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; VF2IC2-NEXT:  [[ENTRY:.*:]]
+; VF2IC2-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF2IC2:       [[VECTOR_PH]]:
+; VF2IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC2:       [[VECTOR_BODY]]:
+; VF2IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
+; VF2IC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; VF2IC2-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT:    [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR1:[0-9]+]]
+; VF2IC2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT:    [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR1]]
+; VF2IC2-NEXT:    [[TMP7:%.*]] = extractvalue { float, float } [[TMP4]], 0
+; VF2IC2-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
+; VF2IC2-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP8]], 0
+; VF2IC2-NEXT:    [[TMP10:%.*]] = extractvalue { float, float } [[TMP4]], 1
+; VF2IC2-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP9]], 1
+; VF2IC2-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP10]], i32 0
+; VF2IC2-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP9]], <2 x float> [[TMP12]], 1
+; VF2IC2-NEXT:    [[TMP14:%.*]] = extractvalue { float, float } [[TMP6]], 0
+; VF2IC2-NEXT:    [[TMP15:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP13]], 0
+; VF2IC2-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i32 1
+; VF2IC2-NEXT:    [[TMP17:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP13]], <2 x float> [[TMP16]], 0
+; VF2IC2-NEXT:    [[TMP18:%.*]] = extractvalue { float, float } [[TMP6]], 1
+; VF2IC2-NEXT:    [[TMP19:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP17]], 1
+; VF2IC2-NEXT:    [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i32 1
+; VF2IC2-NEXT:    [[TMP21:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP17]], <2 x float> [[TMP20]], 1
+; VF2IC2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT:    [[TMP23:%.*]] = tail call { float, float } @fn2(float [[TMP22]]) #[[ATTR1]]
+; VF2IC2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT:    [[TMP25:%.*]] = tail call { float, float } @fn2(float [[TMP24]]) #[[ATTR1]]
+; VF2IC2-NEXT:    [[TMP26:%.*]] = extractvalue { float, float } [[TMP23]], 0
+; VF2IC2-NEXT:    [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i32 0
+; VF2IC2-NEXT:    [[TMP28:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP27]], 0
+; VF2IC2-NEXT:    [[TMP29:%.*]] = extractvalue { float, float } [[TMP23]], 1
+; VF2IC2-NEXT:    [[TMP30:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP28]], 1
+; VF2IC2-NEXT:    [[TMP31:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i32 0
+; VF2IC2-NEXT:    [[TMP32:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP28]], <2 x float> [[TMP31]], 1
+; VF2IC2-NEXT:    [[TMP33:%.*]] = extractvalue { float, float } [[TMP25]], 0
+; VF2IC2-NEXT:    [[TMP34:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP32]], 0
+; VF2IC2-NEXT:    [[TMP35:%.*]] = insertelement <2 x float> [[TMP34]], float [[TMP33]], i32 1
+; VF2IC2-NEXT:    [[TMP36:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP32]], <2 x float> [[TMP35]], 0
+; VF2IC2-NEXT:    [[TMP37:%.*]] = extractvalue { float, float } [[TMP25]], 1
+; VF2IC2-NEXT:    [[TMP38:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP36]], 1
+; VF2IC2-NEXT:    [[TMP39:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i32 1
+; VF2IC2-NEXT:    [[TMP40:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP36]], <2 x float> [[TMP39]], 1
+; VF2IC2-NEXT:    [[TMP41:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 0
+; VF2IC2-NEXT:    [[TMP42:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 0
+; VF2IC2-NEXT:    [[TMP43:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 1
+; VF2IC2-NEXT:    [[TMP44:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 1
+; VF2IC2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0
+; VF2IC2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 2
+; VF2IC2-NEXT:    store <2 x float> [[TMP41]], ptr [[TMP46]], align 4
+; VF2IC2-NEXT:    store <2 x float> [[TMP42]], ptr [[TMP47]], align 4
+; VF2IC2-NEXT:    [[TMP48:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
+; VF2IC2-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 2
+; VF2IC2-NEXT:    store <2 x float> [[TMP43]], ptr [[TMP49]], align 4
+; VF2IC2-NEXT:    store <2 x float> [[TMP44]], ptr [[TMP50]], align 4
+; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT:    br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2IC2:       [[MIDDLE_BLOCK]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @fn2(float %in_val) #1
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
+define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonly %dst.a, ptr noalias %dst.b, ptr noalias %dst.c) {
+; VF4-LABEL: define void @struct_return_3xi32_replicate(
+; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[DST_A:%.*]], ptr noalias [[DST_B:%.*]], ptr noalias [[DST_C:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; VF4-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; VF4-NEXT:    [[TMP3:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP2]]) #[[ATTR2:[0-9]+]]
+; VF4-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; VF4-NEXT:    [[TMP5:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP4]]) #[[ATTR2]]
+; VF4-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; VF4-NEXT:    [[TMP7:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP6]]) #[[ATTR2]]
+; VF4-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; VF4-NEXT:    [[TMP9:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP8]]) #[[ATTR2]]
+; VF4-NEXT:    [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 0
+; VF4-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0
+; VF4-NEXT:    [[TMP12:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP11]], 0
+; VF4-NEXT:    [[TMP13:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 1
+; VF4-NEXT:    [[TMP14:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 1
+; VF4-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP13]], i32 0
+; VF4-NEXT:    [[TMP16:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], <4 x i32> [[TMP15]], 1
+; VF4-NEXT:    [[TMP17:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 2
+; VF4-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], 2
+; VF4-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP17]], i32 0
+; VF4-NEXT:    [[TMP20:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], <4 x i32> [[TMP19]], 2
+; VF4-NEXT:    [[TMP21:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 0
+; VF4-NEXT:    [[TMP22:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], 0
+; VF4-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP21]], i32 1
+; VF4-NEXT:    [[TMP24:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], <4 x i32> [[TMP23]], 0
+; VF4-NEXT:    [[TMP25:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 1
+; VF4-NEXT:    [[TMP26:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP24]], 1
+; VF4-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP25]], i32 1
+; VF4-NEXT:    [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP24]], <4 x i32> [[TMP27]], 1
+; VF4-NEXT:    [[TMP29:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 2
+; VF4-NEXT:    [[TMP30:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], 2
+; VF4-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP29]], i32 1
+; VF4-NEXT:    [[TMP32:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], <4 x i32> [[TMP31]], 2
+; VF4-NEXT:    [[TMP33:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 0
+; VF4-NEXT:    [[TMP34:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], 0
+; VF4-NEXT:    [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP33]], i32 2
+; VF4-NEXT:    [[TMP36:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], <4 x i32> [[TMP35]], 0
+; VF4-NEXT:    [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 1
+; VF4-NEXT:    [[TMP38:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], 1
+; VF4-NEXT:    [[TMP39:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP37]], i32 2
+; VF4-NEXT:    [[TMP40:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], <4 x i32> [[TMP39]], 1
+; VF4-NEXT:    [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 2
+; VF4-NEXT:    [[TMP42:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP40]], 2
+; VF4-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP41]], i32 2
+; VF4-NEXT:    [[TMP44:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP40]], <4 x i32> [[TMP43]], 2
+; VF4-NEXT:    [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 0
+; VF4-NEXT:    [[TMP46:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], 0
+; VF4-NEXT:    [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP45]], i32 3
+; VF4-NEXT:    [[TMP48:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], <4 x i32> [[TMP47]], 0
+; VF4-NEXT:    [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 1
+; VF4-NEXT:    [[TMP50:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], 1
+; VF4-NEXT:    [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[TMP49]], i32 3
+; VF4-NEXT:    [[TMP52:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], <4 x i32> [[TMP51]], 1
+; VF4-NEXT:    [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 2
+; VF4-NEXT:    [[TMP54:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], 2
+; VF4-NEXT:    [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP53]], i32 3
+; VF4-NEXT:    [[TMP56:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], <4 x i32> [[TMP55]], 2
+; VF4-NEXT:    [[TMP57:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 0
+; VF4-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 0
+; VF4-NEXT:    store <4 x i32> [[TMP57]], ptr [[TMP59]], align 4
+; VF4-NEXT:    [[TMP60:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 1
+; VF4-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP61]], i32 0
+; VF4-NEXT:    store <4 x i32> [[TMP60]], ptr [[TMP62]], align 4
+; VF4-NEXT:    [[TMP63:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 2
+; VF4-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]]
+; VF4-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
+; VF4-NEXT:    store <4 x i32> [[TMP63]], ptr [[TMP65]], align 4
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4-NEXT:    br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+;
+; VF2IC2-LABEL: define void @struct_return_3xi32_replicate(
+; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[DST_A:%.*]], ptr noalias [[DST_B:%.*]], ptr noalias [[DST_C:%.*]]) {
+; VF2IC2-NEXT:  [[ENTRY:.*:]]
+; VF2IC2-NEXT:    br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
+; VF2IC2:       [[VECTOR_PH]]:
+; VF2IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2IC2:       [[VECTOR_BODY]]:
+; VF2IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
+; VF2IC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2
+; VF2IC2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; VF2IC2-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; VF2IC2-NEXT:    [[TMP4:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP3]]) #[[ATTR2:[0-9]+]]
+; VF2IC2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; VF2IC2-NEXT:    [[TMP6:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP5]]) #[[ATTR2]]
+; VF2IC2-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 0
+; VF2IC2-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; VF2IC2-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP8]], 0
+; VF2IC2-NEXT:    [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 1
+; VF2IC2-NEXT:    [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], 1
+; VF2IC2-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP10]], i32 0
+; VF2IC2-NEXT:    [[TMP13:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], <2 x i32> [[TMP12]], 1
+; VF2IC2-NEXT:    [[TMP14:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 2
+; VF2IC2-NEXT:    [[TMP15:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], 2
+; VF2IC2-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 0
+; VF2IC2-NEXT:    [[TMP17:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], <2 x i32> [[TMP16]], 2
+; VF2IC2-NEXT:    [[TMP18:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 0
+; VF2IC2-NEXT:    [[TMP19:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], 0
+; VF2IC2-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i32 1
+; VF2IC2-NEXT:    [[TMP21:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], <2 x i32> [[TMP20]], 0
+; VF2IC2-NEXT:    [[TMP22:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 1
+; VF2IC2-NEXT:    [[TMP23:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], 1
+; VF2IC2-NEXT:    [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP22]], i32 1
+; VF2IC2-NEXT:    [[TMP25:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], <2 x i32> [[TMP24]], 1
+; VF2IC2-NEXT:    [[TMP26:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 2
+; VF2IC2-NEXT:    [[TMP27:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], 2
+; VF2IC2-NEXT:    [[TMP28:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[TMP26]], i32 1
+; VF2IC2-NEXT:    [[TMP29:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], <2 x i32> [[TMP28]], 2
+; VF2IC2-NEXT:    [[TMP30:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 0
+; VF2IC2-NEXT:    [[TMP31:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP30]]) #[[ATTR2]]
+; VF2IC2-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 1
+; VF2IC2-NEXT:    [[TMP33:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP32]]) #[[ATTR2]]
+; VF2IC2-NEXT:    [[TMP34:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 0
+; VF2IC2-NEXT:    [[TMP35:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i32 0
+; VF2IC2-NEXT:    [[TMP36:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP35]], 0
+; VF2IC2-NEXT:    [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 1
+; VF2IC2-NEXT:    [[TMP38:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], 1
+; VF2IC2-NEXT:    [[TMP39:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP37]], i32 0
+; VF2IC2-NEXT:    [[TMP40:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], <2 x i32> [[TMP39]], 1
+; VF2IC2-NEXT:    [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 2
+; VF2IC2-NEXT:    [[TMP42:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], 2
+; VF2IC2-NEXT:    [[TMP43:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[TMP41]], i32 0
+; VF2IC2-NEXT:    [[TMP44:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], <2 x i32> [[TMP43]], 2
+; VF2IC2-NEXT:    [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 0
+; VF2IC2-NEXT:    [[TMP46:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], 0
+; VF2IC2-NEXT:    [[TMP47:%.*]] = insertelement <2 x i32> [[TMP46]], i32 [[TMP45]], i32 1
+; VF2IC2-NEXT:    [[TMP48:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], <2 x i32> [[TMP47]], 0
+; VF2IC2-NEXT:    [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 1
+; VF2IC2-NEXT:    [[TMP50:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], 1
+; VF2IC2-NEXT:    [[TMP51:%.*]] = insertelement <2 x i32> [[TMP50]], i32 [[TMP49]], i32 1
+; VF2IC2-NEXT:    [[TMP52:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], <2 x i32> [[TMP51]], 1
+; VF2IC2-NEXT:    [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 2
+; VF2IC2-NEXT:    [[TMP54:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], 2
+; VF2IC2-NEXT:    [[TMP55:%.*]] = insertelement <2 x i32> [[TMP54]], i32 [[TMP53]], i32 1
+; VF2IC2-NEXT:    [[TMP56:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], <2 x i32> [[TMP55]], 2
+; VF2IC2-NEXT:    [[TMP57:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 0
+; VF2IC2-NEXT:    [[TMP58:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 0
+; VF2IC2-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 0
+; VF2IC2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 2
+; VF2IC2-NEXT:    store <2 x i32> [[TMP57]], ptr [[TMP60]], align 4
+; VF2IC2-NEXT:    store <2 x i32> [[TMP58]], ptr [[TMP61]], align 4
+; VF2IC2-NEXT:    [[TMP62:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 1
+; VF2IC2-NEXT:    [[TMP63:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 1
+; VF2IC2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
+; VF2IC2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 2
+; VF2IC2-NEXT:    store <2 x i32> [[TMP62]], ptr [[TMP65]], align 4
+; VF2IC2-NEXT:    store <2 x i32> [[TMP63]], ptr [[TMP66]], align 4
+; VF2IC2-NEXT:    [[TMP67:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 2
+; VF2IC2-NEXT:    [[TMP68:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 2
+; VF2IC2-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]]
+; VF2IC2-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 0
+; VF2IC2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 2
+; VF2IC2-NEXT:    store <2 x i32> [[TMP67]], ptr [[TMP70]], align 4
+; VF2IC2-NEXT:    store <2 x i32> [[TMP68]], ptr [[TMP71]], align 4
+; VF2IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2IC2-NEXT:    [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF2IC2-NEXT:    br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2IC2:       [[MIDDLE_BLOCK]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, ptr %in, i64 %iv
+  %in_val = load i32, ptr %arrayidx, align 4
+  %call = tail call { i32, i32, i32 } @fn3(i32 %in_val) #2
+  %extract_a = extractvalue { i32, i32, i32 } %call, 0
+  %gep.dst.a = getelementptr inbounds i32, ptr %dst.a, i64 %iv
+  store i32 %extract_a, ptr %gep.dst.a, align 4
+  %extract_b = extractvalue { i32, i32, i32 } %call, 1
+  %gep.dst.b = getelementptr inbounds i32, ptr %dst.b, i64 %iv
+  store i32 %extract_b, ptr %gep.dst.b, align 4
+  %extract_c = extractvalue { i32, i32, i32 } %call, 2
+  %gep.dst.c = getelementptr inbounds i32, ptr %dst.c, i64 %iv
+  store i32 %extract_c, ptr %gep.dst.c, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare { i64 } @fn1(float)
+declare { float, float } @fn2(float)
+declare { i32, i32, i32 } @fn3(i32)
+
+declare { <8 x i64> } @fixed_vec_fn1(<8 x float>)
+declare { <8 x float>, <8 x float> } @fixed_vec_fn2(<8 x float>)
+declare { <8 x i32>, <8 x i32>, <8 x i32> } @fixed_vec_fn3(<8 x i32>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn1(fixed_vec_fn1)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn2(fixed_vec_fn2)" }
+attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn3(fixed_vec_fn3)" }
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 50b9ba12af82..6d849c01f405 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -69,59 +69,6 @@ exit:
   ret void
 }
 
-; CHECK-REMARKS: remark: {{.*}} vectorized loop
-; Note: Later instcombines reduce this down quite a lot.
-define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
-; CHECK-LABEL: define void @struct_return_f32_replicate
-; CHECK-SAME:  (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
-; CHECK:       vector.body:
-; CHECK:         [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}})
-; CHECK:         [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}})
-;                // Lane 0
-; CHECK:         [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0
-; CHECK:         [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK:         [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0
-; CHECK:         [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1
-; CHECK:         [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1
-; CHECK:         [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0
-; CHECK:         [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1
-;                // Lane 1
-; CHECK:         [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0
-; CHECK:         [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0
-; CHECK:         [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1
-; CHECK:         [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0
-; CHECK:         [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1
-; CHECK:         [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1
-; CHECK:         [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i32 1
-; CHECK:         [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x float> [[VEC_B]], 1
-;                // Store wide values:
-; CHECK:         [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0
-; CHECK:         [[VEC_B_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 1
-; CHECK:         store <2 x float> [[VEC_A_EXT]], ptr {{%.*}}, align 4
-; CHECK:         store <2 x float> [[VEC_B_EXT]], ptr {{%.*}}, align 4
-entry:
-  br label %for.body
-
-for.body:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
-  %in_val = load float, ptr %arrayidx, align 4
-  ; #3 does not have a fixed-size vector mapping (so replication is used)
-  %call = tail call { float, float } @foo(float %in_val) #3
-  %extract_a = extractvalue { float, float } %call, 0
-  %extract_b = extractvalue { float, float } %call, 1
-  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
-  store float %extract_a, ptr %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
-  store float %extract_b, ptr %arrayidx4, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, 1024
-  br i1 %exitcond.not, label %exit, label %for.body
-
-exit:
-  ret void
-}
-
 ; CHECK-REMARKS: remark: {{.*}} vectorized loop
 define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks

From 5d7d8d627a418fda2706c4880389711e12d43aea Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 22 Jun 2025 06:30:02 -0700
Subject: [PATCH 1194/1322] [Instrumentation] Drop "const" from a return type
 (NFC) (#145208)

We don't need to put a const on a return type.
---
 llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 4c7b8c69c1bf..71ab61253e64 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -245,7 +245,7 @@ namespace {
   // to the block.
   class GCOVLines : public GCOVRecord {
   public:
-    const StringRef getFilename() { return Filename; }
+    StringRef getFilename() { return Filename; }
 
     void addLine(uint32_t Line) {
       assert(Line != 0 && "Line zero is not a valid real line number.");

From 2ac293f5ac4cf65c0c038bf75a88f1d6715e467d Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 22 Jun 2025 06:30:10 -0700
Subject: [PATCH 1195/1322] [lld] Use llvm::partition_point (NFC) (#145209)

---
 lld/ELF/Arch/AArch64.cpp | 5 ++---
 lld/ELF/Arch/X86_64.cpp  | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index f00c91b5886f..1812f2af419d 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -1003,9 +1003,8 @@ static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
 
 static std::pair<Relocation *, uint64_t>
 getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
-  auto *i =
-      std::partition_point(is.relocations.begin(), is.relocations.end(),
-                           [&](Relocation &r) { return r.offset < offset; });
+  auto *i = llvm::partition_point(
+      is.relocations, [&](Relocation &r) { return r.offset < offset; });
   if (i != is.relocations.end() && i->offset == offset &&
       i->type == R_AARCH64_JUMP26) {
     return {i, i->addend};
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index b991b6f905b9..163505102d0e 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -1193,9 +1193,8 @@ static std::pair<Relocation *, uint64_t>
 getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
   auto content = is.contentMaybeDecompress();
   if (content.size() > offset && content[offset] == 0xe9) { // JMP immediate
-    auto *i = std::partition_point(
-        is.relocations.begin(), is.relocations.end(),
-        [&](Relocation &r) { return r.offset < offset + 1; });
+    auto *i = llvm::partition_point(
+        is.relocations, [&](Relocation &r) { return r.offset < offset + 1; });
     // Unlike with getControlTransferAddend() it is valid to accept a PC32
     // relocation here because we know that this is actually a JMP and not some
     // other reference, so the interpretation is that we add 4 to the addend and

From dec93ae45492cd84e3eec05f00ffb0fb1d35f045 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 22 Jun 2025 06:30:17 -0700
Subject: [PATCH 1196/1322] [mlir] Migrate away from ValueRange(std::nullopt)
 (NFC) (#145210)

ArrayRef has a constructor that accepts std::nullopt.  This
constructor dates back to the days when we still had llvm::Optional.

Since the use of std::nullopt outside the context of std::optional is
kind of abuse and not intuitive to new comers, I would like to move
away from the constructor and eventually remove it.

One of the common uses of std::nullopt is in one of the constructors
for ValueRange.  This patch takes care of the migration where we need
ValueRange() to facilitate perfect forwarding.  Note that {} would be
ambiguous for perfecting forwarding to work.
---
 mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp   | 2 +-
 mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp             | 4 ++--
 mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp           | 2 +-
 mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp                 | 2 +-
 mlir/lib/Dialect/Affine/IR/AffineOps.cpp                      | 4 ++--
 .../Dialect/Bufferization/Transforms/LowerDeallocations.cpp   | 4 ++--
 mlir/lib/Dialect/SCF/IR/SCF.cpp                               | 2 +-
 .../Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
index 3d0804fd11b6..cf416e140c85 100644
--- a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
+++ b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
@@ -79,7 +79,7 @@ struct AssertOpLowering : public ConvertOpToLLVMPattern<cf::AssertOp> {
         abortFunc = rewriter.create<LLVM::LLVMFuncOp>(rewriter.getUnknownLoc(),
                                                       "abort", abortFuncTy);
       }
-      rewriter.create<LLVM::CallOp>(loc, abortFunc, std::nullopt);
+      rewriter.create<LLVM::CallOp>(loc, abortFunc, ValueRange());
       rewriter.create<LLVM::UnreachableOp>(loc);
     } else {
       rewriter.create<LLVM::BrOp>(loc, ValueRange(), continuationBlock);
diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
index e8294a5234c4..4b7b2cc224ce 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
+++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
@@ -1616,8 +1616,8 @@ private:
 
     // Hook up the cond exit to the remainder.
     rewriter.setInsertionPointToEnd(condBlock);
-    rewriter.create<LLVM::CondBrOp>(loc, pred, bodyBlock, std::nullopt,
-                                    remainder, std::nullopt);
+    rewriter.create<LLVM::CondBrOp>(loc, pred, bodyBlock, ValueRange(),
+                                    remainder, ValueRange());
 
     // Reset position to beginning of new remainder block.
     rewriter.setInsertionPointToStart(remainder);
diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp
index ff5b762a969d..f8867c65fe7d 100644
--- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp
+++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp
@@ -928,7 +928,7 @@ LogicalResult ReinterpretCastPattern::matchAndRewrite(
   }();
 
   rewriter.replaceOpWithNewOp<spirv::InBoundsPtrAccessChainOp>(
-      op, src, offsetValue, std::nullopt);
+      op, src, offsetValue, ValueRange());
   return success();
 }
 
diff --git a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp
index baac1b374b12..a59001de299f 100644
--- a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp
+++ b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp
@@ -421,7 +421,7 @@ struct WhileOpConversion final : SCFToSPIRVPattern<scf::WhileOp> {
 
     rewriter.setInsertionPointToEnd(&beforeBlock);
     rewriter.replaceOpWithNewOp<spirv::BranchConditionalOp>(
-        cond, conditionVal, &afterBlock, condArgs, &mergeBlock, std::nullopt);
+        cond, conditionVal, &afterBlock, condArgs, &mergeBlock, ValueRange());
 
     // Convert the scf.yield op to a branch back to the header block.
     rewriter.setInsertionPointToEnd(&afterBlock);
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 3d09c6a9b2c2..48770d4f4ff7 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -2742,7 +2742,7 @@ buildAffineLoopFromConstants(OpBuilder &builder, Location loc, int64_t lb,
                              int64_t ub, int64_t step,
                              AffineForOp::BodyBuilderFn bodyBuilderFn) {
   return builder.create<AffineForOp>(loc, lb, ub, step,
-                                     /*iterArgs=*/std::nullopt, bodyBuilderFn);
+                                     /*iterArgs=*/ValueRange(), bodyBuilderFn);
 }
 
 /// Creates an affine loop from the bounds that may or may not be constants.
@@ -2757,7 +2757,7 @@ buildAffineLoopFromValues(OpBuilder &builder, Location loc, Value lb, Value ub,
                                         ubConst.value(), step, bodyBuilderFn);
   return builder.create<AffineForOp>(loc, lb, builder.getDimIdentityMap(), ub,
                                      builder.getDimIdentityMap(), step,
-                                     /*iterArgs=*/std::nullopt, bodyBuilderFn);
+                                     /*iterArgs=*/ValueRange(), bodyBuilderFn);
 }
 
 void mlir::affine::buildAffineLoopNest(
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp b/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp
index f51b125bda6e..2a17ae4f6a24 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/LowerDeallocations.cpp
@@ -460,14 +460,14 @@ func::FuncOp mlir::bufferization::buildDeallocationLibraryFunction(
   Value toRetainSize = builder.create<memref::DimOp>(loc, toRetainMemref, c0);
 
   builder.create<scf::ForOp>(
-      loc, c0, toRetainSize, c1, std::nullopt,
+      loc, c0, toRetainSize, c1, ValueRange(),
       [&](OpBuilder &builder, Location loc, Value i, ValueRange iterArgs) {
         builder.create<memref::StoreOp>(loc, falseValue, retainCondsMemref, i);
         builder.create<scf::YieldOp>(loc);
       });
 
   builder.create<scf::ForOp>(
-      loc, c0, toDeallocSize, c1, std::nullopt,
+      loc, c0, toDeallocSize, c1, ValueRange(),
       [&](OpBuilder &builder, Location loc, Value outerIter,
           ValueRange iterArgs) {
         Value toDealloc =
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 5a0b8a058dd6..b3271462df27 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -3194,7 +3194,7 @@ struct MergeNestedParallelLoops : public OpRewritePattern<ParallelOp> {
     auto newSteps = concatValues(op.getStep(), innerOp.getStep());
 
     rewriter.replaceOpWithNewOp<ParallelOp>(op, newLowerBounds, newUpperBounds,
-                                            newSteps, std::nullopt,
+                                            newSteps, ValueRange(),
                                             bodyBuilder);
     return success();
   }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
index dcb023707088..093a35a3d4c9 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -1537,7 +1537,7 @@ struct OutRewriter : public OpRewritePattern<OutOp> {
 
     // For each element in the source tensor, output the element.
     rewriter.create<ForeachOp>(
-        loc, src, std::nullopt,
+        loc, src, ValueRange(),
         [&](OpBuilder &builder, Location loc, ValueRange dcvs, Value v,
             ValueRange reduc) {
           for (Dimension d = 0; d < dimRank; d++) {

From 96493c514efad52c9fce6db3b0abebf0e61ae93a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 22 Jun 2025 22:41:05 +0900
Subject: [PATCH 1197/1322] AMDGPU: Use reportFatalUsageError for regalloc flag
 error (#145198)

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp  | 4 ++--
 llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c28f407391c1..88613cf5eb4c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1636,7 +1636,7 @@ static const char RegAllocOptNotSupportedMessage[] =
 
 bool GCNPassConfig::addRegAssignAndRewriteFast() {
   if (!usingDefaultRegAlloc())
-    report_fatal_error(RegAllocOptNotSupportedMessage);
+    reportFatalUsageError(RegAllocOptNotSupportedMessage);
 
   addPass(&GCNPreRALongBranchRegID);
 
@@ -1662,7 +1662,7 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
 
 bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
   if (!usingDefaultRegAlloc())
-    report_fatal_error(RegAllocOptNotSupportedMessage);
+    reportFatalUsageError(RegAllocOptNotSupportedMessage);
 
   addPass(&GCNPreRALongBranchRegID);
 
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index 52ad7e535520..33585024d81d 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -9,8 +9,8 @@
 ; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-DEFAULT %s
 ; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=basic -wwm-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-BASIC %s
 
-; RUN: not --crash llc -verify-machineinstrs=0 -regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s
-; RUN: not --crash llc -verify-machineinstrs=0 -regalloc=fast -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s
+; RUN: not llc -verify-machineinstrs=0 -regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s
+; RUN: not llc -verify-machineinstrs=0 -regalloc=fast -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s
 
 
 ; REGALLOC: -regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, and -vgpr-regalloc

From ee5dcdc275b136172a5bfa85a098ae506a836c85 Mon Sep 17 00:00:00 2001
From: Menooker <myjisgreat@live.cn>
Date: Sun, 22 Jun 2025 22:03:36 +0800
Subject: [PATCH 1198/1322] [mlir] fix assertion failure in remove-dead-values
 (#144849)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simple IR patterns will trigger assertion error:

```
  func.func @test_zero_operands(%I: memref<10xindex>, %I2: memref<10xf32>) {
    %v0 = arith.constant 0 : index
    %result = memref.alloca_scope -> index {
      %c = arith.addi %v0, %v0 : index
      memref.store %c, %I[%v0] : memref<10xindex>
      memref.alloca_scope.return %c: index
    }
    func.return
  }
```

with error: `mlir/include/mlir/IR/Operation.h:988:
mlir::detail::OperandStorage& mlir::Operation::getOperandStorage():
Assertion `hasOperandStorage && "expected operation to have operand
storage"' failed.`

This PR will fix this issue.

---------

Co-authored-by: Andrzej Warzyński <andrzej.warzynski@gmail.com>
Co-authored-by: Mehdi Amini <joker.eph@gmail.com>
---
 mlir/lib/Transforms/RemoveDeadValues.cpp     |  5 +++--
 mlir/test/Transforms/remove-dead-values.mlir | 23 ++++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp
index ad21ce8f1870..0a02a01977ca 100644
--- a/mlir/lib/Transforms/RemoveDeadValues.cpp
+++ b/mlir/lib/Transforms/RemoveDeadValues.cpp
@@ -750,8 +750,9 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
   }
 
   // 4. Operands
-  for (auto &o : list.operands) {
-    o.op->eraseOperands(o.nonLive);
+  for (OperationToCleanup &o : list.operands) {
+    if (o.op->getNumOperands() > 0)
+      o.op->eraseOperands(o.nonLive);
   }
 
   // 5. Results
diff --git a/mlir/test/Transforms/remove-dead-values.mlir b/mlir/test/Transforms/remove-dead-values.mlir
index e55a9160f5b3..3af95db3c0e2 100644
--- a/mlir/test/Transforms/remove-dead-values.mlir
+++ b/mlir/test/Transforms/remove-dead-values.mlir
@@ -511,6 +511,28 @@ module {
 // CHECK: linalg.yield %[[yield]] : f32
 // CHECK-NOT: arith.subf
 
+
+// -----
+
+// check that ops with zero operands are correctly handled
+
+module {
+  func.func @test_zero_operands(%I: memref<10xindex>, %I2: memref<10xf32>) {
+    %v0 = arith.constant 0 : index
+    %result = memref.alloca_scope -> index {
+      %c = arith.addi %v0, %v0 : index
+      memref.store %c, %I[%v0] : memref<10xindex>
+      memref.alloca_scope.return %c: index
+    }
+    func.return
+  }
+}
+
+// CHECK-LABEL: func @test_zero_operands
+// CHECK: memref.alloca_scope
+// CHECK: memref.store
+// CHECK-NOT: memref.alloca_scope.return
+
 // -----
 
 // CHECK-LABEL: func.func @test_atomic_yield
@@ -525,3 +547,4 @@ func.func @test_atomic_yield(%I: memref<10xf32>, %idx : index) {
   }
   func.return
 }
+

From 584a2c2e7c5474e858e644baacae859ae54c7d60 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 22 Jun 2025 23:10:18 +0900
Subject: [PATCH 1199/1322] AMDGPU: Avoid report_fatal_error for reporting
 libcalls (#145134)

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b9023b6d7a3a..6f37e2dd3aa7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3805,7 +3805,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   if (!CLI.CB)
-    report_fatal_error("unsupported libcall legalization");
+    return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
 
   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
     return lowerUnhandledCall(CLI, InVals,

From 078475d6c153b83d5eef7edef78e536503683443 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sun, 22 Jun 2025 15:24:39 +0100
Subject: [PATCH 1200/1322] [ARM] Add test coverage for #144845 and regenerate
 tests. NFC

---
 llvm/test/CodeGen/ARM/special-reg.ll | 76 +++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 17 deletions(-)

diff --git a/llvm/test/CodeGen/ARM/special-reg.ll b/llvm/test/CodeGen/ARM/special-reg.ll
index 7ccb490f5d4a..e966550e673d 100644
--- a/llvm/test/CodeGen/ARM/special-reg.ll
+++ b/llvm/test/CodeGen/ARM/special-reg.ll
@@ -1,67 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s --check-prefix=ARM --check-prefix=ACORE
 ; RUN: llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=ARM --check-prefix=MCORE
 
 define i32 @read_i32_encoded_register() nounwind {
-entry:
 ; ARM-LABEL: read_i32_encoded_register:
-; ARM: mrc p1, #2, r0, c3, c4, #5
+; ARM:       @ %bb.0: @ %entry
+; ARM-NEXT:    mrc p1, #2, r0, c3, c4, #5
+; ARM-NEXT:    bx lr
+entry:
   %reg = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %reg
 }
 
 define i64 @read_i64_encoded_register() nounwind {
-entry:
 ; ARM-LABEL: read_i64_encoded_register:
-; ARM: mrrc p1, #2, r0, r1, c3
+; ARM:       @ %bb.0: @ %entry
+; ARM-NEXT:    mrrc p1, #2, r0, r1, c3
+; ARM-NEXT:    bx lr
+entry:
   %reg = call i64 @llvm.read_register.i64(metadata !1)
   ret i64 %reg
 }
 
-define i32 @read_apsr() nounwind {
+define i64 @read_volatile_i64_twice() {
+; ACORE-LABEL: read_volatile_i64_twice:
+; ACORE:       @ %bb.0: @ %entry
+; ACORE-NEXT:    mov r0, #0
+; ACORE-NEXT:    mov r1, #0
+; ACORE-NEXT:    bx lr
+;
+; MCORE-LABEL: read_volatile_i64_twice:
+; MCORE:       @ %bb.0: @ %entry
+; MCORE-NEXT:    movs r0, #0
+; MCORE-NEXT:    movs r1, #0
+; MCORE-NEXT:    bx lr
 entry:
+  %0 = tail call i64 @llvm.read_volatile_register.i64(metadata !5)
+  %1 = tail call i64 @llvm.read_volatile_register.i64(metadata !5)
+  %xor = xor i64 %1, %0
+  ret i64 %xor
+}
+
+
+define i32 @read_apsr() nounwind {
 ; ARM-LABEL: read_apsr:
-; ARM: mrs r0, apsr
+; ARM:       @ %bb.0: @ %entry
+; ARM-NEXT:    mrs r0, apsr
+; ARM-NEXT:    bx lr
+entry:
   %reg = call i32 @llvm.read_register.i32(metadata !2)
   ret i32 %reg
 }
 
 define i32 @read_fpscr() nounwind {
-entry:
 ; ARM-LABEL: read_fpscr:
-; ARM: vmrs r0, fpscr
+; ARM:       @ %bb.0: @ %entry
+; ARM-NEXT:    vmrs r0, fpscr
+; ARM-NEXT:    bx lr
+entry:
   %reg = call i32 @llvm.read_register.i32(metadata !3)
   ret i32 %reg
 }
 
 define void @write_i32_encoded_register(i32 %x) nounwind {
-entry:
 ; ARM-LABEL: write_i32_encoded_register:
-; ARM: mcr p1, #2, r0, c3, c4, #5
+; ARM:       @ %bb.0: @ %entry
+; ARM-NEXT:    mcr p1, #2, r0, c3, c4, #5
+; ARM-NEXT:    bx lr
+entry:
   call void @llvm.write_register.i32(metadata !0, i32 %x)
   ret void
 }
 
 define void @write_i64_encoded_register(i64 %x) nounwind {
-entry:
 ; ARM-LABEL: write_i64_encoded_register:
-; ARM: mcrr p1, #2, r0, r1, c3
+; ARM:       @ %bb.0: @ %entry
+; ARM-NEXT:    mcrr p1, #2, r0, r1, c3
+; ARM-NEXT:    bx lr
+entry:
   call void @llvm.write_register.i64(metadata !1, i64 %x)
   ret void
 }
 
 define void @write_apsr(i32 %x) nounwind {
+; ACORE-LABEL: write_apsr:
+; ACORE:       @ %bb.0: @ %entry
+; ACORE-NEXT:    msr APSR_nzcvq, r0
+; ACORE-NEXT:    bx lr
+;
+; MCORE-LABEL: write_apsr:
+; MCORE:       @ %bb.0: @ %entry
+; MCORE-NEXT:    msr apsr_nzcvq, r0
+; MCORE-NEXT:    bx lr
 entry:
-; ARM-LABEL: write_apsr:
-; ACORE: msr APSR_nzcvq, r0
-; MCORE: msr apsr_nzcvq, r0
   call void @llvm.write_register.i32(metadata !4, i32 %x)
   ret void
 }
 
 define void @write_fpscr(i32 %x) nounwind {
-entry:
 ; ARM-LABEL: write_fpscr:
-; ARM: vmsr fpscr, r0
+; ARM:       @ %bb.0: @ %entry
+; ARM-NEXT:    vmsr fpscr, r0
+; ARM-NEXT:    bx lr
+entry:
   call void @llvm.write_register.i32(metadata !3, i32 %x)
   ret void
 }
@@ -76,3 +117,4 @@ declare void @llvm.write_register.i64(metadata, i64) nounwind
 !2 = !{!"apsr"}
 !3 = !{!"fpscr"}
 !4 = !{!"apsr_nzcvq"}
+!5 = !{!"cp15:1:c14"}

From f2eb5d416ed1e539b215aa0bab46b8e8a98f1eec Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0@owo.li>
Date: Sun, 22 Jun 2025 22:48:59 +0800
Subject: [PATCH 1201/1322] [SelectionDAG] Handle `fneg`/`fabs`/`fcopysign` in
 `SimplifyDemandedBits` (#139239)

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  52 +---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  71 +++++
 .../CodeGen/AArch64/extract-vector-elt.ll     |  12 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll    |  54 ++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     | 288 +++++++++---------
 .../AMDGPU/select-fabs-fneg-extract.f16.ll    |  12 +-
 .../AMDGPU/select-fabs-fneg-extract.ll        |   3 +-
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll  |  41 +--
 llvm/test/CodeGen/AMDGPU/udiv.ll              |   6 +-
 9 files changed, 268 insertions(+), 271 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 701a76c4cc6b..4f91f90b0469 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18403,49 +18403,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, DL, VT, {N0, N1}))
     return C;
 
-  if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
-    const APFloat &V = N1C->getValueAPF();
-    // copysign(x, c1) -> fabs(x)       iff ispos(c1)
-    // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
-    if (!V.isNegative()) {
-      if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
-        return DAG.getNode(ISD::FABS, DL, VT, N0);
-    } else {
-      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
-        return DAG.getNode(ISD::FNEG, DL, VT,
-                           DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
-    }
-  }
-
-  // copysign(fabs(x), y) -> copysign(x, y)
-  // copysign(fneg(x), y) -> copysign(x, y)
-  // copysign(copysign(x,z), y) -> copysign(x, y)
-  if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
-      N0.getOpcode() == ISD::FCOPYSIGN)
-    return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0.getOperand(0), N1);
-
-  // copysign(x, abs(y)) -> abs(x)
-  if (N1.getOpcode() == ISD::FABS)
-    return DAG.getNode(ISD::FABS, DL, VT, N0);
-
-  // copysign(x, copysign(y,z)) -> copysign(x, z)
-  if (N1.getOpcode() == ISD::FCOPYSIGN)
-    return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(1));
-
   // copysign(x, fp_extend(y)) -> copysign(x, y)
   // copysign(x, fp_round(y)) -> copysign(x, y)
   if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
     return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(0));
 
-  // We only take the sign bit from the sign operand.
-  EVT SignVT = N1.getValueType();
-  if (SimplifyDemandedBits(N1,
-                           APInt::getSignMask(SignVT.getScalarSizeInBits())))
-    return SDValue(N, 0);
-
-  // We only take the non-sign bits from the value operand
-  if (SimplifyDemandedBits(N0,
-                           APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
+  if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   return SDValue();
@@ -18972,6 +18935,9 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
                        N0.getOperand(0));
   }
 
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
   if (SDValue Cast = foldSignChangeInBitcast(N))
     return Cast;
 
@@ -19045,14 +19011,8 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::FABS, DL, VT, {N0}))
     return C;
 
-  // fold (fabs (fabs x)) -> (fabs x)
-  if (N0.getOpcode() == ISD::FABS)
-    return N->getOperand(0);
-
-  // fold (fabs (fneg x)) -> (fabs x)
-  // fold (fabs (fcopysign x, y)) -> (fabs x)
-  if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
-    return DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
 
   if (SDValue Cast = foldSignChangeInBitcast(N))
     return Cast;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1e470318ced0..66717135c9ad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2967,6 +2967,77 @@ bool TargetLowering::SimplifyDemandedBits(
     }
     break;
   }
+  case ISD::FABS: {
+    SDValue Op0 = Op.getOperand(0);
+    APInt SignMask = APInt::getSignMask(BitWidth);
+
+    if (!DemandedBits.intersects(SignMask))
+      return TLO.CombineTo(Op, Op0);
+
+    if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
+                             Depth + 1))
+      return true;
+
+    if (Known.isNonNegative())
+      return TLO.CombineTo(Op, Op0);
+    if (Known.isNegative())
+      return TLO.CombineTo(
+          Op, TLO.DAG.getNode(ISD::FNEG, dl, VT, Op0, Op->getFlags()));
+
+    Known.Zero |= SignMask;
+    Known.One &= ~SignMask;
+
+    break;
+  }
+  case ISD::FCOPYSIGN: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    unsigned BitWidth0 = Op0.getScalarValueSizeInBits();
+    unsigned BitWidth1 = Op1.getScalarValueSizeInBits();
+    APInt SignMask0 = APInt::getSignMask(BitWidth0);
+    APInt SignMask1 = APInt::getSignMask(BitWidth1);
+
+    if (!DemandedBits.intersects(SignMask0))
+      return TLO.CombineTo(Op, Op0);
+
+    if (SimplifyDemandedBits(Op0, ~SignMask0 & DemandedBits, DemandedElts,
+                             Known, TLO, Depth + 1) ||
+        SimplifyDemandedBits(Op1, SignMask1, DemandedElts, Known2, TLO,
+                             Depth + 1))
+      return true;
+
+    if (Known2.isNonNegative())
+      return TLO.CombineTo(
+          Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0, Op->getFlags()));
+
+    if (Known2.isNegative())
+      return TLO.CombineTo(
+          Op, TLO.DAG.getNode(ISD::FNEG, dl, VT,
+                              TLO.DAG.getNode(ISD::FABS, SDLoc(Op0), VT, Op0)));
+
+    Known.Zero &= ~SignMask0;
+    Known.One &= ~SignMask0;
+    break;
+  }
+  case ISD::FNEG: {
+    SDValue Op0 = Op.getOperand(0);
+    APInt SignMask = APInt::getSignMask(BitWidth);
+
+    if (!DemandedBits.intersects(SignMask))
+      return TLO.CombineTo(Op, Op0);
+
+    if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
+                             Depth + 1))
+      return true;
+
+    if (!Known.isSignUnknown()) {
+      Known.Zero ^= SignMask;
+      Known.One ^= SignMask;
+    }
+
+    break;
+  }
   default:
     // We also ask the target about intrinsics (which could be specific to it).
     if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 5e5fdd6d3170..0189f52bbac0 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -391,13 +391,10 @@ define float @extract_v4i32_copysign_build_vector(<4 x float> %a, <4 x float> %b
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    sub sp, sp, #16
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
-; CHECK-SD-NEXT:    mvni v1.4s, #128, lsl #24
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECK-SD-NEXT:    fabs v0.4s, v0.4s
 ; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
-; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
 ; CHECK-SD-NEXT:    str q0, [sp]
 ; CHECK-SD-NEXT:    ldr s0, [x8]
 ; CHECK-SD-NEXT:    add sp, sp, #16
@@ -425,10 +422,7 @@ entry:
 define float @extract_v4i32_copysign_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
 ; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector_const:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    adrp x8, .LCPI17_0
-; CHECK-SD-NEXT:    mvni v1.4s, #128, lsl #24
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI17_0]
-; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    fabs v0.4s, v0.4s
 ; CHECK-SD-NEXT:    mov s0, v0.s[2]
 ; CHECK-SD-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 7c89a41d62fb..f901626e54a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4388,12 +4388,11 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %m
 ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
@@ -5267,13 +5266,12 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i
 ;
 ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x10010
-; GFX8-NEXT:    s_add_i32 s4, s4, s1
-; GFX8-NEXT:    s_or_b32 s3, s1, 0x400000
-; GFX8-NEXT:    s_add_i32 s6, s4, 0x7fff
+; GFX8-NEXT:    s_bfe_u32 s3, s1, 0x10010
+; GFX8-NEXT:    s_add_i32 s3, s3, s1
+; GFX8-NEXT:    s_addk_i32 s3, 0x7fff
 ; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], s1, s1
 ; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX8-NEXT:    s_cselect_b32 s1, s3, s6
+; GFX8-NEXT:    s_cselect_b32 s1, s1, s3
 ; GFX8-NEXT:    s_bfe_u32 s3, s2, 0x10010
 ; GFX8-NEXT:    s_add_i32 s3, s3, s2
 ; GFX8-NEXT:    s_addk_i32 s3, 0x7fff
@@ -6340,18 +6338,16 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m
 ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
@@ -7687,24 +7683,22 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %m
 ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
+; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v5
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index e74d5ba24079..a3ec35da29f6 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -3227,40 +3227,38 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_and_b32_e32 v3, 0xffe, v3
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT:    v_bfe_u32 v5, v1, 20, 11
+; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v3, v0
-; VI-NEXT:    v_sub_u32_e32 v8, vcc, s4, v5
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, s4, v1
 ; VI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
-; VI-NEXT:    v_med3_i32 v8, v8, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v9, v8, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v8, v8, v9
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v3
+; VI-NEXT:    v_med3_i32 v5, v5, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v8, v5, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v3
 ; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v5, vcc, s5, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 12, v5
-; VI-NEXT:    v_or_b32_e32 v3, v9, v3
-; VI-NEXT:    v_or_b32_e32 v8, v0, v8
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
-; VI-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; VI-NEXT:    v_and_b32_e32 v8, 7, v3
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
-; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 12, v1
+; VI-NEXT:    v_or_b32_e32 v3, v8, v3
+; VI-NEXT:    v_or_b32_e32 v5, v0, v5
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
+; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-NEXT:    v_and_b32_e32 v5, 7, v3
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v8, v8, v9
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v5, v5, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v8
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 0x8000
-; VI-NEXT:    v_and_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; VI-NEXT:    s_mov_b32 s4, 0x7fff7fff
 ; VI-NEXT:    v_bfi_b32 v0, s4, v0, v4
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -4050,41 +4048,38 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-NEXT:    s_bfe_u32 s1, s1, 0xb0014
 ; VI-NEXT:    v_readfirstlane_b32 s0, v0
-; VI-NEXT:    s_bfe_u32 s3, s1, 0xb0014
+; VI-NEXT:    s_sub_i32 s3, 0x3f1, s1
 ; VI-NEXT:    s_or_b32 s0, s7, s0
-; VI-NEXT:    s_sub_i32 s7, 0x3f1, s3
-; VI-NEXT:    v_med3_i32 v0, s7, 0, 13
+; VI-NEXT:    v_med3_i32 v0, s3, 0, 13
 ; VI-NEXT:    s_or_b32 s2, s0, 0x1000
-; VI-NEXT:    v_readfirstlane_b32 s7, v0
-; VI-NEXT:    s_lshr_b32 s8, s2, s7
-; VI-NEXT:    s_lshl_b32 s7, s8, s7
-; VI-NEXT:    s_cmp_lg_u32 s7, s2
+; VI-NEXT:    v_readfirstlane_b32 s3, v0
+; VI-NEXT:    s_lshr_b32 s7, s2, s3
+; VI-NEXT:    s_lshl_b32 s3, s7, s3
+; VI-NEXT:    s_cmp_lg_u32 s3, s2
 ; VI-NEXT:    s_cselect_b32 s2, 1, 0
-; VI-NEXT:    s_addk_i32 s3, 0xfc10
-; VI-NEXT:    s_lshl_b32 s7, s3, 12
-; VI-NEXT:    s_or_b32 s2, s8, s2
-; VI-NEXT:    s_or_b32 s7, s0, s7
-; VI-NEXT:    s_cmp_lt_i32 s3, 1
-; VI-NEXT:    s_cselect_b32 s2, s2, s7
-; VI-NEXT:    s_and_b32 s7, s2, 7
-; VI-NEXT:    s_cmp_gt_i32 s7, 5
-; VI-NEXT:    s_cselect_b32 s8, 1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 3
+; VI-NEXT:    s_addk_i32 s1, 0xfc10
+; VI-NEXT:    s_lshl_b32 s3, s1, 12
+; VI-NEXT:    s_or_b32 s2, s7, s2
+; VI-NEXT:    s_or_b32 s3, s0, s3
+; VI-NEXT:    s_cmp_lt_i32 s1, 1
+; VI-NEXT:    s_cselect_b32 s2, s2, s3
+; VI-NEXT:    s_and_b32 s3, s2, 7
+; VI-NEXT:    s_cmp_gt_i32 s3, 5
 ; VI-NEXT:    s_cselect_b32 s7, 1, 0
-; VI-NEXT:    s_or_b32 s7, s7, s8
+; VI-NEXT:    s_cmp_eq_u32 s3, 3
+; VI-NEXT:    s_cselect_b32 s3, 1, 0
+; VI-NEXT:    s_or_b32 s3, s3, s7
 ; VI-NEXT:    s_lshr_b32 s2, s2, 2
-; VI-NEXT:    s_add_i32 s2, s2, s7
-; VI-NEXT:    s_cmp_lt_i32 s3, 31
+; VI-NEXT:    s_add_i32 s2, s2, s3
+; VI-NEXT:    s_cmp_lt_i32 s1, 31
 ; VI-NEXT:    s_cselect_b32 s2, s2, 0x7c00
 ; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cselect_b32 s0, s6, 0x7c00
-; VI-NEXT:    s_cmpk_eq_i32 s3, 0x40f
+; VI-NEXT:    s_cmpk_eq_i32 s1, 0x40f
 ; VI-NEXT:    s_cselect_b32 s0, s0, s2
-; VI-NEXT:    s_lshr_b32 s1, s1, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0x8000
-; VI-NEXT:    s_or_b32 s0, s1, s0
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_and_b32 s0, s0, 0x7fff
 ; VI-NEXT:    s_or_b32 s0, s0, s5
 ; VI-NEXT:    s_mov_b32 s1, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -4918,40 +4913,37 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_and_b32_e32 v5, 0xffe, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT:    v_bfe_u32 v8, v1, 20, 11
+; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v5, v0
-; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v8
+; VI-NEXT:    v_sub_u32_e32 v8, vcc, s4, v1
 ; VI-NEXT:    v_or_b32_e32 v5, 0x1000, v0
-; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v12, v11, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v5
+; VI-NEXT:    v_med3_i32 v8, v8, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v11, v8, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v8, vcc, s5, v8
-; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v8
-; VI-NEXT:    v_or_b32_e32 v5, v12, v5
-; VI-NEXT:    v_or_b32_e32 v11, v0, v11
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v8
-; VI-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
-; VI-NEXT:    v_and_b32_e32 v11, 7, v5
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; VI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 12, v1
+; VI-NEXT:    v_or_b32_e32 v5, v11, v5
+; VI-NEXT:    v_or_b32_e32 v8, v0, v8
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
+; VI-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; VI-NEXT:    v_and_b32_e32 v8, 7, v5
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
 ; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v11, v11, v12
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v8, v8, v11
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
-; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v8
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v8
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v5, 0x8000
-; VI-NEXT:    v_and_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_and_b32_e32 v5, 0x1ff, v3
 ; VI-NEXT:    v_or_b32_e32 v2, v5, v2
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffe, v1
@@ -4986,7 +4978,8 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_mov_b32 s4, 0x7fff7fff
 ; VI-NEXT:    v_bfi_b32 v0, s4, v0, v6
 ; VI-NEXT:    v_bfi_b32 v1, s4, v4, v7
@@ -6061,76 +6054,73 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; VI-NEXT:    v_and_b32_e32 v10, 0xffe, v10
 ; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-NEXT:    v_bfe_u32 v11, v5, 20, 11
+; VI-NEXT:    v_bfe_u32 v5, v5, 20, 11
 ; VI-NEXT:    s_movk_i32 s4, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v4, v10, v4
-; VI-NEXT:    v_sub_u32_e32 v12, vcc, s4, v11
+; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v5
 ; VI-NEXT:    v_or_b32_e32 v10, 0x1000, v4
-; VI-NEXT:    v_med3_i32 v12, v12, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v13, v12, v10
-; VI-NEXT:    v_lshlrev_b32_e32 v12, v12, v13
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v12, v10
+; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v12, v11, v10
+; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v10
 ; VI-NEXT:    s_movk_i32 s5, 0xfc10
 ; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v11, vcc, s5, v11
-; VI-NEXT:    v_lshlrev_b32_e32 v12, 12, v11
-; VI-NEXT:    v_or_b32_e32 v10, v13, v10
-; VI-NEXT:    v_or_b32_e32 v12, v4, v12
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v11
-; VI-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; VI-NEXT:    v_and_b32_e32 v12, 7, v10
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v12
-; VI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
+; VI-NEXT:    v_add_u32_e32 v5, vcc, s5, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v5
+; VI-NEXT:    v_or_b32_e32 v10, v12, v10
+; VI-NEXT:    v_or_b32_e32 v11, v4, v11
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
+; VI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; VI-NEXT:    v_and_b32_e32 v11, 7, v10
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
 ; VI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v12, v12, v13
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
+; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, 2, v10
-; VI-NEXT:    v_add_u32_e32 v10, vcc, v10, v12
-; VI-NEXT:    v_mov_b32_e32 v12, 0x7c00
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v11
-; VI-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; VI-NEXT:    v_mov_b32_e32 v13, 0x7e00
+; VI-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
+; VI-NEXT:    v_mov_b32_e32 v11, 0x7c00
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
+; VI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; VI-NEXT:    v_mov_b32_e32 v12, 0x7e00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; VI-NEXT:    s_movk_i32 s6, 0x40f
-; VI-NEXT:    v_cndmask_b32_e32 v4, v12, v13, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v11
+; VI-NEXT:    v_cndmask_b32_e32 v4, v11, v12, vcc
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v10, 0x8000
-; VI-NEXT:    v_and_b32_e32 v11, 0x1ff, v7
-; VI-NEXT:    v_and_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v6, v11, v6
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    v_and_b32_e32 v10, 0x1ff, v7
+; VI-NEXT:    v_or_b32_e32 v6, v10, v6
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v7
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; VI-NEXT:    v_and_b32_e32 v5, 0xffe, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v7, v7, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v5, v5, v6
-; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v7
+; VI-NEXT:    v_sub_u32_e32 v10, vcc, s4, v7
 ; VI-NEXT:    v_or_b32_e32 v6, 0x1000, v5
-; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v14, v11, v6
-; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v6
+; VI-NEXT:    v_med3_i32 v10, v10, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v13, v10, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v10, v10, v13
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v10, v6
 ; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; VI-NEXT:    v_add_u32_e32 v7, vcc, s5, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v7
-; VI-NEXT:    v_or_b32_e32 v6, v14, v6
-; VI-NEXT:    v_or_b32_e32 v11, v5, v11
+; VI-NEXT:    v_lshlrev_b32_e32 v10, 12, v7
+; VI-NEXT:    v_or_b32_e32 v6, v13, v6
+; VI-NEXT:    v_or_b32_e32 v10, v5, v10
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
-; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
-; VI-NEXT:    v_and_b32_e32 v11, 7, v6
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; VI-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v11, v11, v14
+; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; VI-NEXT:    v_and_b32_e32 v10, 7, v6
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v10
+; VI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v10
+; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v10, v10, v13
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
-; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
+; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v10
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
-; VI-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; VI-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
 ; VI-NEXT:    v_and_b32_e32 v7, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v7, v0
@@ -6139,39 +6129,37 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_and_b32_e32 v6, 0xffe, v6
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT:    v_bfe_u32 v7, v1, 20, 11
+; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v6, v0
-; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v7
+; VI-NEXT:    v_sub_u32_e32 v7, vcc, s4, v1
 ; VI-NEXT:    v_or_b32_e32 v6, 0x1000, v0
-; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v14, v11, v6
-; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v6
+; VI-NEXT:    v_med3_i32 v7, v7, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v10, v7, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v7, v7, v10
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v6
 ; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v7, vcc, s5, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v7
-; VI-NEXT:    v_or_b32_e32 v6, v14, v6
-; VI-NEXT:    v_or_b32_e32 v11, v0, v11
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
-; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
-; VI-NEXT:    v_and_b32_e32 v11, 7, v6
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; VI-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v11, v11, v14
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 12, v1
+; VI-NEXT:    v_or_b32_e32 v6, v10, v6
+; VI-NEXT:    v_or_b32_e32 v7, v0, v7
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
+; VI-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; VI-NEXT:    v_and_b32_e32 v7, 7, v6
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
+; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
+; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v7, v7, v10
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
-; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
-; VI-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
+; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
+; VI-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 0x1ff, v3
-; VI-NEXT:    v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v6, v2
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffe, v1
@@ -6200,16 +6188,18 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v3
-; VI-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v12, v13, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff, v4
 ; VI-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    v_bfi_b32 v0, s4, v0, v8
 ; VI-NEXT:    v_bfi_b32 v1, s4, v1, v9
 ; VI-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
index 030c33285012..0684c3081983 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
@@ -452,7 +452,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; CI-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negk_negk_f16:
@@ -462,7 +462,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT:    v_add_f16_e64 v0, |v0|, v1
+; VI-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_f16:
@@ -472,7 +472,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_add_f16_e64 v0.l, |v0.l|, v1.l
+; GFX11-SAFE-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_negk_f16:
@@ -482,7 +482,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-SAFE-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
-; GFX11-SAFE-FAKE16-NEXT:    v_add_f16_e64 v0, |v0|, v1
+; GFX11-SAFE-FAKE16-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; GFX11-SAFE-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_f16:
@@ -492,7 +492,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_add_f16_e64 v0.l, |v0.l|, v1.l
+; GFX11-NSZ-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_negk_f16:
@@ -502,7 +502,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-NSZ-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
-; GFX11-NSZ-FAKE16-NEXT:    v_add_f16_e64 v0, |v0|, v1
+; GFX11-NSZ-FAKE16-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; GFX11-NSZ-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, half -2.0, half -1.0
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index a680ba593341..ec0455ab6e93 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -132,12 +132,11 @@ define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
   ret void
 }
 
-; FIXME: fabs should fold away
 ; GCN-LABEL: {{^}}add_select_fabs_negk_negk_f32:
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
-; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
 define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
   %x = load volatile float, ptr addrspace(1) poison
   %cmp = icmp eq i32 %c, 0
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 7ed27f008083..4e07c724b8a8 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -776,20 +776,16 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; CI-LABEL: add_select_fabs_negk_negk_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
-; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
-; CI-NEXT:    v_add_f32_e32 v0, v0, v2
-; CI-NEXT:    v_add_f32_e32 v1, v1, v3
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
+; CI-NEXT:    v_sub_f32_e32 v1, v3, v1
+; CI-NEXT:    v_sub_f32_e32 v0, v2, v0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -801,8 +797,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT:    v_add_f16_sdwa v1, |v1|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_add_f16_e64 v0, |v0|, v2
+; VI-NEXT:    v_sub_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_sub_f16_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -816,8 +812,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -831,9 +826,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -846,9 +839,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-SAFE-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
 ; GFX11-SAFE-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SAFE-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-SAFE-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-SAFE-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-SAFE-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -862,9 +854,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -877,9 +867,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NSZ-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
 ; GFX11-NSZ-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NSZ-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NSZ-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-NSZ-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NSZ-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x half> <half -2.0, half -2.0>, <2 x half> <half -1.0, half -1.0>
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 660ff4677547..04b98730c6a1 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -1515,7 +1515,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; EG-NEXT:     TRUNC * T0.W, PV.W,
 ; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
 ; EG-NEXT:     TRUNC * T0.W, PV.W,
-; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
+; EG-NEXT:     SETGE * T1.W, |PV.W|, T0.Y,
 ; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
 ; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
 ; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
@@ -1658,7 +1658,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; EG-NEXT:     TRUNC * T0.W, PV.W,
 ; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
 ; EG-NEXT:     TRUNC * T0.W, PV.W,
-; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
+; EG-NEXT:     SETGE * T1.W, |PV.W|, T0.Y,
 ; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
 ; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
 ; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
@@ -1858,7 +1858,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; EG-NEXT:     TRUNC * T0.W, PV.W,
 ; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z,
 ; EG-NEXT:     TRUNC * T0.W, PV.W,
-; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.X|,
+; EG-NEXT:     SETGE * T1.W, |PV.W|, T0.X,
 ; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
 ; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
 ; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)

From ee414e35047c588daba1ee62e8291ea171700f43 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Sun, 22 Jun 2025 15:08:11 +0000
Subject: [PATCH 1202/1322] [CI] Refactor out some early exits in
 compute_projects

I have a habit of using early exits given it is in the LLVM coding
standards, but most of the early exits used within this script were
trivial and actually adding complexity. These are all instances where we
only perform one operation after the early exit, so removing the early
exit means less lines of code and arguably more readable code.

Reviewers: DavidSpickett, tstellar, cmtice, lnihlen

Reviewed By: DavidSpickett

Pull Request: https://github.com/llvm/llvm-project/pull/143478
---
 .ci/compute_projects.py | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index e61b8dc5021f..32ddf525bcfb 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -135,13 +135,11 @@ def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]:
     while current_projects_count != len(projects_with_dependents):
         current_projects_count = len(projects_with_dependents)
         for project in list(projects_with_dependents):
-            if project not in PROJECT_DEPENDENCIES:
-                continue
-            projects_with_dependents.update(PROJECT_DEPENDENCIES[project])
+            if project in PROJECT_DEPENDENCIES:
+                projects_with_dependents.update(PROJECT_DEPENDENCIES[project])
     for runtime in runtimes:
-        if runtime not in PROJECT_DEPENDENCIES:
-            continue
-        projects_with_dependents.update(PROJECT_DEPENDENCIES[runtime])
+        if runtime in PROJECT_DEPENDENCIES:
+            projects_with_dependents.update(PROJECT_DEPENDENCIES[runtime])
     return projects_with_dependents
 
 
@@ -187,18 +185,16 @@ def _compute_projects_to_build(
 def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]:
     check_targets = set()
     for project_to_test in projects_to_test:
-        if project_to_test not in PROJECT_CHECK_TARGETS:
-            continue
-        check_targets.add(PROJECT_CHECK_TARGETS[project_to_test])
+        if project_to_test in PROJECT_CHECK_TARGETS:
+            check_targets.add(PROJECT_CHECK_TARGETS[project_to_test])
     return check_targets
 
 
 def _compute_runtimes_to_test(modified_projects: Set[str], platform: str) -> Set[str]:
     runtimes_to_test = set()
     for modified_project in modified_projects:
-        if modified_project not in DEPENDENT_RUNTIMES_TO_TEST:
-            continue
-        runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[modified_project])
+        if modified_project in DEPENDENT_RUNTIMES_TO_TEST:
+            runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[modified_project])
     return _exclude_projects(runtimes_to_test, platform)
 
 
@@ -207,11 +203,10 @@ def _compute_runtimes_to_test_needs_reconfig(
 ) -> Set[str]:
     runtimes_to_test = set()
     for modified_project in modified_projects:
-        if modified_project not in DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG:
-            continue
-        runtimes_to_test.update(
-            DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG[modified_project]
-        )
+        if modified_project in DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG:
+            runtimes_to_test.update(
+                DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG[modified_project]
+            )
     return _exclude_projects(runtimes_to_test, platform)
 
 
From 80b9fcf8fdf2a44da291b41d9244aa99e867f26c Mon Sep 17 00:00:00 2001
From: eleviant <56861949+eleviant@users.noreply.github.com>
Date: Sun, 22 Jun 2025 18:59:08 +0200
Subject: [PATCH 1203/1322] Revert "[lldb] Fix qEcho message handling
 (#145072)" (#145241)

Temporarily revert commit e066f35c6981c720e3a7e5883efc40c861b3b7ee,
because lldb tests randomly hang after it's been pushed.
---
 .../Python/lldbsuite/test/gdbclientutils.py   | 10 ---
 .../gdb-remote/GDBRemoteCommunication.cpp     |  3 +-
 .../GDBRemoteCommunicationClient.cpp          |  2 +-
 .../gdb_remote_client/TestGDBRemoteClient.py  | 72 -------------------
 4 files changed, 2 insertions(+), 85 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
index b603c35c8df0..753de22b9cfe 100644
--- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
+++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py
@@ -92,9 +92,6 @@ class MockGDBServerResponder:
     class RESPONSE_DISCONNECT:
         pass
 
-    class RESPONSE_NONE:
-        pass
-
     def __init__(self):
         self.packetLog = []
 
@@ -184,8 +181,6 @@ class MockGDBServerResponder:
             return self.qQueryGDBServer()
         if packet == "qHostInfo":
             return self.qHostInfo()
-        if packet.startswith("qEcho"):
-            return self.qEcho(int(packet.split(":")[1]))
         if packet == "qGetWorkingDir":
             return self.qGetWorkingDir()
         if packet == "qOffsets":
@@ -242,9 +237,6 @@ class MockGDBServerResponder:
     def qHostInfo(self):
         return "ptrsize:8;endian:little;"
 
-    def qEcho(self):
-        return "E04"
-
     def qQueryGDBServer(self):
         return "E04"
 
@@ -663,8 +655,6 @@ class MockGDBServer:
         if not isinstance(response, list):
             response = [response]
         for part in response:
-            if part is MockGDBServerResponder.RESPONSE_NONE:
-                continue
             if part is MockGDBServerResponder.RESPONSE_DISCONNECT:
                 raise self.TerminateConnectionException()
             self._sendPacket(part)
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
index f244f7abe45e..2aea7c6b781d 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
@@ -354,9 +354,8 @@ GDBRemoteCommunication::WaitForPacketNoLock(StringExtractorGDBRemote &packet,
             disconnected = true;
             Disconnect();
           }
-        } else {
-          timed_out = true;
         }
+        timed_out = true;
         break;
       case eConnectionStatusSuccess:
         // printf ("status = success but error = %s\n",
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
index d8130cae71ce..adbf06b9a19a 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
@@ -406,7 +406,7 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() {
         m_supports_qXfer_memory_map_read = eLazyBoolYes;
       else if (x == "qXfer:siginfo:read+")
         m_supports_qXfer_siginfo_read = eLazyBoolYes;
-      else if (x == "qEcho+")
+      else if (x == "qEcho")
         m_supports_qEcho = eLazyBoolYes;
       else if (x == "QPassSignals+")
         m_supports_QPassSignals = eLazyBoolYes;
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py
index 12b464d3397e..08ac9290ee85 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py
@@ -356,78 +356,6 @@ class TestGDBRemoteClient(GDBRemoteTestBase):
             ["vRun;%s;61726731;61726732;61726733" % (exe_hex,)]
         )
 
-    def test_launch_lengthy_vRun(self):
-        class MyResponder(MockGDBServerResponder):
-            def __init__(self, *args, **kwargs):
-                self.started = False
-                return super().__init__(*args, **kwargs)
-
-            def qC(self):
-                if self.started:
-                    return "QCp10.10"
-                else:
-                    return "E42"
-
-            def qfThreadInfo(self):
-                if self.started:
-                    return "mp10.10"
-                else:
-                    return "E42"
-
-            def qsThreadInfo(self):
-                return "l"
-
-            def qEcho(self, num):
-                resp = "qEcho:" + str(num)
-                if num >= 2:
-                    # We have launched our program
-                    self.started = True
-                    return [resp, "T13"]
-
-                return resp
-
-            def qSupported(self, client_supported):
-                return "PacketSize=3fff;QStartNoAckMode+;qEcho+;"
-
-            def qHostInfo(self):
-                return "default_packet_timeout:1;"
-
-            def vRun(self, packet):
-                return [self.RESPONSE_NONE]
-
-            def A(self, packet):
-                return "E28"
-
-        self.server.responder = MyResponder()
-
-        target = self.createTarget("a.yaml")
-        # NB: apparently GDB packets are using "/" on Windows too
-        exe_path = self.getBuildArtifact("a").replace(os.path.sep, "/")
-        exe_hex = binascii.b2a_hex(exe_path.encode()).decode()
-        process = self.connect(target)
-        lldbutil.expect_state_changes(
-            self, self.dbg.GetListener(), process, [lldb.eStateConnected]
-        )
-
-        process = target.Launch(
-            lldb.SBListener(),
-            ["arg1", "arg2", "arg3"],  # argv
-            [],  # envp
-            None,  # stdin_path
-            None,  # stdout_path
-            None,  # stderr_path
-            None,  # working_directory
-            0,  # launch_flags
-            True,  # stop_at_entry
-            lldb.SBError(),
-        )  # error
-        self.assertTrue(process, PROCESS_IS_VALID)
-        self.assertEqual(process.GetProcessID(), 16)
-
-        self.assertPacketLogContains(
-            ["vRun;%s;61726731;61726732;61726733" % (exe_hex,)]
-        )
-
     def test_launch_QEnvironment(self):
         class MyResponder(MockGDBServerResponder):
             def qC(self):

From 214ca3161bf50267881ce589194699a0336ee3d9 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Sun, 22 Jun 2025 17:12:16 +0000
Subject: [PATCH 1204/1322] [CI] Test all projects when CI scripts change

This patch resolves a fixme in the compute_projects script to actually
test all the projects we can when touching something in the .ci
directory. This ensures we test things like compiler-rt before landing
changes.

Reviewers: gburgessiv, lnihlen, cmtice

Reviewed By: cmtice, gburgessiv

Pull Request: https://github.com/llvm/llvm-project/pull/144034
---
 .ci/compute_projects.py      | 16 ++++++++++++++--
 .ci/compute_projects_test.py | 11 +++++++----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index 32ddf525bcfb..80e4e0221b3d 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -52,8 +52,19 @@ DEPENDENTS_TO_TEST = {
     "clang": {"clang-tools-extra", "cross-project-tests"},
     "mlir": {"flang"},
     # Test everything if ci scripts are changed.
-    # FIXME: Figure out what is missing and add here.
-    ".ci": {"llvm", "clang", "lld", "lldb"},
+    ".ci": {
+        "llvm",
+        "clang",
+        "lld",
+        "lldb",
+        "bolt",
+        "clang-tools-extra",
+        "mlir",
+        "polly",
+        "flang",
+        "libclc",
+        "openmp",
+    },
 }
 
 # This mapping describes runtimes that should be enabled for a specific project,
@@ -66,6 +77,7 @@ DEPENDENT_RUNTIMES_TO_BUILD = {"lldb": {"libcxx", "libcxxabi", "libunwind"}}
 DEPENDENT_RUNTIMES_TO_TEST = {
     "clang": {"compiler-rt"},
     "clang-tools-extra": {"libc"},
+    ".ci": {"compiler-rt", "libc"},
 }
 DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG = {
     "llvm": {"libcxx", "libcxxabi", "libunwind"},
diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py
index 6bc2e34a1cbe..5efac2636698 100644
--- a/.ci/compute_projects_test.py
+++ b/.ci/compute_projects_test.py
@@ -221,18 +221,21 @@ class TestComputeProjects(unittest.TestCase):
         env_variables = compute_projects.get_env_variables(
             [".ci/compute_projects.py"], "Linux"
         )
-        self.assertEqual(env_variables["projects_to_build"], "clang;lld;lldb;llvm")
+        self.assertEqual(
+            env_variables["projects_to_build"],
+            "bolt;clang;clang-tools-extra;flang;libclc;lld;lldb;llvm;mlir;polly",
+        )
         self.assertEqual(
             env_variables["project_check_targets"],
-            "check-clang check-lld check-lldb check-llvm",
+            "check-bolt check-clang check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
         )
         self.assertEqual(
             env_variables["runtimes_to_build"],
-            "libcxx;libcxxabi;libunwind",
+            "compiler-rt;libc;libcxx;libcxxabi;libunwind",
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
+            "check-compiler-rt check-libc",
         )
         self.assertEqual(
             env_variables["runtimes_check_targets_needs_reconfig"],

From 99cdc26c94eed43f7619a352a1441c193700107d Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Sun, 22 Jun 2025 17:17:12 +0000
Subject: [PATCH 1205/1322] [CI] Cleanup buildkite test report script

This patch removes the generate_test_report_buildkite script entrypoint
as we no longer need it now that buildkite has been sunsetted. Also
remove the calls in the monolithic-* scripts given they are adding
complexity for no value.

Also remove the generate-buildkite-pipeline-premerge script as it is
no longer needed.

Reviewers: tstellar, DavidSpickett, lnihlen, cmtice

Reviewed By: DavidSpickett

Pull Request: https://github.com/llvm/llvm-project/pull/143480
---
 .ci/generate-buildkite-pipeline-premerge | 131 -----------------------
 .ci/generate_test_report_buildkite.py    |  57 ----------
 .ci/monolithic-linux.sh                  |  11 +-
 .ci/monolithic-windows.sh                |  11 +-
 4 files changed, 6 insertions(+), 204 deletions(-)
 delete mode 100755 .ci/generate-buildkite-pipeline-premerge
 delete mode 100644 .ci/generate_test_report_buildkite.py

diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
deleted file mode 100755
index 5e5f916f35b7..000000000000
--- a/.ci/generate-buildkite-pipeline-premerge
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env bash
-#===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-#===----------------------------------------------------------------------===##
-
-#
-# This file generates a Buildkite pipeline that triggers the various CI jobs for
-# the LLVM project during pre-commit CI.
-#
-# See https://buildkite.com/docs/agent/v3/cli-pipeline#pipeline-format.
-#
-# As this outputs a yaml file, it's possible to log messages to stderr or
-# prefix with "#".
-
-
-set -eu
-set -o pipefail
-
-# Environment variables script works with:
-
-# Set by buildkite
-: ${BUILDKITE_PULL_REQUEST_BASE_BRANCH:=}
-: ${BUILDKITE_COMMIT:=}
-: ${BUILDKITE_BRANCH:=}
-# Fetch origin to have an up to date merge base for the diff.
-git fetch origin
-# List of files affected by this commit
-: ${MODIFIED_FILES:=$(git diff --name-only origin/${BUILDKITE_PULL_REQUEST_BASE_BRANCH}...HEAD)}
-# Filter rules for generic windows tests
-: ${WINDOWS_AGENTS:='{"queue": "windows"}'}
-# Filter rules for generic linux tests
-: ${LINUX_AGENTS:='{"queue": "linux"}'}
-
-reviewID="$(git log --format=%B -n 1 | sed -nE 's/^Review-ID:[[:space:]]*(.+)$/\1/p')"
-if [[ "${reviewID}" != "" ]]; then
-  buildMessage="https://llvm.org/${reviewID}"
-else
-  buildMessage="Push to branch ${BUILDKITE_BRANCH}"
-fi
-
-cat <<EOF
-steps:
-EOF
-
-echo "Files modified:" >&2
-echo "$MODIFIED_FILES" >&2
-modified_dirs=$(echo "$MODIFIED_FILES" | cut -d'/' -f1 | sort -u)
-echo "Directories modified:" >&2
-echo "$modified_dirs" >&2
-
-# Project specific pipelines.
-
-# If libc++ or one of the runtimes directories changed.
-if echo "$modified_dirs" | grep -q -E "^(libcxx|libcxxabi|libunwind|runtimes|cmake)$"; then
-  cat <<EOF
-- trigger: "libcxx-ci"
-  build:
-    message: "${buildMessage}"
-    commit: "${BUILDKITE_COMMIT}"
-    branch: "${BUILDKITE_BRANCH}"
-EOF
-fi
-
-# Generic pipeline for projects that have not defined custom steps.
-#
-# Individual projects should instead define the pre-commit CI tests that suits their
-# needs while letting them run on the infrastructure provided by LLVM.
-
-# Figure out which projects need to be built on each platform
-source <(git diff --name-only origin/${BUILDKITE_PULL_REQUEST_BASE_BRANCH}...HEAD | python3 .ci/compute_projects.py Linux)
-linux_projects=${projects_to_build}
-linux_check_targets=${project_check_targets}
-linux_runtimes=${runtimes_to_build}
-linux_runtime_check_targets=${runtimes_check_targets}
-
-source <(git diff --name-only origin/${BUILDKITE_PULL_REQUEST_BASE_BRANCH}...HEAD | python3 .ci/compute_projects.py Windows)
-windows_projects=${projects_to_build}
-windows_check_targets=${project_check_targets}
-
-
-# Generate the appropriate pipeline
-if [[ "${linux_projects}" != "" ]]; then
-  cat <<EOF
-- label: ':linux: Linux x64'
-  artifact_paths:
-  - 'artifacts/**/*'
-  - '*_result.json'
-  - 'build/test-results.*.xml'
-  agents: ${LINUX_AGENTS}
-  retry:
-    automatic:
-      - exit_status: -1  # Agent was lost
-        limit: 2
-      - exit_status: 255 # Forced agent shutdown
-        limit: 2
-  timeout_in_minutes: 120
-  env:
-    CC: 'clang'
-    CXX: 'clang++'
-  commands:
-  - './.ci/monolithic-linux.sh "$(echo ${linux_projects} | tr ' ' ';')" "$(echo ${linux_check_targets})" "$(echo ${linux_runtimes} | tr ' ' ';')" "$(echo ${linux_runtime_check_targets})"'
-EOF
-fi
-
-if [[ "${windows_projects}" != "" ]]; then
-  cat <<EOF
-- label: ':windows: Windows x64'
-  artifact_paths:
-  - 'artifacts/**/*'
-  - '*_result.json'
-  - 'build/test-results.*.xml'
-  agents: ${WINDOWS_AGENTS}
-  retry:
-    automatic:
-      - exit_status: -1  # Agent was lost
-        limit: 2
-      - exit_status: 255 # Forced agent shutdown
-        limit: 2
-  timeout_in_minutes: 150
-  env:
-    MAX_PARALLEL_COMPILE_JOBS: '16'
-    MAX_PARALLEL_LINK_JOBS: '4'
-  commands:
-  - 'C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64'
-  - 'bash .ci/monolithic-windows.sh "$(echo ${windows_projects} | tr ' ' ';')" "$(echo ${windows_check_targets})"'
-EOF
-fi
diff --git a/.ci/generate_test_report_buildkite.py b/.ci/generate_test_report_buildkite.py
deleted file mode 100644
index 82bbc6d1d32d..000000000000
--- a/.ci/generate_test_report_buildkite.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Script to generate a build report for buildkite."""
-
-import argparse
-import os
-import subprocess
-
-import generate_test_report_lib
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "title", help="Title of the test report, without Markdown formatting."
-    )
-    parser.add_argument("context", help="Annotation context to write to.")
-    parser.add_argument("return_code", help="The build's return code.", type=int)
-    parser.add_argument("junit_files", help="Paths to JUnit report files.", nargs="*")
-    args = parser.parse_args()
-
-    # All of these are required to build a link to download the log file.
-    env_var_names = [
-        "BUILDKITE_ORGANIZATION_SLUG",
-        "BUILDKITE_PIPELINE_SLUG",
-        "BUILDKITE_BUILD_NUMBER",
-        "BUILDKITE_JOB_ID",
-    ]
-    buildkite_info = {k: v for k, v in os.environ.items() if k in env_var_names}
-    if len(buildkite_info) != len(env_var_names):
-        buildkite_info = None
-
-    report, style = generate_test_report_lib.generate_report_from_files(
-        args.title, args.return_code, args.junit_files, buildkite_info
-    )
-
-    if report:
-        p = subprocess.Popen(
-            [
-                "buildkite-agent",
-                "annotate",
-                "--context",
-                args.context,
-                "--style",
-                style,
-            ],
-            stdin=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            universal_newlines=True,
-        )
-
-        # The report can be larger than the buffer for command arguments so we send
-        # it over stdin instead.
-        _, err = p.communicate(input=report)
-        if p.returncode:
-            raise RuntimeError(f"Failed to send report to buildkite-agent:\n{err}")
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index c350a5867914..89447963b852 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -42,14 +42,9 @@ function at-exit {
 
   # If building fails there will be no results files.
   shopt -s nullglob
-  if command -v buildkite-agent 2>&1 >/dev/null
-  then
-    python3 "${MONOREPO_ROOT}"/.ci/generate_test_report_buildkite.py ":linux: Linux x64 Test Results" \
-      "linux-x64-test-results" $retcode "${BUILD_DIR}"/test-results.*.xml
-  else
-    python3 "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py ":penguin: Linux x64 Test Results" \
-      $retcode "${BUILD_DIR}"/test-results.*.xml >> $GITHUB_STEP_SUMMARY
-  fi
+  
+  python3 "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py ":penguin: Linux x64 Test Results" \
+    $retcode "${BUILD_DIR}"/test-results.*.xml >> $GITHUB_STEP_SUMMARY
 }
 trap at-exit EXIT
 
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index a0997420b0d3..dc2913830e92 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -37,14 +37,9 @@ function at-exit {
 
   # If building fails there will be no results files.
   shopt -s nullglob
-  if command -v buildkite-agent 2>&1 >/dev/null
-  then
-    python "${MONOREPO_ROOT}"/.ci/generate_test_report_buildkite.py ":windows: Windows x64 Test Results" \
-      "windows-x64-test-results" $retcode "${BUILD_DIR}"/test-results.*.xml
-  else
-    python "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py ":window: Windows x64 Test Results" \
-      $retcode "${BUILD_DIR}"/test-results.*.xml >> $GITHUB_STEP_SUMMARY
-  fi
+    
+  python "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py ":window: Windows x64 Test Results" \
+    $retcode "${BUILD_DIR}"/test-results.*.xml >> $GITHUB_STEP_SUMMARY
 }
 trap at-exit EXIT
 

From f9c9a32e9cbe3547631db4c55596bdfde76c44dc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 22 Jun 2025 20:34:13 +0100
Subject: [PATCH 1206/1322] [LV] Add tests with fmax reductions without
 fast-math flags.

Adds extra tests with fmax reductions without fast-math flags for
upcoming patches.
---
 .../AArch64/fmax-without-fast-math-flags.ll   |  74 ++++
 ...fmax-without-fast-math-flags-interleave.ll |  74 ++++
 .../fmax-without-fast-math-flags.ll           | 361 ++++++++++++++++++
 3 files changed, 509 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
new file mode 100644
index 000000000000..77b40dabae1e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -mtriple=arm64-apple-macosx -S %s | FileCheck %s
+
+define float @fmax_ugt_with_select(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmax_ugt_with_select(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[L]], [[MAX]]
+; CHECK-NEXT:    [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %cmp = fcmp ugt float %l, %max
+  %max.next = select i1 %cmp, float %l, float %max
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %max, float %l)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
new file mode 100644
index 000000000000..fb68d4cbd9e4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s
+
+define float @fmax_ugt_with_select(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmax_ugt_with_select(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[L]], [[MAX]]
+; CHECK-NEXT:    [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %cmp = fcmp ugt float %l, %max
+  %max.next = select i1 %cmp, float %l, float %max
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %max, float %l)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
new file mode 100644
index 000000000000..3a8ef7e0b08c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
@@ -0,0 +1,361 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
+
+define float @fmax_ugt_with_select_1(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmax_ugt_with_select_1(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[L]], [[MAX]]
+; CHECK-NEXT:    [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %cmp = fcmp ugt float %l, %max
+  %max.next = select i1 %cmp, float %l, float %max
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmax_ugt_with_select_2(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmax_ugt_with_select_2(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[MAX]], [[L]]
+; CHECK-NEXT:    [[MAX_NEXT]] = select i1 [[CMP]], float [[MAX]], float [[L]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %cmp = fcmp ugt float %max, %l
+  %max.next = select i1 %cmp, float %max, float %l
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmax_ogt_with_select_1(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmax_ogt_with_select_1(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[L]], [[MAX]]
+; CHECK-NEXT:    [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %cmp = fcmp ogt float %l, %max
+  %max.next = select i1 %cmp, float %l, float %max
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmax_ogt_with_select_2(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmax_ogt_with_select_2(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[MAX]], [[L]]
+; CHECK-NEXT:    [[MAX_NEXT]] = select i1 [[CMP]], float [[MAX]], float [[L]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %cmp = fcmp ogt float %max, %l
+  %max.next = select i1 %cmp, float %max, float %l
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmax_ugt_with_select_store_result(ptr %src, ptr %dst, i64 %n) {
+; CHECK-LABEL: define float @fmax_ugt_with_select_store_result(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[L]], [[MAX]]
+; CHECK-NEXT:    [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]]
+; CHECK-NEXT:    store float [[MAX_NEXT]], ptr [[DST]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %cmp = fcmp ugt float %l, %max
+  %max.next = select i1 %cmp, float %l, float %max
+  store float %max.next, ptr %dst, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum_1(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_1(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum_2(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_2(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %max, float %l)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmax_select_with_blend(ptr %A, ptr %B) {
+; CHECK-LABEL: define float @fmax_select_with_blend(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT:    [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i32 [[L_A]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[LOOP_THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_THEN]]:
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_B]], align 4
+; CHECK-NEXT:    [[C_2:%.*]] = fcmp ogt float [[MAX]], [[L]]
+; CHECK-NEXT:    [[MAX_SEL:%.*]] = select i1 [[C_2]], float [[MAX]], float [[L]]
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[MAX_NEXT]] = phi float [ [[MAX_SEL]], %[[LOOP_THEN]] ], [ [[MAX]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %max = phi float [ 0.000000e+00, %entry ], [ %max.next, %loop.latch ]
+  %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+  %l.A = load i32, ptr %gep.A
+  %c.1 = icmp eq i32 %l.A, 0
+  br i1 %c.1, label %loop.then, label %loop.latch
+
+loop.then:
+  %gep.B = getelementptr inbounds float, ptr %B, i64 %iv
+  %l = load float, ptr %gep.B
+  %c.2 = fcmp ogt float %max, %l
+  %max.sel = select i1 %c.2, float %max, float %l
+  br label %loop.latch
+
+loop.latch:
+  %max.next = phi float [ %max.sel, %loop.then ], [ %max, %loop ]
+  %iv.next = add i64 %iv, 1
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret float %max.next
+}
+
+define float @fmax_with_select_and_load_store(ptr %src, ptr noalias %dst, i64 %n) {
+; CHECK-LABEL: define float @fmax_with_select_and_load_store(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[L]], [[MAX]]
+; CHECK-NEXT:    [[IV_1:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV_1]]
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[GEP_DST_1]], align 4
+; CHECK-NEXT:    [[GEP_DST_0:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[L_2]], ptr [[GEP_DST_0]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %cmp = fcmp ugt float %l, %max
+  %iv.1 = add i64 %iv, 1
+  %gep.dst.1 = getelementptr inbounds i32, ptr %dst, i64 %iv.1
+  %l.2 = load i32, ptr %gep.dst.1
+  %gep.dst.0 = getelementptr inbounds i32, ptr %dst, i64 %iv
+  store i32 %l.2, ptr %gep.dst.0
+  %max.next = select i1 %cmp, float %l, float %max
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}

From 40eee8ec7fb64dd1c6f15e8cd6e087f5d9f9d37e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Sun, 22 Jun 2025 12:52:05 -0700
Subject: [PATCH 1207/1322] [AMDGPU] Add s_setprio_inc_wg gfx1250 instruction
 (#145152)

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  6 ++++
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  | 12 +++++++
 .../builtins-amdgcn-error-gfx1250-param.cl    |  6 ++++
 .../builtins-amdgcn-error-gfx1250.cl          |  6 ++++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  5 +++
 llvm/lib/Target/AMDGPU/AMDGPU.td              | 10 ++++++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  6 ++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  6 +++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  3 +-
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  6 ++++
 .../AMDGPU/llvm.amdgcn.s.setprio.inc.wg.ll    | 34 +++++++++++++++++++
 llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s        |  4 +++
 .../Disassembler/AMDGPU/gfx1250_dasm_sopp.txt |  3 ++
 13 files changed, 105 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.inc.wg.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 802b4be42419..edb3a17ac07c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -636,5 +636,11 @@ TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUi", "nc", "bitop3-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
 
+//===----------------------------------------------------------------------===//
+// GFX1250+ only builtins.
+//===----------------------------------------------------------------------===//
+
+TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
new file mode 100644
index 000000000000..3709b1ff52f3
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -0,0 +1,12 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+// CHECK-LABEL: @test_setprio_inc_wg(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.setprio.inc.wg(i16 10)
+// CHECK-NEXT:    ret void
+//
+void test_setprio_inc_wg() {
+  __builtin_amdgcn_s_setprio_inc_wg(10);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
new file mode 100644
index 000000000000..b69fcb5f445b
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -0,0 +1,6 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
+
+void test_setprio_inc_wg(short a) {
+  __builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250.cl
new file mode 100644
index 000000000000..c5440ed1a75a
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250.cl
@@ -0,0 +1,6 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1200 -verify -S -o - %s
+
+void test() {
+  __builtin_amdgcn_s_setprio_inc_wg(1); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' needs target feature setprio-inc-wg-inst}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 412993755dac..51dfe53aa00e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2054,6 +2054,11 @@ def int_amdgcn_s_setprio :
   DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                 IntrHasSideEffects]>;
 
+def int_amdgcn_s_setprio_inc_wg :
+  ClangBuiltin<"__builtin_amdgcn_s_setprio_inc_wg">,
+  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
+                                IntrHasSideEffects]>;
+
 def int_amdgcn_s_ttracedata :
   ClangBuiltin<"__builtin_amdgcn_s_ttracedata">,
   DefaultAttrsIntrinsic<[], [llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ab83cf9e7395..9c27fa0c5d15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1118,6 +1118,12 @@ def FeatureWaitXcnt : SubtargetFeature<"wait-xcnt",
   "Has s_wait_xcnt instruction"
 >;
 
+def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst",
+  "HasSetPrioIncWgInst",
+  "true",
+  "Has s_setprio_inc_wg instruction."
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -1940,6 +1946,7 @@ def FeatureISAVersion12_50 : FeatureSet<
    FeatureMemoryAtomicFAddF32DenormalSupport,
    FeatureKernargPreload,
    FeatureLshlAddU64Inst,
+   FeatureSetPrioIncWgInst,
 ]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
@@ -2662,6 +2669,9 @@ def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
 def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
                         AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
 
+def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">,
+ AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>;
+
 // Include AMDGPU TD files
 include "SISchedule.td"
 include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4ec60dc2752e..fce46a6f7224 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -262,6 +262,7 @@ protected:
   bool HasMinimum3Maximum3PKF16 = false;
   bool HasLshlAddU64Inst = false;
   bool HasPointSampleAccel = false;
+  bool HasSetPrioIncWgInst = false;
 
   bool RequiresCOV6 = false;
   bool UseBlockVGPROpsForCSR = false;
@@ -1465,6 +1466,11 @@ public:
   /// values.
   bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
 
+  bool hasGFX1250Insts() const { return GFX1250Insts; }
+
+  // \returns true if target has S_SETPRIO_INC_WG instruction.
+  bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
+
   // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
   // of sign-extending.
   bool hasGetPCZeroExtension() const { return GFX12Insts; }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a538ec9df6f0..333e91bf37df 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9575,7 +9575,8 @@ static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
   case AMDGPUSubtarget::GFX11:
     return SIEncodingFamily::GFX11;
   case AMDGPUSubtarget::GFX12:
-    return SIEncodingFamily::GFX12;
+    return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
+                                : SIEncodingFamily::GFX12;
   }
   llvm_unreachable("Unknown subtarget generation!");
 }
@@ -9669,6 +9670,9 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
 
   int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
 
+  if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
+    MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
+
   // -1 means that Opcode is already a native instruction.
   if (MCOp == -1)
     return Opcode;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 53c0635f02bf..6d6c2af7ce49 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -3069,7 +3069,8 @@ def getMCOpcodeGen : InstrMapping {
                    [!cast<string>(SIEncodingFamily.GFX90A)],
                    [!cast<string>(SIEncodingFamily.GFX940)],
                    [!cast<string>(SIEncodingFamily.GFX11)],
-                   [!cast<string>(SIEncodingFamily.GFX12)]];
+                   [!cast<string>(SIEncodingFamily.GFX12)],
+                   [!cast<string>(SIEncodingFamily.GFX1250)]];
 }
 
 // Get equivalent SOPK instruction.
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 8c739c2760b1..376c6eb135b1 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1632,6 +1632,11 @@ def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16",
   [(int_amdgcn_s_setprio timm:$simm16)]> {
 }
 
+def S_SETPRIO_INC_WG : SOPP_Pseudo <"s_setprio_inc_wg", (ins i16imm:$simm16), "$simm16",
+  [(int_amdgcn_s_setprio_inc_wg timm:$simm16)]> {
+  let SubtargetPredicate = HasSetPrioIncWgInst;
+}
+
 let Uses = [EXEC, M0] in {
 def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsg:$simm16), "$simm16",
   [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> {
@@ -2594,6 +2599,7 @@ defm S_WAIT_STORECNT_DSCNT  : SOPP_Real_32_gfx12<0x049>;
 //===----------------------------------------------------------------------===//
 // SOPP - GFX1250 only.
 //===----------------------------------------------------------------------===//
+defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12<0x03e>;
 defm S_WAIT_XCNT      : SOPP_Real_32_gfx12<0x045>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.inc.wg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.inc.wg.ll
new file mode 100644
index 000000000000..54d996677d31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.inc.wg.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -show-mc-encoding < %s | FileCheck -check-prefix=GFX1250 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -show-mc-encoding < %s | FileCheck -check-prefix=GFX1250 %s
+
+declare void @llvm.amdgcn.s.setprio.inc.wg(i16) #0
+
+define void @test_llvm.amdgcn.s.setprio.inc.wg() #0 {
+; GFX1250-LABEL: test_llvm.amdgcn.s.setprio.inc.wg:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    s_setprio_inc_wg 0 ; encoding: [0x00,0x00,0xbe,0xbf]
+; GFX1250-NEXT:    s_setprio_inc_wg 1 ; encoding: [0x01,0x00,0xbe,0xbf]
+; GFX1250-NEXT:    s_setprio_inc_wg 2 ; encoding: [0x02,0x00,0xbe,0xbf]
+; GFX1250-NEXT:    s_setprio_inc_wg 3 ; encoding: [0x03,0x00,0xbe,0xbf]
+; GFX1250-NEXT:    s_setprio_inc_wg 10 ; encoding: [0x0a,0x00,0xbe,0xbf]
+; GFX1250-NEXT:    s_setprio_inc_wg -1 ; encoding: [0xff,0xff,0xbe,0xbf]
+; GFX1250-NEXT:    s_setprio_inc_wg 0 ; encoding: [0x00,0x00,0xbe,0xbf]
+; GFX1250-NEXT:    s_setprio_inc_wg 1 ; encoding: [0x01,0x00,0xbe,0xbf]
+; GFX1250-NEXT:    s_setprio_inc_wg -1 ; encoding: [0xff,0xff,0xbe,0xbf]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+  call void @llvm.amdgcn.s.setprio.inc.wg(i16 0)
+  call void @llvm.amdgcn.s.setprio.inc.wg(i16 1)
+  call void @llvm.amdgcn.s.setprio.inc.wg(i16 2)
+  call void @llvm.amdgcn.s.setprio.inc.wg(i16 3)
+  call void @llvm.amdgcn.s.setprio.inc.wg(i16 10)
+  call void @llvm.amdgcn.s.setprio.inc.wg(i16 65535)
+  call void @llvm.amdgcn.s.setprio.inc.wg(i16 65536)
+  call void @llvm.amdgcn.s.setprio.inc.wg(i16 65537)
+  call void @llvm.amdgcn.s.setprio.inc.wg(i16 -1)
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
index 1aca88771c1f..48ec44b410c2 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
@@ -12,3 +12,7 @@ s_wait_xcnt 0x7
 s_wait_xcnt 0xf
 // GFX1250: [0x0f,0x00,0xc5,0xbf]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_setprio_inc_wg 100
+// GFX1250: [0x64,0x00,0xbe,0xbf]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
index e785fe9cc6d5..55f74d3a31bf 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
@@ -8,3 +8,6 @@
 
 # GFX1250: s_wait_xcnt 0xf ; encoding: [0x0f,0x00,0xc5,0xbf]
 0x0f,0x00,0xc5,0xbf
+
+# GFX1250: s_setprio_inc_wg 0x64 ; encoding: [0x64,0x00,0xbe,0xbf]
+0x64,0x00,0xbe,0xbf

From 8f01edfa112f76050cf812a64f4e9c7baea52375 Mon Sep 17 00:00:00 2001
From: Benji Smith <6193112+Benjins@users.noreply.github.com>
Date: Sun, 22 Jun 2025 18:05:17 -0400
Subject: [PATCH 1208/1322] [C API] Add getter/setter for samesign flag on icmp
 (#145247)

This was added to the C++ API in
https://github.com/llvm/llvm-project/pull/111419 so this change adds
accessors in the C API, along with a couple tests.
---
 llvm/docs/ReleaseNotes.md         |  3 +++
 llvm/include/llvm-c/Core.h        | 18 ++++++++++++++++++
 llvm/lib/IR/Core.cpp              |  8 ++++++++
 llvm/test/Bindings/llvm-c/echo.ll |  8 ++++++++
 llvm/tools/llvm-c-test/echo.cpp   |  2 ++
 5 files changed, 39 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 0395f43c6195..95bcc1a1f3f5 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -251,6 +251,9 @@ Changes to the C API
 * Added ``LLVMDIBuilderCreateEnumeratorOfArbitraryPrecision`` for creating
   debugging metadata of enumerators larger than 64 bits.
 
+* Added ``LLVMGetICmpSameSign`` and ``LLVMSetICmpSameSign`` for the `samesign`
+  flag on `icmp` instructions.
+
 Changes to the CodeGen infrastructure
 -------------------------------------
 
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 6857944e6875..3f30ed92997b 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -3675,6 +3675,24 @@ LLVMOpcode LLVMGetInstructionOpcode(LLVMValueRef Inst);
  */
 LLVMIntPredicate LLVMGetICmpPredicate(LLVMValueRef Inst);
 
+/**
+ * Get whether or not an icmp instruction has the samesign flag.
+ *
+ * This is only valid for instructions that correspond to llvm::ICmpInst.
+ *
+ * @see llvm::ICmpInst::hasSameSign()
+ */
+LLVMBool LLVMGetICmpSameSign(LLVMValueRef Inst);
+
+/**
+ * Set the samesign flag on an icmp instruction.
+ *
+ * This is only valid for instructions that correspond to llvm::ICmpInst.
+ *
+ * @see llvm::ICmpInst::setSameSign()
+ */
+void LLVMSetICmpSameSign(LLVMValueRef Inst, LLVMBool SameSign);
+
 /**
  * Obtain the float predicate of an instruction.
  *
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 9810f04cc503..f7ef4aa473ef 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -2951,6 +2951,14 @@ LLVMIntPredicate LLVMGetICmpPredicate(LLVMValueRef Inst) {
   return (LLVMIntPredicate)0;
 }
 
+LLVMBool LLVMGetICmpSameSign(LLVMValueRef Inst) {
+  return unwrap<ICmpInst>(Inst)->hasSameSign();
+}
+
+void LLVMSetICmpSameSign(LLVMValueRef Inst, LLVMBool SameSign) {
+  unwrap<ICmpInst>(Inst)->setSameSign(SameSign);
+}
+
 LLVMRealPredicate LLVMGetFCmpPredicate(LLVMValueRef Inst) {
   if (FCmpInst *I = dyn_cast<FCmpInst>(unwrap(Inst)))
     return (LLVMRealPredicate)I->getPredicate();
diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll
index 0a688afab612..ab1771d1f879 100644
--- a/llvm/test/Bindings/llvm-c/echo.ll
+++ b/llvm/test/Bindings/llvm-c/echo.ll
@@ -417,6 +417,14 @@ define ptr @test_gep_no_wrap_flags(ptr %0) {
   ret ptr %gep.nusw
 }
 
+define void @test_icmp_same_sign(i32 %a, i32 %b) {
+  %icmp.1 = icmp eq i32 %a, %b
+  %icmp.2 = icmp slt i32 %a, %b
+  %icmp.3 = icmp samesign eq i32 %a, %b
+  %icmp.4 = icmp samesign slt i32 %a, %b
+  ret void
+}
+
 !llvm.dbg.cu = !{!0, !2}
 !llvm.module.flags = !{!3}
 
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index 3ec40fdba0ba..026d815b43da 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -823,9 +823,11 @@ struct FunCloner {
       }
       case LLVMICmp: {
         LLVMIntPredicate Pred = LLVMGetICmpPredicate(Src);
+        LLVMBool IsSameSign = LLVMGetICmpSameSign(Src);
         LLVMValueRef LHS = CloneValue(LLVMGetOperand(Src, 0));
         LLVMValueRef RHS = CloneValue(LLVMGetOperand(Src, 1));
         Dst = LLVMBuildICmp(Builder, Pred, LHS, RHS, Name);
+        LLVMSetICmpSameSign(Dst, IsSameSign);
         break;
       }
       case LLVMPHI: {

From 4154ada1d485e652a7d0088e25959f101be79723 Mon Sep 17 00:00:00 2001
From: Andrei Safronov <andrei.safronov@espressif.com>
Date: Mon, 23 Jun 2025 01:18:04 +0300
Subject: [PATCH 1209/1322] [Xtensa] Implement Xtensa Floating Point Option.
 (#136086)

 Implement Xtensa FP Option instructions and lowering
of the base FP operations with tests. Implement UR registers parsing.
 Fix loading from constant pool callee, basic block, globaladdress and
jumptable addresses. Also fixed potential memory leakage when several
similar XtensaConstantPoolValue objects are created Fix lowering i32 immediate.
---
 .../Xtensa/AsmParser/XtensaAsmParser.cpp      |  51 +-
 .../Disassembler/XtensaDisassembler.cpp       |  61 +-
 .../MCTargetDesc/XtensaMCCodeEmitter.cpp      |   5 +
 .../MCTargetDesc/XtensaMCTargetDesc.cpp       |  13 +
 .../Xtensa/MCTargetDesc/XtensaMCTargetDesc.h  |   3 +
 llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp   |  33 +-
 llvm/lib/Target/Xtensa/XtensaCallingConv.td   |   1 +
 llvm/lib/Target/Xtensa/XtensaFeatures.td      |   5 +
 llvm/lib/Target/Xtensa/XtensaISelLowering.cpp | 157 ++++-
 llvm/lib/Target/Xtensa/XtensaISelLowering.h   |  21 +
 llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp    |  18 +-
 llvm/lib/Target/Xtensa/XtensaInstrInfo.td     | 320 +++++++++-
 .../Target/Xtensa/XtensaMachineFunctionInfo.h |   3 +
 llvm/lib/Target/Xtensa/XtensaOperators.td     |  17 +
 llvm/lib/Target/Xtensa/XtensaRegisterInfo.td  |  44 ++
 llvm/lib/Target/Xtensa/XtensaSubtarget.h      |   1 +
 llvm/test/CodeGen/Xtensa/add_shifted_imm.ll   |  53 ++
 llvm/test/CodeGen/Xtensa/aligned_alloc.ll     |   2 +-
 llvm/test/CodeGen/Xtensa/brcc_fp.ll           | 223 +++++++
 llvm/test/CodeGen/Xtensa/bswap.ll             |  39 +-
 llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll   |  15 +-
 llvm/test/CodeGen/Xtensa/float-arith.ll       | 603 ++++++++++++++++++
 llvm/test/CodeGen/Xtensa/mul.ll               | 296 ++++-----
 llvm/test/MC/Disassembler/Xtensa/fp.txt       | 215 +++++++
 llvm/test/MC/Xtensa/float-err.s               |  37 ++
 llvm/test/MC/Xtensa/float.s                   | 209 ++++++
 26 files changed, 2204 insertions(+), 241 deletions(-)
 create mode 100644 llvm/test/CodeGen/Xtensa/add_shifted_imm.ll
 create mode 100644 llvm/test/CodeGen/Xtensa/brcc_fp.ll
 create mode 100644 llvm/test/CodeGen/Xtensa/float-arith.ll
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/fp.txt
 create mode 100644 llvm/test/MC/Xtensa/float-err.s
 create mode 100644 llvm/test/MC/Xtensa/float.s

diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
index 6c4e365451af..4103ff6b877a 100644
--- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
+++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
@@ -35,7 +35,9 @@ using namespace llvm;
 struct XtensaOperand;
 
 class XtensaAsmParser : public MCTargetAsmParser {
+  const MCRegisterInfo &MRI;
 
+  enum XtensaRegisterType { Xtensa_Generic, Xtensa_SR, Xtensa_UR };
   SMLoc getLoc() const { return getParser().getTok().getLoc(); }
 
   XtensaTargetStreamer &getTargetStreamer() {
@@ -64,11 +66,12 @@ class XtensaAsmParser : public MCTargetAsmParser {
   ParseStatus parseImmediate(OperandVector &Operands);
   ParseStatus
   parseRegister(OperandVector &Operands, bool AllowParens = false,
-                bool SR = false,
+                XtensaRegisterType SR = Xtensa_Generic,
                 Xtensa::RegisterAccessType RAType = Xtensa::REGISTER_EXCHANGE);
   ParseStatus parseOperandWithModifier(OperandVector &Operands);
   bool
-  parseOperand(OperandVector &Operands, StringRef Mnemonic, bool SR = false,
+  parseOperand(OperandVector &Operands, StringRef Mnemonic,
+               XtensaRegisterType SR = Xtensa_Generic,
                Xtensa::RegisterAccessType RAType = Xtensa::REGISTER_EXCHANGE);
   bool ParseInstructionWithSR(ParseInstructionInfo &Info, StringRef Name,
                               SMLoc NameLoc, OperandVector &Operands);
@@ -90,7 +93,8 @@ public:
 
   XtensaAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
                   const MCInstrInfo &MII, const MCTargetOptions &Options)
-      : MCTargetAsmParser(Options, STI, MII) {
+      : MCTargetAsmParser(Options, STI, MII),
+        MRI(*Parser.getContext().getRegisterInfo()) {
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
@@ -583,7 +587,8 @@ bool XtensaAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc,
 }
 
 ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands,
-                                           bool AllowParens, bool SR,
+                                           bool AllowParens,
+                                           XtensaRegisterType RegType,
                                            Xtensa::RegisterAccessType RAType) {
   SMLoc FirstS = getLoc();
   bool HadParens = false;
@@ -594,25 +599,32 @@ ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands,
   if (AllowParens && getLexer().is(AsmToken::LParen)) {
     size_t ReadCount = getLexer().peekTokens(Buf);
     if (ReadCount == 2 && Buf[1].getKind() == AsmToken::RParen) {
-      if ((Buf[0].getKind() == AsmToken::Integer) && (!SR))
+      if (Buf[0].getKind() == AsmToken::Integer && RegType == Xtensa_Generic)
         return ParseStatus::NoMatch;
       HadParens = true;
       getParser().Lex(); // Eat '('
     }
   }
 
-  unsigned RegNo = 0;
+  MCRegister RegNo = 0;
 
   switch (getLexer().getKind()) {
   default:
     return ParseStatus::NoMatch;
   case AsmToken::Integer:
-    if (!SR)
+    if (RegType == Xtensa_Generic)
       return ParseStatus::NoMatch;
-    RegName = getLexer().getTok().getString();
-    RegNo = MatchRegisterName(RegName);
-    if (RegNo == 0)
+
+    // Parse case when we expect UR register code as special case,
+    // because SR and UR registers may have the same number
+    // and such situation may lead to confilct
+    if (RegType == Xtensa_UR) {
+      int64_t RegCode = getLexer().getTok().getIntVal();
+      RegNo = Xtensa::getUserRegister(RegCode, MRI);
+    } else {
+      RegName = getLexer().getTok().getString();
       RegNo = MatchRegisterAltName(RegName);
+    }
     break;
   case AsmToken::Identifier:
     RegName = getLexer().getTok().getIdentifier();
@@ -689,7 +701,8 @@ ParseStatus XtensaAsmParser::parseOperandWithModifier(OperandVector &Operands) {
 /// from this information, adding to Operands.
 /// If operand was parsed, returns false, else true.
 bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
-                                   bool SR, Xtensa::RegisterAccessType RAType) {
+                                   XtensaRegisterType RegType,
+                                   Xtensa::RegisterAccessType RAType) {
   // Check if the current operand has a custom associated parser, if so, try to
   // custom parse the operand, or fallback to the general approach.
   ParseStatus Res = MatchOperandParserImpl(Operands, Mnemonic);
@@ -703,7 +716,7 @@ bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
     return true;
 
   // Attempt to parse token as register
-  if (parseRegister(Operands, true, SR, RAType).isSuccess())
+  if (parseRegister(Operands, true, RegType, RAType).isSuccess())
     return false;
 
   // Attempt to parse token as an immediate
@@ -722,11 +735,9 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info,
                      : (Name[0] == 'r' ? Xtensa::REGISTER_READ
                                        : Xtensa::REGISTER_EXCHANGE);
 
-  if ((Name.starts_with("wsr.") || Name.starts_with("rsr.") ||
-       Name.starts_with("xsr.")) &&
-      (Name.size() > 4)) {
-    // Parse case when instruction name is concatenated with SR register
-    // name, like "wsr.sar a1"
+  if ((Name.size() > 4) && Name[3] == '.') {
+    // Parse case when instruction name is concatenated with SR/UR register
+    // name, like "wsr.sar a1" or "wur.fcr a1"
 
     // First operand is token for instruction
     Operands.push_back(XtensaOperand::createToken(Name.take_front(3), NameLoc));
@@ -762,7 +773,8 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info,
     }
 
     // Parse second operand
-    if (parseOperand(Operands, Name, true, RAType))
+    if (parseOperand(Operands, Name, Name[1] == 's' ? Xtensa_SR : Xtensa_UR,
+                     RAType))
       return true;
   }
 
@@ -780,7 +792,8 @@ bool XtensaAsmParser::parseInstruction(ParseInstructionInfo &Info,
                                        StringRef Name, SMLoc NameLoc,
                                        OperandVector &Operands) {
   if (Name.starts_with("wsr") || Name.starts_with("rsr") ||
-      Name.starts_with("xsr")) {
+      Name.starts_with("xsr") || Name.starts_with("rur") ||
+      Name.starts_with("wur")) {
     return ParseInstructionWithSR(Info, Name, NameLoc, Operands);
   }
 
diff --git a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
index 3b37ac88b9b1..2f92f8606fb4 100644
--- a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
+++ b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
@@ -73,42 +73,68 @@ static DecodeStatus DecodeARRegisterClass(MCInst &Inst, uint64_t RegNo,
   return MCDisassembler::Success;
 }
 
-static const MCPhysReg MRDecoderTable[] = {Xtensa::M0, Xtensa::M1, Xtensa::M2,
-                                           Xtensa::M3};
-
 static DecodeStatus DecodeMRRegisterClass(MCInst &Inst, uint64_t RegNo,
                                           uint64_t Address,
                                           const void *Decoder) {
-  if (RegNo >= std::size(MRDecoderTable))
+  if (RegNo > 3)
     return MCDisassembler::Fail;
 
-  MCPhysReg Reg = MRDecoderTable[RegNo];
+  MCPhysReg Reg = Xtensa::M0 + RegNo;
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }
 
-static const MCPhysReg MR01DecoderTable[] = {Xtensa::M0, Xtensa::M1};
-
 static DecodeStatus DecodeMR01RegisterClass(MCInst &Inst, uint64_t RegNo,
                                             uint64_t Address,
                                             const void *Decoder) {
-  if (RegNo > 2)
+  if (RegNo > 1)
     return MCDisassembler::Fail;
 
-  MCPhysReg Reg = MR01DecoderTable[RegNo];
+  MCPhysReg Reg = Xtensa::M0 + RegNo;
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }
 
-static const MCPhysReg MR23DecoderTable[] = {Xtensa::M2, Xtensa::M3};
-
 static DecodeStatus DecodeMR23RegisterClass(MCInst &Inst, uint64_t RegNo,
                                             uint64_t Address,
                                             const void *Decoder) {
-  if (RegNo != 0 && RegNo != 1)
+  if (RegNo > 1)
+    return MCDisassembler::Fail;
+
+  MCPhysReg Reg = Xtensa::M2 + RegNo;
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFPRRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  MCPhysReg Reg = Xtensa::F0 + RegNo;
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeURRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                          uint64_t Address,
+                                          const MCDisassembler *Decoder) {
+  if (RegNo > 255)
+    return MCDisassembler::Fail;
+
+  Xtensa::RegisterAccessType RAType = Inst.getOpcode() == Xtensa::WUR
+                                          ? Xtensa::REGISTER_WRITE
+                                          : Xtensa::REGISTER_READ;
+
+  const XtensaDisassembler *Dis =
+      static_cast<const XtensaDisassembler *>(Decoder);
+  const MCRegisterInfo *MRI = Dis->getContext().getRegisterInfo();
+  MCPhysReg Reg = Xtensa::getUserRegister(RegNo, *MRI);
+  if (!Xtensa::checkRegister(Reg, Decoder->getSubtargetInfo().getFeatureBits(),
+                             RAType))
     return MCDisassembler::Fail;
 
-  MCPhysReg Reg = MR23DecoderTable[RegNo];
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }
@@ -187,18 +213,13 @@ static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo,
   return MCDisassembler::Fail;
 }
 
-const MCPhysReg BRDecoderTable[] = {
-    Xtensa::B0,  Xtensa::B1,  Xtensa::B2,  Xtensa::B3, Xtensa::B4,  Xtensa::B5,
-    Xtensa::B6,  Xtensa::B7,  Xtensa::B8,  Xtensa::B9, Xtensa::B10, Xtensa::B11,
-    Xtensa::B12, Xtensa::B13, Xtensa::B14, Xtensa::B15};
-
 static DecodeStatus DecodeBRRegisterClass(MCInst &Inst, uint64_t RegNo,
                                           uint64_t Address,
                                           const void *Decoder) {
-  if (RegNo >= std::size(BRDecoderTable))
+  if (RegNo > 15)
     return MCDisassembler::Fail;
 
-  MCPhysReg Reg = BRDecoderTable[RegNo];
+  MCPhysReg Reg = Xtensa::B0 + RegNo;
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
index 8231a8a9a44d..03b3ed0c121b 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
@@ -307,6 +307,11 @@ XtensaMCCodeEmitter::getMemRegEncoding(const MCInst &MI, unsigned OpNo,
   case Xtensa::L32I:
   case Xtensa::S32I_N:
   case Xtensa::L32I_N:
+  case Xtensa::SSI:
+  case Xtensa::SSIP:
+  case Xtensa::LSI:
+  case Xtensa::LSIP:
+
     if (Res & 0x3) {
       report_fatal_error("Unexpected operand value!");
     }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index f48c6225827b..9a55635674d7 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -188,6 +188,9 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
     return RAType == Xtensa::REGISTER_READ && FeatureBits[Xtensa::FeaturePRID];
   case Xtensa::VECBASE:
     return FeatureBits[Xtensa::FeatureRelocatableVector];
+  case Xtensa::FCR:
+  case Xtensa::FSR:
+    return FeatureBits[FeatureSingleFloat];
   case Xtensa::WINDOWBASE:
   case Xtensa::WINDOWSTART:
     return FeatureBits[Xtensa::FeatureWindowed];
@@ -198,6 +201,16 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
   return true;
 }
 
+// Get Xtensa User Register by encoding value.
+MCRegister Xtensa::getUserRegister(unsigned Code, const MCRegisterInfo &MRI) {
+  if (MRI.getEncodingValue(Xtensa::FCR) == Code) {
+    return Xtensa::FCR;
+  } else if (MRI.getEncodingValue(Xtensa::FSR) == Code) {
+    return Xtensa::FSR;
+  }
+  return Xtensa::NoRegister;
+}
+
 static MCAsmInfo *createXtensaMCAsmInfo(const MCRegisterInfo &MRI,
                                         const Triple &TT,
                                         const MCTargetOptions &Options) {
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h
index ec91f656bdcb..963f86679fcf 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h
@@ -64,6 +64,9 @@ enum RegisterAccessType {
 // Verify if it's correct to use a special register.
 bool checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
                    RegisterAccessType RA);
+
+// Get Xtensa User Register by register encoding value.
+MCRegister getUserRegister(unsigned Code, const MCRegisterInfo &MRI);
 } // namespace Xtensa
 } // end namespace llvm
 
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
index 4e3ed4b9e8ee..d31a256dd00f 100644
--- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
@@ -62,22 +62,25 @@ void XtensaAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
 void XtensaAsmPrinter::emitMachineConstantPoolValue(
     MachineConstantPoolValue *MCPV) {
-  XtensaConstantPoolValue *ACPV = static_cast<XtensaConstantPoolValue *>(MCPV);
+  XtensaConstantPoolValue *XtensaCPV =
+      static_cast<XtensaConstantPoolValue *>(MCPV);
   MCSymbol *MCSym;
 
-  if (ACPV->isBlockAddress()) {
+  if (XtensaCPV->isBlockAddress()) {
     const BlockAddress *BA =
-        cast<XtensaConstantPoolConstant>(ACPV)->getBlockAddress();
+        cast<XtensaConstantPoolConstant>(XtensaCPV)->getBlockAddress();
     MCSym = GetBlockAddressSymbol(BA);
-  } else if (ACPV->isMachineBasicBlock()) {
-    const MachineBasicBlock *MBB = cast<XtensaConstantPoolMBB>(ACPV)->getMBB();
+  } else if (XtensaCPV->isMachineBasicBlock()) {
+    const MachineBasicBlock *MBB =
+        cast<XtensaConstantPoolMBB>(XtensaCPV)->getMBB();
     MCSym = MBB->getSymbol();
-  } else if (ACPV->isJumpTable()) {
-    unsigned Idx = cast<XtensaConstantPoolJumpTable>(ACPV)->getIndex();
+  } else if (XtensaCPV->isJumpTable()) {
+    unsigned Idx = cast<XtensaConstantPoolJumpTable>(XtensaCPV)->getIndex();
     MCSym = this->GetJTISymbol(Idx, false);
   } else {
-    assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
-    XtensaConstantPoolSymbol *XtensaSym = cast<XtensaConstantPoolSymbol>(ACPV);
+    assert(XtensaCPV->isExtSymbol() && "unrecognized constant pool value");
+    XtensaConstantPoolSymbol *XtensaSym =
+        cast<XtensaConstantPoolSymbol>(XtensaCPV);
     const char *SymName = XtensaSym->getSymbol();
 
     if (XtensaSym->isPrivateLinkage()) {
@@ -89,14 +92,14 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue(
     }
   }
 
-  MCSymbol *LblSym = GetCPISymbol(ACPV->getLabelId());
+  MCSymbol *LblSym = GetCPISymbol(XtensaCPV->getLabelId());
   auto *TS =
       static_cast<XtensaTargetStreamer *>(OutStreamer->getTargetStreamer());
-  auto Spec = getModifierSpecifier(ACPV->getModifier());
+  auto Spec = getModifierSpecifier(XtensaCPV->getModifier());
 
-  if (ACPV->getModifier() != XtensaCP::no_modifier) {
+  if (XtensaCPV->getModifier() != XtensaCP::no_modifier) {
     std::string SymName(MCSym->getName());
-    StringRef Modifier = ACPV->getModifierText();
+    StringRef Modifier = XtensaCPV->getModifierText();
     SymName += Modifier;
     MCSym = OutContext.getOrCreateSymbol(SymName);
   }
@@ -108,9 +111,9 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue(
 void XtensaAsmPrinter::emitMachineConstantPoolEntry(
     const MachineConstantPoolEntry &CPE, int i) {
   if (CPE.isMachineConstantPoolEntry()) {
-    XtensaConstantPoolValue *ACPV =
+    XtensaConstantPoolValue *XtensaCPV =
         static_cast<XtensaConstantPoolValue *>(CPE.Val.MachineCPVal);
-    ACPV->setLabelId(i);
+    XtensaCPV->setLabelId(i);
     emitMachineConstantPoolValue(CPE.Val.MachineCPVal);
   } else {
     MCSymbol *LblSym = GetCPISymbol(i);
diff --git a/llvm/lib/Target/Xtensa/XtensaCallingConv.td b/llvm/lib/Target/Xtensa/XtensaCallingConv.td
index 2c48f8f86caf..96528ed283a3 100644
--- a/llvm/lib/Target/Xtensa/XtensaCallingConv.td
+++ b/llvm/lib/Target/Xtensa/XtensaCallingConv.td
@@ -15,6 +15,7 @@
 def RetCC_Xtensa : CallingConv<[
   // First two return values go in a2, a3, a4, a5
   CCIfType<[i32], CCAssignToReg<[A2, A3, A4, A5]>>,
+  CCIfType<[f32], CCAssignToReg<[A2, A3, A4, A5]>>,
   CCIfType<[i64], CCAssignToRegWithShadow<[A2, A4], [A3, A5]>>
 ]>;
 
diff --git a/llvm/lib/Target/Xtensa/XtensaFeatures.td b/llvm/lib/Target/Xtensa/XtensaFeatures.td
index 1dd03283e931..d48f494388a2 100644
--- a/llvm/lib/Target/Xtensa/XtensaFeatures.td
+++ b/llvm/lib/Target/Xtensa/XtensaFeatures.td
@@ -8,6 +8,11 @@ def FeatureDensity : SubtargetFeature<"density", "HasDensity", "true",
 def HasDensity : Predicate<"Subtarget->hasDensity()">,
                  AssemblerPredicate<(all_of FeatureDensity)>;
 
+def FeatureSingleFloat : SubtargetFeature<"fp", "HasSingleFloat", "true",
+                                          "Enable Xtensa Single FP instructions">;
+def HasSingleFloat : Predicate<"Subtarget->hasSingleFloat()">,
+                     AssemblerPredicate<(all_of FeatureSingleFloat)>;
+
 def FeatureWindowed : SubtargetFeature<"windowed", "HasWindowed", "true",
                                        "Enable Xtensa Windowed Register option">;
 def HasWindowed : Predicate<"Subtarget->hasWindowed()">,
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
index b17840aad9b4..d51c573282da 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
@@ -58,6 +58,10 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
   // Set up the register classes.
   addRegisterClass(MVT::i32, &Xtensa::ARRegClass);
 
+  if (Subtarget.hasSingleFloat()) {
+    addRegisterClass(MVT::f32, &Xtensa::FPRRegClass);
+  }
+
   if (Subtarget.hasBoolean()) {
     addRegisterClass(MVT::v1i1, &Xtensa::BRRegClass);
   }
@@ -71,6 +75,8 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::Constant, MVT::i32, Custom);
   setOperationAction(ISD::Constant, MVT::i64, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
 
   setBooleanContents(ZeroOrOneBooleanContent);
 
@@ -107,8 +113,12 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
 
   setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+
   setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
 
   setCondCodeAction(ISD::SETGT, MVT::i32, Expand);
   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
@@ -175,6 +185,64 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
   setOperationAction(ISD::VAEND, MVT::Other, Expand);
 
+  // Handle floating-point types.
+  for (unsigned I = MVT::FIRST_FP_VALUETYPE; I <= MVT::LAST_FP_VALUETYPE; ++I) {
+    MVT VT = MVT::SimpleValueType(I);
+    if (isTypeLegal(VT)) {
+      if (VT.getSizeInBits() == 32 && Subtarget.hasSingleFloat()) {
+        setOperationAction(ISD::FABS, VT, Legal);
+        setOperationAction(ISD::FADD, VT, Legal);
+        setOperationAction(ISD::FSUB, VT, Legal);
+        setOperationAction(ISD::FMA, VT, Legal);
+        setOperationAction(ISD::FMUL, VT, Legal);
+        setOperationAction(ISD::FNEG, VT, Legal);
+      } else {
+        setOperationAction(ISD::FABS, VT, Expand);
+        setOperationAction(ISD::FADD, VT, Expand);
+        setOperationAction(ISD::FSUB, VT, Expand);
+        setOperationAction(ISD::FMA, VT, Expand);
+        setOperationAction(ISD::FMUL, VT, Expand);
+        setOperationAction(ISD::FNEG, VT, Expand);
+      }
+
+      // TODO: once implemented in InstrInfo uncomment
+      setOperationAction(ISD::FSQRT, VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FREM, VT, Expand);
+      setOperationAction(ISD::FDIV, VT, Expand);
+      setOperationAction(ISD::FPOW, VT, Expand);
+      setOperationAction(ISD::FSQRT, VT, Expand);
+      setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+    }
+  }
+
+  // Handle floating-point types.
+  if (Subtarget.hasSingleFloat()) {
+    setOperationAction(ISD::BITCAST, MVT::i32, Legal);
+    setOperationAction(ISD::BITCAST, MVT::f32, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+
+    setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  } else {
+    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
+  }
+
+  // Floating-point truncation and stores need to be done separately.
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
   // Compute derived properties from the register classes
   computeRegisterProperties(STI.getRegisterInfo());
 }
@@ -185,6 +253,11 @@ bool XtensaTargetLowering::isOffsetFoldingLegal(
   return false;
 }
 
+bool XtensaTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+                                        bool ForCodeSize) const {
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Inline asm support
 //===----------------------------------------------------------------------===//
@@ -335,6 +408,16 @@ static bool CC_Xtensa_Custom(unsigned ValNo, MVT ValVT, MVT LocVT,
   return false;
 }
 
+/// Return the register type for a given MVT
+MVT XtensaTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                        CallingConv::ID CC,
+                                                        EVT VT) const {
+  if (VT.isFloatingPoint())
+    return MVT::i32;
+
+  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
 CCAssignFn *XtensaTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                     bool IsVarArg) const {
   return CC_Xtensa_Custom;
@@ -604,15 +687,19 @@ XtensaTargetLowering::LowerCall(CallLoweringInfo &CLI,
   if ((!name.empty()) && isLongCall(name.c_str())) {
     // Create a constant pool entry for the callee address
     XtensaCP::XtensaCPModifier Modifier = XtensaCP::no_modifier;
+    XtensaMachineFunctionInfo *XtensaFI =
+        MF.getInfo<XtensaMachineFunctionInfo>();
+    unsigned LabelId = XtensaFI->createCPLabelId();
 
     XtensaConstantPoolValue *CPV = XtensaConstantPoolSymbol::Create(
-        *DAG.getContext(), name.c_str(), 0 /* XtensaCLabelIndex */, false,
-        Modifier);
+        *DAG.getContext(), name.c_str(), LabelId, false, Modifier);
 
     // Get the address of the callee into a register
     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4), 0, TF);
     SDValue CPWrap = getAddrPCRel(CPAddr, DAG);
-    Callee = CPWrap;
+    Callee = DAG.getLoad(
+        PtrVT, DL, DAG.getEntryNode(), CPWrap,
+        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
   }
 
   // The first call operand is the chain and the second is the target address.
@@ -805,12 +892,14 @@ SDValue XtensaTargetLowering::LowerImmediate(SDValue Op,
     // Check if use node maybe lowered to the ADDMI instruction
     SDNode &OpNode = *Op.getNode();
     if ((OpNode.hasOneUse() && OpNode.user_begin()->getOpcode() == ISD::ADD) &&
-        isShiftedInt<16, 8>(Value))
+        isShiftedInt<8, 8>(Value))
       return Op;
     Type *Ty = Type::getInt32Ty(*DAG.getContext());
     Constant *CV = ConstantInt::get(Ty, Value);
     SDValue CP = DAG.getConstantPool(CV, MVT::i32);
-    return CP;
+    SDValue Res =
+        DAG.getLoad(MVT::i32, DL, DAG.getEntryNode(), CP, MachinePointerInfo());
+    return Res;
   }
   return Op;
 }
@@ -824,22 +913,30 @@ SDValue XtensaTargetLowering::LowerGlobalAddress(SDValue Op,
 
   SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
   SDValue CPWrap = getAddrPCRel(CPAddr, DAG);
-
-  return CPWrap;
+  SDValue Res = DAG.getLoad(
+      PtrVT, DL, DAG.getEntryNode(), CPWrap,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+  return Res;
 }
 
 SDValue XtensaTargetLowering::LowerBlockAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
   BlockAddressSDNode *Node = cast<BlockAddressSDNode>(Op);
+  SDLoc DL(Op);
   const BlockAddress *BA = Node->getBlockAddress();
   EVT PtrVT = Op.getValueType();
+  MachineFunction &MF = DAG.getMachineFunction();
+  XtensaMachineFunctionInfo *XtensaFI = MF.getInfo<XtensaMachineFunctionInfo>();
+  unsigned LabelId = XtensaFI->createCPLabelId();
 
   XtensaConstantPoolValue *CPV =
-      XtensaConstantPoolConstant::Create(BA, 0, XtensaCP::CPBlockAddress);
+      XtensaConstantPoolConstant::Create(BA, LabelId, XtensaCP::CPBlockAddress);
   SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
   SDValue CPWrap = getAddrPCRel(CPAddr, DAG);
-
-  return CPWrap;
+  SDValue Res = DAG.getLoad(
+      PtrVT, DL, DAG.getEntryNode(), CPWrap,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+  return Res;
 }
 
 SDValue XtensaTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
@@ -874,15 +971,19 @@ SDValue XtensaTargetLowering::LowerJumpTable(SDValue Op,
                                              SelectionDAG &DAG) const {
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
   EVT PtrVT = Op.getValueType();
+  SDLoc DL(Op);
 
-  // Create a constant pool entry for the callee address
+  // Create a constant pool entry for the jumptable address
   XtensaConstantPoolValue *CPV =
       XtensaConstantPoolJumpTable::Create(*DAG.getContext(), JT->getIndex());
 
-  // Get the address of the callee into a register
+  // Get the address of the jumptable into a register
   SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
 
-  return getAddrPCRel(CPAddr, DAG);
+  SDValue Res = DAG.getLoad(
+      PtrVT, DL, DAG.getEntryNode(), getAddrPCRel(CPAddr, DAG),
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+  return Res;
 }
 
 SDValue XtensaTargetLowering::getAddrPCRel(SDValue Op,
@@ -1311,6 +1412,26 @@ const char *XtensaTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "XtensaISD::SRCL";
   case XtensaISD::SRCR:
     return "XtensaISD::SRCR";
+  case XtensaISD::CMPUO:
+    return "XtensaISD::CMPUO";
+  case XtensaISD::CMPUEQ:
+    return "XtensaISD::CMPUEQ";
+  case XtensaISD::CMPULE:
+    return "XtensaISD::CMPULE";
+  case XtensaISD::CMPULT:
+    return "XtensaISD::CMPULT";
+  case XtensaISD::CMPOEQ:
+    return "XtensaISD::CMPOEQ";
+  case XtensaISD::CMPOLE:
+    return "XtensaISD::CMPOLE";
+  case XtensaISD::CMPOLT:
+    return "XtensaISD::CMPOLT";
+  case XtensaISD::MADD:
+    return "XtensaISD::MADD";
+  case XtensaISD::MSUB:
+    return "XtensaISD::MSUB";
+  case XtensaISD::MOVS:
+    return "XtensaISD::MOVS";
   }
   return nullptr;
 }
@@ -1395,11 +1516,19 @@ MachineBasicBlock *XtensaTargetLowering::EmitInstrWithCustomInserter(
   case Xtensa::S16I:
   case Xtensa::S32I:
   case Xtensa::S32I_N:
+  case Xtensa::SSI:
+  case Xtensa::SSIP:
+  case Xtensa::SSX:
+  case Xtensa::SSXP:
   case Xtensa::L8UI:
   case Xtensa::L16SI:
   case Xtensa::L16UI:
   case Xtensa::L32I:
-  case Xtensa::L32I_N: {
+  case Xtensa::L32I_N:
+  case Xtensa::LSI:
+  case Xtensa::LSIP:
+  case Xtensa::LSX:
+  case Xtensa::LSXP: {
     // Insert memory wait instruction "memw" before volatile load/store as it is
     // implemented in gcc. If memoperands is empty then assume that it aslo
     // maybe volatile load/store and insert "memw".
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.h b/llvm/lib/Target/Xtensa/XtensaISelLowering.h
index c7d4f41b1f08..b6f2ebe21c94 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.h
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.h
@@ -56,6 +56,21 @@ enum {
   SRCL,
   // Shift Right Combined
   SRCR,
+
+  // Floating point unordered compare conditions
+  CMPUEQ,
+  CMPULE,
+  CMPULT,
+  CMPUO,
+  // Floating point compare conditions
+  CMPOEQ,
+  CMPOLE,
+  CMPOLT,
+  // FP multipy-add/sub
+  MADD,
+  MSUB,
+  // FP move
+  MOVS,
 };
 }
 
@@ -70,6 +85,9 @@ public:
     return LHSTy.getSizeInBits() <= 32 ? MVT::i32 : MVT::i64;
   }
 
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+                                    EVT VT) const override;
+
   EVT getSetCCResultType(const DataLayout &, LLVMContext &,
                          EVT VT) const override {
     if (!VT.isVector())
@@ -81,6 +99,9 @@ public:
 
   const char *getTargetNodeName(unsigned Opcode) const override;
 
+  bool isFPImmLegal(const APFloat &Imm, EVT VT,
+                    bool ForCodeSize) const override;
+
   std::pair<unsigned, const TargetRegisterClass *>
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index 005532b864c4..896e2f8f1c01 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -151,11 +151,15 @@ void XtensaInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
                                           unsigned &LoadOpcode,
                                           unsigned &StoreOpcode,
                                           int64_t offset) const {
-  assert((RC == &Xtensa::ARRegClass) &&
-         "Unsupported regclass to load or store");
-
-  LoadOpcode = Xtensa::L32I;
-  StoreOpcode = Xtensa::S32I;
+  if (RC == &Xtensa::ARRegClass) {
+    LoadOpcode = Xtensa::L32I;
+    StoreOpcode = Xtensa::S32I;
+  } else if (RC == &Xtensa::FPRRegClass) {
+    LoadOpcode = Xtensa::LSI;
+    StoreOpcode = Xtensa::SSI;
+  } else {
+    llvm_unreachable("Unsupported regclass to load or store");
+  }
 }
 
 void XtensaInstrInfo::loadImmediate(MachineBasicBlock &MBB,
@@ -520,8 +524,10 @@ void XtensaInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
     JumpToMBB = &RestoreBB;
   }
 
+  unsigned LabelId = XtensaFI->createCPLabelId();
+
   XtensaConstantPoolValue *C = XtensaConstantPoolMBB::Create(
-      MF->getFunction().getContext(), JumpToMBB, 0);
+      MF->getFunction().getContext(), JumpToMBB, LabelId);
   unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
   L32R.addOperand(MachineOperand::CreateCPI(Idx, 0));
 
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
index 7e9fcd7058c2..1335c6faff6b 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
@@ -234,15 +234,13 @@ def S8I  : Store_II8<0x04, "s8i", truncstorei8, addr_ish1, mem8>;
 def S16I : Store_II8<0x05, "s16i", truncstorei16, addr_ish2, mem16>;
 def S32I : Store_II8<0x06, "s32i", store, addr_ish4, mem32>;
 
+let AddedComplexity = 10 in
 def L32R : RI16_Inst<0x01, (outs AR:$t), (ins L32Rtarget:$label),
-                    "l32r\t$t, $label", []> {
+                    "l32r\t$t, $label", [(set AR:$t, (load (Xtensa_pcrel_wrapper tconstpool:$label)))]> {
   bits<16> label;
   let imm16 = label;
 }
 
-// pcrel addr loading using L32R
-def : Pat<(Xtensa_pcrel_wrapper tconstpool : $in), (L32R tconstpool : $in)>;
-
 // FrameIndexes are legalized when they are operands from load/store
 // instructions. The same not happens for stack address copies, so an
 // add op with mem ComplexPattern is used and the stack address copy
@@ -565,6 +563,26 @@ def XSR : RSR_Inst<0x00, 0x01, 0x06, (outs AR:$ard, SR:$srd), (ins AR:$t, SR:$sr
   let Constraints = "$ard = $t, $srd = $sr";
 }
 
+//===----------------------------------------------------------------------===//
+// User Registers read/write instructions
+//===----------------------------------------------------------------------===//
+
+def WUR : RRR_Inst<0x00, 0x03, 0x0F, (outs UR:$ur), (ins AR:$t),
+                  "wur\t$t, $ur", []> {
+  bits<8> ur;
+
+  let r = ur{7-4};
+  let s = ur{3-0};
+}
+
+def RUR : RRR_Inst<0x00, 0x03, 0x0E, (outs AR:$r), (ins UR:$ur),
+                  "rur\t$r, $ur", []> {
+  bits<8> ur;
+
+  let s = ur{7-4};
+  let t = ur{3-0};
+}
+
 //===----------------------------------------------------------------------===//
 // Stack allocation
 //===----------------------------------------------------------------------===//
@@ -998,6 +1016,300 @@ let Predicates = [HasDiv32] in {
   def REMU : ArithLogic_RRR<0x0E, 0x02, "remu", urem>;
 }
 
+//===----------------------------------------------------------------------===//
+// Floating-Point Instructions
+//===----------------------------------------------------------------------===//
+
+class FPArith_RRR<bits<4> oper2, bits<4> oper1, string instrAsm,
+                 SDPatternOperator opNode, bit isComm = 0>
+  : RRR_Inst<0x00, oper1, oper2, (outs FPR:$r), (ins FPR:$s, FPR:$t),
+             instrAsm#"\t$r, $s, $t",
+            [(set FPR:$r, (opNode FPR:$s, FPR:$t))]> {
+  let isCommutable = isComm;
+  let isReMaterializable = 0;
+  let Predicates = [HasSingleFloat];
+}
+
+def ADD_S : FPArith_RRR<0x00, 0x0A, "add.s", fadd, 1>;
+def SUB_S : FPArith_RRR<0x01, 0x0A, "sub.s", fsub>;
+def MUL_S : FPArith_RRR<0x02, 0x0A, "mul.s", fmul, 1>;
+
+// FP load instructions
+let mayLoad = 1, usesCustomInserter = 1, Predicates = [HasSingleFloat] in {
+  def LSI : RRI8_Inst<0x03, (outs FPR:$t), (ins mem32:$addr),
+                     "lsi\t$t, $addr", []> {
+    bits<12> addr;
+
+    let r = 0x00;
+    let imm8{7-0} = addr{11-4};
+    let s{3-0} = addr{3-0};
+  }
+
+  def LSIP : RRI8_Inst<0x03, (outs FPR:$t), (ins mem32:$addr),
+                      "lsip\t$t, $addr", []> {
+    bits<12> addr;
+
+    let r = 0x08;
+    let imm8{7-0} = addr{11-4};
+    let s{3-0} = addr{3-0};
+  }
+
+  def LSX : RRR_Inst<0x00, 0x08, 0x00, (outs), (ins FPR:$r, AR:$s, AR:$t),
+                    "lsx\t$r, $s, $t", []>;
+
+  def LSXP : RRR_Inst<0x00, 0x08, 0x01, (outs), (ins FPR:$r, AR:$s, AR:$t),
+                     "lsxp\t$r, $s, $t", []>;
+}
+
+def : Pat<(f32 (load addr_ish4:$addr)), (f32 (LSI mem32:$addr))>;
+
+// FP store instructions
+let mayStore = 1, usesCustomInserter = 1, Predicates = [HasSingleFloat] in {
+  def SSI : RRI8_Inst<0x03, (outs), (ins FPR:$t, mem32:$addr),
+                     "ssi\t$t, $addr", []> {
+    bits<12> addr;
+
+    let r = 0x04;
+    let imm8{7-0} = addr{11-4};
+    let s{3-0} = addr{3-0};
+  }
+
+  def SSIP : RRI8_Inst<0x03, (outs), (ins FPR:$t, mem32:$addr),
+                      "ssip\t$t, $addr", []> {
+    bits<12> addr;
+
+    let r = 0x0C;
+    let imm8{7-0} = addr{11-4};
+    let s{3-0} = addr{3-0};
+  }
+
+  def SSX: RRR_Inst<0x00, 0x08, 0x04, (outs), (ins FPR:$r, AR:$s, AR:$t),
+                   "ssx\t$r, $s, $t", []>;
+
+  def SSXP: RRR_Inst<0x00, 0x08, 0x05, (outs), (ins FPR:$r, AR:$s, AR:$t),
+                    "ssxp\t$r, $s, $t", []>;
+}
+
+def : Pat<(store FPR:$t, addr_ish4:$addr), (SSI FPR:$t, mem32:$addr)>;
+
+// FP compare instructions
+let isCompare = 1, Predicates = [HasSingleFloat] in {
+  class FCompare <bits<4> oper2, bits<4> oper1, string instrAsm,
+                 SDPatternOperator opNode, bit isComm = 0>
+    : RRR_Inst<0x00, oper1, oper2, (outs BR:$r), (ins FPR:$s, FPR:$t),
+               instrAsm#"\t$r, $s, $t",
+              [(set BR:$r, (opNode FPR:$s, FPR:$t))]> {
+    let isCommutable = isComm;
+    let isReMaterializable = 0;
+    let Predicates = [HasSingleFloat];
+  }
+}
+
+def OEQ_S :  FCompare<0x02, 0x0b, "oeq.s", Xtensa_cmpoeq, 1>;
+def OLT_S :  FCompare<0x04, 0x0b, "olt.s", Xtensa_cmpolt, 0>;
+def OLE_S :  FCompare<0x06, 0x0b, "ole.s", Xtensa_cmpole, 0>;
+
+def UEQ_S :  FCompare<0x03, 0x0b, "ueq.s", Xtensa_cmpueq, 1>;
+def ULT_S :  FCompare<0x05, 0x0b, "ult.s", Xtensa_cmpult, 0>;
+def ULE_S :  FCompare<0x07, 0x0b, "ule.s", Xtensa_cmpule, 0>;
+def UN_S  :  FCompare<0x01, 0x0b, "un.s",  Xtensa_cmpuo, 1>;
+
+def ABS_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                    "abs.s\t$r, $s",
+                    [(set FPR:$r, (fabs FPR:$s))]>, Requires<[HasSingleFloat]> {
+  let t = 0x01;
+}
+
+def : Pat<(fabs FPR:$s), (ABS_S $s)>;
+
+def ADDEXP_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                       "addexp.s\t$r, $s", []>, Requires<[HasSingleFloat]> {
+  let t = 0x0E;
+}
+
+def ADDEXPM_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                        "addexpm.s\t$r, $s", []>, Requires<[HasSingleFloat]> {
+  let t = 0x0F;
+}
+
+def CEIL_S : RRR_Inst<0x00, 0x0A, 0x0B, (outs AR:$r), (ins FPR:$s, uimm4:$imm),
+                     "ceil.s\t$r, $s, $imm", []>, Requires<[HasSingleFloat]> {
+  bits<4> imm;
+
+  let t = imm;
+}
+
+def CONST_S : RRR_Inst<0x00, 0x0a, 0x0f, (outs FPR:$r), (ins uimm4:$imm),
+                      "const.s\t$r, $imm", []>, Requires<[HasSingleFloat]> {
+  bits<4> imm;
+
+  let t = 0x03;
+  let s = imm{3-0};
+}
+
+def DIV0_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                     "div0.s\t$r, $s", []>, Requires<[HasSingleFloat]> {
+  let t = 0x7;
+}
+
+def DIVN_S : RRR_Inst<0x00, 0x0A, 0x07, (outs FPR:$r), (ins FPR:$s, FPR:$t),
+                     "divn.s\t$r, $s, $t", []>, Requires<[HasSingleFloat]>;
+
+def FLOAT_S : RRR_Inst<0x00, 0x0A, 0x0c, (outs FPR:$r), (ins AR:$s, uimm4:$imm),
+                      "float.s\t$r, $s, $imm", []>, Requires<[HasSingleFloat]> {
+  bits<4> imm;
+
+  let t = imm;
+}
+
+def : Pat<(f32 (sint_to_fp AR:$s)), (FLOAT_S AR:$s, 0)>;
+
+def FLOOR_S : RRR_Inst<0x00, 0x0A, 0x0A, (outs AR:$r), (ins FPR:$s, uimm4:$imm),
+                      "floor.s\t$r, $s, $imm", []>, Requires<[HasSingleFloat]> {
+  bits<4> imm;
+
+  let t = imm;
+}
+
+def MADDN_S : RRR_Inst<0x00, 0x0A, 0x06, (outs FPR:$r), (ins FPR:$s, FPR:$t),
+                      "maddn.s\t$r, $s, $t", []>, Requires<[HasSingleFloat]> {
+  let isCommutable = 0;
+}
+
+// FP multipy-add
+def MADD_S : RRR_Inst<0x00, 0x0A, 0x04, (outs FPR:$r), (ins FPR:$a, FPR:$s, FPR:$t),
+                     "madd.s\t$r, $s, $t",
+                     [(set FPR:$r, (Xtensa_madd FPR:$a, FPR:$s, FPR:$t))]>,
+                     Requires<[HasSingleFloat]> {
+  let isCommutable = 0;
+  let isReMaterializable = 0;
+  let Constraints = "$r = $a";
+}
+
+// fmadd: r1 * r2 + r3
+def : Pat<(fma FPR:$r1, FPR:$r2, FPR:$r3),
+          (MADD_S $r3, $r1, $r2)>;
+
+def MKDADJ_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                       "mkdadj.s\t$r, $s", []>, Requires<[HasSingleFloat]> {
+  let t = 0x0D;
+}
+
+def MKSADJ_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                       "mksadj.s\t$r, $s", []>, Requires<[HasSingleFloat]> {
+  let t = 0x0C;
+}
+
+// FP move instructions
+def MOV_S : RRR_Inst<0x00, 0x0A, 0x0f, (outs FPR:$r), (ins FPR:$s),
+                    "mov.s\t$r, $s",
+                    [(set FPR:$r, (Xtensa_movs FPR:$s))]>, Requires<[HasSingleFloat]> {
+  let t = 0x00;
+}
+
+def MOVEQZ_S : RRR_Inst<0x00, 0x0B, 0x08, (outs FPR:$r), (ins FPR:$s, AR:$t),
+                       "moveqz.s\t$r, $s, $t", []>, Requires<[HasSingleFloat]>;
+
+def MOVF_S   : RRR_Inst<0x00, 0x0B, 0x0C, (outs FPR:$r), (ins FPR:$s, BR:$t),
+                       "movf.s\t$r, $s, $t", []>, Requires<[HasBoolean, HasSingleFloat]>;
+
+def MOVGEZ_S : RRR_Inst<0x00, 0x0B, 0x0B, (outs FPR:$r), (ins FPR:$s, AR:$t),
+                       "movgez.s\t$r, $s, $t", []>, Requires<[HasSingleFloat]>;
+
+def MOVLTZ_S : RRR_Inst<0x00, 0x0B, 0x0A, (outs FPR:$r), (ins FPR:$s, AR:$t),
+                       "movltz.s\t$r, $s, $t", []>, Requires<[HasSingleFloat]>;
+
+def MOVNEZ_S : RRR_Inst<0x00, 0x0B, 0x09, (outs FPR:$r), (ins FPR:$s, AR:$t),
+                       "movnez.s\t$r, $s, $t", []>, Requires<[HasSingleFloat]>;
+
+def MOVT_S   : RRR_Inst<0x00, 0x0B, 0x0D, (outs FPR:$r), (ins FPR:$s, BR:$t),
+                       "movt.s\t$r, $s, $t", []>, Requires<[HasBoolean, HasSingleFloat]>;
+
+// FP multipy-sub
+def MSUB_S : RRR_Inst<0x00, 0x0A, 0x05, (outs FPR:$r), (ins FPR:$a, FPR:$s, FPR:$t),
+                     "msub.s\t$r, $s, $t",
+                     [(set FPR:$r, (Xtensa_msub FPR:$a, FPR:$s, FPR:$t))]>, Requires<[HasSingleFloat]> {
+  let isCommutable = 0;
+  let isReMaterializable = 0;
+  let Constraints = "$r = $a";
+}
+
+def NEXP01_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                       "nexp01.s\t$r, $s", []>, Requires<[HasSingleFloat]> {
+  let t = 0x0B;
+}
+
+def NEG_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                    "neg.s\t$r, $s",
+                    [(set FPR:$r, (fneg FPR:$s))]>, Requires<[HasSingleFloat]> {
+  let t = 0x06;
+}
+
+def RECIP0_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                       "recip0.s\t$r, $s", []>, Requires<[HasSingleFloat]> {
+  let t = 0x08;
+}
+
+def RFR : RRR_Inst<0x00, 0x0A, 0x0f, (outs AR:$r), (ins FPR:$s),
+                  "rfr\t$r, $s",
+                  [(set AR:$r, (bitconvert FPR:$s))]>, Requires<[HasSingleFloat]> {
+  let t = 0x04;
+}
+
+def ROUND_S : RRR_Inst<0x00, 0x0A, 0x08, (outs AR:$r), (ins FPR:$s, uimm4:$imm),
+                      "round.s\t$r, $s, $imm", []>, Requires<[HasSingleFloat]> {
+  bits<4> imm;
+
+  let t = imm;
+}
+
+def RSQRT0_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                       "rsqrt0.s\t$r, $s", []>, Requires<[HasSingleFloat]> {
+  let t = 0x0A;
+}
+
+def SQRT0_S : RRR_Inst<0x00, 0x0A, 0x0F, (outs FPR:$r), (ins FPR:$s),
+                      "sqrt0.s\t$r, $s", []>, Requires<[HasSingleFloat]> {
+  let t = 0x09;
+}
+
+def TRUNC_S : RRR_Inst<0x00, 0x0A, 0x09, (outs AR:$r), (ins FPR:$s, uimm4:$imm),
+                      "trunc.s\t$r, $s, $imm", []>, Requires<[HasSingleFloat]>  {
+  bits<4> imm;
+
+  let t = imm;
+}
+
+def : Pat<(i32 (fp_to_sint FPR:$s)), (TRUNC_S FPR:$s, 0)>;
+
+def UFLOAT_S : RRR_Inst<0x00, 0x0A, 0x0D, (outs FPR:$r), (ins AR:$s, uimm4:$imm),
+                       "ufloat.s\t$r, $s, $imm", []>, Requires<[HasSingleFloat]>  {
+  bits<4> imm;
+
+  let t = imm;
+}
+
+def : Pat<(f32 (uint_to_fp AR:$s)), (UFLOAT_S AR:$s, 0)>;
+
+def UTRUNC_S : RRR_Inst<0x00, 0x0A, 0x0e, (outs AR:$r), (ins FPR:$s, uimm4:$imm),
+                       "utrunc.s\t$r, $s, $imm", []>, Requires<[HasSingleFloat]>  {
+  bits<4> imm;
+
+  let t = imm;
+}
+
+def : Pat<(i32 (fp_to_uint FPR:$s)), (UTRUNC_S FPR:$s, 0)>;
+
+def WFR : RRR_Inst<0x00, 0x0A, 0x0f, (outs FPR:$r), (ins AR:$s),
+                  "wfr\t$r, $s",
+                  [(set FPR:$r, (bitconvert AR:$s))]>, Requires<[HasSingleFloat]>  {
+  let t = 0x05;
+}
+
+let AddedComplexity = 10 in
+def : Pat<(f32 (load (Xtensa_pcrel_wrapper tconstpool:$in))),
+          (WFR (L32R tconstpool:$in))>;
+
 //===----------------------------------------------------------------------===//
 // Region Protection feature instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Xtensa/XtensaMachineFunctionInfo.h b/llvm/lib/Target/Xtensa/XtensaMachineFunctionInfo.h
index bc051d9ca14f..ff3bba0985c2 100644
--- a/llvm/lib/Target/Xtensa/XtensaMachineFunctionInfo.h
+++ b/llvm/lib/Target/Xtensa/XtensaMachineFunctionInfo.h
@@ -28,6 +28,7 @@ class XtensaMachineFunctionInfo : public MachineFunctionInfo {
   int VarArgsOnStackFrameIndex;
   int VarArgsInRegsFrameIndex;
   bool SaveFrameRegister = false;
+  unsigned CPLabelId = 0;
 
 public:
   explicit XtensaMachineFunctionInfo(const Function &F,
@@ -54,6 +55,8 @@ public:
 
   bool isSaveFrameRegister() const { return SaveFrameRegister; }
   void setSaveFrameRegister() { SaveFrameRegister = true; }
+
+  unsigned createCPLabelId() { return CPLabelId++; }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/Xtensa/XtensaOperators.td b/llvm/lib/Target/Xtensa/XtensaOperators.td
index 12b81fccec47..fea13c2298d9 100644
--- a/llvm/lib/Target/Xtensa/XtensaOperators.td
+++ b/llvm/lib/Target/Xtensa/XtensaOperators.td
@@ -25,6 +25,11 @@ def SDT_XtensaSelectCC            : SDTypeProfile<1, 5,
                                                   SDTCisSameAs<2, 3>,
                                                   SDTCisVT<5, i32>]>;
 
+def SDT_XtensaCmp                 : SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>, SDTCisVT<1, f32>, SDTCisVT<2, f32>]>;
+def SDT_XtensaMADD                : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, f32>]>;
+def SDT_XtensaMOVS                : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisVT<0, f32>]>;
+def SDT_XtensaSelectCCFP          : SDTypeProfile<1, 5, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, SDTCisVT<5, i32>]>;
+
 def SDT_XtensaSRC                 : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                                          SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
 
@@ -70,3 +75,15 @@ def Xtensa_extui: SDNode<"XtensaISD::EXTUI", SDT_XtensaEXTUI>;
 
 def Xtensa_movsp: SDNode<"XtensaISD::MOVSP", SDT_XtensaMOVSP,
                         [SDNPHasChain, SDNPSideEffect, SDNPInGlue]>;
+
+def Xtensa_cmpoeq     : SDNode<"XtensaISD::CMPOEQ", SDT_XtensaCmp, [SDNPOutGlue]>;
+def Xtensa_cmpolt     : SDNode<"XtensaISD::CMPOLT", SDT_XtensaCmp, [SDNPOutGlue]>;
+def Xtensa_cmpole     : SDNode<"XtensaISD::CMPOLE", SDT_XtensaCmp, [SDNPOutGlue]>;
+def Xtensa_cmpueq     : SDNode<"XtensaISD::CMPUEQ", SDT_XtensaCmp, [SDNPOutGlue]>;
+def Xtensa_cmpult     : SDNode<"XtensaISD::CMPULT", SDT_XtensaCmp, [SDNPOutGlue]>;
+def Xtensa_cmpule     : SDNode<"XtensaISD::CMPULE", SDT_XtensaCmp, [SDNPOutGlue]>;
+def Xtensa_cmpuo      : SDNode<"XtensaISD::CMPUO", SDT_XtensaCmp, [SDNPOutGlue]>;
+
+def Xtensa_madd: SDNode<"XtensaISD::MADD", SDT_XtensaMADD, [SDNPInGlue]>;
+def Xtensa_msub: SDNode<"XtensaISD::MSUB", SDT_XtensaMADD, [SDNPInGlue]>;
+def Xtensa_movs: SDNode<"XtensaISD::MOVS", SDT_XtensaMOVS, [SDNPInGlue]>;
diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
index 7d4402912434..644faee51f51 100644
--- a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
@@ -226,6 +226,50 @@ def SR :  RegisterClass<"Xtensa", [i32], 32, (add
   PS, VECBASE, EXCCAUSE, DEBUGCAUSE, CCOUNT, PRID, ICOUNT, ICOUNTLEVEL, EXCVADDR, CCOMPARE0,
   CCOMPARE1, CCOMPARE2, MISC0, MISC1, MISC2, MISC3)>;
 
+//===----------------------------------------------------------------------===//
+// USER registers
+//===----------------------------------------------------------------------===//
+class URReg<bits<8> num, string n, list<string> alt = []> : XtensaReg<n> {
+  let HWEncoding{7-0} = num;
+  let AltNames = alt;
+}
+
+def FCR : URReg<232, "fcr", ["FCR"]>;
+def FSR : URReg<233, "fsr", ["FSR"]>;
+
+def UR :  RegisterClass<"Xtensa", [i32], 32, (add FCR, FSR)>;
+
+//===----------------------------------------------------------------------===//
+// Floating-Point registers
+//===----------------------------------------------------------------------===//
+
+// Xtensa Floating-Point regs
+class FPReg<bits<4> num, string n> : XtensaReg<n> {
+  let HWEncoding{3-0} = num;
+}
+
+def F0 : FPReg<0, "f0">, DwarfRegNum<[19]>;
+def F1 : FPReg<1, "f1">, DwarfRegNum<[20]>;
+def F2 : FPReg<2, "f2">, DwarfRegNum<[21]>;
+def F3 : FPReg<3, "f3">, DwarfRegNum<[22]>;
+def F4 : FPReg<4, "f4">, DwarfRegNum<[23]>;
+def F5 : FPReg<5, "f5">, DwarfRegNum<[24]>;
+def F6 : FPReg<6, "f6">, DwarfRegNum<[25]>;
+def F7 : FPReg<7, "f7">, DwarfRegNum<[26]>;
+def F8 : FPReg<8, "f8">, DwarfRegNum<[27]>;
+def F9 : FPReg<9, "f9">, DwarfRegNum<[28]>;
+def F10 : FPReg<10, "f10">, DwarfRegNum<[29]>;
+def F11 : FPReg<11, "f11">, DwarfRegNum<[30]>;
+def F12 : FPReg<12, "f12">, DwarfRegNum<[31]>;
+def F13 : FPReg<13, "f13">, DwarfRegNum<[32]>;
+def F14 : FPReg<14, "f14">, DwarfRegNum<[33]>;
+def F15 : FPReg<15, "f15">, DwarfRegNum<[34]>;
+
+// Floating-Point register class with allocation order
+def FPR : RegisterClass<"Xtensa", [f32], 32, (add
+  F8, F9, F10, F11, F12, F13, F14, F15,
+  F7, F6, F5, F4, F3, F2, F1, F0)>;
+
 //===----------------------------------------------------------------------===//
 // Boolean registers
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Xtensa/XtensaSubtarget.h b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
index da4e14a53eef..bc32541750ec 100644
--- a/llvm/lib/Target/Xtensa/XtensaSubtarget.h
+++ b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
@@ -77,6 +77,7 @@ public:
   bool hasMul32() const { return HasMul32; }
   bool hasMul32High() const { return HasMul32High; }
   bool hasDiv32() const { return HasDiv32; }
+  bool hasSingleFloat() const { return HasSingleFloat; }
   bool hasRegionProtection() const { return HasRegionProtection; }
   bool hasRelocatableVector() const { return HasRelocatableVector; }
   bool hasMiscSR() const { return HasMiscSR; }
diff --git a/llvm/test/CodeGen/Xtensa/add_shifted_imm.ll b/llvm/test/CodeGen/Xtensa/add_shifted_imm.ll
new file mode 100644
index 000000000000..243076f4619a
--- /dev/null
+++ b/llvm/test/CodeGen/Xtensa/add_shifted_imm.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=xtensa -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s
+
+define i64 @test_1(i32 %v) {
+; CHECK-LABEL: test_1:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    l32r a8, .LCPI0_0
+; CHECK-NEXT:    add a2, a2, a8
+; CHECK-NEXT:    movi a3, 0
+; CHECK-NEXT:    ret
+  %addres = add i32 %v, -65536
+  %res = zext i32 %addres to i64
+  ret i64 %res
+}
+
+define i64 @test_2(i32 %v) {
+; CHECK-LABEL: test_2:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    l32r a8, .LCPI1_0
+; CHECK-NEXT:    add a2, a2, a8
+; CHECK-NEXT:    movi a3, 0
+; CHECK-NEXT:    ret
+  %addres = add i32 %v, 65536
+  %res = zext i32 %addres to i64
+  ret i64 %res
+}
+
+define i64 @test_3(i32 %v) {
+; CHECK-LABEL: test_3:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    addmi a2, a2, -32768
+; CHECK-NEXT:    movi a3, 0
+; CHECK-NEXT:    ret
+  %addres = add i32 %v, -32768
+  %res = zext i32 %addres to i64
+  ret i64 %res
+}
+
+define i64 @test_4(i32 %v) {
+; CHECK-LABEL: test_4:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    addmi a2, a2, 32512
+; CHECK-NEXT:    movi a3, 0
+; CHECK-NEXT:    ret
+  %addres = add i32 %v, 32512
+  %res = zext i32 %addres to i64
+  ret i64 %res
+}
diff --git a/llvm/test/CodeGen/Xtensa/aligned_alloc.ll b/llvm/test/CodeGen/Xtensa/aligned_alloc.ll
index ebb24d9272dd..471158fb30b7 100644
--- a/llvm/test/CodeGen/Xtensa/aligned_alloc.ll
+++ b/llvm/test/CodeGen/Xtensa/aligned_alloc.ll
@@ -11,10 +11,10 @@ define i8 @loadi8_128(i8 %a) {
 ; XTENSA-NEXT:    .cfi_def_cfa_offset 128
 ; XTENSA-NEXT:    s32i a0, a1, 124 # 4-byte Folded Spill
 ; XTENSA-NEXT:    .cfi_offset a0, -4
+; XTENSA-NEXT:    l32r a8, .LCPI0_0
 ; XTENSA-NEXT:    addi a2, a1, 0
 ; XTENSA-NEXT:    movi a3, 0
 ; XTENSA-NEXT:    movi a4, 64
-; XTENSA-NEXT:    l32r a8, .LCPI0_0
 ; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    l8ui a2, a1, 0
 ; XTENSA-NEXT:    l32i a0, a1, 124 # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/Xtensa/brcc_fp.ll b/llvm/test/CodeGen/Xtensa/brcc_fp.ll
new file mode 100644
index 000000000000..3dd17b9ccb94
--- /dev/null
+++ b/llvm/test/CodeGen/Xtensa/brcc_fp.ll
@@ -0,0 +1,223 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=xtensa -mattr=+fp -disable-block-placement -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s
+
+define i32 @brcc_sgt(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_sgt:
+; CHECK:         bge a3, a2, .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp sgt i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
+
+define i32 @brcc_ugt(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_ugt:
+; CHECK:         bgeu a3, a2, .LBB1_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp ugt i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
+
+define i32 @brcc_sle(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_sle:
+; CHECK:         blt a3, a2, .LBB2_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp sle i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
+
+define i32 @brcc_ule(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_ule:
+; CHECK:         bltu a3, a2, .LBB3_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB3_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp ule i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
+
+define i32 @brcc_eq(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_eq:
+; CHECK:         bne a2, a3, .LBB4_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB4_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp eq i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
+
+define i32 @brcc_ne(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_ne:
+; CHECK:         beq a2, a3, .LBB5_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB5_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp ne i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
+
+define i32 @brcc_ge(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_ge:
+; CHECK:         blt a2, a3, .LBB6_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB6_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp sge i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
+
+define i32 @brcc_lt(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_lt:
+; CHECK:         bge a2, a3, .LBB7_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB7_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp slt i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
+
+define i32 @brcc_uge(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_uge:
+; CHECK:         bltu a2, a3, .LBB8_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB8_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp uge i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
+
+define i32 @brcc_ult(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: brcc_ult:
+; CHECK:         bgeu a2, a3, .LBB9_2
+; CHECK-NEXT:  # %bb.1: # %t1
+; CHECK-NEXT:    addi a2, a2, 4
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB9_2: # %t2
+; CHECK-NEXT:    addi a2, a3, 8
+; CHECK-NEXT:    ret
+  %wb = icmp ult i32 %a, %b
+  br i1 %wb, label %t1, label %t2
+t1:
+  %t1v = add i32 %a, 4
+  br label %exit
+t2:
+  %t2v = add i32 %b, 8
+  br label %exit
+exit:
+  %v = phi i32 [ %t1v, %t1 ], [ %t2v, %t2 ]
+  ret i32 %v
+}
diff --git a/llvm/test/CodeGen/Xtensa/bswap.ll b/llvm/test/CodeGen/Xtensa/bswap.ll
index 6a87aa84351c..a836f4c30e3e 100644
--- a/llvm/test/CodeGen/Xtensa/bswap.ll
+++ b/llvm/test/CodeGen/Xtensa/bswap.ll
@@ -12,7 +12,8 @@ declare i64 @llvm.bitreverse.i64(i64)
 
 define i16 @test_bswap_i16(i16 %a) nounwind {
 ; XTENSA-LABEL: test_bswap_i16:
-; XTENSA:         l32r a8, .LCPI0_0
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI0_0
 ; XTENSA-NEXT:    and a8, a2, a8
 ; XTENSA-NEXT:    srli a8, a8, 8
 ; XTENSA-NEXT:    slli a9, a2, 8
@@ -24,7 +25,8 @@ define i16 @test_bswap_i16(i16 %a) nounwind {
 
 define i32 @test_bswap_i32(i32 %a) nounwind {
 ; XTENSA-LABEL: test_bswap_i32:
-; XTENSA:         srli a8, a2, 8
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 8
 ; XTENSA-NEXT:    l32r a9, .LCPI1_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    extui a10, a2, 24, 8
@@ -41,7 +43,8 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 
 define i64 @test_bswap_i64(i64 %a) nounwind {
 ; XTENSA-LABEL: test_bswap_i64:
-; XTENSA:         srli a8, a3, 8
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a3, 8
 ; XTENSA-NEXT:    l32r a9, .LCPI2_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    extui a10, a3, 24, 8
@@ -68,7 +71,8 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
 
 define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; XTENSA-LABEL: test_bitreverse_i8:
-; XTENSA:         movi a8, 15
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    movi a8, 15
 ; XTENSA-NEXT:    and a8, a2, a8
 ; XTENSA-NEXT:    slli a8, a8, 4
 ; XTENSA-NEXT:    movi a9, 240
@@ -94,7 +98,8 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 
 define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; XTENSA-LABEL: test_bitreverse_i16:
-; XTENSA:         l32r a8, .LCPI4_0
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI4_0
 ; XTENSA-NEXT:    and a8, a2, a8
 ; XTENSA-NEXT:    srli a8, a8, 8
 ; XTENSA-NEXT:    slli a9, a2, 8
@@ -124,7 +129,8 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 
 define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; XTENSA-LABEL: test_bitreverse_i32:
-; XTENSA:         srli a8, a2, 8
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 8
 ; XTENSA-NEXT:    l32r a9, .LCPI5_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    extui a10, a2, 24, 8
@@ -159,7 +165,8 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 
 define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; XTENSA-LABEL: test_bitreverse_i64:
-; XTENSA:         srli a8, a3, 8
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a3, 8
 ; XTENSA-NEXT:    l32r a9, .LCPI6_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    extui a10, a3, 24, 8
@@ -219,7 +226,8 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 
 define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 ; XTENSA-LABEL: test_bswap_bitreverse_i16:
-; XTENSA:         srli a8, a2, 4
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 4
 ; XTENSA-NEXT:    l32r a9, .LCPI7_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    and a9, a2, a9
@@ -245,7 +253,8 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 
 define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 ; XTENSA-LABEL: test_bswap_bitreverse_i32:
-; XTENSA:         srli a8, a2, 4
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 4
 ; XTENSA-NEXT:    l32r a9, .LCPI8_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    and a9, a2, a9
@@ -271,7 +280,8 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 
 define i64 @test_bswap_bitreverse_i64(i64 %a) nounwind {
 ; XTENSA-LABEL: test_bswap_bitreverse_i64:
-; XTENSA:         srli a8, a2, 4
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 4
 ; XTENSA-NEXT:    l32r a9, .LCPI9_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    and a10, a2, a9
@@ -312,7 +322,8 @@ define i64 @test_bswap_bitreverse_i64(i64 %a) nounwind {
 
 define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 ; XTENSA-LABEL: test_bitreverse_bswap_i16:
-; XTENSA:         srli a8, a2, 4
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 4
 ; XTENSA-NEXT:    l32r a9, .LCPI10_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    and a9, a2, a9
@@ -338,7 +349,8 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 
 define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 ; XTENSA-LABEL: test_bitreverse_bswap_i32:
-; XTENSA:         srli a8, a2, 4
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 4
 ; XTENSA-NEXT:    l32r a9, .LCPI11_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    and a9, a2, a9
@@ -364,7 +376,8 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 
 define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind {
 ; XTENSA-LABEL: test_bitreverse_bswap_i64:
-; XTENSA:         srli a8, a2, 4
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 4
 ; XTENSA-NEXT:    l32r a9, .LCPI12_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    and a10, a2, a9
diff --git a/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll
index 603032353862..2524a333556e 100644
--- a/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll
@@ -8,7 +8,8 @@ declare i32 @llvm.ctpop.i32(i32)
 
 define i32 @test_cttz_i32(i32 %a) nounwind {
 ; XTENSA-LABEL: test_cttz_i32:
-; XTENSA:         beqz a2, .LBB0_2
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    beqz a2, .LBB0_2
 ; XTENSA-NEXT:  # %bb.1: # %cond.false
 ; XTENSA-NEXT:    movi a8, -1
 ; XTENSA-NEXT:    xor a8, a2, a8
@@ -42,7 +43,8 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
 
 define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
 ; XTENSA-LABEL: test_cttz_i32_zero_undef:
-; XTENSA:         movi a8, -1
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    movi a8, -1
 ; XTENSA-NEXT:    xor a8, a2, a8
 ; XTENSA-NEXT:    addi a9, a2, -1
 ; XTENSA-NEXT:    and a8, a8, a9
@@ -71,7 +73,8 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
 
 define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; XTENSA-LABEL: test_ctlz_i32:
-; XTENSA:         beqz a2, .LBB2_2
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    beqz a2, .LBB2_2
 ; XTENSA-NEXT:  # %bb.1: # %cond.false
 ; XTENSA-NEXT:    srli a8, a2, 1
 ; XTENSA-NEXT:    or a8, a2, a8
@@ -113,7 +116,8 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 
 define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; XTENSA-LABEL: test_ctlz_i32_zero_undef:
-; XTENSA:         srli a8, a2, 1
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 1
 ; XTENSA-NEXT:    or a8, a2, a8
 ; XTENSA-NEXT:    srli a9, a8, 2
 ; XTENSA-NEXT:    or a8, a8, a9
@@ -150,7 +154,8 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 
 define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; XTENSA-LABEL: test_ctpop_i32:
-; XTENSA:         srli a8, a2, 1
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    srli a8, a2, 1
 ; XTENSA-NEXT:    l32r a9, .LCPI4_0
 ; XTENSA-NEXT:    and a8, a8, a9
 ; XTENSA-NEXT:    sub a8, a2, a8
diff --git a/llvm/test/CodeGen/Xtensa/float-arith.ll b/llvm/test/CodeGen/Xtensa/float-arith.ll
new file mode 100644
index 000000000000..2b7186c44aeb
--- /dev/null
+++ b/llvm/test/CodeGen/Xtensa/float-arith.ll
@@ -0,0 +1,603 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=xtensa -mattr=+fp -verify-machineinstrs < %s | FileCheck -check-prefix=XTENSA %s
+
+define float @fadd_s(float %a, float %b) nounwind {
+; XTENSA-LABEL: fadd_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    wfr f8, a3
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    add.s f8, f9, f8
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %res = fadd float %a, %b
+  ret float %res
+}
+
+define float @fsub_s(float %a, float %b) nounwind {
+; XTENSA-LABEL: fsub_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    wfr f8, a3
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    sub.s f8, f9, f8
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %res = fsub float %a, %b
+  ret float %res
+}
+
+define float @fmul_s(float %a, float %b) nounwind {
+; XTENSA-LABEL: fmul_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    wfr f8, a3
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    mul.s f8, f9, f8
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %res = fmul float %a, %b
+  ret float %res
+}
+
+define float @fdiv_s(float %a, float %b) nounwind {
+; XTENSA-LABEL: fdiv_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI3_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = fdiv float %a, %b
+  ret float %res
+}
+
+declare float @llvm.sqrt.f32(float)
+
+define float @fsqrt_s(float %a) nounwind {
+; XTENSA-LABEL: fsqrt_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI4_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.sqrt.f32(float %a)
+  ret float %res
+}
+
+declare float @llvm.fabs.f32(float)
+
+define float @fabs_s(float %a, float %b) nounwind {
+; XTENSA-LABEL: fabs_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    wfr f8, a3
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    add.s f8, f9, f8
+; XTENSA-NEXT:    abs.s f9, f8
+; XTENSA-NEXT:    add.s f8, f9, f8
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %fa = fadd float %a, %b
+  %call_res = call float @llvm.fabs.f32(float %fa)
+  %res = fadd float %call_res, %fa
+  ret float %res
+}
+
+declare float @llvm.minnum.f32(float, float)
+
+define float @fmin_s(float %a, float %b) nounwind {
+; XTENSA-LABEL: fmin_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI6_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.minnum.f32(float %a, float %b)
+  ret float %res
+}
+
+declare float @llvm.maxnum.f32(float, float)
+
+define float @fmax_s(float %a, float %b) nounwind {
+; XTENSA-LABEL: fmax_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI7_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.maxnum.f32(float %a, float %b)
+  ret float %res
+}
+
+declare float @llvm.fma.f32(float, float, float)
+
+define float @fmadd_s(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fmadd_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    wfr f8, a3
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    wfr f10, a4
+; XTENSA-NEXT:    madd.s f10, f9, f8
+; XTENSA-NEXT:    rfr a2, f10
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.fma.f32(float %a, float %b, float %c)
+  ret float %res
+}
+
+define float @fmsub_s(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fmsub_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI9_0
+; XTENSA-NEXT:    wfr f8, a8
+; XTENSA-NEXT:    wfr f9, a4
+; XTENSA-NEXT:    add.s f8, f9, f8
+; XTENSA-NEXT:    neg.s f8, f8
+; XTENSA-NEXT:    wfr f9, a3
+; XTENSA-NEXT:    wfr f10, a2
+; XTENSA-NEXT:    madd.s f8, f10, f9
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %c_ = fadd float 0.0, %c ; avoid negation using xor
+  %negc = fsub float -0.0, %c_
+  %res = call float @llvm.fma.f32(float %a, float %b, float %negc)
+  ret float %res
+}
+
+define float @fnmadd_s(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fnmadd_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI10_0
+; XTENSA-NEXT:    wfr f8, a8
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    add.s f9, f9, f8
+; XTENSA-NEXT:    neg.s f9, f9
+; XTENSA-NEXT:    wfr f10, a4
+; XTENSA-NEXT:    add.s f8, f10, f8
+; XTENSA-NEXT:    neg.s f8, f8
+; XTENSA-NEXT:    wfr f10, a3
+; XTENSA-NEXT:    madd.s f8, f9, f10
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %a_ = fadd float 0.0, %a
+  %c_ = fadd float 0.0, %c
+  %nega = fsub float -0.0, %a_
+  %negc = fsub float -0.0, %c_
+  %res = call float @llvm.fma.f32(float %nega, float %b, float %negc)
+  ret float %res
+}
+
+define float @fnmadd_s_2(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fnmadd_s_2:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI11_0
+; XTENSA-NEXT:    wfr f8, a8
+; XTENSA-NEXT:    wfr f9, a3
+; XTENSA-NEXT:    add.s f9, f9, f8
+; XTENSA-NEXT:    neg.s f9, f9
+; XTENSA-NEXT:    wfr f10, a4
+; XTENSA-NEXT:    add.s f8, f10, f8
+; XTENSA-NEXT:    neg.s f8, f8
+; XTENSA-NEXT:    wfr f10, a2
+; XTENSA-NEXT:    madd.s f8, f10, f9
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %b_ = fadd float 0.0, %b
+  %c_ = fadd float 0.0, %c
+  %negb = fsub float -0.0, %b_
+  %negc = fsub float -0.0, %c_
+  %res = call float @llvm.fma.f32(float %a, float %negb, float %negc)
+  ret float %res
+}
+
+define float @fnmadd_s_3(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fnmadd_s_3:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    wfr f8, a3
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    wfr f10, a4
+; XTENSA-NEXT:    madd.s f10, f9, f8
+; XTENSA-NEXT:    rfr a8, f10
+; XTENSA-NEXT:    l32r a9, .LCPI12_0
+; XTENSA-NEXT:    xor a2, a8, a9
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.fma.f32(float %a, float %b, float %c)
+  %neg = fneg float %res
+  ret float %neg
+}
+
+define float @fnmadd_nsz(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fnmadd_nsz:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    wfr f8, a3
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    wfr f10, a4
+; XTENSA-NEXT:    madd.s f10, f9, f8
+; XTENSA-NEXT:    rfr a8, f10
+; XTENSA-NEXT:    l32r a9, .LCPI13_0
+; XTENSA-NEXT:    xor a2, a8, a9
+; XTENSA-NEXT:    ret
+  %res = call nsz float @llvm.fma.f32(float %a, float %b, float %c)
+  %neg = fneg nsz float %res
+  ret float %neg
+}
+
+define float @fnmsub_s(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fnmsub_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI14_0
+; XTENSA-NEXT:    wfr f8, a8
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    add.s f8, f9, f8
+; XTENSA-NEXT:    neg.s f8, f8
+; XTENSA-NEXT:    wfr f9, a3
+; XTENSA-NEXT:    wfr f10, a4
+; XTENSA-NEXT:    madd.s f10, f8, f9
+; XTENSA-NEXT:    rfr a2, f10
+; XTENSA-NEXT:    ret
+  %a_ = fadd float 0.0, %a
+  %nega = fsub float -0.0, %a_
+  %res = call float @llvm.fma.f32(float %nega, float %b, float %c)
+  ret float %res
+}
+
+define float @fnmsub_s_2(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fnmsub_s_2:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI15_0
+; XTENSA-NEXT:    wfr f8, a8
+; XTENSA-NEXT:    wfr f9, a3
+; XTENSA-NEXT:    add.s f8, f9, f8
+; XTENSA-NEXT:    neg.s f8, f8
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    wfr f10, a4
+; XTENSA-NEXT:    madd.s f10, f9, f8
+; XTENSA-NEXT:    rfr a2, f10
+; XTENSA-NEXT:    ret
+  %b_ = fadd float 0.0, %b
+  %negb = fsub float -0.0, %b_
+  %res = call float @llvm.fma.f32(float %a, float %negb, float %c)
+  ret float %res
+}
+
+define float @fmadd_s_contract(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fmadd_s_contract:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    wfr f8, a3
+; XTENSA-NEXT:    wfr f9, a2
+; XTENSA-NEXT:    mul.s f8, f9, f8
+; XTENSA-NEXT:    wfr f9, a4
+; XTENSA-NEXT:    add.s f8, f8, f9
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %fm = fmul contract float %a, %b
+  %res = fadd contract float %fm, %c
+  ret float %res
+}
+
+define float @fmsub_s_contract(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fmsub_s_contract:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI17_0
+; XTENSA-NEXT:    wfr f8, a8
+; XTENSA-NEXT:    wfr f9, a4
+; XTENSA-NEXT:    add.s f8, f9, f8
+; XTENSA-NEXT:    wfr f9, a3
+; XTENSA-NEXT:    wfr f10, a2
+; XTENSA-NEXT:    mul.s f9, f10, f9
+; XTENSA-NEXT:    sub.s f8, f9, f8
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %c_ = fadd float 0.0, %c ; avoid negation using xor
+  %fm = fmul contract float %a, %b
+  %res = fsub contract float %fm, %c_
+  ret float %res
+}
+
+define float @fnmadd_s_contract(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fnmadd_s_contract:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI18_0
+; XTENSA-NEXT:    wfr f8, a8
+; XTENSA-NEXT:    wfr f9, a3
+; XTENSA-NEXT:    add.s f9, f9, f8
+; XTENSA-NEXT:    wfr f10, a2
+; XTENSA-NEXT:    add.s f10, f10, f8
+; XTENSA-NEXT:    mul.s f9, f10, f9
+; XTENSA-NEXT:    neg.s f9, f9
+; XTENSA-NEXT:    wfr f10, a4
+; XTENSA-NEXT:    add.s f8, f10, f8
+; XTENSA-NEXT:    sub.s f8, f9, f8
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %a_ = fadd float 0.0, %a ; avoid negation using xor
+  %b_ = fadd float 0.0, %b ; avoid negation using xor
+  %c_ = fadd float 0.0, %c ; avoid negation using xor
+  %fm = fmul contract float %a_, %b_
+  %fn = fneg float %fm
+  %res = fsub contract float %fn, %c_
+  ret float %res
+}
+
+define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind {
+; XTENSA-LABEL: fnmsub_s_contract:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI19_0
+; XTENSA-NEXT:    wfr f8, a8
+; XTENSA-NEXT:    wfr f9, a3
+; XTENSA-NEXT:    add.s f9, f9, f8
+; XTENSA-NEXT:    wfr f10, a2
+; XTENSA-NEXT:    add.s f8, f10, f8
+; XTENSA-NEXT:    mul.s f8, f8, f9
+; XTENSA-NEXT:    wfr f9, a4
+; XTENSA-NEXT:    sub.s f8, f9, f8
+; XTENSA-NEXT:    rfr a2, f8
+; XTENSA-NEXT:    ret
+  %a_ = fadd float 0.0, %a ; avoid negation using xor
+  %b_ = fadd float 0.0, %b ; avoid negation using xor
+  %fm = fmul contract float %a_, %b_
+  %res = fsub contract float %c, %fm
+  ret float %res
+}
+
+declare float @llvm.powi.f32(float, i32)
+
+define float @powi_f32(float %a, i32 %b) nounwind {
+; XTENSA-LABEL: powi_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI20_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+
+  %res = call float @llvm.powi.f32(float %a, i32 %b)
+  ret float %res
+}
+
+declare float @llvm.sin.f32(float)
+
+define float @sin_f32(float %a) nounwind {
+; XTENSA-LABEL: sin_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI21_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.sin.f32(float %a)
+  ret float %res
+}
+
+declare float @llvm.cos.f32(float)
+
+define float @cos_f32(float %a) nounwind {
+; XTENSA-LABEL: cos_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI22_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.cos.f32(float %a)
+  ret float %res
+}
+declare float @llvm.exp.f32(float)
+
+define float @exp_f32(float %a) nounwind {
+; XTENSA-LABEL: exp_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI23_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.exp.f32(float %a)
+  ret float %res
+}
+
+define float @log_f32(float %a) nounwind {
+; XTENSA-LABEL: log_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI24_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.log.f32(float %a)
+  ret float %res
+}
+
+declare float @llvm.log10.f32(float)
+
+define float @log10_f32(float %a) nounwind {
+; XTENSA-LABEL: log10_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI25_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.log10.f32(float %a)
+  ret float %res
+}
+
+declare float @llvm.log2.f32(float)
+
+define float @log2_f32(float %a) nounwind {
+; XTENSA-LABEL: log2_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI26_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.log2.f32(float %a)
+  ret float %res
+}
+
+declare float @llvm.floor.f32(float)
+
+define float @floor_f32(float %a) nounwind {
+; XTENSA-LABEL: floor_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI27_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.floor.f32(float %a)
+  ret float %res
+}
+
+declare float @llvm.ceil.f32(float)
+
+define float @ceil_f32(float %a) nounwind {
+; XTENSA-LABEL: ceil_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI28_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.ceil.f32(float %a)
+  ret float %res
+}
+declare float @llvm.rint.f32(float)
+
+define float @rint_f32(float %a) nounwind {
+; XTENSA-LABEL: rint_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI29_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.rint.f32(float %a)
+  ret float %res
+}
+
+declare float @llvm.nearbyint.f32(float)
+
+define float @nearbyint_f32(float %a) nounwind {
+; XTENSA-LABEL: nearbyint_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI30_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.nearbyint.f32(float %a)
+  ret float %res
+}
+
+declare float @llvm.round.f32(float)
+
+define float @round_f32(float %a) nounwind {
+; XTENSA-LABEL: round_f32:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    addi a8, a1, -16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    s32i a0, a1, 0 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI31_0
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    l32i a0, a1, 0 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 16
+; XTENSA-NEXT:    or a1, a8, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.round.f32(float %a)
+  ret float %res
+}
+
+
+define float @fneg_s(float %a) nounwind {
+; XTENSA-LABEL: fneg_s:
+; XTENSA:       # %bb.0:
+; XTENSA-NEXT:    l32r a8, .LCPI32_0
+; XTENSA-NEXT:    and a2, a2, a8
+; XTENSA-NEXT:    ret
+  %res = call float @llvm.fabs.f32(float %a)
+  ret float %res
+}
+
+define i32 @fptosi(float %f) {
+; XTENSA-LABEL: fptosi:
+; XTENSA:         .cfi_startproc
+; XTENSA-NEXT:  # %bb.0:
+; XTENSA-NEXT:    wfr f8, a2
+; XTENSA-NEXT:    trunc.s a2, f8, 0
+; XTENSA-NEXT:    ret
+  %conv = fptosi float %f to i32
+  ret i32 %conv
+}
+
+define i32 @fptoui(float %f) {
+; XTENSA-LABEL: fptoui:
+; XTENSA:         .cfi_startproc
+; XTENSA-NEXT:  # %bb.0:
+; XTENSA-NEXT:    wfr f8, a2
+; XTENSA-NEXT:    utrunc.s a2, f8, 0
+; XTENSA-NEXT:    ret
+  %conv = fptoui float %f to i32
+  ret i32 %conv
+}
+
diff --git a/llvm/test/CodeGen/Xtensa/mul.ll b/llvm/test/CodeGen/Xtensa/mul.ll
index 690132382228..4dd03e408fcf 100644
--- a/llvm/test/CodeGen/Xtensa/mul.ll
+++ b/llvm/test/CodeGen/Xtensa/mul.ll
@@ -767,51 +767,52 @@ define i64 @muli64_m3840(i64 %a) nounwind {
 define i128 @muli128_m3840(i128 %a) nounwind {
 ; XTENSA-LABEL: muli128_m3840:
 ; XTENSA:       # %bb.0:
-; XTENSA-NEXT:    addi a8, a1, -80
+; XTENSA-NEXT:    addi a8, a1, -64
 ; XTENSA-NEXT:    or a1, a8, a8
-; XTENSA-NEXT:    s32i a0, a1, 64 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a12, a1, 60 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a13, a1, 56 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a14, a1, 52 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a15, a1, 48 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a0, a1, 60 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a12, a1, 56 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a13, a1, 52 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a14, a1, 48 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a15, a1, 44 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a5, a1, 20 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a4, a1, 16 # 4-byte Folded Spill
-; XTENSA-NEXT:    or a15, a3, a3
-; XTENSA-NEXT:    l32r a14, .LCPI30_0
+; XTENSA-NEXT:    or a13, a3, a3
+; XTENSA-NEXT:    l32r a15, .LCPI30_0
 ; XTENSA-NEXT:    movi a12, 0
-; XTENSA-NEXT:    l32r a13, .LCPI30_1
+; XTENSA-NEXT:    l32r a8, .LCPI30_1
 ; XTENSA-NEXT:    s32i a2, a1, 36 # 4-byte Folded Spill
 ; XTENSA-NEXT:    or a3, a12, a12
-; XTENSA-NEXT:    or a4, a14, a14
+; XTENSA-NEXT:    or a4, a15, a15
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    s32i a2, a1, 28 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a3, a1, 44 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a15, a1, 40 # 4-byte Folded Spill
-; XTENSA-NEXT:    or a2, a15, a15
+; XTENSA-NEXT:    or a14, a3, a3
+; XTENSA-NEXT:    l32r a8, .LCPI30_2
+; XTENSA-NEXT:    s32i a13, a1, 40 # 4-byte Folded Spill
+; XTENSA-NEXT:    or a2, a13, a13
 ; XTENSA-NEXT:    or a3, a12, a12
-; XTENSA-NEXT:    s32i a14, a1, 12 # 4-byte Folded Spill
-; XTENSA-NEXT:    or a4, a14, a14
+; XTENSA-NEXT:    s32i a15, a1, 12 # 4-byte Folded Spill
+; XTENSA-NEXT:    or a4, a15, a15
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
-; XTENSA-NEXT:    l32i a8, a1, 44 # 4-byte Folded Reload
-; XTENSA-NEXT:    add a15, a2, a8
-; XTENSA-NEXT:    movi a8, 1
-; XTENSA-NEXT:    s32i a8, a1, 44 # 4-byte Folded Spill
-; XTENSA-NEXT:    bltu a15, a2, .LBB30_2
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    add a13, a2, a14
+; XTENSA-NEXT:    movi a15, 1
+; XTENSA-NEXT:    or a8, a15, a15
+; XTENSA-NEXT:    bltu a13, a2, .LBB30_2
 ; XTENSA-NEXT:  # %bb.1:
 ; XTENSA-NEXT:    or a8, a12, a12
 ; XTENSA-NEXT:  .LBB30_2:
 ; XTENSA-NEXT:    add a8, a3, a8
 ; XTENSA-NEXT:    s32i a8, a1, 32 # 4-byte Folded Spill
 ; XTENSA-NEXT:    movi a14, -1
+; XTENSA-NEXT:    l32r a8, .LCPI30_3
 ; XTENSA-NEXT:    l32i a2, a1, 36 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a3, a12, a12
 ; XTENSA-NEXT:    or a4, a14, a14
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
-; XTENSA-NEXT:    add a9, a2, a15
-; XTENSA-NEXT:    l32i a8, a1, 44 # 4-byte Folded Reload
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    add a9, a2, a13
+; XTENSA-NEXT:    or a8, a15, a15
 ; XTENSA-NEXT:    s32i a9, a1, 24 # 4-byte Folded Spill
 ; XTENSA-NEXT:    bltu a9, a2, .LBB30_4
 ; XTENSA-NEXT:  # %bb.3:
@@ -819,70 +820,72 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; XTENSA-NEXT:  .LBB30_4:
 ; XTENSA-NEXT:    add a8, a3, a8
 ; XTENSA-NEXT:    l32i a9, a1, 32 # 4-byte Folded Reload
-; XTENSA-NEXT:    add a15, a9, a8
+; XTENSA-NEXT:    add a13, a9, a8
+; XTENSA-NEXT:    l32r a8, .LCPI30_4
 ; XTENSA-NEXT:    l32i a2, a1, 40 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a3, a12, a12
 ; XTENSA-NEXT:    or a4, a14, a14
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    s32i a3, a1, 4 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a15, a1, 8 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a13, a1, 8 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a2, a1, 0 # 4-byte Folded Spill
-; XTENSA-NEXT:    add a15, a2, a15
+; XTENSA-NEXT:    add a13, a2, a13
+; XTENSA-NEXT:    l32r a8, .LCPI30_5
 ; XTENSA-NEXT:    l32i a2, a1, 16 # 4-byte Folded Reload
 ; XTENSA-NEXT:    l32i a3, a1, 20 # 4-byte Folded Reload
 ; XTENSA-NEXT:    l32i a4, a1, 12 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a5, a14, a14
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    s32i a2, a1, 16 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a3, a1, 20 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI30_6
 ; XTENSA-NEXT:    l32i a2, a1, 36 # 4-byte Folded Reload
 ; XTENSA-NEXT:    l32i a3, a1, 40 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a4, a14, a14
 ; XTENSA-NEXT:    or a5, a14, a14
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    l32i a8, a1, 16 # 4-byte Folded Reload
 ; XTENSA-NEXT:    add a9, a2, a8
-; XTENSA-NEXT:    add a4, a15, a9
-; XTENSA-NEXT:    l32i a7, a1, 44 # 4-byte Folded Reload
-; XTENSA-NEXT:    or a8, a7, a7
-; XTENSA-NEXT:    bltu a4, a15, .LBB30_6
+; XTENSA-NEXT:    add a4, a13, a9
+; XTENSA-NEXT:    or a8, a15, a15
+; XTENSA-NEXT:    bltu a4, a13, .LBB30_6
 ; XTENSA-NEXT:  # %bb.5:
 ; XTENSA-NEXT:    or a8, a12, a12
 ; XTENSA-NEXT:  .LBB30_6:
-; XTENSA-NEXT:    or a10, a7, a7
+; XTENSA-NEXT:    or a10, a15, a15
 ; XTENSA-NEXT:    l32i a11, a1, 0 # 4-byte Folded Reload
-; XTENSA-NEXT:    bltu a15, a11, .LBB30_8
+; XTENSA-NEXT:    bltu a13, a11, .LBB30_8
 ; XTENSA-NEXT:  # %bb.7:
 ; XTENSA-NEXT:    or a10, a12, a12
 ; XTENSA-NEXT:  .LBB30_8:
-; XTENSA-NEXT:    or a11, a7, a7
-; XTENSA-NEXT:    l32i a6, a1, 32 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a5, a1, 8 # 4-byte Folded Reload
-; XTENSA-NEXT:    bltu a5, a6, .LBB30_10
+; XTENSA-NEXT:    or a11, a15, a15
+; XTENSA-NEXT:    l32i a7, a1, 32 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a6, a1, 8 # 4-byte Folded Reload
+; XTENSA-NEXT:    bltu a6, a7, .LBB30_10
 ; XTENSA-NEXT:  # %bb.9:
 ; XTENSA-NEXT:    or a11, a12, a12
 ; XTENSA-NEXT:  .LBB30_10:
-; XTENSA-NEXT:    l32i a6, a1, 4 # 4-byte Folded Reload
-; XTENSA-NEXT:    add a11, a6, a11
+; XTENSA-NEXT:    l32i a7, a1, 4 # 4-byte Folded Reload
+; XTENSA-NEXT:    add a11, a7, a11
 ; XTENSA-NEXT:    add a10, a11, a10
 ; XTENSA-NEXT:    bltu a9, a2, .LBB30_12
 ; XTENSA-NEXT:  # %bb.11:
-; XTENSA-NEXT:    or a7, a12, a12
+; XTENSA-NEXT:    or a15, a12, a12
 ; XTENSA-NEXT:  .LBB30_12:
 ; XTENSA-NEXT:    l32i a9, a1, 20 # 4-byte Folded Reload
 ; XTENSA-NEXT:    add a9, a3, a9
-; XTENSA-NEXT:    add a9, a9, a7
+; XTENSA-NEXT:    add a9, a9, a15
 ; XTENSA-NEXT:    add a9, a10, a9
 ; XTENSA-NEXT:    add a5, a9, a8
 ; XTENSA-NEXT:    l32i a2, a1, 28 # 4-byte Folded Reload
 ; XTENSA-NEXT:    l32i a3, a1, 24 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a15, a1, 48 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a14, a1, 52 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a13, a1, 56 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a12, a1, 60 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a0, a1, 64 # 4-byte Folded Reload
-; XTENSA-NEXT:    addi a8, a1, 80
+; XTENSA-NEXT:    l32i a15, a1, 44 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a14, a1, 48 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a13, a1, 52 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a12, a1, 56 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a0, a1, 60 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 64
 ; XTENSA-NEXT:    or a1, a8, a8
 ; XTENSA-NEXT:    ret
 ;
@@ -980,51 +983,52 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 define i128 @muli128_m63(i128 %a) nounwind {
 ; XTENSA-LABEL: muli128_m63:
 ; XTENSA:       # %bb.0:
-; XTENSA-NEXT:    addi a8, a1, -80
+; XTENSA-NEXT:    addi a8, a1, -64
 ; XTENSA-NEXT:    or a1, a8, a8
-; XTENSA-NEXT:    s32i a0, a1, 64 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a12, a1, 60 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a13, a1, 56 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a14, a1, 52 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a15, a1, 48 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a0, a1, 60 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a12, a1, 56 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a13, a1, 52 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a14, a1, 48 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a15, a1, 44 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a5, a1, 20 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a4, a1, 16 # 4-byte Folded Spill
-; XTENSA-NEXT:    or a15, a3, a3
-; XTENSA-NEXT:    movi a14, -63
+; XTENSA-NEXT:    or a13, a3, a3
+; XTENSA-NEXT:    movi a15, -63
 ; XTENSA-NEXT:    movi a12, 0
-; XTENSA-NEXT:    l32r a13, .LCPI31_0
+; XTENSA-NEXT:    l32r a8, .LCPI31_0
 ; XTENSA-NEXT:    s32i a2, a1, 36 # 4-byte Folded Spill
 ; XTENSA-NEXT:    or a3, a12, a12
-; XTENSA-NEXT:    or a4, a14, a14
+; XTENSA-NEXT:    or a4, a15, a15
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    s32i a2, a1, 28 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a3, a1, 44 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a15, a1, 40 # 4-byte Folded Spill
-; XTENSA-NEXT:    or a2, a15, a15
+; XTENSA-NEXT:    or a14, a3, a3
+; XTENSA-NEXT:    l32r a8, .LCPI31_1
+; XTENSA-NEXT:    s32i a13, a1, 40 # 4-byte Folded Spill
+; XTENSA-NEXT:    or a2, a13, a13
 ; XTENSA-NEXT:    or a3, a12, a12
-; XTENSA-NEXT:    s32i a14, a1, 12 # 4-byte Folded Spill
-; XTENSA-NEXT:    or a4, a14, a14
+; XTENSA-NEXT:    s32i a15, a1, 12 # 4-byte Folded Spill
+; XTENSA-NEXT:    or a4, a15, a15
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
-; XTENSA-NEXT:    l32i a8, a1, 44 # 4-byte Folded Reload
-; XTENSA-NEXT:    add a15, a2, a8
-; XTENSA-NEXT:    movi a8, 1
-; XTENSA-NEXT:    s32i a8, a1, 44 # 4-byte Folded Spill
-; XTENSA-NEXT:    bltu a15, a2, .LBB31_2
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    add a13, a2, a14
+; XTENSA-NEXT:    movi a15, 1
+; XTENSA-NEXT:    or a8, a15, a15
+; XTENSA-NEXT:    bltu a13, a2, .LBB31_2
 ; XTENSA-NEXT:  # %bb.1:
 ; XTENSA-NEXT:    or a8, a12, a12
 ; XTENSA-NEXT:  .LBB31_2:
 ; XTENSA-NEXT:    add a8, a3, a8
 ; XTENSA-NEXT:    s32i a8, a1, 32 # 4-byte Folded Spill
 ; XTENSA-NEXT:    movi a14, -1
+; XTENSA-NEXT:    l32r a8, .LCPI31_2
 ; XTENSA-NEXT:    l32i a2, a1, 36 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a3, a12, a12
 ; XTENSA-NEXT:    or a4, a14, a14
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
-; XTENSA-NEXT:    add a9, a2, a15
-; XTENSA-NEXT:    l32i a8, a1, 44 # 4-byte Folded Reload
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    add a9, a2, a13
+; XTENSA-NEXT:    or a8, a15, a15
 ; XTENSA-NEXT:    s32i a9, a1, 24 # 4-byte Folded Spill
 ; XTENSA-NEXT:    bltu a9, a2, .LBB31_4
 ; XTENSA-NEXT:  # %bb.3:
@@ -1032,70 +1036,72 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; XTENSA-NEXT:  .LBB31_4:
 ; XTENSA-NEXT:    add a8, a3, a8
 ; XTENSA-NEXT:    l32i a9, a1, 32 # 4-byte Folded Reload
-; XTENSA-NEXT:    add a15, a9, a8
+; XTENSA-NEXT:    add a13, a9, a8
+; XTENSA-NEXT:    l32r a8, .LCPI31_3
 ; XTENSA-NEXT:    l32i a2, a1, 40 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a3, a12, a12
 ; XTENSA-NEXT:    or a4, a14, a14
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    s32i a3, a1, 4 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a15, a1, 8 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a13, a1, 8 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a2, a1, 0 # 4-byte Folded Spill
-; XTENSA-NEXT:    add a15, a2, a15
+; XTENSA-NEXT:    add a13, a2, a13
+; XTENSA-NEXT:    l32r a8, .LCPI31_4
 ; XTENSA-NEXT:    l32i a2, a1, 16 # 4-byte Folded Reload
 ; XTENSA-NEXT:    l32i a3, a1, 20 # 4-byte Folded Reload
 ; XTENSA-NEXT:    l32i a4, a1, 12 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a5, a14, a14
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    s32i a2, a1, 16 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a3, a1, 20 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI31_5
 ; XTENSA-NEXT:    l32i a2, a1, 36 # 4-byte Folded Reload
 ; XTENSA-NEXT:    l32i a3, a1, 40 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a4, a14, a14
 ; XTENSA-NEXT:    or a5, a14, a14
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    l32i a8, a1, 16 # 4-byte Folded Reload
 ; XTENSA-NEXT:    add a9, a2, a8
-; XTENSA-NEXT:    add a4, a15, a9
-; XTENSA-NEXT:    l32i a7, a1, 44 # 4-byte Folded Reload
-; XTENSA-NEXT:    or a8, a7, a7
-; XTENSA-NEXT:    bltu a4, a15, .LBB31_6
+; XTENSA-NEXT:    add a4, a13, a9
+; XTENSA-NEXT:    or a8, a15, a15
+; XTENSA-NEXT:    bltu a4, a13, .LBB31_6
 ; XTENSA-NEXT:  # %bb.5:
 ; XTENSA-NEXT:    or a8, a12, a12
 ; XTENSA-NEXT:  .LBB31_6:
-; XTENSA-NEXT:    or a10, a7, a7
+; XTENSA-NEXT:    or a10, a15, a15
 ; XTENSA-NEXT:    l32i a11, a1, 0 # 4-byte Folded Reload
-; XTENSA-NEXT:    bltu a15, a11, .LBB31_8
+; XTENSA-NEXT:    bltu a13, a11, .LBB31_8
 ; XTENSA-NEXT:  # %bb.7:
 ; XTENSA-NEXT:    or a10, a12, a12
 ; XTENSA-NEXT:  .LBB31_8:
-; XTENSA-NEXT:    or a11, a7, a7
-; XTENSA-NEXT:    l32i a6, a1, 32 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a5, a1, 8 # 4-byte Folded Reload
-; XTENSA-NEXT:    bltu a5, a6, .LBB31_10
+; XTENSA-NEXT:    or a11, a15, a15
+; XTENSA-NEXT:    l32i a7, a1, 32 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a6, a1, 8 # 4-byte Folded Reload
+; XTENSA-NEXT:    bltu a6, a7, .LBB31_10
 ; XTENSA-NEXT:  # %bb.9:
 ; XTENSA-NEXT:    or a11, a12, a12
 ; XTENSA-NEXT:  .LBB31_10:
-; XTENSA-NEXT:    l32i a6, a1, 4 # 4-byte Folded Reload
-; XTENSA-NEXT:    add a11, a6, a11
+; XTENSA-NEXT:    l32i a7, a1, 4 # 4-byte Folded Reload
+; XTENSA-NEXT:    add a11, a7, a11
 ; XTENSA-NEXT:    add a10, a11, a10
 ; XTENSA-NEXT:    bltu a9, a2, .LBB31_12
 ; XTENSA-NEXT:  # %bb.11:
-; XTENSA-NEXT:    or a7, a12, a12
+; XTENSA-NEXT:    or a15, a12, a12
 ; XTENSA-NEXT:  .LBB31_12:
 ; XTENSA-NEXT:    l32i a9, a1, 20 # 4-byte Folded Reload
 ; XTENSA-NEXT:    add a9, a3, a9
-; XTENSA-NEXT:    add a9, a9, a7
+; XTENSA-NEXT:    add a9, a9, a15
 ; XTENSA-NEXT:    add a9, a10, a9
 ; XTENSA-NEXT:    add a5, a9, a8
 ; XTENSA-NEXT:    l32i a2, a1, 28 # 4-byte Folded Reload
 ; XTENSA-NEXT:    l32i a3, a1, 24 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a15, a1, 48 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a14, a1, 52 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a13, a1, 56 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a12, a1, 60 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a0, a1, 64 # 4-byte Folded Reload
-; XTENSA-NEXT:    addi a8, a1, 80
+; XTENSA-NEXT:    l32i a15, a1, 44 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a14, a1, 48 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a13, a1, 52 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a12, a1, 56 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a0, a1, 60 # 4-byte Folded Reload
+; XTENSA-NEXT:    addi a8, a1, 64
 ; XTENSA-NEXT:    or a1, a8, a8
 ; XTENSA-NEXT:    ret
 ;
@@ -1195,30 +1201,30 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; XTENSA:       # %bb.0:
 ; XTENSA-NEXT:    addi a8, a1, -64
 ; XTENSA-NEXT:    or a1, a8, a8
-; XTENSA-NEXT:    s32i a0, a1, 56 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a12, a1, 52 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a13, a1, 48 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a14, a1, 44 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a15, a1, 40 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a0, a1, 52 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a12, a1, 48 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a13, a1, 44 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a14, a1, 40 # 4-byte Folded Spill
+; XTENSA-NEXT:    s32i a15, a1, 36 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a5, a1, 28 # 4-byte Folded Spill
-; XTENSA-NEXT:    or a14, a4, a4
+; XTENSA-NEXT:    or a13, a4, a4
 ; XTENSA-NEXT:    or a15, a3, a3
 ; XTENSA-NEXT:    movi a12, 0
-; XTENSA-NEXT:    l32r a13, .LCPI32_0
+; XTENSA-NEXT:    l32r a8, .LCPI32_0
 ; XTENSA-NEXT:    s32i a2, a1, 32 # 4-byte Folded Spill
 ; XTENSA-NEXT:    or a3, a12, a12
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
-; XTENSA-NEXT:    s32i a3, a1, 24 # 4-byte Folded Spill
-; XTENSA-NEXT:    s32i a15, a1, 36 # 4-byte Folded Spill
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    or a14, a3, a3
+; XTENSA-NEXT:    l32r a8, .LCPI32_1
+; XTENSA-NEXT:    s32i a15, a1, 20 # 4-byte Folded Spill
 ; XTENSA-NEXT:    or a2, a15, a15
 ; XTENSA-NEXT:    or a3, a12, a12
-; XTENSA-NEXT:    s32i a14, a1, 16 # 4-byte Folded Spill
-; XTENSA-NEXT:    or a4, a14, a14
+; XTENSA-NEXT:    s32i a13, a1, 16 # 4-byte Folded Spill
+; XTENSA-NEXT:    or a4, a13, a13
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
-; XTENSA-NEXT:    l32i a8, a1, 24 # 4-byte Folded Reload
-; XTENSA-NEXT:    add a14, a2, a8
+; XTENSA-NEXT:    callx0 a8
+; XTENSA-NEXT:    add a14, a2, a14
 ; XTENSA-NEXT:    movi a15, 1
 ; XTENSA-NEXT:    or a8, a15, a15
 ; XTENSA-NEXT:    bltu a14, a2, .LBB32_2
@@ -1227,13 +1233,13 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; XTENSA-NEXT:  .LBB32_2:
 ; XTENSA-NEXT:    add a8, a3, a8
 ; XTENSA-NEXT:    s32i a8, a1, 24 # 4-byte Folded Spill
+; XTENSA-NEXT:    l32r a8, .LCPI32_2
 ; XTENSA-NEXT:    l32i a2, a1, 32 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a3, a12, a12
 ; XTENSA-NEXT:    l32i a4, a1, 28 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    add a9, a2, a14
-; XTENSA-NEXT:    s32i a15, a1, 20 # 4-byte Folded Spill
 ; XTENSA-NEXT:    or a8, a15, a15
 ; XTENSA-NEXT:    bltu a9, a2, .LBB32_4
 ; XTENSA-NEXT:  # %bb.3:
@@ -1242,68 +1248,70 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; XTENSA-NEXT:    add a8, a3, a8
 ; XTENSA-NEXT:    l32i a9, a1, 24 # 4-byte Folded Reload
 ; XTENSA-NEXT:    add a14, a9, a8
-; XTENSA-NEXT:    l32i a2, a1, 36 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32r a8, .LCPI32_3
+; XTENSA-NEXT:    l32i a2, a1, 20 # 4-byte Folded Reload
 ; XTENSA-NEXT:    or a3, a12, a12
-; XTENSA-NEXT:    l32i a15, a1, 28 # 4-byte Folded Reload
-; XTENSA-NEXT:    or a4, a15, a15
+; XTENSA-NEXT:    l32i a13, a1, 28 # 4-byte Folded Reload
+; XTENSA-NEXT:    or a4, a13, a13
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    s32i a3, a1, 8 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a14, a1, 12 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a2, a1, 4 # 4-byte Folded Spill
 ; XTENSA-NEXT:    add a14, a2, a14
+; XTENSA-NEXT:    l32r a8, .LCPI32_4
 ; XTENSA-NEXT:    l32i a2, a1, 16 # 4-byte Folded Reload
-; XTENSA-NEXT:    or a3, a15, a15
+; XTENSA-NEXT:    or a3, a13, a13
 ; XTENSA-NEXT:    or a4, a12, a12
 ; XTENSA-NEXT:    or a5, a12, a12
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    s32i a2, a1, 0 # 4-byte Folded Spill
 ; XTENSA-NEXT:    s32i a3, a1, 16 # 4-byte Folded Spill
-; XTENSA-NEXT:    srai a2, a15, 31
+; XTENSA-NEXT:    srai a2, a13, 31
+; XTENSA-NEXT:    l32r a8, .LCPI32_5
 ; XTENSA-NEXT:    or a3, a2, a2
 ; XTENSA-NEXT:    l32i a4, a1, 32 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a5, a1, 36 # 4-byte Folded Reload
-; XTENSA-NEXT:    callx0 a13
+; XTENSA-NEXT:    l32i a5, a1, 20 # 4-byte Folded Reload
+; XTENSA-NEXT:    callx0 a8
 ; XTENSA-NEXT:    or a8, a2, a2
 ; XTENSA-NEXT:    l32i a9, a1, 0 # 4-byte Folded Reload
 ; XTENSA-NEXT:    add a10, a8, a9
 ; XTENSA-NEXT:    add a2, a14, a10
-; XTENSA-NEXT:    l32i a6, a1, 20 # 4-byte Folded Reload
-; XTENSA-NEXT:    or a9, a6, a6
+; XTENSA-NEXT:    or a9, a15, a15
 ; XTENSA-NEXT:    bltu a2, a14, .LBB32_6
 ; XTENSA-NEXT:  # %bb.5:
 ; XTENSA-NEXT:    or a9, a12, a12
 ; XTENSA-NEXT:  .LBB32_6:
-; XTENSA-NEXT:    or a11, a6, a6
+; XTENSA-NEXT:    or a11, a15, a15
 ; XTENSA-NEXT:    l32i a7, a1, 4 # 4-byte Folded Reload
 ; XTENSA-NEXT:    bltu a14, a7, .LBB32_8
 ; XTENSA-NEXT:  # %bb.7:
 ; XTENSA-NEXT:    or a11, a12, a12
 ; XTENSA-NEXT:  .LBB32_8:
-; XTENSA-NEXT:    or a7, a6, a6
-; XTENSA-NEXT:    l32i a5, a1, 24 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a4, a1, 12 # 4-byte Folded Reload
-; XTENSA-NEXT:    bltu a4, a5, .LBB32_10
+; XTENSA-NEXT:    or a7, a15, a15
+; XTENSA-NEXT:    l32i a6, a1, 24 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a5, a1, 12 # 4-byte Folded Reload
+; XTENSA-NEXT:    bltu a5, a6, .LBB32_10
 ; XTENSA-NEXT:  # %bb.9:
 ; XTENSA-NEXT:    or a7, a12, a12
 ; XTENSA-NEXT:  .LBB32_10:
-; XTENSA-NEXT:    l32i a5, a1, 8 # 4-byte Folded Reload
-; XTENSA-NEXT:    add a7, a5, a7
+; XTENSA-NEXT:    l32i a6, a1, 8 # 4-byte Folded Reload
+; XTENSA-NEXT:    add a7, a6, a7
 ; XTENSA-NEXT:    add a11, a7, a11
 ; XTENSA-NEXT:    bltu a10, a8, .LBB32_12
 ; XTENSA-NEXT:  # %bb.11:
-; XTENSA-NEXT:    or a6, a12, a12
+; XTENSA-NEXT:    or a15, a12, a12
 ; XTENSA-NEXT:  .LBB32_12:
 ; XTENSA-NEXT:    l32i a8, a1, 16 # 4-byte Folded Reload
 ; XTENSA-NEXT:    add a8, a3, a8
-; XTENSA-NEXT:    add a8, a8, a6
+; XTENSA-NEXT:    add a8, a8, a15
 ; XTENSA-NEXT:    add a8, a11, a8
 ; XTENSA-NEXT:    add a3, a8, a9
-; XTENSA-NEXT:    l32i a15, a1, 40 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a14, a1, 44 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a13, a1, 48 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a12, a1, 52 # 4-byte Folded Reload
-; XTENSA-NEXT:    l32i a0, a1, 56 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a15, a1, 36 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a14, a1, 40 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a13, a1, 44 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a12, a1, 48 # 4-byte Folded Reload
+; XTENSA-NEXT:    l32i a0, a1, 52 # 4-byte Folded Reload
 ; XTENSA-NEXT:    addi a8, a1, 64
 ; XTENSA-NEXT:    or a1, a8, a8
 ; XTENSA-NEXT:    ret
diff --git a/llvm/test/MC/Disassembler/Xtensa/fp.txt b/llvm/test/MC/Disassembler/Xtensa/fp.txt
new file mode 100644
index 000000000000..993b9987c1bc
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/fp.txt
@@ -0,0 +1,215 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=xtensa -mattr=+fp,+bool -disassemble %s | FileCheck -check-prefixes=CHECK-FLOAT %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## fp option enabled. Also verify that dissasembling without
+## fp option generates warnings.
+
+[0x10,0x23,0xfa]
+# CHECK-FLOAT: abs.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x0a]
+# CHECK-FLOAT: add.s f2, f3, f4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xe0,0x23,0xfa]
+# CHECK-FLOAT: addexp.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xf0,0x23,0xfa]
+# CHECK-FLOAT: addexpm.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x50,0x23,0xba]
+# CHECK-FLOAT: ceil.s a2, f3, 5
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x30,0x35,0xfa]
+# CHECK-FLOAT: const.s f3, 5
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x70,0x23,0xfa]
+# CHECK-FLOAT: div0.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x7a]
+# CHECK-FLOAT: divn.s f2, f3, f4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x50,0x23,0xca]
+# CHECK-FLOAT: float.s f2, a3, 5
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x50,0x23,0xaa]
+# CHECK-FLOAT: floor.s a2, f3, 5
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x23,0x03,0x02]
+# CHECK-FLOAT: lsi f2, a3, 8
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x23,0x83,0x02]
+# CHECK-FLOAT: lsip f2, a3, 8
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x08]
+# CHECK-FLOAT: lsx f2, a3, a4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x18]
+# CHECK-FLOAT: lsxp f2, a3, a4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x4a]
+# CHECK-FLOAT: madd.s f2, f3, f4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x6a]
+# CHECK-FLOAT: maddn.s f2, f3, f4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xd0,0x23,0xfa]
+# CHECK-FLOAT: mkdadj.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xc0,0x23,0xfa]
+# CHECK-FLOAT: mksadj.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x23,0xfa]
+# CHECK-FLOAT: mov.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x8b]
+# CHECK-FLOAT: moveqz.s f2, f3, a4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x23,0xcb]
+# CHECK-FLOAT: movf.s f2, f3, b0
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0xbb]
+# CHECK-FLOAT: movgez.s f2, f3, a4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0xab]
+# CHECK-FLOAT: movltz.s f2, f3, a4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x9b]
+# CHECK-FLOAT: movnez.s f2, f3, a4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x23,0xdb]
+# CHECK-FLOAT: movt.s f2, f3, b0
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x5a]
+# CHECK-FLOAT: msub.s f2, f3, f4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x2a]
+# CHECK-FLOAT: mul.s f2, f3, f4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x60,0x23,0xfa]
+# CHECK-FLOAT: neg.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xb0,0x23,0xfa]
+# CHECK-FLOAT: nexp01.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x30,0x02,0x2b]
+# CHECK-FLOAT: oeq.s b0, f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x30,0x02,0x6b]
+# CHECK-FLOAT: ole.s b0, f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x30,0x02,0x4b]
+# CHECK-FLOAT: olt.s b0, f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x80,0x23,0xfa]
+# CHECK-FLOAT: recip0.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0xfa]
+# CHECK-FLOAT: rfr a2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x50,0x23,0x8a]
+# CHECK-FLOAT: round.s a2, f3, 5
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xa0,0x23,0xfa]
+# CHECK-FLOAT: rsqrt0.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x90,0x23,0xfa]
+# CHECK-FLOAT: sqrt0.s f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x23,0x43,0x02]
+# CHECK-FLOAT: ssi f2, a3, 8
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x23,0xc3,0x02]
+# CHECK-FLOAT: ssip f2, a3, 8
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x48]
+# CHECK-FLOAT: ssx f2, a3, a4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x58]
+# CHECK-FLOAT: ssxp f2, a3, a4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x40,0x23,0x1a]
+# CHECK-FLOAT: sub.s f2, f3, f4
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x50,0x23,0x9a]
+# CHECK-FLOAT: trunc.s a2, f3, 5
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x30,0x02,0x3b]
+# CHECK-FLOAT: ueq.s b0, f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x50,0x23,0xda]
+# CHECK-FLOAT: ufloat.s f2, a3, 5
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x30,0x02,0x7b]
+# CHECK-FLOAT: ule.s b0, f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x30,0x02,0x5b]
+# CHECK-FLOAT: ult.s b0, f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x30,0x02,0x1b]
+# CHECK-FLOAT: un.s b0, f2, f3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x50,0x23,0xea]
+# CHECK-FLOAT: utrunc.s a2, f3, 5
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x50,0x23,0xfa]
+# CHECK-FLOAT: wfr f2, a3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x80,0x3e,0xe3]
+# CHECK-FLOAT: rur a3, fcr
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x90,0x3e,0xe3]
+# CHECK-FLOAT: rur a3, fsr
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Xtensa/float-err.s b/llvm/test/MC/Xtensa/float-err.s
new file mode 100644
index 000000000000..34c41fab1808
--- /dev/null
+++ b/llvm/test/MC/Xtensa/float-err.s
@@ -0,0 +1,37 @@
+# RUN: not llvm-mc %s -triple=xtensa -mattr=+fp -filetype=asm 2>&1 | FileCheck --implicit-check-not=error: %s
+
+ceil.s	a2, f3, 17
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 15]
+
+const.s	f3, 18
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 15]
+
+float.s	f2, a3, 16
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 15]
+
+ufloat.s	f2, a3, 25
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 15]
+
+floor.s	a2, f3, 17
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 15]
+
+lsi f2, a3, 4099
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 1020]
+
+lsip f2, a3, 4099
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 1020]
+
+round.s	a2, f3, 20
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 15]
+
+ssi f2, a3, 5000
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 1020]
+
+ssip f2, a3, 5001
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 1020]
+
+trunc.s	a2, f3, 21
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 15]
+
+utrunc.s	a2, f3, 19
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected immediate in range [0, 15]
diff --git a/llvm/test/MC/Xtensa/float.s b/llvm/test/MC/Xtensa/float.s
new file mode 100644
index 000000000000..08aa2a12878e
--- /dev/null
+++ b/llvm/test/MC/Xtensa/float.s
@@ -0,0 +1,209 @@
+# RUN: llvm-mc %s -triple=xtensa -mattr=+fp -mattr=+bool -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# CHECK-INST: abs.s	f2, f3
+# CHECK: encoding: [0x10,0x23,0xfa]
+	abs.s	f2, f3
+# CHECK-INST: add.s	f2, f3, f4
+# CHECK: encoding: [0x40,0x23,0x0a]
+	add.s	f2, f3, f4
+# CHECK-INST: addexp.s	f2, f3
+# CHECK: encoding: [0xe0,0x23,0xfa]
+	addexp.s	f2, f3
+# CHECK-INST: addexpm.s	f2, f3
+# CHECK: encoding: [0xf0,0x23,0xfa]
+	addexpm.s	f2, f3
+
+# CHECK-INST: ceil.s a2, f3, 5
+# CHECK: encoding: [0x50,0x23,0xba]
+	ceil.s	a2, f3, 5
+# CHECK-INST: const.s	f3, 5
+# CHECK: encoding: [0x30,0x35,0xfa]
+	const.s	f3, 5
+
+# CHECK-INST: div0.s f2, f3
+# CHECK: encoding: [0x70,0x23,0xfa]
+	div0.s f2, f3
+# CHECK-INST: divn.s f2, f3, f4
+# CHECK: encoding: [0x40,0x23,0x7a]
+	divn.s f2, f3, f4
+
+# CHECK-INST: float.s	f2, a3, 5
+# CHECK: encoding: [0x50,0x23,0xca]
+	float.s	f2, a3, 5
+# CHECK-INST: floor.s a2, f3, 5
+# CHECK: encoding: [0x50,0x23,0xaa]
+	floor.s	a2, f3, 5
+
+# CHECK-INST: lsi f2, a3, 8
+# CHECK: encoding: [0x23,0x03,0x02]
+	lsi f2, a3, 8
+# CHECK-INST: lsip f2, a3, 8
+# CHECK: encoding: [0x23,0x83,0x02]
+	lsip f2, a3, 8
+# CHECK-INST: lsx f2, a3, a4
+# CHECK: encoding: [0x40,0x23,0x08]
+	lsx f2, a3, a4
+# CHECK-INST: lsxp f2, a3, a4
+# CHECK: encoding: [0x40,0x23,0x18]
+	lsxp f2, a3, a4
+
+# CHECK-INST: madd.s f2, f3, f4
+# CHECK: encoding: [0x40,0x23,0x4a]
+	madd.s f2, f3, f4
+# CHECK-INST: maddn.s f2, f3, f4
+# CHECK: encoding: [0x40,0x23,0x6a]
+	maddn.s f2, f3, f4
+# CHECK-INST: mkdadj.s f2, f3
+# CHECK: encoding: [0xd0,0x23,0xfa]
+	mkdadj.s f2, f3
+# CHECK-INST: mksadj.s f2, f3
+# CHECK: encoding: [0xc0,0x23,0xfa]
+	mksadj.s f2, f3
+
+# CHECK-INST: mov.s f2, f3
+# CHECK: encoding: [0x00,0x23,0xfa]
+	mov.s f2, f3
+
+# CHECK-INST: moveqz.s f2, f3, a4
+# CHECK: encoding: [0x40,0x23,0x8b]
+	moveqz.s f2, f3, a4
+# CHECK-INST: movf.s f2, f3, b0
+# CHECK: encoding: [0x00,0x23,0xcb]
+	movf.s f2, f3, b0
+# CHECK-INST: movgez.s f2, f3, a4
+# CHECK: encoding: [0x40,0x23,0xbb]
+	movgez.s f2, f3, a4
+# CHECK-INST: movltz.s f2, f3, a4
+# CHECK: encoding: [0x40,0x23,0xab]
+	movltz.s f2, f3, a4
+# CHECK-INST: movnez.s f2, f3, a4
+# CHECK: encoding: [0x40,0x23,0x9b]
+	movnez.s f2, f3, a4
+# CHECK-INST: movt.s f2, f3, b0
+# CHECK: encoding: [0x00,0x23,0xdb]
+	movt.s f2, f3, b0
+
+# CHECK-INST: msub.s f2, f3, f4
+# CHECK: encoding: [0x40,0x23,0x5a]
+	msub.s f2, f3, f4
+# CHECK-INST: mul.s	f2, f3, f4
+# CHECK: encoding: [0x40,0x23,0x2a]
+	mul.s	f2, f3, f4
+# CHECK-INST: neg.s f2, f3
+# CHECK: encoding: [0x60,0x23,0xfa]
+	neg.s f2, f3
+
+# CHECK-INST: nexp01.s f2, f3
+# CHECK: encoding: [0xb0,0x23,0xfa]
+	nexp01.s f2, f3
+
+# CHECK-INST: oeq.s b0, f2, f3
+# CHECK: encoding: [0x30,0x02,0x2b]
+	oeq.s b0, f2, f3
+# CHECK-INST: ole.s b0, f2, f3
+# CHECK: encoding: [0x30,0x02,0x6b]
+	ole.s b0, f2, f3
+# CHECK-INST: olt.s b0, f2, f3
+# CHECK: encoding: [0x30,0x02,0x4b]
+	olt.s b0, f2, f3
+
+# CHECK-INST: recip0.s f2, f3
+# CHECK: encoding: [0x80,0x23,0xfa]
+	recip0.s f2, f3
+
+# CHECK-INST: rfr a2, f3
+# CHECK: encoding: [0x40,0x23,0xfa]
+	rfr a2, f3
+
+# CHECK-INST: round.s a2, f3, 5
+# CHECK: encoding: [0x50,0x23,0x8a]
+	round.s	a2, f3, 5
+# CHECK-INST: rsqrt0.s f2, f3
+# CHECK: encoding: [0xa0,0x23,0xfa]
+	rsqrt0.s f2, f3
+# CHECK-INST: sqrt0.s f2, f3
+# CHECK: encoding: [0x90,0x23,0xfa]
+	sqrt0.s f2, f3
+
+# CHECK-INST: ssi f2, a3, 8
+# CHECK: encoding: [0x23,0x43,0x02]
+	ssi f2, a3, 8
+# CHECK-INST: ssip f2, a3, 8
+# CHECK: encoding: [0x23,0xc3,0x02]
+	ssip f2, a3, 8
+# CHECK-INST: ssx f2, a3, a4
+# CHECK: encoding: [0x40,0x23,0x48]
+	ssx f2, a3, a4
+# CHECK-INST: ssxp f2, a3, a4
+# CHECK: encoding: [0x40,0x23,0x58]
+	ssxp f2, a3, a4
+
+# CHECK-INST: sub.s	f2, f3, f4
+# CHECK: encoding: [0x40,0x23,0x1a]
+	sub.s	f2, f3, f4
+
+# CHECK-INST: trunc.s a2, f3, 5
+# CHECK: encoding: [0x50,0x23,0x9a]
+	trunc.s	a2, f3, 5
+
+# CHECK-INST: ueq.s b0, f2, f3
+# CHECK: encoding: [0x30,0x02,0x3b]
+	ueq.s b0, f2, f3
+
+# CHECK-INST: ufloat.s	f2, a3, 5
+# CHECK: encoding: [0x50,0x23,0xda]
+	ufloat.s	f2, a3, 5
+
+# CHECK-INST: ule.s b0, f2, f3
+# CHECK: encoding: [0x30,0x02,0x7b]
+	ule.s b0, f2, f3
+# CHECK-INST: ult.s b0, f2, f3
+# CHECK: encoding: [0x30,0x02,0x5b]
+	ult.s b0, f2, f3
+# CHECK-INST: un.s b0, f2, f3
+# CHECK: encoding: [0x30,0x02,0x1b]
+	un.s b0, f2, f3
+
+# CHECK-INST: utrunc.s a2, f3, 5
+# CHECK: encoding: [0x50,0x23,0xea]
+	utrunc.s	a2, f3, 5
+
+# CHECK-INST: wfr f2, a3
+# CHECK: encoding: [0x50,0x23,0xfa]
+	wfr f2, a3
+
+# CHECK-INST: rur a3, fcr
+# CHECK: encoding: [0x80,0x3e,0xe3]
+	rur a3, fcr
+
+# CHECK-INST: rur a3, fcr
+# CHECK: encoding: [0x80,0x3e,0xe3]
+	rur a3, 232
+
+# CHECK-INST: rur a3, fcr
+# CHECK: encoding: [0x80,0x3e,0xe3]
+	rur.fcr a3
+
+# CHECK-INST: wur a3, fcr
+# CHECK: encoding: [0x30,0xe8,0xf3]
+	wur a3, fcr
+
+# CHECK-INST: rur a3, fsr
+# CHECK: encoding: [0x90,0x3e,0xe3]
+	rur a3, fsr
+
+# CHECK-INST: rur a3, fsr
+# CHECK: encoding: [0x90,0x3e,0xe3]
+	rur a3, 233
+
+# CHECK-INST: rur a3, fsr
+# CHECK: encoding: [0x90,0x3e,0xe3]
+	rur.fsr a3
+
+# CHECK-INST: wur a3, fsr
+# CHECK: encoding: [0x30,0xe9,0xf3]
+	wur a3, fsr

From 9844085ea8132eae1086eaf55894db3904c189be Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 22 Jun 2025 16:31:34 -0700
Subject: [PATCH 1210/1322] [lldb] Migrate away from ValueRange(std::nullopt)
 (NFC) (#145245)

ArrayRef has a constructor that accepts std::nullopt.  This
constructor dates back to the days when we still had llvm::Optional.

Since the use of std::nullopt outside the context of std::optional is
kind of abuse and not intuitive to new comers, I would like to move
away from the constructor and eventually remove it.

This patch takes care of the lldb side of migration.
---
 lldb/source/Symbol/Function.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp
index 4fc793750f84..6114eccd935e 100644
--- a/lldb/source/Symbol/Function.cpp
+++ b/lldb/source/Symbol/Function.cpp
@@ -343,7 +343,7 @@ llvm::ArrayRef<std::unique_ptr<CallEdge>> Function::GetCallEdges() {
   Block &block = GetBlock(/*can_create*/true);
   SymbolFile *sym_file = block.GetSymbolFile();
   if (!sym_file)
-    return std::nullopt;
+    return {};
 
   // Lazily read call site information from the SymbolFile.
   m_call_edges = sym_file->ParseCallEdgesInFunction(GetID());

From f9fce4975bbad835deba6e639c21a62154dd8c14 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Mon, 23 Jun 2025 09:45:32 +1000
Subject: [PATCH 1211/1322] [ORC] Fix potential stack corruption in
 Platform::lookupInitSymbols.

We can't exit early when on error here as some threads may still be holding
references to LookupMutex.

Since we don't need high performance in the error case the easy solution is to
drop the early-exit in the error case and wait for all tasks to complete before
returning the error.

Thanks to Jameson Nash for spotting this bug!
---
 llvm/lib/ExecutionEngine/Orc/Core.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index 88bdba0ccc64..f47b7ecdcc7b 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -1530,7 +1530,7 @@ Expected<DenseMap<JITDylib *, SymbolMap>> Platform::lookupInitSymbols(
   }
 
   std::unique_lock<std::mutex> Lock(LookupMutex);
-  CV.wait(Lock, [&] { return Count == 0 || CompoundErr; });
+  CV.wait(Lock, [&] { return Count == 0; });
 
   if (CompoundErr)
     return std::move(CompoundErr);

From b00ddce731c65d0924e49bd2f81436e3077aeab2 Mon Sep 17 00:00:00 2001
From: Longsheng Mou <longshengmou@gmail.com>
Date: Mon, 23 Jun 2025 09:38:00 +0800
Subject: [PATCH 1212/1322] [mlir][affine] Fix a crash when cast incompatible
 type (#145162)

This PR fixes a crash in `getSemiAffineExprFromFlatForm` when localExpr
is not `AffineBinaryOpExpr`. Fixes #144091.
---
 mlir/lib/IR/AffineExpr.cpp                       | 10 +++++++---
 .../test/Dialect/Affine/simplify-structures.mlir | 16 ++++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp
index c8d9761511be..cc81f9d19aca 100644
--- a/mlir/lib/IR/AffineExpr.cpp
+++ b/mlir/lib/IR/AffineExpr.cpp
@@ -1174,11 +1174,15 @@ static AffineExpr getSemiAffineExprFromFlatForm(ArrayRef<int64_t> flatExprs,
   // the indices in `coefficients` map, and affine expression corresponding to
   // in indices in `indexToExprMap` map.
   for (const auto &it : llvm::enumerate(localExprs)) {
-    AffineExpr expr = it.value();
     if (flatExprs[numDims + numSymbols + it.index()] == 0)
       continue;
-    AffineExpr lhs = cast<AffineBinaryOpExpr>(expr).getLHS();
-    AffineExpr rhs = cast<AffineBinaryOpExpr>(expr).getRHS();
+    AffineExpr expr = it.value();
+    auto binaryExpr = dyn_cast<AffineBinaryOpExpr>(expr);
+    if (!binaryExpr)
+      continue;
+
+    AffineExpr lhs = binaryExpr.getLHS();
+    AffineExpr rhs = binaryExpr.getRHS();
     if (!((isa<AffineDimExpr>(lhs) || isa<AffineSymbolExpr>(lhs)) &&
           (isa<AffineDimExpr>(rhs) || isa<AffineSymbolExpr>(rhs) ||
            isa<AffineConstantExpr>(rhs)))) {
diff --git a/mlir/test/Dialect/Affine/simplify-structures.mlir b/mlir/test/Dialect/Affine/simplify-structures.mlir
index e4a8512b002e..6f2737a98275 100644
--- a/mlir/test/Dialect/Affine/simplify-structures.mlir
+++ b/mlir/test/Dialect/Affine/simplify-structures.mlir
@@ -592,3 +592,19 @@ func.func @semiaffine_modulo_dim(%arg0: index, %arg1: index, %arg2: index) -> in
   //CHECK: affine.apply #[[$MAP]]()[%{{.*}}, %{{.*}}, %{{.*}}]
   return %a : index
 }
+
+// -----
+
+// CHECK-LABEL: func @semiaffine_simplification_floordiv_and_ceildiv_const
+func.func @semiaffine_simplification_floordiv_and_ceildiv_const(%arg0: tensor<?xf32>) -> (index, index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c13 = arith.constant 13 : index
+  %dim = tensor.dim %arg0, %c0 : tensor<?xf32>
+  %a = affine.apply affine_map<()[s0, s1, s2] -> (s0 floordiv (s1 + (-s1 + 2) * (-s1 + s1 * s2 + 1)))>()[%c13, %dim, %c1]
+  %b = affine.apply affine_map<()[s0, s1, s2] -> (s0 ceildiv (s1 + (-s1 + 2) * (-s1 + s1 * s2 + 1)))>()[%c13, %dim, %c1]
+  // CHECK:      %[[C6:.*]] = arith.constant 6 : index
+  // CHECK-NEXT: %[[C7:.*]] = arith.constant 7 : index
+  // CHECK-NEXT: return %[[C6]], %[[C7]]
+  return %a, %b : index, index
+}

From a17b5bce8c9bfa7767827df392855cff9f2af372 Mon Sep 17 00:00:00 2001
From: Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com>
Date: Mon, 23 Jun 2025 04:00:11 +0200
Subject: [PATCH 1213/1322] [clang-reorder-fields] Prevent rewriting
 unsupported cases (#142149)

Add checks to prevent rewriting when doing so might result in incorrect
code. The following cases are checked:
- There are multiple field declarations in one statement like `int a, b`
- Multiple fields are created from a single macro expansion
- Preprocessor directives are present in the struct
---
 .../ReorderFieldsAction.cpp                   | 83 +++++++++++++++++++
 .../MacroExpandsToMultipleFields.cpp          | 13 +++
 .../MultipleFieldDeclsInStatement.cpp         | 11 +++
 .../PreprocessorDirectiveAroundDefinition.cpp | 15 ++++
 .../PreprocessorDirectiveAroundFields.cpp     | 15 ++++
 .../PreprocessorDirectiveInDefinition.cpp     | 16 ++++
 6 files changed, 153 insertions(+)
 create mode 100644 clang-tools-extra/test/clang-reorder-fields/MacroExpandsToMultipleFields.cpp
 create mode 100644 clang-tools-extra/test/clang-reorder-fields/MultipleFieldDeclsInStatement.cpp
 create mode 100644 clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveAroundDefinition.cpp
 create mode 100644 clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveAroundFields.cpp
 create mode 100644 clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveInDefinition.cpp

diff --git a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
index 3b1cd18d8034..ada9122b587a 100644
--- a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
+++ b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
@@ -19,6 +19,8 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Tooling/Refactoring.h"
 #include "llvm/ADT/STLExtras.h"
@@ -50,6 +52,85 @@ static const RecordDecl *findDefinition(StringRef RecordName,
   return selectFirst<RecordDecl>("recordDecl", Results);
 }
 
+static bool declaresMultipleFieldsInStatement(const RecordDecl *Decl) {
+  SourceLocation LastTypeLoc;
+  for (const auto &Field : Decl->fields()) {
+    SourceLocation TypeLoc =
+        Field->getTypeSourceInfo()->getTypeLoc().getBeginLoc();
+    if (LastTypeLoc.isValid() && TypeLoc == LastTypeLoc)
+      return true;
+    LastTypeLoc = TypeLoc;
+  }
+  return false;
+}
+
+static bool declaresMultipleFieldsInMacro(const RecordDecl *Decl,
+                                          const SourceManager &SrcMgr) {
+  SourceLocation LastMacroLoc;
+  for (const auto &Field : Decl->fields()) {
+    if (!Field->getLocation().isMacroID())
+      continue;
+    SourceLocation MacroLoc = SrcMgr.getExpansionLoc(Field->getLocation());
+    if (LastMacroLoc.isValid() && MacroLoc == LastMacroLoc)
+      return true;
+    LastMacroLoc = MacroLoc;
+  }
+  return false;
+}
+
+static bool containsPreprocessorDirectives(const RecordDecl *Decl,
+                                           const SourceManager &SrcMgr,
+                                           const LangOptions &LangOpts) {
+  std::pair<FileID, unsigned> FileAndOffset =
+      SrcMgr.getDecomposedLoc(Decl->field_begin()->getBeginLoc());
+  assert(!Decl->field_empty());
+  auto LastField = Decl->field_begin();
+  while (std::next(LastField) != Decl->field_end())
+    ++LastField;
+  unsigned EndOffset = SrcMgr.getFileOffset(LastField->getEndLoc());
+  StringRef SrcBuffer = SrcMgr.getBufferData(FileAndOffset.first);
+  Lexer L(SrcMgr.getLocForStartOfFile(FileAndOffset.first), LangOpts,
+          SrcBuffer.data(), SrcBuffer.data() + FileAndOffset.second,
+          SrcBuffer.data() + SrcBuffer.size());
+  IdentifierTable Identifiers(LangOpts);
+  clang::Token T;
+  while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < EndOffset) {
+    if (T.getKind() == tok::hash) {
+      L.LexFromRawLexer(T);
+      if (T.getKind() == tok::raw_identifier) {
+        clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier());
+        if (II.getPPKeywordID() != clang::tok::pp_not_keyword)
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+static bool isSafeToRewrite(const RecordDecl *Decl, const ASTContext &Context) {
+  // All following checks expect at least one field declaration.
+  if (Decl->field_empty())
+    return true;
+
+  // Don't attempt to rewrite if there is a declaration like 'int a, b;'.
+  if (declaresMultipleFieldsInStatement(Decl))
+    return false;
+
+  const SourceManager &SrcMgr = Context.getSourceManager();
+
+  // Don't attempt to rewrite if a single macro expansion creates multiple
+  // fields.
+  if (declaresMultipleFieldsInMacro(Decl, SrcMgr))
+    return false;
+
+  // Prevent rewriting if there are preprocessor directives present between the
+  // start of the first field and the end of last field.
+  if (containsPreprocessorDirectives(Decl, SrcMgr, Context.getLangOpts()))
+    return false;
+
+  return true;
+}
+
 /// Calculates the new order of fields.
 ///
 /// \returns empty vector if the list of fields doesn't match the definition.
@@ -345,6 +426,8 @@ public:
     const RecordDecl *RD = findDefinition(RecordName, Context);
     if (!RD)
       return;
+    if (!isSafeToRewrite(RD, Context))
+      return;
     SmallVector<unsigned, 4> NewFieldsOrder =
         getNewFieldsOrder(RD, DesiredFieldsOrder);
     if (NewFieldsOrder.empty())
diff --git a/clang-tools-extra/test/clang-reorder-fields/MacroExpandsToMultipleFields.cpp b/clang-tools-extra/test/clang-reorder-fields/MacroExpandsToMultipleFields.cpp
new file mode 100644
index 000000000000..5bafcd19ea82
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/MacroExpandsToMultipleFields.cpp
@@ -0,0 +1,13 @@
+// RUN: clang-reorder-fields -record-name ::bar::Foo -fields-order z,y,x %s -- | FileCheck %s
+
+namespace bar {
+
+#define FIELDS_DECL int x; int y; // CHECK: {{^#define FIELDS_DECL int x; int y;}}
+
+// The order of fields should not change.
+struct Foo {
+  FIELDS_DECL  // CHECK:      {{^ FIELDS_DECL}}
+  int z;       // CHECK-NEXT: {{^ int z;}}
+};
+
+} // end namespace bar
diff --git a/clang-tools-extra/test/clang-reorder-fields/MultipleFieldDeclsInStatement.cpp b/clang-tools-extra/test/clang-reorder-fields/MultipleFieldDeclsInStatement.cpp
new file mode 100644
index 000000000000..437e7b91e27a
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/MultipleFieldDeclsInStatement.cpp
@@ -0,0 +1,11 @@
+// RUN: clang-reorder-fields -record-name ::bar::Foo -fields-order z,y,x %s -- | FileCheck %s
+
+namespace bar {
+
+// The order of fields should not change.
+struct Foo {
+  int x, y; // CHECK: {{^  int x, y;}}
+  double z; // CHECK-NEXT: {{^  double z;}}
+};
+
+} // end namespace bar
diff --git a/clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveAroundDefinition.cpp b/clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveAroundDefinition.cpp
new file mode 100644
index 000000000000..f00b4b0b57bf
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveAroundDefinition.cpp
@@ -0,0 +1,15 @@
+// RUN: clang-reorder-fields -record-name ::bar::Foo -fields-order y,x %s -- | FileCheck %s
+
+namespace bar {
+
+#define DEFINE_FOO
+
+// This is okay to reorder.
+#ifdef DEFINE_FOO
+struct Foo {
+  int x;     // CHECK:      {{^ int y;}}
+  int y;     // CHECK-NEXT: {{^ int x;}}
+};
+#endif
+
+} // end namespace bar
diff --git a/clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveAroundFields.cpp b/clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveAroundFields.cpp
new file mode 100644
index 000000000000..c37546a05afd
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveAroundFields.cpp
@@ -0,0 +1,15 @@
+// RUN: clang-reorder-fields -record-name ::bar::Foo -fields-order y,x %s -- | FileCheck %s
+
+namespace bar {
+
+#define DEFINE_FIELDS
+
+// This is okay to reorder.
+struct Foo {
+#ifdef DEFINE_FIELDS // CHECK:      {{^#ifdef DEFINE_FIELDS}}
+  int x;             // CHECK-NEXT: {{^ int y;}}
+  int y;             // CHECK-NEXT: {{^ int x;}}
+#endif               // CHECK-NEXT: {{^#endif}}
+};
+
+} // end namespace bar
diff --git a/clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveInDefinition.cpp b/clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveInDefinition.cpp
new file mode 100644
index 000000000000..fee6b0e637b9
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/PreprocessorDirectiveInDefinition.cpp
@@ -0,0 +1,16 @@
+// RUN: clang-reorder-fields -record-name ::bar::Foo -fields-order z,y,x %s -- | FileCheck %s
+
+namespace bar {
+
+#define ADD_Z
+
+// The order of fields should not change.
+struct Foo {
+  int x;     // CHECK:      {{^ int x;}}
+  int y;     // CHECK-NEXT: {{^ int y;}}
+#ifdef ADD_Z // CHECK-NEXT: {{^#ifdef ADD_Z}}
+  int z;     // CHECK-NEXT: {{^ int z;}}
+#endif       // CHECK-NEXT: {{^#endif}}
+};
+
+} // end namespace bar

From 6023ba2bf78918c29838f7ab79d9ba9df532a96d Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 22 Jun 2025 19:09:31 -0700
Subject: [PATCH 1214/1322] [mlir] Migrate away from TypeRange(std::nullopt)
 (NFC) (#145246)

ArrayRef has a constructor that accepts std::nullopt.  This
constructor dates back to the days when we still had llvm::Optional.

Since the use of std::nullopt outside the context of std::optional is
kind of abuse and not intuitive to new comers, I would like to move
away from the constructor and eventually remove it.

One of the uses of std::nullopt is in a the constructors for
TypeRange.  This patch takes care of the migration where we need
TypeRange() to facilitate perfect forwarding.  Note that {} would be
ambiguous for perfecting forwarding to work.
---
 mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp       | 2 +-
 mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp | 2 +-
 mlir/test/lib/Dialect/Test/TestPatterns.cpp     | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
index 1ac95ebcdc87..84959d5ff8ab 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -245,7 +245,7 @@ void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
   Location terminatorLoc = terminator.getLoc();
   terminator.erase();
   builder.setInsertionPointToEnd(innermostForOp.getBody());
-  builder.create<gpu::TerminatorOp>(terminatorLoc, std::nullopt);
+  builder.create<gpu::TerminatorOp>(terminatorLoc, TypeRange());
   launchOp.getBody().front().getOperations().splice(
       launchOp.getBody().front().begin(),
       innermostForOp.getBody()->getOperations());
diff --git a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
index 93979e0f7332..2e4561e7cc3e 100644
--- a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
+++ b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
@@ -839,7 +839,7 @@ public:
                   ConversionPatternRewriter &rewriter) const override {
     if (callOp.getNumResults() == 0) {
       auto newOp = rewriter.replaceOpWithNewOp<LLVM::CallOp>(
-          callOp, std::nullopt, adaptor.getOperands(), callOp->getAttrs());
+          callOp, TypeRange(), adaptor.getOperands(), callOp->getAttrs());
       newOp.getProperties().operandSegmentSizes = {
           static_cast<int32_t>(adaptor.getOperands().size()), 0};
       newOp.getProperties().op_bundle_sizes = rewriter.getDenseI32ArrayAttr({});
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index d073843484d8..a4cb705e6c8e 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -1008,7 +1008,7 @@ struct TestPassthroughInvalidOp : public ConversionPattern {
                                   op->getOperand(it.index()).getType(), range)
               .getResult());
     }
-    rewriter.replaceOpWithNewOp<TestValidOp>(op, std::nullopt, flattened,
+    rewriter.replaceOpWithNewOp<TestValidOp>(op, TypeRange(), flattened,
                                              std::nullopt);
     return success();
   }
@@ -1024,7 +1024,7 @@ struct TestDropAndReplaceInvalidOp : public ConversionPattern {
   LogicalResult
   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const final {
-    rewriter.replaceOpWithNewOp<TestValidOp>(op, std::nullopt, ValueRange(),
+    rewriter.replaceOpWithNewOp<TestValidOp>(op, TypeRange(), ValueRange(),
                                              std::nullopt);
     return success();
   }

From 76ae9aa4d2fbf82d95224e4219980e0b35898fd8 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 22 Jun 2025 19:09:38 -0700
Subject: [PATCH 1215/1322] [CodeGen] Use range-based for loops (NFC) (#145251)

---
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h |  6 ++---
 llvm/lib/CodeGen/LiveInterval.cpp             |  4 +--
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 +++++++++----------
 3 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 06217337a46f..92da4ef7f055 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1700,9 +1700,9 @@ public:
 
   static int getSplatMaskIndex(ArrayRef<int> Mask) {
     assert(isSplatMask(Mask) && "Cannot get splat index for non-splat!");
-    for (unsigned i = 0, e = Mask.size(); i != e; ++i)
-      if (Mask[i] >= 0)
-        return Mask[i];
+    for (int Elem : Mask)
+      if (Elem >= 0)
+        return Elem;
 
     // We can choose any index value here and be correct because all elements
     // are undefined. Return 0 for better potential for callers to simplify.
diff --git a/llvm/lib/CodeGen/LiveInterval.cpp b/llvm/lib/CodeGen/LiveInterval.cpp
index 0238ce3bae3e..b682998c329b 100644
--- a/llvm/lib/CodeGen/LiveInterval.cpp
+++ b/llvm/lib/CodeGen/LiveInterval.cpp
@@ -1162,8 +1162,8 @@ void LiveRangeUpdater::print(raw_ostream &OS) const {
   for (const auto &S : make_range(LR->begin(), WriteI))
     OS << ' ' << S;
   OS << "\n  Spills:";
-  for (unsigned I = 0, E = Spills.size(); I != E; ++I)
-    OS << ' ' << Spills[I];
+  for (const LiveRange::Segment &Spill : Spills)
+    OS << ' ' << Spill;
   OS << "\n  Area 2:";
   for (const auto &S : make_range(ReadI, LR->end()))
     OS << ' ' << S;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4f91f90b0469..56a5643e1344 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19448,12 +19448,12 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
     std::swap(BasePtr, Offset);
 
   // Replace other uses of BasePtr that can be updated to use Ptr
-  for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
+  for (SDNode *OtherUse : OtherUses) {
     unsigned OffsetIdx = 1;
-    if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
+    if (OtherUse->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
       OffsetIdx = 0;
-    assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
-           BasePtr.getNode() && "Expected BasePtr operand");
+    assert(OtherUse->getOperand(!OffsetIdx).getNode() == BasePtr.getNode() &&
+           "Expected BasePtr operand");
 
     // We need to replace ptr0 in the following expression:
     //   x0 * offset0 + y0 * ptr0 = t0
@@ -19466,11 +19466,11 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
     // Therefore, we have:
     //   t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1
 
-    auto *CN = cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
+    auto *CN = cast<ConstantSDNode>(OtherUse->getOperand(OffsetIdx));
     const APInt &Offset0 = CN->getAPIntValue();
     const APInt &Offset1 = Offset->getAsAPIntVal();
-    int X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
-    int Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
+    int X0 = (OtherUse->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
+    int Y0 = (OtherUse->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
     int X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
     int Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;
 
@@ -19481,17 +19481,16 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
     if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
     else CNV = CNV - Offset1;
 
-    SDLoc DL(OtherUses[i]);
+    SDLoc DL(OtherUse);
 
     // We can now generate the new expression.
     SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
     SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);
 
-    SDValue NewUse = DAG.getNode(Opcode,
-                                 DL,
-                                 OtherUses[i]->getValueType(0), NewOp1, NewOp2);
-    DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
-    deleteAndRecombine(OtherUses[i]);
+    SDValue NewUse =
+        DAG.getNode(Opcode, DL, OtherUse->getValueType(0), NewOp1, NewOp2);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUse, 0), NewUse);
+    deleteAndRecombine(OtherUse);
   }
 
   // Replace the uses of Ptr with uses of the updated base value.

From 32f911f3e83b0a3d2242502940c487998672ffd4 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0@owo.li>
Date: Mon, 23 Jun 2025 10:51:17 +0800
Subject: [PATCH 1216/1322] [InstCombine] Fold `ceil(X / (2 ^ C)) == 0` -> `X
 == 0` (#143683)

Co-authored-by: Yingwei Zheng <dtcxzyw2333@gmail.com>
---
 llvm/include/llvm/Analysis/ValueTracking.h    |   5 +
 llvm/lib/Analysis/ValueTracking.cpp           |  26 ++
 .../InstCombine/InstCombineCompares.cpp       |   8 +
 .../test/Transforms/InstCombine/ceil-shift.ll | 308 ++++++++++++++++++
 4 files changed, 347 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/ceil-shift.ll

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index e215c90b5a72..4596b2563c1d 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -999,6 +999,11 @@ LLVM_ABI void
 findValuesAffectedByCondition(Value *Cond, bool IsAssume,
                               function_ref<void(Value *)> InsertAffected);
 
+/// Returns the inner value X if the expression has the form f(X)
+/// where f(X) == 0 if and only if X == 0, otherwise returns nullptr.
+LLVM_ABI Value *stripNullTest(Value *V);
+LLVM_ABI const Value *stripNullTest(const Value *V);
+
 } // end namespace llvm
 
 #endif // LLVM_ANALYSIS_VALUETRACKING_H
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index a17417cb5189..3df9af4bc95f 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3521,6 +3521,9 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts,
       isKnownNonNullFromDominatingCondition(V, Q.CxtI, Q.DT))
     return true;
 
+  if (const Value *Stripped = stripNullTest(V))
+    return isKnownNonZero(Stripped, DemandedElts, Q, Depth);
+
   return false;
 }
 
@@ -10170,3 +10173,26 @@ void llvm::findValuesAffectedByCondition(
     }
   }
 }
+
+const Value *llvm::stripNullTest(const Value *V) {
+  // (X >> C) or/add (X & mask(C) != 0)
+  if (const auto *BO = dyn_cast<BinaryOperator>(V)) {
+    if (BO->getOpcode() == Instruction::Add ||
+        BO->getOpcode() == Instruction::Or) {
+      const Value *X;
+      const APInt *C1, *C2;
+      if (match(BO, m_c_BinOp(m_LShr(m_Value(X), m_APInt(C1)),
+                              m_ZExt(m_SpecificICmp(
+                                  ICmpInst::ICMP_NE,
+                                  m_And(m_Deferred(X), m_LowBitMask(C2)),
+                                  m_Zero())))) &&
+          C2->popcount() == C1->getZExtValue())
+        return X;
+    }
+  }
+  return nullptr;
+}
+
+Value *llvm::stripNullTest(Value *V) {
+  return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V)));
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 084e7fbaa268..0894ca92086f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1298,6 +1298,14 @@ Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) {
     // eq/ne (mul X, Y)) with (icmp eq/ne X/Y) and if X/Y is known non-zero that
     // will fold to a constant elsewhere.
   }
+
+  // (icmp eq/ne f(X), 0) -> (icmp eq/ne X, 0)
+  // where f(X) == 0 if and only if X == 0
+  if (ICmpInst::isEquality(Pred))
+    if (Value *Stripped = stripNullTest(Cmp.getOperand(0)))
+      return new ICmpInst(Pred, Stripped,
+                          Constant::getNullValue(Stripped->getType()));
+
   return nullptr;
 }
 
diff --git a/llvm/test/Transforms/InstCombine/ceil-shift.ll b/llvm/test/Transforms/InstCombine/ceil-shift.ll
new file mode 100644
index 000000000000..d4b37786bb26
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/ceil-shift.ll
@@ -0,0 +1,308 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i1 @ceil_shift4(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift4(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %quot = lshr i32 %arg0, 4
+  %rem = and i32 %arg0, 15
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  %is_zero = icmp eq i32 %quot_or_rem, 0
+  ret i1 %is_zero
+}
+
+define i1 @ceil_shift4_add(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift4_add(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[TMP6]]
+;
+  %quot = lshr i32 %arg0, 4
+  %rem = and i32 %arg0, 15
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %ceil = add i32 %quot, %zext_has_rem
+  %res = icmp eq i32 %ceil, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift6(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift6(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %quot = lshr i32 %arg0, 6
+  %rem = and i32 %arg0, 63
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  %res = icmp eq i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift6_ne(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift6_ne(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = icmp ne i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+  %quot = lshr i32 %arg0, 6
+  %rem = and i32 %arg0, 63
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  %res = icmp ne i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift11(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift11(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %quot = lshr i32 %arg0, 11
+  %rem = and i32 %arg0, 2047
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  %res = icmp eq i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift11_ne(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift11_ne(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = icmp ne i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+  %quot = lshr i32 %arg0, 6
+  %rem = and i32 %arg0, 63
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  %res = icmp ne i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift0(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift0(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %quot = lshr i32 %arg0, 0
+  %rem = and i32 %arg0, 0
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  %res = icmp eq i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift4_comm(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift4_comm(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[TMP6]]
+;
+  %quot = lshr i32 %arg0, 4
+  %rem = and i32 %arg0, 15
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %zext_has_rem, %quot
+  %res = icmp eq i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+declare void @use(i32)
+
+define i1 @ceil_shift4_used_1(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift4_used_1(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[ARG0]], 4
+; CHECK-NEXT:    call void @use(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[TMP6]]
+;
+  %quot = lshr i32 %arg0, 4
+  call void @use(i32 %quot)
+  %rem = and i32 %arg0, 15
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  %res = icmp eq i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift4_used_5(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift4_used_5(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[ARG0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[ARG0]], 15
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    call void @use(i32 [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[TMP6]]
+;
+  %quot = lshr i32 %arg0, 4
+  %rem = and i32 %arg0, 15
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  call void @use(i32 %quot_or_rem)
+  %res = icmp eq i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift4_used_add_nuw_nsw(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift4_used_add_nuw_nsw(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[QUOT:%.*]] = lshr i32 [[ARG0]], 4
+; CHECK-NEXT:    [[REM:%.*]] = and i32 [[ARG0]], 15
+; CHECK-NEXT:    [[HAS_REM:%.*]] = icmp ne i32 [[REM]], 0
+; CHECK-NEXT:    [[ZEXT_HAS_REM:%.*]] = zext i1 [[HAS_REM]] to i32
+; CHECK-NEXT:    [[CEIL:%.*]] = add nuw nsw i32 [[QUOT]], [[ZEXT_HAS_REM]]
+; CHECK-NEXT:    call void @use(i32 [[CEIL]])
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i32 [[ARG0]], 0
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+  %quot = lshr i32 %arg0, 4
+  %rem = and i32 %arg0, 15
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %ceil = add nuw nsw i32 %quot, %zext_has_rem
+  call void @use(i32 %ceil)
+  %res = icmp eq i32 %ceil, 0
+  ret i1 %res
+}
+
+define <4 x i1> @ceil_shift4_v4i32(<4 x i32> %arg0) {
+; CHECK-LABEL: define <4 x i1> @ceil_shift4_v4i32(
+; CHECK-SAME: <4 x i32> [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i32> [[ARG0]], zeroinitializer
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %quot = lshr <4 x i32> %arg0, splat (i32 16)
+  %rem = and <4 x i32> %arg0, splat (i32 65535)
+  %has_rem = icmp ne <4 x i32> %rem, zeroinitializer
+  %zext_has_rem = zext <4 x i1> %has_rem to <4 x i32>
+  %quot_or_rem = or <4 x i32> %quot, %zext_has_rem
+  %res = icmp eq <4 x i32> %quot_or_rem, zeroinitializer
+  ret <4 x i1> %res
+}
+
+define <8 x i1> @ceil_shift4_v8i16(<8 x i16> %arg0) {
+; CHECK-LABEL: define <8 x i1> @ceil_shift4_v8i16(
+; CHECK-SAME: <8 x i16> [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i16> [[ARG0]], zeroinitializer
+; CHECK-NEXT:    ret <8 x i1> [[TMP1]]
+;
+  %quot = lshr <8 x i16> %arg0, splat (i16 4)
+  %rem = and <8 x i16> %arg0, splat (i16 15)
+  %has_rem = icmp ne <8 x i16> %rem, zeroinitializer
+  %zext_has_rem = zext <8 x i1> %has_rem to <8 x i16>
+  %quot_or_rem = or <8 x i16> %quot, %zext_has_rem
+  %res = icmp eq <8 x i16> %quot_or_rem, zeroinitializer
+  ret <8 x i1> %res
+}
+
+; negative tests
+
+define i1 @ceil_shift_not_mask_1(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift_not_mask_1(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[ARG0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[ARG0]], 31
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; CHECK-NEXT:    ret i1 [[TMP6]]
+;
+  %quot = lshr i32 %arg0, 4
+  %rem = and i32 %arg0, 31
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  %res = icmp eq i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift_not_mask_2(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift_not_mask_2(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[ARG0]], 5
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[ARG0]], 15
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; CHECK-NEXT:    ret i1 [[TMP6]]
+;
+  %quot = lshr i32 %arg0, 5
+  %rem = and i32 %arg0, 15
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_or_rem = or i32 %quot, %zext_has_rem
+  %res = icmp eq i32 %quot_or_rem, 0
+  ret i1 %res
+}
+
+define i1 @ceil_shift_not_add_or(i32 %arg0) {
+; CHECK-LABEL: define i1 @ceil_shift_not_add_or(
+; CHECK-SAME: i32 [[ARG0:%.*]]) {
+; CHECK-NEXT:    [[REM:%.*]] = and i32 [[ARG0]], 15
+; CHECK-NEXT:    [[HAS_REM_NOT:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[ARG0]], 32
+; CHECK-NEXT:    [[RES1:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[RES:%.*]] = or i1 [[HAS_REM_NOT]], [[RES1]]
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+  %quot = lshr i32 %arg0, 5
+  %rem = and i32 %arg0, 15
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %quot_and_rem = and i32 %quot, %zext_has_rem
+  %res = icmp eq i32 %quot_and_rem, 0
+  ret i1 %res
+}
+
+define i32 @ceil_shift_should_infer_ge_zero(i32 %x) {
+; CHECK-LABEL: define i32 @ceil_shift_should_infer_ge_zero(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[COND_NOT]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X]], 20
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[X]], 1048575
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    ret i32 [[TMP5]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    ret i32 0
+;
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %quot = lshr i32 %x, 20
+  %rem = and i32 %x, 1048575
+  %has_rem = icmp ne i32 %rem, 0
+  %zext_has_rem = zext i1 %has_rem to i32
+  %ceil = add nuw nsw i32 %quot, %zext_has_rem
+  %max = call i32 @llvm.umax.i32(i32 %ceil, i32 1)
+  ret i32 %max
+
+if.else:
+  ret i32 0
+}

From 89c61449e60703e42c5f274ed41a21f3bc386cf0 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Sun, 22 Jun 2025 20:26:06 -0700
Subject: [PATCH 1217/1322] [AMDGPU] Baseline gfx1250 speed model. (#145217)

---
 llvm/lib/Target/AMDGPU/GCNProcessors.td |  2 +-
 llvm/lib/Target/AMDGPU/SISchedule.td    | 33 +++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 0b331bd3f3fb..b5ffa64c3a4b 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -326,6 +326,6 @@ def : ProcessorModel<"gfx12-generic", GFX12SpeedModel,
   FeatureISAVersion12_Generic.Features
 >;
 
-def : ProcessorModel<"gfx1250", GFX12SpeedModel,
+def : ProcessorModel<"gfx1250", GFX1250SpeedModel,
   FeatureISAVersion12_50.Features
 >;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 2a374b360b04..1679cee32006 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -99,6 +99,7 @@ def SIDPGFX950FullSpeedModel : SISchedMachineModel;
 def GFX10SpeedModel : SISchedMachineModel;
 def GFX11SpeedModel : SISchedMachineModel;
 def GFX12SpeedModel : SISchedMachineModel;
+def GFX1250SpeedModel : SISchedMachineModel;
 
 // XXX: Are the resource counts correct?
 def HWBranch : ProcResource<1> {
@@ -455,3 +456,35 @@ def : HWWriteRes<WriteBarrier,           [HWBranch],       2000>;
 def : InstRW<[WriteCopy], (instrs COPY)>;
 
 }  // End SchedModel = GFX12SpeedModel
+
+multiclass GFX125xCommonWriteRes {
+
+def : HWWriteRes<Write32Bit,             [HWVALU, HWRC],   5>;
+def : HWWriteRes<WriteFloatCvt,          [HWVALU, HWRC],   5>;
+def : HWWriteRes<WriteTrans32,           [HWTransVALU, HWRC],   7>;
+def : HWWriteRes<WriteQuarterRate32,     [HWVALU, HWRC],   6>;
+def : HWWriteRes<WriteFloatFMA,          [HWVALU, HWRC],   5>;
+def : HWWriteRes<WritePseudoScalarTrans, [HWVALU, HWRC],   8>;
+
+def : HWWriteRes<WriteBranch,            [HWBranch],       32>;
+def : HWWriteRes<WriteExport,            [HWExport, HWRC], 16>;
+def : HWWriteRes<WriteLDS,               [HWLGKM,   HWRC], 20>;
+def : HWWriteRes<WriteSALU,              [HWSALU,   HWRC], 2>;
+def : HWWriteRes<WriteSFPU,              [HWSALU,   HWRC], 4>;
+def : HWWriteRes<WriteSMEM,              [HWLGKM,   HWRC], 20>;
+def : HWWriteRes<WriteVMEM,              [HWVMEM,   HWRC], 320>;
+def : HWWriteRes<WriteBarrier,           [HWBranch],       2000>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+} // End GFX125xCommonWriteRes
+
+let SchedModel = GFX1250SpeedModel in {
+defm : GFX125xCommonWriteRes;
+
+def : HWWriteRes<Write64Bit,             [HWVALU, HWRC],   7>;
+def : HWWriteRes<WriteIntMul,            [HWVALU, HWRC],   11>;
+def : HWWriteRes<WriteDouble,            [HWVALU, HWRC],   32>;
+def : HWWriteRes<WriteDoubleAdd,         [HWVALU, HWRC],   32>;
+def : HWWriteRes<WriteDoubleCvt,         [HWVALU, HWRC],   32>;
+def : HWWriteRes<WriteTrans64,           [HWVALU, HWTransVALU, HWRC], 38>;
+} // SchedModel = GFX1250SpeedModel

From c7165587e49605452f96249412f123b47b78bb81 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev@gmail.com>
Date: Mon, 23 Jun 2025 00:05:20 -0400
Subject: [PATCH 1218/1322] [mlir][affine|ValueBounds] Add transform to
 simplify affine min max ops with ValueBoundsOpInterface (#145068)

This commit makes the following changes:

- Expose `map` and `mapOperands` in
`ValueBoundsConstraintSet::Variable`, so that the class can be used by
subclasses of `ValueBoundsConstraintSet`. Otherwise subclasses cannot
access those members.

- Add `ValueBoundsConstraintSet::strongCompare`. This method is similar
to `ValueBoundsConstraintSet::compare` except that it returns false when
the inverse comparison holds, and `llvm::failure()` if neither the
relation nor its inverse relation could be proven.

- Add `simplifyAffineMinOp`, `simplifyAffineMaxOp`, and
`simplifyAffineMinMaxOps` to simplify those operations using
`ValueBoundsConstraintSet`.

- Adds the `SimplifyMinMaxAffineOpsOp` transform op that uses
`simplifyAffineMinMaxOps`.

- Add the `test.value_with_bounds` op to test unknown values with a min
max range using `ValueBoundsOpInterface`.

- Adds tests verifying the transform.

Example:

```mlir
func.func @overlapping_constraints() -> (index, index) {
  %0 = test.value_with_bounds {min = 0 : index, max = 192 : index}
  %1 = test.value_with_bounds {min = 128 : index, max = 384 : index}
  %2 = test.value_with_bounds {min = 256 : index, max = 512 : index}
  %r0 = affine.min affine_map<()[s0, s1, s2] -> (s0, s1, s2)>()[%0, %1, %2]
  %r1 = affine.max affine_map<()[s0, s1, s2] -> (s0, s1, s2)>()[%0, %1, %2]
  return %r0, %r1 : index, index
}
// Result of applying `simplifyAffineMinMaxOps` to `func.func`
#map1 = affine_map<()[s0, s1] -> (s1, s0)>
func.func @overlapping_constraints() -> (index, index) {
  %0 = test.value_with_bounds {max = 192 : index, min = 0 : index}
  %1 = test.value_with_bounds {max = 384 : index, min = 128 : index}
  %2 = test.value_with_bounds {max = 512 : index, min = 256 : index}
  %3 = affine.min #map1()[%0, %1]
  %4 = affine.max #map1()[%1, %2]
  return %3, %4 : index, index
}
```

---------

Co-authored-by: Nicolas Vasilache <Nico.Vasilache@amd.com>
---
 .../Affine/TransformOps/AffineTransformOps.td |  31 ++++
 .../Dialect/Affine/Transforms/Transforms.h    |  33 ++++
 .../mlir/Interfaces/ValueBoundsOpInterface.h  |  25 ++-
 .../TransformOps/AffineTransformOps.cpp       |  39 +++-
 .../Dialect/Affine/Transforms/CMakeLists.txt  |   1 +
 .../Transforms/SimplifyAffineMinMax.cpp       | 174 ++++++++++++++++++
 .../lib/Interfaces/ValueBoundsOpInterface.cpp |  67 ++++++-
 .../transform-op-simplify-min-max-ops.mlir    |  68 +++++++
 .../test/Dialect/Linalg/transform-op-pad.mlir |  35 ++++
 mlir/test/lib/Dialect/Test/TestOpDefs.cpp     |  10 +
 mlir/test/lib/Dialect/Test/TestOps.td         |  19 ++
 11 files changed, 493 insertions(+), 9 deletions(-)
 create mode 100644 mlir/lib/Dialect/Affine/Transforms/SimplifyAffineMinMax.cpp
 create mode 100644 mlir/test/Dialect/Affine/transform-op-simplify-min-max-ops.mlir

diff --git a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
index 70b127fd063c..2969b4238dd6 100644
--- a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
+++ b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
@@ -63,4 +63,35 @@ def SimplifyBoundedAffineOpsOp
   }];
 }
 
+def SimplifyMinMaxAffineOpsOp :
+  Op<Transform_Dialect, "affine.simplify_min_max_affine_ops", [
+    DeclareOpInterfaceMethods<TransformOpInterface>,
+    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  ]> {
+  let description = [{
+    Simplify the targeted `affine.min` / `affine.max` ops using the
+    `mlir::affine::simplifyAffineMinMaxOps` transform.
+
+    Example:
+    ```
+    %0 = transform.structured.match ops{["affine.max"]} in %arg1
+    transform.affine.simplify_min_max_affine_ops %0 : !transform.any_op
+    ```
+
+    #### Return modes
+
+    This transform consumes the target handle and does not produce any results.
+    This transforms definitely fails if any of the targeted operations is not an
+    `affine.min` or `affine.max` operation, or if the canonicalization patterns
+    failed to converge.
+    This transform silently fails if none of the operations were simplified.
+    Otherwise, it succeeds.
+  }];
+  let arguments = (ins TransformHandleTypeInterface:$target);
+  let results = (outs);
+  let assemblyFormat = [{
+      $target attr-dict `:` type($target)
+  }];
+}
+
 #endif // Affine_TRANSFORM_OPS
diff --git a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h
index 5c538d28c183..272054448374 100644
--- a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h
@@ -34,6 +34,8 @@ namespace affine {
 class AffineApplyOp;
 class AffineDelinearizeIndexOp;
 class AffineLinearizeIndexOp;
+class AffineMaxOp;
+class AffineMinOp;
 
 /// Lowers `affine.delinearize_index` into a sequence of division and remainder
 /// operations.
@@ -127,6 +129,37 @@ OpFoldResult materializeComputedBound(
     OpBuilder &b, Location loc, AffineMap boundMap,
     ArrayRef<std::pair<Value, std::optional<int64_t>>> mapOperands);
 
+/// This transform tries to simplify the affine min operation `op`, by finding a
+/// common lower bound for a set of expressions in the affine map results. It
+/// returns whether the transform updated `op`'s affine map.
+///
+/// In concrete terms, given an operation like:
+/// `affine.min affine_map<(d0)[s0, s1] -> (d0, s1, s0, 128)>(%i)[%s0, %s1]`
+/// If `d0 < 128` and `128 < s1 < s0`, the transform will update `op` to:
+/// `affine.min affine_map<(d0)[s0, s1] -> (d0, 128)>(%i)[%s0, %s1]`.
+bool simplifyAffineMinOp(RewriterBase &rewriter, AffineMinOp op);
+
+/// This transform tries to simplify the affine max operation `op`, by finding a
+/// common upper bound for a set of expressions in the affine map results. It
+/// returns whether the transform updated `op`'s affine map.
+///
+/// In concrete terms, given an operation like:
+/// `affine.max affine_map<(d0)[s0, s1] -> (d0, s1, s0, 128)>(%i)[%s0, %s1]`
+/// If `d0 > 128` and `s0 > s1 > 128`, the transform will update `op` to:
+/// `affine.max affine_map<(d0)[s0, s1] -> (d0, s0)>(%i)[%s0, %s1]`.
+bool simplifyAffineMaxOp(RewriterBase &rewriter, AffineMaxOp op);
+
+/// This transform applies `simplifyAffineMinOp` and `simplifyAffineMaxOp` to
+/// all the `affine.min` or `affine.max` operations in `ops`. After
+/// simplification, it invokes the `affine.min/max` canonicalization patterns on
+/// `ops`.
+///
+/// This transform returns failure if the greedy pattern rewriter failed to
+/// converge during canonicalization, otherwise it returns success. If provided,
+/// `modified` is set to `true` if the IR was modified in any way.
+LogicalResult simplifyAffineMinMaxOps(RewriterBase &rewriter,
+                                      ArrayRef<Operation *> ops,
+                                      bool *modified = nullptr);
 } // namespace affine
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
index 337314143c80..d168735f5059 100644
--- a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
+++ b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
@@ -135,10 +135,17 @@ public:
 
     /// Construct a variable for a map and its operands.
     Variable(AffineMap map, ArrayRef<Variable> mapOperands);
-    Variable(AffineMap map, ArrayRef<Value> mapOperands);
+    Variable(AffineMap map, ValueRange mapOperands);
 
     MLIRContext *getContext() const { return map.getContext(); }
 
+    /// Returns the affine map.
+    AffineMap getMap() const { return map; }
+
+    /// Returns the map operands.
+    ValueDimList &getOperands() { return mapOperands; }
+    const ValueDimList &getOperands() const { return mapOperands; }
+
   private:
     friend class ValueBoundsConstraintSet;
     AffineMap map;
@@ -254,6 +261,12 @@ public:
   /// prove the relation or until it ran out of IR.
   static bool compare(const Variable &lhs, ComparisonOperator cmp,
                       const Variable &rhs);
+  /// This function is similar to `ValueBoundsConstraintSet::compare`, except
+  /// that it returns false if `!(lhs cmp rhs)`, and `failure` if neither the
+  /// relation nor its inverse relation could be proven.
+  static llvm::FailureOr<bool> strongCompare(const Variable &lhs,
+                                             ComparisonOperator cmp,
+                                             const Variable &rhs);
 
   /// Compute whether the given variables are equal. Return "failure" if
   /// equality could not be determined.
@@ -327,6 +340,16 @@ protected:
   /// constraints.
   bool comparePos(int64_t lhsPos, ComparisonOperator cmp, int64_t rhsPos);
 
+  /// Return "true" if, based on the current state of the constraint system,
+  /// "lhs cmp rhs" was proven to hold. It returns "false" if "!(lhs cmp rhs)"
+  /// can be proven. Otherwise, it returns `failure` if neither the relation nor
+  /// its inverse relation could be proven.
+  ///
+  /// This function does not analyze any IR and does not populate any additional
+  /// constraints.
+  llvm::FailureOr<bool> strongComparePos(int64_t lhsPos, ComparisonOperator cmp,
+                                         int64_t rhsPos);
+
   /// Given an affine map with a single result (and map operands), add a new
   /// column to the constraint set that represents the result of the map.
   /// Traverse additional IR starting from the map operands as needed (as long
diff --git a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
index c9fe4474a68f..b1e40d9b289e 100644
--- a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
+++ b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/LoopUtils.h"
+#include "mlir/Dialect/Affine/Transforms/Transforms.h"
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -112,7 +113,7 @@ SimplifyBoundedAffineOpsOp::apply(transform::TransformRewriter &rewriter,
     }
     if (boundedOps.contains(target)) {
       auto diag = emitDefiniteFailure()
-                  << "target op result must not be constrainted";
+                  << "target op result must not be constrained";
       diag.attachNote(target->getLoc()) << "target/constrained op";
       return diag;
     }
@@ -148,6 +149,42 @@ void SimplifyBoundedAffineOpsOp::getEffects(
   modifiesPayload(effects);
 }
 
+//===----------------------------------------------------------------------===//
+// SimplifyMinMaxAffineOpsOp
+//===----------------------------------------------------------------------===//
+DiagnosedSilenceableFailure
+SimplifyMinMaxAffineOpsOp::apply(transform::TransformRewriter &rewriter,
+                                 TransformResults &results,
+                                 TransformState &state) {
+  SmallVector<Operation *> targets;
+  for (Operation *target : state.getPayloadOps(getTarget())) {
+    if (!isa<AffineMinOp, AffineMaxOp>(target)) {
+      auto diag = emitDefiniteFailure()
+                  << "target must be affine.min or affine.max";
+      diag.attachNote(target->getLoc()) << "target op";
+      return diag;
+    }
+    targets.push_back(target);
+  }
+  bool modified = false;
+  if (failed(mlir::affine::simplifyAffineMinMaxOps(rewriter, targets,
+                                                   &modified))) {
+    return emitDefiniteFailure()
+           << "affine.min/max simplification did not converge";
+  }
+  if (!modified) {
+    return emitSilenceableError()
+           << "the transform failed to simplify any of the target operations";
+  }
+  return DiagnosedSilenceableFailure::success();
+}
+
+void SimplifyMinMaxAffineOpsOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  consumesHandle(getTargetMutable(), effects);
+  modifiesPayload(effects);
+}
+
 //===----------------------------------------------------------------------===//
 // Transform op registration
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
index 1c82822b2bd7..c792200f4a49 100644
--- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt
@@ -17,6 +17,7 @@ add_mlir_dialect_library(MLIRAffineTransforms
   ReifyValueBounds.cpp
   SuperVectorize.cpp
   SimplifyAffineStructures.cpp
+  SimplifyAffineMinMax.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
diff --git a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineMinMax.cpp b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineMinMax.cpp
new file mode 100644
index 000000000000..c992badcfa49
--- /dev/null
+++ b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineMinMax.cpp
@@ -0,0 +1,174 @@
+//===- SimplifyAffineMinMax.cpp - Simplify affine min/max ops -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transform to simplify mix/max affine operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Transforms/Transforms.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/IntEqClasses.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "affine-min-max"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")
+
+using namespace mlir;
+using namespace mlir::affine;
+
+/// Simplifies an affine min/max operation by proving there's a lower or upper
+/// bound.
+template <typename AffineOp>
+static bool simplifyAffineMinMaxOp(RewriterBase &rewriter, AffineOp affineOp) {
+  using Variable = ValueBoundsConstraintSet::Variable;
+  using ComparisonOperator = ValueBoundsConstraintSet::ComparisonOperator;
+
+  AffineMap affineMap = affineOp.getMap();
+  ValueRange operands = affineOp.getOperands();
+  static constexpr bool isMin = std::is_same_v<AffineOp, AffineMinOp>;
+
+  LLVM_DEBUG({ DBGS() << "analyzing value: `" << affineOp << "`\n"; });
+
+  // Create a `Variable` list with values corresponding to each of the results
+  // in the affine affineMap.
+  SmallVector<Variable> variables = llvm::map_to_vector(
+      llvm::iota_range<unsigned>(0u, affineMap.getNumResults(), false),
+      [&](unsigned i) {
+        return Variable(affineMap.getSliceMap(i, 1), operands);
+      });
+
+  // Get the comparison operation.
+  ComparisonOperator cmpOp =
+      isMin ? ComparisonOperator::LT : ComparisonOperator::GT;
+
+  // Find disjoint sets bounded by a common value.
+  llvm::IntEqClasses boundedClasses(variables.size());
+  DenseMap<unsigned, Variable *> bounds;
+  for (auto &&[i, v] : llvm::enumerate(variables)) {
+    unsigned eqClass = boundedClasses.findLeader(i);
+
+    // If the class already has a bound continue.
+    if (bounds.contains(eqClass))
+      continue;
+
+    // Initialize the bound.
+    Variable *bound = &v;
+
+    LLVM_DEBUG({
+      DBGS() << "- inspecting variable: #" << i << ", with map: `" << v.getMap()
+             << "`\n";
+    });
+
+    // Check against the other variables.
+    for (size_t j = i + 1; j < variables.size(); ++j) {
+      unsigned jEqClass = boundedClasses.findLeader(j);
+      // Skip if the class is the same.
+      if (jEqClass == eqClass)
+        continue;
+
+      // Get the bound of the equivalence class or itself.
+      Variable *nv = bounds.lookup_or(jEqClass, &variables[j]);
+
+      LLVM_DEBUG({
+        DBGS() << "- comparing with variable: #" << jEqClass
+               << ", with map: " << nv->getMap() << "\n";
+      });
+
+      // Compare the variables.
+      FailureOr<bool> cmpResult =
+          ValueBoundsConstraintSet::strongCompare(*bound, cmpOp, *nv);
+
+      // The variables cannot be compared.
+      if (failed(cmpResult)) {
+        LLVM_DEBUG({
+          DBGS() << "-- classes: #" << i << ", #" << jEqClass
+                 << " cannot be merged\n";
+        });
+        continue;
+      }
+
+      // Join the equivalent classes and update the bound if necessary.
+      LLVM_DEBUG({
+        DBGS() << "-- merging classes: #" << i << ", #" << jEqClass
+               << ", is cmp(lhs, rhs): " << *cmpResult << "`\n";
+      });
+      if (*cmpResult) {
+        boundedClasses.join(eqClass, jEqClass);
+      } else {
+        // In this case we have lhs > rhs if isMin == true, or lhs < rhs if
+        // isMin == false.
+        bound = nv;
+        boundedClasses.join(eqClass, jEqClass);
+      }
+    }
+    bounds[boundedClasses.findLeader(i)] = bound;
+  }
+
+  // Return if there's no simplification.
+  if (bounds.size() >= affineMap.getNumResults()) {
+    LLVM_DEBUG(
+        { DBGS() << "- the affine operation couldn't get simplified\n"; });
+    return false;
+  }
+
+  // Construct the new affine affineMap.
+  SmallVector<AffineExpr> results;
+  results.reserve(bounds.size());
+  for (auto [k, bound] : bounds)
+    results.push_back(bound->getMap().getResult(0));
+
+  affineMap = AffineMap::get(affineMap.getNumDims(), affineMap.getNumSymbols(),
+                             results, rewriter.getContext());
+
+  // Update the affine op.
+  rewriter.modifyOpInPlace(affineOp, [&]() { affineOp.setMap(affineMap); });
+  LLVM_DEBUG({ DBGS() << "- simplified affine op: `" << affineOp << "`\n"; });
+  return true;
+}
+
+bool mlir::affine::simplifyAffineMinOp(RewriterBase &rewriter, AffineMinOp op) {
+  return simplifyAffineMinMaxOp(rewriter, op);
+}
+
+bool mlir::affine::simplifyAffineMaxOp(RewriterBase &rewriter, AffineMaxOp op) {
+  return simplifyAffineMinMaxOp(rewriter, op);
+}
+
+LogicalResult mlir::affine::simplifyAffineMinMaxOps(RewriterBase &rewriter,
+                                                    ArrayRef<Operation *> ops,
+                                                    bool *modified) {
+  bool changed = false;
+  for (Operation *op : ops) {
+    if (auto minOp = dyn_cast<AffineMinOp>(op))
+      changed = simplifyAffineMinOp(rewriter, minOp) || changed;
+    else if (auto maxOp = cast<AffineMaxOp>(op))
+      changed = simplifyAffineMaxOp(rewriter, maxOp) || changed;
+  }
+  RewritePatternSet patterns(rewriter.getContext());
+  AffineMaxOp::getCanonicalizationPatterns(patterns, rewriter.getContext());
+  AffineMinOp::getCanonicalizationPatterns(patterns, rewriter.getContext());
+  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
+  if (modified)
+    *modified = changed;
+  // Canonicalize to a fixpoint.
+  if (failed(applyOpPatternsGreedily(
+          ops, frozenPatterns,
+          GreedyRewriteConfig()
+              .setListener(
+                  static_cast<RewriterBase::Listener *>(rewriter.getListener()))
+              .setStrictness(GreedyRewriteStrictness::ExistingAndNewOps),
+          &changed))) {
+    return failure();
+  }
+  if (modified)
+    *modified = changed;
+  return success();
+}
diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
index 87f883c2e648..c9481fb5d940 100644
--- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
+++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
@@ -146,7 +146,7 @@ ValueBoundsConstraintSet::Variable::Variable(AffineMap map,
 }
 
 ValueBoundsConstraintSet::Variable::Variable(AffineMap map,
-                                             ArrayRef<Value> mapOperands)
+                                             ValueRange mapOperands)
     : Variable(map, llvm::map_to_vector(mapOperands,
                                         [](Value v) { return Variable(v); })) {}
 
@@ -736,6 +736,44 @@ bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos,
   return isEmpty;
 }
 
+FailureOr<bool> ValueBoundsConstraintSet::strongComparePos(
+    int64_t lhsPos, ComparisonOperator cmp, int64_t rhsPos) {
+  auto strongCmp = [&](ComparisonOperator cmp,
+                       ComparisonOperator negCmp) -> FailureOr<bool> {
+    if (comparePos(lhsPos, cmp, rhsPos))
+      return true;
+    if (comparePos(lhsPos, negCmp, rhsPos))
+      return false;
+    return failure();
+  };
+  switch (cmp) {
+  case ComparisonOperator::LT:
+    return strongCmp(ComparisonOperator::LT, ComparisonOperator::GE);
+  case ComparisonOperator::LE:
+    return strongCmp(ComparisonOperator::LE, ComparisonOperator::GT);
+  case ComparisonOperator::GT:
+    return strongCmp(ComparisonOperator::GT, ComparisonOperator::LE);
+  case ComparisonOperator::GE:
+    return strongCmp(ComparisonOperator::GE, ComparisonOperator::LT);
+  case ComparisonOperator::EQ: {
+    std::optional<bool> le =
+        strongComparePos(lhsPos, ComparisonOperator::LE, rhsPos);
+    if (!le)
+      return failure();
+    if (!*le)
+      return false;
+    std::optional<bool> ge =
+        strongComparePos(lhsPos, ComparisonOperator::GE, rhsPos);
+    if (!ge)
+      return failure();
+    if (!*ge)
+      return false;
+    return true;
+  }
+  }
+  llvm_unreachable("invalid comparison operator");
+}
+
 bool ValueBoundsConstraintSet::populateAndCompare(const Variable &lhs,
                                                   ComparisonOperator cmp,
                                                   const Variable &rhs) {
@@ -763,14 +801,29 @@ bool ValueBoundsConstraintSet::compare(const Variable &lhs,
   return cstr.comparePos(lhsPos, cmp, rhsPos);
 }
 
+FailureOr<bool> ValueBoundsConstraintSet::strongCompare(const Variable &lhs,
+                                                        ComparisonOperator cmp,
+                                                        const Variable &rhs) {
+  int64_t lhsPos = -1, rhsPos = -1;
+  auto stopCondition = [&](Value v, std::optional<int64_t> dim,
+                           ValueBoundsConstraintSet &cstr) {
+    // Keep processing as long as lhs/rhs were not processed.
+    if (size_t(lhsPos) >= cstr.positionToValueDim.size() ||
+        size_t(rhsPos) >= cstr.positionToValueDim.size())
+      return false;
+    // Keep processing as long as the strong relation cannot be proven.
+    FailureOr<bool> ordered = cstr.strongComparePos(lhsPos, cmp, rhsPos);
+    return failed(ordered) ? true : false;
+  };
+  ValueBoundsConstraintSet cstr(lhs.getContext(), stopCondition);
+  lhsPos = cstr.populateConstraints(lhs.map, lhs.mapOperands);
+  rhsPos = cstr.populateConstraints(rhs.map, rhs.mapOperands);
+  return cstr.strongComparePos(lhsPos, cmp, rhsPos);
+}
+
 FailureOr<bool> ValueBoundsConstraintSet::areEqual(const Variable &var1,
                                                    const Variable &var2) {
-  if (ValueBoundsConstraintSet::compare(var1, ComparisonOperator::EQ, var2))
-    return true;
-  if (ValueBoundsConstraintSet::compare(var1, ComparisonOperator::LT, var2) ||
-      ValueBoundsConstraintSet::compare(var1, ComparisonOperator::GT, var2))
-    return false;
-  return failure();
+  return strongCompare(var1, ComparisonOperator::EQ, var2);
 }
 
 FailureOr<bool>
diff --git a/mlir/test/Dialect/Affine/transform-op-simplify-min-max-ops.mlir b/mlir/test/Dialect/Affine/transform-op-simplify-min-max-ops.mlir
new file mode 100644
index 000000000000..948f434f3fa5
--- /dev/null
+++ b/mlir/test/Dialect/Affine/transform-op-simplify-min-max-ops.mlir
@@ -0,0 +1,68 @@
+// RUN: mlir-opt  %s  --transform-interpreter | FileCheck %s
+
+// CHECK-DAG: #[[MAP_0:.*]] = affine_map<()[s0] -> (32, s0)>
+// CHECK-DAG: #[[MAP_1:.*]] = affine_map<()[s0, s1] -> (s1, s0)>
+// CHECK-DAG: #[[MAP_2:.*]] = affine_map<()[s0] -> (256, s0)>
+
+// CHECK: @min_max_full_simplify
+func.func @min_max_full_simplify() -> (index, index) {
+  %0 = test.value_with_bounds {max = 128 : index, min = 0 : index}
+  %1 = test.value_with_bounds {max = 512 : index, min = 256 : index}
+  // CHECK: %[[V0:.*]] = test.value_with_bounds {max = 128 : index, min = 0 : index}
+  // CHECK: %[[V1:.*]] = test.value_with_bounds {max = 512 : index, min = 256 : index}
+  // CHECK-NOT: affine.min
+  // CHECK-NOT: affine.max
+  // CHECK: return %[[V0]], %[[V1]]
+  %r0 = affine.min affine_map<()[s0, s1] -> (s0, 192, s1)>()[%0, %1]
+  %r1 = affine.max affine_map<()[s0, s1] -> (s0, 192, s1)>()[%0, %1]
+  return %r0, %r1 : index, index
+}
+
+// CHECK: @min_only_simplify
+func.func @min_only_simplify() -> (index, index) {
+  // CHECK: %[[V0:.*]] = test.value_with_bounds {max = 512 : index, min = 0 : index}
+  // CHECK: %[[V1:.*]] = test.value_with_bounds {max = 512 : index, min = 256 : index}
+  // CHECK: affine.min #[[MAP_0]]()[%[[V0]]]
+  // CHECK: affine.max #[[MAP_1]]()[%[[V0]], %[[V1]]]
+  %0 = test.value_with_bounds {max = 512 : index, min = 0 : index}
+  %1 = test.value_with_bounds {max = 512 : index, min = 256 : index}
+  %r0 = affine.min affine_map<()[s0, s1] -> (s0, 32, s1)>()[%0, %1]
+  %r1 = affine.max affine_map<()[s0, s1] -> (s0, 32, s1)>()[%0, %1]
+  return %r0, %r1 : index, index
+}
+
+// CHECK: @max_only_simplify
+func.func @max_only_simplify() -> (index, index) {
+  // CHECK: %[[V0:.*]] = test.value_with_bounds {max = 128 : index, min = 0 : index}
+  // CHECK: %[[V1:.*]] = test.value_with_bounds {max = 512 : index, min = 0 : index}
+  // CHECK: affine.min #[[MAP_1]]()[%[[V0]], %[[V1]]]
+  // CHECK: affine.max #[[MAP_2]]()[%[[V1]]]
+  %0 = test.value_with_bounds {max = 128 : index, min = 0 : index}
+  %1 = test.value_with_bounds {max = 512 : index, min = 0 : index}
+  %r0 = affine.min affine_map<()[s0, s1] -> (s0, 256, s1)>()[%0, %1]
+  %r1 = affine.max affine_map<()[s0, s1] -> (s0, 256, s1)>()[%0, %1]
+  return %r0, %r1 : index, index
+}
+
+// CHECK: @overlapping_constraints
+func.func @overlapping_constraints() -> (index, index) {
+  %0 = test.value_with_bounds {max = 192 : index, min = 0 : index}
+  %1 = test.value_with_bounds {max = 384 : index, min = 128 : index}
+  %2 = test.value_with_bounds {max = 512 : index, min = 256 : index}
+  // CHECK: %[[V0:.*]] = test.value_with_bounds {max = 192 : index, min = 0 : index}
+  // CHECK: %[[V1:.*]] = test.value_with_bounds {max = 384 : index, min = 128 : index}
+  // CHECK: %[[V2:.*]] = test.value_with_bounds {max = 512 : index, min = 256 : index}
+  // CHECK: affine.min #[[MAP_1]]()[%[[V0]], %[[V1]]]
+  // CHECK: affine.max #[[MAP_1]]()[%[[V1]], %[[V2]]]
+  %r0 = affine.min affine_map<()[s0, s1, s2] -> (s0, s1, s2)>()[%0, %1, %2]
+  %r1 = affine.max affine_map<()[s0, s1, s2] -> (s0, s1, s2)>()[%0, %1, %2]
+  return %r0, %r1 : index, index
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["affine.min", "affine.max"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.affine.simplify_min_max_affine_ops %0 : !transform.any_op
+    transform.yield 
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
index bc684b53c9b6..f91eb9c30a51 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir
@@ -454,3 +454,38 @@ module attributes {transform.with_named_sequence} {
     transform.yield 
   }
 }
+
+// -----
+
+// This test checks that by using `simplify_min_max_affine_ops` after padding
+// and tiling, it's possible to recover static tiled slices.
+
+// CHECK-LABEL: @dyn_pad_tiling
+// CHECK: %[[LHS:.*]] = tensor.pad
+// CHECK: %[[RHS:.*]] = tensor.pad
+// CHECK: scf.for
+// CHECK-DAG: tensor.extract_slice %[[LHS]][0, %{{.*}}] [%{{.*}}, 32]
+// CHECK-DAG: tensor.extract_slice %[[RHS]][0, %{{.*}}] [%{{.*}}, 32]
+func.func @dyn_pad_tiling(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %padded, %pad, %copy = transform.structured.pad %0 pad_to_multiple_of [32] use_prescribed_tensor_shapes {padding_dimensions = [2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %tiled_linalg_op, %loops = transform.structured.tile_using_for %padded tile_sizes [0, 0, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.apply_registered_pass "resolve-shaped-type-result-dims" to %1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %2 {
+      transform.apply_patterns.canonicalization
+    } {apply_cse} : !transform.any_op
+    %3 = transform.structured.match ops{["affine.min", "affine.max"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.affine.simplify_min_max_affine_ops %3 : !transform.any_op
+    transform.apply_patterns to %2 {
+      transform.apply_patterns.canonicalization
+    } {apply_cse} : !transform.any_op
+    transform.yield 
+  }
+}
+
diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
index 78e44c6ec7a9..6c1a5d344153 100644
--- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
+++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
@@ -836,6 +836,16 @@ void ConversionFuncOp::print(OpAsmPrinter &p) {
       getArgAttrsAttrName(), getResAttrsAttrName());
 }
 
+//===----------------------------------------------------------------------===//
+// TestValueWithBoundsOp
+//===----------------------------------------------------------------------===//
+
+void TestValueWithBoundsOp::populateBoundsForIndexValue(
+    Value v, ValueBoundsConstraintSet &cstr) {
+  cstr.bound(v) >= getMin().getSExtValue();
+  cstr.bound(v) <= getMax().getSExtValue();
+}
+
 //===----------------------------------------------------------------------===//
 // ReifyBoundOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 30234698bc8d..8a4981a90831 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -31,6 +31,7 @@ include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/LoopLikeInterface.td"
 include "mlir/Interfaces/MemorySlotInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ValueBoundsOpInterface.td"
 include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td"
 
 // Include the attribute definitions.
@@ -2375,6 +2376,24 @@ def ForwardBufferOp : TEST_Op<"forward_buffer", [Pure]> {
 // Test ValueBoundsOpInterface
 //===----------------------------------------------------------------------===//
 
+def TestValueWithBoundsOp : TEST_Op<"value_with_bounds", [
+    DeclareOpInterfaceMethods<ValueBoundsOpInterface, ["populateBoundsForIndexValue"]>
+  ]> {
+  let description = [{
+    Creates a value with specified [min, max] range for value bounds analysis.
+
+    Example:
+
+    ```mlir
+    %0 = test.value_with_bounds { min = 4 : index, max = 5 : index}
+    ```
+  }];
+  let arguments = (ins IndexAttr:$min, IndexAttr:$max);
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+
 def ReifyBoundOp : TEST_Op<"reify_bound", [Pure]> {
   let description = [{
     Reify a bound for the given index-typed value or dimension size of a shaped

From ed155ff9f25c0f0c9fcdfaae42db8423dc24a208 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 13:09:09 +0900
Subject: [PATCH 1219/1322] AMDGPU: Avoid report_fatal_error on ds ordered
 intrinsics (#145202)

---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 21 +++++++++----
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 23 ++++++++++----
 .../llvm.amdgcn.ds.ordered.add-errors.ll      | 30 +++++++++++++++++++
 3 files changed, 62 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add-errors.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 672520390c8b..b632b16f5c19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1767,8 +1767,12 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
   bool WaveRelease = MI.getOperand(8).getImm() != 0;
   bool WaveDone = MI.getOperand(9).getImm() != 0;
 
-  if (WaveDone && !WaveRelease)
-    report_fatal_error("ds_ordered_count: wave_done requires wave_release");
+  if (WaveDone && !WaveRelease) {
+    // TODO: Move this to IR verifier
+    const Function &Fn = MF->getFunction();
+    Fn.getContext().diagnose(DiagnosticInfoUnsupported(
+        Fn, "ds_ordered_count: wave_done requires wave_release", DL));
+  }
 
   unsigned OrderedCountIndex = IndexOperand & 0x3f;
   IndexOperand &= ~0x3f;
@@ -1779,13 +1783,18 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
     IndexOperand &= ~(0xf << 24);
 
     if (CountDw < 1 || CountDw > 4) {
-      report_fatal_error(
-        "ds_ordered_count: dword count must be between 1 and 4");
+      const Function &Fn = MF->getFunction();
+      Fn.getContext().diagnose(DiagnosticInfoUnsupported(
+          Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
+      CountDw = 1;
     }
   }
 
-  if (IndexOperand)
-    report_fatal_error("ds_ordered_count: bad index operand");
+  if (IndexOperand) {
+    const Function &Fn = MF->getFunction();
+    Fn.getContext().diagnose(DiagnosticInfoUnsupported(
+        Fn, "ds_ordered_count: bad index operand", DL));
+  }
 
   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6f37e2dd3aa7..931a6363e74e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9330,16 +9330,27 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       IndexOperand &= ~(0xf << 24);
 
       if (CountDw < 1 || CountDw > 4) {
-        report_fatal_error(
-            "ds_ordered_count: dword count must be between 1 and 4");
+        const Function &Fn = DAG.getMachineFunction().getFunction();
+        DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+            Fn, "ds_ordered_count: dword count must be between 1 and 4",
+            DL.getDebugLoc()));
+        CountDw = 1;
       }
     }
 
-    if (IndexOperand)
-      report_fatal_error("ds_ordered_count: bad index operand");
+    if (IndexOperand) {
+      const Function &Fn = DAG.getMachineFunction().getFunction();
+      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+          Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
+    }
 
-    if (WaveDone && !WaveRelease)
-      report_fatal_error("ds_ordered_count: wave_done requires wave_release");
+    if (WaveDone && !WaveRelease) {
+      // TODO: Move this to IR verifier
+      const Function &Fn = DAG.getMachineFunction().getFunction();
+      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+          Fn, "ds_ordered_count: wave_done requires wave_release",
+          DL.getDebugLoc()));
+    }
 
     unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
     unsigned ShaderType =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add-errors.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add-errors.ll
new file mode 100644
index 000000000000..36b13e3b16cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add-errors.ll
@@ -0,0 +1,30 @@
+; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -filetype=null %s 2>&1 | FileCheck %s
+; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: error: <unknown>:0:0: in function ds_ordered_add_dword_count_too_low void (ptr addrspace(2), ptr addrspace(1)): ds_ordered_count: dword count must be between 1 and 4
+define amdgpu_kernel void @ds_ordered_add_dword_count_too_low(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 0, i1 true, i1 true)
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: in function ds_ordered_add_dword_count_too_high void (ptr addrspace(2), ptr addrspace(1)): ds_ordered_count: dword count must be between 1 and 4
+define amdgpu_kernel void @ds_ordered_add_dword_count_too_high(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 5, i1 true, i1 true)
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: in function ds_ordered_add_bad_index_operand void (ptr addrspace(2), ptr addrspace(1)): ds_ordered_count: bad index operand
+define amdgpu_kernel void @ds_ordered_add_bad_index_operand(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 1, i1 false, i32 -1, i1 true, i1 true)
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: in function ds_ordered_add_dword_count_wave_done_without_wave_release void (ptr addrspace(2), ptr addrspace(1)): ds_ordered_count: wave_done requires wave_release
+define amdgpu_kernel void @ds_ordered_add_dword_count_wave_done_without_wave_release(ptr addrspace(2) inreg %gds, ptr addrspace(1) %out) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(ptr addrspace(2) %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 false, i1 true)
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}

From da0c21bd4b62ee2a4a2709f49ea8b19538d1588a Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@nvidia.com>
Date: Sun, 22 Jun 2025 21:09:44 -0700
Subject: [PATCH 1220/1322] [mlir][gpu] Fix bug with GPU hardware intrinsic
 global location (#144923)

Bug description: Hardware intrinsic functions created during GPU
conversion to NVVM may contain debug info metadata from the original
function which cannot be used out of that function.
---
 mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h     | 7 ++++++-
 .../test/Conversion/GPUToNVVM/gpu-to-nvvm-debuginfo.mlir | 9 +++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
index 34150c4d1308..64cf09e600b8 100644
--- a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
@@ -164,7 +164,12 @@ public:
     auto parentFunc = op->getParentOfType<FunctionOpInterface>();
     assert(parentFunc && "expected there to be a parent function");
     OpBuilder b(parentFunc);
-    return b.create<LLVMFuncOp>(op->getLoc(), funcName, funcType);
+
+    // Create a valid global location removing any metadata attached to the
+    // location as debug info metadata inside of a function cannot be used
+    // outside of that function.
+    auto globalloc = op->getLoc()->findInstanceOfOrUnknown<FileLineColLoc>();
+    return b.create<LLVMFuncOp>(globalloc, funcName, funcType);
   }
 
   StringRef getFunctionName(Type type, SourceOp op) const {
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-debuginfo.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-debuginfo.mlir
index 08c5800fe93b..5304abfb09a1 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-debuginfo.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-debuginfo.mlir
@@ -23,3 +23,12 @@ gpu.module @test_module_1 {
     gpu.return
   }
 }
+
+// Check that debug info metadata from the function is removed from the global location.
+gpu.module @test_module_2 {
+  // CHECK-DAG: llvm.func @__nv_abs(i32) -> i32 loc([[LOC]])
+  func.func @gpu_abs_with_loc(%arg_i32 : i32) -> (i32) {
+    %result32 = math.absi %arg_i32 : i32 loc(fused<#di_subprogram>[#loc])
+    func.return %result32 : i32
+  }
+}

From 6a0593b0a3831a14fd0e01ffca992f6ee6d86c64 Mon Sep 17 00:00:00 2001
From: Aaditya <115080342+easyonaadit@users.noreply.github.com>
Date: Mon, 23 Jun 2025 10:31:22 +0530
Subject: [PATCH 1221/1322] [AMDGPU] Extend wave reduce intrinsics for i32 type
 (#126469)

Currently, wave reduction intrinsics are supported for `umin` and `umax`
operations for `i32` type only.
This patch extends support for the following operations:
`add`, `sub`, `min`, `max`, `and`, `or`, `xor` for `i32` type.

---------

Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   10 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |    9 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  122 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   29 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll  | 1233 ++++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll  |  982 +++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll  |  982 +++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll  |  982 +++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll   |  982 +++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll  | 1282 ++++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll |   52 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll |   53 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll  | 1286 +++++++++++++++++
 .../AMDGPU/llvm.amdgcn.wave.reduce.umax.mir   |   89 --
 .../AMDGPU/llvm.amdgcn.wave.reduce.umin.mir   |   89 --
 15 files changed, 7927 insertions(+), 255 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir
 delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 51dfe53aa00e..e6f0bf627608 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2347,8 +2347,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
 
-def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+multiclass AMDGPUWaveReduceOps {
+  foreach Op =
+      ["umin", "min", "umax", "max", "add", "sub", "and", "or", "xor"] in {
+    def Op : AMDGPUWaveReduce;
+  }
+}
+
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps;
 
 def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dca55dafcc5e..a7b08794fdf1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5006,8 +5006,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
       break;
     }
+    case Intrinsic::amdgcn_wave_reduce_add:
+    case Intrinsic::amdgcn_wave_reduce_sub:
+    case Intrinsic::amdgcn_wave_reduce_min:
     case Intrinsic::amdgcn_wave_reduce_umin:
-    case Intrinsic::amdgcn_wave_reduce_umax: {
+    case Intrinsic::amdgcn_wave_reduce_max:
+    case Intrinsic::amdgcn_wave_reduce_umax:
+    case Intrinsic::amdgcn_wave_reduce_and:
+    case Intrinsic::amdgcn_wave_reduce_or:
+    case Intrinsic::amdgcn_wave_reduce_xor: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 931a6363e74e..83155ee790df 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5040,6 +5040,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
+static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_MIN_U32:
+    return std::numeric_limits<uint32_t>::max();
+  case AMDGPU::S_MIN_I32:
+    return std::numeric_limits<int32_t>::max();
+  case AMDGPU::S_MAX_U32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_MAX_I32:
+    return std::numeric_limits<int32_t>::min();
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_XOR_B32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_AND_B32:
+    return std::numeric_limits<uint32_t>::max();
+  default:
+    llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
+  }
+}
+
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                           MachineBasicBlock &BB,
                                           const GCNSubtarget &ST,
@@ -5055,13 +5077,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    // These operations with a uniform value i.e. SGPR are idempotent.
-    // Reduced value will be same as given sgpr.
-    // clang-format off
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
-        .addReg(SrcReg);
-    // clang-format on
-    RetBB = &BB;
+    switch (Opc) {
+    case AMDGPU::S_MIN_U32:
+    case AMDGPU::S_MIN_I32:
+    case AMDGPU::S_MAX_U32:
+    case AMDGPU::S_MAX_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32: {
+      // Idempotent operations.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      RetBB = &BB;
+      break;
+    }
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_SUB_I32: {
+      const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+      const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+
+      bool IsWave32 = ST.isWave32();
+      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg =
+          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+                                .addReg(Exec->getOperand(0).getReg());
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // Performing an XOR operation on a uniform value
+        // depends on the parity of the number of active lanes.
+        // For even parity, the result will be 0, for odd
+        // parity the result will be the same as the input value.
+        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+        auto ParityReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+                .addReg(NewAccumulator->getOperand(0).getReg())
+                .addImm(1);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityReg->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_SUB_I32: {
+        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+        // Take the negation of the source operand.
+        auto InvertedValReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+                .addImm(-1)
+                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_ADD_I32: {
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+      RetBB = &BB;
+    }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -5097,10 +5184,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
-    // Create initail values of induction variable from Exec, Accumulator and
-    // insert branch instr to newly created ComputeBlockk
-    uint32_t InitalValue =
-        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+    // Create initial values of induction variable from Exec, Accumulator and
+    // insert branch instr to newly created ComputeBlock
+    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
     auto TmpSReg =
         BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5172,8 +5258,22 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 56b15c11a669..f85df5598534 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -303,16 +303,29 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
 def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
     (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
 
-let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
-  def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
+// clang-format off
+defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
+multiclass
+    AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
+  let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+    def !toupper(Op) #"_PSEUDO_" #DataType
+        : VPseudoInstSI<(outs SGPR_32 : $sdst),
+                        (ins VSrc_b32 : $src, VSrc_b32 : $strategy),
+                        [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
   }
+}
+// clang-format on
 
-  def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
-  }
+// Input list : [Operation_name,
+//              type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
+defvar Operations = [
+  ["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
+  ["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
+  ["xor", "B32"]
+];
+
+foreach Op = Operations in {
+  defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
 }
 
 let usesCustomInserter = 1, Defs = [VCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
new file mode 100644
index 000000000000..d2ca1d813604
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -0,0 +1,1233 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck  -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck  -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_clause 0x1
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_clause 0x1
+; GFX1032GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 123, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 poison, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT:    s_add_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT:    s_add_i32 s4, s4, s6
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT:    s_add_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT:    s_add_i32 s4, s4, s6
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT:    s_add_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT:    s_add_i32 s4, s4, s6
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT:    s_add_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT:    s_add_i32 s2, s2, s5
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT:    s_add_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT:    s_add_i32 s4, s4, s6
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT:    s_add_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT:    s_add_i32 s2, s2, s5
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT:    s_add_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT:  ; %bb.5:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s6, s2
+; GFX8GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT:    s_add_i32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT:    s_add_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT:  ; %bb.5:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s6, s2
+; GFX9GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT:    s_add_i32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT:    s_add_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT:  ; %bb.5:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s2
+; GFX1064GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT:    s_add_i32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT:    s_add_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT:  ; %bb.5:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT:    s_add_i32 s0, s0, s6
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT:    s_add_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT:  ; %bb.5:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, s2
+; GFX1164GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT:    s_add_i32 s6, s6, s8
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_add_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT:  ; %bb.5:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_add_i32 s0, s0, s6
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %tid, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i32 %combine, ptr addrspace(1) %out
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
new file mode 100644
index 000000000000..356b0e73b39e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
@@ -0,0 +1,982 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck  -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck  -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 123, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 poison, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT:    s_and_b32 s4, s4, s6
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT:    s_and_b32 s4, s4, s6
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT:    s_and_b32 s4, s4, s6
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT:    s_and_b32 s4, s4, s6
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT:    s_and_b32 s4, s4, s6
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT:    s_and_b32 s4, s4, s6
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, -1
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, s5
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, s5
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT:    s_and_b32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT:    s_and_b32 s4, s4, s6
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, -1
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, s5
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, s5
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %id.x, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s6, -1
+; GFX8DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT:  ; %bb.5:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX8GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX8GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT:  ; %bb.5:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX9GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, -1
+; GFX1064DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT:  ; %bb.5:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX1064GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX1064GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, -1
+; GFX1032DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT:    s_and_b32 s1, s1, s6
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT:  ; %bb.5:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1032GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT:    s_and_b32 s0, s0, s6
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, -1
+; GFX1164DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT:  ; %bb.5:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX1164GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX1164GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, -1
+; GFX1132DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_and_b32 s1, s1, s6
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT:  ; %bb.5:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1132GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_and_b32 s0, s0, s6
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %tid, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i32 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
new file mode 100644
index 000000000000..7dc0cb05b0ab
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
@@ -0,0 +1,982 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck  -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck  -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 123, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 poison, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_brev_b32 s4, 1
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT:    s_max_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT:    s_max_i32 s4, s4, s6
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_brev_b32 s4, 1
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT:    s_max_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT:    s_max_i32 s4, s4, s6
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_brev_b32 s4, 1
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT:    s_max_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT:    s_max_i32 s4, s4, s6
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_brev_b32 s2, 1
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT:    s_max_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_brev_b32 s2, 1
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT:    s_max_i32 s2, s2, s5
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_brev_b32 s4, 1
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT:    s_max_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT:    s_max_i32 s4, s4, s6
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_brev_b32 s2, 1
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT:    s_max_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_brev_b32 s2, 1
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT:    s_max_i32 s2, s2, s5
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %id.x, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_brev_b32 s6, 1
+; GFX8DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT:    s_max_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT:  ; %bb.5:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX8GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_brev_b32 s6, 1
+; GFX8GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT:    s_max_i32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_brev_b32 s6, 1
+; GFX9DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT:    s_max_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT:  ; %bb.5:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX9GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_brev_b32 s6, 1
+; GFX9GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT:    s_max_i32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_brev_b32 s6, 1
+; GFX1064DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT:    s_max_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT:  ; %bb.5:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX1064GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_brev_b32 s6, 1
+; GFX1064GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT:    s_max_i32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    s_brev_b32 s1, 1
+; GFX1032DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT:    s_max_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT:  ; %bb.5:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    s_brev_b32 s0, 1
+; GFX1032GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT:    s_max_i32 s0, s0, s6
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_brev_b32 s6, 1
+; GFX1164DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT:    s_max_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT:  ; %bb.5:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX1164GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_brev_b32 s6, 1
+; GFX1164GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT:    s_max_i32 s6, s6, s8
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_brev_b32 s1, 1
+; GFX1132DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_max_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT:  ; %bb.5:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_brev_b32 s0, 1
+; GFX1132GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_max_i32 s0, s0, s6
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %tid, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i32 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
new file mode 100644
index 000000000000..7cb0e6533c72
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
@@ -0,0 +1,982 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck  -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck  -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 123, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 poison, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_brev_b32 s4, -2
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT:    s_min_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_brev_b32 s4, -2
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT:    s_min_i32 s4, s4, s6
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_brev_b32 s4, -2
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT:    s_min_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_brev_b32 s4, -2
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT:    s_min_i32 s4, s4, s6
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_brev_b32 s4, -2
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT:    s_min_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_brev_b32 s4, -2
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT:    s_min_i32 s4, s4, s6
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_brev_b32 s2, -2
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT:    s_min_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_brev_b32 s2, -2
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT:    s_min_i32 s2, s2, s5
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_brev_b32 s4, -2
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT:    s_min_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_brev_b32 s4, -2
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT:    s_min_i32 s4, s4, s6
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_brev_b32 s2, -2
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT:    s_min_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_brev_b32 s2, -2
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT:    s_min_i32 s2, s2, s5
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %id.x, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_brev_b32 s6, -2
+; GFX8DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT:    s_min_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT:  ; %bb.5:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX8GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_brev_b32 s6, -2
+; GFX8GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT:    s_min_i32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_brev_b32 s6, -2
+; GFX9DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT:    s_min_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT:  ; %bb.5:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX9GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_brev_b32 s6, -2
+; GFX9GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT:    s_min_i32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_brev_b32 s6, -2
+; GFX1064DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT:    s_min_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT:  ; %bb.5:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX1064GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_brev_b32 s6, -2
+; GFX1064GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT:    s_min_i32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    s_brev_b32 s1, -2
+; GFX1032DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT:    s_min_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT:  ; %bb.5:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    s_brev_b32 s0, -2
+; GFX1032GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT:    s_min_i32 s0, s0, s6
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_brev_b32 s6, -2
+; GFX1164DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT:    s_min_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT:  ; %bb.5:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX1164GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_brev_b32 s6, -2
+; GFX1164GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT:    s_min_i32 s6, s6, s8
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_brev_b32 s1, -2
+; GFX1132DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_min_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT:  ; %bb.5:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_brev_b32 s0, -2
+; GFX1132GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_min_i32 s0, s0, s6
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %tid, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i32 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
new file mode 100644
index 000000000000..e08787e6ba70
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
@@ -0,0 +1,982 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck  -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck  -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 123, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL:       ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT:    s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL:       ; %bb.0: ; %entry
+; GFX11GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 poison, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT:    s_or_b32 s4, s4, s6
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT:    s_or_b32 s4, s4, s6
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT:    s_or_b32 s4, s4, s6
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT:    s_or_b32 s4, s4, s6
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT:    s_or_b32 s4, s4, s6
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT:    s_or_b32 s4, s4, s6
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT:    s_or_b32 s2, s2, s5
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT:    s_or_b32 s2, s2, s5
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT:    s_or_b32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT:    s_or_b32 s4, s4, s6
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT:    s_or_b32 s2, s2, s5
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT:    s_or_b32 s2, s2, s5
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %id.x, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT:  ; %bb.5:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX8GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT:  ; %bb.5:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX9GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT:  ; %bb.5:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX1064GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT:    s_or_b32 s1, s1, s6
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT:  ; %bb.5:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT:    s_or_b32 s0, s0, s6
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT:  ; %bb.5:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
+; GFX1164GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_or_b32 s1, s1, s6
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT:  ; %bb.5:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_or_b32 s0, s0, s6
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %tid, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i32 %combine, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
new file mode 100644
index 000000000000..edb888a21f73
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -0,0 +1,1282 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck  -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck  -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_clause 0x1
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_clause 0x1
+; GFX1032GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mul_i32 s4, -1, 0x7b
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, -1, 0x7b
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 123, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s4, s0, -1
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s3, s0, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 poison, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT:    s_sub_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT:    s_sub_i32 s4, s4, s6
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT:    s_sub_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT:    s_sub_i32 s4, s4, s6
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT:    s_sub_i32 s4, s4, s6
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT:    s_sub_i32 s2, s2, s5
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT:    s_sub_i32 s4, s4, s6
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT:    s_sub_i32 s2, s2, s5
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT:  ; %bb.5:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s3, s2
+; GFX8GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT:    s_sub_i32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT:  ; %bb.5:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s3, s2
+; GFX9GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT:    s_sub_i32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT:  ; %bb.5:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s3, s2
+; GFX1064GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT:    s_sub_i32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, -1
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT:    s_sub_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT:  ; %bb.5:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, -1
+; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT:    s_sub_i32 s0, s0, s6
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT:    s_sub_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT:  ; %bb.5:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s3, s6, -1
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s3, s2
+; GFX1164GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT:    s_sub_i32 s6, s6, s8
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, -1
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_sub_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT:  ; %bb.5:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, -1
+; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_sub_i32 s0, s0, s6
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %tid, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i32 %combine, ptr addrspace(1) %out
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index deeceed3a19b..ba7a816184cd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -1,19 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX8GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX9GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck  -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck  -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
 
-declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
-declare i32 @llvm.amdgcn.workitem.id.x()
 
 define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: uniform_value:
@@ -122,9 +120,9 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
-    %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
-    store i32 %result, ptr addrspace(1) %out
-    ret void
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
 }
 
 define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
@@ -218,9 +216,9 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
-    %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
-    store i32 %result, ptr addrspace(1) %out
-    ret void
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
 }
 
 define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
@@ -256,9 +254,9 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX11GISEL:       ; %bb.0: ; %entry
 ; GFX11GISEL-NEXT:    s_endpgm
 entry:
-    %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 1)
-    store i32 %result, ptr addrspace(1) %out
-    ret void
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
 }
 
 define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
@@ -498,10 +496,10 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
-    %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-    %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 1)
-    store i32 %result, ptr addrspace(1) %out
-    ret void
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
 }
 
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 434e761a5f8a..3eaa89c95747 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -1,21 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX8GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX9GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck  -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck  -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
 
 
-declare i32 @llvm.amdgcn.wave.reduce.umin.i32(i32, i32 immarg)
-declare i32 @llvm.amdgcn.workitem.id.x()
-
 define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-LABEL: uniform_value:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -123,9 +120,9 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
-    %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %in, i32 1)
-    store i32 %result, ptr addrspace(1) %out
-    ret void
+  %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %in, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
 }
 
 define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
@@ -219,9 +216,9 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
-    %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 123, i32 1)
-    store i32 %result, ptr addrspace(1) %out
-    ret void
+  %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 123, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
 }
 
 define amdgpu_kernel void @poison_value(ptr addrspace(1) %out) {
@@ -257,9 +254,9 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out) {
 ; GFX11GISEL:       ; %bb.0: ; %entry
 ; GFX11GISEL-NEXT:    s_endpgm
 entry:
-    %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 poison, i32 1)
-    store i32 %result, ptr addrspace(1) %out
-    ret void
+  %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 poison, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
 }
 
 define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
@@ -499,10 +496,10 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
-    %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-    %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %id.x, i32 1)
-    store i32 %result, ptr addrspace(1) %out
-    ret void
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %id.x, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
 }
 
 define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
new file mode 100644
index 000000000000..5b21d5c3aaeb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
@@ -0,0 +1,1286 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck  -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck  -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_clause 0x1
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_clause 0x1
+; GFX1032GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_clause 0x1
+; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_clause 0x1
+; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT:    s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 123, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 poison, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT:    s_xor_b32 s4, s4, s6
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT:    s_xor_b32 s4, s4, s6
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT:    s_xor_b32 s4, s4, s6
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT:    s_xor_b32 s4, s4, s6
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT:    s_xor_b32 s4, s4, s6
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT:    s_xor_b32 s4, s4, s6
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s2, s2, s5
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT:    s_xor_b32 s2, s2, s5
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT:    s_xor_b32 s4, s4, s6
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT:    s_xor_b32 s4, s4, s6
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s2, s2, s5
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT:    s_xor_b32 s2, s2, s5
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %id.x, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT:  ; %bb.5:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT:  ; %bb.1: ; %else
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_mul_i32 s6, s6, s2
+; GFX8GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT:  ; %bb.3: ; %if
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT:  ; %bb.5:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT:  ; %bb.1: ; %else
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_mul_i32 s6, s6, s2
+; GFX9GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT:  ; %bb.3: ; %if
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT:  ; %bb.5:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s2
+; GFX1064GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s1, s1, s6
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT:  ; %bb.5:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT:    s_xor_b32 s0, s0, s6
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT:  ; %bb.5:
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, s2
+; GFX1164GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT:    s_xor_b32 s1, s1, s6
+; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT:  ; %bb.5:
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT:  .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
+; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT:  .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
+; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT:    s_xor_b32 s0, s0, s6
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT:  .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %tid, i32 1)
+  br label %endif
+
+else:
+  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1)
+  br label %endif
+
+endif:
+  %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+  store i32 %combine, ptr addrspace(1) %out
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir
deleted file mode 100644
index 0733d34f5e36..000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir
+++ /dev/null
@@ -1,89 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-
----
-name:            uniform_value
-tracksRegLiveness: true
-machineFunctionInfo:
-  isEntryFunction: true
-body:             |
-  bb.0.entry:
-    liveins: $sgpr0_sgpr1
-
-    ; GCN-LABEL: name: uniform_value
-    ; GCN: liveins: $sgpr0_sgpr1
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
-    ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
-    ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
-    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
-    ; GCN-NEXT: S_ENDPGM 0
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
-    %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
-    %7:sgpr_32 = WAVE_REDUCE_UMAX_PSEUDO_U32 killed %6, 1, implicit $exec
-    %8:vgpr_32 = COPY %7
-    GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
-    S_ENDPGM 0
-
-...
-
----
-name:            divergent_value
-machineFunctionInfo:
-  isEntryFunction: true
-tracksRegLiveness: true
-body:             |
-  ; GCN-LABEL: name: divergent_value
-  ; GCN: bb.0.entry:
-  ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
-  ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
-  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; GCN-NEXT:   S_BRANCH %bb.2
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.2:
-  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
-  ; GCN-NEXT:   [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
-  ; GCN-NEXT:   [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
-  ; GCN-NEXT:   [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
-  ; GCN-NEXT:   [[S_MAX_U32_:%[0-9]+]]:sgpr_32 = S_MAX_U32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
-  ; GCN-NEXT:   [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
-  ; GCN-NEXT:   S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
-  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.3:
-  ; GCN-NEXT:   successors: %bb.1(0x80000000)
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MAX_U32_]]
-  ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.1:
-  ; GCN-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
-  ; GCN-NEXT:   S_ENDPGM 0
-  bb.0.entry:
-    liveins: $vgpr0, $sgpr0_sgpr1
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %0:vgpr_32 = COPY $vgpr0
-    %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
-    %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %6:sgpr_32 = WAVE_REDUCE_UMAX_PSEUDO_U32 %0, 1, implicit $exec
-    %7:vgpr_32 = COPY %6
-    GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
-  bb.1:
-    %8:vgpr_32 = PHI %0, %bb.0
-    S_ENDPGM 0
-
-...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir
deleted file mode 100644
index 486c08335b17..000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir
+++ /dev/null
@@ -1,89 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-
----
-name:            uniform_value
-tracksRegLiveness: true
-machineFunctionInfo:
-  isEntryFunction: true
-body:             |
-  bb.0.entry:
-    liveins: $sgpr0_sgpr1
-
-    ; GCN-LABEL: name: uniform_value
-    ; GCN: liveins: $sgpr0_sgpr1
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
-    ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
-    ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
-    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
-    ; GCN-NEXT: S_ENDPGM 0
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
-    %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
-    %7:sgpr_32 = WAVE_REDUCE_UMIN_PSEUDO_U32 killed %6, 1, implicit $exec
-    %8:vgpr_32 = COPY %7
-    GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
-    S_ENDPGM 0
-
-...
-
----
-name:            divergent_value
-machineFunctionInfo:
-  isEntryFunction: true
-tracksRegLiveness: true
-body:             |
-  ; GCN-LABEL: name: divergent_value
-  ; GCN: bb.0.entry:
-  ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
-  ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
-  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 4294967295
-  ; GCN-NEXT:   S_BRANCH %bb.2
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.2:
-  ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
-  ; GCN-NEXT:   [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
-  ; GCN-NEXT:   [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
-  ; GCN-NEXT:   [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
-  ; GCN-NEXT:   [[S_MIN_U32_:%[0-9]+]]:sgpr_32 = S_MIN_U32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
-  ; GCN-NEXT:   [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
-  ; GCN-NEXT:   S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
-  ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.3:
-  ; GCN-NEXT:   successors: %bb.1(0x80000000)
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MIN_U32_]]
-  ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
-  ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT: bb.1:
-  ; GCN-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
-  ; GCN-NEXT:   S_ENDPGM 0
-  bb.0.entry:
-    liveins: $vgpr0, $sgpr0_sgpr1
-    %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
-    %0:vgpr_32 = COPY $vgpr0
-    %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
-    %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %6:sgpr_32 = WAVE_REDUCE_UMIN_PSEUDO_U32 %0, 1, implicit $exec
-    %7:vgpr_32 = COPY %6
-    GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
-  bb.1:
-    %8:vgpr_32 = PHI %0, %bb.0
-    S_ENDPGM 0
-
-...

From 26f3f24a4f0a67eb23d255aba7a73a12bee1db11 Mon Sep 17 00:00:00 2001
From: Artemiy Bulavin <artemiyb@graphcore.ai>
Date: Mon, 23 Jun 2025 07:02:49 +0100
Subject: [PATCH 1222/1322] [MLIR][NFC] Declare RuntimeVerifiableOpInterface
 for memref ops that have an implementation  (#145230)

Previously running `-generate-runtime-verification` on an IR containing
`memref.reinterpret_cast` would crash because its implementation of the
`RuntimeVerifiableOpInterface` was removed in
https://github.com/llvm/llvm-project/pull/132547 but its associated
entry in `declarePromisedInterface` was never removed.

This causes an error when you try and run
`-generate-runtime-verification` on an IR containing
`memref.reinterpret_cast` that looks like

```
LLVM ERROR: checking for an interface (`mlir::RuntimeVerifiableOpInterface`) that was promised by dialect 'memref' but never implemented. This is generally an indication that the dialect extension implementing the interface was never registered.
```
as reported in https://github.com/llvm/llvm-project/issues/144028.

In this PR I also added all the ops that do have implementations of this
interface in
`mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp` to the
`declarePromisedInterface` for consistency.

Fixes https://github.com/llvm/llvm-project/issues/144028
---
 mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
index 6d0ac8a65077..2f55f5c0a743 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
@@ -52,8 +52,9 @@ void mlir::memref::MemRefDialect::initialize() {
   declarePromisedInterface<ConvertToLLVMPatternInterface, MemRefDialect>();
   declarePromisedInterfaces<bufferization::AllocationOpInterface, AllocOp,
                             AllocaOp, ReallocOp>();
-  declarePromisedInterfaces<RuntimeVerifiableOpInterface, CastOp, ExpandShapeOp,
-                            LoadOp, ReinterpretCastOp, StoreOp, SubViewOp>();
+  declarePromisedInterfaces<RuntimeVerifiableOpInterface, AssumeAlignmentOp,
+                            AtomicRMWOp, CastOp, CopyOp, DimOp, ExpandShapeOp,
+                            GenericAtomicRMWOp, LoadOp, StoreOp, SubViewOp>();
   declarePromisedInterfaces<ValueBoundsOpInterface, AllocOp, AllocaOp, CastOp,
                             DimOp, GetGlobalOp, RankOp, SubViewOp>();
   declarePromisedInterface<DestructurableTypeInterface, MemRefType>();

From 38920964babfc0e2474accb606846c0bc740dc4d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 15:22:59 +0900
Subject: [PATCH 1223/1322] ARM: Move ABI enum from TargetMachine to
 TargetParser (#144725)

Consolidate ABI parsing logic in TargetParser where
computeDefaultTargetABI is defined, instead of splitting it into the
backend. We need the full ABI information computable in
RuntimeLibcallsInfo
---
 .../llvm/TargetParser/ARMTargetParser.h       | 10 ++++
 llvm/lib/Target/ARM/ARMTargetMachine.cpp      | 51 +++----------------
 llvm/lib/Target/ARM/ARMTargetMachine.h        | 28 ++++++----
 llvm/lib/Target/ARM/ARMTargetObjectFile.cpp   |  2 +-
 llvm/lib/TargetParser/ARMTargetParser.cpp     | 17 +++++++
 5 files changed, 53 insertions(+), 55 deletions(-)

diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.h b/llvm/include/llvm/TargetParser/ARMTargetParser.h
index 798c578ced93..3ae6c4956656 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParser.h
+++ b/llvm/include/llvm/TargetParser/ARMTargetParser.h
@@ -27,6 +27,13 @@ class Triple;
 
 namespace ARM {
 
+enum ARMABI {
+  ARM_ABI_UNKNOWN,
+  ARM_ABI_APCS,
+  ARM_ABI_AAPCS, // ARM EABI
+  ARM_ABI_AAPCS16
+};
+
 // Arch extension modifiers for CPUs.
 // Note that this is not the same as the AArch64 list
 enum ArchExtKind : uint64_t {
@@ -265,6 +272,9 @@ LLVM_ABI unsigned parseArchVersion(StringRef Arch);
 LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
 LLVM_ABI StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);
 
+LLVM_ABI ARMABI computeTargetABI(const Triple &TT, StringRef CPU,
+                                 StringRef ABIName = "");
+
 /// Get the (LLVM) name of the minimum ARM CPU for the arch we are targeting.
 ///
 /// \param Arch the architecture name (e.g., "armv7s"). If it is an empty
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 0d947d924eb6..c66232ef4dc7 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -121,29 +121,10 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
   return std::make_unique<ARMElfTargetObjectFile>();
 }
 
-static ARMBaseTargetMachine::ARMABI
-computeTargetABI(const Triple &TT, StringRef CPU,
-                 const TargetOptions &Options) {
-  StringRef ABIName = Options.MCOptions.getABIName();
-
-  if (ABIName.empty())
-    ABIName = ARM::computeDefaultTargetABI(TT, CPU);
-
-  if (ABIName == "aapcs16")
-    return ARMBaseTargetMachine::ARM_ABI_AAPCS16;
-  else if (ABIName.starts_with("aapcs"))
-    return ARMBaseTargetMachine::ARM_ABI_AAPCS;
-  else if (ABIName.starts_with("apcs"))
-    return ARMBaseTargetMachine::ARM_ABI_APCS;
-
-  llvm_unreachable("Unhandled/unknown ABI Name!");
-  return ARMBaseTargetMachine::ARM_ABI_UNKNOWN;
-}
-
 static std::string computeDataLayout(const Triple &TT, StringRef CPU,
                                      const TargetOptions &Options,
                                      bool isLittle) {
-  auto ABI = computeTargetABI(TT, CPU, Options);
+  auto ABI = ARM::computeTargetABI(TT, CPU, Options.MCOptions.ABIName);
   std::string Ret;
 
   if (isLittle)
@@ -163,19 +144,19 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
   Ret += "-Fi8";
 
   // ABIs other than APCS have 64 bit integers with natural alignment.
-  if (ABI != ARMBaseTargetMachine::ARM_ABI_APCS)
+  if (ABI != ARM::ARM_ABI_APCS)
     Ret += "-i64:64";
 
   // We have 64 bits floats. The APCS ABI requires them to be aligned to 32
   // bits, others to 64 bits. We always try to align to 64 bits.
-  if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS)
+  if (ABI == ARM::ARM_ABI_APCS)
     Ret += "-f64:32:64";
 
   // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
   // to 64. We always ty to give them natural alignment.
-  if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS)
+  if (ABI == ARM::ARM_ABI_APCS)
     Ret += "-v64:32:64-v128:32:128";
-  else if (ABI != ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+  else if (ABI != ARM::ARM_ABI_AAPCS16)
     Ret += "-v128:64:128";
 
   // Try to align aggregates to 32 bits (the default is 64 bits, which has no
@@ -187,9 +168,9 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU,
 
   // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
   // aligned everywhere else.
-  if (TT.isOSNaCl() || ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+  if (TT.isOSNaCl() || ABI == ARM::ARM_ABI_AAPCS16)
     Ret += "-S128";
-  else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS)
+  else if (ABI == ARM::ARM_ABI_AAPCS)
     Ret += "-S64";
   else
     Ret += "-S32";
@@ -226,7 +207,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
                                TT, CPU, FS, Options,
                                getEffectiveRelocModel(TT, RM),
                                getEffectiveCodeModel(CM, CodeModel::Small), OL),
-      TargetABI(computeTargetABI(TT, CPU, Options)),
+      TargetABI(ARM::computeTargetABI(TT, CPU, Options.MCOptions.ABIName)),
       TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) {
 
   // Default to triple-appropriate float ABI
@@ -271,22 +252,6 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
 
 ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
 
-bool ARMBaseTargetMachine::isAPCS_ABI() const {
-  assert(TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
-  return TargetABI == ARMBaseTargetMachine::ARM_ABI_APCS;
-}
-
-bool ARMBaseTargetMachine::isAAPCS_ABI() const {
-  assert(TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
-  return TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS ||
-         TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
-}
-
-bool ARMBaseTargetMachine::isAAPCS16_ABI() const {
-  assert(TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
-  return TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
-}
-
 MachineFunctionInfo *ARMBaseTargetMachine::createMachineFunctionInfo(
     BumpPtrAllocator &Allocator, const Function &F,
     const TargetSubtargetInfo *STI) const {
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.h b/llvm/lib/Target/ARM/ARMTargetMachine.h
index 513fe713c0bc..1d73af1da6d0 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/CodeGenTargetMachineImpl.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/ARMTargetParser.h"
 #include <memory>
 #include <optional>
 
@@ -27,12 +28,7 @@ namespace llvm {
 
 class ARMBaseTargetMachine : public CodeGenTargetMachineImpl {
 public:
-  enum ARMABI {
-    ARM_ABI_UNKNOWN,
-    ARM_ABI_APCS,
-    ARM_ABI_AAPCS, // ARM EABI
-    ARM_ABI_AAPCS16
-  } TargetABI;
+  ARM::ARMABI TargetABI;
 
 protected:
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
@@ -66,9 +62,20 @@ public:
     return TLOF.get();
   }
 
-  bool isAPCS_ABI() const;
-  bool isAAPCS_ABI() const;
-  bool isAAPCS16_ABI() const;
+  bool isAPCS_ABI() const {
+    assert(TargetABI != ARM::ARM_ABI_UNKNOWN);
+    return TargetABI == ARM::ARM_ABI_APCS;
+  }
+
+  bool isAAPCS_ABI() const {
+    assert(TargetABI != ARM::ARM_ABI_UNKNOWN);
+    return TargetABI == ARM::ARM_ABI_AAPCS || TargetABI == ARM::ARM_ABI_AAPCS16;
+  }
+
+  bool isAAPCS16_ABI() const {
+    assert(TargetABI != ARM::ARM_ABI_UNKNOWN);
+    return TargetABI == ARM::ARM_ABI_AAPCS16;
+  }
 
   bool isTargetHardFloat() const {
     return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
@@ -77,8 +84,7 @@ public:
            TargetTriple.getEnvironment() == Triple::EABIHF ||
            (TargetTriple.isOSBinFormatMachO() &&
             TargetTriple.getSubArch() == Triple::ARMSubArch_v7em) ||
-           TargetTriple.isOSWindows() ||
-           TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+           TargetTriple.isOSWindows() || TargetABI == ARM::ARM_ABI_AAPCS16;
   }
 
   bool targetSchedulesPostRAScheduling() const override { return true; };
diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index a0a400f93848..cf84f1043cc6 100644
--- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -37,7 +37,7 @@ ARMElfTargetObjectFile::ARMElfTargetObjectFile() {
 void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
                                         const TargetMachine &TM) {
   const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM);
-  bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS;
+  bool isAAPCS_ABI = ARM_TM.TargetABI == ARM::ARMABI::ARM_ABI_AAPCS;
   bool genExecuteOnly =
       ARM_TM.getMCSubtargetInfo()->hasFeature(ARM::FeatureExecuteOnly);
 
diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp
index a7a895d87266..9ff6521c5d1e 100644
--- a/llvm/lib/TargetParser/ARMTargetParser.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParser.cpp
@@ -575,6 +575,23 @@ StringRef ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) {
   }
 }
 
+ARM::ARMABI ARM::computeTargetABI(const Triple &TT, StringRef CPU,
+                                  StringRef ABIName) {
+  if (ABIName.empty())
+    ABIName = ARM::computeDefaultTargetABI(TT, CPU);
+
+  if (ABIName == "aapcs16")
+    return ARM_ABI_AAPCS16;
+
+  if (ABIName.starts_with("aapcs"))
+    return ARM_ABI_AAPCS;
+
+  if (ABIName.starts_with("apcs"))
+    return ARM_ABI_APCS;
+
+  return ARM_ABI_UNKNOWN;
+}
+
 StringRef ARM::getARMCPUForArch(const llvm::Triple &Triple, StringRef MArch) {
   if (MArch.empty())
     MArch = Triple.getArchName();

From 4be4b82e74f1b06f18efd1c6be3582daba2e6739 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 15:30:13 +0900
Subject: [PATCH 1224/1322] AMDGPU: Use reportFatalUsageError for unhandled
 calling conventions (#145261)

Should switch this to DiagnosticInfo and use the default calling
convention, but that would require passing in the context.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp             | 4 ++--
 llvm/lib/Target/AMDGPU/R600ISelLowering.cpp               | 2 +-
 llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-call.ll | 7 +++++++
 llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-func.ll | 6 ++++++
 4 files changed, 16 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-call.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-func.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 134adc681215..3db2b3bff2d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1142,7 +1142,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
   default:
-    report_fatal_error("Unsupported calling convention for call");
+    reportFatalUsageError("unsupported calling convention for call");
   }
 }
 
@@ -1169,7 +1169,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
   case CallingConv::Cold:
     return RetCC_AMDGPU_Func;
   default:
-    report_fatal_error("Unsupported calling convention.");
+    reportFatalUsageError("unsupported calling convention");
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 9c2811006bc1..3c8bfa629ed3 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1448,7 +1448,7 @@ CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::AMDGPU_LS:
     return CC_R600;
   default:
-    report_fatal_error("Unsupported calling convention.");
+    reportFatalUsageError("unsupported calling convention");
   }
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-call.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-call.ll
new file mode 100644
index 000000000000..8824e8e1b375
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-call.ll
@@ -0,0 +1,7 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: unsupported calling convention
+define void @caller(ptr %func) {
+  call aarch64_sve_vector_pcs void %func()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-func.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-func.ll
new file mode 100644
index 000000000000..7ace7628a6e5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-calling-conv-func.ll
@@ -0,0 +1,6 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: unsupported calling convention
+define aarch64_sve_vector_pcs void @func() {
+  ret void
+}

From fccc6ee7021811a27ab1303d19407f703853ab92 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Mon, 23 Jun 2025 14:31:21 +0800
Subject: [PATCH 1225/1322] [C++20] [Modules] Don't make enum constant members
 always visible

Close https://github.com/llvm/llvm-project/issues/131058

See the comments in
ASTWriter.cpp:ASTDeclContextNameLookupTrait::getLookupVisibility and
SemaLookup.cpp:Sema::makeMergedDefinitionVisible for details.
---
 clang/lib/Sema/SemaLookup.cpp                 | 35 ++++----
 clang/lib/Serialization/ASTWriter.cpp         | 32 ++++++-
 .../Modules/include-after-imports-enums.cppm  | 25 ++++++
 clang/test/Modules/pr131058.cppm              | 85 +++++++++++++++++++
 4 files changed, 160 insertions(+), 17 deletions(-)
 create mode 100644 clang/test/Modules/include-after-imports-enums.cppm
 create mode 100644 clang/test/Modules/pr131058.cppm

diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index 5ad9dd8ed0d3..aa7191d2814f 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -1565,6 +1565,21 @@ void Sema::makeMergedDefinitionVisible(NamedDecl *ND) {
   if (auto *TD = dyn_cast<TemplateDecl>(ND))
     for (auto *Param : *TD->getTemplateParameters())
       makeMergedDefinitionVisible(Param);
+
+  // If we import a named module which contains a header, and then we include a
+  // header which contains a definition of enums, we will skip parsing the enums
+  // in the current TU. But we need to ensure the visibility of the enum
+  // contants, since they are able to be found with the parents of their
+  // parents.
+  if (auto *ED = dyn_cast<EnumDecl>(ND);
+      ED && ED->isFromGlobalModule() && !ED->isScoped()) {
+    for (auto *ECD : ED->enumerators()) {
+      ECD->setVisibleDespiteOwningModule();
+      DeclContext *RedeclCtx = ED->getDeclContext()->getRedeclContext();
+      if (RedeclCtx->lookup(ECD->getDeclName()).empty())
+        RedeclCtx->makeDeclVisibleInContext(ECD);
+    }
+  }
 }
 
 /// Find the module in which the given declaration was defined.
@@ -2185,22 +2200,10 @@ bool LookupResult::isAvailableForLookup(Sema &SemaRef, NamedDecl *ND) {
   // Class and enumeration member names can be found by name lookup in any
   // context in which a definition of the type is reachable.
   //
-  // FIXME: The current implementation didn't consider about scope. For example,
-  // ```
-  // // m.cppm
-  // export module m;
-  // enum E1 { e1 };
-  // // Use.cpp
-  // import m;
-  // void test() {
-  //   auto a = E1::e1; // Error as expected.
-  //   auto b = e1; // Should be error. namespace-scope name e1 is not visible
-  // }
-  // ```
-  // For the above example, the current implementation would emit error for `a`
-  // correctly. However, the implementation wouldn't diagnose about `b` now.
-  // Since we only check the reachability for the parent only.
-  // See clang/test/CXX/module/module.interface/p7.cpp for example.
+  // NOTE: The above wording may be problematic. See
+  // https://github.com/llvm/llvm-project/issues/131058 But it is much complext
+  // to adjust it in Sema's lookup process. Now we hacked it in ASTWriter. See
+  // the comments in ASTDeclContextNameLookupTrait::getLookupVisibility.
   if (auto *TD = dyn_cast<TagDecl>(DC))
     return SemaRef.hasReachableDefinition(TD);
 
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index af7229d74887..c6487c5366a2 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -4331,9 +4331,36 @@ private:
       return LookupVisibility::ModuleLocalVisible;
     if (isTULocalInNamedModules(D))
       return LookupVisibility::TULocal;
+
+    // A trick to handle enum constants. The enum constants is special since
+    // they can be found directly without their parent context. This makes it
+    // tricky to decide if an EnumConstantDecl is visible or not by their own
+    // visibilities. E.g., for a class member, we can assume it is visible if
+    // the user get its parent somehow. But for an enum constant, the users may
+    // access if without its parent context. Although we can fix the problem in
+    // Sema lookup process, it might be too complex, we just make a trick here.
+    // Note that we only removes enum constant from the lookup table from its
+    // parent of parent. We DON'T remove the enum constant from its parent. So
+    // we don't need to care about merging problems here.
+    if (auto *ECD = dyn_cast<EnumConstantDecl>(D);
+        ECD && DC.isFileContext() && ECD->getOwningModule() &&
+        ECD->getTopLevelOwningNamedModule()->isNamedModule()) {
+      if (llvm::all_of(
+              DC.noload_lookup(
+                  cast<EnumDecl>(ECD->getDeclContext())->getDeclName()),
+              [](auto *Found) {
+                return Found->isInvisibleOutsideTheOwningModule();
+              }))
+        return ECD->isFromExplicitGlobalModule() ||
+                       ECD->isInAnonymousNamespace()
+                   ? LookupVisibility::TULocal
+                   : LookupVisibility::ModuleLocalVisible;
+    }
+
     return LookupVisibility::GenerallyVisibile;
   }
 
+  DeclContext &DC;
   ModuleLevelDeclsMapTy ModuleLocalDeclsMap;
   TULocalDeclsMapTy TULocalDeclsMap;
 
@@ -4341,6 +4368,9 @@ public:
   using ASTDeclContextNameTrivialLookupTrait::
       ASTDeclContextNameTrivialLookupTrait;
 
+  ASTDeclContextNameLookupTrait(ASTWriter &Writer, DeclContext &DC)
+      : ASTDeclContextNameTrivialLookupTrait(Writer), DC(DC) {}
+
   template <typename Coll> data_type getData(const Coll &Decls) {
     unsigned Start = DeclIDs.size();
     for (NamedDecl *D : Decls) {
@@ -4612,7 +4642,7 @@ void ASTWriter::GenerateNameLookupTable(
   MultiOnDiskHashTableGenerator<reader::ASTDeclContextNameLookupTrait,
                                 ASTDeclContextNameLookupTrait>
       Generator;
-  ASTDeclContextNameLookupTrait Trait(*this);
+  ASTDeclContextNameLookupTrait Trait(*this, *DC);
 
   // The first step is to collect the declaration names which we need to
   // serialize into the name lookup table, and to collect them in a stable
diff --git a/clang/test/Modules/include-after-imports-enums.cppm b/clang/test/Modules/include-after-imports-enums.cppm
new file mode 100644
index 000000000000..00affd98e299
--- /dev/null
+++ b/clang/test/Modules/include-after-imports-enums.cppm
@@ -0,0 +1,25 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -verify -fsyntax-only
+//
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-reduced-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -verify -fsyntax-only
+
+//--- enum.h
+enum E { Value };
+
+//--- M.cppm
+module;
+#include "enum.h"
+export module M;
+auto e = Value;
+
+//--- use.cpp
+// expected-no-diagnostics
+import M;
+#include "enum.h"
+
+auto e = Value;
diff --git a/clang/test/Modules/pr131058.cppm b/clang/test/Modules/pr131058.cppm
new file mode 100644
index 000000000000..c5a626103373
--- /dev/null
+++ b/clang/test/Modules/pr131058.cppm
@@ -0,0 +1,85 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fsyntax-only -verify -fprebuilt-module-path=%t
+
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-reduced-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fsyntax-only -verify -fprebuilt-module-path=%t
+
+// RUN: %clang_cc1 -std=c++20 %t/M0.cppm -emit-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fsyntax-only -verify -fprebuilt-module-path=%t -DMODULE_LOCAL
+// RUN: %clang_cc1 -std=c++20 %t/M0.cpp -fsyntax-only -verify -fprebuilt-module-path=%t
+
+// RUN: %clang_cc1 -std=c++20 %t/M0.cppm -emit-reduced-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fsyntax-only -verify -fprebuilt-module-path=%t -DMODULE_LOCAL
+// RUN: %clang_cc1 -std=c++20 %t/M0.cpp -fsyntax-only -verify -fprebuilt-module-path=%t
+
+// RUN: %clang_cc1 -std=c++20 %t/M2.cppm -emit-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fsyntax-only -verify -fprebuilt-module-path=%t
+
+// RUN: %clang_cc1 -std=c++20 %t/M2.cppm -emit-reduced-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fsyntax-only -verify -fprebuilt-module-path=%t
+
+// RUN: %clang_cc1 -std=c++20 %t/M3.cppm -emit-reduced-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use2.cpp -fsyntax-only -verify -fprebuilt-module-path=%t
+
+//--- enum.h
+enum {    SomeName,    };
+
+//--- M.cppm
+module;
+#include "enum.h"
+export module M;
+export auto e = SomeName;
+
+//--- M0.cppm
+export module M;
+enum {    SomeName,    };
+export auto e = SomeName;
+
+//--- M0.cpp
+// expected-no-diagnostics
+module M;
+auto a = SomeName;
+
+//--- use.cpp
+import M;
+auto a = SomeName; // expected-error {{use of undeclared identifier 'SomeName'}}
+auto b = decltype(e)::SomeName;
+
+//--- enum1.h
+extern "C++" {
+enum {    SomeName,    };
+}
+
+//--- M2.cppm
+module;
+#include "enum1.h"
+export module M;
+export auto e = SomeName;
+
+//--- enums.h
+namespace nn {
+enum E { Value };
+enum E2 { VisibleEnum };
+enum AlwaysVisibleEnums { UnconditionallyVisible };
+}
+
+//--- M3.cppm
+module;
+#include "enums.h"
+export module M;
+export namespace nn {
+    using nn::E2::VisibleEnum;
+    using nn::AlwaysVisibleEnums;
+}
+auto e1 = nn::Value;
+auto e2 = nn::VisibleEnum;
+
+//--- use2.cpp
+import M;
+auto e = nn::Value1; // expected-error {{no member named 'Value1' in namespace 'nn'}}
+auto e2 = nn::VisibleEnum;
+auto e3 = nn::UnconditionallyVisible;

From c6be4ff0c8966c4dbe1cbac9a071762982a70651 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 23 Jun 2025 09:09:19 +0200
Subject: [PATCH 1226/1322] [PredicateInfo] Don't use depth first walk (NFCI)
 (#145016)

The order in which we collect the predicates does not matter, as they
will be sorted anyway. As such, avoid the expensive depth first walk
over the dominator tree and instead use plain iteration over the
function.

(To be a bit more precise, the predicates and uses for a specific value
are sorted, so this change has no impact on that. It can change the
order in which values are processed in the first place, but that order
is not semantically relevant.)
---
 llvm/lib/Transforms/Utils/PredicateInfo.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 97f13e3b2674..b57d41906ef0 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -453,17 +453,19 @@ void PredicateInfoBuilder::buildPredicateInfo() {
   // Collect operands to rename from all conditional branch terminators, as well
   // as assume statements.
   SmallVector<Value *, 8> OpsToRename;
-  for (auto *DTN : depth_first(DT.getRootNode())) {
-    BasicBlock *BranchBB = DTN->getBlock();
-    if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
+  for (BasicBlock &BB : F) {
+    if (!DT.isReachableFromEntry(&BB))
+      continue;
+
+    if (auto *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
       if (!BI->isConditional())
         continue;
       // Can't insert conditional information if they all go to the same place.
       if (BI->getSuccessor(0) == BI->getSuccessor(1))
         continue;
-      processBranch(BI, BranchBB, OpsToRename);
-    } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
-      processSwitch(SI, BranchBB, OpsToRename);
+      processBranch(BI, &BB, OpsToRename);
+    } else if (auto *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+      processSwitch(SI, &BB, OpsToRename);
     }
   }
   for (auto &Assume : AC.assumptions()) {

From 1e58e9c4b27351c5f7482ce01daabfaad6cbe5fb Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 23 Jun 2025 09:10:33 +0200
Subject: [PATCH 1227/1322] [PredicateInfo] Don't store Def in ValueDFS (NFC)
 (#145022)

Def is only actually used during renaming, and having it in ValueDFS
causes unnecessary confusion. Remove it from ValueDFS and instead use a
separate StackEntry structure for renaming, which holds the ValueDFS and
the Def.
---
 llvm/lib/Transforms/Utils/PredicateInfo.cpp | 40 +++++++++++----------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index b57d41906ef0..95c125d43dea 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -80,14 +80,13 @@ enum LocalNum {
   LN_Last
 };
 
-// Associate global and local DFS info with defs and uses, so we can sort them
-// into a global domination ordering.
+// Associate global and local DFS info with defs (PInfo set) and uses (U set),
+// so we can sort them into a global domination ordering.
 struct ValueDFS {
   int DFSIn = 0;
   int DFSOut = 0;
   unsigned int LocalNum = LN_Middle;
-  // Only one of Def or Use will be set.
-  Value *Def = nullptr;
+  // Only one of U or PInfo will be set.
   Use *U = nullptr;
   PredicateBase *PInfo = nullptr;
 };
@@ -101,7 +100,6 @@ struct ValueDFS_Compare {
   bool operator()(const ValueDFS &A, const ValueDFS &B) const {
     if (&A == &B)
       return false;
-    assert(!A.Def && !B.Def && "Should not have Def during comparison");
 
     // Order by block first.
     if (A.DFSIn != B.DFSIn)
@@ -133,7 +131,7 @@ struct ValueDFS_Compare {
 
   // For a phi use, or a non-materialized def, return the edge it represents.
   std::pair<BasicBlock *, BasicBlock *> getBlockEdge(const ValueDFS &VD) const {
-    if (!VD.Def && VD.U) {
+    if (VD.U) {
       auto *PHI = cast<PHINode>(VD.U->getUser());
       return std::make_pair(PHI->getIncomingBlock(*VD.U), PHI->getParent());
     }
@@ -229,7 +227,14 @@ class PredicateInfoBuilder {
   void addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op,
                   PredicateBase *PB);
 
-  typedef SmallVectorImpl<ValueDFS> ValueDFSStack;
+  struct StackEntry {
+    const ValueDFS *V;
+    Value *Def = nullptr;
+
+    StackEntry(const ValueDFS *V) : V(V) {}
+  };
+
+  using ValueDFSStack = SmallVectorImpl<StackEntry>;
   void convertUsesToDFSOrdered(Value *, SmallVectorImpl<ValueDFS> &);
   Value *materializeStack(unsigned int &, ValueDFSStack &, Value *);
   bool stackIsInScope(const ValueDFSStack &, const ValueDFS &) const;
@@ -254,7 +259,7 @@ bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
   // a LN_Last def, we need to pop the stack.  We deliberately sort phi uses
   // next to the defs they must go with so that we can know it's time to pop
   // the stack when we hit the end of the phi uses for a given def.
-  const ValueDFS &Top = Stack.back();
+  const ValueDFS &Top = *Stack.back().V;
   if (Top.LocalNum == LN_Last && Top.PInfo) {
     if (!VDUse.U)
       return false;
@@ -496,8 +501,8 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
        RenameIter != RenameStack.end(); ++RenameIter) {
     auto *Op =
         RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
-    ValueDFS &Result = *RenameIter;
-    auto *ValInfo = Result.PInfo;
+    StackEntry &Result = *RenameIter;
+    auto *ValInfo = Result.V->PInfo;
     ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin()
                              ? OrigOp
                              : (RenameStack.end() - Start - 1)->Def;
@@ -625,19 +630,18 @@ void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
     // currently and will be considered equal. We could get rid of the
     // stable sort by creating one if we wanted.
     llvm::stable_sort(OrderedUses, Compare);
-    SmallVector<ValueDFS, 8> RenameStack;
+    SmallVector<StackEntry, 8> RenameStack;
     // For each use, sorted into dfs order, push values and replaces uses with
     // top of stack, which will represent the reaching def.
-    for (auto &VD : OrderedUses) {
+    for (const ValueDFS &VD : OrderedUses) {
       // We currently do not materialize copy over copy, but we should decide if
       // we want to.
-      bool PossibleCopy = VD.PInfo != nullptr;
       if (RenameStack.empty()) {
         LLVM_DEBUG(dbgs() << "Rename Stack is empty\n");
       } else {
         LLVM_DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
-                          << RenameStack.back().DFSIn << ","
-                          << RenameStack.back().DFSOut << ")\n");
+                          << RenameStack.back().V->DFSIn << ","
+                          << RenameStack.back().V->DFSOut << ")\n");
       }
 
       LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
@@ -646,8 +650,8 @@ void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
       // Sync to our current scope.
       popStackUntilDFSScope(RenameStack, VD);
 
-      if (VD.Def || PossibleCopy) {
-        RenameStack.push_back(VD);
+      if (VD.PInfo) {
+        RenameStack.push_back(&VD);
         continue;
       }
 
@@ -659,7 +663,7 @@ void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
         LLVM_DEBUG(dbgs() << "Skipping execution due to debug counter\n");
         continue;
       }
-      ValueDFS &Result = RenameStack.back();
+      StackEntry &Result = RenameStack.back();
 
       // If the possible copy dominates something, materialize our stack up to
       // this point. This ensures every comparison that affects our operation

From ae8c85c9ce89ad224a1b34888c6fa7d56ad5f453 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 23 Jun 2025 09:11:03 +0200
Subject: [PATCH 1228/1322] [Passes] Remove LoopInterchange from O1 pipeline
 (#145071)

This is a fairly exotic pass, I don't think it makes a lot of sense to
run it at O1, esp. as vectorization wouldn't run at O1 anyway.
---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index a99146d5eaa3..b0cdd1b94e56 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -501,9 +501,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
 
   LPM2.addPass(LoopDeletionPass());
 
-  if (PTO.LoopInterchange)
-    LPM2.addPass(LoopInterchangePass());
-
   // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
   // because it changes IR to makes profile annotation in back compile
   // inaccurate. The normal unroller doesn't pay attention to forced full unroll

From 86beba9301112c6092cbfa3e53bdacc0d68337df Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 23 Jun 2025 09:16:45 +0200
Subject: [PATCH 1229/1322] [PredicateInfo] Cache ssa.copy declarations (NFC)
 (#145020)

This pass creates a lot of ssa.copy intrinsics, typically for a small
set of types. Determining the function type, performing intrinsic name
mangling and looking up the declaration has noticeable overhead in this
case.

Improve this by caching the declarations by type. I've made this a
separate map from CreatedDeclarations, which only tracks the
declarations that were newly inserted (but not pre-existing ones).
---
 .../llvm/Transforms/Utils/PredicateInfo.h     |  2 +
 llvm/lib/Transforms/Utils/PredicateInfo.cpp   | 37 ++++++++++---------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
index 670bfaa5ad6f..1619fa31fb6f 100644
--- a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
+++ b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
@@ -206,6 +206,8 @@ private:
   DenseMap<const Value *, const PredicateBase *> PredicateMap;
   // The set of ssa_copy declarations we created with our custom mangling.
   SmallSet<AssertingVH<Function>, 20> CreatedDeclarations;
+  // Cache of ssa.copy declaration for a given type.
+  SmallDenseMap<Type *, Function *> DeclarationCache;
 };
 
 /// Printer pass for \c PredicateInfo.
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 95c125d43dea..d26707a1361b 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -506,25 +506,33 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
     ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin()
                              ? OrigOp
                              : (RenameStack.end() - Start - 1)->Def;
+    auto CreateSSACopy = [this](IRBuilderBase &B, Value *Op,
+                                const Twine &Name = "") {
+      auto It = PI.DeclarationCache.try_emplace(Op->getType());
+      if (It.second) {
+        // The number of named values is used to detect if a new declaration
+        // was added. If so, that declaration is tracked so that it can be
+        // removed when the analysis is done. The corner case were a new
+        // declaration results in a name clash and the old name being renamed
+        // is not considered as that represents an invalid module.
+        auto NumDecls = F.getParent()->getNumNamedValues();
+        Function *IF = Intrinsic::getOrInsertDeclaration(
+            F.getParent(), Intrinsic::ssa_copy, Op->getType());
+        if (NumDecls != F.getParent()->getNumNamedValues())
+          PI.CreatedDeclarations.insert(IF);
+        It.first->second = IF;
+      }
+      return B.CreateCall(It.first->second, Op, Name);
+    };
     // For edge predicates, we can just place the operand in the block before
     // the terminator. For assume, we have to place it right after the assume
     // to ensure we dominate all uses except assume itself. Always insert
     // right before the terminator or after the assume, so that we insert in
     // proper order in the case of multiple predicateinfo in the same block.
-    // The number of named values is used to detect if a new declaration was
-    // added. If so, that declaration is tracked so that it can be removed when
-    // the analysis is done. The corner case were a new declaration results in
-    // a name clash and the old name being renamed is not considered as that
-    // represents an invalid module.
     if (isa<PredicateWithEdge>(ValInfo)) {
       IRBuilder<> B(getBranchTerminator(ValInfo));
-      auto NumDecls = F.getParent()->getNumNamedValues();
-      Function *IF = Intrinsic::getOrInsertDeclaration(
-          F.getParent(), Intrinsic::ssa_copy, Op->getType());
-      if (NumDecls != F.getParent()->getNumNamedValues())
-        PI.CreatedDeclarations.insert(IF);
       CallInst *PIC =
-          B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
+          CreateSSACopy(B, Op, Op->getName() + "." + Twine(Counter++));
       PI.PredicateMap.insert({PIC, ValInfo});
       Result.Def = PIC;
     } else {
@@ -534,12 +542,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
       // Insert the predicate directly after the assume. While it also holds
       // directly before it, assume(i1 true) is not a useful fact.
       IRBuilder<> B(PAssume->AssumeInst->getNextNode());
-      auto NumDecls = F.getParent()->getNumNamedValues();
-      Function *IF = Intrinsic::getOrInsertDeclaration(
-          F.getParent(), Intrinsic::ssa_copy, Op->getType());
-      if (NumDecls != F.getParent()->getNumNamedValues())
-        PI.CreatedDeclarations.insert(IF);
-      CallInst *PIC = B.CreateCall(IF, Op);
+      CallInst *PIC = CreateSSACopy(B, Op);
       PI.PredicateMap.insert({PIC, ValInfo});
       Result.Def = PIC;
     }

From ddfc7cb61f022d78db6a2b9ac072fce233c19343 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Mon, 23 Jun 2025 08:17:17 +0100
Subject: [PATCH 1230/1322] [mlir][tosa] Check negative output size in resize
 shape inference (#143382)

This commit adds a check to ensure that the calculated output height and
width, during shape inference, should be non-negative. An error is
output if this is the case.

Fixes: #142402
---
 mlir/lib/Dialect/Tosa/IR/TosaOps.cpp          | 14 ++++++++++++--
 mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir | 12 ++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index a32e4ccbed59..3135fbd49bfb 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -2533,16 +2533,26 @@ LogicalResult tosa::ResizeOp::inferReturnTypeComponents(
   }
 
   // Compute the output shape based on attributes: scale, offset, and border.
-  outputShape[1] =
+  const int64_t outputHeight =
       (((inputHeight - 1) * scaleInt[0] - offsetInt[0] + borderInt[0]) /
        scaleInt[1]) +
       1;
 
-  outputShape[2] =
+  const int64_t outputWidth =
       (((inputWidth - 1) * scaleInt[2] - offsetInt[1] + borderInt[1]) /
        scaleInt[3]) +
       1;
 
+  if (outputHeight < 0 || outputWidth < 0) {
+    return emitOptionalError(
+        location,
+        "calculated output height and width must be non-negative, "
+        "got height = ",
+        outputHeight, ", width = ", outputWidth);
+  }
+
+  outputShape[1] = outputHeight;
+  outputShape[2] = outputWidth;
   inferredReturnShapes.push_back(ShapedTypeComponents(outputShape));
   return success();
 }
diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
index 591a3f0acf65..18409d24fbc1 100644
--- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
@@ -1115,6 +1115,18 @@ func.func @resize_fp_power_of_two_upscale_offsetted(%arg0: tensor<1x50x48x1xf32>
 
 // -----
 
+// CHECK-LABEL: @resize_negative_output_dim
+func.func @resize_negative_output_dim(%arg0: tensor<1x3x1x1xi8>) {
+  %scale = tosa.const_shape { values = dense<[1, 3, 1, 1]> : tensor<4xindex> } : () -> !tosa.shape<4>
+  %offset = tosa.const_shape { values = dense<[6, 1]> : tensor<2xindex> } : () -> !tosa.shape<2>
+  %border = tosa.const_shape { values = dense<[-15, 0]> : tensor<2xindex> } : () -> !tosa.shape<2>
+  // expected-error@+1 {{calculated output height and width must be non-negative, got height = -5, width = 0}}
+  %0 = tosa.resize %arg0, %scale, %offset, %border {mode = "NEAREST_NEIGHBOR"} : (tensor<1x3x1x1xi8>, !tosa.shape<4>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<*xi8>
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @if_test_simple
 func.func @if_test_simple(%arg0 : tensor<f32>, %arg1 : tensor<f32>, %arg2 : tensor<i1>) -> () {
   %a = tosa.log %arg0 : (tensor<f32>) -> tensor<f32>

From 4795b2b5a3d739b59f7f4e97fe7ef201f539d2ba Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Mon, 23 Jun 2025 13:54:22 +0800
Subject: [PATCH 1231/1322] [RISCV] Fix comment typo and indentation for class
 SchedNary. NFC.

---
 llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index fe43a2be4aab..96c4a1b6277b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -138,16 +138,17 @@ class SchedCommon<list<SchedWrite> writes, list<SchedRead> reads,
 }
 
 // Common class of scheduling definitions for n-ary instructions.
-// The scheudling resources are relevant to LMUL and may be relevant to SEW.
+// The scheduling resources are relevant to LMUL and may be relevant to SEW.
 class SchedNary<string write, list<string> reads, string mx, int sew = 0,
                 bit forceMasked = 0, bit forcePassthruRead = 0>
     : SchedCommon<[!cast<SchedWrite>(
-                      !if(sew,
-                          write # "_" # mx # "_E" # sew,
-                          write # "_" # mx))],
+                       !if(sew,
+                           write # "_" # mx # "_E" # sew,
+                           write # "_" # mx))],
                   !foreach(read, reads,
-                           !cast<SchedRead>(!if(sew, read #"_" #mx #"_E" #sew,
-                                                 read #"_" #mx))),
+                           !cast<SchedRead>(!if(sew,
+                                                read # "_" # mx # "_E" #sew,
+                                                read # "_" # mx))),
                   mx, sew, forceMasked, forcePassthruRead>;
 
 // Classes with postfix "MC" are only used in MC layer.

From 20c04a646bcfa20d0828e0d7eae969f262718e48 Mon Sep 17 00:00:00 2001
From: Camsyn <camsyn@foxmail.com>
Date: Mon, 23 Jun 2025 15:21:34 +0800
Subject: [PATCH 1232/1322] [NFC][Sanitizer] Fix incorrect desc of [beg, end]
 to [beg, end)

Correct the interval desc of ReleaseMemoryPagesToOS from [beg, end] to
[beg, end), as it actually does.

The previous incorrect description of [beg, end] might cause an
incorrect invoke as follows: `ReleaseMemoryPagesToOS(0, kPageSize - 1);`
---
 compiler-rt/lib/sanitizer_common/sanitizer_common.h | 2 +-
 compiler-rt/lib/sanitizer_common/sanitizer_linux.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
index d9e7ded593fe..3f52cfcaeeca 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
@@ -166,7 +166,7 @@ uptr FindAvailableMemoryRange(uptr size, uptr alignment, uptr left_padding,
 
 // Used to check if we can map shadow memory to a fixed location.
 bool MemoryRangeIsAvailable(uptr range_start, uptr range_end);
-// Releases memory pages entirely within the [beg, end] address range. Noop if
+// Releases memory pages entirely within the [beg, end) address range. Noop if
 // the provided range does not contain at least one entire page.
 void ReleaseMemoryPagesToOS(uptr beg, uptr end);
 void IncreaseTotalMmap(uptr size);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h
index 6959f785990f..05b7d2e28a61 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h
@@ -130,7 +130,7 @@ bool LibraryNameIs(const char *full_name, const char *base_name);
 // Call cb for each region mapped by map.
 void ForEachMappedRegion(link_map *map, void (*cb)(const void *, uptr));
 
-// Releases memory pages entirely within the [beg, end] address range.
+// Releases memory pages entirely within the [beg, end) address range.
 // The pages no longer count toward RSS; reads are guaranteed to return 0.
 // Requires (but does not verify!) that pages are MAP_PRIVATE.
 inline void ReleaseMemoryPagesToOSAndZeroFill(uptr beg, uptr end) {

From 74aab3045d187ed76566fcdb5e9ce829afdbcdc1 Mon Sep 17 00:00:00 2001
From: Kunqiu Chen <camsyn@foxmail.com>
Date: Mon, 23 Jun 2025 15:24:03 +0800
Subject: [PATCH 1233/1322] [TSan, NFC] Eliminate useless calculations in TSan
 (#145283)

Previously, TSan repeatedly calculated some values in some cases, and
this change eliminates these duplicate calculations.
---
 compiler-rt/lib/tsan/rtl/tsan_rtl.cpp  |  4 +---
 compiler-rt/lib/tsan/rtl/tsan_sync.cpp | 13 ++++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
index d8be21284b93..981f37b89e78 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
@@ -624,6 +624,7 @@ void MapShadow(uptr addr, uptr size) {
   static uptr mapped_meta_end = 0;
   uptr meta_begin = (uptr)MemToMeta(addr);
   uptr meta_end = (uptr)MemToMeta(addr + size);
+  // Windows wants 64K alignment.
   meta_begin = RoundDownTo(meta_begin, 64 << 10);
   meta_end = RoundUpTo(meta_end, 64 << 10);
   if (!data_mapped) {
@@ -634,9 +635,6 @@ void MapShadow(uptr addr, uptr size) {
       Die();
   } else {
     // Mapping continuous heap.
-    // Windows wants 64K alignment.
-    meta_begin = RoundDownTo(meta_begin, 64 << 10);
-    meta_end = RoundUpTo(meta_end, 64 << 10);
     CHECK_GT(meta_end, mapped_meta_end);
     if (meta_begin < mapped_meta_end)
       meta_begin = mapped_meta_end;
diff --git a/compiler-rt/lib/tsan/rtl/tsan_sync.cpp b/compiler-rt/lib/tsan/rtl/tsan_sync.cpp
index 09d41780d188..97335bc8ecf7 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_sync.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_sync.cpp
@@ -247,11 +247,14 @@ void MetaMap::MoveMemory(uptr src, uptr dst, uptr sz) {
   CHECK_NE(src, dst);
   CHECK_NE(sz, 0);
   uptr diff = dst - src;
-  u32 *src_meta = MemToMeta(src);
-  u32 *dst_meta = MemToMeta(dst);
-  u32 *src_meta_end = MemToMeta(src + sz);
-  uptr inc = 1;
-  if (dst > src) {
+  u32 *src_meta, *dst_meta, *src_meta_end;
+  uptr inc;
+  if (dst < src) {
+    src_meta = MemToMeta(src);
+    dst_meta = MemToMeta(dst);
+    src_meta_end = MemToMeta(src + sz);
+    inc = 1;
+  } else {
     src_meta = MemToMeta(src + sz) - 1;
     dst_meta = MemToMeta(dst + sz) - 1;
     src_meta_end = MemToMeta(src) - 1;

From 338ee673bde3843ec48579f11d405c768eb4b2ac Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 16:25:53 +0900
Subject: [PATCH 1234/1322] AsmPrinter: Do not use report_fatal_error for AIX
 XXStructor error (#145273)

---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp              | 6 ++++--
 llvm/test/CodeGen/PowerPC/aix-static-init-key-object.ll | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 403963f33b65..541373520ffb 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3243,9 +3243,11 @@ void AsmPrinter::preprocessXXStructorList(const DataLayout &DL,
     S.Priority = Priority->getLimitedValue(65535);
     S.Func = CS->getOperand(1);
     if (!CS->getOperand(2)->isNullValue()) {
-      if (TM.getTargetTriple().isOSAIX())
-        llvm::report_fatal_error(
+      if (TM.getTargetTriple().isOSAIX()) {
+        CS->getContext().emitError(
             "associated data of XXStructor list is not yet supported on AIX");
+      }
+
       S.ComdatKey =
           dyn_cast<GlobalValue>(CS->getOperand(2)->stripPointerCasts());
     }
diff --git a/llvm/test/CodeGen/PowerPC/aix-static-init-key-object.ll b/llvm/test/CodeGen/PowerPC/aix-static-init-key-object.ll
index e0a77c84b81a..b8aba26b574c 100644
--- a/llvm/test/CodeGen/PowerPC/aix-static-init-key-object.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-static-init-key-object.ll
@@ -1,5 +1,5 @@
-; RUN: not --crash llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s
-; RUN: not --crash llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s
 
 @v = global i8 0
 
@@ -9,4 +9,4 @@ define void @foo() {
   ret void
 }
 
-; CHECK: LLVM ERROR: associated data of XXStructor list is not yet supported on AIX
+; CHECK: error: associated data of XXStructor list is not yet supported on AIX

From 16607f64374926a291242c0adeee2133d58b71ca Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 16:26:10 +0900
Subject: [PATCH 1235/1322] AMDGPU: Fix typo in argument allocation error
 message (#145265)

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 83155ee790df..c187c8ab4543 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2459,7 +2459,7 @@ void SITargetLowering::allocateSpecialInputVGPRsFixed(
     SIMachineFunctionInfo &Info) const {
   Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
   if (!Reg)
-    report_fatal_error("failed to allocated VGPR for implicit arguments");
+    report_fatal_error("failed to allocate VGPR for implicit arguments");
 
   const unsigned Mask = 0x3ff;
   Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));

From db051e8800d33247625280328c6759e411f62421 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 16:26:27 +0900
Subject: [PATCH 1236/1322] AsmPrinter: Do not use report_fatal_error for
 unknown appending linkage (#145269)

---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 5 ++++-
 llvm/test/CodeGen/X86/AppendingLinkage.ll  | 9 ++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 541373520ffb..837be95b42aa 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3207,7 +3207,10 @@ bool AsmPrinter::emitSpecialLLVMGlobal(const GlobalVariable *GV) {
     return true;
   }
 
-  report_fatal_error("unknown special variable with appending linkage");
+  GV->getContext().emitError(
+      "unknown special variable with appending linkage: " +
+      GV->getNameOrAsOperand());
+  return true;
 }
 
 /// EmitLLVMUsedList - For targets that define a MAI::UsedDirective, mark each
diff --git a/llvm/test/CodeGen/X86/AppendingLinkage.ll b/llvm/test/CodeGen/X86/AppendingLinkage.ll
index ace5d192b7ea..77d1ce4a2148 100644
--- a/llvm/test/CodeGen/X86/AppendingLinkage.ll
+++ b/llvm/test/CodeGen/X86/AppendingLinkage.ll
@@ -1,4 +1,7 @@
-; RUN: not --crash llc < %s -mtriple=i686-- 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=i686-- 2>&1 | FileCheck %s
 
-; CHECK: unknown special variable with appending linkage
-@foo = appending constant [1 x i32 ]zeroinitializer
+; CHECK: error: unknown special variable with appending linkage: foo
+@foo = appending constant [1 x i32 ] zeroinitializer
+
+; CHECK: error: unknown special variable with appending linkage: @0
+@0 = appending constant [1 x i32 ] zeroinitializer

From 6b129d6bbf3995579df39786abdd26b8f83ffe55 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 16:29:58 +0900
Subject: [PATCH 1237/1322] AsmPrinter: Do not use report_fatal_error for
 unhandled ConstantExpr (#145275)

---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp             |  5 +++--
 .../AMDGPU/addrspacecast-initializer-unsupported.ll    | 10 ++++++++--
 llvm/test/CodeGen/X86/ptrtoint-constexpr-invalid.ll    |  4 ++--
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 837be95b42aa..3fc550c6dd1a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3608,10 +3608,11 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV,
   // Otherwise report the problem to the user.
   std::string S;
   raw_string_ostream OS(S);
-  OS << "Unsupported expression in static initializer: ";
+  OS << "unsupported expression in static initializer: ";
   CE->printAsOperand(OS, /*PrintType=*/false,
                      !MF ? nullptr : MF->getFunction().getParent());
-  report_fatal_error(Twine(S));
+  CE->getContext().emitError(S);
+  return MCConstantExpr::create(0, Ctx);
 }
 
 static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C,
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
index ba8398ea227c..30ae18f64cbc 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
@@ -1,6 +1,12 @@
-; RUN: not --crash llc -mtriple=amdgcn -verify-machineinstrs -amdgpu-enable-lower-module-lds=false < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -mtriple=amdgcn -verify-machineinstrs -amdgpu-enable-lower-module-lds=false < %s 2> %t.err | FileCheck %s
+; RUN: FileCheck -check-prefix=ERROR %s < %t.err
+
+; ERROR: error: unsupported expression in static initializer: addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4))
+
+; CHECK: gv_flatptr_from_lds:
+; CHECK-NEXT: .quad 0+32
+; CHECK-NEXT: .size gv_flatptr_from_lds, 8
 
-; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4))
 
 @lds.arr = unnamed_addr addrspace(3) global [256 x i32] poison, align 4
 
diff --git a/llvm/test/CodeGen/X86/ptrtoint-constexpr-invalid.ll b/llvm/test/CodeGen/X86/ptrtoint-constexpr-invalid.ll
index f1f5d7e2b720..45f3ab60e904 100644
--- a/llvm/test/CodeGen/X86/ptrtoint-constexpr-invalid.ll
+++ b/llvm/test/CodeGen/X86/ptrtoint-constexpr-invalid.ll
@@ -1,10 +1,10 @@
-; RUN: not --crash llc < %s -mtriple=i386-linux 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=i386-linux 2>&1 | FileCheck %s
 
 ; ptrtoint expressions that cast to a wider integer type are not supported.
 ; A frontend can achieve a similar result by casting to the correct integer
 ; type and explicitly zeroing any additional bytes.
 ; { i32, i32 } { i32 ptrtoint (ptr @r to i32), i32 0 }
 
-; CHECK: LLVM ERROR: Unsupported expression in static initializer: ptrtoint (ptr @r to i64)
+; CHECK: error: unsupported expression in static initializer: ptrtoint (ptr @r to i64)
 
 @r = global i64 ptrtoint (ptr @r to i64)

From 48155f93dd84919432e3276a143f8ef46c247b9a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 16:33:35 +0900
Subject: [PATCH 1238/1322] CodeGen: Emit error if getRegisterByName fails
 (#145194)

This avoids using report_fatal_error and standardizes the error
message in a subset of the error conditions.
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 14 ++-
 .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 47 +++++++---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  7 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  7 +-
 llvm/lib/Target/ARM/ARMISelLowering.cpp       | 10 +--
 .../Target/Hexagon/HexagonISelLowering.cpp    |  5 +-
 llvm/lib/Target/Lanai/LanaiISelLowering.cpp   |  9 +-
 .../LoongArch/LoongArchISelLowering.cpp       |  7 +-
 llvm/lib/Target/Mips/MipsISelLowering.cpp     | 17 ++--
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  3 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  8 +-
 llvm/lib/Target/Sparc/SparcISelLowering.cpp   |  7 +-
 .../Target/SystemZ/SystemZISelLowering.cpp    |  6 +-
 llvm/lib/Target/VE/VEISelLowering.cpp         |  8 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  5 +-
 .../CodeGen/AArch64/arm64-named-reg-alloc.ll  |  8 +-
 .../AArch64/arm64-named-reg-notareg.ll        |  8 +-
 .../AMDGPU/read-register-invalid-register.ll  | 90 +++++++++++++++++++
 .../AMDGPU/write-register-invalid-register.ll | 10 +++
 llvm/test/CodeGen/ARM/named-reg-alloc.ll      |  8 +-
 llvm/test/CodeGen/ARM/named-reg-notareg.ll    |  8 +-
 llvm/test/CodeGen/ARM/special-reg-acore.ll    |  6 +-
 llvm/test/CodeGen/ARM/special-reg-mcore.ll    |  9 +-
 llvm/test/CodeGen/ARM/special-reg-v8m-base.ll |  4 +-
 llvm/test/CodeGen/ARM/special-reg-v8m-main.ll |  4 +-
 .../CodeGen/RISCV/get-register-invalid.ll     |  4 +-
 .../SPARC/reserved-regs-unavailable.ll        |  4 +-
 llvm/test/CodeGen/X86/named-reg-alloc.ll      |  8 +-
 llvm/test/CodeGen/X86/named-reg-notareg.ll    |  8 +-
 29 files changed, 222 insertions(+), 117 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/read-register-invalid-register.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/write-register-invalid-register.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index a28361051b41..e0656141d432 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -9107,8 +9107,18 @@ LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
 
   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
-  if (!PhysReg.isValid())
-    return UnableToLegalize;
+  if (!PhysReg) {
+    const Function &Fn = MF.getFunction();
+    Fn.getContext().diagnose(DiagnosticInfoGenericWithLoc(
+        "invalid register \"" + Twine(RegStr->getString().data()) + "\" for " +
+            (IsRead ? "llvm.read_register" : "llvm.write_register"),
+        Fn, MI.getDebugLoc()));
+    if (IsRead)
+      MIRBuilder.buildUndef(ValReg);
+
+    MI.eraseFromParent();
+    return Legalized;
+  }
 
   if (IsRead)
     MIRBuilder.buildCopy(ValReg, PhysReg);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 4b98d87fcc63..d9b9cf6bcc77 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2460,11 +2460,25 @@ void SelectionDAGISel::Select_READ_REGISTER(SDNode *Op) {
 
   EVT VT = Op->getValueType(0);
   LLT Ty = VT.isSimple() ? getLLTForMVT(VT.getSimpleVT()) : LLT();
-  Register Reg =
-      TLI->getRegisterByName(RegStr->getString().data(), Ty,
-                             CurDAG->getMachineFunction());
-  SDValue New = CurDAG->getCopyFromReg(
-                        Op->getOperand(0), dl, Reg, Op->getValueType(0));
+
+  const MachineFunction &MF = CurDAG->getMachineFunction();
+  Register Reg = TLI->getRegisterByName(RegStr->getString().data(), Ty, MF);
+
+  SDValue New;
+  if (!Reg) {
+    const Function &Fn = MF.getFunction();
+    Fn.getContext().diagnose(DiagnosticInfoGenericWithLoc(
+        "invalid register \"" + Twine(RegStr->getString().data()) +
+            "\" for llvm.read_register",
+        Fn, Op->getDebugLoc()));
+    New =
+        SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0);
+    ReplaceUses(SDValue(Op, 1), Op->getOperand(0));
+  } else {
+    New =
+        CurDAG->getCopyFromReg(Op->getOperand(0), dl, Reg, Op->getValueType(0));
+  }
+
   New->setNodeId(-1);
   ReplaceUses(Op, New.getNode());
   CurDAG->RemoveDeadNode(Op);
@@ -2478,12 +2492,23 @@ void SelectionDAGISel::Select_WRITE_REGISTER(SDNode *Op) {
   EVT VT = Op->getOperand(2).getValueType();
   LLT Ty = VT.isSimple() ? getLLTForMVT(VT.getSimpleVT()) : LLT();
 
-  Register Reg = TLI->getRegisterByName(RegStr->getString().data(), Ty,
-                                        CurDAG->getMachineFunction());
-  SDValue New = CurDAG->getCopyToReg(
-                        Op->getOperand(0), dl, Reg, Op->getOperand(2));
-  New->setNodeId(-1);
-  ReplaceUses(Op, New.getNode());
+  const MachineFunction &MF = CurDAG->getMachineFunction();
+  Register Reg = TLI->getRegisterByName(RegStr->getString().data(), Ty, MF);
+
+  if (!Reg) {
+    const Function &Fn = MF.getFunction();
+    Fn.getContext().diagnose(DiagnosticInfoGenericWithLoc(
+        "invalid register \"" + Twine(RegStr->getString().data()) +
+            "\" for llvm.write_register",
+        Fn, Op->getDebugLoc()));
+    ReplaceUses(SDValue(Op, 0), Op->getOperand(0));
+  } else {
+    SDValue New =
+        CurDAG->getCopyToReg(Op->getOperand(0), dl, Reg, Op->getOperand(2));
+    New->setNodeId(-1);
+    ReplaceUses(Op, New.getNode());
+  }
+
   CurDAG->RemoveDeadNode(Op);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a2c914c6e09c..1f98d69edb47 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11977,12 +11977,9 @@ getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const
     unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
     if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
         !MRI->isReservedReg(MF, Reg))
-      Reg = 0;
+      Reg = Register();
   }
-  if (Reg)
-    return Reg;
-  report_fatal_error(Twine("Invalid register name \""
-                              + StringRef(RegName)  + "\"."));
+  return Reg;
 }
 
 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c187c8ab4543..17c7fb7bb153 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4492,11 +4492,8 @@ Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
                      .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
                      .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
                      .Default(Register());
-
-  if (Reg == AMDGPU::NoRegister) {
-    report_fatal_error(
-        Twine("invalid register name \"" + StringRef(RegName) + "\"."));
-  }
+  if (!Reg)
+    return Reg;
 
   if (!Subtarget->hasFlatScrRegister() &&
       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 4567081fe78d..2400761975f8 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -6166,13 +6166,9 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
 // this table could be generated automatically from RegInfo.
 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                               const MachineFunction &MF) const {
-  Register Reg = StringSwitch<unsigned>(RegName)
-                       .Case("sp", ARM::SP)
-                       .Default(0);
-  if (Reg)
-    return Reg;
-  report_fatal_error(Twine("Invalid register name \""
-                              + StringRef(RegName)  + "\"."));
+  return StringSwitch<Register>(RegName)
+      .Case("sp", ARM::SP)
+      .Default(Register());
 }
 
 // Result is 64 bit value so split into two 32 bit values and return as a
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 078eccaa706a..6534ae938fed 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -329,10 +329,7 @@ Register HexagonTargetLowering::getRegisterByName(
                      .Case("cs0", Hexagon::CS0)
                      .Case("cs1", Hexagon::CS1)
                      .Default(Register());
-  if (Reg)
-    return Reg;
-
-  report_fatal_error("Invalid register name global variable");
+  return Reg;
 }
 
 /// LowerCallResult - Lower the result values of an ISD::CALL into the
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index 7781817aef71..16ea2c8461fa 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -211,7 +211,7 @@ Register LanaiTargetLowering::getRegisterByName(
   const char *RegName, LLT /*VT*/,
   const MachineFunction & /*MF*/) const {
   // Only unallocatable registers should be matched here.
-  Register Reg = StringSwitch<unsigned>(RegName)
+  Register Reg = StringSwitch<Register>(RegName)
                      .Case("pc", Lanai::PC)
                      .Case("sp", Lanai::SP)
                      .Case("fp", Lanai::FP)
@@ -220,11 +220,8 @@ Register LanaiTargetLowering::getRegisterByName(
                      .Case("rr2", Lanai::RR2)
                      .Case("r11", Lanai::R11)
                      .Case("rca", Lanai::RCA)
-                     .Default(0);
-
-  if (Reg)
-    return Reg;
-  report_fatal_error("Invalid register name global variable");
+                     .Default(Register());
+  return Reg;
 }
 
 std::pair<unsigned, const TargetRegisterClass *>
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 99dae6ec3eb0..6946ed554a7e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -7957,11 +7957,10 @@ LoongArchTargetLowering::getRegisterByName(const char *RegName, LLT VT,
   std::pair<StringRef, StringRef> Name = StringRef(RegName).split('$');
   std::string NewRegName = Name.second.str();
   Register Reg = MatchRegisterAltName(NewRegName);
-  if (Reg == LoongArch::NoRegister)
+  if (!Reg)
     Reg = MatchRegisterName(NewRegName);
-  if (Reg == LoongArch::NoRegister)
-    report_fatal_error(
-        Twine("Invalid register name \"" + StringRef(RegName) + "\"."));
+  if (!Reg)
+    return Reg;
   BitVector ReservedRegs = Subtarget.getRegisterInfo()->getReservedRegs(MF);
   if (!ReservedRegs.test(Reg))
     report_fatal_error(Twine("Trying to obtain non-reserved register \"" +
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index e933e97ea370..9b4b33672723 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -4969,17 +4969,14 @@ MipsTargetLowering::getRegisterByName(const char *RegName, LLT VT,
                        .Case("$28", Mips::GP_64)
                        .Case("sp", Mips::SP_64)
                        .Default(Register());
-    if (Reg)
-      return Reg;
-  } else {
-    Register Reg = StringSwitch<Register>(RegName)
-                       .Case("$28", Mips::GP)
-                       .Case("sp", Mips::SP)
-                       .Default(Register());
-    if (Reg)
-      return Reg;
+    return Reg;
   }
-  report_fatal_error("Invalid register name global variable");
+
+  Register Reg = StringSwitch<Register>(RegName)
+                     .Case("$28", Mips::GP)
+                     .Case("sp", Mips::SP)
+                     .Default(Register());
+  return Reg;
 }
 
 MachineBasicBlock *MipsTargetLowering::emitLDR_W(MachineInstr &MI,
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index f502d8570425..421a808de667 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -17984,8 +17984,7 @@ Register PPCTargetLowering::getRegisterByName(const char *RegName, LLT VT,
 
   Register Reg = MatchRegisterName(RegName);
   if (!Reg)
-    report_fatal_error(
-        Twine("Invalid global name register \"" + StringRef(RegName) + "\"."));
+    return Reg;
 
   // FIXME: Unable to generate code for `-O2` but okay for `-O0`.
   // Need followup investigation as to why.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9e568052079c..d7cfdd9905d0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24563,11 +24563,11 @@ Register
 RISCVTargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                        const MachineFunction &MF) const {
   Register Reg = MatchRegisterAltName(RegName);
-  if (Reg == RISCV::NoRegister)
+  if (!Reg)
     Reg = MatchRegisterName(RegName);
-  if (Reg == RISCV::NoRegister)
-    report_fatal_error(
-        Twine("Invalid register name \"" + StringRef(RegName) + "\"."));
+  if (!Reg)
+    return Reg;
+
   BitVector ReservedRegs = Subtarget.getRegisterInfo()->getReservedRegs(MF);
   if (!ReservedRegs.test(Reg) && !Subtarget.isRegisterReservedByUser(Reg))
     report_fatal_error(Twine("Trying to obtain non-reserved register \"" +
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 21ecf3d5ed70..7abe84a2451b 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1162,12 +1162,9 @@ Register SparcTargetLowering::getRegisterByName(const char* RegName, LLT VT,
   // make sure that said register is in the reserve list.
   const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
   if (!TRI->isReservedReg(MF, Reg))
-    Reg = 0;
+    Reg = Register();
 
-  if (Reg)
-    return Reg;
-
-  report_fatal_error("Invalid register name global variable");
+  return Reg;
 }
 
 // Fixup floating point arguments in the ... part of a varargs call.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 1c59b1e63b7b..831618c4eff5 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1713,11 +1713,9 @@ SystemZTargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                                    : SystemZ::NoRegister)
           .Case("r15",
                 Subtarget.isTargetELF() ? SystemZ::R15D : SystemZ::NoRegister)
-          .Default(SystemZ::NoRegister);
+          .Default(Register());
 
-  if (Reg)
-    return Reg;
-  report_fatal_error("Invalid register name global variable");
+  return Reg;
 }
 
 Register SystemZTargetLowering::getExceptionPointerRegister(
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 98c5fdd13898..a0389aef6dce 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -563,12 +563,8 @@ Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
                      .Case("info", VE::SX17)  // Info area register
                      .Case("got", VE::SX15)   // Global offset table register
                      .Case("plt", VE::SX16) // Procedure linkage table register
-                     .Default(0);
-
-  if (Reg)
-    return Reg;
-
-  report_fatal_error("Invalid register name global variable");
+                     .Default(Register());
+  return Reg;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 33083c0eba69..2541182de120 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28312,10 +28312,7 @@ Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
 #endif
   }
 
-  if (Reg)
-    return Reg;
-
-  report_fatal_error("Invalid register name global variable");
+  return Reg;
 }
 
 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
diff --git a/llvm/test/CodeGen/AArch64/arm64-named-reg-alloc.ll b/llvm/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
index bd14ec61b55c..1f18e9266fa9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
@@ -1,11 +1,11 @@
-; RUN: not --crash llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
-; RUN: not --crash llc < %s -mtriple=arm64-linux-gnueabi 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm64-apple-darwin -filetype=null 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm64-linux-gnueabi -filetype=null 2>&1 | FileCheck %s
 
 define i32 @get_stack() nounwind {
 entry:
 ; FIXME: Include an allocatable-specific error message
-; CHECK: Invalid register name "x5".
-	%sp = call i32 @llvm.read_register.i32(metadata !0)
+; CHECK: error: <unknown>:0:0: invalid register "x5" for llvm.read_register
+  %sp = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %sp
 }
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-named-reg-notareg.ll b/llvm/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
index fe5f000a393e..b671026251da 100644
--- a/llvm/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
@@ -1,10 +1,10 @@
-; RUN: not --crash llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
-; RUN: not --crash llc < %s -mtriple=arm64-linux-gnueabi 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm64-linux-gnueabi 2>&1 | FileCheck %s
 
 define i32 @get_stack() nounwind {
 entry:
-; CHECK: Invalid register name "notareg".
-	%sp = call i32 @llvm.read_register.i32(metadata !0)
+; CHECK: error: <unknown>:0:0: invalid register "notareg" for llvm.read_register
+  %sp = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %sp
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-register.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-register.ll
new file mode 100644
index 000000000000..28f9ca8d6576
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-register.ll
@@ -0,0 +1,90 @@
+; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -filetype=null < %s 2>&1 | FileCheck --implicit-check-not=error %s
+; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -filetype=null < %s 2>&1 | FileCheck --implicit-check-not=error %s
+
+declare i32 @llvm.read_register.i32(metadata) #0
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_i1(ptr addrspace(1) %out) nounwind {
+  %reg = call i1 @llvm.read_register.i1(metadata !0)
+  store i1 %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_i16(ptr addrspace(1) %out) nounwind {
+  %reg = call i16 @llvm.read_register.i16(metadata !0)
+  store i16 %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_i32(ptr addrspace(1) %out) nounwind {
+  %reg = call i32 @llvm.read_register.i32(metadata !0)
+  store i32 %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_i64(ptr addrspace(1) %out) nounwind {
+  %reg = call i64 @llvm.read_register.i64(metadata !0)
+  store i64 %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_v2i32(ptr addrspace(1) %out) nounwind {
+  %reg = call <2 x i32> @llvm.read_register.v2i32(metadata !0)
+  store <2 x i32> %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_v3i32(ptr addrspace(1) %out) nounwind {
+  %reg = call <3 x i32> @llvm.read_register.v3i32(metadata !0)
+  store <3 x i32> %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_v4i32(ptr addrspace(1) %out) nounwind {
+  %reg = call <4 x i32> @llvm.read_register.v4i32(metadata !0)
+  store <4 x i32> %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_v5i32(ptr addrspace(1) %out) nounwind {
+  %reg = call <5 x i32> @llvm.read_register.v5i32(metadata !0)
+  store <5 x i32> %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_v6i32(ptr addrspace(1) %out) nounwind {
+  %reg = call <6 x i32> @llvm.read_register.v6i32(metadata !0)
+  store <6 x i32> %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_v8i32(ptr addrspace(1) %out) nounwind {
+  %reg = call <8 x i32> @llvm.read_register.v8i32(metadata !0)
+  store <8 x i32> %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_v16i32(ptr addrspace(1) %out) nounwind {
+  %reg = call <16 x i32> @llvm.read_register.v16i32(metadata !0)
+  store <16 x i32> %reg, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.read_register
+define amdgpu_kernel void @test_invalid_register_v32i32(ptr addrspace(1) %out) nounwind {
+  %reg = call <32 x i32> @llvm.read_register.v32i32(metadata !0)
+  store <32 x i32> %reg, ptr addrspace(1) %out
+  ret void
+}
+
+!0 = !{!"not-a-register"}
diff --git a/llvm/test/CodeGen/AMDGPU/write-register-invalid-register.ll b/llvm/test/CodeGen/AMDGPU/write-register-invalid-register.ll
new file mode 100644
index 000000000000..62d75527a796
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/write-register-invalid-register.ll
@@ -0,0 +1,10 @@
+; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -filetype=null < %s 2>&1 | FileCheck --implicit-check-not=error %s
+; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -filetype=null < %s 2>&1 | FileCheck --implicit-check-not=error %s
+
+; CHECK: error: <unknown>:0:0: invalid register "not-a-register" for llvm.write_register
+define amdgpu_kernel void @test_invalid_write_register_i32() nounwind {
+  call void @llvm.write_register.i32(metadata !0, i32 0)
+  ret void
+}
+
+!0 = !{!"not-a-register"}
diff --git a/llvm/test/CodeGen/ARM/named-reg-alloc.ll b/llvm/test/CodeGen/ARM/named-reg-alloc.ll
index 535149a67455..9325e00dcb01 100644
--- a/llvm/test/CodeGen/ARM/named-reg-alloc.ll
+++ b/llvm/test/CodeGen/ARM/named-reg-alloc.ll
@@ -1,11 +1,11 @@
-; RUN: not --crash llc < %s -mtriple=arm-apple-darwin 2>&1 | FileCheck %s
-; RUN: not --crash llc < %s -mtriple=arm-linux-gnueabi 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm-apple-darwin -filetype=null 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm-linux-gnueabi -filetype=null 2>&1 | FileCheck %s
 
 define i32 @get_stack() nounwind {
 entry:
 ; FIXME: Include an allocatable-specific error message
-; CHECK: Invalid register name "r5".
-	%sp = call i32 @llvm.read_register.i32(metadata !0)
+; CHECK: error: <unknown>:0:0: invalid register "r5" for llvm.read_register
+  %sp = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %sp
 }
 
diff --git a/llvm/test/CodeGen/ARM/named-reg-notareg.ll b/llvm/test/CodeGen/ARM/named-reg-notareg.ll
index 0af948250050..a2eb01629373 100644
--- a/llvm/test/CodeGen/ARM/named-reg-notareg.ll
+++ b/llvm/test/CodeGen/ARM/named-reg-notareg.ll
@@ -1,10 +1,10 @@
-; RUN: not --crash llc < %s -mtriple=arm-apple-darwin 2>&1 | FileCheck %s
-; RUN: not --crash llc < %s -mtriple=arm-linux-gnueabi 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm-linux-gnueabi 2>&1 | FileCheck %s
 
 define i32 @get_stack() nounwind {
 entry:
-; CHECK: Invalid register name "notareg".
-	%sp = call i32 @llvm.read_register.i32(metadata !0)
+; CHECK: error: <unknown>:0:0: invalid register "notareg" for llvm.read_register
+  %sp = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %sp
 }
 
diff --git a/llvm/test/CodeGen/ARM/special-reg-acore.ll b/llvm/test/CodeGen/ARM/special-reg-acore.ll
index 30e59b14685f..64a1a47baaee 100644
--- a/llvm/test/CodeGen/ARM/special-reg-acore.ll
+++ b/llvm/test/CodeGen/ARM/special-reg-acore.ll
@@ -1,7 +1,9 @@
 ; RUN: llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s --check-prefix=ACORE
-; RUN: not --crash llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=MCORE
+; RUN: not llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=MCORE
+
+; MCORE: error: <unknown>:0:0: invalid register "cpsr" for llvm.read_register
+; MCORE: error: <unknown>:0:0: invalid register "spsr_cxsf" for llvm.write_register
 
-; MCORE: LLVM ERROR: Invalid register name "cpsr".
 
 define i32 @read_cpsr() nounwind {
   ; ACORE-LABEL: read_cpsr:
diff --git a/llvm/test/CodeGen/ARM/special-reg-mcore.ll b/llvm/test/CodeGen/ARM/special-reg-mcore.ll
index dff02ce2ea44..1104dc6a81e9 100644
--- a/llvm/test/CodeGen/ARM/special-reg-mcore.ll
+++ b/llvm/test/CodeGen/ARM/special-reg-mcore.ll
@@ -1,9 +1,10 @@
 ; RUN: llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 --show-mc-encoding 2>&1 | FileCheck %s --check-prefix=MCORE
-; RUN: not --crash llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m3 2>&1 | FileCheck %s --check-prefix=M3CORE
-; RUN: not --crash llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s --check-prefix=ACORE
+; RUN: not llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m3 2>&1 | FileCheck %s --check-prefix=M3CORE
+; RUN: not llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s --check-prefix=ACORE
+
+; ACORE: error: <unknown>:0:0: invalid register "control" for llvm.write_register
+; M3CORE: error: <unknown>:0:0: invalid register "xpsr_nzcvqg" for llvm.write_register
 
-; ACORE: LLVM ERROR: Invalid register name "control".
-; M3CORE: LLVM ERROR: Invalid register name "xpsr_nzcvqg".
 
 define i32 @read_mclass_registers() nounwind {
 entry:
diff --git a/llvm/test/CodeGen/ARM/special-reg-v8m-base.ll b/llvm/test/CodeGen/ARM/special-reg-v8m-base.ll
index 5b74a55fe8c8..b7251392b822 100644
--- a/llvm/test/CodeGen/ARM/special-reg-v8m-base.ll
+++ b/llvm/test/CodeGen/ARM/special-reg-v8m-base.ll
@@ -1,7 +1,7 @@
-; RUN: not --crash llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=V7M
+; RUN: not llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=V7M
 ; RUN: llc < %s -mtriple=thumbv8m.base-none-eabi 2>&1 | FileCheck %s
 
-; V7M: LLVM ERROR: Invalid register name "sp_ns".
+; V7M: error: <unknown>:0:0: invalid register "sp_ns" for llvm.read_register
 
 define i32 @read_mclass_registers() nounwind {
 entry:
diff --git a/llvm/test/CodeGen/ARM/special-reg-v8m-main.ll b/llvm/test/CodeGen/ARM/special-reg-v8m-main.ll
index 9a314fa4c771..5ebd7c51d64b 100644
--- a/llvm/test/CodeGen/ARM/special-reg-v8m-main.ll
+++ b/llvm/test/CodeGen/ARM/special-reg-v8m-main.ll
@@ -1,7 +1,7 @@
-; RUN: not --crash llc < %s -mtriple=thumbv8m.base-none-eabi 2>&1 | FileCheck %s --check-prefix=BASELINE
+; RUN: not llc < %s -mtriple=thumbv8m.base-none-eabi 2>&1 | FileCheck %s --check-prefix=BASELINE
 ; RUN: llc < %s -mtriple=thumbv8m.main-none-eabi -mattr=+dsp 2>&1 | FileCheck %s --check-prefix=MAINLINE
 
-; BASELINE: LLVM ERROR: Invalid register name "faultmask_ns".
+; BASELINE: error: <unknown>:0:0: invalid register "faultmask_ns" for llvm.read_register
 
 define i32 @read_mclass_registers() nounwind {
 entry:
diff --git a/llvm/test/CodeGen/RISCV/get-register-invalid.ll b/llvm/test/CodeGen/RISCV/get-register-invalid.ll
index 1f23445b4f73..a86de3e8868f 100644
--- a/llvm/test/CodeGen/RISCV/get-register-invalid.ll
+++ b/llvm/test/CodeGen/RISCV/get-register-invalid.ll
@@ -1,8 +1,8 @@
-; RUN: not --crash llc < %s -mtriple=riscv32 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=riscv32 2>&1 | FileCheck %s
 
 define i32 @get_invalid_reg() nounwind {
 entry:
-; CHECK: Invalid register name "notareg".
+; CHECK: error: <unknown>:0:0: invalid register "notareg" for llvm.read_register
   %reg = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %reg
 }
diff --git a/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll
index 53ca045f1004..45f53a590230 100644
--- a/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll
+++ b/llvm/test/CodeGen/SPARC/reserved-regs-unavailable.ll
@@ -1,9 +1,9 @@
-; RUN: not --crash llc -mtriple=sparc64-linux-gnu -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-L0
+; RUN: not llc -mtriple=sparc64-linux-gnu -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK-RESERVED-L0
 
 ;; Ensure explicit register references for non-reserved registers
 ;; are caught properly.
 
-; CHECK-RESERVED-L0: LLVM ERROR: Invalid register name global variable
+; CHECK-RESERVED-L0: error: <unknown>:0:0: invalid register "l0" for llvm.write_register
 define void @set_reg(i32 zeroext %x) {
 entry:
   tail call void @llvm.write_register.i32(metadata !0, i32 %x)
diff --git a/llvm/test/CodeGen/X86/named-reg-alloc.ll b/llvm/test/CodeGen/X86/named-reg-alloc.ll
index 34c5ea99f94c..464c5e880139 100644
--- a/llvm/test/CodeGen/X86/named-reg-alloc.ll
+++ b/llvm/test/CodeGen/X86/named-reg-alloc.ll
@@ -1,11 +1,11 @@
-; RUN: not --crash llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
-; RUN: not --crash llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
 
 define i32 @get_stack() nounwind {
 entry:
 ; FIXME: Include an allocatable-specific error message
-; CHECK: Invalid register name global variable
-	%sp = call i32 @llvm.read_register.i32(metadata !0)
+; CHECK: error: <unknown>:0:0: invalid register "eax" for llvm.read_register
+  %sp = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %sp
 }
 
diff --git a/llvm/test/CodeGen/X86/named-reg-notareg.ll b/llvm/test/CodeGen/X86/named-reg-notareg.ll
index 6da65e2dfd02..cb3736df9ce5 100644
--- a/llvm/test/CodeGen/X86/named-reg-notareg.ll
+++ b/llvm/test/CodeGen/X86/named-reg-notareg.ll
@@ -1,10 +1,10 @@
-; RUN: not --crash llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
-; RUN: not --crash llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
 
 define i32 @get_stack() nounwind {
 entry:
-; CHECK: Invalid register name global variable
-	%sp = call i32 @llvm.read_register.i32(metadata !0)
+; CHECK: error: <unknown>:0:0: invalid register "notareg" for llvm.read_register
+  %sp = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %sp
 }
 

From 2dcf436340849d757432dc76ac46bf537e10fd8c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 16:49:46 +0900
Subject: [PATCH 1239/1322] AMDGPU: Remove legacy pass manager version of
 AMDGPUAttributor (#145262)

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  3 --
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   | 41 -------------------
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  1 -
 .../AMDGPU/propagate-flat-work-group-size.ll  |  1 -
 4 files changed, 46 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 68a3caf59544..71dd99c0d7a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -98,9 +98,6 @@ void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
 
 void initializeAMDGPUAsmPrinterPass(PassRegistry &);
 
-Pass *createAMDGPUAttributorLegacyPass();
-void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
-
 // DPP/Iterative option enables the atomic optimizer with given strategy
 // whereas None disables the atomic optimizer.
 enum class ScanOptions { DPP, Iterative, None };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index f4d3a014f992..fef22c81c939 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1442,36 +1442,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
 
   return Changed;
 }
-
-class AMDGPUAttributorLegacy : public ModulePass {
-public:
-  AMDGPUAttributorLegacy() : ModulePass(ID) {}
-
-  /// doInitialization - Virtual method overridden by subclasses to do
-  /// any necessary initialization before any pass is run.
-  bool doInitialization(Module &) override {
-    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-    if (!TPC)
-      report_fatal_error("TargetMachine is required");
-
-    TM = &TPC->getTM<TargetMachine>();
-    return false;
-  }
-
-  bool runOnModule(Module &M) override {
-    AnalysisGetter AG(this);
-    return runImpl(M, AG, *TM, /*Options=*/{},
-                   /*LTOPhase=*/ThinOrFullLTOPhase::None);
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<CycleInfoWrapperPass>();
-  }
-
-  StringRef getPassName() const override { return "AMDGPU Attributor"; }
-  TargetMachine *TM;
-  static char ID;
-};
 } // namespace
 
 PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
@@ -1485,14 +1455,3 @@ PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
   return runImpl(M, AG, TM, Options, LTOPhase) ? PreservedAnalyses::none()
                                                : PreservedAnalyses::all();
 }
-
-char AMDGPUAttributorLegacy::ID = 0;
-
-Pass *llvm::createAMDGPUAttributorLegacyPass() {
-  return new AMDGPUAttributorLegacy();
-}
-INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
-INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
-                    false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 88613cf5eb4c..d2e4825cf3c8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -517,7 +517,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUSwLowerLDSLegacyPass(*PR);
-  initializeAMDGPUAttributorLegacyPass(*PR);
   initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
   initializeAMDGPUAtomicOptimizerPass(*PR);
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index d1e1073530df..13605a1f7230 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
 
 ; Check propagation of amdgpu-flat-work-group-size attribute.

From 529662a6b5f1eec832d35786b6c49da1d21513e8 Mon Sep 17 00:00:00 2001
From: Andrei Golubev <andrey.golubev@intel.com>
Date: Mon, 23 Jun 2025 09:50:28 +0200
Subject: [PATCH 1240/1322] [mlir] Allow accessing
 DialectResourceBlobManager::blobMap (#142352)

Add a new API to access all blobs that are stored in the blob manager.
The main purpose (as of now) is to allow users of dialect resources to
iterate over all blobs, especially when the blobs are no longer used in
IR (e.g. the operation that uses the blob is deleted) and thus cannot be
easily accessed without manual tracking of keys.
---
 .../mlir/IR/DialectResourceBlobManager.h      |  7 +-
 mlir/lib/IR/DialectResourceBlobManager.cpp    |  8 ++
 mlir/unittests/IR/BlobManagerTest.cpp         | 74 +++++++++++++++++++
 mlir/unittests/IR/CMakeLists.txt              |  1 +
 4 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 mlir/unittests/IR/BlobManagerTest.cpp

diff --git a/mlir/include/mlir/IR/DialectResourceBlobManager.h b/mlir/include/mlir/IR/DialectResourceBlobManager.h
index e3f32b7a9ab5..6c30efde306e 100644
--- a/mlir/include/mlir/IR/DialectResourceBlobManager.h
+++ b/mlir/include/mlir/IR/DialectResourceBlobManager.h
@@ -93,9 +93,14 @@ public:
     return HandleT(&entry, dialect);
   }
 
+  /// Provide access to all the registered blobs via a callable. During access
+  /// the blob map is guaranteed to remain unchanged.
+  void getBlobMap(llvm::function_ref<void(const llvm::StringMap<BlobEntry> &)>
+                      accessor) const;
+
 private:
   /// A mutex to protect access to the blob map.
-  llvm::sys::SmartRWMutex<true> blobMapLock;
+  mutable llvm::sys::SmartRWMutex<true> blobMapLock;
 
   /// The internal map of tracked blobs. StringMap stores entries in distinct
   /// allocations, so we can freely take references to the data without fear of
diff --git a/mlir/lib/IR/DialectResourceBlobManager.cpp b/mlir/lib/IR/DialectResourceBlobManager.cpp
index b83b31e30ef1..83cc1879241d 100644
--- a/mlir/lib/IR/DialectResourceBlobManager.cpp
+++ b/mlir/lib/IR/DialectResourceBlobManager.cpp
@@ -63,3 +63,11 @@ auto DialectResourceBlobManager::insert(StringRef name,
     nameStorage.resize(name.size() + 1);
   } while (true);
 }
+
+void DialectResourceBlobManager::getBlobMap(
+    llvm::function_ref<void(const llvm::StringMap<BlobEntry> &)> accessor)
+    const {
+  llvm::sys::SmartScopedReader<true> reader(blobMapLock);
+
+  accessor(blobMap);
+}
diff --git a/mlir/unittests/IR/BlobManagerTest.cpp b/mlir/unittests/IR/BlobManagerTest.cpp
new file mode 100644
index 000000000000..d82482ddb793
--- /dev/null
+++ b/mlir/unittests/IR/BlobManagerTest.cpp
@@ -0,0 +1,74 @@
+//===- mlir/unittest/IR/BlobManagerTest.cpp - Blob management unit tests --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../test/lib/Dialect/Test/TestDialect.h"
+#include "mlir/IR/DialectResourceBlobManager.h"
+#include "mlir/Parser/Parser.h"
+
+#include "gtest/gtest.h"
+
+using namespace mlir;
+
+namespace {
+
+StringLiteral moduleStr = R"mlir(
+"test.use1"() {attr = dense_resource<blob1> : tensor<1xi64> } : () -> ()
+
+{-#
+    dialect_resources: {
+    builtin: {
+        blob1: "0x08000000ABCDABCDABCDABCE"
+    }
+    }
+#-}
+)mlir";
+
+TEST(DialectResourceBlobManagerTest, Lookup) {
+  MLIRContext context;
+  context.loadDialect<test::TestDialect>();
+
+  OwningOpRef<ModuleOp> m = parseSourceString<ModuleOp>(moduleStr, &context);
+  ASSERT_TRUE(m);
+
+  const auto &dialectManager =
+      mlir::DenseResourceElementsHandle::getManagerInterface(&context);
+  ASSERT_NE(dialectManager.getBlobManager().lookup("blob1"), nullptr);
+}
+
+TEST(DialectResourceBlobManagerTest, GetBlobMap) {
+  MLIRContext context;
+  context.loadDialect<test::TestDialect>();
+
+  OwningOpRef<ModuleOp> m = parseSourceString<ModuleOp>(moduleStr, &context);
+  ASSERT_TRUE(m);
+
+  Block *block = m->getBody();
+  auto &op = block->getOperations().front();
+  auto resourceAttr = op.getAttrOfType<DenseResourceElementsAttr>("attr");
+  ASSERT_NE(resourceAttr, nullptr);
+
+  const auto &dialectManager =
+      resourceAttr.getRawHandle().getManagerInterface(&context);
+
+  bool blobsArePresent = false;
+  dialectManager.getBlobManager().getBlobMap(
+      [&](const llvm::StringMap<DialectResourceBlobManager::BlobEntry>
+              &blobMap) { blobsArePresent = blobMap.contains("blob1"); });
+  ASSERT_TRUE(blobsArePresent);
+
+  // remove operations that use resources - resources must still be accessible
+  block->clear();
+
+  blobsArePresent = false;
+  dialectManager.getBlobManager().getBlobMap(
+      [&](const llvm::StringMap<DialectResourceBlobManager::BlobEntry>
+              &blobMap) { blobsArePresent = blobMap.contains("blob1"); });
+  ASSERT_TRUE(blobsArePresent);
+}
+
+} // end anonymous namespace
diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt
index 9ab6029c3480..770064486457 100644
--- a/mlir/unittests/IR/CMakeLists.txt
+++ b/mlir/unittests/IR/CMakeLists.txt
@@ -18,6 +18,7 @@ add_mlir_unittest(MLIRIRTests
   TypeAttrNamesTest.cpp
   OpPropertiesTest.cpp
   ValueTest.cpp
+  BlobManagerTest.cpp
 
   DEPENDS
   MLIRTestInterfaceIncGen

From 1c78d8d9d7bcb4b20910047ad7db35f177a17c8c Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Mon, 23 Jun 2025 10:14:33 +0200
Subject: [PATCH 1241/1322] [clang][bytecode] Fix shifts with an allocated RHS
 (#145280)

This was broken before because we ended up using a constructor that was
disabled via assert(false). Use ShiftAP() if either LHS or RHS is
allocated.
---
 clang/lib/AST/ByteCode/IntegralAP.h |  2 +-
 clang/lib/AST/ByteCode/Interp.h     | 63 +++++++++++++++++------------
 clang/test/AST/ByteCode/intap.cpp   | 14 +++++++
 3 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 316c003e0e50..e7499fc9bf5a 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -131,8 +131,8 @@ public:
     if (NumBits == 0)
       NumBits = sizeof(T) * 8;
     assert(NumBits > 0);
+    assert(APInt::getNumWords(NumBits) == 1);
     APInt Copy = APInt(NumBits, static_cast<uint64_t>(Value), Signed);
-    assert(false);
     return IntegralAP<Signed>(Copy);
   }
 
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 66d3e6d79e8b..cb6eb1e76c43 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -2713,7 +2713,7 @@ inline bool RVOPtr(InterpState &S, CodePtr OpPC) {
 template <class LT, class RT, ShiftDir Dir>
 inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
                     LT *Result) {
-
+  static_assert(!needsAlloc<LT>());
   const unsigned Bits = LHS.bitWidth();
 
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
@@ -2770,7 +2770,10 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
       LT::AsUnsigned::shiftLeft(LT::AsUnsigned::from(LHS),
                                 LT::AsUnsigned::from(RHS, Bits), Bits, &R);
     }
-  } else {
+    S.Stk.push<LT>(LT::from(R));
+    return true;
+  }
+
     // Right shift.
     if (Compare(RHS, RT::from(MaxShiftAmount, RHS.bitWidth())) ==
         ComparisonCategoryResult::Greater) {
@@ -2779,10 +2782,8 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
       // Do the shift on potentially signed LT, then convert to unsigned type.
       LT A;
       LT::shiftRight(LHS, LT::from(RHS, Bits), Bits, &A);
-      // LT::shiftRight(LHS, LT(RHSTemp), Bits, &A);
       R = LT::AsUnsigned::from(A);
     }
-  }
 
   S.Stk.push<LT>(LT::from(R));
   return true;
@@ -2790,40 +2791,43 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
 
 /// A version of DoShift that works on IntegralAP.
 template <class LT, class RT, ShiftDir Dir>
-inline bool DoShiftAP(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
-                      LT *Result) {
-  const unsigned Bits = LHS.bitWidth();
-  const APSInt &LHSAP = LHS.toAPSInt();
-  APSInt RHSAP = RHS.toAPSInt();
+inline bool DoShiftAP(InterpState &S, CodePtr OpPC, const APSInt &LHS,
+                      APSInt RHS, LT *Result) {
+  const unsigned Bits = LHS.getBitWidth();
 
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
   if (S.getLangOpts().OpenCL)
-    RHSAP &= APSInt(llvm::APInt(RHSAP.getBitWidth(),
-                                static_cast<uint64_t>(LHSAP.getBitWidth() - 1)),
-                    RHSAP.isUnsigned());
+    RHS &=
+        APSInt(llvm::APInt(RHS.getBitWidth(), static_cast<uint64_t>(Bits - 1)),
+               RHS.isUnsigned());
 
   if (RHS.isNegative()) {
     // During constant-folding, a negative shift is an opposite shift. Such a
     // shift is not a constant expression.
     const SourceInfo &Loc = S.Current->getSource(OpPC);
-    S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
+    S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS; //.toAPSInt();
     if (!S.noteUndefinedBehavior())
       return false;
-    RHS = -RHS;
     return DoShiftAP<LT, RT,
                      Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS, Result);
+        S, OpPC, LHS, -RHS, Result);
   }
 
-  if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
+  if (!CheckShift<Dir>(S, OpPC, static_cast<LT>(LHS), static_cast<RT>(RHS),
+                       Bits))
     return false;
 
+  unsigned SA = (unsigned)RHS.getLimitedValue(Bits - 1);
   if constexpr (Dir == ShiftDir::Left) {
-    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
-    Result->copy(LHSAP << SA);
+    if constexpr (needsAlloc<LT>())
+      Result->copy(LHS << SA);
+    else
+      *Result = LT(LHS << SA);
   } else {
-    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
-    Result->copy(LHSAP >> SA);
+    if constexpr (needsAlloc<LT>())
+      Result->copy(LHS >> SA);
+    else
+      *Result = LT(LHS >> SA);
   }
 
   S.Stk.push<LT>(*Result);
@@ -2837,9 +2841,12 @@ inline bool Shr(InterpState &S, CodePtr OpPC) {
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
 
-  if constexpr (needsAlloc<LT>()) {
-    LT Result = S.allocAP<LT>(LHS.bitWidth());
-    return DoShiftAP<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
+  if constexpr (needsAlloc<LT>() || needsAlloc<RT>()) {
+    LT Result;
+    if constexpr (needsAlloc<LT>())
+      Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Right>(S, OpPC, LHS.toAPSInt(),
+                                              RHS.toAPSInt(), &Result);
   } else {
     LT Result;
     return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
@@ -2852,9 +2859,13 @@ inline bool Shl(InterpState &S, CodePtr OpPC) {
   using RT = typename PrimConv<NameR>::T;
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
-  if constexpr (needsAlloc<LT>()) {
-    LT Result = S.allocAP<LT>(LHS.bitWidth());
-    return DoShiftAP<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
+
+  if constexpr (needsAlloc<LT>() || needsAlloc<RT>()) {
+    LT Result;
+    if constexpr (needsAlloc<LT>())
+      Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Left>(S, OpPC, LHS.toAPSInt(),
+                                             RHS.toAPSInt(), &Result);
   } else {
     LT Result;
     return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
diff --git a/clang/test/AST/ByteCode/intap.cpp b/clang/test/AST/ByteCode/intap.cpp
index 3f952ddf626b..11dd9edb61a9 100644
--- a/clang/test/AST/ByteCode/intap.cpp
+++ b/clang/test/AST/ByteCode/intap.cpp
@@ -273,4 +273,18 @@ namespace IncDec {
 #endif
 }
 
+#if __cplusplus >= 201402L
+const __int128_t a = ( (__int128_t)1 << 64 );
+const _BitInt(72) b = ( 1 << 72 ); // both-warning {{shift count >= width of type}}
+constexpr int shifts() { // both-error {{never produces a constant expression}}
+  (void)(2 >> a); // both-warning {{shift count >= width of type}} \
+                  // both-note {{shift count 18446744073709551616 >= width of type 'int' (32 bits)}}
+  (void)(2 >> b); // ref-warning {{shift count is negative}}
+  (void)(2 << a); // both-warning {{shift count >= width of type}}
+  (void)(2 << b); // ref-warning {{shift count is negative}}
+  return 1;
+}
+#endif
+
+
 #endif

From 4af96a9d83335b3b59f3441af47c879c7a9eb183 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Mon, 23 Jun 2025 09:28:33 +0100
Subject: [PATCH 1242/1322] [MLIR] Determine contiguousness of memrefs with
 dynamic dimensions (#142421)

This patch enhances `MemRefType::areTrailingDimsContiguous` to also
handle memrefs with dynamic dimensions.

The implementation itself is based on a new member function
`MemRefType::getMaxCollapsableTrailingDims` that return the maximum
number of trailing dimensions that can be collapsed - trivially all
dimensions for memrefs with identity layout, or by examining the memref
strides stopping at discontiguous or statically unknown strides.
---
 .../mlir/Dialect/Utils/IndexingUtils.h        |   2 +-
 mlir/include/mlir/IR/BuiltinTypes.td          |  19 +++
 mlir/lib/Dialect/Utils/IndexingUtils.cpp      |   3 +-
 mlir/lib/IR/BuiltinTypes.cpp                  |  50 ++++----
 .../Vector/vector-transfer-flatten.mlir       |  95 +++++++++++++--
 mlir/unittests/IR/CMakeLists.txt              |   1 +
 mlir/unittests/IR/MemrefLayoutTest.cpp        | 111 ++++++++++++++++++
 7 files changed, 249 insertions(+), 32 deletions(-)
 create mode 100644 mlir/unittests/IR/MemrefLayoutTest.cpp

diff --git a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
index 99218f491dde..852407292979 100644
--- a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
@@ -40,7 +40,7 @@ class ArrayAttr;
 /// Assuming `sizes` is `[s0, .. sn]`, return the vector<int64_t>
 ///   `[s1 * ... * sn, s2 * ... * sn, ..., sn, 1]`.
 ///
-/// `sizes` elements are asserted to be non-negative.
+/// `sizes` elements `s1` to `sn` are asserted to be non-negative.
 ///
 /// Return an empty vector if `sizes` is empty.
 SmallVector<int64_t> computeSuffixProduct(ArrayRef<int64_t> sizes);
diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td
index 89ade79a3ac0..a0c8acea91dc 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.td
+++ b/mlir/include/mlir/IR/BuiltinTypes.td
@@ -839,6 +839,25 @@ def Builtin_MemRef : Builtin_Type<"MemRef", "memref", [
     ///
     bool areTrailingDimsContiguous(int64_t n);
 
+    /// Return the number of trailing dimensions that are contiguous.
+    ///
+    /// Examples:
+    ///   - memref<5x3x2xi8, strided<[6,2,1]>>, the number of collapsable
+    ///     trailing dimensions is 3
+    ///   - memref<5x3x2xi8, strided<[12,2,1]>>, the number of collapsable
+    ///     trailing dimensions is 2 (dimension 0 is non-contiguous)
+    ///   - memref<5x3x2xi8, strided<[12,4,1]>>, the number of collapsable
+    ///     trailing dimensions is 1 (dimension 1 is non-contiguous)
+    ///   - memref<5x3x2xi8, strided<[12,4,2]>>, the number of collapsable
+    ///     trailing dimensions is 0 (dimension 2 is non-contiguous)
+    ///   - memref<?x3x2xi8, strided<[6,2,1]>>, the number of collapsable
+    ///     trailing dimensions is 3
+    ///   - memref<?x3x2xi8, strided<[12,2,1]>>, the number of collapsable
+    ///     trailing dimensions is 2 (dimension 0 is non-contiguous)
+    ///   - memref<5x?x2xi8, strided<[?,2,1]>>, the number of collapsable
+    ///     trailing dimensions is 2 (stride 0 is dynamic)
+    int64_t getNumContiguousTrailingDims();
+
     /// Return a version of this type with identity layout if it can be
     /// determined statically that the layout is the canonical contiguous
     /// strided layout. Otherwise pass the layout into `simplifyAffineMap`
diff --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
index 8de77e2c3cb0..e1648ab99ff2 100644
--- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp
+++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
@@ -69,7 +69,8 @@ SmallVector<ExprType> delinearizeImpl(ExprType linearIndex,
 //===----------------------------------------------------------------------===//
 
 SmallVector<int64_t> mlir::computeSuffixProduct(ArrayRef<int64_t> sizes) {
-  assert(llvm::all_of(sizes, [](int64_t s) { return s >= 0; }) &&
+  assert((sizes.empty() ||
+          llvm::all_of(sizes.drop_front(), [](int64_t s) { return s >= 0; })) &&
          "sizes must be nonnegative");
   int64_t unit = 1;
   return ::computeSuffixProductImpl(sizes, unit);
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index e3a00ac5a14b..6661efa8907b 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -660,35 +660,45 @@ LogicalResult MemRefType::verify(function_ref<InFlightDiagnostic()> emitError,
 }
 
 bool MemRefType::areTrailingDimsContiguous(int64_t n) {
-  if (!isLastDimUnitStride())
-    return false;
+  assert(n <= getRank() &&
+         "number of dimensions to check must not exceed rank");
+  return n <= getNumContiguousTrailingDims();
+}
 
-  auto memrefShape = getShape().take_back(n);
-  if (ShapedType::isDynamicShape(memrefShape))
-    return false;
+int64_t MemRefType::getNumContiguousTrailingDims() {
+  const int64_t n = getRank();
 
+  // memrefs with identity layout are entirely contiguous.
   if (getLayout().isIdentity())
-    return true;
+    return n;
 
+  // Get the strides (if any). Failing to do that, conservatively assume a
+  // non-contiguous layout.
   int64_t offset;
-  SmallVector<int64_t> stridesFull;
-  if (!succeeded(getStridesAndOffset(stridesFull, offset)))
-    return false;
-  auto strides = ArrayRef<int64_t>(stridesFull).take_back(n);
+  SmallVector<int64_t> strides;
+  if (!succeeded(getStridesAndOffset(strides, offset)))
+    return 0;
 
-  if (strides.empty())
-    return true;
+  ArrayRef<int64_t> shape = getShape();
 
-  // Check whether strides match "flattened" dims.
-  SmallVector<int64_t> flattenedDims;
-  auto dimProduct = 1;
-  for (auto dim : llvm::reverse(memrefShape.drop_front(1))) {
-    dimProduct *= dim;
-    flattenedDims.push_back(dimProduct);
+  // A memref with dimensions `d0, d1, ..., dn-1` and strides
+  // `s0, s1, ..., sn-1` is contiguous up to dimension `k`
+  // if each stride `si` is the product of the dimensions `di+1, ..., dn-1`,
+  // for `i` in `[k, n-1]`.
+  // Ignore stride elements if the corresponding dimension is 1, as they are
+  // of no consequence.
+  int64_t dimProduct = 1;
+  for (int64_t i = n - 1; i >= 0; --i) {
+    if (shape[i] == 1)
+      continue;
+    if (strides[i] != dimProduct)
+      return n - i - 1;
+    if (shape[i] == ShapedType::kDynamic)
+      return n - i;
+    dimProduct *= shape[i];
   }
 
-  strides = strides.drop_back(1);
-  return llvm::equal(strides, llvm::reverse(flattenedDims));
+  return n;
 }
 
 MemRefType MemRefType::canonicalizeStridedLayout() {
diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
index e840dc6bbf22..45873aa93153 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
@@ -188,9 +188,35 @@ func.func @transfer_read_leading_dynamic_dims(
 
 // -----
 
-// One of the dims to be flattened is dynamic - not supported ATM.
+// The vector is a non-contiguous slice of the input
+// memref.
 
 func.func @negative_transfer_read_dynamic_dim_to_flatten(
+    %mem : memref<4x?x?x2xi8>) -> vector<2x2x2xi8> {
+
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0 : i8
+  %res = vector.transfer_read %mem[%c0, %c0, %c0, %c0], %cst :
+    memref<4x?x?x2xi8>, vector<2x2x2xi8>
+  return %res : vector<2x2x2xi8>
+}
+
+// CHECK-LABEL: func.func @negative_transfer_read_dynamic_dim_to_flatten(
+// CHECK-NOT: memref.collapse_shape
+// CHECK-NOT: vector.shape_cast
+
+// CHECK-128B-LABEL: func @negative_transfer_read_dynamic_dim_to_flatten(
+//   CHECK-128B-NOT:   memref.collapse_shape
+
+// -----
+
+// When collapsing memref dimensions, we may include the rightmost dynamic
+// dimension (e.g., at position `k`) provided that the strides for dimensions
+// `k+1`, `k+2`, etc., ensure contiguity in memory. The stride at position `k`
+// itself does not factor into this. (Here "strides" mean both explicit and
+// implied by identity map)
+
+func.func @transfer_read_dynamic_dim_to_flatten(
     %idx_1: index,
     %idx_2: index,
     %mem: memref<1x?x4x6xi32>) -> vector<1x2x6xi32> {
@@ -203,11 +229,25 @@ func.func @negative_transfer_read_dynamic_dim_to_flatten(
   return %res : vector<1x2x6xi32>
 }
 
-// CHECK-LABEL: func.func @negative_transfer_read_dynamic_dim_to_flatten
-// CHECK-NOT: memref.collapse_shape
-// CHECK-NOT: vector.shape_cast
+// CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)>
 
-// CHECK-128B-LABEL: func @negative_transfer_read_dynamic_dim_to_flatten
+// CHECK-LABEL: func.func @transfer_read_dynamic_dim_to_flatten
+// CHECK-SAME:    %[[IDX_1:arg0]]
+// CHECK-SAME:    %[[IDX_2:arg1]]
+// CHECK-SAME:    %[[MEM:arg2]]
+// CHECK:              %[[C0_I32:.*]] = arith.constant 0 : i32
+// CHECK:              %[[C0:.*]] = arith.constant 0 : index
+// CHECK:              %[[COLLAPSED:.*]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}:  [[0], [1, 2, 3]]
+// CHECK-SAME:           memref<1x?x4x6xi32> into memref<1x?xi32>
+// CHECK:              %[[COLLAPSED_IDX:.*]] = affine.apply #[[$MAP]]()[%[[IDX_1]], %[[IDX_2]]]
+// CHECK:              %[[VEC_1D:.*]] = vector.transfer_read %[[COLLAPSED]][%[[C0]], %[[COLLAPSED_IDX]]],
+// CHECK-SAME:           %[[C0_I32]] {in_bounds = [true]} : memref<1x?xi32>, vector<12xi32>
+// CHECK:              %[[RESULT:.*]] = vector.shape_cast %[[VEC_1D]] : vector<12xi32> to vector<1x2x6xi32>
+// CHECK:              return %[[RESULT]] : vector<1x2x6xi32>
+
+
+// CHECK-128B-LABEL: func @transfer_read_dynamic_dim_to_flatten
 //   CHECK-128B-NOT:   memref.collapse_shape
 
 // -----
@@ -451,9 +491,31 @@ func.func @transfer_write_leading_dynamic_dims(
 
 // -----
 
-// One of the dims to be flattened is dynamic - not supported ATM.
+// The vector is a non-contiguous slice of the input
+// memref.
 
 func.func @negative_transfer_write_dynamic_to_flatten(
+    %mem : memref<4x?x?x2xi8>,
+    %vec : vector<2x2x2xi8>) {
+
+  %c0 = arith.constant 0 : index
+  vector.transfer_write  %vec, %mem[%c0, %c0, %c0, %c0]
+    : vector<2x2x2xi8>, memref<4x?x?x2xi8>
+  return
+}
+
+// CHECK-LABEL: func.func @negative_transfer_write_dynamic_to_flatten(
+// CHECK-NOT: memref.collapse_shape
+// CHECK-NOT: vector.shape_cast
+
+// CHECK-128B-LABEL: func @negative_transfer_write_dynamic_to_flatten(
+//   CHECK-128B-NOT:   memref.collapse_shape
+
+// -----
+
+// See the comment in front of @transfer_read_dynamic_dim_to_flatten.
+
+func.func @transfer_write_dynamic_dim_to_flatten(
     %idx_1: index,
     %idx_2: index,
     %vec : vector<1x2x6xi32>,
@@ -466,11 +528,24 @@ func.func @negative_transfer_write_dynamic_to_flatten(
   return
 }
 
-// CHECK-LABEL: func.func @negative_transfer_write_dynamic_to_flatten
-// CHECK-NOT: memref.collapse_shape
-// CHECK-NOT: vector.shape_cast
+// CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)>
 
-// CHECK-128B-LABEL: func @negative_transfer_write_dynamic_to_flatten
+// CHECK-LABEL: func.func @transfer_write_dynamic_dim_to_flatten
+// CHECK-SAME:    %[[IDX_1:arg0]]: index
+// CHECK-SAME:    %[[IDX_2:arg1]]: index
+// CHECK-SAME:    %[[VEC:arg2]]: vector<1x2x6xi32>
+// CHECK-SAME:    %[[MEM:arg3]]: memref<1x?x4x6xi32>
+
+// CHECK:              %[[C0:.*]] = arith.constant 0 : index
+// CHECK:              %[[COLLAPSED_MEM:.*]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}:  [[0], [1, 2, 3]]
+// CHECK-SAME:           : memref<1x?x4x6xi32> into memref<1x?xi32>
+// CHECK:              %[[COLLAPSED_IDX:.*]] = affine.apply #[[$MAP]]()[%[[IDX_1]], %[[IDX_2]]]
+// CHECK:              %[[VEC_1D:.*]] = vector.shape_cast %[[VEC]] : vector<1x2x6xi32> to vector<12xi32>
+// CHECK:              vector.transfer_write %[[VEC_1D]], %[[COLLAPSED_MEM]][%[[C0]], %[[COLLAPSED_IDX]]]
+// CHECK-SAME:           {in_bounds = [true]} : vector<12xi32>, memref<1x?xi32>
+
+// CHECK-128B-LABEL: func @transfer_write_dynamic_dim_to_flatten
 //   CHECK-128B-NOT:   memref.collapse_shape
 
 // -----
diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt
index 770064486457..d22afb3003e7 100644
--- a/mlir/unittests/IR/CMakeLists.txt
+++ b/mlir/unittests/IR/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_unittest(MLIRIRTests
   IRMapping.cpp
   InterfaceAttachmentTest.cpp
   LocationTest.cpp
+  MemrefLayoutTest.cpp
   OperationSupportTest.cpp
   PatternMatchTest.cpp
   ShapedTypeTest.cpp
diff --git a/mlir/unittests/IR/MemrefLayoutTest.cpp b/mlir/unittests/IR/MemrefLayoutTest.cpp
new file mode 100644
index 000000000000..f243a76ee660
--- /dev/null
+++ b/mlir/unittests/IR/MemrefLayoutTest.cpp
@@ -0,0 +1,111 @@
+//===- LayoutTest.cpp - unit tests related to memref layout ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace mlir::memref;
+
+//
+// Test the correctness of `memref::getNumContiguousTrailingDims`
+//
+TEST(MemRefLayout, numContigDim) {
+  MLIRContext ctx;
+  OpBuilder b(&ctx);
+
+  const int64_t _ = ShapedType::kDynamic;
+  const FloatType f32 = b.getF32Type();
+  auto strided = [&ctx](ArrayRef<int64_t> s) {
+    return StridedLayoutAttr::get(&ctx, 0, s);
+  };
+
+  // Special case for identity maps and no explicit `strided` attribute - the
+  // memref is entirely contiguous even if the strides cannot be determined
+  // statically.
+
+  // memref<?x?x?xf32>
+  auto m0 = MemRefType::get({_, _, _}, f32);
+  EXPECT_EQ(m0.getNumContiguousTrailingDims(), 3);
+
+  // Conservatively assume memref is sparse everywhere if cannot get the
+  // strides.
+
+  // memref<2x2x2xf32, (i,j,k)->(i,k,j)>
+  auto m1 = MemRefType::get(
+      {2, 2, 2}, f32,
+      AffineMap::getPermutationMap(ArrayRef<int64_t>{0, 2, 1}, &ctx));
+  EXPECT_EQ(m1.getNumContiguousTrailingDims(), 0);
+
+  // A base cases of a fixed memref with the usual strides.
+
+  // memref<2x2x2xf32, strided<[4, 2, 1]>>
+  auto m3 = MemRefType::get({2, 2, 2}, f32, strided({4, 2, 1}));
+  EXPECT_EQ(m3.getNumContiguousTrailingDims(), 3);
+
+  // A fixed memref with a discontinuity in the rightmost dimension.
+
+  // memref<2x2x2xf32, strided<[8, 4, 2]>>
+  auto m4 = MemRefType::get({2, 2, 2}, f32, strided({8, 4, 2}));
+  EXPECT_EQ(m4.getNumContiguousTrailingDims(), 0);
+
+  // A fixed memref with a discontinuity in the "middle".
+
+  // memref<2x2x2xf32, strided<[8, 2, 1]>>
+  auto m5 = MemRefType::get({2, 2, 2}, f32, strided({8, 2, 1}));
+  EXPECT_EQ(m5.getNumContiguousTrailingDims(), 2);
+
+  // A dynamic memref where the dynamic dimension breaks continuity.
+
+  // memref<2x?x2xf32, strided<[4, 2, 1]>>
+  auto m6 = MemRefType::get({2, _, 2}, f32, strided({4, 2, 1}));
+  EXPECT_EQ(m6.getNumContiguousTrailingDims(), 2);
+
+  // A edge case of a dynamic memref where the dynamic dimension is the first
+  // one.
+
+  // memref<?x2x2xf32, strided<[4, 2, 1]>>
+  auto m7 = MemRefType::get({2, _, 2}, f32, strided({4, 2, 1}));
+  EXPECT_EQ(m7.getNumContiguousTrailingDims(), 2);
+
+  // A memref with a unit dimension. Unit dimensions do not affect continuity,
+  // even if the corresponding stride is dynamic.
+
+  // memref<2x1x2xf32, strided<[2,?,1]>>
+  auto m8 = MemRefType::get({2, 1, 2}, f32, strided({2, _, 1}));
+  EXPECT_EQ(m8.getNumContiguousTrailingDims(), 3);
+}
+
+//
+// Test the member function `memref::areTrailingDimsContiguous`
+//
+TEST(MemRefLayout, contigTrailingDim) {
+  MLIRContext ctx;
+  OpBuilder b(&ctx);
+
+  const int64_t _ = ShapedType::kDynamic;
+  const FloatType f32 = b.getF32Type();
+  auto strided = [&ctx](ArrayRef<int64_t> s) {
+    return StridedLayoutAttr::get(&ctx, 0, s);
+  };
+
+  // A not-entirely-continuous, not-entirely-discontinuous memref.
+  // ensure `areTrailingDimsContiguous` returns `true` for the value
+  // returned by `getNumContiguousTrailingDims` and `false` for the next bigger
+  // number.
+
+  // memref<2x?x2xf32, strided<[?,2,1]>>
+  auto m = MemRefType::get({2, _, 2}, f32, strided({_, 2, 1}));
+  int64_t n = m.getNumContiguousTrailingDims();
+  EXPECT_TRUE(m.areTrailingDimsContiguous(n));
+  ASSERT_TRUE(n + 1 <= m.getRank());
+  EXPECT_FALSE(m.areTrailingDimsContiguous(n + 1));
+}

From b31413a96603cd904281368b6f5f8e36836a7cac Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Mon, 23 Jun 2025 09:38:01 +0100
Subject: [PATCH 1243/1322] [MLIR][AArch64] Simplify
 LowerContractionToSVEI8MMPattern.cpp:getExtOperand (NFC) (#144909)

Just recently learned about `isSignlessInteger`, use that instead of
comparing to types obtained via `rewriter.getI<N>Type()`.
It also makes it closer to a similar function in
`LowerContractionToNeonI8MMPattern.cpp` (formerly `LowerContractionToSMMLAPattern.cpp`)
which would help a potential effort to unify these patterns.
---
 .../LowerContractionToSVEI8MMPattern.cpp      | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp
index b1233c5c06eb..a1209fe8230e 100644
--- a/mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp
+++ b/mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp
@@ -39,7 +39,7 @@ namespace {
 //
 // Return success only for extensions from `i8` to `i32`.
 template <typename Op>
-std::optional<Value> getExtOperand(Value v, Type i8Ty, Type i32Ty) {
+std::optional<Value> getExtOperand(Value v) {
 
   static_assert(llvm::is_one_of<Op, arith::ExtSIOp, arith::ExtUIOp>::value,
                 "Must be instantiated with either sign- or zero- extension op");
@@ -50,7 +50,7 @@ std::optional<Value> getExtOperand(Value v, Type i8Ty, Type i32Ty) {
   if (!extOp) {
     if constexpr (std::is_same<Op, arith::ExtSIOp>::value) {
       auto vTy = cast<VectorType>(v.getType());
-      if (vTy.getElementType() != i8Ty)
+      if (!vTy.getElementType().isSignlessInteger(8))
         return {};
       return v;
     }
@@ -61,11 +61,11 @@ std::optional<Value> getExtOperand(Value v, Type i8Ty, Type i32Ty) {
   // operation type, check it's extended from `i8` to `i32`.
   auto inOp = extOp.getIn();
   auto inTy = dyn_cast<VectorType>(inOp.getType());
-  if (!inTy || inTy.getElementType() != i8Ty)
+  if (!inTy || !inTy.getElementType().isSignlessInteger(8))
     return {};
 
   auto outTy = dyn_cast<VectorType>(extOp.getType());
-  if (!outTy || outTy.getElementType() != i32Ty)
+  if (!outTy || !outTy.getElementType().isSignlessInteger(32))
     return {};
 
   return inOp;
@@ -199,27 +199,23 @@ public:
     // operands are supported, but they are lowered to different operations.
     // Determine which is the appropriate operation to lower to.
     MMLA mmlaOp = MMLA::Signed;
-    auto maybeLhs = getExtOperand<arith::ExtSIOp>(
-        op.getLhs(), rewriter.getI8Type(), rewriter.getI32Type());
+    auto maybeLhs = getExtOperand<arith::ExtSIOp>(op.getLhs());
     if (!maybeLhs) {
       mmlaOp = MMLA::Unsigned;
-      maybeLhs = getExtOperand<arith::ExtUIOp>(
-          op.getLhs(), rewriter.getI8Type(), rewriter.getI32Type());
+      maybeLhs = getExtOperand<arith::ExtUIOp>(op.getLhs());
     }
     if (!maybeLhs)
       return rewriter.notifyMatchFailure(
           op, "LHS is not a sign- or zero- extended i8");
 
-    auto maybeRhs = getExtOperand<arith::ExtSIOp>(
-        op.getRhs(), rewriter.getI8Type(), rewriter.getI32Type());
+    auto maybeRhs = getExtOperand<arith::ExtSIOp>(op.getRhs());
     if (maybeRhs) {
       if (mmlaOp == MMLA::Unsigned)
         mmlaOp = MMLA::Mixed;
     } else {
       if (mmlaOp == MMLA::Signed)
         mmlaOp = MMLA::MixedSwapped;
-      maybeRhs = getExtOperand<arith::ExtUIOp>(
-          op.getRhs(), rewriter.getI8Type(), rewriter.getI32Type());
+      maybeRhs = getExtOperand<arith::ExtUIOp>(op.getRhs());
     }
     if (!maybeRhs)
       return rewriter.notifyMatchFailure(

From 98a6fed0965702619162243be263626380a6c0f6 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Mon, 23 Jun 2025 09:38:39 +0100
Subject: [PATCH 1244/1322] [mlir][tosa] Allow zero-points to be unranked
 (#143770)

This commit allows zero-points used by a number of tosa operations to be
unranked. This allows the shape inference pass to propagate shape
information.
---
 mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td |  2 +-
 mlir/test/Dialect/Tosa/invalid.mlir                |  2 +-
 mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir      | 11 +++++++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index 536551c8f843..1cfe6eee576b 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -152,7 +152,7 @@ def Tosa_Rank0Tensor : TosaTensorRankOf<[Tosa_AnyNumber], [0]>;
 
 def Tosa_ScalarTensor : TosaScalarTensorOf<[Tosa_AnyNumber], [1]>;
 def Tosa_ScalarInt8Tensor : TosaScalarTensorOf<[Tosa_Int8], [1]>;
-def Tosa_ScalarIntOrFloatTensor : TosaScalarTensorOf<[Tosa_Int, AnyFloat], [1]>;
+def Tosa_ScalarIntOrFloatTensor : AnyTypeOf<[TosaUnrankedTensorOf<[Tosa_Int, AnyFloat]>, TosaScalarTensorOf<[Tosa_Int, AnyFloat], [1]>]>;
 
 // We include unranked tensors as a supported type for all possible tosa
 // Tensors as unranked does not guarantee invalid. If unranked tensors exist
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index 805522799a6d..6fa6756822ac 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -1007,7 +1007,7 @@ func.func @test_pad_rank0_pad_const(%arg0: tensor<13x21x3xf8E4M3FN>) -> tensor<1
 func.func @test_conv2d_rank0_zp(%arg0: tensor<1x29x29x4xi8>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi32> {
   %input_zp = "tosa.const"() <{values = dense<0> : tensor<i8>}> : () -> tensor<i8>
   %weight_zp = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
-  // expected-error@+1 {{'tosa.conv2d' op operand #3 must be tosa-conformant scalar tensor of unsigned integer or signless integer or floating-point values, but got 'tensor<i8>'}}
+  // expected-error@+1 {{'tosa.conv2d' op operand #3 must be tosa-conformant unranked tensor of unsigned integer or signless integer or floating-point values or tosa-conformant scalar tensor of unsigned integer or signless integer or floating-point values, but got 'tensor<i8>'}}
   %0 = tosa.conv2d %arg0, %arg1, %arg2, %input_zp, %weight_zp {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<1x29x29x4xi8>, tensor<16x3x3x4xi8>, tensor<16xi8>, tensor<i8>, tensor<1xi8>) -> tensor<1x27x27x16xi32>
   return %0 : tensor<1x27x27x16xi32>
diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
index 18409d24fbc1..e280c1155f52 100644
--- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
@@ -333,6 +333,17 @@ func.func @test_dynamic_mixed_matmul(%arg0 : tensor<?x3x?xi32>, %arg1 : tensor<?
 
 // -----
 
+// CHECK-LABEL: @test_unranked_zero_points_matmul
+func.func @test_unranked_zero_points_matmul(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x3x4xf32>, %zero_point: tensor<1xf32>) -> tensor<1x2x4xf32> {
+    // CHECK: %[[ZP:.*]] = tosa.cast %arg2 : (tensor<1xf32>) -> tensor<1xf32>
+    %zero_point_unranked = "tosa.cast"(%zero_point) : (tensor<1xf32>) -> tensor<*xf32>
+    // CHECK: tosa.matmul %arg0, %arg1, %[[ZP]], %[[ZP]] : (tensor<1x2x3xf32>, tensor<1x3x4xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x2x4xf32>
+    %0 = tosa.matmul %arg0, %arg1, %zero_point_unranked, %zero_point_unranked : (tensor<1x2x3xf32>, tensor<1x3x4xf32>, tensor<*xf32>, tensor<*xf32>)  -> tensor<1x2x4xf32>
+    return %0 : tensor<1x2x4xf32>
+}
+
+// -----
+
 // CHECK-LABEL: @test_table_static
 func.func @test_table_static(%arg0 : tensor<4x5xi16>, %arg1 : tensor<513xi16>) -> () {
   // CHECK:tosa.table %arg0, %arg1 : (tensor<4x5xi16>, tensor<513xi16>) -> tensor<4x5xi16>

From 43260b01dd6e46508fe8c650df6818de64060070 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Mon, 23 Jun 2025 10:43:47 +0200
Subject: [PATCH 1245/1322] [libc++] Add test to ensure that the mangling of
 types stays the same (#143556)

---
 libcxx/test/libcxx/mangled_names.pass.cpp | 74 +++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 libcxx/test/libcxx/mangled_names.pass.cpp

diff --git a/libcxx/test/libcxx/mangled_names.pass.cpp b/libcxx/test/libcxx/mangled_names.pass.cpp
new file mode 100644
index 000000000000..210c40809781
--- /dev/null
+++ b/libcxx/test/libcxx/mangled_names.pass.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// We're using `string::starts_with` in this test
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Make sure that the mangling of our public types stays the same
+
+// UNSUPPORTED: no-rtti, msvc
+
+#include <cassert>
+#include <charconv>
+#include <iostream>
+#include <map>
+#include <typeinfo>
+#include <string>
+#include <string_view>
+
+template <class>
+struct mangling {};
+
+struct test_struct {};
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+struct ns_mangling {};
+_LIBCPP_END_NAMESPACE_STD
+
+namespace std::__name {
+struct ns_mangling {};
+} // namespace std::__name
+
+namespace std::__long_name_to_make_sure_multiple_digits_work {
+struct ns_mangling {};
+} // namespace std::__long_name_to_make_sure_multiple_digits_work
+
+std::string get_std_inline_namespace_mangling(const std::type_info& info) {
+  std::string name = info.name();
+  assert(name.starts_with("NSt"));
+  unsigned name_len;
+  auto res = std::from_chars(name.data() + 3, name.data() + name.size(), name_len);
+  assert(res.ec == std::errc{});
+  return std::move(name).substr(0, (res.ptr + name_len) - name.data());
+}
+
+void expect_mangling(const std::type_info& info, std::string expected_name) {
+  if (expected_name != info.name())
+    std::__libcpp_verbose_abort("Expected: '%s'\n     Got: '%s'\n", expected_name.c_str(), info.name());
+}
+
+// Mangling names are really long, but splitting it up into multiple lines doesn't make it any more readable
+// clang-format off
+int main(int, char**) {
+  // self-test inline namespace recovery
+  assert(get_std_inline_namespace_mangling(typeid(std::__name::ns_mangling)) == "NSt6__name");
+  assert(get_std_inline_namespace_mangling(typeid(std::__long_name_to_make_sure_multiple_digits_work::ns_mangling)) == "NSt45__long_name_to_make_sure_multiple_digits_work");
+
+  // selftest
+  expect_mangling(typeid(test_struct), "11test_struct");
+
+  std::string ns_std = get_std_inline_namespace_mangling(typeid(std::ns_mangling));
+  std::string ptrdiff = typeid(std::ptrdiff_t).name();
+
+  // std::map
+  expect_mangling(typeid(std::map<int, int>), ns_std + "3mapIiiNS_4lessIiEENS_9allocatorINS_4pairIKiiEEEEEE");
+  expect_mangling(typeid(std::map<int, int>::iterator), ns_std + "14__map_iteratorINS_15__tree_iteratorINS_12__value_typeIiiEEPNS_11__tree_nodeIS3_PvEE" + ptrdiff +"EEEE");
+  expect_mangling(typeid(std::map<int, int>::const_iterator), ns_std + "20__map_const_iteratorINS_21__tree_const_iteratorINS_12__value_typeIiiEEPNS_11__tree_nodeIS3_PvEE" + ptrdiff + "EEEE");
+
+  return 0;
+}

From be000986322ebb796ec7127629a865897dede966 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Mon, 23 Jun 2025 10:47:47 +0200
Subject: [PATCH 1246/1322] [libc++] Remove a bunch of now unnecessary
 indirections in __tree (#142397)

Most notably, this removes the notion of a distinct `value_type` and
`__container_value_type` from `__tree`, since these are now always the
same type. There are a few places we need to keep `__value_type` around,
since they are ABI visibile. In these cases `_Tp` is used directly. The
second simplification here is that we use `const value_type&` instead of
`const key_type&` in a few places and make use of the fact that the
comparator is capable of comparing any combination of `key_type` and
`value_type`.

This is a follow-up to #134819.
---
 libcxx/include/__tree                         | 169 +++++++-----------
 libcxx/include/map                            |  26 ++-
 .../tree_key_value_traits.pass.cpp            |  56 ------
 3 files changed, 79 insertions(+), 172 deletions(-)
 delete mode 100644 libcxx/test/libcxx/containers/associative/tree_key_value_traits.pass.cpp

diff --git a/libcxx/include/__tree b/libcxx/include/__tree
index 403cfe1ba403..25d098351d57 100644
--- a/libcxx/include/__tree
+++ b/libcxx/include/__tree
@@ -35,6 +35,7 @@
 #include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
+#include <__type_traits/remove_const.h>
 #include <__type_traits/remove_const_ref.h>
 #include <__type_traits/remove_cvref.h>
 #include <__utility/forward.h>
@@ -504,29 +505,6 @@ struct __is_tree_value_type : false_type {};
 template <class _One>
 struct __is_tree_value_type<_One> : __is_tree_value_type_imp<__remove_cvref_t<_One> > {};
 
-template <class _Tp>
-struct __tree_key_value_types {
-  typedef _Tp key_type;
-  typedef _Tp __container_value_type;
-  static const bool __is_map = false;
-
-  _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(_Tp const& __v) { return __v; }
-};
-
-template <class _Key, class _Tp>
-struct __tree_key_value_types<__value_type<_Key, _Tp> > {
-  typedef _Key key_type;
-  typedef _Tp mapped_type;
-  typedef pair<const _Key, _Tp> __container_value_type;
-  typedef __container_value_type __map_value_type;
-  static const bool __is_map = true;
-
-  template <class _Up, __enable_if_t<__is_same_uncvref<_Up, __container_value_type>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(_Up& __t) {
-    return __t.first;
-  }
-};
-
 template <class _VoidPtr>
 struct __tree_node_base_types {
   typedef _VoidPtr __void_pointer;
@@ -555,16 +533,19 @@ private:
                 "_VoidPtr does not point to unqualified void type");
 };
 
-template <class _Tp, class _AllocPtr, class _KVTypes = __tree_key_value_types<_Tp>, bool = _KVTypes::__is_map>
-struct __tree_map_pointer_types {};
-
-template <class _Tp, class _AllocPtr, class _KVTypes>
-struct __tree_map_pointer_types<_Tp, _AllocPtr, _KVTypes, true> {
-  typedef typename _KVTypes::__map_value_type _Mv;
-  typedef __rebind_pointer_t<_AllocPtr, _Mv> __map_value_type_pointer;
-  typedef __rebind_pointer_t<_AllocPtr, const _Mv> __const_map_value_type_pointer;
+template <class _Tp>
+struct __get_tree_key_type {
+  using type _LIBCPP_NODEBUG = _Tp;
 };
 
+template <class _Key, class _ValueT>
+struct __get_tree_key_type<__value_type<_Key, _ValueT> > {
+  using type _LIBCPP_NODEBUG = _Key;
+};
+
+template <class _Tp>
+using __get_tree_key_type_t _LIBCPP_NODEBUG = typename __get_tree_key_type<_Tp>::type;
+
 template <class _Tp>
 struct __get_node_value_type {
   using type _LIBCPP_NODEBUG = _Tp;
@@ -582,8 +563,7 @@ template <class _NodePtr, class _NodeT = typename pointer_traits<_NodePtr>::elem
 struct __tree_node_types;
 
 template <class _NodePtr, class _Tp, class _VoidPtr>
-struct __tree_node_types<_NodePtr, __tree_node<_Tp, _VoidPtr> >
-    : public __tree_node_base_types<_VoidPtr>, __tree_key_value_types<_Tp>, __tree_map_pointer_types<_Tp, _VoidPtr> {
+struct __tree_node_types<_NodePtr, __tree_node<_Tp, _VoidPtr> > : public __tree_node_base_types<_VoidPtr> {
   typedef __tree_node_base_types<_VoidPtr> __base;
 
 public:
@@ -592,7 +572,6 @@ public:
 
   using __node_value_type _LIBCPP_NODEBUG = __get_node_value_type_t<_Tp>;
   typedef __rebind_pointer_t<_VoidPtr, __node_value_type> __node_value_type_pointer;
-  typedef __rebind_pointer_t<_VoidPtr, const __node_value_type> __const_node_value_type_pointer;
 
 private:
   static_assert(!is_const<__node_type>::value, "_NodePtr should never be a pointer to const");
@@ -700,16 +679,15 @@ class __tree_iterator {
   typedef _NodePtr __node_pointer;
   typedef typename _NodeTypes::__node_base_pointer __node_base_pointer;
   typedef typename _NodeTypes::__end_node_pointer __end_node_pointer;
-  typedef pointer_traits<__node_pointer> __pointer_traits;
 
   __end_node_pointer __ptr_;
 
 public:
-  typedef bidirectional_iterator_tag iterator_category;
-  using value_type = __get_node_value_type_t<_Tp>;
-  typedef _DiffType difference_type;
-  typedef value_type& reference;
-  typedef typename _NodeTypes::__node_value_type_pointer pointer;
+  using iterator_category = bidirectional_iterator_tag;
+  using value_type        = __get_node_value_type_t<_Tp>;
+  using difference_type   = _DiffType;
+  using reference         = value_type&;
+  using pointer           = __rebind_pointer_t<_NodePtr, value_type>;
 
   _LIBCPP_HIDE_FROM_ABI __tree_iterator() _NOEXCEPT
 #if _LIBCPP_STD_VER >= 14
@@ -774,16 +752,15 @@ class __tree_const_iterator {
   typedef typename _NodeTypes::__node_pointer __node_pointer;
   typedef typename _NodeTypes::__node_base_pointer __node_base_pointer;
   typedef typename _NodeTypes::__end_node_pointer __end_node_pointer;
-  typedef pointer_traits<__node_pointer> __pointer_traits;
 
   __end_node_pointer __ptr_;
 
 public:
-  typedef bidirectional_iterator_tag iterator_category;
-  using value_type = __get_node_value_type_t<_Tp>;
-  typedef _DiffType difference_type;
-  typedef const value_type& reference;
-  typedef typename _NodeTypes::__const_node_value_type_pointer pointer;
+  using iterator_category = bidirectional_iterator_tag;
+  using value_type        = __get_node_value_type_t<_Tp>;
+  using difference_type   = _DiffType;
+  using reference         = const value_type&;
+  using pointer           = __rebind_pointer_t<_NodePtr, const value_type>;
 
   _LIBCPP_HIDE_FROM_ABI __tree_const_iterator() _NOEXCEPT
 #if _LIBCPP_STD_VER >= 14
@@ -859,18 +836,17 @@ int __diagnose_non_const_comparator();
 template <class _Tp, class _Compare, class _Allocator>
 class __tree {
 public:
-  typedef _Tp value_type;
+  using value_type = __get_node_value_type_t<_Tp>;
   typedef _Compare value_compare;
   typedef _Allocator allocator_type;
 
 private:
   typedef allocator_traits<allocator_type> __alloc_traits;
-  typedef typename __make_tree_node_types<value_type, typename __alloc_traits::void_pointer>::type _NodeTypes;
-  typedef typename _NodeTypes::key_type key_type;
+  typedef typename __make_tree_node_types<_Tp, typename __alloc_traits::void_pointer>::type _NodeTypes;
+  using key_type = __get_tree_key_type_t<_Tp>;
 
 public:
   typedef typename _NodeTypes::__node_value_type __node_value_type;
-  typedef typename _NodeTypes::__container_value_type __container_value_type;
 
   typedef typename __alloc_traits::pointer pointer;
   typedef typename __alloc_traits::const_pointer const_pointer;
@@ -945,8 +921,8 @@ public:
     return std::addressof(__end_node()->__left_);
   }
 
-  typedef __tree_iterator<value_type, __node_pointer, difference_type> iterator;
-  typedef __tree_const_iterator<value_type, __node_pointer, difference_type> const_iterator;
+  typedef __tree_iterator<_Tp, __node_pointer, difference_type> iterator;
+  typedef __tree_const_iterator<_Tp, __node_pointer, difference_type> const_iterator;
 
   _LIBCPP_HIDE_FROM_ABI explicit __tree(const value_compare& __comp) _NOEXCEPT_(
       is_nothrow_default_constructible<__node_allocator>::value&& is_nothrow_copy_constructible<value_compare>::value);
@@ -1012,7 +988,7 @@ public:
 
   template <class _First,
             class _Second,
-            __enable_if_t<__can_extract_map_key<_First, key_type, __container_value_type>::value, int> = 0>
+            __enable_if_t<__can_extract_map_key<_First, key_type, value_type>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __emplace_unique(_First&& __f, _Second&& __s) {
     return __emplace_unique_key_args(__f, std::forward<_First>(__f), std::forward<_Second>(__s));
   }
@@ -1044,7 +1020,7 @@ public:
 
   template <class _First,
             class _Second,
-            __enable_if_t<__can_extract_map_key<_First, key_type, __container_value_type>::value, int> = 0>
+            __enable_if_t<__can_extract_map_key<_First, key_type, value_type>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator __emplace_hint_unique(const_iterator __p, _First&& __f, _Second&& __s) {
     return __emplace_hint_unique_key_args(__p, __f, std::forward<_First>(__f), std::forward<_Second>(__s)).first;
   }
@@ -1072,28 +1048,28 @@ public:
     return __emplace_hint_unique_key_args(__p, __x.first, std::forward<_Pp>(__x)).first;
   }
 
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __insert_unique(const __container_value_type& __v) {
-    return __emplace_unique_key_args(_NodeTypes::__get_key(__v), __v);
+  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __insert_unique(const value_type& __v) {
+    return __emplace_unique_key_args(__v, __v);
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_unique(const_iterator __p, const __container_value_type& __v) {
-    return __emplace_hint_unique_key_args(__p, _NodeTypes::__get_key(__v), __v).first;
+  _LIBCPP_HIDE_FROM_ABI iterator __insert_unique(const_iterator __p, const value_type& __v) {
+    return __emplace_hint_unique_key_args(__p, __v, __v).first;
   }
 
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __insert_unique(__container_value_type&& __v) {
-    return __emplace_unique_key_args(_NodeTypes::__get_key(__v), std::move(__v));
+  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __insert_unique(value_type&& __v) {
+    return __emplace_unique_key_args(__v, std::move(__v));
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_unique(const_iterator __p, __container_value_type&& __v) {
-    return __emplace_hint_unique_key_args(__p, _NodeTypes::__get_key(__v), std::move(__v)).first;
+  _LIBCPP_HIDE_FROM_ABI iterator __insert_unique(const_iterator __p, value_type&& __v) {
+    return __emplace_hint_unique_key_args(__p, __v, std::move(__v)).first;
   }
 
-  template <class _Vp, __enable_if_t<!is_same<__remove_const_ref_t<_Vp>, __container_value_type>::value, int> = 0>
+  template <class _Vp, __enable_if_t<!is_same<__remove_const_ref_t<_Vp>, value_type>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __insert_unique(_Vp&& __v) {
     return __emplace_unique(std::forward<_Vp>(__v));
   }
 
-  template <class _Vp, __enable_if_t<!is_same<__remove_const_ref_t<_Vp>, __container_value_type>::value, int> = 0>
+  template <class _Vp, __enable_if_t<!is_same<__remove_const_ref_t<_Vp>, value_type>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator __insert_unique(const_iterator __p, _Vp&& __v) {
     return __emplace_hint_unique(__p, std::forward<_Vp>(__v));
   }
@@ -1101,8 +1077,7 @@ public:
   template <class _ValueT = _Tp, __enable_if_t<__is_tree_value_type<_ValueT>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI void
   __insert_unique_from_orphaned_node(const_iterator __p, __get_node_value_type_t<_Tp>&& __value) {
-    using __key_type = typename _NodeTypes::key_type;
-    __emplace_hint_unique(__p, const_cast<__key_type&&>(__value.first), std::move(__value.second));
+    __emplace_hint_unique(__p, const_cast<key_type&&>(__value.first), std::move(__value.second));
   }
 
   template <class _ValueT = _Tp, __enable_if_t<!__is_tree_value_type<_ValueT>::value, int> = 0>
@@ -1110,11 +1085,9 @@ public:
     __emplace_hint_unique(__p, std::move(__value));
   }
 
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_multi(__container_value_type&& __v) {
-    return __emplace_multi(std::move(__v));
-  }
+  _LIBCPP_HIDE_FROM_ABI iterator __insert_multi(value_type&& __v) { return __emplace_multi(std::move(__v)); }
 
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_multi(const_iterator __p, __container_value_type&& __v) {
+  _LIBCPP_HIDE_FROM_ABI iterator __insert_multi(const_iterator __p, value_type&& __v) {
     return __emplace_hint_multi(__p, std::move(__v));
   }
 
@@ -1129,10 +1102,8 @@ public:
   }
 
   template <class _ValueT = _Tp, __enable_if_t<__is_tree_value_type<_ValueT>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI void
-  __insert_multi_from_orphaned_node(const_iterator __p, __get_node_value_type_t<_Tp>&& __value) {
-    using __key_type = typename _NodeTypes::key_type;
-    __emplace_hint_multi(__p, const_cast<__key_type&&>(__value.first), std::move(__value.second));
+  _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(const_iterator __p, value_type&& __value) {
+    __emplace_hint_multi(__p, const_cast<key_type&&>(__value.first), std::move(__value.second));
   }
 
   template <class _ValueT = _Tp, __enable_if_t<!__is_tree_value_type<_ValueT>::value, int> = 0>
@@ -1140,8 +1111,7 @@ public:
     __emplace_hint_multi(__p, std::move(__value));
   }
 
-  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool>
-  __node_assign_unique(const __container_value_type& __v, __node_pointer __dest);
+  _LIBCPP_HIDE_FROM_ABI pair<iterator, bool> __node_assign_unique(const value_type& __v, __node_pointer __dest);
 
   _LIBCPP_HIDE_FROM_ABI iterator __node_insert_multi(__node_pointer __nd);
   _LIBCPP_HIDE_FROM_ABI iterator __node_insert_multi(const_iterator __p, __node_pointer __nd);
@@ -1254,10 +1224,11 @@ public:
   _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __tree&, false_type) {}
 
 private:
-  _LIBCPP_HIDE_FROM_ABI __node_base_pointer& __find_leaf_low(__parent_pointer& __parent, const key_type& __v);
-  _LIBCPP_HIDE_FROM_ABI __node_base_pointer& __find_leaf_high(__parent_pointer& __parent, const key_type& __v);
+  _LIBCPP_HIDE_FROM_ABI __node_base_pointer& __find_leaf_low(__parent_pointer& __parent, const value_type& __v);
+  _LIBCPP_HIDE_FROM_ABI __node_base_pointer& __find_leaf_high(__parent_pointer& __parent, const value_type& __v);
+
   _LIBCPP_HIDE_FROM_ABI __node_base_pointer&
-  __find_leaf(const_iterator __hint, __parent_pointer& __parent, const key_type& __v);
+  __find_leaf(const_iterator __hint, __parent_pointer& __parent, const value_type& __v);
 
   template <class... _Args>
   _LIBCPP_HIDE_FROM_ABI __node_holder __construct_node(_Args&&... __args);
@@ -1283,7 +1254,7 @@ private:
 
   template <class _From, class _ValueT = _Tp, __enable_if_t<__is_tree_value_type<_ValueT>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI static void __assign_value(__get_node_value_type_t<value_type>& __lhs, _From&& __rhs) {
-    using __key_type = typename _NodeTypes::key_type;
+    using __key_type = __remove_const_t<typename value_type::first_type>;
 
     // This is technically UB, since the object was constructed as `const`.
     // Clang doesn't optimize on this currently though.
@@ -1409,8 +1380,8 @@ template <class _ForwardIterator>
 void __tree<_Tp, _Compare, _Allocator>::__assign_unique(_ForwardIterator __first, _ForwardIterator __last) {
   typedef iterator_traits<_ForwardIterator> _ITraits;
   typedef typename _ITraits::value_type _ItValueType;
-  static_assert(is_same<_ItValueType, __container_value_type>::value,
-                "__assign_unique may only be called with the containers value type");
+  static_assert(
+      is_same<_ItValueType, value_type>::value, "__assign_unique may only be called with the containers value type");
   static_assert(
       __has_forward_iterator_category<_ForwardIterator>::value, "__assign_unique requires a forward iterator");
   if (size() != 0) {
@@ -1429,10 +1400,8 @@ template <class _InputIterator>
 void __tree<_Tp, _Compare, _Allocator>::__assign_multi(_InputIterator __first, _InputIterator __last) {
   typedef iterator_traits<_InputIterator> _ITraits;
   typedef typename _ITraits::value_type _ItValueType;
-  static_assert(
-      (is_same<_ItValueType, __container_value_type>::value || is_same<_ItValueType, __node_value_type>::value),
-      "__assign_multi may only be called with the containers value type"
-      " or the nodes value type");
+  static_assert((is_same<_ItValueType, value_type>::value || is_same<_ItValueType, __node_value_type>::value),
+                "__assign_multi may only be called with the containers value type or the nodes value type");
   if (size() != 0) {
     _DetachedTreeCache __cache(this);
     for (; __cache.__get() && __first != __last; ++__first) {
@@ -1598,7 +1567,7 @@ void __tree<_Tp, _Compare, _Allocator>::clear() _NOEXCEPT {
 // Return reference to null leaf
 template <class _Tp, class _Compare, class _Allocator>
 typename __tree<_Tp, _Compare, _Allocator>::__node_base_pointer&
-__tree<_Tp, _Compare, _Allocator>::__find_leaf_low(__parent_pointer& __parent, const key_type& __v) {
+__tree<_Tp, _Compare, _Allocator>::__find_leaf_low(__parent_pointer& __parent, const value_type& __v) {
   __node_pointer __nd = __root();
   if (__nd != nullptr) {
     while (true) {
@@ -1628,7 +1597,7 @@ __tree<_Tp, _Compare, _Allocator>::__find_leaf_low(__parent_pointer& __parent, c
 // Return reference to null leaf
 template <class _Tp, class _Compare, class _Allocator>
 typename __tree<_Tp, _Compare, _Allocator>::__node_base_pointer&
-__tree<_Tp, _Compare, _Allocator>::__find_leaf_high(__parent_pointer& __parent, const key_type& __v) {
+__tree<_Tp, _Compare, _Allocator>::__find_leaf_high(__parent_pointer& __parent, const value_type& __v) {
   __node_pointer __nd = __root();
   if (__nd != nullptr) {
     while (true) {
@@ -1660,8 +1629,8 @@ __tree<_Tp, _Compare, _Allocator>::__find_leaf_high(__parent_pointer& __parent,
 // Set __parent to parent of null leaf
 // Return reference to null leaf
 template <class _Tp, class _Compare, class _Allocator>
-typename __tree<_Tp, _Compare, _Allocator>::__node_base_pointer&
-__tree<_Tp, _Compare, _Allocator>::__find_leaf(const_iterator __hint, __parent_pointer& __parent, const key_type& __v) {
+typename __tree<_Tp, _Compare, _Allocator>::__node_base_pointer& __tree<_Tp, _Compare, _Allocator>::__find_leaf(
+    const_iterator __hint, __parent_pointer& __parent, const value_type& __v) {
   if (__hint == end() || !value_comp()(*__hint, __v)) // check before
   {
     // __v <= *__hint
@@ -1871,7 +1840,7 @@ typename __tree<_Tp, _Compare, _Allocator>::iterator
 __tree<_Tp, _Compare, _Allocator>::__emplace_multi(_Args&&... __args) {
   __node_holder __h = __construct_node(std::forward<_Args>(__args)...);
   __parent_pointer __parent;
-  __node_base_pointer& __child = __find_leaf_high(__parent, _NodeTypes::__get_key(__h->__value_));
+  __node_base_pointer& __child = __find_leaf_high(__parent, __h->__value_);
   __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__h.get()));
   return iterator(static_cast<__node_pointer>(__h.release()));
 }
@@ -1882,16 +1851,16 @@ typename __tree<_Tp, _Compare, _Allocator>::iterator
 __tree<_Tp, _Compare, _Allocator>::__emplace_hint_multi(const_iterator __p, _Args&&... __args) {
   __node_holder __h = __construct_node(std::forward<_Args>(__args)...);
   __parent_pointer __parent;
-  __node_base_pointer& __child = __find_leaf(__p, __parent, _NodeTypes::__get_key(__h->__value_));
+  __node_base_pointer& __child = __find_leaf(__p, __parent, __h->__value_);
   __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__h.get()));
   return iterator(static_cast<__node_pointer>(__h.release()));
 }
 
 template <class _Tp, class _Compare, class _Allocator>
 pair<typename __tree<_Tp, _Compare, _Allocator>::iterator, bool>
-__tree<_Tp, _Compare, _Allocator>::__node_assign_unique(const __container_value_type& __v, __node_pointer __nd) {
+__tree<_Tp, _Compare, _Allocator>::__node_assign_unique(const value_type& __v, __node_pointer __nd) {
   __parent_pointer __parent;
-  __node_base_pointer& __child = __find_equal(__parent, _NodeTypes::__get_key(__v));
+  __node_base_pointer& __child = __find_equal(__parent, __v);
   __node_pointer __r           = static_cast<__node_pointer>(__child);
   bool __inserted              = false;
   if (__child == nullptr) {
@@ -1907,7 +1876,7 @@ template <class _Tp, class _Compare, class _Allocator>
 typename __tree<_Tp, _Compare, _Allocator>::iterator
 __tree<_Tp, _Compare, _Allocator>::__node_insert_multi(__node_pointer __nd) {
   __parent_pointer __parent;
-  __node_base_pointer& __child = __find_leaf_high(__parent, _NodeTypes::__get_key(__nd->__value_));
+  __node_base_pointer& __child = __find_leaf_high(__parent, __nd->__value_);
   __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__nd));
   return iterator(__nd);
 }
@@ -1916,7 +1885,7 @@ template <class _Tp, class _Compare, class _Allocator>
 typename __tree<_Tp, _Compare, _Allocator>::iterator
 __tree<_Tp, _Compare, _Allocator>::__node_insert_multi(const_iterator __p, __node_pointer __nd) {
   __parent_pointer __parent;
-  __node_base_pointer& __child = __find_leaf(__p, __parent, _NodeTypes::__get_key(__nd->__value_));
+  __node_base_pointer& __child = __find_leaf(__p, __parent, __nd->__value_);
   __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__nd));
   return iterator(__nd);
 }
@@ -1997,7 +1966,7 @@ _LIBCPP_HIDE_FROM_ABI void __tree<_Tp, _Compare, _Allocator>::__node_handle_merg
   for (typename _Tree::iterator __i = __source.begin(); __i != __source.end();) {
     __node_pointer __src_ptr = __i.__get_np();
     __parent_pointer __parent;
-    __node_base_pointer& __child = __find_equal(__parent, _NodeTypes::__get_key(__src_ptr->__value_));
+    __node_base_pointer& __child = __find_equal(__parent, __src_ptr->__value_);
     ++__i;
     if (__child != nullptr)
       continue;
@@ -2014,7 +1983,7 @@ __tree<_Tp, _Compare, _Allocator>::__node_handle_insert_multi(_NodeHandle&& __nh
     return end();
   __node_pointer __ptr = __nh.__ptr_;
   __parent_pointer __parent;
-  __node_base_pointer& __child = __find_leaf_high(__parent, _NodeTypes::__get_key(__ptr->__value_));
+  __node_base_pointer& __child = __find_leaf_high(__parent, __ptr->__value_);
   __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__ptr));
   __nh.__release_ptr();
   return iterator(__ptr);
@@ -2029,7 +1998,7 @@ __tree<_Tp, _Compare, _Allocator>::__node_handle_insert_multi(const_iterator __h
 
   __node_pointer __ptr = __nh.__ptr_;
   __parent_pointer __parent;
-  __node_base_pointer& __child = __find_leaf(__hint, __parent, _NodeTypes::__get_key(__ptr->__value_));
+  __node_base_pointer& __child = __find_leaf(__hint, __parent, __ptr->__value_);
   __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__ptr));
   __nh.__release_ptr();
   return iterator(__ptr);
@@ -2043,7 +2012,7 @@ _LIBCPP_HIDE_FROM_ABI void __tree<_Tp, _Compare, _Allocator>::__node_handle_merg
   for (typename _Tree::iterator __i = __source.begin(); __i != __source.end();) {
     __node_pointer __src_ptr = __i.__get_np();
     __parent_pointer __parent;
-    __node_base_pointer& __child = __find_leaf_high(__parent, _NodeTypes::__get_key(__src_ptr->__value_));
+    __node_base_pointer& __child = __find_leaf_high(__parent, __src_ptr->__value_);
     ++__i;
     __source.__remove_node_pointer(__src_ptr);
     __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__src_ptr));
diff --git a/libcxx/include/map b/libcxx/include/map
index 73bf672d3a08..6c271392911d 100644
--- a/libcxx/include/map
+++ b/libcxx/include/map
@@ -755,17 +755,14 @@ struct __value_type;
 
 template <class _TreeIterator>
 class __map_iterator {
-  typedef typename _TreeIterator::_NodeTypes _NodeTypes;
-  typedef typename _TreeIterator::__pointer_traits __pointer_traits;
-
   _TreeIterator __i_;
 
 public:
-  typedef bidirectional_iterator_tag iterator_category;
-  typedef typename _NodeTypes::__map_value_type value_type;
-  typedef typename _TreeIterator::difference_type difference_type;
-  typedef value_type& reference;
-  typedef typename _NodeTypes::__map_value_type_pointer pointer;
+  using iterator_category = bidirectional_iterator_tag;
+  using value_type        = typename _TreeIterator::value_type;
+  using difference_type   = typename _TreeIterator::difference_type;
+  using reference         = value_type&;
+  using pointer           = typename _TreeIterator::pointer;
 
   _LIBCPP_HIDE_FROM_ABI __map_iterator() _NOEXCEPT {}
 
@@ -811,17 +808,14 @@ public:
 
 template <class _TreeIterator>
 class __map_const_iterator {
-  typedef typename _TreeIterator::_NodeTypes _NodeTypes;
-  typedef typename _TreeIterator::__pointer_traits __pointer_traits;
-
   _TreeIterator __i_;
 
 public:
-  typedef bidirectional_iterator_tag iterator_category;
-  typedef typename _NodeTypes::__map_value_type value_type;
-  typedef typename _TreeIterator::difference_type difference_type;
-  typedef const value_type& reference;
-  typedef typename _NodeTypes::__const_map_value_type_pointer pointer;
+  using iterator_category = bidirectional_iterator_tag;
+  using value_type        = typename _TreeIterator::value_type;
+  using difference_type   = typename _TreeIterator::difference_type;
+  using reference         = const value_type&;
+  using pointer           = typename _TreeIterator::pointer;
 
   _LIBCPP_HIDE_FROM_ABI __map_const_iterator() _NOEXCEPT {}
 
diff --git a/libcxx/test/libcxx/containers/associative/tree_key_value_traits.pass.cpp b/libcxx/test/libcxx/containers/associative/tree_key_value_traits.pass.cpp
deleted file mode 100644
index 04dcb8f54faf..000000000000
--- a/libcxx/test/libcxx/containers/associative/tree_key_value_traits.pass.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
-#include <__tree>
-#include <map>
-#include <set>
-#include <type_traits>
-
-#include "test_macros.h"
-#include "min_allocator.h"
-
-void testKeyValueTrait() {
-  {
-    typedef int Tp;
-    typedef std::__tree_key_value_types<Tp> Traits;
-    static_assert((std::is_same<Traits::key_type, int>::value), "");
-    static_assert((std::is_same<Traits::__container_value_type, Tp>::value), "");
-    static_assert(Traits::__is_map == false, "");
-  }
-  {
-    typedef std::pair<int, int> Tp;
-    typedef std::__tree_key_value_types<Tp> Traits;
-    static_assert((std::is_same<Traits::key_type, Tp>::value), "");
-    static_assert((std::is_same<Traits::__container_value_type, Tp>::value), "");
-    static_assert(Traits::__is_map == false, "");
-  }
-  {
-    typedef std::pair<const int, int> Tp;
-    typedef std::__tree_key_value_types<Tp> Traits;
-    static_assert((std::is_same<Traits::key_type, Tp>::value), "");
-    static_assert((std::is_same<Traits::__container_value_type, Tp>::value), "");
-    static_assert(Traits::__is_map == false, "");
-  }
-  {
-    typedef std::__value_type<int, int> Tp;
-    typedef std::__tree_key_value_types<Tp> Traits;
-    static_assert((std::is_same<Traits::key_type, int>::value), "");
-    static_assert((std::is_same<Traits::mapped_type, int>::value), "");
-    static_assert((std::is_same<Traits::__container_value_type, std::pair<const int, int> >::value), "");
-    static_assert((std::is_same<Traits::__map_value_type, std::pair<const int, int> >::value), "");
-    static_assert(Traits::__is_map == true, "");
-  }
-}
-
-int main(int, char**) {
-  testKeyValueTrait();
-
-  return 0;
-}

From bc6faf9a020880f2902581a898e4434000d263f7 Mon Sep 17 00:00:00 2001
From: Dipesh Sharma <76941383+dipeshs809@users.noreply.github.com>
Date: Mon, 23 Jun 2025 14:20:28 +0530
Subject: [PATCH 1247/1322] [X86] X86LegalizerInfo - use LegalFor instead if
 LegalIf for simple ISA/test pairs (#144675)

We have lots of `legalIf to` evaluate the legality of instructions based
on predicate's truthfulness, which should be simplified to use the
`legalFor({Types})` or `legalFor(Pred, {Types})` helpers:

closes #138259

for eg:

```
  getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalIf([=](const LegalityQuery &Query) -> bool {
        if (typeInSet(0, {s8, s16, s32})(Query))
          return true;
        if (Is64Bit && typeInSet(0, {s64})(Query))
          return true;
        if (HasSSE2 && typeInSet(0, {v16s8, v8s16, v4s32, v2s64})(Query))
          return true;
        if (HasAVX2 && typeInSet(0, {v32s8, v16s16, v8s32, v4s64})(Query))
          return true;
        if (HasAVX512 && typeInSet(0, {v16s32, v8s64})(Query))
          return true;
        if (HasBWI && typeInSet(0, {v64s8, v32s16})(Query))
          return true;
        return false;
      })
```
gets transformed to:

```
  getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({s8, s16, s32})
      .legalFor(Is64Bit, {s64})
      .legalFor(HasSSE2, {v16s8, v8s16, v4s32, v2s64})
 --- etc ---
```

---------

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
---
 .../lib/Target/X86/GISel/X86LegalizerInfo.cpp | 287 +++++++-----------
 1 file changed, 112 insertions(+), 175 deletions(-)

diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index 11dd05c58498..f21a7c81459f 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -43,6 +43,9 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   bool HasDQI = Subtarget.hasAVX512() && Subtarget.hasDQI();
   bool HasBWI = Subtarget.hasAVX512() && Subtarget.hasBWI();
   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
+  bool HasPOPCNT = Subtarget.hasPOPCNT();
+  bool HasLZCNT = Subtarget.hasLZCNT();
+  bool HasBMI = Subtarget.hasBMI();
 
   const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
   const LLT s1 = LLT::scalar(1);
@@ -56,7 +59,6 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   const LLT v2s32 = LLT::fixed_vector(2, 32);
   const LLT v4s8 = LLT::fixed_vector(4, 8);
 
-
   const LLT v16s8 = LLT::fixed_vector(16, 8);
   const LLT v8s16 = LLT::fixed_vector(8, 16);
   const LLT v4s32 = LLT::fixed_vector(4, 32);
@@ -82,20 +84,16 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   // todo: AVX512 bool vector predicate types
 
   // implicit/constants
+  // 32/64-bits needs support for s64/s128 to handle cases:
+  // s64 = EXTEND (G_IMPLICIT_DEF s32) -> s64 = G_IMPLICIT_DEF
+  // s128 = EXTEND (G_IMPLICIT_DEF s32/s64) -> s128 = G_IMPLICIT_DEF
   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        // 32/64-bits needs support for s64/s128 to handle cases:
-        // s64 = EXTEND (G_IMPLICIT_DEF s32) -> s64 = G_IMPLICIT_DEF
-        // s128 = EXTEND (G_IMPLICIT_DEF s32/s64) -> s128 = G_IMPLICIT_DEF
-        return typeInSet(0, {p0, s1, s8, s16, s32, s64})(Query) ||
-               (Is64Bit && typeInSet(0, {s128})(Query));
-      });
+      .legalFor({p0, s1, s8, s16, s32, s64})
+      .legalFor(Is64Bit, {s128});
 
   getActionDefinitionsBuilder(G_CONSTANT)
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return typeInSet(0, {p0, s8, s16, s32})(Query) ||
-               (Is64Bit && typeInSet(0, {s64})(Query));
-      })
+      .legalFor({p0, s8, s16, s32})
+      .legalFor(Is64Bit, {s64})
       .widenScalarToNextPow2(0, /*Min=*/8)
       .clampScalar(0, s8, sMaxScalar);
 
@@ -147,21 +145,12 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
 
   // integer addition/subtraction
   getActionDefinitionsBuilder({G_ADD, G_SUB})
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        if (typeInSet(0, {s8, s16, s32})(Query))
-          return true;
-        if (Is64Bit && typeInSet(0, {s64})(Query))
-          return true;
-        if (HasSSE2 && typeInSet(0, {v16s8, v8s16, v4s32, v2s64})(Query))
-          return true;
-        if (HasAVX2 && typeInSet(0, {v32s8, v16s16, v8s32, v4s64})(Query))
-          return true;
-        if (HasAVX512 && typeInSet(0, {v16s32, v8s64})(Query))
-          return true;
-        if (HasBWI && typeInSet(0, {v64s8, v32s16})(Query))
-          return true;
-        return false;
-      })
+      .legalFor({s8, s16, s32})
+      .legalFor(Is64Bit, {s64})
+      .legalFor(HasSSE2, {v16s8, v8s16, v4s32, v2s64})
+      .legalFor(HasAVX2, {v32s8, v16s16, v8s32, v4s64})
+      .legalFor(HasAVX512, {v16s32, v8s64})
+      .legalFor(HasBWI, {v64s8, v32s16})
       .clampMinNumElements(0, s8, 16)
       .clampMinNumElements(0, s16, 8)
       .clampMinNumElements(0, s32, 4)
@@ -175,10 +164,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
       .scalarize(0);
 
   getActionDefinitionsBuilder({G_UADDE, G_UADDO, G_USUBE, G_USUBO})
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return typePairInSet(0, 1, {{s8, s1}, {s16, s1}, {s32, s1}})(Query) ||
-               (Is64Bit && typePairInSet(0, 1, {{s64, s1}})(Query));
-      })
+      .legalFor({{s8, s1}, {s16, s1}, {s32, s1}})
+      .legalFor(Is64Bit, {{s64, s1}})
       .widenScalarToNextPow2(0, /*Min=*/32)
       .clampScalar(0, s8, sMaxScalar)
       .clampScalar(1, s1, s1)
@@ -186,27 +173,15 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
 
   // integer multiply
   getActionDefinitionsBuilder(G_MUL)
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        if (typeInSet(0, {s8, s16, s32})(Query))
-          return true;
-        if (Is64Bit && typeInSet(0, {s64})(Query))
-          return true;
-        if (HasSSE2 && typeInSet(0, {v8s16})(Query))
-          return true;
-        if (HasSSE41 && typeInSet(0, {v4s32})(Query))
-          return true;
-        if (HasAVX2 && typeInSet(0, {v16s16, v8s32})(Query))
-          return true;
-        if (HasAVX512 && typeInSet(0, {v16s32})(Query))
-          return true;
-        if (HasDQI && typeInSet(0, {v8s64})(Query))
-          return true;
-        if (HasDQI && HasVLX && typeInSet(0, {v2s64, v4s64})(Query))
-          return true;
-        if (HasBWI && typeInSet(0, {v32s16})(Query))
-          return true;
-        return false;
-      })
+      .legalFor({s8, s16, s32})
+      .legalFor(Is64Bit, {s64})
+      .legalFor(HasSSE2, {v8s16})
+      .legalFor(HasSSE41, {v4s32})
+      .legalFor(HasAVX2, {v16s16, v8s32})
+      .legalFor(HasAVX512, {v16s32})
+      .legalFor(HasDQI, {v8s64})
+      .legalFor(HasDQI && HasVLX, {v2s64, v4s64})
+      .legalFor(HasBWI, {v32s16})
       .clampMinNumElements(0, s16, 8)
       .clampMinNumElements(0, s32, 4)
       .clampMinNumElements(0, s64, HasVLX ? 2 : 8)
@@ -218,47 +193,33 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
       .scalarize(0);
 
   getActionDefinitionsBuilder({G_SMULH, G_UMULH})
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return typeInSet(0, {s8, s16, s32})(Query) ||
-               (Is64Bit && typeInSet(0, {s64})(Query));
-      })
+      .legalFor({s8, s16, s32})
+      .legalFor(Is64Bit, {s64})
       .widenScalarToNextPow2(0, /*Min=*/32)
       .clampScalar(0, s8, sMaxScalar)
       .scalarize(0);
 
   // integer divisions
   getActionDefinitionsBuilder({G_SDIV, G_SREM, G_UDIV, G_UREM})
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return typeInSet(0, {s8, s16, s32})(Query) ||
-               (Is64Bit && typeInSet(0, {s64})(Query));
-      })
+      .legalFor({s8, s16, s32})
+      .legalFor(Is64Bit, {s64})
       .libcallFor({s64})
       .clampScalar(0, s8, sMaxScalar);
 
   // integer shifts
   getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return typePairInSet(0, 1, {{s8, s8}, {s16, s8}, {s32, s8}})(Query) ||
-               (Is64Bit && typePairInSet(0, 1, {{s64, s8}})(Query));
-      })
+      .legalFor({{s8, s8}, {s16, s8}, {s32, s8}})
+      .legalFor(Is64Bit, {{s64, s8}})
       .clampScalar(0, s8, sMaxScalar)
       .clampScalar(1, s8, s8);
 
   // integer logic
   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        if (typeInSet(0, {s8, s16, s32})(Query))
-          return true;
-        if (Is64Bit && typeInSet(0, {s64})(Query))
-          return true;
-        if (HasSSE2 && typeInSet(0, {v16s8, v8s16, v4s32, v2s64})(Query))
-          return true;
-        if (HasAVX && typeInSet(0, {v32s8, v16s16, v8s32, v4s64})(Query))
-          return true;
-        if (HasAVX512 && typeInSet(0, {v64s8, v32s16, v16s32, v8s64})(Query))
-          return true;
-        return false;
-      })
+      .legalFor({s8, s16, s32})
+      .legalFor(Is64Bit, {s64})
+      .legalFor(HasSSE2, {v16s8, v8s16, v4s32, v2s64})
+      .legalFor(HasAVX, {v32s8, v16s16, v8s32, v4s64})
+      .legalFor(HasAVX512, {v64s8, v32s16, v16s32, v8s64})
       .clampMinNumElements(0, s8, 16)
       .clampMinNumElements(0, s16, 8)
       .clampMinNumElements(0, s32, 4)
@@ -282,57 +243,50 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
 
   // bswap
   getActionDefinitionsBuilder(G_BSWAP)
-      .legalIf([=](const LegalityQuery &Query) {
-        return Query.Types[0] == s32 ||
-               (Subtarget.is64Bit() && Query.Types[0] == s64);
-      })
+      .legalFor({s32})
+      .legalFor(Is64Bit, {s64})
       .widenScalarToNextPow2(0, /*Min=*/32)
       .clampScalar(0, s32, sMaxScalar);
 
   // popcount
   getActionDefinitionsBuilder(G_CTPOP)
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return Subtarget.hasPOPCNT() &&
-               (typePairInSet(0, 1, {{s16, s16}, {s32, s32}})(Query) ||
-                (Is64Bit && typePairInSet(0, 1, {{s64, s64}})(Query)));
-      })
+      .legalFor(HasPOPCNT, {{s16, s16}, {s32, s32}})
+      .legalFor(HasPOPCNT && Is64Bit, {{s64, s64}})
       .widenScalarToNextPow2(1, /*Min=*/16)
       .clampScalar(1, s16, sMaxScalar)
       .scalarSameSizeAs(0, 1);
 
   // count leading zeros (LZCNT)
   getActionDefinitionsBuilder(G_CTLZ)
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return Subtarget.hasLZCNT() &&
-               (typePairInSet(0, 1, {{s16, s16}, {s32, s32}})(Query) ||
-                (Is64Bit && typePairInSet(0, 1, {{s64, s64}})(Query)));
-      })
+      .legalFor(HasLZCNT, {{s16, s16}, {s32, s32}})
+      .legalFor(HasLZCNT && Is64Bit, {{s64, s64}})
       .widenScalarToNextPow2(1, /*Min=*/16)
       .clampScalar(1, s16, sMaxScalar)
       .scalarSameSizeAs(0, 1);
 
   // count trailing zeros
-  getActionDefinitionsBuilder({G_CTTZ_ZERO_UNDEF, G_CTTZ})
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return (Query.Opcode == G_CTTZ_ZERO_UNDEF || Subtarget.hasBMI()) &&
-               (typePairInSet(0, 1, {{s16, s16}, {s32, s32}})(Query) ||
-                (Is64Bit && typePairInSet(0, 1, {{s64, s64}})(Query)));
-      })
+  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
+      .legalFor({{s16, s16}, {s32, s32}})
+      .legalFor(Is64Bit, {{s64, s64}})
+      .widenScalarToNextPow2(1, /*Min=*/16)
+      .clampScalar(1, s16, sMaxScalar)
+      .scalarSameSizeAs(0, 1);
+
+  getActionDefinitionsBuilder(G_CTTZ)
+      .legalFor(HasBMI, {{s16, s16}, {s32, s32}})
+      .legalFor(HasBMI && Is64Bit, {{s64, s64}})
       .widenScalarToNextPow2(1, /*Min=*/16)
       .clampScalar(1, s16, sMaxScalar)
       .scalarSameSizeAs(0, 1);
 
   // control flow
   getActionDefinitionsBuilder(G_PHI)
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return typeInSet(0, {s8, s16, s32, p0})(Query) ||
-               (UseX87 && typeIs(0, s80)(Query)) ||
-               (Is64Bit && typeIs(0, s64)(Query)) ||
-               (HasSSE1 && typeInSet(0, {v16s8, v8s16, v4s32, v2s64})(Query)) ||
-               (HasAVX && typeInSet(0, {v32s8, v16s16, v8s32, v4s64})(Query)) ||
-               (HasAVX512 &&
-                typeInSet(0, {v64s8, v32s16, v16s32, v8s64})(Query));
-      })
+      .legalFor({s8, s16, s32, p0})
+      .legalFor(UseX87, {s80})
+      .legalFor(Is64Bit, {s64})
+      .legalFor(HasSSE1, {v16s8, v8s16, v4s32, v2s64})
+      .legalFor(HasAVX, {v32s8, v16s16, v8s32, v4s64})
+      .legalFor(HasAVX512, {v64s8, v32s16, v16s32, v8s64})
       .clampMinNumElements(0, s8, 16)
       .clampMinNumElements(0, s16, 8)
       .clampMinNumElements(0, s32, 4)
@@ -361,10 +315,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   getActionDefinitionsBuilder(G_CONSTANT_POOL).legalFor({p0});
 
   getActionDefinitionsBuilder(G_PTR_ADD)
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return typePairInSet(0, 1, {{p0, s32}})(Query) ||
-               (Is64Bit && typePairInSet(0, 1, {{p0, s64}})(Query));
-      })
+      .legalFor({{p0, s32}})
+      .legalFor(Is64Bit, {{p0, s64}})
       .widenScalarToNextPow2(1, /*Min*/ 32)
       .clampScalar(1, s32, sMaxScalar);
 
@@ -423,23 +375,27 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
 
   for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
     auto &Action = getActionDefinitionsBuilder(Op);
-    Action.legalForTypesWithMemDesc({{s16, p0, s8, 1},
-                                     {s32, p0, s8, 1},
-                                     {s32, p0, s16, 1}});
+    Action.legalForTypesWithMemDesc(
+        {{s16, p0, s8, 1}, {s32, p0, s8, 1}, {s32, p0, s16, 1}});
     if (Is64Bit)
-      Action.legalForTypesWithMemDesc({{s64, p0, s8, 1},
-                                       {s64, p0, s16, 1},
-                                       {s64, p0, s32, 1}});
+      Action.legalForTypesWithMemDesc(
+          {{s64, p0, s8, 1}, {s64, p0, s16, 1}, {s64, p0, s32, 1}});
     // TODO - SSE41/AVX2/AVX512F/AVX512BW vector extensions
   }
 
   // sext, zext, and anyext
-  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
-      .legalIf([=](const LegalityQuery &Query) {
-        return typeInSet(0, {s8, s16, s32})(Query) ||
-          (Query.Opcode == G_ANYEXT && Query.Types[0] == s128) ||
-          (Is64Bit && Query.Types[0] == s64);
-      })
+  getActionDefinitionsBuilder(G_ANYEXT)
+      .legalFor({s8, s16, s32, s128})
+      .legalFor(Is64Bit, {s64})
+      .widenScalarToNextPow2(0, /*Min=*/8)
+      .clampScalar(0, s8, sMaxScalar)
+      .widenScalarToNextPow2(1, /*Min=*/8)
+      .clampScalar(1, s8, sMaxScalar)
+      .scalarize(0);
+
+  getActionDefinitionsBuilder({G_SEXT, G_ZEXT})
+      .legalFor({s8, s16, s32})
+      .legalFor(Is64Bit, {s64})
       .widenScalarToNextPow2(0, /*Min=*/8)
       .clampScalar(0, s8, sMaxScalar)
       .widenScalarToNextPow2(1, /*Min=*/8)
@@ -450,21 +406,17 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
 
   // fp constants
   getActionDefinitionsBuilder(G_FCONSTANT)
-      .legalIf([=](const LegalityQuery &Query) -> bool {
-        return (typeInSet(0, {s32, s64})(Query)) ||
-               (UseX87 && typeInSet(0, {s80})(Query));
-      });
+      .legalFor({s32, s64})
+      .legalFor(UseX87, {s80});
 
   // fp arithmetic
   getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV})
-      .legalIf([=](const LegalityQuery &Query) {
-        return (typeInSet(0, {s32, s64})(Query)) ||
-               (HasSSE1 && typeInSet(0, {v4s32})(Query)) ||
-               (HasSSE2 && typeInSet(0, {v2s64})(Query)) ||
-               (HasAVX && typeInSet(0, {v8s32, v4s64})(Query)) ||
-               (HasAVX512 && typeInSet(0, {v16s32, v8s64})(Query)) ||
-               (UseX87 && typeInSet(0, {s80})(Query));
-      });
+      .legalFor({s32, s64})
+      .legalFor(HasSSE1, {v4s32})
+      .legalFor(HasSSE2, {v2s64})
+      .legalFor(HasAVX, {v8s32, v4s64})
+      .legalFor(HasAVX512, {v16s32, v8s64})
+      .legalFor(UseX87, {s80});
 
   // fp comparison
   getActionDefinitionsBuilder(G_FCMP)
@@ -476,18 +428,15 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
       .widenScalarToNextPow2(1);
 
   // fp conversions
-  getActionDefinitionsBuilder(G_FPEXT).legalIf([=](const LegalityQuery &Query) {
-    return (HasSSE2 && typePairInSet(0, 1, {{s64, s32}})(Query)) ||
-           (HasAVX && typePairInSet(0, 1, {{v4s64, v4s32}})(Query)) ||
-           (HasAVX512 && typePairInSet(0, 1, {{v8s64, v8s32}})(Query));
-  });
+  getActionDefinitionsBuilder(G_FPEXT)
+      .legalFor(HasSSE2, {{s64, s32}})
+      .legalFor(HasAVX, {{v4s64, v4s32}})
+      .legalFor(HasAVX512, {{v8s64, v8s32}});
 
-  getActionDefinitionsBuilder(G_FPTRUNC).legalIf(
-      [=](const LegalityQuery &Query) {
-        return (HasSSE2 && typePairInSet(0, 1, {{s32, s64}})(Query)) ||
-               (HasAVX && typePairInSet(0, 1, {{v4s32, v4s64}})(Query)) ||
-               (HasAVX512 && typePairInSet(0, 1, {{v8s32, v8s64}})(Query));
-      });
+  getActionDefinitionsBuilder(G_FPTRUNC)
+      .legalFor(HasSSE2, {{s32, s64}})
+      .legalFor(HasAVX, {{v4s32, v4s64}})
+      .legalFor(HasAVX512, {{v8s32, v8s64}});
 
   getActionDefinitionsBuilder(G_SITOFP)
       .legalFor(HasSSE1, {{s32, s32}})
@@ -519,10 +468,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   // For AVX512 we simply widen types as there is direct mapping from opcodes
   // to asm instructions.
   getActionDefinitionsBuilder(G_UITOFP)
-      .legalIf([=](const LegalityQuery &Query) {
-        return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
-               typeInSet(1, {s32, s64})(Query);
-      })
+      .legalFor(HasAVX512, {{s32, s32}, {s32, s64}, {s64, s32}, {s64, s64}})
       .customIf([=](const LegalityQuery &Query) {
         return !HasAVX512 &&
                ((HasSSE1 && typeIs(0, s32)(Query)) ||
@@ -542,10 +488,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
       .widenScalarToNextPow2(1);
 
   getActionDefinitionsBuilder(G_FPTOUI)
-      .legalIf([=](const LegalityQuery &Query) {
-        return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
-               typeInSet(1, {s32, s64})(Query);
-      })
+      .legalFor(HasAVX512, {{s32, s32}, {s32, s64}, {s64, s32}, {s64, s64}})
       .customIf([=](const LegalityQuery &Query) {
         return !HasAVX512 &&
                ((HasSSE1 && typeIs(1, s32)(Query)) ||
@@ -603,22 +546,17 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
 
   // todo: only permit dst types up to max legal vector register size?
   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
-      .legalIf([=](const LegalityQuery &Query) {
-        return (HasSSE1 && typePairInSet(1, 0,
-                                         {{v16s8, v32s8},
-                                          {v8s16, v16s16},
-                                          {v4s32, v8s32},
-                                          {v2s64, v4s64}})(Query)) ||
-               (HasAVX && typePairInSet(1, 0,
-                                        {{v16s8, v64s8},
-                                         {v32s8, v64s8},
-                                         {v8s16, v32s16},
-                                         {v16s16, v32s16},
-                                         {v4s32, v16s32},
-                                         {v8s32, v16s32},
-                                         {v2s64, v8s64},
-                                         {v4s64, v8s64}})(Query));
-      });
+      .legalFor(
+          HasSSE1,
+          {{v32s8, v16s8}, {v16s16, v8s16}, {v8s32, v4s32}, {v4s64, v2s64}})
+      .legalFor(HasAVX, {{v64s8, v16s8},
+                         {v64s8, v32s8},
+                         {v32s16, v8s16},
+                         {v32s16, v16s16},
+                         {v16s32, v4s32},
+                         {v16s32, v8s32},
+                         {v8s64, v2s64},
+                         {v8s64, v4s64}});
 
   // todo: vectors and address spaces
   getActionDefinitionsBuilder(G_SELECT)
@@ -630,9 +568,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   // memory intrinsics
   getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
 
-  getActionDefinitionsBuilder({G_DYN_STACKALLOC,
-                               G_STACKSAVE,
-                               G_STACKRESTORE}).lower();
+  getActionDefinitionsBuilder({G_DYN_STACKALLOC, G_STACKSAVE, G_STACKRESTORE})
+      .lower();
 
   // fp intrinsics
   getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN)
@@ -641,9 +578,9 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
       .libcall();
 
   getActionDefinitionsBuilder({G_FREEZE, G_CONSTANT_FOLD_BARRIER})
-    .legalFor({s8, s16, s32, s64, p0})
-    .widenScalarToNextPow2(0, /*Min=*/8)
-    .clampScalar(0, s8, sMaxScalar);
+      .legalFor({s8, s16, s32, s64, p0})
+      .widenScalarToNextPow2(0, /*Min=*/8)
+      .clampScalar(0, s8, sMaxScalar);
 
   getLegacyLegalizerInfo().computeTables();
   verify(*STI.getInstrInfo());

From c5629f2b600488e1aee3f8c9f2266523b460367f Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Mon, 23 Jun 2025 10:51:26 +0200
Subject: [PATCH 1248/1322] [lldb] Add Socket::CreatePair (#145015)

It creates a pair of connected sockets using the simplest mechanism for
the given platform (TCP on windows, socketpair(2) elsewhere).

Main motivation is to remove the ugly platform-specific code in
ProcessGDBRemote::LaunchAndConnectToDebugserver, but it can also be used
in other places where we need to create a pair of connected sockets.
---
 lldb/include/lldb/Host/Socket.h               |  4 +++
 lldb/include/lldb/Host/common/TCPSocket.h     |  4 +++
 lldb/include/lldb/Host/posix/DomainSocket.h   |  4 +++
 lldb/source/Host/common/Socket.cpp            | 17 +++++++++
 lldb/source/Host/common/TCPSocket.cpp         | 26 ++++++++++++++
 lldb/source/Host/posix/DomainSocket.cpp       | 27 ++++++++++++++
 .../gdb-remote/GDBRemoteCommunication.cpp     | 32 ++++-------------
 lldb/unittests/Core/CommunicationTest.cpp     | 26 ++++++++------
 lldb/unittests/Host/SocketTest.cpp            | 35 +++++++++++++++++++
 9 files changed, 138 insertions(+), 37 deletions(-)

diff --git a/lldb/include/lldb/Host/Socket.h b/lldb/include/lldb/Host/Socket.h
index c313aa4f6d26..89953ee7fd5b 100644
--- a/lldb/include/lldb/Host/Socket.h
+++ b/lldb/include/lldb/Host/Socket.h
@@ -106,6 +106,10 @@ public:
   static std::unique_ptr<Socket> Create(const SocketProtocol protocol,
                                         Status &error);
 
+  using Pair = std::pair<std::unique_ptr<Socket>, std::unique_ptr<Socket>>;
+  static llvm::Expected<Pair>
+  CreatePair(std::optional<SocketProtocol> protocol = std::nullopt);
+
   virtual Status Connect(llvm::StringRef name) = 0;
   virtual Status Listen(llvm::StringRef name, int backlog) = 0;
 
diff --git a/lldb/include/lldb/Host/common/TCPSocket.h b/lldb/include/lldb/Host/common/TCPSocket.h
index cb950c0015ea..353e538d0552 100644
--- a/lldb/include/lldb/Host/common/TCPSocket.h
+++ b/lldb/include/lldb/Host/common/TCPSocket.h
@@ -23,6 +23,10 @@ public:
   TCPSocket(NativeSocket socket, bool should_close);
   ~TCPSocket() override;
 
+  using Pair =
+      std::pair<std::unique_ptr<TCPSocket>, std::unique_ptr<TCPSocket>>;
+  static llvm::Expected<Pair> CreatePair();
+
   // returns port number or 0 if error
   uint16_t GetLocalPortNumber() const;
 
diff --git a/lldb/include/lldb/Host/posix/DomainSocket.h b/lldb/include/lldb/Host/posix/DomainSocket.h
index a840d474429e..cfb31922367c 100644
--- a/lldb/include/lldb/Host/posix/DomainSocket.h
+++ b/lldb/include/lldb/Host/posix/DomainSocket.h
@@ -19,6 +19,10 @@ public:
   DomainSocket(NativeSocket socket, bool should_close);
   explicit DomainSocket(bool should_close);
 
+  using Pair =
+      std::pair<std::unique_ptr<DomainSocket>, std::unique_ptr<DomainSocket>>;
+  static llvm::Expected<Pair> CreatePair();
+
   Status Connect(llvm::StringRef name) override;
   Status Listen(llvm::StringRef name, int backlog) override;
 
diff --git a/lldb/source/Host/common/Socket.cpp b/lldb/source/Host/common/Socket.cpp
index 5c5cd653c3d9..2b23fd1e6e57 100644
--- a/lldb/source/Host/common/Socket.cpp
+++ b/lldb/source/Host/common/Socket.cpp
@@ -234,6 +234,23 @@ std::unique_ptr<Socket> Socket::Create(const SocketProtocol protocol,
   return socket_up;
 }
 
+llvm::Expected<Socket::Pair>
+Socket::CreatePair(std::optional<SocketProtocol> protocol) {
+  constexpr SocketProtocol kBestProtocol =
+      LLDB_ENABLE_POSIX ? ProtocolUnixDomain : ProtocolTcp;
+  switch (protocol.value_or(kBestProtocol)) {
+  case ProtocolTcp:
+    return TCPSocket::CreatePair();
+#if LLDB_ENABLE_POSIX
+  case ProtocolUnixDomain:
+  case ProtocolUnixAbstract:
+    return DomainSocket::CreatePair();
+#endif
+  default:
+    return llvm::createStringError("Unsupported protocol");
+  }
+}
+
 llvm::Expected<std::unique_ptr<Socket>>
 Socket::TcpConnect(llvm::StringRef host_and_port) {
   Log *log = GetLog(LLDBLog::Connection);
diff --git a/lldb/source/Host/common/TCPSocket.cpp b/lldb/source/Host/common/TCPSocket.cpp
index 3d0dea1c61dd..c144f3c501d7 100644
--- a/lldb/source/Host/common/TCPSocket.cpp
+++ b/lldb/source/Host/common/TCPSocket.cpp
@@ -52,6 +52,32 @@ TCPSocket::TCPSocket(NativeSocket socket, bool should_close)
 
 TCPSocket::~TCPSocket() { CloseListenSockets(); }
 
+llvm::Expected<TCPSocket::Pair> TCPSocket::CreatePair() {
+  auto listen_socket_up = std::make_unique<TCPSocket>(true);
+  if (Status error = listen_socket_up->Listen("localhost:0", 5); error.Fail())
+    return error.takeError();
+
+  std::string connect_address =
+      llvm::StringRef(listen_socket_up->GetListeningConnectionURI()[0])
+          .split("://")
+          .second.str();
+
+  auto connect_socket_up = std::make_unique<TCPSocket>(true);
+  if (Status error = connect_socket_up->Connect(connect_address); error.Fail())
+    return error.takeError();
+
+  // Connection has already been made above, so a short timeout is sufficient.
+  Socket *accept_socket;
+  if (Status error =
+          listen_socket_up->Accept(std::chrono::seconds(1), accept_socket);
+      error.Fail())
+    return error.takeError();
+
+  return Pair(
+      std::move(connect_socket_up),
+      std::unique_ptr<TCPSocket>(static_cast<TCPSocket *>(accept_socket)));
+}
+
 bool TCPSocket::IsValid() const {
   return m_socket != kInvalidSocketValue || m_listen_sockets.size() != 0;
 }
diff --git a/lldb/source/Host/posix/DomainSocket.cpp b/lldb/source/Host/posix/DomainSocket.cpp
index 4f76e0c16d4c..6a730324a1ea 100644
--- a/lldb/source/Host/posix/DomainSocket.cpp
+++ b/lldb/source/Host/posix/DomainSocket.cpp
@@ -13,9 +13,11 @@
 #endif
 
 #include "llvm/Support/Errno.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 
 #include <cstddef>
+#include <fcntl.h>
 #include <memory>
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -76,6 +78,31 @@ DomainSocket::DomainSocket(SocketProtocol protocol, NativeSocket socket,
   m_socket = socket;
 }
 
+llvm::Expected<DomainSocket::Pair> DomainSocket::CreatePair() {
+  int sockets[2];
+  int type = SOCK_STREAM;
+#ifdef SOCK_CLOEXEC
+  type |= SOCK_CLOEXEC;
+#endif
+  if (socketpair(AF_UNIX, type, 0, sockets) == -1)
+    return llvm::errorCodeToError(llvm::errnoAsErrorCode());
+
+#ifndef SOCK_CLOEXEC
+  for (int s : sockets) {
+    int r = fcntl(s, F_SETFD, FD_CLOEXEC | fcntl(s, F_GETFD));
+    assert(r == 0);
+    (void)r;
+  }
+#endif
+
+  return Pair(std::unique_ptr<DomainSocket>(
+                  new DomainSocket(ProtocolUnixDomain, sockets[0],
+                                   /*should_close=*/true)),
+              std::unique_ptr<DomainSocket>(
+                  new DomainSocket(ProtocolUnixDomain, sockets[1],
+                                   /*should_close=*/true)));
+}
+
 Status DomainSocket::Connect(llvm::StringRef name) {
   sockaddr_un saddr_un;
   socklen_t saddr_un_len;
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
index 2aea7c6b781d..d1f57cc22d8b 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
@@ -1141,34 +1141,14 @@ void GDBRemoteCommunication::DumpHistory(Stream &strm) { m_history.Dump(strm); }
 llvm::Error
 GDBRemoteCommunication::ConnectLocally(GDBRemoteCommunication &client,
                                        GDBRemoteCommunication &server) {
-  const int backlog = 5;
-  TCPSocket listen_socket(true);
-  if (llvm::Error error =
-          listen_socket.Listen("localhost:0", backlog).ToError())
-    return error;
+  llvm::Expected<Socket::Pair> pair = Socket::CreatePair();
+  if (!pair)
+    return pair.takeError();
 
-  llvm::SmallString<32> remote_addr;
-  llvm::raw_svector_ostream(remote_addr)
-      << "connect://localhost:" << listen_socket.GetLocalPortNumber();
-
-  std::unique_ptr<ConnectionFileDescriptor> conn_up(
-      new ConnectionFileDescriptor());
-  Status status;
-  if (conn_up->Connect(remote_addr, &status) != lldb::eConnectionStatusSuccess)
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "Unable to connect: %s", status.AsCString());
-
-  // The connection was already established above, so a short timeout is
-  // sufficient.
-  Socket *accept_socket = nullptr;
-  if (Status accept_status =
-          listen_socket.Accept(std::chrono::seconds(1), accept_socket);
-      accept_status.Fail())
-    return accept_status.takeError();
-
-  client.SetConnection(std::move(conn_up));
+  client.SetConnection(
+      std::make_unique<ConnectionFileDescriptor>(pair->first.release()));
   server.SetConnection(
-      std::make_unique<ConnectionFileDescriptor>(accept_socket));
+      std::make_unique<ConnectionFileDescriptor>(pair->second.release()));
   return llvm::Error::success();
 }
 
diff --git a/lldb/unittests/Core/CommunicationTest.cpp b/lldb/unittests/Core/CommunicationTest.cpp
index df9ff089a0d7..fd07b4da9f8c 100644
--- a/lldb/unittests/Core/CommunicationTest.cpp
+++ b/lldb/unittests/Core/CommunicationTest.cpp
@@ -7,14 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/Core/Communication.h"
+#include "TestingSupport/SubsystemRAII.h"
 #include "lldb/Core/ThreadedCommunication.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Host/ConnectionFileDescriptor.h"
 #include "lldb/Host/Pipe.h"
+#include "lldb/Host/Socket.h"
 #include "llvm/Testing/Support/Error.h"
 #include "gtest/gtest.h"
-#include "TestingSupport/Host/SocketTestUtilities.h"
-#include "TestingSupport/SubsystemRAII.h"
 
 #include <chrono>
 #include <thread>
@@ -31,15 +31,17 @@ private:
 };
 
 static void CommunicationReadTest(bool use_read_thread) {
-  std::unique_ptr<TCPSocket> a, b;
-  ASSERT_TRUE(CreateTCPConnectedSockets("localhost", &a, &b));
+  llvm::Expected<Socket::Pair> pair = Socket::CreatePair();
+  ASSERT_THAT_EXPECTED(pair, llvm::Succeeded());
+  Socket &a = *pair->first;
 
   size_t num_bytes = 4;
-  ASSERT_THAT_ERROR(a->Write("test", num_bytes).ToError(), llvm::Succeeded());
+  ASSERT_THAT_ERROR(a.Write("test", num_bytes).ToError(), llvm::Succeeded());
   ASSERT_EQ(num_bytes, 4U);
 
   ThreadedCommunication comm("test");
-  comm.SetConnection(std::make_unique<ConnectionFileDescriptor>(b.release()));
+  comm.SetConnection(
+      std::make_unique<ConnectionFileDescriptor>(pair->second.release()));
   comm.SetCloseOnEOF(true);
 
   if (use_read_thread) {
@@ -73,7 +75,7 @@ static void CommunicationReadTest(bool use_read_thread) {
   EXPECT_THAT_ERROR(error.ToError(), llvm::Failed());
 
   // This read should return EOF.
-  ASSERT_THAT_ERROR(a->Close().ToError(), llvm::Succeeded());
+  ASSERT_THAT_ERROR(a.Close().ToError(), llvm::Succeeded());
   error.Clear();
   EXPECT_EQ(
       comm.Read(buf, sizeof(buf), std::chrono::seconds(5), status, &error), 0U);
@@ -118,17 +120,19 @@ TEST_F(CommunicationTest, ReadThread) {
 }
 
 TEST_F(CommunicationTest, SynchronizeWhileClosing) {
-  std::unique_ptr<TCPSocket> a, b;
-  ASSERT_TRUE(CreateTCPConnectedSockets("localhost", &a, &b));
+  llvm::Expected<Socket::Pair> pair = Socket::CreatePair();
+  ASSERT_THAT_EXPECTED(pair, llvm::Succeeded());
+  Socket &a = *pair->first;
 
   ThreadedCommunication comm("test");
-  comm.SetConnection(std::make_unique<ConnectionFileDescriptor>(b.release()));
+  comm.SetConnection(
+      std::make_unique<ConnectionFileDescriptor>(pair->second.release()));
   comm.SetCloseOnEOF(true);
   ASSERT_TRUE(comm.StartReadThread());
 
   // Ensure that we can safely synchronize with the read thread while it is
   // closing the read end (in response to us closing the write end).
-  ASSERT_THAT_ERROR(a->Close().ToError(), llvm::Succeeded());
+  ASSERT_THAT_ERROR(a.Close().ToError(), llvm::Succeeded());
   comm.SynchronizeWithReadThread();
 
   ASSERT_TRUE(comm.StopReadThread());
diff --git a/lldb/unittests/Host/SocketTest.cpp b/lldb/unittests/Host/SocketTest.cpp
index 77366593f05f..3630b6324270 100644
--- a/lldb/unittests/Host/SocketTest.cpp
+++ b/lldb/unittests/Host/SocketTest.cpp
@@ -74,6 +74,41 @@ TEST_F(SocketTest, DecodeHostAndPort) {
       llvm::HasValue(Socket::HostAndPort{"abcd:12fg:AF58::1", 12345}));
 }
 
+TEST_F(SocketTest, CreatePair) {
+  std::vector<std::optional<Socket::SocketProtocol>> functional_protocols = {
+      std::nullopt,
+      Socket::ProtocolTcp,
+#if LLDB_ENABLE_POSIX
+      Socket::ProtocolUnixDomain,
+      Socket::ProtocolUnixAbstract,
+#endif
+  };
+  for (auto p : functional_protocols) {
+    auto expected_socket_pair = Socket::CreatePair(p);
+    ASSERT_THAT_EXPECTED(expected_socket_pair, llvm::Succeeded());
+    Socket &a = *expected_socket_pair->first;
+    Socket &b = *expected_socket_pair->second;
+    size_t num_bytes = 1;
+    ASSERT_THAT_ERROR(a.Write("a", num_bytes).takeError(), llvm::Succeeded());
+    ASSERT_EQ(num_bytes, 1);
+    char c;
+    ASSERT_THAT_ERROR(b.Read(&c, num_bytes).takeError(), llvm::Succeeded());
+    ASSERT_EQ(num_bytes, 1);
+    ASSERT_EQ(c, 'a');
+  }
+
+  std::vector<Socket::SocketProtocol> erroring_protocols = {
+#if !LLDB_ENABLE_POSIX
+      Socket::ProtocolUnixDomain,
+      Socket::ProtocolUnixAbstract,
+#endif
+  };
+  for (auto p : erroring_protocols) {
+    ASSERT_THAT_EXPECTED(Socket::CreatePair(p),
+                         llvm::FailedWithMessage("Unsupported protocol"));
+  }
+}
+
 #if LLDB_ENABLE_POSIX
 TEST_F(SocketTest, DomainListenConnectAccept) {
   llvm::SmallString<64> Path;

From 092ef1da45945d1bfa304db63c140b8cd115850d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 17:52:27 +0900
Subject: [PATCH 1249/1322] AMDGPU: Use reportFatalUsageError for unsupported
 disassembly error (#145264)

---
 llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 2 +-
 llvm/test/MC/Disassembler/AMDGPU/si-support.txt            | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 2e891419f0e3..07a4292ef28b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -58,7 +58,7 @@ AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
       CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) {
   // ToDo: AMDGPUDisassembler supports only VI ISA.
   if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
-    report_fatal_error("Disassembly not yet supported for subtarget");
+    reportFatalUsageError("disassembly not yet supported for subtarget");
 
   for (auto [Symbol, Code] : AMDGPU::UCVersion::getGFXVersions())
     createConstantSymbolExpr(Symbol, Code);
diff --git a/llvm/test/MC/Disassembler/AMDGPU/si-support.txt b/llvm/test/MC/Disassembler/AMDGPU/si-support.txt
index f62d89ff7b0e..d0c7b679fb52 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/si-support.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/si-support.txt
@@ -1,4 +1,5 @@
-# RUN: not --crash llvm-mc -triple=amdgcn -mcpu=tahiti -disassemble < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti -disassemble < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple=amdgcn -mcpu=bonaire -disassemble < %s 2>&1 | FileCheck %s
 
-# CHECK: LLVM ERROR: Disassembly not yet supported for subtarget
+# CHECK: LLVM ERROR: disassembly not yet supported for subtarget
 0x00 0x00 0x00 0x7e

From b9c979d3698e56714db88385629a167786d07410 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Mon, 23 Jun 2025 11:06:14 +0200
Subject: [PATCH 1250/1322] [mlir][Transforms] Dialect conversion: Simplify
 `replaceOp` implementation (#145155)

Since #145030, `ConversionPatternRewriter::eraseBlock` no longer calls
`ConversionPatternRewriter::eraseOp`. This now happens in the rewriter
impl (during the cleanup phase). Therefore, a safety check in
`replaceOp` can now be simplified.
---
 .../Transforms/Utils/DialectConversion.cpp    | 26 +++++--------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index ad82a007b799..774d58973eb9 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -1580,21 +1580,18 @@ void ConversionPatternRewriterImpl::replaceOp(
 
   // Check if replaced op is an unresolved materialization, i.e., an
   // unrealized_conversion_cast op that was created by the conversion driver.
-  bool isUnresolvedMaterialization = false;
-  if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op))
-    if (unresolvedMaterializations.contains(castOp))
-      isUnresolvedMaterialization = true;
+  if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op)) {
+    // Make sure that the user does not mess with unresolved materializations
+    // that were inserted by the conversion driver. We keep track of these
+    // ops in internal data structures.
+    assert(!unresolvedMaterializations.contains(castOp) &&
+           "attempting to replace/erase an unresolved materialization");
+  }
 
   // Create mappings for each of the new result values.
   for (auto [repl, result] : llvm::zip_equal(newValues, op->getResults())) {
     if (repl.empty()) {
       // This result was dropped and no replacement value was provided.
-      if (isUnresolvedMaterialization) {
-        // Do not create another materializations if we are erasing a
-        // materialization.
-        continue;
-      }
-
       // Materialize a replacement value "out of thin air".
       buildUnresolvedMaterialization(
           MaterializationKind::Source, computeInsertPoint(result),
@@ -1602,15 +1599,6 @@ void ConversionPatternRewriterImpl::replaceOp(
           /*outputTypes=*/result.getType(), /*originalType=*/Type(),
           currentTypeConverter);
       continue;
-    } else {
-      // Make sure that the user does not mess with unresolved materializations
-      // that were inserted by the conversion driver. We keep track of these
-      // ops in internal data structures. Erasing them must be allowed because
-      // this can happen when the user is erasing an entire block (including
-      // its body). But replacing them with another value should be forbidden
-      // to avoid problems with the `mapping`.
-      assert(!isUnresolvedMaterialization &&
-             "attempting to replace an unresolved materialization");
     }
 
     // Remap result to replacement value.

From 613c38a9923ad482f28a5f82c7b29efaad24bdce Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Mon, 23 Jun 2025 10:14:12 +0100
Subject: [PATCH 1251/1322] [Offload] Fix type mismatch warning in test
 (#143700)

---
 offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 157f33a36370..a3da334afcca 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -66,7 +66,7 @@ TEST_P(olLaunchKernelTest, Success) {
   ASSERT_SUCCESS(olWaitQueue(Queue));
 
   uint32_t *Data = (uint32_t *)Mem;
-  for (int i = 0; i < 64; i++) {
+  for (uint32_t i = 0; i < 64; i++) {
     ASSERT_EQ(Data[i], i);
   }
 
@@ -93,7 +93,7 @@ TEST_P(olLaunchKernelTest, SuccessSynchronous) {
                                 &LaunchArgs, nullptr));
 
   uint32_t *Data = (uint32_t *)Mem;
-  for (int i = 0; i < 64; i++) {
+  for (uint32_t i = 0; i < 64; i++) {
     ASSERT_EQ(Data[i], i);
   }
 

From e7c1da7c8ef31c258619c1668062985e7ae83b70 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Mon, 23 Jun 2025 11:31:26 +0200
Subject: [PATCH 1252/1322] [lldb/cmake] Plugin layering enforcement mechanism
 (#144543)

Some inter-plugin dependencies are okay, others are not. Yet others not,
but we're sort of stuck with them. The idea is to be able to prevent
backsliding while making sure that acceptable dependencies are..
accepted. For context, see
https://github.com/llvm/llvm-project/pull/139170 and the attached
changes to the documentation.
---
 lldb/CMakeLists.txt                           |  3 +
 lldb/cmake/modules/LLDBLayeringCheck.cmake    | 68 +++++++++++++++++++
 lldb/docs/resources/contributing.rst          | 50 ++++++++++++++
 lldb/source/Plugins/ABI/CMakeLists.txt        |  6 ++
 .../Plugins/Architecture/CMakeLists.txt       |  2 +
 .../Plugins/Disassembler/CMakeLists.txt       |  2 +
 .../Plugins/DynamicLoader/CMakeLists.txt      |  7 ++
 .../Plugins/ExpressionParser/CMakeLists.txt   |  2 +
 .../source/Plugins/Instruction/CMakeLists.txt |  2 +
 .../InstrumentationRuntime/CMakeLists.txt     |  2 +
 lldb/source/Plugins/JITLoader/CMakeLists.txt  |  3 +
 lldb/source/Plugins/Language/CMakeLists.txt   |  6 ++
 .../Plugins/LanguageRuntime/CMakeLists.txt    |  3 +
 .../Plugins/MemoryHistory/CMakeLists.txt      |  2 +
 .../Plugins/ObjectContainer/CMakeLists.txt    |  2 +
 lldb/source/Plugins/ObjectFile/CMakeLists.txt |  2 +
 .../Plugins/OperatingSystem/CMakeLists.txt    |  2 +
 lldb/source/Plugins/Platform/CMakeLists.txt   |  7 ++
 lldb/source/Plugins/Process/CMakeLists.txt    |  5 ++
 .../Plugins/Process/Utility/CMakeLists.txt    |  3 +
 lldb/source/Plugins/REPL/CMakeLists.txt       |  3 +
 .../RegisterTypeBuilder/CMakeLists.txt        |  2 +
 .../Plugins/ScriptInterpreter/CMakeLists.txt  |  2 +
 .../Plugins/StructuredData/CMakeLists.txt     |  2 +
 lldb/source/Plugins/SymbolFile/CMakeLists.txt |  7 ++
 .../Plugins/SymbolLocator/CMakeLists.txt      |  2 +
 .../Plugins/SymbolVendor/CMakeLists.txt       |  3 +
 .../Plugins/SystemRuntime/CMakeLists.txt      |  3 +
 lldb/source/Plugins/Trace/CMakeLists.txt      |  2 +
 .../Plugins/TraceExporter/CMakeLists.txt      |  2 +
 lldb/source/Plugins/TypeSystem/CMakeLists.txt |  4 ++
 .../Plugins/UnwindAssembly/CMakeLists.txt     |  2 +
 32 files changed, 213 insertions(+)
 create mode 100644 lldb/cmake/modules/LLDBLayeringCheck.cmake

diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt
index 2aaf75dd87bc..e3b72e94d4be 100644
--- a/lldb/CMakeLists.txt
+++ b/lldb/CMakeLists.txt
@@ -37,6 +37,7 @@ endif()
 
 include(LLDBConfig)
 include(AddLLDB)
+include(LLDBLayeringCheck)
 
 set(LLDB_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
@@ -127,6 +128,8 @@ add_subdirectory(source)
 add_subdirectory(tools)
 add_subdirectory(docs)
 
+check_lldb_plugin_layering()
+
 if (LLDB_ENABLE_PYTHON)
   if(LLDB_BUILD_FRAMEWORK)
     set(lldb_python_target_dir "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/Resources/Python/lldb")
diff --git a/lldb/cmake/modules/LLDBLayeringCheck.cmake b/lldb/cmake/modules/LLDBLayeringCheck.cmake
new file mode 100644
index 000000000000..082bbe37a980
--- /dev/null
+++ b/lldb/cmake/modules/LLDBLayeringCheck.cmake
@@ -0,0 +1,68 @@
+define_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND)
+define_property(TARGET PROPERTY LLDB_PLUGIN_KIND INHERITED)
+
+define_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES)
+define_property(TARGET PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES INHERITED)
+
+define_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES)
+define_property(TARGET PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES INHERITED)
+
+option(LLDB_GENERATE_PLUGIN_DEP_GRAPH OFF)
+
+function(check_lldb_plugin_layering)
+  get_property(plugins GLOBAL PROPERTY LLDB_PLUGINS)
+  foreach(plugin ${plugins})
+    get_property(plugin_kind TARGET ${plugin} PROPERTY LLDB_PLUGIN_KIND)
+    get_property(acceptable_deps TARGET ${plugin}
+      PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES)
+    get_property(tolerated_deps TARGET ${plugin}
+      PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES)
+
+    # A plugin is always permitted to depend on its own kind for the purposes
+    # subclassing. Ideally the intra-kind dependencies should not form a loop,
+    # but we're not checking that here.
+    list(APPEND acceptable_deps ${plugin_kind})
+
+    list(APPEND all_plugin_kinds ${plugin_kind})
+
+    get_property(link_libs TARGET ${plugin} PROPERTY LINK_LIBRARIES)
+    foreach(link_lib ${link_libs})
+      if(link_lib IN_LIST plugins)
+        get_property(lib_kind TARGET ${link_lib} PROPERTY LLDB_PLUGIN_KIND)
+        if (lib_kind)
+          if (lib_kind IN_LIST acceptable_deps)
+            set(dep_kind green)
+          elseif (lib_kind IN_LIST tolerated_deps)
+            set(dep_kind yellow)
+          else()
+            set(dep_kind red)
+            message(SEND_ERROR "Plugin ${plugin} cannot depend on ${lib_kind} "
+              "plugin ${link_lib}")
+          endif()
+          list(APPEND dep_${dep_kind}_${plugin_kind}_${lib_kind} ${plugin})
+        endif()
+      endif()
+    endforeach()
+  endforeach()
+
+  if (LLDB_GENERATE_PLUGIN_DEP_GRAPH)
+    set(dep_graph "digraph Plugins {\n")
+    list(REMOVE_DUPLICATES all_plugin_kinds)
+    foreach (from ${all_plugin_kinds})
+      foreach (to ${all_plugin_kinds})
+        foreach (dep_kind green yellow red)
+          if (dep_${dep_kind}_${from}_${to})
+            list(REMOVE_DUPLICATES dep_${dep_kind}_${from}_${to})
+            string(REGEX REPLACE "lldbPlugin|${from}" "" short_deps
+              "${dep_${dep_kind}_${from}_${to}}")
+            string(JOIN "\n" plugins ${short_deps})
+            string(APPEND dep_graph
+              "  ${from}->${to}[color=\"${dep_kind}\" label=\"${plugins}\"];\n")
+          endif()
+        endforeach()
+      endforeach()
+    endforeach()
+    string(APPEND dep_graph "}\n")
+    file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/lldb-plugin-deps.dot" "${dep_graph}")
+  endif()
+endfunction()
diff --git a/lldb/docs/resources/contributing.rst b/lldb/docs/resources/contributing.rst
index 48fd000765f6..e7d46c9061d5 100644
--- a/lldb/docs/resources/contributing.rst
+++ b/lldb/docs/resources/contributing.rst
@@ -56,6 +56,56 @@ subset of LLDB tests (the API tests) use a different system. Refer to the
 `lldb/test <https://github.com/llvm/llvm-project/tree/main/lldb/test>`_ folder
 for examples.
 
+
+LLDB plugins and their dependencies
+-----------------------------------
+
+LLDB has a concept of *plugins*, which are used to provide abstraction
+boundaries over functionality that is specific to a certain architecture,
+operating system, programming language, etc. A plugin implements an abstract
+base class (rarely, a set of related base classes), which is a part of LLDB
+core. This setup allows the LLDB core to remain generic while making it possible
+to support for new architectures, languages, and so on. For this to work, all
+code needs to obey certain rules.
+
+The principal rule is that LLDB core (defined as: everything under lldb/source
+*minus* lldb/source/Plugins) must not depend on any specific plugin. The only
+way it can interact with them is through the abstract interface. Explicit
+dependencies such as casting the base class to the plugin type are not permitted
+and neither are more subtle dependencies like checking the name plugin or or
+other situations where some code in LLDB core is tightly coupled to the
+implementation details of a specific plugin.
+
+The rule for interaction between different plugins is more nuanced. We recognize
+that some cross-plugin dependencies are unavoidable or even desirable. For
+example, a plugin may want to extend a plugin of the same kind to
+add/override/refine some functionality (e.g., Android is a "kind of" Linux, but
+it handles some things differently). Alternatively, a plugin of one kind may
+want to build on the functionality offered by a specific plugin of another kind
+(ELFCore Process plugin uses ELF ObjectFile plugin to create a process out of an
+ELF core file).
+
+In cases such as these, direct dependencies are acceptable. However, to keep the
+dependency graph manageable, we still have some rules to govern these
+relationships:
+
+* All dependencies between plugins of the same kind must flow in the same
+  direction (if plugin `A1` depends on plugin `B1`, then `B2` must not depend on
+  `A2`)
+* Dependency graph of plugin kinds must not contain loops (dependencies like
+  `A1->B1`, `B2->C2` and `C3->A3` are forbidden because they induce a cycle in
+  the plugin kind graph even though the plugins themselves are acyclical)
+
+
+The first of these rules is checked via CMake scripts (using the
+`LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES` property). Dependencies in this category
+are expected and permitted (subject to other constraints such as that dependency
+making sense for the particular pair of plugins). Unfortunately, due to historic
+reasons, not all plugin dependencies follow this rule, which is why we have
+another category called `LLDB_TOLERATED_PLUGIN_DEPENDENCIES`. New dependencies
+are forbidden (even though they are accepted by CMake) and existing ones should
+be removed whereever possible.
+
 .. _Error handling:
 
 Error handling and use of assertions in LLDB
diff --git a/lldb/source/Plugins/ABI/CMakeLists.txt b/lldb/source/Plugins/ABI/CMakeLists.txt
index e33ac8735486..97a20364ae7d 100644
--- a/lldb/source/Plugins/ABI/CMakeLists.txt
+++ b/lldb/source/Plugins/ABI/CMakeLists.txt
@@ -1,3 +1,9 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ABI)
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
+  ProcessUtility
+  TypeSystem
+)
+
 foreach(target AArch64 ARM ARC Hexagon LoongArch Mips MSP430 PowerPC RISCV SystemZ X86)
   if (${target} IN_LIST LLVM_TARGETS_TO_BUILD)
     add_subdirectory(${target})
diff --git a/lldb/source/Plugins/Architecture/CMakeLists.txt b/lldb/source/Plugins/Architecture/CMakeLists.txt
index 9ed8edf70af3..0f898ef5116e 100644
--- a/lldb/source/Plugins/Architecture/CMakeLists.txt
+++ b/lldb/source/Plugins/Architecture/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Architecture)
+
 add_subdirectory(Arm)
 add_subdirectory(Mips)
 add_subdirectory(PPC64)
diff --git a/lldb/source/Plugins/Disassembler/CMakeLists.txt b/lldb/source/Plugins/Disassembler/CMakeLists.txt
index bec56765b60f..1d1ea206e277 100644
--- a/lldb/source/Plugins/Disassembler/CMakeLists.txt
+++ b/lldb/source/Plugins/Disassembler/CMakeLists.txt
@@ -1 +1,3 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Disassembler)
+
 add_subdirectory(LLVMC)
diff --git a/lldb/source/Plugins/DynamicLoader/CMakeLists.txt b/lldb/source/Plugins/DynamicLoader/CMakeLists.txt
index 30607159acdc..01aba34b9416 100644
--- a/lldb/source/Plugins/DynamicLoader/CMakeLists.txt
+++ b/lldb/source/Plugins/DynamicLoader/CMakeLists.txt
@@ -1,3 +1,10 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND DynamicLoader)
+set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES ObjectFile)
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
+  Process # part of a loop (Process<->DynamicLoader).
+  TypeSystem
+)
+
 add_subdirectory(Darwin-Kernel)
 add_subdirectory(FreeBSD-Kernel)
 add_subdirectory(MacOSX-DYLD)
diff --git a/lldb/source/Plugins/ExpressionParser/CMakeLists.txt b/lldb/source/Plugins/ExpressionParser/CMakeLists.txt
index 17c40aee44cc..8a8089879bd9 100644
--- a/lldb/source/Plugins/ExpressionParser/CMakeLists.txt
+++ b/lldb/source/Plugins/ExpressionParser/CMakeLists.txt
@@ -1 +1,3 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ExpressionParser)
+
 add_subdirectory(Clang)
diff --git a/lldb/source/Plugins/Instruction/CMakeLists.txt b/lldb/source/Plugins/Instruction/CMakeLists.txt
index 46d610f261e0..bf48a1c1cc11 100644
--- a/lldb/source/Plugins/Instruction/CMakeLists.txt
+++ b/lldb/source/Plugins/Instruction/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Instruction)
+
 add_subdirectory(ARM)
 add_subdirectory(ARM64)
 add_subdirectory(LoongArch)
diff --git a/lldb/source/Plugins/InstrumentationRuntime/CMakeLists.txt b/lldb/source/Plugins/InstrumentationRuntime/CMakeLists.txt
index 7f301bca14a8..2a6cf930945d 100644
--- a/lldb/source/Plugins/InstrumentationRuntime/CMakeLists.txt
+++ b/lldb/source/Plugins/InstrumentationRuntime/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND InstrumentationRuntime)
+
 add_subdirectory(ASan)
 add_subdirectory(ASanLibsanitizers)
 add_subdirectory(MainThreadChecker)
diff --git a/lldb/source/Plugins/JITLoader/CMakeLists.txt b/lldb/source/Plugins/JITLoader/CMakeLists.txt
index e52230199109..ffba54f8b287 100644
--- a/lldb/source/Plugins/JITLoader/CMakeLists.txt
+++ b/lldb/source/Plugins/JITLoader/CMakeLists.txt
@@ -1 +1,4 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND JITLoader)
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES ObjectFile)
+
 add_subdirectory(GDB)
diff --git a/lldb/source/Plugins/Language/CMakeLists.txt b/lldb/source/Plugins/Language/CMakeLists.txt
index 7869074566d1..b432dd300668 100644
--- a/lldb/source/Plugins/Language/CMakeLists.txt
+++ b/lldb/source/Plugins/Language/CMakeLists.txt
@@ -1,3 +1,9 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Language)
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
+  LanguageRuntime
+  TypeSystem
+)
+
 add_subdirectory(ClangCommon)
 add_subdirectory(CPlusPlus)
 add_subdirectory(ObjC)
diff --git a/lldb/source/Plugins/LanguageRuntime/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CMakeLists.txt
index 034ae1545ae8..32528d6d6171 100644
--- a/lldb/source/Plugins/LanguageRuntime/CMakeLists.txt
+++ b/lldb/source/Plugins/LanguageRuntime/CMakeLists.txt
@@ -1,2 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND LanguageRuntime)
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES TypeSystem)
+
 add_subdirectory(CPlusPlus)
 add_subdirectory(ObjC)
diff --git a/lldb/source/Plugins/MemoryHistory/CMakeLists.txt b/lldb/source/Plugins/MemoryHistory/CMakeLists.txt
index 113f06362578..50838bb8a077 100644
--- a/lldb/source/Plugins/MemoryHistory/CMakeLists.txt
+++ b/lldb/source/Plugins/MemoryHistory/CMakeLists.txt
@@ -1 +1,3 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND MemoryHistory)
+
 add_subdirectory(asan)
diff --git a/lldb/source/Plugins/ObjectContainer/CMakeLists.txt b/lldb/source/Plugins/ObjectContainer/CMakeLists.txt
index cda0c8151dd8..4ae1bb138a9a 100644
--- a/lldb/source/Plugins/ObjectContainer/CMakeLists.txt
+++ b/lldb/source/Plugins/ObjectContainer/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ObjectContainer)
+
 add_subdirectory(BSD-Archive)
 add_subdirectory(Universal-Mach-O)
 add_subdirectory(Mach-O-Fileset)
diff --git a/lldb/source/Plugins/ObjectFile/CMakeLists.txt b/lldb/source/Plugins/ObjectFile/CMakeLists.txt
index 7abd0c96f4fd..6004b1f414d4 100644
--- a/lldb/source/Plugins/ObjectFile/CMakeLists.txt
+++ b/lldb/source/Plugins/ObjectFile/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ObjectFile)
+
 add_subdirectory(Breakpad)
 add_subdirectory(COFF)
 add_subdirectory(ELF)
diff --git a/lldb/source/Plugins/OperatingSystem/CMakeLists.txt b/lldb/source/Plugins/OperatingSystem/CMakeLists.txt
index 06d909b862a0..1a29f9ad3af4 100644
--- a/lldb/source/Plugins/OperatingSystem/CMakeLists.txt
+++ b/lldb/source/Plugins/OperatingSystem/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND OperatingSystem)
+
 if (LLDB_ENABLE_PYTHON)
   add_subdirectory(Python)
 endif()
diff --git a/lldb/source/Plugins/Platform/CMakeLists.txt b/lldb/source/Plugins/Platform/CMakeLists.txt
index 0220e734b36d..f4753ab47ce1 100644
--- a/lldb/source/Plugins/Platform/CMakeLists.txt
+++ b/lldb/source/Plugins/Platform/CMakeLists.txt
@@ -1,3 +1,10 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Platform)
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
+  DynamicLoader
+  ObjectContainer
+  Process
+)
+
 add_subdirectory(AIX)
 add_subdirectory(Android)
 add_subdirectory(FreeBSD)
diff --git a/lldb/source/Plugins/Process/CMakeLists.txt b/lldb/source/Plugins/Process/CMakeLists.txt
index 058b4b9ad215..bd9b1b86dbf1 100644
--- a/lldb/source/Plugins/Process/CMakeLists.txt
+++ b/lldb/source/Plugins/Process/CMakeLists.txt
@@ -1,3 +1,8 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Process)
+set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES ObjectFile)
+# This dependency is part of a loop (Process<->DynamicLoader).
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES DynamicLoader)
+
 if (CMAKE_SYSTEM_NAME MATCHES "Linux|Android")
   add_subdirectory(Linux)
   add_subdirectory(POSIX)
diff --git a/lldb/source/Plugins/Process/Utility/CMakeLists.txt b/lldb/source/Plugins/Process/Utility/CMakeLists.txt
index fd3019613892..48646b784f93 100644
--- a/lldb/source/Plugins/Process/Utility/CMakeLists.txt
+++ b/lldb/source/Plugins/Process/Utility/CMakeLists.txt
@@ -1,3 +1,6 @@
+# TODO: Clean up this directory and its dependencies
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ProcessUtility)
+
 add_lldb_library(lldbPluginProcessUtility
   AuxVector.cpp
   FreeBSDSignals.cpp
diff --git a/lldb/source/Plugins/REPL/CMakeLists.txt b/lldb/source/Plugins/REPL/CMakeLists.txt
index 17c40aee44cc..8b51b3899625 100644
--- a/lldb/source/Plugins/REPL/CMakeLists.txt
+++ b/lldb/source/Plugins/REPL/CMakeLists.txt
@@ -1 +1,4 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND REPL)
+set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES TypeSystem)
+
 add_subdirectory(Clang)
diff --git a/lldb/source/Plugins/RegisterTypeBuilder/CMakeLists.txt b/lldb/source/Plugins/RegisterTypeBuilder/CMakeLists.txt
index 336ae9105802..7411b24c86cb 100644
--- a/lldb/source/Plugins/RegisterTypeBuilder/CMakeLists.txt
+++ b/lldb/source/Plugins/RegisterTypeBuilder/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND RegisterTypeBuilder)
+
 add_lldb_library(lldbPluginRegisterTypeBuilderClang PLUGIN
   RegisterTypeBuilderClang.cpp
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/CMakeLists.txt
index fa1c72a32fe1..4429b006173a 100644
--- a/lldb/source/Plugins/ScriptInterpreter/CMakeLists.txt
+++ b/lldb/source/Plugins/ScriptInterpreter/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ScriptInterpreter)
+
 add_subdirectory(None)
 if (LLDB_ENABLE_PYTHON)
   add_subdirectory(Python)
diff --git a/lldb/source/Plugins/StructuredData/CMakeLists.txt b/lldb/source/Plugins/StructuredData/CMakeLists.txt
index 40d64558482d..a6caa233f222 100644
--- a/lldb/source/Plugins/StructuredData/CMakeLists.txt
+++ b/lldb/source/Plugins/StructuredData/CMakeLists.txt
@@ -1,2 +1,4 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND StructuredData)
+
 add_subdirectory(DarwinLog)
 
diff --git a/lldb/source/Plugins/SymbolFile/CMakeLists.txt b/lldb/source/Plugins/SymbolFile/CMakeLists.txt
index 106387b45ec1..351652846420 100644
--- a/lldb/source/Plugins/SymbolFile/CMakeLists.txt
+++ b/lldb/source/Plugins/SymbolFile/CMakeLists.txt
@@ -1,3 +1,10 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND SymbolFile)
+set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES ObjectFile)
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
+  Language
+  TypeSystem  # part of a loop (TypeSystem<->SymbolFile).
+)
+
 add_subdirectory(Breakpad)
 add_subdirectory(CTF)
 add_subdirectory(DWARF)
diff --git a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
index 3367022639ab..3b466f71dca5 100644
--- a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
+++ b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND SymbolLocator)
+
 # Order matters here: the first symbol locator prevents further searching.
 # For DWARF binaries that are both stripped and split, the Default plugin
 # will return the stripped binary when asked for the ObjectFile, which then
diff --git a/lldb/source/Plugins/SymbolVendor/CMakeLists.txt b/lldb/source/Plugins/SymbolVendor/CMakeLists.txt
index 1981706e06f4..a07330d7d8bc 100644
--- a/lldb/source/Plugins/SymbolVendor/CMakeLists.txt
+++ b/lldb/source/Plugins/SymbolVendor/CMakeLists.txt
@@ -1,3 +1,6 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND SymbolVendor)
+set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES ObjectFile)
+
 add_subdirectory(ELF)
 
 if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
diff --git a/lldb/source/Plugins/SystemRuntime/CMakeLists.txt b/lldb/source/Plugins/SystemRuntime/CMakeLists.txt
index 0955a9eb74c2..58fdc060bda3 100644
--- a/lldb/source/Plugins/SystemRuntime/CMakeLists.txt
+++ b/lldb/source/Plugins/SystemRuntime/CMakeLists.txt
@@ -1 +1,4 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND SystemRuntime)
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES TypeSystem)
+
 add_subdirectory(MacOSX)
diff --git a/lldb/source/Plugins/Trace/CMakeLists.txt b/lldb/source/Plugins/Trace/CMakeLists.txt
index 955f88cec340..331b48f95f1a 100644
--- a/lldb/source/Plugins/Trace/CMakeLists.txt
+++ b/lldb/source/Plugins/Trace/CMakeLists.txt
@@ -1,3 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Trace)
+
 option(LLDB_BUILD_INTEL_PT "Enable Building of Intel(R) Processor Trace Tool" OFF)
 
 add_subdirectory(common)
diff --git a/lldb/source/Plugins/TraceExporter/CMakeLists.txt b/lldb/source/Plugins/TraceExporter/CMakeLists.txt
index e0252ee36720..7130548d7a9f 100644
--- a/lldb/source/Plugins/TraceExporter/CMakeLists.txt
+++ b/lldb/source/Plugins/TraceExporter/CMakeLists.txt
@@ -1,2 +1,4 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND TraceExporter)
+
 add_subdirectory(common)
 add_subdirectory(ctf)
diff --git a/lldb/source/Plugins/TypeSystem/CMakeLists.txt b/lldb/source/Plugins/TypeSystem/CMakeLists.txt
index 17c40aee44cc..47e32ff176d8 100644
--- a/lldb/source/Plugins/TypeSystem/CMakeLists.txt
+++ b/lldb/source/Plugins/TypeSystem/CMakeLists.txt
@@ -1 +1,5 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND TypeSystem)
+# This dependency is part of a loop (TypeSystem<->SymbolFile).
+set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES SymbolFile)
+
 add_subdirectory(Clang)
diff --git a/lldb/source/Plugins/UnwindAssembly/CMakeLists.txt b/lldb/source/Plugins/UnwindAssembly/CMakeLists.txt
index 1723a0604580..1f505599c3ff 100644
--- a/lldb/source/Plugins/UnwindAssembly/CMakeLists.txt
+++ b/lldb/source/Plugins/UnwindAssembly/CMakeLists.txt
@@ -1,2 +1,4 @@
+set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND UnwindAssembly)
+
 add_subdirectory(InstEmulation)
 add_subdirectory(x86)

From 5c22793eadd8758d589eafd1cbbb2897ab8b3c8b Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Mon, 23 Jun 2025 10:32:46 +0100
Subject: [PATCH 1253/1322] [lldb-dap][test] Refactor runInTerminal Tests.
 (#144954)

Replace `isTestSupported` function with `skipIfBuildType` annotation.

Test that uses the `IsTestSupported` function are no longer run, as the
size of lldb-dap binary is now more than `1mb`.

Update the broken test.

Fixes #108621

We could probably check if the test now passes on `linux arm` since it
was disabled because it timed out. I experienced the timeout after
replacing the `IsTestSupported` with `skipIfBuildType`.
---
 .../test/tools/lldb-dap/dap_server.py         | 12 ++-
 .../restart/TestDAP_restart_runInTerminal.py  | 87 ++++++++++---------
 .../runInTerminal/TestDAP_runInTerminal.py    | 71 +++++----------
 .../API/tools/lldb-dap/runInTerminal/main.c   |  7 +-
 4 files changed, 78 insertions(+), 99 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 6d32491eaa5e..0fe36cd4bc71 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -179,9 +179,13 @@ class DebugCommunication(object):
     @classmethod
     def validate_response(cls, command, response):
         if command["command"] != response["command"]:
-            raise ValueError("command mismatch in response")
+            raise ValueError(
+                f"command mismatch in response {command['command']} != {response['command']}"
+            )
         if command["seq"] != response["request_seq"]:
-            raise ValueError("seq mismatch in response")
+            raise ValueError(
+                f"seq mismatch in response {command['seq']} != {response['request_seq']}"
+            )
 
     def _read_packet_thread(self):
         done = False
@@ -404,8 +408,8 @@ class DebugCommunication(object):
                 self.reverse_requests.append(response_or_request)
                 if response_or_request["command"] == "runInTerminal":
                     subprocess.Popen(
-                        response_or_request["arguments"]["args"],
-                        env=response_or_request["arguments"]["env"],
+                        response_or_request["arguments"].get("args"),
+                        env=response_or_request["arguments"].get("env", {}),
                     )
                     self.send_packet(
                         {
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
index e23d34bd9930..3ba7deb285de 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
@@ -2,23 +2,35 @@
 Test lldb-dap RestartRequest.
 """
 
-import os
-from lldbsuite.test.decorators import *
-from lldbsuite.test.lldbtest import line_number
+from typing import Dict, Any, List
+
 import lldbdap_testcase
+from lldbsuite.test.decorators import skipIfWindows, skipIf, skipIfBuildType
+from lldbsuite.test.lldbtest import line_number
 
 
+@skipIfBuildType(["debug"])
 class TestDAP_restart_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
-    def isTestSupported(self):
-        try:
-            # We skip this test for debug builds because it takes too long
-            # parsing lldb's own debug info. Release builds are fine.
-            # Checking the size of the lldb-dap binary seems to be a decent
-            # proxy for a quick detection. It should be far less than 1 MB in
-            # Release builds.
-            return os.path.getsize(os.environ["LLDBDAP_EXEC"]) < 1000000
-        except:
-            return False
+    def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]):
+        seen_stopped_event = 0
+        for stopped_event in stopped_events:
+            body = stopped_event.get("body")
+            if body is None:
+                continue
+
+            reason = body.get("reason")
+            if reason is None:
+                continue
+
+            self.assertNotEqual(
+                reason,
+                "breakpoint",
+                'verify stop after restart isn\'t "main" breakpoint',
+            )
+            if reason == "entry":
+                seen_stopped_event += 1
+
+        self.assertEqual(seen_stopped_event, 1, "expect only one stopped entry event.")
 
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=["arm$"])  # Always times out on buildbot
@@ -27,8 +39,6 @@ class TestDAP_restart_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
         Test basic restarting functionality when the process is running in
         a terminal.
         """
-        if not self.isTestSupported():
-            return
         line_A = line_number("main.c", "// breakpoint A")
         line_B = line_number("main.c", "// breakpoint B")
 
@@ -60,33 +70,31 @@ class TestDAP_restart_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
             "i != 0 after hitting breakpoint A on restart",
         )
 
+        # Check breakpoint B
+        self.dap_server.request_continue()
+        self.verify_breakpoint_hit([bp_B])
+        self.assertEqual(
+            int(self.dap_server.get_local_variable_value("i")),
+            1234,
+            "i != 1234 after hitting breakpoint B",
+        )
+        self.continue_to_exit()
+
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=["arm$"])  # Always times out on buildbot
     def test_stopOnEntry(self):
         """
         Check that stopOnEntry works correctly when using runInTerminal.
         """
-        if not self.isTestSupported():
-            return
-        line_A = line_number("main.c", "// breakpoint A")
-        line_B = line_number("main.c", "// breakpoint B")
-
         program = self.getBuildArtifact("a.out")
         self.build_and_launch(program, runInTerminal=True, stopOnEntry=True)
         [bp_main] = self.set_function_breakpoints(["main"])
 
-        # When using stopOnEntry, configurationDone doesn't result in a running
-        # process, we should immediately get a stopped event instead.
+        self.dap_server.request_continue()  # sends configuration done
         stopped_events = self.dap_server.wait_for_stopped()
         # We should be stopped at the entry point.
-        for stopped_event in stopped_events:
-            if "body" in stopped_event:
-                body = stopped_event["body"]
-                if "reason" in body:
-                    reason = body["reason"]
-                    self.assertNotEqual(
-                        reason, "breakpoint", "verify stop isn't a breakpoint"
-                    )
+        self.assertGreaterEqual(len(stopped_events), 0, "expect stopped events")
+        self.verify_stopped_on_entry(stopped_events)
 
         # Then, if we continue, we should hit the breakpoint at main.
         self.dap_server.request_continue()
@@ -95,14 +103,11 @@ class TestDAP_restart_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
         # Restart and check that we still get a stopped event before reaching
         # main.
         self.dap_server.request_restart()
-        stopped_events = self.dap_server.wait_for_stopped()
-        for stopped_event in stopped_events:
-            if "body" in stopped_event:
-                body = stopped_event["body"]
-                if "reason" in body:
-                    reason = body["reason"]
-                    self.assertNotEqual(
-                        reason,
-                        "breakpoint",
-                        'verify stop after restart isn\'t "main" breakpoint',
-                    )
+        stopped_events = self.dap_server.wait_for_stopped(timeout=20)
+        self.verify_stopped_on_entry(stopped_events)
+
+        # continue to main
+        self.dap_server.request_continue()
+        self.verify_breakpoint_hit([bp_main])
+
+        self.continue_to_exit()
diff --git a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
index 65c931210d40..3d07cd8b20e2 100644
--- a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
@@ -2,52 +2,33 @@
 Test lldb-dap runInTerminal reverse request
 """
 
-
-import dap_server
-from lldbsuite.test.decorators import *
-from lldbsuite.test.lldbtest import *
-from lldbsuite.test import lldbutil
+from lldbsuite.test.decorators import skipIfBuildType, skipIfWindows, skipIf, no_match
+from lldbsuite.test.lldbtest import line_number
 import lldbdap_testcase
-import time
 import os
 import subprocess
-import shutil
 import json
-from threading import Thread
 
 
+@skipIfBuildType(["debug"])
 class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
-    def readPidMessage(self, fifo_file):
+    def read_pid_message(self, fifo_file):
         with open(fifo_file, "r") as file:
             self.assertIn("pid", file.readline())
 
-    def sendDidAttachMessage(self, fifo_file):
+    @staticmethod
+    def send_did_attach_message(fifo_file):
         with open(fifo_file, "w") as file:
             file.write(json.dumps({"kind": "didAttach"}) + "\n")
 
-    def readErrorMessage(self, fifo_file):
+    @staticmethod
+    def read_error_message(fifo_file):
         with open(fifo_file, "r") as file:
             return file.readline()
 
-    def isTestSupported(self):
-        # For some strange reason, this test fails on python3.6
-        if not (sys.version_info.major == 3 and sys.version_info.minor >= 7):
-            return False
-        try:
-            # We skip this test for debug builds because it takes too long parsing lldb's own
-            # debug info. Release builds are fine.
-            # Checking the size of the lldb-dap binary seems to be a decent proxy for a quick
-            # detection. It should be far less than 1 MB in Release builds.
-            if os.path.getsize(os.environ["LLDBDAP_EXEC"]) < 1000000:
-                return True
-        except:
-            return False
-
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
     def test_runInTerminal(self):
-        if not self.isTestSupported():
-            return
         """
             Tests the "runInTerminal" reverse request. It makes sure that the IDE can
             launch the inferior with the correct environment variables and arguments.
@@ -77,7 +58,7 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
 
         # We verify we actually stopped inside the loop
         counter = int(self.dap_server.get_local_variable_value("counter"))
-        self.assertGreater(counter, 0)
+        self.assertEqual(counter, 1)
 
         # We verify we were able to set the launch arguments
         argc = int(self.dap_server.get_local_variable_value("argc"))
@@ -90,10 +71,10 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
         env = self.dap_server.request_evaluate("foo")["body"]["result"]
         self.assertIn("bar", env)
 
+        self.continue_to_exit()
+
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
     def test_runInTerminalWithObjectEnv(self):
-        if not self.isTestSupported():
-            return
         """
             Tests the "runInTerminal" reverse request. It makes sure that the IDE can
             launch the inferior with the correct environment variables using an object.
@@ -113,11 +94,11 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
         self.assertIn("FOO", request_envs)
         self.assertEqual("BAR", request_envs["FOO"])
 
+        self.continue_to_exit()
+
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
     def test_runInTerminalInvalidTarget(self):
-        if not self.isTestSupported():
-            return
         self.build_and_create_debug_adapter()
         response = self.launch(
             "INVALIDPROGRAM",
@@ -135,8 +116,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
     def test_missingArgInRunInTerminalLauncher(self):
-        if not self.isTestSupported():
-            return
         proc = subprocess.run(
             [self.lldbDAPExec, "--launch-target", "INVALIDPROGRAM"],
             capture_output=True,
@@ -150,8 +129,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
     def test_FakeAttachedRunInTerminalLauncherWithInvalidProgram(self):
-        if not self.isTestSupported():
-            return
         comm_file = os.path.join(self.getBuildDir(), "comm-file")
         os.mkfifo(comm_file)
 
@@ -167,9 +144,9 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
             stderr=subprocess.PIPE,
         )
 
-        self.readPidMessage(comm_file)
-        self.sendDidAttachMessage(comm_file)
-        self.assertIn("No such file or directory", self.readErrorMessage(comm_file))
+        self.read_pid_message(comm_file)
+        self.send_did_attach_message(comm_file)
+        self.assertIn("No such file or directory", self.read_error_message(comm_file))
 
         _, stderr = proc.communicate()
         self.assertIn("No such file or directory", stderr)
@@ -177,8 +154,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
     def test_FakeAttachedRunInTerminalLauncherWithValidProgram(self):
-        if not self.isTestSupported():
-            return
         comm_file = os.path.join(self.getBuildDir(), "comm-file")
         os.mkfifo(comm_file)
 
@@ -195,8 +170,8 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
             stdout=subprocess.PIPE,
         )
 
-        self.readPidMessage(comm_file)
-        self.sendDidAttachMessage(comm_file)
+        self.read_pid_message(comm_file)
+        self.send_did_attach_message(comm_file)
 
         stdout, _ = proc.communicate()
         self.assertIn("foo", stdout)
@@ -204,8 +179,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
     def test_FakeAttachedRunInTerminalLauncherAndCheckEnvironment(self):
-        if not self.isTestSupported():
-            return
         comm_file = os.path.join(self.getBuildDir(), "comm-file")
         os.mkfifo(comm_file)
 
@@ -216,8 +189,8 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
             env={**os.environ, "FOO": "BAR"},
         )
 
-        self.readPidMessage(comm_file)
-        self.sendDidAttachMessage(comm_file)
+        self.read_pid_message(comm_file)
+        self.send_did_attach_message(comm_file)
 
         stdout, _ = proc.communicate()
         self.assertIn("FOO=BAR", stdout)
@@ -225,8 +198,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=no_match(["x86_64"]))
     def test_NonAttachedRunInTerminalLauncher(self):
-        if not self.isTestSupported():
-            return
         comm_file = os.path.join(self.getBuildDir(), "comm-file")
         os.mkfifo(comm_file)
 
@@ -244,7 +215,7 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
             env={**os.environ, "LLDB_DAP_RIT_TIMEOUT_IN_MS": "1000"},
         )
 
-        self.readPidMessage(comm_file)
+        self.read_pid_message(comm_file)
 
         _, stderr = proc.communicate()
         self.assertIn("Timed out trying to get messages from the debug adapter", stderr)
diff --git a/lldb/test/API/tools/lldb-dap/runInTerminal/main.c b/lldb/test/API/tools/lldb-dap/runInTerminal/main.c
index 676bd830e657..0cc25d374d08 100644
--- a/lldb/test/API/tools/lldb-dap/runInTerminal/main.c
+++ b/lldb/test/API/tools/lldb-dap/runInTerminal/main.c
@@ -4,8 +4,7 @@
 
 int main(int argc, char *argv[]) {
   const char *foo = getenv("FOO");
-  for (int counter = 1;; counter++) {
-    sleep(1); // breakpoint
-  }
-  return 0;
+  int counter = 1;
+
+  return 0; // breakpoint
 }

From 714b2fdf3a385e5b9a95c435f56b1696ec3ec9e8 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Mon, 23 Jun 2025 11:39:20 +0200
Subject: [PATCH 1254/1322] [lldb] Add BRIEF_DOCS for cmake properties defined
 in #144543

It seems some cmake versions require it.
---
 lldb/cmake/modules/LLDBLayeringCheck.cmake | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/lldb/cmake/modules/LLDBLayeringCheck.cmake b/lldb/cmake/modules/LLDBLayeringCheck.cmake
index 082bbe37a980..1669f6150cca 100644
--- a/lldb/cmake/modules/LLDBLayeringCheck.cmake
+++ b/lldb/cmake/modules/LLDBLayeringCheck.cmake
@@ -1,11 +1,17 @@
-define_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND)
-define_property(TARGET PROPERTY LLDB_PLUGIN_KIND INHERITED)
+define_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND
+  BRIEF_DOCS "LLDB plugin kind (Process, SymbolFile, etc.")
+define_property(TARGET PROPERTY LLDB_PLUGIN_KIND INHERITED
+  BRIEF_DOCS "LLDB plugin kind (Process, SymbolFile, etc.")
 
-define_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES)
-define_property(TARGET PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES INHERITED)
+define_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES
+  BRIEF_DOCS "LLDB plugin kinds which the plugin can depend on")
+define_property(TARGET PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES INHERITED
+  BRIEF_DOCS "LLDB plugin kinds which the plugin can depend on")
 
-define_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES)
-define_property(TARGET PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES INHERITED)
+define_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
+  BRIEF_DOCS "LLDB plugin kinds which are depended on for historic reasons.")
+define_property(TARGET PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES INHERITED
+  BRIEF_DOCS "LLDB plugin kinds which are depended on for historic reasons.")
 
 option(LLDB_GENERATE_PLUGIN_DEP_GRAPH OFF)
 

From 0662045bdf6ca2186511dd84f3b2248bedece690 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 23 Jun 2025 12:03:35 +0200
Subject: [PATCH 1255/1322] [TLI] Add support for pvalloc() (#144949)

While pvalloc() is a legacy POSIX function, it remains widely available
in common C libraries like glibc.

Model pvalloc() in TargetLibraryInfo, allowing LLVM to correctly infer
its attributes.
---
 llvm/include/llvm/Analysis/TargetLibraryInfo.def    | 5 +++++
 llvm/lib/Analysis/TargetLibraryInfo.cpp             | 1 +
 llvm/lib/Transforms/Utils/BuildLibCalls.cpp         | 4 +++-
 llvm/test/Transforms/InferFunctionAttrs/annotate.ll | 4 ++++
 llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml | 4 ++--
 llvm/unittests/Analysis/TargetLibraryInfoTest.cpp   | 1 +
 6 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
index db566b8ee610..014988299d37 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
@@ -2046,6 +2046,11 @@ TLI_DEFINE_ENUM_INTERNAL(puts)
 TLI_DEFINE_STRING_INTERNAL("puts")
 TLI_DEFINE_SIG_INTERNAL(Int, Ptr)
 
+/// void *pvalloc(size_t size);
+TLI_DEFINE_ENUM_INTERNAL(pvalloc)
+TLI_DEFINE_STRING_INTERNAL("pvalloc")
+TLI_DEFINE_SIG_INTERNAL(Ptr, SizeT)
+
 /// ssize_t pwrite(int fildes, const void *buf, size_t nbyte, off_t offset);
 TLI_DEFINE_ENUM_INTERNAL(pwrite)
 TLI_DEFINE_STRING_INTERNAL("pwrite")
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index a3ed09313439..28a5cdb5561d 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -814,6 +814,7 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc_pclose);
     TLI.setUnavailable(LibFunc_popen);
     TLI.setUnavailable(LibFunc_pread);
+    TLI.setUnavailable(LibFunc_pvalloc);
     TLI.setUnavailable(LibFunc_pwrite);
     TLI.setUnavailable(LibFunc_read);
     TLI.setUnavailable(LibFunc_readlink);
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index a5f6047b0b7a..573a78150ff3 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -514,10 +514,12 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
   case LibFunc_valloc:
   case LibFunc_malloc:
   case LibFunc_vec_malloc:
+    Changed |= setAllocSize(F, 0, std::nullopt);
+    [[fallthrough]];
+  case LibFunc_pvalloc:
     Changed |= setAllocFamily(F, TheLibFunc == LibFunc_vec_malloc ? "vec_malloc"
                                                                   : "malloc");
     Changed |= setAllocKind(F, AllocFnKind::Alloc | AllocFnKind::Uninitialized);
-    Changed |= setAllocSize(F, 0, std::nullopt);
     Changed |= setOnlyAccessesInaccessibleMemory(F);
     Changed |= setRetAndArgsNoUndef(F);
     Changed |= setDoesNotThrow(F);
diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
index 9e56b01c6b31..51e22bb86f33 100644
--- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -812,6 +812,9 @@ declare i32 @putchar_unlocked(i32)
 ; CHECK: declare noundef i32 @puts(ptr noundef readonly captures(none)) [[NOFREE_NOUNWIND]]
 declare i32 @puts(ptr)
 
+; CHECK: declare noalias noundef ptr @pvalloc(i64 noundef) [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN_ALLOCKIND_ALLOCUNINIT_FAMILY_MALLOC:#[0-9]+]]
+declare ptr @pvalloc(i64)
+
 ; CHECK: declare noundef i64 @pwrite(i32 noundef, ptr noundef readonly captures(none), i64 noundef, i64 noundef) [[NOFREE]]
 declare i64 @pwrite(i32, ptr, i64, i64)
 
@@ -1195,6 +1198,7 @@ declare void @memset_pattern16(ptr, ptr, i64)
 ; CHECK-DAG: attributes [[NOFREE_NOUNWIND_READONLY]] = { nofree nounwind memory(read) }
 ; CHECK-DAG: attributes [[INACCESSIBLEMEMORARGMEMONLY_NOUNWIND_WILLRETURN_ALLOCKIND_FREE_FAMILY_MALLOC]] = { mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" }
 ; CHECK-DAG: attributes [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN_ALLOCKIND_ALLOCUNINIT_ALLOCSIZE0_FAMILY_MALLOC]] = { mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") allocsize(0) memory(inaccessiblemem: readwrite) "alloc-family"="malloc" }
+; CHECK-DAG: attributes [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN_ALLOCKIND_ALLOCUNINIT_FAMILY_MALLOC]] = { mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") memory(inaccessiblemem: readwrite) "alloc-family"="malloc" }
 ; CHECK-DAG: attributes [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]] = { mustprogress nocallback nofree nounwind willreturn memory(argmem: read) }
 ; CHECK-DAG: attributes [[NOFREE]] = { nofree }
 ; CHECK-DAG: attributes [[ARGMEMONLY_NOFREE_NOUNWIND]] = { nocallback nofree nounwind memory(argmem: readwrite) }
diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
index 2d23b15d74b1..51a5a63ba370 100644
--- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
+++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
@@ -54,10 +54,10 @@
 ## the exact count first; the two directives should add up to that.
 ## Yes, this means additions to TLI will fail this test, but the argument
 ## to -COUNT can't be an expression.
-# AVAIL: TLI knows 523 symbols, 289 available
+# AVAIL: TLI knows 524 symbols, 289 available
 # AVAIL-COUNT-289: {{^}} available
 # AVAIL-NOT:       {{^}} available
-# UNAVAIL-COUNT-234: not available
+# UNAVAIL-COUNT-235: not available
 # UNAVAIL-NOT:       not available
 
 ## This is a large file so it's worth telling lit to stop here.
diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
index 97722483aefe..2f1bcbae4fc5 100644
--- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -315,6 +315,7 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
       "declare i32 @putchar(i32)\n"
       "declare i32 @putchar_unlocked(i32)\n"
       "declare i32 @puts(i8*)\n"
+      "declare i8* @pvalloc(i64)\n"
       "declare void @qsort(i8*, i64, i64, i32 (i8*, i8*)*)\n"
       "declare i64 @readlink(i8*, i8*, i64)\n"
       "declare i8* @realloc(i8*, i64)\n"

From 2545d6f7237bb27620e6aef10eee487af8ee8471 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 23 Jun 2025 11:06:25 +0100
Subject: [PATCH 1256/1322] [ARM] Add MVE test coverage for LD2/ST2 shuffle
 costs. NFC

---
 .../test/Analysis/CostModel/ARM/mve-minmax.ll |   2 +-
 .../CostModel/ARM/mve-shuffle-loadstore.ll    | 653 ++++++++++++++++++
 2 files changed, 654 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll

diff --git a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll
index cacad28ef6ec..7f1005285747 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=MVE-RECIP,MVEI-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=MVE-SIZE,MVEI-SIZE
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.-none-eabimain -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=MVE-RECIP,MVEF-RECIP
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=MVE-RECIP,MVEF-RECIP
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=MVE-SIZE,MVEF-SIZE
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll b/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll
new file mode 100644
index 000000000000..6a327cfed4e4
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll
@@ -0,0 +1,653 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF2
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF2
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -mve-max-interleave-factor=4 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF4
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -mve-max-interleave-factor=4 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UF4
+
+define void @vld2(ptr %p) {
+; CHECK-LABEL: 'vld2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <4 x i8>, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <8 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = load <16 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8 = load <32 x i8>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <4 x i16>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = load <8 x i16>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = load <16 x i16>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16 = load <32 x i16>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = load <4 x i32>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i32 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <4 x i64>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <8 x i64>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <16 x i64>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <32 x i64>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v4i8 = load <4 x i8>, ptr %p
+  %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
+  %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
+  %v8i8 = load <8 x i8>, ptr %p
+  %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %v16i8 = load <16 x i8>, ptr %p
+  %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %v32i8 = load <32 x i8>, ptr %p
+  %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+
+  %v4i16 = load <4 x i16>, ptr %p
+  %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
+  %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
+  %v8i16 = load <8 x i16>, ptr %p
+  %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %v16i16 = load <16 x i16>, ptr %p
+  %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %v32i16 = load <32 x i16>, ptr %p
+  %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+
+  %v4i32 = load <4 x i32>, ptr %p
+  %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+  %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+  %v8i32 = load <8 x i32>, ptr %p
+  %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %v16i32 = load <16 x i32>, ptr %p
+  %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %v32i32 = load <32 x i32>, ptr %p
+  %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+
+  %v2i64 = load <4 x i64>, ptr %p
+  %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+  %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
+  %v4i64 = load <8 x i64>, ptr %p
+  %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %v8i64 = load <16 x i64>, ptr %p
+  %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %v16i64 = load <32 x i64>, ptr %p
+  %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+
+  ret void
+}
+
+
+define void @vld3(ptr %p) {
+; CHECK-LABEL: 'vld3'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v2i8 = load <6 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %v4i8 = load <12 x i8>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = load <24 x i8>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = load <48 x i8>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v2i16 = load <6 x i16>, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = load <12 x i16>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = load <24 x i16>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = load <48 x i16>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32 = load <6 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = load <12 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = load <24 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32 = load <48 x i32>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i64 = load <6 x i64>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i64 = load <12 x i64>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64 = load <24 x i64>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64 = load <48 x i64>, ptr %p, align 512
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v16i64_1 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2i8 = load <6 x i8>, ptr %p
+  %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
+  %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
+  %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
+  %v4i8 = load <12 x i8>, ptr %p
+  %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %v8i8 = load <24 x i8>, ptr %p
+  %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %v16i8 = load <48 x i8>, ptr %p
+  %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+
+  %v2i16 = load <6 x i16>, ptr %p
+  %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
+  %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
+  %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
+  %v4i16 = load <12 x i16>, ptr %p
+  %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %v8i16 = load <24 x i16>, ptr %p
+  %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %v16i16 = load <48 x i16>, ptr %p
+  %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+
+  %v2i32 = load <6 x i32>, ptr %p
+  %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+  %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+  %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+  %v4i32 = load <12 x i32>, ptr %p
+  %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %v8i32 = load <24 x i32>, ptr %p
+  %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %v16i32 = load <48 x i32>, ptr %p
+  %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+
+  %v2i64 = load <6 x i64>, ptr %p
+  %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
+  %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
+  %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
+  %v4i64 = load <12 x i64>, ptr %p
+  %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %v8i64 = load <24 x i64>, ptr %p
+  %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %v16i64 = load <48 x i64>, ptr %p
+  %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %v16i64_1 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+
+  ret void
+}
+
+define void @vld4(ptr %p) {
+; CHECK-LABEL: 'vld4'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2i8 = load <8 x i8>, ptr %p
+  %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
+  %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
+  %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
+  %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
+  %v4i8 = load <16 x i8>, ptr %p
+  %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %v8i8 = load <32 x i8>, ptr %p
+  %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %v16i8 = load <64 x i8>, ptr %p
+  %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+
+  %v2i16 = load <8 x i16>, ptr %p
+  %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+  %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
+  %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
+  %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
+  %v4i16 = load <16 x i16>, ptr %p
+  %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %v8i16 = load <32 x i16>, ptr %p
+  %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %v16i16 = load <64 x i16>, ptr %p
+  %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+
+  %v2i32 = load <8 x i32>, ptr %p
+  %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
+  %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
+  %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
+  %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
+  %v4i32 = load <16 x i32>, ptr %p
+  %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %v8i32 = load <32 x i32>, ptr %p
+  %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %v16i32 = load <64 x i32>, ptr %p
+  %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+
+  %v2i64 = load <8 x i64>, ptr %p
+  %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+  %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+  %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+  %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+  %v4i64 = load <16 x i64>, ptr %p
+  %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  %v8i64 = load <32 x i64>, ptr %p
+  %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  %v16i64 = load <64 x i64>, ptr %p
+  %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+
+  ret void
+}
+
+define void @vst2(ptr %p) {
+; CHECK-LABEL: 'vst2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i8> %v4i8, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> %v8i8, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16i8, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i8> %v32i8, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i16> %v4i16, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v8i16, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i16> %v16i16, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i16> %v32i16, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> %v4i32, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i32> %v8i32, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i32> %v16i32, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i32> %v32i32, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i64> %v4i64, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i64> %v8i64, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i64> %v16i64, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i64> %v32i64, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i8> %v4i8, ptr %p
+  %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i8> %v8i8, ptr %p
+  %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %v16i8, ptr %p
+  %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  store <32 x i8> %v32i8, ptr %p
+
+  %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i16> %v4i16, ptr %p
+  %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i16> %v8i16, ptr %p
+  %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i16> %v16i16, ptr %p
+  %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  store <32 x i16> %v32i16, ptr %p
+
+  %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32> %v4i32, ptr %p
+  %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %v8i32, ptr %p
+  %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i32> %v16i32, ptr %p
+  %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  store <32 x i32> %v32i32, ptr %p
+
+  %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i64> %v4i64, ptr %p
+  %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i64> %v8i64, ptr %p
+  %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i64> %v16i64, ptr %p
+  %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  store <32 x i64> %v32i64, ptr %p
+
+  ret void
+}
+
+
+define void @vst3(ptr %p) {
+; CHECK-LABEL: 'vst3'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: store <6 x i8> %v8i8, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: store <12 x i8> %v16i8, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <24 x i8> %v32i8, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <48 x i8> %v64i8, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: store <6 x i16> %v8i16, ptr %p, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <12 x i16> %v16i16, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <24 x i16> %v32i16, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <48 x i16> %v64i16, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <6 x i32> %v8i32, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <12 x i32> %v16i32, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <24 x i32> %v32i32, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <48 x i32> %v64i32, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <6 x i64> %v8i64, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <12 x i64> %v16i64, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <24 x i64> %v32i64, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %v64i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <48 x i64> %v64i64, ptr %p, align 512
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i8> %v8i8, ptr %p
+  %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i8> %v16i8, ptr %p
+  %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i8> %v32i8, ptr %p
+  %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x i8> %v64i8, ptr %p
+
+  %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i16> %v8i16, ptr %p
+  %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i16> %v16i16, ptr %p
+  %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i16> %v32i16, ptr %p
+  %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x i16> %v64i16, ptr %p
+
+  %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i32> %v8i32, ptr %p
+  %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %v16i32, ptr %p
+  %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i32> %v32i32, ptr %p
+  %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x i32> %v64i32, ptr %p
+
+  %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i64> %v8i64, ptr %p
+  %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i64> %v16i64, ptr %p
+  %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i64> %v32i64, ptr %p
+  %v64i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x i64> %v64i64, ptr %p
+
+  ret void
+}
+
+
+define void @vst4(ptr %p) {
+; CHECK-LABEL: 'vst4'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> %v8i8, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16i8, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i8> %v32i8, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <64 x i8> %v64i8, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v8i16, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i16> %v16i16, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i16> %v32i16, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <64 x i16> %v64i16, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i32> %v8i32, ptr %p, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i32> %v16i32, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i32> %v32i32, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <64 x i32> %v64i32, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i64> %v8i64, ptr %p, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i64> %v16i64, ptr %p, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i64> %v32i64, ptr %p, align 256
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i64> %v64i64, ptr %p, align 512
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i8> %v8i8, ptr %p
+  %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i8> %v16i8, ptr %p
+  %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i8> %v32i8, ptr %p
+  %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x i8> %v64i8, ptr %p
+
+  %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i16> %v8i16, ptr %p
+  %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i16> %v16i16, ptr %p
+  %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i16> %v32i16, ptr %p
+  %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x i16> %v64i16, ptr %p
+
+  %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i32> %v8i32, ptr %p
+  %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %v16i32, ptr %p
+  %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i32> %v32i32, ptr %p
+  %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x i32> %v64i32, ptr %p
+
+  %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i64> %v8i64, ptr %p
+  %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i64> %v16i64, ptr %p
+  %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i64> %v32i64, ptr %p
+  %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x i64> %v64i64, ptr %p
+
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-UF2: {{.*}}
+; CHECK-UF4: {{.*}}

From b1b8f67eab084f235e9f9b145e4b86b823cccc2b Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Mon, 23 Jun 2025 12:07:00 +0200
Subject: [PATCH 1257/1322] [mlir][Transforms] Add 1:N support to
 `replaceUsesOfBlockArgument` (#145171)

This commit adds 1:N support to
`ConversionPatternRewriter::replaceUsesOfBlockArgument`. This was one of
the few remaining dialect conversion APIs that does not support 1:N
conversions yet.

This commit also reuses `replaceUsesOfBlockArgument` in the
implementation of `applySignatureConversion`. This is in preparation of
the One-Shot Dialect Conversion refactoring. The goal is to bring the
`applySignatureConversion` implementation into a state where it works
both with and without rollbacks. To that end, `applySignatureConversion`
should not directly access the `mapping`.
---
 .../mlir/Transforms/DialectConversion.h       |  5 +-
 mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp |  2 +-
 .../Transforms/Utils/DialectConversion.cpp    | 40 +++++++++------
 mlir/test/Transforms/test-legalizer.mlir      | 31 ++++++++---
 mlir/test/lib/Dialect/Test/TestPatterns.cpp   | 51 +++++++++++--------
 5 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 5a5f116073a9..81858812d262 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -763,8 +763,9 @@ public:
       Region *region, const TypeConverter &converter,
       TypeConverter::SignatureConversion *entryConversion = nullptr);
 
-  /// Replace all the uses of the block argument `from` with value `to`.
-  void replaceUsesOfBlockArgument(BlockArgument from, Value to);
+  /// Replace all the uses of the block argument `from` with `to`. This
+  /// function supports both 1:1 and 1:N replacements.
+  void replaceUsesOfBlockArgument(BlockArgument from, ValueRange to);
 
   /// Return the converted value of 'key' with a type defined by the type
   /// converter of the currently executing pattern. Return nullptr in the case
diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index 4499cbd4d1a2..72977e6cc0e4 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -294,7 +294,7 @@ static void restoreByValRefArgumentType(
     Type resTy = typeConverter.convertType(
         cast<TypeAttr>(byValRefAttr->getValue()).getValue());
 
-    auto valueArg = rewriter.create<LLVM::LoadOp>(arg.getLoc(), resTy, arg);
+    Value valueArg = rewriter.create<LLVM::LoadOp>(arg.getLoc(), resTy, arg);
     rewriter.replaceUsesOfBlockArgument(arg, valueArg);
   }
 }
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 774d58973eb9..9cb6f2ba1eaa 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -948,6 +948,11 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   /// uses.
   void replaceOp(Operation *op, SmallVector<SmallVector<Value>> &&newValues);
 
+  /// Replace the given block argument with the given values. The specified
+  /// converter is used to build materializations (if necessary).
+  void replaceUsesOfBlockArgument(BlockArgument from, ValueRange to,
+                                  const TypeConverter *converter);
+
   /// Erase the given block and its contents.
   void eraseBlock(Block *block);
 
@@ -1434,12 +1439,15 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
     if (!inputMap) {
       // This block argument was dropped and no replacement value was provided.
       // Materialize a replacement value "out of thin air".
-      buildUnresolvedMaterialization(
-          MaterializationKind::Source,
-          OpBuilder::InsertPoint(newBlock, newBlock->begin()), origArg.getLoc(),
-          /*valuesToMap=*/{origArg}, /*inputs=*/ValueRange(),
-          /*outputTypes=*/origArgType, /*originalType=*/Type(), converter);
-      appendRewrite<ReplaceBlockArgRewrite>(block, origArg, converter);
+      Value mat =
+          buildUnresolvedMaterialization(
+              MaterializationKind::Source,
+              OpBuilder::InsertPoint(newBlock, newBlock->begin()),
+              origArg.getLoc(),
+              /*valuesToMap=*/{}, /*inputs=*/ValueRange(),
+              /*outputTypes=*/origArgType, /*originalType=*/Type(), converter)
+              .front();
+      replaceUsesOfBlockArgument(origArg, mat, converter);
       continue;
     }
 
@@ -1448,17 +1456,15 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
       assert(inputMap->size == 0 &&
              "invalid to provide a replacement value when the argument isn't "
              "dropped");
-      mapping.map(origArg, inputMap->replacementValues);
-      appendRewrite<ReplaceBlockArgRewrite>(block, origArg, converter);
+      replaceUsesOfBlockArgument(origArg, inputMap->replacementValues,
+                                 converter);
       continue;
     }
 
     // This is a 1->1+ mapping.
     auto replArgs =
         newBlock->getArguments().slice(inputMap->inputNo, inputMap->size);
-    ValueVector replArgVals = llvm::to_vector_of<Value, 1>(replArgs);
-    mapping.map(origArg, std::move(replArgVals));
-    appendRewrite<ReplaceBlockArgRewrite>(block, origArg, converter);
+    replaceUsesOfBlockArgument(origArg, replArgs, converter);
   }
 
   appendRewrite<BlockTypeConversionRewrite>(/*origBlock=*/block, newBlock);
@@ -1612,6 +1618,12 @@ void ConversionPatternRewriterImpl::replaceOp(
   op->walk([&](Operation *op) { replacedOps.insert(op); });
 }
 
+void ConversionPatternRewriterImpl::replaceUsesOfBlockArgument(
+    BlockArgument from, ValueRange to, const TypeConverter *converter) {
+  appendRewrite<ReplaceBlockArgRewrite>(from.getOwner(), from, converter);
+  mapping.map(from, to);
+}
+
 void ConversionPatternRewriterImpl::eraseBlock(Block *block) {
   assert(!wasOpReplaced(block->getParentOp()) &&
          "attempting to erase a block within a replaced/erased op");
@@ -1744,7 +1756,7 @@ FailureOr<Block *> ConversionPatternRewriter::convertRegionTypes(
 }
 
 void ConversionPatternRewriter::replaceUsesOfBlockArgument(BlockArgument from,
-                                                           Value to) {
+                                                           ValueRange to) {
   LLVM_DEBUG({
     impl->logger.startLine() << "** Replace Argument : '" << from << "'";
     if (Operation *parentOp = from.getOwner()->getParentOp()) {
@@ -1754,9 +1766,7 @@ void ConversionPatternRewriter::replaceUsesOfBlockArgument(BlockArgument from,
       impl->logger.getOStream() << " (unlinked block)\n";
     }
   });
-  impl->appendRewrite<ReplaceBlockArgRewrite>(from.getOwner(), from,
-                                              impl->currentTypeConverter);
-  impl->mapping.map(from, to);
+  impl->replaceUsesOfBlockArgument(from, to, impl->currentTypeConverter);
 }
 
 Value ConversionPatternRewriter::getRemappedValue(Value key) {
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 204c8c145682..79518b04e715 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -300,18 +300,35 @@ func.func @create_illegal_block() {
 // -----
 
 // CHECK-LABEL: @undo_block_arg_replace
+// expected-remark@+1{{applyPartialConversion failed}}
+module {
 func.func @undo_block_arg_replace() {
-  // expected-remark@+1 {{op 'test.undo_block_arg_replace' is not legalizable}}
-  "test.undo_block_arg_replace"() ({
-  ^bb0(%arg0: i32):
-    // CHECK: ^bb0(%[[ARG:.*]]: i32):
-    // CHECK-NEXT: "test.return"(%[[ARG]]) : (i32)
+  // expected-error@+1{{failed to legalize operation 'test.block_arg_replace' that was explicitly marked illegal}}
+  "test.block_arg_replace"() ({
+  ^bb0(%arg0: i32, %arg1: i16):
+    // CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i16):
+    // CHECK-NEXT: "test.return"(%[[ARG0]]) : (i32)
 
     "test.return"(%arg0) : (i32) -> ()
-  }) : () -> ()
-  // expected-remark@+1 {{op 'func.return' is not legalizable}}
+  }) {trigger_rollback} : () -> ()
   return
 }
+}
+
+// -----
+
+// CHECK-LABEL: @replace_block_arg_1_to_n
+func.func @replace_block_arg_1_to_n() {
+  // CHECK: "test.block_arg_replace"
+  "test.block_arg_replace"() ({
+  ^bb0(%arg0: i32, %arg1: i16):
+    // CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i16):
+    // CHECK: %[[cast:.*]] = "test.cast"(%[[ARG1]], %[[ARG1]]) : (i16, i16) -> i32
+    // CHECK-NEXT: "test.return"(%[[cast]]) : (i32)
+    "test.return"(%arg0) : (i32) -> ()
+  }) : () -> ()
+  "test.return"() : () -> ()
+}
 
 // -----
 
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index a4cb705e6c8e..9126736d1d17 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -891,20 +891,25 @@ struct TestCreateIllegalBlock : public RewritePattern {
   }
 };
 
-/// A simple pattern that tests the undo mechanism when replacing the uses of a
-/// block argument.
-struct TestUndoBlockArgReplace : public ConversionPattern {
-  TestUndoBlockArgReplace(MLIRContext *ctx)
-      : ConversionPattern("test.undo_block_arg_replace", /*benefit=*/1, ctx) {}
+/// A simple pattern that tests the "replaceUsesOfBlockArgument" API.
+struct TestBlockArgReplace : public ConversionPattern {
+  TestBlockArgReplace(MLIRContext *ctx, const TypeConverter &converter)
+      : ConversionPattern(converter, "test.block_arg_replace", /*benefit=*/1,
+                          ctx) {}
 
   LogicalResult
   matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const final {
-    auto illegalOp =
-        rewriter.create<ILLegalOpF>(op->getLoc(), rewriter.getF32Type());
+    // Replace the first block argument with 2x the second block argument.
+    Value repl = op->getRegion(0).getArgument(1);
     rewriter.replaceUsesOfBlockArgument(op->getRegion(0).getArgument(0),
-                                        illegalOp->getResult(0));
-    rewriter.modifyOpInPlace(op, [] {});
+                                        {repl, repl});
+    rewriter.modifyOpInPlace(op, [&] {
+      // If the "trigger_rollback" attribute is set, keep the op illegal, so
+      // that a rollback is triggered.
+      if (!op->hasAttr("trigger_rollback"))
+        op->setAttr("is_legal", rewriter.getUnitAttr());
+    });
     return success();
   }
 };
@@ -1375,20 +1380,19 @@ struct TestLegalizePatternDriver
     TestTypeConverter converter;
     mlir::RewritePatternSet patterns(&getContext());
     populateWithGenerated(patterns);
-    patterns
-        .add<TestRegionRewriteBlockMovement, TestDetachedSignatureConversion,
-             TestRegionRewriteUndo, TestCreateBlock, TestCreateIllegalBlock,
-             TestUndoBlockArgReplace, TestUndoBlockErase, TestSplitReturnType,
-             TestChangeProducerTypeI32ToF32, TestChangeProducerTypeF32ToF64,
-             TestChangeProducerTypeF32ToInvalid, TestUpdateConsumerType,
-             TestNonRootReplacement, TestBoundedRecursiveRewrite,
-             TestNestedOpCreationUndoRewrite, TestReplaceEraseOp,
-             TestCreateUnregisteredOp, TestUndoMoveOpBefore,
-             TestUndoPropertiesModification, TestEraseOp,
-             TestRepetitive1ToNConsumer>(&getContext());
+    patterns.add<
+        TestRegionRewriteBlockMovement, TestDetachedSignatureConversion,
+        TestRegionRewriteUndo, TestCreateBlock, TestCreateIllegalBlock,
+        TestUndoBlockErase, TestSplitReturnType, TestChangeProducerTypeI32ToF32,
+        TestChangeProducerTypeF32ToF64, TestChangeProducerTypeF32ToInvalid,
+        TestUpdateConsumerType, TestNonRootReplacement,
+        TestBoundedRecursiveRewrite, TestNestedOpCreationUndoRewrite,
+        TestReplaceEraseOp, TestCreateUnregisteredOp, TestUndoMoveOpBefore,
+        TestUndoPropertiesModification, TestEraseOp,
+        TestRepetitive1ToNConsumer>(&getContext());
     patterns.add<TestDropOpSignatureConversion, TestDropAndReplaceInvalidOp,
-                 TestPassthroughInvalidOp, TestMultiple1ToNReplacement>(
-        &getContext(), converter);
+                 TestPassthroughInvalidOp, TestMultiple1ToNReplacement,
+                 TestBlockArgReplace>(&getContext(), converter);
     patterns.add<TestConvertBlockArgs>(converter, &getContext());
     mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns,
                                                               converter);
@@ -1413,6 +1417,9 @@ struct TestLegalizePatternDriver
     });
     target.addDynamicallyLegalOp<func::CallOp>(
         [&](func::CallOp op) { return converter.isLegal(op); });
+    target.addDynamicallyLegalOp(
+        OperationName("test.block_arg_replace", &getContext()),
+        [](Operation *op) { return op->hasAttr("is_legal"); });
 
     // TestCreateUnregisteredOp creates `arith.constant` operation,
     // which was not added to target intentionally to test

From 18f667d804144e74d3ba2c67ee6f3610916002a8 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Mon, 23 Jun 2025 12:05:32 +0200
Subject: [PATCH 1258/1322] Revert "[lldb/cmake] Plugin layering enforcement
 mechanism (#144543)"

Causes failures on several bots.

This reverts commits 714b2fdf3a385e5b9a95c435f56b1696ec3ec9e8 and
e7c1da7c8ef31c258619c1668062985e7ae83b70.
---
 lldb/CMakeLists.txt                           |  3 -
 lldb/cmake/modules/LLDBLayeringCheck.cmake    | 74 -------------------
 lldb/docs/resources/contributing.rst          | 50 -------------
 lldb/source/Plugins/ABI/CMakeLists.txt        |  6 --
 .../Plugins/Architecture/CMakeLists.txt       |  2 -
 .../Plugins/Disassembler/CMakeLists.txt       |  2 -
 .../Plugins/DynamicLoader/CMakeLists.txt      |  7 --
 .../Plugins/ExpressionParser/CMakeLists.txt   |  2 -
 .../source/Plugins/Instruction/CMakeLists.txt |  2 -
 .../InstrumentationRuntime/CMakeLists.txt     |  2 -
 lldb/source/Plugins/JITLoader/CMakeLists.txt  |  3 -
 lldb/source/Plugins/Language/CMakeLists.txt   |  6 --
 .../Plugins/LanguageRuntime/CMakeLists.txt    |  3 -
 .../Plugins/MemoryHistory/CMakeLists.txt      |  2 -
 .../Plugins/ObjectContainer/CMakeLists.txt    |  2 -
 lldb/source/Plugins/ObjectFile/CMakeLists.txt |  2 -
 .../Plugins/OperatingSystem/CMakeLists.txt    |  2 -
 lldb/source/Plugins/Platform/CMakeLists.txt   |  7 --
 lldb/source/Plugins/Process/CMakeLists.txt    |  5 --
 .../Plugins/Process/Utility/CMakeLists.txt    |  3 -
 lldb/source/Plugins/REPL/CMakeLists.txt       |  3 -
 .../RegisterTypeBuilder/CMakeLists.txt        |  2 -
 .../Plugins/ScriptInterpreter/CMakeLists.txt  |  2 -
 .../Plugins/StructuredData/CMakeLists.txt     |  2 -
 lldb/source/Plugins/SymbolFile/CMakeLists.txt |  7 --
 .../Plugins/SymbolLocator/CMakeLists.txt      |  2 -
 .../Plugins/SymbolVendor/CMakeLists.txt       |  3 -
 .../Plugins/SystemRuntime/CMakeLists.txt      |  3 -
 lldb/source/Plugins/Trace/CMakeLists.txt      |  2 -
 .../Plugins/TraceExporter/CMakeLists.txt      |  2 -
 lldb/source/Plugins/TypeSystem/CMakeLists.txt |  4 -
 .../Plugins/UnwindAssembly/CMakeLists.txt     |  2 -
 32 files changed, 219 deletions(-)
 delete mode 100644 lldb/cmake/modules/LLDBLayeringCheck.cmake

diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt
index e3b72e94d4be..2aaf75dd87bc 100644
--- a/lldb/CMakeLists.txt
+++ b/lldb/CMakeLists.txt
@@ -37,7 +37,6 @@ endif()
 
 include(LLDBConfig)
 include(AddLLDB)
-include(LLDBLayeringCheck)
 
 set(LLDB_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
@@ -128,8 +127,6 @@ add_subdirectory(source)
 add_subdirectory(tools)
 add_subdirectory(docs)
 
-check_lldb_plugin_layering()
-
 if (LLDB_ENABLE_PYTHON)
   if(LLDB_BUILD_FRAMEWORK)
     set(lldb_python_target_dir "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/Resources/Python/lldb")
diff --git a/lldb/cmake/modules/LLDBLayeringCheck.cmake b/lldb/cmake/modules/LLDBLayeringCheck.cmake
deleted file mode 100644
index 1669f6150cca..000000000000
--- a/lldb/cmake/modules/LLDBLayeringCheck.cmake
+++ /dev/null
@@ -1,74 +0,0 @@
-define_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND
-  BRIEF_DOCS "LLDB plugin kind (Process, SymbolFile, etc.")
-define_property(TARGET PROPERTY LLDB_PLUGIN_KIND INHERITED
-  BRIEF_DOCS "LLDB plugin kind (Process, SymbolFile, etc.")
-
-define_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES
-  BRIEF_DOCS "LLDB plugin kinds which the plugin can depend on")
-define_property(TARGET PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES INHERITED
-  BRIEF_DOCS "LLDB plugin kinds which the plugin can depend on")
-
-define_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
-  BRIEF_DOCS "LLDB plugin kinds which are depended on for historic reasons.")
-define_property(TARGET PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES INHERITED
-  BRIEF_DOCS "LLDB plugin kinds which are depended on for historic reasons.")
-
-option(LLDB_GENERATE_PLUGIN_DEP_GRAPH OFF)
-
-function(check_lldb_plugin_layering)
-  get_property(plugins GLOBAL PROPERTY LLDB_PLUGINS)
-  foreach(plugin ${plugins})
-    get_property(plugin_kind TARGET ${plugin} PROPERTY LLDB_PLUGIN_KIND)
-    get_property(acceptable_deps TARGET ${plugin}
-      PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES)
-    get_property(tolerated_deps TARGET ${plugin}
-      PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES)
-
-    # A plugin is always permitted to depend on its own kind for the purposes
-    # subclassing. Ideally the intra-kind dependencies should not form a loop,
-    # but we're not checking that here.
-    list(APPEND acceptable_deps ${plugin_kind})
-
-    list(APPEND all_plugin_kinds ${plugin_kind})
-
-    get_property(link_libs TARGET ${plugin} PROPERTY LINK_LIBRARIES)
-    foreach(link_lib ${link_libs})
-      if(link_lib IN_LIST plugins)
-        get_property(lib_kind TARGET ${link_lib} PROPERTY LLDB_PLUGIN_KIND)
-        if (lib_kind)
-          if (lib_kind IN_LIST acceptable_deps)
-            set(dep_kind green)
-          elseif (lib_kind IN_LIST tolerated_deps)
-            set(dep_kind yellow)
-          else()
-            set(dep_kind red)
-            message(SEND_ERROR "Plugin ${plugin} cannot depend on ${lib_kind} "
-              "plugin ${link_lib}")
-          endif()
-          list(APPEND dep_${dep_kind}_${plugin_kind}_${lib_kind} ${plugin})
-        endif()
-      endif()
-    endforeach()
-  endforeach()
-
-  if (LLDB_GENERATE_PLUGIN_DEP_GRAPH)
-    set(dep_graph "digraph Plugins {\n")
-    list(REMOVE_DUPLICATES all_plugin_kinds)
-    foreach (from ${all_plugin_kinds})
-      foreach (to ${all_plugin_kinds})
-        foreach (dep_kind green yellow red)
-          if (dep_${dep_kind}_${from}_${to})
-            list(REMOVE_DUPLICATES dep_${dep_kind}_${from}_${to})
-            string(REGEX REPLACE "lldbPlugin|${from}" "" short_deps
-              "${dep_${dep_kind}_${from}_${to}}")
-            string(JOIN "\n" plugins ${short_deps})
-            string(APPEND dep_graph
-              "  ${from}->${to}[color=\"${dep_kind}\" label=\"${plugins}\"];\n")
-          endif()
-        endforeach()
-      endforeach()
-    endforeach()
-    string(APPEND dep_graph "}\n")
-    file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/lldb-plugin-deps.dot" "${dep_graph}")
-  endif()
-endfunction()
diff --git a/lldb/docs/resources/contributing.rst b/lldb/docs/resources/contributing.rst
index e7d46c9061d5..48fd000765f6 100644
--- a/lldb/docs/resources/contributing.rst
+++ b/lldb/docs/resources/contributing.rst
@@ -56,56 +56,6 @@ subset of LLDB tests (the API tests) use a different system. Refer to the
 `lldb/test <https://github.com/llvm/llvm-project/tree/main/lldb/test>`_ folder
 for examples.
 
-
-LLDB plugins and their dependencies
------------------------------------
-
-LLDB has a concept of *plugins*, which are used to provide abstraction
-boundaries over functionality that is specific to a certain architecture,
-operating system, programming language, etc. A plugin implements an abstract
-base class (rarely, a set of related base classes), which is a part of LLDB
-core. This setup allows the LLDB core to remain generic while making it possible
-to support for new architectures, languages, and so on. For this to work, all
-code needs to obey certain rules.
-
-The principal rule is that LLDB core (defined as: everything under lldb/source
-*minus* lldb/source/Plugins) must not depend on any specific plugin. The only
-way it can interact with them is through the abstract interface. Explicit
-dependencies such as casting the base class to the plugin type are not permitted
-and neither are more subtle dependencies like checking the name plugin or or
-other situations where some code in LLDB core is tightly coupled to the
-implementation details of a specific plugin.
-
-The rule for interaction between different plugins is more nuanced. We recognize
-that some cross-plugin dependencies are unavoidable or even desirable. For
-example, a plugin may want to extend a plugin of the same kind to
-add/override/refine some functionality (e.g., Android is a "kind of" Linux, but
-it handles some things differently). Alternatively, a plugin of one kind may
-want to build on the functionality offered by a specific plugin of another kind
-(ELFCore Process plugin uses ELF ObjectFile plugin to create a process out of an
-ELF core file).
-
-In cases such as these, direct dependencies are acceptable. However, to keep the
-dependency graph manageable, we still have some rules to govern these
-relationships:
-
-* All dependencies between plugins of the same kind must flow in the same
-  direction (if plugin `A1` depends on plugin `B1`, then `B2` must not depend on
-  `A2`)
-* Dependency graph of plugin kinds must not contain loops (dependencies like
-  `A1->B1`, `B2->C2` and `C3->A3` are forbidden because they induce a cycle in
-  the plugin kind graph even though the plugins themselves are acyclical)
-
-
-The first of these rules is checked via CMake scripts (using the
-`LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES` property). Dependencies in this category
-are expected and permitted (subject to other constraints such as that dependency
-making sense for the particular pair of plugins). Unfortunately, due to historic
-reasons, not all plugin dependencies follow this rule, which is why we have
-another category called `LLDB_TOLERATED_PLUGIN_DEPENDENCIES`. New dependencies
-are forbidden (even though they are accepted by CMake) and existing ones should
-be removed whereever possible.
-
 .. _Error handling:
 
 Error handling and use of assertions in LLDB
diff --git a/lldb/source/Plugins/ABI/CMakeLists.txt b/lldb/source/Plugins/ABI/CMakeLists.txt
index 97a20364ae7d..e33ac8735486 100644
--- a/lldb/source/Plugins/ABI/CMakeLists.txt
+++ b/lldb/source/Plugins/ABI/CMakeLists.txt
@@ -1,9 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ABI)
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
-  ProcessUtility
-  TypeSystem
-)
-
 foreach(target AArch64 ARM ARC Hexagon LoongArch Mips MSP430 PowerPC RISCV SystemZ X86)
   if (${target} IN_LIST LLVM_TARGETS_TO_BUILD)
     add_subdirectory(${target})
diff --git a/lldb/source/Plugins/Architecture/CMakeLists.txt b/lldb/source/Plugins/Architecture/CMakeLists.txt
index 0f898ef5116e..9ed8edf70af3 100644
--- a/lldb/source/Plugins/Architecture/CMakeLists.txt
+++ b/lldb/source/Plugins/Architecture/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Architecture)
-
 add_subdirectory(Arm)
 add_subdirectory(Mips)
 add_subdirectory(PPC64)
diff --git a/lldb/source/Plugins/Disassembler/CMakeLists.txt b/lldb/source/Plugins/Disassembler/CMakeLists.txt
index 1d1ea206e277..bec56765b60f 100644
--- a/lldb/source/Plugins/Disassembler/CMakeLists.txt
+++ b/lldb/source/Plugins/Disassembler/CMakeLists.txt
@@ -1,3 +1 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Disassembler)
-
 add_subdirectory(LLVMC)
diff --git a/lldb/source/Plugins/DynamicLoader/CMakeLists.txt b/lldb/source/Plugins/DynamicLoader/CMakeLists.txt
index 01aba34b9416..30607159acdc 100644
--- a/lldb/source/Plugins/DynamicLoader/CMakeLists.txt
+++ b/lldb/source/Plugins/DynamicLoader/CMakeLists.txt
@@ -1,10 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND DynamicLoader)
-set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES ObjectFile)
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
-  Process # part of a loop (Process<->DynamicLoader).
-  TypeSystem
-)
-
 add_subdirectory(Darwin-Kernel)
 add_subdirectory(FreeBSD-Kernel)
 add_subdirectory(MacOSX-DYLD)
diff --git a/lldb/source/Plugins/ExpressionParser/CMakeLists.txt b/lldb/source/Plugins/ExpressionParser/CMakeLists.txt
index 8a8089879bd9..17c40aee44cc 100644
--- a/lldb/source/Plugins/ExpressionParser/CMakeLists.txt
+++ b/lldb/source/Plugins/ExpressionParser/CMakeLists.txt
@@ -1,3 +1 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ExpressionParser)
-
 add_subdirectory(Clang)
diff --git a/lldb/source/Plugins/Instruction/CMakeLists.txt b/lldb/source/Plugins/Instruction/CMakeLists.txt
index bf48a1c1cc11..46d610f261e0 100644
--- a/lldb/source/Plugins/Instruction/CMakeLists.txt
+++ b/lldb/source/Plugins/Instruction/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Instruction)
-
 add_subdirectory(ARM)
 add_subdirectory(ARM64)
 add_subdirectory(LoongArch)
diff --git a/lldb/source/Plugins/InstrumentationRuntime/CMakeLists.txt b/lldb/source/Plugins/InstrumentationRuntime/CMakeLists.txt
index 2a6cf930945d..7f301bca14a8 100644
--- a/lldb/source/Plugins/InstrumentationRuntime/CMakeLists.txt
+++ b/lldb/source/Plugins/InstrumentationRuntime/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND InstrumentationRuntime)
-
 add_subdirectory(ASan)
 add_subdirectory(ASanLibsanitizers)
 add_subdirectory(MainThreadChecker)
diff --git a/lldb/source/Plugins/JITLoader/CMakeLists.txt b/lldb/source/Plugins/JITLoader/CMakeLists.txt
index ffba54f8b287..e52230199109 100644
--- a/lldb/source/Plugins/JITLoader/CMakeLists.txt
+++ b/lldb/source/Plugins/JITLoader/CMakeLists.txt
@@ -1,4 +1 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND JITLoader)
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES ObjectFile)
-
 add_subdirectory(GDB)
diff --git a/lldb/source/Plugins/Language/CMakeLists.txt b/lldb/source/Plugins/Language/CMakeLists.txt
index b432dd300668..7869074566d1 100644
--- a/lldb/source/Plugins/Language/CMakeLists.txt
+++ b/lldb/source/Plugins/Language/CMakeLists.txt
@@ -1,9 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Language)
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
-  LanguageRuntime
-  TypeSystem
-)
-
 add_subdirectory(ClangCommon)
 add_subdirectory(CPlusPlus)
 add_subdirectory(ObjC)
diff --git a/lldb/source/Plugins/LanguageRuntime/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CMakeLists.txt
index 32528d6d6171..034ae1545ae8 100644
--- a/lldb/source/Plugins/LanguageRuntime/CMakeLists.txt
+++ b/lldb/source/Plugins/LanguageRuntime/CMakeLists.txt
@@ -1,5 +1,2 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND LanguageRuntime)
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES TypeSystem)
-
 add_subdirectory(CPlusPlus)
 add_subdirectory(ObjC)
diff --git a/lldb/source/Plugins/MemoryHistory/CMakeLists.txt b/lldb/source/Plugins/MemoryHistory/CMakeLists.txt
index 50838bb8a077..113f06362578 100644
--- a/lldb/source/Plugins/MemoryHistory/CMakeLists.txt
+++ b/lldb/source/Plugins/MemoryHistory/CMakeLists.txt
@@ -1,3 +1 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND MemoryHistory)
-
 add_subdirectory(asan)
diff --git a/lldb/source/Plugins/ObjectContainer/CMakeLists.txt b/lldb/source/Plugins/ObjectContainer/CMakeLists.txt
index 4ae1bb138a9a..cda0c8151dd8 100644
--- a/lldb/source/Plugins/ObjectContainer/CMakeLists.txt
+++ b/lldb/source/Plugins/ObjectContainer/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ObjectContainer)
-
 add_subdirectory(BSD-Archive)
 add_subdirectory(Universal-Mach-O)
 add_subdirectory(Mach-O-Fileset)
diff --git a/lldb/source/Plugins/ObjectFile/CMakeLists.txt b/lldb/source/Plugins/ObjectFile/CMakeLists.txt
index 6004b1f414d4..7abd0c96f4fd 100644
--- a/lldb/source/Plugins/ObjectFile/CMakeLists.txt
+++ b/lldb/source/Plugins/ObjectFile/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ObjectFile)
-
 add_subdirectory(Breakpad)
 add_subdirectory(COFF)
 add_subdirectory(ELF)
diff --git a/lldb/source/Plugins/OperatingSystem/CMakeLists.txt b/lldb/source/Plugins/OperatingSystem/CMakeLists.txt
index 1a29f9ad3af4..06d909b862a0 100644
--- a/lldb/source/Plugins/OperatingSystem/CMakeLists.txt
+++ b/lldb/source/Plugins/OperatingSystem/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND OperatingSystem)
-
 if (LLDB_ENABLE_PYTHON)
   add_subdirectory(Python)
 endif()
diff --git a/lldb/source/Plugins/Platform/CMakeLists.txt b/lldb/source/Plugins/Platform/CMakeLists.txt
index f4753ab47ce1..0220e734b36d 100644
--- a/lldb/source/Plugins/Platform/CMakeLists.txt
+++ b/lldb/source/Plugins/Platform/CMakeLists.txt
@@ -1,10 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Platform)
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
-  DynamicLoader
-  ObjectContainer
-  Process
-)
-
 add_subdirectory(AIX)
 add_subdirectory(Android)
 add_subdirectory(FreeBSD)
diff --git a/lldb/source/Plugins/Process/CMakeLists.txt b/lldb/source/Plugins/Process/CMakeLists.txt
index bd9b1b86dbf1..058b4b9ad215 100644
--- a/lldb/source/Plugins/Process/CMakeLists.txt
+++ b/lldb/source/Plugins/Process/CMakeLists.txt
@@ -1,8 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Process)
-set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES ObjectFile)
-# This dependency is part of a loop (Process<->DynamicLoader).
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES DynamicLoader)
-
 if (CMAKE_SYSTEM_NAME MATCHES "Linux|Android")
   add_subdirectory(Linux)
   add_subdirectory(POSIX)
diff --git a/lldb/source/Plugins/Process/Utility/CMakeLists.txt b/lldb/source/Plugins/Process/Utility/CMakeLists.txt
index 48646b784f93..fd3019613892 100644
--- a/lldb/source/Plugins/Process/Utility/CMakeLists.txt
+++ b/lldb/source/Plugins/Process/Utility/CMakeLists.txt
@@ -1,6 +1,3 @@
-# TODO: Clean up this directory and its dependencies
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ProcessUtility)
-
 add_lldb_library(lldbPluginProcessUtility
   AuxVector.cpp
   FreeBSDSignals.cpp
diff --git a/lldb/source/Plugins/REPL/CMakeLists.txt b/lldb/source/Plugins/REPL/CMakeLists.txt
index 8b51b3899625..17c40aee44cc 100644
--- a/lldb/source/Plugins/REPL/CMakeLists.txt
+++ b/lldb/source/Plugins/REPL/CMakeLists.txt
@@ -1,4 +1 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND REPL)
-set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES TypeSystem)
-
 add_subdirectory(Clang)
diff --git a/lldb/source/Plugins/RegisterTypeBuilder/CMakeLists.txt b/lldb/source/Plugins/RegisterTypeBuilder/CMakeLists.txt
index 7411b24c86cb..336ae9105802 100644
--- a/lldb/source/Plugins/RegisterTypeBuilder/CMakeLists.txt
+++ b/lldb/source/Plugins/RegisterTypeBuilder/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND RegisterTypeBuilder)
-
 add_lldb_library(lldbPluginRegisterTypeBuilderClang PLUGIN
   RegisterTypeBuilderClang.cpp
 
diff --git a/lldb/source/Plugins/ScriptInterpreter/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/CMakeLists.txt
index 4429b006173a..fa1c72a32fe1 100644
--- a/lldb/source/Plugins/ScriptInterpreter/CMakeLists.txt
+++ b/lldb/source/Plugins/ScriptInterpreter/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND ScriptInterpreter)
-
 add_subdirectory(None)
 if (LLDB_ENABLE_PYTHON)
   add_subdirectory(Python)
diff --git a/lldb/source/Plugins/StructuredData/CMakeLists.txt b/lldb/source/Plugins/StructuredData/CMakeLists.txt
index a6caa233f222..40d64558482d 100644
--- a/lldb/source/Plugins/StructuredData/CMakeLists.txt
+++ b/lldb/source/Plugins/StructuredData/CMakeLists.txt
@@ -1,4 +1,2 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND StructuredData)
-
 add_subdirectory(DarwinLog)
 
diff --git a/lldb/source/Plugins/SymbolFile/CMakeLists.txt b/lldb/source/Plugins/SymbolFile/CMakeLists.txt
index 351652846420..106387b45ec1 100644
--- a/lldb/source/Plugins/SymbolFile/CMakeLists.txt
+++ b/lldb/source/Plugins/SymbolFile/CMakeLists.txt
@@ -1,10 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND SymbolFile)
-set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES ObjectFile)
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES
-  Language
-  TypeSystem  # part of a loop (TypeSystem<->SymbolFile).
-)
-
 add_subdirectory(Breakpad)
 add_subdirectory(CTF)
 add_subdirectory(DWARF)
diff --git a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
index 3b466f71dca5..3367022639ab 100644
--- a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
+++ b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND SymbolLocator)
-
 # Order matters here: the first symbol locator prevents further searching.
 # For DWARF binaries that are both stripped and split, the Default plugin
 # will return the stripped binary when asked for the ObjectFile, which then
diff --git a/lldb/source/Plugins/SymbolVendor/CMakeLists.txt b/lldb/source/Plugins/SymbolVendor/CMakeLists.txt
index a07330d7d8bc..1981706e06f4 100644
--- a/lldb/source/Plugins/SymbolVendor/CMakeLists.txt
+++ b/lldb/source/Plugins/SymbolVendor/CMakeLists.txt
@@ -1,6 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND SymbolVendor)
-set_property(DIRECTORY PROPERTY LLDB_ACCEPTABLE_PLUGIN_DEPENDENCIES ObjectFile)
-
 add_subdirectory(ELF)
 
 if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
diff --git a/lldb/source/Plugins/SystemRuntime/CMakeLists.txt b/lldb/source/Plugins/SystemRuntime/CMakeLists.txt
index 58fdc060bda3..0955a9eb74c2 100644
--- a/lldb/source/Plugins/SystemRuntime/CMakeLists.txt
+++ b/lldb/source/Plugins/SystemRuntime/CMakeLists.txt
@@ -1,4 +1 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND SystemRuntime)
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES TypeSystem)
-
 add_subdirectory(MacOSX)
diff --git a/lldb/source/Plugins/Trace/CMakeLists.txt b/lldb/source/Plugins/Trace/CMakeLists.txt
index 331b48f95f1a..955f88cec340 100644
--- a/lldb/source/Plugins/Trace/CMakeLists.txt
+++ b/lldb/source/Plugins/Trace/CMakeLists.txt
@@ -1,5 +1,3 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND Trace)
-
 option(LLDB_BUILD_INTEL_PT "Enable Building of Intel(R) Processor Trace Tool" OFF)
 
 add_subdirectory(common)
diff --git a/lldb/source/Plugins/TraceExporter/CMakeLists.txt b/lldb/source/Plugins/TraceExporter/CMakeLists.txt
index 7130548d7a9f..e0252ee36720 100644
--- a/lldb/source/Plugins/TraceExporter/CMakeLists.txt
+++ b/lldb/source/Plugins/TraceExporter/CMakeLists.txt
@@ -1,4 +1,2 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND TraceExporter)
-
 add_subdirectory(common)
 add_subdirectory(ctf)
diff --git a/lldb/source/Plugins/TypeSystem/CMakeLists.txt b/lldb/source/Plugins/TypeSystem/CMakeLists.txt
index 47e32ff176d8..17c40aee44cc 100644
--- a/lldb/source/Plugins/TypeSystem/CMakeLists.txt
+++ b/lldb/source/Plugins/TypeSystem/CMakeLists.txt
@@ -1,5 +1 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND TypeSystem)
-# This dependency is part of a loop (TypeSystem<->SymbolFile).
-set_property(DIRECTORY PROPERTY LLDB_TOLERATED_PLUGIN_DEPENDENCIES SymbolFile)
-
 add_subdirectory(Clang)
diff --git a/lldb/source/Plugins/UnwindAssembly/CMakeLists.txt b/lldb/source/Plugins/UnwindAssembly/CMakeLists.txt
index 1f505599c3ff..1723a0604580 100644
--- a/lldb/source/Plugins/UnwindAssembly/CMakeLists.txt
+++ b/lldb/source/Plugins/UnwindAssembly/CMakeLists.txt
@@ -1,4 +1,2 @@
-set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND UnwindAssembly)
-
 add_subdirectory(InstEmulation)
 add_subdirectory(x86)

From 02d2a1646a4aec3a7cc5fba9ae3cb54af41fa05d Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Mon, 23 Jun 2025 11:09:08 +0100
Subject: [PATCH 1259/1322] [Offload] Fix entry_points.td test (#145292)

This was broken as part of #144494 , and just needs an update to the
check lines.
---
 offload/test/tools/offload-tblgen/entry_points.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/test/tools/offload-tblgen/entry_points.td b/offload/test/tools/offload-tblgen/entry_points.td
index c69650c2cff1..c66d5b488b46 100644
--- a/offload/test/tools/offload-tblgen/entry_points.td
+++ b/offload/test/tools/offload-tblgen/entry_points.td
@@ -25,13 +25,13 @@ def : Function {
 // CHECK: ol_result_t{{.*}} FunctionA(
 
 // The entry point should print tracing output if enabled
-// CHECK: if (offloadConfig().TracingEnabled) {
+// CHECK: if (llvm::offload::isTracingEnabled()) {
 // CHECK-NEXT: "---> FunctionA";
 
 // CHECK: Result = llvmErrorToOffloadError(FunctionA_val(ParamA, ParamB));
 
 // Tracing should construct a param struct for printing
-// CHECK: if (offloadConfig().TracingEnabled) {
+// CHECK: if (llvm::offload::isTracingEnabled()) {
 // CHECK: function_a_params_t Params = {&ParamA, &ParamB};
 
 // CHECK: return Result;

From d9a99afbfc325b2ba7257d3f707bb143351e3e61 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <Nico.Vasilache@amd.com>
Date: Mon, 23 Jun 2025 12:23:24 +0200
Subject: [PATCH 1260/1322] =?UTF-8?q?[mlir][transform]=20Plumb=20a=20simpl?=
 =?UTF-8?q?ified=20form=20of=20AffineMin=20folding=20into=20t=E2=80=A6=20(?=
 =?UTF-8?q?#145170)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ransform.pad-tiling-interface

This revision introduces a simple variant of AffineMin folding in
makeComposedFoldedAffineApply and makes use of it in
transform.pad-tiling-interface. Since this version explicitly call
ValueBoundsInterface, it may be too expensive and is only activate
behind a flag.
It results in better foldings when mixing tiling and padding, including
with dynamic shapes.
---
 .../mlir/Dialect/Affine/IR/AffineOps.h        |  18 +-
 mlir/lib/Dialect/Affine/IR/AffineOps.cpp      | 135 ++++++++++----
 .../Linalg/Transforms/PadTilingInterface.cpp  |   5 +-
 ...m-op-pad-tiling-interface-multiple-of.mlir | 164 ++++++++++++++++--
 4 files changed, 273 insertions(+), 49 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
index 6fdb72c370e6..2091faa6b0b0 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h
@@ -410,9 +410,11 @@ void canonicalizeSetAndOperands(IntegerSet *set,
 /// other AffineApplyOps supplying those operands. The operands of the resulting
 /// AffineApplyOp do not change the length of  AffineApplyOp chains.
 AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map,
-                                      ArrayRef<OpFoldResult> operands);
+                                      ArrayRef<OpFoldResult> operands,
+                                      bool composeAffineMin = false);
 AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
-                                      ArrayRef<OpFoldResult> operands);
+                                      ArrayRef<OpFoldResult> operands,
+                                      bool composeAffineMin = false);
 
 /// Constructs an AffineApplyOp that applies `map` to `operands` after composing
 /// the map with the maps of any other AffineApplyOp supplying the operands,
@@ -421,16 +423,19 @@ AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
 /// map.
 OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
                                            AffineMap map,
-                                           ArrayRef<OpFoldResult> operands);
+                                           ArrayRef<OpFoldResult> operands,
+                                           bool composeAffineMin = false);
 /// Variant of `makeComposedFoldedAffineApply` that applies to an expression.
 OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
                                            AffineExpr expr,
-                                           ArrayRef<OpFoldResult> operands);
+                                           ArrayRef<OpFoldResult> operands,
+                                           bool composeAffineMin = false);
 /// Variant of `makeComposedFoldedAffineApply` suitable for multi-result maps.
 /// Note that this may create as many affine.apply operations as the map has
 /// results given that affine.apply must be single-result.
 SmallVector<OpFoldResult> makeComposedFoldedMultiResultAffineApply(
-    OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands);
+    OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands,
+    bool composeAffineMin = false);
 
 /// Returns an AffineMinOp obtained by composing `map` and `operands` with
 /// AffineApplyOps supplying those operands.
@@ -459,7 +464,8 @@ OpFoldResult makeComposedFoldedAffineMax(OpBuilder &b, Location loc,
 /// terminal symbol, i.e., a symbol defined at the top level or a block/function
 /// argument.
 void fullyComposeAffineMapAndOperands(AffineMap *map,
-                                      SmallVectorImpl<Value> *operands);
+                                      SmallVectorImpl<Value> *operands,
+                                      bool composeAffineMin = false);
 
 } // namespace affine
 } // namespace mlir
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 48770d4f4ff7..3b4d51d914d8 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -11,12 +11,14 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/IntegerSet.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Interfaces/ShapedOpInterfaces.h"
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
@@ -26,7 +28,9 @@
 #include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/MathExtras.h"
+#include <limits>
 #include <numeric>
 #include <optional>
 
@@ -1042,6 +1046,62 @@ simplifyMapWithOperands(AffineMap &map, ArrayRef<Value> operands) {
                        map.getContext());
 }
 
+/// Assuming `dimOrSym` is a quantity in `map` that is defined by `minOp`.
+/// Assuming that the quantity is of the form:
+///   `affine_min(f(x, y), symbolic_cst)`.
+/// This function checks that `0 < affine_min(f(x, y), symbolic_cst)` and
+/// proceeds with replacing the patterns:
+/// ```
+///   dimOrSym.ceildiv(symbolic_cst)
+///   (dimOrSym + symbolic_cst - 1).floordiv(symbolic_cst)
+/// ```
+/// by `1`.
+///
+/// Additionally, allows the caller to pass `affineMinKnownToBeNonNegative` to
+/// inject static information that may not be statically discoverable.
+///
+/// Warning: ValueBoundsConstraintSet::computeConstantBound is needed to check
+/// for the nonnegative case, if `affineMinKnownToBeNonNegative` is false.
+static LogicalResult replaceAffineMinBoundingBoxExpression(
+    AffineMinOp minOp, AffineExpr dimOrSym, AffineMap *map,
+    bool affineMinKnownToBeNonNegative = false) {
+  auto affineMinMap = minOp.getAffineMap();
+  if (!affineMinKnownToBeNonNegative) {
+    ValueRange values = minOp->getOperands();
+    for (unsigned i = 0, e = affineMinMap.getNumResults(); i < e; ++i) {
+      AffineMap row = affineMinMap.getSubMap(ArrayRef<unsigned>{i});
+      FailureOr<int64_t> lowerBound =
+          ValueBoundsConstraintSet::computeConstantBound(
+              presburger::BoundType::LB, {row, values},
+              /*stopCondition=*/nullptr,
+              /*closedUB=*/true);
+      if (failed(lowerBound) || lowerBound.value() <= 0)
+        return failure();
+    }
+  }
+
+  AffineMap initialMap = *map;
+  for (unsigned i = 0, e = affineMinMap.getNumResults(); i != e; ++i) {
+    auto m = affineMinMap.getSubMap(ArrayRef<unsigned>{i});
+    AffineExpr expr = m.getResult(0);
+    if (!expr.isSymbolicOrConstant())
+      continue;
+
+    DenseMap<AffineExpr, AffineExpr> repl;
+    // dimOrSym.ceilDiv(expr) -> 1
+    repl[dimOrSym.ceilDiv(expr)] = getAffineConstantExpr(1, minOp.getContext());
+    // (dimOrSym + expr - 1).floorDiv(expr) -> 1
+    repl[(dimOrSym + expr - 1).floorDiv(expr)] =
+        getAffineConstantExpr(1, minOp.getContext());
+    auto newMap = map->replace(repl);
+    if (newMap == *map)
+      continue;
+    *map = newMap;
+  }
+
+  return success(*map != initialMap);
+}
+
 /// Replace all occurrences of AffineExpr at position `pos` in `map` by the
 /// defining AffineApplyOp expression and operands.
 /// When `dimOrSymbolPosition < dims.size()`, AffineDimExpr@[pos] is replaced.
@@ -1052,10 +1112,13 @@ simplifyMapWithOperands(AffineMap &map, ArrayRef<Value> operands) {
 ///   2. `map` dim and symbols are gradually shifted to higher positions.
 ///   3. Old `dim` and `sym` entries are replaced by nullptr
 /// This avoids the need for any bookkeeping.
+/// If `replaceAffineMin` is set to true, additionally triggers more expensive
+/// replacements involving affine_min operations.
 static LogicalResult replaceDimOrSym(AffineMap *map,
                                      unsigned dimOrSymbolPosition,
                                      SmallVectorImpl<Value> &dims,
-                                     SmallVectorImpl<Value> &syms) {
+                                     SmallVectorImpl<Value> &syms,
+                                     bool replaceAffineMin) {
   MLIRContext *ctx = map->getContext();
   bool isDimReplacement = (dimOrSymbolPosition < dims.size());
   unsigned pos = isDimReplacement ? dimOrSymbolPosition
@@ -1064,6 +1127,13 @@ static LogicalResult replaceDimOrSym(AffineMap *map,
   if (!v)
     return failure();
 
+  auto minOp = v.getDefiningOp<AffineMinOp>();
+  if (minOp && replaceAffineMin) {
+    AffineExpr dimOrSym = isDimReplacement ? getAffineDimExpr(pos, ctx)
+                                           : getAffineSymbolExpr(pos, ctx);
+    return replaceAffineMinBoundingBoxExpression(minOp, dimOrSym, map);
+  }
+
   auto affineApply = v.getDefiningOp<AffineApplyOp>();
   if (!affineApply)
     return failure();
@@ -1101,7 +1171,8 @@ static LogicalResult replaceDimOrSym(AffineMap *map,
 /// iteratively. Perform canonicalization of map and operands as well as
 /// AffineMap simplification. `map` and `operands` are mutated in place.
 static void composeAffineMapAndOperands(AffineMap *map,
-                                        SmallVectorImpl<Value> *operands) {
+                                        SmallVectorImpl<Value> *operands,
+                                        bool composeAffineMin = false) {
   if (map->getNumResults() == 0) {
     canonicalizeMapAndOperands(map, operands);
     *map = simplifyAffineMap(*map);
@@ -1122,7 +1193,8 @@ static void composeAffineMapAndOperands(AffineMap *map,
   while (true) {
     bool changed = false;
     for (unsigned pos = 0; pos != dims.size() + syms.size(); ++pos)
-      if ((changed |= succeeded(replaceDimOrSym(map, pos, dims, syms))))
+      if ((changed |=
+           succeeded(replaceDimOrSym(map, pos, dims, syms, composeAffineMin))))
         break;
     if (!changed)
       break;
@@ -1163,38 +1235,41 @@ static void composeAffineMapAndOperands(AffineMap *map,
 }
 
 void mlir::affine::fullyComposeAffineMapAndOperands(
-    AffineMap *map, SmallVectorImpl<Value> *operands) {
+    AffineMap *map, SmallVectorImpl<Value> *operands, bool composeAffineMin) {
   while (llvm::any_of(*operands, [](Value v) {
     return isa_and_nonnull<AffineApplyOp>(v.getDefiningOp());
   })) {
-    composeAffineMapAndOperands(map, operands);
+    composeAffineMapAndOperands(map, operands, composeAffineMin);
   }
 }
 
 AffineApplyOp
 mlir::affine::makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map,
-                                      ArrayRef<OpFoldResult> operands) {
+                                      ArrayRef<OpFoldResult> operands,
+                                      bool composeAffineMin) {
   SmallVector<Value> valueOperands;
   map = foldAttributesIntoMap(b, map, operands, valueOperands);
-  composeAffineMapAndOperands(&map, &valueOperands);
+  composeAffineMapAndOperands(&map, &valueOperands, composeAffineMin);
   assert(map);
   return b.create<AffineApplyOp>(loc, map, valueOperands);
 }
 
 AffineApplyOp
 mlir::affine::makeComposedAffineApply(OpBuilder &b, Location loc, AffineExpr e,
-                                      ArrayRef<OpFoldResult> operands) {
+                                      ArrayRef<OpFoldResult> operands,
+                                      bool composeAffineMin) {
   return makeComposedAffineApply(
       b, loc,
       AffineMap::inferFromExprList(ArrayRef<AffineExpr>{e}, b.getContext())
           .front(),
-      operands);
+      operands, composeAffineMin);
 }
 
 /// Composes the given affine map with the given list of operands, pulling in
 /// the maps from any affine.apply operations that supply the operands.
 static void composeMultiResultAffineMap(AffineMap &map,
-                                        SmallVectorImpl<Value> &operands) {
+                                        SmallVectorImpl<Value> &operands,
+                                        bool composeAffineMin = false) {
   // Compose and canonicalize each expression in the map individually because
   // composition only applies to single-result maps, collecting potentially
   // duplicate operands in a single list with shifted dimensions and symbols.
@@ -1203,7 +1278,8 @@ static void composeMultiResultAffineMap(AffineMap &map,
   for (unsigned i : llvm::seq<unsigned>(0, map.getNumResults())) {
     SmallVector<Value> submapOperands(operands.begin(), operands.end());
     AffineMap submap = map.getSubMap({i});
-    fullyComposeAffineMapAndOperands(&submap, &submapOperands);
+    fullyComposeAffineMapAndOperands(&submap, &submapOperands,
+                                     composeAffineMin);
     canonicalizeMapAndOperands(&submap, &submapOperands);
     unsigned numNewDims = submap.getNumDims();
     submap = submap.shiftDims(dims.size()).shiftSymbols(symbols.size());
@@ -1221,10 +1297,9 @@ static void composeMultiResultAffineMap(AffineMap &map,
   canonicalizeMapAndOperands(&map, &operands);
 }
 
-OpFoldResult
-mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
-                                            AffineMap map,
-                                            ArrayRef<OpFoldResult> operands) {
+OpFoldResult mlir::affine::makeComposedFoldedAffineApply(
+    OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands,
+    bool composeAffineMin) {
   assert(map.getNumResults() == 1 && "building affine.apply with !=1 result");
 
   // Create new builder without a listener, so that no notification is
@@ -1236,7 +1311,7 @@ mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
 
   // Create op.
   AffineApplyOp applyOp =
-      makeComposedAffineApply(newBuilder, loc, map, operands);
+      makeComposedAffineApply(newBuilder, loc, map, operands, composeAffineMin);
 
   // Get constant operands.
   SmallVector<Attribute> constOperands(applyOp->getNumOperands());
@@ -1256,26 +1331,25 @@ mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
   return llvm::getSingleElement(foldResults);
 }
 
-OpFoldResult
-mlir::affine::makeComposedFoldedAffineApply(OpBuilder &b, Location loc,
-                                            AffineExpr expr,
-                                            ArrayRef<OpFoldResult> operands) {
+OpFoldResult mlir::affine::makeComposedFoldedAffineApply(
+    OpBuilder &b, Location loc, AffineExpr expr,
+    ArrayRef<OpFoldResult> operands, bool composeAffineMin) {
   return makeComposedFoldedAffineApply(
       b, loc,
       AffineMap::inferFromExprList(ArrayRef<AffineExpr>{expr}, b.getContext())
           .front(),
-      operands);
+      operands, composeAffineMin);
 }
 
 SmallVector<OpFoldResult>
 mlir::affine::makeComposedFoldedMultiResultAffineApply(
-    OpBuilder &b, Location loc, AffineMap map,
-    ArrayRef<OpFoldResult> operands) {
-  return llvm::map_to_vector(llvm::seq<unsigned>(0, map.getNumResults()),
-                             [&](unsigned i) {
-                               return makeComposedFoldedAffineApply(
-                                   b, loc, map.getSubMap({i}), operands);
-                             });
+    OpBuilder &b, Location loc, AffineMap map, ArrayRef<OpFoldResult> operands,
+    bool composeAffineMin) {
+  return llvm::map_to_vector(
+      llvm::seq<unsigned>(0, map.getNumResults()), [&](unsigned i) {
+        return makeComposedFoldedAffineApply(b, loc, map.getSubMap({i}),
+                                             operands, composeAffineMin);
+      });
 }
 
 template <typename OpTy>
@@ -3024,7 +3098,8 @@ void AffineIfOp::build(OpBuilder &builder, OperationState &result,
 /// `set` by composing the maps of such affine.apply ops with the integer
 /// set constraints.
 static void composeSetAndOperands(IntegerSet &set,
-                                  SmallVectorImpl<Value> &operands) {
+                                  SmallVectorImpl<Value> &operands,
+                                  bool composeAffineMin = false) {
   // We will simply reuse the API of the map composition by viewing the LHSs of
   // the equalities and inequalities of `set` as the affine exprs of an affine
   // map. Convert to equivalent map, compose, and convert back to set.
@@ -3035,7 +3110,7 @@ static void composeSetAndOperands(IntegerSet &set,
                     [](Value v) { return v.getDefiningOp<AffineApplyOp>(); }))
     return;
 
-  composeAffineMapAndOperands(&map, &operands);
+  composeAffineMapAndOperands(&map, &operands, composeAffineMin);
   set = IntegerSet::get(map.getNumDims(), map.getNumSymbols(), map.getResults(),
                         set.getEqFlags());
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
index 5383ae48aeb3..42dac0776bac 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
@@ -84,7 +84,7 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
       getDimsToSize(rewriter, indexingSizes, options);
 
   // For each dimension in the operand's shape, iterate over indexingSizes and
-  // add
+  // add the various term contributions.
   for (const auto &enResults : enumerate(indexingMap.getResults())) {
     int64_t resultIndex = enResults.index();
     AffineMap partialIndexingMap = indexingMap.getSubMap(
@@ -122,7 +122,8 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
         AffineMap composedMap = projectedMap.compose(ceilMap);
         OpFoldResult paddingDimOfr = affine::makeComposedFoldedAffineApply(
             rewriter, loc, composedMap,
-            {indexingSizes[paddingDim], paddingSize});
+            {indexingSizes[paddingDim], paddingSize},
+            /*composeAffineMin=*/true);
         terms.push_back(paddingDimOfr);
       } else {
         // Otherwise just set to paddingSize.
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
index 5ac35c14be3f..845fe2519301 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
@@ -19,7 +19,7 @@ func.func @pad_lhs(
   //      CHECK:     : tensor<?x25xf32> to tensor<?x25xf32>
 
   //      CHECK:   linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<?x12xf32>, tensor<12x25xf32>) outs(%{{.*}} : tensor<?x25xf32>) -> tensor<?x25xf32>
-  
+
   //      CHECK:   tensor.extract_slice %{{.*}}[0, 0] [%{{.*}}, 25] [1, 1]
   //      CHECK:     : tensor<?x25xf32> to tensor<?x25xf32>
   //      CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [%{{.*}}, 25] [1, 1]
@@ -29,8 +29,8 @@ func.func @pad_lhs(
 }
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %module_op
       : (!transform.any_op) -> !transform.any_op
 
     // Tile to 5 then pad to 8 (supposedly to better hit vector ops).
@@ -71,13 +71,13 @@ module {
     return %0 : tensor<7x11x12xf32>
   }
   module attributes {transform.with_named_sequence} {
-    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-      %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+      %0 = transform.structured.match ops{["linalg.generic"]} in %module_op : (!transform.any_op) -> !transform.any_op
       %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 5] pad_to_multiple_of {
-        padding_dimensions = [0, 2], 
+        padding_dimensions = [0, 2],
         padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
       } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-      transform.yield 
+      transform.yield
     }
   }
 }
@@ -126,13 +126,155 @@ module {
     return %0 : tensor<?x11x?xf32>
   }
   module attributes {transform.with_named_sequence} {
-    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-      %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+      %0 = transform.structured.match ops{["linalg.generic"]} in %module_op : (!transform.any_op) -> !transform.any_op
       %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 5] pad_to_multiple_of {
-        padding_dimensions = [0, 2], 
+        padding_dimensions = [0, 2],
         padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
       } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-      transform.yield 
+      transform.yield
     }
   }
 }
+
+// -----
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0] -> (-s0 + (s0 ceildiv 16) * 16)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0, s1] -> (-s1 + (s0 ceildiv 16) * 16)>
+// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> ((s0 ceildiv 16) * 16)>
+//     CHECK-LABEL: pad_lhs
+func.func @pad_lhs(
+  %arg0: tensor<24x?xf32>, %arg1: tensor<?x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  //      CHECK: %[[D0_0:.*]] = tensor.dim
+  //      CHECK: %[[H0:.*]] = affine.apply #[[$MAP0]]()[%[[D0_0]]]
+  //      CHECK: tensor.pad %{{.*}} low[0, 0] high[0, %[[H0]]]
+  //      CHECK:   : tensor<24x?xf32> to tensor<24x?xf32>
+
+  //      CHECK: %[[D0_2:.*]] = tensor.dim
+  //      CHECK: %[[H1:.*]] = affine.apply #[[$MAP1]]()[%[[D0_0]], %[[D0_2]]]
+  //      CHECK: tensor.pad %{{.*}} low[0, 0] high[%[[H1]], 0]
+  //      CHECK:   : tensor<?x25xf32> to tensor<?x25xf32>
+  //      CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>)
+
+  //      CHECK:    linalg.matmul ins(%{{.*}}, %{{.*}}: tensor<8x16xf32>, tensor<16x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
+
+  //      CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [8, 25] [1, 1]
+  // CHECK-SAME:     : tensor<8x25xf32> into tensor<24x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x?xf32>, tensor<?x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %module_op
+      : (!transform.any_op) -> !transform.any_op
+
+    // Pad then tile should produce static shapes.
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %matmul to padding_sizes [8, 16] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0, 2]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    %m, %l0, %l1 = transform.structured.tile_using_for %matmul_padded tile_sizes [8, 0, 16]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    %func = transform.structured.match ops{["func.func"]} in %module_op
+      : (!transform.any_op) -> !transform.any_op
+    %func2 = transform.apply_registered_pass "resolve-shaped-type-result-dims" to %func
+      : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func2 {
+      transform.apply_patterns.canonicalization
+    } {apply_cse} : !transform.any_op
+    %minmax = transform.structured.match ops{["affine.min", "affine.max"]} in %module_op
+      : (!transform.any_op) -> !transform.any_op
+    transform.affine.simplify_min_max_affine_ops %minmax : !transform.any_op
+    transform.apply_patterns to %func2 {
+      transform.apply_patterns.canonicalization
+    } {apply_cse} : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0) -> (-d0 + 16)>
+
+//     CHECK-LABEL: pad_lhs
+func.func @pad_lhs(
+  %arg0: tensor<24x?xf32>, %arg1: tensor<?x25xf32>, %arg2: tensor<24x25xf32>)
+     -> tensor<24x25xf32>
+{
+  //      CHECK: scf.for %{{.*}} -> (tensor<24x25xf32>)
+  //      CHECK:   %[[MIN:.*]] = affine.min #[[$MAP0]](%{{.*}})
+  //      CHECK:   %[[H0:.*]] = affine.apply #[[$MAP1]](%[[MIN]])
+  //      CHECK:   tensor.pad %{{.*}} low[0, 0] high[0, %[[H0]]]
+  //      CHECK:     : tensor<8x?xf32> to tensor<8x16xf32>
+
+  //      CHECK:   %[[H1:.*]] = affine.apply #[[$MAP1]](%[[MIN]])
+  //      CHECK:   tensor.pad %{{.*}} low[0, 0] high[%[[H1]], 0]
+  //      CHECK:     : tensor<?x25xf32> to tensor<16x25xf32>
+
+  //      CHECK:   linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<8x16xf32>, tensor<16x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
+
+  //      CHECK:   tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 0] [8, 25] [1, 1]
+  // CHECK-SAME:     : tensor<8x25xf32> into tensor<24x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x?xf32>, tensor<?x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
+  func.return %0 : tensor<24x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %module_op
+      : (!transform.any_op) -> !transform.any_op
+
+    // Tile then pad should produce static shapes.
+    %m, %l0, %l1 = transform.structured.tile_using_for %matmul tile_sizes [8, 0, 16]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 16] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0, 2]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0) -> (-d0 + 20, 8)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
+// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (-d0 + 8)>
+// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0) -> (-d0 + 16)>
+
+//     CHECK-LABEL: pad_lhs
+func.func @pad_lhs(
+  %arg0: tensor<20x?xf32>, %arg1: tensor<?x25xf32>, %arg2: tensor<20x25xf32>)
+     -> tensor<20x25xf32>
+{
+  //      CHECK:   linalg.matmul ins(%{{.*}}, %{{.*}} : tensor<8x16xf32>, tensor<16x25xf32>) outs(%{{.*}} : tensor<8x25xf32>) -> tensor<8x25xf32>
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<20x?xf32>, tensor<?x25xf32>) outs(%arg2 : tensor<20x25xf32>) -> tensor<20x25xf32>
+  func.return %0 : tensor<20x25xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %module_op
+      : (!transform.any_op) -> !transform.any_op
+
+    // Tile then pad should produce static shapes.
+    %m, %l0, %l1 = transform.structured.tile_using_for %matmul tile_sizes [8, 0, 16]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 16] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0, 2]
+    } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    transform.yield
+  }
+}
+

From 7360ed0159a42f2071d16e51c8575faaae88d91b Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <Nico.Vasilache@amd.com>
Date: Mon, 23 Jun 2025 12:29:22 +0200
Subject: [PATCH 1261/1322] [mlir][transform] Drop redundant padding_dimensions
 spec from pad_tiling_interface (#145257)

This revision aligns padding specification in pad_tiling_interface to
that of tiling specification.
Dimensions that should be skipped are specified by "padding by 0".
Trailing dimensions that are ignored are automatically completed to "pad
to 0".
---
 .../Linalg/TransformOps/LinalgTransformOps.td | 25 +++++++----
 .../TransformOps/LinalgTransformOps.cpp       | 21 +--------
 .../Linalg/Transforms/PadTilingInterface.cpp  | 45 +++++++++----------
 ...m-op-pad-tiling-interface-multiple-of.mlir | 36 +++++++--------
 .../transform-op-pad-tiling-interface.mlir    | 12 ++---
 5 files changed, 58 insertions(+), 81 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index cf3f2b70580d..c5650470fdc8 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1195,17 +1195,29 @@ def PadTilingInterfaceOp : Op<Transform_Dialect, "structured.pad_tiling_interfac
      TransformOpInterface,
      ReportTrackingListenerFailuresOpTrait]> {
   let description = [{
-    Pads the operations pointed to by the target handle using the options
-    provided as operation attributes. The operation returns a handle to the
-    padded operation and to the padding operation ("tensor.pad").
+    Pads the **iteration domain** of the operations pointed to by the target
+    handle using the options provided as operation attributes. Padding the
+    iteration domain induces a padding of the operands that is consistent
+    across the op semantics and, unlike for simple elementwise ops, may not be
+    trivially deducible or specifiable on operands only (e.g. convolutions).
+    
+    The specification of `padding_sizes` follows that of `tile_sizes` during
+    tiling: the value "0" on a particular iterator encode "no padding". Like in
+    the case of tiling, an automatic completion by 0 to the operation rank
+    occurs.
+    
+    This transformation returns a handle to the padded operation and to the
+    padding operation ("tensor.pad").
 
     TODO: in the future this should be moved out of a specific Linalg
     implementation file and into a more general "Structured" file.
 
     #### Return modes
 
-    This operation ignores non-Linalg ops and drops them in the return.
-    In the future, this operation will support all TilingInterfaceOps.
+    This operation ignores non-IndexingMapOpInterface ops and drops them in the
+    return. In the future, this operation will support all TilingInterfaceOps
+    for which the contract between iteration domain and operands can be 
+    reified.    
 
     This operation may produce a definite failure if the padding fails for any
     reason.
@@ -1219,7 +1231,6 @@ def PadTilingInterfaceOp : Op<Transform_Dialect, "structured.pad_tiling_interfac
   let arguments =
     (ins TransformHandleTypeInterface:$target,
          DefaultValuedAttr<ArrayAttr, "{}">:$padding_values,
-         DefaultValuedAttr<I64ArrayAttr, "{}">:$padding_dimensions,
          Variadic<TransformAnyParamTypeOrAnyHandle>:$padding_sizes,
          DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:
             $static_padding_sizes,
@@ -1245,11 +1256,9 @@ def PadTilingInterfaceOp : Op<Transform_Dialect, "structured.pad_tiling_interfac
     // add/mul ring at the moment.
     // TODO: support other operations (e.g. min, max etc).
     OpBuilder<(ins "Value":$target,
-                   "ArrayRef<int64_t>":$paddingDimensions,
                    CArg<"ArrayRef<int64_t>", "{}">:$staticPaddingSizes,
                    CArg<"bool", "false">:$padToMultipleOf)>,
     OpBuilder<(ins "Value":$target,
-                   "ArrayRef<int64_t>":$paddingDimensions,
                    "ArrayRef<OpFoldResult>":$mixedPadPaddingSizes,
                    CArg<"bool", "false">:$usePrescribedTensorShapes)>
   ];
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 5d55adbf46f3..d9a0ba02f4fe 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -2163,7 +2163,6 @@ LogicalResult transform::PadOp::verify() {
 void transform::PadTilingInterfaceOp::build(OpBuilder &b,
                                             OperationState &result,
                                             Value target,
-                                            ArrayRef<int64_t> paddingDimensions,
                                             ArrayRef<int64_t> paddingSizes,
                                             bool padToMultipleOf) {
   auto resultType = transform::AnyOpType::get(b.getContext());
@@ -2172,7 +2171,6 @@ void transform::PadTilingInterfaceOp::build(OpBuilder &b,
                /*types=*/TypeRange{resultType, resultType},
                /*target=*/target,
                /*paddingValues=*/ArrayAttr(), // let inference handle this
-               /*paddingDimensions=*/b.getI64ArrayAttr(paddingDimensions),
                /*paddingSizes=*/ValueRange{},
                /*paddingSizes=*/
                (paddingSizes.empty() ? DenseI64ArrayAttr()
@@ -2183,7 +2181,6 @@ void transform::PadTilingInterfaceOp::build(OpBuilder &b,
 
 void transform::PadTilingInterfaceOp::build(
     OpBuilder &b, OperationState &result, Value target,
-    ArrayRef<int64_t> paddingDimensions,
     ArrayRef<OpFoldResult> mixedPaddingSizes, bool padToMultipleOf) {
   auto resultType = transform::AnyOpType::get(b.getContext());
   SmallVector<int64_t> staticPaddingSizes;
@@ -2195,7 +2192,6 @@ void transform::PadTilingInterfaceOp::build(
                /*types=*/TypeRange{resultType, resultType},
                /*target=*/target,
                /*paddingValues=*/ArrayAttr(), // let inference handle this
-               /*paddingDimensions=*/b.getI64ArrayAttr(paddingDimensions),
                /*paddingSizes=*/dynamicPaddingSizes,
                /*paddingSizes=*/staticPaddingSizes,
                /*usePrescribedTensorShapes=*/padToMultipleOf);
@@ -2277,8 +2273,6 @@ transform::PadTilingInterfaceOp::apply(transform::TransformRewriter &rewriter,
     TilingInterface paddedOp;
     PadTilingInterfaceOptions options;
     options.setPaddingValues(paddingValues)
-        .setPaddingDimensions(
-            extractFromIntegerArrayAttr<int64_t>(getPaddingDimensions()))
         .setPaddingSizes(getMixedPaddingSizes())
         .setPadToMultipleOf(getPadToMultipleOf());
 
@@ -2303,20 +2297,7 @@ transform::PadTilingInterfaceOp::apply(transform::TransformRewriter &rewriter,
   return DiagnosedSilenceableFailure::success();
 }
 
-LogicalResult transform::PadTilingInterfaceOp::verify() {
-  SmallVector<int64_t> paddingDimensions =
-      extractFromIntegerArrayAttr<int64_t>(getPaddingDimensions());
-  if (any_of(paddingDimensions,
-             [](int64_t paddingDimension) { return paddingDimension < 0; })) {
-    return emitOpError() << "expects padding_dimensions to contain positive "
-                            "integers, found "
-                         << getPaddingDimensions();
-  }
-  if (getMixedPaddingSizes().size() != paddingDimensions.size()) {
-    return emitOpError() << "expects as many multiples as padding_dimensions";
-  }
-  return success();
-}
+LogicalResult transform::PadTilingInterfaceOp::verify() { return success(); }
 
 //===---------------------------------------------------------------------===//
 // HoistPadOp
diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
index 42dac0776bac..eda3373b4d63 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
@@ -32,29 +32,27 @@ using namespace mlir::tensor;
 #define DBGSNL() (llvm::dbgs() << "\n")
 
 /// Form a "full-rank" padding specification so that the application is easy.
-static llvm::SmallDenseMap<int64_t, OpFoldResult>
-getDimsToSize(Builder &b, ArrayRef<OpFoldResult> indexingSizes,
-              const PadTilingInterfaceOptions &options) {
-  llvm::SmallDenseMap<int64_t, OpFoldResult> dimsToSize;
-  for (const auto &[paddingDim, paddingSize] :
-       llvm::zip_equal(options.paddingDimensions, options.paddingSizes)) {
-    dimsToSize[paddingDim] = paddingSize;
-  }
+static SmallVector<OpFoldResult>
+getFullRankPaddingSizes(Builder &b, ArrayRef<OpFoldResult> indexingSizes,
+                        const PadTilingInterfaceOptions &options) {
+  SmallVector<OpFoldResult> paddingSizes;
   // Complete the padding specification to specify all dimensions.
-  for (int64_t idx = 0, e = indexingSizes.size(); idx != e; ++idx) {
-    if (dimsToSize.find(idx) != dimsToSize.end())
-      continue;
-    // If a dimension is not specified, either complete with:
+  for (size_t idx = 0, e = indexingSizes.size(); idx != e; ++idx) {
+    // Complete to zero if needed.
+    paddingSizes.push_back(options.paddingSizes.size() > idx
+                               ? options.paddingSizes[idx]
+                               : b.getIndexAttr(0));
+    // If a dimension is zero (either specified or completed), replace by:
     //   - 1 if we are padding to the next multiple of.
     //   - indexingSizes[idx] otherwise
-    dimsToSize[idx] =
-        options.padToMultipleOf ? b.getIndexAttr(1) : indexingSizes[idx];
-  }
-  for (int64_t idx = 0, e = indexingSizes.size(); idx != e; ++idx) {
-    LLVM_DEBUG(DBGS() << "----idx: " << idx << " : " << dimsToSize[idx]
+    if (isZeroInteger(paddingSizes[idx])) {
+      paddingSizes[idx] =
+          options.padToMultipleOf ? b.getIndexAttr(1) : indexingSizes[idx];
+    }
+    LLVM_DEBUG(DBGS() << "----idx: " << idx << " : " << paddingSizes[idx]
                       << "\n");
   }
-  return dimsToSize;
+  return paddingSizes;
 }
 
 /// Compute the padded shape of the given value `v` of `RankedTensorType` given
@@ -80,8 +78,8 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
          "rank");
 
   // "Full-rank" padding specification.
-  llvm::SmallDenseMap<int64_t, OpFoldResult> dimsToSize =
-      getDimsToSize(rewriter, indexingSizes, options);
+  SmallVector<OpFoldResult> paddingSizes =
+      getFullRankPaddingSizes(rewriter, indexingSizes, options);
 
   // For each dimension in the operand's shape, iterate over indexingSizes and
   // add the various term contributions.
@@ -97,7 +95,9 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
     // Find all padding dimensions that contribute to this operand dimension
     // and compute the padded term contribution to the final padded shape.
     SmallVector<OpFoldResult> terms;
-    for (const auto &[paddingDim, paddingSize] : dimsToSize) {
+    for (size_t paddingDim = 0, e = paddingSizes.size(); paddingDim != e;
+         ++paddingDim) {
+      OpFoldResult paddingSize = paddingSizes[paddingDim];
       LLVM_DEBUG(DBGS() << "------try apply padding of dim: " << paddingDim
                         << " to: " << paddingSize << "\n");
       if (!enResults.value().isFunctionOfDim(paddingDim))
@@ -224,9 +224,6 @@ linalg::rewriteAsPaddedOp(RewriterBase &rewriter, TilingInterface opToPad,
                           SmallVector<tensor::PadOp> &padOps,
                           PadSizeComputationFunction computePaddingSizeFun) {
   LLVM_DEBUG(DBGS() << "Start rewriteAsPaddedOp : " << opToPad << "\n");
-  assert(constOptions.paddingSizes.size() ==
-             constOptions.paddingDimensions.size() &&
-         "invalid number of elements in padToMultipleOf");
 
   Location loc = opToPad.getLoc();
   PadTilingInterfaceOptions options(constOptions);
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
index 845fe2519301..78619b682673 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface-multiple-of.mlir
@@ -36,8 +36,7 @@ module attributes {transform.with_named_sequence} {
     // Tile to 5 then pad to 8 (supposedly to better hit vector ops).
     %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     %matmul_padded, %_ = transform.structured.pad_tiling_interface %matmul_l1 to padding_sizes [8] pad_to_multiple_of {
-      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-      padding_dimensions=[0]
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32]
     } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     transform.yield
@@ -71,11 +70,10 @@ module {
     return %0 : tensor<7x11x12xf32>
   }
   module attributes {transform.with_named_sequence} {
-    transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
-      %0 = transform.structured.match ops{["linalg.generic"]} in %module_op : (!transform.any_op) -> !transform.any_op
-      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 5] pad_to_multiple_of {
-        padding_dimensions = [0, 2],
-        padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
+    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+      %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 0, 5] pad_to_multiple_of {
+        padding_values = [0.0 : f32, 0.0 : f32, 0.0 : f32]
       } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
     }
@@ -126,11 +124,10 @@ module {
     return %0 : tensor<?x11x?xf32>
   }
   module attributes {transform.with_named_sequence} {
-    transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
-      %0 = transform.structured.match ops{["linalg.generic"]} in %module_op : (!transform.any_op) -> !transform.any_op
-      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 5] pad_to_multiple_of {
-        padding_dimensions = [0, 2],
-        padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
+    transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+      %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [3, 0, 5] pad_to_multiple_of {
+        padding_values = [0.0 : f32, 0.0 : f32, 0.0 : f32]
       } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield
     }
@@ -172,9 +169,8 @@ module attributes {transform.with_named_sequence} {
       : (!transform.any_op) -> !transform.any_op
 
     // Pad then tile should produce static shapes.
-    %matmul_padded, %_ = transform.structured.pad_tiling_interface %matmul to padding_sizes [8, 16] pad_to_multiple_of {
-      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-      padding_dimensions=[0, 2]
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %matmul to padding_sizes [8, 0, 16] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32]
     } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %m, %l0, %l1 = transform.structured.tile_using_for %matmul_padded tile_sizes [8, 0, 16]
@@ -234,9 +230,8 @@ module attributes {transform.with_named_sequence} {
     %m, %l0, %l1 = transform.structured.tile_using_for %matmul tile_sizes [8, 0, 16]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
-    %matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 16] pad_to_multiple_of {
-      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-      padding_dimensions=[0, 2]
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 0, 16] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32]
     } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     transform.yield
@@ -269,9 +264,8 @@ module attributes {transform.with_named_sequence} {
     %m, %l0, %l1 = transform.structured.tile_using_for %matmul tile_sizes [8, 0, 16]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
-    %matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 16] pad_to_multiple_of {
-      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-      padding_dimensions=[0, 2]
+    %matmul_padded, %_ = transform.structured.pad_tiling_interface %m to padding_sizes [8, 0, 16] pad_to_multiple_of {
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32]
     } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     transform.yield
diff --git a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
index f0a410fa4015..26c03ed309c0 100644
--- a/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-pad-tiling-interface.mlir
@@ -18,8 +18,7 @@ module attributes {transform.with_named_sequence} {
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     %fill_padded, %_ = transform.structured.pad_tiling_interface %fill_l1 to padding_sizes [8] {
-      padding_values=[0.0 : f32, 0.0 : f32],
-      padding_dimensions=[0]
+      padding_values=[0.0 : f32, 0.0 : f32]
     } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     transform.yield
@@ -55,8 +54,7 @@ module attributes {transform.with_named_sequence} {
     // Tile to 5 then pad to 8 (supposedly to better hit vector ops).
     %matmul_l1, %loops_l1 = transform.structured.tile_using_for %matmul tile_sizes [5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     %matmul_padded, %_ = transform.structured.pad_tiling_interface %matmul_l1 to padding_sizes [8] {
-      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-      padding_dimensions=[0]
+      padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32]
     } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     transform.yield
@@ -91,8 +89,7 @@ module {
   module attributes {transform.with_named_sequence} {
     transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [8, 14] {
-        padding_dimensions = [0, 2], 
+      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [8, 0, 14] {
         padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
       } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield 
@@ -147,8 +144,7 @@ module {
   module attributes {transform.with_named_sequence} {
     transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
       %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [8, 14] {
-        padding_dimensions = [0, 2], 
+      %padded, %pad = transform.structured.pad_tiling_interface %0 to padding_sizes [8, 0, 14] {
         padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]
       } : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.yield 

From e56384ff540e68f9d0500fa27a95354c0730e37b Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 23 Jun 2025 11:42:32 +0200
Subject: [PATCH 1262/1322] [IRTranslator] Remove unnecessary isIntrinsic()
 check (NFC)

Directly call getIntrinsicID(), there is no need to check for
isIntrinsic() first.
---
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index fe5dcd14d880..5d7e07003f10 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2777,11 +2777,8 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
 
   diagnoseDontCall(CI);
 
-  Intrinsic::ID ID = Intrinsic::not_intrinsic;
-  if (F && F->isIntrinsic())
-    ID = F->getIntrinsicID();
-
-  if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic)
+  Intrinsic::ID ID = F ? F->getIntrinsicID() : Intrinsic::not_intrinsic;
+  if (!F || ID == Intrinsic::not_intrinsic)
     return translateCallBase(CI, MIRBuilder);
 
   assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic");

From cfcb7888c71c7b5468e3fc55b6de0804403dc3fe Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 23 Jun 2025 12:44:14 +0200
Subject: [PATCH 1263/1322] [EarlyCSE] Fix dead store elimination for unwinding
 readnone calls (#145287)

EarlyCSE already resets LastStore when it hits an potentially unwinding
instruction, as the memory state may be observed by the caller after the
unwind.

There also was a test specifically making sure that this works even for
unwinding readnone calls -- however, the call in that test did not
participate in EarlyCSE in the first place, because it returns void
(relaxing that is how I got here), so it was not actually testing the
right thing.

Move the check for unwinding instructions earlier, so it also handles
the readnone case.
---
 llvm/lib/Transforms/Scalar/EarlyCSE.cpp       | 10 ++++++---
 llvm/test/Transforms/EarlyCSE/basic.ll        | 21 ++++++++++++++++---
 .../Transforms/EarlyCSE/readnone-mayunwind.ll | 18 ----------------
 3 files changed, 25 insertions(+), 24 deletions(-)
 delete mode 100644 llvm/test/Transforms/EarlyCSE/readnone-mayunwind.ll

diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 5c62a2cf526e..e1e283f171d3 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -1525,6 +1525,11 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
       }
     }
 
+    // Make sure stores prior to a potential unwind are not removed, as the
+    // caller may read the memory.
+    if (Inst.mayThrow())
+      LastStore = nullptr;
+
     // If this is a simple instruction that we can value number, process it.
     if (SimpleValue::canHandle(&Inst)) {
       if ([[maybe_unused]] auto *CI = dyn_cast<ConstrainedFPIntrinsic>(&Inst)) {
@@ -1616,13 +1621,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
       continue;
     }
 
-    // If this instruction may read from memory or throw (and potentially read
-    // from memory in the exception handler), forget LastStore.  Load/store
+    // If this instruction may read from memory, forget LastStore.  Load/store
     // intrinsics will indicate both a read and a write to memory.  The target
     // may override this (e.g. so that a store intrinsic does not read from
     // memory, and thus will be treated the same as a regular store for
     // commoning purposes).
-    if ((Inst.mayReadFromMemory() || Inst.mayThrow()) &&
+    if (Inst.mayReadFromMemory() &&
         !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
       LastStore = nullptr;
 
diff --git a/llvm/test/Transforms/EarlyCSE/basic.ll b/llvm/test/Transforms/EarlyCSE/basic.ll
index c6b746026c94..2c6b2a961392 100644
--- a/llvm/test/Transforms/EarlyCSE/basic.ll
+++ b/llvm/test/Transforms/EarlyCSE/basic.ll
@@ -137,7 +137,7 @@ declare i32 @func(ptr%P) readonly
 ;; Simple call CSE'ing.
 define i32 @test5(ptr%P) {
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT:    [[V1:%.*]] = call i32 @func(ptr [[P:%.*]]), !prof !0
+; CHECK-NEXT:    [[V1:%.*]] = call i32 @func(ptr [[P:%.*]]), !prof [[PROF0:![0-9]+]]
 ; CHECK-NEXT:    ret i32 0
 ;
   %V1 = call i32 @func(ptr %P), !prof !0
@@ -212,10 +212,25 @@ define i32 @test9(ptr%P) {
   ret i32 %V1
 }
 
-;; Trivial DSE can be performed across a readnone call.
+;; Trivial DSE can be performed across a readnone nounwind call.
 define i32 @test10(ptr%P) {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT:    [[V1:%.*]] = call i32 @func(ptr [[P:%.*]]) #[[ATTR2]]
+; CHECK-NEXT:    [[V1:%.*]] = call i32 @func(ptr [[P:%.*]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    store i32 5, ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[V1]]
+;
+  store i32 4, ptr %P
+  %V1 = call i32 @func(ptr %P) readnone nounwind
+  store i32 5, ptr %P
+  ret i32 %V1
+}
+
+; Trivial DSE can't be performed across a potentially unwinding readnone
+; call, as the caller may read the memory on unwind.
+define i32 @test_readnone_missing_nounwind(ptr %P) {
+; CHECK-LABEL: @test_readnone_missing_nounwind(
+; CHECK-NEXT:    store i32 4, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = call i32 @func(ptr [[P]]) #[[ATTR2]]
 ; CHECK-NEXT:    store i32 5, ptr [[P]], align 4
 ; CHECK-NEXT:    ret i32 [[V1]]
 ;
diff --git a/llvm/test/Transforms/EarlyCSE/readnone-mayunwind.ll b/llvm/test/Transforms/EarlyCSE/readnone-mayunwind.ll
deleted file mode 100644
index e4d31f31d9ff..000000000000
--- a/llvm/test/Transforms/EarlyCSE/readnone-mayunwind.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=early-cse -earlycse-debug-hash < %s | FileCheck %s
-
-declare void @readnone_may_unwind() readnone
-
-define void @f(ptr %ptr) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT:    store i32 100, ptr [[PTR:%.*]], align 4
-; CHECK-NEXT:    call void @readnone_may_unwind()
-; CHECK-NEXT:    store i32 200, ptr [[PTR]], align 4
-; CHECK-NEXT:    ret void
-;
-
-  store i32 100, ptr %ptr
-  call void @readnone_may_unwind()
-  store i32 200, ptr %ptr
-  ret void
-}

From 1d907c28b64b870e5075b7ded5fbd660f1e3ed58 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 23 Jun 2025 12:18:32 +0100
Subject: [PATCH 1264/1322] [VectorCombine][X86] fmaddsub.ll - add test
 variants without any undef elements

---
 .../Transforms/PhaseOrdering/X86/fmaddsub.ll  | 638 +++++++++++++-----
 1 file changed, 475 insertions(+), 163 deletions(-)

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
index ad4452431a48..5cb2c4530aa5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=bdver2    | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA4
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA3
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=bdver2    | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA,AVX_FMA4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA,AVX_FMA3
 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=bdver2    | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA4
-; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA3
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=bdver2    | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA,AVX_FMA4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA,AVX_FMA3
 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ; This test checks the vectorisation of FMUL+ADDSUB/FMADDSUB patterns, including cases with undef elements.
@@ -152,42 +152,134 @@ define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double>
 }
 
 define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
-; SSE-LABEL: @buildvector_mul_addsub_ps512(
-; SSE-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
-; SSE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
-; SSE-NEXT:    [[TMP2:%.*]] = fsub <12 x float> [[TMP0]], [[TMP1]]
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <12 x float> [[TMP0]], [[TMP1]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <12 x float> [[TMP2]], <12 x float> [[TMP3]], <12 x i32> <i32 0, i32 13, i32 2, i32 15, i32 4, i32 5, i32 18, i32 7, i32 20, i32 9, i32 22, i32 23>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
-; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <12 x float> [[TMP4]], <12 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 12, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 13, i32 11, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP8]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[VECINSERT161:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; SSE-NEXT:    ret <16 x float> [[VECINSERT161]]
+; SSE2-LABEL: @buildvector_mul_addsub_ps512(
+; SSE2-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; SSE2-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]]
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP4]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; SSE2-NEXT:    ret <16 x float> [[TMP5]]
+;
+; SSE4-LABEL: @buildvector_mul_addsub_ps512(
+; SSE4-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; SSE4-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]]
+; SSE4-NEXT:    [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]]
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; SSE4-NEXT:    ret <16 x float> [[TMP3]]
 ;
 ; AVX-LABEL: @buildvector_mul_addsub_ps512(
 ; AVX-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
-; AVX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
-; AVX-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[TMP0]], [[TMP1]]
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP0]], [[TMP1]]
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP3]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 5, i32 14, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
-; AVX-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[TMP8:%.*]] = fsub <4 x float> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT:    [[TMP11:%.*]] = fsub <2 x float> [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <16 x i32> <i32 0, i32 5, i32 2, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[VECINSERT141:%.*]] = shufflevector <16 x float> [[TMP13]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 12, i32 20, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP12]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[VECINSERT162:%.*]] = shufflevector <16 x float> [[VECINSERT141]], <16 x float> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; AVX-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]]
+; AVX-NEXT:    [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]]
+; AVX-NEXT:    [[VECINSERT162:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; AVX-NEXT:    ret <16 x float> [[VECINSERT162]]
+;
+  %A = fmul <16 x float> %C, %D
+  %A0 = extractelement <16 x float> %A, i32 0
+  %B0 = extractelement <16 x float> %B, i32 0
+  %sub0 = fsub float %A0, %B0
+  %A2 = extractelement <16 x float> %A, i32 2
+  %B2 = extractelement <16 x float> %B, i32 2
+  %sub2 = fsub float %A2, %B2
+  %A4 = extractelement <16 x float> %A, i32 4
+  %B4 = extractelement <16 x float> %B, i32 4
+  %sub4 = fsub float %A4, %B4
+  %A6 = extractelement <16 x float> %A, i32 6
+  %B6 = extractelement <16 x float> %B, i32 6
+  %sub6 = fsub float %A6, %B6
+  %A8 = extractelement <16 x float> %A, i32 8
+  %B8 = extractelement <16 x float> %B, i32 8
+  %sub8 = fsub float %A8, %B8
+  %A10 = extractelement <16 x float> %A, i32 10
+  %B10 = extractelement <16 x float> %B, i32 10
+  %sub10 = fsub float %A10, %B10
+  %A12 = extractelement <16 x float> %A, i32 12
+  %B12 = extractelement <16 x float> %B, i32 12
+  %sub12 = fsub float %A12, %B12
+  %A14 = extractelement <16 x float> %A, i32 14
+  %B14 = extractelement <16 x float> %B, i32 14
+  %sub14 = fsub float %A14, %B14
+  %A1 = extractelement <16 x float> %A, i32 1
+  %B1 = extractelement <16 x float> %B, i32 1
+  %add1 = fadd float %A1, %B1
+  %A3 = extractelement <16 x float> %A, i32 3
+  %B3 = extractelement <16 x float> %B, i32 3
+  %add3 = fadd float %A3, %B3
+  %A5 = extractelement <16 x float> %A, i32 5
+  %B5 = extractelement <16 x float> %B, i32 5
+  %add5 = fadd float %A5, %B5
+  %A7 = extractelement <16 x float> %A, i32 7
+  %B7 = extractelement <16 x float> %B, i32 7
+  %add7 = fadd float %A7, %B7
+  %A9 = extractelement <16 x float> %A, i32 9
+  %B9 = extractelement <16 x float> %B, i32 9
+  %add9 = fadd float %A9, %B9
+  %A11 = extractelement <16 x float> %A, i32 11
+  %B11 = extractelement <16 x float> %B, i32 11
+  %add11 = fadd float %A11, %B11
+  %A13 = extractelement <16 x float> %A, i32 13
+  %B13 = extractelement <16 x float> %B, i32 13
+  %add13 = fadd float %A13, %B13
+  %A15 = extractelement <16 x float> %A, i32 15
+  %B15 = extractelement <16 x float> %B, i32 15
+  %add15 = fadd float %A15, %B15
+  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
+  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
+  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
+  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
+  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
+  %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add5, i32 5
+  %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub6, i32 6
+  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
+  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
+  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
+  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
+  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
+  %vecinsert13 = insertelement <16 x float> %vecinsert12, float %sub12, i32 12
+  %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add13, i32 13
+  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
+  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
+  ret <16 x float> %vecinsert16
+}
+
+define <16 x float> @buildvector_mul_addsub_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+; SSE-LABEL: @buildvector_mul_addsub_ps512_partial(
+; SSE-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <12 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = fadd <12 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <12 x float> [[TMP3]], <12 x float> [[TMP4]], <12 x i32> <i32 0, i32 13, i32 2, i32 15, i32 4, i32 5, i32 18, i32 7, i32 20, i32 9, i32 22, i32 23>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; SSE-NEXT:    [[TMP8:%.*]] = fsub <2 x float> [[TMP6]], [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[TMP6]], [[TMP7]]
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <12 x float> [[TMP5]], <12 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 12, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 13, i32 11, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[VECINSERT161:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:    ret <16 x float> [[VECINSERT161]]
+;
+; AVX-LABEL: @buildvector_mul_addsub_ps512_partial(
+; AVX-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 5, i32 14, i32 7>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
+; AVX-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP6]], [[TMP7]]
+; AVX-NEXT:    [[TMP9:%.*]] = fsub <4 x float> [[TMP6]], [[TMP7]]
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; AVX-NEXT:    [[TMP12:%.*]] = fsub <2 x float> [[TMP10]], [[TMP11]]
+; AVX-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[TMP10]], [[TMP11]]
+; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP15:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <16 x i32> <i32 0, i32 5, i32 2, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[VECINSERT141:%.*]] = shufflevector <16 x float> [[TMP14]], <16 x float> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 12, i32 20, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP13]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[VECINSERT162:%.*]] = shufflevector <16 x float> [[VECINSERT141]], <16 x float> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
 ; AVX-NEXT:    ret <16 x float> [[VECINSERT162]]
 ;
   %A = fmul <16 x float> %C, %D
@@ -259,67 +351,127 @@ define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float>
 }
 
 define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
-; SSE-LABEL: @buildvector_mul_addsub_pd512(
+; SSE2-LABEL: @buildvector_mul_addsub_pd512(
+; SSE2-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; SSE2-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE2-NEXT:    ret <8 x double> [[TMP5]]
+;
+; SSE4-LABEL: @buildvector_mul_addsub_pd512(
+; SSE4-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; SSE4-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
+; SSE4-NEXT:    [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]]
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; SSE4-NEXT:    ret <8 x double> [[TMP3]]
+;
+; AVX-LABEL: @buildvector_mul_addsub_pd512(
+; AVX-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
+; AVX-NEXT:    [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; AVX-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %A = fmul <8 x double> %C, %D
+  %A0 = extractelement <8 x double> %A, i32 0
+  %B0 = extractelement <8 x double> %B, i32 0
+  %sub0 = fsub double %A0, %B0
+  %A2 = extractelement <8 x double> %A, i32 2
+  %B2 = extractelement <8 x double> %B, i32 2
+  %sub2 = fsub double %A2, %B2
+  %A4 = extractelement <8 x double> %A, i32 4
+  %B4 = extractelement <8 x double> %B, i32 4
+  %sub4 = fsub double %A4, %B4
+  %A6 = extractelement <8 x double> %A, i32 6
+  %B6 = extractelement <8 x double> %B, i32 6
+  %sub6 = fsub double %A6, %B6
+  %A1 = extractelement <8 x double> %A, i32 1
+  %B1 = extractelement <8 x double> %B, i32 1
+  %add1 = fadd double %A1, %B1
+  %A3 = extractelement <8 x double> %A, i32 3
+  %B3 = extractelement <8 x double> %B, i32 3
+  %add3 = fadd double %A3, %B3
+  %A5 = extractelement <8 x double> %A, i32 5
+  %B5 = extractelement <8 x double> %B, i32 5
+  %add5 = fadd double %A5, %B5
+  %A7 = extractelement <8 x double> %A, i32 7
+  %B7 = extractelement <8 x double> %B, i32 7
+  %add7 = fadd double %A7, %B7
+  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
+  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
+  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
+  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
+  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
+  %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add5, i32 5
+  %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub6, i32 6
+  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
+  ret <8 x double> %vecinsert8
+}
+
+define <8 x double> @buildvector_mul_addsub_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+; SSE-LABEL: @buildvector_mul_addsub_pd512_partial(
 ; SSE-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
-; SSE-NEXT:    [[TMP0:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE-NEXT:    [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]]
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <2 x i32> <i32 1, i32 3>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> [[TMP5]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; SSE-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <2 x i32> <i32 1, i32 3>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
 ; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
 ; SSE-NEXT:    [[ADD7:%.*]] = fadd double [[A7]], [[B7]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP6]], <6 x double> <double undef, double poison, double poison, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 poison>
-; SSE-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP7]], double [[ADD7]], i64 7
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> <double undef, double poison, double poison, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 poison>
+; SSE-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP8]], double [[ADD7]], i64 7
 ; SSE-NEXT:    ret <8 x double> [[VECINSERT8]]
 ;
-; AVX_FMA4-LABEL: @buildvector_mul_addsub_pd512(
+; AVX_FMA4-LABEL: @buildvector_mul_addsub_pd512_partial(
 ; AVX_FMA4-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
-; AVX_FMA4-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX_FMA4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX_FMA4-NEXT:    [[TMP2:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
-; AVX_FMA4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
-; AVX_FMA4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX_FMA4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX_FMA4-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX_FMA4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 ; AVX_FMA4-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
 ; AVX_FMA4-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
 ; AVX_FMA4-NEXT:    [[ADD7:%.*]] = fadd double [[A7]], [[B7]]
-; AVX_FMA4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
-; AVX_FMA4-NEXT:    [[TMP6:%.*]] = fsub <8 x double> [[A]], [[B]]
-; AVX_FMA4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA4-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX_FMA4-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[TMP7:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX_FMA4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
 ; AVX_FMA4-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[VECINSERT71]], double [[ADD7]], i64 7
 ; AVX_FMA4-NEXT:    ret <8 x double> [[VECINSERT8]]
 ;
-; AVX_FMA3-LABEL: @buildvector_mul_addsub_pd512(
+; AVX_FMA3-LABEL: @buildvector_mul_addsub_pd512_partial(
 ; AVX_FMA3-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
-; AVX_FMA3-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX_FMA3-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX_FMA3-NEXT:    [[TMP2:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
-; AVX_FMA3-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
-; AVX_FMA3-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; AVX_FMA3-NEXT:    [[TMP5:%.*]] = fadd <8 x double> [[A]], [[B]]
-; AVX_FMA3-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
-; AVX_FMA3-NEXT:    [[TMP7:%.*]] = fsub <8 x double> [[A]], [[B]]
-; AVX_FMA3-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA3-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
-; AVX_FMA3-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX_FMA3-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA3-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA3-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX_FMA3-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX_FMA3-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX_FMA3-NEXT:    [[TMP6:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX_FMA3-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX_FMA3-NEXT:    [[TMP8:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX_FMA3-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA3-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX_FMA3-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
 ; AVX_FMA3-NEXT:    ret <8 x double> [[VECINSERT8]]
 ;
-; AVX512-LABEL: @buildvector_mul_addsub_pd512(
+; AVX512-LABEL: @buildvector_mul_addsub_pd512_partial(
 ; AVX512-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
-; AVX512-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP2:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
-; AVX512-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
-; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; AVX512-NEXT:    [[TMP5:%.*]] = fadd <8 x double> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP7:%.*]] = fsub <8 x double> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
-; AVX512-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX512-NEXT:    [[TMP6:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP8:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX512-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
 ; AVX512-NEXT:    ret <8 x double> [[VECINSERT8]]
 ;
   %A = fmul <8 x double> %C, %D
@@ -516,40 +668,136 @@ define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double>
 define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
 ; SSE-LABEL: @buildvector_mul_subadd_ps512(
 ; SSE-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
-; SSE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
-; SSE-NEXT:    [[TMP2:%.*]] = fadd <12 x float> [[TMP0]], [[TMP1]]
-; SSE-NEXT:    [[TMP3:%.*]] = fsub <12 x float> [[TMP0]], [[TMP1]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <12 x float> [[TMP2]], <12 x float> [[TMP3]], <12 x i32> <i32 0, i32 13, i32 2, i32 15, i32 4, i32 5, i32 18, i32 7, i32 20, i32 9, i32 22, i32 23>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
-; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT:    [[TMP8:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <12 x float> [[TMP4]], <12 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 12, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 13, i32 11, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP8]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[VECINSERT161:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <16 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE-NEXT:    [[VECINSERT161:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP4]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 ; SSE-NEXT:    ret <16 x float> [[VECINSERT161]]
 ;
-; AVX-LABEL: @buildvector_mul_subadd_ps512(
+; AVX_FMA-LABEL: @buildvector_mul_subadd_ps512(
+; AVX_FMA-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; AVX_FMA-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]]
+; AVX_FMA-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]]
+; AVX_FMA-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[TMP4]], <16 x float> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; AVX_FMA-NEXT:    ret <16 x float> [[TMP7]]
+;
+; AVX512-LABEL: @buildvector_mul_subadd_ps512(
+; AVX512-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP10:%.*]] = fsub <16 x float> [[A]], [[B]]
+; AVX512-NEXT:    [[VECINSERT162:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP10]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; AVX512-NEXT:    ret <16 x float> [[VECINSERT162]]
+;
+  %A = fmul <16 x float> %C, %D
+  %A0 = extractelement <16 x float> %A, i32 0
+  %B0 = extractelement <16 x float> %B, i32 0
+  %sub0 = fadd float %A0, %B0
+  %A2 = extractelement <16 x float> %A, i32 2
+  %B2 = extractelement <16 x float> %B, i32 2
+  %sub2 = fadd float %A2, %B2
+  %A4 = extractelement <16 x float> %A, i32 4
+  %B4 = extractelement <16 x float> %B, i32 4
+  %sub4 = fadd float %A4, %B4
+  %A6 = extractelement <16 x float> %A, i32 6
+  %B6 = extractelement <16 x float> %B, i32 6
+  %sub6 = fadd float %A6, %B6
+  %A8 = extractelement <16 x float> %A, i32 8
+  %B8 = extractelement <16 x float> %B, i32 8
+  %sub8 = fadd float %A8, %B8
+  %A10 = extractelement <16 x float> %A, i32 10
+  %B10 = extractelement <16 x float> %B, i32 10
+  %sub10 = fadd float %A10, %B10
+  %A12 = extractelement <16 x float> %A, i32 12
+  %B12 = extractelement <16 x float> %B, i32 12
+  %sub12 = fadd float %A12, %B12
+  %A14 = extractelement <16 x float> %A, i32 14
+  %B14 = extractelement <16 x float> %B, i32 14
+  %sub14 = fadd float %A14, %B14
+  %A1 = extractelement <16 x float> %A, i32 1
+  %B1 = extractelement <16 x float> %B, i32 1
+  %add1 = fsub float %A1, %B1
+  %A3 = extractelement <16 x float> %A, i32 3
+  %B3 = extractelement <16 x float> %B, i32 3
+  %add3 = fsub float %A3, %B3
+  %A5 = extractelement <16 x float> %A, i32 5
+  %B5 = extractelement <16 x float> %B, i32 5
+  %add5 = fsub float %A5, %B5
+  %A7 = extractelement <16 x float> %A, i32 7
+  %B7 = extractelement <16 x float> %B, i32 7
+  %add7 = fsub float %A7, %B7
+  %A9 = extractelement <16 x float> %A, i32 9
+  %B9 = extractelement <16 x float> %B, i32 9
+  %add9 = fsub float %A9, %B9
+  %A11 = extractelement <16 x float> %A, i32 11
+  %B11 = extractelement <16 x float> %B, i32 11
+  %add11 = fsub float %A11, %B11
+  %A13 = extractelement <16 x float> %A, i32 13
+  %B13 = extractelement <16 x float> %B, i32 13
+  %add13 = fsub float %A13, %B13
+  %A15 = extractelement <16 x float> %A, i32 15
+  %B15 = extractelement <16 x float> %B, i32 15
+  %add15 = fsub float %A15, %B15
+  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
+  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
+  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
+  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
+  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
+  %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add5, i32 5
+  %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub6, i32 6
+  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
+  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
+  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
+  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
+  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
+  %vecinsert13 = insertelement <16 x float> %vecinsert12, float %sub12, i32 12
+  %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add13, i32 13
+  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
+  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
+  ret <16 x float> %vecinsert16
+}
+
+define <16 x float> @buildvector_mul_subadd_ps512_partial(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+; SSE-LABEL: @buildvector_mul_subadd_ps512_partial(
+; SSE-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <12 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = fsub <12 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <12 x float> [[TMP3]], <12 x float> [[TMP4]], <12 x i32> <i32 0, i32 13, i32 2, i32 15, i32 4, i32 5, i32 18, i32 7, i32 20, i32 9, i32 22, i32 23>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP6]], [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = fsub <2 x float> [[TMP6]], [[TMP7]]
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <12 x float> [[TMP5]], <12 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 12, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 13, i32 11, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[VECINSERT161:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; SSE-NEXT:    ret <16 x float> [[VECINSERT161]]
+;
+; AVX-LABEL: @buildvector_mul_subadd_ps512_partial(
 ; AVX-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
-; AVX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
-; AVX-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[TMP0]], [[TMP1]]
-; AVX-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP0]], [[TMP1]]
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP3]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 5, i32 14, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
-; AVX-NEXT:    [[TMP7:%.*]] = fsub <4 x float> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
-; AVX-NEXT:    [[TMP11:%.*]] = fadd <2 x float> [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = fsub <2 x float> [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <16 x i32> <i32 0, i32 5, i32 2, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[VECINSERT141:%.*]] = shufflevector <16 x float> [[TMP13]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 12, i32 20, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP12]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[VECINSERT162:%.*]] = shufflevector <16 x float> [[VECINSERT141]], <16 x float> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 8>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 5, i32 14, i32 7>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 9, i32 10, i32 11, i32 13>
+; AVX-NEXT:    [[TMP8:%.*]] = fsub <4 x float> [[TMP6]], [[TMP7]]
+; AVX-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP7]]
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> <i32 14, i32 15>
+; AVX-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP10]], [[TMP11]]
+; AVX-NEXT:    [[TMP13:%.*]] = fsub <2 x float> [[TMP10]], [[TMP11]]
+; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> <float undef, float undef, float poison, float poison, float poison, float poison, float poison, float poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP15:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <16 x i32> <i32 0, i32 5, i32 2, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[VECINSERT141:%.*]] = shufflevector <16 x float> [[TMP14]], <16 x float> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 12, i32 20, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP13]], <16 x i32> <i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[VECINSERT162:%.*]] = shufflevector <16 x float> [[VECINSERT141]], <16 x float> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
 ; AVX-NEXT:    ret <16 x float> [[VECINSERT162]]
 ;
   %A = fmul <16 x float> %C, %D
@@ -626,62 +874,126 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double>
 ; SSE-NEXT:    [[TMP0:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; SSE-NEXT:    [[TMP2:%.*]] = fsub <8 x double> [[A]], [[B]]
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <2 x i32> <i32 1, i32 3>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> [[TMP5]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
-; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
-; SSE-NEXT:    [[ADD7:%.*]] = fsub double [[A7]], [[B7]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP6]], <6 x double> <double undef, double poison, double poison, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 poison>
-; SSE-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP7]], double [[ADD7]], i64 7
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[VECINSERT8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; SSE-NEXT:    ret <8 x double> [[VECINSERT8]]
 ;
-; AVX_FMA4-LABEL: @buildvector_mul_subadd_pd512(
-; AVX_FMA4-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
-; AVX_FMA4-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX_FMA4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX_FMA4-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
-; AVX_FMA4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
-; AVX_FMA4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; AVX_FMA4-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
-; AVX_FMA4-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
-; AVX_FMA4-NEXT:    [[ADD7:%.*]] = fsub double [[A7]], [[B7]]
-; AVX_FMA4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
-; AVX_FMA4-NEXT:    [[TMP6:%.*]] = fadd <8 x double> [[A]], [[B]]
-; AVX_FMA4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA4-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
-; AVX_FMA4-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[VECINSERT71]], double [[ADD7]], i64 7
-; AVX_FMA4-NEXT:    ret <8 x double> [[VECINSERT8]]
-;
-; AVX_FMA3-LABEL: @buildvector_mul_subadd_pd512(
-; AVX_FMA3-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
-; AVX_FMA3-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX_FMA3-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX_FMA3-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
-; AVX_FMA3-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
-; AVX_FMA3-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; AVX_FMA3-NEXT:    [[TMP5:%.*]] = fsub <8 x double> [[A]], [[B]]
-; AVX_FMA3-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
-; AVX_FMA3-NEXT:    [[TMP7:%.*]] = fadd <8 x double> [[A]], [[B]]
-; AVX_FMA3-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA3-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
-; AVX_FMA3-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
-; AVX_FMA3-NEXT:    ret <8 x double> [[VECINSERT8]]
+; AVX_FMA-LABEL: @buildvector_mul_subadd_pd512(
+; AVX_FMA-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX_FMA-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
+; AVX_FMA-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX_FMA-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA-NEXT:    [[TMP6:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; AVX_FMA-NEXT:    ret <8 x double> [[TMP7]]
 ;
 ; AVX512-LABEL: @buildvector_mul_subadd_pd512(
 ; AVX512-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
-; AVX512-NEXT:    [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]]
-; AVX512-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]]
-; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; AVX512-NEXT:    [[TMP5:%.*]] = fsub <8 x double> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
-; AVX512-NEXT:    [[TMP7:%.*]] = fadd <8 x double> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
-; AVX512-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX512-NEXT:    [[TMP7:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP2:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX512-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; AVX512-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+  %A = fmul <8 x double> %C, %D
+  %A0 = extractelement <8 x double> %A, i32 0
+  %B0 = extractelement <8 x double> %B, i32 0
+  %sub0 = fadd double %A0, %B0
+  %A2 = extractelement <8 x double> %A, i32 2
+  %B2 = extractelement <8 x double> %B, i32 2
+  %sub2 = fadd double %A2, %B2
+  %A4 = extractelement <8 x double> %A, i32 4
+  %B4 = extractelement <8 x double> %B, i32 4
+  %sub4 = fadd double %A4, %B4
+  %A6 = extractelement <8 x double> %A, i32 6
+  %B6 = extractelement <8 x double> %B, i32 6
+  %sub6 = fadd double %A6, %B6
+  %A1 = extractelement <8 x double> %A, i32 1
+  %B1 = extractelement <8 x double> %B, i32 1
+  %add1 = fsub double %A1, %B1
+  %A3 = extractelement <8 x double> %A, i32 3
+  %B3 = extractelement <8 x double> %B, i32 3
+  %add3 = fsub double %A3, %B3
+  %A5 = extractelement <8 x double> %A, i52 5
+  %B5 = extractelement <8 x double> %B, i52 5
+  %add5 = fsub double %A5, %B5
+  %A7 = extractelement <8 x double> %A, i32 7
+  %B7 = extractelement <8 x double> %B, i32 7
+  %add7 = fsub double %A7, %B7
+  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
+  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
+  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
+  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
+  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
+  %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add5, i32 5
+  %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub6, i32 6
+  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
+  ret <8 x double> %vecinsert8
+}
+
+define <8 x double> @buildvector_mul_subadd_pd512_partial(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+; SSE-LABEL: @buildvector_mul_subadd_pd512_partial(
+; SSE-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <8 x double> [[A]], [[B]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <2 x i32> <i32 1, i32 3>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
+; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
+; SSE-NEXT:    [[ADD7:%.*]] = fsub double [[A7]], [[B7]]
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> <double undef, double poison, double poison, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 poison>
+; SSE-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP8]], double [[ADD7]], i64 7
+; SSE-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+; AVX_FMA4-LABEL: @buildvector_mul_subadd_pd512_partial(
+; AVX_FMA4-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX_FMA4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX_FMA4-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX_FMA4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX_FMA4-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
+; AVX_FMA4-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
+; AVX_FMA4-NEXT:    [[ADD7:%.*]] = fsub double [[A7]], [[B7]]
+; AVX_FMA4-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[TMP7:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX_FMA4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX_FMA4-NEXT:    [[VECINSERT8:%.*]] = insertelement <8 x double> [[VECINSERT71]], double [[ADD7]], i64 7
+; AVX_FMA4-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+; AVX_FMA3-LABEL: @buildvector_mul_subadd_pd512_partial(
+; AVX_FMA3-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX_FMA3-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA3-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX_FMA3-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX_FMA3-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX_FMA3-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX_FMA3-NEXT:    [[TMP6:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX_FMA3-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX_FMA3-NEXT:    [[TMP8:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX_FMA3-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA3-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX_FMA3-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX_FMA3-NEXT:    ret <8 x double> [[VECINSERT8]]
+;
+; AVX512-LABEL: @buildvector_mul_subadd_pd512_partial(
+; AVX512-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; AVX512-NEXT:    [[TMP6:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> <double undef, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 4, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP8:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX512-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> poison, <8 x i32> <i32 4, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 10, i32 poison>
+; AVX512-NEXT:    [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
 ; AVX512-NEXT:    ret <8 x double> [[VECINSERT8]]
 ;
   %A = fmul <8 x double> %C, %D

From 6d8d4cf9a46b3729732736ffe288f6b722d85121 Mon Sep 17 00:00:00 2001
From: Dmitry Vasilyev <dvassiliev@accesssoftek.com>
Date: Mon, 23 Jun 2025 15:25:08 +0400
Subject: [PATCH 1265/1322] [lldb] Disable flaky TestDetachResumes.py on
 Windows x86_64 (#145301)

See #144891 for details.
---
 .../API/commands/process/detach-resumes/TestDetachResumes.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py b/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py
index 57727294ddc3..db730574124b 100644
--- a/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py
+++ b/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py
@@ -12,6 +12,11 @@ from lldbsuite.test import lldbutil
 class DetachResumesTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
+    @skipIf(
+        oslist=["windows"],
+        archs=["x86_64"],
+        bugnumber="github.com/llvm/llvm-project/issues/144891",
+    )
     def test_detach_resumes(self):
         self.build()
         exe = self.getBuildArtifact()

From 1e95349dbe329938d2962a78baa0ec421e9cd7d1 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@chromium.org>
Date: Mon, 23 Jun 2025 13:14:09 +0200
Subject: [PATCH 1266/1322] Revert "ELF: Add branch-to-branch optimization."

This caused assertion failures in applyBranchToBranchOpt():

  llvm/include/llvm/Support/Casting.h:578:
  decltype(auto) llvm::cast(From*)
  [with To = lld::elf::InputSection; From = lld::elf::InputSectionBase]:
  Assertion `isa<To>(Val) && "cast<Ty>() argument of incompatible type!"' failed.

See comment on the PR (https://github.com/llvm/llvm-project/pull/138366)

This reverts commit 491b82a5ec1add78d2c93370580a2f1897b6a364.

This also reverts the follow-up "[lld] Use llvm::partition_point (NFC) (#145209)"

This reverts commit 2ac293f5ac4cf65c0c038bf75a88f1d6715e467d.
---
 lld/ELF/Arch/AArch64.cpp                |  58 -----------
 lld/ELF/Arch/TargetImpl.h               |  93 -----------------
 lld/ELF/Arch/X86_64.cpp                 |  68 ------------
 lld/ELF/Config.h                        |   1 -
 lld/ELF/Driver.cpp                      |   2 -
 lld/ELF/InputSection.cpp                |   5 +-
 lld/ELF/Options.td                      |   4 -
 lld/ELF/Relocations.cpp                 |   8 +-
 lld/ELF/Target.h                        |   1 -
 lld/docs/ReleaseNotes.rst               |   4 -
 lld/docs/ld.lld.1                       |   9 +-
 lld/test/ELF/aarch64-branch-to-branch.s |  82 ---------------
 lld/test/ELF/x86-64-branch-to-branch.s  | 133 ------------------------
 13 files changed, 6 insertions(+), 462 deletions(-)
 delete mode 100644 lld/ELF/Arch/TargetImpl.h
 delete mode 100644 lld/test/ELF/aarch64-branch-to-branch.s
 delete mode 100644 lld/test/ELF/x86-64-branch-to-branch.s

diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 1812f2af419d..8a225ed103ee 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -11,7 +11,6 @@
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
-#include "TargetImpl.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Support/Endian.h"
 
@@ -83,7 +82,6 @@ public:
                 uint64_t val) const override;
   RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override;
   void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override;
-  void applyBranchToBranchOpt() const override;
 
 private:
   void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
@@ -976,62 +974,6 @@ void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   }
 }
 
-static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
-                                                        Relocation &r) {
-  // Identify a control transfer relocation for the branch-to-branch
-  // optimization. A "control transfer relocation" means a B or BL
-  // target but it also includes relative vtable relocations for example.
-  //
-  // We require the relocation type to be JUMP26, CALL26 or PLT32. With a
-  // relocation type of PLT32 the value may be assumed to be used for branching
-  // directly to the symbol and the addend is only used to produce the relocated
-  // value (hence the effective addend is always 0). This is because if a PLT is
-  // needed the addend will be added to the address of the PLT, and it doesn't
-  // make sense to branch into the middle of a PLT. For example, relative vtable
-  // relocations use PLT32 and 0 or a positive value as the addend but still are
-  // used to branch to the symbol.
-  //
-  // With JUMP26 or CALL26 the only reasonable interpretation of a non-zero
-  // addend is that we are branching to symbol+addend so that becomes the
-  // effective addend.
-  if (r.type == R_AARCH64_PLT32)
-    return 0;
-  if (r.type == R_AARCH64_JUMP26 || r.type == R_AARCH64_CALL26)
-    return r.addend;
-  return std::nullopt;
-}
-
-static std::pair<Relocation *, uint64_t>
-getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
-  auto *i = llvm::partition_point(
-      is.relocations, [&](Relocation &r) { return r.offset < offset; });
-  if (i != is.relocations.end() && i->offset == offset &&
-      i->type == R_AARCH64_JUMP26) {
-    return {i, i->addend};
-  }
-  return {nullptr, 0};
-}
-
-static void redirectControlTransferRelocations(Relocation &r1,
-                                               const Relocation &r2) {
-  r1.expr = r2.expr;
-  r1.sym = r2.sym;
-  // With PLT32 we must respect the original addend as that affects the value's
-  // interpretation. With the other relocation types the original addend is
-  // irrelevant because it referred to an offset within the original target
-  // section so we overwrite it.
-  if (r1.type == R_AARCH64_PLT32)
-    r1.addend += r2.addend;
-  else
-    r1.addend = r2.addend;
-}
-
-void AArch64::applyBranchToBranchOpt() const {
-  applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
-                             getBranchInfoAtTarget,
-                             redirectControlTransferRelocations);
-}
-
 // AArch64 may use security features in variant PLT sequences. These are:
 // Pointer Authentication (PAC), introduced in armv8.3-a and Branch Target
 // Indicator (BTI) introduced in armv8.5-a. The additional instructions used
diff --git a/lld/ELF/Arch/TargetImpl.h b/lld/ELF/Arch/TargetImpl.h
deleted file mode 100644
index f1206570d3e3..000000000000
--- a/lld/ELF/Arch/TargetImpl.h
+++ /dev/null
@@ -1,93 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLD_ELF_ARCH_TARGETIMPL_H
-#define LLD_ELF_ARCH_TARGETIMPL_H
-
-#include "InputFiles.h"
-#include "InputSection.h"
-#include "Relocations.h"
-#include "Symbols.h"
-#include "llvm/BinaryFormat/ELF.h"
-
-namespace lld::elf {
-
-// getControlTransferAddend: If this relocation is used for control transfer
-// instructions (e.g. branch, branch-link or call) or code references (e.g.
-// virtual function pointers) and indicates an address-insignificant reference,
-// return the effective addend for the relocation, otherwise return
-// std::nullopt. The effective addend for a relocation is the addend that is
-// used to determine its branch destination.
-//
-// getBranchInfoAtTarget: If a control transfer relocation referring to
-// is+offset directly transfers control to a relocated branch instruction in the
-// specified section, return the relocation for the branch target as well as its
-// effective addend (see above). Otherwise return {nullptr, 0}.
-//
-// redirectControlTransferRelocations: Given r1, a relocation for which
-// getControlTransferAddend() returned a value, and r2, a relocation returned by
-// getBranchInfo(), modify r1 so that it branches directly to the target of r2.
-template <typename GetControlTransferAddend, typename GetBranchInfoAtTarget,
-          typename RedirectControlTransferRelocations>
-inline void applyBranchToBranchOptImpl(
-    Ctx &ctx, GetControlTransferAddend getControlTransferAddend,
-    GetBranchInfoAtTarget getBranchInfoAtTarget,
-    RedirectControlTransferRelocations redirectControlTransferRelocations) {
-  // Needs to run serially because it writes to the relocations array as well as
-  // reading relocations of other sections.
-  for (ELFFileBase *f : ctx.objectFiles) {
-    auto getRelocBranchInfo =
-        [&getBranchInfoAtTarget](
-            Relocation &r,
-            uint64_t addend) -> std::pair<Relocation *, uint64_t> {
-      auto *target = dyn_cast_or_null<Defined>(r.sym);
-      // We don't allow preemptible symbols or ifuncs (may go somewhere else),
-      // absolute symbols (runtime behavior unknown), non-executable or writable
-      // memory (ditto) or non-regular sections (no section data).
-      if (!target || target->isPreemptible || target->isGnuIFunc() ||
-          !target->section ||
-          !(target->section->flags & llvm::ELF::SHF_EXECINSTR) ||
-          (target->section->flags & llvm::ELF::SHF_WRITE) ||
-          target->section->kind() != SectionBase::Regular)
-        return {nullptr, 0};
-      return getBranchInfoAtTarget(*cast<InputSection>(target->section),
-                                   target->value + addend);
-    };
-    for (InputSectionBase *s : f->getSections()) {
-      if (!s)
-        continue;
-      for (Relocation &r : s->relocations) {
-        std::optional<uint64_t> addend =
-            getControlTransferAddend(*cast<InputSection>(s), r);
-        if (!addend)
-          continue;
-        std::pair<Relocation *, uint64_t> targetAndAddend =
-            getRelocBranchInfo(r, *addend);
-        if (!targetAndAddend.first)
-          continue;
-        // Avoid getting stuck in an infinite loop if we encounter a branch
-        // that (possibly indirectly) branches to itself. It is unlikely
-        // that more than 5 iterations will ever be needed in practice.
-        size_t iterations = 5;
-        while (iterations--) {
-          std::pair<Relocation *, uint64_t> nextTargetAndAddend =
-              getRelocBranchInfo(*targetAndAddend.first,
-                                 targetAndAddend.second);
-          if (!nextTargetAndAddend.first)
-            break;
-          targetAndAddend = nextTargetAndAddend;
-        }
-        redirectControlTransferRelocations(r, *targetAndAddend.first);
-      }
-    }
-  }
-}
-
-} // namespace lld::elf
-
-#endif
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index 163505102d0e..974da4d96320 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -11,7 +11,6 @@
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
-#include "TargetImpl.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/MathExtras.h"
@@ -50,7 +49,6 @@ public:
   bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
                              InputSection *nextIS) const override;
   bool relaxOnce(int pass) const override;
-  void applyBranchToBranchOpt() const override;
 
 private:
   void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
@@ -1163,72 +1161,6 @@ void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   }
 }
 
-static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
-                                                        Relocation &r) {
-  // Identify a control transfer relocation for the branch-to-branch
-  // optimization. A "control transfer relocation" usually means a CALL or JMP
-  // target but it also includes relative vtable relocations for example.
-  //
-  // We require the relocation type to be PLT32. With a relocation type of PLT32
-  // the value may be assumed to be used for branching directly to the symbol
-  // and the addend is only used to produce the relocated value (hence the
-  // effective addend is always 0). This is because if a PLT is needed the
-  // addend will be added to the address of the PLT, and it doesn't make sense
-  // to branch into the middle of a PLT. For example, relative vtable
-  // relocations use PLT32 and 0 or a positive value as the addend but still are
-  // used to branch to the symbol.
-  //
-  // STT_SECTION symbols are a special case on x86 because the LLVM assembler
-  // uses them for branches to local symbols which are assembled as referring to
-  // the section symbol with the addend equal to the symbol value - 4.
-  if (r.type == R_X86_64_PLT32) {
-    if (r.sym->isSection())
-      return r.addend + 4;
-    return 0;
-  }
-  return std::nullopt;
-}
-
-static std::pair<Relocation *, uint64_t>
-getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
-  auto content = is.contentMaybeDecompress();
-  if (content.size() > offset && content[offset] == 0xe9) { // JMP immediate
-    auto *i = llvm::partition_point(
-        is.relocations, [&](Relocation &r) { return r.offset < offset + 1; });
-    // Unlike with getControlTransferAddend() it is valid to accept a PC32
-    // relocation here because we know that this is actually a JMP and not some
-    // other reference, so the interpretation is that we add 4 to the addend and
-    // use that as the effective addend.
-    if (i != is.relocations.end() && i->offset == offset + 1 &&
-        (i->type == R_X86_64_PC32 || i->type == R_X86_64_PLT32)) {
-      return {i, i->addend + 4};
-    }
-  }
-  return {nullptr, 0};
-}
-
-static void redirectControlTransferRelocations(Relocation &r1,
-                                               const Relocation &r2) {
-  // The isSection() check handles the STT_SECTION case described above.
-  // In that case the original addend is irrelevant because it referred to an
-  // offset within the original target section so we overwrite it.
-  //
-  // The +4 is here to compensate for r2.addend which will likely be -4,
-  // but may also be addend-4 in case of a PC32 branch to symbol+addend.
-  if (r1.sym->isSection())
-    r1.addend = r2.addend;
-  else
-    r1.addend += r2.addend + 4;
-  r1.expr = r2.expr;
-  r1.sym = r2.sym;
-}
-
-void X86_64::applyBranchToBranchOpt() const {
-  applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
-                             getBranchInfoAtTarget,
-                             redirectControlTransferRelocations);
-}
-
 // If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
 // entries containing endbr64 instructions. A PLT entry will be split into two
 // parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 88bda41d3648..2b72d54ba410 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -302,7 +302,6 @@ struct Config {
   bool bpFunctionOrderForCompression = false;
   bool bpDataOrderForCompression = false;
   bool bpVerboseSectionOrderer = false;
-  bool branchToBranch = false;
   bool checkSections;
   bool checkDynamicRelocs;
   std::optional<llvm::DebugCompressionType> compressDebugSections;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 1e0b5988343a..7e132a387a04 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1644,8 +1644,6 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) {
   ctx.arg.zWxneeded = hasZOption(args, "wxneeded");
   setUnresolvedSymbolPolicy(ctx, args);
   ctx.arg.power10Stubs = args.getLastArgValue(OPT_power10_stubs_eq) != "no";
-  ctx.arg.branchToBranch = args.hasFlag(
-      OPT_branch_to_branch, OPT_no_branch_to_branch, ctx.arg.optimize >= 2);
 
   if (opt::Arg *arg = args.getLastArg(OPT_eb, OPT_el)) {
     if (arg->getOption().matches(OPT_eb))
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index f8786265029e..0ce0f08d0387 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -430,9 +430,8 @@ InputSectionBase *InputSection::getRelocatedSection() const {
 
 template <class ELFT, class RelTy>
 void InputSection::copyRelocations(Ctx &ctx, uint8_t *buf) {
-  bool linkerRelax =
-      ctx.arg.relax && is_contained({EM_RISCV, EM_LOONGARCH}, ctx.arg.emachine);
-  if (!ctx.arg.relocatable && (linkerRelax || ctx.arg.branchToBranch)) {
+  if (ctx.arg.relax && !ctx.arg.relocatable &&
+      (ctx.arg.emachine == EM_RISCV || ctx.arg.emachine == EM_LOONGARCH)) {
     // On LoongArch and RISC-V, relaxation might change relocations: copy
     // from internal ones that are updated by relaxation.
     InputSectionBase *sec = getRelocatedSection();
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index d7e331316700..c795147eb966 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -59,10 +59,6 @@ def build_id: J<"build-id=">, HelpText<"Generate build ID note">,
   MetaVarName<"[fast,md5,sha1,uuid,0x<hexstring>]">;
 def : F<"build-id">, Alias<build_id>, AliasArgs<["sha1"]>, HelpText<"Alias for --build-id=sha1">;
 
-defm branch_to_branch: BB<"branch-to-branch",
-    "Enable branch-to-branch optimization (default at -O2)",
-    "Disable branch-to-branch optimization (default at -O0 and -O1)">;
-
 defm check_sections: B<"check-sections",
     "Check section addresses for overlaps (default)",
     "Do not check section addresses for overlaps">;
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 43f19186f098..6c4209a2b81e 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1665,10 +1665,9 @@ void RelocationScanner::scan(Relocs<RelTy> rels) {
   }
 
   // Sort relocations by offset for more efficient searching for
-  // R_RISCV_PCREL_HI20, R_PPC64_ADDR64 and the branch-to-branch optimization.
+  // R_RISCV_PCREL_HI20 and R_PPC64_ADDR64.
   if (ctx.arg.emachine == EM_RISCV ||
-      (ctx.arg.emachine == EM_PPC64 && sec->name == ".toc") ||
-      ctx.arg.branchToBranch)
+      (ctx.arg.emachine == EM_PPC64 && sec->name == ".toc"))
     llvm::stable_sort(sec->relocs(),
                       [](const Relocation &lhs, const Relocation &rhs) {
                         return lhs.offset < rhs.offset;
@@ -1959,9 +1958,6 @@ void elf::postScanRelocations(Ctx &ctx) {
   for (ELFFileBase *file : ctx.objectFiles)
     for (Symbol *sym : file->getLocalSymbols())
       fn(*sym);
-
-  if (ctx.arg.branchToBranch)
-    ctx.target->applyBranchToBranchOpt();
 }
 
 static bool mergeCmp(const InputSection *a, const InputSection *b) {
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index 6dd20b2f0cba..fd1e5d33c438 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -101,7 +101,6 @@ public:
 
   virtual void applyJumpInstrMod(uint8_t *loc, JumpModType type,
                                  JumpModType val) const {}
-  virtual void applyBranchToBranchOpt() const {}
 
   virtual ~TargetInfo();
 
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index dabfc961dd5b..064ed0828c31 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -62,10 +62,6 @@ ELF Improvements
   on executable sections.
   (`#128883 <https://github.com/llvm/llvm-project/pull/128883>`_)
 
-* For AArch64 and X86_64, added ``--branch-to-branch``, which rewrites branches
-  that point to another branch instruction to instead branch directly to the
-  target of the second instruction. Enabled by default at ``-O2``.
-  
 Breaking changes
 ----------------
 * Executable-only and readable-executable sections are now allowed to be placed
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index 7edc522b4f6a..cfacdb081a80 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -93,11 +93,6 @@ Bind default visibility defined STB_GLOBAL function symbols locally for
 .Fl shared.
 .It Fl -be8
 Write a Big Endian ELF File using BE8 format(AArch32 only)
-.It Fl -branch-to-branch
-Enable the branch-to-branch optimizations: a branch whose target is
-another branch instruction is rewritten to point to the latter branch
-target (AArch64 and X86_64 only). Enabled by default at
-.Fl O2 Ns .
 .It Fl -build-id Ns = Ns Ar value
 Generate a build ID note.
 .Ar value
@@ -419,7 +414,7 @@ If not specified,
 .Dv a.out
 is used as a default.
 .It Fl O Ns Ar value
-Optimize output file.
+Optimize output file size.
 .Ar value
 may be:
 .Pp
@@ -429,7 +424,7 @@ Disable string merging.
 .It Cm 1
 Enable string merging.
 .It Cm 2
-Enable string tail merging and branch-to-branch optimization.
+Enable string tail merging.
 .El
 .Pp
 .Fl O Ns Cm 1
diff --git a/lld/test/ELF/aarch64-branch-to-branch.s b/lld/test/ELF/aarch64-branch-to-branch.s
deleted file mode 100644
index 7dc485aef853..000000000000
--- a/lld/test/ELF/aarch64-branch-to-branch.s
+++ /dev/null
@@ -1,82 +0,0 @@
-# REQUIRES: aarch64
-
-## Test that the branch-to-branch optimization follows the links
-## from f1 -> f2 -> f3 and updates all references to point to f3.
-
-# RUN: llvm-mc -filetype=obj -triple=aarch64-pc-linux %s -o %t.o
-# RUN: ld.lld %t.o -o %t --branch-to-branch --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
-# RUN: ld.lld %t.o -o %t -O2 --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
-
-## Test that branch-to-branch is disabled by default.
-
-# RUN: ld.lld %t.o -o %t --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
-# RUN: ld.lld %t.o -o %t -O2 --no-branch-to-branch --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
-
-## Test that branch-to-branch is disabled for preemptible symbols.
-
-# RUN: ld.lld %t.o -o %t --branch-to-branch -shared --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
-
-.section .rodata.vtable,"a"
-.globl vtable
-vtable:
-# B2B: Contents of section .rodata:
-# RELOC: RELOCATION RECORDS FOR [.rodata]:
-# RELOC-NEXT: OFFSET
-# B2B-NEXT: [[VF:[0-9a-f]{8}]]
-# B2B-RELOC-NEXT: R_AARCH64_PLT32 f3
-# NOB2B-RELOC-NEXT: R_AARCH64_PLT32 f1
-.4byte f1@PLT - vtable
-# B2B-SAME: [[VF]]
-# B2B-RELOC-NEXT: R_AARCH64_PLT32 f3+0x4
-# NOB2B-RELOC-NEXT: R_AARCH64_PLT32 f2+0x4
-.4byte f2@PLT - vtable
-# B2B-SAME: [[VF]]
-# RELOC-NEXT: R_AARCH64_PLT32 f3+0x8
-.4byte f3@PLT - vtable
-
-.section .text._start,"ax"
-.globl _start
-# CHECK: <_start>:
-# RELOC: RELOCATION RECORDS FOR [.text]:
-# RELOC-NEXT: OFFSET
-_start:
-# B2B: bl {{.*}} <f3>
-# B2B-RELOC-NEXT: R_AARCH64_CALL26 f3
-# NOB2B: bl {{.*}} <f1{{.*}}>
-# NOB2B-RELOC-NEXT: R_AARCH64_CALL26 f1
-bl f1
-# B2B: b {{.*}} <f3>
-# B2B-RELOC-NEXT: R_AARCH64_JUMP26 f3
-# NOB2B: b {{.*}} <f2{{.*}}>
-# NOB2B-RELOC-NEXT: R_AARCH64_JUMP26 f2
-b f2
-
-.section .text.f1,"ax"
-.globl f1
-f1:
-# B2B-RELOC-NEXT: R_AARCH64_JUMP26 f3
-# NOB2B-RELOC-NEXT: R_AARCH64_JUMP26 f2
-b f2
-
-.section .text.f2,"ax"
-.globl f2
-# CHECK: <f2>:
-f2:
-# CHECK-NEXT: b {{.*}} <f3{{.*}}>
-# RELOC-NEXT: R_AARCH64_JUMP26 f3
-b f3
-
-.section .text.f3,"ax"
-.globl f3
-f3:
-ret
diff --git a/lld/test/ELF/x86-64-branch-to-branch.s b/lld/test/ELF/x86-64-branch-to-branch.s
deleted file mode 100644
index dabf5be571ec..000000000000
--- a/lld/test/ELF/x86-64-branch-to-branch.s
+++ /dev/null
@@ -1,133 +0,0 @@
-# REQUIRES: x86
-
-## Test that the branch-to-branch optimization follows the links
-## from f1 -> f2 -> f3 and updates all references to point to f3.
- 
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
-# RUN: ld.lld %t.o -o %t --branch-to-branch --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
-# RUN: ld.lld %t.o -o %t -O2 --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
-
-## Test that branch-to-branch is disabled by default.
-
-# RUN: ld.lld %t.o -o %t --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
-# RUN: ld.lld %t.o -o %t -O2 --no-branch-to-branch --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
-
-## Test that branch-to-branch is disabled for preemptible symbols.
-
-# RUN: ld.lld %t.o -o %t --branch-to-branch -shared --emit-relocs
-# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
-# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
-
-.section .rodata.vtable,"a"
-.globl vtable
-vtable:
-# B2B: Contents of section .rodata:
-# RELOC: RELOCATION RECORDS FOR [.rodata]:
-# RELOC-NEXT: OFFSET
-# B2B-NEXT: [[VF:[0-9a-f]{8}]]
-# B2B-RELOC-NEXT: R_X86_64_PLT32 f3
-# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f1
-.4byte f1@PLT - vtable
-# B2B-SAME: [[VF]]
-# B2B-RELOC-NEXT: R_X86_64_PLT32 f3+0x4
-# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2+0x4
-.4byte f2@PLT - vtable
-# B2B-SAME: [[VF]]
-# RELOC-NEXT: R_X86_64_PLT32 f3+0x8
-.4byte f3@PLT - vtable
-
-# For .rodata.f6
-# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
- 
-.section .text._start,"ax"
-.globl _start
-# CHECK: <_start>:
-# RELOC: RELOCATION RECORDS FOR [.text]:
-# RELOC-NEXT: OFFSET
-_start:
-# B2B-NEXT: jmp {{.*}} <f3>
-# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
-# NOB2B-NEXT: jmp {{.*}} <f1{{.*}}>
-# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f1-0x4
-jmp f1
-# B2B-NEXT: jmp {{.*}} <f3>
-# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
-# NOB2B-NEXT: jmp {{.*}} <f2{{.*}}>
-# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2-0x4
-jmp f2
-# This will assemble to a relocation pointing to an STT_SECTION for .text.f4
-# with an addend, which looks similar to the relative vtable cases above but
-# requires different handling of the addend so that we don't think this is
-# branching to the `jmp f3` at the start of the target section.
-# CHECK-NEXT: jmp {{.*}} <f4{{.*}}>
-# RELOC-NEXT: R_X86_64_PLT32 .text+0x2e
-jmp f4
-# B2B-NEXT: jmp 0x[[IPLT:[0-9a-f]*]]
-# RELOC-NEXT: R_X86_64_PLT32 f5-0x4
-jmp f5
-# B2B-NEXT: jmp {{.*}} <f6>
-# RELOC-NEXT: R_X86_64_PLT32 f6-0x4
-jmp f6
-# B2B-NEXT: jmp {{.*}} <f7>
-# RELOC-NEXT: R_X86_64_PLT32 f7-0x4
-jmp f7
-
-.section .text.f1,"ax"
-.globl f1
-f1:
-# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
-# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2-0x4
-jmp f2
-
-.section .text.f2,"ax"
-.globl f2
-# CHECK: <f2>:
-f2:
-# CHECK-NEXT: jmp {{.*}} <f3{{.*}}>
-# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
-jmp f3
-
-.section .text.f3,"ax"
-.globl f3
-f3:
-# Test that a self-branch doesn't trigger an infinite loop.
-# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
-jmp f3
-
-.section .text.f4,"ax"
-jmp f3
-f4:
-ret
-
-.section .text.f5,"ax"
-.type f5, @gnu_indirect_function
-.globl f5
-f5:
-# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
-jmp f3
-
-.section .rodata.f6,"a"
-.globl f6
-f6:
-# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
-jmp f3
-
-# RELOC: RELOCATION RECORDS FOR [.wtext.f7]:
-# RELOC-NEXT: OFFSET
-
-.section .wtext.f7,"awx"
-.globl f7
-f7:
-# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
-jmp f3
-
-# B2B: <.iplt>:
-# B2B-NEXT: [[IPLT]]:

From 680bce007273d19aa74fbf336e7ce076fffa339f Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Mon, 23 Jun 2025 13:28:43 +0200
Subject: [PATCH 1267/1322] [bazel] Add missing dependency for
 c7165587e49605452f96249412f123b47b78bb81

---
 utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 51731b1e8f74..90e0f1b8ca74 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -128,6 +128,7 @@ td_library(
         "//mlir:OpBaseTdFiles",
         "//mlir:PtrTdFiles",
         "//mlir:SideEffectInterfacesTdFiles",
+        "//mlir:ValueBoundsOpInterfaceTdFiles",
     ],
 )
 

From bd809ffb4b5f277a661509fbbbf9ea893a545ab0 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Mon, 23 Jun 2025 13:32:30 +0200
Subject: [PATCH 1268/1322] [bazel] Make lld:ELF glob wider so it doesn't break
 as easily

---
 utils/bazel/llvm-project-overlay/lld/BUILD.bazel | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
index 6533d783c2b3..76957a95a544 100644
--- a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
@@ -71,10 +71,8 @@ gentbl_cc_library(
 cc_library(
     name = "ELF",
     srcs = glob([
-        "ELF/*.cpp",
-        "ELF/*.h",
-        "ELF/Arch/*.cpp",
-        "ELF/Arch/*.h",
+        "ELF/**/*.cpp",
+        "ELF/**/*.h",
     ]),
     includes = ["ELF"],
     textual_hdrs = [

From 7e77aaebfbfce2861c562fe631649ec472ea233b Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Mon, 23 Jun 2025 13:37:45 +0200
Subject: [PATCH 1269/1322] [bazel] Add missing dependency for
 4af96a9d83335b3b59f3441af47c879c7a9eb183

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel     | 11 ++++++-----
 .../llvm-project-overlay/mlir/unittests/BUILD.bazel   |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 175d2d57d116..ab85731dcde4 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -330,8 +330,8 @@ cc_library(
     hdrs = glob([
         "include/mlir/IR/*.h",
     ]) + [
-        "include/mlir/Interfaces/FoldInterfaces.h",
         "include/mlir/Interfaces/CallInterfaces.h",
+        "include/mlir/Interfaces/FoldInterfaces.h",
     ],
     includes = ["include"],
     deps = [
@@ -1510,6 +1510,7 @@ cc_library(
         ":AffineAnalysis",
         ":AffineDialect",
         ":AffineTransformOpsIncGen",
+        ":AffineTransforms",
         ":AffineUtils",
         ":BytecodeOpInterface",
         ":IR",
@@ -1650,8 +1651,8 @@ td_library(
     srcs = [
         "include/mlir/Dialect/EmitC/IR/EmitC.td",
         "include/mlir/Dialect/EmitC/IR/EmitCAttributes.td",
-        "include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td",
         "include/mlir/Dialect/EmitC/IR/EmitCBase.td",
+        "include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td",
         "include/mlir/Dialect/EmitC/IR/EmitCTypes.td",
     ],
     includes = ["include"],
@@ -3693,8 +3694,8 @@ cc_library(
         ":BytecodeOpInterface",
         ":CastInterfaces",
         ":ControlFlowInterfaces",
-        ":EmitCInterfacesIncGen",
         ":EmitCAttributesIncGen",
+        ":EmitCInterfacesIncGen",
         ":EmitCOpsIncGen",
         ":FunctionInterfaces",
         ":IR",
@@ -12798,8 +12799,8 @@ cc_library(
         "lib/Dialect/Bufferization/IR/BufferViewFlowOpInterface.cpp",
         "lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp",
         "lib/Dialect/Bufferization/IR/BufferizationDialect.cpp",
-        "lib/Dialect/Bufferization/IR/BufferizationTypeInterfaces.cpp",
         "lib/Dialect/Bufferization/IR/BufferizationOps.cpp",
+        "lib/Dialect/Bufferization/IR/BufferizationTypeInterfaces.cpp",
         "lib/Dialect/Bufferization/IR/UnstructuredControlFlow.cpp",
     ],
     hdrs = [
@@ -12815,10 +12816,10 @@ cc_library(
         ":BufferDeallocationOpInterfaceIncGen",
         ":BufferViewFlowOpInterfaceIncGen",
         ":BufferizableOpInterfaceIncGen",
-        ":BufferizationTypeInterfacesIncGen",
         ":BufferizationBaseIncGen",
         ":BufferizationInterfaces",
         ":BufferizationOpsIncGen",
+        ":BufferizationTypeInterfacesIncGen",
         ":BytecodeOpInterface",
         ":ControlFlowInterfaces",
         ":CopyOpInterface",
diff --git a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
index 4ce6fcc97136..02dc408813d2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
@@ -41,6 +41,7 @@ cc_test(
         "//mlir:CallOpInterfaces",
         "//mlir:FunctionInterfaces",
         "//mlir:IR",
+        "//mlir:MemRefDialect",
         "//mlir:Parser",
         "//mlir:Support",
         "//mlir/test:TestDialect",

From 8584b216b87085a913fe39be15bfa4ab4754aeb9 Mon Sep 17 00:00:00 2001
From: Frank Schlimbach <frank.schlimbach@intel.com>
Date: Mon, 23 Jun 2025 14:01:17 +0200
Subject: [PATCH 1270/1322] Lower allreduce (#144716)

Adding lowering mesh.allreduce to mpi.allreduce.
Minor restructuring to increase code reuse.
---
 mlir/include/mlir/Conversion/Passes.td        |   2 +
 mlir/include/mlir/Dialect/MPI/IR/MPI.h        |   1 +
 mlir/include/mlir/Dialect/MPI/IR/MPI.td       |   2 +-
 mlir/include/mlir/Dialect/MPI/IR/MPIOps.td    |  12 +-
 mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h   |   5 +
 mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td  |   4 +-
 .../Dialect/Mesh/Transforms/Simplifications.h |  10 +-
 .../mlir/Dialect/Mesh/Transforms/Transforms.h |   5 +
 mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp   |  62 +--
 mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp   | 184 +++++++--
 mlir/lib/Dialect/MPI/IR/MPIOps.cpp            |  34 ++
 mlir/lib/Dialect/Mesh/IR/MeshOps.cpp          |  23 ++
 .../Dialect/Mesh/Transforms/Transforms.cpp    |  22 +-
 .../MeshToMPI/convert-mesh-to-mpi.mlir        | 357 ++++++++++--------
 14 files changed, 482 insertions(+), 241 deletions(-)

diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index b496ee011491..5a864865adff 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -905,6 +905,8 @@ def ConvertMeshToMPIPass : Pass<"convert-mesh-to-mpi"> {
     shard/partition sizes depend on the rank.
   }];
   let dependentDialects = [
+    "affine::AffineDialect",
+    "arith::ArithDialect",
     "memref::MemRefDialect",
     "mpi::MPIDialect",
     "scf::SCFDialect",
diff --git a/mlir/include/mlir/Dialect/MPI/IR/MPI.h b/mlir/include/mlir/Dialect/MPI/IR/MPI.h
index f06b911ce3fe..2b6743cd008c 100644
--- a/mlir/include/mlir/Dialect/MPI/IR/MPI.h
+++ b/mlir/include/mlir/Dialect/MPI/IR/MPI.h
@@ -12,6 +12,7 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/OpImplementation.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 
 //===----------------------------------------------------------------------===//
 // MPIDialect
diff --git a/mlir/include/mlir/Dialect/MPI/IR/MPI.td b/mlir/include/mlir/Dialect/MPI/IR/MPI.td
index f2837e71df06..0c62a1794e19 100644
--- a/mlir/include/mlir/Dialect/MPI/IR/MPI.td
+++ b/mlir/include/mlir/Dialect/MPI/IR/MPI.td
@@ -230,7 +230,7 @@ def MPI_OpMinloc : I32EnumAttrCase<"MPI_MINLOC", 11, "MPI_MINLOC">;
 def MPI_OpMaxloc : I32EnumAttrCase<"MPI_MAXLOC", 12, "MPI_MAXLOC">;
 def MPI_OpReplace : I32EnumAttrCase<"MPI_REPLACE", 13, "MPI_REPLACE">;
 
-def MPI_OpClassEnum : I32EnumAttr<"MPI_OpClassEnum", "MPI operation class", [
+def MPI_ReductionOpEnum : I32EnumAttr<"MPI_ReductionOpEnum", "MPI operation class", [
       MPI_OpNull,
       MPI_OpMax,
       MPI_OpMin,
diff --git a/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td b/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td
index d78aa92d201e..935e0f785ef0 100644
--- a/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td
+++ b/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td
@@ -11,6 +11,7 @@
 
 include "mlir/Dialect/MPI/IR/MPI.td"
 include "mlir/Dialect/MPI/IR/MPITypes.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
 
 class MPI_Op<string mnemonic, list<Trait> traits = []>
     : Op<MPI_Dialect, mnemonic, traits>;
@@ -41,7 +42,7 @@ def MPI_InitOp : MPI_Op<"init", []> {
 // CommWorldOp
 //===----------------------------------------------------------------------===//
 
-def MPI_CommWorldOp : MPI_Op<"comm_world", []> {
+def MPI_CommWorldOp : MPI_Op<"comm_world", [Pure]> {
   let summary = "Get the World communicator, equivalent to `MPI_COMM_WORLD`";
   let description = [{
     This operation returns the predefined MPI_COMM_WORLD communicator.
@@ -56,7 +57,7 @@ def MPI_CommWorldOp : MPI_Op<"comm_world", []> {
 // CommRankOp
 //===----------------------------------------------------------------------===//
 
-def MPI_CommRankOp : MPI_Op<"comm_rank", []> {
+def MPI_CommRankOp : MPI_Op<"comm_rank", [Pure]> {
   let summary = "Get the current rank, equivalent to "
                 "`MPI_Comm_rank(comm, &rank)`";
   let description = [{
@@ -72,13 +73,14 @@ def MPI_CommRankOp : MPI_Op<"comm_rank", []> {
   );
 
   let assemblyFormat = "`(` $comm `)` attr-dict `:` type(results)";
+  let hasCanonicalizer = 1;
 }
 
 //===----------------------------------------------------------------------===//
 // CommSizeOp
 //===----------------------------------------------------------------------===//
 
-def MPI_CommSizeOp : MPI_Op<"comm_size", []> {
+def MPI_CommSizeOp : MPI_Op<"comm_size", [Pure]> {
   let summary = "Get the size of the group associated to the communicator, "
                 "equivalent to `MPI_Comm_size(comm, &size)`";
   let description = [{
@@ -100,7 +102,7 @@ def MPI_CommSizeOp : MPI_Op<"comm_size", []> {
 // CommSplitOp
 //===----------------------------------------------------------------------===//
 
-def MPI_CommSplitOp : MPI_Op<"comm_split", []> {
+def MPI_CommSplitOp : MPI_Op<"comm_split", [Pure]> {
   let summary = "Partition the group associated with the given communicator into "
                 "disjoint subgroups";
   let description = [{
@@ -281,7 +283,7 @@ def MPI_AllReduceOp : MPI_Op<"allreduce", []> {
   let arguments = (
     ins AnyMemRef : $sendbuf,
     AnyMemRef : $recvbuf,
-    MPI_OpClassEnum : $op,
+    MPI_ReductionOpEnum : $op,
     MPI_Comm : $comm
   );
 
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
index 3878505f8f93..c4d512b60bc5 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
@@ -212,6 +212,11 @@ void maybeInsertSourceShardingAnnotation(MeshSharding sharding,
                                          OpOperand &operand,
                                          OpBuilder &builder);
 
+/// Converts a vector of OpFoldResults (ints) into vector of Values of the
+/// provided type.
+SmallVector<Value> getMixedAsValues(OpBuilder b, const Location &loc,
+                                    llvm::ArrayRef<int64_t> statics,
+                                    ValueRange dynamics, Type type = Type());
 } // namespace mesh
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
index f59c4c4c6751..ac05ee243d7b 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
@@ -584,11 +584,11 @@ def Mesh_AllReduceOp : Mesh_CollectiveCommunicationOpBase<"all_reduce", [
     ```
   }];
   let arguments = !con(commonArgs, (ins
-    AnyRankedTensor:$input,
+    AnyTypeOf<[AnyMemRef, AnyRankedTensor]>:$input,
     DefaultValuedAttr<Mesh_ReductionKindAttr, "::mlir::mesh::ReductionKind::Sum">:$reduction
   ));
   let results = (outs
-    AnyRankedTensor:$result
+    AnyTypeOf<[AnyMemRef, AnyRankedTensor]>:$result
   );
   let assemblyFormat = [{
     $input `on` $mesh (`mesh_axes` `=` $mesh_axes^)? (`reduction` `=` $reduction^)?
diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h
index c64da29ca641..3f1041cb2510 100644
--- a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h
@@ -62,9 +62,9 @@ void populateAllReduceEndomorphismSimplificationPatterns(
   auto isEndomorphismOp = [reduction](Operation *op,
                                       std::optional<Operation *> referenceOp) {
     auto allReduceOp = llvm::dyn_cast<AllReduceOp>(op);
-    if (!allReduceOp ||
-        allReduceOp.getInput().getType().getElementType() !=
-            allReduceOp.getResult().getType().getElementType() ||
+    auto inType = cast<ShapedType>(allReduceOp.getInput().getType());
+    auto outType = cast<ShapedType>(allReduceOp.getResult().getType());
+    if (!allReduceOp || inType.getElementType() != outType.getElementType() ||
         allReduceOp.getReduction() != reduction) {
       return false;
     }
@@ -83,9 +83,9 @@ void populateAllReduceEndomorphismSimplificationPatterns(
     }
 
     auto refAllReduceOp = llvm::dyn_cast<AllReduceOp>(referenceOp.value());
+    auto refType = cast<ShapedType>(refAllReduceOp.getResult().getType());
     return refAllReduceOp->getAttrs() == allReduceOp->getAttrs() &&
-           allReduceOp.getInput().getType().getElementType() ==
-               refAllReduceOp.getInput().getType().getElementType();
+           inType.getElementType() == refType.getElementType();
   };
   auto isAlgebraicOp = [](Operation *op) {
     return static_cast<bool>(llvm::dyn_cast<AlgebraicOp>(op));
diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Mesh/Transforms/Transforms.h
index be82e2af399d..f46c0db84608 100644
--- a/mlir/include/mlir/Dialect/Mesh/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Transforms.h
@@ -42,6 +42,11 @@ createCollectiveProcessGroupSize(MeshOp mesh, ArrayRef<MeshAxis> axes,
 TypedValue<IndexType> createProcessLinearIndex(StringRef mesh,
                                                ArrayRef<MeshAxis> meshAxes,
                                                ImplicitLocOpBuilder &builder);
+// Get process linear index from a multi-index along the given mesh axes .
+TypedValue<IndexType>
+createProcessLinearIndex(StringRef mesh, ValueRange processInGroupMultiIndex,
+                         ArrayRef<MeshAxis> meshAxes,
+                         ImplicitLocOpBuilder &builder);
 
 } // namespace mesh
 } // namespace mlir
diff --git a/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp b/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp
index 5575b295ae20..d4deff5b8807 100644
--- a/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp
+++ b/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp
@@ -116,7 +116,7 @@ public:
   /// enum value.
   virtual Value getMPIOp(const Location loc,
                          ConversionPatternRewriter &rewriter,
-                         mpi::MPI_OpClassEnum opAttr) = 0;
+                         mpi::MPI_ReductionOpEnum opAttr) = 0;
 };
 
 //===----------------------------------------------------------------------===//
@@ -199,49 +199,49 @@ public:
   }
 
   Value getMPIOp(const Location loc, ConversionPatternRewriter &rewriter,
-                 mpi::MPI_OpClassEnum opAttr) override {
+                 mpi::MPI_ReductionOpEnum opAttr) override {
     int32_t op = MPI_NO_OP;
     switch (opAttr) {
-    case mpi::MPI_OpClassEnum::MPI_OP_NULL:
+    case mpi::MPI_ReductionOpEnum::MPI_OP_NULL:
       op = MPI_NO_OP;
       break;
-    case mpi::MPI_OpClassEnum::MPI_MAX:
+    case mpi::MPI_ReductionOpEnum::MPI_MAX:
       op = MPI_MAX;
       break;
-    case mpi::MPI_OpClassEnum::MPI_MIN:
+    case mpi::MPI_ReductionOpEnum::MPI_MIN:
       op = MPI_MIN;
       break;
-    case mpi::MPI_OpClassEnum::MPI_SUM:
+    case mpi::MPI_ReductionOpEnum::MPI_SUM:
       op = MPI_SUM;
       break;
-    case mpi::MPI_OpClassEnum::MPI_PROD:
+    case mpi::MPI_ReductionOpEnum::MPI_PROD:
       op = MPI_PROD;
       break;
-    case mpi::MPI_OpClassEnum::MPI_LAND:
+    case mpi::MPI_ReductionOpEnum::MPI_LAND:
       op = MPI_LAND;
       break;
-    case mpi::MPI_OpClassEnum::MPI_BAND:
+    case mpi::MPI_ReductionOpEnum::MPI_BAND:
       op = MPI_BAND;
       break;
-    case mpi::MPI_OpClassEnum::MPI_LOR:
+    case mpi::MPI_ReductionOpEnum::MPI_LOR:
       op = MPI_LOR;
       break;
-    case mpi::MPI_OpClassEnum::MPI_BOR:
+    case mpi::MPI_ReductionOpEnum::MPI_BOR:
       op = MPI_BOR;
       break;
-    case mpi::MPI_OpClassEnum::MPI_LXOR:
+    case mpi::MPI_ReductionOpEnum::MPI_LXOR:
       op = MPI_LXOR;
       break;
-    case mpi::MPI_OpClassEnum::MPI_BXOR:
+    case mpi::MPI_ReductionOpEnum::MPI_BXOR:
       op = MPI_BXOR;
       break;
-    case mpi::MPI_OpClassEnum::MPI_MINLOC:
+    case mpi::MPI_ReductionOpEnum::MPI_MINLOC:
       op = MPI_MINLOC;
       break;
-    case mpi::MPI_OpClassEnum::MPI_MAXLOC:
+    case mpi::MPI_ReductionOpEnum::MPI_MAXLOC:
       op = MPI_MAXLOC;
       break;
-    case mpi::MPI_OpClassEnum::MPI_REPLACE:
+    case mpi::MPI_ReductionOpEnum::MPI_REPLACE:
       op = MPI_REPLACE;
       break;
     }
@@ -336,49 +336,49 @@ public:
   }
 
   Value getMPIOp(const Location loc, ConversionPatternRewriter &rewriter,
-                 mpi::MPI_OpClassEnum opAttr) override {
+                 mpi::MPI_ReductionOpEnum opAttr) override {
     StringRef op;
     switch (opAttr) {
-    case mpi::MPI_OpClassEnum::MPI_OP_NULL:
+    case mpi::MPI_ReductionOpEnum::MPI_OP_NULL:
       op = "ompi_mpi_no_op";
       break;
-    case mpi::MPI_OpClassEnum::MPI_MAX:
+    case mpi::MPI_ReductionOpEnum::MPI_MAX:
       op = "ompi_mpi_max";
       break;
-    case mpi::MPI_OpClassEnum::MPI_MIN:
+    case mpi::MPI_ReductionOpEnum::MPI_MIN:
       op = "ompi_mpi_min";
       break;
-    case mpi::MPI_OpClassEnum::MPI_SUM:
+    case mpi::MPI_ReductionOpEnum::MPI_SUM:
       op = "ompi_mpi_sum";
       break;
-    case mpi::MPI_OpClassEnum::MPI_PROD:
+    case mpi::MPI_ReductionOpEnum::MPI_PROD:
       op = "ompi_mpi_prod";
       break;
-    case mpi::MPI_OpClassEnum::MPI_LAND:
+    case mpi::MPI_ReductionOpEnum::MPI_LAND:
       op = "ompi_mpi_land";
       break;
-    case mpi::MPI_OpClassEnum::MPI_BAND:
+    case mpi::MPI_ReductionOpEnum::MPI_BAND:
       op = "ompi_mpi_band";
       break;
-    case mpi::MPI_OpClassEnum::MPI_LOR:
+    case mpi::MPI_ReductionOpEnum::MPI_LOR:
       op = "ompi_mpi_lor";
       break;
-    case mpi::MPI_OpClassEnum::MPI_BOR:
+    case mpi::MPI_ReductionOpEnum::MPI_BOR:
       op = "ompi_mpi_bor";
       break;
-    case mpi::MPI_OpClassEnum::MPI_LXOR:
+    case mpi::MPI_ReductionOpEnum::MPI_LXOR:
       op = "ompi_mpi_lxor";
       break;
-    case mpi::MPI_OpClassEnum::MPI_BXOR:
+    case mpi::MPI_ReductionOpEnum::MPI_BXOR:
       op = "ompi_mpi_bxor";
       break;
-    case mpi::MPI_OpClassEnum::MPI_MINLOC:
+    case mpi::MPI_ReductionOpEnum::MPI_MINLOC:
       op = "ompi_mpi_minloc";
       break;
-    case mpi::MPI_OpClassEnum::MPI_MAXLOC:
+    case mpi::MPI_ReductionOpEnum::MPI_MAXLOC:
       op = "ompi_mpi_maxloc";
       break;
-    case mpi::MPI_OpClassEnum::MPI_REPLACE:
+    case mpi::MPI_ReductionOpEnum::MPI_REPLACE:
       op = "ompi_mpi_replace";
       break;
     }
diff --git a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp
index 823d4d644f58..1b9efb105b69 100644
--- a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp
+++ b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp
@@ -12,9 +12,9 @@
 
 #include "mlir/Conversion/MeshToMPI/MeshToMPI.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/FuncConversions.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -22,6 +22,8 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Mesh/IR/MeshDialect.h"
 #include "mlir/Dialect/Mesh/IR/MeshOps.h"
+#include "mlir/Dialect/Mesh/Transforms/Simplifications.h"
+#include "mlir/Dialect/Mesh/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
@@ -289,27 +291,15 @@ struct ConvertProcessMultiIndexOp
 
 class ConvertProcessLinearIndexOp
     : public OpConversionPattern<ProcessLinearIndexOp> {
-  int64_t worldRank; // rank in MPI_COMM_WORLD if available, else < 0
 
 public:
   using OpConversionPattern::OpConversionPattern;
 
-  // Constructor accepting worldRank
-  ConvertProcessLinearIndexOp(const TypeConverter &typeConverter,
-                              MLIRContext *context, int64_t worldRank = -1)
-      : OpConversionPattern(typeConverter, context), worldRank(worldRank) {}
-
   LogicalResult
   matchAndRewrite(ProcessLinearIndexOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-
+    // Create mpi::CommRankOp
     Location loc = op.getLoc();
-    if (worldRank >= 0) { // if rank in MPI_COMM_WORLD is known -> use it
-      rewriter.replaceOpWithNewOp<arith::ConstantIndexOp>(op, worldRank);
-      return success();
-    }
-
-    // Otherwise call create mpi::CommRankOp
     auto ctx = op.getContext();
     Value commWorld =
         rewriter.create<mpi::CommWorldOp>(loc, mpi::CommType::get(ctx));
@@ -529,6 +519,129 @@ struct ConvertShardShapeOp : public OpConversionPattern<ShardShapeOp> {
   }
 };
 
+static mpi::MPI_ReductionOpEnumAttr getMPIReductionOp(ReductionKindAttr kind) {
+  auto ctx = kind.getContext();
+  auto getReductionOp = [ctx](mpi::MPI_ReductionOpEnum redOp) {
+    return mpi::MPI_ReductionOpEnumAttr::get(ctx, redOp);
+  };
+
+  switch (kind.getValue()) {
+  case ReductionKind::Sum:
+    return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_SUM);
+  case ReductionKind::Product:
+    return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_PROD);
+  case ReductionKind::Min:
+    return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_MIN);
+  case ReductionKind::Max:
+    return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_MAX);
+  case ReductionKind::BitwiseAnd:
+    return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_BAND);
+  case ReductionKind::BitwiseOr:
+    return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_BOR);
+  case ReductionKind::BitwiseXor:
+    return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_BXOR);
+  default:
+    assert(false && "Unknown/unsupported reduction kind");
+  }
+}
+
+struct ConvertAllReduceOp : public OpConversionPattern<AllReduceOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(AllReduceOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    SymbolTableCollection symbolTableCollection;
+    auto mesh = adaptor.getMesh();
+    mlir::mesh::MeshOp meshOp = getMesh(op, symbolTableCollection);
+    if (!meshOp)
+      return op->emitError() << "No mesh found for AllReduceOp";
+    if (ShapedType::isDynamicShape(meshOp.getShape()))
+      return op->emitError()
+             << "Dynamic mesh shape not supported in AllReduceOp";
+
+    ImplicitLocOpBuilder iBuilder(op.getLoc(), rewriter);
+    Value input = adaptor.getInput();
+    auto inputShape = cast<ShapedType>(input.getType()).getShape();
+
+    // If the source is a memref, cast it to a tensor.
+    if (isa<RankedTensorType>(input.getType())) {
+      auto memrefType = MemRefType::get(
+          inputShape, cast<ShapedType>(input.getType()).getElementType());
+      input = iBuilder.create<bufferization::ToBufferOp>(memrefType, input);
+    }
+    MemRefType inType = cast<MemRefType>(input.getType());
+
+    // Get the actual shape to allocate the buffer.
+    SmallVector<OpFoldResult> shape(inType.getRank());
+    for (auto i = 0; i < inType.getRank(); ++i) {
+      auto s = inputShape[i];
+      if (ShapedType::isDynamic(s))
+        shape[i] = iBuilder.create<memref::DimOp>(input, s).getResult();
+      else
+        shape[i] = iBuilder.getIndexAttr(s);
+    }
+
+    // Allocate buffer and copy input to buffer.
+    Value buffer = iBuilder.create<memref::AllocOp>(
+        shape, cast<ShapedType>(op.getType()).getElementType());
+    iBuilder.create<linalg::CopyOp>(input, buffer);
+
+    // Get an MPI_Comm_split for the AllReduce operation.
+    // The color is the linear index of the process in the mesh along the
+    // non-reduced axes. The key is the linear index of the process in the mesh
+    // along the reduced axes.
+    SmallVector<Type> indexResultTypes(meshOp.getShape().size(),
+                                       iBuilder.getIndexType());
+    SmallVector<Value> myMultiIndex =
+        iBuilder.create<ProcessMultiIndexOp>(indexResultTypes, mesh)
+            .getResult();
+    Value zero = iBuilder.create<arith::ConstantIndexOp>(0);
+    SmallVector<Value> multiKey(myMultiIndex.size(), zero);
+
+    auto redAxes = adaptor.getMeshAxes();
+    for (auto axis : redAxes) {
+      multiKey[axis] = myMultiIndex[axis];
+      myMultiIndex[axis] = zero;
+    }
+
+    Value color =
+        createProcessLinearIndex(mesh, myMultiIndex, redAxes, iBuilder);
+    color = iBuilder.create<arith::IndexCastOp>(iBuilder.getI32Type(), color);
+    Value key = createProcessLinearIndex(mesh, multiKey, redAxes, iBuilder);
+    key = iBuilder.create<arith::IndexCastOp>(iBuilder.getI32Type(), key);
+
+    // Finally split the communicator
+    auto commType = mpi::CommType::get(op->getContext());
+    Value commWorld = iBuilder.create<mpi::CommWorldOp>(commType);
+    auto comm =
+        iBuilder.create<mpi::CommSplitOp>(commType, commWorld, color, key)
+            .getNewcomm();
+
+    Value buffer1d = buffer;
+    // Collapse shape to 1d if needed
+    if (inType.getRank() > 1) {
+      ReassociationIndices reassociation(inType.getRank());
+      std::iota(reassociation.begin(), reassociation.end(), 0);
+      buffer1d = iBuilder.create<memref::CollapseShapeOp>(
+          buffer, ArrayRef<ReassociationIndices>(reassociation));
+    }
+
+    // Create the MPI AllReduce operation.
+    iBuilder.create<mpi::AllReduceOp>(
+        TypeRange(), buffer1d, buffer1d,
+        getMPIReductionOp(adaptor.getReductionAttr()), comm);
+
+    // If the destination is a memref, cast it to a tensor
+    if (isa<RankedTensorType>(op.getType()))
+      buffer = iBuilder.create<bufferization::ToTensorOp>(op.getType(), buffer,
+                                                          true);
+
+    rewriter.replaceOp(op, buffer);
+    return success();
+  }
+};
+
 struct ConvertUpdateHaloOp : public OpConversionPattern<UpdateHaloOp> {
   using OpConversionPattern::OpConversionPattern;
 
@@ -573,10 +686,10 @@ struct ConvertUpdateHaloOp : public OpConversionPattern<UpdateHaloOp> {
     Value array = dest;
     if (isa<RankedTensorType>(array.getType())) {
       // If the destination is a memref, we need to cast it to a tensor
-      auto tensorType = MemRefType::get(
+      auto mmemrefType = MemRefType::get(
           dstShape, cast<ShapedType>(array.getType()).getElementType());
       array =
-          rewriter.create<bufferization::ToBufferOp>(loc, tensorType, array);
+          rewriter.create<bufferization::ToBufferOp>(loc, mmemrefType, array);
     }
     auto rank = cast<ShapedType>(array.getType()).getRank();
     auto opSplitAxes = adaptor.getSplitAxes().getAxes();
@@ -753,22 +866,6 @@ struct ConvertMeshToMPIPass
 
   /// Run the dialect converter on the module.
   void runOnOperation() override {
-    uint64_t worldRank = -1;
-    // Try to get DLTI attribute for MPI:comm_world_rank
-    // If found, set worldRank to the value of the attribute.
-    {
-      auto dltiAttr =
-          dlti::query(getOperation(), {"MPI:comm_world_rank"}, false);
-      if (succeeded(dltiAttr)) {
-        if (!isa<IntegerAttr>(dltiAttr.value())) {
-          getOperation()->emitError()
-              << "Expected an integer attribute for MPI:comm_world_rank";
-          return signalPassFailure();
-        }
-        worldRank = cast<IntegerAttr>(dltiAttr.value()).getInt();
-      }
-    }
-
     auto *ctxt = &getContext();
     RewritePatternSet patterns(ctxt);
     ConversionTarget target(getContext());
@@ -816,13 +913,13 @@ struct ConvertMeshToMPIPass
 
     // No mesh dialect should left after conversion...
     target.addIllegalDialect<mesh::MeshDialect>();
-    // ...except the global MeshOp
-    target.addLegalOp<mesh::MeshOp>();
+    // ...except the global MeshOp. MeshShapeOp which will get folded later.
+    target.addLegalOp<mesh::MeshOp, mesh::MeshShapeOp>();
     // Allow all the stuff that our patterns will convert to
-    target.addLegalDialect<BuiltinDialect, mpi::MPIDialect, scf::SCFDialect,
-                           arith::ArithDialect, tensor::TensorDialect,
-                           bufferization::BufferizationDialect,
-                           linalg::LinalgDialect, memref::MemRefDialect>();
+    target.addLegalDialect<
+        BuiltinDialect, mpi::MPIDialect, scf::SCFDialect, arith::ArithDialect,
+        tensor::TensorDialect, bufferization::BufferizationDialect,
+        linalg::LinalgDialect, memref::MemRefDialect, affine::AffineDialect>();
     // Make sure the function signature, calls etc. are legal
     target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) {
       return typeConverter.isSignatureLegal(op.getFunctionType());
@@ -832,9 +929,8 @@ struct ConvertMeshToMPIPass
 
     patterns.add<ConvertUpdateHaloOp, ConvertNeighborsLinearIndicesOp,
                  ConvertProcessMultiIndexOp, ConvertGetShardingOp,
-                 ConvertShardingOp, ConvertShardShapeOp>(typeConverter, ctxt);
-    // ConvertProcessLinearIndexOp accepts an optional worldRank
-    patterns.add<ConvertProcessLinearIndexOp>(typeConverter, ctxt, worldRank);
+                 ConvertShardingOp, ConvertShardShapeOp, ConvertAllReduceOp,
+                 ConvertProcessLinearIndexOp>(typeConverter, ctxt);
 
     populateFunctionOpInterfaceTypeConversionPattern<func::FuncOp>(
         patterns, typeConverter);
@@ -842,6 +938,12 @@ struct ConvertMeshToMPIPass
     populateReturnOpTypeConversionPattern(patterns, typeConverter);
 
     (void)applyPartialConversion(getOperation(), target, std::move(patterns));
+
+    // Folding patterns cannot be mixed with conversion patterns -> extra pass.
+    patterns.clear();
+    SymbolTableCollection symbolTableCollection;
+    mlir::mesh::populateFoldingPatterns(patterns, symbolTableCollection);
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 
diff --git a/mlir/lib/Dialect/MPI/IR/MPIOps.cpp b/mlir/lib/Dialect/MPI/IR/MPIOps.cpp
index 56d8edfbcc02..7940ff60a48e 100644
--- a/mlir/lib/Dialect/MPI/IR/MPIOps.cpp
+++ b/mlir/lib/Dialect/MPI/IR/MPIOps.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/MPI/IR/MPI.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/Builders.h"
@@ -41,6 +42,34 @@ struct FoldCast final : public mlir::OpRewritePattern<OpT> {
     return mlir::success();
   }
 };
+
+struct FoldRank final : public mlir::OpRewritePattern<mlir::mpi::CommRankOp> {
+  using mlir::OpRewritePattern<mlir::mpi::CommRankOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(mlir::mpi::CommRankOp op,
+                                mlir::PatternRewriter &b) const override {
+    auto comm = op.getComm();
+    if (!comm.getDefiningOp<mlir::mpi::CommWorldOp>())
+      return mlir::failure();
+
+    // Try to get DLTI attribute for MPI:comm_world_rank
+    // If found, set worldRank to the value of the attribute.
+    auto dltiAttr = dlti::query(op, {"MPI:comm_world_rank"}, false);
+    if (failed(dltiAttr))
+      return mlir::failure();
+    if (!isa<IntegerAttr>(dltiAttr.value()))
+      return op->emitError()
+             << "Expected an integer attribute for MPI:comm_world_rank";
+    Value res = b.create<arith::ConstantIndexOp>(
+        op.getLoc(), cast<IntegerAttr>(dltiAttr.value()).getInt());
+    if (Value retVal = op.getRetval())
+      b.replaceOp(op, {retVal, res});
+    else
+      b.replaceOp(op, res);
+    return mlir::success();
+  }
+};
+
 } // namespace
 
 void mlir::mpi::SendOp::getCanonicalizationPatterns(
@@ -63,6 +92,11 @@ void mlir::mpi::IRecvOp::getCanonicalizationPatterns(
   results.add<FoldCast<mlir::mpi::IRecvOp>>(context);
 }
 
+void mlir::mpi::CommRankOp::getCanonicalizationPatterns(
+    mlir::RewritePatternSet &results, mlir::MLIRContext *context) {
+  results.add<FoldRank>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
index a2c2d1a7470c..b8cc91da722f 100644
--- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
+++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
@@ -75,6 +75,29 @@ static DimensionSize operator*(DimensionSize lhs, DimensionSize rhs) {
   return lhs.value() * rhs.value();
 }
 
+SmallVector<Value> mlir::mesh::getMixedAsValues(OpBuilder b,
+                                                const Location &loc,
+                                                llvm::ArrayRef<int64_t> statics,
+                                                ValueRange dynamics,
+                                                Type type) {
+  SmallVector<Value> values;
+  auto dyn = dynamics.begin();
+  Type i64 = b.getI64Type();
+  if (!type)
+    type = i64;
+  assert((i64 == type || b.getIndexType() == type) &&
+         "expected an i64 or an intex type");
+  for (auto s : statics) {
+    if (s == ShapedType::kDynamic) {
+      values.emplace_back(*(dyn++));
+    } else {
+      TypedAttr val = type == i64 ? b.getI64IntegerAttr(s) : b.getIndexAttr(s);
+      values.emplace_back(b.create<arith::ConstantOp>(loc, type, val));
+    }
+  }
+  return values;
+}
+
 //===----------------------------------------------------------------------===//
 // Inliner
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Mesh/Transforms/Transforms.cpp b/mlir/lib/Dialect/Mesh/Transforms/Transforms.cpp
index 447668cc0ea5..f08ef75d8a00 100644
--- a/mlir/lib/Dialect/Mesh/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Mesh/Transforms/Transforms.cpp
@@ -207,17 +207,27 @@ createCollectiveProcessGroupSize(MeshOp mesh, ArrayRef<MeshAxis> axes,
       builder.getIndexType()));
 }
 
-TypedValue<IndexType> createProcessLinearIndex(StringRef mesh,
-                                               ArrayRef<MeshAxis> meshAxes,
-                                               ImplicitLocOpBuilder &builder) {
-  ResultRange processInGroupMultiIndex =
-      builder.create<ProcessMultiIndexOp>(mesh, meshAxes).getResults();
+TypedValue<IndexType>
+createProcessLinearIndex(StringRef mesh, ValueRange processInGroupMultiIndex,
+                         ArrayRef<MeshAxis> meshAxes,
+                         ImplicitLocOpBuilder &builder) {
   Operation::result_range processGroupShape =
       builder.create<MeshShapeOp>(mesh, meshAxes).getResult();
   OpFoldResult processInGroupLinearIndex = affine::linearizeIndex(
       llvm::to_vector_of<OpFoldResult>(processInGroupMultiIndex),
       llvm::to_vector_of<OpFoldResult>(processGroupShape), builder);
-  return cast<TypedValue<IndexType>>(cast<Value>(processInGroupLinearIndex));
+  auto res = dyn_cast<Value>(processInGroupLinearIndex);
+  if (!res)
+    res = builder.create<arith::ConstantIndexOp>(
+        cast<IntegerAttr>(cast<Attribute>(processInGroupLinearIndex)).getInt());
+  return cast<TypedValue<IndexType>>(res);
 }
 
+TypedValue<IndexType> createProcessLinearIndex(StringRef mesh,
+                                               ArrayRef<MeshAxis> meshAxes,
+                                               ImplicitLocOpBuilder &builder) {
+  return createProcessLinearIndex(
+      mesh, builder.create<ProcessMultiIndexOp>(mesh, meshAxes).getResults(),
+      meshAxes, builder);
+}
 } // namespace mlir::mesh
diff --git a/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir b/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir
index d314ad3ac30f..d54d0034da5b 100644
--- a/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir
+++ b/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir
@@ -80,6 +80,63 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 24> } {
   }
 }
 
+// -----
+module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 7> } {
+  mesh.mesh @mesh0(shape = 3x4x5)
+  // CHECK-LABEL: func.func @allreduce_tensor(
+  func.func @allreduce_tensor(
+    // CHECK-SAME: [[varg0:%.*]]: tensor<3x4xf32>
+    %arg0 : tensor<3x4xf32>) -> tensor<3x4xf32> {
+    // CHECK-DAG: [[vc4_i32:%.*]] = arith.constant 4 : i32
+    // CHECK-DAG: [[vc2_i32:%.*]] = arith.constant 2 : i32
+    // CHECK: [[v0:%.*]] = bufferization.to_buffer [[varg0]] : tensor<3x4xf32> to memref<3x4xf32>
+    // CHECK: [[valloc:%.*]] = memref.alloc() : memref<3x4xf32>
+    // CHECK: linalg.copy ins([[v0]] : memref<3x4xf32>) outs([[valloc]] : memref<3x4xf32>)
+    // CHECK: [[v1:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[vnewcomm:%.*]] = mpi.comm_split([[v1]], [[vc2_i32]], [[vc4_i32]]) : !mpi.comm
+    // CHECK: [[vcollapse_shape:%.*]] = memref.collapse_shape [[valloc]] {{\[\[}}0, 1]] : memref<3x4xf32> into memref<12xf32>
+    // CHECK: mpi.allreduce([[vcollapse_shape]], [[vcollapse_shape]], MPI_MAX, [[vnewcomm]]) : memref<12xf32>, memref<12xf32>
+    // CHECK: [[v2:%.*]] = bufferization.to_tensor [[valloc]] restrict : memref<3x4xf32> to tensor<3x4xf32>
+    %0 = mesh.all_reduce %arg0 on @mesh0 mesh_axes = [0, 1] reduction = max : tensor<3x4xf32> -> tensor<3x4xf32>
+    // CHECK: return [[v2]] : tensor<3x4xf32>
+    return %0 : tensor<3x4xf32>
+  }
+
+  // CHECK-LABEL: func.func @allreduce_memref(
+  func.func @allreduce_memref(
+    // CHECK-SAME: [[varg0:%.*]]: memref<3x4xf32>
+    %arg0 : memref<3x4xf32>) -> memref<3x4xf32> {
+    // CHECK: [[vc4_i32:%.*]] = arith.constant 4 : i32
+    // CHECK: [[vc2_i32:%.*]] = arith.constant 2 : i32
+    // CHECK: [[valloc:%.*]] = memref.alloc() : memref<3x4xf32>
+    // CHECK: linalg.copy ins([[varg0]] : memref<3x4xf32>) outs([[valloc]] : memref<3x4xf32>)
+    // CHECK: [[v0:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[vnewcomm:%.*]] = mpi.comm_split([[v0]], [[vc2_i32]], [[vc4_i32]]) : !mpi.comm
+    // CHECK: [[vcollapse_shape:%.*]] = memref.collapse_shape [[valloc]] {{\[\[}}0, 1]] : memref<3x4xf32> into memref<12xf32>
+    // CHECK: mpi.allreduce([[vcollapse_shape]], [[vcollapse_shape]], MPI_MAX, [[vnewcomm]]) : memref<12xf32>, memref<12xf32>
+    %0 = mesh.all_reduce %arg0 on @mesh0 mesh_axes = [0, 1] reduction = max : memref<3x4xf32> -> memref<3x4xf32>
+    // CHECK: return [[valloc]] : memref<3x4xf32>
+    return %0 : memref<3x4xf32>
+  }
+
+  // CHECK-LABEL: func.func @allreduce_new_type(
+  func.func @allreduce_new_type(
+    // CHECK-SAME: [[varg0:%.*]]: memref<3x4xf32>
+    %arg0 : memref<3x4xf32>) -> memref<3x4xf64> {
+    // CHECK: [[vc4_i32:%.*]] = arith.constant 4 : i32
+    // CHECK: [[vc2_i32:%.*]] = arith.constant 2 : i32
+    // CHECK: [[valloc:%.*]] = memref.alloc() : memref<3x4xf64>
+    // CHECK: linalg.copy ins([[varg0]] : memref<3x4xf32>) outs([[valloc]] : memref<3x4xf64>)
+    // CHECK: [[v0:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[vnewcomm:%.*]] = mpi.comm_split([[v0]], [[vc2_i32]], [[vc4_i32]]) : !mpi.comm
+    // CHECK: [[vcollapse_shape:%.*]] = memref.collapse_shape [[valloc]] {{\[\[}}0, 1]] : memref<3x4xf64> into memref<12xf64>
+    // CHECK: mpi.allreduce([[vcollapse_shape]], [[vcollapse_shape]], MPI_MAX, [[vnewcomm]]) : memref<12xf64>, memref<12xf64>
+    %0 = mesh.all_reduce %arg0 on @mesh0 mesh_axes = [0, 1] reduction = max : memref<3x4xf32> -> memref<3x4xf64>
+    // CHECK: return [[valloc]] : memref<3x4xf64>
+    return %0 : memref<3x4xf64>
+  }
+}
+
 // -----
 mesh.mesh @mesh0(shape = 3x4x5)
 // CHECK-LABEL: func @update_halo_1d_first
@@ -91,13 +148,13 @@ func.func @update_halo_1d_first(
   // CHECK-SAME: : memref<2x120x120xi8>, i32, i32
   // CHECK: mpi.recv(
   // CHECK-SAME: : memref<2x120x120xi8>, i32, i32
-  // CHECK-NEXT: memref.subview [[arg0]][0, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8
+  // CHECK: memref.subview [[arg0]][0, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8
   // CHECK: memref.subview [[arg0]][2, 0, 0] [3, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<3x120x120xi8
   // CHECK: mpi.send(
   // CHECK-SAME: : memref<3x120x120xi8>, i32, i32
   // CHECK: mpi.recv(
   // CHECK-SAME: : memref<3x120x120xi8>, i32, i32
-  // CHECK-NEXT: memref.subview [[arg0]][117, 0, 0] [3, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<3x120x120xi8
+  // CHECK: memref.subview [[arg0]][117, 0, 0] [3, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<3x120x120xi8
   %res = mesh.update_halo %arg0 on @mesh0 split_axes = [[0]] halo_sizes = [2, 3] : memref<120x120x120xi8>
   // CHECK: return [[res:%.*]] : memref<120x120x120xi8>
   return %res : memref<120x120x120xi8>
@@ -110,18 +167,18 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 1> } {
   func.func @update_halo_1d_with_zero (
     // CHECK-SAME: [[varg0:%.*]]: memref<120x120x120xi8>
     %arg0 : memref<120x120x120xi8>) -> memref<120x120x120xi8> {
-    // CHECK: [[vc91_i32:%.*]] = arith.constant 91 : i32
-    // CHECK-NEXT: [[vc0_i32:%.*]] = arith.constant 0 : i32
-    // CHECK-NEXT: [[vc2_i32:%.*]] = arith.constant 2 : i32
-    // CHECK-NEXT: [[v0:%.*]] = mpi.comm_world : !mpi.comm
-    // CHECK-NEXT: [[valloc:%.*]] = memref.alloc() : memref<2x120x120xi8>
-    // CHECK-NEXT: [[vsubview:%.*]] = memref.subview [[varg0]][118, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1], offset: 1699200>>
-    // CHECK-NEXT: memref.copy [[vsubview]], [[valloc]] : memref<2x120x120xi8, strided<[14400, 120, 1], offset: 1699200>> to memref<2x120x120xi8>
-    // CHECK-NEXT: mpi.send([[valloc]], [[vc91_i32]], [[vc2_i32]], [[v0]]) : memref<2x120x120xi8>, i32, i32
-    // CHECK-NEXT: mpi.recv([[valloc]], [[vc91_i32]], [[vc0_i32]], [[v0]]) : memref<2x120x120xi8>, i32, i32
-    // CHECK-NEXT: [[vsubview_0:%.*]] = memref.subview [[varg0]][0, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1]>>
-    // CHECK-NEXT: memref.copy [[valloc]], [[vsubview_0]] : memref<2x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1]>>
-    // CHECK-NEXT: memref.dealloc [[valloc]] : memref<2x120x120xi8>
+    // CHECK-DAG: [[vc91_i32:%.*]] = arith.constant 91 : i32
+    // CHECK-DAG: [[vc0_i32:%.*]] = arith.constant 0 : i32
+    // CHECK-DAG: [[vc2_i32:%.*]] = arith.constant 2 : i32
+    // CHECK: [[v0:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[valloc:%.*]] = memref.alloc() : memref<2x120x120xi8>
+    // CHECK: [[vsubview:%.*]] = memref.subview [[varg0]][118, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1], offset: 1699200>>
+    // CHECK: memref.copy [[vsubview]], [[valloc]] : memref<2x120x120xi8, strided<[14400, 120, 1], offset: 1699200>> to memref<2x120x120xi8>
+    // CHECK: mpi.send([[valloc]], [[vc91_i32]], [[vc2_i32]], [[v0]]) : memref<2x120x120xi8>, i32, i32
+    // CHECK: mpi.recv([[valloc]], [[vc91_i32]], [[vc0_i32]], [[v0]]) : memref<2x120x120xi8>, i32, i32
+    // CHECK: [[vsubview_0:%.*]] = memref.subview [[varg0]][0, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1]>>
+    // CHECK: memref.copy [[valloc]], [[vsubview_0]] : memref<2x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1]>>
+    // CHECK: memref.dealloc [[valloc]] : memref<2x120x120xi8>
     %res = mesh.update_halo %arg0 on @mesh0 split_axes = [[0]] halo_sizes = [2, 0] : memref<120x120x120xi8>
     // CHECK: return [[varg0]] : memref<120x120x120xi8>
     return %res : memref<120x120x120xi8>
@@ -135,50 +192,50 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 24> } {
   func.func @update_halo_3d(
     // CHECK-SAME: [[varg0:%.*]]: memref<120x120x120xi8>
     %arg0 : memref<120x120x120xi8>) -> memref<120x120x120xi8> {
-    // CHECK-NEXT: [[vc23_i32:%.*]] = arith.constant 23 : i32
-    // CHECK-NEXT: [[vc29_i32:%.*]] = arith.constant 29 : i32
-    // CHECK-NEXT: [[vc91_i32:%.*]] = arith.constant 91 : i32
-    // CHECK-NEXT: [[vc4_i32:%.*]] = arith.constant 4 : i32
-    // CHECK-NEXT: [[vc44_i32:%.*]] = arith.constant 44 : i32
-    // CHECK-NEXT: [[v0:%.*]] = mpi.comm_world : !mpi.comm
-    // CHECK-NEXT: [[valloc:%.*]] = memref.alloc() : memref<117x113x5xi8>
-    // CHECK-NEXT: [[vsubview:%.*]] = memref.subview [[varg0]][1, 3, 109] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>>
-    // CHECK-NEXT: memref.copy [[vsubview]], [[valloc]] : memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>> to memref<117x113x5xi8>
-    // CHECK-NEXT: mpi.send([[valloc]], [[vc91_i32]], [[vc44_i32]], [[v0]]) : memref<117x113x5xi8>, i32, i32
-    // CHECK-NEXT: mpi.recv([[valloc]], [[vc91_i32]], [[vc4_i32]], [[v0]]) : memref<117x113x5xi8>, i32, i32
-    // CHECK-NEXT: [[vsubview_0:%.*]] = memref.subview [[varg0]][1, 3, 0] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>>
-    // CHECK-NEXT: memref.copy [[valloc]], [[vsubview_0]] : memref<117x113x5xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>>
-    // CHECK-NEXT: memref.dealloc [[valloc]] : memref<117x113x5xi8>
-    // CHECK-NEXT: [[valloc_1:%.*]] = memref.alloc() : memref<117x113x6xi8>
-    // CHECK-NEXT: [[vsubview_2:%.*]] = memref.subview [[varg0]][1, 3, 5] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>>
-    // CHECK-NEXT: memref.copy [[vsubview_2]], [[valloc_1]] : memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>> to memref<117x113x6xi8>
-    // CHECK-NEXT: mpi.send([[valloc_1]], [[vc91_i32]], [[vc4_i32]], [[v0]]) : memref<117x113x6xi8>, i32, i32
-    // CHECK-NEXT: mpi.recv([[valloc_1]], [[vc91_i32]], [[vc44_i32]], [[v0]]) : memref<117x113x6xi8>, i32, i32
-    // CHECK-NEXT: [[vsubview_3:%.*]] = memref.subview [[varg0]][1, 3, 114] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>>
-    // CHECK-NEXT: memref.copy [[valloc_1]], [[vsubview_3]] : memref<117x113x6xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>>
-    // CHECK-NEXT: memref.dealloc [[valloc_1]] : memref<117x113x6xi8>
-    // CHECK-NEXT: [[v1:%.*]] = mpi.comm_world : !mpi.comm
-    // CHECK-NEXT: [[valloc_4:%.*]] = memref.alloc() : memref<117x3x120xi8>
-    // CHECK-NEXT: [[vsubview_5:%.*]] = memref.subview [[varg0]][1, 113, 0] [117, 3, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>>
-    // CHECK-NEXT: memref.copy [[vsubview_5]], [[valloc_4]] : memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>> to memref<117x3x120xi8>
-    // CHECK-NEXT: mpi.send([[valloc_4]], [[vc91_i32]], [[vc29_i32]], [[v1]]) : memref<117x3x120xi8>, i32, i32
-    // CHECK-NEXT: memref.dealloc [[valloc_4]] : memref<117x3x120xi8>
-    // CHECK-NEXT: [[valloc_6:%.*]] = memref.alloc() : memref<117x4x120xi8>
-    // CHECK-NEXT: mpi.recv([[valloc_6]], [[vc91_i32]], [[vc29_i32]], [[v1]]) : memref<117x4x120xi8>, i32, i32
-    // CHECK-NEXT: [[vsubview_7:%.*]] = memref.subview [[varg0]][1, 116, 0] [117, 4, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>>
-    // CHECK-NEXT: memref.copy [[valloc_6]], [[vsubview_7]] : memref<117x4x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>>
-    // CHECK-NEXT: memref.dealloc [[valloc_6]] : memref<117x4x120xi8>
-    // CHECK-NEXT: [[v2:%.*]] = mpi.comm_world : !mpi.comm
-    // CHECK-NEXT: [[valloc_8:%.*]] = memref.alloc() : memref<1x120x120xi8>
-    // CHECK-NEXT: mpi.recv([[valloc_8]], [[vc91_i32]], [[vc23_i32]], [[v2]]) : memref<1x120x120xi8>, i32, i32
-    // CHECK-NEXT: [[vsubview_9:%.*]] = memref.subview [[varg0]][0, 0, 0] [1, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>>
-    // CHECK-NEXT: memref.copy [[valloc_8]], [[vsubview_9]] : memref<1x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>>
-    // CHECK-NEXT: memref.dealloc [[valloc_8]] : memref<1x120x120xi8>
-    // CHECK-NEXT: [[valloc_10:%.*]] = memref.alloc() : memref<2x120x120xi8>
-    // CHECK-NEXT: [[vsubview_11:%.*]] = memref.subview [[varg0]][1, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>>
-    // CHECK-NEXT: memref.copy [[vsubview_11]], [[valloc_10]] : memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>> to memref<2x120x120xi8>
-    // CHECK-NEXT: mpi.send([[valloc_10]], [[vc91_i32]], [[vc23_i32]], [[v2]]) : memref<2x120x120xi8>, i32, i32
-    // CHECK-NEXT: memref.dealloc [[valloc_10]] : memref<2x120x120xi8>
+    // CHECK-DAG: [[vc23_i32:%.*]] = arith.constant 23 : i32
+    // CHECK-DAG: [[vc29_i32:%.*]] = arith.constant 29 : i32
+    // CHECK-DAG: [[vc91_i32:%.*]] = arith.constant 91 : i32
+    // CHECK-DAG: [[vc4_i32:%.*]] = arith.constant 4 : i32
+    // CHECK-DAG: [[vc44_i32:%.*]] = arith.constant 44 : i32
+    // CHECK: [[v0:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[valloc:%.*]] = memref.alloc() : memref<117x113x5xi8>
+    // CHECK: [[vsubview:%.*]] = memref.subview [[varg0]][1, 3, 109] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>>
+    // CHECK: memref.copy [[vsubview]], [[valloc]] : memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>> to memref<117x113x5xi8>
+    // CHECK: mpi.send([[valloc]], [[vc91_i32]], [[vc44_i32]], [[v0]]) : memref<117x113x5xi8>, i32, i32
+    // CHECK: mpi.recv([[valloc]], [[vc91_i32]], [[vc4_i32]], [[v0]]) : memref<117x113x5xi8>, i32, i32
+    // CHECK: [[vsubview_0:%.*]] = memref.subview [[varg0]][1, 3, 0] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>>
+    // CHECK: memref.copy [[valloc]], [[vsubview_0]] : memref<117x113x5xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>>
+    // CHECK: memref.dealloc [[valloc]] : memref<117x113x5xi8>
+    // CHECK: [[valloc_1:%.*]] = memref.alloc() : memref<117x113x6xi8>
+    // CHECK: [[vsubview_2:%.*]] = memref.subview [[varg0]][1, 3, 5] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>>
+    // CHECK: memref.copy [[vsubview_2]], [[valloc_1]] : memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>> to memref<117x113x6xi8>
+    // CHECK: mpi.send([[valloc_1]], [[vc91_i32]], [[vc4_i32]], [[v0]]) : memref<117x113x6xi8>, i32, i32
+    // CHECK: mpi.recv([[valloc_1]], [[vc91_i32]], [[vc44_i32]], [[v0]]) : memref<117x113x6xi8>, i32, i32
+    // CHECK: [[vsubview_3:%.*]] = memref.subview [[varg0]][1, 3, 114] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>>
+    // CHECK: memref.copy [[valloc_1]], [[vsubview_3]] : memref<117x113x6xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>>
+    // CHECK: memref.dealloc [[valloc_1]] : memref<117x113x6xi8>
+    // CHECK: [[v1:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[valloc_4:%.*]] = memref.alloc() : memref<117x3x120xi8>
+    // CHECK: [[vsubview_5:%.*]] = memref.subview [[varg0]][1, 113, 0] [117, 3, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>>
+    // CHECK: memref.copy [[vsubview_5]], [[valloc_4]] : memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>> to memref<117x3x120xi8>
+    // CHECK: mpi.send([[valloc_4]], [[vc91_i32]], [[vc29_i32]], [[v1]]) : memref<117x3x120xi8>, i32, i32
+    // CHECK: memref.dealloc [[valloc_4]] : memref<117x3x120xi8>
+    // CHECK: [[valloc_6:%.*]] = memref.alloc() : memref<117x4x120xi8>
+    // CHECK: mpi.recv([[valloc_6]], [[vc91_i32]], [[vc29_i32]], [[v1]]) : memref<117x4x120xi8>, i32, i32
+    // CHECK: [[vsubview_7:%.*]] = memref.subview [[varg0]][1, 116, 0] [117, 4, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>>
+    // CHECK: memref.copy [[valloc_6]], [[vsubview_7]] : memref<117x4x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>>
+    // CHECK: memref.dealloc [[valloc_6]] : memref<117x4x120xi8>
+    // CHECK: [[v2:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[valloc_8:%.*]] = memref.alloc() : memref<1x120x120xi8>
+    // CHECK: mpi.recv([[valloc_8]], [[vc91_i32]], [[vc23_i32]], [[v2]]) : memref<1x120x120xi8>, i32, i32
+    // CHECK: [[vsubview_9:%.*]] = memref.subview [[varg0]][0, 0, 0] [1, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>>
+    // CHECK: memref.copy [[valloc_8]], [[vsubview_9]] : memref<1x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>>
+    // CHECK: memref.dealloc [[valloc_8]] : memref<1x120x120xi8>
+    // CHECK: [[valloc_10:%.*]] = memref.alloc() : memref<2x120x120xi8>
+    // CHECK: [[vsubview_11:%.*]] = memref.subview [[varg0]][1, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>>
+    // CHECK: memref.copy [[vsubview_11]], [[valloc_10]] : memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>> to memref<2x120x120xi8>
+    // CHECK: mpi.send([[valloc_10]], [[vc91_i32]], [[vc23_i32]], [[v2]]) : memref<2x120x120xi8>, i32, i32
+    // CHECK: memref.dealloc [[valloc_10]] : memref<2x120x120xi8>
     %res = mesh.update_halo %arg0 on @mesh0 split_axes = [[2], [1], [0]] halo_sizes = [1, 2, 3, 4, 5, 6] : memref<120x120x120xi8>
     // CHECK: return [[varg0]] : memref<120x120x120xi8>
     return %res : memref<120x120x120xi8>
@@ -188,54 +245,54 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 24> } {
   func.func @update_halo_3d_tensor(
     // CHECK-SAME: [[varg0:%.*]]: tensor<120x120x120xi8>
     %arg0 : tensor<120x120x120xi8>) -> tensor<120x120x120xi8> {
-    // CHECK-NEXT: [[vc23_i32:%.*]] = arith.constant 23 : i32
-    // CHECK-NEXT: [[vc29_i32:%.*]] = arith.constant 29 : i32
-    // CHECK-NEXT: [[vc44_i32:%.*]] = arith.constant 44 : i32
-    // CHECK-NEXT: [[vc4_i32:%.*]] = arith.constant 4 : i32
-    // CHECK-NEXT: [[vc91_i32:%.*]] = arith.constant 91 : i32
-    // CHECK-NEXT: [[v0:%.*]] = bufferization.to_buffer [[varg0]] : tensor<120x120x120xi8> to memref<120x120x120xi8>
-    // CHECK-NEXT: [[v1:%.*]] = mpi.comm_world : !mpi.comm
-    // CHECK-NEXT: [[valloc:%.*]] = memref.alloc() : memref<117x113x5xi8>
-    // CHECK-NEXT: [[vsubview:%.*]] = memref.subview [[v0]][1, 3, 109] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>>
-    // CHECK-NEXT: memref.copy [[vsubview]], [[valloc]] : memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>> to memref<117x113x5xi8>
-    // CHECK-NEXT: mpi.send([[valloc]], [[vc91_i32]], [[vc44_i32]], [[v1]]) : memref<117x113x5xi8>, i32, i32
-    // CHECK-NEXT: mpi.recv([[valloc]], [[vc91_i32]], [[vc4_i32]], [[v1]]) : memref<117x113x5xi8>, i32, i32
-    // CHECK-NEXT: [[vsubview_0:%.*]] = memref.subview [[v0]][1, 3, 0] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>>
-    // CHECK-NEXT: memref.copy [[valloc]], [[vsubview_0]] : memref<117x113x5xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>>
-    // CHECK-NEXT: memref.dealloc [[valloc]] : memref<117x113x5xi8>
-    // CHECK-NEXT: [[valloc_1:%.*]] = memref.alloc() : memref<117x113x6xi8>
-    // CHECK-NEXT: [[vsubview_2:%.*]] = memref.subview [[v0]][1, 3, 5] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>>
-    // CHECK-NEXT: memref.copy [[vsubview_2]], [[valloc_1]] : memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>> to memref<117x113x6xi8>
-    // CHECK-NEXT: mpi.send([[valloc_1]], [[vc91_i32]], [[vc4_i32]], [[v1]]) : memref<117x113x6xi8>, i32, i32
-    // CHECK-NEXT: mpi.recv([[valloc_1]], [[vc91_i32]], [[vc44_i32]], [[v1]]) : memref<117x113x6xi8>, i32, i32
-    // CHECK-NEXT: [[vsubview_3:%.*]] = memref.subview [[v0]][1, 3, 114] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>>
-    // CHECK-NEXT: memref.copy [[valloc_1]], [[vsubview_3]] : memref<117x113x6xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>>
-    // CHECK-NEXT: memref.dealloc [[valloc_1]] : memref<117x113x6xi8>
-    // CHECK-NEXT: [[v2:%.*]] = mpi.comm_world : !mpi.comm
-    // CHECK-NEXT: [[valloc_4:%.*]] = memref.alloc() : memref<117x3x120xi8>
-    // CHECK-NEXT: [[vsubview_5:%.*]] = memref.subview [[v0]][1, 113, 0] [117, 3, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>>
-    // CHECK-NEXT: memref.copy [[vsubview_5]], [[valloc_4]] : memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>> to memref<117x3x120xi8>
-    // CHECK-NEXT: mpi.send([[valloc_4]], [[vc91_i32]], [[vc29_i32]], [[v2]]) : memref<117x3x120xi8>, i32, i32
-    // CHECK-NEXT: memref.dealloc [[valloc_4]] : memref<117x3x120xi8>
-    // CHECK-NEXT: [[valloc_6:%.*]] = memref.alloc() : memref<117x4x120xi8>
-    // CHECK-NEXT: mpi.recv([[valloc_6]], [[vc91_i32]], [[vc29_i32]], [[v2]]) : memref<117x4x120xi8>, i32, i32
-    // CHECK-NEXT: [[vsubview_7:%.*]] = memref.subview [[v0]][1, 116, 0] [117, 4, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>>
-    // CHECK-NEXT: memref.copy [[valloc_6]], [[vsubview_7]] : memref<117x4x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>>
-    // CHECK-NEXT: memref.dealloc [[valloc_6]] : memref<117x4x120xi8>
-    // CHECK-NEXT: [[v3:%.*]] = mpi.comm_world : !mpi.comm
-    // CHECK-NEXT: [[valloc_8:%.*]] = memref.alloc() : memref<1x120x120xi8>
-    // CHECK-NEXT: mpi.recv([[valloc_8]], [[vc91_i32]], [[vc23_i32]], [[v3]]) : memref<1x120x120xi8>, i32, i32
-    // CHECK-NEXT: [[vsubview_9:%.*]] = memref.subview [[v0]][0, 0, 0] [1, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>>
-    // CHECK-NEXT: memref.copy [[valloc_8]], [[vsubview_9]] : memref<1x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>>
-    // CHECK-NEXT: memref.dealloc [[valloc_8]] : memref<1x120x120xi8>
-    // CHECK-NEXT: [[valloc_10:%.*]] = memref.alloc() : memref<2x120x120xi8>
-    // CHECK-NEXT: [[vsubview_11:%.*]] = memref.subview [[v0]][1, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>>
-    // CHECK-NEXT: memref.copy [[vsubview_11]], [[valloc_10]] : memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>> to memref<2x120x120xi8>
-    // CHECK-NEXT: mpi.send([[valloc_10]], [[vc91_i32]], [[vc23_i32]], [[v3]]) : memref<2x120x120xi8>, i32, i32
-    // CHECK-NEXT: memref.dealloc [[valloc_10]] : memref<2x120x120xi8>
-    // CHECK-NEXT: [[v4:%.*]] = bufferization.to_tensor [[v0]] restrict writable : memref<120x120x120xi8> to tensor<120x120x120xi8>
+    // CHECK-DAG: [[vc23_i32:%.*]] = arith.constant 23 : i32
+    // CHECK-DAG: [[vc29_i32:%.*]] = arith.constant 29 : i32
+    // CHECK-DAG: [[vc44_i32:%.*]] = arith.constant 44 : i32
+    // CHECK-DAG: [[vc4_i32:%.*]] = arith.constant 4 : i32
+    // CHECK-DAG: [[vc91_i32:%.*]] = arith.constant 91 : i32
+    // CHECK: [[v0:%.*]] = bufferization.to_buffer [[varg0]] : tensor<120x120x120xi8> to memref<120x120x120xi8>
+    // CHECK: [[v1:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[valloc:%.*]] = memref.alloc() : memref<117x113x5xi8>
+    // CHECK: [[vsubview:%.*]] = memref.subview [[v0]][1, 3, 109] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>>
+    // CHECK: memref.copy [[vsubview]], [[valloc]] : memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>> to memref<117x113x5xi8>
+    // CHECK: mpi.send([[valloc]], [[vc91_i32]], [[vc44_i32]], [[v1]]) : memref<117x113x5xi8>, i32, i32
+    // CHECK: mpi.recv([[valloc]], [[vc91_i32]], [[vc4_i32]], [[v1]]) : memref<117x113x5xi8>, i32, i32
+    // CHECK: [[vsubview_0:%.*]] = memref.subview [[v0]][1, 3, 0] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>>
+    // CHECK: memref.copy [[valloc]], [[vsubview_0]] : memref<117x113x5xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>>
+    // CHECK: memref.dealloc [[valloc]] : memref<117x113x5xi8>
+    // CHECK: [[valloc_1:%.*]] = memref.alloc() : memref<117x113x6xi8>
+    // CHECK: [[vsubview_2:%.*]] = memref.subview [[v0]][1, 3, 5] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>>
+    // CHECK: memref.copy [[vsubview_2]], [[valloc_1]] : memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>> to memref<117x113x6xi8>
+    // CHECK: mpi.send([[valloc_1]], [[vc91_i32]], [[vc4_i32]], [[v1]]) : memref<117x113x6xi8>, i32, i32
+    // CHECK: mpi.recv([[valloc_1]], [[vc91_i32]], [[vc44_i32]], [[v1]]) : memref<117x113x6xi8>, i32, i32
+    // CHECK: [[vsubview_3:%.*]] = memref.subview [[v0]][1, 3, 114] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>>
+    // CHECK: memref.copy [[valloc_1]], [[vsubview_3]] : memref<117x113x6xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>>
+    // CHECK: memref.dealloc [[valloc_1]] : memref<117x113x6xi8>
+    // CHECK: [[v2:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[valloc_4:%.*]] = memref.alloc() : memref<117x3x120xi8>
+    // CHECK: [[vsubview_5:%.*]] = memref.subview [[v0]][1, 113, 0] [117, 3, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>>
+    // CHECK: memref.copy [[vsubview_5]], [[valloc_4]] : memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>> to memref<117x3x120xi8>
+    // CHECK: mpi.send([[valloc_4]], [[vc91_i32]], [[vc29_i32]], [[v2]]) : memref<117x3x120xi8>, i32, i32
+    // CHECK: memref.dealloc [[valloc_4]] : memref<117x3x120xi8>
+    // CHECK: [[valloc_6:%.*]] = memref.alloc() : memref<117x4x120xi8>
+    // CHECK: mpi.recv([[valloc_6]], [[vc91_i32]], [[vc29_i32]], [[v2]]) : memref<117x4x120xi8>, i32, i32
+    // CHECK: [[vsubview_7:%.*]] = memref.subview [[v0]][1, 116, 0] [117, 4, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>>
+    // CHECK: memref.copy [[valloc_6]], [[vsubview_7]] : memref<117x4x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>>
+    // CHECK: memref.dealloc [[valloc_6]] : memref<117x4x120xi8>
+    // CHECK: [[v3:%.*]] = mpi.comm_world : !mpi.comm
+    // CHECK: [[valloc_8:%.*]] = memref.alloc() : memref<1x120x120xi8>
+    // CHECK: mpi.recv([[valloc_8]], [[vc91_i32]], [[vc23_i32]], [[v3]]) : memref<1x120x120xi8>, i32, i32
+    // CHECK: [[vsubview_9:%.*]] = memref.subview [[v0]][0, 0, 0] [1, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>>
+    // CHECK: memref.copy [[valloc_8]], [[vsubview_9]] : memref<1x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>>
+    // CHECK: memref.dealloc [[valloc_8]] : memref<1x120x120xi8>
+    // CHECK: [[valloc_10:%.*]] = memref.alloc() : memref<2x120x120xi8>
+    // CHECK: [[vsubview_11:%.*]] = memref.subview [[v0]][1, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>>
+    // CHECK: memref.copy [[vsubview_11]], [[valloc_10]] : memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>> to memref<2x120x120xi8>
+    // CHECK: mpi.send([[valloc_10]], [[vc91_i32]], [[vc23_i32]], [[v3]]) : memref<2x120x120xi8>, i32, i32
+    // CHECK: memref.dealloc [[valloc_10]] : memref<2x120x120xi8>
+    // CHECK: [[v4:%.*]] = bufferization.to_tensor [[v0]] restrict writable : memref<120x120x120xi8> to tensor<120x120x120xi8>
     %res = mesh.update_halo %arg0 on @mesh0 split_axes = [[2], [1], [0]] halo_sizes = [1, 2, 3, 4, 5, 6] : tensor<120x120x120xi8>
-    // CHECK-NEXT: return [[v4]] : tensor<120x120x120xi8>
+    // CHECK: return [[v4]] : tensor<120x120x120xi8>
     return %res : tensor<120x120x120xi8>
   }
 }
@@ -246,19 +303,19 @@ mesh.mesh @mesh0(shape = 2x2x4)
 // CHECK-SAME: [[varg0:%.*]]: tensor<2x4xf32>) -> (tensor<2x4xf32>, tensor<?x?xi16>, tensor<?x?xi64>, tensor<?x?xi64>) {
 func.func @return_sharding(%arg0: tensor<2x4xf32>) -> (tensor<2x4xf32>, !mesh.sharding) {
   %sharding = mesh.sharding @mesh0 split_axes = [[0, 1], [2]] : !mesh.sharding
-  // CHECK-NEXT: [[vcst:%.*]] = arith.constant dense<2> : tensor<1xi16>
-  // CHECK-NEXT: [[vcst_0:%.*]] = arith.constant dense<[0, 1]> : tensor<2xi16>
-  // CHECK-NEXT: [[vcm1_i16:%.*]] = arith.constant -1 : i16
-  // CHECK-NEXT: [[v0:%.*]] = tensor.empty() : tensor<2x2xi16>
-  // CHECK-NEXT: [[v1:%.*]] = linalg.fill ins([[vcm1_i16]] : i16) outs([[v0]] : tensor<2x2xi16>) -> tensor<2x2xi16>
-  // CHECK-NEXT: [[vinserted_slice:%.*]] = tensor.insert_slice [[vcst_0]] into [[v1]][0, 0] [1, 2] [1, 1] : tensor<2xi16> into tensor<2x2xi16>
-  // CHECK-NEXT: [[vinserted_slice_1:%.*]] = tensor.insert_slice [[vcst]] into [[vinserted_slice]][1, 0] [1, 1] [1, 1] : tensor<1xi16> into tensor<2x2xi16>
-  // CHECK-NEXT: [[v2:%.*]] = tensor.empty() : tensor<0x0xi64>
-  // CHECK-NEXT: [[v3:%.*]] = tensor.empty() : tensor<0x0xi64>
-  // CHECK-NEXT: [[vcast:%.*]] = tensor.cast [[vinserted_slice_1]] : tensor<2x2xi16> to tensor<?x?xi16>
-  // CHECK-NEXT: [[vcast_2:%.*]] = tensor.cast [[v2]] : tensor<0x0xi64> to tensor<?x?xi64>
-  // CHECK-NEXT: [[vcast_3:%.*]] = tensor.cast [[v3]] : tensor<0x0xi64> to tensor<?x?xi64>
-  // CHECK-NEXT: return [[varg0]], [[vcast]], [[vcast_2]], [[vcast_3]] : tensor<2x4xf32>, tensor<?x?xi16>, tensor<?x?xi64>, tensor<?x?xi64>
+  // CHECK: [[vcst:%.*]] = arith.constant dense<2> : tensor<1xi16>
+  // CHECK: [[vcst_0:%.*]] = arith.constant dense<[0, 1]> : tensor<2xi16>
+  // CHECK: [[vcm1_i16:%.*]] = arith.constant -1 : i16
+  // CHECK: [[v0:%.*]] = tensor.empty() : tensor<2x2xi16>
+  // CHECK: [[v1:%.*]] = linalg.fill ins([[vcm1_i16]] : i16) outs([[v0]] : tensor<2x2xi16>) -> tensor<2x2xi16>
+  // CHECK: [[vinserted_slice:%.*]] = tensor.insert_slice [[vcst_0]] into [[v1]][0, 0] [1, 2] [1, 1] : tensor<2xi16> into tensor<2x2xi16>
+  // CHECK: [[vinserted_slice_1:%.*]] = tensor.insert_slice [[vcst]] into [[vinserted_slice]][1, 0] [1, 1] [1, 1] : tensor<1xi16> into tensor<2x2xi16>
+  // CHECK: [[v2:%.*]] = tensor.empty() : tensor<0x0xi64>
+  // CHECK: [[v3:%.*]] = tensor.empty() : tensor<0x0xi64>
+  // CHECK: [[vcast:%.*]] = tensor.cast [[vinserted_slice_1]] : tensor<2x2xi16> to tensor<?x?xi16>
+  // CHECK: [[vcast_2:%.*]] = tensor.cast [[v2]] : tensor<0x0xi64> to tensor<?x?xi64>
+  // CHECK: [[vcast_3:%.*]] = tensor.cast [[v3]] : tensor<0x0xi64> to tensor<?x?xi64>
+  // CHECK: return [[varg0]], [[vcast]], [[vcast_2]], [[vcast_3]] : tensor<2x4xf32>, tensor<?x?xi16>, tensor<?x?xi64>, tensor<?x?xi64>
   return %arg0, %sharding : tensor<2x4xf32>, !mesh.sharding
 }
 
@@ -266,19 +323,19 @@ func.func @return_sharding(%arg0: tensor<2x4xf32>) -> (tensor<2x4xf32>, !mesh.sh
 // CHECK-SAME: [[varg0:%.*]]: tensor<6x8xf32>) -> (tensor<6x8xf32>, tensor<?x?xi16>, tensor<?x?xi64>, tensor<?x?xi64>) {
 func.func @return_sharding_halos(%arg0: tensor<6x8xf32>) -> (tensor<6x8xf32>, !mesh.sharding) {
   %sharding = mesh.sharding @mesh0 split_axes = [[0, 1], [2]] halo_sizes = [0, 4, 3, 1] : !mesh.sharding
-  // CHECK-NEXT: [[vcst:%.*]] = arith.constant dense<{{\[\[}}0, 4], [3, 1]]> : tensor<2x2xi64>
-  // CHECK-NEXT: [[vcst_0:%.*]] = arith.constant dense<2> : tensor<1xi16>
-  // CHECK-NEXT: [[vcst_1:%.*]] = arith.constant dense<[0, 1]> : tensor<2xi16>
-  // CHECK-NEXT: [[vcm1_i16:%.*]] = arith.constant -1 : i16
-  // CHECK-NEXT: [[v0:%.*]] = tensor.empty() : tensor<2x2xi16>
-  // CHECK-NEXT: [[v1:%.*]] = linalg.fill ins([[vcm1_i16]] : i16) outs([[v0]] : tensor<2x2xi16>) -> tensor<2x2xi16>
-  // CHECK-NEXT: [[vinserted_slice:%.*]] = tensor.insert_slice [[vcst_1]] into [[v1]][0, 0] [1, 2] [1, 1] : tensor<2xi16> into tensor<2x2xi16>
-  // CHECK-NEXT: [[vinserted_slice_2:%.*]] = tensor.insert_slice [[vcst_0]] into [[vinserted_slice]][1, 0] [1, 1] [1, 1] : tensor<1xi16> into tensor<2x2xi16>
-  // CHECK-NEXT: [[v2:%.*]] = tensor.empty() : tensor<0x0xi64>
-  // CHECK-NEXT: [[vcast:%.*]] = tensor.cast [[vinserted_slice_2]] : tensor<2x2xi16> to tensor<?x?xi16>
-  // CHECK-NEXT: [[vcast_3:%.*]] = tensor.cast [[vcst]] : tensor<2x2xi64> to tensor<?x?xi64>
-  // CHECK-NEXT: [[vcast_4:%.*]] = tensor.cast [[v2]] : tensor<0x0xi64> to tensor<?x?xi64>
-  // CHECK-NEXT: return [[varg0]], [[vcast]], [[vcast_3]], [[vcast_4]] : tensor<6x8xf32>, tensor<?x?xi16>, tensor<?x?xi64>, tensor<?x?xi64>
+  // CHECK: [[vcst:%.*]] = arith.constant dense<{{\[\[}}0, 4], [3, 1]]> : tensor<2x2xi64>
+  // CHECK: [[vcst_0:%.*]] = arith.constant dense<2> : tensor<1xi16>
+  // CHECK: [[vcst_1:%.*]] = arith.constant dense<[0, 1]> : tensor<2xi16>
+  // CHECK: [[vcm1_i16:%.*]] = arith.constant -1 : i16
+  // CHECK: [[v0:%.*]] = tensor.empty() : tensor<2x2xi16>
+  // CHECK: [[v1:%.*]] = linalg.fill ins([[vcm1_i16]] : i16) outs([[v0]] : tensor<2x2xi16>) -> tensor<2x2xi16>
+  // CHECK: [[vinserted_slice:%.*]] = tensor.insert_slice [[vcst_1]] into [[v1]][0, 0] [1, 2] [1, 1] : tensor<2xi16> into tensor<2x2xi16>
+  // CHECK: [[vinserted_slice_2:%.*]] = tensor.insert_slice [[vcst_0]] into [[vinserted_slice]][1, 0] [1, 1] [1, 1] : tensor<1xi16> into tensor<2x2xi16>
+  // CHECK: [[v2:%.*]] = tensor.empty() : tensor<0x0xi64>
+  // CHECK: [[vcast:%.*]] = tensor.cast [[vinserted_slice_2]] : tensor<2x2xi16> to tensor<?x?xi16>
+  // CHECK: [[vcast_3:%.*]] = tensor.cast [[vcst]] : tensor<2x2xi64> to tensor<?x?xi64>
+  // CHECK: [[vcast_4:%.*]] = tensor.cast [[v2]] : tensor<0x0xi64> to tensor<?x?xi64>
+  // CHECK: return [[varg0]], [[vcast]], [[vcast_3]], [[vcast_4]] : tensor<6x8xf32>, tensor<?x?xi16>, tensor<?x?xi64>, tensor<?x?xi64>
   return %arg0, %sharding : tensor<6x8xf32>, !mesh.sharding
 }
 
@@ -286,24 +343,24 @@ func.func @return_sharding_halos(%arg0: tensor<6x8xf32>) -> (tensor<6x8xf32>, !m
 // CHECK-SAME: [[varg0:%.*]]: tensor<?x?xf32>) -> (tensor<?x?xf32>, tensor<?x?xi16>, tensor<?x?xi64>, tensor<?x?xi64>) {
 func.func @return_sharding_offs(%arg0: tensor<?x?xf32>) -> (tensor<?x?xf32>, !mesh.sharding) {
   %sharding = mesh.sharding @mesh0 split_axes = [[0, 1], [2]] sharded_dims_offsets = [0, 3, 5, 7, 8, 0, 0, 5, 10, 16] : !mesh.sharding
-  // CHECK-NEXT: [[vcst:%.*]] = arith.constant dense<[0, 0, 5, 10, 16]> : tensor<5xi64>
-  // CHECK-NEXT: [[vcst_0:%.*]] = arith.constant dense<[0, 3, 5, 7, 8]> : tensor<5xi64>
-  // CHECK-NEXT: [[vcm9223372036854775808_i64:%.*]] = arith.constant -9223372036854775808 : i64
-  // CHECK-NEXT: [[vcst_1:%.*]] = arith.constant dense<2> : tensor<1xi16>
-  // CHECK-NEXT: [[vcst_2:%.*]] = arith.constant dense<[0, 1]> : tensor<2xi16>
-  // CHECK-NEXT: [[vcm1_i16:%.*]] = arith.constant -1 : i16
-  // CHECK-NEXT: [[v0:%.*]] = tensor.empty() : tensor<2x2xi16>
-  // CHECK-NEXT: [[v1:%.*]] = linalg.fill ins([[vcm1_i16]] : i16) outs([[v0]] : tensor<2x2xi16>) -> tensor<2x2xi16>
-  // CHECK-NEXT: [[vinserted_slice:%.*]] = tensor.insert_slice [[vcst_2]] into [[v1]][0, 0] [1, 2] [1, 1] : tensor<2xi16> into tensor<2x2xi16>
-  // CHECK-NEXT: [[vinserted_slice_3:%.*]] = tensor.insert_slice [[vcst_1]] into [[vinserted_slice]][1, 0] [1, 1] [1, 1] : tensor<1xi16> into tensor<2x2xi16>
-  // CHECK-NEXT: [[v2:%.*]] = tensor.empty() : tensor<0x0xi64>
-  // CHECK-NEXT: [[v3:%.*]] = tensor.empty() : tensor<2x5xi64>
-  // CHECK-NEXT: [[v4:%.*]] = linalg.fill ins([[vcm9223372036854775808_i64]] : i64) outs([[v3]] : tensor<2x5xi64>) -> tensor<2x5xi64>
-  // CHECK-NEXT: [[vinserted_slice_4:%.*]] = tensor.insert_slice [[vcst_0]] into [[v4]][0, 0] [1, 5] [1, 1] : tensor<5xi64> into tensor<2x5xi64>
-  // CHECK-NEXT: [[vinserted_slice_5:%.*]] = tensor.insert_slice [[vcst]] into [[vinserted_slice_4]][1, 0] [1, 5] [1, 1] : tensor<5xi64> into tensor<2x5xi64>
-  // CHECK-NEXT: [[vcast:%.*]] = tensor.cast [[vinserted_slice_3]] : tensor<2x2xi16> to tensor<?x?xi16>
-  // CHECK-NEXT: [[vcast_6:%.*]] = tensor.cast [[v2]] : tensor<0x0xi64> to tensor<?x?xi64>
-  // CHECK-NEXT: [[vcast_7:%.*]] = tensor.cast [[vinserted_slice_5]] : tensor<2x5xi64> to tensor<?x?xi64>
-  // CHECK-NEXT: return [[varg0]], [[vcast]], [[vcast_6]], [[vcast_7]] : tensor<?x?xf32>, tensor<?x?xi16>, tensor<?x?xi64>, tensor<?x?xi64>
+  // CHECK: [[vcst:%.*]] = arith.constant dense<[0, 0, 5, 10, 16]> : tensor<5xi64>
+  // CHECK: [[vcst_0:%.*]] = arith.constant dense<[0, 3, 5, 7, 8]> : tensor<5xi64>
+  // CHECK: [[vcm9223372036854775808_i64:%.*]] = arith.constant -9223372036854775808 : i64
+  // CHECK: [[vcst_1:%.*]] = arith.constant dense<2> : tensor<1xi16>
+  // CHECK: [[vcst_2:%.*]] = arith.constant dense<[0, 1]> : tensor<2xi16>
+  // CHECK: [[vcm1_i16:%.*]] = arith.constant -1 : i16
+  // CHECK: [[v0:%.*]] = tensor.empty() : tensor<2x2xi16>
+  // CHECK: [[v1:%.*]] = linalg.fill ins([[vcm1_i16]] : i16) outs([[v0]] : tensor<2x2xi16>) -> tensor<2x2xi16>
+  // CHECK: [[vinserted_slice:%.*]] = tensor.insert_slice [[vcst_2]] into [[v1]][0, 0] [1, 2] [1, 1] : tensor<2xi16> into tensor<2x2xi16>
+  // CHECK: [[vinserted_slice_3:%.*]] = tensor.insert_slice [[vcst_1]] into [[vinserted_slice]][1, 0] [1, 1] [1, 1] : tensor<1xi16> into tensor<2x2xi16>
+  // CHECK: [[v2:%.*]] = tensor.empty() : tensor<0x0xi64>
+  // CHECK: [[v3:%.*]] = tensor.empty() : tensor<2x5xi64>
+  // CHECK: [[v4:%.*]] = linalg.fill ins([[vcm9223372036854775808_i64]] : i64) outs([[v3]] : tensor<2x5xi64>) -> tensor<2x5xi64>
+  // CHECK: [[vinserted_slice_4:%.*]] = tensor.insert_slice [[vcst_0]] into [[v4]][0, 0] [1, 5] [1, 1] : tensor<5xi64> into tensor<2x5xi64>
+  // CHECK: [[vinserted_slice_5:%.*]] = tensor.insert_slice [[vcst]] into [[vinserted_slice_4]][1, 0] [1, 5] [1, 1] : tensor<5xi64> into tensor<2x5xi64>
+  // CHECK: [[vcast:%.*]] = tensor.cast [[vinserted_slice_3]] : tensor<2x2xi16> to tensor<?x?xi16>
+  // CHECK: [[vcast_6:%.*]] = tensor.cast [[v2]] : tensor<0x0xi64> to tensor<?x?xi64>
+  // CHECK: [[vcast_7:%.*]] = tensor.cast [[vinserted_slice_5]] : tensor<2x5xi64> to tensor<?x?xi64>
+  // CHECK: return [[varg0]], [[vcast]], [[vcast_6]], [[vcast_7]] : tensor<?x?xf32>, tensor<?x?xi16>, tensor<?x?xi64>, tensor<?x?xi64>
   return %arg0, %sharding : tensor<?x?xf32>, !mesh.sharding
 }

From c5b256a0e48037fc3ab12102b14511e9ac1e3242 Mon Sep 17 00:00:00 2001
From: Frank Schlimbach <frank.schlimbach@intel.com>
Date: Mon, 23 Jun 2025 14:26:14 +0200
Subject: [PATCH 1271/1322] [MLIR][MPI] adding MLIRDLTIDialect when linking
 MLIRMPIDialect (#145316)

Fixing buildbot errors on some platforms like
```
undefined reference to `mlir::dlti::query(mlir::Operation*, llvm::ArrayRef<llvm::StringRef>, bool)'
```
Introduced in #144716
---
 mlir/lib/Dialect/MPI/IR/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Dialect/MPI/IR/CMakeLists.txt b/mlir/lib/Dialect/MPI/IR/CMakeLists.txt
index edd7d911d024..86c53495f01a 100644
--- a/mlir/lib/Dialect/MPI/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/MPI/IR/CMakeLists.txt
@@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRMPIDialect
 
   LINK_LIBS PUBLIC
   MLIRDialect
+  MLIRDLTIDialect
   MLIRIR
   MLIRMemRefDialect
   MLIRInferTypeOpInterface

From 9881a507a2a0d5e717e97206ef66d34318672be7 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Mon, 23 Jun 2025 14:19:06 +0200
Subject: [PATCH 1272/1322] [bazel] Port
 8584b216b87085a913fe39be15bfa4ab4754aeb9

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index ab85731dcde4..5543f1304894 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3185,6 +3185,7 @@ cc_library(
     hdrs = glob(["include/mlir/Conversion/MeshToMPI/*.h"]),
     includes = ["include"],
     deps = [
+        ":AffineDialect",
         ":ArithDialect",
         ":BufferizationDialect",
         ":ConversionPassIncGen",
@@ -3197,6 +3198,7 @@ cc_library(
         ":MPIDialect",
         ":MemRefDialect",
         ":MeshDialect",
+        ":MeshTransforms",
         ":Pass",
         ":SCFDialect",
         ":Support",
@@ -12503,6 +12505,7 @@ td_library(
     deps = [
         ":AttrTdFiles",
         ":OpBaseTdFiles",
+        ":SideEffectInterfacesTdFiles",
     ],
 )
 
@@ -12561,6 +12564,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":BytecodeOpInterface",
+        ":DLTIDialect",
         ":IR",
         ":MPIAttrsIncGen",
         ":MPIIncGen",

From 23ba0fdbfc2a240ca5ddcc7f71c7383437c00aaa Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Mon, 23 Jun 2025 14:43:11 +0200
Subject: [PATCH 1273/1322] [clang][bytecode] Fix assignInteger() with
 allocated primtypes (#145302)

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      | 23 ++++++++++++++-----
 clang/test/AST/ByteCode/builtin-functions.cpp | 14 +++++++++++
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5304bd77f2c0..ea96e21ea944 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -96,10 +96,21 @@ static void pushInteger(InterpState &S, T Val, QualType QT) {
                 QT);
 }
 
-static void assignInteger(const Pointer &Dest, PrimType ValueT,
+static void assignInteger(InterpState &S, const Pointer &Dest, PrimType ValueT,
                           const APSInt &Value) {
-  INT_TYPE_SWITCH_NO_BOOL(
-      ValueT, { Dest.deref<T>() = T::from(static_cast<T>(Value)); });
+
+  if (ValueT == PT_IntAPS) {
+    Dest.deref<IntegralAP<true>>() =
+        S.allocAP<IntegralAP<true>>(Value.getBitWidth());
+    Dest.deref<IntegralAP<true>>().copy(Value);
+  } else if (ValueT == PT_IntAP) {
+    Dest.deref<IntegralAP<false>>() =
+        S.allocAP<IntegralAP<false>>(Value.getBitWidth());
+    Dest.deref<IntegralAP<false>>().copy(Value);
+  } else {
+    INT_TYPE_SWITCH_NO_BOOL(
+        ValueT, { Dest.deref<T>() = T::from(static_cast<T>(Value)); });
+  }
 }
 
 static QualType getElemType(const Pointer &P) {
@@ -849,7 +860,7 @@ static bool interp__builtin_overflowop(InterpState &S, CodePtr OpPC,
   }
 
   // Write Result to ResultPtr and put Overflow on the stack.
-  assignInteger(ResultPtr, ResultT, Result);
+  assignInteger(S, ResultPtr, ResultT, Result);
   ResultPtr.initialize();
   assert(Call->getDirectCallee()->getReturnType()->isBooleanType());
   S.Stk.push<Boolean>(Overflow);
@@ -902,7 +913,7 @@ static bool interp__builtin_carryop(InterpState &S, CodePtr OpPC,
 
   QualType CarryOutType = Call->getArg(3)->getType()->getPointeeType();
   PrimType CarryOutT = *S.getContext().classify(CarryOutType);
-  assignInteger(CarryOutPtr, CarryOutT, CarryOut);
+  assignInteger(S, CarryOutPtr, CarryOutT, CarryOut);
   CarryOutPtr.initialize();
 
   assert(Call->getType() == Call->getArg(0)->getType());
@@ -1414,7 +1425,7 @@ static bool interp__builtin_ia32_addcarry_subborrow(InterpState &S,
 
   QualType CarryOutType = Call->getArg(3)->getType()->getPointeeType();
   PrimType CarryOutT = *S.getContext().classify(CarryOutType);
-  assignInteger(CarryOutPtr, CarryOutT, APSInt(Result, true));
+  assignInteger(S, CarryOutPtr, CarryOutT, APSInt(Result, true));
 
   pushInteger(S, CarryOut, Call->getType());
 
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 174c1ffa79a4..3b95a8ea4859 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1739,4 +1739,18 @@ namespace WithinLifetime {
                                            // both-warning {{expression result unused}}
   }
 }
+
+#ifdef __SIZEOF_INT128__
+namespace I128Mul {
+  constexpr int mul() {
+    __int128 A = 10;
+    __int128 B = 10;
+    __int128 R;
+    __builtin_mul_overflow(A, B, &R);
+    return 1;
+  }
+  static_assert(mul() == 1);
+}
+#endif
+
 #endif

From 9e704a0aa1588f4a5204fb308c213819400a83cc Mon Sep 17 00:00:00 2001
From: Vitalii Shutov <vitalii.shutov@arm.com>
Date: Mon, 23 Jun 2025 13:44:20 +0100
Subject: [PATCH 1274/1322] [MLIR][MemRef] Add `alloca` support for
 `erase_dead_alloc_and_stores` (#142131)

Previously, `erase_dead_alloc_and_stores` didn't support
`memref.alloca`. This patch introduces support for it.

---------

Signed-off-by: Vitalii Shutov <vitalii.shutov@arm.com>
---
 .../MemRef/TransformOps/MemRefTransformOps.td |  2 +-
 mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp |  8 ++++---
 mlir/test/Dialect/MemRef/transform-ops.mlir   | 24 +++++++++++++++++++
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td b/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td
index 2d060f3c2da6..f4694a30a8a1 100644
--- a/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td
@@ -245,7 +245,7 @@ def MemRefEraseDeadAllocAndStoresOp
     ]> {
   let description = [{
     This applies memory optimization on memref. In particular it does store to
-    load forwarding, dead store elimination and dead alloc elimination.
+    load forwarding, dead store elimination and dead alloc/alloca elimination.
 
     #### Return modes
 
diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
index 3f9fb071e0ba..8735b10255ae 100644
--- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
+++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
@@ -156,13 +156,15 @@ static bool resultIsNotRead(Operation *op, std::vector<Operation *> &uses) {
 
 void eraseDeadAllocAndStores(RewriterBase &rewriter, Operation *parentOp) {
   std::vector<Operation *> opToErase;
-  parentOp->walk([&](memref::AllocOp op) {
+  parentOp->walk([&](Operation *op) {
     std::vector<Operation *> candidates;
-    if (resultIsNotRead(op, candidates)) {
+    if (isa<memref::AllocOp, memref::AllocaOp>(op) &&
+        resultIsNotRead(op, candidates)) {
       llvm::append_range(opToErase, candidates);
-      opToErase.push_back(op.getOperation());
+      opToErase.push_back(op);
     }
   });
+
   for (Operation *op : opToErase)
     rewriter.eraseOp(op);
 }
diff --git a/mlir/test/Dialect/MemRef/transform-ops.mlir b/mlir/test/Dialect/MemRef/transform-ops.mlir
index acab37e482cf..3b37c62fcb28 100644
--- a/mlir/test/Dialect/MemRef/transform-ops.mlir
+++ b/mlir/test/Dialect/MemRef/transform-ops.mlir
@@ -327,6 +327,30 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
+// -----
+
+// CHECK-LABEL: func.func @dead_alloca
+func.func @dead_alloca() {
+  // CHECK-NOT: %{{.+}} = memref.alloca
+  %0 = memref.alloca() : memref<8x64xf32, 3>
+  %1 = memref.subview %0[0, 0] [8, 4] [1, 1] : memref<8x64xf32, 3> to
+    memref<8x4xf32, affine_map<(d0, d1) -> (d0 * 64 + d1)>, 3>
+  %c0 = arith.constant 0 : index
+  %cst_0 = arith.constant dense<0.000000e+00> : vector<1x4xf32>
+  vector.transfer_write %cst_0, %1[%c0, %c0] {in_bounds = [true, true]} :
+    vector<1x4xf32>, memref<8x4xf32, affine_map<(d0, d1) -> (d0 * 64 + d1)>, 3>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.memref.erase_dead_alloc_and_stores %0 : (!transform.any_op) -> ()
+    transform.yield
+  }
+}
+
+
 // -----
 
 // CHECK-LABEL: @store_to_load

From 9a6a87da6e618d25c23f5f8cf6e4e0f49d4f702c Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 23 Jun 2025 14:54:49 +0200
Subject: [PATCH 1275/1322] [AutoUpgrade] Remove unnecessary name check (NFCI)

If only the name is incorrect (due to added overload), but the
signature is correct, we should go through the generic remangling
upgrade.
---
 llvm/lib/IR/AutoUpgrade.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 6e7254ec3e31..e429e2d65c05 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1443,13 +1443,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
     break;
   }
   case 'o':
-    // We only need to change the name to match the mangling including the
-    // address space.
     if (Name.starts_with("objectsize.")) {
       Type *Tys[2] = { F->getReturnType(), F->arg_begin()->getType() };
-      if (F->arg_size() == 2 || F->arg_size() == 3 ||
-          F->getName() !=
-              Intrinsic::getName(Intrinsic::objectsize, Tys, F->getParent())) {
+      if (F->arg_size() == 2 || F->arg_size() == 3) {
         rename(F);
         NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(),
                                                   Intrinsic::objectsize, Tys);

From fb209929e1b1abbbebae4f3fdbd080c8e54023a1 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Mon, 23 Jun 2025 08:09:50 -0500
Subject: [PATCH 1276/1322] [flang][OpenMP] Set isNewBlock directly on OpenMP
 constructs (#144593)

When the PFT builder decides that an evaluation needs a new block it
checks if the evaluation has nested evaluations. In such case it sets
the flag on the first nested evaluation. This works under the assuption
that such an evaluation only serves as a container, and does not, by
itself, generate any code.

This fails for OpenMP constructs that contain nested evaluations because
the top-level evaluation does generate code that wraps the code from the
nested evaluations. In such cases, the code for the top-level evaluation
may be emitted in a wrong place.

When setting the `isNewBlock` flag, recognize OpenMP directives, and
treat them accordingly.

This fixes https://github.com/llvm/llvm-project/issues/139071
---
 flang/include/flang/Lower/PFTBuilder.h        | 10 ++++
 flang/lib/Lower/PFTBuilder.cpp                |  4 +-
 .../Lower/OpenMP/multiple-entry-points.f90    | 46 +++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Lower/OpenMP/multiple-entry-points.f90

diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h
index 42d6546b7755..ac87fcd7c3b9 100644
--- a/flang/include/flang/Lower/PFTBuilder.h
+++ b/flang/include/flang/Lower/PFTBuilder.h
@@ -184,6 +184,11 @@ static constexpr bool isExecutableDirective{common::HasMember<
     A, std::tuple<parser::CompilerDirective, parser::OpenACCConstruct,
                   parser::OpenMPConstruct, parser::CUFKernelDoConstruct>>};
 
+template <typename A>
+static constexpr bool isOpenMPDirective{
+    common::HasMember<A, std::tuple<parser::OpenMPConstruct,
+                                    parser::OpenMPDeclarativeConstruct>>};
+
 template <typename A>
 static constexpr bool isFunctionLike{common::HasMember<
     A, std::tuple<parser::MainProgram, parser::FunctionSubprogram,
@@ -267,6 +272,11 @@ struct Evaluation : EvaluationVariant {
       return pft::isExecutableDirective<std::decay_t<decltype(r)>>;
     }});
   }
+  constexpr bool isOpenMPDirective() const {
+    return visit(common::visitors{[](auto &r) {
+      return pft::isOpenMPDirective<std::decay_t<decltype(r)>>;
+    }});
+  }
 
   /// Return the predicate:  "This is a non-initial, non-terminal construct
   /// statement."  For an IfConstruct, this is ElseIfStmt and ElseStmt.
diff --git a/flang/lib/Lower/PFTBuilder.cpp b/flang/lib/Lower/PFTBuilder.cpp
index 2cc458cb6130..68023610c3c5 100644
--- a/flang/lib/Lower/PFTBuilder.cpp
+++ b/flang/lib/Lower/PFTBuilder.cpp
@@ -1096,7 +1096,9 @@ private:
 
     // The first executable statement in the subprogram is preceded by a
     // branch to the entry point, so it starts a new block.
-    if (initialEval->hasNestedEvaluations())
+    // OpenMP directives can generate code around the nested evaluations.
+    if (initialEval->hasNestedEvaluations() &&
+        !initialEval->isOpenMPDirective())
       initialEval = &initialEval->getFirstNestedEvaluation();
     else if (initialEval->isA<Fortran::parser::EntryStmt>())
       initialEval = initialEval->lexicalSuccessor;
diff --git a/flang/test/Lower/OpenMP/multiple-entry-points.f90 b/flang/test/Lower/OpenMP/multiple-entry-points.f90
new file mode 100644
index 000000000000..2b8caa79eaa1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/multiple-entry-points.f90
@@ -0,0 +1,46 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! Check the first entry point
+!CHECK: func.func @_QPprocess_a
+!CHECK: omp.parallel
+!CHECK: omp.wsloop
+!CHECK: %[[V0:[0-9]+]] = fir.load %{{[0-9]+}} : !fir.ref<f32>
+!CHECK: %[[V1:[a-z_0-9]+]] = arith.constant 2.000000e+00 : f32
+!CHECK:   = arith.mulf %[[V0]], %[[V1]] fastmath<contract> : f32
+!CHECK: omp.terminator
+!CHECK-NOT: omp
+!CHECK: return
+
+! Check the second entry point
+!CHECK: func.func @_QPprocess_b
+!CHECK: omp.parallel
+!CHECK: fir.do_loop
+!CHECK: %[[V3:[0-9]+]] = fir.load %[[V2:[0-9]+]]#0 : !fir.ref<i32>
+!CHECK: %[[V4:[0-9]+]] = fir.load %[[V2]]#0 : !fir.ref<i32>
+!CHECK:   = arith.muli %[[V3]], %[[V4]] : i32
+!CHECK: omp.terminator
+!CHECK-NOT: omp
+!CHECK: return
+
+subroutine process_a(n, a)
+  integer, intent(in) :: n
+  real, intent(inout) :: a(n)
+  integer :: i
+
+  !$omp parallel do
+  do i = 1, n
+    a(i) = a(i) * 2.0
+  end do
+  !$omp end parallel do
+
+  return
+
+  entry process_b(n, b)
+    
+  !$omp parallel
+  do i = 1, n
+    a(i) = i * i
+  end do
+  !$omp end parallel
+
+end subroutine process_a

From c7d9b6ed5d6d438ec3bcb0cad5f4d63f8e3e5eed Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Mon, 23 Jun 2025 14:12:56 +0100
Subject: [PATCH 1277/1322] [MLIR] Fix incorrect slice contiguity inference in
 `vector::isContiguousSlice` (#142422)

Previously, slices were sometimes marked as non-contiguous when they
were actually contiguous. This occurred when the vector type had leading
unit dimensions, e.g., `vector<1x1x...x1xd0xd1x...xdn-1xT>`. In such
cases, only the trailing `n` dimensions of the memref need to be
contiguous, not the entire vector rank.

This affects how `FlattenContiguousRowMajorTransfer{Read,Write}Pattern`
flattens `transfer_read` and `transfer_write` ops.

The patterns used to collapse a number of dimensions equal to the vector
rank which missed some opportunities when the leading unit dimensions of
the vector span non-contiguous dimensions of the memref.
Now that the contiguity of the slice is determined correctly, there is a
choice how many dimensions of the
memref to collapse, ranging from
a) the number of vector dimensions after ignoring the leading unit
dimensions, up to
  b) the maximum number of contiguous memref dimensions

This patch makes a choice to do minimal memref collapsing. The rationale
behind this decision is that
this way the least amount of information is discarded.

(It follows that in some cases where the patterns used to trigger and
collapse some memref dimensions, after this patch the patterns may
collapse less dimensions).
---
 .../mlir/Dialect/Vector/Utils/VectorUtils.h   |  58 ++--
 .../Transforms/VectorTransferOpTransforms.cpp |  13 +-
 mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp |  25 +-
 .../Vector/vector-transfer-flatten.mlir       | 275 ++++++++++++++----
 4 files changed, 260 insertions(+), 111 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index 6609b28d77b6..cc8421b23a07 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -47,37 +47,41 @@ Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source, int64_t dim);
 /// on a 2D slice. Otherwise, returns a failure.
 FailureOr<std::pair<int, int>> isTranspose2DSlice(vector::TransposeOp op);
 
-/// Return true if `vectorType` is a contiguous slice of `memrefType`.
+/// Return true if `vectorType` is a contiguous slice of `memrefType`,
+/// in the sense that it can be read/written from/to a contiguous area
+/// of the memref.
 ///
-/// Only the N = vectorType.getRank() trailing dims of `memrefType` are
-/// checked (the other dims are not relevant). Note that for `vectorType` to be
-/// a contiguous slice of `memrefType`, the trailing dims of the latter have
-/// to be contiguous - this is checked by looking at the corresponding strides.
+/// The leading unit dimensions of the vector type are ignored as they
+/// are not relevant to the result. Let N be the number of the vector
+/// dimensions after ignoring a leading sequence of unit ones.
 ///
-/// There might be some restriction on the leading dim of `VectorType`:
+/// For `vectorType` to be a contiguous slice of `memrefType`
+///   a) the N trailing dimensions of `memrefType` must be contiguous, and
+///   b) the N-1 trailing dimensions of `vectorType` and `memrefType`
+///      must match.
 ///
-/// Case 1. If all the trailing dims of `vectorType` match the trailing dims
-///         of `memrefType` then the leading dim of `vectorType` can be
-///         arbitrary.
+/// Examples:
 ///
-///        Ex. 1.1 contiguous slice, perfect match
-///          vector<4x3x2xi32> from memref<5x4x3x2xi32>
-///        Ex. 1.2 contiguous slice, the leading dim does not match (2 != 4)
-///          vector<2x3x2xi32> from memref<5x4x3x2xi32>
-///
-/// Case 2. If an "internal" dim of `vectorType` does not match the
-///         corresponding trailing dim in `memrefType` then the remaining
-///         leading dims of `vectorType` have to be 1 (the first non-matching
-///         dim can be arbitrary).
-///
-///        Ex. 2.1 non-contiguous slice, 2 != 3 and the leading dim != <1>
-///          vector<2x2x2xi32> from memref<5x4x3x2xi32>
-///        Ex. 2.2  contiguous slice, 2 != 3 and the leading dim == <1>
-///          vector<1x2x2xi32> from memref<5x4x3x2xi32>
-///        Ex. 2.3. contiguous slice, 2 != 3 and the leading dims == <1x1>
-///          vector<1x1x2x2xi32> from memref<5x4x3x2xi32>
-///        Ex. 2.4. non-contiguous slice, 2 != 3 and the leading dims != <1x1>
-///         vector<2x1x2x2xi32> from memref<5x4x3x2xi32>)
+///   Ex.1 contiguous slice, perfect match
+///     vector<4x3x2xi32> from memref<5x4x3x2xi32>
+///   Ex.2 contiguous slice, the leading dim does not match (2 != 4)
+///     vector<2x3x2xi32> from memref<5x4x3x2xi32>
+///   Ex.3 non-contiguous slice, 2 != 3
+///     vector<2x2x2xi32> from memref<5x4x3x2xi32>
+///   Ex.4 contiguous slice, leading unit dimension of the vector ignored,
+///        2 != 3 (allowed)
+///     vector<1x2x2xi32> from memref<5x4x3x2xi32>
+///   Ex.5. contiguous slice, leading two unit dims of the vector ignored,
+///         2 != 3 (allowed)
+///     vector<1x1x2x2xi32> from memref<5x4x3x2xi32>
+///   Ex.6. non-contiguous slice, 2 != 3, no leading sequence of unit dims
+///     vector<2x1x2x2xi32> from memref<5x4x3x2xi32>)
+///   Ex.7 contiguous slice, memref needs to be contiguous only in the last
+///        dimension
+///     vector<1x1x2xi32> from memref<2x2x2xi32, strided<[8, 4, 1]>>
+///   Ex.8 non-contiguous slice, memref needs to be contiguous in the last
+///        two dimensions, and it isn't
+///     vector<1x2x2xi32> from memref<2x2x2xi32, strided<[8, 4, 1]>>
 bool isContiguousSlice(MemRefType memrefType, VectorType vectorType);
 
 /// Returns an iterator for all positions in the leading dimensions of `vType`
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
index 384717aeca66..785a8aaf3f0a 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
@@ -581,7 +581,6 @@ static SmallVector<Value> getCollapsedIndices(RewriterBase &rewriter,
 }
 
 namespace {
-
 /// Rewrites contiguous row-major vector.transfer_read ops by inserting
 /// memref.collapse_shape on the source so that the resulting
 /// vector.transfer_read has a 1D source. Requires the source shape to be
@@ -630,7 +629,11 @@ public:
     if (transferReadOp.getMask())
       return failure();
 
-    int64_t firstDimToCollapse = sourceType.getRank() - vectorType.getRank();
+    // Determine the first memref dimension to collapse - just enough so we can
+    // read a flattened vector.
+    int64_t firstDimToCollapse =
+        sourceType.getRank() -
+        vectorType.getShape().drop_while([](auto v) { return v == 1; }).size();
 
     // 1. Collapse the source memref
     Value collapsedSource =
@@ -722,7 +725,11 @@ public:
     if (transferWriteOp.getMask())
       return failure();
 
-    int64_t firstDimToCollapse = sourceType.getRank() - vectorType.getRank();
+    // Determine the first memref dimension to collapse - just enough so we can
+    // read a flattened vector.
+    int64_t firstDimToCollapse =
+        sourceType.getRank() -
+        vectorType.getShape().drop_while([](auto v) { return v == 1; }).size();
 
     // 1. Collapse the source memref
     Value collapsedSource =
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index 590d244daef4..0c7cfc13f87d 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -258,29 +258,20 @@ bool vector::isContiguousSlice(MemRefType memrefType, VectorType vectorType) {
   if (vectorType.isScalable())
     return false;
 
-  ArrayRef<int64_t> vectorShape = vectorType.getShape();
-  auto vecRank = vectorType.getRank();
+  // Ignore a leading sequence of adjacent unit dimensions in the vector.
+  ArrayRef<int64_t> vectorShape =
+      vectorType.getShape().drop_while([](auto v) { return v == 1; });
+  auto vecRank = vectorShape.size();
 
   if (!memrefType.areTrailingDimsContiguous(vecRank))
     return false;
 
-  // Extract the trailing dims and strides of the input memref
+  // Extract the trailing dims of the input memref
   auto memrefShape = memrefType.getShape().take_back(vecRank);
 
-  // Compare the dims of `vectorType` against `memrefType` (in reverse).
-  // In the most basic case, all dims will match.
-  auto firstNonMatchingDim =
-      std::mismatch(vectorShape.rbegin(), vectorShape.rend(),
-                    memrefShape.rbegin(), memrefShape.rend());
-  if (firstNonMatchingDim.first == vectorShape.rend())
-    return true;
-
-  // One non-matching dim is still fine, however the remaining leading dims of
-  // `vectorType` need to be 1.
-  SmallVector<int64_t> leadingDims(++firstNonMatchingDim.first,
-                                   vectorShape.rend());
-
-  return llvm::all_of(leadingDims, [](auto x) { return x == 1; });
+  // Compare the dims of `vectorType` against `memrefType`.
+  // All of the dimensions, except the first must match.
+  return llvm::equal(vectorShape.drop_front(), memrefShape.drop_front());
 }
 
 std::optional<StaticTileOffsetRange>
diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
index 45873aa93153..0f04d3b79b53 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir
@@ -71,30 +71,97 @@ func.func @transfer_read_dims_match_contiguous_empty_stride(
 // -----
 
 // The shape of the memref and the vector don't match, but the vector is a
-// contiguous subset of the memref, so "flattenable".
+// contiguous subset of the memref, so "flattenable"
 
 func.func @transfer_read_dims_mismatch_contiguous(
-    %mem : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>) -> vector<1x1x2x2xi8> {
+    %mem : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>) -> vector<2x3x2xi8> {
 
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0 : i8
   %res = vector.transfer_read %mem[%c0, %c0, %c0, %c0], %cst :
-    memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>, vector<1x1x2x2xi8>
-  return %res : vector<1x1x2x2xi8>
+    memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>, vector<2x3x2xi8>
+  return %res : vector<2x3x2xi8>
 }
 
-// CHECK-LABEL:   func.func @transfer_read_dims_mismatch_contiguous(
-// CHECK-SAME:      %[[MEM:.*]]: memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>) -> vector<1x1x2x2xi8> {
-// CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i8
-// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_3:.*]] = memref.collapse_shape %[[MEM]] {{\[\[}}0, 1, 2, 3]] : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>> into memref<120xi8, strided<[1], offset: ?>>
-// CHECK:           %[[VAL_4:.*]] = vector.transfer_read %[[VAL_3]]{{\[}}%[[VAL_2]]], %[[VAL_1]] {in_bounds = [true]} : memref<120xi8, strided<[1], offset: ?>>, vector<4xi8>
-// CHECK:           %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<4xi8> to vector<1x1x2x2xi8>
-// CHECK:           return %[[VAL_5]] : vector<1x1x2x2xi8>
+// CHECK-LABEL: func.func @transfer_read_dims_mismatch_contiguous(
+// CHECK-SAME:    %[[MEM:.+]]: memref<5x4x3x2xi8, {{.+}}>) -> vector<2x3x2xi8> {
+// CHECK:         %[[C0_I8:.+]] = arith.constant 0 : i8
+// CHECK:         %[[C0:.+]] = arith.constant 0 : index
+// CHECK:         %[[COLLAPSED_MEM:.+]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}: [[0], [1, 2, 3]]
+// CHECK-SAME:          : memref<5x4x3x2xi8, {{.+}}> into memref<5x24xi8, {{.+}}>
+// CHECK:         %[[VEC_1D:.+]] = vector.transfer_read %[[COLLAPSED_MEM]][%[[C0]], %[[C0]]], %[[C0_I8]] {in_bounds = [true]}
+// CHECK-SAME:      : memref<5x24xi8, strided<[24, 1], offset: ?>>, vector<12xi8>
+// CHECK:         %[[VEC:.+]] = vector.shape_cast %[[VEC_1D]] : vector<12xi8> to vector<2x3x2xi8>
+// CHECK:         return %[[VEC]] : vector<2x3x2xi8>
 
 // CHECK-128B-LABEL: func @transfer_read_dims_mismatch_contiguous(
 //       CHECK-128B:   memref.collapse_shape
 
+// -----
+
+// The shape of the memref and the vector don't match, but the mismatch is only
+// at the leading unit dimensions of the vector.
+
+func.func @transfer_read_dims_mismatch_contiguous_unit_dims(
+    %mem : memref<6x5x4x3x2xi8, strided<[120, 24, 6, 2, 1], offset: ?>>) -> vector<1x1x4x3x2xi8> {
+
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0 : i8
+  %res = vector.transfer_read %mem[%c0, %c0, %c0, %c0, %c0], %cst :
+    memref<6x5x4x3x2xi8, strided<[120, 24, 6, 2, 1], offset: ?>>, vector<1x1x4x3x2xi8>
+  return %res : vector<1x1x4x3x2xi8>
+}
+
+// CHECK-LABEL: func.func @transfer_read_dims_mismatch_contiguous_unit_dims(
+// CHECK-SAME:    %[[MEM:.+]]: memref<6x5x4x3x2xi8, strided<[120, 24, 6, 2, 1], offset: ?>>)
+// CHECK-SAME:    -> vector<1x1x4x3x2xi8>
+// CHECK:       %[[C0_I8:.+]] = arith.constant 0 : i8
+// CHECK:       %[[C0:.+]] = arith.constant 0 : index
+// CHECK:       %[[COLLAPSED:.+]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}: [[0], [1], [2, 3, 4]]
+// CHECK-SAME:    : memref<6x5x4x3x2xi8, strided<[120, 24, 6, 2, 1], offset: ?>>
+// CHECK-SAME:      into memref<6x5x24xi8, strided<[120, 24, 1], offset: ?>>
+// CHECK:       %[[VEC_1D:.+]] = vector.transfer_read %[[COLLAPSED]][%[[C0]], %[[C0]], %[[C0]]], %[[C0_I8]]
+// CHECK-SAME:    {in_bounds = [true]} : memref<6x5x24xi8, strided<[120, 24, 1], offset: ?>>, vector<24xi8>
+// CHECK:       %[[VEC:.+]] = vector.shape_cast %[[VEC_1D]] : vector<24xi8> to vector<1x1x4x3x2xi8>
+// CHECK:       return %[[VEC]] : vector<1x1x4x3x2xi8>
+
+// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_contiguous_unit_dims(
+//       CHECK-128B:   memref.collapse_shape
+
+// -----
+
+// The memref is non-contiguous, but the vector is a contiguous subset of the
+// memref, so "flattenable". The leading unit dimensions of the vector have no
+// effect on the memref area read even if they span the non-contiguous part of
+// the memref.
+
+func.func @transfer_read_non_contiguous_unit_dims(
+    %mem : memref<5x4x3x2xi8, strided<[48, 6, 2, 1], offset: ?>>) -> vector<1x1x3x2xi8> {
+
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0 : i8
+  %res = vector.transfer_read %mem[%c0, %c0, %c0, %c0], %cst :
+    memref<5x4x3x2xi8, strided<[48, 6, 2, 1], offset: ?>>, vector<1x1x3x2xi8>
+  return %res : vector<1x1x3x2xi8>
+}
+
+// CHECK-LABEL:   func.func @transfer_read_non_contiguous_unit_dims(
+// CHECK-SAME:      %[[MEM:.*]]: memref<5x4x3x2xi8, strided<[48, 6, 2, 1], offset: ?>>) -> vector<1x1x3x2xi8> {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i8
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}: [[0], [1], [2, 3]]
+// CHECK-SAME:        : memref<5x4x3x2xi8, strided<[48, 6, 2, 1], offset: ?>> into memref<5x4x6xi8, strided<[48, 6, 1], offset: ?>>
+// CHECK:           %[[VAL_4:.*]] = vector.transfer_read %[[VAL_3]][%[[VAL_2]], %[[VAL_2]], %[[VAL_2]]], %[[VAL_1]] {in_bounds = [true]} : memref<5x4x6xi8, strided<[48, 6, 1], offset: ?>>, vector<6xi8>
+// CHECK:           %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<6xi8> to vector<1x1x3x2xi8>
+// CHECK:           return %[[VAL_5]] : vector<1x1x3x2xi8>
+
+// CHECK-128B-LABEL: func @transfer_read_non_contiguous_unit_dims(
+//       CHECK-128B:   memref.collapse_shape
+
+
 // -----
 
 func.func @transfer_read_dims_mismatch_non_zero_indices(
@@ -110,16 +177,18 @@ func.func @transfer_read_dims_mismatch_non_zero_indices(
   return %res : vector<1x2x6xi32>
 }
 
-// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)>
+// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0] -> (s0 * 6)>
 
 // CHECK-LABEL:   func.func @transfer_read_dims_mismatch_non_zero_indices(
-// CHECK-SAME:      %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index,
-// CHECK-SAME:      %[[MEM:.*]]: memref<1x43x4x6xi32>
-// CHECK:           %[[C_0:.*]] = arith.constant 0 : i32
-// CHECK:           %[[C_0_IDX:.*]] = arith.constant 0 : index
-// CHECK:           %[[COLLAPSED_IN:.*]] = memref.collapse_shape %[[MEM]] {{\[}}[0], [1, 2, 3]] : memref<1x43x4x6xi32> into memref<1x1032xi32>
-// CHECK:           %[[COLLAPSED_IDX:.*]] = affine.apply #[[$ATTR_0]]()[%[[IDX_1]], %[[IDX_2]]]
-// CHECK:           %[[READ:.*]] = vector.transfer_read %[[COLLAPSED_IN]][%[[C_0_IDX]], %[[COLLAPSED_IDX]]], %[[C_0]] {in_bounds = [true]} : memref<1x1032xi32>, vector<12xi32>
+// CHECK-SAME:      %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index,
+// CHECK-SAME:      %[[MEM:.+]]: memref<1x43x4x6xi32>
+// CHECK:           %[[C0_I32:.+]] = arith.constant 0 : i32
+// CHECK:           %[[C_0:.+]] = arith.constant 0 : index
+// CHECK:           %[[COLLAPSED_IN:.+]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}: [[0], [1], [2, 3]]
+// CHECK-SAME:         : memref<1x43x4x6xi32> into memref<1x43x24xi32>
+// CHECK:           %[[COLLAPSED_IDX:.+]] = affine.apply #[[$ATTR_0]]()[%[[IDX_2]]]
+// CHECK:           %[[READ:.+]] = vector.transfer_read %[[COLLAPSED_IN]][%[[C_0]], %[[IDX_1]], %[[COLLAPSED_IDX]]], %[[C0_I32]] {in_bounds = [true]} : memref<1x43x24xi32>, vector<12xi32>
 
 // CHECK-128B-LABEL: func @transfer_read_dims_mismatch_non_zero_indices(
 //   CHECK-128B-NOT:   memref.collapse_shape
@@ -174,12 +243,12 @@ func.func @transfer_read_leading_dynamic_dims(
 // CHECK-SAME:    %[[MEM:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index
 // CHECK:         %[[C0_I8:.+]] = arith.constant 0 : i8
 // CHECK:         %[[C0:.+]] = arith.constant 0 : index
-// CHECK:         %[[COLLAPSED:.+]] = memref.collapse_shape %[[MEM]] {{\[}}[0], [1], [2, 3]{{\]}}
+// CHECK:         %[[COLLAPSED:.+]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}: [[0], [1], [2, 3]]
 // CHECK-SAME:      : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
 // CHECK:         %[[VEC1D:.+]] = vector.transfer_read %[[COLLAPSED]]
-// CHECK-SAME:    [%[[IDX_1]], %[[IDX_2]], %[[C0]]], %[[C0_I8]]
-// CHECK-SAME:    {in_bounds = [true]}
-// CHECK-SAME:      : memref<?x?x32xi8, {{.+}}>, vector<32xi8>
+// CHECK-SAME:      [%[[IDX_1]], %[[IDX_2]], %[[C0]]], %[[C0_I8]]
+// CHECK-SAME:      {in_bounds = [true]} : memref<?x?x32xi8, {{.+}}>, vector<32xi8>
 // CHECK:         %[[RES:.+]] = vector.shape_cast %[[VEC1D]] : vector<32xi8> to vector<8x4xi8>
 // CHECK:         return %[[RES]] : vector<8x4xi8>
 
@@ -229,21 +298,21 @@ func.func @transfer_read_dynamic_dim_to_flatten(
   return %res : vector<1x2x6xi32>
 }
 
-// CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)>
+// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 6)>
 
 // CHECK-LABEL: func.func @transfer_read_dynamic_dim_to_flatten
 // CHECK-SAME:    %[[IDX_1:arg0]]
 // CHECK-SAME:    %[[IDX_2:arg1]]
 // CHECK-SAME:    %[[MEM:arg2]]
-// CHECK:              %[[C0_I32:.*]] = arith.constant 0 : i32
-// CHECK:              %[[C0:.*]] = arith.constant 0 : index
-// CHECK:              %[[COLLAPSED:.*]] = memref.collapse_shape %[[MEM]]
-// CHECK-SAME{LITERAL}:  [[0], [1, 2, 3]]
-// CHECK-SAME:           memref<1x?x4x6xi32> into memref<1x?xi32>
-// CHECK:              %[[COLLAPSED_IDX:.*]] = affine.apply #[[$MAP]]()[%[[IDX_1]], %[[IDX_2]]]
-// CHECK:              %[[VEC_1D:.*]] = vector.transfer_read %[[COLLAPSED]][%[[C0]], %[[COLLAPSED_IDX]]],
-// CHECK-SAME:           %[[C0_I32]] {in_bounds = [true]} : memref<1x?xi32>, vector<12xi32>
-// CHECK:              %[[RESULT:.*]] = vector.shape_cast %[[VEC_1D]] : vector<12xi32> to vector<1x2x6xi32>
+// CHECK:              %[[C0_I32:.+]] = arith.constant 0 : i32
+// CHECK:              %[[C0:.+]] = arith.constant 0 : index
+// CHECK:              %[[COLLAPSED:.+]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}:  [[0], [1], [2, 3]]
+// CHECK-SAME:           memref<1x?x4x6xi32> into memref<1x?x24xi32>
+// CHECK:              %[[COLLAPSED_IDX:.+]] = affine.apply #[[$MAP]]()[%[[IDX_2]]]
+// CHECK:              %[[VEC_1D:.+]] = vector.transfer_read %[[COLLAPSED]][%[[C0]], %[[IDX_1]], %[[COLLAPSED_IDX]]],
+// CHECK-SAME:           %[[C0_I32]] {in_bounds = [true]} : memref<1x?x24xi32>, vector<12xi32>
+// CHECK:              %[[RESULT:.+]] = vector.shape_cast %[[VEC_1D]] : vector<12xi32> to vector<1x2x6xi32>
 // CHECK:              return %[[RESULT]] : vector<1x2x6xi32>
 
 
@@ -381,29 +450,97 @@ func.func @transfer_write_dims_match_contiguous_empty_stride(
 
 // -----
 
+// The shape of the memref and the vector don't match, but the vector is a
+// contiguous subset of the memref, so "flattenable".
+
 func.func @transfer_write_dims_mismatch_contiguous(
     %mem : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>,
-    %vec : vector<1x1x2x2xi8>) {
+    %vec : vector<2x2xi8>) {
 
   %c0 = arith.constant 0 : index
   vector.transfer_write %vec, %mem [%c0, %c0, %c0, %c0] :
-    vector<1x1x2x2xi8>, memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>
+    vector<2x2xi8>, memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>
   return
 }
 
-// CHECK-LABEL:   func.func @transfer_write_dims_mismatch_contiguous
-// CHECK-SAME:      %[[MEM:.*]]: memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>,
-// CHECK-SAME:      %[[VEC:.*]]: vector<1x1x2x2xi8>) {
-// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_3:.*]] = memref.collapse_shape %[[MEM]] {{\[\[}}0, 1, 2, 3]] : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>> into memref<120xi8, strided<[1], offset: ?>>
-// CHECK:           %[[VAL_4:.*]] = vector.shape_cast %[[VEC]] : vector<1x1x2x2xi8> to vector<4xi8>
-// CHECK:           vector.transfer_write %[[VAL_4]], %[[VAL_3]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<4xi8>, memref<120xi8, strided<[1], offset: ?>>
+// CHECK-LABEL: func.func @transfer_write_dims_mismatch_contiguous
+// CHECK-SAME:    %[[MEM:.+]]: memref<5x4x3x2xi8, {{.+}}>,
+// CHECK-SAME:    %[[VEC:.+]]: vector<2x2xi8>
+// CHECK:         %[[C0:.+]] = arith.constant 0 : index
+// CHECK:         %[[COLLAPSED_MEM:.+]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}: [[0], [1], [2, 3]]
+// CHECK-SAME:      : memref<5x4x3x2xi8, {{.+}}> into memref<5x4x6xi8, {{.+}}>
+// CHECK:         %[[VEC_1D:.+]] = vector.shape_cast %[[VEC]] : vector<2x2xi8> to vector<4xi8>
+// CHECK:         vector.transfer_write %[[VEC_1D]], %[[COLLAPSED_MEM]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true]}
+// CHECK-SAME:      : vector<4xi8>, memref<5x4x6xi8, {{.+}}>
 
 // CHECK-128B-LABEL: func @transfer_write_dims_mismatch_contiguous(
 //       CHECK-128B:   memref.collapse_shape
 
 // -----
 
+// The shape of the memref and the vector don't match, but the mismatch is only
+// at the leading unit dimensions of the vector.
+
+func.func @transfer_write_dims_mismatch_contiguous_unit_dims(
+    %mem : memref<6x5x4x3x2xi8, strided<[120, 24, 6, 2, 1], offset: ?>>,
+    %vec : vector<1x1x4x3x2xi8>) {
+
+  %c0 = arith.constant 0 : index
+  vector.transfer_write %vec, %mem [%c0, %c0, %c0, %c0, %c0] :
+    vector<1x1x4x3x2xi8>, memref<6x5x4x3x2xi8, strided<[120, 24, 6, 2, 1], offset: ?>>
+
+  return
+}
+
+// CHECK-LABEL:  func.func @transfer_write_dims_mismatch_contiguous_unit_dims(
+// CHECK-SAME:   %[[MEM:.+]]: memref<6x5x4x3x2xi8, strided<[120, 24, 6, 2, 1], offset: ?>>
+// CHECK-SAME:   %[[VEC:.+]]: vector<1x1x4x3x2xi8>
+// CHECK:          %[[C0:.+]] = arith.constant 0 : index
+// CHECK:          %[[COLLAPSED:.+]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}: [[0], [1], [2, 3, 4]]
+// CHECK-SAME:       : memref<6x5x4x3x2xi8, strided<[120, 24, 6, 2, 1], offset: ?>>
+// CHECK-SAME:         into memref<6x5x24xi8, strided<[120, 24, 1], offset: ?>>
+// CHECK:          %[[VEC_1D:.+]] = vector.shape_cast %[[VEC]] : vector<1x1x4x3x2xi8> to vector<24xi8>
+// CHECK:          vector.transfer_write %[[VEC_1D]], %[[COLLAPSED]][%[[C0]], %[[C0]], %[[C0]]]
+// CHECK-SAME:       {in_bounds = [true]} : vector<24xi8>, memref<6x5x24xi8, strided<[120, 24, 1], offset: ?>>
+
+// CHECK-128B-LABEL: func @transfer_write_dims_mismatch_contiguous_unit_dims(
+//       CHECK-128B:   memref.collapse_shape
+
+// -----
+
+// The memref is non-contiguous, but the vector is a contiguous subset of the
+// memref, so "flattenable". The leading unit dimensions of the vector have no
+// effect on the memref area read even if they span the non-contiguous part of
+// the memref.
+
+func.func @transfer_write_non_contiguous_unit_dims(
+    %mem : memref<5x4x3x2xi8, strided<[48, 6, 2, 1], offset: ?>>,
+    %vec : vector<1x1x3x2xi8>) {
+
+  %c0 = arith.constant 0 : index
+  vector.transfer_write %vec, %mem [%c0, %c0, %c0, %c0] :
+    vector<1x1x3x2xi8>, memref<5x4x3x2xi8, strided<[48, 6, 2, 1], offset: ?>>
+  return
+}
+
+// CHECK-LABEL:   func.func @transfer_write_non_contiguous_unit_dims
+// CHECK-SAME:      %[[MEM:.*]]: memref<5x4x3x2xi8, strided<[48, 6, 2, 1], offset: ?>>,
+// CHECK-SAME:      %[[VEC:.*]]: vector<1x1x3x2xi8>) {
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[COLLAPSED:.*]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}: [[0], [1], [2, 3]]
+// CHECK-SAME:        : memref<5x4x3x2xi8, strided<[48, 6, 2, 1], offset: ?>> into memref<5x4x6xi8, strided<[48, 6, 1], offset: ?>>
+// CHECK:           %[[VEC_1D:.*]] = vector.shape_cast %[[VEC]] : vector<1x1x3x2xi8> to vector<6xi8>
+// CHECK:           vector.transfer_write %[[VEC_1D]], %[[COLLAPSED]][%[[C0]], %[[C0]], %[[C0]]]
+// CHECK-SAME:        {in_bounds = [true]} : vector<6xi8>, memref<5x4x6xi8, strided<[48, 6, 1], offset: ?>>
+
+// CHECK-128B-LABEL: func @transfer_write_non_contiguous_unit_dims(
+//       CHECK-128B:   memref.collapse_shape
+
+// -----
+
 func.func @transfer_write_dims_mismatch_non_zero_indices(
     %idx_1: index,
     %idx_2: index,
@@ -417,17 +554,18 @@ func.func @transfer_write_dims_mismatch_non_zero_indices(
   return
 }
 
-// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)>
+// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0] -> (s0 * 6)>
 
 // CHECK-LABEL:   func.func @transfer_write_dims_mismatch_non_zero_indices(
 // CHECK-SAME:      %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index,
 // CHECK-SAME:      %[[MEM:.*]]: memref<1x43x4x6xi32>,
 // CHECK-SAME:      %[[VEC:.*]]: vector<1x2x6xi32>) {
 // CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:       %[[IDX:.*]] = affine.apply #[[$ATTR_0]](){{\[}}%[[IDX_1]], %[[IDX_2]]]
-// CHECK-DAG:       %[[CS:.*]] = memref.collapse_shape %[[MEM]] {{\[\[}}0], [1, 2, 3]] : memref<1x43x4x6xi32> into memref<1x1032xi32>
+// CHECK-DAG:       %[[IDX:.*]] = affine.apply #[[$ATTR_0]]()[%[[IDX_2]]]
+// CHECK-DAG:       %[[CS:.*]] = memref.collapse_shape %[[MEM]]
+// CHECK-DAG-SAME{LITERAL}: [[0], [1], [2, 3]] : memref<1x43x4x6xi32> into memref<1x43x24xi32>
 // CHECK:           %[[SC:.*]] = vector.shape_cast %[[VEC]] : vector<1x2x6xi32> to vector<12xi32>
-// CHECK:           vector.transfer_write %[[SC]], %[[CS]]{{\[}}%[[C0]], %[[IDX]]] {in_bounds = [true]} : vector<12xi32>, memref<1x1032xi32>
+// CHECK:           vector.transfer_write %[[SC]], %[[CS]][%[[C0]], %[[IDX_1]], %[[IDX]]] {in_bounds = [true]} : vector<12xi32>, memref<1x43x24xi32>
 
 // CHECK-128B-LABEL: func @transfer_write_dims_mismatch_non_zero_indices(
 //   CHECK-128B-NOT:   memref.collapse_shape
@@ -478,12 +616,12 @@ func.func @transfer_write_leading_dynamic_dims(
 // CHECK-LABEL: func @transfer_write_leading_dynamic_dims
 // CHECK-SAME:    %[[VEC:.+]]: vector<8x4xi8>, %[[MEM:.+]]: memref<?x?x8x4xi8, {{.+}}>, %[[ARG2:.+]]: index, %[[ARG3:.+]]: index
 // CHECK:         %[[C0:.+]] = arith.constant 0 : index
-// CHECK:         %[[COLLAPSED:.+]] = memref.collapse_shape %[[MEM]] {{\[}}[0], [1], [2, 3]{{\]}}
+// CHECK:         %[[COLLAPSED:.+]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}: [[0], [1], [2, 3]]
 // CHECK-SAME:      : memref<?x?x8x4xi8, {{.+}}> into memref<?x?x32xi8, {{.+}}>
 // CHECK:         %[[VEC1D:.+]] = vector.shape_cast %[[VEC]] : vector<8x4xi8> to vector<32xi8>
 // CHECK:         vector.transfer_write %[[VEC1D]], %[[COLLAPSED]]
-// CHECK-SAME:      [%[[ARG2]], %[[ARG3]], %[[C0]]]
-// CHECK-SAME:      {in_bounds = [true]}
+// CHECK-SAME:      [%[[ARG2]], %[[ARG3]], %[[C0]]] {in_bounds = [true]}
 // CHECK-SAME:      : vector<32xi8>, memref<?x?x32xi8, {{.+}}>
 
 // CHECK-128B-LABEL: func @transfer_write_leading_dynamic_dims
@@ -528,22 +666,21 @@ func.func @transfer_write_dynamic_dim_to_flatten(
   return
 }
 
-// CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)>
+// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 6)>
 
 // CHECK-LABEL: func.func @transfer_write_dynamic_dim_to_flatten
 // CHECK-SAME:    %[[IDX_1:arg0]]: index
 // CHECK-SAME:    %[[IDX_2:arg1]]: index
 // CHECK-SAME:    %[[VEC:arg2]]: vector<1x2x6xi32>
 // CHECK-SAME:    %[[MEM:arg3]]: memref<1x?x4x6xi32>
-
-// CHECK:              %[[C0:.*]] = arith.constant 0 : index
-// CHECK:              %[[COLLAPSED_MEM:.*]] = memref.collapse_shape %[[MEM]]
-// CHECK-SAME{LITERAL}:  [[0], [1, 2, 3]]
-// CHECK-SAME:           : memref<1x?x4x6xi32> into memref<1x?xi32>
-// CHECK:              %[[COLLAPSED_IDX:.*]] = affine.apply #[[$MAP]]()[%[[IDX_1]], %[[IDX_2]]]
-// CHECK:              %[[VEC_1D:.*]] = vector.shape_cast %[[VEC]] : vector<1x2x6xi32> to vector<12xi32>
-// CHECK:              vector.transfer_write %[[VEC_1D]], %[[COLLAPSED_MEM]][%[[C0]], %[[COLLAPSED_IDX]]]
-// CHECK-SAME:           {in_bounds = [true]} : vector<12xi32>, memref<1x?xi32>
+// CHECK:              %[[C0:.+]] = arith.constant 0 : index
+// CHECK:              %[[COLLAPSED_MEM:.+]] = memref.collapse_shape %[[MEM]]
+// CHECK-SAME{LITERAL}:  [[0], [1], [2, 3]]
+// CHECK-SAME:           : memref<1x?x4x6xi32> into memref<1x?x24xi32>
+// CHECK:              %[[COLLAPSED_IDX:.+]] = affine.apply #[[$MAP]]()[%[[IDX_2]]]
+// CHECK:              %[[VEC_1D:.+]] = vector.shape_cast %[[VEC]] : vector<1x2x6xi32> to vector<12xi32>
+// CHECK:              vector.transfer_write %[[VEC_1D]], %[[COLLAPSED_MEM]][%[[C0]], %[[IDX_1]], %[[COLLAPSED_IDX]]]
+// CHECK-SAME:           {in_bounds = [true]} : vector<12xi32>, memref<1x?x24xi32>
 
 // CHECK-128B-LABEL: func @transfer_write_dynamic_dim_to_flatten
 //   CHECK-128B-NOT:   memref.collapse_shape
@@ -621,8 +758,13 @@ func.func @negative_out_of_bound_transfer_read(
     memref<?x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>, vector<5x4x3x2xi8>
   return %res : vector<5x4x3x2xi8>
 }
-// CHECK:     func.func @negative_out_of_bound_transfer_read
-// CHECK-NOT:   memref.collapse_shape
+// CHECK-LABEL: func.func @negative_out_of_bound_transfer_read
+// CHECK-NOT:     memref.collapse_shape
+// CHECK-NOT:     vector.shape_cast
+
+// CHECK-128B-LABEL: func.func @negative_out_of_bound_transfer_read
+//   CHECK-128B-NOT:   memref.collapse_shape
+//   CHECK-128B-NOT:   vector.shape_cast
 
 // -----
 
@@ -633,5 +775,10 @@ func.func @negative_out_of_bound_transfer_write(
     vector<1x1x3x2xi8>, memref<?x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>
   return
 }
-// CHECK:     func.func @negative_out_of_bound_transfer_write
-// CHECK-NOT:   memref.collapse_shape
+// CHECK-LABEL: func.func @negative_out_of_bound_transfer_write
+// CHECK-NOT:     memref.collapse_shape
+// CHECK-NOT:     vector.shape_cast
+
+// CHECK-128B-LABEL: func.func @negative_out_of_bound_transfer_write
+//   CHECK-128B-NOT:   memref.collapse_shape
+//   CHECK-128B-NOT:   vector.shape_cast

From 58987d2e34e67742e3a65b1bb94ec2cfebae805f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 22:14:44 +0900
Subject: [PATCH 1278/1322] RuntimeLibcalls: Pass in ABI name from MCOptions
 (#144894)

ARM needs this to compute the available libcalls.
---
 llvm/include/llvm/IR/RuntimeLibcalls.h  | 7 ++++---
 llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 ++-
 llvm/lib/IR/RuntimeLibcalls.cpp         | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 71f38bedf17e..ada1648b102f 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -58,8 +58,8 @@ struct RuntimeLibcallsInfo {
       const Triple &TT,
       ExceptionHandling ExceptionModel = ExceptionHandling::None,
       FloatABI::ABIType FloatABI = FloatABI::Default,
-      EABI EABIVersion = EABI::Default) {
-    initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion);
+      EABI EABIVersion = EABI::Default, StringRef ABIName = "") {
+    initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName);
   }
 
   /// Rename the default libcall routine name for the specified libcall.
@@ -150,7 +150,8 @@ private:
   /// Set default libcall names. If a target wants to opt-out of a libcall it
   /// should be placed here.
   LLVM_ABI void initLibcalls(const Triple &TT, ExceptionHandling ExceptionModel,
-                             FloatABI::ABIType FloatABI, EABI ABIType);
+                             FloatABI::ABIType FloatABI, EABI ABIType,
+                             StringRef ABIName);
 };
 
 } // namespace RTLIB
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 41e73b853093..cda41a91a372 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -633,7 +633,8 @@ void RTLIB::initCmpLibcallCCs(ISD::CondCode *CmpLibcallCCs) {
 /// NOTE: The TargetMachine owns TLOF.
 TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
     : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.ExceptionModel,
-                       TM.Options.FloatABIType, TM.Options.EABIVersion) {
+                       TM.Options.FloatABIType, TM.Options.EABIVersion,
+                       TM.Options.MCOptions.getABIName()) {
   initActions();
 
   // Perform these initializations only once.
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index ad2904d6d2ea..4837207cc53b 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -359,7 +359,7 @@ static void setLongDoubleIsF128Libm(RuntimeLibcallsInfo &Info,
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
                                        ExceptionHandling ExceptionModel,
                                        FloatABI::ABIType FloatABI,
-                                       EABI EABIVersion) {
+                                       EABI EABIVersion, StringRef ABIName) {
   initSoftFloatCmpLibcallPredicates();
 
   initSoftFloatCmpLibcallPredicates();

From a65e0edd6ac522a21744fa51cef6593fd782dea4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 23 Jun 2025 22:15:37 +0900
Subject: [PATCH 1279/1322] PowerPC: Stop reporting memcpy as an alias of
 memmove on AIX (#143836)

Instead of reporting ___memmove as an implementation of memcpy,
make it unavailable and let the lowering logic consider memmove as
a fallback path.

This avoids a special case 1:N mapping for libcall implementations.
---
 llvm/include/llvm/CodeGen/TargetLowering.h      |  2 ++
 llvm/include/llvm/IR/RuntimeLibcalls.h          | 10 ++++++++++
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp |  6 +++++-
 llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp   | 11 +++++++++--
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp  | 15 ++++++++-------
 llvm/lib/IR/RuntimeLibcalls.cpp                 |  2 +-
 6 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index dd44afd0855a..727526055e59 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3572,6 +3572,8 @@ public:
     return Libcalls.getLibcallName(Call);
   }
 
+  const char *getMemcpyName() const { return Libcalls.getMemcpyName(); }
+
   /// Override the default CondCode to be used to test the result of the
   /// comparison libcall against zero.
   /// FIXME: This should be removed
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index ada1648b102f..2a095be58a49 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -106,6 +106,16 @@ struct RuntimeLibcallsInfo {
     SoftFloatCompareLibcallPredicates[Call] = Pred;
   }
 
+  /// Return a function name compatible with RTLIB::MEMCPY, or nullptr if fully
+  /// unsupported.
+  const char *getMemcpyName() const {
+    if (const char *Memcpy = getLibcallName(RTLIB::MEMCPY))
+      return Memcpy;
+
+    // Fallback to memmove if memcpy isn't available.
+    return getLibcallName(RTLIB::MEMMOVE);
+  }
+
 private:
   /// Stores the name each libcall.
   const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1] = {nullptr};
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index e0656141d432..5f5af5cad778 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -672,26 +672,30 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
   RTLIB::Libcall RTLibcall;
   unsigned Opc = MI.getOpcode();
+  const char *Name;
   switch (Opc) {
   case TargetOpcode::G_BZERO:
     RTLibcall = RTLIB::BZERO;
+    Name = TLI.getLibcallName(RTLibcall);
     break;
   case TargetOpcode::G_MEMCPY:
     RTLibcall = RTLIB::MEMCPY;
+    Name = TLI.getMemcpyName();
     Args[0].Flags[0].setReturned();
     break;
   case TargetOpcode::G_MEMMOVE:
     RTLibcall = RTLIB::MEMMOVE;
+    Name = TLI.getLibcallName(RTLibcall);
     Args[0].Flags[0].setReturned();
     break;
   case TargetOpcode::G_MEMSET:
     RTLibcall = RTLIB::MEMSET;
+    Name = TLI.getLibcallName(RTLibcall);
     Args[0].Flags[0].setReturned();
     break;
   default:
     llvm_unreachable("unsupported opcode");
   }
-  const char *Name = TLI.getLibcallName(RTLibcall);
 
   // Unsupported libcall on the target.
   if (!Name) {
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 6f52b7cac1d4..9d1d70b1cb23 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -231,6 +231,14 @@ static bool canEmitLibcall(const TargetMachine *TM, Function *F,
   return TLI->getLibcallName(LC) != nullptr;
 }
 
+static bool canEmitMemcpy(const TargetMachine *TM, Function *F) {
+  // TODO: Should this consider the address space of the memcpy?
+  if (!TM)
+    return true;
+  const TargetLowering *TLI = TM->getSubtargetImpl(*F)->getTargetLowering();
+  return TLI->getMemcpyName() != nullptr;
+}
+
 // Return a value appropriate for use with the memset_pattern16 libcall, if
 // possible and if we know how. (Adapted from equivalent helper in
 // LoopIdiomRecognize).
@@ -300,8 +308,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
       Function *ParentFunc = Memcpy->getFunction();
       const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
       if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) {
-        if (UseMemIntrinsicLibFunc &&
-            canEmitLibcall(TM, ParentFunc, RTLIB::MEMCPY))
+        if (UseMemIntrinsicLibFunc && canEmitMemcpy(TM, ParentFunc))
           break;
 
         // TODO: For optsize, emit the loop into a separate function
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 5d8db8be9731..30ee6a99b9df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8776,11 +8776,12 @@ SDValue SelectionDAG::getMemcpy(
   // FIXME: pass in SDLoc
   TargetLowering::CallLoweringInfo CLI(*this);
   bool IsTailCall = false;
+  const char *MemCpyName = TLI->getMemcpyName();
+
   if (OverrideTailCall.has_value()) {
     IsTailCall = *OverrideTailCall;
   } else {
-    bool LowersToMemcpy =
-        TLI->getLibcallName(RTLIB::MEMCPY) == StringRef("memcpy");
+    bool LowersToMemcpy = StringRef(MemCpyName) == StringRef("memcpy");
     bool ReturnsFirstArg = CI && funcReturnsFirstArgOfCall(*CI);
     IsTailCall = CI && CI->isTailCall() &&
                  isInTailCallPosition(*CI, getTarget(),
@@ -8789,11 +8790,11 @@ SDValue SelectionDAG::getMemcpy(
 
   CLI.setDebugLoc(dl)
       .setChain(Chain)
-      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
-                    Dst.getValueType().getTypeForEVT(*getContext()),
-                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
-                                      TLI->getPointerTy(getDataLayout())),
-                    std::move(Args))
+      .setLibCallee(
+          TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+          Dst.getValueType().getTypeForEVT(*getContext()),
+          getExternalSymbol(MemCpyName, TLI->getPointerTy(getDataLayout())),
+          std::move(Args))
       .setDiscardResult()
       .setTailCall(IsTailCall);
 
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 4837207cc53b..702e0a51357f 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -416,7 +416,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
 
     if (TT.isOSAIX()) {
       bool isPPC64 = TT.isPPC64();
-      setLibcallName(RTLIB::MEMCPY, isPPC64 ? "___memmove64" : "___memmove");
+      setLibcallName(RTLIB::MEMCPY, nullptr);
       setLibcallName(RTLIB::MEMMOVE, isPPC64 ? "___memmove64" : "___memmove");
       setLibcallName(RTLIB::MEMSET, isPPC64 ? "___memset64" : "___memset");
       setLibcallName(RTLIB::BZERO, isPPC64 ? "___bzero64" : "___bzero");

From 117e51de8a83915f613ef6ee744eaac6eb5307dd Mon Sep 17 00:00:00 2001
From: Sam James <sam@gentoo.org>
Date: Sun, 15 Jun 2025 02:47:40 +0100
Subject: [PATCH 1280/1322] [flang][cmake] Don't pass -fno-strict-aliasing for
 GCC either

This is the same as 4ed10db85919d3d87bf0b3353340b58354a75994 with the
same rationale, but for Flang, I strongly suspect it was just pulled in
from Clang, see https://github.com/flang-compiler/f18/pull/6#issuecomment-364155817.
---
 flang/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index 56a96f590f0a..068d134671db 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -421,7 +421,7 @@ endif()
 if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
 
   if (NOT "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-strict-aliasing -fno-semantic-interposition")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-semantic-interposition")
   else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument -Wstring-conversion \
           -Wcovered-switch-default")

From 05491e0359edcac9954a29f2a1579241522669ca Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002@gmail.com>
Date: Mon, 23 Jun 2025 16:27:18 +0300
Subject: [PATCH 1281/1322] [clang-tidy] add 'IgnoreMarcos' option to
 'avoid-goto' check (#143554)

---
 .../cppcoreguidelines/AvoidGotoCheck.cpp      | 17 +++++-
 .../cppcoreguidelines/AvoidGotoCheck.h        |  7 ++-
 clang-tools-extra/docs/ReleaseNotes.rst       |  8 +++
 .../checks/cppcoreguidelines/avoid-goto.rst   |  9 ++++
 .../checkers/cppcoreguidelines/avoid-goto.cpp | 52 +++++++++++++++----
 5 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.cpp
index 8ffa44d41fa9..b14587ad7db8 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.cpp
@@ -17,8 +17,20 @@ namespace {
 AST_MATCHER(GotoStmt, isForwardJumping) {
   return Node.getBeginLoc() < Node.getLabel()->getBeginLoc();
 }
+
+AST_MATCHER(GotoStmt, isInMacro) {
+  return Node.getBeginLoc().isMacroID() && Node.getEndLoc().isMacroID();
+}
 } // namespace
 
+AvoidGotoCheck::AvoidGotoCheck(StringRef Name, ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      IgnoreMacros(Options.get("IgnoreMacros", false)) {}
+
+void AvoidGotoCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "IgnoreMacros", IgnoreMacros);
+}
+
 void AvoidGotoCheck::registerMatchers(MatchFinder *Finder) {
   // TODO: This check does not recognize `IndirectGotoStmt` which is a
   // GNU extension. These must be matched separately and an AST matcher
@@ -29,7 +41,10 @@ void AvoidGotoCheck::registerMatchers(MatchFinder *Finder) {
   auto Loop = mapAnyOf(forStmt, cxxForRangeStmt, whileStmt, doStmt);
   auto NestedLoop = Loop.with(hasAncestor(Loop));
 
-  Finder->addMatcher(gotoStmt(anyOf(unless(hasAncestor(NestedLoop)),
+  const ast_matchers::internal::Matcher<GotoStmt> Anything = anything();
+
+  Finder->addMatcher(gotoStmt(IgnoreMacros ? unless(isInMacro()) : Anything,
+                              anyOf(unless(hasAncestor(NestedLoop)),
                                     unless(isForwardJumping())))
                          .bind("goto"),
                      this);
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h
index 883ba78855e7..8eae409462c9 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h
@@ -20,13 +20,16 @@ namespace clang::tidy::cppcoreguidelines {
 /// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-goto.html
 class AvoidGotoCheck : public ClangTidyCheck {
 public:
-  AvoidGotoCheck(StringRef Name, ClangTidyContext *Context)
-      : ClangTidyCheck(Name, Context) {}
+  AvoidGotoCheck(StringRef Name, ClangTidyContext *Context);
   bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
     return LangOpts.CPlusPlus;
   }
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+
+private:
+  const bool IgnoreMacros;
 };
 
 } // namespace clang::tidy::cppcoreguidelines
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 9dede347b8c9..a802b5fc6699 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -203,11 +203,19 @@ Changes in existing checks
   <clang-tidy/checks/concurrency/mt-unsafe>` check by fixing a false positive
   where ``strerror`` was flagged as MT-unsafe.
 
+- Improved :doc:`cppcoreguidelines-avoid-goto
+  <clang-tidy/checks/cppcoreguidelines/avoid-goto>` check by adding the option
+  `IgnoreMacros` to ignore ``goto`` labels defined in macros.
+
 - Improved :doc:`google-readability-namespace-comments
   <clang-tidy/checks/google/readability-namespace-comments>` check by adding
   the option `AllowOmittingNamespaceComments` to accept if a namespace comment
   is omitted entirely.
 
+- Improved :doc:`hicpp-avoid-goto
+  <clang-tidy/checks/hicpp/avoid-goto>` check by adding the option
+  `IgnoreMacros` to ignore ``goto`` labels defined in macros.
+
 - Improved :doc:`llvm-namespace-comment
   <clang-tidy/checks/llvm/namespace-comment>` check by adding the option
   `AllowOmittingNamespaceComments` to accept if a namespace comment is omitted
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-goto.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-goto.rst
index 71b579a4ae99..1f9dc0a1edb3 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-goto.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-goto.rst
@@ -50,3 +50,12 @@ Modern C++ needs ``goto`` only to jump out of nested loops.
   some_operation();
 
 All other uses of ``goto`` are diagnosed in `C++`.
+
+
+Options
+-------
+
+.. option:: IgnoreMacros
+
+   If set to `true`, the check will not warn if a ``goto`` statement is
+   expanded from a macro. Default is `false`.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-goto.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-goto.cpp
index ee236bc44695..908132b7c9a6 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-goto.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-goto.cpp
@@ -1,12 +1,13 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-avoid-goto %t
+// RUN: %check_clang_tidy -check-suffix=,MACRO %s cppcoreguidelines-avoid-goto %t
+// RUN: %check_clang_tidy %s cppcoreguidelines-avoid-goto %t -- -config="{CheckOptions: { cppcoreguidelines-avoid-goto.IgnoreMacros: true }}"
 
 void noop() {}
 
 int main() {
   noop();
   goto jump_to_me;
-  // CHECK-NOTES: [[@LINE-1]]:3: warning: avoid using 'goto' for flow control
-  // CHECK-NOTES: [[@LINE+3]]:1: note: label defined here
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: avoid using 'goto' for flow control
+  // CHECK-MESSAGES: [[@LINE+3]]:1: note: label defined here
   noop();
 
 jump_to_me:;
@@ -14,14 +15,14 @@ jump_to_me:;
 jump_backwards:;
   noop();
   goto jump_backwards;
-  // CHECK-NOTES: [[@LINE-1]]:3: warning: avoid using 'goto' for flow control
-  // CHECK-NOTES: [[@LINE-4]]:1: note: label defined here
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: avoid using 'goto' for flow control
+  // CHECK-MESSAGES: [[@LINE-4]]:1: note: label defined here
 
   goto jump_in_line;
   ;
 jump_in_line:;
-  // CHECK-NOTES: [[@LINE-3]]:3: warning: avoid using 'goto' for flow control
-  // CHECK-NOTES: [[@LINE-2]]:1: note: label defined here
+  // CHECK-MESSAGES: [[@LINE-3]]:3: warning: avoid using 'goto' for flow control
+  // CHECK-MESSAGES: [[@LINE-2]]:1: note: label defined here
 
   // Test the GNU extension https://gcc.gnu.org/onlinedocs/gcc/Labels-as-Values.html
 some_label:;
@@ -132,8 +133,41 @@ before_the_loop:
     for (int j = 0; j < 10; ++j) {
       if (i * j > 80)
         goto before_the_loop;
-      // CHECK-NOTES: [[@LINE-1]]:9: warning: avoid using 'goto' for flow control
-      // CHECK-NOTES: [[@LINE-8]]:1: note: label defined here
+      // CHECK-MESSAGES: [[@LINE-1]]:9: warning: avoid using 'goto' for flow control
+      // CHECK-MESSAGES: [[@LINE-8]]:1: note: label defined here
     }
   }
 }
+
+#define macro_goto_code \
+  noop(); \
+  goto jump_to_me; \
+  noop(); \
+jump_to_me:; \
+
+#define macro_goto_label jump_to_me:;
+#define macro_goto_jump goto jump_to_me;
+
+void inside_macro_all() {
+  macro_goto_code
+  // CHECK-MESSAGES-MACRO: [[@LINE-1]]:3: warning: avoid using 'goto' for flow control
+  // CHECK-MESSAGES-MACRO: [[@LINE-2]]:3: note: label defined here
+}
+
+void inside_macro_label() {
+  noop();
+  goto jump_to_me;
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: avoid using 'goto' for flow control
+  // CHECK-MESSAGES: [[@LINE+2]]:3: note: label defined here
+  noop();
+  macro_goto_label
+}
+
+void inside_macro_goto() {
+  noop();
+  macro_goto_jump
+  // CHECK-MESSAGES-MACRO: [[@LINE-1]]:3: warning: avoid using 'goto' for flow control
+  // CHECK-MESSAGES-MACRO: [[@LINE+2]]:3: note: label defined here
+  noop();
+  jump_to_me:;
+}

From 879a55793a2d9540e9403938e4df6a827028a3ba Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 23 Jun 2025 15:29:33 +0200
Subject: [PATCH 1282/1322] [ExpandVariadics] Clean up intrinsic declaration
 lookup (NFC)

The comment was outdated, as getDeclarationIfExists has been
introduced in the meantime.

We also only use this in one place where neither the Tys.empty()
case nor the FT is relevant, so just include the call to
getDeclarationIfExists().
---
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index 16ffd503300e..da60f521bf08 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -132,25 +132,6 @@ public:
   virtual ~VariadicABIInfo() = default;
 };
 
-// Module implements getFunction() which returns nullptr on missing declaration
-// and getOrInsertFunction which creates one when absent. Intrinsics.h only
-// implements getDeclaration which creates one when missing. Checking whether
-// an intrinsic exists thus inserts it in the module and it then needs to be
-// deleted again to clean up.
-// The right name for the two functions on intrinsics would match Module::,
-// but doing that in a single change would introduce nullptr dereferences
-// where currently there are none. The minimal collateral damage approach
-// would split the change over a release to help downstream branches. As it
-// is unclear what approach will be preferred, implementing the trivial
-// function here in the meantime to decouple from that discussion.
-Function *getPreexistingDeclaration(Module *M, Intrinsic::ID Id,
-                                    ArrayRef<Type *> Tys = {}) {
-  if (Tys.empty())
-    return Intrinsic::getDeclarationIfExists(M, Id);
-  auto *FT = Intrinsic::getType(M->getContext(), Id, Tys);
-  return Intrinsic::getDeclarationIfExists(M, Id, Tys, FT);
-}
-
 class ExpandVariadics : public ModulePass {
 
   // The pass construction sets the default to optimize when called from middle
@@ -201,7 +182,7 @@ public:
     bool Changed = false;
     const DataLayout &DL = M.getDataLayout();
     if (Function *Intrinsic =
-            getPreexistingDeclaration(&M, ID, {IntrinsicArgType})) {
+            Intrinsic::getDeclarationIfExists(&M, ID, {IntrinsicArgType})) {
       for (User *U : make_early_inc_range(Intrinsic->users()))
         if (auto *I = dyn_cast<InstructionType>(U))
           Changed |= expandVAIntrinsicCall(Builder, DL, I);

From cccb82e5529136465ad3d073db7df4fe89a335c0 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Mon, 23 Jun 2025 09:36:12 -0400
Subject: [PATCH 1283/1322] [HLSL][SPIRV] Allow large z value in numthreads
 (#144934)

The current validation checks for numthreads assume that the target is
DXIL so the version checks inadvertently issue error when targeting
SPIR-V.
---
 clang/lib/Sema/SemaHLSL.cpp          |  7 +++++--
 clang/test/SemaHLSL/num_threads.hlsl | 16 ++++++++++++----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index d003967a522a..b5975c2e5782 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -1230,12 +1230,15 @@ void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) {
 void SemaHLSL::handleNumThreadsAttr(Decl *D, const ParsedAttr &AL) {
   llvm::VersionTuple SMVersion =
       getASTContext().getTargetInfo().getTriple().getOSVersion();
+  bool IsDXIL = getASTContext().getTargetInfo().getTriple().getArch() ==
+                llvm::Triple::dxil;
+
   uint32_t ZMax = 1024;
   uint32_t ThreadMax = 1024;
-  if (SMVersion.getMajor() <= 4) {
+  if (IsDXIL && SMVersion.getMajor() <= 4) {
     ZMax = 1;
     ThreadMax = 768;
-  } else if (SMVersion.getMajor() == 5) {
+  } else if (IsDXIL && SMVersion.getMajor() == 5) {
     ZMax = 64;
     ThreadMax = 1024;
   }
diff --git a/clang/test/SemaHLSL/num_threads.hlsl b/clang/test/SemaHLSL/num_threads.hlsl
index b5f9ad6c33cd..96200312bbf6 100644
--- a/clang/test/SemaHLSL/num_threads.hlsl
+++ b/clang/test/SemaHLSL/num_threads.hlsl
@@ -10,6 +10,8 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel5.0-compute -x hlsl -ast-dump -o - %s -DFAIL -verify
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel4.0-compute -x hlsl -ast-dump -o - %s -DFAIL -verify
 
+// RUN: %clang_cc1 -triple spirv-pc-vulkan1.3-compute -x hlsl -ast-dump -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
+
 #if __SHADER_TARGET_STAGE == __SHADER_STAGE_COMPUTE || __SHADER_TARGET_STAGE == __SHADER_STAGE_MESH || __SHADER_TARGET_STAGE == __SHADER_STAGE_AMPLIFICATION || __SHADER_TARGET_STAGE == __SHADER_STAGE_LIBRARY
 #ifdef FAIL
 
@@ -88,24 +90,30 @@ int entry() {
 
 // Because these two attributes match, they should both appear in the AST
 [numthreads(2,2,1)]
-// CHECK: HLSLNumThreadsAttr 0x{{[0-9a-fA-F]+}} <line:90:2, col:18> 2 2 1
+// CHECK: HLSLNumThreadsAttr 0x{{[0-9a-fA-F]+}} <line:{{[0-9]+}}:2, col:18> 2 2 1
 int secondFn();
 
 [numthreads(2,2,1)]
-// CHECK: HLSLNumThreadsAttr 0x{{[0-9a-fA-F]+}} <line:94:2, col:18> 2 2 1
+// CHECK: HLSLNumThreadsAttr 0x{{[0-9a-fA-F]+}} <line:{{[0-9]+}}:2, col:18> 2 2 1
 int secondFn() {
   return 1;
 }
 
 [numthreads(4,2,1)]
-// CHECK: HLSLNumThreadsAttr 0x{{[0-9a-fA-F]+}} <line:100:2, col:18> 4 2 1
+// CHECK: HLSLNumThreadsAttr 0x{{[0-9a-fA-F]+}} <line:{{[0-9]+}}:2, col:18> 4 2 1
 int onlyOnForwardDecl();
 
-// CHECK: HLSLNumThreadsAttr 0x{{[0-9a-fA-F]+}} <line:100:2, col:18> Inherited 4 2 1
+// CHECK: HLSLNumThreadsAttr 0x{{[0-9a-fA-F]+}} <line:{{[0-9]+}}:2, col:18> Inherited 4 2 1
 int onlyOnForwardDecl() {
   return 1;
 }
 
+#ifdef __spirv__ 
+[numthreads(4,2,128)]
+// CHECK-SPIRV: HLSLNumThreadsAttr 0x{{[0-9a-fA-F]+}} <line:{{[0-9]+}}:2, col:20> 4 2 128
+int largeZ();
+#endif 
+
 #else // Vertex and Pixel only beyond here
 // expected-error-re@+1 {{attribute 'numthreads' is unsupported in '{{[A-Za-z]+}}' shaders, requires one of the following: compute, amplification, mesh}}
 [numthreads(1,1,1)]

From cd91d0fff9293a904704784c92c28637bfebef45 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Mon, 23 Jun 2025 15:54:39 +0200
Subject: [PATCH 1284/1322] [mlir][py] Don't use a CMake iteration to find
 nanobind target to suppress warnings (NFC) (#143863)

Following approach suggested by @hpkfft.
---
 mlir/cmake/modules/AddMLIRPython.cmake | 47 ++++++++++++--------------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake
index c2616ea57ace..c14e614ed7d9 100644
--- a/mlir/cmake/modules/AddMLIRPython.cmake
+++ b/mlir/cmake/modules/AddMLIRPython.cmake
@@ -676,33 +676,28 @@ function(add_mlir_python_extension libname extname)
       # Avoid some warnings from upstream nanobind.
       # If a superproject set MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES, let
       # the super project handle compile options as it wishes.
-      set(nanobind_target "nanobind-static")
-      if (NOT TARGET ${nanobind_target})
-        # Get correct nanobind target name: nanobind-static-ft or something else
-        # It is set by nanobind_add_module function according to the passed options
-        get_property(all_targets DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY BUILDSYSTEM_TARGETS)
+      get_property(NB_LIBRARY_TARGET_NAME TARGET ${libname} PROPERTY LINK_LIBRARIES)
+      target_compile_options(${NB_LIBRARY_TARGET_NAME}
+	PRIVATE
+	  -Wall -Wextra -Wpedantic
+	  -Wno-c++98-compat-extra-semi
+	  -Wno-cast-qual
+	  -Wno-covered-switch-default
+	  -Wno-nested-anon-types
+	  -Wno-unused-parameter
+	  -Wno-zero-length-array
+	  ${eh_rtti_enable})
 
-        # Iterate over the list of targets
-        foreach(target ${all_targets})
-          # Check if the target name matches the given string
-          if("${target}" MATCHES "nanobind-")
-            set(nanobind_target "${target}")
-          endif()
-        endforeach()
-
-        if (NOT TARGET ${nanobind_target})
-          message(FATAL_ERROR "Could not find nanobind target to set compile options to")
-        endif()
-      endif()
-      target_compile_options(${nanobind_target}
-        PRIVATE
-          -Wno-cast-qual
-          -Wno-zero-length-array
-          -Wno-nested-anon-types
-          -Wno-c++98-compat-extra-semi
-          -Wno-covered-switch-default
-          ${eh_rtti_enable}
-      )
+      target_compile_options(${libname}
+	PRIVATE
+	  -Wall -Wextra -Wpedantic
+	  -Wno-c++98-compat-extra-semi
+	  -Wno-cast-qual
+	  -Wno-covered-switch-default
+	  -Wno-nested-anon-types
+	  -Wno-unused-parameter
+	  -Wno-zero-length-array
+	  ${eh_rtti_enable})
     endif()
 
     if(APPLE)

From d0e5d6fd6180b0f294a00cf48996219df97c9e78 Mon Sep 17 00:00:00 2001
From: Afanasyev Ivan <ivafanas@gmail.com>
Date: Mon, 23 Jun 2025 21:04:22 +0700
Subject: [PATCH 1285/1322] [CodeGen][CodeLayout] Fix segfault on access to
 deleted block in MBP. (#142357)

Problem 1: There is a typo which reassigns `BlockWorkList` to
`EHPadWorkList` on attempt to remove `RemBB` from work lists.

Problem 2: `Chain->UnscheduledPredecessors == 0` is an incorrect way to
check whether `RemBB` is enqueued or not. The root cause is a postponed
deletion of `WorkList` from already scheduled blocks in
`selectBestCandidateBlock`. Bug happens in the following scenario:

* `FunctionChain` is being processed with non-zero
`UnscheduledPredecessors`
* Block `B'` is added to the `BlockWorkList`
* Block `B'` is chosen as the best successor (`selectBestSuccessor`) for
some another block and added into `Chain`
* Block `B'` is removed by tail duplicator.

`RemovalCallback` erroneously won't erase `B'` from `BlockWorkList`,
because `UnscheduledPredecessors` value of `FunctionChain` is not zero
(and it is allowed to be non-zero).

Proposed solution is to always cleanup worklists on block deletion by
tail duplicator.
---
 llvm/lib/CodeGen/MachineBlockPlacement.cpp | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 2dbabfe345d5..e9c75f0753f8 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -3228,13 +3228,9 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
     // Signal to outer function
     Removed = true;
 
-    // Conservative default.
-    bool InWorkList = true;
     // Remove from the Chain and Chain Map
     if (auto It = BlockToChain.find(RemBB); It != BlockToChain.end()) {
-      BlockChain *Chain = It->second;
-      InWorkList = Chain->UnscheduledPredecessors == 0;
-      Chain->remove(RemBB);
+      It->second->remove(RemBB);
       BlockToChain.erase(It);
     }
 
@@ -3244,11 +3240,10 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
     }
 
     // Handle the Work Lists
-    if (InWorkList) {
-      SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
-      if (RemBB->isEHPad())
-        RemoveList = EHPadWorkList;
-      llvm::erase(RemoveList, RemBB);
+    if (RemBB->isEHPad()) {
+      llvm::erase(EHPadWorkList, RemBB);
+    } else {
+      llvm::erase(BlockWorkList, RemBB);
     }
 
     // Handle the filter set

From f4ca2231969add3f9044fe1d13bbd80cd3a089f0 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 23 Jun 2025 15:06:29 +0100
Subject: [PATCH 1286/1322] [LAA] Update early-exit test to cover last valid &
 first invalid access.

Make sure tests cover loops accessing the last valid and the first
invalid memory location. Note that the test
`@all_exits_dominate_latch_countable_exits_at_most_1000_iterations_kno`
has been modified to execute at most 1000 iterations; it previously
executed 1001 iterations.
---
 .../early-exit-runtime-checks.ll              | 123 +++++++++++++++++-
 1 file changed, 120 insertions(+), 3 deletions(-)

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
index e235b73daa6a..41db3802daff 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
@@ -59,6 +59,65 @@ e.2:
   ret void
 }
 
+define void @all_exits_dominate_latch_countable_exits_at_most_501_iterations_known_deref(ptr dereferenceable(2000) %A, ptr dereferenceable(2000) %B) {
+; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_501_iterations_known_deref'
+; CHECK-NEXT:    loop.header:
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group GRP0:
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+; CHECK-NEXT:        Against group GRP1:
+; CHECK-NEXT:          %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group GRP0:
+; CHECK-NEXT:          (Low: %B High: (2004 + %B))
+; CHECK-NEXT:            Member: {%B,+,4}<nuw><%loop.header>
+; CHECK-NEXT:        Group GRP1:
+; CHECK-NEXT:          (Low: %A High: (2004 + %A))
+; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+  %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+  %l = load i32, ptr %gep.A, align 4
+  store i32 0, ptr %gep.B, align 4
+  %cntable.c.1 = icmp ult i64 %iv, 1000
+  %iv.next = add nuw nsw i64 %iv, 1
+  br i1 %cntable.c.1, label %b2, label %e.1
+
+b2:
+  %uncntable.c.0 = icmp eq i32 %l, 0
+  br i1 %uncntable.c.0, label %e.2, label %b3
+
+b3:
+  %cntable.c.2 = icmp eq i64 %iv.next, 501
+  br i1 %cntable.c.2, label %cleanup4, label %latch
+
+latch:
+  br label %loop.header
+
+cleanup4:
+  ret void
+
+e.1:
+  ret void
+
+e.2:
+  ret void
+}
+
+
 define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_not_known_deref(ptr %A, ptr %B) {
 ; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_not_known_deref'
 ; CHECK-NEXT:    loop.header:
@@ -130,6 +189,64 @@ define i32 @all_exits_dominate_latch_countable_exits_at_most_1000_iterations_kno
 ; CHECK-NEXT:          %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-NEXT:        Group GRP0:
+; CHECK-NEXT:          (Low: %B High: (4000 + %B)<nuw>)
+; CHECK-NEXT:            Member: {%B,+,4}<nuw><%loop.header>
+; CHECK-NEXT:        Group GRP1:
+; CHECK-NEXT:          (Low: %A High: (4000 + %A)<nuw>)
+; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+  %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+  %l = load i32, ptr %gep.A, align 4
+  store i32 0, ptr %gep.B, align 4
+  %cntable.c.1 = icmp ult i64 %iv, 999
+  br i1 %cntable.c.1, label %b2, label %e.1
+
+b2:
+  %uncntable.c.0 = icmp eq i32 %l, 0
+  br i1 %uncntable.c.0, label %e.2, label %b3
+
+b3:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %cntable.c.2 = icmp eq i64 %iv.next, 2000
+  br i1 %cntable.c.2, label %e.0, label %latch
+
+latch:
+  br label %loop.header
+
+e.0:
+  ret i32 0
+
+e.1:
+  ret i32 1
+
+e.2:
+  ret i32 2
+}
+
+define i32 @all_exits_dominate_latch_countable_exits_at_most_1001_iterations_known_deref(ptr dereferenceable(4000) %A, ptr dereferenceable(4000) %B) {
+; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_1001_iterations_known_deref'
+; CHECK-NEXT:    loop.header:
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group GRP0:
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+; CHECK-NEXT:        Against group GRP1:
+; CHECK-NEXT:          %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group GRP0:
 ; CHECK-NEXT:          (Low: %B High: (4004 + %B))
 ; CHECK-NEXT:            Member: {%B,+,4}<nuw><%loop.header>
 ; CHECK-NEXT:        Group GRP1:
@@ -188,10 +305,10 @@ define i32 @all_exits_dominate_latch_countable_exits_at_most_1000_iterations_not
 ; CHECK-NEXT:          %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-NEXT:        Group GRP0:
-; CHECK-NEXT:          (Low: %B High: (4004 + %B))
+; CHECK-NEXT:          (Low: %B High: (4000 + %B))
 ; CHECK-NEXT:            Member: {%B,+,4}<nuw><%loop.header>
 ; CHECK-NEXT:        Group GRP1:
-; CHECK-NEXT:          (Low: %A High: (4004 + %A))
+; CHECK-NEXT:          (Low: %A High: (4000 + %A))
 ; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
@@ -208,7 +325,7 @@ loop.header:
   %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
   %l = load i32, ptr %gep.A, align 4
   store i32 0, ptr %gep.B, align 4
-  %cntable.c.1 = icmp ult i64 %iv, 1000
+  %cntable.c.1 = icmp ult i64 %iv, 999
   br i1 %cntable.c.1, label %b2, label %e.1
 
 b2:

From 6d17eb5126b352c7929e2cfcd2440c3bd70ec907 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Mon, 23 Jun 2025 07:09:48 -0700
Subject: [PATCH 1287/1322] [NFC][Clang][AST] Use `llvm::copy` instead of
 `memcpy` in StringLiteral (#145187)

---
 clang/include/clang/AST/Expr.h  | 15 +++------------
 clang/lib/AST/ASTImporter.cpp   |  6 +++---
 clang/lib/AST/Expr.cpp          | 21 +++++++++------------
 clang/lib/Sema/SemaExpr.cpp     | 14 ++++++--------
 clang/lib/Sema/SemaExprObjC.cpp |  3 +--
 5 files changed, 22 insertions(+), 37 deletions(-)

diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 41e50359962e..bc3d35935f30 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -1834,8 +1834,7 @@ class StringLiteral final
 
   /// Build a string literal.
   StringLiteral(const ASTContext &Ctx, StringRef Str, StringLiteralKind Kind,
-                bool Pascal, QualType Ty, const SourceLocation *Loc,
-                unsigned NumConcatenated);
+                bool Pascal, QualType Ty, ArrayRef<SourceLocation> Locs);
 
   /// Build an empty string literal.
   StringLiteral(EmptyShell Empty, unsigned NumConcatenated, unsigned Length,
@@ -1853,18 +1852,10 @@ class StringLiteral final
 
 public:
   /// This is the "fully general" constructor that allows representation of
-  /// strings formed from multiple concatenated tokens.
+  /// strings formed from one or more concatenated tokens.
   static StringLiteral *Create(const ASTContext &Ctx, StringRef Str,
                                StringLiteralKind Kind, bool Pascal, QualType Ty,
-                               const SourceLocation *Loc,
-                               unsigned NumConcatenated);
-
-  /// Simple constructor for string literals made from one token.
-  static StringLiteral *Create(const ASTContext &Ctx, StringRef Str,
-                               StringLiteralKind Kind, bool Pascal, QualType Ty,
-                               SourceLocation Loc) {
-    return Create(Ctx, Str, Kind, Pascal, Ty, &Loc, 1);
-  }
+                               ArrayRef<SourceLocation> Locs);
 
   /// Construct an empty string literal.
   static StringLiteral *CreateEmpty(const ASTContext &Ctx,
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 4621ebb854d8..3e4be21adb6b 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -7695,9 +7695,9 @@ ExpectedStmt ASTNodeImporter::VisitStringLiteral(StringLiteral *E) {
       E->tokloc_begin(), E->tokloc_end(), ToLocations.begin()))
     return std::move(Err);
 
-  return StringLiteral::Create(
-      Importer.getToContext(), E->getBytes(), E->getKind(), E->isPascal(),
-      *ToTypeOrErr, ToLocations.data(), ToLocations.size());
+  return StringLiteral::Create(Importer.getToContext(), E->getBytes(),
+                               E->getKind(), E->isPascal(), *ToTypeOrErr,
+                               ToLocations);
 }
 
 ExpectedStmt ASTNodeImporter::VisitCompoundLiteralExpr(CompoundLiteralExpr *E) {
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index c3722c65abf6..0914374b7ffd 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1123,14 +1123,13 @@ unsigned StringLiteral::mapCharByteWidth(TargetInfo const &Target,
 
 StringLiteral::StringLiteral(const ASTContext &Ctx, StringRef Str,
                              StringLiteralKind Kind, bool Pascal, QualType Ty,
-                             const SourceLocation *Loc,
-                             unsigned NumConcatenated)
+                             ArrayRef<SourceLocation> Locs)
     : Expr(StringLiteralClass, Ty, VK_LValue, OK_Ordinary) {
 
   unsigned Length = Str.size();
 
   StringLiteralBits.Kind = llvm::to_underlying(Kind);
-  StringLiteralBits.NumConcatenated = NumConcatenated;
+  StringLiteralBits.NumConcatenated = Locs.size();
 
   if (Kind != StringLiteralKind::Unevaluated) {
     assert(Ctx.getAsConstantArrayType(Ty) &&
@@ -1169,11 +1168,10 @@ StringLiteral::StringLiteral(const ASTContext &Ctx, StringRef Str,
 
   // Initialize the trailing array of SourceLocation.
   // This is safe since SourceLocation is POD-like.
-  std::memcpy(getTrailingObjects<SourceLocation>(), Loc,
-              NumConcatenated * sizeof(SourceLocation));
+  llvm::copy(Locs, getTrailingObjects<SourceLocation>());
 
   // Initialize the trailing array of char holding the string data.
-  std::memcpy(getTrailingObjects<char>(), Str.data(), Str.size());
+  llvm::copy(Str, getTrailingObjects<char>());
 
   setDependence(ExprDependence::None);
 }
@@ -1188,13 +1186,12 @@ StringLiteral::StringLiteral(EmptyShell Empty, unsigned NumConcatenated,
 
 StringLiteral *StringLiteral::Create(const ASTContext &Ctx, StringRef Str,
                                      StringLiteralKind Kind, bool Pascal,
-                                     QualType Ty, const SourceLocation *Loc,
-                                     unsigned NumConcatenated) {
+                                     QualType Ty,
+                                     ArrayRef<SourceLocation> Locs) {
   void *Mem = Ctx.Allocate(totalSizeToAlloc<unsigned, SourceLocation, char>(
-                               1, NumConcatenated, Str.size()),
+                               1, Locs.size(), Str.size()),
                            alignof(StringLiteral));
-  return new (Mem)
-      StringLiteral(Ctx, Str, Kind, Pascal, Ty, Loc, NumConcatenated);
+  return new (Mem) StringLiteral(Ctx, Str, Kind, Pascal, Ty, Locs);
 }
 
 StringLiteral *StringLiteral::CreateEmpty(const ASTContext &Ctx,
@@ -4406,7 +4403,7 @@ void ShuffleVectorExpr::setExprs(const ASTContext &C, ArrayRef<Expr *> Exprs) {
 
   this->ShuffleVectorExprBits.NumExprs = Exprs.size();
   SubExprs = new (C) Stmt *[ShuffleVectorExprBits.NumExprs];
-  memcpy(SubExprs, Exprs.data(), sizeof(Expr *) * Exprs.size());
+  llvm::copy(Exprs, SubExprs);
 }
 
 GenericSelectionExpr::GenericSelectionExpr(
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index fc2819458a4f..7307b01bb2df 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2070,9 +2070,9 @@ ExprResult Sema::ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks) {
   for (const Token &Tok : StringToks)
     StringTokLocs.push_back(Tok.getLocation());
 
-  StringLiteral *Lit = StringLiteral::Create(
-      Context, Literal.GetString(), StringLiteralKind::Unevaluated, false, {},
-      &StringTokLocs[0], StringTokLocs.size());
+  StringLiteral *Lit = StringLiteral::Create(Context, Literal.GetString(),
+                                             StringLiteralKind::Unevaluated,
+                                             false, {}, StringTokLocs);
 
   if (!Literal.getUDSuffix().empty()) {
     SourceLocation UDSuffixLoc =
@@ -2206,10 +2206,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
       Context.getStringLiteralArrayType(CharTy, Literal.GetNumStringChars());
 
   // Pass &StringTokLocs[0], StringTokLocs.size() to factory!
-  StringLiteral *Lit = StringLiteral::Create(Context, Literal.GetString(),
-                                             Kind, Literal.Pascal, StrTy,
-                                             &StringTokLocs[0],
-                                             StringTokLocs.size());
+  StringLiteral *Lit = StringLiteral::Create(
+      Context, Literal.GetString(), Kind, Literal.Pascal, StrTy, StringTokLocs);
   if (Literal.getUDSuffix().empty())
     return Lit;
 
@@ -3793,7 +3791,7 @@ ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
       Expr *Lit =
           StringLiteral::Create(Context, StringRef(TokSpelling.data(), Length),
                                 StringLiteralKind::Ordinary,
-                                /*Pascal*/ false, StrTy, &TokLoc, 1);
+                                /*Pascal*/ false, StrTy, TokLoc);
       return BuildLiteralOperatorCall(R, OpNameInfo, Lit, TokLoc);
     }
 
diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp
index 395f2f340dbd..e0662d82914f 100644
--- a/clang/lib/Sema/SemaExprObjC.cpp
+++ b/clang/lib/Sema/SemaExprObjC.cpp
@@ -74,8 +74,7 @@ ExprResult SemaObjC::ParseObjCStringLiteral(SourceLocation *AtLocs,
         CAT->getElementType(), llvm::APInt(32, StrBuf.size() + 1), nullptr,
         CAT->getSizeModifier(), CAT->getIndexTypeCVRQualifiers());
     S = StringLiteral::Create(Context, StrBuf, StringLiteralKind::Ordinary,
-                              /*Pascal=*/false, StrTy, &StrLocs[0],
-                              StrLocs.size());
+                              /*Pascal=*/false, StrTy, StrLocs);
   }
 
   return BuildObjCStringLiteral(AtLocs[0], S);

From bb8c42e859871aaf6bbec78dee28f124d47348dd Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Mon, 23 Jun 2025 15:27:49 +0100
Subject: [PATCH 1288/1322] [LV] Extend FindLastIV to unsigned case (#141752)

Split the FindLastIV RecurKind into SMax and UMax variants, depending on
the reduction op produced.
---
 llvm/include/llvm/Analysis/IVDescriptors.h    |  24 +-
 .../include/llvm/Transforms/Utils/LoopUtils.h |   3 +-
 llvm/lib/Analysis/IVDescriptors.cpp           |  60 ++--
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |   6 +-
 .../Transforms/Vectorize/SLPVectorizer.cpp    |   9 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  13 +-
 .../LoopVectorize/iv-select-cmp-trunc.ll      | 249 ++++++++++++---
 .../Transforms/LoopVectorize/iv-select-cmp.ll | 286 ++++++++++++++----
 8 files changed, 513 insertions(+), 137 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 3b627a514085..463249461483 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -54,9 +54,12 @@ enum class RecurKind {
   FMulAdd,  ///< Sum of float products with llvm.fmuladd(a * b + sum).
   AnyOf,    ///< AnyOf reduction with select(cmp(),x,y) where one of (x,y) is
             ///< loop invariant, and both x and y are integer type.
-  FindLastIV, ///< FindLast reduction with select(cmp(),x,y) where one of
-              ///< (x,y) is increasing loop induction, and both x and y are
-              ///< integer type.
+  FindLastIVSMax, ///< FindLast reduction with select(cmp(),x,y) where one of
+                  ///< (x,y) is increasing loop induction, and both x and y
+                  ///< are integer type, producing a SMax reduction.
+  FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of
+                  ///< (x,y) is increasing loop induction, and both x and y
+                  ///< are integer type, producing a UMax reduction.
   // clang-format on
   // TODO: Any_of and FindLast reduction need not be restricted to integer type
   // only.
@@ -259,7 +262,14 @@ public:
   /// Returns true if the recurrence kind is of the form
   ///   select(cmp(),x,y) where one of (x,y) is increasing loop induction.
   static bool isFindLastIVRecurrenceKind(RecurKind Kind) {
-    return Kind == RecurKind::FindLastIV;
+    return Kind == RecurKind::FindLastIVSMax ||
+           Kind == RecurKind::FindLastIVUMax;
+  }
+
+  /// Returns true if recurrece kind is a signed redux kind.
+  static bool isSignedRecurrenceKind(RecurKind Kind) {
+    return Kind == RecurKind::SMax || Kind == RecurKind::SMin ||
+           Kind == RecurKind::FindLastIVSMax;
   }
 
   /// Returns the type of the recurrence. This type can be narrower than the
@@ -271,8 +281,10 @@ public:
   Value *getSentinelValue() const {
     assert(isFindLastIVRecurrenceKind(Kind) && "Unexpected recurrence kind");
     Type *Ty = StartValue->getType();
-    return ConstantInt::get(Ty,
-                            APInt::getSignedMinValue(Ty->getIntegerBitWidth()));
+    unsigned BW = Ty->getIntegerBitWidth();
+    return ConstantInt::get(Ty, isSignedRecurrenceKind(Kind)
+                                    ? APInt::getSignedMinValue(BW)
+                                    : APInt::getMinValue(BW));
   }
 
   /// Returns a reference to the instructions used for type-promoting the
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 12be3bad04d3..e4d2f9d19170 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -434,7 +434,8 @@ LLVM_ABI Value *createAnyOfReduction(IRBuilderBase &B, Value *Src,
 /// Create a reduction of the given vector \p Src for a reduction of the
 /// kind RecurKind::FindLastIV.
 LLVM_ABI Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src,
-                                          Value *Start, Value *Sentinel);
+                                          RecurKind RdxKind, Value *Start,
+                                          Value *Sentinel);
 
 /// Create an ordered reduction intrinsic using the given recurrence
 /// kind \p RdxKind.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 7232283b9101..c8e97e5ec0e5 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -50,7 +50,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
   case RecurKind::UMax:
   case RecurKind::UMin:
   case RecurKind::AnyOf:
-  case RecurKind::FindLastIV:
+  case RecurKind::FindLastIVSMax:
+  case RecurKind::FindLastIVUMax:
     return true;
   }
   return false;
@@ -700,47 +701,59 @@ RecurrenceDescriptor::isFindLastIVPattern(Loop *TheLoop, PHINode *OrigPhi,
                                      m_Value(NonRdxPhi)))))
     return InstDesc(false, I);
 
-  auto IsIncreasingLoopInduction = [&](Value *V) {
+  // Returns a non-nullopt boolean indicating the signedness of the recurrence
+  // when a valid FindLastIV pattern is found.
+  auto GetRecurKind = [&](Value *V) -> std::optional<RecurKind> {
     Type *Ty = V->getType();
     if (!SE.isSCEVable(Ty))
-      return false;
+      return std::nullopt;
 
     auto *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(V));
     if (!AR || AR->getLoop() != TheLoop)
-      return false;
+      return std::nullopt;
 
     const SCEV *Step = AR->getStepRecurrence(SE);
     if (!SE.isKnownPositive(Step))
-      return false;
+      return std::nullopt;
 
-    const ConstantRange IVRange = SE.getSignedRange(AR);
-    unsigned NumBits = Ty->getIntegerBitWidth();
     // Keep the minimum value of the recurrence type as the sentinel value.
     // The maximum acceptable range for the increasing induction variable,
     // called the valid range, will be defined as
     //   [<sentinel value> + 1, <sentinel value>)
-    // where <sentinel value> is SignedMin(<recurrence type>)
+    // where <sentinel value> is [Signed|Unsigned]Min(<recurrence type>)
     // TODO: This range restriction can be lifted by adding an additional
     // virtual OR reduction.
-    const APInt Sentinel = APInt::getSignedMinValue(NumBits);
-    const ConstantRange ValidRange =
-        ConstantRange::getNonEmpty(Sentinel + 1, Sentinel);
-    LLVM_DEBUG(dbgs() << "LV: FindLastIV valid range is " << ValidRange
-                      << ", and the signed range of " << *AR << " is "
-                      << IVRange << "\n");
-    // Ensure the induction variable does not wrap around by verifying that its
-    // range is fully contained within the valid range.
-    return ValidRange.contains(IVRange);
+    auto CheckRange = [&](bool IsSigned) {
+      const ConstantRange IVRange =
+          IsSigned ? SE.getSignedRange(AR) : SE.getUnsignedRange(AR);
+      unsigned NumBits = Ty->getIntegerBitWidth();
+      const APInt Sentinel = IsSigned ? APInt::getSignedMinValue(NumBits)
+                                      : APInt::getMinValue(NumBits);
+      const ConstantRange ValidRange =
+          ConstantRange::getNonEmpty(Sentinel + 1, Sentinel);
+      LLVM_DEBUG(dbgs() << "LV: FindLastIV valid range is " << ValidRange
+                        << ", and the range of " << *AR << " is " << IVRange
+                        << "\n");
+
+      // Ensure the induction variable does not wrap around by verifying that
+      // its range is fully contained within the valid range.
+      return ValidRange.contains(IVRange);
+    };
+    if (CheckRange(true))
+      return RecurKind::FindLastIVSMax;
+    if (CheckRange(false))
+      return RecurKind::FindLastIVUMax;
+    return std::nullopt;
   };
 
   // We are looking for selects of the form:
   //   select(cmp(), phi, increasing_loop_induction) or
   //   select(cmp(), increasing_loop_induction, phi)
   // TODO: Support for monotonically decreasing induction variable
-  if (!IsIncreasingLoopInduction(NonRdxPhi))
-    return InstDesc(false, I);
+  if (auto RK = GetRecurKind(NonRdxPhi))
+    return InstDesc(I, *RK);
 
-  return InstDesc(I, RecurKind::FindLastIV);
+  return InstDesc(false, I);
 }
 
 RecurrenceDescriptor::InstDesc
@@ -985,8 +998,8 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
                       << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::FindLastIV, TheLoop, FMF, RedDes, DB, AC,
-                      DT, SE)) {
+  if (AddReductionVar(Phi, RecurKind::FindLastIVSMax, TheLoop, FMF, RedDes, DB,
+                      AC, DT, SE)) {
     LLVM_DEBUG(dbgs() << "Found a FindLastIV reduction PHI." << *Phi << "\n");
     return true;
   }
@@ -1137,7 +1150,8 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
   case RecurKind::Mul:
     return Instruction::Mul;
   case RecurKind::AnyOf:
-  case RecurKind::FindLastIV:
+  case RecurKind::FindLastIVSMax:
+  case RecurKind::FindLastIVUMax:
   case RecurKind::Or:
     return Instruction::Or;
   case RecurKind::And:
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index cf6b183c78ac..c50bb4a497c6 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1224,9 +1224,11 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src,
 }
 
 Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src,
-                                       Value *Start, Value *Sentinel) {
+                                       RecurKind RdxKind, Value *Start,
+                                       Value *Sentinel) {
+  bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RdxKind);
   Value *MaxRdx = Src->getType()->isVectorTy()
-                      ? Builder.CreateIntMaxReduce(Src, true)
+                      ? Builder.CreateIntMaxReduce(Src, IsSigned)
                       : Src;
   // Correct the final reduction result back to the start value if the maximum
   // reduction is sentinel value.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cb65c225dcdb..27a7538ecd93 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23180,7 +23180,8 @@ private:
         case RecurKind::FMul:
         case RecurKind::FMulAdd:
         case RecurKind::AnyOf:
-        case RecurKind::FindLastIV:
+        case RecurKind::FindLastIVSMax:
+        case RecurKind::FindLastIVUMax:
         case RecurKind::FMaximumNum:
         case RecurKind::FMinimumNum:
         case RecurKind::None:
@@ -23314,7 +23315,8 @@ private:
     case RecurKind::FMul:
     case RecurKind::FMulAdd:
     case RecurKind::AnyOf:
-    case RecurKind::FindLastIV:
+    case RecurKind::FindLastIVSMax:
+    case RecurKind::FindLastIVUMax:
     case RecurKind::FMaximumNum:
     case RecurKind::FMinimumNum:
     case RecurKind::None:
@@ -23413,7 +23415,8 @@ private:
     case RecurKind::FMul:
     case RecurKind::FMulAdd:
     case RecurKind::AnyOf:
-    case RecurKind::FindLastIV:
+    case RecurKind::FindLastIVSMax:
+    case RecurKind::FindLastIVUMax:
     case RecurKind::FMaximumNum:
     case RecurKind::FMinimumNum:
     case RecurKind::None:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3e12fdf9163e..805cd04c5ce3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -642,7 +642,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
     // Get its reduction variable descriptor.
     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-    [[maybe_unused]] RecurKind RK = RdxDesc.getRecurrenceKind();
+    RecurKind RK = RdxDesc.getRecurrenceKind();
     assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
            "Unexpected reduction kind");
     assert(!PhiR->isInLoop() &&
@@ -652,14 +652,17 @@ Value *VPInstruction::generate(VPTransformState &State) {
     // sentinel value, followed by one operand for each part of the reduction.
     unsigned UF = getNumOperands() - 3;
     Value *ReducedPartRdx = State.get(getOperand(3));
-    for (unsigned Part = 1; Part < UF; ++Part) {
-      ReducedPartRdx = createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx,
+    RecurKind MinMaxKind = RecurrenceDescriptor::isSignedRecurrenceKind(RK)
+                               ? RecurKind::SMax
+                               : RecurKind::UMax;
+    for (unsigned Part = 1; Part < UF; ++Part)
+      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                       State.get(getOperand(3 + Part)));
-    }
 
     Value *Start = State.get(getOperand(1), true);
     Value *Sentinel = getOperand(2)->getLiveInIRValue();
-    return createFindLastIVReduction(Builder, ReducedPartRdx, Start, Sentinel);
+    return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start,
+                                     Sentinel);
   }
   case VPInstruction::ComputeReductionResult: {
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
index e4597ebfe7dc..6a2e3df50e69 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4
@@ -623,6 +623,207 @@ exit:                                            ; preds = %for.body
   ret i32 %spec.select
 }
 
+; The construct that are introduced by IndVarSimplify is:
+;   %1 = trunc i64 %iv to i32
+; The loop exit condition is a constant that overflows signed i32,
+; but not unsigned i32:
+;   %exitcond.not = icmp eq i64 %inc, 4294967294
+; Hence, we can vectorize with the unsigned variant of FindLastIV.
+define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) {
+; CHECK-VF4IC1-LABEL: define i32 @select_icmp_truncated_unsigned_iv_range(
+; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) {
+; CHECK-VF4IC1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1:       [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1:       [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 2147483646, i32 2147483647, i32 -2147483648, i32 -2147483647>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[IV:%.*]] = add i64 2147483646, [[INDEX]]
+; CHECK-VF4IC1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0
+; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF4IC1-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP5]], i32 331
+; CHECK-VF4IC1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1:       [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967294, %[[MIDDLE_BLOCK]] ], [ 2147483646, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC1:       [[FOR_BODY]]:
+; CHECK-VF4IC1-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-VF4IC1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3
+; CHECK-VF4IC1-NEXT:    [[CONV:%.*]] = trunc i64 [[IV1]] to i32
+; CHECK-VF4IC1-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
+; CHECK-VF4IC1-NEXT:    [[INC]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-VF4IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294
+; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC1:       [[EXIT]]:
+; CHECK-VF4IC1-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT:    ret i32 [[SPEC_SELECT_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_icmp_truncated_unsigned_iv_range(
+; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) {
+; CHECK-VF4IC4-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4:       [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4:       [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 2147483646, i32 2147483647, i32 -2147483648, i32 -2147483647>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-VF4IC4-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
+; CHECK-VF4IC4-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
+; CHECK-VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2147483646, [[INDEX]]
+; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
+; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4
+; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8
+; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-VF4IC4-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT:    [[TMP6:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD4]], splat (i32 3)
+; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD5]], splat (i32 3)
+; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD6]], splat (i32 3)
+; CHECK-VF4IC4-NEXT:    [[TMP9]] = select <4 x i1> [[TMP5]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT:    [[TMP10]] = select <4 x i1> [[TMP6]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT:    [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT:    [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
+; CHECK-VF4IC4-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF4IC4-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX7:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP11]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX8:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[RDX_MINMAX7]], <4 x i32> [[TMP12]])
+; CHECK-VF4IC4-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[RDX_MINMAX8]])
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP14]], 0
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP14]], i32 331
+; CHECK-VF4IC4-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4:       [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967294, %[[MIDDLE_BLOCK]] ], [ 2147483646, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC4:       [[FOR_BODY]]:
+; CHECK-VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF4IC4-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP15]], 3
+; CHECK-VF4IC4-NEXT:    [[CONV:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF4IC4-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
+; CHECK-VF4IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294
+; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC4:       [[EXIT]]:
+; CHECK-VF4IC4-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT:    ret i32 [[SPEC_SELECT_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_icmp_truncated_unsigned_iv_range(
+; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) {
+; CHECK-VF1IC4-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4:       [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4:       [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[VEC_PHI3:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 2147483646, [[INDEX]]
+; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-VF1IC4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-VF1IC4-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-VF1IC4-NEXT:    [[OFFSET_IDX4:%.*]] = add i64 2147483646, [[INDEX]]
+; CHECK-VF1IC4-NEXT:    [[TMP3:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32
+; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
+; CHECK-VF1IC4-NEXT:    [[TMP5:%.*]] = add i32 [[TMP3]], 2
+; CHECK-VF1IC4-NEXT:    [[TMP6:%.*]] = add i32 [[TMP3]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF1IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP0]]
+; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; CHECK-VF1IC4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-VF1IC4-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = icmp sgt i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP12]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP13]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP18:%.*]] = icmp sgt i32 [[TMP14]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP19]] = select i1 [[TMP15]], i32 [[TMP3]], i32 [[VEC_PHI]]
+; CHECK-VF1IC4-NEXT:    [[TMP20]] = select i1 [[TMP16]], i32 [[TMP4]], i32 [[VEC_PHI1]]
+; CHECK-VF1IC4-NEXT:    [[TMP21]] = select i1 [[TMP17]], i32 [[TMP5]], i32 [[VEC_PHI2]]
+; CHECK-VF1IC4-NEXT:    [[TMP22]] = select i1 [[TMP18]], i32 [[TMP6]], i32 [[VEC_PHI3]]
+; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF1IC4-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648
+; CHECK-VF1IC4-NEXT:    br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF1IC4:       [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP19]], i32 [[TMP20]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX5:%.*]] = call i32 @llvm.umax.i32(i32 [[RDX_MINMAX]], i32 [[TMP21]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX6:%.*]] = call i32 @llvm.umax.i32(i32 [[RDX_MINMAX5]], i32 [[TMP22]])
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX6]], 0
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX6]], i32 331
+; CHECK-VF1IC4-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4:       [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967294, %[[MIDDLE_BLOCK]] ], [ 2147483646, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF1IC4:       [[FOR_BODY]]:
+; CHECK-VF1IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF1IC4-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP24]], 3
+; CHECK-VF1IC4-NEXT:    [[CONV:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-VF1IC4-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
+; CHECK-VF1IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294
+; CHECK-VF1IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF1IC4:       [[EXIT]]:
+; CHECK-VF1IC4-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC4-NEXT:    ret i32 [[SPEC_SELECT_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 2147483646, %entry ], [ %inc, %for.body ]
+  %rdx = phi i32 [ 331, %entry ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp = icmp sgt i32 %0, 3
+  %conv = trunc i64 %iv to i32
+  %spec.select = select i1 %cmp, i32 %conv, i32 %rdx
+  %inc = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %inc, 4294967294
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i32 %spec.select
+}
+
 ; Negative tests
 
 ; This test can theoretically be vectorized, but only with a runtime-check.
@@ -844,7 +1045,7 @@ exit:                                             ; preds = %for.body, %entry
 ; The construct that are introduced by IndVarSimplify is:
 ;   %1 = trunc i64 %iv to i32
 ; However, the loop exit condition is a constant that overflows i32:
-;   %exitcond.not = icmp eq i64 %inc, 4294967294
+;   %exitcond.not = icmp eq i64 %inc, 9223372036854775806
 ; Hence, the i32 will most certainly wrap and hit the sentinel value, and we
 ; cannot vectorize this case.
 define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
@@ -853,7 +1054,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
 ; CHECK-VF4IC1-NEXT:  [[ENTRY:.*]]:
 ; CHECK-VF4IC1-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK-VF4IC1:       [[FOR_BODY]]:
-; CHECK-VF4IC1-NEXT:    [[IV:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
@@ -861,7 +1062,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
 ; CHECK-VF4IC1-NEXT:    [[CONV:%.*]] = trunc i64 [[IV]] to i32
 ; CHECK-VF4IC1-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
 ; CHECK-VF4IC1-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-VF4IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294
+; CHECK-VF4IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
 ; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
 ; CHECK-VF4IC1:       [[EXIT]]:
 ; CHECK-VF4IC1-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
@@ -872,7 +1073,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
 ; CHECK-VF4IC4-NEXT:  [[ENTRY:.*]]:
 ; CHECK-VF4IC4-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK-VF4IC4:       [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
@@ -880,7 +1081,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
 ; CHECK-VF4IC4-NEXT:    [[CONV:%.*]] = trunc i64 [[IV]] to i32
 ; CHECK-VF4IC4-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
 ; CHECK-VF4IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-VF4IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294
+; CHECK-VF4IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
 ; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
 ; CHECK-VF4IC4:       [[EXIT]]:
 ; CHECK-VF4IC4-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
@@ -891,7 +1092,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
 ; CHECK-VF1IC4-NEXT:  [[ENTRY:.*]]:
 ; CHECK-VF1IC4-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK-VF1IC4:       [[FOR_BODY]]:
-; CHECK-VF1IC4-NEXT:    [[IV:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
@@ -899,7 +1100,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) {
 ; CHECK-VF1IC4-NEXT:    [[CONV:%.*]] = trunc i64 [[IV]] to i32
 ; CHECK-VF1IC4-NEXT:    [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]]
 ; CHECK-VF1IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-VF1IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294
+; CHECK-VF1IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
 ; CHECK-VF1IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
 ; CHECK-VF1IC4:       [[EXIT]]:
 ; CHECK-VF1IC4-NEXT:    [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ]
@@ -909,7 +1110,7 @@ entry:
   br label %for.body
 
 for.body:                                         ; preds = %entry, %for.body
-  %iv = phi i64 [ 2147483646, %entry ], [ %inc, %for.body ]
+  %iv = phi i64 [ 4294967294, %entry ], [ %inc, %for.body ]
   %rdx = phi i32 [ 331, %entry ], [ %spec.select, %for.body ]
   %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
@@ -917,7 +1118,7 @@ for.body:                                         ; preds = %entry, %for.body
   %conv = trunc i64 %iv to i32
   %spec.select = select i1 %cmp, i32 %conv, i32 %rdx
   %inc = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %inc, 4294967294
+  %exitcond.not = icmp eq i64 %inc, 9223372036854775806
   br i1 %exitcond.not, label %exit, label %for.body
 
 exit:                                             ; preds = %for.body
@@ -1210,31 +1411,3 @@ exit:                                             ; preds = %for.body, %entry
   %rdx.0.lcssa = phi i16 [ %start, %entry ], [ %cond, %for.body ]
   ret i16 %rdx.0.lcssa
 }
-;.
-; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-VF4IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-VF4IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-;.
-; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-VF4IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-VF4IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-;.
-; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-VF1IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-VF1IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
-; CHECK-VF1IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-VF1IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
-; CHECK-VF1IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-VF1IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index 07c720e9dcb1..eab5d5ea9b1f 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4
@@ -1665,6 +1665,232 @@ exit:                                             ; preds = %for.body
   ret i64 %cond
 }
 
+define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) {
+; CHECK-VF4IC1-LABEL: define i64 @select_icmp_unsigned_iv_range(
+; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; CHECK-VF4IC1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1:       [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1:       [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0
+; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; CHECK-VF4IC1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; CHECK-VF4IC1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC1-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-VF4IC1-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372036854775804
+; CHECK-VF4IC1-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-VF4IC1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP5]])
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-VF4IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP7]], i64 [[RDX_START]]
+; CHECK-VF4IC1-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1:       [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -4, %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i64 [ 9223372036854775804, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC1:       [[FOR_BODY]]:
+; CHECK-VF4IC1-NEXT:    [[IV_J:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC3:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[IV_I:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
+; CHECK-VF4IC1-NEXT:    [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-VF4IC1-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
+; CHECK-VF4IC1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-VF4IC1-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP9]], [[TMP10]]
+; CHECK-VF4IC1-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]]
+; CHECK-VF4IC1-NEXT:    [[INC]] = add nuw nsw i64 [[IV_I]], 1
+; CHECK-VF4IC1-NEXT:    [[INC3]] = add nsw i64 [[IV_J]], 1
+; CHECK-VF4IC1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
+; CHECK-VF4IC1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-VF4IC1:       [[EXIT]]:
+; CHECK-VF4IC1-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT:    ret i64 [[COND_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i64 @select_icmp_unsigned_iv_range(
+; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; CHECK-VF4IC4-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4:       [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4:       [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-VF4IC4-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
+; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0
+; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 4
+; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 8
+; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 12
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-VF4IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4IC4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-VF4IC4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 4
+; CHECK-VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 8
+; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 12
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-VF4IC4-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; CHECK-VF4IC4-NEXT:    [[TMP10:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD7]]
+; CHECK-VF4IC4-NEXT:    [[TMP11:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
+; CHECK-VF4IC4-NEXT:    [[TMP12:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD5]], [[WIDE_LOAD9]]
+; CHECK-VF4IC4-NEXT:    [[TMP13:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD6]], [[WIDE_LOAD10]]
+; CHECK-VF4IC4-NEXT:    [[TMP14]] = select <4 x i1> [[TMP10]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT:    [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-VF4IC4-NEXT:    [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; CHECK-VF4IC4-NEXT:    [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]]
+; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-VF4IC4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4)
+; CHECK-VF4IC4-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372036854775792
+; CHECK-VF4IC4-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-VF4IC4:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP14]], <4 x i64> [[TMP15]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX11:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP16]])
+; CHECK-VF4IC4-NEXT:    [[RDX_MINMAX12:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[RDX_MINMAX11]], <4 x i64> [[TMP17]])
+; CHECK-VF4IC4-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[RDX_MINMAX12]])
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP19]], 0
+; CHECK-VF4IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP19]], i64 [[RDX_START]]
+; CHECK-VF4IC4-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4:       [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -16, %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i64 [ 9223372036854775792, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC4:       [[FOR_BODY]]:
+; CHECK-VF4IC4-NEXT:    [[IV_J:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC3:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[IV_I:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
+; CHECK-VF4IC4-NEXT:    [[TMP21:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-VF4IC4-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
+; CHECK-VF4IC4-NEXT:    [[TMP22:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-VF4IC4-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP21]], [[TMP22]]
+; CHECK-VF4IC4-NEXT:    [[COND]] = select i1 [[CMP2]], i64 [[IV_J]], i64 [[RDX]]
+; CHECK-VF4IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV_I]], 1
+; CHECK-VF4IC4-NEXT:    [[INC3]] = add nsw i64 [[IV_J]], 1
+; CHECK-VF4IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
+; CHECK-VF4IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-VF4IC4:       [[EXIT]]:
+; CHECK-VF4IC4-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT:    ret i64 [[COND_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i64 @select_icmp_unsigned_iv_range(
+; CHECK-VF1IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) {
+; CHECK-VF1IC4-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4:       [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4:       [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT:    [[IV_I:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[VEC_PHI1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[VEC_PHI2:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP28:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[VEC_PHI3:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 -9223372036854775808, [[IV_I]]
+; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-VF1IC4-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-VF1IC4-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP3:%.*]] = add i64 [[IV_I]], 1
+; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = add i64 [[IV_I]], 2
+; CHECK-VF1IC4-NEXT:    [[TMP5:%.*]] = add i64 [[IV_I]], 3
+; CHECK-VF1IC4-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]]
+; CHECK-VF1IC4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
+; CHECK-VF1IC4-NEXT:    [[TMP36:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-VF1IC4-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]]
+; CHECK-VF1IC4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP5]]
+; CHECK-VF1IC4-NEXT:    [[TMP37:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP16]], align 8
+; CHECK-VF1IC4-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP17]], align 8
+; CHECK-VF1IC4-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[TMP36]], [[TMP37]]
+; CHECK-VF1IC4-NEXT:    [[TMP23:%.*]] = icmp sgt i64 [[TMP11]], [[TMP19]]
+; CHECK-VF1IC4-NEXT:    [[TMP24:%.*]] = icmp sgt i64 [[TMP12]], [[TMP20]]
+; CHECK-VF1IC4-NEXT:    [[TMP25:%.*]] = icmp sgt i64 [[TMP13]], [[TMP21]]
+; CHECK-VF1IC4-NEXT:    [[TMP26]] = select i1 [[CMP2]], i64 [[OFFSET_IDX]], i64 [[VEC_PHI]]
+; CHECK-VF1IC4-NEXT:    [[TMP27]] = select i1 [[TMP23]], i64 [[TMP0]], i64 [[VEC_PHI1]]
+; CHECK-VF1IC4-NEXT:    [[TMP28]] = select i1 [[TMP24]], i64 [[TMP1]], i64 [[VEC_PHI2]]
+; CHECK-VF1IC4-NEXT:    [[TMP29]] = select i1 [[TMP25]], i64 [[TMP2]], i64 [[VEC_PHI3]]
+; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV_I]], 4
+; CHECK-VF1IC4-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372036854775804
+; CHECK-VF1IC4-NEXT:    br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-VF1IC4:       [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP26]], i64 [[TMP27]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX4:%.*]] = call i64 @llvm.umax.i64(i64 [[RDX_MINMAX]], i64 [[TMP28]])
+; CHECK-VF1IC4-NEXT:    [[RDX_MINMAX5:%.*]] = call i64 @llvm.umax.i64(i64 [[RDX_MINMAX4]], i64 [[TMP29]])
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], 0
+; CHECK-VF1IC4-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 [[RDX_START]]
+; CHECK-VF1IC4-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4:       [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -4, %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i64 [ 9223372036854775804, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF1IC4:       [[FOR_BODY]]:
+; CHECK-VF1IC4-NEXT:    [[IV_J:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC3:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[IV_I1:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ]
+; CHECK-VF1IC4-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I1]]
+; CHECK-VF1IC4-NEXT:    [[TMP31:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; CHECK-VF1IC4-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I1]]
+; CHECK-VF1IC4-NEXT:    [[TMP32:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-VF1IC4-NEXT:    [[CMP3:%.*]] = icmp sgt i64 [[TMP31]], [[TMP32]]
+; CHECK-VF1IC4-NEXT:    [[COND]] = select i1 [[CMP3]], i64 [[IV_J]], i64 [[RDX]]
+; CHECK-VF1IC4-NEXT:    [[INC]] = add nuw nsw i64 [[IV_I1]], 1
+; CHECK-VF1IC4-NEXT:    [[INC3]] = add nsw i64 [[IV_J]], 1
+; CHECK-VF1IC4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
+; CHECK-VF1IC4-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-VF1IC4:       [[EXIT]]:
+; CHECK-VF1IC4-NEXT:    [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC4-NEXT:    ret i64 [[COND_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv.j = phi i64 [ 9223372036854775808, %entry], [ %inc3, %for.body ]
+  %iv.i = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %rdx = phi i64 [ %rdx.start, %entry ], [ %cond, %for.body ]
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv.i
+  %0 = load i64, ptr %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %iv.i
+  %1 = load i64, ptr %arrayidx1, align 8
+  %cmp2 = icmp sgt i64 %0, %1
+  %cond = select i1 %cmp2, i64 %iv.j, i64 %rdx
+  %inc = add nuw nsw i64 %iv.i, 1
+  %inc3 = add nsw i64 %iv.j, 1
+  %exitcond.not = icmp eq i64 %inc, 9223372036854775806
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i64 %cond
+}
+
 ; Negative tests
 
 define float @not_vectorized_select_float_induction_icmp(ptr %a, ptr %b, float %rdx.start, i64 %n) {
@@ -1927,61 +2153,3 @@ for.body:                                         ; preds = %entry, %for.body
 exit:                                             ; preds = %for.body
   ret i64 %cond
 }
-;.
-; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-VF4IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-VF4IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-VF4IC1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-VF4IC1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-VF4IC1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-VF4IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-VF4IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-VF4IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK-VF4IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
-; CHECK-VF4IC1: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
-; CHECK-VF4IC1: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
-;.
-; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-; CHECK-VF4IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-VF4IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK-VF4IC4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-VF4IC4: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
-; CHECK-VF4IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-VF4IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK-VF4IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
-; CHECK-VF4IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK-VF4IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
-; CHECK-VF4IC4: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
-; CHECK-VF4IC4: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
-;.
-; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-VF1IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-VF1IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
-; CHECK-VF1IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK-VF1IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
-; CHECK-VF1IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK-VF1IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
-; CHECK-VF1IC4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK-VF1IC4: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
-; CHECK-VF1IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK-VF1IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
-; CHECK-VF1IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK-VF1IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
-; CHECK-VF1IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK-VF1IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
-; CHECK-VF1IC4: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
-; CHECK-VF1IC4: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]}
-;.

From a50cb6ca3e125a2920d38c98d517393d1a5828d2 Mon Sep 17 00:00:00 2001
From: Brox Chen <guochen2@amd.com>
Date: Mon, 23 Jun 2025 10:46:59 -0400
Subject: [PATCH 1289/1322] [AMDGPU][True16][CodeGen] fix a predicate bug in
 VGPRImm with f16/bf16 (#144942)

Fixed a typo issue that f16/bf16 VGPRImm patterrn is not guarded by the
True16Predicate scope. The curly bracket is misplaced
---
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  18 +--
 .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll    |  39 +++---
 .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll     |  20 +--
 .../CodeGen/AMDGPU/extract-subvector-16bit.ll |  38 +++---
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll         |   4 +-
 llvm/test/CodeGen/AMDGPU/fminimum3.ll         |   4 +-
 llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll     | 116 +++++++++---------
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  10 +-
 llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll        |  34 +++--
 .../AMDGPU/select-fabs-fneg-extract.f16.ll    |  40 +++---
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll  |  60 ++++-----
 .../CodeGen/AMDGPU/vector-reduce-fmaximum.ll  |   4 +-
 .../CodeGen/AMDGPU/vector-reduce-fminimum.ll  |   4 +-
 13 files changed, 202 insertions(+), 189 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f85df5598534..7b45023dd3c7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2200,17 +2200,17 @@ foreach pred = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in {
       (VGPRImm<(i16 imm)>:$imm),
       (V_MOV_B32_e32 imm:$imm)
     >;
-  }
 
-  // FIXME: Workaround for ordering issue with peephole optimizer where
-  // a register class copy interferes with immediate folding.  Should
-  // use s_mov_b32, which can be shrunk to s_movk_i32
+    // FIXME: Workaround for ordering issue with peephole optimizer where
+    // a register class copy interferes with immediate folding.  Should
+    // use s_mov_b32, which can be shrunk to s_movk_i32
 
-  foreach vt = [f16, bf16] in {
-    def : GCNPat <
-      (VGPRImm<(vt fpimm)>:$imm),
-      (V_MOV_B32_e32 (vt (bitcast_fpimm_to_i32 $imm)))
-    >;
+    foreach vt = [f16, bf16] in {
+      def : GCNPat <
+        (VGPRImm<(vt fpimm)>:$imm),
+        (V_MOV_B32_e32 (vt (bitcast_fpimm_to_i32 $imm)))
+      >;
+    }
   }
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index 6ec9c1177c18..ba843ddd7bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -150,32 +150,33 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, v3, v7 :: v_dual_mov_b32 v3, 0x7fc0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v6, v1, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v4, v5 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x7fc0
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.h
 ; GFX11-TRUE16-NEXT:  .LBB0_2: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index a511233af070..738bad7ad180 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1845,11 +1845,11 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x4000
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x3c00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v1, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3c00, v2.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3c00, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, 0x4000, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v1
@@ -2002,11 +2002,11 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x3800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x3c00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v1, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3c00, v2.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3c00, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, 0x3800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, 0x3800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v1
@@ -2212,10 +2212,10 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test6:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4200
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x4200, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0xc800, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2320,10 +2320,10 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test7:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0xc400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x4800, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index dbbe43152e0d..bb66bb319d48 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -524,16 +524,16 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:  .LBB2_3: ; %exit
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3d00
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3900
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2.h
 ; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v3.l
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3900, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3900, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, 0x3d00, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, 0x3900, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, 0x3900, v0.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3d00, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, 0x3d00, s2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v1.l
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v2.l, v1.h
@@ -1254,16 +1254,16 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:  .LBB5_3: ; %exit
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3d00
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3900
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2.h
 ; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v3.l
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3900, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3900, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, 0x3d00, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, 0x3900, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, 0x3900, v0.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3d00, v0.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, 0x3d00, s2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v1.l
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v2.l, v1.h
@@ -1984,22 +1984,22 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:  .LBB8_4: ; %exit
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3d00
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3900
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v4.l
 ; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v5.h
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v2.h
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s3, 0.5, v3.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, 0x3900, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, 0x3900, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.l, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v0.l, 0x3d00, s0
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3.l
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2.l
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s34, 0.5, v4.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3900, v0.l, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, 0x3900, v0.l, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3900, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3900, v0.l, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, 0x3900, v0.l, s34
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v0.l, 0x3900, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, 0x3d00, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, 0x3d00, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, 0x3d00, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.l, 0x3d00, s34
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, 0x3d00, v0.l, s1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v2.l
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 53d940e1e6c1..069a47ec97bf 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -2018,9 +2018,9 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
 ; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0x4c00
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0x4800
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, 0x4800, v0.h
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, v1.l, 0x4c00
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_fmaximum3_f16_const1_const2:
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index d1d0c0dcdb7e..d8746b58b16b 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -2018,9 +2018,9 @@ define half @v_fminimum3_f16_const1_const2(half %a) {
 ; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0x4c00
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0x4800
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, 0x4800, v0.h
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, v1.l, 0x4c00
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_fminimum3_f16_const1_const2:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 12daf10594df..9c4901eb19f3 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -6066,9 +6066,9 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x5400, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6146,9 +6146,9 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x3c00, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6226,9 +6226,9 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd400
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xd400, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6306,9 +6306,9 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xd400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xbc00, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6393,9 +6393,9 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x5800, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6479,9 +6479,9 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x5800, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6566,9 +6566,9 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4000
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x4000, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6652,9 +6652,9 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4400
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5800
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5800, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x4400, vcc_lo
 ; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6745,11 +6745,11 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_64_1:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x5400, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x5400, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -6850,11 +6850,11 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_1_64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5400, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x3c00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x3c00, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -6957,11 +6957,11 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xd400, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xd400, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xbc00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xbc00, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -7075,11 +7075,11 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_128_64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5400, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x5800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x5800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -7202,11 +7202,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xd400, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xd400, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xd800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xd800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -7320,11 +7320,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xcc00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xcc00, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xcc00, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xd800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xd800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -7429,11 +7429,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5400
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x3c00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v4.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x5400, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x5400, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -7539,11 +7539,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x3c00
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v4.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x3c00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x3c00, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -7651,11 +7651,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0xd400
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0xbc00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v4.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0xd400, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0xd400, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -7765,11 +7765,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0xbc00
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0xd400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xd400, v4.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xd400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0xbc00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0xbc00, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -7888,11 +7888,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v4.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x5800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x5800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -8006,11 +8006,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x4400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v4.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x5800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x5800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -8127,11 +8127,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x4000
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x4400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v4.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x4000, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -8245,11 +8245,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x4400
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5800
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5800, v4.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5800, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x4400, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x4400, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index a0ad6328b0c0..51745c81bf21 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1399,13 +1399,12 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x4500
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x4500
 ; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1491,13 +1490,12 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 35
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 35
 ; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index cbd824e17197..6e94896fa206 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -62,14 +62,23 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s
 }
 
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
+; SDAG-GFX11-TRUE16:       ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
+; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
+; SDAG-GFX11-FAKE16:       ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; SDAG-GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
 ; GFX9:       ; %bb.0:
@@ -99,6 +108,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
 ; SDAG-CI-NEXT:    v_mov_b32_e32 v0, 1.0
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
index 0684c3081983..1222d0efd62b 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
@@ -468,10 +468,10 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xc000
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xbc00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0xc000, vcc_lo
 ; GFX11-SAFE-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -488,10 +488,10 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xc000
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xbc00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0xc000, vcc_lo
 ; GFX11-NSZ-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -535,10 +535,10 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) {
 ; GFX11-SAFE-TRUE16-LABEL: add_select_posk_posk_f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x4000
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x3c00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0x4000, vcc_lo
 ; GFX11-SAFE-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -555,10 +555,10 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) {
 ; GFX11-NSZ-TRUE16-LABEL: add_select_posk_posk_f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x4000
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x3c00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0x4000, vcc_lo
 ; GFX11-NSZ-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1427,10 +1427,10 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-SAFE-TRUE16-LABEL: add_select_negk_negk_f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xc000
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xbc00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0xc000, vcc_lo
 ; GFX11-SAFE-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1447,10 +1447,10 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-NSZ-TRUE16-LABEL: add_select_negk_negk_f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xc000
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xbc00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0xc000, vcc_lo
 ; GFX11-NSZ-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1495,10 +1495,10 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) {
 ; GFX11-SAFE-TRUE16-LABEL: add_select_negliteralk_negliteralk_f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xe800
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xec00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xec00, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0xe800, vcc_lo
 ; GFX11-SAFE-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1515,10 +1515,10 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) {
 ; GFX11-NSZ-TRUE16-LABEL: add_select_negliteralk_negliteralk_f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xe800
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xec00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xec00, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0xe800, vcc_lo
 ; GFX11-NSZ-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1561,10 +1561,10 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_negk_negk_f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xc000
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xbc00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0xc000, vcc_lo
 ; GFX11-SAFE-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1581,10 +1581,10 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_negk_negk_f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xc000
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0xbc00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, 0xc000, vcc_lo
 ; GFX11-NSZ-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 4e07c724b8a8..92d3277d5d3e 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -818,12 +818,12 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc000
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
@@ -846,12 +846,12 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc000
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
@@ -923,12 +923,12 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-SAFE-TRUE16-LABEL: add_select_posk_posk_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4000
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v3.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v3.l, s0
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x4000, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x4000, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
@@ -951,12 +951,12 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-NSZ-TRUE16-LABEL: add_select_posk_posk_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4000
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v3.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v3.l, s0
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x4000, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x4000, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
@@ -2366,12 +2366,12 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-SAFE-TRUE16-LABEL: add_select_negk_negk_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc000
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
@@ -2394,12 +2394,12 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-NSZ-TRUE16-LABEL: add_select_negk_negk_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc000
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
@@ -2472,12 +2472,12 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 ; GFX11-SAFE-TRUE16-LABEL: add_select_negliteralk_negliteralk_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xe800
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xec00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xec00, v3.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xec00, v3.l, s0
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xe800, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xe800, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
@@ -2500,12 +2500,12 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 ; GFX11-NSZ-TRUE16-LABEL: add_select_negliteralk_negliteralk_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xe800
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xec00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xec00, v3.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xec00, v3.l, s0
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xe800, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xe800, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
@@ -2576,12 +2576,12 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_negk_negk_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc000
+; GFX11-SAFE-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
@@ -2604,12 +2604,12 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_negk_negk_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc000
+; GFX11-NSZ-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
index 41fad10051da..1a81e82607aa 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
@@ -186,11 +186,9 @@ define half @test_vector_reduce_fmaximum_v3half(<3 x half> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v2, 0xfc00
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0xfc00
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
index 61819a85dd82..6414f73c22b3 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
@@ -210,11 +210,9 @@ define half @test_vector_reduce_fminimum_v3half(<3 x half> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x7c00
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x7c00
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;

From b0366eeb7effcc688b467a7fa66bcdcbd97c52aa Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Mon, 23 Jun 2025 16:57:14 +0200
Subject: [PATCH 1290/1322] [MLIR] Add support for int8/uint8 properties
 (#145019)

This patch is adding the ability to print a uint8_t/int8_t as an int
instead of a char and demonstrate support for int8_t/uin8_t properties.

Fix #144993
---
 mlir/include/mlir/IR/ODSSupport.h       | 20 +++++++++++++++
 mlir/include/mlir/IR/OpImplementation.h | 13 ++++++++++
 mlir/include/mlir/IR/Properties.td      |  1 +
 mlir/lib/IR/ODSSupport.cpp              | 34 +++++++++++++++++++++++++
 mlir/test/IR/properties.mlir            | 10 ++++++--
 mlir/test/lib/Dialect/Test/TestOps.td   |  4 +++
 6 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/IR/ODSSupport.h b/mlir/include/mlir/IR/ODSSupport.h
index 25d6f3da6a86..b24a2470470f 100644
--- a/mlir/include/mlir/IR/ODSSupport.h
+++ b/mlir/include/mlir/IR/ODSSupport.h
@@ -43,6 +43,26 @@ convertFromAttribute(int32_t &storage, Attribute attr,
 /// Convert the provided int32_t to an IntegerAttr attribute.
 Attribute convertToAttribute(MLIRContext *ctx, int32_t storage);
 
+/// Convert an IntegerAttr attribute to an int8_t, or return an error if the
+/// attribute isn't an IntegerAttr. If the optional diagnostic is provided an
+/// error message is also emitted.
+LogicalResult
+convertFromAttribute(int8_t &storage, Attribute attr,
+                     function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the provided int8_t to an IntegerAttr attribute.
+Attribute convertToAttribute(MLIRContext *ctx, int8_t storage);
+
+/// Convert an IntegerAttr attribute to an uint8_t, or return an error if the
+/// attribute isn't an IntegerAttr. If the optional diagnostic is provided an
+/// error message is also emitted.
+LogicalResult
+convertFromAttribute(uint8_t &storage, Attribute attr,
+                     function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the provided uint8_t to an IntegerAttr attribute.
+Attribute convertToAttribute(MLIRContext *ctx, uint8_t storage);
+
 /// Extract the string from `attr` into `storage`. If `attr` is not a
 /// `StringAttr`, return failure and emit an error into the diagnostic from
 /// `emitError`.
diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h
index 8b56d81c8eec..8710b970e8d7 100644
--- a/mlir/include/mlir/IR/OpImplementation.h
+++ b/mlir/include/mlir/IR/OpImplementation.h
@@ -135,6 +135,19 @@ public:
   /// hook on the AsmParser.
   virtual void printFloat(const APFloat &value);
 
+  /// Print the given integer value. This is useful to force a uint8_t/int8_t to
+  /// be printed as an integer instead of a char.
+  template <typename IntT>
+  std::enable_if_t<std::is_integral_v<IntT>, void> printInteger(IntT value) {
+    // Handle int8_t/uint8_t specially to avoid printing as char
+    if constexpr (std::is_same_v<IntT, int8_t> ||
+                  std::is_same_v<IntT, uint8_t>) {
+      getStream() << static_cast<int>(value);
+    } else {
+      getStream() << value;
+    }
+  }
+
   virtual void printType(Type type);
   virtual void printAttribute(Attribute attr);
 
diff --git a/mlir/include/mlir/IR/Properties.td b/mlir/include/mlir/IR/Properties.td
index 25a45489c7b5..1aa19d0ecfa3 100644
--- a/mlir/include/mlir/IR/Properties.td
+++ b/mlir/include/mlir/IR/Properties.td
@@ -219,6 +219,7 @@ class IntProp<string storageTypeParam, string desc = ""> :
   let optionalParser = [{
     return $_parser.parseOptionalInteger($_storage);
   }];
+  let printer = "$_printer.printInteger($_storage)";
   let writeToMlirBytecode = [{
     $_writer.writeVarInt($_storage);
   }];
diff --git a/mlir/lib/IR/ODSSupport.cpp b/mlir/lib/IR/ODSSupport.cpp
index d56c75ede984..5b0a3e22139e 100644
--- a/mlir/lib/IR/ODSSupport.cpp
+++ b/mlir/lib/IR/ODSSupport.cpp
@@ -48,6 +48,40 @@ Attribute mlir::convertToAttribute(MLIRContext *ctx, int32_t storage) {
   return IntegerAttr::get(IntegerType::get(ctx, 32), storage);
 }
 
+LogicalResult
+mlir::convertFromAttribute(int8_t &storage, Attribute attr,
+                           function_ref<InFlightDiagnostic()> emitError) {
+  auto valueAttr = dyn_cast<IntegerAttr>(attr);
+  if (!valueAttr) {
+    emitError() << "expected IntegerAttr for key `value`";
+    return failure();
+  }
+  storage = valueAttr.getValue().getSExtValue();
+  return success();
+}
+
+Attribute mlir::convertToAttribute(MLIRContext *ctx, int8_t storage) {
+  /// Convert the provided int8_t to an IntegerAttr attribute.
+  return IntegerAttr::get(IntegerType::get(ctx, 8), storage);
+}
+
+LogicalResult
+mlir::convertFromAttribute(uint8_t &storage, Attribute attr,
+                           function_ref<InFlightDiagnostic()> emitError) {
+  auto valueAttr = dyn_cast<IntegerAttr>(attr);
+  if (!valueAttr) {
+    emitError() << "expected IntegerAttr for key `value`";
+    return failure();
+  }
+  storage = valueAttr.getValue().getZExtValue();
+  return success();
+}
+
+Attribute mlir::convertToAttribute(MLIRContext *ctx, uint8_t storage) {
+  /// Convert the provided uint8_t to an IntegerAttr attribute.
+  return IntegerAttr::get(IntegerType::get(ctx, 8), storage);
+}
+
 LogicalResult
 mlir::convertFromAttribute(std::string &storage, Attribute attr,
                            function_ref<InFlightDiagnostic()> emitError) {
diff --git a/mlir/test/IR/properties.mlir b/mlir/test/IR/properties.mlir
index b339a03812ba..dde9100cde14 100644
--- a/mlir/test/IR/properties.mlir
+++ b/mlir/test/IR/properties.mlir
@@ -59,9 +59,15 @@ test.with_default_valued_properties 1 "foo" 0 unit
 // CHECK:   test.with_optional_properties
 // CHECK-SAME: simple = 0
 // GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME:  <{hasDefault = [], hasUnit = false, longSyntax = [], maybeUnit = [], nested = [], nonTrivialStorage = [], simple = [0]}> : () -> ()
+// GENERIC-SAME:  <{hasDefault = [], hasUnit = false, longSyntax = [], maybeUnit = [], nested = [], nonTrivialStorage = [], simple = [0], simplei8 = [], simpleui8 = []}> : () -> ()
 test.with_optional_properties simple = 0
 
+// CHECK:   test.with_optional_properties
+// CHECK-SAME: simple = 1 simplei8 = -1 simpleui8 = 255
+// GENERIC: "test.with_optional_properties"()
+// GENERIC-SAME:  <{hasDefault = [], hasUnit = false, longSyntax = [], maybeUnit = [], nested = [], nonTrivialStorage = [], simple = [1], simplei8 = [-1 : i8], simpleui8 = [-1 : i8]}> : () -> ()
+test.with_optional_properties simple = 1 simplei8 = -1 simpleui8 = 255
+
 // CHECK:   test.with_optional_properties{{$}}
 // GENERIC: "test.with_optional_properties"()
 // GENERIC-SAME: simple = []
@@ -70,7 +76,7 @@ test.with_optional_properties
 // CHECK:    test.with_optional_properties
 // CHECK-SAME: anAttr = 0 simple = 1 nonTrivialStorage = "foo" hasDefault = some<0> nested = some<1>  longSyntax = some<"bar"> hasUnit maybeUnit = some<unit>
 // GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME: <{anAttr = 0 : i32, hasDefault = [0], hasUnit, longSyntax = ["bar"], maybeUnit = [unit], nested = {{\[}}[1]], nonTrivialStorage = ["foo"], simple = [1]}> : () -> ()
+// GENERIC-SAME: <{anAttr = 0 : i32, hasDefault = [0], hasUnit, longSyntax = ["bar"], maybeUnit = [unit], nested = {{\[}}[1]], nonTrivialStorage = ["foo"], simple = [1], simplei8 = [], simpleui8 = []}> : () -> ()
 test.with_optional_properties
   anAttr = 0
   simple = 1
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 8a4981a90831..8c332adb3565 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -3406,6 +3406,8 @@ def TestOpWithOptionalProperties : TEST_Op<"with_optional_properties"> {
   let assemblyFormat = [{
     (`anAttr` `=` $anAttr^)?
     (`simple` `=` $simple^)?
+    (`simplei8` `=` $simplei8^)?
+    (`simpleui8` `=` $simpleui8^)?
     (`nonTrivialStorage` `=` $nonTrivialStorage^)?
     (`hasDefault` `=` $hasDefault^)?
     (`nested` `=` $nested^)?
@@ -3417,6 +3419,8 @@ def TestOpWithOptionalProperties : TEST_Op<"with_optional_properties"> {
   let arguments = (ins
     OptionalAttr<I32Attr>:$anAttr,
     OptionalProp<I64Prop>:$simple,
+    OptionalProp<IntProp<"int8_t">>:$simplei8,
+    OptionalProp<IntProp<"uint8_t">>:$simpleui8,
     OptionalProp<StringProp>:$nonTrivialStorage,
     // Confirm that properties with default values now default to nullopt and have
     // the long syntax.

From 44936c8d13f904a68647d83bdcdbbeefb4670d3e Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Mon, 23 Jun 2025 11:02:06 -0400
Subject: [PATCH 1291/1322] [CUDA][HIP] add options `--[no-]offload-inc`
 (#140106)

Currently there is only option -nogpuinc for disabling
the default CUDA/HIP wrapper headers. However, there
are situations where -nogpuinc needs to be overriden
for enabling CUDA/HIP wrapper headers. This patch
adds --[no-]offload-inc for that purpose. When both
exist, the last wins. -nogpuinc and -nocudainc are
now alias to --no-offload-inc.
---
 clang/include/clang/Driver/Options.td  | 20 +++++++++++++-------
 clang/lib/Driver/ToolChains/AMDGPU.cpp |  3 ++-
 clang/lib/Driver/ToolChains/Clang.cpp  |  6 ++++--
 clang/lib/Driver/ToolChains/Cuda.cpp   | 10 +++++++---
 clang/lib/Driver/ToolChains/HIPSPV.cpp |  3 ++-
 clang/test/Driver/hip-include-path.hip |  4 ++++
 6 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0ffd8c40da7d..e133ac97a4ff 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -577,7 +577,8 @@ multiclass BoolWOption<string flag_base, KeyPathAndMacro kpm,
 // Works like BoolOption except without marshalling
 multiclass BoolOptionWithoutMarshalling<string prefix = "", string spelling_base,
                                         FlagDef flag1_base, FlagDef flag2_base,
-                                        BothFlags suffix = BothFlags<[]>> {
+                                        BothFlags suffix = BothFlags<[]>,
+                                        list<string> flag_prefix = ["-"]> {
   defvar flag1 = FlagDefExpanded<ApplySuffix<flag1_base, suffix>.Result, prefix,
                                  NAME, spelling_base>;
 
@@ -598,12 +599,12 @@ multiclass BoolOptionWithoutMarshalling<string prefix = "", string spelling_base
 
   defvar implied = !if(flag1.CanBeImplied, flag1, flag2);
 
-  def flag1.RecordName : Flag<["-"], flag1.Spelling>, Flags<flag1.OptionFlags>,
+  def flag1.RecordName : Flag<flag_prefix, flag1.Spelling>, Flags<flag1.OptionFlags>,
                          Visibility<flag1.OptionVisibility>,
                          HelpText<flag1.Help>,
                          ImpliedByAnyOf<implied.ImpliedBy, implied.ValueAsCode>
                          {}
-  def flag2.RecordName : Flag<["-"], flag2.Spelling>, Flags<flag2.OptionFlags>,
+  def flag2.RecordName : Flag<flag_prefix, flag2.Spelling>, Flags<flag2.OptionFlags>,
                          Visibility<flag2.OptionVisibility>,
                          HelpText<flag2.Help>,
                          ImpliedByAnyOf<implied.ImpliedBy, implied.ValueAsCode>
@@ -5756,12 +5757,17 @@ def nobuiltininc : Flag<["-"], "nobuiltininc">,
   Group<IncludePath_Group>,
   HelpText<"Disable builtin #include directories only">,
   MarshallingInfoNegativeFlag<HeaderSearchOpts<"UseBuiltinIncludes">>;
-def nogpuinc : Flag<["-"], "nogpuinc">, Group<IncludePath_Group>,
-  HelpText<"Do not add include paths for CUDA/HIP and"
-  " do not include the default CUDA/HIP wrapper headers">;
+defm offload_inc: BoolOptionWithoutMarshalling<"", "offload-inc",
+  PosFlag<SetTrue, [], [ClangOption], "Add include paths for CUDA/HIP and"
+   " include the default CUDA/HIP wrapper headers (default)">,
+  NegFlag<SetFalse, [], [ClangOption], "Do not add include paths for CUDA/HIP"
+   " and include the default CUDA/HIP wrapper headers">,
+   BothFlags<[]>, ["--"]>, Group<IncludePath_Group>;
+def : Flag<["-"], "nogpuinc">, Alias<no_offload_inc>;
+
 def nohipwrapperinc : Flag<["-"], "nohipwrapperinc">, Group<IncludePath_Group>,
   HelpText<"Do not include the default HIP wrapper headers and include paths">;
-def : Flag<["-"], "nocudainc">, Alias<nogpuinc>;
+def : Flag<["-"], "nocudainc">, Alias<no_offload_inc>;
 def no_offloadlib
     : Flag<["--"], "no-offloadlib">,
       MarshallingInfoFlag<LangOpts<"NoGPULib">>,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index cf9c24f1e1cd..b7564a0495da 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -525,7 +525,8 @@ void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs,
                     "hipstdpar_lib.hpp"});
   };
 
-  if (DriverArgs.hasArg(options::OPT_nogpuinc)) {
+  if (!DriverArgs.hasFlag(options::OPT_offload_inc, options::OPT_no_offload_inc,
+                          true)) {
     if (HasHipStdPar)
       HandleHipStdPar();
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 2bb42a319ecc..87d04a42fcd7 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -937,7 +937,8 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   // openmp_wrappers folder which contains alternative system headers.
   if (JA.isDeviceOffloading(Action::OFK_OpenMP) &&
       !Args.hasArg(options::OPT_nostdinc) &&
-      !Args.hasArg(options::OPT_nogpuinc) &&
+      Args.hasFlag(options::OPT_offload_inc, options::OPT_no_offload_inc,
+                   true) &&
       getToolChain().getTriple().isGPU()) {
     if (!Args.hasArg(options::OPT_nobuiltininc)) {
       // Add openmp_wrappers/* to our system include path.  This lets us wrap
@@ -1120,7 +1121,8 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   // TODO: This should be moved to `AddClangSystemIncludeArgs` by passing the
   //       OffloadKind as an argument.
   if (!Args.hasArg(options::OPT_nostdinc) &&
-      !Args.hasArg(options::OPT_nogpuinc) &&
+      Args.hasFlag(options::OPT_offload_inc, options::OPT_no_offload_inc,
+                   true) &&
       !Args.hasArg(options::OPT_nobuiltininc)) {
     // Without an offloading language we will include these headers directly.
     // Offloading languages will instead only use the declarations stored in
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index a91e4de41c8d..b92c18f1b60f 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -302,7 +302,8 @@ void CudaInstallationDetector::AddCudaIncludeArgs(
     CC1Args.push_back(DriverArgs.MakeArgString(P));
   }
 
-  if (DriverArgs.hasArg(options::OPT_nogpuinc))
+  if (!DriverArgs.hasFlag(options::OPT_offload_inc, options::OPT_no_offload_inc,
+                          true))
     return;
 
   if (!isValid()) {
@@ -928,7 +929,8 @@ llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                        ArgStringList &CC1Args) const {
   // Check our CUDA version if we're going to include the CUDA headers.
-  if (!DriverArgs.hasArg(options::OPT_nogpuinc) &&
+  if (DriverArgs.hasFlag(options::OPT_offload_inc, options::OPT_no_offload_inc,
+                         true) &&
       !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
     StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
     assert(!Arch.empty() && "Must have an explicit GPU arch.");
@@ -1001,7 +1003,9 @@ void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                               ArgStringList &CC1Args) const {
   HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
 
-  if (!DriverArgs.hasArg(options::OPT_nogpuinc) && CudaInstallation.isValid())
+  if (DriverArgs.hasFlag(options::OPT_offload_inc, options::OPT_no_offload_inc,
+                         true) &&
+      CudaInstallation.isValid())
     CC1Args.append(
         {"-internal-isystem",
          DriverArgs.MakeArgString(CudaInstallation.getIncludePath())});
diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
index ec29c62976e1..c86790f66a79 100644
--- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
+++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
@@ -187,7 +187,8 @@ void HIPSPVToolChain::AddIAMCUIncludeArgs(const ArgList &Args,
 
 void HIPSPVToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(options::OPT_nogpuinc))
+  if (!DriverArgs.hasFlag(options::OPT_offload_inc, options::OPT_no_offload_inc,
+                          true))
     return;
 
   StringRef hipPath = DriverArgs.getLastArgValue(options::OPT_hip_path_EQ);
diff --git a/clang/test/Driver/hip-include-path.hip b/clang/test/Driver/hip-include-path.hip
index 5eeee2f5ce0d..3c7384ab9835 100644
--- a/clang/test/Driver/hip-include-path.hip
+++ b/clang/test/Driver/hip-include-path.hip
@@ -12,6 +12,10 @@
 // RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nogpuinc -nogpulib %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,CLANG,NOHIP %s
 
+// RUN: %clang -c -### --target=x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
+// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm --no-offload-inc -nogpulib --offload-inc %s 2>&1 \
+// RUN:   | FileCheck -check-prefixes=COMMON,CLANG,HIP %s
+
 // COMMON-LABEL: "{{[^"]*}}clang{{[^"]*}}" "-cc1"
 // CLANG-SAME: "-internal-isystem" "{{[^"]*}}/lib{{[^"]*}}/clang/{{[^"]*}}/include/cuda_wrappers"
 // NOCLANG-NOT: "{{[^"]*}}/lib{{[^"]*}}/clang/{{[^"]*}}/include/cuda_wrappers"

From daa2a587cc01c5656deecda7f768fed0afc1e515 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Mon, 23 Jun 2025 08:07:31 -0700
Subject: [PATCH 1292/1322] [TRE] Adjust function entry count when using
 instrumented profiles (#143987)

The entry count of a function needs to be updated after a callsite is elided by TRE: before elision, the entry count accounted for the recursive call at that callsite. After TRE, we need to remove that callsite's contribution.

This patch enables this for instrumented profiling cases because, there, we know the function entry count captured entries before TRE. We cannot currently address this for sample-based (because we don't know whether this function was TRE-ed in the binary that donated samples)
---
 llvm/include/llvm/Passes/PassBuilder.h        |   2 +
 .../Scalar/TailRecursionElimination.h         |   7 +-
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  14 +-
 .../Scalar/TailRecursionElimination.cpp       |  68 +++++++++-
 .../TailCallElim/entry-count-adjustment.ll    | 120 ++++++++++++++++++
 5 files changed, 200 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/Transforms/TailCallElim/entry-count-adjustment.ll

diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index f13b5c678a89..9cdb7ca7dbc9 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -773,6 +773,8 @@ private:
                          IntrusiveRefCntPtr<vfs::FileSystem> FS);
   void addPostPGOLoopRotation(ModulePassManager &MPM, OptimizationLevel Level);
 
+  bool isInstrumentedPGOUse() const;
+
   // Extension Point callbacks
   SmallVector<std::function<void(FunctionPassManager &, OptimizationLevel)>, 2>
       PeepholeEPCallbacks;
diff --git a/llvm/include/llvm/Transforms/Scalar/TailRecursionElimination.h b/llvm/include/llvm/Transforms/Scalar/TailRecursionElimination.h
index 57b1ed9bf4fe..22a70cd66865 100644
--- a/llvm/include/llvm/Transforms/Scalar/TailRecursionElimination.h
+++ b/llvm/include/llvm/Transforms/Scalar/TailRecursionElimination.h
@@ -58,7 +58,12 @@ namespace llvm {
 
 class Function;
 
-struct TailCallElimPass : PassInfoMixin<TailCallElimPass> {
+class TailCallElimPass : public PassInfoMixin<TailCallElimPass> {
+  const bool UpdateFunctionEntryCount;
+
+public:
+  TailCallElimPass(bool UpdateFunctionEntryCount = true)
+      : UpdateFunctionEntryCount(UpdateFunctionEntryCount) {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 }
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index b0cdd1b94e56..c83d2dc1f151 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -625,7 +625,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
       !Level.isOptimizingForSize())
     FPM.addPass(PGOMemOPSizeOpt());
 
-  FPM.addPass(TailCallElimPass());
+  FPM.addPass(TailCallElimPass(/*UpdateFunctionEntryCount=*/
+                               isInstrumentedPGOUse()));
   FPM.addPass(
       SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
 
@@ -1578,7 +1579,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   OptimizePM.addPass(DivRemPairsPass());
 
   // Try to annotate calls that were created during optimization.
-  OptimizePM.addPass(TailCallElimPass());
+  OptimizePM.addPass(
+      TailCallElimPass(/*UpdateFunctionEntryCount=*/isInstrumentedPGOUse()));
 
   // LoopSink (and other loop passes since the last simplifyCFG) might have
   // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
@@ -2066,7 +2068,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
 
   // LTO provides additional opportunities for tailcall elimination due to
   // link-time inlining, and visibility of nocapture attribute.
-  FPM.addPass(TailCallElimPass());
+  FPM.addPass(
+      TailCallElimPass(/*UpdateFunctionEntryCount=*/isInstrumentedPGOUse()));
 
   // Run a few AA driver optimizations here and now to cleanup the code.
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM),
@@ -2347,3 +2350,8 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
 
   return AA;
 }
+
+bool PassBuilder::isInstrumentedPGOUse() const {
+  return (PGOOpt && PGOOpt->Action == PGOOptions::IRUse) ||
+         !UseCtxProfile.empty();
+}
\ No newline at end of file
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index e7d989a43840..7828571123bc 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -53,6 +53,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -75,10 +76,12 @@
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cmath>
 using namespace llvm;
 
 #define DEBUG_TYPE "tailcallelim"
@@ -87,6 +90,11 @@ STATISTIC(NumEliminated, "Number of tail calls removed");
 STATISTIC(NumRetDuped,   "Number of return duplicated");
 STATISTIC(NumAccumAdded, "Number of accumulators introduced");
 
+static cl::opt<bool> ForceDisableBFI(
+    "tre-disable-entrycount-recompute", cl::init(false), cl::Hidden,
+    cl::desc("Force disabling recomputing of function entry count, on "
+             "successful tail recursion elimination."));
+
 /// Scan the specified function for alloca instructions.
 /// If it contains any dynamic allocas, returns false.
 static bool canTRE(Function &F) {
@@ -399,6 +407,9 @@ class TailRecursionEliminator {
   AliasAnalysis *AA;
   OptimizationRemarkEmitter *ORE;
   DomTreeUpdater &DTU;
+  BlockFrequencyInfo *const BFI;
+  const uint64_t OrigEntryBBFreq;
+  const uint64_t OrigEntryCount;
 
   // The below are shared state we want to have available when eliminating any
   // calls in the function. There values should be populated by
@@ -428,8 +439,19 @@ class TailRecursionEliminator {
 
   TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
                           AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
-                          DomTreeUpdater &DTU)
-      : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
+                          DomTreeUpdater &DTU, BlockFrequencyInfo *BFI)
+      : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU), BFI(BFI),
+        OrigEntryBBFreq(
+            BFI ? BFI->getBlockFreq(&F.getEntryBlock()).getFrequency() : 0U),
+        OrigEntryCount(F.getEntryCount() ? F.getEntryCount()->getCount() : 0) {
+    if (BFI) {
+      // The assert is meant as API documentation for the caller.
+      assert((OrigEntryCount != 0 && OrigEntryBBFreq != 0) &&
+             "If a BFI was provided, the function should have both an entry "
+             "count that is non-zero and an entry basic block with a non-zero "
+             "frequency.");
+    }
+  }
 
   CallInst *findTRECandidate(BasicBlock *BB);
 
@@ -450,7 +472,7 @@ class TailRecursionEliminator {
 public:
   static bool eliminate(Function &F, const TargetTransformInfo *TTI,
                         AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
-                        DomTreeUpdater &DTU);
+                        DomTreeUpdater &DTU, BlockFrequencyInfo *BFI);
 };
 } // namespace
 
@@ -735,6 +757,28 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
   CI->eraseFromParent();   // Remove call.
   DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
   ++NumEliminated;
+  if (OrigEntryBBFreq) {
+    assert(F.getEntryCount().has_value());
+    // This pass is not expected to remove BBs, only add an entry BB. For that
+    // reason, and because the BB here isn't the new entry BB, the BFI lookup is
+    // expected to succeed.
+    assert(&F.getEntryBlock() != BB);
+    auto RelativeBBFreq =
+        static_cast<double>(BFI->getBlockFreq(BB).getFrequency()) /
+        static_cast<double>(OrigEntryBBFreq);
+    auto ToSubtract =
+        static_cast<uint64_t>(std::round(RelativeBBFreq * OrigEntryCount));
+    auto OldEntryCount = F.getEntryCount()->getCount();
+    if (OldEntryCount <= ToSubtract) {
+      LLVM_DEBUG(
+          errs() << "[TRE] The entrycount attributable to the recursive call, "
+                 << ToSubtract
+                 << ", should be strictly lower than the function entry count, "
+                 << OldEntryCount << "\n");
+    } else {
+      F.setEntryCount(OldEntryCount - ToSubtract, F.getEntryCount()->getType());
+    }
+  }
   return true;
 }
 
@@ -861,7 +905,8 @@ bool TailRecursionEliminator::eliminate(Function &F,
                                         const TargetTransformInfo *TTI,
                                         AliasAnalysis *AA,
                                         OptimizationRemarkEmitter *ORE,
-                                        DomTreeUpdater &DTU) {
+                                        DomTreeUpdater &DTU,
+                                        BlockFrequencyInfo *BFI) {
   if (F.getFnAttribute("disable-tail-calls").getValueAsBool())
     return false;
 
@@ -877,7 +922,7 @@ bool TailRecursionEliminator::eliminate(Function &F,
     return MadeChange;
 
   // Change any tail recursive calls to loops.
-  TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
+  TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU, BFI);
 
   for (BasicBlock &BB : F)
     MadeChange |= TRE.processBlock(BB);
@@ -919,7 +964,8 @@ struct TailCallElim : public FunctionPass {
     return TailRecursionEliminator::eliminate(
         F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
         &getAnalysis<AAResultsWrapperPass>().getAAResults(),
-        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
+        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU,
+        /*BFI=*/nullptr);
   }
 };
 }
@@ -942,6 +988,13 @@ PreservedAnalyses TailCallElimPass::run(Function &F,
 
   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  // This must come first. It needs the 2 analyses, meaning, if it came after
+  // the lines asking for the cached result, should they be nullptr (which, in
+  // the case of the PDT, is likely), updates to the trees would be missed.
+  auto *BFI = (!ForceDisableBFI && UpdateFunctionEntryCount &&
+               F.getEntryCount().has_value() && F.getEntryCount()->getCount())
+                  ? &AM.getResult<BlockFrequencyAnalysis>(F)
+                  : nullptr;
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
   auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
@@ -949,7 +1002,8 @@ PreservedAnalyses TailCallElimPass::run(Function &F,
   // UpdateStrategy based on some test results. It is feasible to switch the
   // UpdateStrategy to Lazy if we find it profitable later.
   DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
-  bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);
+  bool Changed =
+      TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU, BFI);
 
   if (!Changed)
     return PreservedAnalyses::all();
diff --git a/llvm/test/Transforms/TailCallElim/entry-count-adjustment.ll b/llvm/test/Transforms/TailCallElim/entry-count-adjustment.ll
new file mode 100644
index 000000000000..6001e6040a74
--- /dev/null
+++ b/llvm/test/Transforms/TailCallElim/entry-count-adjustment.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -passes=tailcallelim -S %s -o - | FileCheck %s --check-prefixes=CHECK,ENABLED
+; RUN: opt -passes=tailcallelim -tre-disable-entrycount-recompute -S %s -o - | FileCheck %s --check-prefixes=CHECK,DISABLED
+
+; Test that tail call elimination correctly adjusts function entry counts
+; when eliminating tail recursive calls.
+
+; Basic test: eliminate a tail call and adjust entry count
+define i32 @test_basic_entry_count_adjustment(i32 %n) !prof !0 {
+; CHECK-LABEL: @test_basic_entry_count_adjustment(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[N_TR:%.*]] = phi i32 [ [[N:%.*]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[IF_THEN:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N_TR]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN]], label [[IF_ELSE:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB]] = sub i32 [[N_TR]], 1
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %if.then, label %if.else, !prof !1
+
+if.then:                                          ; preds = %entry
+  %sub = sub i32 %n, 1
+  %call = tail call i32 @test_basic_entry_count_adjustment(i32 %sub)
+  ret i32 %call
+
+if.else:                                          ; preds = %entry
+  ret i32 0
+}
+
+; Test multiple tail calls in different blocks with different frequencies
+define i32 @test_multiple_blocks_entry_count(i32 %n, i32 %flag) !prof !2 {
+; CHECK-LABEL: @test_multiple_blocks_entry_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[N_TR:%.*]] = phi i32 [ [[N:%.*]], [[ENTRY:%.*]] ], [ [[SUB1:%.*]], [[BLOCK1:%.*]] ], [ [[SUB2:%.*]], [[BLOCK2:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N_TR]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[CHECK_FLAG:%.*]], label [[BASE_CASE:%.*]], !prof [[PROF3:![0-9]+]]
+; CHECK:       check.flag:
+; CHECK-NEXT:    [[CMP_FLAG:%.*]] = icmp eq i32 [[FLAG:%.*]], 1
+; CHECK-NEXT:    br i1 [[CMP_FLAG]], label [[BLOCK1]], label [[BLOCK2]], !prof [[PROF4:![0-9]+]]
+; CHECK:       block1:
+; CHECK-NEXT:    [[SUB1]] = sub i32 [[N_TR]], 1
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       block2:
+; CHECK-NEXT:    [[SUB2]] = sub i32 [[N_TR]], 2
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       base.case:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %check.flag, label %base.case, !prof !3
+check.flag:
+  %cmp.flag = icmp eq i32 %flag, 1
+  br i1 %cmp.flag, label %block1, label %block2, !prof !4
+block1:                                           ; preds = %check.flag
+  %sub1 = sub i32 %n, 1
+  %call1 = tail call i32 @test_multiple_blocks_entry_count(i32 %sub1, i32 %flag)
+  ret i32 %call1
+block2:                                           ; preds = %check.flag
+  %sub2 = sub i32 %n, 2
+  %call2 = tail call i32 @test_multiple_blocks_entry_count(i32 %sub2, i32 %flag)
+  ret i32 %call2
+base.case:                                        ; preds = %entry
+  ret i32 1
+}
+
+define i32 @test_no_entry_count(i32 %n) {
+; CHECK-LABEL: @test_no_entry_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[N_TR:%.*]] = phi i32 [ [[N:%.*]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[IF_THEN:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N_TR]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB]] = sub i32 [[N_TR]], 1
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %sub = sub i32 %n, 1
+  %call = tail call i32 @test_no_entry_count(i32 %sub)
+  ret i32 %call
+
+if.else:                                          ; preds = %entry
+  ret i32 0
+}
+
+; Function entry count metadata
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 800, i32 200}
+!2 = !{!"function_entry_count", i64 2000}
+!3 = !{!"branch_weights", i32 3, i32 1}
+!4 = !{!"branch_weights", i32 100, i32 900}
+;.
+; ENABLED: [[META0:![0-9]+]] = !{!"function_entry_count", i64 200}
+; ENABLED: [[PROF1]] = !{!"branch_weights", i32 800, i32 200}
+; ENABLED: [[META2:![0-9]+]] = !{!"function_entry_count", i64 500}
+; ENABLED: [[PROF3]] = !{!"branch_weights", i32 3, i32 1}
+; ENABLED: [[PROF4]] = !{!"branch_weights", i32 100, i32 900}
+;.
+; DISABLED: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; DISABLED: [[PROF1]] = !{!"branch_weights", i32 800, i32 200}
+; DISABLED: [[META2:![0-9]+]] = !{!"function_entry_count", i64 2000}
+; DISABLED: [[PROF3]] = !{!"branch_weights", i32 3, i32 1}
+; DISABLED: [[PROF4]] = !{!"branch_weights", i32 100, i32 900}
+;.

From 068af5bfb47aae9399d072a68b0a723aef2f9ead Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag@gmail.com>
Date: Mon, 23 Jun 2025 08:18:04 -0700
Subject: [PATCH 1293/1322] [lld][BP] Print total size of startup symbols
 (#145106)

A good proxy to estimate the number of page faults during startup is the
total size of startup functions. Assuming profiles are up-to-date, we
can measure this total size pretty easily. Note that if profile data is
old, this number could be wrong.
---
 .../lld/Common/BPSectionOrdererBase.inc       | 54 ++++++++++++-------
 lld/test/ELF/bp-section-orderer.s             | 10 ++--
 lld/test/MachO/bp-section-orderer.s           | 12 ++---
 3 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/lld/include/lld/Common/BPSectionOrdererBase.inc b/lld/include/lld/Common/BPSectionOrdererBase.inc
index bb2e55af1eb3..fbeca187aa76 100644
--- a/lld/include/lld/Common/BPSectionOrdererBase.inc
+++ b/lld/include/lld/Common/BPSectionOrdererBase.inc
@@ -292,45 +292,54 @@ auto BPOrderer<D>::computeOrder(
     bp.run(nodesForDataCompression);
   }
 
-  unsigned numStartupSections = 0;
-  unsigned numCodeCompressionSections = 0;
-  unsigned numDuplicateCodeSections = 0;
-  unsigned numDataCompressionSections = 0;
-  unsigned numDuplicateDataSections = 0;
+  unsigned numStartupSections = 0, startupSize = 0;
+  unsigned numCodeCompressionSections = 0, codeCompressionSize = 0;
+  unsigned numDuplicateCodeSections = 0, duplicateCodeSize = 0;
+  unsigned numDataCompressionSections = 0, dataCompressionSize = 0;
+  unsigned numDuplicateDataSections = 0, duplicateDataSize = 0;
   SetVector<const Section *> orderedSections;
   // Order startup functions,
   for (auto &node : nodesForStartup) {
     const auto *isec = sections[node.Id];
-    if (orderedSections.insert(isec))
+    if (orderedSections.insert(isec)) {
+      startupSize += D::getSize(*isec);
       ++numStartupSections;
+    }
   }
   // then functions for compression,
   for (auto &node : nodesForFunctionCompression) {
     const auto *isec = sections[node.Id];
-    if (orderedSections.insert(isec))
+    if (orderedSections.insert(isec)) {
+      codeCompressionSize += D::getSize(*isec);
       ++numCodeCompressionSections;
-
+    }
     auto It = duplicateSectionIdxs.find(node.Id);
     if (It == duplicateSectionIdxs.end())
       continue;
     for (auto dupSecIdx : It->getSecond()) {
       const auto *dupIsec = sections[dupSecIdx];
-      if (orderedSections.insert(dupIsec))
+      if (orderedSections.insert(dupIsec)) {
+        duplicateCodeSize += D::getSize(*isec);
         ++numDuplicateCodeSections;
+      }
     }
   }
   // then data for compression.
   for (auto &node : nodesForDataCompression) {
     const auto *isec = sections[node.Id];
-    if (orderedSections.insert(isec))
+    if (orderedSections.insert(isec)) {
+      dataCompressionSize += D::getSize(*isec);
       ++numDataCompressionSections;
+    }
     auto It = duplicateSectionIdxs.find(node.Id);
     if (It == duplicateSectionIdxs.end())
       continue;
     for (auto dupSecIdx : It->getSecond()) {
       const auto *dupIsec = sections[dupSecIdx];
-      if (orderedSections.insert(dupIsec))
+      if (orderedSections.insert(dupIsec)) {
+        duplicateDataSize += D::getSize(*isec);
         ++numDuplicateDataSections;
+      }
     }
   }
 
@@ -339,14 +348,21 @@ auto BPOrderer<D>::computeOrder(
         numStartupSections + numCodeCompressionSections +
         numDuplicateCodeSections + numDataCompressionSections +
         numDuplicateDataSections;
-    dbgs()
-        << "Ordered " << numTotalOrderedSections
-        << " sections using balanced partitioning:\n  Functions for startup: "
-        << numStartupSections
-        << "\n  Functions for compression: " << numCodeCompressionSections
-        << "\n  Duplicate functions: " << numDuplicateCodeSections
-        << "\n  Data for compression: " << numDataCompressionSections
-        << "\n  Duplicate data: " << numDuplicateDataSections << "\n";
+    unsigned totalOrderedSize = startupSize + codeCompressionSize +
+                                duplicateCodeSize + dataCompressionSize +
+                                duplicateDataSize;
+    dbgs() << "Ordered " << numTotalOrderedSections << " sections ("
+           << totalOrderedSize << " bytes) using balanced partitioning:\n";
+    dbgs() << "  Functions for startup: " << numStartupSections << " ("
+           << startupSize << " bytes)\n";
+    dbgs() << "  Functions for compression: " << numCodeCompressionSections
+           << " (" << codeCompressionSize << " bytes)\n";
+    dbgs() << "  Duplicate functions: " << numDuplicateCodeSections << " ("
+           << duplicateCodeSize << " bytes)\n";
+    dbgs() << "  Data for compression: " << numDataCompressionSections << " ("
+           << dataCompressionSize << " bytes)\n";
+    dbgs() << "  Duplicate data: " << numDuplicateDataSections << " ("
+           << duplicateDataSize << " bytes)\n";
 
     if (!profilePath.empty()) {
       // Evaluate this function order for startup
diff --git a/lld/test/ELF/bp-section-orderer.s b/lld/test/ELF/bp-section-orderer.s
index 4889db63cd4d..4df2e8d43022 100644
--- a/lld/test/ELF/bp-section-orderer.s
+++ b/lld/test/ELF/bp-section-orderer.s
@@ -21,7 +21,7 @@
 # RUN: llvm-profdata merge a.proftext -o a.profdata
 # RUN: ld.lld a.o --irpgo-profile=a.profdata --bp-startup-sort=function --verbose-bp-section-orderer --icf=all --gc-sections 2>&1 | FileCheck %s --check-prefix=STARTUP-FUNC-ORDER
 
-# STARTUP-FUNC-ORDER: Ordered 3 sections using balanced partitioning
+# STARTUP-FUNC-ORDER: Ordered 3 sections ([[#]] bytes) using balanced partitioning
 # STARTUP-FUNC-ORDER: Total area under the page fault curve: 3.
 
 # RUN: ld.lld -o out.s a.o --irpgo-profile=a.profdata --bp-startup-sort=function
@@ -49,10 +49,10 @@
 # RUN: llvm-nm -jn out.cbs | tr '\n' , | FileCheck %s --check-prefix=CBOTH-STARTUP
 # CBOTH-STARTUP: s5,s3,s4,s2,s1,A,B,C,F,E,D,merged1,merged2,_start,d4,d1,d3,d2,{{$}}
 
-# BP-COMPRESSION-FUNC: Ordered 9 sections using balanced partitioning
-# BP-COMPRESSION-ICF-FUNC: Ordered 8 sections using balanced partitioning
-# BP-COMPRESSION-DATA: Ordered 9 sections using balanced partitioning
-# BP-COMPRESSION-BOTH: Ordered 18 sections using balanced partitioning
+# BP-COMPRESSION-FUNC: Ordered 9 sections ([[#]] bytes) using balanced partitioning
+# BP-COMPRESSION-ICF-FUNC: Ordered 8 sections ([[#]] bytes) using balanced partitioning
+# BP-COMPRESSION-DATA: Ordered 9 sections ([[#]] bytes) using balanced partitioning
+# BP-COMPRESSION-BOTH: Ordered 18 sections ([[#]] bytes) using balanced partitioning
 
 #--- a.proftext
 :ir
diff --git a/lld/test/MachO/bp-section-orderer.s b/lld/test/MachO/bp-section-orderer.s
index 93d0b85731de..90924e5797b6 100644
--- a/lld/test/MachO/bp-section-orderer.s
+++ b/lld/test/MachO/bp-section-orderer.s
@@ -9,8 +9,8 @@
 
 # RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile %t/a.profdata --bp-startup-sort=function --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
 # RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile=%t/a.profdata --bp-startup-sort=function --verbose-bp-section-orderer --icf=all --bp-compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP-ICF
-# STARTUP: Ordered 5 sections using balanced partitioning
-# STARTUP-ICF: Ordered 4 sections using balanced partitioning
+# STARTUP: Ordered 5 sections ([[#]] bytes) using balanced partitioning
+# STARTUP-ICF: Ordered 4 sections ([[#]] bytes) using balanced partitioning
 
 # Check that orderfiles take precedence over BP
 # RUN: %no-fatal-warnings-lld -arch arm64 -lSystem -e _main -o - %t/a.o -order_file %t/a.orderfile --irpgo-profile-sort=%t/a.profdata  | llvm-nm --numeric-sort --format=just-symbols - | FileCheck %s --check-prefix=ORDERFILE
@@ -50,10 +50,10 @@
 # RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --bp-compression-sort=both 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
 # RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --bp-compression-sort=both --irpgo-profile=%t/a.profdata --bp-startup-sort=function 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
 
-# COMPRESSION-FUNC: Ordered 9 sections using balanced partitioning
-# COMPRESSION-ICF-FUNC: Ordered 7 sections using balanced partitioning
-# COMPRESSION-DATA: Ordered 7 sections using balanced partitioning
-# COMPRESSION-BOTH: Ordered 16 sections using balanced partitioning
+# COMPRESSION-FUNC: Ordered 9 sections ([[#]] bytes) using balanced partitioning
+# COMPRESSION-ICF-FUNC: Ordered 7 sections ([[#]] bytes) using balanced partitioning
+# COMPRESSION-DATA: Ordered 7 sections ([[#]] bytes) using balanced partitioning
+# COMPRESSION-BOTH: Ordered 16 sections ([[#]] bytes) using balanced partitioning
 
 #--- a.s
 .text

From b054363645122e131071673bb67657a85f04a146 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 23 Jun 2025 16:24:13 +0100
Subject: [PATCH 1294/1322] [LV] Add tests showing incorrect vector
 interleaving with early exits.

When interleaving is forced for early-exit loops, we currently create
incorrect code.

Test coverage for scalable vectors is added as AArch64 specific test.
---
 .../AArch64/single-early-exit-interleave.ll   |  99 +++
 .../single-early-exit-interleave.ll           | 724 ++++++++++++++++++
 2 files changed, 823 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
new file mode 100644
index 000000000000..9dfe70ddf1b0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 4
+; RUN: opt -p loop-vectorize -enable-early-exit-vectorization -force-vector-interleave=4 -S %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @init_mem(ptr, i64);
+
+define i64 @same_exit_block_pre_inc_use1() #0 {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 510, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 64
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 510, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 510, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 64
+; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.split:
+; CHECK-NEXT:    br i1 [[TMP12]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP11]], i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 3, [[TMP16]]
+; CHECK-NEXT:    br label [[LOOP_END]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ [[INDEX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC1]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 513
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ], [ 67, [[LOOP_INC]] ], [ [[TMP17]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 513
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,16) }
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
new file mode 100644
index 000000000000..1f8cfa1bfd11
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
@@ -0,0 +1,724 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 4
+; RUN: opt -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 -force-vector-interleave=4 -S %s | FileCheck --check-prefix=VF4IC4 %s
+
+declare void @init_mem(ptr, i64);
+
+; FIXME: Computes incorrect final IV when early exit is taken.
+define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
+; VF4IC4-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
+; VF4IC4-NEXT:  entry:
+; VF4IC4-NEXT:    [[SRC:%.*]] = alloca [128 x i32], align 4
+; VF4IC4-NEXT:    call void @init_mem(ptr [[SRC]])
+; VF4IC4-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF4IC4:       vector.ph:
+; VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF4IC4:       vector.body:
+; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; VF4IC4-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10)
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; VF4IC4-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
+; VF4IC4-NEXT:    br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP3]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 true, label [[E2:%.*]], label [[SCALAR_PH]]
+; VF4IC4:       vector.early.exit:
+; VF4IC4-NEXT:    br label [[E1:%.*]]
+; VF4IC4:       scalar.ph:
+; VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT:    br label [[LOOP_HEADER:%.*]]
+; VF4IC4:       loop.header:
+; VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; VF4IC4-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
+; VF4IC4-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; VF4IC4-NEXT:    [[C_1:%.*]] = icmp eq i32 [[L]], 10
+; VF4IC4-NEXT:    br i1 [[C_1]], label [[E1]], label [[LOOP_LATCH]]
+; VF4IC4:       loop.latch:
+; VF4IC4-NEXT:    [[INC]] = add nuw i64 [[IV]], 1
+; VF4IC4-NEXT:    [[C_2:%.*]] = icmp eq i64 [[INC]], 128
+; VF4IC4-NEXT:    br i1 [[C_2]], label [[E2]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4IC4:       e1:
+; VF4IC4-NEXT:    [[P1:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ 0, [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT:    ret i64 [[P1]]
+; VF4IC4:       e2:
+; VF4IC4-NEXT:    [[P2:%.*]] = phi i64 [ 1, [[LOOP_LATCH]] ], [ 1, [[MIDDLE_BLOCK]] ]
+; VF4IC4-NEXT:    ret i64 [[P2]]
+;
+entry:
+  %src = alloca [128 x i32]
+  call void @init_mem(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %c.1 = icmp eq i32 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header
+
+e1:
+  %p1 = phi i64 [ 0, %loop.header ]
+  ret i64 %p1
+
+e2:
+  %p2 = phi i64 [ 1, %loop.latch ]
+  ret i64 %p2
+}
+
+; FIXME: Computes incorrect final IV when early exit is taken.
+define i64 @same_exit_block_pre_inc_use1() {
+; VF4IC4-LABEL: define i64 @same_exit_block_pre_inc_use1() {
+; VF4IC4-NEXT:  entry:
+; VF4IC4-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; VF4IC4-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF4IC4:       vector.ph:
+; VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF4IC4:       vector.body:
+; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
+; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4:       vector.early.exit:
+; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
+; VF4IC4-NEXT:    br label [[LOOP_END]]
+; VF4IC4:       scalar.ph:
+; VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT:    br label [[LOOP:%.*]]
+; VF4IC4:       loop:
+; VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; VF4IC4-NEXT:    [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1
+; VF4IC4-NEXT:    [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1
+; VF4IC4-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; VF4IC4-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; VF4IC4:       loop.inc:
+; VF4IC4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF4IC4-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 67
+; VF4IC4-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF4IC4:       loop.end:
+; VF4IC4-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop.inc ], [ 3, %entry ]
+  %gep.p1 = getelementptr inbounds i8, ptr %p1, i64 %iv
+  %ld1 = load i8, ptr %gep.p1, align 1
+  %gep.p2 = getelementptr inbounds i8, ptr %p2, i64 %iv
+  %ld2 = load i8, ptr %gep.p2, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %iv, %loop ], [ 67, %loop.inc ]
+  ret i64 %retval
+}
+
+; FIXME: Computes incorrect final IV when early exit is taken.
+define ptr @same_exit_block_pre_inc_use1_ivptr() {
+; VF4IC4-LABEL: define ptr @same_exit_block_pre_inc_use1_ivptr() {
+; VF4IC4-NEXT:  entry:
+; VF4IC4-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; VF4IC4-NEXT:    [[PTREND:%.*]] = getelementptr i8, ptr [[P1]], i64 1024
+; VF4IC4-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF4IC4:       vector.ph:
+; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[P1]], i64 1024
+; VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF4IC4:       vector.body:
+; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF4IC4-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1]], i64 [[INDEX]]
+; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[TMP2:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72)
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4IC4-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
+; VF4IC4-NEXT:    br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP3]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4:       vector.early.exit:
+; VF4IC4-NEXT:    [[TMP6:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true)
+; VF4IC4-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]]
+; VF4IC4-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP7]]
+; VF4IC4-NEXT:    br label [[LOOP_END]]
+; VF4IC4:       scalar.ph:
+; VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[P1]], [[ENTRY:%.*]] ]
+; VF4IC4-NEXT:    br label [[LOOP:%.*]]
+; VF4IC4:       loop:
+; VF4IC4-NEXT:    [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; VF4IC4-NEXT:    [[LD1:%.*]] = load i8, ptr [[PTR]], align 1
+; VF4IC4-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], 72
+; VF4IC4-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; VF4IC4:       loop.inc:
+; VF4IC4-NEXT:    [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
+; VF4IC4-NEXT:    [[EXITCOND:%.*]] = icmp ne ptr [[PTR_NEXT]], [[PTREND]]
+; VF4IC4-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF4IC4:       loop.end:
+; VF4IC4-NEXT:    [[RETVAL:%.*]] = phi ptr [ [[PTR]], [[LOOP]] ], [ [[PTREND]], [[LOOP_INC]] ], [ [[PTREND]], [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT:    ret ptr [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  %ptrend = getelementptr i8, ptr %p1, i64 1024
+  br label %loop
+
+loop:
+  %ptr = phi ptr [ %ptr.next, %loop.inc ], [ %p1, %entry ]
+  %ld1 = load i8, ptr %ptr, align 1
+  %cmp3 = icmp eq i8 %ld1, 72
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %ptr.next = getelementptr inbounds i8, ptr %ptr, i64 1
+  %exitcond = icmp ne ptr %ptr.next, %ptrend
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi ptr [ %ptr, %loop ], [ %ptrend, %loop.inc ]
+  ret ptr %retval
+}
+
+; FIXME: Computes incorrect final IV when early exit is taken.
+define i64 @same_exit_block_post_inc_use() {
+; VF4IC4-LABEL: define i64 @same_exit_block_post_inc_use() {
+; VF4IC4-NEXT:  entry:
+; VF4IC4-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; VF4IC4-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF4IC4:       vector.ph:
+; VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF4IC4:       vector.body:
+; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
+; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4:       vector.early.exit:
+; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
+; VF4IC4-NEXT:    br label [[LOOP_END]]
+; VF4IC4:       scalar.ph:
+; VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT:    br label [[LOOP:%.*]]
+; VF4IC4:       loop:
+; VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; VF4IC4-NEXT:    [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1
+; VF4IC4-NEXT:    [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1
+; VF4IC4-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; VF4IC4-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; VF4IC4:       loop.inc:
+; VF4IC4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF4IC4-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 67
+; VF4IC4-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF4IC4:       loop.end:
+; VF4IC4-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ [[IV_NEXT]], [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop.inc ], [ 3, %entry ]
+  %gep.p1 = getelementptr inbounds i8, ptr %p1, i64 %iv
+  %ld1 = load i8, ptr %gep.p1, align 1
+  %gep.p2 = getelementptr inbounds i8, ptr %p2, i64 %iv
+  %ld2 = load i8, ptr %gep.p2, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %iv, %loop ], [ %iv.next, %loop.inc ]
+  ret i64 %retval
+}
+
+; FIXME: Computes incorrect final IV when early exit is taken.
+define i64 @diff_exit_block_pre_inc_use1() {
+; VF4IC4-LABEL: define i64 @diff_exit_block_pre_inc_use1() {
+; VF4IC4-NEXT:  entry:
+; VF4IC4-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; VF4IC4-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF4IC4:       vector.ph:
+; VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF4IC4:       vector.body:
+; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
+; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4:       vector.early.exit:
+; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
+; VF4IC4-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
+; VF4IC4:       scalar.ph:
+; VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT:    br label [[LOOP:%.*]]
+; VF4IC4:       loop:
+; VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; VF4IC4-NEXT:    [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1
+; VF4IC4-NEXT:    [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1
+; VF4IC4-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; VF4IC4-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
+; VF4IC4:       loop.inc:
+; VF4IC4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF4IC4-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 67
+; VF4IC4-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP11:![0-9]+]]
+; VF4IC4:       loop.early.exit:
+; VF4IC4-NEXT:    [[RETVAL1:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT:    ret i64 [[RETVAL1]]
+; VF4IC4:       loop.end:
+; VF4IC4-NEXT:    [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; VF4IC4-NEXT:    ret i64 [[RETVAL2]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop.inc ], [ 3, %entry ]
+  %gep.p1 = getelementptr inbounds i8, ptr %p1, i64 %iv
+  %ld1 = load i8, ptr %gep.p1, align 1
+  %gep.p2 = getelementptr inbounds i8, ptr %p2, i64 %iv
+  %ld2 = load i8, ptr %gep.p2, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ %iv, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ 67, %loop.inc ]
+  ret i64 %retval2
+}
+
+; FIXME: Computes incorrect final IV when early exit is taken.
+define i64 @diff_exit_block_post_inc_use1() {
+; VF4IC4-LABEL: define i64 @diff_exit_block_post_inc_use1() {
+; VF4IC4-NEXT:  entry:
+; VF4IC4-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; VF4IC4-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF4IC4:       vector.ph:
+; VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF4IC4:       vector.body:
+; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
+; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4:       vector.early.exit:
+; VF4IC4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; VF4IC4-NEXT:    [[TMP10:%.*]] = add i64 3, [[TMP9]]
+; VF4IC4-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
+; VF4IC4:       scalar.ph:
+; VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT:    br label [[LOOP:%.*]]
+; VF4IC4:       loop:
+; VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; VF4IC4-NEXT:    [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1
+; VF4IC4-NEXT:    [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1
+; VF4IC4-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; VF4IC4-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
+; VF4IC4:       loop.inc:
+; VF4IC4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF4IC4-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 67
+; VF4IC4-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP13:![0-9]+]]
+; VF4IC4:       loop.early.exit:
+; VF4IC4-NEXT:    [[RETVAL1:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT:    ret i64 [[RETVAL1]]
+; VF4IC4:       loop.end:
+; VF4IC4-NEXT:    [[RETVAL2:%.*]] = phi i64 [ [[IV_NEXT]], [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ]
+; VF4IC4-NEXT:    ret i64 [[RETVAL2]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop.inc ], [ 3, %entry ]
+  %gep.p1 = getelementptr inbounds i8, ptr %p1, i64 %iv
+  %ld1 = load i8, ptr %gep.p1, align 1
+  %gep.p2 = getelementptr inbounds i8, ptr %p2, i64 %iv
+  %ld2 = load i8, ptr %gep.p2, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ %iv, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ %iv.next, %loop.inc ]
+  ret i64 %retval2
+}
+
+; FIXME: Computes incorrect final IV when early exit is taken.
+define i64 @same_exit_block_pre_inc_use1_reverse() {
+; VF4IC4-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() {
+; VF4IC4-NEXT:  entry:
+; VF4IC4-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; VF4IC4-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF4IC4:       vector.ph:
+; VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF4IC4:       vector.body:
+; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; VF4IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE2]]
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
+; VF4IC4-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; VF4IC4-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4:       vector.early.exit:
+; VF4IC4-NEXT:    [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; VF4IC4-NEXT:    [[TMP12:%.*]] = sub i64 1023, [[TMP11]]
+; VF4IC4-NEXT:    br label [[LOOP_END]]
+; VF4IC4:       scalar.ph:
+; VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 15, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT:    br label [[LOOP:%.*]]
+; VF4IC4:       loop:
+; VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; VF4IC4-NEXT:    [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1
+; VF4IC4-NEXT:    [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1
+; VF4IC4-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; VF4IC4-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; VF4IC4:       loop.inc:
+; VF4IC4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
+; VF4IC4-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF4IC4-NEXT:    br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; VF4IC4:       loop.end:
+; VF4IC4-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ], [ 1024, [[MIDDLE_BLOCK]] ], [ [[TMP12]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop.inc ], [ 1023, %entry ]
+  %gep.p1 = getelementptr inbounds i8, ptr %p1, i64 %iv
+  %ld1 = load i8, ptr %gep.p1, align 1
+  %gep.p2 = getelementptr inbounds i8, ptr %p2, i64 %iv
+  %ld2 = load i8, ptr %gep.p2, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %iv.next = add i64 %iv, -1
+  %exitcond = icmp eq i64 %iv.next, 0
+  br i1 %exitcond, label %loop.end, label %loop
+
+loop.end:
+  %retval = phi i64 [ %iv, %loop ], [ 1024, %loop.inc ]
+  ret i64 %retval
+}
+
+; FIXME: Extracts incorrect lane if early exit is taken.
+define i8 @same_exit_block_use_loaded_value() {
+; VF4IC4-LABEL: define i8 @same_exit_block_use_loaded_value() {
+; VF4IC4-NEXT:  entry:
+; VF4IC4-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; VF4IC4-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF4IC4:       vector.ph:
+; VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF4IC4:       vector.body:
+; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4IC4-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF4IC4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4:       vector.early.exit:
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; VF4IC4-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[WIDE_LOAD]], i64 [[FIRST_ACTIVE_LANE]]
+; VF4IC4-NEXT:    br label [[LOOP_END]]
+; VF4IC4:       scalar.ph:
+; VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT:    br label [[LOOP:%.*]]
+; VF4IC4:       loop:
+; VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; VF4IC4-NEXT:    [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1
+; VF4IC4-NEXT:    [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1
+; VF4IC4-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; VF4IC4-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; VF4IC4:       loop.inc:
+; VF4IC4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; VF4IC4-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
+; VF4IC4-NEXT:    br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; VF4IC4:       loop.end:
+; VF4IC4-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT:    ret i8 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop.inc ], [ 0, %entry ]
+  %gep.p1 = getelementptr inbounds i8, ptr %p1, i64 %iv
+  %ld1 = load i8, ptr %gep.p1, align 1
+  %gep.p2 = getelementptr inbounds i8, ptr %p2, i64 %iv
+  %ld2 = load i8, ptr %gep.p2, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond, label %loop.end, label %loop
+
+loop.end:
+  %retval = phi i8 [ %ld1, %loop ], [ -1, %loop.inc ]
+  ret i8 %retval
+}
+
+; FIXME: Extracts incorrect lane if early exit is taken.
+define i8 @same_exit_block_reverse_use_loaded_value() {
+; VF4IC4-LABEL: define i8 @same_exit_block_reverse_use_loaded_value() {
+; VF4IC4-NEXT:  entry:
+; VF4IC4-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; VF4IC4-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; VF4IC4-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VF4IC4:       vector.ph:
+; VF4IC4-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VF4IC4:       vector.body:
+; VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF4IC4-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; VF4IC4-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; VF4IC4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
+; VF4IC4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; VF4IC4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
+; VF4IC4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; VF4IC4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE2]]
+; VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; VF4IC4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
+; VF4IC4-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; VF4IC4-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; VF4IC4:       middle.split:
+; VF4IC4-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4:       middle.block:
+; VF4IC4-NEXT:    br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4:       vector.early.exit:
+; VF4IC4-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
+; VF4IC4-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[REVERSE]], i64 [[FIRST_ACTIVE_LANE]]
+; VF4IC4-NEXT:    br label [[LOOP_END]]
+; VF4IC4:       scalar.ph:
+; VF4IC4-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 15, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT:    br label [[LOOP:%.*]]
+; VF4IC4:       loop:
+; VF4IC4-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; VF4IC4-NEXT:    [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1
+; VF4IC4-NEXT:    [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]]
+; VF4IC4-NEXT:    [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1
+; VF4IC4-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; VF4IC4-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; VF4IC4:       loop.inc:
+; VF4IC4-NEXT:    [[IV_NEXT]] = add i64 [[IV]], -1
+; VF4IC4-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF4IC4-NEXT:    br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; VF4IC4:       loop.end:
+; VF4IC4-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT:    ret i8 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop.inc ], [ 1023, %entry ]
+  %gep.p1 = getelementptr inbounds i8, ptr %p1, i64 %iv
+  %ld1 = load i8, ptr %gep.p1, align 1
+  %gep.p2 = getelementptr inbounds i8, ptr %p2, i64 %iv
+  %ld2 = load i8, ptr %gep.p2, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %iv.next = add i64 %iv, -1
+  %exitcond = icmp eq i64 %iv.next, 0
+  br i1 %exitcond, label %loop.end, label %loop
+
+loop.end:
+  %retval = phi i8 [ %ld1, %loop ], [ -1, %loop.inc ]
+  ret i8 %retval
+}

From ff8049a23e466f7a5e3f3b8f949f543ea25ddb92 Mon Sep 17 00:00:00 2001
From: Tobias Stadler <mail@stadler-tobias.de>
Date: Mon, 23 Jun 2025 16:30:43 +0100
Subject: [PATCH 1295/1322] [InlineCost] Allow simplifying to non-Constant
 values (NFCI) (#145083)

Allow mapping callee Values to arbitrary (non-Constant) simplified
values. The simplified values can also originate from the caller. This
enables us to simplify instructions in the callee with instructions from
the caller.

The first use case for this is simplifying extractvalues (PR #145054).
---
 llvm/lib/Analysis/InlineCost.cpp | 76 +++++++++++++++++++++-----------
 1 file changed, 51 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 7bd1f1800458..fe1ceb74429c 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -391,7 +391,8 @@ protected:
   /// likely simplifications post-inlining. The most important aspect we track
   /// is CFG altering simplifications -- when we prove a basic block dead, that
   /// can cause dramatic shifts in the cost of inlining a function.
-  DenseMap<Value *, Constant *> SimplifiedValues;
+  /// Note: The simplified Value may be owned by the caller function.
+  DenseMap<Value *, Value *> SimplifiedValues;
 
   /// Keep track of the values which map back (through function arguments) to
   /// allocas on the caller stack which could be simplified through SROA.
@@ -432,7 +433,7 @@ protected:
   template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
     if (auto *Direct = dyn_cast<T>(V))
       return Direct;
-    return dyn_cast_if_present<T>(SimplifiedValues.lookup(V));
+    return getSimplifiedValue<T>(V);
   }
 
   // Custom simplification helper routines.
@@ -525,11 +526,33 @@ public:
 
   InlineResult analyze();
 
-  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
-    auto It = SimplifiedValues.find(I);
-    if (It != SimplifiedValues.end())
-      return It->second;
-    return std::nullopt;
+  /// Lookup simplified Value. May return a value owned by the caller.
+  Value *getSimplifiedValueUnchecked(Value *V) const {
+    return SimplifiedValues.lookup(V);
+  }
+
+  /// Lookup simplified Value, but return nullptr if the simplified value is
+  /// owned by the caller.
+  template <typename T> T *getSimplifiedValue(Value *V) const {
+    Value *SimpleV = SimplifiedValues.lookup(V);
+    if (!SimpleV)
+      return nullptr;
+
+    // Skip checks if we know T is a global. This has a small, but measurable
+    // impact on compile-time.
+    if constexpr (std::is_base_of_v<Constant, T>)
+      return dyn_cast<T>(SimpleV);
+
+    // Make sure the simplified Value is owned by this function
+    if (auto *I = dyn_cast<Instruction>(SimpleV)) {
+      if (I->getFunction() != &F)
+        return nullptr;
+    } else if (auto *Arg = dyn_cast<Argument>(SimpleV)) {
+      if (Arg->getParent() != &F)
+        return nullptr;
+    } else if (!isa<Constant>(SimpleV))
+      return nullptr;
+    return dyn_cast<T>(SimpleV);
   }
 
   // Keep a bunch of stats about the cost savings found so we can print them
@@ -921,12 +944,11 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
         if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
           // Count a conditional branch as savings if it becomes unconditional.
           if (BI->isConditional() &&
-              isa_and_nonnull<ConstantInt>(
-                  SimplifiedValues.lookup(BI->getCondition()))) {
+              getSimplifiedValue<ConstantInt>(BI->getCondition())) {
             CurrentSavings += InstrCost;
           }
         } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
-          if (isa_and_present<ConstantInt>(SimplifiedValues.lookup(SI->getCondition())))
+          if (getSimplifiedValue<ConstantInt>(SI->getCondition()))
             CurrentSavings += InstrCost;
         } else if (Value *V = dyn_cast<Value>(&I)) {
           // Count an instruction as savings if we can fold it.
@@ -1423,10 +1445,17 @@ void InlineCostAnnotationWriter::emitInstructionAnnot(
     if (Record->hasThresholdChanged())
       OS << ", threshold delta = " << Record->getThresholdDelta();
   }
-  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
-  if (C) {
+  auto *V = ICCA->getSimplifiedValueUnchecked(const_cast<Instruction *>(I));
+  if (V) {
     OS << ", simplified to ";
-    (*C)->print(OS, true);
+    V->print(OS, true);
+    if (auto *VI = dyn_cast<Instruction>(V)) {
+      if (VI->getFunction() != I->getFunction())
+        OS << " (caller instruction)";
+    } else if (auto *VArg = dyn_cast<Argument>(V)) {
+      if (VArg->getParent() != I->getFunction())
+        OS << " (caller argument)";
+    }
   }
   OS << "\n";
 }
@@ -1483,7 +1512,7 @@ bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
   SmallVector<Value *, 4> Operands;
   Operands.push_back(GEP.getOperand(0));
   for (const Use &Op : GEP.indices())
-    if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
+    if (Constant *SimpleOp = getSimplifiedValue<Constant>(Op))
       Operands.push_back(SimpleOp);
     else
       Operands.push_back(Op);
@@ -1498,7 +1527,7 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
   // Check whether inlining will turn a dynamic alloca into a static
   // alloca and handle that case.
   if (I.isArrayAllocation()) {
-    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
+    Constant *Size = getSimplifiedValue<Constant>(I.getArraySize());
     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
       // Sometimes a dynamic alloca could be converted into a static alloca
       // after this constant prop, and become a huge static alloca on an
@@ -2388,7 +2417,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
     // Check if this happens to be an indirect function call to a known function
     // in this inline context. If not, we've done all we can.
     Value *Callee = Call.getCalledOperand();
-    F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
+    F = getSimplifiedValue<Function>(Callee);
     if (!F || F->getFunctionType() != Call.getFunctionType()) {
       onCallArgumentSetup(Call);
 
@@ -2483,8 +2512,7 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
 
   Constant *TrueC = getDirectOrSimplifiedValue<Constant>(TrueVal);
   Constant *FalseC = getDirectOrSimplifiedValue<Constant>(FalseVal);
-  Constant *CondC =
-      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
+  Constant *CondC = getSimplifiedValue<Constant>(SI.getCondition());
 
   if (!CondC) {
     // Select C, X, X => X
@@ -2833,8 +2861,9 @@ InlineResult CallAnalyzer::analyze() {
   auto CAI = CandidateCall.arg_begin();
   for (Argument &FAI : F.args()) {
     assert(CAI != CandidateCall.arg_end());
-    if (Constant *C = dyn_cast<Constant>(CAI))
-      SimplifiedValues[&FAI] = C;
+    SimplifiedValues[&FAI] = *CAI;
+    if (isa<Constant>(*CAI))
+      ++NumConstantArgs;
 
     Value *PtrArg = *CAI;
     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
@@ -2849,7 +2878,6 @@ InlineResult CallAnalyzer::analyze() {
     }
     ++CAI;
   }
-  NumConstantArgs = SimplifiedValues.size();
   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
   NumAllocaArgs = SROAArgValues.size();
 
@@ -2911,8 +2939,7 @@ InlineResult CallAnalyzer::analyze() {
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
       if (BI->isConditional()) {
         Value *Cond = BI->getCondition();
-        if (ConstantInt *SimpleCond =
-                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+        if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(Cond)) {
           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
           BBWorklist.insert(NextBB);
           KnownSuccessors[BB] = NextBB;
@@ -2922,8 +2949,7 @@ InlineResult CallAnalyzer::analyze() {
       }
     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
       Value *Cond = SI->getCondition();
-      if (ConstantInt *SimpleCond =
-              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+      if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(Cond)) {
         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
         BBWorklist.insert(NextBB);
         KnownSuccessors[BB] = NextBB;

From 9f7567d33a6c68fe5139678768c0cb4d6e614d5f Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb@igalia.com>
Date: Mon, 23 Jun 2025 16:35:48 +0100
Subject: [PATCH 1296/1322] [PreISelIntrinsicLowering] Reuse previously
 generated GlobalVariable for memset_pattern16 when possible (#144677)

As Constants are already uniquified, we can use a map to keep track of
whether a GlobalVariable was produced for a given Constant or not.
Repeated globals with the same value was one of the codegen differences
noted in #126736. This patch removes that diff, producing cleaner
output.
---
 llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 32 +++++++++++------
 .../X86/memset-pattern.ll                     | 34 ++++++++-----------
 2 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 9d1d70b1cb23..265a32cf4d12 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -68,7 +68,9 @@ struct PreISelIntrinsicLowering {
 
   static bool shouldExpandMemIntrinsicWithSize(Value *Size,
                                                const TargetTransformInfo &TTI);
-  bool expandMemIntrinsicUses(Function &F) const;
+  bool
+  expandMemIntrinsicUses(Function &F,
+                         DenseMap<Constant *, GlobalVariable *> &CMap) const;
   bool lowerIntrinsics(Module &M) const;
 };
 
@@ -295,7 +297,8 @@ static Constant *getMemSetPattern16Value(MemSetPatternInst *Inst,
 
 // TODO: Handle atomic memcpy and memcpy.inline
 // TODO: Pass ScalarEvolution
-bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
+bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
+    Function &F, DenseMap<Constant *, GlobalVariable *> &CMap) const {
   Intrinsic::ID ID = F.getIntrinsicID();
   bool Changed = false;
 
@@ -411,13 +414,20 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
       // global.
       assert(Memset->getRawDest()->getType()->getPointerAddressSpace() == 0 &&
              "Should have skipped if non-zero AS");
-      GlobalVariable *GV = new GlobalVariable(
-          *M, PatternValue->getType(), /*isConstant=*/true,
-          GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern");
-      GV->setUnnamedAddr(
-          GlobalValue::UnnamedAddr::Global); // Ok to merge these.
-      // TODO: Consider relaxing alignment requirement.
-      GV->setAlignment(Align(16));
+      GlobalVariable *GV;
+      auto It = CMap.find(PatternValue);
+      if (It != CMap.end()) {
+        GV = It->second;
+      } else {
+        GV = new GlobalVariable(
+            *M, PatternValue->getType(), /*isConstant=*/true,
+            GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern");
+        GV->setUnnamedAddr(
+            GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+        // TODO: Consider relaxing alignment requirement.
+        GV->setAlignment(Align(16));
+        CMap[PatternValue] = GV;
+      }
       Value *PatternPtr = GV;
       Value *NumBytes = Builder.CreateMul(
           TLI.getAsSizeT(DL.getTypeAllocSize(Memset->getValue()->getType()),
@@ -446,6 +456,8 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
 }
 
 bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
+  // Map unique constants to globals.
+  DenseMap<Constant *, GlobalVariable *> CMap;
   bool Changed = false;
   for (Function &F : M) {
     switch (F.getIntrinsicID()) {
@@ -457,7 +469,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
     case Intrinsic::experimental_memset_pattern:
-      Changed |= expandMemIntrinsicUses(F);
+      Changed |= expandMemIntrinsicUses(F, CMap);
       break;
     case Intrinsic::load_relative:
       Changed |= lowerLoadRelative(F);
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
index 64cc4ba28163..aaca5a6c87b4 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
@@ -4,16 +4,10 @@
 ;.
 ; CHECK: @G = global i32 5
 ; CHECK: @.memset_pattern = private unnamed_addr constant [2 x ptr] [ptr @G, ptr @G], align 16
-; CHECK: @.memset_pattern.1 = private unnamed_addr constant [2 x ptr] [ptr @G, ptr @G], align 16
-; CHECK: @.memset_pattern.2 = private unnamed_addr constant [2 x i64] [i64 -6148895925951734307, i64 -6148895925951734307], align 16
-; CHECK: @.memset_pattern.3 = private unnamed_addr constant [2 x i64] [i64 -6148895925951734307, i64 -6148895925951734307], align 16
-; CHECK: @.memset_pattern.4 = private unnamed_addr constant [2 x i64] [i64 -6148895925951734307, i64 -6148895925951734307], align 16
-; CHECK: @.memset_pattern.5 = private unnamed_addr constant [2 x i64] [i64 4614256656552045848, i64 4614256656552045848], align 16
-; CHECK: @.memset_pattern.6 = private unnamed_addr constant [8 x i16] [i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555], align 16
-; CHECK: @.memset_pattern.7 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
-; CHECK: @.memset_pattern.8 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
-; CHECK: @.memset_pattern.9 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
-; CHECK: @.memset_pattern.10 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
+; CHECK: @.memset_pattern.1 = private unnamed_addr constant [2 x i64] [i64 -6148895925951734307, i64 -6148895925951734307], align 16
+; CHECK: @.memset_pattern.2 = private unnamed_addr constant [2 x i64] [i64 4614256656552045848, i64 4614256656552045848], align 16
+; CHECK: @.memset_pattern.3 = private unnamed_addr constant [8 x i16] [i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555], align 16
+; CHECK: @.memset_pattern.4 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
 ;.
 define void @memset_pattern_i128_1_dynvalue(ptr %a, i128 %value) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_1_dynvalue(
@@ -36,7 +30,7 @@ define void @memset_pattern_i128_1_dynvalue(ptr %a, i128 %value) nounwind {
 define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_1(
 ; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.7, i64 16)
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.4, i64 16)
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
@@ -64,7 +58,7 @@ define void @memset_pattern_i128_1_nz_as(ptr addrspace(1) %a, i128 %value) nounw
 define void @memset_pattern_i128_1_align_attr(ptr align(16) %a, i128 %value) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_1_align_attr(
 ; CHECK-SAME: ptr align 16 [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @memset_pattern16(ptr align 16 [[A]], ptr @.memset_pattern.8, i64 16)
+; CHECK-NEXT:    call void @memset_pattern16(ptr align 16 [[A]], ptr @.memset_pattern.4, i64 16)
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr align(16) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
@@ -74,7 +68,7 @@ define void @memset_pattern_i128_1_align_attr(ptr align(16) %a, i128 %value) nou
 define void @memset_pattern_i128_16(ptr %a) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_16(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.9, i64 256)
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.4, i64 256)
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 16, i1 false)
@@ -85,7 +79,7 @@ define void @memset_pattern_i128_x(ptr %a, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_x(
 ; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 16, [[X]]
-; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.10, i64 [[TMP1]])
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.4, i64 [[TMP1]])
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %x, i1 false)
@@ -115,7 +109,7 @@ define void @memset_pattern_i16_x(ptr %a, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i16_x(
 ; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 2, [[X]]
-; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.6, i64 [[TMP1]])
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.3, i64 [[TMP1]])
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr %a, i16 u0xabcd, i64 %x, i1 false)
@@ -126,7 +120,7 @@ define void @memset_pattern_i64_x(ptr %a, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i64_x(
 ; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 8, [[X]]
-; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.4, i64 [[TMP1]])
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.1, i64 [[TMP1]])
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0xaaaabbbbccccdddd, i64 %x, i1 false)
@@ -137,7 +131,7 @@ define void @memset_pattern_i64_x(ptr %a, i64 %x) nounwind {
 define void @memset_pattern_i64_128_tbaa(ptr %a) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i64_128_tbaa(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.5, i64 1024), !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.2, i64 1024), !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0x400921fb54442d18, i64 128, i1 false), !tbaa !5
@@ -154,7 +148,7 @@ define void @memset_pattern_i64_narrow_idx(ptr %a, i32 %x) nounwind {
 ; CHECK-SAME: ptr [[A:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[X]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 8, [[TMP1]]
-; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.3, i64 [[TMP2]])
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.1, i64 [[TMP2]])
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0xaaaabbbbccccdddd, i32 %x, i1 false)
@@ -166,7 +160,7 @@ define void @memset_pattern_i64_wide_idx(ptr %a, i128 %x) nounwind {
 ; CHECK-SAME: ptr [[A:%.*]], i128 [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i128 [[X]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 8, [[TMP1]]
-; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.2, i64 [[TMP2]])
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.1, i64 [[TMP2]])
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0xaaaabbbbccccdddd, i128 %x, i1 false)
@@ -189,7 +183,7 @@ define void @memset_pattern_i64_x_fromptr(ptr %a, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i64_x_fromptr(
 ; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 8, [[X]]
-; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.1, i64 [[TMP2]])
+; CHECK-NEXT:    call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern, i64 [[TMP2]])
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.experimental.memset.pattern(ptr %a, ptr @G, i64 %x, i1 false)

From 1128a4fd2c3a70ba61eead2ce093a9c31aa2970e Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 23 Jun 2025 17:37:33 +0200
Subject: [PATCH 1297/1322] [HLSL] Don't use CreateRuntimeFunction for
 intrinsics (#145334)

HLSL uses CreateRuntimeFunction for three intrinsics. This is pretty
unusual thing to do, and doesn't match what the rest of the file does.

I suspect this might be because these are convergent calls, but the
intrinsics themselves are already marked convergent, so it's not
necessary for clang to manually add the attribute.

This does lose the spir_func CC on the intrinsic declaration, but again,
CC should not be relevant to intrinsics at all.
---
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          | 37 +++++--------------
 .../CodeGenHLSL/builtins/WaveActiveMax.hlsl   |  6 +--
 .../CodeGenHLSL/builtins/WaveActiveSum.hlsl   |  6 +--
 .../CodeGenHLSL/builtins/WaveReadLaneAt.hlsl  | 12 +++---
 4 files changed, 21 insertions(+), 40 deletions(-)

diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index 2a60a0909c93..58165185b671 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -676,35 +676,23 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
   case Builtin::BI__builtin_hlsl_wave_active_sum: {
     // Due to the use of variadic arguments, explicitly retreive argument
     Value *OpExpr = EmitScalarExpr(E->getArg(0));
-    llvm::FunctionType *FT = llvm::FunctionType::get(
-        OpExpr->getType(), ArrayRef{OpExpr->getType()}, false);
     Intrinsic::ID IID = getWaveActiveSumIntrinsic(
         getTarget().getTriple().getArch(), CGM.getHLSLRuntime(),
         E->getArg(0)->getType());
 
-    // Get overloaded name
-    std::string Name =
-        Intrinsic::getName(IID, ArrayRef{OpExpr->getType()}, &CGM.getModule());
-    return EmitRuntimeCall(CGM.CreateRuntimeFunction(FT, Name, {},
-                                                     /*Local=*/false,
-                                                     /*AssumeConvergent=*/true),
+    return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration(
+                               &CGM.getModule(), IID, {OpExpr->getType()}),
                            ArrayRef{OpExpr}, "hlsl.wave.active.sum");
   }
   case Builtin::BI__builtin_hlsl_wave_active_max: {
     // Due to the use of variadic arguments, explicitly retreive argument
     Value *OpExpr = EmitScalarExpr(E->getArg(0));
-    llvm::FunctionType *FT = llvm::FunctionType::get(
-        OpExpr->getType(), ArrayRef{OpExpr->getType()}, false);
     Intrinsic::ID IID = getWaveActiveMaxIntrinsic(
         getTarget().getTriple().getArch(), CGM.getHLSLRuntime(),
         E->getArg(0)->getType());
 
-    // Get overloaded name
-    std::string Name =
-        Intrinsic::getName(IID, ArrayRef{OpExpr->getType()}, &CGM.getModule());
-    return EmitRuntimeCall(CGM.CreateRuntimeFunction(FT, Name, {},
-                                                     /*Local=*/false,
-                                                     /*AssumeConvergent=*/true),
+    return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration(
+                               &CGM.getModule(), IID, {OpExpr->getType()}),
                            ArrayRef{OpExpr}, "hlsl.wave.active.max");
   }
   case Builtin::BI__builtin_hlsl_wave_get_lane_index: {
@@ -739,18 +727,11 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     // create our function type.
     Value *OpExpr = EmitScalarExpr(E->getArg(0));
     Value *OpIndex = EmitScalarExpr(E->getArg(1));
-    llvm::FunctionType *FT = llvm::FunctionType::get(
-        OpExpr->getType(), ArrayRef{OpExpr->getType(), OpIndex->getType()},
-        false);
-
-    // Get overloaded name
-    std::string Name =
-        Intrinsic::getName(CGM.getHLSLRuntime().getWaveReadLaneAtIntrinsic(),
-                           ArrayRef{OpExpr->getType()}, &CGM.getModule());
-    return EmitRuntimeCall(CGM.CreateRuntimeFunction(FT, Name, {},
-                                                     /*Local=*/false,
-                                                     /*AssumeConvergent=*/true),
-                           ArrayRef{OpExpr, OpIndex}, "hlsl.wave.readlane");
+    return EmitRuntimeCall(
+        Intrinsic::getOrInsertDeclaration(
+            &CGM.getModule(), CGM.getHLSLRuntime().getWaveReadLaneAtIntrinsic(),
+            {OpExpr->getType()}),
+        ArrayRef{OpExpr, OpIndex}, "hlsl.wave.readlane");
   }
   case Builtin::BI__builtin_hlsl_elementwise_sign: {
     auto *Arg0 = E->getArg(0);
diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl
index 7891cfc1989a..be05a17cc369 100644
--- a/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl
@@ -16,7 +16,7 @@ int test_int(int expr) {
 }
 
 // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.max.i32([[TY]]) #[[#attr:]]
-// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.reduce.max.i32([[TY]]) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.max.i32([[TY]]) #[[#attr:]]
 
 // CHECK-LABEL: test_uint64_t
 uint64_t test_uint64_t(uint64_t expr) {
@@ -27,7 +27,7 @@ uint64_t test_uint64_t(uint64_t expr) {
 }
 
 // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.umax.i64([[TY]]) #[[#attr:]]
-// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.reduce.umax.i64([[TY]]) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.umax.i64([[TY]]) #[[#attr:]]
 
 // Test basic lowering to runtime function call with array and float value.
 
@@ -40,7 +40,7 @@ float4 test_floatv4(float4 expr) {
 }
 
 // CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.reduce.max.v4f32([[TY1]]) #[[#attr]]
-// CHECK-SPIRV: declare spir_func [[TY1]] @llvm.spv.wave.reduce.max.v4f32([[TY1]]) #[[#attr]]
+// CHECK-SPIRV: declare [[TY1]] @llvm.spv.wave.reduce.max.v4f32([[TY1]]) #[[#attr]]
 
 // CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}}
 
diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl
index 4bf423ccc1b8..1fc93c62c8db 100644
--- a/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl
@@ -16,7 +16,7 @@ int test_int(int expr) {
 }
 
 // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.sum.i32([[TY]]) #[[#attr:]]
-// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.reduce.sum.i32([[TY]]) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.sum.i32([[TY]]) #[[#attr:]]
 
 // CHECK-LABEL: test_uint64_t
 uint64_t test_uint64_t(uint64_t expr) {
@@ -27,7 +27,7 @@ uint64_t test_uint64_t(uint64_t expr) {
 }
 
 // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.reduce.usum.i64([[TY]]) #[[#attr:]]
-// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.reduce.sum.i64([[TY]]) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.reduce.sum.i64([[TY]]) #[[#attr:]]
 
 // Test basic lowering to runtime function call with array and float value.
 
@@ -40,6 +40,6 @@ float4 test_floatv4(float4 expr) {
 }
 
 // CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.reduce.sum.v4f32([[TY1]]) #[[#attr]]
-// CHECK-SPIRV: declare spir_func [[TY1]] @llvm.spv.wave.reduce.sum.v4f32([[TY1]]) #[[#attr]]
+// CHECK-SPIRV: declare [[TY1]] @llvm.spv.wave.reduce.sum.v4f32([[TY1]]) #[[#attr]]
 
 // CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}}
diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
index c94ef8a67735..8c787a42618a 100644
--- a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl
@@ -17,7 +17,7 @@ int test_int(int expr, uint idx) {
 }
 
 // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i32([[TY]], i32) #[[#attr:]]
-// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.i32([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.i32([[TY]], i32) #[[#attr:]]
 
 // CHECK-LABEL: test_uint
 uint test_uint(uint expr, uint idx) {
@@ -38,7 +38,7 @@ int64_t test_int64_t(int64_t expr, uint idx) {
 }
 
 // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i64([[TY]], i32) #[[#attr:]]
-// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.i64([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.i64([[TY]], i32) #[[#attr:]]
 
 // CHECK-LABEL: test_uint64_t
 uint64_t test_uint64_t(uint64_t expr, uint idx) {
@@ -60,7 +60,7 @@ int16_t test_int16(int16_t expr, uint idx) {
 }
 
 // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i16([[TY]], i32) #[[#attr:]]
-// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.i16([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.i16([[TY]], i32) #[[#attr:]]
 
 // CHECK-LABEL: test_uint16
 uint16_t test_uint16(uint16_t expr, uint idx) {
@@ -84,7 +84,7 @@ half test_half(half expr, uint idx) {
 }
 
 // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f16([[TY]], i32) #[[#attr:]]
-// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.f16([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.f16([[TY]], i32) #[[#attr:]]
 
 // CHECK-LABEL: test_double
 double test_double(double expr, uint idx) {
@@ -96,7 +96,7 @@ double test_double(double expr, uint idx) {
 }
 
 // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f64([[TY]], i32) #[[#attr:]]
-// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.f64([[TY]], i32) #[[#attr:]]
+// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.f64([[TY]], i32) #[[#attr:]]
 
 // CHECK-LABEL: test_floatv4
 float4 test_floatv4(float4 expr, uint idx) {
@@ -108,6 +108,6 @@ float4 test_floatv4(float4 expr, uint idx) {
 }
 
 // CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.readlane.v4f32([[TY1]], i32) #[[#attr]]
-// CHECK-SPIRV: declare spir_func [[TY1]] @llvm.spv.wave.readlane.v4f32([[TY1]], i32) #[[#attr]]
+// CHECK-SPIRV: declare [[TY1]] @llvm.spv.wave.readlane.v4f32([[TY1]], i32) #[[#attr]]
 
 // CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}}

From 576085c94855fc1536aa6343b272d9e87b7cb3ed Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 23 Jun 2025 08:55:13 -0700
Subject: [PATCH 1298/1322] [SelectionDAG][RISCV] Add support for splitting
 vp.splice (#145184)

Use a stack based expansion similar to the non-VP splice.

This code has been in our downstream for a while. I don't know how often
it is exercised though. Our downstream was missing clipping for the
immediate value to keep it in range of the stack object so I've added
it.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  75 ++++++++++
 llvm/test/CodeGen/RISCV/rvv/vp-splice.ll      | 141 ++++++++++++++++++
 3 files changed, 217 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a541833684f3..8643ae9d7815 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -985,6 +985,7 @@ private:
   void SplitVecRes_VECTOR_INTERLEAVE(SDNode *N);
   void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_VP_SPLICE(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_GET_ACTIVE_LANE_MASK(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index c56cfec81acd..32c596119545 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1382,6 +1382,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::UDIVFIXSAT:
     SplitVecRes_FIX(N, Lo, Hi);
     break;
+  case ISD::EXPERIMENTAL_VP_SPLICE:
+    SplitVecRes_VP_SPLICE(N, Lo, Hi);
+    break;
   case ISD::EXPERIMENTAL_VP_REVERSE:
     SplitVecRes_VP_REVERSE(N, Lo, Hi);
     break;
@@ -3209,6 +3212,78 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo,
   std::tie(Lo, Hi) = DAG.SplitVector(Load, DL);
 }
 
+void DAGTypeLegalizer::SplitVecRes_VP_SPLICE(SDNode *N, SDValue &Lo,
+                                             SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  int64_t Imm = cast<ConstantSDNode>(N->getOperand(2))->getSExtValue();
+  SDValue Mask = N->getOperand(3);
+  SDValue EVL1 = N->getOperand(4);
+  SDValue EVL2 = N->getOperand(5);
+  SDLoc DL(N);
+
+  // Since EVL2 is considered the real VL it gets promoted during
+  // SelectionDAGBuilder. Promote EVL1 here if needed.
+  if (getTypeAction(EVL1.getValueType()) == TargetLowering::TypePromoteInteger)
+    EVL1 = ZExtPromotedInteger(EVL1);
+
+  Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
+
+  EVT MemVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                               VT.getVectorElementCount() * 2);
+  SDValue StackPtr = DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment);
+  EVT PtrVT = StackPtr.getValueType();
+  auto &MF = DAG.getMachineFunction();
+  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+  MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
+      Alignment);
+  MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
+      Alignment);
+
+  SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, EVL1);
+
+  SDValue TrueMask = DAG.getBoolConstant(true, DL, Mask.getValueType(), VT);
+  SDValue StoreV1 = DAG.getStoreVP(DAG.getEntryNode(), DL, V1, StackPtr,
+                                   DAG.getUNDEF(PtrVT), TrueMask, EVL1,
+                                   V1.getValueType(), StoreMMO, ISD::UNINDEXED);
+
+  SDValue StoreV2 =
+      DAG.getStoreVP(StoreV1, DL, V2, StackPtr2, DAG.getUNDEF(PtrVT), TrueMask,
+                     EVL2, V2.getValueType(), StoreMMO, ISD::UNINDEXED);
+
+  SDValue Load;
+  if (Imm >= 0) {
+    StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VT, N->getOperand(2));
+    Load = DAG.getLoadVP(VT, DL, StoreV2, StackPtr, Mask, EVL2, LoadMMO);
+  } else {
+    uint64_t TrailingElts = -Imm;
+    unsigned EltWidth = VT.getScalarSizeInBits() / 8;
+    SDValue TrailingBytes = DAG.getConstant(TrailingElts * EltWidth, DL, PtrVT);
+
+    // Make sure TrailingBytes doesn't exceed the size of vec1.
+    SDValue OffsetToV2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, StackPtr);
+    TrailingBytes =
+        DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, OffsetToV2);
+
+    // Calculate the start address of the spliced result.
+    StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes);
+    Load = DAG.getLoadVP(VT, DL, StoreV2, StackPtr2, Mask, EVL2, LoadMMO);
+  }
+
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, Load,
+                   DAG.getVectorIdxConstant(0, DL));
+  Hi =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, Load,
+                  DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
+}
+
 void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo,
                                                       SDValue &Hi) {
   SDLoc DL(N);
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
index a4f91c3e7c99..ffeb49398910 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
@@ -286,3 +286,144 @@ define <vscale x 2 x float> @test_vp_splice_nxv2f32_masked(<vscale x 2 x float>
   %v = call <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb)
   ret <vscale x 2 x float> %v
 }
+
+define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) nounwind {
+; CHECK-LABEL: test_vp_splice_nxv16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    slli a5, a4, 1
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    slli a1, a4, 3
+; CHECK-NEXT:    mv a7, a2
+; CHECK-NEXT:    bltu a2, a5, .LBB21_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a7, a5
+; CHECK-NEXT:  .LBB21_2:
+; CHECK-NEXT:    addi sp, sp, -80
+; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    addi s0, sp, 80
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 5
+; CHECK-NEXT:    sub sp, sp, a5
+; CHECK-NEXT:    andi sp, sp, -64
+; CHECK-NEXT:    add a5, a0, a1
+; CHECK-NEXT:    slli a7, a7, 3
+; CHECK-NEXT:    addi a6, sp, 64
+; CHECK-NEXT:    mv t0, a2
+; CHECK-NEXT:    bltu a2, a4, .LBB21_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv t0, a4
+; CHECK-NEXT:  .LBB21_4:
+; CHECK-NEXT:    vl8re64.v v24, (a5)
+; CHECK-NEXT:    add a5, a6, a7
+; CHECK-NEXT:    vl8re64.v v0, (a0)
+; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a6)
+; CHECK-NEXT:    sub a0, a2, a4
+; CHECK-NEXT:    sltu a2, a2, a0
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    add a6, a6, a1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v16, (a6)
+; CHECK-NEXT:    mv a0, a3
+; CHECK-NEXT:    bltu a3, a4, .LBB21_6
+; CHECK-NEXT:  # %bb.5:
+; CHECK-NEXT:    mv a0, a4
+; CHECK-NEXT:  .LBB21_6:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v0, (a5)
+; CHECK-NEXT:    sub a2, a3, a4
+; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    sltu a3, a3, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    addi a3, sp, 104
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v24, (a5)
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a3)
+; CHECK-NEXT:    addi sp, s0, -80
+; CHECK-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    addi sp, sp, 80
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i64> @llvm.experimental.vp.splice.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 5, <vscale x 16 x i1> splat (i1 1), i32 %evla, i32 %evlb)
+  ret <vscale x 16 x i64> %v
+}
+
+define <vscale x 16 x i64> @test_vp_splice_nxv16i64_negative_offset(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) nounwind {
+; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a6, a5, 1
+; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    slli a1, a5, 3
+; CHECK-NEXT:    mv a4, a2
+; CHECK-NEXT:    bltu a2, a6, .LBB22_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a4, a6
+; CHECK-NEXT:  .LBB22_2:
+; CHECK-NEXT:    addi sp, sp, -80
+; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    addi s0, sp, 80
+; CHECK-NEXT:    csrr a6, vlenb
+; CHECK-NEXT:    slli a6, a6, 5
+; CHECK-NEXT:    sub sp, sp, a6
+; CHECK-NEXT:    andi sp, sp, -64
+; CHECK-NEXT:    add a6, a0, a1
+; CHECK-NEXT:    slli a4, a4, 3
+; CHECK-NEXT:    addi a7, sp, 64
+; CHECK-NEXT:    mv t0, a2
+; CHECK-NEXT:    bltu a2, a5, .LBB22_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv t0, a5
+; CHECK-NEXT:  .LBB22_4:
+; CHECK-NEXT:    vl8re64.v v24, (a6)
+; CHECK-NEXT:    add a6, a7, a4
+; CHECK-NEXT:    vl8re64.v v0, (a0)
+; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v8, (a7)
+; CHECK-NEXT:    sub a0, a2, a5
+; CHECK-NEXT:    sltu a2, a2, a0
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    add a7, a7, a1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v16, (a7)
+; CHECK-NEXT:    mv a0, a3
+; CHECK-NEXT:    bltu a3, a5, .LBB22_6
+; CHECK-NEXT:  # %bb.5:
+; CHECK-NEXT:    mv a0, a5
+; CHECK-NEXT:  .LBB22_6:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v0, (a6)
+; CHECK-NEXT:    sub a2, a3, a5
+; CHECK-NEXT:    add a5, a6, a1
+; CHECK-NEXT:    sltu a3, a3, a2
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    li a3, 8
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v24, (a5)
+; CHECK-NEXT:    bltu a4, a3, .LBB22_8
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    li a4, 8
+; CHECK-NEXT:  .LBB22_8:
+; CHECK-NEXT:    sub a2, a6, a4
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a2)
+; CHECK-NEXT:    addi sp, s0, -80
+; CHECK-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    addi sp, sp, 80
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i64> @llvm.experimental.vp.splice.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 -1, <vscale x 16 x i1> splat (i1 1), i32 %evla, i32 %evlb)
+  ret <vscale x 16 x i64> %v
+}

From 3dc9f2da29f7ef36ac804e20f9fb6ee1b868516e Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Mon, 23 Jun 2025 17:58:41 +0200
Subject: [PATCH 1299/1322] Silence "non-void function does not return a value
 in all control paths" in Release builds. NFC

---
 mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp
index 1b9efb105b69..b93128441f2b 100644
--- a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp
+++ b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp
@@ -541,7 +541,7 @@ static mpi::MPI_ReductionOpEnumAttr getMPIReductionOp(ReductionKindAttr kind) {
   case ReductionKind::BitwiseXor:
     return getReductionOp(mpi::MPI_ReductionOpEnum::MPI_BXOR);
   default:
-    assert(false && "Unknown/unsupported reduction kind");
+    llvm_unreachable("Unknown/unsupported reduction kind");
   }
 }
 

From 698e9f56558e20ceb80c08c4a880bed15970b777 Mon Sep 17 00:00:00 2001
From: Ivan Tadeu Ferreira Antunes Filho <antunesi@google.com>
Date: Mon, 23 Jun 2025 12:23:19 -0400
Subject: [PATCH 1300/1322] [lldb] Add support for NoneType to decorator
 skipIfBuildType (#145342)

Currently if cmake_build_type is None we error with `AttributeError:
'NoneType' object has no attribute 'lower'` if the decorator
skipIfBuildType is used. This fixes the issue by first checking that
cmake_build_type is not None.
---
 lldb/packages/Python/lldbsuite/test/decorators.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index a391319ca9b0..a5f58373ede7 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -1155,6 +1155,7 @@ def skipIfBuildType(types: list[str]):
     """
     types = [name.lower() for name in types]
     return unittest.skipIf(
-        configuration.cmake_build_type.lower() in types,
+        configuration.cmake_build_type is not None
+        and configuration.cmake_build_type.lower() in types,
         "skip on {} build type(s)".format(", ".join(types)),
     )

From 8d7a8fcc3ab9f6d4c4a7e4312876fe94ed3d6c4f Mon Sep 17 00:00:00 2001
From: Rahman Lavaee <rahmanl@google.com>
Date: Mon, 23 Jun 2025 09:25:14 -0700
Subject: [PATCH 1301/1322] [SHT_LLVM_BB_ADDR_MAP] Encode and decode callsite
 offsets in a newly-introduced SHT_LLVM_BB_ADDR_MAP version. (#144426)

Recently, we have been looking at some optimizations targeting
individual calls. In particular, we plan to extend the address mapping
technique to map to individual callsites. For example, in this piece of
code for a basic blocks:

```
<BB>:
1200:    lea 0x1(%rcx), %rdx
1204:    callq foo
1209:    cmpq 0x10, %rdx
120d:    ja  L1
```

We want to emit 0x9 as the call site offset for `callq foo` (the offset
from the block entry to right after the call), so that we know if a
sampled address is before the call or after.

This PR implements the decode/encode/emit capability. The Codegen change
will be implemented in a later PR.
---
 llvm/include/llvm/Object/ELFTypes.h           | 22 +++--
 llvm/include/llvm/ObjectYAML/ELFYAML.h        | 15 +++
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |  7 +-
 llvm/lib/Object/ELF.cpp                       | 31 +++++-
 llvm/lib/ObjectYAML/ELFEmitter.cpp            | 13 ++-
 llvm/lib/ObjectYAML/ELFYAML.cpp               |  1 +
 .../tools/llvm-readobj/ELF/bb-addr-map.test   | 10 +-
 llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml | 30 +++---
 llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml | 48 +++++++---
 llvm/tools/llvm-readobj/ELFDumper.cpp         |  2 +
 llvm/tools/obj2yaml/elf2yaml.cpp              | 14 ++-
 llvm/unittests/Object/ELFObjectFileTest.cpp   | 94 ++++++++++++-------
 llvm/unittests/Object/ELFTypesTest.cpp        | 31 +++---
 13 files changed, 222 insertions(+), 96 deletions(-)

diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h
index 87e4dbe44809..d7a468f1116d 100644
--- a/llvm/include/llvm/Object/ELFTypes.h
+++ b/llvm/include/llvm/Object/ELFTypes.h
@@ -831,6 +831,7 @@ struct BBAddrMap {
     bool BrProb : 1;
     bool MultiBBRange : 1;
     bool OmitBBEntries : 1;
+    bool CallsiteOffsets : 1;
 
     bool hasPGOAnalysis() const { return FuncEntryCount || BBFreq || BrProb; }
 
@@ -842,7 +843,8 @@ struct BBAddrMap {
              (static_cast<uint8_t>(BBFreq) << 1) |
              (static_cast<uint8_t>(BrProb) << 2) |
              (static_cast<uint8_t>(MultiBBRange) << 3) |
-             (static_cast<uint8_t>(OmitBBEntries) << 4);
+             (static_cast<uint8_t>(OmitBBEntries) << 4) |
+             (static_cast<uint8_t>(CallsiteOffsets) << 5);
     }
 
     // Decodes from minimum bit width representation and validates no
@@ -851,7 +853,7 @@ struct BBAddrMap {
       Features Feat{
           static_cast<bool>(Val & (1 << 0)), static_cast<bool>(Val & (1 << 1)),
           static_cast<bool>(Val & (1 << 2)), static_cast<bool>(Val & (1 << 3)),
-          static_cast<bool>(Val & (1 << 4))};
+          static_cast<bool>(Val & (1 << 4)), static_cast<bool>(Val & (1 << 5))};
       if (Feat.encode() != Val)
         return createStringError(
             std::error_code(), "invalid encoding for BBAddrMap::Features: 0x%x",
@@ -861,9 +863,10 @@ struct BBAddrMap {
 
     bool operator==(const Features &Other) const {
       return std::tie(FuncEntryCount, BBFreq, BrProb, MultiBBRange,
-                      OmitBBEntries) ==
+                      OmitBBEntries, CallsiteOffsets) ==
              std::tie(Other.FuncEntryCount, Other.BBFreq, Other.BrProb,
-                      Other.MultiBBRange, Other.OmitBBEntries);
+                      Other.MultiBBRange, Other.OmitBBEntries,
+                      Other.CallsiteOffsets);
     }
   };
 
@@ -914,13 +917,18 @@ struct BBAddrMap {
     uint32_t Size = 0;   // Size of the basic block.
     Metadata MD = {false, false, false, false,
                    false}; // Metdata for this basic block.
+    // Offsets of callsites (end of call instructions), relative to the basic
+    // block start.
+    SmallVector<uint32_t, 1> CallsiteOffsets;
 
-    BBEntry(uint32_t ID, uint32_t Offset, uint32_t Size, Metadata MD)
-        : ID(ID), Offset(Offset), Size(Size), MD(MD){};
+    BBEntry(uint32_t ID, uint32_t Offset, uint32_t Size, Metadata MD,
+            SmallVector<uint32_t, 1> CallsiteOffsets)
+        : ID(ID), Offset(Offset), Size(Size), MD(MD),
+          CallsiteOffsets(std::move(CallsiteOffsets)) {}
 
     bool operator==(const BBEntry &Other) const {
       return ID == Other.ID && Offset == Other.Offset && Size == Other.Size &&
-             MD == Other.MD;
+             MD == Other.MD && CallsiteOffsets == Other.CallsiteOffsets;
     }
 
     bool hasReturn() const { return MD.HasReturn; }
diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h
index dfdfa055d65f..e883f2f3e144 100644
--- a/llvm/include/llvm/ObjectYAML/ELFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h
@@ -162,6 +162,7 @@ struct BBAddrMapEntry {
     llvm::yaml::Hex64 AddressOffset;
     llvm::yaml::Hex64 Size;
     llvm::yaml::Hex64 Metadata;
+    std::optional<std::vector<llvm::yaml::Hex64>> CallsiteOffsets;
   };
   uint8_t Version;
   llvm::yaml::Hex8 Feature;
@@ -180,6 +181,20 @@ struct BBAddrMapEntry {
       return 0;
     return BBRanges->front().BaseAddress;
   }
+
+  // Returns if any BB entries have non-empty callsite offsets.
+  bool hasAnyCallsiteOffsets() const {
+    if (!BBRanges)
+      return false;
+    for (const ELFYAML::BBAddrMapEntry::BBRangeEntry &BBR : *BBRanges) {
+      if (!BBR.BBEntries)
+        continue;
+      for (const ELFYAML::BBAddrMapEntry::BBEntry &BBE : *BBR.BBEntries)
+        if (BBE.CallsiteOffsets && !BBE.CallsiteOffsets->empty())
+          return true;
+    }
+    return false;
+  }
 };
 
 struct PGOAnalysisMapEntry {
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 3fc550c6dd1a..83673f460698 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1426,9 +1426,12 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges) {
         "BB entries info is required for BBFreq and BrProb "
         "features");
   }
-  return {FuncEntryCountEnabled, BBFreqEnabled, BrProbEnabled,
+  return {FuncEntryCountEnabled,
+          BBFreqEnabled,
+          BrProbEnabled,
           MF.hasBBSections() && NumMBBSectionRanges > 1,
-          static_cast<bool>(BBAddrMapSkipEmitBBEntries)};
+          static_cast<bool>(BBAddrMapSkipEmitBBEntries),
+          false};
 }
 
 void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index e6864ca508a5..6ee33d94ee86 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -837,7 +837,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
       Version = Data.getU8(Cur);
       if (!Cur)
         break;
-      if (Version > 2)
+      if (Version > 3)
         return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " +
                            Twine(static_cast<int>(Version)));
       Feature = Data.getU8(Cur); // Feature byte
@@ -847,12 +847,18 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
       if (!FeatEnableOrErr)
         return FeatEnableOrErr.takeError();
       FeatEnable = *FeatEnableOrErr;
-      if (Feature != 0 && Version < 2 && Cur)
+      if (FeatEnable.hasPGOAnalysis() && Version < 2)
         return createError(
             "version should be >= 2 for SHT_LLVM_BB_ADDR_MAP when "
             "PGO features are enabled: version = " +
             Twine(static_cast<int>(Version)) +
             " feature = " + Twine(static_cast<int>(Feature)));
+      if (FeatEnable.CallsiteOffsets && Version < 3)
+        return createError(
+            "version should be >= 3 for SHT_LLVM_BB_ADDR_MAP when "
+            "callsite offsets feature is enabled: version = " +
+            Twine(static_cast<int>(Version)) +
+            " feature = " + Twine(static_cast<int>(Feature)));
     }
     uint32_t NumBlocksInBBRange = 0;
     uint32_t NumBBRanges = 1;
@@ -893,7 +899,23 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
                             ? readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr)
                             : BlockIndex;
           uint32_t Offset = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
-          uint32_t Size = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
+          // Read the callsite offsets.
+          uint32_t LastCallsiteOffset = 0;
+          SmallVector<uint32_t, 1> CallsiteOffsets;
+          if (FeatEnable.CallsiteOffsets) {
+            uint32_t NumCallsites =
+                readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
+            CallsiteOffsets.reserve(NumCallsites);
+            for (uint32_t CallsiteIndex = 0;
+                 !ULEBSizeErr && Cur && (CallsiteIndex < NumCallsites);
+                 ++CallsiteIndex) {
+              LastCallsiteOffset +=
+                  readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
+              CallsiteOffsets.push_back(LastCallsiteOffset);
+            }
+          }
+          uint32_t Size = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) +
+                          LastCallsiteOffset;
           uint32_t MD = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
           if (Version >= 1) {
             // Offset is calculated relative to the end of the previous BB.
@@ -906,7 +928,8 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
             MetadataDecodeErr = MetadataOrErr.takeError();
             break;
           }
-          BBEntries.push_back({ID, Offset, Size, *MetadataOrErr});
+          BBEntries.push_back(
+              {ID, Offset, Size, *MetadataOrErr, CallsiteOffsets});
         }
         TotalNumBlocks += BBEntries.size();
       }
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index 9ae76a71ede5..6de87a88d006 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -1452,7 +1452,7 @@ void ELFState<ELFT>::writeSectionContent(
   for (const auto &[Idx, E] : llvm::enumerate(*Section.Entries)) {
     // Write version and feature values.
     if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) {
-      if (E.Version > 2)
+      if (E.Version > 3)
         WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: "
                              << static_cast<int>(E.Version)
                              << "; encoding using the most recent version";
@@ -1483,6 +1483,8 @@ void ELFState<ELFT>::writeSectionContent(
     if (!E.BBRanges)
       continue;
     uint64_t TotalNumBlocks = 0;
+    bool EmitCallsiteOffsets =
+        FeatureOrErr->CallsiteOffsets || E.hasAnyCallsiteOffsets();
     for (const ELFYAML::BBAddrMapEntry::BBRangeEntry &BBR : *E.BBRanges) {
       // Write the base address of the range.
       CBA.write<uintX_t>(BBR.BaseAddress, ELFT::Endianness);
@@ -1500,6 +1502,15 @@ void ELFState<ELFT>::writeSectionContent(
         if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP && E.Version > 1)
           SHeader.sh_size += CBA.writeULEB128(BBE.ID);
         SHeader.sh_size += CBA.writeULEB128(BBE.AddressOffset);
+        if (EmitCallsiteOffsets) {
+          size_t NumCallsiteOffsets =
+              BBE.CallsiteOffsets ? BBE.CallsiteOffsets->size() : 0;
+          SHeader.sh_size += CBA.writeULEB128(NumCallsiteOffsets);
+          if (BBE.CallsiteOffsets) {
+            for (uint32_t Offset : *BBE.CallsiteOffsets)
+              SHeader.sh_size += CBA.writeULEB128(Offset);
+          }
+        }
         SHeader.sh_size += CBA.writeULEB128(BBE.Size);
         SHeader.sh_size += CBA.writeULEB128(BBE.Metadata);
       }
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index fc485b6656e0..b28191730f01 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -1883,6 +1883,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry::BBEntry>::mapping(
   IO.mapRequired("AddressOffset", E.AddressOffset);
   IO.mapRequired("Size", E.Size);
   IO.mapRequired("Metadata", E.Metadata);
+  IO.mapOptional("CallsiteOffsets", E.CallsiteOffsets);
 }
 
 void MappingTraits<ELFYAML::PGOAnalysisMapEntry>::mapping(
diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test
index c5d071c11d1d..5d7bc8baa9b2 100644
--- a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test
+++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test
@@ -49,7 +49,8 @@
 # CHECK-NEXT:           {
 # CHECK-NEXT:             ID: 2
 # CHECK-NEXT:             Offset: 0x3
-# CHECK-NEXT:             Size: 0x4
+# CHECK-NEXT:             Callsite Offsets: [1, 3]
+# CHECK-NEXT:             Size: 0x7
 # CHECK-NEXT:             HasReturn: Yes
 # CHECK-NEXT:             HasTailCall: No
 # CHECK-NEXT:             IsEHPad: Yes
@@ -75,7 +76,7 @@
 # CHECK-NEXT:             HasTailCall: No
 # CHECK-NEXT:             IsEHPad: No
 # CHECK-NEXT:             CanFallThrough: Yes
-# CHECK-NEXT:            HasIndirectBranch: No
+# CHECK-NEXT:             HasIndirectBranch: No
 # CHECK-NEXT:           }
 # CHECK-NEXT:         ]
 # CHECK-NEXT:       }
@@ -143,8 +144,8 @@ Sections:
     ShSize: [[SIZE=<none>]]
     Link:   .text
     Entries:
-      - Version: 2
-        Feature: 0x8
+      - Version: 3
+        Feature: 0x28
         BBRanges:
           - BaseAddress: [[ADDR=0x11111]]
             BBEntries:
@@ -158,6 +159,7 @@ Sections:
                 AddressOffset: 0x3
                 Size:          0x4
                 Metadata:      0x15
+                CallsiteOffsets: [ 0x1 , 0x2 ]
       - Version: 2
         BBRanges:
           - BaseAddress: 0x22222
diff --git a/llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml b/llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml
index 8dbf97ef2bc1..861cb9469294 100644
--- a/llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml
+++ b/llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml
@@ -14,7 +14,7 @@
 # VALID-NEXT:   - Name: .llvm_bb_addr_map
 # VALID-NEXT:     Type: SHT_LLVM_BB_ADDR_MAP
 # VALID-NEXT:     Entries:
-# VALID-NEXT:       - Version: 2
+# VALID-NEXT:       - Version: 3
 # VALID-NEXT:         BBRanges:
 ## The 'BaseAddress' field is omitted when it's zero.
 # VALID-NEXT:           - BBEntries:
@@ -30,15 +30,16 @@
 # VALID-NEXT:               AddressOffset: 0xFFFFFFFFFFFFFFF7
 # VALID-NEXT:               Size:          0xFFFFFFFFFFFFFFF8
 # VALID-NEXT:               Metadata:      0xFFFFFFFFFFFFFFF9
-# VALID-NEXT:       - Version: 2
-# VALID-NEXT:         Feature: 0x8
+# VALID-NEXT:       - Version: 3
+# VALID-NEXT:         Feature: 0x28
 # VALID-NEXT:         BBRanges:
 # VALID-NEXT:           - BaseAddress: 0xFFFFFFFFFFFFFF20
 # VALID-NEXT:             BBEntries:
-# VALID-NEXT:               - ID:            6
-# VALID-NEXT:                 AddressOffset: 0xA
-# VALID-NEXT:                 Size:          0xB
-# VALID-NEXT:                 Metadata:      0xC
+# VALID-NEXT:               - ID:              6
+# VALID-NEXT:                 AddressOffset:   0xA
+# VALID-NEXT:                 Size:            0xB
+# VALID-NEXT:                 Metadata:        0xC
+# VALID-NEXT:                 CallsiteOffsets: [ 0x1, 0x2 ]
 
 --- !ELF
 FileHeader:
@@ -50,7 +51,7 @@ Sections:
     Type:   SHT_LLVM_BB_ADDR_MAP
     ShSize: [[SIZE=<none>]]
     Entries:
-      - Version: 2
+      - Version: 3
         Feature: 0x0
         BBRanges:
           - BaseAddress: 0x0
@@ -67,17 +68,18 @@ Sections:
                 AddressOffset: 0xFFFFFFFFFFFFFFF7
                 Size:          0xFFFFFFFFFFFFFFF8
                 Metadata:      0xFFFFFFFFFFFFFFF9
-      - Version:   2
-        Feature:   0x8
+      - Version:   3
+        Feature:   0x28
         NumBBRanges: [[NUMBBRANGES=<none>]]
         BBRanges:
           - BaseAddress:   0xFFFFFFFFFFFFFF20
             NumBlocks: [[NUMBLOCKS=<none>]]
             BBEntries:
-             - ID:            6
-               AddressOffset: 0xA
-               Size:          0xB
-               Metadata:      0xC
+             - ID:              6
+               AddressOffset:   0xA
+               Size:            0xB
+               Metadata:        0xC
+               CallsiteOffsets: [ 0x1, 0x2 ]
 
 ## Check obj2yaml can dump empty .llvm_bb_addr_map sections.
 
diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml
index 709938babffb..9fd0577b1e0f 100644
--- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml
+++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml
@@ -36,7 +36,8 @@
 # Case 4: Specify Entries.
 # CHECK:        Name: .llvm_bb_addr_map (1)
 # CHECK:        SectionData (
-# CHECK-NEXT:     0000: 02002000 00000000 0000010B 010203
+# CHECK-NEXT:     0000: 03002000 00000000 0000010B 01020102
+# CHECK-NEXT:     0010: 0203
 # CHECK-NEXT:   )
 
 # Case 5: Specify Entries and omit the Address field.
@@ -44,28 +45,32 @@
 # CHECK:        Address:
 # CHECK-SAME:   {{^ 0x0$}}
 # CHECK:        SectionData (
-# CHECK-NEXT:     0000: 02000000 00000000 0000010C 010203
+# CHECK-NEXT:     0000: 03000000 00000000 0000010C 010203
 # CHECK-NEXT:   )
 
 # Case 6: Override the NumBlocks field.
 # CHECK:        Name: .llvm_bb_addr_map (1)
 # CHECK:        SectionData (
-# CHECK-NEXT:     0000: 02002000 00000000 0000020D 010203
+# CHECK-NEXT:     0000: 03002000 00000000 0000020D 010203
 # CHECK-NEXT:   )
 
 # Case 7: Specify empty BBRanges.
 # CHECK:        Name: .llvm_bb_addr_map (1)
 # CHECK:        SectionData (
-# CHECK-NEXT:     0000: 020000
+# CHECK-NEXT:     0000: 030000
 # CHECK-NEXT:   )
 
 # Case 8: Specify empty BBRanges with multi-bb-range.
 # CHECK:        Name: .llvm_bb_addr_map (1)
 # CHECK:        SectionData (
-# CHECK-NEXT:     0000: 020800
+# CHECK-NEXT:     0000: 030800
 # CHECK-NEXT:   )
 
-
+# Case 9: Specify empty CallsiteOffsets.
+# CHECK:        Name: .llvm_bb_addr_map (1)
+# CHECK:        SectionData (
+# CHECK-NEXT:     0000: 03202000 00000000 0000010E 01000203
+# CHECK-NEXT:   )
 
 
 --- !ELF
@@ -100,7 +105,7 @@ Sections:
   - Name: '.llvm_bb_addr_map (4)'
     Type: SHT_LLVM_BB_ADDR_MAP
     Entries:
-      - Version: 2
+      - Version: 3
         BBRanges:
           - BaseAddress: 0x0000000000000020
             BBEntries:
@@ -108,13 +113,14 @@ Sections:
                 AddressOffset: 0x00000001
                 Size:          0x00000002
                 Metadata:      0x00000003
+                CallsiteOffsets: [0x1, 0x2]
 
 ## 5) When specifying the description with Entries, the 'Address' field will be
 ##    zero when omitted.
   - Name: '.llvm_bb_addr_map (5)'
     Type: SHT_LLVM_BB_ADDR_MAP
     Entries:
-      - Version: 2
+      - Version: 3
         BBRanges:
           - BBEntries:
             - ID:            12
@@ -127,7 +133,7 @@ Sections:
   - Name: '.llvm_bb_addr_map (6)'
     Type: SHT_LLVM_BB_ADDR_MAP
     Entries:
-      - Version:   2
+      - Version: 3
         BBRanges:
           - BaseAddress:   0x0000000000000020
             NumBlocks: 2
@@ -142,7 +148,7 @@ Sections:
   - Name: '.llvm_bb_addr_map (7)'
     Type: SHT_LLVM_BB_ADDR_MAP
     Entries:
-      - Version: 2
+      - Version: 3
         BBRanges: []
 
 ## 8) We can produce a SHT_LLVM_BB_ADDR_MAP section from a multi-bb-range
@@ -150,10 +156,26 @@ Sections:
   - Name: '.llvm_bb_addr_map (8)'
     Type: SHT_LLVM_BB_ADDR_MAP
     Entries:
-      - Version: 2
+      - Version: 3
         Feature: 0x8
         BBRanges: []
 
+## 9) We can produce a SHT_LLVM_BB_ADDR_MAP section from a description
+##    with empty callsite offsets.
+  - Name: '.llvm_bb_addr_map (9)'
+    Type: SHT_LLVM_BB_ADDR_MAP
+    Entries:
+      - Version: 3
+        Feature: 0x20
+        BBRanges:
+          - BaseAddress: 0x0000000000000020
+            BBEntries:
+             - ID:              14
+               AddressOffset:   0x00000001
+               Size:            0x00000002
+               Metadata:        0x00000003
+               CallsiteOffsets: []
+
 ## Check we can't use Entries at the same time as either Content or Size.
 # RUN: not yaml2obj --docnum=2 -DCONTENT="00" %s 2>&1 | FileCheck %s --check-prefix=INVALID
 # RUN: not yaml2obj --docnum=2 -DSIZE="0" %s 2>&1 | FileCheck %s --check-prefix=INVALID
@@ -175,7 +197,7 @@ Sections:
 
 ## Check that yaml2obj generates a warning when we use unsupported versions.
 # RUN: yaml2obj --docnum=3  %s 2>&1 | FileCheck %s --check-prefix=INVALID-VERSION
-# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 3; encoding using the most recent version
+# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 4; encoding using the most recent version
 
 --- !ELF
 FileHeader:
@@ -187,4 +209,4 @@ Sections:
     Type: SHT_LLVM_BB_ADDR_MAP
     Entries:
 ##  Specify unsupported version
-      - Version: 3
+      - Version: 4
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 7be61dcce841..101079f09e1d 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -7879,6 +7879,8 @@ void LLVMELFDumper<ELFT>::printBBAddrMaps(bool PrettyPGOAnalysis) {
             DictScope BBED(W);
             W.printNumber("ID", BBE.ID);
             W.printHex("Offset", BBE.Offset);
+            if (!BBE.CallsiteOffsets.empty())
+              W.printList("Callsite Offsets", BBE.CallsiteOffsets);
             W.printHex("Size", BBE.Size);
             W.printBoolean("HasReturn", BBE.hasReturn());
             W.printBoolean("HasTailCall", BBE.hasTailCall());
diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp
index c56ed15501b4..53455b8c7580 100644
--- a/llvm/tools/obj2yaml/elf2yaml.cpp
+++ b/llvm/tools/obj2yaml/elf2yaml.cpp
@@ -899,7 +899,7 @@ ELFDumper<ELFT>::dumpBBAddrMapSection(const Elf_Shdr *Shdr) {
   while (Cur && Cur.tell() < Content.size()) {
     if (Shdr->sh_type == ELF::SHT_LLVM_BB_ADDR_MAP) {
       Version = Data.getU8(Cur);
-      if (Cur && Version > 2)
+      if (Cur && Version > 3)
         return createStringError(
             errc::invalid_argument,
             "invalid SHT_LLVM_BB_ADDR_MAP section version: " +
@@ -934,9 +934,19 @@ ELFDumper<ELFT>::dumpBBAddrMapSection(const Elf_Shdr *Shdr) {
            ++BlockIndex) {
         uint32_t ID = Version >= 2 ? Data.getULEB128(Cur) : BlockIndex;
         uint64_t Offset = Data.getULEB128(Cur);
+        std::optional<std::vector<llvm::yaml::Hex64>> CallsiteOffsets;
+        if (FeatureOrErr->CallsiteOffsets) {
+          uint32_t NumCallsites = Data.getULEB128(Cur);
+          CallsiteOffsets = std::vector<llvm::yaml::Hex64>(NumCallsites, 0);
+          for (uint32_t CallsiteIndex = 0; Cur && CallsiteIndex < NumCallsites;
+               ++CallsiteIndex) {
+            (*CallsiteOffsets)[CallsiteIndex] = Data.getULEB128(Cur);
+          }
+        }
         uint64_t Size = Data.getULEB128(Cur);
         uint64_t Metadata = Data.getULEB128(Cur);
-        BBEntries.push_back({ID, Offset, Size, Metadata});
+        BBEntries.push_back(
+            {ID, Offset, Size, Metadata, std::move(CallsiteOffsets)});
       }
       TotalNumBlocks += BBEntries.size();
       BBRanges.push_back({BaseAddress, /*NumBlocks=*/{}, BBEntries});
diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
index 1073df95c379..423f92ea07b3 100644
--- a/llvm/unittests/Object/ELFObjectFileTest.cpp
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -531,7 +531,7 @@ Sections:
   // Check that we can detect unsupported versions.
   SmallString<128> UnsupportedVersionYamlString(CommonYamlString);
   UnsupportedVersionYamlString += R"(
-      - Version: 3
+      - Version: 4
         BBRanges:
           - BaseAddress: 0x11111
             BBEntries:
@@ -543,12 +543,12 @@ Sections:
   {
     SCOPED_TRACE("unsupported version");
     DoCheck(UnsupportedVersionYamlString,
-            "unsupported SHT_LLVM_BB_ADDR_MAP version: 3");
+            "unsupported SHT_LLVM_BB_ADDR_MAP version: 4");
   }
 
   SmallString<128> ZeroBBRangesYamlString(CommonYamlString);
   ZeroBBRangesYamlString += R"(
-      - Version: 2
+      - Version: 3
         Feature: 0x8
         BBRanges: []
 )";
@@ -561,7 +561,7 @@ Sections:
 
   SmallString<128> CommonVersionedYamlString(CommonYamlString);
   CommonVersionedYamlString += R"(
-      - Version: 2
+      - Version: 3
         BBRanges:
           - BaseAddress: 0x11111
             BBEntries:
@@ -669,6 +669,26 @@ Sections:
 )";
   DoCheck(OverLimitNumBBRanges,
           "ULEB128 value at offset 0x2 exceeds UINT32_MAX (0x100000000)");
+
+  // Check that we can detect unsupported version for callsite offsets.
+  SmallString<128> UnsupportedLowVersionYamlString(CommonYamlString);
+  UnsupportedLowVersionYamlString += R"(
+      - Version: 2
+        Feature: 0x20
+        BBRanges:
+          - BBEntries:
+              - AddressOffset:   0x0
+                Size:            0x1
+                Metadata:        0x2
+                CallsiteOffsets: [ 0x1 ]
+)";
+
+  {
+    SCOPED_TRACE("unsupported version");
+    DoCheck(UnsupportedLowVersionYamlString,
+            "version should be >= 3 for SHT_LLVM_BB_ADDR_MAP when callsite"
+            " offsets feature is enabled: version = 2 feature = 32");
+  }
 }
 
 // Test for the ELFObjectFile::readBBAddrMap API.
@@ -684,19 +704,21 @@ Sections:
     Type: SHT_LLVM_BB_ADDR_MAP
     Link: 1
     Entries:
-      - Version: 2
+      - Version: 3
+        Feature: 0x20
         BBRanges:
           - BaseAddress: 0x11111
             BBEntries:
-              - ID:            1
-                AddressOffset: 0x0
-                Size:          0x1
-                Metadata:      0x2
+              - ID:              1
+                AddressOffset:   0x0
+                Size:            0x1
+                Metadata:        0x2
+                CallsiteOffsets: [ 0x1 , 0x1 ]
   - Name: .llvm_bb_addr_map_2
     Type: SHT_LLVM_BB_ADDR_MAP
     Link: 1
     Entries:
-      - Version: 2
+      - Version: 3
         Feature: 0x8
         BBRanges:
           - BaseAddress: 0x22222
@@ -738,14 +760,15 @@ Sections:
 )");
 
   BBAddrMap E1 = {
-      {{0x11111, {{1, 0x0, 0x1, {false, true, false, false, false}}}}}};
+      {{0x11111,
+        {{1, 0x0, 0x3, {false, true, false, false, false}, {0x1, 0x2}}}}}};
   BBAddrMap E2 = {
-      {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}}}},
-       {0xFFFFF, {{15, 0xF0, 0xF1, {true, true, true, true, true}}}}}};
+      {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}, {}}}},
+       {0xFFFFF, {{15, 0xF0, 0xF1, {true, true, true, true, true}, {}}}}}};
   BBAddrMap E3 = {
-      {{0x33333, {{0, 0x0, 0x3, {false, true, true, false, false}}}}}};
+      {{0x33333, {{0, 0x0, 0x3, {false, true, true, false, false}, {}}}}}};
   BBAddrMap E4 = {
-      {{0x44444, {{0, 0x0, 0x4, {false, false, false, true, true}}}}}};
+      {{0x44444, {{0, 0x0, 0x4, {false, false, false, true, true}, {}}}}}};
 
   std::vector<BBAddrMap> Section0BBAddrMaps = {E4};
   std::vector<BBAddrMap> Section1BBAddrMaps = {E3};
@@ -1137,28 +1160,29 @@ Sections:
 )");
 
   BBAddrMap E1 = {
-      {{0x11111, {{1, 0x0, 0x1, {false, true, false, false, false}}}}}};
-  PGOAnalysisMap P1 = {892, {}, {true, false, false, false, false}};
+      {{0x11111, {{1, 0x0, 0x1, {false, true, false, false, false}, {}}}}}};
+  PGOAnalysisMap P1 = {892, {}, {true, false, false, false, false, false}};
   BBAddrMap E2 = {
-      {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}}}}}};
-  PGOAnalysisMap P2 = {
-      {}, {{BlockFrequency(343), {}}}, {false, true, false, false, false}};
+      {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}, {}}}}}};
+  PGOAnalysisMap P2 = {{},
+                       {{BlockFrequency(343), {}}},
+                       {false, true, false, false, false, false}};
   BBAddrMap E3 = {{{0x33333,
-                    {{0, 0x0, 0x3, {false, true, true, false, false}},
-                     {1, 0x3, 0x3, {false, false, true, false, false}},
-                     {2, 0x6, 0x3, {false, false, false, false, false}}}}}};
+                    {{0, 0x0, 0x3, {false, true, true, false, false}, {}},
+                     {1, 0x3, 0x3, {false, false, true, false, false}, {}},
+                     {2, 0x6, 0x3, {false, false, false, false, false}, {}}}}}};
   PGOAnalysisMap P3 = {{},
                        {{{},
                          {{1, BranchProbability::getRaw(0x1111'1111)},
                           {2, BranchProbability::getRaw(0xeeee'eeee)}}},
                         {{}, {{2, BranchProbability::getRaw(0xffff'ffff)}}},
                         {{}, {}}},
-                       {false, false, true, false, false}};
+                       {false, false, true, false, false, false}};
   BBAddrMap E4 = {{{0x44444,
-                    {{0, 0x0, 0x4, {false, false, false, true, true}},
-                     {1, 0x4, 0x4, {false, false, false, false, false}},
-                     {2, 0x8, 0x4, {false, false, false, false, false}},
-                     {3, 0xc, 0x4, {false, false, false, false, false}}}}}};
+                    {{0, 0x0, 0x4, {false, false, false, true, true}, {}},
+                     {1, 0x4, 0x4, {false, false, false, false, false}, {}},
+                     {2, 0x8, 0x4, {false, false, false, false, false}, {}},
+                     {3, 0xc, 0x4, {false, false, false, false, false}, {}}}}}};
   PGOAnalysisMap P4 = {
       1000,
       {{BlockFrequency(1000),
@@ -1170,22 +1194,22 @@ Sections:
          {3, BranchProbability::getRaw(0xeeee'eeee)}}},
        {BlockFrequency(18), {{3, BranchProbability::getRaw(0xffff'ffff)}}},
        {BlockFrequency(1000), {}}},
-      {true, true, true, false, false}};
+      {true, true, true, false, false, false}};
   BBAddrMap E5 = {
-      {{0x55555, {{2, 0x0, 0x2, {false, false, true, false, false}}}}}};
-  PGOAnalysisMap P5 = {{}, {}, {false, false, false, false, false}};
+      {{0x55555, {{2, 0x0, 0x2, {false, false, true, false, false}, {}}}}}};
+  PGOAnalysisMap P5 = {{}, {}, {false, false, false, false, false, false}};
   BBAddrMap E6 = {
       {{0x66666,
-        {{0, 0x0, 0x6, {false, true, true, false, false}},
-         {1, 0x6, 0x6, {false, false, true, false, false}}}},
-       {0x666661, {{2, 0x0, 0x6, {false, false, false, false, false}}}}}};
+        {{0, 0x0, 0x6, {false, true, true, false, false}, {}},
+         {1, 0x6, 0x6, {false, false, true, false, false}, {}}}},
+       {0x666661, {{2, 0x0, 0x6, {false, false, false, false, false}, {}}}}}};
   PGOAnalysisMap P6 = {{},
                        {{{},
                          {{1, BranchProbability::getRaw(0x2222'2222)},
                           {2, BranchProbability::getRaw(0xcccc'cccc)}}},
                         {{}, {{2, BranchProbability::getRaw(0x8888'8888)}}},
                         {{}, {}}},
-                       {false, false, true, true, false}};
+                       {false, false, true, true, false, false}};
 
   std::vector<BBAddrMap> Section0BBAddrMaps = {E4, E5, E6};
   std::vector<BBAddrMap> Section1BBAddrMaps = {E3};
diff --git a/llvm/unittests/Object/ELFTypesTest.cpp b/llvm/unittests/Object/ELFTypesTest.cpp
index 13130dde80ef..f88931b5f544 100644
--- a/llvm/unittests/Object/ELFTypesTest.cpp
+++ b/llvm/unittests/Object/ELFTypesTest.cpp
@@ -101,18 +101,21 @@ static_assert(
     "PGOAnalysisMap should use the same type for basic block ID as BBAddrMap");
 
 TEST(ELFTypesTest, BBAddrMapFeaturesEncodingTest) {
-  const std::array<BBAddrMap::Features, 9> Decoded = {
-      {{false, false, false, false, false},
-       {true, false, false, false, false},
-       {false, true, false, false, false},
-       {false, false, true, false, false},
-       {false, false, false, true, false},
-       {true, true, false, false, false},
-       {false, true, true, false, false},
-       {false, true, true, true, false},
-       {true, true, true, true, false}}};
-  const std::array<uint8_t, 9> Encoded = {
-      {0b0000, 0b0001, 0b0010, 0b0100, 0b1000, 0b0011, 0b0110, 0b1110, 0b1111}};
+  const std::array<BBAddrMap::Features, 11> Decoded = {
+      {{false, false, false, false, false, false},
+       {true, false, false, false, false, false},
+       {false, true, false, false, false, false},
+       {false, false, true, false, false, false},
+       {false, false, false, true, false, false},
+       {true, true, false, false, false, false},
+       {false, true, true, false, false, false},
+       {false, true, true, true, false, false},
+       {true, true, true, true, false, false},
+       {false, false, false, false, true, false},
+       {false, false, false, false, false, true}}};
+  const std::array<uint8_t, 11> Encoded = {{0b0000, 0b0001, 0b0010, 0b0100,
+                                            0b1000, 0b0011, 0b0110, 0b1110,
+                                            0b1111, 0b1'0000, 0b10'0000}};
   for (const auto &[Feat, EncodedVal] : llvm::zip(Decoded, Encoded))
     EXPECT_EQ(Feat.encode(), EncodedVal);
   for (const auto &[Feat, EncodedVal] : llvm::zip(Decoded, Encoded)) {
@@ -125,9 +128,9 @@ TEST(ELFTypesTest, BBAddrMapFeaturesEncodingTest) {
 
 TEST(ELFTypesTest, BBAddrMapFeaturesInvalidEncodingTest) {
   const std::array<std::string, 2> Errors = {
-      "invalid encoding for BBAddrMap::Features: 0x20",
+      "invalid encoding for BBAddrMap::Features: 0x40",
       "invalid encoding for BBAddrMap::Features: 0xf0"};
-  const std::array<uint8_t, 2> Values = {{0b10'0000, 0b1111'0000}};
+  const std::array<uint8_t, 2> Values = {{0b100'0000, 0b1111'0000}};
   for (const auto &[Val, Error] : llvm::zip(Values, Errors)) {
     EXPECT_THAT_ERROR(BBAddrMap::Features::decode(Val).takeError(),
                       FailedWithMessage(Error));

From 634fe0de50418464635227443c1ac866362e2f08 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Mon, 23 Jun 2025 17:25:57 +0100
Subject: [PATCH 1302/1322] [lldb][DWARF] Support retrieving
 DW_FORM_implicit_const value with DWARFDebugInfoEntry::GetAttributeValue
 (#145328)

`DWARFFormValue::ExtractValue` has nothing to extract for
`DW_FORM_implicit_const` since the value is stored in the abbreviation.
`DWARFFormValue` expects the user to have set the value of the
implicit_const. This patch does so in `GetAttributeValue`.
---
 .../SymbolFile/DWARF/DWARFDebugInfoEntry.cpp  |  3 +
 .../SymbolFile/DWARF/DWARFDIETest.cpp         | 58 +++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
index 8217c85f8601..13b68e747b1c 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
@@ -403,6 +403,9 @@ dw_offset_t DWARFDebugInfoEntry::GetAttributeValue(
       const dw_offset_t attr_offset = offset;
       form_value.SetUnit(cu);
       form_value.SetForm(abbrevDecl->getFormByIndex(idx));
+      if (abbrevDecl->getAttrIsImplicitConstByIndex(idx))
+        form_value.SetValue(abbrevDecl->getAttrImplicitConstValueByIndex(idx));
+
       if (form_value.ExtractValue(data, &offset)) {
         if (end_attr_offset_ptr)
           *end_attr_offset_ptr = offset;
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
index 3f61d1607073..0da26d99ad38 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
@@ -395,6 +395,64 @@ DWARF:
               testing::ElementsAre(make_struct("struct_t")));
 }
 
+TEST(DWARFDIETest, GetAttributeValue_ImplicitConst) {
+  // Make sure we can correctly retrieve the value of an attribute
+  // that has a DW_FORM_implicit_const form.
+
+  const char *yamldata = R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_386
+DWARF:
+  debug_str:
+    - ''
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+        - Code:            0x2
+          Tag:             DW_TAG_subprogram
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_string
+            - Attribute:       DW_AT_object_pointer
+              Form:            DW_FORM_implicit_const
+              Value:           5
+  debug_info:
+    - Version:         5
+      UnitType:        DW_UT_compile
+      AddrSize:        8
+      Entries:
+        - AbbrCode:        0x1
+        - AbbrCode:        0x2
+          Values:
+            - Value:           0xDEADBEEFDEADBEEF
+              CStr:            func
+        - AbbrCode:        0x0)";
+
+  YAMLModuleTester t(yamldata);
+  auto *symbol_file =
+      llvm::cast<SymbolFileDWARF>(t.GetModule()->GetSymbolFile());
+  DWARFUnit *unit = symbol_file->DebugInfo().GetUnitAtIndex(0);
+  ASSERT_TRUE(unit);
+
+  DWARFDIE subprogram = unit->DIE().GetFirstChild();
+  ASSERT_TRUE(subprogram);
+  dw_offset_t end_attr_offset;
+  DWARFFormValue form_value;
+  dw_offset_t offset = subprogram.GetDIE()->GetAttributeValue(
+      unit, DW_AT_object_pointer, form_value, &end_attr_offset);
+  EXPECT_EQ(form_value.Unsigned(), 5U);
+  EXPECT_GT(offset, 0U);
+  EXPECT_GT(end_attr_offset, 0U);
+}
+
 struct GetAttributesTestFixture : public testing::TestWithParam<dw_attr_t> {};
 
 TEST_P(GetAttributesTestFixture, TestGetAttributes_IterationOrder) {

From 5a16645a3da29f3c1c2ef4ebc908e57a41bab87b Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Mon, 23 Jun 2025 17:26:23 +0100
Subject: [PATCH 1303/1322] Reland "[lldb][DWARF] Remove object_pointer from
 ParsedDWARFAttributes (#145065)" (#145126)

This reverts commit 877511920dcf36463e06746d626e8876583a6abd.

This fixes the `TestObjCInBlockVars.py` LLDB API test.

The issue was that `GetCXXObjectParameter` wouldn't deduce the object
parameter of Objective-C method definitions correctly. In DWARF those
don't have a `DW_AT_specification` (so no link back to a DeclContext
that is a class type). The fix is to only check the validity of the
DeclContext DIE *if* no `DW_AT_object_pointer` exists on the DIE. If
`DW_AT_object_pointer` does exist, we should just always use that as the
object_parameter.
---
 .../SymbolFile/DWARF/DWARFASTParserClang.cpp  |  35 ++--
 .../SymbolFile/DWARF/DWARFASTParserClang.h    |   7 +-
 .../DWARF/DWARFASTParserClangTests.cpp        | 162 +++++++++++++++++-
 3 files changed, 176 insertions(+), 28 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index a4cb608edd8b..d3912ad55a23 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -167,9 +167,6 @@ DWARFASTParserClang::GetObjectParameter(const DWARFDIE &subprogram,
          subprogram.Tag() == DW_TAG_inlined_subroutine ||
          subprogram.Tag() == DW_TAG_subroutine_type);
 
-  if (!decl_ctx_die.IsStructUnionOrClass())
-    return {};
-
   if (DWARFDIE object_parameter =
           subprogram.GetAttributeValueAsReferenceDIE(DW_AT_object_pointer))
     return object_parameter;
@@ -177,6 +174,10 @@ DWARFASTParserClang::GetObjectParameter(const DWARFDIE &subprogram,
   // If no DW_AT_object_pointer was specified, assume the implicit object
   // parameter is the first parameter to the function, is called "this" and is
   // artificial (which is what most compilers would generate).
+
+  if (!decl_ctx_die.IsStructUnionOrClass())
+    return {};
+
   auto children = subprogram.children();
   auto it = llvm::find_if(children, [](const DWARFDIE &child) {
     return child.Tag() == DW_TAG_formal_parameter;
@@ -441,15 +442,6 @@ ParsedDWARFTypeAttributes::ParsedDWARFTypeAttributes(const DWARFDIE &die) {
       name.SetCString(form_value.AsCString());
       break;
 
-    case DW_AT_object_pointer:
-      // GetAttributes follows DW_AT_specification.
-      // DW_TAG_subprogram definitions and declarations may both
-      // have a DW_AT_object_pointer. Don't overwrite the one
-      // we parsed for the definition with the one from the declaration.
-      if (!object_pointer.IsValid())
-        object_pointer = form_value.Reference();
-      break;
-
     case DW_AT_signature:
       signature = form_value;
       break;
@@ -1112,7 +1104,7 @@ bool DWARFASTParserClang::ParseObjCMethod(
 std::pair<bool, TypeSP> DWARFASTParserClang::ParseCXXMethod(
     const DWARFDIE &die, CompilerType clang_type,
     const ParsedDWARFTypeAttributes &attrs, const DWARFDIE &decl_ctx_die,
-    bool is_static, bool &ignore_containing_context) {
+    const DWARFDIE &object_parameter, bool &ignore_containing_context) {
   Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups);
   SymbolFileDWARF *dwarf = die.GetDWARF();
   assert(dwarf);
@@ -1196,6 +1188,9 @@ std::pair<bool, TypeSP> DWARFASTParserClang::ParseCXXMethod(
       TypeSystemClang::GetDeclContextForType(class_opaque_type), die,
       attrs.name.GetCString());
 
+  // In DWARF, a C++ method is static if it has no object parameter child.
+  const bool is_static = !object_parameter.IsValid();
+
   // We have a C++ member function with no children (this pointer!) and clang
   // will get mad if we try and make a function that isn't well formed in the
   // DWARF, so we will just skip it...
@@ -1221,9 +1216,7 @@ std::pair<bool, TypeSP> DWARFASTParserClang::ParseCXXMethod(
     ClangASTMetadata metadata;
     metadata.SetUserID(die.GetID());
 
-    char const *object_pointer_name =
-        attrs.object_pointer ? attrs.object_pointer.GetName() : nullptr;
-    if (object_pointer_name) {
+    if (char const *object_pointer_name = object_parameter.GetName()) {
       metadata.SetObjectPtrName(object_pointer_name);
       LLDB_LOGF(log, "Setting object pointer name: %s on method object %p.\n",
                 object_pointer_name, static_cast<void *>(cxx_method_decl));
@@ -1319,11 +1312,9 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
         type_handled =
             ParseObjCMethod(*objc_method, die, clang_type, attrs, is_variadic);
       } else if (is_cxx_method) {
-        // In DWARF, a C++ method is static if it has no object parameter child.
-        const bool is_static = !object_parameter.IsValid();
         auto [handled, type_sp] =
-            ParseCXXMethod(die, clang_type, attrs, decl_ctx_die, is_static,
-                           ignore_containing_context);
+            ParseCXXMethod(die, clang_type, attrs, decl_ctx_die,
+                           object_parameter, ignore_containing_context);
         if (type_sp)
           return type_sp;
 
@@ -1418,9 +1409,7 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
           ClangASTMetadata metadata;
           metadata.SetUserID(die.GetID());
 
-          char const *object_pointer_name =
-              attrs.object_pointer ? attrs.object_pointer.GetName() : nullptr;
-          if (object_pointer_name) {
+          if (char const *object_pointer_name = object_parameter.GetName()) {
             metadata.SetObjectPtrName(object_pointer_name);
             LLDB_LOGF(log,
                       "Setting object pointer name: %s on function "
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index e57fc503d34c..da58f4c14622 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -480,7 +480,8 @@ private:
   /// \param[in] decl_ctx_die The DIE representing the DeclContext of the C++
   ///                         method being parsed.
   ///
-  /// \param[in] is_static Is true iff we're parsing a static method.
+  /// \param[in] object_parameter The DIE of this subprogram's object parameter.
+  ///                             May be an invalid DIE for C++ static methods.
   ///
   /// \param[out] ignore_containing_context Will get set to true if the caller
   ///             should treat this C++ method as-if it was not a C++ method.
@@ -495,7 +496,8 @@ private:
                  lldb_private::CompilerType clang_type,
                  const ParsedDWARFTypeAttributes &attrs,
                  const lldb_private::plugin::dwarf::DWARFDIE &decl_ctx_die,
-                 bool is_static, bool &ignore_containing_context);
+                 const lldb_private::plugin::dwarf::DWARFDIE &object_parameter,
+                 bool &ignore_containing_context);
 
   lldb::TypeSP ParseArrayType(const lldb_private::plugin::dwarf::DWARFDIE &die,
                               const ParsedDWARFTypeAttributes &attrs);
@@ -565,7 +567,6 @@ struct ParsedDWARFTypeAttributes {
   const char *mangled_name = nullptr;
   lldb_private::ConstString name;
   lldb_private::Declaration decl;
-  lldb_private::plugin::dwarf::DWARFDIE object_pointer;
   lldb_private::plugin::dwarf::DWARFFormValue abstract_origin;
   lldb_private::plugin::dwarf::DWARFFormValue containing_type;
   lldb_private::plugin::dwarf::DWARFFormValue signature;
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
index f18e938dbc4c..fa05cd174fc7 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
@@ -742,8 +742,8 @@ DWARF:
   ASSERT_EQ(type_sp, reparsed_type_sp);
 }
 
-TEST_F(DWARFASTParserClangTests, TestParseDWARFAttributes_ObjectPointer) {
-  // This tests the behaviour of ParsedDWARFTypeAttributes
+TEST_F(DWARFASTParserClangTests, TestObjectPointer) {
+  // This tests the behaviour of DWARFASTParserClang
   // for DW_TAG_subprogram definitions which have a DW_AT_object_pointer
   // *and* a DW_AT_specification that also has a DW_AT_object_pointer.
   // We don't want the declaration DW_AT_object_pointer to overwrite the
@@ -916,6 +916,164 @@ DWARF:
   }
 }
 
+TEST_F(DWARFASTParserClangTests,
+       TestObjectPointer_NoSpecificationOnDefinition) {
+  // This tests the behaviour of DWARFASTParserClang
+  // for DW_TAG_subprogram definitions which have a DW_AT_object_pointer
+  // but no DW_AT_specification that would link back to its declaration.
+  // This is how Objective-C class method definitions are emitted.
+
+  const char *yamldata = R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_AARCH64
+DWARF:
+  debug_str:
+    - Context
+    - func
+    - this
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_data2
+        - Code:            0x2
+          Tag:             DW_TAG_structure_type
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+        - Code:            0x3
+          Tag:             DW_TAG_subprogram
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_declaration
+              Form:            DW_FORM_flag_present
+            - Attribute:       DW_AT_object_pointer
+              Form:            DW_FORM_ref4
+            - Attribute:       DW_AT_artificial
+              Form:            DW_FORM_flag_present
+            - Attribute:       DW_AT_external
+              Form:            DW_FORM_flag_present
+        - Code:            0x4
+          Tag:             DW_TAG_formal_parameter
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_artificial
+              Form:            DW_FORM_flag_present
+        - Code:            0x5
+          Tag:             DW_TAG_subprogram
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_object_pointer
+              Form:            DW_FORM_ref4
+        - Code:            0x6
+          Tag:             DW_TAG_formal_parameter
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_artificial
+              Form:            DW_FORM_flag_present
+  debug_info:
+     - Version:         5
+       UnitType:        DW_UT_compile
+       AddrSize:        8
+       Entries:
+
+# DW_TAG_compile_unit
+#   DW_AT_language [DW_FORM_data2]    (DW_LANG_C_plus_plus)
+
+        - AbbrCode:        0x1
+          Values:
+            - Value:           0x04
+
+#   DW_TAG_structure_type
+#     DW_AT_name [DW_FORM_strp] ("Context")
+
+        - AbbrCode:        0x2
+          Values:
+            - Value:           0x0
+
+#     DW_TAG_subprogram
+#       DW_AT_name [DW_FORM_strp] ("func")
+#       DW_AT_object_pointer [DW_FORM_ref4]
+        - AbbrCode:        0x3
+          Values:
+            - Value:           0x8
+            - Value:           0x1
+            - Value:           0x1d
+            - Value:           0x1
+            - Value:           0x1
+
+#       DW_TAG_formal_parameter
+#         DW_AT_artificial
+        - AbbrCode:        0x4
+          Values:
+          - Value: 0x1
+
+        - AbbrCode: 0x0
+        - AbbrCode: 0x0
+
+#     DW_TAG_subprogram
+#       DW_AT_object_pointer [DW_FORM_ref4] ("this")
+        - AbbrCode:        0x5
+          Values:
+            - Value:           0x25
+
+#       DW_TAG_formal_parameter
+#         DW_AT_name [DW_FORM_strp] ("this")
+#         DW_AT_artificial
+        - AbbrCode:        0x6
+          Values:
+            - Value:           0xd
+            - Value:           0x1
+
+        - AbbrCode: 0x0
+        - AbbrCode: 0x0
+...
+)";
+  YAMLModuleTester t(yamldata);
+
+  DWARFUnit *unit = t.GetDwarfUnit();
+  ASSERT_NE(unit, nullptr);
+  const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE();
+  ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit);
+  ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus);
+  DWARFDIE cu_die(unit, cu_entry);
+
+  auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("ast");
+  auto &ast_ctx = *holder->GetAST();
+  DWARFASTParserClangStub ast_parser(ast_ctx);
+
+  auto context_die = cu_die.GetFirstChild();
+  ASSERT_TRUE(context_die.IsValid());
+  ASSERT_EQ(context_die.Tag(), DW_TAG_structure_type);
+
+  auto subprogram_definition = context_die.GetSibling();
+  ASSERT_TRUE(subprogram_definition.IsValid());
+  ASSERT_EQ(subprogram_definition.Tag(), DW_TAG_subprogram);
+  ASSERT_FALSE(subprogram_definition.GetAttributeValueAsOptionalUnsigned(
+      DW_AT_external));
+  ASSERT_FALSE(
+      subprogram_definition.GetAttributeValueAsReferenceDIE(DW_AT_specification)
+          .IsValid());
+
+  auto param_die = subprogram_definition.GetFirstChild();
+  ASSERT_TRUE(param_die.IsValid());
+  EXPECT_EQ(param_die,
+            ast_parser.GetObjectParameter(subprogram_definition, {}));
+}
+
 TEST_F(DWARFASTParserClangTests, TestParseSubroutine_ExplicitObjectParameter) {
   // Tests parsing of a C++ non-static member function with an explicit object
   // parameter that isn't called "this" and is not a pointer (but a CV-qualified

From 10b61fea8a1a458b6b9e4a4b375883d21f096468 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Mon, 23 Jun 2025 09:33:34 -0700
Subject: [PATCH 1304/1322] [llvm] include GenericLoopInfoImpl for full
 implementation (#144621)

MSVC issues a warning when a an `extern` template instantiation is
annotated for DLL export but it does not have the complete template
definition. Because the full implementation of `LoopBase` is in
`GenericLoopInfoImpl.h` rather than `GenericLoopInfo.h`, MSVC complains
whenever `LoopInfo.h` is included.
```
S:\llvm\llvm-project\llvm\include\llvm/Support/GenericLoopInfo.h(342): warning C4661: 'BlockT *llvm::LoopBase<BlockT,llvm::Loop>::getLoopLatch(void) const': no suitable definition provided for explicit template instantiation request
        with
        [
            BlockT=llvm::BasicBlock
        ]
S:\llvm\llvm-project\llvm\include\llvm/Support/GenericLoopInfo.h(326): note: see declaration of 'llvm::LoopBase<llvm::BasicBlock,llvm::Loop>::getLoopLatch'
```
Everything links fine but the warning is very noisy when building LLVM
as a Windows DLL. Interestingly, `clang-cl` does not warn here and is
fine with the code as-is.
---
 llvm/include/llvm/Analysis/LoopInfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h
index 072ddad546bf..a7a6a2753709 100644
--- a/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/llvm/include/llvm/Analysis/LoopInfo.h
@@ -18,7 +18,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Compiler.h"
-#include "llvm/Support/GenericLoopInfo.h"
+#include "llvm/Support/GenericLoopInfoImpl.h"
 #include <optional>
 #include <utility>
 

From c0ce9adf603c2a49f685b1c1846a01e3253adb29 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Mon, 23 Jun 2025 09:34:14 -0700
Subject: [PATCH 1305/1322] [llvm] remove incorrect `LLVM_ABI` annotation usage
 (#144606)

## Overview
This patch removes incorrect usage of `LLVM_ABI` macros that break the
(currently incomplete) LLVM Windows DLL build.
- ec71d80 added `LLVM_ABI` to template methods fully declared in a
header file
- 254a92d added `LLVM_ABI` to members of a class already annotated with
`LLVM_ABI`

# Background
Documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).
---
 llvm/include/llvm/CodeGen/MachineScheduler.h | 4 ++--
 llvm/include/llvm/MC/MCExpr.h                | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 486508e760dc..e7a7091acee6 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1403,7 +1403,7 @@ createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
 /// default scheduler if the target does not set a default.
 /// Adds default DAG mutations.
 template <typename Strategy = GenericScheduler>
-LLVM_ABI ScheduleDAGMILive *createSchedLive(MachineSchedContext *C) {
+ScheduleDAGMILive *createSchedLive(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
       new ScheduleDAGMILive(C, std::make_unique<Strategy>(C));
   // Register DAG post-processors.
@@ -1423,7 +1423,7 @@ LLVM_ABI ScheduleDAGMILive *createSchedLive(MachineSchedContext *C) {
 
 /// Create a generic scheduler with no vreg liveness or DAG mutation passes.
 template <typename Strategy = PostGenericScheduler>
-LLVM_ABI ScheduleDAGMI *createSchedPostRA(MachineSchedContext *C) {
+ScheduleDAGMI *createSchedPostRA(MachineSchedContext *C) {
   ScheduleDAGMI *DAG = new ScheduleDAGMI(C, std::make_unique<Strategy>(C),
                                          /*RemoveKillFlags=*/true);
   const TargetSubtargetInfo &STI = C->MF->getSubtarget();
diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index 4ec780d8ff94..8d66ed22875d 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -514,10 +514,10 @@ protected:
       : MCExpr(Specifier, Loc), Expr(Expr), specifier(S) {}
 
 public:
-  LLVM_ABI static const MCSpecifierExpr *
-  create(const MCExpr *Expr, Spec S, MCContext &Ctx, SMLoc Loc = SMLoc());
-  LLVM_ABI static const MCSpecifierExpr *
-  create(const MCSymbol *Sym, Spec S, MCContext &Ctx, SMLoc Loc = SMLoc());
+  static const MCSpecifierExpr *create(const MCExpr *Expr, Spec S,
+                                       MCContext &Ctx, SMLoc Loc = SMLoc());
+  static const MCSpecifierExpr *create(const MCSymbol *Sym, Spec S,
+                                       MCContext &Ctx, SMLoc Loc = SMLoc());
 
   Spec getSpecifier() const { return specifier; }
   const MCExpr *getSubExpr() const { return Expr; }

From 909cbcf988fe7454660ce316ec1102a5591606a8 Mon Sep 17 00:00:00 2001
From: DrSergei <serzhdruzhok@gmail.com>
Date: Mon, 23 Jun 2025 19:34:22 +0300
Subject: [PATCH 1306/1322] [lldb-dap] Fix URL label and error code in DAPError
 (#145010)

This patch fixes some problems in DAPerror class (unnecessary copy in
ctor and typo in getUrlLabel function). During patch testing I found
flaky test TestDAP_server.test_server_interrupt (at least 1 fail on 50
runs). Looks like this problem is caused by data race between main and
event handler threads. Terminated event can be sent from Disconnect
function and event handler. However, only event handler sends exit
event. Also, after disconnecting, socket will be closed, so sometimes
sending event fails with "write failed: transport IO object invalid"
error. I tried to fix this problem by adding a wait for events thread
after disconnecting.

Failed log example:
```log
1750366596.399511337 lldb-dap server shutdown requested, disconnecting remaining clients...
1750366596.406297684 (client_0) <-- {"body":{"$__lldb_statistics":{"commands":"{}","memory":"{\"strings\":{\"bytesTotal\":2949120,\"bytesUnused\":1825545,\"bytesUsed\":1123575}}","plugins":"{\"abi\":[{\"enabled\":true,\"name\":\"SysV-arm64\"},{\"enabled\":true,\"name\":\"ABIMacOSX_arm64\"},{\"enabled\":true,\"name\":\"SysV-arm\"},{\"enabled\":true,\"name\":\"macosx-arm\"},{\"enabled\":true,\"name\":\"sysv-hexagon\"},{\"enabled\":true,\"name\":\"sysv-loongarch\"},{\"enabled\":true,\"name\":\"sysv-mips\"},{\"enabled\":true,\"name\":\"sysv-mips64\"},{\"enabled\":true,\"name\":\"sysv-msp430\"},{\"enabled\":true,\"name\":\"sysv-ppc\"},{\"enabled\":true,\"name\":\"sysv-ppc64\"},{\"enabled\":true,\"name\":\"sysv-riscv\"},{\"enabled\":true,\"name\":\"sysv-s390x\"},{\"enabled\":true,\"name\":\"abi.macosx-i386\"},{\"enabled\":true,\"name\":\"sysv-i386\"},{\"enabled\":true,\"name\":\"sysv-x86_64\"},{\"enabled\":true,\"name\":\"windows-x86_64\"}],\"architecture\":[{\"enabled\":true,\"name\":\"arm\"},{\"enabled\":true,\"name\":\"mips\"},{\"enabled\":true,\"name\":\"ppc64\"},{\"enabled\":true,\"name\":\"aarch64\"}],\"disassembler\":[{\"enabled\":true,\"name\":\"llvm-mc\"}],\"dynamic-loader\":[{\"enabled\":true,\"name\":\"darwin-kernel\"},{\"enabled\":true,\"name\":\"freebsd-kernel\"},{\"enabled\":true,\"name\":\"macosx-dyld\"},{\"enabled\":true,\"name\":\"macos-dyld\"},{\"enabled\":true,\"name\":\"posix-dyld\"},{\"enabled\":true,\"name\":\"static\"},{\"enabled\":true,\"name\":\"hexagon-dyld\"},{\"enabled\":true,\"name\":\"windows-dyld\"},{\"enabled\":true,\"name\":\"wasm-dyld\"}],\"emulate-instruction\":[{\"enabled\":true,\"name\":\"arm\"},{\"enabled\":true,\"name\":\"arm64\"},{\"enabled\":true,\"name\":\"LoongArch\"},{\"enabled\":true,\"name\":\"mips32\"},{\"enabled\":true,\"name\":\"mips64\"},{\"enabled\":true,\"name\":\"ppc64\"},{\"enabled\":true,\"name\":\"riscv\"}],\"instrumentation-runtime\":[{\"enabled\":true,\"name\":\"AddressSanitizer\"},{\"enabled\":true,\"name\":\"Libsanitizers-ASan\"},{\"enabled\":true,\"name\":\"MainThreadChecker\"},{\"enabled\":true,\"name\":\"ThreadSanitizer\"},{\"enabled\":true,\"name\":\"UndefinedBehaviorSanitizer\"}],\"jit-loader\":[{\"enabled\":true,\"name\":\"gdb\"}],\"language\":[{\"enabled\":true,\"name\":\"cplusplus\"},{\"enabled\":true,\"name\":\"objc\"},{\"enabled\":true,\"name\":\"objcplusplus\"}],\"language-runtime\":[{\"enabled\":true,\"name\":\"itanium\"},{\"enabled\":true,\"name\":\"apple-objc-v2\"},{\"enabled\":true,\"name\":\"apple-objc-v1\"},{\"enabled\":true,\"name\":\"gnustep-objc-libobjc2\"}],\"memory-history\":[{\"enabled\":true,\"name\":\"asan\"}],\"object-container\":[{\"enabled\":true,\"name\":\"bsd-archive\"},{\"enabled\":true,\"name\":\"mach-o\"},{\"enabled\":true,\"name\":\"mach-o-fileset\"}],\"object-file\":[{\"enabled\":true,\"name\":\"breakpad\"},{\"enabled\":true,\"name\":\"COFF\"},{\"enabled\":true,\"name\":\"elf\"},{\"enabled\":true,\"name\":\"JSON\"},{\"enabled\":true,\"name\":\"mach-o\"},{\"enabled\":true,\"name\":\"minidump\"},{\"enabled\":true,\"name\":\"pdb\"},{\"enabled\":true,\"name\":\"pe-coff\"},{\"enabled\":true,\"name\":\"xcoff\"},{\"enabled\":true,\"name\":\"wasm\"}],\"operating-system\":[{\"enabled\":true,\"name\":\"python\"}],\"platform\":[{\"enabled\":true,\"name\":\"remote-AIX\"},{\"enabled\":true,\"name\":\"remote-linux\"},{\"enabled\":true,\"name\":\"remote-android\"},{\"enabled\":true,\"name\":\"remote-freebsd\"},{\"enabled\":true,\"name\":\"remote-gdb-server\"},{\"enabled\":true,\"name\":\"darwin\"},{\"enabled\":true,\"name\":\"remote-ios\"},{\"enabled\":true,\"name\":\"remote-macosx\"},{\"enabled\":true,\"name\":\"host\"},{\"enabled\":true,\"name\":\"remote-netbsd\"},{\"enabled\":true,\"name\":\"remote-openbsd\"},{\"enabled\":true,\"name\":\"qemu-user\"},{\"enabled\":true,\"name\":\"remote-windows\"}],\"process\":[{\"enabled\":true,\"name\":\"ScriptedProcess\"},{\"enabled\":true,\"name\":\"elf-core\"},{\"enabled\":true,\"name\":\"mach-o-core\"},{\"enabled\":true,\"name\":\"minidump\"},{\"enabled\":true,\"name\":\"gdb-remote\"}],\"register-type-builder\":[{\"enabled\":true,\"name\":\"register-types-clang\"}],\"repl\":[{\"enabled\":true,\"name\":\"ClangREPL\"}],\"script-interpreter\":[{\"enabled\":true,\"name\":\"script-none\"},{\"enabled\":true,\"name\":\"script-python\"}],\"scripted-interface\":[{\"enabled\":true,\"name\":\"OperatingSystemPythonInterface\"},{\"enabled\":true,\"name\":\"ScriptedPlatformPythonInterface\"},{\"enabled\":true,\"name\":\"ScriptedProcessPythonInterface\"},{\"enabled\":true,\"name\":\"ScriptedStopHookPythonInterface\"},{\"enabled\":true,\"name\":\"ScriptedThreadPlanPythonInterface\"}],\"structured-data\":[{\"enabled\":true,\"name\":\"darwin-log\"}],\"symbol-file\":[{\"enabled\":true,\"name\":\"breakpad\"},{\"enabled\":true,\"name\":\"CTF\"},{\"enabled\":true,\"name\":\"dwarf\"},{\"enabled\":true,\"name\":\"dwarf-debugmap\"},{\"enabled\":true,\"name\":\"JSON\"},{\"enabled\":true,\"name\":\"native-pdb\"},{\"enabled\":true,\"name\":\"symtab\"}],\"symbol-locator\":[{\"enabled\":true,\"name\":\"debuginfod\"},{\"enabled\":true,\"name\":\"Default\"}],\"symbol-vendor\":[{\"enabled\":true,\"name\":\"ELF\"},{\"enabled\":true,\"name\":\"PE-COFF\"},{\"enabled\":true,\"name\":\"WASM\"}],\"system-runtime\":[{\"enabled\":true,\"name\":\"systemruntime-macosx\"}],\"trace-exporter\":[{\"enabled\":true,\"name\":\"ctf\"}],\"type-system\":[{\"enabled\":true,\"name\":\"clang\"}],\"unwind-assembly\":[{\"enabled\":true,\"name\":\"inst-emulation\"},{\"enabled\":true,\"name\":\"x86\"}]}","targets":"[{\"breakpoints\":[{\"details\":{\"Breakpoint\":{\"BKPTOptions\":{\"AutoContinue\":false,\"ConditionText\":\"\",\"EnabledState\":true,\"IgnoreCount\":0,\"OneShotState\":false},\"BKPTResolver\":{\"Options\":{\"Column\":0,\"Exact\":false,\"FileName\":\"/home/sergei/llvm-project/lldb/test/API/tools/lldb-dap/server/main.c\",\"Inlines\":true,\"LineNumber\":4,\"Offset\":0,\"SkipPrologue\":true},\"Type\":\"FileAndLine\"},\"Hardware\":false,\"Names\":[\"dap\"],\"SearchFilter\":{\"Options\":{},\"Type\":\"Unconstrained\"}}},\"hitCount\":1,\"id\":1,\"internal\":false,\"numLocations\":1,\"numResolvedLocations\":1,\"resolveTime\":0.064788999999999999},{\"details\":{\"Breakpoint\":{\"BKPTOptions\":{\"AutoContinue\":false,\"ConditionText\":\"\",\"EnabledState\":true,\"IgnoreCount\":0,\"OneShotState\":false},\"BKPTResolver\":{\"Options\":{\"Language\":\"c\",\"NameMask\":[4,4,4,4,4,4],\"Offset\":0,\"SkipPrologue\":false,\"SymbolNames\":[\"_dl_debug_state\",\"rtld_db_dlactivity\",\"__dl_rtld_db_dlactivity\",\"r_debug_state\",\"_r_debug_state\",\"_rtld_debug_state\"]},\"Type\":\"SymbolName\"},\"Hardware\":false,\"SearchFilter\":{\"Options\":{\"ModuleList\":[\"/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2\"]},\"Type\":\"Modules\"}}},\"hitCount\":2,\"id\":-1,\"internal\":true,\"kindDescription\":\"shared-library-event\",\"numLocations\":1,\"numResolvedLocations\":1,\"resolveTime\":0.00027700000000000001}],\"dyldPluginName\":\"posix-dyld\",\"expressionEvaluation\":{\"failures\":0,\"successes\":0},\"firstStopTime\":0.081890439999999995,\"frameVariable\":{\"failures\":0,\"successes\":0},\"launchOrAttachTime\":0.044839855999999997,\"moduleIdentifiers\":[140296113373488,140296115439216,140296116813568,140295710837504,140295643597696,140294502747184,140294972632368],\"signals\":[{\"SIGSTOP\":1}],\"sourceMapDeduceCount\":0,\"sourceRealpathAttemptCount\":0,\"sourceRealpathCompatibleCount\":0,\"stopCount\":7,\"summaryProviderStatistics\":[],\"targetCreateTime\":0.000736,\"totalBreakpointResolveTime\":0.065065999999999999,\"totalSharedLibraryEventHitCount\":2}]","totalDebugInfoByteSize":5107143,"totalDebugInfoEnabled":4,"totalDebugInfoIndexLoadedFromCache":0,"totalDebugInfoIndexSavedToCache":0,"totalDebugInfoIndexTime":0.183807,"totalDebugInfoParseTime":1.2240820000000001,"totalModuleCount":7,"totalModuleCountHasDebugInfo":4,"totalModuleCountWithIncompleteTypes":0,"totalModuleCountWithVariableErrors":0,"totalSymbolLocatorTime":"{\"Default\":0.0054260000000000003,\"debuginfod\":6.999999999999999e-06}","totalSymbolTableIndexTime":0.014254000000000001,"totalSymbolTableParseTime":0.099803000000000003,"totalSymbolTableStripped":0,"totalSymbolTableSymbolCount":23098,"totalSymbolTablesLoaded":7,"totalSymbolTablesLoadedFromCache":0,"totalSymbolTablesSavedToCache":0}},"event":"terminated","seq":0,"type":"event"}
1750366596.406688452 (client_0) write failed: transport IO object invalid
1750366596.406724215 (client_0) write failed: transport IO object invalid
1750366596.842197657 (client_0) client disconnected
```
---
 lldb/tools/lldb-dap/DAPError.cpp    |  8 +++----
 lldb/tools/lldb-dap/DAPError.h      |  2 +-
 lldb/unittests/DAP/CMakeLists.txt   |  1 +
 lldb/unittests/DAP/DAPErrorTest.cpp | 37 +++++++++++++++++++++++++++++
 4 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 lldb/unittests/DAP/DAPErrorTest.cpp

diff --git a/lldb/tools/lldb-dap/DAPError.cpp b/lldb/tools/lldb-dap/DAPError.cpp
index 60347d577f82..5c5bae37cc60 100644
--- a/lldb/tools/lldb-dap/DAPError.cpp
+++ b/lldb/tools/lldb-dap/DAPError.cpp
@@ -18,14 +18,12 @@ char DAPError::ID;
 DAPError::DAPError(std::string message, std::error_code EC, bool show_user,
                    std::optional<std::string> url,
                    std::optional<std::string> url_label)
-    : m_message(message), m_ec(EC), m_show_user(show_user), m_url(url),
-      m_url_label(url_label) {}
+    : m_message(std::move(message)), m_ec(EC), m_show_user(show_user),
+      m_url(std::move(url)), m_url_label(std::move(url_label)) {}
 
 void DAPError::log(llvm::raw_ostream &OS) const { OS << m_message; }
 
-std::error_code DAPError::convertToErrorCode() const {
-  return llvm::inconvertibleErrorCode();
-}
+std::error_code DAPError::convertToErrorCode() const { return m_ec; }
 
 char NotStoppedError::ID;
 
diff --git a/lldb/tools/lldb-dap/DAPError.h b/lldb/tools/lldb-dap/DAPError.h
index 4c94bdd6ac3d..e18614fe7193 100644
--- a/lldb/tools/lldb-dap/DAPError.h
+++ b/lldb/tools/lldb-dap/DAPError.h
@@ -30,7 +30,7 @@ public:
   const std::string &getMessage() const { return m_message; }
   bool getShowUser() const { return m_show_user; }
   const std::optional<std::string> &getURL() const { return m_url; }
-  const std::optional<std::string> &getURLLabel() const { return m_url; }
+  const std::optional<std::string> &getURLLabel() const { return m_url_label; }
 
 private:
   std::string m_message;
diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt
index ee623d341ec6..d5824f4b38a5 100644
--- a/lldb/unittests/DAP/CMakeLists.txt
+++ b/lldb/unittests/DAP/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_lldb_unittest(DAPTests
+  DAPErrorTest.cpp
   DAPTest.cpp
   FifoFilesTest.cpp
   Handler/DisconnectTest.cpp
diff --git a/lldb/unittests/DAP/DAPErrorTest.cpp b/lldb/unittests/DAP/DAPErrorTest.cpp
new file mode 100644
index 000000000000..51138576458d
--- /dev/null
+++ b/lldb/unittests/DAP/DAPErrorTest.cpp
@@ -0,0 +1,37 @@
+//===-- DAPErrorTest.cpp---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "DAPError.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <string>
+#include <system_error>
+
+using namespace lldb_dap;
+using namespace llvm;
+
+TEST(DAPErrorTest, DefaultConstructor) {
+  DAPError error("Invalid thread");
+
+  EXPECT_EQ(error.getMessage(), "Invalid thread");
+  EXPECT_EQ(error.convertToErrorCode(), llvm::inconvertibleErrorCode());
+  EXPECT_TRUE(error.getShowUser());
+  EXPECT_EQ(error.getURL(), std::nullopt);
+  EXPECT_EQ(error.getURLLabel(), std::nullopt);
+}
+
+TEST(DAPErrorTest, FullConstructor) {
+  auto timed_out = std::make_error_code(std::errc::timed_out);
+  DAPError error("Timed out", timed_out, false, "URL", "URLLabel");
+
+  EXPECT_EQ(error.getMessage(), "Timed out");
+  EXPECT_EQ(error.convertToErrorCode(), timed_out);
+  EXPECT_FALSE(error.getShowUser());
+  EXPECT_THAT(error.getURL(), testing::Optional<std::string>("URL"));
+  EXPECT_THAT(error.getURLLabel(), testing::Optional<std::string>("URLLabel"));
+}

From c594f6e697435fbb2458104c51e1bbbfd9df6689 Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002@gmail.com>
Date: Mon, 23 Jun 2025 19:39:01 +0300
Subject: [PATCH 1307/1322] Revert "[clang-tidy] Add new check
 `readability-use-numeric-limits`" (#145355)

Reverts llvm/llvm-project#127430 due to stable asan buildbot failures:
https://lab.llvm.org/buildbot/#/builders/169
---
 .../clang-tidy/readability/CMakeLists.txt     |   1 -
 .../readability/ReadabilityTidyModule.cpp     |   3 -
 .../readability/UseNumericLimitsCheck.cpp     | 160 ------------------
 .../readability/UseNumericLimitsCheck.h       |  38 -----
 clang-tools-extra/docs/ReleaseNotes.rst       |   6 -
 .../docs/clang-tidy/checks/list.rst           |   1 -
 .../checks/readability/use-numeric-limits.rst |  31 ----
 .../readability/use-numeric-limits.cpp        | 100 -----------
 8 files changed, 340 deletions(-)
 delete mode 100644 clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.cpp
 delete mode 100644 clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.h
 delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/readability/use-numeric-limits.rst
 delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/use-numeric-limits.cpp

diff --git a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
index 2c40a863c5b7..4be1a8f83133 100644
--- a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
@@ -58,7 +58,6 @@ add_clang_library(clangTidyReadabilityModule STATIC
   UniqueptrDeleteReleaseCheck.cpp
   UppercaseLiteralSuffixCheck.cpp
   UseAnyOfAllOfCheck.cpp
-  UseNumericLimitsCheck.cpp
   UseStdMinMaxCheck.cpp
 
   LINK_LIBS
diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
index dc47c2fb3193..d59b0312673b 100644
--- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
@@ -61,7 +61,6 @@
 #include "UniqueptrDeleteReleaseCheck.h"
 #include "UppercaseLiteralSuffixCheck.h"
 #include "UseAnyOfAllOfCheck.h"
-#include "UseNumericLimitsCheck.h"
 #include "UseStdMinMaxCheck.h"
 
 namespace clang::tidy {
@@ -174,8 +173,6 @@ public:
         "readability-uppercase-literal-suffix");
     CheckFactories.registerCheck<UseAnyOfAllOfCheck>(
         "readability-use-anyofallof");
-    CheckFactories.registerCheck<UseNumericLimitsCheck>(
-        "readability-use-numeric-limits");
     CheckFactories.registerCheck<UseStdMinMaxCheck>(
         "readability-use-std-min-max");
   }
diff --git a/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.cpp b/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.cpp
deleted file mode 100644
index 334b69755db2..000000000000
--- a/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-//===--- UseNumericLimitsCheck.cpp - clang-tidy ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "UseNumericLimitsCheck.h"
-#include "clang/AST/ASTContext.h"
-#include "clang/ASTMatchers/ASTMatchFinder.h"
-#include "clang/Lex/Preprocessor.h"
-#include <cmath>
-#include <limits>
-
-using namespace clang::ast_matchers;
-
-namespace clang::tidy::readability {
-
-UseNumericLimitsCheck::UseNumericLimitsCheck(StringRef Name,
-                                             ClangTidyContext *Context)
-    : ClangTidyCheck(Name, Context),
-      SignedConstants{
-          {std::numeric_limits<int8_t>::min(),
-           "std::numeric_limits<int8_t>::min()"},
-          {std::numeric_limits<int8_t>::max(),
-           "std::numeric_limits<int8_t>::max()"},
-          {std::numeric_limits<int16_t>::min(),
-           "std::numeric_limits<int16_t>::min()"},
-          {std::numeric_limits<int16_t>::max(),
-           "std::numeric_limits<int16_t>::max()"},
-          {std::numeric_limits<int32_t>::min(),
-           "std::numeric_limits<int32_t>::min()"},
-          {std::numeric_limits<int32_t>::max(),
-           "std::numeric_limits<int32_t>::max()"},
-          {std::numeric_limits<int64_t>::min(),
-           "std::numeric_limits<int64_t>::min()"},
-          {std::numeric_limits<int64_t>::max(),
-           "std::numeric_limits<int64_t>::max()"},
-      },
-      UnsignedConstants{
-          {std::numeric_limits<uint8_t>::max(),
-           "std::numeric_limits<uint8_t>::max()"},
-          {std::numeric_limits<uint16_t>::max(),
-           "std::numeric_limits<uint16_t>::max()"},
-          {std::numeric_limits<uint32_t>::max(),
-           "std::numeric_limits<uint32_t>::max()"},
-          {std::numeric_limits<uint64_t>::max(),
-           "std::numeric_limits<uint64_t>::max()"},
-      },
-      Inserter(Options.getLocalOrGlobal("IncludeStyle",
-                                        utils::IncludeSorter::IS_LLVM),
-               areDiagsSelfContained()) {}
-
-void UseNumericLimitsCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
-  Options.store(Opts, "IncludeStyle", Inserter.getStyle());
-}
-
-void UseNumericLimitsCheck::registerMatchers(MatchFinder *Finder) {
-  auto PositiveIntegerMatcher = [](auto Value) {
-    return unaryOperator(hasOperatorName("+"),
-                         hasUnaryOperand(integerLiteral(equals(Value))
-                                             .bind("positive-integer-literal")))
-        .bind("unary-op");
-  };
-
-  auto NegativeIntegerMatcher = [](auto Value) {
-    return unaryOperator(hasOperatorName("-"),
-                         hasUnaryOperand(integerLiteral(equals(-Value))
-                                             .bind("negative-integer-literal")))
-        .bind("unary-op");
-  };
-
-  auto BareIntegerMatcher = [](auto Value) {
-    return integerLiteral(allOf(unless(hasParent(unaryOperator(
-                                    hasAnyOperatorName("-", "+")))),
-                                equals(Value)))
-        .bind("bare-integer-literal");
-  };
-
-  for (const auto &[Value, _] : SignedConstants) {
-    if (Value < 0) {
-      Finder->addMatcher(NegativeIntegerMatcher(Value), this);
-    } else {
-      Finder->addMatcher(
-          expr(anyOf(PositiveIntegerMatcher(Value), BareIntegerMatcher(Value))),
-          this);
-    }
-  }
-
-  for (const auto &[Value, _] : UnsignedConstants) {
-    Finder->addMatcher(
-        expr(anyOf(PositiveIntegerMatcher(Value), BareIntegerMatcher(Value))),
-        this);
-  }
-}
-
-void UseNumericLimitsCheck::registerPPCallbacks(
-    const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
-  Inserter.registerPreprocessor(PP);
-}
-
-void UseNumericLimitsCheck::check(const MatchFinder::MatchResult &Result) {
-  const IntegerLiteral *MatchedDecl = nullptr;
-
-  const IntegerLiteral *NegativeMatchedDecl =
-      Result.Nodes.getNodeAs<IntegerLiteral>("negative-integer-literal");
-  const IntegerLiteral *PositiveMatchedDecl =
-      Result.Nodes.getNodeAs<IntegerLiteral>("positive-integer-literal");
-  const IntegerLiteral *BareMatchedDecl =
-      Result.Nodes.getNodeAs<IntegerLiteral>("bare-integer-literal");
-
-  if (NegativeMatchedDecl != nullptr)
-    MatchedDecl = NegativeMatchedDecl;
-  else if (PositiveMatchedDecl != nullptr)
-    MatchedDecl = PositiveMatchedDecl;
-  else if (BareMatchedDecl != nullptr)
-    MatchedDecl = BareMatchedDecl;
-
-  const llvm::APInt MatchedIntegerConstant = MatchedDecl->getValue();
-
-  auto Fixer = [&](auto SourceValue, auto Value,
-                   const std::string &Replacement) {
-    static_assert(std::is_same_v<decltype(SourceValue), decltype(Value)>,
-                  "The types of SourceValue and Value must match");
-
-    SourceLocation Location = MatchedDecl->getExprLoc();
-    SourceRange Range{MatchedDecl->getBeginLoc(), MatchedDecl->getEndLoc()};
-
-    // Only valid if unary operator is present
-    const UnaryOperator *UnaryOpExpr =
-        Result.Nodes.getNodeAs<UnaryOperator>("unary-op");
-
-    if (MatchedDecl == NegativeMatchedDecl && -SourceValue == Value) {
-      Range = SourceRange(UnaryOpExpr->getBeginLoc(), UnaryOpExpr->getEndLoc());
-      Location = UnaryOpExpr->getExprLoc();
-      SourceValue = -SourceValue;
-    } else if (MatchedDecl == PositiveMatchedDecl && SourceValue == Value) {
-      Range = SourceRange(UnaryOpExpr->getBeginLoc(), UnaryOpExpr->getEndLoc());
-      Location = UnaryOpExpr->getExprLoc();
-    } else if (MatchedDecl != BareMatchedDecl || SourceValue != Value) {
-      return;
-    }
-
-    diag(Location,
-         "the constant '%0' is being utilized; consider using '%1' instead")
-        << SourceValue << Replacement
-        << FixItHint::CreateReplacement(Range, Replacement)
-        << Inserter.createIncludeInsertion(
-               Result.SourceManager->getFileID(Location), "<limits>");
-  };
-
-  for (const auto &[Value, Replacement] : SignedConstants)
-    Fixer(MatchedIntegerConstant.getSExtValue(), Value, Replacement);
-
-  for (const auto &[Value, Replacement] : UnsignedConstants)
-    Fixer(MatchedIntegerConstant.getZExtValue(), Value, Replacement);
-}
-
-} // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.h b/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.h
deleted file mode 100644
index 0e7e9abb8562..000000000000
--- a/clang-tools-extra/clang-tidy/readability/UseNumericLimitsCheck.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===--- UseNumericLimitsCheck.h - clang-tidy -------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USENUMERICLIMITSCHECK_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USENUMERICLIMITSCHECK_H
-
-#include "../ClangTidyCheck.h"
-#include "../utils/IncludeInserter.h"
-
-namespace clang::tidy::readability {
-
-/// Finds certain integer literals and suggests replacing them with equivalent
-/// ``std::numeric_limits`` calls.
-/// For the user-facing documentation see:
-/// http://clang.llvm.org/extra/clang-tidy/checks/readability/use-numeric-limits.html
-class UseNumericLimitsCheck : public ClangTidyCheck {
-public:
-  UseNumericLimitsCheck(StringRef Name, ClangTidyContext *Context);
-  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
-  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
-  void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP,
-                           Preprocessor *ModuleExpanderPP) override;
-  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
-
-private:
-  const llvm::SmallVector<std::pair<int64_t, std::string>> SignedConstants;
-  const llvm::SmallVector<std::pair<uint64_t, std::string>> UnsignedConstants;
-  utils::IncludeInserter Inserter;
-};
-
-} // namespace clang::tidy::readability
-
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_USENUMERICLIMITSCHECK_H
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index a802b5fc6699..bcd843fc5179 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -154,12 +154,6 @@ New checks
   Finds potentially erroneous calls to ``reset`` method on smart pointers when
   the pointee type also has a ``reset`` method.
 
-- New :doc:`readability-use-numeric-limits
-  <clang-tidy/checks/readability/use-numeric-limits>` check.
-
-  Finds certain integer literals and suggests replacing them with equivalent
-  ``std::numeric_limits`` calls.
-
 New check aliases
 ^^^^^^^^^^^^^^^^^
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 57ae7d330a3c..ccb78ee45e9c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -409,7 +409,6 @@ Clang-Tidy Checks
    :doc:`readability-uniqueptr-delete-release <readability/uniqueptr-delete-release>`, "Yes"
    :doc:`readability-uppercase-literal-suffix <readability/uppercase-literal-suffix>`, "Yes"
    :doc:`readability-use-anyofallof <readability/use-anyofallof>`,
-   :doc:`readability-use-numeric-limits <readability/use-numeric-limits>`, "Yes"
    :doc:`readability-use-std-min-max <readability/use-std-min-max>`, "Yes"
    :doc:`zircon-temporary-objects <zircon/temporary-objects>`,
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/use-numeric-limits.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/use-numeric-limits.rst
deleted file mode 100644
index 0f6ca9f0cf2c..000000000000
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/use-numeric-limits.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. title:: clang-tidy - readability-use-numeric-limits
-
-readability-use-numeric-limits
-==============================
-
-Finds certain integer literals and suggests replacing them with equivalent
-``std::numeric_limits`` calls.
-
-Before:
-
-.. code-block:: c++
-
-  void foo() {
-    int32_t a = 2147483647;
-  }
-
-After:
-
-.. code-block:: c++
-
-  void foo() {
-    int32_t a = std::numeric_limits<int32_t>::max();
-  }
-
-Options
--------
-
-.. option:: IncludeStyle
-
-   A string specifying which include-style is used, `llvm` or `google`. Default
-   is `llvm`.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/use-numeric-limits.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/use-numeric-limits.cpp
deleted file mode 100644
index e02d6f1b7126..000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/use-numeric-limits.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-// RUN: %check_clang_tidy %s readability-use-numeric-limits %t
-// CHECK-FIXES: #include <limits>
-
-using int8_t = signed char;
-using int16_t = short;
-using int32_t = int;
-using int64_t = long long;
-using uint8_t = unsigned char;
-using uint16_t = unsigned short;
-using uint32_t = unsigned int;
-using uint64_t = unsigned long long;
-
-
-void Invalid() {
-  // CHECK-MESSAGES: :[[@LINE+2]]:14: warning: the constant '-128' is being utilized; consider using 'std::numeric_limits<int8_t>::min()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int8_t a = std::numeric_limits<int8_t>::min();
-  int8_t a = -128;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:14: warning: the constant '127' is being utilized; consider using 'std::numeric_limits<int8_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int8_t b = std::numeric_limits<int8_t>::max();
-  int8_t b = +127;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:14: warning: the constant '127' is being utilized; consider using 'std::numeric_limits<int8_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int8_t c = std::numeric_limits<int8_t>::max();
-  int8_t c = 127;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '-32768' is being utilized; consider using 'std::numeric_limits<int16_t>::min()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int16_t d = std::numeric_limits<int16_t>::min();
-  int16_t d = -32768;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '32767' is being utilized; consider using 'std::numeric_limits<int16_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int16_t e = std::numeric_limits<int16_t>::max();
-  int16_t e = +32767;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '32767' is being utilized; consider using 'std::numeric_limits<int16_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int16_t f = std::numeric_limits<int16_t>::max();
-  int16_t f = 32767;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '-2147483648' is being utilized; consider using 'std::numeric_limits<int32_t>::min()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int32_t g = std::numeric_limits<int32_t>::min();
-  int32_t g = -2147483648;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '2147483647' is being utilized; consider using 'std::numeric_limits<int32_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int32_t h = std::numeric_limits<int32_t>::max();
-  int32_t h = +2147483647;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '2147483647' is being utilized; consider using 'std::numeric_limits<int32_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int32_t i = std::numeric_limits<int32_t>::max();
-  int32_t i = 2147483647;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '-9223372036854775808' is being utilized; consider using 'std::numeric_limits<int64_t>::min()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int64_t j = std::numeric_limits<int64_t>::min();
-  int64_t j = -9223372036854775808;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '9223372036854775807' is being utilized; consider using 'std::numeric_limits<int64_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int64_t k = std::numeric_limits<int64_t>::max();
-  int64_t k = +9223372036854775807;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '9223372036854775807' is being utilized; consider using 'std::numeric_limits<int64_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: int64_t l = std::numeric_limits<int64_t>::max();
-  int64_t l = 9223372036854775807;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '255' is being utilized; consider using 'std::numeric_limits<uint8_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: uint8_t m = std::numeric_limits<uint8_t>::max();
-  uint8_t m = 255;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:15: warning: the constant '255' is being utilized; consider using 'std::numeric_limits<uint8_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: uint8_t n = std::numeric_limits<uint8_t>::max();
-  uint8_t n = +255;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '65535' is being utilized; consider using 'std::numeric_limits<uint16_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: uint16_t o = std::numeric_limits<uint16_t>::max();
-  uint16_t o = 65535;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '65535' is being utilized; consider using 'std::numeric_limits<uint16_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: uint16_t p = std::numeric_limits<uint16_t>::max();
-  uint16_t p = +65535;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '4294967295' is being utilized; consider using 'std::numeric_limits<uint32_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: uint32_t q = std::numeric_limits<uint32_t>::max();
-  uint32_t q = 4294967295;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '4294967295' is being utilized; consider using 'std::numeric_limits<uint32_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: uint32_t r = std::numeric_limits<uint32_t>::max();
-  uint32_t r = +4294967295;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '18446744073709551615' is being utilized; consider using 'std::numeric_limits<uint64_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: uint64_t s = std::numeric_limits<uint64_t>::max();
-  uint64_t s = 18446744073709551615;
-
-  // CHECK-MESSAGES: :[[@LINE+2]]:16: warning: the constant '18446744073709551615' is being utilized; consider using 'std::numeric_limits<uint64_t>::max()' instead [readability-use-numeric-limits]
-  // CHECK-FIXES: uint64_t t = std::numeric_limits<uint64_t>::max();
-  uint64_t t = +18446744073709551615;
-}
-
-void Valid(){
-  int16_t a = +128;
-
-  int16_t b = -127;
-}

From a0329eaa0cdcc86046c66533feb002b227553c33 Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Mon, 23 Jun 2025 18:41:48 +0200
Subject: [PATCH 1308/1322] [lldb] upgrade HandleFrameFormatVariable callees to
 llvm::Expected (#144731)

Upgrade the callees of `HandleFrameFormatVariable`
(`GetDemangledTemplateArguments`, etc), to return a `llvm::Expected`
instead of an `std::optional`.

This patch also bundles the logic of validating the demangled name and
information into a single reusable function to reduce code duplication.
---
 lldb/include/lldb/Core/DemangledNameInfo.h    |  11 +-
 .../Language/CPlusPlus/CPlusPlusLanguage.cpp  | 321 +++++++++---------
 lldb/unittests/Core/MangledTest.cpp           |  24 +-
 3 files changed, 172 insertions(+), 184 deletions(-)

diff --git a/lldb/include/lldb/Core/DemangledNameInfo.h b/lldb/include/lldb/Core/DemangledNameInfo.h
index a2f3fde90c61..9f567232dc50 100644
--- a/lldb/include/lldb/Core/DemangledNameInfo.h
+++ b/lldb/include/lldb/Core/DemangledNameInfo.h
@@ -71,27 +71,28 @@ struct DemangledNameInfo {
 
   /// Returns \c true if this object holds a valid basename range.
   bool hasBasename() const {
+    // A function always has a name.
     return BasenameRange.second > BasenameRange.first;
   }
 
   /// Returns \c true if this object holds a valid scope range.
-  bool hasScope() const { return ScopeRange.second > ScopeRange.first; }
+  bool hasScope() const { return ScopeRange.second >= ScopeRange.first; }
 
   /// Returns \c true if this object holds a valid arguments range.
   bool hasArguments() const {
-    return ArgumentsRange.second > ArgumentsRange.first;
+    return ArgumentsRange.second >= ArgumentsRange.first;
   }
 
   /// Returns \c true if this object holds a valid qualifiers range.
   bool hasQualifiers() const {
-    return QualifiersRange.second > QualifiersRange.first;
+    return QualifiersRange.second >= QualifiersRange.first;
   }
 
   /// Returns \c true if this object holds a valid prefix range.
-  bool hasPrefix() const { return PrefixRange.second > PrefixRange.first; }
+  bool hasPrefix() const { return PrefixRange.second >= PrefixRange.first; }
 
   /// Returns \c true if this object holds a valid suffix range.
-  bool hasSuffix() const { return SuffixRange.second > SuffixRange.first; }
+  bool hasSuffix() const { return SuffixRange.second >= SuffixRange.first; }
 };
 
 /// An OutputBuffer which keeps a record of where certain parts of a
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
index 0f18abb47591..7ae2e141a63e 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
@@ -236,199 +236,158 @@ static bool PrettyPrintFunctionNameWithArgs(Stream &out_stream,
   return true;
 }
 
-static std::optional<llvm::StringRef>
+static llvm::Expected<std::pair<llvm::StringRef, DemangledNameInfo>>
+GetAndValidateInfo(const SymbolContext &sc) {
+  Mangled mangled = sc.GetPossiblyInlinedFunctionName();
+  if (!mangled)
+    return llvm::createStringError("Function does not have a mangled name.");
+
+  auto demangled_name = mangled.GetDemangledName().GetStringRef();
+  if (demangled_name.empty())
+    return llvm::createStringError(
+        "Function '%s' does not have a demangled name.",
+        mangled.GetMangledName().AsCString(""));
+
+  const std::optional<DemangledNameInfo> &info = mangled.GetDemangledInfo();
+  if (!info)
+    return llvm::createStringError(
+        "Function '%s' does not have demangled info.", demangled_name.data());
+
+  // Function without a basename is nonsense.
+  if (!info->hasBasename())
+    return llvm::createStringError(
+        "DemangledInfo for '%s does not have basename range.",
+        demangled_name.data());
+
+  return std::make_pair(demangled_name, *info);
+}
+
+static llvm::Expected<llvm::StringRef>
 GetDemangledBasename(const SymbolContext &sc) {
-  Mangled mangled = sc.GetPossiblyInlinedFunctionName();
-  if (!mangled)
-    return std::nullopt;
+  auto info_or_err = GetAndValidateInfo(sc);
+  if (!info_or_err)
+    return info_or_err.takeError();
 
-  auto demangled_name = mangled.GetDemangledName().GetStringRef();
-  if (demangled_name.empty())
-    return std::nullopt;
+  auto [demangled_name, info] = *info_or_err;
 
-  const std::optional<DemangledNameInfo> &info = mangled.GetDemangledInfo();
-  if (!info)
-    return std::nullopt;
-
-  // Function without a basename is nonsense.
-  if (!info->hasBasename())
-    return std::nullopt;
-
-  return demangled_name.slice(info->BasenameRange.first,
-                              info->BasenameRange.second);
+  return demangled_name.slice(info.BasenameRange.first,
+                              info.BasenameRange.second);
 }
 
-static std::optional<llvm::StringRef>
+static llvm::Expected<llvm::StringRef>
 GetDemangledTemplateArguments(const SymbolContext &sc) {
-  Mangled mangled = sc.GetPossiblyInlinedFunctionName();
-  if (!mangled)
-    return std::nullopt;
+  auto info_or_err = GetAndValidateInfo(sc);
+  if (!info_or_err)
+    return info_or_err.takeError();
 
-  auto demangled_name = mangled.GetDemangledName().GetStringRef();
-  if (demangled_name.empty())
-    return std::nullopt;
+  auto [demangled_name, info] = *info_or_err;
 
-  const std::optional<DemangledNameInfo> &info = mangled.GetDemangledInfo();
-  if (!info)
-    return std::nullopt;
+  if (info.ArgumentsRange.first < info.BasenameRange.second)
+    return llvm::createStringError("Arguments range for '%s' is invalid.",
+                                   demangled_name.data());
 
-  // Function without a basename is nonsense.
-  if (!info->hasBasename())
-    return std::nullopt;
-
-  if (info->ArgumentsRange.first < info->BasenameRange.second)
-    return std::nullopt;
-
-  return demangled_name.slice(info->BasenameRange.second,
-                              info->ArgumentsRange.first);
+  return demangled_name.slice(info.BasenameRange.second,
+                              info.ArgumentsRange.first);
 }
 
-static std::optional<llvm::StringRef>
+static llvm::Expected<llvm::StringRef>
 GetDemangledReturnTypeLHS(const SymbolContext &sc) {
-  Mangled mangled = sc.GetPossiblyInlinedFunctionName();
-  if (!mangled)
-    return std::nullopt;
+  auto info_or_err = GetAndValidateInfo(sc);
+  if (!info_or_err)
+    return info_or_err.takeError();
 
-  auto demangled_name = mangled.GetDemangledName().GetStringRef();
-  if (demangled_name.empty())
-    return std::nullopt;
+  auto [demangled_name, info] = *info_or_err;
 
-  const std::optional<DemangledNameInfo> &info = mangled.GetDemangledInfo();
-  if (!info)
-    return std::nullopt;
+  if (info.ScopeRange.first >= demangled_name.size())
+    return llvm::createStringError(
+        "Scope range for '%s' LHS return type is invalid.",
+        demangled_name.data());
 
-  // Function without a basename is nonsense.
-  if (!info->hasBasename())
-    return std::nullopt;
-
-  if (info->ScopeRange.first >= demangled_name.size())
-    return std::nullopt;
-
-  return demangled_name.substr(0, info->ScopeRange.first);
+  return demangled_name.substr(0, info.ScopeRange.first);
 }
 
-static std::optional<llvm::StringRef>
+static llvm::Expected<llvm::StringRef>
 GetDemangledFunctionQualifiers(const SymbolContext &sc) {
-  Mangled mangled = sc.GetPossiblyInlinedFunctionName();
-  if (!mangled)
-    return std::nullopt;
+  auto info_or_err = GetAndValidateInfo(sc);
+  if (!info_or_err)
+    return info_or_err.takeError();
 
-  auto demangled_name = mangled.GetDemangledName().GetStringRef();
-  if (demangled_name.empty())
-    return std::nullopt;
+  auto [demangled_name, info] = *info_or_err;
 
-  const std::optional<DemangledNameInfo> &info = mangled.GetDemangledInfo();
-  if (!info)
-    return std::nullopt;
+  if (!info.hasQualifiers())
+    return llvm::createStringError("Qualifiers range for '%s' is invalid.",
+                                   demangled_name.data());
 
-  // Function without a basename is nonsense.
-  if (!info->hasBasename())
-    return std::nullopt;
-
-  if (info->QualifiersRange.second < info->QualifiersRange.first)
-    return std::nullopt;
-
-  return demangled_name.slice(info->QualifiersRange.first,
-                              info->QualifiersRange.second);
+  return demangled_name.slice(info.QualifiersRange.first,
+                              info.QualifiersRange.second);
 }
 
-static std::optional<llvm::StringRef>
+static llvm::Expected<llvm::StringRef>
 GetDemangledReturnTypeRHS(const SymbolContext &sc) {
-  Mangled mangled = sc.GetPossiblyInlinedFunctionName();
-  if (!mangled)
-    return std::nullopt;
+  auto info_or_err = GetAndValidateInfo(sc);
+  if (!info_or_err)
+    return info_or_err.takeError();
 
-  auto demangled_name = mangled.GetDemangledName().GetStringRef();
-  if (demangled_name.empty())
-    return std::nullopt;
+  auto [demangled_name, info] = *info_or_err;
 
-  const std::optional<DemangledNameInfo> &info = mangled.GetDemangledInfo();
-  if (!info)
-    return std::nullopt;
+  if (info.QualifiersRange.first < info.ArgumentsRange.second)
+    return llvm::createStringError(
+        "Qualifiers range for '%s' RHS return type  is invalid.",
+        demangled_name.data());
 
-  // Function without a basename is nonsense.
-  if (!info->hasBasename())
-    return std::nullopt;
-
-  if (info->QualifiersRange.first < info->ArgumentsRange.second)
-    return std::nullopt;
-
-  return demangled_name.slice(info->ArgumentsRange.second,
-                              info->QualifiersRange.first);
+  return demangled_name.slice(info.ArgumentsRange.second,
+                              info.QualifiersRange.first);
 }
 
-static std::optional<llvm::StringRef>
+static llvm::Expected<llvm::StringRef>
 GetDemangledScope(const SymbolContext &sc) {
-  Mangled mangled = sc.GetPossiblyInlinedFunctionName();
-  if (!mangled)
-    return std::nullopt;
+  auto info_or_err = GetAndValidateInfo(sc);
+  if (!info_or_err)
+    return info_or_err.takeError();
 
-  auto demangled_name = mangled.GetDemangledName().GetStringRef();
-  if (demangled_name.empty())
-    return std::nullopt;
+  auto [demangled_name, info] = *info_or_err;
 
-  const std::optional<DemangledNameInfo> &info = mangled.GetDemangledInfo();
-  if (!info)
-    return std::nullopt;
+  if (!info.hasScope())
+    return llvm::createStringError("Scope range for '%s' is invalid.",
+                                   demangled_name.data());
 
-  // Function without a basename is nonsense.
-  if (!info->hasBasename())
-    return std::nullopt;
-
-  if (info->ScopeRange.second < info->ScopeRange.first)
-    return std::nullopt;
-
-  return demangled_name.slice(info->ScopeRange.first, info->ScopeRange.second);
+  return demangled_name.slice(info.ScopeRange.first, info.ScopeRange.second);
 }
 
 /// Handles anything printed after the FunctionEncoding ItaniumDemangle
 /// node. Most notably the DotSUffix node.
-static std::optional<llvm::StringRef>
+static llvm::Expected<llvm::StringRef>
 GetDemangledFunctionSuffix(const SymbolContext &sc) {
-  Mangled mangled = sc.GetPossiblyInlinedFunctionName();
-  if (!mangled)
-    return std::nullopt;
+  auto info_or_err = GetAndValidateInfo(sc);
+  if (!info_or_err)
+    return info_or_err.takeError();
 
-  auto demangled_name = mangled.GetDemangledName().GetStringRef();
-  if (demangled_name.empty())
-    return std::nullopt;
+  auto [demangled_name, info] = *info_or_err;
 
-  const std::optional<DemangledNameInfo> &info = mangled.GetDemangledInfo();
-  if (!info)
-    return std::nullopt;
+  if (!info.hasSuffix())
+    return llvm::createStringError("Suffix range for '%s' is invalid.",
+                                   demangled_name.data());
 
-  // Function without a basename is nonsense.
-  if (!info->hasBasename())
-    return std::nullopt;
-
-  return demangled_name.slice(info->SuffixRange.first,
-                              info->SuffixRange.second);
+  return demangled_name.slice(info.SuffixRange.first, info.SuffixRange.second);
 }
 
 static bool PrintDemangledArgumentList(Stream &s, const SymbolContext &sc) {
   assert(sc.symbol);
 
-  Mangled mangled = sc.GetPossiblyInlinedFunctionName();
-  if (!mangled)
+  auto info_or_err = GetAndValidateInfo(sc);
+  if (!info_or_err) {
+    LLDB_LOG_ERROR(
+        GetLog(LLDBLog::Language), info_or_err.takeError(),
+        "Failed to handle ${{function.basename}} frame-format variable: {0}");
+    return false;
+  }
+  auto [demangled_name, info] = *info_or_err;
+
+  if (!info.hasArguments())
     return false;
 
-  auto demangled_name = mangled.GetDemangledName().GetStringRef();
-  if (demangled_name.empty())
-    return false;
-
-  const std::optional<DemangledNameInfo> &info = mangled.GetDemangledInfo();
-  if (!info)
-    return false;
-
-  // Function without a basename is nonsense.
-  if (!info->hasBasename())
-    return false;
-
-  if (info->ArgumentsRange.second < info->ArgumentsRange.first)
-    return false;
-
-  s << demangled_name.slice(info->ArgumentsRange.first,
-                            info->ArgumentsRange.second);
+  s << demangled_name.slice(info.ArgumentsRange.first,
+                            info.ArgumentsRange.second);
 
   return true;
 }
@@ -1954,32 +1913,44 @@ bool CPlusPlusLanguage::HandleFrameFormatVariable(
     FormatEntity::Entry::Type type, Stream &s) {
   switch (type) {
   case FormatEntity::Entry::Type::FunctionScope: {
-    std::optional<llvm::StringRef> scope = GetDemangledScope(sc);
-    if (!scope)
+    auto scope_or_err = GetDemangledScope(sc);
+    if (!scope_or_err) {
+      LLDB_LOG_ERROR(
+          GetLog(LLDBLog::Language), scope_or_err.takeError(),
+          "Failed to handle ${{function.scope}} frame-format variable: {0}");
       return false;
+    }
 
-    s << *scope;
+    s << *scope_or_err;
 
     return true;
   }
 
   case FormatEntity::Entry::Type::FunctionBasename: {
-    std::optional<llvm::StringRef> name = GetDemangledBasename(sc);
-    if (!name)
+    auto name_or_err = GetDemangledBasename(sc);
+    if (!name_or_err) {
+      LLDB_LOG_ERROR(
+          GetLog(LLDBLog::Language), name_or_err.takeError(),
+          "Failed to handle ${{function.basename}} frame-format variable: {0}");
       return false;
+    }
 
-    s << *name;
+    s << *name_or_err;
 
     return true;
   }
 
   case FormatEntity::Entry::Type::FunctionTemplateArguments: {
-    std::optional<llvm::StringRef> template_args =
-        GetDemangledTemplateArguments(sc);
-    if (!template_args)
+    auto template_args_or_err = GetDemangledTemplateArguments(sc);
+    if (!template_args_or_err) {
+      LLDB_LOG_ERROR(GetLog(LLDBLog::Language),
+                     template_args_or_err.takeError(),
+                     "Failed to handle ${{function.template-arguments}} "
+                     "frame-format variable: {0}");
       return false;
+    }
 
-    s << *template_args;
+    s << *template_args_or_err;
 
     return true;
   }
@@ -2008,38 +1979,54 @@ bool CPlusPlusLanguage::HandleFrameFormatVariable(
     return true;
   }
   case FormatEntity::Entry::Type::FunctionReturnRight: {
-    std::optional<llvm::StringRef> return_rhs = GetDemangledReturnTypeRHS(sc);
-    if (!return_rhs)
+    auto return_rhs_or_err = GetDemangledReturnTypeRHS(sc);
+    if (!return_rhs_or_err) {
+      LLDB_LOG_ERROR(GetLog(LLDBLog::Language), return_rhs_or_err.takeError(),
+                     "Failed to handle ${{function.return-right}} frame-format "
+                     "variable: {0}");
       return false;
+    }
 
-    s << *return_rhs;
+    s << *return_rhs_or_err;
 
     return true;
   }
   case FormatEntity::Entry::Type::FunctionReturnLeft: {
-    std::optional<llvm::StringRef> return_lhs = GetDemangledReturnTypeLHS(sc);
-    if (!return_lhs)
+    auto return_lhs_or_err = GetDemangledReturnTypeLHS(sc);
+    if (!return_lhs_or_err) {
+      LLDB_LOG_ERROR(GetLog(LLDBLog::Language), return_lhs_or_err.takeError(),
+                     "Failed to handle ${{function.return-left}} frame-format "
+                     "variable: {0}");
       return false;
+    }
 
-    s << *return_lhs;
+    s << *return_lhs_or_err;
 
     return true;
   }
   case FormatEntity::Entry::Type::FunctionQualifiers: {
-    std::optional<llvm::StringRef> quals = GetDemangledFunctionQualifiers(sc);
-    if (!quals)
+    auto quals_or_err = GetDemangledFunctionQualifiers(sc);
+    if (!quals_or_err) {
+      LLDB_LOG_ERROR(GetLog(LLDBLog::Language), quals_or_err.takeError(),
+                     "Failed to handle ${{function.qualifiers}} frame-format "
+                     "variable: {0}");
       return false;
+    }
 
-    s << *quals;
+    s << *quals_or_err;
 
     return true;
   }
   case FormatEntity::Entry::Type::FunctionSuffix: {
-    std::optional<llvm::StringRef> suffix = GetDemangledFunctionSuffix(sc);
-    if (!suffix)
+    auto suffix_or_err = GetDemangledFunctionSuffix(sc);
+    if (!suffix_or_err) {
+      LLDB_LOG_ERROR(
+          GetLog(LLDBLog::Language), suffix_or_err.takeError(),
+          "Failed to handle ${{function.suffix}} frame-format variable: {0}");
       return false;
+    }
 
-    s << *suffix;
+    s << *suffix_or_err;
 
     return true;
   }
diff --git a/lldb/unittests/Core/MangledTest.cpp b/lldb/unittests/Core/MangledTest.cpp
index 5994d6072481..4bda65704754 100644
--- a/lldb/unittests/Core/MangledTest.cpp
+++ b/lldb/unittests/Core/MangledTest.cpp
@@ -648,43 +648,43 @@ struct DemangledNameInfoTestCase {
 DemangledNameInfoTestCase g_demangled_name_info_test_cases[] = {
     // clang-format off
    {
-    { /*.BasenameRange=*/{0, 10}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 0},
-      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0}
+    { /*.BasenameRange=*/{0, 10}, /*.ScopeRange=*/{1, 0}, /*.ArgumentsRange=*/{1, 0},
+      /*.QualifiersRange=*/{1, 0}, /*.PrefixRange=*/{1, 0}, /*.SuffixRange=*/{1, 0}
     },
       /*valid_basename=*/true, /*valid_scope=*/false, /*valid_arguments=*/false,
       /*valid_qualifiers=*/false, /*valid_prefix=*/false, /*valid_suffix=*/false,
    },
    {
-    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 10}, /*.ArgumentsRange=*/{0, 0},
-      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0}
+    { /*.BasenameRange=*/{1, 0}, /*.ScopeRange=*/{0, 10}, /*.ArgumentsRange=*/{1, 0},
+      /*.QualifiersRange=*/{1, 0}, /*.PrefixRange=*/{1, 0}, /*.SuffixRange=*/{1, 0}
     },
       /*valid_basename=*/false, /*valid_scope=*/true, /*valid_arguments=*/false,
       /*valid_qualifiers=*/false, /*valid_prefix=*/false, /*valid_suffix=*/false,
    },
    {
-    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 10},
-      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0}
+    { /*.BasenameRange=*/{1, 0}, /*.ScopeRange=*/{1, 0}, /*.ArgumentsRange=*/{0, 10},
+      /*.QualifiersRange=*/{1, 0}, /*.PrefixRange=*/{1, 0}, /*.SuffixRange=*/{1, 0}
     },
       /*valid_basename=*/false, /*valid_scope=*/false, /*valid_arguments=*/true,
       /*valid_qualifiers=*/false, /*valid_prefix=*/false, /*valid_suffix=*/false,
    },
    {
-    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 0},
-      /*.QualifiersRange=*/{0, 10}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 0}
+    { /*.BasenameRange=*/{1, 0}, /*.ScopeRange=*/{1, 0}, /*.ArgumentsRange=*/{1, 0},
+      /*.QualifiersRange=*/{0, 10}, /*.PrefixRange=*/{1, 0}, /*.SuffixRange=*/{1, 0}
     },
       /*valid_basename=*/false, /*valid_scope=*/false, /*valid_arguments=*/false,
       /*valid_qualifiers=*/true, /*valid_prefix=*/false, /*valid_suffix=*/false,
    },
    {
-    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 0},
-      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 10}, /*.SuffixRange=*/{0, 0}
+    { /*.BasenameRange=*/{1, 0}, /*.ScopeRange=*/{1, 0}, /*.ArgumentsRange=*/{1, 0},
+      /*.QualifiersRange=*/{1, 0}, /*.PrefixRange=*/{0, 10}, /*.SuffixRange=*/{1, 0}
     },
       /*valid_basename=*/false, /*valid_scope=*/false, /*valid_arguments=*/false,
       /*valid_qualifiers=*/false, /*valid_prefix=*/true, /*valid_suffix=*/false,
    },
    {
-    { /*.BasenameRange=*/{0, 0}, /*.ScopeRange=*/{0, 0}, /*.ArgumentsRange=*/{0, 0},
-      /*.QualifiersRange=*/{0, 0}, /*.PrefixRange=*/{0, 0}, /*.SuffixRange=*/{0, 10}
+    { /*.BasenameRange=*/{1, 0}, /*.ScopeRange=*/{1, 0}, /*.ArgumentsRange=*/{1, 0},
+      /*.QualifiersRange=*/{1, 0}, /*.PrefixRange=*/{1, 0}, /*.SuffixRange=*/{0, 10}
     },
       /*valid_basename=*/false, /*valid_scope=*/false, /*valid_arguments=*/false,
       /*valid_qualifiers=*/false, /*valid_prefix=*/false, /*valid_suffix=*/true,

From 13bb7948c91404d03bad017b601386493bbb1760 Mon Sep 17 00:00:00 2001
From: Jack Frankland <jack.frankland@arm.com>
Date: Mon, 23 Jun 2025 17:44:33 +0100
Subject: [PATCH 1309/1322] [mlir][spirv]: Add Image to Vulkan Storage Class
 Map (#144899)

Extend the "storage class" <-> "memory space" map for the Vulkan SPIR-V
environment to include the Image class. 12 is chosen as the next
available value in the MemRef memory space list.

Signed-off-by: Jack Frankland <jack.frankland@arm.com>
---
 .../MapMemRefStorageClassPass.cpp             |  3 +-
 .../MemRefToSPIRV/map-storage-class-vk.mlir   | 27 ++++++++++++++++
 .../MemRefToSPIRV/map-storage-class.mlir      | 32 ++++++++++++++++++-
 3 files changed, 60 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Conversion/MemRefToSPIRV/map-storage-class-vk.mlir

diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MapMemRefStorageClassPass.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MapMemRefStorageClassPass.cpp
index 4cbc3dfdae22..1fbc5a03987e 100644
--- a/mlir/lib/Conversion/MemRefToSPIRV/MapMemRefStorageClassPass.cpp
+++ b/mlir/lib/Conversion/MemRefToSPIRV/MapMemRefStorageClassPass.cpp
@@ -59,7 +59,8 @@ using namespace mlir;
   MAP_FN(spirv::StorageClass::UniformConstant, 8)                              \
   MAP_FN(spirv::StorageClass::Input, 9)                                        \
   MAP_FN(spirv::StorageClass::Output, 10)                                      \
-  MAP_FN(spirv::StorageClass::PhysicalStorageBuffer, 11)
+  MAP_FN(spirv::StorageClass::PhysicalStorageBuffer, 11)                       \
+  MAP_FN(spirv::StorageClass::Image, 12)
 
 std::optional<spirv::StorageClass>
 spirv::mapMemorySpaceToVulkanStorageClass(Attribute memorySpaceAttr) {
diff --git a/mlir/test/Conversion/MemRefToSPIRV/map-storage-class-vk.mlir b/mlir/test/Conversion/MemRefToSPIRV/map-storage-class-vk.mlir
new file mode 100644
index 000000000000..3b2c1ae799c5
--- /dev/null
+++ b/mlir/test/Conversion/MemRefToSPIRV/map-storage-class-vk.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt --allow-unregistered-dialect --map-memref-spirv-storage-class='client-api=vulkan' %s | FileCheck %s
+
+// Vulkan Specific Mappings:
+//   8 -> UniformConstant
+//   9 -> Input
+//   10 -> Output
+//   11 -> PhysicalStorageBuffer
+//   12 -> Image
+
+/// Check that Vulkan specific memory space indices get converted into the correct
+/// SPIR-V storage class. If mappings to OpenCL address spaces are added for these
+/// indices then those test case should be moved into the common test file.
+
+// CHECK-LABEL: func @test_vk_specific_memory_spaces
+func.func @test_vk_specific_memory_spaces() {
+  // CHECK: memref<4xi32, #spirv.storage_class<UniformConstant>>
+  %1 = "dialect.memref_producer"() : () -> (memref<4xi32, 8>)
+  // CHECK: memref<4xi32, #spirv.storage_class<Input>>
+  %2 = "dialect.memref_producer"() : () -> (memref<4xi32, 9>)
+  // CHECK: memref<4xi32, #spirv.storage_class<Output>>
+  %3 = "dialect.memref_producer"() : () -> (memref<4xi32, 10>)
+  // CHECK: memref<4xi32, #spirv.storage_class<PhysicalStorageBuffer>>
+  %4 = "dialect.memref_producer"() : () -> (memref<4xi32, 11>)
+  // CHECK: memref<4xi32, #spirv.storage_class<Image>>
+  %5 = "dialect.memref_producer"() : () -> (memref<4xi32, 12>)
+  return
+}
diff --git a/mlir/test/Conversion/MemRefToSPIRV/map-storage-class.mlir b/mlir/test/Conversion/MemRefToSPIRV/map-storage-class.mlir
index f0956b62760a..fdc69b811999 100644
--- a/mlir/test/Conversion/MemRefToSPIRV/map-storage-class.mlir
+++ b/mlir/test/Conversion/MemRefToSPIRV/map-storage-class.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-opt -split-input-file -allow-unregistered-dialect -map-memref-spirv-storage-class='client-api=vulkan' -verify-diagnostics %s -o - | FileCheck %s --check-prefix=VULKAN
 // RUN: mlir-opt -split-input-file -allow-unregistered-dialect -map-memref-spirv-storage-class='client-api=opencl' -verify-diagnostics %s -o - | FileCheck %s --check-prefix=OPENCL
+// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -map-memref-spirv-storage-class -verify-diagnostics %s -o - | FileCheck %s
 
 // Vulkan Mappings:
 //   0 -> StorageBuffer
@@ -7,6 +8,14 @@
 //   2 -> [null]
 //   3 -> Workgroup
 //   4 -> Uniform
+//   5 -> Private
+//   6 -> Function
+//   7 -> PushConstant
+//   8 -> UniformConstant
+//   9 -> Input
+//   10 -> Output
+//   11 -> PhysicalStorageBuffer
+//   12 -> Image
 
 // OpenCL Mappings:
 //   0 -> CrossWorkgroup
@@ -14,6 +23,9 @@
 //   2 -> [null]
 //   3 -> Workgroup
 //   4 -> UniformConstant
+//   5 -> Private
+//   6 -> Function
+//   7 -> Image
 
 // VULKAN-LABEL: func @operand_result
 // OPENCL-LABEL: func @operand_result
@@ -30,6 +42,15 @@ func.func @operand_result() {
   // VULKAN: memref<*xf16, #spirv.storage_class<Uniform>>
   // OPENCL: memref<*xf16, #spirv.storage_class<UniformConstant>>
   %3 = "dialect.memref_producer"() : () -> (memref<*xf16, 4>)
+  // VULKAN: memref<*xf16, #spirv.storage_class<Private>>
+  // OPENCL: memref<*xf16, #spirv.storage_class<Private>>
+  %4 = "dialect.memref_producer"() : () -> (memref<*xf16, 5>)
+  // VULKAN: memref<*xf16, #spirv.storage_class<Function>>
+  // OPENCL: memref<*xf16, #spirv.storage_class<Function>>
+  %5 = "dialect.memref_producer"() : () -> (memref<*xf16, 6>)
+  // VULKAN: memref<*xf16, #spirv.storage_class<PushConstant>>
+  // OPENCL: memref<*xf16, #spirv.storage_class<Image>>
+  %6 = "dialect.memref_producer"() : () -> (memref<*xf16, 7>)
 
 
   "dialect.memref_consumer"(%0) : (memref<f32>) -> ()
@@ -42,6 +63,15 @@ func.func @operand_result() {
   // VULKAN: memref<*xf16, #spirv.storage_class<Uniform>>
   // OPENCL: memref<*xf16, #spirv.storage_class<UniformConstant>>
   "dialect.memref_consumer"(%3) : (memref<*xf16, 4>) -> ()
+  // VULKAN: memref<*xf16, #spirv.storage_class<Private>>
+  // OPENCL: memref<*xf16, #spirv.storage_class<Private>>
+  "dialect.memref_consumer"(%4) : (memref<*xf16, 5>) -> ()
+  // VULKAN: memref<*xf16, #spirv.storage_class<Function>>
+  // OPENCL: memref<*xf16, #spirv.storage_class<Function>>
+  "dialect.memref_consumer"(%5) : (memref<*xf16, 6>) -> ()
+  // VULKAN: memref<*xf16, #spirv.storage_class<PushConstant>>
+  // OPENCL: memref<*xf16, #spirv.storage_class<Image>>
+  "dialect.memref_consumer"(%6) : (memref<*xf16, 7>) -> ()
 
   return
 }
@@ -166,4 +196,4 @@ func.func @operand_result() {
   "dialect.memref_consumer"(%3) : (memref<*xf16, 4>) -> ()
   return
 }
-}
\ No newline at end of file
+}

From 7a33569510535f0b917a2e50f644bf57490aee24 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Mon, 23 Jun 2025 09:47:50 -0700
Subject: [PATCH 1310/1322] [RISCV] Factor out common SiFive7 scheduling model
 into an abstraction layer (#144442)

In preparation for sifive-x390's scheduling model, which shares quite a
lot with the existing SiFive7 scheduling model, this patch factors out
some of the components that will share between them. Notably:

- Processor resource definitions (i.e. pipes) are factored out into a
multiclass, `SiFive7ProcResources`. Similarly, WriteRes entries and
bypass entries (i.e. ReadAdvance) are also factored out into their own
multiclass: `SiFive7WriteResBase` and `SiFive7ReadAdvance`,
respectively.
- The aforementioned three components, `SiFive7ProcResources`,
`SiFive7WriteResBase`, and `SiFive7ReadAdvance` are encapsulated into a
bigger multiclass, `SiFive7SchedResources`, which configures these
components with parameters passed from the template arguments. An
example configure value would be the VLEN.
- SiFive7's SchedMachineModel carries not only standard fields like
issue width, but also the concrete config values corresponding to the
processor. For instance, the existing SiFive7 models has VLEN=512, while
X390 has VLEN=1024.
- In the final phase, we "bind" SchedMachineModel from each processor to
a SiFive7SchedResources that is instantiated from that
SchedMachineModel's config values.

Co-authored-by: Michael Maitland <michaeltmaitland@gmail.com>
---
 llvm/lib/Target/RISCV/RISCVSchedSiFive7.td    | 2275 +++++++++--------
 .../tools/llvm-mca/RISCV/SiFive7/div-fdiv.s   |   16 +-
 .../llvm-mca/RISCV/SiFive7/gpr-bypass-c.s     |   16 +-
 .../tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s |   16 +-
 .../RISCV/SiFive7/instruction-tables-tests.s  |  264 +-
 llvm/test/tools/llvm-mca/RISCV/SiFive7/jump.s |   16 +-
 .../SiFiveX280/different-lmul-instruments.s   |   16 +-
 .../SiFiveX280/different-sew-instruments.s    |   16 +-
 .../llvm-mca/RISCV/SiFiveX280/disable-im.s    |   16 +-
 .../RISCV/SiFiveX280/fractional-lmul-data.s   |   16 +-
 .../SiFiveX280/lmul-instrument-at-start.s     |   16 +-
 .../SiFiveX280/lmul-instrument-in-middle.s    |   16 +-
 .../SiFiveX280/lmul-instrument-in-region.s    |   16 +-
 .../lmul-instrument-straddles-region.s        |   16 +-
 .../multiple-same-lmul-instruments.s          |   16 +-
 .../multiple-same-sew-instruments.s           |   16 +-
 .../SiFiveX280/needs-sew-but-only-lmul.s      |   16 +-
 .../RISCV/SiFiveX280/no-vsetvli-to-start.s    |   16 +-
 .../llvm-mca/RISCV/SiFiveX280/reductions.s    |   18 +-
 .../SiFiveX280/sew-instrument-at-start.s      |   16 +-
 .../SiFiveX280/sew-instrument-in-middle.s     |   18 +-
 .../SiFiveX280/sew-instrument-in-region.s     |   16 +-
 .../sew-instrument-straddles-region.s         |   16 +-
 .../RISCV/SiFiveX280/strided-load-store.s     |   16 +-
 .../RISCV/SiFiveX280/strided-load-x0.s        |   16 +-
 .../SiFiveX280/vector-integer-arithmetic.s    |   18 +-
 .../tools/llvm-mca/RISCV/SiFiveX280/vle-vse.s |   16 +-
 .../SiFiveX280/vsetivli-lmul-instrument.s     |   16 +-
 .../SiFiveX280/vsetivli-lmul-sew-instrument.s |   16 +-
 .../SiFiveX280/vsetvli-lmul-instrument.s      |   16 +-
 .../SiFiveX280/vsetvli-lmul-sew-instrument.s  |   16 +-
 31 files changed, 1538 insertions(+), 1471 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index c1d7cd4a716e..071b64571fe3 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -186,8 +186,1166 @@ class SiFive7AnyToGPRBypass<SchedRead read, int cycles = 2>
                                  WriteIRem, WriteIRem32,
                                  WriteLDB, WriteLDH, WriteLDW, WriteLDD]>;
 
-// SiFive7 machine model for scheduling and other instruction cost heuristics.
-def SiFive7Model : SchedMachineModel {
+// The SiFive7 microarchitecture has three pipelines: A, B, V.
+// Pipe A can handle memory, integer alu and vector operations.
+// Pipe B can handle integer alu, control flow, integer multiply and divide,
+// and floating point computation.
+// The V pipeline is modeled by the VCQ, VA, VL, and VS resources.
+multiclass SiFive7ProcResources {
+  let BufferSize = 0 in {
+    def PipeA     : ProcResource<1>;
+    def PipeB     : ProcResource<1>;
+
+    def IDiv      : ProcResource<1>; // Int Division
+    def FDiv      : ProcResource<1>; // FP Division/Sqrt
+
+    def VA      : ProcResource<1>; // Arithmetic sequencer
+
+    def VL        : ProcResource<1>; // Load sequencer
+    def VS        : ProcResource<1>; // Store sequencer
+    // The VCQ accepts instructions from the the A Pipe and holds them until the
+    // vector unit is ready to dequeue them. The unit dequeues up to one instruction
+    // per cycle, in order, as soon as the sequencer for that type of instruction is
+    // available. This resource is meant to be used for 1 cycle by all vector
+    // instructions, to model that only one vector instruction may be dequeued at a
+    // time. The actual dequeueing into the sequencer is modeled by the VA, VL, and
+    // VS sequencer resources below. Each of them will only accept a single
+    // instruction at a time and remain busy for the number of cycles associated
+    // with that instruction.
+    def VCQ       : ProcResource<1>; // Vector Command Queue
+  }
+
+  def PipeAB : ProcResGroup<[!cast<ProcResource>(NAME#"PipeA"),
+                             !cast<ProcResource>(NAME#"PipeB")]>;
+}
+
+multiclass SiFive7WriteResBase<int VLEN,
+    ProcResourceKind PipeA, ProcResourceKind PipeB, ProcResourceKind PipeAB,
+    ProcResourceKind IDiv, ProcResourceKind FDiv,
+    ProcResourceKind VA, ProcResourceKind VL, ProcResourceKind VS,
+    ProcResourceKind VCQ> {
+
+  // Branching
+  let Latency = 3 in {
+    def : WriteRes<WriteJmp, [PipeB]>;
+    def : WriteRes<WriteJal, [PipeB]>;
+    def : WriteRes<WriteJalr, [PipeB]>;
+  }
+
+  //Short forward branch
+  def : WriteRes<WriteSFB, [PipeA, PipeB]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+
+  // Integer arithmetic and logic
+  let Latency = 3 in {
+    def : WriteRes<WriteIALU, [PipeAB]>;
+    def : WriteRes<WriteIALU32, [PipeAB]>;
+    def : WriteRes<WriteShiftImm, [PipeAB]>;
+    def : WriteRes<WriteShiftImm32, [PipeAB]>;
+    def : WriteRes<WriteShiftReg, [PipeAB]>;
+    def : WriteRes<WriteShiftReg32, [PipeAB]>;
+  }
+
+  // Integer multiplication
+  let Latency = 3 in {
+    def : WriteRes<WriteIMul, [PipeB]>;
+    def : WriteRes<WriteIMul32, [PipeB]>;
+  }
+
+  // Integer division
+  def : WriteRes<WriteIDiv, [PipeB, IDiv]> {
+    let Latency = 66;
+    let ReleaseAtCycles = [1, 65];
+  }
+  def : WriteRes<WriteIDiv32,  [PipeB, IDiv]> {
+    let Latency = 34;
+    let ReleaseAtCycles = [1, 33];
+  }
+
+  // Integer remainder
+  def : WriteRes<WriteIRem, [PipeB, IDiv]> {
+    let Latency = 66;
+    let ReleaseAtCycles = [1, 65];
+  }
+  def : WriteRes<WriteIRem32,  [PipeB, IDiv]> {
+    let Latency = 34;
+    let ReleaseAtCycles = [1, 33];
+  }
+
+  // Bitmanip
+  let Latency = 3 in {
+    // Rotates are in the late-B ALU.
+    def : WriteRes<WriteRotateImm, [PipeB]>;
+    def : WriteRes<WriteRotateImm32, [PipeB]>;
+    def : WriteRes<WriteRotateReg, [PipeB]>;
+    def : WriteRes<WriteRotateReg32, [PipeB]>;
+
+    // clz[w]/ctz[w] are in the late-B ALU.
+    def : WriteRes<WriteCLZ, [PipeB]>;
+    def : WriteRes<WriteCLZ32, [PipeB]>;
+    def : WriteRes<WriteCTZ, [PipeB]>;
+    def : WriteRes<WriteCTZ32, [PipeB]>;
+
+    // cpop[w] look exactly like multiply.
+    def : WriteRes<WriteCPOP, [PipeB]>;
+    def : WriteRes<WriteCPOP32, [PipeB]>;
+
+    // orc.b is in the late-B ALU.
+    def : WriteRes<WriteORCB, [PipeB]>;
+
+    // min/max are in the late-B ALU
+    def : WriteRes<WriteIMinMax, [PipeB]>;
+
+    // rev8 is in the late-A and late-B ALUs.
+    def : WriteRes<WriteREV8, [PipeAB]>;
+
+    // shNadd[.uw] is on the early-B and late-B ALUs.
+    def : WriteRes<WriteSHXADD, [PipeB]>;
+    def : WriteRes<WriteSHXADD32, [PipeB]>;
+  }
+
+  // Single-bit instructions
+  // BEXT[I] instruction is available on all ALUs and the other instructions
+  // are only available on the B pipe.
+  let Latency = 3 in {
+    def : WriteRes<WriteSingleBit, [PipeB]>;
+    def : WriteRes<WriteSingleBitImm, [PipeB]>;
+    def : WriteRes<WriteBEXT, [PipeAB]>;
+    def : WriteRes<WriteBEXTI, [PipeAB]>;
+  }
+
+  // Memory
+  def : WriteRes<WriteSTB, [PipeA]>;
+  def : WriteRes<WriteSTH, [PipeA]>;
+  def : WriteRes<WriteSTW, [PipeA]>;
+  def : WriteRes<WriteSTD, [PipeA]>;
+  def : WriteRes<WriteFST16, [PipeA]>;
+  def : WriteRes<WriteFST32, [PipeA]>;
+  def : WriteRes<WriteFST64, [PipeA]>;
+
+  let Latency = 3 in {
+  def : WriteRes<WriteLDB, [PipeA]>;
+  def : WriteRes<WriteLDH, [PipeA]>;
+  def : WriteRes<WriteLDW, [PipeA]>;
+  def : WriteRes<WriteLDD, [PipeA]>;
+  }
+
+  let Latency = 2 in {
+  def : WriteRes<WriteFLD16, [PipeA]>;
+  def : WriteRes<WriteFLD32, [PipeA]>;
+  def : WriteRes<WriteFLD64, [PipeA]>;
+  }
+
+  // Atomic memory
+  def : WriteRes<WriteAtomicSTW, [PipeA]>;
+  def : WriteRes<WriteAtomicSTD, [PipeA]>;
+
+  let Latency = 3 in {
+  def : WriteRes<WriteAtomicW, [PipeA]>;
+  def : WriteRes<WriteAtomicD, [PipeA]>;
+  def : WriteRes<WriteAtomicLDW, [PipeA]>;
+  def : WriteRes<WriteAtomicLDD, [PipeA]>;
+  }
+
+  // Half precision.
+  let Latency = 5 in {
+  def : WriteRes<WriteFAdd16, [PipeB]>;
+  def : WriteRes<WriteFMul16, [PipeB]>;
+  def : WriteRes<WriteFMA16, [PipeB]>;
+  }
+  let Latency = 3 in {
+  def : WriteRes<WriteFSGNJ16, [PipeB]>;
+  def : WriteRes<WriteFMinMax16, [PipeB]>;
+  }
+
+  let Latency = 14, ReleaseAtCycles = [1, 13] in {
+  def :  WriteRes<WriteFDiv16, [PipeB, FDiv]>;
+  def :  WriteRes<WriteFSqrt16, [PipeB, FDiv]>;
+  }
+
+  // Single precision.
+  let Latency = 5 in {
+    def : WriteRes<WriteFAdd32, [PipeB]>;
+    def : WriteRes<WriteFMul32, [PipeB]>;
+    def : WriteRes<WriteFMA32, [PipeB]>;
+  }
+  let Latency = 3 in {
+    def : WriteRes<WriteFSGNJ32, [PipeB]>;
+    def : WriteRes<WriteFMinMax32, [PipeB]>;
+  }
+
+  def : WriteRes<WriteFDiv32, [PipeB, FDiv]> {
+    let Latency = 27;
+    let ReleaseAtCycles = [1, 26];
+  }
+  def : WriteRes<WriteFSqrt32, [PipeB, FDiv]> {
+    let Latency = 27;
+    let ReleaseAtCycles = [1, 26];
+  }
+
+  // Double precision
+  let Latency = 7 in {
+    def : WriteRes<WriteFAdd64, [PipeB]>;
+    def : WriteRes<WriteFMul64, [PipeB]>;
+    def : WriteRes<WriteFMA64, [PipeB]>;
+  }
+  let Latency = 3 in {
+    def : WriteRes<WriteFSGNJ64, [PipeB]>;
+    def : WriteRes<WriteFMinMax64, [PipeB]>;
+  }
+
+  def : WriteRes<WriteFDiv64, [PipeB, FDiv]> {
+    let Latency = 56;
+    let ReleaseAtCycles = [1, 55];
+  }
+  def : WriteRes<WriteFSqrt64, [PipeB, FDiv]> {
+    let Latency = 56;
+    let ReleaseAtCycles = [1, 55];
+  }
+
+  // Conversions
+  let Latency = 3 in {
+  def : WriteRes<WriteFCvtI32ToF16, [PipeB]>;
+  def : WriteRes<WriteFCvtI32ToF32, [PipeB]>;
+  def : WriteRes<WriteFCvtI32ToF64, [PipeB]>;
+  def : WriteRes<WriteFCvtI64ToF16, [PipeB]>;
+  def : WriteRes<WriteFCvtI64ToF32, [PipeB]>;
+  def : WriteRes<WriteFCvtI64ToF64, [PipeB]>;
+  def : WriteRes<WriteFCvtF16ToI32, [PipeB]>;
+  def : WriteRes<WriteFCvtF16ToI64, [PipeB]>;
+  def : WriteRes<WriteFCvtF16ToF32, [PipeB]>;
+  def : WriteRes<WriteFCvtF16ToF64, [PipeB]>;
+  def : WriteRes<WriteFCvtF32ToI32, [PipeB]>;
+  def : WriteRes<WriteFCvtF32ToI64, [PipeB]>;
+  def : WriteRes<WriteFCvtF32ToF16, [PipeB]>;
+  def : WriteRes<WriteFCvtF32ToF64, [PipeB]>;
+  def : WriteRes<WriteFCvtF64ToI32, [PipeB]>;
+  def : WriteRes<WriteFCvtF64ToI64, [PipeB]>;
+  def : WriteRes<WriteFCvtF64ToF16, [PipeB]>;
+  def : WriteRes<WriteFCvtF64ToF32, [PipeB]>;
+
+  def : WriteRes<WriteFClass16, [PipeB]>;
+  def : WriteRes<WriteFClass32, [PipeB]>;
+  def : WriteRes<WriteFClass64, [PipeB]>;
+  def : WriteRes<WriteFCmp16, [PipeB]>;
+  def : WriteRes<WriteFCmp32, [PipeB]>;
+  def : WriteRes<WriteFCmp64, [PipeB]>;
+  def : WriteRes<WriteFMovI16ToF16, [PipeB]>;
+  def : WriteRes<WriteFMovF16ToI16, [PipeB]>;
+  def : WriteRes<WriteFMovI32ToF32, [PipeB]>;
+  def : WriteRes<WriteFMovF32ToI32, [PipeB]>;
+  def : WriteRes<WriteFMovI64ToF64, [PipeB]>;
+  def : WriteRes<WriteFMovF64ToI64, [PipeB]>;
+  }
+
+  // 6. Configuration-Setting Instructions
+  let Latency = 3 in {
+  def : WriteRes<WriteVSETVLI, [PipeA]>;
+  def : WriteRes<WriteVSETIVLI, [PipeA]>;
+  def : WriteRes<WriteVSETVL, [PipeA]>;
+  }
+
+  // 7. Vector Loads and Stores
+  // Unit-stride loads and stores can operate at the full bandwidth of the memory
+  // pipe. The memory pipe is DLEN bits wide on x280.
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVLDE",    [VCQ, VL], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVLDFF",   [VCQ, VL], mx, IsWorstCase>;
+    }
+    let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
+    defm : LMULWriteResMX<"WriteVSTE",    [VCQ, VS], mx, IsWorstCase>;
+  }
+
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetMaskLoadStoreCycles<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
+    defm : LMULWriteResMX<"WriteVLDM",    [VCQ, VL], mx, IsWorstCase>;
+    let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
+    defm : LMULWriteResMX<"WriteVSTM",    [VCQ, VS], mx, IsWorstCase>;
+  }
+
+  // Strided loads and stores operate at one element per cycle and should be
+  // scheduled accordingly. Indexed loads and stores operate at one element per
+  // cycle, and they stall the machine until all addresses have been generated,
+  // so they cannot be scheduled. Indexed and strided loads and stores have LMUL
+  // specific suffixes, but since SEW is already encoded in the name of the
+  // resource, we do not need to use LMULSEWXXX constructors. However, we do
+  // use the SEW from the name to determine the number of Cycles.
+
+  foreach mx = SchedMxList in {
+    defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8, VLEN>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    defm  : LMULWriteResMXVariant<"WriteVLDS8",  VLDSX0Pred, [VCQ, VL],
+                                         4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
+                                         [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
+    let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVLDUX8", [VCQ, VL], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVLDOX8", [VCQ, VL], mx, IsWorstCase>;
+    }
+    let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVSTS8",  [VCQ, VS], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSTUX8", [VCQ, VS], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSTOX8", [VCQ, VS], mx, IsWorstCase>;
+    }
+  }
+  // TODO: The MxLists need to be filtered by EEW. We only need to support
+  // LMUL >= SEW_min/ELEN. Here, the smallest EEW prevents us from having MF8
+  // since LMUL >= 16/64.
+  foreach mx = ["MF4", "MF2", "M1", "M2", "M4", "M8"] in {
+    defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16, VLEN>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    defm  : LMULWriteResMXVariant<"WriteVLDS16",  VLDSX0Pred, [VCQ, VL],
+                                         4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
+                                         [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
+    let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVLDUX16", [VCQ, VL], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVLDOX16", [VCQ, VL], mx, IsWorstCase>;
+    }
+    let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVSTS16",  [VCQ, VS], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSTUX16", [VCQ, VS], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSTOX16", [VCQ, VS], mx, IsWorstCase>;
+    }
+  }
+  foreach mx = ["MF2", "M1", "M2", "M4", "M8"] in {
+    defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32, VLEN>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    defm  : LMULWriteResMXVariant<"WriteVLDS32",  VLDSX0Pred, [VCQ, VL],
+                                         4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
+                                         [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
+    let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVLDUX32", [VCQ, VL], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVLDOX32", [VCQ, VL], mx, IsWorstCase>;
+    }
+    let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVSTS32",  [VCQ, VS], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSTUX32", [VCQ, VS], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSTOX32", [VCQ, VS], mx, IsWorstCase>;
+    }
+  }
+  foreach mx = ["M1", "M2", "M4", "M8"] in {
+    defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64, VLEN>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    defm  : LMULWriteResMXVariant<"WriteVLDS64",  VLDSX0Pred, [VCQ, VL],
+                                         4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
+                                         [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
+    let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVLDUX64", [VCQ, VL], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVLDOX64", [VCQ, VL], mx, IsWorstCase>;
+    }
+    let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVSTS64",  [VCQ, VS], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSTUX64", [VCQ, VS], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSTOX64", [VCQ, VS], mx, IsWorstCase>;
+    }
+  }
+
+  // VLD*R is LMUL aware
+  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 2)] in
+    def : WriteRes<WriteVLD1R,  [VCQ, VL]>;
+  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 4)] in
+    def : WriteRes<WriteVLD2R,  [VCQ, VL]>;
+  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 8)] in
+    def : WriteRes<WriteVLD4R,  [VCQ, VL]>;
+  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 16)] in
+    def : WriteRes<WriteVLD8R,  [VCQ, VL]>;
+  // VST*R is LMUL aware
+  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 2)] in
+    def : WriteRes<WriteVST1R,   [VCQ, VS]>;
+  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 4)] in
+    def : WriteRes<WriteVST2R,   [VCQ, VS]>;
+  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 8)] in
+    def : WriteRes<WriteVST4R,   [VCQ, VS]>;
+  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 16)] in
+    def : WriteRes<WriteVST8R,   [VCQ, VS]>;
+
+  // Segmented Loads and Stores
+  // Unit-stride segmented loads and stores are effectively converted into strided
+  // segment loads and stores. Strided segment loads and stores operate at up to
+  // one segment per cycle if the segment fits within one aligned memory beat.
+  // Indexed segment loads and stores operate at the same rate as strided ones,
+  // but they stall the machine until all addresses have been generated.
+  foreach mx = SchedMxList in {
+    foreach eew = [8, 16, 32, 64] in {
+      defvar Cycles = SiFive7GetCyclesSegmentedSeg2<mx>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+      // Does not chain so set latency high
+      let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+        defm : LMULWriteResMX<"WriteVLSEG2e" # eew,   [VCQ, VL], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVLSEGFF2e" # eew, [VCQ, VL], mx, IsWorstCase>;
+      }
+      let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
+      defm : LMULWriteResMX<"WriteVSSEG2e" # eew,   [VCQ, VS], mx, IsWorstCase>;
+      foreach nf=3-8 in {
+        defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf, VLEN>.c;
+        defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+        // Does not chain so set latency high
+        let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+          defm : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew,   [VCQ, VL], mx, IsWorstCase>;
+          defm : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" # eew, [VCQ, VL], mx, IsWorstCase>;
+        }
+        let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
+        defm : LMULWriteResMX<"WriteVSSEG" # nf # "e" # eew,   [VCQ, VS], mx, IsWorstCase>;
+      }
+    }
+  }
+  foreach mx = SchedMxList in {
+    foreach nf=2-8 in {
+      foreach eew = [8, 16, 32, 64] in {
+        defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf, VLEN>.c;
+        defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+        // Does not chain so set latency high
+        let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+          defm : LMULWriteResMX<"WriteVLSSEG" # nf # "e" # eew,  [VCQ, VL], mx, IsWorstCase>;
+          defm : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" # eew, [VCQ, VL], mx, IsWorstCase>;
+          defm : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" # eew, [VCQ, VL], mx, IsWorstCase>;
+        }
+        let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+          defm : LMULWriteResMX<"WriteVSSSEG" # nf # "e" # eew,  [VCQ, VS], mx, IsWorstCase>;
+          defm : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" # eew, [VCQ, VS], mx, IsWorstCase>;
+          defm : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" # eew, [VCQ, VS], mx, IsWorstCase>;
+        }
+      }
+    }
+  }
+
+  // 11. Vector Integer Arithmetic Instructions
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVIALUV",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIALUX",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIALUI",     [VCQ, VA], mx, IsWorstCase>;
+      // vmadc requires mask
+      defm : LMULWriteResMX<"WriteVICALUV",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUX",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUI",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUMV",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUMX",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUMI",   [VCQ, VA], mx, IsWorstCase>;
+      // min max require merge
+      defm : LMULWriteResMX<"WriteVIMinMaxV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMinMaxX",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMergeV",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMergeX",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMergeI",   [VCQ, VA], mx, IsWorstCase>;
+
+      defm : LMULWriteResMX<"WriteVIMovV",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMovX",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMovI",     [VCQ, VA], mx, IsWorstCase>;
+
+      defm : LMULWriteResMX<"WriteVExtV",      [VCQ, VA], mx, IsWorstCase>;
+    }
+    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVShiftV",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVShiftX",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVShiftI",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMulV",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMulX",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMulAddV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMulAddX",  [VCQ, VA], mx, IsWorstCase>;
+    }
+    // Mask results can't chain.
+    let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVICmpV",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICmpX",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICmpI",     [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+
+  foreach mx = SchedMxList in {
+    foreach sew = SchedSEWSet<mx>.val in {
+      defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor<sew>.c,
+                           !div(SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, 4));
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+      let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+        defm : LMULSEWWriteResMXSEW<"WriteVIDivV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVIDivX", [VCQ, VA], mx, sew, IsWorstCase>;
+      }
+    }
+  }
+
+  // Widening
+  foreach mx = SchedMxListW in {
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
+    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVIWALUV",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWALUX",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWALUI",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWMulV",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWMulX",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWMulAddV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWMulAddX", [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+  // Narrowing
+  foreach mx = SchedMxListW in {
+    defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
+    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVNShiftV",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNShiftX",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNShiftI",   [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+
+  // 12. Vector Fixed-Point Arithmetic Instructions
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVSALUV",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSALUX",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSALUI",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVAALUV",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVAALUX",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSMulV",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSMulX",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSShiftV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSShiftX", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSShiftI", [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+  // Narrowing
+  foreach mx = SchedMxListW in {
+    defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
+    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVNClipV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNClipX",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNClipI",  [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+
+  // 13. Vector Floating-Point Instructions
+  foreach mx = SchedMxListF in {
+    foreach sew = SchedSEWSet<mx, isF=1>.val in {
+      defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+      let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+        defm : LMULSEWWriteResMXSEW<"WriteVFALUV",  [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFALUF",  [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFMulV",  [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFMulF",  [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFRecpV",   [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA], mx, sew, IsWorstCase>;
+      }
+      let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+        defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV",   [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF",   [VCQ, VA], mx, sew, IsWorstCase>;
+        // min max require merge
+        defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA], mx, sew, IsWorstCase>;
+      }
+    }
+  }
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVFCvtFToIV",  [VCQ, VA], mx, IsWorstCase>;
+    }
+    let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVFClassV",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFMergeV",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFMovV",      [VCQ, VA], mx, IsWorstCase>;
+    }
+    // Mask results can't chain.
+    let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      // fcmp requires mask
+      defm : LMULWriteResMX<"WriteVFCmpV",      [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFCmpF",      [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+  foreach mx = SchedMxListF in {
+    foreach sew = SchedSEWSet<mx, isF=1>.val in {
+      defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor<sew>.c,
+                           !div(SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, 4));
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+      let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+        defm : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFDivV",  [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFDivF",  [VCQ, VA], mx, sew, IsWorstCase>;
+      }
+    }
+  }
+
+  // Widening
+  foreach mx = SchedMxListW in {
+    foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
+      defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
+      let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
+      defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA], mx, sew, IsWorstCase>;
+    }
+  }
+  foreach mx = SchedMxListFW in {
+    foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+      defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+      let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+        defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWMulF", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA], mx, sew, IsWorstCase>;
+      }
+      defvar CvtCycles = SiFive7GetCyclesDefault<mx>.c;
+      let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA], mx, sew, IsWorstCase>;
+    }
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c;
+    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
+    defm : LMULWriteResMX<"WriteVFWCvtFToIV", [VCQ, VA], mx, IsWorstCase>;
+  }
+  // Narrowing
+  foreach mx = SchedMxListW in {
+    defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
+    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVFNCvtFToIV", [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+  foreach mx = SchedMxListFW in {
+    foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+      defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+      let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+        defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA], mx, sew, IsWorstCase>;
+      }
+    }
+  }
+
+  // 14. Vector Reduction Operations
+  foreach mx = SchedMxList in {
+    foreach sew = SchedSEWSet<mx>.val in {
+      defvar Cycles = SiFive7GetReductionCycles<mx, sew, VLEN>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+      let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+        defm : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [VCQ, VA],
+                                       mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [VCQ, VA],
+                                       mx, sew, IsWorstCase>;
+      }
+    }
+  }
+
+  foreach mx = SchedMxListWRed in {
+    foreach sew = SchedSEWSet<mx, 0, 1>.val in {
+      defvar Cycles = SiFive7GetReductionCycles<mx, sew, VLEN>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+      let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
+      defm : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [VCQ, VA],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+
+  foreach mx = SchedMxListF in {
+    foreach sew = SchedSEWSet<mx, 1>.val in {
+      defvar RedCycles = SiFive7GetReductionCycles<mx, sew, VLEN>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+      let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in {
+        defm : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [VCQ, VA],
+                                       mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [VCQ, VA],
+                                       mx, sew, IsWorstCase>;
+      }
+      defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, VLEN>.c;
+      let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
+      defm : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [VCQ, VA],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+
+  foreach mx = SchedMxListFWRed in {
+    foreach sew = SchedSEWSet<mx, 1, 1>.val in {
+      defvar RedCycles = SiFive7GetReductionCycles<mx, sew, VLEN>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
+      let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in
+      defm : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [VCQ, VA],
+                                     mx, sew, IsWorstCase>;
+      defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, VLEN>.c;
+      let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
+      defm : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [VCQ, VA],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+
+  // 15. Vector Mask Instructions
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetCyclesVMask<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVMALUV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVMPopV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVMFFSV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVMSFSV", [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVIotaV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIdxV", [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+
+  // 16. Vector Permutation Instructions
+  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 1)] in {
+    def : WriteRes<WriteVMovSX, [VCQ, VA]>;
+    def : WriteRes<WriteVMovXS, [VCQ, VA]>;
+    def : WriteRes<WriteVMovSF, [VCQ, VA]>;
+    def : WriteRes<WriteVMovFS, [VCQ, VA]>;
+  }
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVRGatherVX",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVRGatherVI",    [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+
+  foreach mx = SchedMxList in {
+    foreach sew = SchedSEWSet<mx>.val in {
+      defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+      defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+      let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+        defm : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA], mx, sew, IsWorstCase>;
+      }
+    }
+  }
+
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVSlideUpX",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSlideDownX", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSlideI",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVISlide1X",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFSlide1F",   [VCQ, VA], mx, IsWorstCase>;
+    }
+  }
+
+  // VMov*V is LMUL Aware
+  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 2)] in
+    def : WriteRes<WriteVMov1V,     [VCQ, VA]>;
+  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 4)] in
+    def : WriteRes<WriteVMov2V,     [VCQ, VA]>;
+  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 8)] in
+    def : WriteRes<WriteVMov4V,     [VCQ, VA]>;
+  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 16)] in
+    def : WriteRes<WriteVMov8V,     [VCQ, VA]>;
+
+  // Others
+  def : WriteRes<WriteCSR, [PipeB]>;
+  def : WriteRes<WriteNop, []>;
+  let Latency = 3 in
+    def : WriteRes<WriteRdVLENB, [PipeB]>;
+
+  def : InstRW<[WriteIALU], (instrs COPY)>;
+
+  // VCIX
+  //
+  // In principle we don't know the latency of any VCIX instructions (they
+  // depends on a particular coprocessor implementation). However, the default
+  // latency of 1 can lead to issues [1]. So instead we set the latency to the
+  // default provided by `SiFive7GetCyclesDefault`. This is still not accurate
+  // and can lead to suboptimal codegen, but should hopefully be a better
+  // starting point.
+  //
+  // [1] https://github.com/llvm/llvm-project/issues/83391
+  foreach mx = SchedMxList in {
+    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = Cycles,
+        AcquireAtCycles = [0, 1],
+        ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defm : LMULWriteResMX<"WriteVC_V_I",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_X",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_IV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_VV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_XV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_IVV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_IVW", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_VVV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_VVW", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_XVV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_XVW", [VCQ, VA], mx, IsWorstCase>;
+      foreach f = ["FPR16", "FPR32", "FPR64"] in {
+        defm : LMULWriteResMX<"WriteVC_V_" # f # "V",  [VCQ, VA], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_V_" # f # "VV", [VCQ, VA], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_V_" # f # "VW", [VCQ, VA], mx, IsWorstCase>;
+      }
+      defm : LMULWriteResMX<"WriteVC_I",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_X",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_IV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_VV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_XV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_IVV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_IVW", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_VVV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_VVW", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_XVV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_XVW", [VCQ, VA], mx, IsWorstCase>;
+      foreach f = ["FPR16", "FPR32", "FPR64"] in {
+        defm : LMULWriteResMX<"WriteVC_" # f # "V",  [VCQ, VA], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_" # f # "VV", [VCQ, VA], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_" # f # "VW", [VCQ, VA], mx, IsWorstCase>;
+      }
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+
+multiclass SiFive7ReadAdvance {
+  // Bypass and advance
+  def : SiFive7AnyToGPRBypass<ReadJmp>;
+  def : SiFive7AnyToGPRBypass<ReadJalr>;
+  def : ReadAdvance<ReadCSR, 0>;
+  def : SiFive7AnyToGPRBypass<ReadStoreData>;
+  def : ReadAdvance<ReadMemBase, 0>;
+  def : SiFive7AnyToGPRBypass<ReadIALU>;
+  def : SiFive7AnyToGPRBypass<ReadIALU32>;
+  def : SiFive7AnyToGPRBypass<ReadShiftImm>;
+  def : SiFive7AnyToGPRBypass<ReadShiftImm32>;
+  def : SiFive7AnyToGPRBypass<ReadShiftReg>;
+  def : SiFive7AnyToGPRBypass<ReadShiftReg32>;
+  def : ReadAdvance<ReadIDiv, 0>;
+  def : ReadAdvance<ReadIDiv32, 0>;
+  def : ReadAdvance<ReadIRem, 0>;
+  def : ReadAdvance<ReadIRem32, 0>;
+  def : ReadAdvance<ReadIMul, 0>;
+  def : ReadAdvance<ReadIMul32, 0>;
+  def : ReadAdvance<ReadAtomicWA, 0>;
+  def : ReadAdvance<ReadAtomicWD, 0>;
+  def : ReadAdvance<ReadAtomicDA, 0>;
+  def : ReadAdvance<ReadAtomicDD, 0>;
+  def : ReadAdvance<ReadAtomicLDW, 0>;
+  def : ReadAdvance<ReadAtomicLDD, 0>;
+  def : ReadAdvance<ReadAtomicSTW, 0>;
+  def : ReadAdvance<ReadAtomicSTD, 0>;
+  def : ReadAdvance<ReadFStoreData, 0>;
+  def : ReadAdvance<ReadFMemBase, 0>;
+  def : ReadAdvance<ReadFAdd16, 0>;
+  def : ReadAdvance<ReadFAdd32, 0>;
+  def : ReadAdvance<ReadFAdd64, 0>;
+  def : ReadAdvance<ReadFMul16, 0>;
+  def : ReadAdvance<ReadFMA16, 0>;
+  def : ReadAdvance<ReadFMA16Addend, 0>;
+  def : ReadAdvance<ReadFMul32, 0>;
+  def : ReadAdvance<ReadFMul64, 0>;
+  def : ReadAdvance<ReadFMA32, 0>;
+  def : ReadAdvance<ReadFMA32Addend, 0>;
+  def : ReadAdvance<ReadFMA64, 0>;
+  def : ReadAdvance<ReadFMA64Addend, 0>;
+  def : ReadAdvance<ReadFDiv16, 0>;
+  def : ReadAdvance<ReadFDiv32, 0>;
+  def : ReadAdvance<ReadFDiv64, 0>;
+  def : ReadAdvance<ReadFSqrt16, 0>;
+  def : ReadAdvance<ReadFSqrt32, 0>;
+  def : ReadAdvance<ReadFSqrt64, 0>;
+  def : ReadAdvance<ReadFCmp16, 0>;
+  def : ReadAdvance<ReadFCmp32, 0>;
+  def : ReadAdvance<ReadFCmp64, 0>;
+  def : ReadAdvance<ReadFSGNJ16, 0>;
+  def : ReadAdvance<ReadFSGNJ32, 0>;
+  def : ReadAdvance<ReadFSGNJ64, 0>;
+  def : ReadAdvance<ReadFMinMax16, 0>;
+  def : ReadAdvance<ReadFMinMax32, 0>;
+  def : ReadAdvance<ReadFMinMax64, 0>;
+  def : ReadAdvance<ReadFCvtF16ToI32, 0>;
+  def : ReadAdvance<ReadFCvtF16ToI64, 0>;
+  def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+  def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+  def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+  def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+  def : ReadAdvance<ReadFCvtI32ToF16, 0>;
+  def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+  def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+  def : ReadAdvance<ReadFCvtI64ToF16, 0>;
+  def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+  def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+  def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+  def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+  def : ReadAdvance<ReadFCvtF16ToF32, 0>;
+  def : ReadAdvance<ReadFCvtF32ToF16, 0>;
+  def : ReadAdvance<ReadFCvtF16ToF64, 0>;
+  def : ReadAdvance<ReadFCvtF64ToF16, 0>;
+  def : ReadAdvance<ReadFMovF16ToI16, 0>;
+  def : ReadAdvance<ReadFMovI16ToF16, 0>;
+  def : ReadAdvance<ReadFMovF32ToI32, 0>;
+  def : ReadAdvance<ReadFMovI32ToF32, 0>;
+  def : ReadAdvance<ReadFMovF64ToI64, 0>;
+  def : ReadAdvance<ReadFMovI64ToF64, 0>;
+  def : ReadAdvance<ReadFClass16, 0>;
+  def : ReadAdvance<ReadFClass32, 0>;
+  def : ReadAdvance<ReadFClass64, 0>;
+
+  def : SiFive7AnyToGPRBypass<ReadSFBJmp, 0>;
+  def : SiFive7AnyToGPRBypass<ReadSFBALU, 0>;
+
+  // Bitmanip
+  def : SiFive7AnyToGPRBypass<ReadRotateImm>;
+  def : SiFive7AnyToGPRBypass<ReadRotateImm32>;
+  def : SiFive7AnyToGPRBypass<ReadRotateReg>;
+  def : SiFive7AnyToGPRBypass<ReadRotateReg32>;
+  def : SiFive7AnyToGPRBypass<ReadCLZ>;
+  def : SiFive7AnyToGPRBypass<ReadCLZ32>;
+  def : SiFive7AnyToGPRBypass<ReadCTZ>;
+  def : SiFive7AnyToGPRBypass<ReadCTZ32>;
+  def : ReadAdvance<ReadCPOP, 0>;
+  def : ReadAdvance<ReadCPOP32, 0>;
+  def : SiFive7AnyToGPRBypass<ReadORCB>;
+  def : SiFive7AnyToGPRBypass<ReadIMinMax>;
+  def : SiFive7AnyToGPRBypass<ReadREV8>;
+  def : SiFive7AnyToGPRBypass<ReadSHXADD>;
+  def : SiFive7AnyToGPRBypass<ReadSHXADD32>;
+  // Single-bit instructions
+  def : SiFive7AnyToGPRBypass<ReadSingleBit>;
+  def : SiFive7AnyToGPRBypass<ReadSingleBitImm>;
+
+  // 6. Configuration-Setting Instructions
+  def : ReadAdvance<ReadVSETVLI, 2>;
+  def : ReadAdvance<ReadVSETVL, 2>;
+
+  // 7. Vector Loads and Stores
+  def : ReadAdvance<ReadVLDX, 0>;
+  def : ReadAdvance<ReadVSTX, 0>;
+  defm : LMULReadAdvance<"ReadVSTEV", 0>;
+  defm : LMULReadAdvance<"ReadVSTM", 0>;
+  def : ReadAdvance<ReadVLDSX, 0>;
+  def : ReadAdvance<ReadVSTSX, 0>;
+  defm : LMULReadAdvance<"ReadVSTS8V", 0>;
+  defm : LMULReadAdvance<"ReadVSTS16V", 0>;
+  defm : LMULReadAdvance<"ReadVSTS32V", 0>;
+  defm : LMULReadAdvance<"ReadVSTS64V", 0>;
+  defm : LMULReadAdvance<"ReadVLDUXV", 0>;
+  defm : LMULReadAdvance<"ReadVLDOXV", 0>;
+  defm : LMULReadAdvance<"ReadVSTUX8", 0>;
+  defm : LMULReadAdvance<"ReadVSTUX16", 0>;
+  defm : LMULReadAdvance<"ReadVSTUX32", 0>;
+  defm : LMULReadAdvance<"ReadVSTUX64", 0>;
+  defm : LMULReadAdvance<"ReadVSTUXV", 0>;
+  defm : LMULReadAdvance<"ReadVSTUX8V", 0>;
+  defm : LMULReadAdvance<"ReadVSTUX16V", 0>;
+  defm : LMULReadAdvance<"ReadVSTUX32V", 0>;
+  defm : LMULReadAdvance<"ReadVSTUX64V", 0>;
+  defm : LMULReadAdvance<"ReadVSTOX8", 0>;
+  defm : LMULReadAdvance<"ReadVSTOX16", 0>;
+  defm : LMULReadAdvance<"ReadVSTOX32", 0>;
+  defm : LMULReadAdvance<"ReadVSTOX64", 0>;
+  defm : LMULReadAdvance<"ReadVSTOXV", 0>;
+  defm : LMULReadAdvance<"ReadVSTOX8V", 0>;
+  defm : LMULReadAdvance<"ReadVSTOX16V", 0>;
+  defm : LMULReadAdvance<"ReadVSTOX32V", 0>;
+  defm : LMULReadAdvance<"ReadVSTOX64V", 0>;
+  // LMUL Aware
+  def : ReadAdvance<ReadVST1R, 0>;
+  def : ReadAdvance<ReadVST2R, 0>;
+  def : ReadAdvance<ReadVST4R, 0>;
+  def : ReadAdvance<ReadVST8R, 0>;
+
+  // 12. Vector Integer Arithmetic Instructions
+  defm : LMULReadAdvance<"ReadVIALUV", 0>;
+  defm : LMULReadAdvance<"ReadVIALUX", 0>;
+  defm : LMULReadAdvanceW<"ReadVIWALUV", 0>;
+  defm : LMULReadAdvanceW<"ReadVIWALUX", 0>;
+  defm : LMULReadAdvance<"ReadVExtV", 0>;
+  defm : LMULReadAdvance<"ReadVICALUV", 0>;
+  defm : LMULReadAdvance<"ReadVICALUX", 0>;
+  defm : LMULReadAdvance<"ReadVShiftV", 0>;
+  defm : LMULReadAdvance<"ReadVShiftX", 0>;
+  defm : LMULReadAdvanceW<"ReadVNShiftV", 0>;
+  defm : LMULReadAdvanceW<"ReadVNShiftX", 0>;
+  defm : LMULReadAdvance<"ReadVICmpV", 0>;
+  defm : LMULReadAdvance<"ReadVICmpX", 0>;
+  defm : LMULReadAdvance<"ReadVIMinMaxV", 0>;
+  defm : LMULReadAdvance<"ReadVIMinMaxX", 0>;
+  defm : LMULReadAdvance<"ReadVIMulV", 0>;
+  defm : LMULReadAdvance<"ReadVIMulX", 0>;
+  defm : LMULSEWReadAdvance<"ReadVIDivV", 0>;
+  defm : LMULSEWReadAdvance<"ReadVIDivX", 0>;
+  defm : LMULReadAdvanceW<"ReadVIWMulV", 0>;
+  defm : LMULReadAdvanceW<"ReadVIWMulX", 0>;
+  defm : LMULReadAdvance<"ReadVIMulAddV", 0>;
+  defm : LMULReadAdvance<"ReadVIMulAddX", 0>;
+  defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>;
+  defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>;
+  defm : LMULReadAdvance<"ReadVIMergeV", 0>;
+  defm : LMULReadAdvance<"ReadVIMergeX", 0>;
+  defm : LMULReadAdvance<"ReadVIMovV", 0>;
+  defm : LMULReadAdvance<"ReadVIMovX", 0>;
+
+  // 13. Vector Fixed-Point Arithmetic Instructions
+  defm : LMULReadAdvance<"ReadVSALUV", 0>;
+  defm : LMULReadAdvance<"ReadVSALUX", 0>;
+  defm : LMULReadAdvance<"ReadVAALUV", 0>;
+  defm : LMULReadAdvance<"ReadVAALUX", 0>;
+  defm : LMULReadAdvance<"ReadVSMulV", 0>;
+  defm : LMULReadAdvance<"ReadVSMulX", 0>;
+  defm : LMULReadAdvance<"ReadVSShiftV", 0>;
+  defm : LMULReadAdvance<"ReadVSShiftX", 0>;
+  defm : LMULReadAdvanceW<"ReadVNClipV", 0>;
+  defm : LMULReadAdvanceW<"ReadVNClipX", 0>;
+
+  // 14. Vector Floating-Point Instructions
+  defm : LMULSEWReadAdvanceF<"ReadVFALUV", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFALUF", 0>;
+  defm : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>;
+  defm : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFMulV", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFMulF", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFDivV", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFDivF", 0>;
+  defm : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>;
+  defm : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>;
+  defm : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>;
+  defm : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>;
+  defm : LMULReadAdvance<"ReadVFCmpV", 0>;
+  defm : LMULReadAdvance<"ReadVFCmpF", 0>;
+  defm : LMULReadAdvance<"ReadVFClassV", 0>;
+  defm : LMULReadAdvance<"ReadVFMergeV", 0>;
+  defm : LMULReadAdvance<"ReadVFMergeF", 0>;
+  defm : LMULReadAdvance<"ReadVFMovF", 0>;
+  defm : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>;
+  defm : LMULReadAdvance<"ReadVFCvtFToIV", 0>;
+  defm : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>;
+  defm : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>;
+  defm : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>;
+  defm : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>;
+  defm : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>;
+  defm : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>;
+
+  // 15. Vector Reduction Operations
+  def : ReadAdvance<ReadVIRedV, 0>;
+  def : ReadAdvance<ReadVIRedV0, 0>;
+  def : ReadAdvance<ReadVIWRedV, 0>;
+  def : ReadAdvance<ReadVIWRedV0, 0>;
+  def : ReadAdvance<ReadVFRedV, 0>;
+  def : ReadAdvance<ReadVFRedV0, 0>;
+  def : ReadAdvance<ReadVFRedOV, 0>;
+  def : ReadAdvance<ReadVFRedOV0, 0>;
+  def : ReadAdvance<ReadVFWRedV, 0>;
+  def : ReadAdvance<ReadVFWRedV0, 0>;
+  def : ReadAdvance<ReadVFWRedOV, 0>;
+  def : ReadAdvance<ReadVFWRedOV0, 0>;
+
+  // 16. Vector Mask Instructions
+  defm : LMULReadAdvance<"ReadVMALUV", 0>;
+  defm : LMULReadAdvance<"ReadVMPopV", 0>;
+  defm : LMULReadAdvance<"ReadVMFFSV", 0>;
+  defm : LMULReadAdvance<"ReadVMSFSV", 0>;
+  defm : LMULReadAdvance<"ReadVIotaV", 0>;
+
+  // 17. Vector Permutation Instructions
+  def : ReadAdvance<ReadVMovXS, 0>;
+  def : ReadAdvance<ReadVMovSX_V, 0>;
+  def : ReadAdvance<ReadVMovSX_X, 0>;
+  def : ReadAdvance<ReadVMovFS, 0>;
+  def : ReadAdvance<ReadVMovSF_V, 0>;
+  def : ReadAdvance<ReadVMovSF_F, 0>;
+  defm : LMULReadAdvance<"ReadVISlideV", 0>;
+  defm : LMULReadAdvance<"ReadVISlideX", 0>;
+  defm : LMULReadAdvance<"ReadVFSlideV", 0>;
+  defm : LMULReadAdvance<"ReadVFSlideF", 0>;
+  defm : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
+  defm : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+  defm : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+  defm : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
+  defm : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
+  defm : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
+  defm : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
+  defm : LMULSEWReadAdvance<"ReadVCompressV", 0>;
+  // LMUL Aware
+  def : ReadAdvance<ReadVMov1V, 0>;
+  def : ReadAdvance<ReadVMov2V, 0>;
+  def : ReadAdvance<ReadVMov4V, 0>;
+  def : ReadAdvance<ReadVMov8V, 0>;
+
+  // Others
+  def : ReadAdvance<ReadVMask, 0>;
+  def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
+  foreach mx = SchedMxList in {
+    def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
+    foreach sew = SchedSEWSet<mx>.val in
+      def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx  # "_E" # sew), 0>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+
+/// This multiclass is a "bundle" of (1) processor resources (i.e. pipes) and
+/// (2) WriteRes entries. It's parameterized by config values that will
+/// eventually be supplied by different SchedMachineModels.
+multiclass SiFive7SchedResources<int vlen> {
+  defm SiFive7 : SiFive7ProcResources;
+
+  // Pull out defs from SiFive7ProcResources so we can refer to them by name.
+  defvar SiFive7PipeA = !cast<ProcResource>(NAME # SiFive7PipeA);
+  defvar SiFive7PipeB = !cast<ProcResource>(NAME # SiFive7PipeB);
+  defvar SiFive7PipeAB = !cast<ProcResGroup>(NAME # SiFive7PipeAB);
+  defvar SiFive7IDiv = !cast<ProcResource>(NAME # SiFive7IDiv);
+  defvar SiFive7FDiv = !cast<ProcResource>(NAME # SiFive7FDiv);
+  defvar SiFive7VA = !cast<ProcResource>(NAME # SiFive7VA);
+  defvar SiFive7VL = !cast<ProcResource>(NAME # SiFive7VL);
+  defvar SiFive7VS = !cast<ProcResource>(NAME # SiFive7VS);
+  defvar SiFive7VCQ = !cast<ProcResource>(NAME # SiFive7VCQ);
+
+  // Define WriteRes records that are the same across all SiFive7 derived
+  // SchedModels.
+  defm SiFive7
+      : SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB,
+                            SiFive7IDiv, SiFive7FDiv,
+                            SiFive7VA, SiFive7VL, SiFive7VS,
+                            SiFive7VCQ>;
+
+  //===----------------------------------------------------------------------===//
+  // Bypass and advance
+
+  defm SiFive7 : SiFive7ReadAdvance;
+  //===----------------------------------------------------------------------===//
+  // Unsupported extensions
+  defm : UnsupportedSchedQ;
+  defm : UnsupportedSchedZabha;
+  defm : UnsupportedSchedZbc;
+  defm : UnsupportedSchedZbkb;
+  defm : UnsupportedSchedZbkx;
+  defm : UnsupportedSchedZfa;
+  defm : UnsupportedSchedZvk;
+}
+
+class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
   let MicroOpBufferSize = 0; // Explicitly set to zero since SiFive7 is in-order.
   let IssueWidth = 2;        // 2 micro-ops are dispatched per cycle.
   let LoadLatency = 3;
@@ -198,1109 +1356,18 @@ def SiFive7Model : SchedMachineModel {
                              HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne,
                              HasStdExtZknh, HasStdExtZksed, HasStdExtZksh,
                              HasStdExtZkr];
+  int VLEN = vlen;
+
+  string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
 }
 
-// The SiFive7 microarchitecture has three pipelines: A, B, V.
-// Pipe A can handle memory, integer alu and vector operations.
-// Pipe B can handle integer alu, control flow, integer multiply and divide,
-// and floating point computation.
-// The V pipeline is modeled by the VCQ, VA, VL, and VS resources.
-let SchedModel = SiFive7Model in {
-let BufferSize = 0 in {
-def SiFive7PipeA       : ProcResource<1>;
-def SiFive7PipeB       : ProcResource<1>;
-def SiFive7IDiv        : ProcResource<1>; // Int Division
-def SiFive7FDiv        : ProcResource<1>; // FP Division/Sqrt
-def SiFive7VA          : ProcResource<1>; // Arithmetic sequencer
-def SiFive7VL          : ProcResource<1>; // Load sequencer
-def SiFive7VS          : ProcResource<1>; // Store sequencer
-// The VCQ accepts instructions from the the A Pipe and holds them until the
-// vector unit is ready to dequeue them. The unit dequeues up to one instruction
-// per cycle, in order, as soon as the sequencer for that type of instruction is
-// available. This resource is meant to be used for 1 cycle by all vector
-// instructions, to model that only one vector instruction may be dequeued at a
-// time. The actual dequeueing into the sequencer is modeled by the VA, VL, and
-// VS sequencer resources below. Each of them will only accept a single
-// instruction at a time and remain busy for the number of cycles associated
-// with that instruction.
-def SiFive7VCQ         : ProcResource<1>; // Vector Command Queue
-}
+/// Models
+def SiFive7VLEN512Model : SiFive7SchedMachineModel<512>;
 
-def SiFive7PipeAB : ProcResGroup<[SiFive7PipeA, SiFive7PipeB]>;
+/// Binding models to their scheduling resources.
+let SchedModel = SiFive7VLEN512Model in
+defm !cast<string>(SiFive7VLEN512Model.Name)
+    : SiFive7SchedResources<SiFive7VLEN512Model.VLEN>;
 
-defvar SiFive7VLEN = 512;
-
-// Branching
-let Latency = 3 in {
-def : WriteRes<WriteJmp, [SiFive7PipeB]>;
-def : WriteRes<WriteJal, [SiFive7PipeB]>;
-def : WriteRes<WriteJalr, [SiFive7PipeB]>;
-}
-
-//Short forward branch
-def : WriteRes<WriteSFB, [SiFive7PipeA, SiFive7PipeB]> {
-  let Latency = 3;
-  let NumMicroOps = 2;
-}
-
-// Integer arithmetic and logic
-let Latency = 3 in {
-def : WriteRes<WriteIALU, [SiFive7PipeAB]>;
-def : WriteRes<WriteIALU32, [SiFive7PipeAB]>;
-def : WriteRes<WriteShiftImm, [SiFive7PipeAB]>;
-def : WriteRes<WriteShiftImm32, [SiFive7PipeAB]>;
-def : WriteRes<WriteShiftReg, [SiFive7PipeAB]>;
-def : WriteRes<WriteShiftReg32, [SiFive7PipeAB]>;
-}
-
-// Integer multiplication
-let Latency = 3 in {
-def : WriteRes<WriteIMul, [SiFive7PipeB]>;
-def : WriteRes<WriteIMul32, [SiFive7PipeB]>;
-}
-
-// Integer division
-def : WriteRes<WriteIDiv, [SiFive7PipeB, SiFive7IDiv]> {
-  let Latency = 66;
-  let ReleaseAtCycles = [1, 65];
-}
-def : WriteRes<WriteIDiv32,  [SiFive7PipeB, SiFive7IDiv]> {
-  let Latency = 34;
-  let ReleaseAtCycles = [1, 33];
-}
-
-// Integer remainder
-def : WriteRes<WriteIRem, [SiFive7PipeB, SiFive7IDiv]> {
-  let Latency = 66;
-  let ReleaseAtCycles = [1, 65];
-}
-def : WriteRes<WriteIRem32,  [SiFive7PipeB, SiFive7IDiv]> {
-  let Latency = 34;
-  let ReleaseAtCycles = [1, 33];
-}
-
-// Bitmanip
-let Latency = 3 in {
-// Rotates are in the late-B ALU.
-def : WriteRes<WriteRotateImm, [SiFive7PipeB]>;
-def : WriteRes<WriteRotateImm32, [SiFive7PipeB]>;
-def : WriteRes<WriteRotateReg, [SiFive7PipeB]>;
-def : WriteRes<WriteRotateReg32, [SiFive7PipeB]>;
-
-// clz[w]/ctz[w] are in the late-B ALU.
-def : WriteRes<WriteCLZ, [SiFive7PipeB]>;
-def : WriteRes<WriteCLZ32, [SiFive7PipeB]>;
-def : WriteRes<WriteCTZ, [SiFive7PipeB]>;
-def : WriteRes<WriteCTZ32, [SiFive7PipeB]>;
-
-// cpop[w] look exactly like multiply.
-def : WriteRes<WriteCPOP, [SiFive7PipeB]>;
-def : WriteRes<WriteCPOP32, [SiFive7PipeB]>;
-
-// orc.b is in the late-B ALU.
-def : WriteRes<WriteORCB, [SiFive7PipeB]>;
-
-// min/max are in the late-B ALU
-def : WriteRes<WriteIMinMax, [SiFive7PipeB]>;
-
-// rev8 is in the late-A and late-B ALUs.
-def : WriteRes<WriteREV8, [SiFive7PipeAB]>;
-
-// shNadd[.uw] is on the early-B and late-B ALUs.
-def : WriteRes<WriteSHXADD, [SiFive7PipeB]>;
-def : WriteRes<WriteSHXADD32, [SiFive7PipeB]>;
-}
-
-// Single-bit instructions
-// BEXT[I] instruction is available on all ALUs and the other instructions
-// are only available on the SiFive7B pipe.
-let Latency = 3 in {
-def : WriteRes<WriteSingleBit, [SiFive7PipeB]>;
-def : WriteRes<WriteSingleBitImm, [SiFive7PipeB]>;
-def : WriteRes<WriteBEXT, [SiFive7PipeAB]>;
-def : WriteRes<WriteBEXTI, [SiFive7PipeAB]>;
-}
-
-// Memory
-def : WriteRes<WriteSTB, [SiFive7PipeA]>;
-def : WriteRes<WriteSTH, [SiFive7PipeA]>;
-def : WriteRes<WriteSTW, [SiFive7PipeA]>;
-def : WriteRes<WriteSTD, [SiFive7PipeA]>;
-def : WriteRes<WriteFST16, [SiFive7PipeA]>;
-def : WriteRes<WriteFST32, [SiFive7PipeA]>;
-def : WriteRes<WriteFST64, [SiFive7PipeA]>;
-
-let Latency = 3 in {
-def : WriteRes<WriteLDB, [SiFive7PipeA]>;
-def : WriteRes<WriteLDH, [SiFive7PipeA]>;
-def : WriteRes<WriteLDW, [SiFive7PipeA]>;
-def : WriteRes<WriteLDD, [SiFive7PipeA]>;
-}
-
-let Latency = 2 in {
-def : WriteRes<WriteFLD16, [SiFive7PipeA]>;
-def : WriteRes<WriteFLD32, [SiFive7PipeA]>;
-def : WriteRes<WriteFLD64, [SiFive7PipeA]>;
-}
-
-// Atomic memory
-def : WriteRes<WriteAtomicSTW, [SiFive7PipeA]>;
-def : WriteRes<WriteAtomicSTD, [SiFive7PipeA]>;
-
-let Latency = 3 in {
-def : WriteRes<WriteAtomicW, [SiFive7PipeA]>;
-def : WriteRes<WriteAtomicD, [SiFive7PipeA]>;
-def : WriteRes<WriteAtomicLDW, [SiFive7PipeA]>;
-def : WriteRes<WriteAtomicLDD, [SiFive7PipeA]>;
-}
-
-// Half precision.
-let Latency = 5 in {
-def : WriteRes<WriteFAdd16, [SiFive7PipeB]>;
-def : WriteRes<WriteFMul16, [SiFive7PipeB]>;
-def : WriteRes<WriteFMA16, [SiFive7PipeB]>;
-}
-let Latency = 3 in {
-def : WriteRes<WriteFSGNJ16, [SiFive7PipeB]>;
-def : WriteRes<WriteFMinMax16, [SiFive7PipeB]>;
-}
-
-let Latency = 14, ReleaseAtCycles = [1, 13] in {
-def :  WriteRes<WriteFDiv16, [SiFive7PipeB, SiFive7FDiv]>;
-def :  WriteRes<WriteFSqrt16, [SiFive7PipeB, SiFive7FDiv]>;
-}
-
-// Single precision.
-let Latency = 5 in {
-def : WriteRes<WriteFAdd32, [SiFive7PipeB]>;
-def : WriteRes<WriteFMul32, [SiFive7PipeB]>;
-def : WriteRes<WriteFMA32, [SiFive7PipeB]>;
-}
-let Latency = 3 in {
-def : WriteRes<WriteFSGNJ32, [SiFive7PipeB]>;
-def : WriteRes<WriteFMinMax32, [SiFive7PipeB]>;
-}
-
-def : WriteRes<WriteFDiv32, [SiFive7PipeB, SiFive7FDiv]> { let Latency = 27;
-                                                         let ReleaseAtCycles = [1, 26]; }
-def : WriteRes<WriteFSqrt32, [SiFive7PipeB, SiFive7FDiv]> { let Latency = 27;
-                                                          let ReleaseAtCycles = [1, 26]; }
-
-// Double precision
-let Latency = 7 in {
-def : WriteRes<WriteFAdd64, [SiFive7PipeB]>;
-def : WriteRes<WriteFMul64, [SiFive7PipeB]>;
-def : WriteRes<WriteFMA64, [SiFive7PipeB]>;
-}
-let Latency = 3 in {
-def : WriteRes<WriteFSGNJ64, [SiFive7PipeB]>;
-def : WriteRes<WriteFMinMax64, [SiFive7PipeB]>;
-}
-
-def : WriteRes<WriteFDiv64, [SiFive7PipeB, SiFive7FDiv]> { let Latency = 56;
-                                                         let ReleaseAtCycles = [1, 55]; }
-def : WriteRes<WriteFSqrt64, [SiFive7PipeB, SiFive7FDiv]> { let Latency = 56;
-                                                          let ReleaseAtCycles = [1, 55]; }
-
-// Conversions
-let Latency = 3 in {
-def : WriteRes<WriteFCvtI32ToF16, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtI32ToF32, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtI32ToF64, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtI64ToF16, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtI64ToF32, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtI64ToF64, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF16ToI32, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF16ToI64, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF16ToF32, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF16ToF64, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF32ToI32, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF32ToI64, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF32ToF16, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF32ToF64, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF64ToI32, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF64ToI64, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF64ToF16, [SiFive7PipeB]>;
-def : WriteRes<WriteFCvtF64ToF32, [SiFive7PipeB]>;
-
-def : WriteRes<WriteFClass16, [SiFive7PipeB]>;
-def : WriteRes<WriteFClass32, [SiFive7PipeB]>;
-def : WriteRes<WriteFClass64, [SiFive7PipeB]>;
-def : WriteRes<WriteFCmp16, [SiFive7PipeB]>;
-def : WriteRes<WriteFCmp32, [SiFive7PipeB]>;
-def : WriteRes<WriteFCmp64, [SiFive7PipeB]>;
-def : WriteRes<WriteFMovI16ToF16, [SiFive7PipeB]>;
-def : WriteRes<WriteFMovF16ToI16, [SiFive7PipeB]>;
-def : WriteRes<WriteFMovI32ToF32, [SiFive7PipeB]>;
-def : WriteRes<WriteFMovF32ToI32, [SiFive7PipeB]>;
-def : WriteRes<WriteFMovI64ToF64, [SiFive7PipeB]>;
-def : WriteRes<WriteFMovF64ToI64, [SiFive7PipeB]>;
-}
-
-// 6. Configuration-Setting Instructions
-let Latency = 3 in {
-def : WriteRes<WriteVSETVLI, [SiFive7PipeA]>;
-def : WriteRes<WriteVSETIVLI, [SiFive7PipeA]>;
-def : WriteRes<WriteVSETVL, [SiFive7PipeA]>;
-}
-
-// 7. Vector Loads and Stores
-// Unit-stride loads and stores can operate at the full bandwidth of the memory
-// pipe. The memory pipe is DLEN bits wide on x280.
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVLDE",    [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDFF",   [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-  }
-  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-  defm "" : LMULWriteResMX<"WriteVSTE",    [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-}
-
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetMaskLoadStoreCycles<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-  defm "" : LMULWriteResMX<"WriteVLDM",    [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-  defm "" : LMULWriteResMX<"WriteVSTM",    [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-}
-
-// Strided loads and stores operate at one element per cycle and should be
-// scheduled accordingly. Indexed loads and stores operate at one element per
-// cycle, and they stall the machine until all addresses have been generated,
-// so they cannot be scheduled. Indexed and strided loads and stores have LMUL
-// specific suffixes, but since SEW is already encoded in the name of the
-// resource, we do not need to use LMULSEWXXX constructors. However, we do
-// use the SEW from the name to determine the number of Cycles.
-
-foreach mx = SchedMxList in {
-  defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8, SiFive7VLEN>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS8",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
-                                       4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
-                                       [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
-  let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-  }
-  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVSTS8",  [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-  }
-}
-// TODO: The MxLists need to be filtered by EEW. We only need to support
-// LMUL >= SEW_min/ELEN. Here, the smallest EEW prevents us from having MF8
-// since LMUL >= 16/64.
-foreach mx = ["MF4", "MF2", "M1", "M2", "M4", "M8"] in {
-  defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16, SiFive7VLEN>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS16",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
-                                       4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
-                                       [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
-  let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-  }
-  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVSTS16",  [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-  }
-}
-foreach mx = ["MF2", "M1", "M2", "M4", "M8"] in {
-  defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32, SiFive7VLEN>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS32",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
-                                       4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
-                                       [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
-  let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-  }
-  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVSTS32",  [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-  }
-}
-foreach mx = ["M1", "M2", "M4", "M8"] in {
-  defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64, SiFive7VLEN>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS64",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
-                                       4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
-                                       [0, 1], [1, !add(1, Cycles)], mx, IsWorstCase>;
-  let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-  }
-  let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVSTS64",  [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-  }
-}
-
-// VLD*R is LMUL aware
-let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 2)] in
-  def : WriteRes<WriteVLD1R,  [SiFive7VCQ, SiFive7VL]>;
-let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 4)] in
-  def : WriteRes<WriteVLD2R,  [SiFive7VCQ, SiFive7VL]>;
-let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 8)] in
-  def : WriteRes<WriteVLD4R,  [SiFive7VCQ, SiFive7VL]>;
-let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 16)] in
-  def : WriteRes<WriteVLD8R,  [SiFive7VCQ, SiFive7VL]>;
-// VST*R is LMUL aware
-let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 2)] in
-  def : WriteRes<WriteVST1R,   [SiFive7VCQ, SiFive7VS]>;
-let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 4)] in
-  def : WriteRes<WriteVST2R,   [SiFive7VCQ, SiFive7VS]>;
-let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 8)] in
-  def : WriteRes<WriteVST4R,   [SiFive7VCQ, SiFive7VS]>;
-let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 16)] in
-  def : WriteRes<WriteVST8R,   [SiFive7VCQ, SiFive7VS]>;
-
-// Segmented Loads and Stores
-// Unit-stride segmented loads and stores are effectively converted into strided
-// segment loads and stores. Strided segment loads and stores operate at up to
-// one segment per cycle if the segment fits within one aligned memory beat.
-// Indexed segment loads and stores operate at the same rate as strided ones,
-// but they stall the machine until all addresses have been generated.
-foreach mx = SchedMxList in {
-  foreach eew = [8, 16, 32, 64] in {
-    defvar Cycles = SiFive7GetCyclesSegmentedSeg2<mx>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-    // Does not chain so set latency high
-    let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm "" : LMULWriteResMX<"WriteVLSEG2e" # eew,   [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-      defm "" : LMULWriteResMX<"WriteVLSEGFF2e" # eew, [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-    }
-    let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-    defm "" : LMULWriteResMX<"WriteVSSEG2e" # eew,   [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    foreach nf=3-8 in {
-      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf, SiFive7VLEN>.c;
-      defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-      // Does not chain so set latency high
-      let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew,   [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-        defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" # eew, [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-      }
-      let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-      defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" # eew,   [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-    }
-  }
-}
-foreach mx = SchedMxList in {
-  foreach nf=2-8 in {
-    foreach eew = [8, 16, 32, 64] in {
-      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf, SiFive7VLEN>.c;
-      defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-      // Does not chain so set latency high
-      let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" # eew,  [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-        defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" # eew, [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-        defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" # eew, [SiFive7VCQ, SiFive7VL], mx, IsWorstCase>;
-      }
-      let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" # eew,  [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-        defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" # eew, [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-        defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" # eew, [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
-      }
-    }
-  }
-}
-
-// 11. Vector Integer Arithmetic Instructions
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVIALUV",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIALUX",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIALUI",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUV",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUX",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUI",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUMV",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUMX",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUMI",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMinMaxV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMinMaxX",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMergeV",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMergeX",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMergeI",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMovV",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMovX",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMovI",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-  let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVShiftV",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVShiftX",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVShiftI",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulV",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulX",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulAddV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulAddX",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-  // Mask results can't chain.
-  let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVICmpV",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICmpX",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICmpI",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVExtV",      [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-foreach mx = SchedMxList in {
-  foreach sew = SchedSEWSet<mx>.val in {
-    defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor<sew>.c,
-                         !div(SiFive7GetCyclesOnePerElement<mx, sew, SiFive7VLEN>.c, 4));
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
-    let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-    }
-  }
-}
-
-// Widening
-foreach mx = SchedMxListW in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
-  let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVIWALUV",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIWALUX",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIWALUI",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIWMulV",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIWMulX",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-// Narrowing
-foreach mx = SchedMxListW in {
-  defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
-  let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVNShiftV",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVNShiftX",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVNShiftI",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-
-// 12. Vector Fixed-Point Arithmetic Instructions
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVSALUV",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSALUX",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSALUI",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVAALUV",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVAALUX",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSMulV",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSMulX",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSShiftV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSShiftX", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSShiftI", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-// Narrowing
-foreach mx = SchedMxListW in {
-  defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
-  let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVNClipV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVNClipX",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVNClipI",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-
-// 13. Vector Floating-Point Instructions
-foreach mx = SchedMxListF in {
-  foreach sew = SchedSEWSet<mx, isF=1>.val in {
-    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
-    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV",  [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF",  [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV",  [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF",  [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV",   [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-    }
-    let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV",   [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF",   [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-    }
-  }
-}
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVFCvtFToIV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVFClassV",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVFMergeV",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVFMovV",      [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-  // Mask results can't chain.
-  let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVFCmpV",      [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVFCmpF",      [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-foreach mx = SchedMxListF in {
-  foreach sew = SchedSEWSet<mx, isF=1>.val in {
-    defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor<sew>.c,
-                         !div(SiFive7GetCyclesOnePerElement<mx, sew, SiFive7VLEN>.c, 4));
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
-    let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV",  [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF",  [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-    }
-  }
-}
-
-// Widening
-foreach mx = SchedMxListW in {
-  foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
-    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
-    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-  }
-}
-foreach mx = SchedMxListFW in {
-  foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
-    defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
-    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-    }
-  }
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c;
-  let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-  defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-}
-// Narrowing
-foreach mx = SchedMxListW in {
-  defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
-  let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-foreach mx = SchedMxListFW in {
-  foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
-    defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
-    let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-    }
-  }
-}
-
-// 14. Vector Reduction Operations
-foreach mx = SchedMxList in {
-  foreach sew = SchedSEWSet<mx>.val in {
-    defvar Cycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
-    let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SiFive7VCQ, SiFive7VA],
-                                     mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SiFive7VCQ, SiFive7VA],
-                                     mx, sew, IsWorstCase>;
-    }
-  }
-}
-
-foreach mx = SchedMxListWRed in {
-  foreach sew = SchedSEWSet<mx, 0, 1>.val in {
-    defvar Cycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
-    let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SiFive7VCQ, SiFive7VA],
-                                   mx, sew, IsWorstCase>;
-  }
-}
-
-foreach mx = SchedMxListF in {
-  foreach sew = SchedSEWSet<mx, 1>.val in {
-    defvar RedCycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
-    let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in {
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SiFive7VCQ, SiFive7VA],
-                                     mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SiFive7VCQ, SiFive7VA],
-                                     mx, sew, IsWorstCase>;
-    }
-    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, SiFive7VLEN>.c;
-    let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SiFive7VCQ, SiFive7VA],
-                                   mx, sew, IsWorstCase>;
-  }
-}
-
-foreach mx = SchedMxListFWRed in {
-  foreach sew = SchedSEWSet<mx, 1, 1>.val in {
-    defvar RedCycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
-    let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SiFive7VCQ, SiFive7VA],
-                                   mx, sew, IsWorstCase>;
-    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, SiFive7VLEN>.c;
-    let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SiFive7VCQ, SiFive7VA],
-                                   mx, sew, IsWorstCase>;
-  }
-}
-
-// 15. Vector Mask Instructions
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesVMask<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVMALUV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVMPopV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVMFFSV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVMSFSV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVIotaV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIdxV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-
-// 16. Vector Permutation Instructions
-let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 1)] in {
-  def : WriteRes<WriteVMovSX, [SiFive7VCQ, SiFive7VA]>;
-  def : WriteRes<WriteVMovXS, [SiFive7VCQ, SiFive7VA]>;
-  def : WriteRes<WriteVMovSF, [SiFive7VCQ, SiFive7VA]>;
-  def : WriteRes<WriteVMovFS, [SiFive7VCQ, SiFive7VA]>;
-}
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVRGatherVX",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVRGatherVI",    [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-
-foreach mx = SchedMxList in {
-  foreach sew = SchedSEWSet<mx>.val in {
-    defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew, SiFive7VLEN>.c;
-    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
-    let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
-    }
-  }
-}
-
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVSlideUpX",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSlideDownX", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSlideI",     [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVISlide1X",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVFSlide1F",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-  }
-}
-
-// VMov*V is LMUL Aware
-let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 2)] in
-  def : WriteRes<WriteVMov1V,     [SiFive7VCQ, SiFive7VA]>;
-let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 4)] in
-  def : WriteRes<WriteVMov2V,     [SiFive7VCQ, SiFive7VA]>;
-let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 8)] in
-  def : WriteRes<WriteVMov4V,     [SiFive7VCQ, SiFive7VA]>;
-let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 16)] in
-  def : WriteRes<WriteVMov8V,     [SiFive7VCQ, SiFive7VA]>;
-
-// Others
-def : WriteRes<WriteCSR, [SiFive7PipeB]>;
-def : WriteRes<WriteNop, []>;
-let Latency = 3 in
-  def : WriteRes<WriteRdVLENB, [SiFive7PipeB]>;
-
-def : InstRW<[WriteIALU], (instrs COPY)>;
-
-// VCIX
-//
-// In principle we don't know the latency of any VCIX instructions (they
-// depends on a particular coprocessor implementation). However, the default
-// latency of 1 can lead to issues [1]. So instead we set the latency to the
-// default provided by `SiFive7GetCyclesDefault`. This is still not accurate
-// and can lead to suboptimal codegen, but should hopefully be a better
-// starting point.
-//
-// [1] https://github.com/llvm/llvm-project/issues/83391
-foreach mx = SchedMxList in {
-  defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = Cycles,
-      AcquireAtCycles = [0, 1],
-      ReleaseAtCycles = [1, !add(1, Cycles)] in {
-    defm "" : LMULWriteResMX<"WriteVC_V_I",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_X",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_IV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_VV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_XV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_V_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    foreach f = ["FPR16", "FPR32", "FPR64"] in {
-      defm "" : LMULWriteResMX<"WriteVC_V_" # f # "V",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-      defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-      defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    }
-    defm "" : LMULWriteResMX<"WriteVC_I",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_X",   [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_IV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_VV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_XV",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVC_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    foreach f = ["FPR16", "FPR32", "FPR64"] in {
-      defm "" : LMULWriteResMX<"WriteVC_" # f # "V",  [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-      defm "" : LMULWriteResMX<"WriteVC_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-      defm "" : LMULWriteResMX<"WriteVC_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
-    }
-  }
-}
-
-//===----------------------------------------------------------------------===//
-
-// Bypass and advance
-def : SiFive7AnyToGPRBypass<ReadJmp>;
-def : SiFive7AnyToGPRBypass<ReadJalr>;
-def : ReadAdvance<ReadCSR, 0>;
-def : SiFive7AnyToGPRBypass<ReadStoreData>;
-def : ReadAdvance<ReadMemBase, 0>;
-def : SiFive7AnyToGPRBypass<ReadIALU>;
-def : SiFive7AnyToGPRBypass<ReadIALU32>;
-def : SiFive7AnyToGPRBypass<ReadShiftImm>;
-def : SiFive7AnyToGPRBypass<ReadShiftImm32>;
-def : SiFive7AnyToGPRBypass<ReadShiftReg>;
-def : SiFive7AnyToGPRBypass<ReadShiftReg32>;
-def : ReadAdvance<ReadIDiv, 0>;
-def : ReadAdvance<ReadIDiv32, 0>;
-def : ReadAdvance<ReadIRem, 0>;
-def : ReadAdvance<ReadIRem32, 0>;
-def : ReadAdvance<ReadIMul, 0>;
-def : ReadAdvance<ReadIMul32, 0>;
-def : ReadAdvance<ReadAtomicWA, 0>;
-def : ReadAdvance<ReadAtomicWD, 0>;
-def : ReadAdvance<ReadAtomicDA, 0>;
-def : ReadAdvance<ReadAtomicDD, 0>;
-def : ReadAdvance<ReadAtomicLDW, 0>;
-def : ReadAdvance<ReadAtomicLDD, 0>;
-def : ReadAdvance<ReadAtomicSTW, 0>;
-def : ReadAdvance<ReadAtomicSTD, 0>;
-def : ReadAdvance<ReadFStoreData, 0>;
-def : ReadAdvance<ReadFMemBase, 0>;
-def : ReadAdvance<ReadFAdd16, 0>;
-def : ReadAdvance<ReadFAdd32, 0>;
-def : ReadAdvance<ReadFAdd64, 0>;
-def : ReadAdvance<ReadFMul16, 0>;
-def : ReadAdvance<ReadFMA16, 0>;
-def : ReadAdvance<ReadFMA16Addend, 0>;
-def : ReadAdvance<ReadFMul32, 0>;
-def : ReadAdvance<ReadFMul64, 0>;
-def : ReadAdvance<ReadFMA32, 0>;
-def : ReadAdvance<ReadFMA32Addend, 0>;
-def : ReadAdvance<ReadFMA64, 0>;
-def : ReadAdvance<ReadFMA64Addend, 0>;
-def : ReadAdvance<ReadFDiv16, 0>;
-def : ReadAdvance<ReadFDiv32, 0>;
-def : ReadAdvance<ReadFDiv64, 0>;
-def : ReadAdvance<ReadFSqrt16, 0>;
-def : ReadAdvance<ReadFSqrt32, 0>;
-def : ReadAdvance<ReadFSqrt64, 0>;
-def : ReadAdvance<ReadFCmp16, 0>;
-def : ReadAdvance<ReadFCmp32, 0>;
-def : ReadAdvance<ReadFCmp64, 0>;
-def : ReadAdvance<ReadFSGNJ16, 0>;
-def : ReadAdvance<ReadFSGNJ32, 0>;
-def : ReadAdvance<ReadFSGNJ64, 0>;
-def : ReadAdvance<ReadFMinMax16, 0>;
-def : ReadAdvance<ReadFMinMax32, 0>;
-def : ReadAdvance<ReadFMinMax64, 0>;
-def : ReadAdvance<ReadFCvtF16ToI32, 0>;
-def : ReadAdvance<ReadFCvtF16ToI64, 0>;
-def : ReadAdvance<ReadFCvtF32ToI32, 0>;
-def : ReadAdvance<ReadFCvtF32ToI64, 0>;
-def : ReadAdvance<ReadFCvtF64ToI32, 0>;
-def : ReadAdvance<ReadFCvtF64ToI64, 0>;
-def : ReadAdvance<ReadFCvtI32ToF16, 0>;
-def : ReadAdvance<ReadFCvtI32ToF32, 0>;
-def : ReadAdvance<ReadFCvtI32ToF64, 0>;
-def : ReadAdvance<ReadFCvtI64ToF16, 0>;
-def : ReadAdvance<ReadFCvtI64ToF32, 0>;
-def : ReadAdvance<ReadFCvtI64ToF64, 0>;
-def : ReadAdvance<ReadFCvtF32ToF64, 0>;
-def : ReadAdvance<ReadFCvtF64ToF32, 0>;
-def : ReadAdvance<ReadFCvtF16ToF32, 0>;
-def : ReadAdvance<ReadFCvtF32ToF16, 0>;
-def : ReadAdvance<ReadFCvtF16ToF64, 0>;
-def : ReadAdvance<ReadFCvtF64ToF16, 0>;
-def : ReadAdvance<ReadFMovF16ToI16, 0>;
-def : ReadAdvance<ReadFMovI16ToF16, 0>;
-def : ReadAdvance<ReadFMovF32ToI32, 0>;
-def : ReadAdvance<ReadFMovI32ToF32, 0>;
-def : ReadAdvance<ReadFMovF64ToI64, 0>;
-def : ReadAdvance<ReadFMovI64ToF64, 0>;
-def : ReadAdvance<ReadFClass16, 0>;
-def : ReadAdvance<ReadFClass32, 0>;
-def : ReadAdvance<ReadFClass64, 0>;
-
-def : SiFive7AnyToGPRBypass<ReadSFBJmp, 0>;
-def : SiFive7AnyToGPRBypass<ReadSFBALU, 0>;
-
-// Bitmanip
-def : SiFive7AnyToGPRBypass<ReadRotateImm>;
-def : SiFive7AnyToGPRBypass<ReadRotateImm32>;
-def : SiFive7AnyToGPRBypass<ReadRotateReg>;
-def : SiFive7AnyToGPRBypass<ReadRotateReg32>;
-def : SiFive7AnyToGPRBypass<ReadCLZ>;
-def : SiFive7AnyToGPRBypass<ReadCLZ32>;
-def : SiFive7AnyToGPRBypass<ReadCTZ>;
-def : SiFive7AnyToGPRBypass<ReadCTZ32>;
-def : ReadAdvance<ReadCPOP, 0>;
-def : ReadAdvance<ReadCPOP32, 0>;
-def : SiFive7AnyToGPRBypass<ReadORCB>;
-def : SiFive7AnyToGPRBypass<ReadIMinMax>;
-def : SiFive7AnyToGPRBypass<ReadREV8>;
-def : SiFive7AnyToGPRBypass<ReadSHXADD>;
-def : SiFive7AnyToGPRBypass<ReadSHXADD32>;
-// Single-bit instructions
-def : SiFive7AnyToGPRBypass<ReadSingleBit>;
-def : SiFive7AnyToGPRBypass<ReadSingleBitImm>;
-
-// 6. Configuration-Setting Instructions
-def : ReadAdvance<ReadVSETVLI, 2>;
-def : ReadAdvance<ReadVSETVL, 2>;
-
-// 7. Vector Loads and Stores
-def : ReadAdvance<ReadVLDX, 0>;
-def : ReadAdvance<ReadVSTX, 0>;
-defm "" : LMULReadAdvance<"ReadVSTEV", 0>;
-defm "" : LMULReadAdvance<"ReadVSTM", 0>;
-def : ReadAdvance<ReadVLDSX, 0>;
-def : ReadAdvance<ReadVSTSX, 0>;
-defm "" : LMULReadAdvance<"ReadVSTS8V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTS16V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTS32V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTS64V", 0>;
-defm "" : LMULReadAdvance<"ReadVLDUXV", 0>;
-defm "" : LMULReadAdvance<"ReadVLDOXV", 0>;
-defm "" : LMULReadAdvance<"ReadVSTUX8", 0>;
-defm "" : LMULReadAdvance<"ReadVSTUX16", 0>;
-defm "" : LMULReadAdvance<"ReadVSTUX32", 0>;
-defm "" : LMULReadAdvance<"ReadVSTUX64", 0>;
-defm "" : LMULReadAdvance<"ReadVSTUXV", 0>;
-defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTOX8", 0>;
-defm "" : LMULReadAdvance<"ReadVSTOX16", 0>;
-defm "" : LMULReadAdvance<"ReadVSTOX32", 0>;
-defm "" : LMULReadAdvance<"ReadVSTOX64", 0>;
-defm "" : LMULReadAdvance<"ReadVSTOXV", 0>;
-defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>;
-defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>;
-// LMUL Aware
-def : ReadAdvance<ReadVST1R, 0>;
-def : ReadAdvance<ReadVST2R, 0>;
-def : ReadAdvance<ReadVST4R, 0>;
-def : ReadAdvance<ReadVST8R, 0>;
-
-// 12. Vector Integer Arithmetic Instructions
-defm : LMULReadAdvance<"ReadVIALUV", 0>;
-defm : LMULReadAdvance<"ReadVIALUX", 0>;
-defm : LMULReadAdvanceW<"ReadVIWALUV", 0>;
-defm : LMULReadAdvanceW<"ReadVIWALUX", 0>;
-defm : LMULReadAdvance<"ReadVExtV", 0>;
-defm : LMULReadAdvance<"ReadVICALUV", 0>;
-defm : LMULReadAdvance<"ReadVICALUX", 0>;
-defm : LMULReadAdvance<"ReadVShiftV", 0>;
-defm : LMULReadAdvance<"ReadVShiftX", 0>;
-defm : LMULReadAdvanceW<"ReadVNShiftV", 0>;
-defm : LMULReadAdvanceW<"ReadVNShiftX", 0>;
-defm : LMULReadAdvance<"ReadVICmpV", 0>;
-defm : LMULReadAdvance<"ReadVICmpX", 0>;
-defm : LMULReadAdvance<"ReadVIMinMaxV", 0>;
-defm : LMULReadAdvance<"ReadVIMinMaxX", 0>;
-defm : LMULReadAdvance<"ReadVIMulV", 0>;
-defm : LMULReadAdvance<"ReadVIMulX", 0>;
-defm : LMULSEWReadAdvance<"ReadVIDivV", 0>;
-defm : LMULSEWReadAdvance<"ReadVIDivX", 0>;
-defm : LMULReadAdvanceW<"ReadVIWMulV", 0>;
-defm : LMULReadAdvanceW<"ReadVIWMulX", 0>;
-defm : LMULReadAdvance<"ReadVIMulAddV", 0>;
-defm : LMULReadAdvance<"ReadVIMulAddX", 0>;
-defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>;
-defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>;
-defm : LMULReadAdvance<"ReadVIMergeV", 0>;
-defm : LMULReadAdvance<"ReadVIMergeX", 0>;
-defm : LMULReadAdvance<"ReadVIMovV", 0>;
-defm : LMULReadAdvance<"ReadVIMovX", 0>;
-
-// 13. Vector Fixed-Point Arithmetic Instructions
-defm "" : LMULReadAdvance<"ReadVSALUV", 0>;
-defm "" : LMULReadAdvance<"ReadVSALUX", 0>;
-defm "" : LMULReadAdvance<"ReadVAALUV", 0>;
-defm "" : LMULReadAdvance<"ReadVAALUX", 0>;
-defm "" : LMULReadAdvance<"ReadVSMulV", 0>;
-defm "" : LMULReadAdvance<"ReadVSMulX", 0>;
-defm "" : LMULReadAdvance<"ReadVSShiftV", 0>;
-defm "" : LMULReadAdvance<"ReadVSShiftX", 0>;
-defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>;
-defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>;
-
-// 14. Vector Floating-Point Instructions
-defm "" : LMULSEWReadAdvanceF<"ReadVFALUV", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFALUF", 0>;
-defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>;
-defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFMulV", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFMulF", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>;
-defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>;
-defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>;
-defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>;
-defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>;
-defm "" : LMULReadAdvance<"ReadVFCmpV", 0>;
-defm "" : LMULReadAdvance<"ReadVFCmpF", 0>;
-defm "" : LMULReadAdvance<"ReadVFClassV", 0>;
-defm "" : LMULReadAdvance<"ReadVFMergeV", 0>;
-defm "" : LMULReadAdvance<"ReadVFMergeF", 0>;
-defm "" : LMULReadAdvance<"ReadVFMovF", 0>;
-defm "" : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>;
-defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>;
-defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>;
-defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>;
-defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>;
-defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>;
-defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>;
-defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>;
-
-// 15. Vector Reduction Operations
-def : ReadAdvance<ReadVIRedV, 0>;
-def : ReadAdvance<ReadVIRedV0, 0>;
-def : ReadAdvance<ReadVIWRedV, 0>;
-def : ReadAdvance<ReadVIWRedV0, 0>;
-def : ReadAdvance<ReadVFRedV, 0>;
-def : ReadAdvance<ReadVFRedV0, 0>;
-def : ReadAdvance<ReadVFRedOV, 0>;
-def : ReadAdvance<ReadVFRedOV0, 0>;
-def : ReadAdvance<ReadVFWRedV, 0>;
-def : ReadAdvance<ReadVFWRedV0, 0>;
-def : ReadAdvance<ReadVFWRedOV, 0>;
-def : ReadAdvance<ReadVFWRedOV0, 0>;
-
-// 16. Vector Mask Instructions
-defm "" : LMULReadAdvance<"ReadVMALUV", 0>;
-defm "" : LMULReadAdvance<"ReadVMPopV", 0>;
-defm "" : LMULReadAdvance<"ReadVMFFSV", 0>;
-defm "" : LMULReadAdvance<"ReadVMSFSV", 0>;
-defm "" : LMULReadAdvance<"ReadVIotaV", 0>;
-
-// 17. Vector Permutation Instructions
-def : ReadAdvance<ReadVMovXS, 0>;
-def : ReadAdvance<ReadVMovSX_V, 0>;
-def : ReadAdvance<ReadVMovSX_X, 0>;
-def : ReadAdvance<ReadVMovFS, 0>;
-def : ReadAdvance<ReadVMovSF_V, 0>;
-def : ReadAdvance<ReadVMovSF_F, 0>;
-defm "" : LMULReadAdvance<"ReadVISlideV", 0>;
-defm "" : LMULReadAdvance<"ReadVISlideX", 0>;
-defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
-defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
-defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
-defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
-defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
-defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
-defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
-defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
-defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
-defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>;
-// LMUL Aware
-def : ReadAdvance<ReadVMov1V, 0>;
-def : ReadAdvance<ReadVMov2V, 0>;
-def : ReadAdvance<ReadVMov4V, 0>;
-def : ReadAdvance<ReadVMov8V, 0>;
-
-// Others
-def : ReadAdvance<ReadVMask, 0>;
-def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
-foreach mx = SchedMxList in {
-  def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
-  foreach sew = SchedSEWSet<mx>.val in
-    def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx  # "_E" # sew), 0>;
-}
-
-//===----------------------------------------------------------------------===//
-// Unsupported extensions
-defm : UnsupportedSchedQ;
-defm : UnsupportedSchedZabha;
-defm : UnsupportedSchedZbc;
-defm : UnsupportedSchedZbkb;
-defm : UnsupportedSchedZbkx;
-defm : UnsupportedSchedZfa;
-defm : UnsupportedSchedZvk;
-}
+// Some model name aliases.
+defvar SiFive7Model = SiFive7VLEN512Model;
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/div-fdiv.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/div-fdiv.s
index cb60d8ea935a..51d68b47e010 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/div-fdiv.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/div-fdiv.s
@@ -28,14 +28,14 @@ fdiv.s f1, f2, f3
 # CHECK-NEXT:  1      27    26.00                       fdiv.s	ft1, ft2, ft3
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass-c.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass-c.s
index c7ff5e6a6b9c..32b6e7e953e2 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass-c.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass-c.s
@@ -66,14 +66,14 @@ c.jr a0
 # CHECK-NEXT:  1      3     1.00                        jr	a0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s
index 03f7de2fe9a4..211c50917705 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s
@@ -214,14 +214,14 @@ jr a0
 # CHECK-NEXT:  1      3     1.00                        jr	a0
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/instruction-tables-tests.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/instruction-tables-tests.s
index 4f8b4769b428..c69a36b0d4a7 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/instruction-tables-tests.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/instruction-tables-tests.s
@@ -35,48 +35,48 @@
 # NISE-NEXT:  Total uOps:        200
 
 # ISF:        Resources:
-# ISF-NEXT:   [0]   - SiFive7FDiv:1
-# ISF-NEXT:   [1]   - SiFive7IDiv:1
-# ISF-NEXT:   [2]   - SiFive7PipeA:1
-# ISF-NEXT:   [3]   - SiFive7PipeAB:2 SiFive7PipeA, SiFive7PipeB
-# ISF-NEXT:   [4]   - SiFive7PipeB:1
-# ISF-NEXT:   [5]   - SiFive7VA:1
-# ISF-NEXT:   [6]   - SiFive7VCQ:1
-# ISF-NEXT:   [7]   - SiFive7VL:1
-# ISF-NEXT:   [8]   - SiFive7VS:1
+# ISF-NEXT:   [0]   - VLEN512SiFive7FDiv:1
+# ISF-NEXT:   [1]   - VLEN512SiFive7IDiv:1
+# ISF-NEXT:   [2]   - VLEN512SiFive7PipeA:1
+# ISF-NEXT:   [3]   - VLEN512SiFive7PipeAB:2 VLEN512SiFive7PipeA, VLEN512SiFive7PipeB
+# ISF-NEXT:   [4]   - VLEN512SiFive7PipeB:1
+# ISF-NEXT:   [5]   - VLEN512SiFive7VA:1
+# ISF-NEXT:   [6]   - VLEN512SiFive7VCQ:1
+# ISF-NEXT:   [7]   - VLEN512SiFive7VL:1
+# ISF-NEXT:   [8]   - VLEN512SiFive7VS:1
 
 # ISFB:       Resources:
-# ISFB-NEXT:  [0]   - SiFive7FDiv:1
-# ISFB-NEXT:  [1]   - SiFive7IDiv:1
-# ISFB-NEXT:  [2]   - SiFive7PipeA:1
-# ISFB-NEXT:  [3]   - SiFive7PipeAB:2 SiFive7PipeA, SiFive7PipeB
-# ISFB-NEXT:  [4]   - SiFive7PipeB:1
-# ISFB-NEXT:  [5]   - SiFive7VA:1
-# ISFB-NEXT:  [6]   - SiFive7VCQ:1
-# ISFB-NEXT:  [7]   - SiFive7VL:1
-# ISFB-NEXT:  [8]   - SiFive7VS:1
+# ISFB-NEXT:  [0]   - VLEN512SiFive7FDiv:1
+# ISFB-NEXT:  [1]   - VLEN512SiFive7IDiv:1
+# ISFB-NEXT:  [2]   - VLEN512SiFive7PipeA:1
+# ISFB-NEXT:  [3]   - VLEN512SiFive7PipeAB:2 VLEN512SiFive7PipeA, VLEN512SiFive7PipeB
+# ISFB-NEXT:  [4]   - VLEN512SiFive7PipeB:1
+# ISFB-NEXT:  [5]   - VLEN512SiFive7VA:1
+# ISFB-NEXT:  [6]   - VLEN512SiFive7VCQ:1
+# ISFB-NEXT:  [7]   - VLEN512SiFive7VL:1
+# ISFB-NEXT:  [8]   - VLEN512SiFive7VS:1
 
 # ISFBE:      Resources:
-# ISFBE-NEXT: [0]   - SiFive7FDiv:1
-# ISFBE-NEXT: [1]   - SiFive7IDiv:1
-# ISFBE-NEXT: [2]   - SiFive7PipeA:1
-# ISFBE-NEXT: [3]   - SiFive7PipeAB:2 SiFive7PipeA, SiFive7PipeB
-# ISFBE-NEXT: [4]   - SiFive7PipeB:1
-# ISFBE-NEXT: [5]   - SiFive7VA:1
-# ISFBE-NEXT: [6]   - SiFive7VCQ:1
-# ISFBE-NEXT: [7]   - SiFive7VL:1
-# ISFBE-NEXT: [8]   - SiFive7VS:1
+# ISFBE-NEXT: [0]   - VLEN512SiFive7FDiv:1
+# ISFBE-NEXT: [1]   - VLEN512SiFive7IDiv:1
+# ISFBE-NEXT: [2]   - VLEN512SiFive7PipeA:1
+# ISFBE-NEXT: [3]   - VLEN512SiFive7PipeAB:2 VLEN512SiFive7PipeA, VLEN512SiFive7PipeB
+# ISFBE-NEXT: [4]   - VLEN512SiFive7PipeB:1
+# ISFBE-NEXT: [5]   - VLEN512SiFive7VA:1
+# ISFBE-NEXT: [6]   - VLEN512SiFive7VCQ:1
+# ISFBE-NEXT: [7]   - VLEN512SiFive7VL:1
+# ISFBE-NEXT: [8]   - VLEN512SiFive7VS:1
 
 # ISFE:       Resources:
-# ISFE-NEXT:  [0]   - SiFive7FDiv:1
-# ISFE-NEXT:  [1]   - SiFive7IDiv:1
-# ISFE-NEXT:  [2]   - SiFive7PipeA:1
-# ISFE-NEXT:  [3]   - SiFive7PipeAB:2 SiFive7PipeA, SiFive7PipeB
-# ISFE-NEXT:  [4]   - SiFive7PipeB:1
-# ISFE-NEXT:  [5]   - SiFive7VA:1
-# ISFE-NEXT:  [6]   - SiFive7VCQ:1
-# ISFE-NEXT:  [7]   - SiFive7VL:1
-# ISFE-NEXT:  [8]   - SiFive7VS:1
+# ISFE-NEXT:  [0]   - VLEN512SiFive7FDiv:1
+# ISFE-NEXT:  [1]   - VLEN512SiFive7IDiv:1
+# ISFE-NEXT:  [2]   - VLEN512SiFive7PipeA:1
+# ISFE-NEXT:  [3]   - VLEN512SiFive7PipeAB:2 VLEN512SiFive7PipeA, VLEN512SiFive7PipeB
+# ISFE-NEXT:  [4]   - VLEN512SiFive7PipeB:1
+# ISFE-NEXT:  [5]   - VLEN512SiFive7VA:1
+# ISFE-NEXT:  [6]   - VLEN512SiFive7VCQ:1
+# ISFE-NEXT:  [7]   - VLEN512SiFive7VL:1
+# ISFE-NEXT:  [8]   - VLEN512SiFive7VS:1
 
 # ISN:        Instruction Info:
 # ISN-NEXT:   [1]: #uOps
@@ -216,20 +216,20 @@
 # NISE-NEXT:  [7]: Encoding Size
 
 # ISF:        [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
-# ISF-NEXT:    1      3     1.00                  U      1     SiFive7PipeA,SiFive7PipeAB                 VSETVLI                    vsetvli	a3, a2, e16, m1, tu, mu	// Comment
-# ISF-NEXT:    1      4     2.00    *                    4     SiFive7VCQ,SiFive7VL[1,3]                  VLM_V                      vlm.v	v4, (a1)
+# ISF-NEXT:    1      3     1.00                  U      1     VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB   VSETVLI                    vsetvli	a3, a2, e16, m1, tu, mu	// Comment
+# ISF-NEXT:    1      4     2.00    *                    4     VLEN512SiFive7VCQ,VLEN512SiFive7VL[1,3]    VLM_V                      vlm.v	v4, (a1)
 
 # ISFB:       [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        [10]   [11]   Instructions:
-# ISFB-NEXT:   1      3     1.00                  U      1     SiFive7PipeA,SiFive7PipeAB                 VSETVLI                                  vsetvli	a3, a2, e16, m1, tu, mu	// Comment
-# ISFB-NEXT:   1      4     2.00    *                    4     SiFive7VCQ,SiFive7VL[1,3]                  VLM_V                                    vlm.v	v4, (a1)
+# ISFB-NEXT:   1      3     1.00                  U      1     VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB   VSETVLI                                  vsetvli	a3, a2, e16, m1, tu, mu	// Comment
+# ISFB-NEXT:   1      4     2.00    *                    4     VLEN512SiFive7VCQ,VLEN512SiFive7VL[1,3]    VLM_V                                    vlm.v	v4, (a1)
 
 # ISFBE:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        [10]   [11]   [12]   Encodings:                    Instructions:
-# ISFBE-NEXT:  1      3     1.00                  U      1     SiFive7PipeA,SiFive7PipeAB                 VSETVLI                                   4     d7 76 86 00                   vsetvli	a3, a2, e16, m1, tu, mu	// Comment
-# ISFBE-NEXT:  1      4     2.00    *                    4     SiFive7VCQ,SiFive7VL[1,3]                  VLM_V                                     4     07 82 b5 02                   vlm.v	v4, (a1)
+# ISFBE-NEXT:  1      3     1.00                  U      1     VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB   VSETVLI                                   4     d7 76 86 00                   vsetvli	a3, a2, e16, m1, tu, mu	// Comment
+# ISFBE-NEXT:  1      4     2.00    *                    4     VLEN512SiFive7VCQ,VLEN512SiFive7VL[1,3]    VLM_V                                     4     07 82 b5 02                   vlm.v	v4, (a1)
 
 # ISFE:       [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        [10]   Encodings:                    Instructions:
-# ISFE-NEXT:   1      3     1.00                  U      1     SiFive7PipeA,SiFive7PipeAB                 VSETVLI                     4     d7 76 86 00                   vsetvli	a3, a2, e16, m1, tu, mu	// Comment
-# ISFE-NEXT:   1      4     2.00    *                    4     SiFive7VCQ,SiFive7VL[1,3]                  VLM_V                       4     07 82 b5 02                   vlm.v	v4, (a1)
+# ISFE-NEXT:   1      3     1.00                  U      1     VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB   VSETVLI                     4     d7 76 86 00                   vsetvli	a3, a2, e16, m1, tu, mu	// Comment
+# ISFE-NEXT:   1      4     2.00    *                    4     VLEN512SiFive7VCQ,VLEN512SiFive7VL[1,3]    VLM_V                       4     07 82 b5 02                   vlm.v	v4, (a1)
 
 # ISNB:       [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
 # ISNB-NEXT:   1      3     1.00                  U                   vsetvli	a3, a2, e16, m1, tu, mu
@@ -256,114 +256,114 @@
 # NISE-NEXT:   1      4     2.00    *                    4     07 82 b5 02                   vlm.v	v4, (a1)
 
 # ISN:        Resources:
-# ISN-NEXT:   [0]   - SiFive7FDiv
-# ISN-NEXT:   [1]   - SiFive7IDiv
-# ISN-NEXT:   [2]   - SiFive7PipeA
-# ISN-NEXT:   [3]   - SiFive7PipeB
-# ISN-NEXT:   [4]   - SiFive7VA
-# ISN-NEXT:   [5]   - SiFive7VCQ
-# ISN-NEXT:   [6]   - SiFive7VL
-# ISN-NEXT:   [7]   - SiFive7VS
+# ISN-NEXT:   [0]   - VLEN512SiFive7FDiv
+# ISN-NEXT:   [1]   - VLEN512SiFive7IDiv
+# ISN-NEXT:   [2]   - VLEN512SiFive7PipeA
+# ISN-NEXT:   [3]   - VLEN512SiFive7PipeB
+# ISN-NEXT:   [4]   - VLEN512SiFive7VA
+# ISN-NEXT:   [5]   - VLEN512SiFive7VCQ
+# ISN-NEXT:   [6]   - VLEN512SiFive7VL
+# ISN-NEXT:   [7]   - VLEN512SiFive7VS
 
 # ISF:        Resources:
-# ISF-NEXT:   [0]   - SiFive7FDiv
-# ISF-NEXT:   [1]   - SiFive7IDiv
-# ISF-NEXT:   [2]   - SiFive7PipeA
-# ISF-NEXT:   [3]   - SiFive7PipeB
-# ISF-NEXT:   [4]   - SiFive7VA
-# ISF-NEXT:   [5]   - SiFive7VCQ
-# ISF-NEXT:   [6]   - SiFive7VL
-# ISF-NEXT:   [7]   - SiFive7VS
+# ISF-NEXT:   [0]   - VLEN512SiFive7FDiv
+# ISF-NEXT:   [1]   - VLEN512SiFive7IDiv
+# ISF-NEXT:   [2]   - VLEN512SiFive7PipeA
+# ISF-NEXT:   [3]   - VLEN512SiFive7PipeB
+# ISF-NEXT:   [4]   - VLEN512SiFive7VA
+# ISF-NEXT:   [5]   - VLEN512SiFive7VCQ
+# ISF-NEXT:   [6]   - VLEN512SiFive7VL
+# ISF-NEXT:   [7]   - VLEN512SiFive7VS
 
 # ISFB:       Resources:
-# ISFB-NEXT:  [0]   - SiFive7FDiv
-# ISFB-NEXT:  [1]   - SiFive7IDiv
-# ISFB-NEXT:  [2]   - SiFive7PipeA
-# ISFB-NEXT:  [3]   - SiFive7PipeB
-# ISFB-NEXT:  [4]   - SiFive7VA
-# ISFB-NEXT:  [5]   - SiFive7VCQ
-# ISFB-NEXT:  [6]   - SiFive7VL
-# ISFB-NEXT:  [7]   - SiFive7VS
+# ISFB-NEXT:  [0]   - VLEN512SiFive7FDiv
+# ISFB-NEXT:  [1]   - VLEN512SiFive7IDiv
+# ISFB-NEXT:  [2]   - VLEN512SiFive7PipeA
+# ISFB-NEXT:  [3]   - VLEN512SiFive7PipeB
+# ISFB-NEXT:  [4]   - VLEN512SiFive7VA
+# ISFB-NEXT:  [5]   - VLEN512SiFive7VCQ
+# ISFB-NEXT:  [6]   - VLEN512SiFive7VL
+# ISFB-NEXT:  [7]   - VLEN512SiFive7VS
 
 # ISFBE:      Resources:
-# ISFBE-NEXT: [0]   - SiFive7FDiv
-# ISFBE-NEXT: [1]   - SiFive7IDiv
-# ISFBE-NEXT: [2]   - SiFive7PipeA
-# ISFBE-NEXT: [3]   - SiFive7PipeB
-# ISFBE-NEXT: [4]   - SiFive7VA
-# ISFBE-NEXT: [5]   - SiFive7VCQ
-# ISFBE-NEXT: [6]   - SiFive7VL
-# ISFBE-NEXT: [7]   - SiFive7VS
+# ISFBE-NEXT: [0]   - VLEN512SiFive7FDiv
+# ISFBE-NEXT: [1]   - VLEN512SiFive7IDiv
+# ISFBE-NEXT: [2]   - VLEN512SiFive7PipeA
+# ISFBE-NEXT: [3]   - VLEN512SiFive7PipeB
+# ISFBE-NEXT: [4]   - VLEN512SiFive7VA
+# ISFBE-NEXT: [5]   - VLEN512SiFive7VCQ
+# ISFBE-NEXT: [6]   - VLEN512SiFive7VL
+# ISFBE-NEXT: [7]   - VLEN512SiFive7VS
 
 # ISFE:       Resources:
-# ISFE-NEXT:  [0]   - SiFive7FDiv
-# ISFE-NEXT:  [1]   - SiFive7IDiv
-# ISFE-NEXT:  [2]   - SiFive7PipeA
-# ISFE-NEXT:  [3]   - SiFive7PipeB
-# ISFE-NEXT:  [4]   - SiFive7VA
-# ISFE-NEXT:  [5]   - SiFive7VCQ
-# ISFE-NEXT:  [6]   - SiFive7VL
-# ISFE-NEXT:  [7]   - SiFive7VS
+# ISFE-NEXT:  [0]   - VLEN512SiFive7FDiv
+# ISFE-NEXT:  [1]   - VLEN512SiFive7IDiv
+# ISFE-NEXT:  [2]   - VLEN512SiFive7PipeA
+# ISFE-NEXT:  [3]   - VLEN512SiFive7PipeB
+# ISFE-NEXT:  [4]   - VLEN512SiFive7VA
+# ISFE-NEXT:  [5]   - VLEN512SiFive7VCQ
+# ISFE-NEXT:  [6]   - VLEN512SiFive7VL
+# ISFE-NEXT:  [7]   - VLEN512SiFive7VS
 
 # ISNB:       Resources:
-# ISNB-NEXT:  [0]   - SiFive7FDiv
-# ISNB-NEXT:  [1]   - SiFive7IDiv
-# ISNB-NEXT:  [2]   - SiFive7PipeA
-# ISNB-NEXT:  [3]   - SiFive7PipeB
-# ISNB-NEXT:  [4]   - SiFive7VA
-# ISNB-NEXT:  [5]   - SiFive7VCQ
-# ISNB-NEXT:  [6]   - SiFive7VL
-# ISNB-NEXT:  [7]   - SiFive7VS
+# ISNB-NEXT:  [0]   - VLEN512SiFive7FDiv
+# ISNB-NEXT:  [1]   - VLEN512SiFive7IDiv
+# ISNB-NEXT:  [2]   - VLEN512SiFive7PipeA
+# ISNB-NEXT:  [3]   - VLEN512SiFive7PipeB
+# ISNB-NEXT:  [4]   - VLEN512SiFive7VA
+# ISNB-NEXT:  [5]   - VLEN512SiFive7VCQ
+# ISNB-NEXT:  [6]   - VLEN512SiFive7VL
+# ISNB-NEXT:  [7]   - VLEN512SiFive7VS
 
 # ISNBE:      Resources:
-# ISNBE-NEXT: [0]   - SiFive7FDiv
-# ISNBE-NEXT: [1]   - SiFive7IDiv
-# ISNBE-NEXT: [2]   - SiFive7PipeA
-# ISNBE-NEXT: [3]   - SiFive7PipeB
-# ISNBE-NEXT: [4]   - SiFive7VA
-# ISNBE-NEXT: [5]   - SiFive7VCQ
-# ISNBE-NEXT: [6]   - SiFive7VL
-# ISNBE-NEXT: [7]   - SiFive7VS
+# ISNBE-NEXT: [0]   - VLEN512SiFive7FDiv
+# ISNBE-NEXT: [1]   - VLEN512SiFive7IDiv
+# ISNBE-NEXT: [2]   - VLEN512SiFive7PipeA
+# ISNBE-NEXT: [3]   - VLEN512SiFive7PipeB
+# ISNBE-NEXT: [4]   - VLEN512SiFive7VA
+# ISNBE-NEXT: [5]   - VLEN512SiFive7VCQ
+# ISNBE-NEXT: [6]   - VLEN512SiFive7VL
+# ISNBE-NEXT: [7]   - VLEN512SiFive7VS
 
 # ISNE:       Resources:
-# ISNE-NEXT:  [0]   - SiFive7FDiv
-# ISNE-NEXT:  [1]   - SiFive7IDiv
-# ISNE-NEXT:  [2]   - SiFive7PipeA
-# ISNE-NEXT:  [3]   - SiFive7PipeB
-# ISNE-NEXT:  [4]   - SiFive7VA
-# ISNE-NEXT:  [5]   - SiFive7VCQ
-# ISNE-NEXT:  [6]   - SiFive7VL
-# ISNE-NEXT:  [7]   - SiFive7VS
+# ISNE-NEXT:  [0]   - VLEN512SiFive7FDiv
+# ISNE-NEXT:  [1]   - VLEN512SiFive7IDiv
+# ISNE-NEXT:  [2]   - VLEN512SiFive7PipeA
+# ISNE-NEXT:  [3]   - VLEN512SiFive7PipeB
+# ISNE-NEXT:  [4]   - VLEN512SiFive7VA
+# ISNE-NEXT:  [5]   - VLEN512SiFive7VCQ
+# ISNE-NEXT:  [6]   - VLEN512SiFive7VL
+# ISNE-NEXT:  [7]   - VLEN512SiFive7VS
 
 # NISB:       Resources:
-# NISB-NEXT:  [0]   - SiFive7FDiv
-# NISB-NEXT:  [1]   - SiFive7IDiv
-# NISB-NEXT:  [2]   - SiFive7PipeA
-# NISB-NEXT:  [3]   - SiFive7PipeB
-# NISB-NEXT:  [4]   - SiFive7VA
-# NISB-NEXT:  [5]   - SiFive7VCQ
-# NISB-NEXT:  [6]   - SiFive7VL
-# NISB-NEXT:  [7]   - SiFive7VS
+# NISB-NEXT:  [0]   - VLEN512SiFive7FDiv
+# NISB-NEXT:  [1]   - VLEN512SiFive7IDiv
+# NISB-NEXT:  [2]   - VLEN512SiFive7PipeA
+# NISB-NEXT:  [3]   - VLEN512SiFive7PipeB
+# NISB-NEXT:  [4]   - VLEN512SiFive7VA
+# NISB-NEXT:  [5]   - VLEN512SiFive7VCQ
+# NISB-NEXT:  [6]   - VLEN512SiFive7VL
+# NISB-NEXT:  [7]   - VLEN512SiFive7VS
 
 # NISBE:      Resources:
-# NISBE-NEXT: [0]   - SiFive7FDiv
-# NISBE-NEXT: [1]   - SiFive7IDiv
-# NISBE-NEXT: [2]   - SiFive7PipeA
-# NISBE-NEXT: [3]   - SiFive7PipeB
-# NISBE-NEXT: [4]   - SiFive7VA
-# NISBE-NEXT: [5]   - SiFive7VCQ
-# NISBE-NEXT: [6]   - SiFive7VL
-# NISBE-NEXT: [7]   - SiFive7VS
+# NISBE-NEXT: [0]   - VLEN512SiFive7FDiv
+# NISBE-NEXT: [1]   - VLEN512SiFive7IDiv
+# NISBE-NEXT: [2]   - VLEN512SiFive7PipeA
+# NISBE-NEXT: [3]   - VLEN512SiFive7PipeB
+# NISBE-NEXT: [4]   - VLEN512SiFive7VA
+# NISBE-NEXT: [5]   - VLEN512SiFive7VCQ
+# NISBE-NEXT: [6]   - VLEN512SiFive7VL
+# NISBE-NEXT: [7]   - VLEN512SiFive7VS
 
 # NISE:       Resources:
-# NISE-NEXT:  [0]   - SiFive7FDiv
-# NISE-NEXT:  [1]   - SiFive7IDiv
-# NISE-NEXT:  [2]   - SiFive7PipeA
-# NISE-NEXT:  [3]   - SiFive7PipeB
-# NISE-NEXT:  [4]   - SiFive7VA
-# NISE-NEXT:  [5]   - SiFive7VCQ
-# NISE-NEXT:  [6]   - SiFive7VL
-# NISE-NEXT:  [7]   - SiFive7VS
+# NISE-NEXT:  [0]   - VLEN512SiFive7FDiv
+# NISE-NEXT:  [1]   - VLEN512SiFive7IDiv
+# NISE-NEXT:  [2]   - VLEN512SiFive7PipeA
+# NISE-NEXT:  [3]   - VLEN512SiFive7PipeB
+# NISE-NEXT:  [4]   - VLEN512SiFive7VA
+# NISE-NEXT:  [5]   - VLEN512SiFive7VCQ
+# NISE-NEXT:  [6]   - VLEN512SiFive7VL
+# NISE-NEXT:  [7]   - VLEN512SiFive7VS
 
 # ISN:        Resource pressure per iteration:
 # ISN-NEXT:   [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/jump.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/jump.s
index f23a9ccfca3b..15c12931a751 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/jump.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/jump.s
@@ -36,14 +36,14 @@ ret
 # CHECK-NEXT:  1      3     1.00                        ret
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/different-lmul-instruments.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/different-lmul-instruments.s
index 0e7284fb550a..aee2ee6acc5f 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/different-lmul-instruments.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/different-lmul-instruments.s
@@ -33,14 +33,14 @@ vadd.vv v12, v12, v12
 # CHECK-NEXT:  1      4     16.00                       vadd.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/different-sew-instruments.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/different-sew-instruments.s
index b445855ef3c1..71a0c5a38a84 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/different-sew-instruments.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/different-sew-instruments.s
@@ -34,14 +34,14 @@ vdiv.vv v8, v8, v12
 # CHECK-NEXT:  1      114   114.00                      vdiv.vv	v8, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/disable-im.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/disable-im.s
index 0cd96701fd8f..d58daf45f3d1 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/disable-im.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/disable-im.s
@@ -38,14 +38,14 @@ vadd.vv v12, v12, v12
 # CHECK-NEXT:  1      4     16.00                       vadd.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/fractional-lmul-data.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/fractional-lmul-data.s
index ac4b138da98a..53539d34f6f9 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/fractional-lmul-data.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/fractional-lmul-data.s
@@ -34,14 +34,14 @@ vdiv.vv v12, v12, v12
 # CHECK-NEXT:  1      30    30.00                       vdiv.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-at-start.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-at-start.s
index 53d106a32741..8af13c269013 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-at-start.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-at-start.s
@@ -28,14 +28,14 @@ vadd.vv v12, v12, v12
 # CHECK-NEXT:  1      4     2.00                        vadd.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-in-middle.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-in-middle.s
index 2b0e3fa1b8af..e59ee6becf9c 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-in-middle.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-in-middle.s
@@ -30,14 +30,14 @@ vadd.vv v12, v12, v12
 # CHECK-NEXT:  1      4     1.00                        vadd.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-in-region.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-in-region.s
index 308994116ed2..eef968b4cbda 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-in-region.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-in-region.s
@@ -32,14 +32,14 @@ vadd.vv v12, v12, v12
 # CHECK-NEXT:  1      4     2.00                        vadd.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-straddles-region.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-straddles-region.s
index 7c596b16109f..fa6a019f06d8 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-straddles-region.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/lmul-instrument-straddles-region.s
@@ -33,14 +33,14 @@ vadd.vv v12, v12, v12
 # CHECK-NEXT:  1      4     2.00                        vadd.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/multiple-same-lmul-instruments.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/multiple-same-lmul-instruments.s
index 680d84e9dc22..9b39790318af 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/multiple-same-lmul-instruments.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/multiple-same-lmul-instruments.s
@@ -42,14 +42,14 @@ vsub.vv v12, v12, v12
 # CHECK-NEXT:  1      4     8.00                        vsub.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/multiple-same-sew-instruments.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/multiple-same-sew-instruments.s
index 02e1fd72bcee..da2a703b37be 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/multiple-same-sew-instruments.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/multiple-same-sew-instruments.s
@@ -43,14 +43,14 @@ vdivu.vv v8, v8, v12
 # CHECK-NEXT:  1      112   112.00                      vdivu.vv	v8, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/needs-sew-but-only-lmul.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/needs-sew-but-only-lmul.s
index 2a58ea7962ae..4063c1d49712 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/needs-sew-but-only-lmul.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/needs-sew-but-only-lmul.s
@@ -32,14 +32,14 @@ vdiv.vv v8, v8, v12
 # CHECK-NEXT:  1      240   240.00                      vdiv.vv	v8, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/no-vsetvli-to-start.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/no-vsetvli-to-start.s
index e16b8f982b9c..b2581f88300c 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/no-vsetvli-to-start.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/no-vsetvli-to-start.s
@@ -29,14 +29,14 @@ vadd.vv v12, v12, v12
 # CHECK-NEXT:  1      4     2.00                        vadd.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/reductions.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/reductions.s
index 32e2d0e94305..f1abc9b08046 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/reductions.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/reductions.s
@@ -383,7 +383,7 @@ vfredmin.vs  v4, v8, v12
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
 # CHECK-NEXT:  1      768   768.00                      vfwredosum.vs	v4, v8, v12
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
-# CHECK-NEXT:  1      1536   1536.00                      vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      1536  1536.00                     vfwredosum.vs	v4, v8, v12
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
 # CHECK-NEXT:  1      48    48.00                       vfwredosum.vs	v4, v8, v12
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
@@ -448,14 +448,14 @@ vfredmin.vs  v4, v8, v12
 # CHECK-NEXT:  1      46    46.00                       vfredmin.vs	v4, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-at-start.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-at-start.s
index e9b0ee562a4c..7598bee2a8ea 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-at-start.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-at-start.s
@@ -29,14 +29,14 @@ vdiv.vv v8, v8, v12
 # CHECK-NEXT:  1      240   240.00                      vdiv.vv	v8, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-in-middle.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-in-middle.s
index dbec706866f6..9c61e1ce5644 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-in-middle.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-in-middle.s
@@ -30,19 +30,19 @@ vdiv.vv v8, v8, v12
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      1920   1920.00                      vdiv.vv	v8, v8, v12
+# CHECK-NEXT:  1      1920  1920.00                     vdiv.vv	v8, v8, v12
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, a0, e8, m8, tu, mu
 # CHECK-NEXT:  1      912   912.00                      vdiv.vv	v8, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-in-region.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-in-region.s
index 49e639d8c06f..ad2ad37ffdfc 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-in-region.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-in-region.s
@@ -33,14 +33,14 @@ vdiv.vv v8, v8, v12
 # CHECK-NEXT:  1      114   114.00                      vdiv.vv	v8, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-straddles-region.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-straddles-region.s
index e1e222f7f035..07a08e8ac4d3 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-straddles-region.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/sew-instrument-straddles-region.s
@@ -34,14 +34,14 @@ vdiv.vv v8, v8, v12
 # CHECK-NEXT:  1      114   114.00                      vdiv.vv	v8, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/strided-load-store.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/strided-load-store.s
index 325145982365..7c5e2b7620b7 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/strided-load-store.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/strided-load-store.s
@@ -244,14 +244,14 @@ vlse64.v v1, (a1), a2
 # CHECK-NEXT:  1      67    64.00   *                   vlse64.v	v1, (a1), a2
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/strided-load-x0.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/strided-load-x0.s
index 506c59ca884b..9eb337058fd0 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/strided-load-x0.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/strided-load-x0.s
@@ -82,14 +82,14 @@ vle64.v v1, (a1)
 # CHECK-NEXT:  1      4     2.00    *                   vle64.v	v1, (a1)
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vector-integer-arithmetic.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vector-integer-arithmetic.s
index 1db57cbd7dce..5e5f5f234068 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vector-integer-arithmetic.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vector-integer-arithmetic.s
@@ -1284,7 +1284,7 @@ vmv.v.v v4, v12
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
 # CHECK-NEXT:  1      960   960.00                      vremu.vx	v4, v8, a0
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
-# CHECK-NEXT:  1      1920   1920.00                      vrem.vv	v4, v8, v12
+# CHECK-NEXT:  1      1920  1920.00                     vrem.vv	v4, v8, v12
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
 # CHECK-NEXT:  1      30    30.00                       vrem.vx	v4, v8, a0
 # CHECK-NEXT:  1      3     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
@@ -1521,14 +1521,14 @@ vmv.v.v v4, v12
 # CHECK-NEXT:  1      4     16.00                       vmv.v.v	v4, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vle-vse.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vle-vse.s
index d5f158c754b5..33ed58825b15 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vle-vse.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vle-vse.s
@@ -832,14 +832,14 @@ vsm.v    v1, (a0)
 # CHECK-NEXT:  1      1     2.00           *            vsm.v	v1, (a0)
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetivli-lmul-instrument.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetivli-lmul-instrument.s
index f39b33fd70f3..10acac180d6c 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetivli-lmul-instrument.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetivli-lmul-instrument.s
@@ -31,14 +31,14 @@ vadd.vv v12, v12, v12
 # CHECK-NEXT:  1      4     16.00                       vadd.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetivli-lmul-sew-instrument.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetivli-lmul-sew-instrument.s
index 8dc7cac02c88..94d80d05d385 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetivli-lmul-sew-instrument.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetivli-lmul-sew-instrument.s
@@ -31,14 +31,14 @@ vdiv.vv v8, v8, v12
 # CHECK-NEXT:  1      896   896.00                      vdiv.vv	v8, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetvli-lmul-instrument.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetvli-lmul-instrument.s
index 130bb19280f2..8516e7f8ba61 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetvli-lmul-instrument.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetvli-lmul-instrument.s
@@ -31,14 +31,14 @@ vadd.vv v12, v12, v12
 # CHECK-NEXT:  1      4     16.00                       vadd.vv	v12, v12, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetvli-lmul-sew-instrument.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetvli-lmul-sew-instrument.s
index fe4d3f6169c9..bf717abad028 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetvli-lmul-sew-instrument.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vsetvli-lmul-sew-instrument.s
@@ -31,14 +31,14 @@ vdiv.vv v8, v8, v12
 # CHECK-NEXT:  1      896   896.00                      vdiv.vv	v8, v8, v12
 
 # CHECK:      Resources:
-# CHECK-NEXT: [0]   - SiFive7FDiv
-# CHECK-NEXT: [1]   - SiFive7IDiv
-# CHECK-NEXT: [2]   - SiFive7PipeA
-# CHECK-NEXT: [3]   - SiFive7PipeB
-# CHECK-NEXT: [4]   - SiFive7VA
-# CHECK-NEXT: [5]   - SiFive7VCQ
-# CHECK-NEXT: [6]   - SiFive7VL
-# CHECK-NEXT: [7]   - SiFive7VS
+# CHECK-NEXT: [0]   - VLEN512SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN512SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN512SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN512SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN512SiFive7VA
+# CHECK-NEXT: [5]   - VLEN512SiFive7VCQ
+# CHECK-NEXT: [6]   - VLEN512SiFive7VL
+# CHECK-NEXT: [7]   - VLEN512SiFive7VS
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]

From 081adc15e8985057ee9ecd655029752afdd3a304 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= <alex@alexrp.com>
Date: Mon, 23 Jun 2025 18:49:30 +0200
Subject: [PATCH 1311/1322] [Triple][CodeGen] Fix
 `Triple::isTargetEHABICompatible()` for NetBSD (#143549)

Even for EABI, NetBSD uses DWARF EH, not EHABI. This change matches the
Clang frontend behavior, and fixes link errors caused by incorrect
references to `__cxa_end_cleanup` rather than `_Unwind_Resume`.

With this change and #143055, I was able to run
[zig-bootstrap](https://github.com/ziglang/zig-bootstrap) to completion
for `arm-netbsd10.1-eabihf`.
---
 llvm/include/llvm/TargetParser/Triple.h |  2 +-
 llvm/test/CodeGen/ARM/eh-resume.ll      |  3 +++
 llvm/test/CodeGen/ARM/ehabi.ll          | 29 ++++++++++++++-----------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index b6f15ef13191..d6fa4537ee3b 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -916,7 +916,7 @@ public:
             getEnvironment() == Triple::GNUEABIHFT64 ||
             getEnvironment() == Triple::OpenHOS ||
             getEnvironment() == Triple::MuslEABIHF || isAndroid()) &&
-           isOSBinFormatELF();
+           isOSBinFormatELF() && !isOSNetBSD();
   }
 
   // ARM EABI is the bare-metal EABI described in ARM ABI documents and
diff --git a/llvm/test/CodeGen/ARM/eh-resume.ll b/llvm/test/CodeGen/ARM/eh-resume.ll
index 53f444e7ba6f..a9bcf38d1031 100644
--- a/llvm/test/CodeGen/ARM/eh-resume.ll
+++ b/llvm/test/CodeGen/ARM/eh-resume.ll
@@ -4,6 +4,8 @@
 ; RUN: llc < %s -mtriple=armv7k-apple-watchos -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=WATCHABI
 ; RUN: llc < %s -mtriple=armv7-none-gnueabihf -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=EABI
 ; RUN: llc < %s -mtriple=armv7-none-none -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=ABI
+; RUN: llc < %s -mtriple=armv7-netbsd-none -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=NETBSD
+; RUN: llc < %s -mtriple=armv7-netbsd-eabihf -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=NETBSD
 
 declare void @func()
 
@@ -27,3 +29,4 @@ lpad:
 ; WATCHABI: __Unwind_Resume
 ; EABI: __cxa_end_cleanup
 ; ABI: _Unwind_Resume
+; NETBSD: _Unwind_Resume
diff --git a/llvm/test/CodeGen/ARM/ehabi.ll b/llvm/test/CodeGen/ARM/ehabi.ll
index d1a4e9a6bcca..0e5f44793a95 100644
--- a/llvm/test/CodeGen/ARM/ehabi.ll
+++ b/llvm/test/CodeGen/ARM/ehabi.ll
@@ -258,31 +258,34 @@ declare void @_ZSt9terminatev()
 ; DWARF-V7-FP:    .cfi_startproc
 ; DWARF-V7-FP:    .cfi_personality 0, __gxx_personality_v0
 ; DWARF-V7-FP:    .cfi_lsda 0, .Lexception0
-; DWARF-V7-FP:    push {r11, lr}
-; DWARF-V7-FP:    .cfi_def_cfa_offset 8
+; DWARF-V7-FP:    push {r4, r10, r11, lr}
+; DWARF-V7-FP:    .cfi_def_cfa_offset 16
 ; DWARF-V7-FP:    .cfi_offset lr, -4
 ; DWARF-V7-FP:    .cfi_offset r11, -8
-; DWARF-V7-FP:    mov r11, sp
-; DWARF-V7-FP:    .cfi_def_cfa_register r11
+; DWARF-V7-FP:    .cfi_offset r10, -12
+; DWARF-V7-FP:    .cfi_offset r4, -16
+; DWARF-V7-FP:    add r11, sp, #8
+; DWARF-V7-FP:    .cfi_def_cfa r11, 8
 ; DWARF-V7-FP:    vpush {d8, d9, d10, d11, d12}
-; DWARF-V7-FP:    .cfi_offset d12, -16
-; DWARF-V7-FP:    .cfi_offset d11, -24
-; DWARF-V7-FP:    .cfi_offset d10, -32
-; DWARF-V7-FP:    .cfi_offset d9, -40
+; DWARF-V7-FP:    .cfi_offset d12, -24
+; DWARF-V7-FP:    .cfi_offset d11, -32
+; DWARF-V7-FP:    .cfi_offset d10, -40
+; DWARF-V7-FP:    .cfi_offset d9, -48
+; DWARF-V7-FP:    .cfi_offset d8, -56
 ; DWARF-V7-FP:    sub sp, sp, #24
-; DWARF-V7-FP:    sub sp, r11, #40
+; DWARF-V7-FP:    sub sp, r11, #48
 ; DWARF-V7-FP:    vpop {d8, d9, d10, d11, d12}
-; DWARF-V7-FP:    pop {r11, pc}
+; DWARF-V7-FP:    pop {r4, r10, r11, pc}
 ; DWARF-V7-FP:    .cfi_endproc
 
 ; DWARF-V7-FP-ELIM-LABEL: _Z4testiiiiiddddd:
 ; DWARF-V7-FP-ELIM:    .cfi_startproc
 ; DWARF-V7-FP-ELIM:    .cfi_personality 0, __gxx_personality_v0
 ; DWARF-V7-FP-ELIM:    .cfi_lsda 0, .Lexception0
-; DWARF-V7-FP-ELIM:    push {r11, lr}
+; DWARF-V7-FP-ELIM:    push {r4, lr}
 ; DWARF-V7-FP-ELIM:    .cfi_def_cfa_offset 8
 ; DWARF-V7-FP-ELIM:    .cfi_offset lr, -4
-; DWARF-V7-FP-ELIM:    .cfi_offset r11, -8
+; DWARF-V7-FP-ELIM:    .cfi_offset r4, -8
 ; DWARF-V7-FP-ELIM:    vpush {d8, d9, d10, d11, d12}
 ; DWARF-V7-FP-ELIM:    .cfi_offset d12, -16
 ; DWARF-V7-FP-ELIM:    .cfi_offset d11, -24
@@ -292,7 +295,7 @@ declare void @_ZSt9terminatev()
 ; DWARF-V7-FP-ELIM:    .cfi_def_cfa_offset 72
 ; DWARF-V7-FP-ELIM:    add sp, sp, #24
 ; DWARF-V7-FP-ELIM:    vpop {d8, d9, d10, d11, d12}
-; DWARF-V7-FP-ELIM:    pop {r11, pc}
+; DWARF-V7-FP-ELIM:    pop {r4, pc}
 ; DWARF-V7-FP-ELIM:    .cfi_endproc
 
 ; DWARF-WIN-FP-ELIM-LABEL: _Z4testiiiiiddddd:

From 1bc63265afbe007b9ec023e35a10fa0a8143e95b Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Mon, 23 Jun 2025 17:58:17 +0100
Subject: [PATCH 1312/1322] [lldb][DWARFASTParserClang] Support constant index
 encoding of DW_AT_object_pointer (#144998)

Starting with https://github.com/llvm/llvm-project/pull/124790, Clang
emits `DW_AT_object_pointer` encoded as integer constants rather than
DIE references. This patch accounts for this.

Depends on https://github.com/llvm/llvm-project/pull/145328 and
https://github.com/llvm/llvm-project/pull/145126
---
 .../SymbolFile/DWARF/DWARFASTParserClang.cpp  |  51 +++--
 .../x86/explicit-member-function-quals.cpp    |  13 +-
 .../DWARF/DWARFASTParserClangTests.cpp        | 194 ++++++++++++++++++
 3 files changed, 241 insertions(+), 17 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index d3912ad55a23..c76d67b47b33 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -167,9 +167,44 @@ DWARFASTParserClang::GetObjectParameter(const DWARFDIE &subprogram,
          subprogram.Tag() == DW_TAG_inlined_subroutine ||
          subprogram.Tag() == DW_TAG_subroutine_type);
 
-  if (DWARFDIE object_parameter =
-          subprogram.GetAttributeValueAsReferenceDIE(DW_AT_object_pointer))
-    return object_parameter;
+  // The DW_AT_object_pointer may be either encoded as a reference to a DIE,
+  // in which case that's the object parameter we want. Or it can be a constant
+  // index of the parameter.
+  std::optional<size_t> object_pointer_index;
+  DWARFFormValue form_value;
+  if (subprogram.GetDIE()->GetAttributeValue(
+          subprogram.GetCU(), DW_AT_object_pointer, form_value,
+          /*end_attr_offset_ptr=*/nullptr, /*check_elaborating_dies=*/true)) {
+    if (auto ref = form_value.Reference())
+      return ref;
+
+    object_pointer_index = form_value.Unsigned();
+  }
+
+  // Try to find the DW_TAG_formal_parameter via object_pointer_index.
+  DWARFDIE object_pointer;
+  size_t param_index = 0;
+  for (const auto &child : subprogram.children()) {
+    if (child.Tag() != DW_TAG_formal_parameter)
+      continue;
+
+    if (param_index == object_pointer_index.value_or(0)) {
+      object_pointer = child;
+      break;
+    }
+
+    ++param_index;
+  }
+
+  // No formal parameter found for object pointer index.
+  // Nothing to be done.
+  if (!object_pointer)
+    return {};
+
+  // We found the object pointer encoded via DW_AT_object_pointer.
+  // No need for the remaining heuristics.
+  if (object_pointer_index)
+    return object_pointer;
 
   // If no DW_AT_object_pointer was specified, assume the implicit object
   // parameter is the first parameter to the function, is called "this" and is
@@ -178,16 +213,6 @@ DWARFASTParserClang::GetObjectParameter(const DWARFDIE &subprogram,
   if (!decl_ctx_die.IsStructUnionOrClass())
     return {};
 
-  auto children = subprogram.children();
-  auto it = llvm::find_if(children, [](const DWARFDIE &child) {
-    return child.Tag() == DW_TAG_formal_parameter;
-  });
-
-  if (it == children.end())
-    return {};
-
-  DWARFDIE object_pointer = *it;
-
   if (!object_pointer.GetAttributeValueAsUnsigned(DW_AT_artificial, 0))
     return {};
 
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp
index 33001db69f83..c592122e7a97 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/explicit-member-function-quals.cpp
@@ -1,4 +1,9 @@
 // XFAIL: *
+//
+// FIXME: Explicit object parameter is not shown in
+// type lookup output. This is because we don't attach
+// valid source locations to decls in the DWARF AST,
+// so the ParmVarDecl::isExplicitObjectParameter fails.
 
 // Tests that we correctly deduce the CV-quals and storage
 // class of explicit object member functions.
@@ -8,10 +13,10 @@
 //
 // CHECK:      (lldb) type lookup Foo
 // CHECK-NEXT: struct Foo {
-// CHECK-NEXT:      void Method(Foo);
-// CHECK-NEXT:      void cMethod(const Foo &) const;
-// CHECK-NEXT:      void vMethod(volatile Foo &) volatile;
-// CHECK-NEXT:      void cvMethod(const volatile Foo &) const volatile;
+// CHECK-NEXT:      void Method(this Foo);
+// CHECK-NEXT:      void cMethod(this const Foo &) const;
+// CHECK-NEXT:      void vMethod(this volatile Foo &) volatile;
+// CHECK-NEXT:      void cvMethod(this const volatile Foo &) const volatile;
 // CHECK-NEXT: }
 
 struct Foo {
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
index fa05cd174fc7..d608a5738209 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
@@ -1423,3 +1423,197 @@ DWARF:
   EXPECT_EQ(func->getParamDecl(1)->getDeclContext(), func);
   EXPECT_EQ(func->getParamDecl(1)->getName(), "namedParam");
 }
+
+TEST_F(DWARFASTParserClangTests, TestObjectPointer_IndexEncoding) {
+  // This tests the behaviour of DWARFASTParserClang
+  // for DW_TAG_subprogram definitions which have a DW_AT_object_pointer
+  // that encodes a constant index (instead of a DIE reference).
+
+  const char *yamldata = R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_AARCH64
+DWARF:
+  debug_str:
+    - Context
+    - func
+    - this
+    - self
+    - arg
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_data2
+        - Code:            0x2
+          Tag:             DW_TAG_structure_type
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+        - Code:            0x3
+          Tag:             DW_TAG_subprogram
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_declaration
+              Form:            DW_FORM_flag_present
+            - Attribute:       DW_AT_object_pointer
+              Form:            DW_FORM_implicit_const
+              Value:           1
+            - Attribute:       DW_AT_external
+              Form:            DW_FORM_flag_present
+        - Code:            0x4
+          Tag:             DW_TAG_subprogram
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_declaration
+              Form:            DW_FORM_flag_present
+            - Attribute:       DW_AT_object_pointer
+              Form:            DW_FORM_implicit_const
+              Value:           0
+            - Attribute:       DW_AT_external
+              Form:            DW_FORM_flag_present
+
+        - Code:            0x5
+          Tag:             DW_TAG_formal_parameter
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+
+        - Code:            0x6
+          Tag:             DW_TAG_formal_parameter
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_artificial
+              Form:            DW_FORM_flag_present
+
+  debug_info:
+     - Version:  5
+       UnitType: DW_UT_compile
+       AddrSize: 8
+       Entries:
+
+# DW_TAG_compile_unit
+#   DW_AT_language [DW_FORM_data2]    (DW_LANG_C_plus_plus)
+
+        - AbbrCode: 0x1
+          Values:
+            - Value: 0x04
+
+#   DW_TAG_structure_type
+#     DW_AT_name [DW_FORM_strp] ("Context")
+
+        - AbbrCode: 0x2
+          Values:
+            - Value: 0x0
+
+#     DW_TAG_subprogram
+#       DW_AT_name [DW_FORM_strp] ("func")
+#       DW_AT_object_pointer [DW_FORM_implicit_const] (1)
+        - AbbrCode: 0x3
+          Values:
+            - Value: 0x8
+            - Value: 0x1
+            - Value: 0x1
+            - Value: 0x1
+
+#       DW_TAG_formal_parameter
+#         DW_AT_name [DW_FORM_strp] ("arg")
+        - AbbrCode: 0x5
+          Values:
+          - Value: 0x17
+
+#       DW_TAG_formal_parameter
+#         DW_AT_name [DW_FORM_strp] ("self")
+#         DW_AT_artificial
+        - AbbrCode: 0x6
+          Values:
+          - Value: 0x12
+          - Value: 0x1
+
+        - AbbrCode: 0x0
+
+#     DW_TAG_subprogram
+#       DW_AT_object_pointer [DW_FORM_implicit_const] (0)
+#       DW_AT_name [DW_FORM_strp] ("func")
+        - AbbrCode:        0x4
+          Values:
+            - Value: 0x8
+            - Value: 0x1
+            - Value: 0x1
+            - Value: 0x1
+
+#       DW_TAG_formal_parameter
+#         DW_AT_name [DW_FORM_strp] ("this")
+#         DW_AT_artificial
+        - AbbrCode:        0x6
+          Values:
+            - Value:           0xd
+            - Value:           0x1
+
+#       DW_TAG_formal_parameter
+#         DW_AT_name [DW_FORM_strp] ("arg")
+        - AbbrCode: 0x5
+          Values:
+          - Value: 0x17
+
+        - AbbrCode: 0x0
+        - AbbrCode: 0x0
+...
+)";
+
+  YAMLModuleTester t(yamldata);
+
+  DWARFUnit *unit = t.GetDwarfUnit();
+  ASSERT_NE(unit, nullptr);
+  const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE();
+  ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit);
+  ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus);
+  DWARFDIE cu_die(unit, cu_entry);
+
+  auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("ast");
+  auto &ast_ctx = *holder->GetAST();
+  DWARFASTParserClangStub ast_parser(ast_ctx);
+
+  auto context_die = cu_die.GetFirstChild();
+  ASSERT_TRUE(context_die.IsValid());
+  ASSERT_EQ(context_die.Tag(), DW_TAG_structure_type);
+
+  auto sub1 = context_die.GetFirstChild();
+  ASSERT_TRUE(sub1.IsValid());
+  ASSERT_EQ(sub1.Tag(), DW_TAG_subprogram);
+
+  auto sub2 = sub1.GetSibling();
+  ASSERT_TRUE(sub2.IsValid());
+  ASSERT_EQ(sub2.Tag(), DW_TAG_subprogram);
+
+  // Object parameter is at constant index 1
+  {
+    auto param_die = sub1.GetFirstChild().GetSibling();
+    ASSERT_TRUE(param_die.IsValid());
+
+    EXPECT_EQ(param_die, ast_parser.GetObjectParameter(sub1, context_die));
+  }
+
+  // Object parameter is at constant index 0
+  {
+    auto param_die = sub2.GetFirstChild();
+    ASSERT_TRUE(param_die.IsValid());
+
+    EXPECT_EQ(param_die, ast_parser.GetObjectParameter(sub2, context_die));
+  }
+}

From 8d83d046376e7b57c1aa0c5bdd8958b21bbaf0ca Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Mon, 23 Jun 2025 18:02:58 +0100
Subject: [PATCH 1313/1322] [lldb] add plugin names to process save-core error
 output. (#143126)

continuation of
[#142684](https://github.com/llvm/llvm-project/pull/142684) to show
plugin names.

From issue [#14258](https://github.com/llvm/llvm-project/issues/142581)
---
 lldb/include/lldb/Core/PluginManager.h        |  2 ++
 lldb/source/Commands/CommandObjectProcess.cpp | 24 ++++++++++++++++-
 lldb/source/Core/PluginManager.cpp            | 27 +++++++++++++++----
 lldb/source/Symbol/SaveCoreOptions.cpp        | 17 +++++++++---
 .../process_save_core/TestProcessSaveCore.py  | 18 +++++++++++++
 ...ommand-process-save-core-not-a-plugin.test |  9 +++++--
 6 files changed, 86 insertions(+), 11 deletions(-)

diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index d1af25988e50..5499e99025d8 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -264,6 +264,8 @@ public:
   static Status SaveCore(const lldb::ProcessSP &process_sp,
                          lldb_private::SaveCoreOptions &core_options);
 
+  static std::vector<llvm::StringRef> GetSaveCorePluginNames();
+
   // ObjectContainer
   static bool RegisterPlugin(
       llvm::StringRef name, llvm::StringRef description,
diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp
index b1f243c9e277..0a1744277d7d 100644
--- a/lldb/source/Commands/CommandObjectProcess.cpp
+++ b/lldb/source/Commands/CommandObjectProcess.cpp
@@ -1281,7 +1281,27 @@ public:
     ~CommandOptions() override = default;
 
     llvm::ArrayRef<OptionDefinition> GetDefinitions() override {
-      return llvm::ArrayRef(g_process_save_core_options);
+      if (!m_opt_def.empty())
+        return llvm::ArrayRef(m_opt_def);
+
+      auto orig = llvm::ArrayRef(g_process_save_core_options);
+      m_opt_def.resize(orig.size());
+      llvm::copy(g_process_save_core_options, m_opt_def.data());
+      for (OptionDefinition &value : m_opt_def) {
+        llvm::StringRef opt_name = value.long_option;
+        if (opt_name != "plugin-name")
+          continue;
+
+        std::vector<llvm::StringRef> plugin_names =
+            PluginManager::GetSaveCorePluginNames();
+        m_plugin_enums.resize(plugin_names.size());
+        for (auto [num, val] : llvm::zip(plugin_names, m_plugin_enums)) {
+          val.string_value = num.data();
+        }
+        value.enum_values = llvm::ArrayRef(m_plugin_enums);
+        break;
+      }
+      return llvm::ArrayRef(m_opt_def);
     }
 
     Status SetOptionValue(uint32_t option_idx, llvm::StringRef option_arg,
@@ -1312,6 +1332,8 @@ public:
 
     // Instance variables to hold the values for command options.
     SaveCoreOptions m_core_dump_options;
+    llvm::SmallVector<OptionEnumValueElement> m_plugin_enums;
+    std::vector<OptionDefinition> m_opt_def;
   };
 
 protected:
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index dc0731c04eef..3f20a96edc18 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -989,11 +989,28 @@ Status PluginManager::SaveCore(const lldb::ProcessSP &process_sp,
   }
 
   // Check to see if any of the object file plugins tried and failed to save.
-  // If none ran, set the error message.
-  if (error.Success())
-    error = Status::FromErrorString(
-        "no ObjectFile plugins were able to save a core for this process");
-  return error;
+  // if any failure, return the error message.
+  if (error.Fail())
+    return error;
+
+  // Report only for the plugin that was specified.
+  if (!plugin_name.empty())
+    return Status::FromErrorStringWithFormatv(
+        "The \"{}\" plugin is not able to save a core for this process.",
+        plugin_name);
+
+  return Status::FromErrorString(
+      "no ObjectFile plugins were able to save a core for this process");
+}
+
+std::vector<llvm::StringRef> PluginManager::GetSaveCorePluginNames() {
+  std::vector<llvm::StringRef> plugin_names;
+  auto instances = GetObjectFileInstances().GetSnapshot();
+  for (auto &instance : instances) {
+    if (instance.save_core)
+      plugin_names.emplace_back(instance.name);
+  }
+  return plugin_names;
 }
 
 #pragma mark ObjectContainer
diff --git a/lldb/source/Symbol/SaveCoreOptions.cpp b/lldb/source/Symbol/SaveCoreOptions.cpp
index d884b00a47b0..0f9dbb73c172 100644
--- a/lldb/source/Symbol/SaveCoreOptions.cpp
+++ b/lldb/source/Symbol/SaveCoreOptions.cpp
@@ -21,9 +21,20 @@ Status SaveCoreOptions::SetPluginName(const char *name) {
     return error;
   }
 
-  if (!PluginManager::IsRegisteredObjectFilePluginName(name)) {
-    return Status::FromErrorStringWithFormat(
-        "plugin name '%s' is not a valid ObjectFile plugin name", name);
+  std::vector<llvm::StringRef> plugin_names =
+      PluginManager::GetSaveCorePluginNames();
+  if (llvm::find(plugin_names, name) == plugin_names.end()) {
+    StreamString stream;
+    stream.Printf("plugin name '%s' is not a valid ObjectFile plugin name.",
+                  name);
+
+    if (!plugin_names.empty()) {
+      stream.PutCString(" Valid names are: ");
+      std::string plugin_names_str = llvm::join(plugin_names, ", ");
+      stream.PutCString(plugin_names_str);
+      stream.PutChar('.');
+    }
+    return Status(stream.GetString().str());
   }
 
   m_plugin_name = name;
diff --git a/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py b/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py
index 8573d1573392..cf7bd9787d64 100644
--- a/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py
+++ b/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py
@@ -88,3 +88,21 @@ class ProcessSaveCoreTestCase(TestBase):
                 os.unlink(core)
             except OSError:
                 pass
+
+    def test_help(self):
+        """Test that help shows an option in plugin-names and style."""
+        self.expect(
+            "help process save-core",
+            substrs=["process save-core", "<plugin>", "Values:", "minidump"],
+        )
+
+        self.expect(
+            "help process save-core",
+            substrs=[
+                "process save-core",
+                "<corefile-style>",
+                "Values:",
+                "full",
+                "stack",
+            ],
+        )
diff --git a/lldb/test/Shell/Commands/command-process-save-core-not-a-plugin.test b/lldb/test/Shell/Commands/command-process-save-core-not-a-plugin.test
index c034c8ebbf87..e6afda1474b1 100644
--- a/lldb/test/Shell/Commands/command-process-save-core-not-a-plugin.test
+++ b/lldb/test/Shell/Commands/command-process-save-core-not-a-plugin.test
@@ -2,7 +2,7 @@
 # with a plugin that does not exist.
 
 # RUN: %clang_host -g %S/Inputs/main.c -o %t
-# RUN: %lldb %t -s %s -o exit 2>&1 | FileCheck %s
+# RUN: %lldb %t -o "settings set interpreter.stop-command-source-on-error false" -s %s -o exit 2>&1 | FileCheck %s
 
 b main
 # CHECK-LABEL: b main
@@ -14,6 +14,11 @@ run
 # CHECK: stop reason = breakpoint 1
 # CHECK:   frame #0: {{.*}}`main at main.c
 
+process save-core --plugin-name=minidump
+# CHECK-LABEL: process save-core --plugin-name=minidump
+# CHECK: error: 'process save-core' takes one arguments:
+# CHECK: Usage: {{.*}} FILE
+
 process save-core --plugin-name=notaplugin dump
 # CHECK-LABEL: process save-core --plugin-name=notaplugin dump
-# CHECK: error: plugin name 'notaplugin' is not a valid ObjectFile plugin name
+# CHECK: error: plugin name 'notaplugin' is not a valid ObjectFile plugin name. Valid names are:{{.*}}minidump{{.*}}

From f40909f605fdd7c049d50b6483db9e769fb933c0 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Mon, 23 Jun 2025 10:06:53 -0700
Subject: [PATCH 1314/1322] [RISCV] Add SiFive X390 scheduling model (#143938)

This patch adds the scheduling model for sifive-x390. X390 is a dual
issue in-order CPU. It has two scalar and two vector pipes, with
VLEN=1024 and DLEN=512.

Co-authored-by: Michael Maitland <michaeltmaitland@gmail.com>
---
 llvm/lib/Target/RISCV/RISCVProcessors.td      |    3 +-
 llvm/lib/Target/RISCV/RISCVSchedSiFive7.td    |  395 +-
 .../llvm-mca/RISCV/SiFiveX390/div-fdiv.s      |   54 +
 .../RISCV/SiFiveX390/fractional-lmul-data.s   |   62 +
 .../llvm-mca/RISCV/SiFiveX390/reductions.s    |  678 +++
 .../RISCV/SiFiveX390/strided-load-store.s     |  368 ++
 .../RISCV/SiFiveX390/strided-load-x0.s        |  132 +
 .../llvm-mca/RISCV/SiFiveX390/vector-fp.s     | 4851 +++++++++++++++++
 .../SiFiveX390/vector-integer-arithmetic.s    | 2272 ++++++++
 .../RISCV/SiFiveX390/vgather-vcompress.s      |  317 ++
 .../tools/llvm-mca/RISCV/SiFiveX390/vle-vse.s | 1256 +++++
 11 files changed, 10225 insertions(+), 163 deletions(-)
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFiveX390/div-fdiv.s
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFiveX390/fractional-lmul-data.s
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFiveX390/reductions.s
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFiveX390/strided-load-store.s
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFiveX390/strided-load-x0.s
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-integer-arithmetic.s
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vle-vse.s

diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index d7e6c71ea062..a28761814c2a 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -292,7 +292,8 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
                                        FeatureStdExtZbb],
                                       SiFiveIntelligenceTuneFeatures>;
 
-def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390", NoSchedModel,
+def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390",
+                                      SiFiveX390Model,
                                       [Feature64Bit,
                                        FeatureStdExtI,
                                        FeatureStdExtM,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 071b64571fe3..78a176fcf18d 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -169,6 +169,12 @@ class SiFive7GetOrderedReductionCycles<string mx, int sew, int VLEN> {
   int c = !mul(6, VLUpperBound);
 }
 
+class SiFive7FPLatencies {
+  int BasicFP16ALU;
+  int BasicFP32ALU;
+  int BasicFP64ALU;
+}
+
 class SiFive7AnyToGPRBypass<SchedRead read, int cycles = 2>
     : ReadAdvance<read, cycles, [WriteIALU, WriteIALU32,
                                  WriteShiftImm, WriteShiftImm32,
@@ -186,12 +192,13 @@ class SiFive7AnyToGPRBypass<SchedRead read, int cycles = 2>
                                  WriteIRem, WriteIRem32,
                                  WriteLDB, WriteLDH, WriteLDW, WriteLDD]>;
 
-// The SiFive7 microarchitecture has three pipelines: A, B, V.
+// The SiFive7 microarchitecture has three kinds of pipelines: A, B, V.
 // Pipe A can handle memory, integer alu and vector operations.
 // Pipe B can handle integer alu, control flow, integer multiply and divide,
 // and floating point computation.
-// The V pipeline is modeled by the VCQ, VA, VL, and VS resources.
-multiclass SiFive7ProcResources {
+// The V pipeline is modeled by the VCQ, VA, VL, and VS resources. There can
+// be one or two VA (Vector Arithmetic).
+multiclass SiFive7ProcResources<bit extraVALU = false> {
   let BufferSize = 0 in {
     def PipeA     : ProcResource<1>;
     def PipeB     : ProcResource<1>;
@@ -199,7 +206,15 @@ multiclass SiFive7ProcResources {
     def IDiv      : ProcResource<1>; // Int Division
     def FDiv      : ProcResource<1>; // FP Division/Sqrt
 
-    def VA      : ProcResource<1>; // Arithmetic sequencer
+    // Arithmetic sequencer(s)
+    if extraVALU then {
+      // VA1 can handle any vector airthmetic instruction.
+      def VA1     : ProcResource<1>;
+      // VA2 generally can only handle simple vector arithmetic.
+      def VA2     : ProcResource<1>;
+    } else {
+      def VA      : ProcResource<1>;
+    }
 
     def VL        : ProcResource<1>; // Load sequencer
     def VS        : ProcResource<1>; // Store sequencer
@@ -217,13 +232,20 @@ multiclass SiFive7ProcResources {
 
   def PipeAB : ProcResGroup<[!cast<ProcResource>(NAME#"PipeA"),
                              !cast<ProcResource>(NAME#"PipeB")]>;
+
+  if extraVALU then
+  def VA1OrVA2 : ProcResGroup<[!cast<ProcResource>(NAME#"VA1"),
+                               !cast<ProcResource>(NAME#"VA2")]>;
 }
 
 multiclass SiFive7WriteResBase<int VLEN,
     ProcResourceKind PipeA, ProcResourceKind PipeB, ProcResourceKind PipeAB,
     ProcResourceKind IDiv, ProcResourceKind FDiv,
-    ProcResourceKind VA, ProcResourceKind VL, ProcResourceKind VS,
-    ProcResourceKind VCQ> {
+    ProcResourceKind VA1, ProcResourceKind VA1OrVA2,
+    ProcResourceKind VL, ProcResourceKind VS,
+    ProcResourceKind VCQ,
+    SiFive7FPLatencies fpLatencies,
+    bit isFP64Throttled = false> {
 
   // Branching
   let Latency = 3 in {
@@ -350,7 +372,7 @@ multiclass SiFive7WriteResBase<int VLEN,
   }
 
   // Half precision.
-  let Latency = 5 in {
+  let Latency = fpLatencies.BasicFP16ALU in {
   def : WriteRes<WriteFAdd16, [PipeB]>;
   def : WriteRes<WriteFMul16, [PipeB]>;
   def : WriteRes<WriteFMA16, [PipeB]>;
@@ -366,7 +388,7 @@ multiclass SiFive7WriteResBase<int VLEN,
   }
 
   // Single precision.
-  let Latency = 5 in {
+  let Latency = fpLatencies.BasicFP32ALU in {
     def : WriteRes<WriteFAdd32, [PipeB]>;
     def : WriteRes<WriteFMul32, [PipeB]>;
     def : WriteRes<WriteFMA32, [PipeB]>;
@@ -386,7 +408,7 @@ multiclass SiFive7WriteResBase<int VLEN,
   }
 
   // Double precision
-  let Latency = 7 in {
+  let Latency = fpLatencies.BasicFP64ALU in {
     def : WriteRes<WriteFAdd64, [PipeB]>;
     def : WriteRes<WriteFMul64, [PipeB]>;
     def : WriteRes<WriteFMA64, [PipeB]>;
@@ -624,43 +646,43 @@ multiclass SiFive7WriteResBase<int VLEN,
     defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
     let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVIALUV",     [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIALUX",     [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIALUI",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIALUV",     [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIALUX",     [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIALUI",     [VCQ, VA1OrVA2], mx, IsWorstCase>;
       // vmadc requires mask
-      defm : LMULWriteResMX<"WriteVICALUV",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVICALUX",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVICALUI",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVICALUMV",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVICALUMX",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVICALUMI",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUV",    [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUX",    [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUI",    [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUMV",   [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUMX",   [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICALUMI",   [VCQ, VA1], mx, IsWorstCase>;
       // min max require merge
-      defm : LMULWriteResMX<"WriteVIMinMaxV",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMinMaxX",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMergeV",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMergeX",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMergeI",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMinMaxV",  [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMinMaxX",  [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMergeV",   [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMergeX",   [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMergeI",   [VCQ, VA1], mx, IsWorstCase>;
 
-      defm : LMULWriteResMX<"WriteVIMovV",     [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMovX",     [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMovI",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMovV",     [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMovX",     [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMovI",     [VCQ, VA1OrVA2], mx, IsWorstCase>;
 
-      defm : LMULWriteResMX<"WriteVExtV",      [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVExtV",      [VCQ, VA1], mx, IsWorstCase>;
     }
     let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVShiftV",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVShiftX",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVShiftI",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMulV",     [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMulX",     [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMulAddV",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIMulAddX",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVShiftV",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVShiftX",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVShiftI",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMulV",     [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMulX",     [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMulAddV",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIMulAddX",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
     }
     // Mask results can't chain.
     let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVICmpV",     [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVICmpX",     [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVICmpI",     [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICmpV",     [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICmpX",     [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVICmpI",     [VCQ, VA1], mx, IsWorstCase>;
     }
   }
 
@@ -670,8 +692,8 @@ multiclass SiFive7WriteResBase<int VLEN,
                            !div(SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, 4));
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
       let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm : LMULSEWWriteResMXSEW<"WriteVIDivV", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVIDivX", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVIDivV", [VCQ, VA1], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVIDivX", [VCQ, VA1], mx, sew, IsWorstCase>;
       }
     }
   }
@@ -681,13 +703,13 @@ multiclass SiFive7WriteResBase<int VLEN,
     defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
     let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVIWALUV",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIWALUX",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIWALUI",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIWMulV",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIWMulX",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIWMulAddV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIWMulAddX", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWALUV",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWALUX",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWALUI",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWMulV",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWMulX",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWMulAddV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIWMulAddX", [VCQ, VA1OrVA2], mx, IsWorstCase>;
     }
   }
   // Narrowing
@@ -695,9 +717,9 @@ multiclass SiFive7WriteResBase<int VLEN,
     defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
     let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVNShiftV",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVNShiftX",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVNShiftI",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNShiftV",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNShiftX",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNShiftI",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
     }
   }
 
@@ -706,16 +728,16 @@ multiclass SiFive7WriteResBase<int VLEN,
     defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
     let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVSALUV",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVSALUX",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVSALUI",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVAALUV",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVAALUX",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVSMulV",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVSMulX",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVSShiftV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVSShiftX", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVSShiftI", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSALUV",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSALUX",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSALUI",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVAALUV",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVAALUX",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSMulV",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSMulX",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSShiftV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSShiftX", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSShiftI", [VCQ, VA1OrVA2], mx, IsWorstCase>;
     }
   }
   // Narrowing
@@ -723,33 +745,38 @@ multiclass SiFive7WriteResBase<int VLEN,
     defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
     let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVNClipV",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVNClipX",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVNClipI",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNClipV",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNClipX",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVNClipI",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
     }
   }
 
   // 13. Vector Floating-Point Instructions
   foreach mx = SchedMxListF in {
     foreach sew = SchedSEWSet<mx, isF=1>.val in {
-      defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+      defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 64)),
+                          SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
+                          SiFive7GetCyclesDefault<mx>.c);
+      defvar Lat8 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 8);
+      defvar VA = !if(!and(isFP64Throttled, !eq(sew, 64)), VA1, VA1OrVA2);
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
-      let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      let Latency = Lat8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
         defm : LMULSEWWriteResMXSEW<"WriteVFALUV",  [VCQ, VA], mx, sew, IsWorstCase>;
         defm : LMULSEWWriteResMXSEW<"WriteVFALUF",  [VCQ, VA], mx, sew, IsWorstCase>;
         defm : LMULSEWWriteResMXSEW<"WriteVFMulV",  [VCQ, VA], mx, sew, IsWorstCase>;
         defm : LMULSEWWriteResMXSEW<"WriteVFMulF",  [VCQ, VA], mx, sew, IsWorstCase>;
         defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>;
         defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFRecpV",   [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFRecpV",   [VCQ, VA1], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
       }
-      let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+      defvar Lat4 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 4);
+      let Latency = Lat4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
         defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV",   [VCQ, VA], mx, sew, IsWorstCase>;
         defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF",   [VCQ, VA], mx, sew, IsWorstCase>;
         // min max require merge
-        defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>;
       }
     }
   }
@@ -757,18 +784,18 @@ multiclass SiFive7WriteResBase<int VLEN,
     defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
     let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVFCvtFToIV",  [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFCvtFToIV",  [VCQ, VA1], mx, IsWorstCase>;
     }
     let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVFClassV",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVFMergeV",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVFMovV",      [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFClassV",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFMergeV",    [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFMovV",      [VCQ, VA1OrVA2], mx, IsWorstCase>;
     }
     // Mask results can't chain.
     let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
       // fcmp requires mask
-      defm : LMULWriteResMX<"WriteVFCmpV",      [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVFCmpF",      [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFCmpV",      [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFCmpF",      [VCQ, VA1], mx, IsWorstCase>;
     }
   }
   foreach mx = SchedMxListF in {
@@ -777,9 +804,9 @@ multiclass SiFive7WriteResBase<int VLEN,
                            !div(SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c, 4));
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
       let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFDivV",  [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFDivF",  [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [VCQ, VA1], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFDivV",  [VCQ, VA1], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFDivF",  [VCQ, VA1], mx, sew, IsWorstCase>;
       }
     }
   }
@@ -787,10 +814,12 @@ multiclass SiFive7WriteResBase<int VLEN,
   // Widening
   foreach mx = SchedMxListW in {
     foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
-      defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+      defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
+                          SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
+                          SiFive7GetCyclesDefault<mx>.c);
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
       let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-      defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA], mx, sew, IsWorstCase>;
+      defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
     }
   }
   foreach mx = SchedMxListFW in {
@@ -798,37 +827,41 @@ multiclass SiFive7WriteResBase<int VLEN,
       defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
       let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFWMulF", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWMulF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
       }
-      defvar CvtCycles = SiFive7GetCyclesDefault<mx>.c;
+      defvar CvtCycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
+                          SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
+                          SiFive7GetCyclesDefault<mx>.c);
       let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
     }
     defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c;
     let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-    defm : LMULWriteResMX<"WriteVFWCvtFToIV", [VCQ, VA], mx, IsWorstCase>;
+    defm : LMULWriteResMX<"WriteVFWCvtFToIV", [VCQ, VA1], mx, IsWorstCase>;
   }
   // Narrowing
   foreach mx = SchedMxListW in {
     defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListW>.c;
     let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVFNCvtFToIV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFNCvtFToIV", [VCQ, VA1], mx, IsWorstCase>;
     }
   }
   foreach mx = SchedMxListFW in {
     foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
-      defvar Cycles = SiFive7GetCyclesNarrowing<mx>.c;
+      defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
+                          SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
+                          SiFive7GetCyclesNarrowing<mx>.c);
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
       let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
       }
     }
   }
@@ -839,9 +872,9 @@ multiclass SiFive7WriteResBase<int VLEN,
       defvar Cycles = SiFive7GetReductionCycles<mx, sew, VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
       let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [VCQ, VA],
+        defm : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [VCQ, VA1],
                                        mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [VCQ, VA],
+        defm : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [VCQ, VA1],
                                        mx, sew, IsWorstCase>;
       }
     }
@@ -852,7 +885,7 @@ multiclass SiFive7WriteResBase<int VLEN,
       defvar Cycles = SiFive7GetReductionCycles<mx, sew, VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
       let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
-      defm : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [VCQ, VA],
+      defm : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [VCQ, VA1],
                                      mx, sew, IsWorstCase>;
     }
   }
@@ -862,14 +895,14 @@ multiclass SiFive7WriteResBase<int VLEN,
       defvar RedCycles = SiFive7GetReductionCycles<mx, sew, VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
       let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in {
-        defm : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [VCQ, VA],
+        defm : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [VCQ, VA1],
                                        mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [VCQ, VA],
+        defm : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [VCQ, VA1],
                                        mx, sew, IsWorstCase>;
       }
       defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, VLEN>.c;
       let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
-      defm : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [VCQ, VA],
+      defm : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [VCQ, VA1],
                                      mx, sew, IsWorstCase>;
     }
   }
@@ -879,11 +912,11 @@ multiclass SiFive7WriteResBase<int VLEN,
       defvar RedCycles = SiFive7GetReductionCycles<mx, sew, VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
       let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in
-      defm : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [VCQ, VA],
+      defm : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [VCQ, VA1],
                                      mx, sew, IsWorstCase>;
       defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, VLEN>.c;
       let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
-      defm : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [VCQ, VA],
+      defm : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [VCQ, VA1],
                                      mx, sew, IsWorstCase>;
     }
   }
@@ -893,34 +926,34 @@ multiclass SiFive7WriteResBase<int VLEN,
     defvar Cycles = SiFive7GetCyclesVMask<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
     let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVMALUV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVMPopV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVMFFSV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVMSFSV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVMALUV", [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVMPopV", [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVMFFSV", [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVMSFSV", [VCQ, VA1], mx, IsWorstCase>;
     }
   }
   foreach mx = SchedMxList in {
     defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
     let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVIotaV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVIdxV", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIotaV", [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVIdxV", [VCQ, VA1], mx, IsWorstCase>;
     }
   }
 
   // 16. Vector Permutation Instructions
   let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 1)] in {
-    def : WriteRes<WriteVMovSX, [VCQ, VA]>;
-    def : WriteRes<WriteVMovXS, [VCQ, VA]>;
-    def : WriteRes<WriteVMovSF, [VCQ, VA]>;
-    def : WriteRes<WriteVMovFS, [VCQ, VA]>;
+    def : WriteRes<WriteVMovSX, [VCQ, VA1OrVA2]>;
+    def : WriteRes<WriteVMovXS, [VCQ, VA1]>;
+    def : WriteRes<WriteVMovSF, [VCQ, VA1OrVA2]>;
+    def : WriteRes<WriteVMovFS, [VCQ, VA1]>;
   }
   foreach mx = SchedMxList in {
     defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
     let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVRGatherVX",    [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVRGatherVI",    [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVRGatherVX",    [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVRGatherVI",    [VCQ, VA1], mx, IsWorstCase>;
     }
   }
 
@@ -929,9 +962,9 @@ multiclass SiFive7WriteResBase<int VLEN,
       defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
       let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>;
+        defm : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>;
       }
     }
   }
@@ -940,23 +973,23 @@ multiclass SiFive7WriteResBase<int VLEN,
     defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
     let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVSlideUpX",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVSlideDownX", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVSlideI",     [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVISlide1X",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVFSlide1F",   [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSlideUpX",   [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSlideDownX", [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVSlideI",     [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVISlide1X",   [VCQ, VA1], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVFSlide1F",   [VCQ, VA1], mx, IsWorstCase>;
     }
   }
 
   // VMov*V is LMUL Aware
   let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 2)] in
-    def : WriteRes<WriteVMov1V,     [VCQ, VA]>;
+    def : WriteRes<WriteVMov1V,     [VCQ, VA1OrVA2]>;
   let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 4)] in
-    def : WriteRes<WriteVMov2V,     [VCQ, VA]>;
+    def : WriteRes<WriteVMov2V,     [VCQ, VA1OrVA2]>;
   let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 8)] in
-    def : WriteRes<WriteVMov4V,     [VCQ, VA]>;
+    def : WriteRes<WriteVMov4V,     [VCQ, VA1OrVA2]>;
   let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 16)] in
-    def : WriteRes<WriteVMov8V,     [VCQ, VA]>;
+    def : WriteRes<WriteVMov8V,     [VCQ, VA1OrVA2]>;
 
   // Others
   def : WriteRes<WriteCSR, [PipeB]>;
@@ -982,37 +1015,37 @@ multiclass SiFive7WriteResBase<int VLEN,
     let Latency = Cycles,
         AcquireAtCycles = [0, 1],
         ReleaseAtCycles = [1, !add(1, Cycles)] in {
-      defm : LMULWriteResMX<"WriteVC_V_I",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_X",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_IV",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_VV",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_XV",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_IVV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_IVW", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_VVV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_VVW", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_XVV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_V_XVW", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_I",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_X",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_IV",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_VV",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_XV",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_IVV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_IVW", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_VVV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_VVW", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_XVV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_V_XVW", [VCQ, VA1OrVA2], mx, IsWorstCase>;
       foreach f = ["FPR16", "FPR32", "FPR64"] in {
-        defm : LMULWriteResMX<"WriteVC_V_" # f # "V",  [VCQ, VA], mx, IsWorstCase>;
-        defm : LMULWriteResMX<"WriteVC_V_" # f # "VV", [VCQ, VA], mx, IsWorstCase>;
-        defm : LMULWriteResMX<"WriteVC_V_" # f # "VW", [VCQ, VA], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_V_" # f # "V",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_V_" # f # "VV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_V_" # f # "VW", [VCQ, VA1OrVA2], mx, IsWorstCase>;
       }
-      defm : LMULWriteResMX<"WriteVC_I",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_X",   [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_IV",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_VV",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_XV",  [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_IVV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_IVW", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_VVV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_VVW", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_XVV", [VCQ, VA], mx, IsWorstCase>;
-      defm : LMULWriteResMX<"WriteVC_XVW", [VCQ, VA], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_I",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_X",   [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_IV",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_VV",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_XV",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_IVV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_IVW", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_VVV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_VVW", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_XVV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+      defm : LMULWriteResMX<"WriteVC_XVW", [VCQ, VA1OrVA2], mx, IsWorstCase>;
       foreach f = ["FPR16", "FPR32", "FPR64"] in {
-        defm : LMULWriteResMX<"WriteVC_" # f # "V",  [VCQ, VA], mx, IsWorstCase>;
-        defm : LMULWriteResMX<"WriteVC_" # f # "VV", [VCQ, VA], mx, IsWorstCase>;
-        defm : LMULWriteResMX<"WriteVC_" # f # "VW", [VCQ, VA], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_" # f # "V",  [VCQ, VA1OrVA2], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_" # f # "VV", [VCQ, VA1OrVA2], mx, IsWorstCase>;
+        defm : LMULWriteResMX<"WriteVC_" # f # "VW", [VCQ, VA1OrVA2], mx, IsWorstCase>;
       }
     }
   }
@@ -1308,8 +1341,10 @@ multiclass SiFive7ReadAdvance {
 /// This multiclass is a "bundle" of (1) processor resources (i.e. pipes) and
 /// (2) WriteRes entries. It's parameterized by config values that will
 /// eventually be supplied by different SchedMachineModels.
-multiclass SiFive7SchedResources<int vlen> {
-  defm SiFive7 : SiFive7ProcResources;
+multiclass SiFive7SchedResources<int vlen, bit extraVALU,
+                                 SiFive7FPLatencies fpLatencies,
+                                 bit isFP64Throttled> {
+  defm SiFive7 : SiFive7ProcResources<extraVALU>;
 
   // Pull out defs from SiFive7ProcResources so we can refer to them by name.
   defvar SiFive7PipeA = !cast<ProcResource>(NAME # SiFive7PipeA);
@@ -1317,6 +1352,13 @@ multiclass SiFive7SchedResources<int vlen> {
   defvar SiFive7PipeAB = !cast<ProcResGroup>(NAME # SiFive7PipeAB);
   defvar SiFive7IDiv = !cast<ProcResource>(NAME # SiFive7IDiv);
   defvar SiFive7FDiv = !cast<ProcResource>(NAME # SiFive7FDiv);
+  // Pass SiFive7VA for VA1 and VA1OrVA2 if there is only 1 VALU.
+  defvar SiFive7VA1 = !if (extraVALU,
+                            !cast<ProcResource>(NAME # SiFive7VA1),
+                            !cast<ProcResource>(NAME # SiFive7VA));
+  defvar SiFive7VA1OrVA2 = !if (extraVALU,
+                            !cast<ProcResGroup>(NAME # SiFive7VA1OrVA2),
+                            !cast<ProcResource>(NAME # SiFive7VA));
   defvar SiFive7VA = !cast<ProcResource>(NAME # SiFive7VA);
   defvar SiFive7VL = !cast<ProcResource>(NAME # SiFive7VL);
   defvar SiFive7VS = !cast<ProcResource>(NAME # SiFive7VS);
@@ -1326,9 +1368,9 @@ multiclass SiFive7SchedResources<int vlen> {
   // SchedModels.
   defm SiFive7
       : SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB,
-                            SiFive7IDiv, SiFive7FDiv,
-                            SiFive7VA, SiFive7VL, SiFive7VS,
-                            SiFive7VCQ>;
+                            SiFive7IDiv, SiFive7FDiv, SiFive7VA1,
+                            SiFive7VA1OrVA2, SiFive7VL, SiFive7VS,
+                            SiFive7VCQ, fpLatencies, isFP64Throttled>;
 
   //===----------------------------------------------------------------------===//
   // Bypass and advance
@@ -1357,17 +1399,46 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
                              HasStdExtZknh, HasStdExtZksed, HasStdExtZksh,
                              HasStdExtZkr];
   int VLEN = vlen;
+  bit HasExtraVALU = false;
+
+  SiFive7FPLatencies FPLatencies;
+  bit IsFP64Throttled = false;
 
   string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
 }
 
+/// Auxiliary config values.
+def SiFive7DefaultFPLatencies : SiFive7FPLatencies {
+  let BasicFP16ALU = 5;
+  let BasicFP32ALU = 5;
+  let BasicFP64ALU = 7;
+}
+
+def SiFive7LowFPLatencies : SiFive7FPLatencies {
+  let BasicFP16ALU = 4;
+  let BasicFP32ALU = 4;
+  let BasicFP64ALU = 4;
+}
+
 /// Models
-def SiFive7VLEN512Model : SiFive7SchedMachineModel<512>;
+def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> {
+  let FPLatencies = SiFive7DefaultFPLatencies;
+}
+
+def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
+  let HasExtraVALU = true;
+  let FPLatencies = SiFive7LowFPLatencies;
+  let IsFP64Throttled = true;
+}
 
 /// Binding models to their scheduling resources.
-let SchedModel = SiFive7VLEN512Model in
-defm !cast<string>(SiFive7VLEN512Model.Name)
-    : SiFive7SchedResources<SiFive7VLEN512Model.VLEN>;
+foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
+  let SchedModel = model in
+  defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
+                                          model.FPLatencies,
+                                          model.IsFP64Throttled>;
+}
 
 // Some model name aliases.
 defvar SiFive7Model = SiFive7VLEN512Model;
+defvar SiFiveX390Model = SiFive7VLEN1024X300Model;
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/div-fdiv.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/div-fdiv.s
new file mode 100644
index 000000000000..138f4023e448
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/div-fdiv.s
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x390 -instruction-tables=full -iterations=1 < %s \
+# RUN:   | FileCheck %s
+
+div a0, a1, a2
+fdiv.s f1, f2, f3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv:1
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv:1
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA:1
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeAB:2 VLEN1024X300SiFive7PipeA, VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7PipeB:1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA1:1
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VA1OrVA2:2 VLEN1024X300SiFive7VA1, VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VA2:1
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VCQ:1
+# CHECK-NEXT: [9]   - VLEN1024X300SiFive7VL:1
+# CHECK-NEXT: [10]  - VLEN1024X300SiFive7VS:1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
+# CHECK-NEXT:  1      66    65.00                        66    VLEN1024X300SiFive7IDiv[65],VLEN1024X300SiFive7PipeAB,VLEN1024X300SiFive7PipeB DIV div	a0, a1, a2
+# CHECK-NEXT:  1      27    26.00                        27    VLEN1024X300SiFive7FDiv[26],VLEN1024X300SiFive7PipeAB,VLEN1024X300SiFive7PipeB FDIV_S fdiv.s	ft1, ft2, ft3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7VA1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VCQ
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VL
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VS
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT: 26.00  65.00   -     2.00    -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -     65.00   -     1.00    -      -      -      -      -     div	a0, a1, a2
+# CHECK-NEXT: 26.00   -      -     1.00    -      -      -      -      -     fdiv.s	ft1, ft2, ft3
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/fractional-lmul-data.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/fractional-lmul-data.s
new file mode 100644
index 000000000000..3795f17fc1cd
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/fractional-lmul-data.s
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x390 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+# TODO: This test should be replaced by an exhaustive test of legal (LMUL, SEW)
+# pairs for all instructions in the Vector Integer Arithmetic chapter of the RVV
+# SPEC.
+vsetvli zero, zero, e32, mf2, tu, mu
+vdiv.vv v12, v12, v12
+vsetvli zero, zero, e8, mf8, tu, mu
+vdiv.vv v12, v12, v12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv:1
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv:1
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA:1
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeAB:2 VLEN1024X300SiFive7PipeA, VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7PipeB:1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA1:1
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VA1OrVA2:2 VLEN1024X300SiFive7VA1, VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VA2:1
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VCQ:1
+# CHECK-NEXT: [9]   - VLEN1024X300SiFive7VL:1
+# CHECK-NEXT: [10]  - VLEN1024X300SiFive7VS:1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      112   112.00                       112   VLEN1024X300SiFive7VA1[1,113],VLEN1024X300SiFive7VA1OrVA2[1,113],VLEN1024X300SiFive7VCQ VDIV_VV vdiv.vv	v12, v12, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      60    60.00                        60    VLEN1024X300SiFive7VA1[1,61],VLEN1024X300SiFive7VA1OrVA2[1,61],VLEN1024X300SiFive7VCQ VDIV_VV vdiv.vv	v12, v12, v12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7VA1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VCQ
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VL
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VS
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT:  -      -     2.00    -     174.00  -     2.00    -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     113.00  -     1.00    -      -     vdiv.vv	v12, v12, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     61.00   -     1.00    -      -     vdiv.vv	v12, v12, v12
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/reductions.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/reductions.s
new file mode 100644
index 000000000000..c1415f3e08e4
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/reductions.s
@@ -0,0 +1,678 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x390 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+# Single-Width Integer Reductions
+vsetvli zero, zero, e8, mf8, tu, mu
+vredsum.vs  v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vredsum.vs  v4, v8, v12
+vsetvli zero, zero, e8, mf2, tu, mu
+vredsum.vs  v4, v8, v12
+vsetvli zero, zero, e8, m1, tu, mu
+vredsum.vs  v4, v8, v12
+vsetvli zero, zero, e8, m2, tu, mu
+vredsum.vs  v4, v8, v12
+vsetvli zero, zero, e8, m4, tu, mu
+vredsum.vs  v4, v8, v12
+vsetvli zero, zero, e8, m8, tu, mu
+vredsum.vs  v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vredand.vs  v4, v8, v12
+vsetvli zero, zero, e16, mf2, tu, mu
+vredand.vs  v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vredand.vs  v4, v8, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vredand.vs  v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vredand.vs  v4, v8, v12
+vsetvli zero, zero, e16, m8, tu, mu
+vredand.vs  v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vredor.vs   v4, v8, v12
+vsetvli zero, zero, e32, m1, tu, mu
+vredor.vs   v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vredor.vs   v4, v8, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vredor.vs   v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vredor.vs   v4, v8, v12
+vsetvli zero, zero, e64, m1, tu, mu
+vredxor.vs  v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vredxor.vs  v4, v8, v12
+vsetvli zero, zero, e64, m4, tu, mu
+vredxor.vs  v4, v8, v12
+vsetvli zero, zero, e64, m8, tu, mu
+vredxor.vs  v4, v8, v12
+# Single-Width Integer Min/Max Reductions
+vsetvli zero, zero, e8, mf8, tu, mu
+vredmaxu.vs v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vredmaxu.vs v4, v8, v12
+vsetvli zero, zero, e8, mf2, tu, mu
+vredmaxu.vs v4, v8, v12
+vsetvli zero, zero, e8, m1, tu, mu
+vredmaxu.vs v4, v8, v12
+vsetvli zero, zero, e8, m2, tu, mu
+vredmaxu.vs v4, v8, v12
+vsetvli zero, zero, e8, m4, tu, mu
+vredmaxu.vs v4, v8, v12
+vsetvli zero, zero, e8, m8, tu, mu
+vredmaxu.vs v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, mf2, tu, mu
+vredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, m8, tu, mu
+vredmax.vs  v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vredminu.vs v4, v8, v12
+vsetvli zero, zero, e32, m1, tu, mu
+vredminu.vs v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vredminu.vs v4, v8, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vredminu.vs v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vredminu.vs v4, v8, v12
+vsetvli zero, zero, e64, m1, tu, mu
+vredmin.vs  v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vredmin.vs  v4, v8, v12
+vsetvli zero, zero, e64, m4, tu, mu
+vredmin.vs  v4, v8, v12
+vsetvli zero, zero, e64, m8, tu, mu
+vredmin.vs  v4, v8, v12
+# Widening Integer Reductions
+vsetvli zero, zero, e8, mf8, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e8, mf2, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e8, m1, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e8, m2, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e8, m4, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e8, m8, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e16, mf2, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e16, m8, tu, mu
+vwredsumu.vs v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vwredsum.vs  v4, v8, v12
+vsetvli zero, zero, e32, m1, tu, mu
+vwredsum.vs  v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vwredsum.vs  v4, v8, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vwredsum.vs  v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vwredsum.vs  v4, v8, v12
+vsetvli zero, zero, e64, m1, tu, mu
+vwredsum.vs  v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vwredsum.vs  v4, v8, v12
+vsetvli zero, zero, e64, m4, tu, mu
+vwredsum.vs  v4, v8, v12
+vsetvli zero, zero, e64, m8, tu, mu
+vwredsum.vs  v4, v8, v12
+
+# Vector Single-Width FP Reduction Instructions
+# vfwredosum.vs
+# SEW will not be e8, or e64
+# LMUL will not be mf8
+vsetvli zero, zero, e16, mf4, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e16, mf2, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e16, m8, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e32, m1, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vfwredosum.vs v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vfwredosum.vs v4, v8, v12
+# vfwredusum.vs
+vsetvli zero, zero, e16, mf4, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e16, mf2, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e16, m8, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e32, m1, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vfwredusum.vs v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vfwredusum.vs v4, v8, v12
+
+# Single Width Floating Point Min/Max Reductions
+# SEW wont be e8
+# LMUL wont be mf8
+vsetvli zero, zero, e16, mf4, tu, mu
+vfredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, mf2, tu, mu
+vfredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vfredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vfredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vfredmax.vs  v4, v8, v12
+vsetvli zero, zero, e16, m8, tu, mu
+vfredmax.vs  v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vfredmin.vs  v4, v8, v12
+vsetvli zero, zero, e32, m1, tu, mu
+vfredmin.vs  v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vfredmin.vs  v4, v8, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vfredmin.vs  v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vfredmin.vs  v4, v8, v12
+vsetvli zero, zero, e64, m1, tu, mu
+vfredmin.vs  v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vfredmin.vs  v4, v8, v12
+vsetvli zero, zero, e64, m4, tu, mu
+vfredmin.vs  v4, v8, v12
+vsetvli zero, zero, e64, m8, tu, mu
+vfredmin.vs  v4, v8, v12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv:1
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv:1
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA:1
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeAB:2 VLEN1024X300SiFive7PipeA, VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7PipeB:1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA1:1
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VA1OrVA2:2 VLEN1024X300SiFive7VA1, VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VA2:1
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VCQ:1
+# CHECK-NEXT: [9]   - VLEN1024X300SiFive7VL:1
+# CHECK-NEXT: [10]  - VLEN1024X300SiFive7VS:1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VREDSUM_VS vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VREDSUM_VS vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VREDSUM_VS vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      52    52.00                        52    VLEN1024X300SiFive7VA1[1,53],VLEN1024X300SiFive7VA1OrVA2[1,53],VLEN1024X300SiFive7VCQ VREDSUM_VS vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      54    54.00                        54    VLEN1024X300SiFive7VA1[1,55],VLEN1024X300SiFive7VA1OrVA2[1,55],VLEN1024X300SiFive7VCQ VREDSUM_VS vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      58    58.00                        58    VLEN1024X300SiFive7VA1[1,59],VLEN1024X300SiFive7VA1OrVA2[1,59],VLEN1024X300SiFive7VCQ VREDSUM_VS vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      66    66.00                        66    VLEN1024X300SiFive7VA1[1,67],VLEN1024X300SiFive7VA1OrVA2[1,67],VLEN1024X300SiFive7VCQ VREDSUM_VS vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VREDAND_VS vredand.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VREDAND_VS vredand.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      47    47.00                        47    VLEN1024X300SiFive7VA1[1,48],VLEN1024X300SiFive7VA1OrVA2[1,48],VLEN1024X300SiFive7VCQ VREDAND_VS vredand.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      49    49.00                        49    VLEN1024X300SiFive7VA1[1,50],VLEN1024X300SiFive7VA1OrVA2[1,50],VLEN1024X300SiFive7VCQ VREDAND_VS vredand.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      53    53.00                        53    VLEN1024X300SiFive7VA1[1,54],VLEN1024X300SiFive7VA1OrVA2[1,54],VLEN1024X300SiFive7VCQ VREDAND_VS vredand.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      61    61.00                        61    VLEN1024X300SiFive7VA1[1,62],VLEN1024X300SiFive7VA1OrVA2[1,62],VLEN1024X300SiFive7VCQ VREDAND_VS vredand.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      41    41.00                        41    VLEN1024X300SiFive7VA1[1,42],VLEN1024X300SiFive7VA1OrVA2[1,42],VLEN1024X300SiFive7VCQ VREDOR_VS vredor.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                        42    VLEN1024X300SiFive7VA1[1,43],VLEN1024X300SiFive7VA1OrVA2[1,43],VLEN1024X300SiFive7VCQ VREDOR_VS vredor.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      44    44.00                        44    VLEN1024X300SiFive7VA1[1,45],VLEN1024X300SiFive7VA1OrVA2[1,45],VLEN1024X300SiFive7VCQ VREDOR_VS vredor.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      48    48.00                        48    VLEN1024X300SiFive7VA1[1,49],VLEN1024X300SiFive7VA1OrVA2[1,49],VLEN1024X300SiFive7VCQ VREDOR_VS vredor.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      56    56.00                        56    VLEN1024X300SiFive7VA1[1,57],VLEN1024X300SiFive7VA1OrVA2[1,57],VLEN1024X300SiFive7VCQ VREDOR_VS vredor.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                        37    VLEN1024X300SiFive7VA1[1,38],VLEN1024X300SiFive7VA1OrVA2[1,38],VLEN1024X300SiFive7VCQ VREDXOR_VS vredxor.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      39    39.00                        39    VLEN1024X300SiFive7VA1[1,40],VLEN1024X300SiFive7VA1OrVA2[1,40],VLEN1024X300SiFive7VCQ VREDXOR_VS vredxor.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      43    43.00                        43    VLEN1024X300SiFive7VA1[1,44],VLEN1024X300SiFive7VA1OrVA2[1,44],VLEN1024X300SiFive7VCQ VREDXOR_VS vredxor.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VREDXOR_VS vredxor.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VREDMAXU_VS vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VREDMAXU_VS vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VREDMAXU_VS vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      52    52.00                        52    VLEN1024X300SiFive7VA1[1,53],VLEN1024X300SiFive7VA1OrVA2[1,53],VLEN1024X300SiFive7VCQ VREDMAXU_VS vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      54    54.00                        54    VLEN1024X300SiFive7VA1[1,55],VLEN1024X300SiFive7VA1OrVA2[1,55],VLEN1024X300SiFive7VCQ VREDMAXU_VS vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      58    58.00                        58    VLEN1024X300SiFive7VA1[1,59],VLEN1024X300SiFive7VA1OrVA2[1,59],VLEN1024X300SiFive7VCQ VREDMAXU_VS vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      66    66.00                        66    VLEN1024X300SiFive7VA1[1,67],VLEN1024X300SiFive7VA1OrVA2[1,67],VLEN1024X300SiFive7VCQ VREDMAXU_VS vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VREDMAX_VS vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VREDMAX_VS vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      47    47.00                        47    VLEN1024X300SiFive7VA1[1,48],VLEN1024X300SiFive7VA1OrVA2[1,48],VLEN1024X300SiFive7VCQ VREDMAX_VS vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      49    49.00                        49    VLEN1024X300SiFive7VA1[1,50],VLEN1024X300SiFive7VA1OrVA2[1,50],VLEN1024X300SiFive7VCQ VREDMAX_VS vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      53    53.00                        53    VLEN1024X300SiFive7VA1[1,54],VLEN1024X300SiFive7VA1OrVA2[1,54],VLEN1024X300SiFive7VCQ VREDMAX_VS vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      61    61.00                        61    VLEN1024X300SiFive7VA1[1,62],VLEN1024X300SiFive7VA1OrVA2[1,62],VLEN1024X300SiFive7VCQ VREDMAX_VS vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      41    41.00                        41    VLEN1024X300SiFive7VA1[1,42],VLEN1024X300SiFive7VA1OrVA2[1,42],VLEN1024X300SiFive7VCQ VREDMINU_VS vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                        42    VLEN1024X300SiFive7VA1[1,43],VLEN1024X300SiFive7VA1OrVA2[1,43],VLEN1024X300SiFive7VCQ VREDMINU_VS vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      44    44.00                        44    VLEN1024X300SiFive7VA1[1,45],VLEN1024X300SiFive7VA1OrVA2[1,45],VLEN1024X300SiFive7VCQ VREDMINU_VS vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      48    48.00                        48    VLEN1024X300SiFive7VA1[1,49],VLEN1024X300SiFive7VA1OrVA2[1,49],VLEN1024X300SiFive7VCQ VREDMINU_VS vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      56    56.00                        56    VLEN1024X300SiFive7VA1[1,57],VLEN1024X300SiFive7VA1OrVA2[1,57],VLEN1024X300SiFive7VCQ VREDMINU_VS vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                        37    VLEN1024X300SiFive7VA1[1,38],VLEN1024X300SiFive7VA1OrVA2[1,38],VLEN1024X300SiFive7VCQ VREDMIN_VS vredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      39    39.00                        39    VLEN1024X300SiFive7VA1[1,40],VLEN1024X300SiFive7VA1OrVA2[1,40],VLEN1024X300SiFive7VCQ VREDMIN_VS vredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      43    43.00                        43    VLEN1024X300SiFive7VA1[1,44],VLEN1024X300SiFive7VA1OrVA2[1,44],VLEN1024X300SiFive7VCQ VREDMIN_VS vredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VREDMIN_VS vredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      52    52.00                        52    VLEN1024X300SiFive7VA1[1,53],VLEN1024X300SiFive7VA1OrVA2[1,53],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      54    54.00                        54    VLEN1024X300SiFive7VA1[1,55],VLEN1024X300SiFive7VA1OrVA2[1,55],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      58    58.00                        58    VLEN1024X300SiFive7VA1[1,59],VLEN1024X300SiFive7VA1OrVA2[1,59],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      66    66.00                        66    VLEN1024X300SiFive7VA1[1,67],VLEN1024X300SiFive7VA1OrVA2[1,67],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      47    47.00                        47    VLEN1024X300SiFive7VA1[1,48],VLEN1024X300SiFive7VA1OrVA2[1,48],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      49    49.00                        49    VLEN1024X300SiFive7VA1[1,50],VLEN1024X300SiFive7VA1OrVA2[1,50],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      53    53.00                        53    VLEN1024X300SiFive7VA1[1,54],VLEN1024X300SiFive7VA1OrVA2[1,54],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      61    61.00                        61    VLEN1024X300SiFive7VA1[1,62],VLEN1024X300SiFive7VA1OrVA2[1,62],VLEN1024X300SiFive7VCQ VWREDSUMU_VS vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      41    41.00                        41    VLEN1024X300SiFive7VA1[1,42],VLEN1024X300SiFive7VA1OrVA2[1,42],VLEN1024X300SiFive7VCQ VWREDSUM_VS vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                        42    VLEN1024X300SiFive7VA1[1,43],VLEN1024X300SiFive7VA1OrVA2[1,43],VLEN1024X300SiFive7VCQ VWREDSUM_VS vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      44    44.00                        44    VLEN1024X300SiFive7VA1[1,45],VLEN1024X300SiFive7VA1OrVA2[1,45],VLEN1024X300SiFive7VCQ VWREDSUM_VS vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      48    48.00                        48    VLEN1024X300SiFive7VA1[1,49],VLEN1024X300SiFive7VA1OrVA2[1,49],VLEN1024X300SiFive7VCQ VWREDSUM_VS vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      56    56.00                        56    VLEN1024X300SiFive7VA1[1,57],VLEN1024X300SiFive7VA1OrVA2[1,57],VLEN1024X300SiFive7VCQ VWREDSUM_VS vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      66    66.00                        66    VLEN1024X300SiFive7VA1[1,67],VLEN1024X300SiFive7VA1OrVA2[1,67],VLEN1024X300SiFive7VCQ VWREDSUM_VS vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      66    66.00                        66    VLEN1024X300SiFive7VA1[1,67],VLEN1024X300SiFive7VA1OrVA2[1,67],VLEN1024X300SiFive7VCQ VWREDSUM_VS vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      66    66.00                        66    VLEN1024X300SiFive7VA1[1,67],VLEN1024X300SiFive7VA1OrVA2[1,67],VLEN1024X300SiFive7VCQ VWREDSUM_VS vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      66    66.00                        66    VLEN1024X300SiFive7VA1[1,67],VLEN1024X300SiFive7VA1OrVA2[1,67],VLEN1024X300SiFive7VCQ VWREDSUM_VS vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      96    96.00                        96    VLEN1024X300SiFive7VA1[1,97],VLEN1024X300SiFive7VA1OrVA2[1,97],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      192   192.00                       192   VLEN1024X300SiFive7VA1[1,193],VLEN1024X300SiFive7VA1OrVA2[1,193],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      384   384.00                       384   VLEN1024X300SiFive7VA1[1,385],VLEN1024X300SiFive7VA1OrVA2[1,385],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      768   768.00                       768   VLEN1024X300SiFive7VA1[1,769],VLEN1024X300SiFive7VA1OrVA2[1,769],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      1536  1536.00                      1536  VLEN1024X300SiFive7VA1[1,1537],VLEN1024X300SiFive7VA1OrVA2[1,1537],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      3072  3072.00                      3072  VLEN1024X300SiFive7VA1[1,3073],VLEN1024X300SiFive7VA1OrVA2[1,3073],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      96    96.00                        96    VLEN1024X300SiFive7VA1[1,97],VLEN1024X300SiFive7VA1OrVA2[1,97],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      192   192.00                       192   VLEN1024X300SiFive7VA1[1,193],VLEN1024X300SiFive7VA1OrVA2[1,193],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      384   384.00                       384   VLEN1024X300SiFive7VA1[1,385],VLEN1024X300SiFive7VA1OrVA2[1,385],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      768   768.00                       768   VLEN1024X300SiFive7VA1[1,769],VLEN1024X300SiFive7VA1OrVA2[1,769],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      1536  1536.00                      1536  VLEN1024X300SiFive7VA1[1,1537],VLEN1024X300SiFive7VA1OrVA2[1,1537],VLEN1024X300SiFive7VCQ VFWREDOSUM_VS vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      47    47.00                        47    VLEN1024X300SiFive7VA1[1,48],VLEN1024X300SiFive7VA1OrVA2[1,48],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      49    49.00                        49    VLEN1024X300SiFive7VA1[1,50],VLEN1024X300SiFive7VA1OrVA2[1,50],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      53    53.00                        53    VLEN1024X300SiFive7VA1[1,54],VLEN1024X300SiFive7VA1OrVA2[1,54],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      61    61.00                        61    VLEN1024X300SiFive7VA1[1,62],VLEN1024X300SiFive7VA1OrVA2[1,62],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      41    41.00                        41    VLEN1024X300SiFive7VA1[1,42],VLEN1024X300SiFive7VA1OrVA2[1,42],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                        42    VLEN1024X300SiFive7VA1[1,43],VLEN1024X300SiFive7VA1OrVA2[1,43],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      44    44.00                        44    VLEN1024X300SiFive7VA1[1,45],VLEN1024X300SiFive7VA1OrVA2[1,45],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      48    48.00                        48    VLEN1024X300SiFive7VA1[1,49],VLEN1024X300SiFive7VA1OrVA2[1,49],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      56    56.00                        56    VLEN1024X300SiFive7VA1[1,57],VLEN1024X300SiFive7VA1OrVA2[1,57],VLEN1024X300SiFive7VCQ VFWREDUSUM_VS vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VFREDMAX_VS vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      46    46.00                        46    VLEN1024X300SiFive7VA1[1,47],VLEN1024X300SiFive7VA1OrVA2[1,47],VLEN1024X300SiFive7VCQ VFREDMAX_VS vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      47    47.00                        47    VLEN1024X300SiFive7VA1[1,48],VLEN1024X300SiFive7VA1OrVA2[1,48],VLEN1024X300SiFive7VCQ VFREDMAX_VS vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      49    49.00                        49    VLEN1024X300SiFive7VA1[1,50],VLEN1024X300SiFive7VA1OrVA2[1,50],VLEN1024X300SiFive7VCQ VFREDMAX_VS vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      53    53.00                        53    VLEN1024X300SiFive7VA1[1,54],VLEN1024X300SiFive7VA1OrVA2[1,54],VLEN1024X300SiFive7VCQ VFREDMAX_VS vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      61    61.00                        61    VLEN1024X300SiFive7VA1[1,62],VLEN1024X300SiFive7VA1OrVA2[1,62],VLEN1024X300SiFive7VCQ VFREDMAX_VS vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      41    41.00                        41    VLEN1024X300SiFive7VA1[1,42],VLEN1024X300SiFive7VA1OrVA2[1,42],VLEN1024X300SiFive7VCQ VFREDMIN_VS vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                        42    VLEN1024X300SiFive7VA1[1,43],VLEN1024X300SiFive7VA1OrVA2[1,43],VLEN1024X300SiFive7VCQ VFREDMIN_VS vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      44    44.00                        44    VLEN1024X300SiFive7VA1[1,45],VLEN1024X300SiFive7VA1OrVA2[1,45],VLEN1024X300SiFive7VCQ VFREDMIN_VS vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      48    48.00                        48    VLEN1024X300SiFive7VA1[1,49],VLEN1024X300SiFive7VA1OrVA2[1,49],VLEN1024X300SiFive7VCQ VFREDMIN_VS vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      56    56.00                        56    VLEN1024X300SiFive7VA1[1,57],VLEN1024X300SiFive7VA1OrVA2[1,57],VLEN1024X300SiFive7VCQ VFREDMIN_VS vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                        37    VLEN1024X300SiFive7VA1[1,38],VLEN1024X300SiFive7VA1OrVA2[1,38],VLEN1024X300SiFive7VCQ VFREDMIN_VS vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      39    39.00                        39    VLEN1024X300SiFive7VA1[1,40],VLEN1024X300SiFive7VA1OrVA2[1,40],VLEN1024X300SiFive7VCQ VFREDMIN_VS vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      43    43.00                        43    VLEN1024X300SiFive7VA1[1,44],VLEN1024X300SiFive7VA1OrVA2[1,44],VLEN1024X300SiFive7VCQ VFREDMIN_VS vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      51    51.00                        51    VLEN1024X300SiFive7VA1[1,52],VLEN1024X300SiFive7VA1OrVA2[1,52],VLEN1024X300SiFive7VCQ VFREDMIN_VS vfredmin.vs	v4, v8, v12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7VA1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VCQ
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VL
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VS
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT:  -      -     103.00  -     13715.00  -   103.00  -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     53.00   -     1.00    -      -     vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     55.00   -     1.00    -      -     vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     59.00   -     1.00    -      -     vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     67.00   -     1.00    -      -     vredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vredand.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vredand.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     48.00   -     1.00    -      -     vredand.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     50.00   -     1.00    -      -     vredand.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     54.00   -     1.00    -      -     vredand.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     62.00   -     1.00    -      -     vredand.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     42.00   -     1.00    -      -     vredor.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     43.00   -     1.00    -      -     vredor.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     45.00   -     1.00    -      -     vredor.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     49.00   -     1.00    -      -     vredor.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     57.00   -     1.00    -      -     vredor.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     38.00   -     1.00    -      -     vredxor.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     40.00   -     1.00    -      -     vredxor.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     44.00   -     1.00    -      -     vredxor.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vredxor.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     53.00   -     1.00    -      -     vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     55.00   -     1.00    -      -     vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     59.00   -     1.00    -      -     vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     67.00   -     1.00    -      -     vredmaxu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     48.00   -     1.00    -      -     vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     50.00   -     1.00    -      -     vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     54.00   -     1.00    -      -     vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     62.00   -     1.00    -      -     vredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     42.00   -     1.00    -      -     vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     43.00   -     1.00    -      -     vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     45.00   -     1.00    -      -     vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     49.00   -     1.00    -      -     vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     57.00   -     1.00    -      -     vredminu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     38.00   -     1.00    -      -     vredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     40.00   -     1.00    -      -     vredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     44.00   -     1.00    -      -     vredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     53.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     55.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     59.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     67.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     48.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     50.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     54.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     62.00   -     1.00    -      -     vwredsumu.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     42.00   -     1.00    -      -     vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     43.00   -     1.00    -      -     vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     45.00   -     1.00    -      -     vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     49.00   -     1.00    -      -     vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     57.00   -     1.00    -      -     vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     67.00   -     1.00    -      -     vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     67.00   -     1.00    -      -     vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     67.00   -     1.00    -      -     vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     67.00   -     1.00    -      -     vwredsum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     97.00   -     1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     193.00  -     1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     385.00  -     1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     769.00  -     1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1537.00  -    1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     3073.00  -    1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     97.00   -     1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     193.00  -     1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     385.00  -     1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     769.00  -     1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1537.00  -    1.00    -      -     vfwredosum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     48.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     50.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     54.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     62.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     42.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     43.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     45.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     49.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     57.00   -     1.00    -      -     vfwredusum.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     47.00   -     1.00    -      -     vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     48.00   -     1.00    -      -     vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     50.00   -     1.00    -      -     vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     54.00   -     1.00    -      -     vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     62.00   -     1.00    -      -     vfredmax.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     42.00   -     1.00    -      -     vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     43.00   -     1.00    -      -     vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     45.00   -     1.00    -      -     vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     49.00   -     1.00    -      -     vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     57.00   -     1.00    -      -     vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     38.00   -     1.00    -      -     vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     40.00   -     1.00    -      -     vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     44.00   -     1.00    -      -     vfredmin.vs	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     52.00   -     1.00    -      -     vfredmin.vs	v4, v8, v12
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/strided-load-store.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/strided-load-store.s
new file mode 100644
index 000000000000..129a33e13f64
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/strided-load-store.s
@@ -0,0 +1,368 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x390 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e8, mf4, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e8, mf2, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e8, m1, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e8, m2, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+
+vsetvli zero, zero, e8, m4, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+
+vsetvli zero, zero, e8, m8, tu, mu
+vlse8.v  v1, (a1), a2
+
+vsetvli zero, zero, e16, mf4, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e16, mf2, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e16, m1, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e16, m2, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e16, m4, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+
+vsetvli zero, zero, e16, m8, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+
+vsetvli zero, zero, e32, mf2, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e32, m1, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e32, m2, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e32, m4, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e32, m8, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+
+vsetvli zero, zero, e64, m1, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e64, m2, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e64, m4, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vsetvli zero, zero, e64, m8, tu, mu
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv:1
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv:1
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA:1
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeAB:2 VLEN1024X300SiFive7PipeA, VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7PipeB:1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA1:1
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VA1OrVA2:2 VLEN1024X300SiFive7VA1, VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VA2:1
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VCQ:1
+# CHECK-NEXT: [9]   - VLEN1024X300SiFive7VL:1
+# CHECK-NEXT: [10]  - VLEN1024X300SiFive7VS:1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE8_V           vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE16_V          vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE32_V          vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE64_V          vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      259   256.00  *                    259   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,257] VLSE8_V           vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      259   256.00  *                    259   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,257] VLSE16_V          vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      259   256.00  *                    259   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,257] VLSE32_V          vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      515   512.00  *                    515   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,513] VLSE8_V           vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      515   512.00  *                    515   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,513] VLSE16_V          vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      1027  1024.00 *                    1027  VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,1025] VLSE8_V          vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE8_V           vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE16_V          vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE32_V          vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE64_V          vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      259   256.00  *                    259   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,257] VLSE8_V           vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      259   256.00  *                    259   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,257] VLSE16_V          vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      259   256.00  *                    259   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,257] VLSE32_V          vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      515   512.00  *                    515   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,513] VLSE8_V           vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      515   512.00  *                    515   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,513] VLSE16_V          vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE8_V           vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE16_V          vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE32_V          vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE64_V          vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      259   256.00  *                    259   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,257] VLSE8_V           vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      259   256.00  *                    259   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,257] VLSE16_V          vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      259   256.00  *                    259   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,257] VLSE32_V          vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      67    64.00   *                    67    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,65] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE8_V           vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE16_V          vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE32_V          vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      131   128.00  *                    131   VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,129] VLSE64_V          vlse64.v	v1, (a1), a2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7VA1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VCQ
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VL
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VS
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT:  -      -     22.00   -      -      -     78.00  9294.00  -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   257.00  -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   257.00  -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   257.00  -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   513.00  -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   513.00  -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   1025.00  -    vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   257.00  -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   257.00  -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   257.00  -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   513.00  -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   513.00  -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   257.00  -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   257.00  -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   257.00  -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   65.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   129.00  -     vlse64.v	v1, (a1), a2
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/strided-load-x0.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/strided-load-x0.s
new file mode 100644
index 000000000000..f1c25f7c7635
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/strided-load-x0.s
@@ -0,0 +1,132 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x390 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+vsetvli zero, zero, e32, m1, tu, mu
+
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vlse8.v  v1, (a1), zero
+vlse16.v v1, (a1), zero
+vlse32.v v1, (a1), zero
+vlse64.v v1, (a1), zero
+
+vle8.v  v1, (a1)
+vle16.v v1, (a1)
+vle32.v v1, (a1)
+vle64.v v1, (a1)
+
+vsetvli zero, zero, e64, m1, tu, mu
+
+vlse8.v  v1, (a1), a2
+vlse16.v v1, (a1), a2
+vlse32.v v1, (a1), a2
+vlse64.v v1, (a1), a2
+
+vlse8.v  v1, (a1), zero
+vlse16.v v1, (a1), zero
+vlse32.v v1, (a1), zero
+vlse64.v v1, (a1), zero
+
+vle8.v  v1, (a1)
+vle16.v v1, (a1)
+vle32.v v1, (a1)
+vle64.v v1, (a1)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv:1
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv:1
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA:1
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeAB:2 VLEN1024X300SiFive7PipeA, VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7PipeB:1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA1:1
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VA1OrVA2:2 VLEN1024X300SiFive7VA1, VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VA2:1
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VCQ:1
+# CHECK-NEXT: [9]   - VLEN1024X300SiFive7VL:1
+# CHECK-NEXT: [10]  - VLEN1024X300SiFive7VS:1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE8_V            vlse8.v	v1, (a1), zero
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE16_V           vlse16.v	v1, (a1), zero
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE32_V           vlse32.v	v1, (a1), zero
+# CHECK-NEXT:  1      35    32.00   *                    35    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,33] VLSE64_V           vlse64.v	v1, (a1), zero
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a1)
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a1)
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE32_V             vle32.v	v1, (a1)
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE64_V             vle64.v	v1, (a1)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE8_V            vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE16_V           vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE32_V           vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE64_V           vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE8_V            vlse8.v	v1, (a1), zero
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE16_V           vlse16.v	v1, (a1), zero
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE32_V           vlse32.v	v1, (a1), zero
+# CHECK-NEXT:  1      19    16.00   *                    19    VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLSE64_V           vlse64.v	v1, (a1), zero
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a1)
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a1)
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE32_V             vle32.v	v1, (a1)
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE64_V             vle64.v	v1, (a1)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7VA1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VCQ
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VL
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VS
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT:  -      -     2.00    -      -      -     24.00  421.00  -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse8.v	v1, (a1), zero
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse16.v	v1, (a1), zero
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse32.v	v1, (a1), zero
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   33.00   -     vlse64.v	v1, (a1), zero
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a1)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a1)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle32.v	v1, (a1)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle64.v	v1, (a1)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse8.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse16.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse32.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse64.v	v1, (a1), a2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse8.v	v1, (a1), zero
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse16.v	v1, (a1), zero
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse32.v	v1, (a1), zero
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vlse64.v	v1, (a1), zero
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a1)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a1)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle32.v	v1, (a1)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle64.v	v1, (a1)
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s
new file mode 100644
index 000000000000..e1e9b577b77f
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s
@@ -0,0 +1,4851 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x390 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+# The legal (SEW, LMUL) pairs for FP on sifive-x390 are:
+# (e16, mf4) (e16, mf2) (e16, m1) (e16, m2) (e16, m4) (e16, m8)
+# (e32, mf2) (e32, m1) (e32, m2) (e32, m4) (e32, m8)
+# (e64, m1) (e64, m2) (e64, m4) (e64, m8)
+# Widening instructions do not have e64
+
+# Vector Single-Width FP
+vsetvli zero, zero, e16, mf4, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, mf2, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, m1, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, m2, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, m4, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, m8, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, mf2, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, m1, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, m2, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, m4, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, m8, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e64, m1, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e64, m2, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e64, m4, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e64, m8, tu, mu
+vfadd.vv v8, v16, v24
+vfadd.vf v8, v16, f8
+vfsub.vv v8, v16, v24
+vfsub.vf v8, v16, f8
+vfrsub.vf v8, v16, f8
+vfmul.vv v8, v16, v24
+vfmul.vf v8, v16, f8
+vfdiv.vv v8, v16, v24
+vfdiv.vf v8, v16, f8
+vfrdiv.vf v8, v16, f8
+vfmacc.vv v8, v16, v24
+vfmacc.vf v8, f8, v24
+vfnmacc.vv v8, v16, v24
+vfnmacc.vf v8, f8, v24
+vfmsac.vv v8, v16, v24
+vfmsac.vf v8, f8, v24
+vfnmsac.vv v8, v16, v24
+vfnmsac.vf v8, f8, v24
+vfmadd.vv v8, v16, v24
+vfmadd.vf v8, f8, v24
+vfnmadd.vv v8, v16, v24
+vfnmadd.vf v8, f8, v24
+vfmsub.vv v8, v16, v24
+vfmsub.vf v8, f8, v24
+vfnmsub.vv v8, v16, v24
+vfnmsub.vf v8, f8, v24
+vfsqrt.v v8, v24
+vfrsqrt7.v v8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+# Vector Widening FP
+# no e64
+vsetvli zero, zero, e16, mf4, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, mf2, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, m1, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, m2, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, m4, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e16, m8, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, mf2, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, m1, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, m2, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, m4, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+vsetvli zero, zero, e32, m8, tu, mu
+vfwadd.vv v8, v16, v24
+vfwadd.vf v8, v16, f8
+vfwsub.vv v8, v16, v24
+vfwsub.vf v8, v16, f8
+vfwadd.wv  v8, v16, v24
+vfwadd.wf  v8, v16, f8
+vfwsub.wv  v8, v16, v24
+vfwsub.wf  v8, v16, f8
+vfwmul.vv    v8, v16, v24
+vfwmul.vf    v8, v16, f8
+vfwmacc.vv v8, v16, v24
+vfwmacc.vf v8, f8, v24
+vfwnmacc.vv v8, v16, v24
+vfwnmacc.vf v8, f8, v24
+vfwmsac.vv v8, v16, v24
+vfwmsac.vf v8, f8, v24
+vfwnmsac.vv v8, v16, v24
+vfwnmsac.vf v8, f8, v24
+vfrec7.v v8, v24
+vfmin.vv v8, v16, v24
+vfmin.vf v8, v16, f8
+vfmax.vv v8, v16, v24
+vfmax.vf v8, v16, f8
+vfsgnj.vv v8, v16, v24
+vfsgnj.vf v8, v16, f8
+vfsgnjn.vv v8, v16, v24
+vfsgnjn.vf v8, v16, f8
+vfsgnjx.vv v8, v16, v24
+vfsgnjx.vf v8, v16, f8
+vfcvt.xu.f.v v8, v16
+vfcvt.x.f.v  v8, v16
+vfcvt.rtz.xu.f.v v8, v16
+vfcvt.rtz.x.f.v  v8, v16
+vfcvt.f.xu.v v8, v16
+vfcvt.f.x.v  v8, v16
+vfwcvt.xu.f.v v8, v16
+vfwcvt.x.f.v  v8, v16
+vfwcvt.rtz.xu.f.v v8, v16
+vfwcvt.rtz.x.f.v  v8, v16
+vfwcvt.f.xu.v v8, v16
+vfwcvt.f.x.v  v8, v16
+vfwcvt.f.f.v v8, v16
+vfncvt.xu.f.w v8, v16
+vfncvt.x.f.w  v8, v16
+vfncvt.rtz.xu.f.w v8, v16
+vfncvt.rtz.x.f.w  v8, v16
+vfncvt.f.xu.w v8, v16
+vfncvt.f.x.w  v8, v16
+vfncvt.f.f.w v8, v16
+vfncvt.rod.f.f.w v8, v16
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv:1
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv:1
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA:1
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeAB:2 VLEN1024X300SiFive7PipeA, VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7PipeB:1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA1:1
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VA1OrVA2:2 VLEN1024X300SiFive7VA1, VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VA2:1
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VCQ:1
+# CHECK-NEXT: [9]   - VLEN1024X300SiFive7VL:1
+# CHECK-NEXT: [10]  - VLEN1024X300SiFive7VS:1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFADD_VV      vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFADD_VF      vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSUB_VV      vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSUB_VF      vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFRSUB_VF     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMUL_VV      vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMUL_VF      vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      60    60.00                        60    VLEN1024X300SiFive7VA1[1,61],VLEN1024X300SiFive7VA1OrVA2[1,61],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      60    60.00                        60    VLEN1024X300SiFive7VA1[1,61],VLEN1024X300SiFive7VA1OrVA2[1,61],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      60    60.00                        60    VLEN1024X300SiFive7VA1[1,61],VLEN1024X300SiFive7VA1OrVA2[1,61],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      60    60.00                        60    VLEN1024X300SiFive7VA1[1,61],VLEN1024X300SiFive7VA1OrVA2[1,61],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFADD_VV      vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFADD_VF      vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSUB_VV      vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSUB_VF      vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFRSUB_VF     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMUL_VV      vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMUL_VF      vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      120   120.00                       120   VLEN1024X300SiFive7VA1[1,121],VLEN1024X300SiFive7VA1OrVA2[1,121],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      120   120.00                       120   VLEN1024X300SiFive7VA1[1,121],VLEN1024X300SiFive7VA1OrVA2[1,121],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      120   120.00                       120   VLEN1024X300SiFive7VA1[1,121],VLEN1024X300SiFive7VA1OrVA2[1,121],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      120   120.00                       120   VLEN1024X300SiFive7VA1[1,121],VLEN1024X300SiFive7VA1OrVA2[1,121],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFADD_VV      vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFADD_VF      vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSUB_VV      vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSUB_VF      vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFRSUB_VF     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMUL_VV      vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMUL_VF      vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      240   240.00                       240   VLEN1024X300SiFive7VA1[1,241],VLEN1024X300SiFive7VA1OrVA2[1,241],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      240   240.00                       240   VLEN1024X300SiFive7VA1[1,241],VLEN1024X300SiFive7VA1OrVA2[1,241],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      240   240.00                       240   VLEN1024X300SiFive7VA1[1,241],VLEN1024X300SiFive7VA1OrVA2[1,241],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      240   240.00                       240   VLEN1024X300SiFive7VA1[1,241],VLEN1024X300SiFive7VA1OrVA2[1,241],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFADD_VV      vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFADD_VF      vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSUB_VV      vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSUB_VF      vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFRSUB_VF     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMUL_VV      vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMUL_VF      vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      480   480.00                       480   VLEN1024X300SiFive7VA1[1,481],VLEN1024X300SiFive7VA1OrVA2[1,481],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      480   480.00                       480   VLEN1024X300SiFive7VA1[1,481],VLEN1024X300SiFive7VA1OrVA2[1,481],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      480   480.00                       480   VLEN1024X300SiFive7VA1[1,481],VLEN1024X300SiFive7VA1OrVA2[1,481],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      480   480.00                       480   VLEN1024X300SiFive7VA1[1,481],VLEN1024X300SiFive7VA1OrVA2[1,481],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFADD_VV      vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFADD_VF      vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSUB_VV      vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSUB_VF      vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFRSUB_VF     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMUL_VV      vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMUL_VF      vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      960   960.00                       960   VLEN1024X300SiFive7VA1[1,961],VLEN1024X300SiFive7VA1OrVA2[1,961],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      960   960.00                       960   VLEN1024X300SiFive7VA1[1,961],VLEN1024X300SiFive7VA1OrVA2[1,961],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      960   960.00                       960   VLEN1024X300SiFive7VA1[1,961],VLEN1024X300SiFive7VA1OrVA2[1,961],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      960   960.00                       960   VLEN1024X300SiFive7VA1[1,961],VLEN1024X300SiFive7VA1OrVA2[1,961],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VV     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VF     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VV     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VF     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSUB_VF    vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VV     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VF     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      1920  1920.00                      1920  VLEN1024X300SiFive7VA1[1,1921],VLEN1024X300SiFive7VA1OrVA2[1,1921],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1920  1920.00                      1920  VLEN1024X300SiFive7VA1[1,1921],VLEN1024X300SiFive7VA1OrVA2[1,1921],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      1920  1920.00                      1920  VLEN1024X300SiFive7VA1[1,1921],VLEN1024X300SiFive7VA1OrVA2[1,1921],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMACC_VV    vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMACC_VF    vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMACC_VV   vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMACC_VF   vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSAC_VV    vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSAC_VF    vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSAC_VV   vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSAC_VF   vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMADD_VV    vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMADD_VF    vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMADD_VV   vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMADD_VF   vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSUB_VV    vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSUB_VF    vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSUB_VV   vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSUB_VF   vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      1920  1920.00                      1920  VLEN1024X300SiFive7VA1[1,1921],VLEN1024X300SiFive7VA1OrVA2[1,1921],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV    vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF    vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV   vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF   vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV   vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF   vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFADD_VV      vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFADD_VF      vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSUB_VV      vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSUB_VF      vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFRSUB_VF     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMUL_VV      vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMUL_VF      vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      112   112.00                       112   VLEN1024X300SiFive7VA1[1,113],VLEN1024X300SiFive7VA1OrVA2[1,113],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      112   112.00                       112   VLEN1024X300SiFive7VA1[1,113],VLEN1024X300SiFive7VA1OrVA2[1,113],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      112   112.00                       112   VLEN1024X300SiFive7VA1[1,113],VLEN1024X300SiFive7VA1OrVA2[1,113],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      112   112.00                       112   VLEN1024X300SiFive7VA1[1,113],VLEN1024X300SiFive7VA1OrVA2[1,113],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFADD_VV      vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFADD_VF      vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSUB_VV      vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSUB_VF      vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFRSUB_VF     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMUL_VV      vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMUL_VF      vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      224   224.00                       224   VLEN1024X300SiFive7VA1[1,225],VLEN1024X300SiFive7VA1OrVA2[1,225],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      224   224.00                       224   VLEN1024X300SiFive7VA1[1,225],VLEN1024X300SiFive7VA1OrVA2[1,225],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      224   224.00                       224   VLEN1024X300SiFive7VA1[1,225],VLEN1024X300SiFive7VA1OrVA2[1,225],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      224   224.00                       224   VLEN1024X300SiFive7VA1[1,225],VLEN1024X300SiFive7VA1OrVA2[1,225],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFADD_VV      vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFADD_VF      vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSUB_VV      vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSUB_VF      vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFRSUB_VF     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMUL_VV      vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMUL_VF      vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      448   448.00                       448   VLEN1024X300SiFive7VA1[1,449],VLEN1024X300SiFive7VA1OrVA2[1,449],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      448   448.00                       448   VLEN1024X300SiFive7VA1[1,449],VLEN1024X300SiFive7VA1OrVA2[1,449],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      448   448.00                       448   VLEN1024X300SiFive7VA1[1,449],VLEN1024X300SiFive7VA1OrVA2[1,449],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      448   448.00                       448   VLEN1024X300SiFive7VA1[1,449],VLEN1024X300SiFive7VA1OrVA2[1,449],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFADD_VV      vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFADD_VF      vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSUB_VV      vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSUB_VF      vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFRSUB_VF     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMUL_VV      vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMUL_VF      vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      896   896.00                       896   VLEN1024X300SiFive7VA1[1,897],VLEN1024X300SiFive7VA1OrVA2[1,897],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      896   896.00                       896   VLEN1024X300SiFive7VA1[1,897],VLEN1024X300SiFive7VA1OrVA2[1,897],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      896   896.00                       896   VLEN1024X300SiFive7VA1[1,897],VLEN1024X300SiFive7VA1OrVA2[1,897],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      896   896.00                       896   VLEN1024X300SiFive7VA1[1,897],VLEN1024X300SiFive7VA1OrVA2[1,897],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VV     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VF     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VV     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VF     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSUB_VF    vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VV     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VF     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      1792  1792.00                      1792  VLEN1024X300SiFive7VA1[1,1793],VLEN1024X300SiFive7VA1OrVA2[1,1793],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1792  1792.00                      1792  VLEN1024X300SiFive7VA1[1,1793],VLEN1024X300SiFive7VA1OrVA2[1,1793],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      1792  1792.00                      1792  VLEN1024X300SiFive7VA1[1,1793],VLEN1024X300SiFive7VA1OrVA2[1,1793],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMACC_VV    vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMACC_VF    vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMACC_VV   vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMACC_VF   vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSAC_VV    vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSAC_VF    vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSAC_VV   vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSAC_VF   vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMADD_VV    vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMADD_VF    vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMADD_VV   vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMADD_VF   vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSUB_VV    vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSUB_VF    vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSUB_VV   vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSUB_VF   vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      1792  1792.00                      1792  VLEN1024X300SiFive7VA1[1,1793],VLEN1024X300SiFive7VA1OrVA2[1,1793],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV    vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF    vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV   vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF   vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV   vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF   vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      228   228.00                       228   VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      228   228.00                       228   VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      228   228.00                       228   VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      228   228.00                       228   VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      16    16.00                        16    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      456   456.00                       456   VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      456   456.00                       456   VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      456   456.00                       456   VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      456   456.00                       456   VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      32    32.00                        32    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      912   912.00                       912   VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      912   912.00                       912   VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      912   912.00                       912   VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMACC_VV     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMACC_VF     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMACC_VV    vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMACC_VF    vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSAC_VV     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSAC_VF     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSAC_VV    vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSAC_VF    vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMADD_VV     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMADD_VF     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMADD_VV    vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMADD_VF    vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSUB_VV     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMSUB_VF     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSUB_VV    vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNMSUB_VF    vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      912   912.00                       912   VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      64    64.00                        64    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      1824  1824.00                      1824  VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1824  1824.00                      1824  VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      1824  1824.00                      1824  VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMACC_VV    vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMACC_VF    vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMACC_VV   vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMACC_VF   vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSAC_VV    vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSAC_VF    vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSAC_VV   vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSAC_VF   vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMADD_VV    vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMADD_VF    vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMADD_VV   vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMADD_VF   vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSUB_VV    vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMSUB_VF    vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSUB_VV   vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNMSUB_VF   vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  1      1824  1824.00                      1824  VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v	v8, v24
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      128   128.00                       128   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV    vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF    vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV   vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF   vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV   vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF   vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV    vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF    vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV   vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF   vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV   vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF   vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     32.00                        8     VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     64.00                        8     VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJ_VV     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJ_VF     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJN_VV    vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJN_VF    vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJX_VV    vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFSGNJX_VF    vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     128.00                       8     VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VV     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_VF     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VV     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_VF     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WV     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWADD_WF     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WV     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWSUB_WF     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VV     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMUL_VF     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VV    vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMACC_VF    vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VV   vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMACC_VF   vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VV    vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWMSAC_VF    vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VV   vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWNMSAC_VF   vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v	v8, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV    vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF    vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV   vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF   vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV   vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF   vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_X_F_V vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_XU_V vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_X_V vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_F_F_V vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_XU_F_W vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_X_F_W vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_XU_F_W vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_RTZ_X_F_W vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_XU_W vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_X_W vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  1      8     16.00                        8     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w	v8, v16
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7VA1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VCQ
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VL
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VS
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT:  -      -     32.00   -     58006.00 2446.00 1558.00  -   -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     61.00   -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     61.00   -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     61.00   -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     61.00   -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     121.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     121.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     121.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     121.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     241.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     241.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     241.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     241.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     481.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     481.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     481.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     481.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     961.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     961.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     961.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     961.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1921.00  -    1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1921.00  -    1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1921.00  -    1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1921.00  -    1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     113.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     113.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     113.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     113.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     225.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     225.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     225.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     225.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     449.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     449.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     449.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     449.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     897.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     897.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     897.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     897.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1793.00  -    1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1793.00  -    1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1793.00  -    1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1793.00  -    1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     229.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     229.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     229.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     229.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     457.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     457.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     457.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     457.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     913.00  -     1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     913.00  -     1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     913.00  -     1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     913.00  -     1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfrsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1825.00  -    1.00    -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1825.00  -    1.00    -      -     vfdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1825.00  -    1.00    -      -     vfrdiv.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmadd.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfnmsub.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1825.00  -    1.00    -      -     vfsqrt.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrsqrt7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwadd.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwsub.wf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmul.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmacc.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vfwnmsac.vf	v8, fs0, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfrec7.v	v8, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmin.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfmax.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnj.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjn.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vfsgnjx.vf	v8, v16, fs0
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.xu.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.rtz.x.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.xu.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.x.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vfwcvt.f.f.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.xu.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rtz.x.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.xu.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.x.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.f.f.w	v8, v16
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vfncvt.rod.f.f.w	v8, v16
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-integer-arithmetic.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-integer-arithmetic.s
new file mode 100644
index 000000000000..8e5de00c48e0
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-integer-arithmetic.s
@@ -0,0 +1,2272 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x390 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+# The legal (SEW, LMUL) pairs are:
+# (e8, mf8) (e8, mf4) (e8, mf2) (e8, m1) (e8, m2) (e8, m4) (e8, m8)
+# (e16, mf4) (e16, mf2) (e16, m1) (e16, m2) (e16, m4) (e16, m8)
+# (e32, mf2) (e32, m1) (e32, m2) (e32, m4) (e32, m8)
+# (e64, m1) (e64, m2) (e64, m4) (e64, m8)
+# Widening instructions do not have e64
+# Narrowing instructions do not have e8
+
+# Vector Single-Width Integer Add and Subtract
+vsetvli zero, zero, e8, mf8, tu, mu
+vadd.vv v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vadd.vx v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vadd.vi v4, v8, 0
+vsetvli zero, zero, e8, m1, tu, mu
+vsub.vv v4, v8, v12
+vsetvli zero, zero, e8, m2, tu, mu
+vsub.vx v4, v8, x10
+vsetvli zero, zero, e8, m4, tu, mu
+vrsub.vx v4, v8, x10
+vsetvli zero, zero, e8, m8, tu, mu
+vrsub.vi v4, v8, 0
+vsetvli zero, zero, e16, mf4, tu, mu
+vadd.vv v4, v8, v12
+vsetvli zero, zero, e16, mf2, tu, mu
+vadd.vx v4, v8, x10
+vsetvli zero, zero, e16, m1, tu, mu
+vadd.vi v4, v8, 0
+vsetvli zero, zero, e16, m2, tu, mu
+vsub.vv v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vsub.vx v4, v8, x10
+vsetvli zero, zero, e16, m8, tu, mu
+vrsub.vx v4, v8, x10
+vsetvli zero, zero, e32, mf2, tu, mu
+vrsub.vi v4, v8, 0
+vsetvli zero, zero, e32, m1, tu, mu
+vadd.vv v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vadd.vx v4, v8, x10
+vsetvli zero, zero, e32, m4, tu, mu
+vadd.vi v4, v8, 0
+vsetvli zero, zero, e32, m8, tu, mu
+vsub.vv v4, v8, v12
+vsetvli zero, zero, e64, m1, tu, mu
+vsub.vx v4, v8, x10
+vsetvli zero, zero, e64, m2, tu, mu
+vrsub.vx v4, v8, x10
+vsetvli zero, zero, e64, m4, tu, mu
+vrsub.vi v4, v8, 0
+vsetvli zero, zero, e64, m8, tu, mu
+vadd.vv v4, v8, v12
+
+# Vector Widening Integer Add/Subtract
+# no e64
+vsetvli zero, zero, e8, mf8, tu, mu
+vwaddu.vv  v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vwaddu.vx  v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vwsubu.vv  v4, v8, v12
+vsetvli zero, zero, e8, m1, tu, mu
+vwsubu.vx  v4, v8, x10
+vsetvli zero, zero, e8, m2, tu, mu
+vwadd.vv  v4, v8, v12
+vsetvli zero, zero, e8, m4, tu, mu
+vwadd.vx  v4, v8, x10
+vsetvli zero, zero, e8, m8, tu, mu
+vwsub.vv  v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vwsub.vx  v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vwaddu.wv  v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vwaddu.wx  v4, v8, x10
+vsetvli zero, zero, e16, m2, tu, mu
+vwsubu.wv  v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vwsubu.wx  v4, v8, x10
+vsetvli zero, zero, e16, m8, tu, mu
+vwadd.wv  v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vwadd.wx  v4, v8, x10
+vsetvli zero, zero, e32, m1, tu, mu
+vwsub.wv  v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vwsub.wx  v4, v8, x10
+vsetvli zero, zero, e32, m4, tu, mu
+vwaddu.vv  v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vwaddu.vx  v4, v8, x10
+
+# Vector Integer Extension
+# no e8
+vsetvli zero, zero, e16, mf4, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vsetvli zero, zero, e16, mf2, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vsetvli zero, zero, e16, m1, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vsetvli zero, zero, e16, m2, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vsetvli zero, zero, e16, m4, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vsetvli zero, zero, e16, m8, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vsetvli zero, zero, e32, mf2, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vzext.vf4 v4, v8
+vsext.vf4 v4, v8
+vsetvli zero, zero, e32, m1, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vzext.vf4 v4, v8
+vsext.vf4 v4, v8
+vsetvli zero, zero, e32, m2, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vzext.vf4 v4, v8
+vsext.vf4 v4, v8
+vsetvli zero, zero, e32, m4, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vzext.vf4 v4, v8
+vsext.vf4 v4, v8
+vsetvli zero, zero, e32, m8, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vzext.vf4 v4, v8
+vsext.vf4 v4, v8
+vsetvli zero, zero, e64, m1, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vzext.vf4 v4, v8
+vsext.vf4 v4, v8
+vzext.vf8 v4, v8
+vsext.vf8 v4, v8
+vsetvli zero, zero, e64, m2, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vzext.vf4 v4, v8
+vsext.vf4 v4, v8
+vzext.vf8 v4, v8
+vsext.vf8 v4, v8
+vsetvli zero, zero, e64, m4, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vzext.vf4 v4, v8
+vsext.vf4 v4, v8
+vzext.vf8 v4, v8
+vsext.vf8 v4, v8
+vsetvli zero, zero, e64, m8, tu, mu
+vzext.vf2 v4, v8
+vsext.vf2 v4, v8
+vzext.vf4 v4, v8
+vsext.vf4 v4, v8
+vzext.vf8 v4, v8
+vsext.vf8 v4, v8
+
+# Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vadc.vvm   v4, v8, v12, v0
+vsetvli zero, zero, e8, mf4, tu, mu
+vadc.vxm   v4, v8, x10, v0
+vsetvli zero, zero, e8, mf2, tu, mu
+vadc.vim   v4, v8, 0, v0
+vsetvli zero, zero, e8, m1, tu, mu
+vmadc.vvm   v4, v8, v12, v0
+vsetvli zero, zero, e8, m2, tu, mu
+vmadc.vxm   v4, v8, x10, v0
+vsetvli zero, zero, e8, m4, tu, mu
+vmadc.vim   v4, v8, 0, v0
+vsetvli zero, zero, e8, m8, tu, mu
+vmadc.vv    v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vmadc.vx    v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vmadc.vi    v4, v8, 0
+vsetvli zero, zero, e16, m1, tu, mu
+vsbc.vvm   v4, v8, v12, v0
+vsetvli zero, zero, e16, m2, tu, mu
+vsbc.vxm   v4, v8, x10, v0
+vsetvli zero, zero, e16, m4, tu, mu
+vmsbc.vvm   v4, v8, v12, v0
+vsetvli zero, zero, e16, m8, tu, mu
+vmsbc.vxm   v4, v8, x10, v0
+vsetvli zero, zero, e32, mf2, tu, mu
+vmsbc.vv    v4, v8, v12
+vsetvli zero, zero, e32, m1, tu, mu
+vmsbc.vx    v4, v8, x10
+vsetvli zero, zero, e32, m2, tu, mu
+vadc.vvm   v4, v8, v12, v0
+vsetvli zero, zero, e32, m4, tu, mu
+vadc.vxm   v4, v8, x10, v0
+vsetvli zero, zero, e32, m8, tu, mu
+vadc.vim   v4, v8, 0, v0
+vsetvli zero, zero, e64, m1, tu, mu
+vmadc.vvm   v4, v8, v12, v0
+vsetvli zero, zero, e64, m2, tu, mu
+vmadc.vxm   v4, v8, x10, v0
+vsetvli zero, zero, e64, m4, tu, mu
+vmadc.vim   v4, v8, 0, v0
+vsetvli zero, zero, e64, m8, tu, mu
+vmadc.vv    v4, v8, v12
+
+# Vector Bitwise Logical Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vand.vv v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vand.vx v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vand.vi v4, v8, 0
+vsetvli zero, zero, e8, m1, tu, mu
+vor.vv v4, v8, v12
+vsetvli zero, zero, e8, m2, tu, mu
+vor.vx v4, v8, x10
+vsetvli zero, zero, e8, m4, tu, mu
+vor.vi v4, v8, 0
+vsetvli zero, zero, e8, m8, tu, mu
+vxor.vv v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vxor.vx v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vxor.vi v4, v8, 0
+vsetvli zero, zero, e16, m1, tu, mu
+vand.vv v4, v8, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vand.vx v4, v8, x10
+vsetvli zero, zero, e16, m4, tu, mu
+vand.vi v4, v8, 0
+vsetvli zero, zero, e16, m8, tu, mu
+vor.vv v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vor.vx v4, v8, x10
+vsetvli zero, zero, e32, m1, tu, mu
+vor.vi v4, v8, 0
+vsetvli zero, zero, e32, m2, tu, mu
+vxor.vv v4, v8, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vxor.vx v4, v8, x10
+vsetvli zero, zero, e32, m8, tu, mu
+vxor.vi v4, v8, 0
+vsetvli zero, zero, e64, m1, tu, mu
+vand.vv v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vand.vx v4, v8, x10
+vsetvli zero, zero, e64, m4, tu, mu
+vand.vi v4, v8, 0
+vsetvli zero, zero, e64, m8, tu, mu
+vor.vv v4, v8, v12
+
+# Vector Single-Width Shift Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vsll.vv v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vsll.vx v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vsll.vi v4, v8, 0
+vsetvli zero, zero, e8, m1, tu, mu
+vsrl.vv v4, v8, v12
+vsetvli zero, zero, e8, m2, tu, mu
+vsrl.vx v4, v8, x10
+vsetvli zero, zero, e8, m4, tu, mu
+vsrl.vi v4, v8, 0
+vsetvli zero, zero, e8, m8, tu, mu
+vsra.vv v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vsra.vx v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vsra.vi v4, v8, 0
+vsetvli zero, zero, e16, m1, tu, mu
+vsll.vv v4, v8, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vsll.vx v4, v8, x10
+vsetvli zero, zero, e16, m4, tu, mu
+vsll.vi v4, v8, 0
+vsetvli zero, zero, e16, m8, tu, mu
+vsrl.vv v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vsrl.vx v4, v8, x10
+vsetvli zero, zero, e32, m1, tu, mu
+vsrl.vi v4, v8, 0
+vsetvli zero, zero, e32, m2, tu, mu
+vsra.vv v4, v8, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vsra.vx v4, v8, x10
+vsetvli zero, zero, e32, m8, tu, mu
+vsra.vi v4, v8, 0
+vsetvli zero, zero, e64, m1, tu, mu
+vsll.vv v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vsll.vx v4, v8, x10
+vsetvli zero, zero, e64, m4, tu, mu
+vsll.vi v4, v8, 0
+vsetvli zero, zero, e64, m8, tu, mu
+vsrl.vv v4, v8, v12
+
+# Vector Narrowing Integer Right Shift Instructions
+# no e8
+vsetvli zero, zero, e8, mf8, tu, mu
+vnsrl.wv v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vnsrl.wx v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vnsrl.wi v4, v8, 0
+vsetvli zero, zero, e8, m1, tu, mu
+vnsra.wv v4, v8, v12
+vsetvli zero, zero, e8, m2, tu, mu
+vnsra.wx v4, v8, x10
+vsetvli zero, zero, e8, m4, tu, mu
+vnsra.wi v4, v8, 0
+vsetvli zero, zero, e8, m8, tu, mu
+vnsrl.wv v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vnsrl.wx v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vnsrl.wi v4, v8, 0
+vsetvli zero, zero, e16, m1, tu, mu
+vnsra.wv v4, v8, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vnsra.wx v4, v8, x10
+vsetvli zero, zero, e16, m4, tu, mu
+vnsra.wi v4, v8, 0
+vsetvli zero, zero, e16, m8, tu, mu
+vnsrl.wv v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vnsrl.wx v4, v8, x10
+vsetvli zero, zero, e32, m1, tu, mu
+vnsrl.wi v4, v8, 0
+vsetvli zero, zero, e32, m2, tu, mu
+vnsra.wv v4, v8, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vnsra.wx v4, v8, x10
+vsetvli zero, zero, e32, m8, tu, mu
+vnsra.wi v4, v8, 0
+vsetvli zero, zero, e64, m1, tu, mu
+vnsrl.wv v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vnsrl.wx v4, v8, x10
+vsetvli zero, zero, e64, m4, tu, mu
+vnsrl.wi v4, v8, 0
+vsetvli zero, zero, e64, m8, tu, mu
+vnsra.wv v4, v8, v12
+
+# Vector Integer Compare Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vmseq.vv v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vmseq.vx v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vmseq.vi v4, v8, 0
+vsetvli zero, zero, e8, m1, tu, mu
+vmsne.vv v4, v8, v12
+vsetvli zero, zero, e8, m2, tu, mu
+vmsne.vx v4, v8, x10
+vsetvli zero, zero, e8, m4, tu, mu
+vmsne.vi v4, v8, 0
+vsetvli zero, zero, e8, m8, tu, mu
+vmsltu.vv v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vmsltu.vx v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vmslt.vv v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vmslt.vx v4, v8, x10
+vsetvli zero, zero, e16, m2, tu, mu
+vmsleu.vv v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vmsleu.vx v4, v8, x10
+vsetvli zero, zero, e16, m8, tu, mu
+vmsleu.vi v4, v8, 0
+vsetvli zero, zero, e32, mf2, tu, mu
+vmsle.vv v4, v8, v12
+vsetvli zero, zero, e32, m1, tu, mu
+vmsle.vx v4, v8, x10
+vsetvli zero, zero, e32, m2, tu, mu
+vmsle.vi v4, v8, 0
+vsetvli zero, zero, e32, m4, tu, mu
+vmsgtu.vx v4, v8, x10
+vsetvli zero, zero, e32, m8, tu, mu
+vmsgtu.vi v4, v8, 0
+vsetvli zero, zero, e64, m1, tu, mu
+vmsgt.vx v4, v8, x10
+vsetvli zero, zero, e64, m2, tu, mu
+vmsgt.vi v4, v8, 0
+vsetvli zero, zero, e64, m4, tu, mu
+vmseq.vv v4, v8, v12
+vsetvli zero, zero, e64, m8, tu, mu
+vmseq.vx v4, v8, x10
+
+# Pseudo instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vmslt.vi v4, v8, 1
+vsetvli zero, zero, e8, mf4, tu, mu
+vmsltu.vi v4, v8, 1
+vsetvli zero, zero, e8, mf2, tu, mu
+vmsltu.vi v4, v8, 0
+vsetvli zero, zero, e8, m1, tu, mu
+vmsgeu.vi v4, v8, 1
+vsetvli zero, zero, e8, m2, tu, mu
+vmsge.vi v4, v8, 1
+vsetvli zero, zero, e8, m4, tu, mu
+vmsgeu.vi v4, v8, 0
+vsetvli zero, zero, e16, mf4, tu, mu
+vmsge.vi v4, v8, 0
+vsetvli zero, zero, e16, mf2, tu, mu
+vmsge.vx v4, v8, x10
+vsetvli zero, zero, e16, m1, tu, mu
+vmsgeu.vx v4, v8, x11
+
+# Vector Integer Min/Max Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vminu.vv v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vminu.vx v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vmin.vv v4, v8, v12
+vsetvli zero, zero, e8, m1, tu, mu
+vmin.vx v4, v8, x10
+vsetvli zero, zero, e8, m2, tu, mu
+vmaxu.vv v4, v8, v12
+vsetvli zero, zero, e8, m4, tu, mu
+vmaxu.vx v4, v8, x10
+vsetvli zero, zero, e8, m8, tu, mu
+vmax.vv v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vmax.vx v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vminu.vv v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vminu.vx v4, v8, x10
+vsetvli zero, zero, e16, m2, tu, mu
+vmin.vv v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vmin.vx v4, v8, x10
+vsetvli zero, zero, e16, m8, tu, mu
+vmaxu.vv v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vmaxu.vx v4, v8, x10
+vsetvli zero, zero, e32, m1, tu, mu
+vmax.vv v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vmax.vx v4, v8, x10
+vsetvli zero, zero, e32, m4, tu, mu
+vminu.vv v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vminu.vx v4, v8, x10
+vsetvli zero, zero, e64, m1, tu, mu
+vmin.vv v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vmin.vx v4, v8, x10
+vsetvli zero, zero, e64, m4, tu, mu
+vmaxu.vv v4, v8, v12
+vsetvli zero, zero, e64, m8, tu, mu
+vmaxu.vx v4, v8, x10
+
+# Vector Single-Width Integer Multiply Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vmul.vv v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vmul.vx v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vmulh.vv v4, v8, v12
+vsetvli zero, zero, e8, m1, tu, mu
+vmulh.vx v4, v8, x10
+vsetvli zero, zero, e8, m2, tu, mu
+vmulhu.vv v4, v8, v12
+vsetvli zero, zero, e8, m4, tu, mu
+vmulhu.vx v4, v8, x10
+vsetvli zero, zero, e8, m8, tu, mu
+vmulhsu.vv v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vmulhsu.vx v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vmul.vv v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vmul.vx v4, v8, x10
+vsetvli zero, zero, e16, m2, tu, mu
+vmulh.vv v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vmulh.vx v4, v8, x10
+vsetvli zero, zero, e16, m8, tu, mu
+vmulhu.vv v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vmulhu.vx v4, v8, x10
+vsetvli zero, zero, e32, m1, tu, mu
+vmulhsu.vv v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vmulhsu.vx v4, v8, x10
+vsetvli zero, zero, e32, m4, tu, mu
+vmul.vv v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vmul.vx v4, v8, x10
+vsetvli zero, zero, e64, m1, tu, mu
+vmulh.vv v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vmulh.vx v4, v8, x10
+vsetvli zero, zero, e64, m4, tu, mu
+vmulhu.vv v4, v8, v12
+vsetvli zero, zero, e64, m8, tu, mu
+vmulhu.vx v4, v8, x10
+
+# Vector Integer Divide Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vdivu.vv v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vdivu.vx v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vdiv.vv v4, v8, v12
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vx v4, v8, x10
+vsetvli zero, zero, e8, m2, tu, mu
+vremu.vv v4, v8, v12
+vsetvli zero, zero, e8, m4, tu, mu
+vremu.vx v4, v8, x10
+vsetvli zero, zero, e8, m8, tu, mu
+vrem.vv v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vrem.vx v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vdivu.vv v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vdivu.vx v4, v8, x10
+vsetvli zero, zero, e16, m2, tu, mu
+vdiv.vv v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vdiv.vx v4, v8, x10
+vsetvli zero, zero, e16, m8, tu, mu
+vremu.vv v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vremu.vx v4, v8, x10
+vsetvli zero, zero, e32, m1, tu, mu
+vrem.vv v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vrem.vx v4, v8, x10
+vsetvli zero, zero, e32, m4, tu, mu
+vdivu.vv v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vdivu.vx v4, v8, x10
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vv v4, v8, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vdiv.vx v4, v8, x10
+vsetvli zero, zero, e64, m4, tu, mu
+vremu.vv v4, v8, v12
+vsetvli zero, zero, e64, m8, tu, mu
+vremu.vx v4, v8, x10
+
+# Vector Widening Integer Multiply Instructions
+# no e64
+vsetvli zero, zero, e8, mf8, tu, mu
+vwmul.vv  v4, v8, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vwmul.vx  v4, v8, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vwmulu.vv v4, v8, v12
+vsetvli zero, zero, e8, m1, tu, mu
+vwmulu.vx v4, v8, x10
+vsetvli zero, zero, e8, m2, tu, mu
+vwmulsu.vv v4, v8, v12
+vsetvli zero, zero, e8, m4, tu, mu
+vwmulsu.vx v4, v8, x10
+vsetvli zero, zero, e8, m8, tu, mu
+vwmul.vv  v4, v8, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vwmul.vx  v4, v8, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vwmulu.vv v4, v8, v12
+vsetvli zero, zero, e16, m1, tu, mu
+vwmulu.vx v4, v8, x10
+vsetvli zero, zero, e16, m2, tu, mu
+vwmulsu.vv v4, v8, v12
+vsetvli zero, zero, e16, m4, tu, mu
+vwmulsu.vx v4, v8, x10
+vsetvli zero, zero, e16, m8, tu, mu
+vwmul.vv  v4, v8, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vwmul.vx  v4, v8, x10
+vsetvli zero, zero, e32, m1, tu, mu
+vwmulu.vv v4, v8, v12
+vsetvli zero, zero, e32, m2, tu, mu
+vwmulu.vx v4, v8, x10
+vsetvli zero, zero, e32, m4, tu, mu
+vwmulsu.vv v4, v8, v12
+vsetvli zero, zero, e32, m8, tu, mu
+vwmulsu.vx v4, v8, x10
+
+# Vector Single-Width Integer Multiply-Add Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vmacc.vv v4, v12, v8
+vsetvli zero, zero, e8, mf4, tu, mu
+vmacc.vx v4, x10, v8
+vsetvli zero, zero, e8, mf2, tu, mu
+vnmsac.vv v4, v12, v8
+vsetvli zero, zero, e8, m1, tu, mu
+vnmsac.vx v4, x10, v8
+vsetvli zero, zero, e8, m2, tu, mu
+vmadd.vv v4, v12, v8
+vsetvli zero, zero, e8, m4, tu, mu
+vmadd.vx v4, x10, v8
+vsetvli zero, zero, e8, m8, tu, mu
+vnmsub.vv v4, v12, v8
+vsetvli zero, zero, e16, mf4, tu, mu
+vnmsub.vx v4, x10, v8
+vsetvli zero, zero, e16, mf2, tu, mu
+vmacc.vv v4, v12, v8
+vsetvli zero, zero, e16, m1, tu, mu
+vmacc.vx v4, x10, v8
+vsetvli zero, zero, e16, m2, tu, mu
+vnmsac.vv v4, v12, v8
+vsetvli zero, zero, e16, m4, tu, mu
+vnmsac.vx v4, x10, v8
+vsetvli zero, zero, e16, m8, tu, mu
+vmadd.vv v4, v12, v8
+vsetvli zero, zero, e32, mf2, tu, mu
+vmadd.vx v4, x10, v8
+vsetvli zero, zero, e32, m1, tu, mu
+vnmsub.vv v4, v12, v8
+vsetvli zero, zero, e32, m2, tu, mu
+vnmsub.vx v4, x10, v8
+vsetvli zero, zero, e32, m4, tu, mu
+vmacc.vv v4, v12, v8
+vsetvli zero, zero, e32, m8, tu, mu
+vmacc.vx v4, x10, v8
+vsetvli zero, zero, e64, m1, tu, mu
+vnmsac.vv v4, v12, v8
+vsetvli zero, zero, e64, m2, tu, mu
+vnmsac.vx v4, x10, v8
+vsetvli zero, zero, e64, m4, tu, mu
+vmadd.vv v4, v12, v8
+vsetvli zero, zero, e64, m8, tu, mu
+vmadd.vx v4, x10, v8
+
+# Vector Widening Integer Multiply-Add Instructions
+# no e64
+vsetvli zero, zero, e8, mf8, tu, mu
+vwmaccu.vv v4, v12, v8
+vsetvli zero, zero, e8, mf4, tu, mu
+vwmaccu.vx v4, x10, v8
+vsetvli zero, zero, e8, mf2, tu, mu
+vwmacc.vv v4, v12, v8
+vsetvli zero, zero, e8, m1, tu, mu
+vwmacc.vx v4, x10, v8
+vsetvli zero, zero, e8, m2, tu, mu
+vwmaccsu.vv v4, v12, v8
+vsetvli zero, zero, e8, m4, tu, mu
+vwmaccsu.vx v4, x10, v8
+vsetvli zero, zero, e8, m8, tu, mu
+vwmaccus.vx v4, x10, v8
+vsetvli zero, zero, e16, mf4, tu, mu
+vwmaccu.vv v4, v12, v8
+vsetvli zero, zero, e16, mf2, tu, mu
+vwmaccu.vx v4, x10, v8
+vsetvli zero, zero, e16, m1, tu, mu
+vwmacc.vv v4, v12, v8
+vsetvli zero, zero, e16, m2, tu, mu
+vwmacc.vx v4, x10, v8
+vsetvli zero, zero, e16, m4, tu, mu
+vwmaccsu.vv v4, v12, v8
+vsetvli zero, zero, e16, m8, tu, mu
+vwmaccsu.vx v4, x10, v8
+vsetvli zero, zero, e32, mf2, tu, mu
+vwmaccus.vx v4, x10, v8
+vsetvli zero, zero, e32, m1, tu, mu
+vwmaccu.vv v4, v12, v8
+vsetvli zero, zero, e32, m2, tu, mu
+vwmaccu.vx v4, x10, v8
+vsetvli zero, zero, e32, m4, tu, mu
+vwmacc.vv v4, v12, v8
+vsetvli zero, zero, e32, m8, tu, mu
+vwmacc.vx v4, x10, v8
+
+# Vector Integer Merge Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vmerge.vvm v4, v8, v12, v0
+vsetvli zero, zero, e8, mf4, tu, mu
+vmerge.vxm v4, v8, x10, v0
+vsetvli zero, zero, e8, mf2, tu, mu
+vmerge.vim v4, v8, 0, v0
+vsetvli zero, zero, e8, m1, tu, mu
+vmerge.vvm v4, v8, v12, v0
+vsetvli zero, zero, e8, m2, tu, mu
+vmerge.vxm v4, v8, x10, v0
+vsetvli zero, zero, e8, m4, tu, mu
+vmerge.vim v4, v8, 0, v0
+vsetvli zero, zero, e8, m8, tu, mu
+vmerge.vvm v4, v8, v12, v0
+vsetvli zero, zero, e16, mf4, tu, mu
+vmerge.vxm v4, v8, x10, v0
+vsetvli zero, zero, e16, mf2, tu, mu
+vmerge.vim v4, v8, 0, v0
+vsetvli zero, zero, e16, m1, tu, mu
+vmerge.vvm v4, v8, v12, v0
+vsetvli zero, zero, e16, m2, tu, mu
+vmerge.vxm v4, v8, x10, v0
+vsetvli zero, zero, e16, m4, tu, mu
+vmerge.vim v4, v8, 0, v0
+vsetvli zero, zero, e16, m8, tu, mu
+vmerge.vvm v4, v8, v12, v0
+vsetvli zero, zero, e32, mf2, tu, mu
+vmerge.vxm v4, v8, x10, v0
+vsetvli zero, zero, e32, m1, tu, mu
+vmerge.vim v4, v8, 0, v0
+vsetvli zero, zero, e32, m2, tu, mu
+vmerge.vvm v4, v8, v12, v0
+vsetvli zero, zero, e32, m4, tu, mu
+vmerge.vxm v4, v8, x10, v0
+vsetvli zero, zero, e32, m8, tu, mu
+vmerge.vim v4, v8, 0, v0
+vsetvli zero, zero, e64, m1, tu, mu
+vmerge.vvm v4, v8, v12, v0
+vsetvli zero, zero, e64, m2, tu, mu
+vmerge.vxm v4, v8, x10, v0
+vsetvli zero, zero, e64, m4, tu, mu
+vmerge.vim v4, v8, 0, v0
+vsetvli zero, zero, e64, m8, tu, mu
+vmerge.vvm v4, v8, v12, v0
+
+# Vector Integer Move Instructions
+vsetvli zero, zero, e8, mf8, tu, mu
+vmv.v.v v4, v12
+vsetvli zero, zero, e8, mf4, tu, mu
+vmv.v.x v4, x10
+vsetvli zero, zero, e8, mf2, tu, mu
+vmv.v.i v4, 0
+vsetvli zero, zero, e8, m1, tu, mu
+vmv.v.v v4, v12
+vsetvli zero, zero, e8, m2, tu, mu
+vmv.v.x v4, x10
+vsetvli zero, zero, e8, m4, tu, mu
+vmv.v.i v4, 0
+vsetvli zero, zero, e8, m8, tu, mu
+vmv.v.v v4, v12
+vsetvli zero, zero, e16, mf4, tu, mu
+vmv.v.x v4, x10
+vsetvli zero, zero, e16, mf2, tu, mu
+vmv.v.i v4, 0
+vsetvli zero, zero, e16, m1, tu, mu
+vmv.v.v v4, v12
+vsetvli zero, zero, e16, m2, tu, mu
+vmv.v.x v4, x10
+vsetvli zero, zero, e16, m4, tu, mu
+vmv.v.i v4, 0
+vsetvli zero, zero, e16, m8, tu, mu
+vmv.v.v v4, v12
+vsetvli zero, zero, e32, mf2, tu, mu
+vmv.v.x v4, x10
+vsetvli zero, zero, e32, m1, tu, mu
+vmv.v.i v4, 0
+vsetvli zero, zero, e32, m2, tu, mu
+vmv.v.v v4, v12
+vsetvli zero, zero, e32, m4, tu, mu
+vmv.v.x v4, x10
+vsetvli zero, zero, e32, m8, tu, mu
+vmv.v.i v4, 0
+vsetvli zero, zero, e64, m1, tu, mu
+vmv.v.v v4, v12
+vsetvli zero, zero, e64, m2, tu, mu
+vmv.v.x v4, x10
+vsetvli zero, zero, e64, m4, tu, mu
+vmv.v.i v4, 0
+vsetvli zero, zero, e64, m8, tu, mu
+vmv.v.v v4, v12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv:1
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv:1
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA:1
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeAB:2 VLEN1024X300SiFive7PipeA, VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7PipeB:1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA1:1
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VA1OrVA2:2 VLEN1024X300SiFive7VA1, VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VA2:1
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VCQ:1
+# CHECK-NEXT: [9]   - VLEN1024X300SiFive7VL:1
+# CHECK-NEXT: [10]  - VLEN1024X300SiFive7VS:1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VADD_VV       vadd.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VADD_VX       vadd.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VADD_VI       vadd.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSUB_VV       vsub.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSUB_VX       vsub.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VRSUB_VX      vrsub.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRSUB_VI     vrsub.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VADD_VV       vadd.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VADD_VX       vadd.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VADD_VI       vadd.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSUB_VV       vsub.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSUB_VX       vsub.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRSUB_VX     vrsub.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VRSUB_VI      vrsub.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VADD_VV       vadd.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VADD_VX       vadd.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VADD_VI       vadd.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSUB_VV      vsub.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSUB_VX       vsub.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VRSUB_VX      vrsub.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VRSUB_VI      vrsub.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VADD_VV      vadd.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWADDU_VV     vwaddu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWADDU_VX     vwaddu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWSUBU_VV     vwsubu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VWSUBU_VX     vwsubu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VWADD_VV      vwadd.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWADD_VX      vwadd.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWSUB_VV      vwsub.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWSUB_VX      vwsub.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWADDU_WV     vwaddu.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VWADDU_WX     vwaddu.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VWSUBU_WV     vwsubu.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWSUBU_WX     vwsubu.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWADD_WV      vwadd.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWADD_WX      vwadd.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VWSUB_WV      vwsub.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VWSUB_WX      vwsub.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWADDU_VV     vwaddu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWADDU_VX     vwaddu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VZEXT_VF4 vzext.vf4	v4, v8
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSEXT_VF4 vsext.vf4	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VZEXT_VF4 vzext.vf4	v4, v8
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSEXT_VF4 vsext.vf4	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VZEXT_VF4 vzext.vf4	v4, v8
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSEXT_VF4 vsext.vf4	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VZEXT_VF4 vzext.vf4	v4, v8
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSEXT_VF4 vsext.vf4	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VZEXT_VF4 vzext.vf4	v4, v8
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSEXT_VF4 vsext.vf4	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VZEXT_VF4 vzext.vf4	v4, v8
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSEXT_VF4 vsext.vf4	v4, v8
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VZEXT_VF8 vzext.vf8	v4, v8
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSEXT_VF8 vsext.vf8	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VZEXT_VF4 vzext.vf4	v4, v8
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSEXT_VF4 vsext.vf4	v4, v8
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VZEXT_VF8 vzext.vf8	v4, v8
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSEXT_VF8 vsext.vf8	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VZEXT_VF4 vzext.vf4	v4, v8
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSEXT_VF4 vsext.vf4	v4, v8
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VZEXT_VF8 vzext.vf8	v4, v8
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSEXT_VF8 vsext.vf8	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VZEXT_VF2 vzext.vf2	v4, v8
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSEXT_VF2 vsext.vf2	v4, v8
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VZEXT_VF4 vzext.vf4	v4, v8
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSEXT_VF4 vsext.vf4	v4, v8
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VZEXT_VF8 vzext.vf8	v4, v8
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSEXT_VF8 vsext.vf8	v4, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VADC_VVM vadc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VADC_VXM vadc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VADC_VIM vadc.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMADC_VVM vmadc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMADC_VXM vmadc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMADC_VIM vmadc.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMADC_VV vmadc.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMADC_VX vmadc.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMADC_VI vmadc.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSBC_VVM vsbc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSBC_VXM vsbc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMSBC_VVM vmsbc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMSBC_VXM vmsbc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSBC_VV vmsbc.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMSBC_VX vmsbc.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VADC_VVM vadc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VADC_VXM vadc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VADC_VIM vadc.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMADC_VVM vmadc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMADC_VXM vmadc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMADC_VIM vmadc.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMADC_VV vmadc.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VAND_VV       vand.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VAND_VX       vand.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VAND_VI       vand.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VOR_VV        vor.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VOR_VX        vor.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VOR_VI        vor.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VXOR_VV      vxor.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VXOR_VX       vxor.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VXOR_VI       vxor.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VAND_VV       vand.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VAND_VX       vand.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VAND_VI       vand.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VOR_VV       vor.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VOR_VX        vor.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VOR_VI        vor.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VXOR_VV       vxor.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VXOR_VX       vxor.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VXOR_VI      vxor.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VAND_VV       vand.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VAND_VX       vand.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VAND_VI       vand.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VOR_VV       vor.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSLL_VV       vsll.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSLL_VX       vsll.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSLL_VI       vsll.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSRL_VV       vsrl.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSRL_VX       vsrl.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSRL_VI       vsrl.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSRA_VV      vsra.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSRA_VX       vsra.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSRA_VI       vsra.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSLL_VV       vsll.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSLL_VX       vsll.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSLL_VI       vsll.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSRL_VV      vsrl.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VSRL_VX       vsrl.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSRL_VI       vsrl.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSRA_VV       vsra.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSRA_VX       vsra.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSRA_VI      vsra.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VSLL_VV       vsll.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VSLL_VX       vsll.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VSLL_VI       vsll.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VSRL_VV      vsrl.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VNSRL_WV      vnsrl.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VNSRL_WX      vnsrl.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VNSRL_WI      vnsrl.wi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VNSRA_WV      vnsra.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VNSRA_WX      vnsra.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VNSRA_WI     vnsra.wi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VNSRL_WV     vnsrl.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VNSRL_WX      vnsrl.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VNSRL_WI      vnsrl.wi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VNSRA_WV      vnsra.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VNSRA_WX      vnsra.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VNSRA_WI     vnsra.wi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VNSRL_WV     vnsrl.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VNSRL_WX      vnsrl.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VNSRL_WI      vnsrl.wi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VNSRA_WV      vnsra.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VNSRA_WX     vnsra.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VNSRA_WI     vnsra.wi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VNSRL_WV      vnsrl.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VNSRL_WX      vnsrl.wx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VNSRL_WI     vnsrl.wi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VNSRA_WV     vnsra.wv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSEQ_VV vmseq.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSEQ_VX vmseq.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSEQ_VI vmseq.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      5     2.00                         5     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMSNE_VV vmsne.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      7     4.00                         7     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMSNE_VX vmsne.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      11    8.00                         11    VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMSNE_VI vmsne.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMSLTU_VV vmsltu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSLTU_VX vmsltu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSLT_VV vmslt.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      5     2.00                         5     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMSLT_VX vmslt.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      7     4.00                         7     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMSLEU_VV vmsleu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      11    8.00                         11    VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMSLEU_VX vmsleu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMSLEU_VI vmsleu.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSLE_VV vmsle.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      5     2.00                         5     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMSLE_VX vmsle.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      7     4.00                         7     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMSLE_VI vmsle.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      11    8.00                         11    VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMSGTU_VX vmsgtu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMSGTU_VI vmsgtu.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      5     2.00                         5     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMSGT_VX vmsgt.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      7     4.00                         7     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMSGT_VI vmsgt.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      11    8.00                         11    VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMSEQ_VV vmseq.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMSEQ_VX vmseq.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSLE_VI vmsle.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSLEU_VI vmsleu.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSNE_VV vmsne.vv	v4, v8, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      5     2.00                         5     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMSGTU_VI vmsgtu.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      7     4.00                         7     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMSGT_VI vmsgt.vi	v4, v8, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      11    8.00                         11    VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMSEQ_VV vmseq.vv	v4, v8, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSGT_VI vmsgt.vi	v4, v8, -1
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMSLT_VX vmslt.vx	v4, v8, a0
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMNAND_MM vmnot.m	v4, v4
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      5     2.00                         5     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMSLTU_VX vmsltu.vx	v4, v8, a1
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMNAND_MM vmnot.m	v4, v4
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMINU_VV vminu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMINU_VX vminu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMIN_VV vmin.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMIN_VX vmin.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMAXU_VV vmaxu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMAXU_VX vmaxu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMAX_VV vmax.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMAX_VX vmax.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMINU_VV vminu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMINU_VX vminu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMIN_VV vmin.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMIN_VX vmin.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMAXU_VV vmaxu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMAXU_VX vmaxu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMAX_VV vmax.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMAX_VX vmax.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMINU_VV vminu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMINU_VX vminu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMIN_VV vmin.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMIN_VX vmin.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMAXU_VV vmaxu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMAXU_VX vmaxu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMUL_VV       vmul.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMUL_VX       vmul.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMULH_VV      vmulh.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMULH_VX      vmulh.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMULHU_VV     vmulhu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMULHU_VX     vmulhu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMULHSU_VV   vmulhsu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMULHSU_VX    vmulhsu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMUL_VV       vmul.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMUL_VX       vmul.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMULH_VV      vmulh.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMULH_VX      vmulh.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMULHU_VV    vmulhu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMULHU_VX     vmulhu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMULHSU_VV    vmulhsu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMULHSU_VX    vmulhsu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMUL_VV       vmul.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMUL_VX      vmul.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMULH_VV      vmulh.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMULH_VX      vmulh.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMULHU_VV     vmulhu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMULHU_VX    vmulhu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      60    60.00                        60    VLEN1024X300SiFive7VA1[1,61],VLEN1024X300SiFive7VA1OrVA2[1,61],VLEN1024X300SiFive7VCQ VDIVU_VV vdivu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      120   120.00                       120   VLEN1024X300SiFive7VA1[1,121],VLEN1024X300SiFive7VA1OrVA2[1,121],VLEN1024X300SiFive7VCQ VDIVU_VX vdivu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      240   240.00                       240   VLEN1024X300SiFive7VA1[1,241],VLEN1024X300SiFive7VA1OrVA2[1,241],VLEN1024X300SiFive7VCQ VDIV_VV vdiv.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      480   480.00                       480   VLEN1024X300SiFive7VA1[1,481],VLEN1024X300SiFive7VA1OrVA2[1,481],VLEN1024X300SiFive7VCQ VDIV_VX vdiv.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      960   960.00                       960   VLEN1024X300SiFive7VA1[1,961],VLEN1024X300SiFive7VA1OrVA2[1,961],VLEN1024X300SiFive7VCQ VREMU_VV vremu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      1920  1920.00                      1920  VLEN1024X300SiFive7VA1[1,1921],VLEN1024X300SiFive7VA1OrVA2[1,1921],VLEN1024X300SiFive7VCQ VREMU_VX vremu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      3840  3840.00                      3840  VLEN1024X300SiFive7VA1[1,3841],VLEN1024X300SiFive7VA1OrVA2[1,3841],VLEN1024X300SiFive7VCQ VREM_VV vrem.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      60    60.00                        60    VLEN1024X300SiFive7VA1[1,61],VLEN1024X300SiFive7VA1OrVA2[1,61],VLEN1024X300SiFive7VCQ VREM_VX vrem.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      120   120.00                       120   VLEN1024X300SiFive7VA1[1,121],VLEN1024X300SiFive7VA1OrVA2[1,121],VLEN1024X300SiFive7VCQ VDIVU_VV vdivu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      240   240.00                       240   VLEN1024X300SiFive7VA1[1,241],VLEN1024X300SiFive7VA1OrVA2[1,241],VLEN1024X300SiFive7VCQ VDIVU_VX vdivu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      480   480.00                       480   VLEN1024X300SiFive7VA1[1,481],VLEN1024X300SiFive7VA1OrVA2[1,481],VLEN1024X300SiFive7VCQ VDIV_VV vdiv.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      960   960.00                       960   VLEN1024X300SiFive7VA1[1,961],VLEN1024X300SiFive7VA1OrVA2[1,961],VLEN1024X300SiFive7VCQ VDIV_VX vdiv.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      1920  1920.00                      1920  VLEN1024X300SiFive7VA1[1,1921],VLEN1024X300SiFive7VA1OrVA2[1,1921],VLEN1024X300SiFive7VCQ VREMU_VV vremu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      112   112.00                       112   VLEN1024X300SiFive7VA1[1,113],VLEN1024X300SiFive7VA1OrVA2[1,113],VLEN1024X300SiFive7VCQ VREMU_VX vremu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      224   224.00                       224   VLEN1024X300SiFive7VA1[1,225],VLEN1024X300SiFive7VA1OrVA2[1,225],VLEN1024X300SiFive7VCQ VREM_VV vrem.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      448   448.00                       448   VLEN1024X300SiFive7VA1[1,449],VLEN1024X300SiFive7VA1OrVA2[1,449],VLEN1024X300SiFive7VCQ VREM_VX vrem.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      896   896.00                       896   VLEN1024X300SiFive7VA1[1,897],VLEN1024X300SiFive7VA1OrVA2[1,897],VLEN1024X300SiFive7VCQ VDIVU_VV vdivu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      1792  1792.00                      1792  VLEN1024X300SiFive7VA1[1,1793],VLEN1024X300SiFive7VA1OrVA2[1,1793],VLEN1024X300SiFive7VCQ VDIVU_VX vdivu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      228   228.00                       228   VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VDIV_VV vdiv.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      456   456.00                       456   VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VDIV_VX vdiv.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      912   912.00                       912   VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VREMU_VV vremu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      1824  1824.00                      1824  VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VREMU_VX vremu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMUL_VV      vwmul.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMUL_VX      vwmul.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMULU_VV     vwmulu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VWMULU_VX     vwmulu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VWMULSU_VV    vwmulsu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMULSU_VX    vwmulsu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMUL_VV      vwmul.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMUL_VX      vwmul.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMULU_VV     vwmulu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VWMULU_VX     vwmulu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VWMULSU_VV    vwmulsu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMULSU_VX    vwmulsu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMUL_VV      vwmul.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMUL_VX      vwmul.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VWMULU_VV     vwmulu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VWMULU_VX     vwmulu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMULSU_VV    vwmulsu.vv	v4, v8, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMULSU_VX    vwmulsu.vx	v4, v8, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMACC_VV      vmacc.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMACC_VX      vmacc.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VNMSAC_VV     vnmsac.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VNMSAC_VX     vnmsac.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMADD_VV      vmadd.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMADD_VX      vmadd.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VNMSUB_VV    vnmsub.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VNMSUB_VX     vnmsub.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMACC_VV      vmacc.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMACC_VX      vmacc.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VNMSAC_VV     vnmsac.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VNMSAC_VX     vnmsac.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMADD_VV     vmadd.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMADD_VX      vmadd.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VNMSUB_VV     vnmsub.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VNMSUB_VX     vnmsub.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMACC_VV      vmacc.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMACC_VX     vmacc.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VNMSAC_VV     vnmsac.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VNMSAC_VX     vnmsac.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMADD_VV      vmadd.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      8     8.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMADD_VX     vmadd.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMACCU_VV    vwmaccu.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMACCU_VX    vwmaccu.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMACC_VV     vwmacc.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VWMACC_VX     vwmacc.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VWMACCSU_VV   vwmaccsu.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMACCSU_VX   vwmaccsu.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMACCUS_VX   vwmaccus.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMACCU_VV    vwmaccu.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMACCU_VX    vwmaccu.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VWMACC_VV     vwmacc.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VWMACC_VX     vwmacc.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMACCSU_VV   vwmaccsu.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMACCSU_VX   vwmaccsu.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VWMACCUS_VX   vwmaccus.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      8     1.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VWMACCU_VV    vwmaccu.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      8     2.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VWMACCU_VX    vwmaccu.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMACC_VV     vwmacc.vv	v4, v12, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      8     4.00                         8     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VWMACC_VX     vwmacc.vx	v4, a0, v8
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMERGE_VVM vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMERGE_VXM vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMERGE_VIM vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMERGE_VVM vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMERGE_VXM vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMERGE_VIM vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMERGE_VVM vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMERGE_VXM vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMERGE_VIM vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMERGE_VVM vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMERGE_VXM vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMERGE_VIM vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMERGE_VVM vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1[1,2],VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMERGE_VXM vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMERGE_VIM vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMERGE_VVM vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMERGE_VXM vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMERGE_VIM vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMERGE_VVM vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMERGE_VXM vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMERGE_VIM vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00                        4     VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMERGE_VVM vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMV_V_V       vmv.v.v	v4, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMV_V_X       vmv.v.x	v4, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMV_V_I       vmv.v.i	v4, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMV_V_V       vmv.v.v	v4, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMV_V_X       vmv.v.x	v4, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMV_V_I       vmv.v.i	v4, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMV_V_V      vmv.v.v	v4, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMV_V_X       vmv.v.x	v4, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMV_V_I       vmv.v.i	v4, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMV_V_V       vmv.v.v	v4, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMV_V_X       vmv.v.x	v4, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMV_V_I       vmv.v.i	v4, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMV_V_V      vmv.v.v	v4, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,2],VLEN1024X300SiFive7VCQ VMV_V_X       vmv.v.x	v4, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMV_V_I       vmv.v.i	v4, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMV_V_V       vmv.v.v	v4, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMV_V_X       vmv.v.x	v4, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMV_V_I      vmv.v.i	v4, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VMV_V_V       vmv.v.v	v4, v12
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VMV_V_X       vmv.v.x	v4, a0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VMV_V_I       vmv.v.i	v4, 0
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00                         4     VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VMV_V_V      vmv.v.v	v4, v12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7VA1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VCQ
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VL
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VS
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT:  -      -     342.00  -     20046.50 682.50 385.00  -    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vadd.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vadd.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vadd.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vsub.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vsub.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vrsub.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vrsub.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vadd.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vadd.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vadd.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vsub.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vsub.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vrsub.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vrsub.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vadd.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vadd.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vadd.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vsub.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vsub.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vrsub.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vrsub.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vadd.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwaddu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwaddu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwsubu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vwsubu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vwadd.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwadd.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwsub.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwsub.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwaddu.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vwaddu.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vwsubu.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwsubu.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwadd.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwadd.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vwsub.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vwsub.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwaddu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwaddu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vzext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vsext.vf4	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vzext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vsext.vf4	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vzext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vsext.vf4	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vzext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vsext.vf4	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vzext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vsext.vf4	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vzext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vsext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vzext.vf8	v4, v8
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vsext.vf8	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vzext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vsext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vzext.vf8	v4, v8
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vsext.vf8	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vzext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vsext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vzext.vf8	v4, v8
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vsext.vf8	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vzext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vsext.vf2	v4, v8
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vzext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vsext.vf4	v4, v8
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vzext.vf8	v4, v8
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vsext.vf8	v4, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vadc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vadc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vadc.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmadc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmadc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmadc.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmadc.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmadc.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmadc.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vsbc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vsbc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmsbc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmsbc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmsbc.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmsbc.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vadc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vadc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vadc.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmadc.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmadc.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmadc.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmadc.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vand.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vand.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vand.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vor.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vor.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vor.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vxor.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vxor.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vxor.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vand.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vand.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vand.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vor.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vor.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vor.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vxor.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vxor.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vxor.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vand.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vand.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vand.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vor.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vsll.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vsll.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vsll.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vsrl.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vsrl.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vsrl.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vsra.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vsra.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vsra.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vsll.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vsll.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vsll.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vsrl.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vsrl.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vsrl.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vsra.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vsra.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vsra.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vsll.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vsll.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vsll.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vsrl.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vnsrl.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vnsrl.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vnsrl.wi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vnsra.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vnsra.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vnsra.wi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vnsrl.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vnsrl.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vnsrl.wi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vnsra.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vnsra.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vnsra.wi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vnsrl.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vnsrl.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vnsrl.wi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vnsra.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vnsra.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vnsra.wi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vnsrl.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vnsrl.wx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vnsrl.wi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vnsra.wv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmseq.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmseq.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmseq.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmsne.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmsne.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmsne.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmsltu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmsltu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmslt.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmslt.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmsleu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmsleu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmsleu.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmsle.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmsle.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmsle.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmsgtu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmsgtu.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmsgt.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmsgt.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmseq.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmseq.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmsle.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmsleu.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmsne.vv	v4, v8, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmsgtu.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmsgt.vi	v4, v8, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmseq.vv	v4, v8, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmsgt.vi	v4, v8, -1
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmslt.vx	v4, v8, a0
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmnot.m	v4, v4
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmsltu.vx	v4, v8, a1
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmnot.m	v4, v4
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vminu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vminu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmin.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmin.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmaxu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmaxu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmax.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmax.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vminu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vminu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmin.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmin.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmaxu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmaxu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmax.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmax.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vminu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vminu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmin.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmin.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmaxu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmaxu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmul.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmul.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmulh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vmulh.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vmulhu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmulhu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmulhsu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmulhsu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmul.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vmul.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vmulh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmulh.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmulhu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmulhu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vmulhsu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vmulhsu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmul.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmul.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vmulh.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vmulh.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmulhu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmulhu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     61.00   -     1.00    -      -     vdivu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     121.00  -     1.00    -      -     vdivu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     241.00  -     1.00    -      -     vdiv.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     481.00  -     1.00    -      -     vdiv.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     961.00  -     1.00    -      -     vremu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1921.00  -    1.00    -      -     vremu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     3841.00  -    1.00    -      -     vrem.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     61.00   -     1.00    -      -     vrem.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     121.00  -     1.00    -      -     vdivu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     241.00  -     1.00    -      -     vdivu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     481.00  -     1.00    -      -     vdiv.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     961.00  -     1.00    -      -     vdiv.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1921.00  -    1.00    -      -     vremu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     113.00  -     1.00    -      -     vremu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     225.00  -     1.00    -      -     vrem.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     449.00  -     1.00    -      -     vrem.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     897.00  -     1.00    -      -     vdivu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1793.00  -    1.00    -      -     vdivu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     229.00  -     1.00    -      -     vdiv.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     457.00  -     1.00    -      -     vdiv.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     913.00  -     1.00    -      -     vremu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1825.00  -    1.00    -      -     vremu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmul.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmul.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmulu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vwmulu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vwmulsu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmulsu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmul.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmul.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmulu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vwmulu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vwmulsu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmulsu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmul.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmul.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vwmulu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vwmulu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmulsu.vv	v4, v8, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmulsu.vx	v4, v8, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmacc.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmacc.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vnmsac.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vnmsac.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vmadd.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmadd.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vnmsub.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vnmsub.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmacc.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vmacc.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vnmsac.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vnmsac.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmadd.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmadd.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vnmsub.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vnmsub.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmacc.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmacc.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vnmsac.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vnmsac.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmadd.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmadd.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmaccu.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmaccu.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmacc.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vwmacc.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vwmaccsu.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmaccsu.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmaccus.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmaccu.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmaccu.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vwmacc.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vwmacc.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmaccsu.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmaccsu.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vwmaccus.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vwmaccu.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vwmaccu.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmacc.vv	v4, v12, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vwmacc.vx	v4, a0, v8
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.00    -     1.00    -      -     vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     3.00    -     1.00    -      -     vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     5.00    -     1.00    -      -     vmerge.vxm	v4, v8, a0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     9.00    -     1.00    -      -     vmerge.vim	v4, v8, 0, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vmerge.vvm	v4, v8, v12, v0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmv.v.v	v4, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmv.v.x	v4, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmv.v.i	v4, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vmv.v.v	v4, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vmv.v.x	v4, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmv.v.i	v4, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmv.v.v	v4, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmv.v.x	v4, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmv.v.i	v4, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vmv.v.v	v4, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vmv.v.x	v4, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmv.v.i	v4, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmv.v.v	v4, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -     vmv.v.x	v4, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vmv.v.i	v4, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vmv.v.v	v4, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmv.v.x	v4, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmv.v.i	v4, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     1.50   1.50   1.00    -      -     vmv.v.v	v4, v12
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     2.50   2.50   1.00    -      -     vmv.v.x	v4, a0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     4.50   4.50   1.00    -      -     vmv.v.i	v4, 0
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     8.50   8.50   1.00    -      -     vmv.v.v	v4, v12
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s
new file mode 100644
index 000000000000..d3f129a19b01
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s
@@ -0,0 +1,317 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x390 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+# The legal (SEW, LMUL) pairs on sifive-x280 are:
+# (e8, mf8) (e8, mf4) (e8, mf2) (e8, m1) (e8, m2) (e8, m4) (e8, m8)
+# (e16, mf4) (e16, mf2) (e16, m1) (e16, m2) (e16, m4) (e16, m8)
+# (e32, mf2) (e32, m1) (e32, m2) (e32, m4) (e32, m8)
+# (e64, m1) (e64, m2) (e64, m4) (e64, m8)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, mf4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, mf2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, mf4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, mf2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv:1
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv:1
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA:1
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeAB:2 VLEN1024X300SiFive7PipeA, VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7PipeB:1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA1:1
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VA1OrVA2:2 VLEN1024X300SiFive7VA1, VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VA2:1
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VCQ:1
+# CHECK-NEXT: [9]   - VLEN1024X300SiFive7VL:1
+# CHECK-NEXT: [10]  - VLEN1024X300SiFive7VS:1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      259   256.00                       259   VLEN1024X300SiFive7VA1[1,257],VLEN1024X300SiFive7VA1OrVA2[1,257],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      259   256.00                       259   VLEN1024X300SiFive7VA1[1,257],VLEN1024X300SiFive7VA1OrVA2[1,257],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      259   256.00                       259   VLEN1024X300SiFive7VA1[1,257],VLEN1024X300SiFive7VA1OrVA2[1,257],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      515   512.00                       515   VLEN1024X300SiFive7VA1[1,513],VLEN1024X300SiFive7VA1OrVA2[1,513],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      515   512.00                       515   VLEN1024X300SiFive7VA1[1,513],VLEN1024X300SiFive7VA1OrVA2[1,513],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      515   512.00                       515   VLEN1024X300SiFive7VA1[1,513],VLEN1024X300SiFive7VA1OrVA2[1,513],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      1027  1024.00                      1027  VLEN1024X300SiFive7VA1[1,1025],VLEN1024X300SiFive7VA1OrVA2[1,1025],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      1027  1024.00                      1027  VLEN1024X300SiFive7VA1[1,1025],VLEN1024X300SiFive7VA1OrVA2[1,1025],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      1027  1024.00                      1027  VLEN1024X300SiFive7VA1[1,1025],VLEN1024X300SiFive7VA1OrVA2[1,1025],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      259   256.00                       259   VLEN1024X300SiFive7VA1[1,257],VLEN1024X300SiFive7VA1OrVA2[1,257],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      259   256.00                       259   VLEN1024X300SiFive7VA1[1,257],VLEN1024X300SiFive7VA1OrVA2[1,257],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      259   256.00                       259   VLEN1024X300SiFive7VA1[1,257],VLEN1024X300SiFive7VA1OrVA2[1,257],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      515   512.00                       515   VLEN1024X300SiFive7VA1[1,513],VLEN1024X300SiFive7VA1OrVA2[1,513],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      515   512.00                       515   VLEN1024X300SiFive7VA1[1,513],VLEN1024X300SiFive7VA1OrVA2[1,513],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      515   512.00                       515   VLEN1024X300SiFive7VA1[1,513],VLEN1024X300SiFive7VA1OrVA2[1,513],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      259   256.00                       259   VLEN1024X300SiFive7VA1[1,257],VLEN1024X300SiFive7VA1OrVA2[1,257],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      259   256.00                       259   VLEN1024X300SiFive7VA1[1,257],VLEN1024X300SiFive7VA1OrVA2[1,257],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      259   256.00                       259   VLEN1024X300SiFive7VA1[1,257],VLEN1024X300SiFive7VA1OrVA2[1,257],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      19    16.00                        19    VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      35    32.00                        35    VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      67    64.00                        67    VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VRGATHER_VV vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  1      131   128.00                       131   VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VCOMPRESS_VM vcompress.vm	v8, v16, v24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7VA1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VCQ
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VL
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VS
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT:  -      -     22.00   -     11394.00  -   66.00   -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     257.00  -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     257.00  -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     257.00  -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     513.00  -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     513.00  -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     513.00  -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     1025.00  -    1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1025.00  -    1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1025.00  -    1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     257.00  -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     257.00  -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     257.00  -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     513.00  -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     513.00  -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     513.00  -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     257.00  -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     257.00  -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     257.00  -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     33.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     65.00   -     1.00    -      -     vcompress.vm	v8, v16, v24
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vrgather.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vrgatherei16.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     129.00  -     1.00    -      -     vcompress.vm	v8, v16, v24
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vle-vse.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vle-vse.s
new file mode 100644
index 000000000000..f6114225f33b
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vle-vse.s
@@ -0,0 +1,1256 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x390 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e8, m2, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e8, m4, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e8, m8, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e16, m4, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e16, m8, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e32, m8, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vle8.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vle8.v    v1, (a0)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e8, m2, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e8, m4, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e16, m4, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e16, m8, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e32, m8, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vle16.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vle16.v    v1, (a0)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e8, m2, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e16, m4, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e32, m8, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vle32.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vle32.v    v1, (a0)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vle64.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vle64.v    v1, (a0)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e8, m2, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e8, m4, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e8, m8, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e16, m4, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e16, m8, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e32, m8, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vse8.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vse8.v    v1, (a0)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e8, m2, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e8, m4, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e16, m4, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e16, m8, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e32, m8, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vse16.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vse16.v    v1, (a0)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e8, m2, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e16, m4, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e32, m8, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vse32.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vse32.v    v1, (a0)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vse64.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vse64.v    v1, (a0)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e8, m2, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e8, m4, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e8, m8, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e16, m4, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e16, m8, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e32, m8, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vlm.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vlm.v    v1, (a0)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e8, mf4, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e8, mf2, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e8, m1, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e8, m2, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e8, m4, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e8, m8, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e16, mf4, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e16, mf2, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e16, m1, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e16, m2, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e16, m4, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e16, m8, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e32, mf2, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e32, m1, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e32, m2, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e32, m4, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e32, m8, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e64, m1, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e64, m2, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e64, m4, tu, mu
+vsm.v    v1, (a0)
+vsetvli zero, zero, e64, m8, tu, mu
+vsm.v    v1, (a0)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv:1
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv:1
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA:1
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeAB:2 VLEN1024X300SiFive7PipeA, VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7PipeB:1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA1:1
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VA1OrVA2:2 VLEN1024X300SiFive7VA1, VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VA2:1
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VCQ:1
+# CHECK-NEXT: [9]   - VLEN1024X300SiFive7VL:1
+# CHECK-NEXT: [10]  - VLEN1024X300SiFive7VS:1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]                                        [9]                        Instructions:
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE8_V             vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE8_V              vle8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE16_V            vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE16_V            vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE16_V             vle16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE32_V            vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE32_V            vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE32_V            vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     1.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,2] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE32_V             vle32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE64_V            vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE64_V            vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE64_V            vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     4.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,5] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     8.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,9] VLE64_V             vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     16.00   *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,17] VLE64_V            vle64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE8_V             vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE8_V              vse8.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE16_V            vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE16_V            vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE16_V             vse16.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE32_V            vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE32_V            vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE32_V            vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      1     1.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,2] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE32_V             vse32.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE64_V            vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE64_V            vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE64_V            vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      1     4.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,5] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      1     8.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,9] VSE64_V             vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      1     16.00          *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,17] VSE64_V            vse64.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      4     2.00    *                    4     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VL[1,3] VLM_V               vlm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+# CHECK-NEXT:  1      3     1.00                  U      1     VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI            vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      1     2.00           *             1     VLEN1024X300SiFive7VCQ,VLEN1024X300SiFive7VS[1,3] VSM_V               vsm.v	v1, (a0)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - VLEN1024X300SiFive7FDiv
+# CHECK-NEXT: [1]   - VLEN1024X300SiFive7IDiv
+# CHECK-NEXT: [2]   - VLEN1024X300SiFive7PipeA
+# CHECK-NEXT: [3]   - VLEN1024X300SiFive7PipeB
+# CHECK-NEXT: [4]   - VLEN1024X300SiFive7VA1
+# CHECK-NEXT: [5]   - VLEN1024X300SiFive7VA2
+# CHECK-NEXT: [6]   - VLEN1024X300SiFive7VCQ
+# CHECK-NEXT: [7]   - VLEN1024X300SiFive7VL
+# CHECK-NEXT: [8]   - VLEN1024X300SiFive7VS
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]
+# CHECK-NEXT:  -      -     200.00  -      -      -     200.00 524.00 524.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   2.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   5.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   9.00    -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   17.00   -     vle64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse8.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse16.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     2.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse32.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     5.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     9.00   vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     17.00  vse64.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00   3.00    -     vlm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)
+# CHECK-NEXT:  -      -     1.00    -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     3.00   vsm.v	v1, (a0)

From ed75e55a70aaf898c35ad4ca0f5936bf72e4fc11 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Mon, 23 Jun 2025 10:10:53 -0700
Subject: [PATCH 1315/1322] [NFC][Clang][AST] Adopt `llvm::copy` in Clang AST
 (#145192)

---
 clang/include/clang/AST/ASTContext.h   |  2 +-
 clang/include/clang/AST/OpenMPClause.h | 14 ++---
 clang/include/clang/AST/Stmt.h         |  2 +-
 clang/lib/AST/ASTImporter.cpp          |  2 +-
 clang/lib/AST/Decl.cpp                 | 23 ++++----
 clang/lib/AST/DeclObjC.cpp             |  2 +-
 clang/lib/AST/ExprCXX.cpp              | 17 +++---
 clang/lib/AST/ExprConcepts.cpp         |  6 +--
 clang/lib/AST/ExprObjC.cpp             |  6 +--
 clang/lib/AST/OpenMPClause.cpp         | 72 ++++++++++++--------------
 clang/lib/AST/Stmt.cpp                 |  9 ++--
 clang/lib/AST/StmtCXX.cpp              |  5 +-
 clang/lib/AST/Type.cpp                 |  3 +-
 13 files changed, 73 insertions(+), 90 deletions(-)

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index e01361e2466b..10537c94babd 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -814,7 +814,7 @@ public:
 
   llvm::StringRef backupStr(llvm::StringRef S) const {
     char *Buf = new (*this) char[S.size()];
-    std::copy(S.begin(), S.end(), Buf);
+    llvm::copy(S, Buf);
     return llvm::StringRef(Buf, S.size());
   }
 
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 2fa8fa529741..3ceaea10bd02 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -387,7 +387,7 @@ public:
     assert(
         DK.size() == NumKinds &&
         "Number of directive kinds is not the same as the preallocated buffer");
-    std::copy(DK.begin(), DK.end(), getDirectiveKinds().begin());
+    llvm::copy(DK, getDirectiveKinds().begin());
   }
 
   SourceLocation getLParenLoc() { return LParenLoc; }
@@ -5917,7 +5917,7 @@ protected:
   void setUniqueDecls(ArrayRef<ValueDecl *> UDs) {
     assert(UDs.size() == NumUniqueDeclarations &&
            "Unexpected amount of unique declarations.");
-    std::copy(UDs.begin(), UDs.end(), getUniqueDeclsRef().begin());
+    llvm::copy(UDs, getUniqueDeclsRef().begin());
   }
 
   /// Get the number of lists per declaration that are in the trailing
@@ -5939,7 +5939,7 @@ protected:
   void setDeclNumLists(ArrayRef<unsigned> DNLs) {
     assert(DNLs.size() == NumUniqueDeclarations &&
            "Unexpected amount of list numbers.");
-    std::copy(DNLs.begin(), DNLs.end(), getDeclNumListsRef().begin());
+    llvm::copy(DNLs, getDeclNumListsRef().begin());
   }
 
   /// Get the cumulative component lists sizes that are in the trailing
@@ -5965,7 +5965,7 @@ protected:
   void setComponentListSizes(ArrayRef<unsigned> CLSs) {
     assert(CLSs.size() == NumComponentLists &&
            "Unexpected amount of component lists.");
-    std::copy(CLSs.begin(), CLSs.end(), getComponentListSizesRef().begin());
+    llvm::copy(CLSs, getComponentListSizesRef().begin());
   }
 
   /// Get the components that are in the trailing objects of the class.
@@ -5989,7 +5989,7 @@ protected:
            "Unexpected amount of component lists.");
     assert(CLSs.size() == NumComponentLists &&
            "Unexpected amount of list sizes.");
-    std::copy(Components.begin(), Components.end(), getComponentsRef().begin());
+    llvm::copy(Components, getComponentsRef().begin());
   }
 
   /// Fill the clause information from the list of declarations and
@@ -6063,7 +6063,7 @@ protected:
         ++CLSI;
 
         // Append components after the current components iterator.
-        CI = std::copy(C.begin(), C.end(), CI);
+        CI = llvm::copy(C, CI);
       }
     }
   }
@@ -6107,7 +6107,7 @@ protected:
            "Unexpected number of user-defined mappers.");
     assert(SupportsMapper &&
            "Must be a clause that is possible to have user-defined mappers");
-    std::copy(DMDs.begin(), DMDs.end(), getUDMapperRefs().begin());
+    llvm::copy(DMDs, getUDMapperRefs().begin());
   }
 
 public:
diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h
index 6c4bd6f6946b..58e20b0bb0d5 100644
--- a/clang/include/clang/AST/Stmt.h
+++ b/clang/include/clang/AST/Stmt.h
@@ -2214,7 +2214,7 @@ class AttributedStmt final
       : ValueStmt(AttributedStmtClass), SubStmt(SubStmt) {
     AttributedStmtBits.NumAttrs = Attrs.size();
     AttributedStmtBits.AttrLoc = Loc;
-    std::copy(Attrs.begin(), Attrs.end(), getAttrArrayPtr());
+    llvm::copy(Attrs, getAttrArrayPtr());
   }
 
   explicit AttributedStmt(EmptyShell Empty, unsigned NumAttrs)
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 3e4be21adb6b..ca1a9afc58dd 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -4123,7 +4123,7 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) {
         return std::move(Err);
       auto **Memory =
           new (Importer.getToContext()) CXXCtorInitializer *[NumInitializers];
-      std::copy(CtorInitializers.begin(), CtorInitializers.end(), Memory);
+      llvm::copy(CtorInitializers, Memory);
       auto *ToCtor = cast<CXXConstructorDecl>(ToFunction);
       ToCtor->setCtorInitializers(Memory);
       ToCtor->setNumCtorInitializers(NumInitializers);
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index c4376aab480c..20a3db1c69b8 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -2110,7 +2110,7 @@ void QualifierInfo::setTemplateParameterListsInfo(
   if (!TPLists.empty()) {
     TemplParamLists = new (Context) TemplateParameterList *[TPLists.size()];
     NumTemplParamLists = TPLists.size();
-    std::copy(TPLists.begin(), TPLists.end(), TemplParamLists);
+    llvm::copy(TPLists, TemplParamLists);
   }
 }
 
@@ -3753,7 +3753,7 @@ void FunctionDecl::setParams(ASTContext &C,
   // Zero params -> null pointer.
   if (!NewParamInfo.empty()) {
     ParamInfo = new (C) ParmVarDecl*[NewParamInfo.size()];
-    std::copy(NewParamInfo.begin(), NewParamInfo.end(), ParamInfo);
+    llvm::copy(NewParamInfo, ParamInfo);
   }
 }
 
@@ -5322,7 +5322,7 @@ void BlockDecl::setParams(ArrayRef<ParmVarDecl *> NewParamInfo) {
   if (!NewParamInfo.empty()) {
     NumParams = NewParamInfo.size();
     ParamInfo = new (getASTContext()) ParmVarDecl*[NewParamInfo.size()];
-    std::copy(NewParamInfo.begin(), NewParamInfo.end(), ParamInfo);
+    llvm::copy(NewParamInfo, ParamInfo);
   }
 }
 
@@ -5379,7 +5379,7 @@ PragmaCommentDecl *PragmaCommentDecl::Create(const ASTContext &C,
   PragmaCommentDecl *PCD =
       new (C, DC, additionalSizeToAlloc<char>(Arg.size() + 1))
           PragmaCommentDecl(DC, CommentLoc, CommentKind);
-  memcpy(PCD->getTrailingObjects(), Arg.data(), Arg.size());
+  llvm::copy(Arg, PCD->getTrailingObjects());
   PCD->getTrailingObjects()[Arg.size()] = '\0';
   return PCD;
 }
@@ -5401,9 +5401,9 @@ PragmaDetectMismatchDecl::Create(const ASTContext &C, TranslationUnitDecl *DC,
   PragmaDetectMismatchDecl *PDMD =
       new (C, DC, additionalSizeToAlloc<char>(ValueStart + Value.size() + 1))
           PragmaDetectMismatchDecl(DC, Loc, ValueStart);
-  memcpy(PDMD->getTrailingObjects(), Name.data(), Name.size());
+  llvm::copy(Name, PDMD->getTrailingObjects());
   PDMD->getTrailingObjects()[Name.size()] = '\0';
-  memcpy(PDMD->getTrailingObjects() + ValueStart, Value.data(), Value.size());
+  llvm::copy(Value, PDMD->getTrailingObjects() + ValueStart);
   PDMD->getTrailingObjects()[ValueStart + Value.size()] = '\0';
   return PDMD;
 }
@@ -5443,9 +5443,9 @@ LabelDecl *LabelDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
 
 void LabelDecl::setMSAsmLabel(StringRef Name) {
 char *Buffer = new (getASTContext(), 1) char[Name.size() + 1];
-  memcpy(Buffer, Name.data(), Name.size());
-  Buffer[Name.size()] = '\0';
-  MSAsmName = Buffer;
+llvm::copy(Name, Buffer);
+Buffer[Name.size()] = '\0';
+MSAsmName = Buffer;
 }
 
 void ValueDecl::anchor() {}
@@ -5828,7 +5828,7 @@ void HLSLBufferDecl::setDefaultBufferDecls(ArrayRef<Decl *> Decls) {
 
   // allocate array for default decls with ASTContext allocator
   Decl **DeclsArray = new (getASTContext()) Decl *[Decls.size()];
-  std::copy(Decls.begin(), Decls.end(), DeclsArray);
+  llvm::copy(Decls, DeclsArray);
   DefaultBufferDecls = ArrayRef<Decl *>(DeclsArray, Decls.size());
 }
 
@@ -5869,8 +5869,7 @@ HLSLRootSignatureDecl *HLSLRootSignatureDecl::Create(
                RootElements.size()))
           HLSLRootSignatureDecl(DC, Loc, ID, RootElements.size());
   auto *StoredElems = RSDecl->getElems();
-  std::uninitialized_copy(RootElements.begin(), RootElements.end(),
-                          StoredElems);
+  llvm::uninitialized_copy(RootElements, StoredElems);
   return RSDecl;
 }
 
diff --git a/clang/lib/AST/DeclObjC.cpp b/clang/lib/AST/DeclObjC.cpp
index 596262e21798..5cf0e9a7b259 100644
--- a/clang/lib/AST/DeclObjC.cpp
+++ b/clang/lib/AST/DeclObjC.cpp
@@ -1512,7 +1512,7 @@ ObjCTypeParamList::ObjCTypeParamList(SourceLocation lAngleLoc,
                                      ArrayRef<ObjCTypeParamDecl *> typeParams,
                                      SourceLocation rAngleLoc)
     : Brackets(lAngleLoc, rAngleLoc), NumParams(typeParams.size()) {
-  std::copy(typeParams.begin(), typeParams.end(), begin());
+  llvm::copy(typeParams, begin());
 }
 
 ObjCTypeParamList *ObjCTypeParamList::create(
diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp
index 2b66445fe253..4fc6345bad11 100644
--- a/clang/lib/AST/ExprCXX.cpp
+++ b/clang/lib/AST/ExprCXX.cpp
@@ -261,9 +261,8 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew,
     getTrailingObjects<Stmt *>()[arraySizeOffset()] = *ArraySize;
   if (Initializer)
     getTrailingObjects<Stmt *>()[initExprOffset()] = Initializer;
-  for (unsigned I = 0; I != PlacementArgs.size(); ++I)
-    getTrailingObjects<Stmt *>()[placementNewArgsOffset() + I] =
-        PlacementArgs[I];
+  llvm::copy(PlacementArgs,
+             getTrailingObjects<Stmt *>() + placementNewArgsOffset());
   if (IsParenTypeId)
     getTrailingObjects<SourceRange>()[0] = TypeIdParens;
 
@@ -1208,10 +1207,8 @@ CXXConstructExpr::CXXConstructExpr(
   CXXConstructExprBits.Loc = Loc;
 
   Stmt **TrailingArgs = getTrailingArgs();
-  for (unsigned I = 0, N = Args.size(); I != N; ++I) {
-    assert(Args[I] && "NULL argument in CXXConstructExpr!");
-    TrailingArgs[I] = Args[I];
-  }
+  llvm::copy(Args, TrailingArgs);
+  assert(llvm::all_of(Args, [](const Stmt *Arg) { return Arg != nullptr; }));
 
   // CXXTemporaryObjectExpr does this itself after setting its TypeSourceInfo.
   if (SC == CXXConstructExprClass)
@@ -1472,8 +1469,7 @@ CXXUnresolvedConstructExpr::CXXUnresolvedConstructExpr(
       RParenLoc(RParenLoc) {
   CXXUnresolvedConstructExprBits.NumArgs = Args.size();
   auto **StoredArgs = getTrailingObjects();
-  for (unsigned I = 0; I != Args.size(); ++I)
-    StoredArgs[I] = Args[I];
+  llvm::copy(Args, StoredArgs);
   setDependence(computeDependence(this));
 }
 
@@ -1885,8 +1881,7 @@ TypeTraitExpr::TypeTraitExpr(QualType T, SourceLocation Loc, TypeTrait Kind,
   assert(Args.size() == TypeTraitExprBits.NumArgs &&
          "TypeTraitExprBits.NumArgs overflow!");
   auto **ToArgs = getTrailingObjects<TypeSourceInfo *>();
-  for (unsigned I = 0, N = Args.size(); I != N; ++I)
-    ToArgs[I] = Args[I];
+  llvm::copy(Args, ToArgs);
 
   setDependence(computeDependence(this));
 
diff --git a/clang/lib/AST/ExprConcepts.cpp b/clang/lib/AST/ExprConcepts.cpp
index e6afcdd5dc3e..a2cf431a312a 100644
--- a/clang/lib/AST/ExprConcepts.cpp
+++ b/clang/lib/AST/ExprConcepts.cpp
@@ -144,10 +144,8 @@ RequiresExpr::RequiresExpr(ASTContext &C, SourceLocation RequiresKWLoc,
     if (RequirementContainsError(R))
       setDependence(getDependence() | ExprDependence::Error);
   }
-  std::copy(LocalParameters.begin(), LocalParameters.end(),
-            getTrailingObjects<ParmVarDecl *>());
-  std::copy(Requirements.begin(), Requirements.end(),
-            getTrailingObjects<concepts::Requirement *>());
+  llvm::copy(LocalParameters, getTrailingObjects<ParmVarDecl *>());
+  llvm::copy(Requirements, getTrailingObjects<concepts::Requirement *>());
   RequiresExprBits.IsSatisfied |= Dependent;
   // FIXME: move the computing dependency logic to ComputeDependence.h
   if (ContainsUnexpandedParameterPack)
diff --git a/clang/lib/AST/ExprObjC.cpp b/clang/lib/AST/ExprObjC.cpp
index 79b5db301d41..3df9c404b5ca 100644
--- a/clang/lib/AST/ExprObjC.cpp
+++ b/clang/lib/AST/ExprObjC.cpp
@@ -163,10 +163,8 @@ void ObjCMessageExpr::initArgsAndSelLocs(ArrayRef<Expr *> Args,
     MyArgs[I] = Args[I];
 
   SelLocsKind = SelLocsK;
-  if (!isImplicit()) {
-    if (SelLocsK == SelLoc_NonStandard)
-      std::copy(SelLocs.begin(), SelLocs.end(), getStoredSelLocs());
-  }
+  if (!isImplicit() && SelLocsK == SelLoc_NonStandard)
+    llvm::copy(SelLocs, getStoredSelLocs());
 }
 
 ObjCMessageExpr *
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index f714974b9476..8b1caa05eec3 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -428,7 +428,7 @@ OMPUpdateClause *OMPUpdateClause::CreateEmpty(const ASTContext &C,
 void OMPPrivateClause::setPrivateCopies(ArrayRef<Expr *> VL) {
   assert(VL.size() == varlist_size() &&
          "Number of private copies is not the same as the preallocated buffer");
-  std::copy(VL.begin(), VL.end(), varlist_end());
+  llvm::copy(VL, varlist_end());
 }
 
 OMPPrivateClause *
@@ -453,13 +453,13 @@ OMPPrivateClause *OMPPrivateClause::CreateEmpty(const ASTContext &C,
 void OMPFirstprivateClause::setPrivateCopies(ArrayRef<Expr *> VL) {
   assert(VL.size() == varlist_size() &&
          "Number of private copies is not the same as the preallocated buffer");
-  std::copy(VL.begin(), VL.end(), varlist_end());
+  llvm::copy(VL, varlist_end());
 }
 
 void OMPFirstprivateClause::setInits(ArrayRef<Expr *> VL) {
   assert(VL.size() == varlist_size() &&
          "Number of inits is not the same as the preallocated buffer");
-  std::copy(VL.begin(), VL.end(), getPrivateCopies().end());
+  llvm::copy(VL, getPrivateCopies().end());
 }
 
 OMPFirstprivateClause *
@@ -486,29 +486,28 @@ OMPFirstprivateClause *OMPFirstprivateClause::CreateEmpty(const ASTContext &C,
 void OMPLastprivateClause::setPrivateCopies(ArrayRef<Expr *> PrivateCopies) {
   assert(PrivateCopies.size() == varlist_size() &&
          "Number of private copies is not the same as the preallocated buffer");
-  std::copy(PrivateCopies.begin(), PrivateCopies.end(), varlist_end());
+  llvm::copy(PrivateCopies, varlist_end());
 }
 
 void OMPLastprivateClause::setSourceExprs(ArrayRef<Expr *> SrcExprs) {
   assert(SrcExprs.size() == varlist_size() && "Number of source expressions is "
                                               "not the same as the "
                                               "preallocated buffer");
-  std::copy(SrcExprs.begin(), SrcExprs.end(), getPrivateCopies().end());
+  llvm::copy(SrcExprs, getPrivateCopies().end());
 }
 
 void OMPLastprivateClause::setDestinationExprs(ArrayRef<Expr *> DstExprs) {
   assert(DstExprs.size() == varlist_size() && "Number of destination "
                                               "expressions is not the same as "
                                               "the preallocated buffer");
-  std::copy(DstExprs.begin(), DstExprs.end(), getSourceExprs().end());
+  llvm::copy(DstExprs, getSourceExprs().end());
 }
 
 void OMPLastprivateClause::setAssignmentOps(ArrayRef<Expr *> AssignmentOps) {
   assert(AssignmentOps.size() == varlist_size() &&
          "Number of assignment expressions is not the same as the preallocated "
          "buffer");
-  std::copy(AssignmentOps.begin(), AssignmentOps.end(),
-            getDestinationExprs().end());
+  llvm::copy(AssignmentOps, getDestinationExprs().end());
 }
 
 OMPLastprivateClause *OMPLastprivateClause::Create(
@@ -555,32 +554,32 @@ OMPSharedClause *OMPSharedClause::CreateEmpty(const ASTContext &C, unsigned N) {
 void OMPLinearClause::setPrivates(ArrayRef<Expr *> PL) {
   assert(PL.size() == varlist_size() &&
          "Number of privates is not the same as the preallocated buffer");
-  std::copy(PL.begin(), PL.end(), varlist_end());
+  llvm::copy(PL, varlist_end());
 }
 
 void OMPLinearClause::setInits(ArrayRef<Expr *> IL) {
   assert(IL.size() == varlist_size() &&
          "Number of inits is not the same as the preallocated buffer");
-  std::copy(IL.begin(), IL.end(), getPrivates().end());
+  llvm::copy(IL, getPrivates().end());
 }
 
 void OMPLinearClause::setUpdates(ArrayRef<Expr *> UL) {
   assert(UL.size() == varlist_size() &&
          "Number of updates is not the same as the preallocated buffer");
-  std::copy(UL.begin(), UL.end(), getInits().end());
+  llvm::copy(UL, getInits().end());
 }
 
 void OMPLinearClause::setFinals(ArrayRef<Expr *> FL) {
   assert(FL.size() == varlist_size() &&
          "Number of final updates is not the same as the preallocated buffer");
-  std::copy(FL.begin(), FL.end(), getUpdates().end());
+  llvm::copy(FL, getUpdates().end());
 }
 
 void OMPLinearClause::setUsedExprs(ArrayRef<Expr *> UE) {
   assert(
       UE.size() == varlist_size() + 1 &&
       "Number of used expressions is not the same as the preallocated buffer");
-  std::copy(UE.begin(), UE.end(), getFinals().end() + 2);
+  llvm::copy(UE, getFinals().end() + 2);
 }
 
 OMPLinearClause *OMPLinearClause::Create(
@@ -659,22 +658,21 @@ void OMPCopyinClause::setSourceExprs(ArrayRef<Expr *> SrcExprs) {
   assert(SrcExprs.size() == varlist_size() && "Number of source expressions is "
                                               "not the same as the "
                                               "preallocated buffer");
-  std::copy(SrcExprs.begin(), SrcExprs.end(), varlist_end());
+  llvm::copy(SrcExprs, varlist_end());
 }
 
 void OMPCopyinClause::setDestinationExprs(ArrayRef<Expr *> DstExprs) {
   assert(DstExprs.size() == varlist_size() && "Number of destination "
                                               "expressions is not the same as "
                                               "the preallocated buffer");
-  std::copy(DstExprs.begin(), DstExprs.end(), getSourceExprs().end());
+  llvm::copy(DstExprs, getSourceExprs().end());
 }
 
 void OMPCopyinClause::setAssignmentOps(ArrayRef<Expr *> AssignmentOps) {
   assert(AssignmentOps.size() == varlist_size() &&
          "Number of assignment expressions is not the same as the preallocated "
          "buffer");
-  std::copy(AssignmentOps.begin(), AssignmentOps.end(),
-            getDestinationExprs().end());
+  llvm::copy(AssignmentOps, getDestinationExprs().end());
 }
 
 OMPCopyinClause *OMPCopyinClause::Create(
@@ -700,22 +698,21 @@ void OMPCopyprivateClause::setSourceExprs(ArrayRef<Expr *> SrcExprs) {
   assert(SrcExprs.size() == varlist_size() && "Number of source expressions is "
                                               "not the same as the "
                                               "preallocated buffer");
-  std::copy(SrcExprs.begin(), SrcExprs.end(), varlist_end());
+  llvm::copy(SrcExprs, varlist_end());
 }
 
 void OMPCopyprivateClause::setDestinationExprs(ArrayRef<Expr *> DstExprs) {
   assert(DstExprs.size() == varlist_size() && "Number of destination "
                                               "expressions is not the same as "
                                               "the preallocated buffer");
-  std::copy(DstExprs.begin(), DstExprs.end(), getSourceExprs().end());
+  llvm::copy(DstExprs, getSourceExprs().end());
 }
 
 void OMPCopyprivateClause::setAssignmentOps(ArrayRef<Expr *> AssignmentOps) {
   assert(AssignmentOps.size() == varlist_size() &&
          "Number of assignment expressions is not the same as the preallocated "
          "buffer");
-  std::copy(AssignmentOps.begin(), AssignmentOps.end(),
-            getDestinationExprs().end());
+  llvm::copy(AssignmentOps, getDestinationExprs().end());
 }
 
 OMPCopyprivateClause *OMPCopyprivateClause::Create(
@@ -741,28 +738,28 @@ OMPCopyprivateClause *OMPCopyprivateClause::CreateEmpty(const ASTContext &C,
 void OMPReductionClause::setPrivates(ArrayRef<Expr *> Privates) {
   assert(Privates.size() == varlist_size() &&
          "Number of private copies is not the same as the preallocated buffer");
-  std::copy(Privates.begin(), Privates.end(), varlist_end());
+  llvm::copy(Privates, varlist_end());
 }
 
 void OMPReductionClause::setLHSExprs(ArrayRef<Expr *> LHSExprs) {
   assert(
       LHSExprs.size() == varlist_size() &&
       "Number of LHS expressions is not the same as the preallocated buffer");
-  std::copy(LHSExprs.begin(), LHSExprs.end(), getPrivates().end());
+  llvm::copy(LHSExprs, getPrivates().end());
 }
 
 void OMPReductionClause::setRHSExprs(ArrayRef<Expr *> RHSExprs) {
   assert(
       RHSExprs.size() == varlist_size() &&
       "Number of RHS expressions is not the same as the preallocated buffer");
-  std::copy(RHSExprs.begin(), RHSExprs.end(), getLHSExprs().end());
+  llvm::copy(RHSExprs, getLHSExprs().end());
 }
 
 void OMPReductionClause::setReductionOps(ArrayRef<Expr *> ReductionOps) {
   assert(ReductionOps.size() == varlist_size() && "Number of reduction "
                                                   "expressions is not the same "
                                                   "as the preallocated buffer");
-  std::copy(ReductionOps.begin(), ReductionOps.end(), getRHSExprs().end());
+  llvm::copy(ReductionOps, getRHSExprs().end());
 }
 
 void OMPReductionClause::setInscanCopyOps(ArrayRef<Expr *> Ops) {
@@ -843,28 +840,28 @@ OMPReductionClause::CreateEmpty(const ASTContext &C, unsigned N,
 void OMPTaskReductionClause::setPrivates(ArrayRef<Expr *> Privates) {
   assert(Privates.size() == varlist_size() &&
          "Number of private copies is not the same as the preallocated buffer");
-  std::copy(Privates.begin(), Privates.end(), varlist_end());
+  llvm::copy(Privates, varlist_end());
 }
 
 void OMPTaskReductionClause::setLHSExprs(ArrayRef<Expr *> LHSExprs) {
   assert(
       LHSExprs.size() == varlist_size() &&
       "Number of LHS expressions is not the same as the preallocated buffer");
-  std::copy(LHSExprs.begin(), LHSExprs.end(), getPrivates().end());
+  llvm::copy(LHSExprs, getPrivates().end());
 }
 
 void OMPTaskReductionClause::setRHSExprs(ArrayRef<Expr *> RHSExprs) {
   assert(
       RHSExprs.size() == varlist_size() &&
       "Number of RHS expressions is not the same as the preallocated buffer");
-  std::copy(RHSExprs.begin(), RHSExprs.end(), getLHSExprs().end());
+  llvm::copy(RHSExprs, getLHSExprs().end());
 }
 
 void OMPTaskReductionClause::setReductionOps(ArrayRef<Expr *> ReductionOps) {
   assert(ReductionOps.size() == varlist_size() && "Number of task reduction "
                                                   "expressions is not the same "
                                                   "as the preallocated buffer");
-  std::copy(ReductionOps.begin(), ReductionOps.end(), getRHSExprs().end());
+  llvm::copy(ReductionOps, getRHSExprs().end());
 }
 
 OMPTaskReductionClause *OMPTaskReductionClause::Create(
@@ -896,28 +893,28 @@ OMPTaskReductionClause *OMPTaskReductionClause::CreateEmpty(const ASTContext &C,
 void OMPInReductionClause::setPrivates(ArrayRef<Expr *> Privates) {
   assert(Privates.size() == varlist_size() &&
          "Number of private copies is not the same as the preallocated buffer");
-  std::copy(Privates.begin(), Privates.end(), varlist_end());
+  llvm::copy(Privates, varlist_end());
 }
 
 void OMPInReductionClause::setLHSExprs(ArrayRef<Expr *> LHSExprs) {
   assert(
       LHSExprs.size() == varlist_size() &&
       "Number of LHS expressions is not the same as the preallocated buffer");
-  std::copy(LHSExprs.begin(), LHSExprs.end(), getPrivates().end());
+  llvm::copy(LHSExprs, getPrivates().end());
 }
 
 void OMPInReductionClause::setRHSExprs(ArrayRef<Expr *> RHSExprs) {
   assert(
       RHSExprs.size() == varlist_size() &&
       "Number of RHS expressions is not the same as the preallocated buffer");
-  std::copy(RHSExprs.begin(), RHSExprs.end(), getLHSExprs().end());
+  llvm::copy(RHSExprs, getLHSExprs().end());
 }
 
 void OMPInReductionClause::setReductionOps(ArrayRef<Expr *> ReductionOps) {
   assert(ReductionOps.size() == varlist_size() && "Number of in reduction "
                                                   "expressions is not the same "
                                                   "as the preallocated buffer");
-  std::copy(ReductionOps.begin(), ReductionOps.end(), getRHSExprs().end());
+  llvm::copy(ReductionOps, getRHSExprs().end());
 }
 
 void OMPInReductionClause::setTaskgroupDescriptors(
@@ -925,8 +922,7 @@ void OMPInReductionClause::setTaskgroupDescriptors(
   assert(TaskgroupDescriptors.size() == varlist_size() &&
          "Number of in reduction descriptors is not the same as the "
          "preallocated buffer");
-  std::copy(TaskgroupDescriptors.begin(), TaskgroupDescriptors.end(),
-            getReductionOps().end());
+  llvm::copy(TaskgroupDescriptors, getReductionOps().end());
 }
 
 OMPInReductionClause *OMPInReductionClause::Create(
@@ -1322,13 +1318,13 @@ OMPFromClause::CreateEmpty(const ASTContext &C,
 void OMPUseDevicePtrClause::setPrivateCopies(ArrayRef<Expr *> VL) {
   assert(VL.size() == varlist_size() &&
          "Number of private copies is not the same as the preallocated buffer");
-  std::copy(VL.begin(), VL.end(), varlist_end());
+  llvm::copy(VL, varlist_end());
 }
 
 void OMPUseDevicePtrClause::setInits(ArrayRef<Expr *> VL) {
   assert(VL.size() == varlist_size() &&
          "Number of inits is not the same as the preallocated buffer");
-  std::copy(VL.begin(), VL.end(), getPrivateCopies().end());
+  llvm::copy(VL, getPrivateCopies().end());
 }
 
 OMPUseDevicePtrClause *OMPUseDevicePtrClause::Create(
@@ -1543,7 +1539,7 @@ OMPNontemporalClause *OMPNontemporalClause::CreateEmpty(const ASTContext &C,
 void OMPNontemporalClause::setPrivateRefs(ArrayRef<Expr *> VL) {
   assert(VL.size() == varlist_size() && "Number of private references is not "
                                         "the same as the preallocated buffer");
-  std::copy(VL.begin(), VL.end(), varlist_end());
+  llvm::copy(VL, varlist_end());
 }
 
 OMPInclusiveClause *OMPInclusiveClause::Create(const ASTContext &C,
diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp
index 0b0289cd5ec0..4fc4a99ad240 100644
--- a/clang/lib/AST/Stmt.cpp
+++ b/clang/lib/AST/Stmt.cpp
@@ -384,8 +384,7 @@ CompoundStmt::CompoundStmt(ArrayRef<Stmt *> Stmts, FPOptionsOverride FPFeatures,
 void CompoundStmt::setStmts(ArrayRef<Stmt *> Stmts) {
   assert(CompoundStmtBits.NumStmts == Stmts.size() &&
          "NumStmts doesn't fit in bits of CompoundStmtBits.NumStmts!");
-
-  std::copy(Stmts.begin(), Stmts.end(), body_begin());
+  llvm::copy(Stmts, body_begin());
 }
 
 CompoundStmt *CompoundStmt::Create(const ASTContext &C, ArrayRef<Stmt *> Stmts,
@@ -947,10 +946,10 @@ void MSAsmStmt::initialize(const ASTContext &C, StringRef asmstr,
   AsmStr = copyIntoContext(C, asmstr);
 
   Exprs = new (C) Stmt*[exprs.size()];
-  std::copy(exprs.begin(), exprs.end(), Exprs);
+  llvm::copy(exprs, Exprs);
 
   AsmToks = new (C) Token[asmtoks.size()];
-  std::copy(asmtoks.begin(), asmtoks.end(), AsmToks);
+  llvm::copy(asmtoks, AsmToks);
 
   Constraints = new (C) StringRef[exprs.size()];
   std::transform(constraints.begin(), constraints.end(), Constraints,
@@ -1385,7 +1384,7 @@ CapturedStmt::CapturedStmt(Stmt *S, CapturedRegionKind Kind,
 
   // Copy all Capture objects.
   Capture *Buffer = getStoredCaptures();
-  std::copy(Captures.begin(), Captures.end(), Buffer);
+  llvm::copy(Captures, Buffer);
 }
 
 CapturedStmt::CapturedStmt(EmptyShell Empty, unsigned NumCaptures)
diff --git a/clang/lib/AST/StmtCXX.cpp b/clang/lib/AST/StmtCXX.cpp
index 0d6fc848f739..6a69fe75136f 100644
--- a/clang/lib/AST/StmtCXX.cpp
+++ b/clang/lib/AST/StmtCXX.cpp
@@ -42,7 +42,7 @@ CXXTryStmt::CXXTryStmt(SourceLocation tryLoc, CompoundStmt *tryBlock,
     : Stmt(CXXTryStmtClass), TryLoc(tryLoc), NumHandlers(handlers.size()) {
   Stmt **Stmts = getStmts();
   Stmts[0] = tryBlock;
-  std::copy(handlers.begin(), handlers.end(), Stmts + 1);
+  llvm::copy(handlers, Stmts + 1);
 }
 
 CXXForRangeStmt::CXXForRangeStmt(Stmt *Init, DeclStmt *Range,
@@ -123,6 +123,5 @@ CoroutineBodyStmt::CoroutineBodyStmt(CoroutineBodyStmt::CtorArgs const &Args)
   SubStmts[CoroutineBodyStmt::ReturnStmt] = Args.ReturnStmt;
   SubStmts[CoroutineBodyStmt::ReturnStmtOnAllocFailure] =
       Args.ReturnStmtOnAllocFailure;
-  std::copy(Args.ParamMoves.begin(), Args.ParamMoves.end(),
-            const_cast<Stmt **>(getParamMoves().data()));
+  llvm::copy(Args.ParamMoves, const_cast<Stmt **>(getParamMoves().data()));
 }
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 543f05e4ee7c..2c1158e8f9b9 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3982,9 +3982,8 @@ CountAttributedType::CountAttributedType(
   CountAttributedTypeBits.CountInBytes = CountInBytes;
   CountAttributedTypeBits.OrNull = OrNull;
   auto *DeclSlot = getTrailingObjects();
+  llvm::copy(CoupledDecls, DeclSlot);
   Decls = llvm::ArrayRef(DeclSlot, CoupledDecls.size());
-  for (unsigned i = 0; i != CoupledDecls.size(); ++i)
-    DeclSlot[i] = CoupledDecls[i];
 }
 
 StringRef CountAttributedType::getAttributeName(bool WithMacroPrefix) const {

From c445ca5412a7ee1bf7a598e860b50f29602e1981 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Mon, 23 Jun 2025 19:11:01 +0200
Subject: [PATCH 1316/1322] [clang][bytecode] Remove incorrect assertion
 (#145341)

P.block() will assert that P is a block pointer, which it doesn't have
to be here.
---
 clang/lib/AST/ByteCode/Pointer.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp
index 50453c72c582..f0b0384f32ac 100644
--- a/clang/lib/AST/ByteCode/Pointer.cpp
+++ b/clang/lib/AST/ByteCode/Pointer.cpp
@@ -114,7 +114,6 @@ void Pointer::operator=(Pointer &&P) {
     }
 
     if (Block *Pointee = PointeeStorage.BS.Pointee) {
-      assert(P.block() != this->block());
       Pointee->removePointer(this);
       PointeeStorage.BS.Pointee = nullptr;
       Pointee->cleanup();

From 836ff367d0bca9369fd2db237e9b5d7b929bb593 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Mon, 23 Jun 2025 19:11:17 +0200
Subject: [PATCH 1317/1322] [clang][bytecode] Fix IntegralAP::{isMin,isMax}
 (#145339)

We need to take signeness into account here.
---
 clang/lib/AST/ByteCode/IntegralAP.h | 12 ++++++++++--
 clang/test/AST/ByteCode/intap.cpp   |  7 +++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index e7499fc9bf5a..6683db941c73 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -163,8 +163,16 @@ public:
       return !getValue().isNonNegative();
     return false;
   }
-  bool isMin() const { return getValue().isMinValue(); }
-  bool isMax() const { return getValue().isMaxValue(); }
+  bool isMin() const {
+    if constexpr (Signed)
+      return getValue().isMinSignedValue();
+    return getValue().isMinValue();
+  }
+  bool isMax() const {
+    if constexpr (Signed)
+      return getValue().isMaxSignedValue();
+    return getValue().isMaxValue();
+  }
   static constexpr bool isSigned() { return Signed; }
   bool isMinusOne() const { return Signed && getValue().isAllOnes(); }
 
diff --git a/clang/test/AST/ByteCode/intap.cpp b/clang/test/AST/ByteCode/intap.cpp
index 11dd9edb61a9..a7e43b34a7d2 100644
--- a/clang/test/AST/ByteCode/intap.cpp
+++ b/clang/test/AST/ByteCode/intap.cpp
@@ -48,6 +48,13 @@ static_assert(DivA / DivB == 2, "");
 constexpr _BitInt(4) DivC = DivA / 0; // both-error {{must be initialized by a constant expression}} \
                                       // both-note {{division by zero}}
 
+constexpr __int128 isMinDiv() {
+  return __int128{0} / __int128{-1};
+}
+static_assert(isMinDiv() == 0, "");
+
+
+
 constexpr _BitInt(7) RemA = 47;
 constexpr _BitInt(6) RemB = 9;
 static_assert(RemA % RemB == 2, "");

From 319a51a5ffb807b88ae7f73676894bf306a0d134 Mon Sep 17 00:00:00 2001
From: "Henrik G. Olsson" <hnrklssn@gmail.com>
Date: Mon, 23 Jun 2025 10:16:31 -0700
Subject: [PATCH 1318/1322] [Modules] Record whether VarDecl initializers
 contain side effects (#143739)

Calling `DeclMustBeEmitted` should not lead to more deserialization, as
it may occur before previous deserialization has finished.
When passed a `VarDecl` with an initializer however, `DeclMustBeEmitted`
needs to know whether that initializer contains side effects. When the
`VarDecl` is deserialized but the initializer is not, this triggers
deserialization of the initializer. To avoid this we add a bit to the
serialization format for `VarDecl`s, indicating whether its initializer
contains side effects or not, so that the `ASTReader` can query this
information directly without deserializing the initializer.

rdar://153085264
---
 clang/include/clang/AST/Decl.h                |  5 +++
 clang/include/clang/AST/ExternalASTSource.h   | 16 ++++++++
 .../clang/Sema/MultiplexExternalSemaSource.h  |  2 +
 clang/include/clang/Serialization/ASTReader.h |  8 ++++
 clang/lib/AST/ASTContext.cpp                  |  4 +-
 clang/lib/AST/Decl.cpp                        | 24 ++++++++++++
 .../lib/Sema/MultiplexExternalSemaSource.cpp  |  8 ++++
 clang/lib/Serialization/ASTReader.cpp         |  4 ++
 clang/lib/Serialization/ASTReaderDecl.cpp     |  3 ++
 clang/lib/Serialization/ASTWriterDecl.cpp     | 14 ++++---
 clang/test/Modules/var-init-side-effects.cpp  | 37 +++++++++++++++++++
 11 files changed, 116 insertions(+), 9 deletions(-)
 create mode 100644 clang/test/Modules/var-init-side-effects.cpp

diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 05aac15b30cd..23cd25e2e307 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -1352,6 +1352,11 @@ public:
     return const_cast<VarDecl *>(this)->getInitializingDeclaration();
   }
 
+  /// Checks whether this declaration has an initializer with side effects,
+  /// without triggering deserialization if the initializer is not yet
+  /// deserialized.
+  bool hasInitWithSideEffects() const;
+
   /// Determine whether this variable's value might be usable in a
   /// constant expression, according to the relevant language standard.
   /// This only checks properties of the declaration, and does not check
diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h
index f45e3af7602c..e91d5132da10 100644
--- a/clang/include/clang/AST/ExternalASTSource.h
+++ b/clang/include/clang/AST/ExternalASTSource.h
@@ -51,6 +51,7 @@ class RecordDecl;
 class Selector;
 class Stmt;
 class TagDecl;
+class VarDecl;
 
 /// Abstract interface for external sources of AST nodes.
 ///
@@ -195,6 +196,10 @@ public:
   /// module.
   virtual bool wasThisDeclarationADefinition(const FunctionDecl *FD);
 
+  virtual bool hasInitializerWithSideEffects(const VarDecl *VD) const {
+    return false;
+  }
+
   /// Finds all declarations lexically contained within the given
   /// DeclContext, after applying an optional filter predicate.
   ///
@@ -429,6 +434,17 @@ public:
     return GetPtr();
   }
 
+  /// Retrieve the pointer to the AST node that this lazy pointer points to,
+  /// if it can be done without triggering deserialization.
+  ///
+  /// \returns a pointer to the AST node, or null if not yet deserialized.
+  T *getWithoutDeserializing() const {
+    if (isOffset()) {
+      return nullptr;
+    }
+    return GetPtr();
+  }
+
   /// Retrieve the address of the AST node pointer. Deserializes the pointee if
   /// necessary.
   T **getAddressOfPointer(ExternalASTSource *Source) const {
diff --git a/clang/include/clang/Sema/MultiplexExternalSemaSource.h b/clang/include/clang/Sema/MultiplexExternalSemaSource.h
index 391c2177d75e..7c66c26a17a1 100644
--- a/clang/include/clang/Sema/MultiplexExternalSemaSource.h
+++ b/clang/include/clang/Sema/MultiplexExternalSemaSource.h
@@ -94,6 +94,8 @@ public:
 
   bool wasThisDeclarationADefinition(const FunctionDecl *FD) override;
 
+  bool hasInitializerWithSideEffects(const VarDecl *VD) const override;
+
   /// Find all declarations with the given name in the
   /// given context.
   bool FindExternalVisibleDeclsByName(const DeclContext *DC,
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index be1c6e081759..7a4b7d21bb20 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -1455,6 +1455,12 @@ private:
     const StringRef &operator*() && = delete;
   };
 
+  /// VarDecls with initializers containing side effects must be emitted,
+  /// but DeclMustBeEmitted is not allowed to deserialize the intializer.
+  /// FIXME: Lower memory usage by removing VarDecls once the initializer
+  /// is deserialized.
+  llvm::SmallPtrSet<Decl *, 16> InitSideEffectVars;
+
 public:
   /// Get the buffer for resolving paths.
   SmallString<0> &getPathBuf() { return PathBuf; }
@@ -2406,6 +2412,8 @@ public:
 
   bool wasThisDeclarationADefinition(const FunctionDecl *FD) override;
 
+  bool hasInitializerWithSideEffects(const VarDecl *VD) const override;
+
   /// Retrieve a selector from the given module with its local ID
   /// number.
   Selector getLocalSelector(ModuleFile &M, unsigned LocalID);
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 02d6570d0ea0..faad0427178b 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -13111,9 +13111,7 @@ bool ASTContext::DeclMustBeEmitted(const Decl *D) {
     return true;
 
   // Variables that have initialization with side-effects are required.
-  if (VD->getInit() && VD->getInit()->HasSideEffects(*this) &&
-      // We can get a value-dependent initializer during error recovery.
-      (VD->getInit()->isValueDependent() || !VD->evaluateValue()))
+  if (VD->hasInitWithSideEffects())
     return true;
 
   // Likewise, variables with tuple-like bindings are required if their
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 20a3db1c69b8..9e993f9291a7 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -2441,6 +2441,30 @@ VarDecl *VarDecl::getInitializingDeclaration() {
   return Def;
 }
 
+bool VarDecl::hasInitWithSideEffects() const {
+  if (!hasInit())
+    return false;
+
+  // Check if we can get the initializer without deserializing
+  const Expr *E = nullptr;
+  if (auto *S = dyn_cast<Stmt *>(Init)) {
+    E = cast<Expr>(S);
+  } else {
+    E = cast_or_null<Expr>(getEvaluatedStmt()->Value.getWithoutDeserializing());
+  }
+
+  if (E)
+    return E->HasSideEffects(getASTContext()) &&
+           // We can get a value-dependent initializer during error recovery.
+           (E->isValueDependent() || !evaluateValue());
+
+  assert(getEvaluatedStmt()->Value.isOffset());
+  // ASTReader tracks this without having to deserialize the initializer
+  if (auto Source = getASTContext().getExternalSource())
+    return Source->hasInitializerWithSideEffects(this);
+  return false;
+}
+
 bool VarDecl::isOutOfLine() const {
   if (Decl::isOutOfLine())
     return true;
diff --git a/clang/lib/Sema/MultiplexExternalSemaSource.cpp b/clang/lib/Sema/MultiplexExternalSemaSource.cpp
index fbfb242598c2..9f19f13592e8 100644
--- a/clang/lib/Sema/MultiplexExternalSemaSource.cpp
+++ b/clang/lib/Sema/MultiplexExternalSemaSource.cpp
@@ -115,6 +115,14 @@ bool MultiplexExternalSemaSource::wasThisDeclarationADefinition(
   return false;
 }
 
+bool MultiplexExternalSemaSource::hasInitializerWithSideEffects(
+    const VarDecl *VD) const {
+  for (const auto &S : Sources)
+    if (S->hasInitializerWithSideEffects(VD))
+      return true;
+  return false;
+}
+
 bool MultiplexExternalSemaSource::FindExternalVisibleDeclsByName(
     const DeclContext *DC, DeclarationName Name,
     const DeclContext *OriginalDC) {
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index a3fbc3d25aca..6f082fe840b4 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -9722,6 +9722,10 @@ bool ASTReader::wasThisDeclarationADefinition(const FunctionDecl *FD) {
   return ThisDeclarationWasADefinitionSet.contains(FD);
 }
 
+bool ASTReader::hasInitializerWithSideEffects(const VarDecl *VD) const {
+  return InitSideEffectVars.count(VD);
+}
+
 Selector ASTReader::getLocalSelector(ModuleFile &M, unsigned LocalID) {
   return DecodeSelector(getGlobalSelectorID(M, LocalID));
 }
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 7f7882654b9d..0ffd78424be0 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -1628,6 +1628,9 @@ RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
     VD->NonParmVarDeclBits.PreviousDeclInSameBlockScope =
         VarDeclBits.getNextBit();
 
+    if (VarDeclBits.getNextBit())
+      Reader.InitSideEffectVars.insert(VD);
+
     VD->NonParmVarDeclBits.EscapingByref = VarDeclBits.getNextBit();
     HasDeducedType = VarDeclBits.getNextBit();
     VD->NonParmVarDeclBits.ImplicitParamKind =
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 2d93832a9ac3..2e390dbe79ec 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -1306,6 +1306,7 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) {
     VarDeclBits.addBit(D->isConstexpr());
     VarDeclBits.addBit(D->isInitCapture());
     VarDeclBits.addBit(D->isPreviousDeclInSameBlockScope());
+    VarDeclBits.addBit(D->hasInitWithSideEffects());
 
     VarDeclBits.addBit(D->isEscapingByref());
     HasDeducedType = D->getType()->getContainedDeducedType();
@@ -1355,10 +1356,11 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) {
       !D->hasExtInfo() && D->getFirstDecl() == D->getMostRecentDecl() &&
       D->getKind() == Decl::Var && !D->isInline() && !D->isConstexpr() &&
       !D->isInitCapture() && !D->isPreviousDeclInSameBlockScope() &&
-      !D->isEscapingByref() && !HasDeducedType &&
-      D->getStorageDuration() != SD_Static && !D->getDescribedVarTemplate() &&
-      !D->getMemberSpecializationInfo() && !D->isObjCForDecl() &&
-      !isa<ImplicitParamDecl>(D) && !D->isEscapingByref())
+      !D->hasInitWithSideEffects() && !D->isEscapingByref() &&
+      !HasDeducedType && D->getStorageDuration() != SD_Static &&
+      !D->getDescribedVarTemplate() && !D->getMemberSpecializationInfo() &&
+      !D->isObjCForDecl() && !isa<ImplicitParamDecl>(D) &&
+      !D->isEscapingByref())
     AbbrevToUse = Writer.getDeclVarAbbrev();
 
   Code = serialization::DECL_VAR;
@@ -2731,12 +2733,12 @@ void ASTWriter::WriteDeclAbbrevs() {
   // VarDecl
   Abv->Add(BitCodeAbbrevOp(
       BitCodeAbbrevOp::Fixed,
-      21)); // Packed Var Decl bits:  Linkage, ModulesCodegen,
+      22)); // Packed Var Decl bits:  Linkage, ModulesCodegen,
             // SClass, TSCSpec, InitStyle,
             // isARCPseudoStrong, IsThisDeclarationADemotedDefinition,
             // isExceptionVariable, isNRVOVariable, isCXXForRangeDecl,
             // isInline, isInlineSpecified, isConstexpr,
-            // isInitCapture, isPrevDeclInSameScope,
+            // isInitCapture, isPrevDeclInSameScope, hasInitWithSideEffects,
             // EscapingByref, HasDeducedType, ImplicitParamKind, isObjCForDecl
   Abv->Add(BitCodeAbbrevOp(0));                         // VarKind (local enum)
   // Type Source Info
diff --git a/clang/test/Modules/var-init-side-effects.cpp b/clang/test/Modules/var-init-side-effects.cpp
new file mode 100644
index 000000000000..438a9eb20427
--- /dev/null
+++ b/clang/test/Modules/var-init-side-effects.cpp
@@ -0,0 +1,37 @@
+// Tests referencing variable with initializer containing side effect across module boundary
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/Foo.cppm \
+// RUN: -o %t/Foo.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface \
+// RUN: -fmodule-file=Foo=%t/Foo.pcm \
+// RUN: %t/Bar.cppm \
+// RUN: -o %t/Bar.pcm
+
+// RUN: %clang_cc1 -std=c++20 -emit-obj \
+// RUN: -main-file-name Bar.cppm \
+// RUN: -fmodule-file=Foo=%t/Foo.pcm \
+// RUN: -x pcm %t/Bar.pcm \
+// RUN: -o %t/Bar.o
+
+//--- Foo.cppm
+export module Foo;
+
+export {
+class S {};
+
+inline S s = S{};
+}
+
+//--- Bar.cppm
+export module Bar;
+import Foo;
+
+S bar() {
+  return s;
+}
+

From 735d721de451067c3a618b309703d0b8beb9cacc Mon Sep 17 00:00:00 2001
From: Wael Yehia <wmyehia2001@yahoo.com>
Date: Mon, 23 Jun 2025 13:22:33 -0400
Subject: [PATCH 1319/1322] [PowerPC] Fix handling of undefs in the
 PPC::isSplatShuffleMask query (#145149)

Currently, the query assumes that a single undef byte implies the rest of
the `EltSize - 1` bytes are undefs, but that's not always true.
e.g. isSplatShuffleMask(
<0,1,2,3,4,5,6,7,undef,undef,undef,undef,0,1,2,3>, 8) should return
false.

---------

Co-authored-by: Wael Yehia <wyehia@ca.ibm.com>
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 13 +++++++++----
 llvm/test/CodeGen/PowerPC/pr141642.ll       | 13 +++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr141642.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 421a808de667..88c6fe632d26 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2242,10 +2242,15 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
       return false;
 
   for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
-    if (N->getMaskElt(i) < 0) continue;
-    for (unsigned j = 0; j != EltSize; ++j)
-      if (N->getMaskElt(i+j) != N->getMaskElt(j))
-        return false;
+    // An UNDEF element is a sequence of UNDEF bytes.
+    if (N->getMaskElt(i) < 0) {
+      for (unsigned j = 1; j != EltSize; ++j)
+        if (N->getMaskElt(i + j) >= 0)
+          return false;
+    } else
+      for (unsigned j = 0; j != EltSize; ++j)
+        if (N->getMaskElt(i + j) != N->getMaskElt(j))
+          return false;
   }
   return true;
 }
diff --git a/llvm/test/CodeGen/PowerPC/pr141642.ll b/llvm/test/CodeGen/PowerPC/pr141642.ll
new file mode 100644
index 000000000000..38a706574786
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr141642.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -O0 -debug-only=selectiondag -o - < %s 2>&1 | \
+; RUN:  FileCheck %s
+; CHECK-NOT: lxvdsx
+; CHECK-NOT: LD_SPLAT
+
+define weak_odr dso_local void @unpack(ptr noalias noundef %packed_in) local_unnamed_addr {
+entry:
+  %ld = load <2 x i32>, ptr %packed_in, align 2
+  %shuf = shufflevector <2 x i32> %ld, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
+  %ie = insertelement <4 x i32> %shuf, i32 7, i32 2
+  store <4 x i32> %shuf, ptr %packed_in, align 2
+  ret void
+}

From 53edba8091dfd24e2b44204ae8263b009d95414f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 23 Jun 2025 09:46:41 -0700
Subject: [PATCH 1320/1322] [RISCV] Add vp.reverse tests for Zvfh and
 fractional lmuls. NFC

---
 ....ll => fixed-vectors-vp-reverser-float.ll} |  32 ++-
 ...rs.ll => fixed-vectors-vp-reverser-int.ll} |   0
 .../CodeGen/RISCV/rvv/vp-reverse-float.ll     | 215 ++++++++++++++++--
 llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll | 210 ++++++++++++++---
 4 files changed, 412 insertions(+), 45 deletions(-)
 rename llvm/test/CodeGen/RISCV/rvv/{vp-reverse-float-fixed-vectors.ll => fixed-vectors-vp-reverser-float.ll} (65%)
 rename llvm/test/CodeGen/RISCV/rvv/{vp-reverse-int-fixed-vectors.ll => fixed-vectors-vp-reverser-int.ll} (100%)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-float-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-float.ll
similarity index 65%
rename from llvm/test/CodeGen/RISCV/rvv/vp-reverse-float-fixed-vectors.ll
rename to llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-float.ll
index 136f6e7bc999..1d21cb558698 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-float-fixed-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-float.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+m,+f,+d,+v -verify-machineinstrs -riscv-v-vector-bits-min=128 \
+; RUN: llc -mtriple=riscv64 -mattr=+m,+f,+d,+v,+zvfh -verify-machineinstrs -riscv-v-vector-bits-min=128 \
 ; RUN:   < %s | FileCheck %s
 
 define <2 x double> @test_vp_reverse_v2f64_masked(<2 x double> %src, <2 x i1> %mask, i32 zeroext %evl) {
@@ -60,5 +60,31 @@ define <4 x float> @test_vp_reverse_v4f32(<4 x float> %src, i32 zeroext %evl) {
   ret <4 x float> %dst
 }
 
-declare <2 x double> @llvm.experimental.vp.reverse.v2f64(<2 x double>,<2 x i1>,i32)
-declare <4 x float> @llvm.experimental.vp.reverse.v4f32(<4 x float>,<4 x i1>,i32)
+define <4 x half> @test_vp_reverse_v4f16_masked(<4 x half> %src, <4 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_v4f16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <4 x half> @llvm.experimental.vp.reverse.v4f16(<4 x half> %src, <4 x i1> %mask, i32 %evl)
+  ret <4 x half> %dst
+}
+
+define <4 x half> @test_vp_reverse_v4f16(<4 x half> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_v4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <4 x half> @llvm.experimental.vp.reverse.v4f16(<4 x half> %src, <4 x i1> splat (i1 1), i32 %evl)
+  ret <4 x half> %dst
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-int.ll
similarity index 100%
rename from llvm/test/CodeGen/RISCV/rvv/vp-reverse-int-fixed-vectors.ll
rename to llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-reverser-int.ll
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-float.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-float.ll
index b235990ab5dd..4bbd10df5254 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-float.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-float.ll
@@ -1,5 +1,92 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+m,+f,+d,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+f,+d,+v,+zvfh -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 1 x half> @test_vp_reverse_nxv1f16_masked(<vscale x 1 x half> %src, <vscale x 1 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1f16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 1 x half> @llvm.experimental.vp.reverse.nxv1f16(<vscale x 1 x half> %src, <vscale x 1 x i1> %mask, i32 %evl)
+  ret <vscale x 1 x half> %dst
+}
+
+define <vscale x 1 x half> @test_vp_reverse_nxv1f16(<vscale x 1 x half> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 1 x half> @llvm.experimental.vp.reverse.nxv1f16(<vscale x 1 x half> %src, <vscale x 1 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 1 x half> %dst
+}
+
+define <vscale x 1 x float> @test_vp_reverse_nxv1f32_masked(<vscale x 1 x float> %src, <vscale x 1 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1f32_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 1 x float> @llvm.experimental.vp.reverse.nxv1f32(<vscale x 1 x float> %src, <vscale x 1 x i1> %mask, i32 %evl)
+  ret <vscale x 1 x float> %dst
+}
+
+define <vscale x 1 x float> @test_vp_reverse_nxv1f32(<vscale x 1 x float> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 1 x float> @llvm.experimental.vp.reverse.nxv1f32(<vscale x 1 x float> %src, <vscale x 1 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 1 x float> %dst
+}
+
+define <vscale x 2 x half> @test_vp_reverse_nxv2f16_masked(<vscale x 2 x half> %src, <vscale x 2 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv2f16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 2 x half> @llvm.experimental.vp.reverse.nxv2f16(<vscale x 2 x half> %src, <vscale x 2 x i1> %mask, i32 %evl)
+  ret <vscale x 2 x half> %dst
+}
+
+define <vscale x 2 x half> @test_vp_reverse_nxv2f16(<vscale x 2 x half> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 2 x half> @llvm.experimental.vp.reverse.nxv2f16(<vscale x 2 x half> %src, <vscale x 2 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 2 x half> %dst
+}
 
 define <vscale x 1 x double> @test_vp_reverse_nxv1f64_masked(<vscale x 1 x double> %src, <vscale x 1 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv1f64_masked:
@@ -59,6 +146,35 @@ define <vscale x 2 x float> @test_vp_reverse_nxv2f32(<vscale x 2 x float> %src,
   ret <vscale x 2 x float> %dst
 }
 
+define <vscale x 4 x half> @test_vp_reverse_nxv4f16_masked(<vscale x 4 x half> %src, <vscale x 4 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv4f16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 4 x half> @llvm.experimental.vp.reverse.nxv4f16(<vscale x 4 x half> %src, <vscale x 4 x i1> %mask, i32 %evl)
+  ret <vscale x 4 x half> %dst
+}
+
+define <vscale x 4 x half> @test_vp_reverse_nxv4f16(<vscale x 4 x half> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 4 x half> @llvm.experimental.vp.reverse.nxv4f16(<vscale x 4 x half> %src, <vscale x 4 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 4 x half> %dst
+}
+
 define <vscale x 2 x double> @test_vp_reverse_nxv2f64_masked(<vscale x 2 x double> %src, <vscale x 2 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv2f64_masked:
 ; CHECK:       # %bb.0:
@@ -117,6 +233,35 @@ define <vscale x 4 x float> @test_vp_reverse_nxv4f32(<vscale x 4 x float> %src,
   ret <vscale x 4 x float> %dst
 }
 
+define <vscale x 8 x half> @test_vp_reverse_nxv8f16_masked(<vscale x 8 x half> %src, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv8f16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v10, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v12, v10, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v10, v8, v12, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 8 x half> @llvm.experimental.vp.reverse.nxv8f16(<vscale x 8 x half> %src, <vscale x 8 x i1> %mask, i32 %evl)
+  ret <vscale x 8 x half> %dst
+}
+
+define <vscale x 8 x half> @test_vp_reverse_nxv8f16(<vscale x 8 x half> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v10, v8, v12
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 8 x half> @llvm.experimental.vp.reverse.nxv8f16(<vscale x 8 x half> %src, <vscale x 8 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 8 x half> %dst
+}
+
 define <vscale x 4 x double> @test_vp_reverse_nxv4f64_masked(<vscale x 4 x double> %src, <vscale x 4 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv4f64_masked:
 ; CHECK:       # %bb.0:
@@ -175,6 +320,35 @@ define <vscale x 8 x float> @test_vp_reverse_nxv8f32(<vscale x 8 x float> %src,
   ret <vscale x 8 x float> %dst
 }
 
+define <vscale x 16 x half> @test_vp_reverse_nxv16f16_masked(<vscale x 16 x half> %src, <vscale x 16 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv16f16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vid.v v12, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v16, v12, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v12, v8, v16, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 16 x half> @llvm.experimental.vp.reverse.nxv16f16(<vscale x 16 x half> %src, <vscale x 16 x i1> %mask, i32 %evl)
+  ret <vscale x 16 x half> %dst
+}
+
+define <vscale x 16 x half> @test_vp_reverse_nxv16f16(<vscale x 16 x half> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv16f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    vrsub.vx v16, v12, a1
+; CHECK-NEXT:    vrgather.vv v12, v8, v16
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 16 x half> @llvm.experimental.vp.reverse.nxv16f16(<vscale x 16 x half> %src, <vscale x 16 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 16 x half> %dst
+}
+
 define <vscale x 8 x double> @test_vp_reverse_nxv8f64_masked(<vscale x 8 x double> %src, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv8f64_masked:
 ; CHECK:       # %bb.0:
@@ -233,18 +407,31 @@ define <vscale x 16 x float> @test_vp_reverse_nxv16f32(<vscale x 16 x float> %sr
   ret <vscale x 16 x float> %dst
 }
 
-; LMUL = 1
-declare <vscale x 1 x double> @llvm.experimental.vp.reverse.nxv1f64(<vscale x 1 x double>,<vscale x 1 x i1>,i32)
-declare <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float>,<vscale x 2 x i1>,i32)
+define <vscale x 32 x half> @test_vp_reverse_nxv32f16_masked(<vscale x 32 x half> %src, <vscale x 32 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv32f16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vid.v v16, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v24, v16, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v16, v8, v24, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 32 x half> @llvm.experimental.vp.reverse.nxv32f16(<vscale x 32 x half> %src, <vscale x 32 x i1> %mask, i32 %evl)
+  ret <vscale x 32 x half> %dst
+}
 
-; LMUL = 2
-declare <vscale x 2 x double> @llvm.experimental.vp.reverse.nxv2f64(<vscale x 2 x double>,<vscale x 2 x i1>,i32)
-declare <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float>,<vscale x 4 x i1>,i32)
+define <vscale x 32 x half> @test_vp_reverse_nxv32f16(<vscale x 32 x half> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv32f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vrsub.vx v24, v16, a1
+; CHECK-NEXT:    vrgather.vv v16, v8, v24
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
 
-; LMUL = 4
-declare <vscale x 4 x double> @llvm.experimental.vp.reverse.nxv4f64(<vscale x 4 x double>,<vscale x 4 x i1>,i32)
-declare <vscale x 8 x float> @llvm.experimental.vp.reverse.nxv8f32(<vscale x 8 x float>,<vscale x 8 x i1>,i32)
-
-; LMUL = 8
-declare <vscale x 8 x double> @llvm.experimental.vp.reverse.nxv8f64(<vscale x 8 x double>,<vscale x 8 x i1>,i32)
-declare <vscale x 16 x float> @llvm.experimental.vp.reverse.nxv16f32(<vscale x 16 x float>,<vscale x 16 x i1>,i32)
+  %dst = call <vscale x 32 x half> @llvm.experimental.vp.reverse.nxv32f16(<vscale x 32 x half> %src, <vscale x 32 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 32 x half> %dst
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
index 507f5154cf1a..c96a7d774a5d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
@@ -1,6 +1,186 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s
 
+define <vscale x 1 x i8> @test_vp_reverse_nxv1i8_masked(<vscale x 1 x i8> %src, <vscale x 1 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1i8_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 1 x i8> @llvm.experimental.vp.reverse.nxv1i8(<vscale x 1 x i8> %src, <vscale x 1 x i1> %mask, i32 %evl)
+  ret <vscale x 1 x i8> %dst
+}
+
+define <vscale x 1 x i8> @test_vp_reverse_nxv1i8(<vscale x 1 x i8> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 1 x i8> @llvm.experimental.vp.reverse.nxv1i8(<vscale x 1 x i8> %src, <vscale x 1 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 1 x i8> %dst
+}
+
+define <vscale x 1 x i16> @test_vp_reverse_nxv1i16_masked(<vscale x 1 x i16> %src, <vscale x 1 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1i16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 1 x i16> @llvm.experimental.vp.reverse.nxv1i16(<vscale x 1 x i16> %src, <vscale x 1 x i1> %mask, i32 %evl)
+  ret <vscale x 1 x i16> %dst
+}
+
+define <vscale x 1 x i16> @test_vp_reverse_nxv1i16(<vscale x 1 x i16> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 1 x i16> @llvm.experimental.vp.reverse.nxv1i16(<vscale x 1 x i16> %src, <vscale x 1 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 1 x i16> %dst
+}
+
+define <vscale x 2 x i8> @test_vp_reverse_nxv2i8_masked(<vscale x 2 x i8> %src, <vscale x 2 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv2i8_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 2 x i8> @llvm.experimental.vp.reverse.nxv2i8(<vscale x 2 x i8> %src, <vscale x 2 x i1> %mask, i32 %evl)
+  ret <vscale x 2 x i8> %dst
+}
+
+define <vscale x 2 x i8> @test_vp_reverse_nxv2i8(<vscale x 2 x i8> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 2 x i8> @llvm.experimental.vp.reverse.nxv2i8(<vscale x 2 x i8> %src, <vscale x 2 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 2 x i8> %dst
+}
+
+define <vscale x 1 x i32> @test_vp_reverse_nxv1i32_masked(<vscale x 1 x i32> %src, <vscale x 1 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1i32_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 1 x i32> @llvm.experimental.vp.reverse.nxv1i32(<vscale x 1 x i32> %src, <vscale x 1 x i1> %mask, i32 %evl)
+  ret <vscale x 1 x i32> %dst
+}
+
+define <vscale x 1 x i32> @test_vp_reverse_nxv1i32(<vscale x 1 x i32> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 1 x i32> @llvm.experimental.vp.reverse.nxv1i32(<vscale x 1 x i32> %src, <vscale x 1 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 1 x i32> %dst
+}
+
+define <vscale x 2 x i16> @test_vp_reverse_nxv2i16_masked(<vscale x 2 x i16> %src, <vscale x 2 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv2i16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 2 x i16> @llvm.experimental.vp.reverse.nxv2i16(<vscale x 2 x i16> %src, <vscale x 2 x i1> %mask, i32 %evl)
+  ret <vscale x 2 x i16> %dst
+}
+
+define <vscale x 2 x i16> @test_vp_reverse_nxv2i16(<vscale x 2 x i16> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 2 x i16> @llvm.experimental.vp.reverse.nxv2i16(<vscale x 2 x i16> %src, <vscale x 2 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 2 x i16> %dst
+}
+
+define <vscale x 4 x i8> @test_vp_reverse_nxv4i8_masked(<vscale x 4 x i8> %src, <vscale x 4 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv4i8_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v9, a0, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v9, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %dst = call <vscale x 4 x i8> @llvm.experimental.vp.reverse.nxv4i8(<vscale x 4 x i8> %src, <vscale x 4 x i1> %mask, i32 %evl)
+  ret <vscale x 4 x i8> %dst
+}
+
+define <vscale x 4 x i8> @test_vp_reverse_nxv4i8(<vscale x 4 x i8> %src, i32 zeroext %evl) {
+; CHECK-LABEL: test_vp_reverse_nxv4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vrsub.vx v10, v9, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+
+  %dst = call <vscale x 4 x i8> @llvm.experimental.vp.reverse.nxv4i8(<vscale x 4 x i8> %src, <vscale x 4 x i1> splat (i1 1), i32 %evl)
+  ret <vscale x 4 x i8> %dst
+}
+
 define <vscale x 1 x i64> @test_vp_reverse_nxv1i64_masked(<vscale x 1 x i64> %src, <vscale x 1 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv1i64_masked:
 ; CHECK:       # %bb.0:
@@ -501,10 +681,10 @@ define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    bltu a0, a2, .LBB32_2
+; CHECK-NEXT:    bltu a0, a2, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a2
-; CHECK-NEXT:  .LBB32_2:
+; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    addi sp, sp, -80
 ; CHECK-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -547,29 +727,3 @@ define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i
   %dst = call <vscale x 128 x i8> @llvm.experimental.vp.reverse.nxv128i8(<vscale x 128 x i8> %src, <vscale x 128 x i1> splat (i1 1), i32 %evl)
   ret <vscale x 128 x i8> %dst
 }
-
-; LMUL = 1
-declare <vscale x 1 x i64> @llvm.experimental.vp.reverse.nxv1i64(<vscale x 1 x i64>,<vscale x 1 x i1>,i32)
-declare <vscale x 2 x i32> @llvm.experimental.vp.reverse.nxv2i32(<vscale x 2 x i32>,<vscale x 2 x i1>,i32)
-declare <vscale x 4 x i16> @llvm.experimental.vp.reverse.nxv4i16(<vscale x 4 x i16>,<vscale x 4 x i1>,i32)
-declare <vscale x 8 x i8> @llvm.experimental.vp.reverse.nxv8i8(<vscale x 8 x i8>,<vscale x 8 x i1>,i32)
-
-; LMUL = 2
-declare <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64>,<vscale x 2 x i1>,i32)
-declare <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32>,<vscale x 4 x i1>,i32)
-declare <vscale x 8 x i16> @llvm.experimental.vp.reverse.nxv8i16(<vscale x 8 x i16>,<vscale x 8 x i1>,i32)
-declare <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8>,<vscale x 16 x i1>,i32)
-
-; LMUL = 4
-declare <vscale x 4 x i64> @llvm.experimental.vp.reverse.nxv4i64(<vscale x 4 x i64>,<vscale x 4 x i1>,i32)
-declare <vscale x 8 x i32> @llvm.experimental.vp.reverse.nxv8i32(<vscale x 8 x i32>,<vscale x 8 x i1>,i32)
-declare <vscale x 16 x i16> @llvm.experimental.vp.reverse.nxv16i16(<vscale x 16 x i16>,<vscale x 16 x i1>,i32)
-declare <vscale x 32 x i8> @llvm.experimental.vp.reverse.nxv32i8(<vscale x 32 x i8>,<vscale x 32 x i1>,i32)
-
-; LMUL = 8
-declare <vscale x 8 x i64> @llvm.experimental.vp.reverse.nxv8i64(<vscale x 8 x i64>,<vscale x 8 x i1>,i32)
-declare <vscale x 16 x i32> @llvm.experimental.vp.reverse.nxv16i32(<vscale x 16 x i32>,<vscale x 16 x i1>,i32)
-declare <vscale x 32 x i16> @llvm.experimental.vp.reverse.nxv32i16(<vscale x 32 x i16>,<vscale x 32 x i1>,i32)
-declare <vscale x 64 x i8> @llvm.experimental.vp.reverse.nxv64i8(<vscale x 64 x i8>,<vscale x 64 x i1>,i32)
-
-declare <vscale x 128 x i8> @llvm.experimental.vp.reverse.nxv128i8(<vscale x 128 x i8>,<vscale x 128 x i1>,i32)

From ab17ff05621f3029c1afae0d62093287f9d1e2b5 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 23 Jun 2025 10:17:14 -0700
Subject: [PATCH 1321/1322] [RISCV] Add Zvfh tests for vp.splice. NFC

---
 ...-vectors.ll => fixed-vectors-vp-splice.ll} | 51 +++++++++++++++----
 1 file changed, 42 insertions(+), 9 deletions(-)
 rename llvm/test/CodeGen/RISCV/rvv/{vp-splice-fixed-vectors.ll => fixed-vectors-vp-splice.ll} (85%)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splice.ll
similarity index 85%
rename from llvm/test/CodeGen/RISCV/rvv/vp-splice-fixed-vectors.ll
rename to llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splice.ll
index 494bf46050cc..7245d8a11f56 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice-fixed-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splice.ll
@@ -1,15 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v -verify-machineinstrs -riscv-v-vector-bits-min=128 \
+; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v,+zvfh -verify-machineinstrs -riscv-v-vector-bits-min=128 \
 ; RUN:   < %s | FileCheck %s
 
-declare <2 x i64> @llvm.experimental.vp.splice.v2i64(<2 x i64>, <2 x i64>, i32, <2 x i1>, i32, i32)
-declare <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32>, <4 x i32>, i32, <4 x i1>, i32, i32)
-declare <8 x i16> @llvm.experimental.vp.splice.v8i16(<8 x i16>, <8 x i16>, i32, <8 x i1>, i32, i32)
-declare <16 x i8> @llvm.experimental.vp.splice.v16i8(<16 x i8>, <16 x i8>, i32, <16 x i1>, i32, i32)
-
-declare <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double>, <2 x double>, i32, <2 x i1>, i32, i32)
-declare <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float>, <4 x float>, i32, <4 x i1>, i32, i32)
-
 define <2 x i64> @test_vp_splice_v2i64(<2 x i64> %va, <2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
 ; CHECK-LABEL: test_vp_splice_v2i64:
 ; CHECK:       # %bb.0:
@@ -255,3 +247,44 @@ define <4 x float> @test_vp_splice_v4f32_masked(<4 x float> %va, <4 x float> %vb
   %v = call <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float> %va, <4 x float> %vb, i32 5, <4 x i1> %mask, i32 %evla, i32 %evlb)
   ret <4 x float> %v
 }
+
+define <8 x half> @test_vp_splice_v8f16(<8 x half> %va, <8 x half> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, -5
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 5
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    ret
+
+  %v = call <8 x half> @llvm.experimental.vp.splice.v8f16(<8 x half> %va, <8 x half> %vb, i32 5, <8 x i1> splat (i1 1), i32 %evla, i32 %evlb)
+  ret <8 x half> %v
+}
+
+define <8 x half> @test_vp_splice_v8f16_negative_offset(<8 x half> %va, <8 x half> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v8f16_negative_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, -5
+; CHECK-NEXT:    vsetivli zero, 5, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 5
+; CHECK-NEXT:    ret
+
+  %v = call <8 x half> @llvm.experimental.vp.splice.v8f16(<8 x half> %va, <8 x half> %vb, i32 -5, <8 x i1> splat (i1 1), i32 %evla, i32 %evlb)
+  ret <8 x half> %v
+}
+
+define <8 x half> @test_vp_splice_v8f16_masked(<8 x half> %va, <8 x half> %vb, <8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v8f16_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, -5
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    ret
+  %v = call <8 x half> @llvm.experimental.vp.splice.v8f16(<8 x half> %va, <8 x half> %vb, i32 5, <8 x i1> %mask, i32 %evla, i32 %evlb)
+  ret <8 x half> %v
+}

From e391301e0e4d9183fe06e69602e87b0bc889aeda Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Mon, 23 Jun 2025 10:35:42 -0700
Subject: [PATCH 1322/1322] [lldb] Use `proc` instead of `pro` to avoid command
 ambiguity

Use `proc` instead of `pro` to avoid ambiguity between the `process` and
`protocol-server` command.
---
 .../API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py b/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py
index 3476647a12ec..4736d4815179 100644
--- a/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py
+++ b/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py
@@ -44,7 +44,7 @@ class SigtrampUnwind(TestBase):
             )
 
         self.expect(
-            "pro handle  -n false -p true -s false SIGUSR1",
+            "proc handle  -n false -p true -s false SIGUSR1",
             "Have lldb pass SIGUSR1 signals",
             substrs=["SIGUSR1", "true", "false", "false"],
         )